[Pkg-ceph-commits] [ceph] 01/03: Imported Upstream version 9.2.0

James Downing Page jamespage at moszumanska.debian.org
Mon Nov 9 12:43:56 UTC 2015


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch experimental
in repository ceph.

commit 754935582e735a88ca68e5c4f5979f0d53fff8f9
Author: James Page <james.page at ubuntu.com>
Date:   Mon Nov 9 12:38:06 2015 +0000

    Imported Upstream version 9.2.0
---
 AUTHORS                                            |   156 +-
 COPYING                                            |     8 +
 ChangeLog                                          |  7107 +++++--
 Makefile.am                                        |    31 +-
 Makefile.in                                        |    48 +-
 README                                             |    33 +-
 autogen.sh                                         |     3 +-
 ceph.spec                                          |   667 +-
 ceph.spec.in                                       |   665 +-
 configure                                          | 11701 ++++++-----
 configure.ac                                       |   277 +-
 doc/Makefile.am                                    |    33 +
 doc/Makefile.in                                    |   551 +
 doc/man/8/ceph-authtool.rst                        |   178 +
 doc/man/8/ceph-clsinfo.rst                         |    49 +
 doc/man/8/ceph-conf.rst                            |   129 +
 doc/man/8/ceph-create-keys.rst                     |    63 +
 doc/man/8/ceph-debugpack.rst                       |    50 +
 doc/man/8/ceph-dencoder.rst                        |   151 +
 doc/man/8/ceph-deploy.rst                          |   608 +
 doc/man/8/ceph-disk.rst                            |   277 +
 doc/man/8/ceph-fuse.rst                            |    64 +
 doc/man/8/ceph-mds.rst                             |    92 +
 doc/man/8/ceph-mon.rst                             |    94 +
 doc/man/8/ceph-osd.rst                             |   122 +
 doc/man/8/ceph-post-file.rst                       |    71 +
 doc/man/8/ceph-rbdnamer.rst                        |    41 +
 doc/man/8/ceph-rest-api.rst                        |   150 +
 doc/man/8/ceph-run.rst                             |    45 +
 doc/man/8/ceph-syn.rst                             |    99 +
 doc/man/8/ceph.rst                                 |  1437 ++
 doc/man/8/cephfs.rst                               |    99 +
 doc/man/8/crushtool.rst                            |   265 +
 doc/man/8/librados-config.rst                      |    46 +
 doc/man/8/monmaptool.rst                           |   107 +
 doc/man/8/mount.ceph.rst                           |   165 +
 doc/man/8/osdmaptool.rst                           |    77 +
 doc/man/8/rados.rst                                |   181 +
 doc/man/8/radosgw-admin.rst                        |   463 +
 doc/man/8/radosgw.rst                              |   256 +
 doc/man/8/rbd-fuse.rst                             |    56 +
 doc/man/8/rbd-replay-many.rst                      |    73 +
 doc/man/8/rbd-replay-prep.rst                      |    55 +
 doc/man/8/rbd-replay.rst                           |    78 +
 doc/man/8/rbd.rst                                  |   522 +
 etc/default/ceph                                   |    12 +
 etc/sysconfig/SuSEfirewall2.d/services/ceph-mon    |     5 +
 .../SuSEfirewall2.d/services/ceph-osd-mds          |     5 +
 etc/sysconfig/ceph                                 |    23 +
 install-deps.sh                                    |   100 +-
 m4/ax_arm.m4                                       |    18 +-
 man/Makefile-server.am                             |     6 +
 man/Makefile.am                                    |    20 +
 man/Makefile.in                                    |   140 +-
 man/ceph-authtool.8                                |    31 +-
 man/ceph-clsinfo.8                                 |    31 +-
 man/ceph-conf.8                                    |   134 +-
 man/ceph-create-keys.8                             |     6 +-
 man/ceph-debugpack.8                               |    31 +-
 man/ceph-dencoder.8                                |    59 +-
 man/ceph-deploy.8                                  |    64 +-
 man/ceph-detect-init.8                             |    78 +
 man/ceph-disk.8                                    |    76 +-
 man/ceph-fuse.8                                    |    31 +-
 man/ceph-mds.8                                     |    54 +-
 man/ceph-mon.8                                     |    44 +-
 man/ceph-osd.8                                     |    44 +-
 man/ceph-post-file.8                               |    31 +-
 man/ceph-rbdnamer.8                                |    31 +-
 man/ceph-rest-api.8                                |    31 +-
 man/ceph-run.8                                     |    31 +-
 man/ceph-syn.8                                     |    31 +-
 man/ceph.8                                         |   118 +-
 man/ceph_selinux.8                                 |   370 +
 man/cephfs.8                                       |    31 +-
 man/conf.py                                        |    59 +
 man/crushtool.8                                    |    61 +-
 man/librados-config.8                              |    31 +-
 man/monmaptool.8                                   |    31 +-
 man/mount.ceph.8                                   |    31 +-
 man/osdmaptool.8                                   |    31 +-
 man/rados.8                                        |    43 +-
 man/radosgw-admin.8                                |   382 +-
 man/radosgw.8                                      |   279 +-
 man/rbd-fuse.8                                     |    31 +-
 man/rbd-replay-many.8                              |    31 +-
 man/rbd-replay-prep.8                              |    36 +-
 man/rbd-replay.8                                   |    31 +-
 man/rbd.8                                          |   254 +-
 .../erasure-code/encode-decode-non-regression.sh   |    38 +
 selinux/Makefile.am                                |    22 +
 selinux/Makefile.in                                |   539 +
 selinux/ceph.fc                                    |    13 +
 selinux/ceph.if                                    |   265 +
 selinux/ceph.te                                    |   111 +
 src/.git_version                                   |     4 +-
 src/Makefile-client.am                             |    23 +-
 src/Makefile-env.am                                |    46 +-
 src/Makefile-rocksdb.am                            |   811 +-
 src/Makefile-server.am                             |     2 -
 src/Makefile.am                                    |    62 +-
 src/Makefile.in                                    |  6876 ++++--
 src/acconfig.h.in                                  |    24 +-
 src/arch/arm.c                                     |     6 +
 src/arch/arm.h                                     |     1 +
 src/auth/Auth.h                                    |     2 +-
 src/auth/Crypto.cc                                 |   529 +-
 src/auth/Crypto.h                                  |   105 +-
 src/auth/KeyRing.cc                                |     9 +-
 src/auth/cephx/CephxKeyServer.cc                   |     2 +-
 src/auth/cephx/CephxProtocol.cc                    |     3 +-
 src/auth/cephx/CephxProtocol.h                     |     5 +-
 src/auth/cephx/CephxServiceHandler.cc              |     2 +-
 src/auth/cephx/CephxSessionHandler.cc              |   147 +-
 src/auth/cephx/CephxSessionHandler.h               |     3 +-
 src/ceph-create-keys                               |    24 +-
 src/ceph-detect-init/AUTHORS.rst                   |     2 +
 src/ceph-detect-init/MANIFEST.in                   |     1 +
 src/ceph-detect-init/Makefile.am                   |    72 +
 src/ceph-detect-init/README.rst                    |    28 +
 src/ceph-detect-init/ceph_detect_init/__init__.py  |   112 +
 .../ceph_detect_init/centos/__init__.py            |    13 +
 src/ceph-detect-init/ceph_detect_init/exc.py       |    35 +
 .../ceph_detect_init/fedora/__init__.py            |    13 +
 src/ceph-detect-init/ceph_detect_init/main.py      |    63 +
 .../ceph_detect_init/rhel/__init__.py              |    13 +
 .../ceph_detect_init/suse/__init__.py              |    17 +
 .../integration/centos-6.dockerfile                |     4 +
 .../integration/centos-7.dockerfile                |     4 +
 .../integration/debian-jessie.dockerfile           |     6 +
 .../integration/debian-sid.dockerfile              |     4 +
 .../integration/debian-squeeze.dockerfile          |     4 +
 .../integration/debian-wheezy.dockerfile           |     4 +
 .../integration/fedora-21.dockerfile               |     3 +
 .../integration/opensuse-13.1.dockerfile           |     3 +
 .../integration/opensuse-13.2.dockerfile           |     3 +
 src/ceph-detect-init/integration/test_main.py      |    94 +
 .../integration/ubuntu-12.04.dockerfile            |     4 +
 .../integration/ubuntu-14.04.dockerfile            |     6 +
 .../integration/ubuntu-15.04.dockerfile            |     4 +
 src/ceph-detect-init/requirements.txt              |     1 +
 src/ceph-detect-init/run-tox.sh                    |    36 +
 src/ceph-detect-init/setup.py                      |    79 +
 src/ceph-detect-init/test-requirements.txt         |    10 +
 src/ceph-detect-init/tests/test_all.py             |   171 +
 src/ceph-detect-init/tox.ini                       |    31 +
 src/ceph-disk                                      |  1210 +-
 src/ceph-disk-activate                             |     3 -
 src/ceph-disk-prepare                              |     3 -
 src/ceph-disk-udev                                 |     4 +-
 src/ceph-osd-prestart.sh                           |    19 +-
 src/ceph-rbdnamer                                  |     2 +-
 src/ceph.in                                        |   170 +-
 src/ceph_common.sh                                 |     4 +-
 src/ceph_fuse.cc                                   |    47 +-
 src/ceph_mds.cc                                    |    12 +-
 src/ceph_mon.cc                                    |    20 +-
 src/ceph_osd.cc                                    |   101 +-
 src/ceph_syn.cc                                    |     6 +-
 src/check_version                                  |    19 -
 src/client/Client.cc                               |  1770 +-
 src/client/Client.h                                |   128 +-
 src/client/Dentry.h                                |    11 +-
 src/client/Fh.h                                    |     6 +-
 src/client/Inode.cc                                |    41 +-
 src/client/Inode.h                                 |    49 +-
 src/client/InodeRef.h                              |    12 +
 src/client/Makefile.am                             |     4 +-
 src/client/MetaRequest.cc                          |    30 -
 src/client/MetaRequest.h                           |    79 +-
 src/client/MetaSession.h                           |     1 +
 src/client/ObjecterWriteback.h                     |     7 -
 src/client/SyntheticClient.cc                      |    10 +-
 src/client/fuse_ll.cc                              |   101 +-
 src/cls/Makefile-client.am                         |    18 +-
 src/cls/Makefile-server.am                         |    14 +
 src/cls/cephfs/cls_cephfs.cc                       |   143 +
 src/cls/cephfs/cls_cephfs.h                        |   127 +
 src/cls/cephfs/cls_cephfs_client.cc                |   146 +
 src/cls/cephfs/cls_cephfs_client.h                 |    26 +
 src/cls/hello/cls_hello.cc                         |    41 +-
 src/cls/lock/cls_lock.cc                           |     8 +-
 src/cls/log/cls_log.cc                             |     1 +
 src/cls/log/cls_log_client.cc                      |     1 +
 src/cls/numops/cls_numops.cc                       |   163 +
 src/cls/numops/cls_numops_client.cc                |    80 +
 src/cls/numops/cls_numops_client.h                 |    49 +
 src/cls/rbd/cls_rbd.cc                             |   444 +-
 src/cls/rbd/cls_rbd_client.cc                      |   115 +-
 src/cls/rbd/cls_rbd_client.h                       |    17 +-
 src/cls/refcount/cls_refcount.cc                   |     1 +
 src/cls/rgw/cls_rgw.cc                             |    84 +-
 src/cls/rgw/cls_rgw_client.h                       |     1 +
 src/cls/rgw/cls_rgw_types.cc                       |     1 +
 src/cls/timeindex/cls_timeindex.cc                 |   273 +
 src/cls/timeindex/cls_timeindex_client.cc          |   157 +
 src/cls/timeindex/cls_timeindex_client.h           |    52 +
 src/cls/timeindex/cls_timeindex_ops.h              |   116 +
 src/cls/timeindex/cls_timeindex_types.h            |    43 +
 src/cls/user/cls_user.cc                           |     2 +-
 src/cls/version/cls_version.cc                     |     1 +
 src/common/ConfUtils.cc                            |     1 +
 src/common/Finisher.cc                             |    14 +-
 src/common/Finisher.h                              |    42 +-
 src/common/Formatter.h                             |     9 +-
 src/common/HeartbeatMap.cc                         |    31 +-
 src/common/HeartbeatMap.h                          |    16 +-
 src/common/Initialize.h                            |     4 +-
 src/common/Makefile.am                             |    35 +-
 src/common/Mutex.cc                                |    23 +-
 src/common/OutputDataSocket.cc                     |     4 +-
 src/common/Preforker.h                             |    43 +-
 src/common/PrioritizedQueue.h                      |    62 +-
 src/common/RWLock.h                                |    32 +-
 src/common/Readahead.cc                            |    12 +
 src/common/RefCountedObj.h                         |     2 +-
 src/common/SubProcess.h                            |   484 +
 src/common/Thread.cc                               |    60 +-
 src/common/Thread.h                                |     8 +-
 src/common/Throttle.cc                             |   118 +-
 src/common/Throttle.h                              |   137 +-
 src/common/TracepointProvider.cc                   |    44 +
 src/common/TracepointProvider.h                    |    83 +
 src/common/TrackedOp.cc                            |    22 +-
 src/common/TrackedOp.h                             |    12 +-
 src/common/WorkQueue.h                             |    84 +-
 src/common/admin_socket.cc                         |     6 +-
 src/common/blkdev.cc                               |    68 +-
 src/common/blkdev.h                                |     2 +
 src/common/buffer.cc                               |   395 +-
 src/common/ceph_argparse.cc                        |   182 +-
 src/common/ceph_argparse.h                         |    16 +-
 src/common/ceph_context.cc                         |    59 +-
 src/common/ceph_context.h                          |    64 +-
 src/common/ceph_crypto.cc                          |     2 +-
 src/common/ceph_crypto.h                           |     8 +-
 src/common/ceph_fs.cc                              |     3 +-
 src/common/ceph_json.h                             |     4 +-
 src/common/ceph_strings.cc                         |     2 +
 src/common/cmdparse.cc                             |     2 +-
 src/common/config.cc                               |    27 +-
 src/common/config.h                                |     4 +-
 src/common/config_obs.h                            |    14 +-
 src/common/config_opts.h                           |   198 +-
 src/common/crc32c.cc                               |     6 +
 src/common/crc32c_aarch64.c                        |    47 +
 src/common/crc32c_aarch64.h                        |    27 +
 src/common/hobject.cc                              |   175 +-
 src/common/hobject.h                               |   287 +-
 src/common/ipaddr.cc                               |     1 +
 src/common/lockdep.cc                              |     5 +-
 src/common/obj_bencher.cc                          |   491 +-
 src/common/obj_bencher.h                           |    40 +-
 src/common/perf_counters.cc                        |    42 +-
 src/common/perf_counters.h                         |    29 +-
 src/common/safe_io.c                               |     2 +
 src/common/shared_cache.hpp                        |    46 +-
 src/common/sharedptr_registry.hpp                  |    20 +-
 src/common/simple_cache.hpp                        |    10 +-
 src/common/str_map.cc                              |    18 +-
 src/common/strtol.cc                               |    41 +-
 src/common/sync_filesystem.h                       |    25 +-
 src/common/tracked_int_ptr.hpp                     |     9 +
 src/common/types.cc                                |     4 +-
 src/common/util.cc                                 |   132 +
 src/common/xattr.c                                 |    28 +-
 src/common/xattr.h                                 |     8 +
 src/compressor/AsyncCompressor.cc                  |   157 +
 src/compressor/AsyncCompressor.h                   |   128 +
 src/compressor/Compressor.cc                       |    25 +
 src/compressor/Compressor.h                        |    30 +
 src/compressor/Makefile.am                         |    11 +
 src/compressor/SnappyCompressor.h                  |    78 +
 src/crush/CrushTester.cc                           |   121 +-
 src/crush/CrushTester.h                            |     2 +-
 src/crush/CrushWrapper.cc                          |    71 +-
 src/crush/CrushWrapper.h                           |    11 +-
 src/crush/Makefile.am                              |     3 +-
 src/crush/builder.c                                |    20 +
 src/crush/builder.h                                |     3 +
 src/crush/crush.c                                  |    33 +-
 src/crush/crush.h                                  |    16 +-
 src/crush/crush_compat.h                           |    39 +
 src/crush/crush_ln_table.h                         |    36 +-
 src/crush/hash.c                                   |    12 +-
 src/crush/hash.h                                   |     6 +
 src/crush/mapper.c                                 |   132 +-
 src/erasure-code/ErasureCode.cc                    |    64 +-
 src/erasure-code/ErasureCode.h                     |    35 +-
 src/erasure-code/ErasureCodeInterface.h            |    40 +
 src/erasure-code/ErasureCodePlugin.cc              |    50 +-
 src/erasure-code/ErasureCodePlugin.h               |    15 +-
 src/erasure-code/isa/ErasureCodeIsa.cc             |    40 +-
 src/erasure-code/isa/ErasureCodeIsa.h              |    28 +-
 src/erasure-code/isa/ErasureCodeIsaTableCache.cc   |    16 +-
 src/erasure-code/isa/ErasureCodePluginIsa.cc       |    23 +-
 src/erasure-code/isa/Makefile.am                   |    20 +-
 src/erasure-code/isa/isa-l/erasure_code/ec_base.c  |    30 +-
 src/erasure-code/isa/isa-l/erasure_code/ec_base.h  |     2 +-
 .../isa/isa-l/erasure_code/ec_highlevel_func.c     |   121 +-
 .../isa/isa-l/erasure_code/ec_multibinary.asm.s    |   257 +-
 .../isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s |   159 +-
 .../erasure_code/gf_2vect_dot_prod_avx2.asm.s      |   166 +-
 .../isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s |   161 +-
 .../isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s  |   236 +
 .../isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s |   247 +
 .../isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s  |   239 +
 .../isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s |   179 +-
 .../erasure_code/gf_3vect_dot_prod_avx2.asm.s      |   204 +-
 .../isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s |   179 +-
 .../isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s  |   288 +
 .../isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s |   317 +
 .../isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s  |   298 +
 .../isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s |   221 +-
 .../erasure_code/gf_4vect_dot_prod_avx2.asm.s      |   233 +-
 .../isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s |   241 +-
 .../isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s  |   336 +
 .../isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s |   342 +
 .../isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s  |   342 +
 .../isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s |    16 +-
 .../erasure_code/gf_5vect_dot_prod_avx2.asm.s      |    16 +-
 .../isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s |    36 +-
 .../isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s  |   365 +
 .../isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s |   363 +
 .../isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s  |   373 +
 .../isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s |    16 +-
 .../erasure_code/gf_6vect_dot_prod_avx2.asm.s      |    16 +-
 .../isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s |    40 +-
 .../isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s  |   394 +
 .../isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s |   400 +
 .../isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s  |   406 +
 .../isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s  |   107 +-
 .../isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s |   111 +-
 .../isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s  |   106 +-
 .../isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s   |   196 +
 .../isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s  |   203 +
 .../isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s   |   197 +
 .../isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s   |    16 +-
 .../isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s   |    16 +-
 src/erasure-code/isa/isa-l/include/erasure_code.h  |   370 +-
 src/erasure-code/isa/isa-l/include/gf_vect_mul.h   |     6 +-
 src/erasure-code/isa/isa-l/include/reg_sizes.asm   |    29 +-
 src/erasure-code/isa/isa-l/include/types.h         |    18 +-
 src/erasure-code/jerasure/ErasureCodeJerasure.cc   |   108 +-
 src/erasure-code/jerasure/ErasureCodeJerasure.h    |    82 +-
 .../jerasure/ErasureCodePluginJerasure.cc          |    17 +-
 .../jerasure/ErasureCodePluginSelectJerasure.cc    |    27 +-
 src/erasure-code/jerasure/gf-complete/src/gf.c     |     4 +-
 .../jerasure/gf-complete/src/gf_w128.c             |     4 +-
 src/erasure-code/jerasure/gf-complete/src/gf_w16.c |     5 +-
 src/erasure-code/jerasure/gf-complete/src/gf_w32.c |    18 +-
 src/erasure-code/jerasure/gf-complete/src/gf_w64.c |    18 +-
 .../jerasure/gf-complete/src/gf_wgen.c             |    36 +-
 src/erasure-code/lrc/ErasureCodeLrc.cc             |   153 +-
 src/erasure-code/lrc/ErasureCodeLrc.h              |    21 +-
 src/erasure-code/lrc/ErasureCodePluginLrc.cc       |    18 +-
 .../shec/ErasureCodePluginSelectShec.cc            |   102 +
 src/erasure-code/shec/ErasureCodePluginShec.cc     |    27 +-
 src/erasure-code/shec/ErasureCodeShec.cc           |   538 +-
 src/erasure-code/shec/ErasureCodeShec.h            |    31 +-
 src/erasure-code/shec/ErasureCodeShecTableCache.cc |   256 +-
 src/erasure-code/shec/ErasureCodeShecTableCache.h  |    66 +-
 src/erasure-code/shec/Makefile.am                  |   117 +-
 src/erasure-code/shec/shec.cc                      |   329 -
 src/erasure-code/shec/shec.h                       |    35 -
 src/global/global_context.h                        |     1 -
 src/global/global_init.cc                          |   103 +-
 src/include/CompatSet.h                            |    18 +-
 src/include/Context.h                              |     3 +-
 src/include/Makefile.am                            |     8 +-
 src/include/atomic.h                               |    15 +
 src/include/buffer.h                               |   195 +-
 src/include/ceph_features.h                        |    10 +-
 src/include/ceph_fs.h                              |     6 +
 src/include/cephfs/libcephfs.h                     |   129 +-
 src/include/cmp.h                                  |    34 +
 src/include/compact_map.h                          |   347 +
 src/include/compact_set.h                          |   290 +
 src/include/compat.h                               |    17 +-
 src/include/encoding.h                             |    82 +-
 src/include/filepath.h                             |     2 +-
 src/include/frag.h                                 |    41 +-
 src/include/hash_namespace.h                       |    24 -
 src/include/inline_memory.h                        |   138 +
 src/include/interval_set.h                         |     8 +
 src/include/krbd.h                                 |     2 +
 src/include/memory.h                               |    17 +-
 src/include/object.h                               |    11 +-
 src/include/rados.h                                |    27 +-
 src/include/rados/buffer.h                         |   195 +-
 src/include/rados/librados.h                       |    79 +-
 src/include/rados/librados.hpp                     |    53 +-
 src/include/rados/memory.h                         |    17 +-
 src/include/radosstriper/libradosstriper.h         |    10 +-
 src/include/radosstriper/libradosstriper.hpp       |     8 +-
 src/include/rangeset.h                             |     1 -
 src/include/rbd/features.h                         |    30 +-
 src/include/rbd/librbd.h                           |    46 +-
 src/include/rbd/librbd.hpp                         |    24 +-
 src/include/rbd/object_map_types.h                 |     1 +
 src/include/sock_compat.h                          |    26 +
 src/include/str_list.h                             |    55 +
 src/include/str_map.h                              |     2 +-
 src/include/timegm.h                               |    79 +
 src/include/types.h                                |    48 +-
 src/include/unordered_map.h                        |    13 -
 src/include/unordered_set.h                        |    12 -
 src/include/util.h                                 |    10 +
 src/include/utime.h                                |     5 +-
 src/include/uuid.h                                 |    42 +-
 src/include/xlist.h                                |    14 +-
 src/init-ceph.in                                   |    35 +-
 src/init-rbdmap                                    |    58 +-
 src/java/Makefile.in                               |    10 +-
 src/java/java/com/ceph/fs/CephMount.java           |    36 +-
 src/java/java/com/ceph/fs/CephNativeLoader.java    |    72 +-
 src/java/native/libcephfs_jni.cc                   |    51 +
 src/java/test/com/ceph/fs/CephMountTest.java       |    33 +
 src/key_value_store/kv_flat_btree_async.h          |    28 +-
 src/krbd.cc                                        |   122 +-
 src/libcephfs.cc                                   |    78 +-
 src/librados/IoCtxImpl.cc                          |    30 +-
 src/librados/ListObjectImpl.h                      |    14 +-
 src/librados/Makefile.am                           |     3 -
 src/librados/RadosClient.cc                        |     9 +-
 src/librados/librados.cc                           |   126 +-
 src/libradosstriper/Makefile.am                    |     4 +-
 src/libradosstriper/RadosStriperImpl.cc            |     4 +-
 src/librbd/AioCompletion.cc                        |     7 +-
 src/librbd/AioCompletion.h                         |     8 +-
 src/librbd/AioRequest.cc                           |    84 +-
 src/librbd/AioRequest.h                            |    49 +-
 src/librbd/AsyncFlattenRequest.cc                  |    19 +-
 src/librbd/AsyncFlattenRequest.h                   |     6 +-
 src/librbd/AsyncObjectThrottle.cc                  |    39 +-
 src/librbd/AsyncObjectThrottle.h                   |    35 +-
 src/librbd/AsyncRequest.cc                         |    27 +-
 src/librbd/AsyncRequest.h                          |    27 +-
 src/librbd/AsyncResizeRequest.cc                   |     8 -
 src/librbd/AsyncResizeRequest.h                    |    11 +-
 src/librbd/AsyncTrimRequest.cc                     |   129 +-
 src/librbd/AsyncTrimRequest.h                      |    18 +-
 src/librbd/CopyupRequest.cc                        |   210 +-
 src/librbd/CopyupRequest.h                         |    23 +-
 src/librbd/DiffIterate.cc                          |   457 +
 src/librbd/DiffIterate.h                           |    47 +
 src/librbd/ImageCtx.cc                             |   369 +-
 src/librbd/ImageCtx.h                              |    47 +-
 src/librbd/ImageWatcher.cc                         |   160 +-
 src/librbd/ImageWatcher.h                          |    17 +-
 src/librbd/Makefile.am                             |    10 +-
 src/librbd/ObjectMap.cc                            |   343 +-
 src/librbd/ObjectMap.h                             |    46 +-
 src/librbd/RebuildObjectMapRequest.cc              |   361 +
 src/librbd/RebuildObjectMapRequest.h               |    78 +
 src/librbd/SnapInfo.h                              |     7 +-
 src/librbd/TaskFinisher.h                          |     2 +-
 src/librbd/WatchNotifyTypes.cc                     |    44 +
 src/librbd/WatchNotifyTypes.h                      |    48 +-
 src/librbd/internal.cc                             |   819 +-
 src/librbd/internal.h                              |    27 +-
 src/librbd/librbd.cc                               |   274 +-
 src/librbd/parent_types.h                          |    15 +-
 src/libs3/COPYING                                  |   674 -
 src/libs3/ChangeLog                                |    16 -
 src/libs3/GNUmakefile                              |   419 -
 src/libs3/GNUmakefile.mingw                        |   296 -
 src/libs3/GNUmakefile.osx                          |   305 -
 src/libs3/INSTALL                                  |    73 -
 src/libs3/LICENSE                                  |    20 -
 src/libs3/README                                   |     4 -
 src/libs3/TODO                                     |     3 -
 src/libs3/archlinux/PKGBUILD                       |    28 -
 src/libs3/doxyfile                                 |   886 -
 src/libs3/inc/error_parser.h                       |    82 -
 src/libs3/inc/libs3.h                              |  1892 --
 src/libs3/inc/mingw/pthread.h                      |    45 -
 src/libs3/inc/mingw/sys/select.h                   |    30 -
 src/libs3/inc/mingw/sys/utsname.h                  |    41 -
 src/libs3/inc/request.h                            |   186 -
 src/libs3/inc/request_context.h                    |    40 -
 src/libs3/inc/response_headers_handler.h           |    64 -
 src/libs3/inc/simplexml.h                          |    76 -
 src/libs3/inc/string_buffer.h                      |   107 -
 src/libs3/inc/util.h                               |    98 -
 src/libs3/libs3.spec                               |    81 -
 src/libs3/mswin/libs3.def                          |    27 -
 src/libs3/mswin/rmrf.bat                           |     9 -
 src/libs3/src/acl.c                                |   348 -
 src/libs3/src/bucket.c                             |   743 -
 src/libs3/src/error_parser.c                       |   239 -
 src/libs3/src/general.c                            |   473 -
 src/libs3/src/mingw_functions.c                    |   119 -
 src/libs3/src/mingw_s3_functions.c                 |    37 -
 src/libs3/src/object.c                             |   345 -
 src/libs3/src/request.c                            |  1392 --
 src/libs3/src/request_context.c                    |   190 -
 src/libs3/src/response_headers_handler.c           |   205 -
 src/libs3/src/s3.c                                 |  2787 ---
 src/libs3/src/service.c                            |   191 -
 src/libs3/src/service_access_logging.c             |   555 -
 src/libs3/src/simplexml.c                          |   207 -
 src/libs3/src/testsimplexml.c                      |    87 -
 src/libs3/src/util.c                               |   560 -
 src/libs3/test/badxml_01.xml                       |   105 -
 src/libs3/test/goodxml_01.xml                      |     7 -
 src/libs3/test/goodxml_02.xml                      |   105 -
 src/libs3/test/goodxml_03.xml                      |    13 -
 src/libs3/test/test.sh                             |   173 -
 src/logrotate.conf                                 |    21 +-
 src/make_version                                   |   123 +-
 src/mds/Beacon.cc                                  |   142 +-
 src/mds/Beacon.h                                   |    19 +-
 src/mds/CDentry.cc                                 |    85 +-
 src/mds/CDentry.h                                  |    13 +-
 src/mds/CDir.cc                                    |   523 +-
 src/mds/CDir.h                                     |    45 +-
 src/mds/CInode.cc                                  |   517 +-
 src/mds/CInode.h                                   |   199 +-
 src/mds/InoTable.cc                                |     2 +-
 src/mds/InoTable.h                                 |     4 +-
 src/mds/JournalPointer.cc                          |     6 +-
 src/mds/Locker.cc                                  |   184 +-
 src/mds/Locker.h                                   |    18 +-
 src/mds/LogEvent.cc                                |     2 +-
 src/mds/LogEvent.h                                 |     4 +-
 src/mds/LogSegment.h                               |     7 +-
 src/mds/MDBalancer.cc                              |    18 +-
 src/mds/MDBalancer.h                               |    12 +-
 src/mds/MDCache.cc                                 |  1331 +-
 src/mds/MDCache.h                                  |   116 +-
 src/mds/MDLog.cc                                   |   339 +-
 src/mds/MDLog.h                                    |    40 +-
 src/mds/MDS.cc                                     |  3066 ---
 src/mds/MDS.h                                      |   496 -
 src/mds/MDSContext.cc                              |    22 +-
 src/mds/MDSContext.h                               |    38 +-
 src/mds/MDSDaemon.cc                               |  1348 ++
 src/mds/MDSDaemon.h                                |   207 +
 src/mds/MDSMap.cc                                  |    44 +-
 src/mds/MDSMap.h                                   |    48 +-
 src/mds/MDSRank.cc                                 |  2405 +++
 src/mds/MDSRank.h                                  |   522 +
 src/mds/MDSTable.cc                                |    34 +-
 src/mds/MDSTable.h                                 |     6 +-
 src/mds/MDSTableClient.cc                          |    14 +-
 src/mds/MDSTableClient.h                           |     6 +-
 src/mds/MDSTableServer.cc                          |     2 +-
 src/mds/MDSTableServer.h                           |     2 +-
 src/mds/Makefile-server.am                         |     4 +-
 src/mds/Makefile.am                                |     5 +-
 src/mds/Migrator.cc                                |    27 +-
 src/mds/Migrator.h                                 |     6 +-
 src/mds/Mutation.cc                                |    14 +-
 src/mds/Mutation.h                                 |     7 +-
 src/mds/RecoveryQueue.cc                           |    13 +-
 src/mds/RecoveryQueue.h                            |     9 +-
 src/mds/Server.cc                                  |   651 +-
 src/mds/Server.h                                   |    35 +-
 src/mds/SessionMap.cc                              |   477 +-
 src/mds/SessionMap.h                               |   211 +-
 src/mds/SimpleLock.cc                              |    43 +
 src/mds/SimpleLock.h                               |    11 +-
 src/mds/SnapClient.h                               |    16 +-
 src/mds/SnapRealm.cc                               |    35 +-
 src/mds/SnapRealm.h                                |    15 +-
 src/mds/SnapServer.cc                              |   102 +-
 src/mds/SnapServer.h                               |    17 +-
 src/mds/StrayManager.cc                            |   901 +
 src/mds/StrayManager.h                             |   252 +
 src/mds/events/ECommitted.h                        |     2 +-
 src/mds/events/EExport.h                           |     4 +-
 src/mds/events/EFragment.h                         |     2 +-
 src/mds/events/EImportFinish.h                     |     4 +-
 src/mds/events/EImportStart.h                      |     5 +-
 src/mds/events/EMetaBlob.h                         |    14 +-
 src/mds/events/ENoOp.h                             |     2 +-
 src/mds/events/EOpen.h                             |     2 +-
 src/mds/events/EResetJournal.h                     |     2 +-
 src/mds/events/ESession.h                          |     2 +-
 src/mds/events/ESessions.h                         |     2 +-
 src/mds/events/ESlaveUpdate.h                      |     2 +-
 src/mds/events/ESubtreeMap.h                       |     2 +-
 src/mds/events/ETableClient.h                      |     2 +-
 src/mds/events/ETableServer.h                      |     2 +-
 src/mds/events/EUpdate.h                           |     2 +-
 src/mds/flock.h                                    |     5 +
 src/mds/journal.cc                                 |   180 +-
 src/mds/mdstypes.cc                                |   119 +-
 src/mds/mdstypes.h                                 |   166 +-
 src/messages/MClientCaps.h                         |    15 +-
 src/messages/MClientReconnect.h                    |     1 +
 src/messages/MClientReply.h                        |     9 +-
 src/messages/MClientRequest.h                      |    13 +-
 src/messages/MClientRequestForward.h               |     2 +
 src/messages/MClientSession.h                      |     1 +
 src/messages/MCommand.h                            |     1 -
 src/messages/MDataPing.h                           |     8 +-
 src/messages/MDirUpdate.h                          |     6 +-
 src/messages/MExportDirFinish.h                    |     2 +-
 src/messages/MForward.h                            |    53 +-
 src/messages/MGetPoolStats.h                       |     2 -
 src/messages/MGetPoolStatsReply.h                  |     2 -
 src/messages/MLog.h                                |     1 -
 src/messages/MLogAck.h                             |     2 -
 src/messages/MMDSBeacon.h                          |    20 +-
 src/messages/MMDSMap.h                             |     2 -
 src/messages/MMDSOpenInoReply.h                    |     2 +-
 src/messages/MMonMetadata.h                        |    52 +
 src/messages/MMonPaxos.h                           |     2 +
 src/messages/MMonScrub.h                           |    19 +-
 src/messages/MMonSubscribe.h                       |     1 +
 src/messages/MOSDECSubOpWrite.h                    |     7 +-
 src/messages/MOSDMap.h                             |     1 +
 src/messages/MOSDOp.h                              |    25 +-
 src/messages/MOSDOpReply.h                         |     1 +
 src/messages/MOSDRepOp.h                           |     3 +-
 src/messages/MOSDRepOpReply.h                      |     4 +-
 src/messages/MOSDSubOp.h                           |     2 +-
 src/messages/MRoute.h                              |     2 +
 src/messages/MStatfsReply.h                        |     2 +-
 src/messages/Makefile.am                           |     1 +
 src/mon/AuthMonitor.cc                             |   112 +-
 src/mon/AuthMonitor.h                              |    14 +-
 src/mon/ConfigKeyService.cc                        |    19 +-
 src/mon/ConfigKeyService.h                         |     2 +-
 src/mon/DataHealthService.cc                       |    12 +-
 src/mon/DataHealthService.h                        |     8 +-
 src/mon/DumplingMonCommands.h                      |     5 +-
 src/mon/Elector.cc                                 |    77 +-
 src/mon/Elector.h                                  |    31 +-
 src/mon/HealthMonitor.cc                           |     9 +-
 src/mon/HealthMonitor.h                            |     2 +-
 src/mon/HealthService.h                            |     6 +-
 src/mon/LogMonitor.cc                              |    92 +-
 src/mon/LogMonitor.h                               |    31 +-
 src/mon/MDSMonitor.cc                              |   418 +-
 src/mon/MDSMonitor.h                               |    45 +-
 src/mon/Makefile.am                                |     3 +-
 src/mon/MonClient.cc                               |    31 +-
 src/mon/MonClient.h                                |     9 +-
 src/mon/MonCommands.h                              |    90 +-
 src/mon/MonMap.h                                   |     1 +
 src/mon/MonOpRequest.h                             |   220 +
 src/mon/Monitor.cc                                 |  1036 +-
 src/mon/Monitor.h                                  |   182 +-
 src/mon/MonitorDBStore.h                           |    86 +-
 src/mon/MonitorStore.cc                            |   499 -
 src/mon/MonitorStore.h                             |   109 -
 src/mon/MonmapMonitor.cc                           |    48 +-
 src/mon/MonmapMonitor.h                            |    12 +-
 src/mon/OSDMonitor.cc                              |  1935 +-
 src/mon/OSDMonitor.h                               |   208 +-
 src/mon/PGMap.cc                                   |   227 +-
 src/mon/PGMap.h                                    |    35 +-
 src/mon/PGMonitor.cc                               |   141 +-
 src/mon/PGMonitor.h                                |    46 +-
 src/mon/Paxos.cc                                   |   169 +-
 src/mon/Paxos.h                                    |    62 +-
 src/mon/PaxosService.cc                            |    27 +-
 src/mon/PaxosService.h                             |    87 +-
 src/mon/QuorumService.h                            |     6 +-
 src/mon/Session.h                                  |    15 +-
 src/mon/mon_types.h                                |    31 +
 src/msg/Connection.h                               |     1 -
 src/msg/Dispatcher.h                               |     2 +-
 src/msg/Message.cc                                 |     4 +
 src/msg/Message.h                                  |     8 +-
 src/msg/Messenger.cc                               |    17 +-
 src/msg/Messenger.h                                |    32 +-
 src/msg/async/AsyncConnection.cc                   |   752 +-
 src/msg/async/AsyncConnection.h                    |    79 +-
 src/msg/async/AsyncMessenger.cc                    |    69 +-
 src/msg/async/AsyncMessenger.h                     |    51 +-
 src/msg/async/Event.cc                             |   121 +-
 src/msg/async/Event.h                              |     7 +-
 src/msg/async/EventEpoll.cc                        |    10 +-
 src/msg/async/EventEpoll.h                         |     4 +-
 src/msg/async/EventKqueue.cc                       |     7 +-
 src/msg/async/EventKqueue.h                        |     3 +-
 src/msg/async/EventSelect.cc                       |     3 +-
 src/msg/async/EventSelect.h                        |     4 +-
 src/msg/async/net_handler.cc                       |     5 +-
 src/msg/msg_types.h                                |    34 +-
 src/msg/simple/Accepter.cc                         |    11 +
 src/msg/simple/DispatchQueue.cc                    |    11 +
 src/msg/simple/DispatchQueue.h                     |     1 +
 src/msg/simple/Pipe.cc                             |    75 +-
 src/msg/simple/Pipe.h                              |     4 +-
 src/msg/simple/PipeConnection.h                    |     2 +-
 src/msg/simple/SimpleMessenger.cc                  |     8 +-
 src/msg/simple/SimpleMessenger.h                   |     8 +-
 src/msg/xio/XioConnection.cc                       |    56 +-
 src/msg/xio/XioConnection.h                        |     2 -
 src/msg/xio/XioMessenger.cc                        |   165 +-
 src/msg/xio/XioMessenger.h                         |    17 +-
 src/msg/xio/XioMsg.h                               |    18 +-
 src/msg/xio/XioPool.cc                             |    21 +-
 src/msg/xio/XioPool.h                              |    69 +-
 src/msg/xio/XioPortal.h                            |    48 +-
 src/objclass/class_api.cc                          |    21 +-
 src/objclass/objclass.h                            |    48 +-
 src/ocf/Makefile.in                                |    10 +-
 src/os/CollectionIndex.h                           |    11 +-
 src/os/DBObjectMap.cc                              |    17 +-
 src/os/DBObjectMap.h                               |     4 +-
 src/os/FDCache.h                                   |     4 +-
 src/os/FileJournal.cc                              |   284 +-
 src/os/FileJournal.h                               |    49 +-
 src/os/FileStore.cc                                |  1022 +-
 src/os/FileStore.h                                 |    61 +-
 src/os/FlatIndex.cc                                |   426 -
 src/os/FlatIndex.h                                 |    85 -
 src/os/GenericFileStoreBackend.cc                  |    91 +-
 src/os/GenericFileStoreBackend.h                   |     6 +-
 src/os/GenericObjectMap.cc                         |   125 +-
 src/os/GenericObjectMap.h                          |     6 +-
 src/os/HashIndex.cc                                |   331 +-
 src/os/HashIndex.h                                 |    77 +-
 src/os/IndexManager.cc                             |     6 +-
 src/os/IndexManager.h                              |     3 +-
 src/os/JournalingObjectStore.cc                    |    40 +-
 src/os/JournalingObjectStore.h                     |     4 +-
 src/os/KeyValueDB.cc                               |    10 +-
 src/os/KeyValueDB.h                                |    25 +-
 src/os/KeyValueStore.cc                            |   752 +-
 src/os/KeyValueStore.h                             |    92 +-
 src/os/KineticStore.cc                             |     4 +-
 src/os/LFNIndex.cc                                 |    21 +-
 src/os/LFNIndex.h                                  |    30 +-
 src/os/LevelDBStore.cc                             |    51 +-
 src/os/LevelDBStore.h                              |    12 +-
 src/os/Makefile.am                                 |    21 +-
 src/os/MemStore.cc                                 |   521 +-
 src/os/MemStore.h                                  |   185 +-
 src/os/ObjectStore.cc                              |    56 +-
 src/os/ObjectStore.h                               |   130 +-
 src/os/PageSet.h                                   |   227 +
 src/os/RocksDBStore.cc                             |   252 +-
 src/os/RocksDBStore.h                              |    83 +-
 src/os/Transaction.cc                              |    12 +-
 src/os/WBThrottle.cc                               |    50 +-
 src/os/WBThrottle.h                                |     9 +
 src/os/XfsFileStoreBackend.cc                      |    22 +-
 src/os/chain_xattr.cc                              |    46 +-
 src/os/chain_xattr.h                               |    15 +
 src/os/fs/FS.cc                                    |   153 +
 src/os/fs/FS.h                                     |   132 +
 src/os/fs/XFS.cc                                   |    55 +
 src/os/fs/XFS.h                                    |    31 +
 src/os/newstore/NewStore.cc                        |  4442 ++++
 src/os/newstore/NewStore.h                         |   850 +
 src/os/newstore/newstore_types.cc                  |   304 +
 src/os/newstore/newstore_types.h                   |   192 +
 src/osd/Ager.cc                                    |   270 -
 src/osd/Ager.h                                     |    43 -
 src/osd/ClassHandler.cc                            |    27 +-
 src/osd/ClassHandler.h                             |    28 +
 src/osd/ECBackend.cc                               |   526 +-
 src/osd/ECBackend.h                                |    56 +-
 src/osd/ECMsgTypes.cc                              |    18 +-
 src/osd/ECMsgTypes.h                               |    41 +-
 src/osd/ECTransaction.cc                           |    35 +-
 src/osd/ECTransaction.h                            |     8 +-
 src/osd/ECUtil.cc                                  |    27 +-
 src/osd/HitSet.cc                                  |     3 -
 src/osd/Makefile.am                                |     5 -
 src/osd/OSD.cc                                     |  1580 +-
 src/osd/OSD.h                                      |   656 +-
 src/osd/OSDCap.h                                   |     1 -
 src/osd/OSDMap.cc                                  |    53 +-
 src/osd/OSDMap.h                                   |    12 +-
 src/osd/OpRequest.cc                               |    19 +-
 src/osd/OpRequest.h                                |     5 +
 src/osd/PG.cc                                      |   790 +-
 src/osd/PG.h                                       |   179 +-
 src/osd/PGBackend.cc                               |    79 +-
 src/osd/PGBackend.h                                |    68 +-
 src/osd/PGLog.cc                                   |    59 +-
 src/osd/PGLog.h                                    |    30 +-
 src/osd/ReplicatedBackend.cc                       |   359 +-
 src/osd/ReplicatedBackend.h                        |    32 +-
 src/osd/ReplicatedPG.cc                            |  2150 +-
 src/osd/ReplicatedPG.h                             |   273 +-
 src/osd/SnapMapper.h                               |     8 +-
 src/osd/TierAgentState.h                           |     6 +-
 src/osd/osd_types.cc                               |   582 +-
 src/osd/osd_types.h                                |   319 +-
 src/osdc/Filer.cc                                  |     2 +-
 src/osdc/Filer.h                                   |    20 +-
 src/osdc/Journaler.cc                              |   177 +-
 src/osdc/Journaler.h                               |    31 +-
 src/osdc/ObjectCacher.cc                           |    41 +-
 src/osdc/ObjectCacher.h                            |    10 +
 src/osdc/Objecter.cc                               |   522 +-
 src/osdc/Objecter.h                                |   135 +-
 src/osdc/Striper.cc                                |     8 +-
 src/osdc/WritebackHandler.h                        |     4 -
 src/pybind/ceph_argparse.py                        |   116 +-
 src/pybind/ceph_daemon.py                          |   278 +
 src/pybind/ceph_rest_api.py                        |    86 +-
 src/pybind/cephfs.py                               |   316 +-
 src/pybind/rados.py                                |   658 +-
 src/pybind/rbd.py                                  |   158 +-
 src/rbd.cc                                         |  1321 +-
 src/rbd_fuse/rbd-fuse.cc                           |    29 +-
 src/rbd_replay/ActionTypes.cc                      |   354 +
 src/rbd_replay/ActionTypes.h                       |   277 +
 src/rbd_replay/BufferReader.cc                     |    34 +
 src/rbd_replay/BufferReader.h                      |    33 +
 src/rbd_replay/Deser.cc                            |    67 -
 src/rbd_replay/Deser.hpp                           |    52 -
 src/rbd_replay/Makefile.am                         |    49 +-
 src/rbd_replay/Replayer.cc                         |    83 +-
 src/rbd_replay/Replayer.hpp                        |     3 +-
 src/rbd_replay/Ser.cc                              |    53 -
 src/rbd_replay/Ser.hpp                             |    50 -
 src/rbd_replay/actions.cc                          |   321 +-
 src/rbd_replay/actions.hpp                         |   295 +-
 src/rbd_replay/ios.cc                              |   231 +-
 src/rbd_replay/ios.hpp                             |   143 +-
 src/rbd_replay/rbd-replay-prep.cc                  |   336 +-
 src/rbd_replay/rbd-replay.cc                       |     2 +-
 src/rgw/Makefile.am                                |    14 +-
 src/rgw/logrotate.conf                             |    26 -
 src/rgw/rgw_acl.h                                  |     4 +-
 src/rgw/rgw_acl_s3.cc                              |    28 +
 src/rgw/rgw_acl_s3.h                               |    30 +-
 src/rgw/rgw_acl_swift.h                            |     1 -
 src/rgw/rgw_admin.cc                               |   297 +-
 src/rgw/rgw_bucket.cc                              |    39 +-
 src/rgw/rgw_bucket.h                               |     9 +-
 src/rgw/rgw_cache.cc                               |     2 +-
 src/rgw/rgw_common.cc                              |    34 +-
 src/rgw/rgw_common.h                               |    54 +-
 src/rgw/rgw_cors.h                                 |     1 -
 src/rgw/rgw_cors_s3.h                              |     2 +-
 src/rgw/rgw_cors_swift.h                           |     1 -
 src/rgw/rgw_dencoder.cc                            |    28 +
 src/rgw/rgw_http_errors.h                          |     4 +
 src/rgw/rgw_json_enc.cc                            |     1 +
 src/rgw/rgw_main.cc                                |     8 +-
 src/rgw/rgw_object_expirer.cc                      |   104 +
 src/rgw/rgw_object_expirer_core.cc                 |   263 +
 src/rgw/rgw_object_expirer_core.h                  |    88 +
 src/rgw/rgw_op.cc                                  |   511 +-
 src/rgw/rgw_op.h                                   |    99 +-
 src/rgw/rgw_orphan.cc                              |    25 +-
 src/rgw/rgw_quota.cc                               |     2 +-
 src/rgw/rgw_rados.cc                               |   304 +-
 src/rgw/rgw_rados.h                                |    92 +-
 src/rgw/rgw_rest.cc                                |    52 +-
 src/rgw/rgw_rest.h                                 |    20 +-
 src/rgw/rgw_rest_log.cc                            |     6 +-
 src/rgw/rgw_rest_log.h                             |    12 +-
 src/rgw/rgw_rest_s3.cc                             |    45 +-
 src/rgw/rgw_rest_s3.h                              |     8 +-
 src/rgw/rgw_rest_swift.cc                          |   330 +-
 src/rgw/rgw_rest_swift.h                           |    29 +-
 src/rgw/rgw_rest_user.cc                           |    15 +-
 src/rgw/rgw_swift.cc                               |     2 +-
 src/rgw/rgw_user.cc                                |   167 +-
 src/rgw/rgw_user.h                                 |    47 +-
 src/rgw/rgw_xml.h                                  |     2 +-
 src/rocksdb/.arcconfig                             |    10 -
 src/rocksdb/.clang-format                          |     5 -
 src/rocksdb/.gitignore                             |    22 +-
 src/rocksdb/AUTHORS                                |    11 +
 src/rocksdb/CONTRIBUTING.md                        |     5 +-
 src/rocksdb/HISTORY.md                             |   179 +
 src/rocksdb/INSTALL.md                             |    54 +-
 src/rocksdb/Makefile.am                            |   522 +-
 src/rocksdb/PATENTS                                |    50 +-
 src/rocksdb/README                                 |    82 -
 src/rocksdb/README.md                              |    25 +
 src/rocksdb/USERS.md                               |    36 +
 src/rocksdb/build_tools/build_detect_platform      |   313 -
 src/rocksdb/build_tools/build_detect_version       |    22 -
 src/rocksdb/build_tools/fbcode.clang31.sh          |    74 -
 src/rocksdb/build_tools/fbcode.gcc471.sh           |    70 -
 src/rocksdb/build_tools/fbcode.gcc481.sh           |    81 -
 src/rocksdb/build_tools/format-diff.sh             |   107 -
 src/rocksdb/build_tools/mac-install-gflags.sh      |    25 -
 src/rocksdb/build_tools/make_new_version.sh        |    46 -
 src/rocksdb/build_tools/regression_build_test.sh   |   330 -
 src/rocksdb/build_tools/valgrind_test.sh           |    15 -
 src/rocksdb/configure.ac                           |     9 +-
 src/rocksdb/coverage/coverage_test.sh              |    78 -
 src/rocksdb/coverage/parse_gcov_output.py          |   118 -
 src/rocksdb/db/builder.cc                          |   123 +-
 src/rocksdb/db/builder.h                           |    40 +-
 src/rocksdb/db/c.cc                                |   952 +-
 src/rocksdb/db/c_test.c                            |   474 +-
 src/rocksdb/db/column_family.cc                    |   595 +-
 src/rocksdb/db/column_family.h                     |   286 +-
 src/rocksdb/db/column_family_test.cc               |   295 +-
 src/rocksdb/db/compact_files_test.cc               |   105 +
 src/rocksdb/db/compaction.cc                       |   326 +-
 src/rocksdb/db/compaction.h                        |   216 +-
 src/rocksdb/db/compaction_job.cc                   |  1226 ++
 src/rocksdb/db/compaction_job.h                    |   138 +
 src/rocksdb/db/compaction_job_test.cc              |   189 +
 src/rocksdb/db/compaction_picker.cc                |  1692 +-
 src/rocksdb/db/compaction_picker.h                 |   309 +-
 src/rocksdb/db/compaction_picker_test.cc           |   426 +
 src/rocksdb/db/comparator_db_test.cc               |   440 +
 src/rocksdb/db/corruption_test.cc                  |    66 +-
 src/rocksdb/db/cuckoo_table_db_test.cc             |   321 +
 src/rocksdb/db/db_bench.cc                         |  1418 +-
 src/rocksdb/db/db_filesnapshot.cc                  |   107 +-
 src/rocksdb/db/db_impl.cc                          |  5136 +++--
 src/rocksdb/db/db_impl.h                           |   589 +-
 src/rocksdb/db/db_impl_debug.cc                    |    60 +-
 src/rocksdb/db/db_impl_experimental.cc             |   150 +
 src/rocksdb/db/db_impl_readonly.cc                 |   134 +-
 src/rocksdb/db/db_impl_readonly.h                  |    66 +-
 src/rocksdb/db/db_iter.cc                          |   623 +-
 src/rocksdb/db/db_iter.h                           |    54 +-
 src/rocksdb/db/db_iter_test.cc                     |  1409 ++
 src/rocksdb/db/db_stats_logger.cc                  |    95 -
 src/rocksdb/db/db_test.cc                          |  8945 ++++++--
 src/rocksdb/db/dbformat.cc                         |    29 +-
 src/rocksdb/db/dbformat.h                          |   152 +-
 src/rocksdb/db/dbformat_test.cc                    |    50 +-
 src/rocksdb/db/deletefile_test.cc                  |    90 +-
 src/rocksdb/db/event_logger_helpers.cc             |    46 +
 src/rocksdb/db/event_logger_helpers.h              |    18 +
 src/rocksdb/db/experimental.cc                     |    51 +
 src/rocksdb/db/fault_injection_test.cc             |   804 +
 src/rocksdb/db/file_indexer.cc                     |   149 +-
 src/rocksdb/db/file_indexer.h                      |    55 +-
 src/rocksdb/db/file_indexer_test.cc                |   112 +-
 src/rocksdb/db/filename.cc                         |   146 +-
 src/rocksdb/db/filename.h                          |    49 +-
 src/rocksdb/db/filename_test.cc                    |    86 +-
 src/rocksdb/db/flush_job.cc                        |   305 +
 src/rocksdb/db/flush_job.h                         |    93 +
 src/rocksdb/db/flush_job_test.cc                   |   130 +
 src/rocksdb/db/flush_scheduler.cc                  |    63 +
 src/rocksdb/db/flush_scheduler.h                   |    40 +
 src/rocksdb/db/forward_iterator.cc                 |   541 +
 src/rocksdb/db/forward_iterator.h                  |   110 +
 src/rocksdb/db/internal_stats.cc                   |   836 +-
 src/rocksdb/db/internal_stats.h                    |   344 +-
 src/rocksdb/db/job_context.h                       |   115 +
 src/rocksdb/db/listener_test.cc                    |   407 +
 src/rocksdb/db/log_and_apply_bench.cc              |    79 -
 src/rocksdb/db/log_reader.cc                       |    32 +-
 src/rocksdb/db/log_reader.h                        |     6 +
 src/rocksdb/db/log_test.cc                         |   128 +-
 src/rocksdb/db/log_writer.cc                       |     2 +-
 src/rocksdb/db/log_writer.h                        |    34 +
 src/rocksdb/db/managed_iterator.cc                 |   256 +
 src/rocksdb/db/managed_iterator.h                  |    84 +
 src/rocksdb/db/memtable.cc                         |   236 +-
 src/rocksdb/db/memtable.h                          |   133 +-
 src/rocksdb/db/memtable_allocator.cc               |    52 +
 src/rocksdb/db/memtable_allocator.h                |    47 +
 src/rocksdb/db/memtable_list.cc                    |    99 +-
 src/rocksdb/db/memtable_list.h                     |    53 +-
 src/rocksdb/db/memtable_list_test.cc               |   414 +
 src/rocksdb/db/memtablerep_bench.cc                |   694 +
 src/rocksdb/db/merge_context.h                     |     1 -
 src/rocksdb/db/merge_helper.cc                     |   115 +-
 src/rocksdb/db/merge_helper.h                      |    25 +-
 src/rocksdb/db/merge_test.cc                       |   121 +-
 src/rocksdb/db/perf_context_test.cc                |   384 +-
 src/rocksdb/db/plain_table_db_test.cc              |   477 +-
 src/rocksdb/db/prefix_test.cc                      |    64 +-
 src/rocksdb/db/repair.cc                           |   262 +-
 src/rocksdb/db/simple_table_db_test.cc             |   794 -
 src/rocksdb/db/skiplist.h                          |    46 +-
 src/rocksdb/db/skiplist_test.cc                    |    59 +-
 src/rocksdb/db/slice.cc                            |    24 +
 src/rocksdb/db/snapshot.h                          |    35 +-
 src/rocksdb/db/table_cache.cc                      |   111 +-
 src/rocksdb/db/table_cache.h                       |    39 +-
 src/rocksdb/db/table_properties_collector.cc       |    30 +-
 src/rocksdb/db/table_properties_collector.h        |    86 +-
 src/rocksdb/db/table_properties_collector_test.cc  |   449 +-
 src/rocksdb/db/tailing_iter.cc                     |   221 -
 src/rocksdb/db/tailing_iter.h                      |    97 -
 src/rocksdb/db/transaction_log_impl.cc             |    48 +-
 src/rocksdb/db/transaction_log_impl.h              |    51 +-
 src/rocksdb/db/version_builder.cc                  |   330 +
 src/rocksdb/db/version_builder.h                   |    42 +
 src/rocksdb/db/version_builder_test.cc             |   304 +
 src/rocksdb/db/version_edit.cc                     |   113 +-
 src/rocksdb/db/version_edit.h                      |   145 +-
 src/rocksdb/db/version_edit_test.cc                |    25 +-
 src/rocksdb/db/version_set.cc                      |  2674 +--
 src/rocksdb/db/version_set.h                       |   569 +-
 src/rocksdb/db/version_set_test.cc                 |   279 +-
 src/rocksdb/db/wal_manager.cc                      |   470 +
 src/rocksdb/db/wal_manager.h                       |    95 +
 src/rocksdb/db/wal_manager_test.cc                 |   289 +
 src/rocksdb/db/write_batch.cc                      |   269 +-
 src/rocksdb/db/write_batch_base.cc                 |    46 +
 src/rocksdb/db/write_batch_internal.h              |    35 +-
 src/rocksdb/db/write_batch_test.cc                 |   180 +-
 src/rocksdb/db/write_controller.cc                 |    37 +
 src/rocksdb/db/write_controller.h                  |    78 +
 src/rocksdb/db/write_controller_test.cc            |    43 +
 src/rocksdb/db/write_thread.cc                     |   147 +
 src/rocksdb/db/write_thread.h                      |    81 +
 src/rocksdb/db/writebuffer.h                       |    44 +
 src/rocksdb/doc/index.html                         |     4 -
 src/rocksdb/examples/.gitignore                    |     4 +
 src/rocksdb/examples/Makefile                      |    23 +
 src/rocksdb/examples/README.md                     |     1 +
 src/rocksdb/examples/c_simple_example.c            |    74 +
 src/rocksdb/examples/column_families_example.cc    |    72 +
 src/rocksdb/examples/compact_files_example.cc      |   175 +
 src/rocksdb/examples/simple_example.cc             |    55 +
 src/rocksdb/hdfs/README                            |    13 +-
 src/rocksdb/hdfs/env_hdfs.h                        |   129 +-
 src/rocksdb/hdfs/hdfs.h                            |   477 -
 src/rocksdb/hdfs/libhdfs.a                         |   Bin 65218 -> 0 bytes
 src/rocksdb/hdfs/setup.sh                          |     7 +
 src/rocksdb/helpers/memenv/memenv.cc               |   395 -
 src/rocksdb/helpers/memenv/memenv_test.cc          |   231 -
 src/rocksdb/include/rocksdb/c.h                    |   348 +-
 src/rocksdb/include/rocksdb/cache.h                |    22 +-
 src/rocksdb/include/rocksdb/compaction_filter.h    |     9 +-
 src/rocksdb/include/rocksdb/comparator.h           |     4 +
 src/rocksdb/include/rocksdb/db.h                   |   191 +-
 src/rocksdb/include/rocksdb/env.h                  |   213 +-
 src/rocksdb/include/rocksdb/experimental.h         |    29 +
 src/rocksdb/include/rocksdb/filter_policy.h        |    64 +-
 src/rocksdb/include/rocksdb/flush_block_policy.h   |     6 +-
 src/rocksdb/include/rocksdb/immutable_options.h    |   105 +
 src/rocksdb/include/rocksdb/iostats_context.h      |    36 +
 src/rocksdb/include/rocksdb/ldb_tool.h             |    21 +-
 src/rocksdb/include/rocksdb/listener.h             |   107 +
 src/rocksdb/include/rocksdb/memtablerep.h          |    80 +-
 src/rocksdb/include/rocksdb/metadata.h             |    90 +
 src/rocksdb/include/rocksdb/options.h              |   608 +-
 src/rocksdb/include/rocksdb/perf_context.h         |    12 +
 src/rocksdb/include/rocksdb/rate_limiter.h         |    64 +
 src/rocksdb/include/rocksdb/slice.h                |     8 +-
 src/rocksdb/include/rocksdb/slice_transform.h      |    29 +
 src/rocksdb/include/rocksdb/sst_dump_tool.h        |    17 +
 src/rocksdb/include/rocksdb/statistics.h           |    69 +-
 src/rocksdb/include/rocksdb/status.h               |    37 +-
 src/rocksdb/include/rocksdb/table.h                |   309 +-
 src/rocksdb/include/rocksdb/table_properties.h     |    54 +-
 src/rocksdb/include/rocksdb/thread_status.h        |   191 +
 src/rocksdb/include/rocksdb/universal_compaction.h |    17 +-
 .../include/rocksdb/utilities/backupable_db.h      |   316 +
 src/rocksdb/include/rocksdb/utilities/checkpoint.h |    34 +
 .../include/rocksdb/utilities/convenience.h        |    63 +
 src/rocksdb/include/rocksdb/utilities/db_ttl.h     |    68 +
 .../include/rocksdb/utilities/document_db.h        |   149 +
 src/rocksdb/include/rocksdb/utilities/flashcache.h |    25 +
 src/rocksdb/include/rocksdb/utilities/geo_db.h     |   105 +
 .../include/rocksdb/utilities/json_document.h      |   195 +
 .../include/rocksdb/utilities/leveldb_options.h    |   144 +
 src/rocksdb/include/rocksdb/utilities/spatial_db.h |   238 +
 .../include/rocksdb/utilities/stackable_db.h       |   260 +
 src/rocksdb/include/rocksdb/utilities/utility_db.h |    30 +
 .../rocksdb/utilities/write_batch_with_index.h     |   162 +
 src/rocksdb/include/rocksdb/version.h              |    18 +-
 src/rocksdb/include/rocksdb/write_batch.h          |    48 +-
 src/rocksdb/include/rocksdb/write_batch_base.h     |    72 +
 src/rocksdb/include/utilities/backupable_db.h      |   243 +-
 src/rocksdb/include/utilities/db_ttl.h             |    64 +-
 src/rocksdb/include/utilities/document_db.h        |     8 +
 src/rocksdb/include/utilities/geo_db.h             |   101 +-
 src/rocksdb/include/utilities/json_document.h      |     7 +
 src/rocksdb/include/utilities/stackable_db.h       |   212 +-
 src/rocksdb/include/utilities/utility_db.h         |    27 +-
 src/rocksdb/java/Makefile                          |    31 -
 src/rocksdb/java/RocksDBSample.java                |   253 -
 src/rocksdb/java/jdb_bench.sh                      |     1 -
 src/rocksdb/java/org/rocksdb/BackupableDB.java     |    80 -
 .../java/org/rocksdb/BackupableDBOptions.java      |    44 -
 src/rocksdb/java/org/rocksdb/BloomFilter.java      |    37 -
 src/rocksdb/java/org/rocksdb/Filter.java           |    32 -
 .../org/rocksdb/HashLinkedListMemTableConfig.java  |    52 -
 .../org/rocksdb/HashSkipListMemTableConfig.java    |    97 -
 src/rocksdb/java/org/rocksdb/HistogramData.java    |    43 -
 src/rocksdb/java/org/rocksdb/HistogramType.java    |    39 -
 src/rocksdb/java/org/rocksdb/Iterator.java         |   138 -
 src/rocksdb/java/org/rocksdb/MemTableConfig.java   |    27 -
 src/rocksdb/java/org/rocksdb/Options.java          |  2355 ---
 src/rocksdb/java/org/rocksdb/PlainTableConfig.java |   123 -
 src/rocksdb/java/org/rocksdb/ReadOptions.java      |   130 -
 src/rocksdb/java/org/rocksdb/RocksDB.java          |   376 -
 src/rocksdb/java/org/rocksdb/RocksDBException.java |    23 -
 src/rocksdb/java/org/rocksdb/RocksObject.java      |    35 -
 .../java/org/rocksdb/SkipListMemTableConfig.java   |    15 -
 src/rocksdb/java/org/rocksdb/Statistics.java       |    38 -
 .../java/org/rocksdb/TableFormatConfig.java        |    20 -
 src/rocksdb/java/org/rocksdb/TickerType.java       |   123 -
 .../java/org/rocksdb/VectorMemTableConfig.java     |    40 -
 src/rocksdb/java/org/rocksdb/WriteBatch.java       |   113 -
 src/rocksdb/java/org/rocksdb/WriteBatchTest.java   |   124 -
 src/rocksdb/java/org/rocksdb/WriteOptions.java     |   100 -
 .../java/org/rocksdb/benchmark/DbBenchmark.java    |  1577 --
 .../java/org/rocksdb/test/BackupableDBTest.java    |    41 -
 src/rocksdb/java/org/rocksdb/test/OptionsTest.java |   424 -
 .../java/org/rocksdb/test/ReadOptionsTest.java     |    40 -
 src/rocksdb/java/org/rocksdb/util/Environment.java |    37 -
 src/rocksdb/java/org/rocksdb/util/SizeUnit.java    |    16 -
 src/rocksdb/java/rocksjni/backupablejni.cc         |    85 -
 src/rocksdb/java/rocksjni/filter.cc                |    41 -
 src/rocksdb/java/rocksjni/iterator.cc              |   145 -
 src/rocksdb/java/rocksjni/memtablejni.cc           |    58 -
 src/rocksdb/java/rocksjni/options.cc               |  1807 --
 src/rocksdb/java/rocksjni/portal.h                 |   383 -
 src/rocksdb/java/rocksjni/rocksjni.cc              |   438 -
 src/rocksdb/java/rocksjni/statistics.cc            |    50 -
 src/rocksdb/java/rocksjni/table.cc                 |    25 -
 src/rocksdb/java/rocksjni/write_batch.cc           |   264 -
 src/rocksdb/linters/__phutil_library_init__.php    |     3 -
 src/rocksdb/linters/__phutil_library_map__.php     |    27 -
 .../linters/cpp_linter/ArcanistCpplintLinter.php   |    88 -
 src/rocksdb/linters/cpp_linter/FbcodeCppLinter.php |    99 -
 src/rocksdb/linters/cpp_linter/PfffCppLinter.php   |    68 -
 src/rocksdb/linters/cpp_linter/cpplint.py          |  4767 -----
 .../lint_engine/FacebookFbcodeLintEngine.php       |   147 -
 src/rocksdb/port/atomic_pointer.h                  |   157 -
 src/rocksdb/port/port.h                            |     8 +-
 src/rocksdb/port/port_example.h                    |    29 -
 src/rocksdb/port/port_posix.cc                     |    44 +-
 src/rocksdb/port/port_posix.h                      |   368 +-
 src/rocksdb/port/stack_trace.cc                    |    21 +-
 src/rocksdb/table/adaptive_table_factory.cc        |   115 +
 src/rocksdb/table/adaptive_table_factory.h         |    66 +
 src/rocksdb/table/block.cc                         |   439 +-
 src/rocksdb/table/block.h                          |   161 +-
 src/rocksdb/table/block_based_filter_block.cc      |   255 +
 src/rocksdb/table/block_based_filter_block.h       |   105 +
 src/rocksdb/table/block_based_filter_block_test.cc |   248 +
 src/rocksdb/table/block_based_table_builder.cc     |   522 +-
 src/rocksdb/table/block_based_table_builder.h      |    24 +-
 src/rocksdb/table/block_based_table_factory.cc     |   128 +-
 src/rocksdb/table/block_based_table_factory.h      |    42 +-
 src/rocksdb/table/block_based_table_reader.cc      |  1161 +-
 src/rocksdb/table/block_based_table_reader.h       |    68 +-
 src/rocksdb/table/block_builder.cc                 |    19 +-
 src/rocksdb/table/block_builder.h                  |    14 +-
 src/rocksdb/table/block_hash_index.cc              |    73 +-
 src/rocksdb/table/block_hash_index.h               |    29 +-
 src/rocksdb/table/block_hash_index_test.cc         |    33 +-
 src/rocksdb/table/block_prefix_index.cc            |   236 +
 src/rocksdb/table/block_prefix_index.h             |    67 +
 src/rocksdb/table/block_test.cc                    |    40 +-
 src/rocksdb/table/bloom_block.cc                   |    23 +
 src/rocksdb/table/bloom_block.h                    |    38 +
 src/rocksdb/table/cuckoo_table_builder.cc          |   511 +
 src/rocksdb/table/cuckoo_table_builder.h           |   123 +
 src/rocksdb/table/cuckoo_table_builder_test.cc     |   521 +
 src/rocksdb/table/cuckoo_table_factory.cc          |    69 +
 src/rocksdb/table/cuckoo_table_factory.h           |    79 +
 src/rocksdb/table/cuckoo_table_reader.cc           |   377 +
 src/rocksdb/table/cuckoo_table_reader.h            |    82 +
 src/rocksdb/table/cuckoo_table_reader_test.cc      |   546 +
 src/rocksdb/table/filter_block.cc                  |   187 -
 src/rocksdb/table/filter_block.h                   |    80 +-
 src/rocksdb/table/filter_block_test.cc             |   139 -
 src/rocksdb/table/flush_block_policy.cc            |     6 +-
 src/rocksdb/table/format.cc                        |   343 +-
 src/rocksdb/table/format.h                         |   107 +-
 src/rocksdb/table/full_filter_block.cc             |   100 +
 src/rocksdb/table/full_filter_block.h              |   111 +
 src/rocksdb/table/full_filter_block_test.cc        |   189 +
 src/rocksdb/table/get_context.cc                   |   119 +
 src/rocksdb/table/get_context.h                    |    49 +
 src/rocksdb/table/iterator.cc                      |    45 +-
 src/rocksdb/table/iterator_wrapper.h               |    27 +-
 src/rocksdb/table/merger.cc                        |   151 +-
 src/rocksdb/table/merger.h                         |    33 +-
 src/rocksdb/table/merger_test.cc                   |   201 +
 src/rocksdb/table/meta_blocks.cc                   |   133 +-
 src/rocksdb/table/meta_blocks.h                    |    37 +-
 src/rocksdb/table/mock_table.cc                    |   114 +
 src/rocksdb/table/mock_table.h                     |   181 +
 src/rocksdb/table/plain_table_builder.cc           |   202 +-
 src/rocksdb/table/plain_table_builder.h            |    80 +-
 src/rocksdb/table/plain_table_factory.cc           |    81 +-
 src/rocksdb/table/plain_table_factory.h            |   151 +-
 src/rocksdb/table/plain_table_index.cc             |   215 +
 src/rocksdb/table/plain_table_index.h              |   225 +
 src/rocksdb/table/plain_table_key_coding.cc        |   323 +
 src/rocksdb/table/plain_table_key_coding.h         |    97 +
 src/rocksdb/table/plain_table_reader.cc            |   654 +-
 src/rocksdb/table/plain_table_reader.h             |   241 +-
 src/rocksdb/table/table_builder.h                  |    34 +
 src/rocksdb/table/table_properties.cc              |     5 +-
 src/rocksdb/table/table_properties_internal.h      |    18 +
 src/rocksdb/table/table_reader.h                   |    51 +-
 src/rocksdb/table/table_reader_bench.cc            |   116 +-
 src/rocksdb/table/table_test.cc                    |   961 +-
 src/rocksdb/table/two_level_iterator.cc            |    40 +-
 src/rocksdb/table/two_level_iterator.h             |    11 +-
 src/rocksdb/third-party/fbson/COMMIT.md            |     2 +
 src/rocksdb/third-party/fbson/FbsonDocument.h      |   887 +
 src/rocksdb/third-party/fbson/FbsonJsonParser.h    |   746 +
 src/rocksdb/third-party/fbson/FbsonStream.h        |   183 +
 src/rocksdb/third-party/fbson/FbsonUtil.h          |   168 +
 src/rocksdb/third-party/fbson/FbsonWriter.h        |   435 +
 .../third-party/flashcache/flashcache_ioctl.h      |    55 +
 .../gtest-1.7.0/fused-src/gtest/gtest-all.cc       | 10257 +++++++++
 .../gtest-1.7.0/fused-src/gtest/gtest.h            | 20725 +++++++++++++++++++
 src/rocksdb/tools/auto_sanity_test.sh              |    71 -
 src/rocksdb/tools/blob_store_bench.cc              |   280 -
 src/rocksdb/tools/db_crashtest.py                  |   150 -
 src/rocksdb/tools/db_crashtest2.py                 |   168 -
 src/rocksdb/tools/db_repl_stress.cc                |   134 -
 src/rocksdb/tools/db_sanity_test.cc                |   203 -
 src/rocksdb/tools/db_stress.cc                     |  1732 --
 src/rocksdb/tools/ldb.cc                           |    13 -
 src/rocksdb/tools/ldb_test.py                      |   383 -
 src/rocksdb/tools/reduce_levels_test.cc            |   197 -
 src/rocksdb/tools/sst_dump.cc                      |   367 -
 src/rocksdb/util/allocator.h                       |    32 +
 src/rocksdb/util/arena.cc                          |    75 +-
 src/rocksdb/util/arena.h                           |    40 +-
 src/rocksdb/util/arena_test.cc                     |    85 +-
 src/rocksdb/util/auto_roll_logger.cc               |    51 +-
 src/rocksdb/util/auto_roll_logger.h                |    31 +-
 src/rocksdb/util/auto_roll_logger_test.cc          |   137 +-
 src/rocksdb/util/autovector.h                      |    34 +-
 src/rocksdb/util/autovector_test.cc                |    51 +-
 src/rocksdb/util/benchharness.cc                   |   398 -
 src/rocksdb/util/benchharness.h                    |   357 -
 src/rocksdb/util/benchharness_test.cc              |    67 -
 src/rocksdb/util/blob_store.cc                     |   270 -
 src/rocksdb/util/blob_store.h                      |   163 -
 src/rocksdb/util/blob_store_test.cc                |   200 -
 src/rocksdb/util/bloom.cc                          |   310 +-
 src/rocksdb/util/bloom_test.cc                     |   175 +-
 src/rocksdb/util/build_version.h                   |     5 +-
 src/rocksdb/util/cache.cc                          |   290 +-
 src/rocksdb/util/cache_bench.cc                    |   276 +
 src/rocksdb/util/cache_test.cc                     |   210 +-
 src/rocksdb/util/coding.cc                         |    88 -
 src/rocksdb/util/coding.h                          |    56 +-
 src/rocksdb/util/coding_test.cc                    |    96 +-
 src/rocksdb/util/comparator.cc                     |    34 +-
 src/rocksdb/util/compression.h                     |   553 +
 src/rocksdb/util/crc32c.cc                         |     6 +-
 src/rocksdb/util/crc32c_test.cc                    |     3 +-
 src/rocksdb/util/db_info_dumper.cc                 |   130 +
 src/rocksdb/util/db_info_dumper.h                  |    13 +
 src/rocksdb/util/dynamic_bloom.cc                  |    66 +-
 src/rocksdb/util/dynamic_bloom.h                   |    89 +-
 src/rocksdb/util/dynamic_bloom_test.cc             |   101 +-
 src/rocksdb/util/env.cc                            |    40 +-
 src/rocksdb/util/env_hdfs.cc                       |   232 +-
 src/rocksdb/util/env_posix.cc                      |   597 +-
 src/rocksdb/util/env_test.cc                       |   554 +-
 src/rocksdb/util/event_logger.cc                   |    46 +
 src/rocksdb/util/event_logger.h                    |   170 +
 src/rocksdb/util/event_logger_test.cc              |    43 +
 src/rocksdb/util/file_util.cc                      |    59 +
 src/rocksdb/util/file_util.h                       |    18 +
 src/rocksdb/util/filelock_test.cc                  |     7 +-
 src/rocksdb/util/hash.cc                           |    24 +-
 src/rocksdb/util/hash.h                            |     8 +
 src/rocksdb/util/hash_cuckoo_rep.cc                |    53 +-
 src/rocksdb/util/hash_cuckoo_rep.h                 |     4 +-
 src/rocksdb/util/hash_linklist_rep.cc              |   609 +-
 src/rocksdb/util/hash_linklist_rep.h               |    18 +-
 src/rocksdb/util/hash_skiplist_rep.cc              |   118 +-
 src/rocksdb/util/hash_skiplist_rep.h               |     4 +-
 src/rocksdb/util/histogram.cc                      |     6 +-
 src/rocksdb/util/histogram.h                       |     8 +-
 src/rocksdb/util/histogram_test.cc                 |    14 +-
 src/rocksdb/util/instrumented_mutex.cc             |    76 +
 src/rocksdb/util/instrumented_mutex.h              |    98 +
 src/rocksdb/util/iostats_context.cc                |    32 +
 src/rocksdb/util/iostats_context_imp.h             |    46 +
 src/rocksdb/util/ldb_cmd.cc                        |   549 +-
 src/rocksdb/util/ldb_cmd.h                         |   115 +-
 src/rocksdb/util/ldb_cmd_execute_result.h          |    15 +-
 src/rocksdb/util/ldb_tool.cc                       |    26 +-
 src/rocksdb/util/log_buffer.cc                     |    21 +-
 src/rocksdb/util/log_buffer.h                      |    11 +-
 src/rocksdb/util/log_write_bench.cc                |    19 +-
 src/rocksdb/util/logging.cc                        |    70 +-
 src/rocksdb/util/logging.h                         |    16 +-
 src/rocksdb/util/manual_compaction_test.cc         |    18 +-
 src/rocksdb/util/memenv.cc                         |   433 +
 src/rocksdb/util/memenv_test.cc                    |   241 +
 src/rocksdb/util/mock_env.cc                       |   710 +
 src/rocksdb/util/mock_env.h                        |   110 +
 src/rocksdb/util/mock_env_test.cc                  |   285 +
 src/rocksdb/util/murmurhash.h                      |     2 +-
 src/rocksdb/util/mutable_cf_options.cc             |   121 +
 src/rocksdb/util/mutable_cf_options.h              |   139 +
 src/rocksdb/util/mutexlock.h                       |     4 +-
 src/rocksdb/util/options.cc                        |   374 +-
 src/rocksdb/util/options_builder.cc                |   206 +
 src/rocksdb/util/options_helper.cc                 |   722 +
 src/rocksdb/util/options_helper.h                  |    20 +
 src/rocksdb/util/options_test.cc                   |   710 +
 src/rocksdb/util/perf_context.cc                   |    45 +-
 src/rocksdb/util/perf_context_imp.h                |    44 +-
 src/rocksdb/util/posix_logger.h                    |    23 +-
 src/rocksdb/util/rate_limiter.cc                   |   216 +
 src/rocksdb/util/rate_limiter.h                    |    91 +
 src/rocksdb/util/rate_limiter_test.cc              |    95 +
 src/rocksdb/util/scoped_arena_iterator.h           |    28 +
 src/rocksdb/util/signal_test.cc                    |    34 -
 src/rocksdb/util/skiplistrep.cc                    |   130 +-
 src/rocksdb/util/slice.cc                          |    68 +-
 src/rocksdb/util/slice_transform_test.cc           |   153 +
 src/rocksdb/util/sst_dump_test.cc                  |   182 +
 src/rocksdb/util/sst_dump_tool.cc                  |   423 +
 src/rocksdb/util/sst_dump_tool_imp.h               |    90 +
 src/rocksdb/util/statistics.cc                     |   137 +-
 src/rocksdb/util/statistics.h                      |    60 +-
 src/rocksdb/util/stats_logger.h                    |    26 -
 src/rocksdb/util/status.cc                         |    15 +-
 src/rocksdb/util/stop_watch.h                      |    42 +-
 src/rocksdb/util/string_util.cc                    |     2 +-
 src/rocksdb/util/string_util.h                     |    15 +-
 src/rocksdb/util/sync_point.cc                     |    29 +-
 src/rocksdb/util/sync_point.h                      |    14 +-
 src/rocksdb/util/testharness.cc                    |    60 +-
 src/rocksdb/util/testharness.h                     |   126 +-
 src/rocksdb/util/testutil.cc                       |    55 +
 src/rocksdb/util/testutil.h                        |    43 +-
 src/rocksdb/util/thread_list_test.cc               |   352 +
 src/rocksdb/util/thread_local.cc                   |    23 +-
 src/rocksdb/util/thread_local.h                    |    15 +-
 src/rocksdb/util/thread_local_test.cc              |    59 +-
 src/rocksdb/util/thread_operation.h                |   123 +
 src/rocksdb/util/thread_status_impl.cc             |   167 +
 src/rocksdb/util/thread_status_updater.cc          |   343 +
 src/rocksdb/util/thread_status_updater.h           |   225 +
 src/rocksdb/util/thread_status_updater_debug.cc    |    46 +
 src/rocksdb/util/thread_status_util.cc             |   213 +
 src/rocksdb/util/thread_status_util.h              |   131 +
 src/rocksdb/util/thread_status_util_debug.cc       |    32 +
 src/rocksdb/util/vectorrep.cc                      |    46 +-
 src/rocksdb/util/xfunc.cc                          |    69 +
 src/rocksdb/util/xfunc.h                           |   113 +
 src/rocksdb/util/xxhash.cc                         |     3 +
 src/rocksdb/util/xxhash.h                          |     4 +-
 src/rocksdb/utilities/backupable/backupable_db.cc  |   643 +-
 .../utilities/backupable/backupable_db_test.cc     |   187 +-
 src/rocksdb/utilities/checkpoint/checkpoint.cc     |   168 +
 .../utilities/compacted_db/compacted_db_impl.cc    |   163 +
 .../utilities/compacted_db/compacted_db_impl.h     |    96 +
 src/rocksdb/utilities/convenience/convenience.cc   |    23 +
 src/rocksdb/utilities/document/document_db.cc      |  1192 ++
 src/rocksdb/utilities/document/document_db_test.cc |   324 +
 src/rocksdb/utilities/document/json_document.cc    |   610 +
 .../utilities/document/json_document_builder.cc    |   115 +
 .../utilities/document/json_document_test.cc       |   329 +
 src/rocksdb/utilities/flashcache/flashcache.cc     |   136 +
 src/rocksdb/utilities/flashcache/flashcache.h      |    18 +
 src/rocksdb/utilities/geodb/geodb_impl.cc          |    43 +-
 src/rocksdb/utilities/geodb/geodb_impl.h           |    25 +-
 src/rocksdb/utilities/geodb/geodb_test.cc          |    11 +-
 .../utilities/leveldb_options/leveldb_options.cc   |    56 +
 .../string_append/stringappend_test.cc             |    39 +-
 src/rocksdb/utilities/merge_operators/uint64add.cc |     8 +-
 src/rocksdb/utilities/redis/redis_list_exception.h |     2 +-
 src/rocksdb/utilities/redis/redis_list_iterator.h  |    14 +-
 src/rocksdb/utilities/redis/redis_lists_test.cc    |    32 +-
 src/rocksdb/utilities/spatialdb/spatial_db.cc      |   893 +
 src/rocksdb/utilities/spatialdb/spatial_db_test.cc |   274 +
 src/rocksdb/utilities/spatialdb/utils.h            |    95 +
 src/rocksdb/utilities/ttl/db_ttl_impl.cc           |    34 +-
 src/rocksdb/utilities/ttl/db_ttl_impl.h            |    41 +-
 src/rocksdb/utilities/ttl/ttl_test.cc              |   116 +-
 .../write_batch_with_index.cc                      |   665 +
 .../write_batch_with_index_internal.cc             |   242 +
 .../write_batch_with_index_internal.h              |    96 +
 .../write_batch_with_index_test.cc                 |  1190 ++
 src/sample.ceph.conf                               |     2 -
 src/stop.sh                                        |     6 +-
 src/test/Makefile-client.am                        |    89 +-
 src/test/Makefile-server.am                        |    67 +-
 src/test/Makefile.am                               |   169 +-
 src/test/ObjectMap/KeyValueDBMemory.h              |     2 +-
 src/test/admin_socket.cc                           |    23 +-
 src/test/bench/dumb_backend.h                      |     1 +
 src/test/bench/small_io_bench_fs.cc                |    20 +-
 src/test/bench/testfilestore_backend.cc            |    21 +-
 src/test/bench/tp_bench.cc                         |     1 +
 src/test/bufferlist.cc                             |   207 +-
 src/test/centos-6/ceph.spec.in                     |   665 +-
 src/test/centos-6/install-deps.sh                  |   100 +-
 src/test/centos-7/Dockerfile.in                    |     2 +-
 src/test/centos-7/ceph.spec.in                     |   665 +-
 src/test/centos-7/install-deps.sh                  |   100 +-
 src/test/ceph-disk.sh                              |   333 +-
 src/test/ceph_argparse.cc                          |    43 +-
 src/test/ceph_objectstore_tool.py                  |   776 +-
 src/test/cephtool-test-mds.sh                      |     2 +
 src/test/cephtool-test-mon.sh                      |     4 +
 src/test/cephtool-test-osd.sh                      |     2 +
 src/test/cephtool-test-rados.sh                    |    19 +
 src/test/cli/crushtool/arg-order-checks.t          |   731 +
 src/test/cli/crushtool/build.t                     |    33 +-
 src/test/cli/crushtool/check-names.empty.t         |     3 +-
 src/test/cli/crushtool/check-names.max-id.t        |     2 +-
 src/test/cli/crushtool/help.t                      |    81 +-
 src/test/cli/crushtool/set-choose.t                |     6 +-
 src/test/cli/crushtool/test-map-bobtail-tunables.t |     2 +-
 src/test/cli/crushtool/test-map-firefly-tunables.t |     2 +-
 src/test/cli/crushtool/test-map-indep.t            |     2 +-
 src/test/cli/crushtool/test-map-vary-r-0.t         |     2 +-
 src/test/cli/crushtool/test-map-vary-r-1.t         |     2 +-
 src/test/cli/crushtool/test-map-vary-r-2.t         |     2 +-
 src/test/cli/crushtool/test-map-vary-r-3.t         |     2 +-
 src/test/cli/crushtool/test-map-vary-r-4.t         |     2 +-
 src/test/cli/osdmaptool/pool.t                     |     4 +-
 src/test/cli/osdmaptool/tree.t                     |    19 +
 src/test/cli/radosgw-admin/help.t                  |     4 +
 src/test/cli/rbd/help.t                            |   124 +-
 src/test/cli/rbd/invalid-snap-usage.t              |    97 +-
 src/test/cli/rbd/not-enough-args.t                 |   189 +-
 src/test/cls_hello/test_cls_hello.cc               |    52 +
 src/test/cls_numops/test_cls_numops.cc             |   414 +
 src/test/cls_rbd/test_cls_rbd.cc                   |   284 +-
 src/test/common/Throttle.cc                        |    18 +-
 src/test/common/get_command_descriptions.cc        |     6 +-
 src/test/common/test_async_compressor.cc           |   221 +
 src/test/common/test_context.cc                    |     7 +
 src/test/common/test_crc32c.cc                     |    10 +
 src/test/common/test_prioritized_queue.cc          |   248 +
 src/test/common/test_shared_cache.cc               |    88 +-
 src/test/common/test_sharedptr_registry.cc         |    60 +-
 src/test/common/test_str_map.cc                    |     8 +
 src/test/container-make-check-ubuntu-14.04.sh      |     3 -
 src/test/crypto.cc                                 |    44 +-
 src/test/debian-jessie/install-deps.sh             |   100 +-
 src/test/encoding.cc                               |    31 +-
 src/test/encoding/ceph_dencoder.cc                 |    84 +-
 src/test/encoding/check-generated.sh               |    43 +-
 src/test/encoding/readable.sh                      |    16 +-
 src/test/encoding/types.h                          |    26 +-
 src/test/erasure-code/ErasureCodePluginExample.cc  |     7 +-
 src/test/erasure-code/Makefile.am                  |   133 +-
 src/test/erasure-code/TestErasureCode.cc           |     7 +
 src/test/erasure-code/TestErasureCodeExample.cc    |     2 +
 src/test/erasure-code/TestErasureCodeIsa.cc        |   126 +-
 src/test/erasure-code/TestErasureCodeJerasure.cc   |    85 +-
 src/test/erasure-code/TestErasureCodeLrc.cc        |   361 +-
 src/test/erasure-code/TestErasureCodePlugin.cc     |    55 +-
 src/test/erasure-code/TestErasureCodePluginIsa.cc  |    22 +-
 .../erasure-code/TestErasureCodePluginJerasure.cc  |    71 +-
 src/test/erasure-code/TestErasureCodePluginLrc.cc  |    16 +-
 src/test/erasure-code/TestErasureCodePluginShec.cc |   268 +
 src/test/erasure-code/TestErasureCodeShec.cc       |  1909 +-
 src/test/erasure-code/TestErasureCodeShec_all.cc   |    42 +-
 .../erasure-code/TestErasureCodeShec_arguments.cc  |   412 +
 .../erasure-code/TestErasureCodeShec_thread.cc     |    27 +-
 src/test/erasure-code/TestShecPluginGeneric.cc     |    29 +
 src/test/erasure-code/TestShecPluginNEON.cc        |    29 +
 src/test/erasure-code/TestShecPluginSSE3.cc        |    29 +
 src/test/erasure-code/TestShecPluginSSE4.cc        |    29 +
 src/test/erasure-code/ceph_erasure_code.cc         |    20 +-
 .../erasure-code/ceph_erasure_code_benchmark.cc    |    68 +-
 .../erasure-code/ceph_erasure_code_benchmark.h     |     3 +-
 .../ceph_erasure_code_non_regression.cc            |    44 +-
 src/test/erasure-code/test-erasure-code.sh         |   111 +-
 src/test/erasure-code/test-erasure-eio.sh          |   354 +
 src/test/fedora-21/Dockerfile.in                   |    29 +
 src/test/fedora-21/ceph.spec.in                    |  1317 ++
 src/test/fedora-21/install-deps.sh                 |   147 +
 src/test/libcephfs/flock.cc                        |   638 +
 src/test/libcephfs/test.cc                         |    59 +
 src/test/librados/aio.cc                           |   345 +-
 src/test/librados/c_read_operations.cc             |    20 +
 src/test/librados/c_write_operations.cc            |     2 +-
 src/test/librados/io.cc                            |   116 +
 src/test/librados/misc.cc                          |    68 +-
 src/test/librados/nlist.cc                         |    43 +
 src/test/librados/test.cc                          |    26 +
 src/test/librados/test.h                           |     4 +
 src/test/librados/tier.cc                          |    81 +-
 src/test/librados/watch_notify.cc                  |    39 +-
 src/test/librados_test_stub/LibradosTestStub.cc    |   145 +-
 src/test/librados_test_stub/LibradosTestStub.h     |    23 +
 src/test/librados_test_stub/TestClassHandler.cc    |    14 +-
 src/test/librados_test_stub/TestClassHandler.h     |     7 +-
 src/test/librados_test_stub/TestIoCtxImpl.cc       |    42 +-
 src/test/librados_test_stub/TestIoCtxImpl.h        |    35 +-
 src/test/librados_test_stub/TestMemIoCtxImpl.cc    |    73 +-
 src/test/librados_test_stub/TestMemIoCtxImpl.h     |    23 +-
 src/test/librados_test_stub/TestMemRadosClient.cc  |    14 +-
 src/test/librados_test_stub/TestMemRadosClient.h   |     6 +-
 src/test/librados_test_stub/TestRadosClient.cc     |    20 +-
 src/test/librados_test_stub/TestRadosClient.h      |    29 +
 src/test/libradosstriper/aio.cc                    |   107 +-
 src/test/libradosstriper/rados-striper.sh          |    95 +
 src/test/libradosstriper/striping.cc               |     5 +-
 src/test/librbd/fsx.cc                             |   110 +-
 src/test/librbd/mock/MockContextWQ.h               |    17 +
 src/test/librbd/mock/MockImageCtx.h                |   112 +
 src/test/librbd/mock/MockImageWatcher.h            |    19 +
 src/test/librbd/mock/MockObjectMap.h               |    20 +
 src/test/librbd/test_ImageWatcher.cc               |   144 +-
 src/test/librbd/test_internal.cc                   |   282 +
 src/test/librbd/test_librbd.cc                     |   661 +-
 src/test/librbd/test_mock_fixture.cc               |    68 +
 src/test/librbd/test_mock_fixture.h                |    64 +
 src/test/messenger/message_helper.h                |     4 +-
 src/test/messenger/xio_client.cc                   |     2 +-
 src/test/messenger/xio_dispatcher.h                |     2 +-
 src/test/messenger/xio_server.cc                   |     2 +-
 src/test/mon/PGMap.cc                              |    35 +-
 src/test/mon/misc.sh                               |    76 +-
 src/test/mon/mkfs.sh                               |    10 +-
 src/test/mon/mon-handle-forward.sh                 |    41 +-
 src/test/mon/mon-ping.sh                           |    46 +
 src/test/mon/mon-scrub.sh                          |    49 +
 src/test/mon/mon-test-helpers.sh                   |   124 -
 src/test/mon/osd-crush.sh                          |    87 +-
 src/test/mon/osd-erasure-code-profile.sh           |    85 +-
 src/test/mon/osd-pool-create.sh                    |   131 +-
 src/test/mon/test_mon_workloadgen.cc               |     3 +-
 src/test/msgr/perf_msgr_client.cc                  |   205 +
 src/test/msgr/perf_msgr_server.cc                  |   171 +
 src/test/msgr/test_async_driver.cc                 |    92 +-
 src/test/msgr/test_msgr.cc                         |   469 +-
 src/test/objectstore/DeterministicOpSequence.cc    |    52 +-
 src/test/objectstore/FileStoreDiff.cc              |    37 +-
 src/test/objectstore/FileStoreDiff.h               |     1 -
 src/test/objectstore/FileStoreTracker.cc           |    72 +-
 src/test/objectstore/FileStoreTracker.h            |    52 +-
 .../objectstore/ObjectStoreTransactionBenchmark.cc |     7 +-
 src/test/objectstore/TestObjectStoreState.cc       |    17 +-
 src/test/objectstore/TestObjectStoreState.h        |     8 +-
 src/test/objectstore/TestRocksdbOptionParse.cc     |    85 +
 src/test/objectstore/chain_xattr.cc                |    36 +-
 src/test/objectstore/store_test.cc                 |  1228 +-
 src/test/objectstore/test_idempotent.cc            |    18 +-
 src/test/objectstore/test_idempotent_sequence.cc   |     4 +-
 src/test/objectstore/test_kv.cc                    |   181 +
 src/test/objectstore/workload_generator.cc         |    21 +-
 src/test/objectstore_bench.cc                      |   290 +
 src/test/opensuse-13.2/Dockerfile.in               |    30 +
 src/test/opensuse-13.2/ceph.spec.in                |  1317 ++
 src/test/opensuse-13.2/install-deps.sh             |   147 +
 src/test/os/TestFlatIndex.cc                       |   139 -
 src/test/os/TestLFNIndex.cc                        |    18 +-
 src/test/osd/Object.cc                             |    44 +-
 src/test/osd/Object.h                              |     2 +
 src/test/osd/RadosModel.h                          |   225 +-
 src/test/osd/TestOSDMap.cc                         |     2 +-
 src/test/osd/TestPGLog.cc                          |    14 +-
 src/test/osd/TestRados.cc                          |    14 +-
 src/test/osd/osd-bench.sh                          |    31 +-
 src/test/osd/osd-config.sh                         |    46 +-
 src/test/osd/osd-copy-from.sh                      |    19 +-
 src/test/osd/osd-scrub-repair.sh                   |   129 +-
 src/test/osd/osd-test-helpers.sh                   |    89 -
 src/test/osd/types.cc                              |    82 +
 src/test/osdc/object_cacher_stress.cc              |    14 +-
 src/test/perf_counters.cc                          |    18 +-
 src/test/perf_helper.cc                            |    51 +
 src/test/perf_helper.h                             |    30 +
 src/test/perf_local.cc                             |  1047 +
 src/test/pybind/test_ceph_argparse.py              |    44 +-
 src/test/pybind/test_ceph_daemon.py                |    44 +
 src/test/python/brag-client/setup.py               |    31 +
 .../python/brag-client/tests/test_ceph_brag.py     |    10 +
 src/test/python/brag-client/tox.ini                |    16 +
 src/test/python/ceph-disk/setup.py                 |    27 +
 src/test/python/ceph-disk/tests/test_ceph_disk.py  |   640 +
 src/test/python/ceph-disk/tox.ini                  |    19 +
 src/test/rgw/test_rgw_obj.cc                       |   159 +
 src/test/run-cli-tests                             |     2 +-
 src/test/run-rbd-unit-tests.sh                     |    15 +
 src/test/streamtest.cc                             |     7 +-
 src/test/system/systest_runnable.cc                |    66 +-
 src/test/system/systest_runnable.h                 |     6 +-
 src/test/test-ceph-helpers.sh                      |     2 +-
 src/test/test_arch.cc                              |    14 +
 src/test/test_cors.cc                              |    28 +-
 src/test/test_filejournal.cc                       |   802 +-
 src/test/test_get_blkdev_size.cc                   |     1 +
 src/test/test_objectstore_memstore.sh              |     5 +
 src/test/test_pageset.cc                           |   271 +
 src/test/test_rbd_replay.cc                        |    91 -
 src/test/test_rgw_admin_log.cc                     |    32 +-
 src/test/test_rgw_admin_meta.cc                    |    36 +-
 src/test/test_rgw_admin_opstate.cc                 |     2 +-
 src/test/test_snap_mapper.cc                       |    18 +-
 src/test/test_stress_watch.cc                      |    10 +-
 src/test/test_subprocess.cc                        |   268 +
 src/test/test_trans.cc                             |     7 +-
 src/test/test_xlist.cc                             |   118 +
 src/test/testcrypto.cc                             |     8 +-
 src/test/ubuntu-12.04/Dockerfile.in                |     2 +-
 src/test/ubuntu-12.04/install-deps.sh              |   100 +-
 src/test/ubuntu-14.04/install-deps.sh              |   100 +-
 src/test/xattr_bench.cc                            |    22 +-
 src/tools/Makefile-client.am                       |    10 +-
 src/tools/Makefile-server.am                       |    15 +-
 src/tools/Makefile.am                              |    22 +-
 src/tools/RadosDump.cc                             |   168 +
 src/tools/RadosDump.h                              |   396 +
 src/tools/ceph-client-debug.cc                     |     2 +-
 src/tools/ceph-monstore-update-crush.sh            |   183 +
 src/tools/ceph_authtool.cc                         |     2 +
 src/tools/ceph_conf.cc                             |     3 +-
 src/tools/ceph_kvstore_tool.cc                     |    90 +-
 src/tools/ceph_monstore_tool.cc                    |   233 +-
 src/tools/ceph_objectstore_tool.cc                 |  2263 +-
 src/tools/ceph_objectstore_tool.h                  |    42 +
 src/tools/ceph_osdomap_tool.cc                     |    22 +-
 src/tools/cephfs/DataScan.cc                       |  1376 ++
 src/tools/cephfs/DataScan.h                        |   272 +
 src/tools/cephfs/JournalScanner.cc                 |    14 +
 src/tools/cephfs/JournalTool.cc                    |    37 +-
 src/tools/cephfs/JournalTool.h                     |     2 +-
 src/tools/cephfs/MDSUtility.cc                     |     2 +-
 src/tools/cephfs/Resetter.cc                       |    99 +-
 src/tools/cephfs/Resetter.h                        |    13 +-
 src/tools/cephfs/TableTool.cc                      |   105 +-
 src/tools/cephfs/cephfs-data-scan.cc               |    45 +
 src/tools/crushtool.cc                             |   287 +-
 src/tools/osdmaptool.cc                            |    56 +-
 src/tools/psim.cc                                  |    14 +-
 src/tools/rados/PoolDump.cc                        |   169 +
 src/tools/rados/PoolDump.h                         |    32 +
 src/tools/rados/RadosImport.cc                     |   377 +
 src/tools/rados/RadosImport.h                      |    45 +
 src/tools/rados/rados.cc                           |   380 +-
 src/tools/rados/rados_export.cc                    |   229 -
 src/tools/rados/rados_import.cc                    |   239 -
 src/tools/rados/rados_sync.cc                      |   903 -
 src/tools/rados/rados_sync.h                       |   216 -
 src/tools/rest_bench.cc                            |   802 -
 src/tracing/Makefile.am                            |    95 +-
 src/tracing/Makefile.in                            |   818 -
 src/tracing/librados.c                             |     6 +
 src/tracing/librbd.c                               |     6 +
 src/tracing/librbd.tp                              |   118 +-
 src/tracing/objectstore.c                          |     6 +
 src/tracing/oprequest.c                            |     6 +
 src/tracing/osd.c                                  |     6 +
 src/tracing/pg.c                                   |     6 +
 src/upstart/ceph-disk.conf                         |    10 +
 src/upstart/ceph-mds.conf                          |     4 +-
 src/upstart/ceph-mon.conf                          |     4 +-
 src/upstart/ceph-osd.conf                          |     4 +-
 src/upstart/radosgw.conf                           |     4 +-
 src/verify-mds-journal.sh                          |     8 -
 src/vstart.sh                                      |    99 +-
 systemd/Makefile.am                                |    18 +
 systemd/Makefile.in                                |   588 +
 systemd/ceph                                       |    65 +
 systemd/ceph-create-keys at .service                  |    10 +
 systemd/ceph-disk at .service                         |     8 +
 systemd/ceph-mds at .service                          |    16 +
 systemd/ceph-mon at .service                          |    22 +
 systemd/ceph-osd at .service                          |    17 +
 systemd/ceph-radosgw-prestart.sh                   |   100 +
 systemd/ceph-radosgw at .service                      |    15 +
 systemd/ceph.target                                |     4 +
 systemd/ceph.tmpfiles.d                            |     1 +
 udev/60-ceph-partuuid-workaround.rules             |     2 +-
 udev/95-ceph-osd.rules                             |    74 +-
 1663 files changed, 217915 insertions(+), 98657 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 40a5316..d6cbb94 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,6 +1,4 @@
-9seconds <nineseconds at yandex.ru>
 Abhishek Dixit <dixitabhi at gmail.com>
-Abhishek L <abhishekl.2006 at gmail.com>
 Abhishek Lekshmanan <abhishek.lekshmanan at ril.com>
 Accela Zhao <accelazh at gmail.com>
 Adam C. Emerson <aemerson at linuxbox.com>
@@ -8,16 +6,22 @@ Adam Crume <adamcrume at gmail.com>
 Adam Manzanares <nmtadam at gmail.com>
 Adam Spiers <aspiers at suse.com>
 Adam Twardowski <adam.twardowski at gmail.com>
+Ahoussi Armand <ahoussi.say at telecom-bretagne.eu>
 Ailing Zhang <zhangal1992 at gmail.com>
 Alan Grosskurth <code at alan.grosskurth.ca>
 Alan Somers <asomers at gmail.com>
+Alexander Chuzhoy <schuzhoy at users.noreply.github.com>
 Alexandre Marangone <alexandre.marangone at inktank.com>
+Alexandre Marangone <amarango at redhat.com>
 Alexandre Oliva <oliva at gnu.org>
 Alex Elder <elder at inktank.com>
 Alexey Lapitsky <lex at realisticgroup.com>
+Alexis Normand <n.al3xis at gmail.com>
 Alfredo Deza <adeza at redhat.com>
 Alfredo Deza <alfredo.deza at inktank.com>
 Ali Maredia <ali at linuxbox.com>
+Ali Maredia <amaredia at redhat.com>
+Alistair Strachan <alistair.strachan at imgtec.com>
 Allen Samuels <allen.samuels at sandisk.com>
 Alphat-PC <AlphatPC at gmail.com>
 Anand Bhat <anand.bhat at sandisk.com>
@@ -26,47 +30,56 @@ Andreas Bluemle <andreas.bluemle at itxperts.de>
 Andreas Peters <andreas.joachim.peters at cern.ch>
 Andre Noll <maan at systemlinux.org>
 Andrew Bartlett <abartlet at catalyst.net.nz>
-Andrew Bartlett <abartlet at samba.org>
 Andrew Farmer <andrewf at hq.newdream.net>
 Andrew Leung <aleung at cs.ucsc.edu>
 Andrew Woodward <awoodward at mirantis.com>
 Andrey Kuznetsov <Andrey_Kuznetsov at epam.com>
 Andrey Stepachev <octo at yandex-team.ru>
-Andy Allan <github at gravitystorm.co.uk>
-Anols <ayari_anis at live.fr>
+Andy Allan <andy at gravitystorm.co.uk>
+Anis Ayari <ayari_anis at live.fr>
 Anton Aksola <anton.aksola at nebula.fi>
 Anton Blanchard <anton at samba.org>
 apovzner <apovzner at 29311d96-e01e-0410-9327-a35deaab8ce9>
 Ariela <Dell at ARIELA.(none)>
 Aristoteles Neto <aristoteles.neto at webdrive.co.nz>
 Armando Segnini <armaseg at gmail.com>
+Arthur Gorjux <arthurgorjux at gmail.com>
 Ashish Chandra <ashish.a.chandra at ril.com>
 atwardowski <adam.twardowski at gmail.com>
 Babu Shanmugam <anbu at enovance.com>
-Baptiste Veuillez <baptiste at UbuntuBVM.lan>
+Baptiste Veuillez <baptiste.veuillez--mainard at telecom-bretagne.eu>
 Bastian Blank <waldi at debian.org>
 Benjamin Kerensa <bkerensa at gmail.com>
 Benoît Knecht <benoit.knecht at fsfe.org>
-Billy Olsen <billy.olsen at gmail.com>
+Billy Olsen <billy.olsen at canonical.com>
 BJ Lougee <almightybeeij at gmail.com>
 Bjørnar Ness <bjornar.ness at gmail.com>
 Blaine Gardner <blaine.gardner at hp.com>
+blinke <Burkhard.Linke at computational.bio.uni-giessen.de>
+Bo Cai <cai.bo at h3c.com>
 Boris Ranto <branto at redhat.com>
+Bosse Klykken <larkly at gmail.com>
+Brad Hubbard <bhubbard at redhat.com>
 Brandon Seibel <brandon at seibelnet.ca>
-branto1 <branto at redhat.com>
+Brian Andrus <bandrus+github at gmail.com>
 Brian Chrisman <brchrisman at gmail.com>
 Brian Rak <dn at devicenull.org>
 Brown, David M JR <david.brown at pnl.gov>
+caibo <cai.bo at h3c.com>
 Caleb Miles <caleb.miles at inktank.com>
 Carlos Maltzahn <carlosm at cs.ucsc.edu>
 carsonoid <ca at carsonoid.net>
+Casey Bodley <casey at cohortfs.com>
 Casey Bodley <casey at linuxbox.com>
+Casey Bodley <cbodley at redhat.com>
 Casey Marshall <csm at soe.ucsc.edu>
 CC Lien <cc_lien at tcloudcomputing.com>
+Ce Gu <guce at h3c.com>
 Cesar Mello <cesar at d1.(none)>
 Chen Baozi <baozich at gmail.com>
 Chendi Xue <chendi.xue at intel.com>
 Cheng Cheng <ccheng.leo at gmail.com>
+chenji <insomnia at 139.com>
 Chris Dunlop <chris at onthe.net.au>
 Chris Glass <tribaal at gmail.com>
 Chris Holcombe <chris.holcombe at nebula.com>
@@ -77,10 +90,13 @@ Christophe Courtaut <christophe.courtaut at gmail.com>
 Christopher O'Connell <jwriteclub at gmail.com>
 Christoph Hellwig <hch at infradead.org>
 Christos Stavrakakis <stavr.chris at gmail.com>
+Claire Massot <claire.massot93 at gmail.com>
+Clement Lebrun <clement.lebrun.31 at gmail.com>
 Colin Mattson <colinmattson at gmail.com>
 Colin P. McCabe <colinm at hq.newdream.net>
 Dan Chai <tengweicai at gmail.com>
 Daniel Gollub <d.gollub at telekom.de>
+Daniel Gryniewicz <dang at fprintf.net>
 Daniel J. Hofmann <daniel at trvx.org>
 Dan Mick <dan.mick at inktank.com>
 Dan Mick <dmick at redhat.com>
@@ -91,7 +107,7 @@ David Disseldorp <ddiss at suse.de>
 David Moreau Simard <dmsimard at iweb.com>
 David Zafman <david.zafman at inktank.com>
 David Zafman <dzafman at redhat.com>
-delco225 <delco225>
+Dennis Schafroth <dennis at schafroth.dk>
 Derek Yarnell <derek at umiacs.umd.edu>
 Derrick Schneider <derrick.schneider at opower.com>
 Ding Dinghua <dingdinghua85 at gmail.com>
@@ -99,10 +115,11 @@ Dmitry Smirnov <onlyjob at member.fsf.org>
 Dmitry Yatsushkevich <dyatsushkevich at mirantis.com>
 Dmytro Iurchenko <diurchenko at mirantis.com>
 Dominik Hannen <cantares1+github at gmail.com>
+Donghai Xu <xu.donghai at h3c.com>
 Dongmao Zhang <deanraccoon at gmail.com>
 Dongsu Park <dpark1978 at gmail.com>
 Dong Yuan <yuandong1222 at gmail.com>
-dwj192 <duanweijun at h3c.com>
+Douglas Fuller <dfuller at redhat.com>
 Eleanor Cawthon <eleanor.cawthon at inktank.com>
 Emily Popper <emily.popper at dreamhost.com>
 Eric Mourgaya <eric.mourgaya at arkea.com>
@@ -110,6 +127,7 @@ Erik Logtenberg <erik at logtenberg.eu>
 Erwin, Brock A <Brock.Erwin at pnl.gov>
 Esteban Molina-Estolano <eestolan at lanl.gov>
 Evan Felix <evan.felix at pnnl.gov>
+Fabio Alessandro Locati <fabiolocati at gmail.com>
 fangdong <yp.fangdong at gmail.com>
 Federico Gimenez <fgimenez at coit.es>
 Federico Simoncelli <fsimonce at redhat.com>
@@ -118,29 +136,35 @@ Feng Wang <cyclonew at cs.ucsc.edu>
 Filippos Giannakos <philipgian at grnet.gr>
 Florent Bautista <florent at coppint.com>
 Florent Flament <florent.flament at cloudwatt.com>
+Florian Coste <fcoste21 at gmail.com>
 Florian Haas <florian at hastexo.com>
+Florian Marsylle <florian.marsylle at hotmail.fr>
 Francois Deppierraz <francois at ctrlaltdel.ch>
 François Lafont <flafdivers at free.fr>
 Frank Yu <flyxiaoyu at gmail.com>
 Fred Ar <ar.fred at yahoo.com>
+Gabriel Sentucq <perso at kazhord.fr>
+Gaël Fenet-Garde <gael.fenet.garde at gmail.com>
 Gary Lowell <gary.lowell at inktank.com>
+Gaurav Kumar Garg <garg.gaurav52 at gmail.com>
 George Ryall <george.ryall at stfc.ac.uk>
 Gerben Meijer <gerben at daybyday.nl>
+Gerhard Muntingh <gerhard at warpnet.nl>
+Germain Chipaux <germain.chipaux at gmail.com>
 git-harry <git-harry at live.co.uk>
 Greg Farnum <gfarnum at redhat.com>
 Greg Farnum <greg at inktank.com>
-Guang G Yang <yguang at renownedground.corp.gq1.yahoo.com>
+Gregory Meno <gmeno at redhat.com>
 Guangliang Zhao <guangliang at unitedstack.com>
 Guang Yang <yguang at yahoo-inc.com>
-guce <guce at h3c.com>
 Guilhem Lettron <guilhem at lettron.fr>
 Haifeng Liu <haifeng at yahoo-inc.com>
 Hannes Reinecke <hare at suse.de>
 Hannu Valtonen <hannu.valtonen at ormod.com>
-Haomai Wang <haomaiwang at gmail.com>
-Haomai Wang <yuyuyu101 at 163.com>
+Haomai Wang <haomai at xsky.com>
+Haomai Wang <haomai at xsky.io>
 Harpreet Dhillon <harpreet at ironsystems.com>
-Hazem <hazem at hazem-Inspiron-3537.(none)>
+Hazem Amara <hazem.amara at telecom-bretagne.eu>
 Henry C Chang <henry_c_chang at tcloudcomputing.com>
 Henry Chang <henry at bigtera.com>
 Herb Shiu <herb_shiu at tcloudcomputing.com>
@@ -149,28 +173,38 @@ Holger Macht <hmacht at suse.de>
 Huamin Chen <hchen at redhat.com>
 Huang Jun <hjwsm1989 at gmail.com>
 Ian Holsman <lists at holsman.net>
+Ian Kelling <ian at iankelling.org>
+Ilja Slepnev <islepnev at gmail.com>
 Ilya Dryomov <idryomov at redhat.com>
 Ilya Dryomov <ilya.dryomov at inktank.com>
-islepnev <islepnev at gmail.com>
+Ira Cooper <ira at samba.org>
+Ismael Serrano <ismael.serrano at gmail.com>
 James Page <james.page at ubuntu.com>
 James Ryan Cresawn <jrcresawn at gmail.com>
 Jan Harkes <jaharkes at cs.cmu.edu>
 Janne Grunau <j at jannau.net>
 Jason Dillaman <dillaman at redhat.com>
+Javier Guerra <javier at guerrag.com>
 Javier M. Mellid <jmunhoz at igalia.com>
+Jean-Rémi Deveaux <jeanremi.deveaux at gmail.com>
+Jeff Weber <jweber at cofront.net>
+Jenkins Build Slave User <jenkins-build at jenkins-slave-wheezy.localdomain>
 Jenkins <jenkins at ceph.com>
 Jenkins <jenkins at inktank.com>
 Jens-Christian Fischer <jens-christian.fischer at switch.ch>
-Jerry7X <875016668 at qq.com>
+jepst <jepst79 at gmail.com>
+Jevon Qiao <qiaojianfeng at unitedstack.com>
 Jiang Heng <jiangheng0511 at gmail.com>
 Jiantao He <hejiantao5 at gmail.com>
-Jian Wen <wenjianhn at gmail.com>
 Jian Wen <wenjian at letv.com>
+Jiaying Ren <mikulely at gmail.com>
 Jim Schutt <jaschut at sandia.gov>
 João Eduardo Luís <joao.luis at inktank.com>
 João Eduardo Luís <joao at redhat.com>
 Joao Eduardo Luis <joao at suse.de>
+Joaquim Rocha <joaquim.rocha at cern.ch>
 Joe Buck <jbbuck at gmail.com>
+Joe Handzik <joseph.t.handzik at hp.com>
 Johannes Erdfelt <johannes at erdfelt.com>
 John Spray <john.spray at inktank.com>
 John Spray <jspray at redhat.com>
@@ -181,6 +215,7 @@ Jojy George Varghese <jvarghese at scalecomputing.com>
 Jonathan Davies <jonathan.davies at canonical.com>
 Jonathan Dieter <jdieter at lesbg.com>
 Jon Bernard <jbernard at tuxion.com>
+Jordan Dorne <jordan.dorne at gmail.com>
 Jordi Llonch <llonchj at gmail.com>
 Josef Bacik <josef at redhat.com>
 Joseph McDonald <joseph.mcdonald at alcatel-lucent.com>
@@ -188,35 +223,44 @@ Josh Durgin <jdurgin at redhat.com>
 Josh Durgin <josh.durgin at inktank.com>
 Josh Pieper <jjp at pobox.com>
 JP François <francoisjp at gmail.com>
+Juan A. Suarez Romero <jasuarez at igalia.com>
 JuanJose 'JJ' Galvez <jgalvez at redhat.com>
 Kacper Kowalik <xarthisius at gentoo.org>
+Kadu Ribeiro <mail+github at carlosribeiro.me>
 Kai Zhang <zakir.exe at gmail.com>
 Karel Striegel <karel.striegel at ipc.be>
 Karl Eichwalder <ke at suse.de>
 Kefu Chai <kchai at redhat.com>
-Kefu Chai <tchaikov at gmail.com>
 Ken Dreyer <kdreyer at redhat.com>
 Ken Dreyer <ken.dreyer at inktank.com>
 Ketor Meng <d.ketor at gmail.com>
+Kévin Caradant <kevin.caradant at gmail.com>
 Kevin Cox <kevincox at kevincox.ca>
 Kevin Dalley <kevin at kelphead.org>
 Kevin Jones <k.j.jonez at gmail.com>
 Kim Vandry <vandry at TZoNE.ORG>
+Kiseleva Alyona <akiselyova at mirantis.com>
+Krzysztof Kosiński <krzysztof.kosinski at intel.com>
 Kuan Kai Chiu <big.chiu at bigtera.com>
 Kun Huang <academicgareth at gmail.com>
 Kyle Bader <kyle.bader at dreamhost.com>
 Kyle Marsh <kyle.marsh at dreamhost.com>
 Laszlo Boszormenyi <gcs at debian.hu>
 Laurent Barbe <laurent at ksperis.com>
+Lee Revell <rlrevell at gmail.com>
 Lei Dong <leidong at yahoo-inc.com>
 Liam Monahan <liam at umiacs.umd.edu>
+Li Peng <lip at dtdream.com>
+liumingxin <mingxinliu at ubuntukylin.com>
 Li Wang <liwang at ubuntukylin.com>
 Lluis Pamies-Juarez <lluis.pamies-juarez at hgst.com>
 Loic Dachary <ldachary at redhat.com>
 Loic Dachary <loic-201408 at dachary.org>
 Loic Dachary <loic at dachary.org>
+Lucas Fantinel <lucas.fantinel at gmail.com>
 Luis Pabón <lpabon at redhat.com>
 Lukasz Jagiello <lukasz at wikia-inc.com>
+Lu Shi <shi.lu at h3c.com>
 Ma Jianpeng <jianpeng.ma at intel.com>
 Marco Garcês <marco.garces at bci.co.mz>
 Marcus Sorensen <shadowsor at gmail.com>
@@ -227,10 +271,11 @@ Markus Elfring <elfring at users.sourceforge.net>
 marnberg <marnberg at 29311d96-e01e-0410-9327-a35deaab8ce9>
 Martin Ettl <ettl.martin at gmx.de>
 Matt Benjamin <matt at cohortfs.com>
-Matt Benjamin <matt at linuxbox.com>
+Matt Benjamin <mbenjamin at redhat.com>
 Matthew Roy <matthew at royhousehold.net>
 Matthew Wodrich <matthew.wodrich at dreamhost.com>
 Matt Richards <mattjrichards at gmail.com>
+Maxime Robert <maxime.robert1992 at gmail.com>
 Mehdi Abaakouk <sileht at sileht.net>
 Michael McThrow <mmcthrow at gmail.com>
 Michael Nelson <mikenel at tnld.net>
@@ -240,23 +285,28 @@ Michal Jarzabek <stiopa at gmail.com>
 Mike Kelly <pioto at pioto.org>
 Mike Lundy <mike at fluffypenguin.org>
 Mike Ryan <mike.ryan at inktank.com>
+Milan Broz <mbroz at redhat.com>
+minchen <minchen at ubuntukylin.com>
 Min Chen <minchen at ubuntukylin.com>
+Mingxin Liu <mingxinliu at ubuntukylin.com>
 MingXin Liu <mingxinliu at ubuntukylin.com>
+Mingyue Zhao <zhao.mingyue at h3c.com>
 Mohammad Salehe <salehe+dev at gmail.com>
 Moritz Möller <mm at mxs.de>
 Mouad Benchchaoui <m.benchchaoui at x-ion.de>
 Mykola Golub <mgolub at mirantis.com>
-Mykola Golub <mgolub at zhuzha.mirantis.lviv.net>
-nairolf21 <fcoste21 at gmail.com>
 Nathan Cutler <ncutler at suse.com>
-Nathan Cutler <ncutler at suse.cz>
+Na Xie <xie.na at h3c.com>
+Neha Ummareddy <nehaummareddy at gmail.com>
 Neil Horman <nhorman at tuxdriver.com>
 Neil Levine <neil.levine at inktank.com>
+Nicolas Yong <nicolas.yong93 at gmail.com>
 Nikola Kotur <kotnick at gmail.com>
 Nilamdyuti Goswami <ngoswami at redhat.com>
-Ning Yao <zay11022 at gmail.com>
+Ning Yao <yaoning at ruijie.com.cn>
 Noah Watkins <nwatkins at redhat.com>
 (no author) <(no author)@29311d96-e01e-0410-9327-a35deaab8ce9>
+oddomatik <bandrus+github at gmail.com>
 Orit Wasserman <owasserm at redhat.com>
 Owen Synge <osynge at suse.com>
 Padraig O'Sullivan <posulliv at umd.edu>
@@ -271,10 +321,15 @@ Peter Reiher <reiher at inktank.com>
 Peter Vinson <peter at hq.newdream.net>
 Peter Wienemann <wienemann at physik.uni-bonn.de>
 Pete V <peter at squid.newdream.net>
+Pete Zaitcev <zaitcev at redhat.com>
 Petr Machata <pmachata at redhat.com>
+Pierre Chaumont <pierre.chaumont31 at gmail.com>
 Pierre Rognant <prognant at oodrive.com>
-qiushanggao <qiushanggao at qq.com>
+Piotr Dałek <piotr.dalek at ts.fujitsu.com>
+Qiankun Zheng <zheng.qiankun at h3c.com>
+Radoslaw Zarzynski <rzarzynski at github.com>
 Radoslaw Zarzynski <rzarzynski at mirantis.com>
+Rajesh Nambiar <rajesh.n at msystechnologies.com>
 Raju Kurunkad <raju.kurunkad at sandisk.com>
 Ray Lv <xiangyulv at gmail.com>
 rca <bertosmailbox at gmail.com>
@@ -283,17 +338,21 @@ riccardo80 <riccardo80 at 29311d96-e01e-0410-9327-a35deaab8ce9>
 Riccardo Ferretti <rferrett at soe.ucsc.edu>
 ritz303 <ritz_303 at yahoo.com>
 Roald J. van Loon <roald at roaldvanloon.nl>
-RobertJansen1 <r.jansen86 at gmail.com>
+Robert Jansen <r.jansen at fairbanks.nl>
+Robin Dehu <robindehu at gmail.com>
 Robin H. Johnson <robbat2 at gentoo.org>
+Robin H. Johnson <robin.johnson at dreamhost.com>
+Robin Tang <robintang974 at gmail.com>
 Rohan Mars <code at rohanmars.com>
 Roman Haritonov <reclosedev at gmail.com>
 Ron Allred <rallred at itrefined.com>
 Rongze Zhu <zrzhit at gmail.com>
 root <root at phenom.dyweni.com>
+root <root at ubuntu1.com>
 Ross Turk <ross.turk at inktank.com>
 Ross Turk <rturk at redhat.com>
 Ruben Kerkhof <ruben at rubenkerkhof.com>
-Ruifeng Yang <149233652 at qq.com>
+Ruifeng Yang <yangruifeng.09209 at h3c.com>
 Rutger ter Borg <rutger at terborg.net>
 Sage Weil <sage at inktank.com>
 Sage Weil <sweil at redhat.com>
@@ -303,16 +362,23 @@ Samuel Just <sam.just at inktank.com>
 Samuel Just <sjust at redhat.com>
 Sandon Van Ness <sandon at inktank.com>
 Sandon Van Ness <svanness at redhat.com>
+Sangdi <xu.sangdi at h3c>
+Sangdi Xu <xu.sangdi at h3c.com>
 Scott A. Brandt <scott at cs.ucsc.edu>
 Scott Devoid <devoid at anl.gov>
 Sean Channel <pentabular at gmail.com>
-Sebastien Han <sebastien.han at enovance.com>
+Sébastien Han <shan at redhat.com>
 Sebastien Ponce <sebastien.ponce at cern.ch>
+Sergey Arkhipov <nineseconds at yandex.ru>
+Shanggao Qiu <qiushanggao at qq.com>
 Sharif Olorin <sio at tesser.org>
+shawn <chen.xiaowei at h3c.com>
 Shawn Edwards <lesser.evil at gmail.com>
 shishir gowda <shishir.gowda at sandisk.com>
+Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
 Shu, Xinxin <xinxin.shu at intel.com>
 Shylesh Kumar <shmohan at redhat.com>
+Siddharth Sharma <siddharth at redhat.com>
 Simone Gotti <simone.gotti at gmail.com>
 Simon Leinen <simon.leinen at switch.ch>
 Somnath Roy <somnath.roy at sandisk.com>
@@ -322,6 +388,7 @@ Stefan Eilemann <Stefan.Eilemann at epfl.ch>
 Stephan Renatus <s.renatus at x-ion.de>
 Stephen F Taylor <steveftaylor at gmail.com>
 Stephen Jahl <stephenjahl at gmail.com>
+Steve Capper <steve.capper at linaro.org>
 Steve MacGregor <grape at lapgoat-0.(none)>
 Steve Stock <steve at technolope.org>
 Stratos Psomadakis <psomas at grnet.gr>
@@ -330,29 +397,35 @@ Sushma Gurram <sushma.gurram at sandisk.com>
 Swami Reddy <swami.reddy at ril.com>
 Sylvain Baubeau <sbaubeau at redhat.com>
 Sylvain Munaut <s.munaut at whatever-company.com>
+Takanori Nakao <nakao.takanori at jp.fujitsu.com>
 Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
 Takuya ASADA <syuu at dokukino.com>
 Tamil Muthamizhan <tamil.muthamizhan at inktank.com>
 Thomas Bechtold <t.bechtold at telekom.de>
-ThomasCantin <thomas.cantin at telecom-bretagne.eu>
+Thomas Cantin <thomas.cantin at telecom-bretagne.eu>
+Thomas Johnson <NTmatter at gmail.com>
+Thomas Laumondais <thomas.laumondais at gmail.com>
 Thomas Mueller <thomas at chaschperli.ch>
 Thorsten Behrens <tbehrens at suse.com>
 Thorsten Glaser <tg at mirbsd.de>
+Tianshan Qu <tianshan at xsky.com>
 Tim Freund <tim at freunds.net>
 Tim Serong <tserong at suse.com>
-t-miyamae <miyamae.takeshi at jp.fujitsu.com>
 tmuthamizhan <tamil.muthamizhan at inktank.com>
+tobe <tobeg3oogle at gmail.com>
 Tobias Florek <tobias.florek at bytesandbutter.de>
 Tomasz Paskowski <ss7pro at gmail.com>
 Tom Callaway <spot at redhat.com>
 Tommi Virtanen <tv at inktank.com>
 topher <topher at 29311d96-e01e-0410-9327-a35deaab8ce9>
-Travis Rhoden <trhoden at gmail.com>
 Travis Rhoden <trhoden at redhat.com>
 Tyler Brekke <tbrekke at redhat.com>
 Tyler Brekke <tyler.brekke at inktank.com>
+Valentin Arshanes Thomas <valentin.arshanes.thomas at gmail.com>
 Vangelis Koukis <vkoukis at cslab.ece.ntua.gr>
-Ved-vampir <akiselyova at mirantis.com>
+Varada Kari <varada.kari at sandisk.com>
+Vartika Rai <vartikarai17 at gmail.com>
+Vasu Kulkarni <vasu at redhat.com>
 Venky Shankar <vshankar at redhat.com>
 Vicente Cheng <freeze.bilsted at gmail.com>
 Vikhyat Umrao <vumrao at redhat.com>
@@ -363,21 +436,30 @@ Vu Pham <vu at mellanox.com>
 Walter Huf <hufman at gmail.com>
 Wang, Yaguang <yaguang.wang at intel.com>
 Warren Usui <warren.usui at inktank.com>
+Weijun Duan <duanweijun at h3c.com>
 Wei Luo <luowei at yahoo-inc.com>
+weiqian <weiq at dtdream.com>
 Wesley Spikes <wesley.spikes at dreamhost.com>
 Wido den Hollander <wido at 42on.com>
 William A. Kennington III <william at wkennington.com>
-wuxingyi <wuxingyi2015 at outlook.com>
-wuxingyi <wuxingyi at letv.com>
+Wu Xingyi <wuxingyi at letv.com>
 Wyllys Ingersoll <wyllys.ingersoll at keepertech.com>
 Xan Peng <xanpeng at gmail.com>
-Xiaowei Chen <cxwshawn at gmail.com>
+Xavier Roche <roche+git at exalead.com>
+Xiaowei Chen <chen.xiaowei at h3c.com>
 Xiaoxi Chen <xiaoxi.chen at intel.com>
+Xie Rui <875016668 at qq.com>
+Xie Rui <jerry.xr86 at gmail.com>
+xiexingguo <258156334 at qq.com>
 Xihui He <xihuihe at gmail.com>
 Xing Lin <xinglin at cs.utah.edu>
-Xinze Chi <xmdxcxz at gmail.com>
+Xingyi Wu <wuxingyi2015 at outlook.com>
+Xinze Chi <xinze at xksy.com>
+Xinze Chi <xinze at xsky.com>
 Xiong Yiliang <xiongyiliang at xunlei.com>
+Xuan Liu <liu.xuan at h3c.com>
 Yann Dupont <yann at objoo.org>
+Yannick Atchy Dalama <yannick.atchy.dalama at gmail.com>
 Yan, Zheng <zheng.z.yan at intel.com>
 Yan, Zheng <zyan at redhat.com>
 Yazen Ghannam <yazen.ghannam at linaro.org>
@@ -388,5 +470,7 @@ Yuan Zhou <yuan.zhou at intel.com>
 Yunchuan Wen <yunchuanwen at ubuntukylin.com>
 Yuri Weinstein <yuri.weinstein at inktank.com>
 Zhe Zhang <zzxuanyuan at gmail.com>
+Zhicheng Wei <zhicheng at opensourceforge.net>
+Zhi (David) Zhang <zhangz at yahoo-inc.com>
 Zhiqiang Wang <zhiqiang.wang at intel.com>
 Zhi Zhang <zhangz.david at outlook.com>
diff --git a/COPYING b/COPYING
index 06e20b1..5efc838 100644
--- a/COPYING
+++ b/COPYING
@@ -11,6 +11,9 @@ Files: doc/*
 Copyright: (c) 2010-2012 New Dream Network and contributors
 License: Creative Commons Attribution-ShareAlike (CC BY-SA)
 
+Files: bin/git-archive-all.sh
+License: GPL3
+
 Files: src/mount/canonicalize.c
 Copyright: Copyright (C) 1993 Rick Sladkey <jrs at world.std.com>
 License: LGPL2 or later
@@ -141,3 +144,8 @@ File: qa/workunits/erasure-code/jquery.js
 Files: qa/workunits/erasure-code/jquery.{flot.categories,flot}.js
   Copyright (c) 2007-2014 IOLA and Ole Laursen.
   Licensed under the MIT license.
+
+Files: src/include/timegm.h
+  Copyright (C) Copyright Howard Hinnant
+  Copyright (C) Copyright 2010-2011 Vicente J. Botet Escriba
+  License: Boost Software License, Version 1.0
diff --git a/ChangeLog b/ChangeLog
index ceb3dac..dd6c2a5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,532 +1,3662 @@
-9764da5 (HEAD, tag: v0.94.5) 0.94.5
-250dc07 osd/ReplicatedPG: remove stray debug line
-d3abcbe librbd: potential assertion failure during cache read
-991d0f0 tests: reproduce crash during read-induced CoW
-51f3d6a qa: Use public qemu repo
-9529269 (tag: v0.94.4) 0.94.4
-b203979 use git://git.ceph.com
-0f4ef19 qa: http://ceph.com/qa -> http://download.ceph.com/qa
-294f016 (origin/wip-13227-hammer) init-radosgw.sysv: remove
-698d75c (origin/wip-13410-hammer) tests: robust test for the pool create crushmap test
-2a28114 (origin/wip-13401-hammer) crush/CrushTester: test fewer inputs when running crushtool
-abc5b5f tests: update to match crushmap validation message
-25bd277 mon/OSDMonitor: fix crush injection error message
-6635530 mon/OSDMonitor: only test crush ruleset for the newly created pool
-cc1fedd crush/CrushTester: allow testing by ruleset
-3228161 qa/workunits/cephtool/test.sh: don't assume crash_replay_interval=45
-ad83304 rgw:add --reset-regions for regionmap update
-7de65e7 rgw : setting max number of buckets for users via ceph.conf option
-297c04d rgw: init_rados failed leads to repeated delete
-4b0686f rgw: delete finisher only after finalizing watches
-6119b15 rgw: be more flexible with iso8601 timestamps
-607904e init-radosgw: specify pid file to start-stop-daemon
-f51ab26 rgw: fix radosgw start-up script.
-544a98f init-radosgw: unify init-radosgw[.sysv]
-2a733e9 init-radosgw: look in /var/lib/ceph/radosgw
-d00c52b doc: rgw: fix typo in comments
-eb001d3 rgw: init script waits until the radosgw stops
-9ab9c44 rgw: don't read actual data on user manifest HEAD
-9026c4a doc: remove mention of ceph-extra as a requirement
-45ed24d doc: remove ceph-extras
-faccdce doc: correct links to download.ceph.com
-e9f4aec doc: Added "Hammer" in the list of major releases.
-424fc1c rgw: set default value for env->get() call
-e72bdc3 osd/ReplicatedPG: tolerate promotion completion with stopped agent
-a3afb3f rgw: remove trailing :port from host for purposes of subdomain matching
-77cb503 (origin/wip-13015-hammer) rgw: preserve all attrs if intra-zone copy
-b9f2ed3 rgw: don't preserve acls when copying object
-b3822f1 upstart: limit respawn to 3 in 30 mins (instead of 5 in 30s)
-0d6a8c6 Pipe: Drop connect_seq increase line
-4be8a28 osd/PG: peek_map_epoch: skip legacy PGs if infos object is missing
-f237ed9 osd: allow peek_map_epoch to return an error
-3a50b90 crypto: fix unbalanced ceph::crypto::init/ceph::crypto:shutdown
-0a5b856 ReplicatedPG,Objecter: copy_get should include truncate_seq and size
-82ea02a rgw: fix assignment of copy obj attributes
-3b2affc rgw: add delimiter to prefix only when path is specified
-9f69660 tests: tiering agent and proxy read
-5656eec osd: trigger the cache agent after a promotion
-dc693fc lockdep: allow lockdep to be dynamically enabled/disabled
-805732b tests: librbd API test cannot use private md_config_t struct
-7ac0173 tests: ensure old-format RBD tests still work
-b68d757 librados_test_stub: implement conf get/set API methods
-f0fa637 crypto: use NSS_InitContext/NSS_ShutdownContex to avoid memory leak
-3f542aa auth: use crypto_init_mutex to protect NSS_Shutdown()
-e487e8e auth: reinitialize NSS modules after fork()
-00e73ad librbd: prevent race condition between resize requests
-6c4ccc8 librbd: Add a paramter:purge_on_error in ImageCtx::invalidate_cache().
-0573491 librbd: Remvoe unused func ImageCtx::read_from_cache.
-28838f2 osdc: clean up code in ObjectCacher::Object::map_write
-5c4f152 osdc: Don't pass mutex into ObjectCacher::_wait_for_write.
-86e7698 osdc: After write try merge bh.
-c96541a osdc: Make last missing bh to wake up the reader.
-4135b9a osdc: For trust_enoent is true, there is only one extent.
-81376b6 osdc: In _readx() only no error can tidy read result.
-e80bd0a (origin/wip-12859-hammer-loic) rgw: send Content-Length in response for GET on Swift account.
-2e54245 rgw: force content_type for swift bucket stats request
-5d57b63 rgw: we should not overide Swift sent content type
-b8aafbc rgw: enforce Content-Type in Swift responses.
-143cfc3 rgw: force content-type header for swift account responses without body
-b5420d6 rgw: shouldn't return content-type: application/xml if content length is 0
-836f763 OSD: break connection->session->waiting message->connection cycle
-77624af osd/PGLog: dirty_to is inclusive
-aa00373 common: fix code format
-aab35da test: add test case for insert empty ptr when buffer rebuild
-2b0b7ae common: fix insert empty ptr when bufferlist rebuild
-2348a5b osd: copy the RecoveryCtx::handle when creating a new RecoveryCtx instance from another one
-bf72785 config: skip lockdep for intentionally recursive md_config_t lock
-c94fd92 osd: Keep a reference count on Connection while calling send_message()
-059bf98 WBThrottle::clear_object: signal if we cleared an object
-a478385 ceph-disk: always check zap is applied on a full device
-e471c5d librados: Make librados pool_create respect default_crush_ruleset
-35fa47a (origin/wip-corpus-hammer) ceph-object-corpus: add 0.94.2-207-g88e7ee7 hammer objects
-b80859e (origin/wip-11455-hammer) rgw: init some manifest fields when handling explicit objs
-f47ba4b mon: test the crush ruleset when creating a pool
-b58cbba erasure-code: set max_size to chunk_count() instead of 20 for shec
-6f0af18 vstart.sh: set PATH to include pwd
-da00bed rgw: rework X-Trans-Id header to be conform with Swift API.
-9937c81 Transaction Id added in response
-f1c7c62 rgw: api adjustment following a rebase
-85911df rgw: orphans, fix check on number of shards
-c1cf7df rgw: orphans, change default number of shards
-bb1d4cc rgw: change error output related to orphans
-2e0f6fe rgw: orphan, fix truncated detection
-1bfebef radosgw-admin: simplify orphan command
-f244b15 radosgw-admin: stat orphan objects before reporting leakage
-f80e2b2 radosgw-admin: orphans finish command
-88d32c6 rgw: cannot re-init an orphan scan job
-80a4034 rgw: stat_async() sets the object locator appropriately
-0082036 rgw: list_objects() sets namespace appropriately
-1c37072 rgw: modify orphan search fingerprints
-ef81367 rgw: compare oids and dump leaked objects
-f4d0544 rgw: keep accurate state for linked objects orphan scan
-748ea57 rgw: iterate over linked objects, store them
-6c6aa5d rgw: add rgw_obj::parse_raw_oid()
-62d562d rgw: iterate asynchronously over linked objects
-00ecf2d rgw: async object stat functionality
-7d1cc48 rgw-admin: build index of bucket indexes
-c1b0e7a rgw: initial work of orphan detection tool implementation
-b16129c Avoid an extra read on the atomic variable
-1f6916d RGW: Make RADOS handles in RGW to be a configurable option
-a13c7fd rgw:the arguments 'domain' should not be assigned when return false
-6acf36f rgw:segmentation fault when rgw_gc_max_objs > HASH_PRIME
-6b36514 rgw: avoid using slashes for generated secret keys
-8ba6b2f rgw: url encode exposed bucket
-0bc909e (origin/wip-12638-hammer) mon: add a cache layer over MonitorDBStore
-bee8666 Objecter: pg_interval_t::is_new_interval needs pgid from previous pool
-b5418b9 osd_types::is_new_interval: size change triggers new interval
-f028389 (origin/liewegas-wip-hammer-feature-hammer) include/ceph_features: define HAMMER_0_94_4 feature
-95cefea (tag: v0.94.3) 0.94.3
-81a311a (origin/hammer-12709) Workunits : fs/misc/chmod.sh : Include ACL characters in permission check.
-153744d (origin/wip-12682-hammer) tests: increase test coverage for partial encodes/decodes
-fca7876 common: bit_vector extent calculation incorrect for last page
-3396a96 osd/OSDMap: handle incrementals that modify+del pool
-3ab5d82 (origin/wip-12432-hammer) rgw: set http status in civetweb
-10a0383 civetweb: update submodule to support setting of http status
-00d802d hobject_t: fix get_boundary to work with new sorting regime
-9b91adc (origin/wip-osd-compat-hammer) mon: disallow post-hammer OSDs if there are up pre-hammer OSDs
-8a559c1 include/ceph_features: define MON_METADATA feature
-4faa8e0 (origin/wip-12577-hammer) osd: include newlines in scrub errors
-455eb2a osd: fix condition for loggin scrub errors
-67e7946 osd: fix fallback logic; move into be_select_auth_object
-0f57c70 osd: log a scrub error when we can't pick an auth object
-d4f4c5c osd: repair record digest if all replicas match but do not match
-acfed6b osd: move recorded vs on disk digest warning into be_compare_scrubmaps
-674029b osd: be slightly paranoid about value of okseed
-f2002b7 osd: be precise about "known" vs "best guess"
-4e5d146 osd: record digest if object is clean (vs entire scrub chunk)
-1357ed1 hobject_t: decode future hobject_t::get_min() properly
-6d01d6b OSDMonitor::preprocess_get_osdmap: send the last map as well
-2ecb3b7 Fh ref count will leak if readahead does not need to do read from osd
-4c199bf (origin/wip-11998-hammer) debian/control: ceph-common (>> 0.94.2) must be >= 0.94.2-2
-a785193 ceph.spec.in: drop SUSE-specific %py_requires macro
-8804b3f ceph.spec.in: remove SUSE-specific apache2-mod_fcgid dependency
-b575ecc (origin/wip-12236-hammer) tests: verify that image shrink properly handles flush op
-d4eb7bd librbd: invalidate cache outside cache callback context
-92272dd (origin/wip-12235-hammer) librbd: don't cancel request lock early
-58ae92f tests: new test for transitioning exclusive lock
-7b21ccb tests: verify that librbd will periodically resend lock request
-c95b37f common: Mutex shouldn't register w/ lockdep if disabled
-117205a librbd: improve debugging output for ImageWatcher
-08ae012 librados_test_stub: watcher id should be the instance id (gid)
-704c0e0 librbd: retry lock requests periodically until acquired
-dbaaed9 librbd: don't hold owner_lock for write during flush
-e971820 (origin/wip-12345-hammer) lockdep: do not automatically collect all backtraces
-27f7042 librbd: flush operations need to acquire owner lock
-5b39983 librbd: avoid infinite loop if copyup fails
-88b583b librbd: flush pending ops while not holding lock
-a88b180 tests: fix possible deadlock in librbd ImageWatcher tests
-321eb8d tests: enable lockdep for librbd unit tests
-bfe5b90 librbd: owner_lock should be held during flush request
-1e84fb0 osdc: ObjectCacher flusher might needs additional locks
-506a45a librbd: fix recursive locking issues
-acf5125 librbd: simplify state machine handling of exclusive lock
-9454f04 librbd: ObjectMap::aio_update can acquire snap_lock out-of-order
-3e0358e librbd: move copyup class method call to CopyupRequest
-2ee64a8 librbd: simplify AioRequest constructor parameters
-3e71a75 librbd/AioRequest.h: fix UNINIT_CTOR
-cb57fe5 librbd: add object state accessor to ObjectMap
-9249ab7 librbd: AsyncObjectThrottle should always hold owner_lock
-26902b9 librbd: execute flush completion outside of cache_lock
-571220d librbd: add AsyncRequest task enqueue helper method
-8e280f4 librbd: disable lockdep on AioCompletion
-b38da48 librbd: AioCompletion shouldn't hold its lock during callback
-6fdd3f1 librbd: give locks unique names to prevent false lockdep failures
-7004149 librbd: complete cache read in a new thread context
-65ef695 librbd: require callers to ObjectMap::aio_update to acquire lock
-58b8faf log: fix helgrind warnings regarding possible data race
-a5203d3 librados_test_stub: fix helgrind warnings
-b73e87e librados_test_stub: add support for flushing watches
-2fa35b1 common: lockdep now support unregistering once destructed
-7b85c7b common: add valgrind.h convenience wrapper
-6d3db5f librbd: add work queue for op completions
-64425e8 WorkQueue: ContextWQ can now accept a return code
-eccf369 packaging: RGW depends on /etc/mime.types
-e19f928 (origin/wip-12502-hammer) rgw: conversion tool to fix broken multipart objects
-28d32f6 rgw: only scan for objects not in namespace
-e22e2b4 rgw_admin: add --remove-bad flag to bucket check
-7bddf5d   rest_bench: bucketname is not mandatory as we have a default name
-6e7358b   rest_bench: drain the work queue to fix a crash   Fixes: #3896   Signed-off-by: huangjun <hjwsm1989 at gmail.com>
-1e05578 auth: check return value of keyring->get_secret
-256620e Client: check dir is still complete after dropping locks in _readdir_cache_cb
-8a2ad05 TestPGLog: fix invalid proc_replica_log test caes
-df71e6b TestPGLog: fix noop log proc_replica_log test case
-549ff9a TestPGLog: add test for 11358
-c224fc7 PGLog::proc_replica_log: handle split out overlapping entries
-b8176d0 Mutex: fix leak of pthread_mutexattr
-43a72e4 mon/PGMonitor: bug fix pg monitor get crush rule
-0ca93db mon: ceph osd map shows NONE when an osd is missing
-695f782 crush/CrushWrapper: fix adjust_subtree_weight debug
-0bd4c81 crush/CrushWrapper: return changed from adjust_subtree_weight
-05fc59b crush/CrushWrapper: adjust subtree base in adjust_subtree_weight
-d2f31ad unittest_crush_wrapper: test adjust_subtree_weight
-0ccdf34 unittest_crush_wrapper: attach buckets to root in adjust_item_weight test
-1e73753 unittest_crush_wrapper: parse env
-cd11b88 osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine if the PG was active
-c5f0e22 osd: Move IsRecoverablePredicate/IsReadablePredicate to osd_types.h
-42bff0b mon: OSDMonitor: fix hex output on 'osd reweight'
-e004941 ceph.in: print more detailed warning for 'ceph <type> tell'
-f18900f ceph.in: print more detailed error message for 'tell' command
-9916d37   mon/PGMonitor: avoid uint64_t overflow when checking pool 'target/max' status.   Fixes: #12401
-4457d3e Update OSDMonitor.cc
-add0f1e ceph.in: do not throw on unknown errno
-fa19474 os/chain_xattr: handle read on chnk-aligned xattr
-931ffe3 common/Cycles.cc: skip initialization if rdtsc is not implemented
-0fde3a2 buffer: Fix bufferlist::zero bug with special case
-dabc611 UnittestBuffer: Add bufferlist zero test case
-d08db7a (origin/wip-11470.hammer) mon: PaxosService: call post_refresh() instead of post_paxos_update()
-154f18c (origin/wip-12465-hammer) Log::reopen_log_file: take m_flush_mutex
-b872882 (origin/wip-12237-hammer) librados_test_stub: read op should return number of bytes read
-7d9fce3 tests: fixed TestObjectMap.InvalidateFlagInMemoryOnly
-4a77be0 librbd: don't attempt to invalidate an object map in R/O mode
-0aea70f tests: add new unit tests for object map invalidation
-c732cb8 librbd: move object map codes to common location
-27c99ea librbd: only update image flags when holding exclusive lock
-ef45363 librbd: new ImageWatcher::is_lock_supported method
-e4b55b3 Fixes: #12286 radosgw-admin: after subuser modify print only once user info.
-9458b84 rgw: fix ListParts response
-2357b6c rgw: If the client sends a Connection: close header respond accordingly.
-f819332 ceph.spec.in: install 95-ceph-osd.rules, mount.ceph, and mount.fuse.ceph properly on SUSE
-d8733be debian: move ceph_argparse into ceph-common
-f99f312 (origin/wip-12384-hammer) test: potential memory leak in FlushAioPP
-a4fc63a pybind: fix valgrind warning on rbd_get_parent_info call
-aa3eb28 osdc: invalid read of freed memory
-18ede75 krbd: fix incorrect types in the krbd API
-488578c fsx: cleanup crypto library at exit
-97ff6cb tests: add run-rbd-valgrind-unit-tests.sh
-e690907 valgrind: update valgrind suppressions for lttng-ust
-fe013e0 librbd: TaskFinisher should finish all queued tasks
-43cd3ac tests: fix valgrind errors with librbd unit test
-5d8d6a1 tests: librbd should release global data before exit
-13f926e librados_test_stub: cleanup singleton memory allocation
-1063f52 PG::find_best_info: ignore info.les for incomplete peer
-7132277 Conditional-compile against minimal tcmalloc.
-0818e9f ceph.spec.in: snappy-devel for all supported distros
-8b576bd ceph.spec.in: python-argparse only in Python 2.6
-ad5745b OSD: add command_wq suicide timeout
-059a579 OSD: add remove_wq suicide timeout
-b8826bc OSD: add scrub_wq suicide timeout
-878dd40 OSD: add snap_trim_wq suicide timeout
-1157583 OSD: add recovery_wq suicide timeout
-a82b450 OSD: add op_wq suicide timeout
-89aa8ff (origin/wip-11833-hammer) mon: add an "osd crush tree" command
-03c07d7 librbd: assertion failure race condition if watch disconnected
-5c812c1 librbd: prevent object map updates from being interrupted
-e50caab rgw: fix empty json response when getting user quota
-ec70533 rgw: error out if frontend did not send all data
-557865c rgw: fix reset_loc()
-b1618a9 rgw: fix lack of account name in XML listing of Swift account.
-e39dce7 rgw: generate the "Date" HTTP header for civetweb.
-a5dbcbb Swift: Set Content-Length when requesting/checking Keystone tokens
-cdde626 ceph.spec.in: do not run fdupes, even on SLE/openSUSE
-3c8cdea client: reference counting 'struct Fh'
-c78cc00 rgw: rectify 202 Accepted in response for PUT on existing bucket.
-6417e8e rpm: add missing Java conditionals
-3728477 Add rpm conditionals : cephfs_java
-8f78001 ceph.spec.in: SUSE/openSUSE builds need libbz2-devel
-4eb58ad ceph.spec.in: use _udevrulesdir to eliminate conditionals
-7f1c0cc crush/CrushTester: return EINVAL if crushtool returns non-zero
-2aaeea1 tests: TEST_crush_reject_empty must not run a mon
-80afb81 ceph-helpers: implement test_expect_failure
-6b5e9a1 tests: display the output of failed make check runs
-5141301 (origin/wip-11975-hammer) mon: add "--check" to CrushTester::test_with_crushtool()
-5ec27cf crushtool: rename "--check-names" to "--check"
-2a8fe88 mon: check the new crush map against osdmap.max_osd
-c0b0f52 crushtool: enable check against max_id
-f041bbe crush/CrushTester: check if any item id is too large
-cc1cc03 mon: validate new crush for unknown names
-ff29a7f crushtool: add the "--check-names" option
-960ea49 crush/CrushTester: add check_name_maps() method
-5e72479 (origin/wip-12021-hammer) OSDMonitor: allow addition of cache pool with non-empty snaps with config
-ecac1a4 Always provide summary for non-healthy cluster
-1a32379 mon/PGMap: add more constness
-84ebc3d mon/PGMap: sort pg states by the states in "pg ls" spec
-e310461 mon: s/recovery/recoverying/ in "pg ls*" commands' spec
-524f4a5 mon: always reply mdsbeacon
-413e407 mon/MDSMonitor: rename labels to a better name
-a03968a mon: send no_reply() to peon to drop ignored mdsbeacon
-39f3459 mon: remove unnecessary error handling
-0e5e7e1 mon: remove unused variable
-7034720 ReplicatedPG::finish_promote: handle results->snaps is empty case
-3e44dc1 ReplicatedPG::finish_promote: fix snap promote head snaps
-5ef0846 tools: chunk reads in Dumper
-408880b qa: update to newer Linux tarball
-56c2688 rgw: simplify content length handling
-d9bbef3 rgw: make compatability deconfliction optional.
-0260abd rgw: improve content-length env var handling
-8abc46a rgw: fix data corruption when race condition
-89d0266 (origin/wip-fix-doc-hammer) doc: add the corresponding @endcond command for @cond
-2aa77b3 doc: remove orphan: directive in ceph-create-keys.rst
-ad66e40 doc: let doxygen ignore src/tracing
-356bd2c (origin/wip-11806-hammer) debian: ceph-dbg steals ceph-objectstore-tool from ceph-test-dbg (take 2)
-c5c627f rgw/logrotate.conf: Rename service name
-582cf73 (origin/wip-12109-hammer, origin/wip-12109) tests: add librbd watch/notify version compatibility test
-43b9aef qa/workunits/rbd: add notify_master/slave bootstrap scripts
-f995fb5 qa/workunits/rbd: add new test_librbd_api workunit
-a09da2a tests: create librbd API-only integration test suite
-e149916 Increase max files open limit for OSD daemon.
-22f58ce Makefile: install ceph-post-file keys with mode 600
-3e65a10 ceph-post-file: improve check for a source install
-c1f6743 ceph-post-file: behave when sftp doesn't take -i
-38d36b1 packaging: move SuSEfirewall2 templates out of src
-24bc9f2 packaging: add SuSEfirewall2 service files
-8acfb99 Bug fix to ceph systemV compatability script.
-bd3fd92 Fixes to rcceph script
-37d77d3 Increase max files open limit for OSD daemon.
-78d894a qa/workunits/rados/test-upgarde-v9.0.1: fix exclude syntax
-3e8d60a qa/workunits/rados/test-upgrade-v9.0.1: skip one more evict test
-348a3d3 qa: add compatibility filtered rados api tests for upgrades
-f68bf94 (origin/wip-11493-hammer) OSDMonitor: disallow ec pools as tiers
-13c8d58 mon: prevent pool with snapshot state from being used as a tier
-58e6266 test/librados/tier.cc: destroy and recreate cache pool on every test
-3db1026 doc/release-notes: v0.94.2
-ed5442b (origin/wip-11770-hammer) tests: verify librbd blocking aio code path
-20e1048 librbd: new rbd_non_blocking_aio config option
-b4571b3 PendingReleaseNotes: document changes to librbd's aio_read methods
-9ea1edd librbd: AioRequest::send no longer returns a result
-272df2a tests: update librbd AIO tests to remove result code
-dd2e4c1 librbd: internal AIO methods no longer return result
-dbd4e29 Throttle: added pending_error method to SimpleThrottle
-7df6091 librbd: add new fail method to AioCompletion
-cf6e1f5 librbd: avoid blocking AIO API methods
-e61974a librbd: add task pool / work queue for requests
-5fb8561 (tag: v0.94.2) 0.94.2
-bfb1442 ReplicatedPG::release_op_ctx_locks: requeue in scrub queue if blocked
-c7b6a63 ReplicatedPG::finish_ctx: take excl lock if operation is rw
-1550a56 RadosModel: randomly prefix delete with assert_exists
-4cdc5f7 RadosModel: assert exists on subsequent writes
-25c730b test/librados/snapshots.cc: add test for 11677
-3d74164 ReplicatedPG::trim_object: write filtered snapset while we're at it
-a116154 ReplicatedPG: start_flush: use filtered snapset
-82988d6 ceph-disk: support NVMe device partitions
-bd91fb0 mon: prevent bucket deletion when referenced by a rule
-56565ee crush: fix crash from invalid 'take' argument
-1440122 common/config: detect overflow of float values
-9b947fa common/config: detect overflow of int values
-d723e11 mds: clear CDir::STATE_REJOINUNDEF after fetching dirfrag
-54f4e7d rgw: Use attrs from source bucket on copy
-9dfef60 rgw: always check if token is expired
-fdb43eb ceph.spec.in: tweak ceph-common for SUSE/openSUSE
-ba1a016 admin/build-doc: fix dependency checks
-5a60a03 (origin/wip-4846) man/ceph-create-keys.8: add missing file
-19305b8 doc: add ceph-create-keys.8
-ffd0933 WorkQueue: added virtual destructor
-a28adfb WorkQueue: add new ContextWQ work queue
-1a321e4 rgw: Do not enclose the Bucket header in quotes
-968573b debian: ceph-dbg steals ceph-objectstore-tool from ceph-test-dbg
-a62c3aa ceph.in: handle unknown Exception correctly
-cc7f744 ceph.in: improve the interactive mode
-eb26388 ceph.in: parse quote correctly in interactive mode
-344328d (origin/wip-11733-hammer) debian: set rest-bench-dbg ceph-test-dbg dependencies
-293affe (origin/wip-11622-hammer) rgw: merge manifests correctly when there's prefix override
-a43d248 rgw: restore buffer of multipart upload after EEXIST
-558d639 json_sprit: fix the FTBFS on old gcc
-678b3e6 json_spirit: use utf8 intenally when parsing \uHHHH
-ef6641c ceph.spec: update OpenSUSE BuildRequires
-0b6d442 (origin/wip-hammer-11535-admin-socket) common/admin_socket: close socket descriptor in destructor
-95818da (origin/wip-hammer-package-perf-objectstore) packaging: include ceph_perf_objectstore
-6e2dd40 (origin/hammer-uclient-checking) client: fix error handling in check_pool_perm
-3c2e6ae client: use SaferCond in check_pool_perm
-79b2ac2 client: check OSD caps before read/write
-c548d8d tests: ceph-helpers kill_daemons fails when kill fails
-fb10594 Added a "ceph hello world" for a simple check for ceph-deploy qa suite
-46a4e8a packaging: mv ceph-objectstore-tool to main ceph pkg
-f4a0dab rgw: send Content-Length in response for HEAD on Swift account.
-32f4a74 rgw: send Content-Length in response for DELETE on Swift container.
-d39a660 rgw: send Content-Length in response for PUT on Swift container.
-02a3813 (origin/wip-11370-hammer) librbd: flatten should return -EROFS if image is read-only
-af8939b librbd: allow snapshots to be created when snapshot is active
-d21c0c0 librbd: better handling for duplicate flatten requests
-ec0bd1d librbd: use generic helper for issuing async requests
-8a6e6e4 OSD: handle the case where we resurrected an old, deleted pg
-efbfe6f rgw: improve metadata handling on copy operation of Swift API.
-d164d80 rgw: quota not respected in POST object
-7f2a9ed rgw: fix handling empty metadata items on Swift container.
-8e6efdb rgw: send Content-Length in response for GET on Swift container.
-54b6290 rgw: enable end_header() to handle proposal of Content-Length.
-c87aa11 rgw: Swift API. Complement the response to "show container details"
-2cb5d60 rgw: enforce Content-Length in response for POST on Swift cont/obj.
-d2043a5 rgw: send Last-Modified header in response for PUT on Swift object.
-4d1f3f0 rgw: don't use rgw_socket_path if frontend is configured
-3aef0f2 rgw: update keystone cache with token info
-3edb196 civetweb: update max num of threads
-bc6eb8d rgw: improve code formatting ONLY.
-7aa1ae6 rgw: send X-Copied-From-Last-Modified header of Swift API.
-150b9e2 rgw: dump object metadata in response for COPY request of Swift API.
-e749701 rgw: refactor dumping metadata of Swift objects.
-b034511 rgw: add support for X-Copied-From{-Account} headers of Swift API.
-c9e6a0b rgw: send ETag, Last-Modified in response for copying Swift cobject.
-7f41ff0 rgw: Swift API. Allows setting attributes with COPY object operation.
-2f86995 rgw: improve format of X-Timestamp on Swift objects.
-48b1981 rgw: add support for X-Timestamp on Swift containers.
-893ffd3 tests: AioCompletion incorrectly freed
-96b0db5 librbd: update ref count when queueing AioCompletion
-0944051 librbd: failure to update the object map should always return success
-7ee7dcf tests: librados_test_stub reads should deep-copy
-113f3b1 mon: Total size of OSDs is a maginitude less than it is supposed to be.
-6a04b55 Fix "disk zap" sgdisk invocation
-8996907 osd: refuse to write a new erasure coded object with an offset > 0
-bc51476 ceph_json: add decode / encoder for multimap
-7c7e651 cls_rgw: use multimap to keep pending operations in bucket index
-cb75713 rgw: generate new tag for object when setting object attrs
-7387c43 java: libcephfs_jni.so is in /usr/lib64 on rhel
-856b2fa (origin/hammer-next) rgw-admin: a tool to fix object locator issue
-512ae4c rgw: set a special object locator if object starts with underscore
-da4d227 rgw: use correct oid when creating gc chains
-4e84f31 rgw: civetweb should use unique request id
-4789686 Move ceph-dencoder build to client
-7eabb70 Rework mds/Makefile.am to support a dencoder client build
-da7f683 Add support for PPC architecture, provide fallback
-3001fad Fix clear_pipe after reaping progress
-379ef71 rgw : Issue AIO for next chunk first before flush the (cached) data.
-b903ad2 rgw/Makefile.am: Populate DENCODER_SOURCES properly
-f994483 Dencoder should never be built with tcmalloc
-8709e34 ceph-disk: more robust parted output parser
-8d9f4d4 osdc: add epoch_t last_force_resend in Op/LingerOp.
-2f34d2e rgw: do not pre-fetch data for HEAD requests
-c6edc16 rgw - make starting quota/gc threads configurable
-c615972 librbd: ImageWatcher should cancel in-flight ops on watch error
-0e6a032 librbd: moved snap_create header update notification to initiator
-2864da8 librbd: updated cache max objects calculation
-0cdc93f librbd: acquire cache_lock before refreshing parent
-a1b4aeb librados_test_stub: AIO operation callbacks should be via Finisher
-3a58e30 (origin/hammer-11482) mds: remove caps from revoking list when caps are voluntarily released
-f30fa4a (origin/wip-11453-hammer-rgw-init-as-root) init-radosgw: run RGW as root
-0ee022b osd/ReplicatedPG: don't check order in finish_proxy_read
-8a58d83 Fix ceph_test_async_driver failed
-85a68f9 rgw: remove meta file after deleting bucket The meta file is deleted only if the bucket meta data is not synced
-51f5763 release-notes: backport Hammer release notes
-e4bfad3 (tag: v0.94.1) 0.94.1
-5ca771a crush: fix has_v4_buckets()
-33e79ab crush: fix dump of has_v4_buckets
-bd0ec49 bug fix: test case for lfn index
-b5921d5 crush/mapper: fix divide-by-0 in straw2
-93c8f43 osd: fix negative degraded objects during backfilling
-e61c4f0 (tag: v0.94) 0.94
+bb2ecea (HEAD, tag: v9.2.0, origin/infernalis) 9.2.0
+b9ac90d osd/PG: tolerate missing epoch key
+5c49192 osd: fix OSDService vs Objecter init order
+1560057 ceph.spec.in: We no longer need redhat-lsb-core
+c567341 init-rbdmap: Rewrite to use logger + clean-up
+58414c5 librbd: potential assertion failure during cache read
+011e9e5 tests: reproduce crash during read-induced CoW
+2a6b90f doc/release-notes.rst: recovery isn't in the unified queue yet
+c7d96a5 osd: init objecter after we authenticate
+7673845 osd/PG: make upgrade() use sequencer
+52b79f7 Revert "os/FileStore: require upgrade to hammer before moving beyond"
+41c9466 Revert "osd: require an upgrade to hammer first"
+eed3137 Revert "ceph-objectstore-tool: require hammer upgrade"
+de97840 Revert "osd: drop support for pre-hammer pg metadata"
+dff5783 Revert "ceph-objectstore-tool: drop support for pre-pgmeta PGs"
+9446770 Revert "os: drop deprecated collection_* attr methods"
+0f1b1f0 Revert "os/FileStore: fix version check"
+661e2a0 qa: remove legacy OS support from rbd/qemu-iotests
+1fb9fc9 librbd: fix rebuild_object_map() when no object map exists
+fb62c78 ceph_context: remove unsafe cast for singletons
+477bb06 ceph.spec.in: only run systemd-tmpfiles on ceph run directory
+40336fa CMake: fix rbd_replay error
+0009f34 osd: conditionally initialize the tracepoint provider
+6368c28 librados: conditionally initialize the tracepoint provider
+1a6eea9 librbd: conditionally initialize the tracepoint provider
+2ccef89 common: new tracing config options
+e11d8d6 tracing: merge tracepoint provider makefiles into main makefile
+b3d02cc tracing: dynamic tracepoint provider helper
+a7ed8e1 packaging: add new tracepoint probe shared libraries
+f4feee2 ceph.spec.in: add new tracepoint probe shared libraries
+4a5305e lttng: move tracepoint probes to dynamic libraries
+b61f3e4 osd: fix the snapshot reads of evicted tiering pool
+7060a3b doc/infernalis: hate hate
+e6a9e62 doc/release-notes: i hate rst
+e98408d doc/release-notes: final infernalis notes
+b105449 doc/release-notes: fix some attributions
+e9f200c doc/release-notes: infernalis notable changes
+638738f Revert "common, global: use lttng ust functions for handling fork-like calls"
+fca97db rgw, doc: remove remark for lack of custom account metadata of Swift.
+3be81ae 9.1.0
+036d36f debian/control: python-setuptools is a build dependency
+8e59595 doc/release-notes: 9.1.0
+8855e60 ReplicatedPG::maybe_handle_cache_detail: always populate missing_oid
+da4803e ReplicatedPG::_rollback_to: handle block on full correctly
+2b7ddde osd: Correct the object_info_t::decode() version
+03078ba rgw: location constraints should return api name
+a077301 mon/OSDMonitor: put crushtool error in log
+d36d7f2 ReplicatedPG: allow maybe_handle_cache to return status detailing what happened
+68c722c pybind/rados, get_omap_vals: Fix pydoc type.
+5a6e762 test: pybind/test_rados: add binary data.
+db03d30 pybind/rados: Fix binary omap values.
+5b9c326 rgw: fix wrong etag calculation during POST on S3 bucket.
+cbf36ad LibRBD: Adjust correct op latency scope
+4fdc703 librados_test_stub: prevent interleaving of operations
+d689db8 cls: new force-promotion flag for class methods
+6eca7d0 librados: restored pre-infernalis API compatibility
+cac1d6f buffer: restored pre-infernalis API compatibility
+030f697 rgw: orphan tool shouldn't clean up head objects
+8f28913 rgw, doc: mention that Swift objexp is supported now.
+7250db6 CephxServiceHandler.cc: fix get_auth conditional
+1a2689f ReplicatedPG::maybe_handle_cache: do not promote before checking full
+e0d8cb1 tests: removed obsolete rbd_replay test cases
+c2a83d0 ceph-dencoder: new rbd_replay trace file types
+3ecdae8 rbd-replay: added version control to trace output file
+646e50a rbd-replay-prep: added --verbose command line option
+98f513a rbd-replay-prep: stream events to the prep file
+65fb1b8 rbd-replay-prep: simplify IO dependency calculation
+4f436e1 rbd-replay: improve error messages
+b5b4a9d rbd-replay-prep: support new read/write APIs
+a1e99f0 rbd-replay-prep: avoid using assert statements on user inputs
+858059e qa: avoid using sudo in fsstress
+e049de3 os/FileStore: kludge sloppy hammer temp objects into temp collection
+5a11d76 ceph.spec.in: move python-sphinx BuildRequires to the right section
+96aabe7 ceph.spec.in: move BuildRequires out of subpackages
+d258bf5 ceph.spec.in: drop MY_CONF_OPTS
+468c2dd doc: remove mention of --lazy-remove from radosgw-admin manpage
+98cbf03 osd/PG: fix generate_past_intervals
+e675400 librbd: invalidate object map on error even w/o holding lock
+bc48ef0 selinux: Fix man page location
+378d56d man/Makefile-server.am: conditionalize make ceph_selinux manpage
+fb50ff6 mon: do not remove proxied sessions
+1045291 ceph.spec.in: remove comments regarding ceph UID/GID in SUSE
+800d974 ceph.spec.in: enable OBS post-build-checks to find systemd-tmpfiles
+498578d etc/sysconfig/ceph: add CEPH_AUTO_RESTART_ON_UPGRADE
+f8895fc ceph.spec.in: Standardize systemd preun and postun scripts
+404ffab ceph.spec.in: fix for out-of-memory errors in OBS
+db4727f ceph.spec.in: Use _smp_mflags instead of _NPROCESSORS_ONLN
+3217803 Revert "install-deps.sh: use %bcond_with selinux on SLE/openSUSE"
+5dd93d7 ceph.spec.in: selinux by default on RHEL/Fedora, non-default on SUSE
+9e226a8 ceph.spec.in: use %fillup_only macro in SLE/openSUSE
+2c6977c ceph.spec.in: distro-conditional groupadd and useradd
+0dd9313 ceph.spec.in: move systemd-tmpfiles stuff to ceph-common
+d58d885 ceph.spec.in: remove SUSE-specific curses hack
+bf9ca1e ceph.spec.in: lttng in SLES12 only
+50567b4 ceph.spec.in: drop %insserv_prereq (obsoleted by systemd)
+c84722a ceph.spec.in: fix boost-random build dependency for SLE/openSUSE
+d0ecb0a doc/release-notes: initial v9.1.0 notes
+a6a6923 osdc/Objecter: send FULL_TRY and FULL_FORCE ops despite full flag
+8201f0e mon: allow ping through despite synch/quorum status, with session
+5008da2 mon: drop ops on closed sessions early
+17e55b9 mon: clean up _ms_dispatch
+12bbc08 mon: all MonOpRequests have an attached Session
+6cf34a3 mon: drop any ops from closed sessions in dispatch_op
+a875826 mon: always set up session; move waitlist logic
+e2e1bd9 mds: avoid emitting cap warnings before evicting session
+362b18a mon: fix msg leak in resend_routed_requests
+c9dad52 Mon: Fix decoded message leak when this monitor is leader
+3ed25c1 librados: document new flag
+929e5d0 ceph.spec.in: correctly declare systemd dependency for SLE/openSUSE
+8d8fcee osd/ReplicatedPG: exempt MDS from the failsafe check, too
+81c2374 rgw: improve handling of already removed buckets in object expirer.
+b915952 ceph.spec.in: Do not always restart the daemons on removal
+c95c14b ceph.spec.in: Do not always restart the daemons on upgrades
+b20a1ba ReplicatedPG: consider IGNORE_CACHE for all maybe_handle_cache calls
+1df2cc2 install-deps.sh: use %bcond_with selinux on SLE/openSUSE
+3296274 ceph_test_rados_api_aio: test pool full gets EDQUOT when FULL_TRY flag is set
+8b1f234 librados: expose OPERATION_FULL_TRY flag
+ea93ead osd: return -EDQUOT instead of -ENOSPC if it is a pool quota
+e86d033 osdc/Objecter: distinguish between multiple notify completions
+049ea70 osd: reply to notify request with our unique notify_id
+0f9dca4 install-deps.sh: openSUSE-release/sles-release/sled-release are always present
+04c09ac mds: fix SnapServer crash on deleted pool
+0ce7491 bugfix: should call md_config_t::remove_observer on shutdown
+a965378 ReplicatedPG: clearing a whiteout should create the object
+47f4a03 ceph-objectstore-tool: delete ObjectStore::Sequencer after umount
+f20f67e pybind/cephfs: fix DirEntry helpers
+3f00042 rgw: set default value for env->get() call
+bba3ab3 mon: combine _ms_dispatch and dispatch
+e42c9aa ceph.spec.in: re-re-drop fdupes
+566c872 os/fs: fix aio submit method
+d7b620f ECBackend::handle_recovery_read_complete: do not expose the hash_info when getting the obc
+892800b ECBackend::handle_sub_read: restructure hash check and fix part of 12983
+80b7237 qa/workunits/cephtool/test.sh: don't assume crash_replay_interval=45
+c5a9275 osd/ReplicatedPG: preserve (some) flags when proxying reads
+994ec60 mds: respect max_entries/max_bytes of lssnap request
+2fea3a5 examples/librados/hello_world.cc:missing semicolon
+216eef5 Revert "osd: new pool settings: scrub intervals"
+04679c5 OSDMap: fill in known encode_features where possible
+c7e905e ceph-create-keys: set mds "allow *"
+f1f14f1 erasure-code: shec must compare for equality with epsilon
+e52204c client: fix quote enforcement on subdir mounts
+15e19a4 client: refactor quota check functions
+f1d8a8f Objecter: repeated free op->ontimeout.
+0635b13 Objecter: maybe access wild pointer(op) in _op_submit_with_budget.
+482d4e5 AsyncConnection: Add new debug log
+a1eb380 osd/ReplicatedPG: fix ENOSPC checking
+da7f3ad osd/osd_types: remove [pool_]last_epoch_marked_full from OSDSuperblock
+9c93ee3 osd: remove buggy [pool_]last_map_marked_full tracking
+8ffefec osd: do full check in do_op
+1f2274e osd/PG: set last_epoch_marked_full on map advance
+79475d8 osd/osd_types: add last_epoch_marked_full to pg_history_t
+bf7e937 osdc/Objecter: set FULL_FORCE flag when honor_full is false
+95055e7 osd: add FULL_TRY and FULL_FORCE rados op flags
+7757342 qa: https://ceph.com/git -> https://git.ceph.com
+d4d65fb qa: http://ceph.com/qa -> http://download.ceph.com/qa
+cdccf11 osd/PG: compensate for sloppy hobject scrub bounds from hammer
+acda626 osd: avoid duplicate MMonGetOSDMap requests
+f4bf14d Update Xinze affinity
+eb28eef Update Chen Min affinity
+58b8a8f Update Tianshan affinity
+58ad7b7 Update Haomai Wang affinity
+3ed6045 docs: Update docs to make it clear that there are mirrors worldwide
+efdaa93 mds: fix error reformatting subtreemap_test events
+557df13 tools/cephfs: detect malformed ESubtreeMap::expire_pos
+627756e build/ops: ceph-detect-init must ignore .cache
+9c8200b librbd:reads larger than cache size hang.
+396702a build/ops: make dist needs files with names > 99 characters
+5f7b3f5 filestore: fix peek_queue for OpSequencer
+c053499 osd/: eliminate unnecessary pg_hit_set_history_t::current_info
+f5359f2 osd: print min_last_epoch_clean along with pg dump
+ef909cc mon/Elector: do a trivial write on every election cycle
+2fb7b1f mon/MonitorDBStore: assert/crash if there is a write error
+b1401e5 CMake - fix check for UDEV
+0b27024 CMake - fix check for FUSE
+a866a89 CMake - add check for libedit
+4d02f87 CMake - fix check for Snappy
+3570dd5 CMake - add check for XFS
+571215e CMake - fix check for AIO
+6adf1cb CMake - fix check for blkid
+1a4262c CMake - add check for libcurl
+a6c8b46 CMake - fix check for fcgi
+88f7df7 CMake - add check for libuuid
+a18b649 CMake - fix check for leveldb
+cb7fe45 CMake - fix check for expat
+a7ce8f5 CMake - Add check for keyutils
+dcf647e CMake - fix check for NSS
+b02e0f9 CMake - fix libatomic_ops and gperftools checks
+3123b2c arch/arm: s/false/0/
+7e5980b rgw: improve convenience for key operate.
+36e4a80 ReplicatedPG::hit_set_setup: fix hit_set_remove_all call
+8e5a801 osd/: assert in HitSet constructor if type is TYPE_NONE as well
+ef97305 cls_rgw: fix bucket listing when dealing with invisible entries
+d422f28 OSDService::agent_entry: don't use PG::operator<< without pg lock
+e17c8e1 init-radosgw: specify pid file to start-stop-daemon
+d18cf51 osd: fix requeue of replay requests during activating
+4264358 erasure-code: workaround i386 optimization bug with SHEC
+f4b55f4 journaler: detect unexpected holes in journal objects
+182676d tests: ceph-disk: workunit must fail when test fail
+0cf0e88 tests: ceph-disk: only install multipath on CentOS
+fb4dd7d tests: ceph-disk: inline run_osd
+24b3a80 tests: ceph-disk: wait for the OSD to be up
+46a3c93 tests: ceph-disk: race condition is fixed
+d24f1f1 tests: ceph-disk: multipath now auto activates
+d4d5153 tests: ceph-disk: use sh() instead of helper() when possible
+b647038 tests: doc: ceph-disk workunit reminders
+22a1f4c ceph-disk: reduce debug verbosity
+c86438c ceph-disk: ensure udev add on the data partition is last
+0dd2364 ceph-disk: move update_partition from main_prepare to prepare_dev
+e2c553e ceph-disk: guard partprobe with udevadm settle
+ad2d46c ceph-disk: do not hide ceph-disk prepare stack trace
+5aeca73 ceph-disk: support --verbose with --log-stdout
+9e796dc ceph-disk: prefer sgdisk to blkid to retrieve partition UUID
+3d0e7ca ceph-disk: make ceph-disk list /dev/vdb equivalent to list vdb
+6ca6653 ceph-disk: - is translated into / with systemd
+17c6045 ceph-disk: a journal partition may survive a data partition
+d521a75 ceph-disk: upstart must not run ceph-disk activate concurrently
+f0a4757 ceph-disk: systemd must not kill a running ceph-disk
+cc13fa0 ceph-disk: fix typos in udev rules
+b86d9fd ceph-disk: ensure ceph owner on udev change
+a3a8c85 use simplifed messenger constructor for clients
+e3785b0 msg: add simplified messenger constructor
+66a9bfd osd/: remove unused pg_hit_set_history_t::current_last_stamp
+30332e3 mon/PaxosService: include msg and con ptr in dispatch debug
+930d8eb PG: ignore info from down osd
+4b00350 OSDMap.h: add has_been_up_since
+c27b73f mon/OSDMonitor: respect NODOWN on osd failure checks
+ea97761 systemd: increase nproc ulimit
+9f89ae7 mon/PGMonitor: avoid useless register_new_pgs work
+2a01bbc mon: make all paxos-related timeouts relative to mon_lease
+fd9ce66 osd/ReplicatedPG: tolerate promotion completion with stopped agent
+e65fb1b mds: adjust MDSRank::incarnation according to mdsmap
+30810da osd: new pool settings: scrub intervals
+48db7b1 osd: new pool flags: noscrub, nodeep-scrub
+b97ae76 osd: make 'ceph osd pool get' work for all settable pool flags
+10235e3 osd: refactor setting write_fadvise_dontneed pool flag
+b41f574 Fix unneccessary at/near max target warn in ceph -s when using ecpool When calculated objects needing eviction, we use object - hitset_achieve. So setting max objects = 30000, ceph -s will warn you at/near if there exists hitset_achieve
+21a1e75 tests: update to match crushmap validation message
+4da6793 install-deps: enable python3
+170f9ad doc: do not promise backports to Dumpling
+a6f07e9 doc: remove mention of ceph-extra as a requirement
+c0ef84f doc: remove ceph-extras
+387d780 doc: correct links to download.ceph.com
+c1172ca mon: fix auth get-or-create output
+7b2fa67 ReplicatedPG::get_snapset_context: set exists to false for new ssc with can_create
+4a8b08d rgw: don't read actual data on user manifest HEAD
+bf9c005 ceph-osd-prestart.sh: no ceph-disk chown
+e44d1e0 ceph.spec.in: Fix up (/var)/run/ceph creation
+1b3090d mon/OSDMonitor: fix crush injection error message
+524b0bd mon/OSDMonitor: only test crush ruleset for the newly created pool
+aa238e5 crush/CrushTester: allow testing by ruleset
+4f553b0 Librbd: Fix incorrect metadata filter behavior
+3971274 mon: return size_t from MonitorDBStore::Transaction::size()
+26fee81 osd/ReplicatedPG: using hobject_t::get_snapdir.
+fd0a384 osd/ReplicatedPG: Using Hobject_t::get_head.
+a326bd9 osd/ReplicatedPG: Remove the unuseful judgement in get_object_context.
+6a4501b os/FileStore: s/::close(fd)/VOID_TEMP_FAILURE_RETRY(::close(fd)).
+0d26694 osd/ReplicatedPG: Remove the duplicated code.
+bdfb239 osd/ReplicatedPG: Add verify checksum for SPARSE_READ.
+8bfbcf6 osd/ReplicatedPG:Optimize judgement when calc crc for read whole object.
+7532194 osd/ReplicatedPG: for osd_op_create, if ob existed don't do t->touch.
+c1afc38 unsigned type is short for journal max_size,use uint64_t instead.
+f25b67f rgw: add a new error message for user conflict when using Admin Ops API
+490938e osd/: find_object_context: return obc for head for snapdir if non null
+bdb2fa2 mds: Make sure wanted_state of first MDSBeacon is MDSMap::STATE_BOOT
+85bece7 new release key
+6a24d31 libcephfs: fix calling init() then mount()
+e017aab CMake: fix libcephfs shared lib generation
+7182499 install-deps.sh: disable python3
+a825f68 client/MetaRequest: optimize func can_forward/auth_is_best.
+a195928 unify order limit
+f51afa6 client/MetaRequest: open w/ O_CREAT|O_TRUNC is write.
+8f46bd9 clinet/MetaRequest: Remove the useless condition.
+d0ac68b mon/PGMap: calc min_last_epoch_clean when decode
+d1505b5 doc: delete wrong description of installing RPMs
+1f7a2dc doc:Replaces 'osd host' with 'host'
+c1b2859 radosgw: log to /var/log/ceph instead of /var/log/radosgw
+412e412 osd: permit MDS to send writes on full pools
+eab3008 Revert "osd: purge the object from the cache when proxying and not promoting the op"
+ca7861d osdc/Objecter: make unlocked 'full' helpers private
+4d5d15c osdc: fix honor_osdmap_full
+96486fd packaging: move rbd-replay* to ceph-common
+f162e33 src/test/osd/Object: allow a hole at the end of the object
+f9d107a osd_types.cc: add size to ObjectRecoveryInfo print
+1d3e46b ReplicatedBackend::prepare_pull: set recover_info.size for clones
+c842555 mon: debug refs on output replies
+03b8ed3 msg/simple: discard local queued messages on shutdown
+137eb7e mon/Monitor: fix MonSession Leak
+0b309e9 mon: fix MonSession operator<<
+89cc479 mon: do not leak messages on shutdown
+9546252 mds: cast numbers for mds health to string when print
+71f6529 doc: fix a broken hyperlink
+bbe27dc doc: Fixes a wrong directory name.
+26bcb36 Examples: hello_world.cc, content displayed after read is not null terminated.
+4e8242a mds: reset MDSRank heartbeat timeout even when MDS is laggy
+807a34c common: osd_pg_epoch_persisted_max_stale < map_cache_size
+73d7bed logrotate: logs are now owned by ceph:ceph
+7250fb1 os/OSD.cc cast osd_max_write_size to int64_t
+139b5d6 os: require Sequencer arg for apply_transaction()
+df92112 ceph_objectstore_test: fix warnings
+1002201 os/KeyValueStore: better osr debug
+83c83e4 os/KeyValueStore: kill default_osr
+e7b57cd os/newstore: better sequencer debug
+16c672c os/newstore: kill default_osr
+47e7953 os/FileStore: improve debug output for sequencers
+b46cdc0 os/ObjectStore: fix leak ref on Sequencer_impl
+10b00f0 os/FileStore: assert Sequencer is passed in; kill default_osr
+fedf360 osd: queue all transactions under an osr
+d5f8c8e rgw: unit test for testing rgw_obj encoding correctness
+583a295 rgw: json dump rgw_obj::orig_obj
+988979c rgw: transform from cls_rgw_obj_key to rgw_obj correctly
+470afb9 rgw: rgw_obj encoding fixes
+8e13d89 systemd: eliminate ceph-rgw tmpfiles.d file
+367c794 systemd: no need to preprocess ceph-osd at service
+8453a89 systemd: set nofile limit in unit files
+1ff51a2 osd: drop default map cache size from 500 -> 200
+8678a8a kill /etc/security/limits.d/ceph
+adb8478 include/inline_memory: out-of-bounds read on unaligned memory
+0b03b32 tools:remove the local file when get map failed.
+c2a9764 mon: do not return ref to MonOpRequest::get_session() caller
+d99e689 mon: fix MonSession leak when waitlisting op
+2f663d9 rgw: make radosgw-admin user rm idempotent
+fef7142 ceph: fix rename into sub-directory check
+40c3c85 mon: debug MonSession refs
+640ebbf mon/PGMonitor: fix use-after-free in stats ack
+68ecc55 mon: do not leak ref creating MonOpRequest
+3a7d91d msg/simple: debug refs on sent messages
+7d112c6 mon/MDSMonitor: drop incorrect m->put()
+af39f98 .gitignore: ignore src/ceph.tmpe
+98302ad gmock: ignore *.pyc
+c57e868 rocksdb: ignore m4
+51abff1 ceph.spec: respect CEPH_EXTRA_CONFIGURE_ARGS
+4a5a5b3 qa/workunits/cephtool/test.sh: make mds epoch check more tolerant
+d33fea5 sd/PG: tolerate missing pgmeta object
+f15d958 osd: allow peek_map_epoch to return an error
+ff9600a osd/ReplicatedPG: remove stray debug line
+6e85433 AsyncMessenger: Kepp file_lock hold when accessing its event field
+4dea76e ceph.spec: include /etc/sysconfig/ceph
+8657081 doc: remove references to default data/metadata pools
+c3d23ca ceph-common: explicitly trigger /run/ceph creation
+ea91c4e systemd: tmpfiles.d in /run, not /var/run
+c92d13b debian: fix location of tmpfiles.d
+7384a14 debian/rules: install systemd files and /etc/default/ceph
+cd4bd80 Makefile.am: include etc in tarball
+c8bfc35 ceph.spec: install /etc/sysconfig/ceph
+3aa38bc make /var/run/ceph 770 ceph:ceph
+e7837d1 ceph.spec: make /var/{lib,log,run} owned by ceph
+f167e8d .gitignore: radosgw-object-expirer
+3826203 compat: define O_DSYNC for FreeBSD
+2dbf201 compat: define XATTR_CREATE/XATTR_REPLACE for FreeBSD
+c4fe266 compat: don't reinterpret_cast NULL
+b433083 compat: include proper headers for FreeBSD
+be595df compat: include compat.h for definition of ENODATA
+80655e1 memstore: don't use thread_local on non-linux OS
+e5a8022 ceph.spec.in: ceph-radosgw should require ceph-selinux
+338bd3d selinux: Update policy for radosgw
+645f3e9 mds: For journal, set write iohint flags.
+217e424 logrotate: ignore postrotate error for radosgw
+aa4a5b0 logrotate: ignore exit status 1 from killall
+fa6e4ff osdc/Journaler: add write_ihont filed which record the write fadvise flags.
+7e3c7a4 osdc/Journaler:: For read operation add fadvise-dontneed flag.
+4b45e6d osdc/Objecter: pass extra_ops of read_full into read func.
+9b44fab osdc/Filer: make read/write support iohint flags.
+7c09e50 osdc/Objecter: make sg_read/write support iohint_flags.
+f4498f5 osd:the fuction osd::shutdown Lock failed.
+d7f1d70 memstore: don't encode/decode 'size_t'
+6359f3a cls_numops: don't include asm-generic/errno.h directly
+648c704 os: disable newstore when configure --without-libaio
+d4eeb9b rocksdb: recognized --without-tcmalloc
+a795c88 common/buffer: add the move constructor for bufferlist
+1b43d81 common/buffer: fix the const-ness of bufferlist::contents_equal()
+8ed7242 test/bufferlist: do not expect !is_page_aligned() after unaligned rebuild
+6504e37 common/buffer: add bufferlist::const_iterator
+5a2b688 src/.gitignore: add some more files to be ignored
+1aaccd7 librbd/test_internal.cc: fix -Wsign-compare
+99ea120 objectstore/store_test.cc: fix -Wsign-compare
+d9d4989 Transaction.c: replace deprecated function call
+6e97b0f rbd.cc: fix -Wreturn-type, return result from do_metadata_remove()
+2767736 osdc/Objecter.cc: prefer ++operator for non-primitive iterators
+0297ce1 osd/ReplicatedPG.cc: prefer ++operator for non-primitive iterators
+120071b osd/OSD.cc: prefer ++operator for non-primitive iterators
+a1be9ef NewStore.cc: prefer --/++operator for non-primitive iterators
+534fdd5 ErasureCodeShecTableCache.cc: prefer --operator for non-primitive iterators
+4a2377c rgw_object_expirer_core.cc: remove left over unused variable
+b229162 client/Client.cc: fix memory leak, free sgids in error case
+e243aa8 mds/MDSDaemon.cc: fix resource leak in MDSDaemon
+a29dd45 client/Client.cc: remove only once used variable
+7a7248d ConfigKeyService.cc: move assert before first deref
+742feec mds/MDCache.cc: fix unitialized variable
+8e07c8c rbd: missing return statement within do_metadata_remove
+82ba048 common: add debug option to deliberately leak some memory
+c4401ad test/Makefile-client: ship LibradosTestStub.h in tarball
+90f1d25 msg/simple: start over after fails to bind a port in specified range Fixes: #13002 Signed-off-by: xie.xiexingguo at zte.com.cn
+24b20a8 client: use faked inode number when sizeof(ino_t) < 8
+71909b6 doc: rgw: update x-amz-request-id status
+c503e97 rgw: include RequestId as part of the Error response
+94d84cc test: mon/mon-ping.sh: make sure 'ceph mon ping' works as expected
+6907778 ceph-objectstore-tool: add mark-complete operation
+06147dd rgw: preserve all attrs if intra-zone copy
+293d12a test/Makefile.am: run mon/mon-scrub.sh as part of checks
+6ceb37d test: mon/mon-scrub.sh: port clashed with other tests
+897f074 test_async_compressor.cc: prefer ++operator for non-primitive iterators
+9d9b305 os/KeyValueStore.cc: prefer ++operator for non-primitive iterators
+8810f8f SnappyCompressor.h: prefer ++operator for non-primitive iterators
+4f98dab client/Client.cc: fix realloc memory leak
+6f9ee79 ReplicatedPG,Objecter: copy_get should include truncate_seq and size
+797caae test/librados: add test case for read object (off=0,len=0).
+19a210a osd: make read(off=0,len=0) ec-object work.
+cddca59 interval_set: add lower_bound(T k) member function
+95bd3c2 test: Fix failure test to find message anywhere in stderr
+b968fb3 rados: Fix usage for "notify" command
+d741352 AsyncMessenger: add instance name in debug log when processing msg
+95685c1 rgw:add --reset-regions for regionmap update
+3ccc3bb librbd: diff_iterate needs to handle holes in parent images
+d5650c9 tests: new test case for librbd diff_iterate over discard extents
+d32a3be qa/workunits/rados/test_alloc_hint.sh: sudo to ls files
+ab4232b rgw: init_rados failed leads to repeated delete
+e48cec3 mon: disable gmt_hitset if not supported
+02f4461 test: mon: mon-scrub.sh: test 'mon scrub'
+8c2dfad osd: force promote for ops which ec base pool can't handle
+70d3108 mon: MonitorDBStore: make get_next_key() work properly
+07a64b9 ceph.spec.in: refrain from duplicating %{_sbindir}/rcceph
+e808904 tests: drop docker-tests.sh root and /dev support
+bfde30d tests: CentOS 7 needs systemd-container
+27cf257 rgw: add delimiter to prefix only when path is specified
+d853839 refine tests for metadata ops
+e6fbe53 improve error handle of rbd metadata operation & format output
+68d47f2 fix metadata loading error if we open an image
+438b4e4 msg: we should set the socket options before connect or listen in order to have it take effect.
+3aefd91 erasure-code: fix gf-complete warnings
+82b0243 qa/workunits/post-file.sh: sudo
+bfe359a osd: dump full map bl at 20 when crc doesn't match
+351d957 doc: fix the typo in command example
+7d781f7 doc: 'ceph --admin-daemon ...' -> 'ceph daemon ...'
+404dd16 tests: base gmock class support for librbd
+e8749b2 librbd: support templating of ImageCtx for async state machines
+1c522be ceph.spec.in: put distro conditional around Group:
+0d18f9b librados_test_stub: add mock class for IoCtx operations
+e267128 ceph.spec.in: fix lttng/babeltrace conditionals
+b0714c3 rgw: don't append empty ETag HTTP header.
+406b1d0 doc: Add pgcalc tool link in placement-groups document
+f850d05 rgw: improve debugs in RGWPutObj and RGWPutObj_ObjStore_SWIFT.
+ab430f1 erasure code: shec performance optimization with decoding cache
+58ea8be logrotate.conf: Simplify log files reopening after log rotation
+3dc29de mon: fix the build with boost 1.59
+c0a6218 librados_test_stub: add reference counting to pools
+5755370 gtest: enable use of TR1 tuples
+cfe8fa2 rbd: export diff needs should write chunks in-order
+3fec9da librbd: migrate diff iterate to new OrderedThrottle implementation
+eceadee Throttle: added new OrderedThrottle class
+f6f442d osd/ReplicatedPG: ProxyWriteOp::reqid should not be a ref
+15fa138 osd/osd_types: init coll_t::removal_seq in all ctors
+c3d3794 test_cls_numops: fix iterator use
+1b9fbff Fix casing of Content-Type header
+2d1d2ef debian: package radosgw-object-expirer in radosgw deb
+fbf4e6b ceph.spec: package new rgw files
+3ce06e1 ceph-disk: set ownership of newly mapped dm device
+0f974a3 ceph-disk: use async upstart job for trigger
+bde6ab3 ceph-disk: is_upstart()
+b092bd0 ceph-disk: use blkid for get_partition_{type,uuid}
+6d459c0 CMakeLists.txt: add newstore files
+86da373 .gitignore: ignore build (usually used by cmake)
+64a0f0c rgw/Makefile.am: ship rgw_object_expirer_core.h
+df44a57 ceph.spec: build requires cmake
+322ad80 debian/control: build requires cmake
+f69498f debian/control: build-requires libboost-regex-dev
+50bc48b tests: ceph-erasure-code-corpus must test SIMD variants
+ef74184 ceph.spec.in: fix License line
+fab0a3b cmake: install crushtool to destdir/bin
+4cea74a cmake: add blkid as dependency to libcommon
+57b47b4 cmake: Changed name of crc32 target to crc32c
+18d6196 cmake: Added shell script tests
+55b7c86 cmake: Fixed HAVE_BETTER_YASM_ELF64 variable
+74a9a03 cmake: Removed trailing spaces from isa .s files
+1397f66 cmake: Uncommented erasure-code/shec conditional
+283e81c cmake: Removed traces of CDS, minor cmake fixes
+66ea78a cmake: Fixed rbd_replay build issue
+5268b3d cmake: Removed scripts, check_PROGRAMS included
+8ddca17 cmake: Cleaned up syntax for make check targets
+bf82c65 cmake: check_TESTPROGRAMS tests running
+6e7fafc README.md: Add basic CMake instructions
+d506bf1 vstart: add -c argument to radosgw-admin commands
+e54f896 ceph.spec.in: drop redundant centos from conditionals
+75f2a98 ceph.spec.in: clean up suse_version conditionals
+929ca5b ceph.spec.in: drop lsb-release dependency from ceph-common
+557e581   mon/MonClient: fix error in 'ceph ping mon.id'   Fixes: #12442
+f65267c rgw : setting max number of buckets for users via ceph.conf option
+64962aa qa/workunits/rados/test_alloc_hint.sh: sudo to list files
+75d9f58 osd/ReplicatedPG: use apply_ctx_stats() everywhere
+eb2993a osd/ReplicatedPG: create apply_ctx_stats() helper
+9bf103c osd/ReplicatedPG: snaptimmer: adjust stats through ctx->delta_stats
+3626db4 rgw: don't copy delete_at attr, unless it's intra region copy
+a69a989 rgw: objexp shards index by key
+fa347d8 rgw: delete-at and delete-after also on obj put / copy
+65dcc2d osd: When generating past intervals due to an import end at pg epoch
+cabfe13 osd: check the length of the map before accessing the first element
+ddca321 rbd: add verbose error reporting to merge-diff tool
+6d80ff1 tools: fix do_autogen.sh -R
+c938d1f rocksdb: fix 32-bit build
+9cc1055 AsyncConnection: Close connection when unregistered connection met WAIT
+67f5f52 memstore: fix the build on i386
+89aacaf doc: add the doc for min_write_recency_for_promote
+b02cc06 AsyncConnection: Don't use unsafe feature as message encode feature
+7bfb7f9 librbd: do write_full for whole object write
+b199c49 ceph-osd-prestart.sh: fix osd data dir ownership check
+023c517 vstart.sh: enable all experimental features for vstart
+3a41ef4 ms/async: log message tx/rx at level 1
+2ca2c1b osd, test: Minor clean-up from fast-read and error handling ec changes
+cbe85ec doc: fix the code-block in ruby.rst
+d015d23 osd: sparse reads returning invalid extent map
+7c00bf0 cmake: update FUSE_INCLUDE_DIRS to match autoconf
+c13bb7a os/newstore: fix swarning
+b7c5bd1 ceph_test_keyvaluedb: add simple commit latency benchmark
+05d79b6 os/newstore: update todo
+eab4d53 do_autogen.sh: build static rocksdb by default
+caf28fe rocksdb: update alt dist rule
+9d1582d ceph_test_objectstore: make OMapIterator test work with FileStore
+1fa2ef2 ceph_test_objectstore: enable newstore tests
+9050486 rocksdb: update to 3.11.2
+0d463ff os/RocksDBStore: make other rmkey match
+d6b0e53 os/RocksDBStore: fix rmkey()
+522f850 ceph_test_keyvaluedb: some simple KeyValueDB unit tests
+f3ddb75 os/newstore: fix end bound on collection_list
+c37b06d os/newstore: flush object before doing omap reads
+faca5d0 os/newstore: add 'newstore backend options' to pass options to e.g. rocksdb
+094a190 os/newstore: change escaping chars
+79799ca os/newstore: trim overlay when zeroing extent
+15382c5 os/newstore: tolerate null pnext to collection_list()
+a1f0bdb os/newstore: fix collection range for temp objects
+404cdd2 os/newstore: Implement fiemap
+8ad6b9d os/newstore: make sync/async submit_transaction optional
+35821d3 os/newstore: renamed TransContext::fds -> sync_items
+92979d7 os/newstore: queue kv transactions in kv_sync_thread
+22a6a9f os/newstore: process multiple aio completions at a time
+9c2eb28 os/newstore: clean up kv commit debug output
+90e7f5e os/newstore: only ftruncate if i_size is incorrect
+4c15520 Revert "os/newstore: avoid sync append for small ios"
+e89b247 os/newstore: avoid sync append for small ios
+668c277 rocksdb: fallocate_with_keep_size = false
+08f3efb Revert "os/NewStore: data_map shouldn't be empty when writing all overlays"
+02d0ef8 os/NewStore: delay the read of all the overlays until wal applying
+e3abf24 os/newstore: fix deadlock when newstore_sync_transaction=true
+cdc652e os/NewStore: fix the append of the later overlays when doing combination
+36ed3dd os/Newstore: flush_commit return true on STATE_KV_DONE
+e02e743 os/NewStore: avoid dup the data of the overlays in the WAL
+6399f1d os/newstore: fix multiple aio case
+2a7393a os/newstore: more conservative default for aio queue depth
+37da429 os/newstore:close fd after writting with O_DIRECT
+65055a0 os/NewStore: need to increase the wal op length when combining overlays
+df239f0 os/Newstore:Fix collection_list_range
+4c9e37d os/newstore: fix race in _txc_aio_submit
+1173300 os/newstore : Do not need to call fdatasync if using direct.
+c552cd2 osd/NewStore: fix for skipping the overlay in _do_overlay_trim
+793dcc3 os/NewStore: combine contiguous overlays when writing all the overlays
+29ba720 os/Nestore: batch cleanup
+4eca15a os/newstore: fix _txc_aio_submit
+41886c5 os/newstore: throttle over entire write lifecycle
+b1136fb os/NewStore: data_map shouldn't be empty when writing all overlays
+a165fe8 os/NewStore: clear the shared_overlays after writing all the overlays
+dffa430 os/NewStore: don't clear overlay in the create/append case of write
+f9f9e1b os/newstore: debug io_submit EAGAIN
+dd79b4d os/newstore: release wal throttle when wal completes, not when queued
+715fd3b os/newstore: todo
+3b66712 os/newstore: move toward state-machine
+2317e44 os/newstore: use aio for wal writes, too
+e580a82 os/newstore: a few comments about wal
+5d8e146 os/newstore: combined O_DSYNC with O_DIRECT
+b7a53b5 os/newstore: basic aio support
+ba0d8d7 os/Newstore: add newstore_db_path option
+143d485 os/newstore: throttle wal work
+efe218b os/newstore: show # o_direct buffers in debug output
+7e1af1e os/newstore: use a threadpool for applying wal events
+dfd389e os/newstore: rebuild buffers to be page-aligned for O_DIRECT
+552d952 ceph_test_objectstore: fix omap test cleanup
+04f55d8 os/newstore: use fdatasync instead of fsync
+1321b88 os/newstore: update todo
+6587783 os/Newstore: Check onode.omap_head in valid() and next()
+1a97fd6 Use .str() to output a stringstream.
+9d0e925 os/Newstore: Allow gap in _do_write append mode
+5e9c64b Implement get_omap_iterator
+c864102 os/KeyValueDB: Add raw_key() interface for IteratorImpl
+b595aac test/store_test Add get_omap_iterator test cases
+ca9bc63 os/newstore: drop sync()
+d57547f os/newstore: drop sync()
+205344d os/newstore: drop flush
+f93856f os/newstore: drop sync_and_flush
+28bc4ee os/newstore: use FS::zero()
+c67c9a2 os/newstore: use O_DIRECT is write is page-aligned
+5539a75 os/newstore: pass flags to _{open,create}_fid
+48f639b os/newstore: drop unused FragmentHandle
+93fa4f1 os/newstore: do not call completions from kv thread
+86a3f7d os/newstore: let wal cleanup kv txn get batched
+ec21f57 os/newstore: fix off-by-one on overlay_max_length
+f9a7fd4 os/newstore: use lower_bound for finding overlay extents in map
+66aae98 os/newstore: use overlay even if it is a new object or append
+0981428 os/Newstore:Change assert in get_onode
+97bda73 os/newstore: open by handle
+8f2c2bf os/newstore: use fs abstaction layer
+ef420ba os/newstore: cap fid_max below newstore_max_dir_size
+59cd761 os/newstore: keep smallish overlay extents in kv db
+2af1e37 os/newstore: assigned unique nid to each new object
+713c698 os/newstore: consolite collection_list to a single implementation
+a4d2a53 Clear removed_collections after reap
+d8351a8 os/newstore: ref count OpSequencer
+fbf3d55 os/newstore: send complete overwrite to a new fid
+db87e42 os/newstore: clone omap
+d0a4bba newstore: initial version
+10c0bfe vstart.sh: debug newstore
+be93b09 Revert "os/Makefile.am: add os/fs/XFS.cc"
+32331ed os/Makefile.am: add os/fs/XFS.cc
+32446ff tests: ceph-disk: dmcrypt simplification
+b226fad ceph-disk: systemctl restart the ceph-disk@ service
+00e6534 ceph-disk: be a bit more verbose
+35c9962 ceph-disk: only check partition_type if partition
+fcae145 ceph-disk: fix dmcrypt_map() usage for LUKS activate
+c14c317 ceph-disk: add trigger subcommand
+3662a22 udev: use ceph-disk trigger ... with single set of udev rules
+f1b80e9 systemd: consolidate into a single ceph-disk at .service
+ee20404 osdc/Objecter: optimize Objecter::tick.
+08296dc rados: make 'rados bench' support json format output Fixes: #12864 rados bench add '[--format json]' and '[-o | --output outfile]' support. output option only take effect in json format. now we can use the bench result draw performance graph easily.
+f420fe4 mds: fix shutdown while in standby
+80f10e3 osdc/Objecter: remove the unuseful code.
+7cc963b osdc/Objecter: Don't forget call _op_cancel_map_check when cancel linger op.
+36b6271 osdc/Objecter: In _cancel_linger_op, it should make num_unacked/num_committed decrease.
+076bad9 ceph_test_rados_api_aio: add a test for aio_sparse_read
+4d49206 ceph_test_rados_api_io: add tests for sparse_read
+5ae2e7a ceph_test_rados: also send sparse_read in ReadOp
+a5bfde6 osd: should use ec_pool() when checking for an ecpool
+700d42e osd: translate sparse_read to read for ecpool
+28324fd osd: Fix the diagnostic logging mostly to dout(20)
+e47fa67 vstart.sh: add --mon_num --osd_num --mds_num --rgw_port option add these options to replace shell var MON, OSD, MDS to be more convenient, and add --rgw_port option.
+064e858 osdc/Objeter: When cancel op, decrease num_unacked/num_uncommitted.
+e4ce619 osdc/Objecter: For func op_cancel_writes it can directly call op_cancel.
+89f0112 Objecter: Take RLocker when call is_active.
+6e0f0bb ceph-disk: use /sys/dev/block/maj:min/partition to see if partition
+403144f ceph.spec: package cls_numops
+d05e531 doc: update ruby doc with the aws-sdk gem usage
+13668e6 client: set osdmap epoch for setxattr.
+fe8b1c9 in filestore, OP_SETATTR is implemented in FileStore::_setattrs
+109e5b1 make: do not compile XFS.cc if --without-libxfs
+8ef0742 os/fs: include <sys/mount.h> on osx
+491d893 test_c_headers: don't use -Werror option for clang
+d36e514 ceph: use 'sed -ie' to edit file in-place
+68db9f6 client: fix compile error on OSX
+0b94867 common/blkdev: fix complie error on OSX/FreeBSD
+8d527d4 common/admin_socket: fix compile error on OSX
+2cd7d4f tests: only use posix_fadvise on linux
+c092b4f os/chain_xattr: set CHAIN_XATTR_MAX_NAME_LEN according to max length of xattr name
+b06838a test/admin_socket: check error message according to OS
+f6fa4a2 compat: move definitions in porting.h into include/compat.h
+6a03fa5 TestLFNIndex.cc: don't use "cp --preserve=xattr"
+126ee7d tests: replace std::tr1::shared_ptr with ceph::shared_ptr
+e71269c tests: disable unittest_blkdev on OSX
+9e37a37 test/librados: replace sem_init() with sem_open()
+b82ed61 buffer: make buffer::exception classes undefined in dynamic objects
+c57ef8c test/librados: replace errno -125 with -ECANCELED
+4132805 client: convert XATTR_{CREATE,REPLACE} to CEPH_XATTR_{CREATE,REPLACE}
+5db6915 xattr: convert ENOATTR to ENODATA on DARWIN
+731f6aa test_libcephfs: disable flock test on OSX
+ad056c1 libradosstriper/striping.cc: include "include/types.h"
+0786919 test_c_headers: don't use -Wold-style-declaration option for clang
+bcbddab libcephfs: define loff_t as off_t on OSX
+b800303 tools: link ceph-client-debug to LIBCLIENT
+bb1fa7f init-ceph: check if /lib/lsb/init-functions exists
+4594adb init-ceph: don't use procfs to check if daemon is running
+69e2060 init-ceph: replace 'echo -n' with printf
+4536cb5 vstart.sh: use portable way to get ip address
+e92aaea vstart.sh: append ceph library path to {LD,DYLD}_LIBRARY_PATH
+05fbfd1 On Darwin: subfix of dynamic library is dylib
+a558916 porting.h: add TODO comment
+4ec4177 Makefile: add porting.h to dist tarball
+760f587 Don't use '--exclude-libs' linker option on DARWIN
+d5c43d9 client: don't try trimming kernel dcache on DARWIN/FreeBSD
+aa71c20 client: don't include unsupported mount options on DARWIN/FreeBSD
+f064e90 Link ceph-fuse to fuse on DARWIN
+44458db msg: fix encoding/decoding sockaddr_storage on DARWIN/FreeBSD
+11a936e librbd: Need to include errno.h on DARWIN.
+cd93656 porting.h: add porting.h for porting/compatibility on DARWIN
+fb1b6bc tools/ceph_objectstore_tool: Missing O_LARGEFILE on DARWIN
+0fbacb3 test: Fix error with clang on DARWIN (LLVM 3.6.0svn)
+126c327 rbd-replay: Different location of endian.h on DARWIN
+b3b29c0 rbd-fuse: Add position to set/get xattr on DARWIN
+17db469 os/FileStore: fail with ENOTSUP if using sparse files on DARWIN
+630da9f librbd: Include porting header
+4cfac6c common/xattr: Split out xattr on Linux and DARWIN.
+e69c115 common/util: include sys/param.h and mount.h on DARWIN
+d549f41 common/SubProcess: fix build on DARWIN
+f023422 ceph_fuse: Implement set/get xattr with position parameter on OSX
+15f8363 msg/async: Include porting.h for MSG_*
+7e1d83b msg/simple: Move MSG_ and SO_NOSIGPIPE into porting.h
+dbfac28 bug fix: librados segmentation fault, support RadosStriperImpl::aio_read() method
+90dea96 mds: add osdmap epoch for setxattr of MClientRequest.
+e20195d mds/Server: s/mds->mdcache/mdcache.
+62e1593 KeyValueStore: Fix broken assert statement
+12ebb73 KeyValueStore: Fix getattrs nonexist object need return -ENOENT
+7841455 Mon: Make ceph osd metadata support dump all osds
+3c8ac54 bug fix: librados segmentation fault, extra modify supports aio_xxx() methods
+3a6c246 ceph-disk: fix dmcrypt typo
+d8970f7 ceph-fuse: warn and shut down when there is no MDS present
+a0ea0df ceph_fuse: Adding CEPH_FUSE_NO_MDS_UP flag
+bb96be0 ceph_fuse: Adding fuse_require_active_mds option
+9b815ed configure.ac: check for libboost_random-mt also
+49bd8a8 mailmap: Jevon Qiao affiliation
+6f562c6 mailmap: Jiaying Ren affiliation
+8bd1ac0 mailmap: Jean-Rémi Deveaux affiliation
+2f92eba mailmap: Gaël Fenet-Garde affiliation
+45af3da mailmap: Arthur Gorjux affiliation
+2766443 mailmap: Abhishek Dixit affiliation
+3368f01 mailmap: Vikhyat Umrao affiliation
+a7004d7 mailmap: Ira Cooper affiliation
+aab2583 mailmap: Krzysztof Kosiński affiliation
+552ad88 mailmap: Joe Handzik affiliation
+dbdf48a mailmap: Takanori Nakao affiliation
+94bbd13 mailmap: Shotaro Kawaguchi affiliation
+33f8693 mailmap: Zhi Zhang affiliation
+827fbce mailmap: Yannick Atchy Dalama affiliation
+ffb36bd mailmap: Wu Xingyi affiliation
+c482025 mailmap: Valentin Arshanes affiliation
+33a75f1 mailmap: Thomas Laumondais affiliation
+0f849a8 mailmap: Shawn Chen affiliation
+8e1d9f8 mailmap: Sebastien Han affiliation
+d4f8a5b mailmap: Robin Tang affiliation
+805dcc9 mailmap: Pierre Chaumont affiliation
+6889f35 mailmap: Nicolas Yong affiliation
+6e2cde7 mailmap: Maxime Robert affiliation
+3d4db7e mailmap: Lucas Fantinel affiliation
+8bafdc5 mailmap: Kévin Caradant affiliation
+bb5784d mailmap: Jordan Dorne affiliation
+7040be2 mailmap: Guang Yang affiliation
+1395b51 mailmap: Germain Chipaux affiliation
+593b1a1 mailmap: Gabriel Sentucq affiliation
+f453231 mailmap: Clement Lebrun affiliation
+384cf19 mailmap: Claire Massot affiliation
+c7c59b2 mailmap: update h3c organization mailbox
+5ce7ed1 ceph-disk: integration tests for multipath
+d447098 ceph-disk: implement workunit
+b04bfd1 tests: remove dead scripts
+8c586e6 ceph-disk: CentOS 7 is systemd
+796a140 ceph-disk: implement list --format json
+5fd9486 ceph-disk: fix dmcrypt typo
+60c22a9 ceph-disk: cosmetic: setup_logging function
+982591a ceph-disk: cosmetic: argparse functions
+38d0e7b ceph-disk: use sys.argv instead of implicit
+9a71816 tests: obsolete ceph-disk root tests
+d4869ac ceph-disk: add multipath support
+7e5a69b ceph-disk: is_held must ignore multipath devices
+a101418 ceph-disk: rework get_partition_{type,uuid}
+77ff7c3 ceph-disk: multipath support for split_dev_base_partnum
+3bc95df ceph-disk: multipath support for is_partition and list_partitions
+2fca91e ceph-disk: --verbose shows a stack trace on error
+aac8971 ceph-disk: replace partx with partprobe
+0e34742 ceph-disk: is_mpath predicate for multipath devices
+f9cbd79 tests: ceph-disk tests may use system ceph-{mon,osd}
+42ad86e udev: add devicemapper to partuuid-workaround
+cc21514 ceph-disk: {CentOS,RHEL} >= 7 && Fedora >= 22 are systemd
+a895982 common: 'enable experimental data corrupting features' now understands '*'
+a3fc6e8 CMake: update for boost_random
+afa92e5 common/SubProcess: silence compiler warnings
+cfcacb8 mon: LogMonitor: handle boolean options consistently
+fbbe5b0 mailmap: make h3c mailmap more robust
+51e6b71 mailmap: sort {organization,mail}map
+c901e85 doc:radosgw: correct typos of the command removing a subuser
+e92d2f3 h3c mail organization map
+a9c1601 cls: Fix successful return found by compiler warning
+70e000a test: Fix to expect no errors on 1 bad shard and errors with 2 bad shards
+d3b06ed test: Fix comment in test-erasure-eio.sh
+c09c119 osd: Send reads to other shards if erasure coded chunk reads fail
+5bfa75c osd: Drop errors if enough copies are good redundant reads come in
+a7c6b6a test: Adding testing of shard with incorrect size
+bbdae53 test: Enable EIO test code but expect error instead of osd crash
+f3eea4a test: Fix incorrect syntax in check for subread all feature
+21e9f69 osd: Check CRC when able to on async read
+08f81a9 osd: Check for EC decode errors, though none are possible at this time
+da2987d osd: Fix ECBackend to handle mismatch of total chunk size
+4501492 common, osd: Remove osd_read_eio_on_bad_digest config variable
+1febe89 osd: Avoid confusion by changing EC decode total_chunk_size to total_data_size
+ae1df24 osd: Fix admin socket help output
+54090f1 mon: add a configuration for default fast read setting
+5eb2a77 mon: add a new pool setting to configure fast read for EC pool
+131214d ec: add support for fast read on PGBackend/ECBackend async read
+9db8122 rgw: lock obj expirer shards when processing
+c4a9a4b rgw: objexp related fixes
+e734b0a radosgw-admin: a new command to run objects expirer
+3dbea3c rgw: rename obj expiration hint oids
+eee424c rgw: init object expirer thread
+478b14e rgw: verify Swift object lifetime at POST.
+f2f23c2 rgw: implement object_is_expired function.
+65949bd rgw: add support for X-Delete-After HTTP header of Swift API.
+aa5f1b8 rgw: a few fixes, guard bufferlist decodes
+4f9a843 rgw: add basic support for X-Delete-At header of Swift API.
+2bc5a48 osd: Decode use_gmt_hitset with a unique version
+38465f0 osd: refuse to boot if any pre-hammer or old hammer (<v0.94.4) are running
+f668c6c mon: use HAMMER_0_94_4 feature to require sufficiently new hammer
+470f970 include/ceph_features: define HAMMER_0_94_4 feature
+14e02bc PG::handle_advance_map: on_pool_change after handling the map change
+faac0b1 rgw: create a worker thread for object expiration
+cdce7a2 rgw: integrate Swift object expiration-related things with CMake.
+05c90e6 rgw: split rgw-object-expirer.
+1fa376c rgw: make object removal atomic in rgw-object-expirer.
+3b0636e rgw: make the rgw-object-expirer's options more human readable.
+f572430 rgw: move objexp pool creation into separate function.
+db27ea9 rgw: add garbage collector daemon for expired objects.
+0d792c9 rgw: add support for object expiration in rgw_rados.cc.
+7675aca cls: add timeindex class for radosgw's objects expiration.
+9f47613 rgw: define attribute for storing object expiration info.
+20c7652 ceph-object-corpus: remove hammer foo and bar coll_t's
+4a1cb82 test/encoding/readable: handle nondeterministic items in corpus too
+f078a67 ceph-object-corpus: add 0.94.2-207-g88e7ee7 hammer objects
+5a4f6a8 osd: do not let OSD_HITSET_GMT reuse the feature bit
+d43c10e memstore: use thread_local page vector
+4e6548c memstore: PageSetObject for MemStore integration
+3eb36fc doc/release-notes: v0.94.3
+f5df1e4 osd: separate filter init from construction
+2777438 test: add a test for filter in cls hello
+60d51fc cls: add a filter to the hello class for testing
+e749b21 objclass: enable unregistering filter factory
+e2c3c86 rgw: add key parameter conflict check for radosgw-admin command line.
+328d30c AsyncConnection: Fix uninitialized variable compile warning
+24f4d22 TestMsgr: Fix forever hang under lossless policy and one is WAIT another down
+10fd1a5 test: don't unmount when no store is created
+83ba597 test: handle the case when ObjectStore::create returns NULL
+6a735d6 test: disable newstore test until it's merged
+743b15b debian: /var/run/ceph should be owned by ceph:ceph
+48f98e1 upstart: setuser ceph
+c860553 debian/ceph-common.dirs: install /var/lib/ceph
+94da8c1 debian/ceph-common.postinst: fix adduser, addgroup
+7c96016 debian/ceph-common.postinst: fix /var/log/ceph permissions
+18e0c77 debian: rename ceph-common.postinst
+7cd0749 PendingReleaseNotes: more notes about the 'ceph' user
+b89d752 global_init: ignore --set{user,group} if not root
+aef00eb ceph-disk: fix get_ceph_user
+960139e PendingReleaseNotes: some notes about upgrade and ceph user
+8f3185b systemd: use --setuser and --setgroup for all daemons
+28fdac3 global: implement setuser_match_path
+09db67f ceph-disk: set owner of created files to ceph
+52e978e Set keys owner to ceph user if exists.
+8bd35bd Set Ceph device partitions owner to ceph user in udev.
+bbedc8e ceph-osd-prestart.sh: ensure data dir is root or ceph before start
+25f68ae init-ceph.in: Set ceph user and group when running the daemons
+ceb93e8 ceph.spec.in: User and group must be created in ceph-common pre-install script
+e95904f ceph.spec.in: /var/lib/ceph is owned by ceph package
+d9df52b ceph.spec.in: Fix ceph.limits.d path
+2d4f3a91 ceph.spec.in: Fixup uid/gid setting
+596c9b6 ceph.spec.in: install ceph.limits.d
+ed0cd42 ceph.spec.in: add ceph user/group
+c7ee798 set nofile ulimit in /etc/security/limits.d/ceph only
+7c9fdf4 systemd: make ceph-osd setuid/gid to ceph:ceph
+4dfe0a8 global: add --setuser and --setgroup options
+6532e1c debian: fix /var/lib/ceph/* directory ownership
+b8893f6 systemd: chown ceph:ceph /var/run/ceph
+ec1ee5e systemd: run mon and mds as ceph:ceph
+3c56938 ceph.spec: chown and chmod /var/lib/ceph and /var/log/ceph
+7522650 debian: chown -R ceph:ceph /var/log/ceph
+2ba3d61 debian: chown ceph:ceph /var/llib/ceph
+71a0a02 debian: create ceph user and group
+97aed59 rgw: delete finisher only after finalizing watches
+dd7fe61 memstore: add unit test for PageSet
+51d2553 memstore: add PageSet for MemStore object data
+b0882fb memstore: replace apply_lock with sequencer
+7945482 memstore: move collection lock into get_object
+46f92f0 memstore: BufferlistObject uses spinlock for data
+61cd2da memstore: protect object xattrs with a mutex
+54739a5 memstore: protect object omap with a mutex
+5d8307a memstore: add Object interface to hide bufferlist
+26f716e memstore: use intrusive_ptr instead of shared_ptr
+01a9a79 osbench: add multithreaded objectstore benchmark
+d7bf8cb rgw: init some manifest fields when handling explicit objs
+b610588 ceph.spec.in: remove obsolete SUSE-specific code
+df21a6e osd: expose PGLSFilter in objclass interface
+c318129 ceph.spec.in: Restart services only if they are running
+55cec07 Messenger: Fix rand() generate the same sequence numbers
+15e5ebe common: fix code format
+2d2f0eb test: add test case for insert empty ptr when buffer rebuild
+fb1b6dd common: fix insert empty ptr when bufferlist rebuild
+347ac0f ceph_test_rados_api_tier: make PromoteOn2ndRead tolerate thrashing
+8a08acc common/hobject_t: fix is_temp() off-by-one
+7cc8d86 ceph_test_msgr: parse CEPH_ARGS
+dfd142f include/inline_memcpy: use __builtin_memcpy instead of explicit ptr copies
+98c0606 include/inline_memcpy: make prototype resemble memcpy's
+fc02a8a added boost timegm impl for cross platform support
+da6d5cf osd: bug fix hit_set_map size for tier pool
+582f0f6 doc: Added "Hammer" in the list of major releases.
+855ae1f bug fix: osd: avoid multi set osd_op.outdata in tier pool
+d0386d2 ceph.spec.in: drop sysvinit-specific macros that run only on openSUSE/SLE
+699fca8 doc: Fix typo and redundant word in snaps section
+4dffc16 doc: Fix typo in writeback throttle section
+6ee7068 doc: Fix typo in writeback throttle section
+07a28d6 doc: Fix typo in recovery reservation section
+8c2f8bf osd: no bother to create RecoveryCtx if no recovery op is started
+178d4d5 osd: remove unused parameter of start_recovery_ops
+f77949f bug fix: osd: requeue_scrub when kick_object_context_blocked
+4152269 config: skip lockdep for intentionally recursive md_config_t lock
+8ef2c96 buffer: modify inline memory ops to use packed structs
+dbcaa54 uuid: use boost::random:random_device
+136242b rgw: be more flexible with iso8601 timestamps
+fd72577 tests: fixed rbd cli cram integration tests
+f20f7a2 Objecter: pg_interval_t::is_new_interval needs pgid from previous pool
+cd6ac72 librbd: error closing image while set to invalid snapshot
+622d22e osd: wait for cleanup from bench
+c5895d3 bug fix: osd: do not cache unused buffer in attrs
+888a633 doc/release-notes: v9.0.3
+d63508f make-check: support MAKEOPTS overrides.
+d2e4fe3 set skip promote flag if NOCACHE or DONTNEED flag set
+151c051 common/Mutex: avoid trylock on lock if instrumentation is not enabled
+2c7fe2b osd: remove unused ctor
+ff087f9 osd: remove useless hitset code
+cc2bcf7 mon: print use_gmt_hitset in "ceph osd pool get"
+03a1a3c mon: add "ceph osd pool set $pool use_gmt_hitset true" cmd
+42f8c5d osd: use GMT time for the object name of hitsets
+0045b8d PendingReleaseNotes: make a note about KeyValueStore on-disk format change
+0f82f46 crush/CrushTester: test fewer inputs when running crushtool
+ea8609b mon/OSDMonitor: debug why pool creation fails
+7c08c54 rgw: fix dangerous removal from STL map in filter_out_temp_url().
+88bfd79 test/erasure-code: drop directory from profile
+5df1271 do not include directory in ec profiles
+660ae5b osd: always load erasure plugins from the configured directory
+7295612 9.0.3
+271513f erasure-code: shec plugin feature
+5e99a57 mon: add a cache layer over MonitorDBStore
+2d13a47 rbd: fix the FTBFS on old boost introduced by 2050d08
+d1c4086 osd: fix the FTBFS introduced by be28319
+5fedc84 osd: consolidate encoding snaps into pg log code in finish_ctx
+fd38902 osd: only remove all the hit set objects on primary osd
+5812adb osd: no need to check current hitset object is degraded when persisting hit set
+be28319 osd: implement hit_set_remove_all
+9c325d2 skip promote if flag set when doing proxy read
+faa964d Fix compile warning unused-result
+1546d57 osd: do either flush or evict but not both in agent_work
+5eb80b8 osd: do evict before flush in agent_work
+180ca7b rgw: implement s3 encoding-type for get bucket
+d164ec1 perf_serialize: fix i386 build
+5669077 buffer: move inline memory ops to inline_memory.h; gcc + x86_64 only
+cdaf997 unittest_bufferlist: benchmark buffer::ptr::append, copy_in, copy_out
+d2d540c build/ops: add uuid-runtime to debian/control
+951a3d6 Revert "Fix compile warning unused-result"
+2050d08 rbd:remove the local file when rbd export-diff fail Signed-off-by: Bo Cai <cai.bo at h3c.com>
+69cf089 rgw: url_decode values from X-Object-Manifest during GET on Swift DLO.
+d353dbc rados.py: This module now supports omap operations
+d5a56c4 doc: update some of the outdated cache tiering doc
+de00f6d doc: add the description for min_read_recency_for_promote
+14d3a2b os/FileStore: fix version check
+e7e4aef osd: some debug output during store version upgrades
+ffa224a mon: disallow post-hammer OSDs if there are up pre-hammer OSDs
+893e00b os: drop deprecated collection_* attr methods
+71deb4b test/objectstore/FilestoreDiff: don't diff coll attrs
+b297e6d ceph-objectstore-tool: drop support for pre-pgmeta PGs
+cd4e676 osd: drop support for pre-hammer pg metadata
+b474991 ceph-objectstore-tool: require hammer upgrade
+c0f83df osd: require an upgrade to hammer first
+10cb507 os/FileStore: require upgrade to hammer before moving beyond
+4b75baa mds/MDSRank: less noisy about log_to_monitors
+5ae3e36 Fix compile warning unused-result
+73d4b7d doc: fix the format of peering.rst
+f0ca14d osd/PGLog: dirty_to is inclusive
+f021e91 os/RocksDBStore: log any options we pass to rocksdb
+c7d281b os/FileStore: ensure sync() doesn't wait forever
+4afb084 common/hobject: rename get_bitreverse_key* -> get_bitwise_key*
+1194612 os/GenericObjectMap: use bitwise hash, not nibblewise
+d94a52c common/hobject: rename get_filestore_key* -> get_nibblewise_key*
+333f3a0 rbd: fix bench-write
+c6b4f08 ceph-kvstore-tool: take a db type too
+11a069d common/hobject: make hobject_t::set_key avoid dups
+7ffb7b0 test_objectstore_memstore.sh on make check
+79da9d6 ceph_test_objectstore: test many small writes
+1c48837 ceph_test_objectstore: strengthen clone test w/ new assert
+32bcced ceph_test_objectstore: test omap clone
+cf42c01 ceph_test_objectstore: much cleanup, new tests
+4b9523f os/fs: fix open_handle when name_to_handle not present
+5cde57b os/fs: add zero / hole punch support
+c33b6fb os/fs: add simple FS abstraction layer
+d6b2e68 os/ObjectStore: drop sync(Context *onsync)
+1bb9bd9 os/ObjectStore: drop sync()
+902557e os/ObjectStore: drop flush()
+3516a28 os/ObjectStore: drop sync_and_flush()
+6b6039f osd: improve osd bench
+b339eff buffer: add prepare_iov() method
+ff4c4cc osd: drop dead suicide() code
+0615379 ceph-objectstore-tool: drop explicit sync_and_flush() calls
+de60159 ceph_test_objectstore: drop useless sync_and_flush
+202aa9c osd: sync (no flush) on osd suicide
+5837ec6 osd: drop explicit sync/flush calls before umount
+30e4978 osd: fix flush_journal command
+e7bbafa osd: explicitly use a 'meta' Sequencer
+be6c476 osd: use osr for scan_snaps
+8ac5923 os/ObjectStore: do not let Sequencer::flush_commit delete the context
+c76e3d6 os/ObjectStore: ref count Sequencer_impl
+ec60a2c os/KeyValueDB: return -ENOENT if key doesn't exist
+ed5a8d7 os/ObjectStore: pass bits to create_collection
+8cb4814 osd/osd_types: pg_t: add contains() helper to check objects is in a pg
+1523265 os/ObjectStore: return const refs for oid, cid
+dbaf837 common/WorkQueue: add wait()
+54421fe vstart.sh: allow rocksdb
+8b41618 os/RocksDbStore: init pointers in ctor
+1ca0e7c osd/osd_types: make coll_t operator== less picky
+e69ac3a os/KeyValueDB: add sipler get() variant
+ed8d3c0 common/Formatter: add dump_object helper
+6c54d57 os/FileStore: clear cached fd for renamed-over object
+5d87387 rgw: fix error handling during GET on object through Swift API.
+62bfc7a moved to use boost uuid implementation, based on commit 4fe89a7b14c97b2ed7f357132901beb2bdcec551
+041642e osd/ReplicatedPG: fix missing whitespace in scrub error message
+e4cff03 ceph.spec.in: Require awk for selinux post script
+9038488 selinux: Relabel files if and only if the policy version changed
+aa50321 cmake: add DiffIterate.cc to librbd
+15a3e86 rgw: enable perf counter for unhealthy workers
+de0b66a test: add test for the perf counter of CephContext
+5d109e9 common: support perf counter (for unhealthy workers) on CephContext
+5fa03e9 osd: expose the number of unhealthy threads from heartbeat map
+9b23392 ReplicatedPG::cancel_pull: also finish_degraded_object
+79f310a ReplicatedPG: treat object as degraded until on_global_recover
+5390072 ReplicatedPG: block writes on promote of rollback snap promotion
+35af63b ReplicatedPG: enforce write ordering on rollback
+7d4f695 RadosModel: send writes before and after rollback to verify ordering
+e7107ce Revert "osd: set the blocked_by relationship when rolling back to a degraded"
+61c86a1 test: change the test_tiering test case accordingly for proxy write
+b7225a4 osd: copy the reqids even if the object is deleted during promotion
+da68bb3 osd: purge the object from the cache when proxying and not promoting the op
+aff8aa1 osd: set the blocked_by relationship when rolling back to a degraded object
+fa5b6e6 osd: skip promotion when proxying a delete op
+98fff96 osd: rename SKIP_PROMOTE to SKIP_HANDLE_CACHE
+626c569 osd: force promote for object overwrites on a ec base pool
+bf5d51e osd: explicitly set the reqid when proxying the write op
+ff658bd Objecter: optionally setting the reqid in the mutate interface
+58dd21c osd: add reqid in MOSDOp
+0d7759f osd: turn on proxy write feature bit by default
+b9ec7e6 osd: add proxy write perf counter
+cb9390d osd/ReplicatedPG: add the proxy write feature bit support
+ab39e03 osd/ReplicatedPG: don't check order in finish_proxy_write
+7e27e61 osd/ReplicatedPG: add helper function check_for_promote
+d836a64 osd/ReplicatedPG: minor updates on proxy write
+257c851 mon: add osd pool set/get for min_write_recency_for_promote
+3d5300b osd/ReplicatedPG: promote on 2nd write
+1099ec2 osd/osd_types: add min_write_recency_for_promote in pg_pool_t
+c01d20b osd/ReplicatedPG: set the RWORDERED flag for the promote copy-from op
+90e5f41 osd: tiering: use proxy write in writeback mode
+772617b osd/ReplicatedPG: remove the peer_type assertion in eval_repop
+f8b3a40 osd: tiering: add proxy write support
+792e24b doc: ceph-osd binds to up to 4 ports, not 3
+ec4e22b rgw: Remove useless code in calc_hmac_sha1()
+3aab146 ceph-disk: only call restorecon when available
+28d3c4c Do not use hardcoded paths in spec post section for selinux commands.
+fa2203e Add dependence to selinux-policy-base to fix installation in kickstart.
+14da7e2 Update rpm spec to properly restart systemd ceph.target.
+f92b27e tools/cephfs: account for striping in size estimation
+00c56c6 mds: warn if client does not advance its oldest flush tid
+976b449 mds: re-issue caps when recovering MDS becomes active
+a1d14b7 client: re-send flushing caps (which are revoked) in reconnect stage
+aa7e796 mds: detect completed cap flush
+2d6ab6e mds: record cap flush tid in log event
+0ca7a49 mds: record completed cap flushes in session map
+36b0e3a client: include oldest pending caps flushing in cap message
+3a0c14e client: make wait_sync_caps(uint64_t) check per-session flush TIDs
+4400aa8 client: track cap flushing TIDs for each session
+dafa7d2 client: don't change flush TID when re-send cap flush
+06cf224 client: globally unique cap flush TID
+c8a83cd client: track flush TIDs for all pending flushing caps
+479f2a7 qa/fsstress.sh: fix 'cp not writing through dangling symlink'
+6387ec9 mds: properly set client incarnation
+63a0cf2 qa/workunits/cls: add workunit for cls_numops tests
+d742e79 tests: Add unit tests for CLS numops class
+d17f158 cls_numops: Add cls_numops client
+87f6b73 Add new cls_numops class for numeric operations
+0ba2e14 Revert "osd/ReplicatedPG: snapset is not persisted"
+b18558b osd/OSDMap: test_flag returns bool
+3540fb9 osdc/Objecter: restart listing this PG if sort order changes
+35c1970 osd/ReplicatedPG: fix missing set sort order on [N]PGLS
+fc61fd7 osd/osd_types: add pg_missing_t::resort() method
+517921f osd/osd_types: make pg_missing_t sort order dynamic
+a5e27de osd: refuse to boot if SORTBITWISE not set but backend cannot sort nibblewise
+97c66e3 erasure-code: Update ISA-L to 2.14
+0bb57f1 configure: Fix checking for yasm compability
+2743cc4 java: add libcommon to deps
+5afa21d java: search for JNI bits in common dirs
+af0ebee rbd:improve the error handle of rbd,check the return value.
+4eaa9ea fix print error of rados bench
+7924231 tests: tiering agent and proxy read
+e1f58fe osd: trigger the cache agent after a promotion
+333cd7d mds: check mds up before calling MDSMap::get_mds_info
+7f32a3d rbd:modify the log of purging snaps so that it is more appropriate.
+fabd635 ReplicatedPG: Don't cache recovery and scrub data
+423cf13 rgw: we should not overide Swift sent content type
+623a8e7 test/encoding/check-generated: test sorted json dumps for nondeterministic objects
+23119ff Revert "osd/HitSet: make subclasses dump deterministically"
+c8bb8a2 Revert "mon/PGMap: dump osd_epochs in deterministic order"
+fb02024 rgw: don't preserve acls when copying object
+42a8f7c rgw: cleanup dead init_bucket
+4d4fe9d crypto: fix unbalanced ceph::crypto::init/ceph::crypto:shutdown
+6c803cd ceph.spec.in: test %preun argument is zero for removal-only operations
+773b431 tools/cephfs: use xattr layout if present+valid
+bde85b7 tools/cephfs: pass layouts around in DataScan
+5f41e8d tests: do not test timeout mon add
+f2a38bf client: fix unused var warning
+553bc3f cls: load layout xattr in cephfs cls
+ce03de1 mon/MDSMonitor: fix fs reset map init
+408ada4 mds/Server: clean up code for handle_client_open.
+43529a5 common/ceph_fs: set mode default is -1 for func ceph_flags_to_mode.
+6eea794 Remove redundant line from Makefile
+e1ff09c mds: initialize 'divergent' to false when comparing inode_t
+bf21555 Change key format to preserve order of keys in backend db.
+d33ad15 Adding statfs api to KeyValueDB
+7add646 rbd:Check the dest image name, if it is empty string, refuse to execute and give a message
+9e58c62 mon: show number of PGs in flush/evict mode in command line
+d57d36d osd: add flush/evict mode in pg stats
+af2a38b mon: fix the output of cache_io_rate_summary
+b78883b tests: be more generous with mon tests timeouts
+7e6f819 doc: update rgw configuration on multiple rgw rados handlers feature
+efc8969 Doc: Correcting the default number of copies.
+6f768a7 doc: Removed reference to RAID-4
+c6cf558 CMake: cut down unnecessary linkage on rados tests
+6ee955d CMake: add missing librados nlist test
+b199ac6 doc/rados/operations/add-or-rm-mons: simplify the steps to add a mon
+6ec431b librbd: prevent race condition between resize requests
+e8a53d2 test: add case for pgls filter arg in librados
+4649ba5 librados: partially expose PGLS filter interface
+cabfd5c when the processor start ,listen_sd also can be zero.
+fec459d mon: refactor OSDMonitor::send_incremental()
+de43a02 mon: track osd_epoch of all MonSessions
+c05753e mon: track osd_epoch in MonSession
+f9368333 test/mon: add test for "mon add"
+3e18449 WorkQueue: add/remove_work_queue methods now thread safe
+ff6e975 Fixed mistaken reference to mon in osd section. Changed release name to hammer.
+65e8f19 doc: Clarify how 'ceph auth caps' works.
+efb6220 mon: Improve PGMap::generate_test_instances(), build map per incremental
+c871305 mon: Monitor: set MMonJoin's op type as service
+f04c8da added permission check based on getgrouplist
+a99e733 rbd: export-diff now issues concurrent AIO read requests
+404cab3 tests: verify that diff_iterate callbacks can abort diff
+368f632 librbd: diff_iterate should handle callback errors
+f13bdae librbd: execute multiple object diffs in parallel
+e9c78c6 librbd: move diff_iterate logic to its own class
+3e145f7 tests: increase test coverage for partial encodes/decodes
+c6d9899 common: bit_vector extent calculation incorrect for last page
+16b59c6 added autoconf check for getgrouplist
+c747e29 vstart: detect CEPH_BIN in stop.sh too
+44ed436 ceph.in: detect paths in out of tree build
+0695d17 vstart: detect and handle cmake environ
+dac84a7 vstart: enable more path customization
+64255b9 Makefile: build ceph.in the cmake way
+6545981 ceph.in: use cmake-style substitution
+af31afa init-ceph.in: set executable bit
+1f5ddbf CMake: build dencoder at src/ceph-dencoder
+570887f cmake: fix mds compilation and link
+dcf20b3 rbd: creating refused, if feature is specified when the format is equal to 1
+370e4a7 doc: remove duplicate word in Motivation section.
+1404502 cmake: remove Flat_index stuff
+2c6acfc Removing unwanted -fPIC in cflags
+e5569d4 Removing unwanted -grecord-gcc-switches , relro in cflags and adding -fPIC
+8f06d3e Enable security hardening flags globally
+6b97add test/encoding/check-generated: make error msg cut and pasteable
+34c048b osd/HitSet: make subclasses dump deterministically
+d372718 osd/HitSet: mark subclasses that encode nondeterministically
+3df6438 mon/PGMap: dump osd_epochs in deterministic order
+62f9422 test/encoding/check-generated: skip some tests if non-determinstic encoding
+2bcab03 ceph-dencoder: add 'is_deterministic' command
+9ea0522 ceph-dencoder: clean up macro name
+aaa5b75 ceph-dencoder: mark PGMap with nondeterministic encoding
+82533aa test/encoding: c++11 STL appears to make fewer copies, yay
+e06046c cmake: check for good yasm
+016a5d5 mon/PGMap: make PGMap dump osd_epochs too
+841ae13 mon/OSDMonitor: prevent old OSDs from starting if 'sortbitwise' is enabled
+383185b mon/OSDMonitor: osd set/unset sortbitwise
+f02d7e7 mon/OSDMonitor: sort bitwise on new clusters
+0031e0e osd/PG: select interval sort order based on OSDMap
+bf49bb3 osd/osd_types: pg_interval_t: a change in sort order triggers a new interval
+3092a18 cmake: check for better yasm and fix isa
+968261b osd/OSDMap: add a SORTBITWISE OSDMap flag
+c2fc74d osd/ReplicatedPG: fix filter lifecycle in do_osd_ops
+6221483 fix 'rados ls' issue if using keyvaluestore as objectstore backend
+dcd6e96 ceph-object-corpus: do not try to decode pre-hammer ObjectStore::Transactions
+b98b104 rbdnamer: drop unneccessary tr usage
+3260adf c++11: xlist needs a value_type for back_inserter
+511106e c++11: fix shared_ptr conversions to bool
+3a68080 c++11: remove hash_namespace.h and default to std
+f4d2032 c++11: disambiguate std::isnan
+11a5d29 c++11: fixes for preprocessor string concatenation
+f7c478c c++11: replace boost::assign with initializer list
+9cdc327 AsyncConnection: Fix wrong order of local message delivering
+eaa2312 osd: re-sort object_contects when the sort order changes
+a78cef4 common/shared_cache: allow comparator to be adjusted
+526b63f osd/PG: fix forced backfill when last_backfill sort order doesn't match
+c5a1709 osd/PG: starting or continuing backfill .. be more precise
+03de3b1 osd/PG: do some new interval work on init() too
+8cc861f osd/ReplicatedPG: assert map and set sort order is correct during backfill
+5ea8290 osd/ReplicatedPG: better debug on recover_backfill
+1f15146 osd/ReplicatedPG: fix comparator resets
+3c50766 osd/ReplicatedPG: dynamically vary sort order for backfill sets and maps
+1d66bbf osd: dynamically vary sort order of BackfillInterval map
+579d4f4 include/encoding: add decode_noclear() variant for map<> decode with comparator
+bee3ffe osd/PG: note NIBBLEWISE in PG debug string
+1f9e8b8 osd/PG: use hash instead of filestore key for scrub
+35b521a osd/ReplicatedPG: fix a bit of whitespace
+d90cc26 osd: add and use operator<< for BackfillInterval
+51a02db osd/ReplicatedPG: drop old hobject_t encoding compatibility
+49539ab hobject_t: conditional Comparator object
+64e00ba use explicit ghobject_t comparators
+cdffea9 use explicit hobject_t comparators
+7624125 explicitly specify comparator for all ghobject_t maps and sets
+a208832 explicitly specify comparator for all hobject_t maps and sets
+6038aef hobject_t: typed MIN_* helpers
+35f90d9 common/simple_cache: parameterize map<> comparator
+456540f common/sharedptr_registry: parameterize map<> comparator
+332e1a4 common/shared_cache: parameterize the map<> comparator
+a33895e include/encoding: handle set<> with comparator
+9b8e536 include/types: operator<< for set with comparator
+c56edf8 include/types: map operator<< with comparator
+29e140d include/encoding: map<> encoders when comparator is specified
+5e23f04 common/hobject_t: remove comparison operators for [g]hobject_t
+8063a9d osd: ignore CRUSH_ITEM_NONE when calculating interval features
+25f0afc osd: allow sort order to randomly revert to nibblewise via debug config
+a36cae1 osd/PG: do not trust last_backfill for purposes of missing objects if sort order is wrong
+26d5d7a osd: restart backfill on peers if last_backfill sort is off
+8230375 osd: use accessor to set last_backfill
+897d98f osd: set acting, upacting features from osdmap, not peers
+fd6cfa3 osd/osd_types: clean up pg_info_t comments a bit
+d5ca7e9 osd/osd_types: add last_backfill_bitwise flag to pg_info_t
+e50d2a7 hobject_t: pad hash to 8 digits
+24fdbd1 ceph_test_objectstore: call+test new collection_list variants
+48503d8 os/HashIndex: handle new bitwise sorting
+ea4fe76 os/HashIndex: handle legacy nibblewise sort
+c087d41 os/LFNIndex: pass through sort_bitwise
+bbd1044 os/FileStore: pass sort_bitwise down to CollectionIndex
+466afca os/FileStore: use bitwise sort when we don't care (e.g., split)
+7d62739 os/KeyValueStore: only support bitwise sort
+744508f os/MemStore: only support bitwise sort order
+464bd6b osd/PGBackend: ask PG which sort order to use
+c5bd034 osd: use bitwise collection_list sort when we don't care
+f459bdc qa: fix misc collection_list callers
+f0f87da ceph-objecstore-tool: use new api
+8aaa437 os: add sort type to ObjectStore interface (incomplete)
+73b3ed8 osd: add OSD_BITWISE_HOBJ_SORT feature
+f72ba6e hobject_t: restore default comparators ... bitwise!
+8d8eb2b hobject_t: nibblewise and bitwise comparators
+c589328 os/LFNIndex: return vector from list_subdirs
+c4128f1 os/HashIndex: drop unused lower_bound arg
+89cb2bb os/memstore: fix omap_rmkey_range
+654c16e os/ObjectStore: drop get_ideal_list_min()
+cdb1fc0 os: drop snapid_t arg to collection_list
+e46a855 s/collection_list_impl/collection_list/
+7ee3eef Kill collection_list in CollectionIndex
+5df688a Kill Flat_index.
+c5dc404 Kill collection_list
+2d5ed30 Kill collection_list_partial
+7165510 Kill collection_list_range
+7bf999c os/KeyValuestore:Refactor collection_list_range and collection_list_partial
+26668d6 os/CollectionIndex: use const ref for end
+921c458 os/Filestore:Refactor collection_list_range and collection_list_partial
+d171537 os/Memstore:Refactor collection_list_range and collection_list_partial
+9471bb8 Common/Thread: pthread_attr_destroy(thread_attr) when done with it When a thread attributes object is no longer required, it should be destroyed using the pthread_attr_destroy() function. Destroying a thread attributes object has no effect on threads that were created using that object.
+e3147b8 rgw:segmentation fault when rgw_gc_max_objs > HASH_PRIME
+9420d24 rgw:the arguments 'domain' should not be assigned when return false
+cd4ac1c rbd: support size suffixes for size-based options
+d1735a4 rgw: rework X-Trans-Id header to be conform with Swift API.
+278a6ae qa: add fs layout case for stripe_size decrease
+880ffe9 mds: fix setting whole layout in one vxattr
+1559d5e cmake: add global lib to rbd
+6b29233 mds: initialize InodeStoreBase::damage_flags
+5d7cb4c ceph-dencoder: add RGWRegion, RGWZoneParams, RGWOLHInfo support.
+e67539e Fix location of --access=full in SWIFT user creation
+8c53a58 ceph.spec.in: Make SELinux opt-out, not opt-in
+51aae7b client: ignore permission check when fuse_default_permissions is on
+4866d89 osd: make PGLSFilter xattr read optional
+60e903f osd: enable PGLS filters to see hobject_t
+736fe06 selinux: Add .gitignore file
+c6d6c78 ceph.spec.in: stop/start service on policy upgrade/removal
+73bf34d selinux: Update the SELinux policy rules
+03d7a65 SELinux Makefile can't work in parallel
+c014f2b ceph-disk: set selinux context
+bed5703 selinux: Allow setuid and setgid to ceph-mon and ceph-osd
+d0fd8ff Update selinux policy (after local test).
+9db80da Fix selinux context after intitial OSD mount.
+c52eb99 Add initial SELinux support
+c40df50 librados: fix empty NObjectIterator crash on comparisons
+b04bafc tests: rados striper tests use 7116 instead of 7113
+28bc30c osd/ReplicatedPG: sparse read should return (extent_map, data_bufferlist)
+0cdd77d rgw_user.h: modify interface comments.
+add3014 Revert "rbd: remove dependency on non-ABI controlled CephContext"
+aebb9e7 tools: ceph-release-notes unicode handling
+793fe52 doc: release notes for v0.94.3
+de40c40 client/Makefile: ship InodeRef
+0e69527 mds: open base inode's snaprealm after decoding snapblob
+963c524 doc: add changes to "pg ls*" commands to pending release notes
+09839d0 Allow evict operations to be throttled
+caf98c8 add agent_start/finish_evict_op to control agent_ops
+311ef97 buffer.cc: short-circuit copy_in for small lengths
+ae2396e buffer.cc: postpone crc cache invalidation in bufferlist::rebuild()
+d34e041 buffer.cc: add possibility to omit crc cache invalidation
+472f732 buffer.cc: short-circuit in append for single char
+33e1f17 buffer.cc: short-circuit in copy_out for small lengths
+e79ee92 buffer.cc: is_zero() optimization
+2219200 src/include/encoding.h: reduce amount of 0-byte appends
+03877d6 bufferlist: tuning small appends
+35e4569 FileJournal: reduce time wasted by bufferptr::zero
+10a336f ObjectStore: partially remove op_ptr.zero()
+f68553e osd/osd_types.cc: get rid of str concat when making hash key
+111ecf8 radosgw-admin: use cout not cerr to print help message.
+145364b logrotate: fix log rotation with systemd
+85cb86d doc: change "--keyfile" description in man page of rbd help
+1ca6bf6 common/hobject_t: correctly decode pre-infernalis hobject_t min
+bc0d942 CMake: add crushtool
+fe970bc CMake: fix librados build
+2355c45 CMake: fix rbd build
+22e8a29 CMake: libblkid not only needed with rbd
+57cd851 CMake: smalliobenchrbd should dep on rbd
+1f541b2 CMake: set WITH_CEPHFS by default
+a4a81c3 CMake: fix cephfs cls build
+cc35d9b CMake: fix cephfs test targets
+2761b29 CMake: add cephfs tools
+5416b1f c++11: fixes for std::pair type conversions
+3f52583 c++11: stream output operators for stream types
+5b15ea2 c++11: replace auto_ptr with unique_ptr
+2d18941 c++11: remove references to the std::tr1 namespace
+51a218c autoconf: build as c++11
+b2e484d cmake: build as c++11
+ec8433d Makefile: include systemd udev rules in tarball
+e4c6922 add help to do_autogen.sh
+e28b3d1 Allow do_autogen.sh to pass configure parameters
+217837b client: use smart pointer to track temporary inode reference
+8c45ae1 client: use smart pointer to track 'cwd' and 'root_parents'
+89648e7 client: convert Inode::snapdir_parent to smart pointer
+e7920c9 client: convert CapSnap::in to smart pointer
+dac11e6 client: convert Fh::inode to smart pointer
+fd02f0f client: use smart pointers in MetaRequest
+07f5809 client: convert Dentry::inode to smart pointer
+ad9c22a client: hold reference for returned inode
+6b17e21 osd/recover_backfill: assert(obc) when adding pg stat for backfill objects
+199352d osd/recover_primary: remove the unfound check when recovering an object
+8652a37 osd: avoid unnecessary calculation in agent_choose_mode()
+4d10dc1 systemd: fix ceph-radosgw@ service
+8a67561 ceph.spec: install the new systemd-based udev rules from ddiss
+3d6100e ceph.spec.in:Fixup renamed ceph-rgw.conf
+e3e9548 ceph.spec.in:fixup bad merge of systemd logic
+69cdfcb remove ceph-disk-{activate,prepare} wrappers
+e034c2a ceph.spec: ship ceph-disk-* units
+85a8946 systemd: activate disks via systemd service instead of udev
+ac16d83 ceph.spec: use sysvinit rbdmap script for now
+2943194 ceph-disk: map dmcrypt devices prior to activation
+4fd9cf2 ceph-disk: split get_dmcrypt_key_path from key creation
+6cfb4b3 ceph-disk: add --log-stdout parameter
+d1ce178 ceph.spec: no rcceph[-radosgw] with systemd
+6842a20 systemd: add ceph-create-keys@ service
+e283222 ceph.spec: include systemd units files
+43d6b12 system: include all unit files
+1ba4694 Autotools to install systemd unit files.
+d6213b6 Add unit files to the spec file.
+be4f878 radosgw systemd prestart moved to systemd
+fbc85e3 radosgw systemd support
+ef961be ceph.spec.in: Change redhat defaults for rgw
+6124019 Added tmpfiles.d for rgw: templated user and group.
+95db160 ceph.spec.in:Update rpm hooks and file content
+b8e28ab rgw: set http status in civetweb
+0451f19 civetweb: update submodule to support setting of http status
+3fbcf5e doc: krbd supports clones since 3.10
+5ea618a include/rados: remove op definitions for LOCK ops
+05f70ab osdc/Objecter: remove unused/obsolete lock() operation
+11631a5 osdc/WritebackHandler: remove obsolete lock operation
+399d888 osd/osd_types: remove unused wrlock_by field
+1fb9bac os/FileStore: getattr() should return 0 if success
+5760655 KeyValueStore: Fix incorrect rm_keys behaviors
+81afcc2 doc: fix the build of placement-groups.rst
+79c7d2e fix: qa/workunits/rados/test_cache_pool.sh
+f8554f9 Remove execute mode of source files
+b255b4c make: fix the build of "rbd"
+692289a reset max if runtime configure change
+79560bb rename filestore_ops/bytes to journal_ops/bytes
+1cc9cc8 use throttle framework to throttle ops/bytes for keyvaluestore
+e4fd086 use throttle framework to throttle ops/bytes for filestore
+8a7a52d rbd:'rbd purge image' will return failure without removing any snaps if the image has a protected snap
+9574555 rgw: skip prefetch first chunk if range get falls to shadow objects
+ad5507f rgw: url encode exposed bucket
+a634ab3 remove libs3 submodule
+bbe8457 remove rest-bench
+a8d33c9 doc: fix command line for swift user creation
+77cdb50 No CRUSH involvement in deciding PG target
+8d4932e osd/OSDMap: handle incrementals that modify+del pool
+af0cade lockdep: allow lockdep to be dynamically enabled/disabled
+2c51aad tests: librbd API test cannot use private md_config_t struct
+4d03c66 librados_test_stub: implement conf get/set API methods
+cb51b17 mon: reject over-large values of max_mds
+258cb34 rbd: add "--keyring" option to help message
+5c395ff doc: add bucket object version description. bucket object version has been supported, but do not have description in the docs, so add this part.
+6ab9efe osd: copy the RecoveryCtx::handle when creating a new RecoveryCtx instance from another one
+1320e29 OSDMonitor::preprocess_get_osdmap: send the last map as well
+f217865 test_librbd_fsx: invalidate before discard in krbd mode
+c4872dd Log::reopen_log_file: take m_flush_mutex
+0559fd3 tools/rados: change the first op id to 0
+ee25b42 tools/rados: change the default max_ops to 16
+b97988d tools/rados: update the help message
+08210d6 common/syncfs: fall back to sync(2) if syncfs(2) not available
+fa78739 rbd: remove dependency on non-ABI controlled CephContext
+9fa0112 crypto: use NSS_InitContext/NSS_ShutdownContex to avoid memory leak
+34cb85e tools/cephfs: respect mds_root_ino_[gid|uid]
+4e42414 mds: configurable uid/gid on new root inos
+d8395cf modified librados-intro.rst
+91ecba1 osd/ReplicatedPG: claim the read buffer instead of copying it
+61643c1 mds: reinstate conditional destruction
+37370c7 mds: fix standby handling in map
+4fd6c3b squashme: move peer failure handling up in handle_mds_map
+1cc3cd5 squashme: initialize max_purge_ops
+d563fff mds: make SnapServer handle old osd maps
+7337db5 mds: handle data pools missing from osdmap
+3348413 mds: make Server::mds private
+ede621d mds: s/MDS/MDSDaemon/ for clarity
+e280f72 mds: remove unneeded MDS.h includes
+e065dd7 mds: make `whoami` and `incarnation` private
+49d0f71 mds: separate most of MDSRank from dispatcher-like parts
+e21c699 mds: pass MonClient ref into snapserver & mdbalancer
+7c0b869 mds: reinstate legacy 'tell'
+a4cfd53 mds: don't construct MDSRank until we have a rank
+7e2e3bb mds: big MDS refactor stage 2: encapsulation
+794d13c mon/MDSMonitor: reject illegal want_states from MDS
+119052f mds/StrayManager: don't acces OSDMap during construction
+1b1b0e0 mds: fix STATE_NULL being equal to STATE_DNE
+7222020 mds: always delete MDS
+07afc2e mds: remove MDSRank::want_state
+2d0758f mds: separate MDSMap handling between MDS and MDSRank
+2490a0a mds: move MDS non-core dispatch into MDSRank
+e54135c mds: de-public-ify more members of MDS::
+557af37 mds: Make all subsystems take an MDSRank instead of an MDS
+0531d0b mds: big refactor of MDS class (stage 1)
+5d70e1c messages: include Message.h in MClientRequestForward
+f60da6b Anotate all the .s files
+17a3e4b rbd: rename --object-extents option to --whole-object
+afbf90d rbd: du command should take spec as an argument
+7d65bd9 rbd: assorted cli arg parsing fixes
+b60e144 rbd: error out if dest_snapname is specified
+bbc5c71 rbd: import doesn't require image-spec arg, ditto for export and path
+76989cc doc: use spec syntax in rbd docs
+9909388 rbd: use image-spec and snap-spec in rbd help
+2fe6bd3 osdc: fix pgls_filter op encoding
+0d2467a Compressor: Remove thread affinity options
+3482e68 AsyncConnection: Exit process loop if entering fault
+554c982 test/perf_local: disable tests on unsupported archs
+8778ab3 Log::reopen_log_file: take m_flush_mutex
+6f54c61 debian: Update maintainers and uploaders
+824c541   common: add nested-name-specifier ThreadPool before WorkQueueVal   Fixes: #12459
+992d959 mds: fix val used in inode->last_journaled
+a140085 osd: Keep a reference count on Connection while calling send_message()
+de8a950 qa/workunits/cephtool/test.sh: escape osd.* and mon.*
+8447b08 WBThrottle::clear_object: signal if we cleared an object
+cd2d7d0 ceph-helpers.sh: don't test the osd_max_backfills value
+bc348f6 Fix "was hidden" compilation warnings
+c835422 rgw: fix radosgw start-up script.
+2513ba7 docs: Document md_config_obs_t.
+af0d1ab docs: Document the ThreadPool and WorkQueue classes.
+799fd9f docs: Document the Finisher class.
+329741e mon: fix checks on mds add_data_pool
+8b1df96 doc/cephfs/quota: simple doc
+c8bdf1b mds: fix crash while stopping rank
+cb38aa1 mmds/MDLog: No make sense set offset for LogEvent in _start_entry.
+518c0b3 rgw: check subuser illegal access parameter.
+7a1aff8 use SEEK_HOLE/SEEK_DATA for sparse copy
+10c0b67 blkdev.cc::get_device_by_uuid: do not leak cache
+4d03030 mon: ceph osd map shows NONE when an osd is missing
+e479037 tests: robust test for the pool create crushmap test
+bef3938 os/FileStore: fix pipe file descriptor leak
+1baeb1c doc/rados/configuration: add more scrub related config
+1eb13a2 xio: reduce quantum numbers to grow pool
+09ab814 xio: reduce the default depth of accelio msg queue
+1d728c0 xio: configurable max send inline
+1c5cef7 xio: safely clean up transport messages with valid connection
+7f15e22 OSD: break connection->session->waiting message->connection cycle
+b3a3e0e doc: v0.80.10 changelog
+5483c14 src/script/ceph-release-notes: add --text for plain text output
+64e5041 auth: check return value of keyring->get_secret
+7b8ca74 doc/releases: v0.80.10
+2a61b61 doc/release-notes: v0.80.10
+c0d2976 cmake: add missing blkid libraries
+7fc13c9 Update OSDMonitor.cc
+787fa80 mon: OSDMonitor: fix hex output on 'osd reweight'
+8f33dc3 AsyncConnection: Fix local message dispatch lack of source
+98d53e8 TestMsgr: Add message source check for local message dispatch
+f9dd1ec   mon: added const to dump_* functions in PGMonitor
+3f04a61 rgw: avoid using slashes for generated secret keys
+613f548 client: fix directory fsync
+b8814f4 test/librados/tier.cc: we can exceed the hitset limit while backfilling
+b62c3b9 rgw: doc: Mark S3 object version API as supported
+f84e6b8   mon/PGMonitor: avoid uint64_t overflow when checking pool 'target/max' status.   Fixes: #12401
+8dc6c50 osd: add some mark_delayed for some delay events.
+3244595 osd/PG: add mark_queued_for_pg for OpRequest.
+f676399 mon: add the cache tier IO rate in 'ceph -s'
+e78e03d mon: show cache tier IO rate in 'osd pool stats'
+017e206 qa/workunits: cephtool: take EOPNOTSUPP as an alias of ENOTSUP
+b62a77a doc: Adding Hewlett-Packard copyright messages to files containing more than minor enhancements and build scripting
+6dc5892 test: Make test_rados_tool.sh part of make check
+90fdbbf common, tools, test: Add "rados purge" feature to remove all objects from a pool
+818de1a test: add test for {get,set}-inc-osdmap commands.
+8eb9a34 tools: ceph-monstore-update-crush: fail early if mon's running
+a881f93 tools: ceph_monstore_tool: describe behavior of rewrite command
+2349eb9 osd/OSDMap: fix a typo in the comment
+fbd4d12 osd/OSDMap: remove unused dump_json()
+50a33de package ceph-monstore-update-crush.sh
+9d8b6d8 test: add a test to exercise ceph-monstore-update-crush.sh
+39e25b9 tools: add ceph-monstore-update-crush.sh
+1bb0cf4 PendingReleaseNotes: add the notes about osdmaptool
+ad6e6a1 tool/osdmaptool: replace --dump-json with --dump
+3063734 tools/ceph-monstore-tools: add rewrite command
+614ac0f Thread.cc: remove malloc/free pair
+d8138c8 mon/Monitor.cc: fix potential null deref
+be7e07a mon/OSDMonitor.cc: fix UNINTENDED_INTEGER_DIVISION
+268923c rgw/rgw_admin.cc: remove no longer valid comment
+12e87f9 os/KeyValueStore.cc: pass const parameter by reference
+56a30cf osdc/Journaler.cc: fix integer overflow
+1fece68 rgw_rados.h: init non-static member of 'stat_params' in ctor
+0a5792c rgw_rados.h: init non-static member of 'struct Params' in ctor
+19c4c2e rbd_replay/Replayer.cc: init non-static members in ctor
+b217d19 osd/RadosModel.h: init non-static member in ctor
+4e0ea75 test_rgw_admin_meta.cc: init non-static members in ctor
+5d81c9b test_cors.cc: init non-static members in ctor
+d46fca5 test_rgw_admin_opstate.cc: init non-static members in ctor
+239d229c rados/rados.cc: init non-static members in default ctor
+4c99000 mds/MDSMap.cc: fix swapped arguments in CompatSet() ctor call
+b936da1 rbd.cc: fix swapped arguments in do_lock_remove() call
+5dc0284 ErasureCodeJerasure.cc: remove redundant checks before erase()
+5b8ae62 messages/MOSDRepOpReply.h: init some vars of MOSDRepOpReply in ctor
+fd82535 messages/MOSDRepOp.h: init some vars of MOSDRepOp in ctor
+ba8988b mds/CDir.cc: init some integer vars of C_IO_Dir_OMAP_Fetched in ctor
+f819cf1 mds/MDCache.h: init fragment_info_t::bits in ctor
+2450db6 osd/ReplicatedPG.h: init flushed_version in FlushOp ctor
+bda4cd2 messages/MExportDirFinish.h: init 'last' in default ctor
+fb9268a messages/MMDSOpenInoReply.h: init 'error' in ctor
+08506f3 mds/MDCache.h: init some members of open_ino_info_t in ctor
+fc5093a mds/MDCache.cc: init 'default_file_layout' in ctor
+00a573e msg/async/AsyncMessenger.cc: init listen_sd in ctor
+77528e7 msg/async/EventSelect.h: init max_fd in ctor
+3a89a33 osd/ECMsgTypes.h: init tid in default ctor
+a0dc63d mon/MonMap.h: fix potential segfault, add assert()
+0292f96 mon/Monitor.cc: fix potential null deref
+a48286e mon/MDSMonitor.cc: fix error handling in 'mds getmap'
+d6396a1 objclass/class_api.cc: fix buffer out-of-bounds access
+d3e1782 rgw/rgw_common.cc: fix char array '\0' termination
+323267c client/Inode.cc: cap_is_valid() fix return value
+30adf5b os/FileJournal.cc: fix do-while loop
+df83eb3 osd/ECBackend.cc: fix MISSING_BREAK
+60d28f6 EventEpoll.h: init 'size' in ctor
+947142f osd/ReplicatedPG.cc: fix error handling for CEPH_OSD_OP_NOTIFY_ACK
+8257208 test/perf_local.cc: init src char array before copy from it
+da21d26 TestRocksdbOptionParse.cc: prefer ++operator for non-primitive iter
+7ea7edf ceph_erasure_code_benchmark.cc: prefer ++operator for non-primitive iter
+7efa3fd os/RocksDBStore.cc: prefer ++operator for non-primitive iter
+af55a90 test/perf_local.cc: fix alloc/dealloc mismatch
+6579fcb rados.cc: fix an issue in the output of the 'rados df' command
+1b2e70f pybind/ceph_argparse: do not choke on non-ascii prefix
+6256c10 rgw: doc: adding S3/Swift API link
+2b9071d erasure code: shec add ceph-erasure-code-corpus
+7801bb2 tools/ceph-objectstore-tool: add get-inc-osdmap command
+95344d0 tools/ceph-objectstore-tool: add set-inc-osdmap command
+0257c15 test: add test for {get,set}-osdmap commands
+4b28bcb doc: add v9.0.2 to the release timeline
+69dad39 doc/release-notes: v9.0.2
+9cfa88d test: test_rados_tool.sh update due to new import/export semantics
+d23cd13 tests: test/cephtool-test-mon.sh uses 7202 7203 and 7204
+f0c130d tools/ceph-objectstore-tool: add "get-osdmap" command
+3e30c17 tools/ceph-objectstore-tool: add "set-osdmap" command
+f2e240a messages: MForward: get() message reference
+7e54360 mon: MonOpRequest: dump function as private
+c45e0cf mon: services: assert on unexpected op request type
+9cee74b mon: Monitor: set op request type during dispatch
+b570155 mon: MonOpRequest: allow setting the op type
+1b39f7d mon: Monitor: drop PaxoServiceMessage reply functions
+dce78ee mon: MDSMonitor: use op-related no_reply()
+4c880a5 mon: OSDMonitor: use op requests when sending out full/incrementals
+7ed9fbf mon: OSDMonitor: move failure code to support op requests
+cb6c913 mon: Monitor: have op-related no_reply()
+7797fca mon: MonOpRequest: send_reply() belongs in the Monitor class
+0633354 mon: Monitor: routed requests handling op requests
+39851df mon: Monitor: forward_request_leader() taking ops instead
+2ebbab9 mon: Monitor: drop reply_command(MMonCommand *m,...)
+b3bce04 mon: use op's get_session() instead of getting from connection
+9ba818f mon: services: use op-based reply_command()
+d3851fb tests: test-erasure-code.sh/rados_osds_out_in must wait_for_clean
+1213dde cls: fix the build on i386
+65b5144 mon: Monitor: reply_command() wrapper for ops
+1564b6c mon: Paxos: mark events
+98e470a mon: Monitor: mark events
+427cef8 mon: PaxosService: mark events
+696c2ec mon: MonOpRequest: mark events
+4e76d59 mon: services: mark events
+2179b4d mon: services: use mon->send_reply(op,...) instead
+677372d mon: PaxosService: use wait_for_.*_ctx() in absence of an op
+d240a76 mon: services: use op-based wait_for_* functions
+b9e6696 mon: PaxosService: have wait_for_* functions requiring an op
+2c83e1e mon: Paxos: have wait_for_* functions requiring ops
+5ca1369 mon: PGMonitor: implement C_MonOp on op-related callback contexts
+903e219 mon: OSDMonitor: implement C_MonOp on op-related callback contexts
+fff540d mon: LogMonitor: implements C_MonOp on op-related callback contexts
+ed7e89a mon: PaxosService: implement C_MonOp on op-related callback contexts
+61f7dca mon: Monitor: implement C_MonOp on op-related callback contexts
+7e1c8c9 mon/mon_types.h: add C_MonOp abstract class
+c80bb61 mon: Monitor: mark events
+91457df mon: MonOpRequest: change service names in mark_*_event()
+176d796 mon: MonOpRequest: add service-specific 'mark event' functions
+e28e5ec mon: MonOpRequest: add mark event functions
+7d90cb1 mon: Monitor: add admin socket command 'ops'
+19dac5f mon: MonOpRequest: add dump function
+df9486b mon: Monitor: have reply functions for op requests
+53bd1ba mon: MonOpRequest: add 'send_reply()' function
+5420fdb mon: optracker (3): remove unecessary message variables
+2526347 mon: optracker (2): remove all unecessary message put()
+c713d9a mon: optracker (1): support MonOpRequestRef
+62dd637 Client: check dir is still complete after dropping locks in _readdir_cache_cb
+f1e86be mon: test the crush ruleset when creating a pool
+203cb6a mon: MonOpRequest: have the monitor dealing with operations
+2731b19 messages: MForward: pack a bufferlist instead of a message
+a44499f erasure-code: set max_size to chunk_count() instead of 20 for shec
+1551ebb mon: PaxosService: call post_refresh() instead of post_paxos_update()
+038452e mon: Monitor: use 'ceph mon metadata' instead of 'ceph mon_metadata'
+f3f8d36 ceph.spec.in: remove SUSE-specific apache2-mod_fcgid dependency
+cbf6c7b ceph.spec.in: drop SUSE-specific %py_requires macro
+955dced mon: Monitor: use 'ceph mon sync force' instead of 'ceph sync force'
+574e596 mon: Monitor: use 'ceph mon scrub' instead of 'ceph scrub'
+16dcc40 mon: Monitor: use 'ceph mon compact' instead of 'ceph compact'
+66e7510 qa/workunits: cephtool: test deprecated commands
+c7f70a7 PendingReleaseNotes: note deprecation of commands
+7e8f721 mon: MonCommands.h: DEPRECATE 'ceph sync force'
+9073ff4 mon: MonCommands.h: DEPRECATE 'ceph compact'
+1814d74 mon: MonCommands.h: DEPRECATE 'ceph scrub'
+d9acd68 mon: Monitor: allow deprecating commands and debugging as obsolete
+16df92f mon: Monitor: add is_noforward() helper to MonCommand
+5b2a4eb mon: Monitor: add support to have OBSOLETE commands
+4c5d5ba mon: MonCommand: don't match help string in is_compat()
+607b0e8 mon: MonCommands: add NOFORWARD to 'ceph sync force'
+0438bdd mon: MonCommand: add FLAG_NONE
+af5efdf mon: MonCommand: have flags as uint32_t instead of enum
+b00ea63 mon: MonCommands: accept FLAG(f) instead of 'f' in command sig
+6ed554c automake: Fix out-of-tree build.
+3e96b9a osd: add statistical data for promotion
+072ccdc osd: add statistical data for flush and evict
+cfc9f49 packaging: package libcls_cephfs.so
+0dae022 cls: fix the build on i386
+852b08f   mon/PGMonitor: use poolname reference instead of get it in osdmap   Signed-off-by: huangjun <hjwsm1989 at gmail.com>
+bc56a87   rest_bench: bucketname is not mandatory as we have a default name
+f3d34d8   rest_bench: drain the work queue to fix a crash   Fixes: #3896   Signed-off-by: huangjun <hjwsm1989 at gmail.com>
+f8bcec2 xio: handling connection error event
+e849cf3 xio: correctly set XioConnection features with fake features for now
+1b3f899 rados: Fix bug in export of xattr which dropped first char of key on import
+f08522c mailmap: Kernel neophyte affiliation
+0310fb3 mailmap: Zhe Zhang affiliation
+3cee02d mailmap: Ketor Meng affiliation
+03756c1 mailmap: Jon Bernard affiliation
+4016058 mailmap: Vasu Kulkarni affiliation
+c3ab4a6 mailmap: Shylesh Kumar affiliation
+cb90808 mailmap: Ismael Serrano affiliation
+371d9ba PG::find_best_info: ignore info.les for incomplete peer
+c74a2f8 packaging: RGW depends on /etc/mime.types
+aa1a522 ceph.in: print more detailed warning for 'ceph <type> tell'
+972dc91 ceph.in: print more detailed error message for 'tell' command
+d0a8fd0 mailmap: Anton Aksola affiliation
+3d86299 mailmap: Alistair Strachan affiliation
+b2502b0 mailmap: Javier Mellid affiliation
+555f485 mailmap: Piotr Dałek affiliation
+78de179 mailmap: Casey Bodley affiliation
+bd5a51a mailmap: Joaquim Rocha affiliation
+712831d mailmap: Joseph McDonald affiliation
+0c44747 mailmap: Varada Kari affiliation
+5449b69 mailmap: Nathan Cutler name normalization
+a02ec61 mailmap: Ilja Slepnev affiliation
+f970ff4 mailmap: François Lafont name normalization
+a063de1 mailmap: Dmitry Yatsushkevich name normalization
+2ff6bcf erasure code: shec performance optimization with SIMD instructions
+6e0498d MonitorDBStore : make monitor transaction more readable on dump
+da96a89 librados: Make librados pool_create respect default_crush_ruleset
+be422c8 9.0.2
+bbf5842 AsyncConnection: Make sign_message ahead of construct message bufferlist
+8bbe98a AsyncConnection: Fix non-fastdispatch message doesn't prepare case
+b7e9fe1 extend clone test to mock clone operation
+8cee732 rgw: doc: Fix radosgw stripe size config ref
+069efc1 tools: src/script/ceph-release-notes normalization
+1231ae0 doc/release-notes: update notes for v0.80.10
+8085d26 common: clean up code for OpTracker::check_ops_in_flight.
+8506822 doc: change tcp rcvbuf and tcp nodelay to ms tcp rcvbuf      and ms tcp nodelay
+b7b1bf2 rgw: add minimum support for copy multipart part
+16ead95 qa: update pool quota test for internal retries
+dbcf2e4 Fixes : #12018
+67de12b Fixes : #12018 osd/OSD.cc : drop write if pool is full
+6849274 osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine if the PG was active
+466b083 osd: Move IsRecoverablePredicate/IsReadablePredicate to osd_types.h
+53072b9 ceph.spec.in: do not run fdupes, even on SLE/openSUSE
+cd6ead3 packaging: add cephfs-data-scan
+1c00b45 tools/cephfs: add cephfs-data-scan
+8eaa2f2 doc: add some docs about cephfs-data-scan
+ae57025 include/ceph_fs.h: define magic LOST+FOUND ino
+ec9b479 cls: add CephFS object class
+e07a0da mds: include damage_flags in inter-mds encoding
+47f6f1e mds: bump required version on fnode_t encoding
+d44c784 mds: add damage_flags_t to inode+frag
+007c39e mds/CInode: move hash_dentry_name up into InodeStore
+56d6be8 tools: fix journal reset error handling
+bb70a33 client: fix typo
+40d0476 mds: correct typo in log message
+79197d3 rgw: If the client sends a Connection: close header respond accordingly.
+ea012fd rgw: Make vstart.sh print out swift user info
+c604dd9 Fixes: #12286 radosgw-admin: after subuser modify print only once user info.
+4aa102f rgw: Make RGW_MAX_PUT_SIZE configurable
+ce02801 tools: Fix rados export to use io_ctx in the right way
+cb03f81 doc/erasure-code: fix couple typos
+127a5f1 tools, test: Some ceph-objectstore-tool error handling fixes
+a3b14ed test: Add debug argument to the ceph-objectstore-tool test
+d846071 tools: Check for valid --op earlier so we can get a better error message
+31fa89f tools, test: Add ceph-objectstore-tool to operate on the meta collection
+6aec38b tools: Fix newlines in output of --op list
+fd1772e tools: Fix dump-super which doesn't require pgid
+5534bc8 tools: Check and specify commands that require the pgid specification
+557c653 osd, tools: Always filter temp objects since not being exported
+4fcf5dd tools: Don't export temporary objects until we have persistent-temp objects
+fb22a9f osd: fix temp clearing in OSD
+1a8e7a7 Document librbd::parent_spec and librbd::parent_info.
+f9378a9 Fix mds dump_ops_in_flight crashing ocassionally
+57fbc23 ReplicatedPG::finish_promote: do not prefill new_clones
+4946d10 OSDMonitor: allow addition of cache pool with non-empty snaps with config
+8a56c48 packaging: add find and which dependencies
+5ce38b9 ceph.spec.in: install 95-ceph-osd.rules, mount.ceph, and mount.fuse.ceph properly on SUSE
+8aa758e ceph.spec.in: use _udevrulesdir to eliminate conditionals
+8f7c163 rgw: fix signed/unsigned compare warning.
+caae6c9 test: fix signed/unsigned compare warning.
+e4634dd ceph.spec.in: snappy-devel for all supported distros
+1abaebd ceph.spec.in: make /var/run/ceph conditional
+d952d59 ceph.spec.in: add missing -%{release}
+d3dbfff Workunits : fs/misc/chmod.sh : Include ACL characters in permission check.
+e6662e5 Workunits : suites/pjd.sh : Do make clean so make can build on current arch.
+624fa43 Fix rest_bench to support https
+43f583d buffer: Fix bufferlist::zero bug with special case
+577acf6 UnittestBuffer: Add bufferlist zero test case
+2674739 Fix mds dump_ops_in_flight crashing ocassionally
+d8a728e rgw: Document the layout of pools and objects
+7b31e11 bufferlist: replace Mutex with RWlock
+7db8a6a Remove git build-time dependency
+cc72dd2 StoreTest: Add zero test for SyntheticTest
+8df81e0 tests: verify erasure code read / write after remapping
+96ec2a7 tests: ceph-helpers.sh get_osds with no trailing whitespace
+d791a72 tests: improve shell output readability
+4c64b01 erasure code: shec's gtest for minimum_to_decode() arguments
+9bdf3e6 erasure code: fix shec's recovery issues found in teuthology test
+1123888 erasure code: move shec.cc into ErasureCodeShec.cc
+aefcf6d tests: ceph-helpers.sh reduce kill_daemon verbosity
+e2454ee AsyncConnection: Only prepare message when it support fast dispatch
+34b939a client: reference counting 'struct Fh'
+998fe78 common/TrackedOp: Make get_duration get correctly value.
+840011b mds: safety around rejoin/resolve_done hooks
+cbc2a0f XIO: Add missing fastpath events to OSD
+b53e3e2 qa: add tests for 'ceph mds metadata' command
+1883e46 mon: MDSMonitor: keep last_metadata in memory
+c5a09a6 mon: MDSMonitor: use pending_mdsmap when selecting items to remove
+60ff337 mon: reset pending_proposal after dumping to log
+32b9d63 generic check return code of get operation
+0f1d7ae Fix indentation
+f02ca61 rgw: conversion tool to fix broken multipart objects
+89c2a0b rbd: recognize queue_depth option
+75a8e23 Add libradosstriper to cmake
+9760c22 erasure-code: do not hide overloaded ErasureCode::parse()
+9bcf5f0 tools: fix race condition in seq/rand bench
+d7585c0 common/TrackedOp: check tracking_enabled for event "initiated/done".
+4d2ca37 mon/LogMonitor: use the configured facility if log to syslog
+edefd05 common/TrackedOp: clean up code make look good.
+64d740f librbd: don't attempt to invalidate an object map in R/O mode
+f8a7b50 librados_test_stub: read op should return number of bytes read
+2ace2b7 tests: fixed TestObjectMap.InvalidateFlagInMemoryOnly
+813897c obj_bencher: remove trailing space
+76ba367 tools/rados: fix the segfault introduced in 0f7aeee
+0e7328c AsyncConnection: Fix incorrect sign message behavior
+4987933 mon/PGMonitor: bug fix pg monitor get crush rule
+d58d7b5 mds: replace MDS::*_done calls with contexts
+c5ec0e1 mds: remove global Filer instance
+4dd1e67 ceph.spec.in: fix _with_systemd conditional
+f11a3a5 ceph-detect-init: do not require argparse on py2.7
+0f7aeee obj_bencher: check run_name and prefix for empty string instead of NULL
+3b5620d common/Thread: added const to 2 functions
+8a25674 rbd: explicitly close images to check for unexpected errors
+273421f pybind: RBD close can now raise an exception on close
+7ef4af5 librbd: closing images now returns a result code
+8cc0cf0 mds: nuke the unused mds_mem_max option
+40399e3 doc: indent warning messages in add-or-rm-osds.rst
+253e256 doc: fix the link in dev/quick_guide
+efccc58 osd/ReplicatedPG: for writefull, offset is zero so replace offset w/ zero.
+89a7b12 osd/Replicated: Using write_update_size_and_usage for  WRITEFULL.
+a1005b1 osd/Replicated: First calc crc then call write_update_size_and_usage.
+df2c984 mon/PGMonitor: Make blocked Op message more readable.
+db1643d osd/ReplicatedPG: For WRITEFULL replica object, only truncate if new size less than old size(only truncate to new size)
+2cb0273 Compressor: Cleanup unnecessary lines
+8e48ba1 Compressor: add decompress failed codes
+8f0919e Compressor: Add compressor infrastructure for ceph
+ce0c8f8 test: ignore symlinked ceph.py file
+8b53568 test: ignore symlinked ceph_disk.py file
+a30aa95 ceph.in: linter cleanup, remove unused imports
+e296793 test: create python test files for ceph cli
+5c3d074 mon: disallow adding a tier on top of another tier
+e819a3c client: return EINVAL if iovcnt < 0 for p{read,write}v()
+19a75f1 tools: fix deprecated warning
+5f8ecf2 crush/CrushTester: fix signed/unsigned warning
+373e065 client: fix signed/unsigned warnings in preadv code
+67fa726 AsyncConnection: Move sign_message to write_message
+504a48a doc : missing link in "quick_guide.rst"
+db16353 mds: change mds_log_max_segments type from int to unsigned.
+8a91daa mds: fix mds crash when mds_max_log_events smaller.
+5614ec6 tests: fix segfault issue in preadv/pwritev tests
+7cf1f37 rgw: api adjustment following a rebase
+47edec3 rgw: orphans, fix check on number of shards
+5528f21 rgw: orphans, change default number of shards
+cac57ca rgw: change error output related to orphans
+d8ce04d rgw: orphan, fix truncated detection
+4980cbd radosgw-admin: simplify orphan command
+66b0090 radosgw-admin: stat orphan objects before reporting leakage
+55d6f5e radosgw-admin: orphans finish command
+f899310 rgw: cannot re-init an orphan scan job
+d7d1171 rgw: stat_async() sets the object locator appropriately
+20bd490 rgw: list_objects() sets namespace appropriately
+fa61ea2 rgw: modify orphan search fingerprints
+1bc63d9 rgw: compare oids and dump leaked objects
+f19b2f0 rgw: keep accurate state for linked objects orphan scan
+75902fd rgw: iterate over linked objects, store them
+7c1aa83 rgw: add rgw_obj::parse_raw_oid()
+05a953d rgw: iterate asynchronously over linked objects
+13adf3c rgw: async object stat functionality
+16a2dbd rgw-admin: build index of bucket indexes
+767fc29 rgw: initial work of orphan detection tool implementation
+8a221c3 doc: homogenize Librados (Python) documentation's code samples
+8103908 rgw: only scan for objects not in namespace
+c418bd9 ceph.spec.in: remove duplicate BuildRequires: sharutils
+39c28b9 ceph.spec.in: fix python-flask dependency for SUSE
+05424a8 logrotate.conf: fixes for systemd
+876fbc6 rgw: Multipart Upload: Support to configure and enforce no of parts allowed
+4524316 Common: Do not use CEPH_PAGE_SIZE when appending buffers in Ceph
+af276de librbd: assertion failure race condition if watch disconnected
+106aeba rgw: enforce Content-Type in Swift responses.
+23171c9 ceph.spec.in: python-argparse only in Python 2.6
+daa679c rgw: error out if frontend did not send all data
+590cdc9 librbd: prevent object map updates from being interrupted
+dd212fd ceph.spec.in: clarify two important comments
+5f47b11 rgw: send Content-Length in response for GET on Swift account.
+39cf071 mds: fix MDLog shutdown process
+f2daa19 mds: drop MDSIOContext on mds->stopping
+ae387b6 mds: refine shutdown, add ::stopping
+8072b19 osdc/Journaler: add a shutdown() method
+fe23c6c common/Thread: print msg on join errors
+0734cd1 packaging: make logrotate a hard dependency across all distros
+95796a3 mds: add a ceph_file_layout wrapper for dencoder
+c097881 test/ceph-dencoder: add boilerplate
+8d9c95f mds: store layout on header object
+c9c655f mds: update CInode::oldest_snap during migration
+f3e4a91 ceph.spec.in: rm reference to EOL Fedoras (< 20)
+2db9480 ceph.spec.in: package rbd-replay-prep on all Fedoras
+43c1784 ceph.spec.in:BuildRequires sharutils
+bdfad0d src/.gitignore: add ceph_perf_msgr_{server,client}
+850879e Fixed inclusion of ceph-helpers file in tests after it was moved to qa/workunits
+8dfcc72 tests: verify that image shrink properly handles flush op
+df539a7 move pgp_num, pg_num check to prepare_new_pool method
+0f0c6f1 xio: fix to work with commit 626360aa
+47a8447 xio: fix to work with the merge of pull request #4707
+ec2afbb xio: sync to work with accellio v1.4
+726d699 librbd: invalidate cache outside cache callback context
+0215e97 tests: add new unit tests for object map invalidation
+eb81a6a librbd: only update image flags when holding exclusive lock
+053ab4c release-notes.txt: fix version number in section heading
+d5ff8a5 doc/release-notes.rst: add missing word to complete sentence
+9624cf2 tests : Fixed radosstriper tests to introduce the needed calls to setup and teardown
+e4c27d8 tests : Fixed broken Makefiles after integration of lttng into rados.
+2409a90 tests: fixed environment for osd-class-dir
+49ab5bb rados: Added --striper option to the command line
+33a8415 rados: For export/import add correctly fadvise flags.
+a72f6c9 librbd: For rbd copy, add correctly fadvise flags.
+868f6b7 rados: For rados export, set all_namespaces when list all objects.
+6613b4b librados: Add new filed src_fadvise_flags in ObjectWriteOperation::copy_from.
+64fceed rgw: fix empty json response when getting user quota
+d261641 tests: reduce duplicate librbd unit test case coverage
+7354d25 lockdep: do not automatically collect all backtraces
+3deb7b6 Objecter.cc: fix name of _take_op_budget() function in comment
+e2cae05 tests: move librbd valgrind test to teuthology
+da259e4 rgw: force content_type for swift bucket stats request
+adb05c9 mds/Migrator: don't use c++11 kw 'final' as var name
+439faeb mds: tweak dirfrag asok messages
+53d6c1f AsyncConnection: Fix msgr send bytes perf counter statistic
+c71a2f5 AsyncConnection: set out seq for message when sending ready
+f335648 test_msgr: Random send different priority message to test sequence
+f028374 osd: remove duplicate checks
+0b2bab4 ceph_osd: Add required feature bits related to this branch to osd_required mask
+949b148 osd: CEPH_FEATURE_INDEP_PG_MAP feature now required
+c982e55 osd: CEPH_FEATURE_OSD_PACKED_RECOVERY feature now required
+77b09ed osd: CEPH_FEATURE_RECOVERY_RESERVATION feature now required
+5405e46 osd: CEPH_FEATURE_BACKFILL_RESERVATION feature now required
+7a10a7e osd: CEPH_FEATURE_CHUNKY_SCRUB feature now required
+626360a msg, ceph_osd: Support feature bits for all message type's local connection
+de04124 osd: Add tracking of acting_features and upacting_features
+6d28cb9 osd: Fix peer_features to include self
+a1e3108 doc: Fixed the picture on page http://ceph.com/docs/master/cephfs/
+f1ff3e4 ceph.spec.in: fix:Add missing directories breaking build
+449948e librbd: If objectmap tell object exist, don't send alloc_hint w/ write command.
+5cf6611 osd: use explicit ghobject_t ctor for lost revert/rollback cases
+46d8d07 common/hobject_t: make ghobjct_t(hobject_t) ctor explicit
+5fdbbe7 ceph-osdomap-tool: use explicit ghobject_t ctor
+1d40bff test: misc objectstore tests: use explicit ghobject_t ctor
+23dcee1 ceph_test_objectstore: use explicit ghobject_t ctor
+48995d4 ceph_test_filestore_idempotent: use explicit ghobject_t ctor
+b34c625 ceph_test_filestore_idempotent_sequence: use explicit ghobject_t ctor
+2fa57b7 ceph_smalliobenchfs: use explicit ghobject_t ctor
+befadd8 osd: use explicit ghobject_t ctor for asock truncate
+924ed4c osd/ReplicatedPG: use explicit ghobject_t ctor for reading hitsets
+d844060 osd/ReplicatedPG: use ghobject_t ctor for fiemap
+4910ef8 osd: use ghobject_t ctor for disk bw test
+d6418ae osd: define OSD metadata objects as ghobject_t, not hobject_t
+4ad318b osd/ReplicatedBackend: use explicit ghobject_t(hobject_t) ctor
+4520b3d osd/SnapMapper: use ghobject_t instead of hobject_t
+76aef7e osd: use explicit ghobject_t() ctor for omap operations
+93aac48 os/{Flat,LFN}Index: use explicit ghobject_t ctor
+6033591 os/FileStore: munge cid -> temp cid on read ops too
+b4ce84b osd: clear temp objects in the OSD, not FileStore
+cc4c608 osd/PGBackend: set correct shard in objects_list_partial
+5a7a52e osd: kill META_COLL constant; use named ctor
+aef3758 os/FileStore: clean up temp collections on startup too
+3414bce os/FileStore: debug omap_get_values returns
+a7ea1c7 os/FileStore: debug error return for get_omap_iterator
+4d6ee79 os/ObjectStore: kill hobject_t convenience wrappers
+9a965d6 osd/PGBackend: ignore temp objects in objects_list_*
+1779bad os/FileStore: init/clear temp collections *after* replay
+df1a215 osd: drop unused args, completion for advance_map
+75e28be osd/PGBackend: specify correct shard for collection_list_range
+e2f8a37 os/FileStore: do not mange temp collection
+27e4187 osd: add coll_t::c_str() back
+c8c9ee0 ceph-object-corpus: drop coll_t 'foo' and 'bar'
+5476f99 osd: cache coll_t string representation in memory
+63521ff osd: drop coll_t cruft
+3da52f2 unittest_osd_types: improve coll_t unit tests
+91d331c osd: make coll_t structured instead of a string
+0efa6fa os/MemStore: fix warnings
+e610fa6 osd: simplify load_pgs
+f9cc138 osd: change coll_t::is_pg() and is_temp() to take a pointer; drop snap
+dfef0cf osd: drop old snap_colls arg from PG::upgrade()
+bb88b2b osd: simplify coll_t::is_removal(); fix removal (pass pgid)
+3c585cd osd/PG: drop pre-snapmapper upgrade
+e15908d osd/osd_types: drop coll_t string ctor
+f5dff8a os/KeyValueStore: change collection master list strategy
+50618cf os/KeyValueStore: drop collection attr support
+de26269 ceph_smalliobenchfs: use valid collection names
+acbbd50 os/FileStore: only recognized valid collection names
+d7a5c68 osd/osd_types: add coll_t::parse() method
+da23c97 ceph_test_objectstore: adjust tests to new sorting regime
+27d935e ceph_test_objectstore: be better about collections
+a009bfe os/KeyValueStore: change naming scheme to work with new ghobject_t sorting
+93f182f os/FileStore: fix remove_collection return value
+91ec201 os/FileStore: better debug output for destroy_collection
+e6f196d os/FileStore: force temp objects into _TEMP temp collection
+f8436f1 shard_id_t: change NO_SHARD to sort before 0 (min instead of max)
+7a42a12 ghobject_t: is_min()
+18f08b8 ghobject_t: make operator<< put shard at front
+85d21a0 osd: some simple osd type sanity tests
+771d0b3 osd: add coll_t is_temp() and is_meta()
+2b9553e osd: use explicit coll_t::make_string_coll() ctor
+e3edc5f osd: use non-string coll_t's
+87a9b0a osd: use coll_t() for meta collection
+b1ad38d osd: add coll_t::get_temp() helper
+eb71bdf os/FileStore: remove unused legacy split/create collection methods
+ac6feac ghobject_t: MIN for default
+426d124 hobject_t: MIN for default
+f8a9f7a ceph_objectstore_test: a few simple collection_list_partial tests
+eba00fa ghobject_t: fix max to make embedded hobject_t also max
+45281ef hobject_t: fix get_boundary to work with new sorting regime
+8800224 ghobject_t: change sort order (max, shard, hobj, gen)
+27bfa13 .gitignore: ignore store_test_temp_dir
+b34b22e hobject_t: adjust comparator to check most-likely-different fields first
+ecdc8f6 hobject_t: modify operator<<
+afae1c7 hobject_t: adjust sort order (pool position)
+ff99af3 hobject_t: change default pool id to INT64_MIN
+959a7ce os/HashIndex: use ghobject_t::get_max() instead of hobject_t one
+3b1b5a9 osd: eliminate temp collections
+a88c3cc osd/osd_types: remove ancient CEPH_*_NS cruft
+18eb2a5 osd: use per-pool temp poolid for temp objects
+406c8c3 ceph_test_rados: add --balance-reads option
+82fdacd osd/ReplicatedPG: avoid spurious omap_setkeys, setattrs during recovery
+75e9fbb os/ObjectStore: deprecated collection_move
+5a5a4c9 osd: use a temporary object for recovery
+85517d6 ceph.spec.in: rbd-replay-prep is not being built on f21+
+ce80a60 configure.ac: Fix junit4.jar detection
+15e9e3d ceph.spec.in: Require git distro-wide
+b4e83f4 doc: Document include/str_list.h Fixes: #12050. Signed-off-by: Claire MASSOT <claire.massot93 at gmail.com> Signed-off-by: Jordan DORNE <jordan.dorne at gmail.com> Signed-off-by: Kévin CARADANT <kevin.caradant at gmail.com> Signed-off-by: Gabriel SENTUCQ <perso at kazhord.fr> Signed-off-by: Maxime ROBERT <maxime.robert1992 at gmail.com>
+4731c1e qa: Use public qemu repo
+350139b doc: Replace requries with requires in python API doc Fixes: #12076 Signed-off-by: Jordan DORNE <jordan.dorne at gmail.com>
+0c4555d doc: Adding a link to documentation on admin socket
+dfda3ff Bug fix to ceph systemV compatability script.
+f105fdc doc: Fixes a missing dot
+7fd1ae8 doc: dev/differences-from-posix Fixes: #11108 Signed-off-by: Claire MASSOT <claire.massot93 at gmail.com>
+b10adf6 doc: Unify ID format
+145fbb7 doc: Fixes a missing word in a printed string
+3cb8c4e doc: Replaced "disk or drive" by "disk" in ceph-disk.rs Fixes: #12062 Signed-off-by: Gabriel SENTUCQ <perso at kazhord.fr>
+7065203 doc: Remove `` before and after 'ceph -s' in titles Fixes: #12059 Signed-off-by: Arthur Gorjux <arthurgorjux at gmail.com>
+cac48bd doc : Change of "ceph-deploy mon create {ceph-node}" with "ceph-deploy mon add {ceph-node}" in "Quick-ceph-deploy.rst"
+33f4b9a doc: Modification of a sentence.
+bfa0c4a Fixes to rcceph script
+451cee4 osdc: refactor use of _is_readable
+9e09e54 osdc: handle corruption in journal
+ae0a28e mds: additional error error handling in CDir
+16f7add mds: refactor BADFRAG setting
+deec710 mds: refactor CDir::_omap_fetched
+47c5435 mds: don't recreate damaged stray dirfrags
+136ebf0 mds: add MDS::damaged_unlocked helper
+1165373 mds: fix MDLog error handling on ENOENT
+e964207 mds: fix a damaged() case
+9677047 mds: fix corrupt CInode handling
+81b51ba mds: fix DAMAGED beacons from rejoin
+b37dd43 mds: handle missing stray dirfrags
+4d65940 include/buffer: fix constness of operator<< for error
+adfa2e0 librbd: flush operations need to acquire owner lock
+d3bd27f rgw: fix reset_loc()
+9298f93 mon/OSDMonitor: fix get_bucket_utilization return value
+e41d97c rgw: fix assignment of copy obj attributes
+4030774 mon: only send MMonMetadata to peer mons that support it
+54a516f configure.ac: Fix JUnit 4 detection on Fedora 22.
+7fbac49 doc: Change the type of list in doc Fixes: #12061. Signed-off-by: Maxime ROBERT <maxime.robert1992 at gmail.com>
+05c56b7 doc: ceph-deploy man page: typo #12063 Replace is initial monitor hostname with is the initial monitor hostname Fixes : #12063 Signed-off-by: CARADANT Kevin <kevin.caradant at gmail.com>
+ca285d4 Fix typos in librados example code
+b5d63eb doc: replace a sentence by another Fixes: #12060 Signed-off-by: Yannick Atchy-Dalama <yannick.atchy.dalama at gmail.com>
+9cdd810 doc: Purpose of a cache pool
+18a9a22 rados/tool: handle --snapid correctly
+e60c450 doc: Wrong restriction for a daemon's ID
+350f43e librbd: Add option to allow disabling issuing alloc hint
+8feb27d common/RWLock: allow disabling read/write lock counts
+5e756ed ceph_spec buildep python sphinx for fedora
+26eba36 Bug fix for man file packaging.
+8e56a5b [rbd] support G/T units in rbd create/resize
+81eee9b doc: explain about pgp num
+18ba022 Bug fix for man file packaging.
+1ea3f47 qa: update to newer Linux tarball
+84e2f8e doc: sync ceph-deploy's man page with the code
+3a55cb0 tests: display the output of failed make check runs
+d38cd63 doc: update openstack and rgw keystone
+905c31e qa: use "sudo rmdir" to remove test data of multiple_rsync.sh
+a9475e1 mon/OSDMonitor : error out if pgp_num > pg_num
+ce86b0a doc/messenger: Add doc for ceph_perf_server/client
+2e7c8e7 libcephfs: add ceph_p{read,write}v
+69316a9 crush: add crush_compat.h
+1db1abc crush: eliminate ad hoc diff between kernel and userspace
+fb71bd9 mds: add dirfrag split/merge asok commands
+ecee227 crush: shared files should use kernel-doc
+9fde186 crush: fix style issues in shared files
+9a8ed8b crush: move safe arithmetic functions to buider.c
+efd46d1 libcephfs: add test for ceph_p{read,write}v
+6ef40d7 cmake: add missing RadosDump.cc, RadosImport.cc and PoolDump.cc
+f6bf6c2 libcephfs: add ceph_p{read,write}v
+51862e3 bug fix: librados segmentation fault, when two read ops share one AioCompletionImpl
+05e95f7  objectstore/store_test.cc : add tests for DBObjectMap::get_keys
+ce5ba43 os/DBObjectMap : fix recurcive lock in get_keys
+440ef3a os/DBOjectMap : iter->valid is always false if we do not initialize the iterator with init(), seek_to_first will initialize this iterator
+ade36cf in _zero() function, we should remove strips to minimize keyvalue DB update
+5436c29 mon: add an "osd crush tree" command
+5187520 osd_types.cc: drop commented-out asserts in add_next_event()
+01b2e1f osd_types.cc: replace count with iterator in add_next_event function
+724c139 osd_types.{cc,h}: fix three typos in comments
+f620a8a Remove rados_sync.cc, rados_import.cc and rados_export.cc deleted files
+55d9747 mon: add 'PGs' column to 'ceph osd df'
+04e91bb rpm: add missing Java conditionals
+45392a2 mailmap: Yuri Weinstein affiliation
+5b29a57 doc: add v0.94.2 to the release timeline
+ddc4d52 test, tools: Improve ceph-objectstore-tool import error handling and add tests
+7618d86 PerfMsgr: Add usage helper info
+d6f6ad0 PerfMsgr: Make Server worker threads configurable
+eba4eb2 PerfMsgr: Add tips for running ceph_perf_msgr
+97ff79d debian, rpm: Add ceph_perf_msgr_* to build file
+372eddf PerfMsgr: Add messenger perf tools
+832f33a qa: use "sudo cp" in multiple_rsync.sh
+290204d common: make safe_splice return if return value is EAGAIN.
+d91808b FileStore: For clone using splice to reduce memcopy.
+7509a6c common/TrackedOp: checking in flight ops fix
+ec3c409 common/OpTracker: don't dump ops if tracking is not enabled
+5c7bacd os/RocksDBStore:Drop buffer_list and key_list in transaction.
+d9616d6 common/TrackedOp: break out of loop when reaching log threshold
+929e593 osd/ReplicatedPG: snapset is not persisted
+1e4dc8d FileStore: Call _do_copy_range in _do_spare_copy_range.
+93d920e doc: update rgw configuration doc on bucket index sharding feature
+2fb380e ECBackend:Only there are push, it queue transaction.
+f23ff73 ceph-osdomap-tool: Fix argument handling
+b4ad260 mon/PGMap: fix pgmap stat adjustment during map_pg_creates()
+c0c6efb test: Add --dry-run and --no-overwrite testing of rados import
+1481950 tools/rados: dry-run/no-overwrite args for import
+d1d6196 tools: better error message in objectstoretool
+83a4220 test: Switch ceph-objectstore-tool import-rados to rados import
+460200c tools: remove objectstoretool's rados-import
+2883314 tools: remove old rados_sync code
+391c72f tools: implement rados import/export
+a45f8fe tools: refactor RadosImport to pass ioctx
+2991bee tools: remove RadosDump::debug in favour of dout
+f477b44 tools: add support for pool exports in RadosImport
+8e5594d tools: clean up errors in ceph-objectstore-tool
+d26086d tools: refactor objectstore tool
+a5d9b49 FileStore and blkdev: Collect device partition information
+488c042 doc: add v9.0.1 to the release timeline
+306345b doc/release-notes: v0.94.2
+f5da2fc doc/release-notes: v9.0.1
+1e77fcf tests: ceph-disk tests need to install pip > 6.1
+2d76e2c tests: erasure-code non regression tests must skip isa on i386
+e68ea2c osdc: Make librbd cache handle NOCACHE fadvise flag.
+ac1e729 rgw: fix data corruption when race condition
+42a3ab9 os/LevelDBStore:Drop buffer_list and key_list in transaction.
+1aa9655 tools: For ec pools list objects in all shards if the pgid doesn't specify
+98e77d5 mon/PGMap: access num pgs by osd
+fa04833 mon/PGMap: fix pg_by_osd
+abe4ec2 rgw: remove trailing :port from host for purposes of subdomain matching
+a3f9cba config_opts: turn down default recovery tunables
+e401115 tests: ceph_erasure_code_non_regression s/stipe/stripe/
+9193afd tests: automake 1.11 needs parallel-tests
+83f8198 Removed unnecessary inclusion of iostream
+57eddf0 osd: using is_omap instead of test_flag(pg_pool_t::FLAG_OMAP)
+1ffced3 osd: add bool supports_omap() in pg_pool_t.
+75465eb osd/ReplicatedPG: Don't create objectcontext when remove tmp-obj for promote failed if there was has tmp-obj.
+cbf1d34 osd/ReplicatedPG: For prmote object, if met error, delete the tmp obj.
+e6334bd osd/ReplicatedPG: Only promote object success, it can requeue proxy-read ops.
+437c9e1 doc: architecture minor fixes in watch notify
+21f9e1f ceph.spec.in: remove duplicate BuildRequires
+b711e31 Transaction Id added in response     Signed-off-by: Abhishek Dixit dixitabhi at gmail.com
+26be86f crush/CrushWrapper: fix adjust_subtree_weight debug
+551ab2d crush/CrushWrapper: return changed from adjust_subtree_weight
+1da96ab crush/CrushWrapper: adjust subtree base in adjust_subtree_weight
+b461dc2 unittest_crush_wrapper: test adjust_subtree_weight
+3e12863 tests: ceph-disk.sh test zap gitbuilder false negative
+4d58c07 tests: skip isa tests if the plugin is not available
+90baef8 tests: use erasure_code_plugin_exists from ceph-helpers.sh
+deb651b tests: implement erasure_code_plugin_exists in ceph-helpers.sh
+fda586f tests: sync ceph-erasure-code-corpus to verify jerasure variants
+864e200 tests: add --{show_,}path to ceph_erasure_code_non_regression
+3ec4103 erasure-code: add sanity check to guard against k=1
+85b327f SubmittingPatches: clarify how Reviewed-by lines are added
+e1cb7e5 ceph.spec.in: move specific BuildRequires to where they belong
+739ef0f install-deps.sh: detect yum-builddep errors
+34e4dd9 ceph.spec.in: add missing BuildRequires from SUSE block
+37f7360 erasure-code: implement ErasureCode::sanity_check_k
+a8351eb unittest_crush_wrapper: attach buckets to root in adjust_item_weight test
+14c60b2 unittest_crush_wrapper: parse env
+997b3f9 9.0.1
+d9dd5c5 librbd: don't cancel request lock early
+f97ce46 tests: new test for transitioning exclusive lock
+d2a1c22 tests: verify that librbd will periodically resend lock request
+879b8a7 common: Mutex shouldn't register w/ lockdep if disabled
+b951a73 librbd: improve debugging output for ImageWatcher
+3e1e561 librados_test_stub: watcher id should be the instance id (gid)
+37c74e6 librbd: retry lock requests periodically until acquired
+2b6d063 librbd: don't hold owner_lock for write during flush
+1e0f128 librbd: CoR should copyup empty object to prevent future CoR attempts
+0766734 librbd: don't attempt to (re-)update the object map for CoW
+9055eb6 librbd: hide ENOENT errors during copyup operations
+43e0e3c librbd: avoid infinite loop if copyup fails
+cd0fd83 librbd: whole-object discards should copyup when snapshots exist
+d345061 tests: verify copyup behavior during object discard
+f8c831e librbd: trim operation should issue object copyups for overlap extent
+64f8418 librbd: new AioTrim operation to force copyup on object removal
+325a066 librbd: default to data_ctx for post-copyup operations
+1525ecb librados_test_stub: truncate should honor op snapshot context
+62f1306 librbd: always use current parent overlap
+7734388 tests: verify that copyup properly handle image shrinking
+da5b565 librbd: add new copyup helper method
+336c3bf librbd: include actual write op type in AbstractWrite debug output
+712b2da librbd: removed unused ImageCtx::parent_io_len method
+63aa2e6 build: Add info about tests to --with-debug option
+7bef2d8 tests: fix test_activate_dmcrypt uuid usage
+3d5cef3 librbd: flush pending ops while not holding lock
+742a85d tests: fix possible deadlock in librbd ImageWatcher tests
+45cb9cb tests: enable lockdep for librbd unit tests
+c9142fe librbd: owner_lock should be held during flush request
+a38f9e5 osdc: ObjectCacher flusher might needs additional locks
+1b57cc1 librbd: fix recursive locking issues
+d6b733d librbd: simplify state machine handling of exclusive lock
+c352bcd librbd: AsyncObjectThrottle should always hold owner_lock
+5f157f2 librbd: execute flush completion outside of cache_lock
+218bc2d librbd: add AsyncRequest task enqueue helper method
+66e7464 librbd: disable lockdep on AioCompletion
+3ad19ae librbd: AioCompletion shouldn't hold its lock during callback
+0024677 librbd: complete cache read in a new thread context
+c474ee4 librbd: give locks unique names to prevent false lockdep failures
+c1e1445 log: fix helgrind warnings regarding possible data race
+b65ae4b librados_test_stub: fix helgrind warnings
+6e400b9 librados_test_stub: add support for flushing watches
+7c7df2c common: lockdep now support unregistering once destructed
+650ad32 common: add valgrind.h convenience wrapper
+21f990e librbd: add work queue for op completions
+e5ffae5 WorkQueue: ContextWQ can now accept a return code
+eb6a1df pybind/rbd.py: update with new features constants
+8baaf51 librbd: describe the purpose of features constants
+fbe328e librbd: add a constant for single-client-only features
+61bad55 vstart.sh: set rgw dns name = localhost
+2eb096a FileJournal: Remove CEPH_PAGE_SIZE assumptions
+6bd425b test: fix the plugin_exists()
+75e87a2 ceph.spec.in: remove extra %else statement
+17a74ed install-deps.sh: s/get_pip_and_wheel/populate_wheelhouse/
+b23fd40 install-deps.sh: create virtualenv only if necessary
+f81f651 install-deps.sh: do not create "wheelhouse" unless it's ready
+35c5fd0 common/Cycles.cc: skip initialization if rdtsc is not implemented
+0d84525 install-deps.sh: refactor pip install a little bit
+f04c882 man/create-create-keys: minor fixes
+0eabf27 Revert "ceph.spec.in:ownership of dirs extension"
+f94f232 ceph.spec.in: rm stray %endif
+91dbc78 vstart.sh: sudo radosgw for low port numbers
+f2a4c2a vstart.sh: show s3 creds at end
+fbb9279 README: simplify build deps section
+eaff6cb upstart: limit respawn to 3 in 30 mins (instead of 5 in 30s)
+8af25fa rgw: Do not enclose the Bucket header in quotes
+601c2f7 test: update argparse tests for "mds rm"
+db2d83c install-deps.sh: clean up after ourselves
+c6c3a3b Add rpm conditionals : libs_compat
+d8abde3 Add rpm conditionals : tcmalloc
+5747ea1 Add rpm conditionals : tests
+c65538f Add rpm conditionals : cephfs_java
+232ec88 pybind: avoid spurious "too many values to unpack"
+e8e8b12 mon: Remove spurious 'who' arg from mds rm
+539c1ba admin/build-doc: fix dependency checks
+e3ddcb8 tools: chunk reads in Dumper
+7aba947 Doc: add write back throttling stuff in document and test scripts
+fa000d0 Osd: implement low speed flush
+f00ecb8 mon/MDSMonitor: rename labels to a better name
+72a37b3 mon: send no_reply() to peon to drop ignored mdsbeacon
+fdcc007 arm: fix build on older kernels
+e9e6865 init-ceph, ceph-osd-prestart: set osd weight with more precision
+79961bc osd: Even in objects_read_sync() case don't read past oi.size
+0de68eb qa: fix multiple_rsync.sh to avoid using /usr/ directly
+4cf4148 tests: verify librbd blocking aio code path
+008a9c4 librbd: add rbd_non_blocking_aio option to image metadata
+de14093 doc: s/osd pool hot-storage/osd pool set hot-storage/
+16e8e2c mon: remove unnecessary error handling
+64ec79f Put new variable initialization in its own block
+769cad1 librbd: new rbd_non_blocking_aio config option
+8f6056a Osd: revise agent_choose_mode() to track the flush mode
+1fb26e3 Mon: add cache_target_dirty_high_ratio related configuration and commands
+4c5f755 Osd: add new field in pg_pool_t
+d7858c6 test/ceph-objectstore-tool: Don't need stderr noise
+23551de test/ceph-objectstore-tool: Show command that should have failed
+ce76f88 test/ceph_objectstore_tool: Improve dump-journal testing
+90202a7 ceph-objectstore-tool: Allow --pgid specified on import (for post split)
+106a1c3 Makefile: install ceph-post-file keys with mode 600
+ee170ea ceph-post-file: improve check for a source install
+b84031e ceph-post-file: behave when sftp doesn't take -i
+51e2798 ceph-objectstore-tool: Invalidate pg stats when objects were skipped during pg import
+7031654 ceph-objectstore-tool, osd: Fix import handling
+99d7663 ceph-objectstore-tool: Add dump-super to show OSDSuperblock in format specified
+f152006 mds, include: Fix dump() numeric char array to include additional alpha chars
+bd2f624 ceph-objectstore-tool: Add dump-journal as not requiring --pgid in usage
+e2e1f35 erasure-code: verify the profile content matches get_profile()
+8e61a99 mon: normalize erasure-code profile for storage and comparison
+77e5330 erasure-code: implement get_profile for every plugins
+ab30921 make_dist.sh: bz2 only
+44f8bbb make_dist.sh: improve version calc
+0f99796 osd/: convert scrub to use the OpWQ
+0f2c556 erasure-code: expect all plugins to implement get_profile
+042e8ff ceph.spec.in summary-ended-with-dot
+f9b11b0 qa: update cephtool test for CephFS tier cases
+11e5faf mon: forbid readonly cache tiers for CephFS
+a50c8f1 mon: refine check_remove_tier checks
+0742d82 ceph.spec.in libcephfs_jni1 has no %post and %postun
+0be8c67 AsyncConnection: Remove unusable status check
+06e6c15 test/perf_local: disable tests if not supported
+63343ce .gitignore: ignore /perf_perf_local
+168354c test_msgr: make available_conns only can own unique (server,client) messenger pair
+55e7cb9 Message: Make encode_payload can be reentrant
+17b2c5b mds: use helper functions to access/modify SnapRealm::open
+c3fa768 mds: remove pruned parent snaprealm from snaprealm's open_past_parents
+55063a9 mds: fix CInode::pop_projected_snaprealm()
+6ed6a68 mds: properly open parents of snaprealm
+cc1ff9b test/librbd/fsx.cc: fix int/unsigned long conversion
+05b050f mds/CInode.cc:  fix FORWARD_NULL issue
+0ae8fca os/HashIndex.cc: add asserts to prevent BAD_SHIFT
+f5ae56b osd/ClassHandler.cc: fix STRING_OVERFLOW
+0a45a7a test/librbd/fsx.cc: fix BUFFER_SIZE_WARNING
+bbeb37f tools/rados/rados.cc: restore ostream precision format
+6ffec51 tools/rados/rados.cc: restore ostream precision format
+3b720f5 tools/rados/rados.cc: restore ostream format
+71df303 mon/OSDMonitor.cc: suppress false positive MISSING_BREAK
+42f93a6 os/FileStore.cc: remove dead code
+258b7b9 rgw/rgw_rados.cc: remove dead code
+af14437 test/librbd/fsx.cc: reduce scope of variables
+83cb609 mds/MDS.cc: silence coverity DIV_BY_ZERO
+958289b test/librbd/fsx.cc: fix format specifier
+b705949 test_internal.cc: prefer ++operator for non-primitive iter
+e60c9c2 rgw/rgw_op.cc: prefer empty() over 'size() == 0' for emptiness check
+d312847 librbd/internal.cc: close resource leak
+7e2a7a8 tests: cephtool/test.sh wait_for_clean after PG (re)mapping
+f6e4000 tests: reduce pg_num in test_mon_osd_pool_set
+d2cc2b1 tests: use get_pg from ceph-helpers.sh
+7d6bef7 COPYING: note git-archive-all.sh license
+31e9390 make_dist.sh: rename from bin/make_dist_tarball.sh
+f3fe18a make_dist_tarball.sh: make gz and bz2 archives
+8e1bcf2 ceph-disk: always check zap is applied on a full device
+c6e6348 mon: add "--check" to CrushTester::test_with_crushtool()
+9381d53 crushtool: rename "--check-names" to "--check"
+22e6bd6 mon: check the new crush map against osdmap.max_osd
+d0658dd crushtool: enable check against max_id
+e640d89 crush/CrushTester: check if any item id is too large
+a955f36 mon: validate new crush for unknown names
+d6b46d4 crushtool: add the "--check-names" option
+b75384d crush/CrushTester: add check_name_maps() method
+f5a9580 unittest_ceph_argparse: test float parsing
+4487d13 Allow float value by allowing . in input value
+64944d4 tests: ceph-helpers.sh use expr instead of (( ))
+d2172b0 tests: move ceph-helpers.sh to qa/workunits
+b8954e6 AsyncConnection: Make header insert when sending
+a199ff9 erasure-code: ErasureCodeShec update default values to string
+395eba9 erasure-code: ErasureCodeLrc update default values to string
+17ecd6f erasure-code: return ErasureCodeLrc::parse_kml error
+14a59e5 erasure-code: ErasureCodeJerasure update default values to string
+6eaeb56 erasure-code: ErasureCodeIsa update default values to string
+4b3ba2a erasure-code: ErasureCode default value is a string
+55b9e4e rgw: Drop a redundant context_io.init
+008d9cf AsyncConnection: make can_write enum type
+80c3150 AsyncConnection: Avoid "lock" acquire in message normal send flow
+947c39d AsyncConnection: Allow msg encode without write_lock holding
+19f681f AsyncMessenger: Add perf counter for each async worker
+27d24a8 AsyncConnection: Use reference instead of value passing for try_send
+aa7674b Event: assert failure when failed to add/del event
+43ef09d AsyncConnection: Avoid event creation leak when stopping
+a14b0e1 Event: Avoid potential wakeup leak between set and read
+84b9088 AsyncConnection: Avoid encoding message with lock holding
+6fcf162 PerfLocal: Doesnt' compile div64 for non-64 bits arch
+ceee695 ceph-erasure-code-corpus: isa: add k=10 m=4
+8199e00 install-deps.sh: do not store pip cache outside of the tree
+e8cc7a8 bin/make_dist_tarball.sh
+8bd27b6 bin/git-all-archive.sh: add --ignore option
+94b6661 bin/git-archive-all.sh
+bbf75f8 install-deps.sh: keep debian alternatives
+cbc96a0 install-deps.sh: robust pip and wheel installation
+be873eb ReplicatedPG::release_op_ctx_locks: requeue in scrub queue if blocked
+5c2b795 ReplicatedPG::finish_ctx: take excl lock if operation is rw
+3e4b852 osd/: convert snap trimming to use OpWQ
+e8cddf8 OSD: add PGQueueable
+517659b FileStore: sync object_map when we syncfs
+8a6d626   osdc/Journaler.h: fix ceph_file_layout dump error in journaler::dump().   Signed-off-by: huangjun <hjwsm1989 at gmail.com>
+b3555e9 mon: always reply mdsbeacon
+6f49597 ceph.spec.in:ownership of dirs extension
+deb8ea4 osd: check pending or active scrub before sched_scrub
+b20ea43 client: start flushing dirty caps in Client::_fsync()
+6883b82 client: make fsync wait for unsafe directory operations
+ce27ae4 client: make fsync waits for single inode's flushing caps
+6bb9e15 client: don't update flushing_cap_seq when there are flushing caps
+2517ea9 mds: don't add setfilelock requests to session's completed_requests
+ae08638 client: exclude setfilelock requests when calculating oldest tid
+83f88e7 commit: test: check daemon is alive or not agagin when sleep 60s
+ddcbb66 tests: add lrc tests to osd-scrub-repair.sh
+77f322b tests: split osd-scrub-repair.sh erasure code test
+70e069d tests: cosmetic move of functions in osd-scrub-repair.sh
+28820d1 erasure-code: s/unreocvery/unfound/
+8b78371 doc: release-notes.rst: fix typo and delete superfluous sentence
+6f11fbf debian: ceph-dbg steals ceph-objectstore-tool from ceph-test-dbg
+735abea Osd: classify flush mode into low speed and high speed modes
+926f2df rgw: update release notes with regard to issue #11442 recovery
+0b5944a PerfLocal: fix i386 compatible with perf_local
+a319976 Forward port suse build deps and recomends
+a808c81 osdmap, mon: switch the params of print_tree()
+aa62dcb osdmaptool: dump 'osd tree' in specified format
+acdfd98 doc: fix typo in placement-groups.rst
+9c8f8d2 doc: fix a wrong quote in release.rst
+2cc7aee mon: MonitorDBStore: get_next_key() only if prefix matches
+2934909 mon: Monitor: allow updating scrub interval on demand
+e77b3f4 mon: Monitor: allow scrub to timeout after a while
+80ce9b0 mon: Monitor: inject missing key failures during scrub
+ba4a2c1 mon: Monitor: inject scrub failures
+4b6d081 mon: Monitor: wait_for_paxos() before scrubbing
+0a19fd4 mon: Monitor: rework scrub (2)
+90eb776 ReplicatedPG::trim_object: write filtered snapset while we're at it
+6051e25 ReplicatedPG: start_flush: use filtered snapset
+933df03 tests: fix the main() documentation of ceph-helpers.sh
+a8e4b4f tests: remove unused CEPH_HELPER_VERBOSE in ceph-helpers.sh
+eb9dbef doc: fix crush-ruleset-name param description
+c00e393 OSDMonitor: fix prepare_pool_crush_ruleset() retval
+29f11c7 CrushWrapper: validate default replicated ruleset config opt
+f032c40 OSDMap: respect default replicated ruleset config opt in build_simple()
+ea4f942 tests: a couple tweaks to osd-pool-create.sh
+81faac7 debian/copyright: update copyright for test/perf*
+cc92872 scripts: Add a helper to make release notes
+2879b0c .gitignore: systemd/ceph-osd at .service
+f814298 packaging: include ceph_perf_local
+61d70db ceph_perf_local: rename from ceph_perf
+178484b librbd: re-add missing discard perf counters
+ff79959 osd_types::is_new_interval: size change triggers new interval
+93af382 PendingReleaseNotes: document changes to librbd's aio_read methods
+948b15e tests: update librbd AIO tests to remove result code
+c77bce3 librbd: AioRequest::send no longer returns a result
+9ab42d6 librbd: internal AIO methods no longer return result
+b88b88c Throttle: added pending_error method to SimpleThrottle
+6d1d0c8 librbd: add new fail method to AioCompletion
+3a7b5e3 librbd: avoid blocking AIO API methods
+afb896d librbd: add task pool / work queue for requests / callbacks
+b3f5a75 WorkQueue: added virtual destructor
+24a33e9 WorkQueue: add new ContextWQ work queue
+a7bce20 doc: updates to v0.94.2 draft release notes
+d7a2349 doc: update the development workflow
+ac347dc Template systemd/ceph-osd at .service with autotools,
+2b23327 Mutex: fix leak of pthread_mutexattr
+38a319d qa/cephtool: add blacklist json output check
+8ef6f86 osd: fix blacklist field in OSDMap::dump
+4cc0f2f KeyValueStore: Add collect_metadata support
+7b5fc50 KeyValueStore: Avoid extra lookup for map
+8113051 os : remove unused GenericObjectMap::sync() funtion since no caller invoke this function
+db7936a erasure-code: implement consistent error stream
+0822922 erasure-code: do not leak shec instance on failure
+6ca6006 erasure-code: lrc size test depends on layer semantic
+21036cf erasure-code: define the ErasureCodeProfile type
+52440c4 rbd: document mount_timeout in the man page
+2daaa61 mds: fix use-after-free in SessionMap::remove_session
+0a98b02 dump keyvaluestore transaction to a file
+2a1493c test/ceph_test_rados: consolidate ReadOps
+9b62cf2 ceph-disk: support NVMe device partitions
+f417eda tests/test-erasure-code: spin off eio tests into another testsuite
+2230def tests: fix the get_config()
+33eae4e xio: fix reuse of outer loop index in inner loop
+367a5fc cmake: add missing source file to test_librbd
+a8fca3c cmake: add missing common/util.cc dependency
+15dd70c cmake: skip man/CMakeLists.txt
+7c1bae5 tests: don't choke on deleted losetup paths
+f9ba711 dev/rbd-diff: clarify encoding of image size
+ab8e9e3 tests: CEPH_CLI_TEST_DUP_COMMAND=1 for qa/workunits/cephtool/test.sh
+5c69f5e tests: ceph create may consume more than one id
+1dac80d rgw: Use attrs from source bucket on copy
+522b495 test/ceph_test_rados: add tests of pipeline reads
+11fef22 doc: recommend opening entire 6800-7300 port range
+4fe7d2a RadosModel: randomly prefix delete with assert_exists
+121aa3b RadosModel: assert exists on subsequent writes
+b50cc94 doc: update OSD port range to 6800-7300
+3dda5fa xio: malloc if xio_mempool_alloc fails
+5c14a69 xio: fix for xio_msg release after teardown
+16d1c1e xio: use ceph clock for timestamps
+c2bba8e xio: save nonce for bind address
+355aa0e xio: check if connection is on list before erasing
+bb621b0 xio: better way to assign connections to specific lane
+855a70d test/aio: aio completion is not released
+39eb454 test/ceph_test_rados: aio completion is not released for ReadOp
+d118e56 osd/ReplicatedPG: need to queue op when avoiding dup promotion in maybe_handle_cache
+115545a ceph-detect-init/debian/__init__: improved syntax
+110608e debian: move ceph_argparse into ceph-common
+ca6abca tools: add --no-verify option to rados bench
+6344fc8 osd: use another name for randomize scrub option
+5e44040 osd: randomize scrub times to avoid scrub wave
+0f7f356 osd: use __func__ in log messages
+2ab0e60 osd: simplify OSD::scrub_load_below_threshold() a little bit
+f9e5b68 qa: unbreak concurrent.sh workunit
+c2d17b9 test/librados/snapshots.cc: add test for 11677
+b894fc7 tools: Don't delete, recreate and re-fill buffers in rados bench.
+55a6f9e bufferlist: implement bufferlist::invalidate_crc()
+a13414d mds: avoid calling StrayManager::eval_stray() recursively
+de8b958 mds: properly invalidate past child snaprealms
+5576686 mds: unlink any stale remote snap dentry when evaluating stray inode
+bf40b9b rbd: expunged xfstests generic/078
+2eca536 DBObjectMap::sync: add comment clarifying locking
+f898ec1 debian: set rest-bench-dbg ceph-test-dbg dependencies
+9ed4919 mds: fix handling missing mydir dirfrag
+e4ca468 tests: reduce make check verbosity
+64f584a ceph-detect-init: fix pep8 extra space
+855aeee ceph-detect-init: run-tox.sh always succeeds
+38e60de Perf: used to perf local hardware capacity
+d723de6 erasure-code: update ceph-erasure-code-corpus for shec
+3cf3ac3 cryptic error message in ceph interactive mode
+d688b0c ceph_detect_init: added linux mint
+e5564a8 tests: {mon,osd}-test-helpers.sh are deprecated
+ae2d489 tests: ceph-helpers.sh remove redundant setup in main
+e985983 tests: test/mon/mon-handle-forward.sh uses ceph-helpers.sh
+1209786 tests: test/mon/osd-pool-create.sh uses ceph-helpers.sh
+88e954b tests: test/osd/osd-bench.sh uses ceph-helpers.sh
+e3f03d8 tests: test/osd/osd-config.sh uses ceph-helpers.sh
+0f56553 tests: test/osd/osd-copy-from.sh uses ceph-helpers.sh
+c693ec1 tests: test/vstart_wrapper.sh uses ceph-helpers.sh
+4a26ca2 tests: test/mon/misc.sh uses ceph-helpers.sh
+aa99929 tests: test/erasure-code/test-erasure-code.sh uses ceph-helpers.sh
+7aca470 tests: test/mon/osd-crush.sh uses ceph-helpers.sh
+f3cb870 tests: test/mon/osd-erasure-code-profile.sh uses ceph-helpers.sh
+66dcda9 tests: kill mon-test-helpers.sh call_TEST_functions
+ea61c8a tests: ceph-helpers.sh do not hardcode id a in run_mon
+b9447bc tests: ceph-helpers.sh shows ceph report if not clean
+de20f61 tests: ceph-helpers.sh implement wait_for_osd
+449ef48   init-ceph.in: Create osd data dir before fs_type check.   One host in cluster crashed and rebuilded, but failed to start osds   because the data dir not exist.
+db7ab8b doc: add rbd cache settings to nova.conf
+886f5a9 common/xattr: #define ENOATTR ENODATA
+b2cd80c os/chain_xattr: s/ENODATA/ENOATTR/
+c6cdb40 os/chain_xattr: stripe shortish xattrs over small chunks for XFS
+8614dce os/chain_xattr: handle read on chnk-aligned xattr
+584ed2e ceph.spec.in: SUSE/openSUSE builds need libbz2-devel
+11b7801 OSDMonitor: disallow ec pools as tiers
+13c0fca src/test/librados/tier.cc: remove OmapOperation test
+363d957 ceph.spec.in: tweak ceph-common for SUSE/openSUSE
+46404dd ceph.spec.in: consolidate centos/rhel macros
+47277c5 watch/notify: Clarify comment where notifies are completed
+bef09e0 test/librados/tier.cc: destroy and recreate cache pool on every test
+bbec53e mon: prevent pool with snapshot state from being used as a tier
+f11de85 mds: fix handle_mds_map in standby_replay
+3bd1cc4 doc: s/relase/release/
+9eb760d KeyValueStore: Fix the prefix comparion to avoid object leaks.
+c199b7b KeyValueStore: Initialize the iterator
+ab30ff2 KeyValueStore: optimize the object header writes
+a46b333 do not return non-exist extents when doing sparse read
+389ae67 rgw: merge manifests correctly when there's prefix override
+9d8c115 init-radosgw.sysv: remove
+1c45f51 init-radosgw: unify init-radosgw[.sysv]
+a4bb992 init-radosgw: look in /var/lib/ceph/radosgw
+0c5b9a2 mds: validate the state+rank in MDS map
+46a8e08 mds: respawn instead of suicide on blacklist
+7fe435c mon: handle DNE beacon from MDS
+c565d87 mds: on suicide(), send a DNE beacon to MDSMonitor
+fdffcc4 doc: customize css for the group title
+f59d6e1 doc: use @name to define a group, not @group
+15b196f test: add test for rmxattr on a removed object
+11f438c osd: return with ENOENT if object doesn't exist during rmattr
+362d3b7 SubmittingPatches: fix typo
+34ef937 pybind: Avoid type error in crush rule (str, not an int)
+dcf8c59 Doc: add a note about a CRUSH corner case with `ceph osd out`
+1bcea5a tools: remove obsolete aio_bench argument placeholder
+fe79bab common: fix ObjBencher::aio_bench signature
+cb1c6b0 OSD::OpWQ: fix spacing
+732e5fe osd/: fast dispatch, standard op path for MOSDRepScrub
+42228d0 doc: recommendations related to performances / cleanup
+9016269 test: Add config changes to all tests to avoid order dependency
+73b1731 osd: Add admin socket feature set_recovery_delay
+545c78c ceph-objectstore-tool: For import/export --debug dump the log
+32be302 test: Use ceph-objectstore-tool --no-overwrite for import-rados
+e0627b0 test: Test ceph-objectstore-tool --dry-run in select operations
+ece630d test: Fix ceph-objectstore-tool test missing fd.close()
+11c13eb test: Fix ceph-objectstore-tool test error message
+c6e4166 ceph-objectstore-tool: If object re-appears after removal, just skip it
+35cdcab ceph-objectstore-tool: Add --no-overwrite flag for import-rados
+3e7fe7e ceph-objectstore-tool: Remove list-lost because now we have --dry-run flag
+2795161 ceph-objectstore-tool: Add --dry-run option
+244092d ceph-objectstore-tool: Add dump-info command to show object info
+c00290b ceph-objectstore-tool: Use empty string for <object> to specify pgmeta object
+fdb1a4b ceph-objectstore-tool: Add a couple of strategically placed prints
+a50679a ceph-objectstore-tool: Clean up error handling
+548147b ceph-objectstore-tool: Create section around log/missing/divergent_priors of --op log
+6ad7e9b ceph-objectstore-tool: Add divergent_priors handling
+94b3d76 test: ceph-objectstore-tool: Remove duplicate debug messages, keep cmd/log/call together
+d80ce66 test: ceph-objectstore-tool import after split testing
+1e18e8a test: Use CEPH_DIR where appropriate
+868cf45 test: Limit how long ceph-objectstore-tool test will wait for health
+dff17a8 test: Add optional arg to vstart() to provide additional args to vstart
+9b08bcf test: Test ceph-objectstore-tool --op dump-journal output
+b014110 test: Pep8 fixes for ceph-objectstore-tool test
+2f4090d ceph-objectstore-tool: Add --force option which is used for import only
+24f0659 ceph-objectstore-tool: Fix pgid scan to skip snapdirs
+381605e ceph-objectstore-tool: Add dump-journal op
+130eba4 osd: FileJournal: Add _fdump() that takes Formatter instead of ostream
+1b9f85b test: Fix ceph-objectstore-tool test, overwrite OTHERFILE so second check is meaningful
+e30efe6 test: ceph_test_filejournal: Conform to test infrastructure requirements
+d540dde test: ceph_test_filejournal need to force aio because testing with a file
+c94fe8d test: ceph_test_filejournal fix missing argument to FileJournal constructor
+8da4d9f test: ceph_test_filejournal  Add check of journalq in WriteTrim test
+159f4cc filestore: Change int open_journal() to void new_journal()
+d580d56 osd: Fix log output in FileJournal::read_entry() coming from do_read_entry()
+96c0d07 os: Add some logging used only by test code
+8d5464c osd: Add simple_dump() to FileJournal for unit testing
+12fdf4c osd: FileJournal clean-up
+3c97b32 osd: Dump header in FileJournal::dump()
+a7cda53 osd: FileJournal::read_entry() can't use a zero seq to check for corruption
+f2d8352 osd: Fix flushing in FileJournal::dump()
+409328b ceph-objectstore-tool: On any exit release CephContext so logging can flush
+fb2f9d5 ceph-objectstore-tool: Check for keyvaluestore experimental feature
+cfca683 ceph-objectstore-tool: Eliminate obscure "Invalid params" error
+565b413 ceph-objectstore-tool: Check pgid validity earlier like we did before
+fb42407 ceph-dencoder: Add ability to import from stdin by specifying "-"
+67b7340 ceph.spec.in: include SUSE in _with_systemd
+25caea8 buffer.h: added some consts
+1b45f2e buffer.h: Removed unnecessary copy constructor and assignment operator
+2b06dc3 rgw: return 412 when bad limit specified on listing buckets
+f0c14e0 test/cli-integration/rbd: verify formatting for 'rbd disk-usage'
+53514ad ceph.in: do not throw on unknown errno
+6306bed qa/workunits/rados/test_rados_tool: added test for omapsetval via stdin
+7ffafc5 rbd: add disk usage tool
+92ed6e5 tests: disable deep-flatten feature for the krbd fsx test
+7dea175 rados cli: setomapval can now read from stdin
+faa210e tests: disable fast-diff feature for the krbd fsx test
+6130365 ceph.spec.in: include SUSE in _with_systemd
+283de97 ceph-detect-init: plain setup.py install on make install
+f88275d tests: tiering health report reworked
+5f252d6 tests: no agent when testing tiering agent border case
+b2c40d5 tests: uncomment tiering agent tests
+8af9c05 doc: more entries for release notes in v0.94.2
+78c73c5 install-deps.sh: only pip install after make clean
+61030e6 client: pin request->target when target dentry is not pinned
+9591df3 client: invalidate kernel dcache when cache size exceeds limits
+54fd4e0 common: fix the macros for malformed_input::what()
+801799d test/KeyValueDBMemory : update init()
+9754764 os/RocksDBStore: interpret some configurations.
+580ccae rgw: restore buffer of multipart upload after EEXIST
+fcea610 common/str_map: trim start/tailing writesapce in key and value.
+9d6f9c9 /s/omap_rocksdb_option/filestore_rocksdb_option
+7f6eed9 os/RocksDBStore: compact_on_mount not support now
+7c16916 keyvaluestore:use keyvaluestore_rocksdb_options if backend is rocksdb
+dac2878 os/FileStore:use omap_rocksdb_options if backend is Rocksdb
+c0311cc mon/MonitorDBStore: use mon_rocksdb_options if backend is Rocksdb
+39dc866 common/config_opts.h: use string for rocksdb options.
+9c64bae Add an test checking if frequent used options works.
+8915dba os/RocksDB: use GetOptionsFromString
+3d591af mon: prevent bucket deletion when referenced by a rule
+9324d0a crush: fix crash from invalid 'take' argument
+944eb28 RBD: expunge test broken by xfs commit
+a97566a doc: add ceph-create-keys.rst to dist tarball
+28f0230 erasure-code: bench.sh compares isa & jerasure, vandermonde & cauchy
+7476e94 Avoid an extra read on the atomic variable
+753122d erasure-code: add --erased option to ceph_erasure_code_benchmark
+613e200 erasure-code: add display_chunks helper to ceph_erasure_code_benchmark
+6760145 tools: fix tabletool reset of nonexistent sessionmap
+a492593 tools: improve journaltool reset
+237f76b mds: expose static default file layout methods
+95beceb tools: fix tabletool reset snap
+0083445 client: subscribe to map on pool full
+4826152 client: clarify some comments
+1fa9282 client: check per-pool full flag in op path
+6eca240 osdc: implement Objecter::osdmap_pool_full
+dbe9ec1 client: op cancellation on per pool full flag
+dbf697d osdc/Objecter: op_cancel_writes return whether anything cancelled
+1d2290e osdc/Objecter: allow per-pool calls to op_cancel_writes
+ab1e539 mds: clear CDir::STATE_REJOINUNDEF after fetching dirfrag
+8ccce25 mailmap: Jean-Charles Lopez affiliation
+738ec60 mailmap: Hervé Rousseau affiliation
+995ab97 mailmap: Rajesh Nambiar affiliation
+ae16ee7 mailmap: Douglas Fuller affiliation
+efa2b3c mailmap: Orit Wasserman affiliation
+39b27be mailmap: Nathan Cutler affiliation
+1203bcd mailmap: Tim Serong affiliation
+dd558b6 mailmap: Fabio Alessandro Locati affiliation
+939fc04 mailmap: Henry Chang affiliation
+ffdd14b mailmap: Gerhard Muntingh affiliation
+64e113a mailmap: Alistair Israel affiliation
+e9300cb mailmap: Sergey Arkhipov affiliation
+7d2e49a mailmap: Neha Ummareddy affiliation
+f76bf6c mailmap: Xingyi Wu affiliation
+0f44127 mailmap: Ning Yao affiliation
+95a881f mailmap: Joao Eduardo Luis affiliation
+2738d02 ECBackend: eliminate transaction append, ECSubWrite copy
+3699a73 mon: fix the FTBFS
+07cf4f7 doc: release notes for hammer v0.94.2
+e1f1c56 mon/PGMap: add more constness
+990dfb6 mon/PGMap: sort pg states by the states in "pg ls" spec
+89f89ca mon: s/recovery/recoverying/ in "pg ls*" commands' spec
+6158f18 Update XIO client connection IP and nonce
+88bf25d doc: rbd-recover-tool: fix typos in comments
+05cf0db doc: rgw: fix typo in comments
+67fd3c4 doc: add release numbering explanation
+0968134 doc: move the release cycle after the timeline
+67da31c debian: add bootstrap-rgw dir to ceph package
+bf93128 packaging: Add rgw placeholder dirs in proper packages
+ee54e71 librbd: object map clean state valid only for fast-diff
+88fabb1 common/admin_socket: close socket descriptor in destructor
+a2ee8d7 AsyncConnection: Avoid assert since replacing may inject when unlock
+67da8fe osd: Show number of divergent_priors in log message
+de81cd2 osd: Fix double counting of of num_read/num_write
+dc863fb install-deps.sh: increase pip install timeout
+b29cc4c doc: Fix .gitignore for 'doc' directory
+f43bf2e cmake: add common_utf8 lib to link unittest_str_map
+1fd2885 cmake: Add missing pthread lib
+6ce5262 cmake add mds and librbd missing files
+31b9dc4 cmake: add crc assembley files
+2df0693 rgw: always check if token is expired
+0e26e9f tests: ceph-helpers kill_daemons fails when kill fails
+e7b196a mon: remove unused variable
+c255e80 install-deps.sh: exit on error if dependencies cannot be installed
+7b28a6f tests: pip must not log in $HOME/.pip
+6b68b27 json_sprit: fix the FTBFS on old gcc
+74f3b5c AsyncConnection: verify connection's state is expected
+b863ccb Added a "ceph hello world" for a simple check for ceph-deploy qa suite
+13abae1 Added a "ceph hello world" for a simple check for ceph-deploy qa suite
+f36bd0e tests: need 1024 files per process not 100024
+8cbd92b librbd: ObjectMap::aio_update can acquire snap_lock out-of-order
+0e95986 tests: fail make check if nproc is too low
+e08cf25 client: fix error handling in check_pool_perm
+289ee3b client: use SaferCond in check_pool_perm
+71bf5f1 mon: add 'node ls {all,mds,mon,osd}' asok
+79439d4 mds,mon: add 'mds metadata' command
+73390d2 mon: extract gid_from_arg() out of fail_mds()
+1b0386b mon: add "mon_metadata <id>" command
+0e3bee6 osd: extract collect_sys_info() out of OSD::_collect_metadata()
+5c6190d common: rados bench data.finished = 0
+1ac7279 common: remove unused maxObjectsToCreate
+7289542 src/CMakeLists.txt: fix build to work with nss
+4bb8843 CMakeLists.txt: fix detection of nss
+67e2caa crypto: fix cmake error '#elif with no expression'
+ff49933 CMake: add FindNSPR.cmake
+b50ec2f client: avoid sending unnessesary FLUSHSNAP message
+2801c1b doc: clarify the release cycle and LTS meaning
+d776f93 doc: add v9.0.0 to the release timeline
+2c62430 doc: replace End Of Life with retirement
+d21d545 doc: fix releases broken links
+6b0bdc3 librbd: copyup now optionally updates the snapshot object maps
+a1eab0f librbd: allow updates to snapshot object maps
+6511eb0 librbd: AsyncRequest is now optional for AsyncObjectThrottle
+d5b4e78 librbd: progress context is now optional for AsyncObjectThrottle
+f76f772 tests: verify object map is updated after full copyup
+e2be341 tests: verify new librbd snapshot copyup behavior
+64cbd90 tests: add new feature code to rbd test list
+e01a064 tests: verify cls_rbd::remove_parent handles snapshots
+9632cba cls_rbd: remove_parent now updates snapshots if deep-flatten enabled
+bc66b78 librbd: always deregister child clone from parent with deep flatten
+1271947 librados_test_stub: support AIO snapshot context
+9fd708b librbd: copyup should use empty snapshot context
+7be3df6 librbd: move copyup class method call to CopyupRequest
+b556d31 librbd: add new deep-flatten RBD feature
+819c980 librbd: fast diff should treat all _EXISTS objects as dirty
+fc42837 librbd: fix warning
+1b6a5e9 librbd: fast diff of first snapshot loads incorrect object map
+baebe74 librbd: fast diff is incorrectly flagging objects as updated
+f25aa5f librbd: ignore lack of support for metadata on older OSDs
+3f2946a OSD: add op_wq suicide timeout
+f2fbfa3 OSD: add remove_wq suicide timeout
+547a704 OSD: add scrub_wq suicide timeout
+e1073a4 OSD: add snap_trim_wq suicide timeout
+85311b6 OSD: add recovery_wq suicide timeout
+df4e5de OSD: add command_wq suicide timeout
+cd0f2b7 obj_bencher: does not accumulate bandwidth that is zero
+70585a6 rados cli: fix documentation for -b option
+fbd6646 test_libcephfs: Fix zero length read tests
+8add15b json_spirit: use utf8 intenally when parsing \uHHHH
+835b12f librbd: fix the image format detection
+899dd23 configure.ac: no use to add "+" before ac_ext=c
+5b2357e configure.ac: add an option: --with-man-pages
+69248bc ceph-disk: use ceph-detect-init instead of hard coded values
+1dad037 tests: ceph-disk.sh must exit 1 on error
+b4a441a ceph-detect-init: package for rpm and debian
+35a16ff ceph-detect-init: integration with automake / make check
+8ee5a82 ceph-disk: implement activate --no-start-daemon
+9b6e1cd ceph-disk: pep8 conformance
+73e2c74 ceph-detect-init: ceph-disk helper to select the init system
+73d16f6 mon: Total size of OSDs is a maginitude less than it is supposed to be.
+a87ac4d obj_bencher: aio_bench - rename op_size to object_size
+0dea11d rados cli: add preventing using --block-size with bench seq and rand
+1d1d0aa obj_bencher: remove excess 'object_size = op_size'
+31d16e9 obj_bencher: remove 'trans_size' as obsolete
+538e6ea doc/release-notes: 9.0.0
+c9a6e60 src/script: remove obsolete scripts
+60be2d6 common/buffer.cc: change linkage of few variables to internal
+9282f15 SubmittingPatches: clarify backport procedure
+41e4cbe tests: add librbd watch/notify version compatibility test
+124b1d3 qa/workunits/rbd: add notify_master/slave bootstrap scripts
+9039955 qa/workunits/rbd: add new test_librbd_api workunit
+6fe94c8 tests: create librbd API-only integration test suite
+2b5f0fc automake: allow multiple {install,all,...}-local targets
+8c7a781 build: make-debs.sh NPROC overrides make -j
+8b7953a install-deps.sh: pip wheel for python dependencies
+9784e5d tests: install sudo on ubuntu-12.04 container
+125c59b tests: erasure coded pools do not allow offset on creation
+a4f1256 osd: refuse to write a new erasure coded object with an offset > 0
+5c9e9da rados: Tell that pool removal failed. Not that it doesn't exist.
+4c4313d mon: Monitor: rework scrub (1)
+cdb37dc mon: Monitor: track scrub's state
+59bc72d mon: Monitor: schedule scrub periodically
+886096d mds: properly set null dentry's first
+fb484b6 mds: only add head dentry to bloom filter
+752a16f AsyncConnection: Avoid lockdep detect failed
+c26b21f AsyncConnection: Don't dispatch event when connection is stopped
+63eb432 librbd: don't notify_change on error when notify async complete
+9da4c45 doc: link to sepia report for lab info
+bd79891 9.0.0
+156e55b man: do not dist man pages if sphinx is not available
+f76293c test_async_driver: add dispatch_external_event tests
+c107b8e configure.ac: do not check for sphinx-build
+0b4315b pycephfs: add tests for open call
+fc51ce2 osd/ReplicatedPG: For obj has omap, it mean have omap data or omap header or have both.
+3034213 test/bufferlist: add test case for bufferlist::splice.
+6b81d5c test_pycephfs: Fix tests for pycephfs
+d039bce pycephfs: add padding buffer to statvfs to avoid os extend
+56b8222 pycephfs: add read/write support
+fa956c6 test_pycephfs: Add tests for open call
+b154a5d pycephfs: Let open call's flags imitate python open
+1006adb bufferptr: Offset should not beyond raw_length rather than _len.
+9209a8f osd: silence gcc -Wparentheses warning
+5a6140f common/CompatSet: mark FeatureSet member vars private
+817b071 mds: don't reintegrate stray inode to remote snap dentry
+1800a99 librbd: For format 2, don't forget set objectcache max objects.
+c60212b rbd: updated the 'rbd --help'
+9d07069 librbd: remove the perfcounter l_librbd_aio_rd/write/discard_*.
+059c0d1 librbd: Remove the redundant judgement.
+e146b04 librbd: Remove the redundant judgement.
+1de75ed Preforker: include acconfig so includers don't have to
+69971f9 osd: dout the relation between osd_reqid_t and journal seq
+4fed200 osd: dout latency info in filestore
+05a5f26 librbd: object map is not properly updated when deleting clean objects
+7213352 atomic_t: change cas to compare_and_swap and return bool
+a0f96de mds: in damaged() call flush_log before ending
+8803776 mon: add MonClient::flush_log
+a5e88fc librbd: invoking RBD::open twice will leak memory
+585bc2b mds: send FLUSHSNAP_ACK even if FLUSHSNAP message is unexpected
+fbfd50d OSD: handle the case where we resurrected an old, deleted pg
+32b8bf5 test: update CMakefile to sync with c44f8e7
+b7f4328 systest_runnable: adjust argument to suite Preforker
+9699246 mds: handle missing mydir dirfrag
+6cb1f6c mds: handle missing/corrupt data during boot
+816d9e0 mds: error handling in CInode::_fetched
+4730a45 mds: remove redundant 'journal' file open
+69a54e7 mds: handle case where rank 0 has no subtrees
+7252975 mds: handle corrupt dentry more cleanly
+49efdb8 mds: more damaged() handling in MDLog
+de5717e ceph_mon: output preforker error message
+9e1514c Preforker: Add child process exit status check
+3f969d6 systest_runnable: Use Preforker to do fork
+e0b389a test_libcephfs: Add tests to cover zero length write
+61cf5da packaging: mv ceph-objectstore-tool to main ceph pkg
+735ab91 Client: Fast return if len is 0
+2586e3b pybind: fix valgrind warning on rbd_get_parent_info call
+5ccc442 osdc: invalid read of freed memory
+740fd27 krbd: fix incorrect types in the krbd API
+c44f8e7 fsx: cleanup crypto library at exit
+6e84ceb autotools: detect presence of valgrind
+30b762b qa/workunits/rbd: add support for running API tests under valgrind
+5534faa tests: run librbd valgrind test during 'make check'
+8d87bdf valgrind: update valgrind suppressions for lttng-ust
+5311eb0 Event: Use atomic cas to avoid missing wakeup
+dd3a3b9 Atomic: Add CAS primitive support
+280ad75 pycephfs: Add stat result class definition for callers
+d7ffc71 pycephfs: Add dirent class definition to ease caller
+0f76ca0 test_cephfs: Add python binding cephfs tests
+14c953f pycephfs: Ensure passing argument is a instance of basestring
+448a6c1 pycephfs: export self and parent dirent too
+940b4f0 pycephfs: Add rename, rmdir and getxattr support
+0803031 pycephfs: Fix getcwd call
+08a60bb pycephfs: Add directory operations for pybind
+38c9d75 config: Document that 'mds_max_file_size' is only read on FS creation
+fdd7f8d Fix "disk zap" sgdisk invocation
+e71e865 run-make-check.sh: stricter check for yum/apt-get/zypper
+05534b8 run-make-check.sh: use /sbin/modprobe
+ae6247a AsyncConnection: Fix connection doesn't exchange in_seq problem
+4cd07f2 AsyncConnection: Ornament log output
+3f39894 librbd: missing an argument when calling invoke_async_request
+7029265 packaging: move SuSEfirewall2 templates out of src
+55414ae ceph-fuse: check return value on system() invocation
+4757bf9 rgw: fix broken account listing of Swift API.
+1ff409e common/config: detect overflow of float values
+d62f80d common/config: detect overflow of int values
+3fb26bc use get_linux_version() instead ad-hoc uname().release parsing
+77685f5 packaging: add SuSEfirewall2 service files
+ab51130 Event: Delete driver after cleanup
+caa9f0e rgw: fix ListParts response
+04b0002 qa/workunits/post-file: pick a dir that's readable by world
+999dcc8 Revert "osd: For object op, first check object whether unfound."
+8e20240 librbd: TaskFinisher should finish all queued tasks
+ea5107c librbd: librados completions are not properly released
+ed5472a tests: fix valgrind errors with librbd unit test
+6ab1bb5 tests: librbd should release global data before exit
+54c8825 librados_test_stub: cleanup singleton memory allocation
+a84337b osd/ReplicatedPG: fix an indent in find_object_context
+f93df43 doc: add giant v0.87.2 to the release timeline
+86788c4 mds: remove caps from revoking list when caps are voluntarily released
+6e50f64 add perf counter for rocksdb to evaluate latency of get and transaction commit
+d2fb5bd add perf counter for leveldb to evaluate latency of get&commit
+fd11e32 doc/release-notes: v0.87.2
+24f4774 Swift: Set Content-Length when requesting/checking Keystone tokens
+972e9a5 FileJournal: fix check_for_full
+3c4028e client: check OSD caps before read/write
+387a09e tests: AioCompletion incorrectly freed
+fd7723a librbd: update ref count when queueing AioCompletion
+0e499f3 tests: rbd cli integration test should explicitly select image format
+6d24907 librbd: do not attempt to retrieve metadata for old format images
+f141e02 librbd: flatten should return -EROFS if image is read-only
+594a661 librbd: allow snapshots to be created when snapshot is active
+32c41f8 cls_rbd: get_features needs to support legacy negative tests
+e97fd50 rgw: simplify content length handling
+79d17af rgw: make compatability deconfliction optional.
+06d67d9 rgw_admin: add --remove-bad flag to bucket check
+8a7e58e AsyncMessenger: Don't need to join thread if not started
+7789eef ceph.in: handle unknown Exception correctly
+da9d2b4 ceph.in: improve the interactive mode
+bc7d8c9 ceph.in: parse quote correctly in interactive mode
+0ea0e01 Fix clear_pipe after reaping progress
+eb6738c civetweb: add .gitignore
+cc5d346 rgw: use correct oid when creating gc chains
+47339c5 init-radosgw: run RGW as root
+c262259 rgw: civetweb should use unique request id
+bd05024 test_librbd_fsx: flush before discard in krbd mode
+92574ab Throttle: reset max only if max changed
+d7de585 Throttle: fix wait/get() with new max
+62522b0 configure.ac: check for libcurl and libxml2 if build libs3
+bc8d14d Throttle: add more constness
+127c931 Throttle: improve doxygen comments in Throttle
+2b2c5d1 libexpat is now used for rgw
+be4355a rgw-admin: a tool to fix object locator issue
+3e38eab rgw: improve content-length env var handling
+3d4a1d2 rgw: set a special object locator if object starts with underscore
+98b59ba rgw: fix s3 list buckets
+0b37894 librbd: updated cache max objects calculation
+27cee2f doc: add ceph-create-keys.8
+a4627aa doc: rework ceph-conf.8
+6fd356b misc: fix make debs script
+8e026c7 Fix the disablement of pgrefdebugging
+37edd16 mds: fix integer truncation
+8f30db8 test: add test-case for repair unrecovery-ec pg.
+bdd6205 osd: Remove the duplicated func MissingLoc::get_all_missing.
+b16b080 osd: For object op, first check object whether unfound.
+d51806f osd: Fix ec pg repair endless when met unrecover object.
+f850470 PG: For needs_recovery, asap return if need recover.
+34cb1f0 PG: remove the duplicated code.
+f4e7d4a PG: fix logic error: don't set need_recovery if peer_missing don't have acting osd.
+40203f7 auth: return error code from encrypt/decrypt; make error string optional
+7762f18 auth: optimize crypto++ key context
+973cd1c auth/Crypto: optimize libnss key
+16b3515 auth: refactor crypto key context
+fb4b6c5 unittest_crypto: benchmark 100,000 CryptoKey::encrypt() calls
+8d16d4c auth/cephx: optimize signature check
+4e14a5f auth/cephx: move signature calc into helper
+e972a69 auth/Crypto: avoid memcpy on libnss crypto operation
+e874a9b do_autogen.sh: unambiguously use nss unless user asks for cryptopp
+ad5a154 auth: make CryptoHandler implementations totally private
+6063a21 logrotate.conf: prefer service over invoke-rc.d
+8aea730 tests: osd-bench.sh must be verbose on failure
+14cb7b8 tests: convert osd-bench.sh to ceph-helpers.sh
+5871781 ceph-helpers: implement test_expect_failure
+7a432f7 civetweb: update max num of threads
+3bfdf54 doc: don't mention ceph osd setmap
+ef7e210 librbd: better handling for duplicate flatten requests
+12f1b30 client: add failure injection for not advancing oldest_tid
+901fd1a mds: include size of completed_requests in session dump
+dcd9302 mds: warn when clients are not advancing their oldest_client_tid
+009664e rgw: force content-type header for swift account responses without body
+b2b443c man: fix the description in NAME section
+d85e0f8 tools: ceph-monstore-tool must do out_store.close()
+ebda4ba Increase max files open limit for OSD daemon.
+a5c0674 client: inlcude inline data in FLUSHSNAP cap message
+2fad86e client: flush dirty snap data before allowing new writes
+d08d834 mds: properly update capability's client_follow
+c9fe13f mds: fix Locker::_do_null_snapflush
+fa34e04 mds: do null snap flush when processing embeded cap releases
+ab44e59 mds: make sure snap inode's last match existing snapshots
+fc758ce mds: remove stray dentry from delayed list when it's queued for purging
+5c09be2 mds: move snap inode to new dirfrag's dirty_rstat_inodes
+d900252 mds: keep auth pin on CDir when marking it dirty
+2e90fa3 osd/PG: check scrub state when handle CEPH_OSD_OP_SCRUB_MAP.
+d2a6728 rgw: don't use end_marker for namespaced object listing
+58a144d rgw: adjust return code if can't find upload
+6ee4f64 rgw: fail if parts not specified on complete-multipart-upload
+66edf62 librbd: always initialize perf counters
+66493b7 cls_rbd: get_features needs to support legacy negative tests
+d49888e rgw: prohibit putting empty attrs on Swift account/cont.
+6322051 rgw: unify mechanisms for setting TempURL and Swift account metadata.
+7dd54fa rgw: make rgw_read_user_buckets() backward compatible.
+f7b92f9 rgw: rectify support for GET on Swift's account with limit == 0.
+837388b rgw: fix lack of account name in XML listing of Swift account.
+16af675 test: add test-case for copy_get flags.
+2b4acfb rados: Add new field flags for ceph_osd_op.copy_get.
+b05d144 AsyncConnection: Fix deadlock cause by throttle block
+0c75b88 AsyncConnection: Avoid lockdep check assert failure
+69bedcd Event: Fix memory leak for notify fd
+cc5f144 mon: osd df: fix average_util calculation
+0374f32 Update Haomai's organization
+e4ebe10 test/cli-integration/rbd: add unmap test
+b1d3f91 rbd: allow unmapping by spec
+087ed06 krbd: add krbd_unmap_by_spec()
+a1363c4 krbd: rename should_match_minor() to have_minor_attr()
+7df2cf2 client: drop inode when rmdir request finishes
+933332b tests: separate check_PROGRAMS from TESTS
+e36df09 doc: Removes references to s3gw.fcgi in simple gateway configuration file.
+41b132f test: return 0 when evicting an not existing object
+1fa343f osd/ReplicatedPG: correctly handle the evict op when obs doesn't exist
+a5b7dec osd: skip promotion for flush/evict op
+d42c2c4 osd: refactor the skip promotion logic
+4b2c6e3 osd/OpRequest: add osd op flag SKIP_PROMOTE
+193f1e3 osd: avoid txn append in ReplicatedBackend::sub_op_modify
+3fdace6 osd: avoid txn append in ReplicatedBackend::submit_transaction
+e89ee9e osd: allow multiple txns to be queued via backend
+0a442ee tests: comment out unstable tiering tests
+34c467b use git://git.ceph.com
+16a3fbd ceph-authtool: exit(1) when printing a non-existing key
+dc43880 releases: table of estimated end of life
+932d59d os/LevelDBStore:fix bug when compact_on_mount
+36db6d2 os/RocksDBStore: fix bug when compact_on_mount
+73fb61a osd: misc latency perf counter for tier pool
+dde64b3 FileStore:: queue_transactions, fine-grain submitManager lock there is no need to lock the code, which is dealing with encoding tls to tbl. The submitManager lock is used to make sure the sequencial submission for op. So encoding transaction data out of lock.
+b3c3a24 RGW: Make RADOS handles in RGW to be a configurable option
+3b60f5f TestPGLog.cc: fix -Wsign-compare
+b1dd699 TestErasureCodeShec.cc: fix -Wsign-compare
+c77ee13 test_async_driver.cc: fix UNINIT
+655c12a psim.cc: change initialzation of VLAs
+c3b2e87 systest_runnable.h: mark copy ctor as explicit
+d508510 common/Thread.h: mark copy/move ctor as explicit
+dfe9f1d cls_rbd.cc: reduce scope of local variable
+dc50f43 ReplicatedBackend.cc: init 'Message *commit' to NULL
+d36f665 osd/PG.h: mark copy ctor explicit
+ed8c284 msg/Dispatcher.h: make copy/move ctor explicit
+cebeac9 test_shared_cache.cc: reduce scope of variable
+feb989c rgw_rest_user.cc: UserQuotas init vars in ctor init list
+8077b52 client/Client.cc: fix uninit variables
+53c9dd4 libcephfs.h: fix unnecessary forward declarations
+4a1bdc7 os/FileStore.cc: fix format qualifier in COMMIT_SNAP_ITEM
+35f61a3 mds/MDCache.cc: fix potential null pointer dereference
+3ed5764 test/../chain_xattr.cc: fix suspicious usage of 'sizeof'
+6a69e90 TestErasureCodeShec: fix some -Wsign-compare
+0074e09 rgw_rest_log.h: remove http_ret from some RGWOp_*_list classes
+e4df034 qa : misc fixes to stabilize test-erasure-code suite
+ba57fb6 ceph-disk: set '--cluster=ceph' in 'prepare' subcommand
+e97111d doc: Changes in region hostname do not seem to take effect until radosgw is restarted.
+b55f3db rgw: Document a lurking problem I see with subdomain/domain splitting
+044060f rgw: Speed up the building of valid de-duplicated hostnames for regions.
+f7311ec doc: Document region hostnames settings.
+d792bd6 tests: verify fast diff support in rebuild object map
+aedcce7 librados_test_stub: add list_snaps implementation for ObjectReadOperation
+6bd2284 librbd: rebuild object map now supports fast diff
+4a6deb3 doc: Corrected the steps of Apache configuration for Debian-based distros in object gateway installation guide.
+882b370 tests: verify RBD flags are updated when enabling/disabling features
+718faf0 mon: make note about osd df calc bug
+f3ce75c mon: fix min variance calc in 'osd df'
+5cc92bb rgw: shouldn't return content-type: application/xml if content length is 0
+46b19cb librbd: update image flags when enabling/disabling features
+9eb1e59 librbd: add new RBD_FLAG_FAST_DIFF_INVALID flag
+ef21647 Move ceph-dencoder build to client
+c2b3a35 Rework mds/Makefile.am to support a dencoder client build
+0b26433 rgw/Makefile.am: Populate DENCODER_SOURCES properly
+fb11c74 Dencoder should never be built with tcmalloc
+570ff6d9 librdb: add perf counters descriptions
+e1f7899 doc: Fix misleading overlay settting in Cache Tier
+b00b821 osd: Break the handling of PG removal event into multiple iterations
+df47e1f client: fix uninline data funtion
+5559462 ceph_json: add decode / encoder for multimap
+4e6a66b cls_rgw: use multimap to keep pending operations in bucket index
+edc0627 rgw: generate new tag for object when setting object attrs
+e5befcd tests: librbd DiffIterateStress now tests object map merging
+b3dab78 tests: add RBD fast diff feature to unit tests
+10f444b pybind: add new diff_iterate features to RBD python bindings
+883169f librbd: snap_remove proxy requests require fast diff feature
+dd82561 tests: relax diff_iterate discard test
+7bffb6d librbd: implement fast diff algorithm
+c26f58c librbd: integrate with new object_map_snap_add/remove cls methods
+3e7a342 tests: correct update_features test for FAST_DIFF feature
+c396918 librbd: ensure fast diff feature relies on object map
+3a239b0 tests: test new RBD fast diff feature
+ed2a1fc librbd: move object map snapshot handling to ObjectMap class
+ff5b849 tests: new test cases for cls_rbd
+d6db277 cls_rbd_client: added object_map_snap_add and object_map_snap_remove methods
+11fd7a9 cls_rbd: added object_map_snap_add and object_map_snap_remove methods
+4ac584c librbd: move object map codes to common location
+11cc7b2 rbd: add support for new fast diff feature
+16ad44c librbd: add new FAST_DIFF feature
+4c1b4a8 tests: add unit tests for proxied snap_remove operation
+63324cc librbd: add snap_remove to image watcher RPC
+639e0c1 packaging: ship systemd/ceph.tmpfiles.d in tarballs
+27ed729 Check that delta_sum.stats.sum.num_object_copies and delta_sum.stats.sum.num_object are greater than zero
+4b23926 mon: remove dead MonitorStore code
+6d22065 doc/release-notes: v0.94.1
+eed80f0 rbd: add new --object-extents option to diff / export-diff
+b3e8a90 tests: add unit test for new diff_iterate2 librbd API method
+6d5b969 librbd: add diff_iterate2 to API
+0d1e770 mon: osd create: add optional 'id' parameter
+190ec03 AsyncConnection: Discard all prefetch buffer when replacing
+464999e tests: display ceph report when stuck
+3ab13e6 install-deps.sh: Debian GNU/Linux wheezy needs backports
+56039b3 do_autogen.sh: add missing '--without-lttng' option to usage
+06f30fc librbd: notify of header update after rebuilding object map
+58be49e librbd: rebuilding object map shouldn't update piecemeal
+3ec68db tests: add test case for cls_rbd object_map_save
+24c923e cls_rbd_client: add object_map_save helper method
+e5adbaa cls_rbd: add object_map_save method
+8ca6220 tests: add ImageWatcher test for new rebuild object map request
+852592a tests: added rebuild_object_map test to test_librbd
+7ba4f10 tests: librados_test_stub reads should deep-copy
+e714800 librbd: allow snapshot object maps to be updated
+92a4256 librbd: update in-memory object map before on-disk update completes
+1b7f8c1 cls_rbd: treat zero-byte object maps as missing
+6ce79ab rbd: add object map rebuild command
+cc3890e librbd: add rebuild_object_map to public API
+844136d librados_test_stub: add another overload of aio_operate
+7ae1b14 librbd: connect async rebuild object map to state machine
+2db758c librbd: require callers to ObjectMap::aio_update to acquire lock
+c0cd382 librbd: added RebuildObjectMapRequest state machine
+1aa801a librbd: correct basic object map errors during refresh
+18fd6ca librbd: use generic helper for issuing async requests
+df55d64 librbd: connect ImageWatch rebuild request to async rebuild method
+d49788b librbd: add preliminary methods for rebuilding object map
+1479161 librbd: add hooks for rebuild object map to ImageWatcher
+a8f828b librbd: add new notify payload for rebuild object map
+99f5a7d librbd: failure to update the object map should always return success
+76fe8d7 tests: librados_test_stub reads should deep-copy
+38b35ab crush: fix has_v4_buckets()
+fc2e511 librbd: ImageWatcher should cancel in-flight ops on watch error
+77788e3 tests: add Dockerfile for fedora 21
+9524957 release-notes: draft v0.94.1 release notes
+5f1259c librbd: Fix image_watcher test
+dcf2b3b Librbd: Don't affect global md_config_t
+16d6616 Librbd: Don't apply to global configserver
+12f5645 osd : populate the needs_recovery_map source fast when only one peer has missing
+15cc83b librbd/AsyncResize: avoid dup incrementing refresh seq
+40aa1a2 monitor: If there is no pg, don't print "too few PGs per OSD" for 'ceph -s'
+4887277 qa/workunits/snaps: add snapshot rename test
+47c2cec mds: update snaptable when renaming snapshot
+f311ac1 mds: rename snapshot support
+d6e2341 crush: fix dump of has_v4_buckets
+1333b67 TestPGLog: fix invalid proc_replica_log test caes
+1e5b220 TestPGLog: fix noop log proc_replica_log test case
+b61e5ae TestPGLog: add test for 11358
+6561e0d PGLog::proc_replica_log: handle split out overlapping entries
+64d1e90 crush/mapper: fix divide-by-0 in straw2
+9914a73 qa/workunits/rbd/copy.sh: removed deprecated --new-format option
+3b95edb tests: ensure old-format RBD tests still work
+808e4a8 rbd: deprecate --new-format command-line option
+4b55af5 common/config_opts: updated rbd_default_format
+0344910 librados_test_stub: added rados_ioctx_cct implementation
+aa88364 ceph.spec.in: set _with_systemd on RHEL 7 and Fedora
+52235e3 qa/workunits/post-file.sh: use /etc/default
+b09b458 doc: when and why publish a point release
+ea6b9a3 doc: add Hammer to the release timeline
+899a0e1 Librbd: aware_metadata_confs->apply_metadata_confs
+31070a5 Librbd: Use boost:assign to init aware_confs
+ea2edf3 Event: process event before pop it to avoid releasing
+e3d62a9 common: make rados bench return correctly errno.
+4ed66a6 librados: For WRITEFULL, check data length don't larger that limit.
+88f5ef4 osd: Fix wrong usage for "ceph tell osd.* debug dump_missing"
+22834ca rgw: add perf counters descriptions
+8e81b0d doc: an erasure code crush ruleset can be dynamically modified
+0498b6a mds: add perf counters descriptions
+8ff8c57 doc/release-notes: note about SHEC
+1cc0181 doc: Corrects rgw.conf file path for Debian-based and RPM-based distros in radosgw man page.
+fb51175 TestCase: Change in testcase output
+b15f6d0  Fix to some of the command line parsing (including rbd)
+b0172d8 rbd: create command throws inappropriate error messages
+d1cb94f RBD: update expunge set for latest test, parameterize test script
+567a7ef RBD: build prerequisites for latest xfstests and update test configuration
+600f2cc README: rm references to old Ubuntu distros
+9fed564 librados: Add config observer for Objecter instance.
+69d680f doc/release-notes: spelling
+6930376 doc/release-notes: correction
+6e20ed6 librbd: moved snap_create header update notification to initiator
+738af38 doc/release-notes: final hammer notes
+2b3dd1b osd: include newlines in scrub errors
+d23766b osd: fix condition for loggin scrub errors
+b597db5 osd: fix fallback logic; move into be_select_auth_object
+a476d8f osd: log a scrub error when we can't pick an auth object
+f581fec osd: repair record digest if all replicas match but do not match
+cf349ff osd: move recorded vs on disk digest warning into be_compare_scrubmaps
+e34d31b osd: be slightly paranoid about value of okseed
+546d1c7 osd: be precise about "known" vs "best guess"
+c39e0e5 osd: record digest if object is clean (vs entire scrub chunk)
+1cf27ae doc/release-notes: make a note about rgw deployment; tweak dedication
+e61c4f0 0.94
+4651597 librbd: simplify AioRequest constructor parameters
+1926bb9 man: enable warning msgs for "all" target
+251ae61 doc: ref 9/ceph-mon using relative path
+a0bdf69 doc: add ":orphan:" field to avoid sphinx warnings
+6dd5704 man: ignore fieldlist when getting desc for manpages
+b532cb3 doc: fix sphinx warnings
+c5f3d4d doc: fix sphinx warnings
+707a2f7 man: appease sphinx by providing a toc doc
+c7effab man: point man/conf.py and Makefile to doc/man
+0e07ccc man: move man/8/*.rst back to doc/man/8
+d8911b4 doc: fix a typo which use "::" for ":"
+1837eb4 doc: fix "Title underline too short" warning
+e8ad0b5 doc: fix sphinx warnings
+2924d3c doc: remove duplicated option doc
+29073d8 rgw: send Content-Length in response for HEAD on Swift account.
+d260a93 rgw: send Content-Length in response for DELETE on Swift container.
+10c1f1a rgw: send Content-Length in response for PUT on Swift container.
+5a64fb5 rgw: send Content-Length in response for GET on Swift container.
+50cf743 rgw: add support for end_marker for GET on Swift container.
+509437f tests: remove dead code in docker-test-helper.sh
+def9862 tests: allow multiple users of docker-tests.sh
+214c8b3 rgw: improve code formatting ONLY.
+d6da80e doc: show href anchor in white in important block
+e7724a1 rgw: send X-Copied-From-Last-Modified header of Swift API.
+6a76b9d mon: add perf counters description
+21b9377 spec.in: sphinx -b man needs sphinx > 1.0 (part 2)
+dfdc7af rgw: remove meta file after deleting bucket The meta file is deleted only if the bucket meta data is not synced
+168881b doc: Replaced with apt word and fixed grammatical error
+e76f84e rgw: quota not respected in POST object
+b47a549 librbd: don't do readahead when m_readahead_pos reaching limit
+04fe269 librbd: start readahead from m_last_pos when the size of the continuing triggering request is big enough
 4347a32 debian: remove lttng checking from rules
-57d2781 (origin/wip-11113-hammer) librbd: snap_remove should ignore -ENOENT errors
+a294daf debian: remove lttng checking from rules
+efb1a2d Update PendingReleaseNotes for firefly change, rgw socket path related
+37b1996 librbd: remove object maps when disabling
+5bc3a45 librbd: flag the snapshot object map as invalid when error occurs
+3a7b28d rbd: add feature enable/disable support
+d1348e5 tests: add update_features test to test_librbd
+28557c3 tests: add update_features test for pybind
+82affca pybind: add update_features to rbd.py
+7cff359 librbd: add update_features to librbd API
+34c347b tests: add test case for cls_rbd set_features
+71b4c12 cls_rbd_client: add set_features helper method
+71dcee7 cls_rbd: add set_features class method
+2e26309 librbd: remove usage of snapshot features
+59b6801 tests: update cls_rbd_client tests for get_features changes
+58d59d6 cls_rbd_client: snapshot_list no longer retrieves features
+281f87f cls_rbd: get_features on snapshots returns HEAD image features
+8f4551c doc/release-notes: 0.67.12 are draft notes
+c0c55c9 doc: add daemon and daemonperf commands description to man
+9839584 ceph daemonperf: add watch interval and count parameters
+8881441 ceph daemonperf: make error message less confusing
+54c8085 doc: Corrects some content for Debian-based distros in simple gateway configuration file.
+106d006 tests: add ceph-disk.sh activate for memstore
+4601e10 tests: move rados put/get tests in a function
+8db870e tests: ceph-disk.sh dmcrypt tests use test_setup_dev_and_run
+8c6d209 tests: merge ceph-disk.sh activate dmcrypt functions
+282ccce tests: add test_setup_dev_and_run to ceph-disk.sh
+c583b57 tests: improve ceph-disk.sh setup/teardown
+eaf8966 tests: do not use -e in ceph-disk.sh
+28269d7 tests: reduce ceph-disk.sh verbosity
+123230c tests: docker-test must bind mount .ccache
+3fa859a osd: kill ager
+b1ab103 osd: remove osd_auto_weight function
+46103b2 ceph_argparse: don't die when called by injectargs
+c579ebf common: ceph_argparse_witharg: pass oss by reference
+0184a40 rgw: add authorization for setting Swift account metadata.
+84d746b rgw: split and refactor RGWPutMetadata classes.
+80570e7 rgw: print TempURL metadata in response for GET/HEAD on Swift account.
+e5a44bb rgw: add support for removing metadata on Swift account.
+f2cc530 rgw: add support for reading metadata on Swift account.
+17536e6 rgw: add support for putting metadata into Swift account.
+366e8a8 rgw : Issue AIO for next chunk first before flush the (cached) data.
+fa359ae client: remove useless perf counters
+66aa905 client: add perf counters description
+2d6e069 os: add perf counters description
+43fe246 common: add perf counters descriptions
+10b882b rgw: implement base for inter-region account metadata.
+8c8ea8a osdc: perf counters description added
+b235a42 Librbd: Discard all global config used for image
+e39070f man: using sphinx-1.0-build if no sphinx-build
+a8eab36 spec.in: sphinx -b man needs sphinx > 1.0
+a3cf004 man: add conf.py to the list of distributed files
+0b20c6b mailmap: Ian Kelling affiliation
+50f4495 mailmap: Vartika Rai affiliation
+0f94587 mailmap: Alexandre Marangone affiliation
+28a0e68 mailmap: Steve Capper affiliation
+3160b39 mailmap: Simon Guinot affiliation
+30ba7f2 mailmap: Yazen Ghannam affiliation
+3d5dae3 mailmap: Lee Revell affiliation
+3a347fd mailmap: Zhi (David) Zhang affiliation And name normalization
+6d534fc mailmap: Bosse Klykken affiliation
+24865de mailmap: Dmitry Yatsushkevich affiliation
+b8da274 mailmap: Greg Farnum affiliation
+ca23e6f mailmap: Boris Ranto affiliation
+81dca50 doc: Fix .gitignore for man directory
+d51b8b3 doc: Add ceph osd pool get <poolname> all to man page
+070cadf mon: Add MIN_READ_RECENCY_FOR_PROMOTE to ONLY_TIER_CHOICES
+f89b60f doc: Updates the radosgw man page with some changes.
+cdd1ed3 doc: Updates simple gateway configuration file with some changes.
+9c00592 doc: Updates gateway installation doc with some changes.
+d198d69 doc: Updates the configuration of mod_proxy_fcgi in terms of localhost tcp and unix domain socket.
+b13e58a doc: Corrects syntax highlighting in the simple gateway configuration doc.
+b1fa7f3 doc: Corrects syntax highlighting in gateway installation doc.
+9d2bc39 doc: Updates the simple gateway configuration doc with configuration for mod_proxy_fcgi instead of mod_fastcgi.
+b09eb13 doc: Replaces reference to mod_fastcgi with mod_proxy_fcgi in gateway installation doc.
+607994d check mdr->ls before trying to write to it
+f8f4e37 doc: Updates the configuration of mod_proxy_fcgi in terms of localhost tcp and unix domain socket.
+46bdf62 doc: Corrects syntax highlighting in the simple gateway configuration doc.
+ef4f2e5 doc: Corrects syntax highlighting in gateway installation doc.
+2d131c1 doc: Updates the simple gateway configuration doc with configuration for mod_proxy_fcgi instead of mod_fastcgi.
+be8cb2d doc: Replaces reference to mod_fastcgi with mod_proxy_fcgi in gateway installation doc.
+6dd7fea doc/release-notes: final dumpling v0.67.12 release notes
+0b8ec6b Scrub: when delete pg, call clear_scrub_reserved().
+137a2d5 osd/PG: Remove the unuseful  judgement.
+a3b000d osd: increment the dirty perf counter when cloning a dirty object in make_writeable
+a0cd70b osd: remove unnecessary code in make_writeable
+6150757 ReplicatedPG::finish_promote: handle results->snaps is empty case
+a45a698 ReplicatedPG::finish_promote: fix snap promote head snaps
+57d2781 librbd: snap_remove should ignore -ENOENT errors
 572a2f5 librbd: get_parent_info should protect against invalid parent
+aa7f0b3 mds: batch up writes in SessionMap::save_if_dirty
+e7812b8 Add rbd_skip_partial_discard flag
+6055df3 Librbd: Make image aware of more configs
+52fd9c6 ceph-disk: follow ceph-osd hints when setting up journal
+542820d ceph-osd: add --check-wants-journal, --check-allows-journal
+0c29343 ceph-osd: fix usage
+81df129 mds: update Makefile for removed script
+aa77a46 ceph_argparse: generalize ceph_argparse_with* routines
+b491c82 rbd cli: remove erroneous arg for ceph_argparse_witharg
+9859cae tests: exercise all RBD features during unit test
+c4c646c make: fix with_rbd guard
+18edc50 make: add with_rbd guard
+7e49804 mds: remove verify-mds-journal.sh script
 0b2e272 ReplicatedPG::cancel_pull: requeue waiters as well
 23efab7 ReplicatedPG: don't write hitset while scrubbing, update scrub_cstat
 85307b9 ReplicatedPG: in do_op, requeue op if we requeue waiters in agent_choose_mode
 b1f078b ReplicatedPG: requeue waiting_for_active before waiting_for_cache_not_full
+d5d6468 doc: fix the architecture diagram in cephfs.rst
+3a5f9c3 rgw: rectify broken statistics during Swift account listing.
 b0a3941 ReplicatedPG::promote_object: do not create obc if not promoting
 bdc664f ECTransaction: write out the hinfo key on touch as well
+486509a mds: make sure lock state not stay in XLOCK/XLOCKDONE indefinitely
+4c122c1 Set disableDataSync to false
+0bd767f Update RocksDB configuration to make it more clear
+febb5a4 librados: define C++ global flags from the C constants
+b2b4369 test: add librados global op flags test
+c7de236 os/KeyValueDB: skip experimental check for test_init
+7e5b81b Revert "librados: remove the unused flags."
 8e5d4c6 osd: drop unused utime_t now arg to issue_repop
 8db4056 osd: do not update mtime when recording digest
+a4c01f3 mark kinetic experimental
+002b7fd mark rocksdb experimental
+1b0b598 mds: persist completed_requests reliably
+c4d8e65 Librbd: Add existing rbd configs to aware table
+cf715bd Librbd: Add tests for aware metadata config
+ccdeaf8 mds: fix out-of-order messages
+364e15b Librbd: Add basic metadata aware method
+59aa670 erasure-code: Update ISA-L to 2.13
+ad15f7d osdc/Striper.cc fix stripe_count == 1 && stripe_unit != object_size
+eaf6e0c Always provide summary for non-healthy cluster.
+c6f1c07 Conditional-compile against minimal tcmalloc.
+aed3434 java: libcephfs_jni.so is in /usr/lib64 on rhel
+f5a95dc java: dump extra info for ftruncate assertion
 491474f test: Add testing for PGLog::filter_log()
 1fcd3fb osd: Create a filter_log for PGLog
 c751191 ceph-objectstore-tool, osd: Filter the pg_log_t for objects no longer in pg
-ccc0839 (origin/wip-11177) common: send cluster log messages to 'cluster' channel by default
+ccc0839 common: send cluster log messages to 'cluster' channel by default
+b489f94 mds: separate MDLog::safe_pos from journaler
+e08bccf vstart.sh: set PATH to include pwd
+9d1391e tests: extend mon_crushmap_validation test
+f4398d2 Fixed the ceph get mdsmap assertion.
+fc9426a xlist: add compare operator for iterator
+21699fc mon: osd setcrushmap: use own timeout implementation
+bc8c2b7 common: SubProcess: timeout support
+ec02441 crush: CrushTester::test_with_crushtool: use SubProcess to spawn crushtool
+5388521 common: SubProcess: helper class to spawn subprocess
+d789f44 mds: properly remove inode after purging stray
+ea32960 cls_rbd: fix read past end of bufferlist c_str() in debug log msg
 f9b98c9 ceph-objectstore-tool: Fix message and make it debug only to stderr
 923d532 ceph-objectstore-tool: Remove bogus comment and eliminate a debug message
-c176ebf (origin/wip-move-code) osd/: Move ReplicatedBackend methods into ReplicatedBackend.cc
+d6acc6a Doc: Incomplete example in erasure-coded-pool.rst
+90c38b5 rocksdb: fix 32-bit build
+ddad2d4 Makefile-rocksdb.am: update for latest rocks
+c176ebf osd/: Move ReplicatedBackend methods into ReplicatedBackend.cc
 e9d6096 ReplicatedPG: remove unused C_OnPushCommit
+6413209 mds: include damaged in MDSMap::dump
+3b2a091 mds: update peer failure response to account for damaged
+f311bb7 mds: add get_down_mds_set method
+7eccf92 mds: account for 'damaged' in MDSMap::is_degraded
+aae265b mds: clarify MDBalancer::send_heartbeat
+c7d5c02 test_libcephfs: do cleanup in ReleaseMounted test
+9418e43 test: add compile-command for test_ceph_daemon.py
+aa6a46c common: print hexadecimal for ghobject_t.generation & shard_id
+4d6b9d1 python-rados: extract type-checking into a decorator
+b9e89fc mon/OSDMonitor: remove trailing whitespaces
+e517612 ReplicatedPG: fix a signed/unsigned comparison warning
+c5cf81d ceph.spec.in: fix _with_systemd conditional
+0c6bd27 client: Hold on to exclusive caps on directories we "own"
+edf64dd osd: do not double-write log entries
+34c7d2c osd: combine info and log writes into single omap_setkeys
+b486e58 osd: pass map to write_info instead of txn
 ddf0292 PG: set/clear CREATING in Primary state entry/exit
-6f218b1 (origin/hammer-11205) qa/workunits/fs/misc: fix filelock_interrupt.py
+6e6771e ceph-disk: add test files to EXTRA_DIST
+b301982 ceph-disk: remove double import
+0f267c1 ceph-disk: create initial structure for tox/unit tests
+2b37d12 osd: more useful message to find out potential unhealth osd
+89fd137 client: conclude -ENOENT when there is null dentry
+ba62027 client: don't clear COMPLETE flag when trimming null dentry
+73e3358 mds: make sure readdir reply include Fs cap for directory
+0c396f8 test: test_lost.sh: update tests
+255dd64 osd: add tests for 'pg mark_unfound_lost' command after osd lost
+5bb5132 osd: fix PG::all_unfound_are_queried_or_lost for non-existent osds
+e6bd722 test: test_common.sh: start_recovery: don't use deprecated command
+5019eae test: test_common.sh: stop_osd: wait for osd termination before return
+560a583 osd/ReplicatedPG: don't check order in finish_proxy_read
+6f218b1 qa/workunits/fs/misc: fix filelock_interrupt.py
+2d62776 mds: drop replayed requests when seesion is closed
+8f91547 mds: fix crash when killing busy session
 1388d6b ReplicatedPG: trim backfill intervals based on peer's last_backfill_started
-924ace8 (origin/wip-11145) rgw: shut down timer before erroring out
+c52b75e rgw: dump object metadata in response for COPY request of Swift API.
+ccf6eaa rgw: refactor dumping metadata of Swift objects.
+94f1375 rgw: add support for X-Copied-From{-Account} headers of Swift API.
+0f92f34 release-notes.rst: update for degraded writes revert
+e1ca446 doc: no longer call out ceph-deploy as new
+ab1740d rocksdb: update to newer version
+c79128a Update RocksDBStore to match new RockDB config API
+b99c714 install-deps: support OpenSUSE
+74d23b6 tests: add OpenSUSE 13.2 Dockerfile
+bdac3dc ceph.spec: update OpenSUSE BuildRequires
+4bd2bd6 librados: remove the unused flags.
+791c387 bug fix: test case for lfn index
+2165d05 test: add test case for ping_monitor
+c3c6090 ceph.in: add ceph ping mon.* for ping all monitor
+f5f4832 doc: remove generated man pages
+301fe6c doc: generate man pages in man/Makefile
+0d35e01 osd: avoid inserting an op into hit set multiple times
+79e1d15 osd/Replicated: Add proper fadvise flags for ops in do_proxy_read.
+ad1fd6e osd/ReplicatedPG: For flush object, set the src object with fadvise sequential/nocache.
+2f61772 osd/ReplicatedPG: using fadvise_dontneed as base tier object fadvise flags when flush object from cache tier to base tier.
+7dcd2ae osd/ReplicatedPG: For promote object, add correctly fadvise flags for copy_get
+c50255f doc/release-notes: draft hammer release notes
+7f03c88 be gender neutral
+924ace8 rgw: shut down timer before erroring out
+7413cd7 Declare libradosstriper library dependencies
 65bb4df ReplicatedPG::promote_object: check scrubber and block if necessary
+8471f92 doc: Fixes spelling and grammatical errors
+005d1f8 doc: Correct git push instructions adding branch name
+1cca0c1 rgw: init script waits until the radosgw stops
+39d0d53 doc: Clarify the requirement that the cinder user's key must be present on the compute nodes
 78c5de7 osd/: s/is_degraded_object/is_degraded_or_backfilling_object
 4a5bd05 Revert "osd/: update peer_missing and local missing if we write a degraded object"
 6ed86b4 Revert "append_log: use remove_snap_mapped_object"
+3944264 obj_bencher: add IOPS metric calculation
+ddb422f obj_bencher: cosmetic display fixes
+e360bfd obj_bencher: generalize vec_stddev function
+069d95e obj_bencher: fix indents
 b84943c Revert "osd/: don't block writes on degraded objects"
 9cefc59 Revert "ReplicatedPG: block writes on degraded objects for ec pools"
 45bff59 Revert "Merge pull request #3641 from athanatos/wip-10731"
 6f23d34 Revert "osd/: s/is_degraded_object/is_degraded_or_backfilling_object"
-fc3ce48 (origin/wip-hammer-rgw) rgw: update makefile to enable civetweb config
+fc3ce48 rgw: update makefile to enable civetweb config
 2f1342e civetweb: update submodule
 e5f3282 rgw: pass civetweb configurables to civetweb
 a8ced2c rgw: don't overwrite bucket / object owner when setting acls
+ece49d1 mds: handle read/replay errors in MDLog with damaged()
+73b591a mds: call damaged() on what were assertions in replay
+6bf1ce7 osdc/Journaler: improved error handling
+2b4d96b mds: catch exceptions in ::decode
+9897369 mds: handle encoding errors on MDSTable load
 3edfa67 Revert "ReplicatedPG: only allow a degraded write if we have at least min_size copies"
 5e4b7b0 Revert "Merge pull request #3911 from athanatos/wip-11057"
+385fe4b rgw: send ETag, Last-Modified in response for copying Swift cobject.
+3e3d954 doc: Adds updated radosgw-admin man page under man/
+e62ff4d doc: Updates radosgw-admin man page.
+923ab6e doc: Adds updated radosgw man page under man/
+e5bc64b doc: Updates radosgw man page.
+db80e45 doc: Adds updated radosgw man page under man/
+7a1984d doc: Updates radosgw man page with configuration for mod_proxy_fcgi .
+777fd88 mds: call damaged() on errors loading SessionMap
+573d94f mon/MDSMonitor: add "mds repaired"
+658d991 mds: emit DAMAGED on MDSTable load error
+c9ff8b4 mds: add damaged() method
+45638cb mds: implement Beacon::send_and_wait
+57e3429 mds: report damaged ranks in health
+42f46b7 mon/MDSMonitor: handle DAMAGED in beacon
+2b4f344 mds: report damaged in MDSMap::print_summary
+d05fe08 mds/MDSMap: update print() for DAMAGED
+9c5af9f mds: add `damaged` set to MDSMap
+d8fa553 mds: introduce DAMAGED state
+699bf38 mds: remove unused Beacon::last_send
+8a813bd mds: remove a spurious #if 1
+0cf1c7a mds: be fairer in enqueuing purges
+5982bbe osd: add two fileds src/dest_obj_fadvise_flags in struct CopyOp.
+23b59f3 librados: add src_fadvise_flags in ceph_osd_op::copy_from
+d9a2ca5 osd: Add func has_flag in MOSDOp.
+c782656 doc: update .rst file with their .8 counterparts
+d7cbf91 osd/ReplicatedPG: For CEPH_OSD_OP_ASSERT_VER, it should use op.assert_ver rather than op.watch.
+27120d7 rbd: move the detection of destination image ealier in rename op
+ada7ec8 test: potential memory leak in FlushAioPP
+865b1e6 rbd: correct the return code and perf counter in discard op
+40c2662 rbd: log perf counters for the read op
+2e69235 rbd: correct the name of mylock in lbirbd::read
 0a0d8f6 doc: Regenerate man/ceph.8 based on ceph.rst changes
 eb890b1 doc: Break ceph osd pool get into sections based on pool type
+3cffbbe osd: return fast if PG::deleting is true in snap_trimmer and PG::scrub
+cd11daa Fix ceph_test_async_driver failed
+c7702bf osd/Replicated: For CEPH_OSD_OP_WRITE, set data digest.
+f6d76f9 osd/ReplicatedPG: Set data/omap digest after promote object.
+ee8c50b osd/ReplicatedPG: cleanup code in ReplicatedPG::process_copy_chunk.
+8f80ae3 osd: write journal header by force when journal write close
 b6512eb erasure code: add shec's documentation / change default layout
+a00cb31 rgw: improve metadata handling on copy operation of Swift API.
+5f7a838 doc: what does it mean for a release to be supported
 175aff8 ceph-objectstore-tool: Use exit status 11 for incompatible import attempt
-68719f5 (origin/wip-omap-clear) osd: fix omap digest clearing for omap write ops
+68719f5 osd: fix omap digest clearing for omap write ops
 d5b3bd7 os/MemStore: make omap_clear zap the omap header too
 d2467e4 ceph_test_rados_api_aio: verify omap_clear clears header, too
 5b23f5b ceph-objectstore-tool: Output only unsupported features when incomatible
-477ac92 (origin/wip-11102) osd: only complain about stored vs actual digest if all peers support it
+cf05817 auth: use crypto_init_mutex to protect NSS_Shutdown()
+ab647bd doc: Add RGW quick start info
+fe7cdb3 mailmap: Wuxingyi affiliation
+a9860f5 doc: remove reference to old ceph-deploy steps
+887cee9 mailmap: Rohan Mars affiliation
+0df83f6 mailmap: Javier Guerra affiliation
+a9ccd55 mailmap: Thomas Johnson affiliation
+336957c mailmap: Alfredo Deza affiliation
+5f8a0d6 mailmap: Vicente Cheng affiliation
+ec0ad93 mailmap: Fix Xan Peng affiliation
+e723116 mailmap: Zhicheng Wei affiliation.
+1bc0960 mailmap: Pete Zaitcev affiliation
+eb4f100 mailmap: Alexis Normand affiliation
+341218c mailmap: Florian Marsylle affiliation
+698905c mailmap: Robin Dehu affiliation
+035ab3c mds: call wait_for_safe twice in command_flush_journal
+4c24d0c auth: reinitialize NSS modules after fork()
+f183cd7 auth: properly return error of cephx_calc_client_server_challenge
+bee7f24 Minor fix: Documentation referred to bootstrap-osd instead of bootstrap-mds
+c087025 mds: improved doxygen comments in StrayManager
+0ae624e mds: update StrayManager op limit on mds_max_purge_ops_per_pg
+3a0caf9 mds: fix parent check in MDCache::eval_remote
+91867b2 mds: make purge op limit dynamic based on PGs/MDSs
+2f1e10c mds: throttle purge stray operations
+de157d9 common/config_opts: add purge throttle options
+f180b64 osdc: make op count in Filer::purge configurable
+763e26c mds: eval stray on remove dentry replica
+28a1016 mds: expire other ranks' mydirs during 'stopping'
+a01e9b1 mds/Server: fix stray reintegration
+9999238 erasure-code: make ErasureCodeIsaTableCache drop entries according to LRU
+9e95952 doc: s/Asphyxiate/Breathe/ in documenting.rst
+e69e850 doc: enable doxygen for enum
+cd69ded doc: Switch doxygen integration back to breathe
+b57e13f test/libcephfs/flock.cc: don't release lock before EWOULDBLOCK check
+ce244ca test/libcephfs/flock.cc: fix synchronization points
+b0c30f6 Thread.cc: Make set_affinity private and correct behavior
+b5c7450 osd/ReplicatedPG: correct the checking if the promoting object is in other hit sets
+537e88e qa/cephtool: Using add-cache rather than add to test ceph health.
+def4fc4 osdc: add epoch_t last_force_resend in Op/LingerOp.
+b509bc7 doc/release-notes: more hammer release notes
+bf814d5 doc/release-notes: make hammer note about filestore format upgrade
+477ac92 osd: only complain about stored vs actual digest if all peers support it
 9a2ff34 PG::find_best_info: reject infos with old last_epoch_started
+936e003 mds: implement 'dump cache' asok command
+abb4a73 mds: use MDSCacheObject::dump in CDir::dump
+107e7c9 mds: add CDentry::dump
+a33d169 mds: extended dump() for CInode
+9184e1d mds: add dump() to MDSCacheObject
+5ca0095 mds: add SimpleLock::dump
+75ff9d6 mds: add a dump() method to MutationImpl
+842a4c3 include/frag: add a dump() method to fragtree_t
+62ee84e mds: move auth_pins attrs into MDSCacheObject
 f96d58b init-radosgw*: don't require rgw_socket_path to be defined
 0712d8d PG: ensure that info.last_epoch_started only increases
 2956ae2 doc: add last_epoch_started.rst
+3998fe7 rgw: rectify 202 Accepted in response for PUT on existing bucket.
+640c235 mds: fix error_str formatting in scrub output
+d5c4420 XIO: Handle queued incoming XIO messages during retry
+cca067e configure.ac: add --disable-gitversion
+6823bcd init-radosgw*: don't require rgw_socket_path to be defined
+36d6eea rgw: don't use rgw_socket_path if frontend is configured
+830752a doc: fix doxygen warnings
+b6cdc56 osd/ReplicatedPG: Fix a memory leak in do_pg_op.
+64851f5 doc: consistent alternate cluster name arguments
+c5835ae mailmap: Xiong Yiliang affiliation
+41859dd mailmap: Raju Kurunkad affiliation
+867d883 mailmap: Gregory Meno affiliation
+634c0e6 mailmap: Gaurav Kumar Garg affiliation
+700973e mailmap: Yuri Weinstein affiliation
+2afe1e3 mailmap: Simon Guinot affiliation
+8118989 common, global: use lttng ust functions for handling fork-like calls
+d5d0aa3 rados.py: fix Object.write() method
 2da9584 PG: make sure to update history.last_epoch_started with first write
+e54d57d tests: remove unused variable
 0d52aca osd: erasure-code-profile incremental rm before set
 b92f9cf mon: informative message when erasure-code-profile set fails
-f5fa25d (origin/wip-rgw-bootstrap) ceph-create-keys: create bootstrap-rgw key
+2395ee1 mds: skip inode that is being puring when opening snap past parent
+0dfab5d mds: don't crash MDS when snapshot data not found
+1956e68 mds: pick head inode when caps follows is zero
+1bb4a9d mds: fix purging snapshotted stray inode
+7f2ddf5 mds: check snapset in journal_cow_dentry()
+7ae66c4 mds: fix fnode.snap_purged_thru check
+635d792 mds: purge stale snap data in CInode with snaprealm
+142a99d mds: fix purging stale snapshot dentries
+b364822 Librbd: Add "start" and "max" arguments to metadata_list
+9df3f79 rgw/logrotate.conf: Rename service name
+f5fa25d ceph-create-keys: create bootstrap-rgw key
 679e266 mon: add 'bootstrap-rgw' profile
-01379bd (origin/wip-refine-build-configuration-hammer) Compile test_build_librgw only if WITH_BUILD_TESTS
-1c20417 (origin/wip-11123) osd: use (robust) helper for setting exists or clearing whiteout
+be81637 doc/release-notes: v0.67.11 draft notes
+f20c22f mailmap: Sage Weil affiliation (fix for faulty 387887893f196ed434c5a06699dde58d1ea1e7bc)
+f8455a1 mailmap: Xavier Roche affiliation
+b3b23d5 mailmap: Matt Benjamin affiliation
+2ce1cf8 mailmap: Haomai Wang affiliation
+765d3c9 mailmap: Travis Rhoden affiliation
+80e89e1 mailmap: Andy Allan affiliation
+01379bd Compile test_build_librgw only if WITH_BUILD_TESTS
+6076233 crc32c: add aarch64 optimized crc32c implementation
+71a5090 ceph.spec.in: fix handling of /var/run/ceph
+1c20417 osd: use (robust) helper for setting exists or clearing whiteout
 f5a2aef PGLog::merge_log: in tail extend case, log.log might be empty
+87528c1 doc-rados-operations-crush-map: Swap Raid4 for Erasure
 88d66ca mon: Support multiple args to ceph pg dump_stuck as in usage and man pages
 a3dfeec Improve "ceph_argparse.py: add stderr note if nonrequired param is invalid"
+9f3f9ef xio: Add xio_transport_type
+8b8a6f1 xio: Update README.xio with dependencies
+3fce475 mds: remove double-define on iterator
+48f18ea librbd/AioRequest.h: fix UNINIT_CTOR
+928eaaa kv_flat_btree_async.h: fix some UNINIT_CTOR issues
+0b40e59 shec/shec.cc: fix uninitialized scalar variable (UNINIT)
+f526dde ErasureCodeShec.cc: fix uninitialized scalar variable (UNINIT)
+a894223 utime.h: fix OVERFLOW_BEFORE_WIDEN
+9a3a8a0 blkdev.cc: fix STRING_OVERFLOW
+e221463 test_async_driver.cc: fix NEGATIVE_RETURNS
+b4cfe73 cls/rgw/cls_rgw_types.cc: fix RESOURCE_LEAK
+d871889 osdc/Striper.cc: fix another OVERFLOW_BEFORE_WIDEN
+3703940 osdc/Striper.cc: fix OVERFLOW_BEFORE_WIDEN
+5951946 qa/cephtool: add ceph for 'ceph osd map pool object namespce'.
+4573f41 mon:make 'ceph osd map' accept namespace.
+c60cd5b doc/rados/operations/add-or-rm-mons: correcting minor typo
+cf80949 librbd: snap_remove should ignore -ENOENT errors
+21afd0e librbd: get_parent_info should protect against invalid parent
 bbe231a PGLog: split divergent priors as well
+d9ea168 Some sanitization work on .mailmap, .organizationmap, .peoplemap : Sorting , Duplicate removal
+6616294 rgw: update keystone cache with token info
 90a0393 PendingReleaseNotes: warn about lttng LD_PRELOAD for daemons
 53cc492 ceph_test_rados_tier: add test case for delete+create compound ops
+c0e6227 mds: give up replicas of a stopping mds's stuff
+d47e622 doc/rados/operations/add-or-rm-mons: revise doc a bit to be less confusing
+8a05092 debian: move /var/lib/ceph/mds to ceph-mds package
+353a325 ceph.spec.in: rm EOL Fedoras; add OBS RHEL5 instead
+703ba37 librbd: acquire cache_lock before refreshing parent
 93ef911 PG: add config to ignore history les in find_best_info
+b38c96f librados_test_stub: AIO operation callbacks should be via Finisher
+3d8c09c AsyncMessenger: Use Thread's method to support set affinity
+489bca1 fix a few minor compilation warnings
+5ba8764 fix doc/dev/network-protocol.rst typo
+d17e2e8 doc: mark the ascii schema as a literal block
+14c02d8 doc: release timeline
 d06c1d7 rhel 5.9 port fixes to compile librados only Signed-off-by: Rohan Mars <code at rohanmars.com>
+b70bd6d Thread: Support set_affinity to bind core
+9a22acc rgw: send Last-Modified header in response for PUT on Swift object.
 f4bc48d doc: Fix ceph command manpage to match ceph -h (hammer)
 9495de47 doc: Fix ceph command manpage to match ceph -h (firefly)
 5680456 PGBackend: do not rewrite ec object oi checksums
 e0bf132 PGBackend: add debug option to rewrite digest even if present
 3858d0b ReplicatedPG: finish_ctx: do not assume that the oi is for the head object
+338b44b packaging: include ceph_perf_objectstore
+3722972 Specify the actual struct used in protocol handshake
+b2781fb Add support for PPC architecture, provide fallback
+fd5df5c librbd: Make metadata_set support multi kv pairs
+978d7b4 librbd: Fix typo and prefix ++
+ba112d5 librbd: Make image metadata copied during image copy
+5cae66a cls_rbd: Fix incorrect assert for prefix metadata key
+5178c09 test_cli: update rbd client help description
+933702d librbd: Add tests for metadata get operation
+da9a057 librbd: Add metadata_get method to support single metadata key retrieve
+dfb3a01 librbd: Make metadata support clone operation
+7380632 rbd.cc: add metadata cli operations
+e926e90 test_librbd: Add librbd interface tests for metadata
+1260af3 test_cls_rbd: add metadata cls tests
+c983c3c librbd: Add librbd interface impl and tracing for metadata
+1b2904e librbd/internal: Add internal implementation for metatdata ops
+47d9cec include/librbd: Add metadata interface to librbd
+ba122b5 cls_rbd: Add cls rbd implementation for librbd metadata
+c1aa344 cls_rbd_client: Add cls rbd client interface to support rbd metadata
+aeca8fb doc: How to generate an object corpus : should be a subsection
+72a26eb doc: s/that will called every time/that will be called every time/
 aceb860 Build ceph-dencoder if server and mds for now
+38bc298 doc: extra \ in CEPH\_AUTH\_UNKNOWN
+10edd5d test_msgr: Add new inject test and add support for handle_reset
+2bb40f9 test_async_driver: Fix incorrect test behavior
+a2f62d3 AsyncConnection: Add SyntheticInjectTest for lossy connection
+f7a1fdb AsyncConnection: Lock existing's lock in advance avoid existing's state changed
+18c3587 AsyncConnnection: Make accept inline to avoid connection marked down
+3afa076 AsyncConnection: Skip _try_send if connection closed
+17955c1 test_msgr: Make each side can initialize connection when !policy.server
+c173cea mds: disable snapshot rstat temporarily
+b815cf6 mds: pin inode when openning snap parent
 c365cac rbd: regenerate rbd(8) man page
 416cbff doc: Fix typo in ceph pg dump_stuck usage in man page
-ba77bda (origin/wip-librbd-invalidate-object-map) librbd: invalidating snapshot object maps should set snapshot flag
+d4b8d90 test_librbd.cc: fix USE_AFTER_FREE
+ba77bda librbd: invalidating snapshot object maps should set snapshot flag
+1b722bb rgw: Swift API. Allows setting attributes with COPY object operation.
 ba4bb22 test_cls_rbd: verify set_flags updates snapshots
 fa25b84 cls_rbd_client: add snap_id param to set_flags
 2d86898 cls_rbd: set_flags can now update snapshots
+e6161b7 auth/Auth.h: mark constructor as explicit
+9921836 sync_filesystem.h: fix unreachable code
+6d9fbbe rbd: fix rw option skipping logic
+2769da4 rbd: recognize cephx_require_signatures and tcp_nodelay options
+84adff4 rbd: add rbd default map options config option
+cd761bb rbd: drop redefining map option warning
+356a749 rbd: regenerate rbd(8) man page
+816619c AsyncConnection: handle socket error ASAP
 c1892a0 rgw_main.cc: fix null-deref in case get_storage() fails
 f5b0151 rgw/rgw_op.cc: fix potential null-deref in strcmp()
 595e87f Fix XioLoopbackConnection Lifecycle.
+d8bf378 osd/ReplicatePG: correctly checking if an object is in hit set
+f07f394 os/FileStore: For getxattr, enlarge the value size avoid try again.
 e1eebb3 osd: refactor RepScrubWQ::_process()
 052debd osd: fix a msg leak when OSD is stopping
+f5735b2 Fix XioLoopbackConnection Lifecycle.
+76d4d4c Fix CMake build w/o RADOSGW.
+f5acf6b ceph-disk: more robust parted output parser
+faa7937 tools/rados/rados.cc: init some vars in constructor
+db175fb rgw_rados: don't assign value to parameter
+778e0ef src/msg/xio/*: reduce scope of some vars
+6f79e4a rgw_quota.cc: init variables in constructor
+16c9237 TestErasureCodeShec_thread.cc: reduce scope of var
+ff760be ObjectStoreTransactionBenchmark.cc: reduce scope of var
+828fc73 objectstore/store_test.cc: reduce scope of variable
+649f0f3 rgw_rados.cc: reduce scope of variable
+0a5e6af rgw_rados.cc: remove unused string vars
+e11f9d2 librbd/internal.cc: reduce scope of some variables
+0d78f44 librbd/ObjectMap.cc: reduce scope of variable, prevent redef
+54aa23a ErasureCodeShec::minimum_to_decode: reduce scope of variables
+de7bcfd rgw/rgw_op.cc: fix potential null-deref in strcmp()
+921c08f rgw_main.cc: fix null-deref in case get_storage() fails
+ab4e37f mds/MDS.cc: add missing asserts to check return values
+a3d0c80 TestErasureCodeShec_all.cc: fix sprintf specifier
+51bec73 rgw: use static_cast instead of c-style cast
+48d27ba osd/ReplicatedPG.cc: use static_cast instead of c-style cast
+5393f32 TestErasureCodeShec_thread.cc: use static_cast instead of c-style
+bfe5883 itest/msgr/test_msgr.cc: prefer ++operator for non-primitive iter
+af3a37f tests: ceph-disk.sh /dev/disk/by-partuuid and /dev/loop fixes
 3ae87c8 XIO: Handle requeue case of XIO messages
+dbb8c93 XIO: Handle requeue case of XIO messages
 5a8d4c5 qa,cephtool: add test case for 'ceph osd tier add-cache'
 83d0581 mon: Don't forget set read_tier/write_tier for 'osd tier add-cache'.
+af0986b cleanup in_prefix function
 23d7991 tests: add unit test for multiple concurrent resize ops
 194fcfd librbd: retrieve image size at start of resize op
 0212ae4 librbd: do not invalidate oversized object map
@@ -538,32 +3668,86 @@ e1eebb3 osd: refactor RepScrubWQ::_process()
 e7f9e3b librbd: use ImageCtx::get_parent_overlap helper function
 286b385 librbd: resize/flatten should validate image is R/W
 9dcd517 librbd: hold write snap_lock when refreshing object map
-ac527a2 (origin/wip-11079) crushtool: improve straw2 compile/decompile test
+5d5b510 doc/release-notes: v0.80.8 and .9 changelogs
+1660d86 rgw: fix handling empty metadata items on Swift container.
+799e05d os, osd: add perf counters description
+af89073 tests: ceph-disk paritition creation and partition table in use
+da22e22 tests: ceph-disk.sh test for second journal partition
+1e462a4 tests: ceph-disk.sh cosmetic changes and reduced verbosity
+064a05e tests: do not use --journal-dio=true
+dda58cd tests: ceph-disk.sh may use uuidgen without PATH
+fbb6df5 tests: ceph-disk.sh can be confused if there are two OSDs
+8d52dc1 tests: teardown on ceph-disk error
+e6dcaa8 rm old ceph-deploy reference
+ac527a2 crushtool: improve straw2 compile/decompile test
 6445d9e crush: fix crush_get_bucket_item_weight and bucket destroy for straw2
 b686eda crushtool: fix straw2 cli test
+75bf4be mds: fix assertion caused by system clock backwards
+830a4a1 tests: install-deps.sh install EPEL and RHEL Optional
 5b58ef1 osdc: fix a memory leak in C_TwoContexts
 e3616a5 doc: rhel6.5 is rhel6
-147b1db (origin/wip-11068-debian-jessie) deb: add zlib1g-dev to Build-Depends for Debian/jessie
+ea61921 mon/OSDMonitor: refactor and new key "all" for osd pool get command
+9b5d79e doc: rm Apache log permissions instructions
+a9c4eda doc/release-notes: update notes for v0.80.9
+76becb2 Event: Let external events ran without lock/unlock
+1c92cb6 AsyncConnection: Don't block process when throttle is full
+147b1db deb: add zlib1g-dev to Build-Depends for Debian/jessie
+aaabd57 doc: fix typo
+302b628 tests: increase the delay waiting for a cluster event
+2c7006f mailmap: Zhi (David) Zhang affiliation
+c327fbb doc: preparing a device implies activate
+35def5c librbd: Add a paramter:purge_on_error in ImageCtx::invalidate_cache().
+101440a librbd: Remvoe unused func ImageCtx::read_from_cache.
+9f80c29 osdc: clean up code in ObjectCacher::Object::map_write
+d7cf7ae osdc: Don't pass mutex into ObjectCacher::_wait_for_write.
+1a48a8a osdc: After write try merge bh.
+d582bda osdc: Make last missing bh to wake up the reader.
+2449fdd osdc: For trust_enoent is true, there is only one extent.
+0bedae3 osdc: In realease(Object *) make the error bh as clean and later remove.
+540346d osdc: In _readx() only no error can tidy read result.
+9748d45 autogen.sh: do not submodule --force if git does not support it
 84f05c0 TestAsyncDriver: Fix typo in ceph_test_async_driver
 918afc0 qa: make rbd-fuse exit cleanly with lttng
 b6326a0 ReplicatedPG: block write on degraded object if there are waiters
-7acfdac (origin/wip-11015) crushtool: test compilation of a rule with straw2 buckets
+7acfdac crushtool: test compilation of a rule with straw2 buckets
 61308cc crush: parse alg names that include a digit
 cb6813c tests: add Debian jessie dockerfile
 657844a tests: DEBIAN_FRONTEND=noninteractive apt-get install
 f8dec72 tests: jq is not available on Ubuntu precise
 4add63c install-deps.sh: strip | in the list of packages
-dbe2b24 (origin/wip-wn-rgw-hammer) rgw: flush watch after unregistering
+dbe2b24 rgw: flush watch after unregistering
 3530a25 Client: do not require successful remount when unmounting
 b90018f The erasure-code is actually required by libcommon
 9b3e1f0 Minor syntax fix-ups for Makefile.am redesign
+3a8b828 doc,tests: force checkout of submodules
+75abe37 tests: sync must also be recursive
 96c685e Minor changes in other Makefile.am files to make them support the build refinement
+f98fd4d rgw: enable end_header() to handle proposal of Content-Length.
 0f04633 Split Makefile.am in src/test
+9cb5742 mds: new OMAP storage for sessionmap
+5a529ea osdc: add omap op perf counters
+48491bd ceph-disk: activate-[all|journal] should suppress
+f33cdbe osdc: fix a memory leak in C_TwoContexts
+afb23f4 FileStore: avoid fiemap detection if explicitly disabled in config
+ef87a25 doc: osd map cache size is a count, not MB
+67b776e FileStore: add config option m_filestore_seek_data_hole
+2b3197e FileStore: moving SEEK_DATA/SEEK_HOLE detection logic into init
+a88712a rgw - make starting quota/gc threads configurable
+9b372fe ceph needs gmock/gtest to be statically linked
 e82ac10 rgw: only finialize finisher if it's not null
 7bab9f7 rgw: fix watch initialization and reinit on error
 387e4f8 rgw: move watch reinit into a finisher
 30c32b4 rgw: add support for new watch/notify functionality
 04437e4 rgw: switch to new watch/notify API
+3113fb0 test: add unit tests for daemonperf formatting
+c3ef640 ceph.in: add 'daemonperf' command
+33a9d70 mds: fix type of some stats
+9ec54a9 mon: add nicknames to some performance counters
+c2fc5d8 osd: add nicks to some stats
+8d94560 osdc: add perf counter nicknames
+ccbf036 mds: mark stats with nicknames
+ef0a7fe common: add optional perf counter nickname
+5eb6e34 common: remove some spurious declarations
 e3a7ab7 Split Makefile.am in src/tools
 14572ff Cleanup src/Makefile's a bit more
 70c89d5 Split Makefile.am in src
@@ -573,49 +3757,128 @@ da9935e Split Makefile.am in cls
 5cbe5ca man: move ENABLE_* to Makefile.am
 e531dd4 Add new configure options to improve build refinement
 c474106 Split Makefile.am in man
+e54fef9 rgw: improve format of X-Timestamp on Swift objects.
+b7e9bf6 rgw: add support for X-Timestamp on Swift containers.
+e7bc673 .gitmodule_mirrors: add gmock.git mirror
+ee54806 osd: add l_osd_op_cache_hit perf counter for cache pool
+a722f0f libcephfs: add libcephfs version code
+87e82ee libcephfs: add ceph_f{get,set,list,remove)_xattr
 fa13a5d qa: fix up rbd permissions test
 cb840cc librbd: remove unneeded assert from unregister_watch()
-8dc0bf8 (origin/wip-10828) osdc/Objecter: clean up oncommit_sync (and fix leak)
+6f5ef73 mon: print warning message if cache tier cache_mode is NONE
+e32da3e rgw: do not pre-fetch data for HEAD requests
+924f85f ceph.spec.in: loosen ceph-test's dependencies
+8dc0bf8 osdc/Objecter: clean up oncommit_sync (and fix leak)
+98a2e5c rados: translate errno to str in CLI
 f7d35b9 osdc/Objecter: count oncommit_sync as uncommitted
-099264f (origin/hammer-backports-hadoop) hadoop: workunits don't need java path
+099264f hadoop: workunits don't need java path
 2f2ace3 qa: update old replication tests
+a3af64c ceph.in: print help on 'osd' or 'mon'
+69161f0 common: add perf counters description
+dce28b4 FileStore: fiemap implementation using SEEK_HOLE/SEEK_DATA
+891f814 mds: flush immediately in do_open_truncate
 a6a6df6 cmake:  build fixes
-137800a (origin/wip-10968) librbd: delay completion of AioRequest::read_from_parent
+137800a librbd: delay completion of AioRequest::read_from_parent
+26b6b01 rados.py: fix pep8 E502, remove redundant '\' in brackets
+1396ac9 rados.py: fix pep E711
+41a8c07 rados.py: fix pep8 indent issues
+12da819 rados.py: fix (white)space issues
+6d078a3 rados.py: fix pep8 E111 indentation (spaces/tab)
+13afbc3 ceph_rest_api.py: fix pep8 indentation errors
+e13d4df ceph_rest_api.py: fix pep8 whitespace issues
+8c8cdf2 ceph_argparse.py: pep8 E721 don't compare types, use 'isinstance()'
+0cc1423 python: fix pep8 'E713 test for membership should be 'not in'
+c64d491 ceph_argparse.py: fix pep8 indentation errors
+1dad726 ceph_argparse.py: fix pep8 E502, remove redundant '\' in brackets
+a1b05f1 ceph_argparse.py: fix pep8 whitespace issues
+afd0f42 cephfs.py: fix pep8 E127/E128 intended lines
+d2dd6dd cephfs.py: pep8 E225. add missing whitespace around operator
+c0d2fcf cephfs.py: fix pep8 E202 whitespaces
+02b9958 cephfs.py: fix inline comments
+eb4f72a rbd.py: remove superfluous-parens around if
+3f30114 add tox.ini for pep8 with some errors to be ignored
+80f1ed9 rbd.py: fix pep8 E101/W191 wrong intended lines
+f4b2040 rbd.py: fix pep8 E128 under-indented lines
+fa93154 rbd.py: fix pep8 E127 over-indented line
+1f211ca python: fix pep8 E302, add missing lines
+71f2686 Client: do not require successful remount when unmounting
+f2b3192 doc/release-notes: add v0.92->0.93 upgrade note
 7d2fe5b librbd: allow AioCompletions to be blocked
 416ce76 librbd: delete Contexts when object map aio_update not required
 1bfd760 librbd: handle possible aio_read return error code
 fb2caa0 librbd: add log message for completion of AioRequest
 1e3f814 mon: ignore crushtool validation if too long
 7a5a635 mon: do not hardwire crushtool command line
-eca153e (origin/wip-10962) test/cli-integration/rbd: updated to new CLI
+eca153e test/cli-integration/rbd: updated to new CLI
 6c2d929 rbd: permit v2 striping for clones and imports
 b5050b6 rbd: fixed formatted output of rbd image features
-6cff494 (origin/wip-10990) qa/workunits/rbd/copy.sh: remove all image locks
+6cff494 qa/workunits/rbd/copy.sh: remove all image locks
+be7b4c3 update some .gitignore files
+a7b3443 submodules: --recursive needed for gtest in gmock
+dbcd55f gmock: add git submodule
+7b41871 gmock: remove in-tree code copy
 0e58463 librbd: missing callback log message for CopyupRequest object map update
 ed9e358 librbd: hide flush log message if no flushes are pending
-e2283e3 (origin/wip-10958) librbd: flush pending AIO after acquiring lock
+d3a3d5a rgw: Swift API. Complement the response to "show container details"
+eb13f2d rgw: don't overwrite bucket / object owner when setting acls
+70fdbc0 tests: keep intermediate docker build layers
+e2283e3 librbd: flush pending AIO after acquiring lock
 472db64 librbd: hold snap_lock between clipping IO and registering AIO
+4ececa3 qa/workunits/fs/misc: fix filelock_interrupt.py
+7f36312 doc: ext4 has a journal
 7e89f51 mon: do not pollute directory with cvs files from crushtool
-2e74959 (origin/wip-10546) mon: Monitor: fix timecheck rounds period
-3f73eb4 (origin/wip-mon-datahealth-fix.hammer) test: encoding: add LevelDBStoreStats and ceph_data_stats to types.h
+7602b12 osd/ReplicatedPG: use swap to reassign src_obc in do_op
+fa8c795 osd/ReplicatedPG: remove dup op mark_started call
+2e74959 mon: Monitor: fix timecheck rounds period
+3f73eb4 test: encoding: add LevelDBStoreStats and ceph_data_stats to types.h
 028806a mon/mon_types.h: allow testing encode/decode of LevelDBStoreStats
 6c7f3a7 include/util.h: allow testing encoding/decoding of ceph_data_stats
 b8c7bae include/util.h: initialize ceph_data_stats to zero
 6c2dea9 mon: mon_types.h: initialize LevelDBStoreStats and avoid craziness
+9ad02c9 cmake:  build fixes
+caa9022 rgw: update makefile to enable civetweb config
+0f8be6f civetweb: update submodule
+4698fbe TestAsyncDriver: Fix typo in ceph_test_async_driver
+3cea092 doc/install/manual-deployment: fix osd install doc
+a23b348 Maipo should also use the local qemu clone
+843ba7d doc/release-notes: fix Takeshi's name
+bc638cf doc/release-notes: fix typo
+6e308dc doc/release-notes: v0.80.9 firefly
+7b1fc5c osd: inject simple sleep in recovery
+3e03b2f doc/release-notes: v0.93 release notes
 1584104 librbd: moved flush / cache invalidate to resize state machine
 9fed4b9 librbd: add AIO version of invalidate_cache
-bebf8e9 (tag: v0.93) 0.93
-6f31458 (origin/wip-hammer-gplv2-text) Add GPLv2 text file
+bebf8e9 0.93
+6f31458 Add GPLv2 text file
+fd0c612 rgw: enforce Content-Length in response for POST on Swift cont/obj.
+ea384f8 rgw: generate the "Date" HTTP header for civetweb.
 2c666f3 librbd: C_SaferCond memory leak
-7ed9640 (origin/wip-fusesystem-10710) ceph-fuse: test dentry invalidation options and fail out if we fail
+e6e493f add more constness
+01c99d0 OSD: mark internal methods as `private`
+14315cf doc: trivial: fix missing newlines in rbd man page
+bc6502f fix unmatched struct versus class warnings
+4791895 common/config: move config_obs_t's dtor back
+24c943d doc/install/manual-deployment: s/mon/osd/ for osd install doc
+8806cd2 doc: add docs about log's "memory level"
+52e87d3 msg/xio: fix build with '--enable-xio'
+652e854 msg/xio: do not reference g_conf and g_ceph_context
+eb422a8 doc: MDS internal data structure
+7ed9640 ceph-fuse: test dentry invalidation options and fail out if we fail
 694529a Client: support using dentry invalidation callbacks on older kernels
 a6ebf67 Client: add functions to test remount functionality
 cd95b29 Client: check for failures on system() invocation
-3ec52da (origin/wip-10961) qa/workunits/rbd/copy.sh: explicitly choose the image format
-286a886 (origin/hammer-10912) client: re-send requsets before composing the cap reconnect message
-f3ad61a (origin/wip-10864-hammer-packaging-rbd-udev) packaging: move rbd udev rules to ceph-common
-ec26f08 (origin/wip-librbd-mdlock) librbd: remove unnecessary md_lock usage
+3ec52da qa/workunits/rbd/copy.sh: explicitly choose the image format
+7d128a0 coverity fix: removing logically dead code
+14d7e36 osd: fix negative degraded objects during backfilling
+224a2d1 TestMsgr: Don't bind addr if not standby
+0047699 AsyncConnection: Drop connect_seq increase line
+67225cb Pipe: Drop connect_seq increase line
+286a886 client: re-send requsets before composing the cap reconnect message
+8ea5a81 client: re-send requsets before composing the cap reconnect message
+f3ad61a packaging: move rbd udev rules to ceph-common
+ec26f08 librbd: remove unnecessary md_lock usage
 1f9782e librbd: move object_map_lock acquisition into refresh()
+40be140 doc: fix typo deebug
 27e5ae6 librbd: don't check if object map is enabled before refreshing
 876f128 librbd: remove object map on rollback if needed
 f4d8d16 librbd: clarify md_lock usage
@@ -626,7 +3889,7 @@ df42fd3 test_librbd: close ioctx after imagectx
 06e5a39 rbd: fix --image-feature parsing
 eef7466 librbd: apply flag read failure to all snaps
 6ac8139 librbd: make ImageCtx->object_map always present
-d611121 (origin/wip-librbd-async-operations) tests: add unit test to verify async requests time out
+d611121 tests: add unit test to verify async requests time out
 c295485 librbd: restart async requests if lock owner doesn't report progress
 c611936 librbd: replace Finisher/SafeTimer use with facade
 41e186a librbd: cancel in-progress maint operations before releasing lock
@@ -637,45 +3900,138 @@ bb4041f librbd: add locking asserts to ImageCtx
 4bcbdbf librbd: fix ImageWatcher::is_lock_supported() locking
 a94ceb6 librbd: add and use a test_features() helper
 cffd93a librbd: use ImageCtx->snap_lock for ImageCtx->features
-468839e (origin/wip-librbd-image-watcher-tests) tests: add additional test coverage for ImageWatcher RPC
+468839e tests: add additional test coverage for ImageWatcher RPC
 915064a librbd: add ostream formatter for NotifyOp
 260c820 fuse: do not invoke ll_register_callbacks() on finalize
-75d8c01 (origin/wip-10862-hammer) mon: do not try and "deactivate" the last MDS
-a79e905 (origin/wip-devel-python-split) qa: fix python-ceph reference
+1c68264 doc/release-notes: final v0.87.1 notes
+ff2d497 TestMsgr: Add inject error tests for lossless_peer_reuse policy
+9f24a8c TestMsgr: Make SyntheticWorkload support policy passed in
+75d8c01 mon: do not try and "deactivate" the last MDS
+ce3d79f mds: remove MDSCacheObject::get_pin_totals()
+d92dbfd mds: optimize CDir::is_{freezing,frozen}_tree()
+d7936da mds: optimize get_projected_{xattrs,srnode}
+96a85e7 mds: use compact_map/compact_set to optimize memory usage of CDir
+151494f mds: dynamiclly allocate data structures for file locks
+00047fb mds: use compact_map/compact_set to optimize memory usage of CInode
+aa46d48 mds: optimize memory usage of inode_t
+3075a07 mds: optimize memory usage of class InodeStore
+dbca3c2 mds: use compact_map to optimize memory usage of MDSCacheObject
+98c368f fragtree_t: define fragtree_t::_splits as compact_map
+c57df9c introduce compact_set and compact_map
+94e47cf common: change default value for perfcounter description ("NO_INFO_FIX">NULL)
+93848b0 mailmap: Kefu Chai affiliation
+061d277 mailmap: Andy Allan affiliation
+e2ba6f3 mailmap: Anis Ayari affiliation
+e57a76f mailmap: Armando Segnini affiliation
+c1fce65 mailmap: Billy Olsen affiliation
+8605e05 mailmap: Ahoussi Armand affiliation
+156a4b9 mailmap: Hazem Amara affiliation
+252f5f0 mailmap: Karel Striegel affiliation
+2042dd4 mailmap: Florian Coste affiliation
+ace7adb mailmap: Petr Machata affiliation
+cac44fd mailmap: Shanggao Qiu affiliation
+39a9b5c mailmap: Thomas Cantin affiliation
+e8ff0a5 mailmap: Tim Freund affiliation
+4de44c4 mailmap: Travis Rhoden affiliation
+857371e mailmap: Kiseleva Alyona affiliation
+5e9acbd mailmap: Baptiste Veuillez affiliation
+b01eb8e mailmap: Viktor Suprun affiliation
+ed8a201 mailmap: Feng He affiliation
+b8afa35 mailmap: Jian Wen affiliation
+aeae8a9 mailmap: Kim Vandry affiliation
+e278484 mailmap: Robert Jansen affiliation
+a6f2f81 mailmap: MingXin Liu affiliation
+3d6750e mailmap: Andrew Bartlett affiliation
+a79e905 qa: fix python-ceph reference
 46b45e3 doc: fix python-ceph refs in docs
 d1c82ea ceph.spec: specify version
 2a23eac debian: split python-ceph
+b3329a9 hadoop: workunits don't need java path
 39982b1 Split python-ceph to appropriate python-* packages
-bd40f23 (origin/wip-librbd-python-tests) tests: speed up Python RBD random data generation
+07ba2df Corrected arch diagram signed off by: pmcgarry at redhat.com
+c7ed277 test/librados/misc.cc: fix -Wsign-compare
+bd40f23 tests: speed up Python RBD random data generation
+d03cc61 doc: add erasure-code-shec to plugin list
+01a113c fix build with clang/clang++
+31b1532 common: add perf counters description
+0be126d erasure-code/shec/shec.cc: fix resource leak
+50b69e7 TestErasureCodeShec_all.cc: prefer ++operator for non-primitive iter
+13f6f26 osd/osd_types.cc: prefer ++operator for non-primitive iter
+5df917c mon/PGMap.cc: prefer ++operator for non-primitive iter
+609a070 erasure-code/shec/shec.cc: reduce scope of variables
+c1abcb7 ErasureCodeShec.cc: prefer ++operator for non-primitive iter
+b7ea692 libradosstriper/striping.cc: fix resource leak
+c1e792d doc: update doc with latest code
+15da810 qa: update old replication tests
+6bc2b024 hadoop: add terasort workunit
 655e616 tests: fix potential race conditions in test_ImageWatcher
-9c03750 (origin/wip-osdc-watch-error) osdc: watch error callback invoked on cancelled context
+9c03750 osdc: watch error callback invoked on cancelled context
 a9bfd5d ceph_test_rados_api_watch_notify: wait longer for watch timeout
-91cda52 (origin/wip-10788) osd: better debug for maybe_handle_cache
+62e7b4a crush: re-organize the help to clarify ordering
+b8d497e crushtool: add test for crushtool ordering
+dca3452 crushtool: send --tree to stdout
+2b92320 crushtool: name osds with --build function
+f52840c crushtool: do not dump tree on build
+b5d6e76 doc/release-notes: v0.87.1
+91cda52 osd: better debug for maybe_handle_cache
 bee9154 osd,mon: explicitly specify OSD features in MOSDBoot
 30c904e osd: do not proxy reads unless all OSDs proxy features too
 e0e765f osd/OSDMap: cache get_up_osd_features
-63f6c9b (origin/wip-librbd-snap-create-race) librbd: fixed snap create race conditions
-69b3857 (origin/wip-10898) librbd: improved ImageWatcher duplicate message detection
+63f6c9b librbd: fixed snap create race conditions
+725822d doc: development workflows
+69b3857 librbd: improved ImageWatcher duplicate message detection
 942f875 librbd: add test instances for watch/notify messages
 10d8686 librbd: minor cleanup of ImageWatcher messages
-0ed296b (origin/wip-librbd-exclusive-lock-config) rbd: disable RBD exclusive locking by default
+0ed296b rbd: disable RBD exclusive locking by default
+280a9bf doc: page title suitable for :doc:
+269df80 doc: spellcheck quick development guide
+89d5200 doc: do not doxygen src/tracing
 2b63dd2 DBObjectMap: lock header_lock on sync()
+a286798 mon: do not try and "deactivate" the last MDS
 170c88d cmake: radosgw, radosgw-admin related fixes
 4feb171 vstart.sh: can use binaries outside of ceph/src
-55d3db9 (origin/wip-10919) cls_rbd: invalidate bufferlist CRC when updating object map
-970bb49 (origin/wip-10883) osd: Fix FileJournal wrap to get header out first
-01f04cb (origin/wip-10914) osdc: pass fadvise op flags to WritebackHandler read requests
+7c8b493 os/chain_xattr: fix wrong `size` for snprintf()
+08503d3 mon: fix bug with counter name (win counter was inc-ed in lose func)
+bbaa113 common: add description in PerfCounter class
+55d3db9 cls_rbd: invalidate bufferlist CRC when updating object map
+1aaff83 doc: explain that tell bench is non-destructive
+970bb49 osd: Fix FileJournal wrap to get header out first
+01f04cb osdc: pass fadvise op flags to WritebackHandler read requests
 c463242 osd/OSDMap: include pg_temp count in summary
+0e99ddb mailmap: Frank Yu affiliation
+daf4630 mailmap: Ali Maredia affiliation
+a28637d mailmap: Takeshi Miyamae affiliation
+1b0c781 mailmap: Wang Zhiqiang affiliation
+5fbb48d mailmap: Haïkel Guémar affiliation
+4d127ba mailmap: Dmytro Iurchenko affilitation
+b58f5dc mailmap: Radoslaw Zarzynski affiliation
+81f9285 mailmap: Mykola Golub name normalization
+9b141e5 mailmap: Vu Phom affiliation
+c8e4b22 mailmap: Min Chen affiliation
 d6e2689 PG: compensate for bug 10780 on older peers
 1d0d3dc PG: add a config option to enable (by default) recovery below min_size
-08eb584 (origin/wip-10899) librbd: fixed ImageWatcher recursive locking issues
+08eb584 librbd: fixed ImageWatcher recursive locking issues
+cd708e2 erasure code: add shec's documentation / change default layout
+21e7b5e mailmap: add Matt Richards to organizationmap
+849de71 mailmap: add Xie Rui to organizationmap
 48466f8 ReplicatedPG: only allow a degraded write if we have at least min_size copies
 d05539e ECBackend: use tbl for empty transaction as well if necessary
 e059d58 ReplicatedBackend: use tbl for empty transaction to backfill peer also
 25f9363 osd/: s/is_degraded_object/is_degraded_or_backfilling_object
-4f55d41 (origin/wip-10900) test/librbd/fsx.c: disable RBD object map for krbd
-487c205 (origin/wip-10892) osd: clear obc cache on_shutdown
-c341c52 (origin/wip-10884-hammer-rpm-devel-split) ceph.spec: split ceph-devel to appropriate *-devel packages
+986d755 rgw: pass civetweb configurables to civetweb
+52772ef mds: wait for mydir replicas to go away in stopping
+dea83ea mds: fix cephfs-table-tool reset snap
+b555e83 mds: fix assertion on HB during suicide
+6164e44 mds: don't try to flush journal on inactive mds
+a30df506 ceph_dencoder: add missing 'skip' from usage
+07f297f mds: remove redundant journal inode
+ab27705 mds: remove some unused declarations
+4f55d41 test/librbd/fsx.c: disable RBD object map for krbd
+37eb8ec osd: dump pg ref ids on shutdown
+44e270d osd/PG: make lock() and unlock() const
+239b198 common/tracked_int_ptr: const variants
+487c205 osd: clear obc cache on_shutdown
+c341c52 ceph.spec: split ceph-devel to appropriate *-devel packages
 ea66f5c doc: RGW is "built on top of librgw" changed to "librados"
 b220b2a librbd: enforce write ordering with snapshot
 0de6b61 librbd: use separate files for snapshot object maps
@@ -686,22 +4042,27 @@ ec922a6 osd: use op (not connection) features for copy-get encoding
 70eab06 osd: pass features through to proxied read op
 248be16 osd: set features explicitly in MOSDOp
 e95d4cc tests: remove tests for when init() is not called in shec (#10839)
+3a5391a Fixed comment (minor)
+20aacf3 Added CephMount.flock() Also added corresponding unit tests
 3e37c13 rm some useless codes
+d231e8b Minor: added owner to debugging output in ceph_flock()
 e7735d3 osd: number of degraded objects in EC pool is wrong when there is OSD down(in)
 e0fbe5c cmake: add librbd/test_fixture.cc
 caf2e1d cmake: add os/XfsFileStoreBackend.cc to rules
 62dd0c2 erasure-code: mark the shec plugin as experimental
 f9c90e7 tests: fix unused variable warning
 3a3bb6d common: capture check_experimental_feature_enabled message
-d8f0215 (origin/wip-10787) mon/OSDMonitor: note osd map epoch in one other place
+d8f0215 mon/OSDMonitor: note osd map epoch in one other place
 d9b0cd3 mon/OSDMonitor: move osd epoch cache update into helper, add fixme
 b80e6ae README.md: fix the indent of a command line usage
+1ea780e AsyncConnection: release message when connection is closed
 4038d21 cleanup: delete useless member variable
 280a198 PG,ReplicatedPG: make missing_digests local to scrub_compare_maps
 8e806bc Docs: OSD name, not id, needs to be given to remove an item from the CRUSH map. Include command for deleteing bucket from the CRUSH map.
+db06582 osd/OSDMap: include pg_temp count in summary
 a5759e9 mon/OSDMonitor: do not trust small values in osd epoch cache
-87544f6 (origin/wip-10844) mon: MonCap: take EntityName instead when expanding profiles
-fd83020 (origin/wip-decoding-oldest-snap) mds: fix decoding of InodeStore::oldest_snap
+87544f6 mon: MonCap: take EntityName instead when expanding profiles
+fd83020 mds: fix decoding of InodeStore::oldest_snap
 6918a98 tests: Dockerfile COPY with two arguments
 98297e0 tests: update docker helper documentation
 1023aa9 tests: add Dockerfile for centos-6
@@ -709,15 +4070,16 @@ b3771eb tests: one Dockerfile per repository:tag
 b9b5868 osd: fix OSDCap parser on old boost/spirit
 4ced591 osd/OpRequest: pass string by const&
 74d5ccf osd/: include version_t in extra_reqids with promote
-de6b53a (origin/hadoop) qa: hadoop plays nice with new teuthology task
+de6b53a qa: hadoop plays nice with new teuthology task
 3c05c9c tests: no need for python-flask at build time
 b24a01b erasure-code: initialize all data members
 1a9d717 erasure-code: fix uninitialized data members on SHEC
 e2a5085 ReplicatedPG::on_change: requeue in_progress_async_reads close to last
 a5ecaa1 ReplicatedPG::on_change: clean up callbacks_for_degraded_object
 2d2dc13 mon/PGMonitor: drop pg ls debug line
-65ce7b7 (origin/wip-da-fix-xio-configure) fix configure to reflect the missing libs for xio
-a5804c4 (origin/wip-early-adopters) doc: add a page for CephFS early adopters
+956d8e7 AsyncConnection: Clean up unused variables
+65ce7b7 fix configure to reflect the missing libs for xio
+a5804c4 doc: add a page for CephFS early adopters
 d237649 doc/cephfs: Ceph FS -> CephFS
 65dfd5d doc/cephfs: clarify the 'not production ready' warning
 700459b build-doc: package checks for fedora
@@ -725,31 +4087,48 @@ c5a74a7 build-doc: don't ignore errors
 94f9f60 tests: clone ceph-erasure-code-corpus from ceph
 807cf57 xio: Fix out-of-order responses
 807af4c cmake: Fix cmakelist to compile librbd
-418ca0c (origin/wip-10784) osd: Update object state after removing watch from object info
+016393d osd: call get_transaction if needed in make_writeable
+79c018d AsyncConnection: Remove useless inject delay in _stop
+a70c111 AsyncConnection: Upper read_until log level
+a640ca9 AsyncConnection: Use retry_global tag instead of retry_session
+f41c640 Erase test files at the end.
+ce7e95d mailmap: add Mykola Golub to organizationmap
+603f4f4 Minor typo and comment addition.
+538f916 Added unit tests for ceph_flock()   * basic unit tests (monothreaded)   * multithreaded unit tests   * multiprocesses unit tests
+2ff5ce6 AsyncConnection: kick out connection if half accept
+7cc109a AsyncConnection: close socket ASAP for standby connection
+418ca0c osd: Update object state after removing watch from object info
 27905fd osd: Simplify handle_watch_timeout() using existing support functions
-45c388f (origin/wip-objecter-linger-locking) objecter: protect linger_op last_error and registered fields
+45c388f objecter: protect linger_op last_error and registered fields
 69ee5ec objecter: remove unused RWLock::Context
+a19cbd3 osd: clear extra get_transaction in do_op
 e01c93d objecter: clarify a few lock assertions
 a60d5fa objecter: remove dead race handling code
 1b2da9b objecter: remove dead _get_op_target_session() method
 4f36eae ceph.spec.in: junit always except for EPEL 6
-9656018 (origin/wip-10791-stopping) MDSMonitor: do not allow MDS to transition from STATE_STOPPING
+9656018 MDSMonitor: do not allow MDS to transition from STATE_STOPPING
 6898f0b rgw: Swift API. The second way of specifying desirable response format.
-e504003 (origin/wip-10719) rgw: remove multipart entries for bucket index when aborting
+e504003 rgw: remove multipart entries for bucket index when aborting
 3e54acb rgw: encode rgw_obj::orig_obj
 64d7265 librados: code reformatting only.
 3f11ab0 librados: add info about restrictions on rados_monitor_log() usage.
 f67bfa2 rgw: Swift API. Support for X-Remove-Container-Meta-{key} header.
 cdfc23f rgw: fix doc, A typo in command line argument name
 2f8d31e rgw: Swift API. Dump container's custom metadata.
+86f50a5 TestMsgr: set "ms_die_on_old_message" to true to indicate out-of-order messages
+25d53f6 AsyncConnection: Allow reply reset tag if first hit
 90d0f0d rados: add 'watch/notify' in rados --help command.
 b623c42 rados: Using readable format to print object mtime when call 'stat obj'.
+2b29165 ReplicatedPG:: find_object_context clear extra creation (head,snapdir) if it is the head object, there is no need to create extra head, snapdir. only creation for other cases we need.
+baaa055 doc: swift tempurl functionality
+92af4fe osd: add EIO injection on EC pool for mdata and data error
 d3fc5bd Fix bug: When run Test_filejournal testcase with gtest argument, all of testcases is failed.
 93f32f0 erasure-code: fix compilation warnings
 01e154d osd: fix PG leak in SnapTrimWQ._clear()
 a850514 fsync-tester: print info about PATH and locations of lsof lookup
-b14ca1f (origin/wip-10737) test_libcephfs: test we can't set layouts on existing files
+b14ca1f test_libcephfs: test we can't set layouts on existing files
 36d37aa rados.py: keep reference to python callbacks
+93e0966 rgw: Adding stats in the header of GET response on account
 7002f93 ShardedThreadPool: make wait timeout on empty queue configurable
 32d7617 osd/PG.c: use boolean to init `bool` variables
 5aa6f91 WorkQueue: make wait timeout on empty queue configurable
@@ -759,7 +4138,7 @@ a3b1583 add tests for ceph pg ls-by-pool
 39e2640 add CLI ceph pg ls-by-osd
 742c5e8 add tests for ceph pg ls-by-primary
 f814262 fix error : ceph pg ls 0
-203c06d (origin/wip-old-gcc) locally disable pragma warnings on gcc < 4.6
+203c06d locally disable pragma warnings on gcc < 4.6
 93629d3 Pipe: conditionally compile IPTOS setting
 b025fbf librbd: consolidate all async operation flush logic
 0a00be2 xlist: added missing include
@@ -771,11 +4150,13 @@ b4f2e75 osd: add perf counter for proxy read
 7920db3 tests: make ceph_watch_wait output watchfile content on failure
 ac9d75c mon: noforward flag for commands that are not supposed to be forwarded
 3ff4821 mon: fix Monitor::_get_moncommand()
-27bab65 (origin/wip-10765) rados: make 'watch' command unwatch when done
-45f842d (origin/wip-doc-ports) doc: remind user to calculate the correct max port
+27bab65 rados: make 'watch' command unwatch when done
+45f842d doc: remind user to calculate the correct max port
 c65d8ed doc: fix "7810" port typo
 fe47e23 doc: correct default of ms_bind_port_max
+0d0b2aa Added ceph_flock() to libcephfs.
 d2eaeea AsyncConnection: fix incorrect condition for exchanging in_seq
+04812b8 crushtool: reorder order of operations
 ab6c65a mon/OSDMonitor: fix nan on 'osd df' variance
 6fe509d mon/OSDMonitor: fix nan on 'osd df' use%
 e070718 osdc/Objecter: do watch/notify op completions synchronously
@@ -785,22 +4166,22 @@ e070718 osdc/Objecter: do watch/notify op completions synchronously
 293cd39 tests: run osd-scrub-repair.sh with make check
 bae1f3e FileJournal: fix journalq population in do_read_entry()
 15350a0 ceph_objectstore_tool: fix check_output on python2.6
-86fd8c3 (origin/wip-10761) librados_test_stub: added new blacklist_add method
+86fd8c3 librados_test_stub: added new blacklist_add method
 7ecf864 tests: update librbd tests to handle blacklisting
 25d7ac2 librbd: optionally blacklist clients before breaking locks
 9dfd4ac mailmap: UMD is University of Maryland
 a616de9 librados: add blacklist_add API method
 6a91f2b libradosstriper: fixed write_full when ENOENT
-0514536 (origin/wip-opath-setattr) client: handle O_PATH fds on setattr too
-636a269 (origin/wip-librbd-maint-tests) tests: add simple tests for proxied maintenance operations
-9f2f306 (origin/wip-10785) librbd: ensure ImageWatcher notifications are idempotent
+0514536 client: handle O_PATH fds on setattr too
+636a269 tests: add simple tests for proxied maintenance operations
+9f2f306 librbd: ensure ImageWatcher notifications are idempotent
 f86fb97 rgw: obj delete operation can use remove_objs param
 538395d rbd.py: Add rbd_read2/rbd_write2 funcion which can handle fadvise flags.
 7890256 librbd: minor ImageWatcher cleanup
 debd7f3 tests: relax librbd ImageWatcher test case state machine
-dfee96e (origin/wip-10770) rgw: send appropriate op to cancel bucket index pending operation
+dfee96e rgw: send appropriate op to cancel bucket index pending operation
 cbfa08e mds: disallow layout changes for files with data
-b3fdf15 (origin/wip-cephfs-snap1) mds: avoid propagrating unnecessary snap rstat to parent
+b3fdf15 mds: avoid propagrating unnecessary snap rstat to parent
 32b0892 mds: update client_need_snapflush when splitting snap inode
 6cb9b01 mds: fix rdlock snap inode
 d62c67c mds: save the oldest snapid of inode
@@ -835,7 +4216,7 @@ dbf09af mds: fix CInode::remove_need_snapflush
 1d9de22 mds: fix MDCache::journal_cow_dentry()
 32b1a9a mds: check snaprealm before drop dentries in deleted directory
 1479300 mds: properly update CInode->first during journal replay
-5b75e30 (origin/wip-10827) osdc/Objecter: linger_register now acquires rwlock
+5b75e30 osdc/Objecter: linger_register now acquires rwlock
 3cf7fcc install-deps.sh: check for debian/control existence
 09c152a run-make-check.sh: git --ancestry-path is not backward compatible
 400ac23 mon: ignore osd failures from before up_from
@@ -852,31 +4233,31 @@ c252e05 AsyncConnection: fixup for 2ffacbe (crc configuration in messenger)
 a3fc9d4 AsyncConnection: fix wrong scope of data blocks
 0db7c48 SubmittingPatches: clarify the use of branches in PRs
 adebf22 rbd_recover_tool: move rbd_recover_tool directory to src/tools subdirectory
-70ae314 (origin/wip-opath) client: fix O_PATH on older Linux systems
+70ae314 client: fix O_PATH on older Linux systems
 2f49de5 ReplicatedPG: block writes on degraded objects unless all peers support it
 2a83ef3 include/encoding: fix an compile warning
 71c6d98 msg: fixup for 2ffacbe (crc configuration in messenger)
-2598fc5 (origin/wip-10734) ObjectStore: fix Transaction encoding version number
-46f9ca4 (origin/wip-10782) pybind: fixed runtime errors with librbdpy
-9124a76 (origin/wip-8600) test/vstart_wrapper.sh: set PATH before calling vstart.sh
+2598fc5 ObjectStore: fix Transaction encoding version number
+46f9ca4 pybind: fixed runtime errors with librbdpy
+9124a76 test/vstart_wrapper.sh: set PATH before calling vstart.sh
 189ef38 init-ceph.in: add $PWD to PATH if running as ./init-ceph
 0d80691 qa: workunits: cephtool/test.sh: test 'ceph osd setcrushmap'
 ff74873 mon: OSDMonitor: use CrushTester::test_with_crushtool
 58ea017 crush: CrushTester: add test_with_crushtool()
 481d563 tools: crushtool: allow '--infile -' to pipe read from STDIN
-b2549a6 (origin/wip-export-dir) mds: implement export dir asok
+b2549a6 mds: implement export dir asok
 190c7e3 client: implement support for O_PATH on Linux.
 911e4c0 client: add support for O_NOFOLLOW in Client::open().
 a7a6fe4 rbd-recover-tool: add usefull information of this tool include README, FAQ, TODO
 ea69219 rbd-recover-tool: add a test case for raw image & snapshot
 4f395eb rbd-recover-tool: implement the function framework add control files & config files
 aa3cda2 rbd-recover-tool: add the basic function files
-97abcdd (origin/wip-traceless-reply) mds: properly record created inode in completed requests list
+97abcdd mds: properly record created inode in completed requests list
 419800f client: re-send request when MDS enters reconnecting stage
 b66caef mds: avoid sending traceless reply for request that created new inode
 d12d92d mds: process completed requests in clientreplay stage
 0548bea doc: Removed references to Inktank suppport.
-2d1803b (origin/wip-da-SCA-20150129) xio/XioMsg.h: use static_cast instead of c-style cast
+2d1803b xio/XioMsg.h: use static_cast instead of c-style cast
 babac0c doc: Removed Inktank reference.
 0d46790 mds/MDS.cc: don't compare integer with bool
 aa7c968 EventKqueue.cc: clarify calculation precedence for '&' and '?'
@@ -914,13 +4295,13 @@ c6a60d9 ReplicatedPG::do_op: ignore snapset_obc if !exists
 85cdd12 tests: add Dockerfile for fedora
 65c7e07 doc/release-notes: v0.92
 f4d6515 install-deps.sh: get lsb_release if needed
-24349a8 (origin/wip-8903) librbd: removed lingering TODO message from ImageWatcher
-17a8330 (origin/wip-10720) mds: fix leaked MDCache::logger
+24349a8 librbd: removed lingering TODO message from ImageWatcher
+17a8330 mds: fix leaked MDCache::logger
 9e9356b librbd: fixed object map issues discovered via fsx
-6071142 (origin/wip-librbd-watch-errors) librados_test_stub: added Rados::watch_flush
+6071142 librados_test_stub: added Rados::watch_flush
 1b110a5 librbd: better handling for image watch errors
 225a19f vstart.sh: fix .ceph_port typo
-8d61c39 (origin/wip-librbd-invalid-map) librbd: prevent copyup during image shrink operations
+8d61c39 librbd: prevent copyup during image shrink operations
 be05d9d rdb: update init-rbdmap to fix duplicate mount point
 fbcde04 cls_rbd: fixed object_map_resize boundary issue
 14424c8 librbd: removing objects can lead to infinite loop
@@ -930,18 +4311,18 @@ bb8c9ae librbd: pending AIO operations are now flushed asynchronously
 d00891f rbd: add image flags to 'rbd info'
 706a655 librbd: added get_flags API methods
 16e4d71 cls_rbd: get_flags now reads all snapshot flags
-7dd2de8 (origin/wip-10707) test/cls_rgw: fix test
-e0f12d9 (origin/wip-autogen) Fix do_autogen.sh so that -L is allowed
-cfab01e (origin/wip-10722) rgw: move perf cleanup before context cleanup
+7dd2de8 test/cls_rgw: fix test
+e0f12d9 Fix do_autogen.sh so that -L is allowed
+cfab01e rgw: move perf cleanup before context cleanup
 4074a91 pybind: fix error hiding and inconsistency on librados load.
 cfcfafc Objecter::_op_submit_with_budget: add timeout before call
-00a3ac3 (tag: v0.92, origin/wip-sam-v0.92) 0.92
+00a3ac3 0.92
 c656bce PGLog: improve PGLog::check() debugging
 05ce2aa qa: use correct binary path on rpm-based systems
-eb526af (origin/wip-10709) rbd: watch command should unwatch before exiting
-2a0e9b7 (origin/wip-optional-encoding) encoding: ignore uninitialized instantiation in boost::optional decode
-f40ee8c (origin/wip-lttng) do_autogen.sh: default to --with-lttng, -L to build without
-7590387 (origin/wip-assert-version) librados: add missing tracepoints
+eb526af rbd: watch command should unwatch before exiting
+2a0e9b7 encoding: ignore uninitialized instantiation in boost::optional decode
+f40ee8c do_autogen.sh: default to --with-lttng, -L to build without
+7590387 librados: add missing tracepoints
 57bac8e osd:  change pg_stat plain to display CRUSH_ITEM_NONE in pgmap output section.
 4aa9f3f man: add rbd status to doc/man/8/rbd.rst
 a007c52 doc: add cephfs disaster recovery guidance
@@ -959,8 +4340,8 @@ a1f634b add CLI ceph pg ls [pool] [state]
 bd8671f ceph_test_rados: fix typo
 ac161bf librados: expose rados_{read|write}_op_assert_version() in C
 11b6424 Add test case for librados assert_version()
-eb45f86 (origin/wip-10106) rgw: flush xml header on get acl request
-de2e5fa (origin/wip-10572) rgw: finalize perfcounters after shutting down storage
+eb45f86 rgw: flush xml header on get acl request
+de2e5fa rgw: finalize perfcounters after shutting down storage
 cab246d librbd: Don't do readahead for random read.
 77689f1 ReplicatedPG::C_ProxyRead: fix dropped lock
 0e3af8d ReplicatedPG: only populate_obc_watchers if active
@@ -985,9 +4366,9 @@ a74296f PG::purge_strays: purge peer_missing as well
 f9abffb osd/: don't block writes on degraded objects
 c632fec osd/: update peer_missing and local missing if we write a degraded object
 56b8fc8 ReplicatedPG: always take recovery_read_lock for recovery
-1c25dba (origin/wip-10701) rgw: use strict_strtoll() for content length
+1c25dba rgw: use strict_strtoll() for content length
 b1435e6 xio: Enforce at least two portals if bind
-cbf0691 (origin/wip-10698) rgw: fail s3 POST auth if keystone not configured
+cbf0691 rgw: fail s3 POST auth if keystone not configured
 10ddab8 doc: Fixed hyperlink.
 a0a5185 rgw: Response 204 when post on containers
 b7ab624 xio: Remove whitespace changes
@@ -1005,20 +4386,20 @@ fc76c89 osdc: add new filed dontneed in BufferHead.
 c83a288 Rework ceph-disk to allow LUKS for encrypted partitions
 707c78b Only create a key of 256 bits length, not 256 bytes
 6a45b8e add all possible ceph-disk run-time requirements to build time deps
-4c50f6a (origin/wip-rgw-versioning-4) rgw: more merge related fixes
+4c50f6a rgw: more merge related fixes
 01cc9d5 rgw: fix merge artifact
 e26023e PG: set scrubber.start = scrubber.end after scrub_compare_maps
 4f9e6ed PG: remove block_writes from scrubber
 bed22b4 doc: improve incomplete state documentation
-89bd6b1 (origin/wip-hints) librados: rename NOREUSE to NOCACHE
-8571739 (origin/wip-10688-boost-157) support Boost 1.57.0
-f623906 (origin/wip-pg-reqids) osd/PGLog: only do slow extra_reqid search of object appears at least once
+89bd6b1 librados: rename NOREUSE to NOCACHE
+8571739 support Boost 1.57.0
+f623906 osd/PGLog: only do slow extra_reqid search of object appears at least once
 eefdb32 osd: preserved extra_reqids on promote, flush
 4d310a8 osd: preserve extra_reqids in PGLog
 2ad229c osd: allow extra reqids to be stashed in each pg_log_entry_t
 ca71376 cls_rgw: fix rgw_bucket_olh_log_entry::epoch initialization
 61378f2 add pg_string_state function
-4feb864 (origin/wip-10194-master) rgw: reuse fcgx connection structure
+4feb864 rgw: reuse fcgx connection structure
 32b9bb7 QueueRing: a reduced contention queue
 ecd5e5a test/encoding/types.h: add new structs for testing
 5849380 cls_rgw: add missing struct test instance generator
@@ -1034,13 +4415,13 @@ caefe69 rgw: version id should not contain underscore
 0b9859b cmake: Fix CMakelist.txt to compile librbd, libcommon
 51dd9b2 xio: Spread and accept connections using different portals
 e761b74 async: directly call delete without check
-68c6f0e (origin/t-miyamae-wip-mshec-r44) mSHEC r44 initial commit
+68c6f0e mSHEC r44 initial commit
 e45e97b run-make-check: use nproc and shorter git expression
 0a998f5 edited failure conclusion
 4527e8f corrected some confusing numbers
 94b60c5 osd: don't log op stats for proxy read in the cache tier
 a66898f cleanup : remove unused perf counter
-a871651 (origin/wip-4087) librbd: object map updates should use AIO
+a871651 librbd: object map updates should use AIO
 4328069 test: correct two issues with the librados_test_stub
 b63246f librbd: refactor existing object map code to its own class
 f4b9442 cls_rbd: object_map_resize shrink protection
@@ -1052,35 +4433,39 @@ f4b9442 cls_rbd: object_map_resize shrink protection
 a41878a librbd: Use object map for IO operations
 7e751ce librbd: Implement object map for tracking in-use objects
 13fd6d1 cls_rbd: Add methods for manipulating an image object map
-ca214c9 (origin/wip-8251-2) rgw: also convert sharded replicalog entries
+ca214c9 rgw: also convert sharded replicalog entries
 0a4956e rgw: get rid of replicalog index_by_instance param
 c4a6eab rgw: fixing rebase casualties
 791c15b rgw: convert old replicalog entries if needed
 778a53a rgw-admin: add replicalog update command
 1cb10d7 async: add same behavior when message seq mismatch with simple
-8d56ade (origin/wip-librbd-tests) tests: ensure RBD integration tests exercise all features
-b6d6f90 (origin/wip-mdsmon-args) mon/MDSMonitor: fix gid/rank/state parsing
+8d56ade tests: ensure RBD integration tests exercise all features
+b6d6f90 mon/MDSMonitor: fix gid/rank/state parsing
 9b9a682 msg/Pipe: set dscp as CS6 for heartbeat socket
 1e236a3 mds: don't join on thread which has not been runned.
-6939e8c (origin/zhouyuan-submodule_https_git) Update git submodule to use the same https protocol
+6939e8c Update git submodule to use the same https protocol
 e393810 librbd: make librbd cache send read op with fadvise_flags.
 a23676b librbd: Don't do readahead for random read.
 8d0295c rgw: extend replica log api (purge-all)
 6b0151c utime: extend utime parsing
 e274e10 rgw: fix replica log indexing
-0be7925 (origin/wip-fuse-regress-note) Update PendingReleaseNotes for ceph-fuse change in Giant
+0be7925 Update PendingReleaseNotes for ceph-fuse change in Giant
 77bd883 test_msgr: add auth enabled basic test
 408db65 async_msgr: crc configuration in messenger
 ce941f6 async: Delete authorizer when connected
-a8e2579 (origin/wip-perf-filter) common: filtering in `perf dump`
+a8e2579 common: filtering in `perf dump`
+e98b5a9 Move the sched_scrub to a new tick timer which does not need to hold the osd_lock.
 e5ddc50 tests: bring back useful test 'ceph tell osd.foo'
+5b8d47c PG looking-up/checking does not need to hold the osd_lock since it is being protected by pg_map_lock, remove the assertion.
 7c59bc0 cleanup: replace some length() with !empty()
 17add06 cleanup: replace some size() with !empty()
 9a9670c tests: better EPEL installation method
-bf05ec1 (origin/wip-gmock) tests: replace existing gtest 1.5.0 with gmock/gtest 1.7.0
-5301b2b (origin/wip-10637) librbd: trim header update not using AIO
+bf05ec1 tests: replace existing gtest 1.5.0 with gmock/gtest 1.7.0
+5301b2b librbd: trim header update not using AIO
+83d1540 bufferlist: add more test case for bufferlist::get_contiguous.
+b14e7d3 bufferlist: Refactor func get_contiguous.
 5cbe0c5 gmock: use Google C++ Mocking Framework for unit tests
-ca1d21e (origin/wip-install-deps) install-deps: fix LC_ALL setting
+ca1d21e install-deps: fix LC_ALL setting
 e4a97c6 librados_test_stub: add IoCtx::get_instance_id()
 f6406c9 qa: small improvements to merge-diff test
 4e88414 rbd-fuse: clean up when shutdown
@@ -1094,28 +4479,30 @@ dfa96c5 librbd: Add maintenance operation requests to ImageWatcher
 e6f1280 librados: Expose RadosClient instance id through librados
 398bc96 librbd: Create async versions of long-running maintenance operations
 87ef462 rgw: format mtime of radosgw-admin bucket stats
-dc1630e (origin/wip-librbd-trim-error) librbd: trim would not complete if exclusive lock is lost
+dc1630e librbd: trim would not complete if exclusive lock is lost
 3347e0d bug: error when installing ceph dependencies with install-deps.sh
-4e90a31 (origin/wip-10617) osd: add failure injection on pg removals
+4e90a31 osd: add failure injection on pg removals
 9b220bd ceph.spec.in: use wildcards to capture man pages
 51e3ffa rgw: reorder bucket cleanup on bucket overwrite
 313d6a5 rgw: access appropriate shard on bi_get(), bi_put()
 b304af3 librbd: clean up log message for copy-on-read
 879fd0c osd: do not ignore deleted pgs on startup
-6f6facb (origin/wip-librbd-close-deadlock) librbd: potential deadlock on close_image
+6f6facb librbd: potential deadlock on close_image
 bda293a librbd: fix copy-on-read / resize down race condition
 e917033 test: add rados_nobjects_list_xyz functions to librados test stub
 7c7f072 librbd: eliminate CoR callback
 9790b0f librbd: use finisher for copy-on-read copyup fulfillment
 cfce41d ReplicatedPG::hit_set_persist: update ssc->snapset as well
 f1f6f0b ReplicatedPG::hit_set_persist: write out oi.digest, these objects are immutable
-4dda030 (origin/wip-noreuse) librados: add FADVISE_NOREUSE
+4dda030 librados: add FADVISE_NOREUSE
 9edd0a5 ceph.spec.in: move rgw logrotate to rgw subpackage
 050a59e osd/OSDMap: remove unused variables
-55dfe03 (origin/wip-4092) librbd: schedule header refresh after watch error
+55dfe03 librbd: schedule header refresh after watch error
 33f0afd doc: Some files still use http://ceph.newdream.net
 2f9b3bd ceph-debugpack: fix bashism of {1..10}
 5652a1d cls_rgw; fix json decoding and encoding of certain type
+dfccd3d bufferptr: Make set_offset() don't beyond the length.
+ed70988 bufferptr: add judgement in set_length() to make don't beyond the raw_length().
 f9d82e6 test: Add --write-fadvise-dontned option to ceph_test_rados
 075c604 librados: Fix error comment.
 f710984 mon: Add "ceph osd pool set/get write_fadvise_dontnned" command
@@ -1137,7 +4524,7 @@ cd7ed04 osd/: s/backfill_read/recovery_read for obc locking
 6a025ef PGLog: trim mod_desc after appending to the log
 a6eadae rbd image_read.sh: disable exclusive locking
 f51ff28 vstart.sh: pull default CEPH_PORT from .ceph_port
-35fcb04 (origin/wip-crush-straw2) Change crush_ln to provide 32 more digits.
+35fcb04 Change crush_ln to provide 32 more digits.
 6289912 crush: improve straw2 adjustment slightly
 32a1ead Add crush_ln to calculate nature log efficently
 0eca13d crush: fix dump for hammer tunables
@@ -1147,37 +4534,42 @@ f51ff28 vstart.sh: pull default CEPH_PORT from .ceph_port
 6e084f6 unittest_crush: rename straw tests
 d4ec757 crush: move default bucket choice into CrushWrapper helper
 14eb1a7 crush/builder: fix warnings
-07eadc4 (origin/wip-10474) FileJournal: Fix hang in FileJournal::close()
+07eadc4 FileJournal: Fix hang in FileJournal::close()
 2865e13 doc: Change Availability text in all of the man pages
 b04f698 Doc: Fix the extra blank space in doc/start/quick-rbd.rst
-9930028 (origin/wip-mon-pgtemp) osd: OSDMap: remove pg temps for inexistent pools
+9930028 osd: OSDMap: remove pg temps for inexistent pools
 34f5c17   use shardid as a key of the shardinfo when "dump_op_pq_state".   Signed-off-by: huangjun <hjwsm1989 at gmail.com>
 804deec Fix memstore free space caculation
 6239151 RocksDBStore: filterpolicy is not deleted when RocksDBStore destructs
-2e8bb1e (origin/wip-10614) test/librbd/fsx.c: disable RBD exclusive locking for krbd
+2e8bb1e test/librbd/fsx.c: disable RBD exclusive locking for krbd
 9ad9ba8 doc: Fix a typo in radosgw-admin doc
 008698b doc: Change Availability text in all of the man pages
 6f44f7a Revert "Revert "Merge remote-tracking branch 'origin/wip-bi-sharding-3' into next""
-90a90bb (origin/wip-rgw-versioning-3) rgw: set default value for swift versioning extension
+90a90bb rgw: set default value for swift versioning extension
 dc11ef1 PGBackend: fix and clarify be_select_auth_object
 26656e3 rgw: fix bucket removal with data purge
 b18b14b ObjectStore::_update_op: treat CLONERANGE2 like CLONE
 4d3b49e rbd: ensure aio_write buffer isn't invalidated during image import
 500f4b4 rgw: assign versioned_epoch on bucket listing response
 8cbfac4 Wrong HTTP header name
-a0af5de (origin/wip-10579) qa: move fs quota to its own dir
+a0af5de qa: move fs quota to its own dir
 2ce38db osd: fix some compile warning
 b9be97f Doc: Fix the typo in doc/rbd/rados-rbd-cmds.rst
+97a6969 mon: do not prime pg_temp when the current acting is < min_size
+4fcbd14 mon: be more careful about when we prime all pgs
+f6fa254 mon: cap the amount of time we spend priming pg_temp
+7a1305b mon: prime pg_temp
+7a04762 mon/PGMap: keep osd -> pg mapping in memory
 fe93f73 test: fix rbd cli tests for new feature bit
 946958c qa: disable automatic locking for manual locking test
 c68ecc7 xio: Enable xio option to call fork init
 b1b299d osd: revert d427ca35404a30e1f428859c3274e030f2f83ef6
 46347fc osd: Transction::append SHOULD NOT modify other.op_bl
 988d007 ReplicatedPG::on_change: clear cache at the end
-8453f71 (origin/wip-10555) mon: Do not allow empty pool names when creating
+8453f71 mon: Do not allow empty pool names when creating
 68af510 ceph-disk: do not reuse partition if encryption required
 48963ba test_msgr: Rename unittest_msgr to ceph_test_msgr
-b311e7c (origin/wip-10257) mon: PGMonitor: skip zeroed osd stats on get_rule_avail()
+b311e7c mon: PGMonitor: skip zeroed osd stats on get_rule_avail()
 8be6a6a mon: PGMonitor: available size 0 if no osds on pool's ruleset
 e84f27e doc: Replace placeholder with 'gateway'
 190c185 librbd: copy-on-read   Addressed Jason's review comments.
@@ -1189,7 +4581,7 @@ f408c8e librbd: copy-on-read for clones, write entire object into child asychron
 08faee0 doc: mon should be listed before osd
 6b1c92c Doc : Documentation#10205
 b8dd228 doc bug 10206
-9147c62 (origin/wip-10576) ceph_test_rados_api_misc: do not assert rbd feature match
+9147c62 ceph_test_rados_api_misc: do not assert rbd feature match
 6d209c2 rgw: cors set attrs on bucket using appropriate call
 2c36798 rgw: switch index ops to new apis
 11befab rgw: break if done when timing out pending olh ops
@@ -1301,7 +4693,7 @@ d53275c ceph_test_objectstore: fix keyvaluestore name
 738f868 PGBackend: do not update oi digest on inconsistent object
 8ab3c41 osd: new pool safeguard flags: nodelete, nopgchange, nosizechange
 6613358 Revert "Merge remote-tracking branch 'origin/wip-bi-sharding-3' into next"
-66f3bd6 (origin/wip-recover-dentries) tools: output per-event errors from recover dentries
+66f3bd6 tools: output per-event errors from recover dentries
 3e15fd5 tools: handle hardlinks in recover_dentries
 9d91757 tools: recover_dentries efficiency
 e2ae91f tweak comment wording in recover_dentries
@@ -1310,7 +4702,7 @@ e2ae91f tweak comment wording in recover_dentries
 d94a7a8 tools: remove duplicated InoTable encoding
 478659f JournalTool: handle corrupt fnodes
 36830eb tools/cephfs: add recover_dentries to journaltool
-46326ff (origin/wip-10388) mds: perfcounters for RecoveryQueue
+46326ff mds: perfcounters for RecoveryQueue
 62b4e43 mds/MDCache: stray/purge perfcounters
 4bd0894 Doc: rbd-snapshot: Fix the typo
 07e1d18 rbd: fix bug about rbd watch command
@@ -1318,7 +4710,7 @@ bbaf582 mon: handle case where mon_globalid_prealloc > max_global_id
 d29aef5 doc: Fix OpenStack Glance configuration
 1d1215f mon: change mon_globalid_prealloc to 10000 (from 100)
 be1416a mon: silently ignore mark_down, mark_disposable on AnonConnection
-b5e1bd1 (origin/wip-osd-df) mon/OSDMonitor: pull totals into the table format
+b5e1bd1 mon/OSDMonitor: pull totals into the table format
 169d680 mon/OSDMonitor: don't -nan for variance when no osds
 9aa0006 common/TextTable: do padding during final step
 ea5166e mon/OSDMonitor: drop unnecessary stringify
@@ -1404,9 +4796,9 @@ b383b52 rgw: enable s3 get/set versioning ops
 0d97b40 rgw: get bucket versioning status op
 8ed79d6 rgw: add versioning_enabled field to bucket info
 50547dc mon: PGMonitor: fix division by zero on stats dump
-dbaa142 (origin/wip-bi-sharding-3) rgw: bilog marker related fixes
-c4548f6 (origin/wip-10439) pybind: ceph_argparse: validate incorrectly formed targets
-80a9d99 (origin/wip-mon-fixes) mon: Monitor: return 'required_features' on get_required_features()
+dbaa142 rgw: bilog marker related fixes
+c4548f6 pybind: ceph_argparse: validate incorrectly formed targets
+80a9d99 mon: Monitor: return 'required_features' on get_required_features()
 ab996c1 mon: Elector: output features in handle_propose()
 bb48ebe mon: Elector: put dangling message reference
 0a369b9 mon: mkfs compatset may be different from runtime compatset
@@ -1421,12 +4813,12 @@ bb582d9 librados: clean up code.
 7cfdba2 librbd: clean up code.
 7372ac7 crush: move two crush tests over
 ddb91b2 crush: rename unit tests
-7e1553c (origin/wip-10553) rgw: fix partial GET in swift
-e7d5fdd (origin/wip-10552) client: fix getting zero-length xattr
+7e1553c rgw: fix partial GET in swift
+e7d5fdd client: fix getting zero-length xattr
 97c1881 osd/PG: do not republish unchanged pg stats
 d2e1049 osd/PG: avoid most of publish_stats_to_osd for non-primary
 65d3342 osd/osd_types: add operator== for pg stat types
-2f31606 (origin/wip-cephfs-tabletool) tools: create cephfs-table-tool
+2f31606 tools: create cephfs-table-tool
 b8ffe5c mds: give MDSTables a `rank` attribute
 ea153c3 mds: abstract SessionMapStore from SessionMap
 d8d608d erasure-code: update links to jerasure upstream
@@ -1437,7 +4829,7 @@ dac666f erasure-code: tests use different pool/profile names
 4d07a32 crush: update tries statistics for indep rules
 6c34528 erasure-code: update jerasure/gf-complete submodules
 fc01b01 doc: add cases where ceph-extras is not needed
-2dc29de (origin/wip-9780-9781) ceph-objectstore-tool, test: Fix objectstore name changed to keyvaluestore
+2dc29de ceph-objectstore-tool, test: Fix objectstore name changed to keyvaluestore
 0c5b66d osd: Get pgid ancestor from last_map when building past intervals
 7fb721c osd: Pass oldest_map when generating past intervals
 b9a0484 osd: build_push_op() handle short reads so recipient doesn't crash
@@ -1490,16 +4882,16 @@ f4fcff1 AsyncMessenger: Bind async thread to special cpu core
 d91fe8b test_shared_cache::get_next: fix get_next call on uninitialized value
 37dc423 shared_cache: reduce duplication between get_next's
 9db5969 fix command 'ceph pg dump_stuck degraded'
-3408156 (origin/wip-mon-drop-conversion) mon: Monitor: drop StoreConverter code
+3408156 mon: Monitor: drop StoreConverter code
 1d814b7 ceph_mon: no longer attempt store conversion on start
-447d469 (origin/wip-9440) mon: Monitor: health to clog writes every X seconds on the second
+447d469 mon: Monitor: health to clog writes every X seconds on the second
 ae1032e mon: Monitor: cache 'summary' string to avoid dups on clog
 fcd7aa0 mon: Monitor: reset health status cache on _reset()
 81a2faf mon: Monitor: write health status to clog every X seconds
 e2d66ae mon: Monitor: 'get_health()' returns overall health status
 7ce770d mon: Monitor: health summary to clog on get_health()
-889969e (origin/wip-fail-idempotent) mon/MDSMonitor: make 'mds fail' idempotent for IDs
-b957fa8 (origin/wip-10547-formatter) tests: adapt to new json-pretty format
+889969e mon/MDSMonitor: make 'mds fail' idempotent for IDs
+b957fa8 tests: adapt to new json-pretty format
 97609a3 test: rename test_activate_osd
 8d8ce96 common: restore format fallback semantic
 e9aeaf8 mailmap: Loic Dachary name normalization
@@ -1508,12 +4900,12 @@ d80ded9 mailmap: David Zhang affiliation
 33ba23f common/shared_cache.hpp: empty() iff weak_refs is empty
 d532f3e remove unused hold_map_lock in _open_lock_pg
 9748655 man: add help for rbd merge-diff command
-6986ec1 (origin/wip-10477) osd/PG: populate blocked_by with peers we are trying to activate
+6986ec1 osd/PG: populate blocked_by with peers we are trying to activate
 5b0e8ae mailmap: Yehuda Sadeh name normalization
 3f03a7b doc/release-notes: v0.91
 4ca6931 doc/release-notes: typo
 e7cc611 qa: ignore duplicates in rados ls
-e5591f8 (origin/wip-10539) qa: fail_all_mds between fs reset and fs rm
+e5591f8 qa: fail_all_mds between fs reset and fs rm
 26a2df2 mailmap: Josh Durgin name normalization
 d6a9d25 doc/release-notes: v0.80.8
 45e9cd5 Fix make check blockers.
@@ -1548,12 +4940,12 @@ a39cbe2 atomic: add and sub return their result
 f57383a Don't use __cplusplus to mean !__KERNEL__
 71e4987 Add missing Messenger::create ms_type in test_msgr.
 3ce683a Fixup int_types.h.
-3424bae (origin/wip-librbd-coverity) librbd: fix coverity false-positives for tests
+3424bae librbd: fix coverity false-positives for tests
 f3a57ee rgw: wait for completion only if not completion available
 833b277 ceph_test_objectstore: enable keyvaluestore experimental option
 204fa0f ReplicatedPG::_scrub: don't record digests for snapdirs
 0172069 common/shared_cache.hpp: restructure clear()
-9daeaec (origin/wip-10382) mds: handle heartbeat_reset during shutdown
+9daeaec mds: handle heartbeat_reset during shutdown
 fc5cb3c osd/ReplicatedPG: remove unnecessary parameters
 78b2cf0 osd: force promotion for watch/notify ops
 c8bef13 osd/OpRequest: add osd op flag CEPH_OSD_RMW_FLAG_PROMOTE
@@ -1593,20 +4985,20 @@ f9b280e Adjust bi log listing to work with multiple bucket shards. Signed-off-by
 751fd07 Adjust rgw bucket prepare/complete OP to work with multiple bucket index shards.
 5d004d3 Implement sharding for bucket creation.
 90a3920 Add a new field to bucket info indicating the number of shards of this bucket and make it configurable.
-364b868 (origin/wip-mon-propose) mon/Paxos: consolidate finish_round()
+364b868 mon/Paxos: consolidate finish_round()
 67a90dd mon: accumulate a single pending transaction and propose it all at once
 d159586 PendingReleaseNotes: make a note about librados flag changes
-725d660 (tag: v0.91) 0.91
-9264d25 (origin/wip-formatter) common/Formatter: new_formatter -> Formatter::create
+725d660 0.91
+9264d25 common/Formatter: new_formatter -> Formatter::create
 617ad5d common/Formatter: improve json-pretty whitespace
 83c3b13 common/Formatter: add newline to flushed output if m_pretty
 e2a7b17 osd/PG: remove unnecessary publish_stats_to_osd() in all_activated_and_committted()
 77bc23c osd/PG: add 'activating' pg state between peering and active
-b578a53 (origin/wip-quota-test) qa: set -e explicitly in quota test
+b578a53 qa: set -e explicitly in quota test
 8d706cd osd/ReplicatedPG: init ignore_cache in second OpContext ctor
 792ac7c osd/ReplicatedPG: fix cancel_proxy_read_ops
 7c664fa Doc: Fix the indentation in doc/rbd/rbd-snapshot.rst
-1c01c3a (origin/wip-librbd-rados-stub) tests: create unittest_librbd
+1c01c3a tests: create unittest_librbd
 c105003 tests: add mock librados API for supporting unit tests
 1e9da43 include: moved RBD's generic FunctionContext wrapper
 1483a43 msg/simple: remove redundant calling of setsockopt
@@ -1614,7 +5006,7 @@ c105003 tests: add mock librados API for supporting unit tests
 3cbe5da mon: check size must larger than zero.
 3f03003 test/mon: Add test case for ceph osd pool set size/min_size.
 e3678f4 mon: check min_size range.
-7945f8d (origin/wip-8900) librbd: flush pending AIO requests under all existing flush scenarios
+7945f8d librbd: flush pending AIO requests under all existing flush scenarios
 2dd0f03 librbd: AIO requests should retry lock requests
 6176ec5 librbd: differentiate between R/O vs R/W RBD features
 544ed96 librbd: Add internal unit test cases
@@ -1624,10 +5016,10 @@ e3678f4 mon: check min_size range.
 ccadff1 librbd: Integrate librbd with new exclusive lock feature
 9ee80b3 librados: bump rados version number
 f5668d6 librbd: Create image exclusive lock watch/notify handler
-2ecd874 (origin/wip-9956) osd: enable filestore_extsize by default
+2ecd874 osd: enable filestore_extsize by default
 b76d0dc os/FileStore: verify kernel is new enough before using extsize ioctl
-f4ff12a (origin/wip-mon-converter) drop ceph_mon_store_converter
-a4152db (origin/wip-fs-reset) mon/MDSMonitor: add confirm flag to fs reset
+f4ff12a drop ceph_mon_store_converter
+a4152db mon/MDSMonitor: add confirm flag to fs reset
 8630696 qa: add `fs reset` to cephtool tests
 1713ffd mon: implement `fs reset`
 1f69476 Fix bug 10503: http://tracker.ceph.com/issues/10503 ceph-fuse: quota code is not 32-bit safe for vxattr output
@@ -1638,7 +5030,7 @@ d300220 rbd: test case for rbd merge-diff
 337f821 rbd: merge two diff files into one single file
 9c82efc rbd: parse diff file body for diff merging
 b4720ae rbd: parse diff file header for diff merging
-f42ef1e (origin/wip-promote) ceph_test_rados: add some debug output
+f42ef1e ceph_test_rados: add some debug output
 1554fbc osd/ReplicatedPG: improve proxy read cancelation
 1c8b679 osd/ReplicatedPG: put proxy read completion on finisher
 650f1d9 osd: tiering: avoid duplicate promotion on proxy read
@@ -1658,7 +5050,7 @@ a4a4149 osd/ReplicatedPG: drop unnecessary cache_mode checks
 1d89f18 osd/ReplicatedPG: make op argument to promote_object optional
 2bbab05 OSD: add a get_latest_osdmap command to the admin socket
 d5e8ef5 doc: Fix PHP librados documentation
-a16e72c (origin/wip-mon-wishlist) doc: mon janitorial list is now a wishlist
+a16e72c doc: mon janitorial list is now a wishlist
 19955fc doc: Replace cloudfiles with swiftclient in Python Swift example
 5461368 mon: paxos: queue next proposal after waking up callbacks
 d375532 rgw: return InvalidAccessKeyId instead of AccessDenied
@@ -1669,14 +5061,14 @@ c1080b2 doc: Clean up pool usage.
 e6a4ab1 client: include ceph and git version in client metadata
 f90c48f Revert "rgw: switch to new watch/notify API"
 f887817 doc: Added section to install priorities/preferences.
-bf8f062 (origin/wip-10311) rgw: only keep track for cleanup of rados objects that were written
+bf8f062 rgw: only keep track for cleanup of rados objects that were written
 55a5c5f tests: temporarily disable unittest_msgr
-26e7d5f (origin/wip-asok-get-subtrees) mds: add asok command for getting subtreemap
+26e7d5f mds: add asok command for getting subtreemap
 1c1897e mds: give CDir a dump() method for JSON output
-a58d104 (origin/wip-mdscacheobject-const) mds: support constness in MDSCacheObjects
-e9033bb (origin/wip-librbd-snap-unprotect) librbd: shadow variable in snap_unprotect and list_children
+a58d104 mds: support constness in MDSCacheObjects
+e9033bb librbd: shadow variable in snap_unprotect and list_children
 cf432fa doc: Add Librados PHP documentation
-d994e60 (origin/wip-table-formatter) common: Formatter: cosmetic re-indent
+d994e60 common: Formatter: cosmetic re-indent
 e797dcf common: Formatter: add TableFormatter class
 c528d87 erasure-code: test repair when file is removed
 9406b7f osd: accumulate authoritative peers during recovery
@@ -1685,11 +5077,11 @@ ecc3bca os: remove debug message leftover in FileJournal
 63c8dd0 msg: initialize AsyncConnection::port
 c60f88b Bump memstore_device_bytes from U32 to U64
 8811df3 FileStore: return error if get_index fails in lfn_open
-20be188 (origin/wip-peeringqueue) osd: assert there is a peering event
+20be188 osd: assert there is a peering event
 492ccc9 osd: requeue PG when we skip handling a peering event
 49d114f librados: Translate operation flags from C APIs
-8d52782 (origin/wip-10445) rgw: use gc for multipart abort
-6edfcc1 (origin/wip-nits) mds: allow 'ops' as shorthand for 'dump_ops_in_flight'
+8d52782 rgw: use gc for multipart abort
+6edfcc1 mds: allow 'ops' as shorthand for 'dump_ops_in_flight'
 1617fbb osd: allow 'ops' as shorthand for 'dump_ops_in_flight'
 3d2fbf7 tests: group clusters in a single directory
 bd7be04 doc: don't suggest mounting xfs with nobarrier
@@ -1697,11 +5089,11 @@ bd7be04 doc: don't suggest mounting xfs with nobarrier
 27cb78b mon, os: check the result of sync_filesystem.
 3646e1e encoding: wrap ENCODE_START/FINISH arguments
 fa96bb4 librbd: Stub out new exclusive image feature
-478629b (origin/wip-10471) rgw: index swift keys appropriately
-97cc409 (origin/wip-mdsmonitor-fixes) mon/MDSMonitor: fix `mds fail` for standby MDSs
+478629b rgw: index swift keys appropriately
+97cc409 mon/MDSMonitor: fix `mds fail` for standby MDSs
 c400ba1 mon/MDSMonitor: respect MDSMAP_DOWN when promoting standbys
-487c22a (origin/wip-10384-ceph-test-helper-races) init-ceph: stop returns before daemons are dead
-3d08a68 (origin/wip-da-SCA-20150102) msg/async/AsyncConnection.cc: reduce scope of variable
+487c22a init-ceph: stop returns before daemons are dead
+3d08a68 msg/async/AsyncConnection.cc: reduce scope of variable
 b99508b osd/ClassHandler.cc: move stat into error handling
 fd4ac46 crush/crush.c: prevent DIVIDE_BY_ZERO
 aacdaae src/common/obj_bencher: fix some UNINIT issues
@@ -1728,7 +5120,7 @@ f19c2e6 mds/Server.cc: fix compiler warning
 3ccd792 client/Client.cc: prefer ++operator for non-primitive iterators
 1a00fb7 test/librados/aio.cc: release completion before exit
 2f54209 test/librados/aio.cc: delete AioCompletion* before return
-485d139 (origin/wip-10041) client: fix mount timeout
+485d139 client: fix mount timeout
 397b261 common: Don't call ioctl(BTRFS_IOC_SYNC) in sync_filesystem.
 3f7faa4 common: Directly return the result of syncfs().
 1c38895 disable tcmalloc by default when enable jemalloc
@@ -1747,14 +5139,16 @@ dbae922 osd: Add Transaction::TransactionData for fast encode/decode
 de02134 tests: resolve ceph-helpers races
 bea2d4a qa: drop tiobench suite
 8618a53 cli: ceph: easier debugging (pdb) with 'CEPH_DBG=1'
-55f8fee (origin/wip-10412) client: fix use-after-free bug in unmount()
+226b61a common/PriorityQueue: add test cases for it
+aff4499 common/PrioritizedQueue: do not 'using namespace std'
+55f8fee client: fix use-after-free bug in unmount()
 93d39a8 cleanup: fix a compile warning
 ea3ed5b TestLFNIndex.cc: For root, dont do permission operations.
-488355c (origin/wip-fix-copyfrom) osd: fix object_copy_data_t compat encoding
-ed50377 (origin/wip-repop) osd/PG: print last_update_applied if != last_update
+488355c osd: fix object_copy_data_t compat encoding
+ed50377 osd/PG: print last_update_applied if != last_update
 78d2d31 Add MOSDRepOp and MOSDRepOpReply
 6ca2eb8 configure: show pkg names when libkeyutils is missing
-5cf84e6 (origin/wip-watch-leak) librados: fix leak of WatchContext on unwatch
+5cf84e6 librados: fix leak of WatchContext on unwatch
 b5c24e4 qa: add test_tell, which currently is used only for testing 'ceph tell mon version'.
 244777d test/bufferlist: For root, don't do permission operation for read_file case.
 3b4e021 qa: refactor 'ceph -w' magic in test_mon_misc by introducing helper functions ceph_watch_start and ceph_watch_wait so they can be reused in other tests.
@@ -1762,8 +5156,8 @@ b5c24e4 qa: add test_tell, which currently is used only for testing 'ceph tell m
 c7cc6af rbd: write a diff description into the merged diff file
 bca862c rbd: parse merge-diff arguments
 0b6ab82 rbd: add an option merge-diff
-bdd0e3c (origin/wip-10351) mount.ceph: avoid spurious error message
-6e67450 (origin/wip-10387) client: close dirfrag when trying to trim an inode
+bdd0e3c mount.ceph: avoid spurious error message
+6e67450 client: close dirfrag when trying to trim an inode
 274b989 doc: rm reference to old Ubuntu release
 0471be6 debian: create a repository from sources
 c8f56ab AsyncMessender: use IOV_MAX instead of IOV_LEN
@@ -1771,10 +5165,10 @@ d49b694 AsyncMessenger: s/sended/sent/
 c9da51d AsyncMessenger: should retry in case of EINTR
 169e8bd msg: fix factory error message
 ec1f58a unittest_msgr: enable experiemntal async msgr
-1265603 (origin/wip-10441-b) osd: force read bit for watch
+1265603 osd: force read bit for watch
 4642441 osd: drop unused watch_info_t w
 cef7a82 KeyValueStore:: do_transactions: clean up code which is never used clean up ops and bytes which is never used in do_transactions()
-dfd6a38 (origin/wip-mon-janitorial-list) src/doc: add monitor janitorial list
+dfd6a38 src/doc: add monitor janitorial list
 1dba143 Make pg dump {,sum,pgs,pgs_brief} work for format=plain
 7d75f0c Makefile: include radosgw-admin in base
 38350a0 client: fix quota signed/unsigned warning
@@ -1782,7 +5176,7 @@ b7b4534 common: log which experimental features are enabled.
 55405db ms: mark async messenger experimental
 44ce7cc os: rename keyvaluestore-dev -> keyvaluestore; mark experimental
 8a55972 common: add 'enable experimental data corrupting features'
-783956c (origin/wip-10341) tools: ceph-monstore-tool: validate start/stop vals
+783956c tools: ceph-monstore-tool: validate start/stop vals
 60e9c3f tools: ceph-monstore-tool: output number of paxos versions dumped
 b27ca4c tools: ceph-monstore-tool: raw map to stdout if outfile not specified
 68f3b77 tools: ceph-monstore-tool: use subcmd parser helper
@@ -1804,8 +5198,9 @@ fe44cd3 doc: Updates information for SSL configuration in Ceph Object Gateway in
 45fb9a3 FileJournal: Don't cache journal data if journal w/o directio mode
 e4e1777 AsyncMessenger: fix the leak of file_events
 70ef30f osd: osd tree to show primary-affinity value
-aa56ee4 (origin/wip-10422) mon: provide encoded canonical full OSDMap from primary
-d7fd6fc (origin/wip-10372) osdc/Objecter: improve pool deletion detection
+45094ff WBThrottle: Don't one by one handle statistics data  in clear().
+aa56ee4 mon: provide encoded canonical full OSDMap from primary
+d7fd6fc osdc/Objecter: improve pool deletion detection
 a540ac3 librados: only call watch_flush if necessary
 6b030aa mds: add default ctor for quota_info_t
 ee6529b AsyncMessenger: Fix leak memory
@@ -1818,19 +5213,21 @@ b41a739 10132: osd: tries to set ioprio when the config option is blank
 6b01b5e AsyncConnection: Ensure reply connect_seq larger than sent
 926a1b7 librados: only call watch_flush if necessary
 73257e8 os: WBThrottle: optimize map to unordered_map Using unordered_map to save the cpu cost and acceralate map::find() operation.
-d3fb563 (origin/wip-10415) libcephfs/test.cc: close fd before umount
+573d2cc WBThrottle:Draw a common func beyond_limit which whether beyond limit.
+6b054fc WBThrottle: call signal only beyond limit when queue entry.
+d3fb563 libcephfs/test.cc: close fd before umount
 b95c73e librados: warn about rados_watch_flush() prior to ioctx shutdown
 93825bf librados: watch_flush() on shutdown
 7de1b4d librados: add rados_watch_flush() call
 5cf4483 osdc/Objecter: do notify completion callback in fast-dispatch context
 1fbe9b6 librados: warn about rados_watch_flush() prior to ioctx shutdown
-4ce6d25 (origin/wip-stop.sh-me) stop.sh: killall -u takes username, not uid
-dab5391 (origin/wip-10414) client: fix uninitialized member
+4ce6d25 stop.sh: killall -u takes username, not uid
+dab5391 client: fix uninitialized member
 3f3f2fa osd: be_compare_scrubmaps uses incorrect j iterator
-d87918a (origin/wip-10409) osd: scrub: only assume shard digest == oi digest for replicated pools
+d87918a osd: scrub: only assume shard digest == oi digest for replicated pools
 a25429c osd: clean up use of hex for digests
 9c96fbb osd/ECBackend: use correct seed for (empty) omap digest
-7f9c03d (origin/wip-pg-stat) mon/PGMap: restructure 'pg stat' formatted output
+7f9c03d mon/PGMap: restructure 'pg stat' formatted output
 e89bafb remove unmatched op code comparasion
 4bb6e29 tests: use port 7111 for osd-copy-from.sh
 6f8aad0 tests: recovery of a lost object in erasure coded pools
@@ -1851,11 +5248,11 @@ f080595 tests: avoid bash == --shell confusion in docker-test-helper.sh
 d8f84d1 tests: docker images must use install-deps.sh
 80f20f8 tests: run-make-check.sh install jq
 2de2c4b install-deps.sh: do not require sudo when root
-e99da68 (origin/wip-mon-health) mon: make 'ceph -s' show pg state counts in reverse descending order
-9c8827a (origin/wip-osd-ctor) osd/ReplicatedPG: initialize new_backfill in ctor
-30678f6 (origin/wip-watch-notify-2) librados: watch_flush() on shutdown
+e99da68 mon: make 'ceph -s' show pg state counts in reverse descending order
+9c8827a osd/ReplicatedPG: initialize new_backfill in ctor
+30678f6 librados: watch_flush() on shutdown
 4ebd4b4 librados: add rados_watch_flush() call
-218de82 (origin/wip-9059-checksums) osd: scrub: wait for digest updates to apply before next scrub chunk
+218de82 osd: scrub: wait for digest updates to apply before next scrub chunk
 1646d17 osd: change omap data encoding in object_copy_data_t
 925f572 test/osd/osd-copy-from: simple test of copy-from and error injection
 6d80078 rados: use copy_from for rados cp (and cppool) command
@@ -1878,30 +5275,30 @@ e68d771 osd: drop vestigal invalid_snapcolls fields from scrub
 056de09 osd/ReplicatedPG: set and invalidate data/omap digests on osd ops
 9d5d491 doc: Adds updated ceph-deploy man page under man/
 8c38cc6 doc: Fixes a typo in ceph-deploy man page.
-46a1a4c (origin/wip-10319-wusui) If trusty, use older version of qemu
-7f1e510 (origin/wip-10400-quota-info-t) mds: add default ctor for quota_info_t
+46a1a4c If trusty, use older version of qemu
+7f1e510 mds: add default ctor for quota_info_t
 18d6b20 doc/release-notes: v0.90
 9b78daf osdc/Objecter: do notify completion callback in fast-dispatch context
 ecbdbb1 tests: temporarily disable unittest_msgr
-08bd1e1 (tag: v0.90) 0.90
+08bd1e1 0.90
 49c2322 doc: Instead of using admin socket, use 'ceph daemon' command.
 a302c44 ceph-disk: Fix wrong string formatting
 2f63e54 cleanup : remove sync_epoch
-19dafe1 (origin/wip-10255-wusui) Remove sepia dependency (use fqdn)
-1eb0cd5 (origin/wip-osdmap) osd: only verfy OSDMap crc if it is known
+19dafe1 Remove sepia dependency (use fqdn)
+1eb0cd5 osd: only verfy OSDMap crc if it is known
 1b7585b stop.sh: only try killing processes that belong to me
-0af2a1c (origin/wip-10326) qa/workunits/rest/test.py: fix pg stat test
+0af2a1c qa/workunits/rest/test.py: fix pg stat test
 2a1bd76 .gitmodules: update ceph-object-corpus auth repo to github
 623ebf0 osd: clear ReplicatedPG::object_contexts when PG start a new interval
 1f9c087 AsyncConnection: Fix time event is called after AsyncMessenger destruction
 20ea086 PipeConnection: Avoid deadlock when calling is_connected
 9783a5c test/msgr/test_msgr: Fix potential unsafe cond wakeup and wrap check
-bba4d35 (origin/wip-librados-init) librados: init last_objver
-2cd9dc0 (origin/wip-caps-init) messages/MClientCaps: init peer.flags
-679652a (origin/wip-osdmap-leak) osd: fix leaked OSDMap
+bba4d35 librados: init last_objver
+2cd9dc0 messages/MClientCaps: init peer.flags
+679652a osd: fix leaked OSDMap
 18f545b librados: Avoid copy data from librados to caller buff when using rados_read_op_read.
 001ea29 Messenger: Create an Messenger implementation by name.
-3a2cb71 (origin/wip-fix-asok) mds: fix asok on rank 0
+3a2cb71 mds: fix asok on rank 0
 8de9a0f doc: Adds updated man page for ceph under man/
 8b79617 doc: Changes format style in ceph to improve readability as html.
 7093cb3 doc: Adds updated man page for ceph-disk under man/
@@ -1915,17 +5312,17 @@ d61b1d9 shared_cache: add lookup_or_create, get_next, etc. and their unittests
 2b577ce packaging: package ceph-deploy(8)
 c664818 doc: Adds updated man page for ceph-deploy under man/
 e638469 doc: Updates man page for ceph-deploy.
-a806778 (origin/wip-qa-empty-xattr) qa: test zero size xattr
+a806778 qa: test zero size xattr
 42dc937 librados: avoid memcopy for rados_getxattr.
-4f72ba5 (origin/wip-cot-rename) ceph_objectstore_tool: Rename generated binary to ceph-objectstore-tool
-1b2b344 (origin/wip-10335) MDS: do not allow invocation of most commands on an inactive MDS
+4f72ba5 ceph_objectstore_tool: Rename generated binary to ceph-objectstore-tool
+1b2b344 MDS: do not allow invocation of most commands on an inactive MDS
 69fa532 ceph.spec.in: quote %files macro in comment
 aea232c client, librados, osdc: do not shadow Dispatcher::cct
 378ebb7 python-rados: refactor class Rados a little bit
 59b70fe Cleanup: Drop hobject_incorrect_pool
 0c9d55d pybind/cephfs: Fix setxattr function. Pass value as character pointer
-023a8ac (origin/wip-test-helpers) tests: ignore xmlstarlet extra empty lines
-e582135 (origin/wip-pgmeta) os/FileStore: better debug on omap_setkeys
+023a8ac tests: ignore xmlstarlet extra empty lines
+e582135 os/FileStore: better debug on omap_setkeys
 f6ca2bc os/FileStore: better debug on unlink
 43f0bcb os/MemStore: drop support for collection attrs
 17e4e19 os/ObjectStore: deprecated collection_getattr et al
@@ -1961,7 +5358,7 @@ c765de6 common/hobject_t: fix whitespace
 69e169d os/DBObjectMap: new version v2; drop support for upgrading from v1
 462bad3 ceph_test_rados: generate mix of small and large xattrs
 456255b os/DBObjectMap: include hash in header key for EC objects
-9f53eeb (origin/wip-7317) doc: add cephfs ENOSPC and eviction information
+9f53eeb doc: add cephfs ENOSPC and eviction information
 a8babcb client: add 'status' asok
 6fdf890 client: propagate flush errors to fclose/fsync
 a8611ac client: handle ENOSPC in _flush
@@ -1972,7 +5369,7 @@ fcc64f3 mds: set epoch barrier on transition to active
 68ba7f5 messages: add osd_epoch_barrier to cap msgs
 fc7d62a mds: return ENOSPC on write ops while osds full
 09287fd osdc: add public Objecter::osdmap_full_flag
-56af795 (origin/wip-10329) rgw: return InvalidAccessKeyId instead of AccessDenied
+56af795 rgw: return InvalidAccessKeyId instead of AccessDenied
 ef75d72 rgw: return SignatureDoesNotMatch instead of AccessDenied
 bab3d3d osdc: remove spurious _maybe_request_map return value
 cf3101a osdc: implement Objecter::set_epoch_barrier
@@ -1982,17 +5379,17 @@ fe32d6e osdc/Objecter: add op_cancel_writes
 a991212 osdc/ObjectCacher: invoke flush_set_callback on purge_set
 bb80437 erasure-code: relax cauchy w restrictions
 058f433 man: Deprecate preferred OSD setting for cephfs
-37a9969 (origin/wip-10299) librbd: complete all pending aio ops prior to closing image
-02fae9f (origin/wip-10262) osd: handle no-op write with snapshot case
+37a9969 librbd: complete all pending aio ops prior to closing image
+02fae9f osd: handle no-op write with snapshot case
 db951ae cls/refcount: ENOENT when put on non-existent object
-9ff9144 (origin/wip-librados-fadvise) librados: do not choke on asphyxiate doc generation
-394fd4c (origin/wip-9405) librbd: fixed garbage output from test LibRBD.TestIOPP
-1853461 (origin/wip-10220) mon: Paxos: reset accept timeout before submiting work to the store
+9ff9144 librados: do not choke on asphyxiate doc generation
+394fd4c librbd: fixed garbage output from test LibRBD.TestIOPP
+1853461 mon: Paxos: reset accept timeout before submiting work to the store
 b5381c2 ReplicatedPG: Make pull and push op use sparse read
 31adfeb doc: fix spelling in corpus page
 12808b9 osdc/Objecter: respect honor_osdmap_full setting
 7e84034 os: free space tracking for MemStore
-e8d3399 (origin/wip-10201-osd-scrub-repair) tests: convert osd-scrub-repair to ceph_objectstore_tool
+e8d3399 tests: convert osd-scrub-repair to ceph_objectstore_tool
 aa54640 tests: shell functions toolbox dedicated to testing
 f0961ae mailmap: Blaine Gardner affiliation
 b0e88a0 mailmap: Lei Dong affiliation
@@ -2009,11 +5406,11 @@ e90818f mailmap: Dan Mick name normalization
 cc05518 mailmap: Adam Spiers affiliation
 41707ee mailmap: Nilamdyuti Goswami affiliation
 3886734 ceph_test_rados_api_io: fix new test
-e3ba3d2 (origin/wip-watch-notify) ceph_test_rados_api_watch_notify: use 5 min timeout for notifies
+e3ba3d2 ceph_test_rados_api_watch_notify: use 5 min timeout for notifies
 a2572c3 ceph_test_stress_watch: do not unwatch if watch failed
-35f084d (origin/wip-test-huge-tickets) qa: add script to test how libceph handles huge auth tickets
-e7d434b (origin/wip-warning) tests: fix signedness compilation warnings
-847e5e1 (origin/wip-9555) osd: check that source OSD is valid for MOSDRepScrub
+35f084d qa: add script to test how libceph handles huge auth tickets
+e7d434b tests: fix signedness compilation warnings
+847e5e1 osd: check that source OSD is valid for MOSDRepScrub
 707a111 osd: pass Message* to most require_* helpers
 09c1648 osd: drop redundant check in handle_replica_op
 b6401c1 ceph-disk: LOG.info instead of print
@@ -2025,10 +5422,10 @@ b9ddf97 documentation: simplify running make check
 85ab278 tests: install hdparm in containers
 60bca67 tests: update centos docker file to epel 7.5
 e038b126 tests: docker tests only need a workdir, not a clone
-53929ba (origin/wip-10270) librbd: gracefully handle deleted/renamed pools
+53929ba librbd: gracefully handle deleted/renamed pools
 f79b7fe librados: Added new API methods to create an ioctx by pool id
-5cf193c (origin/wip-10307) rgw: use s->bucket_attrs instead of trying to read obj attrs
-f773c74 (origin/wip-swift-storage-policy) rgw: cannot modify swift bucket policy
+5cf193c rgw: use s->bucket_attrs instead of trying to read obj attrs
+f773c74 rgw: cannot modify swift bucket policy
 07dc42a rgw: dump X-Storage-Policy header on bucket HEAD
 f16da1c rgw: compare placement target on bucket recreation
 c55d7da librados: remove IoCtxImpl lock member
@@ -2037,7 +5434,7 @@ c598e63 librados: stop using IoCtxImpl lock for watch/notify functions
 50e9e39 librados: do not overload C++ API methods with new signatures
 ffd6c7e doc: Adds man page for ceph under man/.
 76da87a doc: Adds man page for ceph.
-7409ab3 (origin/wip-9254) rgw: http headers need to end with \r\n
+7409ab3 rgw: http headers need to end with \r\n
 0801361 mon/OSDMonitor: dump inc and full maps we get a crc mismatch
 f31135c mon/OSDMonitor: do not reencode OSDMap on 'osd getmap'
 b1f1381 ceph-dencoder: mark OSDMap as featureful, but with stray data
@@ -2045,7 +5442,7 @@ b1f1381 ceph-dencoder: mark OSDMap as featureful, but with stray data
 32a837d osd: add osd_inject_bad_map_crc_probability option
 d898698 osd: verify our OSDMap encoding based on crc
 716f90f mon/OSDMonitor: verify full_crc when available
-5fc7a0b (origin/wip-10271) rgw: change multipart upload id magic
+5fc7a0b rgw: change multipart upload id magic
 7925b82 rgw: Conditional PUT on ETag
 023927b ReplicatedPG: For MAPEXT, using bl.length() instead of op.extent.length as num_rd_kb.
 c1dc361 ReplicatedPG: Don't use the return value of getattr_maybe_cache as attr length.
@@ -2072,7 +5469,7 @@ c5ed33d librados/ObjectOperation: add new function set_op_flags2
 b35cb48 librados: Add read/write fadvise op flags CEPH_OSD_OP_FLAG_FADVISE_*.
 5b54410 librados: Add read/write fadvise op flags in ObjectOperationFlags.
 f5abed7 librados.h: Add read/write op fadvise flags  LIBRADOS_OP_FLAG_FADVISE_*.
-e8e27a8 (origin/wip-10296) unittest_blkdev: test an abbreviated /sys/block dir
+e8e27a8 unittest_blkdev: test an abbreviated /sys/block dir
 5e454a8 common/blkdev: add simple sandboxing function for testing
 9b26de3 ReplicatedPG: fail a non-blocking flush if the object is being scrubbed
 dce6f28 ReplicatedPG::scan_range: an object can disappear between the list and the attr get
@@ -2080,16 +5477,16 @@ dce6f28 ReplicatedPG::scan_range: an object can disappear between the list and t
 2246dca common/blkdev: fix block device discard check
 25e3783 common/blkdev: get_block_device_base
 beaa04e mon: MonitorDBStore: allow randomly injecting random delays on writes
-2ad55cd (origin/wip-10296-journal-discard) tests: do not ignore ceph-disk.sh::test_activate_dev failure
+2ad55cd tests: do not ignore ceph-disk.sh::test_activate_dev failure
 9c5fd8c os: do not attempt to detect discard by default
 21e07eb rgw: url decode http query params correctly
 279c199 ceph_test_rados_api_watch_notify: fix a zillion warnings
 531e8bb ceph_test_rados_api_watch_notify: use GT macro
 008d788 ceph_test_rados_api_watch_notify: print err to debug
 c1dd92b osdc/Objecter: normalize watch error (ENOENT on delete)
-5559e6a (origin/wip-10288) mon: fix `fs ls` on peons
-b025f56 (origin/wip-typo) mds: fix log typo
-a8f85dc (origin/wip-9323) mon: allow full flag to be manually cleared
+5559e6a mon: fix `fs ls` on peons
+b025f56 mds: fix log typo
+a8f85dc mon: allow full flag to be manually cleared
 2e5ee30 Finisher: call signal if necessary in function queue
 d92e2ca lttng: add int type definitions
 060cbaa cycles: add reader for i386 and aarch64
@@ -2097,7 +5494,7 @@ d92e2ca lttng: add int type definitions
 6a45d75 Add diagnostic unshareable buffer.
 28725eb Restore zero-copy buffers in OSD fast path.
 5f551cc Add safe-sharing to buffer::list and buffer::ptr.
-b038e8f (origin/wip-8797) Call Rados.shutdown() explicitly before exit
+b038e8f Call Rados.shutdown() explicitly before exit
 5ba9b8f rados.py: remove Rados.__del__(); it just causes problems
 8b195ec check endpoints is not empty before use it.
 320c256 osdc/Objecter: only issue one error per watch; do it through one path
@@ -2106,33 +5503,33 @@ b038e8f (origin/wip-8797) Call Rados.shutdown() explicitly before exit
 6f43c6c osdc/Objecter: pass correct cookie value to error
 b34e545 os/FileStore.cc: insert not empty list<Context*> to op_finisher/ondisk_finisher.
 7ab4a39 ceph.conf: update sample
-efd9d8d (origin/wip-librbd-test-cleanup) tests: Minor cleanup to librbd test
+efd9d8d tests: Minor cleanup to librbd test
 78a15ee Fix libstriprados::remove, use strtoll insdead of strtol
-2d4dca7 (origin/wip-10029) SimpleMessenger: Retry binding on addresses if binding fails
+2d4dca7 SimpleMessenger: Retry binding on addresses if binding fails
 e8063a1 test: modify cephfs quota test case
 31a0cdc mds: fix parse_quota_vxattr for invalid data
 bab7122 OSD: FileJournal: call writeq_cond.Signal if necessary in submit_entry
 17d6390 os: IndexManager:: optimaze map<coll_t, CollectionIndex* > col_indices col_indices is just used to cache CollectionIndex, and use col_indices.find() to get its corresponding index. Without any using of up_bound() and low_bound(), we can use unordered_map to make it fast. Based on perf, when I chanage map to unordered_map, the cpu cost for get_index(coll_t c, const string& baseDir, Index *index) is much lower.
 4036b91 os: FileJournal:: fix, uninitialization of FileJournal throttle Since after firefly, take() in class throttle add if(0 == max.read()) return. If throttle is not initialized with max throttle value, it actually does not work. So initialize it in FileJournal
-6b51a9f (origin/wip-10164) mds: set dirfrag version when fetching dirfrag is skipped
-17c72f5 (origin/wip-10010) ceph-osd: remove extra close of stderr
+6b51a9f mds: set dirfrag version when fetching dirfrag is skipped
+17c72f5 ceph-osd: remove extra close of stderr
 5836899 Revert "client: support listxattr for quota attributes"
-89b2fee (origin/wip-crush-straw) mon: 'osd crush reweight-all'
+89b2fee mon: 'osd crush reweight-all'
 dd7b58f crush: set straw_calc_version=1 for default+optimal; do not touch for presets
 adf5c6d crush/builder: a note about the original crush_calc_straw()
 9000068 mon: add 'osd crush {get,set}-tunable <name> [value]' commands
-606b004 (origin/wip-gfcomplete-dirty) gf-complete: update submodule to newest upstream
-bf0d8d3 (origin/wip-cleanup-removal) osd: Remove dead code related to old pg removal mechanism
-0827bb7 (origin/wip-10277) client: use remount to trim kernel dcache
+606b004 gf-complete: update submodule to newest upstream
+bf0d8d3 osd: Remove dead code related to old pg removal mechanism
+0827bb7 client: use remount to trim kernel dcache
 dfcb1c9 client: cleanup client callback registration
 2f52202 Revert "client: invalidate kernel dentries one by one"
-9902383 (origin/wip-9998) crush/CrushWrapper: fix create_or_move_item when name exists but item does not
+9902383 crush/CrushWrapper: fix create_or_move_item when name exists but item does not
 8c87e95 crush/builder: prevent bucket weight underflow on item removal
 eeadd60 crush/CrushWrapper: fix _search_item_exists
 a198dee Modifying the docs to add the Get pool commands to match the CLI. Signed-off-by: Chris Holcombe <chris.holcombe at nebula.com>
 3a84602 Include common/likely.h in buffer.h
 e8b412c mailmap: Zhiqiang Wang name normalization
-c0ce4a5 (origin/xiaoxichen-cleanup_getheader) Cleanup:Use get_type()instead of get_header().type
+c0ce4a5 Cleanup:Use get_type()instead of get_header().type
 c2d9333 WBThrottle: make bytes/ios/inode_wb's perf counter effective
 12d85c6 Fix stack buffer overflow reported by ASan.
 f6f6ea2 Fix alloc-dealloc mismatch reported by ASan (new[] vs. delete).
@@ -2163,7 +5560,7 @@ f4735cf .gitignore: add ceph_perf_objectstore
 bcee92e StoreTest: Add tests for clone_range op
 cabb57a FileStore: Fix _do_sparse_copy_range don't truncate zero sections
 980f094 common: bufferlist::get_contiguous return 0 when param len == 0
-be11a45 (origin/wip-fadvise) os/FileStore: stop disabling fadvise on XFS
+be11a45 os/FileStore: stop disabling fadvise on XFS
 d6a7a7c os/FileStore: merge filestore_replica_fadvise -> filestore_fadvise
 5eacd3c ceph_objectstore_tool: Strip _* (always _head) from pgid in list entry output
 b617ee2 ceph_objectstore_tool: BUG: --op list wasn't including snapshots
@@ -2179,11 +5576,11 @@ a90233c objectstore_tool: update usage strings
 58682d1 ceph-disk: dmcrypt file permissions
 2b59c5a common/Initialize.h: add header file to Makefile.am
 5da15ee mds: drop dentry leases in deleted directory
-f55a1f8 (origin/wip-mds-readonly) mds: disallow flush dentry/journal when MDS is readonly
+f55a1f8 mds: disallow flush dentry/journal when MDS is readonly
 f4f1880 mds: properly unregister asok commands
 818a807 mds: drop dirty dentries in deleted directory
 ff901b5 arch: add support for HW_CAP based neon runtime detection
-360d627 (origin/wip-10104) pybind/test_rados: add test for wait_for_complete_and_cb()
+360d627 pybind/test_rados: add test for wait_for_complete_and_cb()
 19212cd rados.py: fix misnamed 'wait_*' routines, add true wait/wait-with-cb
 a53dbab librados:: carry IoCtx ref from WatchInfo
 a989fec osd: allow deletion of objects with watchers
@@ -2269,19 +5666,19 @@ e1a3bc8 ceph_test_rados_api_watch_notify: test notify2 api
 81cdff2 osd: implement notify ack payloads
 7e0c4efc librados: define updated watch/notify interface
 12940c9 librados: drop useless ver arg to _notify_ack
-22c1d9b (origin/wip-rados-include) librados: Fix rados include problem reported by "sponce"
+22c1d9b librados: Fix rados include problem reported by "sponce"
 7b621f4 rgw: run radosgw as apache with systemd
 3f2f42d doc/release-notes: v0.89
 237b29c Changed os-recommendation table sorting to remove implied bias.
 a6102f0 documentation: add centos 7 to ceph-deploy installation
 7631b10 os: add filestore_fadvise to control whether use posix_fadvise.
 1fba28f WBThrottle: Call posix_fadvise to free page cache if nocache set in clear().
-e42df6a (origin/wip-mon-mdsdne) MDSMonitor.cc: fix assertion caused by MDSMap::STATE_DNE state
-d3ee89a (origin/wip-10229) osdc/Filer: use finisher to execute C_Probe and C_PurgeRange
+e42df6a MDSMonitor.cc: fix assertion caused by MDSMap::STATE_DNE state
+d3ee89a osdc/Filer: use finisher to execute C_Probe and C_PurgeRange
 43d5c7c crush: fix crush_calc_straw() scalers when there are duplicate weights
 85498bc crush: fix distortion of straw scalers by 0-weight items
-6edaf4e (origin/wip-10231-gperftools-location) heap_profiler: support new gperftools header locations
-d5e2c2c (origin/wip-unsigned) mon: fix signed/unsigned comparison warning
+6edaf4e heap_profiler: support new gperftools header locations
+d5e2c2c mon: fix signed/unsigned comparison warning
 7c12036 crush/builder: break out new version 1 of crush_calc_straw
 f35a3d8 crush: pass crush_map * to various builder methods
 9565621 crush: default to straw_calc_version 1
@@ -2291,23 +5688,23 @@ c133a83 crush/CrushWrapper: dump chooseleaf_vary_r with other tunables
 294b06c crushtool/CrushTester: output utilization even with 1 batch
 3506293 crush: recalculate straw scalers during a reweight
 7a99b48 osdmaptool: --test-map-pgs-dump
-68fdc0f (tag: v0.89) 0.89
-d98cec7 (origin/wip-cephtool-exec) qa: fix teardown in cephtool's test_mon_mds
-8cd1fdd (origin/wip-10080) SimpleMessenger: allow RESETSESSION whenever we forget an endpoint
+68fdc0f 0.89
+d98cec7 qa: fix teardown in cephtool's test_mon_mds
+8cd1fdd SimpleMessenger: allow RESETSESSION whenever we forget an endpoint
 8e5dbe7 common/perf_counters: Remove the duplicated code.
 963b764 test/perf_counters: Replace perfcounters_dump to perf dump.
 eafb224 test/perf_counters: Add test case for cmd 'perf reset'
 5f8186b common/perf_counters: Add 'perf reset all|perfcountername' for 'ceph --admin-daemon'
 5d3bff0 doc: rm Fedora minor versions in os recommendations
-0d4dd10 (origin/wip-snapmapper) os/ObjectStore: drop tolerate_collection_add_enoent
+0d4dd10 os/ObjectStore: drop tolerate_collection_add_enoent
 bc5a22b osd: require SNAPMAPPER feature from peers
 b743a95 packaging: package ceph-disk(8)
 2497f16 doc: rm old releases in os recommendations page
 0bb2e1e doc: clarify "B" flag in os recommendations page
-3047c9e (origin/wip-cephtool) qa: cephtool/test.sh use regular strings to list tests
+3047c9e qa: cephtool/test.sh use regular strings to list tests
 e3e5741 tests: vstart_wrapper.sh must call vstart.sh with a list of daemons
 946bebd crush: add dprintk's for crush_calc_straw
-4e955f4 (origin/wip-10211-erasure-code-buffer-alignement) erasure-code: enforce chunk size alignment
+4e955f4 erasure-code: enforce chunk size alignment
 73ad2d6 common: allow size alignment that is not a power of two
 9ade88e common: add bufferlist::rebuild_aligned_size_and_memory
 56dae09 doc: Reformatting on rbd replay doc.
@@ -2315,9 +5712,9 @@ b8f6b5f doc: Added rbd-replay-many and restructured index.
 54d5ed3 doc: Fixed index syntax.
 3012c4a doc: add CentOS 7 to recommended OSes
 6862891 doc: Adds man page for ceph disk in TOC.
-491da51 (origin/wip-9997-1) client: invalidate kernel dentries one by one
-2fa4884 (origin/wip-flush-journal-fix) mds: fix race of trimming log segments
-70e1a5d (origin/wip-doc-rbd-replay) doc: Document RBD Replay
+491da51 client: invalidate kernel dentries one by one
+2fa4884 mds: fix race of trimming log segments
+70e1a5d doc: Document RBD Replay
 131f092 mds: don't blindly create empty object when dirfrag is missing
 9b9e3ed mds: allow choosing action for wirte error
 dafef3c mds: add asok command to force MDS readonly
@@ -2330,9 +5727,9 @@ cfef515 mds: disallow slave requests when MDS is readonly
 4aed047 mds: keep locks in sync state when MDS is readonly
 2d4a746 mds: don't trim log when MDS is readonly
 4f6474f mds: disallow write operations when MDS is readonly
-01df222 (origin/wip-10209) osd: tolerate sessionless con in fast dispatch path
+01df222 osd: tolerate sessionless con in fast dispatch path
 0f1c9fd msg: do not const methods that return a mutable pointer
-0d6c803 (origin/wip-kill-category) osd/osd_types: drop category from object_info_t
+0d6c803 osd/osd_types: drop category from object_info_t
 5ecdce3 osdc/Objecter: drop category from copy-get
 d229548 osd/ReplicatedPG: drop cateogry support from CREATE
 4e1f4cf rados: drop optional 'create [category]' arg
@@ -2352,7 +5749,7 @@ e0190bd KnieticStore: add necessary header file
 49bedd4 ObjectStoreBenchmark: Used to perf ObjectStore::Transaction
 929fcd6 better systemd support
 3376fee Cleanup noop in message
-5b5493d (origin/wip-9881) mds: implement "flush journal" asok
+5b5493d mds: implement "flush journal" asok
 efeaaaa mds: in segment expiry, use end instead of start
 2ec6773 mds: fix MDLog I/O callback without finisher
 dba078a Cycles: A precious performance profile tool using rdstc
@@ -2370,7 +5767,7 @@ ea3d4d7 AsyncConnection: Ensure "mark_down" complete when returned
 bdedad2 AsyncMessenger: Async event threads shared by all AsyncMessenger
 c10d639 Add isa-erasure yasm dep for deps.rpm.txt
 6a98dac rpm: unconditionally package rbd-replay-prep(8)
-dbb5a48 (origin/wip-librados-symbols) librados: Only public API symbols from the shared library
+dbb5a48 librados: Only public API symbols from the shared library
 03a61d2 Minor typos and trailing spaces
 782a74c KineticStore: Fix compile error and add _test_init
 21798f9 doc: fix some typos in ObjectStore.h
@@ -2378,7 +5775,7 @@ dbb5a48 (origin/wip-librados-symbols) librados: Only public API symbols from the
 8a48847 doc: Updates the man page for ceph-disk utility with some changes.
 3b00c6f safe_io: do not set ending \0 in safe_read_file()
 e6410eb added some consts Signed-off-by: Michal Jarzabek <stiopa at gmail.com>
-17b5fc9 (origin/wip-10135) mon: OSDMonitor: allow adding tiers to FS pools
+17b5fc9 mon: OSDMonitor: allow adding tiers to FS pools
 9a118d5 doc: Adds man page for ceph-disk utility.
 242dd1c doc: Removes ceph-deploy usage instances from ceph-disk man page.
 cb820f8 erasure-code: test NEON arch selection logic
@@ -2389,19 +5786,19 @@ a788829 erasure-code: include new headers in distribution
 987d54b aarch64: add support for HW_CAP based neon runtime detection
 cec00c9 autotools: ARM/AArch64 NEON detection similar to x86
 ba520bf erasure-code: conditionally build jerasure sse3/sse4 plugins
-0b985d2 (origin/wip-10185-neon) arch: fix neon feature detection
+0b985d2 arch: fix neon feature detection
 24eb564 Remove pidfile and asok after stopping ceph
 7dcc850 doc: Updates man page for ceph-disk utility.
 e983230 remove unneeded include file
 a1ba385 doc: rgw document s3 bucket location features
-0c33930 (origin/wip-10151) mon: fix MDS health status from peons
+0c33930 mon: fix MDS health status from peons
 a450cab doc: Adds man page for ceph-disk utility.
 4b35ae0 rgw: check for timestamp for s3 keystone auth
-61cae54 (origin/wip-10173-autogen) autogen.sh: git submodule sync
+61cae54 autogen.sh: git submodule sync
 00da974 test: extend quota test case
 5515cc4 client: fix problem with move files between quota tree
 b2c15d6 mds: Fix a problem with rstat refresh.
-12f1f21 (origin/wip-mailmap) mailmap: Update Warren Usui's attribution to Red Hat
+12f1f21 mailmap: Update Warren Usui's attribution to Red Hat
 43c75ee mailmap: Update Tyler Brekke's attribution to Red Hat
 9df4624 mailmap: Update Tamil Muthamizhan's attribution to Red Hat
 fb74152 mailmap: Update Sandon Van Ness' attribution to Red Hat
@@ -2426,12 +5823,12 @@ eaa9889 mailmap: Update Dan Mick's attribution
 e424d07 client: Fix problem in get_quota_root with update parent and ancestor
 6c7bb8c blkdev: using strncpy instead of strcpy.
 1fe8b84 PGLog: include rollback_info_trimmed_to in (read|write)_log
-627f138 (origin/wip-sepia-update) Updated sepia hardware list.
+627f138 Updated sepia hardware list.
 7a868fd ReplicatedPG: remove unused parameter in function write_update_size_and_usage
-0d89db5 (origin/wip-9557) mds: store backtrace for straydir
+0d89db5 mds: store backtrace for straydir
 a79ba32 mds: verify backtrace when fetching dirfrag
 5177759 KeyValueStore: Fix parse_header_key
-9d84d2e (origin/wip-10018-primary-erasure-code-hinfo) osd: deep scrub must not abort if hinfo is missing
+9d84d2e osd: deep scrub must not abort if hinfo is missing
 92662a9 mailmap: Loic Dachary name normalization
 77c1a35 rgw: support swift storage policy api
 f692bfe PG: always clear_primary_state on new interval, but only clear pg temp if not primary
@@ -2444,13 +5841,13 @@ dcecfb8 MemStore: Return -ENODATA when collection_getattr hit nonexist attr
 877237f KeyValueStore: Return -ENODATA if collection_getattr is nonexist
 00b275b StoreTest: Add collection_getattr(s) tests
 ffb6f78 KeyValueStore: Remove assert for collection_getattr method
-f3dab44 (origin/wip-10077) ceph_objectstore_tool: Add feature called set-allow-sharded-objects
+f3dab44 ceph_objectstore_tool: Add feature called set-allow-sharded-objects
 b3021b0 ceph_objectstore_tool: Add utility routine get_osdmap()
 86baf2d ceph_objectstore_tool: Clear ...INCOMPAT_SHARDS from feature if exporting replicated pg
 d3d5852 FileJournal: add journal_discard to control ssd whether support discard
 9f9eb67 FileJournal: Add ssd discard for journal which using ssd disk as journal.
-10f6ef1 (origin/wip-10114-fix-warning) erasure-code isa-l: remove duplicated lines (fix warning)
-c912116 (origin/wip-9665-ceph-disk-partprobe) autotools: add --enable-docker
+10f6ef1 erasure-code isa-l: remove duplicated lines (fix warning)
+c912116 autotools: add --enable-docker
 26bf8d1 ceph-disk: test prepare / activate on a device
 ecccb39 tests: helper to run unit / function tests in docker
 23aaf6c ceph-disk: implement init=none for block devices
@@ -2458,14 +5855,14 @@ fed3b06 ceph-disk: run partprobe after zap
 23e71b1 ceph-disk: use update_partition in prepare_dev and main_prepare
 922a15e ceph-disk: encapsulate partprobe / partx calls
 0e6db90 doc: update debian compilation dependencies
-8613984 (origin/sponce-master) Fixed locking issue in the trun method of libradosstriper leading to potential race conditions - Fixes: #10129
-9e53c35 (origin/wip-10128) tests: ceph_objectstore_tool.py test all variants of export/import
+8613984 Fixed locking issue in the trun method of libradosstriper leading to potential race conditions - Fixes: #10129
+9e53c35 tests: ceph_objectstore_tool.py test all variants of export/import
 8c87f32 ceph_objectstore_tool: Make --file option consistent by treating "-" as stdout/stdin
 0d5262a ceph_objectstore_tool: When exporting to stdout, don't cout messages
-0d350b6 (origin/wip-10123) librbd: protect list_children from invalid child pool IoCtxs
+0d350b6 librbd: protect list_children from invalid child pool IoCtxs
 de547c9 Fix bug #10096 (ceph-disk umount race condition)
-06fc39c (origin/wip-9321) mon: PaxosService: can be readable even if proposing
-d8a7db8 (origin/wip-9913) mon: Monitor: use session's entity name for audit log
+06fc39c mon: PaxosService: can be readable even if proposing
+d8a7db8 mon: Monitor: use session's entity name for audit log
 125b58d mon: Monitor: forward session entity name to leader
 ca8e1ef mon: Monitor: stash auth entity name in session
 8773474 messages: MForward: stash auth entity name
@@ -2474,7 +5871,7 @@ fe6679d Fix libstriprados::stat, use strtoll insdead of strtol
 11aa383 doc: fix typos in diagram for incomplete write
 2379c57 doc: fix incorrect equalities
 625dd40 doc: fix typo (superfluous "no")
-cbecab4 (origin/wip-10063-objectstore-tool-erasure-code) tests: ceph_objectstore_tool.py fix list-attr for erasure code
+cbecab4 tests: ceph_objectstore_tool.py fix list-attr for erasure code
 40717aa tests: ceph_objectstore_tool.py check for malformed JSON for erasure code objs
 eaf1d1e tests: ceph_objectstore_tool.py fix off by 1 ATTR_OBJS handling
 dcf09ae common: do not omit shard when ghobject NO_GEN is set
@@ -2490,16 +5887,16 @@ f04d4e7 tests: ceph_objectstore_tool.py run mon and osd on specific port
 74506d2 tests: ceph_objectstore_tool.py use env python
 d1c4e5b vstart: use hostname -s instead of fqdn
 cf2104d common: do not unlock rwlock on destruction
-06a245a (origin/wip-execstack) Add annotation to all assembly files to turn off stack-execute bit
-e25724b (origin/wip-warn-max-pg) mon: tighten pg vs osd warning bounds to 3x above/below 100 (target)
+06a245a Add annotation to all assembly files to turn off stack-execute bit
+e25724b mon: tighten pg vs osd warning bounds to 3x above/below 100 (target)
 7f3dcdb mon/PGMonitor: add max pgs per osd warning
 b70be56 mon/PGMonitor: make min pg warning based on pg *instances*
 5eb662c mon/PGMonitor: fix summation when we adjust up/acting
 d671e1c mon/PGMap: allow stat_pg_{add,sub} that only touches stats
 cf24117 osd/osd_types: include up and acting count in pool_sum_t
 c87bde6 PG: always clear_primary_state when leaving Primary
-b10bd19 (origin/wip-dumper-coverity) tools: Close the fd before returning in Dumper
-50f9edb (origin/wip-da-SCA-20141111) FileJournal.cc: fix _open() to return negative value in error case
+b10bd19 tools: Close the fd before returning in Dumper
+50f9edb FileJournal.cc: fix _open() to return negative value in error case
 0c8955b mailmap: Loic Dachary name normalization
 b6d6180 test_async_driver: fix warning
 175d6e0 doc: Removed separate build file for rpm.
@@ -2509,8 +5906,8 @@ a5a0d5c doc: Adds build-doc guidelines for Fedora and CentOS/RHEL.
 088230e Improve readability of the exception
 3c4b782 rgw: add location when data in another region.
 35ae591 doc: update the OpenStack glance configuration
-83e8b07 (origin/wip-librbd-symbols) librbd: Only public API symbols from the shared library
-8dde6a6 (origin/wip-10052) ceph_test_rados_api_tier: fix cleanup of whiteouts
+83e8b07 librbd: Only public API symbols from the shared library
+8dde6a6 ceph_test_rados_api_tier: fix cleanup of whiteouts
 34e4d24 osd/ReplicatedPG: allow whiteout deletion with IGNORE_CACHE flag
 a04bb13 Mailmap: add Rongze organization
 ef2565e vstart.sh: complain less about fullish mon disks
@@ -2530,12 +5927,12 @@ ed0d3ed st_rados_list_objects.cc: free resources in error case
 c73c47e ceph_erasure_code_benchmark.cc: prefer ++operator for non-primitive iterators
 3b31e74 crush/CrushWrapper.cc: prefer ++operator for non-primitive iterators
 35ab4d7 rgw/rgw_gc.cc: silence cppcheck
-699fc05 (origin/wip-10057) msg/simple: ms_die_on_skipped_message
+699fc05 msg/simple: ms_die_on_skipped_message
 20f99ca mds: don't overwrite reply's snapbl
-6f8b96a (origin/wip-10095) crush/CrushWrapper: fix detach_bucket
+6f8b96a crush/CrushWrapper: fix detach_bucket
 a46fb02 Makefile: include 'ceph' in base target
 585b906 msg/Makefile: add new header to tarball
-2fe5c4c (origin/wip-9921) msg/Pipe: inject delay in stop_and_wait
+2fe5c4c msg/Pipe: inject delay in stop_and_wait
 2d69805 SimpleMessenger: Pipe: do not block on takeover while holding global lock
 6e5bae4 mailmap: Dan Mick name normalization
 5fdec03 mailmap: Xan Peng affiliation
@@ -2551,8 +5948,8 @@ ee45f48 doc: Fix 2 syntax errors.
 ebc8875 AsyncMessenger: Support select for other OS such as Windows
 59d8c94 qa/workunits/fs/misc/quota.sh: simple quota test
 86f87bb fix
-78d1e6c (origin/wip-9439) osd: Check filter ops for pgls and pgnls
-9e05ba0 (origin/wip-9835) osd/OSD: use OSDMap helper to determine if we are correct op target
+78d1e6c osd: Check filter ops for pgls and pgnls
+9e05ba0 osd/OSD: use OSDMap helper to determine if we are correct op target
 89c0263 osd/OSDMap: add osd_is_valid_op_target()
 079a8d7 Add myself to <contact at intel.com>
 5ce0919 ceph_objectstore_tool: Fixes to make import work again
@@ -2567,17 +5964,17 @@ e27b0d9 mon: fix formatted 'pg stat'
 e94d3c1 qa: allow small allocation diffs for exported rbds
 09eefac Fix Yuan's mailmap
 05a1c3f doc/release-notes: v0.88
-90c5869 (origin/wip-7467) rgw: support multiple host names
-1342548 (origin/wip-crush-tree) crush: fix tree bucket functions
+90c5869 rgw: support multiple host names
+1342548 crush: fix tree bucket functions
 e444b22 crush/builder: replace printf with an empty dprintk macro
-109bcd4 (origin/wip-da-revert-10b68b) Revert "osd: detect (some) misordered ondisk tmaps"
-4be687b (tag: v0.88) 0.88
+109bcd4 Revert "osd: detect (some) misordered ondisk tmaps"
+4be687b 0.88
 d12fa35 AsyncMessenger: Fix large bufferlist send segment fault
 d145cca AsyncMessenger: Try send in queue bufferlist
 123d364 AsyncMessenger: Normalise log informations
 fc4e4f2 AsyncMessenger: Release connection if stopped
 4d4b7d0 doc/rados/configuration fix 'ods mkfs options' default
-41b5dd2 (origin/wip-da-SCA-20141028) AuthSessionHandler.h: init protocol in constructor
+41b5dd2 AuthSessionHandler.h: init protocol in constructor
 dbfb63e src/librbd/librbd.cc: fix potential null pointer deref
 4fc9428 osd/ReplicatedPG.cc: remove redundant check
 a4cecda test/librados/c_read_operations.cc: fix invalid checks
@@ -2585,12 +5982,12 @@ a4cecda test/librados/c_read_operations.cc: fix invalid checks
 e590d42 rados_sync.cc: fix xattr_diff() for the only_in_b checks
 04517f0 bench_log.cc: catch ceph::FailedAssertion exception
 60e7da9 osd/PGLog.cc: prefer empty() over size() for emptiness check
-fea7097 (origin/wip-rbd-python-tests) librbd: Python unit tests now use unique pools and images
+fea7097 librbd: Python unit tests now use unique pools and images
 c11cf2a bufferlist: Don't call rebuild(), c_strt() call it if bufferlist not contiguous.
-cd784ea (origin/wip-9854) osdc: Constrain max number of in-flight read requests
+cd784ea osdc: Constrain max number of in-flight read requests
 12913c0 AsyncMessenger: Eating up pipe buffer for waking up
-2704914 (origin/wip-10045) common/Readahead: use correct lock when waiting on the pending ops
-42b0a35 (origin/wip-scrub-coverity) mds: remove dead code from inode_t::compare()
+2704914 common/Readahead: use correct lock when waiting on the pending ops
+42b0a35 mds: remove dead code from inode_t::compare()
 f78cfa7 mds: CInode: catch exception by reference, not value
 dafd335 osd/OSDMap: verify CRC on decode
 33490b7 mon/OSDMonitor: populate Incremental::full_crc
@@ -2627,11 +6024,11 @@ a0c1f22 tests: use kill -0 to check process existence
 c3b51ef tests: remove vstart_wrapped_tests.sh
 7a6ca17 tests: use different ports for each mon
 bdca0ac tests: tolerate a disk 99% full
-f80499e (origin/wip-flush-snaps) osd/ReplicatedPG: flush snaps immediately
-560e22e (origin/wip-gtest-warnings) test: use unsigned ints to compare against size()
+f80499e osd/ReplicatedPG: flush snaps immediately
+560e22e test: use unsigned ints to compare against size()
 a21bca1 mailmap: Loic Dachary affiliation
-740a1bd (origin/wip-10025) tools: error handling on journal import/export
-3e0295f (origin/wip-doc-dumpling-to-firefly) doc: Added Dumpling to Firefly upgrade section.
+740a1bd tools: error handling on journal import/export
+3e0295f doc: Added Dumpling to Firefly upgrade section.
 15d487f MDS: clean up internal MDRequests the standard way
 07e0831 MDS: CInode: break out of validation early on symlinks
 f1677e7 common/ceph_strings: add some MDS internal op names to ceph_mds_op_name()
@@ -2667,19 +6064,19 @@ abc995b qa/workunits/fs/misc: combine sudo and echo effectively
 3aa7797 qa: use sudo even more when rsyncing /usr
 2a61735 Fedora 19 uses systemd but there is no systemd-run available in the release (rhbz#1157938), this patch makes sure that the init scripts check for availability of systemd-run before they use it (otherwise, they fall back to the default method)
 5ac05d4 Fix tests on btrfs: leftover subvolumes removed
-762eda8 (origin/wip-blacklist-respawn) osdc: fix Journaler write error handling
-61ebfeb (origin/wip-10030) librbd: don't close an already closed parent image upon failure
+762eda8 osdc: fix Journaler write error handling
+61ebfeb librbd: don't close an already closed parent image upon failure
 d13b478 EC: Allow bench.sh to test ISA backend also
-52cb44c (origin/wip-9887) osd, mon: Send initial pg create time from mon to osd
-2e63944 (origin/wip-rbd-test-fixtures) librbd: Refactor librbd unit tests to use test fixtures
-5d9f36f (origin/wip-9918) rgw: update swift subuser perm masks when authenticating
-ef6d3ad (origin/wip-9973) rgw: remove swift user manifest (DLO) hash calculation
-b1bfc3a (origin/wip-9479) rgw: send back ETag on S3 object copy
-3196345 (origin/wip-9478) rgw: S3 object copy content type fix
+52cb44c osd, mon: Send initial pg create time from mon to osd
+2e63944 librbd: Refactor librbd unit tests to use test fixtures
+5d9f36f rgw: update swift subuser perm masks when authenticating
+ef6d3ad rgw: remove swift user manifest (DLO) hash calculation
+b1bfc3a rgw: send back ETag on S3 object copy
+3196345 rgw: S3 object copy content type fix
 3103c8a ObjectStore: Fix a error comment.
 a1aa70f ObjectStore: Don't use largest_data_off to calc data_align.
 8ba504d buffer: Add _memcopy_count to track total count of memcopy by rebuild/rebuild_page_aligned/c_str.
-3d45a68 (origin/wip-log-client) mds: fix log_client config
+3d45a68 mds: fix log_client config
 7dc2ca8 osd: configure log_client properly
 da6a8a3 mon: move log config parsing into LogClient.h helper
 0fd54a7 move Monitor::update_log_client to LogChannel::update_config
@@ -2687,17 +6084,17 @@ da6a8a3 mon: move log config parsing into LogClient.h helper
 84fec86 osd: add 'cluster_log [type] [message ...]' tell command
 4f40975 commong/LogEntry: string_to_clog_type
 705a1e3 osd: cache pool: delete dead code in ReplicatedPG::agent_choose_mode
-fe7bf06 (origin/wip-9877) rgw: RGWRados::get_obj() returns wrong len if len == 0
+fe7bf06 rgw: RGWRados::get_obj() returns wrong len if len == 0
 f4ee949 osd: cache pool: flush object ignoring cache min flush age when cache pool is full Signed-off-by: Xinze Chi <xmdxcxz at gmail.com>
 6da9405 doc: Edited Key/Value store config reference.
 03be944 doc: Added Key/Value store config reference to index.
-72fc262 (origin/wip-doc-openstack-juno) doc: Update for OpenStack Juno.
-65c3350 (origin/wip-9977) tools: skip up to expire_pos in journal-tool
-e0166a2 (origin/wip-9986) osdc/Objecter: Fix a bug of dead looping in Objecter::handle_osd_map
+72fc262 doc: Update for OpenStack Juno.
+65c3350 tools: skip up to expire_pos in journal-tool
+e0166a2 osdc/Objecter: Fix a bug of dead looping in Objecter::handle_osd_map
 31c584c osdc/Objecter: e shouldn't be zero in Objecter::handle_osd_map
-093c5f0 (origin/wip-9987) mon/PGMap and PGMonitor: update last_epoch_clean cache from new osd keys
+093c5f0 mon/PGMap and PGMonitor: update last_epoch_clean cache from new osd keys
 3fb731b mon/PGMap: invalidate cached min_last_epoch_clean from new-style pg keys
-8924158 (origin/wip-6756) JounralingObjectStore: journal->committed_thru after replay
+8924158 JounralingObjectStore: journal->committed_thru after replay
 122c503 client: Inform mds file size when approaching quota limit
 a50b8eb client: Enforce quota check when changing file size
 bbfeaae client: Forbid moving files across quota trees
@@ -2722,30 +6119,30 @@ fb57e84 ceph: Add MClientQuota message type
 da488ce test: add test case for quota_info_t
 a071201 ceph: Add quota_info_t to store quota info
 3619ea8 ceph: Add quota feature flags
-a554766 (origin/wip-buffer) buffer: implement list::get_contiguous
+a554766 buffer: implement list::get_contiguous
 4a9ad7d osd/ReplicatedPG: fix compile error
 29d7786 ceph.in: remove '--' from injectargs, if there, as well
 7533245 Fix rados_shutdown hang forever when using radosstriper
-c5f8d6e (origin/wip-9752-past-intervals) osd: past_interval display bug on acting
-50c2c75 (origin/wip-9944) osdc/Objecter: fix null dref when pool dne
-1eb9bcb (origin/wip-9945) messages: fix COMPAT_VERSION on MClientSession
+c5f8d6e osd: past_interval display bug on acting
+50c2c75 osdc/Objecter: fix null dref when pool dne
+1eb9bcb messages: fix COMPAT_VERSION on MClientSession
 c96fe59 doc: update RBD for Juno
 56ee3b4 doc/release-notes: it's 8MB, not 32MB
 f7431cc msg/Pipe: discard delay queue before incoming queue
-c51c8f9 (tag: v0.87) 0.87
+c51c8f9 0.87
 ce6f22d AsyncMessenger: Add kqueue support
-5a4c3aa (origin/wip-9800-giant) client: allow xattr caps in inject_release_failure
-214ac9f (origin/wip-doc-fs-quickstart) doc: include 'fs new' stuff in cephfs quickstart
+5a4c3aa client: allow xattr caps in inject_release_failure
+214ac9f doc: include 'fs new' stuff in cephfs quickstart
 1fef4c3 Get the currently atime of the object in cache pool for eviction
 66b4cd9 tests: fix signed/unsigned warning
 4b87a81 rbd: Fix the rbd export when image size more than 2G
 ef1980f osd: tiering: calculate object age during eviction when there is no hit set
-6fca23f (origin/wip-9919-injectargs-side-effects) qa: avoid qa/workunits/cephtool/test.sh unstability
+6fca23f qa: avoid qa/workunits/cephtool/test.sh unstability
 5691c68 client: allow xattr caps in inject_release_failure
 5950710   Fix the match error when starting OSD daemons.   If we have osd.7 and osd.77 on the same host, osd.7 will not be mounted if   osd.77 is mounted.   Signed-off-by: huangjun <hjwsm1989 at gmail.com>
 abd3fd3 fix can not disable max_size quota
 622c5ac osd: cache tiering: fix the atime logic of the eviction
-b90b483 (origin/wip-objectstore) remove collection_rename
+b90b483 remove collection_rename
 a80f2f6 ceph_filestore_dump: use "remove" marker instead of collection rename
 5dc990c osd: remove pgs with "remove" flag on startup
 856504c ceph_test_filestore_idempotent_sequence: drop collection_rename
@@ -2760,9 +6157,9 @@ de52873 osd, filestore: move automatic upgrade into mount()
 86919f5 osd, filestore: mount in upgrade() caller
 5f8a1df osd, filestore: move convertfs into FileStore
 b889b6b remove ceph_dupstore
-a5184cf (origin/wip-9869) client: cast m->get_client_tid() to compare to 16-bit Inode::flushing_cap_tid
-a4caed8 (origin/wip-9894) client: fix I_COMPLETE_ORDERED checking
-387d5d8 (origin/wip-vstart) init-ceph: make ./init-ceph behave from src dir on systemd
+a5184cf client: cast m->get_client_tid() to compare to 16-bit Inode::flushing_cap_tid
+a4caed8 client: fix I_COMPLETE_ORDERED checking
+387d5d8 init-ceph: make ./init-ceph behave from src dir on systemd
 a6357f2 unittest_shared_cache: fix build line
 1abb620 doc/release-notes: giant notes
 6a43809 rgw_rest_client.cc: remove self assignment of new_info.effective_uri
@@ -2783,7 +6180,7 @@ c952cb9 rgw/rgw_formats.cc: fix realloc memory leak
 5c7b8c6 rgw/rgw_rest_swift.cc: use empty() instead of size()
 17c327f rgw/rgw_rest_s3.cc: use !empty() instead of size() > 0
 84a5c84 vstart.sh: Fix usage
-a9a2185 (origin/wip-crush-link-id) mon: fix `osd crush link` id resolution
+a9a2185 mon: fix `osd crush link` id resolution
 1877670 Remove dead code in EC LRC plugin
 a05dedc check to_remove before setting transaction
 f99c5f9 rgw/rgw_rados.cc: remove unused variable 'objs'
@@ -2815,9 +6212,9 @@ d9abd53 ErasureCodeIsa.cc: reduce scope of variable
 e145bff ErasureCodePlugin.cc: reduce scope of variable
 c3ee13a cls_rbd_client.cc: reduce scope of variable
 53e2ba0 mailmap: Federico Gimenez affiliation
-eea9803 (origin/wip-enoent-race) os/LevelDBStore, RocksDBStore: fix race handling for get store size
+eea9803 os/LevelDBStore, RocksDBStore: fix race handling for get store size
 06a73c3 doc/release-notes: v0.87 giant (draft)
-ecbbf87 (origin/wip-da-SCA-20141010) Paxos.cc: use reinterpret_cast instead of c-style cast
+ecbbf87 Paxos.cc: use reinterpret_cast instead of c-style cast
 784f6d5 fuse_ll.cc: use reinterpret_cast instead of c-style cast
 3994cca librados.cc: use static_cast instead of c-style cast
 719dc17 Revert "Enforce cache size on read requests"
@@ -2826,18 +6223,18 @@ c2cebe5 Revert "osdc: Fix compiler warning"
 a1f23b7 Revert "rbd: ObjectCacher reads can hang when reading sparse files"
 b808cdf Revert "Fix read performance regression in ObjectCacher"
 c4b2abd .gitmodules: ignoring changes in rocksdb submodule
-082416f (origin/wip-rbd-revert) Revert "Enforce cache size on read requests"
+082416f Revert "Enforce cache size on read requests"
 544b9a8 Revert "rbd: ObjectCacher reads can hang when reading sparse files"
 3b1eafc Revert "Fix read performance regression in ObjectCacher"
-d8442eb (origin/wip-rwtimer) common/Timer: kill RWTimer
-f76f83c (origin/wip-9873) osdc/Objecter: fix tick_event handling in shutdown vs tick race
+d8442eb common/Timer: kill RWTimer
+f76f83c osdc/Objecter: fix tick_event handling in shutdown vs tick race
 4848435 common/Timer: recheck stopping before sleep if we dropped the lock
 8d51142 erasure-code: remove LRC dead code
 9d09e37 ECTransaction: Remove unused typedef.
 60eaeca .gitmodules: ignoring changes in rocksdb submodule
 a9dd4af rgw: send http status reason explicitly in fastcgi
-44a8d59 (origin/fix-fstat-mode) java: fill in stat structure correctly
-cb9262a (origin/wip-9806-giant) Objecter: resend linger ops on any interval change
+44a8d59 java: fill in stat structure correctly
+cb9262a Objecter: resend linger ops on any interval change
 1a3ad30 ReplicatedPG: writeout hit_set object with correct prior_version
 8ae942a Remove unnecessary expressions about conf_journal_sz
 024efeb EC: document the LRC per layer plugin configuration
@@ -2847,8 +6244,8 @@ fa07c04 qa: use sudo when rsyncing /usr so we can read everything
 4128814 FDCache: purge hoid on clear
 3abbd4c shared_cache: add purge and tests
 c116b4b shared_cache::add: do not delete value if existed
-227ecd8 (origin/wip-9859) mon: Monitor: MMonGetMap doesn't require caps
-9803ced (origin/wip-init-radosgw) init-radosgw.sysv: set ulimit -n before starting daemon
+227ecd8 mon: Monitor: MMonGetMap doesn't require caps
+9803ced init-radosgw.sysv: set ulimit -n before starting daemon
 e81d887 Make better error reporting for client's call to rados.Rados
 d9ff3a6 PG:: reset_interval_flush and in set_last_peering_reset
 a49d745 objecter: Unlock in shutdown before waiting for timer thread
@@ -2860,21 +6257,21 @@ fdeeeb1 erasure code: use 32-byte aligned buffers
 3ce0615 erasure code: use a function for the chunk mapping index
 6154339 common: add an aligned buffer with less alignment than a page
 681dad8 common: remove dead code in buffer.cc
-12f1151 (origin/wip-9852) mon: MDSMonitor: have 'filesystem_command' return int instead of bool
+12f1151 mon: MDSMonitor: have 'filesystem_command' return int instead of bool
 c589f3f mon: MDSMonitor: proper error output if pool DNE on 'add_data_pool'
 52180a5 mon: MDSMonitor: update function doxygen description
-64cf8ea (origin/wip-msgr) msg/async/AsyncConnection: drop unused port
+64cf8ea msg/async/AsyncConnection: drop unused port
 86d3b18 msg/Messenger: allow ms_type 'random'
 57d8195 msg/Makefile: keep headers
 c025632 msg/async: rename ms_event_ -> ms_async_, drop unused config options
 ddc9bc0 libcephfs.cc: use reinterpret_cast instead of c-style cast
-fbf89f5 (origin/wip-9857) rbd: Correct readahead divide by zero exception
+fbf89f5 rbd: Correct readahead divide by zero exception
 d9e340c Makefile.am: remove also old *.o/lo files
 89eb2fe crush/CrushWrapper.cc: use reinterpret_cast instead of c-style cast
 32665fa client/Client.cc: prefer empty() over size() for emptiness check
 4cf3b5c FileJournal.cc: use reinterpret_cast instead of c-style cast
 e72dfb4 Default configure with nss instead of cryptopp
-2ec3609 (origin/wip-9372-injectargs) cli: CEPH_ARGS must be before injectargs
+2ec3609 cli: CEPH_ARGS must be before injectargs
 a566610 doc: update injectargs syntax
 a458bd8 cli: do not parse injectargs arguments twice
 f1afb18 cli: add verbose information related to argument parsing
@@ -2886,8 +6283,8 @@ d80ea6a mailmap: Mehdi Abaakouk affiliation
 4eee5f0 mailmap: VRan Liu affiliation
 d8b260a mailmap: Yann Dupont affiliation
 65be257 Fix read performance regression in ObjectCacher
-349eb51 (origin/wip-9836-test-mon-id) tests: mon id is incorrectly skipped
-9fbc083 (origin/wip-9031-9262) rados command: Add format support for ls operation
+349eb51 tests: mon id is incorrectly skipped
+9fbc083 rados command: Add format support for ls operation
 81f1a54 doc: Changes to librados to fix doc errors
 09a6186 doc: The dot package is included in graphviz package
 aae0976 doc: Remove unused PendingReleaseNotes file
@@ -2905,8 +6302,8 @@ aa837b0 pybind: Add previously missed basic namespace to python bindings
 21a0eb3 test, key_value_store: Use pool rbd since no default data pool present
 d7a8148 ceph_test_rados_api_*: expose RadosTest::nspace Expose RadosTestEC::nspace Rename others to nspace for consistency
 e42fb5e Revert "ceph_test_rados_api_*: expose nspace"
-ac3c1cb (origin/wip-9820) qa/workunits: cephtool: don't remove self's key on auth tests
-98d7760 (origin/sponce-wip-9356) Added missing semaphore in libradosstriper AIO tests leading to potential memory corruption and thus test failures
+ac3c1cb qa/workunits: cephtool: don't remove self's key on auth tests
+98d7760 Added missing semaphore in libradosstriper AIO tests leading to potential memory corruption and thus test failures
 f26cd1b Fixed remaining part of the seg fault described in bug 9356 by adding reference counting to CompletionData
 783fc66 Fixed part of the seg fault described in bug 9356 by adding reference counting on RadosReadCompletionData
 52c97b0 Fixed part of the seg fault described in bug 9356 by cleaning reference counting on MultiAioCompletionImpl
@@ -2918,18 +6315,18 @@ ea100ac KeyValueStore: Add clone op detect to promote error
 f207416 mailmap: Cheng Cheng name normalization
 f76a676 osd: log when scrub,deep scrub, repair starts
 64d977b client: fix signed/unsigned compilation warnings
-cb290a1 (origin/wip-adamcrume) osdc: Fix compiler warning
+cb290a1 osdc: Fix compiler warning
 2ae1cba mon: MDSMonitor: wait for osdmon to be writable when requesting proposal
 0dd473c mon: MDSMonitor: have management_command() returning int instead of bool
 91b2a2b mon: MDSMonitor: don't return -EINVAL if function is bool
 1d1ae41 mon: MDSMonitor: check all conditions are met *before* osdmon proposal
 07b7f10 mon: MDSMonitor: return if fs exists on 'fs new'
-2c06413 (origin/wip-mds-coverity) mds: fix null dereference
-9e6ae73 (origin/giant-unknown-locktype) mds: reply -EOPNOTSUPP for unknown lock type
+2c06413 mds: fix null dereference
+9e6ae73 mds: reply -EOPNOTSUPP for unknown lock type
 bb9add6 test: fix compile warning in bufferlist.cc
 a49255f librbd: fix compile warning in librbd/internal.cc.
 69b5c72 doc: updates on Backfill Reservation
-4606af7 (origin/wip-9526-crush-rename-bucket) mon: unit tests for osd crush rename-bucket
+4606af7 mon: unit tests for osd crush rename-bucket
 aa67560 mon: add the osd crush rename-bucket command
 29d13d4 crush: unit tests for CrushWrapper rename item/bucket methods
 30f3ac3 crush: add CrushWrapper::rename_bucket and can_rename_bucket
@@ -2937,25 +6334,25 @@ aa67560 mon: add the osd crush rename-bucket command
 f810710 add unittest for Striper::get_num_objects
 236895e crush: improve constness of CrushWrapper methods
 7b66ee4 when non-default striping is used, internal methods can operate on extra objects
-3741aab (origin/wip-9790-display-auid) auth: unit tests for auid display
+3741aab auth: unit tests for auid display
 5558afa qa/workunits/rbd/import_export.sh: be case insensitive
 fcc3692 auth: add display auid to KeyServer::encode_secrets
 fa2ff33 auth: add display auid to KeyRing::encode_formatted
 2cbebc3 doc: correct command of `config push`
 107cb0d rgw: correct "If-Modified-Since" handle.
-ac92c45 (origin/wip-client-flock) qa/workunits/fs/misc: Add a workunit for file lock interruption
+ac92c45 qa/workunits/fs/misc: Add a workunit for file lock interruption
 b0e6e85 mds: fix neighbor lock check
 b61468d doc/dev/sepia: notes about sepia email list and irc channels
 6705180 doc/release-notes: v0.80.7
-c9f9e72 (origin/wip-9598) Revert "Objecter: disable fast dispatch of CEPH_MSG_OSD_OPREPLY messages"
+c9f9e72 Revert "Objecter: disable fast dispatch of CEPH_MSG_OSD_OPREPLY messages"
 00907e0 msg/simple: do not stop_and_wait on mark_down
 22637f4 doc: update lab notes
-7022679 (origin/wip-5977) librbdpy: Added missing method docstrings
+7022679 librbdpy: Added missing method docstrings
 6753923 mds: reply -EOPNOTSUPP for unknown lock type
-c4bac3e (origin/wip-inotable-init) mds: fix inotable initialization/reset
+c4bac3e mds: fix inotable initialization/reset
 c95bb59 mds: fix inotable initialization/reset
 f1fccb1 rpm: 95-ceph-osd-alt.rules is not needed for centos7 / rhel7
-b73fe1a (origin/wip-9730) doc: remove node requirement from 'mon create-initial'
+b73fe1a doc: remove node requirement from 'mon create-initial'
 264f0fc doc: remove whitespace
 20b2766 Update vstart to setup users for s3-tests
 0969945 client: use finisher to abort MDS request
@@ -2967,14 +6364,14 @@ e464a77 client: use atomic variable to track reference of MetaRequeset
 fbf4d47 KeyValueStore: Make clone error message more friendly
 b426460 GenericObjectMap: sync transaction avoid lookup_parent failed
 674c911 StoreTest: Add clone xattr test
-d947050 (origin/wip-9718) osd/osd_types: consider CRUSH_ITEM_NONE in check_new_interval() min_size check
-5ff4a85 (origin/wip-9747-ceph-spec) rpm: 95-ceph-osd-alt.rules is not needed for centos7 / rhel7
+d947050 osd/osd_types: consider CRUSH_ITEM_NONE in check_new_interval() min_size check
+5ff4a85 rpm: 95-ceph-osd-alt.rules is not needed for centos7 / rhel7
 50987ec libcephfs.h libcephfs.cc : Defined error codes for the mount function Used new error codes from libcephfs.h to replace the magic numbers in the mount functon found in libcephfs.cc.
 7bab093 return value of handle_message for MSG_OSD_SUBOP/MSG_OSD_SUBOPREPLY should be true
-d955676 (origin/wip-8983) rados: Use strict_strtoll instead of strtoll
-809ddd2 (origin/wip-9706) osdc/Objecter: fix use-after-frees in close_session, shutdown
+d955676 rados: Use strict_strtoll instead of strtoll
+809ddd2 osdc/Objecter: fix use-after-frees in close_session, shutdown
 72a2ab1 osdc/Objecter: fix tick() session locking
-d98b755 (origin/wip-9716) librados: Fix function prototypes in librados.h
+d98b755 librados: Fix function prototypes in librados.h
 d458b4f PGLog::IndexedLog::trim(): rollback_info_trimmed_to_riter may be log.rend()
 022bace rados: Add tests for CLI strict parsing
 26fe180 test: Fix rmpool in test_rados_tool.sh
@@ -2986,16 +6383,16 @@ e075c27 common: link mds/flock.o to libcommon
 89ebcaa Fixed JSON output for stray OSDs
 a8d597f Fix error message when stripping with format 1
 66afcd9 Check pointer before deleting
-2c7c03c (origin/wip-coverity-20141003) ceph_erasure_code_benchmark: fix parameter handling
+2c7c03c ceph_erasure_code_benchmark: fix parameter handling
 8021581 client: add missing dendl and s/dout/ldout/
-fa539b9 (origin/wip-9692) qa/workunits/fs/misc: fix syntax error
-9b18d99 (origin/wip-9696) PG::choose_acting: in mixed cluster case, acting may include backfill
-3dd4cca (origin/wip-7796) rgw: set length for keystone token validation request
-3cd8a7f (origin/giant-locker-null) mds: Locker: fix a NULL deref in _update_cap_fields
+fa539b9 qa/workunits/fs/misc: fix syntax error
+9b18d99 PG::choose_acting: in mixed cluster case, acting may include backfill
+3dd4cca rgw: set length for keystone token validation request
+3cd8a7f mds: Locker: fix a NULL deref in _update_cap_fields
 3b9dcff rados: Parse command-line arguments strictly
 aa138eb mds: MDirUpdate: initialize discover count to 0 if we're not discovering
 2a9ed93 mds: MDSAuthCaps: init "read" param in default constructor
-ce4436c (origin/wip-print-mode) client: print out mode, uid, gid if they are changed
+ce4436c client: print out mode, uid, gid if they are changed
 7cb2840 test_rgw_admin_log.cc: refactor to use calloc()
 c43c85f test_rgw_admin_log.cc: fix use-after-free
 c60a170 test/librados/c_write_operations.cc: free ressources
@@ -3009,9 +6406,9 @@ d75856b osd/ReplicatedBackend.cc: prefer ++operator for non-primitive iterators
 72e708c osd/OSDMap.cc: prefer ++operator for non-primitive iterators
 4669233 mount.ceph.c: ensure '\0' terminated string
 865a0dc build: add ceph-erasure-code-corpus to gitmodule_mirrors
-f06ffba (origin/wip-da-fix-make_check) Makefile.am: make sure everything was build before make check
+f06ffba Makefile.am: make sure everything was build before make check
 9b3d345 qa: move mon_mds tests last
-e27cf41 (origin/wip-9437) qa: cephtool tests for `tell mds.X`
+e27cf41 qa: cephtool tests for `tell mds.X`
 620a722 qa: fixup cephtool test when MDS exists
 82ecf60 test: unit tests for MDSAuthCaps
 d15ecaf vstart: create fewer pgs for fs pools
@@ -3043,11 +6440,11 @@ ed862ec AsyncMessenger: move Async* related file to msg/async
 268db10 AsyncConnection: Rescheduler write event when connect successfully
 b8ae562 AsyncConnection: Avoid seq ack not send and skip sd<0 shutdown
 e25f2fe AsyncMessenger: Avoid join when worker thread not started
-64ca744 (origin/wip-heap-profiler) doc: update memory-profiling with examples
+64ca744 doc: update memory-profiling with examples
 6f6e7e2 mon,mds: call ceph_heap_profiler_init() at boot time
 472a4b3 perfglue: profiler stats need more than 1024 bytes
 cb36ccb mailmap: add yuandong to organizationmap
-bc0209e (origin/wip-catch-options-errors) test,tools: indentation, whitespace only
+bc0209e test,tools: indentation, whitespace only
 3677fe0 tools: allow_unregistered() options
 29b3771 tools: move po::collect_unrecognized in the try block
 8e5b7ac tools: display boost::program_options errors
@@ -3084,21 +6481,21 @@ d41bb28 Add STANDBY state process
 071ba4a Add event option
 a653af1 Remove DispatchQueue in AsyncMessenger
 d09d28c Add AsyncMessenger support
-82175ec (origin/wip-9513) Fix read performance regression in ObjectCacher
-0dd3afd (origin/wip-9496) mon: PGMonitor: populate scrub timestamps with 'now' on pg creation
+82175ec Fix read performance regression in ObjectCacher
+0dd3afd mon: PGMonitor: populate scrub timestamps with 'now' on pg creation
 ccdbfb4 mon: PGMonitor: prettify access to pg_stats_t in register_pg
-9b39033 (origin/wip-9419) osd: Return EOPNOTSUPP if a set-alloc-hint occurs with OSDs that don't support
+9b39033 osd: Return EOPNOTSUPP if a set-alloc-hint occurs with OSDs that don't support
 70ef4c1 osd: Remove unused PG functions queue_notify(), queue_info(), queue_log()
 d14ca34 0.86
-7c4e278 (origin/wip-9677-ioprio-class-giant) common: ceph_ioprio_string_to_class always returns -EINVAL
+7c4e278 common: ceph_ioprio_string_to_class always returns -EINVAL
 822b088 osd: log error if set_ioprio fails to parse class
 efb23b8 common: set_ioprio debug message including pid
 62f0ef4 common: do not set ioprio if pid is not set
 19c92d8 doc: missing osd prefix to ioprio config options
-97dcc05 (tag: v0.86) 0.86
+97dcc05 0.86
 32e8bcd Run configure without liblttng on squeeze as well.
 be6de4a Run configure without liblttng on squeeze as well.
-3535b7a (origin/wip-9677-ioprio-class) common: ceph_ioprio_string_to_class always returns -EINVAL
+3535b7a common: ceph_ioprio_string_to_class always returns -EINVAL
 5088e0d osd: log error if set_ioprio fails to parse class
 33339c7 common: set_ioprio debug message including pid
 c7e4c0b common: do not set ioprio if pid is not set
@@ -3107,32 +6504,32 @@ b2e4bd5 msg: move SimpleMessenger to msg/simple/
 5a860be librados: use Messenger factory method
 5eff0ee msg: use localized cct for derr
 06aef6f doc/release-notes: v0.86
-10fe7cf (origin/wip-ostool) ceph_objectstore_tool: Accept CEPH_ARGS environment arguments
-6aba0ab (origin/wip-9128) Add reset_tp_timeout in long loop in add_source_info for suicide timeout
-52ac520 (origin/wip-rm-legacy-cli) tools: remove old ceph.cc
+10fe7cf ceph_objectstore_tool: Accept CEPH_ARGS environment arguments
+6aba0ab Add reset_tp_timeout in long loop in add_source_info for suicide timeout
+52ac520 tools: remove old ceph.cc
 63c7e16 test/osd/Object: don't generate length of 0
-abe4c35 (origin/wip-os-recommend-doc-fix) doc: update kernel recommendations, add tunables link
-6b9e20a (origin/wip-9658) ceph.spec: make ceph -> python-ceph dependency specify version
+abe4c35 doc: update kernel recommendations, add tunables link
+6b9e20a ceph.spec: make ceph -> python-ceph dependency specify version
 69acc8a msg: make messenger type configurable
 62b0bfd msg/Message: drop unneeded #include
 e55cb1f mon/MonClient: use generic Messenger factory
-fe57fab (origin/wip-9663) client: clean-up objecter on failed client init
+fe57fab client: clean-up objecter on failed client init
 ca10ce4 Add one more case ( "=" ) and test get_str_vec as well. Signed-off-by: Yann Dupont <yann at objoo.org>
-257dcc9 (origin/wip-osd-op-macro) rados.h: use macros to generate osd ops definitons and strings
-1858f06 (origin/johnugeorge-wip-9492-crush-giant) Crush: Ensuring at most num-rep osds are selected
+257dcc9 rados.h: use macros to generate osd ops definitons and strings
+1858f06 Crush: Ensuring at most num-rep osds are selected
 01e389a Crush: Ensuring at most num-rep osds are selected
-f1becf9 (origin/wip-9655-osd-tell) qa: ceph tell must retry on ENXIO
-234b066 (origin/wip-9492-crush-indep) Crush: Ensuring at most num-rep osds are selected
+f1becf9 qa: ceph tell must retry on ENXIO
+234b066 Crush: Ensuring at most num-rep osds are selected
 5c6c366 debian/control: fix python-ceph -> ceph file move to allow upgrades
-35fd272 (origin/wip-9657) messages: provide an explicit COMPAT_VERSION in MMDSBeacon
-177a33b (origin/wip-9661) MemStore: Need set/get_allow_sharded_objects() to work for ceph_objectstore_tool
+35fd272 messages: provide an explicit COMPAT_VERSION in MMDSBeacon
+177a33b MemStore: Need set/get_allow_sharded_objects() to work for ceph_objectstore_tool
 0b155d0 ceph_objectstore_tool: MemStore needs a CephContext
 7f6c31b debian/control: BuildDepend on lsb-release
-73d5bdb (origin/wip-9245-test-mon-pending-giant) tests: remove tests requiring osdmonitor_prepare_command
+73d5bdb tests: remove tests requiring osdmonitor_prepare_command
 e865781 mon: obsolete osdmonitor_prepare_command
 d0c2d7d test: minor case fix
-d0ceb3a (origin/wip-9545) os/FileStore: do not loop in sync_entry on shutdown
-7e2bd22 (origin/wip-9418) mon: Monitor: let 'handle_command()' deal with caps validation
+d0ceb3a os/FileStore: do not loop in sync_entry on shutdown
+7e2bd22 mon: Monitor: let 'handle_command()' deal with caps validation
 f0653c0 qa/workunits: mon: auth_caps: account for mon blank caps
 87d06f1 mon: MonCommands: adjust indentation for 'auth add'
 c7d5c25 qa/workunits: mon: auth_caps: variables must be local
@@ -3146,59 +6543,59 @@ ea96863 qa/workunits: cephtool: test auth profiles
 d6b702c mon: MonCap: add new profiles
 940c4e2 mon: Monitor: match command module caps against what's on MonCommands.h
 8612461 mon: AuthMonitor: validate caps when creating or changing mon caps
-038de00 (origin/wip-9245-test-mon-pending) tests: remove tests requiring osdmonitor_prepare_command
+038de00 tests: remove tests requiring osdmonitor_prepare_command
 2abc8f4 mon: obsolete osdmonitor_prepare_command
 6f69837 test: minor case fix
-eb2f0f4 (origin/wip-9653-ceph-disk-bootstrap-osd-giant) ceph-disk: bootstrap-osd keyring ignores --statedir
-fa0bd06 (origin/wip-9653-ceph-disk-bootstrap-osd) ceph-disk: bootstrap-osd keyring ignores --statedir
+eb2f0f4 ceph-disk: bootstrap-osd keyring ignores --statedir
+fa0bd06 ceph-disk: bootstrap-osd keyring ignores --statedir
 19be358 PG::actingset should be used when checking the number of acting OSDs for a given PG. Signed-off-by: Guang Yang <yguang at yahoo-inc.com>
 8253ead osdc/Objecter: use SafeTimer; make callbacks race-tolerant
 6c37984 mailmap: Yehuda Sadeh name normalization
 beff616 ceph-disk: set guid if reusing a journal partition
-50e8040 (origin/wip-rados-empty-file) tools: rados put /dev/null should write() and not create()
-0b0a373 (origin/wip-ysw-mailmap) mailmap: update email address
+50e8040 tools: rados put /dev/null should write() and not create()
+0b0a373 mailmap: update email address
 188370a doc/release-notes: fix attributions for 8702 fix
 c0dc3a5 doc/release-notes: v0.80.6
-5b41d80 (origin/wip-8911) rgw: swift GET / HEAD object returns X-Timestamp field
+5b41d80 rgw: swift GET / HEAD object returns X-Timestamp field
 29356d8 qa: fix osd pool ls invalid test
 a1aa06b ReplicatedPG: dump snap_trimq on pg query
 34f38b6 ReplicatedPG: do not queue the snap trimmer constantly
 b29bf00 ReplicatedPG: clean out completed trimmed objects as we go
-3374a0b (origin/wip-coverity-20141002) BtrfsFileStoreBackend.cc: fix string overflow
+3374a0b BtrfsFileStoreBackend.cc: fix string overflow
 8b7fc61 test_librbd.cc: fix compiler warning, cast to size_t
-e506f89 (origin/wip-objecter-shutdown) Objecter: check the 'initialized' atomic_t safely
-0f731ae (origin/revert-2604-wip-9113) Revert "ReplicatedPG: clean out completed trimmed objects as we go"
-fea0154 (origin/wip-mon-status) mon: break recovery status onto separate lines
+e506f89 Objecter: check the 'initialized' atomic_t safely
+0f731ae Revert "ReplicatedPG: clean out completed trimmed objects as we go"
+fea0154 mon: break recovery status onto separate lines
 46a76d5 mon: put 'ceph status' quorum status on new line
 e127c89 mon: put 'ceph status' health items on separate lines
 da9ae5c ceph.spec: fix typo
-63b30d4 (origin/wip-8187) librbd: Skip tier pools in list_children and snap_unprotect
-63d0ec7 (origin/wip-9013) rgw: add civetweb as a default frontend
-d8ae14f (origin/wip-8188) librados: Add rados_pool_get_base_tier call
-5b58f16 (origin/wip-coverity-20141001) test_librbd.cc: fix compiler warning
+63b30d4 librbd: Skip tier pools in list_children and snap_unprotect
+63d0ec7 rgw: add civetweb as a default frontend
+d8ae14f librados: Add rados_pool_get_base_tier call
+5b58f16 test_librbd.cc: fix compiler warning
 58c732f client/SyntheticClient.cc: remove dead code
 3184a20 doc: A couple of fixes to the CloudStack documentation
 88022e8 SyntheticClient.cc: remove dead code
 17fa397 LogEntry.h: init LogEntry::prio in constructor
 7d6e21d osd: fix need_journal call
-8388836 (origin/wip-rpm-epoch) ceph.spec.: add epoch
-a470c96 (origin/wip-fsls-json) mon: fix JSON `fs ls` output
+8388836 ceph.spec.: add epoch
+a470c96 mon: fix JSON `fs ls` output
 2955b3d ObjectStore: Add "need_journal" interface to make aware of journal device
 05fd507 Pipe: avoid redundancy new/delete for struct iovec
-cefb1a3 (origin/wip-9635) mon: wait for paxos writes before touching state
+cefb1a3 mon: wait for paxos writes before touching state
 46c1d93 mon: flush paxos write before setting shutdown state
 4072ef7 mon: move paxos write wait into a helper
 624aaf2 PG: release backfill reservations if a backfill peer rejects
-62e2bca (origin/wip-8822) osd: swap state spinlock for atomic_t
+62e2bca osd: swap state spinlock for atomic_t
 a8ac4b6 osdc/Filer: drop probe/purge locks before calling objecter
 0ea20a6 Locker: accept ctime updates from clients without dirty write caps
 2e9c7c9 test-shared-cache:   Add test for "lower_bound" method.   Add multithread tests for "lookup" and "lower_bound" methods.
 d34c21c test-shared-cache:   Initial draft for the unit test of "common/shared_cache.hpp".
 de87d54 common: document C_GatherBuilder and C_Gather
-a67c2f9 (origin/wip-9628) mds: Add session to sessionmap when its state becomes opening
+a67c2f9 mds: Add session to sessionmap when its state becomes opening
 1feba20 Objecter: init with a constant of the correct type
-46d5518 (origin/wip-osd-objecter) osd: do not bind ms_objecter messenger
-fe3434f (origin/wip-python-flask) debian: move ceph_rest_api.py into ceph
+46d5518 osd: do not bind ms_objecter messenger
+fe3434f debian: move ceph_rest_api.py into ceph
 8cda623 ceph.spec.in: move ceph_rest_api.py into ceph
 b241624 ceph.spec: fix python-flask dependency
 e42424e debian: python-flask is needed by ceph, not python-ceph
@@ -3223,24 +6620,24 @@ b167f70 mailmap: Sahid Ferdjaoui affiliation
 b386b59 mailmap: JuanJose Galvez affiliation
 e6bba0d mailmap: Roman Haritonov affiliation
 e133a92 mailmap: Yongyue Sun affiliation
-beade63 (origin/wip-9620-test-mon-thrash) qa/workunits/cephtool/test.sh: fix thrash (ultimate)
+beade63 qa/workunits/cephtool/test.sh: fix thrash (ultimate)
 5d1d9db librados: cap the IoCtxImpl::{aio_}*{write,append} buffer length
 5f029ff os/FileStore: using FIEMAP_FLAGS_SYNC instead of fsync() before call fiemap.
 69fc171 Clock: Using clock_gettime instead of gettimeofday in linux os.
 becc114 librados: test s/E2BIG/TooBig/
 32195f9 librados: cap the rados_aio_*{write,append} buffer length
 f777fc6 osd: Make RPGTransaction::get_bytes_written return the correct size.
-7849d79 (origin/wip-crush-location) crushtool: add --show-location <id> command
+7849d79 crushtool: add --show-location <id> command
 33501d2 librados: cap the rados*{write,append} buffer length
 f8ac224 ceph-disk: add Scientific Linux as a Redhat clone
-6b4d1aa (origin/johnugeorge-wip-9492-crush) Crush: Ensuring at most num-rep osds are selected
+6b4d1aa Crush: Ensuring at most num-rep osds are selected
 5ca7ea5 ceph-disk: add Scientific Linux as a Redhat clone
 7827e00 os: io_event.res is the size written
 d851c3f osd: improve debug output for do_{notifies,queries,infos}
-126d0b3 (origin/wip-9582) osdc/Objecter: only post_rx_buffer if no op timeout
-ba02a5e (origin/wip-9536-isa-alignment) erasure-code: test isa encode/decode with various object sizes
+126d0b3 osdc/Objecter: only post_rx_buffer if no op timeout
+ba02a5e erasure-code: test isa encode/decode with various object sizes
 eb8fdfa erasure-code: add test for isa chunk_size method
-7a468f3 (origin/wip-msgr-shutdown) msg: allow calling dtor immediately after ctor
+7a468f3 msg: allow calling dtor immediately after ctor
 af07d29 erasure-code: isa encode tests adapted to per chunk alignment
 aa9d70b erasure-code: isa test compare chunks with memcmp instead of strncmp
 ed77178 erasure-code: run isa tests via libtool and valgrind
@@ -3248,26 +6645,26 @@ ed77178 erasure-code: run isa tests via libtool and valgrind
 28c2b6e erasure-code: isa uses per chunk alignment constraints
 6f4909a erasure-code: [ISA] modify get_alignment function to imply a platform/compiler independent alignment constraint of 32-byte aligned buffer addresses & length
 7552571 doc/release-notes: v0.67.11
-8dc94a2 (origin/wip-9562) osdc/Filer: drop probe/purge locks before calling objecter
+8dc94a2 osdc/Filer: drop probe/purge locks before calling objecter
 9593d87 documentation: erasure-code plugin isa does not require k/m
 6886224 mailmap: Yan Zheng affiliation
 fc1380b mailmap: Thorsten Glaser affiliation
-7973280 (origin/wip-zafman-cleanup) osd: Remove unused PG functions queue_notify(), queue_info(), queue_log()
-0f884fd (origin/wip-9008) For pgls OP, get/put budget on per list session basis, instead of per OP basis, which could lead to deadlock.
+7973280 osd: Remove unused PG functions queue_notify(), queue_info(), queue_log()
+0f884fd For pgls OP, get/put budget on per list session basis, instead of per OP basis, which could lead to deadlock.
 7f87cf1 ReplicatedPG: clean out completed trimmed objects as we go
 2cd9b5f tests: use memcmp to compare binary buffers
-c17ac03 (origin/wip-9487) ReplicatedPG: don't move on to the next snap immediately
+c17ac03 ReplicatedPG: don't move on to the next snap immediately
 255b430 osd: initialize purged_snap on backfill start; restart backfill if change
-f833f12 (origin/wip-5595) rgw: rados->set_attrs() updates bucket index
+f833f12 rgw: rados->set_attrs() updates bucket index
 44cfd88 rgw: PutObjMetadata, clear bufferlist before appending into it
 4be53d5 PG: check full ratio again post-reservation
 9c825ec documentation: revise placement group number guide
-7f71c11 (origin/wip-mon-fix-checks) ceph-mon: check fs stats just before preforking
+7f71c11 ceph-mon: check fs stats just before preforking
 9687150 erasure-code: isa/lrc plugin feature
-f51d21b (origin/wip-9572-erasure-code-blaumroth) erasure-code: restore jerasure BlaumRoth default w
+f51d21b erasure-code: restore jerasure BlaumRoth default w
 e270216 mailmap: William A. Kennington III affiliation
 9f0202b mailmap: Jason Dillaman affiliation
-89fceb3 (origin/wip-mon-data-space-die) mon: Monitor: log RO commands on 'debug' level, RWX on 'info'
+89fceb3 mon: Monitor: log RO commands on 'debug' level, RWX on 'info'
 2c5b12d mon: Monitor: use MonCommand::requires_perm() when checking perms
 bb55862 mon: Monitor.h: add 'requires_perm()' function to MonCommand struct
 f1b814e mon: Monitor: log RO admin socket commands on 'debug' level
@@ -3279,16 +6676,16 @@ f1b814e mon: Monitor: log RO admin socket commands on 'debug' level
 9996d44 mon: DataHealthService: use get_fs_stats() instead
 3d74230 common: util: add get_fs_stats() function
 f421d5c documentation: comment the CompatSet data members
-ce8eefc (origin/wip-8629) osd/ReplicatedPG: do not clone or preserve snapdir on cache_evict
+ce8eefc osd/ReplicatedPG: do not clone or preserve snapdir on cache_evict
 398c74e ceph_test_rados_api_tier: add EvictSnap2 test case
 682b9da os/FileJournal: do not request sync while shutting down
 544b8c7 ReplicatedPG::on_removal: clear rollback info
 781f05c Revert "PG::init: clear rollback info for backfill as well"
 a53ead1 osd: Cleanup boost optionals
-28b7b93 (origin/wip-log-msgs) mds: remove spurious logging
-226c0c7 (origin/wip-9547-python-rados-truncate) test: check python rados aio_read with buffers containing null
+28b7b93 mds: remove spurious logging
+226c0c7 test: check python rados aio_read with buffers containing null
 8bda44f pybind: Fix aio_read handling of string buffer
-9d9c8c7 (origin/wip-9539) Filer: add lock to protect strcut PurgeRange
+9d9c8c7 Filer: add lock to protect strcut PurgeRange
 b47fdd4 rbd: Use a rolling average to compute RBD write throughput
 f3acae4 rgw_main.cc: add missing virtual destructor for RGWRequest
 eeb74a1 os/GenericObjectMap.cc: pass big parameter by reference
@@ -3296,10 +6693,10 @@ eeb74a1 os/GenericObjectMap.cc: pass big parameter by reference
 f2a7d62 ErasureCodeLrc.h: fix UNINIT_CTOR
 1a74822 ErasureCodeLrc.h: fix data_chunk_count UNINIT_CTOR
 0e15df4 LogEntry.h: init LogEntry::seq with 0
-5bb94ed (origin/wip-9529) rgw: calculate hash after writing data
+5bb94ed rgw: calculate hash after writing data
 7b13724 crypto: don't hash zero sized buffer
 76eff95 include/util.h: prevent multiple inclusion of header
-b82ceda (origin/wip-coverity-20140919) rgw_main.cc: add missing virtual destructor for RGWRequest
+b82ceda rgw_main.cc: add missing virtual destructor for RGWRequest
 13b8c92 os/GenericObjectMap.cc: pass big parameter by reference
 45e0b33 common/buffer.cc: catch exception by reference
 7e82950 test/kv_store_bench.cc: fix coverity REVERSE_INULL
@@ -3311,36 +6708,36 @@ ea02dc3 ErasureCodeLrc.h: fix UNINIT_CTOR
 af039ce test/librados/aio.cc: close resource leak
 639c981 documentation: explain ceph osd reweight vs crush weight
 f51e368 erasure-code: fix assert overflow
-13780d7 (origin/wip-9538-format-plain) mon: osd find / metadata --format plain fallback
+13780d7 mon: osd find / metadata --format plain fallback
 0fb0967 documentation: tiering typo
-7ac60d8 (origin/wip-cephtool-test) tests: check osd health in cephtool/test.sh
+7ac60d8 tests: check osd health in cephtool/test.sh
 4d75c4c tests: qa/workunits/cephtool/test.sh ! and -e
 be97b7d documentation: fix bugous osd stop/start example
-3f23709 (origin/wip-giant-messenger-fixes) Objecter: disable fast dispatch of CEPH_MSG_OSD_OPREPLY messages
+3f23709 Objecter: disable fast dispatch of CEPH_MSG_OSD_OPREPLY messages
 681a754 Pipe: stop delayed delivery fast_dispatch in stop_and_wait()
 fb5a244 osd/Watch: handle con without sessions
 93dccdb osd/ReplicatedPG: handle sessionless op con in do_osd_op_effects
 5d37850 osd: drop dead shutdown message handler
 bb45621 common: Add cctid meta variable
-c421b55 (origin/wip-9301) mon: re-bootstrap if we get probed by a mon that is way ahead
+c421b55 mon: re-bootstrap if we get probed by a mon that is way ahead
 d81cd7f mon/Paxos: fix off-by-one in last_ vs first_committed check
-9132ca4 (origin/wip-5768) rbd-fuse: Fix memory leak in enumerate_images
-9a1ab95 (origin/wip-6926) rbd: Fix rbd diff for non-existent objects
+9132ca4 rbd-fuse: Fix memory leak in enumerate_images
+9a1ab95 rbd: Fix rbd diff for non-existent objects
 d6913ae mds: fix not journaling client metadata
-1395275 (origin/wip-9518) mds: fix not journaling client metadata
-346c06c (origin/wip-6613) client: preserve ordering of readdir result in cache
+1395275 mds: fix not journaling client metadata
+346c06c client: preserve ordering of readdir result in cache
 600af25 client: introduce a new flag indicating if dentries in directory are sorted
-59c8976 (origin/wip-acl-test) qa/workunits/fs/misc: Add a workunit for ACL
-2bd7cee (origin/wip-9341) client: trim unused inodes before reconnecting to recovering MDS
-95ee699 (origin/wip-rbd-readahead) client: Replace client readahead logic with Readahead
+59c8976 qa/workunits/fs/misc: Add a workunit for ACL
+2bd7cee client: trim unused inodes before reconnecting to recovering MDS
+95ee699 client: Replace client readahead logic with Readahead
 ad45d8c librbd: Add read-ahead
 830373e osdc: Fix ObjectCounter::_readx perf counters
 4809418 doc: Change listed RBD cache defaults to actual defaults
-4089684 (origin/wip-decode-dump) msg: hexdump bad messages at debug level 1 (not 30)
+4089684 msg: hexdump bad messages at debug level 1 (not 30)
 5495570 msg: hexdump messages we can't decode at configurable debug level
 7e13ac8 rgw: Export user stats in get-user-info Adminops API
 480c372 Revert "crushtool: safeguard for missing --num-rep when --test"
-288f05a (origin/wip-9189) mds: use new Session::human_name in health msgs
+288f05a mds: use new Session::human_name in health msgs
 6320e53 mds: implement const SessionMap::get_session
 32b51bb mds: generate friendly name for client sessions
 920cac9 ceph_fuse: include mount point in client metadata
@@ -3353,9 +6750,9 @@ cd215c7 messages: add cephfs client metadata to MClientSession
 0769310 PGLog::claim_log_and_clear_rollback_info: fix rollback_info_trimmed_to
 07f54f2 PG::find_best_info: let history.last_epoch_started provide a lower bound
 92cfd37 PG::choose_acting: let the pg go down if acting is smaller than min_size
-1f450fa (origin/wip-xattr-fix) client: request xattrs if xattr_version is 0
+1f450fa client: request xattrs if xattr_version is 0
 c837fb9 mds: return xattrs when client explictly requests xattrs
-681a49c (origin/wip-9435) mon: forbid tier changes when in use by FS
+681a49c mon: forbid tier changes when in use by FS
 80441cd mon: prevent cache pools being used CephFS
 86a4bed FileStore: Race condition during object delete is fixed
 10b8966 crushtool: safeguard for missing --num-rep when --test
@@ -3375,13 +6772,13 @@ daf6379 mailmap: correcting Zhiqiang Wang's mailmap and org affiliation
 1a8b91b doc: Add keyvaluestore config description
 bb49547 KeyValueStore: Reduce redundancy set_header call
 306fb2f mds: set new inode's xattr version to 1
-1b9226c (origin/wip-9219-giant) osd: subscribe to the newest osdmap when reconnecting to a monitor
-56ba341 (origin/wip-objecter-cancel) osdc/Objecter: fix command op cancellation race
+1b9226c osd: subscribe to the newest osdmap when reconnecting to a monitor
+56ba341 osdc/Objecter: fix command op cancellation race
 baf7be9 osdc/Objecter: cancel timeout before clearing op->session
-1149639 (origin/wip-xfs-inode64) ceph-disk: mount xfs with inode64 by default
+1149639 ceph-disk: mount xfs with inode64 by default
 ded1b30 erasure-code: preload fails if < 0
-27208db (origin/wip-doc-preflight) doc: Added feedback.
-a140439 (origin/wip-9284) mds: limit number of caps inspected in caps_tick
+27208db doc: Added feedback.
+a140439 mds: limit number of caps inspected in caps_tick
 bf590f8 mds: keep per-client revoking caps list
 a6a0fd8 xlist: implement copy constructor
 fd04d5e mds: health metric for late releasing caps
@@ -3396,7 +6793,7 @@ e6062b8 mds: add a health metric for failure to recall caps
 c328486 client: fix crash in trim_caps
 83fd1cf bugfix: wrong socket address in log msg of Pipe.cc
 868b6b9 doc: osd_backfill_scan_(min|max) are object counts
-cdb7675 (origin/wip-rbd-objectcacher-hang) rbd: ObjectCacher reads can hang when reading sparse files
+cdb7675 rbd: ObjectCacher reads can hang when reading sparse files
 ddd52e8 init-radosgw.sysv: Support systemd for starting the gateway
 d32b428 doc: Added bucket management commands to ops/crush-map
 d446a65 documentation: jerasure plugin is sub section of erasure code profile
@@ -3404,28 +6801,28 @@ d446a65 documentation: jerasure plugin is sub section of erasure code profile
 75f0fb2 documentation: erasure code profile update
 b8a1ec0 doc: fixes a formatting error on ops/crush-map
 c3c6468 mds: update segment references during journal rewrite
-a8c943a (origin/wip-9445) log: add simple test to verify an internal SEGV doesn't hang
-2313ce1 (origin/wip-client-msg-leak) client: fix a message leak
+a8c943a log: add simple test to verify an internal SEGV doesn't hang
+2313ce1 client: fix a message leak
 e3fe18a global/signal_handler: do not log if SEGV originated inside log code
 558463e log: add Log::is_inside_log_lock()
-386f2d7 (origin/wip-9427-rewrite) mds: update segment references during journal rewrite
-d41c3e8 (origin/wip-9307) rgw: push hash calculater deeper
+386f2d7 mds: update segment references during journal rewrite
+d41c3e8 rgw: push hash calculater deeper
 28d4460 DBObjectMap: restructure map_header_in_use locking
 0a1abac osd_types: fix pg_log_t tail dump
 4d3579d doc: Added clarifications and added additional preflight steps for CentOS/RHEL.
 1c8485e doc: Deleted redundant text string.
 ce7b2ec erasure-code: fix erasure_code_benchmark goop (decode)
-9ba4e78 (origin/wip-9446) mon: fix MDS health detail output
-2b45bfa (origin/majianpeng-fix3) buffer: Add a test for bufferlist::rebuild_page_aligned
+9ba4e78 mon: fix MDS health detail output
+2b45bfa buffer: Add a test for bufferlist::rebuild_page_aligned
 c268400 buffer: In rebuild_page_aligned for the last ptr is page aligned, no need call rebuild().
-bccb0eb (origin/wip-always-create-pidfile) daemons: write pid file even when told not to daemonize
+bccb0eb daemons: write pid file even when told not to daemonize
 aa5234e doc: Add file system osd config settings Add documentation for osd mkfs and osd mount options.
 f54d5c7 rbd-replay: Add code documentation
 8d23e8d rbd-replay: Add --dump-perf-counters option
 675a6a6 rbd-replay: Add rbd-replay-many
 398ebc0 librbd: Add missing tracepoints
 3911354 rbd-replay: Add --anonymize flag to rbd-replay-prep
-0e0a8d4 (origin/wip-7767-b) msg: prefetch data when doing recv()
+0e0a8d4 msg: prefetch data when doing recv()
 16bd457 ReplicatedPG: cancel cb on blacklisted watcher
 b7bdb93 erasure-code: fix erasure_code_benchmark goop
 11082f7 OpTracker: Race condition removed while dumping ops through admin socket
@@ -3434,27 +6831,27 @@ b7bdb93 erasure-code: fix erasure_code_benchmark goop
 3fac790 OpTracker: Sharding logic is implemented to improve performance
 2fbe82d doc: Fixed syntax error.
 7dbf750 doc: Updated authentication notes. Fixed syntax error.
-7281638 (origin/wip-mds-beacon) mds: sleep in progress thread if laggy and waiting_for_nolaggy waiters
+7281638 mds: sleep in progress thread if laggy and waiting_for_nolaggy waiters
 6fb5769 mds/Beacon: do not reconnect to mon in quick succession
-4ad5db0 (origin/wip-client-ll-ref) client: include ll_ref when printing inode
-cf70b90 (origin/wip-7934) test: Fix ceph_test_rados_watch_notify to delete the pools it creates
+4ad5db0 client: include ll_ref when printing inode
+cf70b90 test: Fix ceph_test_rados_watch_notify to delete the pools it creates
 2e4e98b ReplicatedPG: Make perfcounter record the read-size for  async-read.
 cf34e00 ReplicatedPG: record correctly subop for perfcounter.
-117de00 (origin/wip-9413-erasure-code-version-check) erasure-code: mon, osd etc. depend on the plugins
+117de00 erasure-code: mon, osd etc. depend on the plugins
 5a05e6b [rgw][s3] Allow colon ':' in access key
-f05c977 (origin/wip-pool-ls) mon: add 'osd pool ls [detail]' command
-46bbe30 (origin/wip-osdc-leak) osdc/Objecter: fix leak of MStatfsReply
+f05c977 mon: add 'osd pool ls [detail]' command
+46bbe30 osdc/Objecter: fix leak of MStatfsReply
 1ff94cc EC-ISA: avoid usage of table cache lock outside the class implementation by introducing the setEncodingTable/setEncodingCoefficient methods
 5b41dac EC-ISA: add intelligent table cache
-83bd343 (origin/wip-replay-locking) mds: fix replay locking
+83bd343 mds: fix replay locking
 3448d85 Cache tiering: do not allow flush/evict when there are requests pending
 b40cce7 osd: set min_read_recency_for_promote to default 1 when doing upgrade
 d734600 Change CrushWrapper::crush to private
-5a784cd (origin/wip-9241) osdc/Objecter: drop bad session nref assert
+5a784cd osdc/Objecter: drop bad session nref assert
 3c6e888 osd/ClassHandler: fix build
 d165238 FileStore: report l_os_j_lat as commit latency
-70ce400 (origin/wip-9365) osd/ClassHandler: improve error logging
-2305b28 (origin/wip-9362) osdc/Objecter: revoke rx_buffer on op_cancel
+70ce400 osd/ClassHandler: improve error logging
+2305b28 osdc/Objecter: revoke rx_buffer on op_cancel
 f295c1f ceph_test_rados_api_io: add read timeout test
 977d289 ceph_test_rados_api_*: expose nspace
 1349383 Objecter::_recalc_linger_op: resend for any acting set change
@@ -3464,7 +6861,7 @@ f917166 erasure-code: refactor jerasure Liberation parameters checks
 937d2b0 Revert "ReplicatedPG:start_flush send a second delete"
 78d7499 Revert "osd/ReplicatedPG: avoid dereferencing iterator at end()"
 3578b11 ReplicatedPG: create max hitset size
-e9b09e8 (origin/wip-9381-erasure-code-rpm) packaging: add all erasure code plugins to RPM packages
+e9b09e8 packaging: add all erasure code plugins to RPM packages
 c5bafae erasure-code: Avoid k/m that we set are not equal in erasure-plugin.
 9606232 KeyValueStore: Fix scrub causing inconsistence state
 ae05edd ObjectStore Test: Add test for simulating scrub behavior
@@ -3474,40 +6871,40 @@ c664179 KeyValueStore: Fix upgrade from non-exist superblock OSD version error
 2f2c37f mailmap: Marco Garcês affiliation
 0204998 mailmap: Erik Logtenberg affiliation
 284fb49 mailmap: François Lafont affiliation
-6307536 (origin/wip-yasm) configure: do not try yasm on x32
+6307536 configure: do not try yasm on x32
 d5777c4 doc: Fixed broken links and clarified terms for new authentication docs.
 e2de11a doc: Minor cleanup.
 9ab46dc debian: only B-R yasm on amd64
 836a670 doc/release-notes: v0.85
-591a179 (origin/wip-doc-qemu) doc: Added more robust usage example for uid and conf for QEMU.
+591a179 doc: Added more robust usage example for uid and conf for QEMU.
 b178e97 doc: Clean up syntax to suppress warnings.
 4ad1106 doc: Clean up syntax.
-7531b28 (origin/wip-9366) lttng: Fix 'make tag' when configured --without-lttng
-1c34601 (origin/wip-paxos-dout) mon/Paxos: make is_readable debug output show result
+7531b28 lttng: Fix 'make tag' when configured --without-lttng
+1c34601 mon/Paxos: make is_readable debug output show result
 62ca27d mon/Paxos: don't spam log with is_readable at dout level 1
-a0c2284 (tag: v0.85) 0.85
+a0c2284 0.85
 f0e0c31 Include types.h after stdint.h to accomodate for older distributions. This fixes compilation on CentOS 5.
-9254792 (origin/wip-rbd-force-write-back) rbd should use write-back when caching is enabled
+9254792 rbd should use write-back when caching is enabled
 0fb3e52 correct error of repeatedly adding io_size to off in do_bench_write
-4fc9fff (origin/wip-6494) Enforce cache size on read requests
+4fc9fff Enforce cache size on read requests
 f0a4a2f tests: flush logs before grepping them
 03aa113 erasure-code: init function is extern "C"
 484fb85 doc: Update logging path to reflect log rotate script default path.
 f0a1d28 doc: Add a contributing file at the root of repo
 4673242 rgw: add .log to default log path
-6fac815 (origin/wip-formatter-va) formatter: clean up dump_format()
+6fac815 formatter: clean up dump_format()
 df3c70a qa: scripts to help test ceph-qa-suite
-2280736 (origin/wip-9358) osdc/Journaler: fix try_read_entry()/wait_for_readable() locking
-94173a3 (origin/wip-9282) mds: warn clients which aren't revoking caps
+2280736 osdc/Journaler: fix try_read_entry()/wait_for_readable() locking
+94173a3 mds: warn clients which aren't revoking caps
 d8e672f rgw: Implementation for S3 Get Bucket Location
 9e503b5 test/formatter: Add tests for dump_format_ns
 eaa2bb0 rgw_formats: dump_format_ns implementation
 2fb51b2 Formatter: add a dump_format_ns to base Formatter
-14d87bd (origin/xiaoxichen-refactor_lrc_crush) Refactor ErasureCodeLrc::create_ruleset
-eae88da (origin/wip-8648) mds: clear objects' dirty flags after log segment is expired
+14d87bd Refactor ErasureCodeLrc::create_ruleset
+eae88da mds: clear objects' dirty flags after log segment is expired
 69638df doc: fix missing bracket
 35663fa doc: attempt to get the ayni JS into all head tags
-409c955 (origin/wip-SCA-20140902) OSDMonitor.cc: fix potential division by zero
+409c955 OSDMonitor.cc: fix potential division by zero
 7b77210 objectstore/store_test.cc: fix unintentional integer overflow
 6b70483 os/HashIndex.cc: fix unintentional integer overflow
 f29e5b3 mount.ceph.c: free memory before return
@@ -3515,18 +6912,18 @@ f29e5b3 mount.ceph.c: free memory before return
 ece990d rados_list_parallel.cc: cleanup before return from run()
 135ccad erasure-code: warning fix
 7de8d17 FileStore: Remove unused code.
-41df414 (origin/wip-mds-lock) mds: don't take mds_lock when doing journal operations
+41df414 mds: don't take mds_lock when doing journal operations
 7ccabbf librbd.cc: add check to prevent null deref in rbd_list()
-4f35714 (origin/wip-9311) Parallelize RBD import/export
+4f35714 Parallelize RBD import/export
 f53bf53 PendingIO.cc: use static_cast instead of c-style cast
 cbd324d FileStore.cc: use static_cast instead of c-style cast
-ca6eb61 (origin/wip-9026) client: vxattr support for rstat
-c2443b9 (origin/wip-doc-authentication) doc: Provided additional detail on daemon-specific caps syntax.
+ca6eb61 client: vxattr support for rstat
+c2443b9 doc: Provided additional detail on daemon-specific caps syntax.
 8569b93 doc: Updated Keyring settings from comments and ceph-deploy defaults.
 b6a97d8 documentation: disk thread ioprio options
 1b9e670 systemd support in ceph-disk activate
 6ec08e0 client/Client.cc: fix null pointer checks for 'in'
-c8d2388 (origin/wip-9151) mon: add MDS metric metadata to health detail
+c8d2388 mon: add MDS metric metadata to health detail
 9ee8015 mds: add thread to progress queues outside dispatch
 1058a9e mon: handle beacon health metrics in MDSMonitor
 0fc8a0d mds: populate health metrics in beacon
@@ -3539,8 +6936,8 @@ f5e7a4b mds: use HeartbeatMap to control beacons
 e69f321 include/CompatSet: make merge const()
 46c94e9 mds/CInode.h: prefer ++operator for non-primitive iterators
 a5468ab ceph_objectstore_tool.cc: prefer ++operator for non-primitive iterators
-4d9927e (origin/wip-session-stats) mds: cap and lease count in `session ls` asok
-ea2a8d0 (origin/wip-request-count) mds: Fix Session::get_request_count
+4d9927e mds: cap and lease count in `session ls` asok
+ea2a8d0 mds: Fix Session::get_request_count
 15df8ac rbd_replay/Replayer.cc: pass const string parameter by reference
 9d926f1 ReplicatedPG.cc: prefer ++operator for non-primitive iterators
 dae0593 IndexManager.cc: prefer ++operator for non-primitive iterators
@@ -3554,7 +6951,7 @@ a754ce5 ErasureCodeLrc.cc: fix -Wmaybe-uninitialized compiler warning
 16cbaba osd/PGLog.h: prefer ++operator for non-primitive iterators
 8f368c5 mailmap: Ashish Chandra affiliation
 5fd50c9 mailmap: Boris Ranto affiliation
-a5b4c58 (origin/wip-9309) lockdep: increase max locks (1000 -> 2000)
+a5b4c58 lockdep: increase max locks (1000 -> 2000)
 9fac072 documentation: add the mark_unfound_lost delete option
 bec3032 osd: MissingLoc::get_all_missing is const
 e13ddc7 tests: qa/workunits/cephtool/test.sh early fail
@@ -3562,12 +6959,12 @@ fc499aa mailmap: add .peoplemap
 177202e erasure-code: lowercase LRC plugin name
 4c9fdbf common/LogEntry: fix warning on x86_64
 a24c8ba common/LogEntry: fix warning on i386
-acfe62e (origin/wip-aio-journal) Revert "os/FileJournal: For journal-aio-mode, don't use aio when closing journal."
+acfe62e Revert "os/FileJournal: For journal-aio-mode, don't use aio when closing journal."
 c776a89 os/FileJournal: stop aio completion thread *after* writer thread
-fa45ed8 (origin/wip-9285) osd/ReplicatedPG: do not evict blocked objects
-ded1cf4 (origin/wip-9294) osd/ReplicatedPG: avoid dereferencing iterator at end()
+fa45ed8 osd/ReplicatedPG: do not evict blocked objects
+ded1cf4 osd/ReplicatedPG: avoid dereferencing iterator at end()
 d20432e erasure-code: preload LRC plugin
-982c9dd (origin/wip-hitset-bytes) osd/ReplicatedPG: adjust avg_size calculation in agent_choose_mode
+982c9dd osd/ReplicatedPG: adjust avg_size calculation in agent_choose_mode
 3fc6a73 osd/ReplicatedPG: calculate num_user_bytes
 6ce36b0 osd/ReplicatedPG: scrub and repair hit_set_archive bytes
 3409c38 osd/ReplicatedPG: account for hit_set_archive bytes
@@ -3585,17 +6982,17 @@ c2ca011 erasure-code: delegate chunk remapping to the plugin
 b517ca4 erasure-code: override the default only if the plugin match
 4670d9e erasure-code: relax profile parameter syntax
 e2e07d9 erasure-code: add ErasureCodeInterface::get_coding_chunk_count
-9fc2684 (origin/wip-9281) tools: use cout instead of cerr in journal tool
-a994145 (origin/wip-mds-recover) mds/RecoveryQueue: do not start prioritized items synchronously
-e0b19e3 (origin/wip-hadoop) qa: fix+cleanup hadoop wordcount test
+9fc2684 tools: use cout instead of cerr in journal tool
+a994145 mds/RecoveryQueue: do not start prioritized items synchronously
+e0b19e3 qa: fix+cleanup hadoop wordcount test
 76b8e57 erasure-code: preload the default plugins in the mon
 c3e1466 Test: fixing a compile warning in ceph_objectstore_tool.cc
 1b42726 Cache tiering: use local_mtime to calculate the age of objects during evicting
 335c1f7 doc: Added rbd-replay-prep and rbd-replay manpages to block device TOC.
 b965398 doc: Fixed broken hyperlink.
 7948e13 doc: Added sysctl max thread count discussion.
-a257291 (origin/wip-9266) client: fix dispatcher ordering (broken fuse)
-ce29b76 (origin/wip-8231-forreview) Fix ceph_scratchtoolpp to create and delete its own pool
+a257291 client: fix dispatcher ordering (broken fuse)
+ce29b76 Fix ceph_scratchtoolpp to create and delete its own pool
 a03f719 ceph_objectstore_tool: Bug fixes and test improvements
 23ec93a ceph_objectstore_tool, test: Implement import-rados feature and unit test code
 b193812 tools/rados: Improve trigger for op_size rounding info message
@@ -3638,12 +7035,12 @@ dc6bd3b mds/Locker: if a client asks for file caps, prioritize file recovery
 bd3c8bc mds/RecoveryQueue: add method to prioritize a file recovery; fix logging
 8f4ea81 mds: change mds_max_file_recover from 5 -> 32
 0747ca7 mds: make max file recovers configurable
-848fcf7 (origin/wip-8718) rgw: don't try to authenticate a CORS preflight request
+848fcf7 rgw: don't try to authenticate a CORS preflight request
 e440fb4 test/mon/*: prime mon with initial command before injection
-7a93258 (origin/wip-mds-namespace) mds: remove `using namespace std` in headers
+7a93258 mds: remove `using namespace std` in headers
 fc89c27 messages: remove `using namespace std` in headers
 e5930a3 tests: reduce osd-crush.sh execution time
-b059bcd (origin/wip-jerasure-upgrade) erasure-code: assert the PluginRegistry lock is held when it must
+b059bcd erasure-code: assert the PluginRegistry lock is held when it must
 3c4220e erasure-code: add Ceph version check to plugins
 d4ea682 erasure-code: implement ErasureCodePluginRegistry::remove
 c02c41c erasure-code: wrap lines for emacs compile command
@@ -3656,8 +7053,8 @@ e9a05b8 doc: Update docs about OSD benchmark syntax
 a087f03 Revert "doc: Update docs about OSD benchmark syntax"
 36f15c3 doc: Update docs about OSD benchmark syntax
 ae3d873 PG::can_discard_op: do discard old subopreplies
-97e4e11 (origin/wip-9148) civetweb: update submodule
-1434e5c (origin/wip-blacklist-mds) mds: restart on -EBLACKLISTED
+97e4e11 civetweb: update submodule
+1434e5c mds: restart on -EBLACKLISTED
 bb11936 mon: flush paxos write on shutdown
 b1cf210 mon/Paxos: WRITING != WRITING_PREVIOUS
 96030d6 mon: _reset() before updating state
@@ -3675,7 +7072,7 @@ bb50371 mon/MonitorDBStore: drop useless ctor
 a6a1e99 mon: interact with MonitorDBStore::Transactions by shared_ptr Ref
 cfc6830 common/Finisher: fix perfcounter
 97f9b6d mon/OSDMonitor: fix double-free on old MOSDBoot
-06b92ce (origin/wip-rewrite-PR2189) Move fedora patch file (used by ceph.spec.in) to rpm/ subdir
+06b92ce Move fedora patch file (used by ceph.spec.in) to rpm/ subdir
 7474f72 ceph.spec.in: tests for rhel or centos need to not include _version
 e37b262 ceph.spec.in: Add a small comment on the empty %files section
 875a99e ceph.spec.in: Obsolete all older versions.
@@ -3687,7 +7084,7 @@ e9da2d8 ceph.spec.in: Add obsoletes for libcephfs
 6c264f2 ceph.spec.in: add ceph-libs-compat
 62d74b3 civetweb: update submodule
 11acb70 rgw: convert header field underscores into dashes
-06ec5ef (origin/wip-7988) test: common: test_config: whitelist '$channel' meta-variable
+06ec5ef test: common: test_config: whitelist '$channel' meta-variable
 8a9831f osd/mds/monc: Adjust for LogClient/LogChannel usage
 bb9d4ca mon: LogMonitor: debug update_from_paxos
 8f9e105 mon: LogMonitor: expand meta variables at time-of-call
@@ -3710,43 +7107,43 @@ b1af4bd common: str_map: add helper methods to get values from maps
 0d2a6c7 common: LogEntry: change field name from 'type' to 'prio' (aka priority)
 f25bca3 systemd: ceph-mds and ceph-mon need networking too
 c6f4b42 sample.ceph.conf: some updates
-2b13de1 (origin/wip-8863) osd/PG: fix crash from second backfill reservation rejection
-5a63215 (origin/wip-reweight) mon/OSDMonitor: make reweight_by_* output less misleading
+2b13de1 osd/PG: fix crash from second backfill reservation rejection
+5a63215 mon/OSDMonitor: make reweight_by_* output less misleading
 146e60d osd: Remove extra call to release_op_ctx_locks in eval_repop. It is called in remove_repop.
 3bfbc14 test/mon/mkfs.sh: fix test
-c2f21c0 (origin/wip-9218) osd: fix osd_tp shutdown
+c2f21c0 osd: fix osd_tp shutdown
 8346e10 PG: mark_log_for_rewrite on resurrection
 cb4c5e8 OSD::session_notify_pg_create: requeue at the start of the queue
-3c847c5 (origin/wip-config-diff) common: ceph_context: add admin socket command 'config diff'
+3c847c5 common: ceph_context: add admin socket command 'config diff'
 ef51160 common: config: let us obtain a diff between current and default config
 4b8b25e tests: histogram prevent re-use of local variables
-a525bf3 (origin/wip-9054) ReplicatedPG:start_flush send a second delete
+a525bf3 ReplicatedPG:start_flush send a second delete
 ee02293 tests: histogram prevent re-use of local variables
 66c7439 ReplicatedPG::start_flush: remove superfluous loop
 0416b88 update license for libcephfs
 e6da732 PG: recover from each osd at most once
 8a1723f PG: make the reservation sets more descriptively named
-bf3e483 (origin/wip-9238) mds: fix FP error in ROUND_UP_TO
+bf3e483 mds: fix FP error in ROUND_UP_TO
 4672e50 osd/OSDMap: encode blacklist in deterministic order
 a15ad38 vstart: start rgw on port specified by env var
 dbe6c79 don't update op_rw_rlatency/op_w_rlatency when rlatency is zero
 f3bf246 fix wrong value of op_w_latency perf counter
 501dd3c ceph_filestore_dump: Export omap in batches for large omap case
 398b418 ceph_filestore_dump: Remove unused bufferlist databl
-9449520 (origin/wip-9209-round-up-to) common: ROUND_UP_TO accepts any rounding factor
+9449520 common: ROUND_UP_TO accepts any rounding factor
 df3b6fc Fix syntax error in rst documentation
 cef1299 doc: Added links. Clarified namespace comments. Added limitations.
 d817a6a doc: Minor changes.
 16d946b qa/workunits/hadoop/wordcount: remove bogus rm -r
 5db51d6 doc: Added a few comments and links to other relevant docs.
-751b3e2 (origin/wip-9226) rgw: fix test to identify whether object has tail
-c7e1b9e (origin/wip-9221) ceph_test_rados_api_tier: make PromoteOn2ndRead test tolerate retries
+751b3e2 rgw: fix test to identify whether object has tail
+c7e1b9e ceph_test_rados_api_tier: make PromoteOn2ndRead test tolerate retries
 73733dd documentation: update recovery config options defaults
 97b1916 CollectionIndex: Collection name is added to the access_lock name
 3e85041 rgw: admin ops create user API can not determine existing user
 f7ca1bf Update python.rst
-f651f93 (origin/wip-9079) msg/Accepter: do not unlearn_addr on bind()
-a67421a (origin/wip-objecter) osd: update handle_osd_map call
+f651f93 msg/Accepter: do not unlearn_addr on bind()
+a67421a osd: update handle_osd_map call
 ef44292 common/Timer: fix deadlock in RWTimer::shutdown
 10efe17 osdc/Objecter: fix op_cancel on homeless session
 28110ff osdc/Objecter: hold session ref longer in resend
@@ -3863,25 +7260,25 @@ bcc69ed mds: adapt to new objecter interface
 74ce4f2 rbd: fix signess warning
 f7c0001 common: remove spurious uint32_t in buffer.c
 6ad8e61 Fix FTBFS on alpha due to incorrect check on BLKGETSIZE
-2554243 (origin/wip-mds-pc) mds/Server: rename perfcounters
+2554243 mds/Server: rename perfcounters
 b0cc869 mds: rename a bunch of metrics
 31ef1a9 mds: set l_mds_req on client request
 06682c4 vstart.sh: debug rgw = 20 on -d
-00c677b (origin/wip-civetweb-log) rgw: use a separate callback for civetweb access log
+00c677b rgw: use a separate callback for civetweb access log
 850242c rgw: separate civetweb log from rgw log
-f246b56 (origin/wip-weak-refs) common/shared_cache: dump weak refs on shutdown
+f246b56 common/shared_cache: dump weak refs on shutdown
 6cf583c common/shared_cache: take a cct
 2c27485 Doc: Add config option to turn off crush update
 78e84f3 CrushWrapper: pick a ruleset same as rule_id
 1835778 needs network or it will not start if enabled
 8c69054 osd: make coll_t::META static to each file
-493577b (origin/wip-9173) mds: fix crash killing sessions without conn
+493577b mds: fix crash killing sessions without conn
 dcf8c03 mds: logging in SessionMap
 01ce249 Revert "Merge pull request #2253 from adamcrume/wip-lttng"
-3ced97b (origin/wip-9176) mon: make dispatch(), _ms_dispatch() void
+3ced97b mon: make dispatch(), _ms_dispatch() void
 610f4be mon: always process the message in dispatch
 19df386 mon: fix occasional message leak after session reset
-b245d60 (origin/wip-fingerprint) mon: generate cluster_fingerprint if null
+b245d60 mon: generate cluster_fingerprint if null
 675b004 mon: add a cluster fingerprint
 552c4b4 rbd_replay: Add unit test for batch_unreachable_from
 7616092 rbd-replay: Add unit test for Ser
@@ -3964,10 +7361,10 @@ ae59946 lttng: Trace OpRequest
 3ac99e3 lttng: add pg and osd tracepoints
 7fa513e lttng: trace mutex::unlock
 115cfb3 tracing: bootstrap lttng-ust with mutex events
-e870fd0 (origin/wip-filejournal) os/FileJournal: For journal-aio-mode, don't use aio when closing journal.
+e870fd0 os/FileJournal: For journal-aio-mode, don't use aio when closing journal.
 a66a493 os/FileJournal: Only using aio then alloc the related resources.
 c8e2b89 os/FileJournal: Tune the judge logic for read_header.
-3ed8c68 (origin/wip-9153-jerasure-upgrade) erasure-code: do not preload the isa plugin
+3ed8c68 erasure-code: do not preload the isa plugin
 4c2ae69 add pom.xml so to deploy the libcephfs to maven repository. to build a jar, version=0.80-rc1-2008-gf71c889 mvn package -Dversioin=; and mvn deploy command will deploy the jar to maven central
 e45f5c2 TrackedOp:_dump_op_descriptor is renamed to _dump_op_descriptor_unlocked
 f680a24 TrackedOp: Removed redundant lock in OpTracker::_mark_event()
@@ -3980,8 +7377,8 @@ e665e62 Rebased and changed debug option
 615d2d9 CollectionIndex: Collection name is added to the access_lock name
 9b80270 erasure-code: preload the jerasure plugin
 fc41273 mon: fix signed/unsigned warnings
-1441ffe (origin/wip-8587) rgw: subuser creation fixes
-82409ee (origin/wip-reweight-tunables) mon: make reweight-by-* sanity limits configurable
+1441ffe rgw: subuser creation fixes
+82409ee mon: make reweight-by-* sanity limits configurable
 3304841 mon/OSDMonitor: respect CRUSH weights for reweight-by-pg
 1ecf44e mon/OSDMonitor: reweight-by-pg for pool(s)
 8b971e9 mon/OSDMonitor: adjust weights up, when possible
@@ -3997,9 +7394,9 @@ da37273 Add a new field 'expected_num_objects' to pg_pool_t which denotes the ex
 ab886c4 doc: Removed quick guide and wireshark from top-level IA.
 acee2e5 doc: Move wireshark documentation to dev.
 ce6e9a9 doc/release-notes: v0.84
-b016f84 (origin/wip-fs-docs) doc: add notes on using "ceph fs new"
-948178a (origin/wip-mon-empty-store) ceph_mon: check for existing mon store before opening db
-8336f81 (tag: v0.84) 0.84
+b016f84 doc: add notes on using "ceph fs new"
+948178a ceph_mon: check for existing mon store before opening db
+8336f81 0.84
 bda2301 qa/workunits/rbd/qemu-iotests: touch common.env
 1dc1fb8 qa/workunits/hadoop: move all hadoop tests into a hadoop/ dir
 3d3fcc9 qa/workunits/hadoop-wordcount: fix/use -rmr command
@@ -4019,7 +7416,7 @@ cc3b5ad mailmap: Abhishek Lekshmanan affiliation
 3279f3e qa/workunits/rest/test.py: do snap test on our data2/3 pool
 6d7a229 qa/workunits/rest/test.py: fix rd_kb -> rd_bytes
 0e07f7f osd: fix theoretical use-after-free of OSDMap
-904a5f1 (origin/wip-misplaced) vstart.sh: make filestore fd cache size smaller
+904a5f1 vstart.sh: make filestore fd cache size smaller
 932e478 mon: track stuck undersized
 190dc2f mon: track pgs that get stuck degraded
 5168907 osd: track last_fullsized in pg_stat_t
@@ -4033,14 +7430,14 @@ d734d7f mon: warn about misplaced objects, just like degraded
 a314999 osd: num_objects_misplaced
 14614e0 qa/workunits/rest/test.py: fix 'df' test to use total_used_bytes
 93c5b25 Revert "os/FileJournal: Update the journal header when closing journal"
-29e93f7 (origin/wip-9144) os/FileStore: rename start_sync() -> do_force_sync()
+29e93f7 os/FileStore: rename start_sync() -> do_force_sync()
 dd11042 os/FileStore: fix mount/remount force_sync race
 0395914 mailmap: Loic Dachary affiliation
-c83c90c (origin/wip-8621) rgw: update civetweb submodule
-0d6d1aa (origin/wip-init-ceph) init-ceph: don't use bashism
+c83c90c rgw: update civetweb submodule
+0d6d1aa init-ceph: don't use bashism
 7df67a5 Fix -Wno-format and -Werror=format-security options clash
-ae0b9f1 (origin/wip-osd-mon-feature) osd: fix feature requirement for mons
-0db3e51 (origin/wip-9119) ReplicatedPG::maybe_handle_cache: do not forward RWORDERED reads
+ae0b9f1 osd: fix feature requirement for mons
+0db3e51 ReplicatedPG::maybe_handle_cache: do not forward RWORDERED reads
 5040413 ReplicatedPG::cancel_copy: clear cop->obc
 2f0e295 unittest_osdmap: test EC rule and pool features
 0b27610 Remove Old Wireshark Dissectors
@@ -4048,9 +7445,9 @@ ae0b9f1 (origin/wip-osd-mon-feature) osd: fix feature requirement for mons
 1d95486 crush: add is_v[23]_rule(ruleid) methods
 b22d693 lttng: Add distro packaging
 6891f4e lttng: Fix "make distcheck"
-c54f1e4 (origin/wip-9053) mon/Paxos: share state and verify contiguity early in collect phase
+c54f1e4 mon/Paxos: share state and verify contiguity early in collect phase
 3e5ce5f mon/Paxos: verify all new peons are still contiguous at end of round
-5c2d232 (origin/wip-9025-chunk-remapping) erasure-code: remap chunks if not sequential
+5c2d232 erasure-code: remap chunks if not sequential
 164cfe8 erasure-code: parse function for the mapping parameter
 298da45 erasure-code: ErasureCodeInterface::get_chunk_mapping()
 240764f rgw: update civetweb submodule
@@ -4065,16 +7462,16 @@ b04d84d shared_cache: pass key (K) by const ref in interface methods
 95ac43f FileStore: remove the fdcache_lock
 a9f76d4 FDCache: implement a basic sharding of the FDCache
 4c2828e shared_cache: expose prior existence when inserting an element
-a1e79db (origin/wip-9039) rgw_admin: add --min-rewrite-stripe-size for object rewrite
+a1e79db rgw_admin: add --min-rewrite-stripe-size for object rewrite
 46d8c97 doc: Add documentation about Wireshark dissector.
 6a55543 rgw: fix compilation
 f6771f2 shared_cache: use a single lookup for lookup() too
-cec40da (origin/historic/old-wireshark-dissectors) qa/workunits/cephtool: verify setmaxosd doesn't let you clobber osds
+cec40da qa/workunits/cephtool: verify setmaxosd doesn't let you clobber osds
 a1c3afb OSDMonitor: Do not allow OSD removal using setmaxosd
 16a4360 rgw: pass set_mtime to copy_obj_data()
 800eff2 rgw: copy_obj_data() uses atomic processor
 5d3a7e5 rgw: copy object data if target bucket is in a different pool
-aec684b (origin/wip-9005) add calamari to the api/index section
+aec684b add calamari to the api/index section
 ac70490 doc: update kernel recommendations (avoid 3.15!)
 5374386 doc: Added user management link to quick start.
 5e8eae7 doc: Removed cephx intro. Moved details to user management, config, and architecture.
@@ -4083,13 +7480,13 @@ aac6aa2 doc: Removed auth intro and auth docs and added user management to index
 4c651b7 doc: Restructured auth configuration reference.
 3faf37a doc: Put architectural details of authentication in to architecture doc.
 118ae72 doc: Created a new User Management doc to replace authentication.
-0a49db8 (origin/wip-9062) msg/PipeConnection: make methods behave on 'anon' connection
+0a49db8 msg/PipeConnection: make methods behave on 'anon' connection
 8512904 lttng: Support --with-lttng=check
 bb046ed mon/Paxos: put source mon id in a temp variable
-d74d3f1 (origin/wip-8725) mds/MDSMap: fix incompat version for encoding
+d74d3f1 mds/MDSMap: fix incompat version for encoding
 369c639 mds/MDSMap: drop trailing else in favor of early return
-b2c1fa8 (origin/wip-9087) test/system/systest_runnable.cc: debugging on start and end
-d74ed9d (origin/wip-9102) ceph-disk: linter cleanup
+b2c1fa8 test/system/systest_runnable.cc: debugging on start and end
+d74ed9d ceph-disk: linter cleanup
 3efa30d lttng: Remove tracing from libcommon
 520b75b rbd-replay: Fix bug in rbd-replay-prep, Thread::issued_io with wrong IO
 89c3860 rbd-replay: Remove extent tracepoints and inline extents
@@ -4157,13 +7554,13 @@ d5b16e4 lttng: Trace OpRequest
 c5687b1 mon: fix potential divide by zero on can_mark_{down,out}
 f81d2b0 mon: fix divide by zero when pg_num adjusted and no osds
 8a647f7 mon: fix potential divide by zero on can_mark_{down,out}
-5ed9f4e (origin/wip-9029) mds: Revert from mds_mksnap_ setting to mds_snap_ settings
-6f7798e (origin/wip-ceph-disk) ceph-disk: warn about falling back to sgdisk (once)
+5ed9f4e mds: Revert from mds_mksnap_ setting to mds_snap_ settings
+6f7798e ceph-disk: warn about falling back to sgdisk (once)
 b1651af ceph-disk: only fall back to sgdisk for 'list' if blkid seems old
 b75e8a3 ceph-disk: add get_partition_base() helper
 c7a1ceb ceph-disk: display information about dmcrypted data and journal volumes
 5be56ff osd/ReplicatedPG: only do agent mode calculations for positive values
-7b3714c (origin/wip-9096) osd: fix some line wrapping
+7b3714c osd: fix some line wrapping
 df945a9 osd: fix require_same_peer_instance from fast_dispatch
 3d7e2b3 osd: inline require_osd_up_peer
 e86fdef rgw: move generic server usage after all options
@@ -4172,7 +7569,7 @@ f80ed26 ceph-disk: move fs mount probe into a helper
 6c77f5f ceph-disk: use partition type UUIDs, and blkid
 ea90d9f Revert "ReplicatedPG: do not pass cop into C_Copyfrom"
 300b5e8 ReplicatedPG: do not pass cop into C_Copyfrom
-24aeca9 (origin/wip-9064) ReplicatedPG::maybe_handle_cache: do not skip promote for write_ordered
+24aeca9 ReplicatedPG::maybe_handle_cache: do not skip promote for write_ordered
 984f614 erasure-code: isa plugin must link with ErasureCode.cc
 1088d6c ceph-disk: fix log syntax error
 41e4461 doc/changelog: v0.67.10 notes
@@ -4184,18 +7581,18 @@ ea90d9f Revert "ReplicatedPG: do not pass cop into C_Copyfrom"
 78dc4df doc: Replace [default] with [global].
 e5324ed doc: Added yum-priorities procedure to manual install.
 a01252a doc: Added priority = 2 to packages.
-3dfa72d (origin/wip-8912) librbd: fix error path cleanup for opening an image
+3dfa72d librbd: fix error path cleanup for opening an image
 2edf01f Revert "Fix for bug #6700"
 d6e6ba1 ceph-disk: fix verify_no_in_use check
 0a2b4c2 rgw: Don't send error body when it's a HEAD request
-b0f5ba9 (origin/wip-rados-df-json) rados: remove {read,write}_kb fields from rados df json output
+b0f5ba9 rados: remove {read,write}_kb fields from rados df json output
 5663f91 rados: fix {read,write}_bytes value
-8180713 (origin/wip-9057) msg/Pipe: do not wait for self in Pipe::stop_and_wait()
+8180713 msg/Pipe: do not wait for self in Pipe::stop_and_wait()
 fd421b2 mon/MonitorDBStore: add get_{keys,bytes}() accounting to Transaction
-d7fb7bf (origin/wip-9055) ceph_test_rados_api_tier: fix cache cleanup (ec too)
+d7fb7bf ceph_test_rados_api_tier: fix cache cleanup (ec too)
 ebbe8aa ceph_test_rados_api: fix cleanup of cache pool
 1d199fb librados/TestCase: inheret cleanup_default_namespace
-5808d6a (origin/wip-9044-use-ruleset) osd: improve ruleno/ruleset consistency
+5808d6a osd: improve ruleno/ruleset consistency
 3f5d86a erasure-code: ErasureCodeIsa::create_ruleset must return a ruleset
 0029a35 erasure-code: ErasureCodeJerasure::create_ruleset must return a ruleset
 04a484a erasure-code: OSDMonitor::crush_ruleset_create_erasure needs ruleset
@@ -4206,7 +7603,7 @@ ebbe8aa ceph_test_rados_api: fix cleanup of cache pool
 4a0c941 OSD: introduce require_self_aliveness(OpRequestRef&,epoch_t) function
 48c9b38 OSD: use OpRequestRef& for a few require_* functions
 f86bf1d OSD: introduce require_up_osd_peer() function for gating replica ops
-ee790e3 (origin/wip-problem-osds) osd/osd_types: s/int/int32_t/ in pg_stat_t
+ee790e3 osd/osd_types: s/int/int32_t/ in pg_stat_t
 d862731 osd/PG: bound number of blocked_by OSDs we report per PG
 bee79ec qa/workunits/cephtool: add trivial 'ceph osd blocked-by' test
 6380f47 qa/workunits/cephtool: add simple 'ceph osd perf' test
@@ -4217,9 +7614,9 @@ d55e973 mon/PGMap: track histogram of pg blocked_by
 ceaca9f osd/PG: set blocked_by during peering GetLog
 6e96c2d osd/PG: set blocked_by during peering GetInfo
 e4b0071 RadosClient: Enable pool existence check
-c34f935 (origin/wip-fsx-flatten) test_librbd_fsx: also flatten as part of randomize_parent_overlap
+c34f935 test_librbd_fsx: also flatten as part of randomize_parent_overlap
 214630b mds: Also check min/max uid on snap removal
-a52a855 (origin/wip-pg-epoch) osd: fix pg epoch floor tracking
+a52a855 osd: fix pg epoch floor tracking
 2120f4b OSD: move waiting_for_pg into the session structures
 71cb4d3 doc: Removed Debian reference from Upstart.
 8357cae doc: Incorporated user feedback to clarify upgrade doc.
@@ -4229,15 +7626,15 @@ b6bf33c doc: Added configuration discussion at end of gateway install with links
 11d6e5a OSD: rename session_waiting_for_map_lock to session_waiting_lock
 20fd714 OSD: wake_pg_waiters outside of the pgmap write_lock, pg_lock
 6e7b86a OSD: fix wake_pg_waiters revert error in _open_lock_pg
-4260767 (origin/wip-8625) osd_types: s/stashed/rollback_info_completed and set on create
+4260767 osd_types: s/stashed/rollback_info_completed and set on create
 d0ccb1c make ceph-disk use the new init flag for cluster
 23b4915 allow passing a --cluster flag to the init script
 6bd2b0f mds: Make min/max UID configurable for who is allowed to create a snapshot
 a5ecf15 powerdns: Update README with better markdown
-bf9726a (origin/xiaoxichen-fix_crush_ruleset) mon/OSDMonitor : Use user provided ruleset for replicated pool
+bf9726a mon/OSDMonitor : Use user provided ruleset for replicated pool
 f1aad8b RadosClient: Fixing potential lock leaks.
-26750fc (origin/wip-filestore-bigxattr) os/FileStore: force any new xattr into omap on E2BIG
-cc3112e (origin/wip-rados-xattr) rados: use STD{IN,OUT}_FILENO for magic values
+26750fc os/FileStore: force any new xattr into omap on E2BIG
+cc3112e rados: use STD{IN,OUT}_FILENO for magic values
 e3819b6 qa/workunits/rados/test_rados_tool: add a few xattr tests
 645c28a rados: optionally read setxattr value from stdin
 59a715a rados: don't add \n to getxattr
@@ -4258,25 +7655,25 @@ ee2dbdb mon/PGMonitor: remove {rd,wr}_kb from pool stat dumps
 adb2791 mon/PGMonitor: add _bytes fields for all usage dumps
 895318c README.md: word wrap
 500b95e README: symlink from README.md
-0114b33 (origin/wip-8496-erasure-code-base-class) erasure-code: rework ErasureCode*::parse methods
+0114b33 erasure-code: rework ErasureCode*::parse methods
 77690f6 erasure-code: move to ErasureCode::decode_concat
 54394fa erasure-code: move to ErasureCode::to_{int,bool}
 b4b7c51 erasure-code: move to ErasureCode::minimum_to_decode*
 4ff981f erasure-code: move to ErasureCode::{encode,decode}{,chunk}
 1ebce98 erasure-code: ErasureCode base class
-859944d (origin/wip-test-ceph-disk) test/osd/osd-test-helpers: mkdir -p for ceph-disk
+859944d test/osd/osd-test-helpers: mkdir -p for ceph-disk
 4b45e25 Add handles for the ceph-mon starting with upstart
 c2f58e6 test/ceph-disk.sh: mkdir -p
 a923e2c Renamed README to README.md to render in markdown
 52cf693 Developer quick start guide
 c09036a enable info_log_level config option for rocksdb
-66de51d (origin/wip-8875) be a bit more explicit about 'ceph-deploy new' in quickstart
+66de51d be a bit more explicit about 'ceph-deploy new' in quickstart
 b4c80e3 Do not make directories by mistake.
 f773b24 powerdns: Define a application variable when not invoked from Shell
 1682e62 doc: typo s/loose/lose
 466aba4 osd/osd_types: add blocked_by to pg_stat_t
 b9b022e add annotation for rocksdb config option
-8dcfbd8 (origin/wip-8998) osd: simplify dout_prefix macros
+8dcfbd8 osd: simplify dout_prefix macros
 80829d7 osd: reorder OSDService methods under proper dout_prefix macro
 047c18d doc/release-notes: make note about init-radosgw change
 354c411 doc: Added 'x' to monitor cap.
@@ -4291,7 +7688,7 @@ e11c3fc erasure-code: rework benchmark suite
 90592e9 erasure-code: properly indent ErasureCodePluginSelectJerasure.cc
 be3e1e4 erasure-code: control jerasure plugin variant selection
 5fb4354 erasure-code: reduce jerasure verbosity
-c7daaaf (origin/wip-8475) erasure-code: implement alignment on chunk sizes
+c7daaaf erasure-code: implement alignment on chunk sizes
 3987ac2 erasure-code: cauchy techniques allow w 8,16,32
 cb54605 mailmap: sort entries
 accf8c2 mailmap: Tommi Virtanen is not with Red Hat
@@ -4304,11 +7701,11 @@ b1ba72f mailmap: George Ryall affiliation
 6652494 mailmap: Accela Zhao affiliation
 794f70a mailmap: Kevin Cox affiliation
 d8e6415 mailmap: Ma Jianpeng affiliation
-3230060 (origin/wip-ceph-conf) ceph-conf: flush log on exit
+3230060 ceph-conf: flush log on exit
 076f33a ECBackend: Don't directly use get_recovery_chunk_size() in RecoveryOp::WRITING state.
-98997f3 (origin/wip-8891) msg/SimpleMessenger: drop msgr lock when joining a Pipe
+98997f3 msg/SimpleMessenger: drop msgr lock when joining a Pipe
 e36babc os/MemStore: fix lock leak
-e93818d (origin/wip-rgw-need-to-wait) rgw: need to pass need_to_wait for throttle_data()
+e93818d rgw: need to pass need_to_wait for throttle_data()
 3de7b7c doc/release-notes: fix syntax error
 c95e91e os/KeyValueStore: clean up operator<< for KVSuperBlock
 1417ede ceph_test_rados_api_tier: test promote-on-second-read behavior
@@ -4325,21 +7722,21 @@ da5edb8 add rocksdb bz2 dep
 3329352 rocksdb: require less shiny autoconf
 bcebf9c do_autogen.sh: build with static librocksdb
 eb1cd78 rocksdb: update submodule makefile
-2e549b4 (origin/wip-ec-isa) unittest_erasure_code_[plugin_]isa: conditionally compile
-04037ef (origin/wip-osd-leaks) osd: do not leak Session* ref in _send_boot()
+2e549b4 unittest_erasure_code_[plugin_]isa: conditionally compile
+04037ef osd: do not leak Session* ref in _send_boot()
 d00f23b erasure-code/isa: fix signed/unsigned comparison
 8a193ab erasure-code/isa: don't use typename outside of template
 92d0bbe ECBackend: Using ROUND_UP_TO to refactor function get_recovery_chunk_size()
 e4ed2cb ReplicatedPG: For async-read, set the real result after completing read.
 6e52efa doc/release-notes: tweak quoting
-8091173 (origin/wip-round) use llrintl when converting double to micro
+8091173 use llrintl when converting double to micro
 d8b291f configure: check for 'better' yasm that can build ISA-L
 1520b47 erasure-code/isa/Makefile: add missing \, and missing headers
 e1f32d9 yasm-wrapper: fix -f ...
 1670fd6 yasm-wrapper: turn -I foo into -i foo
 6d95797 yasm-wrapper: echo original args
 b7d0017 EC: add plugin for Intel ISA-L library
-383536a (origin/wip-8982) mon/OSDMonitor: warn when cache pools do not have hit_sets configured
+383536a mon/OSDMonitor: warn when cache pools do not have hit_sets configured
 caf554b osd/ReplicatedPG: improve agent_choose_mode args
 ce4e559 vstart.sh: limit open files
 ea4996d osd/ReplicatedPG: evict blindly if there is no hit_set
@@ -4347,25 +7744,25 @@ ea4996d osd/ReplicatedPG: evict blindly if there is no hit_set
 51c1f2a FileStore: Add omap_backend to "<<" operator
 7faed14 Add superblock to KeyValueStore
 b879e74 KeyValueStore: use generic KeyValueDB::create()
-9df9d28 (origin/wip-8969) mon/OSDMonitor: fix i386 floating point rounding error
-aa9ae1f (origin/wip-8944) qa/workunits/cephtool/test_daemon.sh: verify ceph -c works with daemon
+9df9d28 mon/OSDMonitor: fix i386 floating point rounding error
+aa9ae1f qa/workunits/cephtool/test_daemon.sh: verify ceph -c works with daemon
 22d20f3 qa/workunits/cephtool/test_daemon.sh: typo
 97a8d5a qa/workunits/cephtool/test_daemon.sh: allow local ceph command
 9686312 atomic: fix read() on i386, clean up types
 6d89a99 ceph.in: Pass global args to ceph-conf for proper lookup
-0190df5 (origin/wip-8714) osd: prevent old clients from using tiered pools
-605064d (origin/wip-cli-integration) test/cli-integration/rbd: fix trailing space
-d700076 (origin/wip-double-pc) mon: s/%%/%/
-0f8929a (origin/wip-8972) cls_rgw: fix object name of objects removed on object creation
-061c8e9 (origin/wip-rbd-flush) librbd: enable rbd cache by default; writethrough until flush
-4e1405e (origin/wip-erasure-code-profile-default) erasure-code: create default profile if necessary
+0190df5 osd: prevent old clients from using tiered pools
+605064d test/cli-integration/rbd: fix trailing space
+d700076 mon: s/%%/%/
+0f8929a cls_rgw: fix object name of objects removed on object creation
+061c8e9 librbd: enable rbd cache by default; writethrough until flush
+4e1405e erasure-code: create default profile if necessary
 5f65b4d os/FileJournal: When dump journal, using correctly seq avoid misjudging joural corrupt.
 7b169a0 rocksdb backend optimization
 708b5b8 add --with-librocksdb-static configure options
 101954c get a stable rocksdb (3.0)
 6eea02c add rocksdb submodule
 cbd0043 add rocksdb support
-5d5902a (origin/wip-filestore-omap) os/KeyValueStore: rename osd_keyvaluedb -> keyvaluestore_backend
+5d5902a os/KeyValueStore: rename osd_keyvaluedb -> keyvaluestore_backend
 1a5dea7 os/FileStore: use generic KeyValueDB::create(); store omap_backend in superblock
 a2a3619 mon/MonitorDBStore: use generic KeyValueDB::create()
 86a0b9d os/KeyValueDB: make compaction interface generic
@@ -4373,7 +7770,7 @@ a2a3619 mon/MonitorDBStore: use generic KeyValueDB::create()
 e141872 config: allow unsafe setting of config values
 2f9fe02 mailmap: Red Hat names normalization
 3cfda57 doc/release-notes: v0.83
-18ea2a8 (origin/wip-8586) rgw: fix crash in swift CORS preflight request
+18ea2a8 rgw: fix crash in swift CORS preflight request
 6bb3aea mds: remove some rogue "using namespace std;"
 c283ad4 mds: handle replaying old format journals
 07665ec mds: introduce explicit DaemonState instead of int
@@ -4382,46 +7779,46 @@ c283ad4 mds: handle replaying old format journals
 e587088 mds: remove unused purge_prealloc_ino
 6be8087 mds: separate inode recovery queue from MDCache
 0d70989 python-ceph: require libcephfs.
-78ff1f0 (tag: v0.83) 0.83
+78ff1f0 0.83
 06c4736 Remove reference from mkcephfs.
 4045b2e doc/release-notes: typo
 df1bad8 doc/release-notes: v0.80.5 release notes
-e99acf9 (origin/wip-8880) OSD: add require_same_peer_inst(OpRequestRef&,OSDMap&) helper
+e99acf9 OSD: add require_same_peer_inst(OpRequestRef&,OSDMap&) helper
 e179e92 OSD: introduce require_self_aliveness(OpRequestRef&,epoch_t) function
 f36cffc unittest_crush_wrapper: fix build
 eb2f1ea OSD: use OpRequestRef& for a few require_* functions
 1526546 Remove reference from mkcephfs.
 9b03752 Fix some style and checking issue
-5773a37 (origin/wip-upstart-nfile) upstart/ceph-osd.conf: bump nofile limit up by 10x
-d3e5961 (origin/wip-undump) tools/cephfs: fuller header in dump/undump
-e183a4d (origin/wip-fsx-overlap) test_librbd_fsx: clone/flatten probabilities
+5773a37 upstart/ceph-osd.conf: bump nofile limit up by 10x
+d3e5961 tools/cephfs: fuller header in dump/undump
+e183a4d test_librbd_fsx: clone/flatten probabilities
 bb095ff test_librbd_fsx: randomize_parent_overlap
 f6d1a92 test_librbd_fsx: introduce rbd_image_has_parent()
 eb697dd librbd: make rbd_get_parent_info() accept NULL out params
 04d0526 PGMonitor: fix bug in caculating pool avail space
-b08470f (origin/wip-libs) configure.ac: link libboost_thread only with json-spirit
+b08470f configure.ac: link libboost_thread only with json-spirit
 9d23cc6 configure: don't link blkid, udev to everything
-de9cfca (origin/wip-flush-set) Only write bufferhead when it's dirty
+de9cfca Only write bufferhead when it's dirty
 1c26266 ObjectCacher: fix bh_{add,remove} dirty_or_tx_bh accounting
 727ac1d ObjectCacher: fix dirty_or_tx_bh logic in bh_set_state()
 5283cfe Wait tx state buffer in flush_set
 d858fdc Add rbdcache max dirty object option
 b8a5668 Reduce ObjectCacher flush overhead
 288908b Revert "Merge pull request #2129 from ceph/wip-librbd-oc"
-0553890 (origin/wip-8937) rgw: call processor->handle_data() again if needed
+0553890 rgw: call processor->handle_data() again if needed
 d3de69f mds: fix journal reformat failure in standbyreplay
-8fb761b (origin/wip-8931) osd/ReplicatedPG: requeue cache full waiters if no longer writeback
+8fb761b osd/ReplicatedPG: requeue cache full waiters if no longer writeback
 36aaab9 osd/ReplicatedPG: fix cache full -> not full requeueing when !active
-ba9d52e (origin/wip-librbd-snap-meta) librbd: store and retrieve snapshot metadata based on id
-c5f766b (origin/wip-8932) ceph_test_rados_api_tier: do fewer writes in HitSetWrite
+ba9d52e librbd: store and retrieve snapshot metadata based on id
+c5f766b ceph_test_rados_api_tier: do fewer writes in HitSetWrite
 f360920 common/RefCountedObject: fix use-after-free in debug print
-14cad5e (origin/wip-rgw-align) rgw: object write should not exceed part size
+14cad5e rgw: object write should not exceed part size
 fc83e19 rgw: align object chunk size with pool alignment
 1f9c732 doc: Add additional hyperlink to Cache Tiering defaults.
 4047660 doc: Update doc from user feedback.
 d1dfb9b osd: fix bad Message* defer in C_SendMap and send_map_on_destruct
 5740266 test: catch a straggler still using 'data' pool
-4eb18dd (origin/wip-journal-header) os/FileJournal: Update the journal header when closing journal
+4eb18dd os/FileJournal: Update the journal header when closing journal
 63c1711 msg/SimpleMessenger: drop local_conneciton priv link on shutdwon
 2545e80 librbd: fix crash using clone of flattened image
 4fe0792 doc: Updated mon doc per feedback. Fixed hyperlinks.
@@ -4456,12 +7853,12 @@ afb4c37 msg: factor policy handling out of SimpleMessenger
 6597c20 mon: clean up ref counting for forwarded messages' sessions
 29e04c8 msgr: move PipeConnection out of Connection.h; make anon
 ea14d7b Refactor Messenger class family.
-63abf11 (origin/wip-8882) osd/ReplicatedPG: observe INCOMPLETE_CLONES in is_present_clone()
+63abf11 osd/ReplicatedPG: observe INCOMPLETE_CLONES in is_present_clone()
 4136471 osd/ReplicatedPG: observed INCOMPLETE_CLONES when doing clone subsets
 956f287 osd/ReplicatedPG: do not complain about missing clones when INCOMPLETE_CLONES is set
 54bf055 osd/osd_types: add pg_pool_t FLAG_COMPLETE_CLONES
 67d13d7 mon/OSDMonitor: improve no-op cache_mode set check
-d4faf74 (origin/wip-8701) ceph_test_objectstore: clean up on finish of MoveRename
+d4faf74 ceph_test_objectstore: clean up on finish of MoveRename
 3ec9a42 os/LFNIndex: use FDCloser for fsync_dir
 6fb3260 os/LFNIndex: only consider alt xattr if nlink > 1
 ec36f0a os/LFNIndex: remove alt xattr after unlink
@@ -4476,36 +7873,36 @@ c57811f Fix/add missing dependencies:
 dae6ecb ceph.spec.in: split out ceph-common as in Debian
 a05a0da common/random_cache: fix typo
 5efdc62 common/RandomCache: Fix inconsistence between contents and count
-356af4b (origin/wip-8889) osd/ReplicatedPG: debug obc locks
+356af4b osd/ReplicatedPG: debug obc locks
 6fe2782 osd/ReplicatedPG: greedily take write_lock for copyfrom finish, snapdir
 0962650 osd: allow greedy get_write() for ObjectContext locks
 ccd0eec OSD: introduce require_up_osd_peer() function for gating replica ops
-253ca2b (origin/wip-8897) os: make name/attr max methods unsigned
+253ca2b os: make name/attr max methods unsigned
 daac750 os/KeyValueStore: make get_max_object_name_length() sane
 e311a08 uncomment cleanup command
 c264774 init: add systemd service files
 d87e5b9 powerdns: RADOS Gateway backend for bucket directioning
-b551ae2 (origin/wip-8851) mon: AuthMonitor: always encode full regardless of keyserver having keys
+b551ae2 mon: AuthMonitor: always encode full regardless of keyserver having keys
 1518fa2 osd: init local_connection for fast_dispatch in _send_boot()
-34b0efd (origin/wip-librbd-oc) ObjectCacher: fix bh_{add,remove} dirty_or_tx_bh accounting
+34b0efd ObjectCacher: fix bh_{add,remove} dirty_or_tx_bh accounting
 8a05f1b ObjectCacher: fix dirty_or_tx_bh logic in bh_set_state()
 d358741 Wait tx state buffer in flush_set
 3c7229a Add rbdcache max dirty object option
 5cb4b00 Reduce ObjectCacher flush overhead
 9061988 osd: init local_connection for fast_dispatch in _send_boot()
 b6f3aff Fix mismatched tags (struct vs. class) inconsistency
-2aa3edc (origin/wip-8174) os/FileStore: fix max object name limit
+2aa3edc os/FileStore: fix max object name limit
 f4bffec ceph_test_objectstore: test memstore
 6f312b0 os/MemStore: copy attrs on clone
 8dd6b8f os/MemStore: fix wrlock ordering checks
 a2594a5 osd/MemStore: handle collection_move_rename within the same collection
-3467110 (origin/wip-dencoder) ceph-dencoder: don't link librgw.la (and rados, etc.)
+3467110 ceph-dencoder: don't link librgw.la (and rados, etc.)
 b1a641f rgw: move a bunch of stuff into rgw_dencoder
 1c17077 libosd_types, libos_types, libmon_types
 58cc894 Revert "ceph.spec: move ceph-dencoder to ceph from ceph-common"
 f181f78 Revert "debian: move ceph-dencoder to ceph from ceph-common"
 ad4a4e1 unittest_osdmap: revert a few broken changes
-d7209c1 (origin/wip-8858) rgw: dump prefix unconditionally
+d7209c1 rgw: dump prefix unconditionally
 dc417e4 rgw: list extra objects to set truncation flag correctly
 82d2d61 rgw: account common prefixes for MaxKeys in bucket listing
 924686f rgw: add NextMarker param for bucket listing
@@ -4516,58 +7913,58 @@ e6cf618 rgw: improve delmited listing of bucket
 bd3367e osd: add config for osd_max_attr_name_len = 100
 7c0b2a0 os: add ObjectStore::get_max_attr_name_length()
 7e0aca1 osd: add config for osd_max_object_name_len = 2048 (was hard-coded at 4096)
-e60dd0f (origin/wip-8811) osdc: refactor JOURNAL_FORMAT_* constants to enum
+e60dd0f osdc: refactor JOURNAL_FORMAT_* constants to enum
 8eef89e doc: fix example s/inspect/journal inspect/
 5438500 mds: fix journal reformat failure in standbyreplay
 ed3bc4c osdc/Journaler: validate header on load and save
 18ca6b6 test: add a missing semicolon
-0cd0268 (origin/wip-vstart-existing-mds) qa: generalise cephtool for vstart+MDS
+0cd0268 qa: generalise cephtool for vstart+MDS
 bb5a574 mon: carry last_failure_osd_epoch across `fs new`
 b936a27 mon/MDSMonitor: fix msg on idempotent `fs rm`
 06a8f7b configure: do not link leveldb with everything
 0193d3a AUTHORS
-14a9ca6 (origin/wip-logrotate) logrotate.conf: fix osd log rotation under upstart
+14a9ca6 logrotate.conf: fix osd log rotation under upstart
 7b342ef doc: Add Note about European mirror in Quick Start
-0f11aae (origin/wip-8849) remove suse service restarts
+0f11aae remove suse service restarts
 e3a5756 remove ceph restarts on upgrades for RPMs
 4d6899c qa/workunits/cephtool/test.sh: fix erasure_code_profile get test
 ce9f12d qa/workunits/cephtool/test.sh: test osd pool get erasure_code_profile
 e8ebcb7 mon: OSDMonitor: add "osd pool get <pool> erasure_code_profile" command
 5ccfd37 vstart.sh: default to 3 osds
-5f6b11a (origin/wip-8857) mon/MDSMonitor: make legacy 'newfs' command idempotent
+5f6b11a mon/MDSMonitor: make legacy 'newfs' command idempotent
 b89ab5f rgw: don't try to wait for pending if list is empty
-19e68ac (origin/wip-rbd-defaults) rbd: respect rbd_default_* parameters
+19e68ac rbd: respect rbd_default_* parameters
 e891a93 rbd: remove accidental repeated option
 0f87c55 librbd: use order-agnostic default stripe parameters
-f9f2417 (origin/wip-8846) rgw: don't try to wait for pending if list is empty
-420f0a4 (origin/wip-8813) set the default log level to WARNING
+f9f2417 rgw: don't try to wait for pending if list is empty
+420f0a4 set the default log level to WARNING
 3e0d980 init-ceph: wrap daemon startup with systemd-run when running under systemd
 99dfaf7 doc/release-notes: v0.80.4
 80ea606 Fix size of network protocol intergers.
 2f43cef doc: Extended discussion for building docs on CentOS / RHEL.
 124f97f doc: Added a script to build docs on CentOS / RHEL.
-6c48d07 (origin/wip-set_layout) doc: add cephfs layout documentation
+6c48d07 doc: add cephfs layout documentation
 af740ec cephfs: pool safety & print deprecation warning
 d915ceb update hadoop-wordcount test to be able to run on hadoop 2.x. The hadoop and mapreduce library are no longer hard coded so they can be specified to point to the right path. The relative paths hdfs are changed to absolute paths. A sample command to run the test on hadoop 2.x is TESTDIR=/home/test HADOOP_HOME=/usr/lib/hadoop HADOOP_MR_HOME=/usr/lib/hadoop-mapreduce sh workunits/hadoop-wordcount/test.sh starting hadoop-wordcount test
 cceab2b qa: retire kclient-specific layout test
 95f5a44 ceph.spec: move ceph-dencoder to ceph from ceph-common
 b37e3bd debian: move ceph-dencoder to ceph from ceph-common
-01cd3cd (origin/wip-8830) XfsFileStoreBackend: default to disabling extsize on xfs
+01cd3cd XfsFileStoreBackend: default to disabling extsize on xfs
 fc597e5 doc/release-notes: some additional warnings and recommendations against adjusting tunables
-e17e9d8 (origin/wip-8823) ceph_test_rados_api_tier: fix [EC] HitSet{Read,Write,Trim} tests
-a4ed336 (origin/wip-mds-session-asok-squash) mds: add `session ls` and `session evict` to asok
+e17e9d8 ceph_test_rados_api_tier: fix [EC] HitSet{Read,Write,Trim} tests
+a4ed336 mds: add `session ls` and `session evict` to asok
 0e0be07 client: include ID in mds_sessions asok
 52a2bc5 mon: remove unused attribute notified_global_id
 b120a48 common/admin_socket: remove dead code
 bb47ff3 osd: fix confusing debug output for op_applied
 586d3ee doc: Fixes a broken link on the rados deploy osd page.
 447f849 doc/release-notes: v0.80.3
-29f20b7 (origin/wip-test-post-file) qa/workunits/post-file.sh
+29f20b7 qa/workunits/post-file.sh
 c9e1e82 rbdmap: per-device post-map/pre-unmap hooks
-c93da05 (origin/wip-8815) osd/osd_types: be pedantic about encoding last_force_op_resend without feature bit
+c93da05 osd/osd_types: be pedantic about encoding last_force_op_resend without feature bit
 712d5d1 osd/osd_types: remove useless encode_compat local var for pg_pool_t::encode
-50e93c2 (origin/wip-8696) qa/workunits: cephtool: adjust pool name where missing as it has changed
-cf94cf3 (origin/wip-dump-new-crush) crush: include CRUSH_V3, v2/v3 rules checks in dump_tunables()
+50e93c2 qa/workunits: cephtool: adjust pool name where missing as it has changed
+cf94cf3 crush: include CRUSH_V3, v2/v3 rules checks in dump_tunables()
 daadff4 doc: minor format fix for radosgw admin docs
 b844ec9 rbdmap: per-device mount (Closes: #8538)
 02683ac rbd.cc: Check io-size avoid floating point exception.
@@ -4580,13 +7977,13 @@ df59449 qa/workunits: cephtool: split get/set on tier pools from get/set tests
 64bdf6c osd: pg_pool_t: clear tunables on clear_tier()
 f131dfb mon: OSDMonitor: limit tier-specific pool set/get on non-tier pools
 026b127 doc/changelog/v0.80.2: include tag
-59c00e5 (origin/wip-kinetic-os) os: add prototype KineticStore
+59c00e5 os: add prototype KineticStore
 74f5e5e PG::op_must_wait_for_map: pass the epoch rather than the map
 98f92d8 doc: Added CentOS/RHEL install for git.
 115c078 rgw: modelines
 c4afaf9 rgw: fix RGWObjManifestRule decoder
 0839e2a doc: Added keyring location note to resolve pull abandoned pull request #1946.
-4692257 (origin/wip-nuke-dlist) nuke dlist
+4692257 nuke dlist
 a3e5c6d Add random_cache.hpp to Makefile.am
 f51f162 test: fix make_pair() for c++11
 79e3761 Remove some not-resolving make_pair() invocations.
@@ -4608,37 +8005,37 @@ bb881e5 doc: fix a few typos in architecture page
 fcbdd2f doc: Fix a typo in the rbd man page
 ef117fe Use submit_transaction_sync to make change durable
 6ff5fed Add random cache and replace SharedLRU in KeyValueStore
-c0dc245 (origin/wip-7891) osd: cancel agent_timer events on shutdown
+c0dc245 osd: cancel agent_timer events on shutdown
 ef40737 osd: s/applying repop/canceling repop/
 cafceae osd: clear PGBackend state on shutdown
 e299357 osd: separate cleanup from PGBackend::on_change()
 b16b64e Support for READFORWARD in the caching tier
-b927c0d (origin/wip-8523) qa/workunits: cephtool: test for 'osd pool {get,set}-quota'
+b927c0d qa/workunits: cephtool: test for 'osd pool {get,set}-quota'
 714a9bb mon: OSDMonitor: add 'osd pool get-quota' command
-c92feeb (origin/wip-8727) messages: MForward: fix compat version
+c92feeb messages: MForward: fix compat version
 0bf4f65 osd: clear sessions_waiting_on_map on shutdown
 aefbac5 osd: fix session leak when waiting on map
 17ad083 osd: clear Sessions for loopback Connections on shutdown
 231fe1b Revert "OSD: move waiting_for_pg into Session"
 aa1be2e OSD: fix debug logging output
-b700963 (origin/wip-8306-rebase) ceph.spec.in: add bash completion file for radosgw-admin
+b700963 ceph.spec.in: add bash completion file for radosgw-admin
 235e4c7 ceph.spec.in: rhel7-related changes:
 7cf8132 Fix/add missing dependencies:
 ec8af52 ceph.spec.in: whitespace fixes
 e131b9d ceph.spec.in: split out ceph-common as in Debian
 08fa16b common: seq_read_bench argument order changed The argument order for seq_read_bench in src/common/obj_bencher.h has been changed to match the argument order in obj_bencher.cc
-cca5841 (origin/wip-8751) test: generalise default_pools in test_rados
-a7a631d (origin/wip-8754) tests: don't depend on 'data' pool in rbd test
+cca5841 test: generalise default_pools in test_rados
+a7a631d tests: don't depend on 'data' pool in rbd test
 cf5f535 doc/release-notes: clarify CRUSH notes about tunables
 d84d720 decrement WBThrottle perfcounters in clear_object
-16df4c3 (origin/wip-8745) mds: use client-provided time stamp for user-visible file metadata
+16df4c3 mds: use client-provided time stamp for user-visible file metadata
 73b2928 Remove exclusive lock on GenericObjectMap
 d104979 Add Header cache to KeyValueStore
 c0806bb doc: mention kernel support for rbd format 2
 c7937ff doc: Fix a typo regarding requiretty for RHEL based platforms
-54af810 (origin/wip-8738-next) mon: check changes to the whole CRUSH map and to tunables against cluster features
+54af810 mon: check changes to the whole CRUSH map and to tunables against cluster features
 2280c0e OSDMonitor: fix quorum_features comparison in check_cluster_features
-c0ba58c (origin/wip-refs) msg: debug refs on incoming Messages
+c0ba58c msg: debug refs on incoming Messages
 c0dcf3b common/RefCountedObject: make nref, cct private
 b0da92b msg/Message: use RefCountedObject ctor to set initial ref count
 e621856 msg/Pipe: debug Pipe refs
@@ -4652,34 +8049,34 @@ f7086d3 Automagically setup submodules on first run.
 0c7c722 ceph_argparse_flag has no regular 3rd parameter.
 909850e [werror] Fix c++11-compat-reserved-user-defined-literal
 fde99e6 OSD: adjust share_map() to handle the case that the osd is down
-ddc04c8 (origin/wip-8670) mon: OSDMonitor: 'osd pool' - if we can set it, we must be able to get it
+ddc04c8 mon: OSDMonitor: 'osd pool' - if we can set it, we must be able to get it
 0392ddb ReplicatedPG: Removed the redundant register_snapset_context call
 2f089d8 OpTracker: The optracker enabled/disabled check is added
 63be0f2 OpTracker: use mark_event rather than _mark_event
-bb3e1c9 (origin/wip-8728) qa/workunits/rest/test.py: make osd create test idempotent
-7e1deb6 (origin/wip-async-log) mds: defer encoding/submitting log events to separate thread
+bb3e1c9 qa/workunits/rest/test.py: make osd create test idempotent
+7e1deb6 mds: defer encoding/submitting log events to separate thread
 44199d6 mds: use mutex to protect log segment list
 6d8ccdd mds: add thread to encode/submit log events
-70c0723 (origin/wip-osd-dumpres) osd: add dump_reservations asok command
+70c0723 osd: add dump_reservations asok command
 6483710 common/AsyncReserver: add dump()
-9ce5ff9 (origin/wip-8692) mon: clear osd request queue latency info on down or up
+9ce5ff9 mon: clear osd request queue latency info on down or up
 f8c88a4 OSD: wake_pg_waiters after dropping pg lock
-e2b151d (origin/wip-fs-cmds-oops) mds: Update default FS name
+e2b151d mds: Update default FS name
 f62f7f5 qa: update cephtool EC pools test to respect IDs
 710561c mon/MDSMonitor: EC check in 'fs new' like newfs
 44eb259 qa: add a check for crash_replay_interval autoset
 c0ffa01 mon: Set crash_replay_interval automatically
 82d3fcc qa: Update data pool ID for vxattrs test
 917ef15 test: use 0U with gtest to avoid spurious warnings
-522174b (origin/wip-vstart-wrapped) qa: support running under non privileged user
+522174b qa: support running under non privileged user
 8697d6a OSD: await_reserved_maps() prior to calling mark_down
-6f97206 (origin/wip-osd-map-cache-size) osd: allow osd map cache size to be adjusted at runtime
+6f97206 osd: allow osd map cache size to be adjusted at runtime
 bcc09f9 qa/workunits/cephtool/test.sh: sudo ceph daemon
-959f2b2 (origin/wip-fix-pglog-unittest) PGLog: fix clear() to avoid the IndexLog::zero() asserts
-e0d3b78 (origin/wip-8699) rgw: fix uninit ofs in RGWObjManifect::obj_iterator
+959f2b2 PGLog: fix clear() to avoid the IndexLog::zero() asserts
+e0d3b78 rgw: fix uninit ofs in RGWObjManifect::obj_iterator
 73b929b osd: improve tests for configuration updates
-2dec8a8 (origin/wip-8542) qa/workunits/suites/fsx.sh: don't use zero range
-83f1906 (origin/wip-fs-cmds) mon/MDSMonitor: log warning while MDS up but no FS
+2dec8a8 qa/workunits/suites/fsx.sh: don't use zero range
+83f1906 mon/MDSMonitor: log warning while MDS up but no FS
 b7f09c2 mon/MDSMonitor: fix incorrect comment
 fc0f8bd mon/MDSMonitor: s/enabled/get_enabled()/
 641b419 mds: Handle setting 'enabled' in upgrades
@@ -4697,14 +8094,14 @@ f6d029d mon: warn in newfs if crash_replay_interval=0
 13305d5 mds: no initial filesystem
 8f7900a mds: add 'enabled' flag to MDSMap
 ef7d1bc doc: RPM instructions correction
-fcdf273 (origin/wip-port-fixes) mds: avoid comparing MutationRef with 0
-e473790 (origin/wip-krbd-settle) krbd: rework the unmap retry loop
+fcdf273 mds: avoid comparing MutationRef with 0
+e473790 krbd: rework the unmap retry loop
 59d18ac [RGW, memory leak] Memory leak in RGW has been fixed: deletion of allocated pointer to pointer to Log object has been added to "on_exit" handler.
 8e5c921 [RGW, memory leak] Memory leak in RGW GC (losing pointer during allocating Ceph-context) has been fixed.
 8706b74 [RGW, memory leaks] Memory leak in RGW initialization (Inserting new connection into connections map w/o check) has been fixed.
-9bcc19d (origin/wip-map-unmap) map-unmap.sh: fail if 'rbd rm' fails
+9bcc19d map-unmap.sh: fail if 'rbd rm' fails
 16b14ea map-unmap.sh: drop the get_id() logic
-abdb168 (origin/wip-fsx-random-ctx) test_librbd_fsx: use private RNG context
+abdb168 test_librbd_fsx: use private RNG context
 9517cea os/FileStore: put SUPER usage in ifdef __linux__
 b066e16 common: move #include syscall into ifndef DARWIN
 2db500b client: handle missing O_RSYNC constant
@@ -4733,8 +8130,8 @@ a17462c mds: add get_metablob() to log events
 1c93c61 MOSDOp: The functions are returned by const ref and parameters passed by ref
 ad81a98 Revert "ceph-disk: Enable creating multiple osds per dev"
 e02957d test: use (unsigned)0 with gtest to avoid spurious warnings
-f8df9bd (origin/wip-da-SCA-20140623) scratchtool.c: cleanup do_rados_getxattrs()
-4e9c2c1 (origin/wip-osd-ints) osd: fix pg_stat_t int -> int32_t
+f8df9bd scratchtool.c: cleanup do_rados_getxattrs()
+4e9c2c1 osd: fix pg_stat_t int -> int32_t
 238b1a3 osd: fix pg_shard_t int -> int32_t
 709f0c4 osd: fix pg_interval_t int -> int32_t
 a5f9a09 Fixed build on 32 bits platforms
@@ -4744,7 +8141,7 @@ f5a72b7 Added a couple of ASSERTS for avoiding coverity to complain about the po
 605d180 mount.ceph.c: fix strdup related memory leak
 7f7e56c scratchtool.c: fix resource leak and error handling
 bdb1346 scratchtool.c: fix resource leak
-48e38ac (origin/wip-refactor-cephtool-test) qa/workunits: cephtool: fix 'osd bench' test
+48e38ac qa/workunits: cephtool: fix 'osd bench' test
 802290d osd: OSD: better explanation on 'max_count' calculation for 'osd bench'
 4b0809a qa/workunits: cephtool: only run heap profiler test if tcmalloc enabled
 5c4616e qa/workunits: cephtool: set +e for the tcmalloc tests
@@ -4756,35 +8153,35 @@ f418408 qa/workunits: cephtool: cleanup state after erasure-code-profile test
 3d14a96 qa/workunits: cephtool: split into properly indented functions
 04658b7 qa/workunits: cephtool: move test line to where it's more appropriate
 db6cc13 qa/workunits: cephtool: split into functions
-3953053 (origin/wip-da-fix-make-check) test/ceph-disk.sh: fix for SUSE
+3953053 test/ceph-disk.sh: fix for SUSE
 be70c1f osdmaptool/test-map-pgs.t: fix escaping to fix run
 dc1a4df Revert "Give meaningful error when submodules are not checked out"
 9695535 Make <poolname> in "ceph osd tier --help" clearer.
 76361b8 mon: simplify output
-385fd6c (origin/wip-disable-static) do_autogen.sh: --disable-static
-14085f4 (tag: v0.82) 0.82
+385fd6c do_autogen.sh: --disable-static
+14085f4 0.82
 152bbd6 osd: workaround race condition in tests
-cb740b3 (origin/wip-mon-perf) mon: shut down perfcounters last
+cb740b3 mon: shut down perfcounters last
 524700f doc: Fix malformed parameters in librados.h
 56cad1a libcephfs/test.cc: fix use after free
 a5c704b RadosStriperImpl.cc: catch exception by reference
 6d79863 rgw/rgw_rados.h: use static_cast instead of c-style cast
-0b3a398 (origin/wip-8654) osd/OSD.cc: parse lsb release data via lsb_release
-d7350a3 (origin/wip-fix-rados-tool) rados.cc: fix pool alignment check
-2b007c2 (origin/wip-8624) mon: MDSMonitor: print pool name along with id during 'newfs' errors
+0b3a398 osd/OSD.cc: parse lsb release data via lsb_release
+d7350a3 rados.cc: fix pool alignment check
+2b007c2 mon: MDSMonitor: print pool name along with id during 'newfs' errors
 378b5ad qa/workunit: cephtool: test mds newfs and add_data_pool with ec pools
 d6f6813 mon: MDSMonitor: do not allow ec pools to be used for data or metadata
-20a1664 (origin/wip-mon-sanity-checks) common: LogClient: output to derr (i.e., dout(-1)) on CLOG_ERROR
+20a1664 common: LogClient: output to derr (i.e., dout(-1)) on CLOG_ERROR
 9804360 mon: Monitor: observe conf changes and report on unsage option values
 ec73888 mon: Monitor: sanitize options at start
 87f9dba Give meaningful error when submodules are not checked out
 58212b1 osd: Only normalize extent if op uses extent
-4225e2f (origin/wip-leveldb-stress) osd: remove OSD-specific leveldb options
+4225e2f osd: remove OSD-specific leveldb options
 52b147c ceph-mon: override 'leveldb_*' config options for the monitor
 d42d19d mon: MonitorDBStore: remove mon-specific leveldb options.
 9844885 mon: DataHealthService: s/mon_leveldb_size_warn/mon_data_size_warn/
-57c5d05 (origin/wip-8610) osd: ignore CRUSH_ITEM_NONE in compat_must_dispatch_immediately
-d9073f4 (origin/wip-disk-ioprio) osd: allow io priority to be set for the disk_tp
+57c5d05 osd: ignore CRUSH_ITEM_NONE in compat_must_dispatch_immediately
+d9073f4 osd: allow io priority to be set for the disk_tp
 dd6badc common/WorkQueue: allow io priority to be set for wq
 1b87410 common/Thread: allow io priority to be set for a Thread
 a2b4911 common/io_priority: wrap ioprio_set() and gettid()
@@ -4795,7 +8192,7 @@ da03e9e MDCache.h: init 'umaster::safe' in constructor
 3e93d4a osd: tests for osd bench
 74be320 Use sized integer.
 66a5f3b doc: Fixed Typo in pools documentation - replaced '-' with '_' in example set-quota comands.
-e189a66 (origin/wip-8603) log the command that is being run with subprocess
+e189a66 log the command that is being run with subprocess
 78cbac4 mailmap: Dmitry Smirnov name normalization
 efefbfd mailmap: koleosfuscus affiliation
 1cdea98 mailmap: Walter Huf name normalization
@@ -4811,9 +8208,9 @@ a58fbf7 mailmap: Colin Mattson affliation
 22c028d mailmap: Red Hat acquires InkTank
 c270172 mailmap: Sebastien Ponce affiliation
 39a4b78 mon: test that pools used in tiers cannot be removed
-1de9071 (origin/wip-misc-fixes) osd/osd_types.cc: dump correct pg_log_entry_t member variable
+1de9071 osd/osd_types.cc: dump correct pg_log_entry_t member variable
 363496a osd: use appropriate json types instead of stream when dumping info
-97772c2 (origin/wip-tiermsg) mon: name instead of id in "has tiers" message
+97772c2 mon: name instead of id in "has tiers" message
 4d5469a osd: ECMsgTypes: dump json bools instead of strings
 3f0ea95 osd: have 'tid' dumped as a json unsigned int where appropriate
 debaf61 mon: dump 'epoch' as json unsigned instead of int
@@ -4822,7 +8219,7 @@ debaf61 mon: dump 'epoch' as json unsigned instead of int
 b7ff393 mon: OSDMonitor: dump 'acting' as json array instead of string
 78f94a9 mailmap: Ailing Zhang affiliation
 d2e852e doc: Fixed an incorrect flag in radosgw admin docs.
-741ad3f (origin/fix_ut) autotools: avoid check_SCRIPTS duplication
+741ad3f autotools: avoid check_SCRIPTS duplication
 c0d78c6 Fix dist package run unit test failed.
 b3ace76 tests: prevent gitbuilder trigger in test-erasure-code.sh
 3a9c0fc test: fix -Wsign-compare warnings
@@ -4831,22 +8228,22 @@ c35ceef ReplicatedPG: 'ajusted' typo
 de2c085 rgw-admin: Fix the 'show log' command
 304b08a enforce rados put aligment
 8d9201f tests: remove spurious and harmless find command
-cdca7b7 (origin/wip-osd-stats) osd: move osd_stat into OSDService
+cdca7b7 osd: move osd_stat into OSDService
 4afffb4 osd: fix filestore perf stats update
-0985ae7 (origin/wip-backfill-priority) osd: prioritize backfill based on *how* degraded
+0985ae7 osd: prioritize backfill based on *how* degraded
 d20da8d osd: add osd_min_recovery_priority tunable
 b65ceb6 common/AsyncReserver: add a min_priority knob
 0e7a979 osd: fix word sizes in MBackfillReserve
-c4e8451 (origin/wip-scrub-sleep) osd: introduce simple sleep during scrub
+c4e8451 osd: introduce simple sleep during scrub
 7b580a2 mon: Monitor: complex code deduplication algorithm applied to tick()
 0ed1fe6 mon: Monitor: rework tick() so we don't calculate the same thing over and again
-ef8a128 (origin/wip-6703) support dmcrypt partitions when activating
+ef8a128 support dmcrypt partitions when activating
 7dc93a9   Fix EINVAL err when use "ceph tell osd.* bench"
-3ed7f2d (origin/wip-8593) mon: ensure HealthService warning(s) include a summary
+3ed7f2d mon: ensure HealthService warning(s) include a summary
 82e47db mon: refactor check_health()
 98883f6 mon: fix typos, punctuation for mon disk space warning(s)
 55a9778 mon/OSDMonitor: make down osd count sensible
-c5b5ed6 (origin/wip-ec-hitset) ceph_test_rados_api_tier: disable LibRadosTierECPP::HitSetWrite
+c5b5ed6 ceph_test_rados_api_tier: disable LibRadosTierECPP::HitSetWrite
 0bb0095 Revert "erasure-code: create default profile if necessary"
 f53bed1 mon/OSDMonitor: fix build error
 1c72465 osd: verify osd config sanity checks
@@ -4854,43 +8251,43 @@ f53bed1 mon/OSDMonitor: fix build error
 d93e74e common: Enforces the methods lru_pin() and lru_unpin()
 d48ed68 common: Fixes issue with lru_clear() + add new test
 62aa5c5 common: Adds simple tests to verify good behavior
-64f6232 (origin/wip-ceph-isatty) ceph: output prompt only if stdin is tty
-8d1cb8b (origin/wip-vstart-conf) vstart.sh: echo CEPH_{CONF,KEYRING} exports if necessary
+64f6232 ceph: output prompt only if stdin is tty
+8d1cb8b vstart.sh: echo CEPH_{CONF,KEYRING} exports if necessary
 2eb1f55 vstart.sh: rename conf variable to conf_fn
 18f5807 Make KeyValueStore not use expected_write_size
 360de6a erasure-code: create default profile if necessary
-f3ec7d0 (origin/wip-osd-configs) osd: add sanity check/warning on a few key configs
+f3ec7d0 osd: add sanity check/warning on a few key configs
 4786a48 osd: remove non const get_erasure_code_profile
 a1c13c5 tests: prevent kill race condition
 5c1f9aa osd: improve osd pool create error message readability
 6bf8183 erasure-code: consistent argument parsing for profiles
 3c63811 erasure-code: OSDMonitor::get_erasure_code is a const
 ff2eb23 erasure-code: pool create must not create profiles
-0d63cf2 (origin/wip-import-purge) qa: extend cephfs_journal_tool_smoke
+0d63cf2 qa: extend cephfs_journal_tool_smoke
 acd6ebb qa: set +x on cephfs_journal_tool_smoke
 ee487b4 tools/cephfs: Purge trailing objects during import
 ac05799 tools/cephfs: error handling in journal_export
 381163c tools/cephfs: Clean up waits in Dumper
 3fe1699 osd/OSDMap: do not require ERASURE_CODE feature of clients
 250677c osd/OSDMap: make get_features() take an entity type
-e29beff (origin/wip-8071) erasure-code: remove jerasure internals dependencies
-e720314 (origin/wip-doc-os-recommendations) doc: Updated the OS Recommendations for Firefly.
+e29beff erasure-code: remove jerasure internals dependencies
+e720314 doc: Updated the OS Recommendations for Firefly.
 2e3302c doc: Updated the example configuration.
 5a31df2 doc: Updated doc for more recent versions.
 2eab1c1 Update RBD doc for OpenStack
-a290d34 (origin/wip-fsx-sizeof) test_librbd_fsx: fix sign-compare gcc warning
+a290d34 test_librbd_fsx: fix sign-compare gcc warning
 40c48bc qa: add script to test krbd setting ro/rw ioctl
-b2542f8 (origin/wip-8585) rgw: set a default data extra pool name
+b2542f8 rgw: set a default data extra pool name
 94c8f70 doc: Made mention of "incomplete" status.
 29c33f0 qa: add an fsx run which turns on kernel debugging
-f978722 (origin/wip-xattr-spillout) FileStore: remove the user_only param from _fgetattrs
+f978722 FileStore: remove the user_only param from _fgetattrs
 bb4e3a9 FileStore: remove user_only options from getattrs through the ObjectStore stack
 fbe6009 FileStore: do not use user_only in collection_getattrs
 7267a37 FileStore: remove dead code
 e3b995e FileStore: set XATTR_NO_SPILL_OUT when creating new files.
 239476a FileStore: make _clone() copy spill out marker
 af8052b Add xattr test to ceph_test_objectstore
-d9fac9c (origin/wip-7774-3) rgw: chain to multiple cache entries in one call
+d9fac9c rgw: chain to multiple cache entries in one call
 c616358 rgw: chain binfo_cache to multiple cache entries
 7e81185 rgw: bucket info uses cache chaining
 a2f6709 rgw: user info uses cache chaining
@@ -4912,8 +8309,8 @@ a4923f5 fix compilation warnings
 737c13e Remove SequencerPosition from KeyValueStore
 ef06515 doc: fix typo in erasure coding section
 1080e7a Add upper limit to the write size of set_alloc_hint in KeyValueStore
-6a7e201 (origin/wip-8554) init-ceph: continue after failure doing osd data mount
-1f99cda (origin/wip-mon) mon: gather perf stats on elections
+6a7e201 init-ceph: continue after failure doing osd data mount
+1f99cda mon: gather perf stats on elections
 8f36d96 mon: gather perf stats on session adds, removes, counts
 ecda2fe OSD: move waiting_for_pg into Session
 1f40c35 Add set_alloc_hint test to ceph_test_objectstore
@@ -4922,14 +8319,14 @@ b0c66a7 doc: Fixes spelling errors on README
 910d73c Added RPM and debian packaging for libradosstriper, creating a new package called libradosstriper1
 fa01ca6 Added unit test suite for the Rados striping API.
 d160ce2 Implementation of the radosstriper interface.
-a6c34e4 (origin/wip-flag-known-redirs) osdc/Objecter: mark all ops as known-if-redirected
-bc3b30e (origin/wip-set-extsize-fix) XfsFileStoreBackend: call ioctl(XFS_IOC_FSSETXATTR) less often
+a6c34e4 osdc/Objecter: mark all ops as known-if-redirected
+bc3b30e XfsFileStoreBackend: call ioctl(XFS_IOC_FSSETXATTR) less often
 750b1db XfsFileStoreBackend: nuke redundant goto in set_extsize()
-524a155 (origin/wip-rgw-manifest-iter) rgw: reduce calls to rgw_obj.set_obj()
+524a155 rgw: reduce calls to rgw_obj.set_obj()
 e31d3fe doc: Descrption => Description Correct spelling error.
 0ca43d8 doc: Use write_full for C++ example Latest version of librados uses write_full when writing entire object.
 0bd6f67 OSD::calc_priors_during: handle CRUSH_ITEM_NONE correctly
-2081c99 (origin/wip-i386-atomic) include/atomic: make 32-bit atomic64_t unsigned
+2081c99 include/atomic: make 32-bit atomic64_t unsigned
 64e99d8 ceph-objectstore-test: fix warning in collect_metadata test
 e1ad0bf Added a striper interface on top of rados called radosstriper.
 7ceeb9f Completed librados documentation of rados_write_op_cmpxattr and rados_read_op_cmpxattr concerning their error reporting
@@ -4937,11 +8334,11 @@ e1ad0bf Added a striper interface on top of rados called radosstriper.
 8c12491 Fixed usage of rhel_version in spec file : it should have been rhel
 f92d7bb Fixed missing include of assert.h in RWLock interface
 b9a35b3 Extracted RadosXattrIter from librados.cc into independent .h anc .cc files. This makes this interface usable by clients of librados.
-aede832 (origin/wip-os-rename) os: rename get_*() -> decode_*()
-5bb078f (origin/wip-sahid-dist) Populate ChangeLog when distribute
+aede832 os: rename get_*() -> decode_*()
+5bb078f Populate ChangeLog when distribute
 b22cc85 Populate AUTHORS when distribute
 d76936b OSD::calc_priors_during: fix confusing for loop bracing (cosmetic)
-59d727d (origin/wip-da-SCA-20140604) kv_flat_btree_async.cc: remove consecutive break after return
+59d727d kv_flat_btree_async.cc: remove consecutive break after return
 82f5df4 JournalScanner.cc: catch exception by reference
 53533ee KeyValueStore.cc: silence gcc -Wunused-variable
 e24213e MemStore.cc: silence gcc -Wunused-variable
@@ -4950,21 +8347,21 @@ a325e3e Revert "Remove unused variables in KeyValueStore.cc"
 cac902e os/KeyValueStore.cc: fix possible null pointer deref warning
 3ee3e66 librbd/internal.cc: check earlier for null pointer
 f17a963 test/librbd/fsx.c: fix gcc warning
-f31e4c8 (origin/wip-da-update-libs3) libs3: update to latest git master of ceph/libs3
+f31e4c8 libs3: update to latest git master of ceph/libs3
 18c07ec common/addr_parsing.c: fix realloc memory leak
 5f86652 daemon_config.cc: add some more asserts
-703d0eb (origin/wip-8452-2) rgw: set meta object in extra flag when initializing it
+703d0eb rgw: set meta object in extra flag when initializing it
 23b657c Remove unused variables in KeyValueStore.cc
 307ba48 Remove unused variables in MemStore.cc
-5185a36 (origin/wip-autotools-dummy) automake: add dummy.cc to fix 'make tags'
-35509d274 bloom_filter, add test to validate assignement operator
+5185a36 automake: add dummy.cc to fix 'make tags'
+35509d2 bloom_filter, add test to validate assignement operator
 c50f85e bloom_filter, remove unecessary operators
 90cc6dd bloom_filter, add assertion to test validate element_count()
 c323c5b Fix keyvaluestore fiemap bug
-3ec32a6 (origin/wip-8447) librados: simplify/fix rados_pool_list bounds checks
+3ec32a6 librados: simplify/fix rados_pool_list bounds checks
 5569d40 documentation: add osd erasure-code-profile {set,rm,get,ls}
 8ff4edd documentation: update osd pool create erasure
-22bc886 (origin/wip-rbd-doc-fix) doc: fix 'rbd map' example
+22bc886 doc: fix 'rbd map' example
 4f834fa doc/release-notes: v0.81
 f4e81d3 librbd: clarify license header
 884a6b3 RadosClient: Avoid the DispatchQueue for OSD responses.
@@ -4972,9 +8369,9 @@ f4e81d3 librbd: clarify license header
 9c32cb2 doc: Added usage for pool quotas.
 86754cc doc: Added more discussion of new CRUSH types.
 cabb8f0 doc: Added a section for ceph df.
-8de9501 (tag: v0.81) 0.81
+8de9501 0.81
 4bc5aef doc: Updated packages to reference firefly.
-c18cbef (origin/wip-runxfstests) qa: add run_xfstests_krbd.sh wrapper
+c18cbef qa: add run_xfstests_krbd.sh wrapper
 cd65246 qa: catch up with xfstests changes
 703166c qa: cp run_xfstests.sh run_xfstests-obsolete.sh
 601e25e erasure-code: Ceph distributed storage system
@@ -4983,23 +8380,23 @@ e158ad9 erasure-code: make decode_concat virtual
 6aa45b1 common: s/stringstream/ostream/ in str_map
 319cb50 Make KeyValueStore support "ceph osd perf" command
 06c0a42 Update INSTALL to mention the submodules/recursive
-2dbd85c (origin/wip-sharded-threadpool) WorkQueue: The variable name is corrected. Modified the variable name from shardedpol_cond->shardedpool_cond
+2dbd85c WorkQueue: The variable name is corrected. Modified the variable name from shardedpol_cond->shardedpool_cond
 3e3632e WorkQueue: Removed the unused in_process variable
 b05da1c WorkQueue: Taking care of potential race condition during pause() Introduced two variables to keep track of number of threads paused and drained during threadpool pause/drain. The pause()/drain() call is waiting till number of pause/drain threads equals to toral number of thread pool threads.
 b15bf6b OSD:Derived sharded queue implementation is changed All the threadpool related stuff like stop/pause/drain etc. are not handled by sharded queue anymore. All it is implementing are related to processing,enqueue , signaling of waiting threads and shard queue status. The pg ordering is been taken care of by introducing a map <pg,op> in each shard.
 c24ef00 ceph-common: The Sharded threadpool worker logic changed Now, the _process() of the derived queue is processing one request at a time and the outer loop is controlled by the sharded threadpool. The stop/pause/drain functionalities are controlled by the sharded TP.
 06845d0 OSD: Sharded Op worker queue implementation for handling OSD ops This is the implementation for the client of the sharded thread pool/sharded workQ. Removed the op_wq class and now OSD ops are going through sharded workqueue model which is used by the sharded threadpool. Derived ShardedOpWQ implementation has a data structure called ShardData which has it's own lock/cond and storage. ShardedOpWQ holds a vector of that and the size of the vector is a config option. During enqueue  [...]
 8369c08 ceph-common: Implementation of the sharded threadpool. Threadpool will only be having a single work queue and internally the work queue will be having multiple storage data structures. Based on some logic (which is derived class implementation specific) the work queue will shard the requests among these storage structures. Each storage will be guarded by finer grained sunchronization objects. Sharded threadpool threads will be assigned to work on a shard based on some algorithm w [...]
-83ccba3 (origin/wip-sock-cmp) msg: remove comparison operators for sockaddr_storage
-70afaaa (origin/wip-8452) rgw: fetch object attrs on multipart completion
+83ccba3 msg: remove comparison operators for sockaddr_storage
+70afaaa rgw: fetch object attrs on multipart completion
 669b605 PGLog: initialize complete_to
-b300318 (origin/wip-8311) rgw: if extra data pool name is empty, use data pool name instead
-38405d3 (origin/wip-8265) qa/workunits/cephtool: test setting options using SI units
+b300318 rgw: if extra data pool name is empty, use data pool name instead
+38405d3 qa/workunits/cephtool: test setting options using SI units
 5500437 common/config.cc: allow integer values to be parsed as SI units
 40587d4 test/strtol.cc: Test 'strict_strtosi()'
 67dc575 common/strtol.cc: strict_strtosi() converts str with SI units to uint64_t
 9c56c86 rgw: calc md5 and compare if user provided appropriate header
-cae085b (origin/wip-8472) msg_types.h: Don't use strncmp for comparing sockaddr_in.
+cae085b msg_types.h: Don't use strncmp for comparing sockaddr_in.
 00b9211 doc: Improve man page for rados bench
 2da2699 doc: update pools documentation
 ab59a10 doc: Improve man page for bench
@@ -5012,20 +8409,20 @@ e52b9c6 doc: Added osd pool default size setting example.
 20a04c6 doc: Moved redundant text out of quick-common.
 6786d60 common: WRITE_{EQ,CMP}_OPERATORS_1
 8679cdb osd_types: add pg_log_t::rollback_info_trimmed_to
-f1b890e (origin/wip-8465) osd: fix bad is_active() assert in share_map()
+f1b890e osd: fix bad is_active() assert in share_map()
 f153bc1 doc: Made additional changes s/CRUSH/Ceph for pull request #1855
 c08f481 doc: alter doc CSS for the removal of Apex Sans
 7f46b7b doc: removed Apex Sans font, replace with font from GFE
 692f998 doc: Fixes broken hyperlinks
-93a61df (origin/wip-8259) Makefile: make install-deps rule
+93a61df Makefile: make install-deps rule
 6fe7564 debian: improve package dep list
-e06c58c (origin/wip-smart-df) mon: set min_size to data chunk count for erasure pools
+e06c58c mon: set min_size to data chunk count for erasure pools
 7a9652b mon: include 'max avail' in df output
 2f63a30 mon: right justify df values
 2339d4a vstart.sh: -e to create an ec pool called 'ec'
 297f616 crush: add get_rule_weight_map
 0b5a674 rest-api: key missing for per "rx" and "rwx"
-634780a (origin/wip-8321) remove unused variables, gets all tox envs passing
+634780a remove unused variables, gets all tox envs passing
 23b75b5 add backport of collections.Counter for python2.6
 59b7113 intial take on ceph-brag-client and tox. Python 2.7 passes
 bc85b5d mailmap: Aristoteles Neto affiliation
@@ -5041,45 +8438,45 @@ dbc66d7 mailmap: Walter Huf affiliation
 be9f743 mailmap: Daniel J. Hofmann affiliation
 c758584 mailmap: Kevin Dalley affiliation
 7d1a493 mailmap: Florent Flament affiliation
-2815e4d (origin/wip-tier-doc) doc: Fix cache tier docs
+2815e4d doc: Fix cache tier docs
 08b3cff os: FileStore::create_backend unit tests
 3081652 os: ObjectStore::collect_metadata unit tests
-97023dc (origin/wip-reweight-tree) mon/OSDMonitor: simplify 'osd crush reweight ...' flow
+97023dc mon/OSDMonitor: simplify 'osd crush reweight ...' flow
 576315a mon/OSDMonitor: add 'osd crush reweight-subtree ...'
 2916148 crush: add adjust_subtree_weight()
 4021fb6 osd/: move split_list and split_request into OSD
 26862ca Fix Documentation Typo
 d4e8119 doc: Added primary affinity. Some minor rewrap edits.
-1ac3a50 (origin/wip-8292) better error reporting on incompatible device requirements
+1ac3a50 better error reporting on incompatible device requirements
 c2225f8 mon: fix set cache_target_full_ratio
-4c22c6f (origin/wip-pybind-timeout) pybind/rados: Fix timeouts for small t
-1e0a82f (origin/wip-map-advance) osd: fix map advance limit to handle map gaps
+4c22c6f pybind/rados: Fix timeouts for small t
+1e0a82f osd: fix map advance limit to handle map gaps
 641732d documentation: update pid file description
 c9ff481 Improvements to radosgw docs - Correct a typo (`rados-admin`) in admin docs - Reorder sections in config so "above example" is in expected position
 e741ea4 os/FileStore: include filestore backend, f_type in osd metadata
 827df7d Fix Doc Typo
 9312c5e os/FileStore: refactor backend instantiation
 3cf723c os/FileStore: fix journal-less operation
-200d0ae (origin/wip-multipartition) ceph-disk: Enable creating multiple osds per dev
-2ceb13a (origin/wip-8428) rgw: check appropriate entity permission on put_metadata
+200d0ae ceph-disk: Enable creating multiple osds per dev
+2ceb13a rgw: check appropriate entity permission on put_metadata
 ab0db34 documentation: adapt PG formula for erasure coded pools
 d3af8fa Update architecture.rst:Calculating PG IDs
 99b9682 documentation: update osd pool default size from 2 to 3.
-4d4b77e (origin/wip-java-build) cephfs-java: build against older jni headers
+4d4b77e cephfs-java: build against older jni headers
 6069ff0 doc/release-notes: v0.67.9
-f51e33b (origin/wip-librbd-flush) Avoid extra check for clean object
-9235dcb (origin/wip-mon-get-version) mon: set MMonGetVersionReply tid
+f51e33b Avoid extra check for clean object
+9235dcb mon: set MMonGetVersionReply tid
 ba53889 README: move package dependencies into separate files
 77066a2 README: clean up some whitespace
 c08adbc Fix set_alloc_hint op cause KeyValueStore crash problem
-d04b386 (origin/wip-8342) init-ceph: if we fail to set an OSD's crush position, continue
+d04b386 init-ceph: if we fail to set an OSD's crush position, continue
 46f1eb6 init-ceph: continue loop immediately on daemon start failure
 3bbe29e common/Finisher: add queue(list<Context*>&)
 c2644b2 Update manual-deployment.rst
 74218f3 Define AO_REQUIRE_CAS (fixes FTBFS on 'hppa')
-fb504ba (origin/wip-8373) mon: Fix check of ruleset id on pool update
+fb504ba mon: Fix check of ruleset id on pool update
 1d9e4ac mon: Fix default replicated pool ruleset choice
-799f76a (origin/wip-client-time) mds: remove unused Mutation::now
+799f76a mds: remove unused Mutation::now
 8768857 mds: use mds_stamp for mksnap
 d4bfa39 mds: reset mds_stamp for readdir, rename, link
 a09547b mds: use real timestamp for readdir leases
@@ -5089,8 +8486,8 @@ fd1f9bd mds: do rstat timestamps (rctime, fragstat mtime) in terms of op stamp
 29c6844 mds: make sure mds_stamp is set when we journal
 3569e80 mds: set mds_stamp on lock acquisition
 e4c9c34 mds: add {mds,op}_stamp to Mutation
-401319a (origin/wip-buildroot) ceph.spec.in: remove BuildRoot
-93a5b88 (origin/wip-journal-tool) tools/cephfs: error handling in EventOutput
+401319a ceph.spec.in: remove BuildRoot
+93a5b88 tools/cephfs: error handling in EventOutput
 3207c50 osdc/Journaler: fix obj count in Journaler:erase
 2621b5d tools/cephfs-journal-tool: handle --help cleanly
 d66fa53 tools/MDSUtility: fix crash on bad config
@@ -5126,19 +8523,19 @@ f7e9ff1 tools: Create cephfs-journal-tool
 107821f Fix formatting of header
 ad2e20f client: set timestamp in MDS requests
 a91e072 mds: include timestamp in MClientRequest
-d71839a (origin/wip-doc-openstack) doc: clarify openstack cow sentence
+d71839a doc: clarify openstack cow sentence
 0f7f1e8 doc: note cinder multi-backend restriction
 e92f2d9 doc: link to ephemeral rbd patches
 13d6c3c doc: quick-ceph-deploy cleanup Improve documentation in quick-ceph-deploy.rst Use admin-node consistently. ceph should be installed on admin-node for the following reasons:  "ceph-deploy admin admin-node" assumes that /etc/ceph exists.  "ceph health" requires the use of ceph
 d40ba05 doc: mention admin-node in common documentation
-29f615b (origin/wip-8334) ReplicatedPG::start_flush: fix clone deletion case
-5ff95db (origin/wip-8332) HashIndex: in cleanup, interpret missing dir as completed merge
+29f615b ReplicatedPG::start_flush: fix clone deletion case
+5ff95db HashIndex: in cleanup, interpret missing dir as completed merge
 bc897b8 rados.py: clarify recent change on write return value
 6372118 doc: Clean up pre-flight documentation Mention recent Ceph releases. Move important message about sudo and ceph-deploy closer to the use of ceph-deploy. Mention files created by ceph-deploy comment Separate apt-get from yum command
 06d05fd doc: fix link to FAQ The location of the ceph wiki FAQ has changed. Now, the link from ceph documentation matches the current FAQ location
 03e3ccd doc: Restored glance_api_version=2 setting.
-e8756be (origin/wip-8380-b) osdc/Objecter: flag ops that have been redirected
-cf2b172 (origin/wip-8380) osd: skip out of order op checks on tiered pools
+e8756be osdc/Objecter: flag ops that have been redirected
+cf2b172 osd: skip out of order op checks on tiered pools
 e47049b erasure-code: fix URL in developer notes
 23787ba mailmap: Sahid Orentino Ferdjaoui affiliation
 30ae96a Ensure autogen.sh to be executed at the top-level
@@ -5159,7 +8556,7 @@ e56b88f mds: Add ENoOp for padding journals
 e3a9f66 objecter: Don't warn on multiple admin sockets
 d4a250c common: Add write_stream(ostream) to bufferlist
 5b20deb mds: Add LogEvent::get_type_str()
-d776ec4 (origin/wip-multimds) mds: queue waiters in MDCache::force_dir_fragment()
+d776ec4 mds: queue waiters in MDCache::force_dir_fragment()
 b223055 mds: fix remote auth pin race
 57c89bf mds: fix Server::submit_mdlog_entry()
 f19cd4f mds: drop locks after removing import/export state
@@ -5169,19 +8566,19 @@ affce7a mds: journal rename source inode when rollback rename
 6fe7d17 mds: skip journaling slave rename when possible
 3783653 mds: include all of directory inode's replicas in rmdir witnesses
 08b79ea mds: journal EImportFinish after adjusting subtree auth
-c18da04 (origin/wip-osd-refs) osd: fix narrow session refcount race
+c18da04 osd: fix narrow session refcount race
 2c4391b osd: fix session leak in ms_handle_fast_connect
 15350de Add header cache to DBObjectMap
 ee92a39 MDS: add admin socket cleanup on shutdown
-a78b14e (origin/wip-osdmon-pa-wait) OSDMonitor: set next commit in mon primary-affinity reply
+a78b14e OSDMonitor: set next commit in mon primary-affinity reply
 6dfc544 sample.ceph.conf: minor update
-290ac81 (origin/wip-osdmap-sub-bug) OSD: fix an osdmap_subscribe interface misuse
-6ec3c46 (origin/osd-metadata) osd: include osd_objectstore in metadata reported to mon
+290ac81 OSD: fix an osdmap_subscribe interface misuse
+6ec3c46 osd: include osd_objectstore in metadata reported to mon
 405063b workunits: provide some output in the dirfrag.sh test
-aec5634 (origin/wip-8104) osd_types: remove the pool_id argument from (is|check)_new_interval
+aec5634 osd_types: remove the pool_id argument from (is|check)_new_interval
 f47c160 PG: replace is_split, acting_up_affected with should_restart_peering
 2ee3551 osd_types: factor out is_new_interval from check_new_interval
-c48a4ef (origin/wip-perf-atomic) common/perf_counters: use second atomic to make counters safe to read
+c48a4ef common/perf_counters: use second atomic to make counters safe to read
 ab907c5 doc: Clarified Debian uses sysvinit.
 c71c292 doc: Added rgw print continue guidance.
 b082fd6 doc: Minor edit.
@@ -5194,8 +8591,8 @@ ca833bd doc: Added clarifying text to CRUSH add command.
 26151ec mds: lower IO priority of storing backtrace
 019483f mds: reduce verbosity of handle_client_file_{readlock,setlock}
 ca313c2 mds: add a Server::submit_mdlog_entry() to provide event marking
-9f0825c (origin/wip-8338) OSD: verify that client ops are targeted correctly in the current epoch
-7411477 (origin/wip-8011) ReplicatedPG: block scrub on blocked object contexts
+9f0825c OSD: verify that client ops are targeted correctly in the current epoch
+7411477 ReplicatedPG: block scrub on blocked object contexts
 2ec2182 ReplicatedPG::start_flush: send delete even if there are no snaps
 ca91743 Locker: mark_event in acquire_locks() when blocking or succeeding
 2df68b6 Server: mark events when journaling and replying
@@ -5208,7 +8605,7 @@ ae80a1f MDS: add stubs for an AdminSocketHook
 428319e doc/release-notes: v0.80.1
 19f8849 doc: Improvements to qemu installation.
 6e4455d doc: Added note on Default requiretty for CentOS and others.
-8b682d1 (origin/java-gcj) prioritise use of `javac` executable (gcj provides it through alternatives).
+8b682d1 prioritise use of `javac` executable (gcj provides it through alternatives).
 89fe035 pass '-classpath' option (gcj/javah ignores CLASSPATH environment variable).
 0f4120c look for "jni.h" in gcj-jdk path, needed to find "jni.h" with gcj-jdk_4.9.0
 2001572 mds: deny reconnect for closed session
@@ -5217,7 +8614,7 @@ f35648b mds: properly clear new flag for stale client cap
 58ee556 mds: propagate inode rstat if it has never been propagated
 54a9037 mds: avoid journaling unnecessary dir context
 1f92f55 mds: cleanup usage of MDCache::predirty_journal_parent()
-3d7f527 (origin/wip-da-SCA-20140510) BtrfsFileStoreBackend.cc: fix ::unlinkat() result handling
+3d7f527 BtrfsFileStoreBackend.cc: fix ::unlinkat() result handling
 5f89128 TestLFNIndex.cc: remove unused variable 'mangled_name'
 a445529 rgw_user.cc: remove dead assignment in generate_key()
 b119679 rgw_user.cc: cleanup RGWAccessKeyPool::check_op()
@@ -5238,12 +8635,12 @@ d69fd90 test_rgw_admin_opstate.cc: prefer ++operators for iterators
 0f899c8 test_rgw_admin_meta.cc: prefer ++operators for iterators
 f523d64 TestErasureCodePluginJerasure.cc: prefer ++operators for non-primitive types
 014f050 test/ObjectMap/KeyValueDBMemory.cc: use empty() instead of size()
-d9fff40 (origin/wip-old-out) mon: restore previous weight when auto-marked out osd boots
+d9fff40 mon: restore previous weight when auto-marked out osd boots
 87722a4 mon: remember osd weight when auto-marking osds out
 45281d9 common/perf_counters: use atomics instead of a mutex
 bf3ba60 atomic_t: add atomic64_t
 b24b77a FileStore.cc: remove some dead assignments
-39c071f (origin/wip-update-gitignore) .gitignore: ignore files generated by ctags on topdir
+39c071f .gitignore: ignore files generated by ctags on topdir
 e847d56 add gitignore for wireshark subdir to track *.patch only here
 b9cf708 .gitignore: add some patch/diff related files
 f067013 .gitignore: add no longer used mkcephfs
@@ -5251,7 +8648,7 @@ ea69f6b cls_kvs.cc: return 'r' from get_idata_from_key()
 574a940 cls_kvs.cc: remove dead assignment
 36c1c97 rgw_user.cc:
 a121d01 libcephfs.cc: fix possible NULL pointer deref
-76568aa (origin/wip-8305) Objecter::_op_submit: only replace the tid if it's 0
+76568aa Objecter::_op_submit: only replace the tid if it's 0
 94773ac osd/OSD.cc: fix possible NULL pointer deref in share_map()
 0d67f9b osd/ReplicatedPG: do not queue NULL dup_op
 79c6491 mds/flock.cc: remove dead initialization of 'new_lock_end'
@@ -5263,7 +8660,7 @@ dd700bd osdc/Objecter: resend ops in the last_force_op_resend epoch
 b3203e5 rbd.cc: remove used parameter from set_pool_image_name()
 fe75075 test_librbd.cc: fix sizeof() in malloc call
 eb2def8 CrushWrapper.cc: fix sizeof() call in calloc
-11e5eef (origin/wip-5021) client: fix whitespace in stat relpath
+11e5eef client: fix whitespace in stat relpath
 cdbe6cf client: use __func__ instead of incorrect function name in insert_readdir_results
 3eb2a77 client: make less noise when unlinking during readdir
 d1c872d client: invalidate dentry leases when unlinking
@@ -5271,26 +8668,26 @@ d852a69 client: audit unlink() callers
 3b867d3 TrackedOp: create an "initiated" event on construction
 bdee119 msg: Fix inconsistent message sequence negotiation during connection reset
 b5e4cd1 osd: fix MOSDMarkMeDown name
-6b858be (origin/wip-8319) osd: handle race between osdmap and prepare_to_stop
+6b858be osd: handle race between osdmap and prepare_to_stop
 b640301 osd: fix state method whitespace
 ba01445 Fixed missing initializers issues
 60b1071 Removed extra semicolons
 5986f74 :doc Ceph OSD is standard name This is a method of standardizing the usage of OSD so that "Ceph OSD" is the daemon, and OSD maintains its industry standard usage of Object Storage Device.
-ddc2e1a (origin/wip-8169) rgw: calculate user manifest
-589b639 (origin/wip-7588) osd/ReplicatedPG: carry CopyOpRef in copy_from completion
+ddc2e1a rgw: calculate user manifest
+589b639 osd/ReplicatedPG: carry CopyOpRef in copy_from completion
 db4ccb0 ReplicatedPG: block scrub on blocked object contexts
 3152faf osd/osd_types: add last_force_op_resend to pg_pool_t
-0f19626 (origin/wip-6966) ceph-disk: partprobe before settle when preparing dev
-5690232 (origin/wip-da-sca-20140507) rbd-fuse.c: remove ridiculous linebreak
+0f19626 ceph-disk: partprobe before settle when preparing dev
+5690232 rbd-fuse.c: remove ridiculous linebreak
 7a3724b rbd-fuse.c: fix indentation
 8101f98 rbd-fuse.c: fix -Wmissing-field-initializers
-65ca867 (origin/wip-krbd-fixes) krbd: fix sysfs path in the comment
+65ca867 krbd: fix sysfs path in the comment
 f1d953e krbd: match new with delete, not free()
 082367e rbd.cc: init 'snap_protected' to fix -Wconditional-uninitialized
 0d01563 rbd-fuse.c: init 'rbd' in open_rbd_image()
 cfc885f ObjectCacher::_wait_for_write(): init 'bool done'
 8322878 Objecter::calc_target(): init best_locality with 0
-13750a1 (origin/wip-jcsp-clang) rgw: Remove trailing ; from fn definitions
+13750a1 rgw: Remove trailing ; from fn definitions
 447335a os/FileJournal: remove unused attribute
 f0231ef mon: Fix % escaping (\% should be %%)
 d85b8fa mds: Remove redundant 'using namespace std'
@@ -5310,14 +8707,14 @@ b4b79eb remove superfluous second semicolons at end of lines
 635607f client: skip insert_trace on safe requests
 f1d412c doc: Common graph used in 2 quick start files
 d130763 vstart.sh: fix client admin socket path
-0ee409b (origin/wip-7553) osd: Remove classic scrub code since Argonaut osd can't join
+0ee409b osd: Remove classic scrub code since Argonaut osd can't join
 81c7418 ECUtil.h: clarify calculation with braces
 13f54b7 PG::start_peering_interval: use check_new_interval for same_interval_since
 5752d76 rgw_acl_swift.h: fix #define header guard
 8059c9f rgw_rest_metadata.cc: fix -Wparentheses-equality
 8a0c016 ReplicatedPG.cc: fix -Wparentheses
 a0f59df test_rgw_manifest.cc: fix VLA of non-POD element type
-817985b (origin/wip-fsx-krbd) test_librbd_fsx: align temporary buffers allocated in check_clone()
+817985b test_librbd_fsx: align temporary buffers allocated in check_clone()
 ab9de9c test_librbd_fsx: wire up O_DIRECT mode
 c4a764c test_librbd_fsx: fix a bug in docloseopen()
 421e6c5 test_librbd_fsx: add krbd mode support
@@ -5328,7 +8725,7 @@ d63808e test_librbd_fsx: make resizes sector-size aligned
 7df50ec test_librbd_fsx: align temp_buf by readbdy instead of writebdy
 d13e32e test_librbd_fsx: move prterrcode() and simple_err()
 8d41f86 test_librbd_fsx: update usage
-99400f8 (origin/wip-da-cleanup-includes) osdmaptool.cc: cleanup included headers
+99400f8 osdmaptool.cc: cleanup included headers
 a5e0d80 monmaptool.cc: cleanup included headers
 537385c ceph_osdomap_tool.cc: cleanup included headers
 d57561a ceph_monstore_tool.cc: cleanup included headers
@@ -5355,8 +8752,8 @@ f9a91f2 Update doc to reflect the bahavior change for filestore_merge_threshold
 334c43f client: avoid blindly removing dentries
 25d2469 client: leave NULL dentry in place on ENOENT during lookup
 cc65c39 client: add debugging around traceless reply failures
-545d8ad (origin/wip-8269) rgw: extend manifest to avoid old style manifest
-9968b93 (origin/wip-8299) rgw: fix stripe_size calculation
+545d8ad rgw: extend manifest to avoid old style manifest
+9968b93 rgw: fix stripe_size calculation
 6c2b173 mds: handle export freezen race
 a09070a mds: allow negetive rstat
 22abd7b mds: cancel fragmenting dirfrags when cluster is degraded
@@ -5373,21 +8770,21 @@ f386e16 mds: pre-allocate inode numbers less frequently
 7a066f8 mds: include authpinned objects in remote authpin request
 5b86a13 mds: send dentry unlink message to replicas of stray dentry
 7d1fd66 mds: maintain auth bits during replay
-09beebe (origin/wip-7157) ceph-disk: fix list for encrypted or corrupt volume
+09beebe ceph-disk: fix list for encrypted or corrupt volume
 bd8e026 rgw: don't allow multiple writers to same multiobject part
-03b0d1c (origin/wip-8289) rgw: cut short object read if a chunk returns error
+03b0d1c rgw: cut short object read if a chunk returns error
 2d5d309 Pipe: wait for Pipes to finish running, instead of just stop()ing them
-6ec99f7 (origin/wip-rbd-clang) librbd: check return value during snap_unprotect
+6ec99f7 librbd: check return value during snap_unprotect
 6f2edda ObjectCacher: remove useless assignment
-3e387d6 (origin/wip-8296) osd/ReplicatedPG: fix whiteouts for other cache mode
-5cc5686 (origin/wip-8170) rgw: send user manifest header field
+3e387d6 osd/ReplicatedPG: fix whiteouts for other cache mode
+5cc5686 rgw: send user manifest header field
 e65a9da Revert "Fix installation into user home directory, broken by d3f0c0b"
-b78644e (tag: v0.80) 0.80
+b78644e 0.80
 cdbbf86 doc: Fixed artifacts from merge.
 a31b9e9 doc: Added sudo to setenforce. Restored merge artifact.
 5158272 doc: Added erasure coding and cache tiering notes. Special thanks to Loic Dachary.
 08a4e88 Variable length array of std::strings (not legal in C++) changed to std::vector<std::string>
-ae434a3 (origin/wip-8290) client: check snap_caps in Incode::is_any_caps()
+ae434a3 client: check snap_caps in Incode::is_any_caps()
 4bf20af SimpleMessenger: Don't grab the lock when sending messages if we don't have to
 b038f0c OSD: rename share_map_incoming and share_map_outgoing
 e1277ba OSD: move the peer_epoch and map sharing infrastructure into OSDService
@@ -5436,29 +8833,29 @@ ec16357 OSD: replace handle_pg_scan, handle_pg_backfill with handle_replica_op
 37fac29 OSD::_share_map_incoming: line wrap debug output
 78f310d PG: constify the init() function params
 816b10e RWLock: assert pthread function return values
-e2b62bc (origin/wip-messageless-tracker) TrackedOp: do not require a Message when creating new Ops
+e2b62bc TrackedOp: do not require a Message when creating new Ops
 95fc551 TrackedOp: do not track a Message
 5a3efda TrackedOp: introduce an _unregistered() function to let implementations clean up
 2e674de TrackedOp: rename arrived_at to initiated_at, specify when constructed
 6a559a5 TrackedOp: introduce a _dump_op_descriptor function
 d7e04cc TrackedOp: remove the init_from_message function
-fc3318e (origin/wip-doc-radosgw-80) doc: Fix hyperlink.
+fc3318e doc: Fix hyperlink.
 a7e7219 doc: Index update and librados.
 fcbc5fa doc: Quotas for Admin Ops API.
 e97b56e doc: New Admin Guide for Ceph Object Storage.
 7539281 Fix installation into user home directory, broken by d3f0c0b
 24c5ea8 osd: check blacklisted clients in ReplicatedPG::do_op()
-f92677c (origin/wip-blacklist) osd: check blacklisted clients in ReplicatedPG::do_op()
+f92677c osd: check blacklisted clients in ReplicatedPG::do_op()
 c64b67b ceph-object-corpus: rebase onto firefly corpus
 077e6f8 ceph-object-corpus: v0.80-rc1-35-g4812150
 8bd4e58 Fix out of source builds
 3aee1e0 Fix clone problem
-fd970bb (origin/wip-8155) mon: OSDMonitor: disallow nonsensical cache-mode transitions
-72fdd55 (origin/wip-8283) osd/ReplicatedPG: fix trim of in-flight hit_sets
+fd970bb mon: OSDMonitor: disallow nonsensical cache-mode transitions
+72fdd55 osd/ReplicatedPG: fix trim of in-flight hit_sets
 8472805 Revert "ReplicatedPG: block scrub on blocked object contexts"
-f47f867 (origin/wip-8113) osd: Prevent divide by zero in agent_choose_mode()
+f47f867 osd: Prevent divide by zero in agent_choose_mode()
 b7d31e5 osd, common: If agent_work() finds no objs to work on delay 5 (default) secs
-fe0031d (origin/wip-da-SCA-fixes-20140501) rados.cc: fix typo in help output
+fe0031d rados.cc: fix typo in help output
 8bf039d Dumper::dump_entries(): reduce scope of 'got_data'
 296b8ed PG::read_info(): pass 'const coll_t coll' by reference
 8fad144 PGBackend::be_compare_scrubmaps(): pass pgid by reference
@@ -5478,16 +8875,16 @@ d024594 mailmap: Florent Bautista affiliation
 7b192f7 mailmap: Guang Yang name normalization
 4662890 sample.ceph.conf update:
 9cf470c osd/ReplicatedPG: agent_work() fix next if finished early due to start_max
-9f1a916 (origin/wip-snapmapper-debug) osd/SnapMapper: pass snaps set by const ref
+9f1a916 osd/SnapMapper: pass snaps set by const ref
 6105c35 osd/SnapMapper: debug
-f065809 (origin/wip-7576) mon/OSDMonitor: do not reply to MOSDMarkMeDown if ack is not requested
+f065809 mon/OSDMonitor: do not reply to MOSDMarkMeDown if ack is not requested
 58ace1a osd: fix 'ack' to be 'request_ack' in MOSDMarkMeDown
 49a3b22 osd: ignore MarkMeDown message if we aren't in PREPARING_TO_STOP state
 cf25bdf osd: prevent pgs from getting too far ahead of the min pg epoch
 81e4c47 osd: track per-pg epochs, min
 c879e89 doc: Include links from hardware-recommendations to glossary Included :term: in parts of hardware-recommendations so that glossary links appear. Signed-off-by: Kevin Dalley <kevin at kelphead.org>
-cc04322 (origin/wip-fix-master) mds: note MDiscoverReply encoding change in corpus
-e597068 (origin/wip-mds-shutdown) mds: remove mdsdir in the final step of shutdown MDS
+cc04322 mds: note MDiscoverReply encoding change in corpus
+e597068 mds: remove mdsdir in the final step of shutdown MDS
 1f4a3e1 mds: bump protocol
 1ac05fd doc/release-notes: changelog link
 ffef20f doc/release-notes: final v0.67.8 notes and changelog
@@ -5516,14 +8913,14 @@ ef0de7a OSDMap.cc: prefer prefix ++operator for non-trivial iterator
 5562428 OSDMonitor.cc: prefer prefix ++operator for non-trivial iterator
 e4b3109 KeyValueStore: rename s/logger/perf_logger/
 a84fed6 crush/mapper.c: fix printf format for unsigned variable
-21bbdf5 (origin/wip-early-reply) mds: avoid adding replicas of target dentry to rename witnesses
+21bbdf5 mds: avoid adding replicas of target dentry to rename witnesses
 3a7d668 mds: allow early reply when request's witness list is empty
 41d93aa mds: include authority of the source inode in rename witnesses
-68b440d (origin/wip-8147) osd: automatically scrub PGs with invalid stats
+68b440d osd: automatically scrub PGs with invalid stats
 d01aa5b mon: OSDMonitor: return immediately if 'osd tier cache-mode' is a no-op
-f689e5f (origin/wip-no-anchor) mds: remove discover ino
+f689e5f mds: remove discover ino
 913a5dd mds: remove anchor table
-8217600 (origin/wip-doc-radosgw) doc: Ensure fastcgi socket doesn't clash with gateway daemon socket.
+8217600 doc: Ensure fastcgi socket doesn't clash with gateway daemon socket.
 9c9b92f doc: Verified RHEL configuration.
 ec11bf7 doc: Fixed inconsistent header.
 63b2964 doc: Added rhel-6-server-optional-rpms repo.
@@ -5533,47 +8930,47 @@ f674f36 Copy range using fiemap not entire length
 e08b8b6 librbd: check return code and error out if invalidate_cache fails
 b1df2c3 Changed the -i parameter to -r in order to avoid a conflict with a generic flag interpreted by the common code.
 a027100 rgw: fix url escaping
-9e3b860 (origin/wip-7500-wusui) Fix s3 tests in the rgw workunit.
+9e3b860 Fix s3 tests in the rgw workunit.
 3ec0040 Added a new command line parameter (-i or --image=) that allows rbd-fuse to specify a single image to be made available within the mount directory. The purpose of this is to allow a single RBD to be "mounted" in userspace without opening (and locking) the other RBDs in the pool.
-060105c (origin/wip-8086) ReplicatedPG: we can get EAGAIN on missing clone flush
+060105c ReplicatedPG: we can get EAGAIN on missing clone flush
 d83b8f5 ReplicatedPG: do not preserve op context during flush
 a60e15a doc/release-notes: v0.67.8 notes
-bcf92c4 (origin/wip-8202) rgw: fix url escaping
+bcf92c4 rgw: fix url escaping
 27ec495 Added Java Example
 8f64b5c Update librados-intro.rst
-3e41f92 (origin/wip-client-sleep) client: cleanup unsafe requests if MDS session is reset
+3e41f92 client: cleanup unsafe requests if MDS session is reset
 70ab079 client: wake up cap waiters if MDS session is reset
 b8aa58a client: drop dirty/flushing caps if auth MDS' session is reset
 09a1bc5 client: add asok command to kick sessions that were remote reset
 998b365 Changed the java code example
-5d49782 (origin/wip-7966) mds: terminate readlink result in resapwn
-d0f1806 (origin/wip-8193) ceph_test_rados_api_tier: increase HitSetTrim timeouts
+5d49782 mds: terminate readlink result in resapwn
+d0f1806 ceph_test_rados_api_tier: increase HitSetTrim timeouts
 9ac264a Skipping '_netdev' Debian fstab option
-499adb1 (origin/wip-7941) rados.h,ReplicatedPG: add CEPH_OSD_FLAG_ENFORCE_SNAPC and use on flush
-ddf37d9 (origin/wip-uselocalgithubforqemu-wusui) Use new git mirror for qemu-iotests
-1885792 (origin/wip-8161) ECBackend::continue_recovery_op: handle a source shard going down
+499adb1 rados.h,ReplicatedPG: add CEPH_OSD_FLAG_ENFORCE_SNAPC and use on flush
+ddf37d9 Use new git mirror for qemu-iotests
+1885792 ECBackend::continue_recovery_op: handle a source shard going down
 c0c2361 brag : implement --verbose on client
 7009211 brag : document the zero argument behavior
 2b16a81 brag : meaningfull error messages
-83f8934 (origin/wip-8168) ReplicatedPG::do_osd_ops: consider head whiteout in list-snaps
+83f8934 ReplicatedPG::do_osd_ops: consider head whiteout in list-snaps
 39c1bfc ReplicatedPG::do_op: don't return ENOENT for whiteout on snapdir read
 a83aff5 test_rbd.py: ignore children in cache pools
 aae16ab mon: add ceph osd pool set <pool> auid
-606e725 (origin/wip-7882-wusui) Support latest qemu iotest code
-f631854 (origin/wip-libkrbd) rbd: deprecate --no-settle option
+606e725 Support latest qemu iotest code
+f631854 rbd: deprecate --no-settle option
 0c2b0fb doc: 'rbd showmapped' doesn't need privileges
 4238ffd doc: do not mention modprobe in rbd docs
 0ba3960 rbd: switch to libkrbd for 'rbd {map,showmapped,unmap}' operations
 2521e73 mount.ceph: switch to module_load()
 2651750 rbd: add libkrbd convenience library
-bad34e9 (origin/wip-fs-client) client: check cap ID when handling cap export message
+bad34e9 client: check cap ID when handling cap export message
 383d21d client: avoid releasing caps that are being used
-d726251 (origin/wip-doc-cache-tier) doc: Fix hyperlink to CRUSH maps.
+d726251 doc: Fix hyperlink to CRUSH maps.
 6902e22 doc: Added cache tiering settings to ceph osd pool set.
 0d964bc doc: Added new cache tiering doc to index/TOC.
 44e4e3d doc: Added new cache tiering doc to main docs.
-2182815 (origin/wip-7439) ReplicatedPG: handle ec pools in mark_all_unfound_lost
-6769f4d (tag: v0.80-rc1) 0.80-rc1
+2182815 ReplicatedPG: handle ec pools in mark_all_unfound_lost
+6769f4d 0.80-rc1
 245923e ReplicatedPG: enable mark_unfound_lost delete for ec pools
 009e874 qa/workunits/rbd/copy.sh: skip some tests when tiering is enabled
 c0bff43 qa/workunits/rbd/copy.sh: fix test
@@ -5583,11 +8980,11 @@ ac9b461 common: add module_{load,has_parameter}()
 070a820 configure: check for blkid/blkid.h header
 9004049 rbd: use stringify() in options parsing routines
 be081db stringify: use ostringstream instead of stringstream
-cac15c7 (origin/wip-coverity-respawn) mds: make strncpy in ::respawn safer
-b4eb502 (origin/wip-coverity) osd/osd_types: RWState: initialize snaptrimmer_write_marker
+cac15c7 mds: make strncpy in ::respawn safer
+b4eb502 osd/osd_types: RWState: initialize snaptrimmer_write_marker
 4e5f442 osdc/Objecter: drop unused field
 124a663 doc/release-notes: a bit of prose about firefly
-18aded2 (origin/wip-8139) osd/osd_types: pg_interval_t: include primaries in operator<<
+18aded2 osd/osd_types: pg_interval_t: include primaries in operator<<
 931ae6b osd/osd_types: pg_interval_t: include up_primary in pg_interval_t
 66170f3 osd/osd_types: pg_interval_t: dump primary
 000233f osd: change in up set primary constitutes a peering interval change
@@ -5603,7 +9000,7 @@ c3833d7 doc: Fixed syntax to include 'pool'.
 f3df501 ReplicatedPG: do not create whiteout clones
 caa6356 ReplicatedPG,rados: add CEPH_OSD_[COPY_FROM]_MAP_SNAP_TO_CLONE
 2cb0bac qa/workunits/cephtool/test.sh: make set pg_num test non-racy
-506dce8 (origin/wip-8124) ReplicatedPG: do not use shard for hit_set object names
+506dce8 ReplicatedPG: do not use shard for hit_set object names
 f7e7588 ReplicatedPG::agent_load_hit_sets: take ondisk_read_lock
 16eccdd PG,PGLog: update hit_set during peering
 5821cc7 osd/: propogate hit_set history with repop
@@ -5614,30 +9011,30 @@ ddf1e98 osd: track the number of hit_set archive objects in a pg
 95d0278 ReplicatedPG::mark_all_unfound_lost: delete local copy if necessary
 61b6564 Simple mechanical cleanups
 7a61cdb buffer: adjust #include order
-f9e9365 (origin/wip-8153) Revert "ReplicatedPG::get_snapset_context: assert snap obj is not missing"
-4413670 (origin/wip-throttle-snap-master) osd: throttle snap trimmming with simple delay
+f9e9365 Revert "ReplicatedPG::get_snapset_context: assert snap obj is not missing"
+4413670 osd: throttle snap trimmming with simple delay
 82edda2 test: handle the create-pg delay when testing cache split syntax
-b2112d5 (origin/wip-7784) mon: OSDMonitor: HEALTH_WARN on 'mon osd down out interval == 0'
-09985d2 (origin/wip-7997) mon: wait for PaxosService readable in handle_get_version
+b2112d5 mon: OSDMonitor: HEALTH_WARN on 'mon osd down out interval == 0'
+09985d2 mon: wait for PaxosService readable in handle_get_version
 8fb2388 osd_types: pg_t: add get_ancestor() method
-7e697b1 (origin/wip-8091) ReplicatedPG::recover_replicas: do not recover clones while snap obj is missing
-3ad51c8 (origin/wip-num_objects_omap) osd_types::object_stat_sum_t: fix add/sub for num_objects_omap
-3d0e80a (origin/wip-8048) osd/ReplicatedPG: check clones for degraded
-93c0515 (origin/wip-8130) osdc/Objecter: fix osd target for newly-homeless op
-881680e (origin/wip-8132) mon: set leader commands prior to first election
-40e8dbb (origin/wip-poolset-noblock) mon: EBUSY instead of EAGAIN when pgs creating
+7e697b1 ReplicatedPG::recover_replicas: do not recover clones while snap obj is missing
+3ad51c8 osd_types::object_stat_sum_t: fix add/sub for num_objects_omap
+3d0e80a osd/ReplicatedPG: check clones for degraded
+93c0515 osdc/Objecter: fix osd target for newly-homeless op
+881680e mon: set leader commands prior to first election
+40e8dbb mon: EBUSY instead of EAGAIN when pgs creating
 f22e2e9 spelling corrections
 18caa1c OSD: split pg stats during pg split
 5e4a5dc osd_types::osd_stat_sum_t: fix floor for num_objects_omap
 a3d452a common/obj_bencher: Fix error return check from read that is negative on error
 4db1984 osd/ReplicatedPG: add missing whitespace in debug output
-924064f (origin/wip-mds-op-prio) mds: dynamically adjust priority of committing dirfrags
-0640a08 (origin/wip-8092) mds: fix cap revoke confirmation
+924064f mds: dynamically adjust priority of committing dirfrags
+0640a08 mds: fix cap revoke confirmation
 8c7a5ab Use string instead of char* when saving arguments for rest-bench
 0d2177a ReplicatedPG::get_snapset_context: assert snap obj is not missing
-015df93 (origin/wip-8043) mon/OSDMonitor: require force argument to split a cache pool
-c252345 (origin/wip-8108) osd: OSDMap: have osdmap json dump print valid boolean instead of string
-aa6df59 (origin/wip-7699) mds: Fix respawn (add path resolution)
+015df93 mon/OSDMonitor: require force argument to split a cache pool
+c252345 osd: OSDMap: have osdmap json dump print valid boolean instead of string
+aa6df59 mds: Fix respawn (add path resolution)
 f6db1bc mds: share max size to client who is allowed for WR cap
 358bde5 Add clone test on store_test
 308758b Make rados/rest bench work for multiple write instances without metadata conflict. Signed-off-by: Guang Yang <yguang at yahoo-inc.com>
@@ -5646,8 +9043,8 @@ f6db1bc mds: share max size to client who is allowed for WR cap
 3f7861e ReplicatedPG::agent_work: skip if head is missing
 d39e003 ReplicatedPG::cancel_flush: requeue dup_ops even if !op
 edda6f7 ReplicatedPG::_rollback_to: fix comment, clone certainly could be missing
-37ed4b6 (origin/wip-stress-watch) ceph_test_stress_watch: test over cache pool
-d0a7632 (origin/wip-strerror) Use cpp_strerror() wherever possible, and use autoconf for portability
+37ed4b6 ceph_test_stress_watch: test over cache pool
+d0a7632 Use cpp_strerror() wherever possible, and use autoconf for portability
 502cc61 ReplicatedPG::agent_work: skip hitset objects before getting object context
 0d4aed8 mon: fix Session ref leak in handle_get_version
 d563434 doc: Distro specific rgw.conf example.
@@ -5657,30 +9054,30 @@ bd22cac doc: Distro specific rgw.conf example.
 1310af2 doc: Reworked the simple configuration guide to be more generic.
 6853d21 doc: New admin guide for Ceph Object Gateway. Needs some clarification (todo).
 e02b845 doc: Admin API usage for quotas. Needs additional clarification on syntax.
-d0d162a (origin/wip-8097) buffer: use Mutex instead of Spinlock for raw crcs
+d0d162a buffer: use Mutex instead of Spinlock for raw crcs
 7c17fc4 mds: don't modify inode when calculating client ranges
-65ec24e (origin/wip-client-debug) client: print inode max_size
+65ec24e client: print inode max_size
 d6c71b7 osd/ReplicatedPG: add missing whitespace in debug output
-171d5c5 (origin/wip-librados-tests) ceph_test_rados_api_*: fix build warnings, memset ranges
-8905e3e (origin/wip-8089) osd/ReplicatedPG: handle dup ops earlier in do_op
+171d5c5 ceph_test_rados_api_*: fix build warnings, memset ranges
+8905e3e osd/ReplicatedPG: handle dup ops earlier in do_op
 26659a5 mds: don't issue/revoke caps before client has caps
 bd8aa6f mds: do file recover after authpin inode
-a72bcdd (origin/wip-8085) osd/ReplicatedPG: handle misdirected do_command
-4ccc845 (origin/wip-mds-typo) mds: fix typo in Server::do_rename_rollback()
-809d0fa (origin/wip-hitset-missing) osd/ReplicatedPG: handle missing hit_set on HITSET_GET rados op
+a72bcdd osd/ReplicatedPG: handle misdirected do_command
+4ccc845 mds: fix typo in Server::do_rename_rollback()
+809d0fa osd/ReplicatedPG: handle missing hit_set on HITSET_GET rados op
 53a4b71 ceph_test_rados_api_watch_notify: test over cache pool
 b31107b test/librados/TestCase: add Param option that can set up a cache pool
-3ce4078 (origin/wip-7940) test: Add --pool-snaps option to ceph_test_rados
+3ce4078 test: Add --pool-snaps option to ceph_test_rados
 09b5538 test: Fix ceph_test_rados to not core dump with invalid arguments
 b6b0c3b librados: Add ObjectWriteOperation::snap_rollback() for pool snapshots
 6e0496b librados: Rollback interface additions
-6ff645f (origin/wip-8008) osd/PG: fix repair_object when missing on primary
-19acfeb (origin/wip-8063) ceph_test_librados_tier: tolerage EAGAIN from pg scrub command
-39b9d9d (origin/wip-osd-boot) mon/OSDMonitor: fix osd epoch in boot check
-78df66f (origin/wip-8077) osd/ReplicatedPG: skip missing hit_sets when loading into memory
-7077438 (origin/wip-8054) mds: finish table servers recovery after creating newfs
+6ff645f osd/PG: fix repair_object when missing on primary
+19acfeb ceph_test_librados_tier: tolerage EAGAIN from pg scrub command
+39b9d9d mon/OSDMonitor: fix osd epoch in boot check
+78df66f osd/ReplicatedPG: skip missing hit_sets when loading into memory
+7077438 mds: finish table servers recovery after creating newfs
 052519e Revert "mds: finish table servers recovery after creating newfs"
-82d8397 (origin/wip-5170-firefly) rgw: update bucket / object rewrite
+82d8397 rgw: update bucket / object rewrite
 6f2ee99 radosgw-admin: add some conditions for bucket rewrite
 9130e7d radosgw-admin: new 'bucket rewrite' command
 f12bccc radosgw-admin: check params for object rewrite
@@ -5690,33 +9087,33 @@ ad40356 Included the total cluster size in components_count object
 db3e0b5 erasure-code: document the ruleset-root profile parameter
 6d42bd9 While generating crush_types, avoiding item parsing, and calculating type count by just iterating thorugh buckets list
 1987832 Bug fix in the way crush_type is extracted from osd crush dump
-072d371 (origin/wip-lockdep) RWLock: make lockdep id mutable
+072d371 RWLock: make lockdep id mutable
 da0d382 Revert "RWLock: don't assign the lockdep id more than once"
 632098f common_init: remove dup lockdep message
 3c54a49 Wordsmith the erasure-code doc a bit
 f6c2073 mds: finish table servers recovery after creating newfs
 3db7486 mds: issue new caps before starting log entry
-07e8ee2 (origin/wip-7437) test: Add EC testing to ceph_test_rados_api_aio
+07e8ee2 test: Add EC testing to ceph_test_rados_api_aio
 69afc59 test: Add multiple write test cases to ceph_test_rados_api_aio
 d99f1d9 test, librados: aio read *return_value consistency, fix ceph_test_rados_api_aio
 3d290c2 test: Add EC unaligned append write test to ceph_test_rados_api_io
 39bf68c pybind, test: Add python binding for append and add to test
 d211381 pybind: Check that "key" is a string
 9812720 librados, test: Have write, append and write_full return 0 on success
-008663a (origin/wip-7499) rgw, radosgw-admin: bucket link uses bucket instance id now
-6ce7116 (origin/wip-7786) civetweb: update subproject
+008663a rgw, radosgw-admin: bucket link uses bucket instance id now
+6ce7116 civetweb: update subproject
 43d837d rgw: radosgw-admin object rewrite
-4c99e97 (origin/wip-8062) mon/OSDMonitor: ignore boot message from before last up_from
+4c99e97 mon/OSDMonitor: ignore boot message from before last up_from
 a8f0953 osd/ReplicatedPG: adjust obc + snapset_obc locking strategy
-86b8594 (origin/wip-6789) mon: Monitor: suicide on start if mon has been removed from monmap
+86b8594 mon: Monitor: suicide on start if mon has been removed from monmap
 02048dc mds: guarantee message ordering when importing non-auth caps
 ac51fca mds: include truncate_seq/truncate_size in filelock's state
 808ba13 mds: remove wrong assertion for remote frozen authpin
 860d727 osdc/Objecter: move mapping into struct, helper
-18642ed (origin/wip-8042) mon: tell peers missing features during probe
+18642ed mon: tell peers missing features during probe
 39ca440 mon: move required_features back into Monitor
 c8039ab mon: ignore sync clients without required_features
-50ed65f (origin/wip-6480) auth: remove unused get_global_id() method
+50ed65f auth: remove unused get_global_id() method
 b297689 auth: make AuthClientHandler::validate_ticket() protected
 3ccef66 auth: AuthClientHandler const cleanup
 9af10b2 auth: CephxProtocol const cleanup
@@ -5731,34 +9128,34 @@ b297689 auth: make AuthClientHandler::validate_ticket() protected
 edd542e tools: Improve ceph_scratchtoolpp
 34d69cd mon: refresh elector required_features when they change
 b3b502f mon/Elector: ignore ACK from peers without required features
-f1c6b65 (origin/wip-8028) ceph.spec.in: require redhat-lsb-core
+f1c6b65 ceph.spec.in: require redhat-lsb-core
 ae09361 mailmap: Yan, Zheng name normalization
 5ee0b7b mailmap: Nikola Kotur affiliation
-02aedbc (origin/wip-8004) client: wake up umount waiter if receiving session open message
+02aedbc client: wake up umount waiter if receiving session open message
 409999c rbd: Prevent Seg fault by checking read result in snap_read_header()
 9c6733e librados: Allow rados_pool_list() to get NULL buffer if len == 0
 1848a23 librados: Fix typo for read functions documentation
-a8330f5 (origin/wip-5469) librbd: fix zero length request handling
-22a0c1f (origin/wip-8045) osd: do not block when updating osdmap superblock features
+a8330f5 librbd: fix zero length request handling
+22a0c1f osd: do not block when updating osdmap superblock features
 43f0519 doc: Made minor changes to quick start preflight for RHEL.
 ab7a25c doc: Notes and minor modifications to gateway installation doc.
 1d74170 pipe: only read AuthSessionHandler under pipe_lock
-084db49 (origin/wip-die-mkcephfs) remove mkcephfs
-277e7ac (origin/wip-5835) debian: update ceph description
+084db49 remove mkcephfs
+277e7ac debian: update ceph description
 72dc732 ceph.spec: update ceph description
-79ac2f7 (origin/wip-8001) osd/PG: set CREATING pg state bit until we peer for the first time
-4de49e8 (origin/wip-8019) os/FileStore: reset journal state on umount
+79ac2f7 osd/PG: set CREATING pg state bit until we peer for the first time
+4de49e8 os/FileStore: reset journal state on umount
 1cdb738 vstart.sh: make crush location match up with what init-ceph does
-d2edd9c (origin/wip-8031) osd: drop unused same_for_*() helpers
+d2edd9c osd: drop unused same_for_*() helpers
 5d61161 osd: drop previous interval ops even if primary happens to be the same
 d3833dd osd: make misdirected checks explicit about replicas, flags
-55cfb14 (origin/wip-8026) mds: fix shared_ptr MDRequest bugs
-2ba00bd (origin/wip-7914) erasure-code: thread-safe initialization of gf-complete
+55cfb14 mds: fix shared_ptr MDRequest bugs
+2ba00bd erasure-code: thread-safe initialization of gf-complete
 0022d91 mailmap: Paul Meserve affiliation
 f261ad1 mailmap: Michael Nelson affiliation
 89a5ec7 mailmap: Sandon Van Ness name normalization
 010dff1 osd_types: fix pg_stat_t::encode, object_stat_sum_t::decode version
-22894c3 (origin/wip-4354-shared_ptr) SimpleLock: Switch MutationRef& for MutationRef in get_xlock()
+22894c3 SimpleLock: Switch MutationRef& for MutationRef in get_xlock()
 5f2ccab MDCache: use raw MutationImpl* instead of MutationRef in a few places
 6c3fc3e Locker: use raw MutationImpl* instead of MutationRef in several places
 920fd6c Locker: use a null_ref instead of NULL
@@ -5771,33 +9168,33 @@ a6a0800 Server: use MutationRef instead of raw pointer
 4dedab6 MDS: switch cache object classes to use MutationRef instead of raw pointers
 9a4a429 ceph_test_rados_api_misc: print osd_max_attr_size
 4b66868 doc: Removed --stable arg and replaced with --release arg for ceph-deploy.
-7273d9e (origin/wip-7975) osd/ReplicatedPG: warn if invalid stats prevent us from activating agent
+7273d9e osd/ReplicatedPG: warn if invalid stats prevent us from activating agent
 02d7e84 osd/ReplicatedPG: dump agent state on pg query
 fa46aec osd/ReplicatedPG: kickstart the agent if scrub stats become valid
-07099ec (origin/wip-8003) ReplicatedPG: do not evict head while clone is being promoted
+07099ec ReplicatedPG: do not evict head while clone is being promoted
 67b2342 ReplicatedPG::trim_object: account evicted prev clone for stats
 6ff6c19 ReplicatedPG::make_writeable: check for evicted clone before adjusting for clone_overlap
 a8a49a0 ReplicatedPG: use get_clone_bytes on evict/promote
 3286283 ReplicatedPG::_scrub: account for clone_overlap on each clone
 4d5db06 ReplicatedPG::find_object_context: check obs.exists on clone obc before checking snaps
 d0e2c98 ReplicatedPG::finish_promote: add debugging assert for clone_size
-e213ee1 (origin/wip-7919) qa: workunits: mon: auth_caps.sh: test 'auth' caps requirements
+e213ee1 qa: workunits: mon: auth_caps.sh: test 'auth' caps requirements
 85a1cf3 mon: MonCommands: have all 'auth' commands require 'execute' caps
-4c2d73a (tag: v0.79) 0.79
+4c2d73a 0.79
 51da3bb mds: fix uninit val in MMDSSlaveRequest
-09586ec (origin/wip-shrink-icache) client: pin parent dentry of inode who has ll_ref > 0
+09586ec client: pin parent dentry of inode who has ll_ref > 0
 76cbd5d mds: fix uninit MMDSSlaveRequest lock_type
-4ea9e48 (origin/wip-8002) osd: fix map subscription in YOU_DIED osd_ping handler
+4ea9e48 osd: fix map subscription in YOU_DIED osd_ping handler
 2f7522c msgr: add ms_dump_on_send option
 87e6a62 mds: fix uninitialized fields in MDiscover
 67fd421 mon: wait for quorum for MMonGetVersion
 a75af4c client: try shrinking kernel inode cache when trimming session caps
-82015e4 (origin/wip-7958) client: release clean pages if no open file want RDCACHE
+82015e4 client: release clean pages if no open file want RDCACHE
 9484daf osd: disable agent when stats_invalid (post-split)
-232ac1a (origin/wip-7994) OSD: _share_map_outgoing whenever sending a message to a peer
-6f40b64 (origin/wip-7993) ceph-post-file: use getopt for multiple options, add longopts to help
-2f6a62b (origin/wip-7992) ceph-post-file: fix installation of ssh key files
-e02b7f9 (origin/wip-7983) osd/ReplicatedPG: do not hit_set_persist while potentially backfilling hit_set_*
+232ac1a OSD: _share_map_outgoing whenever sending a message to a peer
+6f40b64 ceph-post-file: use getopt for multiple options, add longopts to help
+2f6a62b ceph-post-file: fix installation of ssh key files
+e02b7f9 osd/ReplicatedPG: do not hit_set_persist while potentially backfilling hit_set_*
 4aef403 doc/release-notes: note about emperor backport of mon auth fix
 db266a3 mon: MonCommands.h: have 'auth' read-only operations require 'x' cap
 9caf3db Migrator: use a null ref instead of NULL when calling into path_traverse
@@ -5819,48 +9216,48 @@ abc19dd client: drop Fr cap before gettattr CEPH_STAT_CAP_SIZE
 2d5bd84 client: assign implemented caps to caps field of MClientCaps
 1538a98 client: hold Fcr caps during readahead
 701c22a client: implement RDCACHE reference tracking
-b219c8f (origin/wip-cache-create-fix) ReplicatedPG: fix CEPH_OSD_OP_CREATE on cache pools
-be8b228 (origin/wip-7922) osd: Send REJECT to all previously acquired reservations
+b219c8f ReplicatedPG: fix CEPH_OSD_OP_CREATE on cache pools
+be8b228 osd: Send REJECT to all previously acquired reservations
 18201ef doc/release-notes: v0.79 release notes
 4dc6266 Fix byte-order dependency in calculation of initial challenge
-6cb50d7 (origin/wip-7964) ReplicatedPG::_delete_oid: adjust num_object_clones
+6cb50d7 ReplicatedPG::_delete_oid: adjust num_object_clones
 0f2ab4d ReplicatedPG::agent_choose_mode: improve debugging
-0552ecb (origin/wip-7978) rgw: only look at next placement rule if we're not at the last rule
+0552ecb rgw: only look at next placement rule if we're not at the last rule
 eb23ac4 ReplicatedPG::agent_choose_mode: use num_user_objects for target_max_bytes calc
 cc9ca67 ReplicatedPG::agent_choose_mode: exclude omap objects for ec base pool
 a130a44 osd/: track num_objects_omap in pg stats
 9894a55 ReplicatedPG: handle FLAG_OMAP on promote and copyfrom
-a11b3e8 (origin/wip-7858) ReplicatedPG::do_op: use get_object_context for list-snaps
+a11b3e8 ReplicatedPG::do_op: use get_object_context for list-snaps
 78e9813 ReplicatedPG: do not create snapdir on head eviction
-31df91e (origin/wip-backfill) osd: add 'osd debug reject backfill probability' option
-d323634 (origin/wip-test-alloc-hint-ec-fix) qa: test_alloc_hint: set ec ruleset-failure-domain to osd
-8e46fe0 (origin/wip-vstop-unmap) stop.sh: unmap rbd images when stopping the whole cluster
+31df91e osd: add 'osd debug reject backfill probability' option
+d323634 qa: test_alloc_hint: set ec ruleset-failure-domain to osd
+8e46fe0 stop.sh: unmap rbd images when stopping the whole cluster
 afc5dc5 stop.sh: do not trace commands
 0110a19 stop.sh: indent 4 spaces universally
-e4a8535 (origin/wip-vstart-erasure-code-default) vstart: set a sensible default for ruleset-failure-domain
-c43822c (origin/wip-7965) lockdep: reset state on shutdown
+e4a8535 vstart: set a sensible default for ruleset-failure-domain
+c43822c lockdep: reset state on shutdown
 7a49f3d lockdep: do not initialize if already started
-6bf46e2 (origin/wip-7915) OSDMap: bump snap_epoch when adding a tier
-01445d5 (origin/wip-7937) ReplicatedPG::_scrub: don't bail early for snapdir
-5f680f9 (origin/wip-7659) ReplicatedPG::_verify_no_head_clones: missing implies that the clone exists
-7909262 (origin/wip-init) debian: fix control to allow upgrades
+6bf46e2 OSDMap: bump snap_epoch when adding a tier
+01445d5 ReplicatedPG::_scrub: don't bail early for snapdir
+5f680f9 ReplicatedPG::_verify_no_head_clones: missing implies that the clone exists
+7909262 debian: fix control to allow upgrades
 17732dc debian: move rbdmap config and sysvinit/upstart scripts into ceph-common
 1d42de5 init.d: correcting rbdmap init order:
-771e88a (origin/wip-dirfrag) mds: fix check for merging/spliting dirfrag
-43bc39b (origin/wip-largedir2, origin/wip-largedir) mds: ignore CDir::check_rstats() when debug_scatterstat is off
+771e88a mds: fix check for merging/spliting dirfrag
+43bc39b mds: ignore CDir::check_rstats() when debug_scatterstat is off
 5a9b99a mds: initialize bloom filter according to dirfrag size
 16af25f mds: add dentries in dirfrag to LRU in reverse order
 06ecb2c mds: handle freeze authpin race
 d1967f3 mds: treat cluster as degraded when there is clientreplay MDS
 b65a818 mds: don't start new segment while finishing disambiguate imports
 ff44a99 mds: trim non-auth subtree more aggressively
-c09f58e (origin/wip-7450) radosgw-admin: don't always generate access key for subusers
+c09f58e radosgw-admin: don't always generate access key for subusers
 e1783a6 rgw: subuser creation can be idempotent
 f54c684 radosgw-admin: fix subuser modify
 e095b1d debian: make ceph-common own etc/ceph, var/log/ceph
-d4d39a0 (origin/wip-7907) osd/ReplicatedPG: mark_unrollbackable when _rollback_to head
+d4d39a0 osd/ReplicatedPG: mark_unrollbackable when _rollback_to head
 1acd547 debian: move ceph-rest-api from ceph-common to ceph
-a76a129 (origin/wip-7939) PG: set role for replicated even if role != shard
+a76a129 PG: set role for replicated even if role != shard
 ef94fcd test: Add EC testing to ceph_test_rados_api_snaphots
 1871fe7 test: Add EC testing to ceph_test_rados_api_lock
 759878c test: Add EC testing to ceph_test_rados_api_tier
@@ -5876,8 +9273,8 @@ c106976 librados: Add missing C variants for alignment
 2d7da1d librados: Add missing rados_wait_for_latest_osdmap()
 dde1c91 osd/ReplicatedPG: continue scrub logic when snapset.head_exists doesn't match
 c2e5a42 osd/ReplicatedPG: handle snapdir properly during scrub
-ed5a5e0 (origin/wip-7935) rgw: reverse logic to identify next part
-48fbce9 (origin/wip-7916) ReplicatedPG: improve get_object_context debugging
+ed5a5e0 rgw: reverse logic to identify next part
+48fbce9 ReplicatedPG: improve get_object_context debugging
 5c9b8a2 osd/PG: debug cached_removed_snaps changes
 824da20 librbd: skip zeroes when copying an image
 e44f85c qa/workunits/cephtool/test.sh: test 'osd pg-temp ...'
@@ -5887,8 +9284,8 @@ e44f85c qa/workunits/cephtool/test.sh: test 'osd pg-temp ...'
 07dcffa OSDMonitor: add 'osd pg-temp ...' command
 d3183e3 java/test: ceph.file.layout xattr is still not there now
 cd1a9c4 Add ceph-client-debug and jerasure shared objects to RPM spec file.
-81853c6 (origin/wip-7912) mon/PGMap: clear pool sum when last pg is deleted
-8c761c4 (origin/wip-ec-profile-idempotent) mon: make 'ceph osd erasure-code-profile set ...' idempotent
+81853c6 mon/PGMap: clear pool sum when last pg is deleted
+8c761c4 mon: make 'ceph osd erasure-code-profile set ...' idempotent
 7d321d8 qa/workunits/rados/test_alloc_hint: fix erasure syntax
 2826fda doc: fix typos in glossary
 7fa025e .gitignore: add examples/librados files
@@ -5897,33 +9294,33 @@ cd1a9c4 Add ceph-client-debug and jerasure shared objects to RPM spec file.
 ff51d3f doc: pgbackend dev doc outdated notice
 d5973aa doc: update jerasure plugin
 402e104 doc: erasure code developer notes updates
-74fa3a7 (origin/wip-fuse-access) fuse: implement 'access' low level function
+74fa3a7 fuse: implement 'access' low level function
 e672c52 osd/ReplicatedPG: fix cache tier scrub again
 ea47b4a ceph_test_rados_api_tier: improve promote+scrub test
-fc1a424 (origin/wip-7849) ceph-conf: use global_pre_init to avoid starting logging
+fc1a424 ceph-conf: use global_pre_init to avoid starting logging
 98551fc global: separate first half of global_init into global_pre_init
-c3292e4 (origin/wip-cache-scrub) ceph_test_rados_api_tier: improve cache tier + scrub test
+c3292e4 ceph_test_rados_api_tier: improve cache tier + scrub test
 cfd6f23 osd/ReplicatedPG: tolerate trailing missing clones on cache tiers
 b8ea656 java/test: ceph.file.layout xattr is not there now
 4f9f7f8 qa/workunits/fs/misc/layout_vxattrs: ceph.file.layout is not listed
 b71e64d mds: find approximal bounds when adjusting subtree auth
 fd28ad5 doc: erasure-code development complete
-399de24 (origin/wip-sse-i386) erasure-code: do not attempt to compile SSE4 on i386
+399de24 erasure-code: do not attempt to compile SSE4 on i386
 07ccc4e mds: commit new dirfrag before splitting it
 bd6e35c rbd.cc: yes, cover formatted output as well.  sigh.
 780fa91 Revert "ceph-conf: do not log"
 b5a6320 Revert "ceph-conf: no admin_socket"
 44afc23 init: fix OSD startup issue
 fd76fec rbd.cc: tolerate lack of NUL-termination on block_name_prefix
-056151a (origin/wip-7888) mon/MonClient: use keepalive2 to verify the mon session is live
+056151a mon/MonClient: use keepalive2 to verify the mon session is live
 d747d79 msgr: add KEEPALIVE2 feature
-1aa1d93 (origin/wip-7904) ReplicatedPG: hit_set_setup, agent_setup, skip if !active
-f1c7b4e (origin/wip-7867) client: pin Inode during readahead
+1aa1d93 ReplicatedPG: hit_set_setup, agent_setup, skip if !active
+f1c7b4e client: pin Inode during readahead
 032d4ec osdc/ObjectCacher: call read completion even when no target buffer
-68dc0c6 (origin/wip-7903) rgw: move max_chunk_size initialization
+68dc0c6 rgw: move max_chunk_size initialization
 a6be1d6 PG: always clear agent_state in clear_primary_state
 dfd3cb5 rgw: only look at prefetched data if we actually prefetched
-d78e678 (origin/wip-7902) osd/PG: fix choose_acting revert to up case
+d78e678 osd/PG: fix choose_acting revert to up case
 0bb911c mds: don't trim non-auth root inode/dirfrag
 90b4e53 mds: include authority of the overwrited inode in rename witnesses
 367987f mds: don't increase nlink when rollback stray reintegration
@@ -5945,7 +9342,7 @@ e535f7f mds: avoid journaling non-auth opened inode
 ffcbcdd mds: handle race between cache rejoin and fragmenting
 6963a8f mds: handle interaction between slave rollback and fragmenting
 72eaa5e doc: fix typos in tiering dev doc
-1b5e8f4 (origin/wip-7880) mds: properly propagate dirty dirstat to auth inode
+1b5e8f4 mds: properly propagate dirty dirstat to auth inode
 38d4c71 Pipe: rename keepalive->send_keepalive
 c64d03d mon/OSDMonitor: require OSD_CACHEPOOL feature before using tiering features
 69321bf mon/OSDMonitor: prevent setting hit_set unless all OSDs support it
@@ -5953,12 +9350,12 @@ eb71924 osd/ReplicatedPG: tolerate missing clones in cache pools
 6508d5e osd/ReplicatedPG: improve clone vs head checking
 9e2cd5f osd/ReplicatedPG: do not assert on clone_size mismatch
 7f026ba ceph_test_rados_api_tier: scrub while cache tier is missing clones
-7cb1d3a (origin/wip-fix-pools) qa/workunits/mon/pool_ops.sh: fix test
+7cb1d3a qa/workunits/mon/pool_ops.sh: fix test
 233801c qa/workunits/mon/pool_ops.sh: use expect_false
-72715b2 (origin/wip-7849-b) ceph-conf: no admin_socket
+72715b2 ceph-conf: no admin_socket
 fb20823 jerasure: fix up .gitignore
 acc31e7 ceph-conf: do not log
-ffd69ab (origin/wip-7876) rgw: use s->content_length instead of s->length
+ffd69ab rgw: use s->content_length instead of s->length
 501e31d logrotate: do not rotate empty logs (2nd logrotate file)
 91176f1 erasure-code: test encode/decode of SSE optimized jerasure plugins
 b76ad97 erasure-code: test jerasure SSE optimized plugins selection
@@ -5970,26 +9367,26 @@ cc0cc15 erasure-code: gf-complete / jerasure modules updates
 12d4f38 erasure-code: allow loading a plugin from factory()
 506d2bb logrotate improvement: do not rotate empty logs
 dc3ce58 osd: do not make pg_pool_t incompat when hit_sets are enabled
-92859ed (origin/wip-7837) ReplicatedPG: include pending_attrs when reseting attrs in WRITEFULL
-b6a431b (origin/wip-7874) ReplicatedPG: disable clone subsets for cache pools
-56974b9 (origin/wip-7860) test: Wait for tier removal before next test starts
-7999bc9 (origin/wip-7828) ReplicatedPG:: s/_delete_head/_delete_oid, adjust head_exists iff is_head
+92859ed ReplicatedPG: include pending_attrs when reseting attrs in WRITEFULL
+b6a431b ReplicatedPG: disable clone subsets for cache pools
+56974b9 test: Wait for tier removal before next test starts
+7999bc9 ReplicatedPG:: s/_delete_head/_delete_oid, adjust head_exists iff is_head
 9865409 rgw: configurable chunk size
-832c007 (origin/wip-7871) RadosModel: allow --no-omap to be specified seperately from --ec-pool
-555ae12 (origin/wip-7870) ReplicatedPG::do_osd_ops: only return ENOTSUP on OMAP write ops
-6cb8595 (origin/wip-7835) ReplicatedPG::make_writeable: fill in ssc on clone
-21fc535 (origin/wip-7823) osd: trim copy-get backend read to object size
+832c007 RadosModel: allow --no-omap to be specified seperately from --ec-pool
+555ae12 ReplicatedPG::do_osd_ops: only return ENOTSUP on OMAP write ops
+6cb8595 ReplicatedPG::make_writeable: fill in ssc on clone
+21fc535 osd: trim copy-get backend read to object size
 18c3e9e osd: fix tests due to no default erasure-code ruleset
 29f7420 Revert "osd: create the erasure-code ruleset in OSDMap::build_simple"
 4cf9a73 fix bug in 'defaultweight' calculation on OSD start.
-2779e2a (origin/wip-7863-wusui) Make sure s3_utilities are found.
+2779e2a Make sure s3_utilities are found.
 38bcd3c osd: start_flush() should check for missing clones and return if requested
 bf87562 osd: Error from start_flush() not checked in agent_maybe_flush()
 ed43aa0 osd: Add logging of missed l_osd_agent_skip cases
 d1d99df osd: Improve logging output including pg_shard_t as osd#(shard)
 4ac7808 minor corrections to package descriptions
 012bb5f minor init.d scripts lintianisation:
-14b743b (origin/wip-7676-firefly) rgw: don't modify rgw_bucket for data extra pool
+14b743b rgw: don't modify rgw_bucket for data extra pool
 7989cbd rgw: multipart meta object uses extra data pool
 f023f90 rgw: zone placement info includes extra data pool
 3677076 rgw: add data_extra pool to bucket
@@ -6001,38 +9398,38 @@ a310ea2 mailmap: Guang Yang affiliation
 2faf271 mailmap: Mohammad Salehe affiliation
 27c28ad mailmap: Sharif Olorin affiliation
 9fd61c7 mailmap: Stephan Renatus affiliation
-01b9966 (origin/wip-dz-watch-test) qa: Add ceph_multi_stress_watch for rep and ec
-6ec28fd (origin/wip-6465) ensure pybind is in the PYTHONPATH
+01b9966 qa: Add ceph_multi_stress_watch for rep and ec
+6ec28fd ensure pybind is in the PYTHONPATH
 37899fa be nitpicky about missing references
 4186916 Revert "erasure-code: gf-complete detects SSE at runtime"
-3f014da (origin/wip-status-function-names) mon: Monitor: s/_mon_status()/get_mon_status()/
+3f014da mon: Monitor: s/_mon_status()/get_mon_status()/
 ed780ad mon: Monitor: s/get_status()/get_cluster_status()/
-08a3d6b (origin/wip-listxattr) client: don't include ceph.{file,dir}.layout vxattr in listxattr()
+08a3d6b client: don't include ceph.{file,dir}.layout vxattr in listxattr()
 1bc680e erasure-code: deactivate SSE optimizations
-8116394 (origin/wip-7827) msg: set recv stamp for local delivery
+8116394 msg: set recv stamp for local delivery
 3ebeb8e mailmap: Allen Samuels affiliation
 a217cd6 mailmap: Warren Usui is with Inktank
 a4c652c document adding dev key for custom Apache/FCGI install
-fb72330 (origin/wip-7810) mds: reset connection priv after connection's session is removed
+fb72330 mds: reset connection priv after connection's session is removed
 e4f2d9f doc/release-notes: 0.78 final notes
 185bcc4 doc/release-notes: final 0.78
 1817c23 rgw: get rid of a memory allocation
 1e7cd10 rgw: remove memory allocation
-f6c746c (tag: v0.78) 0.78
+f6c746c 0.78
 28d8e7f Revert "ReplicatedPG: disallow trans which atomically create and remove an object"
 49a0190 doc/release-notes: 0.78 notes
 dbcf447 erasure-code: gf-complete detects SSE at runtime
 8c7f6c1 autotools: AX_SSE detects the compiler SSE capabilities
-5a3f6c7 (origin/wip-7438) test: Add erasure coding to stress watch test
+5a3f6c7 test: Add erasure coding to stress watch test
 6fb6588 test: Reorg multi_stress_watch to prepare for ec option
 b110275 test: Fix ceph_filestore_dump.sh test for new EC pool creation
 dad0faf tests: use ^ instead of ! as invalid char
 d4d77d7 doc/release-notes: stop confusing sphinx
-78ede90 (origin/wip-7780) objecter: waive OSDMAP_FULL check for MDS
-a4849fb (origin/wip-flush-journal-asok) qa: test_alloc_hint: flush journal before prodding the FS
+78ede90 objecter: waive OSDMAP_FULL check for MDS
+a4849fb qa: test_alloc_hint: flush journal before prodding the FS
 9d31c1b osd: add flush_journal admin socket command
-b444e88 (origin/wip-multimds-fixes) mds: fix typo in MDCache::handle_cache_rejoin_strong
-ddbb2f7 (origin/wip-jerasure-submodules) erasure-code: add gf-complete / jerasure submodules
+b444e88 mds: fix typo in MDCache::handle_cache_rejoin_strong
+ddbb2f7 erasure-code: add gf-complete / jerasure submodules
 5c34a0f erasure-code: remove copy of gf-complete / jerasure
 0d167d2 mds: fix NULL pointer dereference in Server::handle_client_rename
 272b53b mds: avoid infinite loop in MDLog::submit_entry()
@@ -6053,23 +9450,23 @@ e909eaf mds: drop auth pins before waiting for dir unfreeze
 bc3325b mds: fix stack overflow caused by nested dispatch
 63a597b mds: rollback slave request after slave prepare is journalled
 a1ff489 mds: treat flushing as dirty when rejoining scattered locks
-1268dbb (origin/wip-tid_t) Change tid_t to ceph_tid_t to avoid conflicts with a popular type
-b227426 (origin/wip-no-version) Add NO_VERSION to avoid rebuilding ceph_ver.h and relinking
-cfb04b2 (origin/wip-limit-libkeyutils) Makefiles: remove libkeyutils from every binary except two
+1268dbb Change tid_t to ceph_tid_t to avoid conflicts with a popular type
+b227426 Add NO_VERSION to avoid rebuilding ceph_ver.h and relinking
+cfb04b2 Makefiles: remove libkeyutils from every binary except two
 e9eb641 remove gf-complete / jerasure sub modules
-fdcf3eb (origin/wip-7777) ReplicatedPG::do_op: delay if snapdir is unreadable
-7f7a998 (origin/wip-7708) mds/Locker: fix null deref on cap import
+fdcf3eb ReplicatedPG::do_op: delay if snapdir is unreadable
+7f7a998 mds/Locker: fix null deref on cap import
 4221e0d build: add gf-complete/jerasure to gitmodule_mirrors
-25d04fb (origin/wip-pq) osd: dump priority queue state on dequeue at level 30
+25d04fb osd: dump priority queue state on dequeue at level 30
 ff11965 osd: fix OpWQ dump locking
 4a3464c common/PrioritizedQueue: include first item's cost in dump
 de576d5 common/PrioritizedQueue: constify a few things
 8bd8944 common/PrioritizedQueue: fix remove_by_class() corner case
 f1e3bc9 doc/dev/cache-pool: describe the tiering agent
-d728426 (origin/wip-7733) PG::start_peering_interval: always send_notify if !primary
-b4420ff (origin/wip-7755) PG::find_best_info: fix log_tail component
+d728426 PG::start_peering_interval: always send_notify if !primary
+b4420ff PG::find_best_info: fix log_tail component
 1ddae7e erasure-code: jerasure and gf-complete moved to ceph namespace
-01a0f2d (origin/wip-6806) mon: OSDMonitor: don't rely on client-side validation for command arguments
+01a0f2d mon: OSDMonitor: don't rely on client-side validation for command arguments
 074c880 mon: Monitor: handle invalid 'quorum' command argument
 652056e mon: Properly handle errors from 'cmd_getval()' when needed
 543c642 erasure-code: disable SSE extensions
@@ -6087,19 +9484,19 @@ f8aa1ed mon: set the profile and ruleset defaults early
 063de51 osd: obsolete pg_pool_t properties with erasure_code_profile
 04d2fd1 mon: add the erasure-code-profile {set,get,rm,ls} MonCommand
 fa1d957 mon/Paxos: commit only after entire quorum acks
-aed0744 (origin/wip-7738) os/FileJournal: return errors on make_writeable() if reopen fails
-c31f38c (origin/wip-7728) ReplicatedPG: if !obc->obs->exists, lock snapdir obc if exists
+aed0744 os/FileJournal: return errors on make_writeable() if reopen fails
+c31f38c ReplicatedPG: if !obc->obs->exists, lock snapdir obc if exists
 9ee1084 ReplicatedPG: disallow trans which atomically create and remove an object
 f094400 Add file to store mirror location of module's.
-a9f8a9e (origin/wip-cli-err) ceph.in: Better error on bad arg to 'tell'
+a9f8a9e ceph.in: Better error on bad arg to 'tell'
 1a451f2 mon: functional tests teardown must be run on error
 514b5e3 mon: add helper to selection functions implementing tests
 e4b4b1f osd: OSDMap::erasure_code_profile accessors
 c4f8f26 osd: add OSDMap::erasure_code_profile
-2b9bd26 (origin/wip-tmap2omap-warn) mds: avoid spurious TMAP2OMAP warning
-e39c213 (origin/wip-7611) ceph.in: do not allow using 'tell' with interactive mode
-b2af217 (origin/wip-remotebit-dump) mds: Fix remotebit::dump for less common types
-9d77ce1 (origin/wip-rados-outfile) tools/rados: Allow binary file output of omap data
+2b9bd26 mds: avoid spurious TMAP2OMAP warning
+e39c213 ceph.in: do not allow using 'tell' with interactive mode
+b2af217 mds: Fix remotebit::dump for less common types
+9d77ce1 tools/rados: Allow binary file output of omap data
 be31998 erasure-code: make libcommon include liberasure-code
 e6d9066 erasure-code: add ostream to factory()
 de62572 erasure-code: remove dependency to the global context
@@ -6116,53 +9513,53 @@ cf25946 mon: create-erasure uses crush_ruleset_create_erasure
 8e07dbc mon: crush_ruleset_create_erasure error codes
 402e646 mon: crush_ruleset_create_erasure reduce verbosity
 6a16eac mon: create crush_ruleset_create_erasure helper
-1ae3314 (origin/wip-7684) client: force getattr when inline data is missing
-04de781 (origin/wip-7740) OSD::handle_pg_query: on dne pg, send lb=hobject_t() if deleting
+1ae3314 client: force getattr when inline data is missing
+04de781 OSD::handle_pg_query: on dne pg, send lb=hobject_t() if deleting
 9e21840 mds: include inline data in lock messages
 5b3422a mds: fix corner case of pushing inline data
-b2fcc6e (origin/wip-s3pm-wusui) Remove code duplication from s3 tests.
-979e8b4 (origin/wip-7732) PG::build_might_have_unfound: check pg_whomai, not osd whoami
-0f75c54 (origin/wip-7712) osd/ReplicatedPG: fix enqueue_front race
+b2fcc6e Remove code duplication from s3 tests.
+979e8b4 PG::build_might_have_unfound: check pg_whomai, not osd whoami
+0f75c54 osd/ReplicatedPG: fix enqueue_front race
 ef1d7c9 rados.py: Fixed docstring syntax warnings.
 02b746d doc: Fixed release notes syntax warnings.
 9cd67bb doc: Fixed hyperlink.
-599a8d7 (origin/wip-7698) test: Add ceph_filestore_dump.sh to test ceph_filestore_dump
+599a8d7 test: Add ceph_filestore_dump.sh to test ceph_filestore_dump
 31a6679 tools: Fix ceph_filestore_dump to fully operate on EC pools
 f633a03 tools: Cleanups in ceph_filestore_dump
-bbc0d6d (origin/wip-7718) PG::issue_repop: only adjust peer_info last_updates if not temp
+bbc0d6d PG::issue_repop: only adjust peer_info last_updates if not temp
 35f1b04 RGWListBucketMultiparts: init max_uploads/default_max with 0
 ab91b80 mon/OSDMonitor: fix fall-thru case
-fc85075 (origin/wip-cache-last-epoch-clean) mon/PGMap: only recalculate min_last_epoch_clean if incremental touches old min
+fc85075 mon/PGMap: only recalculate min_last_epoch_clean if incremental touches old min
 208959a mon/PGMap: cache min_last_epoch_clean
 6f9db6c unittest_mon_pgmap: fix warnings
 e941fef unittest_ceph_argparse: fix warnings
 f2c6ff3 tools/: make filenames consistent
-e330097 (origin/wip-7719) PG: clear want_pg_temp in clear_primary_state only if primary
+e330097 PG: clear want_pg_temp in clear_primary_state only if primary
 da159f8 doc/release-notes: note that radosgw's should be upgraded together
-f1bd59f (origin/wip-7692) mon: only do timecheck with known monmap
+f1bd59f mon: only do timecheck with known monmap
 fba88de ceph-mon: be a bit more verbose on error
-62af51c (origin/wip-7696) PG::activate: handle peer contigious with primary, but not auth_log
-70d87df (origin/wip-7489) ceph_mon: output error message if unable to bind.
+62af51c PG::activate: handle peer contigious with primary, but not auth_log
+70d87df ceph_mon: output error message if unable to bind.
 5ad9c16 ceph_mon: all output after initial fork go to dout/derr
 c95234a ceph_mon: split postfork() in two and finish postfork just before daemonize
 ceac36b doc/release-notes: 0.78 draft nodes; firefly draft notes
-87c911c (origin/wip-7709) osd/ReplicatedPG: release op locks on on commit+applied
+87c911c osd/ReplicatedPG: release op locks on on commit+applied
 c5b557e qa/workunits: misc -> fs/misc
 8c8b3e9 PGLog: remove unused variable
 282497e osd: add tunables for cache_min_{flush,evict}_age
 fa6887b osd: set default cache_target_{dirty,full}_ratios based on configurable
 a72b636 mds: fix empty fs rstat
-f2124c5 (origin/wip-7705) ceph_test_rados: wait for commit, not ack
+f2124c5 ceph_test_rados: wait for commit, not ack
 dd946e0 MOSDOp: include reassert_version in print
-73f6b4c (origin/wip-7706) config_opts: raise ms_pq_max_tokens_per_priority to 16MB
+73f6b4c config_opts: raise ms_pq_max_tokens_per_priority to 16MB
 2722a0a PrioritizedQueue: cap costs at max_tokens_per_subqueue
-88cb1c4 (origin/wip-3863) tools: Create ceph-client-debug
+88cb1c4 tools: Create ceph-client-debug
 d2ce029 libcephfs: add ceph_ll_lookup_inode
 724f30e client: Add lookup_parent & lookup_name
 f1f2826 mds: Return EINVAL for parent lookup on root ino
-a19ef01 (origin/wip-7703) rgw: manifest hold the actual bucket used for tail objects
+a19ef01 rgw: manifest hold the actual bucket used for tail objects
 33b889f rbd-fuse: fix signed/unsigned warning
-c973e46 (origin/wip-coverity-20140312) mds/Mutation.h: init export_dir with NULL in ctor
+c973e46 mds/Mutation.h: init export_dir with NULL in ctor
 fd383a9 mds/Migrator.h: init some members of import_state_t in ctor
 5a53aa8 mds/Migrator.h: init some export_state_t members in ctor
 b10692f CInode::encode_cap_message: add assert for cap
@@ -6176,33 +9573,33 @@ fe8a715 Make the configuration "filestore merge threshold" can be negative which
 94acb6b test_librbd.cc: add missing va_end() to test_ls_pp
 fb4ca94 mailmap: Danny Al-Gaaf name normalization
 fb8ff44 doc/release-notes: note that WATCH can get ENOENT now
-2cbad1b (origin/wip-7671) test/librados/watch_notify: create foo before watching
+2cbad1b test/librados/watch_notify: create foo before watching
 9d549eb test/system/st_rados_watch: expect ENOENT for watch on non-existent object
 b23a141 RGWListBucketMultiparts: init max_uploads/default_max with 0
 4057a30 AbstractWrite: initialize m_snap_seq with 0
-90a2654 (origin/wip-7682) ReplicatedPG::already_(complete|ack) should skip temp object ops
+90a2654 ReplicatedPG::already_(complete|ack) should skip temp object ops
 72bc1ef AdminSocket: initialize m_getdescs_hook in the constructor
 f7529cf RGWPutCORS_ObjStore_S3::get_params: check data before dereference
 5334d5c mds/Server.cc: check straydn before dereference
 047287a doc: Add "nearest power of two" to PG rule-of-thumb
 7bb0359 OSDMonitor::prepare_pool_op: add missing break in case
-a4a91cc (origin/wip-7649) PG: do not wait for flushed before activation
+a4a91cc PG: do not wait for flushed before activation
 a576eb3 PG: do not serve requests until replicas have activated
-980d2b5 (origin/wip-7681) ECBackend: when removing the temp obj, use the right shard
+980d2b5 ECBackend: when removing the temp obj, use the right shard
 dc00661 osd_types: print lb if incomplete even if empty
-8e76e4e (origin/wip-7695) build-doc: fix checks for required commands for non-debian
-dc82cd7 (origin/wip-7641) debian: make ceph depend on ceph-common >= 0.67
-d573710 (origin/wip-7687) rgw: don't overwrite bucket entry data when syncing user stats
+8e76e4e build-doc: fix checks for required commands for non-debian
+dc82cd7 debian: make ceph depend on ceph-common >= 0.67
+d573710 rgw: don't overwrite bucket entry data when syncing user stats
 2fbd772 qa/workunits/cephtool/test.sh: fix thrash (more)
 64a6b26 doc/release-notes: fill in some firefly history
 f4196cc doc/release-notes: firefly draft release notes
 24774a8 osd/ReplicatedPG: fix typo
 3d5a4b5 ReplicatedPG: CEPH_OSD_OP_WATCH return -ENOENT if !obs.exists
-00bf3b5 (origin/wip-7674) osd/ReplicatedPG: do not include hit_set objects in full calculation
-1836b6c (origin/wip-7592-final) osd: hit_set_persist(): Verify all objects aren't degraded
-42ef8ba (origin/wip-6889) rgw: don't log system requests in usage log
-406ff0b (origin/wip-is_down-boolean) OSDMap: clarify is_{down,in}() definitions
-fe4ad29 (origin/wip-rbd-fuse-enumerate) rbd-fuse: fix enumerate_images() image names buffer size issue
+00bf3b5 osd/ReplicatedPG: do not include hit_set objects in full calculation
+1836b6c osd: hit_set_persist(): Verify all objects aren't degraded
+42ef8ba rgw: don't log system requests in usage log
+406ff0b OSDMap: clarify is_{down,in}() definitions
+fe4ad29 rbd-fuse: fix enumerate_images() image names buffer size issue
 377c919 rados_connect not thread-safe when using nss (documentation)
 861e62a Update Python hashbang to respect environment
 fa30eb5 rados.py: fix typo in Ioctx::read() docstring
@@ -6210,19 +9607,19 @@ fa30eb5 rados.py: fix typo in Ioctx::read() docstring
 745f72c Fixed get_status() to find client.radosgw fields inside of ps output.
 880bc3a Fix get_status() to find client.rados text inside of ps command results.
 fbd9c15 osd: Remove unused checkpoint code
-d3e3df7 (origin/wip-flock) mds: fix owner check of file lock
-8a72de3 (origin/wip-7663) ReplicatedPG: adjust pending_attrs correctly in copy_from
+d3e3df7 mds: fix owner check of file lock
+8a72de3 ReplicatedPG: adjust pending_attrs correctly in copy_from
 6669e4d ReplicatedPG: _delete_head should adjust pending_attrs
 60c1b9a ReplicatedPG: use pending_attrs in rollback
 d7c4d6a doc: Added init caps to Signed-off-by: and Fixes:
-2b28407 (origin/wip-doc-prereq) doc: update build prerequisites
+2b28407 doc: update build prerequisites
 7ac98b2 doc: Updated sign-off by to sign-off-by
-2e420f9 (origin/wip-build-doc) doc: rm duplicate info from release-process
+2e420f9 doc: rm duplicate info from release-process
 52a2d69 doc: fix reST formatting in release-process
 bb73711 doc: update release-process for packages
 dd5a378 doc: misc updates for release-process
-e272f74 (origin/wip-coverity-20140409) rgw_rados.h: RGWRegion: initialize cct and store with NULL
-8de8819 (origin/wip-7672) PG::choose_acting: filter CRUSH_ITEM_NONE out of have
+e272f74 rgw_rados.h: RGWRegion: initialize cct and store with NULL
+8de8819 PG::choose_acting: filter CRUSH_ITEM_NONE out of have
 c7c9ae3 RadosModel.h: initialize CopyFromOp::snap
 d7af7eb RadosModel.h: TestOp: initialize comp with NULL
 9df9f4c ReplicatedPG.h: CopyResults() initialize final_tx with NULL
@@ -6240,7 +9637,7 @@ d61fcfa ceph-filestore-dump.cc: pass OSDSuperblock by reference
 62763d5 watch_info_t: pass parameter by reference
 246564b pg_t::get_split_bits: add assert to silence coverity
 aba5b7c rbdmap: bugfix upstart script
-2e342d6 (origin/wip-libxfs-flag) FileStore: support compiling without libxfs
+2e342d6 FileStore: support compiling without libxfs
 2626604 erasure-code: LARGEST_VECTOR_WORDSIZE is always 16
 2beb2a5 erasure-code: upgrade to jerasure version 2
 b74115a autotools: set SIMD_FLAGS with SSE flags
@@ -6252,8 +9649,8 @@ eb6ffdb erasure-code: remove jerasure version 1
 7884780 osd_types.cc: add missing break in operator<< case handling
 83731a7 ReplicatedPG::finish_ctx: clear object_info if !obs.exists
 a7afa14 config.cc: add debug_ prefix to subsys logging levels
-55c23a1 (origin/wip-hint-tests) qa: add script for testing rados allocation hint
-54ffdcc (origin/wip-da-fix-doc) get-involved.rst: update information
+55c23a1 qa: add script for testing rados allocation hint
+54ffdcc get-involved.rst: update information
 d1a888e swift/containerops.rst: fix some typos
 93b95a2 radosgw/troubleshooting.rst: s/ceph-osd/OSD/
 2223a37 radosgw/config-ref.rst: fix typo
@@ -6269,73 +9666,73 @@ cf9f017 config.rst: fix typo
 f581bda rados: add set-alloc-hint command
 a4cbb19 rados/operations/control.rst: fix typo
 9bf39e2 Broke down sysinfo's format into a histogram with a value and count so that we just see how many of each version/distro/kernel/os/arch/cpu/etc are running
-2591668 (origin/wip-div) PGLog::proc_replica_log: select divergent log entries correctly
+2591668 PGLog::proc_replica_log: select divergent log entries correctly
 3befb93 PGLog::_merge_object_divergent_entries: handle missing.have == prior_version
 7c77ff6 TestPGLog: add a test case verifying case where we have the prior_version
 e830f9f TestPGLog: check on last_update in run_test_case
 4d6a74d TestPGLog::proc_replica_log: call doesn't adjust olog
-71b4474 (origin/wip-nfs-export) client: fix Client::getcwd()
+71b4474 client: fix Client::getcwd()
 617ce67 mds: introduce LOOKUPNAME MDS request
 1c8c618 qa/workunits/cephtool/test.sh: fix 'osd thrash' test
-b62f9f0 (origin/wip-7642) mon/OSDMonitor: feature feature check bit arithmetic
-8d52fb7 (origin/wip-7652) mon/PGMap: send pg create messages to primary, not acting[0]
+b62f9f0 mon/OSDMonitor: feature feature check bit arithmetic
+8d52fb7 mon/PGMap: send pg create messages to primary, not acting[0]
 c8b34f1 mon/PGMonitor: improve debugging on PGMap updates slightly
 819cce2 mon/OSDMonitor: make osdmap feature checks non-racy
 b9bcc15 mon/OSDMonitor: prevent set primary-affinity unless all OSDs support it
-5f7efec (origin/wip-7650) tools/rados/rados.cc: use write_full for sync_write for ec pools
-38fd666 (origin/wip-7210) qa: workunits/mon/rbd_snaps_ops.sh: ENOTSUP on snap rm from copied pool
+5f7efec tools/rados/rados.cc: use write_full for sync_write for ec pools
+38fd666 qa: workunits/mon/rbd_snaps_ops.sh: ENOTSUP on snap rm from copied pool
 c13e1b7 mon: OSDMonitor: don't remove unamanaged snaps from not-unmanaged pools
-135c27e (origin/wip-7575) osd: Add hit_set_flushing to track current flushes and prevent races
+135c27e osd: Add hit_set_flushing to track current flushes and prevent races
 3dd09e3 Removed nw_info from sysinfo
 09a317f Made crush_types to be a map of type to count, so we can tell how many racks/rows/hosts/etc are there
 e53aed2 SubmittingPatches: clarify "github fork" in preference to "clone"
 c9eaa65 Changed Availability section of ceph-mds.rst to reference ceph-mds, not ceph-mon. Signed-off-by: James Ryan Cresawn <jrcresawn at gmail.com>
-09668a4 (origin/wip-7637) osd: fix agent thread shutdown
+09668a4 osd: fix agent thread shutdown
 7411c3c logrotate: copy/paste daemon list from *-all-starter.conf
-b6872b2 (origin/wip-7638) ReplicatedPG::trim_object: use old_snaps for rollback
-b5b67d1 (origin/wip-7634) ReplicatedPG: use hobject_t for snapset_contexts map
-b436930 (origin/wip-firefly-misc) qa/workunits/rest/test.py: do not test 'osd thrash'
+b6872b2 ReplicatedPG::trim_object: use old_snaps for rollback
+b5b67d1 ReplicatedPG: use hobject_t for snapset_contexts map
+b436930 qa/workunits/rest/test.py: do not test 'osd thrash'
 237f0fb os/ObjectStore: dump COLL_MOVE_RENAME
-f888ab4 (origin/wip-7632) ReplicatedPG: consistently use ctx->at_version.version for stashed object
-eca7e63 (origin/wip-7393) ReplicatedPG: clean up num_dirty adjustments
-173116f (origin/wip-scrub-lock) osd: only register for scrub if we are the PG primary
+f888ab4 ReplicatedPG: consistently use ctx->at_version.version for stashed object
+eca7e63 ReplicatedPG: clean up num_dirty adjustments
+173116f osd: only register for scrub if we are the PG primary
 d379b1f osd: bracket role changes with scrub reg/unreg
 c8c2f54 Client: fix build issue (lost semicolon)
-d171418 (origin/wip-7610) obj_bencher: allocate contentsChars to object_size, not op_size
-bafd76a (origin/wip-7624) ReplicatedPG: ensure clones are readable after find_object_context
+d171418 obj_bencher: allocate contentsChars to object_size, not op_size
+bafd76a ReplicatedPG: ensure clones are readable after find_object_context
 90f5a68 1. Removed name of the pool from pool_metadata 2. Included pool type in pool_metadata 3. Renamed rep_size attribute to size in pool_metadata
 da97fee 1. simplified the 'bytes' info to just be bytes 2. prefix all the members of the components_info with 'num_'
-2adc534 (origin/wip-pool-delete) mon/OSDMonitor: fix pool deletion checks, races
-d9e8806 (origin/wip-7618) ReplicatedPG::wait_for_degraded_object: only recover if found
+2adc534 mon/OSDMonitor: fix pool deletion checks, races
+d9e8806 ReplicatedPG::wait_for_degraded_object: only recover if found
 a222e7a ReplicatedPG::recover_replicas: do not assume that missing objects are unfound
 3f59f02 Revert "c_read_operations.cc: fix resource leak"
-a71ddb0 (origin/wip-7562) mon: make quorum list (by name) be in quorum order
-240446e (origin/wip-7487) test: merge unittest_crushwrapper and unittest_crush_wrapper
+a71ddb0 mon: make quorum list (by name) be in quorum order
+240446e test: merge unittest_crushwrapper and unittest_crush_wrapper
 772968e mon/OSDMonitor: disallow crush buckets of type 0
-8b3934f (origin/wip-7616) PGBackend::rollback_stash: remove the correct shard
+8b3934f PGBackend::rollback_stash: remove the correct shard
 1ddec86 FileStore::_collection_move_rename: propogate EEXIST
 ca12e0d qa/workunits/mon/crush_ops: use expect_false
-e016e83 (origin/wip-7608) test: Fix tiering test cases to use ---force-nonempty
-0592368 (origin/wip-cache-warn-full) mon: warn when pool nears target max objects/bytes
-f6edcee (origin/wip-pgmap-stat) mon/PGMap: return empty stats if pool is not in sum
-640ff98 (origin/wip-vstart-paths) test: Use non-default out/ dev/ paths in vstart
-1685c6f (origin/wip-crush-json) crush: revise JSON format for 'item' type
+e016e83 test: Fix tiering test cases to use ---force-nonempty
+0592368 mon: warn when pool nears target max objects/bytes
+f6edcee mon/PGMap: return empty stats if pool is not in sum
+640ff98 test: Use non-default out/ dev/ paths in vstart
+1685c6f crush: revise JSON format for 'item' type
 d4950a1 mailmap: Danny Al-Gaaf affiliation
 0eac1ba mailmap: Bjørnar Ness affiliation
 fdb644a mailmap: Ron Allred affiliation
 a85d0ef mailmap: Steve Stock affiliation
 076bec8 mailmap: Christopher O'Connell affiliation
-4cb1cbf (origin/wip-7607) ReplicatedPG::fill_in_copy_get: fix omap loop conditions
+4cb1cbf ReplicatedPG::fill_in_copy_get: fix omap loop conditions
 11393ab ReplicatedPG::fill_in_copy_get: remove extraneous if statement
 8fdfece ReplicatedPG::fill_in_copy_get: fix early return bug
-364fed8 (origin/wip-debian-files) packaging: use wildcard for test files in Debian
-65f3354 (origin/wip-7293) Make symlink  of librbd to qemu's folder so it can detect it.
-d0b1094 (origin/wip-7447) ECBackend,ReplicatedPG: delete temp if we didn't get the transaction
+364fed8 packaging: use wildcard for test files in Debian
+65f3354 Make symlink  of librbd to qemu's folder so it can detect it.
+d0b1094 ECBackend,ReplicatedPG: delete temp if we didn't get the transaction
 f2a4eec PGBackend/ECBackend: handle temp objects correctly
 308ea1b ECMsgTypes: fix constructor temp_added/temp_removed ordering to match users
 3e21996 ReplicatedPG::finish_ctx: use correct snapdir prior version in events
 a7057e1 doc: Improve CloudStack RBD documentation
-31eb533 (origin/wip-da-SCA-firefly-20140304) librados.cc: remove unused variable cpp_category
+31eb533 librados.cc: remove unused variable cpp_category
 c2cc178 ECBackend.cc: use !empty() instead of size()
 35ae7ed ceph_argparse.cc: prefer prefix ++operator for non-trivial iterator
 39fc1db KeyValueStore: use !empty() instead of size()
@@ -6350,15 +9747,15 @@ c2f3f2b close file descriptor in error case
 724ad02 doc: update the operator create pool reference
 7461410 doc: erasure coded pool developer documentation
 6d323c0 mailmap: Mike Lundy affiliation
-49e54ab (origin/wip-cache-add) mon/OSDMonitor: fix race in 'osd tier remove ...'
+49e54ab mon/OSDMonitor: fix race in 'osd tier remove ...'
 241b9e8 mon/OSDMonitor: fix some whitespace
 c029c2f mon/OSDMonitor: add 'osd tier add-cache <pool> <size>' command
 62e0eb7 mon/OSDMonitor: handle 'osd tier add ...' race/corner case
 0e5fd0e osd: make default bloom hit set fpp configurable
 eddf7b6 osd/ReplicatedPG: fix agent division by zero
-08efb45 (origin/wip-tier-add) OSDMonitor: do not add non-empty tier pool unless forced
-12909bb (origin/wip-2288) mds: check projected xattr when handling setxattr
-20fe162 (origin/wip-7563) TestPGLog: tests for proc_replica_log/merge_log equivalence
+08efb45 OSDMonitor: do not add non-empty tier pool unless forced
+12909bb mds: check projected xattr when handling setxattr
+20fe162 TestPGLog: tests for proc_replica_log/merge_log equivalence
 9a64947 TestPGLog::proc_replica_log: adjust wonky test
 6b6065a TestPGLog::proc_replica_log: adjust to corrected proc_replica_log behavior
 97f3596 TestPGLog::proc_replica_log: add prior_version to some entries
@@ -6370,9 +9767,9 @@ c99b7e1 PG,PGLog: replace _merge_old_entry with _merge_object_divergent_entries
 ff329ac TestPGLog:rewind_divergent_log: set prior_version for delete
 9e43dd6 TestPGLog: ignore merge_old_entry return value
 3cc9e22 TestPGLog: not worth maintaining tests of assert behavior
-a234053 (origin/wip-osd-verbosity) OSD,config_opts: log osd state changes at level 0 instead
+a234053 OSD,config_opts: log osd state changes at level 0 instead
 68890b2 osd: be a bit more verbose on startup
-4e06dfd (origin/wip-messenger-shutdown) msg: Make SimpleMessenger shutdown safer
+4e06dfd msg: Make SimpleMessenger shutdown safer
 371a80c librbd: prefix rbd writes with CEPH_OSD_OP_SETALLOCHINT osd op
 8e49bc3 FileStore: add option to cap alloc hint size
 1f5b796 FileStore: introduce XfsFileStoreBackend class
@@ -6386,8 +9783,8 @@ d00a927 Revert "librbd: remove limit on number of objects in the cache"
 f3d6491 Following changes are made 1. Increased the String length for distro, version and os_desc columns in osds_info table 2. Corrected version information extraction in client/ceph-brag 3. Removed the version_id json entry when version list returned for UUID 4. Updated the README to reflect point 3
 3cc8b27 Modifed the String variables in db.py to be of fixed length to support databases which doesn't have VARCHAR support
 30a5bdb Added an instruction in 'How to deploy' field in README.md
-aca6ac3 (origin/wip-7248) qa: workunits: cephtool: test 'osd bench' limits
-09099c9 (origin/wip-osd-status) osd: 'status' admin socket command
+aca6ac3 qa: workunits: cephtool: test 'osd bench' limits
+09099c9 osd: 'status' admin socket command
 25a9bd3 osd: OSD: limit the value of 'size' and 'count' on 'osd bench'
 ef25135 erasure-code: test rados put and get
 0b612d1 mon: prepend current directory to PATH for tests
@@ -6399,14 +9796,14 @@ e782051 mailmap: Andrey Kuznetsov affiliation
 7b6d417 mailmap: Wang, Yaguang affiliation
 855edc6 Fix typo ceph-disk
 43b7b0b mailmap: The Linux Box affiliations
-62fd382 (origin/wip-7539) osd_types,PG: trim mod_desc for log entries to min size
+62fd382 osd_types,PG: trim mod_desc for log entries to min size
 d4118e1 MOSDECSubOpWrite: drop transaction, log_entries in clear_buffers
 718cda6 TrackedOp: clear_payload as well in unregister_inflight_op
 59ff572 OpTracker: clarify that unregister_inflight_op is only called if enabled
 fc9b8ef MOSDOp: drop ops vector in clear_data()
 1ea59f6 ReplicatedPG: delete mark_all_unfound_lost transactions after completion
-e19dffb (origin/wip-7572) mon: fix 'pg dump' JSON output
-1a4657a (origin/wip-fix-coverity-20140228) req_state: fix uninitialized bool var
+e19dffb mon: fix 'pg dump' JSON output
+1a4657a req_state: fix uninitialized bool var
 605e645 Objecter::recalc_op_target: fix uninitialized scalar variable
 754a368 PGMonitor: fix uninitialized scalar variable
 1747c58 MDCache: fix potential null pointer deref
@@ -6417,8 +9814,8 @@ ad9b6d2 c_write_operations.cc: fix some ioctx resource leaks
 e8533ee ReplicatedBackend: check result of dynamic_cast to fix null pointer deref
 8d6b25a mds: use "lookup-by-ino" helper to handle LOOKUPPARENT request
 dc20c7d OSDMonitor: enable getting hit set parameters We would like to get the hit set parameters: hit_set_type | hit_set_period | hit_set_count | hit_set_fpp via OSDMonitor
-5b88856 (origin/wip-s3radoscheck-wusui) Use pgrep radosgw to determine if rados gateway is running. Fixes: 7528 Signed-off-by: Warren Usui <warren.usui at inktank.com>
-0ed63fd (origin/wip-libcephfs-fixes) client: fix Client::trim_caps()
+5b88856 Use pgrep radosgw to determine if rados gateway is running. Fixes: 7528 Signed-off-by: Warren Usui <warren.usui at inktank.com>
+0ed63fd client: fix Client::trim_caps()
 cdbe413 client: update dirfragtree/fragmap according to mds request reply
 214977a client: choose hash function according to dir layout
 23de48b client: check inode's cap when looking up dentry
@@ -6430,46 +9827,46 @@ a7b7c31 client: use ceph_seq_cmp() to compare cap seq/mseq
 0bf5f86 store_test.cc: fix unchecked return value
 7eefe85 histogram.h: fix potential div by zero
 500206d ReplicatedPG.cc: fix ressource leak, delete cb
-fbb1ec8 (origin/wip-7542) ECBackend: don't leak transactions
+fbb1ec8 ECBackend: don't leak transactions
 b0d4264 OSD::handle_misdirected_op: handle ops to the wrong shard
-123ff9e (origin/wip-7458) osd: stray pg ref on shutdown
-448fc0e (origin/wip-pg-msg) mon/OSDMonitor: missing space in string
+123ff9e osd: stray pg ref on shutdown
+448fc0e mon/OSDMonitor: missing space in string
 799cde0 Fix python-requests package dependencies.
-bfad17b (origin/wip-librados-end-iterator) librados: fix ObjectIterator::operator= for the end iterator
-a850a38 (origin/wip-doc-cache-pool) doc/dev/cache-pool: fix notes
-f0241c8 (origin/wip-cache-pool) mon/OSDMonitor: make default false-positive-probability 5%
+bfad17b librados: fix ObjectIterator::operator= for the end iterator
+a850a38 doc/dev/cache-pool: fix notes
+f0241c8 mon/OSDMonitor: make default false-positive-probability 5%
 30aa2d6 client: clear migrate seq when MDS restarts
 c1e40c6 client: fix race between cap issue and revoke
 5c55eb1 client: check mds_wanted for imported cap
 9476f84 client: allow revoking duplicated caps issued by non-auth MDS
 6797d30 client: call handle_cap_grant() for cap import message
 154efb1 client: don't update i_max_size when handle reply from non-auth MDS
-9a0ef6a (origin/wip-requests-dependencies) Fix python-requests package dependencies.
-c07a758 (origin/wip-mds-dumper) mds: Add dump-journal-entries
+9a0ef6a Fix python-requests package dependencies.
+c07a758 mds: Add dump-journal-entries
 7a985df mds: Create MDSUtility as base for Dumper/Resetter
 410c507 mds: Fix Dumper shutdown
 7ba3200 mds: fix nested_anchors update during journal replay
-1040d1b (origin/wip-primary-temp-fix) osd/OSDMap: respect temp primary without temp acting
-8020dcf (origin/wip-7375-wusui) Fixed get_status() to find client.radosgw fields inside of ps output. Fixes: 7375 Signed-off-by: Warren Usui <warren.usui at inktank.com>
-8200b8a (origin/wip-7374-wusui) Fix get_status() to find client.rados text inside of ps command results. Added port (fixed value for right now in teuthology) to hostname. Fixes: 7374 Signed-off-by: Warren Usui <warren.usui at inktank.com>
-be2748c (origin/wip-7537) OSDMap::_pg_to_up_acting_osds: use _acting_primary unless acting is empty
+1040d1b osd/OSDMap: respect temp primary without temp acting
+8020dcf Fixed get_status() to find client.radosgw fields inside of ps output. Fixes: 7375 Signed-off-by: Warren Usui <warren.usui at inktank.com>
+8200b8a Fix get_status() to find client.rados text inside of ps command results. Added port (fixed value for right now in teuthology) to hostname. Fixes: 7374 Signed-off-by: Warren Usui <warren.usui at inktank.com>
+be2748c OSDMap::_pg_to_up_acting_osds: use _acting_primary unless acting is empty
 dc079eb OSDMonitor: when thrashing, only generate valid temp pg mappings
 891343a rados.py: add aio_remove
-9f7f4ed (origin/wip-dz-scrub-fixes) Revert "osd/PG: fix assert when deep repair finds no errors"
+9f7f4ed Revert "osd/PG: fix assert when deep repair finds no errors"
 728e391 osd: Don't include primary's shard in repair result message
-3ee71a5 (origin/wip-mon-docs) doc: troubleshooting-mons.rst: better document how to troubleshoot mons
-69082a6 (origin/wip-7485) mds: add mds_kill_create_at for testing
+3ee71a5 doc: troubleshooting-mons.rst: better document how to troubleshoot mons
+69082a6 mds: add mds_kill_create_at for testing
 27968a7 ceph_test_objectstore: fix i386 build (again)
-14ea815 (origin/wip-osdmap-inc) mon/OSDMonitor: fix osdmap encode feature logic
-7357b6e (origin/wip-7517) PG: skip pg_whoami.osd, not pg_whoami.shard in scrub feature check
-696fa36 (origin/wip-7512) OSD::project_pg_history needs to account for acting_primary/up_primary
+14ea815 mon/OSDMonitor: fix osdmap encode feature logic
+7357b6e PG: skip pg_whoami.osd, not pg_whoami.shard in scrub feature check
+696fa36 OSD::project_pg_history needs to account for acting_primary/up_primary
 0442b45 Objecter/OSDMap: factor out primary_changed() into static OSDMap method
 d0359f7 PG: clarify same_primary_since updates regarding primary rank
-73c59a0 (origin/wip-last-epoch-clean) mon/PGMap: fix osd_epochs update
+73c59a0 mon/PGMap: fix osd_epochs update
 df3ba72 mon/PGMap: add unit test for min_last_epoch_clean
 8913ab4 ECBackend: skip canceled xattr reads as well
-0e376ee (origin/port/temp-failure-retry) compat: avoid unused warn with TEMP_FAILURE_RETRY
-23e538e (origin/wip-create-null) client: fix possible null dereference in create
+0e376ee compat: avoid unused warn with TEMP_FAILURE_RETRY
+23e538e client: fix possible null dereference in create
 bcc18d4 osd: increase default leveldb write buffer, cache size
 c4a3a86 ceph_test_objectstore: fix i386 build error
 b25c22a ceph_test_objectstore: fix signed/unsigned warning
@@ -6478,10 +9875,10 @@ cfafa0b README: add build dep
 1340b36 erasure-code: test that changing the pool size is not allowed
 d74f128 os/ObjectStore: document interface
 6d8cb22 unittest_ecbackend: fix signed/unsigned warnings
-fed655e (origin/wip-scrub-primary) mon/PGMonitor: fix primary osd check on deep-scrub
-7b0f3d9 (origin/wip-7513) PGLog::activate_not_complete: fix log.complete_to increment typo
-e3e3328 (origin/wip-repair) osd/PG: fix assert when deep repair finds no errors
-9da4d40 (origin/wip-ec-bugs) PGLog: test for missing using the correct shard value
+fed655e mon/PGMonitor: fix primary osd check on deep-scrub
+7b0f3d9 PGLog::activate_not_complete: fix log.complete_to increment typo
+e3e3328 osd/PG: fix assert when deep repair finds no errors
+9da4d40 PGLog: test for missing using the correct shard value
 ace65fe OSD: fix query for ec pgs
 93983c9 Objecter: make is_pg_changed static and const for old/newacting
 8b4cf1c OSDMap: make calc_pg_rank/role const for acting
@@ -6490,7 +9887,7 @@ b7de0fd ECTransaction: require hash_infos for deleted objects
 ab4c9a6 ECUtil: clear() should reset hashes to -1, not 0
 8593ad7 ReplicatedPG::cancel_pull: ECBackend might cancel_pull on a non-missing object
 9b85241 ECBackend::filter_read_op: clean up read state properly
-70d23b9 (origin/wip-subscribe) osd: fix off-by-one is boot subscription
+70d23b9 osd: fix off-by-one is boot subscription
 90ebdcc Rename keyvaluestore_check_backend to keyvaluestore_debug_check_backend
 a52d7cd Add KeyValueStore op thread options
 62c1631 Remove eio inject codes in KeyValueStore
@@ -6513,18 +9910,18 @@ c9fdcee Add test for omap interface
 2b9e893 Move perf counter and add op queue reserve throttle
 5eb7592 Fix deadlock caused by hold collection obj
 c278269 Added a note that the ceph-osd command requires the cluster option.
-0da25e6 (origin/wip-librados-object-iterator) librados: implement ObjectIterator copying and assignment
+0da25e6 librados: implement ObjectIterator copying and assignment
 d9bedb9 Added evaluation of cluster and fixed evaluation when setting initial weight on start in verbose mode
 8a0017a librados: don't destroy ObjListCtx when iterator reaches the end
 d8a7bef librados: remove extra advance from objects_begin(position)
-a9677e1 (origin/wip-6685-firefly) Revert "ReplicatedPG::recover_backfill: adjust last_backfill to HEAD if snapdir"
+a9677e1 Revert "ReplicatedPG::recover_backfill: adjust last_backfill to HEAD if snapdir"
 133ddb7 packaging: match all test files
 1c129df packaging: add missing file
-bd59db2 (origin/wip-corpus) ceph-object-corpus: prune some old releases
-76046cf (origin/port/updates) dencoder: check for radosgw build option
+bd59db2 ceph-object-corpus: prune some old releases
+76046cf dencoder: check for radosgw build option
 5373ee2 osd: use ceph scoped shared_ptr
 0ebb1f8 mon/MDSMonitor: Cope with failures during creation
-c6e6ced (origin/wip-copyget) PG::build_might_have_unfound: skip CRUSH_ITEM_NONE
+c6e6ced PG::build_might_have_unfound: skip CRUSH_ITEM_NONE
 1f30d1a ECBackend: deal with temp collection details in handle_sub_write
 c703a89 ReplicatedPG::on_global_recover: requeue degraded, then unreadable
 caf2edf rgw: minor cleanup
@@ -6533,7 +9930,7 @@ caf2edf rgw: minor cleanup
 2762ede mds/CDir: Remove a few lines of cruft
 a4d0eb1 mds: Remove some unused #defines
 5fb90a2 Fix generate keystone token from credentials
-541beb8 (origin/wip-mds-dencoder) mds: Add encode/decode/dump for use with dencoder
+541beb8 mds: Add encode/decode/dump for use with dencoder
 ebd4397 RadosModel: copyfrom should result in a dirty object
 68184d4 PG: fix scrub feature check
 67d1f36 ReplicatedPG: assert no omap for ec pool in _write_copy_chunk and skip
@@ -6544,13 +9941,13 @@ c337f63 ReplicatedPG: fix stat uses to use the correct shard
 aa110af OSDMap::_apply_primary_affinity: skip CRUSH_ITEM_NONE in non-default check
 b3bb9ef doc/release-notes: v0.77
 fa96de9 doc/release-notes: v0.67.7
-1bca9c5 (tag: v0.77) v0.77
+1bca9c5 v0.77
 40bdcb8 osd/,mon/: add (up|acting)_primary to pg_stat_t
-0427f61 (origin/wip-7099) rgw: fix swift range response
-2b3e3c8 (origin/wip-6830) rgw: fix etag in multipart complete
-859ed33 (origin/wip-6951) rgw: reset objv tracker on bucket recreation
-53de641 (origin/wip-6936) radosgw-admin: don't generate access key if user exists
-9e8882e (origin/wip-da-SCA-20140218) BackedUpObject::get_xattr()  pass function parameter by reference
+0427f61 rgw: fix swift range response
+2b3e3c8 rgw: fix etag in multipart complete
+859ed33 rgw: reset objv tracker on bucket recreation
+53de641 radosgw-admin: don't generate access key if user exists
+9e8882e BackedUpObject::get_xattr()  pass function parameter by reference
 53b3689 TestRados.cc: use !empty() instead of size()
 86b0879 ErasureCodeBench: prefer prefix ++operator for non-trivial iterator
 200ebb9 ceph_erasure_code.cc: prefer prefix ++operator for non-trivial iterator
@@ -6579,20 +9976,20 @@ ab45d44 BarrierContext: prefer prefix ++operator for non-trivial iterator
 4adcfa5 BarrierContext: reduce scope of 'done' variable
 19dad03 MonClient: pass function parameter by reference
 314116e PG: insert into stray set if !us_up() && !is_acting()
-a5c5d92 (origin/wip-agent) osd/ReplicatedPG: EOPNOTSUPP on hit_set_get on non-replicated pools
+a5c5d92 osd/ReplicatedPG: EOPNOTSUPP on hit_set_get on non-replicated pools
 a40cd50 osd/ReplicatedPG: load older HitSets into memory
 0af7375 osd/ReplicatedPG: strengthen agent_work assertions
 1d907c6 OSD: consider up/acting primary for new intervals
-afbd58e (origin/wip-7064) rgw: don't try to read bucket's policy if it doesn't exist
+afbd58e rgw: don't try to read bucket's policy if it doesn't exist
 3ed68eb rgw: return error if accessing object in non-existent bucket
 b0dcc79 radosgw-admin: gc list --include-all
 609f4c5 Throw a Timeout exception on timeout.
-1975441 (origin/wip-rgw-manifest-2) dencoder: fix for new rgw manifest code
+1975441 dencoder: fix for new rgw manifest code
 b3ce188 cls/rgw: fix debug output
 3fb6e25 test/rgw: manifest unitest
 da64bf7 rgw: scalable manifest object
-db88e7f (origin/wip-7463) ceph_rest_api.py: don't fail if no up OSDs found on startup
-7a019b3 (origin/wip-librados-ops-and-rvals) test/librados: remove unused lines and fix return code for cmp guards
+db88e7f ceph_rest_api.py: don't fail if no up OSDs found on startup
+7a019b3 test/librados: remove unused lines and fix return code for cmp guards
 bfbfcd6 ReplicatedPG: fix successful write condition
 18f08cd ReplicatedPG: clear osd op reply output for writes
 1d661ca ReplicatedPG: set individual osd op rvals
@@ -6624,14 +10021,14 @@ d3c6f17 test/librados: use connect_cluster_pp() instead of duplicating it
 9630f2f test/librados: create general test case classes
 6273ba4 test/librados: move test.cc into its own library
 abca34a Objecter: keep ObjectOperation rval pointers for writes
-4bee6ff (origin/wip-cache-perf) osd/ReplicatedPG: clean up agent skip debug output
+4bee6ff osd/ReplicatedPG: clean up agent skip debug output
 d1a185b osd: l_osd_agent_{wake,skip,flush,evict}
 dbec109 osd: l_osd_tier_[dirty|clean]
 150e87a osd: l_osd_tier_whiteout
 2d5371d osd: l_osd_tier_evict
 c45a477 osd: l_osd_tier_[try_]flush[_fail]
 4b0a0a1 osd: l_osd_copyfrom
-5e727bf (origin/wip-ecbackend-for-review, origin/wip-ecbackend-12) RadosModel: only output if missing header is actually a problem
+5e727bf RadosModel: only output if missing header is actually a problem
 11f288e Objecter: track primary explicitly to detect changing primaries
 bc31c4b ReplicatedPG: add some debugging if waiting_for_commit is non-empty
 b90584a osd/: instantiate the right backend based on pool
@@ -6698,12 +10095,12 @@ e0b0508 ReplicatedPG: reject unaligned writes on ec pools
 bd8fcd2 osd: improve whiteout debug prints
 63f5a79 osd/ReplicatedPG: make agent skip blocked obcs
 7997646 ReplicatedBackend: print char ack_type as int
-a71d829 (origin/wip-barrier) client: disable barrier support
+a71d829 client: disable barrier support
 747002c client: fix barrier interval
 d7457f7 client/barrier: drop unused active_commit_interval
 ce643e0 client: don't populate Client::barriers from read accesses
 a1dbc9c qa/workunits/suites/pjd: use test suite with acl tweak
-623748e (origin/wip-dirfrags) mds: bump the protocol version
+623748e mds: bump the protocol version
 2a19a1b osd/ReplicatedPG: allow is_degraded_object() to be called outside of backfil
 e5457df mds: open sessions for rejoin imported caps
 c54b3ce mds: fix slave rename rollback
@@ -6736,7 +10133,7 @@ b88034e mds: use discover_path to open remote inode
 3154ee8 mds: acquire locks required by exporting dir
 3fb408e mds: introduce nonlocking auth pin
 d0df841 mds: allow acquiring wrlock and remote wrlock at the same time
-1dae27c (origin/wip-7371) ReplicatedPG: return no data if read size is trimmed to zero
+1dae27c ReplicatedPG: return no data if read size is trimmed to zero
 774125c osd: set client incarnation for Objecter instance
 0dd1e07 osd: schedule agent from a priority queue
 a812982 osd/ReplicatedPG: simplify agent_choose_mode
@@ -6779,27 +10176,27 @@ fb4152a histogram: move to common, add unit tests
 8b68ad0 histogram: calculate bin position of a value in the histrogram
 d921d9b qa: do not create erasure pools yet
 4560078 common: ping existing admin socket before unlink
-c673f40 (origin/wip-primary-affinity) osd/OSDMap: include primary affinity in OSDMap::print
+c673f40 osd/OSDMap: include primary affinity in OSDMap::print
 87be7c1 osd/OSDMap: remove bad assert
 ba3eef8 mon/OSDMonitor: add 'mon osd allow primary affinity' bool option
 c360c60 ceph_psim: some futzing to test primary_affinity
 f825624 osd/OSDMap: add primary_affinity feature bit
 8ecec02 osd/OSDMap: apply primary_affinity to mapping
-871a5f0 (origin/wip-brag) ceph.spec: add ceph-brag
+871a5f0 ceph.spec: add ceph-brag
 4ea0a25 debian: add ceph-brag
 57d7018 ceph-brag: add Makefile
-cf4f702 (origin/wip-7212-sage-b) mon/Elector: bootstrap on timeout
+cf4f702 mon/Elector: bootstrap on timeout
 4595c44 mon: tell MonmapMonitor first about winning an election
 7bd2104 mon: only learn peer addresses when monmap == 0
-3c76b81 (origin/wip-7395) OSD: use the osdmap_subscribe helper
+3c76b81 OSD: use the osdmap_subscribe helper
 6db3ae8 OSD: create a helper for handling OSDMap subscriptions, and clean them up
 5b9c187 monc: new fsub_want_increment( function to make handling subscriptions easier
 7d398c2 doc/release-notes: v0.67.6
-0ed6a81 (origin/wip-osdmaptool-pool-fix) osdmaptool: add tests for --pool option
+0ed6a81 osdmaptool: add tests for --pool option
 f98435a osdmaptool: add --pool option for --test-map-pgs mode to usage()
 eedbf50 osdmaptool: fix --pool option for --test-map-object mode
 e44122f test: fix signed/unsigned warnings in TestCrushWrapper.cc
-64cedf6 (origin/wip-7394) OSD: disable the PGStatsAck timeout when we are reconnecting to a monitor
+64cedf6 OSD: disable the PGStatsAck timeout when we are reconnecting to a monitor
 794c86f monc: backoff the timeout period when reconnecting
 60da8ab monc: set "hunting" to true when we reopen the mon session
 1a8c434 monc: let users specify a callback when they reopen their monitor session
@@ -6824,8 +10221,8 @@ f77e8ea crush: add CrushWrapper::dump_rules() unit test
 07c494b mon: add --mon-advanced-debug-mode
 5ea156a mon: Monitor::send_reply gracefully handles no connection
 53e92f6 mon: split prepare_command in two
-d012119 (origin/wip-null-xattr) mds: remove xattr when null value is given to setxattr()
-6e6a333 (origin/wip-open-truncate) mds: properly replay dentry trace for open truncate.
+d012119 mds: remove xattr when null value is given to setxattr()
+6e6a333 mds: properly replay dentry trace for open truncate.
 9035227 doc/release-notes: do not downgrade from v0.67.6
 7533b3b doc/release-notes: note about dumpling xattr backport
 cc1e844 PendingReleaseNotes: note about cephfs backtrace updates
@@ -6834,25 +10231,25 @@ e107938 osd/OSDMap: fix _raw_to_up_osds for EC pools
 1cc8c25 mon/OSDMonitor: add 'osd primary-affinity ...' command
 cee9142 osd/OSDMap: add osd_primary_affinity fields, accessors, encoding
 af4c142 mon/OSDMonitor: fix legacy tunables warning
-a0b2c74 (origin/wip-moncap) osd/OSDCap: handle any whitespace (not just space)
+a0b2c74 osd/OSDCap: handle any whitespace (not just space)
 824dd52 mon/MonCap: handle any whitespace (not just space)
 5a6c950 packaging: do not package libdir/ceph recursively
 840e918 tests: fix packaging for s/filestore/objectstore/
 b64f1e3 tests: fix objectstore tests
-f34eb1b (origin/wip-update-backtrace) mds: force update backtraces for previously created FS
+f34eb1b mds: force update backtraces for previously created FS
 b5c10bf Fix bad dealloctor
 a4b3b78 correct one command line at building packages section
 33692a2 osdmaptool: fix cli test
 fed8396 tset_bufferlist: fix signed/unsigned comparison
-8ca3d95 (origin/wip-7346) rgw: multi object delete should be idempotent
-bf38bfb (origin/wip-7271) rgw: set bucket permissions, cors on swift creation
-2682b64 (origin/wip-doc-librados-intro) doc: Incorporated feed back from Loic and Dan.
+8ca3d95 rgw: multi object delete should be idempotent
+bf38bfb rgw: set bucket permissions, cors on swift creation
+2682b64 doc: Incorporated feed back from Loic and Dan.
 0da9621 doc: Adds additional terms for use with librados.
 e1a49e5 ObjectCacher: remove unused target/max setters
 0559d31 librbd: remove limit on number of objects in the cache
 db034ac ObjectCacher: use uint64_t for target and max values
 bf8cf2d ObjectCacher: remove max_bytes and max_ob arguments to trim()
-d136eb4 (origin/wip-crush) mon: allow firefly crush tunables to be selected
+d136eb4 mon: allow firefly crush tunables to be selected
 e3309bc doc/rados/operations/crush: describe new vary_r tunable
 525b2d2 crush: add firefly tunables baseline test
 37f840b crushtool: new cli tests for the vary-r tunable
@@ -6873,17 +10270,17 @@ f2f4eb5 Updated client code to complete PUT and DELETE requests
 d389e61 msg/Pipe: add option to restrict delay injection to specific msg type
 671a76d MonClient: add a timeout on commands for librados
 3e1f7bb Objecter: implement mon and osd operation timeouts
-9bcc42a (origin/wip-7334) alert the user about error messages from partx
+9bcc42a alert the user about error messages from partx
 42900ff use partx for red hat or centos instead of partprobe
-6926272 (origin/wip-fsetpipesz-fix) common/buffer: fix build breakage for CEPH_HAVE_SETPIPE_SZ
+6926272 common/buffer: fix build breakage for CEPH_HAVE_SETPIPE_SZ
 a5f479c configure: fix F_SETPIPE_SZ detection
 450163e configure: don't check for arpa/nameser_compat.h twice
 c1d2a99 libcephfs: fix documentation for ceph_readlink
 dbaf71a mailmap: Moritz Möller is with Bigpoint.com
 4cf2c72 Server changes to deploy in a production env 1. Added the wsgi entry point app.wsgi 2. Updated client code to mandate the update-metadata to have url to publish and unpublish 3. Updated the README to describe a bit about the server operations as well.
-575566b (origin/wip-7329) ceph_test_rados_api_tier: try harder to trigger the flush vs try-flush race
+575566b ceph_test_rados_api_tier: try harder to trigger the flush vs try-flush race
 a8e6c9f crush: add chooseleaf_vary_r tunable
-f17caba (origin/wip-7370) crush: allow crush rules to set (re)tries counts to 0
+f17caba crush: allow crush rules to set (re)tries counts to 0
 795704f crush: fix off-by-one errors in total_tries refactor
 ed32c40 crushtool: add cli test for off-by-one tries vs retries bug
 75c5525 qa/workunits/rest: use larger max_file_size
@@ -6917,7 +10314,7 @@ ff04629 client: add barrier and types
 7f6d755 client: whitespace
 7f9fdc3 osdc/Objecter: whitespace
 548ccd9 mds/Server: whitespace
-3a5fa87 (origin/wip-osdmap-primary) osd/OSDMap: populate *primary when pool dne
+3a5fa87 osd/OSDMap: populate *primary when pool dne
 5b7e2b2 rgw: initialize variable before call
 45d31f0 osdmaptool: tests for --test-map-pgs
 b98eaa5 osdmaptool: test --import/export-crush
@@ -6932,27 +10329,27 @@ e1e6c45 Completed model and controller code 1. GET, PUT and DELETE request are a
 42a64e1 Revert test case of "mon: OSDMonitor: do not allow changing an erasure-coded pool's size"
 0aa25b1 Some suggested changes, both errors and rewordings
 30fd0c5 Intial version of the server code. 1. Database modelling is done 2. PUT request is completed
-fc963ac (origin/wip-erasure-code-directory) erasure-code: move test files to a dedicated directory
+fc963ac erasure-code: move test files to a dedicated directory
 7baa62f erasure-code: move source files to a dedicated directory
-9ecf3467 (origin/wip-rgw-vg) rgw: initialize variable before call
+9ecf346 rgw: initialize variable before call
 ce0e3bd qa/workunits/snaps: New allow_new_snaps syntax
 22b0057 mon: test osd pool create pg_pool_t::stripe_width behavior
 dfc90cf mon: osd pool create sets pg_pool_t::stripe_width
 33b8ad8 common: add osd_pool_erasure_code_stripe_width
 798b56a unittests: update osdmaptools with stripe_width
 11c11ba mon: add erasure-code pg_pool_t::stripe_width
-922e5cf (origin/port/fixes) osd: fix type mismatch warning
+922e5cf osd: fix type mismatch warning
 6fda45b os/kvstore: remove used var
 994bdea os/kvstore: trivial portability fixes
 377a845 common: simpler erasure code technique
-04b1ae4 (origin/wip-7336) rgw: fix rgw_read_user_buckets() use of max param
-fdeb18e (origin/wip-7109) mon: MDSMonitor: Forbid removal of first data pool
+04b1ae4 rgw: fix rgw_read_user_buckets() use of max param
+fdeb18e mon: MDSMonitor: Forbid removal of first data pool
 c7d265a mon: OSDMonitor: Refuse to delete CephFS pools
-e80b084 (origin/wip-erasure-code-command) erasure-code: add ceph_erasure_code debug command
-68e6dad (origin/wip-mds-cluster) mds: avoid sending duplicated discovers during recovery
+e80b084 erasure-code: add ceph_erasure_code debug command
+68e6dad mds: avoid sending duplicated discovers during recovery
 bec1209 erasure-code: benchmark moves to a dedicated directory
 ad8c666 mon: check cluster features before rule create-erasure
-c8c4cc6 (origin/wip-7146) mon: OSDMonitor: do not allow changing an erasure-coded pool's size
+c8c4cc6 mon: OSDMonitor: do not allow changing an erasure-coded pool's size
 7b15cb1 mon: osd-pool-create test no longer use hardcoded ruleset
 428c75e mon: osd-pool-create test EAGAIN when pending
 59ba03b mon: test erasure code pool creation
@@ -6988,10 +10385,10 @@ a23a2c8 os/KeyValueStore: fix warning
 eb9ffd5 mon: use 'mds set inline_data ...' for enable/disable of inline data
 408b0c8 mon: fix 'mds set allow_new_snaps'
 e5ed1b2 mon: do not force proposal when no osds
-3b99013 (tag: v0.76) v0.76
+3b99013 v0.76
 7ff2b54 client: use 64-bit value in sync read eof logic
 2f85b8c doc: Incorporated feedback.
-684e5c4 (origin/wip-performance-configs) Pipe, cephx: Message signing under config option
+684e5c4 Pipe, cephx: Message signing under config option
 5fde828 cmp.h: boost tuple comparison is replaced by regular comparison
 8e19488 TrackedOp: optionally disable the actual tracking operations
 98ae059 Throttle: Turn off throttle based on max bytes
@@ -6999,7 +10396,7 @@ e5ed1b2 mon: do not force proposal when no osds
 ee48c87 common/shared_cache.hpp: compact to a single lookup where possible
 27b5f2b common/shared_cache.hpp: avoid list::size()
 ee4cfda doc: rgw: el6 documentation fixes
-48fbcce (origin/wip-5997) osd: Change some be_compare_scrub_objects() args to const
+48fbcce osd: Change some be_compare_scrub_objects() args to const
 ce1ea61 osd: Change be_scan_list() arg to const
 e1bfed5 common: buffer::ptr::cmp() is a const function
 34eb549 osd: Move the rest of scrubbing routines to the backend
@@ -7009,9 +10406,9 @@ f9128e8 osd: Move PG::_scan_list() to backend as ReplicatedBackend::be_scan_list
 d508079 OSDMonitor: use deepish_copy_from for remove_down_pg_temp
 61914d8 OSDMap: deepish_copy_from()
 802692e os/KeyValueStore: fix warning
-0389f1d (origin/wip-osdmap-features) mon/OSDMonitor: encode full OSDMap with same feature bits as the Incremental
+0389f1d mon/OSDMonitor: encode full OSDMap with same feature bits as the Incremental
 b9208b4 OSDMap: note encoding features in Incremental encoding
-e4dd1be (origin/wip-pybind-enverr) pybind: improve EnvironmentError output
+e4dd1be pybind: improve EnvironmentError output
 754ddb1 rgw: fix build on centos
 1628423 mailmap: Rutger ter Borg affiliation
 3a1a8c3 mailmap: Laurent Barbe affiliation
@@ -7036,7 +10433,7 @@ ad515bf ceph-disk: support and test the absence of PATH
 d70efe9 FileStore: avoid leveldb check for xattr when possible
 6d1daea unittest_striper: fix warning
 f9071a7 doc: cls moved to subdirectory
-e78f756 (origin/wip-rbd-rm-watchers) run-rbd-tests: run remove_with_watcher test
+e78f756 run-rbd-tests: run remove_with_watcher test
 4ebc32f rbd: don't forget to call close_image() if remove_child() fails
 0a553cf rbd: check for watchers before trimming an image on 'rbd rm'
 dcbe872 pybind: work around find_library() not searching LD_LIBRARY_PATH
@@ -7056,9 +10453,9 @@ b31c0f0 civetweb: fix warning
 fd260c9 cls_user: init cls_user_stats fields in ctor
 d0f13f5 OSDMap: fix deepish_copy_from
 d7b0c7f ceph-disk: run the right executables from udev
-318e208 (origin/wip-7190) OSD: don't assume we have the pool in handle_pg_create
+318e208 OSD: don't assume we have the pool in handle_pg_create
 2a737d8 leveldb: add leveldb_* options
-11cf9bb (origin/wip-7169-2) rgw: fix multipart min part size
+11cf9bb rgw: fix multipart min part size
 12ba8a3 Add a virtual interface init, open, create_and_open to KeyValueDB
 2e7bd83 rgw: fix multipart upload listing
 f1b5309 osd: OSDMonitor: ignore pgtemps from removed pool
@@ -7066,8 +10463,8 @@ ff5abfb buffer: make 0-length splice() a no-op
 86c3c5a test/Makefile.am: update test_cls_rgw* rules
 26ace1e test_rgw_admin_opstate: get it compiled
 28c7388 osdc/Striper: test zero-length add_partial_result
-f513f66 (origin/wip-7116-joao) osd: OSDMonitor: ignore pgtemps from removed pool
-3c77c4c (origin/wip-7060) OSDMap: use deepish_copy_from in remove_redundant_temporaries
+f513f66 osd: OSDMonitor: ignore pgtemps from removed pool
+3c77c4c OSDMap: use deepish_copy_from in remove_redundant_temporaries
 368852f OSDMap: fix damaging input osdmap from remove_down_temps
 bd54b98 OSDMap: deepish_copy_from()
 9e52398 packaging: apply udev hack rule to RHEL
@@ -7095,7 +10492,7 @@ eff9f02 If 'ceph osd metadata' isn't working, inform the user about it via stder
 fffbfc9 mon: test for MForward messages
 d9a689d mon: shell test helpers to run MONs from sources
 c06eca2 unittests: fail early when low on disk
-9413a51 (origin/wip-user-quota-2) rgw: simplify a code path
+9413a51 rgw: simplify a code path
 1b0567b radosgw-admin: add usage for user stats [--sync-stats]
 97edd2f doc: Switched logging to true. Now required for sync agent.
 9a55fa1 cls_user: add generate_test_instances() implementation
@@ -7103,7 +10500,7 @@ e5dc8d6 rgw, cls_user: handle error cases related to response decoding
 8a69ac8 rgw: implement restful set user quota request
 92cabd4 rgw: new restful api for retrieving user quota info
 0f7b3c7 rgw: quota thread for full user stats sync
-51ffede (origin/wip-7215-quorum-features) mon: do not use CEPH_FEATURES_ALL for things that touch the disk
+51ffede mon: do not use CEPH_FEATURES_ALL for things that touch the disk
 3d4a673 Elector: send an OP_NAK MMonElection to old peers who support it
 687b570 Elector: ignore messages from mons without required feature capabilities
 41796c0 Monitor: add a function to get the required quorum features from the local compatset
@@ -7143,7 +10540,7 @@ b90570f Fix 404 broken links to logging and debug configuration
 4553e6a Fix trailing space
 7bed2d6 called sysinfo.append(meta) in get_sysinfo within the while loop
 ef3fe7b Updated README with How-to-use instructions
-faae5b9 (origin/use-ceph-sharedptr) libc++: convert tr1 uses
+faae5b9 libc++: convert tr1 uses
 ea026c6 doc: Added domain pool, and changed zone configs to use domain tool so they don't clash.
 0303f0f doc: Change zone domain to include "domain" so it is clear.
 95e63ac doc: Added space between {Query_String} and [E=HTTP ... to solve for 500 error.
@@ -7151,16 +10548,16 @@ ea026c6 doc: Added domain pool, and changed zone configs to use domain tool so t
 df4df46 Monitor: use a single static accessor for getting CompatSet features off disk
 6915053 doc: Adding more information on style and usage for documenting Ceph.
 2216afd doc: Fixed omission of a comma in zone configuration.
-55ab35b (origin/wip-7207) FileStore: perform LFNIndex lookup without holding fdcache lock
-1560cc0 (origin/wip-rgw-civetweb-2) mongoose: git rm src/mongoose
+55ab35b FileStore: perform LFNIndex lookup without holding fdcache lock
+1560cc0 mongoose: git rm src/mongoose
 bd089b1 civetweb: md5.inl -> md5.h
 1a2bed1 civetweb: add include to makefile
 287ea5c rgw: modify the civetweb rule
 d26e766 civetweb: fix module uri
 3f20a41 civetweb: update submodule
 f2f7475 rgw: switch mongoose to civetweb
-08fa34d (origin/wip-erasure-rule) osd/OSDMap: do not create erasure rule by default
-6f8541c (origin/use-unordered-map) osd: use ceph:: scoped hash_map
+08fa34d osd/OSDMap: do not create erasure rule by default
+6f8541c osd: use ceph:: scoped hash_map
 289a400 Corrected ownership info which was presented as string, added stub code for unpublish
 ecbdeb1 PGBackend: clarify rollback_stash name and method comments
 4fc4573 PG: drop messages from down peers
@@ -7222,7 +10619,7 @@ b7d100b FileStore::_collection_move_rename: remove source before closing guard
 06f7a98 ReplicatedBackend.h: don't need to be active for pushes
 518774d ObjectStore: improve name of the queue_transaction which cleans up the transaction
 c03d027 hobject: admit that gen_t is actually version_t
-8060afd (origin/wip-osdmapenc-fix) MOSDMap: reencode maps if target doesn't have OSDMAP_ENC
+8060afd MOSDMap: reencode maps if target doesn't have OSDMAP_ENC
 9792500 doc: add-or-rm-mons.rst: better explanation on number of monitors
 9eac5e3 Added update_metadata, clear_metadata, and usage description code
 dcca413 added perl script for rgw bucket quota tests
@@ -7230,11 +10627,11 @@ dcca413 added perl script for rgw bucket quota tests
 54caa01 removing rgw_tests.sh
 8b8ede7 modified the port to 7280 in the script instead of the default 80
 850b4f4 script for rgw bucket quota testing
-339bed1 (origin/wip-cache-mode-fix) mon/MonCommands: 'invalidate+forward' -> 'forward'
+339bed1 mon/MonCommands: 'invalidate+forward' -> 'forward'
 b88c8ea doc: Added Documenting Ceph (draft) document.
 90aea59 doc: Added documenting Ceph to the toc tree.
 ed2decb doc: Fixed end string from '' to `` to stop warnings. Enforced 80 char line.
-584c2dd (origin/wip-stray-mdsmaps) mon/MDSMonitor: do not generate mdsmaps from already-laggy mds
+584c2dd mon/MDSMonitor: do not generate mdsmaps from already-laggy mds
 4c96853 packaging: ship libdir/ceph
 6bee188 Docs: Add backfill_toofull to list of PG states
 199e614 Initial version of the source code. Statistics extraction is done
@@ -7246,27 +10643,27 @@ c78c300 Remove some almost-duplicate COMMAND definitions
 586ad1f rgw: Use correct secret key for POST authn
 52a9154 rgw: Fix signature variable naming/failure print
 6dcf462 rgw: Document fields for access/secret key
-7314cde (origin/wip-7184) osd: ignore num_objects_dirty for old pools
-1b308b6 (origin/port/libc++) libc++: fix null pointer comparison
+7314cde osd: ignore num_objects_dirty for old pools
+1b308b6 libc++: fix null pointer comparison
 1fcbddc libc++: avoid hash re-definitions
 aae4700 libc++: use ceph::shared_ptr in installed header
 4c4e1d0 libc++: use ceph:: namespaced data types
 8e86720 libc++: create portable smart ptr / hash_map/set
-7e7eda4 (origin/wip-osdmap-7177) OSDMap: Populate primary_temp values a little more carefully
+7e7eda4 OSDMap: Populate primary_temp values a little more carefully
 47bc71a fixed the syntax for test maxmds=2
 a13ebd3 fix for the test maxmds=2
 ad203d5 doc: Fixed ruleset typo.
-6b7f27c (origin/wip-rgw-contention) librados: use rwlock for lookup pool, cache results
+6b7f27c librados: use rwlock for lookup pool, cache results
 4b31456 rgw: add .h file to Makefile rule
 d1de32c doc: Added default-placement to list of placement targets. Added SSL commentary.
 80212ea doc: Added additional comment about port number.
-0215342 (origin/wip-rgw-loadgen) rgw: loadgen, configurable num of objs, buckets
+0215342 rgw: loadgen, configurable num of objs, buckets
 7c86764 rgw: loadgen shutdown, error out on failures
 85267cf rgw: sign loadgen requests
 e8a4b30 rgw: loadgen frontend read uid, init access key
 57137cb rgw: add a load generation frontend
 f01202d Fixes: #7172
-b1a853e (origin/wip-rbd-mount_timeout) rbd: expose mount_timeout map option
+b1a853e rbd: expose mount_timeout map option
 8ec7fa8 PendingReleaseNotes: note ceph -s fix
 6e93132 doc: Removed Calxeda example.
 dd4a9e1 doc: Removed saucy salamander reference.
@@ -7279,7 +10676,7 @@ f7f9bed doc: Added source file for graphic.
 c30b138 doc: Added third monitor icon.
 add59b8 doc: Added additional monitor icon.
 4b03326 doc: Removed old file. Changed file name.
-86c1548 (origin/wip-7168) rgw: handle racing object puts when object doesn't exist
+86c1548 rgw: handle racing object puts when object doesn't exist
 5c24a7e rgw: don't return -ENOENT in put_obj_meta()
 a84cf15 rgw: use rwlock for cache
 790dda9 osd: OSDMap: fix output from ceph status --format=json for num_in_osds num_up_osds returns as an int value, while num_in_osds returns as a string. Since only an int can be returned from get_num_in_osds(), num_in_osds should should also be an int to remain consistant with num_up_osds.
@@ -7288,14 +10685,14 @@ ec5f7a5 client: ceph-fuse use fuse_session_loop_mt to allow multithreaded operat
 e1fd0e8 first commit
 ac5a9fe mon: larger timeout for mon mkfs.sh test
 50808af ceph-disk: larger timeout in the test script
-1aa2601 (origin/wip-7166) osd: OSDMap: build reverse name->pool map upon decoding
-f97264d (origin/wip-rgw-expose-bucket) 6748: rgw: Optionally return the bucket name in a response header.
-2d0d48b (origin/wip-7117-redhat) packaging: ship libdir/ceph
+1aa2601 osd: OSDMap: build reverse name->pool map upon decoding
+f97264d 6748: rgw: Optionally return the bucket name in a response header.
+2d0d48b packaging: ship libdir/ceph
 8b09a43 doc/release-notes: v0.76 draft notes
 2b36761 mon: favor 'mds set max_mds' over 'mds set_max_mds'
 e60dcfa packaging: add missing test files
-5ed7865 (origin/wip-7073) rgw: use configurable prime number for gc hash
-e7b3236 (origin/wip-temp-primary) OSDMonitor: make sure we don't send out maps with a primary_temp mapping
+5ed7865 rgw: use configurable prime number for gc hash
+e7b3236 OSDMonitor: make sure we don't send out maps with a primary_temp mapping
 28e0b76 test: add an OSDMap unittest
 b183115 OSDMap: move temp manipulation functions out of OSDMonitor
 fafc8e9 OSDMap: pay attention to the temp_primary in _get_temp_osds
@@ -7321,8 +10718,8 @@ c1a95f8 OSDMap: remove get_pg_primary() function
 7a9c171 OSDMap: doc the different pg->OSD mapping functions
 268ae82 osd: do not misuse calc_pg_role
 a09d4f1 PG: do not use role == 0 as a determinant of primacy
-644afd6 (origin/wip-3454) radosgw-admin: add temp url params to usage
-fdf1a41 (origin/fix-configure-rerunning) autogen: re-run aclocal after libtoolize
+644afd6 radosgw-admin: add temp url params to usage
+fdf1a41 autogen: re-run aclocal after libtoolize
 970f938 doc: Updated paths for OSDs using the OS disk.
 4425f9e librados: Add C API coverage for atomic write operations
 1ffe422 mailmap: add athanatos <sam.just at inktank.com>
@@ -7334,9 +10731,9 @@ caf7971 mds: Add inline fields to inode_t
 b67e9ef ceph: Add inline state definition
 978ec18 ceph: Add inline data feature
 e268e95 updated "sample.ceph.conf":
-91b8c78 (origin/wip-tier-bits) mon/OSDMonitor: simplify and make 'osd pool set ...' output consistent
+91b8c78 mon/OSDMonitor: simplify and make 'osd pool set ...' output consistent
 413fc23 osd/ReplicatedPG: no HitSet on non-primary
-be8db8c (origin/wip-cache-snap) osd/ReplicatedPG: use get_object_context in trim_object
+be8db8c osd/ReplicatedPG: use get_object_context in trim_object
 b5ae76e ceph_test_rados: do not delete in-use snaps
 8b39719 osd/OSDMonitor: fix 'osd tier add ...' pool mangling
 d41a1d3 osd/ReplicatedPG: update ObjectContext's object_info_t for new hit_set objects
@@ -7384,17 +10781,17 @@ b840aae osd/ReplicatedPG: adjust clone stats when promoting clones
 6dd0a1f osd/ReplicatedPG: include snaps in copy-get results
 d22ecf3 osd/ReplicatedPG: using missing_oid to decide which object to promote
 c3c1541 osd/ReplicatedPG: make find_object_context() pass missing_oid
-33b5ef4 (origin/wip-mon-pgmap) mon/PGMap: make decode version match encode version
+33b5ef4 mon/PGMap: make decode version match encode version
 a5aaab3 ceph-dencoder: include offset in 'stray data' error message
 1308225 buffer: do not append trailing newline when appending empty istream
-946d603 (tag: v0.75) v0.75
+946d603 v0.75
 9034370 doc: Added comment and example for SSL enablement in rgw.conf
 9615645 doc: Added python example of handle, and closing session examples.
-c0d92b6 (origin/wip-5858-rebase) osd: Implement multiple backfill target handling
+c0d92b6 osd: Implement multiple backfill target handling
 a657fad osd: Interim backfill changes
 54e588c doc: Modified doc examples to use rados_create2.
-4c92dc6 (origin/wip-7141) DBObjectMap::clear_keys_header: use generate_new_header, not _generate_new_header
-93a9b68 (origin/wip-vector-op) erasure-code: use uintptr_t instead of long long
+4c92dc6 DBObjectMap::clear_keys_header: use generate_new_header, not _generate_new_header
+93a9b68 erasure-code: use uintptr_t instead of long long
 0082d88 doc: format man pages with s/2013/2014/
 b4054fc doc: copyright s/2013/2014/
 efbdd16 doc: update the crushtool manual page
@@ -7409,15 +10806,15 @@ d3d75a2 crush: crushtool --build informative messages
 2765f81 crush: parse CEPH_ARGS in crushtool
 cf9a764 osd: factorize build_simple_crush_map* rulesets creation
 7676550 osd: ostream is enough for build_simple*
-cec8d85 (origin/wip-omapdirfrag2) mds: require CEPH_FEATURE_OSD_TMAP2OMAP
+cec8d85 mds: require CEPH_FEATURE_OSD_TMAP2OMAP
 1d8429d osd/OSDMap: get_up_osd_features()
 b8dfcc1 mds: use OMAP to store dirfrags
 0f0dd74 osd: introduce TMAP->OMAP conversion operation
-90d6cec (origin/wip-7135) src/test/ObjectMap: test clear_keys_header() call
+90d6cec src/test/ObjectMap: test clear_keys_header() call
 617a50c src/test/librados/misc.cc: verify that omap_clear doesn't blast xattr
 0c81849 os/DBObjectMap, FileStore: omap_clear should not remove xattrs
 6225f2c mailmap: fix typo in Alexandre Maragone
-d429ab5 (origin/wip-tier-snap) osd/OSDMonitor: fix 'osd tier add ...' pool mangling
+d429ab5 osd/OSDMonitor: fix 'osd tier add ...' pool mangling
 f49d9cd osd: fix propagation of removed snaps to other tiers
 3b3511c mon: debug propagate_snaps_to_tiers
 631d0c7 erasure-code: erasure code decode interface helper
@@ -7439,10 +10836,10 @@ f18b310 mailmap: remove company name from Pascal de Bruijn name
 0e43ac9 rgw: add optional tenant name for swift urls
 2626101 rgw: can set temp url key on user
 efb4cf6 doc: Removed extraneous comment from snapshot text.
-5070597 (origin/install-erasure-headers) osd: add missing header to install list
-a5f8cc7 (origin/wip-7110) rgw: convert bucket info if needed
+5070597 osd: add missing header to install list
+a5f8cc7 rgw: convert bucket info if needed
 029b9ef remove spurious executable permissions on files
-995a3cc (origin/wip-max-file-size) mds/MDSMap: include max_file_size in plaintext print output
+995a3cc mds/MDSMap: include max_file_size in plaintext print output
 2de2148 mailmap: add Yan Zheng fixes for Reviewed-by:
 62fbcac mailmap: add Loic Dachary fixes for Reviewed-by:
 3fe5f11 mailmap: add Gary Lowell fixes for Reviewed-by:
@@ -7471,20 +10868,20 @@ b780f4b osd: Remove redundant incompat feature
 31f3745 mailmap: Adds Christophe Courtaut
 7d5674c doc: Added librados introduction doc. Still wip.
 33e78a4 doc: Updated terminology. Added librados intro to index.
-7acb0a1 (origin/port/onexit) on_exit: remove side effects from asserts
+7acb0a1 on_exit: remove side effects from asserts
 e3d0b0a common: fix large output in unittest_daemon_config
 822ad58 configure: support Automake 1.12
-b1976dd (origin/wip-7083) radosgw-admin: fix object policy read op
+b1976dd radosgw-admin: fix object policy read op
 3f34dc7 common: unit tests for config::expand_meta
 9485409 common: recursive implementation of config::expand_meta
 87db534 common: cosmetic inversion of tests arguments
 5bb1545 common: multiple variable occurence expansion test
-98ed9ac (origin/wip-7093) mon: only send messages to current OSDs
+98ed9ac mon: only send messages to current OSDs
 f68de9f osd: ignore OSDMap messages while we are initializing
 35da8f9 osd: do not send peering messages during init
 7aa9805 log: use on exit manager to flush logs on exit
 2181d25 onexit: add an on exit callback utility
-3d19f7c (origin/port/misc) test: disable cross process sem tests on non-Linux
+3d19f7c test: disable cross process sem tests on non-Linux
 6342d05 pipe: handle missing MSG_MORE and MSG_NOSIGNAL
 26c07d9 ipaddr: use IN6_ARE_ADDR_EQUAL for comparison
 ef6a56a keyutils: handle non-linux platform
@@ -7493,7 +10890,7 @@ bb8b750 add autotools-generated files to .gitignore
 f2e33e8 mon: get rid of --keyring /dev/null hack
 d110c91 doc: Removed dash from --dmcrypt option.
 c772b6d ceph-disk: fix false positive for gitbuilder
-e2ee528 (origin/wip-objectcacher-backoff) osdc/ObjectCacher: back off less during flush
+e2ee528 osdc/ObjectCacher: back off less during flush
 daefe81 Be more explicit how to discover available API calls
 de8522f ceph-disk: tests for the --data-dir code path
 a71025d ceph-disk: implement --sysconfdir as /etc/ceph
@@ -7516,7 +10913,7 @@ b82ccfb ceph-disk: fix Error() messages formatting
 6b8d418 init-ceph: pass config file path when adjust crush position
 2ba6930 ceph-disk: cannot run unit tests
 24417f9 test/cli-integration/rbd: silence stderr
-8220549 (origin/port/bitsandints) inttypes: detect and define missing integer types
+8220549 inttypes: detect and define missing integer types
 8f91cac endian: check byte order on OSX
 46a5674 doc/release-notes: fix bobtail version
 f6bbcf4 mon: tests for ceph-mon --mkfs
@@ -7528,10 +10925,10 @@ f6bbcf4 mon: tests for ceph-mon --mkfs
 4c8a313 mon: implement --key for --mkfs
 ab6203f mon: fix indentation
 e946df1 mon: do not use the keyring if auth = none
-f12abbf (origin/wip-6914) mds: don't allow changing layout for non-regular file
+f12abbf mds: don't allow changing layout for non-regular file
 f292992 mds: always store backtrace xattr in the default pool
 cae663a osd/ReplicatedPG: improve debug output from check_local
-ac547a5 (origin/wip-empty-rbd-ls) rbd: return 0 and an empty list when pool is entirely empty
+ac547a5 rbd: return 0 and an empty list when pool is entirely empty
 e91fb91 librbd: better error when unprotect fails on unprotected snap
 42e98ac Be more explicit how to discover available API calls
 eeba294 mon: remove fixture directory between runs
@@ -7544,9 +10941,9 @@ a194513 mon: do not daemonize if CINIT_FLAG_NO_DAEMON_ACTIONS
 e0bae95 Fix typos in erasure code documents
 2b0a435 osd_types: add missing osd op flags
 f8e413f msgr: fix rebind() race stop the accepter and mark all pipes down before rebind to avoid race
-8fcfc91 (origin/wip-listomapvals) qa: test rados listomapvals with >512 keys
+8fcfc91 qa: test rados listomapvals with >512 keys
 be5afa2 rados: allow listomapvals to list all k/v pairs
-c165483 (tag: v0.74) v0.74
+c165483 v0.74
 b4fc16c make: conditionally build filestore backends
 9d41fd2 test_cls_rbd: avoid warning -Wno-unnamed-template-args
 fdd8562 test_cls_rbd: avoid shift overflow
@@ -7560,7 +10957,7 @@ f7a66d6 make: restrict use of --as-needed to Linux
 b5c17f6 vstart: set fsid in [global]
 d7d7ca8 Fix qa/workunits/rados/test_cache_pool.sh typos
 cc67b7b Fix test/filestore/store_test.cc error
-e8e174e (origin/port/public-hdrs) rados: include struct timeval definition
+e8e174e rados: include struct timeval definition
 356d71a inttypes: use portable types in public headers
 76ad85d test: include headers for struct statfs
 93c125c test: remove platform specific header
@@ -7572,7 +10969,7 @@ a48d038 test: fix VLA of non-POD type
 891801e warning: fix typo and -Wmismatched-tags
 c7e1c4b c++11: fix std::lock naming conflicts
 bbcb022 kvstore: only build on linux
-1fec818 (origin/port/spinlock) spinlock: add generic spinlock implementation
+1fec818 spinlock: add generic spinlock implementation
 12f4631 qa/workunits/rest/test.py: rbd pool ruleset is now 0
 b286e4f ceph_test_rados_api_tier: retry EBUSY race checks
 b88af07 libcephfs: get osd location on -1 should return EINVAL
@@ -7580,15 +10977,15 @@ b88af07 libcephfs: get osd location on -1 should return EINVAL
 d4f07cd crush: fix get_full_location_ordered
 fd57d99 Fix rbd bench-write improper behavior
 30078e6 autoconf: trim duplicated headers
-f9f5c37 (origin/wip-rgw-leak) rgw: fix leak of RGWProcess
-96fe80d (origin/wip-snaps) osd: preserve user_version in snaps/clones
+f9f5c37 rgw: fix leak of RGWProcess
+96fe80d osd: preserve user_version in snaps/clones
 80b5487 ceph_test_rados: test read from snapshots
 2f8b602 osd/OSDMap: observe 'osd crush chooseleaf type' option for initial rules
 9b7364d rbd: expose options available to rbd map
 b3bda08 Lack of "start" member function declare in WBThrottle.h
 4a9c770 messages: add tid to string form of MForward
 67f99f3 packaging: make check needs argparse and uuidgen
-4cea789 (origin/wip-rbd-tinc-5426) librbd: call user completion after incrementing perfcounters
+4cea789 librbd: call user completion after incrementing perfcounters
 f8a4001 osd: create default ruleset for erasure pools
 8b2b5a3 mon: implement --osd-pool-default-crush-erasure-ruleset
 dd81858 mon: implement --osd-pool-default-crush-replicated-ruleset
@@ -7604,9 +11001,9 @@ c6d876a mon: osd-pool-create must not loop forever on kill
 272eed3 client: SyntheticClient uses the first available pool
 20b3da0 mon: MDS data and metadata pool numbers are hardcoded
 bf24317 Fix WBThrottle thread disappear problem
-87b8e54 (origin/wip-argparse-fix) ceph_argparse: kill _daemon versions of argparse calls
+87b8e54 ceph_argparse: kill _daemon versions of argparse calls
 ea4724d rados: deprecated attribute has no argument
-e7bf5b2 (origin/wip-librados-lock) librados: lockless get_instance_id()
+e7bf5b2 librados: lockless get_instance_id()
 771da13 objecter, librados: create Objecter::Op in two phases
 5ff30d6 crush/CrushWrapper: note about get_immediate_parent()
 0cdbc97 librados: mark old get_version() as deprecated
@@ -7618,18 +11015,18 @@ ac14d4f osdc/Objecter: maintain crush_location multimap
 dcc5e35 crush/CrushWrapper: add get_common_ancestor_distance()
 0903f3f mon/OSDMonitor: use generic CrushWrapper::parse_loc_map helper
 8f48906 crush/CrushWrapper: add parse_loc_[multi]map helpers
-8fc66a4 (origin/wip-7056) osd/ReplicatedPG: fix copy-get iteration of omap keys
+8fc66a4 osd/ReplicatedPG: fix copy-get iteration of omap keys
 0c9acf1 ceph_test_rados: s/tmap/omap/
 3b0d9b2 vstart/stop: do not loop forever on kill
 4ce6400 config: add 'crush location' option
 19213e6 doc: Fix caps documentation for Admin API
-ac10aa5 (origin/wip-fix-mon-fwd) mon: fix forwarded request features when requests are resent
+ac10aa5 mon: fix forwarded request features when requests are resent
 2e4c61b osd/ReplicatedPG: include omap header in copy-get
-537a7c3 (origin/wip-crush-shrink-diff) crush: misc formatting and whitespace fixes
+537a7c3 crush: misc formatting and whitespace fixes
 fa6a99a crush: use kernel-doc consistently
 6e36794 crush/mapper: unsigned -> unsigned int
 d8512f1 mon: use kill instead of pkill in osd-pool-create
-c030569 (origin/wip-7051) osd: OSDMap: dump osd_xinfo_t::features as an int
+c030569 osd: OSDMap: dump osd_xinfo_t::features as an int
 b4fbe4f mon: Monitor: Forward connection features
 93c44cb mon: unit test for osd pool create
 59941b1 mon: erasure code pool properties defaults
@@ -7664,7 +11061,7 @@ cdc178f Revert "Enable libs3 support for debian packages"
 1e238e6 mon: pool create will not fail if the type differs
 5f1957d doc/release-notes: v0.67.5
 98a1525 unittests: fail if one test fail
-9ab947c (origin/port/buffer) buffer: use int64_t instead of loff_t
+9ab947c buffer: use int64_t instead of loff_t
 03693ac osd: git ignore erasure code benchmark binary
 42b4fe1 osd: erasure code benchmark is installed is part of ceph-test
 81dee1b osd: erasure code benchmark workunit
@@ -7674,7 +11071,7 @@ c7d8ba7 osd: better performances for the erasure code example
 ff9455b osd: conditionally disable dlclose of erasure code plugins
 8879e43 osd: Fix assert which doesn't apply when compat_mode on
 0bd5cb6 Add backward comptible acting set until all OSDs updated
-8d31f71b osd/ReplicatedPG: fix promote cancellation
+8d31f71 osd/ReplicatedPG: fix promote cancellation
 923bff1 osd/ReplicatedPG: drop RepGather::ondone callback
 2a9c6fc vstart.sh: go faster
 bc893f5 osd/ReplicatedPG: fix undirty on clean object
@@ -7699,18 +11096,18 @@ ea519b4 qa/workunits/rados: test cache-{flush,evict,flush-evict-all}
 71cd4a2 rados: add cache-flush, cache-evict, cache-flush-evict-all commands
 ad3b466 osd/ReplicatedPG: implement cache-flush, cache-try-flush
 edaec9a osd: Fix assert which doesn't apply when compat_mode on
-ac16a9d (origin/wip-kill-raid4) osd: remove remaining instances of raid4 pool types (never implemented)
+ac16a9d osd: remove remaining instances of raid4 pool types (never implemented)
 40a48de mds: fix Resetter locking
 087fe57 packaging: revert adding argparse and uuidgen
 8272538 packaging: make check needs argparse and uuidgen
 f193925 autogen: test compare strings with != not -ne
 98af37d Fix segmentation fault when handler is NULL pointer
-44aacae (origin/wip-mds-coverity2) mds: fixes for coverity scan
+44aacae mds: fixes for coverity scan
 8b38f10 crush/mapper: fix crush_choose_firstn comment
 ea3a0bb crush/mapper: attempts -> tries
 0497db4 crush/mapper: finish adding choose_local_[fallback_]tries
-99f41de (origin/sage-valgrind) vstart.sh: NOTE, not WARNING, to make gitbuilder happy
-b3ee598 (origin/wip-6028) qa: workunit: cephtool: test osd pool create with erasure type
+99f41de vstart.sh: NOTE, not WARNING, to make gitbuilder happy
+b3ee598 qa: workunit: cephtool: test osd pool create with erasure type
 250b446 pybind: test_ceph_argparse: test 'ceph osd pool create' with pool type
 c996f66 mon: OSDMonitor: add optional 'pool type' arg to 'osd pool create'
 bdeaa84 osd: OSDMap: add 'get_up_osds()' function
@@ -7729,7 +11126,7 @@ e6ad4d4 osd: make obc copyfrom blocking generic
 8dec2b2 librados, osd: add flags to COPY_FROM
 e624e16 crush: silence error messages in unit tests
 9414970 ARCH: adding SSE2 flag to arch-test
-7e0c84b (origin/wip-crush-2) mon/OSDMonitor: 'osd crush show-tunables'
+7e0c84b mon/OSDMonitor: 'osd crush show-tunables'
 88365c2 crush: expand info about tunables that we dump
 d0f14df mon: warn if crush has non-optimal tunables
 d129e09 crush: add set_choose_local_[fallback_]tries steps
@@ -7737,14 +11134,14 @@ d129e09 crush: add set_choose_local_[fallback_]tries steps
 b86d450 upstart: add rbdmap script
 e4537d3 ARCH: add variable for sse2 register
 0d217cf qa/workunits/cephtool/test.sh: clean up our client.xx.keyring
-7e4a800 (origin/wip-7026) osd/ReplicatedPG: fix hit_set_setup() on_activate()
-19cff89 (origin/wip-6990) Add backward comptible acting set until all OSDs updated
+7e4a800 osd/ReplicatedPG: fix hit_set_setup() on_activate()
+19cff89 Add backward comptible acting set until all OSDs updated
 b153067 erasure-code: tests must use aligned buffers
 f5d32a3 mds: drop unused find_ino_dir
 c60a364 Fix typo in #undef in ceph-dencoder
 9e45655 qa: add ../qa/workunits/cephtool/test.sh to unittests
 c1eb55c qa: vstart wrapper helper for unittests
-0edbda2 (origin/wip-rbd-coverity) rbd: make coverity happy
+0edbda2 rbd: make coverity happy
 d93881f vstart/stop: use pkill instead of killall
 ae56cef qa: recursively remove .gcno and .gcda
 b082c09 crushtool: reorg test-map-* cli tests
@@ -7752,16 +11149,16 @@ b082c09 crushtool: reorg test-map-* cli tests
 685c695 crush/mapper: generalize descend_once
 6f43120 ceph_test_rados_api_tier: fix HitSetTrim vs split, too
 c5bccfe ceph_test_rados_api_tier: fix HitSetRead test race with split
-7e618c9 (origin/wip-7009) mon: move supported_commands fields, methods into Monitor, and fix leak
-deded44 (origin/wip-rgw-standalone-2) mongoose: update submodule
+7e618c9 mon: move supported_commands fields, methods into Monitor, and fix leak
+deded44 mongoose: update submodule
 ef10a5c rgw: fix memory leak
-824b3d8 (origin/wip-linux-version) FileJournal: use pclose() to close a popen() stream
+824b3d8 FileJournal: use pclose() to close a popen() stream
 6696ab6 FileJournal: switch to get_linux_version()
 fcf6e98 common: introduce get_linux_version()
 a2babe2 configure: break up AC_CHECK_HEADERS into one header-file per line
 fae569d Rename filestore_perf_t to objectstore_perf_t
 015e981 Move PerfCounter from ObjectStore.h to FileStore.h
-4526d13 (origin/wip-mds-cluster2) mds: fix stale session handling for multiple mds
+4526d13 mds: fix stale session handling for multiple mds
 43f7268 mds: properly set dirty flag when journalling import
 802df76 mds: properly update mdsdir's authority during recovery
 b6d1d8f mds: finish opening sessions even if import aborted
@@ -7771,7 +11168,7 @@ ebb3ad9 osd/PG: move some pg stat update into a helper
 05274f3 osd: include peer_info in pg query
 5fdcc56 mds: fix bug in MDCache::open_ino_finish
 71d1eb3 mds: add CEPH_FEATURE_EXPORT_PEER and bump the protocal version
-d0b744a1 client: handle session flush message
+d0b744a client: handle session flush message
 05b192f mds: simplify how to export non-auth caps
 9dc52ff mds: send cap import messages to clients after importing subtree succeeds
 6a56588 mds: re-send cap exports in resolve message.
@@ -7870,14 +11267,14 @@ b371dd8 ReplicatedPG: promote: first draft pass at doing object promotion
 a3e50b0 rgw: rework framework configuration
 b7946ff doc: Added additional comments on placement targets and default placement.
 902f19c doc: Updates to federated config.
-29cc722 (origin/wip-hitset) test_ipaddr: add another unit test
+29cc722 test_ipaddr: add another unit test
 026b724 osd/ReplicatedPG: drop unused hit_set_start_stats
 3d768d2 osd/ReplicatedPG: maintain stats for the hit_set_* objects
 9814b93 osd/ReplicatedPG: set object_info_t, SnapSet on hit_set objects
 dabd5d6 vstart.sh: --hitset <pool> <type>
 5bb0476 test/libcephfs: release resources before umount
-897dfc1 (origin/wip-6979) use the new get_command helper in check_call
-eae8531 (origin/wip-rbd-single-major) rbd: modprobe with single_major=Y on newer kernels
+897dfc1 use the new get_command helper in check_call
+eae8531 rbd: modprobe with single_major=Y on newer kernels
 8a473bc rbd: add support for single-major device number allocation scheme
 784cc89 rbd: match against both major and minor on unmap on newer kernels
 462b389 rbd: match against whole disks on unmap
@@ -7885,7 +11282,7 @@ a421305 rbd: switch to strict_strtol for major parsing
 24a048b Document librados's rados_write's behaviour in reguards to return value.
 a865fec osd/ReplicatedPG: debug: improve hit_set func banners
 b6871cf osd/ReplicatedPG: do not update current_last_update on activate
-bcde200 (origin/wip-vstart-memstore) vstart.sh: add --memstore option
+bcde200 vstart.sh: add --memstore option
 a9334a1 use the absolute path for executables if found
 43561f7 remove trailing semicolon
 a33c95f radosgw: increase nofiles ulimit on sysvinit machines
@@ -7896,8 +11293,8 @@ ee3173d doc/release-notes: fix indentation; sigh
 bb50276 Revert "Partial revert "mon: osd pool set syntax relaxed, modify unit tests""
 0cd36e0 mon/OSDMonitor: take 'osd pool set ...' value as a string again
 e19e380 replace sgdisk subprocess calls with a helper
-4b6d721 (origin/wip-hashpspool) osd: enable HASHPSPOOL by default
-fb47d54 (origin/wip-mon-api) mon: if we're the leader, don't validate command matching
+4b6d721 osd: enable HASHPSPOOL by default
+fb47d54 mon: if we're the leader, don't validate command matching
 2bfd34a mon: by default, warn if some members of the quorum are "classic"
 e620057 add apt-get install pkg-config for ubuntu server
 b8884e0 MemStore: update for the new ObjectStore interface
@@ -7907,7 +11304,7 @@ e223e53 Monitor: encode and expose mon command sets
 420a2f1 man: update man/ from doc/man/8
 8d60cd1 man: Ceph is also an object store
 faaf546 os/MemStore: do on_apply_sync callback synchronously
-d8ad51e (tag: v0.73) v0.73
+d8ad51e v0.73
 990b2b5 ceph_test_rados_api_tier: make HitSetWrite handle pg splits
 a6f4d71 Elector: keep a list of classic mons instead of each mon's commands
 a888a57 crush: implement --show-bad-mappings for indep
@@ -7915,7 +11312,7 @@ a888a57 crush: implement --show-bad-mappings for indep
 fbc4f99 crush: remove scary message string
 472f495 crush: document the --test mode of operations
 ea86444 Monitor: Elector: share the classic command set if we have a classic mon
-f1ccdb41 Elector: share local command set when deferring
+f1ccdb4 Elector: share local command set when deferring
 ba673be Monitor: import MonCommands.h from original Dumpling and expose it
 3cb58f7 Monitor: validate incoming commands against the leader's set too
 cb51b1e Monitor: disseminate leader's command set instead of our own
@@ -7935,19 +11332,19 @@ c928f07 crush: output --show-bad-mappings on err
 ef4061f librbd: remove unused private variable
 ad3825c TrackedOp: remove unused private variable
 3b39a8a librbd: rename howmany to avoid conflict
-539fe26 (origin/port/fdatasync) wbthrottle: use feature check for fdatasync
+539fe26 wbthrottle: use feature check for fdatasync
 663da61 rados_sync: fix mismatched tag warning
 60a2509 rados_sync: remove unused private variable
 43c1676 mon: check for sys/vfs.h existence
 c99cf26 make: increase maximum template recursion depth
-e2be099 (origin/port/compat) compat: define replacement TEMP_FAILURE_RETRY
-3b3cbf5 (origin/wip-fix-tunables) crush/CrushCompiler: make current set of tunables 'safe'
+e2be099 compat: define replacement TEMP_FAILURE_RETRY
+3b3cbf5 crush/CrushCompiler: make current set of tunables 'safe'
 8535ced crushtool: remove scary tunables messages
 4eb8891 crush/CrushCompiler: start with legacy tunables when compiling
 e8fdef2 crush: add indep data set to cli tests
-564de6e (origin/wip-fix-3x) osdmaptool: fix cli tests for 3x
+564de6e osdmaptool: fix cli tests for 3x
 6704be6 osd: default to 3x replication
-8d0180b (origin/wip-objecter-full-2) objecter: don't take extra throttle budget for resent ops
+8d0180b objecter: don't take extra throttle budget for resent ops
 38647f7 Revert "osd: default to 3x replication"
 cbeb1f4 crush: detach_bucket must test item >= 0 not > 0
 2cd73f9 crush: remove obsolete comments from link_bucket
@@ -8027,12 +11424,12 @@ cb88763 crush/mapper: fix up the indep tests
 b819018 crush: unittest use const instead of define
 dc09521 crush: unittest CrushWrapper::check_item_loc
 000c59a crush: unittest remove useless c->create()
-7a9a088 (origin/wip-6940) rgw: fix reading bucket policy in RGWBucket::get_policy()
-cb26fbd (origin/wip-3x) osd: default to 3x replication
+7a9a088 rgw: fix reading bucket policy in RGWBucket::get_policy()
+cb26fbd osd: default to 3x replication
 aedbc99 crush: check for invalid names in loc[]
 fe03ad2 osd: queue pg deletion after on_removal txn
 aa63d67 os/MemStore: implement reference 'memstore' backend
-47ee797 (origin/wip-mon-mds-trim) mon: ceph-kvstore-tool: get size of value for prefix/key
+47ee797 mon: ceph-kvstore-tool: get size of value for prefix/key
 c98c104 tools: ceph-kvstore-tool: output value contents to file on 'get'
 00048fe mon: Have 'ceph report' print last committed versions
 cc64382 mon: MDSMonitor: let PaxosService decide on whether to propose
@@ -8051,8 +11448,8 @@ a70200e os/ObjectStore: pass cct to ctor
 cae1083 ObjBencher: add rand_read_bench functions to support rand test in rados-bench
 e829859 doc/rados/operations/crush: fix more
 7709a10 doc/rados/operations/crush: fix rst
-68fdcfa (origin/wip-6922) FileSTore: do not time out threads while they're waiting for op throttle
-7ff7cf2 (origin/wip-doc-build-cluster) doc: Partially incorporated comments form Loic Dachary and Aaron Ten Clay.
+68fdcfa FileSTore: do not time out threads while they're waiting for op throttle
+7ff7cf2 doc: Partially incorporated comments form Loic Dachary and Aaron Ten Clay.
 5e34beb init, upstart: prevent daemons being started by both
 c43c893 crush/mapper: new SET_CHOOSE_LEAF_TRIES command
 2731d30 crush/mapper: pass parent r value for indep call
@@ -8068,7 +11465,7 @@ d51a219 osd/osd_types: pg_pool_t: fix /// -> ///< comments
 86e9780 crush: use breadth-first search for indep mode
 b1d4dd4 crush: return CRUSH_ITEM_UNDEF for failed placements with indep
 1cfe140 crush: eliminate CRUSH_MAX_SET result size limitation
-858a21b (origin/wip-filestore-remount) common/WorkQueue: allow start() after stop()
+858a21b common/WorkQueue: allow start() after stop()
 36505e8 os/FileStore: allow mount after umount
 cd6be29 common/Finisher: allow finisher to be restarted
 941875d doc: Partially incorporated comments from Aaron Ten Clay.
@@ -8082,7 +11479,7 @@ de09778 test/filestore: add check of return values in StoreTest::SetUp
 f502530 pybind: syntax check osd metadata
 6570197 ceph-disk: blacklist /dev/fd0
 bcb6bfd test/ceph_decoder: add return value check for read_file()
-20507b5 (origin/wip-crush-tunables) crush/CrushWrapper: default to the "new" bobtail-era tunables
+20507b5 crush/CrushWrapper: default to the "new" bobtail-era tunables
 07978bb crush/CrushWrapper: explicitly set tunables on create, decode
 3f1352f doc/rados/operations/crush-map: describe crush locations, hook
 396ee0d add script/run-coverity
@@ -8095,11 +11492,11 @@ f57dad6 OSDMonitor: prevent extreme multipliers on PG splits
 c77ce90 doc: Fixed hyperlink to the manual installation section.
 648f3bc doc: Added a link to get packages.
 16b7576 osd/OSDMap: fix typo and crush types helper
-6298a57 (origin/wip-crush-chassis) osd/OSDMap: add region, pdu, pod types while we are at it
+6298a57 osd/OSDMap: add region, pdu, pod types while we are at it
 3a6707a osd/OSDMap: add 'chassis' to default type hierarchy
 5e1fc14 README: update a list of build deps for rpm-based systems
 e80ab94 Partial revert "mon: osd pool set syntax relaxed, modify unit tests"
-fd175ab (origin/wip-6699) sysvinit, upstart: use df -P when weighting new OSDs
+fd175ab sysvinit, upstart: use df -P when weighting new OSDs
 4d140a7 os/ObjectStore: add {read,write}_meta
 6bb42a0 osd: move peek_journal_fsid() into ObjectStore (from OSD)
 bf5a7b2 os/ObjectStore: generalize the FileStore's target_version
@@ -8112,15 +11509,15 @@ ea9dabf mon/OSDMonitor: include osd metadata in 'ceph report'
 c4f817f mon/OSDMonitor: move osd metadata dump into a helper
 838b6c8 PG: don't query unfound on empty pgs
 964c8e9 PG: retry GetLog() each time we get a notify in Incomplete
-14cf4ca (origin/wip-6892) rgw: don't error out on empty owner when setting acls
+14cf4ca rgw: don't error out on empty owner when setting acls
 7ec21ed Mark libcls_kvs as a module
 e5357c0 test: remove zero-copy read_fd test temporarily
 bafb5c3 doc: clarify crush rule create-simple and fix typos
 371dc71 doc: fix formatting typo in mon configuration
 00ee9a5 doc: improve ceph-mon usage, fix ceph tell examples
 4fa8f68 README: add yum command line
-b0dce8a (origin/wip-mds-assert) mds: Add assertion to catch object mutation error
-09a4c1b (origin/wip-5871) mds: remove superfluous warning of releasing lease
+b0dce8a mds: Add assertion to catch object mutation error
+09a4c1b mds: remove superfluous warning of releasing lease
 c409e36 mon: osd dump should dump pool snaps as array, not object
 03d63c4 buffer: turn off zero-copy reads for now
 784d188 mds: Release resource before return
@@ -8130,7 +11527,7 @@ c409e36 mon: osd dump should dump pool snaps as array, not object
 ab05580 Add missing stuff to clean target
 ae46c38 Correctly mark library modules
 94ca1cc ceph-object-corpus: revert accidental revert
-75d4a72 (origin/wip-zero-copy-bufferlist-last) buffer: enable tracking of calls to c_str()
+75d4a72 buffer: enable tracking of calls to c_str()
 445fb18 buffer: try to do zero copy in read_fd
 be29b34 buffer: attempt to size raw_pipe buffers
 3f6fa05 buffer: add methods to read and write using zero copy
@@ -8145,14 +11542,14 @@ fc5789d doc: Added commentary to configure pg defaults. Clarified size commentar
 dceaef4 doc: PG splitting added to docs.
 d39676b doc: Took out "future" reference to namespaces.
 9a55d89 doc: Clarification of terms.
-b35fc1b (origin/wip-6804) rgw: lower some debug message
-561e7b0 (origin/wip-6829) rgw: initialize RGWUserAdminOpState::system_specified
-b2ee935 (origin/wip-6796) PendingReleaseNotes: mention 6796 and 'ceph osd pool set' behavior change
+b35fc1b rgw: lower some debug message
+561e7b0 rgw: initialize RGWUserAdminOpState::system_specified
+b2ee935 PendingReleaseNotes: mention 6796 and 'ceph osd pool set' behavior change
 7c6d43c doc: rados: operations: pools: document 'osd pool set foo hashpspool'
 49d2fb7 mon: OSDMonitor: don't crash if formatter is invalid during osd crush dump
 337195f mon: OSDMonitor: receive CephInt on 'osd pool set' instead on CephString
 7191bb2 mon: OSDMonitor: drop cmdval_get() for unused variable
-50868a5 (origin/wip-6705) qa: workunits: mon: ping.py: test 'ceph ping'
+50868a5 qa: workunits: mon: ping.py: test 'ceph ping'
 6b5aaf3 doc: Minor updates to manual deployment document.
 3502d4f init: fix typo s/{$update_crush/${update_crush/
 29178d8 doc: Cleanup of Add/Remove OSDs.
@@ -8161,7 +11558,7 @@ de2bcd5 doc: Added a manual deployment doc.
 10b4bf6 doc: Added manual deployment section to index.
 f753d56 test: use older names for module setup/teardown
 72bba1f doc: Added fixes to osd reporting section.
-86e4fd4 (origin/wip-flush-5855-review) osd: Backfill peers should not be included in the acting set
+86e4fd4 osd: Backfill peers should not be included in the acting set
 19dbf7b osd: Simple dout() fix
 82e1e7e PG: remove unused Peering::flushed
 9ff0150 PG: don't requeue waiting_for_active unless flushed and active
@@ -8172,7 +11569,7 @@ da77553 ReplicatedPG,PG: move duplicate FlushedEvt logic info on_flushed()
 68e0e06 doc: warn about #6796 in release notes
 574cb61 man: re-generate changed man pages
 0de0efa RBD Documentation and Example fixes for --image-format
-40a76ef (origin/wip-osd-bench-size) osd: fix bench block size
+40a76ef osd: fix bench block size
 703f9a0 Revert "JounralingObjectStore: journal->committed_thru after replay"
 f0c8931 release-notes: clarify that the osd data directory needs to be mounted
 ba67b9f doc/release-notes.rst: v0.72.1 release notes
@@ -8182,10 +11579,10 @@ bf7c09a osd_types: fix object_info_t backwards compatibility
 1212a21 CephContext: unregister lockdep after stopping service thread
 dd9d8b0 ReplicatedPG: test for missing head before find_object_context
 d8d27f1 JounralingObjectStore: journal->committed_thru after replay
-a7063a1 (origin/wip-6768) Use clearer "local monitor storage" in log messages
-dcef9fb (origin/po-pr838) automake: replaced hardcoded '-lboost_program_options' with a macro
+a7063a1 Use clearer "local monitor storage" in log messages
+dcef9fb automake: replaced hardcoded '-lboost_program_options' with a macro
 125582e autoconf: add check for the boost_program_options library
-cfb82a1 (origin/port/fallocate) filejournal: add journal pre-allocate for osx
+cfb82a1 filejournal: add journal pre-allocate for osx
 d39ff4c mon/OSDMonitor: 'osd metadata N' command
 ea16435 mon/OSDMonitor: record osd metadata key/value info
 6d40e94 osd: send host/kernel metadata to mon on boot
@@ -8194,10 +11591,10 @@ aef3402 doc/release-notes: fix dup
 8b5719f doc: Added Emperor upgrade.
 7f45e72 doc: Added dumpling to the sequence.
 efe55b1 doc: Remove redundant command for quick start preflight
-fbdfe61 (origin/port/unused-headers) trace: remove unused header
+fbdfe61 trace: remove unused header
 762acec mon: Monitor: make 'quorum enter/exit' available through the admin socket
-01f7b46 (origin/port/stat) client: use platform-specific stat time members
-2f76ac3 (origin/wip-da-SCA-master) mon/MDSMonitor.cc: remove some unused variables
+01f7b46 client: use platform-specific stat time members
+2f76ac3 mon/MDSMonitor.cc: remove some unused variables
 91627f2 test_seek_read.c: remove unused variable 'off64_t so'
 0da5a01 rgw: remove unused variables
 6566dfb osd/ReplicatedPG.cc: remove unused variable
@@ -8212,18 +11609,18 @@ d0cf2bf ErasureCodeExample.h: prefer prefix ++operator for non-primitive types
 a8e10d3 os/ObjectStore.cc: prefer prefix ++operator for non-primitive types
 555e717 mon/OSDMonitor.cc: prefer prefix ++operator for non-primitive types
 0803d60 common/buffer.cc: prefer prefix ++operator for non-primitive types
-5832e26 (tag: v0.72) v0.72
+5832e26 v0.72
 84fb1bf rgw: deny writes to a secondary zone by non-system users
 d8f0502 doc/release-notes: note crush update timeout on startup change
 1ee112f osdmaptool: fix cli tests
 082e7c9 Ceph: Fix memory leak in chain_flistxattr()
 c7a30b8 ReplicatedPG: don't skip missing if sentries is empty on pgls
 5fe3dc6 objecter: clean pause / unpause logic
-98ab7d6 (origin/wip-objecter-full) objecter: set op->paused in recalc_op_target(), resend in not paused
-afb3566 (origin/port/detect-clang) conf: use better clang detection
-ac04481 (origin/port/func-name) assert: choose function-var name on non-gnu
-1d030d1 (origin/port/gtest-death-tests) test: Only build death tests on platforms that support them
-c6826c1 (origin/wip-6719) PG: fix operator<<,log_wierdness log bound warning
+98ab7d6 objecter: set op->paused in recalc_op_target(), resend in not paused
+afb3566 conf: use better clang detection
+ac04481 assert: choose function-var name on non-gnu
+1d030d1 test: Only build death tests on platforms that support them
+c6826c1 PG: fix operator<<,log_wierdness log bound warning
 f4648bc PGLog::rewind_divergent_log: log may not contain newhead
 25b7349 osd/ErasureCodePlugin: close library before return on error
 c3d20f2 osd/erasurecode: free allocated memory before return NULL
@@ -8231,16 +11628,16 @@ bfd5b90 mon/MDSMonitor: remove unnecessary assignment
 89d5396 osd/erasurecode: correct one variable name in jerasure_matrix_to_bitmatrix()
 2aaed99 Revert "doc: radosgw workaround for OpenStack Horizon bug"
 c5c399d objecter: don't resend paused ops
-5a44e17 (origin/wip-6677) rgw: add compatibility for MultipartUpload
+5a44e17 rgw: add compatibility for MultipartUpload
 a3ccd29 RadosModel: use sharedptr_registry for snaps_in_use
 c22c84a osdmaptool: don't put progress on stdout
-81a3ea1 (origin/port/rgw) rgw: add compat file for name service macros
+81a3ea1 rgw: add compat file for name service macros
 5dc6419 rgw: avoid sighandler_t in favor of sig_t
-e9880cf (origin/port/missing-headers) crush: add mising header for count
+e9880cf crush: add mising header for count
 a10345a auth: add missing header for list
 01a5a83 mon: add missing header for std::find
 e71a2f0 auth: add missing header file for std::replace
-0209568 (origin/port/utime) utime: use to_timespec for conversion
+0209568 utime: use to_timespec for conversion
 4c3b6d6 rgw: allow multiple frontends of the same framework
 e25d32c rgw: clean up shutdown signaling
 6d737ce rgw: clean up front end configuration
@@ -8248,8 +11645,8 @@ abc2177 rgw: more flexible frotnend handler config
 bbcddef Add a verbose argument and some verbosity
 c4b0431 Verify that radosgw started, return appropriate exit code
 0c61091 We should exit 1 if radosgw is not executable
-1cc8558 (origin/port/detect-libresolv) autoconf: check for res_nquery explicitly
-ea725d3 (origin/port/bootstrap) autogen.sh: use glibtoolize when available
+1cc8558 autoconf: check for res_nquery explicitly
+ea725d3 autogen.sh: use glibtoolize when available
 00734ec autogen: set exit on error
 154ee0b FileStore::_collection_move_rename: handle missing dst dir on replay
 1fe762c rgw: refactor request handling processing
@@ -8262,44 +11659,44 @@ fe6cd9b rgw: rebase mongoose prototype
 ed3caf7 mongoose: submodule, v4.1
 2fcaa1f pdatedoc: Added index link to new install doc. Requires merge of wip-doc-install to work.
 917dd71 doc: Deleted old manual install doc.
-195e861 (origin/wip-doc-install) doc: Mentioned that install procedure is for manual deployments, not ceph-deploy or others.
+195e861 doc: Mentioned that install procedure is for manual deployments, not ceph-deploy or others.
 2aa6cde doc: Changed text for ceph-extras. Made it required. Mentioned newer versions.
-d03924c (origin/wip-da-fix-galois-warning) galois.c: fix compiler warning
-6821a6e (origin/wip-static-cast) assert: use feature test for static_cast
+d03924c galois.c: fix compiler warning
+6821a6e assert: use feature test for static_cast
 330a6a7 wbthrottle: use posix_fadvise if available
 2bf8ff4 doc: Added DNS and SSL dialog.
 cd0d612 OSD: allow project_pg_history to handle a missing map
 9ab5133 OSD: don't clear peering_wait_for_split in advance_map()
 545135f ReplicatedPG::recover_backfill: adjust last_backfill to HEAD if snapdir
-324dd54 (origin/wip-blkdev) test: test helper for get_block_device_size
+324dd54 test: test helper for get_block_device_size
 268785f blkdev: support blkdev size query on osx
 75b4b47 doc: Implemented changes suggested from feedback.
-41e052d (origin/revert-struct-init) Revert "fix -Wgnu-designator warnings"
+41e052d Revert "fix -Wgnu-designator warnings"
 96f4607 test/libcephfs: free cmount after tests finishes
 c0bcdc3 osd/erasurecode: correct one variable name in jerasure_matrix_to_bitmatrix()
-09e1597 (origin/wip-pgmap) mon/PGMap: use const ref, not pass-by-value
+09e1597 mon/PGMap: use const ref, not pass-by-value
 fb0f198 rbd: omit 'rw' option during map
 2db20d9 qa: don't run racy xfstest 008
-1bb5aad (origin/wip-6698) upstart: fix ceph-crush-location default
+1bb5aad upstart: fix ceph-crush-location default
 c3c962e doc: radosgw workaround for OpenStack Horizon bug
 cbc15bf doc: fix typo in openstack radosgw integration
 4032501 mon/OSDMonitor: refix warning
-9834ab9 (origin/wip-6673b) OSDMonitor: be a little nicer about letting users do pg splitting
-59a55fd (origin/wip-pipe) pipe: use pipe2 feature test; check fcntl retval
+9834ab9 OSDMonitor: be a little nicer about letting users do pg splitting
+59a55fd pipe: use pipe2 feature test; check fcntl retval
 5d0d0a9 fix -Wmismatched-tags warnings
 6efc2b5 fix -Wgnu-designator warnings
 382149c OSD: remove unused private var [-Wunused-private-field]
 cf29574 CrushTester: remove unused private var [-Wunused-private-field]
 cc08a4a auth: remove unused private var [-Wunused-private-field]
 306ec71 rgw: don't turn 404 into 400 for the replicalog api
-cd30e5f (origin/master-new) common/buffer.cc: fix rebuild_page_aligned typo
+cd30e5f common/buffer.cc: fix rebuild_page_aligned typo
 ffdd30e test: add gcc compile test for installed headers
 12238d0 libcephfs: Fix compilation for C compiler
 b3b1650 ceph: Fix compilation with GCC compiler
 524aee6 Really use the hostname, otherwise ambiguous
 f2622a4 ceph-object-corpus: revert accidental revert
 df3af6c docs: Fix a typo in RGW documentation
-588ed60 (origin/next-new) Wrap hex_to_num table into class HexTable
+588ed60 Wrap hex_to_num table into class HexTable
 148023d [rgw] Set initialized to true after populating table in hex_to_num()
 28e4271 sharedptr_registry.hpp: removed ptrs need to not blast contents
 4402898 prio-q: initialize cur iterator
@@ -8334,44 +11731,44 @@ fba056a doc: Removed install libvirt. Consolidated to install-vm-cloud.
 6b9104d doc/release-notes: formatting
 04710b5 doc/release-notes: fix formatting
 b605c73 doc/release-notes: fix indentation
-1de46d6 (origin/next-fix) os/chain_listxattr: fix leak fix
+1de46d6 os/chain_listxattr: fix leak fix
 6efd82c ceph: Release resource before return in BackedObject::download()
 e22347d ceph: Fix memory leak in chain_listxattr
 905243b Fix memory leak in Backtrace::print()
-be12f7f (origin/wip-6683) mon: OSDMonitor: proper error msg on invalid epoch on 'osd getmap/dump'
+be12f7f mon: OSDMonitor: proper error msg on invalid epoch on 'osd getmap/dump'
 e5efd88 mon: MonmapMonitor: support 'mon getmap [epoch]'
-e11c975 (tag: v0.72-rc1) v0.72-rc1
-a3119bc (origin/wip-crush-hook) upstart, sysvinit: use ceph-crush-location hook
+e11c975 v0.72-rc1
+a3119bc upstart, sysvinit: use ceph-crush-location hook
 9f6af8b ceph-crush-location: new crush location hook
 1e2e429 Revert "ceph-crush-location: new crush location hook"
 22ff717 Revert "upstart, sysvinit: use ceph-crush-location hook"
-0c18609 (origin/wip-6605) mon: OSDMonitor: fix comparison between signed and unsigned integer warning
+0c18609 mon: OSDMonitor: fix comparison between signed and unsigned integer warning
 e02740a mon: OSDMonitor: only allow an osd to boot iff it has the fsid on record
 42c4137 mon: OSDMonitor: fix some annoying whitespace
 60264f9 doc: Fixed formatting. Fixed hyperlink.
 46d897a doc: fix formatting.
 111a37e upstart, sysvinit: use ceph-crush-location hook
 fc49065 ceph-crush-location: new crush location hook
-df229e5 (origin/wip-6673) mon/PGMonitor: always send pg creations after mapping
+df229e5 mon/PGMonitor: always send pg creations after mapping
 2181b4c mon/OSDMonitor: fix signedness warning on poolid
 7a06a71 ReplicatedPG::recover_backfill: update last_backfill to max() when backfill is complete
-e46d2ca (origin/wip-pr781)   fix the bug  ctypes.util.find_library to search for librados failed on Centos6.4.   Signed-off-by: huangjun  <hjwsm1989 at gmail.com>
+e46d2ca   fix the bug  ctypes.util.find_library to search for librados failed on Centos6.4.   Signed-off-by: huangjun  <hjwsm1989 at gmail.com>
 f8fa309 ReplicatedPG: src_obcs can now be empty
-3b99cd0 (origin/wip-readdirend) mds: fix readdir end check
+3b99cd0 mds: fix readdir end check
 6eded8a doc: Fixes to normalize header hierarchy. Tweaked IA slightly.
 bd507ef doc: Updated with a verified installation procedure and latest usage.
-4e48dd5 (origin/wip-6585) osd/ReplicatedPG: use MIN for backfill_pos
+4e48dd5 osd/ReplicatedPG: use MIN for backfill_pos
 4139e75 ReplicatedPG: recover_backfill: don't prematurely adjust last_backfill
 ecddd12 ReplicatedPG: add empty stat when we remove an object in recover_backfill
 9ec35d5 ReplicatedPG: replace backfill_pos with last_backfill_started
 8774f03 PG::BackfillInfo: introduce trim_to
 46dfd91 PG::BackfillInterval: use trim() in pop_front()
 0a9a2d7 ReplicatedPG::prepare_transaction: info.last_backfill is inclusive
-5939eac (origin/wip-5612) upstart: fail osd start if crush update fails
+5939eac upstart: fail osd start if crush update fails
 177e2ab init-ceph: make crush update on osd start time out
-b28b64a (origin/wip-scripts) pybind: use find_library for libcephfs and librbd
-d5d36d0 (origin/wip-6621) radosgw-admin: accept negative values for quota params
-9d136a4 (origin/wip-obc) ReplicatedPG: no need to clear repop->*obc
+b28b64a pybind: use find_library for libcephfs and librbd
+d5d36d0 radosgw-admin: accept negative values for quota params
+9d136a4 ReplicatedPG: no need to clear repop->*obc
 f58396a doc/release-notes: emperor blurb
 8db03ed ReplicatedBackend: don't hold ObjectContexts in pull completion callback
 5a416da ReplicatedPG: put repops even in TrimObjects
@@ -8385,7 +11782,7 @@ ce33892 PG: call on_flushed on FlushEvt
 9b003b3 OpRequest: move method implementations into cc
 c4442d7 ReplicatedPG: reset new_obs and new_snapset in execute_ctx
 8a62bf1    fix the bug if we set pgp_num=-1 using "ceph osd pool set data|metadata|rbd -1"    will set the pgp_num to a hunge number.
-c7d975a (origin/wip-ceph-context) ceph_context: use condition variable for wake-up
+c7d975a ceph_context: use condition variable for wake-up
 e5efc29 test: Use a portable syntax for seq(1)
 fbabd42 test: Change interpreter from /bin/bash to /bin/sh
 0a1579d test: Use portable arguments to /usr/bin/env
@@ -8400,7 +11797,7 @@ f0f6750 common: add an hobject_t::is_min() function
 fe30ac6 rgw: Use JSONFormatter to use keystone API
 5733f9c rgw: Use keystone password to validate token too
 bd04a77 rgw: Adds passwd alternative to keystone admin token
-8282e24 (origin/wip-6635) mon/OSDMonitor: make racing dup pool rename behave
+8282e24 mon/OSDMonitor: make racing dup pool rename behave
 66a9fbe common: rebuild_page_aligned sometimes rebuilds unaligned
 c14c98d mon: OSDMonitor: Make 'osd pool rename' idempotent
 284b73b packages: ceph.spec.in is missing make as a build dependency
@@ -8420,13 +11817,13 @@ e17ff19 osd/osd_types: init SnapSet::seq in ctor
 d2b661d os/FileStore: fix getattr return value when using omap
 3a469bb os/ObjectStore: fix RMATTRS encoding
 847ea60 PGLog::read_log: don't add items past backfill line to missing
-3c0042c (origin/wip-rbd-parent-info) rbd.py: increase parent name size limit
+3c0042c rbd.py: increase parent name size limit
 87d3f88 PGMap::dirty_all should be asserting about osd_epochs, not in.osd_epochs
 0388b71 Update init-rbdmap
 0d326c3 ceph: tolerate commands without any child args
 cfe8451 rgw: eliminate one unnecessary case statement
 80384a1 Update init-rbdmap
-f9a6d71 (origin/wip-rgw-sync-next) radosgw-admin: remove unused function escape_str()
+f9a6d71 radosgw-admin: remove unused function escape_str()
 ec45b3b rgw: escape bucket and object names in StreamReadRequests
 dd308cd rgw: move url escaping to a common place
 e0e8fb1 rgw: update metadata log list to match data log list
@@ -8434,10 +11831,10 @@ c275912 rgw: include marker and truncated flag in data log list api
 e74776f cls_log: always return final marker from log_list
 ea816c1 rgw: skip read_policy checks for system_users
 1d7c204 Add a configurable to allow bucket perms to be checked before key perms through rgw_defer_to_bucket_acls config option.  This configurable defaults to an empty string.  Option values include:
-0e8182e (origin/wip-6620) mds: MDSMap: adjust buffer size for uint64 values with more than 5 chars
+0e8182e mds: MDSMap: adjust buffer size for uint64 values with more than 5 chars
 af1dee5 doc: clarify that mons must have qurorum during deploy
 4c8be79    rename test_arch.c --> test_arch.cc to avoid undefined reference to `__gxx_personality_v0' error.    Signed-off-by: huangjun  <hjwsm1989 at gmail.com>
-7ba4bc4 (origin/wip-monc-ping) cli: ceph: add support to ping monitors
+7ba4bc4 cli: ceph: add support to ping monitors
 400cb18 pybind: rados: ping a monitor via librados
 1a2e0eb pybind: rados: support ETIMEDOUT on make_ex()
 2d7ccab librados: support pinging a monitor without auth via RadosClient
@@ -8464,7 +11861,7 @@ df9315c doc: Removed RGW from quick start, since it is not ceph-deploy enabled.
 8a36503 doc: Moved CPU Profiler docs to dev.
 06ec0f6 doc: Removed Calxeda reference since it is now the same as normal install.
 b422d4a doc: Removed old files. Consolidated contents into new IA.
-1821ad7 (origin/wip-6242-b) pybind/rados: create InterruptedOrTimeoutError exception
+1821ad7 pybind/rados: create InterruptedOrTimeoutError exception
 1230886 ceph: move timeout
 8baeac0 ceph: catch exceptions thrown during the rados handle init
 d60e532 ceph: show basic help before initializing cluster connection
@@ -8473,8 +11870,8 @@ b33c315 ceph: default 5 second timeout for -h
 e922475 ceph: print basic options before connecting
 445e8c9 ceph: fixup do_help() function connection check
 32a23c5 ceph.in: add emacs modeline
-771b0c5 (origin/wip-6606) rgw: don't bother to call c_str() on strings passed to dump_string()
-dff41cd (origin/wip-test-librbd) ceph_test_librbd: fix heap overrun
+771b0c5 rgw: don't bother to call c_str() on strings passed to dump_string()
+dff41cd ceph_test_librbd: fix heap overrun
 eb5dd55   test_ceph_argparse.py: No assert_not_in and assert_in in nose.tools   Signed-off-by: huangjun  <hjwsm1989 at gmail.com>
 ccaab2a rgw: init src_bucket_name, src_object in the S3 handler
 db7eb77 rgw: get rid of req_state.bucket_name
@@ -8482,14 +11879,14 @@ cbf8f9a rgw: turn swift COPY into PUT
 1f6b8b2 librbd: parse args to ceph_test_librbd
 ad4553a librbd: fix build error
 bd2eeb7 ceph-mon: add debug to ip selection
-a107030 (origin/wip-5668-b) librbd: wire up flush counter
-715d2ab (origin/wip-6603) common/BackTrace: fix memory leak
+a107030 librbd: wire up flush counter
+715d2ab common/BackTrace: fix memory leak
 687ecd8 common/cmdparse: fix memory leak
-9fa357d (origin/wip-backtrace) mds: update backtrace when old format inode is touched
-34d0941 (origin/wip-6599) client: fix invalid iterator dereference in Client::trim_caps()
+9fa357d mds: update backtrace when old format inode is touched
+34d0941 client: fix invalid iterator dereference in Client::trim_caps()
 4f299ca autoconf: fix typo on AM_COMMON_CFLAGS
 94080de common: get_command_descriptions use cout instead of dout
-8586c75 (origin/wip-6582) ReplicatedPG: copy: conditionally requeue copy ops when cancelled
+8586c75 ReplicatedPG: copy: conditionally requeue copy ops when cancelled
 6dff926 PG: add a requeue_op() function to complement requeue_ops().
 45d1846 doc: Removed references to Chef.
 89995ef doc/release-notes: missed mds snaps
@@ -8497,21 +11894,21 @@ a107030 (origin/wip-5668-b) librbd: wire up flush counter
 5d2cf46 Update ceph-authtool.rst
 7bcfe09 ceph.spec.in:  Add erasure-code related files. Signed-off-by: huangjun  <hjwsm1989 at gmail.com>
 533626c Add Redhat init script option
-3cfe9f6 (origin/wip-cache-crc) common/buffer: invalidate crc on zero, copy_in
+3cfe9f6 common/buffer: invalidate crc on zero, copy_in
 8ec3aed common/buffer: fix crc_map types
 394ec17 common/buffer: drop unused fields
-2edc04c (origin/wip-6475-gl) ceph.spec.in:  Need to cread radosgw log directory.
+2edc04c ceph.spec.in:  Need to cread radosgw log directory.
 1f291f5 qa/workunits/rest/test.py: fix mds {add,remove}_data_pool test
 11fc80d doc/release-notes: link ot the changelog
 eb0a3b7 doc/release-notes: v0.61.9
 d3f0c0b Makefile: fix /sbin vs /usr/sbin behavior
 15ec533 OSD: check for splitting when processing recover/backfill reservations
 08177f2 ceph: Remove unavailable option with clang
-e509cb1 (tag: v0.71) v0.71
-10b466e (origin/wip-6475) radosgw: create /var/log/radosgw in package, not init script
+e509cb1 v0.71
+10b466e radosgw: create /var/log/radosgw in package, not init script
 5c280a2 .gitignore: ceph-kvstore-tool
 14e91bf debian, specfile: fix ceph-kvstore-tool packaging
-fd6e2b8 (origin/wip-kvstore-tool) ceph-kvstore-tool: copy one leveldb store to some other place
+fd6e2b8 ceph-kvstore-tool: copy one leveldb store to some other place
 85914b2 ceph-kvstore-tool: calc store crc
 da69fa0 tools: move 'test_store_tool' to 'tools/ceph-kvstore-tool'
 eafdc92 common/buffer: behave when cached crc stats don't start at 0
@@ -8527,21 +11924,21 @@ a3e9344 common/buffer: explicitly init zbuf to zeros
 6464516 common/buffer: instrument utilization of cached crcs
 0c23a56 common/buffer: cache crcs in buffer::raw
 8757775 include: add Spinlock
-72ce2ef (origin/wip-4047) cls_rbd: do not make noise in osd log on rbd removal
+72ce2ef cls_rbd: do not make noise in osd log on rbd removal
 e550e3d test_ceph_argparse: fix typo
 dbd6d97 test_ceph_argparse: fix mds {add,remove}_data_pool tests
 5838c09 common: fix non-daemon init
 1d4f501 test/filestore/run_seed_to.sh: avoid obsolete --filestore-xattr-use-omap
 982511e MonCommands: note that pg dump options don't work in plaintext
-c7acc2a (origin/wip-5716) rgw: gracefully handle bad root pool names
+c7acc2a rgw: gracefully handle bad root pool names
 488678f ceph_test_rados: fix snap remove vs rollback fix
 2701231 os/LevelDBStore: handle deletion race when checking store size
 92ea0d1 test/librados/cmd: fix compile error
-c69e76c (origin/wip-6059) ReplicatedPG: remove the other backfill related flushes
+c69e76c ReplicatedPG: remove the other backfill related flushes
 3469dd8 RadosModel: send racing read on write
 0246d47 ReplicatedPG: block reads on an object until the write is committed
 c658258 OSD: ping tphandle during pg removal
-4f403c2 (origin/wip-6334) common: don't do special things for unprivileged daemons
+4f403c2 common: don't do special things for unprivileged daemons
 5aa237e mon, osd: send leveldb log to /dev/null by default
 ab8f9b1 doc: Update from user feedback. Needed to enable S3/Keystone.
 4bb2a4b doc: Updated to ensure that socket name isn't static.
@@ -8550,21 +11947,21 @@ bd7a7dd os/FileStore: fix fiemap double-free(s)
 8b43d72 vstart.sh: create dev/ automatically
 8d7dbf8 rgw: change default log level
 70cc681 mon/PGMonitor: set floor below which we do not warn about objects/pg
-bebbd6c (origin/wip-6553) rgw: fix authenticated users acl group check
+bebbd6c rgw: fix authenticated users acl group check
 08327fe mon: osd pool set syntax relaxed, modify unit tests
 02b5eb2 ceph.spec.in: do not list ceph-rest-api twice; add missing cls_hello files
 dd33c98 osd/osd_types: generalize pg_pool_t::get_flags_string()
-5abe5c2 (origin/wip-6147) mon: OSDMonitor: add 'osd pool stats' command
+5abe5c2 mon: OSDMonitor: add 'osd pool stats' command
 2cd5320 mon: PGMap: rework client IO rate calc and output
 e3ba8e8 mon: PGMap: reuse existing summary functions to output pool stats
 82e3317 mon: PGMap: keep track of per-pool stats deltas
-e2602c5 (origin/wip-pool) mon: make 'mon {add,remove}_data_pool ...' take pool name or id
-d6146b0 (origin/wip-formatter-newlines) common/Formatter: add newline to flushed output if m_pretty
+e2602c5 mon: make 'mon {add,remove}_data_pool ...' take pool name or id
+d6146b0 common/Formatter: add newline to flushed output if m_pretty
 f2645e1 rgw: swift update obj metadata also add generic attrs
 6641273 SignalHandler: fix infinite loop on BSD systems
 2cc5805 doc: Removed underscore for consistency.
-4b8eb4f (origin/wip-rgw-quota) radosgw-admin: add --quota-scope param to usage
-f568501 (origin/wip-5025) mds: flock: fix F_GETLK
+4b8eb4f radosgw-admin: add --quota-scope param to usage
+f568501 mds: flock: fix F_GETLK
 3c6710b qa/workunits/misc/dirfrag: make it work on ubuntu
 b0f49e0 ReplicatedPG.h: while there cannot be a read in progress, there may be a read blocked
 bf82ba9 doc: disable cephx requires auth_supported = none
@@ -8585,14 +11982,14 @@ cdd851b doc: Moved installation portion to the installation section.
 a182535 librados: add some clarifying comments
 7ef5eb0 librados: drop reference to completion in container destructor
 f13cc68 doc: Fixed hyperlinks. Cleanup of old references to Chef.
-70250e8 (origin/wip-mon-pool-set) osd: osd_types: Output pool's flag names during dump
-7113186 (origin/wip-objecter-errors) osdc/Objecter: clean up completion handlers that set *prval=0
+70250e8 osd: osd_types: Output pool's flag names during dump
+7113186 osdc/Objecter: clean up completion handlers that set *prval=0
 82e9330 osdc/Objecter: only make handlers set *prval if EIO
 1c28869 mon: OSDMonitor: allow (un)setting 'hashpspool' flag via 'osd pool set'
 2fe0d0d mon: OSDMonitor: split 'osd pool set' out of 'prepare_command'
 6bbb772 test/filestore/run_seed_to_range.sh: fix -d syntax
-4e2ff53 (origin/wip-truncate2) mds: avoid leaking objects when purging file.
-eb381ff (origin/wip-filerecover) mds: don't decrease file size when recovering file
+4e2ff53 mds: avoid leaking objects when purging file.
+eb381ff mds: don't decrease file size when recovering file
 1803f3b radosgw-admin: limit user bucket-level quota
 18a271d mds: optimize map element dereference
 d8faa82 ReplicatedPG: remove unused RWTracker::ObjState::clear
@@ -8604,8 +12001,8 @@ fc35807 rgw: protect against concurrent async quota updates
 2e4ecc2 rgw: async quota update
 5bc6327 doc: Merge cleanup.
 c0c332c doc: minor clean up.
-007f06e (origin/wip-4405) mds: fix infinite loop of MDCache::populate_mydir().
-1f50750 (origin/wip-5992-3) ReplicatedPG: remove the other backfill related flushes
+007f06e mds: fix infinite loop of MDCache::populate_mydir().
+1f50750 ReplicatedPG: remove the other backfill related flushes
 db6623f RadosModel: send racing read on write
 2b216c3 ReplicatedPG: block reads on an object until the write is committed
 e8a2992 rgw: rearrange includes
@@ -8614,7 +12011,7 @@ e8a2992 rgw: rearrange includes
 768fb0a doc: fix openstack rbd installation command
 e21e573 os: stronger assert on FileStore::lfn_open
 0f323bc common: unintended use of the wrong bloom_filter prototype
-4b911cf (origin/wip-rval) ReplicatedPG: copy: use aggregate return code instead of individual Op return
+4b911cf ReplicatedPG: copy: use aggregate return code instead of individual Op return
 6da4b91 os/FileStore: fix ENOENT error code for getattrs()
 71ee6d7 mon: allow MMonGetMap without authentication
 f279641 mon: do not put() unhandle message
@@ -8647,9 +12044,9 @@ f989396 mds: properly store fragmenting dirfrags
 4014ba2 mds: delete orphan dirfrags during MDS recovers
 a881c1d mds: journal original dirfrags for rollback
 e927941 doc/release-notes: v0.67.4
-3fc6cfb (origin/wip-optracker) Makefile: add include/histogram.h to noinst_HEADERS
-bb9b9c8 (origin/wip-6143) common, os: Perform xattr handling based on detected fs type
-b87bc23 (origin/wip-5992-2) ReplicatedPG: lock snapdir obc during write
+3fc6cfb Makefile: add include/histogram.h to noinst_HEADERS
+bb9b9c8 common, os: Perform xattr handling based on detected fs type
+b87bc23 ReplicatedPG: lock snapdir obc during write
 0c2769d PGLog: on split, leave log head alone
 391a885 FileStore: make _setattrs not return -ENOENT most of the time
 0c1e251 ReplicatedPG: add debugging in recover_replicas for objects added for backfill
@@ -8662,8 +12059,8 @@ ff17e45 PG,ReplicatedPG: expose PGBackend to PG
 e73ec48 common/hobject: add is_degenerate method
 c8a4411 PGMap: calc_min_last_epoch_clean() will now also use osd_epochs
 091809b PGMap,PGMonitor: maintain mapping of osd to recent stat epoch
-e3bb065 (tag: v0.70, tag: mark-v0.70-wip) v0.70
-806725a (origin/wip-start-copy) ReplicatedPG: copy: add op progression output
+e3bb065 v0.70
+806725a ReplicatedPG: copy: add op progression output
 639ff9f ReplicatedPG: copy: don't leak a ctx on failed copy ops
 469d471 ReplicatedPG: assert that we have succeeded in do_osd_ops on copyfrom repeats
 f3733a2 ReplicatedPG: copy: switch CopyCallback to use a GenContext
@@ -8688,7 +12085,7 @@ f1e2393 mon: Monitor: reuse 'src_is_mon' bool on dispatch
 b8a1488 mon: Monitor: dissociate msg handling from session & connection logic
 d0d61b4 mon: Monitor: drop client msg if no session exists and msg is not MAuth
 ed1a54e mon: Monitor: assert on absense of connection during dispatch
-dce3d26 (origin/wip-5896) mon: MonmapMonitor: make 'ceph mon add' idempotent
+dce3d26 mon: MonmapMonitor: make 'ceph mon add' idempotent
 8cfeb83 common/bloom_filter: note that uint32_t interface requires well-mixed values
 9299f50 common/bloom_filter: speed up unit tests a bit
 4b23b65 common/bloom_filter: test binning fpp behavior
@@ -8706,7 +12103,7 @@ a8761a5 TrackedOp: just make CephContext member public
 ebae077 rgw: bucket stats also dump quota info
 7973d44 OpTracker: give TrackedOp a default dump() function
 baf1d40 rgw: init quota
-721f170 (origin/wip-fuse) client: remove requests from closed MetaSession
+721f170 client: remove requests from closed MetaSession
 63f5814 ceph: Update FUSE_USE_VERSION from 26 to 30.
 f8a947d client: trim deleted inode
 563517d rgw: update quota stats when needed
@@ -8783,7 +12180,7 @@ b032931 PendingReleaseNotes: update regarding librados change
 b245ca1 os/FileStore: add sloppy crc tracking
 8912462 rgw: drop async pool create completion reference
 4605792 librados: pool async create / delete does not delete completion handle
-7d1dc55 (origin/wip-doc-quickstart) doc: Diagram update and clarification on ceph-deploy admin results.
+7d1dc55 doc: Diagram update and clarification on ceph-deploy admin results.
 b23718a doc: Diagram update from feedback.
 245296a doc: Diagram updates from feedback.
 71ba833 doc: Updated block device quick start to use ceph-deploy. OS neutral.
@@ -8934,7 +12331,7 @@ ed73e0a doc: Adding context to the federated configuration guide.
 157754b common/config: include --cluster in default usage message
 362dba1 os, ceph_osd: Rename on_disk_version to target_version
 7b7e004 os: Prior version bump should have updated this
-27fb44b (origin/automake-fix-common) make: build common/secret only on linux
+27fb44b make: build common/secret only on linux
 8407669 os: Code conformance of os/LFNIndex.cc
 0d47bf8 os: Fix typo in comment
 c4bcb46 common: Fix get_namespace() definition in hobject_t
@@ -8948,7 +12345,7 @@ c4bcb46 common: Fix get_namespace() definition in hobject_t
 67386e4 mds: don't trim stray inode from the cache.
 0f3ba29 mds: remove unnecessary MDCache::maybe_eval_stray() calls
 cbf1f3c mds: evaluate stray when releasing inode/dentry's reference
-0a43974 (origin/automake-flags) automake: add per-target AM_CPPFLAGS
+0a43974 automake: add per-target AM_CPPFLAGS
 11d8d75 makefile-env: separate cppflags and cflags usage
 2b75abb ceph_test_rados: fix COPY_FROM completion
 935eb22 ceph_test_rados: fix seq_num, improve error output
@@ -8964,7 +12361,7 @@ c55d7ac doc: Updated graphic to use same name as command line examples.
 5eb4db1 doc: Removed Get Involved from Quick Start.
 af7ad1d doc: Changed title, and removed recommendations sections.
 dc19d24 doc: Moved recommendations sections to Intro.
-b1eeadd (origin/wip-6361) qa: workunits: cephtool: check if 'heap' commands are parseable
+b1eeadd qa: workunits: cephtool: check if 'heap' commands are parseable
 296f2d0 osd: OSD: add 'heap' command to known osd commands array
 238fe27 mds: MDS: pass only heap profiler commands instead of the whole cmd vector
 c98b910 perfglue/heap_profiler.cc: expect args as first element on cmd vector
@@ -8990,7 +12387,7 @@ a8bbb81 OpTracker: remove the references to "osd" in config variables
 5c46fc4 doc: Made some changes and incorporated a draft diagram.
 5bb7417 doc: Added draft of region/zone diagram.
 bcc1680 mon: fix inverted test in osd pool create
-f3718c2 (origin/prctl-getname-test) code_env: use feature test for PR_GET_NAME support
+f3718c2 code_env: use feature test for PR_GET_NAME support
 08fe028 rgw: use bufferlist::append() instead of bufferlist::push_back()
 fd6646f Makefile: fix unittest_arch
 5421d6d Makefile: fix unittest_crc32c
@@ -9005,7 +12402,7 @@ e303b96 mds: re-integrate stray when link count >= 1
 9601092 os/FileStore: fix uninitialized var
 b66ac77 osdc/ObjectCacher: finish contexts after dropping object reference
 ce723b5 doc/release-notes: v0.69
-6ca6f2f (tag: v0.69) v0.69
+6ca6f2f v0.69
 5541a1d doc: Updated link to Storage Cluster Quick Start.
 6af8e3c doc: Updated link to Storage Cluster Quick Start.
 b1d58fa doc: Updated link to Storage Cluster Quick Start.
@@ -9031,7 +12428,7 @@ df7c36a osd/ReplicatedPG: factor some bits into finish_copy
 59147be osd: compute full ratio from kb_avail
 89e8b8b ErasureCode: improve API implementation example
 3966e6e ErasureCode: proofread abstract API documentation
-971bf60 (origin/wip-buck-centos-core) Remove unneeded junit4 check
+971bf60 Remove unneeded junit4 check
 e38bd8d Removing extraneous code
 cd6f4bc Use a loop for testing jdk paths
 3cef755 fix some comments
@@ -9045,7 +12442,7 @@ abd2fcd ErasureCode: fix uninitialized variable warning
 4216eac rgw: try to create log pool if doesn't exist
 b86c068 hadoop: remove hadoop shim
 e7f7483 rgw: NULL terminate buffer before parsing it
-3f8c969 (origin/fix-no-tcmalloc-build) make: add tmalloc lib dep in tcmalloc guard
+3f8c969 make: add tmalloc lib dep in tcmalloc guard
 daf417f osd/ReplicatedPG.cc: Verify that recovery is truly complete
 139a714 osd/OSD.cc: Use MIN() so that we don't exceed osd_recovery_max_active
 4633729 mon/OSDMonitor: make busy creating pgs message more explicit
@@ -9098,7 +12495,7 @@ e435468 ErasureCodeJerasure: base class for jerasure ErasureCodeInterface
 661b377 os/FileStore: pass old + new object name to lfn_link
 deea63f osd: expose bytes used/avail via perf / asok
 3bc618b qa: workunits: mon: crush_ops: test 'ceph osd crush move'
-7d3799f (origin/wip-6230) mon: MonCommands: expect a CephString as 1st arg for 'osd crush move'
+7d3799f mon: MonCommands: expect a CephString as 1st arg for 'osd crush move'
 132e403 autoconf: use $(LIBOSD) $(LIBCOMMON) instead of libosd.a libcommon.la
 4c5b3c7 doc: Syntax fix to suppress gitbuilder errors.
 8bf858f doc: Removed mkcephfs reference.
@@ -9169,7 +12566,7 @@ a9a516a Correct syntax for generate swift key
 ab69d99 mon: fix typo and remove redundant sentence
 7c09ede mon: fix typo in comment
 3c9f849 doc: erasure code ghobject is made of gen_t + shard_t
-b4cf0f2 (tag: v0.68) v0.68
+b4cf0f2 v0.68
 dcbdeaf doc: Fix repo URL for Ceph cloning (dev/generatedocs)
 996af2d ceph_test_rados: test COPY_FROM
 ed68079 osd: initial COPY_FROM (not viable for large objects)
@@ -9275,7 +12672,7 @@ ea2fc85 SharedPtrRegistry: get_next must not delete while holding the lock
 af5281e common: move SharedPtrRegistry test after t.join
 c5b5ce1 osd: install admin socket commands after signals
 76a38c3 mon/DataHealthService: preserve compat of data stats dump
-f0805cb (origin/wip-6122) test/librados/cmd.cc: tolerate thrashing on pg_command tests
+f0805cb test/librados/cmd.cc: tolerate thrashing on pg_command tests
 d571825 WBThrottle: use fdatasync instead of fsync
 3528100 FileStore: add config option to disable the wbthrottle
 ed712c1 fix nss lib name
@@ -9458,7 +12855,7 @@ ef9c991 mon: make pg info in 'status' more informative and visually parseable
 f417b10 osdmap: move oneliner summary to separate function
 6f5d803 librados: fix MWatchNotify leak
 810c52d rgw: do not leak handler in get_handler() error path
-e3b7bc5 (tag: v0.67) v0.67
+e3b7bc5 v0.67
 977b7f5 doc/release-notes: fix rst
 f501ec7 doc/release-notes: upgrade sequence
 de7bbdb doc/release-notes: roll-up of upgrade/compat notes from cuttlefish to dumpling
@@ -9572,7 +12969,7 @@ e70e08c cephtool/test.sh: add tests for mon daemon command
 47d0d64 Make all AdminSocket commands use argparse/cmdmap.
 736d6a1 rgw: fix set_buckets_enabled(), set_bucket_owner()
 0e125e0 Objecter: set c->session to NULL if acting is empty
-16adb91 (tag: v0.67-rc3) v0.67-rc3
+16adb91 v0.67-rc3
 e747fa8 Revert "Use dh_installinit to install upstart job files"
 ebab04e mon: add missing state name
 b8af38b mon: allow others to sync from us across bootstrap calls
@@ -9684,7 +13081,7 @@ e4dfe8a test_rgw_admin_meta.cc: remove unused variable 'creds'
 4c778e2 test_rgw_admin_meta.cc: use static_cast<>() instead of C-Style cast
 a8b70f0 doc/release-notes: v0.67-rc2
 41930b5 ceph.spec.in, debian/control: python-ceph depends on python-flask
-0018b45 (tag: v0.67-rc2) v0.67-rc2
+0018b45 v0.67-rc2
 fe2019c rest/test.py: cope with older requests.py versions
 fd1fd66 ceph-disk: use new get_dev_path helper for list
 0b8cad1 ceph_rest_api.py: allow config section fallback
@@ -9744,7 +13141,7 @@ b26b7f6 mon/Paxos: only share uncommitted value if it is next
 99e6054 mon/Paxos: accepted_pn_from has no semantic meaning
 a61635e ceph-monstore-tool: dump paxos transactions
 e60d14d ceph.in: reject --admin-daemon so it can't do harm
-835dd97 (tag: v0.67-rc1) v0.67-rc1
+835dd97 v0.67-rc1
 58c78db FileJournal: fix posix_fallocate error handling
 0897d3a OSD::_make_pg: use createmap, not osdmap
 2dbb273 src/*: make Context::finish private and switch all users to use complete
@@ -10138,7 +13535,7 @@ a498432 ReplicatedPG: pass a PushOp into handle_pull_response
 82cb922 ReplicatedPG: split send_push into build_push_op and send_push_op
 31e19a6 ReplicatedPG: _committed_pushed_object don't pass op
 0f51b60 ReplicatedPG: submit_push_data must take recovery_info as non-const
-b6b48db (tag: v0.66) v0.66
+b6b48db v0.66
 a990664 mon: implement simple 'scrub' command
 afd6c7d mon: fix osdmap stash, trim to retain complete history of full maps
 dd1e6d4 Revert "Makefile: fix ceph_sbindir"
@@ -10279,7 +13676,7 @@ d31ed95 mon/PaxosService: allow paxos service writes while paxos is updating
 7a2566c rgw: remove test placement info
 224130c rgw (test): remove some warnings
 1b162ce rgw: initialize user system flag
-7681c58 (origin/wip-rgw-geo-bucketinstance) rgw: log in the same shard for bucket entry point and instance
+7681c58 rgw: log in the same shard for bucket entry point and instance
 d4e39a7 rgw: unlink/link don't always update entry point
 5680fa1 doc/release-notes: v0.65
 6673b2d rgw: tie metadata put to bucket link/unlink
@@ -10295,7 +13692,7 @@ c8f7936 mon/AuthMonitor: start at format 1 (latest) for new clusters
 0d73eb4 mon/PGMonitor: drop some dead code
 0fd776d mon/PGMap: make int type explicit
 29e14ba mon/PaxosService: s/get_version()/get_last_committed()/
-c2d517e (tag: v0.65) v0.65
+c2d517e v0.65
 3016f46 get_xattr() can return more than 4KB
 6e320a1 skip TEST(EXT4StoreTest, _detect_fs) if DISK or MOUNTPOINT are undefined
 63e81af rgw: multiple fixes related to metadata, bucket creation
@@ -10382,7 +13779,7 @@ ab79ba4 cls_replica_log: integrate with RGWRados
 e4ef5c6 cls_replica_log: add the actual class
 22a02e9 cls_replica_log: add ops for new class
 d1c9594 cls_replica_log: add types for new class
-0deb6d4 (origin/wip-rgw-geo-enovance) rgw: lock related modifications
+0deb6d4 rgw: lock related modifications
 3b4c11b rgw: add max-entries, marker for log operations
 714f212 osdc: re-calculate truncate_size for strip objects
 ebb46c4 qa/workunits/misc/multiple_rsync.sh: wtf
@@ -10471,7 +13868,7 @@ f5f8314 rgw: object mtime the same for both object and bucket index
 92997a4 mon: fix 'osd dump <epoch>'
 8c6b24e ceph-disk: add some notes on wth we are up to
 94b3700 rgw: intra-region copy, preserve mtime
-29eb333 (origin/wip-log-rewrite-sam) test/osd/TestPGLog: %s/dirty()/is_dirty()
+29eb333 test/osd/TestPGLog: %s/dirty()/is_dirty()
 f164a32 PGLog: check for dirty_to != eversion_t() and dirty_from != eversion_t::max()
 c6dd60d PGLog: rename dirty() to is_dirty()
 2ad319b PGLog: only iterate over dirty portions of the log
@@ -10647,7 +14044,7 @@ b70f565 ceph: flush stdout on watch print
 299f6a6 Usage log and ops log are disabled by defaults since 0.56
 de17238 mon: fix 'pg dump_stuck' stuckops type
 afa16b4 qa: multiple_rsync.sh: more output
-42e06c1 (tag: v0.64) v0.64
+42e06c1 v0.64
 68b5fa9 ceph-fuse: older libfuses don't support FUSE_IOCTL_COMPAT
 1577e20 ceph-create-keys: Make sure directories for admin and bootstrap keys exist
 95434d1 rgw: propagate mtime from remote rgw on copy
@@ -10921,7 +14318,7 @@ e634d9d Use new fuse package instead of fuse-utils
 4af917d os/LevelDBStore: do compact_prefix() work asynchronously
 dd35c26 osd: fix note_down_osd
 45b84f3 osd: fix hb con failure handler
-054e96c (tag: v0.63) v0.63
+054e96c v0.63
 64d1178 rgw: mdlog, bilog RESTful api cleanup
 fabe723 ceph: first cut at --completion
 2dcc266 ceph, MonCommands.h:  code format/help format tweaks
@@ -11255,7 +14652,7 @@ f36ec02 doc: Updated architecture document.
 1c53991 fix typos and add hyperlink to peering
 b7d4012 typo s/come/some/
 dbddffe update op added to a waiting queue or discarded
-e9935f2 (origin/wip-rgw-bucketlog-3) ceph_json: fix bool decoding
+e9935f2 ceph_json: fix bool decoding
 67ecd75 rgw: json_encode json a bit differently
 afeb8f2 md/Sever.cc: fix straydn assert
 e69257e rgw/rgw_user.cc: fix possible NULL pointer dereference
@@ -11280,7 +14677,7 @@ cab8e9b test/kv_store_bench.cc: fix resource leak
 d8cb7df filestore/test_idempotent_sequence.cc: fix FileStore leaks
 349cfb4 ceph-filestore-dump.cc: cleanup on error case
 df4c099 ceph-filestore-dump.cc: cleanup resource in error case
-9382379 (tag: v0.62) v0.62
+9382379 v0.62
 c5deb5d doc/release-notes: v0.61.2
 97a7309 rgw: tie bucket/user removal to mdlog differently
 4bb4063 ceph_test_libcephfs: parse environment
@@ -11519,7 +14916,7 @@ c693ba5 rados: add whole-object 'clonedata' command
 bd36e78 osd: make class load errors louder
 0b4c5c1 osd: optionally enable leveldb logging
 c1d5f81 mon: allow leveldb logging
-237f3f1 (tag: v0.61) v0.61
+237f3f1 v0.61
 eb69c7d os/: default to dio for non-block journals
 60603d0 ceph-disk: use separate lock files for prepare, activate
 e662b61 ceph-test.install: add ceph-monstore-tool and ceph-osdomap-tool
@@ -12007,7 +15404,7 @@ fc13f11 PG::_scan_list: assert if error is neither -EIO nor -ENOENT
 fcec1a0 ObjectStore: add allow_eio to read, stat, get_omap_header
 76ad956 librados: test empty ObjectWriteOperation
 690e4df Makefile.am: disable building ceph_test_cors when radosgw is not enabled
-f26f7a3 (tag: v0.60) v0.60
+f26f7a3 v0.60
 267ce0d librados: don't use lockdep for AioCompletionImpl
 78acc5c test: fix signed/unsigned comparison in test_cors
 d5b7970 PG: don't compare auth with itself
@@ -12333,7 +15730,7 @@ a27cb85 ceph-disk: fix adjust_symlink() replace 'canonical' with 'path'
 d3c60dc fix: Redefining name 'uuid' from outer scope (line 14)
 6a8120d ceph-disk: remove unused variables from list_partitions()
 3af7a1a ceph-disk: fix /dev/dm-[0-9] handling list_all_partitions()
-cbae6a4 (tag: v0.59) v0.59
+cbae6a4 v0.59
 9bcf5b6 ceph-disk: rename local variable shadowing builtin
 4adf088 ceph-disk: remove twice defined identical function unmount
 ea26ea0 ceph-disk: remove twice defined function mount
@@ -12734,7 +16131,7 @@ a6196de ceph-disk-prepare: verify device is not in use by device-mapper
 867586c debian: require >= python2.6 for ceph as well
 f03f626 ceph-disk-prepare: clean up stupid check for a digit
 5950554 ceph-disk-prepare: use os.path.realpath()
-ba3f91e (tag: v0.58) v0.58
+ba3f91e v0.58
 66df847 Add X-Python-Version >=2.6 to debian control file.
 a06ea30 PG,ReplicatedPG: use pg_has_reset_since to discard old async events
 83e9aa5 PG::build_scrub_map: detect race with peering via last_peering_reset
@@ -13048,7 +16445,7 @@ dbadb3e PG: remove weirdness log for last_complete < log.tail
 5fc83c8 os/FileStore: check replay guard on src for collection rename
 56c5a07 osd: requeue pg waiters at the front of the finished queue
 f1841e4 osd: pull requeued requests off one at a time
-9a7a9d0 (tag: v0.57) v0.57
+9a7a9d0 v0.57
 4002d70 osd: fix printf warning on pg_log_entry_t::get_key_name
 f80f849 qa: test_mon_workloadgen: use default config file path
 6d33859 qa: mon/workloadgen.sh: drop TEST_CEPH_CONF code
@@ -13501,7 +16898,7 @@ e51299f mds: open mydir after replay
 7cd4e50 client: Wait for caps to flush when flushing metadata.
 907c709 mds: Send created ino in journaled_reply
 cf7c3f7 client: Don't use geteuid/gid for fuse ll_create
-0b66994 (origin/wip-3930) ceph.spec.in:	package rbd udev rule
+0b66994 ceph.spec.in:	package rbd udev rule
 a7d15af mon: smooth pg stat rates over last N pgmaps
 ecda120 doc: fix overly-big fixed-width text in Firefox
 3f6837e mon/PGMap: report IO rates
@@ -13882,7 +17279,7 @@ fcb9f98 mds: use null dentry to find old parent of renamed directory
 2627957 mds: don't trigger assertion when discover races with rename
 e10267b mds: fix Locker::simple_eval()
 7e23321 mds: don't renew revoking lease
-1a32f0a (tag: v0.56) v0.56
+1a32f0a v0.56
 49ebe1e client: fix _create created ino condition
 a10054b libcephfs: choose more unique nonce
 e2fef38 client: fix _create
@@ -14090,7 +17487,7 @@ e6dd068 qa: echo commands run by rbd map-unmap workunit
 ae100cf mount.fuse.ceph: add ceph-fuse mount helper
 ac92e4d /etc/init.d/ceph: fs_type assignment syntax error
 4605fdd filestore: Don't keep checking for syncfs if found
-8e25c8d (tag: v0.55.1) v0.55.1
+8e25c8d v0.55.1
 dba0960 OSD: pg might be removed during disconnect_session_watches
 047aecd PG,ReplicatedPG: handle_watch_timeout must not write during scrub/degraded
 0dfe6c8 ReplicatedPG:, remove_notify, put session after con
@@ -14209,7 +17606,7 @@ b76f12d doc: Edited striping section. Modified stripe graphic to pretty print. A
 8cd8f25 mds: don't create bloom filter for incomplete dir
 3ace9a7 logrotate: do not spam stdout
 a74a4ac doc: Added a striping section for Architecture.
-690f817 (tag: v0.55) v0.55
+690f817 v0.55
 234cc08 ceph.spec.in:  Add SLES and remove Fedora from debug package list.
 2604557 test_rados_api_misc: fix dup rmmkey test
 f2c7a60 doc: Fixed many hyperlinks, a few typos, and some minor clarifications.
@@ -14527,7 +17924,7 @@ a0eb891 osd: default pool min_size to 0 (which gives us size-size/2)
 735df02 mon: helpful warning in 'health detail' output about incomplete pgs
 1679a55 osd: start_boot() after init()
 65961ca vstart.sh: support -X by adding 'auth required = none' entries
-60b84b0 (tag: v0.54) v0.54
+60b84b0 v0.54
 5d27f3d rgw: compile with -Woverloaded-virtual
 1be9923 rgw: fix RGWCache api
 e0e33d2 rgw: fix RGWCache api
@@ -15032,7 +18429,7 @@ d2afddd rgw: multiple coverity fixes
 db97666 mds: explicitly queue messages for unconnected clients
 2542dd5 client: fix implemented caps update on release/flush
 b290dc3 MClientRequest: fix mode formatting
-2528b5e (tag: v0.53) v0.43
+2528b5e v0.43
 0d3a53d msg/Pipe: correctly read from peers without MSG_AUTH
 96e365b radosgw-admin manpage: Fix broken quotes
 412efc1 admin_socket: fix '0' protocol version
@@ -15404,7 +18801,7 @@ c9266d6 rgw: check that realloc succeeded
 25a9620 FileJournal: correctly check return value of lseek in write_fd
 303f640 OSDCap: remove grants added during failed parsing
 3144690 doc: fix injectargs syntax
-e488594 (tag: v0.52) v0.52
+e488594 v0.52
 9e9c5f2 osd: make 'pg <pgid> revert ...' command idempotent
 175465b cephfs:  Fix breakage of cephfs link
 8c3bfaa doc: update ceph-authtool man page
@@ -15933,7 +19330,7 @@ a30f714 rgw-admin: get rid of lazy remove option, other fixes
 721a6be rgw: implement garbage collector
 bd534bf mon: make parse_pos_long() error message more helpful
 c7d11cd osd: turn off lockdep during shutdown signal handler
-c03ca95 (tag: v0.51) v0.51
+c03ca95 v0.51
 aa91cf8 mon: require --id
 5fd2f10 mon: fix int parsing in monmon
 31c8ccb mon: check for int parsing errors in mdsmon
@@ -16090,7 +19487,7 @@ da35b4c msgr: make set_policy_throttler safe, act on default
 8af2cf3 msgr: expose get_policy() through generic Messenger API
 1740bd0 crush: add helper has_nondefault_tunables()
 c8af0fa cephtool: make command wait for osdmap explicit
-a4428bd (tag: v0.50) v0.50
+a4428bd v0.50
 ac02b34 msg/Pipe: discard_queue() -> discard_out_queue()
 d58df35 msg/Pipe: simplify Pipe::tcp_read() return value
 76954c1 msg/Pipe: document tcp_*()
@@ -16240,7 +19637,7 @@ a16d9c6 os: KeyValueDB: allow finer-grained control of transaction operations
 48bd839 librbd: replace assign_bid with client id and random number
 67832c3 osd: fix ACK ordering on resent ops
 96dbc41 rados::cls::lock: move api types into namespace
-ca6265d (tag: v0.49) v0.49
+ca6265d v0.49
 c8f1311 mon: make 'ceph osd rm ...' wipe out all state bits, not just EXISTS
 f42e187 cls_rbd, cls_rbd_client, test_cls_rbd: copyup method
 127ff61 librbd: drop unnecessary std:: and struct prefixes
@@ -16591,7 +19988,7 @@ b7007a1 msgr: preserve incoming message queue when replacing pipes
 2429556 msgr: fix pipe replacement assert
 204bc59 msgr: do not try to reconnect con with CLOSED pipe
 e6ad6d2 msgr: move to STANDBY if we replace during accept and then fail
-c2b20ca (tag: v0.48argonaut) v0.48argonaut
+c2b20ca v0.48argonaut
 b5098b3 ceph.spec.in: Change license of base package to GPL and use SPDX format
 a1fe589 mon: initialize quorum_features
 d82a502 qa: add rbd remove tests
@@ -16690,7 +20087,7 @@ ddf7e83 doc: ceph osd crush add is now ceph osd crush set
 1e539da doc: Normalized shell script syntax. Added generic cookbook path.
 7d38758 doc: Changed libvirt-dev to libvirt-bin, and cleaned up ./autogen.sh
 3e32dd0 doc: Typo.
-c467d9d (tag: v0.47.3) v0.47.3
+c467d9d v0.47.3
 17dcf60 filestore: disable 'filestore fiemap' by default
 88c7629 OSD: clear_temp: split delete into many transactions
 b84e1ed doc: document usage log
@@ -17044,7 +20441,7 @@ f1e4d44 upstart: support mds
 475e07a debian: Create placeholder dirs for mon and osd data directories.
 ab04d95 monmap: return ENOENT from build_from_host_list when no mons
 80c6278 monmap: ignore conf mon entries when -m is specified
-8bf9fde (tag: v0.47.2) v0.47.2
+8bf9fde v0.47.2
 26843ad Makefile: do not install librgw.h
 a330c64 Update ceph.spec for ceph-0.47
 244992d libs3: remove make install target
@@ -17092,7 +20489,7 @@ c9e9896 obj_bencher: adding stddev to bandwidth and latency
 bc9e592 obj_bencher: add min/max bandwidth
 133cd69 mon: fix 'no initial monitors' warning
 3a2dc96 libs3: remove make install target
-f5a9404 (tag: v0.47.1) v0.47.1
+f5a9404 v0.47.1
 4e3807b Makefile.am: only append libs3 to SUBDIRS when needed
 13b350a filestore: test xattrs on temporary file
 6c0a122 ceph-object-corpus: add some old v2 monmaps
@@ -17108,7 +20505,7 @@ dae2f53 mon: fix leak of MonMap
 7b2614b monmap: filter_initial_members -> set_initial_members
 13c86ca Update leveldb to remove CompactionInputErrorParanoid
 7d5fa4e man: Remove leftover --bin from ceph-authtool(8).
-ad663d5 (tag: v0.47) v0.47
+ad663d5 v0.47
 e2e7f58 keys: new release key
 5d2ec1d builder: make reweight helpers static, void
 63580a9 formatter: replace malloc with new
@@ -17386,7 +20783,7 @@ f3771b0 throttle: feed cct, name, and add logging
 7413828 osdmap: do no dereference NULL entity_addr_t pointer in addr accessors
 4c597fa OSD: add different config options for map bl caches
 cefaa7d mon: fix nion -> noin typo
-cb7f1c9 (tag: v0.46) v0.46
+cb7f1c9 v0.46
 88dda3b librbd: use unique error code for image removal failures
 580b520 run_xfstests.sh: drop #62
 4465c3a osdmap: fix identify_osd() and find_osd_on_ip()
@@ -17746,7 +21143,7 @@ cd4a760 osd: fix heartbeat set_port()
 4f030e1 osd_types: fix off by one error in is_temp
 31f16a4 rgw: list multipart response fix
 89fecda Makefile.am: remove some clutter
-0aea1cb (tag: v0.45) v0.45
+0aea1cb v0.45
 d348e1a configure: --with-system-leveldb
 34cc308 filestore: fix leveldb includes
 0b2e1cd cephfs: fix uninit var warning
@@ -17761,7 +21158,7 @@ dd8fd16 configure: HAVE_FALLOCATE -> CEPH_HAVE_FALLOCATE
 dfa043d config: {osd,mon}_data default to /var/lib/ceph/$type/$cluster-$id
 7680cda dencoder, rgw: make ceph-dencoder load much faster
 9832696 encoding: use iterator to copy_in encoded length
-689ac5d (tag: v0.44.2) v0.44.2
+689ac5d v0.44.2
 e0c4db9 FileStore: do not check dbobjectmap without option set
 38e24b1 config: include /etc/ceph/$cluster.keyring in keyring search path
 57dff03 config: expand metavariables for --show-config, --show-config-value
@@ -17867,7 +21264,7 @@ a52d048 rgw: throttle incoming requests
 c3b0464 paxos: share_state sends every unknown value, including the stashed one
 d5c4015 uclient: We want to release cache when we lose the CACHE cap, not gain it!
 d0ba27a doc: add a short thing on kernel client troubleshooting.
-c89b7f2 (tag: v0.44.1) v0.44.1
+c89b7f2 v0.44.1
 6044c5b hadoop: define subsystem, fix logging
 fe56818 config: configure log thresholds
 ce61a83 log: throttle message submission, trim recent
@@ -17922,7 +21319,7 @@ fc7a1bd ReplicatedPG: return -EBUSY on delete for objects with watchers
 c53194d Objecter: add op->resend_on_any_change
 2daff0e ReplicatedPG: osd_max_notify_timeout -> osd_default_notify_timeout
 6a5cbec rgw: replace bucket_id generation
-cdd5298 (tag: v0.44) v0.44
+cdd5298 v0.44
 e42fbb7 rgw: process default alt args before processing conf file
 e0b8f7a rgw: process default alt args before processing conf file
 51a0733 rgw: incrase socket backlog
@@ -18131,7 +21528,7 @@ ffa5955 msgr: Remove SimpleMessenger::register_entity
 3bd1d2a msgr: add start() and wait() stubs to the Messenger interface
 70360f8 github.com/NewDreamNetwork -> github.com/ceph
 cacf0fd filestore: fix rollback safety check
-9fa8781 (tag: v0.43) v0.43
+9fa8781 v0.43
 3a83517 RadosModel: separate initialization and construction
 cd31388 librados: only shutdown objecter after it's initialized
 2c275ef Makefile: add headers for distcheck
@@ -18185,7 +21582,7 @@ ee4d990 journaler: log on unexpected objecter error
 3ad6ccb debian: sdparm|hdparm, new standards version
 266902a rgw: initialize bucket_id in bucket structure
 f8f6e4d rgw: _exit(0) on SIGTERM
-732f3ec (tag: v0.42.2) v0.42.2
+732f3ec v0.42.2
 d85ed91 osd: fix array index
 722e9e5 lockdep: don't make noise on startup
 fdaed0a formatter: fix trailing dump_stream()
@@ -18205,7 +21602,7 @@ c9416e6 osd: 'tell osd.N mark_unfound_lost revert' -> 'pg <pgid> mark_unfound_lo
 5efa821 rgw: swift read acls allow bucket listing
 f09fb87 rgw: fix swift acl enforcement
 7c7349e ceph: fix help.t
-c3e1291 (tag: v0.42.1) v0.42.1
+c3e1291 v0.42.1
 0281f1c debian: add ceph-dencoder
 f6e42a8 ceph.spec.in: add ceph-dencoder
 730b9ee ceph-dencoder: man page
@@ -18268,7 +21665,7 @@ a4f2fdb osdmap: add Incremental::dump()
 d74e029 test/encoding/readable.sh: sh, not dash
 e33bf5a crushtool: fix clitests
 0429aa7 msgr: fix shutdown race again
-b205c64 (tag: v0.42) v0.42
+b205c64 v0.42
 76e88d1 msgr: fix accept shutdown race fault
 ca04ee1 mon: test injected crush map
 5dd24f9 crush: move crushtool --test into CrushTester
@@ -18628,7 +22025,7 @@ dbda1b6 CephContext: add method for retrieving admin socket
 56d164c mon: stale pgs -> HEALTH_WARN
 61c54a7 mon: mark pgs stale in pg_map if primary osd is down
 6e44af9 osd: add STALE pg state bit
-c1345f7 (tag: v0.41) v0.41
+c1345f7 v0.41
 374fec4 objector: document Objecter::init_ops()
 6d37d5c objecter: fix out_* initialization
 9472920 Revert "common/Throttle: Remove unused return type on Throttle::get()"
@@ -18755,7 +22152,7 @@ a6c0610 msgr: uninline operator<< on sockaddr_storage
 0ded7e4 ReplicatedPG: munge truncate_seq 1/truncate_size -1 to seq 0/size 0
 44cb076 rgw: limit object PUT size
 d575337 objecter: fix up stat, getxattrs handlers
-7eea40e (tag: v0.40) v0.40
+7eea40e v0.40
 81c0ad8 librados: make new ObjectReadOperations arguments non-optional
 7347538 rgw: use new librados ObjectReadOperation method arguments
 4815caf ReplicatedPG: Update stat accounting for truncate during write
@@ -19183,7 +22580,7 @@ c883100 rados.py: add list_pools method
 c45a849 mds: remove obsolete doc
 0c183ec crush: ignore forcefed input that doesn't exist
 faf5ce6 Revert "CrushWrapper: ignore forcefeed if it does not exist"
-321ecda (tag: v0.39) v0.39
+321ecda v0.39
 75aff02 OSDMap: build_simple_from_conf pg_num should not be 0 with one osd
 2f5bd5f objecter: initialize global_op_flags to zero
 813523a Doc: delete gratuitous index.html
@@ -19334,7 +22731,7 @@ add04d1 filejournal: fix replay of non-idempotent ops
 71bfe89 test/pybind: add test_rgw
 ea42e02 test/pybind: convert python rados and rgw tests to be runnable by nose
 25cde7f rados.py: fix Snap.get_timestamp
-b600ec2 (tag: v0.38) v0.38
+b600ec2 v0.38
 2a7fbe0c common: return null if mc.init() unsuccessful
 a177a70 rbd.py: fix list when there are no images
 27bb48c mon: overwrite in put_bl
@@ -19626,7 +23023,7 @@ edcd4d9 rgw: some more swift fixes
 9baf5ef ceph.spec: don't chkconfig
 21d941e ceph.spec: work around build.opensuse.org
 195a484 ceph.spec: capitalize first letter to make rpmlint happy
-a6f3bbb (tag: v0.37) v0.37
+a6f3bbb v0.37
 ca8f603 osd: fix assemble_backlog
 2fdec7b osd: fix add_next_event Missing::item::have
 c1cabf5 ceph: don't crash when sending message to !up osd
@@ -19833,7 +23230,7 @@ f85dfa7 osd: combine log_op into append_log
 8111b61 mds: make jouranl writeable in MDLog::append()
 f4e61db mdcache: tolerate no subtrees in create_subtree_map()
 4922757 ceph.spec.in still packages libceph.so in 0.36
-877cacb (tag: v0.36) v0.36
+877cacb v0.36
 c00e06f doc: add documentation for librbd python bindings
 683f4dc rbd.py: add class and method documentation
 1c1785d rbd.py: use a dict to translate error codes
@@ -20046,7 +23443,7 @@ aa666b1 objclass: add stat operation
 2ec5f15 cls_rgw: list marker uses lower_bound
 894a8b6 client: tear down dir when setting I_COMPLETE on empty
 b9e32ff rados: accept '-b' as an argument.
-0afda37 (tag: v0.35) v0.35
+0afda37 v0.35
 2c28e1c Makefile: include config_opts.h
 6068fc8 osd: set reply version for dup requests
 43967f5 auth: keyring: whitespace
@@ -20362,7 +23759,7 @@ fbeafdf osd: make MOSDOp[Reply] encoding backwards compatible
 877aa92 osd: redo pg_t encoding with 64-bit pool id
 d8f1f1d osd: fix pg_pool_t::get_pg_num return types
 0b00350 ceph_context: whitespace
-2f039ee (tag: v0.34) v0.34
+2f039ee v0.34
 44cb5ee rgw: use formatter->dump_string() instead of dump_format()
 0e8bd0d rgw: get bucket info only if bucket name is not empty
 fa757b7 Makefile.am: install coverage files for libtool outputs as well
@@ -20574,7 +23971,7 @@ afdb1f3 rgw: list objects shows time in GMT, thread safety fix
 a311715 src/perfglue/heap_profiler.cc: fix snprintf
 14de6d2 client: compile with older libfuse2
 cb1f050 osd_types: fix merge
-340b434 (tag: v0.33) v0.33
+340b434 v0.33
 89eeb34 osd: pg_stat_t: throw exception on unknown encoding
 ce00564 qa: test rename into snapped dir
 e98669e mds: mark newly created snaprealms as open
@@ -20821,7 +24218,7 @@ e5dfa3d escape_json_attr: don't escape single quotes
 0dc235a mds: request attempt comes from fwd count, not retry flag
 15c344c mds: fix create_subtree_map for new dirs
 534afab vstart: static mapping of names to ranks
-c08d08b (tag: v0.32) v0.32
+c08d08b v0.32
 360ab7f rgw: don't silently ignore bad user/group when setting acl
 94239ec objecter: rename POOL_DISAPPEARED to POOL_DNE
 9447ce6 objecter: check for updated osdmap when requesting a non-existent pool
@@ -21028,7 +24425,7 @@ cbeedeb proflogger: Unlink our UNIX domain sockets on exit
 adafec4 test/proflogger.cc: read length of message first
 f8b4aa3 ProfLogger: write out length of message first
 325951d test/proflogger: Add TeardownSetup and SimpleTest
-134a680a Add test/proflogger.cc, fix ProfLogger::init()
+134a680 Add test/proflogger.cc, fix ProfLogger::init()
 5517b8f Rework ProfLogger
 6424149 osd: remove unused variables
 d07c480 mon: remove unused variables
@@ -21127,7 +24524,7 @@ d2cdbe7 mds: defer lock eval if freezing or frozen
 9bdde93 rgw: cache lru
 dc41c8d client: remove unused variable
 4f73121 rgw: fix put of zero sized objects
-9019c6c (tag: v0.31) v0.31
+9019c6c v0.31
 bae40ea filejournal: parse kernel version 3.0+
 3f708ee rgw: fix copy operation (clean up acls)
 44e76fb rgw: when listing objects, set locator key only when needed
@@ -21237,7 +24634,7 @@ fab24c8 filestore: allow FIEMAP to be disabled via conf
 fb81442 testradospp: fix compilation
 1446d6d mon: Guard injectargs parsing to prevent crashes on bad input
 6a7e635 mds: Explicitly initialize layout fields, and to the correct values.
-64b1b2c (tag: v0.30) v0.30
+64b1b2c v0.30
 842ec22 debian: drop python-support version req
 e27a893 rados: encode bufferlist in watch-notify
 0111835 rados: encode bufferlist in watch-notify
@@ -21473,7 +24870,7 @@ bc1782a osd: fix find_object_context debug output
 9974b7e rgw: user suspension
 3aa6a4d qa: pjd must run as root
 8b4b838 rgw: get multipart list marker to work
-77d38e4 (tag: v0.29.1) v0.29.1
+77d38e4 v0.29.1
 a379c67 rgw: some multipart cleanups, fixes
 515f088 librbd: fix block_completion race condition
 e9e3fee rgw: implement list multiparts
@@ -21659,7 +25056,7 @@ b5011e2 mds: adjust subtree roots on rename
 10750f8 common: add module_type to CephContext; use to initialize rotating keys
 b28ba77 osd: use osd_op.soid in call
 5cc146e osd: allow src_oids to be snapped
-8e69c39 (tag: v0.29) v0.29
+8e69c39 v0.29
 f9af9ce remove dumpjournal
 0baa108 osd: src oid is in OSDOp now
 5a86126 osd: don't crash on malformed clone_range (now really)
@@ -21751,7 +25148,7 @@ e5c9100 osd: fix map sharing due to heartbeats
 5b7c8ae osd: protect recovery_wq ops with the recovery lock
 b3fb58e crushtool: add -v verbose for --test mode
 57ea502 Add content to obsync package
-2324204 (tag: v0.28.2) v0.28.2
+2324204 v0.28.2
 7e1de38 hadoop: track Hadoop API changes
 232cd6b rgw: generate random upload id
 4ddf8df SimpleMessenger: allow multiple calls to shutdown
@@ -21813,7 +25210,7 @@ ce04e3d osd: add ability to explicitly mark unfound as lost
 87309e9 osd: make automatically marking of unfound as lost optional
 cea7b65 mds: clean up get_or_create_stray
 081acc4 mds: initialize stray_index on startup
-d66c6ca (tag: v0.28.1) v0.28.1
+d66c6ca v0.28.1
 9a660ac librads, libceph: store CephContext
 13aed89 Add CephContext
 1c7b982 Split common_init_daemonize from common_init_finish
@@ -21906,7 +25303,7 @@ dbb2c38 PG: _remove_pg, reset info.last_update and info.log_tail on log zero
 14a3f26 Move crush into libcommon
 2fc13de Move crush into libcommon
 0d79f1d man: update cosd man page
-071881d (tag: v0.28) v0.28
+071881d v0.28
 b060f5c Revert "Makefile.am: link some utils with libcrush"
 f1c82aa logclient: get rid of send_log; simplify monitor special casing
 baba0a7 msgr: fix signedness in alloc_aligned_buffer
@@ -22088,7 +25485,7 @@ dc9be20 osdmap: fix temp osd pg mapping
 5c520fe mon: do not stop mds0 unless all other nodes have also stopped
 6ac5572 PG: handle MOSDPGLog messages in Active state.
 79b1a10 ceph_crypto: Fix ceph::crypto::init mutex for NSS.
-44900d4 (tag: v0.27.1) v0.27.1
+44900d4 v0.27.1
 27a48c1 ceph::crypto::init: add PTHREAD_MUTEX_INITIALIZER
 7d12d18 mds: fix --reset-journal
 28ccdf4 obsync: remove misguided strip_owner
@@ -22315,7 +25712,7 @@ a9d12cb mon: fix up pg health report
 70640bf mon: generalize health check a bit
 4368e97 Makefile.am: add -fno-strict-aliasing
 4fe53fc Makefile.am: Wpointer-arith, Wstrict-null-sentinel
-793034c (tag: v0.27) v0.27
+793034c v0.27
 268f189 clitests: fix osdmap unit test
 4b547cb Makefile: add some new warnings to CXXFLAGS
 474be65 mds: don't daemonize when doing journal reset.
@@ -22560,7 +25957,7 @@ b6084cf config: expand metavariables when needed
 94fade2 config: add ability to complain about parse errs
 4a27cec confutils: test unicode parsing
 0e26ece config: fix metavariable substitution
-9981ff9 (tag: v0.26) v0.26
+9981ff9 v0.26
 32e422e configure: change gtk dep to 2.12 instead of 2.13 for lenny
 3227405 config: remove some unecessary g_conf references
 05c281b Revert "autoconf: Complain if tcmalloc is not found."
@@ -22770,7 +26167,7 @@ b77a323 osync: fix automatic bucket creation
 fc99cca osync: add alternate syntax for file:// transfers
 e08b3a5 osync-test: test file-to-bucket transfers
 453e1f9 osync: add SRC_AKEY, etc. environment variables
-07ee631 (tag: v0.25.2) v0.25.2
+07ee631 v0.25.2
 41675b1 Fix manpage typos
 e7abf59 Remove unused cdbs build dependency
 e2f1d78 Change wording of Debian package descriptions
@@ -22820,7 +26217,7 @@ cae43fc Makefile: drop libradosgw_a LDFLAGS
 32fce3c rados_create: correctly handle null id
 f06f4ee librados: always call keyring_init in connect
 586fc66 librados: don't call keyring_init in init_internal
-9e1828af objecter: make response_data bufferlist static
+9e1828a objecter: make response_data bufferlist static
 251fd50 rados_create_internal calls keyring_init
 c548976 rados_create: set id based on parameter
 b1c3321 librados: add rados_create_internal
@@ -22848,7 +26245,7 @@ ca61378 rbd: int -> int64_t on do_export
 58ffd37 rados tool: close dir after reading through
 84b65b5 rados tool: recursively import from dir to pool
 df8c009 cfuse: set proper defaults
-7f4a161 (tag: v0.25.1) v0.25.1
+7f4a161 v0.25.1
 db25852 cfuse: always daemonize hack
 448010f osd: small pull recovery adjustments
 4046c4b ReplicatedPG,OSD: Track which osds we are pulling from
@@ -22987,7 +26384,7 @@ aa251bd safe_io: fix signed/unsigned comparisons
 0cbfbee tests/cli/osdmaptool: default pg_num changed
 7a529d5 crush: remove misleading comment
 46d63d9 crush: fix memory leak
-3fa121b (tag: v0.25) v0.25
+3fa121b v0.25
 4ac0881 Bump librados soname to 2.
 82282f2 config: back to 6 pg bits for now
 08af63d rgw: put object request returns etag
@@ -23310,7 +26707,7 @@ f70d904 config: Remove g_conf.num_osd, num_mds, num_mon
 a350296 common: bufferlist::read_file: use safe_read
 d431295 auth/Crypto.cc: use safe_read_exact
 22dece1 os/FileStore: remove unused variable
-2cd2c56 (tag: v0.24.3) v0.24.3
+2cd2c56 v0.24.3
 b60444b make:add messages/MOSDRepScrub.h to NOINST_HEADERS
 c695a61 cephtool: get_indata should use safe_read_exact
 378ba0a mon/MonitorStore.cc: use safe_read
@@ -23600,7 +26997,7 @@ bc98a0f MDSMonitor: fix bugs with standby-replay assignment.
 0fbbbad os: FileStore::mkfs error handling fixes
 212289f vstart: Add --standby_mds setting, for auto-creating standby-replays.
 58637ac debian: fix publish.sh for ubuntu
-f7572de (tag: v0.24.2) v0.24.2
+f7572de v0.24.2
 4a49a87 msgr: make connection pipe reset atomic
 3a30eb7 msgr: include con in debug output
 943fd14 filestore: don't wait min sync interval on explicit sync()
@@ -23769,7 +27166,7 @@ e189222 ReplicatedPG: Fix bug in rollback
 9b0d577 Use Google Test framework for unit tests.
 1a9ef3f Make git ignore generated files.
 1846355 os: don't crash on no-journal case
-630565f3 (tag: v0.24.1) v0.24.1
+630565f3 v0.24.1
 eace439 test_split.sh: add many_pools test
 a64ddbb ReplicatedPG: get_object_context ssc refcount leak
 fde4b97 mds: fix _dout_lock recursion recursion
@@ -23904,7 +27301,7 @@ ff6e4d4 common: make generic_usage a little prettier
 a410360 test: add TestSignalHandlers
 532522d rgw_admin: fix compile error in usage
 61f964c librados: Fix compile error by adding std:: namespace
-180a417 (tag: v0.24) v0.24
+180a417 v0.24
 69940e2 osd: compensate for replicas with tail > last_complete
 0e510db objectcacher: Fix erroneous reference to "lock" with "flock."
 b04b6f4 mds: make nested scatterlock state change check more robust
@@ -24073,7 +27470,7 @@ a3d8c52 filestore: call lower-level do_transactions() during journal replay
 f9fa855 filestore: fix journal locking on trailing mode
 cbb5620 rbd: use MIN instead of min()
 792b04b client: connect to export targets on cap EXPORT
-5bdae2a (tag: v0.23.2) ceph v0.23.2
+5bdae2a ceph v0.23.2
 bde0c72 filestore: do not autodetect BTRFS_IOC_SNAP_CREATE_ASYNC until interface is finalized
 4592c22 client: fix cap export handler
 15c272e man: fix monmaptool man page
@@ -24190,7 +27587,7 @@ c0c301d osd: PG::read_log: don't be clever with lost xattr
 8461228 Build might_have_unfound set at activation
 36c6569 monmaptool: Return a non-zero error code and print a useful error 	message if unable to read the monmap file.
 fc21254 mds: allow for old fs's with stray instead of stray0
-868665d (tag: v0.23.1) v0.23.1
+868665d v0.23.1
 c327c6a mon: always use send_reply for auth replies
 61dd4f0 mon: simplify send_reply code
 2c71bd3 osd: add assert to _process_pg_info
@@ -24302,7 +27699,7 @@ c5b2d28 uclient: insert lssnap results under snapdir, not live dir
 7ccdae8 msg: fix buffer size for IPv6 address parsing
 ce6d639 timer: rewrite mostly from scratch
 5484899 mds: hit inode created via CREATE
-5d1d8d0 (tag: v0.23) v0.23
+5d1d8d0 v0.23
 3d10b34 mds: fix null_snapflush with multiple intervening snaps
 85e0890 osd: scrub least recently scrubbed pgs first; once a day
 c12829a osd: don't scrub something we just scrubbed
@@ -24473,7 +27870,7 @@ ee3fc3b osd: Add scrub to the names of scrub scheduling-related things.
 e6df807 test: create test_unfound.sh
 1dd5042 fix make distcheck, make uninstall
 c044829 filestore: automatically choose appropriate journaling mode
-9f4fd4a (tag: v0.22.2) v0.22.2
+9f4fd4a v0.22.2
 5b06ca1 filestore: use updated btrfs ioctls
 a831b2a btrfs: update ioctls.h
 bb628d3 Get "make dist" working, fix gui build issues
@@ -24554,7 +27951,7 @@ e27f0b1 filestore: escape the xattr chunk names
 d4bbde5 ./ceph osd setcrushmap: validate crushmap
 394b071 crush: improve error handling in map decoding
 a869b35 cap_reconnect_t: ignore embedded NULLs in the path
-e912e68 (tag: v0.22.1) v0.22.1
+e912e68 v0.22.1
 96d4673 Makefile: add errno.h
 a974cfd mds: be quiet about snaprealm push/pop
 6907894 filestore: ignore ENOSPC on setxattr pending a better workaround
@@ -24641,7 +28038,7 @@ c0db71f debian: update standards-version; fix ceph-client-tools-dbg
 1b2e992 debian: update scripts to do packaging fixes
 d44267c Revert "messenger: introduce a "halt_delivery" flag, checked by queue_delivery."
 69b764a mon: add 'mds rm <gid>' and 'mds rmfailed <id>' commands
-8a7c95f (tag: v0.22) v0.22
+8a7c95f v0.22
 2bc159e debian: no libgoogle-perftools-dev on lenny
 180f441 mds: cleanup: clarify issue_seq in cap release debug output
 b8ab009 mds: cleanup: print waiter masks in hex
@@ -24848,7 +28245,7 @@ f4be4b9 librados: throttle messages via the objecter
 e64109e objecter: enable automatic throttling of all messages
 0c800a9 objecter: add accounting to keep track of total in-flight messages.
 c8de979 Revert "throttle: make count an atomic_t"
-e4e37a9 (tag: v0.21.3) v0.21.3
+e4e37a9 v0.21.3
 df1d5d8 cauthtool: update man page
 ce63cf4 client: increase default cache size to 16K dentries and lru insertion point .25
 487b8a6 debian: allow builddebs.sh build a specific dist
@@ -24964,7 +28361,7 @@ c80a1d0 mds: fix bad iterator usage in process_reconnected_caps()
 5ae8e26 class: fix adding of class library, when previously existed
 46a16b9 mds: fix bad iterator usage in process_reconnected_caps()
 783d2ed mon: more useful class debug output
-f24cee3 (tag: v0.21.2) ceph v0.21.2
+f24cee3 ceph v0.21.2
 85814d4 client: Make truncation work properly The previous if block didn't work because inode->size was usually changed well before handle_cap_trunc was ever invoked, so it never did the truncation in the objectcacher! This was okay if you just truncated a file and then closed it, but if you wrote a file, truncated part of it out, and then wrote past the (new) end you would get reads that returned the previously-truncated data out of what should have been a hole.
 610b2e9 osd: fix class timeouts
 79f86c9 osd: clean up class loading code a bit
@@ -25110,7 +28507,7 @@ dcedfb8 osd: improve l_osd_buf accuracy
 a3eefd5 msgr: don't leak message when sending to a closed connection
 a406cfa osd: log push ops as push, not pull
 d00f43a logger: fix logger counter reset
-7aa332c (tag: v0.21.1) ceph v0.21.1
+7aa332c ceph v0.21.1
 255bf71 mon: use elector's epoch
 acd17a5 Makefile: include logrotate.conf in tarball
 a234df0 debian: fix update_pbuilder.sh
@@ -25288,7 +28685,7 @@ b72c1bb monclient: use default port if not specified via -m
 11dc1af mon: fix signed/unsigned compile warning
 719aa18 debian: add update_pbuilder.sh script; don't do it during build
 f3ac22a bump unstable version to v0.22~rc
-090436f (tag: v0.21) v0.21
+090436f v0.21
 ea2ce9d cclass.in: fix libdir definition
 fd4029c debian/ceph.logrotate: enhance postrotate for "service" util
 6db8975 mon: trim old logm states
@@ -25315,7 +28712,7 @@ b01cc38 rgw: set default log level to 20
 6bd40ac qa: consistent snaptest-%d.sh naming
 9127cd9 mds: fix uninitialized LeaseStat for null lease
 5c714bf osd: log when we get marked down but aren't
-7fbe1655 debug: no name symlink when explicit --log-file
+7fbe165 debug: no name symlink when explicit --log-file
 3de9c8d client: some whitespace cleanup
 8195899 qa: add localized version of Thomas Mueller's snaptest-2.sh
 2d35d24 rgw: exit after 5 seconds from SIGUSR1 anyway
@@ -26017,7 +29414,7 @@ cea221c MPoolOp: rearrange parsing, for easier kernel implementation
 291c0ab filejournal: create if mkfs and journal is small OR dne.
 b53d0ed filejournal: return error instead of asserting
 60095d9 mon: correctly update up map when taking over for creating|starting mds
-266d207 (tag: v0.20) debian: fix up debian scripts
+266d207 debian: fix up debian scripts
 b022a28 ceph.spec.in: updates
 f244067 ceph.spec.in: from Josef
 974f2ee mds: fix trim_dentry on dentry under unlinked dir
@@ -26481,7 +29878,7 @@ ee127d2 src/README is required
 21b88da rados_bencher: Don't re-initialize passed-in pool, it breaks stuff
 99e48ee osd: Fix up OSDCaps::get_pool_cap to work/make more sense
 2860eea rados: add interface to change pool owners
-c2e7eed6 librados: add change_pool_auid functions
+c2e7eed librados: add change_pool_auid functions
 1a4899d objecter: add change_pool_auid function.
 a33e9e7 mds: allow rdlock on replica to request sync from auth
 8d9f4fa mon: Set MForward::client_caps explicitly; this fixes a bad connection deref.
@@ -26785,7 +30182,7 @@ aac8930 debug: fix warnings, use larger path buffers
 f8b2584 mds: add support for directory sticky bit
 4540b5e filestore: only do btrfs_snap if btrfs
 ef27fd6 update release checklist
-98f5be5 (tag: v0.19) v0.19
+98f5be5 v0.19
 801d248 mon: disable 'osd setmap'
 e2ed6db osdmap: fix uninit var warning
 f4a5f53 mon: add 'auth export ]name]' to export a full or partial keyring
@@ -26909,7 +30306,7 @@ ba515fe mkcephfs: generate cephx keys during mkfs
 329178d mount: set flags when getting -o sync
 6ea3030 mds: fix dumpcache
 6285b61 authtool: only create keyring if --create-keyring (or -c)
-f40957eb config: rename 'keys file' to 'keyring'
+f40957e config: rename 'keys file' to 'keyring'
 3ebf9a4 filestore: optionally checkpoint with snaps
 5bdb348 journal: make sure max_size is multiple of block_size
 54898b3 mds: print setattr'd values with MClientRequest
@@ -27137,7 +30534,7 @@ ff4e155 mds: debug reconnect timeouts
 3d97239 debian: only upload tarball whem building amd64
 9bea9d0 logger: use proper format for s64
 ee44121 qa: put qa used files on ceph.newdream.net
-2f10137 (tag: v0.18) v0.18
+2f10137 v0.18
 a08d961 rados: move librados.h, rados_bencher.h
 dc24351 mds: avoid scatter writebehind from predirty_*
 e71e361 crush: fix whitespace
@@ -27154,7 +30551,7 @@ e439bd3 config: add kill arguments for mds import/export
 b709a72 mds: bracket mds journal events with {start,submit}_entry
 95ee211 todo
 adbd7d8 dropped old aleung mds branch code
-714a9af (origin/historic/aleung_mds_security) mon: fix allocation of low global_ids after mon restart
+714a9af mon: fix allocation of low global_ids after mon restart
 89603b6 test_ioctls: print preferred_osd value from GET_LAYOUT ioctl
 54b8537 hash: fix whitespace
 fd0195a mds: set mdr->in[n] in rdlock_path_xlock_dentry
@@ -27419,7 +30816,7 @@ e97c152 auth: fix verify_authorize_reply stupid
 08fbca8 auth: simplify challenge key encoding
 05e198b buffer: make [], hexdump const
 6133f02 auth: break client/server challenge calc into helper
-9ea4e4a5 auth: CEPH_AUTH_NONE auth and caps working
+9ea4e4a auth: CEPH_AUTH_NONE auth and caps working
 e98eb90 auth: return correct auth type
 8de5de0 auth: auth-none protocol stubs
 0717270 auth: redefine CEPH_AUTH_NONE
@@ -27550,7 +30947,7 @@ fa77429 auth: add osd caps parser
 0b26194 osd: assert apply_transaction success
 2109a91 auth: get rid of AuthorizeServer
 1b4db2f todos
-dac3a8c (tag: v0.17) v0.17
+dac3a8c v0.17
 413340e auth: fix vstart.sh
 74be02f debian: don't try to sign packages; it fails anyway
 2d0a8b2 auth: send caps to principals with the ticket
@@ -27717,7 +31114,7 @@ ed08416 move tests around
 4c32af6 Hadoop: Add the patch and a Readme file.
 335cd8f debian: upload tarball before .deb build
 ae5bf1b debian: name unstable package version ${vers}git${stamp}
-53ca2a8 (tag: v0.16.1) v0.16.1
+53ca2a8 v0.16.1
 1e51d19 rgw: fix build errors on i386
 14ad6cd Makefile: include buffer.c in tarball, kclient package
 bfc18d2 rgw: encode time_t, size_t using fixed-size types
@@ -27728,7 +31125,7 @@ bc9b863 kclient: include fs/{Kconfig,Makefile} in patchset
 165a729 uclient: Only connect to offload targets if the mds goes down.
 3a3ccd8 kclient: checkpatch cleanups
 522f570 mds: fix default layout settings
-38dbaa5 (tag: v0.16) v0.16
+38dbaa5 v0.16
 e678fbc msgr: authorizer get/verify callbacks
 faa5fb5 msgr: get_authorizer hook?
 56f45b4 objecter: Session type
@@ -27911,7 +31308,7 @@ f255bbc auth: osd loads rotating keys on startup
 c6faf6a kclient: kill out_qlen
 1744628 make inline string functions non-inline
 c2b4dc3 kclient: whitespace
-ab3c421 (tag: v0.15) v0.15
+ab3c421 v0.15
 c6e5d6e Makefile: include ioctl-number.patch
 1f1ec71 msgr: don't print source in msg debug output
 13e6f34 todo
@@ -28036,7 +31433,7 @@ df985c4 mds: avoid client lookup by using per-connection msgr state
 c2b6534 kclient: checkpatch cleanups
 035b08f kclient: update Documentation/ioctl/ioctl-number.txt
 69f84d1 todo
-1ec1e8d (tag: v0.14) v0.14
+1ec1e8d v0.14
 b985882 kclient: remove msgpools for now
 3046334 init-ceph: fix pre/post commands
 deec333 auth: stop auth sequence after getting the tickets
@@ -28226,7 +31623,7 @@ e9b00e0 Hadoop: Clean up ceph_delete code as it's redundant.
 c5dd75e Hadoop: Add package.html; fix a NULL instead of null error (Java)
 bc372c7 Hadoop: Put guards up to prevent a nasty bug in case of unspecified preferences, and the unlikely case that initialize is called twice.
 1d7726e upload tarball with 'stable' deb
-2b32297 (tag: v0.13) v0.13
+2b32297 v0.13
 884ec32 auth: kernel side aes functions
 43dcbc2 todo
 88179a3 auth: rearrange types
@@ -28398,7 +31795,7 @@ f42bb1d Hadoop: Makes use of newly-available replication reporting.
 90b0a6c uClient: Change file data function names for clarity; add function stubs for Hadoop use.
 290233e Hadoop: time conversion between Ceph and Hadoop; ditch filling in the uid/gid since it's meaningless.
 2387afb uClient: fill_stat now fills times properly
-f524150 (tag: v0.12) v0.12
+f524150 v0.12
 dad494c Makefile: add missing header to noinst_HEADERS
 1b89c16 Makefile: remove java hadoop _SOURCES dep
 6b4d8f9 osdmap: kill unused/useless get_num_osds()
@@ -28506,7 +31903,7 @@ adf0477 Hadoop: Remove the silly clientPointer convention from the Java.
 e18b613 Hadoop: Created new and modified old CephFSInterface.cc methods
 38e8ce2 Hadoop: Newly-generated JNI files into CephFSInterface.h
 ee0e17b Hadoop: Adding JNI header files to repository.
-9fa6f5e (tag: v0.11) v0.11
+9fa6f5e v0.11
 a8de8c7 kclient: simple dout macro by default
 b66da8f todo
 ea54a6a kclient: msgr cleanup
@@ -28606,7 +32003,7 @@ cfc0e0e kclient: kill dout(0, ...) calls
 bc2a567 kclient: __attribute__ ((packed)) all around
 2870371 s3gw: fix operations with url encoded object name
 860f65c osd: don't crash on a zero length write (sage)
-007313f (tag: v0.10) v0.10
+007313f v0.10
 464d248 Makefile: fix up initscript packaging
 6b3309a initscripts: /etc, not /usr/etc
 348c2dd s3gw: prefix/delmiter can be used for listing objects
@@ -28872,7 +32269,7 @@ aa615b8 kclient: clean up unaligned pointer accesses
 1d21494 mon: allow repair of entire osd
 29a2b2f mds: reduce default memory, journal footprint
 e0097bc osd: do NOT include op vector when shipping raw transaction
-4148185 (tag: v0.9) kclient: strip out kernel version compatibility cruft
+4148185 kclient: strip out kernel version compatibility cruft
 7e51c52 kclient: update script importer
 58f2820 todo
 9f58773 osd: on scrub repair, update replica pg stats as necessary
@@ -29257,7 +32654,7 @@ d0bac06 rados: look up pool by name
 36f70f4 header comments
 b566903 osd: get rid of the grep op
 a05f6ef class: able to list loaded classes on the monitor
-284161c (tag: v0.8) kclient: kill dput/d_drop debug cruft
+284161c kclient: kill dput/d_drop debug cruft
 ba73ac8 kclient: remove bookkeeper
 e3efa40 kclient: fix kbuild.patch for 2.6.30
 de954bf changelog
@@ -29390,7 +32787,7 @@ c5e64e1 kclient: a few sparse fixes
 563024f mds: only issue sensible caps on directories
 0c3becd mds: make eval take a mask; kill try_*_eval
 7c344e6 start v0.8
-be072be (tag: v0.7.3) msgr: kill static instance 'rank' of SimpleMessenger
+be072be msgr: kill static instance 'rank' of SimpleMessenger
 2519f3c msgr: clean up refs to static 'rank'
 08daaa4 uclient: clean up mount wrt osdmap, mdsmap
 497ade3 journaler: store layout in journal header
@@ -30094,12 +33491,12 @@ e587771 remove user error induced asserts
 6ea23d7 initscript: fix btrfs path default to osd data
 4b60f10 kclient: add osdc data to sysfs
 c353d14 bump version to v0.7.1 for unstable branch
-c46e1b3 (tag: v0.7) mkcephfs: btrfsctl -a between mkfs and mount
+c46e1b3 mkcephfs: btrfsctl -a between mkfs and mount
 3260c75 kclient: patch series updates
 f5a3d61 kclient: fix /sys/fs/ceph/mdsmap
 e2afd17 kclient: sysfs whitespace cleanup
 cbd43e3 kclient: fix patch series
-c057c834 osdmap: use generic crush_build_bucket
+c057c83 osdmap: use generic crush_build_bucket
 f05b1be kclient: fix osdmap decoding of straw buckets
 a098a3f kclient: sysfs, free path after build_path
 c90746a kclient: sysfs cleanup
@@ -30520,7 +33917,7 @@ f49f78d osd: reset peering, in-flight repops on every pg change
 74f2a9c uclient: update caps code to match kclient logic
 33f4443 kclient: some caps cleanup
 0017712 kclient: improve snap locking scheme
-a4c752d (tag: v0.6) v0.6
+a4c752d v0.6
 00577b5 kclient: rename lookup_mds_session get_mds_session
 b078856 kclient: realm create does not take a ref count
 70365be kclient: split get_snap_realm to lookup and create
@@ -31224,7 +34621,7 @@ a1511eb mon: use generic stash mechism to manage latest version of paxos-managed
 01a3325 kclient: use generic timeout/retry code for various monitor request types
 cd32920 kclient: pick new mon if statfs is unresponsive; clean up other retry code
 e14818c streamtest: fix recursive locking
-1dd4209 (tag: v0.5) journal: detect size of raw block devices properly
+1dd4209 journal: detect size of raw block devices properly
 200c569 osd: only trim pg log if pg contains complete set of osds
 8c1908a osdmap: fix type conversions
 41065ee crush: mention license.  minor cleanup
@@ -31616,7 +35013,7 @@ b3d6eef journal: protect journal access (namely, completions) with mutex
 8618940 debian: include crun in osd, mds, mon packages
 6e669cf osdmaptool: fix silly num_dom bug
 7cb7ada ceph.spec.in: include crun
-13b1bf7 (tag: v0.4) kclient: initializing kaddr (merge fix)
+13b1bf7 kclient: initializing kaddr (merge fix)
 d9262de kclient: use both writepage and crc
 fbd3cfa kclient: disabling the readpage
 34dccf9 kclient: disabling readpage
@@ -32175,7 +35572,7 @@ f254beb mds: follow snaps in path_traverse, returning resulting snapid to caller
 0fcadab client: track snapdir ref to parent explicitly
 5a83812 filepath: don't remove dup /'s
 83f2e4f client: hidden .snap dir, lssnap fixes
-2e986fe (tag: v0.3) include ceph.spec in configure.ac
+2e986fe include ceph.spec in configure.ac
 d38b0af ceph.spec changes
 90a0dfc mds: fix purging for unlinked inodes with caps
 1949cce client: cache versioned inodes.  use high bits of ino in fuse client.
@@ -32674,7 +36071,7 @@ ed523d0 kclient: create -> mknod sets S_IFREG.  debug mode.
 80c023f mds: wake rdlock waiters on xlock_finish
 f56a91d kclient: fix up mdsc spinlocking a bit
 cd5cd72 mds: fix can_rdlock_soon vs xlock on FileLock
-2b5f76f (tag: v0.2) readme
+2b5f76f readme
 7f9f1ea kclient: use d_splice_alias in prepopulate
 41443ce kclient: create fall back to mknod if no nameidata (as with nfs-kernel-server)
 5e5ed11 kclient: drop nameidata printk
@@ -33404,7 +36801,7 @@ bbc61d9 mds: flush file (size+mtime) metadata to journal on close
 35ee017 cleanup
 638a997 added inode revalidation
 f19c71d fix mds cap interaction; echo blah > mnt/blah now works
-204c175 (tag: v0.1) msg: blobhash on entity_name_t causes strange badness... should look into that, but avoid for now
+204c175 msg: blobhash on entity_name_t causes strange badness... should look into that, but avoid for now
 39d2a25 mds: adjust cap issue sequence to avoid unnecessary file_caps messages
 8adc9da debian packing stuffs
 78924fb more automake.  make dist seems to work
@@ -33828,7 +37225,7 @@ d793b0f reworked message encoding/decoding header vs payload vs data payload, al
 d697bfc fixed header encoding/decoding
 ef4a4f1 some client request/reply struct redefinition; no more file_data_version, for now; utime_t now uses ceph_timeval
 425169f no more bufferlist; streamlined ceph_msg object instead
-b3808583 all chunk sizes come first, before payload
+b380858 all chunk sizes come first, before payload
 da820e7 build errors
 e30b0f5 random crap
 6369889 send_message func, creates new connections as needed
@@ -35836,7 +39233,7 @@ eba9e7e *** empty log message ***
 ee63c70 *** empty log message ***
 90a1c02 *** empty log message ***
 bac0031 *** empty log message ***
-25764921 import is currently broken
+2576492 import is currently broken
 b247412 *** empty log message ***
 61a5130 *** empty log message ***
 9475610 *** empty log message ***
diff --git a/Makefile.am b/Makefile.am
index 527dd22..7ff3cf7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -2,13 +2,18 @@ AUTOMAKE_OPTIONS = gnu
 ACLOCAL_AMFLAGS = -I m4
 EXTRA_DIST = autogen.sh ceph.spec.in ceph.spec install-deps.sh
 # the "." here makes sure check-local builds gtest and gmock before they are used
-SUBDIRS = . src man
+SUBDIRS = . src man doc systemd selinux
 
 EXTRA_DIST += \
 	src/test/run-cli-tests \
 	src/test/run-cli-tests-maybe-unset-ccache \
 	src/test/cli \
 	src/test/downloads \
+	systemd/ceph.tmpfiles.d \
+	etc/default/ceph \
+	etc/sysconfig/ceph \
+	etc/sysconfig/SuSEfirewall2.d/services/ceph-mon \
+	etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds \
 	udev/50-rbd.rules \
 	udev/60-ceph-partuuid-workaround.rules \
 	udev/95-ceph-osd.rules \
@@ -18,13 +23,13 @@ EXTRA_DIST += \
 	share/id_dsa_drop.ceph.com.pub
 
 # why is it so hard to make autotools to this?
-install-data-local:
+install-data-local::
 	-mkdir -p $(DESTDIR)$(datadir)/ceph
 	-install -m 600 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
 	-install -m 600 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
 	-install -m 600 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
 
-all-local:
+all-local::
 if WITH_DEBUG
 #	We need gtest to build the rados-api tests. We only build those in
 #	a debug build, though.
@@ -32,13 +37,28 @@ if WITH_DEBUG
 	@cd src/gmock && $(MAKE) $(AM_MAKEFLAGS) lib/libgmock.la lib/libgmock_main.la
 endif
 
-check-local: all
+CHECK_ULIMIT := true
+
+check-local:: all
 #	We build gtest this way, instead of using SUBDIRS, because with that,
 #	gtest's own tests would be run and that would slow us down.
 	@cd src/gmock/gtest && $(MAKE) $(AM_MAKEFLAGS) lib/libgtest.la lib/libgtest_main.la
 	@cd src/gmock && $(MAKE) $(AM_MAKEFLAGS) lib/libgmock.la lib/libgmock_main.la
 #	exercise cli tools
-	$(srcdir)/src/test/run-cli-tests '$(top_builddir)/src/test'
+	u=`ulimit -u` ; \
+	p=`expr $(shell nproc) / 2` ; \
+	n=`expr $$p \* 1024` ; \
+	if ${CHECK_ULIMIT} && echo ${MAKEFLAGS} | grep --quiet -e -j && test $$u -lt $$n ; then \
+		echo "ulimit -u is $$u which is lower than $$n = $$p / 2 * 1024" ; \
+		echo "If running make -j$$p check you will likely exceed this limit" ; \
+		echo "and the tests will fail in mysterious ways." ; \
+		echo "Update /etc/security/limits.conf to increase the limit" ; \
+		echo "or run make CHECK_ULIMIT=false -j4 check to override this safeguard." ; \
+		exit 1 ; \
+	fi
+
+check_SCRIPTS = \
+	src/test/run-cli-tests
 
 # "make distclean" both runs this and recurses into src/gtest, if
 # gtest is in DIST_SUBDIRS. Take extra care to not fail when
@@ -50,6 +70,7 @@ clean-local:
 	fi
 
 	@rm -rf src/test/virtualenv
+	@rm -rf install-deps-*
 
 
 # NOTE: This only works when enough dependencies are installed for
diff --git a/Makefile.in b/Makefile.in
index 7f8b69d..812100b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -211,6 +211,7 @@ AMTAR = @AMTAR@
 AM_CXXFLAGS = @AM_CXXFLAGS@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
 ARM_FLAGS = @ARM_FLAGS@
 ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
 AUTOCONF = @AUTOCONF@
@@ -218,6 +219,7 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
@@ -275,7 +277,8 @@ LD = @LD@
 LDFLAGS = @LDFLAGS@
 LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
 LIBEDIT_LIBS = @LIBEDIT_LIBS@
-LIBFUSE = @LIBFUSE@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
 LIBJEMALLOC = @LIBJEMALLOC@
 LIBOBJS = @LIBOBJS@
 LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
@@ -326,6 +329,7 @@ RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
 STRIP = @STRIP@
 VERSION = @VERSION@
 WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
@@ -359,6 +363,7 @@ datarootdir = @datarootdir@
 docdir = @docdir@
 dvidir = @dvidir@
 exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
 host = @host@
 host_alias = @host_alias@
 host_cpu = @host_cpu@
@@ -388,6 +393,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 subdirs = @subdirs@
 sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
 target = @target@
 target_alias = @target_alias@
 target_cpu = @target_cpu@
@@ -396,17 +403,26 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
 AUTOMAKE_OPTIONS = gnu
 ACLOCAL_AMFLAGS = -I m4
 EXTRA_DIST = autogen.sh ceph.spec.in ceph.spec install-deps.sh \
 	src/test/run-cli-tests \
 	src/test/run-cli-tests-maybe-unset-ccache src/test/cli \
-	src/test/downloads udev/50-rbd.rules \
-	udev/60-ceph-partuuid-workaround.rules udev/95-ceph-osd.rules \
-	udev/95-ceph-osd-alt.rules share/known_hosts_drop.ceph.com \
-	share/id_dsa_drop.ceph.com share/id_dsa_drop.ceph.com.pub
+	src/test/downloads systemd/ceph.tmpfiles.d etc/default/ceph \
+	etc/sysconfig/ceph \
+	etc/sysconfig/SuSEfirewall2.d/services/ceph-mon \
+	etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds \
+	udev/50-rbd.rules udev/60-ceph-partuuid-workaround.rules \
+	udev/95-ceph-osd.rules udev/95-ceph-osd-alt.rules \
+	share/known_hosts_drop.ceph.com share/id_dsa_drop.ceph.com \
+	share/id_dsa_drop.ceph.com.pub
 # the "." here makes sure check-local builds gtest and gmock before they are used
-SUBDIRS = . src man
+SUBDIRS = . src man doc systemd selinux
+CHECK_ULIMIT := true
+check_SCRIPTS = \
+	src/test/run-cli-tests
+
 all: all-recursive
 
 .SUFFIXES:
@@ -754,6 +770,7 @@ distcleancheck: distclean
 	       $(distcleancheck_listfiles) ; \
 	       exit 1; } >&2
 check-am: all-am
+	$(MAKE) $(AM_MAKEFLAGS) $(check_SCRIPTS)
 	$(MAKE) $(AM_MAKEFLAGS) check-local
 check: check-recursive
 all-am: Makefile all-local
@@ -880,25 +897,35 @@ uninstall-am:
 
 
 # why is it so hard to make autotools to this?
-install-data-local:
+install-data-local::
 	-mkdir -p $(DESTDIR)$(datadir)/ceph
 	-install -m 600 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
 	-install -m 600 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
 	-install -m 600 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
 
-all-local:
+all-local::
 #	We need gtest to build the rados-api tests. We only build those in
 #	a debug build, though.
 @WITH_DEBUG_TRUE@	@cd src/gmock/gtest && $(MAKE) $(AM_MAKEFLAGS) lib/libgtest.la lib/libgtest_main.la
 @WITH_DEBUG_TRUE@	@cd src/gmock && $(MAKE) $(AM_MAKEFLAGS) lib/libgmock.la lib/libgmock_main.la
 
-check-local: all
+check-local:: all
 #	We build gtest this way, instead of using SUBDIRS, because with that,
 #	gtest's own tests would be run and that would slow us down.
 	@cd src/gmock/gtest && $(MAKE) $(AM_MAKEFLAGS) lib/libgtest.la lib/libgtest_main.la
 	@cd src/gmock && $(MAKE) $(AM_MAKEFLAGS) lib/libgmock.la lib/libgmock_main.la
 #	exercise cli tools
-	$(srcdir)/src/test/run-cli-tests '$(top_builddir)/src/test'
+	u=`ulimit -u` ; \
+	p=`expr $(shell nproc) / 2` ; \
+	n=`expr $$p \* 1024` ; \
+	if ${CHECK_ULIMIT} && echo ${MAKEFLAGS} | grep --quiet -e -j && test $$u -lt $$n ; then \
+		echo "ulimit -u is $$u which is lower than $$n = $$p / 2 * 1024" ; \
+		echo "If running make -j$$p check you will likely exceed this limit" ; \
+		echo "and the tests will fail in mysterious ways." ; \
+		echo "Update /etc/security/limits.conf to increase the limit" ; \
+		echo "or run make CHECK_ULIMIT=false -j4 check to override this safeguard." ; \
+		exit 1 ; \
+	fi
 
 # "make distclean" both runs this and recurses into src/gtest, if
 # gtest is in DIST_SUBDIRS. Take extra care to not fail when
@@ -910,6 +937,7 @@ clean-local:
 	fi
 
 	@rm -rf src/test/virtualenv
+	@rm -rf install-deps-*
 
 # NOTE: This only works when enough dependencies are installed for
 # autoconf to be happy.  These commands should be run manually to
diff --git a/README b/README
index bbada16..a3684e4 100644
--- a/README
+++ b/README
@@ -26,27 +26,19 @@ contributed under the terms of the applicable license.
 Build Prerequisites
 ===================
 
-debian-based
-------------
-
-The list of debian packages dependencies can be installed with:
+The list of Debian or RPM packages dependencies can be installed with:
 
 	./install-deps.sh
 
 Note: libsnappy-dev and libleveldb-dev are not available upstream for
-natty, oneiric, and squeeze.  Backports for Ceph can be found at
-ceph.com/debian-leveldb.
-
-rpm-based
----------
-
-The list of RPM packages dependencies can be installed with:
-
-	./install-deps.sh
+Debian Squeeze.  Backports for Ceph can be found at ceph.com/debian-leveldb.
 
 Building Ceph
 =============
 
+Autotools
+---------
+
 Developers, please refer to the [Developer
 Guide](doc/dev/quick_guide.rst) for more information, otherwise, you
 can build the server daemons, and FUSE client, by executing the
@@ -58,6 +50,21 @@ following:
 
 (Note that the FUSE client will only be built if libfuse is present.)
 
+CMake
+-----
+
+Prerequisite:
+        CMake 2.8.11
+
+Build instructions:
+
+	mkdir build
+	cd build
+	cmake [options] /path/to/ceph/src/dir
+	make
+
+(Note that /path/to/ceph/src/dir can be in the tree and out of the tree)
+
 Dependencies
 ------------
 
diff --git a/autogen.sh b/autogen.sh
index 650993d..99d4f7b 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -29,7 +29,8 @@ else
 fi
 
 if test -d ".git" ; then
-  if ! git submodule sync || ! git submodule update --init; then
+  force=$(if git submodule usage 2>&1 | grep --quiet 'update.*--force'; then echo --force ; fi)
+  if ! git submodule sync || ! git submodule update $force --init --recursive; then
     echo "Error: could not initialize submodule projects"
     echo "  Network connectivity might be required."
     exit 1
diff --git a/ceph.spec b/ceph.spec
index 795c126..fee4bbb 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -1,66 +1,124 @@
+# vim: set noexpandtab ts=8 sw=8 :
 %bcond_with ocf
 %bcond_without cephfs_java
+%bcond_with tests
+%bcond_without tcmalloc
+%bcond_without libs_compat
+%bcond_with lowmem_builder
+%if 0%{?fedora} || 0%{?rhel}
+%bcond_without selinux
+%endif
+%if 0%{?suse_version}
+%bcond_with selinux
+%endif
+
 
-%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
+%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%if %{with selinux}
+# get selinux policy version
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null || echo 0.0.0)}
+
+%define relabel_files() \
+restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \
+restorecon -R /usr/bin/radosgw > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \
+restorecon -R /var/run/ceph > /dev/null 2>&1; \
+restorecon -R /var/lib/ceph > /dev/null 2>&1; \
+restorecon -R /var/log/ceph > /dev/null 2>&1;
+%endif
+
 %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
 
+# Use systemd files on RHEL 7 and above and in SUSE/openSUSE.
+# Note: We don't install unit files for the services yet. For now,
+# the _with_systemd variable only implies that we'll install
+# /etc/tmpfiles.d/ceph.conf in order to set up the socket directory in
+# /var/run/ceph.
+%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
+%global _with_systemd 1
+%endif
+
+# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
+%global _with_lttng 1
+%endif
+
 #################################################################################
 # common
 #################################################################################
 Name:		ceph
-Version:	0.94.5
+Version:	9.2.0
 Release:	0%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
-License:	GPL-2.0
-Group:		System Environment/Base
+License:	LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
 URL:		http://ceph.com/
 Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
-%if 0%{?fedora} || 0%{?centos} || 0%{?rhel}
+%if 0%{?fedora} || 0%{?rhel}
 Patch0:		init-ceph.in-fedora.patch
 %endif
+#################################################################################
+# dependencies that apply across all distro families
+#################################################################################
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
 Requires:	python-requests
-Requires:	python-flask
+Requires:	grep
 Requires:	xfsprogs
+Requires:	logrotate
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
 Requires:	cryptsetup
+Requires:	findutils
+Requires:	which
 Requires(post):	binutils
+%if 0%{with cephfs_java}
+BuildRequires:	java-devel
+BuildRequires:	sharutils
+%endif
+%if 0%{with selinux}
+BuildRequires:	checkpolicy
+BuildRequires:	selinux-policy-devel
+BuildRequires:	/usr/share/selinux/devel/policyhelp
+%endif
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if 0%{defined suse_version}
-BuildRequires:  libbz2-devel
-%else
-BuildRequires:  bzip2-devel
-%endif
+BuildRequires:  cmake
 BuildRequires:	cryptsetup
+BuildRequires:	fuse-devel
 BuildRequires:	gdbm
 BuildRequires:	hdparm
+BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	libaio-devel
 BuildRequires:	libcurl-devel
 BuildRequires:	libedit-devel
 BuildRequires:	libxml2-devel
-BuildRequires:	libuuid-devel
 BuildRequires:	libblkid-devel >= 2.17
 BuildRequires:	libudev-devel
 BuildRequires:	libtool
-BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	make
-BuildRequires:	perl
 BuildRequires:	parted
+BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
 BuildRequires:	python-nose
@@ -72,46 +130,86 @@ BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?suse_version}
-BuildRequires:	net-tools
-%endif
 
 #################################################################################
-# specific
+# distro-conditional dependencies
 #################################################################################
-%if ! 0%{?rhel} || 0%{?fedora}
-BuildRequires:	sharutils
+%if 0%{?suse_version}
+%if 0%{?_with_systemd}
+BuildRequires:  pkgconfig(systemd)
+BuildRequires:	systemd-rpm-macros
+%{?systemd_requires}
 %endif
-
-%if 0%{defined suse_version}
+PreReq:		%fillup_prereq
+Requires:	python-Flask
+BuildRequires:	net-tools
+BuildRequires:	libbz2-devel
 %if 0%{?suse_version} > 1210
 Requires:	gptfdisk
+%if 0%{with tcmalloc}
 BuildRequires:	gperftools-devel
+%endif
 %else
 Requires:	scsirastools
 BuildRequires:	google-perftools-devel
 %endif
-Recommends:	logrotate
-BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
 %else
-Requires:	gdisk
+%if 0%{?_with_systemd}
+Requires:	systemd
+%endif
+BuildRequires:  bzip2-devel
 BuildRequires:	nss-devel
 BuildRequires:	keyutils-libs-devel
 BuildRequires:	libatomic_ops-devel
 Requires:	gdisk
 Requires(post):	chkconfig
-Requires(preun):chkconfig
-Requires(preun):initscripts
+Requires(preun):	chkconfig
+Requires(preun):	initscripts
 BuildRequires:	gperftools-devel
+Requires:	python-flask
+%endif
+# boost
+%if 0%{?fedora} || 0%{?rhel} 
+BuildRequires:  boost-random
+%endif
+# python-argparse for distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+BuildRequires:	python-argparse
+%endif
+# lttng and babeltrace for rbd-replay-prep
+%if 0%{?_with_lttng}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires:	lttng-ust-devel
+BuildRequires:	libbabeltrace-devel
+%endif
+%if 0%{?suse_version}
+BuildRequires:	lttng-ust-devel
+BuildRequires:  babeltrace-devel
+%endif
+%endif
+# expat and fastcgi for RGW
+%if 0%{?suse_version}
+BuildRequires:	libexpat-devel
+BuildRequires:	FastCGI-devel
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+BuildRequires:	expat-devel
+BuildRequires:	fcgi-devel
+%endif
+# python-sphinx
+%if 0%{?rhel} > 0 && 0%{?rhel} < 7
+BuildRequires:	python-sphinx10
+%endif
+%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7
+BuildRequires:	python-sphinx
 %endif
 
 %description
-Ceph is a massively scalable, open-source, distributed
-storage system that runs on commodity hardware and delivers object,
-block and file system storage.
+Ceph is a massively scalable, open-source, distributed storage system that runs
+on commodity hardware and delivers object, block and file system storage.
 
 
 #################################################################################
@@ -126,13 +224,15 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{?rhel} || 0%{?fedora}
-Requires:  redhat-lsb-core
+%if 0%{?_with_systemd}
+%{?systemd_requires}
+%endif
+%if 0%{?suse_version}
+Requires(pre):	pwdutils
 %endif
 # python-argparse is only needed in distros with Python 2.6 or lower
 %if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
 Requires:	python-argparse
-BuildRequires:	python-argparse
 %endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
@@ -141,7 +241,6 @@ Common utilities to mount and interact with a ceph storage cluster.
 Summary:	Ceph fuse-based client
 Group:		System Environment/Base
 Requires:	%{name}
-BuildRequires:	fuse-devel
 %description fuse
 FUSE based client for Ceph distributed network file system
 
@@ -151,7 +250,6 @@ Group:		System Environment/Base
 Requires:	%{name}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
-BuildRequires:	fuse-devel
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
@@ -159,13 +257,11 @@ FUSE based client to map Ceph rbd images to files
 Summary:	Rados REST gateway
 Group:		Development/Libraries
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-BuildRequires:	libexpat-devel
-BuildRequires:	FastCGI-devel
-%else
-BuildRequires:	expat-devel
-BuildRequires:	fcgi-devel
+%if 0%{?rhel} || 0%{?fedora}
 Requires:	mailcap
 %endif
 %description radosgw
@@ -190,7 +286,7 @@ managers such as Pacemaker.
 Summary:	RADOS distributed object store client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librados2
@@ -223,7 +319,7 @@ object store.
 Summary:	RADOS striping interface
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-Requires:	librados2 = %{epoch}:%{version}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
 %description -n libradosstriper1
 Striping interface built on top of the rados library, allowing
 to stripe bigger objects onto several standard rados objects using
@@ -245,7 +341,7 @@ Summary:	RADOS block device client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librbd1
@@ -280,7 +376,7 @@ block device.
 Summary:	Ceph distributed file system client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 Obsoletes:	ceph-libcephfs
 %endif
@@ -312,41 +408,29 @@ Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
 This package contains Python libraries for interacting with Cephs distributed
 file system.
 
-%package -n rest-bench
-Summary:	RESTful benchmark
-Group:		System Environment/Libraries
-License:	LGPL-2.0
-Requires:	ceph-common = %{epoch}:%{version}-%{release}
-%description -n rest-bench
-RESTful bencher that can be used to benchmark radosgw performance.
-
 %package -n ceph-test
 Summary:	Ceph benchmarks and test tools
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	ceph-common
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-BuildRequires:	lttng-ust-devel
-BuildRequires:	libbabeltrace-devel
-%endif
+Requires:	xmlstarlet
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
 %if 0%{with cephfs_java}
 
 %package -n libcephfs_jni1
-Summary:	Java Native Interface library for CephFS Java bindings.
+Summary:	Java Native Interface library for CephFS Java bindings
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %description -n libcephfs_jni1
 This package contains the Java Native Interface library for CephFS Java
 bindings.
 
 %package -n libcephfs_jni1-devel
-Summary:	Development files for CephFS Java Native Interface library.
+Summary:	Development files for CephFS Java Native Interface library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
@@ -357,12 +441,11 @@ This package contains the development files for CephFS Java Native Interface
 library.
 
 %package -n cephfs-java
-Summary:	Java libraries for the Ceph File System.
+Summary:	Java libraries for the Ceph File System
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %if 0%{?el6}
 Requires:	junit4
 BuildRequires:	junit4
@@ -375,8 +458,26 @@ This package contains the Java libraries for the Ceph File System.
 
 %endif
 
+%if 0%{with selinux}
+
+%package selinux
+Summary:	SELinux support for Ceph MON, OSD and MDS
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	policycoreutils, libselinux-utils
+Requires(post): selinux-policy-base >= %{_selinux_policy_version}, policycoreutils, gawk
+Requires(postun): policycoreutils
+%description selinux
+This package contains SELinux support for Ceph MON, OSD and MDS. The package
+also performs file-system relabelling which can take a long time on heavily
+populated file-systems.
+
+%endif
+
+%if 0%{with libs_compat}
+
 %package libs-compat
-Summary:	Meta package to include ceph libraries.
+Summary:	Meta package to include ceph libraries
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Obsoletes:	ceph-libs
@@ -392,6 +493,8 @@ former ceph-libs package, which is now split up into these three subpackages.
 Packages still depending on ceph-libs should be fixed to depend on librados2,
 librbd1 or libcephfs1 instead.
 
+%endif
+
 %package devel-compat
 Summary:	Compatibility package for Ceph headers
 Group:		Development/Libraries
@@ -427,16 +530,12 @@ python-rados, python-rbd and python-cephfs. Packages still depending on
 python-ceph should be fixed to depend on python-rados, python-rbd or
 python-cephfs instead.
 
-%if 0%{?opensuse} || 0%{?suse_version}
-%debug_package
-%endif
-
 #################################################################################
 # common
 #################################################################################
 %prep
 %setup -q
-%if 0%{?fedora} || 0%{?rhel} || 0%{?centos}
+%if 0%{?fedora} || 0%{?rhel}
 %patch0 -p1 -b .init
 %endif
 
@@ -449,53 +548,91 @@ done
 %endif
 
 ./autogen.sh
-MY_CONF_OPT=""
-
-MY_CONF_OPT="$MY_CONF_OPT --with-radosgw"
 
+%if %{with lowmem_builder}
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS --param ggc-min-expand=20 --param ggc-min-heapsize=32768"
+%endif
 export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 
 %{configure}	CPPFLAGS="$java_inc" \
 		--prefix=/usr \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?_with_systemd}
+		--with-systemdsystemunitdir=%_unitdir \
+%endif
 		--docdir=%{_docdir}/ceph \
+		--with-man-pages \
+		--mandir="%_mandir" \
 		--with-nss \
 		--without-cryptopp \
-		--with-rest-bench \
 		--with-debug \
 %if 0%{with cephfs_java}
 		--enable-cephfs-java \
 %endif
+%if 0%{with selinux}
+		--with-selinux \
+%endif
 		--with-librocksdb-static=check \
-		$MY_CONF_OPT \
+%if 0%{?rhel} || 0%{?fedora}
+		--with-systemd-libexec-dir=/usr/libexec/ceph \
+		--with-rgw-user=root \
+		--with-rgw-group=root \
+%endif
+%if 0%{?suse_version}
+		--with-systemd-libexec-dir=/usr/lib/ceph/ \
+		--with-rgw-user=wwwrun \
+		--with-rgw-group=www \
+%endif
+		--with-radosgw \
+		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
+		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
-# fix bug in specific version of libedit-devel
-%if 0%{defined suse_version}
-sed -i -e "s/-lcurses/-lncurses/g" Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" man/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/ocf/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/java/Makefile
+
+make %{?_smp_mflags}
+
+
+%if 0%{with tests}
+%check
+# run in-tree unittests
+make %{?_smp_mflags} check-local
+
 %endif
 
-make -j$(getconf _NPROCESSORS_ONLN)
+
 
 %install
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
+install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+%if 0%{?fedora} || 0%{?rhel}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillup-templates/sysconfig.%{name}
+%endif
+%if 0%{?_with_systemd}
+  install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
+  install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
+  install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
+  install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
+  install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
+  install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
+  install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
+%else
+  install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
+  install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+  ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
+  ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
-ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
-ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
 install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
-install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
@@ -523,40 +660,89 @@ mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph
 
 #set up placeholder directories
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph
+%if ! 0%{?_with_systemd}
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mon
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
-mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw
 
 %clean
 rm -rf $RPM_BUILD_ROOT
 
+%pre
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    # service_add_pre and friends don't work with parameterized systemd service
+    # instances, only with single services or targets, so we always pass
+    # ceph.target to these macros
+    %service_add_pre ceph.target
+  %endif
+%endif
+
+
 %post
 /sbin/ldconfig
-/sbin/chkconfig --add ceph
-mkdir -p %{_localstatedir}/run/ceph/
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %fillup_only
+    %service_add_post ceph.target
+  %endif
+%else
+  /sbin/chkconfig --add ceph
+%endif
 
 %preun
-%if %{defined suse_version}
-%stop_on_removal ceph
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %service_del_preun ceph.target
+  %endif
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%else
+  %if 0%{?rhel} || 0%{?fedora}
+    if [ $1 = 0 ] ; then
+      /sbin/service ceph stop >/dev/null 2>&1
+      /sbin/chkconfig --del ceph
+    fi
+  %endif
 %endif
-if [ $1 = 0 ] ; then
-    /sbin/service ceph stop >/dev/null 2>&1
-    /sbin/chkconfig --del ceph
-fi
 
 %postun
 /sbin/ldconfig
-%if %{defined suse_version}
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
 
-
 #################################################################################
 # files
 #################################################################################
@@ -578,16 +764,26 @@ fi
 %{_bindir}/ceph-mds
 %{_bindir}/ceph-objectstore-tool
 %{_bindir}/ceph-osd
+%{_bindir}/ceph-detect-init
 %{_bindir}/librados-config
 %{_bindir}/ceph-client-debug
 %{_bindir}/cephfs-journal-tool
 %{_bindir}/cephfs-table-tool
+%{_bindir}/cephfs-data-scan
 %{_bindir}/ceph-debugpack
 %{_bindir}/ceph-coverage
+%if 0%{?_with_systemd}
+%{_unitdir}/ceph-mds at .service
+%{_unitdir}/ceph-mon at .service
+%{_unitdir}/ceph-create-keys at .service
+%{_unitdir}/ceph-osd at .service
+%{_unitdir}/ceph-radosgw at .service
+%{_unitdir}/ceph-disk at .service
+%{_unitdir}/ceph.target
+%else
 %{_initrddir}/ceph
+%endif
 %{_sbindir}/ceph-disk
-%{_sbindir}/ceph-disk-activate
-%{_sbindir}/ceph-disk-prepare
 %{_sbindir}/ceph-disk-udev
 %{_sbindir}/ceph-create-keys
 %{_sbindir}/rcceph
@@ -600,8 +796,10 @@ fi
 %{_libdir}/ceph/ceph_common.sh
 %{_libexecdir}/ceph/ceph-osd-prestart.sh
 %dir %{_libdir}/rados-classes
+%{_libdir}/rados-classes/libcls_cephfs.so*
 %{_libdir}/rados-classes/libcls_rbd.so*
 %{_libdir}/rados-classes/libcls_hello.so*
+%{_libdir}/rados-classes/libcls_numops.so*
 %{_libdir}/rados-classes/libcls_rgw.so*
 %{_libdir}/rados-classes/libcls_lock.so*
 %{_libdir}/rados-classes/libcls_kvs.so*
@@ -609,19 +807,30 @@ fi
 %{_libdir}/rados-classes/libcls_log.so*
 %{_libdir}/rados-classes/libcls_replica_log.so*
 %{_libdir}/rados-classes/libcls_statelog.so*
+%{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
+%if 0%{?_with_lttng}
+%{_libdir}/libos_tp.so*
+%{_libdir}/libosd_tp.so*
+%endif
 %{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 %{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
+%if 0%{?fedora} || 0%{?rhel}
+%config(noreplace) %{_sysconfdir}/sysconfig/ceph
+%endif
 %if 0%{?suse_version}
+%{_localstatedir}/adm/fillup-templates/sysconfig.*
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
 %endif
+%{python_sitelib}/ceph_detect_init*
 %{_mandir}/man8/ceph-deploy.8*
+%{_mandir}/man8/ceph-detect-init.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-create-keys.8*
 %{_mandir}/man8/ceph-mon.8*
@@ -638,14 +847,16 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/librados-config.8*
 #set up placeholder directories
-%dir %{_localstatedir}/lib/ceph/
-%dir %{_localstatedir}/lib/ceph/tmp
-%dir %{_localstatedir}/lib/ceph/mon
-%dir %{_localstatedir}/lib/ceph/osd
-%dir %{_localstatedir}/lib/ceph/mds
-%dir %{_localstatedir}/lib/ceph/bootstrap-osd
-%dir %{_localstatedir}/lib/ceph/bootstrap-mds
-%ghost %dir %{_localstatedir}/run/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw
+%if ! 0%{?_with_systemd}
+%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph
+%endif
 
 #################################################################################
 %files -n ceph-common
@@ -659,8 +870,16 @@ fi
 %{_bindir}/ceph-crush-location
 %{_bindir}/rados
 %{_bindir}/rbd
+%{_bindir}/rbd-replay
+%{_bindir}/rbd-replay-many
+%if 0%{?_with_lttng}
+%{_bindir}/rbd-replay-prep
+%endif
 %{_bindir}/ceph-post-file
 %{_bindir}/ceph-brag
+%if 0%{?_with_systemd}
+%{_tmpfilesdir}/ceph-common.conf
+%endif
 %{_mandir}/man8/ceph-authtool.8*
 %{_mandir}/man8/ceph-conf.8*
 %{_mandir}/man8/ceph-dencoder.8*
@@ -670,17 +889,46 @@ fi
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbd-replay.8*
+%{_mandir}/man8/rbd-replay-many.8*
+%{_mandir}/man8/rbd-replay-prep.8*
 %{_datadir}/ceph/known_hosts_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com.pub
 %dir %{_sysconfdir}/ceph/
-%dir %{_localstatedir}/log/ceph/
+%dir %{_datarootdir}/ceph/
+%dir %{_libexecdir}/ceph/
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
+%{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
+%attr(3770,ceph,ceph) %dir %{_localstatedir}/log/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
+
+%pre -n ceph-common
+CEPH_GROUP_ID=""
+CEPH_USER_ID=""
+%if 0%{?rhel} || 0%{?fedora}
+CEPH_GROUP_ID="-g 167"
+CEPH_USER_ID="-u 167"
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%endif
+%if 0%{?suse_version}
+getent group ceph >/dev/null || groupadd -r ceph
+getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+%endif
+exit 0
+
+%post -n ceph-common
+%if 0%{?_with_systemd}
+systemd-tmpfiles --create --prefix=/run/ceph
+%endif
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -709,38 +957,62 @@ fi
 #################################################################################
 %files radosgw
 %defattr(-,root,root,-)
-%{_initrddir}/ceph-radosgw
 %{_bindir}/radosgw
 %{_bindir}/radosgw-admin
+%{_bindir}/radosgw-object-expirer
 %{_mandir}/man8/radosgw.8*
 %{_mandir}/man8/radosgw-admin.8*
-%{_sbindir}/rcceph-radosgw
-%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
 %config %{_sysconfdir}/bash_completion.d/radosgw-admin
-%dir %{_localstatedir}/log/radosgw/
+%dir %{_localstatedir}/lib/ceph/radosgw
+%if 0%{?_with_systemd}
+%else
+%{_initrddir}/ceph-radosgw
+%{_sbindir}/rcceph-radosgw
+%endif
 
 %post radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%fillup_and_insserv -f -y ceph-radosgw
+%if 0%{?suse_version}
+  # explicit systemctl daemon-reload (that's the only relevant bit of
+  # service_add_post; the rest is all sysvinit --> systemd migration which
+  # isn't applicable in this context (see above comment).
+  /usr/bin/systemctl daemon-reload >/dev/null 2>&1 || :
 %endif
 
 %preun radosgw
-%if %{defined suse_version}
-%stop_on_removal ceph-radosgw
+%if 0%{?_with_systemd}
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
 %endif
 
 %postun radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%restart_on_update ceph-radosgw
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
-# Package removal cleanup
-if [ "$1" -eq "0" ] ; then
-    rm -rf /var/log/radosgw
-fi
-
 
 #################################################################################
 %if %{with ocf}
@@ -756,6 +1028,9 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so.*
+%endif
 
 %post -n librados2
 /sbin/ldconfig
@@ -776,6 +1051,9 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so
+%endif
 
 #################################################################################
 %files -n python-rados
@@ -805,6 +1083,9 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so.*
+%endif
 
 %post -n librbd1
 /sbin/ldconfig
@@ -822,6 +1103,9 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so
+%endif
 
 #################################################################################
 %files -n python-rbd
@@ -852,11 +1136,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{python_sitelib}/cephfs.py*
 
 #################################################################################
-%files -n rest-bench
-%defattr(-,root,root,-)
-%{_bindir}/rest-bench
-
-#################################################################################
 %files -n ceph-test
 %defattr(-,root,root,-)
 %{_bindir}/ceph_bench_log
@@ -865,7 +1144,11 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph_erasure_code
 %{_bindir}/ceph_erasure_code_benchmark
 %{_bindir}/ceph_omapbench
+%{_bindir}/ceph_objectstore_bench
 %{_bindir}/ceph_perf_objectstore
+%{_bindir}/ceph_perf_local
+%{_bindir}/ceph_perf_msgr_client
+%{_bindir}/ceph_perf_msgr_server
 %{_bindir}/ceph_psim
 %{_bindir}/ceph_radosacl
 %{_bindir}/ceph_rgw_jsonparser
@@ -883,14 +1166,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph-monstore-tool
 %{_bindir}/ceph-osdomap-tool
 %{_bindir}/ceph-kvstore-tool
-%{_mandir}/man8/rbd-replay.8*
-%{_mandir}/man8/rbd-replay-many.8*
-%{_mandir}/man8/rbd-replay-prep.8*
-%{_bindir}/rbd-replay
-%{_bindir}/rbd-replay-many
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-%{_bindir}/rbd-replay-prep
-%endif
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph-monstore-update-crush.sh
 
 #################################################################################
 %if 0%{with cephfs_java}
@@ -898,6 +1175,12 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
 
+%post -n libcephfs_jni1
+/sbin/ldconfig
+
+%postun -n libcephfs_jni1
+/sbin/ldconfig
+
 #################################################################################
 %files -n libcephfs_jni1-devel
 %defattr(-,root,root,-)
@@ -911,6 +1194,111 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with selinux}
+%files selinux
+%defattr(-,root,root,-)
+%attr(0600,root,root) %{_datadir}/selinux/packages/ceph.pp
+%{_datadir}/selinux/devel/include/contrib/ceph.if
+%{_mandir}/man8/ceph_selinux.8*
+
+%post selinux
+# Install the policy
+OLD_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+%{_sbindir}/semodule -n -i %{_datadir}/selinux/packages/ceph.pp
+NEW_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+
+# Load the policy if SELinux is enabled
+if %{_sbindir}/selinuxenabled; then
+    %{_sbindir}/load_policy
+else
+    # Do not relabel if selinux is not enabled
+    exit 0
+fi
+
+if test "$OLD_POLVER" == "$NEW_POLVER"; then
+   # Do not relabel if policy version did not change
+   exit 0
+fi
+
+# Check whether the daemons are running
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph status >/dev/null 2>&1
+%endif
+STATUS=$?
+
+# Stop the daemons if they were running
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph stop >/dev/null 2>&1
+%endif
+fi
+
+# Now, relabel the files
+%relabel_files
+
+# Start the daemons iff they were running before
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+%else
+    /sbin/service ceph start >/dev/null 2>&1 || :
+%endif
+fi
+
+exit 0
+
+%postun selinux
+if [ $1 -eq 0 ]; then
+    # Remove the module
+    %{_sbindir}/semodule -n -r ceph
+
+    # Reload the policy if SELinux is enabled
+    if %{_sbindir}/selinuxenabled ; then
+        %{_sbindir}/load_policy
+    else
+        # Do not relabel if SELinux is not enabled
+        exit 0
+    fi
+
+    # Check whether the daemons are running
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph status >/dev/null 2>&1
+    %endif
+    STATUS=$?
+
+    # Stop the daemons if they were running
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph stop >/dev/null 2>&1
+    %endif
+    fi
+
+    # Now, relabel the files
+    %relabel_files
+
+    # Start the daemons if they were running before
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+	/usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+    %else
+	/sbin/service ceph start >/dev/null 2>&1 || :
+    %endif
+    fi
+fi
+exit 0
+
+%endif # with selinux
+
+#################################################################################
+%if 0%{with libs_compat}
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
@@ -919,6 +1307,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
+%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/ceph.spec.in b/ceph.spec.in
index 140e0e3..8f2a6fc 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -1,13 +1,55 @@
+# vim: set noexpandtab ts=8 sw=8 :
 %bcond_with ocf
 %bcond_without cephfs_java
+%bcond_with tests
+%bcond_without tcmalloc
+%bcond_without libs_compat
+%bcond_with lowmem_builder
+%if 0%{?fedora} || 0%{?rhel}
+%bcond_without selinux
+%endif
+%if 0%{?suse_version}
+%bcond_with selinux
+%endif
+
 
-%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
+%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%if %{with selinux}
+# get selinux policy version
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null || echo 0.0.0)}
+
+%define relabel_files() \
+restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \
+restorecon -R /usr/bin/radosgw > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \
+restorecon -R /var/run/ceph > /dev/null 2>&1; \
+restorecon -R /var/lib/ceph > /dev/null 2>&1; \
+restorecon -R /var/log/ceph > /dev/null 2>&1;
+%endif
+
 %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
 
+# Use systemd files on RHEL 7 and above and in SUSE/openSUSE.
+# Note: We don't install unit files for the services yet. For now,
+# the _with_systemd variable only implies that we'll install
+# /etc/tmpfiles.d/ceph.conf in order to set up the socket directory in
+# /var/run/ceph.
+%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
+%global _with_systemd 1
+%endif
+
+# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
+%global _with_lttng 1
+%endif
+
 #################################################################################
 # common
 #################################################################################
@@ -16,51 +58,67 @@ Version:	@VERSION@
 Release:	@RPM_RELEASE@%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
-License:	GPL-2.0
-Group:		System Environment/Base
+License:	LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
 URL:		http://ceph.com/
 Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
-%if 0%{?fedora} || 0%{?centos} || 0%{?rhel}
+%if 0%{?fedora} || 0%{?rhel}
 Patch0:		init-ceph.in-fedora.patch
 %endif
+#################################################################################
+# dependencies that apply across all distro families
+#################################################################################
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
 Requires:	python-requests
-Requires:	python-flask
+Requires:	grep
 Requires:	xfsprogs
+Requires:	logrotate
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
 Requires:	cryptsetup
+Requires:	findutils
+Requires:	which
 Requires(post):	binutils
+%if 0%{with cephfs_java}
+BuildRequires:	java-devel
+BuildRequires:	sharutils
+%endif
+%if 0%{with selinux}
+BuildRequires:	checkpolicy
+BuildRequires:	selinux-policy-devel
+BuildRequires:	/usr/share/selinux/devel/policyhelp
+%endif
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if 0%{defined suse_version}
-BuildRequires:  libbz2-devel
-%else
-BuildRequires:  bzip2-devel
-%endif
+BuildRequires:  cmake
 BuildRequires:	cryptsetup
+BuildRequires:	fuse-devel
 BuildRequires:	gdbm
 BuildRequires:	hdparm
+BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	libaio-devel
 BuildRequires:	libcurl-devel
 BuildRequires:	libedit-devel
 BuildRequires:	libxml2-devel
-BuildRequires:	libuuid-devel
 BuildRequires:	libblkid-devel >= 2.17
 BuildRequires:	libudev-devel
 BuildRequires:	libtool
-BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	make
-BuildRequires:	perl
 BuildRequires:	parted
+BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
 BuildRequires:	python-nose
@@ -72,46 +130,86 @@ BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?suse_version}
-BuildRequires:	net-tools
-%endif
 
 #################################################################################
-# specific
+# distro-conditional dependencies
 #################################################################################
-%if ! 0%{?rhel} || 0%{?fedora}
-BuildRequires:	sharutils
+%if 0%{?suse_version}
+%if 0%{?_with_systemd}
+BuildRequires:  pkgconfig(systemd)
+BuildRequires:	systemd-rpm-macros
+%{?systemd_requires}
 %endif
-
-%if 0%{defined suse_version}
+PreReq:		%fillup_prereq
+Requires:	python-Flask
+BuildRequires:	net-tools
+BuildRequires:	libbz2-devel
 %if 0%{?suse_version} > 1210
 Requires:	gptfdisk
+%if 0%{with tcmalloc}
 BuildRequires:	gperftools-devel
+%endif
 %else
 Requires:	scsirastools
 BuildRequires:	google-perftools-devel
 %endif
-Recommends:	logrotate
-BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
 %else
-Requires:	gdisk
+%if 0%{?_with_systemd}
+Requires:	systemd
+%endif
+BuildRequires:  bzip2-devel
 BuildRequires:	nss-devel
 BuildRequires:	keyutils-libs-devel
 BuildRequires:	libatomic_ops-devel
 Requires:	gdisk
 Requires(post):	chkconfig
-Requires(preun):chkconfig
-Requires(preun):initscripts
+Requires(preun):	chkconfig
+Requires(preun):	initscripts
 BuildRequires:	gperftools-devel
+Requires:	python-flask
+%endif
+# boost
+%if 0%{?fedora} || 0%{?rhel} 
+BuildRequires:  boost-random
+%endif
+# python-argparse for distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+BuildRequires:	python-argparse
+%endif
+# lttng and babeltrace for rbd-replay-prep
+%if 0%{?_with_lttng}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires:	lttng-ust-devel
+BuildRequires:	libbabeltrace-devel
+%endif
+%if 0%{?suse_version}
+BuildRequires:	lttng-ust-devel
+BuildRequires:  babeltrace-devel
+%endif
+%endif
+# expat and fastcgi for RGW
+%if 0%{?suse_version}
+BuildRequires:	libexpat-devel
+BuildRequires:	FastCGI-devel
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+BuildRequires:	expat-devel
+BuildRequires:	fcgi-devel
+%endif
+# python-sphinx
+%if 0%{?rhel} > 0 && 0%{?rhel} < 7
+BuildRequires:	python-sphinx10
+%endif
+%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7
+BuildRequires:	python-sphinx
 %endif
 
 %description
-Ceph is a massively scalable, open-source, distributed
-storage system that runs on commodity hardware and delivers object,
-block and file system storage.
+Ceph is a massively scalable, open-source, distributed storage system that runs
+on commodity hardware and delivers object, block and file system storage.
 
 
 #################################################################################
@@ -126,13 +224,15 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{?rhel} || 0%{?fedora}
-Requires:  redhat-lsb-core
+%if 0%{?_with_systemd}
+%{?systemd_requires}
+%endif
+%if 0%{?suse_version}
+Requires(pre):	pwdutils
 %endif
 # python-argparse is only needed in distros with Python 2.6 or lower
 %if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
 Requires:	python-argparse
-BuildRequires:	python-argparse
 %endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
@@ -141,7 +241,6 @@ Common utilities to mount and interact with a ceph storage cluster.
 Summary:	Ceph fuse-based client
 Group:		System Environment/Base
 Requires:	%{name}
-BuildRequires:	fuse-devel
 %description fuse
 FUSE based client for Ceph distributed network file system
 
@@ -151,7 +250,6 @@ Group:		System Environment/Base
 Requires:	%{name}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
-BuildRequires:	fuse-devel
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
@@ -159,13 +257,11 @@ FUSE based client to map Ceph rbd images to files
 Summary:	Rados REST gateway
 Group:		Development/Libraries
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-BuildRequires:	libexpat-devel
-BuildRequires:	FastCGI-devel
-%else
-BuildRequires:	expat-devel
-BuildRequires:	fcgi-devel
+%if 0%{?rhel} || 0%{?fedora}
 Requires:	mailcap
 %endif
 %description radosgw
@@ -190,7 +286,7 @@ managers such as Pacemaker.
 Summary:	RADOS distributed object store client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librados2
@@ -223,7 +319,7 @@ object store.
 Summary:	RADOS striping interface
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-Requires:	librados2 = %{epoch}:%{version}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
 %description -n libradosstriper1
 Striping interface built on top of the rados library, allowing
 to stripe bigger objects onto several standard rados objects using
@@ -245,7 +341,7 @@ Summary:	RADOS block device client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librbd1
@@ -280,7 +376,7 @@ block device.
 Summary:	Ceph distributed file system client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 Obsoletes:	ceph-libcephfs
 %endif
@@ -312,41 +408,29 @@ Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
 This package contains Python libraries for interacting with Cephs distributed
 file system.
 
-%package -n rest-bench
-Summary:	RESTful benchmark
-Group:		System Environment/Libraries
-License:	LGPL-2.0
-Requires:	ceph-common = %{epoch}:%{version}-%{release}
-%description -n rest-bench
-RESTful bencher that can be used to benchmark radosgw performance.
-
 %package -n ceph-test
 Summary:	Ceph benchmarks and test tools
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	ceph-common
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-BuildRequires:	lttng-ust-devel
-BuildRequires:	libbabeltrace-devel
-%endif
+Requires:	xmlstarlet
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
 %if 0%{with cephfs_java}
 
 %package -n libcephfs_jni1
-Summary:	Java Native Interface library for CephFS Java bindings.
+Summary:	Java Native Interface library for CephFS Java bindings
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %description -n libcephfs_jni1
 This package contains the Java Native Interface library for CephFS Java
 bindings.
 
 %package -n libcephfs_jni1-devel
-Summary:	Development files for CephFS Java Native Interface library.
+Summary:	Development files for CephFS Java Native Interface library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
@@ -357,12 +441,11 @@ This package contains the development files for CephFS Java Native Interface
 library.
 
 %package -n cephfs-java
-Summary:	Java libraries for the Ceph File System.
+Summary:	Java libraries for the Ceph File System
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %if 0%{?el6}
 Requires:	junit4
 BuildRequires:	junit4
@@ -375,8 +458,26 @@ This package contains the Java libraries for the Ceph File System.
 
 %endif
 
+%if 0%{with selinux}
+
+%package selinux
+Summary:	SELinux support for Ceph MON, OSD and MDS
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	policycoreutils, libselinux-utils
+Requires(post): selinux-policy-base >= %{_selinux_policy_version}, policycoreutils, gawk
+Requires(postun): policycoreutils
+%description selinux
+This package contains SELinux support for Ceph MON, OSD and MDS. The package
+also performs file-system relabelling which can take a long time on heavily
+populated file-systems.
+
+%endif
+
+%if 0%{with libs_compat}
+
 %package libs-compat
-Summary:	Meta package to include ceph libraries.
+Summary:	Meta package to include ceph libraries
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Obsoletes:	ceph-libs
@@ -392,6 +493,8 @@ former ceph-libs package, which is now split up into these three subpackages.
 Packages still depending on ceph-libs should be fixed to depend on librados2,
 librbd1 or libcephfs1 instead.
 
+%endif
+
 %package devel-compat
 Summary:	Compatibility package for Ceph headers
 Group:		Development/Libraries
@@ -427,16 +530,12 @@ python-rados, python-rbd and python-cephfs. Packages still depending on
 python-ceph should be fixed to depend on python-rados, python-rbd or
 python-cephfs instead.
 
-%if 0%{?opensuse} || 0%{?suse_version}
-%debug_package
-%endif
-
 #################################################################################
 # common
 #################################################################################
 %prep
 %setup -q
-%if 0%{?fedora} || 0%{?rhel} || 0%{?centos}
+%if 0%{?fedora} || 0%{?rhel}
 %patch0 -p1 -b .init
 %endif
 
@@ -449,53 +548,91 @@ done
 %endif
 
 ./autogen.sh
-MY_CONF_OPT=""
-
-MY_CONF_OPT="$MY_CONF_OPT --with-radosgw"
 
+%if %{with lowmem_builder}
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS --param ggc-min-expand=20 --param ggc-min-heapsize=32768"
+%endif
 export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 
 %{configure}	CPPFLAGS="$java_inc" \
 		--prefix=/usr \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?_with_systemd}
+		--with-systemdsystemunitdir=%_unitdir \
+%endif
 		--docdir=%{_docdir}/ceph \
+		--with-man-pages \
+		--mandir="%_mandir" \
 		--with-nss \
 		--without-cryptopp \
-		--with-rest-bench \
 		--with-debug \
 %if 0%{with cephfs_java}
 		--enable-cephfs-java \
 %endif
+%if 0%{with selinux}
+		--with-selinux \
+%endif
 		--with-librocksdb-static=check \
-		$MY_CONF_OPT \
+%if 0%{?rhel} || 0%{?fedora}
+		--with-systemd-libexec-dir=/usr/libexec/ceph \
+		--with-rgw-user=root \
+		--with-rgw-group=root \
+%endif
+%if 0%{?suse_version}
+		--with-systemd-libexec-dir=/usr/lib/ceph/ \
+		--with-rgw-user=wwwrun \
+		--with-rgw-group=www \
+%endif
+		--with-radosgw \
+		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
+		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
-# fix bug in specific version of libedit-devel
-%if 0%{defined suse_version}
-sed -i -e "s/-lcurses/-lncurses/g" Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" man/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/ocf/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/java/Makefile
+
+make %{?_smp_mflags}
+
+
+%if 0%{with tests}
+%check
+# run in-tree unittests
+make %{?_smp_mflags} check-local
+
 %endif
 
-make -j$(getconf _NPROCESSORS_ONLN)
+
 
 %install
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
+install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+%if 0%{?fedora} || 0%{?rhel}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillup-templates/sysconfig.%{name}
+%endif
+%if 0%{?_with_systemd}
+  install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
+  install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
+  install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
+  install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
+  install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
+  install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
+  install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
+%else
+  install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
+  install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+  ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
+  ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
-ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
-ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
 install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
-install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
@@ -523,40 +660,89 @@ mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph
 
 #set up placeholder directories
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph
+%if ! 0%{?_with_systemd}
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mon
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
-mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw
 
 %clean
 rm -rf $RPM_BUILD_ROOT
 
+%pre
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    # service_add_pre and friends don't work with parameterized systemd service
+    # instances, only with single services or targets, so we always pass
+    # ceph.target to these macros
+    %service_add_pre ceph.target
+  %endif
+%endif
+
+
 %post
 /sbin/ldconfig
-/sbin/chkconfig --add ceph
-mkdir -p %{_localstatedir}/run/ceph/
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %fillup_only
+    %service_add_post ceph.target
+  %endif
+%else
+  /sbin/chkconfig --add ceph
+%endif
 
 %preun
-%if %{defined suse_version}
-%stop_on_removal ceph
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %service_del_preun ceph.target
+  %endif
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%else
+  %if 0%{?rhel} || 0%{?fedora}
+    if [ $1 = 0 ] ; then
+      /sbin/service ceph stop >/dev/null 2>&1
+      /sbin/chkconfig --del ceph
+    fi
+  %endif
 %endif
-if [ $1 = 0 ] ; then
-    /sbin/service ceph stop >/dev/null 2>&1
-    /sbin/chkconfig --del ceph
-fi
 
 %postun
 /sbin/ldconfig
-%if %{defined suse_version}
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
 
-
 #################################################################################
 # files
 #################################################################################
@@ -578,16 +764,26 @@ fi
 %{_bindir}/ceph-mds
 %{_bindir}/ceph-objectstore-tool
 %{_bindir}/ceph-osd
+%{_bindir}/ceph-detect-init
 %{_bindir}/librados-config
 %{_bindir}/ceph-client-debug
 %{_bindir}/cephfs-journal-tool
 %{_bindir}/cephfs-table-tool
+%{_bindir}/cephfs-data-scan
 %{_bindir}/ceph-debugpack
 %{_bindir}/ceph-coverage
+%if 0%{?_with_systemd}
+%{_unitdir}/ceph-mds at .service
+%{_unitdir}/ceph-mon at .service
+%{_unitdir}/ceph-create-keys at .service
+%{_unitdir}/ceph-osd at .service
+%{_unitdir}/ceph-radosgw at .service
+%{_unitdir}/ceph-disk at .service
+%{_unitdir}/ceph.target
+%else
 %{_initrddir}/ceph
+%endif
 %{_sbindir}/ceph-disk
-%{_sbindir}/ceph-disk-activate
-%{_sbindir}/ceph-disk-prepare
 %{_sbindir}/ceph-disk-udev
 %{_sbindir}/ceph-create-keys
 %{_sbindir}/rcceph
@@ -600,8 +796,10 @@ fi
 %{_libdir}/ceph/ceph_common.sh
 %{_libexecdir}/ceph/ceph-osd-prestart.sh
 %dir %{_libdir}/rados-classes
+%{_libdir}/rados-classes/libcls_cephfs.so*
 %{_libdir}/rados-classes/libcls_rbd.so*
 %{_libdir}/rados-classes/libcls_hello.so*
+%{_libdir}/rados-classes/libcls_numops.so*
 %{_libdir}/rados-classes/libcls_rgw.so*
 %{_libdir}/rados-classes/libcls_lock.so*
 %{_libdir}/rados-classes/libcls_kvs.so*
@@ -609,19 +807,30 @@ fi
 %{_libdir}/rados-classes/libcls_log.so*
 %{_libdir}/rados-classes/libcls_replica_log.so*
 %{_libdir}/rados-classes/libcls_statelog.so*
+%{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
+%if 0%{?_with_lttng}
+%{_libdir}/libos_tp.so*
+%{_libdir}/libosd_tp.so*
+%endif
 %{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 %{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
+%if 0%{?fedora} || 0%{?rhel}
+%config(noreplace) %{_sysconfdir}/sysconfig/ceph
+%endif
 %if 0%{?suse_version}
+%{_localstatedir}/adm/fillup-templates/sysconfig.*
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
 %endif
+%{python_sitelib}/ceph_detect_init*
 %{_mandir}/man8/ceph-deploy.8*
+%{_mandir}/man8/ceph-detect-init.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-create-keys.8*
 %{_mandir}/man8/ceph-mon.8*
@@ -638,14 +847,16 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/librados-config.8*
 #set up placeholder directories
-%dir %{_localstatedir}/lib/ceph/
-%dir %{_localstatedir}/lib/ceph/tmp
-%dir %{_localstatedir}/lib/ceph/mon
-%dir %{_localstatedir}/lib/ceph/osd
-%dir %{_localstatedir}/lib/ceph/mds
-%dir %{_localstatedir}/lib/ceph/bootstrap-osd
-%dir %{_localstatedir}/lib/ceph/bootstrap-mds
-%ghost %dir %{_localstatedir}/run/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw
+%if ! 0%{?_with_systemd}
+%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph
+%endif
 
 #################################################################################
 %files -n ceph-common
@@ -659,8 +870,16 @@ fi
 %{_bindir}/ceph-crush-location
 %{_bindir}/rados
 %{_bindir}/rbd
+%{_bindir}/rbd-replay
+%{_bindir}/rbd-replay-many
+%if 0%{?_with_lttng}
+%{_bindir}/rbd-replay-prep
+%endif
 %{_bindir}/ceph-post-file
 %{_bindir}/ceph-brag
+%if 0%{?_with_systemd}
+%{_tmpfilesdir}/ceph-common.conf
+%endif
 %{_mandir}/man8/ceph-authtool.8*
 %{_mandir}/man8/ceph-conf.8*
 %{_mandir}/man8/ceph-dencoder.8*
@@ -670,17 +889,46 @@ fi
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbd-replay.8*
+%{_mandir}/man8/rbd-replay-many.8*
+%{_mandir}/man8/rbd-replay-prep.8*
 %{_datadir}/ceph/known_hosts_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com.pub
 %dir %{_sysconfdir}/ceph/
-%dir %{_localstatedir}/log/ceph/
+%dir %{_datarootdir}/ceph/
+%dir %{_libexecdir}/ceph/
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
+%{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
+%attr(3770,ceph,ceph) %dir %{_localstatedir}/log/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
+
+%pre -n ceph-common
+CEPH_GROUP_ID=""
+CEPH_USER_ID=""
+%if 0%{?rhel} || 0%{?fedora}
+CEPH_GROUP_ID="-g 167"
+CEPH_USER_ID="-u 167"
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%endif
+%if 0%{?suse_version}
+getent group ceph >/dev/null || groupadd -r ceph
+getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+%endif
+exit 0
+
+%post -n ceph-common
+%if 0%{?_with_systemd}
+systemd-tmpfiles --create --prefix=/run/ceph
+%endif
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -709,38 +957,62 @@ fi
 #################################################################################
 %files radosgw
 %defattr(-,root,root,-)
-%{_initrddir}/ceph-radosgw
 %{_bindir}/radosgw
 %{_bindir}/radosgw-admin
+%{_bindir}/radosgw-object-expirer
 %{_mandir}/man8/radosgw.8*
 %{_mandir}/man8/radosgw-admin.8*
-%{_sbindir}/rcceph-radosgw
-%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
 %config %{_sysconfdir}/bash_completion.d/radosgw-admin
-%dir %{_localstatedir}/log/radosgw/
+%dir %{_localstatedir}/lib/ceph/radosgw
+%if 0%{?_with_systemd}
+%else
+%{_initrddir}/ceph-radosgw
+%{_sbindir}/rcceph-radosgw
+%endif
 
 %post radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%fillup_and_insserv -f -y ceph-radosgw
+%if 0%{?suse_version}
+  # explicit systemctl daemon-reload (that's the only relevant bit of
+  # service_add_post; the rest is all sysvinit --> systemd migration which
+  # isn't applicable in this context (see above comment).
+  /usr/bin/systemctl daemon-reload >/dev/null 2>&1 || :
 %endif
 
 %preun radosgw
-%if %{defined suse_version}
-%stop_on_removal ceph-radosgw
+%if 0%{?_with_systemd}
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
 %endif
 
 %postun radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%restart_on_update ceph-radosgw
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
-# Package removal cleanup
-if [ "$1" -eq "0" ] ; then
-    rm -rf /var/log/radosgw
-fi
-
 
 #################################################################################
 %if %{with ocf}
@@ -756,6 +1028,9 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so.*
+%endif
 
 %post -n librados2
 /sbin/ldconfig
@@ -776,6 +1051,9 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so
+%endif
 
 #################################################################################
 %files -n python-rados
@@ -805,6 +1083,9 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so.*
+%endif
 
 %post -n librbd1
 /sbin/ldconfig
@@ -822,6 +1103,9 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so
+%endif
 
 #################################################################################
 %files -n python-rbd
@@ -852,11 +1136,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{python_sitelib}/cephfs.py*
 
 #################################################################################
-%files -n rest-bench
-%defattr(-,root,root,-)
-%{_bindir}/rest-bench
-
-#################################################################################
 %files -n ceph-test
 %defattr(-,root,root,-)
 %{_bindir}/ceph_bench_log
@@ -865,7 +1144,11 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph_erasure_code
 %{_bindir}/ceph_erasure_code_benchmark
 %{_bindir}/ceph_omapbench
+%{_bindir}/ceph_objectstore_bench
 %{_bindir}/ceph_perf_objectstore
+%{_bindir}/ceph_perf_local
+%{_bindir}/ceph_perf_msgr_client
+%{_bindir}/ceph_perf_msgr_server
 %{_bindir}/ceph_psim
 %{_bindir}/ceph_radosacl
 %{_bindir}/ceph_rgw_jsonparser
@@ -883,14 +1166,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph-monstore-tool
 %{_bindir}/ceph-osdomap-tool
 %{_bindir}/ceph-kvstore-tool
-%{_mandir}/man8/rbd-replay.8*
-%{_mandir}/man8/rbd-replay-many.8*
-%{_mandir}/man8/rbd-replay-prep.8*
-%{_bindir}/rbd-replay
-%{_bindir}/rbd-replay-many
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-%{_bindir}/rbd-replay-prep
-%endif
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph-monstore-update-crush.sh
 
 #################################################################################
 %if 0%{with cephfs_java}
@@ -898,6 +1175,12 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
 
+%post -n libcephfs_jni1
+/sbin/ldconfig
+
+%postun -n libcephfs_jni1
+/sbin/ldconfig
+
 #################################################################################
 %files -n libcephfs_jni1-devel
 %defattr(-,root,root,-)
@@ -911,6 +1194,111 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with selinux}
+%files selinux
+%defattr(-,root,root,-)
+%attr(0600,root,root) %{_datadir}/selinux/packages/ceph.pp
+%{_datadir}/selinux/devel/include/contrib/ceph.if
+%{_mandir}/man8/ceph_selinux.8*
+
+%post selinux
+# Install the policy
+OLD_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+%{_sbindir}/semodule -n -i %{_datadir}/selinux/packages/ceph.pp
+NEW_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+
+# Load the policy if SELinux is enabled
+if %{_sbindir}/selinuxenabled; then
+    %{_sbindir}/load_policy
+else
+    # Do not relabel if selinux is not enabled
+    exit 0
+fi
+
+if test "$OLD_POLVER" == "$NEW_POLVER"; then
+   # Do not relabel if policy version did not change
+   exit 0
+fi
+
+# Check whether the daemons are running
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph status >/dev/null 2>&1
+%endif
+STATUS=$?
+
+# Stop the daemons if they were running
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph stop >/dev/null 2>&1
+%endif
+fi
+
+# Now, relabel the files
+%relabel_files
+
+# Start the daemons iff they were running before
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+%else
+    /sbin/service ceph start >/dev/null 2>&1 || :
+%endif
+fi
+
+exit 0
+
+%postun selinux
+if [ $1 -eq 0 ]; then
+    # Remove the module
+    %{_sbindir}/semodule -n -r ceph
+
+    # Reload the policy if SELinux is enabled
+    if %{_sbindir}/selinuxenabled ; then
+        %{_sbindir}/load_policy
+    else
+        # Do not relabel if SELinux is not enabled
+        exit 0
+    fi
+
+    # Check whether the daemons are running
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph status >/dev/null 2>&1
+    %endif
+    STATUS=$?
+
+    # Stop the daemons if they were running
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph stop >/dev/null 2>&1
+    %endif
+    fi
+
+    # Now, relabel the files
+    %relabel_files
+
+    # Start the daemons if they were running before
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+	/usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+    %else
+	/sbin/service ceph start >/dev/null 2>&1 || :
+    %endif
+    fi
+fi
+exit 0
+
+%endif # with selinux
+
+#################################################################################
+%if 0%{with libs_compat}
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
@@ -919,6 +1307,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
+%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/configure b/configure
index ced8866..4b93790 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 0.94.5.
+# Generated by GNU Autoconf 2.69 for ceph 9.2.0.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.94.5'
-PACKAGE_STRING='ceph 0.94.5'
+PACKAGE_VERSION='9.2.0'
+PACKAGE_STRING='ceph 9.2.0'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -647,6 +647,10 @@ PYTHON_VERSION
 PYTHON
 WITH_BUILD_TESTS_FALSE
 WITH_BUILD_TESTS_TRUE
+systemd_unit_dir
+group_rgw
+user_rgw
+systemd_libexec_dir
 VALGRIND_ENABLED_FALSE
 VALGRIND_ENABLED_TRUE
 HAVE_VALGRIND
@@ -657,6 +661,7 @@ LTTNG_GEN_TP_CHECK
 WITH_LTTNG_FALSE
 WITH_LTTNG_TRUE
 BOOST_PROGRAM_OPTIONS_LIBS
+BOOST_RANDOM_LIBS
 BOOST_THREAD_LIBS
 USE_BOOST_SPIRIT_OLD_HDR_FALSE
 USE_BOOST_SPIRIT_OLD_HDR_TRUE
@@ -668,10 +673,6 @@ WITH_LIBXFS_FALSE
 WITH_LIBXFS_TRUE
 WITH_LIBAIO_FALSE
 WITH_LIBAIO_TRUE
-WITH_REST_BENCH_FALSE
-WITH_REST_BENCH_TRUE
-WITH_SYSTEM_LIBS3_FALSE
-WITH_SYSTEM_LIBS3_TRUE
 WITH_LIBROCKSDB_FALSE
 WITH_LIBROCKSDB_TRUE
 WITH_SLIBROCKSDB_FALSE
@@ -680,7 +681,6 @@ WITH_DLIBROCKSDB_FALSE
 WITH_DLIBROCKSDB_TRUE
 LIBROCKSDB_LIBS
 LIBROCKSDB_CFLAGS
-HAVE_CXX11
 WITH_KINETIC_FALSE
 WITH_KINETIC_TRUE
 HAVE_SSE4_PCLMUL_FALSE
@@ -695,9 +695,12 @@ INTEL_SSSE3_FLAGS
 INTEL_SSE3_FLAGS
 INTEL_SSE2_FLAGS
 INTEL_SSE_FLAGS
+HAVE_ARMV8_CRC_FALSE
+HAVE_ARMV8_CRC_TRUE
 HAVE_NEON_FALSE
 HAVE_NEON_TRUE
 ARM_FLAGS
+ARM_CRC_FLAGS
 ARM_NEON_FLAGS
 WITH_OCF_FALSE
 WITH_OCF_TRUE
@@ -727,7 +730,8 @@ WITH_JEMALLOC_TRUE
 LIBJEMALLOC
 WITH_FUSE_FALSE
 WITH_FUSE_TRUE
-LIBFUSE
+LIBFUSE_LIBS
+LIBFUSE_CFLAGS
 WITH_RADOSGW_FALSE
 WITH_RADOSGW_TRUE
 GCOV_PREFIX_STRIP
@@ -737,8 +741,8 @@ WITH_DEBUG_FALSE
 WITH_DEBUG_TRUE
 WITH_PROFILER_FALSE
 WITH_PROFILER_TRUE
-ENABLE_ROOT_MAKE_CHECK_FALSE
-ENABLE_ROOT_MAKE_CHECK_TRUE
+NO_GIT_VERSION_FALSE
+NO_GIT_VERSION_TRUE
 CRYPTO_LIBS
 CRYPTO_CFLAGS
 NSS_LIBS
@@ -776,6 +780,8 @@ WITH_MON_FALSE
 WITH_MON_TRUE
 WITH_RADOSSTRIPER_FALSE
 WITH_RADOSSTRIPER_TRUE
+WITH_SELINUX_FALSE
+WITH_SELINUX_TRUE
 WITH_CEPHFS_FALSE
 WITH_CEPHFS_TRUE
 WITH_RBD_FALSE
@@ -785,68 +791,29 @@ WITH_RADOS_TRUE
 AM_CXXFLAGS
 CLANG_FALSE
 CLANG_TRUE
-CXXCPP
-am__fastdepCXX_FALSE
-am__fastdepCXX_TRUE
-CXXDEPMODE
-ac_ct_CXX
-CXXFLAGS
-CXX
 DARWIN_FALSE
 DARWIN_TRUE
 FREEBSD_FALSE
 FREEBSD_TRUE
 LINUX_FALSE
 LINUX_TRUE
-am__fastdepCCAS_FALSE
-am__fastdepCCAS_TRUE
-CCASDEPMODE
-CCASFLAGS
-CCAS
-CPP
-OTOOL64
-OTOOL
-LIPO
-NMEDIT
-DSYMUTIL
-MANIFEST_TOOL
-RANLIB
-DLLTOOL
-OBJDUMP
-LN_S
-NM
-ac_ct_DUMPBIN
-DUMPBIN
-LD
-FGREP
-EGREP
-GREP
-SED
-LIBTOOL
 AM_BACKSLASH
 AM_DEFAULT_VERBOSITY
 AM_DEFAULT_V
 AM_V
+am__fastdepCXX_FALSE
+am__fastdepCXX_TRUE
+CXXDEPMODE
 am__fastdepCC_FALSE
 am__fastdepCC_TRUE
 CCDEPMODE
-am__nodep
-AMDEPBACKSLASH
-AMDEP_FALSE
-AMDEP_TRUE
-am__quote
-am__include
-DEPDIR
 am__untar
 am__tar
 AMTAR
-am__leading_dot
 SET_MAKE
-AWK
 mkdir_p
 MKDIR_P
 INSTALL_STRIP_PROGRAM
-STRIP
 install_sh
 MAKEINFO
 AUTOHEADER
@@ -860,11 +827,43 @@ am__isrc
 INSTALL_DATA
 INSTALL_SCRIPT
 INSTALL_PROGRAM
-OBJEXT
-EXEEXT
+am__fastdepCCAS_FALSE
+am__fastdepCCAS_TRUE
+CCASDEPMODE
+am__nodep
+AMDEPBACKSLASH
+AMDEP_FALSE
+AMDEP_TRUE
+am__quote
+am__include
+DEPDIR
+am__leading_dot
+CCASFLAGS
+CCAS
+CXXCPP
+CPP
+OTOOL64
+OTOOL
+LIPO
+NMEDIT
+DSYMUTIL
+MANIFEST_TOOL
+AWK
+RANLIB
+STRIP
+DLLTOOL
+OBJDUMP
+LN_S
+NM
+ac_ct_DUMPBIN
+DUMPBIN
+LD
+FGREP
+EGREP
+GREP
+SED
+LIBTOOL
 ac_ct_CC
-CPPFLAGS
-LDFLAGS
 CFLAGS
 CC
 ac_ct_AR
@@ -882,8 +881,19 @@ build_vendor
 build_cpu
 build
 subdirs
+WITH_MAN_PAGES_FALSE
+WITH_MAN_PAGES_TRUE
+SPHINX_BUILD
 GIT_CHECK
 RPM_RELEASE
+HAVE_CXX11
+OBJEXT
+EXEEXT
+ac_ct_CXX
+CPPFLAGS
+LDFLAGS
+CXXFLAGS
+CXX
 target_alias
 host_alias
 build_alias
@@ -925,8 +935,7 @@ SHELL'
 ac_subst_files=''
 ac_user_opts='
 enable_option_checking
-enable_dependency_tracking
-enable_silent_rules
+with_man_pages
 enable_shared
 enable_static
 with_pic
@@ -934,10 +943,13 @@ enable_fast_install
 with_gnu_ld
 with_sysroot
 enable_libtool_lock
+enable_dependency_tracking
+enable_silent_rules
 with_rados
 with_rbd
 with_cephfs
 with_radosgw
+with_selinux
 with_radosstriper
 with_mon
 with_osd
@@ -946,7 +958,7 @@ enable_client
 enable_server
 with_cryptopp
 with_nss
-enable_root_make_check
+enable_gitversion
 with_profiler
 with_debug
 enable_coverage
@@ -963,30 +975,32 @@ with_ocf
 with_kinetic
 with_librocksdb
 with_librocksdb_static
-with_system_libs3
-with_rest_bench
 with_libaio
 with_libxfs
 with_libzfs
 with_lttng
 with_babeltrace
 enable_valgrind
+with_systemd_libexec_dir
+with_rgw_user
+with_rgw_group
+with_systemd_unit_dir
 '
       ac_precious_vars='build_alias
 host_alias
 target_alias
-CC
-CFLAGS
+CXX
+CXXFLAGS
 LDFLAGS
 LIBS
 CPPFLAGS
+CCC
+CC
+CFLAGS
 CPP
+CXXCPP
 CCAS
 CCASFLAGS
-CXX
-CXXFLAGS
-CCC
-CXXCPP
 PKG_CONFIG
 PKG_CONFIG_PATH
 PKG_CONFIG_LIBDIR
@@ -994,6 +1008,8 @@ CRYPTOPP_CFLAGS
 CRYPTOPP_LIBS
 NSS_CFLAGS
 NSS_LIBS
+LIBFUSE_CFLAGS
+LIBFUSE_LIBS
 LIBEDIT_CFLAGS
 LIBEDIT_LIBS
 LIBROCKSDB_CFLAGS
@@ -1542,7 +1558,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 0.94.5 to adapt to many kinds of systems.
+\`configure' configures ceph 9.2.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1613,7 +1629,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 0.94.5:";;
+     short | recursive ) echo "Configuration of ceph 9.2.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1621,21 +1637,20 @@ Optional Features:
   --disable-option-checking  ignore unrecognized --enable/--with options
   --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
   --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
+  --enable-shared[=PKGS]  build shared libraries [default=yes]
+  --enable-static[=PKGS]  build static libraries [default=yes]
+  --enable-fast-install[=PKGS]
+                          optimize for fast installation [default=yes]
+  --disable-libtool-lock  avoid locking (might break parallel builds)
   --enable-dependency-tracking
                           do not reject slow dependency extractors
   --disable-dependency-tracking
                           speeds up one-time build
   --enable-silent-rules   less verbose build output (undo: "make V=1")
   --disable-silent-rules  verbose build output (undo: "make V=0")
-  --enable-shared[=PKGS]  build shared libraries [default=yes]
-  --enable-static[=PKGS]  build static libraries [default=yes]
-  --enable-fast-install[=PKGS]
-                          optimize for fast installation [default=yes]
-  --disable-libtool-lock  avoid locking (might break parallel builds)
   --enable-client         enable client-side build
   --enable-server         enable server-side build
-  --enable-root-make-check
-                          enable make check tests that require root privileges
+  --enable-gitversion     build Ceph with git version string
   --enable-coverage       enable code coverage tracking
   --enable-pgrefdebugging enable pg ref debugging
   --enable-cephfs-java    build libcephfs Java bindings
@@ -1645,6 +1660,7 @@ Optional Features:
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
   --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
+  --with-man-pages        build man pages
   --with-pic[=PKGS]       try to use only PIC/non-PIC objects [default=use
                           both]
   --with-gnu-ld           assume the C compiler uses GNU ld [default=no]
@@ -1654,6 +1670,7 @@ Optional Packages:
   --with-rbd              build rbd files
   --with-cephfs           build cephfs files
   --with-radosgw          build RADOS gateway
+  --with-selinux          build SELinux policy
   --with-radosstriper     build radosstriper files
   --with-mon              build Ceph monitor software files
   --with-osd              build object store daemon files
@@ -1661,7 +1678,7 @@ Optional Packages:
   --with-cryptopp         Use cryptographic functions from cryptopp
   --with-nss              Use cryptographic functions from nss
   --with-profiler         build extra profiler binaries
-  --with-debug            build extra debug binaries
+  --with-debug            build extra debug binaries and tests
   --without-fuse          disable FUSE userspace client
   --with-jemalloc         enable jemalloc for memory allocations
   --with-tcmalloc-minimal enable minimal tcmalloc support for memory
@@ -1674,28 +1691,36 @@ Optional Packages:
   --with-librocksdb       build rocksdb support
   --with-librocksdb-static
                           build rocksdb support
-  --with-system-libs3     use system libs3
-  --with-rest-bench       enables rest-bench
   --without-libaio        disable libaio use by journal
   --without-libxfs        disable libxfs use by FileStore
   --with-libzfs           build ZFS support
   --with-lttng            Trace with LTTng
   --with-babeltrace       Enable Babeltrace
+  --with-systemd-libexec-dir=DIR
+                          systemd libexec directory [SYSTEMD_LIBEXEC_DIR]
+                          defaults to --libexecdir=DIR
+  --with-rgw-user=USER    systemd unit directory [USER_RGW] Defaults to
+                          "www-data"
+  --with-rgw-group=GROUP  systemd unit directory [GROUP_RGW] Defaults to
+                          "www-data"
+  --with-systemdsystemunitdir=DIR
+                          systemd unit directory [SYSTEMD_UNIT_DIR] Defaults
+                          to the correct value for debian /etc/systemd/system/
 
 Some influential environment variables:
-  CC          C compiler command
-  CFLAGS      C compiler flags
+  CXX         C++ compiler command
+  CXXFLAGS    C++ compiler flags
   LDFLAGS     linker flags, e.g. -L<lib dir> if you have libraries in a
               nonstandard directory <lib dir>
   LIBS        libraries to pass to the linker, e.g. -l<library>
   CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
               you have headers in a nonstandard directory <include dir>
+  CC          C compiler command
+  CFLAGS      C compiler flags
   CPP         C preprocessor
+  CXXCPP      C++ preprocessor
   CCAS        assembler compiler command (defaults to CC)
   CCASFLAGS   assembler compiler flags (defaults to CFLAGS)
-  CXX         C++ compiler command
-  CXXFLAGS    C++ compiler flags
-  CXXCPP      C++ preprocessor
   PKG_CONFIG  path to pkg-config utility
   PKG_CONFIG_PATH
               directories to add to pkg-config's search path
@@ -1707,6 +1732,10 @@ Some influential environment variables:
               linker flags for CRYPTOPP, overriding pkg-config
   NSS_CFLAGS  C compiler flags for NSS, overriding pkg-config
   NSS_LIBS    linker flags for NSS, overriding pkg-config
+  LIBFUSE_CFLAGS
+              C compiler flags for LIBFUSE, overriding pkg-config
+  LIBFUSE_LIBS
+              linker flags for LIBFUSE, overriding pkg-config
   LIBEDIT_CFLAGS
               C compiler flags for LIBEDIT, overriding pkg-config
   LIBEDIT_LIBS
@@ -1786,7 +1815,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 0.94.5
+ceph configure 9.2.0
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1800,6 +1829,44 @@ fi
 ## Autoconf initialization. ##
 ## ------------------------ ##
 
+# ac_fn_cxx_try_compile LINENO
+# ----------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+ac_fn_cxx_try_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext
+  if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_cxx_try_compile
+
 # ac_fn_c_try_compile LINENO
 # --------------------------
 # Try to compile conftest.$ac_ext, and return whether this succeeded.
@@ -2061,44 +2128,6 @@ $as_echo "$ac_res" >&6; }
 
 } # ac_fn_c_check_func
 
-# ac_fn_cxx_try_compile LINENO
-# ----------------------------
-# Try to compile conftest.$ac_ext, and return whether this succeeded.
-ac_fn_cxx_try_compile ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext
-  if { { ac_try="$ac_compile"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compile") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && {
-	 test -z "$ac_cxx_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest.$ac_objext; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_retval=1
-fi
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-  as_fn_set_status $ac_retval
-
-} # ac_fn_cxx_try_compile
-
 # ac_fn_cxx_try_cpp LINENO
 # ------------------------
 # Try to preprocess conftest.$ac_ext, and return whether this succeeded.
@@ -2862,7 +2891,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 0.94.5, which was
+It was created by ceph $as_me 9.2.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -3211,19 +3240,27 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
 
-# Create release string.  Used with VERSION for RPMs.
-RPM_RELEASE=0
-
-if test -d ".git" ; then
-  # Extract the first word of "git", so it can be a program name with args.
-set dummy git; ac_word=$2
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+if test -z "$CXX"; then
+  if test -n "$CCC"; then
+    CXX=$CCC
+  else
+    if test -n "$ac_tool_prefix"; then
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_GIT_CHECK+:} false; then :
+if ${ac_cv_prog_CXX+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  if test -n "$GIT_CHECK"; then
-  ac_cv_prog_GIT_CHECK="$GIT_CHECK" # Let the user override the test.
+  if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
@@ -3232,7 +3269,7 @@ do
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
   if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_GIT_CHECK="yes"
+    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -3242,190 +3279,32 @@ IFS=$as_save_IFS
 
 fi
 fi
-GIT_CHECK=$ac_cv_prog_GIT_CHECK
-if test -n "$GIT_CHECK"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $GIT_CHECK" >&5
-$as_echo "$GIT_CHECK" >&6; }
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
+$as_echo "$CXX" >&6; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
 
-  if test x"$GIT_CHECK" = x"yes"; then
-    RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; else echo "0"; fi`
-  fi
+    test -n "$CXX" && break
+  done
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: RPM_RELEASE='$RPM_RELEASE'" >&5
-$as_echo "$as_me: RPM_RELEASE='$RPM_RELEASE'" >&6;}
-
-
-
-ac_aux_dir=
-for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do
-  if test -f "$ac_dir/install-sh"; then
-    ac_aux_dir=$ac_dir
-    ac_install_sh="$ac_aux_dir/install-sh -c"
-    break
-  elif test -f "$ac_dir/install.sh"; then
-    ac_aux_dir=$ac_dir
-    ac_install_sh="$ac_aux_dir/install.sh -c"
-    break
-  elif test -f "$ac_dir/shtool"; then
-    ac_aux_dir=$ac_dir
-    ac_install_sh="$ac_aux_dir/shtool install -c"
-    break
-  fi
-done
-if test -z "$ac_aux_dir"; then
-  as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5
-fi
-
-# These three variables are undocumented and unsupported,
-# and are intended to be withdrawn in a future Autoconf release.
-# They can cause serious problems if a builder's source tree is in a directory
-# whose full name contains unusual characters.
-ac_config_guess="$SHELL $ac_aux_dir/config.guess"  # Please don't use this var.
-ac_config_sub="$SHELL $ac_aux_dir/config.sub"  # Please don't use this var.
-ac_configure="$SHELL $ac_aux_dir/configure"  # Please don't use this var.
-
-
-
-
-subdirs="$subdirs src/gmock"
-
-
-# Environment
-# Make sure we can run config.sub.
-$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 ||
-  as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5
-$as_echo_n "checking build system type... " >&6; }
-if ${ac_cv_build+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_build_alias=$build_alias
-test "x$ac_build_alias" = x &&
-  ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"`
-test "x$ac_build_alias" = x &&
-  as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5
-ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` ||
-  as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5
-$as_echo "$ac_cv_build" >&6; }
-case $ac_cv_build in
-*-*-*) ;;
-*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;;
-esac
-build=$ac_cv_build
-ac_save_IFS=$IFS; IFS='-'
-set x $ac_cv_build
-shift
-build_cpu=$1
-build_vendor=$2
-shift; shift
-# Remember, the first character of IFS is used to create $*,
-# except with old shells:
-build_os=$*
-IFS=$ac_save_IFS
-case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac
-
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5
-$as_echo_n "checking host system type... " >&6; }
-if ${ac_cv_host+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test "x$host_alias" = x; then
-  ac_cv_host=$ac_cv_build
-else
-  ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` ||
-    as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5
-$as_echo "$ac_cv_host" >&6; }
-case $ac_cv_host in
-*-*-*) ;;
-*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;;
-esac
-host=$ac_cv_host
-ac_save_IFS=$IFS; IFS='-'
-set x $ac_cv_host
-shift
-host_cpu=$1
-host_vendor=$2
-shift; shift
-# Remember, the first character of IFS is used to create $*,
-# except with old shells:
-host_os=$*
-IFS=$ac_save_IFS
-case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac
-
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking target system type" >&5
-$as_echo_n "checking target system type... " >&6; }
-if ${ac_cv_target+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test "x$target_alias" = x; then
-  ac_cv_target=$ac_cv_host
-else
-  ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` ||
-    as_fn_error $? "$SHELL $ac_aux_dir/config.sub $target_alias failed" "$LINENO" 5
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5
-$as_echo "$ac_cv_target" >&6; }
-case $ac_cv_target in
-*-*-*) ;;
-*) as_fn_error $? "invalid value of canonical target" "$LINENO" 5;;
-esac
-target=$ac_cv_target
-ac_save_IFS=$IFS; IFS='-'
-set x $ac_cv_target
-shift
-target_cpu=$1
-target_vendor=$2
-shift; shift
-# Remember, the first character of IFS is used to create $*,
-# except with old shells:
-target_os=$*
-IFS=$ac_save_IFS
-case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac
-
-
-# The aliases save the names the user supplied, while $host etc.
-# will get canonicalized.
-test -n "$target_alias" &&
-  test "$program_prefix$program_suffix$program_transform_name" = \
-    NONENONEs,x,x, &&
-  program_prefix=${target_alias}-
-
-# Fix automake problems in 1.12
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
-
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-if test -n "$ac_tool_prefix"; then
-  # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
-set dummy ${ac_tool_prefix}gcc; ac_word=$2
+if test -z "$CXX"; then
+  ac_ct_CXX=$CXX
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
+if ${ac_cv_prog_ac_ct_CXX+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  if test -n "$CC"; then
-  ac_cv_prog_CC="$CC" # Let the user override the test.
+  if test -n "$ac_ct_CXX"; then
+  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
@@ -3434,7 +3313,7 @@ do
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
   if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_CC="${ac_tool_prefix}gcc"
+    ac_cv_prog_ac_ct_CXX="$ac_prog"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -3444,57 +3323,21 @@ IFS=$as_save_IFS
 
 fi
 fi
-CC=$ac_cv_prog_CC
-if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
+ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
+if test -n "$ac_ct_CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5
+$as_echo "$ac_ct_CXX" >&6; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
 
-fi
-if test -z "$ac_cv_prog_CC"; then
-  ac_ct_CC=$CC
-  # Extract the first word of "gcc", so it can be a program name with args.
-set dummy gcc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_CC"; then
-  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_ac_ct_CC="gcc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
+  test -n "$ac_ct_CXX" && break
 done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-ac_ct_CC=$ac_cv_prog_ac_ct_CC
-if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
-$as_echo "$ac_ct_CC" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
 
-  if test "x$ac_ct_CC" = x; then
-    CC=""
+  if test "x$ac_ct_CXX" = x; then
+    CXX="g++"
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
@@ -3502,220 +3345,14 @@ yes:)
 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
-    CC=$ac_ct_CC
+    CXX=$ac_ct_CXX
   fi
-else
-  CC="$ac_cv_prog_CC"
 fi
 
-if test -z "$CC"; then
-          if test -n "$ac_tool_prefix"; then
-    # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
-set dummy ${ac_tool_prefix}cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CC"; then
-  ac_cv_prog_CC="$CC" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_CC="${ac_tool_prefix}cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
   fi
-done
-  done
-IFS=$as_save_IFS
-
 fi
-fi
-CC=$ac_cv_prog_CC
-if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-  fi
-fi
-if test -z "$CC"; then
-  # Extract the first word of "cc", so it can be a program name with args.
-set dummy cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CC"; then
-  ac_cv_prog_CC="$CC" # Let the user override the test.
-else
-  ac_prog_rejected=no
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
-       ac_prog_rejected=yes
-       continue
-     fi
-    ac_cv_prog_CC="cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-if test $ac_prog_rejected = yes; then
-  # We found a bogon in the path, so make sure we never use it.
-  set dummy $ac_cv_prog_CC
-  shift
-  if test $# != 0; then
-    # We chose a different compiler from the bogus one.
-    # However, it has the same basename, so the bogon will be chosen
-    # first if we set CC to just the basename; use the full file name.
-    shift
-    ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
-  fi
-fi
-fi
-fi
-CC=$ac_cv_prog_CC
-if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-fi
-if test -z "$CC"; then
-  if test -n "$ac_tool_prefix"; then
-  for ac_prog in cl.exe
-  do
-    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
-set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CC"; then
-  ac_cv_prog_CC="$CC" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-CC=$ac_cv_prog_CC
-if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
-$as_echo "$CC" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-    test -n "$CC" && break
-  done
-fi
-if test -z "$CC"; then
-  ac_ct_CC=$CC
-  for ac_prog in cl.exe
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_CC+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_CC"; then
-  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_ac_ct_CC="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-ac_ct_CC=$ac_cv_prog_ac_ct_CC
-if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
-$as_echo "$ac_ct_CC" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-  test -n "$ac_ct_CC" && break
-done
-
-  if test "x$ac_ct_CC" = x; then
-    CC=""
-  else
-    case $cross_compiling:$ac_tool_warned in
-yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
-ac_tool_warned=yes ;;
-esac
-    CC=$ac_ct_CC
-  fi
-fi
-
-fi
-
-
-test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "no acceptable C compiler found in \$PATH
-See \`config.log' for more details" "$LINENO" 5; }
-
 # Provide some information about the compiler.
-$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5
 set X $ac_compile
 ac_compiler=$2
 for ac_option in --version -v -V -qversion; do
@@ -3755,8 +3392,8 @@ ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
 # Try to create an executable without -o first, disregard a.out.
 # It will help us diagnose broken compilers, and finding out an intuition
 # of exeext.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5
-$as_echo_n "checking whether the C compiler works... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C++ compiler works" >&5
+$as_echo_n "checking whether the C++ compiler works... " >&6; }
 ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
 
 # The possible output files:
@@ -3826,14 +3463,14 @@ sed 's/^/| /' conftest.$ac_ext >&5
 
 { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error 77 "C compiler cannot create executables
+as_fn_error 77 "C++ compiler cannot create executables
 See \`config.log' for more details" "$LINENO" 5; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5
-$as_echo_n "checking for C compiler default output file name... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler default output file name" >&5
+$as_echo_n "checking for C++ compiler default output file name... " >&6; }
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
 $as_echo "$ac_file" >&6; }
 ac_exeext=$ac_cv_exeext
@@ -3927,7 +3564,7 @@ $as_echo "$ac_try_echo"; } >&5
     else
 	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot run C compiled programs.
+as_fn_error $? "cannot run C++ compiled programs.
 If you meant to cross compile, use \`--host'.
 See \`config.log' for more details" "$LINENO" 5; }
     fi
@@ -3989,9 +3626,9 @@ fi
 $as_echo "$ac_cv_objext" >&6; }
 OBJEXT=$ac_cv_objext
 ac_objext=$OBJEXT
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
-$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
-if ${ac_cv_c_compiler_gnu+:} false; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5
+$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; }
+if ${ac_cv_cxx_compiler_gnu+:} false; then :
   $as_echo_n "(cached) " >&6
 else
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -4008,33 +3645,33 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_cxx_try_compile "$LINENO"; then :
   ac_compiler_gnu=yes
 else
   ac_compiler_gnu=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-ac_cv_c_compiler_gnu=$ac_compiler_gnu
+ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
-$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5
+$as_echo "$ac_cv_cxx_compiler_gnu" >&6; }
 if test $ac_compiler_gnu = yes; then
-  GCC=yes
+  GXX=yes
 else
-  GCC=
+  GXX=
 fi
-ac_test_CFLAGS=${CFLAGS+set}
-ac_save_CFLAGS=$CFLAGS
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
-$as_echo_n "checking whether $CC accepts -g... " >&6; }
-if ${ac_cv_prog_cc_g+:} false; then :
+ac_test_CXXFLAGS=${CXXFLAGS+set}
+ac_save_CXXFLAGS=$CXXFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5
+$as_echo_n "checking whether $CXX accepts -g... " >&6; }
+if ${ac_cv_prog_cxx_g+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  ac_save_c_werror_flag=$ac_c_werror_flag
-   ac_c_werror_flag=yes
-   ac_cv_prog_cc_g=no
-   CFLAGS="-g"
+  ac_save_cxx_werror_flag=$ac_cxx_werror_flag
+   ac_cxx_werror_flag=yes
+   ac_cv_prog_cxx_g=no
+   CXXFLAGS="-g"
    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -4046,10 +3683,10 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_prog_cc_g=yes
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_prog_cxx_g=yes
 else
-  CFLAGS=""
+  CXXFLAGS=""
       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -4061,11 +3698,11 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+if ac_fn_cxx_try_compile "$LINENO"; then :
 
 else
-  ac_c_werror_flag=$ac_save_c_werror_flag
-	 CFLAGS="-g"
+  ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+	 CXXFLAGS="-g"
 	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -4077,245 +3714,248 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_prog_cc_g=yes
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_prog_cxx_g=yes
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-   ac_c_werror_flag=$ac_save_c_werror_flag
+   ac_cxx_werror_flag=$ac_save_cxx_werror_flag
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
-$as_echo "$ac_cv_prog_cc_g" >&6; }
-if test "$ac_test_CFLAGS" = set; then
-  CFLAGS=$ac_save_CFLAGS
-elif test $ac_cv_prog_cc_g = yes; then
-  if test "$GCC" = yes; then
-    CFLAGS="-g -O2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5
+$as_echo "$ac_cv_prog_cxx_g" >&6; }
+if test "$ac_test_CXXFLAGS" = set; then
+  CXXFLAGS=$ac_save_CXXFLAGS
+elif test $ac_cv_prog_cxx_g = yes; then
+  if test "$GXX" = yes; then
+    CXXFLAGS="-g -O2"
   else
-    CFLAGS="-g"
+    CXXFLAGS="-g"
   fi
 else
-  if test "$GCC" = yes; then
-    CFLAGS="-O2"
+  if test "$GXX" = yes; then
+    CXXFLAGS="-O2"
   else
-    CFLAGS=
+    CXXFLAGS=
   fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
-$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
-if ${ac_cv_prog_cc_c89+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_cv_prog_cc_c89=no
-ac_save_CC=$CC
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+    ax_cxx_compile_cxx11_required=true
+  ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+  ac_success=no
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features by default" >&5
+$as_echo_n "checking whether $CXX supports C++11 features by default... " >&6; }
+if ${ax_cv_cxx_compile_cxx11+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
-#include <stdarg.h>
-#include <stdio.h>
-struct stat;
-/* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
-struct buf { int x; };
-FILE * (*rcsopen) (struct buf *, struct stat *, int);
-static char *e (p, i)
-     char **p;
-     int i;
-{
-  return p[i];
-}
-static char *f (char * (*g) (char **, int), char **p, ...)
-{
-  char *s;
-  va_list v;
-  va_start (v,p);
-  s = g (p, va_arg (v,int));
-  va_end (v);
-  return s;
-}
 
-/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
-   function prototypes and stuff, but not '\xHH' hex character constants.
-   These don't provoke an error unfortunately, instead are silently treated
-   as 'x'.  The following induces an error, until -std is added to get
-   proper ANSI mode.  Curiously '\x00'!='x' always comes out true, for an
-   array size at least.  It's necessary to write '\x00'==0 to get something
-   that's true only with -std.  */
-int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
 
-/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
-   inside strings and character constants.  */
-#define FOO(x) 'x'
-int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() override {}
+    };
 
-int test (int i, double x);
-struct s1 {int (*f) (int a);};
-struct s2 {int (*f) (double a);};
-int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
-int argc;
-char **argv;
-int
-main ()
-{
-return f (e, argv, 0) != argv[0]  ||  f (e, argv, 1) != argv[1];
-  ;
-  return 0;
-}
-_ACEOF
-for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
-	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
-do
-  CC="$ac_save_CC $ac_arg"
-  if ac_fn_c_try_compile "$LINENO"; then :
-  ac_cv_prog_cc_c89=$ac_arg
-fi
-rm -f core conftest.err conftest.$ac_objext
-  test "x$ac_cv_prog_cc_c89" != "xno" && break
-done
-rm -f conftest.$ac_ext
-CC=$ac_save_CC
+    typedef check<check<bool>> right_angle_brackets;
 
-fi
-# AC_CACHE_VAL
-case "x$ac_cv_prog_cc_c89" in
-  x)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
-$as_echo "none needed" >&6; } ;;
-  xno)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
-$as_echo "unsupported" >&6; } ;;
-  *)
-    CC="$CC $ac_cv_prog_cc_c89"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
-$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
-esac
-if test "x$ac_cv_prog_cc_c89" != xno; then :
+    int a;
+    decltype(a) b;
 
-fi
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
 
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
+    auto d = a;
+    auto l = [](){};
 
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5
-$as_echo_n "checking whether $CC understands -c and -o together... " >&6; }
-if ${am_cv_prog_cc_c_o+:} false; then :
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ax_cv_cxx_compile_cxx11=yes
+else
+  ax_cv_cxx_compile_cxx11=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_cxx_compile_cxx11" >&5
+$as_echo "$ax_cv_cxx_compile_cxx11" >&6; }
+  if test x$ax_cv_cxx_compile_cxx11 = xyes; then
+    ac_success=yes
+  fi
+
+    if test x$ac_success = xno; then
+    for switch in -std=gnu++11 -std=gnu++0x; do
+      cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
+$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
+if eval \${$cachevar+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
-int
-main ()
-{
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
 
-  ;
-  return 0;
-}
-_ACEOF
-  # Make sure it works both with $CC and with simple cc.
-  # Following AC_PROG_CC_C_O, we do the test twice because some
-  # compilers refuse to overwrite an existing .o file with -o,
-  # though they will create one.
-  am_cv_prog_cc_c_o=yes
-  for am_i in 1 2; do
-    if { echo "$as_me:$LINENO: $CC -c conftest.$ac_ext -o conftest2.$ac_objext" >&5
-   ($CC -c conftest.$ac_ext -o conftest2.$ac_objext) >&5 2>&5
-   ac_status=$?
-   echo "$as_me:$LINENO: \$? = $ac_status" >&5
-   (exit $ac_status); } \
-         && test -f conftest2.$ac_objext; then
-      : OK
-    else
-      am_cv_prog_cc_c_o=no
-      break
-    fi
-  done
-  rm -f core conftest*
-  unset am_i
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5
-$as_echo "$am_cv_prog_cc_c_o" >&6; }
-if test "$am_cv_prog_cc_c_o" != yes; then
-   # Losing compiler, so override with the script.
-   # FIXME: It is wrong to rewrite CC.
-   # But if we don't then we get into trouble of one sort or another.
-   # A longer-term fix would be to have automake use am__CC in this case,
-   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
-   CC="$am_aux_dir/compile $CC"
-fi
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() override {}
+    };
 
+    typedef check<check<bool>> right_angle_brackets;
 
+    int a;
+    decltype(a) b;
 
-if test -n "$ac_tool_prefix"; then
-  for ac_prog in ar lib "link -lib"
-  do
-    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
-set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_AR+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$AR"; then
-  ac_cv_prog_AR="$AR" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_AR="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
 
-fi
-fi
-AR=$ac_cv_prog_AR
-if test -n "$AR"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AR" >&5
-$as_echo "$AR" >&6; }
+    auto d = a;
+    auto l = [](){};
+
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  eval $cachevar=yes
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+  eval $cachevar=no
 fi
-
-
-    test -n "$AR" && break
-  done
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+         CXXFLAGS="$ac_save_CXXFLAGS"
 fi
-if test -z "$AR"; then
-  ac_ct_AR=$AR
-  for ac_prog in ar lib "link -lib"
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
+eval ac_res=\$$cachevar
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi
+
+    if test x$ac_success = xno; then
+    for switch in -std=c++11 -std=c++0x; do
+      cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
+$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
+if eval \${$cachevar+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() override {}
+    };
+
+    typedef check<check<bool>> right_angle_brackets;
+
+    int a;
+    decltype(a) b;
+
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
+
+    auto d = a;
+    auto l = [](){};
+
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  eval $cachevar=yes
+else
+  eval $cachevar=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+         CXXFLAGS="$ac_save_CXXFLAGS"
+fi
+eval ac_res=\$$cachevar
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+  if test x$ax_cxx_compile_cxx11_required = xtrue; then
+    if test x$ac_success = xno; then
+      as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5
+    fi
+  else
+    if test x$ac_success = xno; then
+      HAVE_CXX11=0
+      { $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
+$as_echo "$as_me: No compiler with C++11 support was found" >&6;}
+    else
+      HAVE_CXX11=1
+
+$as_echo "#define HAVE_CXX11 1" >>confdefs.h
+
+    fi
+
+
+  fi
+
+
+# Create release string.  Used with VERSION for RPMs.
+RPM_RELEASE=0
+
+if test -d ".git" ; then
+  # Extract the first word of "git", so it can be a program name with args.
+set dummy git; ac_word=$2
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_AR+:} false; then :
+if ${ac_cv_prog_GIT_CHECK+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  if test -n "$ac_ct_AR"; then
-  ac_cv_prog_ac_ct_AR="$ac_ct_AR" # Let the user override the test.
+  if test -n "$GIT_CHECK"; then
+  ac_cv_prog_GIT_CHECK="$GIT_CHECK" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
@@ -4324,7 +3964,7 @@ do
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
   if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_ac_ct_AR="$ac_prog"
+    ac_cv_prog_GIT_CHECK="yes"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -4334,318 +3974,255 @@ IFS=$as_save_IFS
 
 fi
 fi
-ac_ct_AR=$ac_cv_prog_ac_ct_AR
-if test -n "$ac_ct_AR"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AR" >&5
-$as_echo "$ac_ct_AR" >&6; }
+GIT_CHECK=$ac_cv_prog_GIT_CHECK
+if test -n "$GIT_CHECK"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $GIT_CHECK" >&5
+$as_echo "$GIT_CHECK" >&6; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
 
-  test -n "$ac_ct_AR" && break
-done
-
-  if test "x$ac_ct_AR" = x; then
-    AR="false"
-  else
-    case $cross_compiling:$ac_tool_warned in
-yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
-ac_tool_warned=yes ;;
-esac
-    AR=$ac_ct_AR
+  if test x"$GIT_CHECK" = x"yes"; then
+    RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; else echo "0"; fi`
   fi
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: RPM_RELEASE='$RPM_RELEASE'" >&5
+$as_echo "$as_me: RPM_RELEASE='$RPM_RELEASE'" >&6;}
 
-: ${AR=ar}
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking the archiver ($AR) interface" >&5
-$as_echo_n "checking the archiver ($AR) interface... " >&6; }
-if ${am_cv_ar_interface+:} false; then :
-  $as_echo_n "(cached) " >&6
+# Check whether --with-man-pages was given.
+if test "${with_man_pages+set}" = set; then :
+  withval=$with_man_pages;
 else
-  ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
+  with_man_pages=check
+fi
 
-   am_cv_ar_interface=ar
-   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-int some_variable = 0;
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  am_ar_try='$AR cru libconftest.a conftest.$ac_objext >&5'
-      { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5
-  (eval $am_ar_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-      if test "$ac_status" -eq 0; then
-        am_cv_ar_interface=ar
-      else
-        am_ar_try='$AR -NOLOGO -OUT:conftest.lib conftest.$ac_objext >&5'
-        { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5
-  (eval $am_ar_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-        if test "$ac_status" -eq 0; then
-          am_cv_ar_interface=lib
-        else
-          am_cv_ar_interface=unknown
-        fi
-      fi
-      rm -f conftest.lib libconftest.a
+if test "x$with_man_pages" != "xno"; then :
+  for ac_prog in sphinx-1.0-build sphinx-build
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_SPHINX_BUILD+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$SPHINX_BUILD"; then
+  ac_cv_prog_SPHINX_BUILD="$SPHINX_BUILD" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_SPHINX_BUILD="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
 
 fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-   ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
+fi
+SPHINX_BUILD=$ac_cv_prog_SPHINX_BUILD
+if test -n "$SPHINX_BUILD"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SPHINX_BUILD" >&5
+$as_echo "$SPHINX_BUILD" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
 
+
+  test -n "$SPHINX_BUILD" && break
+done
+
+   if test -z "$SPHINX_BUILD" && \
+          test "x$with_man_pages" = "xyes"; then :
+  as_fn_error $? "sphinx-build not found (python-sphinx)" "$LINENO" 5
+fi
+fi
+ if test -n "$SPHINX_BUILD"; then
+  WITH_MAN_PAGES_TRUE=
+  WITH_MAN_PAGES_FALSE='#'
+else
+  WITH_MAN_PAGES_TRUE='#'
+  WITH_MAN_PAGES_FALSE=
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_ar_interface" >&5
-$as_echo "$am_cv_ar_interface" >&6; }
 
-case $am_cv_ar_interface in
-ar)
-  ;;
-lib)
-  # Microsoft lib, so override with the ar-lib wrapper script.
-  # FIXME: It is wrong to rewrite AR.
-  # But if we don't then we get into trouble of one sort or another.
-  # A longer-term fix would be to have automake use am__AR in this case,
-  # and then we could set am__AR="$am_aux_dir/ar-lib \$(AR)" or something
-  # similar.
-  AR="$am_aux_dir/ar-lib $AR"
-  ;;
-unknown)
-  as_fn_error $? "could not determine $AR interface" "$LINENO" 5
-  ;;
-esac
 
 
-# Automake
-am__api_version='1.14'
 
-# Find a good install program.  We prefer a C program (faster),
-# so one script is as good as another.  But avoid the broken or
-# incompatible versions:
-# SysV /etc/install, /usr/sbin/install
-# SunOS /usr/etc/install
-# IRIX /sbin/install
-# AIX /bin/install
-# AmigaOS /C/install, which installs bootblocks on floppy discs
-# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
-# AFS /usr/afsws/bin/install, which mishandles nonexistent args
-# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
-# OS/2's system install, which has a completely different semantic
-# ./install, which can be erroneously created by make from ./install.sh.
-# Reject install programs that cannot install multiple files.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5
-$as_echo_n "checking for a BSD-compatible install... " >&6; }
-if test -z "$INSTALL"; then
-if ${ac_cv_path_install+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    # Account for people who put trailing slashes in PATH elements.
-case $as_dir/ in #((
-  ./ | .// | /[cC]/* | \
-  /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \
-  ?:[\\/]os2[\\/]install[\\/]* | ?:[\\/]OS2[\\/]INSTALL[\\/]* | \
-  /usr/ucb/* ) ;;
-  *)
-    # OSF1 and SCO ODT 3.0 have their own names for install.
-    # Don't use installbsd from OSF since it installs stuff as root
-    # by default.
-    for ac_prog in ginstall scoinst install; do
-      for ac_exec_ext in '' $ac_executable_extensions; do
-	if as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then
-	  if test $ac_prog = install &&
-	    grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
-	    # AIX install.  It has an incompatible calling convention.
-	    :
-	  elif test $ac_prog = install &&
-	    grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
-	    # program-specific install script used by HP pwplus--don't use.
-	    :
-	  else
-	    rm -rf conftest.one conftest.two conftest.dir
-	    echo one > conftest.one
-	    echo two > conftest.two
-	    mkdir conftest.dir
-	    if "$as_dir/$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir" &&
-	      test -s conftest.one && test -s conftest.two &&
-	      test -s conftest.dir/conftest.one &&
-	      test -s conftest.dir/conftest.two
-	    then
-	      ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c"
-	      break 3
-	    fi
-	  fi
-	fi
-      done
-    done
-    ;;
-esac
+ac_aux_dir=
+for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do
+  if test -f "$ac_dir/install-sh"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/install-sh -c"
+    break
+  elif test -f "$ac_dir/install.sh"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/install.sh -c"
+    break
+  elif test -f "$ac_dir/shtool"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/shtool install -c"
+    break
+  fi
+done
+if test -z "$ac_aux_dir"; then
+  as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5
+fi
 
-  done
-IFS=$as_save_IFS
+# These three variables are undocumented and unsupported,
+# and are intended to be withdrawn in a future Autoconf release.
+# They can cause serious problems if a builder's source tree is in a directory
+# whose full name contains unusual characters.
+ac_config_guess="$SHELL $ac_aux_dir/config.guess"  # Please don't use this var.
+ac_config_sub="$SHELL $ac_aux_dir/config.sub"  # Please don't use this var.
+ac_configure="$SHELL $ac_aux_dir/configure"  # Please don't use this var.
 
-rm -rf conftest.one conftest.two conftest.dir
 
-fi
-  if test "${ac_cv_path_install+set}" = set; then
-    INSTALL=$ac_cv_path_install
-  else
-    # As a last resort, use the slow shell script.  Don't cache a
-    # value for INSTALL within a source directory, because that will
-    # break other packages using the cache if that directory is
-    # removed, or if the value is a relative name.
-    INSTALL=$ac_install_sh
-  fi
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5
-$as_echo "$INSTALL" >&6; }
 
-# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
-# It thinks the first close brace ends the variable substitution.
-test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
 
-test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+subdirs="$subdirs src/gmock"
 
-test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5
-$as_echo_n "checking whether build environment is sane... " >&6; }
-# Reject unsafe characters in $srcdir or the absolute working directory
-# name.  Accept space and tab only in the latter.
-am_lf='
-'
-case `pwd` in
-  *[\\\"\#\$\&\'\`$am_lf]*)
-    as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
-esac
-case $srcdir in
-  *[\\\"\#\$\&\'\`$am_lf\ \	]*)
-    as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
+# Environment
+# Make sure we can run config.sub.
+$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 ||
+  as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5
+$as_echo_n "checking build system type... " >&6; }
+if ${ac_cv_build+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_build_alias=$build_alias
+test "x$ac_build_alias" = x &&
+  ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"`
+test "x$ac_build_alias" = x &&
+  as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5
+ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` ||
+  as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5
+$as_echo "$ac_cv_build" >&6; }
+case $ac_cv_build in
+*-*-*) ;;
+*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;;
 esac
+build=$ac_cv_build
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_build
+shift
+build_cpu=$1
+build_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+build_os=$*
+IFS=$ac_save_IFS
+case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac
 
-# Do 'set' in a subshell so we don't clobber the current shell's
-# arguments.  Must try -L first in case configure is actually a
-# symlink; some systems play weird games with the mod time of symlinks
-# (eg FreeBSD returns the mod time of the symlink's containing
-# directory).
-if (
-   am_has_slept=no
-   for am_try in 1 2; do
-     echo "timestamp, slept: $am_has_slept" > conftest.file
-     set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
-     if test "$*" = "X"; then
-	# -L didn't work.
-	set X `ls -t "$srcdir/configure" conftest.file`
-     fi
-     if test "$*" != "X $srcdir/configure conftest.file" \
-	&& test "$*" != "X conftest.file $srcdir/configure"; then
 
-	# If neither matched, then we have a broken ls.  This can happen
-	# if, for instance, CONFIG_SHELL is bash and it inherits a
-	# broken ls alias from the environment.  This has actually
-	# happened.  Such a system could not be considered "sane".
-	as_fn_error $? "ls -t appears to fail.  Make sure there is not a broken
-  alias in your environment" "$LINENO" 5
-     fi
-     if test "$2" = conftest.file || test $am_try -eq 2; then
-       break
-     fi
-     # Just in case.
-     sleep 1
-     am_has_slept=yes
-   done
-   test "$2" = conftest.file
-   )
-then
-   # Ok.
-   :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5
+$as_echo_n "checking host system type... " >&6; }
+if ${ac_cv_host+:} false; then :
+  $as_echo_n "(cached) " >&6
 else
-   as_fn_error $? "newly created file is older than distributed files!
-Check your system clock" "$LINENO" 5
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-# If we didn't sleep, we still need to ensure time stamps of config.status and
-# generated files are strictly newer.
-am_sleep_pid=
-if grep 'slept: no' conftest.file >/dev/null 2>&1; then
-  ( sleep 1 ) &
-  am_sleep_pid=$!
+  if test "x$host_alias" = x; then
+  ac_cv_host=$ac_cv_build
+else
+  ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` ||
+    as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5
 fi
 
-rm -f conftest.file
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5
+$as_echo "$ac_cv_host" >&6; }
+case $ac_cv_host in
+*-*-*) ;;
+*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;;
+esac
+host=$ac_cv_host
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_host
+shift
+host_cpu=$1
+host_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+host_os=$*
+IFS=$ac_save_IFS
+case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac
 
-test "$program_prefix" != NONE &&
-  program_transform_name="s&^&$program_prefix&;$program_transform_name"
-# Use a double $ so make ignores it.
-test "$program_suffix" != NONE &&
-  program_transform_name="s&\$&$program_suffix&;$program_transform_name"
-# Double any \ or $.
-# By default was `s,x,x', remove it if useless.
-ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
-program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
 
-if test x"${MISSING+set}" != xset; then
-  case $am_aux_dir in
-  *\ * | *\	*)
-    MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
-  *)
-    MISSING="\${SHELL} $am_aux_dir/missing" ;;
-  esac
-fi
-# Use eval to expand $SHELL
-if eval "$MISSING --is-lightweight"; then
-  am_missing_run="$MISSING "
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking target system type" >&5
+$as_echo_n "checking target system type... " >&6; }
+if ${ac_cv_target+:} false; then :
+  $as_echo_n "(cached) " >&6
 else
-  am_missing_run=
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: 'missing' script is too old or missing" >&5
-$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
+  if test "x$target_alias" = x; then
+  ac_cv_target=$ac_cv_host
+else
+  ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` ||
+    as_fn_error $? "$SHELL $ac_aux_dir/config.sub $target_alias failed" "$LINENO" 5
 fi
 
-if test x"${install_sh}" != xset; then
-  case $am_aux_dir in
-  *\ * | *\	*)
-    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
-  *)
-    install_sh="\${SHELL} $am_aux_dir/install-sh"
-  esac
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5
+$as_echo "$ac_cv_target" >&6; }
+case $ac_cv_target in
+*-*-*) ;;
+*) as_fn_error $? "invalid value of canonical target" "$LINENO" 5;;
+esac
+target=$ac_cv_target
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_target
+shift
+target_cpu=$1
+target_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+target_os=$*
+IFS=$ac_save_IFS
+case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac
 
-# Installed binaries are usually stripped using 'strip' when the user
-# run "make install-strip".  However 'strip' might not be the right
-# tool to use in cross-compilation environments, therefore Automake
-# will honor the 'STRIP' environment variable to overrule this program.
-if test "$cross_compiling" != no; then
-  if test -n "$ac_tool_prefix"; then
-  # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
-set dummy ${ac_tool_prefix}strip; ac_word=$2
+
+# The aliases save the names the user supplied, while $host etc.
+# will get canonicalized.
+test -n "$target_alias" &&
+  test "$program_prefix$program_suffix$program_transform_name" = \
+    NONENONEs,x,x, &&
+  program_prefix=${target_alias}-
+
+# Fix automake problems in 1.12
+# expand $ac_aux_dir to an absolute path
+am_aux_dir=`cd $ac_aux_dir && pwd`
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}gcc; ac_word=$2
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_STRIP+:} false; then :
+if ${ac_cv_prog_CC+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  if test -n "$STRIP"; then
-  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
@@ -4654,7 +4231,7 @@ do
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
   if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+    ac_cv_prog_CC="${ac_tool_prefix}gcc"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -4664,10 +4241,10 @@ IFS=$as_save_IFS
 
 fi
 fi
-STRIP=$ac_cv_prog_STRIP
-if test -n "$STRIP"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5
-$as_echo "$STRIP" >&6; }
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
@@ -4675,17 +4252,17 @@ fi
 
 
 fi
-if test -z "$ac_cv_prog_STRIP"; then
-  ac_ct_STRIP=$STRIP
-  # Extract the first word of "strip", so it can be a program name with args.
-set dummy strip; ac_word=$2
+if test -z "$ac_cv_prog_CC"; then
+  ac_ct_CC=$CC
+  # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_STRIP+:} false; then :
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  if test -n "$ac_ct_STRIP"; then
-  ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test.
+  if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
 else
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
@@ -4694,7 +4271,7 @@ do
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
   if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_ac_ct_STRIP="strip"
+    ac_cv_prog_ac_ct_CC="gcc"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -4704,17 +4281,17 @@ IFS=$as_save_IFS
 
 fi
 fi
-ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP
-if test -n "$ac_ct_STRIP"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5
-$as_echo "$ac_ct_STRIP" >&6; }
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
-  if test "x$ac_ct_STRIP" = x; then
-    STRIP=":"
+  if test "x$ac_ct_CC" = x; then
+    CC=""
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
@@ -4722,69 +4299,65 @@ yes:)
 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
-    STRIP=$ac_ct_STRIP
+    CC=$ac_ct_CC
   fi
 else
-  STRIP="$ac_cv_prog_STRIP"
-fi
-
+  CC="$ac_cv_prog_CC"
 fi
-INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a thread-safe mkdir -p" >&5
-$as_echo_n "checking for a thread-safe mkdir -p... " >&6; }
-if test -z "$MKDIR_P"; then
-  if ${ac_cv_path_mkdir+:} false; then :
+if test -z "$CC"; then
+          if test -n "$ac_tool_prefix"; then
+    # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_prog in mkdir gmkdir; do
-	 for ac_exec_ext in '' $ac_executable_extensions; do
-	   as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext" || continue
-	   case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #(
-	     'mkdir (GNU coreutils) '* | \
-	     'mkdir (coreutils) '* | \
-	     'mkdir (fileutils) '4.1*)
-	       ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext
-	       break 3;;
-	   esac
-	 done
-       done
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="${ac_tool_prefix}cc"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
   done
 IFS=$as_save_IFS
 
 fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
 
-  test -d ./--version && rmdir ./--version
-  if test "${ac_cv_path_mkdir+set}" = set; then
-    MKDIR_P="$ac_cv_path_mkdir -p"
-  else
-    # As a last resort, use the slow shell script.  Don't cache a
-    # value for MKDIR_P within a source directory, because that will
-    # break other packages using the cache if that directory is
-    # removed, or if the value is a relative name.
-    MKDIR_P="$ac_install_sh -d"
   fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5
-$as_echo "$MKDIR_P" >&6; }
-
-for ac_prog in gawk mawk nawk awk
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
+if test -z "$CC"; then
+  # Extract the first word of "cc", so it can be a program name with args.
+set dummy cc; ac_word=$2
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_AWK+:} false; then :
+if ${ac_cv_prog_CC+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  if test -n "$AWK"; then
-  ac_cv_prog_AWK="$AWK" # Let the user override the test.
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
 else
+  ac_prog_rejected=no
 as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
@@ -4792,7 +4365,11 @@ do
   test -z "$as_dir" && as_dir=.
     for ac_exec_ext in '' $ac_executable_extensions; do
   if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_AWK="$ac_prog"
+    if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+       ac_prog_rejected=yes
+       continue
+     fi
+    ac_cv_prog_CC="cc"
     $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -4800,405 +4377,609 @@ done
   done
 IFS=$as_save_IFS
 
+if test $ac_prog_rejected = yes; then
+  # We found a bogon in the path, so make sure we never use it.
+  set dummy $ac_cv_prog_CC
+  shift
+  if test $# != 0; then
+    # We chose a different compiler from the bogus one.
+    # However, it has the same basename, so the bogon will be chosen
+    # first if we set CC to just the basename; use the full file name.
+    shift
+    ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+  fi
 fi
 fi
-AWK=$ac_cv_prog_AWK
-if test -n "$AWK"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5
-$as_echo "$AWK" >&6; }
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
 
-  test -n "$AWK" && break
-done
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5
-$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; }
-set x ${MAKE-make}
-ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'`
-if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then :
+fi
+if test -z "$CC"; then
+  if test -n "$ac_tool_prefix"; then
+  for ac_prog in cl.exe
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  cat >conftest.make <<\_ACEOF
-SHELL = /bin/sh
-all:
-	@echo '@@@%%%=$(MAKE)=@@@%%%'
-_ACEOF
-# GNU make sometimes prints "make[1]: Entering ...", which would confuse us.
-case `${MAKE-make} -f conftest.make 2>/dev/null` in
-  *@@@%%%=?*=@@@%%%*)
-    eval ac_cv_prog_make_${ac_make}_set=yes;;
-  *)
-    eval ac_cv_prog_make_${ac_make}_set=no;;
-esac
-rm -f conftest.make
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
 fi
-if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-  SET_MAKE=
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
 else
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
 $as_echo "no" >&6; }
-  SET_MAKE="MAKE=${MAKE-make}"
-fi
-
-rm -rf .tst 2>/dev/null
-mkdir .tst 2>/dev/null
-if test -d .tst; then
-  am__leading_dot=.
-else
-  am__leading_dot=_
 fi
-rmdir .tst 2>/dev/null
-
-DEPDIR="${am__leading_dot}deps"
-
-ac_config_commands="$ac_config_commands depfiles"
 
 
-am_make=${MAKE-make}
-cat > confinc << 'END'
-am__doit:
-	@echo this is the am__doit target
-.PHONY: am__doit
-END
-# If we don't find an include directive, just comment out the code.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5
-$as_echo_n "checking for style of include used by $am_make... " >&6; }
-am__include="#"
-am__quote=
-_am_result=none
-# First try GNU make style include.
-echo "include confinc" > confmf
-# Ignore all kinds of additional output from 'make'.
-case `$am_make -s -f confmf 2> /dev/null` in #(
-*the\ am__doit\ target*)
-  am__include=include
-  am__quote=
-  _am_result=GNU
-  ;;
-esac
-# Now try BSD make style include.
-if test "$am__include" = "#"; then
-   echo '.include "confinc"' > confmf
-   case `$am_make -s -f confmf 2> /dev/null` in #(
-   *the\ am__doit\ target*)
-     am__include=.include
-     am__quote="\""
-     _am_result=BSD
-     ;;
-   esac
+    test -n "$CC" && break
+  done
 fi
+if test -z "$CC"; then
+  ac_ct_CC=$CC
+  for ac_prog in cl.exe
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CC="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
 
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5
-$as_echo "$_am_result" >&6; }
-rm -f confinc confmf
-
-# Check whether --enable-dependency-tracking was given.
-if test "${enable_dependency_tracking+set}" = set; then :
-  enableval=$enable_dependency_tracking;
 fi
-
-if test "x$enable_dependency_tracking" != xno; then
-  am_depcomp="$ac_aux_dir/depcomp"
-  AMDEPBACKSLASH='\'
-  am__nodep='_no'
 fi
- if test "x$enable_dependency_tracking" != xno; then
-  AMDEP_TRUE=
-  AMDEP_FALSE='#'
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
 else
-  AMDEP_TRUE='#'
-  AMDEP_FALSE=
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 fi
 
 
-# Check whether --enable-silent-rules was given.
-if test "${enable_silent_rules+set}" = set; then :
-  enableval=$enable_silent_rules;
-fi
+  test -n "$ac_ct_CC" && break
+done
 
-case $enable_silent_rules in # (((
-  yes) AM_DEFAULT_VERBOSITY=0;;
-   no) AM_DEFAULT_VERBOSITY=1;;
-    *) AM_DEFAULT_VERBOSITY=1;;
+  if test "x$ac_ct_CC" = x; then
+    CC=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
 esac
-am_make=${MAKE-make}
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
-$as_echo_n "checking whether $am_make supports nested variables... " >&6; }
-if ${am_cv_make_support_nested_variables+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if $as_echo 'TRUE=$(BAR$(V))
-BAR0=false
-BAR1=true
-V=1
-am__doit:
-	@$(TRUE)
-.PHONY: am__doit' | $am_make -f - >/dev/null 2>&1; then
-  am_cv_make_support_nested_variables=yes
-else
-  am_cv_make_support_nested_variables=no
-fi
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5
-$as_echo "$am_cv_make_support_nested_variables" >&6; }
-if test $am_cv_make_support_nested_variables = yes; then
-    AM_V='$(V)'
-  AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)'
-else
-  AM_V=$AM_DEFAULT_VERBOSITY
-  AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY
-fi
-AM_BACKSLASH='\'
-
-if test "`cd $srcdir && pwd`" != "`pwd`"; then
-  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
-  # is not polluted with repeated "-I."
-  am__isrc=' -I$(srcdir)'
-  # test to see if srcdir already configured
-  if test -f $srcdir/config.status; then
-    as_fn_error $? "source directory already configured; run \"make distclean\" there first" "$LINENO" 5
+    CC=$ac_ct_CC
   fi
 fi
 
-# test whether we have cygpath
-if test -z "$CYGPATH_W"; then
-  if (cygpath --version) >/dev/null 2>/dev/null; then
-    CYGPATH_W='cygpath -w'
-  else
-    CYGPATH_W=echo
-  fi
 fi
 
 
-# Define the identity of the package.
- PACKAGE='ceph'
- VERSION='0.94.5'
+test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "no acceptable C compiler found in \$PATH
+See \`config.log' for more details" "$LINENO" 5; }
 
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE "$PACKAGE"
-_ACEOF
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
+$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
+if ${ac_cv_c_compiler_gnu+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+int
+main ()
+{
+#ifndef __GNUC__
+       choke me
+#endif
 
-cat >>confdefs.h <<_ACEOF
-#define VERSION "$VERSION"
+  ;
+  return 0;
+}
 _ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_compiler_gnu=yes
+else
+  ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
 
-# Some tools Automake needs.
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+  GCC=yes
+else
+  GCC=
+fi
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+$as_echo_n "checking whether $CC accepts -g... " >&6; }
+if ${ac_cv_prog_cc_g+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_save_c_werror_flag=$ac_c_werror_flag
+   ac_c_werror_flag=yes
+   ac_cv_prog_cc_g=no
+   CFLAGS="-g"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"}
+int
+main ()
+{
 
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_g=yes
+else
+  CFLAGS=""
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
-
+int
+main ()
+{
 
-AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"}
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
 
+else
+  ac_c_werror_flag=$ac_save_c_werror_flag
+	 CFLAGS="-g"
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
+int
+main ()
+{
 
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+$as_echo "$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+  CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+  if test "$GCC" = yes; then
+    CFLAGS="-g -O2"
+  else
+    CFLAGS="-g"
+  fi
+else
+  if test "$GCC" = yes; then
+    CFLAGS="-O2"
+  else
+    CFLAGS=
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if ${ac_cv_prog_cc_c89+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdarg.h>
+#include <stdio.h>
+struct stat;
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+     char **p;
+     int i;
+{
+  return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+  char *s;
+  va_list v;
+  va_start (v,p);
+  s = g (p, va_arg (v,int));
+  va_end (v);
+  return s;
+}
 
-MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
+   function prototypes and stuff, but not '\xHH' hex character constants.
+   These don't provoke an error unfortunately, instead are silently treated
+   as 'x'.  The following induces an error, until -std is added to get
+   proper ANSI mode.  Curiously '\x00'!='x' always comes out true, for an
+   array size at least.  It's necessary to write '\x00'==0 to get something
+   that's true only with -std.  */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
 
-# For better backward compatibility.  To be removed once Automake 1.9.x
-# dies out for good.  For more background, see:
-# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
-# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
-mkdir_p='$(MKDIR_P)'
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+   inside strings and character constants.  */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
 
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
-# Always define AMTAR for backward compatibility.  Yes, it's still used
-# in the wild :-(  We should find a proper way to deprecate it ...
-AMTAR='$${TAR-tar}'
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0]  ||  f (e, argv, 1) != argv[1];
+  ;
+  return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+  CC="$ac_save_CC $ac_arg"
+  if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_prog_cc_c89=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext
+  test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
 
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c89" in
+  x)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+  xno)
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+  *)
+    CC="$CC $ac_cv_prog_cc_c89"
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+if test "x$ac_cv_prog_cc_c89" != xno; then :
 
-# We'll loop over all known methods to create a tar archive until one works.
-_am_tools='gnutar  pax cpio none'
+fi
 
-am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5
+$as_echo_n "checking whether $CC understands -c and -o together... " >&6; }
+if ${am_cv_prog_cc_c_o+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+int
+main ()
+{
 
+  ;
+  return 0;
+}
+_ACEOF
+  # Make sure it works both with $CC and with simple cc.
+  # Following AC_PROG_CC_C_O, we do the test twice because some
+  # compilers refuse to overwrite an existing .o file with -o,
+  # though they will create one.
+  am_cv_prog_cc_c_o=yes
+  for am_i in 1 2; do
+    if { echo "$as_me:$LINENO: $CC -c conftest.$ac_ext -o conftest2.$ac_objext" >&5
+   ($CC -c conftest.$ac_ext -o conftest2.$ac_objext) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } \
+         && test -f conftest2.$ac_objext; then
+      : OK
+    else
+      am_cv_prog_cc_c_o=no
+      break
+    fi
+  done
+  rm -f core conftest*
+  unset am_i
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5
+$as_echo "$am_cv_prog_cc_c_o" >&6; }
+if test "$am_cv_prog_cc_c_o" != yes; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
-depcc="$CC"   am_compiler_list=
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
-$as_echo_n "checking dependency style of $depcc... " >&6; }
-if ${am_cv_CC_dependencies_compiler_type+:} false; then :
+if test -n "$ac_tool_prefix"; then
+  for ac_prog in ar lib "link -lib"
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_AR+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named 'D' -- because '-MD' means "put the output
-  # in D".
-  rm -rf conftest.dir
-  mkdir conftest.dir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" conftest.dir
-  cd conftest.dir
-  # We will build objects and dependencies in a subdirectory because
-  # it helps to detect inapplicable dependency modes.  For instance
-  # both Tru64's cc and ICC support -MD to output dependencies as a
-  # side effect of compilation, but ICC will put the dependencies in
-  # the current directory while Tru64 will put them in the object
-  # directory.
-  mkdir sub
-
-  am_cv_CC_dependencies_compiler_type=none
-  if test "$am_compiler_list" = ""; then
-     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  if test -n "$AR"; then
+  ac_cv_prog_AR="$AR" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_AR="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
   fi
-  am__universal=false
-  case " $depcc " in #(
-     *\ -arch\ *\ -arch\ *) am__universal=true ;;
-     esac
-
-  for depmode in $am_compiler_list; do
-    # Setup a source with many dependencies, because some compilers
-    # like to wrap large dependency lists on column 80 (with \), and
-    # we should not choose a depcomp mode which is confused by this.
-    #
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    : > sub/conftest.c
-    for i in 1 2 3 4 5 6; do
-      echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
-      # Solaris 10 /bin/sh.
-      echo '/* dummy */' > sub/conftst$i.h
-    done
-    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
-
-    # We check with '-c' and '-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle '-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs.
-    am__obj=sub/conftest.${OBJEXT-o}
-    am__minus_obj="-o $am__obj"
-    case $depmode in
-    gcc)
-      # This depmode causes a compiler race in universal mode.
-      test "$am__universal" = false || continue
-      ;;
-    nosideeffect)
-      # After this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested.
-      if test "x$enable_dependency_tracking" = xyes; then
-	continue
-      else
-	break
-      fi
-      ;;
-    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok '-c -o', but also, the minuso test has
-      # not run yet.  These depmodes are late enough in the game, and
-      # so weak that their functioning should not be impacted.
-      am__obj=conftest.${OBJEXT-o}
-      am__minus_obj=
-      ;;
-    none) break ;;
-    esac
-    if depmode=$depmode \
-       source=sub/conftest.c object=$am__obj \
-       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
-       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
-         >/dev/null 2>conftest.err &&
-       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
-       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
-      # icc doesn't choke on unknown options, it will just issue warnings
-      # or remarks (even with -Werror).  So we grep stderr for any message
-      # that says an option was ignored or not supported.
-      # When given -MP, icc 7.0 and 7.1 complain thusly:
-      #   icc: Command line warning: ignoring option '-M'; no argument required
-      # The diagnosis changed in icc 8.0:
-      #   icc: Command line remark: option '-MP' not supported
-      if (grep 'ignoring option' conftest.err ||
-          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
-        am_cv_CC_dependencies_compiler_type=$depmode
-        break
-      fi
-    fi
+done
   done
+IFS=$as_save_IFS
 
-  cd ..
-  rm -rf conftest.dir
+fi
+fi
+AR=$ac_cv_prog_AR
+if test -n "$AR"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AR" >&5
+$as_echo "$AR" >&6; }
 else
-  am_cv_CC_dependencies_compiler_type=none
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 fi
 
+
+    test -n "$AR" && break
+  done
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5
-$as_echo "$am_cv_CC_dependencies_compiler_type" >&6; }
-CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type
+if test -z "$AR"; then
+  ac_ct_AR=$AR
+  for ac_prog in ar lib "link -lib"
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_AR+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_AR"; then
+  ac_cv_prog_ac_ct_AR="$ac_ct_AR" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_AR="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
 
- if
-  test "x$enable_dependency_tracking" != xno \
-  && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then
-  am__fastdepCC_TRUE=
-  am__fastdepCC_FALSE='#'
+fi
+fi
+ac_ct_AR=$ac_cv_prog_ac_ct_AR
+if test -n "$ac_ct_AR"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AR" >&5
+$as_echo "$ac_ct_AR" >&6; }
 else
-  am__fastdepCC_TRUE='#'
-  am__fastdepCC_FALSE=
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 fi
 
 
+  test -n "$ac_ct_AR" && break
+done
 
-# POSIX will say in a future version that running "rm -f" with no argument
-# is OK; and we want to be able to make that assumption in our Makefile
-# recipes.  So use an aggressive probe to check that the usage we want is
-# actually supported "in the wild" to an acceptable degree.
-# See automake bug#10828.
-# To make any issue more visible, cause the running configure to be aborted
-# by default if the 'rm' program in use doesn't match our expectations; the
-# user can still override this though.
-if rm -f && rm -fr && rm -rf; then : OK; else
-  cat >&2 <<'END'
-Oops!
-
-Your 'rm' program seems unable to run without file operands specified
-on the command line, even when the '-f' option is present.  This is contrary
-to the behaviour of most rm programs out there, and not conforming with
-the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
+  if test "x$ac_ct_AR" = x; then
+    AR="false"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    AR=$ac_ct_AR
+  fi
+fi
 
-Please tell bug-automake at gnu.org about your system, including the value
-of your $PATH and any error possibly output before this message.  This
-can help us improve future automake versions.
+: ${AR=ar}
 
-END
-  if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
-    echo 'Configuration will proceed anyway, since you have set the' >&2
-    echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
-    echo >&2
-  else
-    cat >&2 <<'END'
-Aborting the configuration process, to ensure you take notice of the issue.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking the archiver ($AR) interface" >&5
+$as_echo_n "checking the archiver ($AR) interface... " >&6; }
+if ${am_cv_ar_interface+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
-You can download and install GNU coreutils to get an 'rm' implementation
-that behaves properly: <http://www.gnu.org/software/coreutils/>.
+   am_cv_ar_interface=ar
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+int some_variable = 0;
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  am_ar_try='$AR cru libconftest.a conftest.$ac_objext >&5'
+      { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5
+  (eval $am_ar_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+      if test "$ac_status" -eq 0; then
+        am_cv_ar_interface=ar
+      else
+        am_ar_try='$AR -NOLOGO -OUT:conftest.lib conftest.$ac_objext >&5'
+        { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$am_ar_try\""; } >&5
+  (eval $am_ar_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+        if test "$ac_status" -eq 0; then
+          am_cv_ar_interface=lib
+        else
+          am_cv_ar_interface=unknown
+        fi
+      fi
+      rm -f conftest.lib libconftest.a
 
-If you want to complete the configuration process using your problematic
-'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
-to "yes", and re-run configure.
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
-END
-    as_fn_error $? "Your 'rm' program is bad, sorry." "$LINENO" 5
-  fi
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_ar_interface" >&5
+$as_echo "$am_cv_ar_interface" >&6; }
+
+case $am_cv_ar_interface in
+ar)
+  ;;
+lib)
+  # Microsoft lib, so override with the ar-lib wrapper script.
+  # FIXME: It is wrong to rewrite AR.
+  # But if we don't then we get into trouble of one sort or another.
+  # A longer-term fix would be to have automake use am__AR in this case,
+  # and then we could set am__AR="$am_aux_dir/ar-lib \$(AR)" or something
+  # similar.
+  AR="$am_aux_dir/ar-lib $AR"
+  ;;
+unknown)
+  as_fn_error $? "could not determine $AR interface" "$LINENO" 5
+  ;;
+esac
+
+
+# Automake
 
 case `pwd` in
   *\ * | *\	*)
@@ -7129,6 +6910,53 @@ esac
 
 
 
+for ac_prog in gawk mawk nawk awk
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_AWK+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$AWK"; then
+  ac_cv_prog_AWK="$AWK" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_AWK="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+AWK=$ac_cv_prog_AWK
+if test -n "$AWK"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5
+$as_echo "$AWK" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$AWK" && break
+done
+
+
+
+
+
+
 
 
 
@@ -7142,11 +6970,6 @@ esac
 
 
 
-
-
-
-
-
 # If no C compiler was specified, use CC.
 LTCC=${LTCC-"$CC"}
 
@@ -8714,6 +8537,16 @@ done
 
 
 
+func_stripname_cnf ()
+{
+  case ${2} in
+  .*) func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%\\\\${2}\$%%"`;;
+  *)  func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%${2}\$%%"`;;
+  esac
+} # func_stripname_cnf
+
+
+
 
 
 # Set options
@@ -12703,1251 +12536,1627 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 CC="$lt_save_CC"
 
+      if test -n "$CXX" && ( test "X$CXX" != "Xno" &&
+    ( (test "X$CXX" = "Xg++" && `g++ -v >/dev/null 2>&1` ) ||
+    (test "X$CXX" != "Xg++"))) ; then
+  ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C++ preprocessor" >&5
+$as_echo_n "checking how to run the C++ preprocessor... " >&6; }
+if test -z "$CXXCPP"; then
+  if ${ac_cv_prog_CXXCPP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+      # Double quotes because CXXCPP needs to be expanded
+    for CXXCPP in "$CXX -E" "/lib/cpp"
+    do
+      ac_preproc_ok=false
+for ac_cxx_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"; then :
 
+else
+  # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
 
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"; then :
+  # Broken: success on invalid input.
+continue
+else
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
 
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
+  break
+fi
 
+    done
+    ac_cv_prog_CXXCPP=$CXXCPP
 
+fi
+  CXXCPP=$ac_cv_prog_CXXCPP
+else
+  ac_cv_prog_CXXCPP=$CXXCPP
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXXCPP" >&5
+$as_echo "$CXXCPP" >&6; }
+ac_preproc_ok=false
+for ac_cxx_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"; then :
 
+else
+  # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
 
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"; then :
+  # Broken: success on invalid input.
+continue
+else
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
 
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
 
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "C++ preprocessor \"$CXXCPP\" fails sanity check
+See \`config.log' for more details" "$LINENO" 5; }
+fi
 
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
+else
+  _lt_caught_CXX_error=yes
+fi
 
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
 
+archive_cmds_need_lc_CXX=no
+allow_undefined_flag_CXX=
+always_export_symbols_CXX=no
+archive_expsym_cmds_CXX=
+compiler_needs_object_CXX=no
+export_dynamic_flag_spec_CXX=
+hardcode_direct_CXX=no
+hardcode_direct_absolute_CXX=no
+hardcode_libdir_flag_spec_CXX=
+hardcode_libdir_separator_CXX=
+hardcode_minus_L_CXX=no
+hardcode_shlibpath_var_CXX=unsupported
+hardcode_automatic_CXX=no
+inherit_rpath_CXX=no
+module_cmds_CXX=
+module_expsym_cmds_CXX=
+link_all_deplibs_CXX=unknown
+old_archive_cmds_CXX=$old_archive_cmds
+reload_flag_CXX=$reload_flag
+reload_cmds_CXX=$reload_cmds
+no_undefined_flag_CXX=
+whole_archive_flag_spec_CXX=
+enable_shared_with_static_runtimes_CXX=no
 
-        ac_config_commands="$ac_config_commands libtool"
+# Source file extension for C++ test sources.
+ac_ext=cpp
 
+# Object file extension for compiled C++ test sources.
+objext=o
+objext_CXX=$objext
 
+# No sense in running all these tests if we already determined that
+# the CXX compiler isn't working.  Some variables (like enable_shared)
+# are currently assumed to apply to all compilers on this platform,
+# and will be corrupted by setting them based on a non-working compiler.
+if test "$_lt_caught_CXX_error" != yes; then
+  # Code to be used in simple compile tests
+  lt_simple_compile_test_code="int some_variable = 0;"
 
+  # Code to be used in simple link tests
+  lt_simple_link_test_code='int main(int, char *[]) { return(0); }'
 
-# Only expand once:
+  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
 
 
-# By default we simply use the C compiler to build assembly code.
 
-test "${CCAS+set}" = set || CCAS=$CC
-test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
 
 
 
-depcc="$CCAS"   am_compiler_list=
+# If no C compiler was specified, use CC.
+LTCC=${LTCC-"$CC"}
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
-$as_echo_n "checking dependency style of $depcc... " >&6; }
-if ${am_cv_CCAS_dependencies_compiler_type+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named 'D' -- because '-MD' means "put the output
-  # in D".
-  rm -rf conftest.dir
-  mkdir conftest.dir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" conftest.dir
-  cd conftest.dir
-  # We will build objects and dependencies in a subdirectory because
-  # it helps to detect inapplicable dependency modes.  For instance
-  # both Tru64's cc and ICC support -MD to output dependencies as a
-  # side effect of compilation, but ICC will put the dependencies in
-  # the current directory while Tru64 will put them in the object
-  # directory.
-  mkdir sub
+# If no C compiler flags were specified, use CFLAGS.
+LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
 
-  am_cv_CCAS_dependencies_compiler_type=none
-  if test "$am_compiler_list" = ""; then
-     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+# Allow CC to be a program name with arguments.
+compiler=$CC
+
+
+  # save warnings/boilerplate of simple test code
+  ac_outfile=conftest.$ac_objext
+echo "$lt_simple_compile_test_code" >conftest.$ac_ext
+eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_compiler_boilerplate=`cat conftest.err`
+$RM conftest*
+
+  ac_outfile=conftest.$ac_objext
+echo "$lt_simple_link_test_code" >conftest.$ac_ext
+eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
+_lt_linker_boilerplate=`cat conftest.err`
+$RM -r conftest*
+
+
+  # Allow CC to be a program name with arguments.
+  lt_save_CC=$CC
+  lt_save_CFLAGS=$CFLAGS
+  lt_save_LD=$LD
+  lt_save_GCC=$GCC
+  GCC=$GXX
+  lt_save_with_gnu_ld=$with_gnu_ld
+  lt_save_path_LD=$lt_cv_path_LD
+  if test -n "${lt_cv_prog_gnu_ldcxx+set}"; then
+    lt_cv_prog_gnu_ld=$lt_cv_prog_gnu_ldcxx
+  else
+    $as_unset lt_cv_prog_gnu_ld
   fi
-  am__universal=false
+  if test -n "${lt_cv_path_LDCXX+set}"; then
+    lt_cv_path_LD=$lt_cv_path_LDCXX
+  else
+    $as_unset lt_cv_path_LD
+  fi
+  test -z "${LDCXX+set}" || LD=$LDCXX
+  CC=${CXX-"c++"}
+  CFLAGS=$CXXFLAGS
+  compiler=$CC
+  compiler_CXX=$CC
+  for cc_temp in $compiler""; do
+  case $cc_temp in
+    compile | *[\\/]compile | ccache | *[\\/]ccache ) ;;
+    distcc | *[\\/]distcc | purify | *[\\/]purify ) ;;
+    \-*) ;;
+    *) break;;
+  esac
+done
+cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
 
 
-  for depmode in $am_compiler_list; do
-    # Setup a source with many dependencies, because some compilers
-    # like to wrap large dependency lists on column 80 (with \), and
-    # we should not choose a depcomp mode which is confused by this.
-    #
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    : > sub/conftest.c
-    for i in 1 2 3 4 5 6; do
-      echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
-      # Solaris 10 /bin/sh.
-      echo '/* dummy */' > sub/conftst$i.h
-    done
-    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+  if test -n "$compiler"; then
+    # We don't want -fno-exception when compiling C++ code, so set the
+    # no_builtin_flag separately
+    if test "$GXX" = yes; then
+      lt_prog_compiler_no_builtin_flag_CXX=' -fno-builtin'
+    else
+      lt_prog_compiler_no_builtin_flag_CXX=
+    fi
 
-    # We check with '-c' and '-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle '-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs.
-    am__obj=sub/conftest.${OBJEXT-o}
-    am__minus_obj="-o $am__obj"
-    case $depmode in
-    gcc)
-      # This depmode causes a compiler race in universal mode.
-      test "$am__universal" = false || continue
-      ;;
-    nosideeffect)
-      # After this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested.
-      if test "x$enable_dependency_tracking" = xyes; then
-	continue
-      else
-	break
-      fi
-      ;;
-    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok '-c -o', but also, the minuso test has
-      # not run yet.  These depmodes are late enough in the game, and
-      # so weak that their functioning should not be impacted.
-      am__obj=conftest.${OBJEXT-o}
-      am__minus_obj=
+    if test "$GXX" = yes; then
+      # Set up default GNU C++ configuration
+
+
+
+# Check whether --with-gnu-ld was given.
+if test "${with_gnu_ld+set}" = set; then :
+  withval=$with_gnu_ld; test "$withval" = no || with_gnu_ld=yes
+else
+  with_gnu_ld=no
+fi
+
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ld used by $CC" >&5
+$as_echo_n "checking for ld used by $CC... " >&6; }
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [\\/]* | ?:[\\/]*)
+      re_direlt='/[^/][^/]*/\.\./'
+      # Canonicalize the pathname of ld
+      ac_prog=`$ECHO "$ac_prog"| $SED 's%\\\\%/%g'`
+      while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do
+	ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"`
+      done
+      test -z "$LD" && LD="$ac_prog"
       ;;
-    none) break ;;
-    esac
-    if depmode=$depmode \
-       source=sub/conftest.c object=$am__obj \
-       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
-       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
-         >/dev/null 2>conftest.err &&
-       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
-       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
-      # icc doesn't choke on unknown options, it will just issue warnings
-      # or remarks (even with -Werror).  So we grep stderr for any message
-      # that says an option was ignored or not supported.
-      # When given -MP, icc 7.0 and 7.1 complain thusly:
-      #   icc: Command line warning: ignoring option '-M'; no argument required
-      # The diagnosis changed in icc 8.0:
-      #   icc: Command line remark: option '-MP' not supported
-      if (grep 'ignoring option' conftest.err ||
-          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
-        am_cv_CCAS_dependencies_compiler_type=$depmode
-        break
-      fi
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU ld" >&5
+$as_echo_n "checking for GNU ld... " >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for non-GNU ld" >&5
+$as_echo_n "checking for non-GNU ld... " >&6; }
+fi
+if ${lt_cv_path_LD+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$LD"; then
+  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for ac_dir in $PATH; do
+    IFS="$lt_save_ifs"
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some variants of GNU ld only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      case `"$lt_cv_path_LD" -v 2>&1 </dev/null` in
+      *GNU* | *'with BFD'*)
+	test "$with_gnu_ld" != no && break
+	;;
+      *)
+	test "$with_gnu_ld" != yes && break
+	;;
+      esac
     fi
   done
-
-  cd ..
-  rm -rf conftest.dir
+  IFS="$lt_save_ifs"
 else
-  am_cv_CCAS_dependencies_compiler_type=none
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
 fi
-
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5
-$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; }
-CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type
 
- if
-  test "x$enable_dependency_tracking" != xno \
-  && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then
-  am__fastdepCCAS_TRUE=
-  am__fastdepCCAS_FALSE='#'
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LD" >&5
+$as_echo "$LD" >&6; }
 else
-  am__fastdepCCAS_TRUE='#'
-  am__fastdepCCAS_FALSE=
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+test -z "$LD" && as_fn_error $? "no acceptable ld found in \$PATH" "$LINENO" 5
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if the linker ($LD) is GNU ld" >&5
+$as_echo_n "checking if the linker ($LD) is GNU ld... " >&6; }
+if ${lt_cv_prog_gnu_ld+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  # I'd rather use --version here, but apparently some GNU lds only accept -v.
+case `$LD -v 2>&1 </dev/null` in
+*GNU* | *'with BFD'*)
+  lt_cv_prog_gnu_ld=yes
+  ;;
+*)
+  lt_cv_prog_gnu_ld=no
+  ;;
+esac
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_gnu_ld" >&5
+$as_echo "$lt_cv_prog_gnu_ld" >&6; }
+with_gnu_ld=$lt_cv_prog_gnu_ld
 
 
 
 
-# enable make V=0 (if automake >1.11)
-if test "`cd $srcdir && pwd`" != "`pwd`"; then
-  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
-  # is not polluted with repeated "-I."
-  am__isrc=' -I$(srcdir)'
-  # test to see if srcdir already configured
-  if test -f $srcdir/config.status; then
-    as_fn_error $? "source directory already configured; run \"make distclean\" there first" "$LINENO" 5
-  fi
-fi
 
-# test whether we have cygpath
-if test -z "$CYGPATH_W"; then
-  if (cygpath --version) >/dev/null 2>/dev/null; then
-    CYGPATH_W='cygpath -w'
-  else
-    CYGPATH_W=echo
-  fi
-fi
 
 
-# Define the identity of the package.
- PACKAGE='ceph'
- VERSION='0.94.5'
+      # Check if GNU C++ uses GNU ld as the underlying linker, since the
+      # archiving commands below assume that GNU ld is being used.
+      if test "$with_gnu_ld" = yes; then
+        archive_cmds_CXX='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
+        archive_expsym_cmds_CXX='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
 
+        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
+        export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE "$PACKAGE"
-_ACEOF
+        # If archive_cmds runs LD, not CC, wlarc should be empty
+        # XXX I think wlarc can be eliminated in ltcf-cxx, but I need to
+        #     investigate it a little bit more. (MM)
+        wlarc='${wl}'
 
+        # ancient GNU ld didn't support --whole-archive et. al.
+        if eval "`$CC -print-prog-name=ld` --help 2>&1" |
+	  $GREP 'no-whole-archive' > /dev/null; then
+          whole_archive_flag_spec_CXX="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+        else
+          whole_archive_flag_spec_CXX=
+        fi
+      else
+        with_gnu_ld=no
+        wlarc=
 
-cat >>confdefs.h <<_ACEOF
-#define VERSION "$VERSION"
-_ACEOF
+        # A generic and very simple default shared library creation
+        # command for GNU C++ for the case where it uses the native
+        # linker, instead of GNU ld.  If possible, this setting should
+        # overridden to take advantage of the native linker features on
+        # the platform it is being used on.
+        archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
+      fi
 
-# Some tools Automake needs.
+      # Commands to make compiler produce verbose output that lists
+      # what "hidden" libraries, object files and flags are used when
+      # linking a shared library.
+      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
 
-ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"}
+    else
+      GXX=no
+      with_gnu_ld=no
+      wlarc=
+    fi
 
+    # PORTME: fill in a description of your system's C++ link characteristics
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
+$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
+    ld_shlibs_CXX=yes
+    case $host_os in
+      aix3*)
+        # FIXME: insert proper C++ library support
+        ld_shlibs_CXX=no
+        ;;
+      aix[4-9]*)
+        if test "$host_cpu" = ia64; then
+          # On IA64, the linker does run time linking by default, so we don't
+          # have to do anything special.
+          aix_use_runtimelinking=no
+          exp_sym_flag='-Bexport'
+          no_entry_flag=""
+        else
+          aix_use_runtimelinking=no
 
-AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
+          # Test if we are trying to use run time linking or normal
+          # AIX style linking. If -brtl is somewhere in LDFLAGS, we
+          # need to do runtime linking.
+          case $host_os in aix4.[23]|aix4.[23].*|aix[5-9]*)
+	    for ld_flag in $LDFLAGS; do
+	      case $ld_flag in
+	      *-brtl*)
+	        aix_use_runtimelinking=yes
+	        break
+	        ;;
+	      esac
+	    done
+	    ;;
+          esac
 
+          exp_sym_flag='-bexport'
+          no_entry_flag='-bnoentry'
+        fi
 
-AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"}
+        # When large executables or shared objects are built, AIX ld can
+        # have problems creating the table of contents.  If linking a library
+        # or program results in "error TOC overflow" add -mminimal-toc to
+        # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+        # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
 
+        archive_cmds_CXX=''
+        hardcode_direct_CXX=yes
+        hardcode_direct_absolute_CXX=yes
+        hardcode_libdir_separator_CXX=':'
+        link_all_deplibs_CXX=yes
+        file_list_spec_CXX='${wl}-f,'
 
-AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
+        if test "$GXX" = yes; then
+          case $host_os in aix4.[012]|aix4.[012].*)
+          # We only want to do this on AIX 4.2 and lower, the check
+          # below for broken collect2 doesn't work under 4.3+
+	  collect2name=`${CC} -print-prog-name=collect2`
+	  if test -f "$collect2name" &&
+	     strings "$collect2name" | $GREP resolve_lib_name >/dev/null
+	  then
+	    # We have reworked collect2
+	    :
+	  else
+	    # We have old collect2
+	    hardcode_direct_CXX=unsupported
+	    # It fails to find uninstalled libraries when the uninstalled
+	    # path is not listed in the libpath.  Setting hardcode_minus_L
+	    # to unsupported forces relinking
+	    hardcode_minus_L_CXX=yes
+	    hardcode_libdir_flag_spec_CXX='-L$libdir'
+	    hardcode_libdir_separator_CXX=
+	  fi
+          esac
+          shared_flag='-shared'
+	  if test "$aix_use_runtimelinking" = yes; then
+	    shared_flag="$shared_flag "'${wl}-G'
+	  fi
+        else
+          # not using gcc
+          if test "$host_cpu" = ia64; then
+	  # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
+	  # chokes on -Wl,-G. The following line is correct:
+	  shared_flag='-G'
+          else
+	    if test "$aix_use_runtimelinking" = yes; then
+	      shared_flag='${wl}-G'
+	    else
+	      shared_flag='${wl}-bM:SRE'
+	    fi
+          fi
+        fi
 
+        export_dynamic_flag_spec_CXX='${wl}-bexpall'
+        # It seems that -bexpall does not export symbols beginning with
+        # underscore (_), so it is better to generate a list of symbols to
+	# export.
+        always_export_symbols_CXX=yes
+        if test "$aix_use_runtimelinking" = yes; then
+          # Warning - without using the other runtime loading flags (-brtl),
+          # -berok will link without error, but may produce a broken library.
+          allow_undefined_flag_CXX='-berok'
+          # Determine the default libpath from the value encoded in an empty
+          # executable.
+          if test "${lt_cv_aix_libpath+set}" = set; then
+  aix_libpath=$lt_cv_aix_libpath
+else
+  if ${lt_cv_aix_libpath__CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
+int
+main ()
+{
 
-# For better backward compatibility.  To be removed once Automake 1.9.x
-# dies out for good.  For more background, see:
-# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
-# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
-mkdir_p='$(MKDIR_P)'
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
 
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
-# Always define AMTAR for backward compatibility.  Yes, it's still used
-# in the wild :-(  We should find a proper way to deprecate it ...
-AMTAR='$${TAR-tar}'
+  lt_aix_libpath_sed='
+      /Import File Strings/,/^$/ {
+	  /^0/ {
+	      s/^0  *\([^ ]*\) *$/\1/
+	      p
+	  }
+      }'
+  lt_cv_aix_libpath__CXX=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  # Check for a 64-bit object if we didn't find anything.
+  if test -z "$lt_cv_aix_libpath__CXX"; then
+    lt_cv_aix_libpath__CXX=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+  if test -z "$lt_cv_aix_libpath__CXX"; then
+    lt_cv_aix_libpath__CXX="/usr/lib:/lib"
+  fi
 
+fi
 
-# We'll loop over all known methods to create a tar archive until one works.
-_am_tools='gnutar  pax cpio none'
+  aix_libpath=$lt_cv_aix_libpath__CXX
+fi
 
-am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
+          hardcode_libdir_flag_spec_CXX='${wl}-blibpath:$libdir:'"$aix_libpath"
 
+          archive_expsym_cmds_CXX='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+        else
+          if test "$host_cpu" = ia64; then
+	    hardcode_libdir_flag_spec_CXX='${wl}-R $libdir:/usr/lib:/lib'
+	    allow_undefined_flag_CXX="-z nodefs"
+	    archive_expsym_cmds_CXX="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
+          else
+	    # Determine the default libpath from the value encoded in an
+	    # empty executable.
+	    if test "${lt_cv_aix_libpath+set}" = set; then
+  aix_libpath=$lt_cv_aix_libpath
+else
+  if ${lt_cv_aix_libpath__CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
+int
+main ()
+{
 
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
 
+  lt_aix_libpath_sed='
+      /Import File Strings/,/^$/ {
+	  /^0/ {
+	      s/^0  *\([^ ]*\) *$/\1/
+	      p
+	  }
+      }'
+  lt_cv_aix_libpath__CXX=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  # Check for a 64-bit object if we didn't find anything.
+  if test -z "$lt_cv_aix_libpath__CXX"; then
+    lt_cv_aix_libpath__CXX=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
+  fi
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+  if test -z "$lt_cv_aix_libpath__CXX"; then
+    lt_cv_aix_libpath__CXX="/usr/lib:/lib"
+  fi
 
-depcc="$CC"   am_compiler_list=
+fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
-$as_echo_n "checking dependency style of $depcc... " >&6; }
-if ${am_cv_CC_dependencies_compiler_type+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named 'D' -- because '-MD' means "put the output
-  # in D".
-  rm -rf conftest.dir
-  mkdir conftest.dir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" conftest.dir
-  cd conftest.dir
-  # We will build objects and dependencies in a subdirectory because
-  # it helps to detect inapplicable dependency modes.  For instance
-  # both Tru64's cc and ICC support -MD to output dependencies as a
-  # side effect of compilation, but ICC will put the dependencies in
-  # the current directory while Tru64 will put them in the object
-  # directory.
-  mkdir sub
-
-  am_cv_CC_dependencies_compiler_type=none
-  if test "$am_compiler_list" = ""; then
-     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
-  fi
-  am__universal=false
-  case " $depcc " in #(
-     *\ -arch\ *\ -arch\ *) am__universal=true ;;
-     esac
-
-  for depmode in $am_compiler_list; do
-    # Setup a source with many dependencies, because some compilers
-    # like to wrap large dependency lists on column 80 (with \), and
-    # we should not choose a depcomp mode which is confused by this.
-    #
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    : > sub/conftest.c
-    for i in 1 2 3 4 5 6; do
-      echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
-      # Solaris 10 /bin/sh.
-      echo '/* dummy */' > sub/conftst$i.h
-    done
-    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
-
-    # We check with '-c' and '-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle '-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs.
-    am__obj=sub/conftest.${OBJEXT-o}
-    am__minus_obj="-o $am__obj"
-    case $depmode in
-    gcc)
-      # This depmode causes a compiler race in universal mode.
-      test "$am__universal" = false || continue
-      ;;
-    nosideeffect)
-      # After this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested.
-      if test "x$enable_dependency_tracking" = xyes; then
-	continue
-      else
-	break
-      fi
-      ;;
-    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok '-c -o', but also, the minuso test has
-      # not run yet.  These depmodes are late enough in the game, and
-      # so weak that their functioning should not be impacted.
-      am__obj=conftest.${OBJEXT-o}
-      am__minus_obj=
-      ;;
-    none) break ;;
-    esac
-    if depmode=$depmode \
-       source=sub/conftest.c object=$am__obj \
-       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
-       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
-         >/dev/null 2>conftest.err &&
-       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
-       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
-      # icc doesn't choke on unknown options, it will just issue warnings
-      # or remarks (even with -Werror).  So we grep stderr for any message
-      # that says an option was ignored or not supported.
-      # When given -MP, icc 7.0 and 7.1 complain thusly:
-      #   icc: Command line warning: ignoring option '-M'; no argument required
-      # The diagnosis changed in icc 8.0:
-      #   icc: Command line remark: option '-MP' not supported
-      if (grep 'ignoring option' conftest.err ||
-          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
-        am_cv_CC_dependencies_compiler_type=$depmode
-        break
-      fi
-    fi
-  done
-
-  cd ..
-  rm -rf conftest.dir
-else
-  am_cv_CC_dependencies_compiler_type=none
+  aix_libpath=$lt_cv_aix_libpath__CXX
 fi
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5
-$as_echo "$am_cv_CC_dependencies_compiler_type" >&6; }
-CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type
+	    hardcode_libdir_flag_spec_CXX='${wl}-blibpath:$libdir:'"$aix_libpath"
+	    # Warning - without using the other run time loading flags,
+	    # -berok will link without error, but may produce a broken library.
+	    no_undefined_flag_CXX=' ${wl}-bernotok'
+	    allow_undefined_flag_CXX=' ${wl}-berok'
+	    if test "$with_gnu_ld" = yes; then
+	      # We only use this code for GNU lds that support --whole-archive.
+	      whole_archive_flag_spec_CXX='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	    else
+	      # Exported symbols can be pulled into shared objects from archives
+	      whole_archive_flag_spec_CXX='$convenience'
+	    fi
+	    archive_cmds_need_lc_CXX=yes
+	    # This is similar to how AIX traditionally builds its shared
+	    # libraries.
+	    archive_expsym_cmds_CXX="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+          fi
+        fi
+        ;;
 
- if
-  test "x$enable_dependency_tracking" != xno \
-  && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then
-  am__fastdepCC_TRUE=
-  am__fastdepCC_FALSE='#'
-else
-  am__fastdepCC_TRUE='#'
-  am__fastdepCC_FALSE=
-fi
+      beos*)
+	if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
+	  allow_undefined_flag_CXX=unsupported
+	  # Joseph Beckenbach <jrb3 at best.com> says some releases of gcc
+	  # support --undefined.  This deserves some investigation.  FIXME
+	  archive_cmds_CXX='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	else
+	  ld_shlibs_CXX=no
+	fi
+	;;
 
+      chorus*)
+        case $cc_basename in
+          *)
+	  # FIXME: insert proper C++ library support
+	  ld_shlibs_CXX=no
+	  ;;
+        esac
+        ;;
 
+      cygwin* | mingw* | pw32* | cegcc*)
+	case $GXX,$cc_basename in
+	,cl* | no,cl*)
+	  # Native MSVC
+	  # hardcode_libdir_flag_spec is actually meaningless, as there is
+	  # no search path for DLLs.
+	  hardcode_libdir_flag_spec_CXX=' '
+	  allow_undefined_flag_CXX=unsupported
+	  always_export_symbols_CXX=yes
+	  file_list_spec_CXX='@'
+	  # Tell ltmain to make .lib files, not .a files.
+	  libext=lib
+	  # Tell ltmain to make .dll files, not .so files.
+	  shrext_cmds=".dll"
+	  # FIXME: Setting linknames here is a bad hack.
+	  archive_cmds_CXX='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
+	  archive_expsym_cmds_CXX='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	      $SED -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
+	    else
+	      $SED -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
+	    fi~
+	    $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
+	    linknames='
+	  # The linker will not automatically build a static lib if we build a DLL.
+	  # _LT_TAGVAR(old_archive_from_new_cmds, CXX)='true'
+	  enable_shared_with_static_runtimes_CXX=yes
+	  # Don't use ranlib
+	  old_postinstall_cmds_CXX='chmod 644 $oldlib'
+	  postlink_cmds_CXX='lt_outputfile="@OUTPUT@"~
+	    lt_tool_outputfile="@TOOL_OUTPUT@"~
+	    case $lt_outputfile in
+	      *.exe|*.EXE) ;;
+	      *)
+		lt_outputfile="$lt_outputfile.exe"
+		lt_tool_outputfile="$lt_tool_outputfile.exe"
+		;;
+	    esac~
+	    func_to_tool_file "$lt_outputfile"~
+	    if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
+	      $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
+	      $RM "$lt_outputfile.manifest";
+	    fi'
+	  ;;
+	*)
+	  # g++
+	  # _LT_TAGVAR(hardcode_libdir_flag_spec, CXX) is actually meaningless,
+	  # as there is no search path for DLLs.
+	  hardcode_libdir_flag_spec_CXX='-L$libdir'
+	  export_dynamic_flag_spec_CXX='${wl}--export-all-symbols'
+	  allow_undefined_flag_CXX=unsupported
+	  always_export_symbols_CXX=no
+	  enable_shared_with_static_runtimes_CXX=yes
 
-# POSIX will say in a future version that running "rm -f" with no argument
-# is OK; and we want to be able to make that assumption in our Makefile
-# recipes.  So use an aggressive probe to check that the usage we want is
-# actually supported "in the wild" to an acceptable degree.
-# See automake bug#10828.
-# To make any issue more visible, cause the running configure to be aborted
-# by default if the 'rm' program in use doesn't match our expectations; the
-# user can still override this though.
-if rm -f && rm -fr && rm -rf; then : OK; else
-  cat >&2 <<'END'
-Oops!
+	  if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
+	    archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	    # If the export-symbols file already is a .def file (1st line
+	    # is EXPORTS), use it as is; otherwise, prepend...
+	    archive_expsym_cmds_CXX='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
+	      cp $export_symbols $output_objdir/$soname.def;
+	    else
+	      echo EXPORTS > $output_objdir/$soname.def;
+	      cat $export_symbols >> $output_objdir/$soname.def;
+	    fi~
+	    $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	  else
+	    ld_shlibs_CXX=no
+	  fi
+	  ;;
+	esac
+	;;
+      darwin* | rhapsody*)
 
-Your 'rm' program seems unable to run without file operands specified
-on the command line, even when the '-f' option is present.  This is contrary
-to the behaviour of most rm programs out there, and not conforming with
-the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
 
-Please tell bug-automake at gnu.org about your system, including the value
-of your $PATH and any error possibly output before this message.  This
-can help us improve future automake versions.
+  archive_cmds_need_lc_CXX=no
+  hardcode_direct_CXX=no
+  hardcode_automatic_CXX=yes
+  hardcode_shlibpath_var_CXX=unsupported
+  if test "$lt_cv_ld_force_load" = "yes"; then
+    whole_archive_flag_spec_CXX='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
 
-END
-  if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
-    echo 'Configuration will proceed anyway, since you have set the' >&2
-    echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
-    echo >&2
   else
-    cat >&2 <<'END'
-Aborting the configuration process, to ensure you take notice of the issue.
-
-You can download and install GNU coreutils to get an 'rm' implementation
-that behaves properly: <http://www.gnu.org/software/coreutils/>.
-
-If you want to complete the configuration process using your problematic
-'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
-to "yes", and re-run configure.
-
-END
-    as_fn_error $? "Your 'rm' program is bad, sorry." "$LINENO" 5
+    whole_archive_flag_spec_CXX=''
   fi
-fi
-# Check whether --enable-silent-rules was given.
-if test "${enable_silent_rules+set}" = set; then :
-  enableval=$enable_silent_rules;
-fi
+  link_all_deplibs_CXX=yes
+  allow_undefined_flag_CXX="$_lt_dar_allow_undefined"
+  case $cc_basename in
+     ifort*) _lt_dar_can_shared=yes ;;
+     *) _lt_dar_can_shared=$GCC ;;
+  esac
+  if test "$_lt_dar_can_shared" = "yes"; then
+    output_verbose_link_cmd=func_echo_all
+    archive_cmds_CXX="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
+    module_cmds_CXX="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
+    archive_expsym_cmds_CXX="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
+    module_expsym_cmds_CXX="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
+       if test "$lt_cv_apple_cc_single_mod" != "yes"; then
+      archive_cmds_CXX="\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dsymutil}"
+      archive_expsym_cmds_CXX="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dar_export_syms}${_lt_dsymutil}"
+    fi
 
-case $enable_silent_rules in # (((
-  yes) AM_DEFAULT_VERBOSITY=0;;
-   no) AM_DEFAULT_VERBOSITY=1;;
-    *) AM_DEFAULT_VERBOSITY=0;;
-esac
-am_make=${MAKE-make}
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
-$as_echo_n "checking whether $am_make supports nested variables... " >&6; }
-if ${am_cv_make_support_nested_variables+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if $as_echo 'TRUE=$(BAR$(V))
-BAR0=false
-BAR1=true
-V=1
-am__doit:
-	@$(TRUE)
-.PHONY: am__doit' | $am_make -f - >/dev/null 2>&1; then
-  am_cv_make_support_nested_variables=yes
-else
-  am_cv_make_support_nested_variables=no
-fi
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5
-$as_echo "$am_cv_make_support_nested_variables" >&6; }
-if test $am_cv_make_support_nested_variables = yes; then
-    AM_V='$(V)'
-  AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)'
-else
-  AM_V=$AM_DEFAULT_VERBOSITY
-  AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY
-fi
-AM_BACKSLASH='\'
+  else
+  ld_shlibs_CXX=no
+  fi
 
+	;;
 
-# Platform
-case "${target_os}" in
-darwin*)
+      dgux*)
+        case $cc_basename in
+          ec++*)
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+          ghcx*)
+	    # Green Hills C++ Compiler
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+          *)
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+        esac
+        ;;
 
-$as_echo "#define DARWIN 1" >>confdefs.h
+      freebsd2.*)
+        # C++ shared libraries reported to be fairly broken before
+	# switch to ELF
+        ld_shlibs_CXX=no
+        ;;
 
-    darwin="yes"
-	;;
-linux*)
-	linux="yes"
-	;;
-freebsd*)
-	freebsd="yes"
-	;;
-esac
- if test x"$linux" = x"yes"; then
-  LINUX_TRUE=
-  LINUX_FALSE='#'
-else
-  LINUX_TRUE='#'
-  LINUX_FALSE=
-fi
+      freebsd-elf*)
+        archive_cmds_need_lc_CXX=no
+        ;;
 
- if test x"$freebsd" = x"yes"; then
-  FREEBSD_TRUE=
-  FREEBSD_FALSE='#'
-else
-  FREEBSD_TRUE='#'
-  FREEBSD_FALSE=
-fi
+      freebsd* | dragonfly*)
+        # FreeBSD 3 and later use GNU C++ and GNU ld with standard ELF
+        # conventions
+        ld_shlibs_CXX=yes
+        ;;
 
- if test x"$darwin" = x"yes"; then
-  DARWIN_TRUE=
-  DARWIN_FALSE='#'
-else
-  DARWIN_TRUE='#'
-  DARWIN_FALSE=
-fi
+      haiku*)
+        archive_cmds_CXX='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+        link_all_deplibs_CXX=yes
+        ;;
 
+      hpux9*)
+        hardcode_libdir_flag_spec_CXX='${wl}+b ${wl}$libdir'
+        hardcode_libdir_separator_CXX=:
+        export_dynamic_flag_spec_CXX='${wl}-E'
+        hardcode_direct_CXX=yes
+        hardcode_minus_L_CXX=yes # Not in the search PATH,
+				             # but as the default
+				             # location of the library.
 
-# Checks for programs.
-ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
-if test -z "$CXX"; then
-  if test -n "$CCC"; then
-    CXX=$CCC
-  else
-    if test -n "$ac_tool_prefix"; then
-  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
-  do
-    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
-set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CXX"; then
-  ac_cv_prog_CXX="$CXX" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
+        case $cc_basename in
+          CC*)
+            # FIXME: insert proper C++ library support
+            ld_shlibs_CXX=no
+            ;;
+          aCC*)
+            archive_cmds_CXX='$RM $output_objdir/$soname~$CC -b ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+            # Commands to make compiler produce verbose output that lists
+            # what "hidden" libraries, object files and flags are used when
+            # linking a shared library.
+            #
+            # There doesn't appear to be a way to prevent this compiler from
+            # explicitly linking system object files so we need to strip them
+            # from the output so that they don't get included in the library
+            # dependencies.
+            output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+            ;;
+          *)
+            if test "$GXX" = yes; then
+              archive_cmds_CXX='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+            else
+              # FIXME: insert proper C++ library support
+              ld_shlibs_CXX=no
+            fi
+            ;;
+        esac
+        ;;
 
-fi
-fi
-CXX=$ac_cv_prog_CXX
-if test -n "$CXX"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
-$as_echo "$CXX" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
+      hpux10*|hpux11*)
+        if test $with_gnu_ld = no; then
+	  hardcode_libdir_flag_spec_CXX='${wl}+b ${wl}$libdir'
+	  hardcode_libdir_separator_CXX=:
 
+          case $host_cpu in
+            hppa*64*|ia64*)
+              ;;
+            *)
+	      export_dynamic_flag_spec_CXX='${wl}-E'
+              ;;
+          esac
+        fi
+        case $host_cpu in
+          hppa*64*|ia64*)
+            hardcode_direct_CXX=no
+            hardcode_shlibpath_var_CXX=no
+            ;;
+          *)
+            hardcode_direct_CXX=yes
+            hardcode_direct_absolute_CXX=yes
+            hardcode_minus_L_CXX=yes # Not in the search PATH,
+					         # but as the default
+					         # location of the library.
+            ;;
+        esac
 
-    test -n "$CXX" && break
-  done
-fi
-if test -z "$CXX"; then
-  ac_ct_CXX=$CXX
-  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_CXX"; then
-  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_ac_ct_CXX="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
+        case $cc_basename in
+          CC*)
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+          aCC*)
+	    case $host_cpu in
+	      hppa*64*)
+	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        ;;
+	      ia64*)
+	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        ;;
+	      *)
+	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        ;;
+	    esac
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	    ;;
+          *)
+	    if test "$GXX" = yes; then
+	      if test $with_gnu_ld = no; then
+	        case $host_cpu in
+	          hppa*64*)
+	            archive_cmds_CXX='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            ;;
+	          ia64*)
+	            archive_cmds_CXX='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            ;;
+	          *)
+	            archive_cmds_CXX='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            ;;
+	        esac
+	      fi
+	    else
+	      # FIXME: insert proper C++ library support
+	      ld_shlibs_CXX=no
+	    fi
+	    ;;
+        esac
+        ;;
 
-fi
-fi
-ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
-if test -n "$ac_ct_CXX"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5
-$as_echo "$ac_ct_CXX" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
+      interix[3-9]*)
+	hardcode_direct_CXX=no
+	hardcode_shlibpath_var_CXX=no
+	hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
+	export_dynamic_flag_spec_CXX='${wl}-E'
+	# Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
+	# Instead, shared libraries are loaded at an image base (0x10000000 by
+	# default) and relocated if they conflict, which is a slow very memory
+	# consuming and fragmenting process.  To avoid this, we pick a random,
+	# 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
+	# time.  Moving up from 0x10000000 also allows more sbrk(2) space.
+	archive_cmds_CXX='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+	archive_expsym_cmds_CXX='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+	;;
+      irix5* | irix6*)
+        case $cc_basename in
+          CC*)
+	    # SGI C++
+	    archive_cmds_CXX='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
 
+	    # Archives containing C++ object files must be created using
+	    # "CC -ar", where "CC" is the IRIX C++ compiler.  This is
+	    # necessary to make sure instantiated templates are included
+	    # in the archive.
+	    old_archive_cmds_CXX='$CC -ar -WR,-u -o $oldlib $oldobjs'
+	    ;;
+          *)
+	    if test "$GXX" = yes; then
+	      if test "$with_gnu_ld" = no; then
+	        archive_cmds_CXX='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	      else
+	        archive_cmds_CXX='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` -o $lib'
+	      fi
+	    fi
+	    link_all_deplibs_CXX=yes
+	    ;;
+        esac
+        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
+        hardcode_libdir_separator_CXX=:
+        inherit_rpath_CXX=yes
+        ;;
 
-  test -n "$ac_ct_CXX" && break
-done
+      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+        case $cc_basename in
+          KCC*)
+	    # Kuck and Associates, Inc. (KAI) C++ Compiler
 
-  if test "x$ac_ct_CXX" = x; then
-    CXX="g++"
-  else
-    case $cross_compiling:$ac_tool_warned in
-yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
-ac_tool_warned=yes ;;
-esac
-    CXX=$ac_ct_CXX
-  fi
-fi
+	    # KCC will only create a shared library if the output file
+	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
+	    # to its proper name (with version) after linking.
+	    archive_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
+	    archive_expsym_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib ${wl}-retain-symbols-file,$export_symbols; mv \$templib $lib'
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
 
-  fi
-fi
-# Provide some information about the compiler.
-$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5
-set X $ac_compile
-ac_compiler=$2
-for ac_option in --version -v -V -qversion; do
-  { { ac_try="$ac_compiler $ac_option >&5"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    sed '10a\
-... rest of stderr output deleted ...
-         10q' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-  fi
-  rm -f conftest.er1 conftest.err
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-done
+	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
+	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5
-$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; }
-if ${ac_cv_cxx_compiler_gnu+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
+	    # Archives containing C++ object files must be created using
+	    # "CC -Bstatic", where "CC" is the KAI C++ compiler.
+	    old_archive_cmds_CXX='$CC -Bstatic -o $oldlib $oldobjs'
+	    ;;
+	  icpc* | ecpc* )
+	    # Intel C++
+	    with_gnu_ld=yes
+	    # version 8.0 and above of icpc choke on multiply defined symbols
+	    # if we add $predep_objects and $postdep_objects, however 7.1 and
+	    # earlier do not add the objects themselves.
+	    case `$CC -V 2>&1` in
+	      *"Version 7."*)
+	        archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
+		archive_expsym_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+		;;
+	      *)  # Version 8.0 or newer
+	        tmp_idyn=
+	        case $host_cpu in
+		  ia64*) tmp_idyn=' -i_dynamic';;
+		esac
+	        archive_cmds_CXX='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+		archive_expsym_cmds_CXX='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+		;;
+	    esac
+	    archive_cmds_need_lc_CXX=no
+	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
+	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
+	    whole_archive_flag_spec_CXX='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	    ;;
+          pgCC* | pgcpp*)
+            # Portland Group C++ compiler
+	    case `$CC -V` in
+	    *pgCC\ [1-5].* | *pgcpp\ [1-5].*)
+	      prelink_cmds_CXX='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~
+		compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"'
+	      old_archive_cmds_CXX='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~
+		$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~
+		$RANLIB $oldlib'
+	      archive_cmds_CXX='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
+		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
+	      archive_expsym_cmds_CXX='tpldir=Template.dir~
+		rm -rf $tpldir~
+		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
+		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
+	      ;;
+	    *) # Version 6 and above use weak symbols
+	      archive_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
+	      archive_expsym_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
+	      ;;
+	    esac
 
-int
-main ()
-{
-#ifndef __GNUC__
-       choke me
-#endif
+	    hardcode_libdir_flag_spec_CXX='${wl}--rpath ${wl}$libdir'
+	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
+	    whole_archive_flag_spec_CXX='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+            ;;
+	  cxx*)
+	    # Compaq C++
+	    archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	    archive_expsym_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname  -o $lib ${wl}-retain-symbols-file $wl$export_symbols'
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-  ac_compiler_gnu=yes
-else
-  ac_compiler_gnu=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
+	    runpath_var=LD_RUN_PATH
+	    hardcode_libdir_flag_spec_CXX='-rpath $libdir'
+	    hardcode_libdir_separator_CXX=:
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5
-$as_echo "$ac_cv_cxx_compiler_gnu" >&6; }
-if test $ac_compiler_gnu = yes; then
-  GXX=yes
-else
-  GXX=
-fi
-ac_test_CXXFLAGS=${CXXFLAGS+set}
-ac_save_CXXFLAGS=$CXXFLAGS
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5
-$as_echo_n "checking whether $CXX accepts -g... " >&6; }
-if ${ac_cv_prog_cxx_g+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_save_cxx_werror_flag=$ac_cxx_werror_flag
-   ac_cxx_werror_flag=yes
-   ac_cv_prog_cxx_g=no
-   CXXFLAGS="-g"
-   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed'
+	    ;;
+	  xl* | mpixl* | bgxl*)
+	    # IBM XL 8.0 on PPC, with GNU ld
+	    hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
+	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
+	    archive_cmds_CXX='$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	    if test "x$supports_anon_versioning" = xyes; then
+	      archive_expsym_cmds_CXX='echo "{ global:" > $output_objdir/$libname.ver~
+		cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+		echo "local: *; };" >> $output_objdir/$libname.ver~
+		$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
+	    fi
+	    ;;
+	  *)
+	    case `$CC -V 2>&1 | sed 5q` in
+	    *Sun\ C*)
+	      # Sun C++ 5.9
+	      no_undefined_flag_CXX=' -zdefs'
+	      archive_cmds_CXX='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	      archive_expsym_cmds_CXX='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file ${wl}$export_symbols'
+	      hardcode_libdir_flag_spec_CXX='-R$libdir'
+	      whole_archive_flag_spec_CXX='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	      compiler_needs_object_CXX=yes
 
-int
-main ()
-{
+	      # Not sure whether something based on
+	      # $CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1
+	      # would be better.
+	      output_verbose_link_cmd='func_echo_all'
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-  ac_cv_prog_cxx_g=yes
-else
-  CXXFLAGS=""
-      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
+	      # Archives containing C++ object files must be created using
+	      # "CC -xar", where "CC" is the Sun C++ compiler.  This is
+	      # necessary to make sure instantiated templates are included
+	      # in the archive.
+	      old_archive_cmds_CXX='$CC -xar -o $oldlib $oldobjs'
+	      ;;
+	    esac
+	    ;;
+	esac
+	;;
 
-int
-main ()
-{
+      lynxos*)
+        # FIXME: insert proper C++ library support
+	ld_shlibs_CXX=no
+	;;
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
+      m88k*)
+        # FIXME: insert proper C++ library support
+        ld_shlibs_CXX=no
+	;;
 
-else
-  ac_cxx_werror_flag=$ac_save_cxx_werror_flag
-	 CXXFLAGS="-g"
-	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
+      mvs*)
+        case $cc_basename in
+          cxx*)
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+	  *)
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+	esac
+	;;
 
-int
-main ()
-{
+      netbsd*)
+        if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+	  archive_cmds_CXX='$LD -Bshareable  -o $lib $predep_objects $libobjs $deplibs $postdep_objects $linker_flags'
+	  wlarc=
+	  hardcode_libdir_flag_spec_CXX='-R$libdir'
+	  hardcode_direct_CXX=yes
+	  hardcode_shlibpath_var_CXX=no
+	fi
+	# Workaround some broken pre-1.5 toolchains
+	output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP conftest.$objext | $SED -e "s:-lgcc -lc -lgcc::"'
+	;;
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-  ac_cv_prog_cxx_g=yes
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-   ac_cxx_werror_flag=$ac_save_cxx_werror_flag
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5
-$as_echo "$ac_cv_prog_cxx_g" >&6; }
-if test "$ac_test_CXXFLAGS" = set; then
-  CXXFLAGS=$ac_save_CXXFLAGS
-elif test $ac_cv_prog_cxx_g = yes; then
-  if test "$GXX" = yes; then
-    CXXFLAGS="-g -O2"
-  else
-    CXXFLAGS="-g"
-  fi
-else
-  if test "$GXX" = yes; then
-    CXXFLAGS="-O2"
-  else
-    CXXFLAGS=
-  fi
-fi
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
+      *nto* | *qnx*)
+        ld_shlibs_CXX=yes
+	;;
 
-depcc="$CXX"  am_compiler_list=
+      openbsd2*)
+        # C++ shared libraries are fairly broken
+	ld_shlibs_CXX=no
+	;;
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
-$as_echo_n "checking dependency style of $depcc... " >&6; }
-if ${am_cv_CXX_dependencies_compiler_type+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named 'D' -- because '-MD' means "put the output
-  # in D".
-  rm -rf conftest.dir
-  mkdir conftest.dir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" conftest.dir
-  cd conftest.dir
-  # We will build objects and dependencies in a subdirectory because
-  # it helps to detect inapplicable dependency modes.  For instance
-  # both Tru64's cc and ICC support -MD to output dependencies as a
-  # side effect of compilation, but ICC will put the dependencies in
-  # the current directory while Tru64 will put them in the object
-  # directory.
-  mkdir sub
+      openbsd*)
+	if test -f /usr/libexec/ld.so; then
+	  hardcode_direct_CXX=yes
+	  hardcode_shlibpath_var_CXX=no
+	  hardcode_direct_absolute_CXX=yes
+	  archive_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
+	  hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
+	  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+	    archive_expsym_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file,$export_symbols -o $lib'
+	    export_dynamic_flag_spec_CXX='${wl}-E'
+	    whole_archive_flag_spec_CXX="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+	  fi
+	  output_verbose_link_cmd=func_echo_all
+	else
+	  ld_shlibs_CXX=no
+	fi
+	;;
 
-  am_cv_CXX_dependencies_compiler_type=none
-  if test "$am_compiler_list" = ""; then
-     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
-  fi
-  am__universal=false
-  case " $depcc " in #(
-     *\ -arch\ *\ -arch\ *) am__universal=true ;;
-     esac
+      osf3* | osf4* | osf5*)
+        case $cc_basename in
+          KCC*)
+	    # Kuck and Associates, Inc. (KAI) C++ Compiler
 
-  for depmode in $am_compiler_list; do
-    # Setup a source with many dependencies, because some compilers
-    # like to wrap large dependency lists on column 80 (with \), and
-    # we should not choose a depcomp mode which is confused by this.
-    #
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    : > sub/conftest.c
-    for i in 1 2 3 4 5 6; do
-      echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
-      # Solaris 10 /bin/sh.
-      echo '/* dummy */' > sub/conftst$i.h
-    done
-    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+	    # KCC will only create a shared library if the output file
+	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
+	    # to its proper name (with version) after linking.
+	    archive_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
 
-    # We check with '-c' and '-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle '-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs.
-    am__obj=sub/conftest.${OBJEXT-o}
-    am__minus_obj="-o $am__obj"
-    case $depmode in
-    gcc)
-      # This depmode causes a compiler race in universal mode.
-      test "$am__universal" = false || continue
-      ;;
-    nosideeffect)
-      # After this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested.
-      if test "x$enable_dependency_tracking" = xyes; then
-	continue
-      else
-	break
-      fi
-      ;;
-    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok '-c -o', but also, the minuso test has
-      # not run yet.  These depmodes are late enough in the game, and
-      # so weak that their functioning should not be impacted.
-      am__obj=conftest.${OBJEXT-o}
-      am__minus_obj=
-      ;;
-    none) break ;;
-    esac
-    if depmode=$depmode \
-       source=sub/conftest.c object=$am__obj \
-       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
-       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
-         >/dev/null 2>conftest.err &&
-       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
-       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
-      # icc doesn't choke on unknown options, it will just issue warnings
-      # or remarks (even with -Werror).  So we grep stderr for any message
-      # that says an option was ignored or not supported.
-      # When given -MP, icc 7.0 and 7.1 complain thusly:
-      #   icc: Command line warning: ignoring option '-M'; no argument required
-      # The diagnosis changed in icc 8.0:
-      #   icc: Command line remark: option '-MP' not supported
-      if (grep 'ignoring option' conftest.err ||
-          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
-        am_cv_CXX_dependencies_compiler_type=$depmode
-        break
-      fi
-    fi
-  done
-
-  cd ..
-  rm -rf conftest.dir
-else
-  am_cv_CXX_dependencies_compiler_type=none
-fi
+	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
+	    hardcode_libdir_separator_CXX=:
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5
-$as_echo "$am_cv_CXX_dependencies_compiler_type" >&6; }
-CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type
+	    # Archives containing C++ object files must be created using
+	    # the KAI C++ compiler.
+	    case $host in
+	      osf3*) old_archive_cmds_CXX='$CC -Bstatic -o $oldlib $oldobjs' ;;
+	      *) old_archive_cmds_CXX='$CC -o $oldlib $oldobjs' ;;
+	    esac
+	    ;;
+          RCC*)
+	    # Rational C++ 2.4.1
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+          cxx*)
+	    case $host in
+	      osf3*)
+	        allow_undefined_flag_CXX=' ${wl}-expect_unresolved ${wl}\*'
+	        archive_cmds_CXX='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $soname `test -n "$verstring" && func_echo_all "${wl}-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
+		;;
+	      *)
+	        allow_undefined_flag_CXX=' -expect_unresolved \*'
+	        archive_cmds_CXX='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	        archive_expsym_cmds_CXX='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done~
+	          echo "-hidden">> $lib.exp~
+	          $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname ${wl}-input ${wl}$lib.exp  `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~
+	          $RM $lib.exp'
+	        hardcode_libdir_flag_spec_CXX='-rpath $libdir'
+		;;
+	    esac
 
- if
-  test "x$enable_dependency_tracking" != xno \
-  && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then
-  am__fastdepCXX_TRUE=
-  am__fastdepCXX_FALSE='#'
-else
-  am__fastdepCXX_TRUE='#'
-  am__fastdepCXX_FALSE=
-fi
+	    hardcode_libdir_separator_CXX=:
 
+	    # Commands to make compiler produce verbose output that lists
+	    # what "hidden" libraries, object files and flags are used when
+	    # linking a shared library.
+	    #
+	    # There doesn't appear to be a way to prevent this compiler from
+	    # explicitly linking system object files so we need to strip them
+	    # from the output so that they don't get included in the library
+	    # dependencies.
+	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	    ;;
+	  *)
+	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
+	      allow_undefined_flag_CXX=' ${wl}-expect_unresolved ${wl}\*'
+	      case $host in
+	        osf3*)
+	          archive_cmds_CXX='$CC -shared -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+		  ;;
+	        *)
+	          archive_cmds_CXX='$CC -shared $pic_flag -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+		  ;;
+	      esac
 
+	      hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
+	      hardcode_libdir_separator_CXX=:
 
+	      # Commands to make compiler produce verbose output that lists
+	      # what "hidden" libraries, object files and flags are used when
+	      # linking a shared library.
+	      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
 
-func_stripname_cnf ()
-{
-  case ${2} in
-  .*) func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%\\\\${2}\$%%"`;;
-  *)  func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%${2}\$%%"`;;
-  esac
-} # func_stripname_cnf
+	    else
+	      # FIXME: insert proper C++ library support
+	      ld_shlibs_CXX=no
+	    fi
+	    ;;
+        esac
+        ;;
 
-      if test -n "$CXX" && ( test "X$CXX" != "Xno" &&
-    ( (test "X$CXX" = "Xg++" && `g++ -v >/dev/null 2>&1` ) ||
-    (test "X$CXX" != "Xg++"))) ; then
-  ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C++ preprocessor" >&5
-$as_echo_n "checking how to run the C++ preprocessor... " >&6; }
-if test -z "$CXXCPP"; then
-  if ${ac_cv_prog_CXXCPP+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-      # Double quotes because CXXCPP needs to be expanded
-    for CXXCPP in "$CXX -E" "/lib/cpp"
-    do
-      ac_preproc_ok=false
-for ac_cxx_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
+      psos*)
+        # FIXME: insert proper C++ library support
+        ld_shlibs_CXX=no
+        ;;
 
-else
-  # Broken: fails on valid input.
-continue
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
+      sunos4*)
+        case $cc_basename in
+          CC*)
+	    # Sun C++ 4.x
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+          lcc*)
+	    # Lucid
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+          *)
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+        esac
+        ;;
 
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-  # Broken: success on invalid input.
-continue
-else
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
+      solaris*)
+        case $cc_basename in
+          CC* | sunCC*)
+	    # Sun C++ 4.2, 5.x and Centerline C++
+            archive_cmds_need_lc_CXX=yes
+	    no_undefined_flag_CXX=' -zdefs'
+	    archive_cmds_CXX='$CC -G${allow_undefined_flag}  -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	    archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+	      $CC -G${allow_undefined_flag} ${wl}-M ${wl}$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
 
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
-  break
-fi
+	    hardcode_libdir_flag_spec_CXX='-R$libdir'
+	    hardcode_shlibpath_var_CXX=no
+	    case $host_os in
+	      solaris2.[0-5] | solaris2.[0-5].*) ;;
+	      *)
+		# The compiler driver will combine and reorder linker options,
+		# but understands `-z linker_flag'.
+	        # Supported since Solaris 2.6 (maybe 2.5.1?)
+		whole_archive_flag_spec_CXX='-z allextract$convenience -z defaultextract'
+	        ;;
+	    esac
+	    link_all_deplibs_CXX=yes
 
-    done
-    ac_cv_prog_CXXCPP=$CXXCPP
+	    output_verbose_link_cmd='func_echo_all'
 
-fi
-  CXXCPP=$ac_cv_prog_CXXCPP
-else
-  ac_cv_prog_CXXCPP=$CXXCPP
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXXCPP" >&5
-$as_echo "$CXXCPP" >&6; }
-ac_preproc_ok=false
-for ac_cxx_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-
-else
-  # Broken: fails on valid input.
-continue
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
+	    # Archives containing C++ object files must be created using
+	    # "CC -xar", where "CC" is the Sun C++ compiler.  This is
+	    # necessary to make sure instantiated templates are included
+	    # in the archive.
+	    old_archive_cmds_CXX='$CC -xar -o $oldlib $oldobjs'
+	    ;;
+          gcx*)
+	    # Green Hills C++ Compiler
+	    archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
 
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-  # Broken: success on invalid input.
-continue
-else
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
+	    # The C++ compiler must be used to create the archive.
+	    old_archive_cmds_CXX='$CC $LDFLAGS -archive -o $oldlib $oldobjs'
+	    ;;
+          *)
+	    # GNU C++ compiler with Solaris linker
+	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
+	      no_undefined_flag_CXX=' ${wl}-z ${wl}defs'
+	      if $CC --version | $GREP -v '^2\.7' > /dev/null; then
+	        archive_cmds_CXX='$CC -shared $pic_flag -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+	        archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+		  $CC -shared $pic_flag -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
 
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
+	        # Commands to make compiler produce verbose output that lists
+	        # what "hidden" libraries, object files and flags are used when
+	        # linking a shared library.
+	        output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
+	      else
+	        # g++ 2.7 appears to require `-G' NOT `-shared' on this
+	        # platform.
+	        archive_cmds_CXX='$CC -G -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+	        archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
+		  $CC -G -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
 
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "C++ preprocessor \"$CXXCPP\" fails sanity check
-See \`config.log' for more details" "$LINENO" 5; }
-fi
+	        # Commands to make compiler produce verbose output that lists
+	        # what "hidden" libraries, object files and flags are used when
+	        # linking a shared library.
+	        output_verbose_link_cmd='$CC -G $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
+	      fi
 
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
+	      hardcode_libdir_flag_spec_CXX='${wl}-R $wl$libdir'
+	      case $host_os in
+		solaris2.[0-5] | solaris2.[0-5].*) ;;
+		*)
+		  whole_archive_flag_spec_CXX='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
+		  ;;
+	      esac
+	    fi
+	    ;;
+        esac
+        ;;
 
-else
-  _lt_caught_CXX_error=yes
-fi
+    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[01].[10]* | unixware7* | sco3.2v5.0.[024]*)
+      no_undefined_flag_CXX='${wl}-z,text'
+      archive_cmds_need_lc_CXX=no
+      hardcode_shlibpath_var_CXX=no
+      runpath_var='LD_RUN_PATH'
 
-ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+      case $cc_basename in
+        CC*)
+	  archive_cmds_CXX='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  archive_expsym_cmds_CXX='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+	*)
+	  archive_cmds_CXX='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  archive_expsym_cmds_CXX='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  ;;
+      esac
+      ;;
 
-archive_cmds_need_lc_CXX=no
-allow_undefined_flag_CXX=
-always_export_symbols_CXX=no
-archive_expsym_cmds_CXX=
-compiler_needs_object_CXX=no
-export_dynamic_flag_spec_CXX=
-hardcode_direct_CXX=no
-hardcode_direct_absolute_CXX=no
-hardcode_libdir_flag_spec_CXX=
-hardcode_libdir_separator_CXX=
-hardcode_minus_L_CXX=no
-hardcode_shlibpath_var_CXX=unsupported
-hardcode_automatic_CXX=no
-inherit_rpath_CXX=no
-module_cmds_CXX=
-module_expsym_cmds_CXX=
-link_all_deplibs_CXX=unknown
-old_archive_cmds_CXX=$old_archive_cmds
-reload_flag_CXX=$reload_flag
-reload_cmds_CXX=$reload_cmds
-no_undefined_flag_CXX=
-whole_archive_flag_spec_CXX=
-enable_shared_with_static_runtimes_CXX=no
+      sysv5* | sco3.2v5* | sco5v6*)
+	# Note: We can NOT use -z defs as we might desire, because we do not
+	# link with -lc, and that would cause any symbols used from libc to
+	# always be unresolved, which means just about no library would
+	# ever link correctly.  If we're not using GNU ld we use -z text
+	# though, which does catch some bad symbols but isn't as heavy-handed
+	# as -z defs.
+	no_undefined_flag_CXX='${wl}-z,text'
+	allow_undefined_flag_CXX='${wl}-z,nodefs'
+	archive_cmds_need_lc_CXX=no
+	hardcode_shlibpath_var_CXX=no
+	hardcode_libdir_flag_spec_CXX='${wl}-R,$libdir'
+	hardcode_libdir_separator_CXX=':'
+	link_all_deplibs_CXX=yes
+	export_dynamic_flag_spec_CXX='${wl}-Bexport'
+	runpath_var='LD_RUN_PATH'
 
-# Source file extension for C++ test sources.
-ac_ext=cpp
+	case $cc_basename in
+          CC*)
+	    archive_cmds_CXX='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    archive_expsym_cmds_CXX='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    old_archive_cmds_CXX='$CC -Tprelink_objects $oldobjs~
+	      '"$old_archive_cmds_CXX"
+	    reload_cmds_CXX='$CC -Tprelink_objects $reload_objs~
+	      '"$reload_cmds_CXX"
+	    ;;
+	  *)
+	    archive_cmds_CXX='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    archive_expsym_cmds_CXX='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    ;;
+	esac
+      ;;
 
-# Object file extension for compiled C++ test sources.
-objext=o
-objext_CXX=$objext
+      tandem*)
+        case $cc_basename in
+          NCC*)
+	    # NonStop-UX NCC 3.20
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+          *)
+	    # FIXME: insert proper C++ library support
+	    ld_shlibs_CXX=no
+	    ;;
+        esac
+        ;;
 
-# No sense in running all these tests if we already determined that
-# the CXX compiler isn't working.  Some variables (like enable_shared)
-# are currently assumed to apply to all compilers on this platform,
-# and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_caught_CXX_error" != yes; then
-  # Code to be used in simple compile tests
-  lt_simple_compile_test_code="int some_variable = 0;"
+      vxworks*)
+        # FIXME: insert proper C++ library support
+        ld_shlibs_CXX=no
+        ;;
 
-  # Code to be used in simple link tests
-  lt_simple_link_test_code='int main(int, char *[]) { return(0); }'
+      *)
+        # FIXME: insert proper C++ library support
+        ld_shlibs_CXX=no
+        ;;
+    esac
 
-  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs_CXX" >&5
+$as_echo "$ld_shlibs_CXX" >&6; }
+    test "$ld_shlibs_CXX" = no && can_build_shared=no
 
+    GCC_CXX="$GXX"
+    LD_CXX="$LD"
 
+    ## CAVEAT EMPTOR:
+    ## There is no encapsulation within the following macros, do not change
+    ## the running order or otherwise move them around unless you know exactly
+    ## what you are doing...
+    # Dependencies to place before and after the object being linked:
+predep_objects_CXX=
+postdep_objects_CXX=
+predeps_CXX=
+postdeps_CXX=
+compiler_lib_search_path_CXX=
 
+cat > conftest.$ac_ext <<_LT_EOF
+class Foo
+{
+public:
+  Foo (void) { a = 0; }
+private:
+  int a;
+};
+_LT_EOF
 
 
+_lt_libdeps_save_CFLAGS=$CFLAGS
+case "$CC $CFLAGS " in #(
+*\ -flto*\ *) CFLAGS="$CFLAGS -fno-lto" ;;
+*\ -fwhopr*\ *) CFLAGS="$CFLAGS -fno-whopr" ;;
+*\ -fuse-linker-plugin*\ *) CFLAGS="$CFLAGS -fno-use-linker-plugin" ;;
+esac
 
-# If no C compiler was specified, use CC.
-LTCC=${LTCC-"$CC"}
+if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  # Parse the compiler output and extract the necessary
+  # objects, libraries and library flags.
 
-# If no C compiler flags were specified, use CFLAGS.
-LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
+  # Sentinel used to keep track of whether or not we are before
+  # the conftest object file.
+  pre_test_object_deps_done=no
 
-# Allow CC to be a program name with arguments.
-compiler=$CC
+  for p in `eval "$output_verbose_link_cmd"`; do
+    case ${prev}${p} in
 
+    -L* | -R* | -l*)
+       # Some compilers place space between "-{L,R}" and the path.
+       # Remove the space.
+       if test $p = "-L" ||
+          test $p = "-R"; then
+	 prev=$p
+	 continue
+       fi
 
-  # save warnings/boilerplate of simple test code
-  ac_outfile=conftest.$ac_objext
-echo "$lt_simple_compile_test_code" >conftest.$ac_ext
-eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
-_lt_compiler_boilerplate=`cat conftest.err`
-$RM conftest*
+       # Expand the sysroot to ease extracting the directories later.
+       if test -z "$prev"; then
+         case $p in
+         -L*) func_stripname_cnf '-L' '' "$p"; prev=-L; p=$func_stripname_result ;;
+         -R*) func_stripname_cnf '-R' '' "$p"; prev=-R; p=$func_stripname_result ;;
+         -l*) func_stripname_cnf '-l' '' "$p"; prev=-l; p=$func_stripname_result ;;
+         esac
+       fi
+       case $p in
+       =*) func_stripname_cnf '=' '' "$p"; p=$lt_sysroot$func_stripname_result ;;
+       esac
+       if test "$pre_test_object_deps_done" = no; then
+	 case ${prev} in
+	 -L | -R)
+	   # Internal compiler library paths should come after those
+	   # provided the user.  The postdeps already come after the
+	   # user supplied libs so there is no need to process them.
+	   if test -z "$compiler_lib_search_path_CXX"; then
+	     compiler_lib_search_path_CXX="${prev}${p}"
+	   else
+	     compiler_lib_search_path_CXX="${compiler_lib_search_path_CXX} ${prev}${p}"
+	   fi
+	   ;;
+	 # The "-l" case would never come before the object being
+	 # linked, so don't bother handling this case.
+	 esac
+       else
+	 if test -z "$postdeps_CXX"; then
+	   postdeps_CXX="${prev}${p}"
+	 else
+	   postdeps_CXX="${postdeps_CXX} ${prev}${p}"
+	 fi
+       fi
+       prev=
+       ;;
 
-  ac_outfile=conftest.$ac_objext
-echo "$lt_simple_link_test_code" >conftest.$ac_ext
-eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
-_lt_linker_boilerplate=`cat conftest.err`
-$RM -r conftest*
+    *.lto.$objext) ;; # Ignore GCC LTO objects
+    *.$objext)
+       # This assumes that the test object file only shows up
+       # once in the compiler output.
+       if test "$p" = "conftest.$objext"; then
+	 pre_test_object_deps_done=yes
+	 continue
+       fi
 
+       if test "$pre_test_object_deps_done" = no; then
+	 if test -z "$predep_objects_CXX"; then
+	   predep_objects_CXX="$p"
+	 else
+	   predep_objects_CXX="$predep_objects_CXX $p"
+	 fi
+       else
+	 if test -z "$postdep_objects_CXX"; then
+	   postdep_objects_CXX="$p"
+	 else
+	   postdep_objects_CXX="$postdep_objects_CXX $p"
+	 fi
+       fi
+       ;;
 
-  # Allow CC to be a program name with arguments.
-  lt_save_CC=$CC
-  lt_save_CFLAGS=$CFLAGS
-  lt_save_LD=$LD
-  lt_save_GCC=$GCC
-  GCC=$GXX
-  lt_save_with_gnu_ld=$with_gnu_ld
-  lt_save_path_LD=$lt_cv_path_LD
-  if test -n "${lt_cv_prog_gnu_ldcxx+set}"; then
-    lt_cv_prog_gnu_ld=$lt_cv_prog_gnu_ldcxx
-  else
-    $as_unset lt_cv_prog_gnu_ld
-  fi
-  if test -n "${lt_cv_path_LDCXX+set}"; then
-    lt_cv_path_LD=$lt_cv_path_LDCXX
-  else
-    $as_unset lt_cv_path_LD
-  fi
-  test -z "${LDCXX+set}" || LD=$LDCXX
-  CC=${CXX-"c++"}
-  CFLAGS=$CXXFLAGS
-  compiler=$CC
-  compiler_CXX=$CC
-  for cc_temp in $compiler""; do
-  case $cc_temp in
-    compile | *[\\/]compile | ccache | *[\\/]ccache ) ;;
-    distcc | *[\\/]distcc | purify | *[\\/]purify ) ;;
-    \-*) ;;
-    *) break;;
-  esac
-done
-cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
+    *) ;; # Ignore the rest.
 
+    esac
+  done
 
-  if test -n "$compiler"; then
-    # We don't want -fno-exception when compiling C++ code, so set the
-    # no_builtin_flag separately
-    if test "$GXX" = yes; then
-      lt_prog_compiler_no_builtin_flag_CXX=' -fno-builtin'
-    else
-      lt_prog_compiler_no_builtin_flag_CXX=
-    fi
+  # Clean up.
+  rm -f a.out a.exe
+else
+  echo "libtool.m4: error: problem compiling CXX test program"
+fi
 
-    if test "$GXX" = yes; then
-      # Set up default GNU C++ configuration
+$RM -f confest.$objext
+CFLAGS=$_lt_libdeps_save_CFLAGS
 
+# PORTME: override above test on systems where it is broken
+case $host_os in
+interix[3-9]*)
+  # Interix 3.5 installs completely hosed .la files for C++, so rather than
+  # hack all around it, let's just trust "g++" to DTRT.
+  predep_objects_CXX=
+  postdep_objects_CXX=
+  postdeps_CXX=
+  ;;
 
+linux*)
+  case `$CC -V 2>&1 | sed 5q` in
+  *Sun\ C*)
+    # Sun C++ 5.9
 
-# Check whether --with-gnu-ld was given.
-if test "${with_gnu_ld+set}" = set; then :
-  withval=$with_gnu_ld; test "$withval" = no || with_gnu_ld=yes
-else
-  with_gnu_ld=no
-fi
+    # The more standards-conforming stlport4 library is
+    # incompatible with the Cstd library. Avoid specifying
+    # it if it's in CXXFLAGS. Ignore libCrun as
+    # -library=stlport4 depends on it.
+    case " $CXX $CXXFLAGS " in
+    *" -library=stlport4 "*)
+      solaris_use_stlport4=yes
+      ;;
+    esac
 
-ac_prog=ld
-if test "$GCC" = yes; then
-  # Check if gcc -print-prog-name=ld gives a path.
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ld used by $CC" >&5
-$as_echo_n "checking for ld used by $CC... " >&6; }
-  case $host in
-  *-*-mingw*)
-    # gcc leaves a trailing carriage return which upsets mingw
-    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
-  *)
-    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+    if test "$solaris_use_stlport4" != yes; then
+      postdeps_CXX='-library=Cstd -library=Crun'
+    fi
+    ;;
   esac
-  case $ac_prog in
-    # Accept absolute paths.
-    [\\/]* | ?:[\\/]*)
-      re_direlt='/[^/][^/]*/\.\./'
-      # Canonicalize the pathname of ld
-      ac_prog=`$ECHO "$ac_prog"| $SED 's%\\\\%/%g'`
-      while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do
-	ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"`
-      done
-      test -z "$LD" && LD="$ac_prog"
+  ;;
+
+solaris*)
+  case $cc_basename in
+  CC* | sunCC*)
+    # The more standards-conforming stlport4 library is
+    # incompatible with the Cstd library. Avoid specifying
+    # it if it's in CXXFLAGS. Ignore libCrun as
+    # -library=stlport4 depends on it.
+    case " $CXX $CXXFLAGS " in
+    *" -library=stlport4 "*)
+      solaris_use_stlport4=yes
       ;;
-  "")
-    # If it fails, then pretend we aren't using GCC.
-    ac_prog=ld
-    ;;
-  *)
-    # If it is relative, then search for the first ld in PATH.
-    with_gnu_ld=unknown
+    esac
+
+    # Adding this requires a known-good setup of shared libraries for
+    # Sun compiler versions before 5.6, else PIC objects from an old
+    # archive will be linked into the output, leading to subtle bugs.
+    if test "$solaris_use_stlport4" != yes; then
+      postdeps_CXX='-library=Cstd -library=Crun'
+    fi
     ;;
   esac
-elif test "$with_gnu_ld" = yes; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU ld" >&5
-$as_echo_n "checking for GNU ld... " >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for non-GNU ld" >&5
-$as_echo_n "checking for non-GNU ld... " >&6; }
-fi
-if ${lt_cv_path_LD+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -z "$LD"; then
-  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
-  for ac_dir in $PATH; do
-    IFS="$lt_save_ifs"
-    test -z "$ac_dir" && ac_dir=.
-    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
-      lt_cv_path_LD="$ac_dir/$ac_prog"
-      # Check to see if the program is GNU ld.  I'd rather use --version,
-      # but apparently some variants of GNU ld only accept -v.
-      # Break only if it was the GNU/non-GNU ld that we prefer.
-      case `"$lt_cv_path_LD" -v 2>&1 </dev/null` in
-      *GNU* | *'with BFD'*)
-	test "$with_gnu_ld" != no && break
-	;;
-      *)
-	test "$with_gnu_ld" != yes && break
-	;;
-      esac
-    fi
-  done
-  IFS="$lt_save_ifs"
-else
-  lt_cv_path_LD="$LD" # Let the user override the test with a path.
-fi
-fi
+  ;;
+esac
 
-LD="$lt_cv_path_LD"
-if test -n "$LD"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LD" >&5
-$as_echo "$LD" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-test -z "$LD" && as_fn_error $? "no acceptable ld found in \$PATH" "$LINENO" 5
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if the linker ($LD) is GNU ld" >&5
-$as_echo_n "checking if the linker ($LD) is GNU ld... " >&6; }
-if ${lt_cv_prog_gnu_ld+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  # I'd rather use --version here, but apparently some GNU lds only accept -v.
-case `$LD -v 2>&1 </dev/null` in
-*GNU* | *'with BFD'*)
-  lt_cv_prog_gnu_ld=yes
-  ;;
-*)
-  lt_cv_prog_gnu_ld=no
-  ;;
+
+case " $postdeps_CXX " in
+*" -lc "*) archive_cmds_need_lc_CXX=no ;;
 esac
+ compiler_lib_search_dirs_CXX=
+if test -n "${compiler_lib_search_path_CXX}"; then
+ compiler_lib_search_dirs_CXX=`echo " ${compiler_lib_search_path_CXX}" | ${SED} -e 's! -L! !g' -e 's!^ !!'`
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_gnu_ld" >&5
-$as_echo "$lt_cv_prog_gnu_ld" >&6; }
-with_gnu_ld=$lt_cv_prog_gnu_ld
 
 
 
@@ -13955,1972 +14164,1478 @@ with_gnu_ld=$lt_cv_prog_gnu_ld
 
 
 
-      # Check if GNU C++ uses GNU ld as the underlying linker, since the
-      # archiving commands below assume that GNU ld is being used.
-      if test "$with_gnu_ld" = yes; then
-        archive_cmds_CXX='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-        archive_expsym_cmds_CXX='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
 
-        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-        export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
 
-        # If archive_cmds runs LD, not CC, wlarc should be empty
-        # XXX I think wlarc can be eliminated in ltcf-cxx, but I need to
-        #     investigate it a little bit more. (MM)
-        wlarc='${wl}'
 
-        # ancient GNU ld didn't support --whole-archive et. al.
-        if eval "`$CC -print-prog-name=ld` --help 2>&1" |
-	  $GREP 'no-whole-archive' > /dev/null; then
-          whole_archive_flag_spec_CXX="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
-        else
-          whole_archive_flag_spec_CXX=
-        fi
-      else
-        with_gnu_ld=no
-        wlarc=
 
-        # A generic and very simple default shared library creation
-        # command for GNU C++ for the case where it uses the native
-        # linker, instead of GNU ld.  If possible, this setting should
-        # overridden to take advantage of the native linker features on
-        # the platform it is being used on.
-        archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
-      fi
 
-      # Commands to make compiler produce verbose output that lists
-      # what "hidden" libraries, object files and flags are used when
-      # linking a shared library.
-      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
 
-    else
-      GXX=no
-      with_gnu_ld=no
-      wlarc=
-    fi
 
-    # PORTME: fill in a description of your system's C++ link characteristics
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
-$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
-    ld_shlibs_CXX=yes
-    case $host_os in
-      aix3*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
-      aix[4-9]*)
-        if test "$host_cpu" = ia64; then
-          # On IA64, the linker does run time linking by default, so we don't
-          # have to do anything special.
-          aix_use_runtimelinking=no
-          exp_sym_flag='-Bexport'
-          no_entry_flag=""
-        else
-          aix_use_runtimelinking=no
 
-          # Test if we are trying to use run time linking or normal
-          # AIX style linking. If -brtl is somewhere in LDFLAGS, we
-          # need to do runtime linking.
-          case $host_os in aix4.[23]|aix4.[23].*|aix[5-9]*)
-	    for ld_flag in $LDFLAGS; do
-	      case $ld_flag in
-	      *-brtl*)
-	        aix_use_runtimelinking=yes
-	        break
-	        ;;
-	      esac
-	    done
-	    ;;
-          esac
 
-          exp_sym_flag='-bexport'
-          no_entry_flag='-bnoentry'
-        fi
 
-        # When large executables or shared objects are built, AIX ld can
-        # have problems creating the table of contents.  If linking a library
-        # or program results in "error TOC overflow" add -mminimal-toc to
-        # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
-        # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
 
-        archive_cmds_CXX=''
-        hardcode_direct_CXX=yes
-        hardcode_direct_absolute_CXX=yes
-        hardcode_libdir_separator_CXX=':'
-        link_all_deplibs_CXX=yes
-        file_list_spec_CXX='${wl}-f,'
 
-        if test "$GXX" = yes; then
-          case $host_os in aix4.[012]|aix4.[012].*)
-          # We only want to do this on AIX 4.2 and lower, the check
-          # below for broken collect2 doesn't work under 4.3+
-	  collect2name=`${CC} -print-prog-name=collect2`
-	  if test -f "$collect2name" &&
-	     strings "$collect2name" | $GREP resolve_lib_name >/dev/null
-	  then
-	    # We have reworked collect2
-	    :
-	  else
-	    # We have old collect2
-	    hardcode_direct_CXX=unsupported
-	    # It fails to find uninstalled libraries when the uninstalled
-	    # path is not listed in the libpath.  Setting hardcode_minus_L
-	    # to unsupported forces relinking
-	    hardcode_minus_L_CXX=yes
-	    hardcode_libdir_flag_spec_CXX='-L$libdir'
-	    hardcode_libdir_separator_CXX=
-	  fi
-          esac
-          shared_flag='-shared'
-	  if test "$aix_use_runtimelinking" = yes; then
-	    shared_flag="$shared_flag "'${wl}-G'
-	  fi
-        else
-          # not using gcc
-          if test "$host_cpu" = ia64; then
-	  # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
-	  # chokes on -Wl,-G. The following line is correct:
-	  shared_flag='-G'
-          else
-	    if test "$aix_use_runtimelinking" = yes; then
-	      shared_flag='${wl}-G'
-	    else
-	      shared_flag='${wl}-bM:SRE'
-	    fi
-          fi
-        fi
 
-        export_dynamic_flag_spec_CXX='${wl}-bexpall'
-        # It seems that -bexpall does not export symbols beginning with
-        # underscore (_), so it is better to generate a list of symbols to
-	# export.
-        always_export_symbols_CXX=yes
-        if test "$aix_use_runtimelinking" = yes; then
-          # Warning - without using the other runtime loading flags (-brtl),
-          # -berok will link without error, but may produce a broken library.
-          allow_undefined_flag_CXX='-berok'
-          # Determine the default libpath from the value encoded in an empty
-          # executable.
-          if test "${lt_cv_aix_libpath+set}" = set; then
-  aix_libpath=$lt_cv_aix_libpath
-else
-  if ${lt_cv_aix_libpath__CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
 
-int
-main ()
-{
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_link "$LINENO"; then :
 
-  lt_aix_libpath_sed='
-      /Import File Strings/,/^$/ {
-	  /^0/ {
-	      s/^0  *\([^ ]*\) *$/\1/
-	      p
-	  }
-      }'
-  lt_cv_aix_libpath__CXX=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-  # Check for a 64-bit object if we didn't find anything.
-  if test -z "$lt_cv_aix_libpath__CXX"; then
-    lt_cv_aix_libpath__CXX=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-  fi
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-  if test -z "$lt_cv_aix_libpath__CXX"; then
-    lt_cv_aix_libpath__CXX="/usr/lib:/lib"
-  fi
 
-fi
 
-  aix_libpath=$lt_cv_aix_libpath__CXX
-fi
 
-          hardcode_libdir_flag_spec_CXX='${wl}-blibpath:$libdir:'"$aix_libpath"
 
-          archive_expsym_cmds_CXX='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
-        else
-          if test "$host_cpu" = ia64; then
-	    hardcode_libdir_flag_spec_CXX='${wl}-R $libdir:/usr/lib:/lib'
-	    allow_undefined_flag_CXX="-z nodefs"
-	    archive_expsym_cmds_CXX="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
-          else
-	    # Determine the default libpath from the value encoded in an
-	    # empty executable.
-	    if test "${lt_cv_aix_libpath+set}" = set; then
-  aix_libpath=$lt_cv_aix_libpath
-else
-  if ${lt_cv_aix_libpath__CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
 
-int
-main ()
-{
 
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_link "$LINENO"; then :
 
-  lt_aix_libpath_sed='
-      /Import File Strings/,/^$/ {
-	  /^0/ {
-	      s/^0  *\([^ ]*\) *$/\1/
-	      p
-	  }
-      }'
-  lt_cv_aix_libpath__CXX=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-  # Check for a 64-bit object if we didn't find anything.
-  if test -z "$lt_cv_aix_libpath__CXX"; then
-    lt_cv_aix_libpath__CXX=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-  fi
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-  if test -z "$lt_cv_aix_libpath__CXX"; then
-    lt_cv_aix_libpath__CXX="/usr/lib:/lib"
-  fi
 
-fi
+    lt_prog_compiler_wl_CXX=
+lt_prog_compiler_pic_CXX=
+lt_prog_compiler_static_CXX=
 
-  aix_libpath=$lt_cv_aix_libpath__CXX
-fi
 
-	    hardcode_libdir_flag_spec_CXX='${wl}-blibpath:$libdir:'"$aix_libpath"
-	    # Warning - without using the other run time loading flags,
-	    # -berok will link without error, but may produce a broken library.
-	    no_undefined_flag_CXX=' ${wl}-bernotok'
-	    allow_undefined_flag_CXX=' ${wl}-berok'
-	    if test "$with_gnu_ld" = yes; then
-	      # We only use this code for GNU lds that support --whole-archive.
-	      whole_archive_flag_spec_CXX='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
-	    else
-	      # Exported symbols can be pulled into shared objects from archives
-	      whole_archive_flag_spec_CXX='$convenience'
-	    fi
-	    archive_cmds_need_lc_CXX=yes
-	    # This is similar to how AIX traditionally builds its shared
-	    # libraries.
-	    archive_expsym_cmds_CXX="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
-          fi
-        fi
+  # C++ specific cases for pic, static, wl, etc.
+  if test "$GXX" = yes; then
+    lt_prog_compiler_wl_CXX='-Wl,'
+    lt_prog_compiler_static_CXX='-static'
+
+    case $host_os in
+    aix*)
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	lt_prog_compiler_static_CXX='-Bstatic'
+      fi
+      ;;
+
+    amigaos*)
+      case $host_cpu in
+      powerpc)
+            # see comment about AmigaOS4 .so support
+            lt_prog_compiler_pic_CXX='-fPIC'
+        ;;
+      m68k)
+            # FIXME: we need at least 68020 code to build shared libraries, but
+            # adding the `-m68020' flag to GCC prevents building anything better,
+            # like `-m68040'.
+            lt_prog_compiler_pic_CXX='-m68020 -resident32 -malways-restore-a4'
         ;;
+      esac
+      ;;
 
-      beos*)
-	if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	  allow_undefined_flag_CXX=unsupported
-	  # Joseph Beckenbach <jrb3 at best.com> says some releases of gcc
-	  # support --undefined.  This deserves some investigation.  FIXME
-	  archive_cmds_CXX='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+    mingw* | cygwin* | os2* | pw32* | cegcc*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      # Although the cygwin gcc ignores -fPIC, still need this for old-style
+      # (--disable-auto-import) libraries
+      lt_prog_compiler_pic_CXX='-DDLL_EXPORT'
+      ;;
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      lt_prog_compiler_pic_CXX='-fno-common'
+      ;;
+    *djgpp*)
+      # DJGPP does not support shared libraries at all
+      lt_prog_compiler_pic_CXX=
+      ;;
+    haiku*)
+      # PIC is the default for Haiku.
+      # The "-static" flag exists, but is broken.
+      lt_prog_compiler_static_CXX=
+      ;;
+    interix[3-9]*)
+      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
+      # Instead, we relocate shared libraries at runtime.
+      ;;
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	lt_prog_compiler_pic_CXX=-Kconform_pic
+      fi
+      ;;
+    hpux*)
+      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
+      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
+      # sets the default TLS model and affects inlining.
+      case $host_cpu in
+      hppa*64*)
+	;;
+      *)
+	lt_prog_compiler_pic_CXX='-fPIC'
+	;;
+      esac
+      ;;
+    *qnx* | *nto*)
+      # QNX uses GNU C++, but need to define -shared option too, otherwise
+      # it will coredump.
+      lt_prog_compiler_pic_CXX='-fPIC -shared'
+      ;;
+    *)
+      lt_prog_compiler_pic_CXX='-fPIC'
+      ;;
+    esac
+  else
+    case $host_os in
+      aix[4-9]*)
+	# All AIX code is PIC.
+	if test "$host_cpu" = ia64; then
+	  # AIX 5 now supports IA64 processor
+	  lt_prog_compiler_static_CXX='-Bstatic'
 	else
-	  ld_shlibs_CXX=no
+	  lt_prog_compiler_static_CXX='-bnso -bI:/lib/syscalls.exp'
 	fi
 	;;
-
       chorus*)
-        case $cc_basename in
-          *)
-	  # FIXME: insert proper C++ library support
-	  ld_shlibs_CXX=no
-	  ;;
-        esac
-        ;;
-
-      cygwin* | mingw* | pw32* | cegcc*)
-	case $GXX,$cc_basename in
-	,cl* | no,cl*)
-	  # Native MSVC
-	  # hardcode_libdir_flag_spec is actually meaningless, as there is
-	  # no search path for DLLs.
-	  hardcode_libdir_flag_spec_CXX=' '
-	  allow_undefined_flag_CXX=unsupported
-	  always_export_symbols_CXX=yes
-	  file_list_spec_CXX='@'
-	  # Tell ltmain to make .lib files, not .a files.
-	  libext=lib
-	  # Tell ltmain to make .dll files, not .so files.
-	  shrext_cmds=".dll"
-	  # FIXME: Setting linknames here is a bad hack.
-	  archive_cmds_CXX='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
-	  archive_expsym_cmds_CXX='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	      $SED -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
-	    else
-	      $SED -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
-	    fi~
-	    $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
-	    linknames='
-	  # The linker will not automatically build a static lib if we build a DLL.
-	  # _LT_TAGVAR(old_archive_from_new_cmds, CXX)='true'
-	  enable_shared_with_static_runtimes_CXX=yes
-	  # Don't use ranlib
-	  old_postinstall_cmds_CXX='chmod 644 $oldlib'
-	  postlink_cmds_CXX='lt_outputfile="@OUTPUT@"~
-	    lt_tool_outputfile="@TOOL_OUTPUT@"~
-	    case $lt_outputfile in
-	      *.exe|*.EXE) ;;
-	      *)
-		lt_outputfile="$lt_outputfile.exe"
-		lt_tool_outputfile="$lt_tool_outputfile.exe"
-		;;
-	    esac~
-	    func_to_tool_file "$lt_outputfile"~
-	    if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
-	      $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
-	      $RM "$lt_outputfile.manifest";
-	    fi'
-	  ;;
-	*)
-	  # g++
-	  # _LT_TAGVAR(hardcode_libdir_flag_spec, CXX) is actually meaningless,
-	  # as there is no search path for DLLs.
-	  hardcode_libdir_flag_spec_CXX='-L$libdir'
-	  export_dynamic_flag_spec_CXX='${wl}--export-all-symbols'
-	  allow_undefined_flag_CXX=unsupported
-	  always_export_symbols_CXX=no
-	  enable_shared_with_static_runtimes_CXX=yes
-
-	  if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
-	    archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-	    # If the export-symbols file already is a .def file (1st line
-	    # is EXPORTS), use it as is; otherwise, prepend...
-	    archive_expsym_cmds_CXX='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	      cp $export_symbols $output_objdir/$soname.def;
-	    else
-	      echo EXPORTS > $output_objdir/$soname.def;
-	      cat $export_symbols >> $output_objdir/$soname.def;
-	    fi~
-	    $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-	  else
-	    ld_shlibs_CXX=no
-	  fi
+	case $cc_basename in
+	cxch68*)
+	  # Green Hills C++ Compiler
+	  # _LT_TAGVAR(lt_prog_compiler_static, CXX)="--no_auto_instantiation -u __main -u __premain -u _abort -r $COOL_DIR/lib/libOrb.a $MVME_DIR/lib/CC/libC.a $MVME_DIR/lib/classix/libcx.s.a"
 	  ;;
 	esac
 	;;
-      darwin* | rhapsody*)
-
-
-  archive_cmds_need_lc_CXX=no
-  hardcode_direct_CXX=no
-  hardcode_automatic_CXX=yes
-  hardcode_shlibpath_var_CXX=unsupported
-  if test "$lt_cv_ld_force_load" = "yes"; then
-    whole_archive_flag_spec_CXX='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
-
-  else
-    whole_archive_flag_spec_CXX=''
-  fi
-  link_all_deplibs_CXX=yes
-  allow_undefined_flag_CXX="$_lt_dar_allow_undefined"
-  case $cc_basename in
-     ifort*) _lt_dar_can_shared=yes ;;
-     *) _lt_dar_can_shared=$GCC ;;
-  esac
-  if test "$_lt_dar_can_shared" = "yes"; then
-    output_verbose_link_cmd=func_echo_all
-    archive_cmds_CXX="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
-    module_cmds_CXX="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
-    archive_expsym_cmds_CXX="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
-    module_expsym_cmds_CXX="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
-       if test "$lt_cv_apple_cc_single_mod" != "yes"; then
-      archive_cmds_CXX="\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dsymutil}"
-      archive_expsym_cmds_CXX="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dar_export_syms}${_lt_dsymutil}"
-    fi
-
-  else
-  ld_shlibs_CXX=no
-  fi
-
+      mingw* | cygwin* | os2* | pw32* | cegcc*)
+	# This hack is so that the source file can tell whether it is being
+	# built for inclusion in a dll (and should export symbols for example).
+	lt_prog_compiler_pic_CXX='-DDLL_EXPORT'
 	;;
-
       dgux*)
-        case $cc_basename in
-          ec++*)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	case $cc_basename in
+	  ec++*)
+	    lt_prog_compiler_pic_CXX='-KPIC'
 	    ;;
-          ghcx*)
+	  ghcx*)
 	    # Green Hills C++ Compiler
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	    lt_prog_compiler_pic_CXX='-pic'
 	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	  *)
 	    ;;
-        esac
-        ;;
-
-      freebsd2.*)
-        # C++ shared libraries reported to be fairly broken before
-	# switch to ELF
-        ld_shlibs_CXX=no
-        ;;
-
-      freebsd-elf*)
-        archive_cmds_need_lc_CXX=no
-        ;;
-
+	esac
+	;;
       freebsd* | dragonfly*)
-        # FreeBSD 3 and later use GNU C++ and GNU ld with standard ELF
-        # conventions
-        ld_shlibs_CXX=yes
-        ;;
-
-      haiku*)
-        archive_cmds_CXX='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-        link_all_deplibs_CXX=yes
-        ;;
-
-      hpux9*)
-        hardcode_libdir_flag_spec_CXX='${wl}+b ${wl}$libdir'
-        hardcode_libdir_separator_CXX=:
-        export_dynamic_flag_spec_CXX='${wl}-E'
-        hardcode_direct_CXX=yes
-        hardcode_minus_L_CXX=yes # Not in the search PATH,
-				             # but as the default
-				             # location of the library.
-
-        case $cc_basename in
-          CC*)
-            # FIXME: insert proper C++ library support
-            ld_shlibs_CXX=no
-            ;;
-          aCC*)
-            archive_cmds_CXX='$RM $output_objdir/$soname~$CC -b ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-            # Commands to make compiler produce verbose output that lists
-            # what "hidden" libraries, object files and flags are used when
-            # linking a shared library.
-            #
-            # There doesn't appear to be a way to prevent this compiler from
-            # explicitly linking system object files so we need to strip them
-            # from the output so that they don't get included in the library
-            # dependencies.
-            output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-            ;;
-          *)
-            if test "$GXX" = yes; then
-              archive_cmds_CXX='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-            else
-              # FIXME: insert proper C++ library support
-              ld_shlibs_CXX=no
-            fi
-            ;;
-        esac
-        ;;
-
-      hpux10*|hpux11*)
-        if test $with_gnu_ld = no; then
-	  hardcode_libdir_flag_spec_CXX='${wl}+b ${wl}$libdir'
-	  hardcode_libdir_separator_CXX=:
-
-          case $host_cpu in
-            hppa*64*|ia64*)
-              ;;
-            *)
-	      export_dynamic_flag_spec_CXX='${wl}-E'
-              ;;
-          esac
-        fi
-        case $host_cpu in
-          hppa*64*|ia64*)
-            hardcode_direct_CXX=no
-            hardcode_shlibpath_var_CXX=no
-            ;;
-          *)
-            hardcode_direct_CXX=yes
-            hardcode_direct_absolute_CXX=yes
-            hardcode_minus_L_CXX=yes # Not in the search PATH,
-					         # but as the default
-					         # location of the library.
-            ;;
-        esac
-
-        case $cc_basename in
-          CC*)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	# FreeBSD uses GNU C++
+	;;
+      hpux9* | hpux10* | hpux11*)
+	case $cc_basename in
+	  CC*)
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_static_CXX='${wl}-a ${wl}archive'
+	    if test "$host_cpu" != ia64; then
+	      lt_prog_compiler_pic_CXX='+Z'
+	    fi
 	    ;;
-          aCC*)
+	  aCC*)
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_static_CXX='${wl}-a ${wl}archive'
 	    case $host_cpu in
-	      hppa*64*)
-	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	      ia64*)
-	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	      *)
-	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
+	    hppa*64*|ia64*)
+	      # +Z the default
+	      ;;
+	    *)
+	      lt_prog_compiler_pic_CXX='+Z'
+	      ;;
 	    esac
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
 	    ;;
-          *)
-	    if test "$GXX" = yes; then
-	      if test $with_gnu_ld = no; then
-	        case $host_cpu in
-	          hppa*64*)
-	            archive_cmds_CXX='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	          ia64*)
-	            archive_cmds_CXX='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	          *)
-	            archive_cmds_CXX='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	        esac
-	      fi
-	    else
-	      # FIXME: insert proper C++ library support
-	      ld_shlibs_CXX=no
-	    fi
+	  *)
 	    ;;
-        esac
-        ;;
-
-      interix[3-9]*)
-	hardcode_direct_CXX=no
-	hardcode_shlibpath_var_CXX=no
-	hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	export_dynamic_flag_spec_CXX='${wl}-E'
-	# Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
-	# Instead, shared libraries are loaded at an image base (0x10000000 by
-	# default) and relocated if they conflict, which is a slow very memory
-	# consuming and fragmenting process.  To avoid this, we pick a random,
-	# 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
-	# time.  Moving up from 0x10000000 also allows more sbrk(2) space.
-	archive_cmds_CXX='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-	archive_expsym_cmds_CXX='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+	esac
 	;;
-      irix5* | irix6*)
-        case $cc_basename in
-          CC*)
-	    # SGI C++
-	    archive_cmds_CXX='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -ar", where "CC" is the IRIX C++ compiler.  This is
-	    # necessary to make sure instantiated templates are included
-	    # in the archive.
-	    old_archive_cmds_CXX='$CC -ar -WR,-u -o $oldlib $oldobjs'
+      interix*)
+	# This is c89, which is MS Visual C++ (no shared libs)
+	# Anyone wants to do a port?
+	;;
+      irix5* | irix6* | nonstopux*)
+	case $cc_basename in
+	  CC*)
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_static_CXX='-non_shared'
+	    # CC pic flag -KPIC is the default.
 	    ;;
-          *)
-	    if test "$GXX" = yes; then
-	      if test "$with_gnu_ld" = no; then
-	        archive_cmds_CXX='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-	      else
-	        archive_cmds_CXX='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` -o $lib'
-	      fi
-	    fi
-	    link_all_deplibs_CXX=yes
+	  *)
 	    ;;
-        esac
-        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-        hardcode_libdir_separator_CXX=:
-        inherit_rpath_CXX=yes
-        ;;
-
+	esac
+	;;
       linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-        case $cc_basename in
-          KCC*)
-	    # Kuck and Associates, Inc. (KAI) C++ Compiler
-
-	    # KCC will only create a shared library if the output file
-	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
-	    # to its proper name (with version) after linking.
-	    archive_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
-	    archive_expsym_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib ${wl}-retain-symbols-file,$export_symbols; mv \$templib $lib'
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -Bstatic", where "CC" is the KAI C++ compiler.
-	    old_archive_cmds_CXX='$CC -Bstatic -o $oldlib $oldobjs'
+	case $cc_basename in
+	  KCC*)
+	    # KAI C++ Compiler
+	    lt_prog_compiler_wl_CXX='--backend -Wl,'
+	    lt_prog_compiler_pic_CXX='-fPIC'
 	    ;;
-	  icpc* | ecpc* )
-	    # Intel C++
-	    with_gnu_ld=yes
-	    # version 8.0 and above of icpc choke on multiply defined symbols
-	    # if we add $predep_objects and $postdep_objects, however 7.1 and
-	    # earlier do not add the objects themselves.
-	    case `$CC -V 2>&1` in
-	      *"Version 7."*)
-	        archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		archive_expsym_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-		;;
-	      *)  # Version 8.0 or newer
-	        tmp_idyn=
-	        case $host_cpu in
-		  ia64*) tmp_idyn=' -i_dynamic';;
-		esac
-	        archive_cmds_CXX='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		archive_expsym_cmds_CXX='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-		;;
-	    esac
-	    archive_cmds_need_lc_CXX=no
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-	    whole_archive_flag_spec_CXX='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	  ecpc* )
+	    # old Intel C++ for x86_64 which still supported -KPIC.
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_pic_CXX='-KPIC'
+	    lt_prog_compiler_static_CXX='-static'
+	    ;;
+	  icpc* )
+	    # Intel C++, used to be incompatible with GCC.
+	    # ICC 10 doesn't accept -KPIC any more.
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_pic_CXX='-fPIC'
+	    lt_prog_compiler_static_CXX='-static'
+	    ;;
+	  pgCC* | pgcpp*)
+	    # Portland Group C++ compiler
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_pic_CXX='-fpic'
+	    lt_prog_compiler_static_CXX='-Bstatic'
 	    ;;
-          pgCC* | pgcpp*)
-            # Portland Group C++ compiler
-	    case `$CC -V` in
-	    *pgCC\ [1-5].* | *pgcpp\ [1-5].*)
-	      prelink_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~
-		compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"'
-	      old_archive_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~
-		$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~
-		$RANLIB $oldlib'
-	      archive_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
-	      archive_expsym_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
-	      ;;
-	    *) # Version 6 and above use weak symbols
-	      archive_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
-	      archive_expsym_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
-	      ;;
-	    esac
-
-	    hardcode_libdir_flag_spec_CXX='${wl}--rpath ${wl}$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-	    whole_archive_flag_spec_CXX='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-            ;;
 	  cxx*)
 	    # Compaq C++
-	    archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    archive_expsym_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname  -o $lib ${wl}-retain-symbols-file $wl$export_symbols'
-
-	    runpath_var=LD_RUN_PATH
-	    hardcode_libdir_flag_spec_CXX='-rpath $libdir'
-	    hardcode_libdir_separator_CXX=:
-
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed'
+	    # Make sure the PIC flag is empty.  It appears that all Alpha
+	    # Linux and Compaq Tru64 Unix objects are PIC.
+	    lt_prog_compiler_pic_CXX=
+	    lt_prog_compiler_static_CXX='-non_shared'
 	    ;;
-	  xl* | mpixl* | bgxl*)
-	    # IBM XL 8.0 on PPC, with GNU ld
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-	    archive_cmds_CXX='$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    if test "x$supports_anon_versioning" = xyes; then
-	      archive_expsym_cmds_CXX='echo "{ global:" > $output_objdir/$libname.ver~
-		cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-		echo "local: *; };" >> $output_objdir/$libname.ver~
-		$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
-	    fi
+	  xlc* | xlC* | bgxl[cC]* | mpixl[cC]*)
+	    # IBM XL 8.0, 9.0 on PPC and BlueGene
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_pic_CXX='-qpic'
+	    lt_prog_compiler_static_CXX='-qstaticlink'
 	    ;;
 	  *)
 	    case `$CC -V 2>&1 | sed 5q` in
 	    *Sun\ C*)
 	      # Sun C++ 5.9
-	      no_undefined_flag_CXX=' -zdefs'
-	      archive_cmds_CXX='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	      archive_expsym_cmds_CXX='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file ${wl}$export_symbols'
-	      hardcode_libdir_flag_spec_CXX='-R$libdir'
-	      whole_archive_flag_spec_CXX='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-	      compiler_needs_object_CXX=yes
-
-	      # Not sure whether something based on
-	      # $CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1
-	      # would be better.
-	      output_verbose_link_cmd='func_echo_all'
-
-	      # Archives containing C++ object files must be created using
-	      # "CC -xar", where "CC" is the Sun C++ compiler.  This is
-	      # necessary to make sure instantiated templates are included
-	      # in the archive.
-	      old_archive_cmds_CXX='$CC -xar -o $oldlib $oldobjs'
+	      lt_prog_compiler_pic_CXX='-KPIC'
+	      lt_prog_compiler_static_CXX='-Bstatic'
+	      lt_prog_compiler_wl_CXX='-Qoption ld '
 	      ;;
 	    esac
 	    ;;
 	esac
 	;;
-
       lynxos*)
-        # FIXME: insert proper C++ library support
-	ld_shlibs_CXX=no
 	;;
-
       m88k*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
 	;;
-
       mvs*)
-        case $cc_basename in
-          cxx*)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	case $cc_basename in
+	  cxx*)
+	    lt_prog_compiler_pic_CXX='-W c,exportall'
 	    ;;
 	  *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
 	    ;;
 	esac
 	;;
-
-      netbsd*)
-        if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-	  archive_cmds_CXX='$LD -Bshareable  -o $lib $predep_objects $libobjs $deplibs $postdep_objects $linker_flags'
-	  wlarc=
-	  hardcode_libdir_flag_spec_CXX='-R$libdir'
-	  hardcode_direct_CXX=yes
-	  hardcode_shlibpath_var_CXX=no
-	fi
-	# Workaround some broken pre-1.5 toolchains
-	output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP conftest.$objext | $SED -e "s:-lgcc -lc -lgcc::"'
-	;;
-
-      *nto* | *qnx*)
-        ld_shlibs_CXX=yes
-	;;
-
-      openbsd2*)
-        # C++ shared libraries are fairly broken
-	ld_shlibs_CXX=no
-	;;
-
-      openbsd*)
-	if test -f /usr/libexec/ld.so; then
-	  hardcode_direct_CXX=yes
-	  hardcode_shlibpath_var_CXX=no
-	  hardcode_direct_absolute_CXX=yes
-	  archive_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
-	  hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-	    archive_expsym_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file,$export_symbols -o $lib'
-	    export_dynamic_flag_spec_CXX='${wl}-E'
-	    whole_archive_flag_spec_CXX="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
-	  fi
-	  output_verbose_link_cmd=func_echo_all
-	else
-	  ld_shlibs_CXX=no
-	fi
+      netbsd* | netbsdelf*-gnu)
 	;;
-
+      *qnx* | *nto*)
+        # QNX uses GNU C++, but need to define -shared option too, otherwise
+        # it will coredump.
+        lt_prog_compiler_pic_CXX='-fPIC -shared'
+        ;;
       osf3* | osf4* | osf5*)
-        case $cc_basename in
-          KCC*)
-	    # Kuck and Associates, Inc. (KAI) C++ Compiler
-
-	    # KCC will only create a shared library if the output file
-	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
-	    # to its proper name (with version) after linking.
-	    archive_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
-
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	    hardcode_libdir_separator_CXX=:
-
-	    # Archives containing C++ object files must be created using
-	    # the KAI C++ compiler.
-	    case $host in
-	      osf3*) old_archive_cmds_CXX='$CC -Bstatic -o $oldlib $oldobjs' ;;
-	      *) old_archive_cmds_CXX='$CC -o $oldlib $oldobjs' ;;
-	    esac
+	case $cc_basename in
+	  KCC*)
+	    lt_prog_compiler_wl_CXX='--backend -Wl,'
 	    ;;
-          RCC*)
+	  RCC*)
 	    # Rational C++ 2.4.1
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	    lt_prog_compiler_pic_CXX='-pic'
 	    ;;
-          cxx*)
-	    case $host in
-	      osf3*)
-	        allow_undefined_flag_CXX=' ${wl}-expect_unresolved ${wl}\*'
-	        archive_cmds_CXX='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $soname `test -n "$verstring" && func_echo_all "${wl}-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-		;;
-	      *)
-	        allow_undefined_flag_CXX=' -expect_unresolved \*'
-	        archive_cmds_CXX='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	        archive_expsym_cmds_CXX='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done~
-	          echo "-hidden">> $lib.exp~
-	          $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname ${wl}-input ${wl}$lib.exp  `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~
-	          $RM $lib.exp'
-	        hardcode_libdir_flag_spec_CXX='-rpath $libdir'
-		;;
-	    esac
-
-	    hardcode_libdir_separator_CXX=:
-
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	  cxx*)
+	    # Digital/Compaq C++
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    # Make sure the PIC flag is empty.  It appears that all Alpha
+	    # Linux and Compaq Tru64 Unix objects are PIC.
+	    lt_prog_compiler_pic_CXX=
+	    lt_prog_compiler_static_CXX='-non_shared'
 	    ;;
 	  *)
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      allow_undefined_flag_CXX=' ${wl}-expect_unresolved ${wl}\*'
-	      case $host in
-	        osf3*)
-	          archive_cmds_CXX='$CC -shared -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-		  ;;
-	        *)
-	          archive_cmds_CXX='$CC -shared $pic_flag -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-		  ;;
-	      esac
-
-	      hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-	      hardcode_libdir_separator_CXX=:
-
-	      # Commands to make compiler produce verbose output that lists
-	      # what "hidden" libraries, object files and flags are used when
-	      # linking a shared library.
-	      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-
-	    else
-	      # FIXME: insert proper C++ library support
-	      ld_shlibs_CXX=no
-	    fi
 	    ;;
-        esac
-        ;;
-
+	esac
+	;;
       psos*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
-
-      sunos4*)
-        case $cc_basename in
-          CC*)
-	    # Sun C++ 4.x
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          lcc*)
-	    # Lucid
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-        esac
-        ;;
-
+	;;
       solaris*)
-        case $cc_basename in
-          CC* | sunCC*)
+	case $cc_basename in
+	  CC* | sunCC*)
 	    # Sun C++ 4.2, 5.x and Centerline C++
-            archive_cmds_need_lc_CXX=yes
-	    no_undefined_flag_CXX=' -zdefs'
-	    archive_cmds_CXX='$CC -G${allow_undefined_flag}  -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	    archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	      $CC -G${allow_undefined_flag} ${wl}-M ${wl}$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	    hardcode_libdir_flag_spec_CXX='-R$libdir'
-	    hardcode_shlibpath_var_CXX=no
-	    case $host_os in
-	      solaris2.[0-5] | solaris2.[0-5].*) ;;
-	      *)
-		# The compiler driver will combine and reorder linker options,
-		# but understands `-z linker_flag'.
-	        # Supported since Solaris 2.6 (maybe 2.5.1?)
-		whole_archive_flag_spec_CXX='-z allextract$convenience -z defaultextract'
-	        ;;
-	    esac
-	    link_all_deplibs_CXX=yes
-
-	    output_verbose_link_cmd='func_echo_all'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -xar", where "CC" is the Sun C++ compiler.  This is
-	    # necessary to make sure instantiated templates are included
-	    # in the archive.
-	    old_archive_cmds_CXX='$CC -xar -o $oldlib $oldobjs'
+	    lt_prog_compiler_pic_CXX='-KPIC'
+	    lt_prog_compiler_static_CXX='-Bstatic'
+	    lt_prog_compiler_wl_CXX='-Qoption ld '
 	    ;;
-          gcx*)
+	  gcx*)
 	    # Green Hills C++ Compiler
-	    archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-
-	    # The C++ compiler must be used to create the archive.
-	    old_archive_cmds_CXX='$CC $LDFLAGS -archive -o $oldlib $oldobjs'
+	    lt_prog_compiler_pic_CXX='-PIC'
 	    ;;
-          *)
-	    # GNU C++ compiler with Solaris linker
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      no_undefined_flag_CXX=' ${wl}-z ${wl}defs'
-	      if $CC --version | $GREP -v '^2\.7' > /dev/null; then
-	        archive_cmds_CXX='$CC -shared $pic_flag -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-	        archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -shared $pic_flag -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	        # Commands to make compiler produce verbose output that lists
-	        # what "hidden" libraries, object files and flags are used when
-	        # linking a shared library.
-	        output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-	      else
-	        # g++ 2.7 appears to require `-G' NOT `-shared' on this
-	        # platform.
-	        archive_cmds_CXX='$CC -G -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-	        archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -G -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	        # Commands to make compiler produce verbose output that lists
-	        # what "hidden" libraries, object files and flags are used when
-	        # linking a shared library.
-	        output_verbose_link_cmd='$CC -G $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-	      fi
-
-	      hardcode_libdir_flag_spec_CXX='${wl}-R $wl$libdir'
-	      case $host_os in
-		solaris2.[0-5] | solaris2.[0-5].*) ;;
-		*)
-		  whole_archive_flag_spec_CXX='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
-		  ;;
-	      esac
-	    fi
+	  *)
 	    ;;
-        esac
-        ;;
-
-    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[01].[10]* | unixware7* | sco3.2v5.0.[024]*)
-      no_undefined_flag_CXX='${wl}-z,text'
-      archive_cmds_need_lc_CXX=no
-      hardcode_shlibpath_var_CXX=no
-      runpath_var='LD_RUN_PATH'
-
-      case $cc_basename in
-        CC*)
-	  archive_cmds_CXX='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  archive_expsym_cmds_CXX='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	*)
-	  archive_cmds_CXX='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  archive_expsym_cmds_CXX='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-      esac
-      ;;
-
-      sysv5* | sco3.2v5* | sco5v6*)
-	# Note: We can NOT use -z defs as we might desire, because we do not
-	# link with -lc, and that would cause any symbols used from libc to
-	# always be unresolved, which means just about no library would
-	# ever link correctly.  If we're not using GNU ld we use -z text
-	# though, which does catch some bad symbols but isn't as heavy-handed
-	# as -z defs.
-	no_undefined_flag_CXX='${wl}-z,text'
-	allow_undefined_flag_CXX='${wl}-z,nodefs'
-	archive_cmds_need_lc_CXX=no
-	hardcode_shlibpath_var_CXX=no
-	hardcode_libdir_flag_spec_CXX='${wl}-R,$libdir'
-	hardcode_libdir_separator_CXX=':'
-	link_all_deplibs_CXX=yes
-	export_dynamic_flag_spec_CXX='${wl}-Bexport'
-	runpath_var='LD_RUN_PATH'
-
+	esac
+	;;
+      sunos4*)
 	case $cc_basename in
-          CC*)
-	    archive_cmds_CXX='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    archive_expsym_cmds_CXX='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    old_archive_cmds_CXX='$CC -Tprelink_objects $oldobjs~
-	      '"$old_archive_cmds_CXX"
-	    reload_cmds_CXX='$CC -Tprelink_objects $reload_objs~
-	      '"$reload_cmds_CXX"
+	  CC*)
+	    # Sun C++ 4.x
+	    lt_prog_compiler_pic_CXX='-pic'
+	    lt_prog_compiler_static_CXX='-Bstatic'
+	    ;;
+	  lcc*)
+	    # Lucid
+	    lt_prog_compiler_pic_CXX='-pic'
 	    ;;
 	  *)
-	    archive_cmds_CXX='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    archive_expsym_cmds_CXX='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
 	    ;;
 	esac
-      ;;
-
+	;;
+      sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
+	case $cc_basename in
+	  CC*)
+	    lt_prog_compiler_wl_CXX='-Wl,'
+	    lt_prog_compiler_pic_CXX='-KPIC'
+	    lt_prog_compiler_static_CXX='-Bstatic'
+	    ;;
+	esac
+	;;
       tandem*)
-        case $cc_basename in
-          NCC*)
+	case $cc_basename in
+	  NCC*)
 	    # NonStop-UX NCC 3.20
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	    lt_prog_compiler_pic_CXX='-KPIC'
 	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
+	  *)
 	    ;;
-        esac
-        ;;
-
+	esac
+	;;
       vxworks*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
-
+	;;
       *)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
+	lt_prog_compiler_can_build_shared_CXX=no
+	;;
     esac
+  fi
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs_CXX" >&5
-$as_echo "$ld_shlibs_CXX" >&6; }
-    test "$ld_shlibs_CXX" = no && can_build_shared=no
-
-    GCC_CXX="$GXX"
-    LD_CXX="$LD"
-
-    ## CAVEAT EMPTOR:
-    ## There is no encapsulation within the following macros, do not change
-    ## the running order or otherwise move them around unless you know exactly
-    ## what you are doing...
-    # Dependencies to place before and after the object being linked:
-predep_objects_CXX=
-postdep_objects_CXX=
-predeps_CXX=
-postdeps_CXX=
-compiler_lib_search_path_CXX=
-
-cat > conftest.$ac_ext <<_LT_EOF
-class Foo
-{
-public:
-  Foo (void) { a = 0; }
-private:
-  int a;
-};
-_LT_EOF
-
-
-_lt_libdeps_save_CFLAGS=$CFLAGS
-case "$CC $CFLAGS " in #(
-*\ -flto*\ *) CFLAGS="$CFLAGS -fno-lto" ;;
-*\ -fwhopr*\ *) CFLAGS="$CFLAGS -fno-whopr" ;;
-*\ -fuse-linker-plugin*\ *) CFLAGS="$CFLAGS -fno-use-linker-plugin" ;;
+case $host_os in
+  # For platforms which do not support PIC, -DPIC is meaningless:
+  *djgpp*)
+    lt_prog_compiler_pic_CXX=
+    ;;
+  *)
+    lt_prog_compiler_pic_CXX="$lt_prog_compiler_pic_CXX -DPIC"
+    ;;
 esac
 
-if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
-  (eval $ac_compile) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then
-  # Parse the compiler output and extract the necessary
-  # objects, libraries and library flags.
-
-  # Sentinel used to keep track of whether or not we are before
-  # the conftest object file.
-  pre_test_object_deps_done=no
-
-  for p in `eval "$output_verbose_link_cmd"`; do
-    case ${prev}${p} in
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $compiler option to produce PIC" >&5
+$as_echo_n "checking for $compiler option to produce PIC... " >&6; }
+if ${lt_cv_prog_compiler_pic_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_pic_CXX=$lt_prog_compiler_pic_CXX
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_CXX" >&5
+$as_echo "$lt_cv_prog_compiler_pic_CXX" >&6; }
+lt_prog_compiler_pic_CXX=$lt_cv_prog_compiler_pic_CXX
 
-    -L* | -R* | -l*)
-       # Some compilers place space between "-{L,R}" and the path.
-       # Remove the space.
-       if test $p = "-L" ||
-          test $p = "-R"; then
-	 prev=$p
-	 continue
-       fi
-
-       # Expand the sysroot to ease extracting the directories later.
-       if test -z "$prev"; then
-         case $p in
-         -L*) func_stripname_cnf '-L' '' "$p"; prev=-L; p=$func_stripname_result ;;
-         -R*) func_stripname_cnf '-R' '' "$p"; prev=-R; p=$func_stripname_result ;;
-         -l*) func_stripname_cnf '-l' '' "$p"; prev=-l; p=$func_stripname_result ;;
-         esac
-       fi
-       case $p in
-       =*) func_stripname_cnf '=' '' "$p"; p=$lt_sysroot$func_stripname_result ;;
-       esac
-       if test "$pre_test_object_deps_done" = no; then
-	 case ${prev} in
-	 -L | -R)
-	   # Internal compiler library paths should come after those
-	   # provided the user.  The postdeps already come after the
-	   # user supplied libs so there is no need to process them.
-	   if test -z "$compiler_lib_search_path_CXX"; then
-	     compiler_lib_search_path_CXX="${prev}${p}"
-	   else
-	     compiler_lib_search_path_CXX="${compiler_lib_search_path_CXX} ${prev}${p}"
-	   fi
-	   ;;
-	 # The "-l" case would never come before the object being
-	 # linked, so don't bother handling this case.
-	 esac
-       else
-	 if test -z "$postdeps_CXX"; then
-	   postdeps_CXX="${prev}${p}"
-	 else
-	   postdeps_CXX="${postdeps_CXX} ${prev}${p}"
-	 fi
-       fi
-       prev=
-       ;;
-
-    *.lto.$objext) ;; # Ignore GCC LTO objects
-    *.$objext)
-       # This assumes that the test object file only shows up
-       # once in the compiler output.
-       if test "$p" = "conftest.$objext"; then
-	 pre_test_object_deps_done=yes
-	 continue
-       fi
-
-       if test "$pre_test_object_deps_done" = no; then
-	 if test -z "$predep_objects_CXX"; then
-	   predep_objects_CXX="$p"
-	 else
-	   predep_objects_CXX="$predep_objects_CXX $p"
-	 fi
-       else
-	 if test -z "$postdep_objects_CXX"; then
-	   postdep_objects_CXX="$p"
-	 else
-	   postdep_objects_CXX="$postdep_objects_CXX $p"
-	 fi
-       fi
-       ;;
-
-    *) ;; # Ignore the rest.
+#
+# Check to make sure the PIC flag actually works.
+#
+if test -n "$lt_prog_compiler_pic_CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler PIC flag $lt_prog_compiler_pic_CXX works" >&5
+$as_echo_n "checking if $compiler PIC flag $lt_prog_compiler_pic_CXX works... " >&6; }
+if ${lt_cv_prog_compiler_pic_works_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_pic_works_CXX=no
+   ac_outfile=conftest.$ac_objext
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+   lt_compiler_flag="$lt_prog_compiler_pic_CXX -DPIC"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   # The option is referenced via a variable to avoid confusing sed.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>conftest.err)
+   ac_status=$?
+   cat conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s "$ac_outfile"; then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings other than the usual output.
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
+     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_pic_works_CXX=yes
+     fi
+   fi
+   $RM conftest*
 
-    esac
-  done
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_works_CXX" >&5
+$as_echo "$lt_cv_prog_compiler_pic_works_CXX" >&6; }
 
-  # Clean up.
-  rm -f a.out a.exe
+if test x"$lt_cv_prog_compiler_pic_works_CXX" = xyes; then
+    case $lt_prog_compiler_pic_CXX in
+     "" | " "*) ;;
+     *) lt_prog_compiler_pic_CXX=" $lt_prog_compiler_pic_CXX" ;;
+     esac
 else
-  echo "libtool.m4: error: problem compiling CXX test program"
+    lt_prog_compiler_pic_CXX=
+     lt_prog_compiler_can_build_shared_CXX=no
 fi
 
-$RM -f confest.$objext
-CFLAGS=$_lt_libdeps_save_CFLAGS
-
-# PORTME: override above test on systems where it is broken
-case $host_os in
-interix[3-9]*)
-  # Interix 3.5 installs completely hosed .la files for C++, so rather than
-  # hack all around it, let's just trust "g++" to DTRT.
-  predep_objects_CXX=
-  postdep_objects_CXX=
-  postdeps_CXX=
-  ;;
-
-linux*)
-  case `$CC -V 2>&1 | sed 5q` in
-  *Sun\ C*)
-    # Sun C++ 5.9
-
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    if test "$solaris_use_stlport4" != yes; then
-      postdeps_CXX='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
-
-solaris*)
-  case $cc_basename in
-  CC* | sunCC*)
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    # Adding this requires a known-good setup of shared libraries for
-    # Sun compiler versions before 5.6, else PIC objects from an old
-    # archive will be linked into the output, leading to subtle bugs.
-    if test "$solaris_use_stlport4" != yes; then
-      postdeps_CXX='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
-esac
-
-
-case " $postdeps_CXX " in
-*" -lc "*) archive_cmds_need_lc_CXX=no ;;
-esac
- compiler_lib_search_dirs_CXX=
-if test -n "${compiler_lib_search_path_CXX}"; then
- compiler_lib_search_dirs_CXX=`echo " ${compiler_lib_search_path_CXX}" | ${SED} -e 's! -L! !g' -e 's!^ !!'`
 fi
 
 
 
 
 
+#
+# Check to make sure the static flag actually works.
+#
+wl=$lt_prog_compiler_wl_CXX eval lt_tmp_static_flag=\"$lt_prog_compiler_static_CXX\"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler static flag $lt_tmp_static_flag works" >&5
+$as_echo_n "checking if $compiler static flag $lt_tmp_static_flag works... " >&6; }
+if ${lt_cv_prog_compiler_static_works_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_static_works_CXX=no
+   save_LDFLAGS="$LDFLAGS"
+   LDFLAGS="$LDFLAGS $lt_tmp_static_flag"
+   echo "$lt_simple_link_test_code" > conftest.$ac_ext
+   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
+     # The linker can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     if test -s conftest.err; then
+       # Append any errors to the config.log.
+       cat conftest.err 1>&5
+       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
+       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
+       if diff conftest.exp conftest.er2 >/dev/null; then
+         lt_cv_prog_compiler_static_works_CXX=yes
+       fi
+     else
+       lt_cv_prog_compiler_static_works_CXX=yes
+     fi
+   fi
+   $RM -r conftest*
+   LDFLAGS="$save_LDFLAGS"
 
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_static_works_CXX" >&5
+$as_echo "$lt_cv_prog_compiler_static_works_CXX" >&6; }
 
+if test x"$lt_cv_prog_compiler_static_works_CXX" = xyes; then
+    :
+else
+    lt_prog_compiler_static_CXX=
+fi
 
 
 
 
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
+$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
+if ${lt_cv_prog_compiler_c_o_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_c_o_CXX=no
+   $RM -r conftest 2>/dev/null
+   mkdir conftest
+   cd conftest
+   mkdir out
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
 
+   lt_compiler_flag="-o out/conftest2.$ac_objext"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>out/conftest.err)
+   ac_status=$?
+   cat out/conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s out/conftest2.$ac_objext
+   then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
+     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
+     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_c_o_CXX=yes
+     fi
+   fi
+   chmod u+w . 2>&5
+   $RM conftest*
+   # SGI C++ compiler will create directory out/ii_files/ for
+   # template instantiation
+   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
+   $RM out/* && rmdir out
+   cd ..
+   $RM -r conftest
+   $RM conftest*
 
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_CXX" >&5
+$as_echo "$lt_cv_prog_compiler_c_o_CXX" >&6; }
 
 
 
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
+$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
+if ${lt_cv_prog_compiler_c_o_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  lt_cv_prog_compiler_c_o_CXX=no
+   $RM -r conftest 2>/dev/null
+   mkdir conftest
+   cd conftest
+   mkdir out
+   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    lt_prog_compiler_wl_CXX=
-lt_prog_compiler_pic_CXX=
-lt_prog_compiler_static_CXX=
-
-
-  # C++ specific cases for pic, static, wl, etc.
-  if test "$GXX" = yes; then
-    lt_prog_compiler_wl_CXX='-Wl,'
-    lt_prog_compiler_static_CXX='-static'
-
-    case $host_os in
-    aix*)
-      # All AIX code is PIC.
-      if test "$host_cpu" = ia64; then
-	# AIX 5 now supports IA64 processor
-	lt_prog_compiler_static_CXX='-Bstatic'
-      fi
-      ;;
-
-    amigaos*)
-      case $host_cpu in
-      powerpc)
-            # see comment about AmigaOS4 .so support
-            lt_prog_compiler_pic_CXX='-fPIC'
-        ;;
-      m68k)
-            # FIXME: we need at least 68020 code to build shared libraries, but
-            # adding the `-m68020' flag to GCC prevents building anything better,
-            # like `-m68040'.
-            lt_prog_compiler_pic_CXX='-m68020 -resident32 -malways-restore-a4'
-        ;;
-      esac
-      ;;
-
-    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
-      # PIC is the default for these OSes.
-      ;;
-    mingw* | cygwin* | os2* | pw32* | cegcc*)
-      # This hack is so that the source file can tell whether it is being
-      # built for inclusion in a dll (and should export symbols for example).
-      # Although the cygwin gcc ignores -fPIC, still need this for old-style
-      # (--disable-auto-import) libraries
-      lt_prog_compiler_pic_CXX='-DDLL_EXPORT'
-      ;;
-    darwin* | rhapsody*)
-      # PIC is the default on this platform
-      # Common symbols not allowed in MH_DYLIB files
-      lt_prog_compiler_pic_CXX='-fno-common'
-      ;;
-    *djgpp*)
-      # DJGPP does not support shared libraries at all
-      lt_prog_compiler_pic_CXX=
-      ;;
-    haiku*)
-      # PIC is the default for Haiku.
-      # The "-static" flag exists, but is broken.
-      lt_prog_compiler_static_CXX=
-      ;;
-    interix[3-9]*)
-      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
-      # Instead, we relocate shared libraries at runtime.
-      ;;
-    sysv4*MP*)
-      if test -d /usr/nec; then
-	lt_prog_compiler_pic_CXX=-Kconform_pic
-      fi
-      ;;
-    hpux*)
-      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
-      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
-      # sets the default TLS model and affects inlining.
-      case $host_cpu in
-      hppa*64*)
-	;;
-      *)
-	lt_prog_compiler_pic_CXX='-fPIC'
-	;;
-      esac
-      ;;
-    *qnx* | *nto*)
-      # QNX uses GNU C++, but need to define -shared option too, otherwise
-      # it will coredump.
-      lt_prog_compiler_pic_CXX='-fPIC -shared'
+   lt_compiler_flag="-o out/conftest2.$ac_objext"
+   # Insert the option either (1) after the last *FLAGS variable, or
+   # (2) before a word containing "conftest.", or (3) at the end.
+   # Note that $ac_compile itself does not contain backslashes and begins
+   # with a dollar sign (not a hyphen), so the echo should work correctly.
+   lt_compile=`echo "$ac_compile" | $SED \
+   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
+   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
+   -e 's:$: $lt_compiler_flag:'`
+   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
+   (eval "$lt_compile" 2>out/conftest.err)
+   ac_status=$?
+   cat out/conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   if (exit $ac_status) && test -s out/conftest2.$ac_objext
+   then
+     # The compiler can only warn and ignore the option if not recognized
+     # So say no if there are warnings
+     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
+     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
+     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
+       lt_cv_prog_compiler_c_o_CXX=yes
+     fi
+   fi
+   chmod u+w . 2>&5
+   $RM conftest*
+   # SGI C++ compiler will create directory out/ii_files/ for
+   # template instantiation
+   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
+   $RM out/* && rmdir out
+   cd ..
+   $RM -r conftest
+   $RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_CXX" >&5
+$as_echo "$lt_cv_prog_compiler_c_o_CXX" >&6; }
+
+
+
+
+hard_links="nottested"
+if test "$lt_cv_prog_compiler_c_o_CXX" = no && test "$need_locks" != no; then
+  # do not overwrite the value of need_locks provided by the user
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if we can lock with hard links" >&5
+$as_echo_n "checking if we can lock with hard links... " >&6; }
+  hard_links=yes
+  $RM conftest*
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  touch conftest.a
+  ln conftest.a conftest.b 2>&5 || hard_links=no
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $hard_links" >&5
+$as_echo "$hard_links" >&6; }
+  if test "$hard_links" = no; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&5
+$as_echo "$as_me: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2;}
+    need_locks=warn
+  fi
+else
+  need_locks=no
+fi
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
+$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
+
+  export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
+  exclude_expsyms_CXX='_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*'
+  case $host_os in
+  aix[4-9]*)
+    # If we're using GNU nm, then we don't want the "-C" option.
+    # -C means demangle to AIX nm, but means don't demangle with GNU nm
+    # Also, AIX nm treats weak defined symbols like other global defined
+    # symbols, whereas GNU nm marks them as "W".
+    if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
+      export_symbols_cmds_CXX='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+    else
+      export_symbols_cmds_CXX='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+    fi
+    ;;
+  pw32*)
+    export_symbols_cmds_CXX="$ltdll_cmds"
+    ;;
+  cygwin* | mingw* | cegcc*)
+    case $cc_basename in
+    cl*)
+      exclude_expsyms_CXX='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
       ;;
     *)
-      lt_prog_compiler_pic_CXX='-fPIC'
+      export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[BCDGRS][ ]/s/.*[ ]\([^ ]*\)/\1 DATA/;s/^.*[ ]__nm__\([^ ]*\)[ ][^ ]*/\1 DATA/;/^I[ ]/d;/^[AITW][ ]/s/.* //'\'' | sort | uniq > $export_symbols'
+      exclude_expsyms_CXX='[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname'
       ;;
     esac
-  else
-    case $host_os in
-      aix[4-9]*)
-	# All AIX code is PIC.
-	if test "$host_cpu" = ia64; then
-	  # AIX 5 now supports IA64 processor
-	  lt_prog_compiler_static_CXX='-Bstatic'
-	else
-	  lt_prog_compiler_static_CXX='-bnso -bI:/lib/syscalls.exp'
-	fi
-	;;
-      chorus*)
-	case $cc_basename in
-	cxch68*)
-	  # Green Hills C++ Compiler
-	  # _LT_TAGVAR(lt_prog_compiler_static, CXX)="--no_auto_instantiation -u __main -u __premain -u _abort -r $COOL_DIR/lib/libOrb.a $MVME_DIR/lib/CC/libC.a $MVME_DIR/lib/classix/libcx.s.a"
-	  ;;
-	esac
-	;;
-      mingw* | cygwin* | os2* | pw32* | cegcc*)
-	# This hack is so that the source file can tell whether it is being
-	# built for inclusion in a dll (and should export symbols for example).
-	lt_prog_compiler_pic_CXX='-DDLL_EXPORT'
-	;;
-      dgux*)
-	case $cc_basename in
-	  ec++*)
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    ;;
-	  ghcx*)
-	    # Green Hills C++ Compiler
-	    lt_prog_compiler_pic_CXX='-pic'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      freebsd* | dragonfly*)
-	# FreeBSD uses GNU C++
-	;;
-      hpux9* | hpux10* | hpux11*)
-	case $cc_basename in
-	  CC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_static_CXX='${wl}-a ${wl}archive'
-	    if test "$host_cpu" != ia64; then
-	      lt_prog_compiler_pic_CXX='+Z'
-	    fi
-	    ;;
-	  aCC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_static_CXX='${wl}-a ${wl}archive'
-	    case $host_cpu in
-	    hppa*64*|ia64*)
-	      # +Z the default
-	      ;;
-	    *)
-	      lt_prog_compiler_pic_CXX='+Z'
-	      ;;
-	    esac
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      interix*)
-	# This is c89, which is MS Visual C++ (no shared libs)
-	# Anyone wants to do a port?
-	;;
-      irix5* | irix6* | nonstopux*)
-	case $cc_basename in
-	  CC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_static_CXX='-non_shared'
-	    # CC pic flag -KPIC is the default.
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-	case $cc_basename in
-	  KCC*)
-	    # KAI C++ Compiler
-	    lt_prog_compiler_wl_CXX='--backend -Wl,'
-	    lt_prog_compiler_pic_CXX='-fPIC'
-	    ;;
-	  ecpc* )
-	    # old Intel C++ for x86_64 which still supported -KPIC.
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    lt_prog_compiler_static_CXX='-static'
-	    ;;
-	  icpc* )
-	    # Intel C++, used to be incompatible with GCC.
-	    # ICC 10 doesn't accept -KPIC any more.
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-fPIC'
-	    lt_prog_compiler_static_CXX='-static'
-	    ;;
-	  pgCC* | pgcpp*)
-	    # Portland Group C++ compiler
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-fpic'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    ;;
-	  cxx*)
-	    # Compaq C++
-	    # Make sure the PIC flag is empty.  It appears that all Alpha
-	    # Linux and Compaq Tru64 Unix objects are PIC.
-	    lt_prog_compiler_pic_CXX=
-	    lt_prog_compiler_static_CXX='-non_shared'
-	    ;;
-	  xlc* | xlC* | bgxl[cC]* | mpixl[cC]*)
-	    # IBM XL 8.0, 9.0 on PPC and BlueGene
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-qpic'
-	    lt_prog_compiler_static_CXX='-qstaticlink'
-	    ;;
-	  *)
-	    case `$CC -V 2>&1 | sed 5q` in
-	    *Sun\ C*)
-	      # Sun C++ 5.9
-	      lt_prog_compiler_pic_CXX='-KPIC'
-	      lt_prog_compiler_static_CXX='-Bstatic'
-	      lt_prog_compiler_wl_CXX='-Qoption ld '
-	      ;;
-	    esac
-	    ;;
-	esac
-	;;
-      lynxos*)
-	;;
-      m88k*)
-	;;
-      mvs*)
-	case $cc_basename in
-	  cxx*)
-	    lt_prog_compiler_pic_CXX='-W c,exportall'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      netbsd* | netbsdelf*-gnu)
-	;;
-      *qnx* | *nto*)
-        # QNX uses GNU C++, but need to define -shared option too, otherwise
-        # it will coredump.
-        lt_prog_compiler_pic_CXX='-fPIC -shared'
-        ;;
-      osf3* | osf4* | osf5*)
-	case $cc_basename in
-	  KCC*)
-	    lt_prog_compiler_wl_CXX='--backend -Wl,'
-	    ;;
-	  RCC*)
-	    # Rational C++ 2.4.1
-	    lt_prog_compiler_pic_CXX='-pic'
-	    ;;
-	  cxx*)
-	    # Digital/Compaq C++
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    # Make sure the PIC flag is empty.  It appears that all Alpha
-	    # Linux and Compaq Tru64 Unix objects are PIC.
-	    lt_prog_compiler_pic_CXX=
-	    lt_prog_compiler_static_CXX='-non_shared'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      psos*)
-	;;
-      solaris*)
-	case $cc_basename in
-	  CC* | sunCC*)
-	    # Sun C++ 4.2, 5.x and Centerline C++
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    lt_prog_compiler_wl_CXX='-Qoption ld '
-	    ;;
-	  gcx*)
-	    # Green Hills C++ Compiler
-	    lt_prog_compiler_pic_CXX='-PIC'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      sunos4*)
-	case $cc_basename in
-	  CC*)
-	    # Sun C++ 4.x
-	    lt_prog_compiler_pic_CXX='-pic'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    ;;
-	  lcc*)
-	    # Lucid
-	    lt_prog_compiler_pic_CXX='-pic'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
-	case $cc_basename in
-	  CC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    ;;
-	esac
-	;;
-      tandem*)
-	case $cc_basename in
-	  NCC*)
-	    # NonStop-UX NCC 3.20
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      vxworks*)
-	;;
-      *)
-	lt_prog_compiler_can_build_shared_CXX=no
-	;;
+    ;;
+  linux* | k*bsd*-gnu | gnu*)
+    link_all_deplibs_CXX=no
+    ;;
+  *)
+    export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
+    ;;
+  esac
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs_CXX" >&5
+$as_echo "$ld_shlibs_CXX" >&6; }
+test "$ld_shlibs_CXX" = no && can_build_shared=no
+
+with_gnu_ld_CXX=$with_gnu_ld
+
+
+
+
+
+
+#
+# Do we need to explicitly link libc?
+#
+case "x$archive_cmds_need_lc_CXX" in
+x|xyes)
+  # Assume -lc should be added
+  archive_cmds_need_lc_CXX=yes
+
+  if test "$enable_shared" = yes && test "$GCC" = yes; then
+    case $archive_cmds_CXX in
+    *'~'*)
+      # FIXME: we may have to deal with multi-command sequences.
+      ;;
+    '$CC '*)
+      # Test whether the compiler implicitly links with -lc since on some
+      # systems, -lgcc has to come before -lc. If gcc already passes -lc
+      # to ld, don't add -lc before -lgcc.
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether -lc should be explicitly linked in" >&5
+$as_echo_n "checking whether -lc should be explicitly linked in... " >&6; }
+if ${lt_cv_archive_cmds_need_lc_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  $RM conftest*
+	echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+
+	if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } 2>conftest.err; then
+	  soname=conftest
+	  lib=conftest
+	  libobjs=conftest.$ac_objext
+	  deplibs=
+	  wl=$lt_prog_compiler_wl_CXX
+	  pic_flag=$lt_prog_compiler_pic_CXX
+	  compiler_flags=-v
+	  linker_flags=-v
+	  verstring=
+	  output_objdir=.
+	  libname=conftest
+	  lt_save_allow_undefined_flag=$allow_undefined_flag_CXX
+	  allow_undefined_flag_CXX=
+	  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$archive_cmds_CXX 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1\""; } >&5
+  (eval $archive_cmds_CXX 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+	  then
+	    lt_cv_archive_cmds_need_lc_CXX=no
+	  else
+	    lt_cv_archive_cmds_need_lc_CXX=yes
+	  fi
+	  allow_undefined_flag_CXX=$lt_save_allow_undefined_flag
+	else
+	  cat conftest.err 1>&5
+	fi
+	$RM conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_archive_cmds_need_lc_CXX" >&5
+$as_echo "$lt_cv_archive_cmds_need_lc_CXX" >&6; }
+      archive_cmds_need_lc_CXX=$lt_cv_archive_cmds_need_lc_CXX
+      ;;
+    esac
+  fi
+  ;;
+esac
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking dynamic linker characteristics" >&5
+$as_echo_n "checking dynamic linker characteristics... " >&6; }
+
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+shrext_cmds=".so"
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+need_lib_prefix=unknown
+hardcode_into_libs=no
+
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+need_version=unknown
+
+case $host_os in
+aix3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
+  shlibpath_var=LIBPATH
+
+  # AIX 3 has no versioning support, so we append a major version to the name.
+  soname_spec='${libname}${release}${shared_ext}$major'
+  ;;
+
+aix[4-9]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  hardcode_into_libs=yes
+  if test "$host_cpu" = ia64; then
+    # AIX 5 supports IA64
+    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
+    shlibpath_var=LD_LIBRARY_PATH
+  else
+    # With GCC up to 2.95.x, collect2 would create an import file
+    # for dependence libraries.  The import file would start with
+    # the line `#! .'.  This would cause the generated library to
+    # depend on `.', always an invalid library.  This was fixed in
+    # development snapshots of GCC prior to 3.0.
+    case $host_os in
+      aix4 | aix4.[01] | aix4.[01].*)
+      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+	   echo ' yes '
+	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
+	:
+      else
+	can_build_shared=no
+      fi
+      ;;
+    esac
+    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
+    # soname into executable. Probably we can add versioning support to
+    # collect2, so additional links can be useful in future.
+    if test "$aix_use_runtimelinking" = yes; then
+      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
+      # instead of lib<name>.a to let people know that these are not
+      # typical AIX shared libraries.
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    else
+      # We preserve .a as extension for shared libraries through AIX4.2
+      # and later when we are not doing run time linking.
+      library_names_spec='${libname}${release}.a $libname.a'
+      soname_spec='${libname}${release}${shared_ext}$major'
+    fi
+    shlibpath_var=LIBPATH
+  fi
+  ;;
+
+amigaos*)
+  case $host_cpu in
+  powerpc)
+    # Since July 2007 AmigaOS4 officially supports .so libraries.
+    # When compiling the executable, add -use-dynld -Lsobjs: to the compileline.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    ;;
+  m68k)
+    library_names_spec='$libname.ixlibrary $libname.a'
+    # Create ${libname}_ixlibrary.a entries in /sys/libs.
+    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
+    ;;
+  esac
+  ;;
+
+beos*)
+  library_names_spec='${libname}${shared_ext}'
+  dynamic_linker="$host_os ld.so"
+  shlibpath_var=LIBRARY_PATH
+  ;;
+
+bsdi[45]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+  # the default ld.so.conf also contains /usr/contrib/lib and
+  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+  # libtool to hard-code these into programs
+  ;;
+
+cygwin* | mingw* | pw32* | cegcc*)
+  version_type=windows
+  shrext_cmds=".dll"
+  need_version=no
+  need_lib_prefix=no
+
+  case $GCC,$cc_basename in
+  yes,*)
+    # gcc
+    library_names_spec='$libname.dll.a'
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname~
+      chmod a+x \$dldir/$dlname~
+      if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
+        eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
+      fi'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+
+    case $host_os in
+    cygwin*)
+      # Cygwin DLLs use 'cyg' prefix rather than 'lib'
+      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+
+      ;;
+    mingw* | cegcc*)
+      # MinGW DLLs use traditional 'lib' prefix
+      soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    pw32*)
+      # pw32 DLLs use 'pw' prefix rather than 'lib'
+      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+      ;;
+    esac
+    dynamic_linker='Win32 ld.exe'
+    ;;
+
+  *,cl*)
+    # Native MSVC
+    libname_spec='$name'
+    soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+    library_names_spec='${libname}.dll.lib'
+
+    case $build_os in
+    mingw*)
+      sys_lib_search_path_spec=
+      lt_save_ifs=$IFS
+      IFS=';'
+      for lt_path in $LIB
+      do
+        IFS=$lt_save_ifs
+        # Let DOS variable expansion print the short 8.3 style file name.
+        lt_path=`cd "$lt_path" 2>/dev/null && cmd //C "for %i in (".") do @echo %~si"`
+        sys_lib_search_path_spec="$sys_lib_search_path_spec $lt_path"
+      done
+      IFS=$lt_save_ifs
+      # Convert to MSYS style.
+      sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | sed -e 's|\\\\|/|g' -e 's| \\([a-zA-Z]\\):| /\\1|g' -e 's|^ ||'`
+      ;;
+    cygwin*)
+      # Convert to unix form, then to dos form, then back to unix form
+      # but this time dos style (no spaces!) so that the unix form looks
+      # like /cygdrive/c/PROGRA~1:/cygdr...
+      sys_lib_search_path_spec=`cygpath --path --unix "$LIB"`
+      sys_lib_search_path_spec=`cygpath --path --dos "$sys_lib_search_path_spec" 2>/dev/null`
+      sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      ;;
+    *)
+      sys_lib_search_path_spec="$LIB"
+      if $ECHO "$sys_lib_search_path_spec" | $GREP ';[c-zC-Z]:/' >/dev/null; then
+        # It is most probably a Windows format PATH.
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'`
+      else
+        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+      fi
+      # FIXME: find the short name or the path components, as spaces are
+      # common. (e.g. "Program Files" -> "PROGRA~1")
+      ;;
+    esac
+
+    # DLL is installed to $(libdir)/../bin by postinstall_cmds
+    postinstall_cmds='base_file=`basename \${file}`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog $dir/$dlname \$dldir/$dlname'
+    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $RM \$dlpath'
+    shlibpath_overrides_runpath=yes
+    dynamic_linker='Win32 link.exe'
+    ;;
+
+  *)
+    # Assume MSVC wrapper
+    library_names_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext} $libname.lib'
+    dynamic_linker='Win32 ld.exe'
+    ;;
+  esac
+  # FIXME: first we should search . and the directory the executable is in
+  shlibpath_var=PATH
+  ;;
+
+darwin* | rhapsody*)
+  dynamic_linker="$host_os dyld"
+  version_type=darwin
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
+  soname_spec='${libname}${release}${major}$shared_ext'
+  shlibpath_overrides_runpath=yes
+  shlibpath_var=DYLD_LIBRARY_PATH
+  shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
+
+  sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib'
+  ;;
+
+dgux*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+freebsd* | dragonfly*)
+  # DragonFly does not have aout.  When/if they implement a new
+  # versioning mechanism, adjust this.
+  if test -x /usr/bin/objformat; then
+    objformat=`/usr/bin/objformat`
+  else
+    case $host_os in
+    freebsd[23].*) objformat=aout ;;
+    *) objformat=elf ;;
     esac
   fi
+  version_type=freebsd-$objformat
+  case $version_type in
+    freebsd-elf*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+      need_version=no
+      need_lib_prefix=no
+      ;;
+    freebsd-*)
+      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
+      need_version=yes
+      ;;
+  esac
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_os in
+  freebsd2.*)
+    shlibpath_overrides_runpath=yes
+    ;;
+  freebsd3.[01]* | freebsdelf3.[01]*)
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  freebsd3.[2-9]* | freebsdelf3.[2-9]* | \
+  freebsd4.[0-5] | freebsdelf4.[0-5] | freebsd4.1.1 | freebsdelf4.1.1)
+    shlibpath_overrides_runpath=no
+    hardcode_into_libs=yes
+    ;;
+  *) # from 4.6 on, and DragonFly
+    shlibpath_overrides_runpath=yes
+    hardcode_into_libs=yes
+    ;;
+  esac
+  ;;
+
+haiku*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  dynamic_linker="$host_os runtime_loader"
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib'
+  hardcode_into_libs=yes
+  ;;
 
-case $host_os in
-  # For platforms which do not support PIC, -DPIC is meaningless:
-  *djgpp*)
-    lt_prog_compiler_pic_CXX=
+hpux9* | hpux10* | hpux11*)
+  # Give a soname corresponding to the major version so that dld.sl refuses to
+  # link against other versions.
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  case $host_cpu in
+  ia64*)
+    shrext_cmds='.so'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.so"
+    shlibpath_var=LD_LIBRARY_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    if test "X$HPUX_IA64_MODE" = X32; then
+      sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
+    else
+      sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
+    fi
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+    ;;
+  hppa*64*)
+    shrext_cmds='.sl'
+    hardcode_into_libs=yes
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
+    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
+    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
     ;;
   *)
-    lt_prog_compiler_pic_CXX="$lt_prog_compiler_pic_CXX -DPIC"
+    shrext_cmds='.sl'
+    dynamic_linker="$host_os dld.sl"
+    shlibpath_var=SHLIB_PATH
+    shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
     ;;
-esac
+  esac
+  # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
+  postinstall_cmds='chmod 555 $lib'
+  # or fails outright, so override atomically:
+  install_override_mode=555
+  ;;
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $compiler option to produce PIC" >&5
-$as_echo_n "checking for $compiler option to produce PIC... " >&6; }
-if ${lt_cv_prog_compiler_pic_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_pic_CXX=$lt_prog_compiler_pic_CXX
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_pic_CXX" >&6; }
-lt_prog_compiler_pic_CXX=$lt_cv_prog_compiler_pic_CXX
+interix[3-9]*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
 
-#
-# Check to make sure the PIC flag actually works.
-#
-if test -n "$lt_prog_compiler_pic_CXX"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler PIC flag $lt_prog_compiler_pic_CXX works" >&5
-$as_echo_n "checking if $compiler PIC flag $lt_prog_compiler_pic_CXX works... " >&6; }
-if ${lt_cv_prog_compiler_pic_works_CXX+:} false; then :
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+    nonstopux*) version_type=nonstopux ;;
+    *)
+	if test "$lt_cv_prog_gnu_ld" = yes; then
+		version_type=linux # correct to gnu/linux during the next big refactor
+	else
+		version_type=irix
+	fi ;;
+  esac
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
+  case $host_os in
+  irix5* | nonstopux*)
+    libsuff= shlibsuff=
+    ;;
+  *)
+    case $LD in # libtool.m4 will add one of these switches to LD
+    *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ")
+      libsuff= shlibsuff= libmagic=32-bit;;
+    *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ")
+      libsuff=32 shlibsuff=N32 libmagic=N32;;
+    *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ")
+      libsuff=64 shlibsuff=64 libmagic=64-bit;;
+    *) libsuff= shlibsuff= libmagic=never-match;;
+    esac
+    ;;
+  esac
+  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+  shlibpath_overrides_runpath=no
+  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  hardcode_into_libs=yes
+  ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux*oldld* | linux*aout* | linux*coff*)
+  dynamic_linker=no
+  ;;
+
+# This must be glibc/ELF.
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+
+  # Some binutils ld are patched to set DT_RUNPATH
+  if ${lt_cv_shlibpath_overrides_runpath+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  lt_cv_prog_compiler_pic_works_CXX=no
-   ac_outfile=conftest.$ac_objext
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-   lt_compiler_flag="$lt_prog_compiler_pic_CXX -DPIC"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   # The option is referenced via a variable to avoid confusing sed.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
-   (eval "$lt_compile" 2>conftest.err)
-   ac_status=$?
-   cat conftest.err >&5
-   echo "$as_me:$LINENO: \$? = $ac_status" >&5
-   if (exit $ac_status) && test -s "$ac_outfile"; then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings other than the usual output.
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
-     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
-     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
-       lt_cv_prog_compiler_pic_works_CXX=yes
-     fi
-   fi
-   $RM conftest*
+  lt_cv_shlibpath_overrides_runpath=no
+    save_LDFLAGS=$LDFLAGS
+    save_libdir=$libdir
+    eval "libdir=/foo; wl=\"$lt_prog_compiler_wl_CXX\"; \
+	 LDFLAGS=\"\$LDFLAGS $hardcode_libdir_flag_spec_CXX\""
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_works_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_pic_works_CXX" >&6; }
+int
+main ()
+{
 
-if test x"$lt_cv_prog_compiler_pic_works_CXX" = xyes; then
-    case $lt_prog_compiler_pic_CXX in
-     "" | " "*) ;;
-     *) lt_prog_compiler_pic_CXX=" $lt_prog_compiler_pic_CXX" ;;
-     esac
-else
-    lt_prog_compiler_pic_CXX=
-     lt_prog_compiler_can_build_shared_CXX=no
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  if  ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null; then :
+  lt_cv_shlibpath_overrides_runpath=yes
 fi
-
 fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+    LDFLAGS=$save_LDFLAGS
+    libdir=$save_libdir
 
+fi
 
+  shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath
 
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
 
+  # Append ld.so.conf contents to the search path
+  if test -f /etc/ld.so.conf; then
+    lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \$2)); skip = 1; } { if (!skip) print \$0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
+    sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra"
+  fi
 
-#
-# Check to make sure the static flag actually works.
-#
-wl=$lt_prog_compiler_wl_CXX eval lt_tmp_static_flag=\"$lt_prog_compiler_static_CXX\"
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler static flag $lt_tmp_static_flag works" >&5
-$as_echo_n "checking if $compiler static flag $lt_tmp_static_flag works... " >&6; }
-if ${lt_cv_prog_compiler_static_works_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_static_works_CXX=no
-   save_LDFLAGS="$LDFLAGS"
-   LDFLAGS="$LDFLAGS $lt_tmp_static_flag"
-   echo "$lt_simple_link_test_code" > conftest.$ac_ext
-   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
-     # The linker can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     if test -s conftest.err; then
-       # Append any errors to the config.log.
-       cat conftest.err 1>&5
-       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
-       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
-       if diff conftest.exp conftest.er2 >/dev/null; then
-         lt_cv_prog_compiler_static_works_CXX=yes
-       fi
-     else
-       lt_cv_prog_compiler_static_works_CXX=yes
-     fi
-   fi
-   $RM -r conftest*
-   LDFLAGS="$save_LDFLAGS"
+  # We used to test for /lib/ld.so.1 and disable shared libraries on
+  # powerpc, because MkLinux only supported shared libraries with the
+  # GNU dynamic linker.  Since this was broken with cross compilers,
+  # most powerpc-linux boxes support dynamic linking these days and
+  # people can always --disable-shared, the test was removed, and we
+  # assume the GNU/Linux dynamic linker is in use.
+  dynamic_linker='GNU/Linux ld.so'
+  ;;
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_static_works_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_static_works_CXX" >&6; }
+netbsdelf*-gnu)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='NetBSD ld.elf_so'
+  ;;
 
-if test x"$lt_cv_prog_compiler_static_works_CXX" = xyes; then
-    :
-else
-    lt_prog_compiler_static_CXX=
-fi
+netbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+    dynamic_linker='NetBSD (a.out) ld.so'
+  else
+    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+    soname_spec='${libname}${release}${shared_ext}$major'
+    dynamic_linker='NetBSD ld.elf_so'
+  fi
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  ;;
 
+newsos6)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  ;;
 
+*nto* | *qnx*)
+  version_type=qnx
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  dynamic_linker='ldqnx.so'
+  ;;
 
+openbsd*)
+  version_type=sunos
+  sys_lib_dlsearch_path_spec="/usr/lib"
+  need_lib_prefix=no
+  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
+  case $host_os in
+    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
+    *)				need_version=no  ;;
+  esac
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    case $host_os in
+      openbsd2.[89] | openbsd2.[89].*)
+	shlibpath_overrides_runpath=no
+	;;
+      *)
+	shlibpath_overrides_runpath=yes
+	;;
+      esac
+  else
+    shlibpath_overrides_runpath=yes
+  fi
+  ;;
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
-$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
-if ${lt_cv_prog_compiler_c_o_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_c_o_CXX=no
-   $RM -r conftest 2>/dev/null
-   mkdir conftest
-   cd conftest
-   mkdir out
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+os2*)
+  libname_spec='$name'
+  shrext_cmds=".dll"
+  need_lib_prefix=no
+  library_names_spec='$libname${shared_ext} $libname.a'
+  dynamic_linker='OS/2 ld.exe'
+  shlibpath_var=LIBPATH
+  ;;
 
-   lt_compiler_flag="-o out/conftest2.$ac_objext"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
-   (eval "$lt_compile" 2>out/conftest.err)
-   ac_status=$?
-   cat out/conftest.err >&5
-   echo "$as_me:$LINENO: \$? = $ac_status" >&5
-   if (exit $ac_status) && test -s out/conftest2.$ac_objext
-   then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
-     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
-     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
-       lt_cv_prog_compiler_c_o_CXX=yes
-     fi
-   fi
-   chmod u+w . 2>&5
-   $RM conftest*
-   # SGI C++ compiler will create directory out/ii_files/ for
-   # template instantiation
-   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
-   $RM out/* && rmdir out
-   cd ..
-   $RM -r conftest
-   $RM conftest*
+osf3* | osf4* | osf5*)
+  version_type=osf
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  ;;
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_c_o_CXX" >&6; }
+rdos*)
+  dynamic_linker=no
+  ;;
 
+solaris*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  # ldd complains unless libraries are executable
+  postinstall_cmds='chmod +x $lib'
+  ;;
 
+sunos4*)
+  version_type=sunos
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+  fi
+  need_version=yes
+  ;;
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
-$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
-if ${lt_cv_prog_compiler_c_o_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_c_o_CXX=no
-   $RM -r conftest 2>/dev/null
-   mkdir conftest
-   cd conftest
-   mkdir out
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
+sysv4 | sysv4.3*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_vendor in
+    sni)
+      shlibpath_overrides_runpath=no
+      need_lib_prefix=no
+      runpath_var=LD_RUN_PATH
+      ;;
+    siemens)
+      need_lib_prefix=no
+      ;;
+    motorola)
+      need_lib_prefix=no
+      need_version=no
+      shlibpath_overrides_runpath=no
+      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+      ;;
+  esac
+  ;;
 
-   lt_compiler_flag="-o out/conftest2.$ac_objext"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
-   (eval "$lt_compile" 2>out/conftest.err)
-   ac_status=$?
-   cat out/conftest.err >&5
-   echo "$as_me:$LINENO: \$? = $ac_status" >&5
-   if (exit $ac_status) && test -s out/conftest2.$ac_objext
-   then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
-     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
-     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
-       lt_cv_prog_compiler_c_o_CXX=yes
-     fi
-   fi
-   chmod u+w . 2>&5
-   $RM conftest*
-   # SGI C++ compiler will create directory out/ii_files/ for
-   # template instantiation
-   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
-   $RM out/* && rmdir out
-   cd ..
-   $RM -r conftest
-   $RM conftest*
+sysv4*MP*)
+  if test -d /usr/nec ;then
+    version_type=linux # correct to gnu/linux during the next big refactor
+    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
+    soname_spec='$libname${shared_ext}.$major'
+    shlibpath_var=LD_LIBRARY_PATH
+  fi
+  ;;
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_c_o_CXX" >&6; }
+sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
+  version_type=freebsd-elf
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  if test "$with_gnu_ld" = yes; then
+    sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
+  else
+    sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
+    case $host_os in
+      sco3.2v5*)
+        sys_lib_search_path_spec="$sys_lib_search_path_spec /lib"
+	;;
+    esac
+  fi
+  sys_lib_dlsearch_path_spec='/usr/lib'
+  ;;
 
+tpf*)
+  # TPF is a cross-target only.  Preferred cross-host = GNU/Linux.
+  version_type=linux # correct to gnu/linux during the next big refactor
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  hardcode_into_libs=yes
+  ;;
 
+uts4*)
+  version_type=linux # correct to gnu/linux during the next big refactor
+  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='${libname}${release}${shared_ext}$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
 
+*)
+  dynamic_linker=no
+  ;;
+esac
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $dynamic_linker" >&5
+$as_echo "$dynamic_linker" >&6; }
+test "$dynamic_linker" = no && can_build_shared=no
 
-hard_links="nottested"
-if test "$lt_cv_prog_compiler_c_o_CXX" = no && test "$need_locks" != no; then
-  # do not overwrite the value of need_locks provided by the user
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if we can lock with hard links" >&5
-$as_echo_n "checking if we can lock with hard links... " >&6; }
-  hard_links=yes
-  $RM conftest*
-  ln conftest.a conftest.b 2>/dev/null && hard_links=no
-  touch conftest.a
-  ln conftest.a conftest.b 2>&5 || hard_links=no
-  ln conftest.a conftest.b 2>/dev/null && hard_links=no
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $hard_links" >&5
-$as_echo "$hard_links" >&6; }
-  if test "$hard_links" = no; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&5
-$as_echo "$as_me: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2;}
-    need_locks=warn
-  fi
-else
-  need_locks=no
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+if test "$GCC" = yes; then
+  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
 fi
 
+if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
+  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
+fi
+if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
+  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
+fi
 
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
-$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
-
-  export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
-  exclude_expsyms_CXX='_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*'
-  case $host_os in
-  aix[4-9]*)
-    # If we're using GNU nm, then we don't want the "-C" option.
-    # -C means demangle to AIX nm, but means don't demangle with GNU nm
-    # Also, AIX nm treats weak defined symbols like other global defined
-    # symbols, whereas GNU nm marks them as "W".
-    if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
-      export_symbols_cmds_CXX='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-    else
-      export_symbols_cmds_CXX='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-    fi
-    ;;
-  pw32*)
-    export_symbols_cmds_CXX="$ltdll_cmds"
-    ;;
-  cygwin* | mingw* | cegcc*)
-    case $cc_basename in
-    cl*)
-      exclude_expsyms_CXX='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
-      ;;
-    *)
-      export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[BCDGRS][ ]/s/.*[ ]\([^ ]*\)/\1 DATA/;s/^.*[ ]__nm__\([^ ]*\)[ ][^ ]*/\1 DATA/;/^I[ ]/d;/^[AITW][ ]/s/.* //'\'' | sort | uniq > $export_symbols'
-      exclude_expsyms_CXX='[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname'
-      ;;
-    esac
-    ;;
-  linux* | k*bsd*-gnu | gnu*)
-    link_all_deplibs_CXX=no
-    ;;
-  *)
-    export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
-    ;;
-  esac
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs_CXX" >&5
-$as_echo "$ld_shlibs_CXX" >&6; }
-test "$ld_shlibs_CXX" = no && can_build_shared=no
 
-with_gnu_ld_CXX=$with_gnu_ld
 
 
 
 
 
 
-#
-# Do we need to explicitly link libc?
-#
-case "x$archive_cmds_need_lc_CXX" in
-x|xyes)
-  # Assume -lc should be added
-  archive_cmds_need_lc_CXX=yes
 
-  if test "$enable_shared" = yes && test "$GCC" = yes; then
-    case $archive_cmds_CXX in
-    *'~'*)
-      # FIXME: we may have to deal with multi-command sequences.
-      ;;
-    '$CC '*)
-      # Test whether the compiler implicitly links with -lc since on some
-      # systems, -lgcc has to come before -lc. If gcc already passes -lc
-      # to ld, don't add -lc before -lgcc.
-      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether -lc should be explicitly linked in" >&5
-$as_echo_n "checking whether -lc should be explicitly linked in... " >&6; }
-if ${lt_cv_archive_cmds_need_lc_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  $RM conftest*
-	echo "$lt_simple_compile_test_code" > conftest.$ac_ext
 
-	if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
-  (eval $ac_compile) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } 2>conftest.err; then
-	  soname=conftest
-	  lib=conftest
-	  libobjs=conftest.$ac_objext
-	  deplibs=
-	  wl=$lt_prog_compiler_wl_CXX
-	  pic_flag=$lt_prog_compiler_pic_CXX
-	  compiler_flags=-v
-	  linker_flags=-v
-	  verstring=
-	  output_objdir=.
-	  libname=conftest
-	  lt_save_allow_undefined_flag=$allow_undefined_flag_CXX
-	  allow_undefined_flag_CXX=
-	  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$archive_cmds_CXX 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1\""; } >&5
-  (eval $archive_cmds_CXX 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-	  then
-	    lt_cv_archive_cmds_need_lc_CXX=no
-	  else
-	    lt_cv_archive_cmds_need_lc_CXX=yes
-	  fi
-	  allow_undefined_flag_CXX=$lt_save_allow_undefined_flag
-	else
-	  cat conftest.err 1>&5
-	fi
-	$RM conftest*
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_archive_cmds_need_lc_CXX" >&5
-$as_echo "$lt_cv_archive_cmds_need_lc_CXX" >&6; }
-      archive_cmds_need_lc_CXX=$lt_cv_archive_cmds_need_lc_CXX
-      ;;
-    esac
-  fi
-  ;;
-esac
 
 
 
@@ -15946,8 +15661,43 @@ esac
 
 
 
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to hardcode library paths into programs" >&5
+$as_echo_n "checking how to hardcode library paths into programs... " >&6; }
+hardcode_action_CXX=
+if test -n "$hardcode_libdir_flag_spec_CXX" ||
+   test -n "$runpath_var_CXX" ||
+   test "X$hardcode_automatic_CXX" = "Xyes" ; then
 
+  # We can hardcode non-existent directories.
+  if test "$hardcode_direct_CXX" != no &&
+     # If the only mechanism to avoid hardcoding is shlibpath_var, we
+     # have to relink, otherwise we might link with an installed library
+     # when we should be linking with a yet-to-be-installed one
+     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, CXX)" != no &&
+     test "$hardcode_minus_L_CXX" != no; then
+    # Linking always hardcodes the temporary library directory.
+    hardcode_action_CXX=relink
+  else
+    # We can link without hardcoding, and we can hardcode nonexisting dirs.
+    hardcode_action_CXX=immediate
+  fi
+else
+  # We cannot hardcode anything, or else we can only hardcode existing
+  # directories.
+  hardcode_action_CXX=unsupported
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hardcode_action_CXX" >&5
+$as_echo "$hardcode_action_CXX" >&6; }
 
+if test "$hardcode_action_CXX" = relink ||
+   test "$inherit_rpath_CXX" = yes; then
+  # Fast installation is not supported
+  enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+     test "$enable_shared" = no; then
+  # Fast installation is not necessary
+  enable_fast_install=needless
+fi
 
 
 
@@ -15955,8 +15705,25 @@ esac
 
 
 
+  fi # test -n "$compiler"
 
+  CC=$lt_save_CC
+  CFLAGS=$lt_save_CFLAGS
+  LDCXX=$LD
+  LD=$lt_save_LD
+  GCC=$lt_save_GCC
+  with_gnu_ld=$lt_save_with_gnu_ld
+  lt_cv_path_LDCXX=$lt_cv_path_LD
+  lt_cv_path_LD=$lt_save_path_LD
+  lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld
+  lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld
+fi # test "$_lt_caught_CXX_error" != yes
 
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
 
@@ -15972,930 +15739,1455 @@ esac
 
 
 
+        ac_config_commands="$ac_config_commands libtool"
 
 
 
 
+# Only expand once:
 
 
+rm -rf .tst 2>/dev/null
+mkdir .tst 2>/dev/null
+if test -d .tst; then
+  am__leading_dot=.
+else
+  am__leading_dot=_
+fi
+rmdir .tst 2>/dev/null
 
+DEPDIR="${am__leading_dot}deps"
 
+ac_config_commands="$ac_config_commands depfiles"
 
 
+am_make=${MAKE-make}
+cat > confinc << 'END'
+am__doit:
+	@echo this is the am__doit target
+.PHONY: am__doit
+END
+# If we don't find an include directive, just comment out the code.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5
+$as_echo_n "checking for style of include used by $am_make... " >&6; }
+am__include="#"
+am__quote=
+_am_result=none
+# First try GNU make style include.
+echo "include confinc" > confmf
+# Ignore all kinds of additional output from 'make'.
+case `$am_make -s -f confmf 2> /dev/null` in #(
+*the\ am__doit\ target*)
+  am__include=include
+  am__quote=
+  _am_result=GNU
+  ;;
+esac
+# Now try BSD make style include.
+if test "$am__include" = "#"; then
+   echo '.include "confinc"' > confmf
+   case `$am_make -s -f confmf 2> /dev/null` in #(
+   *the\ am__doit\ target*)
+     am__include=.include
+     am__quote="\""
+     _am_result=BSD
+     ;;
+   esac
+fi
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking dynamic linker characteristics" >&5
-$as_echo_n "checking dynamic linker characteristics... " >&6; }
 
-library_names_spec=
-libname_spec='lib$name'
-soname_spec=
-shrext_cmds=".so"
-postinstall_cmds=
-postuninstall_cmds=
-finish_cmds=
-finish_eval=
-shlibpath_var=
-shlibpath_overrides_runpath=unknown
-version_type=none
-dynamic_linker="$host_os ld.so"
-sys_lib_dlsearch_path_spec="/lib /usr/lib"
-need_lib_prefix=unknown
-hardcode_into_libs=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5
+$as_echo "$_am_result" >&6; }
+rm -f confinc confmf
 
-# when you set need_version to no, make sure it does not cause -set_version
-# flags to be left without arguments
-need_version=unknown
+# Check whether --enable-dependency-tracking was given.
+if test "${enable_dependency_tracking+set}" = set; then :
+  enableval=$enable_dependency_tracking;
+fi
 
-case $host_os in
-aix3*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
-  shlibpath_var=LIBPATH
+if test "x$enable_dependency_tracking" != xno; then
+  am_depcomp="$ac_aux_dir/depcomp"
+  AMDEPBACKSLASH='\'
+  am__nodep='_no'
+fi
+ if test "x$enable_dependency_tracking" != xno; then
+  AMDEP_TRUE=
+  AMDEP_FALSE='#'
+else
+  AMDEP_TRUE='#'
+  AMDEP_FALSE=
+fi
 
-  # AIX 3 has no versioning support, so we append a major version to the name.
-  soname_spec='${libname}${release}${shared_ext}$major'
-  ;;
 
-aix[4-9]*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  hardcode_into_libs=yes
-  if test "$host_cpu" = ia64; then
-    # AIX 5 supports IA64
-    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
-    shlibpath_var=LD_LIBRARY_PATH
-  else
-    # With GCC up to 2.95.x, collect2 would create an import file
-    # for dependence libraries.  The import file would start with
-    # the line `#! .'.  This would cause the generated library to
-    # depend on `.', always an invalid library.  This was fixed in
-    # development snapshots of GCC prior to 3.0.
-    case $host_os in
-      aix4 | aix4.[01] | aix4.[01].*)
-      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
-	   echo ' yes '
-	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
-	:
-      else
-	can_build_shared=no
-      fi
-      ;;
-    esac
-    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
-    # soname into executable. Probably we can add versioning support to
-    # collect2, so additional links can be useful in future.
-    if test "$aix_use_runtimelinking" = yes; then
-      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
-      # instead of lib<name>.a to let people know that these are not
-      # typical AIX shared libraries.
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    else
-      # We preserve .a as extension for shared libraries through AIX4.2
-      # and later when we are not doing run time linking.
-      library_names_spec='${libname}${release}.a $libname.a'
-      soname_spec='${libname}${release}${shared_ext}$major'
-    fi
-    shlibpath_var=LIBPATH
-  fi
-  ;;
+# By default we simply use the C compiler to build assembly code.
 
-amigaos*)
-  case $host_cpu in
-  powerpc)
-    # Since July 2007 AmigaOS4 officially supports .so libraries.
-    # When compiling the executable, add -use-dynld -Lsobjs: to the compileline.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    ;;
-  m68k)
-    library_names_spec='$libname.ixlibrary $libname.a'
-    # Create ${libname}_ixlibrary.a entries in /sys/libs.
-    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
-    ;;
-  esac
-  ;;
+test "${CCAS+set}" = set || CCAS=$CC
+test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
 
-beos*)
-  library_names_spec='${libname}${shared_ext}'
-  dynamic_linker="$host_os ld.so"
-  shlibpath_var=LIBRARY_PATH
-  ;;
 
-bsdi[45]*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
-  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
-  # the default ld.so.conf also contains /usr/contrib/lib and
-  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
-  # libtool to hard-code these into programs
-  ;;
 
-cygwin* | mingw* | pw32* | cegcc*)
-  version_type=windows
-  shrext_cmds=".dll"
-  need_version=no
-  need_lib_prefix=no
+depcc="$CCAS"   am_compiler_list=
 
-  case $GCC,$cc_basename in
-  yes,*)
-    # gcc
-    library_names_spec='$libname.dll.a'
-    # DLL is installed to $(libdir)/../bin by postinstall_cmds
-    postinstall_cmds='base_file=`basename \${file}`~
-      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
-      dldir=$destdir/`dirname \$dlpath`~
-      test -d \$dldir || mkdir -p \$dldir~
-      $install_prog $dir/$dlname \$dldir/$dlname~
-      chmod a+x \$dldir/$dlname~
-      if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
-        eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
-      fi'
-    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
-      dlpath=$dir/\$dldll~
-       $RM \$dlpath'
-    shlibpath_overrides_runpath=yes
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
+$as_echo_n "checking dependency style of $depcc... " >&6; }
+if ${am_cv_CCAS_dependencies_compiler_type+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
 
-    case $host_os in
-    cygwin*)
-      # Cygwin DLLs use 'cyg' prefix rather than 'lib'
-      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
+  am_cv_CCAS_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  am__universal=false
 
-      ;;
-    mingw* | cegcc*)
-      # MinGW DLLs use traditional 'lib' prefix
-      soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
-      ;;
-    pw32*)
-      # pw32 DLLs use 'pw' prefix rather than 'lib'
-      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
-      ;;
-    esac
-    dynamic_linker='Win32 ld.exe'
-    ;;
 
-  *,cl*)
-    # Native MSVC
-    libname_spec='$name'
-    soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
-    library_names_spec='${libname}.dll.lib'
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
 
-    case $build_os in
-    mingw*)
-      sys_lib_search_path_spec=
-      lt_save_ifs=$IFS
-      IFS=';'
-      for lt_path in $LIB
-      do
-        IFS=$lt_save_ifs
-        # Let DOS variable expansion print the short 8.3 style file name.
-        lt_path=`cd "$lt_path" 2>/dev/null && cmd //C "for %i in (".") do @echo %~si"`
-        sys_lib_search_path_spec="$sys_lib_search_path_spec $lt_path"
-      done
-      IFS=$lt_save_ifs
-      # Convert to MSYS style.
-      sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | sed -e 's|\\\\|/|g' -e 's| \\([a-zA-Z]\\):| /\\1|g' -e 's|^ ||'`
-      ;;
-    cygwin*)
-      # Convert to unix form, then to dos form, then back to unix form
-      # but this time dos style (no spaces!) so that the unix form looks
-      # like /cygdrive/c/PROGRA~1:/cygdr...
-      sys_lib_search_path_spec=`cygpath --path --unix "$LIB"`
-      sys_lib_search_path_spec=`cygpath --path --dos "$sys_lib_search_path_spec" 2>/dev/null`
-      sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
+    am__obj=sub/conftest.${OBJEXT-o}
+    am__minus_obj="-o $am__obj"
+    case $depmode in
+    gcc)
+      # This depmode causes a compiler race in universal mode.
+      test "$am__universal" = false || continue
       ;;
-    *)
-      sys_lib_search_path_spec="$LIB"
-      if $ECHO "$sys_lib_search_path_spec" | $GREP ';[c-zC-Z]:/' >/dev/null; then
-        # It is most probably a Windows format PATH.
-        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'`
+    nosideeffect)
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
       else
-        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
+	break
       fi
-      # FIXME: find the short name or the path components, as spaces are
-      # common. (e.g. "Program Files" -> "PROGRA~1")
       ;;
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
+      # This compiler won't grok '-c -o', but also, the minuso test has
+      # not run yet.  These depmodes are late enough in the game, and
+      # so weak that their functioning should not be impacted.
+      am__obj=conftest.${OBJEXT-o}
+      am__minus_obj=
+      ;;
+    none) break ;;
     esac
+    if depmode=$depmode \
+       source=sub/conftest.c object=$am__obj \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CCAS_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CCAS_dependencies_compiler_type=none
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5
+$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; }
+CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type
+
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then
+  am__fastdepCCAS_TRUE=
+  am__fastdepCCAS_FALSE='#'
+else
+  am__fastdepCCAS_TRUE='#'
+  am__fastdepCCAS_FALSE=
+fi
+
+
 
-    # DLL is installed to $(libdir)/../bin by postinstall_cmds
-    postinstall_cmds='base_file=`basename \${file}`~
-      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
-      dldir=$destdir/`dirname \$dlpath`~
-      test -d \$dldir || mkdir -p \$dldir~
-      $install_prog $dir/$dlname \$dldir/$dlname'
-    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
-      dlpath=$dir/\$dldll~
-       $RM \$dlpath'
-    shlibpath_overrides_runpath=yes
-    dynamic_linker='Win32 link.exe'
-    ;;
 
+am__api_version='1.14'
+
+# Find a good install program.  We prefer a C program (faster),
+# so one script is as good as another.  But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AmigaOS /C/install, which installs bootblocks on floppy discs
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# OS/2's system install, which has a completely different semantic
+# ./install, which can be erroneously created by make from ./install.sh.
+# Reject install programs that cannot install multiple files.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5
+$as_echo_n "checking for a BSD-compatible install... " >&6; }
+if test -z "$INSTALL"; then
+if ${ac_cv_path_install+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    # Account for people who put trailing slashes in PATH elements.
+case $as_dir/ in #((
+  ./ | .// | /[cC]/* | \
+  /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \
+  ?:[\\/]os2[\\/]install[\\/]* | ?:[\\/]OS2[\\/]INSTALL[\\/]* | \
+  /usr/ucb/* ) ;;
   *)
-    # Assume MSVC wrapper
-    library_names_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext} $libname.lib'
-    dynamic_linker='Win32 ld.exe'
+    # OSF1 and SCO ODT 3.0 have their own names for install.
+    # Don't use installbsd from OSF since it installs stuff as root
+    # by default.
+    for ac_prog in ginstall scoinst install; do
+      for ac_exec_ext in '' $ac_executable_extensions; do
+	if as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then
+	  if test $ac_prog = install &&
+	    grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+	    # AIX install.  It has an incompatible calling convention.
+	    :
+	  elif test $ac_prog = install &&
+	    grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+	    # program-specific install script used by HP pwplus--don't use.
+	    :
+	  else
+	    rm -rf conftest.one conftest.two conftest.dir
+	    echo one > conftest.one
+	    echo two > conftest.two
+	    mkdir conftest.dir
+	    if "$as_dir/$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir" &&
+	      test -s conftest.one && test -s conftest.two &&
+	      test -s conftest.dir/conftest.one &&
+	      test -s conftest.dir/conftest.two
+	    then
+	      ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c"
+	      break 3
+	    fi
+	  fi
+	fi
+      done
+    done
     ;;
-  esac
-  # FIXME: first we should search . and the directory the executable is in
-  shlibpath_var=PATH
-  ;;
-
-darwin* | rhapsody*)
-  dynamic_linker="$host_os dyld"
-  version_type=darwin
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
-  soname_spec='${libname}${release}${major}$shared_ext'
-  shlibpath_overrides_runpath=yes
-  shlibpath_var=DYLD_LIBRARY_PATH
-  shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
+esac
 
-  sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib'
-  ;;
+  done
+IFS=$as_save_IFS
 
-dgux*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
+rm -rf conftest.one conftest.two conftest.dir
 
-freebsd* | dragonfly*)
-  # DragonFly does not have aout.  When/if they implement a new
-  # versioning mechanism, adjust this.
-  if test -x /usr/bin/objformat; then
-    objformat=`/usr/bin/objformat`
+fi
+  if test "${ac_cv_path_install+set}" = set; then
+    INSTALL=$ac_cv_path_install
   else
-    case $host_os in
-    freebsd[23].*) objformat=aout ;;
-    *) objformat=elf ;;
-    esac
+    # As a last resort, use the slow shell script.  Don't cache a
+    # value for INSTALL within a source directory, because that will
+    # break other packages using the cache if that directory is
+    # removed, or if the value is a relative name.
+    INSTALL=$ac_install_sh
   fi
-  version_type=freebsd-$objformat
-  case $version_type in
-    freebsd-elf*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
-      need_version=no
-      need_lib_prefix=no
-      ;;
-    freebsd-*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
-      need_version=yes
-      ;;
-  esac
-  shlibpath_var=LD_LIBRARY_PATH
-  case $host_os in
-  freebsd2.*)
-    shlibpath_overrides_runpath=yes
-    ;;
-  freebsd3.[01]* | freebsdelf3.[01]*)
-    shlibpath_overrides_runpath=yes
-    hardcode_into_libs=yes
-    ;;
-  freebsd3.[2-9]* | freebsdelf3.[2-9]* | \
-  freebsd4.[0-5] | freebsdelf4.[0-5] | freebsd4.1.1 | freebsdelf4.1.1)
-    shlibpath_overrides_runpath=no
-    hardcode_into_libs=yes
-    ;;
-  *) # from 4.6 on, and DragonFly
-    shlibpath_overrides_runpath=yes
-    hardcode_into_libs=yes
-    ;;
-  esac
-  ;;
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5
+$as_echo "$INSTALL" >&6; }
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5
+$as_echo_n "checking whether build environment is sane... " >&6; }
+# Reject unsafe characters in $srcdir or the absolute working directory
+# name.  Accept space and tab only in the latter.
+am_lf='
+'
+case `pwd` in
+  *[\\\"\#\$\&\'\`$am_lf]*)
+    as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
+esac
+case $srcdir in
+  *[\\\"\#\$\&\'\`$am_lf\ \	]*)
+    as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
+esac
+
+# Do 'set' in a subshell so we don't clobber the current shell's
+# arguments.  Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+   am_has_slept=no
+   for am_try in 1 2; do
+     echo "timestamp, slept: $am_has_slept" > conftest.file
+     set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null`
+     if test "$*" = "X"; then
+	# -L didn't work.
+	set X `ls -t "$srcdir/configure" conftest.file`
+     fi
+     if test "$*" != "X $srcdir/configure conftest.file" \
+	&& test "$*" != "X conftest.file $srcdir/configure"; then
 
-haiku*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  dynamic_linker="$host_os runtime_loader"
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib'
-  hardcode_into_libs=yes
-  ;;
+	# If neither matched, then we have a broken ls.  This can happen
+	# if, for instance, CONFIG_SHELL is bash and it inherits a
+	# broken ls alias from the environment.  This has actually
+	# happened.  Such a system could not be considered "sane".
+	as_fn_error $? "ls -t appears to fail.  Make sure there is not a broken
+  alias in your environment" "$LINENO" 5
+     fi
+     if test "$2" = conftest.file || test $am_try -eq 2; then
+       break
+     fi
+     # Just in case.
+     sleep 1
+     am_has_slept=yes
+   done
+   test "$2" = conftest.file
+   )
+then
+   # Ok.
+   :
+else
+   as_fn_error $? "newly created file is older than distributed files!
+Check your system clock" "$LINENO" 5
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+# If we didn't sleep, we still need to ensure time stamps of config.status and
+# generated files are strictly newer.
+am_sleep_pid=
+if grep 'slept: no' conftest.file >/dev/null 2>&1; then
+  ( sleep 1 ) &
+  am_sleep_pid=$!
+fi
 
-hpux9* | hpux10* | hpux11*)
-  # Give a soname corresponding to the major version so that dld.sl refuses to
-  # link against other versions.
-  version_type=sunos
-  need_lib_prefix=no
-  need_version=no
-  case $host_cpu in
-  ia64*)
-    shrext_cmds='.so'
-    hardcode_into_libs=yes
-    dynamic_linker="$host_os dld.so"
-    shlibpath_var=LD_LIBRARY_PATH
-    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    if test "X$HPUX_IA64_MODE" = X32; then
-      sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
-    else
-      sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
-    fi
-    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
-    ;;
-  hppa*64*)
-    shrext_cmds='.sl'
-    hardcode_into_libs=yes
-    dynamic_linker="$host_os dld.sl"
-    shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
-    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
-    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
-    ;;
-  *)
-    shrext_cmds='.sl'
-    dynamic_linker="$host_os dld.sl"
-    shlibpath_var=SHLIB_PATH
-    shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    ;;
-  esac
-  # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
-  postinstall_cmds='chmod 555 $lib'
-  # or fails outright, so override atomically:
-  install_override_mode=555
-  ;;
+rm -f conftest.file
 
-interix[3-9]*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  ;;
+test "$program_prefix" != NONE &&
+  program_transform_name="s&^&$program_prefix&;$program_transform_name"
+# Use a double $ so make ignores it.
+test "$program_suffix" != NONE &&
+  program_transform_name="s&\$&$program_suffix&;$program_transform_name"
+# Double any \ or $.
+# By default was `s,x,x', remove it if useless.
+ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
+program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
 
-irix5* | irix6* | nonstopux*)
-  case $host_os in
-    nonstopux*) version_type=nonstopux ;;
-    *)
-	if test "$lt_cv_prog_gnu_ld" = yes; then
-		version_type=linux # correct to gnu/linux during the next big refactor
-	else
-		version_type=irix
-	fi ;;
+if test x"${MISSING+set}" != xset; then
+  case $am_aux_dir in
+  *\ * | *\	*)
+    MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
+  *)
+    MISSING="\${SHELL} $am_aux_dir/missing" ;;
   esac
-  need_lib_prefix=no
-  need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
-  case $host_os in
-  irix5* | nonstopux*)
-    libsuff= shlibsuff=
-    ;;
+fi
+# Use eval to expand $SHELL
+if eval "$MISSING --is-lightweight"; then
+  am_missing_run="$MISSING "
+else
+  am_missing_run=
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: 'missing' script is too old or missing" >&5
+$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
+fi
+
+if test x"${install_sh}" != xset; then
+  case $am_aux_dir in
+  *\ * | *\	*)
+    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
   *)
-    case $LD in # libtool.m4 will add one of these switches to LD
-    *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ")
-      libsuff= shlibsuff= libmagic=32-bit;;
-    *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ")
-      libsuff=32 shlibsuff=N32 libmagic=N32;;
-    *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ")
-      libsuff=64 shlibsuff=64 libmagic=64-bit;;
-    *) libsuff= shlibsuff= libmagic=never-match;;
-    esac
-    ;;
+    install_sh="\${SHELL} $am_aux_dir/install-sh"
   esac
-  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
-  shlibpath_overrides_runpath=no
-  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
-  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
-  hardcode_into_libs=yes
-  ;;
+fi
 
-# No shared lib support for Linux oldld, aout, or coff.
-linux*oldld* | linux*aout* | linux*coff*)
-  dynamic_linker=no
-  ;;
+# Installed binaries are usually stripped using 'strip' when the user
+# run "make install-strip".  However 'strip' might not be the right
+# tool to use in cross-compilation environments, therefore Automake
+# will honor the 'STRIP' environment variable to overrule this program.
+if test "$cross_compiling" != no; then
+  if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_STRIP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+STRIP=$ac_cv_prog_STRIP
+if test -n "$STRIP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5
+$as_echo "$STRIP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_STRIP"; then
+  ac_ct_STRIP=$STRIP
+  # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_STRIP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_STRIP"; then
+  ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_STRIP="strip"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP
+if test -n "$ac_ct_STRIP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5
+$as_echo "$ac_ct_STRIP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
 
-# This must be glibc/ELF.
-linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
+  if test "x$ac_ct_STRIP" = x; then
+    STRIP=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    STRIP=$ac_ct_STRIP
+  fi
+else
+  STRIP="$ac_cv_prog_STRIP"
+fi
 
-  # Some binutils ld are patched to set DT_RUNPATH
-  if ${lt_cv_shlibpath_overrides_runpath+:} false; then :
+fi
+INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a thread-safe mkdir -p" >&5
+$as_echo_n "checking for a thread-safe mkdir -p... " >&6; }
+if test -z "$MKDIR_P"; then
+  if ${ac_cv_path_mkdir+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  lt_cv_shlibpath_overrides_runpath=no
-    save_LDFLAGS=$LDFLAGS
-    save_libdir=$libdir
-    eval "libdir=/foo; wl=\"$lt_prog_compiler_wl_CXX\"; \
-	 LDFLAGS=\"\$LDFLAGS $hardcode_libdir_flag_spec_CXX\""
-    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in mkdir gmkdir; do
+	 for ac_exec_ext in '' $ac_executable_extensions; do
+	   as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext" || continue
+	   case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #(
+	     'mkdir (GNU coreutils) '* | \
+	     'mkdir (coreutils) '* | \
+	     'mkdir (fileutils) '4.1*)
+	       ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext
+	       break 3;;
+	   esac
+	 done
+       done
+  done
+IFS=$as_save_IFS
 
-int
-main ()
-{
+fi
 
-  ;
-  return 0;
-}
+  test -d ./--version && rmdir ./--version
+  if test "${ac_cv_path_mkdir+set}" = set; then
+    MKDIR_P="$ac_cv_path_mkdir -p"
+  else
+    # As a last resort, use the slow shell script.  Don't cache a
+    # value for MKDIR_P within a source directory, because that will
+    # break other packages using the cache if that directory is
+    # removed, or if the value is a relative name.
+    MKDIR_P="$ac_install_sh -d"
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5
+$as_echo "$MKDIR_P" >&6; }
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5
+$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; }
+set x ${MAKE-make}
+ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'`
+if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat >conftest.make <<\_ACEOF
+SHELL = /bin/sh
+all:
+	@echo '@@@%%%=$(MAKE)=@@@%%%'
 _ACEOF
-if ac_fn_cxx_try_link "$LINENO"; then :
-  if  ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null; then :
-  lt_cv_shlibpath_overrides_runpath=yes
+# GNU make sometimes prints "make[1]: Entering ...", which would confuse us.
+case `${MAKE-make} -f conftest.make 2>/dev/null` in
+  *@@@%%%=?*=@@@%%%*)
+    eval ac_cv_prog_make_${ac_make}_set=yes;;
+  *)
+    eval ac_cv_prog_make_${ac_make}_set=no;;
+esac
+rm -f conftest.make
 fi
+if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+  SET_MAKE=
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+  SET_MAKE="MAKE=${MAKE-make}"
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-    LDFLAGS=$save_LDFLAGS
-    libdir=$save_libdir
 
+# Check whether --enable-silent-rules was given.
+if test "${enable_silent_rules+set}" = set; then :
+  enableval=$enable_silent_rules;
 fi
 
-  shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath
-
-  # This implies no fast_install, which is unacceptable.
-  # Some rework will be needed to allow for fast_install
-  # before this can be enabled.
-  hardcode_into_libs=yes
+case $enable_silent_rules in # (((
+  yes) AM_DEFAULT_VERBOSITY=0;;
+   no) AM_DEFAULT_VERBOSITY=1;;
+    *) AM_DEFAULT_VERBOSITY=1;;
+esac
+am_make=${MAKE-make}
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
+$as_echo_n "checking whether $am_make supports nested variables... " >&6; }
+if ${am_cv_make_support_nested_variables+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if $as_echo 'TRUE=$(BAR$(V))
+BAR0=false
+BAR1=true
+V=1
+am__doit:
+	@$(TRUE)
+.PHONY: am__doit' | $am_make -f - >/dev/null 2>&1; then
+  am_cv_make_support_nested_variables=yes
+else
+  am_cv_make_support_nested_variables=no
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5
+$as_echo "$am_cv_make_support_nested_variables" >&6; }
+if test $am_cv_make_support_nested_variables = yes; then
+    AM_V='$(V)'
+  AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)'
+else
+  AM_V=$AM_DEFAULT_VERBOSITY
+  AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY
+fi
+AM_BACKSLASH='\'
 
-  # Append ld.so.conf contents to the search path
-  if test -f /etc/ld.so.conf; then
-    lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \$2)); skip = 1; } { if (!skip) print \$0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
-    sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra"
+if test "`cd $srcdir && pwd`" != "`pwd`"; then
+  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
+  # is not polluted with repeated "-I."
+  am__isrc=' -I$(srcdir)'
+  # test to see if srcdir already configured
+  if test -f $srcdir/config.status; then
+    as_fn_error $? "source directory already configured; run \"make distclean\" there first" "$LINENO" 5
   fi
+fi
 
-  # We used to test for /lib/ld.so.1 and disable shared libraries on
-  # powerpc, because MkLinux only supported shared libraries with the
-  # GNU dynamic linker.  Since this was broken with cross compilers,
-  # most powerpc-linux boxes support dynamic linking these days and
-  # people can always --disable-shared, the test was removed, and we
-  # assume the GNU/Linux dynamic linker is in use.
-  dynamic_linker='GNU/Linux ld.so'
-  ;;
-
-netbsdelf*-gnu)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  dynamic_linker='NetBSD ld.elf_so'
-  ;;
-
-netbsd*)
-  version_type=sunos
-  need_lib_prefix=no
-  need_version=no
-  if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
-    dynamic_linker='NetBSD (a.out) ld.so'
+# test whether we have cygpath
+if test -z "$CYGPATH_W"; then
+  if (cygpath --version) >/dev/null 2>/dev/null; then
+    CYGPATH_W='cygpath -w'
   else
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    dynamic_linker='NetBSD ld.elf_so'
+    CYGPATH_W=echo
   fi
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  ;;
+fi
 
-newsos6)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  ;;
 
-*nto* | *qnx*)
-  version_type=qnx
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  dynamic_linker='ldqnx.so'
-  ;;
+# Define the identity of the package.
+ PACKAGE='ceph'
+ VERSION='9.2.0'
 
-openbsd*)
-  version_type=sunos
-  sys_lib_dlsearch_path_spec="/usr/lib"
-  need_lib_prefix=no
-  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
-  case $host_os in
-    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
-    *)				need_version=no  ;;
-  esac
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-    case $host_os in
-      openbsd2.[89] | openbsd2.[89].*)
-	shlibpath_overrides_runpath=no
-	;;
-      *)
-	shlibpath_overrides_runpath=yes
-	;;
-      esac
-  else
-    shlibpath_overrides_runpath=yes
-  fi
-  ;;
 
-os2*)
-  libname_spec='$name'
-  shrext_cmds=".dll"
-  need_lib_prefix=no
-  library_names_spec='$libname${shared_ext} $libname.a'
-  dynamic_linker='OS/2 ld.exe'
-  shlibpath_var=LIBPATH
-  ;;
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE "$PACKAGE"
+_ACEOF
+
 
-osf3* | osf4* | osf5*)
-  version_type=osf
-  need_lib_prefix=no
-  need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
-  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
-  ;;
+cat >>confdefs.h <<_ACEOF
+#define VERSION "$VERSION"
+_ACEOF
 
-rdos*)
-  dynamic_linker=no
-  ;;
+# Some tools Automake needs.
 
-solaris*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  # ldd complains unless libraries are executable
-  postinstall_cmds='chmod +x $lib'
-  ;;
+ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"}
 
-sunos4*)
-  version_type=sunos
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  if test "$with_gnu_ld" = yes; then
-    need_lib_prefix=no
-  fi
-  need_version=yes
-  ;;
 
-sysv4 | sysv4.3*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  case $host_vendor in
-    sni)
-      shlibpath_overrides_runpath=no
-      need_lib_prefix=no
-      runpath_var=LD_RUN_PATH
+AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
+
+
+AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"}
+
+
+AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
+
+
+MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
+
+# For better backward compatibility.  To be removed once Automake 1.9.x
+# dies out for good.  For more background, see:
+# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
+# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
+mkdir_p='$(MKDIR_P)'
+
+# We need awk for the "check" target.  The system "awk" is bad on
+# some platforms.
+# Always define AMTAR for backward compatibility.  Yes, it's still used
+# in the wild :-(  We should find a proper way to deprecate it ...
+AMTAR='$${TAR-tar}'
+
+
+# We'll loop over all known methods to create a tar archive until one works.
+_am_tools='gnutar plaintar pax cpio none'
+
+# The POSIX 1988 'ustar' format is defined with fixed-size fields.
+      # There is notably a 21 bits limit for the UID and the GID.  In fact,
+      # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
+      # and bug#13588).
+      am_max_uid=2097151 # 2^21 - 1
+      am_max_gid=$am_max_uid
+      # The $UID and $GID variables are not portable, so we need to resort
+      # to the POSIX-mandated id(1) utility.  Errors in the 'id' calls
+      # below are definitely unexpected, so allow the users to see them
+      # (that is, avoid stderr redirection).
+      am_uid=`id -u || echo unknown`
+      am_gid=`id -g || echo unknown`
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether UID '$am_uid' is supported by ustar format" >&5
+$as_echo_n "checking whether UID '$am_uid' is supported by ustar format... " >&6; }
+      if test $am_uid -le $am_max_uid; then
+         { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+      else
+         { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+         _am_tools=none
+      fi
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether GID '$am_gid' is supported by ustar format" >&5
+$as_echo_n "checking whether GID '$am_gid' is supported by ustar format... " >&6; }
+      if test $am_gid -le $am_max_gid; then
+         { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+      else
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        _am_tools=none
+      fi
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to create a ustar tar archive" >&5
+$as_echo_n "checking how to create a ustar tar archive... " >&6; }
+
+  # Go ahead even if we have the value already cached.  We do so because we
+  # need to set the values for the 'am__tar' and 'am__untar' variables.
+  _am_tools=${am_cv_prog_tar_ustar-$_am_tools}
+
+  for _am_tool in $_am_tools; do
+    case $_am_tool in
+    gnutar)
+      for _am_tar in tar gnutar gtar; do
+        { echo "$as_me:$LINENO: $_am_tar --version" >&5
+   ($_am_tar --version) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } && break
+      done
+      am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"'
+      am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"'
+      am__untar="$_am_tar -xf -"
       ;;
-    siemens)
-      need_lib_prefix=no
+    plaintar)
+      # Must skip GNU tar: if it does not support --format= it doesn't create
+      # ustar tarball either.
+      (tar --version) >/dev/null 2>&1 && continue
+      am__tar='tar chf - "$$tardir"'
+      am__tar_='tar chf - "$tardir"'
+      am__untar='tar xf -'
       ;;
-    motorola)
-      need_lib_prefix=no
-      need_version=no
-      shlibpath_overrides_runpath=no
-      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+    pax)
+      am__tar='pax -L -x ustar -w "$$tardir"'
+      am__tar_='pax -L -x ustar -w "$tardir"'
+      am__untar='pax -r'
       ;;
-  esac
-  ;;
+    cpio)
+      am__tar='find "$$tardir" -print | cpio -o -H ustar -L'
+      am__tar_='find "$tardir" -print | cpio -o -H ustar -L'
+      am__untar='cpio -i -H ustar -d'
+      ;;
+    none)
+      am__tar=false
+      am__tar_=false
+      am__untar=false
+      ;;
+    esac
 
-sysv4*MP*)
-  if test -d /usr/nec ;then
-    version_type=linux # correct to gnu/linux during the next big refactor
-    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
-    soname_spec='$libname${shared_ext}.$major'
-    shlibpath_var=LD_LIBRARY_PATH
-  fi
-  ;;
+    # If the value was cached, stop now.  We just wanted to have am__tar
+    # and am__untar set.
+    test -n "${am_cv_prog_tar_ustar}" && break
 
-sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
-  version_type=freebsd-elf
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  if test "$with_gnu_ld" = yes; then
-    sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
-  else
-    sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
-    case $host_os in
-      sco3.2v5*)
-        sys_lib_search_path_spec="$sys_lib_search_path_spec /lib"
-	;;
-    esac
-  fi
-  sys_lib_dlsearch_path_spec='/usr/lib'
-  ;;
+    # tar/untar a dummy directory, and stop if the command works.
+    rm -rf conftest.dir
+    mkdir conftest.dir
+    echo GrepMe > conftest.dir/file
+    { echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5
+   (tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+    rm -rf conftest.dir
+    if test -s conftest.tar; then
+      { echo "$as_me:$LINENO: $am__untar <conftest.tar" >&5
+   ($am__untar <conftest.tar) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+      { echo "$as_me:$LINENO: cat conftest.dir/file" >&5
+   (cat conftest.dir/file) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+      grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+    fi
+  done
+  rm -rf conftest.dir
 
-tpf*)
-  # TPF is a cross-target only.  Preferred cross-host = GNU/Linux.
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  ;;
+  if ${am_cv_prog_tar_ustar+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  am_cv_prog_tar_ustar=$_am_tool
+fi
 
-uts4*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_tar_ustar" >&5
+$as_echo "$am_cv_prog_tar_ustar" >&6; }
 
-*)
-  dynamic_linker=no
-  ;;
-esac
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $dynamic_linker" >&5
-$as_echo "$dynamic_linker" >&6; }
-test "$dynamic_linker" = no && can_build_shared=no
 
-variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
-if test "$GCC" = yes; then
-  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+
+
+
+depcc="$CC"   am_compiler_list=
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
+$as_echo_n "checking dependency style of $depcc... " >&6; }
+if ${am_cv_CC_dependencies_compiler_type+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_CC_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  am__universal=false
+  case " $depcc " in #(
+     *\ -arch\ *\ -arch\ *) am__universal=true ;;
+     esac
+
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
+    am__obj=sub/conftest.${OBJEXT-o}
+    am__minus_obj="-o $am__obj"
+    case $depmode in
+    gcc)
+      # This depmode causes a compiler race in universal mode.
+      test "$am__universal" = false || continue
+      ;;
+    nosideeffect)
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
+      # This compiler won't grok '-c -o', but also, the minuso test has
+      # not run yet.  These depmodes are late enough in the game, and
+      # so weak that their functioning should not be impacted.
+      am__obj=conftest.${OBJEXT-o}
+      am__minus_obj=
+      ;;
+    none) break ;;
+    esac
+    if depmode=$depmode \
+       source=sub/conftest.c object=$am__obj \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CC_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CC_dependencies_compiler_type=none
 fi
 
-if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
-  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
-fi
-if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
-  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5
+$as_echo "$am_cv_CC_dependencies_compiler_type" >&6; }
+CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type
 
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then
+  am__fastdepCC_TRUE=
+  am__fastdepCC_FALSE='#'
+else
+  am__fastdepCC_TRUE='#'
+  am__fastdepCC_FALSE=
+fi
 
 
+depcc="$CXX"  am_compiler_list=
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
+$as_echo_n "checking dependency style of $depcc... " >&6; }
+if ${am_cv_CXX_dependencies_compiler_type+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
 
+  am_cv_CXX_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  am__universal=false
+  case " $depcc " in #(
+     *\ -arch\ *\ -arch\ *) am__universal=true ;;
+     esac
 
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
 
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
+    am__obj=sub/conftest.${OBJEXT-o}
+    am__minus_obj="-o $am__obj"
+    case $depmode in
+    gcc)
+      # This depmode causes a compiler race in universal mode.
+      test "$am__universal" = false || continue
+      ;;
+    nosideeffect)
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
+      # This compiler won't grok '-c -o', but also, the minuso test has
+      # not run yet.  These depmodes are late enough in the game, and
+      # so weak that their functioning should not be impacted.
+      am__obj=conftest.${OBJEXT-o}
+      am__minus_obj=
+      ;;
+    none) break ;;
+    esac
+    if depmode=$depmode \
+       source=sub/conftest.c object=$am__obj \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CXX_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
 
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CXX_dependencies_compiler_type=none
+fi
 
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5
+$as_echo "$am_cv_CXX_dependencies_compiler_type" >&6; }
+CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type
 
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then
+  am__fastdepCXX_TRUE=
+  am__fastdepCXX_FALSE='#'
+else
+  am__fastdepCXX_TRUE='#'
+  am__fastdepCXX_FALSE=
+fi
 
 
 
+# POSIX will say in a future version that running "rm -f" with no argument
+# is OK; and we want to be able to make that assumption in our Makefile
+# recipes.  So use an aggressive probe to check that the usage we want is
+# actually supported "in the wild" to an acceptable degree.
+# See automake bug#10828.
+# To make any issue more visible, cause the running configure to be aborted
+# by default if the 'rm' program in use doesn't match our expectations; the
+# user can still override this though.
+if rm -f && rm -fr && rm -rf; then : OK; else
+  cat >&2 <<'END'
+Oops!
 
+Your 'rm' program seems unable to run without file operands specified
+on the command line, even when the '-f' option is present.  This is contrary
+to the behaviour of most rm programs out there, and not conforming with
+the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
 
+Please tell bug-automake at gnu.org about your system, including the value
+of your $PATH and any error possibly output before this message.  This
+can help us improve future automake versions.
 
+END
+  if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
+    echo 'Configuration will proceed anyway, since you have set the' >&2
+    echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
+    echo >&2
+  else
+    cat >&2 <<'END'
+Aborting the configuration process, to ensure you take notice of the issue.
 
+You can download and install GNU coreutils to get an 'rm' implementation
+that behaves properly: <http://www.gnu.org/software/coreutils/>.
 
+If you want to complete the configuration process using your problematic
+'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
+to "yes", and re-run configure.
 
+END
+    as_fn_error $? "Your 'rm' program is bad, sorry." "$LINENO" 5
+  fi
+fi
+# enable make V=0 (if automake >1.11)
+# Check whether --enable-silent-rules was given.
+if test "${enable_silent_rules+set}" = set; then :
+  enableval=$enable_silent_rules;
+fi
 
+case $enable_silent_rules in # (((
+  yes) AM_DEFAULT_VERBOSITY=0;;
+   no) AM_DEFAULT_VERBOSITY=1;;
+    *) AM_DEFAULT_VERBOSITY=0;;
+esac
+am_make=${MAKE-make}
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
+$as_echo_n "checking whether $am_make supports nested variables... " >&6; }
+if ${am_cv_make_support_nested_variables+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if $as_echo 'TRUE=$(BAR$(V))
+BAR0=false
+BAR1=true
+V=1
+am__doit:
+	@$(TRUE)
+.PHONY: am__doit' | $am_make -f - >/dev/null 2>&1; then
+  am_cv_make_support_nested_variables=yes
+else
+  am_cv_make_support_nested_variables=no
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5
+$as_echo "$am_cv_make_support_nested_variables" >&6; }
+if test $am_cv_make_support_nested_variables = yes; then
+    AM_V='$(V)'
+  AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)'
+else
+  AM_V=$AM_DEFAULT_VERBOSITY
+  AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY
+fi
+AM_BACKSLASH='\'
 
 
+# Platform
+case "${target_os}" in
+darwin*)
 
+$as_echo "#define DARWIN 1" >>confdefs.h
 
+    darwin="yes"
+	;;
+linux*)
+	linux="yes"
+	;;
+freebsd*)
+	freebsd="yes"
+	;;
+esac
+ if test x"$linux" = x"yes"; then
+  LINUX_TRUE=
+  LINUX_FALSE='#'
+else
+  LINUX_TRUE='#'
+  LINUX_FALSE=
+fi
 
+ if test x"$freebsd" = x"yes"; then
+  FREEBSD_TRUE=
+  FREEBSD_FALSE='#'
+else
+  FREEBSD_TRUE='#'
+  FREEBSD_FALSE=
+fi
 
+ if test x"$darwin" = x"yes"; then
+  DARWIN_TRUE=
+  DARWIN_FALSE='#'
+else
+  DARWIN_TRUE='#'
+  DARWIN_FALSE=
+fi
 
 
-
-
-
-
-
-
-
-
-
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to hardcode library paths into programs" >&5
-$as_echo_n "checking how to hardcode library paths into programs... " >&6; }
-hardcode_action_CXX=
-if test -n "$hardcode_libdir_flag_spec_CXX" ||
-   test -n "$runpath_var_CXX" ||
-   test "X$hardcode_automatic_CXX" = "Xyes" ; then
-
-  # We can hardcode non-existent directories.
-  if test "$hardcode_direct_CXX" != no &&
-     # If the only mechanism to avoid hardcoding is shlibpath_var, we
-     # have to relink, otherwise we might link with an installed library
-     # when we should be linking with a yet-to-be-installed one
-     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, CXX)" != no &&
-     test "$hardcode_minus_L_CXX" != no; then
-    # Linking always hardcodes the temporary library directory.
-    hardcode_action_CXX=relink
+# Checks for programs.
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+if test -z "$CXX"; then
+  if test -n "$CCC"; then
+    CXX=$CCC
   else
-    # We can link without hardcoding, and we can hardcode nonexisting dirs.
-    hardcode_action_CXX=immediate
+    if test -n "$ac_tool_prefix"; then
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
   fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
+$as_echo "$CXX" >&6; }
 else
-  # We cannot hardcode anything, or else we can only hardcode existing
-  # directories.
-  hardcode_action_CXX=unsupported
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hardcode_action_CXX" >&5
-$as_echo "$hardcode_action_CXX" >&6; }
 
-if test "$hardcode_action_CXX" = relink ||
-   test "$inherit_rpath_CXX" = yes; then
-  # Fast installation is not supported
-  enable_fast_install=no
-elif test "$shlibpath_overrides_runpath" = yes ||
-     test "$enable_shared" = no; then
-  # Fast installation is not necessary
-  enable_fast_install=needless
-fi
 
+    test -n "$CXX" && break
+  done
+fi
+if test -z "$CXX"; then
+  ac_ct_CXX=$CXX
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$ac_ct_CXX"; then
+  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
 
+fi
+fi
+ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
+if test -n "$ac_ct_CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5
+$as_echo "$ac_ct_CXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
 
 
+  test -n "$ac_ct_CXX" && break
+done
 
+  if test "x$ac_ct_CXX" = x; then
+    CXX="g++"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CXX=$ac_ct_CXX
+  fi
+fi
 
+  fi
+fi
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
 
-  fi # test -n "$compiler"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5
+$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; }
+if ${ac_cv_cxx_compiler_gnu+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-  CC=$lt_save_CC
-  CFLAGS=$lt_save_CFLAGS
-  LDCXX=$LD
-  LD=$lt_save_LD
-  GCC=$lt_save_GCC
-  with_gnu_ld=$lt_save_with_gnu_ld
-  lt_cv_path_LDCXX=$lt_cv_path_LD
-  lt_cv_path_LD=$lt_save_path_LD
-  lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld
-  lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld
-fi # test "$_lt_caught_CXX_error" != yes
+int
+main ()
+{
+#ifndef __GNUC__
+       choke me
+#endif
 
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_compiler_gnu=yes
+else
+  ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
 
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5
+$as_echo "$ac_cv_cxx_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+  GXX=yes
+else
+  GXX=
+fi
+ac_test_CXXFLAGS=${CXXFLAGS+set}
+ac_save_CXXFLAGS=$CXXFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5
+$as_echo_n "checking whether $CXX accepts -g... " >&6; }
+if ${ac_cv_prog_cxx_g+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_save_cxx_werror_flag=$ac_cxx_werror_flag
+   ac_cxx_werror_flag=yes
+   ac_cv_prog_cxx_g=no
+   CXXFLAGS="-g"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-depcc="$CXX"  am_compiler_list=
+int
+main ()
+{
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
-$as_echo_n "checking dependency style of $depcc... " >&6; }
-if ${am_cv_CXX_dependencies_compiler_type+:} false; then :
-  $as_echo_n "(cached) " >&6
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_prog_cxx_g=yes
 else
-  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named 'D' -- because '-MD' means "put the output
-  # in D".
-  rm -rf conftest.dir
-  mkdir conftest.dir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" conftest.dir
-  cd conftest.dir
-  # We will build objects and dependencies in a subdirectory because
-  # it helps to detect inapplicable dependency modes.  For instance
-  # both Tru64's cc and ICC support -MD to output dependencies as a
-  # side effect of compilation, but ICC will put the dependencies in
-  # the current directory while Tru64 will put them in the object
-  # directory.
-  mkdir sub
-
-  am_cv_CXX_dependencies_compiler_type=none
-  if test "$am_compiler_list" = ""; then
-     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
-  fi
-  am__universal=false
-  case " $depcc " in #(
-     *\ -arch\ *\ -arch\ *) am__universal=true ;;
-     esac
+  CXXFLAGS=""
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-  for depmode in $am_compiler_list; do
-    # Setup a source with many dependencies, because some compilers
-    # like to wrap large dependency lists on column 80 (with \), and
-    # we should not choose a depcomp mode which is confused by this.
-    #
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    : > sub/conftest.c
-    for i in 1 2 3 4 5 6; do
-      echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
-      # Solaris 10 /bin/sh.
-      echo '/* dummy */' > sub/conftst$i.h
-    done
-    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+int
+main ()
+{
 
-    # We check with '-c' and '-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle '-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs.
-    am__obj=sub/conftest.${OBJEXT-o}
-    am__minus_obj="-o $am__obj"
-    case $depmode in
-    gcc)
-      # This depmode causes a compiler race in universal mode.
-      test "$am__universal" = false || continue
-      ;;
-    nosideeffect)
-      # After this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested.
-      if test "x$enable_dependency_tracking" = xyes; then
-	continue
-      else
-	break
-      fi
-      ;;
-    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok '-c -o', but also, the minuso test has
-      # not run yet.  These depmodes are late enough in the game, and
-      # so weak that their functioning should not be impacted.
-      am__obj=conftest.${OBJEXT-o}
-      am__minus_obj=
-      ;;
-    none) break ;;
-    esac
-    if depmode=$depmode \
-       source=sub/conftest.c object=$am__obj \
-       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
-       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
-         >/dev/null 2>conftest.err &&
-       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
-       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
-      # icc doesn't choke on unknown options, it will just issue warnings
-      # or remarks (even with -Werror).  So we grep stderr for any message
-      # that says an option was ignored or not supported.
-      # When given -MP, icc 7.0 and 7.1 complain thusly:
-      #   icc: Command line warning: ignoring option '-M'; no argument required
-      # The diagnosis changed in icc 8.0:
-      #   icc: Command line remark: option '-MP' not supported
-      if (grep 'ignoring option' conftest.err ||
-          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
-        am_cv_CXX_dependencies_compiler_type=$depmode
-        break
-      fi
-    fi
-  done
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
 
-  cd ..
-  rm -rf conftest.dir
 else
-  am_cv_CXX_dependencies_compiler_type=none
-fi
+  ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+	 CXXFLAGS="-g"
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5
-$as_echo "$am_cv_CXX_dependencies_compiler_type" >&6; }
-CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type
+int
+main ()
+{
 
- if
-  test "x$enable_dependency_tracking" != xno \
-  && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then
-  am__fastdepCXX_TRUE=
-  am__fastdepCXX_FALSE='#'
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_prog_cxx_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5
+$as_echo "$ac_cv_prog_cxx_g" >&6; }
+if test "$ac_test_CXXFLAGS" = set; then
+  CXXFLAGS=$ac_save_CXXFLAGS
+elif test $ac_cv_prog_cxx_g = yes; then
+  if test "$GXX" = yes; then
+    CXXFLAGS="-g -O2"
+  else
+    CXXFLAGS="-g"
+  fi
 else
-  am__fastdepCXX_TRUE='#'
-  am__fastdepCXX_FALSE=
+  if test "$GXX" = yes; then
+    CXXFLAGS="-O2"
+  else
+    CXXFLAGS=
+  fi
 fi
-
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 if test "$CXX" = no || test "$CXX:$GXX" = "g++:"; then
   as_fn_error $? "no C++ compiler found" "$LINENO" 5
@@ -16975,9 +17267,7 @@ fi
 
 AM_CXXFLAGS="${AM_CXXFLAGS}"
 
-
-###### PATCH STARTS HERE ######
-# Find out what to build (default is all of these)
+# Find out what to build (default is most of these)
 
 # rados?
 
@@ -17051,6 +17341,73 @@ fi
 # AM_CONDITIONAL is defined later -- we need to check whether we can enable radosgw if no option is present
 #AS_IF([test "$with_radosgw" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RADOSGW])])
 
+
+# Check whether --with-selinux was given.
+if test "${with_selinux+set}" = set; then :
+  withval=$with_selinux;
+else
+  with_selinux=no
+fi
+
+ if test "$with_selinux" = "yes"; then
+  WITH_SELINUX_TRUE=
+  WITH_SELINUX_FALSE='#'
+else
+  WITH_SELINUX_TRUE='#'
+  WITH_SELINUX_FALSE=
+fi
+
+if test "x$with_selinux" = x"yes"; then
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for /usr/share/selinux/devel/policyhelp" >&5
+$as_echo_n "checking for /usr/share/selinux/devel/policyhelp... " >&6; }
+if ${ac_cv_file__usr_share_selinux_devel_policyhelp+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  test "$cross_compiling" = yes &&
+  as_fn_error $? "cannot check for file existence when cross compiling" "$LINENO" 5
+if test -r "/usr/share/selinux/devel/policyhelp"; then
+  ac_cv_file__usr_share_selinux_devel_policyhelp=yes
+else
+  ac_cv_file__usr_share_selinux_devel_policyhelp=no
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_file__usr_share_selinux_devel_policyhelp" >&5
+$as_echo "$ac_cv_file__usr_share_selinux_devel_policyhelp" >&6; }
+if test "x$ac_cv_file__usr_share_selinux_devel_policyhelp" = xyes; then :
+  true
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "No SELinux found
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for /usr/share/selinux/devel/include/Makefile" >&5
+$as_echo_n "checking for /usr/share/selinux/devel/include/Makefile... " >&6; }
+if ${ac_cv_file__usr_share_selinux_devel_include_Makefile+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  test "$cross_compiling" = yes &&
+  as_fn_error $? "cannot check for file existence when cross compiling" "$LINENO" 5
+if test -r "/usr/share/selinux/devel/include/Makefile"; then
+  ac_cv_file__usr_share_selinux_devel_include_Makefile=yes
+else
+  ac_cv_file__usr_share_selinux_devel_include_Makefile=no
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_file__usr_share_selinux_devel_include_Makefile" >&5
+$as_echo "$ac_cv_file__usr_share_selinux_devel_include_Makefile" >&6; }
+if test "x$ac_cv_file__usr_share_selinux_devel_include_Makefile" = xyes; then :
+  true
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "No SELinux Makefile found
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+fi
+
 # radosstriper?
 
 # Check whether --with-radosstriper was given.
@@ -17260,8 +17617,6 @@ See \`config.log' for more details" "$LINENO" 5; }
 fi
 
 fi
-###### PATCH ENDS HERE ######
-
 
 # Check for yasm
 # Extract the first word of "yasm", so it can be a program name with args.
@@ -17334,7 +17689,7 @@ $as_echo "#define HAVE_GOOD_YASM_ELF64 1" >>confdefs.h
 
            with_good_yasm=yes
 
-           if yasm -f elf64 -i src/ceph/src/ceph/src/erasure-code/isa/isa-l/include/ src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s -o /dev/null 2> /dev/null ; then
+           if yasm -f elf64 -i src/erasure-code/isa/isa-l/include/ src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s -o /dev/null 2> /dev/null ; then
               echo 'yasm can also build the isa-l stuff'
 
 $as_echo "#define HAVE_BETTER_YASM_ELF64 1" >>confdefs.h
@@ -18240,24 +18595,81 @@ $as_echo "#define HAVE_PTHREAD 1" >>confdefs.h
 
         :
 else
-        acx_pthread_ok=no
-
+        acx_pthread_ok=no
+
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+
+#Linux only dependencies
+if test x"$linux" = x"yes"; then
+  # libblkid
+  ac_fn_c_check_header_mongrel "$LINENO" "blkid/blkid.h" "ac_cv_header_blkid_blkid_h" "$ac_includes_default"
+if test "x$ac_cv_header_blkid_blkid_h" = xyes; then :
+
+else
+  as_fn_error $? "blkid/blkid.h not found (libblkid-dev, libblkid-devel)" "$LINENO" 5
+fi
+
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_get_cache in -lblkid" >&5
+$as_echo_n "checking for blkid_get_cache in -lblkid... " >&6; }
+if ${ac_cv_lib_blkid_blkid_get_cache+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lblkid  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char blkid_get_cache ();
+int
+main ()
+{
+return blkid_get_cache ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_blkid_blkid_get_cache=yes
+else
+  ac_cv_lib_blkid_blkid_get_cache=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_blkid_blkid_get_cache" >&5
+$as_echo "$ac_cv_lib_blkid_blkid_get_cache" >&6; }
+if test "x$ac_cv_lib_blkid_blkid_get_cache" = xyes; then :
+  true
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "libblkid not found
+See \`config.log' for more details" "$LINENO" 5; }
 fi
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-
-
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for uuid_parse in -luuid" >&5
-$as_echo_n "checking for uuid_parse in -luuid... " >&6; }
-if ${ac_cv_lib_uuid_uuid_parse+:} false; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_find_dev_with_tag in -lblkid" >&5
+$as_echo_n "checking for blkid_find_dev_with_tag in -lblkid... " >&6; }
+if ${ac_cv_lib_blkid_blkid_find_dev_with_tag+:} false; then :
   $as_echo_n "(cached) " >&6
 else
   ac_check_lib_save_LIBS=$LIBS
-LIBS="-luuid  $LIBS"
+LIBS="-lblkid  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -18267,48 +18679,85 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 #ifdef __cplusplus
 extern "C"
 #endif
-char uuid_parse ();
+char blkid_find_dev_with_tag ();
 int
 main ()
 {
-return uuid_parse ();
+return blkid_find_dev_with_tag ();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_uuid_uuid_parse=yes
+  ac_cv_lib_blkid_blkid_find_dev_with_tag=yes
 else
-  ac_cv_lib_uuid_uuid_parse=no
+  ac_cv_lib_blkid_blkid_find_dev_with_tag=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
 LIBS=$ac_check_lib_save_LIBS
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_uuid_uuid_parse" >&5
-$as_echo "$ac_cv_lib_uuid_uuid_parse" >&6; }
-if test "x$ac_cv_lib_uuid_uuid_parse" = xyes; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_blkid_blkid_find_dev_with_tag" >&5
+$as_echo "$ac_cv_lib_blkid_blkid_find_dev_with_tag" >&6; }
+if test "x$ac_cv_lib_blkid_blkid_find_dev_with_tag" = xyes; then :
   true
 else
   { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "libuuid not found
+as_fn_error $? "libblkid not found
 See \`config.log' for more details" "$LINENO" 5; }
 fi
 
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_dev_devname in -lblkid" >&5
+$as_echo_n "checking for blkid_dev_devname in -lblkid... " >&6; }
+if ${ac_cv_lib_blkid_blkid_dev_devname+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lblkid  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-# rbd {map,unmap,showmapped} dependencies, Linux only
-if test x"$linux" = x"yes" -a x"$with_rbd" = x"yes"; then
-  # libblkid
-  ac_fn_c_check_header_mongrel "$LINENO" "blkid/blkid.h" "ac_cv_header_blkid_blkid_h" "$ac_includes_default"
-if test "x$ac_cv_header_blkid_blkid_h" = xyes; then :
-
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char blkid_dev_devname ();
+int
+main ()
+{
+return blkid_dev_devname ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_blkid_blkid_dev_devname=yes
 else
-  as_fn_error $? "blkid/blkid.h not found (libblkid-dev, libblkid-devel)" "$LINENO" 5
+  ac_cv_lib_blkid_blkid_dev_devname=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_blkid_blkid_dev_devname" >&5
+$as_echo "$ac_cv_lib_blkid_blkid_dev_devname" >&6; }
+if test "x$ac_cv_lib_blkid_blkid_dev_devname" = xyes; then :
+  true
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "libblkid not found
+See \`config.log' for more details" "$LINENO" 5; }
 fi
 
 
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_devno_to_wholedisk in -lblkid" >&5
+  # rbd {map,unmap,showmapped} dependencies, Linux only
+  if test x"$with_rbd" = x"yes"; then
+    # libblkid
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_devno_to_wholedisk in -lblkid" >&5
 $as_echo_n "checking for blkid_devno_to_wholedisk in -lblkid... " >&6; }
 if ${ac_cv_lib_blkid_blkid_devno_to_wholedisk+:} false; then :
   $as_echo_n "(cached) " >&6
@@ -18354,8 +18803,8 @@ See \`config.log' for more details" "$LINENO" 5; }
 fi
 
 
-  # libudev
-  ac_fn_c_check_header_mongrel "$LINENO" "libudev.h" "ac_cv_header_libudev_h" "$ac_includes_default"
+    # libudev
+    ac_fn_c_check_header_mongrel "$LINENO" "libudev.h" "ac_cv_header_libudev_h" "$ac_includes_default"
 if test "x$ac_cv_header_libudev_h" = xyes; then :
 
 else
@@ -18363,7 +18812,7 @@ else
 fi
 
 
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for udev_monitor_receive_device in -ludev" >&5
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for udev_monitor_receive_device in -ludev" >&5
 $as_echo_n "checking for udev_monitor_receive_device in -ludev... " >&6; }
 if ${ac_cv_lib_udev_udev_monitor_receive_device+:} false; then :
   $as_echo_n "(cached) " >&6
@@ -18408,6 +18857,62 @@ as_fn_error $? "libudev not found
 See \`config.log' for more details" "$LINENO" 5; }
 fi
 
+
+    # libexpat
+    ac_fn_c_check_header_mongrel "$LINENO" "expat.h" "ac_cv_header_expat_h" "$ac_includes_default"
+if test "x$ac_cv_header_expat_h" = xyes; then :
+
+else
+  as_fn_error $? "expat.h not found (libexpat-devel)" "$LINENO" 5
+fi
+
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for XML_Parse in -lexpat" >&5
+$as_echo_n "checking for XML_Parse in -lexpat... " >&6; }
+if ${ac_cv_lib_expat_XML_Parse+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lexpat  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char XML_Parse ();
+int
+main ()
+{
+return XML_Parse ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_expat_XML_Parse=yes
+else
+  ac_cv_lib_expat_XML_Parse=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_expat_XML_Parse" >&5
+$as_echo "$ac_cv_lib_expat_XML_Parse" >&6; }
+if test "x$ac_cv_lib_expat_XML_Parse" = xyes; then :
+  true
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "libexpat not found
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+  fi
 fi
 
 #
@@ -19102,19 +19607,20 @@ as_fn_error $? "no suitable crypto library found
 See \`config.log' for more details" "$LINENO" 5; }
 fi
 
-# Check whether --enable-root-make-check was given.
-if test "${enable_root_make_check+set}" = set; then :
-  enableval=$enable_root_make_check;
+# Check whether --enable-gitversion was given.
+if test "${enable_gitversion+set}" = set; then :
+  enableval=$enable_gitversion;
 else
-  enable_root_make_check=no
+  enable_gitversion=yes
 fi
 
- if test "x$enable_root_make_check" != xno; then
-  ENABLE_ROOT_MAKE_CHECK_TRUE=
-  ENABLE_ROOT_MAKE_CHECK_FALSE='#'
+
+ if test "x$enable_gitversion" = "xno"; then
+  NO_GIT_VERSION_TRUE=
+  NO_GIT_VERSION_FALSE='#'
 else
-  ENABLE_ROOT_MAKE_CHECK_TRUE='#'
-  ENABLE_ROOT_MAKE_CHECK_FALSE=
+  NO_GIT_VERSION_TRUE='#'
+  NO_GIT_VERSION_FALSE=
 fi
 
 
@@ -19502,70 +20008,83 @@ fi
 
 LIBFUSE=
 if test "x$with_fuse" != xno; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for fuse_main in -lfuse" >&5
-$as_echo_n "checking for fuse_main in -lfuse... " >&6; }
-if ${ac_cv_lib_fuse_fuse_main+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-lfuse  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
 
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char fuse_main ();
-int
-main ()
-{
-return fuse_main ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_fuse_fuse_main=yes
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LIBFUSE" >&5
+$as_echo_n "checking for LIBFUSE... " >&6; }
+
+if test -n "$LIBFUSE_CFLAGS"; then
+    pkg_cv_LIBFUSE_CFLAGS="$LIBFUSE_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"fuse\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "fuse") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_LIBFUSE_CFLAGS=`$PKG_CONFIG --cflags "fuse" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
 else
-  ac_cv_lib_fuse_fuse_main=no
+  pkg_failed=yes
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
+ else
+    pkg_failed=untried
+fi
+if test -n "$LIBFUSE_LIBS"; then
+    pkg_cv_LIBFUSE_LIBS="$LIBFUSE_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"fuse\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "fuse") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_LIBFUSE_LIBS=`$PKG_CONFIG --libs "fuse" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_fuse_fuse_main" >&5
-$as_echo "$ac_cv_lib_fuse_fuse_main" >&6; }
-if test "x$ac_cv_lib_fuse_fuse_main" = xyes; then :
-  LIBFUSE="-lfuse"
 
 
-$as_echo "#define HAVE_LIBFUSE 1" >>confdefs.h
 
-               HAVE_LIBFUSE=1
-	       # look for fuse_getgroups and define FUSE_GETGROUPS if found
-           LIBS_saved="$LIBS"
-           LIBS="$LIBS -lfuse"
-	       for ac_func in fuse_getgroups
-do :
-  ac_fn_c_check_func "$LINENO" "fuse_getgroups" "ac_cv_func_fuse_getgroups"
-if test "x$ac_cv_func_fuse_getgroups" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_FUSE_GETGROUPS 1
-_ACEOF
+if test $pkg_failed = yes; then
+   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
 
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
 fi
-done
-
-           LIBS="$LIBS_saved"
+        if test $_pkg_short_errors_supported = yes; then
+	        LIBFUSE_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "fuse" 2>&1`
+        else
+	        LIBFUSE_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "fuse" 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$LIBFUSE_PKG_ERRORS" >&5
 
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "no FUSE found (use --without-fuse to disable)
+See \`config.log' for more details" "$LINENO" 5; }
+elif test $pkg_failed = untried; then
+     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
 as_fn_error $? "no FUSE found (use --without-fuse to disable)
 See \`config.log' for more details" "$LINENO" 5; }
+else
+	LIBFUSE_CFLAGS=$pkg_cv_LIBFUSE_CFLAGS
+	LIBFUSE_LIBS=$pkg_cv_LIBFUSE_LIBS
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+	HAVE_LIBFUSE=1
 fi
 
 fi
@@ -19821,11 +20340,16 @@ fi
 # Check whether --enable-pgrefdebugging was given.
 if test "${enable_pgrefdebugging+set}" = set; then :
   enableval=$enable_pgrefdebugging;
+else
+  enable_pgrefdebugging=no
+fi
+
+if test "x$enable_pgrefdebugging" = "xyes"; then :
+
 $as_echo "#define PG_DEBUG_REFS 1" >>confdefs.h
 
 fi
 
-
 #
 # Java is painful
 #   - adapted from OMPI wrappers package
@@ -19906,10 +20430,10 @@ fi
         if test "x$with_debug" = "xyes"; then :
 
         	dir='/usr/share/java'
-	        junit4_jar=`find $dir -name junit4.jar | head -n 1`
+	        junit4_jar=`( find $dir -name junit4.jar;find $dir -name junit.jar ) | head -n 1`
 		if test -r "$junit4_jar"; then :
 
-		      EXTRA_CLASSPATH_JAR=`dirname $junit4_jar`/junit4.jar
+		      EXTRA_CLASSPATH_JAR="$junit4_jar"
 
 		      have_junit4=1
 else
@@ -20028,7 +20552,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 20031 "configure" */
+/* #line 20555 "configure" */
 public class Test {
 }
 EOF
@@ -20791,16 +21315,100 @@ fi
 
 
 
-  case $target_cpu in
-    arm*)
-      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mfpu=neon" >&5
-$as_echo_n "checking whether C compiler accepts -mfpu=neon... " >&6; }
-if ${ax_cv_check_cflags___mfpu_neon+:} false; then :
+  case $target_cpu in
+    arm*)
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mfpu=neon" >&5
+$as_echo_n "checking whether C compiler accepts -mfpu=neon... " >&6; }
+if ${ax_cv_check_cflags___mfpu_neon+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+  ax_check_save_flags=$CFLAGS
+  CFLAGS="$CFLAGS  -mfpu=neon"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_check_cflags___mfpu_neon=yes
+else
+  ax_cv_check_cflags___mfpu_neon=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS=$ax_check_save_flags
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___mfpu_neon" >&5
+$as_echo "$ax_cv_check_cflags___mfpu_neon" >&6; }
+if test x"$ax_cv_check_cflags___mfpu_neon" = xyes; then :
+  ax_cv_support_neon_ext=yes
+else
+  :
+fi
+
+      if test x"$ax_cv_support_neon_ext" = x"yes"; then
+        ARM_NEON_FLAGS="-mfpu=neon -DARM_NEON"
+
+        ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
+
+$as_echo "#define HAVE_NEON /**/" >>confdefs.h
+
+      fi
+    ;;
+    aarch64*)
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -march=armv8-a" >&5
+$as_echo_n "checking whether C compiler accepts -march=armv8-a... " >&6; }
+if ${ax_cv_check_cflags___march_armv8_a+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+  ax_check_save_flags=$CFLAGS
+  CFLAGS="$CFLAGS  -march=armv8-a"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_check_cflags___march_armv8_a=yes
+else
+  ax_cv_check_cflags___march_armv8_a=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS=$ax_check_save_flags
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___march_armv8_a" >&5
+$as_echo "$ax_cv_check_cflags___march_armv8_a" >&6; }
+if test x"$ax_cv_check_cflags___march_armv8_a" = xyes; then :
+  ax_cv_support_armv8=yes
+else
+  :
+fi
+
+      if test x"$ax_cv_support_armv8" = x"yes"; then
+        ARM_ARCH_FLAGS="-march=armv8-a"
+        ARM_DEFINE_FLAGS="-DARCH_AARCH64"
+      fi
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -march=armv8-a+simd" >&5
+$as_echo_n "checking whether C compiler accepts -march=armv8-a+simd... " >&6; }
+if ${ax_cv_check_cflags___march_armv8_apsimd+:} false; then :
   $as_echo_n "(cached) " >&6
 else
 
   ax_check_save_flags=$CFLAGS
-  CFLAGS="$CFLAGS  -mfpu=neon"
+  CFLAGS="$CFLAGS  -march=armv8-a+simd"
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -20813,39 +21421,38 @@ main ()
 }
 _ACEOF
 if ac_fn_c_try_compile "$LINENO"; then :
-  ax_cv_check_cflags___mfpu_neon=yes
+  ax_cv_check_cflags___march_armv8_apsimd=yes
 else
-  ax_cv_check_cflags___mfpu_neon=no
+  ax_cv_check_cflags___march_armv8_apsimd=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
   CFLAGS=$ax_check_save_flags
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___mfpu_neon" >&5
-$as_echo "$ax_cv_check_cflags___mfpu_neon" >&6; }
-if test x"$ax_cv_check_cflags___mfpu_neon" = xyes; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___march_armv8_apsimd" >&5
+$as_echo "$ax_cv_check_cflags___march_armv8_apsimd" >&6; }
+if test x"$ax_cv_check_cflags___march_armv8_apsimd" = xyes; then :
   ax_cv_support_neon_ext=yes
 else
   :
 fi
 
       if test x"$ax_cv_support_neon_ext" = x"yes"; then
-        ARM_NEON_FLAGS="-mfpu=neon -DARM_NEON"
-
-        ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
+        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd"
+        ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON"
+        ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON"
 
 $as_echo "#define HAVE_NEON /**/" >>confdefs.h
 
+
       fi
-    ;;
-    aarch64*)
-      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -march=armv8-a+simd" >&5
-$as_echo_n "checking whether C compiler accepts -march=armv8-a+simd... " >&6; }
-if ${ax_cv_check_cflags___march_armv8_apsimd+:} false; then :
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -march=armv8-a+crc" >&5
+$as_echo_n "checking whether C compiler accepts -march=armv8-a+crc... " >&6; }
+if ${ax_cv_check_cflags___march_armv8_apcrc+:} false; then :
   $as_echo_n "(cached) " >&6
 else
 
   ax_check_save_flags=$CFLAGS
-  CFLAGS="$CFLAGS  -march=armv8-a+simd"
+  CFLAGS="$CFLAGS  -march=armv8-a+crc"
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -20858,29 +21465,30 @@ main ()
 }
 _ACEOF
 if ac_fn_c_try_compile "$LINENO"; then :
-  ax_cv_check_cflags___march_armv8_apsimd=yes
+  ax_cv_check_cflags___march_armv8_apcrc=yes
 else
-  ax_cv_check_cflags___march_armv8_apsimd=no
+  ax_cv_check_cflags___march_armv8_apcrc=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
   CFLAGS=$ax_check_save_flags
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___march_armv8_apsimd" >&5
-$as_echo "$ax_cv_check_cflags___march_armv8_apsimd" >&6; }
-if test x"$ax_cv_check_cflags___march_armv8_apsimd" = xyes; then :
-  ax_cv_support_neon_ext=yes
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___march_armv8_apcrc" >&5
+$as_echo "$ax_cv_check_cflags___march_armv8_apcrc" >&6; }
+if test x"$ax_cv_check_cflags___march_armv8_apcrc" = xyes; then :
+  ax_cv_support_crc_ext=yes
 else
   :
 fi
 
-      if test x"$ax_cv_support_neon_ext" = x"yes"; then
-        ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON"
+      if test x"$ax_cv_support_crc_ext" = x"yes"; then
+        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc"
+        ARM_CRC_FLAGS="-march=armv8-a+crc -DARCH_AARCH64"
 
-        ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
+$as_echo "#define HAVE_ARMV8_CRC /**/" >>confdefs.h
 
-$as_echo "#define HAVE_NEON /**/" >>confdefs.h
 
       fi
+        ARM_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS"
     ;;
   esac
 
@@ -20894,6 +21502,14 @@ else
   HAVE_NEON_FALSE=
 fi
 
+ if  test "x$ax_cv_support_crc_ext" = "xyes"; then
+  HAVE_ARMV8_CRC_TRUE=
+  HAVE_ARMV8_CRC_FALSE='#'
+else
+  HAVE_ARMV8_CRC_TRUE='#'
+  HAVE_ARMV8_CRC_FALSE=
+fi
+
 
 
 
@@ -21647,23 +22263,15 @@ See \`config.log' for more details" "$LINENO" 5; }
 	fi
 fi
 
-# use system libs3?
-
-# Check whether --with-system-libs3 was given.
-if test "${with_system_libs3+set}" = set; then :
-  withval=$with_system_libs3;
-else
-  with_system_libs3=no
-fi
-
-if test "x$with_system_libs3" = xyes; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for S3_initialize in -ls3" >&5
-$as_echo_n "checking for S3_initialize in -ls3... " >&6; }
-if ${ac_cv_lib_s3_S3_initialize+:} false; then :
+# needs libcurl and libxml2
+if test "x$with_rest_bench" = xyes && test "x$with_system_libs3" = xno; then
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for curl_easy_init in -lcurl" >&5
+$as_echo_n "checking for curl_easy_init in -lcurl... " >&6; }
+if ${ac_cv_lib_curl_curl_easy_init+:} false; then :
   $as_echo_n "(cached) " >&6
 else
   ac_check_lib_save_LIBS=$LIBS
-LIBS="-ls3 -lpthread $LIBS"
+LIBS="-lcurl  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -21673,43 +22281,44 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 #ifdef __cplusplus
 extern "C"
 #endif
-char S3_initialize ();
+char curl_easy_init ();
 int
 main ()
 {
-return S3_initialize ();
+return curl_easy_init ();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_lib_s3_S3_initialize=yes
+  ac_cv_lib_curl_curl_easy_init=yes
 else
-  ac_cv_lib_s3_S3_initialize=no
+  ac_cv_lib_curl_curl_easy_init=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
 LIBS=$ac_check_lib_save_LIBS
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_s3_S3_initialize" >&5
-$as_echo "$ac_cv_lib_s3_S3_initialize" >&6; }
-if test "x$ac_cv_lib_s3_S3_initialize" = xyes; then :
-  true
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_curl_curl_easy_init" >&5
+$as_echo "$ac_cv_lib_curl_curl_easy_init" >&6; }
+if test "x$ac_cv_lib_curl_curl_easy_init" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBCURL 1
+_ACEOF
+
+  LIBS="-lcurl $LIBS"
+
 else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "libs3 not found
-See \`config.log' for more details" "$LINENO" 5; }
+  as_fn_error $? "libcurl not found" "$LINENO" 5
 fi
 
-fi
-if test "x$with_system_libs3" = xcheck; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing S3_initialize" >&5
-$as_echo_n "checking for library containing S3_initialize... " >&6; }
-if ${ac_cv_search_S3_initialize+:} false; then :
+   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for xmlParseChunk in -lxml2" >&5
+$as_echo_n "checking for xmlParseChunk in -lxml2... " >&6; }
+if ${ac_cv_lib_xml2_xmlParseChunk+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  ac_func_search_save_LIBS=$LIBS
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lxml2  $LIBS"
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
@@ -21719,77 +22328,39 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 #ifdef __cplusplus
 extern "C"
 #endif
-char S3_initialize ();
+char xmlParseChunk ();
 int
 main ()
 {
-return S3_initialize ();
+return xmlParseChunk ();
   ;
   return 0;
 }
 _ACEOF
-for ac_lib in '' s3; do
-  if test -z "$ac_lib"; then
-    ac_res="none required"
-  else
-    ac_res=-l$ac_lib
-    LIBS="-l$ac_lib -lpthread $ac_func_search_save_LIBS"
-  fi
-  if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_search_S3_initialize=$ac_res
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext
-  if ${ac_cv_search_S3_initialize+:} false; then :
-  break
-fi
-done
-if ${ac_cv_search_S3_initialize+:} false; then :
-
-else
-  ac_cv_search_S3_initialize=no
-fi
-rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_S3_initialize" >&5
-$as_echo "$ac_cv_search_S3_initialize" >&6; }
-ac_res=$ac_cv_search_S3_initialize
-if test "$ac_res" != no; then :
-  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
-  with_system_libs3=yes
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_xml2_xmlParseChunk=yes
 else
-  true
-fi
-
+  ac_cv_lib_xml2_xmlParseChunk=no
 fi
- if  test "$with_system_libs3" = "yes" ; then
-  WITH_SYSTEM_LIBS3_TRUE=
-  WITH_SYSTEM_LIBS3_FALSE='#'
-else
-  WITH_SYSTEM_LIBS3_TRUE='#'
-  WITH_SYSTEM_LIBS3_FALSE=
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_xml2_xmlParseChunk" >&5
+$as_echo "$ac_cv_lib_xml2_xmlParseChunk" >&6; }
+if test "x$ac_cv_lib_xml2_xmlParseChunk" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBXML2 1
+_ACEOF
 
+  LIBS="-lxml2 $LIBS"
 
-# rest-bench?
-
-# Check whether --with-rest-bench was given.
-if test "${with_rest_bench+set}" = set; then :
-  withval=$with_rest_bench;
 else
-  with_rest_bench=no
+  as_fn_error $? "libxml2 not found" "$LINENO" 5
 fi
 
- if  test "$with_rest_bench" = "yes" ; then
-  WITH_REST_BENCH_TRUE=
-  WITH_REST_BENCH_FALSE='#'
-else
-  WITH_REST_BENCH_TRUE='#'
-  WITH_REST_BENCH_FALSE=
 fi
 
-
 # use libaio?
 
 # Check whether --with-libaio was given.
@@ -22582,6 +23153,98 @@ BOOST_THREAD_LIBS="${LIBS}"
 LIBS="${saved_LIBS}"
 
 
+# boost-random
+BOOST_RANDOM_LIBS=""
+saved_LIBS="${LIBS}"
+LIBS=""
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for main in -lboost_random-mt" >&5
+$as_echo_n "checking for main in -lboost_random-mt... " >&6; }
+if ${ac_cv_lib_boost_random_mt_main+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lboost_random-mt  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+int
+main ()
+{
+return main ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  ac_cv_lib_boost_random_mt_main=yes
+else
+  ac_cv_lib_boost_random_mt_main=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_boost_random_mt_main" >&5
+$as_echo "$ac_cv_lib_boost_random_mt_main" >&6; }
+if test "x$ac_cv_lib_boost_random_mt_main" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBBOOST_RANDOM_MT 1
+_ACEOF
+
+  LIBS="-lboost_random-mt $LIBS"
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for main in -lboost_random" >&5
+$as_echo_n "checking for main in -lboost_random... " >&6; }
+if ${ac_cv_lib_boost_random_main+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lboost_random  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+int
+main ()
+{
+return main ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  ac_cv_lib_boost_random_main=yes
+else
+  ac_cv_lib_boost_random_main=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_boost_random_main" >&5
+$as_echo "$ac_cv_lib_boost_random_main" >&6; }
+if test "x$ac_cv_lib_boost_random_main" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBBOOST_RANDOM 1
+_ACEOF
+
+  LIBS="-lboost_random $LIBS"
+
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "\"Boost random library not found.\"
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+fi
+
+BOOST_RANDOM_LIBS="${LIBS}"
+LIBS="${saved_LIBS}"
+
+
 #
 # Check for boost_program_options library (defines BOOST_PROGRAM_OPTIONS_LIBS).
 #
@@ -22726,6 +23389,15 @@ fi
 done
 
 
+# name_to_handle_at
+ac_fn_c_check_func "$LINENO" "name_to_handle_at" "ac_cv_func_name_to_handle_at"
+if test "x$ac_cv_func_name_to_handle_at" = xyes; then :
+
+$as_echo "#define HAVE_NAME_TO_HANDLE_AT /**/" >>confdefs.h
+
+fi
+
+
 # sync_file_range
 ac_fn_c_check_func "$LINENO" "sync_file_range" "ac_cv_func_sync_file_range"
 if test "x$ac_cv_func_sync_file_range" = xyes; then :
@@ -22744,6 +23416,19 @@ $as_echo "#define CEPH_HAVE_FALLOCATE /**/" >>confdefs.h
 fi
 
 
+# getgrouplist
+for ac_func in getgrouplist
+do :
+  ac_fn_c_check_func "$LINENO" "getgrouplist" "ac_cv_func_getgrouplist"
+if test "x$ac_cv_func_getgrouplist" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_GETGROUPLIST 1
+_ACEOF
+
+fi
+done
+
+
 #
 # Test for time-related `struct stat` members.
 #
@@ -22902,7 +23587,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sched.h" >&5
 $as_echo_n "checking for sched.h... " >&6; }
-+ac_ext=cpp
+ac_ext=cpp
 ac_cpp='$CXXCPP $CPPFLAGS'
 ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
@@ -22944,7 +23629,7 @@ $as_echo "no" >&6; }
 
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-+ac_ext=c
+ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
@@ -23508,28 +24193,126 @@ elif test "x$enable_valgrind" = "x"; then :
   enable_valgrind=yes
 fi
 
- if test "x$enable_valgrind" = "xyes"; then
-  VALGRIND_ENABLED_TRUE=
-  VALGRIND_ENABLED_FALSE='#'
+ if test "x$enable_valgrind" = "xyes"; then
+  VALGRIND_ENABLED_TRUE=
+  VALGRIND_ENABLED_FALSE='#'
+else
+  VALGRIND_ENABLED_TRUE='#'
+  VALGRIND_ENABLED_FALSE=
+fi
+
+if test "x$enable_valgrind" = "xyes"; then
+  for ac_header in valgrind/helgrind.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "valgrind/helgrind.h" "ac_cv_header_valgrind_helgrind_h" "$ac_includes_default"
+if test "x$ac_cv_header_valgrind_helgrind_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_VALGRIND_HELGRIND_H 1
+_ACEOF
+
+fi
+
+done
+
+fi
+
+
+
+# Check whether --with-systemd-libexec-dir was given.
+if test "${with_systemd_libexec_dir+set}" = set; then :
+  withval=$with_systemd_libexec_dir;
+	    systemd_libexec_dir="$withval"
+
+else
+
+        if test "x$SYSTEMD_LIBEXEC_DIR" = "x"; then
+
+            prefix_save=$prefix
+            exec_prefix_save=$exec_prefix
+
+                        if test "x$prefix" = "xNONE"; then
+                prefix="$ac_default_prefix"
+            fi
+                        if test "x$exec_prefix" = "xNONE"; then
+                exec_prefix=$prefix
+            fi
+
+                        systemd_libexec_dir="`eval exec_prefix=$exec_prefix prefix=$prefix echo $libexecdir`"
+
+
+            prefix=$prefix_save
+            exec_prefix=$exec_prefix_save
+        else
+            systemd_libexec_dir="$SYSTEMD_LIBEXEC_DIR"
+        fi
+
+
+fi
+
+
+
+
+
+# Check whether --with-rgw-user was given.
+if test "${with_rgw_user+set}" = set; then :
+  withval=$with_rgw_user;
+        user_rgw="$withval"
+
+else
+
+        if test "x$USER_RGW" = "x"; then
+            user_rgw=www-data
+        else
+            user_rgw="$USER_RGW"
+        fi
+
+
+fi
+
+
+
+
+# Check whether --with-rgw-group was given.
+if test "${with_rgw_group+set}" = set; then :
+  withval=$with_rgw_group;
+        group_rgw="$withval"
+
+else
+
+        if test "x$GROUP_RGW" = "x"; then
+            group_rgw=www-data
+        else
+            group_rgw="$GROUP_RGW"
+        fi
+
+
+fi
+
+
+
+
+
+# Check whether --with-systemd-unit-dir was given.
+if test "${with_systemd_unit_dir+set}" = set; then :
+  withval=$with_systemd_unit_dir;
+	    systemd_unit_dir="$withval"
+
 else
-  VALGRIND_ENABLED_TRUE='#'
-  VALGRIND_ENABLED_FALSE=
-fi
 
-if test "x$enable_valgrind" = "xyes"; then
-  for ac_header in valgrind/helgrind.h
-do :
-  ac_fn_c_check_header_mongrel "$LINENO" "valgrind/helgrind.h" "ac_cv_header_valgrind_helgrind_h" "$ac_includes_default"
-if test "x$ac_cv_header_valgrind_helgrind_h" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_VALGRIND_HELGRIND_H 1
-_ACEOF
+        # default to the systemd admin unit directory
+        which pkg-config
+        pkg_config_exists=$?
+        if test x"$pkg_config_exists" = x"0"; then
+            systemd_unit_dir=`pkg-config systemd --variable=systemdsystemunitdir`
+        else
+            systemd_unit_dir="/etc/systemd/system/"
+        fi
+
 
 fi
 
-done
 
-fi
+
 
 
 # Checks for typedefs, structures, and compiler characteristics.
@@ -23956,7 +24739,7 @@ $as_echo "$am_cv_python_pyexecdir" >&6; }
 
 ac_config_headers="$ac_config_headers src/acconfig.h"
 
-ac_config_files="$ac_config_files Makefile src/Makefile src/ocf/Makefile src/ocf/ceph src/ocf/rbd src/java/Makefile src/tracing/Makefile man/Makefile ceph.spec"
+ac_config_files="$ac_config_files Makefile src/Makefile src/ocf/Makefile src/ocf/ceph src/ocf/rbd src/java/Makefile systemd/Makefile man/Makefile doc/Makefile selinux/Makefile ceph.spec"
 
 cat >confcache <<\_ACEOF
 # This file is a shell script that caches the results of configure
@@ -24067,6 +24850,18 @@ LIBOBJS=$ac_libobjs
 LTLIBOBJS=$ac_ltlibobjs
 
 
+if test -z "${WITH_MAN_PAGES_TRUE}" && test -z "${WITH_MAN_PAGES_FALSE}"; then
+  as_fn_error $? "conditional \"WITH_MAN_PAGES\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then
+  as_fn_error $? "conditional \"AMDEP\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then
+  as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking that generated files are newer than configure" >&5
 $as_echo_n "checking that generated files are newer than configure... " >&6; }
    if test -n "$am_sleep_pid"; then
@@ -24075,28 +24870,12 @@ $as_echo_n "checking that generated files are newer than configure... " >&6; }
    fi
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: done" >&5
 $as_echo "done" >&6; }
-if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then
-  as_fn_error $? "conditional \"AMDEP\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
   as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
- if test -n "$EXEEXT"; then
-  am__EXEEXT_TRUE=
-  am__EXEEXT_FALSE='#'
-else
-  am__EXEEXT_TRUE='#'
-  am__EXEEXT_FALSE=
-fi
-
-if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then
-  as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
-if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
-  as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
+if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then
+  as_fn_error $? "conditional \"am__fastdepCXX\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
  if test -n "$EXEEXT"; then
@@ -24119,14 +24898,6 @@ if test -z "${DARWIN_TRUE}" && test -z "${DARWIN_FALSE}"; then
   as_fn_error $? "conditional \"DARWIN\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then
-  as_fn_error $? "conditional \"am__fastdepCXX\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
-if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then
-  as_fn_error $? "conditional \"am__fastdepCXX\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${CLANG_TRUE}" && test -z "${CLANG_FALSE}"; then
   as_fn_error $? "conditional \"CLANG\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -24143,6 +24914,10 @@ if test -z "${WITH_CEPHFS_TRUE}" && test -z "${WITH_CEPHFS_FALSE}"; then
   as_fn_error $? "conditional \"WITH_CEPHFS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${WITH_SELINUX_TRUE}" && test -z "${WITH_SELINUX_FALSE}"; then
+  as_fn_error $? "conditional \"WITH_SELINUX\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${WITH_RADOSSTRIPER_TRUE}" && test -z "${WITH_RADOSSTRIPER_FALSE}"; then
   as_fn_error $? "conditional \"WITH_RADOSSTRIPER\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -24179,8 +24954,8 @@ if test -z "${COMPILER_HAS_VTA_TRUE}" && test -z "${COMPILER_HAS_VTA_FALSE}"; th
   as_fn_error $? "conditional \"COMPILER_HAS_VTA\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${ENABLE_ROOT_MAKE_CHECK_TRUE}" && test -z "${ENABLE_ROOT_MAKE_CHECK_FALSE}"; then
-  as_fn_error $? "conditional \"ENABLE_ROOT_MAKE_CHECK\" was never defined.
+if test -z "${NO_GIT_VERSION_TRUE}" && test -z "${NO_GIT_VERSION_FALSE}"; then
+  as_fn_error $? "conditional \"NO_GIT_VERSION\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 if test -z "${WITH_PROFILER_TRUE}" && test -z "${WITH_PROFILER_FALSE}"; then
@@ -24239,6 +25014,10 @@ if test -z "${HAVE_NEON_TRUE}" && test -z "${HAVE_NEON_FALSE}"; then
   as_fn_error $? "conditional \"HAVE_NEON\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${HAVE_ARMV8_CRC_TRUE}" && test -z "${HAVE_ARMV8_CRC_FALSE}"; then
+  as_fn_error $? "conditional \"HAVE_ARMV8_CRC\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${HAVE_SSSE3_TRUE}" && test -z "${HAVE_SSSE3_FALSE}"; then
   as_fn_error $? "conditional \"HAVE_SSSE3\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -24263,14 +25042,6 @@ if test -z "${WITH_LIBROCKSDB_TRUE}" && test -z "${WITH_LIBROCKSDB_FALSE}"; then
   as_fn_error $? "conditional \"WITH_LIBROCKSDB\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${WITH_SYSTEM_LIBS3_TRUE}" && test -z "${WITH_SYSTEM_LIBS3_FALSE}"; then
-  as_fn_error $? "conditional \"WITH_SYSTEM_LIBS3\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
-if test -z "${WITH_REST_BENCH_TRUE}" && test -z "${WITH_REST_BENCH_FALSE}"; then
-  as_fn_error $? "conditional \"WITH_REST_BENCH\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${WITH_LIBAIO_TRUE}" && test -z "${WITH_LIBAIO_FALSE}"; then
   as_fn_error $? "conditional \"WITH_LIBAIO\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -24700,7 +25471,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 0.94.5, which was
+This file was extended by ceph $as_me 9.2.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -24766,7 +25537,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 0.94.5
+ceph config.status 9.2.0
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
@@ -24885,7 +25656,6 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 #
 # INIT-COMMANDS
 #
-AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"
 
 
 # The HP-UX ksh and POSIX shell print the target directory to stdout
@@ -25261,6 +26031,7 @@ fi
 
 
 
+AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"
 
 _ACEOF
 
@@ -25270,8 +26041,8 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 for ac_config_target in $ac_config_targets
 do
   case $ac_config_target in
-    "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;;
     "libtool") CONFIG_COMMANDS="$CONFIG_COMMANDS libtool" ;;
+    "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;;
     "src/acconfig.h") CONFIG_HEADERS="$CONFIG_HEADERS src/acconfig.h" ;;
     "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
     "src/Makefile") CONFIG_FILES="$CONFIG_FILES src/Makefile" ;;
@@ -25279,8 +26050,10 @@ do
     "src/ocf/ceph") CONFIG_FILES="$CONFIG_FILES src/ocf/ceph" ;;
     "src/ocf/rbd") CONFIG_FILES="$CONFIG_FILES src/ocf/rbd" ;;
     "src/java/Makefile") CONFIG_FILES="$CONFIG_FILES src/java/Makefile" ;;
-    "src/tracing/Makefile") CONFIG_FILES="$CONFIG_FILES src/tracing/Makefile" ;;
+    "systemd/Makefile") CONFIG_FILES="$CONFIG_FILES systemd/Makefile" ;;
     "man/Makefile") CONFIG_FILES="$CONFIG_FILES man/Makefile" ;;
+    "doc/Makefile") CONFIG_FILES="$CONFIG_FILES doc/Makefile" ;;
+    "selinux/Makefile") CONFIG_FILES="$CONFIG_FILES selinux/Makefile" ;;
     "ceph.spec") CONFIG_FILES="$CONFIG_FILES ceph.spec" ;;
 
   *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
@@ -25877,99 +26650,6 @@ $as_echo "$as_me: executing $ac_file commands" >&6;}
 
 
   case $ac_file$ac_mode in
-    "depfiles":C) test x"$AMDEP_TRUE" != x"" || {
-  # Older Autoconf quotes --file arguments for eval, but not when files
-  # are listed without --file.  Let's play safe and only enable the eval
-  # if we detect the quoting.
-  case $CONFIG_FILES in
-  *\'*) eval set x "$CONFIG_FILES" ;;
-  *)   set x $CONFIG_FILES ;;
-  esac
-  shift
-  for mf
-  do
-    # Strip MF so we end up with the name of the file.
-    mf=`echo "$mf" | sed -e 's/:.*$//'`
-    # Check whether this is an Automake generated Makefile or not.
-    # We used to match only the files named 'Makefile.in', but
-    # some people rename them; so instead we look at the file content.
-    # Grep'ing the first line is not enough: some people post-process
-    # each Makefile.in and add a new line on top of each file to say so.
-    # Grep'ing the whole file is not good either: AIX grep has a line
-    # limit of 2048, but all sed's we know have understand at least 4000.
-    if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then
-      dirpart=`$as_dirname -- "$mf" ||
-$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
-	 X"$mf" : 'X\(//\)[^/]' \| \
-	 X"$mf" : 'X\(//\)$' \| \
-	 X"$mf" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$mf" |
-    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)[^/].*/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\).*/{
-	    s//\1/
-	    q
-	  }
-	  s/.*/./; q'`
-    else
-      continue
-    fi
-    # Extract the definition of DEPDIR, am__include, and am__quote
-    # from the Makefile without running 'make'.
-    DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
-    test -z "$DEPDIR" && continue
-    am__include=`sed -n 's/^am__include = //p' < "$mf"`
-    test -z "$am__include" && continue
-    am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
-    # Find all dependency output files, they are included files with
-    # $(DEPDIR) in their names.  We invoke sed twice because it is the
-    # simplest approach to changing $(DEPDIR) to its actual value in the
-    # expansion.
-    for file in `sed -n "
-      s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
-	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do
-      # Make sure the directory exists.
-      test -f "$dirpart/$file" && continue
-      fdir=`$as_dirname -- "$file" ||
-$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
-	 X"$file" : 'X\(//\)[^/]' \| \
-	 X"$file" : 'X\(//\)$' \| \
-	 X"$file" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$file" |
-    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)[^/].*/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\).*/{
-	    s//\1/
-	    q
-	  }
-	  s/.*/./; q'`
-      as_dir=$dirpart/$fdir; as_fn_mkdir_p
-      # echo "creating $dirpart/$file"
-      echo '# dummy' > "$dirpart/$file"
-    done
-  done
-}
- ;;
     "libtool":C)
 
     # See if we are running on zsh, and set the options which allow our
@@ -26767,6 +27447,99 @@ compiler_lib_search_path=$lt_compiler_lib_search_path_CXX
 _LT_EOF
 
  ;;
+    "depfiles":C) test x"$AMDEP_TRUE" != x"" || {
+  # Older Autoconf quotes --file arguments for eval, but not when files
+  # are listed without --file.  Let's play safe and only enable the eval
+  # if we detect the quoting.
+  case $CONFIG_FILES in
+  *\'*) eval set x "$CONFIG_FILES" ;;
+  *)   set x $CONFIG_FILES ;;
+  esac
+  shift
+  for mf
+  do
+    # Strip MF so we end up with the name of the file.
+    mf=`echo "$mf" | sed -e 's/:.*$//'`
+    # Check whether this is an Automake generated Makefile or not.
+    # We used to match only the files named 'Makefile.in', but
+    # some people rename them; so instead we look at the file content.
+    # Grep'ing the first line is not enough: some people post-process
+    # each Makefile.in and add a new line on top of each file to say so.
+    # Grep'ing the whole file is not good either: AIX grep has a line
+    # limit of 2048, but all sed's we know have understand at least 4000.
+    if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then
+      dirpart=`$as_dirname -- "$mf" ||
+$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$mf" : 'X\(//\)[^/]' \| \
+	 X"$mf" : 'X\(//\)$' \| \
+	 X"$mf" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$mf" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+    else
+      continue
+    fi
+    # Extract the definition of DEPDIR, am__include, and am__quote
+    # from the Makefile without running 'make'.
+    DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
+    test -z "$DEPDIR" && continue
+    am__include=`sed -n 's/^am__include = //p' < "$mf"`
+    test -z "$am__include" && continue
+    am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
+    # Find all dependency output files, they are included files with
+    # $(DEPDIR) in their names.  We invoke sed twice because it is the
+    # simplest approach to changing $(DEPDIR) to its actual value in the
+    # expansion.
+    for file in `sed -n "
+      s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
+	 sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do
+      # Make sure the directory exists.
+      test -f "$dirpart/$file" && continue
+      fdir=`$as_dirname -- "$file" ||
+$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$file" : 'X\(//\)[^/]' \| \
+	 X"$file" : 'X\(//\)$' \| \
+	 X"$file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$file" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      as_dir=$dirpart/$fdir; as_fn_mkdir_p
+      # echo "creating $dirpart/$file"
+      echo '# dummy' > "$dirpart/$file"
+    done
+  done
+}
+ ;;
 
   esac
 done # for ac_tag
diff --git a/configure.ac b/configure.ac
index 969baed..3ed4b04 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,9 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [0.94.5], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [9.2.0], [ceph-devel at vger.kernel.org])
+
+AX_CXX_COMPILE_STDCXX_11(, mandatory)
 
 # Create release string.  Used with VERSION for RPMs.
 RPM_RELEASE=0
@@ -21,6 +23,17 @@ if test -d ".git" ; then
 fi
 AC_MSG_NOTICE([RPM_RELEASE='$RPM_RELEASE'])
 
+AC_ARG_WITH([man-pages],
+    [AS_HELP_STRING([--with-man-pages], [build man pages])],
+    [],
+    [with_man_pages=check])
+AS_IF([test "x$with_man_pages" != "xno"],
+  [AC_CHECK_PROGS(SPHINX_BUILD, sphinx-1.0-build sphinx-build)
+   AS_IF([test -z "$SPHINX_BUILD" && \
+          test "x$with_man_pages" = "xyes"],
+     [AC_MSG_ERROR([sphinx-build not found (python-sphinx)])])])
+AM_CONDITIONAL(WITH_MAN_PAGES, test -n "$SPHINX_BUILD")
+
 AC_CONFIG_MACRO_DIR([m4])
 
 AC_CONFIG_SUBDIRS([src/gmock])
@@ -33,14 +46,13 @@ AC_CANONICAL_TARGET
 m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 
 # Automake
-AM_INIT_AUTOMAKE
 AM_PROG_CC_C_O
 AM_PROG_LIBTOOL
 AM_PROG_AS
 
 
+AM_INIT_AUTOMAKE([foreign parallel-tests tar-ustar])
 # enable make V=0 (if automake >1.11)
-AM_INIT_AUTOMAKE([foreign])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 # Platform
@@ -86,9 +98,7 @@ AC_PROG_LIBTOOL
 AC_SUBST(AM_CXXFLAGS)
 AM_CXXFLAGS="${AM_CXXFLAGS}"
 
-
-###### PATCH STARTS HERE ######
-# Find out what to build (default is all of these)
+# Find out what to build (default is most of these)
 
 # rados?
 AC_ARG_WITH([rados],
@@ -119,12 +129,22 @@ AM_CONDITIONAL(WITH_CEPHFS, test "$with_cephfs" = "yes")
 # radosgw?
 # radosgw requires rados
 AC_ARG_WITH([radosgw],
-            [AS_HELP_STRING([--with-radosgw], [build RADOS gateway])],
-            [],
-            [with_radosgw=check])
+	[AS_HELP_STRING([--with-radosgw], [build RADOS gateway])],
+	[],
+	[with_radosgw=check])
 # AM_CONDITIONAL is defined later -- we need to check whether we can enable radosgw if no option is present
 #AS_IF([test "$with_radosgw" = "yes"], [AC_DEFINE([WITH_RADOS, WITH_RADOSGW])])
 
+AC_ARG_WITH([selinux],
+	[AS_HELP_STRING([--with-selinux], [build SELinux policy])],
+	[],
+	[with_selinux=no])
+AM_CONDITIONAL(WITH_SELINUX, test "$with_selinux" = "yes")
+if test "x$with_selinux" = x"yes"; then
+	AC_CHECK_FILE([/usr/share/selinux/devel/policyhelp], [true], [AC_MSG_FAILURE([No SELinux found])])
+	AC_CHECK_FILE([/usr/share/selinux/devel/include/Makefile], [true],  [AC_MSG_FAILURE([No SELinux Makefile found])])
+fi
+
 # radosstriper?
 AC_ARG_WITH([radosstriper],
 	[AS_HELP_STRING([--with-radosstriper], [build radosstriper files])],
@@ -180,8 +200,6 @@ AS_IF([test "$enable_server" = "yes" -a \( "$with_osd" = "yes" -o "$with_mon" =
 # cond-check leveldb, necessary if server, osd or mon enabled
 AS_IF([test "$enable_server" = "yes" -a \( "$with_osd" = "yes" -o "$with_mon" = "yes" \)],
 	[AC_CHECK_LIB([leveldb], [leveldb_open], [true], [AC_MSG_FAILURE([libleveldb not found])], [-lsnappy -lpthread])])
-###### PATCH ENDS HERE ######
-
 
 # Check for yasm
 AC_CHECK_PROG(YASM_CHECK, yasm, yes)
@@ -200,7 +218,7 @@ if test x"$YASM_CHECK" = x"yes"; then
            AC_DEFINE([HAVE_GOOD_YASM_ELF64], [1], [we have a recent yasm and are x86_64])
            with_good_yasm=yes
   
-           if yasm -f elf64 -i src/ceph/src/ceph/src/erasure-code/isa/isa-l/include/ src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s -o /dev/null 2> /dev/null ; then
+           if yasm -f elf64 -i src/erasure-code/isa/isa-l/include/ src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s -o /dev/null 2> /dev/null ; then
               echo 'yasm can also build the isa-l stuff'
               AC_DEFINE([HAVE_BETTER_YASM_ELF64], [1], [yasm can also build the isa-l])
   	    with_better_yasm=yes
@@ -255,21 +273,37 @@ AX_C_PRETTY_FUNC
 # Checks for libraries.
 ACX_PTHREAD
 
-AC_CHECK_LIB([uuid], [uuid_parse], [true], AC_MSG_FAILURE([libuuid not found]))
 
-# rbd {map,unmap,showmapped} dependencies, Linux only
-if test x"$linux" = x"yes" -a x"$with_rbd" = x"yes"; then
+#Linux only dependencies
+if test x"$linux" = x"yes"; then
   # libblkid
   AC_CHECK_HEADER([blkid/blkid.h], [],
     AC_MSG_ERROR([blkid/blkid.h not found (libblkid-dev, libblkid-devel)]))
-  AC_CHECK_LIB([blkid], [blkid_devno_to_wholedisk], [true],
+  AC_CHECK_LIB([blkid], [blkid_get_cache], [true],
+    AC_MSG_FAILURE([libblkid not found]))
+  AC_CHECK_LIB([blkid], [blkid_find_dev_with_tag], [true],
+    AC_MSG_FAILURE([libblkid not found]))
+  AC_CHECK_LIB([blkid], [blkid_dev_devname], [true],
     AC_MSG_FAILURE([libblkid not found]))
 
-  # libudev
-  AC_CHECK_HEADER([libudev.h], [],
-    AC_MSG_ERROR([libudev.h not found (libudev-dev, libudev-devel)]))
-  AC_CHECK_LIB([udev], [udev_monitor_receive_device], [true],
-    AC_MSG_FAILURE([libudev not found]))
+  # rbd {map,unmap,showmapped} dependencies, Linux only
+  if test x"$with_rbd" = x"yes"; then
+    # libblkid
+    AC_CHECK_LIB([blkid], [blkid_devno_to_wholedisk], [true],
+      AC_MSG_FAILURE([libblkid not found]))
+
+    # libudev
+    AC_CHECK_HEADER([libudev.h], [],
+      AC_MSG_ERROR([libudev.h not found (libudev-dev, libudev-devel)]))
+    AC_CHECK_LIB([udev], [udev_monitor_receive_device], [true],
+      AC_MSG_FAILURE([libudev not found]))
+
+    # libexpat
+    AC_CHECK_HEADER([expat.h], [],
+      AC_MSG_ERROR([expat.h not found (libexpat-devel)]))
+    AC_CHECK_LIB([expat], [XML_Parse], [true],
+      AC_MSG_FAILURE([libexpat not found]))
+  fi
 fi
 
 #
@@ -377,11 +411,11 @@ else
     AC_MSG_FAILURE([no suitable crypto library found])
 fi
 
-AC_ARG_ENABLE([root-make-check],
-            [AS_HELP_STRING([--enable-root-make-check], [enable make check tests that require root privileges])],
-            [],
-            [enable_root_make_check=no])
-AM_CONDITIONAL(ENABLE_ROOT_MAKE_CHECK, test "x$enable_root_make_check" != xno)
+AC_ARG_ENABLE(gitversion,
+    [AC_HELP_STRING([--enable-gitversion], [build Ceph with git version string])],
+    [], [enable_gitversion=yes])
+
+AM_CONDITIONAL(NO_GIT_VERSION, [test "x$enable_gitversion" = "xno"])
 
 # profiler?
 AC_ARG_WITH([profiler],
@@ -409,7 +443,7 @@ AS_IF([test "$with_profiler" = "yes"],
 
 # debug crap?
 AC_ARG_WITH([debug],
-            [AS_HELP_STRING([--with-debug], [build extra debug binaries])],
+            [AS_HELP_STRING([--with-debug], [build extra debug binaries and tests])],
             [case "${withval}" in
 		  yes) with_debug=yes ;;
 		  no)  with_debug=no ;;
@@ -467,20 +501,11 @@ AC_ARG_WITH([fuse],
             [],
             [with_fuse=yes])
 LIBFUSE=
-AS_IF([test "x$with_fuse" != xno],
-	    [AC_CHECK_LIB([fuse], [fuse_main],
-             [AC_SUBST([LIBFUSE], ["-lfuse"])
-               AC_DEFINE([HAVE_LIBFUSE], [1],
-                         [Define if you have fuse])
-               HAVE_LIBFUSE=1
-	       # look for fuse_getgroups and define FUSE_GETGROUPS if found
-           LIBS_saved="$LIBS"
-           LIBS="$LIBS -lfuse"
-	       AC_CHECK_FUNCS([fuse_getgroups])
-           LIBS="$LIBS_saved"
-              ],
-             [AC_MSG_FAILURE(
-                   [no FUSE found (use --without-fuse to disable)])])])
+AS_IF([test "x$with_fuse" != xno], [
+  PKG_CHECK_MODULES([LIBFUSE], [fuse],
+    [HAVE_LIBFUSE=1],
+    [AC_MSG_FAILURE([no FUSE found (use --without-fuse to disable)])])
+])
 AM_CONDITIONAL(WITH_FUSE, [test "$HAVE_LIBFUSE" = "1"])
 
 # jemalloc?
@@ -551,8 +576,10 @@ fi
 #set pg ref debugging?
 AC_ARG_ENABLE([pgrefdebugging],
 	    [AS_HELP_STRING([--enable-pgrefdebugging], [enable pg ref debugging])],
-			[AC_DEFINE([PG_DEBUG_REFS], [1], [Defined if you want pg ref debugging])],
-			[])
+			[], [enable_pgrefdebugging=no])
+AS_IF([test "x$enable_pgrefdebugging" = "xyes"],
+	  [AC_DEFINE([PG_DEBUG_REFS], [1], [Defined if you want pg ref debugging])],
+	  [])
 
 #
 # Java is painful
@@ -604,9 +631,9 @@ if test "x$enable_cephfs_java" = "xyes"; then
         # the search path.
         AS_IF([test "x$with_debug" = "xyes"], [
         	dir='/usr/share/java'
-	        junit4_jar=`find $dir -name junit4.jar | head -n 1`
+	        junit4_jar=`( find $dir -name junit4.jar;find $dir -name junit.jar ) | head -n 1`
 		AS_IF([test -r "$junit4_jar"], [
-		      EXTRA_CLASSPATH_JAR=`dirname $junit4_jar`/junit4.jar
+		      EXTRA_CLASSPATH_JAR="$junit4_jar"
 		      AC_SUBST(EXTRA_CLASSPATH_JAR)
 		      [have_junit4=1]], [
 		      AC_MSG_NOTICE([Cannot find junit4.jar (apt-get install junit4)])
@@ -723,6 +750,7 @@ fi
 # Find supported SIMD / NEON / SSE extensions supported by the compiler
 AX_ARM_FEATURES()
 AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"])
+AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"])
 AX_INTEL_FEATURES()
 AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"])
 AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = "xyes"])
@@ -776,23 +804,11 @@ if test "x$with_jemalloc" = "xyes"; then
 	fi
 fi
 
-# use system libs3?
-AC_ARG_WITH([system-libs3],
-	[AS_HELP_STRING([--with-system-libs3], [use system libs3])],
-	,
-	[with_system_libs3=no])
-AS_IF([test "x$with_system_libs3" = xyes],
-            [AC_CHECK_LIB([s3], [S3_initialize], [true], [AC_MSG_FAILURE([libs3 not found])], [-lpthread])])
-AS_IF([test "x$with_system_libs3" = xcheck],
-            [AC_SEARCH_LIBS([S3_initialize], [s3], [with_system_libs3=yes], [true], [-lpthread])])
-AM_CONDITIONAL(WITH_SYSTEM_LIBS3, [ test "$with_system_libs3" = "yes" ])
-
-# rest-bench?
-AC_ARG_WITH([rest-bench],
-	    [AS_HELP_STRING([--with-rest-bench], [enables rest-bench])],
-	    [],
-	    [with_rest_bench=no])
-AM_CONDITIONAL(WITH_REST_BENCH, [ test "$with_rest_bench" = "yes" ])
+# needs libcurl and libxml2
+if test "x$with_rest_bench" = xyes && test "x$with_system_libs3" = xno; then
+   AC_CHECK_LIB([curl], [curl_easy_init], [], AC_MSG_ERROR([libcurl not found]))
+   AC_CHECK_LIB([xml2], [xmlParseChunk], [], AC_MSG_ERROR([libxml2 not found]))
+fi
 
 # use libaio?
 AC_ARG_WITH([libaio],
@@ -883,6 +899,17 @@ BOOST_THREAD_LIBS="${LIBS}"
 LIBS="${saved_LIBS}"
 AC_SUBST(BOOST_THREAD_LIBS)
 
+# boost-random
+BOOST_RANDOM_LIBS=""
+saved_LIBS="${LIBS}"
+LIBS=""
+AC_CHECK_LIB(boost_random-mt, main, [],
+    [AC_CHECK_LIB(boost_random, main, [],
+        AC_MSG_FAILURE(["Boost random library not found."]))])
+BOOST_RANDOM_LIBS="${LIBS}"
+LIBS="${saved_LIBS}"
+AC_SUBST(BOOST_RANDOM_LIBS)
+
 #
 # Check for boost_program_options library (defines BOOST_PROGRAM_OPTIONS_LIBS).
 #
@@ -922,6 +949,11 @@ AC_CHECK_HEADERS([ \
 	utime.h \
 ])
 
+# name_to_handle_at
+AC_CHECK_FUNC([name_to_handle_at],
+	[AC_DEFINE([HAVE_NAME_TO_HANDLE_AT], [], [name_to_handle_at exists])],
+	[])
+
 # sync_file_range
 AC_CHECK_FUNC([sync_file_range],
 	[AC_DEFINE([HAVE_SYNC_FILE_RANGE], [], [sync_file_range(2) is supported])],
@@ -932,6 +964,9 @@ AC_CHECK_FUNC([fallocate],
 	[AC_DEFINE([CEPH_HAVE_FALLOCATE], [], [fallocate(2) is supported])],
 	[])
 
+# getgrouplist
+AC_CHECK_FUNCS([getgrouplist])
+
 #
 # Test for time-related `struct stat` members.
 #
@@ -988,7 +1023,7 @@ AC_MSG_RESULT([no])
 ])
 
 AC_MSG_CHECKING([for sched.h])
-+AC_LANG_PUSH([C++])
+AC_LANG_PUSH([C++])
 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 #define _GNU_SOURCE
 #include <sched.h>
@@ -1005,7 +1040,7 @@ AC_DEFINE([HAVE_SCHED], 1, [Define to 1 if you have sched.h.])
 ], [
 AC_MSG_RESULT([no])
 ])
-+AC_LANG_POP([C++])
+AC_LANG_POP([C++])
 
 
 #
@@ -1131,6 +1166,116 @@ if test "x$enable_valgrind" = "xyes"; then
   AC_CHECK_HEADERS([valgrind/helgrind.h])
 fi
 
+dnl systemd-libexec-dir
+AC_SUBST(systemd_libexec_dir)
+AC_ARG_WITH(
+    systemd-libexec-dir,
+    AS_HELP_STRING(
+	    [--with-systemd-libexec-dir=DIR],
+	    [systemd libexec directory @<:@SYSTEMD_LIBEXEC_DIR@:>@
+        defaults to --libexecdir=DIR]
+    ),
+    [
+	    systemd_libexec_dir="$withval"
+    ],
+    [
+        if test "x$SYSTEMD_LIBEXEC_DIR" = "x"; then
+            dnl store old values
+
+            prefix_save=$prefix
+            exec_prefix_save=$exec_prefix
+
+            dnl if no prefix given, then use /usr/local, the default prefix
+            if test "x$prefix" = "xNONE"; then
+                prefix="$ac_default_prefix"
+            fi
+            dnl if no exec_prefix given, then use prefix
+            if test "x$exec_prefix" = "xNONE"; then
+                exec_prefix=$prefix
+            fi
+
+            dnl now get the expanded default
+            systemd_libexec_dir="`eval exec_prefix=$exec_prefix prefix=$prefix echo $libexecdir`"
+
+            dnl now cleanup prefix and exec_prefix
+
+            prefix=$prefix_save
+            exec_prefix=$exec_prefix_save
+        else
+            systemd_libexec_dir="$SYSTEMD_LIBEXEC_DIR"
+        fi
+    ]
+)
+
+
+dnl rgw-user
+AC_SUBST(user_rgw)
+AC_ARG_WITH(
+    rgw-user,
+    AS_HELP_STRING(
+        [--with-rgw-user=USER],
+        [systemd unit directory @<:@USER_RGW@:>@
+        Defaults to "www-data"]
+    ),
+    [
+        user_rgw="$withval"
+    ],
+    [
+        if test "x$USER_RGW" = "x"; then
+            user_rgw=www-data
+        else
+            user_rgw="$USER_RGW"
+        fi
+    ]
+)
+
+dnl rgw-group
+AC_SUBST(group_rgw)
+AC_ARG_WITH(
+    rgw-group,
+    AS_HELP_STRING(
+        [--with-rgw-group=GROUP],
+        [systemd unit directory @<:@GROUP_RGW@:>@
+        Defaults to "www-data"]
+    ),
+    [
+        group_rgw="$withval"
+    ],
+    [
+        if test "x$GROUP_RGW" = "x"; then
+            group_rgw=www-data
+        else
+            group_rgw="$GROUP_RGW"
+        fi
+    ]
+)
+
+
+AC_SUBST(systemd_unit_dir)
+AC_ARG_WITH(
+    systemd-unit-dir,
+    AS_HELP_STRING(
+	    [--with-systemdsystemunitdir=DIR],
+	    [systemd unit directory @<:@SYSTEMD_UNIT_DIR@:>@
+        Defaults to the correct value for debian /etc/systemd/system/]
+    ),
+    [
+	    systemd_unit_dir="$withval"
+    ],
+    [
+        # default to the systemd admin unit directory
+        which pkg-config
+        pkg_config_exists=$?
+        if test x"$pkg_config_exists" = x"0"; then
+            systemd_unit_dir=`pkg-config systemd --variable=systemdsystemunitdir`
+        else
+            systemd_unit_dir="/etc/systemd/system/"
+        fi
+    ]
+)
+
+
+
 
 # Checks for typedefs, structures, and compiler characteristics.
 #AC_HEADER_STDBOOL
@@ -1191,7 +1336,9 @@ AC_CONFIG_FILES([Makefile
 	src/ocf/ceph
 	src/ocf/rbd
 	src/java/Makefile
-	src/tracing/Makefile
+	systemd/Makefile
 	man/Makefile
+	doc/Makefile
+	selinux/Makefile
 	ceph.spec])
 AC_OUTPUT
diff --git a/doc/Makefile.am b/doc/Makefile.am
new file mode 100644
index 0000000..344bd89
--- /dev/null
+++ b/doc/Makefile.am
@@ -0,0 +1,33 @@
+EXTRA_DIST = \
+	man/8/ceph-authtool.rst	\
+	man/8/ceph-clsinfo.rst	\
+	man/8/ceph-conf.rst	\
+	man/8/ceph-create-keys.rst	\
+	man/8/ceph-debugpack.rst	\
+	man/8/ceph-dencoder.rst	\
+	man/8/ceph-deploy.rst	\
+	man/8/ceph-disk.rst	\
+	man/8/cephfs.rst	\
+	man/8/ceph-fuse.rst	\
+	man/8/ceph-mds.rst	\
+	man/8/ceph-mon.rst	\
+	man/8/ceph-osd.rst	\
+	man/8/ceph-post-file.rst	\
+	man/8/ceph-rbdnamer.rst	\
+	man/8/ceph-rest-api.rst	\
+	man/8/ceph.rst	\
+	man/8/ceph-run.rst	\
+	man/8/ceph-syn.rst	\
+	man/8/crushtool.rst	\
+	man/8/librados-config.rst	\
+	man/8/monmaptool.rst	\
+	man/8/mount.ceph.rst	\
+	man/8/osdmaptool.rst	\
+	man/8/radosgw-admin.rst	\
+	man/8/radosgw.rst	\
+	man/8/rados.rst	\
+	man/8/rbd-fuse.rst	\
+	man/8/rbd-replay-many.rst	\
+	man/8/rbd-replay-prep.rst	\
+	man/8/rbd-replay.rst	\
+	man/8/rbd.rst
diff --git a/doc/Makefile.in b/doc/Makefile.in
new file mode 100644
index 0000000..b8b4876
--- /dev/null
+++ b/doc/Makefile.in
@@ -0,0 +1,551 @@
+# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+ at SET_MAKE@
+VPATH = @srcdir@
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = doc
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
+	$(top_srcdir)/m4/ac_prog_jar.m4 \
+	$(top_srcdir)/m4/ac_prog_javac.m4 \
+	$(top_srcdir)/m4/ac_prog_javac_works.m4 \
+	$(top_srcdir)/m4/ac_prog_javah.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 $(top_srcdir)/m4/ax_arm.m4 \
+	$(top_srcdir)/m4/ax_c_pretty_func.m4 \
+	$(top_srcdir)/m4/ax_c_var_func.m4 \
+	$(top_srcdir)/m4/ax_check_compile_flag.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
+	$(top_srcdir)/m4/ax_cxx_static_cast.m4 \
+	$(top_srcdir)/m4/ax_intel.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/src/acconfig.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+AM_V_P = $(am__v_P_ at AM_V@)
+am__v_P_ = $(am__v_P_ at AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_ at AM_V@)
+am__v_GEN_ = $(am__v_GEN_ at AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_ at AM_V@)
+am__v_at_ = $(am__v_at_ at AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_CXXFLAGS = @AM_CXXFLAGS@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
+ARM_FLAGS = @ARM_FLAGS@
+ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CRYPTOPP_CFLAGS = @CRYPTOPP_CFLAGS@
+CRYPTOPP_LIBS = @CRYPTOPP_LIBS@
+CRYPTO_CFLAGS = @CRYPTO_CFLAGS@
+CRYPTO_LIBS = @CRYPTO_LIBS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+EXTRA_CLASSPATH_JAR = @EXTRA_CLASSPATH_JAR@
+FGREP = @FGREP@
+GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
+GIT_CHECK = @GIT_CHECK@
+GREP = @GREP@
+HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+INTEL_FLAGS = @INTEL_FLAGS@
+INTEL_PCLMUL_FLAGS = @INTEL_PCLMUL_FLAGS@
+INTEL_SSE2_FLAGS = @INTEL_SSE2_FLAGS@
+INTEL_SSE3_FLAGS = @INTEL_SSE3_FLAGS@
+INTEL_SSE4_1_FLAGS = @INTEL_SSE4_1_FLAGS@
+INTEL_SSE4_2_FLAGS = @INTEL_SSE4_2_FLAGS@
+INTEL_SSE_FLAGS = @INTEL_SSE_FLAGS@
+INTEL_SSSE3_FLAGS = @INTEL_SSSE3_FLAGS@
+JAR = @JAR@
+JAVAC = @JAVAC@
+JAVAH = @JAVAH@
+JDK_CPPFLAGS = @JDK_CPPFLAGS@
+KEYUTILS_LIB = @KEYUTILS_LIB@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
+LIBEDIT_LIBS = @LIBEDIT_LIBS@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
+LIBJEMALLOC = @LIBJEMALLOC@
+LIBOBJS = @LIBOBJS@
+LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
+LIBROCKSDB_LIBS = @LIBROCKSDB_LIBS@
+LIBS = @LIBS@
+LIBTCMALLOC = @LIBTCMALLOC@
+LIBTOOL = @LIBTOOL@
+LIBZFS_CFLAGS = @LIBZFS_CFLAGS@
+LIBZFS_LIBS = @LIBZFS_LIBS@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LTTNG_GEN_TP_CHECK = @LTTNG_GEN_TP_CHECK@
+LTTNG_GEN_TP_PROG = @LTTNG_GEN_TP_PROG@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+NSS_CFLAGS = @NSS_CFLAGS@
+NSS_LIBS = @NSS_LIBS@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PKG_CONFIG = @PKG_CONFIG@
+PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
+PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+PYTHON = @PYTHON@
+PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@
+PYTHON_PLATFORM = @PYTHON_PLATFORM@
+PYTHON_PREFIX = @PYTHON_PREFIX@
+PYTHON_VERSION = @PYTHON_VERSION@
+RANLIB = @RANLIB@
+RESOLV_LIBS = @RESOLV_LIBS@
+RPM_RELEASE = @RPM_RELEASE@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
+STRIP = @STRIP@
+VERSION = @VERSION@
+WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
+WARN_IGNORED_QUALIFIERS = @WARN_IGNORED_QUALIFIERS@
+WARN_TYPE_LIMITS = @WARN_TYPE_LIMITS@
+XIO_LIBS = @XIO_LIBS@
+YASM_CHECK = @YASM_CHECK@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+pkgpyexecdir = @pkgpyexecdir@
+pkgpythondir = @pkgpythondir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+pyexecdir = @pyexecdir@
+pythondir = @pythondir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+subdirs = @subdirs@
+sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
+EXTRA_DIST = \
+	man/8/ceph-authtool.rst	\
+	man/8/ceph-clsinfo.rst	\
+	man/8/ceph-conf.rst	\
+	man/8/ceph-create-keys.rst	\
+	man/8/ceph-debugpack.rst	\
+	man/8/ceph-dencoder.rst	\
+	man/8/ceph-deploy.rst	\
+	man/8/ceph-disk.rst	\
+	man/8/cephfs.rst	\
+	man/8/ceph-fuse.rst	\
+	man/8/ceph-mds.rst	\
+	man/8/ceph-mon.rst	\
+	man/8/ceph-osd.rst	\
+	man/8/ceph-post-file.rst	\
+	man/8/ceph-rbdnamer.rst	\
+	man/8/ceph-rest-api.rst	\
+	man/8/ceph.rst	\
+	man/8/ceph-run.rst	\
+	man/8/ceph-syn.rst	\
+	man/8/crushtool.rst	\
+	man/8/librados-config.rst	\
+	man/8/monmaptool.rst	\
+	man/8/mount.ceph.rst	\
+	man/8/osdmaptool.rst	\
+	man/8/radosgw-admin.rst	\
+	man/8/radosgw.rst	\
+	man/8/rados.rst	\
+	man/8/rbd-fuse.rst	\
+	man/8/rbd-replay-many.rst	\
+	man/8/rbd-replay-prep.rst	\
+	man/8/rbd-replay.rst	\
+	man/8/rbd.rst
+
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign doc/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign doc/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure:  $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags TAGS:
+
+ctags CTAGS:
+
+cscope cscopelist:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	cscopelist-am ctags-am distclean distclean-generic \
+	distclean-libtool distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags-am uninstall uninstall-am
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/doc/man/8/ceph-authtool.rst b/doc/man/8/ceph-authtool.rst
new file mode 100644
index 0000000..523d14d
--- /dev/null
+++ b/doc/man/8/ceph-authtool.rst
@@ -0,0 +1,178 @@
+:orphan:
+
+=================================================
+ ceph-authtool -- ceph keyring manipulation tool
+=================================================
+
+.. program:: ceph-authtool
+
+Synopsis
+========
+
+| **ceph-authtool** *keyringfile* [ -l | --list ] [ -C | --create-keyring
+  ] [ -p | --print ] [ -n | --name *entityname* ] [ --gen-key ] [ -a |
+  --add-key *base64_key* ] [ --caps *capfile* ]
+
+
+Description
+===========
+
+**ceph-authtool** is a utility to create, view, and modify a Ceph keyring
+file. A keyring file stores one or more Ceph authentication keys and
+possibly an associated capability specification. Each key is
+associated with an entity name, of the form
+``{client,mon,mds,osd}.name``.
+
+**WARNING** Ceph provides authentication and protection against
+man-in-the-middle attacks once secret keys are in place.  However,
+data over the wire is not encrypted, which may include the messages
+used to configure said keys.  The system is primarily intended to be
+used in trusted environments.
+
+Options
+=======
+
+.. option:: -l, --list
+
+   will list all keys and capabilities present in the keyring
+
+.. option:: -p, --print
+
+   will print an encoded key for the specified entityname. This is
+   suitable for the ``mount -o secret=`` argument
+
+.. option:: -C, --create-keyring
+
+   will create a new keyring, overwriting any existing keyringfile
+
+.. option:: --gen-key
+
+   will generate a new secret key for the specified entityname
+
+.. option:: --add-key
+
+   will add an encoded key to the keyring
+
+.. option:: --cap subsystem capability
+
+   will set the capability for given subsystem
+
+.. option:: --caps capsfile
+
+   will set all of capabilities associated with a given key, for all subsystems
+
+
+Capabilities
+============
+
+The subsystem is the name of a Ceph subsystem: ``mon``, ``mds``, or
+``osd``.
+
+The capability is a string describing what the given user is allowed
+to do. This takes the form of a comma separated list of allow
+clauses with a permission specifier containing one or more of rwx for
+read, write, and execute permission. The ``allow *`` grants full
+superuser permissions for the given subsystem.
+
+For example::
+
+	# can read, write, and execute objects
+        osd = "allow rwx"
+
+	# can access mds server
+        mds = "allow"
+
+	# can modify cluster state (i.e., is a server daemon)
+        mon = "allow rwx"
+
+A librados user restricted to a single pool might look like::
+
+        mon = "allow r"
+
+        osd = "allow rw pool foo"
+
+A client using rbd with read access to one pool and read/write access to another::
+
+        mon = "allow r"
+
+        osd = "allow class-read object_prefix rbd_children, allow pool templates r class-read, allow pool vms rwx"
+
+A client mounting the file system with minimal permissions would need caps like::
+
+        mds = "allow"
+
+        osd = "allow rw pool data"
+
+        mon = "allow r"
+
+
+OSD Capabilities
+================
+
+In general, an osd capability follows the grammar::
+
+        osdcap  := grant[,grant...]
+        grant   := allow (match capspec | capspec match)
+        match   := [pool[=]<poolname> | object_prefix <prefix>]
+        capspec := * | [r][w][x] [class-read] [class-write]
+
+The capspec determines what kind of operations the entity can perform::
+
+    r           = read access to objects
+    w           = write access to objects
+    x           = can call any class method (same as class-read class-write)
+    class-read  = can call class methods that are reads
+    class-write = can call class methods that are writes
+    *           = equivalent to rwx, plus the ability to run osd admin commands,
+                  i.e. ceph osd tell ...
+
+The match criteria restrict a grant based on the pool being accessed.
+Grants are additive if the client fulfills the match condition. For
+example, if a client has the osd capabilities: "allow r object_prefix
+prefix, allow w pool foo, allow x pool bar", then it has rw access to
+pool foo, rx access to pool bar, and r access to objects whose
+names begin with 'prefix' in any pool.
+
+Caps file format
+================
+
+The caps file format consists of zero or more key/value pairs, one per
+line. The key and value are separated by an ``=``, and the value must
+be quoted (with ``'`` or ``"``) if it contains any whitespace. The key
+is the name of the Ceph subsystem (``osd``, ``mds``, ``mon``), and the
+value is the capability string (see above).
+
+
+Example
+=======
+
+To create a new keyring containing a key for client.foo::
+
+        ceph-authtool -C -n client.foo --gen-key keyring
+
+To associate some capabilities with the key (namely, the ability to
+mount a Ceph filesystem)::
+
+        ceph-authtool -n client.foo --cap mds 'allow' --cap osd 'allow rw pool=data' --cap mon 'allow r' keyring
+
+To display the contents of the keyring::
+
+        ceph-authtool -l keyring
+
+When mounting a Ceph file system, you can grab the appropriately encoded secret key with::
+
+        mount -t ceph serverhost:/ mountpoint -o name=foo,secret=`ceph-authtool -p -n client.foo keyring`
+
+
+Availability
+============
+
+**ceph-authtool** is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/ceph-clsinfo.rst b/doc/man/8/ceph-clsinfo.rst
new file mode 100644
index 0000000..0188ce1
--- /dev/null
+++ b/doc/man/8/ceph-clsinfo.rst
@@ -0,0 +1,49 @@
+:orphan:
+
+===============================================
+ ceph-clsinfo -- show class object information
+===============================================
+
+.. program:: ceph-clsinfo
+
+Synopsis
+========
+
+| **ceph-clsinfo** [ *options* ] ... *filename*
+
+
+Description
+===========
+
+**ceph-clsinfo** can show name, version, and architecture information
+about a specific class object.
+
+
+Options
+=======
+
+.. option:: -n, --name
+
+   Shows the class name
+
+.. option:: -v, --version
+
+   Shows the class version
+
+.. option:: -a, --arch
+
+   Shows the class architecture
+
+
+Availability
+============
+
+**ceph-clsinfo** is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/ceph-conf.rst b/doc/man/8/ceph-conf.rst
new file mode 100644
index 0000000..9782e38
--- /dev/null
+++ b/doc/man/8/ceph-conf.rst
@@ -0,0 +1,129 @@
+:orphan:
+
+==================================
+ ceph-conf -- ceph conf file tool
+==================================
+
+.. program:: ceph-conf
+
+Synopsis
+========
+
+| **ceph-conf** -c *conffile* --list-all-sections
+| **ceph-conf** -c *conffile* -L
+| **ceph-conf** -c *conffile* -l *prefix*
+| **ceph-conf** *key* -s *section1* ...
+| **ceph-conf** [-s *section* ] [-r] --lookup *key*
+| **ceph-conf** [-s *section* ] *key*
+
+
+Description
+===========
+
+**ceph-conf** is a utility for getting information about a ceph
+configuration file. As with most Ceph programs, you can specify which
+Ceph configuration file to use with the ``-c`` flag.
+
+
+Actions
+=======
+
+**ceph-conf** performs one of the following actions:
+
+.. option:: -L, --list-all-sections
+
+   list all sections in the configuration file.
+
+.. option:: -l, --list-sections *prefix*
+
+   list the sections with the given *prefix*. For example, ``--list-sections mon``
+   would list all sections beginning with ``mon``.
+
+.. option:: --lookup *key*
+
+   search and print the specified configuration setting. Note:  ``--lookup`` is
+   the default action. If no other actions are given on the command line, we will
+   default to doing a lookup.
+
+.. option:: -h, --help
+
+   print a summary of usage.
+
+
+Options
+=======
+
+.. option:: -c *conffile*
+
+   the Ceph configuration file.
+
+.. option:: --filter-key *key*
+
+   filter section list to only include sections with given *key* defined.
+
+.. option:: --filter-key-value *key* ``=`` *value*
+
+   filter section list to only include sections with given *key*/*value* pair.
+
+.. option:: --name *type.id*
+
+   the Ceph name in which the sections are searched (default 'client.admin').
+   For example, if we specify ``--name osd.0``, the following sections will be
+   searched: [osd.0], [osd], [global]
+
+.. option:: -r, --resolve-search
+
+   search for the first file that exists and can be opened in the resulted
+   comma delimited search list.
+
+.. option:: -s, --section
+
+   additional sections to search.  These additional sections will be searched
+   before the sections that would normally be searched. As always, the first
+   matching entry we find will be returned.
+
+
+Examples
+========
+
+To find out what value osd 0 will use for the "osd data" option::
+
+        ceph-conf -c foo.conf  --name osd.0 --lookup "osd data"
+
+To find out what value mds a will use for the "log file" option::
+
+        ceph-conf -c foo.conf  --name mds.a "log file"
+
+To list all sections that begin with "osd"::
+
+        ceph-conf -c foo.conf -l osd
+
+To list all sections::
+
+        ceph-conf -c foo.conf -L
+
+To print the path of the "keyring" used by "client.0"::
+
+       ceph-conf --name client.0 -r -l keyring
+
+
+Files
+=====
+
+``/etc/ceph/$cluster.conf``, ``~/.ceph/$cluster.conf``, ``$cluster.conf``
+
+the Ceph configuration files to use if not specified.
+
+
+Availability
+============
+
+**ceph-conf** is part of Ceph, a massively scalable, open-source, distributed storage system.  Please refer
+to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
diff --git a/doc/man/8/ceph-create-keys.rst b/doc/man/8/ceph-create-keys.rst
new file mode 100644
index 0000000..8d1dc91
--- /dev/null
+++ b/doc/man/8/ceph-create-keys.rst
@@ -0,0 +1,63 @@
+:orphan:
+
+===============================================
+ceph-create-keys -- ceph keyring generate tool
+===============================================
+
+.. program:: ceph-create-keys
+
+Synopsis
+========
+
+| **ceph-create-keys** [-h] [-v] [--cluster *name*] --id *id*
+
+
+Description
+===========
+
+:program:`ceph-create-keys` is a utility to generate bootstrap keyrings using
+the given monitor when it is ready.
+
+It creates the following auth entities (or users):
+
+``client.admin``
+
+    and its key for your client host.
+
+``client.bootstrap-{osd, rgw, mds}``
+
+    and their keys for bootstrapping corresponding services
+
+To list all users in the cluster::
+
+    ceph auth list
+
+
+Options
+=======
+
+.. option:: --cluster
+
+   name of the cluster (default 'ceph').
+
+.. option:: -i, --id
+
+   id of a ceph-mon that is coming up. **ceph-create-keys** will wait until it joins quorum.
+
+.. option:: -v, --verbose
+
+   be more verbose.
+
+
+Availability
+============
+
+**ceph-create-keys** is part of Ceph, a massively scalable, open-source, distributed storage system.  Please refer
+to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/ceph-debugpack.rst b/doc/man/8/ceph-debugpack.rst
new file mode 100644
index 0000000..4f2c4f2
--- /dev/null
+++ b/doc/man/8/ceph-debugpack.rst
@@ -0,0 +1,50 @@
+:orphan:
+
+=============================================
+ ceph-debugpack -- ceph debug packer utility
+=============================================
+
+.. program:: ceph-debugpack
+
+Synopsis
+========
+
+| **ceph-debugpack** [ *options* ] *filename.tar.gz*
+
+
+Description
+===========
+
+**ceph-debugpack** will build a tarball containing various items that are
+useful for debugging crashes. The resulting tarball can be shared with
+Ceph developers when debugging a problem.
+
+The tarball will include the binaries for ceph-mds, ceph-osd, and ceph-mon, radosgw, any
+log files, the ceph.conf configuration file, any core files we can
+find, and (if the system is running) dumps of the current cluster state
+as reported by 'ceph report'.
+
+
+Options
+=======
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during
+   startup.
+
+
+Availability
+============
+
+**ceph-debugpack** is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
+:doc:`ceph-post-file <ceph-post-file>`\(8)
diff --git a/doc/man/8/ceph-dencoder.rst b/doc/man/8/ceph-dencoder.rst
new file mode 100644
index 0000000..cf2e429
--- /dev/null
+++ b/doc/man/8/ceph-dencoder.rst
@@ -0,0 +1,151 @@
+:orphan:
+
+==============================================
+ ceph-dencoder -- ceph encoder/decoder utility
+==============================================
+
+.. program:: ceph-dencoder
+
+Synopsis
+========
+
+| **ceph-dencoder** [commands...]
+
+
+Description
+===========
+
+**ceph-dencoder** is a utility to encode, decode, and dump ceph data
+structures.  It is used for debugging and for testing inter-version
+compatibility.
+
+**ceph-dencoder** takes a simple list of commands and performs them
+in order.
+
+Commands
+========
+
+.. option:: version
+
+   Print the version string for the **ceph-dencoder** binary.
+
+.. option:: import <file>
+
+   Read a binary blob of encoded data from the given file.  It will be
+   placed in an in-memory buffer.
+
+.. option:: export <file>
+
+   Write the contents of the current in-memory buffer to the given
+   file.
+
+.. option:: list_types
+
+   List the data types known to this build of **ceph-dencoder**.
+
+.. option:: type <name>
+
+   Select the given type for future ``encode`` or ``decode`` operations.
+
+.. option:: skip <bytes>
+
+   Seek <bytes> into the imported file before reading data structure, use
+   this with objects that have a preamble/header before the object of interest.
+
+.. option:: decode
+
+   Decode the contents of the in-memory buffer into an instance of the
+   previously selected type.  If there is an error, report it.
+
+.. option:: encode
+
+   Encode the contents of the in-memory instance of the previously
+   selected type to the in-memory buffer.
+
+.. option:: dump_json
+
+   Print a JSON-formatted description of the in-memory object.
+
+.. option:: count_tests
+
+   Print the number of built-in test instances of the previously
+   selected type that **ceph-dencoder** is able to generate.
+
+.. option:: select_test <n>
+
+   Select the given built-in test instance as the in-memory instance
+   of the type.
+
+.. option:: get_features
+
+   Print the decimal value of the feature set supported by this version
+   of **ceph-dencoder**.  Each bit represents a feature.  These correspond to
+   CEPH_FEATURE_* defines in src/include/ceph_features.h.
+
+.. option:: set_features <f>
+
+   Set the feature bits provided to ``encode`` to *f*.  This allows
+   you to encode objects such that they can be understood by old
+   versions of the software (for those types that support it).
+
+Example
+=======
+
+Say you want to examine an attribute on an object stored by ``ceph-osd``.  You can do this:
+
+::
+
+    $ cd /mnt/osd.12/current/2.b_head
+    $ attr -l foo_bar_head_EFE6384B
+    Attribute "ceph.snapset" has a 31 byte value for foo_bar_head_EFE6384B
+    Attribute "ceph._" has a 195 byte value for foo_bar_head_EFE6384B
+    $ attr foo_bar_head_EFE6384B -g ceph._ -q > /tmp/a
+    $ ceph-dencoder type object_info_t import /tmp/a decode dump_json
+    { "oid": { "oid": "foo",
+          "key": "bar",
+          "snapid": -2,
+          "hash": 4024842315,
+          "max": 0},
+      "locator": { "pool": 2,
+          "preferred": -1,
+          "key": "bar"},
+      "category": "",
+      "version": "9'1",
+      "prior_version": "0'0",
+      "last_reqid": "client.4116.0:1",
+      "size": 1681,
+      "mtime": "2012-02-21 08:58:23.666639",
+      "lost": 0,
+      "wrlock_by": "unknown.0.0:0",
+      "snaps": [],
+      "truncate_seq": 0,
+      "truncate_size": 0,
+      "watchers": {}}
+
+Alternatively, perhaps you wish to dump an internal CephFS metadata object, you might
+do that like this:
+
+::
+
+   $ rados -p metadata get mds_snaptable mds_snaptable.bin
+   $ ceph-dencoder type SnapServer skip 8 import mds_snaptable.bin decode dump_json
+   { "snapserver": { "last_snap": 1,
+      "pending_noop": [],
+      "snaps": [],
+      "need_to_purge": {},
+      "pending_create": [],
+      "pending_destroy": []}} 
+
+
+Availability
+============
+
+**ceph-dencoder** is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/ceph-deploy.rst b/doc/man/8/ceph-deploy.rst
new file mode 100644
index 0000000..8a04ef3
--- /dev/null
+++ b/doc/man/8/ceph-deploy.rst
@@ -0,0 +1,608 @@
+:orphan:
+
+=====================================
+ ceph-deploy -- Ceph deployment tool
+=====================================
+
+.. program:: ceph-deploy
+
+Synopsis
+========
+
+| **ceph-deploy** **new** [*initial-monitor-node(s)*]
+
+| **ceph-deploy** **install** [*ceph-node*] [*ceph-node*...]
+
+| **ceph-deploy** **mon** *create-initial*
+
+| **ceph-deploy** **osd** *prepare* [*ceph-node*]:[*dir-path*]
+
+| **ceph-deploy** **osd** *activate* [*ceph-node*]:[*dir-path*]
+
+| **ceph-deploy** **osd** *create* [*ceph-node*]:[*dir-path*]
+
+| **ceph-deploy** **admin** [*admin-node*][*ceph-node*...]
+
+| **ceph-deploy** **purgedata** [*ceph-node*][*ceph-node*...]
+
+| **ceph-deploy** **forgetkeys**
+
+Description
+===========
+
+:program:`ceph-deploy` is a tool which allows easy and quick deployment of a
+Ceph cluster without involving complex and detailed manual configuration. It
+uses ssh to gain access to other Ceph nodes from the admin node, sudo for
+administrator privileges on them and the underlying Python scripts automates
+the manual process of Ceph installation on each node from the admin node itself.
+It can be easily run on a workstation and doesn't require servers, databases or
+any other automated tools. With :program:`ceph-deploy`, it is really easy to set
+up and take down a cluster. However, it is not a generic deployment tool. It is
+a specific tool which is designed for those who want to get Ceph up and running
+quickly with only the unavoidable initial configuration settings and without the
+overhead of installing other tools like ``Chef``, ``Puppet`` or ``Juju``. Those
+who want to customize security settings, partitions or directory locations and
+want to set up a cluster following detailed manual steps, should use other tools
+i.e, ``Chef``, ``Puppet``, ``Juju`` or ``Crowbar``.
+
+With :program:`ceph-deploy`, you can install Ceph packages on remote nodes,
+create a cluster, add monitors, gather/forget keys, add OSDs and metadata
+servers, configure admin hosts or take down the cluster.
+
+Commands
+========
+
+new
+---
+
+Start deploying a new cluster and write a configuration file and keyring for it.
+It tries to copy ssh keys from admin node to gain passwordless ssh to monitor
+node(s), validates host IP, creates a cluster with a new initial monitor node or
+nodes for monitor quorum, a ceph configuration file, a monitor secret keyring and
+a log file for the new cluster. It populates the newly created Ceph configuration
+file with ``fsid`` of cluster, hostnames and IP addresses of initial monitor
+members under ``[global]`` section.
+
+Usage::
+
+	ceph-deploy new [MON][MON...]
+
+Here, [MON] is the initial monitor hostname (short hostname i.e, ``hostname -s``).
+
+Other options like :option:`--no-ssh-copykey`, :option:`--fsid`,
+:option:`--cluster-network` and :option:`--public-network` can also be used with
+this command.
+
+If more than one network interface is used, ``public network`` setting has to be
+added under ``[global]`` section of Ceph configuration file. If the public subnet
+is given, ``new`` command will choose the one IP from the remote host that exists
+within the subnet range. Public network can also be added at runtime using
+:option:`--public-network` option with the command as mentioned above.
+
+
+install
+-------
+
+Install Ceph packages on remote hosts. As a first step it installs
+``yum-plugin-priorities`` in admin and other nodes using passwordless ssh and sudo
+so that Ceph packages from upstream repository get more priority. It then detects
+the platform and distribution for the hosts and installs Ceph normally by
+downloading distro compatible packages if adequate repo for Ceph is already added.
+``--release`` flag is used to get the latest release for installation. During
+detection of platform and distribution before installation, if it finds the
+``distro.init`` to be ``sysvinit`` (Fedora, CentOS/RHEL etc), it doesn't allow
+installation with custom cluster name and uses the default name ``ceph`` for the
+cluster.
+
+If the user explicitly specifies a custom repo url with :option:`--repo-url` for
+installation, anything detected from the configuration will be overridden and
+the custom repository location will be used for installation of Ceph packages.
+If required, valid custom repositories are also detected and installed. In case
+of installation from a custom repo a boolean is used to determine the logic
+needed to proceed with a custom repo installation. A custom repo install helper
+is used that goes through config checks to retrieve repos (and any extra repos
+defined) and installs them. ``cd_conf`` is the object built from ``argparse``
+that holds the flags and information needed to determine what metadata from the
+configuration is to be used.
+
+A user can also opt to install only the repository without installing Ceph and
+its dependencies by using :option:`--repo` option.
+
+Usage::
+
+	ceph-deploy install [HOST][HOST...]
+
+Here, [HOST] is/are the host node(s) where Ceph is to be installed.
+
+An option ``--release`` is used to install a release known as CODENAME
+(default: firefly).
+
+Other options like :option:`--testing`, :option:`--dev`, :option:`--adjust-repos`,
+:option:`--no-adjust-repos`, :option:`--repo`, :option:`--local-mirror`,
+:option:`--repo-url` and :option:`--gpg-url` can also be used with this command.
+
+
+mds
+---
+
+Deploy Ceph mds on remote hosts. A metadata server is needed to use CephFS and
+the ``mds`` command is used to create one on the desired host node. It uses the
+subcommand ``create`` to do so. ``create`` first gets the hostname and distro
+information of the desired mds host. It then tries to read the ``bootstrap-mds``
+key for the cluster and deploy it in the desired host. The key generally has a
+format of ``{cluster}.bootstrap-mds.keyring``. If it doesn't find a keyring,
+it runs ``gatherkeys`` to get the keyring. It then creates an mds on the desired
+host under the path ``/var/lib/ceph/mds/`` in ``/var/lib/ceph/mds/{cluster}-{name}``
+format and a bootstrap keyring under ``/var/lib/ceph/bootstrap-mds/`` in
+``/var/lib/ceph/bootstrap-mds/{cluster}.keyring`` format. It then runs appropriate
+commands based on ``distro.init`` to start the ``mds``. To remove the mds,
+subcommand ``destroy`` is used.
+
+Usage::
+
+	ceph-deploy mds create [HOST[:DAEMON-NAME]] [HOST[:DAEMON-NAME]...]
+
+	ceph-deploy mds destroy [HOST[:DAEMON-NAME]] [HOST[:DAEMON-NAME]...]
+
+The [DAEMON-NAME] is optional.
+
+
+mon
+---
+
+Deploy Ceph monitor on remote hosts. ``mon`` makes use of certain subcommands
+to deploy Ceph monitors on other nodes.
+
+Subcommand ``create-initial`` deploys for monitors defined in
+``mon initial members`` under ``[global]`` section in Ceph configuration file,
+wait until they form quorum and then gatherkeys, reporting the monitor status
+along the process. If monitors don't form quorum the command will eventually
+time out.
+
+Usage::
+
+	ceph-deploy mon create-initial
+
+Subcommand ``create`` is used to deploy Ceph monitors by explicitly specifying
+the hosts which are desired to be made monitors. If no hosts are specified it
+will default to use the ``mon initial members`` defined under ``[global]``
+section of Ceph configuration file. ``create`` first detects platform and distro
+for desired hosts and checks if hostname is compatible for deployment. It then
+uses the monitor keyring initially created using ``new`` command and deploys the
+monitor in desired host. If multiple hosts were specified during ``new`` command
+i.e, if there are multiple hosts in ``mon initial members`` and multiple keyrings
+were created then a concatenated keyring is used for deployment of monitors. In
+this process a keyring parser is used which looks for ``[entity]`` sections in
+monitor keyrings and returns a list of those sections. A helper is then used to
+collect all keyrings into a single blob that will be used to inject it to monitors
+with :option:`--mkfs` on remote nodes. All keyring files are concatenated to be
+in a directory ending with ``.keyring``. During this process the helper uses list
+of sections returned by keyring parser to check if an entity is already present
+in a keyring and if not, adds it. The concatenated keyring is used for deployment
+of monitors to desired multiple hosts.
+
+Usage::
+
+	ceph-deploy mon create [HOST] [HOST...]
+
+Here, [HOST] is hostname of desired monitor host(s).
+
+Subcommand ``add`` is used to add a monitor to an existing cluster. It first
+detects platform and distro for desired host and checks if hostname is compatible
+for deployment. It then uses the monitor keyring, ensures configuration for new
+monitor host and adds the monitor to the cluster. If the section for the monitor
+exists and defines a mon addr, that address will be used; otherwise it will fall
+back to resolving the hostname to an IP. If :option:`--address` is used it will override
+all other options. After adding the monitor to the cluster, it gives it some time
+to start. It then looks for any monitor errors and checks monitor status. Monitor
+errors arise if the monitor is not added in ``mon initial members``, if it doesn't
+exist in ``monmap`` and if neither ``public_addr`` nor ``public_network`` keys
+were defined for monitors. Under such conditions, monitors may not be able to
+form quorum. Monitor status tells if the monitor is up and running normally. The
+status is checked by running ``ceph daemon mon.hostname mon_status`` on remote
+end which provides the output and returns a boolean status of what is going on.
+``False`` means a monitor that is not fine even if it is up and running, while
+``True`` means the monitor is up and running correctly.
+
+Usage::
+
+	ceph-deploy mon add [HOST]
+
+	ceph-deploy mon add [HOST] --address [IP]
+
+Here, [HOST] is the hostname and [IP] is the IP address of the desired monitor
+node. Please note, unlike other ``mon`` subcommands, only one node can be
+specified at a time.
+
+Subcommand ``destroy`` is used to completely remove monitors on remote hosts.
+It takes hostnames as arguments. It stops the monitor, verifies if ``ceph-mon``
+daemon really stopped, creates an archive directory ``mon-remove`` under
+``/var/lib/ceph/``, archives old monitor directory in
+``{cluster}-{hostname}-{stamp}`` format in it and removes the monitor from
+cluster by running ``ceph remove...`` command.
+
+Usage::
+
+	ceph-deploy mon destroy [HOST] [HOST...]
+
+Here, [HOST] is hostname of monitor that is to be removed.
+
+
+gatherkeys
+----------
+
+Gather authentication keys for provisioning new nodes. It takes hostnames as
+arguments. It checks for and fetches ``client.admin`` keyring, monitor keyring
+and ``bootstrap-mds/bootstrap-osd`` keyring from monitor host. These
+authentication keys are used when new ``monitors/OSDs/MDS`` are added to the
+cluster.
+
+Usage::
+
+	ceph-deploy gatherkeys [HOST] [HOST...]
+
+Here, [HOST] is hostname of the monitor from where keys are to be pulled.
+
+
+disk
+----
+
+Manage disks on a remote host. It actually triggers the ``ceph-disk`` utility
+and its subcommands to manage disks.
+
+Subcommand ``list`` lists disk partitions and Ceph OSDs.
+
+Usage::
+
+	ceph-deploy disk list [HOST:[DISK]]
+
+Here, [HOST] is hostname of the node and [DISK] is disk name or path.
+
+Subcommand ``prepare`` prepares a directory, disk or drive for a Ceph OSD. It
+creates a GPT partition, marks the partition with Ceph type uuid, creates a
+file system, marks the file system as ready for Ceph consumption, uses entire
+partition and adds a new partition to the journal disk.
+
+Usage::
+
+	ceph-deploy disk prepare [HOST:[DISK]]
+
+Here, [HOST] is hostname of the node and [DISK] is disk name or path.
+
+Subcommand ``activate`` activates the Ceph OSD. It mounts the volume in a
+temporary location, allocates an OSD id (if needed), remounts in the correct
+location ``/var/lib/ceph/osd/$cluster-$id`` and starts ``ceph-osd``. It is
+triggered by ``udev`` when it sees the OSD GPT partition type or on ceph service
+start with ``ceph disk activate-all``.
+
+Usage::
+
+	ceph-deploy disk activate [HOST:[DISK]]
+
+Here, [HOST] is hostname of the node and [DISK] is disk name or path.
+
+Subcommand ``zap`` zaps/erases/destroys a device's partition table and contents.
+It actually uses ``sgdisk`` and its option ``--zap-all`` to destroy both GPT and
+MBR data structures so that the disk becomes suitable for repartitioning.
+``sgdisk`` then uses ``--mbrtogpt`` to convert the MBR or BSD disklabel disk to a
+GPT disk. The ``prepare`` subcommand can now be executed which will create a new
+GPT partition.
+
+Usage::
+
+	ceph-deploy disk zap [HOST:[DISK]]
+
+Here, [HOST] is hostname of the node and [DISK] is disk name or path.
+
+
+osd
+---
+
+Manage OSDs by preparing data disk on remote host. ``osd`` makes use of certain
+subcommands for managing OSDs.
+
+Subcommand ``prepare`` prepares a directory, disk or drive for a Ceph OSD. It
+first checks against multiple OSDs getting created and warns about the
+possibility of more than the recommended which would cause issues with max
+allowed PIDs in a system. It then reads the bootstrap-osd key for the cluster or
+writes the bootstrap key if not found. It then uses :program:`ceph-disk`
+utility's ``prepare`` subcommand to prepare the disk, journal and deploy the OSD
+on the desired host. Once prepared, it gives some time to the OSD to settle and
+checks for any possible errors and if found, reports to the user.
+
+Usage::
+
+	ceph-deploy osd prepare HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+Subcommand ``activate`` activates the OSD prepared using ``prepare`` subcommand.
+It actually uses :program:`ceph-disk` utility's ``activate`` subcommand with
+appropriate init type based on distro to activate the OSD. Once activated, it
+gives some time to the OSD to start and checks for any possible errors and if
+found, reports to the user. It checks the status of the prepared OSD, checks the
+OSD tree and makes sure the OSDs are up and in.
+
+Usage::
+
+	ceph-deploy osd activate HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+Subcommand ``create`` uses ``prepare`` and ``activate`` subcommands to create an
+OSD.
+
+Usage::
+
+	ceph-deploy osd create HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+Subcommand ``list`` lists disk partitions, Ceph OSDs and prints OSD metadata.
+It gets the osd tree from a monitor host, uses the ``ceph-disk-list`` output
+and gets the mount point by matching the line where the partition mentions
+the OSD name, reads metadata from files, checks if a journal path exists,
+if the OSD is in an OSD tree and prints the OSD metadata.
+
+Usage::
+
+	ceph-deploy osd list HOST:DISK[:JOURNAL] [HOST:DISK[:JOURNAL]...]
+
+
+admin
+-----
+
+Push configuration and ``client.admin`` key to a remote host. It takes
+the ``{cluster}.client.admin.keyring`` from admin node and writes it under
+``/etc/ceph`` directory of desired node.
+
+Usage::
+
+	ceph-deploy admin [HOST] [HOST...]
+
+Here, [HOST] is desired host to be configured for Ceph administration.
+
+
+config
+------
+
+Push/pull configuration file to/from a remote host. It uses ``push`` subcommand
+to take the configuration file from admin host and write it to remote host under
+``/etc/ceph`` directory. It uses ``pull`` subcommand to do the opposite, i.e., pull
+the configuration file under ``/etc/ceph`` directory of remote host to admin node.
+
+Usage::
+
+	ceph-deploy config push [HOST] [HOST...]
+
+	ceph-deploy config pull [HOST] [HOST...]
+
+Here, [HOST] is the hostname of the node where config file will be pushed to or
+pulled from.
+
+
+uninstall
+---------
+
+Remove Ceph packages from remote hosts. It detects the platform and distro of
+selected host and uninstalls Ceph packages from it. However, some dependencies
+like ``librbd1`` and ``librados2`` will not be removed because they can cause
+issues with ``qemu-kvm``.
+
+Usage::
+
+	ceph-deploy uninstall [HOST] [HOST...]
+
+Here, [HOST] is hostname of the node from where Ceph will be uninstalled.
+
+
+purge
+-----
+
+Remove Ceph packages from remote hosts and purge all data. It detects the
+platform and distro of selected host, uninstalls Ceph packages and purges all
+data. However, some dependencies like ``librbd1`` and ``librados2`` will not be
+removed because they can cause issues with ``qemu-kvm``.
+
+Usage::
+
+	ceph-deploy purge [HOST] [HOST...]
+
+Here, [HOST] is hostname of the node from where Ceph will be purged.
+
+
+purgedata
+---------
+
+Purge (delete, destroy, discard, shred) any Ceph data from ``/var/lib/ceph``.
+Once it detects the platform and distro of desired host, it first checks if Ceph
+is still installed on the selected host and if installed, it won't purge data
+from it. If Ceph is already uninstalled from the host, it tries to remove the
+contents of ``/var/lib/ceph``. If it fails then probably OSDs are still mounted
+and need to be unmounted to continue. It unmounts the OSDs and tries to remove
+the contents of ``/var/lib/ceph`` again and checks for errors. It also removes
+contents of ``/etc/ceph``. Once all steps are successfully completed, all the
+Ceph data from the selected host are removed.
+
+Usage::
+
+	ceph-deploy purgedata [HOST] [HOST...]
+
+Here, [HOST] is hostname of the node from where Ceph data will be purged.
+
+
+forgetkeys
+----------
+
+Remove authentication keys from the local directory. It removes all the
+authentication keys, i.e., monitor keyring, client.admin keyring, bootstrap-osd
+and bootstrap-mds keyring from the node.
+
+Usage::
+
+	ceph-deploy forgetkeys
+
+
+pkg
+---
+
+Manage packages on remote hosts. It is used for installing or removing packages
+from remote hosts. The package names for installation or removal are to be
+specified after the command. Two options :option:`--install` and
+:option:`--remove` are used for this purpose.
+
+Usage::
+
+	ceph-deploy pkg --install [PKGs] [HOST] [HOST...]
+
+	ceph-deploy pkg --remove [PKGs] [HOST] [HOST...]
+
+Here, [PKGs] is a comma-separated list of package names and [HOST] is the
+hostname of the remote node where packages are to be installed or removed from.
+
+
+calamari
+--------
+
+Install and configure Calamari nodes. It first checks if distro is supported
+for Calamari installation by ceph-deploy. An argument ``connect`` is used for
+installation and configuration. It checks for ``ceph-deploy`` configuration
+file (cd_conf) and Calamari release repo or ``calamari-minion`` repo. It relies
+on default for repo installation as it doesn't install Ceph unless specified
+otherwise. ``options`` dictionary is also defined because ``ceph-deploy``
+pops items internally which causes issues when those items are needed to be
+available for every host. If the distro is Debian/Ubuntu, it is ensured that
+proxy is disabled for ``calamari-minion`` repo. ``calamari-minion`` package is
+then installed and custom repository files are added. The minion config is
+placed prior to installation so that it is present when the minion first
+starts. The config directory and calamari salt config are created, and the
+salt-minion package is installed. If the distro is Redhat/CentOS, the
+salt-minion service needs to
+be started.
+
+Usage::
+
+	ceph-deploy calamari {connect} [HOST] [HOST...]
+
+Here, [HOST] is the hostname where Calamari is to be installed.
+
+An option ``--release`` can be used to use a given release from repositories
+defined in :program:`ceph-deploy`'s configuration. Defaults to ``calamari-minion``.
+
+Another option :option:`--master` can also be used with this command.
+
+Options
+=======
+
+.. option:: --version
+
+	The current installed version of :program:`ceph-deploy`.
+
+.. option:: --username
+
+	The username to connect to the remote host.
+
+.. option:: --overwrite-conf
+
+	Overwrite an existing conf file on remote host (if present).
+
+.. option:: --cluster
+
+	Name of the cluster.
+
+.. option:: --ceph-conf
+
+	Use (or reuse) a given ``ceph.conf`` file.
+
+.. option:: --no-ssh-copykey
+
+	Do not attempt to copy ssh keys.
+
+.. option:: --fsid
+
+	Provide an alternate FSID for ``ceph.conf`` generation.
+
+.. option:: --cluster-network
+
+	Specify the (internal) cluster network.
+
+.. option:: --public-network
+
+	Specify the public network for a cluster.
+
+.. option:: --testing
+
+	Install the latest development release.
+
+.. option:: --dev
+
+	Install a bleeding edge built from Git branch or tag (default: master).
+
+.. option:: --adjust-repos
+
+	Install packages modifying source repos.
+
+.. option:: --no-adjust-repos
+
+	Install packages without modifying source repos.
+
+.. option:: --repo
+
+	Install repo files only (skips package installation).
+
+.. option:: --local-mirror
+
+	Fetch packages and push them to hosts for a local repo mirror.
+
+.. option:: --repo-url
+
+	Specify a repo url that mirrors/contains Ceph packages.
+
+.. option:: --gpg-url
+
+	Specify a GPG key url to be used with custom repos (defaults to ceph.com).
+
+.. option:: --address
+
+	IP address of the host node to be added to the cluster.
+
+.. option:: --keyrings
+
+	Concatenate multiple keyrings to be seeded on new monitors.
+
+.. option:: --zap-disk
+
+	Destroy the partition table and content of a disk.
+
+.. option:: --fs-type
+
+	Filesystem to use to format disk ``(xfs, btrfs or ext4)``.
+
+.. option:: --dmcrypt
+
+	Encrypt [data-path] and/or journal devices with ``dm-crypt``.
+
+.. option:: --dmcrypt-key-dir
+
+	Directory where ``dm-crypt`` keys are stored.
+
+.. option:: --install
+
+	Comma-separated package(s) to install on remote hosts.
+
+.. option:: --remove
+
+	Comma-separated package(s) to remove from remote hosts.
+
+.. option:: --master
+
+	The domain for the Calamari master server.
+
+
+Availability
+============
+
+:program:`ceph-deploy` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the documentation at http://ceph.com/ceph-deploy/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph-mon <ceph-mon>`\(8),
+:doc:`ceph-osd <ceph-osd>`\(8),
+:doc:`ceph-disk <ceph-disk>`\(8),
+:doc:`ceph-mds <ceph-mds>`\(8)
diff --git a/doc/man/8/ceph-disk.rst b/doc/man/8/ceph-disk.rst
new file mode 100644
index 0000000..bb67163
--- /dev/null
+++ b/doc/man/8/ceph-disk.rst
@@ -0,0 +1,277 @@
+:orphan:
+
+===================================================================
+ ceph-disk -- Ceph disk preparation and activation utility for OSD
+===================================================================
+
+.. program:: ceph-disk
+
+Synopsis
+========
+
+| **ceph-disk** **prepare** [--cluster *clustername*] [--cluster-uuid *uuid*]
+	[--fs-type *xfs|ext4|btrfs*] [*data-path*] [*journal-path*]
+
+| **ceph-disk** **activate** [*data-path*] [--activate-key *path*]
+        [--mark-init *sysvinit|upstart|systemd|auto|none*]
+        [--no-start-daemon]
+
+| **ceph-disk** **activate-all**
+
+| **ceph-disk** **list**
+
+Description
+===========
+
+:program:`ceph-disk` is a utility that can prepare and activate a disk, partition or
+directory as a Ceph OSD. It is run directly or triggered by :program:`ceph-deploy`
+or ``udev``. It can also be triggered by other deployment utilities like ``Chef``,
+``Juju``, ``Puppet`` etc.
+
+It actually automates the multiple steps involved in manual creation and start
+of an OSD into two steps of preparing and activating the OSD by using the
+subcommands ``prepare`` and ``activate``.
+
+Subcommands
+============
+
+prepare
+--------
+
+Prepare a directory or disk for a Ceph OSD. It creates a GPT partition,
+marks the partition with Ceph type ``uuid``, creates a file system, marks the
+file system as ready for Ceph consumption, uses entire partition and adds a new
+partition to the journal disk. It is run directly or triggered by
+:program:`ceph-deploy`.
+
+Usage::
+
+	ceph-disk prepare --cluster [cluster-name] --cluster-uuid [uuid] --fs-type
+	[ext4|xfs|btrfs] [data-path] [journal-path]
+
+Other options like :option:`--osd-uuid`, :option:`--journal-uuid`,
+:option:`--zap-disk`, :option:`--data-dir`, :option:`--data-dev`,
+:option:`--journal-file`, :option:`--journal-dev`, :option:`--dmcrypt`
+and :option:`--dmcrypt-key-dir` can also be used with the subcommand.
+
+activate
+--------
+
+Activate the Ceph OSD. It mounts the volume in a temporary location, allocates
+an OSD id (if needed), remounts in the correct location
+``/var/lib/ceph/osd/$cluster-$id`` and starts ceph-osd. It is triggered by
+``udev`` when it sees the OSD GPT partition type or on ceph service start with
+``ceph disk activate-all``. It is also run directly or triggered by
+:program:`ceph-deploy`.
+
+Usage::
+
+	ceph-disk activate [PATH]
+
+Here, [PATH] is path to a block device or a directory.
+
+An additional option :option:`--activate-key` has to be used with this
+subcommand when a copy of ``/var/lib/ceph/bootstrap-osd/{cluster}.keyring``
+isn't present in the OSD node.
+
+Usage::
+
+	ceph-disk activate [PATH] [--activate-key PATH]
+
+Another option :option:`--mark-init` can also be used with this
+subcommand.  ``--mark-init`` provides init system to manage the OSD
+directory. It defaults to ``auto`` which detects the init system
+suitable for ceph (either ``sysvinit``, ``systemd`` or
+``upstart``). The argument can be used to override the init system. It
+may be convenient when an operating system supports multiple init
+systems, such as Debian GNU/Linux jessie with ``systemd`` and
+``sysvinit``. If the argument is ``none``, the OSD is not marked with
+any init system and ``ceph-disk activate`` needs to be called
+explicitly after each reboot.
+
+
+Usage::
+
+	ceph-disk activate [PATH] [--mark-init *sysvinit|upstart|systemd|auto|none*]
+
+If the option :option:`--no-start-daemon` is given, the activation
+steps are performed but the OSD daemon is not started.
+
+activate-journal
+----------------
+
+Activate an OSD via its journal device. ``udev`` triggers
+``ceph-disk activate-journal <dev>`` based on the partition type.
+
+Usage::
+
+	ceph-disk activate-journal [DEV]
+
+Here, [DEV] is the path to a journal block device.
+
+Others options like :option:`--activate-key` and :option:`--mark-init` can also
+be used with this subcommand.
+
+``--mark-init`` provides init system to manage the OSD directory.
+
+Usage::
+
+	ceph-disk activate-journal [--activate-key PATH] [--mark-init INITSYSTEM] [DEV]
+
+activate-all
+------------
+
+Activate all tagged OSD partitions. ``activate-all`` relies on
+``/dev/disk/by-parttype-uuid/$typeuuid.$uuid`` to find all partitions. Special
+``udev`` rules are installed to create these links. It is triggered on ceph
+service start or run directly.
+
+Usage::
+
+	ceph-disk activate-all
+
+Others options like :option:`--activate-key` and :option:`--mark-init` can
+also be used with this subcommand.
+
+``--mark-init`` provides init system to manage the OSD directory.
+
+Usage::
+
+	ceph-disk activate-all [--activate-key PATH] [--mark-init INITSYSTEM]
+
+list
+----
+
+List disk partitions and Ceph OSDs. It is run directly or triggered by
+:program:`ceph-deploy`.
+
+Usage::
+
+	ceph-disk list
+
+suppress-activate
+-----------------
+
+Suppress activate on a device (prefix). Mark devices that you don't want to
+activate with a file like ``/var/lib/ceph/tmp/suppress-activate.sdb`` where the
+last bit is the sanitized device name (/dev/X without the /dev/ prefix). A
+function ``is_suppressed()`` checks for and matches a prefix (/dev/). It means
+suppressing sdb will stop activate on sdb1, sdb2, etc.
+
+Usage::
+
+	ceph-disk suppress-activate [PATH]
+
+Here, [PATH] is path to a block device or a directory.
+
+unsuppress-activate
+-------------------
+
+Stop suppressing activate on a device (prefix). It is used to activate a device
+that was earlier kept deactivated using ``suppress-activate``.
+
+Usage::
+
+	ceph-disk unsuppress-activate [PATH]
+
+Here, [PATH] is path to a block device or a directory.
+
+zap
+---
+
+Zap/erase/destroy a device's partition table and contents. It actually uses
+``sgdisk`` and its option ``--zap-all`` to destroy both GPT and MBR data
+structures so that the disk becomes suitable for repartitioning. ``sgdisk``
+then uses ``--mbrtogpt`` to convert the MBR or BSD disklabel disk to a GPT
+disk. The ``prepare`` subcommand can now be executed which will create a new
+GPT partition. It is also run directly or triggered by :program:`ceph-deploy`.
+
+Usage::
+
+	ceph-disk zap [DEV]
+
+Here, [DEV] is path to a block device.
+
+Options
+=======
+
+.. option:: --prepend-to-path PATH
+
+   Prepend PATH to $PATH for backward compatibility (default ``/usr/bin``).
+
+.. option:: --statedir PATH
+
+   Directory in which ceph state is preserved (default ``/var/lib/ceph``).
+
+.. option:: --sysconfdir PATH
+
+   Directory in which ceph configuration files are found (default ``/etc/ceph``).
+
+.. option:: --cluster
+
+   Provide name of the ceph cluster in which the OSD is being prepared.
+
+.. option:: --cluster-uuid
+
+   Provide uuid of the ceph cluster in which the OSD is being prepared.
+
+.. option:: --fs-type
+
+   Provide the filesystem type for the OSD, e.g. ``xfs/ext4/btrfs``.
+
+.. option:: --osd-uuid
+
+	Unique OSD uuid to assign to the disk.
+
+.. option:: --journal-uuid
+
+	Unique uuid to assign to the journal.
+
+.. option:: --zap-disk
+
+	Destroy the partition table and content of a disk.
+
+.. option:: --data-dir
+
+	Verify that ``[data-path]`` is of a directory.
+
+.. option:: --data-dev
+
+	Verify that ``[data-path]`` is of a block device.
+
+.. option:: --journal-file
+
+	Verify that journal is a file.
+
+.. option:: --journal-dev
+
+	Verify that journal is a block device.
+
+.. option:: --dmcrypt
+
+	Encrypt ``[data-path]`` and/or journal devices with ``dm-crypt``.
+
+.. option:: --dmcrypt-key-dir
+
+	Directory where ``dm-crypt`` keys are stored.
+
+.. option:: --activate-key
+
+   Use when a copy of ``/var/lib/ceph/bootstrap-osd/{cluster}.keyring`` isn't
+   present in the OSD node. Suffix the option by the path to the keyring.
+
+.. option:: --mark-init
+
+   Provide init system to manage the OSD directory.
+
+Availability
+============
+
+:program:`ceph-disk` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+See also
+========
+
+:doc:`ceph-osd <ceph-osd>`\(8),
+:doc:`ceph-deploy <ceph-deploy>`\(8)
diff --git a/doc/man/8/ceph-fuse.rst b/doc/man/8/ceph-fuse.rst
new file mode 100644
index 0000000..cede60e
--- /dev/null
+++ b/doc/man/8/ceph-fuse.rst
@@ -0,0 +1,64 @@
+:orphan:
+
+=========================================
+ ceph-fuse -- FUSE-based client for ceph
+=========================================
+
+.. program:: ceph-fuse
+
+Synopsis
+========
+
+| **ceph-fuse** [ -m *monaddr*:*port* ] *mountpoint* [ *fuse options* ]
+
+
+Description
+===========
+
+**ceph-fuse** is a FUSE (File system in USErspace) client for Ceph
+distributed file system. It will mount a ceph file system (specified
+via the -m option or as described by ceph.conf; see below) at the
+specified mount point.
+
+The file system can be unmounted with::
+
+        fusermount -u mountpoint
+
+or by sending ``SIGINT`` to the ``ceph-fuse`` process.
+
+
+Options
+=======
+
+Any options not recognized by ceph-fuse will be passed on to libfuse.
+
+.. option:: -d
+
+   Detach from console and daemonize after startup.
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through ceph.conf).
+
+.. option:: -r root_directory
+
+   Use root_directory as the mounted root, rather than the full Ceph tree.
+
+
+Availability
+============
+
+**ceph-fuse** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+fusermount(8),
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/ceph-mds.rst b/doc/man/8/ceph-mds.rst
new file mode 100644
index 0000000..af1f3c7
--- /dev/null
+++ b/doc/man/8/ceph-mds.rst
@@ -0,0 +1,92 @@
+:orphan:
+
+=========================================
+ ceph-mds -- ceph metadata server daemon
+=========================================
+
+.. program:: ceph-mds
+
+Synopsis
+========
+
+| **ceph-mds** -i *name* [[ --hot-standby [*rank*] ]|[--journal_check *rank*]]
+
+
+Description
+===========
+
+**ceph-mds** is the metadata server daemon for the Ceph distributed file
+system. One or more instances of ceph-mds collectively manage the file
+system namespace, coordinating access to the shared OSD cluster.
+
+Each ceph-mds daemon instance should have a unique name. The name is used
+to identify daemon instances in the ceph.conf.
+
+Once the daemon has started, the monitor cluster will normally assign
+it a logical rank, or put it in a standby pool to take over for
+another daemon that crashes. Some of the specified options can cause
+other behaviors.
+
+If you specify hot-standby or journal-check, you must either specify
+the rank on the command line, or specify one of the
+mds_standby_for_[rank|name] parameters in the config.  The command
+line specification overrides the config, and specifying the rank
+overrides specifying the name.
+
+
+Options
+=======
+
+.. option:: -f, --foreground
+
+   Foreground: do not daemonize after startup (run in foreground). Do
+   not generate a pid file. Useful when run via :doc:`ceph-run
+   <ceph-run>`\(8).
+
+.. option:: -d
+
+   Debug mode: like ``-f``, but also send all log output to stderr.
+
+.. option:: --setuser userorgid
+
+   Set uid after starting.  If a username is specified, the user
+   record is looked up to get a uid and a gid, and the gid is also set
+   as well, unless --setgroup is also specified.
+
+.. option:: --setgroup grouporgid
+
+   Set gid after starting.  If a group name is specified the group
+   record is looked up to get a gid.
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during
+   startup.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through
+   ``ceph.conf``).
+
+.. option:: --journal-check <rank>
+
+    Attempt to replay the journal for MDS <rank>, then exit.
+
+.. option:: --hot-standby <rank>
+
+    Start as a hot standby for MDS <rank>.
+
+Availability
+============
+
+**ceph-mds** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at
+http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`ceph-mon <ceph-mon>`\(8),
+:doc:`ceph-osd <ceph-osd>`\(8)
diff --git a/doc/man/8/ceph-mon.rst b/doc/man/8/ceph-mon.rst
new file mode 100644
index 0000000..7a2cd03
--- /dev/null
+++ b/doc/man/8/ceph-mon.rst
@@ -0,0 +1,94 @@
+:orphan:
+
+=================================
+ ceph-mon -- ceph monitor daemon
+=================================
+
+.. program:: ceph-mon
+
+Synopsis
+========
+
+| **ceph-mon** -i *monid* [ --mon-data *mondatapath* ]
+
+
+Description
+===========
+
+**ceph-mon** is the cluster monitor daemon for the Ceph distributed
+file system. One or more instances of **ceph-mon** form a Paxos
+part-time parliament cluster that provides extremely reliable and
+durable storage of cluster membership, configuration, and state.
+
+The *mondatapath* refers to a directory on a local file system storing
+monitor data. It is normally specified via the ``mon data`` option in
+the configuration file.
+
+Options
+=======
+
+.. option:: -f, --foreground
+
+   Foreground: do not daemonize after startup (run in foreground). Do
+   not generate a pid file. Useful when run via :doc:`ceph-run <ceph-run>`\(8).
+
+.. option:: -d
+
+   Debug mode: like ``-f``, but also send all log output to stderr.
+
+.. option:: --setuser userorgid
+
+   Set uid after starting.  If a username is specified, the user
+   record is looked up to get a uid and a gid, and the gid is also set
+   as well, unless --setgroup is also specified.
+
+.. option:: --setgroup grouporgid
+
+   Set gid after starting.  If a group name is specified the group
+   record is looked up to get a gid.
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during
+   startup.
+
+.. option:: --mkfs
+
+   Initialize the ``mon data`` directory with seed information to form
+   an initial ceph file system or to join an existing monitor
+   cluster.  Three pieces of information must be provided:
+
+   - The cluster fsid.  This can come from a monmap (``--monmap <path>``) or
+     explicitly via ``--fsid <uuid>``.
+   - A list of monitors and their addresses.  This list of monitors
+     can come from a monmap (``--monmap <path>``), the ``mon host``
+     configuration value (in *ceph.conf* or via ``-m
+     host1,host2,...``), or ``mon addr`` lines in *ceph.conf*.  If this
+     monitor is to be part of the initial monitor quorum for a new
+     Ceph cluster, then it must be included in the initial list,
+     matching either the name or address of a monitor in the list.
+     When matching by address, either the ``public addr`` or ``public
+     subnet`` options may be used.
+   - The monitor secret key ``mon.``.  This must be included in the
+     keyring provided via ``--keyring <path>``.
+
+.. option:: --keyring
+
+   Specify a keyring for use with ``--mkfs``.
+
+
+Availability
+============
+
+**ceph-mon** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer
+to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`ceph-mds <ceph-mds>`\(8),
+:doc:`ceph-osd <ceph-osd>`\(8)
diff --git a/doc/man/8/ceph-osd.rst b/doc/man/8/ceph-osd.rst
new file mode 100644
index 0000000..e8b2805
--- /dev/null
+++ b/doc/man/8/ceph-osd.rst
@@ -0,0 +1,122 @@
+:orphan:
+
+========================================
+ ceph-osd -- ceph object storage daemon
+========================================
+
+.. program:: ceph-osd
+
+Synopsis
+========
+
+| **ceph-osd** -i *osdnum* [ --osd-data *datapath* ] [ --osd-journal
+  *journal* ] [ --mkfs ] [ --mkjournal ] [ --mkkey ]
+
+
+Description
+===========
+
+**ceph-osd** is the object storage daemon for the Ceph distributed file
+system. It is responsible for storing objects on a local file system
+and providing access to them over the network.
+
+The datapath argument should be a directory on a btrfs file system
+where the object data resides. The journal is optional, and is only
+useful performance-wise when it resides on a different disk than
+datapath with low latency (ideally, an NVRAM device).
+
+
+Options
+=======
+
+.. option:: -f, --foreground
+
+   Foreground: do not daemonize after startup (run in foreground). Do
+   not generate a pid file. Useful when run via :doc:`ceph-run <ceph-run>`\(8).
+
+.. option:: -d
+
+   Debug mode: like ``-f``, but also send all log output to stderr.
+
+.. option:: --setuser userorgid
+
+   Set uid after starting.  If a username is specified, the user
+   record is looked up to get a uid and a gid, and the gid is also set
+   as well, unless --setgroup is also specified.
+
+.. option:: --setgroup grouporgid
+
+   Set gid after starting.  If a group name is specified the group
+   record is looked up to get a gid.
+
+.. option:: --osd-data osddata
+
+   Use object store at *osddata*.
+
+.. option:: --osd-journal journal
+
+   Journal updates to *journal*.
+
+.. option:: --mkfs
+
+   Create an empty object repository. This also initializes the journal
+   (if one is defined).
+
+.. option:: --mkkey
+
+   Generate a new secret key. This is normally used in combination
+   with ``--mkfs`` as it is more convenient than generating a key by
+   hand with :doc:`ceph-authtool <ceph-authtool>`\(8).
+
+.. option:: --mkjournal
+
+   Create a new journal file to match an existing object repository.
+   This is useful if the journal device or file is wiped out due to a
+   disk or file system failure.
+
+.. option:: --flush-journal
+
+   Flush the journal to permanent store. This runs in the foreground
+   so you know when it's completed. This can be useful if you want to
+   resize the journal or need to otherwise destroy it: this guarantees
+   you won't lose data.
+
+.. option:: --get-cluster-fsid
+
+   Print the cluster fsid (uuid) and exit.
+
+.. option:: --get-osd-fsid
+
+   Print the OSD's fsid and exit.  The OSD's uuid is generated at
+   --mkfs time and is thus unique to a particular instantiation of
+   this OSD.
+
+.. option:: --get-journal-fsid
+
+   Print the journal's uuid.  The journal fsid is set to match the OSD
+   fsid at --mkfs time.
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` for runtime configuration options.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through
+   ``ceph.conf``).
+
+
+Availability
+============
+
+**ceph-osd** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`ceph-mds <ceph-mds>`\(8),
+:doc:`ceph-mon <ceph-mon>`\(8),
+:doc:`ceph-authtool <ceph-authtool>`\(8)
diff --git a/doc/man/8/ceph-post-file.rst b/doc/man/8/ceph-post-file.rst
new file mode 100644
index 0000000..7e4899f
--- /dev/null
+++ b/doc/man/8/ceph-post-file.rst
@@ -0,0 +1,71 @@
+:orphan:
+
+==================================================
+ ceph-post-file -- post files for ceph developers
+==================================================
+
+.. program:: ceph-post-file
+
+Synopsis
+========
+
+| **ceph-post-file** [-d *description*] [-u *user*] *file or dir* ...
+
+
+Description
+===========
+
+**ceph-post-file** will upload files or directories to ceph.com for
+later analysis by Ceph developers.
+
+Each invocation uploads files or directories to a separate directory
+with a unique tag.  That tag can be passed to a developer or
+referenced in a bug report (http://tracker.ceph.com/).  Once the
+upload completes, the directory is marked non-readable and
+non-writeable to prevent access or modification by other users.
+
+Warning
+=======
+
+Basic measures are taken to make posted data be visible only to
+developers with access to ceph.com infrastructure. However, users
+should think twice and/or take appropriate precautions before
+posting potentially sensitive data (for example, logs or data
+directories that contain Ceph secrets).
+
+
+Options
+=======
+
+.. option:: -d *description*, --description *description*
+
+   Add a short description for the upload.  This is a good opportunity
+   to reference a bug number.  There is no default value.
+
+.. option:: -u *user*
+
+   Set the user metadata for the upload.  This defaults to `whoami`@`hostname -f`.
+
+Examples
+========
+
+To upload a single log::
+
+   ceph-post-file /var/log/ceph/ceph-mon.`hostname`.log
+
+To upload several directories::
+
+   ceph-post-file -d 'mon data directories' /var/log/ceph/mon/*
+
+
+Availability
+============
+
+**ceph-post-file** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`ceph-debugpack <ceph-debugpack>`\(8),
diff --git a/doc/man/8/ceph-rbdnamer.rst b/doc/man/8/ceph-rbdnamer.rst
new file mode 100644
index 0000000..123c6e2
--- /dev/null
+++ b/doc/man/8/ceph-rbdnamer.rst
@@ -0,0 +1,41 @@
+:orphan:
+
+==================================================
+ ceph-rbdnamer -- udev helper to name RBD devices
+==================================================
+
+.. program:: ceph-rbdnamer
+
+
+Synopsis
+========
+
+| **ceph-rbdnamer** *num*
+
+
+Description
+===========
+
+**ceph-rbdnamer** prints the pool and image name for the given RBD devices
+to stdout. It is used by `udev` (using a rule like the one below) to
+set up a device symlink.
+
+
+::
+
+        KERNEL=="rbd[0-9]*", PROGRAM="/usr/bin/ceph-rbdnamer %n", SYMLINK+="rbd/%c{1}/%c{2}"
+
+
+Availability
+============
+
+**ceph-rbdnamer** is part of Ceph, a massively scalable, open-source, distributed storage system.  Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`rbd <rbd>`\(8),
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/ceph-rest-api.rst b/doc/man/8/ceph-rest-api.rst
new file mode 100644
index 0000000..f9eb3d4
--- /dev/null
+++ b/doc/man/8/ceph-rest-api.rst
@@ -0,0 +1,150 @@
+:orphan:
+
+=====================================================
+ ceph-rest-api -- ceph RESTlike administration server
+=====================================================
+
+.. program:: ceph-rest-api
+
+Synopsis
+========
+
+| **ceph-rest-api** [ -c *conffile* ] [--cluster *clustername* ] [ -n *name* ] [-i *id* ]
+
+
+Description
+===========
+
+**ceph-rest-api** is a WSGI application that can run as a
+standalone web service or run under a web server that supports
+WSGI.  It provides much of the functionality of the **ceph**
+command-line tool through an HTTP-accessible interface.
+
+Options
+=======
+
+.. option:: -c/--conf conffile
+
+    names the ceph.conf file to use for configuration.  If -c is not
+    specified, the default depends on the state of the --cluster option
+    (default 'ceph'; see below).  The configuration file is searched
+    for in this order:
+
+    * $CEPH_CONF
+    * /etc/ceph/${cluster}.conf
+    * ~/.ceph/${cluster}.conf
+    * ${cluster}.conf (in the current directory)
+  
+    so you can also pass this option in the environment as CEPH_CONF.
+
+.. option:: --cluster clustername
+
+    set *clustername* for use in the $cluster metavariable, for
+    locating the ceph.conf file.  The default is 'ceph'.
+
+.. option:: -n/--name name
+
+    specifies the client 'name', which is used to find the
+    client-specific configuration options in the config file, and
+    also is the name used for authentication when connecting
+    to the cluster (the entity name appearing in ceph auth list output,
+    for example).  The default is 'client.restapi'. 
+
+.. option:: -i/--id id
+
+   specifies the client 'id', which will form the clientname
+   as 'client.<id>' if clientname is not set.  If -n/--name is
+   set, that takes precedence.
+
+   Also, global Ceph options are supported.
+ 
+
+Configuration parameters
+========================
+
+Supported configuration parameters include:
+
+* **keyring** the keyring file holding the key for 'clientname'
+* **public addr** ip:port to listen on (default 0.0.0.0:5000)
+* **log file** (usual Ceph default)
+* **restapi base url** the base URL to answer requests on (default /api/v0.1)
+* **restapi log level** critical, error, warning, info, debug (default warning)
+
+Configuration parameters are searched in the standard order:
+first in the section named '<clientname>', then 'client', then 'global'.
+
+<clientname> is either supplied by -n/--name, "client.<id>" where
+<id> is supplied by -i/--id, or 'client.restapi' if neither option
+is present.
+
+A single-threaded server will run on **public addr** if the ceph-rest-api is
+executed directly; otherwise, configuration is specified by the enclosing
+WSGI web server.
+
+Commands
+========
+
+Commands are submitted with HTTP GET requests (for commands that
+primarily return data) or PUT (for commands that affect cluster state).
+HEAD and OPTIONS are also supported.  Standard HTTP status codes
+are returned.
+
+For commands that return bulk data, the request can include
+Accept: application/json or Accept: application/xml to select the
+desired structured output, or you may use a .json or .xml addition
+to the requested PATH.  Parameters are supplied as query parameters
+in the request; for parameters that take more than one value, repeat
+the key=val construct.  For instance, to remove OSDs 2 and 3,
+send a PUT request to ``osd/rm?ids=2&ids=3``.
+
+Discovery
+=========
+
+Human-readable discovery of supported commands and parameters, along
+with a small description of each command, is provided when the requested
+path is incomplete/partially matching.  Requesting / will redirect to
+the value of  **restapi base url**, and that path will give a full list
+of all known commands.
+For example, requesting ``api/vX.X/mon`` will return the list of API calls for
+monitors - ``api/vX.X/osd`` will return the list of API calls for OSD and so on.
+
+The command set is very similar to the commands
+supported by the **ceph** tool.  One notable exception is that the
+``ceph pg <pgid> <command>`` style of commands is supported here
+as ``tell/<pgid>/command?args``.
+
+Deployment as WSGI application
+==============================
+
+When deploying as WSGI application (say, with Apache/mod_wsgi,
+or nginx/uwsgi, or gunicorn, etc.), use the ``ceph_rest_api.py`` module
+(``ceph-rest-api`` is a thin layer around this module).  The standalone web
+server is of course not used, so address/port configuration is done in
+the WSGI server.  Use a python .wsgi module or the equivalent to call
+``app = generate_app(conf, cluster, clientname, clientid, args)`` where:
+
+* conf is as -c/--conf above
+* cluster is as --cluster above
+* clientname, -n/--name
+* clientid, -i/--id, and
+* args are any other generic Ceph arguments
+
+When app is returned, it will have attributes 'ceph_addr' and 'ceph_port'
+set to what the address and port are in the Ceph configuration;
+those may be used for the server, or ignored.
+
+Any errors reading configuration or connecting to the cluster cause an
+exception to be raised; see your WSGI server documentation for how to
+see those messages in case of problem.
+
+Availability
+============
+
+**ceph-rest-api** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at
+http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/ceph-run.rst b/doc/man/8/ceph-run.rst
new file mode 100644
index 0000000..ed76c28
--- /dev/null
+++ b/doc/man/8/ceph-run.rst
@@ -0,0 +1,45 @@
+:orphan:
+
+=========================================
+ ceph-run -- restart daemon on core dump
+=========================================
+
+.. program:: ceph-run
+
+Synopsis
+========
+
+| **ceph-run** *command* ...
+
+
+Description
+===========
+
+**ceph-run** is a simple wrapper that will restart a daemon if it exits
+with a signal indicating it crashed and possibly core dumped (that is,
+signals 3, 4, 5, 6, 8, or 11).
+
+The command should run the daemon in the foreground. For Ceph daemons,
+that means the ``-f`` option.
+
+
+Options
+=======
+
+None
+
+
+Availability
+============
+
+**ceph-run** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`ceph-mon <ceph-mon>`\(8),
+:doc:`ceph-mds <ceph-mds>`\(8),
+:doc:`ceph-osd <ceph-osd>`\(8)
diff --git a/doc/man/8/ceph-syn.rst b/doc/man/8/ceph-syn.rst
new file mode 100644
index 0000000..a30c460
--- /dev/null
+++ b/doc/man/8/ceph-syn.rst
@@ -0,0 +1,99 @@
+:orphan:
+
+===============================================
+ ceph-syn -- ceph synthetic workload generator
+===============================================
+
+.. program:: ceph-syn
+
+Synopsis
+========
+
+| **ceph-syn** [ -m *monaddr*:*port* ] --syn *command* *...*
+
+
+Description
+===========
+
+**ceph-syn** is a simple synthetic workload generator for the Ceph
+distributed file system. It uses the userspace client library to
+generate simple workloads against a currently running file system. The
+file system need not be mounted via ceph-fuse(8) or the kernel client.
+
+One or more ``--syn`` command arguments specify the particular
+workload, as documented below.
+
+
+Options
+=======
+
+.. option:: -d
+
+   Detach from console and daemonize after startup.
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during
+   startup.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through
+   ``ceph.conf``).
+
+.. option:: --num_client num
+
+   Run num different clients, each in a separate thread.
+
+.. option:: --syn workloadspec
+
+   Run the given workload. May be specified as many times as
+   needed. Workloads will normally run sequentially.
+
+
+Workloads
+=========
+
+Each workload should be preceded by ``--syn`` on the command
+line. This is not a complete list.
+
+:command:`mksnap` *path* *snapname*
+  Create a snapshot called *snapname* on *path*.
+
+:command:`rmsnap` *path* *snapname*
+  Delete snapshot called *snapname* on *path*.
+
+:command:`rmfile` *path*
+  Delete/unlink *path*.
+
+:command:`writefile` *sizeinmb* *blocksize*
+  Create a file, named after our client id, that is *sizeinmb* MB by
+  writing *blocksize* chunks.
+
+:command:`readfile` *sizeinmb* *blocksize*
+  Read a file, named after our client id, that is *sizeinmb* MB by
+  reading *blocksize* chunks.
+
+:command:`rw` *sizeinmb* *blocksize*
+  Write file, then read it back, as above.
+
+:command:`makedirs` *numsubdirs* *numfiles* *depth*
+  Create a hierarchy of directories that is *depth* levels deep. Give
+  each directory *numsubdirs* subdirectories and *numfiles* files.
+
+:command:`walk`
+  Recursively walk the file system (like find).
+
+
+Availability
+============
+
+**ceph-syn** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`ceph-fuse <ceph-fuse>`\(8)
diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst
new file mode 100644
index 0000000..37bb897
--- /dev/null
+++ b/doc/man/8/ceph.rst
@@ -0,0 +1,1437 @@
+:orphan:
+
+==================================
+ ceph -- ceph administration tool
+==================================
+
+.. program:: ceph
+
+Synopsis
+========
+
+| **ceph** **auth** [ *add* \| *caps* \| *del* \| *export* \| *get* \| *get-key* \| *get-or-create* \| *get-or-create-key* \| *import* \| *list* \| *print-key* \| *print_key* ] ...
+
+| **ceph** **compact**
+
+| **ceph** **config-key** [ *del* | *exists* | *get* | *list* | *put* ] ...
+
+| **ceph** **daemon** *<name>* \| *<path>* *<command>* ...
+
+| **ceph** **daemonperf** *<name>* \| *<path>* [ *interval* [ *count* ] ]
+
+| **ceph** **df** *{detail}*
+
+| **ceph** **fs** [ *ls* \| *new* \| *reset* \| *rm* ] ...
+
+| **ceph** **fsid**
+
+| **ceph** **health** *{detail}*
+
+| **ceph** **heap** [ *dump* \| *start_profiler* \| *stop_profiler* \| *release* \| *stats* ] ...
+
+| **ceph** **injectargs** *<injectedargs>* [ *<injectedargs>*... ]
+
+| **ceph** **log** *<logtext>* [ *<logtext>*... ]
+
+| **ceph** **mds** [ *add_data_pool* \| *cluster_down* \| *cluster_up* \| *compat* \| *deactivate* \| *dump* \| *fail* \| *getmap* \| *newfs* \| *remove_data_pool* \| *rm* \| *rmfailed* \| *set* \| *set_max_mds* \| *set_state* \| *setmap* \| *stat* \| *stop* \| *tell* ] ...
+
+| **ceph** **mon** [ *add* \| *dump* \| *getmap* \| *remove* \| *stat* ] ...
+
+| **ceph** **mon_status**
+
+| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd*  \| *stat* \| *thrash* \| *tree* \| *unpause* \| *unset* ] ...
+
+| **ceph** **osd** **crush** [ *add* \| *add-bucket* \| *create-or-move* \| *dump* \| *get-tunable* \| *link* \| *move* \| *remove* \| *rename-bucket* \| *reweight* \| *reweight-all* \| *reweight-subtree* \| *rm* \| *rule* \| *set* \| *set-tunable* \| *show-tunables* \| *tunables* \| *unlink* ] ...
+
+| **ceph** **osd** **pool** [ *create* \| *delete* \| *get* \| *get-quota* \| *ls* \| *mksnap* \| *rename* \| *rmsnap* \| *set* \| *set-quota* \| *stats* ] ...
+
+| **ceph** **osd** **tier** [ *add* \| *add-cache* \| *cache-mode* \| *remove* \| *remove-overlay* \| *set-overlay* ] ...
+
+| **ceph** **pg** [ *debug* \| *deep-scrub* \| *dump* \| *dump_json* \| *dump_pools_json* \| *dump_stuck* \| *force_create_pg* \| *getmap* \| *ls* \| *ls-by-osd* \| *ls-by-pool* \| *ls-by-primary* \| *map* \| *repair* \| *scrub* \| *send_pg_creates* \| *set_full_ratio* \| *set_nearfull_ratio* \| *stat* ] ...
+
+| **ceph** **quorum** [ *enter* \| *exit* ]
+
+| **ceph** **quorum_status**
+
+| **ceph** **report** { *<tags>* [ *<tags>...* ] }
+
+| **ceph** **scrub**
+
+| **ceph** **status**
+
+| **ceph** **sync** **force** {--yes-i-really-mean-it} {--i-know-what-i-am-doing}
+
+| **ceph** **tell** *<name (type.id)> <args> [<args>...]*
+
+| **ceph** **version**
+
+Description
+===========
+
+:program:`ceph` is a control utility which is used for manual deployment and maintenance
+of a Ceph cluster. It provides a diverse set of commands that allows deployment of
+monitors, OSDs, placement groups, MDS and overall maintenance, administration
+of the cluster.
+
+Commands
+========
+
+auth
+----
+
+Manage authentication keys. It is used for adding, removing, exporting
+or updating of authentication keys for a particular  entity such as a monitor or
+OSD. It uses some additional subcommands.
+
+Subcommand ``add`` adds authentication info for a particular entity from input
+file, or random key if no input is given and/or any caps specified in the command.
+
+Usage::
+
+	ceph auth add <entity> {<caps> [<caps>...]}
+
+Subcommand ``caps`` updates caps for **name** from caps specified in the command.
+
+Usage::
+
+	ceph auth caps <entity> <caps> [<caps>...]
+
+Subcommand ``del`` deletes all caps for ``name``.
+
+Usage::
+
+	ceph auth del <entity>
+
+Subcommand ``export`` writes keyring for requested entity, or master keyring if
+none given.
+
+Usage::
+
+	ceph auth export {<entity>}
+
+Subcommand ``get`` writes keyring file with requested key.
+
+Usage::
+
+	ceph auth get <entity>
+
+Subcommand ``get-key`` displays requested key.
+
+Usage::
+
+	ceph auth get-key <entity>
+
+Subcommand ``get-or-create`` adds authentication info for a particular entity
+from input file, or random key if no input given and/or any caps specified in the
+command.
+
+Usage::
+
+	ceph auth get-or-create <entity> {<caps> [<caps>...]}
+
+Subcommand ``get-or-create-key`` gets or adds key for ``name`` from system/caps
+pairs specified in the command.  If key already exists, any given caps must match
+the existing caps for that key.
+
+Usage::
+
+	ceph auth get-or-create-key <entity> {<caps> [<caps>...]}
+
+Subcommand ``import`` reads keyring from input file.
+
+Usage::
+
+	ceph auth import
+
+Subcommand ``list`` lists authentication state.
+
+Usage::
+
+	ceph auth list
+
+Subcommand ``print-key`` displays requested key.
+
+Usage::
+
+	ceph auth print-key <entity>
+
+Subcommand ``print_key`` displays requested key.
+
+Usage::
+
+	ceph auth print_key <entity>
+
+
+compact
+-------
+
+Causes compaction of monitor's leveldb storage.
+
+Usage::
+
+	ceph compact
+
+
+config-key
+----------
+
+Manage configuration key. It uses some additional subcommands.
+
+Subcommand ``del`` deletes configuration key.
+
+Usage::
+
+	ceph config-key del <key>
+
+Subcommand ``exists`` checks for configuration keys existence.
+
+Usage::
+
+	ceph config-key exists <key>
+
+Subcommand ``get`` gets the configuration key.
+
+Usage::
+
+	ceph config-key get <key>
+
+Subcommand ``list`` lists configuration keys.
+
+Usage::
+
+	ceph config-key list
+
+Subcommand ``put`` puts configuration key and values.
+
+Usage::
+
+	ceph config-key put <key> {<val>}
+
+
+daemon
+------
+
+Submit admin-socket commands.
+
+Usage::
+
+	ceph daemon {daemon_name|socket_path} {command} ...
+
+Example::
+
+	ceph daemon osd.0 help
+
+
+daemonperf
+----------
+
+Watch performance counters from a Ceph daemon.
+
+Usage::
+
+	ceph daemonperf {daemon_name|socket_path} [{interval} [{count}]]
+
+
+df
+--
+
+Show cluster's free space status.
+
+Usage::
+
+	ceph df {detail}
+
+
+fs
+--
+
+Manage cephfs filesystems. It uses some additional subcommands.
+
+Subcommand ``ls`` to list filesystems
+
+Usage::
+
+	ceph fs ls
+
+Subcommand ``new`` to make a new filesystem using named pools <metadata> and <data>
+
+Usage::
+
+	ceph fs new <fs_name> <metadata> <data>
+
+Subcommand ``reset`` is used for disaster recovery only: reset to a single-MDS map
+
+Usage::
+
+	ceph fs reset <fs_name> {--yes-i-really-mean-it}
+
+Subcommand ``rm`` to disable the named filesystem
+
+Usage::
+
+	ceph fs rm <fs_name> {--yes-i-really-mean-it}
+
+
+fsid
+----
+
+Show cluster's FSID/UUID.
+
+Usage::
+
+	ceph fsid
+
+
+health
+------
+
+Show cluster's health.
+
+Usage::
+
+	ceph health {detail}
+
+
+heap
+----
+
+Show heap usage info (available only if compiled with tcmalloc)
+
+Usage::
+
+	ceph heap dump|start_profiler|stop_profiler|release|stats
+
+
+injectargs
+----------
+
+Inject configuration arguments into monitor.
+
+Usage::
+
+	ceph injectargs <injected_args> [<injected_args>...]
+
+
+log
+---
+
+Log supplied text to the monitor log.
+
+Usage::
+
+	ceph log <logtext> [<logtext>...]
+
+
+mds
+---
+
+Manage metadata server configuration and administration. It uses some
+additional subcommands.
+
+Subcommand ``add_data_pool`` adds data pool.
+
+Usage::
+
+	ceph mds add_data_pool <pool>
+
+Subcommand ``cluster_down`` takes mds cluster down.
+
+Usage::
+
+	ceph mds cluster_down
+
+Subcommand ``cluster_up`` brings mds cluster up.
+
+Usage::
+
+	ceph mds cluster_up
+
+Subcommand ``compat`` manages compatible features. It uses some additional
+subcommands.
+
+Subcommand ``rm_compat`` removes compatible feature.
+
+Usage::
+
+	ceph mds compat rm_compat <int[0-]>
+
+Subcommand ``rm_incompat`` removes incompatible feature.
+
+Usage::
+
+	ceph mds compat rm_incompat <int[0-]>
+
+Subcommand ``show`` shows mds compatibility settings.
+
+Usage::
+
+	ceph mds compat show
+
+Subcommand ``deactivate`` stops mds.
+
+Usage::
+
+	ceph mds deactivate <who>
+
+Subcommand ``dump`` dumps information, optionally from epoch.
+
+Usage::
+
+	ceph mds dump {<int[0-]>}
+
+Subcommand ``fail`` forces mds to status fail.
+
+Usage::
+
+	ceph mds fail <who>
+
+Subcommand ``getmap`` gets MDS map, optionally from epoch.
+
+Usage::
+
+	ceph mds getmap {<int[0-]>}
+
+Subcommand ``newfs`` makes new filesystem using pools <metadata> and <data>.
+
+Usage::
+
+	ceph mds newfs <int[0-]> <int[0-]> {--yes-i-really-mean-it}
+
+Subcommand ``remove_data_pool`` removes data pool.
+
+Usage::
+
+	ceph mds remove_data_pool <pool>
+
+Subcommand ``rm`` removes inactive mds.
+
+Usage::
+
	ceph mds rm <int[0-]> <name (type.id)>
+
+Subcommand ``rmfailed`` removes failed mds.
+
+Usage::
+
+	ceph mds rmfailed <int[0-]>
+
+Subcommand ``set`` set mds parameter <var> to <val>
+
+Usage::
+
	ceph mds set max_mds|max_file_size|allow_new_snaps|inline_data <val> {<confirm>}
+
+Subcommand ``set_max_mds`` sets max MDS index.
+
+Usage::
+
+	ceph mds set_max_mds <int[0-]>
+
+Subcommand ``set_state`` sets mds state of <gid> to <numeric-state>.
+
+Usage::
+
+	ceph mds set_state <int[0-]> <int[0-20]>
+
+Subcommand ``setmap`` sets mds map; must supply correct epoch number.
+
+Usage::
+
+	ceph mds setmap <int[0-]>
+
+Subcommand ``stat`` shows MDS status.
+
+Usage::
+
+	ceph mds stat
+
+Subcommand ``stop`` stops mds.
+
+Usage::
+
+	ceph mds stop <who>
+
+Subcommand ``tell`` sends command to particular mds.
+
+Usage::
+
+	ceph mds tell <who> <args> [<args>...]
+
+mon
+---
+
+Manage monitor configuration and administration. It uses some additional
+subcommands.
+
+Subcommand ``add`` adds new monitor named <name> at <addr>.
+
+Usage::
+
+	ceph mon add <name> <IPaddr[:port]>
+
+Subcommand ``dump`` dumps formatted monmap (optionally from epoch)
+
+Usage::
+
+	ceph mon dump {<int[0-]>}
+
+Subcommand ``getmap`` gets monmap.
+
+Usage::
+
+	ceph mon getmap {<int[0-]>}
+
+Subcommand ``remove`` removes monitor named <name>.
+
+Usage::
+
+	ceph mon remove <name>
+
+Subcommand ``stat`` summarizes monitor status.
+
+Usage::
+
+	ceph mon stat
+
+mon_status
+----------
+
+Reports status of monitors.
+
+Usage::
+
+	ceph mon_status
+
+osd
+---
+
+Manage OSD configuration and administration. It uses some additional
+subcommands.
+
+Subcommand ``blacklist`` manage blacklisted clients. It uses some additional
+subcommands.
+
+Subcommand ``add`` add <addr> to blacklist (optionally until <expire> seconds
+from now)
+
+Usage::
+
+	ceph osd blacklist add <EntityAddr> {<float[0.0-]>}
+
+Subcommand ``ls`` show blacklisted clients
+
+Usage::
+
+	ceph osd blacklist ls
+
+Subcommand ``rm`` remove <addr> from blacklist
+
+Usage::
+
+	ceph osd blacklist rm <EntityAddr>
+
+Subcommand ``blocked-by`` prints a histogram of which OSDs are blocking their peers
+
+Usage::
+
+	ceph osd blocked-by
+
+Subcommand ``create`` creates new osd (with optional UUID and ID).
+
+Usage::
+
+	ceph osd create {<uuid>} {<id>}
+
+Subcommand ``crush`` is used for CRUSH management. It uses some additional
+subcommands.
+
+Subcommand ``add`` adds or updates crushmap position and weight for <name> with
+<weight> and location <args>.
+
+Usage::
+
+	ceph osd crush add <osdname (id|osd.id)> <float[0.0-]> <args> [<args>...]
+
+Subcommand ``add-bucket`` adds no-parent (probably root) crush bucket <name> of
+type <type>.
+
+Usage::
+
+	ceph osd crush add-bucket <name> <type>
+
+Subcommand ``create-or-move`` creates entry or moves existing entry for <name>
+<weight> at/to location <args>.
+
+Usage::
+
+	ceph osd crush create-or-move <osdname (id|osd.id)> <float[0.0-]> <args>
+	[<args>...]
+
+Subcommand ``dump`` dumps crush map.
+
+Usage::
+
+	ceph osd crush dump
+
+Subcommand ``get-tunable`` get crush tunable straw_calc_version
+
+Usage::
+
+	ceph osd crush get-tunable straw_calc_version
+
+Subcommand ``link`` links existing entry for <name> under location <args>.
+
+Usage::
+
+	ceph osd crush link <name> <args> [<args>...]
+
+Subcommand ``move`` moves existing entry for <name> to location <args>.
+
+Usage::
+
+	ceph osd crush move <name> <args> [<args>...]
+
+Subcommand ``remove`` removes <name> from crush map (everywhere, or just at
+<ancestor>).
+
+Usage::
+
+	ceph osd crush remove <name> {<ancestor>}
+
+Subcommand ``rename-bucket`` renames bucket <srcname> to <dstname>
+
+Usage::
+
+	ceph osd crush rename-bucket <srcname> <dstname>
+
+Subcommand ``reweight`` change <name>'s weight to <weight> in crush map.
+
+Usage::
+
+	ceph osd crush reweight <name> <float[0.0-]>
+
+Subcommand ``reweight-all`` recalculate the weights for the tree to
+ensure they sum correctly
+
+Usage::
+
+	ceph osd crush reweight-all
+
+Subcommand ``reweight-subtree`` changes all leaf items beneath <name>
+to <weight> in crush map
+
+Usage::
+
+	ceph osd crush reweight-subtree <name> <weight>
+
+Subcommand ``rm`` removes <name> from crush map (everywhere, or just at
+<ancestor>).
+
+Usage::
+
+	ceph osd crush rm <name> {<ancestor>}
+
+Subcommand ``rule`` is used for creating crush rules. It uses some additional
+subcommands.
+
+Subcommand ``create-erasure`` creates crush rule <name> for erasure coded pool
+created with <profile> (default default).
+
+Usage::
+
+	ceph osd crush rule create-erasure <name> {<profile>}
+
+Subcommand ``create-simple`` creates crush rule <name> to start from <root>,
+replicate across buckets of type <type>, using a choose mode of <firstn|indep>
+(default firstn; indep best for erasure pools).
+
+Usage::
+
+	ceph osd crush rule create-simple <name> <root> <type> {firstn|indep}
+
+Subcommand ``dump`` dumps crush rule <name> (default all).
+
+Usage::
+
+	ceph osd crush rule dump {<name>}
+
+Subcommand ``list`` lists crush rules.
+
+Usage::
+
+	ceph osd crush rule list
+
+Subcommand ``ls`` lists crush rules.
+
+Usage::
+
+	ceph osd crush rule ls
+
+Subcommand ``rm`` removes crush rule <name>.
+
+Usage::
+
+	ceph osd crush rule rm <name>
+
+Subcommand ``set`` used alone, sets crush map from input file.
+
+Usage::
+
+	ceph osd crush set
+
+Subcommand ``set`` with osdname/osd.id update crushmap position and weight
+for <name> to <weight> with location <args>.
+
+Usage::
+
+	ceph osd crush set <osdname (id|osd.id)> <float[0.0-]> <args> [<args>...]
+
+Subcommand ``set-tunable`` set crush tunable <tunable> to <value>.  The only
+tunable that can be set is straw_calc_version.
+
+Usage::
+
+	ceph osd crush set-tunable straw_calc_version <value>
+
+Subcommand ``show-tunables`` shows current crush tunables.
+
+Usage::
+
+	ceph osd crush show-tunables
+
+Subcommand ``tree`` shows the crush buckets and items in a tree view.
+
+Usage::
+
+	ceph osd crush tree
+
+Subcommand ``tunables`` sets crush tunables values to <profile>.
+
+Usage::
+
+	ceph osd crush tunables legacy|argonaut|bobtail|firefly|hammer|optimal|default
+
+Subcommand ``unlink`` unlinks <name> from crush map (everywhere, or just at
+<ancestor>).
+
+Usage::
+
+	ceph osd crush unlink <name> {<ancestor>}
+
+Subcommand ``df`` shows OSD utilization
+
+Usage::
+
+	ceph osd df {plain|tree}
+
+Subcommand ``deep-scrub`` initiates deep scrub on specified osd.
+
+Usage::
+
+	ceph osd deep-scrub <who>
+
+Subcommand ``down`` sets osd(s) <id> [<id>...] down.
+
+Usage::
+
+	ceph osd down <ids> [<ids>...]
+
+Subcommand ``dump`` prints summary of OSD map.
+
+Usage::
+
+	ceph osd dump {<int[0-]>}
+
+Subcommand ``erasure-code-profile`` is used for managing the erasure code
+profiles. It uses some additional subcommands.
+
+Subcommand ``get`` gets erasure code profile <name>.
+
+Usage::
+
+	ceph osd erasure-code-profile get <name>
+
+Subcommand ``ls`` lists all erasure code profiles.
+
+Usage::
+
+	ceph osd erasure-code-profile ls
+
+Subcommand ``rm`` removes erasure code profile <name>.
+
+Usage::
+
+	ceph osd erasure-code-profile rm <name>
+
+Subcommand ``set`` creates erasure code profile <name> with [<key[=value]> ...]
+pairs. Add a --force at the end to override an existing profile (IT IS RISKY).
+
+Usage::
+
+	ceph osd erasure-code-profile set <name> {<profile> [<profile>...]}
+
+Subcommand ``find`` finds osd <id> in the CRUSH map and shows its location.
+
+Usage::
+
+	ceph osd find <int[0-]>
+
+Subcommand ``getcrushmap`` gets CRUSH map.
+
+Usage::
+
+	ceph osd getcrushmap {<int[0-]>}
+
+Subcommand ``getmap`` gets OSD map.
+
+Usage::
+
+	ceph osd getmap {<int[0-]>}
+
+Subcommand ``getmaxosd`` shows largest OSD id.
+
+Usage::
+
+	ceph osd getmaxosd
+
+Subcommand ``in`` sets osd(s) <id> [<id>...] in.
+
+Usage::
+
+	ceph osd in <ids> [<ids>...]
+
+Subcommand ``lost`` marks osd as permanently lost. THIS DESTROYS DATA IF NO
+MORE REPLICAS EXIST, BE CAREFUL.
+
+Usage::
+
+	ceph osd lost <int[0-]> {--yes-i-really-mean-it}
+
+Subcommand ``ls`` shows all OSD ids.
+
+Usage::
+
+	ceph osd ls {<int[0-]>}
+
+Subcommand ``lspools`` lists pools.
+
+Usage::
+
+	ceph osd lspools {<int>}
+
+Subcommand ``map`` finds pg for <object> in <pool>.
+
+Usage::
+
+	ceph osd map <poolname> <objectname>
+
+Subcommand ``metadata`` fetches metadata for osd <id>.
+
+Usage::
+
+	ceph osd metadata {int[0-]} (default all)
+
+Subcommand ``out`` sets osd(s) <id> [<id>...] out.
+
+Usage::
+
+	ceph osd out <ids> [<ids>...]
+
+Subcommand ``pause`` pauses osd.
+
+Usage::
+
+	ceph osd pause
+
+Subcommand ``perf`` prints dump of OSD perf summary stats.
+
+Usage::
+
+	ceph osd perf
+
+Subcommand ``pg-temp`` set pg_temp mapping pgid:[<id> [<id>...]] (developers
+only).
+
+Usage::
+
+	ceph osd pg-temp <pgid> {<id> [<id>...]}
+
+Subcommand ``pool`` is used for managing data pools. It uses some additional
+subcommands.
+
+Subcommand ``create`` creates pool.
+
+Usage::
+
+	ceph osd pool create <poolname> <int[0-]> {<int[0-]>} {replicated|erasure}
+	{<erasure_code_profile>} {<ruleset>} {<int>}
+
+Subcommand ``delete`` deletes pool.
+
+Usage::
+
+	ceph osd pool delete <poolname> {<poolname>} {--yes-i-really-really-mean-it}
+
+Subcommand ``get`` gets pool parameter <var>.
+
+Usage::
+
+	ceph osd pool get <poolname> size|min_size|crash_replay_interval|pg_num|
+	pgp_num|crush_ruleset|auid|write_fadvise_dontneed
+
+Only for tiered pools::
+
+	ceph osd pool get <poolname> hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|
+	target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|
+	cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|
+	min_read_recency_for_promote
+
+Only for erasure coded pools::
+
+	ceph osd pool get <poolname> erasure_code_profile
+
+Use ``all`` to get all pool parameters that apply to the pool's type::
+
+	ceph osd pool get <poolname> all
+
+Subcommand ``get-quota`` obtains object or byte limits for pool.
+
+Usage::
+
+	ceph osd pool get-quota <poolname>
+
+Subcommand ``ls`` list pools
+
+Usage::
+
+	ceph osd pool ls {detail}
+
+Subcommand ``mksnap`` makes snapshot <snap> in <pool>.
+
+Usage::
+
+	ceph osd pool mksnap <poolname> <snap>
+
+Subcommand ``rename`` renames <srcpool> to <destpool>.
+
+Usage::
+
+	ceph osd pool rename <poolname> <poolname>
+
+Subcommand ``rmsnap`` removes snapshot <snap> from <pool>.
+
+Usage::
+
+	ceph osd pool rmsnap <poolname> <snap>
+
+Subcommand ``set`` sets pool parameter <var> to <val>.
+
+Usage::
+
+	ceph osd pool set <poolname> size|min_size|crash_replay_interval|pg_num|
+	pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|
+	hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|
+	target_max_bytes|target_max_objects|cache_target_dirty_ratio|
+	cache_target_dirty_high_ratio|
+	cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|
+	min_read_recency_for_promote|write_fadvise_dontneed
+	<val> {--yes-i-really-mean-it}
+
+Subcommand ``set-quota`` sets object or byte limit on pool.
+
+Usage::
+
+	ceph osd pool set-quota <poolname> max_objects|max_bytes <val>
+
+Subcommand ``stats`` obtain stats from all pools, or from specified pool.
+
+Usage::
+
+	ceph osd pool stats {<name>}
+
+Subcommand ``primary-affinity`` adjusts osd primary-affinity from 0.0 <= <weight>
+<= 1.0
+
+Usage::
+
+	ceph osd primary-affinity <osdname (id|osd.id)> <float[0.0-1.0]>
+
+Subcommand ``primary-temp`` sets primary_temp mapping pgid:<id>|-1 (developers
+only).
+
+Usage::
+
+	ceph osd primary-temp <pgid> <id>
+
+Subcommand ``repair`` initiates repair on a specified osd.
+
+Usage::
+
+	ceph osd repair <who>
+
+Subcommand ``reweight`` reweights osd to 0.0 < <weight> < 1.0.
+
+Usage::
+
	ceph osd reweight <int[0-]> <float[0.0-1.0]>
+
+Subcommand ``reweight-by-pg`` reweight OSDs by PG distribution
+[overload-percentage-for-consideration, default 120].
+
+Usage::
+
+	ceph osd reweight-by-pg {<int[100-]>} {<poolname> [<poolname...]}
+
+Subcommand ``reweight-by-utilization`` reweight OSDs by utilization
+[overload-percentage-for-consideration, default 120].
+
+Usage::
+
+	ceph osd reweight-by-utilization {<int[100-]>}
+
+Subcommand ``rm`` removes osd(s) <id> [<id>...] in the cluster.
+
+Usage::
+
+	ceph osd rm <ids> [<ids>...]
+
+Subcommand ``scrub`` initiates scrub on specified osd.
+
+Usage::
+
+	ceph osd scrub <who>
+
+Subcommand ``set`` sets <key>.
+
+Usage::
+
+	ceph osd set full|pause|noup|nodown|noout|noin|nobackfill|
+	norebalance|norecover|noscrub|nodeep-scrub|notieragent
+
+Subcommand ``setcrushmap`` sets crush map from input file.
+
+Usage::
+
+	ceph osd setcrushmap
+
+Subcommand ``setmaxosd`` sets new maximum osd value.
+
+Usage::
+
+	ceph osd setmaxosd <int[0-]>
+
+Subcommand ``stat`` prints summary of OSD map.
+
+Usage::
+
+	ceph osd stat
+
+Subcommand ``thrash`` thrashes OSDs for <num_epochs>.
+
+Usage::
+
+	ceph osd thrash <int[0-]>
+
+Subcommand ``tier`` is used for managing tiers. It uses some additional
+subcommands.
+
+Subcommand ``add`` adds the tier <tierpool> (the second one) to base pool <pool>
+(the first one).
+
+Usage::
+
+	ceph osd tier add <poolname> <poolname> {--force-nonempty}
+
+Subcommand ``add-cache`` adds a cache <tierpool> (the second one) of size <size>
+to existing pool <pool> (the first one).
+
+Usage::
+
+	ceph osd tier add-cache <poolname> <poolname> <int[0-]>
+
+Subcommand ``cache-mode`` specifies the caching mode for cache tier <pool>.
+
+Usage::
+
+	ceph osd tier cache-mode <poolname> none|writeback|forward|readonly|
+	readforward|readproxy
+
+Subcommand ``remove`` removes the tier <tierpool> (the second one) from base pool
+<pool> (the first one).
+
+Usage::
+
+	ceph osd tier remove <poolname> <poolname>
+
+Subcommand ``remove-overlay`` removes the overlay pool for base pool <pool>.
+
+Usage::
+
+	ceph osd tier remove-overlay <poolname>
+
+Subcommand ``set-overlay`` set the overlay pool for base pool <pool> to be
+<overlaypool>.
+
+Usage::
+
+	ceph osd tier set-overlay <poolname> <poolname>
+
+Subcommand ``tree`` prints OSD tree.
+
+Usage::
+
+	ceph osd tree {<int[0-]>}
+
+Subcommand ``unpause`` unpauses osd.
+
+Usage::
+
+	ceph osd unpause
+
+Subcommand ``unset`` unsets <key>.
+
+Usage::
+
+	ceph osd unset full|pause|noup|nodown|noout|noin|nobackfill|
+	norebalance|norecover|noscrub|nodeep-scrub|notieragent
+
+
+pg
+--
+
+It is used for managing the placement groups in OSDs. It uses some
+additional subcommands.
+
+Subcommand ``debug`` shows debug info about pgs.
+
+Usage::
+
+	ceph pg debug unfound_objects_exist|degraded_pgs_exist
+
+Subcommand ``deep-scrub`` starts deep-scrub on <pgid>.
+
+Usage::
+
+	ceph pg deep-scrub <pgid>
+
+Subcommand ``dump`` shows human-readable versions of pg map (only 'all' valid
+with plain).
+
+Usage::
+
+	ceph pg dump {all|summary|sum|delta|pools|osds|pgs|pgs_brief} [{all|summary|sum|delta|pools|osds|pgs|pgs_brief}...]
+
+Subcommand ``dump_json`` shows human-readable version of pg map in json only.
+
+Usage::
+
+	ceph pg dump_json {all|summary|sum|delta|pools|osds|pgs|pgs_brief} [{all|summary|sum|delta|pools|osds|pgs|pgs_brief}...]
+
+Subcommand ``dump_pools_json`` shows pg pools info in json only.
+
+Usage::
+
+	ceph pg dump_pools_json
+
+Subcommand ``dump_stuck`` shows information about stuck pgs.
+
+Usage::
+
+	ceph pg dump_stuck {inactive|unclean|stale|undersized|degraded [inactive|unclean|stale|undersized|degraded...]}
+	{<int>}
+
+Subcommand ``force_create_pg`` forces creation of pg <pgid>.
+
+Usage::
+
+	ceph pg force_create_pg <pgid>
+
+Subcommand ``getmap`` gets binary pg map to -o/stdout.
+
+Usage::
+
+	ceph pg getmap
+
+Subcommand ``ls`` lists pgs with a specific pool, osd, or state.
+
+Usage::
+
+	ceph pg ls {<int>} {active|clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized [active|clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized...]}
+
+Subcommand ``ls-by-osd`` lists pgs on osd [osd].
+
+Usage::
+
+	ceph pg ls-by-osd <osdname (id|osd.id)> {<int>}
+	{active|clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized [active|clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized...]}
+
+Subcommand ``ls-by-pool`` lists pgs with pool = [poolname | poolid].
+
+Usage::
+
+	ceph pg ls-by-pool <poolstr> {<int>} {active|
+	clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized [active|clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized...]}
+
+Subcommand ``ls-by-primary`` lists pgs with primary = [osd].
+
+Usage::
+
+	ceph pg ls-by-primary <osdname (id|osd.id)> {<int>}
+	{active|clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized [active|clean|down|replay|splitting|
+	scrubbing|scrubq|degraded|inconsistent|peering|repair|
+	recovery|backfill_wait|incomplete|stale|remapped|
+	deep_scrub|backfill|backfill_toofull|recovery_wait|
+	undersized...]}
+
+Subcommand ``map`` shows mapping of pg to osds.
+
+Usage::
+
+	ceph pg map <pgid>
+
+Subcommand ``repair`` starts repair on <pgid>.
+
+Usage::
+
+	ceph pg repair <pgid>
+
+Subcommand ``scrub`` starts scrub on <pgid>.
+
+Usage::
+
+	ceph pg scrub <pgid>
+
+Subcommand ``send_pg_creates`` triggers pg creates to be issued.
+
+Usage::
+
+	ceph pg send_pg_creates
+
+Subcommand ``set_full_ratio`` sets ratio at which pgs are considered full.
+
+Usage::
+
+	ceph pg set_full_ratio <float[0.0-1.0]>
+
+Subcommand ``set_nearfull_ratio`` sets ratio at which pgs are considered nearly
+full.
+
+Usage::
+
+	ceph pg set_nearfull_ratio <float[0.0-1.0]>
+
+Subcommand ``stat`` shows placement group status.
+
+Usage::
+
+	ceph pg stat
+
+
+quorum
+------
+
+Enter or exit quorum.
+
+Usage::
+
+	ceph quorum enter|exit
+
+
+quorum_status
+-------------
+
+Reports status of monitor quorum.
+
+Usage::
+
+	ceph quorum_status
+
+
+report
+------
+
+Reports full status of cluster, optional title tag strings.
+
+Usage::
+
+	ceph report {<tags> [<tags>...]}
+
+
+scrub
+-----
+
+Scrubs the monitor stores.
+
+Usage::
+
+	ceph scrub
+
+
+status
+------
+
+Shows cluster status.
+
+Usage::
+
+	ceph status
+
+
+sync force
+----------
+
+Forces sync of and clears the monitor store.
+
+Usage::
+
+	ceph sync force {--yes-i-really-mean-it} {--i-know-what-i-am-doing}
+
+
+tell
+----
+
+Sends a command to a specific daemon.
+
+Usage::
+
+	ceph tell <name (type.id)> <args> [<args>...]
+
+version
+-------
+
+Show the mon daemon version.
+
+Usage::
+
+	ceph version
+
+Options
+=======
+
+.. option:: -i infile
+
+   will specify an input file to be passed along as a payload with the
+   command to the monitor cluster. This is only used for specific
+   monitor commands.
+
+.. option:: -o outfile
+
+   will write any payload returned by the monitor cluster with its
+   reply to outfile.  Only specific monitor commands (e.g. osd getmap)
+   return a payload.
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use ceph.conf configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup.
+
+.. option:: --id CLIENT_ID, --user CLIENT_ID
+
+   Client id for authentication.
+
+.. option:: --name CLIENT_NAME, -n CLIENT_NAME
+
+	Client name for authentication.
+
+.. option:: --cluster CLUSTER
+
+	Name of the Ceph cluster.
+
+.. option:: daemon ADMIN_SOCKET, daemon DAEMON_NAME, --admin-socket ADMIN_SOCKET, --admin-socket DAEMON_NAME
+
+	Submit admin-socket commands via admin sockets in /var/run/ceph.
+
+.. option:: --admin-socket ADMIN_SOCKET_NOPE
+
+	You probably mean --admin-daemon
+
+.. option:: -s, --status
+
+	Show cluster status.
+
+.. option:: -w, --watch
+
+	Watch live cluster changes.
+
+.. option:: --watch-debug
+
+	Watch debug events.
+
+.. option:: --watch-info
+
+	Watch info events.
+
+.. option:: --watch-sec
+
+	Watch security events.
+
+.. option:: --watch-warn
+
+	Watch warning events.
+
+.. option:: --watch-error
+
+	Watch error events.
+
+.. option:: --version, -v
+
+	Display version.
+
+.. option:: --verbose
+
+	Make verbose.
+
+.. option:: --concise
+
+	Make less verbose.
+
+.. option:: -f {json,json-pretty,xml,xml-pretty,plain}, --format
+
+	Format of output.
+
+.. option:: --connect-timeout CLUSTER_TIMEOUT
+
+	Set a timeout for connecting to the cluster.
+
+
+Availability
+============
+
+:program:`ceph` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph-mon <ceph-mon>`\(8),
+:doc:`ceph-osd <ceph-osd>`\(8),
+:doc:`ceph-mds <ceph-mds>`\(8)
diff --git a/doc/man/8/cephfs.rst b/doc/man/8/cephfs.rst
new file mode 100644
index 0000000..0ad91d0
--- /dev/null
+++ b/doc/man/8/cephfs.rst
@@ -0,0 +1,99 @@
+:orphan:
+
+============================================
+ cephfs -- ceph file system options utility
+============================================
+
+.. program:: cephfs
+
+Synopsis
+========
+
+| **cephfs** [ *path* *command* *options* ]
+
+
+Description
+===========
+
+**cephfs** is a control utility for accessing and manipulating file
+layout and location data in the Ceph distributed storage system.
+
+.. TODO format this like a proper man page
+
+Choose one of the following three commands:
+
+- ``show_layout`` View the layout information on a file or directory
+- ``set_layout`` Set the layout information on a file or directory
+- ``show_location`` View the location information on a file
+
+
+Options
+=======
+
+Your applicable options differ depending on whether you are setting or viewing layout/location.
+
+Viewing options:
+----------------
+
+.. option:: -l --offset
+
+    Specify an offset for which to retrieve location data
+
+Setting options:
+----------------
+
+.. option:: -u --stripe_unit
+
+   Set the size of each stripe
+
+.. option:: -c --stripe_count
+
+   Set the number of objects to stripe across
+
+.. option:: -s --object_size
+
+   Set the size of the objects to stripe across
+
+.. option:: -p --pool
+
+   Set the pool (by numeric value, not name!) to use
+
+.. option:: -o --osd
+
+   Set the preferred OSD to use as the primary (deprecated and ignored)
+
+
+Limitations
+===========
+
+When setting layout data, the specified object size must evenly divide
+by the specified stripe unit. Any parameters you don't set
+explicitly are left at the system defaults.
+
+Obviously setting the layout of a file and a directory means different
+things. Setting the layout of a file specifies exactly how to place
+the individual file. This must be done before writing *any* data to
+it. Truncating a file does not allow you to change the layout either.
+
+Setting the layout of a directory sets the "default layout", which is
+used to set the file layouts on any files subsequently created in the
+directory (or any subdirectory).  Pre-existing files do not have their
+layouts changed.
+
+You'll notice that the layout information allows you to specify a
+preferred OSD for placement. This feature is unsupported and ignored
+in modern versions of the Ceph servers; do not use it.
+
+
+Availability
+============
+
+**cephfs** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer
+to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/crushtool.rst b/doc/man/8/crushtool.rst
new file mode 100644
index 0000000..3c44947
--- /dev/null
+++ b/doc/man/8/crushtool.rst
@@ -0,0 +1,265 @@
+:orphan:
+
+==========================================
+ crushtool -- CRUSH map manipulation tool
+==========================================
+
+.. program:: crushtool
+
+Synopsis
+========
+
+| **crushtool** ( -d *map* | -c *map.txt* | --build --num_osds *numosds*
+  *layer1* *...* | --test ) [ -o *outfile* ]
+
+
+Description
+===========
+
+**crushtool** is a utility that lets you create, compile, decompile
+and test CRUSH map files.
+
+CRUSH is a pseudo-random data distribution algorithm that efficiently
+maps input values (typically data objects) across a heterogeneous,
+hierarchically structured device map. The algorithm was originally
+described in detail in the following paper (although it has evolved
+some since then):
+
+       http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+
+The tool has four modes of operation.
+
+.. option:: --compile|-c map.txt
+
+   will compile a plaintext map.txt into a binary map file.
+
+.. option:: --decompile|-d map
+
+   will take the compiled map and decompile it into a plaintext source
+   file, suitable for editing.
+
+.. option:: --build --num_osds {num-osds} layer1 ...
+
+   will create map with the given layer structure. See below for a
+   detailed explanation.
+
+.. option:: --test
+
+   will perform a dry run of a CRUSH mapping for a range of input
+   object names. See below for a detailed explanation.
+
+Unlike other Ceph tools, **crushtool** does not accept generic options
+such as **--debug-crush** from the command line. They can, however, be
+provided via the CEPH_ARGS environment variable. For instance, to
+silence all output from the CRUSH subsystem::
+
+    CEPH_ARGS="--debug-crush 0" crushtool ...
+
+
+Running tests with --test
+=========================
+
+The test mode will use the input crush map ( as specified with **-i
+map** ) and perform a dry run of CRUSH mapping or random placement (
+if **--simulate** is set ). On completion, two kinds of reports can be
+created:
+
+1) The **--show-...** option outputs human-readable information
+   on stderr.
+2) The **--output-csv** option creates CSV files that are
+   documented by the **--help-output** option.
+
+.. option:: --show-statistics
+
+   For each rule, displays the mapping of each object. For instance::
+
+       CRUSH rule 1 x 24 [11,6]
+
+   shows that object **24** is mapped to devices **[11,6]** by rule
+   **1**. At the end of the mapping details, a summary of the
+   distribution is displayed. For instance::
+
+       rule 1 (metadata) num_rep 5 result size == 5:	1024/1024
+
+   shows that rule **1** which is named **metadata** successfully
+   mapped **1024** objects to **result size == 5** devices when trying
+   to map them to **num_rep 5** replicas. When it fails to provide the
+   required mapping, presumably because the number of **tries** must
+   be increased, a breakdown of the failures is displayed. For instance::
+
+       rule 1 (metadata) num_rep 10 result size == 8:	4/1024
+       rule 1 (metadata) num_rep 10 result size == 9:	93/1024
+       rule 1 (metadata) num_rep 10 result size == 10:	927/1024
+
+   shows that although **num_rep 10** replicas were required, **4**
+   out of **1024** objects ( **4/1024** ) were mapped to **result size
+   == 8** devices only.
+
+.. option:: --show-bad-mappings
+
+   Displays which object failed to be mapped to the required number of
+   devices. For instance::
+
+     bad mapping rule 1 x 781 num_rep 7 result [8,10,2,11,6,9]
+
+   shows that when rule **1** was required to map **7** devices, it
+   could map only six: **[8,10,2,11,6,9]**.
+
+.. option:: --show-utilization
+
+   Displays the expected and actual utilisation for each device, for
+   each number of replicas. For instance::
+
+     device 0: stored : 951      expected : 853.333
+     device 1: stored : 963      expected : 853.333
+     ...
+
+   shows that device **0** stored **951** objects and was expected to store **853**.
+   Implies **--show-statistics**.
+
+.. option:: --show-utilization-all
+
+   Displays the same as **--show-utilization** but does not suppress
+   output when the weight of a device is zero.
+   Implies **--show-statistics**.
+
+.. option:: --show-choose-tries
+
+   Displays how many attempts were needed to find a device mapping.
+   For instance::
+
+      0:     95224
+      1:      3745
+      2:      2225
+      ..
+
+   shows that **95224** mappings succeeded without retries, **3745**
+   mappings succeeded with one attempt, etc. There are as many rows
+   as the value of the **--set-choose-total-tries** option.
+
+.. option:: --output-csv
+
+   Creates CSV files (in the current directory) containing information
+   documented by **--help-output**. The files are named after the rule
+   used when collecting the statistics. For instance, if the rule
+   'metadata' is used, the CSV files will be::
+
+      metadata-absolute_weights.csv
+      metadata-device_utilization.csv
+      ...
+
+   The first line of the file shortly explains the column layout. For
+   instance::
+
+      metadata-absolute_weights.csv
+      Device ID, Absolute Weight
+      0,1
+      ...
+
+.. option:: --output-name NAME
+
+   Prepend **NAME** to the file names generated when **--output-csv**
+   is specified. For instance **--output-name FOO** will create
+   files::
+
+      FOO-metadata-absolute_weights.csv
+      FOO-metadata-device_utilization.csv
+      ...
+
+The **--set-...** options can be used to modify the tunables of the
+input crush map. The input crush map is modified in
+memory. For example::
+
+      $ crushtool -i mymap --test --show-bad-mappings
+      bad mapping rule 1 x 781 num_rep 7 result [8,10,2,11,6,9]
+
+could be fixed by increasing the **choose-total-tries** as follows::
+
+      $ crushtool -i mymap --test \
+          --show-bad-mappings \
+          --set-choose-total-tries 500
+
+Building a map with --build
+===========================
+
+The build mode will generate hierarchical maps. The first argument
+specifies the number of devices (leaves) in the CRUSH hierarchy. Each
+layer describes how the layer (or devices) preceding it should be
+grouped.
+
+Each layer consists of::
+
+       bucket ( uniform | list | tree | straw ) size
+
+The **bucket** is the type of the buckets in the layer
+(e.g. "rack"). Each bucket name will be built by appending a unique
+number to the **bucket** string (e.g. "rack0", "rack1"...).
+
+The second component is the type of bucket: **straw** should be used
+most of the time.
+
+The third component is the maximum size of the bucket. A size of zero
+means a bucket of infinite capacity.
+
+
+Example
+=======
+
+Suppose we have two rows with two racks each and 20 nodes per rack. Suppose
+each node contains 4 storage devices for Ceph OSD Daemons. This configuration
+allows us to deploy 320 Ceph OSD Daemons. Lets assume a 42U rack with 2U nodes,
+leaving an extra 2U for a rack switch.
+
+To reflect our hierarchy of devices, nodes, racks and rows, we would execute
+the following::
+
+    $ crushtool -o crushmap --build --num_osds 320 \
+           node straw 4 \
+           rack straw 20 \
+           row straw 2 \
+           root straw 0
+    # id	weight	type name	reweight
+    -87	320	root root
+    -85	160		row row0
+    -81	80			rack rack0
+    -1	4				node node0
+    0	1					osd.0	1
+    1	1					osd.1	1
+    2	1					osd.2	1
+    3	1					osd.3	1
+    -2	4				node node1
+    4	1					osd.4	1
+    5	1					osd.5	1
+    ...
+
+CRUSH rulesets are created so the generated crushmap can be
+tested. They are the same rulesets as the one created by default when
+creating a new Ceph cluster. They can be further edited with::
+
+       # decompile
+       crushtool -d crushmap -o map.txt
+
+       # edit
+       emacs map.txt
+
+       # recompile
+       crushtool -c map.txt -o crushmap
+
+
+Availability
+============
+
+**crushtool** is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`osdmaptool <osdmaptool>`\(8)
+
+Authors
+=======
+
+John Wilkins, Sage Weil, Loic Dachary
diff --git a/doc/man/8/librados-config.rst b/doc/man/8/librados-config.rst
new file mode 100644
index 0000000..940e8c2
--- /dev/null
+++ b/doc/man/8/librados-config.rst
@@ -0,0 +1,46 @@
+:orphan:
+
+=======================================================
+ librados-config -- display information about librados
+=======================================================
+
+.. program:: librados-config
+
+Synopsis
+========
+
+| **librados-config** [ --version ] [ --vernum ]
+
+
+Description
+===========
+
+**librados-config** is a utility that displays information about the
+installed ``librados``.
+
+
+Options
+=======
+
+.. option:: --version
+
+   Display ``librados`` version
+
+.. option:: --vernum
+
+   Display the ``librados`` version code
+
+
+Availability
+============
+
+**librados-config** is part of Ceph, a massively scalable, open-source, distributed storage system.
+Please refer to the Ceph documentation at http://ceph.com/docs for
+more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`rados <rados>`\(8)
diff --git a/doc/man/8/monmaptool.rst b/doc/man/8/monmaptool.rst
new file mode 100644
index 0000000..97d5d40
--- /dev/null
+++ b/doc/man/8/monmaptool.rst
@@ -0,0 +1,107 @@
+:orphan:
+
+==========================================================
+ monmaptool -- ceph monitor cluster map manipulation tool
+==========================================================
+
+.. program:: monmaptool
+
+Synopsis
+========
+
+| **monmaptool** *mapfilename* [ --clobber ] [ --print ] [ --create ]
+  [ --add *ip*:*port* *...* ] [ --rm *ip*:*port* *...* ]
+
+
+Description
+===========
+
+**monmaptool** is a utility to create, view, and modify a monitor
+cluster map for the Ceph distributed storage system. The monitor map
+specifies the only fixed addresses in the Ceph distributed system.
+All other daemons bind to arbitrary addresses and register themselves
+with the monitors.
+
+When creating a map with --create, a new monitor map with a new,
+random UUID will be created. It should be followed by one or more
+monitor addresses.
+
+The default Ceph monitor port is 6789.
+
+
+Options
+=======
+
+.. option:: --print
+
+   will print a plaintext dump of the map, after any modifications are
+   made.
+
+.. option:: --clobber
+
+   will allow monmaptool to overwrite mapfilename if changes are made.
+
+.. option:: --create
+
+   will create a new monitor map with a new UUID (and with it, a new,
+   empty Ceph file system).
+
+.. option:: --generate
+
+   generate a new monmap based on the values on the command line or specified
+   in the ceph configuration.  This is, in order of preference,
+
+      #. ``--monmap filename`` to specify a monmap to load
+      #. ``--mon-host 'host1,ip2'`` to specify a list of hosts or ip addresses
+      #. ``[mon.foo]`` sections containing ``mon addr`` settings in the config
+
+.. option:: --filter-initial-members
+
+   filter the initial monmap by applying the ``mon initial members``
+   setting.  Monitors not present in that list will be removed, and
+   initial members not present in the map will be added with dummy
+   addresses.
+
+.. option:: --add name ip:port
+
+   will add a monitor with the specified ip:port to the map.
+
+.. option:: --rm name
+
+    will remove the monitor with the given name from the map.
+
+.. option:: --fsid uuid
+
+    will set the fsid to the given uuid.  If not specified with --create, a random fsid will be generated.
+
+
+Example
+=======
+
+To create a new map with three monitors (for a fresh Ceph file system)::
+
+        monmaptool  --create  --add  mon.a 192.168.0.10:6789 --add mon.b 192.168.0.11:6789 \
+          --add mon.c 192.168.0.12:6789 --clobber monmap
+
+To display the contents of the map::
+
+        monmaptool --print monmap
+
+To replace one monitor::
+
+        monmaptool --rm mon.a --add mon.a 192.168.0.9:6789 --clobber monmap
+
+
+Availability
+============
+
+**monmaptool** is part of Ceph, a massively scalable, open-source, distributed storage system.  Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`crushtool <crushtool>`\(8)
diff --git a/doc/man/8/mount.ceph.rst b/doc/man/8/mount.ceph.rst
new file mode 100644
index 0000000..c257a70
--- /dev/null
+++ b/doc/man/8/mount.ceph.rst
@@ -0,0 +1,165 @@
+:orphan:
+
+========================================
+ mount.ceph -- mount a ceph file system
+========================================
+
+.. program:: mount.ceph
+
+Synopsis
+========
+
+| **mount.ceph** *monaddr1*\ [,\ *monaddr2*\ ,...]:/[*subdir*] *dir* [
+  -o *options* ]
+
+
+Description
+===========
+
+**mount.ceph** is a simple helper for mounting the Ceph file system on
+a Linux host. It serves to resolve monitor hostname(s) into IP
+addresses and read authentication keys from disk; the Linux kernel
+client component does most of the real work. In fact, it is possible
+to mount a non-authenticated Ceph file system without mount.ceph by
+specifying monitor address(es) by IP::
+
+        mount -t ceph 1.2.3.4:/ mountpoint
+
+Each monitor address monaddr takes the form host[:port]. If the port
+is not specified, the Ceph default of 6789 is assumed.
+
+Multiple monitor addresses can be separated by commas. Only one
+responsible monitor is needed to successfully mount; the client will
+learn about all monitors from any responsive monitor. However, it is a
+good idea to specify more than one in case one happens to be down at
+the time of mount.
+
+A subdirectory subdir may be specified if a subset of the file system
+is to be mounted.
+
+Mount helper application conventions dictate that the first two
+options are device to be mounted and destination path. Options must be
+passed only after these fixed arguments.
+
+
+Options
+=======
+
+:command:`wsize`
+  int, max write size. Default: none (writeback uses smaller of wsize
+  and stripe unit)
+
+:command:`rsize`
+  int (bytes), max readahead, multiple of 1024, Default: 524288
+  (512*1024)
+
+:command:`osdtimeout`
+  int (seconds), Default: 60
+
+:command:`osdkeepalivetimeout`
+  int, Default: 5
+
+:command:`mount_timeout`
+  int (seconds), Default: 60
+
+:command:`osd_idle_ttl`
+  int (seconds), Default: 60
+
+:command:`caps_wanted_delay_min`
+  int, cap release delay, Default: 5
+
+:command:`caps_wanted_delay_max`
+  int, cap release delay, Default: 60
+
+:command:`cap_release_safety`
+  int, Default: calculated
+
+:command:`readdir_max_entries`
+  int, Default: 1024
+
+:command:`readdir_max_bytes`
+  int, Default: 524288 (512*1024)
+
+:command:`write_congestion_kb`
+  int (kb), max writeback in flight. scale with available
+  memory. Default: calculated from available memory
+
+:command:`snapdirname`
+  string, set the name of the hidden snapdir. Default: .snap
+
+:command:`name`
+  RADOS user to authenticate as when using cephx. Default: guest
+
+:command:`secret`
+  secret key for use with cephx. This option is insecure because it exposes
+  the secret on the command line. To avoid this, use the secretfile option.
+
+:command:`secretfile`
+  path to file containing the secret key to use with cephx
+
+:command:`ip`
+  my ip
+
+:command:`noshare`
+  create a new client instance, instead of sharing an existing
+  instance of a client mounting the same cluster
+
+:command:`dirstat`
+  funky `cat dirname` for stats, Default: off
+
+:command:`nodirstat`
+  no funky `cat dirname` for stats
+
+:command:`rbytes`
+  Report the recursive size of the directory contents for st_size on
+  directories.  Default: on
+
+:command:`norbytes`
+  Do not report the recursive size of the directory contents for
+  st_size on directories.
+
+:command:`nocrc`
+  no data crc on writes
+
+:command:`noasyncreaddir`
+  no dcache readdir
+
+
+Examples
+========
+
+Mount the full file system::
+
+        mount.ceph monhost:/ /mnt/foo
+
+If there are multiple monitors::
+
+        mount.ceph monhost1,monhost2,monhost3:/ /mnt/foo
+
+If :doc:`ceph-mon <ceph-mon>`\(8) is running on a non-standard
+port::
+
+        mount.ceph monhost1:7000,monhost2:7000,monhost3:7000:/ /mnt/foo
+
+To mount only part of the namespace::
+
+        mount.ceph monhost1:/some/small/thing /mnt/thing
+
+Assuming mount.ceph(8) is installed properly, it should be
+automatically invoked by mount(8) like so::
+
+        mount -t ceph monhost:/ /mnt/foo
+
+
+Availability
+============
+
+**mount.ceph** is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+See also
+========
+
+:doc:`ceph-fuse <ceph-fuse>`\(8),
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst
new file mode 100644
index 0000000..cf27424
--- /dev/null
+++ b/doc/man/8/osdmaptool.rst
@@ -0,0 +1,77 @@
+:orphan:
+
+======================================================
+ osdmaptool -- ceph osd cluster map manipulation tool
+======================================================
+
+.. program:: osdmaptool
+
+Synopsis
+========
+
+| **osdmaptool** *mapfilename* [--print] [--createsimple *numosd*
+  [--pgbits *bitsperosd* ] ] [--clobber]
+
+
+Description
+===========
+
+**osdmaptool** is a utility that lets you create, view, and manipulate
+OSD cluster maps from the Ceph distributed storage system. Notably, it
+lets you extract the embedded CRUSH map or import a new CRUSH map.
+
+
+Options
+=======
+
+.. option:: --print
+
+   will simply make the tool print a plaintext dump of the map, after
+   any modifications are made.
+
+.. option:: --clobber
+
+   will allow osdmaptool to overwrite mapfilename if changes are made.
+
+.. option:: --import-crush mapfile
+
+   will load the CRUSH map from mapfile and embed it in the OSD map.
+
+.. option:: --export-crush mapfile
+
+   will extract the CRUSH map from the OSD map and write it to
+   mapfile.
+
+.. option:: --createsimple numosd [--pgbits bitsperosd]
+
+   will create a relatively generic OSD map with the numosd devices.
+   If --pgbits is specified, the initial placement group counts will
+   be set with bitsperosd bits per OSD. That is, the pg_num map
+   attribute will be set to numosd shifted by bitsperosd.
+
+
+Example
+=======
+
+To create a simple map with 16 devices::
+
+        osdmaptool --createsimple 16 osdmap --clobber
+
+To view the result::
+
+        osdmaptool --print osdmap
+
+
+Availability
+============
+
+**osdmaptool** is part of Ceph, a massively scalable, open-source, distributed storage system.  Please
+refer to the Ceph documentation at http://ceph.com/docs for more
+information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`crushtool <crushtool>`\(8)
diff --git a/doc/man/8/rados.rst b/doc/man/8/rados.rst
new file mode 100644
index 0000000..ce8a803
--- /dev/null
+++ b/doc/man/8/rados.rst
@@ -0,0 +1,181 @@
+:orphan:
+
+=======================================
+ rados -- rados object storage utility
+=======================================
+
+.. program:: rados
+
+Synopsis
+========
+
+| **rados** [ -m *monaddr* ] [ mkpool | rmpool *foo* ] [ -p | --pool
+  *pool* ] [ -s | --snap *snap* ] [ -i *infile* ] [ -o *outfile* ]
+  *command* ...
+
+
+Description
+===========
+
+**rados** is a utility for interacting with a Ceph object storage
+cluster (RADOS), part of the Ceph distributed storage system.
+
+
+Options
+=======
+
+.. option:: -p pool, --pool pool
+
+   Interact with the given pool. Required by most commands.
+
+.. option:: -s snap, --snap snap
+
+   Read from the given pool snapshot. Valid for all pool-specific read operations.
+
+.. option:: -i infile
+
+   will specify an input file to be passed along as a payload with the
+   command to the monitor cluster. This is only used for specific
+   monitor commands.
+
+.. option:: -o outfile
+
+   will write any payload returned by the monitor cluster with its
+   reply to outfile. Only specific monitor commands (e.g. osd getmap)
+   return a payload.
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use ceph.conf configuration file instead of the default
+   /etc/ceph/ceph.conf to determine monitor addresses during startup.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through ceph.conf).
+
+.. option:: -b block_size
+
+  Set the block size for put/get ops and for write benchmarking.
+
+.. option:: --striper
+
+   Uses the striping API of rados rather than the default one.
+   Available for stat, get, put, truncate, rm, ls and all xattr related operations
+
+
+Global commands
+===============
+
+:command:`lspools`
+  List object pools
+
+:command:`df`
+  Show utilization statistics, including disk usage (bytes) and object
+  counts, over the entire system and broken down by pool.
+
+:command:`mkpool` *foo*
+  Create a pool with name foo.
+
+:command:`rmpool` *foo* [ *foo* --yes-i-really-really-mean-it ]
+  Delete the pool foo (and all its data)
+
+
+Pool specific commands
+======================
+
+:command:`get` *name* *outfile*
+  Read object name from the cluster and write it to outfile.
+
+:command:`put` *name* *infile*
+  Write object name to the cluster with contents from infile.
+
+:command:`rm` *name*
+  Remove object name.
+
+:command:`ls` *outfile*
+  List objects in given pool and write to outfile.
+
+:command:`lssnap`
+  List snapshots for given pool.
+
+:command:`clonedata` *srcname* *dstname* --object-locator *key*
+  Clone object byte data from *srcname* to *dstname*.  Both objects must be stored with the locator key *key* (usually either *srcname* or *dstname*).  Object attributes and omap keys are not copied or cloned.
+
+:command:`mksnap` *foo*
+  Create pool snapshot named *foo*.
+
+:command:`rmsnap` *foo*
+  Remove pool snapshot named *foo*.
+
+:command:`bench` *seconds* *mode* [ -b *objsize* ] [ -t *threads* ]
+  Benchmark for *seconds*. The mode can be *write*, *seq*, or
+  *rand*. *seq* and *rand* are read benchmarks, either
+  sequential or random. Before running one of the reading benchmarks,
+  run a write benchmark with the *--no-cleanup* option. The default
+  object size is 4 MB, and the default number of simulated threads
+  (parallel writes) is 16.
+  Note: -b *objsize* option is valid only in *write* mode.
+
+:command:`cleanup`
+
+:command:`listomapkeys` *name*
+  List all the keys stored in the object map of object name.
+
+:command:`listomapvals` *name*
+  List all key/value pairs stored in the object map of object name.
+  The values are dumped in hexadecimal.
+
+:command:`getomapval` *name* *key*
+  Dump the hexadecimal value of key in the object map of object name.
+
+:command:`setomapval` *name* *key* *value*
+  Set the value of key in the object map of object name.
+
+:command:`rmomapkey` *name* *key*
+  Remove key from the object map of object name.
+
+:command:`getomapheader` *name*
+  Dump the hexadecimal value of the object map header of object name.
+
+:command:`setomapheader` *name* *value*
+  Set the value of the object map header of object name.
+
+Examples
+========
+
+To view cluster utilization::
+
+       rados df
+
+To get a list of objects in pool foo sent to stdout::
+
+       rados -p foo ls -
+
+To write an object::
+
+       rados -p foo put myobject blah.txt
+
+To create a snapshot::
+
+       rados -p foo mksnap mysnap
+
+To delete the object::
+
+       rados -p foo rm myobject
+
+To read a previously snapshotted version of an object::
+
+       rados -p foo -s mysnap get myobject blah.txt.old
+
+
+Availability
+============
+
+**rados** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8)
diff --git a/doc/man/8/radosgw-admin.rst b/doc/man/8/radosgw-admin.rst
new file mode 100644
index 0000000..f0c19f4
--- /dev/null
+++ b/doc/man/8/radosgw-admin.rst
@@ -0,0 +1,463 @@
+:orphan:
+
+=================================================================
+ radosgw-admin -- rados REST gateway user administration utility
+=================================================================
+
+.. program:: radosgw-admin
+
+Synopsis
+========
+
+| **radosgw-admin** *command* [ *options* *...* ]
+
+
+Description
+===========
+
+:program:`radosgw-admin` is a RADOS gateway user administration utility. It
+allows creating and modifying users.
+
+
+Commands
+========
+
+:program:`radosgw-admin` utility uses many commands for administration purpose
+which are as follows:
+
+:command:`user create`
+  Create a new user.
+
+:command:`user modify`
+  Modify a user.
+
+:command:`user info`
+  Display information of a user, and any potentially available
+  subusers and keys.
+
+:command:`user rm`
+  Remove a user.
+
+:command:`user suspend`
+  Suspend a user.
+
+:command:`user enable`
+  Re-enable user after suspension.
+
+:command:`user check`
+  Check user info.
+
+:command:`user stats`
+  Show user stats as accounted by quota subsystem.
+
+:command:`caps add`
+  Add user capabilities.
+
+:command:`caps rm`
+  Remove user capabilities.
+
+:command:`subuser create`
+  Create a new subuser (primarily useful for clients using the Swift API).
+
+:command:`subuser modify`
+  Modify a subuser.
+
+:command:`subuser rm`
+  Remove a subuser.
+
+:command:`key create`
+  Create access key.
+
+:command:`key rm`
+  Remove access key.
+
+:command:`bucket list`
+  List all buckets.
+
+:command:`bucket link`
+  Link bucket to specified user.
+
+:command:`bucket unlink`
+  Unlink bucket from specified user.
+
+:command:`bucket stats`
+  Returns bucket statistics.
+
+:command:`bucket rm`
+  Remove a bucket.
+
+:command:`bucket check`
+  Check bucket index.
+
+:command:`object rm`
+  Remove an object.
+
+:command:`object unlink`
+  Unlink object from bucket index.
+
+:command:`quota set`
+  Set quota params.
+
+:command:`quota enable`
+  Enable quota.
+
+:command:`quota disable`
+  Disable quota.
+
+:command:`region get`
+  Show region info.
+
+:command:`regions list`
+  List all regions set on this cluster.
+
+:command:`region set`
+  Set region info (requires infile).
+
+:command:`region default`
+  Set default region.
+
+:command:`region-map get`
+  Show region-map.
+
+:command:`region-map set`
+  Set region-map (requires infile).
+
+:command:`zone get`
+  Show zone cluster params.
+
+:command:`zone set`
+  Set zone cluster params (requires infile).
+
+:command:`zone list`
+  List all zones set on this cluster.
+
+:command:`pool add`
+  Add an existing pool for data placement.
+
+:command:`pool rm`
+  Remove an existing pool from data placement set.
+
+:command:`pools list`
+  List placement active set.
+
+:command:`policy`
+  Display bucket/object policy.
+
+:command:`log list`
+  List log objects.
+
+:command:`log show`
+  Dump a log from specific object or (bucket + date + bucket-id).
+
+:command:`log rm`
+  Remove log object.
+
+:command:`usage show`
+  Show the usage information (with optional user and date range).
+
+:command:`usage trim`
+  Trim usage information (with optional user and date range).
+
+:command:`temp remove`
+  Remove temporary objects that were created up to specified date
+  (and optional time).
+
+:command:`gc list`
+  Dump expired garbage collection objects (specify --include-all to list all
+  entries, including unexpired).
+
+:command:`gc process`
+  Manually process garbage.
+
+:command:`metadata get`
+  Get metadata info.
+
+:command:`metadata put`
+  Put metadata info.
+
+:command:`metadata rm`
+  Remove metadata info.
+
+:command:`metadata list`
+  List metadata info.
+
+:command:`mdlog list`
+  List metadata log.
+
+:command:`mdlog trim`
+  Trim metadata log.
+
+:command:`bilog list`
+  List bucket index log.
+
+:command:`bilog trim`
+  Trim bucket index log (use start-marker, end-marker).
+
+:command:`datalog list`
+  List data log.
+
+:command:`datalog trim`
+  Trim data log.
+
+:command:`opstate list`
+  List stateful operations entries (use client_id, op_id, object).
+
+:command:`opstate set`
+  Set state on an entry (use client_id, op_id, object, state).
+
+:command:`opstate renew`
+  Renew state on an entry (use client_id, op_id, object).
+
+:command:`opstate rm`
+  Remove entry (use client_id, op_id, object).
+
+:command:`replicalog get`
+  Get replica metadata log entry.
+
+:command:`replicalog delete`
+  Delete replica metadata log entry.
+
+
+Options
+=======
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use ``ceph.conf`` configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during
+   startup.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through ceph.conf).
+
+.. option:: --uid=uid
+
+   The radosgw user ID.
+
+.. option:: --subuser=<name>
+
+	Name of the subuser.
+
+.. option:: --email=email
+
+   The e-mail address of the user.
+
+.. option:: --display-name=name
+
+   Configure the display name of the user.
+
+.. option:: --access-key=<key>
+
+	S3 access key.
+
+.. option:: --gen-access-key
+
+	Generate random access key (for S3).
+
+.. option:: --secret=secret
+
+   The secret associated with a given key.
+
+.. option:: --gen-secret
+
+	Generate random secret key.
+
+.. option:: --key-type=<type>
+
+	key type, options are: swift, S3.
+
+.. option:: --temp-url-key[-2]=<key>
+
+	Temporary url key.
+
+.. option:: --system
+
+	Set the system flag on the user.
+
+.. option:: --bucket=bucket
+
+   Specify the bucket name.
+
+.. option:: --object=object
+
+   Specify the object name.
+
+.. option:: --date=yyyy-mm-dd
+
+   The date needed for some commands.
+
+.. option:: --start-date=yyyy-mm-dd
+
+   The start date needed for some commands.
+
+.. option:: --end-date=yyyy-mm-dd
+
+   The end date needed for some commands.
+
+.. option:: --shard-id=<shard-id>
+
+	Optional for mdlog list. Required for ``mdlog trim``,
+	``replica mdlog get/delete``, ``replica datalog get/delete``.
+
+.. option:: --auth-uid=auid
+
+   The librados auid.
+
+.. option:: --purge-data
+
+   Remove user data before user removal.
+
+.. option:: --purge-keys
+
+	When specified, subuser removal will also purge all the subuser keys.
+   
+.. option:: --purge-objects
+
+   Remove all objects before bucket removal.
+
+.. option:: --metadata-key=<key>
+
+	Key to retrieve metadata from with ``metadata get``.
+
+.. option:: --rgw-region=<region>
+
+	Region in which radosgw is running.
+
+.. option:: --rgw-zone=<zone>
+
+	Zone in which radosgw is running.
+
+.. option:: --fix
+
+	Besides checking bucket index, will also fix it.
+
+.. option:: --check-objects
+
+	bucket check: Rebuilds bucket index according to actual objects state.
+
+.. option:: --format=<format>
+
+	Specify output format for certain operations: xml, json.
+
+.. option:: --sync-stats
+
+	Option to 'user stats', update user stats with current stats reported by
+	user's buckets indexes.
+
+.. option:: --show-log-entries=<flag>
+
+	Enable/disable dump of log entries on log show.
+
+.. option:: --show-log-sum=<flag>
+
+	Enable/disable dump of log summation on log show.
+
+.. option:: --skip-zero-entries
+
+	Log show only dumps entries that don't have zero value in one of the numeric
+	fields.
+
+.. option:: --infile
+
+	Specify a file to read in when setting data.
+
+.. option:: --state=<state string>
+
+	Specify a state for the opstate set command.
+
+.. option:: --replica-log-type
+
+	Replica log type (metadata, data, bucket), required for replica log
+	operations.
+
+.. option:: --categories=<list>
+
+	Comma separated list of categories, used in usage show.
+
+.. option:: --caps=<caps>
+
+	List of caps (e.g., "usage=read, write; user=read").
+
+.. option:: --yes-i-really-mean-it
+
+	Required for certain operations.
+
+
+Quota Options
+=============
+
+.. option:: --max-objects
+
+	Specify max objects (negative value to disable).
+
+.. option:: --max-size
+
+	Specify max size (in bytes, negative value to disable).
+
+.. option:: --quota-scope
+
+	Scope of quota (bucket, user).
+
+
+Examples
+========
+
+Generate a new user::
+
+        $ radosgw-admin user create --display-name="johnny rotten" --uid=johnny
+        { "user_id": "johnny",
+          "rados_uid": 0,
+          "display_name": "johnny rotten",
+          "email": "",
+          "suspended": 0,
+          "subusers": [],
+          "keys": [
+                { "user": "johnny",
+                  "access_key": "TCICW53D9BQ2VGC46I44",
+                  "secret_key": "tfm9aHMI8X76L3UdgE+ZQaJag1vJQmE6HDb5Lbrz"}],
+          "swift_keys": []}
+
+Remove a user::
+
+        $ radosgw-admin user rm --uid=johnny
+        
+Remove a user and all associated buckets with their contents::
+
+        $ radosgw-admin user rm --uid=johnny --purge-data
+
+Remove a bucket::
+
+        $ radosgw-admin bucket unlink --bucket=foo
+
+Show the logs of a bucket from April 1st, 2012::
+
+        $ radosgw-admin log show --bucket=foo --date=2012-04-01
+
+Show usage information for user from March 1st to (but not including) April 1st, 2012::
+
+        $ radosgw-admin usage show --uid=johnny \
+                        --start-date=2012-03-01 --end-date=2012-04-01
+
+Show only summary of usage information for all users::
+
+        $ radosgw-admin usage show --show-log-entries=false
+
+Trim usage information for user until (but not including) April 1st, 2012::
+
+        $ radosgw-admin usage trim --uid=johnny --end-date=2012-04-01
+
+
+Availability
+============
+
+:program:`radosgw-admin` is part of Ceph, a massively scalable, open-source,
+distributed storage system.  Please refer to the Ceph documentation at
+http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`radosgw <radosgw>`\(8)
diff --git a/doc/man/8/radosgw.rst b/doc/man/8/radosgw.rst
new file mode 100644
index 0000000..f57b346
--- /dev/null
+++ b/doc/man/8/radosgw.rst
@@ -0,0 +1,256 @@
+:orphan:
+
+===============================
+ radosgw -- rados REST gateway
+===============================
+
+.. program:: radosgw
+
+Synopsis
+========
+
+| **radosgw**
+
+
+Description
+===========
+
+:program:`radosgw` is an HTTP REST gateway for the RADOS object store, a part
+of the Ceph distributed storage system. It is implemented as a FastCGI
+module using libfcgi, and can be used in conjunction with any FastCGI
+capable web server.
+
+
+Options
+=======
+
+.. option:: -c ceph.conf, --conf=ceph.conf
+
+   Use ``ceph.conf`` configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through ``ceph.conf``).
+
+.. option:: -i ID, --id ID
+
+   Set the ID portion of name for radosgw
+
+.. option:: -n TYPE.ID, --name TYPE.ID
+
+   Set the rados user name for the gateway (e.g. client.radosgw.gateway)
+
+.. option:: --cluster NAME
+
+   Set the cluster name (default: ceph)
+
+.. option:: -d
+
+   Run in foreground, log to stderr
+
+.. option:: -f
+
+   Run in foreground, log to usual location
+
+.. option:: --rgw-socket-path=path
+
+   Specify a unix domain socket path.
+
+.. option:: --rgw-region=region
+
+   The region where radosgw runs
+
+.. option:: --rgw-zone=zone
+
+   The zone where radosgw runs
+
+
+Configuration
+=============
+
+Earlier RADOS Gateway had to be configured with ``Apache`` and ``mod_fastcgi``.
+Now, ``mod_proxy_fcgi`` module is used instead of ``mod_fastcgi``.
+``mod_proxy_fcgi`` works differently than a traditional FastCGI module. This
+module requires the service of ``mod_proxy`` which provides support for the
+FastCGI protocol. So, to be able to handle FastCGI protocol, both ``mod_proxy``
+and ``mod_proxy_fcgi`` have to be present in the server. Unlike ``mod_fastcgi``,
+``mod_proxy_fcgi`` cannot start the application process. Some platforms have
+``fcgistarter`` for that purpose. However, external launching of application
+or process management may be available in the FastCGI application framework
+in use.
+
+``Apache`` can be configured in a way that enables ``mod_proxy_fcgi`` to be used
+with localhost tcp or through unix domain socket. ``mod_proxy_fcgi`` that doesn't
+support unix domain socket such as the ones in Apache 2.2 and earlier versions of
+Apache 2.4, needs to be configured for use with localhost tcp. Later versions of
+Apache like Apache 2.4.9 or later support unix domain socket and as such they
+allow for the configuration with unix domain socket instead of localhost tcp.
+
+The following steps show the configuration in Ceph's configuration file i.e,
+``/etc/ceph/ceph.conf`` and the gateway configuration file i.e,
+``/etc/httpd/conf.d/rgw.conf`` (RPM-based distros) or
+``/etc/apache2/conf-available/rgw.conf`` (Debian-based distros) with localhost
+tcp and through unix domain socket:
+
+#. For distros with Apache 2.2 and early versions of Apache 2.4 that use
+   localhost TCP and do not support Unix Domain Socket, append the following
+   contents to ``/etc/ceph/ceph.conf``::
+
+	[client.radosgw.gateway]
+	host = {hostname}
+	keyring = /etc/ceph/ceph.client.radosgw.keyring
+	rgw socket path = ""
+	log file = /var/log/ceph/client.radosgw.gateway.log
+	rgw frontends = fastcgi socket_port=9000 socket_host=0.0.0.0
+	rgw print continue = false
+
+#. Add the following content in the gateway configuration file:
+
+   For Debian/Ubuntu add in ``/etc/apache2/conf-available/rgw.conf``::
+
+		<VirtualHost *:80>
+		ServerName localhost
+		DocumentRoot /var/www/html
+
+		ErrorLog /var/log/apache2/rgw_error.log
+		CustomLog /var/log/apache2/rgw_access.log combined
+
+		# LogLevel debug
+
+		RewriteEngine On
+
+		RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+
+		SetEnv proxy-nokeepalive 1
+
+		ProxyPass / fcgi://localhost:9000/
+
+		</VirtualHost>
+
+   For CentOS/RHEL add in ``/etc/httpd/conf.d/rgw.conf``::
+
+		<VirtualHost *:80>
+		ServerName localhost
+		DocumentRoot /var/www/html
+
+		ErrorLog /var/log/httpd/rgw_error.log
+		CustomLog /var/log/httpd/rgw_access.log combined
+
+		# LogLevel debug
+
+		RewriteEngine On
+
+		RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+
+		SetEnv proxy-nokeepalive 1
+
+		ProxyPass / fcgi://localhost:9000/
+
+		</VirtualHost>
+
+#. For distros with Apache 2.4.9 or later that support Unix Domain Socket,
+   append the following configuration to ``/etc/ceph/ceph.conf``::
+
+	[client.radosgw.gateway]
+	host = {hostname}
+	keyring = /etc/ceph/ceph.client.radosgw.keyring
+	rgw socket path = /var/run/ceph/ceph.radosgw.gateway.fastcgi.sock
+	log file = /var/log/ceph/client.radosgw.gateway.log
+	rgw print continue = false
+
+#. Add the following content in the gateway configuration file:
+
+   For CentOS/RHEL add in ``/etc/httpd/conf.d/rgw.conf``::
+
+		<VirtualHost *:80>
+		ServerName localhost
+		DocumentRoot /var/www/html
+
+		ErrorLog /var/log/httpd/rgw_error.log
+		CustomLog /var/log/httpd/rgw_access.log combined
+
+		# LogLevel debug
+
+		RewriteEngine On
+
+		RewriteRule .* - [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+
+		SetEnv proxy-nokeepalive 1
+
+		ProxyPass / unix:///var/run/ceph/ceph.radosgw.gateway.fastcgi.sock|fcgi://localhost:9000/
+
+		</VirtualHost>
+
+   The latest version of Ubuntu i.e, 14.04 ships with ``Apache 2.4.7`` that
+   does not have Unix Domain Socket support in it and as such it has to be
+   configured with localhost tcp. The Unix Domain Socket support is available in
+   ``Apache 2.4.9`` and later versions. A bug has been filed to backport the UDS
+   support to ``Apache 2.4.7`` for ``Ubuntu 14.04``.
+   See: https://bugs.launchpad.net/ubuntu/+source/apache2/+bug/1411030
+
+#. Generate a key for radosgw to use for authentication with the cluster. ::
+
+	ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway
+	ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway
+
+#. Add the key to the auth entries. ::
+
+	ceph auth add client.radosgw.gateway --in-file=keyring.radosgw.gateway
+
+#. Start Apache and radosgw.
+
+   Debian/Ubuntu::
+
+		sudo /etc/init.d/apache2 start
+		sudo /etc/init.d/radosgw start
+
+   CentOS/RHEL::
+
+		sudo apachectl start
+		sudo /etc/init.d/ceph-radosgw start
+
+Usage Logging
+=============
+
+:program:`radosgw` maintains an asynchronous usage log. It accumulates
+statistics about user operations and flushes it periodically. The
+logs can be accessed and managed through :program:`radosgw-admin`.
+
+The information that is being logged contains total data transfer,
+total operations, and total successful operations. The data is being
+accounted in an hourly resolution under the bucket owner, unless the
+operation was done on the service (e.g., when listing a bucket) in
+which case it is accounted under the operating user.
+
+Following is an example configuration::
+
+        [client.radosgw.gateway]
+            rgw enable usage log = true
+            rgw usage log tick interval = 30
+            rgw usage log flush threshold = 1024
+            rgw usage max shards = 32
+            rgw usage max user shards = 1
+
+
+The total number of shards determines how many total objects hold the
+usage log information. The per-user number of shards specify how many
+objects hold usage information for a single user. The tick interval
+configures the number of seconds between log flushes, and the flush
+threshold specify how many entries can be kept before resorting to
+synchronous flush.
+
+
+Availability
+============
+
+:program:`radosgw` is part of Ceph, a massively scalable, open-source, distributed
+storage system. Please refer to the Ceph documentation at http://ceph.com/docs for
+more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`radosgw-admin <radosgw-admin>`\(8)
diff --git a/doc/man/8/rbd-fuse.rst b/doc/man/8/rbd-fuse.rst
new file mode 100644
index 0000000..394bdba
--- /dev/null
+++ b/doc/man/8/rbd-fuse.rst
@@ -0,0 +1,56 @@
+:orphan:
+
+=======================================
+ rbd-fuse -- expose rbd images as files
+=======================================
+
+.. program:: rbd-fuse
+
+Synopsis
+========
+
+| **rbd-fuse** [ -p pool ] [-c conffile] *mountpoint* [ *fuse options* ]
+
+
+Description
+===========
+
+**rbd-fuse** is a FUSE (File system in USErspace) client for RADOS
+block device (rbd) images.  Given a pool containing rbd images,
+it will mount a userspace filesystem allowing access to those images
+as regular files at **mountpoint**.
+
+The file system can be unmounted with::
+
+        fusermount -u mountpoint
+
+or by sending ``SIGINT`` to the ``rbd-fuse`` process.
+
+
+Options
+=======
+
+Any options not recognized by rbd-fuse will be passed on to libfuse.
+
+.. option:: -c ceph.conf
+
+   Use *ceph.conf* configuration file instead of the default
+   ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup.
+
+.. option:: -p pool
+
+   Use *pool* as the pool to search for rbd images.  Default is ``rbd``.
+
+
+Availability
+============
+
+**rbd-fuse** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+fusermount(8),
+:doc:`rbd <rbd>`\(8)
diff --git a/doc/man/8/rbd-replay-many.rst b/doc/man/8/rbd-replay-many.rst
new file mode 100644
index 0000000..5fb9349
--- /dev/null
+++ b/doc/man/8/rbd-replay-many.rst
@@ -0,0 +1,73 @@
+:orphan:
+
+==================================================================================
+ rbd-replay-many -- replay a rados block device (RBD) workload on several clients
+==================================================================================
+
+.. program:: rbd-replay-many
+
+Synopsis
+========
+
+| **rbd-replay-many** [ *options* ] --original-image *name* *host1* [ *host2* [ ... ] ] -- *rbd_replay_args*
+
+
+Description
+===========
+
+**rbd-replay-many** is a utility for replaying a rados block device (RBD) workload on several clients.
+Although all clients use the same workload, they replay against separate images.
+This matches normal use of librbd, where each original client is a VM with its own image.
+
+Configuration and replay files are not automatically copied to clients.
+Replay images must already exist.
+
+
+Options
+=======
+
+.. option:: --original-image name
+
+   Specifies the name (and snap) of the originally traced image.
+   Necessary for correct name mapping.
+
+.. option:: --image-prefix prefix
+
+   Prefix of image names to replay against.
+   Specifying --image-prefix=foo results in clients replaying against foo-0, foo-1, etc.
+   Defaults to the original image name.
+
+.. option:: --exec program
+
+   Path to the rbd-replay executable.
+
+.. option:: --delay seconds
+
+   Delay between starting each client.  Defaults to 0.
+
+
+Examples
+========
+
+Typical usage::
+
+       rbd-replay-many host-0 host-1 --original-image=image -- -c ceph.conf replay.bin
+
+This results in the following commands being executed::
+
+       ssh host-0 'rbd-replay' --map-image 'image=image-0' -c ceph.conf replay.bin
+       ssh host-1 'rbd-replay' --map-image 'image=image-1' -c ceph.conf replay.bin
+
+
+Availability
+============
+
+**rbd-replay-many** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`rbd-replay <rbd-replay>`\(8),
+:doc:`rbd <rbd>`\(8)
diff --git a/doc/man/8/rbd-replay-prep.rst b/doc/man/8/rbd-replay-prep.rst
new file mode 100644
index 0000000..abb08de
--- /dev/null
+++ b/doc/man/8/rbd-replay-prep.rst
@@ -0,0 +1,55 @@
+:orphan:
+
+====================================================================================
+ rbd-replay-prep -- prepare captured rados block device (RBD) workloads for replay
+====================================================================================
+
+.. program:: rbd-replay-prep
+
+Synopsis
+========
+
+| **rbd-replay-prep** [ --window *seconds* ] [ --anonymize ] *trace_dir* *replay_file*
+
+
+Description
+===========
+
+**rbd-replay-prep** processes raw rados block device (RBD) traces to prepare them for **rbd-replay**.
+
+
+Options
+=======
+
+.. option:: --window seconds
+
+   Requests further apart than 'seconds' seconds are assumed to be independent.
+
+.. option:: --anonymize
+
+   Anonymizes image and snap names.
+
+.. option:: --verbose
+
+   Print all processed events to console.
+
+Examples
+========
+
+To prepare workload1-trace for replay::
+
+       rbd-replay-prep workload1-trace/ust/uid/1000/64-bit workload1
+
+
+Availability
+============
+
+**rbd-replay-prep** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`rbd-replay <rbd-replay>`\(8),
+:doc:`rbd <rbd>`\(8)
diff --git a/doc/man/8/rbd-replay.rst b/doc/man/8/rbd-replay.rst
new file mode 100644
index 0000000..74b8018
--- /dev/null
+++ b/doc/man/8/rbd-replay.rst
@@ -0,0 +1,78 @@
+:orphan:
+
+=========================================================
+ rbd-replay -- replay rados block device (RBD) workloads
+=========================================================
+
+.. program:: rbd-replay
+
+Synopsis
+========
+
+| **rbd-replay** [ *options* ] *replay_file*
+
+
+Description
+===========
+
+**rbd-replay** is a utility for replaying rados block device (RBD) workloads.
+
+
+Options
+=======
+
+.. option:: -c ceph.conf, --conf ceph.conf
+
+   Use ceph.conf configuration file instead of the default /etc/ceph/ceph.conf to
+   determine monitor addresses during startup.
+
+.. option:: -p pool, --pool pool
+
+   Interact with the given pool.  Defaults to 'rbd'.
+
+.. option:: --latency-multiplier
+
+   Multiplies inter-request latencies.  Default: 1.
+
+.. option:: --read-only
+
+   Only replay non-destructive requests.
+
+.. option:: --map-image rule
+
+   Add a rule to map image names in the trace to image names in the replay cluster.
+   A rule of image1@snap1=image2@snap2 would map snap1 of image1 to snap2 of image2.
+
+.. option:: --dump-perf-counters
+
+   **Experimental**
+   Dump performance counters to standard out before an image is closed.
+   Performance counters may be dumped multiple times if multiple images are closed,
+   or if the same image is opened and closed multiple times.
+   Performance counters and their meaning may change between versions.
+
+
+Examples
+========
+
+To replay workload1 as fast as possible::
+
+       rbd-replay --latency-multiplier=0 workload1
+
+To replay workload1 but use test_image instead of prod_image::
+
+       rbd-replay --map-image=prod_image=test_image workload1
+
+
+Availability
+============
+
+**rbd-replay** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`rbd-replay-prep <rbd-replay-prep>`\(8),
+:doc:`rbd <rbd>`\(8)
diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst
new file mode 100644
index 0000000..d7eb72d
--- /dev/null
+++ b/doc/man/8/rbd.rst
@@ -0,0 +1,522 @@
+:orphan:
+
+===============================================
+ rbd -- manage rados block device (RBD) images
+===============================================
+
+.. program:: rbd
+
+Synopsis
+========
+
+| **rbd** [ -c *ceph.conf* ] [ -m *monaddr* ] [ -p | --pool *pool* ] [
+  --size *size* ] [ --order *bits* ] [ *command* ... ]
+
+
+Description
+===========
+
+**rbd** is a utility for manipulating rados block device (RBD) images,
+used by the Linux rbd driver and the rbd storage driver for Qemu/KVM.
+RBD images are simple block devices that are striped over objects and
+stored in a RADOS object store. The size of the objects the image is
+striped over must be a power of two.
+
+
+Options
+=======
+
+.. option:: -c ceph.conf, --conf ceph.conf
+
+   Use ceph.conf configuration file instead of the default /etc/ceph/ceph.conf to
+   determine monitor addresses during startup.
+
+.. option:: -m monaddress[:port]
+
+   Connect to specified monitor (instead of looking through ceph.conf).
+
+.. option:: -p pool-name, --pool pool-name
+
+   Interact with the given pool. Required by most commands.
+
+.. option:: --no-progress
+
+   Do not output progress information (goes to standard error by
+   default for some commands).
+
+
+Parameters
+==========
+
+.. option:: --image-format format-id
+
+   Specifies which object layout to use. The default is 1.
+
+   * format 1 - Use the original format for a new rbd image. This format is
+     understood by all versions of librbd and the kernel rbd module, but
+     does not support newer features like cloning.
+
+   * format 2 - Use the second rbd format, which is supported by
+     librbd and kernel since version 3.11 (except for striping). This adds
+     support for cloning and is more easily extensible to allow more
+     features in the future.
+
+.. option:: --size size-in-M/G/T
+
+   Specifies the size (in M/G/T) of the new rbd image.
+
+.. option:: --order bits
+
+   Specifies the object size expressed as a number of bits, such that
+   the object size is ``1 << order``. The default is 22 (4 MB).
+
+.. option:: --stripe-unit size-in-B/K/M
+
+   Specifies the stripe unit size in B/K/M.  See striping section (below) for more details.
+
+.. option:: --stripe-count num
+
+   Specifies the number of objects to stripe over before looping back
+   to the first object.  See striping section (below) for more details.
+
+.. option:: --snap snap
+
+   Specifies the snapshot name for the specific operation.
+
+.. option:: --id username
+
+   Specifies the username (without the ``client.`` prefix) to use with the map command.
+
+.. option:: --keyring filename
+
+   Specifies a keyring file containing a secret for the specified user
+   to use with the map command.  If not specified, the default keyring
+   locations will be searched.
+
+.. option:: --keyfile filename
+
+   Specifies a file containing the secret key of ``--id user`` to use with the map command.
+   This option is overridden by ``--keyring`` if the latter is also specified.
+
+.. option:: --shared lock-tag
+
+   Option for `lock add` that allows multiple clients to lock the
+   same image if they use the same tag. The tag is an arbitrary
+   string. This is useful for situations where an image must
+   be open from more than one client at once, like during
+   live migration of a virtual machine, or for use underneath
+   a clustered filesystem.
+
+.. option:: --format format
+
+   Specifies output formatting (default: plain, json, xml)
+
+.. option:: --pretty-format
+
+   Make json or xml formatted output more human-readable.
+
+.. option:: -o map-options, --options map-options
+
+   Specifies which options to use when mapping an image.  map-options is
+   a comma-separated string of options (similar to mount(8) mount options).
+   See map options section below for more details.
+
+.. option:: --read-only
+
+   Map the image read-only.  Equivalent to -o ro.
+
+.. option:: --image-feature feature-name
+
+   Specifies which RBD format 2 feature should be enabled when creating
+   an image. Multiple features can be enabled by repeating this option
+   multiple times. The following features are supported:
+
+   * layering: layering support
+   * striping: striping v2 support
+   * exclusive-lock: exclusive locking support
+   * object-map: object map support (requires exclusive-lock)
+   * fast-diff: fast diff calculations (requires object-map)
+   * deep-flatten: snapshot flatten support
+
+.. option:: --image-shared
+
+   Specifies that the image will be used concurrently by multiple clients.
+   This will disable features that are dependent upon exclusive ownership
+   of the image.
+
+.. option:: --whole-object
+
+   Specifies that the diff should be limited to the extents of a full object
+   instead of showing intra-object deltas. When the object map feature is
+   enabled on an image, limiting the diff to the object extents will
+   dramatically improve performance since the differences can be computed
+   by examining the in-memory object map instead of querying RADOS for each
+   object within the image.
+
+Commands
+========
+
+.. TODO rst "option" directive seems to require --foo style options, parsing breaks on subcommands.. the args show up as bold too
+
+:command:`ls` [-l | --long] [*pool-name*]
+  Will list all rbd images listed in the rbd_directory object.  With
+  -l, also show snapshots, and use longer-format output including
+  size, parent (if clone), format, etc.
+
+:command:`du` [-p | --pool *pool-name*] [*image-spec* | *snap-spec*]
+  Will calculate the provisioned and actual disk usage of all images and
+  associated snapshots within the specified pool.  It can also be used against
+  individual images and snapshots.
+
+  If the RBD fast-diff feature isn't enabled on images, this operation will
+  require querying the OSDs for every potential object within the image.
+
+:command:`info` *image-spec* | *snap-spec*
+  Will dump information (such as size and order) about a specific rbd image.
+  If image is a clone, information about its parent is also displayed.
+  If a snapshot is specified, whether it is protected is shown as well.
+
+:command:`create` (-s | --size *size-in-M/G/T*) [--image-format *format-id*] [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *image-spec*
+  Will create a new rbd image. You must also specify the size via --size.  The
+  --stripe-unit and --stripe-count arguments are optional, but must be used together.
+
+:command:`clone` [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*] [--image-shared] *parent-snap-spec* *child-image-spec*
+  Will create a clone (copy-on-write child) of the parent snapshot.
+  Object order will be identical to that of the parent image unless
+  specified. Size will be the same as the parent snapshot. The --stripe-unit
+  and --stripe-count arguments are optional, but must be used together.
+
+  The parent snapshot must be protected (see `rbd snap protect`).
+  This requires image format 2.
+
+:command:`flatten` *image-spec*
+  If image is a clone, copy all shared blocks from the parent snapshot and
+  make the child independent of the parent, severing the link between
+  parent snap and child.  The parent snapshot can be unprotected and
+  deleted if it has no further dependent clones.
+
+  This requires image format 2.
+
+:command:`children` *snap-spec*
+  List the clones of the image at the given snapshot. This checks
+  every pool, and outputs the resulting poolname/imagename.
+
+  This requires image format 2.
+
+:command:`resize` (-s | --size *size-in-M/G/T*) [--allow-shrink] *image-spec*
+  Resizes rbd image. The size parameter also needs to be specified.
+  The --allow-shrink option lets the size be reduced.
+
+:command:`rm` *image-spec*
+  Deletes an rbd image (including all data blocks). If the image has
+  snapshots, this fails and nothing is deleted.
+
+:command:`export` (*image-spec* | *snap-spec*) [*dest-path*]
+  Exports image to dest path (use - for stdout).
+
+:command:`import` [--image-format *format-id*] [--order *bits*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *src-path* [*image-spec*]
+  Creates a new image and imports its data from path (use - for
+  stdin).  The import operation will try to create sparse rbd images 
+  if possible.  For import from stdin, the sparsification unit is
+  the data block size of the destination image (1 << order).
+
+  The --stripe-unit and --stripe-count arguments are optional, but must be
+  used together.
+
+:command:`export-diff` [--from-snap *snap-name*] [--whole-object] (*image-spec* | *snap-spec*) *dest-path*
+  Exports an incremental diff for an image to dest path (use - for stdout).  If
+  an initial snapshot is specified, only changes since that snapshot are included; otherwise,
+  any regions of the image that contain data are included.  The end snapshot is specified
+  using the standard --snap option or @snap syntax (see below).  The image diff format includes
+  metadata about image size changes, and the start and end snapshots.  It efficiently represents
+  discarded or 'zero' regions of the image.
+
+:command:`merge-diff` *first-diff-path* *second-diff-path* *merged-diff-path*
+  Merge two continuous incremental diffs of an image into one single diff. The
+  first diff's end snapshot must be equal with the second diff's start snapshot.
+  The first diff could be - for stdin, and merged diff could be - for stdout, which
+  enables multiple diff files to be merged using something like
+  'rbd merge-diff first second - | rbd merge-diff - third result'. Note this command
+  currently only supports the source incremental diff with stripe_count == 1
+
+:command:`import-diff` *src-path* *image-spec*
+  Imports an incremental diff of an image and applies it to the current image.  If the diff
+  was generated relative to a start snapshot, we verify that snapshot already exists before
+  continuing.  If there was an end snapshot we verify it does not already exist before
+  applying the changes, and create the snapshot when we are done.
+
+:command:`diff` [--from-snap *snap-name*] [--whole-object] *image-spec* | *snap-spec*
+  Dump a list of byte extents in the image that have changed since the specified start
+  snapshot, or since the image was created.  Each output line includes the starting offset
+  (in bytes), the length of the region (in bytes), and either 'zero' or 'data' to indicate
+  whether the region is known to be zeros or may contain other data.
+
+:command:`cp` (*src-image-spec* | *src-snap-spec*) *dest-image-spec*
+  Copies the content of a src-image into the newly created dest-image.
+  dest-image will have the same size, order, and image format as src-image.
+
+:command:`mv` *src-image-spec* *dest-image-spec*
+  Renames an image.  Note: rename across pools is not supported.
+
+:command:`image-meta list` *image-spec*
+  Show metadata held on the image. The first column is the key
+  and the second column is the value.
+
+:command:`image-meta get` *image-spec* *key*
+  Get metadata value with the key.
+
+:command:`image-meta set` *image-spec* *key* *value*
+  Set metadata key with the value. They will be displayed in `image-meta list`.
+
+:command:`image-meta remove` *image-spec* *key*
+  Remove metadata key with the value.
+
+:command:`object-map rebuild` *image-spec* | *snap-spec*
+  Rebuilds an invalid object map for the specified image. An image snapshot can be
+  specified to rebuild an invalid object map for a snapshot.
+
+:command:`snap ls` *image-spec*
+  Dumps the list of snapshots inside a specific image.
+
+:command:`snap create` *snap-spec*
+  Creates a new snapshot. Requires the snapshot name parameter specified.
+
+:command:`snap rollback` *snap-spec*
+  Rollback image content to snapshot. This will iterate through the entire blocks
+  array and update the data head content to the snapshotted version.
+
+:command:`snap rm` *snap-spec*
+  Removes the specified snapshot.
+
+:command:`snap purge` *image-spec*
+  Removes all snapshots from an image.
+
+:command:`snap protect` *snap-spec*
+  Protect a snapshot from deletion, so that clones can be made of it
+  (see `rbd clone`).  Snapshots must be protected before clones are made;
+  protection implies that there exist dependent cloned children that
+  refer to this snapshot.  `rbd clone` will fail on a nonprotected
+  snapshot.
+
+  This requires image format 2.
+
+:command:`snap unprotect` *snap-spec*
+  Unprotect a snapshot from deletion (undo `snap protect`).  If cloned
+  children remain, `snap unprotect` fails.  (Note that clones may exist
+  in different pools than the parent snapshot.)
+
+  This requires image format 2.
+
+:command:`map` [-o | --options *map-options* ] [--read-only] *image-spec* | *snap-spec*
+  Maps the specified image to a block device via the rbd kernel module.
+
+:command:`unmap` *image-spec* | *snap-spec* | *device-path*
+  Unmaps the block device that was mapped via the rbd kernel module.
+
+:command:`showmapped`
+  Show the rbd images that are mapped via the rbd kernel module.
+
+:command:`status` *image-spec*
+  Show the status of the image, including which clients have it open.
+
+:command:`feature disable` *image-spec* *feature-name*...
+  Disables the specified feature on the specified image. Multiple features can
+  be specified.
+
+:command:`feature enable` *image-spec* *feature-name*...
+  Enables the specified feature on the specified image. Multiple features can
+  be specified.
+
+:command:`lock list` *image-spec*
+  Show locks held on the image. The first column is the locker
+  to use with the `lock remove` command.
+
+:command:`lock add` [--shared *lock-tag*] *image-spec* *lock-id*
+  Lock an image. The lock-id is an arbitrary name for the user's
+  convenience. By default, this is an exclusive lock, meaning it
+  will fail if the image is already locked. The --shared option
+  changes this behavior. Note that locking does not affect
+  any operation other than adding a lock. It does not
+  protect an image from being deleted.
+
+:command:`lock remove` *image-spec* *lock-id* *locker*
+  Release a lock on an image. The lock id and locker are
+  as output by lock ls.
+
+:command:`bench-write` [--io-size *size-in-B/K/M/G/T*] [--io-threads *num-ios-in-flight*] [--io-total *total-size-to-write-in-B/K/M/G/T*] [--io-pattern seq | rand] *image-spec*
+  Generate a series of writes to the image and measure the write throughput and
+  latency.  Defaults are: --io-size 4096, --io-threads 16, --io-total 1G,
+  --io-pattern seq.
+
+Image and snap specs
+====================
+
+| *image-spec* is [*pool-name*]/*image-name*
+| *snap-spec*  is [*pool-name*]/*image-name*\ @\ *snap-name*
+
+The default for *pool-name* is "rbd".  If an image name contains a slash
+character ('/'), *pool-name* is required.
+
+You may specify each name individually, using --pool, --image and --snap
+options, but this is discouraged in favor of the above spec syntax.
+
+Striping
+========
+
+RBD images are striped over many objects, which are then stored by the
+Ceph distributed object store (RADOS).  As a result, read and write
+requests for the image are distributed across many nodes in the
+cluster, generally preventing any single node from becoming a
+bottleneck when individual images get large or busy.
+
+The striping is controlled by three parameters:
+
+.. option:: order
+
+  The size of objects we stripe over is a power of two, specifically 2^[*order*] bytes.  The default
+  is 22, or 4 MB.
+
+.. option:: stripe_unit
+
+  Each [*stripe_unit*] contiguous bytes are stored adjacently in the same object, before we move on
+  to the next object.
+
+.. option:: stripe_count
+
+  After we write [*stripe_unit*] bytes to [*stripe_count*] objects, we loop back to the initial object
+  and write another stripe, until the object reaches its maximum size (as specified by [*order*]).  At that
+  point, we move on to the next [*stripe_count*] objects.
+
+By default, [*stripe_unit*] is the same as the object size and [*stripe_count*] is 1.  Specifying a different
+[*stripe_unit*] requires that the STRIPINGV2 feature be supported (added in Ceph v0.53) and format 2 images be
+used.
+
+
+Map options
+===========
+
+Most of these options are useful mainly for debugging and benchmarking.  The
+default values are set in the kernel and may therefore depend on the version of
+the running kernel.
+
+libceph (per client instance) options:
+
+* fsid=aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee - FSID that should be assumed by
+  the client.
+
+* ip=a.b.c.d[:p] - IP and, optionally, port the client should use.
+
+* share - Enable sharing of client instances with other mappings (default).
+
+* noshare - Disable sharing of client instances with other mappings.
+
+* crc - Enable CRC32C checksumming for data writes (default).
+
+* nocrc - Disable CRC32C checksumming for data writes.
+
+* cephx_require_signatures - Require cephx message signing (since 3.19,
+  default).
+
+* nocephx_require_signatures - Don't require cephx message signing (since
+  3.19).
+
+* tcp_nodelay - Disable Nagle's algorithm on client sockets (since 4.0,
+  default).
+
+* notcp_nodelay - Enable Nagle's algorithm on client sockets (since 4.0).
+
+* mount_timeout=x - A timeout on various steps in `rbd map` and `rbd unmap`
+  sequences (default is 60 seconds).  In particular, since 4.2 this can be used
+  to ensure that `rbd unmap` eventually times out when there is no network
+  connection to a cluster.
+
+* osdkeepalive=x - OSD keepalive timeout (default is 5 seconds).
+
+* osd_idle_ttl=x - OSD idle TTL (default is 60 seconds).
+
+Mapping (per block device) options:
+
+* rw - Map the image read-write (default).
+
+* ro - Map the image read-only.  Equivalent to --read-only.
+
+* queue_depth=x - queue depth (since 4.2, default is 128 requests).
+
+
+Examples
+========
+
+To create a new rbd image that is 100 GB::
+
+       rbd create mypool/myimage --size 102400
+
+To use a non-default object size (8 MB)::
+
+       rbd create mypool/myimage --size 102400 --order 23
+
+To delete an rbd image (be careful!)::
+
+       rbd rm mypool/myimage
+
+To create a new snapshot::
+
+       rbd snap create mypool/myimage@mysnap
+
+To create a copy-on-write clone of a protected snapshot::
+
+       rbd clone mypool/myimage@mysnap otherpool/cloneimage
+
+To see which clones of a snapshot exist::
+
+       rbd children mypool/myimage@mysnap
+
+To delete a snapshot::
+
+       rbd snap rm mypool/myimage@mysnap
+
+To map an image via the kernel with cephx enabled::
+
+       rbd map mypool/myimage --id admin --keyfile secretfile
+
+To unmap an image::
+
+       rbd unmap /dev/rbd0
+
+To create an image and a clone from it::
+
+       rbd import --image-format 2 image mypool/parent
+       rbd snap create mypool/parent@snap
+       rbd snap protect mypool/parent@snap
+       rbd clone mypool/parent@snap otherpool/child
+
+To create an image with a smaller stripe_unit (to better distribute small writes in some workloads)::
+
+       rbd create mypool/myimage --size 102400 --stripe-unit 65536B --stripe-count 16
+
+To change an image from one image format to another, export it and then
+import it as the desired image format::
+
+       rbd export mypool/myimage@snap /tmp/img
+       rbd import --image-format 2 /tmp/img mypool/myimage2
+
+To lock an image for exclusive use::
+
+       rbd lock add mypool/myimage mylockid
+
+To release a lock::
+
+       rbd lock remove mypool/myimage mylockid client.2485
+
+
+Availability
+============
+
+**rbd** is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+the Ceph documentation at http://ceph.com/docs for more information.
+
+
+See also
+========
+
+:doc:`ceph <ceph>`\(8),
+:doc:`rados <rados>`\(8)
diff --git a/etc/default/ceph b/etc/default/ceph
new file mode 100644
index 0000000..6d6f40e
--- /dev/null
+++ b/etc/default/ceph
@@ -0,0 +1,12 @@
+# /etc/default/ceph
+#
+# Environment file for ceph daemon systemd unit files.
+#
+
+## use jemalloc instead of tcmalloc
+#
+# jemalloc is generally faster for small IO workloads and when
+# ceph-osd is backed by SSDs.  However, memory usage is usually
+# higher by 200-300mb.
+#
+#LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.1
diff --git a/etc/sysconfig/SuSEfirewall2.d/services/ceph-mon b/etc/sysconfig/SuSEfirewall2.d/services/ceph-mon
new file mode 100644
index 0000000..7a28e73
--- /dev/null
+++ b/etc/sysconfig/SuSEfirewall2.d/services/ceph-mon
@@ -0,0 +1,5 @@
+## Name: Ceph MON
+## Description: Open port for Ceph Monitor
+
+# space separated list of allowed TCP ports
+TCP="6789"
diff --git a/etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds b/etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
new file mode 100644
index 0000000..0109fde
--- /dev/null
+++ b/etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
@@ -0,0 +1,5 @@
+## Name: Ceph OSD/MDS
+## Description: Open ports for Ceph OSDs and Metadata Servers (max: 166 per node)
+
+# space separated list of allowed TCP ports
+TCP="6800:7300"
diff --git a/etc/sysconfig/ceph b/etc/sysconfig/ceph
new file mode 100644
index 0000000..5f41a00
--- /dev/null
+++ b/etc/sysconfig/ceph
@@ -0,0 +1,23 @@
+# /etc/sysconfig/ceph
+#
+# Environment file for ceph daemon systemd unit files.
+#
+
+## use jemalloc instead of tcmalloc
+#
+# jemalloc is generally faster for small IO workloads and when
+# ceph-osd is backed by SSDs.  However, memory usage is usually
+# higher by 200-300mb.
+#
+#LD_PRELOAD=/usr/lib64/libjemalloc.so.1
+
+## automatically restart systemd units on upgrade
+#
+# By default, it is left to the administrator to restart
+# ceph daemons (or their related systemd units) manually
+# when the 'ceph' package is upgraded. By setting this
+# parameter to "yes", package upgrade will trigger a 
+# "systemctl try-restart" on all the ceph systemd units
+# currently active on the node.
+#
+CEPH_AUTO_RESTART_ON_UPGRADE=no
diff --git a/install-deps.sh b/install-deps.sh
index 129b238..1bebf09 100755
--- a/install-deps.sh
+++ b/install-deps.sh
@@ -1,8 +1,8 @@
-#!/bin/bash
+#!/bin/bash -e
 #
 # Ceph distributed storage system
 #
-# Copyright (C) 2014 Red Hat <contact@redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
 #
 # Author: Loic Dachary <loic@dachary.org>
 #
@@ -23,10 +23,14 @@ if test -f /etc/redhat-release ; then
     $SUDO yum install -y redhat-lsb-core
 fi
 
-if which apt-get > /dev/null ; then
+if type apt-get > /dev/null 2>&1 ; then
     $SUDO apt-get install -y lsb-release
 fi
 
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
 case $(lsb_release -si) in
 Ubuntu|Debian|Devuan)
         $SUDO apt-get install -y dpkg-dev
@@ -38,30 +42,106 @@ Ubuntu|Debian|Devuan)
         packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
             perl -p -e 's/.*Unmet build dependencies: *//;' \
             -e 's/build-essential:native/build-essential/;' \
-            -e 's/\|//g;' \
+            -e 's/\s*\|\s*/\|/g;' \
             -e 's/\(.*?\)//g;' \
             -e 's/ +/\n/g;' | sort)
         case $(lsb_release -sc) in
             squeeze|wheezy)
                 packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
                 ;;
         esac
         packages=$(echo $packages) # change newlines into spaces
-        $SUDO bash -c "DEBIAN_FRONTEND=noninteractive apt-get install -y $packages"
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
         ;;
-CentOS|Fedora|SUSE*|RedHatEnterpriseServer)
+CentOS|Fedora|RedHatEnterpriseServer)
         case $(lsb_release -si) in
-            SUSE*)
-                $SUDO zypper -y yum-utils
+            Fedora)
+                $SUDO yum install -y yum-utils
                 ;;
-            *)
+            CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
-        $SUDO yum-builddep -y $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
         ;;
 *)
         echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
         ;;
 esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4
index 2ccc9a9..37ea0aa 100644
--- a/m4/ax_arm.m4
+++ b/m4/ax_arm.m4
@@ -13,13 +13,27 @@ AC_DEFUN([AX_ARM_FEATURES],
       fi
     ;;
     aarch64*)
+      AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, [])
+      if test x"$ax_cv_support_armv8" = x"yes"; then
+        ARM_ARCH_FLAGS="-march=armv8-a"
+        ARM_DEFINE_FLAGS="-DARCH_AARCH64"
+      fi
       AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, ax_cv_support_neon_ext=yes, [])
       if test x"$ax_cv_support_neon_ext" = x"yes"; then
+        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd"
+        ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON"
         ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON"
-        AC_SUBST(ARM_NEON_FLAGS)
-        ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
         AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
+        AC_SUBST(ARM_NEON_FLAGS)
+      fi
+      AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, [])
+      if test x"$ax_cv_support_crc_ext" = x"yes"; then
+        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc"
+        ARM_CRC_FLAGS="-march=armv8-a+crc -DARCH_AARCH64"
+        AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions])
+        AC_SUBST(ARM_CRC_FLAGS)
       fi
+        ARM_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS"
     ;;
   esac
 
diff --git a/man/Makefile-server.am b/man/Makefile-server.am
index a780ebe..6387346 100644
--- a/man/Makefile-server.am
+++ b/man/Makefile-server.am
@@ -7,6 +7,11 @@ dist_man_MANS += \
 	ceph-rest-api.8 \
 	ceph-debugpack.8
 
+if WITH_SELINUX
+dist_man_MANS += \
+	ceph_selinux.8
+endif
+
 if WITH_MON
 dist_man_MANS += \
 	ceph-mon.8 \
@@ -16,6 +21,7 @@ endif
 if WITH_OSD
 dist_man_MANS += \
 	ceph-clsinfo.8 \
+	ceph-detect-init.8 \
 	ceph-disk.8 \
 	ceph-osd.8 \
 	osdmaptool.8
diff --git a/man/Makefile.am b/man/Makefile.am
index 64f07cc..40506ba 100644
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -1,7 +1,10 @@
 AUTOMAKE_OPTIONS = gnu
 
+EXTRA_DIST = conf.py ceph_selinux.8
+
 dist_man_MANS = 
 
+if WITH_MAN_PAGES
 if ENABLE_CLIENT
 include Makefile-client.am
 endif
@@ -9,3 +12,20 @@ endif
 if ENABLE_SERVER
 include Makefile-server.am
 endif
+
+# prevent `make` from running in parallel, sphinx runs better in batch mode.
+.PHONY: sphinx-build.stamp
+
+$(dist_man_MANS): sphinx-build.stamp
+
+# in a tree populated from dist tarball, the $(top_srcdir)/doc is not included
+sphinx-build.stamp:
+	if [ -d $(top_srcdir)/doc/man ] ; then \
+		${SPHINX_BUILD} -b man -d doctrees -c $(top_srcdir)/man $(top_srcdir)/doc/man $(top_builddir)/man; \
+	fi
+
+clean-local:
+	@rm -rf doctrees
+
+MAINTAINERCLEANFILES = $(dist_man_MANS)
+endif
diff --git a/man/Makefile.in b/man/Makefile.in
index 30f2088..2bf8c61 100644
--- a/man/Makefile.in
+++ b/man/Makefile.in
@@ -81,57 +81,61 @@ target_triplet = @target@
 DIST_COMMON = $(srcdir)/Makefile-client.am \
 	$(srcdir)/Makefile-server.am $(srcdir)/Makefile.in \
 	$(srcdir)/Makefile.am $(dist_man_MANS)
- at ENABLE_CLIENT_TRUE@am__append_1 = \
- at ENABLE_CLIENT_TRUE@	ceph-syn.8 \
- at ENABLE_CLIENT_TRUE@	ceph-conf.8 \
- at ENABLE_CLIENT_TRUE@	ceph.8 \
- at ENABLE_CLIENT_TRUE@	ceph-authtool.8 \
- at ENABLE_CLIENT_TRUE@	rados.8 \
- at ENABLE_CLIENT_TRUE@	rbd.8 \
- at ENABLE_CLIENT_TRUE@	ceph-post-file.8 \
- at ENABLE_CLIENT_TRUE@	ceph-dencoder.8
-
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_2 = \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados-config.8
-
- at ENABLE_CLIENT_TRUE@@WITH_RBD_TRUE at am__append_3 = \
- at ENABLE_CLIENT_TRUE@@WITH_RBD_TRUE@	ceph-rbdnamer.8 \
- at ENABLE_CLIENT_TRUE@@WITH_RBD_TRUE@	rbd-replay.8 \
- at ENABLE_CLIENT_TRUE@@WITH_RBD_TRUE@	rbd-replay-many.8 \
- at ENABLE_CLIENT_TRUE@@WITH_RBD_TRUE@	rbd-replay-prep.8
-
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE at am__append_4 = \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@	cephfs.8
-
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_5 = \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@	rbd-fuse.8 \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@	ceph-fuse.8
-
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE at am__append_6 = \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@	radosgw.8 \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@	radosgw-admin.8
-
- at ENABLE_SERVER_TRUE@am__append_7 = \
- at ENABLE_SERVER_TRUE@	ceph-deploy.8 \
- at ENABLE_SERVER_TRUE@	crushtool.8 \
- at ENABLE_SERVER_TRUE@	ceph-run.8 \
- at ENABLE_SERVER_TRUE@	mount.ceph.8 \
- at ENABLE_SERVER_TRUE@	ceph-create-keys.8 \
- at ENABLE_SERVER_TRUE@	ceph-rest-api.8 \
- at ENABLE_SERVER_TRUE@	ceph-debugpack.8
-
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_8 = \
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	ceph-mon.8 \
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	monmaptool.8
-
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_9 = \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-clsinfo.8 \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk.8 \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-osd.8 \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osdmaptool.8
-
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_10 = \
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	ceph-mds.8
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE at am__append_1 = \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-syn.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-conf.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	ceph.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-authtool.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	rados.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	rbd.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-post-file.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-dencoder.8
+
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RADOS_TRUE at am__append_2 = \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RADOS_TRUE@	librados-config.8
+
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE at am__append_3 = \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	ceph-rbdnamer.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay-many.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RBD_TRUE@	rbd-replay-prep.8
+
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_MAN_PAGES_TRUE at am__append_4 = \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_MAN_PAGES_TRUE@	cephfs.8
+
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_MAN_PAGES_TRUE at am__append_5 = \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_MAN_PAGES_TRUE@	rbd-fuse.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-fuse.8
+
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RADOSGW_TRUE at am__append_6 = \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RADOSGW_TRUE@	radosgw.8 \
+ at ENABLE_CLIENT_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_RADOSGW_TRUE@	radosgw-admin.8
+
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE at am__append_7 = \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-deploy.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@	crushtool.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-run.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@	mount.ceph.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-create-keys.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-rest-api.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@	ceph-debugpack.8
+
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_SELINUX_TRUE at am__append_8 = \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_SELINUX_TRUE@	ceph_selinux.8
+
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_MON_TRUE at am__append_9 = \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_MON_TRUE@	ceph-mon.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_MON_TRUE@	monmaptool.8
+
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_OSD_TRUE at am__append_10 = \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_OSD_TRUE@	ceph-clsinfo.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_OSD_TRUE@	ceph-detect-init.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_OSD_TRUE@	ceph-disk.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_OSD_TRUE@	ceph-osd.8 \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_OSD_TRUE@	osdmaptool.8
+
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_MDS_TRUE at am__append_11 = \
+ at ENABLE_SERVER_TRUE@@WITH_MAN_PAGES_TRUE@@WITH_MDS_TRUE@	ceph-mds.8
 
 subdir = man
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -213,6 +217,7 @@ AMTAR = @AMTAR@
 AM_CXXFLAGS = @AM_CXXFLAGS@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
 ARM_FLAGS = @ARM_FLAGS@
 ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
 AUTOCONF = @AUTOCONF@
@@ -220,6 +225,7 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
@@ -277,7 +283,8 @@ LD = @LD@
 LDFLAGS = @LDFLAGS@
 LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
 LIBEDIT_LIBS = @LIBEDIT_LIBS@
-LIBFUSE = @LIBFUSE@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
 LIBJEMALLOC = @LIBJEMALLOC@
 LIBOBJS = @LIBOBJS@
 LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
@@ -328,6 +335,7 @@ RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
 STRIP = @STRIP@
 VERSION = @VERSION@
 WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
@@ -361,6 +369,7 @@ datarootdir = @datarootdir@
 docdir = @docdir@
 dvidir = @dvidir@
 exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
 host = @host@
 host_alias = @host_alias@
 host_cpu = @host_cpu@
@@ -390,6 +399,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 subdirs = @subdirs@
 sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
 target = @target@
 target_alias = @target_alias@
 target_cpu = @target_cpu@
@@ -398,11 +409,14 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
 AUTOMAKE_OPTIONS = gnu
+EXTRA_DIST = conf.py ceph_selinux.8
 dist_man_MANS = $(am__append_1) $(am__append_2) $(am__append_3) \
 	$(am__append_4) $(am__append_5) $(am__append_6) \
 	$(am__append_7) $(am__append_8) $(am__append_9) \
-	$(am__append_10)
+	$(am__append_10) $(am__append_11)
+ at WITH_MAN_PAGES_TRUE@MAINTAINERCLEANFILES = $(dist_man_MANS)
 all: all-am
 
 .SUFFIXES:
@@ -560,9 +574,11 @@ distclean-generic:
 maintainer-clean-generic:
 	@echo "This command is intended for maintainers to use"
 	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES)
+ at WITH_MAN_PAGES_FALSE@clean-local:
 clean: clean-am
 
-clean-am: clean-generic clean-libtool mostlyclean-am
+clean-am: clean-generic clean-libtool clean-local mostlyclean-am
 
 distclean: distclean-am
 	-rm -f Makefile
@@ -631,7 +647,7 @@ uninstall-man: uninstall-man8
 .MAKE: install-am install-strip
 
 .PHONY: all all-am check check-am clean clean-generic clean-libtool \
-	cscopelist-am ctags-am distclean distclean-generic \
+	clean-local cscopelist-am ctags-am distclean distclean-generic \
 	distclean-libtool distdir dvi dvi-am html html-am info info-am \
 	install install-am install-data install-data-am install-dvi \
 	install-dvi-am install-exec install-exec-am install-html \
@@ -644,6 +660,20 @@ uninstall-man: uninstall-man8
 	uninstall-man8
 
 
+# prevent `make` from running in parallel, sphinx runs better in batch mode.
+ at WITH_MAN_PAGES_TRUE@.PHONY: sphinx-build.stamp
+
+ at WITH_MAN_PAGES_TRUE@$(dist_man_MANS): sphinx-build.stamp
+
+# in a tree populated from dist tarball, the $(top_srcdir)/doc is not included
+ at WITH_MAN_PAGES_TRUE@sphinx-build.stamp:
+ at WITH_MAN_PAGES_TRUE@	if [ -d $(top_srcdir)/doc/man ] ; then \
+ at WITH_MAN_PAGES_TRUE@		${SPHINX_BUILD} -b man -d doctrees -c $(top_srcdir)/man $(top_srcdir)/doc/man $(top_builddir)/man; \
+ at WITH_MAN_PAGES_TRUE@	fi
+
+ at WITH_MAN_PAGES_TRUE@clean-local:
+ at WITH_MAN_PAGES_TRUE@	@rm -rf doctrees
+
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:
diff --git a/man/ceph-authtool.8 b/man/ceph-authtool.8
index ec4c7b2..7dc4922 100644
--- a/man/ceph-authtool.8
+++ b/man/ceph-authtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-AUTHTOOL" "8" "November 30, 2014" "dev" "Ceph"
+.TH "CEPH-AUTHTOOL" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-authtool \- ceph keyring manipulation tool
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-authtool\fP \fIkeyringfile\fP [ \-l | \-\-list ] [ \-C | \-\-create\-keyring
@@ -287,7 +260,7 @@ mount \-t ceph serverhost:/ mountpoint \-o name=foo,secret=\(gaceph\-authtool \-
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-authtool\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+\fBceph\-authtool\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/ceph-clsinfo.8 b/man/ceph-clsinfo.8
index 684cb9e..bf86fe9 100644
--- a/man/ceph-clsinfo.8
+++ b/man/ceph-clsinfo.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CLSINFO" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-CLSINFO" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-clsinfo \- show class object information
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-clsinfo\fP [ \fIoptions\fP ] ... \fIfilename\fP
@@ -84,7 +57,7 @@ Shows the class architecture
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-clsinfo\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+\fBceph\-clsinfo\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/ceph-conf.8 b/man/ceph-conf.8
index cbe59a0..c297fdf 100644
--- a/man/ceph-conf.8
+++ b/man/ceph-conf.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CONF" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-CONF" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-conf \- ceph conf file tool
 .
@@ -30,40 +30,13 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-conf\fP \-c \fIconffile\fP \-\-list\-all\-sections
 \fBceph\-conf\fP \-c \fIconffile\fP \-L
 \fBceph\-conf\fP \-c \fIconffile\fP \-l \fIprefix\fP
 \fBceph\-conf\fP \fIkey\fP \-s \fIsection1\fP ...
-\fBceph\-conf\fP [\-s \fIsection\fP ] \-\-lookup \fIkey\fP
+\fBceph\-conf\fP [\-s \fIsection\fP ] [\-r] \-\-lookup \fIkey\fP
 \fBceph\-conf\fP [\-s \fIsection\fP ] \fIkey\fP
 .fi
 .sp
@@ -74,28 +47,66 @@ configuration file. As with most Ceph programs, you can specify which
 Ceph configuration file to use with the \fB\-c\fP flag.
 .SH ACTIONS
 .sp
-\fBceph\-conf\fP will perform one of the following actions:
-.sp
-\-\-list\-all\-sections or \-L prints out a list of all the section names in the configuration
-file.
-.sp
-\-\-list\-sections or \-l prints out a list of all the sections that begin
-with a given prefix. For example, \-\-list\-sections mon would list all
-sections beginning with mon.
-.sp
-\-\-lookup will search the configuration for a given value.  By default, the sections  that
-are searched are determined by the Ceph name that we are using. The Ceph name defaults to
-client.admin. It can be specified with \-\-name.
-.sp
-For example, if we specify  \-\-name  osd.0,  the  following  sections  will  be  searched:
-[osd.0], [osd], [global]
-.sp
-You  can  specify  additional  sections to search with \-\-section or \-s.  These additional
-sections will be searched before the sections that would normally be searched. As always,
-the first matching entry we find will be returned.
-.sp
-Note:  \-\-lookup is the default action. If no other actions are given on the command line,
-we will default to doing a lookup.
+\fBceph\-conf\fP performs one of the following actions:
+.INDENT 0.0
+.TP
+.B \-L, \-\-list\-all\-sections
+list all sections in the configuration file.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-l, \-\-list\-sections *prefix*
+list the sections with the given \fIprefix\fP\&. For example, \fB\-\-list\-sections mon\fP
+would list all sections beginning with \fBmon\fP\&.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-lookup *key*
+search and print the specified configuration setting. Note:  \fB\-\-lookup\fP is
+the default action. If no other actions are given on the command line, we will
+default to doing a lookup.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-h, \-\-help
+print a summary of usage.
+.UNINDENT
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \-c *conffile*
+the Ceph configuration file.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-filter\-key *key*
+filter section list to only include sections with given \fIkey\fP defined.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-filter\-key\-value *key* \(ga\(ga=\(ga\(ga *value*
+filter section list to only include sections with given \fIkey\fP/\fIvalue\fP pair.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-name *type.id*
+the Ceph name in which the sections are searched (default \(aqclient.admin\(aq).
+For example, if we specify \fB\-\-name osd.0\fP, the following sections will be
+searched: [osd.0], [osd], [global]
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-r, \-\-resolve\-search
+search for the first file that exists and can be opened in the resulted
+comma delimited search list.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-s, \-\-section
+additional sections to search.  These additional sections will be searched
+before the sections that would normally be searched. As always, the first
+matching entry we find will be returned.
+.UNINDENT
 .SH EXAMPLES
 .sp
 To find out what value osd 0 will use for the "osd data" option:
@@ -122,7 +133,7 @@ ceph\-conf \-c foo.conf  \-\-name mds.a "log file"
 .UNINDENT
 .UNINDENT
 .sp
-To list all sections that begin with osd:
+To list all sections that begin with "osd":
 .INDENT 0.0
 .INDENT 3.5
 .sp
@@ -145,9 +156,26 @@ ceph\-conf \-c foo.conf \-L
 .fi
 .UNINDENT
 .UNINDENT
+.sp
+To print the path of the "keyring" used by "client.0":
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-conf \-\-name client.0 \-r \-l keyring
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH FILES
+.sp
+\fB/etc/ceph/$cluster.conf\fP, \fB~/.ceph/$cluster.conf\fP, \fB$cluster.conf\fP
+.sp
+the Ceph configuration files to use if not specified.
 .SH AVAILABILITY
 .sp
-\fBceph\-conf\fP is part of Ceph, a massively scalable, open-source, distributed storage system.  Please refer
+\fBceph\-conf\fP is part of Ceph, a massively scalable, open\-source, distributed storage system.  Please refer
 to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/ceph-create-keys.8 b/man/ceph-create-keys.8
index a634e7a..5c6ebc1 100644
--- a/man/ceph-create-keys.8
+++ b/man/ceph-create-keys.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CREATE-KEYS" "8" "June 02, 2015" "dev" "Ceph"
+.TH "CEPH-CREATE-KEYS" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-create-keys \- ceph keyring generate tool
 .
@@ -40,7 +40,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 \fBceph\-create\-keys\fP is a utility to generate bootstrap keyrings using
 the given monitor when it is ready.
 .sp
-it creates following auth entities (or users)
+It creates following auth entities (or users)
 .sp
 \fBclient.admin\fP
 .INDENT 0.0
@@ -56,7 +56,7 @@ and their keys for bootstrapping corresponding services
 .UNINDENT
 .UNINDENT
 .sp
-To list all users in cluster:
+To list all users in the cluster:
 .INDENT 0.0
 .INDENT 3.5
 .sp
diff --git a/man/ceph-debugpack.8 b/man/ceph-debugpack.8
index e232ee6..57cb828 100644
--- a/man/ceph-debugpack.8
+++ b/man/ceph-debugpack.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEBUGPACK" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-DEBUGPACK" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-debugpack \- ceph debug packer utility
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-debugpack\fP [ \fIoptions\fP ] \fIfilename.tar.gz\fP
@@ -82,7 +55,7 @@ startup.
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-debugpack\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+\fBceph\-debugpack\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/ceph-dencoder.8 b/man/ceph-dencoder.8
index caf626a..e798079 100644
--- a/man/ceph-dencoder.8
+++ b/man/ceph-dencoder.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DENCODER" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-DENCODER" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-dencoder \- ceph encoder/decoder utility
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-dencoder\fP [commands...]
@@ -100,6 +73,12 @@ Select the given type for future \fBencode\fP or \fBdecode\fP operations.
 .UNINDENT
 .INDENT 0.0
 .TP
+.B skip <bytes>
+Seek <bytes> into the imported file before reading data structure, use
+this with objects that have a preamble/header before the object of interest.
+.UNINDENT
+.INDENT 0.0
+.TP
 .B decode
 Decode the contents of the in\-memory buffer into an instance of the
 previously selected type.  If there is an error, report it.
@@ -143,7 +122,7 @@ versions of the software (for those types that support it).
 .UNINDENT
 .SH EXAMPLE
 .sp
-Say you want to examine an attribute on an object stored by \fBceph\-osd\fP\&.  You can do:
+Say you want to examine an attribute on an object stored by \fBceph\-osd\fP\&.  You can do this:
 .INDENT 0.0
 .INDENT 3.5
 .sp
@@ -179,9 +158,29 @@ $ ceph\-dencoder type object_info_t import /tmp/a decode dump_json
 .fi
 .UNINDENT
 .UNINDENT
+.sp
+Alternatively, perhaps you wish to dump an internal CephFS metadata object, you might
+do that like this:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+$ rados \-p metadata get mds_snaptable mds_snaptable.bin
+$ ceph\-dencoder type SnapServer skip 8 import mds_snaptable.bin decode dump_json
+{ "snapserver": { "last_snap": 1,
+   "pending_noop": [],
+   "snaps": [],
+   "need_to_purge": {},
+   "pending_create": [],
+   "pending_destroy": []}}
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-dencoder\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+\fBceph\-dencoder\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/ceph-deploy.8 b/man/ceph-deploy.8
index 3a9a120..1559dbf 100644
--- a/man/ceph-deploy.8
+++ b/man/ceph-deploy.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEPLOY" "8" "December 20, 2014" "dev" "Ceph"
+.TH "CEPH-DEPLOY" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-deploy \- Ceph deployment tool
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-deploy\fP \fBnew\fP [\fIinitial\-monitor\-node(s)\fP]
@@ -137,17 +110,17 @@ ceph\-deploy new [MON][MON...]
 .UNINDENT
 .UNINDENT
 .sp
-Here, [MON] is initial monitor hostname (short hostname i.e, \fBhostname \-s\fP).
+Here, [MON] is the initial monitor hostname (short hostname i.e, \fBhostname \-s\fP).
 .sp
-Other options like \fI\%--no-ssh-copykey\fP, \fI\%--fsid\fP,
-\fI\%--cluster-network\fP and \fI\%--public-network\fP can also be used with
+Other options like \fI\%\-\-no\-ssh\-copykey\fP, \fI\%\-\-fsid\fP,
+\fI\%\-\-cluster\-network\fP and \fI\%\-\-public\-network\fP can also be used with
 this command.
 .sp
 If more than one network interface is used, \fBpublic network\fP setting has to be
 added under \fB[global]\fP section of Ceph configuration file. If the public subnet
 is given, \fBnew\fP command will choose the one IP from the remote host that exists
 within the subnet range. Public network can also be added at runtime using
-\fI\%--public-network\fP option with the command as mentioned above.
+\fI\%\-\-public\-network\fP option with the command as mentioned above.
 .SS install
 .sp
 Install Ceph packages on remote hosts. As a first step it installs
@@ -161,7 +134,7 @@ detection of platform and distribution before installation, if it finds the
 installation with custom cluster name and uses the default name \fBceph\fP for the
 cluster.
 .sp
-If the user explicitly specifies a custom repo url with \fI\%--repo-url\fP for
+If the user explicitly specifies a custom repo url with \fI\%\-\-repo\-url\fP for
 installation, anything detected from the configuration will be overridden and
 the custom repository location will be used for installation of Ceph packages.
 If required, valid custom repositories are also detected and installed. In case
@@ -173,7 +146,7 @@ that holds the flags and information needed to determine what metadata from the
 configuration is to be used.
 .sp
 A user can also opt to install only the repository without installing Ceph and
-its dependencies by using \fI\%--repo\fP option.
+its dependencies by using \fI\%\-\-repo\fP option.
 .sp
 Usage:
 .INDENT 0.0
@@ -192,9 +165,9 @@ Here, [HOST] is/are the host node(s) where Ceph is to be installed.
 An option \fB\-\-release\fP is used to install a release known as CODENAME
 (default: firefly).
 .sp
-Other options like \fI\%--testing\fP, \fI\%--dev\fP, \fI\%--adjust-repos\fP,
-\fI\%--no-adjust-repos\fP, \fI\%--repo\fP, \fI\%--local-mirror\fP,
-\fI\%--repo-url\fP and \fI\%--gpg-url\fP can also be used with this command.
+Other options like \fI\%\-\-testing\fP, \fI\%\-\-dev\fP, \fI\%\-\-adjust\-repos\fP,
+\fI\%\-\-no\-adjust\-repos\fP, \fI\%\-\-repo\fP, \fI\%\-\-local\-mirror\fP,
+\fI\%\-\-repo\-url\fP and \fI\%\-\-gpg\-url\fP can also be used with this command.
 .SS mds
 .sp
 Deploy Ceph mds on remote hosts. A metadata server is needed to use CephFS and
@@ -285,7 +258,7 @@ detects platform and distro for desired host and checks if hostname is compatibl
 for deployment. It then uses the monitor keyring, ensures configuration for new
 monitor host and adds the monitor to the cluster. If the section for the monitor
 exists and defines a mon addr that will be used, otherwise it will fallback by
-resolving the hostname to an IP. If \fI\%--address\fP is used it will override
+resolving the hostname to an IP. If \fI\%\-\-address\fP is used it will override
 all other options. After adding the monitor to the cluster, it gives it some time
 to start. It then looks for any monitor errors and checks monitor status. Monitor
 errors arise if the monitor is not added in \fBmon initial members\fP, if it doesn\(aqt
@@ -312,7 +285,8 @@ ceph\-deploy mon add [HOST] \-\-address [IP]
 .UNINDENT
 .sp
 Here, [HOST] is the hostname and [IP] is the IP address of the desired monitor
-node.
+node. Please note, unlike other \fBmon\fP subcommands, only one node can be
+specified at a time.
 .sp
 Subcommand \fBdestroy\fP is used to completely remove monitors on remote hosts.
 It takes hostnames as arguments. It stops the monitor, verifies if \fBceph\-mon\fP
@@ -327,7 +301,7 @@ Usage:
 .sp
 .nf
 .ft C
-ceph\-deploy mon destroy [HOST]
+ceph\-deploy mon destroy [HOST] [HOST...]
 .ft P
 .fi
 .UNINDENT
@@ -640,8 +614,8 @@ ceph\-deploy forgetkeys
 .sp
 Manage packages on remote hosts. It is used for installing or removing packages
 from remote hosts. The package names for installation or removal are to be
-specified after the command. Two options \fI\%--install\fP and
-\fI\%--remove\fP are used for this purpose.
+specified after the command. Two options \fI\%\-\-install\fP and
+\fI\%\-\-remove\fP are used for this purpose.
 .sp
 Usage:
 .INDENT 0.0
@@ -693,7 +667,7 @@ Here, [HOST] is the hostname where Calamari is to be installed.
 An option \fB\-\-release\fP can be used to use a given release from repositories
 defined in \fBceph\-deploy\fP\(aqs configuration. Defaults to \fBcalamari\-minion\fP\&.
 .sp
-Another option \fI\%--master\fP can also be used with this command.
+Another option \fI\%\-\-master\fP can also be used with this command.
 .SH OPTIONS
 .INDENT 0.0
 .TP
@@ -827,8 +801,8 @@ The domain for the Calamari master server.
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-deploy\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
-the documentation at \fI\%http://ceph.com/ceph-deploy/docs\fP for more information.
+\fBceph\-deploy\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
+the documentation at \fI\%http://ceph.com/ceph\-deploy/docs\fP for more information.
 .SH SEE ALSO
 .sp
 \fBceph\-mon\fP(8),
diff --git a/man/ceph-detect-init.8 b/man/ceph-detect-init.8
new file mode 100644
index 0000000..5645738
--- /dev/null
+++ b/man/ceph-detect-init.8
@@ -0,0 +1,78 @@
+.\" Man page generated from reStructuredText.
+.
+.TH "CEPH-DETECT-INIT" "8" "November 03, 2015" "dev" "Ceph"
+.SH NAME
+ceph-detect-init \- display the init system Ceph should use
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.nf
+\fBceph\-detect\-init\fP [\-\-verbose] [\-\-use\-rhceph] [\-\-default \fIinit\fP]
+.fi
+.sp
+.SH DESCRIPTION
+.sp
+\fBceph\-detect\-init\fP is a utility that prints the init system
+Ceph uses. It can be one of \fBsysvinit\fP, \fBupstart\fP or \fBsystemd\fP\&.
+The init system Ceph uses may not be the default init system of the
+host operating system. For instance on Debian Jessie, Ceph may use
+\fBsysvinit\fP although \fBsystemd\fP is the default.
+.sp
+If the init system of the host operating system is unknown, return on
+error, unless \fI\%\-\-default\fP is specified.
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \-\-use\-rhceph
+When an operating system identifies itself as Red Hat, it is
+treated as if it were CentOS. With \fI\%\-\-use\-rhceph\fP it is
+treated as RHEL instead.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-default INIT
+If the init system of the host operating system is unknown, return
+the value of \fIINIT\fP instead of failing with an error.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-verbose
+Display additional information for debugging.
+.UNINDENT
+.SH AVAILABILITY
+.sp
+\fBceph\-detect\-init\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
+the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
+.SH SEE ALSO
+.sp
+\fBceph\-disk\fP(8),
+\fBceph\-deploy\fP(8)
+.SH COPYRIGHT
+2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA
+.\" Generated by docutils manpage writer.
+.
diff --git a/man/ceph-disk.8 b/man/ceph-disk.8
index 41a5139..2dd7a3e 100644
--- a/man/ceph-disk.8
+++ b/man/ceph-disk.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DISK" "8" "December 18, 2014" "dev" "Ceph"
+.TH "CEPH-DISK" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-disk \- Ceph disk preparation and activation utility for OSD
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-disk\fP \fBprepare\fP [\-\-cluster \fIclustername\fP] [\-\-cluster\-uuid \fIuuid\fP]
@@ -65,6 +38,8 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .sp
 .nf
 \fBceph\-disk\fP \fBactivate\fP [\fIdata\-path\fP] [\-\-activate\-key \fIpath\fP]
+[\-\-mark\-init \fIsysvinit|upstart|systemd|auto|none\fP]
+[\-\-no\-start\-daemon]
 .fi
 .sp
 .nf
@@ -88,7 +63,7 @@ subcommands \fBprepare\fP and \fBactivate\fP\&.
 .SH SUBCOMMANDS
 .SS prepare
 .sp
-Prepare a directory, disk or drive for a Ceph OSD. It creates a GPT partition,
+Prepare a directory, disk for a Ceph OSD. It creates a GPT partition,
 marks the partition with Ceph type \fBuuid\fP, creates a file system, marks the
 file system as ready for Ceph consumption, uses entire partition and adds a new
 partition to the journal disk. It is run directly or triggered by
@@ -107,10 +82,10 @@ ceph\-disk prepare \-\-cluster [cluster\-name] \-\-cluster\-uuid [uuid] \-\-fs\-
 .UNINDENT
 .UNINDENT
 .sp
-Other options like \fI\%--osd-uuid\fP, \fI\%--journal-uuid\fP,
-\fI\%--zap-disk\fP, \fI\%--data-dir\fP, \fI\%--data-dev\fP,
-\fI\%--journal-file\fP, \fI\%--journal-dev\fP, \fI\%--dmcrypt\fP
-and \fI\%--dmcrypt-key-dir\fP can also be used with the subcommand.
+Other options like \fI\%\-\-osd\-uuid\fP, \fI\%\-\-journal\-uuid\fP,
+\fI\%\-\-zap\-disk\fP, \fI\%\-\-data\-dir\fP, \fI\%\-\-data\-dev\fP,
+\fI\%\-\-journal\-file\fP, \fI\%\-\-journal\-dev\fP, \fI\%\-\-dmcrypt\fP
+and \fI\%\-\-dmcrypt\-key\-dir\fP can also be used with the subcommand.
 .SS activate
 .sp
 Activate the Ceph OSD. It mounts the volume in a temporary location, allocates
@@ -134,7 +109,7 @@ ceph\-disk activate [PATH]
 .sp
 Here, [PATH] is path to a block device or a directory.
 .sp
-An additional option \fI\%--activate-key\fP has to be used with this
+An additional option \fI\%\-\-activate\-key\fP has to be used with this
 subcommand when a copy of \fB/var/lib/ceph/bootstrap\-osd/{cluster}.keyring\fP
 isn\(aqt present in the OSD node.
 .sp
@@ -150,8 +125,31 @@ ceph\-disk activate [PATH] [\-\-activate\-key PATH]
 .UNINDENT
 .UNINDENT
 .sp
-Another option \fI\%--mark-init\fP can also be used with this subcommand.
-\fB\-\-mark\-init\fP provides init system to manage the OSD directory.
+Another option \fI\%\-\-mark\-init\fP can also be used with this
+subcommand.  \fB\-\-mark\-init\fP provides init system to manage the OSD
+directory. It defaults to \fBauto\fP which detects the init system
+suitable for ceph (either \fBsysvinit\fP, \fBsystemd\fP or
+\fBupstart\fP). The argument can be used to override the init system. It
+may be convenient when an operating system supports multiple init
+systems, such as Debian GNU/Linux jessie with \fBsystemd\fP and
+\fBsysvinit\fP\&. If the argument is \fBnone\fP, the OSD is not marked with
+any init system and \fBceph\-disk activate\fP needs to be called
+explicitly after each reboot.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-disk activate [PATH] [\-\-mark\-init *sysvinit|upstart|systemd|auto|none*]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+If the option \fI\-\-no\-start\-daemon\fP is given, the activation
+steps are performed but the OSD daemon is not started.
 .SS activate\-journal
 .sp
 Activate an OSD via it\(aqs journal device. \fBudev\fP triggers
@@ -171,7 +169,7 @@ ceph\-disk activate\-journal [DEV]
 .sp
 Here, [DEV] is the path to a journal block device.
 .sp
-Others options like \fI\%--activate-key\fP and \fI\%--mark-init\fP can also
+Other options like \fI\%\-\-activate\-key\fP and \fI\%\-\-mark\-init\fP can also
 be used with this subcommand.
 .sp
 \fB\-\-mark\-init\fP provides init system to manage the OSD directory.
@@ -206,7 +204,7 @@ ceph\-disk activate\-all
 .UNINDENT
 .UNINDENT
 .sp
-Others options like \fI\%--activate-key\fP and \fI\%--mark-init\fP can
+Other options like \fI\%\-\-activate\-key\fP and \fI\%\-\-mark\-init\fP can
 also be used with this subcommand.
 .sp
 \fB\-\-mark\-init\fP provides init system to manage the OSD directory.
@@ -388,7 +386,7 @@ Provide init system to manage the OSD directory.
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-disk\fP is a part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBceph\-disk\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph-fuse.8 b/man/ceph-fuse.8
index 60bc7d2..0303b14 100644
--- a/man/ceph-fuse.8
+++ b/man/ceph-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-FUSE" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-FUSE" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-fuse \- FUSE-based client for ceph
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-fuse\fP [ \-m \fImonaddr\fP:\fIport\fP ] \fImountpoint\fP [ \fIfuse options\fP ]
@@ -108,7 +81,7 @@ Use root_directory as the mounted root, rather than the full Ceph tree.
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-fuse\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBceph\-fuse\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph-mds.8 b/man/ceph-mds.8
index 3667a1b..b4d3009 100644
--- a/man/ceph-mds.8
+++ b/man/ceph-mds.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MDS" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-MDS" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-mds \- ceph metadata server daemon
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-mds\fP \-i \fIname\fP [[ \-\-hot\-standby [\fIrank\fP] ]|[\-\-journal_check \fIrank\fP]]
@@ -95,6 +68,19 @@ Debug mode: like \fB\-f\fP, but also send all log output to stderr.
 .UNINDENT
 .INDENT 0.0
 .TP
+.B \-\-setuser userorgid
+Set uid after starting.  If a username is specified, the user
+record is looked up to get a uid and a gid, and the gid is also set
+as well, unless \-\-setgroup is also specified.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-setgroup grouporgid
+Set gid after starting.  If a group name is specified the group
+record is looked up to get a gid.
+.UNINDENT
+.INDENT 0.0
+.TP
 .B \-c ceph.conf, \-\-conf=ceph.conf
 Use \fIceph.conf\fP configuration file instead of the default
 \fB/etc/ceph/ceph.conf\fP to determine monitor addresses during
@@ -106,9 +92,19 @@ startup.
 Connect to specified monitor (instead of looking through
 \fBceph.conf\fP).
 .UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-journal\-check <rank>
+Attempt to replay the journal for MDS <rank>, then exit.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-hot\-standby <rank>
+Start as a hot standby for MDS <rank>.
+.UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-mon\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at
+\fBceph\-mds\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to the Ceph documentation at
 \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph-mon.8 b/man/ceph-mon.8
index 6474f76..86c1a59 100644
--- a/man/ceph-mon.8
+++ b/man/ceph-mon.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MON" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-MON" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-mon \- ceph monitor daemon
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-mon\fP \-i \fImonid\fP [ \-\-mon\-data \fImondatapath\fP ]
@@ -86,6 +59,19 @@ Debug mode: like \fB\-f\fP, but also send all log output to stderr.
 .UNINDENT
 .INDENT 0.0
 .TP
+.B \-\-setuser userorgid
+Set uid after starting.  If a username is specified, the user
+record is looked up to get a uid and a gid, and the gid is also set
+as well, unless \-\-setgroup is also specified.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-setgroup grouporgid
+Set gid after starting.  If a group name is specified the group
+record is looked up to get a gid.
+.UNINDENT
+.INDENT 0.0
+.TP
 .B \-c ceph.conf, \-\-conf=ceph.conf
 Use \fIceph.conf\fP configuration file instead of the default
 \fB/etc/ceph/ceph.conf\fP to determine monitor addresses during
@@ -123,7 +109,7 @@ Specify a keyring for use with \fB\-\-mkfs\fP\&.
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-mon\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer
+\fBceph\-mon\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer
 to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/ceph-osd.8 b/man/ceph-osd.8
index 4e953de..764900e 100644
--- a/man/ceph-osd.8
+++ b/man/ceph-osd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-OSD" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-OSD" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-osd \- ceph object storage daemon
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-osd\fP \-i \fIosdnum\fP [ \-\-osd\-data \fIdatapath\fP ] [ \-\-osd\-journal
@@ -87,6 +60,19 @@ Debug mode: like \fB\-f\fP, but also send all log output to stderr.
 .UNINDENT
 .INDENT 0.0
 .TP
+.B \-\-setuser userorgid
+Set uid after starting.  If a username is specified, the user
+record is looked up to get a uid and a gid, and the gid is also set
+as well, unless \-\-setgroup is also specified.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-setgroup grouporgid
+Set gid after starting.  If a group name is specified the group
+record is looked up to get a gid.
+.UNINDENT
+.INDENT 0.0
+.TP
 .B \-\-osd\-data osddata
 Use object store at \fIosddata\fP\&.
 .UNINDENT
@@ -155,7 +141,7 @@ Connect to specified monitor (instead of looking through
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-osd\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBceph\-osd\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph-post-file.8 b/man/ceph-post-file.8
index b3ecb20..83dea9e 100644
--- a/man/ceph-post-file.8
+++ b/man/ceph-post-file.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-POST-FILE" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-POST-FILE" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-post-file \- post files for ceph developers
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-post\-file\fP [\-d \fIdescription] [\-u *user\fP] \fIfile or dir\fP ...
@@ -118,7 +91,7 @@ ceph\-post\-file \-d \(aqmon data directories\(aq /var/log/ceph/mon/*
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-post\-file\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBceph\-post\-file\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph-rbdnamer.8 b/man/ceph-rbdnamer.8
index d6f7fd4..a32bdbd 100644
--- a/man/ceph-rbdnamer.8
+++ b/man/ceph-rbdnamer.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RBDNAMER" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-RBDNAMER" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-rbdnamer \- udev helper to name RBD devices
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-rbdnamer\fP \fInum\fP
@@ -79,7 +52,7 @@ KERNEL=="rbd[0\-9]*", PROGRAM="/usr/bin/ceph\-rbdnamer %n", SYMLINK+="rbd/%c{1}/
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-rbdnamer\fP is part of Ceph, a massively scalable, open-source, distributed storage system.  Please
+\fBceph\-rbdnamer\fP is part of Ceph, a massively scalable, open\-source, distributed storage system.  Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/ceph-rest-api.8 b/man/ceph-rest-api.8
index 41120c8..f34fbb8 100644
--- a/man/ceph-rest-api.8
+++ b/man/ceph-rest-api.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-REST-API" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-REST-API" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-rest-api \- ceph RESTlike administration server
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-rest\-api\fP [ \-c \fIconffile\fP ] [\-\-cluster \fIclustername\fP ] [ \-n \fIname\fP ] [\-i \fIid\fP ]
@@ -197,7 +170,7 @@ exception to be raised; see your WSGI server documentation for how to
 see those messages in case of problem.
 .SH AVAILABILITY
 .sp
-\fBceph\-rest\-api\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to the Ceph documentation at
+\fBceph\-rest\-api\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to the Ceph documentation at
 \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph-run.8 b/man/ceph-run.8
index dab4d12..caeab9b 100644
--- a/man/ceph-run.8
+++ b/man/ceph-run.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RUN" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-RUN" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-run \- restart daemon on core dump
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-run\fP \fIcommand\fP ...
@@ -75,7 +48,7 @@ that means the \fB\-f\fP option.
 None
 .SH AVAILABILITY
 .sp
-\fBceph\-run\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBceph\-run\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph-syn.8 b/man/ceph-syn.8
index 92aeb5a..1e59f85 100644
--- a/man/ceph-syn.8
+++ b/man/ceph-syn.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-SYN" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPH-SYN" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph-syn \- ceph synthetic workload generator
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\-syn\fP [ \-m \fImonaddr\fP:\fIport\fP ] \-\-syn \fIcommand\fP \fI\&...\fP
@@ -136,7 +109,7 @@ Recursively walk the file system (like find).
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBceph\-syn\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBceph\-syn\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/ceph.8 b/man/ceph.8
index 0ec9d53..7a77e4f 100644
--- a/man/ceph.8
+++ b/man/ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH" "8" "March 22, 2015" "dev" "Ceph"
+.TH "CEPH" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 ceph \- ceph administration tool
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBceph\fP \fBauth\fP [ \fIadd\fP | \fIcaps\fP | \fIdel\fP | \fIexport\fP | \fIget\fP | \fIget\-key\fP | \fIget\-or\-create\fP | \fIget\-or\-create\-key\fP | \fIimport\fP | \fIlist\fP | \fIprint\-key\fP | \fIprint_key\fP ] ...
@@ -71,6 +44,14 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .fi
 .sp
 .nf
+\fBceph\fP \fBdaemon\fP \fI<name>\fP | \fI<path>\fP \fI<command>\fP ...
+.fi
+.sp
+.nf
+\fBceph\fP \fBdaemonperf\fP \fI<name>\fP | \fI<path>\fP [ \fIinterval\fP [ \fIcount\fP ] ]
+.fi
+.sp
+.nf
 \fBceph\fP \fBdf\fP \fI{detail}\fP
 .fi
 .sp
@@ -436,6 +417,48 @@ ceph config\-key put <key> {<val>}
 .fi
 .UNINDENT
 .UNINDENT
+.SS daemon
+.sp
+Submit admin\-socket commands.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph daemon {daemon_name|socket_path} {command} ...
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Example:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph daemon osd.0 help
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS daemonperf
+.sp
+Watch performance counters from a Ceph daemon.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph daemonperf {daemon_name|socket_path} [{interval} [{count}]]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
 .SS df
 .sp
 Show cluster\(aqs free space status.
@@ -1040,7 +1063,7 @@ ceph osd blocked\-by
 .UNINDENT
 .UNINDENT
 .sp
-Subcommand \fBcreate\fP creates new osd (with optional UUID).
+Subcommand \fBcreate\fP creates new osd (with optional UUID and ID).
 .sp
 Usage:
 .INDENT 0.0
@@ -1048,7 +1071,7 @@ Usage:
 .sp
 .nf
 .ft C
-ceph osd create {<uuid>}
+ceph osd create {<uuid>} {<id>}
 .ft P
 .fi
 .UNINDENT
@@ -1395,6 +1418,20 @@ ceph osd crush show\-tunables
 .UNINDENT
 .UNINDENT
 .sp
+Subcommand \fBtree\fP shows the crush buckets and items in a tree view.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd crush tree
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
 Subcommand \fBtunables\fP sets crush tunables values to <profile>.
 .sp
 Usage:
@@ -1675,7 +1712,7 @@ Usage:
 .sp
 .nf
 .ft C
-ceph osd metadata <int[0\-]>
+ceph osd metadata {int[0\-]} (default all)
 .ft P
 .fi
 .UNINDENT
@@ -1792,7 +1829,7 @@ Only for tiered pools:
 .nf
 .ft C
 ceph osd pool get <poolname> hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|
-target_max_objects|target_max_bytes|cache_target_dirty_ratio|
+target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|
 cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|
 min_read_recency_for_promote
 .ft P
@@ -1812,6 +1849,18 @@ ceph osd pool get <poolname> erasure_code_profile
 .UNINDENT
 .UNINDENT
 .sp
+Use \fBall\fP to get all pool parameters that apply to the pool\(aqs type:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph osd pool get <poolname> all
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
 Subcommand \fBget\-quota\fP obtains object or byte limits for pool.
 .sp
 Usage:
@@ -1894,6 +1943,7 @@ ceph osd pool set <poolname> size|min_size|crash_replay_interval|pg_num|
 pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|
 hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|
 target_max_bytes|target_max_objects|cache_target_dirty_ratio|
+cache_target_dirty_high_ratio|
 cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|
 min_read_recency_for_promote|write_fadvise_dontneed
 <val> {\-\-yes\-i\-really\-mean\-it}
@@ -2717,8 +2767,8 @@ Name of the Ceph cluster.
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-admin\-daemon ADMIN_SOCKET
-Submit admin\-socket commands.
+.B daemon ADMIN_SOCKET, daemon DAEMON_NAME, \-\-admin\-socket ADMIN_SOCKET, \-\-admin\-socket DAEMON_NAME
+Submit admin\-socket commands via admin sockets in /var/run/ceph.
 .UNINDENT
 .INDENT 0.0
 .TP
diff --git a/man/ceph_selinux.8 b/man/ceph_selinux.8
new file mode 100644
index 0000000..a646374
--- /dev/null
+++ b/man/ceph_selinux.8
@@ -0,0 +1,370 @@
+.TH  "ceph_selinux"  "8"  "15-08-10" "ceph" "SELinux Policy ceph"
+.SH "NAME"
+ceph_selinux \- Security Enhanced Linux Policy for the ceph processes
+.SH "DESCRIPTION"
+
+Security-Enhanced Linux secures the ceph processes via flexible mandatory access control.
+
+The ceph processes execute with the ceph_t SELinux type. You can check if you have these processes running by executing the \fBps\fP command with the \fB\-Z\fP qualifier.
+
+For example:
+
+.B ps -eZ | grep ceph_t
+
+
+.SH "ENTRYPOINTS"
+
+The ceph_t SELinux type can be entered via the \fBceph_exec_t\fP file type.
+
+The default entrypoint paths for the ceph_t domain are the following:
+
+/usr/bin/radosgw, /usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd
+.SH PROCESS TYPES
+SELinux defines process types (domains) for each process running on the system
+.PP
+You can see the context of a process using the \fB\-Z\fP option to \fBps\fP
+.PP
+Policy governs the access confined processes have to files.
+SELinux ceph policy is very flexible allowing users to setup their ceph processes in as secure a method as possible.
+.PP
+The following process types are defined for ceph:
+
+.EX
+.B ceph_t
+.EE
+.PP
+Note:
+.B semanage permissive -a ceph_t
+can be used to make the process type ceph_t permissive. SELinux does not deny access to permissive process types, but the AVC (SELinux denials) messages are still generated.
+
+.SH BOOLEANS
+SELinux policy is customizable based on least access required.  ceph policy is extremely flexible and has several booleans that allow you to manipulate the policy and run ceph with the tightest access possible.
+
+
+.PP
+If you want to allow users to resolve user passwd entries directly from ldap rather than using a sssd server, you must turn on the authlogin_nsswitch_use_ldap boolean. Disabled by default.
+
+.EX
+.B setsebool -P authlogin_nsswitch_use_ldap 1
+
+.EE
+
+.PP
+If you want to allow all daemons to write corefiles to /, you must turn on the daemons_dump_core boolean. Disabled by default.
+
+.EX
+.B setsebool -P daemons_dump_core 1
+
+.EE
+
+.PP
+If you want to enable cluster mode for daemons, you must turn on the daemons_enable_cluster_mode boolean. Disabled by default.
+
+.EX
+.B setsebool -P daemons_enable_cluster_mode 1
+
+.EE
+
+.PP
+If you want to allow all daemons to use tcp wrappers, you must turn on the daemons_use_tcp_wrapper boolean. Disabled by default.
+
+.EX
+.B setsebool -P daemons_use_tcp_wrapper 1
+
+.EE
+
+.PP
+If you want to allow all daemons the ability to read/write terminals, you must turn on the daemons_use_tty boolean. Disabled by default.
+
+.EX
+.B setsebool -P daemons_use_tty 1
+
+.EE
+
+.PP
+If you want to deny any process from ptracing or debugging any other processes, you must turn on the deny_ptrace boolean. Disabled by default.
+
+.EX
+.B setsebool -P deny_ptrace 1
+
+.EE
+
+.PP
+If you want to allow all domains to use other domains file descriptors, you must turn on the domain_fd_use boolean. Enabled by default.
+
+.EX
+.B setsebool -P domain_fd_use 1
+
+.EE
+
+.PP
+If you want to allow all domains to have the kernel load modules, you must turn on the domain_kernel_load_modules boolean. Disabled by default.
+
+.EX
+.B setsebool -P domain_kernel_load_modules 1
+
+.EE
+
+.PP
+If you want to allow all domains to execute in fips_mode, you must turn on the fips_mode boolean. Enabled by default.
+
+.EX
+.B setsebool -P fips_mode 1
+
+.EE
+
+.PP
+If you want to enable reading of urandom for all domains, you must turn on the global_ssp boolean. Disabled by default.
+
+.EX
+.B setsebool -P global_ssp 1
+
+.EE
+
+.PP
+If you want to allow confined applications to run with kerberos, you must turn on the kerberos_enabled boolean. Enabled by default.
+
+.EX
+.B setsebool -P kerberos_enabled 1
+
+.EE
+
+.PP
+If you want to allow system to run with NIS, you must turn on the nis_enabled boolean. Disabled by default.
+
+.EX
+.B setsebool -P nis_enabled 1
+
+.EE
+
+.PP
+If you want to allow confined applications to use nscd shared memory, you must turn on the nscd_use_shm boolean. Enabled by default.
+
+.EX
+.B setsebool -P nscd_use_shm 1
+
+.EE
+
+.SH NSSWITCH DOMAIN
+
+.PP
+If you want to allow users to resolve user passwd entries directly from ldap rather than using a sssd server for the ceph_t, you must turn on the authlogin_nsswitch_use_ldap boolean.
+
+.EX
+.B setsebool -P authlogin_nsswitch_use_ldap 1
+.EE
+
+.PP
+If you want to allow confined applications to run with kerberos for the ceph_t, you must turn on the kerberos_enabled boolean.
+
+.EX
+.B setsebool -P kerberos_enabled 1
+.EE
+
+.SH "MANAGED FILES"
+
+The SELinux process type ceph_t can manage files labeled with the following file types.  The paths listed are the default paths for these file types.  Note the processes UID still need to have DAC permissions.
+
+.br
+.B ceph_log_t
+
+	/var/log/ceph(/.*)?
+.br
+
+.br
+.B ceph_var_lib_t
+
+	/var/lib/ceph(/.*)?
+.br
+
+.br
+.B ceph_var_run_t
+
+	/var/run/ceph(/.*)?
+.br
+
+.br
+.B cluster_conf_t
+
+	/etc/cluster(/.*)?
+.br
+
+.br
+.B cluster_var_lib_t
+
+	/var/lib/pcsd(/.*)?
+.br
+	/var/lib/cluster(/.*)?
+.br
+	/var/lib/openais(/.*)?
+.br
+	/var/lib/pengine(/.*)?
+.br
+	/var/lib/corosync(/.*)?
+.br
+	/usr/lib/heartbeat(/.*)?
+.br
+	/var/lib/heartbeat(/.*)?
+.br
+	/var/lib/pacemaker(/.*)?
+.br
+
+.br
+.B cluster_var_run_t
+
+	/var/run/crm(/.*)?
+.br
+	/var/run/cman_.*
+.br
+	/var/run/rsctmp(/.*)?
+.br
+	/var/run/aisexec.*
+.br
+	/var/run/heartbeat(/.*)?
+.br
+	/var/run/cpglockd\.pid
+.br
+	/var/run/corosync\.pid
+.br
+	/var/run/rgmanager\.pid
+.br
+	/var/run/cluster/rgmanager\.sk
+.br
+
+.br
+.B fsadm_var_run_t
+
+	/var/run/blkid(/.*)?
+.br
+
+.br
+.B root_t
+
+	/
+.br
+	/initrd
+.br
+
+.br
+.B var_run_t
+
+	/run/.*
+.br
+	/var/run/.*
+.br
+	/run
+.br
+	/var/run
+.br
+	/var/run
+.br
+	/var/spool/postfix/pid
+.br
+
+.SH FILE CONTEXTS
+SELinux requires files to have an extended attribute to define the file type.
+.PP
+You can see the context of a file using the \fB\-Z\fP option to \fBls\fP
+.PP
+Policy governs the access confined processes have to these files.
+SELinux ceph policy is very flexible allowing users to setup their ceph processes in as secure a method as possible.
+.PP
+
+.PP
+.B STANDARD FILE CONTEXT
+
+SELinux defines the file context types for the ceph, if you wanted to
+store files with these types in different paths, you need to execute the semanage command to specify alternate labeling and then use restorecon to put the labels on disk.
+
+.B semanage fcontext -a -t ceph_exec_t '/srv/ceph/content(/.*)?'
+.br
+.B restorecon -R -v /srv/myceph_content
+
+Note: SELinux often uses regular expressions to specify labels that match multiple files.
+
+.I The following file types are defined for ceph:
+
+
+.EX
+.PP
+.B ceph_exec_t
+.EE
+
+- Set files with the ceph_exec_t type, if you want to transition an executable to the ceph_t domain.
+
+.br
+.TP 5
+Paths:
+/usr/bin/radosgw, /usr/bin/ceph-mon, /usr/bin/ceph-mds, /usr/bin/ceph-osd
+
+.EX
+.PP
+.B ceph_initrc_exec_t
+.EE
+
+- Set files with the ceph_initrc_exec_t type, if you want to transition an executable to the ceph_initrc_t domain.
+
+.br
+.TP 5
+Paths:
+/etc/rc\.d/init\.d/ceph, /etc/rc\.d/init\.d/radosgw
+
+.EX
+.PP
+.B ceph_log_t
+.EE
+
+- Set files with the ceph_log_t type, if you want to treat the data as ceph log data, usually stored under the /var/log directory.
+
+.br
+.TP 5
+Paths:
+/var/log/ceph(/.*)?
+
+.EX
+.PP
+.B ceph_var_lib_t
+.EE
+
+- Set files with the ceph_var_lib_t type, if you want to store the ceph files under the /var/lib directory.
+
+
+.EX
+.PP
+.B ceph_var_run_t
+.EE
+
+- Set files with the ceph_var_run_t type, if you want to store the ceph files under the /run or /var/run directory.
+
+
+.PP
+Note: File context can be temporarily modified with the chcon command.  If you want to permanently change the file context you need to use the
+.B semanage fcontext
+command.  This will modify the SELinux labeling database.  You will need to use
+.B restorecon
+to apply the labels.
+
+.SH "COMMANDS"
+.B semanage fcontext
+can also be used to manipulate default file context mappings.
+.PP
+.B semanage permissive
+can also be used to manipulate whether or not a process type is permissive.
+.PP
+.B semanage module
+can also be used to enable/disable/install/remove policy modules.
+
+.B semanage boolean
+can also be used to manipulate the booleans
+
+.PP
+.B system-config-selinux
+is a GUI tool available to customize SELinux policy settings.
+
+.SH AUTHOR
+This manual page was auto-generated using
+.B "sepolicy manpage".
+
+.SH "SEE ALSO"
+selinux(8), ceph(8), semanage(8), restorecon(8), chcon(1), sepolicy(8)
+, setsebool(8)
diff --git a/man/cephfs.8 b/man/cephfs.8
index 767fd38..078795b 100644
--- a/man/cephfs.8
+++ b/man/cephfs.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPHFS" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CEPHFS" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 cephfs \- ceph file system options utility
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBcephfs\fP [ \fIpath\fP \fIcommand\fP \fIoptions\fP ]
@@ -132,7 +105,7 @@ preferred OSD for placement. This feature is unsupported and ignored
 in modern versions of the Ceph servers; do not use it.
 .SH AVAILABILITY
 .sp
-\fBcephfs\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer
+\fBcephfs\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer
 to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/conf.py b/man/conf.py
new file mode 100644
index 0000000..2b24223
--- /dev/null
+++ b/man/conf.py
@@ -0,0 +1,59 @@
+import os
+
+project = u'Ceph'
+copyright = u'2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA'
+version = 'dev'
+release = 'dev'
+
+exclude_patterns = ['**/.#*', '**/*~']
+
+
+def _get_description(fname, base):
+    with file(fname) as f:
+        one = None
+        while True:
+            line = f.readline().rstrip('\n')
+            if not line:
+                continue
+            if line.startswith(':') and line.endswith(':'):
+                continue
+            one = line
+            break
+        two = f.readline().rstrip('\n')
+        three = f.readline().rstrip('\n')
+        assert one == three
+        assert all(c=='=' for c in one)
+        name, description = two.split('--', 1)
+        assert name.strip() == base
+        return description.strip()
+
+
+def _get_manpages():
+    src_dir = os.path.dirname(__file__)
+    top_srcdir = os.path.dirname(src_dir)
+    man_dir = os.path.join(top_srcdir, 'doc', 'man')
+    sections = os.listdir(man_dir)
+    for section in sections:
+        section_dir = os.path.join(man_dir, section)
+        if not os.path.isdir(section_dir):
+            continue
+        for filename in os.listdir(section_dir):
+            base, ext = os.path.splitext(filename)
+            if ext != '.rst':
+                continue
+            if base == 'index':
+                continue
+            path = os.path.join(section_dir, filename)
+            description = _get_description(path, base)
+            yield (
+                os.path.join(section, base),
+                base,
+                description,
+                '',
+                section,
+                )
+
+man_pages = list(_get_manpages())
+# sphinx warns if no toc is found, so feed it with a random file
+# which is also rendered in this run.
+master_doc = '8/ceph'
diff --git a/man/crushtool.8 b/man/crushtool.8
index 1fad24f..190ae6e 100644
--- a/man/crushtool.8
+++ b/man/crushtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CRUSHTOOL" "8" "January 12, 2014" "dev" "Ceph"
+.TH "CRUSHTOOL" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 crushtool \- CRUSH map manipulation tool
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBcrushtool\fP ( \-d \fImap\fP | \-c \fImap.txt\fP | \-\-build \-\-num_osds \fInumosds\fP
@@ -77,7 +50,7 @@ described in detail in the following paper (although it has evolved
 some since then):
 .INDENT 0.0
 .INDENT 3.5
-\fI\%http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf\fP
+\fI\%http://www.ssrc.ucsc.edu/Papers/weil\-sc06.pdf\fP
 .UNINDENT
 .UNINDENT
 .sp
@@ -107,7 +80,7 @@ object names. See below for a detailed explanation.
 .UNINDENT
 .sp
 Unlike other Ceph tools, \fBcrushtool\fP does not accept generic options
-such as \fB\-\-debug\-crush\fP from the command line. They can however be
+such as \fB\-\-debug\-crush\fP from the command line. They can, however, be
 provided via the CEPH_ARGS environment variable. For instance, to
 silence all output from the CRUSH subsystem:
 .INDENT 0.0
@@ -125,13 +98,15 @@ CEPH_ARGS="\-\-debug\-crush 0" crushtool ...
 The test mode will use the input crush map ( as specified with \fB\-i
 map\fP ) and perform a dry run of CRUSH mapping or random placement (
 if \fB\-\-simulate\fP is set ). On completion, two kinds of reports can be
-created. The \fB\-\-show\-...\fP options output human readable information
-on stderr. The \fB\-\-output\-csv\fP option creates CSV files that are
+created.
+1) The \fB\-\-show\-...\fP option outputs human readable information
+on stderr.
+2) The \fB\-\-output\-csv\fP option creates CSV files that are
 documented by the \fB\-\-help\-output\fP option.
 .INDENT 0.0
 .TP
 .B \-\-show\-statistics
-for each rule display the mapping of each object. For instance:
+For each rule, displays the mapping of each object. For instance:
 .INDENT 7.0
 .INDENT 3.5
 .sp
@@ -161,7 +136,7 @@ shows that rule \fB1\fP which is named \fBmetadata\fP successfully
 mapped \fB1024\fP objects to \fBresult size == 5\fP devices when trying
 to map them to \fBnum_rep 5\fP replicas. When it fails to provide the
 required mapping, presumably because the number of \fBtries\fP must
-be increased, a breakdown of the failures is displays. For instance:
+be increased, a breakdown of the failures is displayed. For instance:
 .INDENT 7.0
 .INDENT 3.5
 .sp
@@ -182,7 +157,7 @@ out of \fB1024\fP objects ( \fB4/1024\fP ) were mapped to \fBresult size
 .INDENT 0.0
 .TP
 .B \-\-show\-bad\-mappings
-display which object failed to be mapped to the required number of
+Displays which object failed to be mapped to the required number of
 devices. For instance:
 .INDENT 7.0
 .INDENT 3.5
@@ -196,12 +171,12 @@ bad mapping rule 1 x 781 num_rep 7 result [8,10,2,11,6,9]
 .UNINDENT
 .sp
 shows that when rule \fB1\fP was required to map \fB7\fP devices, it
-could only map six : \fB[8,10,2,11,6,9]\fP\&.
+could map only six : \fB[8,10,2,11,6,9]\fP\&.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-show\-utilization
-display the expected and actual utilisation for each device, for
+Displays the expected and actual utilisation for each device, for
 each number of replicas. For instance:
 .INDENT 7.0
 .INDENT 3.5
@@ -222,14 +197,14 @@ Implies \fB\-\-show\-statistics\fP\&.
 .INDENT 0.0
 .TP
 .B \-\-show\-utilization\-all
-displays the same as \fB\-\-show\-utilization\fP but does not suppress
+Displays the same as \fB\-\-show\-utilization\fP but does not suppress
 output when the weight of a device is zero.
 Implies \fB\-\-show\-statistics\fP\&.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-show\-choose\-tries
-display how many attempts were needed to find a device mapping.
+Displays how many attempts were needed to find a device mapping.
 For instance:
 .INDENT 7.0
 .INDENT 3.5
@@ -252,10 +227,10 @@ as the value of the \fB\-\-set\-choose\-total\-tries\fP option.
 .INDENT 0.0
 .TP
 .B \-\-output\-csv
-create CSV files (in the current directory) containing information
+Creates CSV files (in the current directory) containing information
 documented by \fB\-\-help\-output\fP\&. The files are named after the rule
 used when collecting the statistics. For instance, if the rule
-metadata is used, the CSV files will be:
+: \(aqmetadata\(aq is used, the CSV files will be:
 .INDENT 7.0
 .INDENT 3.5
 .sp
@@ -288,7 +263,7 @@ Device ID, Absolute Weight
 .INDENT 0.0
 .TP
 .B \-\-output\-name NAME
-prepend \fBNAME\fP to the file names generated when \fB\-\-output\-csv\fP
+Prepend \fBNAME\fP to the file names generated when \fB\-\-output\-csv\fP
 is specified. For instance \fB\-\-output\-name FOO\fP will create
 files:
 .INDENT 7.0
@@ -417,7 +392,7 @@ crushtool \-c map.txt \-o crushmap
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBcrushtool\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+\fBcrushtool\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/librados-config.8 b/man/librados-config.8
index 1e728c7..731888c 100644
--- a/man/librados-config.8
+++ b/man/librados-config.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "LIBRADOS-CONFIG" "8" "January 12, 2014" "dev" "Ceph"
+.TH "LIBRADOS-CONFIG" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 librados-config \- display information about librados
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBlibrados\-config\fP [ \-\-version ] [ \-\-vernum ]
@@ -81,7 +54,7 @@ Display the \fBlibrados\fP version code
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBlibrados\-config\fP is part of Ceph, a massively scalable, open-source, distributed storage system.
+\fBlibrados\-config\fP is part of Ceph, a massively scalable, open\-source, distributed storage system.
 Please refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for
 more information.
 .SH SEE ALSO
diff --git a/man/monmaptool.8 b/man/monmaptool.8
index a8862e3..95ed4c4 100644
--- a/man/monmaptool.8
+++ b/man/monmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MONMAPTOOL" "8" "January 12, 2014" "dev" "Ceph"
+.TH "MONMAPTOOL" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 monmaptool \- ceph monitor cluster map manipulation tool
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBmonmaptool\fP \fImapfilename\fP [ \-\-clobber ] [ \-\-print ] [ \-\-create ]
@@ -175,7 +148,7 @@ monmaptool \-\-rm mon.a \-\-add mon.a 192.168.0.9:6789 \-\-clobber monmap
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBmonmaptool\fP is part of Ceph, a massively scalable, open-source, distributed storage system.  Please
+\fBmonmaptool\fP is part of Ceph, a massively scalable, open\-source, distributed storage system.  Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/mount.ceph.8 b/man/mount.ceph.8
index 1159d91..c5de08a 100644
--- a/man/mount.ceph.8
+++ b/man/mount.ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MOUNT.CEPH" "8" "January 12, 2014" "dev" "Ceph"
+.TH "MOUNT.CEPH" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 mount.ceph \- mount a ceph file system
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBmount.ceph\fP \fImonaddr1\fP[,\fImonaddr2\fP,...]:/[\fIsubdir\fP] \fIdir\fP [
@@ -244,7 +217,7 @@ mount \-t ceph monhost:/ /mnt/foo
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBmount.ceph\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please
+\fBmount.ceph\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/osdmaptool.8 b/man/osdmaptool.8
index 99b7bac..7810e08 100644
--- a/man/osdmaptool.8
+++ b/man/osdmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "OSDMAPTOOL" "8" "January 12, 2014" "dev" "Ceph"
+.TH "OSDMAPTOOL" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 osdmaptool \- ceph osd cluster map manipulation tool
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBosdmaptool\fP \fImapfilename\fP [\-\-print] [\-\-createsimple \fInumosd\fP
@@ -126,7 +99,7 @@ osdmaptool \-\-print osdmap
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBosdmaptool\fP is part of Ceph, a massively scalable, open-source, distributed storage system.  Please
+\fBosdmaptool\fP is part of Ceph, a massively scalable, open\-source, distributed storage system.  Please
 refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
 information.
 .SH SEE ALSO
diff --git a/man/rados.8 b/man/rados.8
index 94b67c9..a9a0d96 100644
--- a/man/rados.8
+++ b/man/rados.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOS" "8" "May 29, 2014" "dev" "Ceph"
+.TH "RADOS" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 rados \- rados object storage utility
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBrados\fP [ \-m \fImonaddr\fP ] [ mkpool | rmpool \fIfoo\fP ] [ \-p | \-\-pool
@@ -104,6 +77,17 @@ Use ceph.conf configuration file instead of the default
 .B \-m monaddress[:port]
 Connect to specified monitor (instead of looking through ceph.conf).
 .UNINDENT
+.INDENT 0.0
+.TP
+.B \-b block_size
+Set the block size for put/get ops and for write benchmarking.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-striper
+Uses the striping API of rados rather than the default one.
+Available for stat, get, put, truncate, rm, ls and all xattr related operation
+.UNINDENT
 .SH GLOBAL COMMANDS
 .INDENT 0.0
 .TP
@@ -154,6 +138,7 @@ sequential or random. Before running one of the reading benchmarks,
 run a write benchmark with the \fI\-\-no\-cleanup\fP option. The default
 object size is 4 MB, and the default number of simulated threads
 (parallel writes) is 16.
+Note: \-b \fIobjsize\fP option is valid only in \fIwrite\fP mode.
 .UNINDENT
 .sp
 \fBcleanup\fP
@@ -256,7 +241,7 @@ rados \-p foo \-s mysnap get myobject blah.txt.old
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBrados\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBrados\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/radosgw-admin.8 b/man/radosgw-admin.8
index d465d8a..d3c33e1 100644
--- a/man/radosgw-admin.8
+++ b/man/radosgw-admin.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW-ADMIN" "8" "January 12, 2014" "dev" "Ceph"
+.TH "RADOSGW-ADMIN" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 radosgw-admin \- rados REST gateway user administration utility
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBradosgw\-admin\fP \fIcommand\fP [ \fIoptions\fP \fI\&...\fP ]
@@ -68,75 +41,207 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 allows creating and modifying users.
 .SH COMMANDS
 .sp
-\fIcommand\fP can be one of the following options:
+\fBradosgw\-admin\fP utility uses many commands for administration purpose
+which are as follows:
 .INDENT 0.0
 .TP
 .B \fBuser create\fP
-Create a new user
+Create a new user.
 .TP
 .B \fBuser modify\fP
-Modify a user
+Modify a user.
 .TP
 .B \fBuser info\fP
 Display information of a user, and any potentially available
-subusers and keys
+subusers and keys.
 .TP
 .B \fBuser rm\fP
-Remove a user
+Remove a user.
+.TP
+.B \fBuser suspend\fP
+Suspend a user.
+.TP
+.B \fBuser enable\fP
+Re\-enable user after suspension.
+.TP
+.B \fBuser check\fP
+Check user info.
+.TP
+.B \fBuser stats\fP
+Show user stats as accounted by quota subsystem.
+.TP
+.B \fBcaps add\fP
+Add user capabilities.
+.TP
+.B \fBcaps rm\fP
+Remove user capabilities.
 .TP
 .B \fBsubuser create\fP
-Create a new subuser (primarily useful for clients using the Swift API)
+Create a new subuser (primarily useful for clients using the Swift API).
 .TP
 .B \fBsubuser modify\fP
-Modify a subuser
+Modify a subuser.
 .TP
 .B \fBsubuser rm\fP
-Remove a subuser
+Remove a subuser.
+.TP
+.B \fBkey create\fP
+Create access key.
+.TP
+.B \fBkey rm\fP
+Remove access key.
 .TP
 .B \fBbucket list\fP
-List all buckets
+List all buckets.
+.TP
+.B \fBbucket link\fP
+Link bucket to specified user.
 .TP
 .B \fBbucket unlink\fP
-Remove a bucket
+Unlink bucket from specified user.
+.TP
+.B \fBbucket stats\fP
+Returns bucket statistics.
 .TP
 .B \fBbucket rm\fP
-Remove a bucket
+Remove a bucket.
+.TP
+.B \fBbucket check\fP
+Check bucket index.
 .TP
 .B \fBobject rm\fP
-Remove an object
+Remove an object.
 .TP
-.B \fBkey create\fP
-Create an access key
+.B \fBobject unlink\fP
+Unlink object from bucket index.
 .TP
-.B \fBkey rm\fP
-Remove an access key
+.B \fBquota set\fP
+Set quota params.
+.TP
+.B \fBquota enable\fP
+Enable quota.
+.TP
+.B \fBquota disable\fP
+Disable quota.
+.TP
+.B \fBregion get\fP
+Show region info.
+.TP
+.B \fBregions list\fP
+List all regions set on this cluster.
+.TP
+.B \fBregion set\fP
+Set region info (requires infile).
+.TP
+.B \fBregion default\fP
+Set default region.
+.TP
+.B \fBregion\-map get\fP
+Show region\-map.
+.TP
+.B \fBregion\-map set\fP
+Set region\-map (requires infile).
+.TP
+.B \fBzone get\fP
+Show zone cluster params.
+.TP
+.B \fBzone set\fP
+Set zone cluster params (requires infile).
+.TP
+.B \fBzone list\fP
+List all zones set on this cluster.
 .TP
 .B \fBpool add\fP
-Add an existing pool for data placement
+Add an existing pool for data placement.
 .TP
 .B \fBpool rm\fP
-Remove an existing pool from data placement set
+Remove an existing pool from data placement set.
 .TP
 .B \fBpools list\fP
-List placement active set
+List placement active set.
 .TP
 .B \fBpolicy\fP
-Display bucket/object policy
+Display bucket/object policy.
+.TP
+.B \fBlog list\fP
+List log objects.
 .TP
 .B \fBlog show\fP
-Show the log of a bucket (with a specified date)
+Dump a log from specific object or (bucket + date + bucket\-id).
+.TP
+.B \fBlog rm\fP
+Remove log object.
 .TP
 .B \fBusage show\fP
-Show the usage information (with optional user and date range)
+Show the usage information (with optional user and date range).
 .TP
 .B \fBusage trim\fP
-Trim usage information (with optional user and date range)
+Trim usage information (with optional user and date range).
+.TP
+.B \fBtemp remove\fP
+Remove temporary objects that were created up to specified date
+(and optional time).
+.TP
+.B \fBgc list\fP
+Dump expired garbage collection objects (specify \-\-include\-all to list all
+entries, including unexpired).
+.TP
+.B \fBgc process\fP
+Manually process garbage.
+.TP
+.B \fBmetadata get\fP
+Get metadata info.
+.TP
+.B \fBmetadata put\fP
+Put metadata info.
+.TP
+.B \fBmetadata rm\fP
+Remove metadata info.
+.TP
+.B \fBmetadata list\fP
+List metadata info.
+.TP
+.B \fBmdlog list\fP
+List metadata log.
+.TP
+.B \fBmdlog trim\fP
+Trim metadata log.
+.TP
+.B \fBbilog list\fP
+List bucket index log.
+.TP
+.B \fBbilog trim\fP
+Trim bucket index log (use start\-marker, end\-marker).
+.TP
+.B \fBdatalog list\fP
+List data log.
+.TP
+.B \fBdatalog trim\fP
+Trim data log.
+.TP
+.B \fBopstate list\fP
+List stateful operations entries (use client_id, op_id, object).
+.TP
+.B \fBopstate set\fP
+Set state on an entry (use client_id, op_id, object, state).
+.TP
+.B \fBopstate renew\fP
+Renew state on an entry (use client_id, op_id, object).
+.TP
+.B \fBopstate rm\fP
+Remove entry (use client_id, op_id, object).
+.TP
+.B \fBreplicalog get\fP
+Get replica metadata log entry.
+.TP
+.B \fBreplicalog delete\fP
+Delete replica metadata log entry.
 .UNINDENT
 .SH OPTIONS
 .INDENT 0.0
 .TP
 .B \-c ceph.conf, \-\-conf=ceph.conf
-Use \fIceph.conf\fP configuration file instead of the default
+Use \fBceph.conf\fP configuration file instead of the default
 \fB/etc/ceph/ceph.conf\fP to determine monitor addresses during
 startup.
 .UNINDENT
@@ -152,8 +257,13 @@ The radosgw user ID.
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-secret=secret
-The secret associated with a given key.
+.B \-\-subuser=<name>
+Name of the subuser.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-email=email
+The e\-mail address of the user.
 .UNINDENT
 .INDENT 0.0
 .TP
@@ -162,8 +272,38 @@ Configure the display name of the user.
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-email=email
-The e\-mail address of the user
+.B \-\-access\-key=<key>
+S3 access key.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-gen\-access\-key
+Generate random access key (for S3).
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-secret=secret
+The secret associated with a given key.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-gen\-secret
+Generate random secret key.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-key\-type=<type>
+Key type, options are: swift, S3.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-temp\-url\-key[\-2]=<key>
+Temporary url key.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-system
+Set the system flag on the user.
 .UNINDENT
 .INDENT 0.0
 .TP
@@ -178,37 +318,142 @@ Specify the object name.
 .INDENT 0.0
 .TP
 .B \-\-date=yyyy\-mm\-dd
-The date needed for some commands
+The date needed for some commands.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-start\-date=yyyy\-mm\-dd
-The start date needed for some commands
+The start date needed for some commands.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-end\-date=yyyy\-mm\-dd
-The end date needed for some commands
+The end date needed for some commands.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-shard\-id=<shard\-id>
+Optional for mdlog list. Required for \fBmdlog trim\fP,
+\fBreplica mdlog get/delete\fP, \fBreplica datalog get/delete\fP\&.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-auth\-uid=auid
-The librados auid
+The librados auid.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-purge\-data
-Remove user data before user removal
+Remove user data before user removal.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-purge\-keys
+When specified, subuser removal will also purge all the subuser keys.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-purge\-objects
-Remove all objects before bucket removal
+Remove all objects before bucket removal.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-metadata\-key=<key>
+Key to retrieve metadata from with \fBmetadata get\fP\&.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-rgw\-region=<region>
+Region in which radosgw is running.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-rgw\-zone=<zone>
+Zone in which radosgw is running.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-fix
+Besides checking bucket index, will also fix it.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-check\-objects
+bucket check: Rebuilds bucket index according to actual objects state.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-format=<format>
+Specify output format for certain operations: xml, json.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-sync\-stats
+Option to \(aquser stats\(aq, update user stats with current stats reported by
+user\(aqs buckets indexes.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-show\-log\-entries=<flag>
+Enable/disable dump of log entries on log show.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-show\-log\-sum=<flag>
+Enable/disable dump of log summation on log show.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-skip\-zero\-entries
+Log show only dumps entries that don\(aqt have zero value in one of the numeric
+fields.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-infile
+Specify a file to read in when setting data.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-state=<state string>
+Specify a state for the opstate set command.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-replica\-log\-type
+Replica log type (metadata, data, bucket), required for replica log
+operations.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-categories=<list>
+Comma separated list of categories, used in usage show.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-caps=<caps>
+List of caps (e.g., "usage=read, write; user=read").
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-yes\-i\-really\-mean\-it
+Required for certain operations.
+.UNINDENT
+.SH QUOTA OPTIONS
+.INDENT 0.0
+.TP
+.B \-\-max\-objects
+Specify max objects (negative value to disable).
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-max\-size
+Specify max size (in bytes, negative value to disable).
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-lazy\-remove
-Defer removal of object tail
+.B \-\-quota\-scope
+Scope of quota (bucket, user).
 .UNINDENT
 .SH EXAMPLES
 .sp
@@ -277,7 +522,7 @@ Show the logs of a bucket from April 1st, 2012:
 .sp
 .nf
 .ft C
-$ radosgw\-admin log show \-\-bucket=foo \-\-date=2012=04\-01
+$ radosgw\-admin log show \-\-bucket=foo \-\-date=2012\-04\-01
 .ft P
 .fi
 .UNINDENT
@@ -321,12 +566,13 @@ $ radosgw\-admin usage trim \-\-uid=johnny \-\-end\-date=2012\-04\-01
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBradosgw\-admin\fP is part of Ceph, a massively scalable, open-source, distributed storage system.  Please
-refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
-information.
+\fBradosgw\-admin\fP is part of Ceph, a massively scalable, open\-source,
+distributed storage system.  Please refer to the Ceph documentation at
+\fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
 \fBceph\fP(8)
+\fBradosgw\fP(8)
 .SH COPYRIGHT
 2010-2014, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA
 .\" Generated by docutils manpage writer.
diff --git a/man/radosgw.8 b/man/radosgw.8
index 41e348b..2f077ad 100644
--- a/man/radosgw.8
+++ b/man/radosgw.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW" "8" "January 12, 2014" "dev" "Ceph"
+.TH "RADOSGW" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 radosgw \- rados REST gateway
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBradosgw\fP
@@ -72,91 +45,219 @@ capable web server.
 .INDENT 0.0
 .TP
 .B \-c ceph.conf, \-\-conf=ceph.conf
-Use \fIceph.conf\fP configuration file instead of the default
+Use \fBceph.conf\fP configuration file instead of the default
 \fB/etc/ceph/ceph.conf\fP to determine monitor addresses during startup.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-m monaddress[:port]
-Connect to specified monitor (instead of looking through
-\fBceph.conf\fP).
+Connect to specified monitor (instead of looking through \fBceph.conf\fP).
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-i ID, \-\-id ID
+Set the ID portion of name for radosgw
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-n TYPE.ID, \-\-name TYPE.ID
+Set the rados user name for the gateway (e.g. client.radosgw.gateway)
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-cluster NAME
+Set the cluster name (default: ceph)
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-d
+Run in foreground, log to stderr
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-f
+Run in foreground, log to usual location
 .UNINDENT
 .INDENT 0.0
 .TP
 .B \-\-rgw\-socket\-path=path
 Specify a unix domain socket path.
 .UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-rgw\-region=region
+The region where radosgw runs
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-rgw\-zone=zone
+The zone where radosgw runs
+.UNINDENT
 .SH CONFIGURATION
 .sp
-Currently it\(aqs the easiest to use the RADOS Gateway with Apache and mod_fastcgi:
+Earlier RADOS Gateway had to be configured with \fBApache\fP and \fBmod_fastcgi\fP\&.
+Now, \fBmod_proxy_fcgi\fP module is used instead of \fBmod_fastcgi\fP\&.
+\fBmod_proxy_fcgi\fP works differently than a traditional FastCGI module. This
+module requires the service of \fBmod_proxy\fP which provides support for the
+FastCGI protocol. So, to be able to handle FastCGI protocol, both \fBmod_proxy\fP
+and \fBmod_proxy_fcgi\fP have to be present in the server. Unlike \fBmod_fastcgi\fP,
+\fBmod_proxy_fcgi\fP cannot start the application process. Some platforms have
+\fBfcgistarter\fP for that purpose. However, external launching of application
+or process management may be available in the FastCGI application framework
+in use.
+.sp
+\fBApache\fP can be configured in a way that enables \fBmod_proxy_fcgi\fP to be used
+with localhost tcp or through unix domain socket. \fBmod_proxy_fcgi\fP that doesn\(aqt
+support unix domain socket such as the ones in Apache 2.2 and earlier versions of
+Apache 2.4, needs to be configured for use with localhost tcp. Later versions of
+Apache like Apache 2.4.9 or later support unix domain socket and as such they
+allow for the configuration with unix domain socket instead of localhost tcp.
+.sp
+The following steps show the configuration in Ceph\(aqs configuration file i.e.,
+\fB/etc/ceph/ceph.conf\fP and the gateway configuration file i.e.,
+\fB/etc/httpd/conf.d/rgw.conf\fP (RPM\-based distros) or
+\fB/etc/apache2/conf\-available/rgw.conf\fP (Debian\-based distros) with localhost
+tcp and through unix domain socket:
 .INDENT 0.0
+.IP 1. 3
+For distros with Apache 2.2 and early versions of Apache 2.4 that use
+localhost TCP and do not support Unix Domain Socket, append the following
+contents to \fB/etc/ceph/ceph.conf\fP:
+.INDENT 3.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+[client.radosgw.gateway]
+host = {hostname}
+keyring = /etc/ceph/ceph.client.radosgw.keyring
+rgw socket path = ""
+log file = /var/log/ceph/client.radosgw.gateway.log
+rgw frontends = fastcgi socket_port=9000 socket_host=0.0.0.0
+rgw print continue = false
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.IP 2. 3
+Add the following content in the gateway configuration file:
+.sp
+For Debian/Ubuntu add in \fB/etc/apache2/conf\-available/rgw.conf\fP:
+.INDENT 3.0
 .INDENT 3.5
 .sp
 .nf
 .ft C
-FastCgiExternalServer /var/www/s3gw.fcgi \-socket /tmp/radosgw.sock
-
 <VirtualHost *:80>
-  ServerName rgw.example1.com
-  ServerAlias rgw
-  ServerAdmin webmaster at example1.com
-  DocumentRoot /var/www
-
-  RewriteEngine On
-  RewriteRule ^/([a\-zA\-Z0\-9\-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
-
-  <IfModule mod_fastcgi.c>
-    <Directory /var/www>
-      Options +ExecCGI
-      AllowOverride All
-      SetHandler fastcgi\-script
-      Order allow,deny
-      Allow from all
-      AuthBasicAuthoritative Off
-    </Directory>
-  </IfModule>
-
-  AllowEncodedSlashes On
-  ServerSignature Off
+ServerName localhost
+DocumentRoot /var/www/html
+
+ErrorLog /var/log/apache2/rgw_error.log
+CustomLog /var/log/apache2/rgw_access.log combined
+
+# LogLevel debug
+
+RewriteEngine On
+
+RewriteRule .* \- [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+
+SetEnv proxy\-nokeepalive 1
+
+ProxyPass / fcgi://localhost:9000/
+
 </VirtualHost>
 .ft P
 .fi
 .UNINDENT
 .UNINDENT
 .sp
-And the corresponding radosgw script (/var/www/s3gw.fcgi):
-.INDENT 0.0
+For CentOS/RHEL add in \fB/etc/httpd/conf.d/rgw.conf\fP:
+.INDENT 3.0
 .INDENT 3.5
 .sp
 .nf
 .ft C
-#!/bin/sh
-exec /usr/bin/radosgw \-c /etc/ceph/ceph.conf \-n client.radosgw.gateway
+<VirtualHost *:80>
+ServerName localhost
+DocumentRoot /var/www/html
+
+ErrorLog /var/log/httpd/rgw_error.log
+CustomLog /var/log/httpd/rgw_access.log combined
+
+# LogLevel debug
+
+RewriteEngine On
+
+RewriteRule .* \- [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+
+SetEnv proxy\-nokeepalive 1
+
+ProxyPass / fcgi://localhost:9000/
+
+</VirtualHost>
 .ft P
 .fi
 .UNINDENT
 .UNINDENT
-.sp
-The radosgw daemon is a standalone process which needs a configuration
-section in the ceph.conf The section name should start with
-\(aqclient.radosgw.\(aq as specified in /etc/init.d/radosgw:
-.INDENT 0.0
+.IP 3. 3
+For distros with Apache 2.4.9 or later that support Unix Domain Socket,
+append the following configuration to \fB/etc/ceph/ceph.conf\fP:
+.INDENT 3.0
 .INDENT 3.5
 .sp
 .nf
 .ft C
 [client.radosgw.gateway]
-    host = gateway
-    keyring = /etc/ceph/keyring.radosgw.gateway
-    rgw socket path = /tmp/radosgw.sock
+host = {hostname}
+keyring = /etc/ceph/ceph.client.radosgw.keyring
+rgw socket path = /var/run/ceph/ceph.radosgw.gateway.fastcgi.sock
+log file = /var/log/ceph/client.radosgw.gateway.log
+rgw print continue = false
 .ft P
 .fi
 .UNINDENT
 .UNINDENT
+.IP 4. 3
+Add the following content in the gateway configuration file:
 .sp
-You will also have to generate a key for the radosgw to use for
-authentication with the cluster:
-.INDENT 0.0
+For CentOS/RHEL add in \fB/etc/httpd/conf.d/rgw.conf\fP:
+.INDENT 3.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+<VirtualHost *:80>
+ServerName localhost
+DocumentRoot /var/www/html
+
+ErrorLog /var/log/httpd/rgw_error.log
+CustomLog /var/log/httpd/rgw_access.log combined
+
+# LogLevel debug
+
+RewriteEngine On
+
+RewriteRule .* \- [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+
+SetEnv proxy\-nokeepalive 1
+
+ProxyPass / unix:///var/run/ceph/ceph.radosgw.gateway.fastcgi.sock|fcgi://localhost:9000/
+
+</VirtualHost>
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+The latest version of Ubuntu i.e., 14.04 ships with \fBApache 2.4.7\fP that
+does not have Unix Domain Socket support in it and as such it has to be
+configured with localhost tcp. The Unix Domain Socket support is available in
+\fBApache 2.4.9\fP and later versions. A bug has been filed to backport the UDS
+support to \fBApache 2.4.7\fP for \fBUbuntu 14.04\fP\&.
+See: \fI\%https://bugs.launchpad.net/ubuntu/+source/apache2/+bug/1411030\fP
+.IP 5. 3
+Generate a key for radosgw to use for authentication with the cluster.
+.INDENT 3.0
 .INDENT 3.5
 .sp
 .nf
@@ -167,9 +268,9 @@ ceph\-authtool \-n client.radosgw.gateway \-\-cap mon \(aqallow rw\(aq \-\-cap o
 .fi
 .UNINDENT
 .UNINDENT
-.sp
-And add the key to the auth entries:
-.INDENT 0.0
+.IP 6. 3
+Add the key to the auth entries.
+.INDENT 3.0
 .INDENT 3.5
 .sp
 .nf
@@ -179,22 +280,38 @@ ceph auth add client.radosgw.gateway \-\-in\-file=keyring.radosgw.gateway
 .fi
 .UNINDENT
 .UNINDENT
+.IP 7. 3
+Start Apache and radosgw.
 .sp
-Now you can start Apache and the radosgw daemon:
-.INDENT 0.0
+Debian/Ubuntu:
+.INDENT 3.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+sudo /etc/init.d/apache2 start
+sudo /etc/init.d/radosgw start
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+CentOS/RHEL:
+.INDENT 3.0
 .INDENT 3.5
 .sp
 .nf
 .ft C
-/etc/init.d/apache2 start
-/etc/init.d/radosgw start
+sudo apachectl start
+sudo /etc/init.d/ceph\-radosgw start
 .ft P
 .fi
 .UNINDENT
 .UNINDENT
+.UNINDENT
 .SH USAGE LOGGING
 .sp
-The \fBradosgw\fP maintains an asynchronous usage log. It accumulates
+\fBradosgw\fP maintains an asynchronous usage log. It accumulates
 statistics about user operations and flushes it periodically. The
 logs can be accessed and managed through \fBradosgw\-admin\fP\&.
 .sp
@@ -229,9 +346,9 @@ threshold specify how many entries can be kept before resorting to
 synchronous flush.
 .SH AVAILABILITY
 .sp
-\fBradosgw\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer
-to the Ceph documentation at \fI\%http://ceph.com/docs\fP for more
-information.
+\fBradosgw\fP is part of Ceph, a massively scalable, open\-source, distributed
+storage system. Please refer to the Ceph documentation at \fI\%http://ceph.com/docs\fP for
+more information.
 .SH SEE ALSO
 .sp
 \fBceph\fP(8)
diff --git a/man/rbd-fuse.8 b/man/rbd-fuse.8
index 8e23798..e6a1a6d 100644
--- a/man/rbd-fuse.8
+++ b/man/rbd-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-FUSE" "8" "January 12, 2014" "dev" "Ceph"
+.TH "RBD-FUSE" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 rbd-fuse \- expose rbd images as files
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBrbd\-fuse\fP [ \-p pool ] [\-c conffile] \fImountpoint\fP [ \fIfuse options\fP ]
@@ -98,7 +71,7 @@ Use \fIpool\fP as the pool to search for rbd images.  Default is \fBrbd\fP\&.
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBrbd\-fuse\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBrbd\-fuse\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/rbd-replay-many.8 b/man/rbd-replay-many.8
index 1d6c0bf..929d92d 100644
--- a/man/rbd-replay-many.8
+++ b/man/rbd-replay-many.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-MANY" "8" "September 04, 2014" "dev" "Ceph"
+.TH "RBD-REPLAY-MANY" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 rbd-replay-many \- replay a rados block device (RBD) workload on several clients
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBrbd\-replay\-many\fP [ \fIoptions\fP ] \-\-original\-image \fIname\fP \fIhost1\fP [ \fIhost2\fP [ ... ] ] \-\- \fIrbd_replay_args\fP
@@ -122,7 +95,7 @@ ssh host\-1 \(aqrbd\-replay\(aq \-\-map\-image \(aqimage=image\-1\(aq \-c ceph.c
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBrbd\-replay\-many\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBrbd\-replay\-many\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/rbd-replay-prep.8 b/man/rbd-replay-prep.8
index 13ab33a..102c2b2 100644
--- a/man/rbd-replay-prep.8
+++ b/man/rbd-replay-prep.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-PREP" "8" "August 21, 2014" "dev" "Ceph"
+.TH "RBD-REPLAY-PREP" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 rbd-replay-prep \- prepare captured rados block device (RBD) workloads for replay
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBrbd\-replay\-prep\fP [ \-\-window \fIseconds\fP ] [ \-\-anonymize ] \fItrace_dir\fP \fIreplay_file\fP
@@ -76,6 +49,11 @@ Requests further apart than \(aqseconds\(aq seconds are assumed to be independen
 .B \-\-anonymize
 Anonymizes image and snap names.
 .UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-verbose
+Print all processed events to console
+.UNINDENT
 .SH EXAMPLES
 .sp
 To prepare workload1\-trace for replay:
@@ -91,7 +69,7 @@ rbd\-replay\-prep workload1\-trace/ust/uid/1000/64\-bit workload1
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBrbd\-replay\-prep\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBrbd\-replay\-prep\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/rbd-replay.8 b/man/rbd-replay.8
index cefb925..5368d18 100644
--- a/man/rbd-replay.8
+++ b/man/rbd-replay.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY" "8" "September 10, 2014" "dev" "Ceph"
+.TH "RBD-REPLAY" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 rbd-replay \- replay rados block device (RBD) workloads
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBrbd\-replay\fP [ \fIoptions\fP ] \fIreplay_file\fP
@@ -129,7 +102,7 @@ rbd\-replay \-\-map\-image=prod_image=test_image workload1
 .UNINDENT
 .SH AVAILABILITY
 .sp
-\fBrbd\-replay\fP is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
+\fBrbd\-replay\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
 the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information.
 .SH SEE ALSO
 .sp
diff --git a/man/rbd.8 b/man/rbd.8
index 4185dfc..60fe011 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD" "8" "March 12, 2015" "dev" "Ceph"
+.TH "RBD" "8" "November 03, 2015" "dev" "Ceph"
 .SH NAME
 rbd \- manage rados block device (RBD) images
 .
@@ -30,33 +30,6 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.
-.nr rst2man-indent-level 0
-.
-.de1 rstReportMargin
-\\$1 \\n[an-margin]
-level \\n[rst2man-indent-level]
-level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
--
-\\n[rst2man-indent0]
-\\n[rst2man-indent1]
-\\n[rst2man-indent2]
-..
-.de1 INDENT
-.\" .rstReportMargin pre:
-. RS \\$1
-. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
-. nr rst2man-indent-level +1
-.\" .rstReportMargin post:
-..
-.de UNINDENT
-. RE
-.\" indent \\n[an-margin]
-.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.nr rst2man-indent-level -1
-.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
-.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
-..
 .SH SYNOPSIS
 .nf
 \fBrbd\fP [ \-c \fIceph.conf\fP ] [ \-m \fImonaddr\fP ] [ \-p | \-\-pool \fIpool\fP ] [
@@ -84,7 +57,7 @@ Connect to specified monitor (instead of looking through ceph.conf).
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-p pool, \-\-pool pool
+.B \-p pool\-name, \-\-pool pool\-name
 Interact with the given pool. Required by most commands.
 .UNINDENT
 .INDENT 0.0
@@ -96,7 +69,7 @@ default for some commands).
 .SH PARAMETERS
 .INDENT 0.0
 .TP
-.B \-\-image\-format format
+.B \-\-image\-format format\-id
 Specifies which object layout to use. The default is 1.
 .INDENT 7.0
 .IP \(bu 2
@@ -112,8 +85,8 @@ features in the future.
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-size size\-in\-mb
-Specifies the size (in megabytes) of the new rbd image.
+.B \-\-size size\-in\-M/G/T
+Specifies the size (in M/G/T) of the new rbd image.
 .UNINDENT
 .INDENT 0.0
 .TP
@@ -123,8 +96,8 @@ the object size is \fB1 << order\fP\&. The default is 22 (4 MB).
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-stripe\-unit size\-in\-bytes
-Specifies the stripe unit size in bytes.  See striping section (below) for more details.
+.B \-\-stripe\-unit size\-in\-B/K/M
+Specifies the stripe unit size in B/K/M.  See striping section (below) for more details.
 .UNINDENT
 .INDENT 0.0
 .TP
@@ -144,12 +117,6 @@ Specifies the username (without the \fBclient.\fP prefix) to use with the map co
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-keyfile filename
-Specifies a file containing the secret to use with the map command.
-If not specified, \fBclient.admin\fP will be used by default.
-.UNINDENT
-.INDENT 0.0
-.TP
 .B \-\-keyring filename
 Specifies a keyring file containing a secret for the specified user
 to use with the map command.  If not specified, the default keyring
@@ -157,7 +124,13 @@ locations will be searched.
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-shared tag
+.B \-\-keyfile filename
+Specifies a file containing the secret key of \fB\-\-id user\fP to use with the map command.
+This option is overridden by \fB\-\-keyring\fP if the latter is also specified.
+.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-shared lock\-tag
 Option for \fIlock add\fP that allows multiple clients to lock the
 same image if they use the same tag. The tag is an arbitrary
 string. This is useful for situations where an image must
@@ -189,15 +162,24 @@ Map the image read\-only.  Equivalent to \-o ro.
 .UNINDENT
 .INDENT 0.0
 .TP
-.B \-\-image\-features features
-Specifies which RBD format 2 features are to be enabled when creating
-an image. The numbers from the desired features below should be added
-to compute the parameter value:
-.sp
-+1: layering support
-+2: striping v2 support
-+4: exclusive locking support
-+8: object map support
+.B \-\-image\-feature feature\-name
+Specifies which RBD format 2 feature should be enabled when creating
+an image. Multiple features can be enabled by repeating this option
+multiple times. The following features are supported:
+.INDENT 7.0
+.IP \(bu 2
+layering: layering support
+.IP \(bu 2
+striping: striping v2 support
+.IP \(bu 2
+exclusive\-lock: exclusive locking support
+.IP \(bu 2
+object\-map: object map support (requires exclusive\-lock)
+.IP \(bu 2
+fast\-diff: fast diff calculations (requires object\-map)
+.IP \(bu 2
+deep\-flatten: snapshot flatten support
+.UNINDENT
 .UNINDENT
 .INDENT 0.0
 .TP
@@ -206,24 +188,42 @@ Specifies that the image will be used concurrently by multiple clients.
 This will disable features that are dependent upon exclusive ownership
 of the image.
 .UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-whole\-object
+Specifies that the diff should be limited to the extents of a full object
+instead of showing intra\-object deltas. When the object map feature is
+enabled on an image, limiting the diff to the object extents will
+dramatically improve performance since the differences can be computed
+by examining the in\-memory object map instead of querying RADOS for each
+object within the image.
+.UNINDENT
 .SH COMMANDS
 .INDENT 0.0
 .TP
-.B \fBls\fP [\-l | \-\-long] [pool\-name]
+.B \fBls\fP [\-l | \-\-long] [\fIpool\-name\fP]
 Will list all rbd images listed in the rbd_directory object.  With
 \-l, also show snapshots, and use longer\-format output including
 size, parent (if clone), format, etc.
 .TP
-.B \fBinfo\fP [\fIimage\-name\fP]
+.B \fBdu\fP [\-p | \-\-pool \fIpool\-name\fP] [\fIimage\-spec\fP | \fIsnap\-spec\fP]
+Will calculate the provisioned and actual disk usage of all images and
+associated snapshots within the specified pool.  It can also be used against
+individual images and snapshots.
+.sp
+If the RBD fast\-diff feature isn\(aqt enabled on images, this operation will
+require querying the OSDs for every potential object within the image.
+.TP
+.B \fBinfo\fP \fIimage\-spec\fP | \fIsnap\-spec\fP
 Will dump information (such as size and order) about a specific rbd image.
 If image is a clone, information about its parent is also displayed.
 If a snapshot is specified, whether it is protected is shown as well.
 .TP
-.B \fBcreate\fP [\fIimage\-name\fP]
+.B \fBcreate\fP (\-s | \-\-size \fIsize\-in\-M/G/T\fP) [\-\-image\-format \fIformat\-id\fP] [\-\-order \fIbits\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP]... [\-\-image\-shared] \fIimage\-spec\fP
 Will create a new rbd image. You must also specify the size via \-\-size.  The
 \-\-stripe\-unit and \-\-stripe\-count arguments are optional, but must be used together.
 .TP
-.B \fBclone\fP [\fIparent\-snapname\fP] [\fIimage\-name\fP]
+.B \fBclone\fP [\-\-order \fIbits\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP] [\-\-image\-shared] \fIparent\-snap\-spec\fP \fIchild\-image\-spec\fP
 Will create a clone (copy\-on\-write child) of the parent snapshot.
 Object order will be identical to that of the parent image unless
 specified. Size will be the same as the parent snapshot. The \-\-stripe\-unit
@@ -232,7 +232,7 @@ and \-\-stripe\-count arguments are optional, but must be used together.
 The parent snapshot must be protected (see \fIrbd snap protect\fP).
 This requires image format 2.
 .TP
-.B \fBflatten\fP [\fIimage\-name\fP]
+.B \fBflatten\fP \fIimage\-spec\fP
 If image is a clone, copy all shared blocks from the parent snapshot and
 make the child independent of the parent, severing the link between
 parent snap and child.  The parent snapshot can be unprotected and
@@ -240,24 +240,24 @@ deleted if it has no further dependent clones.
 .sp
 This requires image format 2.
 .TP
-.B \fBchildren\fP [\fIimage\-name\fP]
+.B \fBchildren\fP \fIsnap\-spec\fP
 List the clones of the image at the given snapshot. This checks
 every pool, and outputs the resulting poolname/imagename.
 .sp
 This requires image format 2.
 .TP
-.B \fBresize\fP [\fIimage\-name\fP] [\-\-allow\-shrink]
+.B \fBresize\fP (\-s | \-\-size \fIsize\-in\-M/G/T\fP) [\-\-allow\-shrink] \fIimage\-spec\fP
 Resizes rbd image. The size parameter also needs to be specified.
 The \-\-allow\-shrink option lets the size be reduced.
 .TP
-.B \fBrm\fP [\fIimage\-name\fP]
+.B \fBrm\fP \fIimage\-spec\fP
 Deletes an rbd image (including all data blocks). If the image has
 snapshots, this fails and nothing is deleted.
 .TP
-.B \fBexport\fP [\fIimage\-name\fP] [\fIdest\-path\fP]
+.B \fBexport\fP (\fIimage\-spec\fP | \fIsnap\-spec\fP) [\fIdest\-path\fP]
 Exports image to dest path (use \- for stdout).
 .TP
-.B \fBimport\fP [\fIpath\fP] [\fIdest\-image\fP]
+.B \fBimport\fP [\-\-image\-format \fIformat\-id\fP] [\-\-order \fIbits\fP] [\-\-stripe\-unit \fIsize\-in\-B/K/M\fP \-\-stripe\-count \fInum\fP] [\-\-image\-feature \fIfeature\-name\fP]... [\-\-image\-shared] \fIsrc\-path\fP [\fIimage\-spec\fP]
 Creates a new image and imports its data from path (use \- for
 stdin).  The import operation will try to create sparse rbd images
 if possible.  For import from stdin, the sparsification unit is
@@ -266,7 +266,7 @@ the data block size of the destination image (1 << order).
 The \-\-stripe\-unit and \-\-stripe\-count arguments are optional, but must be
 used together.
 .TP
-.B \fBexport\-diff\fP [\fIimage\-name\fP] [\fIdest\-path\fP] [\-\-from\-snap \fIsnapname\fP]
+.B \fBexport\-diff\fP [\-\-from\-snap \fIsnap\-name\fP] [\-\-whole\-object] (\fIimage\-spec\fP | \fIsnap\-spec\fP) \fIdest\-path\fP
 Exports an incremental diff for an image to dest path (use \- for stdout).  If
 an initial snapshot is specified, only changes since that snapshot are included; otherwise,
 any regions of the image that contain data are included.  The end snapshot is specified
@@ -274,7 +274,7 @@ using the standard \-\-snap option or @snap syntax (see below).  The image diff
 metadata about image size changes, and the start and end snapshots.  It efficiently represents
 discarded or \(aqzero\(aq regions of the image.
 .TP
-.B \fBmerge\-diff\fP [\fIfirst\-diff\-path\fP] [\fIsecond\-diff\-path\fP] [\fImerged\-diff\-path\fP]
+.B \fBmerge\-diff\fP \fIfirst\-diff\-path\fP \fIsecond\-diff\-path\fP \fImerged\-diff\-path\fP
 Merge two continuous incremental diffs of an image into one single diff. The
 first diff\(aqs end snapshot must be equal with the second diff\(aqs start snapshot.
 The first diff could be \- for stdin, and merged diff could be \- for stdout, which
@@ -282,42 +282,59 @@ enables multiple diff files to be merged using something like
 \(aqrbd merge\-diff first second \- | rbd merge\-diff \- third result\(aq. Note this command
 currently only support the source incremental diff with stripe_count == 1
 .TP
-.B \fBimport\-diff\fP [\fIsrc\-path\fP] [\fIimage\-name\fP]
+.B \fBimport\-diff\fP \fIsrc\-path\fP \fIimage\-spec\fP
 Imports an incremental diff of an image and applies it to the current image.  If the diff
 was generated relative to a start snapshot, we verify that snapshot already exists before
 continuing.  If there was an end snapshot we verify it does not already exist before
 applying the changes, and create the snapshot when we are done.
 .TP
-.B \fBdiff\fP [\fIimage\-name\fP] [\-\-from\-snap \fIsnapname\fP]
+.B \fBdiff\fP [\-\-from\-snap \fIsnap\-name\fP] [\-\-whole\-object] \fIimage\-spec\fP | \fIsnap\-spec\fP
 Dump a list of byte extents in the image that have changed since the specified start
 snapshot, or since the image was created.  Each output line includes the starting offset
 (in bytes), the length of the region (in bytes), and either \(aqzero\(aq or \(aqdata\(aq to indicate
 whether the region is known to be zeros or may contain other data.
 .TP
-.B \fBcp\fP [\fIsrc\-image\fP] [\fIdest\-image\fP]
+.B \fBcp\fP (\fIsrc\-image\-spec\fP | \fIsrc\-snap\-spec\fP) \fIdest\-image\-spec\fP
 Copies the content of a src\-image into the newly created dest\-image.
 dest\-image will have the same size, order, and image format as src\-image.
 .TP
-.B \fBmv\fP [\fIsrc\-image\fP] [\fIdest\-image\fP]
+.B \fBmv\fP \fIsrc\-image\-spec\fP \fIdest\-image\-spec\fP
 Renames an image.  Note: rename across pools is not supported.
 .TP
-.B \fBsnap\fP ls [\fIimage\-name\fP]
+.B \fBimage\-meta list\fP \fIimage\-spec\fP
+Show metadata held on the image. The first column is the key
+and the second column is the value.
+.TP
+.B \fBimage\-meta get\fP \fIimage\-spec\fP \fIkey\fP
+Get metadata value with the key.
+.TP
+.B \fBimage\-meta set\fP \fIimage\-spec\fP \fIkey\fP \fIvalue\fP
+Set metadata key with the value. They will displayed in \fIimage\-meta list\fP\&.
+.TP
+.B \fBimage\-meta remove\fP \fIimage\-spec\fP \fIkey\fP
+Remove metadata key with the value.
+.TP
+.B \fBobject\-map rebuild\fP \fIimage\-spec\fP | \fIsnap\-spec\fP
+Rebuilds an invalid object map for the specified image. An image snapshot can be
+specified to rebuild an invalid object map for a snapshot.
+.TP
+.B \fBsnap ls\fP \fIimage\-spec\fP
 Dumps the list of snapshots inside a specific image.
 .TP
-.B \fBsnap\fP create [\fIimage\-name\fP]
+.B \fBsnap create\fP \fIsnap\-spec\fP
 Creates a new snapshot. Requires the snapshot name parameter specified.
 .TP
-.B \fBsnap\fP rollback [\fIimage\-name\fP]
+.B \fBsnap rollback\fP \fIsnap\-spec\fP
 Rollback image content to snapshot. This will iterate through the entire blocks
 array and update the data head content to the snapshotted version.
 .TP
-.B \fBsnap\fP rm [\fIimage\-name\fP]
+.B \fBsnap rm\fP \fIsnap\-spec\fP
 Removes the specified snapshot.
 .TP
-.B \fBsnap\fP purge [\fIimage\-name\fP]
+.B \fBsnap purge\fP \fIimage\-spec\fP
 Removes all snapshots from an image.
 .TP
-.B \fBsnap\fP protect [\fIimage\-name\fP]
+.B \fBsnap protect\fP \fIsnap\-spec\fP
 Protect a snapshot from deletion, so that clones can be made of it
 (see \fIrbd clone\fP).  Snapshots must be protected before clones are made;
 protection implies that there exist dependent cloned children that
@@ -326,30 +343,38 @@ snapshot.
 .sp
 This requires image format 2.
 .TP
-.B \fBsnap\fP unprotect [\fIimage\-name\fP]
+.B \fBsnap unprotect\fP \fIsnap\-spec\fP
 Unprotect a snapshot from deletion (undo \fIsnap protect\fP).  If cloned
 children remain, \fIsnap unprotect\fP fails.  (Note that clones may exist
 in different pools than the parent snapshot.)
 .sp
 This requires image format 2.
 .TP
-.B \fBmap\fP [\fIimage\-name\fP] [\-o | \-\-options \fImap\-options\fP ] [\-\-read\-only]
+.B \fBmap\fP [\-o | \-\-options \fImap\-options\fP ] [\-\-read\-only] \fIimage\-spec\fP | \fIsnap\-spec\fP
 Maps the specified image to a block device via the rbd kernel module.
 .TP
-.B \fBunmap\fP [\fIdevice\-path\fP]
+.B \fBunmap\fP \fIimage\-spec\fP | \fIsnap\-spec\fP | \fIdevice\-path\fP
 Unmaps the block device that was mapped via the rbd kernel module.
 .TP
 .B \fBshowmapped\fP
 Show the rbd images that are mapped via the rbd kernel module.
 .TP
-.B \fBstatus\fP [\fIimage\-name\fP]
+.B \fBstatus\fP \fIimage\-spec\fP
 Show the status of the image, including which clients have it open.
 .TP
-.B \fBlock\fP list [\fIimage\-name\fP]
+.B \fBfeature disable\fP \fIimage\-spec\fP \fIfeature\-name\fP\&...
+Disables the specified feature on the specified image. Multiple features can
+be specified.
+.TP
+.B \fBfeature enable\fP \fIimage\-spec\fP \fIfeature\-name\fP\&...
+Enables the specified feature on the specified image. Multiple features can
+be specified.
+.TP
+.B \fBlock list\fP \fIimage\-spec\fP
 Show locks held on the image. The first column is the locker
 to use with the \fIlock remove\fP command.
 .TP
-.B \fBlock\fP add [\fIimage\-name\fP] [\fIlock\-id\fP]
+.B \fBlock add\fP [\-\-shared \fIlock\-tag\fP] \fIimage\-spec\fP \fIlock\-id\fP
 Lock an image. The lock\-id is an arbitrary name for the user\(aqs
 convenience. By default, this is an exclusive lock, meaning it
 will fail if the image is already locked. The \-\-shared option
@@ -357,32 +382,27 @@ changes this behavior. Note that locking does not affect
 any operation other than adding a lock. It does not
 protect an image from being deleted.
 .TP
-.B \fBlock\fP remove [\fIimage\-name\fP] [\fIlock\-id\fP] [\fIlocker\fP]
+.B \fBlock remove\fP \fIimage\-spec\fP \fIlock\-id\fP \fIlocker\fP
 Release a lock on an image. The lock id and locker are
 as output by lock ls.
 .TP
-.B \fBbench\-write\fP [\fIimage\-name\fP] \-\-io\-size [\fIio\-size\-in\-bytes\fP] \-\-io\-threads [\fInum\-ios\-in\-flight\fP] \-\-io\-total [\fItotal\-bytes\-to\-write\fP]
-Generate a series of sequential writes to the image and measure the
-write throughput and latency.  Defaults are: \-\-io\-size 4096, \-\-io\-threads 16,
-\-\-io\-total 1GB
+.B \fBbench\-write\fP [\-\-io\-size \fIsize\-in\-B/K/M/G/T\fP] [\-\-io\-threads \fInum\-ios\-in\-flight\fP] [\-\-io\-total \fItotal\-size\-to\-write\-in\-B/K/M/G/T\fP] [\-\-io\-pattern seq | rand] \fIimage\-spec\fP
+Generate a series of writes to the image and measure the write throughput and
+latency.  Defaults are: \-\-io\-size 4096, \-\-io\-threads 16, \-\-io\-total 1G,
+\-\-io\-pattern seq.
 .UNINDENT
-.SH IMAGE NAME
-.sp
-In addition to using the \-\-pool and the \-\-snap options, the image name can include both
-the pool name and the snapshot name. The image name format is as follows:
-.INDENT 0.0
-.INDENT 3.5
-.sp
+.SH IMAGE AND SNAP SPECS
 .nf
-.ft C
-[pool/]image\-name[@snap]
-.ft P
+\fIimage\-spec\fP is [\fIpool\-name\fP]/\fIimage\-name\fP
+\fIsnap\-spec\fP  is [\fIpool\-name\fP]/\fIimage\-name\fP@\fIsnap\-name\fP
 .fi
-.UNINDENT
-.UNINDENT
 .sp
-Thus an image name that contains a slash character (\(aq/\(aq) requires specifying the pool
-name explicitly.
+.sp
+The default for \fIpool\-name\fP is "rbd".  If an image name contains a slash
+character (\(aq/\(aq), \fIpool\-name\fP is required.
+.sp
+You may specify each name individually, using \-\-pool, \-\-image and \-\-snap
+options, but this is discouraged in favor of the above spec syntax.
 .SH STRIPING
 .sp
 RBD images are striped over many objects, which are then stored by the
@@ -420,6 +440,8 @@ used.
 Most of these options are useful mainly for debugging and benchmarking.  The
 default values are set in the kernel and may therefore depend on the version of
 the running kernel.
+.sp
+libceph (per client instance) options:
 .INDENT 0.0
 .IP \(bu 2
 fsid=aaaaaaaa\-bbbb\-cccc\-dddd\-eeeeeeeeeeee \- FSID that should be assumed by
@@ -435,13 +457,35 @@ crc \- Enable CRC32C checksumming for data writes (default).
 .IP \(bu 2
 nocrc \- Disable CRC32C checksumming for data writes.
 .IP \(bu 2
+cephx_require_signatures \- Require cephx message signing (since 3.19,
+default).
+.IP \(bu 2
+nocephx_require_signatures \- Don\(aqt require cephx message signing (since
+3.19).
+.IP \(bu 2
+tcp_nodelay \- Disable Nagle\(aqs algorithm on client sockets (since 4.0,
+default).
+.IP \(bu 2
+notcp_nodelay \- Enable Nagle\(aqs algorithm on client sockets (since 4.0).
+.IP \(bu 2
+mount_timeout=x \- A timeout on various steps in \fIrbd map\fP and \fIrbd unmap\fP
+sequences (default is 60 seconds).  In particular, since 4.2 this can be used
+to ensure that \fIrbd unmap\fP eventually times out when there is no network
+connection to a cluster.
+.IP \(bu 2
 osdkeepalive=x \- OSD keepalive timeout (default is 5 seconds).
 .IP \(bu 2
 osd_idle_ttl=x \- OSD idle TTL (default is 60 seconds).
+.UNINDENT
+.sp
+Mapping (per block device) options:
+.INDENT 0.0
 .IP \(bu 2
 rw \- Map the image read\-write (default).
 .IP \(bu 2
 ro \- Map the image read\-only.  Equivalent to \-\-read\-only.
+.IP \(bu 2
+queue_depth=x \- queue depth (since 4.2, default is 128 requests).
 .UNINDENT
 .SH EXAMPLES
 .sp
@@ -451,18 +495,6 @@ To create a new rbd image that is 100 GB:
 .sp
 .nf
 .ft C
-rbd \-p mypool create myimage \-\-size 102400
-.ft P
-.fi
-.UNINDENT
-.UNINDENT
-.sp
-or alternatively:
-.INDENT 0.0
-.INDENT 3.5
-.sp
-.nf
-.ft C
 rbd create mypool/myimage \-\-size 102400
 .ft P
 .fi
@@ -572,7 +604,7 @@ To create an image and a clone from it:
 .nf
 .ft C
 rbd import \-\-image\-format 2 image mypool/parent
-rbd snap create \-\-snap snapname mypool/parent
+rbd snap create mypool/parent at snap
 rbd snap protect mypool/parent at snap
 rbd clone mypool/parent at snap otherpool/child
 .ft P
@@ -586,7 +618,7 @@ To create an image with a smaller stripe_unit (to better distribute small writes
 .sp
 .nf
 .ft C
-rbd \-p mypool create myimage \-\-size 102400 \-\-stripe\-unit 65536 \-\-stripe\-count 16
+rbd create mypool/myimage \-\-size 102400 \-\-stripe\-unit 65536B \-\-stripe\-count 16
 .ft P
 .fi
 .UNINDENT
diff --git a/qa/workunits/erasure-code/encode-decode-non-regression.sh b/qa/workunits/erasure-code/encode-decode-non-regression.sh
new file mode 100755
index 0000000..539b912
--- /dev/null
+++ b/qa/workunits/erasure-code/encode-decode-non-regression.sh
@@ -0,0 +1,38 @@
+#!/bin/bash -ex
+#
+# Copyright (C) 2014 Red Hat <contact at redhat.com>
+#
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+: ${CORPUS:=https://github.com/ceph/ceph-erasure-code-corpus.git}
+: ${DIRECTORY:=../ceph-erasure-code-corpus}
+
+# when running from sources, the current directory must have precedence
+export PATH=:$PATH
+
+if ! test -d $DIRECTORY ; then
+    git clone $CORPUS $DIRECTORY
+fi
+
+my_version=v$(ceph --version | cut -f3 -d ' ')
+
+all_versions=$((ls -d $DIRECTORY/v* ; echo $DIRECTORY/$my_version ) | sort)
+
+for version in $all_versions ; do
+    if test -d $version ; then
+        $version/non-regression.sh
+    fi
+    if test $version = $DIRECTORY/$my_version ; then
+        break
+    fi
+done
diff --git a/selinux/Makefile.am b/selinux/Makefile.am
new file mode 100644
index 0000000..280e7ec
--- /dev/null
+++ b/selinux/Makefile.am
@@ -0,0 +1,22 @@
+EXTRA_DIST = \
+	ceph.te \
+	ceph.fc \
+	ceph.if
+
+SELINUXROOT = $(DESTDIR)$(datadir)/selinux
+
+ceph.pp: ceph.te ceph.fc ceph.if
+	$(MAKE) -j1 -f $(datadir)/selinux/devel/Makefile ceph.pp
+
+if ENABLE_SERVER
+if WITH_SELINUX
+all-local: ceph.pp
+
+install-exec-local:
+	$(INSTALL) -d $(SELINUXROOT)/packages
+	$(INSTALL) -m 644 ceph.pp $(SELINUXROOT)/packages/
+	$(INSTALL) -d $(SELINUXROOT)/devel/include/contrib
+	$(INSTALL) -m 644 ceph.if $(SELINUXROOT)/devel/include/contrib/
+
+endif
+endif
diff --git a/selinux/Makefile.in b/selinux/Makefile.in
new file mode 100644
index 0000000..0192ca9
--- /dev/null
+++ b/selinux/Makefile.in
@@ -0,0 +1,539 @@
+# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+ at SET_MAKE@
+VPATH = @srcdir@
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = selinux
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
+	$(top_srcdir)/m4/ac_prog_jar.m4 \
+	$(top_srcdir)/m4/ac_prog_javac.m4 \
+	$(top_srcdir)/m4/ac_prog_javac_works.m4 \
+	$(top_srcdir)/m4/ac_prog_javah.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 $(top_srcdir)/m4/ax_arm.m4 \
+	$(top_srcdir)/m4/ax_c_pretty_func.m4 \
+	$(top_srcdir)/m4/ax_c_var_func.m4 \
+	$(top_srcdir)/m4/ax_check_compile_flag.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
+	$(top_srcdir)/m4/ax_cxx_static_cast.m4 \
+	$(top_srcdir)/m4/ax_intel.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/src/acconfig.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+AM_V_P = $(am__v_P_ at AM_V@)
+am__v_P_ = $(am__v_P_ at AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_ at AM_V@)
+am__v_GEN_ = $(am__v_GEN_ at AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_ at AM_V@)
+am__v_at_ = $(am__v_at_ at AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_CXXFLAGS = @AM_CXXFLAGS@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
+ARM_FLAGS = @ARM_FLAGS@
+ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CRYPTOPP_CFLAGS = @CRYPTOPP_CFLAGS@
+CRYPTOPP_LIBS = @CRYPTOPP_LIBS@
+CRYPTO_CFLAGS = @CRYPTO_CFLAGS@
+CRYPTO_LIBS = @CRYPTO_LIBS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+EXTRA_CLASSPATH_JAR = @EXTRA_CLASSPATH_JAR@
+FGREP = @FGREP@
+GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
+GIT_CHECK = @GIT_CHECK@
+GREP = @GREP@
+HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+INTEL_FLAGS = @INTEL_FLAGS@
+INTEL_PCLMUL_FLAGS = @INTEL_PCLMUL_FLAGS@
+INTEL_SSE2_FLAGS = @INTEL_SSE2_FLAGS@
+INTEL_SSE3_FLAGS = @INTEL_SSE3_FLAGS@
+INTEL_SSE4_1_FLAGS = @INTEL_SSE4_1_FLAGS@
+INTEL_SSE4_2_FLAGS = @INTEL_SSE4_2_FLAGS@
+INTEL_SSE_FLAGS = @INTEL_SSE_FLAGS@
+INTEL_SSSE3_FLAGS = @INTEL_SSSE3_FLAGS@
+JAR = @JAR@
+JAVAC = @JAVAC@
+JAVAH = @JAVAH@
+JDK_CPPFLAGS = @JDK_CPPFLAGS@
+KEYUTILS_LIB = @KEYUTILS_LIB@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
+LIBEDIT_LIBS = @LIBEDIT_LIBS@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
+LIBJEMALLOC = @LIBJEMALLOC@
+LIBOBJS = @LIBOBJS@
+LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
+LIBROCKSDB_LIBS = @LIBROCKSDB_LIBS@
+LIBS = @LIBS@
+LIBTCMALLOC = @LIBTCMALLOC@
+LIBTOOL = @LIBTOOL@
+LIBZFS_CFLAGS = @LIBZFS_CFLAGS@
+LIBZFS_LIBS = @LIBZFS_LIBS@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LTTNG_GEN_TP_CHECK = @LTTNG_GEN_TP_CHECK@
+LTTNG_GEN_TP_PROG = @LTTNG_GEN_TP_PROG@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+NSS_CFLAGS = @NSS_CFLAGS@
+NSS_LIBS = @NSS_LIBS@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PKG_CONFIG = @PKG_CONFIG@
+PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
+PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+PYTHON = @PYTHON@
+PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@
+PYTHON_PLATFORM = @PYTHON_PLATFORM@
+PYTHON_PREFIX = @PYTHON_PREFIX@
+PYTHON_VERSION = @PYTHON_VERSION@
+RANLIB = @RANLIB@
+RESOLV_LIBS = @RESOLV_LIBS@
+RPM_RELEASE = @RPM_RELEASE@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
+STRIP = @STRIP@
+VERSION = @VERSION@
+WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
+WARN_IGNORED_QUALIFIERS = @WARN_IGNORED_QUALIFIERS@
+WARN_TYPE_LIMITS = @WARN_TYPE_LIMITS@
+XIO_LIBS = @XIO_LIBS@
+YASM_CHECK = @YASM_CHECK@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+pkgpyexecdir = @pkgpyexecdir@
+pkgpythondir = @pkgpythondir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+pyexecdir = @pyexecdir@
+pythondir = @pythondir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+subdirs = @subdirs@
+sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
+EXTRA_DIST = \
+	ceph.te \
+	ceph.fc \
+	ceph.if
+
+SELINUXROOT = $(DESTDIR)$(datadir)/selinux
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign selinux/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign selinux/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure:  $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags TAGS:
+
+ctags CTAGS:
+
+cscope cscopelist:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+ at ENABLE_SERVER_FALSE@all-local:
+ at WITH_SELINUX_FALSE@all-local:
+all-am: Makefile all-local
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+ at ENABLE_SERVER_FALSE@install-exec-local:
+ at WITH_SELINUX_FALSE@install-exec-local:
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-exec-local
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am all-local check check-am clean clean-generic \
+	clean-libtool cscopelist-am ctags-am distclean \
+	distclean-generic distclean-libtool distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-exec-local install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags-am uninstall uninstall-am
+
+
+ceph.pp: ceph.te ceph.fc ceph.if
+	$(MAKE) -j1 -f $(datadir)/selinux/devel/Makefile ceph.pp
+
+ at ENABLE_SERVER_TRUE@@WITH_SELINUX_TRUE at all-local: ceph.pp
+
+ at ENABLE_SERVER_TRUE@@WITH_SELINUX_TRUE at install-exec-local:
+ at ENABLE_SERVER_TRUE@@WITH_SELINUX_TRUE@	$(INSTALL) -d $(SELINUXROOT)/packages
+ at ENABLE_SERVER_TRUE@@WITH_SELINUX_TRUE@	$(INSTALL) -m 644 ceph.pp $(SELINUXROOT)/packages/
+ at ENABLE_SERVER_TRUE@@WITH_SELINUX_TRUE@	$(INSTALL) -d $(SELINUXROOT)/devel/include/contrib
+ at ENABLE_SERVER_TRUE@@WITH_SELINUX_TRUE@	$(INSTALL) -m 644 ceph.if $(SELINUXROOT)/devel/include/contrib/
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/selinux/ceph.fc b/selinux/ceph.fc
new file mode 100644
index 0000000..6b8d062
--- /dev/null
+++ b/selinux/ceph.fc
@@ -0,0 +1,13 @@
+/etc/rc\.d/init\.d/ceph		--	gen_context(system_u:object_r:ceph_initrc_exec_t,s0)
+/etc/rc\.d/init\.d/radosgw	--	gen_context(system_u:object_r:ceph_initrc_exec_t,s0)
+
+/usr/bin/ceph-mon		--	gen_context(system_u:object_r:ceph_exec_t,s0)
+/usr/bin/ceph-mds		--	gen_context(system_u:object_r:ceph_exec_t,s0)
+/usr/bin/ceph-osd		--	gen_context(system_u:object_r:ceph_exec_t,s0)
+/usr/bin/radosgw		--	gen_context(system_u:object_r:ceph_exec_t,s0)
+
+/var/lib/ceph(/.*)?		gen_context(system_u:object_r:ceph_var_lib_t,s0)
+
+/var/log/ceph(/.*)?		gen_context(system_u:object_r:ceph_log_t,s0)
+
+/var/run/ceph(/.*)?		gen_context(system_u:object_r:ceph_var_run_t,s0)
diff --git a/selinux/ceph.if b/selinux/ceph.if
new file mode 100644
index 0000000..ed747a8
--- /dev/null
+++ b/selinux/ceph.if
@@ -0,0 +1,265 @@
+
+## <summary>policy for ceph</summary>
+
+########################################
+## <summary>
+##	Execute ceph_exec_t in the ceph domain.
+## </summary>
+## <param name="domain">
+## <summary>
+##	Domain allowed to transition.
+## </summary>
+## </param>
+#
+interface(`ceph_domtrans',`
+	gen_require(`
+		type ceph_t, ceph_exec_t;
+	')
+
+	corecmd_search_bin($1)
+	domtrans_pattern($1, ceph_exec_t, ceph_t)
+')
+
+######################################
+## <summary>
+##	Execute ceph in the caller domain.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_exec',`
+	gen_require(`
+		type ceph_exec_t;
+	')
+
+	corecmd_search_bin($1)
+	can_exec($1, ceph_exec_t)
+')
+
+########################################
+## <summary>
+##	Execute ceph server in the ceph domain.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_initrc_domtrans',`
+	gen_require(`
+		type ceph_initrc_exec_t;
+	')
+
+	init_labeled_script_domtrans($1, ceph_initrc_exec_t)
+')
+########################################
+## <summary>
+##	Read ceph's log files.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+## <rolecap/>
+#
+interface(`ceph_read_log',`
+	gen_require(`
+		type ceph_log_t;
+	')
+
+	logging_search_logs($1)
+	read_files_pattern($1, ceph_log_t, ceph_log_t)
+')
+
+########################################
+## <summary>
+##	Append to ceph log files.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_append_log',`
+	gen_require(`
+		type ceph_log_t;
+	')
+
+	logging_search_logs($1)
+	append_files_pattern($1, ceph_log_t, ceph_log_t)
+')
+
+########################################
+## <summary>
+##	Manage ceph log files
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_manage_log',`
+	gen_require(`
+		type ceph_log_t;
+	')
+
+	logging_search_logs($1)
+	manage_dirs_pattern($1, ceph_log_t, ceph_log_t)
+	manage_files_pattern($1, ceph_log_t, ceph_log_t)
+	manage_lnk_files_pattern($1, ceph_log_t, ceph_log_t)
+')
+
+########################################
+## <summary>
+##	Search ceph lib directories.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_search_lib',`
+	gen_require(`
+		type ceph_var_lib_t;
+	')
+
+	allow $1 ceph_var_lib_t:dir search_dir_perms;
+	files_search_var_lib($1)
+')
+
+########################################
+## <summary>
+##	Read ceph lib files.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_read_lib_files',`
+	gen_require(`
+		type ceph_var_lib_t;
+	')
+
+	files_search_var_lib($1)
+	read_files_pattern($1, ceph_var_lib_t, ceph_var_lib_t)
+')
+
+########################################
+## <summary>
+##	Manage ceph lib files.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_manage_lib_files',`
+	gen_require(`
+		type ceph_var_lib_t;
+	')
+
+	files_search_var_lib($1)
+	manage_files_pattern($1, ceph_var_lib_t, ceph_var_lib_t)
+')
+
+########################################
+## <summary>
+##	Manage ceph lib directories.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_manage_lib_dirs',`
+	gen_require(`
+		type ceph_var_lib_t;
+	')
+
+	files_search_var_lib($1)
+	manage_dirs_pattern($1, ceph_var_lib_t, ceph_var_lib_t)
+')
+
+########################################
+## <summary>
+##	Read ceph PID files.
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+#
+interface(`ceph_read_pid_files',`
+	gen_require(`
+		type ceph_var_run_t;
+	')
+
+	files_search_pids($1)
+	read_files_pattern($1, ceph_var_run_t, ceph_var_run_t)
+')
+
+
+########################################
+## <summary>
+##	All of the rules required to administrate
+##	an ceph environment
+## </summary>
+## <param name="domain">
+##	<summary>
+##	Domain allowed access.
+##	</summary>
+## </param>
+## <param name="role">
+##	<summary>
+##	Role allowed access.
+##	</summary>
+## </param>
+## <rolecap/>
+#
+interface(`ceph_admin',`
+	gen_require(`
+		type ceph_t;
+		type ceph_initrc_exec_t;
+		type ceph_log_t;
+		type ceph_var_lib_t;
+		type ceph_var_run_t;
+	')
+
+	allow $1 ceph_t:process { signal_perms };
+	ps_process_pattern($1, ceph_t)
+
+    tunable_policy(`deny_ptrace',`',`
+        allow $1 ceph_t:process ptrace;
+    ')
+
+	ceph_initrc_domtrans($1)
+	domain_system_change_exemption($1)
+	role_transition $2 ceph_initrc_exec_t system_r;
+	allow $2 system_r;
+
+	logging_search_logs($1)
+	admin_pattern($1, ceph_log_t)
+
+	files_search_var_lib($1)
+	admin_pattern($1, ceph_var_lib_t)
+
+	files_search_pids($1)
+	admin_pattern($1, ceph_var_run_t)
+	optional_policy(`
+		systemd_passwd_agent_exec($1)
+		systemd_read_fifo_file_passwd_run($1)
+	')
+')
diff --git a/selinux/ceph.te b/selinux/ceph.te
new file mode 100644
index 0000000..a215df8
--- /dev/null
+++ b/selinux/ceph.te
@@ -0,0 +1,111 @@
+policy_module(ceph, 1.1.0)
+
+require {
+	type sysfs_t;
+	type var_run_t;
+	type random_device_t;
+	type urandom_device_t;
+        type setfiles_t;
+	class sock_file unlink;
+	class lnk_file read;
+	class dir read;
+	class file { getattr read open };
+}
+
+########################################
+#
+# Declarations
+#
+
+type ceph_t;
+type ceph_exec_t;
+init_daemon_domain(ceph_t, ceph_exec_t)
+
+permissive ceph_t;
+
+type ceph_initrc_exec_t;
+init_script_file(ceph_initrc_exec_t)
+
+type ceph_log_t;
+logging_log_file(ceph_log_t)
+
+type ceph_var_lib_t;
+files_type(ceph_var_lib_t)
+
+type ceph_var_run_t;
+files_pid_file(ceph_var_run_t)
+
+########################################
+#
+# ceph local policy
+#
+
+allow ceph_t self:process { signal_perms };
+allow ceph_t self:fifo_file rw_fifo_file_perms;
+allow ceph_t self:unix_stream_socket create_stream_socket_perms;
+allow ceph_t self:capability { setuid setgid };
+
+manage_dirs_pattern(ceph_t, ceph_log_t, ceph_log_t)
+manage_files_pattern(ceph_t, ceph_log_t, ceph_log_t)
+manage_lnk_files_pattern(ceph_t, ceph_log_t, ceph_log_t)
+
+manage_dirs_pattern(ceph_t, ceph_var_lib_t, ceph_var_lib_t)
+manage_files_pattern(ceph_t, ceph_var_lib_t, ceph_var_lib_t)
+manage_lnk_files_pattern(ceph_t, ceph_var_lib_t, ceph_var_lib_t)
+
+manage_dirs_pattern(ceph_t, ceph_var_run_t, ceph_var_run_t)
+manage_files_pattern(ceph_t, ceph_var_run_t, ceph_var_run_t)
+manage_lnk_files_pattern(ceph_t, ceph_var_run_t, ceph_var_run_t)
+
+kernel_read_system_state(ceph_t)
+
+corenet_all_recvfrom_unlabeled(ceph_t)
+corenet_all_recvfrom_netlabel(ceph_t)
+corenet_udp_sendrecv_generic_if(ceph_t)
+corenet_udp_sendrecv_generic_node(ceph_t)
+corenet_udp_bind_generic_node(ceph_t)
+corenet_tcp_bind_generic_node(ceph_t)
+
+corenet_sendrecv_cyphesis_server_packets(ceph_t)
+corenet_tcp_bind_cyphesis_port(ceph_t)
+corenet_tcp_sendrecv_cyphesis_port(ceph_t)
+
+corecmd_exec_bin(ceph_t)
+corecmd_exec_shell(ceph_t)
+
+dev_read_urand(ceph_t)
+
+fs_getattr_all_fs(ceph_t)
+
+auth_use_nsswitch(ceph_t)
+
+logging_send_syslog_msg(ceph_t)
+
+sysnet_dns_name_resolve(ceph_t)
+
+# basis for future security review
+allow ceph_t ceph_var_run_t:sock_file { create unlink write };
+allow ceph_t self:capability sys_rawio;
+
+allow ceph_t self:tcp_socket { accept listen };
+corenet_tcp_connect_cyphesis_port(ceph_t)
+corenet_tcp_connect_generic_port(ceph_t)
+files_list_tmp(ceph_t)
+fstools_exec(ceph_t)
+nis_use_ypbind_uncond(ceph_t)
+storage_raw_rw_fixed_disk(ceph_t)
+
+allow ceph_t sysfs_t:dir read;
+allow ceph_t sysfs_t:file { read getattr open };
+allow ceph_t sysfs_t:lnk_file read;
+
+allow ceph_t random_device_t:chr_file getattr;
+allow ceph_t urandom_device_t:chr_file getattr;
+allow ceph_t self:process setpgid;
+allow ceph_t var_run_t:dir { write create add_name };
+allow ceph_t var_run_t:file { write create open getattr };
+
+fsadm_manage_pid(ceph_t)
+
+#============= setfiles_t ==============
+allow setfiles_t ceph_var_lib_t:file write;
diff --git a/src/.git_version b/src/.git_version
index 31baffc..3717ae8 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-9764da52395923e0b32908d83a9f7304401fee43
-v0.94.5
+bb2ecea240f3a1d525bcb35670cb07bd1f0ca299
+v9.2.0
diff --git a/src/Makefile-client.am b/src/Makefile-client.am
index 587bdf3..dcd1835 100644
--- a/src/Makefile-client.am
+++ b/src/Makefile-client.am
@@ -5,7 +5,9 @@ bin_SCRIPTS += \
 	ceph \
 	ceph-post-file
 
-python_PYTHON += pybind/ceph_argparse.py
+python_PYTHON += \
+	pybind/ceph_argparse.py \
+	pybind/ceph_daemon.py
 
 ceph_syn_SOURCES = ceph_syn.cc
 ceph_syn_SOURCES += client/SyntheticClient.cc # uses g_conf.. needs cleanup
@@ -17,11 +19,10 @@ bin_PROGRAMS += ceph-syn
 
 ceph: ceph.in ./ceph_ver.h Makefile
 	rm -f $@ $@.tmp
-	echo "#!/usr/bin/env python" >$@.tmp
-	grep "#define CEPH_GIT_NICE_VER" ./ceph_ver.h | \
-		sed -e 's/#define \(.*VER\) /\1=/' >>$@.tmp
-	grep "#define CEPH_GIT_VER" ./ceph_ver.h | \
-	  sed -e 's/#define \(.*VER\) /\1=/' -e 's/=\(.*\)$$/="\1"/' >>$@.tmp
+	cp $@.in $@.tmp
+	sed -ie "s|@PYTHON_EXECUTABLE@|/usr/bin/env python|" $@.tmp
+	grep CEPH_GIT_NICE_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_NICE_VER@/{}/g" $@.tmp
+	grep CEPH_GIT_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_VER@/{}/g" $@.tmp
 	cat $(srcdir)/$@.in >>$@.tmp
 	chmod a+x $@.tmp
 	chmod a-w $@.tmp
@@ -72,9 +73,12 @@ ceph_fuse_SOURCES = ceph_fuse.cc
 ceph_fuse_LDADD = $(LIBCLIENT_FUSE) $(CEPH_GLOBAL)
 bin_PROGRAMS += ceph-fuse
 
+if WITH_RBD
 rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.cc
-rbd_fuse_LDADD = -lfuse $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
+rbd_fuse_CXXFLAGS = $(AM_CXXFLAGS) $(LIBFUSE_CFLAGS)
+rbd_fuse_LDADD = $(LIBFUSE_LIBS) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
 bin_PROGRAMS += rbd-fuse
+endif # WITH_RBD
 endif # WITH_FUSE
 
 
@@ -90,6 +94,9 @@ python_PYTHON += pybind/cephfs.py
 libcephfs_la_SOURCES = libcephfs.cc
 libcephfs_la_LIBADD = $(LIBCLIENT) $(LIBCOMMON) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
 libcephfs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*'
+if LINUX
+libcephfs_la_LDFLAGS += -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a'
+endif # LINUX
 lib_LTLIBRARIES += libcephfs.la
 
 # jni library (java source is in src/java)
@@ -100,7 +107,7 @@ libcephfs_jni_la_SOURCES = \
 	java/native/ScopedLocalRef.h \
 	java/native/JniConstants.cpp \
 	java/native/JniConstants.h
-libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(EXTRALIBS)
+libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(LIBCOMMON) $(EXTRALIBS)
 libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS)
 libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
 lib_LTLIBRARIES += libcephfs_jni.la
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index b9374ad..63ea49f 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -5,6 +5,7 @@ DIST_SUBDIRS =
 BUILT_SOURCES =
 EXTRA_DIST =
 CLEANFILES =
+dist_noinst_DATA =
 
 noinst_HEADERS =
 bin_PROGRAMS =
@@ -29,8 +30,9 @@ ceph_sbindir = $(sbindir)
 # certain things go straight into /sbin, though!
 su_sbindir = /sbin
 
-# C/C++ tests to build will be appended to this
-check_PROGRAMS =
+# C/C++ tests to build and executed will be appended to this
+check_TESTPROGRAMS =
+check_PROGRAMS = $(check_TESTPROGRAMS)
 
 # tests scripts will be appended to this
 check_SCRIPTS =
@@ -50,6 +52,31 @@ endif
 ##################################
 ## automake environment
 
+HARDENING_CFLAGS = \
+                   -O2 \
+                   -g \
+                   -pipe \
+                   -Wall \
+                   -Wp,-D_FORTIFY_SOURCE=2 \
+                   -fexceptions \
+                   --param=ssp-buffer-size=4 \
+                   -fPIE
+
+SET_STACK_PROTECTOR_STRONG = $(shell expr `gcc -dumpversion` \>= 4.9)
+
+		ifeq ($(SET_STACK_PROTECTOR_STRONG),1)
+				HARDENING_CFLAGS += -fstack-protector-strong
+		else
+				HARDENING_CFLAGS += -fstack-protector
+		endif
+
+
+HARDENING_LDFLAGS =  \
+                     -pie \
+                     -Wl,-z,relro \
+                     -Wl,-z,now
+
+
 AM_COMMON_CPPFLAGS = \
 	-D__CEPH__ \
 	-D_FILE_OFFSET_BITS=64 \
@@ -59,7 +86,7 @@ AM_COMMON_CPPFLAGS = \
 	-D_GNU_SOURCE \
 	-DCEPH_LIBDIR=\"${libdir}\" \
 	-DCEPH_PKGLIBDIR=\"${pkglibdir}\" \
-	-DGTEST_HAS_TR1_TUPLE=0
+	-DGTEST_USE_OWN_TR1_TUPLE=0
 
 AM_COMMON_CFLAGS = \
 	-Wall \
@@ -74,14 +101,14 @@ if !CLANG
 	AM_COMMON_CFLAGS += -rdynamic
 endif
 
-AM_CFLAGS = $(AM_COMMON_CFLAGS)
+AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS)
 AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
 AM_CXXFLAGS = \
 	@AM_CXXFLAGS@ \
 	$(AM_COMMON_CFLAGS) \
 	-ftemplate-depth-1024 \
 	-Wnon-virtual-dtor \
-	-Wno-invalid-offsetof
+	-Wno-invalid-offsetof $(HARDENING_CFLAGS)
 if !CLANG
 	AM_CXXFLAGS += -Wstrict-null-sentinel
 endif
@@ -96,7 +123,7 @@ endif
 # http://sigquit.wordpress.com/2011/02/16/why-asneeded-doesnt-work-as-expected-for-your-libraries-on-your-autotools-project/
 AM_LDFLAGS =
 if LINUX
-AM_LDFLAGS += -Wl,--as-needed
+AM_LDFLAGS += -Wl,--as-needed $(HARDENING_LDFLAGS)
 endif
 
 if USE_BOOST_SPIRIT_OLD_HDR
@@ -119,7 +146,7 @@ AM_CCASFLAGS = -f elf64
 #####################
 ## library definitions and dependencies
 
-EXTRALIBS = -luuid -lm
+EXTRALIBS = -lm
 if FREEBSD
 EXTRALIBS += -lexecinfo
 endif # FREEBSD
@@ -140,6 +167,7 @@ LIBPERFGLUE = libperfglue.la
 LIBAUTH = libauth.la
 LIBMSG = libmsg.la
 LIBCRUSH = libcrush.la
+LIBCOMPRESSOR = libcompressor.la -lsnappy
 LIBJSON_SPIRIT = libjson_spirit.la
 LIBLOG = liblog.la
 LIBOS = libos.la
@@ -161,10 +189,6 @@ LIBRBD_TYPES = librbd_types.la
 LIBKRBD = libkrbd.la
 LIBCEPHFS = libcephfs.la
 LIBERASURE_CODE = liberasure_code.la
-LIBOSD_TP = tracing/libosd_tp.la
-LIBRADOS_TP = tracing/librados_tp.la
-LIBRBD_TP = tracing/librbd_tp.la
-LIBOS_TP = tracing/libos_tp.la
 
 if WITH_LIBAIO
 LIBOS += -laio
diff --git a/src/Makefile-rocksdb.am b/src/Makefile-rocksdb.am
index 7904c0e..9d45f48 100644
--- a/src/Makefile-rocksdb.am
+++ b/src/Makefile-rocksdb.am
@@ -2,371 +2,462 @@ if WITH_SLIBROCKSDB
   SUBDIRS += rocksdb
 else
   EXTRA_DIST += \
-        rocksdb/.arcconfig \
-        rocksdb/.clang-format \
-        rocksdb/.gitignore \
+	rocksdb/.gitignore \
         rocksdb/CONTRIBUTING.md \
         rocksdb/HISTORY.md \
         rocksdb/INSTALL.md \
         rocksdb/LICENSE \
         rocksdb/Makefile.am \
         rocksdb/PATENTS \
-        rocksdb/README \
+        rocksdb/README.md \
         rocksdb/ROCKSDB_LITE.md \
-        rocksdb/build_tools/build_detect_platform \
-        rocksdb/build_tools/build_detect_version \
-        rocksdb/build_tools/fbcode.clang31.sh \
-        rocksdb/build_tools/fbcode.gcc471.sh \
-        rocksdb/build_tools/fbcode.gcc481.sh \
-        rocksdb/build_tools/format-diff.sh \
-        rocksdb/build_tools/mac-install-gflags.sh \
-        rocksdb/build_tools/make_new_version.sh \
-        rocksdb/build_tools/regression_build_test.sh \
-        rocksdb/build_tools/valgrind_test.sh \
-        rocksdb/configure.ac \
-        rocksdb/coverage/coverage_test.sh \
-        rocksdb/coverage/parse_gcov_output.py \
-        rocksdb/db/builder.cc \
-        rocksdb/db/builder.h \
-        rocksdb/db/c.cc \
-        rocksdb/db/c_test.c \
-        rocksdb/db/column_family.cc \
-        rocksdb/db/column_family.h \
-        rocksdb/db/column_family_test.cc \
-        rocksdb/db/compaction.cc \
-        rocksdb/db/compaction.h \
-        rocksdb/db/compaction_picker.cc \
-        rocksdb/db/compaction_picker.h \
-        rocksdb/db/corruption_test.cc \
-        rocksdb/db/db_bench.cc \
-        rocksdb/db/db_filesnapshot.cc \
-        rocksdb/db/db_impl.cc \
-        rocksdb/db/db_impl.h \
-        rocksdb/db/db_impl_debug.cc \
-        rocksdb/db/db_impl_readonly.cc \
-        rocksdb/db/db_impl_readonly.h \
-        rocksdb/db/db_iter.cc \
-        rocksdb/db/db_iter.h \
-        rocksdb/db/db_stats_logger.cc \
-        rocksdb/db/db_test.cc \
-        rocksdb/db/dbformat.cc \
-        rocksdb/db/dbformat.h \
-        rocksdb/db/dbformat_test.cc \
-        rocksdb/db/deletefile_test.cc \
-        rocksdb/db/file_indexer.cc \
-        rocksdb/db/file_indexer.h \
-        rocksdb/db/file_indexer_test.cc \
-        rocksdb/db/filename.cc \
-        rocksdb/db/filename.h \
-        rocksdb/db/filename_test.cc \
-        rocksdb/db/internal_stats.cc \
-        rocksdb/db/internal_stats.h \
-        rocksdb/db/log_and_apply_bench.cc \
-        rocksdb/db/log_format.h \
-        rocksdb/db/log_reader.cc \
-        rocksdb/db/log_reader.h \
-        rocksdb/db/log_test.cc \
-        rocksdb/db/log_writer.cc \
-        rocksdb/db/log_writer.h \
-        rocksdb/db/memtable.cc \
-        rocksdb/db/memtable.h \
-        rocksdb/db/memtable_list.cc \
-        rocksdb/db/memtable_list.h \
-        rocksdb/db/merge_context.h \
-        rocksdb/db/merge_helper.cc \
-        rocksdb/db/merge_helper.h \
-        rocksdb/db/merge_operator.cc \
-        rocksdb/db/merge_test.cc \
-        rocksdb/db/perf_context_test.cc \
-        rocksdb/db/plain_table_db_test.cc \
-        rocksdb/db/prefix_test.cc \
-        rocksdb/db/repair.cc \
-        rocksdb/db/simple_table_db_test.cc \
-        rocksdb/db/skiplist.h \
-        rocksdb/db/skiplist_test.cc \
-        rocksdb/db/snapshot.h \
-        rocksdb/db/table_cache.cc \
-        rocksdb/db/table_cache.h \
-        rocksdb/db/table_properties_collector.cc \
-        rocksdb/db/table_properties_collector.h \
-        rocksdb/db/table_properties_collector_test.cc \
-        rocksdb/db/tailing_iter.cc \
-        rocksdb/db/tailing_iter.h \
-        rocksdb/db/transaction_log_impl.cc \
-        rocksdb/db/transaction_log_impl.h \
-        rocksdb/db/version_edit.cc \
-        rocksdb/db/version_edit.h \
-        rocksdb/db/version_edit_test.cc \
-        rocksdb/db/version_set.cc \
-        rocksdb/db/version_set.h \
-        rocksdb/db/version_set_test.cc \
-        rocksdb/db/write_batch.cc \
-        rocksdb/db/write_batch_internal.h \
-        rocksdb/db/write_batch_test.cc \
-        rocksdb/doc/doc.css \
-        rocksdb/doc/index.html \
-        rocksdb/doc/log_format.txt \
-        rocksdb/doc/rockslogo.jpg \
-        rocksdb/doc/rockslogo.png \
-        rocksdb/hdfs/README \
-        rocksdb/hdfs/env_hdfs.h \
-        rocksdb/hdfs/hdfs.h \
-        rocksdb/hdfs/libhdfs.a \
-        rocksdb/helpers/memenv/memenv.cc \
-        rocksdb/helpers/memenv/memenv_test.cc \
-        rocksdb/include/rocksdb/c.h \
-        rocksdb/include/rocksdb/cache.h \
-        rocksdb/include/rocksdb/compaction_filter.h \
-        rocksdb/include/rocksdb/comparator.h \
-        rocksdb/include/rocksdb/db.h \
-        rocksdb/include/rocksdb/env.h \
-        rocksdb/include/rocksdb/filter_policy.h \
-        rocksdb/include/rocksdb/flush_block_policy.h \
-        rocksdb/include/rocksdb/iterator.h \
-        rocksdb/include/rocksdb/ldb_tool.h \
-        rocksdb/include/rocksdb/memtablerep.h \
-        rocksdb/include/rocksdb/merge_operator.h \
-        rocksdb/include/rocksdb/options.h \
-        rocksdb/include/rocksdb/perf_context.h \
-        rocksdb/include/rocksdb/slice.h \
-        rocksdb/include/rocksdb/slice_transform.h \
-        rocksdb/include/rocksdb/statistics.h \
-        rocksdb/include/rocksdb/status.h \
-        rocksdb/include/rocksdb/table.h \
-        rocksdb/include/rocksdb/table_properties.h \
-        rocksdb/include/rocksdb/transaction_log.h \
-        rocksdb/include/rocksdb/types.h \
-        rocksdb/include/rocksdb/universal_compaction.h \
-        rocksdb/include/rocksdb/version.h \
-        rocksdb/include/rocksdb/write_batch.h \
-        rocksdb/include/utilities/backupable_db.h \
-        rocksdb/include/utilities/db_ttl.h \
-        rocksdb/include/utilities/geo_db.h \
-        rocksdb/include/utilities/stackable_db.h \
-        rocksdb/include/utilities/utility_db.h \
-        rocksdb/java/Makefile \
-        rocksdb/java/RocksDBSample.java \
-        rocksdb/java/jdb_bench.sh \
-        rocksdb/java/org/rocksdb/BackupableDB.java \
-        rocksdb/java/org/rocksdb/BackupableDBOptions.java \
-        rocksdb/java/org/rocksdb/BloomFilter.java \
-        rocksdb/java/org/rocksdb/Filter.java \
-        rocksdb/java/org/rocksdb/HashLinkedListMemTableConfig.java \
-        rocksdb/java/org/rocksdb/HashSkipListMemTableConfig.java \
-        rocksdb/java/org/rocksdb/HistogramData.java \
-        rocksdb/java/org/rocksdb/HistogramType.java \
-        rocksdb/java/org/rocksdb/Iterator.java \
-        rocksdb/java/org/rocksdb/MemTableConfig.java \
-        rocksdb/java/org/rocksdb/Options.java \
-        rocksdb/java/org/rocksdb/PlainTableConfig.java \
-        rocksdb/java/org/rocksdb/ReadOptions.java \
-        rocksdb/java/org/rocksdb/RocksDB.java \
-        rocksdb/java/org/rocksdb/RocksDBException.java \
-        rocksdb/java/org/rocksdb/RocksObject.java \
-        rocksdb/java/org/rocksdb/SkipListMemTableConfig.java \
-        rocksdb/java/org/rocksdb/Statistics.java \
-        rocksdb/java/org/rocksdb/TableFormatConfig.java \
-        rocksdb/java/org/rocksdb/TickerType.java \
-        rocksdb/java/org/rocksdb/VectorMemTableConfig.java \
-        rocksdb/java/org/rocksdb/WriteBatch.java \
-        rocksdb/java/org/rocksdb/WriteBatchTest.java \
-        rocksdb/java/org/rocksdb/WriteOptions.java \
-        rocksdb/java/org/rocksdb/benchmark/DbBenchmark.java \
-        rocksdb/java/org/rocksdb/test/BackupableDBTest.java \
-        rocksdb/java/org/rocksdb/test/OptionsTest.java \
-        rocksdb/java/org/rocksdb/test/ReadOptionsTest.java \
-        rocksdb/java/org/rocksdb/util/Environment.java \
-        rocksdb/java/org/rocksdb/util/SizeUnit.java \
-        rocksdb/java/rocksjni/backupablejni.cc \
-        rocksdb/java/rocksjni/filter.cc \
-        rocksdb/java/rocksjni/iterator.cc \
-        rocksdb/java/rocksjni/memtablejni.cc \
-        rocksdb/java/rocksjni/options.cc \
-        rocksdb/java/rocksjni/portal.h \
-        rocksdb/java/rocksjni/rocksjni.cc \
-        rocksdb/java/rocksjni/statistics.cc \
-        rocksdb/java/rocksjni/table.cc \
-        rocksdb/java/rocksjni/write_batch.cc \
-        rocksdb/linters/__phutil_library_init__.php \
-        rocksdb/linters/__phutil_library_map__.php \
-        rocksdb/linters/cpp_linter/ArcanistCpplintLinter.php \
-        rocksdb/linters/cpp_linter/FbcodeCppLinter.php \
-        rocksdb/linters/cpp_linter/PfffCppLinter.php \
-        rocksdb/linters/cpp_linter/cpplint.py \
-        rocksdb/linters/lint_engine/FacebookFbcodeLintEngine.php \
-        rocksdb/m4/libtool.m4 \
-        rocksdb/m4/ltoptions.m4 \
-        rocksdb/m4/ltsugar.m4 \
-        rocksdb/m4/ltversion.m4 \
-        rocksdb/m4/lt~obsolete.m4 \
-        rocksdb/port/README \
-        rocksdb/port/atomic_pointer.h \
-        rocksdb/port/likely.h \
-        rocksdb/port/port.h \
-        rocksdb/port/port_example.h \
-        rocksdb/port/port_posix.cc \
-        rocksdb/port/port_posix.h \
-        rocksdb/port/stack_trace.cc \
-        rocksdb/port/stack_trace.h \
-        rocksdb/port/win/stdint.h \
-        rocksdb/table/block.cc \
-        rocksdb/table/block.h \
-        rocksdb/table/block_based_table_builder.cc \
-        rocksdb/table/block_based_table_builder.h \
-        rocksdb/table/block_based_table_factory.cc \
-        rocksdb/table/block_based_table_factory.h \
-        rocksdb/table/block_based_table_reader.cc \
-        rocksdb/table/block_based_table_reader.h \
-        rocksdb/table/block_builder.cc \
-        rocksdb/table/block_builder.h \
-        rocksdb/table/block_hash_index.cc \
-        rocksdb/table/block_hash_index.h \
-        rocksdb/table/block_hash_index_test.cc \
-        rocksdb/table/block_test.cc \
-        rocksdb/table/filter_block.cc \
-        rocksdb/table/filter_block.h \
-        rocksdb/table/filter_block_test.cc \
-        rocksdb/table/flush_block_policy.cc \
-        rocksdb/table/format.cc \
-        rocksdb/table/format.h \
-        rocksdb/table/iter_heap.h \
-        rocksdb/table/iterator.cc \
-        rocksdb/table/iterator_wrapper.h \
-        rocksdb/table/merger.cc \
-        rocksdb/table/merger.h \
-        rocksdb/table/meta_blocks.cc \
-        rocksdb/table/meta_blocks.h \
-        rocksdb/table/plain_table_builder.cc \
-        rocksdb/table/plain_table_builder.h \
-        rocksdb/table/plain_table_factory.cc \
-        rocksdb/table/plain_table_factory.h \
-        rocksdb/table/plain_table_reader.cc \
-        rocksdb/table/plain_table_reader.h \
-        rocksdb/table/table_builder.h \
-        rocksdb/table/table_properties.cc \
-        rocksdb/table/table_reader.h \
-        rocksdb/table/table_reader_bench.cc \
-        rocksdb/table/table_test.cc \
-        rocksdb/table/two_level_iterator.cc \
-        rocksdb/table/two_level_iterator.h \
-        rocksdb/tools/auto_sanity_test.sh \
-        rocksdb/tools/blob_store_bench.cc \
-        rocksdb/tools/db_crashtest.py \
-        rocksdb/tools/db_crashtest2.py \
-        rocksdb/tools/db_repl_stress.cc \
-        rocksdb/tools/db_sanity_test.cc \
-        rocksdb/tools/db_stress.cc \
-        rocksdb/tools/ldb.cc \
-        rocksdb/tools/ldb_test.py \
-        rocksdb/tools/reduce_levels_test.cc \
-        rocksdb/tools/sst_dump.cc \
-        rocksdb/util/arena.cc \
-        rocksdb/util/arena.h \
-        rocksdb/util/arena_test.cc \
-        rocksdb/util/auto_roll_logger.cc \
-        rocksdb/util/auto_roll_logger.h \
-        rocksdb/util/auto_roll_logger_test.cc \
-        rocksdb/util/autovector.h \
-        rocksdb/util/autovector_test.cc \
-        rocksdb/util/benchharness.cc \
-        rocksdb/util/benchharness.h \
-        rocksdb/util/benchharness_test.cc \
-        rocksdb/util/blob_store.cc \
-        rocksdb/util/blob_store.h \
-        rocksdb/util/blob_store_test.cc \
-        rocksdb/util/bloom.cc \
-        rocksdb/util/bloom_test.cc \
-        rocksdb/util/build_version.h \
-        rocksdb/util/cache.cc \
-        rocksdb/util/cache_test.cc \
-        rocksdb/util/coding.cc \
-        rocksdb/util/coding.h \
-        rocksdb/util/coding_test.cc \
-        rocksdb/util/comparator.cc \
-        rocksdb/util/crc32c.cc \
-        rocksdb/util/crc32c.h \
-        rocksdb/util/crc32c_test.cc \
-        rocksdb/util/dynamic_bloom.cc \
-        rocksdb/util/dynamic_bloom.h \
-        rocksdb/util/dynamic_bloom_test.cc \
-        rocksdb/util/env.cc \
-        rocksdb/util/env_hdfs.cc \
-        rocksdb/util/env_posix.cc \
-        rocksdb/util/env_test.cc \
-        rocksdb/util/filelock_test.cc \
-        rocksdb/util/filter_policy.cc \
-        rocksdb/util/hash.cc \
-        rocksdb/util/hash.h \
-        rocksdb/util/hash_cuckoo_rep.cc \
-        rocksdb/util/hash_cuckoo_rep.h \
-        rocksdb/util/hash_linklist_rep.cc \
-        rocksdb/util/hash_linklist_rep.h \
-        rocksdb/util/hash_skiplist_rep.cc \
-        rocksdb/util/hash_skiplist_rep.h \
-        rocksdb/util/histogram.cc \
-        rocksdb/util/histogram.h \
-        rocksdb/util/histogram_test.cc \
-        rocksdb/util/ldb_cmd.cc \
-        rocksdb/util/ldb_cmd.h \
-        rocksdb/util/ldb_cmd_execute_result.h \
-        rocksdb/util/ldb_tool.cc \
-        rocksdb/util/log_buffer.cc \
-        rocksdb/util/log_buffer.h \
-        rocksdb/util/log_write_bench.cc \
-        rocksdb/util/logging.cc \
-        rocksdb/util/logging.h \
-        rocksdb/util/manual_compaction_test.cc \
-        rocksdb/util/murmurhash.cc \
-        rocksdb/util/murmurhash.h \
-        rocksdb/util/mutexlock.h \
-        rocksdb/util/options.cc \
-        rocksdb/util/perf_context.cc \
-        rocksdb/util/perf_context_imp.h \
-        rocksdb/util/posix_logger.h \
-        rocksdb/util/random.h \
-        rocksdb/util/signal_test.cc \
-        rocksdb/util/skiplistrep.cc \
-        rocksdb/util/slice.cc \
-        rocksdb/util/statistics.cc \
-        rocksdb/util/statistics.h \
-        rocksdb/util/stats_logger.h \
-        rocksdb/util/status.cc \
-        rocksdb/util/stl_wrappers.h \
-        rocksdb/util/stop_watch.h \
-        rocksdb/util/string_util.cc \
-        rocksdb/util/string_util.h \
-        rocksdb/util/sync_point.cc \
-        rocksdb/util/sync_point.h \
-        rocksdb/util/testharness.cc \
-        rocksdb/util/testharness.h \
-        rocksdb/util/testutil.cc \
-        rocksdb/util/testutil.h \
-        rocksdb/util/thread_local.cc \
-        rocksdb/util/thread_local.h \
-        rocksdb/util/thread_local_test.cc \
-        rocksdb/util/vectorrep.cc \
-        rocksdb/util/xxhash.cc \
-        rocksdb/util/xxhash.h \
-        rocksdb/utilities/backupable/backupable_db.cc \
-        rocksdb/utilities/backupable/backupable_db_test.cc \
-        rocksdb/utilities/geodb/geodb_impl.cc \
-        rocksdb/utilities/geodb/geodb_impl.h \
-        rocksdb/utilities/geodb/geodb_test.cc \
-        rocksdb/utilities/merge_operators.h \
-        rocksdb/utilities/merge_operators/put.cc \
-        rocksdb/utilities/merge_operators/string_append/stringappend.cc \
-        rocksdb/utilities/merge_operators/string_append/stringappend.h \
-        rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
-        rocksdb/utilities/merge_operators/string_append/stringappend2.h \
-        rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
-        rocksdb/utilities/merge_operators/uint64add.cc \
-        rocksdb/utilities/redis/README \
-        rocksdb/utilities/redis/redis_list_exception.h \
-        rocksdb/utilities/redis/redis_list_iterator.h \
-        rocksdb/utilities/redis/redis_lists.cc \
-        rocksdb/utilities/redis/redis_lists.h \
-        rocksdb/utilities/redis/redis_lists_test.cc \
-        rocksdb/utilities/ttl/db_ttl_impl.cc \
-        rocksdb/utilities/ttl/db_ttl_impl.h \
-        rocksdb/utilities/ttl/ttl_test.cc
+	rocksdb/AUTHORS \
+	rocksdb/configure.ac \
+	rocksdb/CONTRIBUTING.md \
+	rocksdb/db/builder.cc \
+	rocksdb/db/builder.h \
+	rocksdb/db/c.cc \
+	rocksdb/db/column_family.cc \
+	rocksdb/db/column_family.h \
+	rocksdb/db/column_family_test.cc \
+	rocksdb/db/compact_files_test.cc \
+	rocksdb/db/compaction.cc \
+	rocksdb/db/compaction.h \
+	rocksdb/db/compaction_job.cc \
+	rocksdb/db/compaction_job.h \
+	rocksdb/db/compaction_job_test.cc \
+	rocksdb/db/compaction_picker.cc \
+	rocksdb/db/compaction_picker.h \
+	rocksdb/db/compaction_picker_test.cc \
+	rocksdb/db/comparator_db_test.cc \
+	rocksdb/db/corruption_test.cc \
+	rocksdb/db/c_test.c \
+	rocksdb/db/cuckoo_table_db_test.cc \
+	rocksdb/db/db_bench.cc \
+	rocksdb/db/db_filesnapshot.cc \
+	rocksdb/db/dbformat.cc \
+	rocksdb/db/dbformat.h \
+	rocksdb/db/dbformat_test.cc \
+	rocksdb/db/db_impl.cc \
+	rocksdb/db/db_impl_debug.cc \
+	rocksdb/db/db_impl_experimental.cc \
+	rocksdb/db/db_impl.h \
+	rocksdb/db/db_impl_readonly.cc \
+	rocksdb/db/db_impl_readonly.h \
+	rocksdb/db/db_iter.cc \
+	rocksdb/db/db_iter.h \
+	rocksdb/db/db_iter_test.cc \
+	rocksdb/db/db_test.cc \
+	rocksdb/db/deletefile_test.cc \
+	rocksdb/db/event_logger_helpers.cc \
+	rocksdb/db/event_logger_helpers.h \
+	rocksdb/db/experimental.cc \
+	rocksdb/db/fault_injection_test.cc \
+	rocksdb/db/file_indexer.cc \
+	rocksdb/db/file_indexer.h \
+	rocksdb/db/file_indexer_test.cc \
+	rocksdb/db/filename.cc \
+	rocksdb/db/filename.h \
+	rocksdb/db/filename_test.cc \
+	rocksdb/db/flush_job.cc \
+	rocksdb/db/flush_job.h \
+	rocksdb/db/flush_job_test.cc \
+	rocksdb/db/flush_scheduler.cc \
+	rocksdb/db/flush_scheduler.h \
+	rocksdb/db/forward_iterator.cc \
+	rocksdb/db/forward_iterator.h \
+	rocksdb/db/internal_stats.cc \
+	rocksdb/db/internal_stats.h \
+	rocksdb/db/job_context.h \
+	rocksdb/db/listener_test.cc \
+	rocksdb/db/log_format.h \
+	rocksdb/db/log_reader.cc \
+	rocksdb/db/log_reader.h \
+	rocksdb/db/log_test.cc \
+	rocksdb/db/log_writer.cc \
+	rocksdb/db/log_writer.h \
+	rocksdb/db/managed_iterator.cc \
+	rocksdb/db/managed_iterator.h \
+	rocksdb/db/memtable_allocator.cc \
+	rocksdb/db/memtable_allocator.h \
+	rocksdb/db/memtable.cc \
+	rocksdb/db/memtable.h \
+	rocksdb/db/memtable_list.cc \
+	rocksdb/db/memtable_list.h \
+	rocksdb/db/memtable_list_test.cc \
+	rocksdb/db/memtablerep_bench.cc \
+	rocksdb/db/merge_context.h \
+	rocksdb/db/merge_helper.cc \
+	rocksdb/db/merge_helper.h \
+	rocksdb/db/merge_operator.cc \
+	rocksdb/db/merge_test.cc \
+	rocksdb/db/perf_context_test.cc \
+	rocksdb/db/plain_table_db_test.cc \
+	rocksdb/db/prefix_test.cc \
+	rocksdb/db/repair.cc \
+	rocksdb/db/skiplist.h \
+	rocksdb/db/skiplist_test.cc \
+	rocksdb/db/slice.cc \
+	rocksdb/db/snapshot.h \
+	rocksdb/db/table_cache.cc \
+	rocksdb/db/table_cache.h \
+	rocksdb/db/table_properties_collector.cc \
+	rocksdb/db/table_properties_collector.h \
+	rocksdb/db/table_properties_collector_test.cc \
+	rocksdb/db/transaction_log_impl.cc \
+	rocksdb/db/transaction_log_impl.h \
+	rocksdb/db/version_builder.cc \
+	rocksdb/db/version_builder.h \
+	rocksdb/db/version_builder_test.cc \
+	rocksdb/db/version_edit.cc \
+	rocksdb/db/version_edit.h \
+	rocksdb/db/version_edit_test.cc \
+	rocksdb/db/version_set.cc \
+	rocksdb/db/version_set.h \
+	rocksdb/db/version_set_test.cc \
+	rocksdb/db/wal_manager.cc \
+	rocksdb/db/wal_manager.h \
+	rocksdb/db/wal_manager_test.cc \
+	rocksdb/db/write_batch_base.cc \
+	rocksdb/db/write_batch.cc \
+	rocksdb/db/write_batch_internal.h \
+	rocksdb/db/write_batch_test.cc \
+	rocksdb/db/writebuffer.h \
+	rocksdb/db/write_controller.cc \
+	rocksdb/db/write_controller.h \
+	rocksdb/db/write_controller_test.cc \
+	rocksdb/db/write_thread.cc \
+	rocksdb/db/write_thread.h \
+	rocksdb/doc/doc.css \
+	rocksdb/doc/index.html \
+	rocksdb/doc/log_format.txt \
+	rocksdb/doc/rockslogo.jpg \
+	rocksdb/doc/rockslogo.png \
+	rocksdb/examples/column_families_example.cc \
+	rocksdb/examples/compact_files_example.cc \
+	rocksdb/examples/c_simple_example.c \
+	rocksdb/examples/.gitignore \
+	rocksdb/examples/Makefile \
+	rocksdb/examples/README.md \
+	rocksdb/examples/simple_example.cc \
+	rocksdb/hdfs/env_hdfs.h \
+	rocksdb/hdfs/README \
+	rocksdb/hdfs/setup.sh \
+	rocksdb/HISTORY.md \
+	rocksdb/include/rocksdb/cache.h \
+	rocksdb/include/rocksdb/c.h \
+	rocksdb/include/rocksdb/compaction_filter.h \
+	rocksdb/include/rocksdb/comparator.h \
+	rocksdb/include/rocksdb/db.h \
+	rocksdb/include/rocksdb/env.h \
+	rocksdb/include/rocksdb/experimental.h \
+	rocksdb/include/rocksdb/filter_policy.h \
+	rocksdb/include/rocksdb/flush_block_policy.h \
+	rocksdb/include/rocksdb/immutable_options.h \
+	rocksdb/include/rocksdb/iostats_context.h \
+	rocksdb/include/rocksdb/iterator.h \
+	rocksdb/include/rocksdb/ldb_tool.h \
+	rocksdb/include/rocksdb/listener.h \
+	rocksdb/include/rocksdb/memtablerep.h \
+	rocksdb/include/rocksdb/merge_operator.h \
+	rocksdb/include/rocksdb/metadata.h \
+	rocksdb/include/rocksdb/options.h \
+	rocksdb/include/rocksdb/perf_context.h \
+	rocksdb/include/rocksdb/rate_limiter.h \
+	rocksdb/include/rocksdb/slice.h \
+	rocksdb/include/rocksdb/slice_transform.h \
+	rocksdb/include/rocksdb/sst_dump_tool.h \
+	rocksdb/include/rocksdb/statistics.h \
+	rocksdb/include/rocksdb/status.h \
+	rocksdb/include/rocksdb/table.h \
+	rocksdb/include/rocksdb/table_properties.h \
+	rocksdb/include/rocksdb/thread_status.h \
+	rocksdb/include/rocksdb/transaction_log.h \
+	rocksdb/include/rocksdb/types.h \
+	rocksdb/include/rocksdb/universal_compaction.h \
+	rocksdb/include/rocksdb/utilities/backupable_db.h \
+	rocksdb/include/rocksdb/utilities/checkpoint.h \
+	rocksdb/include/rocksdb/utilities/convenience.h \
+	rocksdb/include/rocksdb/utilities/db_ttl.h \
+	rocksdb/include/rocksdb/utilities/document_db.h \
+	rocksdb/include/rocksdb/utilities/flashcache.h \
+	rocksdb/include/rocksdb/utilities/geo_db.h \
+	rocksdb/include/rocksdb/utilities/json_document.h \
+	rocksdb/include/rocksdb/utilities/leveldb_options.h \
+	rocksdb/include/rocksdb/utilities/spatial_db.h \
+	rocksdb/include/rocksdb/utilities/stackable_db.h \
+	rocksdb/include/rocksdb/utilities/utility_db.h \
+	rocksdb/include/rocksdb/utilities/write_batch_with_index.h \
+	rocksdb/include/rocksdb/version.h \
+	rocksdb/include/rocksdb/write_batch_base.h \
+	rocksdb/include/rocksdb/write_batch.h \
+	rocksdb/include/utilities/backupable_db.h \
+	rocksdb/include/utilities/db_ttl.h \
+	rocksdb/include/utilities/document_db.h \
+	rocksdb/include/utilities/geo_db.h \
+	rocksdb/include/utilities/json_document.h \
+	rocksdb/include/utilities/stackable_db.h \
+	rocksdb/include/utilities/utility_db.h \
+	rocksdb/INSTALL.md \
+	rocksdb/LICENSE \
+	rocksdb/m4/libtool.m4 \
+	rocksdb/m4/lt~obsolete.m4 \
+	rocksdb/m4/ltoptions.m4 \
+	rocksdb/m4/ltsugar.m4 \
+	rocksdb/m4/ltversion.m4 \
+	rocksdb/Makefile.am \
+	rocksdb/PATENTS \
+	rocksdb/port/likely.h \
+	rocksdb/port/port_example.h \
+	rocksdb/port/port.h \
+	rocksdb/port/port_posix.cc \
+	rocksdb/port/port_posix.h \
+	rocksdb/port/README \
+	rocksdb/port/stack_trace.cc \
+	rocksdb/port/stack_trace.h \
+	rocksdb/port/win/stdint.h \
+	rocksdb/README.md \
+	rocksdb/ROCKSDB_LITE.md \
+	rocksdb/table/adaptive_table_factory.cc \
+	rocksdb/table/adaptive_table_factory.h \
+	rocksdb/table/block_based_filter_block.cc \
+	rocksdb/table/block_based_filter_block.h \
+	rocksdb/table/block_based_filter_block_test.cc \
+	rocksdb/table/block_based_table_builder.cc \
+	rocksdb/table/block_based_table_builder.h \
+	rocksdb/table/block_based_table_factory.cc \
+	rocksdb/table/block_based_table_factory.h \
+	rocksdb/table/block_based_table_reader.cc \
+	rocksdb/table/block_based_table_reader.h \
+	rocksdb/table/block_builder.cc \
+	rocksdb/table/block_builder.h \
+	rocksdb/table/block.cc \
+	rocksdb/table/block.h \
+	rocksdb/table/block_hash_index.cc \
+	rocksdb/table/block_hash_index.h \
+	rocksdb/table/block_hash_index_test.cc \
+	rocksdb/table/block_prefix_index.cc \
+	rocksdb/table/block_prefix_index.h \
+	rocksdb/table/block_test.cc \
+	rocksdb/table/bloom_block.cc \
+	rocksdb/table/bloom_block.h \
+	rocksdb/table/cuckoo_table_builder.cc \
+	rocksdb/table/cuckoo_table_builder.h \
+	rocksdb/table/cuckoo_table_builder_test.cc \
+	rocksdb/table/cuckoo_table_factory.cc \
+	rocksdb/table/cuckoo_table_factory.h \
+	rocksdb/table/cuckoo_table_reader.cc \
+	rocksdb/table/cuckoo_table_reader.h \
+	rocksdb/table/cuckoo_table_reader_test.cc \
+	rocksdb/table/filter_block.h \
+	rocksdb/table/flush_block_policy.cc \
+	rocksdb/table/format.cc \
+	rocksdb/table/format.h \
+	rocksdb/table/full_filter_block.cc \
+	rocksdb/table/full_filter_block.h \
+	rocksdb/table/full_filter_block_test.cc \
+	rocksdb/table/get_context.cc \
+	rocksdb/table/get_context.h \
+	rocksdb/table/iterator.cc \
+	rocksdb/table/iterator_wrapper.h \
+	rocksdb/table/iter_heap.h \
+	rocksdb/table/merger.cc \
+	rocksdb/table/merger.h \
+	rocksdb/table/merger_test.cc \
+	rocksdb/table/meta_blocks.cc \
+	rocksdb/table/meta_blocks.h \
+	rocksdb/table/mock_table.cc \
+	rocksdb/table/mock_table.h \
+	rocksdb/table/plain_table_builder.cc \
+	rocksdb/table/plain_table_builder.h \
+	rocksdb/table/plain_table_factory.cc \
+	rocksdb/table/plain_table_factory.h \
+	rocksdb/table/plain_table_index.cc \
+	rocksdb/table/plain_table_index.h \
+	rocksdb/table/plain_table_key_coding.cc \
+	rocksdb/table/plain_table_key_coding.h \
+	rocksdb/table/plain_table_reader.cc \
+	rocksdb/table/plain_table_reader.h \
+	rocksdb/table/table_builder.h \
+	rocksdb/table/table_properties.cc \
+	rocksdb/table/table_properties_internal.h \
+	rocksdb/table/table_reader_bench.cc \
+	rocksdb/table/table_reader.h \
+	rocksdb/table/table_test.cc \
+	rocksdb/table/two_level_iterator.cc \
+	rocksdb/table/two_level_iterator.h \
+	rocksdb/third-party/fbson/COMMIT.md \
+	rocksdb/third-party/fbson/FbsonDocument.h \
+	rocksdb/third-party/fbson/FbsonJsonParser.h \
+	rocksdb/third-party/fbson/FbsonStream.h \
+	rocksdb/third-party/fbson/FbsonUtil.h \
+	rocksdb/third-party/fbson/FbsonWriter.h \
+	rocksdb/third-party/flashcache/flashcache_ioctl.h \
+	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \
+	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \
+	rocksdb/USERS.md \
+	rocksdb/util/allocator.h \
+	rocksdb/util/arena.cc \
+	rocksdb/util/arena.h \
+	rocksdb/util/arena_test.cc \
+	rocksdb/util/auto_roll_logger.cc \
+	rocksdb/util/auto_roll_logger.h \
+	rocksdb/util/auto_roll_logger_test.cc \
+	rocksdb/util/autovector.h \
+	rocksdb/util/autovector_test.cc \
+	rocksdb/util/bloom.cc \
+	rocksdb/util/bloom_test.cc \
+	rocksdb/util/build_version.h \
+	rocksdb/util/cache_bench.cc \
+	rocksdb/util/cache.cc \
+	rocksdb/util/cache_test.cc \
+	rocksdb/util/coding.cc \
+	rocksdb/util/coding.h \
+	rocksdb/util/coding_test.cc \
+	rocksdb/util/comparator.cc \
+	rocksdb/util/compression.h \
+	rocksdb/util/crc32c.cc \
+	rocksdb/util/crc32c.h \
+	rocksdb/util/crc32c_test.cc \
+	rocksdb/util/db_info_dumper.cc \
+	rocksdb/util/db_info_dumper.h \
+	rocksdb/util/dynamic_bloom.cc \
+	rocksdb/util/dynamic_bloom.h \
+	rocksdb/util/dynamic_bloom_test.cc \
+	rocksdb/util/env.cc \
+	rocksdb/util/env_hdfs.cc \
+	rocksdb/util/env_posix.cc \
+	rocksdb/util/env_test.cc \
+	rocksdb/util/event_logger.cc \
+	rocksdb/util/event_logger.h \
+	rocksdb/util/event_logger_test.cc \
+	rocksdb/util/filelock_test.cc \
+	rocksdb/util/file_util.cc \
+	rocksdb/util/file_util.h \
+	rocksdb/util/filter_policy.cc \
+	rocksdb/util/hash.cc \
+	rocksdb/util/hash_cuckoo_rep.cc \
+	rocksdb/util/hash_cuckoo_rep.h \
+	rocksdb/util/hash.h \
+	rocksdb/util/hash_linklist_rep.cc \
+	rocksdb/util/hash_linklist_rep.h \
+	rocksdb/util/hash_skiplist_rep.cc \
+	rocksdb/util/hash_skiplist_rep.h \
+	rocksdb/util/histogram.cc \
+	rocksdb/util/histogram.h \
+	rocksdb/util/histogram_test.cc \
+	rocksdb/util/instrumented_mutex.cc \
+	rocksdb/util/instrumented_mutex.h \
+	rocksdb/util/iostats_context.cc \
+	rocksdb/util/iostats_context_imp.h \
+	rocksdb/utilities/backupable/backupable_db.cc \
+	rocksdb/utilities/backupable/backupable_db_test.cc \
+	rocksdb/utilities/checkpoint/checkpoint.cc \
+	rocksdb/utilities/compacted_db/compacted_db_impl.cc \
+	rocksdb/utilities/compacted_db/compacted_db_impl.h \
+	rocksdb/utilities/convenience/convenience.cc \
+	rocksdb/utilities/document/document_db.cc \
+	rocksdb/utilities/document/document_db_test.cc \
+	rocksdb/utilities/document/json_document_builder.cc \
+	rocksdb/utilities/document/json_document.cc \
+	rocksdb/utilities/document/json_document_test.cc \
+	rocksdb/utilities/flashcache/flashcache.cc \
+	rocksdb/utilities/flashcache/flashcache.h \
+	rocksdb/utilities/geodb/geodb_impl.cc \
+	rocksdb/utilities/geodb/geodb_impl.h \
+	rocksdb/utilities/geodb/geodb_test.cc \
+	rocksdb/utilities/leveldb_options/leveldb_options.cc \
+	rocksdb/utilities/merge_operators.h \
+	rocksdb/utilities/merge_operators/put.cc \
+	rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
+	rocksdb/utilities/merge_operators/string_append/stringappend2.h \
+	rocksdb/utilities/merge_operators/string_append/stringappend.cc \
+	rocksdb/utilities/merge_operators/string_append/stringappend.h \
+	rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
+	rocksdb/utilities/merge_operators/uint64add.cc \
+	rocksdb/utilities/redis/README \
+	rocksdb/utilities/redis/redis_list_exception.h \
+	rocksdb/utilities/redis/redis_list_iterator.h \
+	rocksdb/utilities/redis/redis_lists.cc \
+	rocksdb/utilities/redis/redis_lists.h \
+	rocksdb/utilities/redis/redis_lists_test.cc \
+	rocksdb/utilities/spatialdb/spatial_db.cc \
+	rocksdb/utilities/spatialdb/spatial_db_test.cc \
+	rocksdb/utilities/spatialdb/utils.h \
+	rocksdb/utilities/ttl/db_ttl_impl.cc \
+	rocksdb/utilities/ttl/db_ttl_impl.h \
+	rocksdb/utilities/ttl/ttl_test.cc \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc \
+	rocksdb/util/ldb_cmd.cc \
+	rocksdb/util/ldb_cmd_execute_result.h \
+	rocksdb/util/ldb_cmd.h \
+	rocksdb/util/ldb_tool.cc \
+	rocksdb/util/log_buffer.cc \
+	rocksdb/util/log_buffer.h \
+	rocksdb/util/logging.cc \
+	rocksdb/util/logging.h \
+	rocksdb/util/log_write_bench.cc \
+	rocksdb/util/manual_compaction_test.cc \
+	rocksdb/util/memenv.cc \
+	rocksdb/util/memenv_test.cc \
+	rocksdb/util/mock_env.cc \
+	rocksdb/util/mock_env.h \
+	rocksdb/util/mock_env_test.cc \
+	rocksdb/util/murmurhash.cc \
+	rocksdb/util/murmurhash.h \
+	rocksdb/util/mutable_cf_options.cc \
+	rocksdb/util/mutable_cf_options.h \
+	rocksdb/util/mutexlock.h \
+	rocksdb/util/options_builder.cc \
+	rocksdb/util/options.cc \
+	rocksdb/util/options_helper.cc \
+	rocksdb/util/options_helper.h \
+	rocksdb/util/options_test.cc \
+	rocksdb/util/perf_context.cc \
+	rocksdb/util/perf_context_imp.h \
+	rocksdb/util/posix_logger.h \
+	rocksdb/util/random.h \
+	rocksdb/util/rate_limiter.cc \
+	rocksdb/util/rate_limiter.h \
+	rocksdb/util/rate_limiter_test.cc \
+	rocksdb/util/scoped_arena_iterator.h \
+	rocksdb/util/skiplistrep.cc \
+	rocksdb/util/slice.cc \
+	rocksdb/util/slice_transform_test.cc \
+	rocksdb/util/sst_dump_test.cc \
+	rocksdb/util/sst_dump_tool.cc \
+	rocksdb/util/sst_dump_tool_imp.h \
+	rocksdb/util/statistics.cc \
+	rocksdb/util/statistics.h \
+	rocksdb/util/status.cc \
+	rocksdb/util/stl_wrappers.h \
+	rocksdb/util/stop_watch.h \
+	rocksdb/util/string_util.cc \
+	rocksdb/util/string_util.h \
+	rocksdb/util/sync_point.cc \
+	rocksdb/util/sync_point.h \
+	rocksdb/util/testharness.cc \
+	rocksdb/util/testharness.h \
+	rocksdb/util/testutil.cc \
+	rocksdb/util/testutil.h \
+	rocksdb/util/thread_list_test.cc \
+	rocksdb/util/thread_local.cc \
+	rocksdb/util/thread_local.h \
+	rocksdb/util/thread_local_test.cc \
+	rocksdb/util/thread_operation.h \
+	rocksdb/util/thread_status_impl.cc \
+	rocksdb/util/thread_status_updater.cc \
+	rocksdb/util/thread_status_updater_debug.cc \
+	rocksdb/util/thread_status_updater.h \
+	rocksdb/util/thread_status_util.cc \
+	rocksdb/util/thread_status_util_debug.cc \
+	rocksdb/util/thread_status_util.h \
+	rocksdb/util/vectorrep.cc \
+	rocksdb/util/xfunc.cc \
+	rocksdb/util/xfunc.h \
+	rocksdb/util/xxhash.cc \
+	rocksdb/util/xxhash.h
 endif # WITH_SLIBROCKSDB
diff --git a/src/Makefile-server.am b/src/Makefile-server.am
index 3cd5229..689b5c4 100644
--- a/src/Makefile-server.am
+++ b/src/Makefile-server.am
@@ -37,8 +37,6 @@ if WITH_OSD
 
 ceph_sbin_SCRIPTS += \
 	ceph-disk \
-	ceph-disk-prepare \
-	ceph-disk-activate \
 	ceph-disk-udev
 
 bin_SCRIPTS += \
diff --git a/src/Makefile.am b/src/Makefile.am
index b0f505a..90ec3f1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,14 +1,18 @@
 include Makefile-env.am
 
-SUBDIRS += ocf java tracing
-DIST_SUBDIRS += gmock ocf libs3 java tracing
+SUBDIRS += ocf java
+DIST_SUBDIRS += gmock ocf java
 
+if NO_GIT_VERSION
+export NO_VERSION="yes"
+endif
 
 # subdirs
 
 include arch/Makefile.am
 include auth/Makefile.am
 include brag/Makefile.am
+include ceph-detect-init/Makefile.am
 include crush/Makefile.am
 include mon/Makefile.am
 include mds/Makefile.am
@@ -35,6 +39,8 @@ include rbd_replay/Makefile.am
 include test/Makefile.am
 include tools/Makefile.am
 include Makefile-rocksdb.am
+include compressor/Makefile.am
+include tracing/Makefile.am
 
 
 # shell scripts
@@ -63,7 +69,6 @@ CLEANFILES += $(shell_scripts)
 # extra bits
 
 EXTRA_DIST += \
-	$(srcdir)/verify-mds-journal.sh \
 	$(srcdir)/vstart.sh \
 	$(srcdir)/stop.sh \
 	ceph-run \
@@ -73,11 +78,11 @@ EXTRA_DIST += \
 	$(srcdir)/init-rbdmap \
 	$(srcdir)/ceph-clsinfo \
 	$(srcdir)/make_version \
-	$(srcdir)/check_version \
 	$(srcdir)/.git_version \
 	$(srcdir)/ceph-rbdnamer \
-	$(srcdir)/test/encoding/readable.sh \
+	$(srcdir)/tools/ceph-monstore-update-crush.sh \
 	$(srcdir)/upstart/ceph-all.conf \
+	$(srcdir)/upstart/ceph-disk.conf \
 	$(srcdir)/upstart/ceph-mon.conf \
 	$(srcdir)/upstart/ceph-mon-all.conf \
 	$(srcdir)/upstart/ceph-mon-all-starter.conf \
@@ -94,8 +99,6 @@ EXTRA_DIST += \
 	$(srcdir)/upstart/rbdmap.conf \
 	ceph.in \
 	ceph-disk \
-	ceph-disk-prepare \
-	ceph-disk-activate \
 	ceph-disk-udev \
 	ceph-create-keys \
 	ceph-rest-api \
@@ -106,23 +109,6 @@ EXTRA_DIST += \
 	yasm-wrapper
 
 EXTRA_DIST += \
-	libs3/COPYING \
-	libs3/ChangeLog \
-	libs3/GNUmakefile \
-	libs3/GNUmakefile.mingw \
-	libs3/GNUmakefile.osx \
-	libs3/INSTALL \
-	libs3/LICENSE \
-	libs3/README \
-	libs3/TODO \
-	libs3/archlinux \
-	libs3/debian \
-	libs3/doxyfile \
-	libs3/inc \
-	libs3/libs3.spec \
-	libs3/mswin \
-	libs3/src \
-	libs3/test \
 	unittest_bufferlist.sh
 
 
@@ -146,12 +132,12 @@ ceph_libexec_SCRIPTS = ceph-osd-prestart.sh
 # target by target
 
 TESTS = \
-	$(check_PROGRAMS) \
+	$(check_TESTPROGRAMS) \
 	$(check_SCRIPTS)
 
-check-local:
-	$(top_srcdir)/qa/workunits/erasure-code/encode-decode-non-regression.sh 
-	$(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
+check_SCRIPTS += \
+	../qa/workunits/erasure-code/encode-decode-non-regression.sh \
+	test/encoding/readable.sh
 
 if WITH_LTTNG
 # TODO: If we're running the parallel test harness (the preferred harness), this should be AM_TESTS_ENVIRONMENT instead.
@@ -175,19 +161,17 @@ base: core-daemons admin-tools \
 
 FORCE:
 .git_version: FORCE
-	$(srcdir)/check_version $(srcdir)/.git_version
+	$(srcdir)/make_version -g $(srcdir)/.git_version
 
 # if NO_VERSION is set, only generate a new ceph_ver.h if there currently 
 # is none, and call "make_version -n" to fill it with a fixed string.
 # Otherwise, set it from the contents of .git_version.
 
-ceph_ver.h: .git_version
+ceph_ver.h: .git_version FORCE
 	if [ -n "$$NO_VERSION" ] ; then \
-	    if [ ! -f ./ceph_ver.h ] ; then \
-	        $(srcdir)/make_version -n ./ceph_ver.h ; \
-	    fi; \
-        else \
-	    $(srcdir)/make_version $(srcdir)/.git_version ./ceph_ver.h ; \
+		$(srcdir)/make_version -g $(srcdir)/.git_version -c $(srcdir)/ceph_ver.h -n ; \
+	else \
+		$(srcdir)/make_version -g $(srcdir)/.git_version -c $(srcdir)/ceph_ver.h ; \
 	fi
 
 ceph_ver.c: ./ceph_ver.h
@@ -198,14 +182,14 @@ sample.fetch_config: fetch_config
 	cp -f $(srcdir)/fetch_config ./sample.fetch_config
 
 dist-hook:
-	$(srcdir)/check_version $(srcdir)/.git_version
+	$(srcdir)/make_version -g $(srcdir)/.git_version
 
 CLEANFILES += ceph_ver.h sample.fetch_config
 
 
 # cleaning
 
-clean-local:
+clean-local::
 	rm -f *.so 
 	find . -name '*.gcno' -o -name '*.gcda' -o -name '*.lcov' -o -name "*.o" -o -name "*.lo" | xargs rm -f
 	rm -f ceph java/java/com/ceph/crush/Bucket.class
@@ -255,12 +239,12 @@ if ENABLE_COVERAGE
 	-test/coverage.sh -d $(srcdir) -o check-coverage make check
 endif
 
-install-data-local: install-coverage
+install-data-local:: install-coverage
 	-mkdir -p $(DESTDIR)$(sysconfdir)/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/log/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp
 
-uninstall-local: uninstall-coverage
+uninstall-local:: uninstall-coverage
 	-rmdir -p $(DESTDIR)$(sysconfdir)/ceph/
 	-rmdir -p $(DESTDIR)$(localstatedir)/log/ceph
 	-rmdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp
diff --git a/src/Makefile.in b/src/Makefile.in
index 3b60555..15b44e7 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -14,8 +14,6 @@
 
 @SET_MAKE@
 
-# SHEC plugin
-
 
 
 
@@ -88,6 +86,7 @@ host_triplet = @host@
 target_triplet = @target@
 DIST_COMMON = $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am \
 	$(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am \
+	$(srcdir)/ceph-detect-init/Makefile.am \
 	$(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am \
 	$(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am \
 	$(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am \
@@ -114,59 +113,32 @@ DIST_COMMON = $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am \
 	$(srcdir)/test/Makefile-server.am $(srcdir)/tools/Makefile.am \
 	$(srcdir)/tools/Makefile-client.am \
 	$(srcdir)/tools/Makefile-server.am \
-	$(srcdir)/Makefile-rocksdb.am $(srcdir)/Makefile-client.am \
+	$(srcdir)/Makefile-rocksdb.am $(srcdir)/compressor/Makefile.am \
+	$(srcdir)/tracing/Makefile.am $(srcdir)/Makefile-client.am \
 	$(srcdir)/Makefile-server.am $(srcdir)/Makefile.in \
 	$(srcdir)/Makefile.am $(srcdir)/acconfig.h.in \
 	$(dist_bin_SCRIPTS) $(top_srcdir)/depcomp \
 	$(am__python_PYTHON_DIST) $(top_srcdir)/py-compile \
-	$(am__noinst_HEADERS_DIST) $(top_srcdir)/test-driver README \
-	TODO
+	$(dist_noinst_DATA) $(am__noinst_HEADERS_DIST) \
+	$(top_srcdir)/test-driver README TODO
 bin_PROGRAMS = $(am__EXEEXT_27) $(am__EXEEXT_28) $(am__EXEEXT_29) \
 	$(am__EXEEXT_30) $(am__EXEEXT_31) $(am__EXEEXT_32) \
 	$(am__EXEEXT_33) $(am__EXEEXT_34) monmaptool$(EXEEXT) \
-	crushtool$(EXEEXT) osdmaptool$(EXEEXT) $(am__EXEEXT_35) \
-	ceph-conf$(EXEEXT) ceph-authtool$(EXEEXT) $(am__EXEEXT_36) \
+	crushtool$(EXEEXT) osdmaptool$(EXEEXT) ceph-conf$(EXEEXT) \
+	ceph-authtool$(EXEEXT) $(am__EXEEXT_35) $(am__EXEEXT_36) \
 	$(am__EXEEXT_37) $(am__EXEEXT_38) $(am__EXEEXT_39) \
 	$(am__EXEEXT_40) $(am__EXEEXT_41) $(am__EXEEXT_42) \
 	$(am__EXEEXT_43)
-noinst_PROGRAMS = $(am__EXEEXT_54) $(am__EXEEXT_55) $(am__EXEEXT_56)
+noinst_PROGRAMS = $(am__EXEEXT_59) $(am__EXEEXT_60) $(am__EXEEXT_61)
 sbin_PROGRAMS =
-su_sbin_PROGRAMS = $(am__EXEEXT_57)
-check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
-	$(am__EXEEXT_47) $(am__EXEEXT_48) $(am__EXEEXT_49) \
-	$(am__EXEEXT_50) $(am__EXEEXT_51) $(am__EXEEXT_52) \
-	$(am__EXEEXT_53) unittest_addrs$(EXEEXT) \
-	unittest_blkdev$(EXEEXT) unittest_bloom_filter$(EXEEXT) \
-	unittest_histogram$(EXEEXT) unittest_str_map$(EXEEXT) \
-	unittest_sharedptr_registry$(EXEEXT) \
-	unittest_shared_cache$(EXEEXT) \
-	unittest_sloppy_crc_map$(EXEEXT) unittest_util$(EXEEXT) \
-	unittest_crush_wrapper$(EXEEXT) unittest_crush$(EXEEXT) \
-	unittest_osdmap$(EXEEXT) unittest_workqueue$(EXEEXT) \
-	unittest_striper$(EXEEXT) \
-	unittest_prebufferedstreambuf$(EXEEXT) \
-	unittest_str_list$(EXEEXT) unittest_log$(EXEEXT) \
-	unittest_throttle$(EXEEXT) unittest_ceph_argparse$(EXEEXT) \
-	unittest_ceph_compatset$(EXEEXT) unittest_mds_types$(EXEEXT) \
-	unittest_osd_types$(EXEEXT) unittest_lru$(EXEEXT) \
-	unittest_io_priority$(EXEEXT) unittest_gather$(EXEEXT) \
-	unittest_signals$(EXEEXT) unittest_bufferlist$(EXEEXT) \
-	unittest_crc32c$(EXEEXT) unittest_arch$(EXEEXT) \
-	unittest_crypto$(EXEEXT) unittest_crypto_init$(EXEEXT) \
-	unittest_perf_counters$(EXEEXT) unittest_admin_socket$(EXEEXT) \
-	unittest_ceph_crypto$(EXEEXT) unittest_utf8$(EXEEXT) \
-	unittest_mime$(EXEEXT) unittest_escape$(EXEEXT) \
-	unittest_strtol$(EXEEXT) unittest_confutils$(EXEEXT) \
-	unittest_config$(EXEEXT) unittest_context$(EXEEXT) \
-	unittest_safe_io$(EXEEXT) unittest_heartbeatmap$(EXEEXT) \
-	unittest_formatter$(EXEEXT) unittest_daemon_config$(EXEEXT) \
-	unittest_ipaddr$(EXEEXT) unittest_texttable$(EXEEXT) \
-	unittest_on_exit$(EXEEXT) unittest_readahead$(EXEEXT) \
-	unittest_tableformatter$(EXEEXT) unittest_bit_vector$(EXEEXT)
+su_sbin_PROGRAMS = $(am__EXEEXT_62)
+check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
+	unittest_subprocess$(EXEEXT) \
+	unittest_async_compressor$(EXEEXT)
 
 # when doing a debug build, make sure to make the targets
 @WITH_DEBUG_TRUE at am__append_1 = $(bin_DEBUGPROGRAMS)
- at LINUX_TRUE@am__append_2 = -Wl,--as-needed
+ at LINUX_TRUE@am__append_2 = -Wl,--as-needed $(HARDENING_LDFLAGS)
 @USE_BOOST_SPIRIT_OLD_HDR_TRUE at am__append_3 = -DUSE_BOOST_SPIRIT_OLD_HDR
 @WITH_LIBATOMIC_TRUE at am__append_4 = -latomic_ops
 @ENABLE_COVERAGE_TRUE at am__append_5 = -fprofile-arcs -ftest-coverage
@@ -204,8 +176,8 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DumplingMonCommands.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonMap.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Monitor.h \
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonitorStore.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonitorDBStore.h \
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonOpRequest.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/OSDMonitor.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PGMap.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PGMonitor.h \
@@ -222,7 +194,8 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_replica_log_client.a \
- at ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_user_client.a
+ at ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_user_client.a \
+ at ENABLE_CLIENT_TRUE@	libcls_numops_client.la
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_26 = libmds.la
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_27 =  \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/inode_backtrace.h \
@@ -238,7 +211,9 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/MDBalancer.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/MDCache.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/RecoveryQueue.h \
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/MDLog.h mds/MDS.h \
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/StrayManager.h \
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/MDLog.h mds/MDSRank.h \
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/MDSDaemon.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/Beacon.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/MDSContext.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/MDSAuthCaps.h \
@@ -275,21 +250,28 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/ETableServer.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/EUpdate.h
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_28 = os/BtrfsFileStoreBackend.cc
- at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE at am__append_29 = os/XfsFileStoreBackend.cc
- at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__append_30 = os/ZFSFileStoreBackend.cc
- at ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE at am__append_31 = $(LIBOS_TP)
- at ENABLE_SERVER_TRUE@am__append_32 = libos.la
- at ENABLE_SERVER_TRUE@am__append_33 = \
+ at ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE at am__append_29 = os/newstore/newstore_types.cc
+ at ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE at am__append_30 = os/newstore/NewStore.cc
+ at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE at am__append_31 = \
+ at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@    os/fs/XFS.cc \
+ at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@    os/XfsFileStoreBackend.cc
+
+ at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__append_32 = os/ZFSFileStoreBackend.cc
+ at ENABLE_SERVER_TRUE@am__append_33 = libos.la
+ at ENABLE_SERVER_TRUE@am__append_34 = \
 @ENABLE_SERVER_TRUE@	os/btrfs_ioctl.h \
 @ENABLE_SERVER_TRUE@	os/chain_xattr.h \
+ at ENABLE_SERVER_TRUE@	os/newstore/newstore_types.h \
+ at ENABLE_SERVER_TRUE@	os/newstore/NewStore.h \
 @ENABLE_SERVER_TRUE@	os/BtrfsFileStoreBackend.h \
 @ENABLE_SERVER_TRUE@	os/CollectionIndex.h \
 @ENABLE_SERVER_TRUE@	os/DBObjectMap.h \
 @ENABLE_SERVER_TRUE@	os/GenericObjectMap.h \
 @ENABLE_SERVER_TRUE@	os/FileJournal.h \
 @ENABLE_SERVER_TRUE@	os/FileStore.h \
- at ENABLE_SERVER_TRUE@	os/FlatIndex.h \
 @ENABLE_SERVER_TRUE@	os/FDCache.h \
+ at ENABLE_SERVER_TRUE@	os/fs/FS.h \
+ at ENABLE_SERVER_TRUE@	os/fs/XFS.h \
 @ENABLE_SERVER_TRUE@	os/GenericFileStoreBackend.h \
 @ENABLE_SERVER_TRUE@	os/HashIndex.h \
 @ENABLE_SERVER_TRUE@	os/IndexManager.h \
@@ -302,26 +284,25 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_SERVER_TRUE@	os/KeyValueStore.h \
 @ENABLE_SERVER_TRUE@	os/ObjectMap.h \
 @ENABLE_SERVER_TRUE@	os/ObjectStore.h \
+ at ENABLE_SERVER_TRUE@	os/PageSet.h \
 @ENABLE_SERVER_TRUE@	os/SequencerPosition.h \
 @ENABLE_SERVER_TRUE@	os/WBThrottle.h \
 @ENABLE_SERVER_TRUE@	os/XfsFileStoreBackend.h \
 @ENABLE_SERVER_TRUE@	os/ZFSFileStoreBackend.h
 
- at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_34 = libos_rocksdb.la
- at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_35 = os/RocksDBStore.h
- at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_36 = libos_rocksdb.la
- at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_37 = os/RocksDBStore.h
- at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__append_38 = libos_zfs.a
- at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__append_39 = os/ZFS.h
- at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_40 = os/KineticStore.cc
- at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_41 = -std=gnu++11
- at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_42 = -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
- at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_43 = os/KineticStore.h
- at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@@WITH_OSD_TRUE at am__append_44 = -std=gnu++11
- at ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE@@WITH_OSD_TRUE at am__append_45 = $(LIBOSD_TP)
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_35 = libos_rocksdb.la
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_36 = os/RocksDBStore.h
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_37 = libos_rocksdb.la
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_38 = os/RocksDBStore.h
+ at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__append_39 = libos_zfs.a
+ at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__append_40 = os/ZFS.h
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_41 = os/KineticStore.cc
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_42 = -std=gnu++11
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_43 = -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_44 = os/KineticStore.h
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@@WITH_OSD_TRUE at am__append_45 = -std=gnu++11
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_46 = libosd.la
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_47 = \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/Ager.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ClassHandler.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/HitSet.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/OSD.h \
@@ -353,9 +334,16 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @LINUX_TRUE at am__append_55 = -export-symbols-regex '.*__erasure_code_.*'
 @LINUX_TRUE at am__append_56 = -export-symbols-regex '.*__erasure_code_.*'
 @LINUX_TRUE at am__append_57 = -export-symbols-regex '.*__erasure_code_.*'
+ at LINUX_TRUE@am__append_58 = -export-symbols-regex '.*__erasure_code_.*'
+ at HAVE_NEON_TRUE@am__append_59 = libec_shec_neon.la
+ at LINUX_TRUE@am__append_60 = -export-symbols-regex '.*__erasure_code_.*'
+ at HAVE_SSSE3_TRUE@am__append_61 = libec_shec_sse3.la
+ at LINUX_TRUE@am__append_62 = -export-symbols-regex '.*__erasure_code_.*'
+ at HAVE_SSE4_PCLMUL_TRUE@am__append_63 = libec_shec_sse4.la
+ at LINUX_TRUE@am__append_64 = -export-symbols-regex '.*__erasure_code_.*'
 
 # ISA
- at WITH_BETTER_YASM_ELF64_TRUE@am__append_58 = \
+ at WITH_BETTER_YASM_ELF64_TRUE@am__append_65 = \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/ErasureCodeIsa.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/ErasureCodeIsaTableCache.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/xor_op.h \
@@ -366,15 +354,16 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/include/gf_vect_mul.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/include/types.h
 
- at LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE at am__append_59 = -export-symbols-regex '.*__erasure_code_.*'
- at WITH_BETTER_YASM_ELF64_TRUE@am__append_60 = libec_isa.la
- at ENABLE_CLIENT_TRUE@am__append_61 = libclient.la
- at ENABLE_CLIENT_TRUE@am__append_62 = \
+ at LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE at am__append_66 = -export-symbols-regex '.*__erasure_code_.*'
+ at WITH_BETTER_YASM_ELF64_TRUE@am__append_67 = libec_isa.la
+ at ENABLE_CLIENT_TRUE@am__append_68 = libclient.la
+ at ENABLE_CLIENT_TRUE@am__append_69 = \
 @ENABLE_CLIENT_TRUE@	client/Client.h \
 @ENABLE_CLIENT_TRUE@	client/Dentry.h \
 @ENABLE_CLIENT_TRUE@	client/Dir.h \
 @ENABLE_CLIENT_TRUE@	client/Fh.h \
 @ENABLE_CLIENT_TRUE@	client/Inode.h \
+ at ENABLE_CLIENT_TRUE@	client/InodeRef.h \
 @ENABLE_CLIENT_TRUE@	client/MetaRequest.h \
 @ENABLE_CLIENT_TRUE@	client/MetaSession.h \
 @ENABLE_CLIENT_TRUE@	client/ClientSnapRealm.h \
@@ -383,37 +372,36 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@	client/ioctl.h \
 @ENABLE_CLIENT_TRUE@	client/ObjecterWriteback.h
 
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_63 = libclient_fuse.la
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_64 = client/fuse_ll.h
- at ENABLE_CLIENT_TRUE@am__append_65 = ceph_test_ioctls
- at WITH_TCMALLOC_TRUE@am__append_66 = perfglue/heap_profiler.cc
- at WITH_TCMALLOC_TRUE@am__append_67 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_TRUE@am__append_68 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_69 = perfglue/heap_profiler.cc
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_70 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_71 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__append_72 = perfglue/disabled_heap_profiler.cc
- at WITH_PROFILER_TRUE@am__append_73 = perfglue/cpu_profiler.cc
- at WITH_PROFILER_FALSE@am__append_74 = perfglue/disabled_stubs.cc
- at WITH_RBD_TRUE@am__append_75 = \
- at WITH_RBD_TRUE@	common/blkdev.cc
-
- at ENABLE_XIO_TRUE@am__append_76 = \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_70 = libclient_fuse.la
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am__append_71 = client/fuse_ll.h
+ at ENABLE_CLIENT_TRUE@am__append_72 = ceph_test_ioctls
+ at WITH_TCMALLOC_TRUE@am__append_73 = perfglue/heap_profiler.cc
+ at WITH_TCMALLOC_TRUE@am__append_74 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_TRUE@am__append_75 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_76 = perfglue/heap_profiler.cc
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_77 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__append_78 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__append_79 = perfglue/disabled_heap_profiler.cc
+ at WITH_PROFILER_TRUE@am__append_80 = perfglue/cpu_profiler.cc
+ at WITH_PROFILER_FALSE@am__append_81 = perfglue/disabled_stubs.cc
+ at ENABLE_XIO_TRUE@am__append_82 = \
 @ENABLE_XIO_TRUE@	common/address_helper.cc
 
- at WITH_GOOD_YASM_ELF64_TRUE@am__append_77 = common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S
- at LINUX_TRUE@am__append_78 = -lrt
- at ENABLE_XIO_TRUE@am__append_79 = \
+ at WITH_GOOD_YASM_ELF64_TRUE@am__append_83 = common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S
+ at HAVE_ARMV8_CRC_TRUE@am__append_84 = libcommon_crc_aarch64.la
+ at HAVE_ARMV8_CRC_TRUE@am__append_85 = libcommon_crc_aarch64.la
+ at LINUX_TRUE@am__append_86 = -lrt -lblkid
+ at ENABLE_XIO_TRUE@am__append_87 = \
 @ENABLE_XIO_TRUE@	common/address_helper.h
 
- at LINUX_TRUE@am__append_80 = libsecret.la
- at LINUX_TRUE@am__append_81 = msg/async/EventEpoll.cc
- at DARWIN_TRUE@am__append_82 = msg/async/EventKqueue.cc
- at FREEBSD_TRUE@am__append_83 = msg/async/EventKqueue.cc
- at LINUX_TRUE@am__append_84 = msg/async/EventEpoll.h
- at DARWIN_TRUE@am__append_85 = msg/async/EventKqueue.h
- at FREEBSD_TRUE@am__append_86 = msg/async/EventKqueue.h
- at ENABLE_XIO_TRUE@am__append_87 = \
+ at LINUX_TRUE@am__append_88 = libsecret.la
+ at LINUX_TRUE@am__append_89 = msg/async/EventEpoll.cc
+ at DARWIN_TRUE@am__append_90 = msg/async/EventKqueue.cc
+ at FREEBSD_TRUE@am__append_91 = msg/async/EventKqueue.cc
+ at LINUX_TRUE@am__append_92 = msg/async/EventEpoll.h
+ at DARWIN_TRUE@am__append_93 = msg/async/EventKqueue.h
+ at FREEBSD_TRUE@am__append_94 = msg/async/EventKqueue.h
+ at ENABLE_XIO_TRUE@am__append_95 = \
 @ENABLE_XIO_TRUE@	msg/xio/QueueStrategy.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioConnection.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioMessenger.cc \
@@ -421,7 +409,7 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioPool.cc
 
- at ENABLE_XIO_TRUE@am__append_88 = \
+ at ENABLE_XIO_TRUE@am__append_96 = \
 @ENABLE_XIO_TRUE@	msg/xio/DispatchStrategy.h \
 @ENABLE_XIO_TRUE@	msg/xio/FastStrategy.h \
 @ENABLE_XIO_TRUE@	msg/xio/QueueStrategy.h \
@@ -433,18 +421,17 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.h \
 @ENABLE_XIO_TRUE@	msg/xio/XioSubmit.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_89 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_97 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_api.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_90 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_98 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBOSDC) $(LIBCOMMON_DEPS)
 
- at ENABLE_CLIENT_TRUE@@WITH_LTTNG_TRUE@@WITH_RADOS_TRUE at am__append_91 = $(LIBRADOS_TP)
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_92 = -fvisibility=hidden -fvisibility-inlines-hidden
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_93 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_94 = librados.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_95 = \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_99 = -fvisibility=hidden -fvisibility-inlines-hidden
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_100 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_101 = librados.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_102 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/snap_set_diff.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/AioCompletionImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/IoCtxImpl.h \
@@ -453,18 +440,17 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/RadosXattrIter.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/ListObjectImpl.h
 
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_96 = -export-symbols-regex '^radosstriper_.*'
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_97 = libradosstriper.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_98 = \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_103 = -export-symbols-regex '^radosstriper_.*'
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_104 = libradosstriper.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_105 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/RadosStriperImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/MultiAioCompletionImpl.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_99 = librbd_internal.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_106 = librbd_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la
- at ENABLE_CLIENT_TRUE@@WITH_LTTNG_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_100 = $(LIBRBD_TP)
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_101 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_102 = librbd.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_103 = \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_107 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_108 = librbd.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_109 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioCompletion.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.h \
@@ -474,32 +460,35 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncResizeRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncTrimRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/CopyupRequest.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/DiffIterate.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/parent_types.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/SnapInfo.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/TaskFinisher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/WatchNotifyTypes.h
 
 
 # inject rgw stuff in the decoder testcase
- at ENABLE_CLIENT_TRUE@am__append_104 = \
+ at ENABLE_CLIENT_TRUE@am__append_110 = \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_dencoder.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_acl.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_common.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_env.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_json_enc.cc
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_105 = librgw.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_111 = librgw.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcivetweb.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_106 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_112 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_timeindex_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_user_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_replica_log_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
@@ -511,12 +500,12 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lfcgi \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-ldl
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_107 = radosgw \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-admin
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_108 = ceph_rgw_multiparser \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_113 = radosgw \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-admin \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-object-expirer
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_114 = ceph_rgw_multiparser \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_rgw_jsonparser
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_109 = \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/logrotate.conf \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_115 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl_s3.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl_swift.h \
@@ -538,6 +527,7 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_gc.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_metadata.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_multi_del.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_object_expirer_core.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_op.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_orphan.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_http_client.h \
@@ -574,18 +564,22 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	civetweb/include/civetweb_conf.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	civetweb/src/md5.h
 
- at ENABLE_CLIENT_TRUE@am__append_110 = libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@am__append_116 = libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_refcount_client.la \
- at ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_rbd_client.la
- at ENABLE_CLIENT_TRUE@am__append_111 = libcls_version_client.a \
+ at ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_rbd_client.la \
+ at ENABLE_CLIENT_TRUE@	libcls_cephfs_client.la \
+ at ENABLE_CLIENT_TRUE@	libcls_numops_client.la
+ at ENABLE_CLIENT_TRUE@am__append_117 = libcls_version_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_statelog_client.a \
+ at ENABLE_CLIENT_TRUE@	libcls_timeindex_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_replica_log_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_user_client.a
- at ENABLE_CLIENT_TRUE@am__append_112 = \
+ at ENABLE_CLIENT_TRUE@am__append_118 = \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_types.h \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_ops.h \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_client.h \
+ at ENABLE_CLIENT_TRUE@	cls/numops/cls_numops_client.h \
 @ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd.h \
 @ENABLE_CLIENT_TRUE@	cls/rbd/cls_rbd_client.h \
 @ENABLE_CLIENT_TRUE@	cls/refcount/cls_refcount_ops.h \
@@ -599,6 +593,9 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@	cls/statelog/cls_statelog_types.h \
 @ENABLE_CLIENT_TRUE@	cls/statelog/cls_statelog_ops.h \
 @ENABLE_CLIENT_TRUE@	cls/statelog/cls_statelog_client.h \
+ at ENABLE_CLIENT_TRUE@	cls/timeindex/cls_timeindex_types.h \
+ at ENABLE_CLIENT_TRUE@	cls/timeindex/cls_timeindex_ops.h \
+ at ENABLE_CLIENT_TRUE@	cls/timeindex/cls_timeindex_client.h \
 @ENABLE_CLIENT_TRUE@	cls/replica_log/cls_replica_log_types.h \
 @ENABLE_CLIENT_TRUE@	cls/replica_log/cls_replica_log_ops.h \
 @ENABLE_CLIENT_TRUE@	cls/replica_log/cls_replica_log_client.h \
@@ -607,52 +604,59 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@	cls/rgw/cls_rgw_types.h \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_client.h \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_ops.h \
- at ENABLE_CLIENT_TRUE@	cls/user/cls_user_types.h
+ at ENABLE_CLIENT_TRUE@	cls/user/cls_user_types.h \
+ at ENABLE_CLIENT_TRUE@	cls/cephfs/cls_cephfs.h \
+ at ENABLE_CLIENT_TRUE@	cls/cephfs/cls_cephfs_client.h
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_113 = libcls_hello.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_119 = libcls_hello.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_numops.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_rbd.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_lock.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_refcount.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_version.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_log.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_statelog.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_timeindex.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_replica_log.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_user.la \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_rgw.la
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_114 = libcls_kvs.la
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_115 = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_rgw.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_cephfs.la
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_120 = libcls_kvs.la
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_121 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/key_value_structure.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/kv_flat_btree_async.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/kvs_arg_types.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_116 = librbd_replay.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_117 = rbd_replay/BoundedBuffer.hpp \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_122 = rbd_replay/ActionTypes.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/actions.hpp \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Deser.hpp \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BoundedBuffer.hpp \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BufferReader.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/ImageNameMap.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/ios.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/PendingIO.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_loc.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_replay_debug.hpp \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.hpp \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Ser.hpp
-
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_118 = rbd-replay
- at ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_119 = rbd-replay-prep
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_120 = \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/test-erasure-code.sh
-
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_121 = test/erasure-code/ceph_erasure_code_benchmark.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.hpp
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_123 = librbd_replay_types.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_124 = librbd_replay_types.la
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_125 = rbd-replay
+ at ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_126 = rbd-replay-prep
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_127 = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/test-erasure-code.sh \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/test-erasure-eio.sh
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_128 = test/erasure-code/ceph_erasure_code_benchmark.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code_benchmark.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ErasureCodeExample.h
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_122 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_123 = ceph_erasure_code_benchmark \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_129 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_130 = ceph_erasure_code_benchmark \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph_erasure_code
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_124 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_125 = ceph_erasure_code_non_regression
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_126 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_127 = libec_example.la \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_131 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_132 = ceph_erasure_code_non_regression
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_133 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_134 = libec_example.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_missing_entry_point.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_missing_version.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_hangs.la \
@@ -661,60 +665,69 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_jerasure_neon.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_jerasure_sse4.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_jerasure_sse3.la \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_jerasure_generic.la
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_128 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_129 = unittest_erasure_code_plugin \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_jerasure_generic.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_neon.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_sse4.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_sse3.la \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_generic.la
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_135 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_136 = unittest_erasure_code_plugin \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_jerasure \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_jerasure
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_130 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_131 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_132 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_133 = unittest_erasure_code_isa \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_137 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_138 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_139 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_140 = unittest_erasure_code_isa \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_isa
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_134 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_135 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_136 =  \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at am__append_141 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_142 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_143 =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_lrc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_lrc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_all \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_thread \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_arguments \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_shec \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_example
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_137 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_138 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_139 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_140 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_141 = test/messenger/message_helper.h \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_144 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_145 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_146 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_147 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_148 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_149 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_150 = test/messenger/message_helper.h \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_dispatcher.h \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.h
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_142 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_143 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_144 = simple_server \
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_151 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_152 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__append_153 = simple_server \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	simple_client xio_server \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	xio_client
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_145 = -ldl
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_146 = -ldl
- at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_147 = -fno-var-tracking-assignments
- at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_148 = -fno-var-tracking-assignments
- at ENABLE_CLIENT_TRUE@am__append_149 = ceph-dencoder
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_150 = libradostest.la \
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_154 = -ldl
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE at am__append_155 = -ldl
+ at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_156 = -fno-var-tracking-assignments
+ at COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE at am__append_157 = -fno-var-tracking-assignments
+ at ENABLE_CLIENT_TRUE@am__append_158 = ceph-dencoder
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_159 = libradostest.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_test_stub.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_151 = ceph_test_rados \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_160 = ceph_test_rados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_mutate
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at am__append_152 = test_build_librados
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_153 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at am__append_161 = test_build_librados
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_162 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_smalliobench \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_omapbench
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_154 = ceph_kvstorebench \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_omapbench \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_objectstore_bench
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__append_163 = ceph_kvstorebench \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_list_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_open_pools_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_delete_pools_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_watch_notify
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_155 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_164 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_156 =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_165 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_multi_stress_watch \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_refcount \
@@ -724,6 +737,7 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_replica_log \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_lock \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_hello \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_numops \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_cmd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_io \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_c_write_operations \
@@ -740,7 +754,8 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_tier \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_lock \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_stress_watch
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_157 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_166 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/LibradosTestStub.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestClassHandler.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestRadosClient.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestMemRadosClient.h \
@@ -748,43 +763,57 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestMemIoCtxImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestIoCtxImpl.h
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_158 = ceph_smalliobenchrbd \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_167 = ceph_smalliobenchrbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph_test_librbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph_test_librbd_api
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_159 = unittest_rbd_replay \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	unittest_librbd
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_160 = librbd_test.la
- at ENABLE_CLIENT_TRUE@@WITH_LTTNG_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_161 = $(LIBRBD_TP)
- at ENABLE_CLIENT_TRUE@@WITH_LTTNG_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_162 = $(LIBRBD_TP)
- at ENABLE_CLIENT_TRUE@@WITH_LTTNG_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_163 = $(LIBRBD_TP)
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_164 = ceph_test_librbd_fsx
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_165 = libradosstripertest.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_166 = ceph_test_rados_striper_api_io \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_168 = unittest_rbd_replay
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_169 = librbd_test.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_170 = unittest_librbd
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_171 = test/run-rbd-unit-tests.sh
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_172 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_fixture.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_fixture.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_support.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockContextWQ.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageCtx.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageWatcher.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockObjectMap.h
+
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_173 = ceph_test_librbd_fsx
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_174 = libradosstripertest.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__append_175 = ceph_test_rados_striper_api_io \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_striper_api_aio \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_striper_api_striping
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_167 = test_build_libcephfs
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_168 = unittest_encoding \
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_176 = test_build_libcephfs
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_177 = unittest_encoding \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_base64 \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_run_cmd \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_simple_spin \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_libcephfs_config
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_169 = ceph_test_libcephfs \
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_178 = test/libcephfs/flock.cc
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_179 = ceph_test_libcephfs \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	ceph_test_c_headers
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_170 = test_build_librgw
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_171 = ceph_test_cors \
+ at CLANG_FALSE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_180 = -Werror -Wold-style-declaration
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_181 = test_build_librgw
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__append_182 = ceph_test_cors \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_manifest \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_obj \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_meta \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_log \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_opstate \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw
- at ENABLE_SERVER_TRUE@am__append_172 = ceph_test_async_driver \
+ at ENABLE_SERVER_TRUE@am__append_183 = ceph_test_async_driver \
 @ENABLE_SERVER_TRUE@	ceph_test_msgr ceph_streamtest \
 @ENABLE_SERVER_TRUE@	ceph_test_trans ceph_test_mon_workloadgen \
- at ENABLE_SERVER_TRUE@	ceph_test_mon_msg ceph_perf_objectstore
- at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_173 =  \
+ at ENABLE_SERVER_TRUE@	ceph_test_mon_msg ceph_perf_objectstore \
+ at ENABLE_SERVER_TRUE@	ceph_perf_local ceph_perf_msgr_server \
+ at ENABLE_SERVER_TRUE@	ceph_perf_msgr_client
+ at ENABLE_SERVER_TRUE@am__append_184 = test/perf_helper.h
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_185 =  \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_objectstore \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_keyvaluedb \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_filestore
- at ENABLE_SERVER_TRUE@am__append_174 =  \
+ at ENABLE_SERVER_TRUE@am__append_186 =  \
 @ENABLE_SERVER_TRUE@	ceph_test_objectstore_workloadgen \
 @ENABLE_SERVER_TRUE@	ceph_test_filestore_idempotent \
 @ENABLE_SERVER_TRUE@	ceph_test_filestore_idempotent_sequence \
@@ -792,460 +821,567 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
 @ENABLE_SERVER_TRUE@	ceph_test_object_map \
 @ENABLE_SERVER_TRUE@	ceph_test_keyvaluedb_atomicity \
 @ENABLE_SERVER_TRUE@	ceph_test_keyvaluedb_iterators
- at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE at am__append_175 = ceph_smalliobenchfs \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE at am__append_187 = ceph_smalliobenchfs \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	ceph_smalliobenchdumb \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	ceph_tpbench
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_176 = ceph_test_keys
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_177 = get_command_descriptions
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_178 =  \
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_188 = ceph_test_keys
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_189 = get_command_descriptions
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_190 =  \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	unittest_mon_moncap \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	unittest_mon_pgmap
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_179 =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_191 =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_ecbackend \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osdscrub \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pglog \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_hitset \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osd_osdcap
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_180 = -ldl
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_181 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_182 = ceph_test_snap_mapper
- at ENABLE_SERVER_TRUE@am__append_183 = unittest_chain_xattr \
- at ENABLE_SERVER_TRUE@	unittest_flatindex unittest_lfnindex
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_184 = unittest_mds_authcap
- at WITH_BUILD_TESTS_TRUE@am__append_185 = test_build_libcommon
- at LINUX_TRUE@am__append_186 = libsystest.la
- at ENABLE_ROOT_MAKE_CHECK_TRUE@am__append_187 = test/ceph-disk-root.sh
- at ENABLE_ROOT_MAKE_CHECK_FALSE@am__append_188 = test/ceph-disk.sh
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_189 =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osd_osdcap \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pageset
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_192 = -ldl
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_193 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_194 = ceph_test_snap_mapper
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_195 = unittest_rocksdb_option_static
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_196 = unittest_rocksdb_option
+ at ENABLE_SERVER_TRUE@am__append_197 = unittest_chain_xattr \
+ at ENABLE_SERVER_TRUE@	unittest_lfnindex
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_198 = unittest_mds_authcap
+ at WITH_BUILD_TESTS_TRUE@am__append_199 = test_build_libcommon
+ at LINUX_TRUE@am__append_200 = libsystest.la
+ at LINUX_TRUE@am__append_201 = unittest_blkdev
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_202 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_scratchtool \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_scratchtoolpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_radosacl
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_190 = rados
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_191 = ceph-client-debug
- at ENABLE_SERVER_TRUE@am__append_192 = ceph-osdomap-tool \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_203 = rados
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_204 = ceph-client-debug
+ at ENABLE_SERVER_TRUE@am__append_205 = ceph-osdomap-tool \
 @ENABLE_SERVER_TRUE@	ceph-monstore-tool ceph-kvstore-tool
- at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_193 = -ldl
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_194 = ceph-objectstore-tool
- at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am__append_195 = cephfs-journal-tool \
- at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool
- at WITH_REST_BENCH_TRUE@am__append_196 = rest-bench
- at WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_TRUE at am__append_197 = -ls3
- at WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE at am__append_198 = libs3/build/lib/libs3.a -lcurl -lxml2
- at WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE at am__append_199 = libs3
- at WITH_SLIBROCKSDB_TRUE@am__append_200 = rocksdb
- at WITH_SLIBROCKSDB_FALSE@am__append_201 = \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/.arcconfig \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/.clang-format \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/.gitignore \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at am__append_206 = -ldl
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_207 = ceph-objectstore-tool
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am__append_208 = cephfs-journal-tool \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-data-scan
+ at WITH_SLIBROCKSDB_TRUE@am__append_209 = rocksdb
+ at WITH_SLIBROCKSDB_FALSE@am__append_210 = \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/.gitignore \
 @WITH_SLIBROCKSDB_FALSE@        rocksdb/CONTRIBUTING.md \
 @WITH_SLIBROCKSDB_FALSE@        rocksdb/HISTORY.md \
 @WITH_SLIBROCKSDB_FALSE@        rocksdb/INSTALL.md \
 @WITH_SLIBROCKSDB_FALSE@        rocksdb/LICENSE \
 @WITH_SLIBROCKSDB_FALSE@        rocksdb/Makefile.am \
 @WITH_SLIBROCKSDB_FALSE@        rocksdb/PATENTS \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/README \
+ at WITH_SLIBROCKSDB_FALSE@        rocksdb/README.md \
 @WITH_SLIBROCKSDB_FALSE@        rocksdb/ROCKSDB_LITE.md \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/build_detect_platform \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/build_detect_version \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/fbcode.clang31.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/fbcode.gcc471.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/fbcode.gcc481.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/format-diff.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/mac-install-gflags.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/make_new_version.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/regression_build_test.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/build_tools/valgrind_test.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/configure.ac \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/coverage/coverage_test.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/coverage/parse_gcov_output.py \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/builder.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/builder.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/c.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/c_test.c \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/column_family.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/column_family.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/column_family_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/compaction.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/compaction.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/compaction_picker.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/compaction_picker.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/corruption_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_filesnapshot.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_impl.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_impl_debug.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_impl_readonly.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_impl_readonly.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_iter.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_iter.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_stats_logger.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/db_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/dbformat.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/dbformat.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/dbformat_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/deletefile_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/file_indexer.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/file_indexer.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/file_indexer_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/filename.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/filename.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/filename_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/internal_stats.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/internal_stats.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/log_and_apply_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/log_format.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/log_reader.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/log_reader.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/log_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/log_writer.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/log_writer.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/memtable.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/memtable.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/memtable_list.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/memtable_list.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/merge_context.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/merge_helper.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/merge_helper.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/merge_operator.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/merge_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/perf_context_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/plain_table_db_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/prefix_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/repair.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/simple_table_db_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/skiplist.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/skiplist_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/snapshot.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/table_cache.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/table_cache.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/table_properties_collector.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/table_properties_collector.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/table_properties_collector_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/tailing_iter.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/tailing_iter.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/transaction_log_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/transaction_log_impl.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/version_edit.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/version_edit.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/version_edit_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/version_set.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/version_set.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/version_set_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/write_batch.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/write_batch_internal.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/db/write_batch_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/doc/doc.css \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/doc/index.html \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/doc/log_format.txt \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/doc/rockslogo.jpg \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/doc/rockslogo.png \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/hdfs/README \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/hdfs/env_hdfs.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/hdfs/hdfs.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/hdfs/libhdfs.a \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/helpers/memenv/memenv.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/helpers/memenv/memenv_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/c.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/cache.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/compaction_filter.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/comparator.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/db.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/env.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/filter_policy.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/flush_block_policy.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/iterator.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/ldb_tool.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/memtablerep.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/merge_operator.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/options.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/perf_context.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/slice.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/slice_transform.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/statistics.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/status.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/table.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/table_properties.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/transaction_log.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/types.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/universal_compaction.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/version.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/rocksdb/write_batch.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/utilities/backupable_db.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/utilities/db_ttl.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/utilities/geo_db.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/utilities/stackable_db.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/include/utilities/utility_db.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/Makefile \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/RocksDBSample.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/jdb_bench.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/BackupableDB.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/BackupableDBOptions.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/BloomFilter.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/Filter.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/HashLinkedListMemTableConfig.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/HashSkipListMemTableConfig.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/HistogramData.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/HistogramType.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/Iterator.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/MemTableConfig.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/Options.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/PlainTableConfig.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/ReadOptions.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/RocksDB.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/RocksDBException.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/RocksObject.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/SkipListMemTableConfig.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/Statistics.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/TableFormatConfig.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/TickerType.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/VectorMemTableConfig.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/WriteBatch.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/WriteBatchTest.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/WriteOptions.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/benchmark/DbBenchmark.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/test/BackupableDBTest.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/test/OptionsTest.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/test/ReadOptionsTest.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/util/Environment.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/org/rocksdb/util/SizeUnit.java \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/backupablejni.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/filter.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/iterator.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/memtablejni.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/options.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/portal.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/rocksjni.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/statistics.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/table.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/java/rocksjni/write_batch.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/linters/__phutil_library_init__.php \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/linters/__phutil_library_map__.php \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/linters/cpp_linter/ArcanistCpplintLinter.php \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/linters/cpp_linter/FbcodeCppLinter.php \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/linters/cpp_linter/PfffCppLinter.php \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/linters/cpp_linter/cpplint.py \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/linters/lint_engine/FacebookFbcodeLintEngine.php \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/m4/libtool.m4 \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/m4/ltoptions.m4 \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/m4/ltsugar.m4 \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/m4/ltversion.m4 \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/m4/lt~obsolete.m4 \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/README \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/atomic_pointer.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/likely.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/port.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/port_example.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/port_posix.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/port_posix.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/stack_trace.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/stack_trace.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/port/win/stdint.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_based_table_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_based_table_builder.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_based_table_factory.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_based_table_factory.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_based_table_reader.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_based_table_reader.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_builder.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_hash_index.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_hash_index.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_hash_index_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/block_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/filter_block.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/filter_block.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/filter_block_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/flush_block_policy.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/format.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/format.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/iter_heap.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/iterator.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/iterator_wrapper.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/merger.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/merger.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/meta_blocks.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/meta_blocks.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/plain_table_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/plain_table_builder.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/plain_table_factory.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/plain_table_factory.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/plain_table_reader.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/plain_table_reader.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/table_builder.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/table_properties.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/table_reader.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/table_reader_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/table_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/two_level_iterator.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/table/two_level_iterator.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/auto_sanity_test.sh \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/blob_store_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/db_crashtest.py \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/db_crashtest2.py \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/db_repl_stress.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/db_sanity_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/db_stress.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/ldb.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/ldb_test.py \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/reduce_levels_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/tools/sst_dump.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/arena.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/arena.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/arena_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/auto_roll_logger.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/auto_roll_logger.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/auto_roll_logger_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/autovector.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/autovector_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/benchharness.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/benchharness.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/benchharness_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/blob_store.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/blob_store.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/blob_store_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/bloom.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/bloom_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/build_version.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/cache.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/cache_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/coding.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/coding.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/coding_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/comparator.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/crc32c.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/crc32c.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/crc32c_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/dynamic_bloom.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/dynamic_bloom.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/dynamic_bloom_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/env.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/env_hdfs.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/env_posix.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/env_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/filelock_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/filter_policy.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash_cuckoo_rep.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash_cuckoo_rep.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash_linklist_rep.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash_linklist_rep.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash_skiplist_rep.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/hash_skiplist_rep.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/histogram.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/histogram.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/histogram_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/ldb_cmd.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/ldb_cmd.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/ldb_cmd_execute_result.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/ldb_tool.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/log_buffer.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/log_buffer.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/log_write_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/logging.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/logging.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/manual_compaction_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/murmurhash.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/murmurhash.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/mutexlock.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/options.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/perf_context.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/perf_context_imp.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/posix_logger.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/random.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/signal_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/skiplistrep.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/slice.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/statistics.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/statistics.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/stats_logger.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/status.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/stl_wrappers.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/stop_watch.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/string_util.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/string_util.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/sync_point.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/sync_point.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/testharness.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/testharness.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/testutil.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/testutil.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/thread_local.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/thread_local.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/thread_local_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/vectorrep.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/xxhash.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/util/xxhash.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/backupable/backupable_db.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/backupable/backupable_db_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/geodb/geodb_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/geodb/geodb_impl.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/geodb/geodb_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators/put.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators/string_append/stringappend.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators/string_append/stringappend.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators/string_append/stringappend2.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/merge_operators/uint64add.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/redis/README \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/redis/redis_list_exception.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/redis/redis_list_iterator.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/redis/redis_lists.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/redis/redis_lists.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/redis/redis_lists_test.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/ttl/db_ttl_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/ttl/db_ttl_impl.h \
- at WITH_SLIBROCKSDB_FALSE@        rocksdb/utilities/ttl/ttl_test.cc
-
- at ENABLE_CLIENT_TRUE@am__append_202 = pybind/ceph_argparse.py
- at ENABLE_CLIENT_TRUE@am__append_203 = ceph-syn
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_204 = \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/AUTHORS \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/configure.ac \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/CONTRIBUTING.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/builder.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/c.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/column_family.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/column_family.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/column_family_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compact_files_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_job.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_job.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_job_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_picker.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_picker.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_picker_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/comparator_db_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/corruption_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/c_test.c \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/cuckoo_table_db_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_bench.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_filesnapshot.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/dbformat.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/dbformat.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/dbformat_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_debug.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_experimental.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_readonly.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_readonly.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_iter.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_iter.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_iter_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/deletefile_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/event_logger_helpers.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/event_logger_helpers.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/experimental.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/fault_injection_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/file_indexer.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/file_indexer.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/file_indexer_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/filename.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/filename.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/filename_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_job.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_job.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_job_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_scheduler.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_scheduler.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/forward_iterator.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/forward_iterator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/internal_stats.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/internal_stats.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/job_context.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/listener_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_format.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_reader.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_reader.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_writer.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_writer.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/managed_iterator.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/managed_iterator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_allocator.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_allocator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_list.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_list.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_list_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtablerep_bench.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_context.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_helper.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_helper.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_operator.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/perf_context_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/plain_table_db_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/prefix_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/repair.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/skiplist.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/skiplist_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/slice.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/snapshot.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_cache.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_cache.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_properties_collector.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_properties_collector.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_properties_collector_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/transaction_log_impl.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/transaction_log_impl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_builder.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_builder_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_edit.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_edit.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_edit_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_set.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_set.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_set_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/wal_manager.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/wal_manager.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/wal_manager_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch_base.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch_internal.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/writebuffer.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_controller.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_controller.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_controller_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_thread.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_thread.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/doc.css \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/index.html \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/log_format.txt \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/rockslogo.jpg \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/rockslogo.png \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/column_families_example.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/compact_files_example.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/c_simple_example.c \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/.gitignore \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/Makefile \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/README.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/simple_example.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/hdfs/env_hdfs.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/hdfs/README \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/hdfs/setup.sh \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/HISTORY.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/cache.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/c.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/compaction_filter.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/comparator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/env.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/experimental.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/filter_policy.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/flush_block_policy.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/immutable_options.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/iostats_context.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/iterator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/ldb_tool.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/listener.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/memtablerep.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/merge_operator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/metadata.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/options.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/perf_context.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/rate_limiter.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/slice.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/slice_transform.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/sst_dump_tool.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/statistics.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/status.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/table.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/table_properties.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/thread_status.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/transaction_log.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/types.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/universal_compaction.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/backupable_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/checkpoint.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/convenience.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/db_ttl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/document_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/flashcache.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/geo_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/json_document.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/leveldb_options.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/spatial_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/stackable_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/utility_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/write_batch_with_index.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/version.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/write_batch_base.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/write_batch.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/backupable_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/db_ttl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/document_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/geo_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/json_document.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/stackable_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/utility_db.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/INSTALL.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/LICENSE \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/libtool.m4 \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/lt~obsolete.m4 \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/ltoptions.m4 \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/ltsugar.m4 \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/ltversion.m4 \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/Makefile.am \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/PATENTS \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/likely.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port_example.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port_posix.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port_posix.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/README \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/stack_trace.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/stack_trace.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/win/stdint.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/README.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/ROCKSDB_LITE.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/adaptive_table_factory.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/adaptive_table_factory.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_filter_block.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_filter_block.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_filter_block_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_builder.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_factory.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_factory.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_reader.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_reader.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_builder.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_hash_index.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_hash_index.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_hash_index_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_prefix_index.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_prefix_index.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/bloom_block.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/bloom_block.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_builder.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_builder_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_factory.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_factory.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_reader.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_reader.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_reader_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/filter_block.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/flush_block_policy.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/format.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/format.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/full_filter_block.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/full_filter_block.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/full_filter_block_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/get_context.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/get_context.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/iterator.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/iterator_wrapper.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/iter_heap.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/merger.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/merger.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/merger_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/meta_blocks.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/meta_blocks.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/mock_table.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/mock_table.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_builder.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_factory.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_factory.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_index.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_index.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_key_coding.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_key_coding.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_reader.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_reader.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_builder.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_properties.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_properties_internal.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_reader_bench.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_reader.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/two_level_iterator.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/two_level_iterator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/COMMIT.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonDocument.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonJsonParser.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonStream.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonUtil.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonWriter.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/flashcache/flashcache_ioctl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/USERS.md \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/allocator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/arena.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/arena.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/arena_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/auto_roll_logger.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/auto_roll_logger.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/auto_roll_logger_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/autovector.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/autovector_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/bloom.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/bloom_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/build_version.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/cache_bench.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/cache.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/cache_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/coding.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/coding.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/coding_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/comparator.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/compression.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/crc32c.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/crc32c.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/crc32c_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/db_info_dumper.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/db_info_dumper.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/dynamic_bloom.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/dynamic_bloom.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/dynamic_bloom_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env_hdfs.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env_posix.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/event_logger.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/event_logger.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/event_logger_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/filelock_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/file_util.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/file_util.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/filter_policy.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_cuckoo_rep.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_cuckoo_rep.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_linklist_rep.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_linklist_rep.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_skiplist_rep.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_skiplist_rep.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/histogram.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/histogram.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/histogram_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/instrumented_mutex.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/instrumented_mutex.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/iostats_context.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/iostats_context_imp.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/backupable/backupable_db.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/backupable/backupable_db_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/checkpoint/checkpoint.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/compacted_db/compacted_db_impl.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/compacted_db/compacted_db_impl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/convenience/convenience.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/document_db.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/document_db_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/json_document_builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/json_document.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/json_document_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/flashcache/flashcache.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/flashcache/flashcache.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/geodb/geodb_impl.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/geodb/geodb_impl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/geodb/geodb_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/leveldb_options/leveldb_options.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/put.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend2.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/uint64add.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/README \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_list_exception.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_list_iterator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_lists.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_lists.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_lists_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/spatialdb/spatial_db.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/spatialdb/spatial_db_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/spatialdb/utils.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/ttl/db_ttl_impl.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/ttl/db_ttl_impl.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/ttl/ttl_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_cmd.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_cmd_execute_result.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_cmd.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_tool.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/log_buffer.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/log_buffer.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/logging.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/logging.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/log_write_bench.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/manual_compaction_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/memenv.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/memenv_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mock_env.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mock_env.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mock_env_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/murmurhash.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/murmurhash.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mutable_cf_options.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mutable_cf_options.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mutexlock.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_builder.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_helper.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_helper.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/perf_context.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/perf_context_imp.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/posix_logger.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/random.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/rate_limiter.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/rate_limiter.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/rate_limiter_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/scoped_arena_iterator.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/skiplistrep.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/slice.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/slice_transform_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sst_dump_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sst_dump_tool.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sst_dump_tool_imp.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/statistics.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/statistics.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/status.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/stl_wrappers.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/stop_watch.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/string_util.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/string_util.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sync_point.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sync_point.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testharness.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testharness.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testutil.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testutil.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_list_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_local.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_local.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_local_test.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_operation.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_impl.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_updater.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_updater_debug.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_updater.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_util.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_util_debug.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_util.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/vectorrep.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xfunc.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xfunc.h \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xxhash.cc \
+ at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xxhash.h
+
+ at WITH_LTTNG_TRUE@am__append_211 = \
+ at WITH_LTTNG_TRUE@	libosd_tp.la \
+ at WITH_LTTNG_TRUE@	libos_tp.la \
+ at WITH_LTTNG_TRUE@	librados_tp.la \
+ at WITH_LTTNG_TRUE@	librbd_tp.la
+
+ at WITH_LTTNG_TRUE@am__append_212 = \
+ at WITH_LTTNG_TRUE@	tracing/librados.h \
+ at WITH_LTTNG_TRUE@	tracing/librbd.h \
+ at WITH_LTTNG_TRUE@	tracing/objectstore.h \
+ at WITH_LTTNG_TRUE@	tracing/oprequest.h \
+ at WITH_LTTNG_TRUE@	tracing/osd.h \
+ at WITH_LTTNG_TRUE@	tracing/pg.h
+
+TESTS = $(am__EXEEXT_57) $(check_SCRIPTS)
+ at ENABLE_CLIENT_TRUE@am__append_213 = \
+ at ENABLE_CLIENT_TRUE@	pybind/ceph_argparse.py \
+ at ENABLE_CLIENT_TRUE@	pybind/ceph_daemon.py
+
+ at ENABLE_CLIENT_TRUE@am__append_214 = ceph-syn
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_215 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/bash_completion/rados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/bash_completion/radosgw-admin
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_205 = pybind/rados.py
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_206 = librados-config
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_207 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_216 = pybind/rados.py
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__append_217 = librados-config
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_218 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(srcdir)/bash_completion/rbd
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_208 = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_219 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph-rbdnamer \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd-replay-many
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_209 = pybind/rbd.py
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_210 = libkrbd.la
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_211 = rbd
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__append_212 = ceph-fuse \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@	rbd-fuse
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_213 = cephfs
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_214 = pybind/cephfs.py
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_215 = libcephfs.la
- at ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_216 = libcephfs_jni.la
- at ENABLE_SERVER_TRUE@am__append_217 = ceph-run ceph-rest-api \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_220 = pybind/rbd.py
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_221 = libkrbd.la
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_222 = rbd
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__append_223 = ceph-fuse
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__append_224 = rbd-fuse
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_225 = cephfs
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_226 = pybind/cephfs.py
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_227 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a'
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_228 = libcephfs.la
+ at ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__append_229 = libcephfs_jni.la
+ at ENABLE_SERVER_TRUE@am__append_230 = ceph-run ceph-rest-api \
 @ENABLE_SERVER_TRUE@	ceph-debugpack ceph-crush-location \
 @ENABLE_SERVER_TRUE@	ceph-coverage
- at ENABLE_SERVER_TRUE@am__append_218 = pybind/ceph_rest_api.py
- at ENABLE_SERVER_TRUE@am__append_219 = ceph-coverage init-ceph
- at ENABLE_SERVER_TRUE@am__append_220 = init-ceph
- at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_221 = mount.ceph
- at ENABLE_SERVER_TRUE@am__append_222 = mount.fuse.ceph
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_223 = ceph-mon
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_224 = \
+ at ENABLE_SERVER_TRUE@am__append_231 = pybind/ceph_rest_api.py
+ at ENABLE_SERVER_TRUE@am__append_232 = ceph-coverage init-ceph
+ at ENABLE_SERVER_TRUE@am__append_233 = init-ceph
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__append_234 = mount.ceph
+ at ENABLE_SERVER_TRUE@am__append_235 = mount.fuse.ceph
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_236 = ceph-mon
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_237 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk-prepare \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk-activate \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk-udev
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_225 = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_238 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-clsinfo
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_226 = ceph-osd
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_227 = ceph-mds
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__append_239 = ceph-osd
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__append_240 = ceph-mds
 subdir = src
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
@@ -1300,6 +1436,13 @@ am__libcls_statelog_client_a_SOURCES_DIST =  \
 @ENABLE_CLIENT_TRUE at am_libcls_statelog_client_a_OBJECTS = cls/statelog/cls_statelog_client.$(OBJEXT)
 libcls_statelog_client_a_OBJECTS =  \
 	$(am_libcls_statelog_client_a_OBJECTS)
+libcls_timeindex_client_a_AR = $(AR) $(ARFLAGS)
+libcls_timeindex_client_a_LIBADD =
+am__libcls_timeindex_client_a_SOURCES_DIST =  \
+	cls/timeindex/cls_timeindex_client.cc
+ at ENABLE_CLIENT_TRUE@am_libcls_timeindex_client_a_OBJECTS = cls/timeindex/cls_timeindex_client.$(OBJEXT)
+libcls_timeindex_client_a_OBJECTS =  \
+	$(am_libcls_timeindex_client_a_OBJECTS)
 libcls_user_client_a_AR = $(AR) $(ARFLAGS)
 libcls_user_client_a_LIBADD =
 am__libcls_user_client_a_SOURCES_DIST = cls/user/cls_user_client.cc \
@@ -1355,11 +1498,13 @@ am__installdirs = "$(DESTDIR)$(erasure_codelibdir)" \
 	"$(DESTDIR)$(libdir)" "$(DESTDIR)$(radoslibdir)" \
 	"$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" \
 	"$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(bindir)" \
-	"$(DESTDIR)$(ceph_libexecdir)" "$(DESTDIR)$(ceph_sbindir)" \
-	"$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" \
-	"$(DESTDIR)$(shell_commondir)" "$(DESTDIR)$(su_sbindir)" \
-	"$(DESTDIR)$(pythondir)" "$(DESTDIR)$(bash_completiondir)" \
-	"$(DESTDIR)$(docdir)" "$(DESTDIR)$(libcephfs_includedir)" \
+	"$(DESTDIR)$(ceph_libexecdir)" \
+	"$(DESTDIR)$(ceph_monstore_update_crushdir)" \
+	"$(DESTDIR)$(ceph_sbindir)" "$(DESTDIR)$(bindir)" \
+	"$(DESTDIR)$(sbindir)" "$(DESTDIR)$(shell_commondir)" \
+	"$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(pythondir)" \
+	"$(DESTDIR)$(bash_completiondir)" "$(DESTDIR)$(docdir)" \
+	"$(DESTDIR)$(libcephfs_includedir)" \
 	"$(DESTDIR)$(librbd_includedir)" \
 	"$(DESTDIR)$(rados_includedir)" \
 	"$(DESTDIR)$(radosstriper_includedir)"
@@ -1402,6 +1547,7 @@ libcephfs_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-rpath \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(libdir)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_DEPENDENCIES = $(LIBCEPHFS) \
+ at ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
 am__libcephfs_jni_la_SOURCES_DIST = java/native/libcephfs_jni.cc \
 	java/native/ScopedLocalRef.h java/native/JniConstants.cpp \
@@ -1441,12 +1587,37 @@ am__libclient_la_SOURCES_DIST = client/Client.cc client/Inode.cc \
 libclient_la_OBJECTS = $(am_libclient_la_OBJECTS)
 @ENABLE_CLIENT_TRUE at am_libclient_la_rpath =
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at libclient_fuse_la_DEPENDENCIES =  \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@	libclient.la
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@	libclient.la \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@	$(am__DEPENDENCIES_1)
 am__libclient_fuse_la_SOURCES_DIST = client/fuse_ll.cc
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am_libclient_fuse_la_OBJECTS =  \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@	client/fuse_ll.lo
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am_libclient_fuse_la_OBJECTS = client/libclient_fuse_la-fuse_ll.lo
 libclient_fuse_la_OBJECTS = $(am_libclient_fuse_la_OBJECTS)
+libclient_fuse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libclient_fuse_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at am_libclient_fuse_la_rpath =
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_cephfs_la_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+am__libcls_cephfs_la_SOURCES_DIST = cls/cephfs/cls_cephfs.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_cephfs_la_OBJECTS =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/cephfs/cls_cephfs.lo
+libcls_cephfs_la_OBJECTS = $(am_libcls_cephfs_la_OBJECTS)
+libcls_cephfs_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(AM_CXXFLAGS) $(CXXFLAGS) $(libcls_cephfs_la_LDFLAGS) \
+	$(LDFLAGS) -o $@
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_cephfs_la_rpath = -rpath \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(radoslibdir)
+libcls_cephfs_client_la_LIBADD =
+am__libcls_cephfs_client_la_SOURCES_DIST =  \
+	cls/cephfs/cls_cephfs_client.cc
+ at ENABLE_CLIENT_TRUE@am_libcls_cephfs_client_la_OBJECTS =  \
+ at ENABLE_CLIENT_TRUE@	cls/cephfs/cls_cephfs_client.lo
+libcls_cephfs_client_la_OBJECTS =  \
+	$(am_libcls_cephfs_client_la_OBJECTS)
+ at ENABLE_CLIENT_TRUE@am_libcls_cephfs_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_hello_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
@@ -1507,6 +1678,25 @@ libcls_log_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	-o $@
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_log_la_rpath = -rpath \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(radoslibdir)
+libcls_numops_la_LIBADD =
+am__libcls_numops_la_SOURCES_DIST = cls/numops/cls_numops.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_numops_la_OBJECTS =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/numops/cls_numops.lo
+libcls_numops_la_OBJECTS = $(am_libcls_numops_la_OBJECTS)
+libcls_numops_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(AM_CXXFLAGS) $(CXXFLAGS) $(libcls_numops_la_LDFLAGS) \
+	$(LDFLAGS) -o $@
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_numops_la_rpath = -rpath \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(radoslibdir)
+libcls_numops_client_la_LIBADD =
+am__libcls_numops_client_la_SOURCES_DIST =  \
+	cls/numops/cls_numops_client.cc
+ at ENABLE_CLIENT_TRUE@am_libcls_numops_client_la_OBJECTS =  \
+ at ENABLE_CLIENT_TRUE@	cls/numops/cls_numops_client.lo
+libcls_numops_client_la_OBJECTS =  \
+	$(am_libcls_numops_client_la_OBJECTS)
+ at ENABLE_CLIENT_TRUE@am_libcls_numops_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
@@ -1605,6 +1795,18 @@ libcls_statelog_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(LDFLAGS) -o $@
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_statelog_la_rpath =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_timeindex_la_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+am__libcls_timeindex_la_SOURCES_DIST = cls/timeindex/cls_timeindex.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_timeindex_la_OBJECTS = cls/timeindex/cls_timeindex.lo
+libcls_timeindex_la_OBJECTS = $(am_libcls_timeindex_la_OBJECTS)
+libcls_timeindex_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(AM_CXXFLAGS) $(CXXFLAGS) $(libcls_timeindex_la_LDFLAGS) \
+	$(LDFLAGS) -o $@
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_timeindex_la_rpath =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_user_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
@@ -1632,18 +1834,12 @@ libcls_version_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libcls_version_la_rpath =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
 am__DEPENDENCIES_3 = libcommon_internal.la libcommon_crc.la \
-	$(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) $(LIBCRUSH) \
-	$(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) $(am__DEPENDENCIES_1)
-libcommon_la_DEPENDENCIES = $(am__DEPENDENCIES_3) libcommon_api.la
-am_libcommon_la_OBJECTS =
+	$(am__append_84) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
+	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
+libcommon_la_DEPENDENCIES = $(am__DEPENDENCIES_3)
+am_libcommon_la_OBJECTS = common/buffer.lo
 libcommon_la_OBJECTS = $(am_libcommon_la_OBJECTS)
-libcommon_api_la_LIBADD =
-am_libcommon_api_la_OBJECTS = common/libcommon_api_la-buffer.lo
-libcommon_api_la_OBJECTS = $(am_libcommon_api_la_OBJECTS)
-libcommon_api_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libcommon_api_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
 libcommon_crc_la_LIBADD =
 am__libcommon_crc_la_SOURCES_DIST = common/sctp_crc32.c \
 	common/crc32c.cc common/crc32c_intel_baseline.c \
@@ -1660,6 +1856,16 @@ libcommon_crc_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libcommon_crc_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
 	$(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
 	-o $@
+libcommon_crc_aarch64_la_LIBADD =
+am__libcommon_crc_aarch64_la_SOURCES_DIST = common/crc32c_aarch64.c
+ at HAVE_ARMV8_CRC_TRUE@am_libcommon_crc_aarch64_la_OBJECTS = common/libcommon_crc_aarch64_la-crc32c_aarch64.lo
+libcommon_crc_aarch64_la_OBJECTS =  \
+	$(am_libcommon_crc_aarch64_la_OBJECTS)
+libcommon_crc_aarch64_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(libcommon_crc_aarch64_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+ at HAVE_ARMV8_CRC_TRUE@am_libcommon_crc_aarch64_la_rpath =
 libcommon_internal_la_LIBADD =
 am__libcommon_internal_la_SOURCES_DIST = ceph_ver.c \
 	common/DecayCounter.cc common/LogClient.cc common/LogEntry.cc \
@@ -1687,13 +1893,12 @@ am__libcommon_internal_la_SOURCES_DIST = ceph_ver.c \
 	common/ceph_frag.cc common/addr_parsing.c common/hobject.cc \
 	common/bloom_filter.cc common/linux_version.c common/module.c \
 	common/Readahead.cc common/Cycles.cc \
-	common/ContextCompletion.cc common/blkdev.cc \
-	common/address_helper.cc mon/MonCap.cc mon/MonClient.cc \
-	mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc osd/ECMsgTypes.cc \
-	osd/HitSet.cc mds/MDSMap.cc mds/inode_backtrace.cc \
-	mds/mdstypes.cc mds/flock.cc
- at WITH_RBD_TRUE@am__objects_2 = common/blkdev.lo
- at ENABLE_XIO_TRUE@am__objects_3 = common/address_helper.lo
+	common/ContextCompletion.cc common/TracepointProvider.cc \
+	common/blkdev.cc common/address_helper.cc mon/MonCap.cc \
+	mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc \
+	osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
+	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
+ at ENABLE_XIO_TRUE@am__objects_2 = common/address_helper.lo
 am_libcommon_internal_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/LogClient.lo common/LogEntry.lo \
 	common/PrebufferedStreambuf.lo common/SloppyCRCMap.lo \
@@ -1721,11 +1926,16 @@ am_libcommon_internal_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/addr_parsing.lo common/hobject.lo \
 	common/bloom_filter.lo common/linux_version.lo \
 	common/module.lo common/Readahead.lo common/Cycles.lo \
-	common/ContextCompletion.lo $(am__objects_2) $(am__objects_3) \
-	mon/MonCap.lo mon/MonClient.lo mon/MonMap.lo osd/OSDMap.lo \
-	osd/osd_types.lo osd/ECMsgTypes.lo osd/HitSet.lo mds/MDSMap.lo \
+	common/ContextCompletion.lo common/TracepointProvider.lo \
+	common/blkdev.lo $(am__objects_2) mon/MonCap.lo \
+	mon/MonClient.lo mon/MonMap.lo osd/OSDMap.lo osd/osd_types.lo \
+	osd/ECMsgTypes.lo osd/HitSet.lo mds/MDSMap.lo \
 	mds/inode_backtrace.lo mds/mdstypes.lo mds/flock.lo
 libcommon_internal_la_OBJECTS = $(am_libcommon_internal_la_OBJECTS)
+libcompressor_la_DEPENDENCIES = $(LIBCOMMON)
+am_libcompressor_la_OBJECTS = compressor/Compressor.lo \
+	compressor/AsyncCompressor.lo
+libcompressor_la_OBJECTS = $(am_libcompressor_la_OBJECTS)
 libcrush_la_LIBADD =
 am_libcrush_la_OBJECTS = crush/builder.lo crush/mapper.lo \
 	crush/crush.lo crush/hash.lo crush/CrushWrapper.lo \
@@ -1735,9 +1945,10 @@ libcrush_la_OBJECTS = $(am_libcrush_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCRUSH) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
-am__libec_example_la_SOURCES_DIST =  \
+am__libec_example_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 	test/erasure-code/ErasureCodePluginExample.cc
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_example_la_OBJECTS = test/erasure-code/libec_example_la-ErasureCodePluginExample.lo
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_example_la_OBJECTS = erasure-code/libec_example_la-ErasureCode.lo \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/libec_example_la-ErasureCodePluginExample.lo
 libec_example_la_OBJECTS = $(am_libec_example_la_OBJECTS)
 libec_example_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -1813,13 +2024,31 @@ am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s \
 	erasure-code/isa/ErasureCodeIsa.cc \
 	erasure-code/isa/ErasureCodeIsaTableCache.cc \
 	erasure-code/isa/ErasureCodePluginIsa.cc \
 	erasure-code/isa/xor_op.cc
- at WITH_BETTER_YASM_ELF64_TRUE@am__objects_4 = erasure-code/libec_isa_la-ErasureCode.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@am__objects_3 = erasure-code/libec_isa_la-ErasureCode.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_base.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_highlevel_func.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_multibinary.asm.lo \
@@ -1841,6 +2070,24 @@ am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_avx2.asm.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_avx.asm.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_sse.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx2.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_sse.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx2.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_sse.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx2.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_sse.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx2.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_sse.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx2.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_sse.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx2.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx.asm.lo \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_sse.asm.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mul_avx.asm.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mul_sse.asm.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/libec_isa_la-ErasureCodeIsa.lo \
@@ -1848,7 +2095,7 @@ am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/libec_isa_la-ErasureCodePluginIsa.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/libec_isa_la-xor_op.lo
 @WITH_BETTER_YASM_ELF64_TRUE at am_libec_isa_la_OBJECTS =  \
- at WITH_BETTER_YASM_ELF64_TRUE@	$(am__objects_4)
+ at WITH_BETTER_YASM_ELF64_TRUE@	$(am__objects_3)
 libec_isa_la_OBJECTS = $(am_libec_isa_la_OBJECTS)
 libec_isa_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -1866,7 +2113,7 @@ libec_jerasure_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_jerasure_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_jerasure_generic_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_5 = erasure-code/libec_jerasure_generic_la-ErasureCode.lo \
+am__objects_4 = erasure-code/libec_jerasure_generic_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-jerasure.lo \
@@ -1885,7 +2132,7 @@ am__objects_5 = erasure-code/libec_jerasure_generic_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_generic_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_generic_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_generic_la-ErasureCodeJerasure.lo
-am_libec_jerasure_generic_la_OBJECTS = $(am__objects_5)
+am_libec_jerasure_generic_la_OBJECTS = $(am__objects_4)
 libec_jerasure_generic_la_OBJECTS =  \
 	$(am_libec_jerasure_generic_la_OBJECTS)
 libec_jerasure_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -1894,7 +2141,7 @@ libec_jerasure_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_jerasure_generic_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_jerasure_neon_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_6 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
+am__objects_5 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-jerasure.lo \
@@ -1913,7 +2160,7 @@ am__objects_6 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_neon_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_neon_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_neon_la-ErasureCodeJerasure.lo
-am_libec_jerasure_neon_la_OBJECTS = $(am__objects_6) \
+am_libec_jerasure_neon_la_OBJECTS = $(am__objects_5) \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w4_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w8_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w16_neon.lo \
@@ -1928,7 +2175,7 @@ libec_jerasure_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_NEON_TRUE@	$(erasure_codelibdir)
 libec_jerasure_sse3_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_7 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
+am__objects_6 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-jerasure.lo \
@@ -1947,7 +2194,7 @@ am__objects_7 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_sse3_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_sse3_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_sse3_la-ErasureCodeJerasure.lo
-am_libec_jerasure_sse3_la_OBJECTS = $(am__objects_7)
+am_libec_jerasure_sse3_la_OBJECTS = $(am__objects_6)
 libec_jerasure_sse3_la_OBJECTS = $(am_libec_jerasure_sse3_la_OBJECTS)
 libec_jerasure_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -1957,7 +2204,7 @@ libec_jerasure_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSSE3_TRUE@	$(erasure_codelibdir)
 libec_jerasure_sse4_la_DEPENDENCIES = $(LIBCRUSH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_8 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
+am__objects_7 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-jerasure.lo \
@@ -1976,7 +2223,7 @@ am__objects_8 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_sse4_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_sse4_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_sse4_la-ErasureCodeJerasure.lo
-am_libec_jerasure_sse4_la_OBJECTS = $(am__objects_8)
+am_libec_jerasure_sse4_la_OBJECTS = $(am__objects_7)
 libec_jerasure_sse4_la_OBJECTS = $(am_libec_jerasure_sse4_la_OBJECTS)
 libec_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -1986,10 +2233,10 @@ libec_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSE4_PCLMUL_TRUE@	$(erasure_codelibdir)
 libec_lrc_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(LIBJSON_SPIRIT)
-am__objects_9 = erasure-code/libec_lrc_la-ErasureCode.lo \
+am__objects_8 = erasure-code/libec_lrc_la-ErasureCode.lo \
 	erasure-code/lrc/libec_lrc_la-ErasureCodePluginLrc.lo \
 	erasure-code/lrc/libec_lrc_la-ErasureCodeLrc.lo
-am_libec_lrc_la_OBJECTS = $(am__objects_9) \
+am_libec_lrc_la_OBJECTS = $(am__objects_8) \
 	common/libec_lrc_la-str_map.lo
 libec_lrc_la_OBJECTS = $(am_libec_lrc_la_OBJECTS)
 libec_lrc_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
@@ -2027,33 +2274,139 @@ libec_missing_version_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 libec_shec_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_2)
-am_libec_shec_la_OBJECTS = erasure-code/libec_shec_la-ErasureCode.lo \
-	erasure-code/shec/libec_shec_la-ErasureCodePluginShec.lo \
-	erasure-code/shec/libec_shec_la-ErasureCodeShec.lo \
-	erasure-code/shec/libec_shec_la-ErasureCodeShecTableCache.lo \
-	erasure-code/shec/libec_shec_la-shec.lo \
-	erasure-code/shec/libec_shec_la-determinant.lo \
-	erasure-code/jerasure/jerasure/src/libec_shec_la-cauchy.lo \
-	erasure-code/jerasure/jerasure/src/libec_shec_la-galois.lo \
-	erasure-code/jerasure/jerasure/src/libec_shec_la-jerasure.lo \
-	erasure-code/jerasure/jerasure/src/libec_shec_la-liberation.lo \
-	erasure-code/jerasure/jerasure/src/libec_shec_la-reed_sol.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_wgen.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_method.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w16.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w32.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w64.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w128.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_general.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w4.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_rand.lo \
-	erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w8.lo
+am_libec_shec_la_OBJECTS = erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo
 libec_shec_la_OBJECTS = $(am_libec_shec_la_OBJECTS)
 libec_shec_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(libec_shec_la_CXXFLAGS) $(CXXFLAGS) $(libec_shec_la_LDFLAGS) \
 	$(LDFLAGS) -o $@
+libec_shec_generic_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_2)
+am__objects_9 = erasure-code/libec_shec_generic_la-ErasureCode.lo \
+	erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo \
+	erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo \
+	erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo \
+	erasure-code/shec/libec_shec_generic_la-determinant.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_generic_la-cauchy.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_generic_la-galois.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_generic_la-jerasure.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_generic_la-liberation.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_generic_la-reed_sol.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_wgen.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_method.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w16.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w32.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w64.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w128.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_general.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo
+am_libec_shec_generic_la_OBJECTS = $(am__objects_9)
+libec_shec_generic_la_OBJECTS = $(am_libec_shec_generic_la_OBJECTS)
+libec_shec_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_shec_generic_la_LDFLAGS) $(LDFLAGS) -o $@
+libec_shec_neon_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_2)
+am__objects_10 = erasure-code/libec_shec_neon_la-ErasureCode.lo \
+	erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo \
+	erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo \
+	erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo \
+	erasure-code/shec/libec_shec_neon_la-determinant.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_neon_la-cauchy.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_neon_la-galois.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_neon_la-jerasure.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_neon_la-liberation.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_neon_la-reed_sol.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_wgen.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_method.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w16.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w32.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w64.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w128.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_general.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo
+am_libec_shec_neon_la_OBJECTS = $(am__objects_10) \
+	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo \
+	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo \
+	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo \
+	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w32_neon.lo \
+	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w64_neon.lo
+libec_shec_neon_la_OBJECTS = $(am_libec_shec_neon_la_OBJECTS)
+libec_shec_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_shec_neon_la_LDFLAGS) $(LDFLAGS) -o $@
+ at HAVE_NEON_TRUE@am_libec_shec_neon_la_rpath = -rpath \
+ at HAVE_NEON_TRUE@	$(erasure_codelibdir)
+libec_shec_sse3_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_2)
+am__objects_11 = erasure-code/libec_shec_sse3_la-ErasureCode.lo \
+	erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo \
+	erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo \
+	erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo \
+	erasure-code/shec/libec_shec_sse3_la-determinant.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-cauchy.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-galois.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-jerasure.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-liberation.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-reed_sol.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_wgen.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_method.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w16.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w32.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w64.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w128.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_general.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo
+am_libec_shec_sse3_la_OBJECTS = $(am__objects_11)
+libec_shec_sse3_la_OBJECTS = $(am_libec_shec_sse3_la_OBJECTS)
+libec_shec_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_shec_sse3_la_LDFLAGS) $(LDFLAGS) -o $@
+ at HAVE_SSSE3_TRUE@am_libec_shec_sse3_la_rpath = -rpath \
+ at HAVE_SSSE3_TRUE@	$(erasure_codelibdir)
+libec_shec_sse4_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_2)
+am__objects_12 = erasure-code/libec_shec_sse4_la-ErasureCode.lo \
+	erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo \
+	erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo \
+	erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo \
+	erasure-code/shec/libec_shec_sse4_la-determinant.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-cauchy.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-galois.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-jerasure.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-liberation.lo \
+	erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-reed_sol.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_wgen.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_method.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w16.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w32.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w64.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w128.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_general.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo \
+	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo
+am_libec_shec_sse4_la_OBJECTS = $(am__objects_12)
+libec_shec_sse4_la_OBJECTS = $(am_libec_shec_sse4_la_OBJECTS)
+libec_shec_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_shec_sse4_la_LDFLAGS) $(LDFLAGS) -o $@
+ at HAVE_SSE4_PCLMUL_TRUE@am_libec_shec_sse4_la_rpath = -rpath \
+ at HAVE_SSE4_PCLMUL_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_jerasure_generic_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
@@ -2114,6 +2467,66 @@ libec_test_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_jerasure_sse4_la_rpath =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_generic_la_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+am__libec_test_shec_generic_la_SOURCES_DIST =  \
+	test/erasure-code/TestShecPluginGeneric.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_generic_la_OBJECTS = test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo
+libec_test_shec_generic_la_OBJECTS =  \
+	$(am_libec_test_shec_generic_la_OBJECTS)
+libec_test_shec_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_test_shec_generic_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_test_shec_generic_la_LDFLAGS) $(LDFLAGS) -o $@
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_generic_la_rpath =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+am__libec_test_shec_neon_la_SOURCES_DIST =  \
+	test/erasure-code/TestShecPluginNEON.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_neon_la_OBJECTS = test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo
+libec_test_shec_neon_la_OBJECTS =  \
+	$(am_libec_test_shec_neon_la_OBJECTS)
+libec_test_shec_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_test_shec_neon_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_test_shec_neon_la_LDFLAGS) $(LDFLAGS) -o $@
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_neon_la_rpath =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse3_la_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+am__libec_test_shec_sse3_la_SOURCES_DIST =  \
+	test/erasure-code/TestShecPluginSSE3.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_sse3_la_OBJECTS = test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo
+libec_test_shec_sse3_la_OBJECTS =  \
+	$(am_libec_test_shec_sse3_la_OBJECTS)
+libec_test_shec_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_test_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_test_shec_sse3_la_LDFLAGS) $(LDFLAGS) -o $@
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_sse3_la_rpath =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse4_la_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+am__libec_test_shec_sse4_la_SOURCES_DIST =  \
+	test/erasure-code/TestShecPluginSSE4.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_sse4_la_OBJECTS = test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo
+libec_test_shec_sse4_la_OBJECTS =  \
+	$(am_libec_test_shec_sse4_la_OBJECTS)
+libec_test_shec_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(libec_test_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) \
+	$(libec_test_shec_sse4_la_LDFLAGS) $(LDFLAGS) -o $@
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_test_shec_sse4_la_rpath =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 am_liberasure_code_la_OBJECTS = erasure-code/ErasureCodePlugin.lo
 liberasure_code_la_OBJECTS = $(am_liberasure_code_la_OBJECTS)
 libglobal_la_DEPENDENCIES = $(LIBCOMMON)
@@ -2137,26 +2550,30 @@ am_liblog_la_OBJECTS = log/Log.lo log/SubsystemMap.lo
 liblog_la_OBJECTS = $(am_liblog_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at libmds_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(LIBMDS_DEPS)
-am__libmds_la_SOURCES_DIST = mds/Capability.cc mds/MDS.cc \
-	mds/Beacon.cc mds/locks.c mds/journal.cc mds/Server.cc \
-	mds/Mutation.cc mds/MDCache.cc mds/RecoveryQueue.cc \
-	mds/Locker.cc mds/Migrator.cc mds/MDBalancer.cc mds/CDentry.cc \
-	mds/CDir.cc mds/CInode.cc mds/LogEvent.cc mds/MDSTable.cc \
-	mds/InoTable.cc mds/JournalPointer.cc mds/MDSTableClient.cc \
-	mds/MDSTableServer.cc mds/SnapRealm.cc mds/SnapServer.cc \
-	mds/snap.cc mds/SessionMap.cc mds/MDSContext.cc \
-	mds/MDSAuthCaps.cc mds/MDLog.cc common/TrackedOp.cc
-am__objects_10 = mds/Capability.lo mds/MDS.lo mds/Beacon.lo \
-	mds/locks.lo mds/journal.lo mds/Server.lo mds/Mutation.lo \
-	mds/MDCache.lo mds/RecoveryQueue.lo mds/Locker.lo \
-	mds/Migrator.lo mds/MDBalancer.lo mds/CDentry.lo mds/CDir.lo \
-	mds/CInode.lo mds/LogEvent.lo mds/MDSTable.lo mds/InoTable.lo \
+am__libmds_la_SOURCES_DIST = mds/Capability.cc mds/MDSDaemon.cc \
+	mds/MDSRank.cc mds/Beacon.cc mds/locks.c mds/journal.cc \
+	mds/Server.cc mds/Mutation.cc mds/MDCache.cc \
+	mds/RecoveryQueue.cc mds/StrayManager.cc mds/Locker.cc \
+	mds/Migrator.cc mds/MDBalancer.cc mds/CDentry.cc mds/CDir.cc \
+	mds/CInode.cc mds/LogEvent.cc mds/MDSTable.cc mds/InoTable.cc \
+	mds/JournalPointer.cc mds/MDSTableClient.cc \
+	mds/MDSTableServer.cc mds/SimpleLock.cc mds/SnapRealm.cc \
+	mds/SnapServer.cc mds/snap.cc mds/SessionMap.cc \
+	mds/MDSContext.cc mds/MDSAuthCaps.cc mds/MDLog.cc \
+	common/TrackedOp.cc
+am__objects_13 = mds/Capability.lo mds/MDSDaemon.lo mds/MDSRank.lo \
+	mds/Beacon.lo mds/locks.lo mds/journal.lo mds/Server.lo \
+	mds/Mutation.lo mds/MDCache.lo mds/RecoveryQueue.lo \
+	mds/StrayManager.lo mds/Locker.lo mds/Migrator.lo \
+	mds/MDBalancer.lo mds/CDentry.lo mds/CDir.lo mds/CInode.lo \
+	mds/LogEvent.lo mds/MDSTable.lo mds/InoTable.lo \
 	mds/JournalPointer.lo mds/MDSTableClient.lo \
-	mds/MDSTableServer.lo mds/SnapRealm.lo mds/SnapServer.lo \
-	mds/snap.lo mds/SessionMap.lo mds/MDSContext.lo \
-	mds/MDSAuthCaps.lo mds/MDLog.lo common/TrackedOp.lo
+	mds/MDSTableServer.lo mds/SimpleLock.lo mds/SnapRealm.lo \
+	mds/SnapServer.lo mds/snap.lo mds/SessionMap.lo \
+	mds/MDSContext.lo mds/MDSAuthCaps.lo mds/MDLog.lo \
+	common/TrackedOp.lo
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am_libmds_la_OBJECTS =  \
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__objects_10)
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__objects_13)
 libmds_la_OBJECTS = $(am_libmds_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am_libmds_la_rpath =
 @WITH_LIBZFS_TRUE at am__DEPENDENCIES_4 = libos_zfs.a
@@ -2169,9 +2586,8 @@ am__DEPENDENCIES_5 = libos.la $(am__DEPENDENCIES_1) \
 am__libmon_la_SOURCES_DIST = mon/Monitor.cc mon/Paxos.cc \
 	mon/PaxosService.cc mon/OSDMonitor.cc mon/MDSMonitor.cc \
 	mon/MonmapMonitor.cc mon/PGMonitor.cc mon/LogMonitor.cc \
-	mon/AuthMonitor.cc mon/Elector.cc mon/MonitorStore.cc \
-	mon/HealthMonitor.cc mon/DataHealthService.cc \
-	mon/ConfigKeyService.cc
+	mon/AuthMonitor.cc mon/Elector.cc mon/HealthMonitor.cc \
+	mon/DataHealthService.cc mon/ConfigKeyService.cc
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am_libmon_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Monitor.lo mon/Paxos.lo \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PaxosService.lo \
@@ -2182,7 +2598,6 @@ am__libmon_la_SOURCES_DIST = mon/Monitor.cc mon/Paxos.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/LogMonitor.lo \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/AuthMonitor.lo \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Elector.lo \
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonitorStore.lo \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/HealthMonitor.lo \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DataHealthService.lo \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/ConfigKeyService.lo
@@ -2203,11 +2618,11 @@ am__libmsg_la_SOURCES_DIST = msg/Message.cc msg/Messenger.cc \
 	msg/async/EventKqueue.h msg/xio/QueueStrategy.cc \
 	msg/xio/XioConnection.cc msg/xio/XioMessenger.cc \
 	msg/xio/XioMsg.cc msg/xio/XioPortal.cc msg/xio/XioPool.cc
- at LINUX_TRUE@am__objects_11 = msg/async/EventEpoll.lo
- at DARWIN_TRUE@am__objects_12 = msg/async/EventKqueue.lo
- at FREEBSD_TRUE@am__objects_13 = msg/async/EventKqueue.lo
-am__objects_14 =
- at ENABLE_XIO_TRUE@am__objects_15 = msg/xio/QueueStrategy.lo \
+ at LINUX_TRUE@am__objects_14 = msg/async/EventEpoll.lo
+ at DARWIN_TRUE@am__objects_15 = msg/async/EventKqueue.lo
+ at FREEBSD_TRUE@am__objects_16 = msg/async/EventKqueue.lo
+am__objects_17 =
+ at ENABLE_XIO_TRUE@am__objects_18 = msg/xio/QueueStrategy.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioConnection.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioMessenger.lo msg/xio/XioMsg.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.lo msg/xio/XioPool.lo
@@ -2217,33 +2632,37 @@ am_libmsg_la_OBJECTS = msg/Message.lo msg/Messenger.lo \
 	msg/simple/PipeConnection.lo msg/simple/SimpleMessenger.lo \
 	msg/async/AsyncConnection.lo msg/async/AsyncMessenger.lo \
 	msg/async/Event.lo msg/async/net_handler.lo \
-	msg/async/EventSelect.lo $(am__objects_11) $(am__objects_12) \
-	$(am__objects_13) $(am__objects_14) $(am__objects_14) \
-	$(am__objects_14) $(am__objects_15)
+	msg/async/EventSelect.lo $(am__objects_14) $(am__objects_15) \
+	$(am__objects_16) $(am__objects_17) $(am__objects_17) \
+	$(am__objects_17) $(am__objects_18)
 libmsg_la_OBJECTS = $(am_libmsg_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__DEPENDENCIES_6 =  \
 @ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@	libcrypto.a
 @ENABLE_SERVER_TRUE at libos_la_DEPENDENCIES = $(LIBOS_TYPES) \
- at ENABLE_SERVER_TRUE@	$(am__append_31) $(am__DEPENDENCIES_6)
-am__libos_la_SOURCES_DIST = os/chain_xattr.cc os/DBObjectMap.cc \
-	os/GenericObjectMap.cc os/FileJournal.cc os/FileStore.cc \
-	os/FlatIndex.cc os/GenericFileStoreBackend.cc os/HashIndex.cc \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_6)
+am__libos_la_SOURCES_DIST = os/chain_xattr.cc os/fs/FS.cc \
+	os/DBObjectMap.cc os/GenericObjectMap.cc os/FileJournal.cc \
+	os/FileStore.cc os/GenericFileStoreBackend.cc os/HashIndex.cc \
 	os/IndexManager.cc os/JournalingObjectStore.cc \
 	os/LevelDBStore.cc os/LFNIndex.cc os/MemStore.cc \
 	os/KeyValueDB.cc os/KeyValueStore.cc os/ObjectStore.cc \
 	os/WBThrottle.cc common/TrackedOp.cc \
-	os/BtrfsFileStoreBackend.cc os/XfsFileStoreBackend.cc \
+	os/BtrfsFileStoreBackend.cc os/newstore/NewStore.cc \
+	os/fs/XFS.cc os/XfsFileStoreBackend.cc \
 	os/ZFSFileStoreBackend.cc os/KineticStore.cc
- at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__objects_16 = os/libos_la-BtrfsFileStoreBackend.lo
- at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE at am__objects_17 = os/libos_la-XfsFileStoreBackend.lo
- at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__objects_18 = os/libos_la-ZFSFileStoreBackend.lo
- at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__objects_19 = os/libos_la-KineticStore.lo
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__objects_19 = os/libos_la-BtrfsFileStoreBackend.lo
+ at ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE at am__objects_20 = os/newstore/libos_la-NewStore.lo
+ at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE at am__objects_21 =  \
+ at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@	os/fs/libos_la-XFS.lo \
+ at ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@	os/libos_la-XfsFileStoreBackend.lo
+ at ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE at am__objects_22 = os/libos_la-ZFSFileStoreBackend.lo
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__objects_23 = os/libos_la-KineticStore.lo
 @ENABLE_SERVER_TRUE at am_libos_la_OBJECTS = os/libos_la-chain_xattr.lo \
+ at ENABLE_SERVER_TRUE@	os/fs/libos_la-FS.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-DBObjectMap.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-GenericObjectMap.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-FileJournal.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-FileStore.lo \
- at ENABLE_SERVER_TRUE@	os/libos_la-FlatIndex.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-GenericFileStoreBackend.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-HashIndex.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-IndexManager.lo \
@@ -2255,10 +2674,10 @@ am__libos_la_SOURCES_DIST = os/chain_xattr.cc os/DBObjectMap.cc \
 @ENABLE_SERVER_TRUE@	os/libos_la-KeyValueStore.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-ObjectStore.lo \
 @ENABLE_SERVER_TRUE@	os/libos_la-WBThrottle.lo \
- at ENABLE_SERVER_TRUE@	os/libos_la-KeyValueDB.lo \
 @ENABLE_SERVER_TRUE@	common/libos_la-TrackedOp.lo \
- at ENABLE_SERVER_TRUE@	$(am__objects_16) $(am__objects_17) \
- at ENABLE_SERVER_TRUE@	$(am__objects_18) $(am__objects_19)
+ at ENABLE_SERVER_TRUE@	$(am__objects_19) $(am__objects_20) \
+ at ENABLE_SERVER_TRUE@	$(am__objects_21) $(am__objects_22) \
+ at ENABLE_SERVER_TRUE@	$(am__objects_23)
 libos_la_OBJECTS = $(am_libos_la_OBJECTS)
 libos_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(libos_la_CXXFLAGS) \
@@ -2275,8 +2694,23 @@ libos_rocksdb_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(LDFLAGS) -o $@
 @ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am_libos_rocksdb_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am_libos_rocksdb_la_rpath =
+libos_tp_la_DEPENDENCIES =
+am__libos_tp_la_SOURCES_DIST = tracing/objectstore.c
+ at WITH_LTTNG_TRUE@am_libos_tp_la_OBJECTS =  \
+ at WITH_LTTNG_TRUE@	tracing/libos_tp_la-objectstore.lo
+nodist_libos_tp_la_OBJECTS =
+libos_tp_la_OBJECTS = $(am_libos_tp_la_OBJECTS) \
+	$(nodist_libos_tp_la_OBJECTS)
+libos_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libos_tp_la_CFLAGS) \
+	$(CFLAGS) $(libos_tp_la_LDFLAGS) $(LDFLAGS) -o $@
+ at WITH_LTTNG_TRUE@am_libos_tp_la_rpath = -rpath $(libdir)
 libos_types_la_LIBADD =
-am_libos_types_la_OBJECTS = os/libos_types_la-Transaction.lo
+am__libos_types_la_SOURCES_DIST = os/Transaction.cc \
+	os/newstore/newstore_types.cc
+ at ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE at am__objects_24 = os/newstore/libos_types_la-newstore_types.lo
+am_libos_types_la_OBJECTS = os/libos_types_la-Transaction.lo \
+	$(am__objects_24)
 libos_types_la_OBJECTS = $(am_libos_types_la_OBJECTS)
 libos_types_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2286,14 +2720,13 @@ libos_types_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSDC) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_5) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD_TYPES) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS_TYPES) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_45)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS_TYPES)
 am__libosd_la_SOURCES_DIST = osd/PG.cc osd/ReplicatedPG.cc \
 	osd/ReplicatedBackend.cc osd/ECBackend.cc osd/ECMsgTypes.cc \
-	osd/ECTransaction.cc osd/PGBackend.cc osd/Ager.cc \
-	osd/HitSet.cc osd/OSD.cc osd/OSDCap.cc osd/Watch.cc \
-	osd/ClassHandler.cc osd/OpRequest.cc common/TrackedOp.cc \
-	osd/SnapMapper.cc objclass/class_api.cc
+	osd/ECTransaction.cc osd/PGBackend.cc osd/HitSet.cc osd/OSD.cc \
+	osd/OSDCap.cc osd/Watch.cc osd/ClassHandler.cc \
+	osd/OpRequest.cc common/TrackedOp.cc osd/SnapMapper.cc \
+	objclass/class_api.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libosd_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-PG.lo \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ReplicatedPG.lo \
@@ -2302,7 +2735,6 @@ am__libosd_la_SOURCES_DIST = osd/PG.cc osd/ReplicatedPG.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ECMsgTypes.lo \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ECTransaction.lo \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-PGBackend.lo \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-Ager.lo \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-HitSet.lo \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-OSD.lo \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-OSDCap.lo \
@@ -2317,6 +2749,20 @@ libosd_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(libosd_la_CXXFLAGS) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libosd_la_rpath =
+libosd_tp_la_DEPENDENCIES =
+am__libosd_tp_la_SOURCES_DIST = tracing/oprequest.c tracing/osd.c \
+	tracing/pg.c
+ at WITH_LTTNG_TRUE@am_libosd_tp_la_OBJECTS =  \
+ at WITH_LTTNG_TRUE@	tracing/libosd_tp_la-oprequest.lo \
+ at WITH_LTTNG_TRUE@	tracing/libosd_tp_la-osd.lo \
+ at WITH_LTTNG_TRUE@	tracing/libosd_tp_la-pg.lo
+nodist_libosd_tp_la_OBJECTS =
+libosd_tp_la_OBJECTS = $(am_libosd_tp_la_OBJECTS) \
+	$(nodist_libosd_tp_la_OBJECTS)
+libosd_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libosd_tp_la_CFLAGS) \
+	$(CFLAGS) $(libosd_tp_la_LDFLAGS) $(LDFLAGS) -o $@
+ at WITH_LTTNG_TRUE@am_libosd_tp_la_rpath = -rpath $(libdir)
 libosd_types_la_LIBADD =
 am_libosd_types_la_OBJECTS = osd/libosd_types_la-PGLog.lo \
 	osd/libosd_types_la-osd_types.lo osd/libosd_types_la-ECUtil.lo
@@ -2333,20 +2779,20 @@ libperfglue_la_DEPENDENCIES =
 am__libperfglue_la_SOURCES_DIST = perfglue/heap_profiler.cc \
 	perfglue/disabled_heap_profiler.cc perfglue/cpu_profiler.cc \
 	perfglue/disabled_stubs.cc
- at WITH_TCMALLOC_TRUE@am__objects_20 = perfglue/heap_profiler.lo
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__objects_21 = perfglue/heap_profiler.lo
- at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__objects_22 = perfglue/disabled_heap_profiler.lo
- at WITH_PROFILER_TRUE@am__objects_23 = perfglue/cpu_profiler.lo
- at WITH_PROFILER_FALSE@am__objects_24 = perfglue/disabled_stubs.lo
-am_libperfglue_la_OBJECTS = $(am__objects_20) $(am__objects_21) \
-	$(am__objects_22) $(am__objects_23) $(am__objects_24)
+ at WITH_TCMALLOC_TRUE@am__objects_25 = perfglue/heap_profiler.lo
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at am__objects_26 = perfglue/heap_profiler.lo
+ at WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE at am__objects_27 = perfglue/disabled_heap_profiler.lo
+ at WITH_PROFILER_TRUE@am__objects_28 = perfglue/cpu_profiler.lo
+ at WITH_PROFILER_FALSE@am__objects_29 = perfglue/disabled_stubs.lo
+am_libperfglue_la_OBJECTS = $(am__objects_25) $(am__objects_26) \
+	$(am__objects_27) $(am__objects_28) $(am__objects_29)
 libperfglue_la_OBJECTS = $(am_libperfglue_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__DEPENDENCIES_7 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBOSDC) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
-am__DEPENDENCIES_8 = $(am__DEPENDENCIES_7) $(am__append_91)
+am__DEPENDENCIES_8 = $(am__DEPENDENCIES_7)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at librados_la_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
@@ -2398,8 +2844,27 @@ am__librados_test_stub_la_SOURCES_DIST =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestWatchNotify.lo
 librados_test_stub_la_OBJECTS = $(am_librados_test_stub_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am_librados_test_stub_la_rpath =
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__DEPENDENCIES_9 = $(am__DEPENDENCIES_8)
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_DEPENDENCIES = $(am__DEPENDENCIES_9)
+librados_tp_la_DEPENDENCIES =
+am__librados_tp_la_SOURCES_DIST = tracing/librados.c
+ at WITH_LTTNG_TRUE@am_librados_tp_la_OBJECTS =  \
+ at WITH_LTTNG_TRUE@	tracing/librados_tp_la-librados.lo
+nodist_librados_tp_la_OBJECTS =
+librados_tp_la_OBJECTS = $(am_librados_tp_la_OBJECTS) \
+	$(nodist_librados_tp_la_OBJECTS)
+librados_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(librados_tp_la_CFLAGS) $(CFLAGS) $(librados_tp_la_LDFLAGS) \
+	$(LDFLAGS) -o $@
+ at WITH_LTTNG_TRUE@am_librados_tp_la_rpath = -rpath $(libdir)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at am__DEPENDENCIES_9 = librados_internal.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBOSDC) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_DEPENDENCIES = $(am__DEPENDENCIES_9) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
 am__libradosstriper_la_SOURCES_DIST =  \
 	libradosstriper/libradosstriper.cc \
 	libradosstriper/RadosStriperImpl.cc \
@@ -2444,8 +2909,7 @@ libradostest_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_1) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_2) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_100)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_2)
 am__librbd_la_SOURCES_DIST = librbd/librbd.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_la_OBJECTS = librbd/librbd_la-librbd.lo
 librbd_la_OBJECTS = $(am_librbd_la_OBJECTS)
@@ -2466,8 +2930,10 @@ am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 	librbd/AsyncObjectThrottle.cc librbd/AsyncOperation.cc \
 	librbd/AsyncRequest.cc librbd/AsyncResizeRequest.cc \
 	librbd/AsyncTrimRequest.cc librbd/CopyupRequest.cc \
-	librbd/ImageCtx.cc librbd/ImageWatcher.cc librbd/internal.cc \
-	librbd/LibrbdWriteback.cc librbd/ObjectMap.cc
+	librbd/DiffIterate.cc librbd/ImageCtx.cc \
+	librbd/ImageWatcher.cc librbd/internal.cc \
+	librbd/LibrbdWriteback.cc librbd/ObjectMap.cc \
+	librbd/RebuildObjectMapRequest.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_internal_la_OBJECTS = librbd/AioCompletion.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.lo \
@@ -2477,11 +2943,13 @@ am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncResizeRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncTrimRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/CopyupRequest.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/DiffIterate.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.lo
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.lo
 librbd_internal_la_OBJECTS = $(am_librbd_internal_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_internal_la_rpath =
 am__DEPENDENCIES_10 = $(LIBGLOBAL) $(LIBCOMMON) $(am__DEPENDENCIES_1) \
@@ -2491,16 +2959,15 @@ am__DEPENDENCIES_10 = $(LIBGLOBAL) $(LIBCOMMON) $(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10)
 am__librbd_replay_la_SOURCES_DIST = rbd_replay/actions.cc \
-	rbd_replay/Deser.cc rbd_replay/ImageNameMap.cc \
+	rbd_replay/BufferReader.cc rbd_replay/ImageNameMap.cc \
 	rbd_replay/PendingIO.cc rbd_replay/rbd_loc.cc \
-	rbd_replay/Replayer.cc rbd_replay/Ser.cc
+	rbd_replay/Replayer.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_replay_la_OBJECTS = rbd_replay/actions.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Deser.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BufferReader.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/ImageNameMap.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/PendingIO.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_loc.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Ser.lo
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.lo
 librbd_replay_la_OBJECTS = $(am_librbd_replay_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_replay_la_rpath =
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_ios_la_DEPENDENCIES =  \
@@ -2512,6 +2979,11 @@ am__librbd_replay_ios_la_SOURCES_DIST = rbd_replay/ios.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_replay_ios_la_OBJECTS = rbd_replay/ios.lo
 librbd_replay_ios_la_OBJECTS = $(am_librbd_replay_ios_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_replay_ios_la_rpath =
+librbd_replay_types_la_LIBADD =
+am__librbd_replay_types_la_SOURCES_DIST = rbd_replay/ActionTypes.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_replay_types_la_OBJECTS = rbd_replay/ActionTypes.lo
+librbd_replay_types_la_OBJECTS = $(am_librbd_replay_types_la_OBJECTS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_replay_types_la_rpath =
 librbd_test_la_LIBADD =
 am__librbd_test_la_SOURCES_DIST = test/librbd/test_fixture.cc \
 	test/librbd/test_support.cc test/librbd/test_librbd.cc \
@@ -2529,6 +3001,17 @@ librbd_test_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(librbd_test_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_librbd_test_la_rpath =
+librbd_tp_la_DEPENDENCIES =
+am__librbd_tp_la_SOURCES_DIST = tracing/librbd.c
+ at WITH_LTTNG_TRUE@am_librbd_tp_la_OBJECTS =  \
+ at WITH_LTTNG_TRUE@	tracing/librbd_tp_la-librbd.lo
+nodist_librbd_tp_la_OBJECTS =
+librbd_tp_la_OBJECTS = $(am_librbd_tp_la_OBJECTS) \
+	$(nodist_librbd_tp_la_OBJECTS)
+librbd_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(librbd_tp_la_CFLAGS) \
+	$(CFLAGS) $(librbd_tp_la_LDFLAGS) $(LDFLAGS) -o $@
+ at WITH_LTTNG_TRUE@am_librbd_tp_la_rpath = -rpath $(libdir)
 librbd_types_la_LIBADD =
 am_librbd_types_la_OBJECTS = librbd/WatchNotifyTypes.lo
 librbd_types_la_OBJECTS = $(am_librbd_types_la_OBJECTS)
@@ -2544,7 +3027,8 @@ am__librgw_la_SOURCES_DIST = rgw/librgw.cc rgw/rgw_acl.cc \
 	rgw/rgw_gc.cc rgw/rgw_multi_del.cc rgw/rgw_env.cc \
 	rgw/rgw_cors.cc rgw/rgw_cors_s3.cc rgw/rgw_auth_s3.cc \
 	rgw/rgw_metadata.cc rgw/rgw_replica_log.cc rgw/rgw_keystone.cc \
-	rgw/rgw_quota.cc rgw/rgw_dencoder.cc
+	rgw/rgw_quota.cc rgw/rgw_dencoder.cc \
+	rgw/rgw_object_expirer_core.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_librgw_la_OBJECTS = rgw/librgw_la-librgw.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_acl.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_acl_s3.lo \
@@ -2578,7 +3062,8 @@ am__librgw_la_SOURCES_DIST = rgw/librgw.cc rgw/rgw_acl.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_replica_log.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_keystone.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_quota.lo \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_dencoder.lo
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_dencoder.lo \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw_la-rgw_object_expirer_core.lo
 librgw_la_OBJECTS = $(am_librgw_la_OBJECTS)
 librgw_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(librgw_la_CXXFLAGS) \
@@ -2608,7 +3093,8 @@ libsystest_la_OBJECTS = $(am_libsystest_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_mutate$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_5 = test_build_librados$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_6 = ceph_smalliobench$(EXEEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_omapbench$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_omapbench$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_objectstore_bench$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_7 = ceph_kvstorebench$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_list_parallel$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_open_pools_parallel$(EXEEXT) \
@@ -2623,6 +3109,7 @@ libsystest_la_OBJECTS = $(am_libsystest_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_replica_log$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_lock$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_hello$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_numops$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_cmd$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_io$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_c_write_operations$(EXEEXT) \
@@ -2652,6 +3139,7 @@ libsystest_la_OBJECTS = $(am_libsystest_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_14 = test_build_librgw$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_15 = ceph_test_cors$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_manifest$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_obj$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_meta$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_log$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_opstate$(EXEEXT) \
@@ -2662,8 +3150,12 @@ libsystest_la_OBJECTS = $(am_libsystest_la_OBJECTS)
 @ENABLE_SERVER_TRUE@	ceph_test_trans$(EXEEXT) \
 @ENABLE_SERVER_TRUE@	ceph_test_mon_workloadgen$(EXEEXT) \
 @ENABLE_SERVER_TRUE@	ceph_test_mon_msg$(EXEEXT) \
- at ENABLE_SERVER_TRUE@	ceph_perf_objectstore$(EXEEXT)
+ at ENABLE_SERVER_TRUE@	ceph_perf_objectstore$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@	ceph_perf_local$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@	ceph_perf_msgr_server$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@	ceph_perf_msgr_client$(EXEEXT)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at am__EXEEXT_17 = ceph_test_objectstore$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_keyvaluedb$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_filestore$(EXEEXT)
 @ENABLE_SERVER_TRUE at am__EXEEXT_18 = ceph_test_objectstore_workloadgen$(EXEEXT) \
 @ENABLE_SERVER_TRUE@	ceph_test_filestore_idempotent$(EXEEXT) \
@@ -2703,21 +3195,22 @@ am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 	$(am__EXEEXT_24) $(am__EXEEXT_25) ceph_psim$(EXEEXT)
 @WITH_DEBUG_TRUE at am__EXEEXT_27 = $(am__EXEEXT_26)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_28 = radosgw$(EXEEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-admin$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-admin$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-object-expirer$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_29 = rbd-replay$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_30 = rbd-replay-prep$(EXEEXT)
 @ENABLE_CLIENT_TRUE at am__EXEEXT_31 = ceph-dencoder$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_32 = rados$(EXEEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_33 = ceph-objectstore-tool$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_34 = cephfs-journal-tool$(EXEEXT) \
- at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool$(EXEEXT)
- at WITH_REST_BENCH_TRUE@am__EXEEXT_35 = rest-bench$(EXEEXT)
- at ENABLE_CLIENT_TRUE@am__EXEEXT_36 = ceph-syn$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_37 =  \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool$(EXEEXT) \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-data-scan$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@am__EXEEXT_35 = ceph-syn$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_36 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados-config$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_38 = rbd$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_39 = ceph-fuse$(EXEEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@	rbd-fuse$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_37 = rbd$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_38 = ceph-fuse$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_39 = rbd-fuse$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_40 = cephfs$(EXEEXT)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_41 = ceph-mon$(EXEEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_42 = ceph-osd$(EXEEXT)
@@ -2733,11 +3226,12 @@ am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_all$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_thread$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_arguments$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_shec$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_example$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_47 = unittest_librados$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config$(EXEEXT)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_48 = unittest_rbd_replay$(EXEEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	unittest_librbd$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_48 = unittest_rbd_replay$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__EXEEXT_49 = unittest_encoding$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_base64$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_run_cmd$(EXEEXT) \
@@ -2749,19 +3243,56 @@ am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osdscrub$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pglog$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_hitset$(EXEEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osd_osdcap$(EXEEXT)
- at ENABLE_SERVER_TRUE@am__EXEEXT_52 = unittest_chain_xattr$(EXEEXT) \
- at ENABLE_SERVER_TRUE@	unittest_flatindex$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osd_osdcap$(EXEEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pageset$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__EXEEXT_52 = unittest_rocksdb_option_static$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__EXEEXT_53 = unittest_rocksdb_option$(EXEEXT)
+ at ENABLE_SERVER_TRUE@am__EXEEXT_54 = unittest_chain_xattr$(EXEEXT) \
 @ENABLE_SERVER_TRUE@	unittest_lfnindex$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__EXEEXT_53 = unittest_mds_authcap$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_54 = ceph_erasure_code_non_regression$(EXEEXT)
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__EXEEXT_55 =  \
+ at ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at am__EXEEXT_55 = unittest_mds_authcap$(EXEEXT)
+ at LINUX_TRUE@am__EXEEXT_56 = unittest_blkdev$(EXEEXT)
+am__EXEEXT_57 = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
+	$(am__EXEEXT_47) $(am__EXEEXT_48) $(am__EXEEXT_49) \
+	$(am__EXEEXT_50) $(am__EXEEXT_51) $(am__EXEEXT_52) \
+	$(am__EXEEXT_53) $(am__EXEEXT_54) $(am__EXEEXT_55) \
+	unittest_addrs$(EXEEXT) $(am__EXEEXT_56) \
+	unittest_bloom_filter$(EXEEXT) unittest_histogram$(EXEEXT) \
+	unittest_prioritized_queue$(EXEEXT) unittest_str_map$(EXEEXT) \
+	unittest_sharedptr_registry$(EXEEXT) \
+	unittest_shared_cache$(EXEEXT) \
+	unittest_sloppy_crc_map$(EXEEXT) unittest_util$(EXEEXT) \
+	unittest_crush_wrapper$(EXEEXT) unittest_crush$(EXEEXT) \
+	unittest_osdmap$(EXEEXT) unittest_workqueue$(EXEEXT) \
+	unittest_striper$(EXEEXT) \
+	unittest_prebufferedstreambuf$(EXEEXT) \
+	unittest_str_list$(EXEEXT) unittest_log$(EXEEXT) \
+	unittest_throttle$(EXEEXT) unittest_ceph_argparse$(EXEEXT) \
+	unittest_ceph_compatset$(EXEEXT) unittest_mds_types$(EXEEXT) \
+	unittest_osd_types$(EXEEXT) unittest_lru$(EXEEXT) \
+	unittest_io_priority$(EXEEXT) unittest_gather$(EXEEXT) \
+	unittest_signals$(EXEEXT) unittest_bufferlist$(EXEEXT) \
+	unittest_xlist$(EXEEXT) unittest_crc32c$(EXEEXT) \
+	unittest_arch$(EXEEXT) unittest_crypto$(EXEEXT) \
+	unittest_crypto_init$(EXEEXT) unittest_perf_counters$(EXEEXT) \
+	unittest_admin_socket$(EXEEXT) unittest_ceph_crypto$(EXEEXT) \
+	unittest_utf8$(EXEEXT) unittest_mime$(EXEEXT) \
+	unittest_escape$(EXEEXT) unittest_strtol$(EXEEXT) \
+	unittest_confutils$(EXEEXT) unittest_config$(EXEEXT) \
+	unittest_context$(EXEEXT) unittest_safe_io$(EXEEXT) \
+	unittest_heartbeatmap$(EXEEXT) unittest_formatter$(EXEEXT) \
+	unittest_daemon_config$(EXEEXT) unittest_ipaddr$(EXEEXT) \
+	unittest_texttable$(EXEEXT) unittest_on_exit$(EXEEXT) \
+	unittest_readahead$(EXEEXT) unittest_tableformatter$(EXEEXT) \
+	unittest_bit_vector$(EXEEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am__EXEEXT_58 = unittest_librbd$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am__EXEEXT_59 = ceph_erasure_code_non_regression$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am__EXEEXT_60 =  \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	simple_server$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	simple_client$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	xio_server$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	xio_client$(EXEEXT)
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_56 = get_command_descriptions$(EXEEXT)
- at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__EXEEXT_57 = mount.ceph$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__EXEEXT_61 = get_command_descriptions$(EXEEXT)
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am__EXEEXT_62 = mount.ceph$(EXEEXT)
 PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS) $(sbin_PROGRAMS) \
 	$(su_sbin_PROGRAMS)
 am_ceph_authtool_OBJECTS = tools/ceph_authtool.$(OBJEXT)
@@ -2771,26 +3302,29 @@ am__ceph_client_debug_SOURCES_DIST = tools/ceph-client-debug.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_ceph_client_debug_OBJECTS = tools/ceph-client-debug.$(OBJEXT)
 ceph_client_debug_OBJECTS = $(am_ceph_client_debug_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_client_debug_DEPENDENCIES = $(LIBCEPHFS) \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCLIENT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON)
 am_ceph_conf_OBJECTS = tools/ceph_conf.$(OBJEXT)
 ceph_conf_OBJECTS = $(am_ceph_conf_OBJECTS)
 ceph_conf_DEPENDENCIES = $(am__DEPENDENCIES_10) $(LIBCOMMON)
 am__ceph_dencoder_SOURCES_DIST = test/encoding/ceph_dencoder.cc \
-	mds/Capability.cc mds/MDS.cc mds/Beacon.cc mds/locks.c \
-	mds/journal.cc mds/Server.cc mds/Mutation.cc mds/MDCache.cc \
-	mds/RecoveryQueue.cc mds/Locker.cc mds/Migrator.cc \
+	mds/Capability.cc mds/MDSDaemon.cc mds/MDSRank.cc \
+	mds/Beacon.cc mds/locks.c mds/journal.cc mds/Server.cc \
+	mds/Mutation.cc mds/MDCache.cc mds/RecoveryQueue.cc \
+	mds/StrayManager.cc mds/Locker.cc mds/Migrator.cc \
 	mds/MDBalancer.cc mds/CDentry.cc mds/CDir.cc mds/CInode.cc \
 	mds/LogEvent.cc mds/MDSTable.cc mds/InoTable.cc \
 	mds/JournalPointer.cc mds/MDSTableClient.cc \
-	mds/MDSTableServer.cc mds/SnapRealm.cc mds/SnapServer.cc \
-	mds/snap.cc mds/SessionMap.cc mds/MDSContext.cc \
-	mds/MDSAuthCaps.cc mds/MDLog.cc common/TrackedOp.cc \
-	perfglue/disabled_heap_profiler.cc perfglue/disabled_stubs.cc \
-	rgw/rgw_dencoder.cc rgw/rgw_acl.cc rgw/rgw_common.cc \
-	rgw/rgw_env.cc rgw/rgw_json_enc.cc
-am__objects_25 = mds/ceph_dencoder-Capability.$(OBJEXT) \
-	mds/ceph_dencoder-MDS.$(OBJEXT) \
+	mds/MDSTableServer.cc mds/SimpleLock.cc mds/SnapRealm.cc \
+	mds/SnapServer.cc mds/snap.cc mds/SessionMap.cc \
+	mds/MDSContext.cc mds/MDSAuthCaps.cc mds/MDLog.cc \
+	common/TrackedOp.cc perfglue/disabled_heap_profiler.cc \
+	perfglue/disabled_stubs.cc rgw/rgw_dencoder.cc rgw/rgw_acl.cc \
+	rgw/rgw_common.cc rgw/rgw_env.cc rgw/rgw_json_enc.cc
+am__objects_30 = mds/ceph_dencoder-Capability.$(OBJEXT) \
+	mds/ceph_dencoder-MDSDaemon.$(OBJEXT) \
+	mds/ceph_dencoder-MDSRank.$(OBJEXT) \
 	mds/ceph_dencoder-Beacon.$(OBJEXT) \
 	mds/ceph_dencoder-locks.$(OBJEXT) \
 	mds/ceph_dencoder-journal.$(OBJEXT) \
@@ -2798,6 +3332,7 @@ am__objects_25 = mds/ceph_dencoder-Capability.$(OBJEXT) \
 	mds/ceph_dencoder-Mutation.$(OBJEXT) \
 	mds/ceph_dencoder-MDCache.$(OBJEXT) \
 	mds/ceph_dencoder-RecoveryQueue.$(OBJEXT) \
+	mds/ceph_dencoder-StrayManager.$(OBJEXT) \
 	mds/ceph_dencoder-Locker.$(OBJEXT) \
 	mds/ceph_dencoder-Migrator.$(OBJEXT) \
 	mds/ceph_dencoder-MDBalancer.$(OBJEXT) \
@@ -2810,6 +3345,7 @@ am__objects_25 = mds/ceph_dencoder-Capability.$(OBJEXT) \
 	mds/ceph_dencoder-JournalPointer.$(OBJEXT) \
 	mds/ceph_dencoder-MDSTableClient.$(OBJEXT) \
 	mds/ceph_dencoder-MDSTableServer.$(OBJEXT) \
+	mds/ceph_dencoder-SimpleLock.$(OBJEXT) \
 	mds/ceph_dencoder-SnapRealm.$(OBJEXT) \
 	mds/ceph_dencoder-SnapServer.$(OBJEXT) \
 	mds/ceph_dencoder-snap.$(OBJEXT) \
@@ -2818,19 +3354,19 @@ am__objects_25 = mds/ceph_dencoder-Capability.$(OBJEXT) \
 	mds/ceph_dencoder-MDSAuthCaps.$(OBJEXT) \
 	mds/ceph_dencoder-MDLog.$(OBJEXT) \
 	common/ceph_dencoder-TrackedOp.$(OBJEXT)
- at ENABLE_CLIENT_TRUE@am__objects_26 = $(am__objects_25)
- at ENABLE_CLIENT_TRUE@am__objects_27 =  \
+ at ENABLE_CLIENT_TRUE@am__objects_31 = $(am__objects_30)
+ at ENABLE_CLIENT_TRUE@am__objects_32 =  \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_dencoder.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_acl.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_common.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_env.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_json_enc.$(OBJEXT)
-am__objects_28 = $(am__objects_26) \
+am__objects_33 = $(am__objects_31) \
 	perfglue/ceph_dencoder-disabled_heap_profiler.$(OBJEXT) \
 	perfglue/ceph_dencoder-disabled_stubs.$(OBJEXT) \
-	$(am__objects_27)
+	$(am__objects_32)
 @ENABLE_CLIENT_TRUE at am_ceph_dencoder_OBJECTS = test/encoding/ceph_dencoder-ceph_dencoder.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@	$(am__objects_28)
+ at ENABLE_CLIENT_TRUE@	$(am__objects_33)
 ceph_dencoder_OBJECTS = $(am_ceph_dencoder_OBJECTS)
 @ENABLE_CLIENT_TRUE at ceph_dencoder_DEPENDENCIES = $(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@	$(LIBOSD_TYPES) $(LIBOS_TYPES) \
@@ -2888,8 +3424,9 @@ ceph_monstore_tool_OBJECTS = $(am_ceph_monstore_tool_OBJECTS)
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10) \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_objectstore_tool_SOURCES_DIST =  \
-	tools/ceph_objectstore_tool.cc
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_ceph_objectstore_tool_OBJECTS = tools/ceph_objectstore_tool.$(OBJEXT)
+	tools/ceph_objectstore_tool.cc tools/RadosDump.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_ceph_objectstore_tool_OBJECTS = tools/ceph_objectstore_tool.$(OBJEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	tools/RadosDump.$(OBJEXT)
 ceph_objectstore_tool_OBJECTS = $(am_ceph_objectstore_tool_OBJECTS)
 am__DEPENDENCIES_14 = libosd.la $(am__DEPENDENCIES_1) $(LIBOSDC) \
 	$(am__DEPENDENCIES_5) $(am__DEPENDENCIES_11)
@@ -2898,7 +3435,6 @@ am__DEPENDENCIES_14 = libosd.la $(am__DEPENDENCIES_1) $(LIBOSDC) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_5) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBRADOS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_osd_SOURCES_DIST = ceph_osd.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_ceph_osd_OBJECTS =  \
@@ -2975,20 +3511,63 @@ ceph_multi_stress_watch_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
+am__ceph_objectstore_bench_SOURCES_DIST = test/objectstore_bench.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am_ceph_objectstore_bench_OBJECTS = test/objectstore_bench.$(OBJEXT)
+ceph_objectstore_bench_OBJECTS = $(am_ceph_objectstore_bench_OBJECTS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_objectstore_bench_DEPENDENCIES =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_5) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
 am__ceph_omapbench_SOURCES_DIST = test/omap_bench.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am_ceph_omapbench_OBJECTS = test/omap_bench.$(OBJEXT)
 ceph_omapbench_OBJECTS = $(am_ceph_omapbench_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_omapbench_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
-am__ceph_perf_objectstore_SOURCES_DIST =  \
-	test/objectstore/ObjectStoreTransactionBenchmark.cc
- at ENABLE_SERVER_TRUE@am_ceph_perf_objectstore_OBJECTS = test/objectstore/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.$(OBJEXT)
-ceph_perf_objectstore_OBJECTS = $(am_ceph_perf_objectstore_OBJECTS)
+am__ceph_perf_local_SOURCES_DIST = test/perf_local.cc \
+	test/perf_helper.cc
+ at ENABLE_SERVER_TRUE@am_ceph_perf_local_OBJECTS =  \
+ at ENABLE_SERVER_TRUE@	test/ceph_perf_local-perf_local.$(OBJEXT) \
+ at ENABLE_SERVER_TRUE@	test/ceph_perf_local-perf_helper.$(OBJEXT)
+ceph_perf_local_OBJECTS = $(am_ceph_perf_local_OBJECTS)
+ at ENABLE_SERVER_TRUE@ceph_perf_local_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+ceph_perf_local_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am__ceph_perf_msgr_client_SOURCES_DIST =  \
+	test/msgr/perf_msgr_client.cc
+ at ENABLE_SERVER_TRUE@am_ceph_perf_msgr_client_OBJECTS = test/msgr/ceph_perf_msgr_client-perf_msgr_client.$(OBJEXT)
+ceph_perf_msgr_client_OBJECTS = $(am_ceph_perf_msgr_client_OBJECTS)
 am__DEPENDENCIES_15 = $(top_builddir)/src/gmock/lib/libgmock_main.la \
 	$(top_builddir)/src/gmock/lib/libgmock.la \
 	$(top_builddir)/src/gmock/gtest/lib/libgtest.la \
 	$(am__DEPENDENCIES_1)
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_client_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+ceph_perf_msgr_client_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(ceph_perf_msgr_client_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am__ceph_perf_msgr_server_SOURCES_DIST =  \
+	test/msgr/perf_msgr_server.cc
+ at ENABLE_SERVER_TRUE@am_ceph_perf_msgr_server_OBJECTS = test/msgr/ceph_perf_msgr_server-perf_msgr_server.$(OBJEXT)
+ceph_perf_msgr_server_OBJECTS = $(am_ceph_perf_msgr_server_OBJECTS)
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_server_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+ceph_perf_msgr_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(ceph_perf_msgr_server_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am__ceph_perf_objectstore_SOURCES_DIST =  \
+	test/objectstore/ObjectStoreTransactionBenchmark.cc
+ at ENABLE_SERVER_TRUE@am_ceph_perf_objectstore_OBJECTS = test/objectstore/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.$(OBJEXT)
+ceph_perf_objectstore_OBJECTS = $(am_ceph_perf_objectstore_OBJECTS)
 @ENABLE_SERVER_TRUE at ceph_perf_objectstore_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
@@ -3019,6 +3598,7 @@ am__DEPENDENCIES_16 = librgw.la $(am__DEPENDENCIES_1)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_timeindex_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_user_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_replica_log_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
@@ -3169,6 +3749,20 @@ ceph_test_cls_log_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_cls_log_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am__ceph_test_cls_numops_SOURCES_DIST =  \
+	test/cls_numops/test_cls_numops.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am_ceph_test_cls_numops_OBJECTS = test/cls_numops/ceph_test_cls_numops-test_cls_numops.$(OBJEXT)
+ceph_test_cls_numops_OBJECTS = $(am_ceph_test_cls_numops_OBJECTS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_numops_DEPENDENCIES =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_numops_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
+ceph_test_cls_numops_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(ceph_test_cls_numops_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am__ceph_test_cls_rbd_SOURCES_DIST = test/cls_rbd/test_cls_rbd.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am_ceph_test_cls_rbd_OBJECTS = test/cls_rbd/ceph_test_cls_rbd-test_cls_rbd.$(OBJEXT)
 ceph_test_cls_rbd_OBJECTS = $(am_ceph_test_cls_rbd_OBJECTS)
@@ -3178,7 +3772,9 @@ ceph_test_cls_rbd_OBJECTS = $(am_ceph_test_cls_rbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
 ceph_test_cls_rbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_cls_rbd_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3275,11 +3871,13 @@ ceph_test_cls_rgw_opstate_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_timeindex_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_user_client.a \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS)
 ceph_test_cls_rgw_opstate_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_cls_rgw_opstate_CXXFLAGS) $(CXXFLAGS) \
@@ -3391,6 +3989,17 @@ ceph_test_keys_OBJECTS = $(am_ceph_test_keys_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE at ceph_test_keys_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_13) \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_10)
+am__ceph_test_keyvaluedb_SOURCES_DIST = test/objectstore/test_kv.cc
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at am_ceph_test_keyvaluedb_OBJECTS = test/objectstore/ceph_test_keyvaluedb-test_kv.$(OBJEXT)
+ceph_test_keyvaluedb_OBJECTS = $(am_ceph_test_keyvaluedb_OBJECTS)
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_keyvaluedb_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_5) \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_10)
+ceph_test_keyvaluedb_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(ceph_test_keyvaluedb_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am__ceph_test_keyvaluedb_atomicity_SOURCES_DIST =  \
 	test/ObjectMap/test_keyvaluedb_atomicity.cc
 @ENABLE_SERVER_TRUE at am_ceph_test_keyvaluedb_atomicity_OBJECTS = test/ObjectMap/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.$(OBJEXT)
@@ -3421,11 +4030,13 @@ ceph_test_keyvaluedb_iterators_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__ceph_test_libcephfs_SOURCES_DIST = test/libcephfs/test.cc \
 	test/libcephfs/readdir_r_cb.cc test/libcephfs/caps.cc \
-	test/libcephfs/multiclient.cc
+	test/libcephfs/multiclient.cc test/libcephfs/flock.cc
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am__objects_34 = test/libcephfs/ceph_test_libcephfs-flock.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_ceph_test_libcephfs_OBJECTS = test/libcephfs/ceph_test_libcephfs-test.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-readdir_r_cb.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-caps.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-multiclient.$(OBJEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-multiclient.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_34)
 ceph_test_libcephfs_OBJECTS = $(am_ceph_test_libcephfs_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_DEPENDENCIES = $(LIBCEPHFS) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
@@ -3446,8 +4057,7 @@ ceph_test_librbd_OBJECTS = $(am_ceph_test_librbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_162)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_librbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_librbd_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3463,8 +4073,7 @@ ceph_test_librbd_api_OBJECTS = $(am_ceph_test_librbd_api_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_163)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_librbd_api_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_librbd_api_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3878,6 +4487,19 @@ ceph_test_rgw_manifest_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_rgw_manifest_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am__ceph_test_rgw_obj_SOURCES_DIST = test/rgw/test_rgw_obj.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_ceph_test_rgw_obj_OBJECTS = test/rgw/ceph_test_rgw_obj-test_rgw_obj.$(OBJEXT)
+ceph_test_rgw_obj_OBJECTS = $(am_ceph_test_rgw_obj_OBJECTS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_rgw_obj_DEPENDENCIES = $(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1)
+ceph_test_rgw_obj_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(ceph_test_rgw_obj_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am_ceph_test_signal_handlers_OBJECTS =  \
 	test/TestSignalHandlers.$(OBJEXT)
 ceph_test_signal_handlers_OBJECTS =  \
@@ -3939,6 +4561,16 @@ am__cephfs_SOURCES_DIST = cephfs.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_cephfs_OBJECTS = cephfs.$(OBJEXT)
 cephfs_OBJECTS = $(am_cephfs_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at cephfs_DEPENDENCIES = $(LIBCOMMON)
+am__cephfs_data_scan_SOURCES_DIST = tools/cephfs/cephfs-data-scan.cc \
+	tools/cephfs/DataScan.cc tools/cephfs/MDSUtility.cc
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at am_cephfs_data_scan_OBJECTS = tools/cephfs/cephfs-data-scan.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/DataScan.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/MDSUtility.$(OBJEXT)
+cephfs_data_scan_OBJECTS = $(am_cephfs_data_scan_OBJECTS)
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at cephfs_data_scan_DEPENDENCIES = $(am__DEPENDENCIES_12) \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	libcls_cephfs_client.la \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
 am__cephfs_journal_tool_SOURCES_DIST =  \
 	tools/cephfs/cephfs-journal-tool.cc \
 	tools/cephfs/JournalTool.cc tools/cephfs/JournalFilter.cc \
@@ -3997,18 +4629,19 @@ mount_ceph_OBJECTS = $(am_mount_ceph_OBJECTS)
 am_osdmaptool_OBJECTS = tools/osdmaptool.$(OBJEXT)
 osdmaptool_OBJECTS = $(am_osdmaptool_OBJECTS)
 osdmaptool_DEPENDENCIES = $(am__DEPENDENCIES_10)
-am__rados_SOURCES_DIST = tools/rados/rados.cc \
-	tools/rados/rados_import.cc tools/rados/rados_export.cc \
-	tools/rados/rados_sync.cc common/obj_bencher.cc
+am__rados_SOURCES_DIST = tools/rados/rados.cc tools/RadosDump.cc \
+	tools/rados/RadosImport.cc tools/rados/PoolDump.cc \
+	common/obj_bencher.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am_rados_OBJECTS = tools/rados/rados.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/rados_import.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/rados_export.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/rados_sync.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/RadosDump.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/RadosImport.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/PoolDump.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	common/obj_bencher.$(OBJEXT)
 rados_OBJECTS = $(am_rados_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOSSTRIPER) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
 am__radosgw_SOURCES_DIST = rgw/rgw_resolve.cc rgw/rgw_rest.cc \
 	rgw/rgw_rest_swift.cc rgw/rgw_rest_s3.cc rgw/rgw_rest_usage.cc \
@@ -4049,6 +4682,12 @@ radosgw_admin_OBJECTS = $(am_radosgw_admin_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_DEPENDENCIES = $(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+am__radosgw_object_expirer_SOURCES_DIST = rgw/rgw_object_expirer.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_radosgw_object_expirer_OBJECTS = rgw/rgw_object_expirer.$(OBJEXT)
+radosgw_object_expirer_OBJECTS = $(am_radosgw_object_expirer_OBJECTS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_object_expirer_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
 am__rbd_SOURCES_DIST = rbd.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_rbd_OBJECTS = rbd.$(OBJEXT)
 rbd_OBJECTS = $(am_rbd_OBJECTS)
@@ -4057,42 +4696,34 @@ rbd_OBJECTS = $(am_rbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10)
 am__rbd_fuse_SOURCES_DIST = rbd_fuse/rbd-fuse.cc
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at am_rbd_fuse_OBJECTS = rbd_fuse/rbd-fuse.$(OBJEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_rbd_fuse_OBJECTS = rbd_fuse/rbd_fuse-rbd-fuse.$(OBJEXT)
 rbd_fuse_OBJECTS = $(am_rbd_fuse_OBJECTS)
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at rbd_fuse_DEPENDENCIES = $(LIBRBD) \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_fuse_DEPENDENCIES = $(am__DEPENDENCIES_1) \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10)
+rbd_fuse_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(rbd_fuse_CXXFLAGS) \
+	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__rbd_replay_SOURCES_DIST = rbd_replay/rbd-replay.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_rbd_replay_OBJECTS = rbd_replay/rbd-replay.$(OBJEXT)
 rbd_replay_OBJECTS = $(am_rbd_replay_OBJECTS)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_DEPENDENCIES =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_DEPENDENCIES = librbd_replay.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_types.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON)
 am__rbd_replay_prep_SOURCES_DIST = rbd_replay/rbd-replay-prep.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_rbd_replay_prep_OBJECTS = rbd_replay/rbd-replay-prep.$(OBJEXT)
 rbd_replay_prep_OBJECTS = $(am_rbd_replay_prep_OBJECTS)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_prep_DEPENDENCIES =  \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_prep_DEPENDENCIES = librbd_replay.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_types.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la
-am__rest_bench_SOURCES_DIST = tools/rest_bench.cc \
-	common/obj_bencher.cc
- at WITH_REST_BENCH_TRUE@am_rest_bench_OBJECTS =  \
- at WITH_REST_BENCH_TRUE@	tools/rest_bench-rest_bench.$(OBJEXT) \
- at WITH_REST_BENCH_TRUE@	common/rest_bench-obj_bencher.$(OBJEXT)
-rest_bench_OBJECTS = $(am_rest_bench_OBJECTS)
- at WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE at am__DEPENDENCIES_19 = libs3/build/lib/libs3.a
- at WITH_REST_BENCH_TRUE@rest_bench_DEPENDENCIES =  \
- at WITH_REST_BENCH_TRUE@	$(am__DEPENDENCIES_10) \
- at WITH_REST_BENCH_TRUE@	$(am__DEPENDENCIES_1) \
- at WITH_REST_BENCH_TRUE@	$(am__DEPENDENCIES_19)
-rest_bench_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(rest_bench_CXXFLAGS) \
-	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON)
 am__simple_client_SOURCES_DIST = test/messenger/simple_client.cc \
 	test/messenger/simple_dispatcher.cc
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am_simple_client_OBJECTS = test/messenger/simple_client-simple_client.$(OBJEXT) \
@@ -4128,13 +4759,13 @@ simple_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am__test_build_libcephfs_SOURCES_DIST = test/buildtest_skeleton.cc \
 	osdc/Objecter.cc osdc/ObjectCacher.cc osdc/Filer.cc \
 	osdc/Striper.cc osdc/Journaler.cc
-am__objects_29 = osdc/test_build_libcephfs-Objecter.$(OBJEXT) \
+am__objects_35 = osdc/test_build_libcephfs-Objecter.$(OBJEXT) \
 	osdc/test_build_libcephfs-ObjectCacher.$(OBJEXT) \
 	osdc/test_build_libcephfs-Filer.$(OBJEXT) \
 	osdc/test_build_libcephfs-Striper.$(OBJEXT) \
 	osdc/test_build_libcephfs-Journaler.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_test_build_libcephfs_OBJECTS = test/test_build_libcephfs-buildtest_skeleton.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_29)
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_35)
 test_build_libcephfs_OBJECTS = $(am_test_build_libcephfs_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at test_build_libcephfs_DEPENDENCIES = $(LIBCEPHFS) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
@@ -4144,9 +4775,11 @@ test_build_libcephfs_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(test_build_libcephfs_CXXFLAGS) $(CXXFLAGS) \
 	$(test_build_libcephfs_LDFLAGS) $(LDFLAGS) -o $@
-am__test_build_libcommon_SOURCES_DIST = test/buildtest_skeleton.cc
+am__test_build_libcommon_SOURCES_DIST = test/buildtest_skeleton.cc \
+	common/buffer.cc
+am__objects_36 = common/test_build_libcommon-buffer.$(OBJEXT)
 @WITH_BUILD_TESTS_TRUE at am_test_build_libcommon_OBJECTS = test/test_build_libcommon-buildtest_skeleton.$(OBJEXT) \
- at WITH_BUILD_TESTS_TRUE@	$(am__objects_14)
+ at WITH_BUILD_TESTS_TRUE@	$(am__objects_36)
 test_build_libcommon_OBJECTS = $(am_test_build_libcommon_OBJECTS)
 @WITH_BUILD_TESTS_TRUE at test_build_libcommon_DEPENDENCIES =  \
 @WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_3) \
@@ -4159,10 +4792,10 @@ test_build_libcommon_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(test_build_libcommon_LDFLAGS) $(LDFLAGS) -o $@
 am__test_build_librados_SOURCES_DIST = test/buildtest_skeleton.cc \
 	common/buffer.cc librados/librados.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__objects_30 = common/test_build_librados-buffer.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at am__objects_37 = common/test_build_librados-buffer.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/test_build_librados-librados.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at am_test_build_librados_OBJECTS = test/test_build_librados-buildtest_skeleton.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_30)
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_37)
 test_build_librados_OBJECTS = $(am_test_build_librados_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE at test_build_librados_DEPENDENCIES = $(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
@@ -4184,8 +4817,8 @@ am__test_build_librgw_SOURCES_DIST = test/buildtest_skeleton.cc \
 	rgw/rgw_multi_del.cc rgw/rgw_env.cc rgw/rgw_cors.cc \
 	rgw/rgw_cors_s3.cc rgw/rgw_auth_s3.cc rgw/rgw_metadata.cc \
 	rgw/rgw_replica_log.cc rgw/rgw_keystone.cc rgw/rgw_quota.cc \
-	rgw/rgw_dencoder.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__objects_31 = rgw/test_build_librgw-librgw.$(OBJEXT) \
+	rgw/rgw_dencoder.cc rgw/rgw_object_expirer_core.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am__objects_38 = rgw/test_build_librgw-librgw.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl_s3.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl_swift.$(OBJEXT) \
@@ -4218,9 +4851,10 @@ am__test_build_librgw_SOURCES_DIST = test/buildtest_skeleton.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_replica_log.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_keystone.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_quota.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_dencoder.$(OBJEXT)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_dencoder.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_object_expirer_core.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_test_build_librgw_OBJECTS = test/test_build_librgw-buildtest_skeleton.$(OBJEXT) \
- at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__objects_31)
+ at ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__objects_38)
 test_build_librgw_OBJECTS = $(am_test_build_librgw_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at test_build_librgw_DEPENDENCIES = $(am__DEPENDENCIES_18) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
@@ -4256,6 +4890,16 @@ unittest_arch_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_arch_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
 	-o $@
+am_unittest_async_compressor_OBJECTS = test/common/unittest_async_compressor-test_async_compressor.$(OBJEXT)
+unittest_async_compressor_OBJECTS =  \
+	$(am_unittest_async_compressor_OBJECTS)
+am__DEPENDENCIES_19 = libcompressor.la
+unittest_async_compressor_DEPENDENCIES = $(am__DEPENDENCIES_15) \
+	$(am__DEPENDENCIES_10) $(am__DEPENDENCIES_19)
+unittest_async_compressor_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_async_compressor_CXXFLAGS) $(CXXFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__unittest_base64_SOURCES_DIST = test/base64.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_unittest_base64_OBJECTS = test/unittest_base64-base64.$(OBJEXT)
 unittest_base64_OBJECTS = $(am_unittest_base64_OBJECTS)
@@ -4512,7 +5156,7 @@ am__unittest_erasure_code_jerasure_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c \
 	erasure-code/jerasure/ErasureCodePluginJerasure.cc \
 	erasure-code/jerasure/ErasureCodeJerasure.cc
-am__objects_32 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEXT) \
+am__objects_39 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-cauchy.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-galois.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-jerasure.$(OBJEXT) \
@@ -4532,7 +5176,7 @@ am__objects_32 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEX
 	erasure-code/jerasure/unittest_erasure_code_jerasure-ErasureCodePluginJerasure.$(OBJEXT) \
 	erasure-code/jerasure/unittest_erasure_code_jerasure-ErasureCodeJerasure.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_jerasure_OBJECTS = test/erasure-code/unittest_erasure_code_jerasure-TestErasureCodeJerasure.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_32)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_39)
 unittest_erasure_code_jerasure_OBJECTS =  \
 	$(am_unittest_erasure_code_jerasure_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_jerasure_DEPENDENCIES =  \
@@ -4550,12 +5194,12 @@ am__unittest_erasure_code_lrc_SOURCES_DIST =  \
 	erasure-code/ErasureCode.cc \
 	erasure-code/lrc/ErasureCodePluginLrc.cc \
 	erasure-code/lrc/ErasureCodeLrc.cc
-am__objects_33 =  \
+am__objects_40 =  \
 	erasure-code/unittest_erasure_code_lrc-ErasureCode.$(OBJEXT) \
 	erasure-code/lrc/unittest_erasure_code_lrc-ErasureCodePluginLrc.$(OBJEXT) \
 	erasure-code/lrc/unittest_erasure_code_lrc-ErasureCodeLrc.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_lrc_OBJECTS = test/erasure-code/unittest_erasure_code_lrc-TestErasureCodeLrc.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_33)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_40)
 unittest_erasure_code_lrc_OBJECTS =  \
 	$(am_unittest_erasure_code_lrc_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_lrc_DEPENDENCIES =  \
@@ -4633,13 +5277,28 @@ unittest_erasure_code_plugin_lrc_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
 	$(CXXLD) $(unittest_erasure_code_plugin_lrc_CXXFLAGS) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+am__unittest_erasure_code_plugin_shec_SOURCES_DIST =  \
+	test/erasure-code/TestErasureCodePluginShec.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_plugin_shec_OBJECTS = test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.$(OBJEXT)
+unittest_erasure_code_plugin_shec_OBJECTS =  \
+	$(am_unittest_erasure_code_plugin_shec_OBJECTS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_shec_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
+unittest_erasure_code_plugin_shec_LINK = $(LIBTOOL) $(AM_V_lt) \
+	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
+	$(CXXLD) $(unittest_erasure_code_plugin_shec_CXXFLAGS) \
+	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__unittest_erasure_code_shec_SOURCES_DIST =  \
 	test/erasure-code/TestErasureCodeShec.cc \
 	erasure-code/ErasureCode.cc \
 	erasure-code/shec/ErasureCodePluginShec.cc \
 	erasure-code/shec/ErasureCodeShec.cc \
 	erasure-code/shec/ErasureCodeShecTableCache.cc \
-	erasure-code/shec/shec.cc erasure-code/shec/determinant.c \
+	erasure-code/shec/determinant.c \
 	erasure-code/jerasure/jerasure/src/cauchy.c \
 	erasure-code/jerasure/jerasure/src/galois.c \
 	erasure-code/jerasure/jerasure/src/jerasure.c \
@@ -4656,12 +5315,11 @@ am__unittest_erasure_code_shec_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_34 =  \
+am__objects_41 =  \
 	erasure-code/unittest_erasure_code_shec-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShecTableCache.$(OBJEXT) \
-	erasure-code/shec/unittest_erasure_code_shec-shec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-determinant.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec-cauchy.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec-galois.$(OBJEXT) \
@@ -4680,7 +5338,7 @@ am__objects_34 =  \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_OBJECTS = test/erasure-code/unittest_erasure_code_shec-TestErasureCodeShec.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_34)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_41)
 unittest_erasure_code_shec_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_DEPENDENCIES =  \
@@ -4699,7 +5357,7 @@ am__unittest_erasure_code_shec_all_SOURCES_DIST =  \
 	erasure-code/shec/ErasureCodePluginShec.cc \
 	erasure-code/shec/ErasureCodeShec.cc \
 	erasure-code/shec/ErasureCodeShecTableCache.cc \
-	erasure-code/shec/shec.cc erasure-code/shec/determinant.c \
+	erasure-code/shec/determinant.c \
 	erasure-code/jerasure/jerasure/src/cauchy.c \
 	erasure-code/jerasure/jerasure/src/galois.c \
 	erasure-code/jerasure/jerasure/src/jerasure.c \
@@ -4716,11 +5374,10 @@ am__unittest_erasure_code_shec_all_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_35 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEXT) \
+am__objects_42 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShecTableCache.$(OBJEXT) \
-	erasure-code/shec/unittest_erasure_code_shec_all-shec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-determinant.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_all-cauchy.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_all-galois.$(OBJEXT) \
@@ -4739,7 +5396,7 @@ am__objects_35 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEX
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_all_OBJECTS = test/erasure-code/unittest_erasure_code_shec_all-TestErasureCodeShec_all.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_35)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_42)
 unittest_erasure_code_shec_all_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_all_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_DEPENDENCIES =  \
@@ -4752,13 +5409,71 @@ unittest_erasure_code_shec_all_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_erasure_code_shec_all_CXXFLAGS) $(CXXFLAGS) \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+am__unittest_erasure_code_shec_arguments_SOURCES_DIST =  \
+	test/erasure-code/TestErasureCodeShec_arguments.cc \
+	erasure-code/ErasureCode.cc \
+	erasure-code/shec/ErasureCodePluginShec.cc \
+	erasure-code/shec/ErasureCodeShec.cc \
+	erasure-code/shec/ErasureCodeShecTableCache.cc \
+	erasure-code/shec/determinant.c \
+	erasure-code/jerasure/jerasure/src/cauchy.c \
+	erasure-code/jerasure/jerasure/src/galois.c \
+	erasure-code/jerasure/jerasure/src/jerasure.c \
+	erasure-code/jerasure/jerasure/src/liberation.c \
+	erasure-code/jerasure/jerasure/src/reed_sol.c \
+	erasure-code/jerasure/gf-complete/src/gf_wgen.c \
+	erasure-code/jerasure/gf-complete/src/gf_method.c \
+	erasure-code/jerasure/gf-complete/src/gf_w16.c \
+	erasure-code/jerasure/gf-complete/src/gf.c \
+	erasure-code/jerasure/gf-complete/src/gf_w32.c \
+	erasure-code/jerasure/gf-complete/src/gf_w64.c \
+	erasure-code/jerasure/gf-complete/src/gf_w128.c \
+	erasure-code/jerasure/gf-complete/src/gf_general.c \
+	erasure-code/jerasure/gf-complete/src/gf_w4.c \
+	erasure-code/jerasure/gf-complete/src/gf_rand.c \
+	erasure-code/jerasure/gf-complete/src/gf_w8.c
+am__objects_43 = erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$(OBJEXT) \
+	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.$(OBJEXT) \
+	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.$(OBJEXT) \
+	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.$(OBJEXT) \
+	erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.$(OBJEXT) \
+	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.$(OBJEXT) \
+	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.$(OBJEXT) \
+	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.$(OBJEXT) \
+	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.$(OBJEXT) \
+	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.$(OBJEXT) \
+	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.$(OBJEXT)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_arguments_OBJECTS = test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.$(OBJEXT) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_43)
+unittest_erasure_code_shec_arguments_OBJECTS =  \
+	$(am_unittest_erasure_code_shec_arguments_OBJECTS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_arguments_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
+unittest_erasure_code_shec_arguments_LINK = $(LIBTOOL) $(AM_V_lt) \
+	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
+	$(CXXLD) $(unittest_erasure_code_shec_arguments_CXXFLAGS) \
+	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__unittest_erasure_code_shec_thread_SOURCES_DIST =  \
 	test/erasure-code/TestErasureCodeShec_thread.cc \
 	erasure-code/ErasureCode.cc \
 	erasure-code/shec/ErasureCodePluginShec.cc \
 	erasure-code/shec/ErasureCodeShec.cc \
 	erasure-code/shec/ErasureCodeShecTableCache.cc \
-	erasure-code/shec/shec.cc erasure-code/shec/determinant.c \
+	erasure-code/shec/determinant.c \
 	erasure-code/jerasure/jerasure/src/cauchy.c \
 	erasure-code/jerasure/jerasure/src/galois.c \
 	erasure-code/jerasure/jerasure/src/jerasure.c \
@@ -4775,11 +5490,10 @@ am__unittest_erasure_code_shec_thread_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_36 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OBJEXT) \
+am__objects_44 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShecTableCache.$(OBJEXT) \
-	erasure-code/shec/unittest_erasure_code_shec_thread-shec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-determinant.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_thread-cauchy.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_thread-galois.$(OBJEXT) \
@@ -4798,7 +5512,7 @@ am__objects_36 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OB
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_thread-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_thread-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_erasure_code_shec_thread_OBJECTS = test/erasure-code/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.$(OBJEXT) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_36)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_44)
 unittest_erasure_code_shec_thread_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_thread_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_DEPENDENCIES =  \
@@ -4819,17 +5533,6 @@ unittest_escape_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_escape_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
-am__unittest_flatindex_SOURCES_DIST = test/os/TestFlatIndex.cc
- at ENABLE_SERVER_TRUE@am_unittest_flatindex_OBJECTS = test/os/unittest_flatindex-TestFlatIndex.$(OBJEXT)
-unittest_flatindex_OBJECTS = $(am_unittest_flatindex_OBJECTS)
- at ENABLE_SERVER_TRUE@unittest_flatindex_DEPENDENCIES =  \
- at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
- at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
- at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
-unittest_flatindex_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(unittest_flatindex_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
 am_unittest_formatter_OBJECTS =  \
 	test/unittest_formatter-formatter.$(OBJEXT) \
 	rgw/unittest_formatter-rgw_formats.$(OBJEXT)
@@ -4941,8 +5644,10 @@ unittest_librados_config_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_librados_config_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
-am__unittest_librbd_SOURCES_DIST = test/librbd/test_main.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_unittest_librbd_OBJECTS = test/librbd/unittest_librbd-test_main.$(OBJEXT)
+am__unittest_librbd_SOURCES_DIST = test/librbd/test_main.cc \
+	test/librbd/test_mock_fixture.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_unittest_librbd_OBJECTS = test/librbd/unittest_librbd-test_main.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/unittest_librbd-test_mock_fixture.$(OBJEXT)
 unittest_librbd_OBJECTS = $(am_unittest_librbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_DEPENDENCIES = librbd_test.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la \
@@ -4955,8 +5660,7 @@ unittest_librbd_OBJECTS = $(am_unittest_librbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBOSDC) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_161)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD)
 unittest_librbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_librbd_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5068,6 +5772,15 @@ unittest_osdscrub_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_osdscrub_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am__unittest_pageset_SOURCES_DIST = test/test_pageset.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_unittest_pageset_OBJECTS = test/unittest_pageset-test_pageset.$(OBJEXT)
+unittest_pageset_OBJECTS = $(am_unittest_pageset_OBJECTS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pageset_DEPENDENCIES =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15)
+unittest_pageset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_pageset_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am_unittest_perf_counters_OBJECTS =  \
 	test/unittest_perf_counters-perf_counters.$(OBJEXT)
 unittest_perf_counters_OBJECTS = $(am_unittest_perf_counters_OBJECTS)
@@ -5098,6 +5811,15 @@ unittest_prebufferedstreambuf_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_prebufferedstreambuf_CXXFLAGS) $(CXXFLAGS) \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+am_unittest_prioritized_queue_OBJECTS = test/common/unittest_prioritized_queue-test_prioritized_queue.$(OBJEXT)
+unittest_prioritized_queue_OBJECTS =  \
+	$(am_unittest_prioritized_queue_OBJECTS)
+unittest_prioritized_queue_DEPENDENCIES = $(am__DEPENDENCIES_15) \
+	$(am__DEPENDENCIES_10)
+unittest_prioritized_queue_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_prioritized_queue_CXXFLAGS) $(CXXFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__unittest_rbd_replay_SOURCES_DIST = test/test_rbd_replay.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at am_unittest_rbd_replay_OBJECTS = test/unittest_rbd_replay-test_rbd_replay.$(OBJEXT)
 unittest_rbd_replay_OBJECTS = $(am_unittest_rbd_replay_OBJECTS)
@@ -5121,6 +5843,31 @@ unittest_readahead_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_readahead_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am__unittest_rocksdb_option_SOURCES_DIST =  \
+	test/objectstore/TestRocksdbOptionParse.cc
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am_unittest_rocksdb_option_OBJECTS = test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.$(OBJEXT)
+unittest_rocksdb_option_OBJECTS =  \
+	$(am_unittest_rocksdb_option_OBJECTS)
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at unittest_rocksdb_option_DEPENDENCIES = $(am__DEPENDENCIES_5) \
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_10)
+unittest_rocksdb_option_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_rocksdb_option_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am__unittest_rocksdb_option_static_SOURCES_DIST =  \
+	test/objectstore/TestRocksdbOptionParse.cc
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am_unittest_rocksdb_option_static_OBJECTS = test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.$(OBJEXT)
+unittest_rocksdb_option_static_OBJECTS =  \
+	$(am_unittest_rocksdb_option_static_OBJECTS)
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at unittest_rocksdb_option_static_DEPENDENCIES = $(am__DEPENDENCIES_5) \
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	rocksdb/librocksdb.la
+unittest_rocksdb_option_static_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_rocksdb_option_static_CXXFLAGS) $(CXXFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__unittest_run_cmd_SOURCES_DIST = test/run_cmd.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at am_unittest_run_cmd_OBJECTS = test/unittest_run_cmd-run_cmd.$(OBJEXT)
 unittest_run_cmd_OBJECTS = $(am_unittest_run_cmd_OBJECTS)
@@ -5220,6 +5967,14 @@ unittest_strtol_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_strtol_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am_unittest_subprocess_OBJECTS =  \
+	test/unittest_subprocess-test_subprocess.$(OBJEXT)
+unittest_subprocess_OBJECTS = $(am_unittest_subprocess_OBJECTS)
+unittest_subprocess_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_15)
+unittest_subprocess_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_subprocess_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am_unittest_tableformatter_OBJECTS = test/common/unittest_tableformatter-test_tableformatter.$(OBJEXT)
 unittest_tableformatter_OBJECTS =  \
 	$(am_unittest_tableformatter_OBJECTS)
@@ -5272,6 +6027,13 @@ unittest_workqueue_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_workqueue_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am_unittest_xlist_OBJECTS = test/unittest_xlist-test_xlist.$(OBJEXT)
+unittest_xlist_OBJECTS = $(am_unittest_xlist_OBJECTS)
+unittest_xlist_DEPENDENCIES = $(am__DEPENDENCIES_15) $(LIBCOMMON)
+unittest_xlist_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_xlist_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am__xio_client_SOURCES_DIST = test/messenger/xio_client.cc \
 	test/messenger/xio_dispatcher.cc
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at am_xio_client_OBJECTS = test/messenger/xio_client-xio_client.$(OBJEXT) \
@@ -5302,7 +6064,8 @@ xio_server_OBJECTS = $(am_xio_server_OBJECTS)
 xio_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(xio_server_CXXFLAGS) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-SCRIPTS = $(bin_SCRIPTS) $(ceph_libexec_SCRIPTS) $(ceph_sbin_SCRIPTS) \
+SCRIPTS = $(bin_SCRIPTS) $(ceph_libexec_SCRIPTS) \
+	$(ceph_monstore_update_crush_SCRIPTS) $(ceph_sbin_SCRIPTS) \
 	$(dist_bin_SCRIPTS) $(sbin_SCRIPTS) $(shell_common_SCRIPTS) \
 	$(su_sbin_SCRIPTS)
 AM_V_P = $(am__v_P_ at AM_V@)
@@ -5378,22 +6141,27 @@ am__v_CCAS_1 =
 SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(libcls_replica_log_client_a_SOURCES) \
 	$(libcls_statelog_client_a_SOURCES) \
+	$(libcls_timeindex_client_a_SOURCES) \
 	$(libcls_user_client_a_SOURCES) \
 	$(libcls_version_client_a_SOURCES) $(libos_zfs_a_SOURCES) \
 	$(libarch_la_SOURCES) $(libauth_la_SOURCES) \
 	$(libcephfs_la_SOURCES) $(libcephfs_jni_la_SOURCES) \
 	$(libcivetweb_la_SOURCES) $(libclient_la_SOURCES) \
-	$(libclient_fuse_la_SOURCES) $(libcls_hello_la_SOURCES) \
+	$(libclient_fuse_la_SOURCES) $(libcls_cephfs_la_SOURCES) \
+	$(libcls_cephfs_client_la_SOURCES) $(libcls_hello_la_SOURCES) \
 	$(libcls_kvs_la_SOURCES) $(libcls_lock_la_SOURCES) \
 	$(libcls_lock_client_la_SOURCES) $(libcls_log_la_SOURCES) \
+	$(libcls_numops_la_SOURCES) $(libcls_numops_client_la_SOURCES) \
 	$(libcls_rbd_la_SOURCES) $(libcls_rbd_client_la_SOURCES) \
 	$(libcls_refcount_la_SOURCES) \
 	$(libcls_refcount_client_la_SOURCES) \
 	$(libcls_replica_log_la_SOURCES) $(libcls_rgw_la_SOURCES) \
 	$(libcls_rgw_client_la_SOURCES) $(libcls_statelog_la_SOURCES) \
-	$(libcls_user_la_SOURCES) $(libcls_version_la_SOURCES) \
-	$(libcommon_la_SOURCES) $(libcommon_api_la_SOURCES) \
-	$(libcommon_crc_la_SOURCES) $(libcommon_internal_la_SOURCES) \
+	$(libcls_timeindex_la_SOURCES) $(libcls_user_la_SOURCES) \
+	$(libcls_version_la_SOURCES) $(libcommon_la_SOURCES) \
+	$(libcommon_crc_la_SOURCES) \
+	$(libcommon_crc_aarch64_la_SOURCES) \
+	$(libcommon_internal_la_SOURCES) $(libcompressor_la_SOURCES) \
 	$(libcrush_la_SOURCES) $(libec_example_la_SOURCES) \
 	$(libec_fail_to_initialize_la_SOURCES) \
 	$(libec_fail_to_register_la_SOURCES) $(libec_hangs_la_SOURCES) \
@@ -5404,24 +6172,35 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(libec_jerasure_sse4_la_SOURCES) $(libec_lrc_la_SOURCES) \
 	$(libec_missing_entry_point_la_SOURCES) \
 	$(libec_missing_version_la_SOURCES) $(libec_shec_la_SOURCES) \
+	$(libec_shec_generic_la_SOURCES) $(libec_shec_neon_la_SOURCES) \
+	$(libec_shec_sse3_la_SOURCES) $(libec_shec_sse4_la_SOURCES) \
 	$(libec_test_jerasure_generic_la_SOURCES) \
 	$(libec_test_jerasure_neon_la_SOURCES) \
 	$(libec_test_jerasure_sse3_la_SOURCES) \
 	$(libec_test_jerasure_sse4_la_SOURCES) \
+	$(libec_test_shec_generic_la_SOURCES) \
+	$(libec_test_shec_neon_la_SOURCES) \
+	$(libec_test_shec_sse3_la_SOURCES) \
+	$(libec_test_shec_sse4_la_SOURCES) \
 	$(liberasure_code_la_SOURCES) $(libglobal_la_SOURCES) \
 	$(libjson_spirit_la_SOURCES) $(libkrbd_la_SOURCES) \
 	$(liblog_la_SOURCES) $(libmds_la_SOURCES) $(libmon_la_SOURCES) \
 	$(libmon_types_la_SOURCES) $(libmsg_la_SOURCES) \
 	$(libos_la_SOURCES) $(libos_rocksdb_la_SOURCES) \
+	$(libos_tp_la_SOURCES) $(nodist_libos_tp_la_SOURCES) \
 	$(libos_types_la_SOURCES) $(libosd_la_SOURCES) \
+	$(libosd_tp_la_SOURCES) $(nodist_libosd_tp_la_SOURCES) \
 	$(libosd_types_la_SOURCES) $(libosdc_la_SOURCES) \
 	$(libperfglue_la_SOURCES) $(librados_la_SOURCES) \
 	$(librados_api_la_SOURCES) $(librados_internal_la_SOURCES) \
-	$(librados_test_stub_la_SOURCES) $(libradosstriper_la_SOURCES) \
+	$(librados_test_stub_la_SOURCES) $(librados_tp_la_SOURCES) \
+	$(nodist_librados_tp_la_SOURCES) $(libradosstriper_la_SOURCES) \
 	$(libradosstripertest_la_SOURCES) $(libradostest_la_SOURCES) \
 	$(librbd_la_SOURCES) $(librbd_api_la_SOURCES) \
 	$(librbd_internal_la_SOURCES) $(librbd_replay_la_SOURCES) \
-	$(librbd_replay_ios_la_SOURCES) $(librbd_test_la_SOURCES) \
+	$(librbd_replay_ios_la_SOURCES) \
+	$(librbd_replay_types_la_SOURCES) $(librbd_test_la_SOURCES) \
+	$(librbd_tp_la_SOURCES) $(nodist_librbd_tp_la_SOURCES) \
 	$(librbd_types_la_SOURCES) $(librgw_la_SOURCES) \
 	$(libsecret_la_SOURCES) $(libsystest_la_SOURCES) \
 	$(ceph_authtool_SOURCES) $(ceph_client_debug_SOURCES) \
@@ -5435,7 +6214,10 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(ceph_erasure_code_benchmark_SOURCES) \
 	$(ceph_erasure_code_non_regression_SOURCES) \
 	$(ceph_kvstorebench_SOURCES) \
-	$(ceph_multi_stress_watch_SOURCES) $(ceph_omapbench_SOURCES) \
+	$(ceph_multi_stress_watch_SOURCES) \
+	$(ceph_objectstore_bench_SOURCES) $(ceph_omapbench_SOURCES) \
+	$(ceph_perf_local_SOURCES) $(ceph_perf_msgr_client_SOURCES) \
+	$(ceph_perf_msgr_server_SOURCES) \
 	$(ceph_perf_objectstore_SOURCES) $(ceph_psim_SOURCES) \
 	$(ceph_radosacl_SOURCES) $(ceph_rgw_jsonparser_SOURCES) \
 	$(ceph_rgw_multiparser_SOURCES) $(ceph_scratchtool_SOURCES) \
@@ -5446,8 +6228,8 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(ceph_test_c_headers_SOURCES) \
 	$(ceph_test_cfuse_cache_invalidate_SOURCES) \
 	$(ceph_test_cls_hello_SOURCES) $(ceph_test_cls_lock_SOURCES) \
-	$(ceph_test_cls_log_SOURCES) $(ceph_test_cls_rbd_SOURCES) \
-	$(ceph_test_cls_refcount_SOURCES) \
+	$(ceph_test_cls_log_SOURCES) $(ceph_test_cls_numops_SOURCES) \
+	$(ceph_test_cls_rbd_SOURCES) $(ceph_test_cls_refcount_SOURCES) \
 	$(ceph_test_cls_replica_log_SOURCES) \
 	$(ceph_test_cls_rgw_SOURCES) $(ceph_test_cls_rgw_log_SOURCES) \
 	$(ceph_test_cls_rgw_meta_SOURCES) \
@@ -5460,6 +6242,7 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(ceph_test_filestore_idempotent_sequence_SOURCES) \
 	$(ceph_test_get_blkdev_size_SOURCES) \
 	$(ceph_test_ioctls_SOURCES) $(ceph_test_keys_SOURCES) \
+	$(ceph_test_keyvaluedb_SOURCES) \
 	$(ceph_test_keyvaluedb_atomicity_SOURCES) \
 	$(ceph_test_keyvaluedb_iterators_SOURCES) \
 	$(ceph_test_libcephfs_SOURCES) $(ceph_test_librbd_SOURCES) \
@@ -5493,24 +6276,25 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(ceph_test_rados_striper_api_striping_SOURCES) \
 	$(ceph_test_rados_watch_notify_SOURCES) \
 	$(ceph_test_rewrite_latency_SOURCES) \
-	$(ceph_test_rgw_manifest_SOURCES) \
+	$(ceph_test_rgw_manifest_SOURCES) $(ceph_test_rgw_obj_SOURCES) \
 	$(ceph_test_signal_handlers_SOURCES) \
 	$(ceph_test_snap_mapper_SOURCES) \
 	$(ceph_test_stress_watch_SOURCES) $(ceph_test_timers_SOURCES) \
 	$(ceph_test_trans_SOURCES) $(ceph_tpbench_SOURCES) \
 	$(ceph_xattr_bench_SOURCES) $(cephfs_SOURCES) \
-	$(cephfs_journal_tool_SOURCES) $(cephfs_table_tool_SOURCES) \
-	$(crushtool_SOURCES) $(get_command_descriptions_SOURCES) \
-	$(librados_config_SOURCES) $(monmaptool_SOURCES) \
-	$(mount_ceph_SOURCES) $(osdmaptool_SOURCES) $(rados_SOURCES) \
-	$(radosgw_SOURCES) $(radosgw_admin_SOURCES) $(rbd_SOURCES) \
-	$(rbd_fuse_SOURCES) $(rbd_replay_SOURCES) \
-	$(rbd_replay_prep_SOURCES) $(rest_bench_SOURCES) \
-	$(simple_client_SOURCES) $(simple_server_SOURCES) \
-	$(test_build_libcephfs_SOURCES) \
+	$(cephfs_data_scan_SOURCES) $(cephfs_journal_tool_SOURCES) \
+	$(cephfs_table_tool_SOURCES) $(crushtool_SOURCES) \
+	$(get_command_descriptions_SOURCES) $(librados_config_SOURCES) \
+	$(monmaptool_SOURCES) $(mount_ceph_SOURCES) \
+	$(osdmaptool_SOURCES) $(rados_SOURCES) $(radosgw_SOURCES) \
+	$(radosgw_admin_SOURCES) $(radosgw_object_expirer_SOURCES) \
+	$(rbd_SOURCES) $(rbd_fuse_SOURCES) $(rbd_replay_SOURCES) \
+	$(rbd_replay_prep_SOURCES) $(simple_client_SOURCES) \
+	$(simple_server_SOURCES) $(test_build_libcephfs_SOURCES) \
 	$(test_build_libcommon_SOURCES) $(test_build_librados_SOURCES) \
 	$(test_build_librgw_SOURCES) $(unittest_addrs_SOURCES) \
 	$(unittest_admin_socket_SOURCES) $(unittest_arch_SOURCES) \
+	$(unittest_async_compressor_SOURCES) \
 	$(unittest_base64_SOURCES) $(unittest_bit_vector_SOURCES) \
 	$(unittest_blkdev_SOURCES) $(unittest_bloom_filter_SOURCES) \
 	$(unittest_bufferlist_SOURCES) \
@@ -5533,14 +6317,16 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(unittest_erasure_code_plugin_isa_SOURCES) \
 	$(unittest_erasure_code_plugin_jerasure_SOURCES) \
 	$(unittest_erasure_code_plugin_lrc_SOURCES) \
+	$(unittest_erasure_code_plugin_shec_SOURCES) \
 	$(unittest_erasure_code_shec_SOURCES) \
 	$(unittest_erasure_code_shec_all_SOURCES) \
+	$(unittest_erasure_code_shec_arguments_SOURCES) \
 	$(unittest_erasure_code_shec_thread_SOURCES) \
-	$(unittest_escape_SOURCES) $(unittest_flatindex_SOURCES) \
-	$(unittest_formatter_SOURCES) $(unittest_gather_SOURCES) \
-	$(unittest_heartbeatmap_SOURCES) $(unittest_histogram_SOURCES) \
-	$(unittest_hitset_SOURCES) $(unittest_io_priority_SOURCES) \
-	$(unittest_ipaddr_SOURCES) $(unittest_lfnindex_SOURCES) \
+	$(unittest_escape_SOURCES) $(unittest_formatter_SOURCES) \
+	$(unittest_gather_SOURCES) $(unittest_heartbeatmap_SOURCES) \
+	$(unittest_histogram_SOURCES) $(unittest_hitset_SOURCES) \
+	$(unittest_io_priority_SOURCES) $(unittest_ipaddr_SOURCES) \
+	$(unittest_lfnindex_SOURCES) \
 	$(unittest_libcephfs_config_SOURCES) \
 	$(unittest_librados_SOURCES) \
 	$(unittest_librados_config_SOURCES) $(unittest_librbd_SOURCES) \
@@ -5550,9 +6336,13 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(unittest_mon_pgmap_SOURCES) $(unittest_on_exit_SOURCES) \
 	$(unittest_osd_osdcap_SOURCES) $(unittest_osd_types_SOURCES) \
 	$(unittest_osdmap_SOURCES) $(unittest_osdscrub_SOURCES) \
-	$(unittest_perf_counters_SOURCES) $(unittest_pglog_SOURCES) \
+	$(unittest_pageset_SOURCES) $(unittest_perf_counters_SOURCES) \
+	$(unittest_pglog_SOURCES) \
 	$(unittest_prebufferedstreambuf_SOURCES) \
+	$(unittest_prioritized_queue_SOURCES) \
 	$(unittest_rbd_replay_SOURCES) $(unittest_readahead_SOURCES) \
+	$(unittest_rocksdb_option_SOURCES) \
+	$(unittest_rocksdb_option_static_SOURCES) \
 	$(unittest_run_cmd_SOURCES) $(unittest_safe_io_SOURCES) \
 	$(unittest_shared_cache_SOURCES) \
 	$(unittest_sharedptr_registry_SOURCES) \
@@ -5560,14 +6350,16 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(unittest_sloppy_crc_map_SOURCES) \
 	$(unittest_str_list_SOURCES) $(unittest_str_map_SOURCES) \
 	$(unittest_striper_SOURCES) $(unittest_strtol_SOURCES) \
+	$(unittest_subprocess_SOURCES) \
 	$(unittest_tableformatter_SOURCES) \
 	$(unittest_texttable_SOURCES) $(unittest_throttle_SOURCES) \
 	$(unittest_utf8_SOURCES) $(unittest_util_SOURCES) \
-	$(unittest_workqueue_SOURCES) $(xio_client_SOURCES) \
-	$(xio_server_SOURCES)
+	$(unittest_workqueue_SOURCES) $(unittest_xlist_SOURCES) \
+	$(xio_client_SOURCES) $(xio_server_SOURCES)
 DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__libcls_replica_log_client_a_SOURCES_DIST) \
 	$(am__libcls_statelog_client_a_SOURCES_DIST) \
+	$(am__libcls_timeindex_client_a_SOURCES_DIST) \
 	$(am__libcls_user_client_a_SOURCES_DIST) \
 	$(am__libcls_version_client_a_SOURCES_DIST) \
 	$(am__libos_zfs_a_SOURCES_DIST) $(libarch_la_SOURCES) \
@@ -5576,11 +6368,15 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__libcivetweb_la_SOURCES_DIST) \
 	$(am__libclient_la_SOURCES_DIST) \
 	$(am__libclient_fuse_la_SOURCES_DIST) \
+	$(am__libcls_cephfs_la_SOURCES_DIST) \
+	$(am__libcls_cephfs_client_la_SOURCES_DIST) \
 	$(am__libcls_hello_la_SOURCES_DIST) \
 	$(am__libcls_kvs_la_SOURCES_DIST) \
 	$(am__libcls_lock_la_SOURCES_DIST) \
 	$(am__libcls_lock_client_la_SOURCES_DIST) \
 	$(am__libcls_log_la_SOURCES_DIST) \
+	$(am__libcls_numops_la_SOURCES_DIST) \
+	$(am__libcls_numops_client_la_SOURCES_DIST) \
 	$(am__libcls_rbd_la_SOURCES_DIST) \
 	$(am__libcls_rbd_client_la_SOURCES_DIST) \
 	$(am__libcls_refcount_la_SOURCES_DIST) \
@@ -5589,12 +6385,14 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__libcls_rgw_la_SOURCES_DIST) \
 	$(am__libcls_rgw_client_la_SOURCES_DIST) \
 	$(am__libcls_statelog_la_SOURCES_DIST) \
+	$(am__libcls_timeindex_la_SOURCES_DIST) \
 	$(am__libcls_user_la_SOURCES_DIST) \
 	$(am__libcls_version_la_SOURCES_DIST) $(libcommon_la_SOURCES) \
-	$(libcommon_api_la_SOURCES) \
 	$(am__libcommon_crc_la_SOURCES_DIST) \
+	$(am__libcommon_crc_aarch64_la_SOURCES_DIST) \
 	$(am__libcommon_internal_la_SOURCES_DIST) \
-	$(libcrush_la_SOURCES) $(am__libec_example_la_SOURCES_DIST) \
+	$(libcompressor_la_SOURCES) $(libcrush_la_SOURCES) \
+	$(am__libec_example_la_SOURCES_DIST) \
 	$(am__libec_fail_to_initialize_la_SOURCES_DIST) \
 	$(am__libec_fail_to_register_la_SOURCES_DIST) \
 	$(am__libec_hangs_la_SOURCES_DIST) \
@@ -5605,23 +6403,33 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(libec_jerasure_sse4_la_SOURCES) $(libec_lrc_la_SOURCES) \
 	$(am__libec_missing_entry_point_la_SOURCES_DIST) \
 	$(am__libec_missing_version_la_SOURCES_DIST) \
-	$(libec_shec_la_SOURCES) \
+	$(libec_shec_la_SOURCES) $(libec_shec_generic_la_SOURCES) \
+	$(libec_shec_neon_la_SOURCES) $(libec_shec_sse3_la_SOURCES) \
+	$(libec_shec_sse4_la_SOURCES) \
 	$(am__libec_test_jerasure_generic_la_SOURCES_DIST) \
 	$(am__libec_test_jerasure_neon_la_SOURCES_DIST) \
 	$(am__libec_test_jerasure_sse3_la_SOURCES_DIST) \
 	$(am__libec_test_jerasure_sse4_la_SOURCES_DIST) \
+	$(am__libec_test_shec_generic_la_SOURCES_DIST) \
+	$(am__libec_test_shec_neon_la_SOURCES_DIST) \
+	$(am__libec_test_shec_sse3_la_SOURCES_DIST) \
+	$(am__libec_test_shec_sse4_la_SOURCES_DIST) \
 	$(liberasure_code_la_SOURCES) $(libglobal_la_SOURCES) \
 	$(libjson_spirit_la_SOURCES) $(am__libkrbd_la_SOURCES_DIST) \
 	$(liblog_la_SOURCES) $(am__libmds_la_SOURCES_DIST) \
 	$(am__libmon_la_SOURCES_DIST) $(libmon_types_la_SOURCES) \
 	$(am__libmsg_la_SOURCES_DIST) $(am__libos_la_SOURCES_DIST) \
-	$(am__libos_rocksdb_la_SOURCES_DIST) $(libos_types_la_SOURCES) \
-	$(am__libosd_la_SOURCES_DIST) $(libosd_types_la_SOURCES) \
-	$(libosdc_la_SOURCES) $(am__libperfglue_la_SOURCES_DIST) \
+	$(am__libos_rocksdb_la_SOURCES_DIST) \
+	$(am__libos_tp_la_SOURCES_DIST) \
+	$(am__libos_types_la_SOURCES_DIST) \
+	$(am__libosd_la_SOURCES_DIST) $(am__libosd_tp_la_SOURCES_DIST) \
+	$(libosd_types_la_SOURCES) $(libosdc_la_SOURCES) \
+	$(am__libperfglue_la_SOURCES_DIST) \
 	$(am__librados_la_SOURCES_DIST) \
 	$(am__librados_api_la_SOURCES_DIST) \
 	$(am__librados_internal_la_SOURCES_DIST) \
 	$(am__librados_test_stub_la_SOURCES_DIST) \
+	$(am__librados_tp_la_SOURCES_DIST) \
 	$(am__libradosstriper_la_SOURCES_DIST) \
 	$(am__libradosstripertest_la_SOURCES_DIST) \
 	$(am__libradostest_la_SOURCES_DIST) \
@@ -5630,7 +6438,9 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__librbd_internal_la_SOURCES_DIST) \
 	$(am__librbd_replay_la_SOURCES_DIST) \
 	$(am__librbd_replay_ios_la_SOURCES_DIST) \
-	$(am__librbd_test_la_SOURCES_DIST) $(librbd_types_la_SOURCES) \
+	$(am__librbd_replay_types_la_SOURCES_DIST) \
+	$(am__librbd_test_la_SOURCES_DIST) \
+	$(am__librbd_tp_la_SOURCES_DIST) $(librbd_types_la_SOURCES) \
 	$(am__librgw_la_SOURCES_DIST) $(libsecret_la_SOURCES) \
 	$(am__libsystest_la_SOURCES_DIST) $(ceph_authtool_SOURCES) \
 	$(am__ceph_client_debug_SOURCES_DIST) $(ceph_conf_SOURCES) \
@@ -5648,7 +6458,11 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__ceph_erasure_code_non_regression_SOURCES_DIST) \
 	$(am__ceph_kvstorebench_SOURCES_DIST) \
 	$(am__ceph_multi_stress_watch_SOURCES_DIST) \
+	$(am__ceph_objectstore_bench_SOURCES_DIST) \
 	$(am__ceph_omapbench_SOURCES_DIST) \
+	$(am__ceph_perf_local_SOURCES_DIST) \
+	$(am__ceph_perf_msgr_client_SOURCES_DIST) \
+	$(am__ceph_perf_msgr_server_SOURCES_DIST) \
 	$(am__ceph_perf_objectstore_SOURCES_DIST) $(ceph_psim_SOURCES) \
 	$(am__ceph_radosacl_SOURCES_DIST) \
 	$(am__ceph_rgw_jsonparser_SOURCES_DIST) \
@@ -5666,6 +6480,7 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__ceph_test_cls_hello_SOURCES_DIST) \
 	$(am__ceph_test_cls_lock_SOURCES_DIST) \
 	$(am__ceph_test_cls_log_SOURCES_DIST) \
+	$(am__ceph_test_cls_numops_SOURCES_DIST) \
 	$(am__ceph_test_cls_rbd_SOURCES_DIST) \
 	$(am__ceph_test_cls_refcount_SOURCES_DIST) \
 	$(am__ceph_test_cls_replica_log_SOURCES_DIST) \
@@ -5683,6 +6498,7 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(ceph_test_get_blkdev_size_SOURCES) \
 	$(am__ceph_test_ioctls_SOURCES_DIST) \
 	$(am__ceph_test_keys_SOURCES_DIST) \
+	$(am__ceph_test_keyvaluedb_SOURCES_DIST) \
 	$(am__ceph_test_keyvaluedb_atomicity_SOURCES_DIST) \
 	$(am__ceph_test_keyvaluedb_iterators_SOURCES_DIST) \
 	$(am__ceph_test_libcephfs_SOURCES_DIST) \
@@ -5722,6 +6538,7 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__ceph_test_rados_watch_notify_SOURCES_DIST) \
 	$(ceph_test_rewrite_latency_SOURCES) \
 	$(am__ceph_test_rgw_manifest_SOURCES_DIST) \
+	$(am__ceph_test_rgw_obj_SOURCES_DIST) \
 	$(ceph_test_signal_handlers_SOURCES) \
 	$(am__ceph_test_snap_mapper_SOURCES_DIST) \
 	$(am__ceph_test_stress_watch_SOURCES_DIST) \
@@ -5730,16 +6547,18 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__ceph_tpbench_SOURCES_DIST) \
 	$(am__ceph_xattr_bench_SOURCES_DIST) \
 	$(am__cephfs_SOURCES_DIST) \
+	$(am__cephfs_data_scan_SOURCES_DIST) \
 	$(am__cephfs_journal_tool_SOURCES_DIST) \
 	$(am__cephfs_table_tool_SOURCES_DIST) $(crushtool_SOURCES) \
 	$(am__get_command_descriptions_SOURCES_DIST) \
 	$(am__librados_config_SOURCES_DIST) $(monmaptool_SOURCES) \
 	$(am__mount_ceph_SOURCES_DIST) $(osdmaptool_SOURCES) \
 	$(am__rados_SOURCES_DIST) $(am__radosgw_SOURCES_DIST) \
-	$(am__radosgw_admin_SOURCES_DIST) $(am__rbd_SOURCES_DIST) \
-	$(am__rbd_fuse_SOURCES_DIST) $(am__rbd_replay_SOURCES_DIST) \
+	$(am__radosgw_admin_SOURCES_DIST) \
+	$(am__radosgw_object_expirer_SOURCES_DIST) \
+	$(am__rbd_SOURCES_DIST) $(am__rbd_fuse_SOURCES_DIST) \
+	$(am__rbd_replay_SOURCES_DIST) \
 	$(am__rbd_replay_prep_SOURCES_DIST) \
-	$(am__rest_bench_SOURCES_DIST) \
 	$(am__simple_client_SOURCES_DIST) \
 	$(am__simple_server_SOURCES_DIST) \
 	$(am__test_build_libcephfs_SOURCES_DIST) \
@@ -5747,7 +6566,8 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__test_build_librados_SOURCES_DIST) \
 	$(am__test_build_librgw_SOURCES_DIST) \
 	$(unittest_addrs_SOURCES) $(unittest_admin_socket_SOURCES) \
-	$(unittest_arch_SOURCES) $(am__unittest_base64_SOURCES_DIST) \
+	$(unittest_arch_SOURCES) $(unittest_async_compressor_SOURCES) \
+	$(am__unittest_base64_SOURCES_DIST) \
 	$(unittest_bit_vector_SOURCES) $(unittest_blkdev_SOURCES) \
 	$(unittest_bloom_filter_SOURCES) \
 	$(unittest_bufferlist_SOURCES) \
@@ -5771,13 +6591,14 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__unittest_erasure_code_plugin_isa_SOURCES_DIST) \
 	$(am__unittest_erasure_code_plugin_jerasure_SOURCES_DIST) \
 	$(am__unittest_erasure_code_plugin_lrc_SOURCES_DIST) \
+	$(am__unittest_erasure_code_plugin_shec_SOURCES_DIST) \
 	$(am__unittest_erasure_code_shec_SOURCES_DIST) \
 	$(am__unittest_erasure_code_shec_all_SOURCES_DIST) \
+	$(am__unittest_erasure_code_shec_arguments_SOURCES_DIST) \
 	$(am__unittest_erasure_code_shec_thread_SOURCES_DIST) \
-	$(unittest_escape_SOURCES) \
-	$(am__unittest_flatindex_SOURCES_DIST) \
-	$(unittest_formatter_SOURCES) $(unittest_gather_SOURCES) \
-	$(unittest_heartbeatmap_SOURCES) $(unittest_histogram_SOURCES) \
+	$(unittest_escape_SOURCES) $(unittest_formatter_SOURCES) \
+	$(unittest_gather_SOURCES) $(unittest_heartbeatmap_SOURCES) \
+	$(unittest_histogram_SOURCES) \
 	$(am__unittest_hitset_SOURCES_DIST) \
 	$(unittest_io_priority_SOURCES) $(unittest_ipaddr_SOURCES) \
 	$(am__unittest_lfnindex_SOURCES_DIST) \
@@ -5794,11 +6615,15 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__unittest_osd_osdcap_SOURCES_DIST) \
 	$(unittest_osd_types_SOURCES) $(unittest_osdmap_SOURCES) \
 	$(am__unittest_osdscrub_SOURCES_DIST) \
+	$(am__unittest_pageset_SOURCES_DIST) \
 	$(unittest_perf_counters_SOURCES) \
 	$(am__unittest_pglog_SOURCES_DIST) \
 	$(unittest_prebufferedstreambuf_SOURCES) \
+	$(unittest_prioritized_queue_SOURCES) \
 	$(am__unittest_rbd_replay_SOURCES_DIST) \
 	$(unittest_readahead_SOURCES) \
+	$(am__unittest_rocksdb_option_SOURCES_DIST) \
+	$(am__unittest_rocksdb_option_static_SOURCES_DIST) \
 	$(am__unittest_run_cmd_SOURCES_DIST) \
 	$(unittest_safe_io_SOURCES) $(unittest_shared_cache_SOURCES) \
 	$(unittest_sharedptr_registry_SOURCES) \
@@ -5807,11 +6632,12 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(unittest_sloppy_crc_map_SOURCES) \
 	$(unittest_str_list_SOURCES) $(unittest_str_map_SOURCES) \
 	$(unittest_striper_SOURCES) $(unittest_strtol_SOURCES) \
+	$(unittest_subprocess_SOURCES) \
 	$(unittest_tableformatter_SOURCES) \
 	$(unittest_texttable_SOURCES) $(unittest_throttle_SOURCES) \
 	$(unittest_utf8_SOURCES) $(unittest_util_SOURCES) \
-	$(unittest_workqueue_SOURCES) $(am__xio_client_SOURCES_DIST) \
-	$(am__xio_server_SOURCES_DIST)
+	$(unittest_workqueue_SOURCES) $(unittest_xlist_SOURCES) \
+	$(am__xio_client_SOURCES_DIST) $(am__xio_server_SOURCES_DIST)
 RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
 	ctags-recursive dvi-recursive html-recursive info-recursive \
 	install-data-recursive install-dvi-recursive \
@@ -5825,15 +6651,16 @@ am__can_run_installinfo = \
     n|no|NO) false;; \
     *) (install-info --version) >/dev/null 2>&1;; \
   esac
-am__python_PYTHON_DIST = pybind/ceph_argparse.py pybind/rados.py \
-	pybind/rbd.py pybind/cephfs.py pybind/ceph_rest_api.py
+am__python_PYTHON_DIST = pybind/ceph_argparse.py pybind/ceph_daemon.py \
+	pybind/rados.py pybind/rbd.py pybind/cephfs.py \
+	pybind/ceph_rest_api.py
 am__py_compile = PYTHON=$(PYTHON) $(SHELL) $(py_compile)
 am__pep3147_tweak = \
   sed -e 's|\.py$$||' -e 's|[^/]*$$|__pycache__/&.*.py|'
 py_compile = $(top_srcdir)/py-compile
-DATA = $(bash_completion_DATA) $(doc_DATA) $(libcephfs_include_DATA) \
-	$(librbd_include_DATA) $(rados_include_DATA) \
-	$(radosstriper_include_DATA)
+DATA = $(bash_completion_DATA) $(dist_noinst_DATA) $(doc_DATA) \
+	$(libcephfs_include_DATA) $(librbd_include_DATA) \
+	$(rados_include_DATA) $(radosstriper_include_DATA)
 am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	auth/cephx/CephxAuthorizeHandler.h auth/cephx/CephxKeyServer.h \
 	auth/cephx/CephxProtocol.h auth/cephx/CephxClientHandler.h \
@@ -5855,45 +6682,47 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	auth/RotatingKeyRing.h auth/Crypto.h crush/CrushCompiler.h \
 	crush/CrushTester.h crush/CrushTreeDumper.h \
 	crush/CrushWrapper.h crush/CrushWrapper.i crush/builder.h \
-	crush/crush.h crush/grammar.h crush/hash.h \
-	crush/crush_ln_table.h crush/mapper.h crush/sample.txt \
+	crush/crush.h crush/crush_compat.h crush/crush_ln_table.h \
+	crush/grammar.h crush/hash.h crush/mapper.h crush/sample.txt \
 	crush/types.h mon/AuthMonitor.h mon/DataHealthService.h \
 	mon/Elector.h mon/LogMonitor.h mon/ConfigKeyService.h \
 	mon/HealthMonitor.h mon/HealthService.h mon/MDSMonitor.h \
 	mon/MonmapMonitor.h mon/MonCap.h mon/MonClient.h \
 	mon/MonCommands.h mon/DumplingMonCommands.h mon/MonMap.h \
-	mon/Monitor.h mon/MonitorStore.h mon/MonitorDBStore.h \
+	mon/Monitor.h mon/MonitorDBStore.h mon/MonOpRequest.h \
 	mon/OSDMonitor.h mon/PGMap.h mon/PGMonitor.h mon/Paxos.h \
 	mon/PaxosService.h mon/QuorumService.h mon/Session.h \
 	mon/mon_types.h mds/inode_backtrace.h mds/flock.h mds/locks.c \
 	mds/locks.h mds/CDentry.h mds/CDir.h mds/CInode.h \
 	mds/Capability.h mds/InoTable.h mds/JournalPointer.h \
 	mds/LocalLock.h mds/Locker.h mds/LogEvent.h mds/LogSegment.h \
-	mds/MDBalancer.h mds/MDCache.h mds/RecoveryQueue.h mds/MDLog.h \
-	mds/MDS.h mds/Beacon.h mds/MDSContext.h mds/MDSAuthCaps.h \
-	mds/MDSMap.h mds/MDSTable.h mds/MDSTableServer.h \
-	mds/MDSTableClient.h mds/Mutation.h mds/Migrator.h \
-	mds/ScatterLock.h mds/Server.h mds/SessionMap.h \
-	mds/SimpleLock.h mds/SnapClient.h mds/SnapRealm.h \
-	mds/SnapServer.h mds/mds_table_types.h mds/mdstypes.h \
-	mds/snap.h mds/MDSContinuation.h mds/events/ECommitted.h \
-	mds/events/EExport.h mds/events/EFragment.h \
-	mds/events/EImportFinish.h mds/events/EImportStart.h \
-	mds/events/EMetaBlob.h mds/events/ENoOp.h mds/events/EOpen.h \
+	mds/MDBalancer.h mds/MDCache.h mds/RecoveryQueue.h \
+	mds/StrayManager.h mds/MDLog.h mds/MDSRank.h mds/MDSDaemon.h \
+	mds/Beacon.h mds/MDSContext.h mds/MDSAuthCaps.h mds/MDSMap.h \
+	mds/MDSTable.h mds/MDSTableServer.h mds/MDSTableClient.h \
+	mds/Mutation.h mds/Migrator.h mds/ScatterLock.h mds/Server.h \
+	mds/SessionMap.h mds/SimpleLock.h mds/SnapClient.h \
+	mds/SnapRealm.h mds/SnapServer.h mds/mds_table_types.h \
+	mds/mdstypes.h mds/snap.h mds/MDSContinuation.h \
+	mds/events/ECommitted.h mds/events/EExport.h \
+	mds/events/EFragment.h mds/events/EImportFinish.h \
+	mds/events/EImportStart.h mds/events/EMetaBlob.h \
+	mds/events/ENoOp.h mds/events/EOpen.h \
 	mds/events/EResetJournal.h mds/events/ESession.h \
 	mds/events/ESessions.h mds/events/ESlaveUpdate.h \
 	mds/events/ESubtreeMap.h mds/events/ETableClient.h \
 	mds/events/ETableServer.h mds/events/EUpdate.h \
-	os/btrfs_ioctl.h os/chain_xattr.h os/BtrfsFileStoreBackend.h \
+	os/btrfs_ioctl.h os/chain_xattr.h os/newstore/newstore_types.h \
+	os/newstore/NewStore.h os/BtrfsFileStoreBackend.h \
 	os/CollectionIndex.h os/DBObjectMap.h os/GenericObjectMap.h \
-	os/FileJournal.h os/FileStore.h os/FlatIndex.h os/FDCache.h \
-	os/GenericFileStoreBackend.h os/HashIndex.h os/IndexManager.h \
-	os/Journal.h os/JournalingObjectStore.h os/KeyValueDB.h \
-	os/LevelDBStore.h os/LFNIndex.h os/MemStore.h \
+	os/FileJournal.h os/FileStore.h os/FDCache.h os/fs/FS.h \
+	os/fs/XFS.h os/GenericFileStoreBackend.h os/HashIndex.h \
+	os/IndexManager.h os/Journal.h os/JournalingObjectStore.h \
+	os/KeyValueDB.h os/LevelDBStore.h os/LFNIndex.h os/MemStore.h \
 	os/KeyValueStore.h os/ObjectMap.h os/ObjectStore.h \
-	os/SequencerPosition.h os/WBThrottle.h \
+	os/PageSet.h os/SequencerPosition.h os/WBThrottle.h \
 	os/XfsFileStoreBackend.h os/ZFSFileStoreBackend.h \
-	os/RocksDBStore.h os/ZFS.h os/KineticStore.h osd/Ager.h \
+	os/RocksDBStore.h os/ZFS.h os/KineticStore.h \
 	osd/ClassHandler.h osd/HitSet.h osd/OSD.h osd/OSDCap.h \
 	osd/OSDMap.h osd/ObjectVersioner.h osd/OpRequest.h \
 	osd/SnapMapper.h osd/PG.h osd/PGLog.h osd/ReplicatedPG.h \
@@ -5919,7 +6748,7 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	erasure-code/lrc/ErasureCodeLrc.h \
 	erasure-code/shec/ErasureCodeShec.h \
 	erasure-code/shec/ErasureCodeShecTableCache.h \
-	erasure-code/shec/shec.h erasure-code/isa/ErasureCodeIsa.h \
+	erasure-code/isa/ErasureCodeIsa.h \
 	erasure-code/isa/ErasureCodeIsaTableCache.h \
 	erasure-code/isa/xor_op.h \
 	erasure-code/isa/isa-l/erasure_code/ec_base.h \
@@ -5931,12 +6760,13 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	erasure-code/ErasureCodePlugin.h osdc/Filer.h osdc/Journaler.h \
 	osdc/ObjectCacher.h osdc/Objecter.h osdc/Striper.h \
 	osdc/WritebackHandler.h client/Client.h client/Dentry.h \
-	client/Dir.h client/Fh.h client/Inode.h client/MetaRequest.h \
-	client/MetaSession.h client/ClientSnapRealm.h \
-	client/SyntheticClient.h client/Trace.h client/ioctl.h \
-	client/ObjecterWriteback.h client/fuse_ll.h global/pidfile.h \
-	global/global_init.h global/global_context.h \
-	global/signal_handler.h json_spirit/json_spirit.h \
+	client/Dir.h client/Fh.h client/Inode.h client/InodeRef.h \
+	client/MetaRequest.h client/MetaSession.h \
+	client/ClientSnapRealm.h client/SyntheticClient.h \
+	client/Trace.h client/ioctl.h client/ObjecterWriteback.h \
+	client/fuse_ll.h global/pidfile.h global/global_init.h \
+	global/global_context.h global/signal_handler.h \
+	json_spirit/json_spirit.h \
 	json_spirit/json_spirit_error_position.h \
 	json_spirit/json_spirit_reader.h \
 	json_spirit/json_spirit_reader_template.h \
@@ -5950,9 +6780,10 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	perfglue/cpu_profiler.h perfglue/heap_profiler.h \
 	common/bloom_filter.hpp common/sctp_crc32.h \
 	common/crc32c_intel_baseline.h common/crc32c_intel_fast.h \
-	common/BackTrace.h common/RefCountedObj.h \
-	common/HeartbeatMap.h common/LogClient.h common/LogEntry.h \
-	common/Preforker.h common/SloppyCRCMap.h common/WorkQueue.h \
+	common/crc32c_aarch64.h common/BackTrace.h \
+	common/RefCountedObj.h common/HeartbeatMap.h \
+	common/LogClient.h common/LogEntry.h common/Preforker.h \
+	common/SloppyCRCMap.h common/WorkQueue.h \
 	common/PrioritizedQueue.h common/ceph_argparse.h \
 	common/ceph_context.h common/xattr.h common/blkdev.h \
 	common/compiler_extensions.h common/debug.h common/dout.h \
@@ -5982,20 +6813,21 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	common/linux_version.h common/module.h common/Continuation.h \
 	common/Readahead.h common/Cycles.h common/Initialize.h \
 	common/ContextCompletion.h common/bit_vector.hpp \
-	common/valgrind.h common/address_helper.h common/secret.h \
-	msg/Connection.h msg/Dispatcher.h msg/Message.h \
-	msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
-	msg/simple/Accepter.h msg/simple/DispatchQueue.h \
-	msg/simple/Pipe.h msg/simple/PipeConnection.h \
-	msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
-	msg/async/AsyncMessenger.h msg/async/Event.h \
-	msg/async/EventEpoll.h msg/async/EventSelect.h \
-	msg/async/net_handler.h msg/xio/DispatchStrategy.h \
-	msg/xio/FastStrategy.h msg/xio/QueueStrategy.h \
-	msg/xio/XioConnection.h msg/xio/XioInSeq.h \
-	msg/xio/XioMessenger.h msg/xio/XioMsg.h msg/xio/XioPool.h \
-	msg/xio/XioPortal.h msg/xio/XioSubmit.h messages/MAuth.h \
-	messages/MAuthReply.h messages/MCacheExpire.h \
+	common/SubProcess.h common/valgrind.h \
+	common/TracepointProvider.h common/address_helper.h \
+	common/secret.h msg/Connection.h msg/Dispatcher.h \
+	msg/Message.h msg/Messenger.h msg/SimplePolicyMessenger.h \
+	msg/msg_types.h msg/simple/Accepter.h \
+	msg/simple/DispatchQueue.h msg/simple/Pipe.h \
+	msg/simple/PipeConnection.h msg/simple/SimpleMessenger.h \
+	msg/async/AsyncConnection.h msg/async/AsyncMessenger.h \
+	msg/async/Event.h msg/async/EventEpoll.h \
+	msg/async/EventSelect.h msg/async/net_handler.h \
+	msg/xio/DispatchStrategy.h msg/xio/FastStrategy.h \
+	msg/xio/QueueStrategy.h msg/xio/XioConnection.h \
+	msg/xio/XioInSeq.h msg/xio/XioMessenger.h msg/xio/XioMsg.h \
+	msg/xio/XioPool.h msg/xio/XioPortal.h msg/xio/XioSubmit.h \
+	messages/MAuth.h messages/MAuthReply.h messages/MCacheExpire.h \
 	messages/MClientCaps.h messages/MClientCapRelease.h \
 	messages/MClientLease.h messages/MClientReconnect.h \
 	messages/MClientReply.h messages/MClientRequest.h \
@@ -6026,7 +6858,8 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	messages/MMonGetOSDMap.h messages/MMonGetVersion.h \
 	messages/MMonGetVersionReply.h messages/MMonGlobalID.h \
 	messages/MMonHealth.h messages/MMonJoin.h messages/MMonMap.h \
-	messages/MMonPaxos.h messages/MMonProbe.h messages/MMonScrub.h \
+	messages/MMonMetadata.h messages/MMonPaxos.h \
+	messages/MMonProbe.h messages/MMonScrub.h \
 	messages/MMonSubscribe.h messages/MMonSubscribeAck.h \
 	messages/MMonSync.h messages/MOSDAlive.h messages/MOSDBoot.h \
 	messages/MOSDFailure.h messages/MOSDMarkMeDown.h \
@@ -6058,27 +6891,29 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	include/cephfs/libcephfs.h include/ceph_features.h \
 	include/ceph_frag.h include/ceph_fs.h include/ceph_hash.h \
 	include/cmp.h include/color.h include/compat.h \
-	include/crc32c.h include/encoding.h include/err.h \
-	include/error.h include/filepath.h include/frag.h \
-	include/hash.h include/intarith.h include/interval_set.h \
-	include/int_types.h include/ipaddr.h include/krbd.h \
-	include/linux_fiemap.h include/lru.h include/msgr.h \
-	include/object.h include/page.h include/rangeset.h \
-	include/rados.h include/rbd_types.h include/statlite.h \
-	include/str_list.h include/str_map.h include/stringify.h \
-	include/types.h include/utime.h include/elist.h include/uuid.h \
-	include/xlist.h include/rados/librados.h \
-	include/rados/rados_types.h include/rados/rados_types.hpp \
-	include/rados/librados.hpp include/rados/librgw.h \
-	include/rados/page.h include/rados/crc32c.h \
-	include/rados/buffer.h include/radosstriper/libradosstriper.h \
+	include/sock_compat.h include/crc32c.h include/encoding.h \
+	include/err.h include/error.h include/filepath.h \
+	include/frag.h include/hash.h include/inline_memory.h \
+	include/intarith.h include/interval_set.h include/int_types.h \
+	include/ipaddr.h include/krbd.h include/linux_fiemap.h \
+	include/lru.h include/msgr.h include/object.h include/page.h \
+	include/rangeset.h include/rados.h include/rbd_types.h \
+	include/statlite.h include/str_list.h include/str_map.h \
+	include/stringify.h include/types.h include/utime.h \
+	include/elist.h include/uuid.h include/xlist.h \
+	include/compact_map.h include/compact_set.h \
+	include/rados/librados.h include/rados/rados_types.h \
+	include/rados/rados_types.hpp include/rados/librados.hpp \
+	include/rados/librgw.h include/rados/page.h \
+	include/rados/crc32c.h include/rados/buffer.h \
+	include/radosstriper/libradosstriper.h \
 	include/radosstriper/libradosstriper.hpp \
 	include/rbd/features.h include/rbd/librbd.h \
 	include/rbd/librbd.hpp include/rbd/object_map_types.h \
 	include/util.h include/stat.h include/on_exit.h \
 	include/memory.h include/rados/memory.h \
-	include/hash_namespace.h include/unordered_set.h \
-	include/unordered_map.h librados/snap_set_diff.h \
+	include/unordered_set.h include/unordered_map.h \
+	include/timegm.h librados/snap_set_diff.h \
 	librados/AioCompletionImpl.h librados/IoCtxImpl.h \
 	librados/PoolAsyncCompletionImpl.h librados/RadosClient.h \
 	librados/RadosXattrIter.h librados/ListObjectImpl.h \
@@ -6088,17 +6923,19 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	librbd/AsyncFlattenRequest.h librbd/AsyncObjectThrottle.h \
 	librbd/AsyncOperation.h librbd/AsyncRequest.h \
 	librbd/AsyncResizeRequest.h librbd/AsyncTrimRequest.h \
-	librbd/CopyupRequest.h librbd/ImageCtx.h librbd/ImageWatcher.h \
-	librbd/internal.h librbd/LibrbdWriteback.h librbd/ObjectMap.h \
-	librbd/parent_types.h librbd/SnapInfo.h librbd/TaskFinisher.h \
-	librbd/WatchNotifyTypes.h rgw/logrotate.conf rgw/rgw_acl.h \
-	rgw/rgw_acl_s3.h rgw/rgw_acl_swift.h rgw/rgw_client_io.h \
-	rgw/rgw_fcgi.h rgw/rgw_xml.h rgw/rgw_cache.h rgw/rgw_common.h \
-	rgw/rgw_cors.h rgw/rgw_cors_s3.h rgw/rgw_cors_swift.h \
-	rgw/rgw_string.h rgw/rgw_formats.h rgw/rgw_http_errors.h \
-	rgw/rgw_log.h rgw/rgw_loadgen.h rgw/rgw_multi.h \
-	rgw/rgw_policy_s3.h rgw/rgw_gc.h rgw/rgw_metadata.h \
-	rgw/rgw_multi_del.h rgw/rgw_op.h rgw/rgw_orphan.h \
+	librbd/CopyupRequest.h librbd/DiffIterate.h librbd/ImageCtx.h \
+	librbd/ImageWatcher.h librbd/internal.h \
+	librbd/LibrbdWriteback.h librbd/ObjectMap.h \
+	librbd/parent_types.h librbd/RebuildObjectMapRequest.h \
+	librbd/SnapInfo.h librbd/TaskFinisher.h \
+	librbd/WatchNotifyTypes.h rgw/rgw_acl.h rgw/rgw_acl_s3.h \
+	rgw/rgw_acl_swift.h rgw/rgw_client_io.h rgw/rgw_fcgi.h \
+	rgw/rgw_xml.h rgw/rgw_cache.h rgw/rgw_common.h rgw/rgw_cors.h \
+	rgw/rgw_cors_s3.h rgw/rgw_cors_swift.h rgw/rgw_string.h \
+	rgw/rgw_formats.h rgw/rgw_http_errors.h rgw/rgw_log.h \
+	rgw/rgw_loadgen.h rgw/rgw_multi.h rgw/rgw_policy_s3.h \
+	rgw/rgw_gc.h rgw/rgw_metadata.h rgw/rgw_multi_del.h \
+	rgw/rgw_object_expirer_core.h rgw/rgw_op.h rgw/rgw_orphan.h \
 	rgw/rgw_http_client.h rgw/rgw_swift.h rgw/rgw_swift_auth.h \
 	rgw/rgw_quota.h rgw/rgw_rados.h rgw/rgw_replica_log.h \
 	rgw/rgw_resolve.h rgw/rgw_rest.h rgw/rgw_rest_swift.h \
@@ -6113,8 +6950,8 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	civetweb/include/civetweb.h civetweb/include/civetweb_conf.h \
 	civetweb/src/md5.h cls/lock/cls_lock_types.h \
 	cls/lock/cls_lock_ops.h cls/lock/cls_lock_client.h \
-	cls/rbd/cls_rbd.h cls/rbd/cls_rbd_client.h \
-	cls/refcount/cls_refcount_ops.h \
+	cls/numops/cls_numops_client.h cls/rbd/cls_rbd.h \
+	cls/rbd/cls_rbd_client.h cls/refcount/cls_refcount_ops.h \
 	cls/refcount/cls_refcount_client.h \
 	cls/version/cls_version_types.h cls/version/cls_version_ops.h \
 	cls/version/cls_version_client.h cls/log/cls_log_types.h \
@@ -6122,35 +6959,46 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	cls/statelog/cls_statelog_types.h \
 	cls/statelog/cls_statelog_ops.h \
 	cls/statelog/cls_statelog_client.h \
+	cls/timeindex/cls_timeindex_types.h \
+	cls/timeindex/cls_timeindex_ops.h \
+	cls/timeindex/cls_timeindex_client.h \
 	cls/replica_log/cls_replica_log_types.h \
 	cls/replica_log/cls_replica_log_ops.h \
 	cls/replica_log/cls_replica_log_client.h \
 	cls/rgw/cls_rgw_client.h cls/rgw/cls_rgw_ops.h \
 	cls/rgw/cls_rgw_types.h cls/user/cls_user_client.h \
 	cls/user/cls_user_ops.h cls/user/cls_user_types.h \
+	cls/cephfs/cls_cephfs.h cls/cephfs/cls_cephfs_client.h \
 	key_value_store/key_value_structure.h \
 	key_value_store/kv_flat_btree_async.h \
-	key_value_store/kvs_arg_types.h rbd_replay/BoundedBuffer.hpp \
-	rbd_replay/actions.hpp rbd_replay/Deser.hpp \
-	rbd_replay/ImageNameMap.hpp rbd_replay/ios.hpp \
-	rbd_replay/PendingIO.hpp rbd_replay/rbd_loc.hpp \
-	rbd_replay/rbd_replay_debug.hpp rbd_replay/Replayer.hpp \
-	rbd_replay/Ser.hpp \
+	key_value_store/kvs_arg_types.h rbd_replay/ActionTypes.h \
+	rbd_replay/actions.hpp rbd_replay/BoundedBuffer.hpp \
+	rbd_replay/BufferReader.h rbd_replay/ImageNameMap.hpp \
+	rbd_replay/ios.hpp rbd_replay/PendingIO.hpp \
+	rbd_replay/rbd_loc.hpp rbd_replay/rbd_replay_debug.hpp \
+	rbd_replay/Replayer.hpp \
 	test/erasure-code/ceph_erasure_code_benchmark.h \
 	test/erasure-code/ErasureCodeExample.h \
 	test/messenger/message_helper.h \
 	test/messenger/simple_dispatcher.h \
 	test/messenger/xio_dispatcher.h \
+	test/librados_test_stub/LibradosTestStub.h \
 	test/librados_test_stub/TestClassHandler.h \
 	test/librados_test_stub/TestRadosClient.h \
 	test/librados_test_stub/TestMemRadosClient.h \
 	test/librados_test_stub/TestWatchNotify.h \
 	test/librados_test_stub/TestMemIoCtxImpl.h \
-	test/librados_test_stub/TestIoCtxImpl.h test/bench/backend.h \
-	test/bench/bencher.h test/bench/detailed_stat_collector.h \
-	test/bench/distribution.h test/bench/dumb_backend.h \
-	test/bench/rados_backend.h test/bench/rbd_backend.h \
-	test/bench/stat_collector.h test/bench/testfilestore_backend.h \
+	test/librados_test_stub/TestIoCtxImpl.h \
+	test/librbd/test_fixture.h test/librbd/test_mock_fixture.h \
+	test/librbd/test_support.h test/librbd/mock/MockContextWQ.h \
+	test/librbd/mock/MockImageCtx.h \
+	test/librbd/mock/MockImageWatcher.h \
+	test/librbd/mock/MockObjectMap.h test/perf_helper.h \
+	test/bench/backend.h test/bench/bencher.h \
+	test/bench/detailed_stat_collector.h test/bench/distribution.h \
+	test/bench/dumb_backend.h test/bench/rados_backend.h \
+	test/bench/rbd_backend.h test/bench/stat_collector.h \
+	test/bench/testfilestore_backend.h \
 	test/common/ObjectContents.h test/encoding/types.h \
 	test/objectstore/DeterministicOpSequence.h \
 	test/objectstore/FileStoreDiff.h \
@@ -6158,9 +7006,9 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	test/objectstore/TestObjectStoreState.h \
 	test/objectstore/workload_generator.h test/kv_store_bench.h \
 	test/librados/test.h test/librados/TestCase.h \
-	test/libradosstriper/TestCase.h test/librbd/test_fixture.h \
-	test/librbd/test_support.h test/ObjectMap/KeyValueDBMemory.h \
-	test/omap_bench.h test/osdc/FakeWriteback.h test/osd/Object.h \
+	test/libradosstriper/TestCase.h \
+	test/ObjectMap/KeyValueDBMemory.h test/omap_bench.h \
+	test/osdc/FakeWriteback.h test/osd/Object.h \
 	test/osd/RadosModel.h test/osd/TestOpStat.h \
 	test/system/cross_process_sem.h \
 	test/system/st_rados_create_pool.h \
@@ -6173,8 +7021,12 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	tools/cephfs/JournalScanner.h tools/cephfs/JournalFilter.h \
 	tools/cephfs/EventOutput.h tools/cephfs/Resetter.h \
 	tools/cephfs/Dumper.h tools/cephfs/TableTool.h \
-	tools/cephfs/MDSUtility.h tools/rados/rados_sync.h cls_acl.cc \
-	cls_crypto.cc fetch_config logrotate.conf sample.ceph.conf \
+	tools/cephfs/MDSUtility.h tools/RadosDump.h \
+	tools/rados/RadosImport.h tools/ceph_objectstore_tool.h \
+	tools/rados/PoolDump.h tools/cephfs/DataScan.h \
+	compressor/Compressor.h compressor/AsyncCompressor.h \
+	compressor/SnappyCompressor.h cls_acl.cc cls_crypto.cc \
+	fetch_config logrotate.conf sample.ceph.conf \
 	bash_completion/ceph bash_completion/rados bash_completion/rbd \
 	bash_completion/radosgw-admin mount/canonicalize.c \
 	mount/mtab.c objclass/objclass.h
@@ -6412,10 +7264,12 @@ am__relativize = \
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
 AM_CXXFLAGS = @AM_CXXFLAGS@ $(AM_COMMON_CFLAGS) -ftemplate-depth-1024 \
-	-Wnon-virtual-dtor -Wno-invalid-offsetof $(am__append_3) \
-	$(am__append_6) $(am__append_68) $(am__append_71)
+	-Wnon-virtual-dtor -Wno-invalid-offsetof $(HARDENING_CFLAGS) \
+	$(am__append_3) $(am__append_6) $(am__append_75) \
+	$(am__append_78)
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
 ARM_FLAGS = @ARM_FLAGS@
 ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
 AUTOCONF = @AUTOCONF@
@@ -6423,6 +7277,7 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = ${srcdir}/yasm-wrapper
@@ -6480,7 +7335,8 @@ LD = @LD@
 LDFLAGS = @LDFLAGS@
 LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
 LIBEDIT_LIBS = @LIBEDIT_LIBS@
-LIBFUSE = @LIBFUSE@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
 LIBJEMALLOC = @LIBJEMALLOC@
 LIBOBJS = @LIBOBJS@
 LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
@@ -6531,6 +7387,7 @@ RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
 STRIP = @STRIP@
 VERSION = @VERSION@
 WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
@@ -6564,6 +7421,7 @@ datarootdir = @datarootdir@
 docdir = @docdir@
 dvidir = @dvidir@
 exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
 host = @host@
 host_alias = @host_alias@
 host_cpu = @host_cpu@
@@ -6593,6 +7451,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 subdirs = @subdirs@
 sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
 target = @target@
 target_alias = @target_alias@
 target_cpu = @target_cpu@
@@ -6601,13 +7461,46 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
 AUTOMAKE_OPTIONS = gnu subdir-objects
-SUBDIRS = ocf java tracing $(am__append_199) $(am__append_200)
-DIST_SUBDIRS = gmock ocf libs3 java tracing
-BUILT_SOURCES = $(am__append_220)
+SUBDIRS = ocf java $(am__append_209)
+DIST_SUBDIRS = gmock ocf java
+BUILT_SOURCES = $(am__append_212) $(am__append_233)
 
 # extra bits
-EXTRA_DIST = $(am__append_21) \
+EXTRA_DIST = $(am__append_21) ceph-detect-init/AUTHORS.rst \
+	ceph-detect-init/ceph_detect_init/centos/__init__.py \
+	ceph-detect-init/ceph_detect_init/exc.py \
+	ceph-detect-init/ceph_detect_init/main.py \
+	ceph-detect-init/ceph_detect_init/__init__.py \
+	ceph-detect-init/ceph_detect_init/rhel/__init__.py \
+	ceph-detect-init/ceph_detect_init/fedora/__init__.py \
+	ceph-detect-init/ceph_detect_init/debian/__init__.py \
+	ceph-detect-init/ceph_detect_init/suse/__init__.py \
+	ceph-detect-init/integration/centos-6.dockerfile \
+	ceph-detect-init/integration/debian-wheezy.dockerfile \
+	ceph-detect-init/integration/debian-sid.dockerfile \
+	ceph-detect-init/integration/debian-jessie.dockerfile \
+	ceph-detect-init/integration/opensuse-13.1.dockerfile \
+	ceph-detect-init/integration/fedora-21.dockerfile \
+	ceph-detect-init/integration/ubuntu-14.04.dockerfile \
+	ceph-detect-init/integration/test_main.py \
+	ceph-detect-init/integration/opensuse-13.2.dockerfile \
+	ceph-detect-init/integration/ubuntu-12.04.dockerfile \
+	ceph-detect-init/integration/centos-7.dockerfile \
+	ceph-detect-init/integration/ubuntu-15.04.dockerfile \
+	ceph-detect-init/integration/debian-squeeze.dockerfile \
+	ceph-detect-init/Makefile.am ceph-detect-init/MANIFEST.in \
+	ceph-detect-init/README.rst ceph-detect-init/requirements.txt \
+	ceph-detect-init/run-tox.sh ceph-detect-init/setup.py \
+	ceph-detect-init/test-requirements.txt \
+	ceph-detect-init/tests/test_all.py ceph-detect-init/tox.ini \
+	$(srcdir)/test/python/ceph-disk/setup.py \
+	$(srcdir)/test/python/ceph-disk/tox.ini \
+	$(srcdir)/test/python/ceph-disk/tests/test_ceph_disk.py \
+	$(srcdir)/test/python/brag-client/setup.py \
+	$(srcdir)/test/python/brag-client/tox.ini \
+	$(srcdir)/test/python/brag-client/tests/test_ceph_brag.py \
 	$(srcdir)/test/debian-jessie/Dockerfile.in \
 	$(srcdir)/test/debian-jessie/install-deps.sh \
 	$(srcdir)/test/debian-jessie/debian \
@@ -6617,26 +7510,29 @@ EXTRA_DIST = $(am__append_21) \
 	$(srcdir)/test/ubuntu-14.04/Dockerfile.in \
 	$(srcdir)/test/ubuntu-14.04/install-deps.sh \
 	$(srcdir)/test/ubuntu-14.04/debian \
+	$(srcdir)/test/fedora-21/Dockerfile.in \
+	$(srcdir)/test/fedora-21/install-deps.sh \
+	$(srcdir)/test/fedora-21/ceph.spec.in \
 	$(srcdir)/test/centos-6/Dockerfile.in \
 	$(srcdir)/test/centos-6/install-deps.sh \
 	$(srcdir)/test/centos-6/ceph.spec.in \
 	$(srcdir)/test/centos-7/Dockerfile.in \
 	$(srcdir)/test/centos-7/install-deps.sh \
 	$(srcdir)/test/centos-7/ceph.spec.in \
-	$(srcdir)/test/mon/mon-test-helpers.sh \
-	$(srcdir)/test/osd/osd-test-helpers.sh \
+	$(srcdir)/test/opensuse-13.2/Dockerfile.in \
+	$(srcdir)/test/opensuse-13.2/install-deps.sh \
+	$(srcdir)/test/opensuse-13.2/ceph.spec.in \
 	$(srcdir)/test/coverage.sh $(patsubst \
-	%,$(srcdir)/%,$(check_SCRIPTS)) \
-	$(srcdir)/test/container-make-check-ubuntu-14.04.sh \
-	$(am__append_201) $(srcdir)/$(shell_scripts:%=%.in) \
-	$(srcdir)/verify-mds-journal.sh $(srcdir)/vstart.sh \
-	$(srcdir)/stop.sh ceph-run $(srcdir)/ceph-osd-prestart.sh \
-	$(srcdir)/ceph_common.sh $(srcdir)/init-radosgw \
-	$(srcdir)/init-rbdmap $(srcdir)/ceph-clsinfo \
-	$(srcdir)/make_version $(srcdir)/check_version \
+	%,$(srcdir)/%,$(check_SCRIPTS)) $(am__append_210) \
+	tracing/tracing-common.h $(srcdir)/$(shell_scripts:%=%.in) \
+	$(srcdir)/vstart.sh $(srcdir)/stop.sh ceph-run \
+	$(srcdir)/ceph-osd-prestart.sh $(srcdir)/ceph_common.sh \
+	$(srcdir)/init-radosgw $(srcdir)/init-rbdmap \
+	$(srcdir)/ceph-clsinfo $(srcdir)/make_version \
 	$(srcdir)/.git_version $(srcdir)/ceph-rbdnamer \
-	$(srcdir)/test/encoding/readable.sh \
+	$(srcdir)/tools/ceph-monstore-update-crush.sh \
 	$(srcdir)/upstart/ceph-all.conf \
+	$(srcdir)/upstart/ceph-disk.conf \
 	$(srcdir)/upstart/ceph-mon.conf \
 	$(srcdir)/upstart/ceph-mon-all.conf \
 	$(srcdir)/upstart/ceph-mon-all-starter.conf \
@@ -6650,16 +7546,15 @@ EXTRA_DIST = $(am__append_21) \
 	$(srcdir)/upstart/radosgw.conf \
 	$(srcdir)/upstart/radosgw-all.conf \
 	$(srcdir)/upstart/radosgw-all-starter.conf \
-	$(srcdir)/upstart/rbdmap.conf ceph.in ceph-disk \
-	ceph-disk-prepare ceph-disk-activate ceph-disk-udev \
+	$(srcdir)/upstart/rbdmap.conf ceph.in ceph-disk ceph-disk-udev \
 	ceph-create-keys ceph-rest-api ceph-crush-location \
 	mount.fuse.ceph rbd-replay-many rbdmap yasm-wrapper \
-	libs3/COPYING libs3/ChangeLog libs3/GNUmakefile \
-	libs3/GNUmakefile.mingw libs3/GNUmakefile.osx libs3/INSTALL \
-	libs3/LICENSE libs3/README libs3/TODO libs3/archlinux \
-	libs3/debian libs3/doxyfile libs3/inc libs3/libs3.spec \
-	libs3/mswin libs3/src libs3/test unittest_bufferlist.sh
-CLEANFILES = $(shell_scripts) ceph_ver.h sample.fetch_config
+	unittest_bufferlist.sh
+CLEANFILES = $(BUILT_SOURCES) $(shell_scripts) ceph_ver.h \
+	sample.fetch_config
+dist_noinst_DATA = tracing/librados.tp tracing/librbd.tp \
+	tracing/oprequest.tp tracing/osd.tp tracing/pg.tp \
+	tracing/objectstore.tp
 
 # jerasure plugin
 
@@ -6687,11 +7582,11 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	auth/RotatingKeyRing.h auth/Crypto.h crush/CrushCompiler.h \
 	crush/CrushTester.h crush/CrushTreeDumper.h \
 	crush/CrushWrapper.h crush/CrushWrapper.i crush/builder.h \
-	crush/crush.h crush/grammar.h crush/hash.h \
-	crush/crush_ln_table.h crush/mapper.h crush/sample.txt \
+	crush/crush.h crush/crush_compat.h crush/crush_ln_table.h \
+	crush/grammar.h crush/hash.h crush/mapper.h crush/sample.txt \
 	crush/types.h $(am__append_23) $(am__append_27) \
-	$(am__append_33) $(am__append_35) $(am__append_37) \
-	$(am__append_39) $(am__append_43) $(am__append_47) \
+	$(am__append_34) $(am__append_36) $(am__append_38) \
+	$(am__append_40) $(am__append_44) $(am__append_47) \
 	erasure-code/jerasure/gf-complete/include/gf_complete.h \
 	erasure-code/jerasure/gf-complete/include/gf_general.h \
 	erasure-code/jerasure/gf-complete/include/gf_int.h \
@@ -6711,7 +7606,6 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	erasure-code/lrc/ErasureCodeLrc.h \
 	erasure-code/shec/ErasureCodeShec.h \
 	erasure-code/shec/ErasureCodeShecTableCache.h \
-	erasure-code/shec/shec.h \
 	erasure-code/jerasure/jerasure/include/cauchy.h \
 	erasure-code/jerasure/jerasure/include/galois.h \
 	erasure-code/jerasure/jerasure/include/jerasure.h \
@@ -6722,11 +7616,11 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	erasure-code/jerasure/gf-complete/include/gf_rand.h \
 	erasure-code/jerasure/gf-complete/include/gf_method.h \
 	erasure-code/jerasure/gf-complete/include/gf_general.h \
-	$(am__append_58) erasure-code/ErasureCode.h \
+	$(am__append_65) erasure-code/ErasureCode.h \
 	erasure-code/ErasureCodeInterface.h \
 	erasure-code/ErasureCodePlugin.h osdc/Filer.h osdc/Journaler.h \
 	osdc/ObjectCacher.h osdc/Objecter.h osdc/Striper.h \
-	osdc/WritebackHandler.h $(am__append_62) $(am__append_64) \
+	osdc/WritebackHandler.h $(am__append_69) $(am__append_71) \
 	global/pidfile.h global/global_init.h global/global_context.h \
 	global/signal_handler.h json_spirit/json_spirit.h \
 	json_spirit/json_spirit_error_position.h \
@@ -6742,9 +7636,10 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	perfglue/cpu_profiler.h perfglue/heap_profiler.h \
 	common/bloom_filter.hpp common/sctp_crc32.h \
 	common/crc32c_intel_baseline.h common/crc32c_intel_fast.h \
-	common/BackTrace.h common/RefCountedObj.h \
-	common/HeartbeatMap.h common/LogClient.h common/LogEntry.h \
-	common/Preforker.h common/SloppyCRCMap.h common/WorkQueue.h \
+	common/crc32c_aarch64.h common/BackTrace.h \
+	common/RefCountedObj.h common/HeartbeatMap.h \
+	common/LogClient.h common/LogEntry.h common/Preforker.h \
+	common/SloppyCRCMap.h common/WorkQueue.h \
 	common/PrioritizedQueue.h common/ceph_argparse.h \
 	common/ceph_context.h common/xattr.h common/blkdev.h \
 	common/compiler_extensions.h common/debug.h common/dout.h \
@@ -6774,7 +7669,8 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	common/linux_version.h common/module.h common/Continuation.h \
 	common/Readahead.h common/Cycles.h common/Initialize.h \
 	common/ContextCompletion.h common/bit_vector.hpp \
-	common/valgrind.h $(am__append_79) common/secret.h \
+	common/SubProcess.h common/valgrind.h \
+	common/TracepointProvider.h $(am__append_87) common/secret.h \
 	msg/Connection.h msg/Dispatcher.h msg/Message.h \
 	msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
 	msg/simple/Accepter.h msg/simple/DispatchQueue.h \
@@ -6782,7 +7678,7 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
 	msg/async/AsyncMessenger.h msg/async/Event.h \
 	msg/async/EventEpoll.h msg/async/EventSelect.h \
-	msg/async/net_handler.h $(am__append_88) messages/MAuth.h \
+	msg/async/net_handler.h $(am__append_96) messages/MAuth.h \
 	messages/MAuthReply.h messages/MCacheExpire.h \
 	messages/MClientCaps.h messages/MClientCapRelease.h \
 	messages/MClientLease.h messages/MClientReconnect.h \
@@ -6814,7 +7710,8 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	messages/MMonGetOSDMap.h messages/MMonGetVersion.h \
 	messages/MMonGetVersionReply.h messages/MMonGlobalID.h \
 	messages/MMonHealth.h messages/MMonJoin.h messages/MMonMap.h \
-	messages/MMonPaxos.h messages/MMonProbe.h messages/MMonScrub.h \
+	messages/MMonMetadata.h messages/MMonPaxos.h \
+	messages/MMonProbe.h messages/MMonScrub.h \
 	messages/MMonSubscribe.h messages/MMonSubscribeAck.h \
 	messages/MMonSync.h messages/MOSDAlive.h messages/MOSDBoot.h \
 	messages/MOSDFailure.h messages/MOSDMarkMeDown.h \
@@ -6846,34 +7743,37 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	include/cephfs/libcephfs.h include/ceph_features.h \
 	include/ceph_frag.h include/ceph_fs.h include/ceph_hash.h \
 	include/cmp.h include/color.h include/compat.h \
-	include/crc32c.h include/encoding.h include/err.h \
-	include/error.h include/filepath.h include/frag.h \
-	include/hash.h include/intarith.h include/interval_set.h \
-	include/int_types.h include/ipaddr.h include/krbd.h \
-	include/linux_fiemap.h include/lru.h include/msgr.h \
-	include/object.h include/page.h include/rangeset.h \
-	include/rados.h include/rbd_types.h include/statlite.h \
-	include/str_list.h include/str_map.h include/stringify.h \
-	include/types.h include/utime.h include/elist.h include/uuid.h \
-	include/xlist.h include/rados/librados.h \
-	include/rados/rados_types.h include/rados/rados_types.hpp \
-	include/rados/librados.hpp include/rados/librgw.h \
-	include/rados/page.h include/rados/crc32c.h \
-	include/rados/buffer.h include/radosstriper/libradosstriper.h \
+	include/sock_compat.h include/crc32c.h include/encoding.h \
+	include/err.h include/error.h include/filepath.h \
+	include/frag.h include/hash.h include/inline_memory.h \
+	include/intarith.h include/interval_set.h include/int_types.h \
+	include/ipaddr.h include/krbd.h include/linux_fiemap.h \
+	include/lru.h include/msgr.h include/object.h include/page.h \
+	include/rangeset.h include/rados.h include/rbd_types.h \
+	include/statlite.h include/str_list.h include/str_map.h \
+	include/stringify.h include/types.h include/utime.h \
+	include/elist.h include/uuid.h include/xlist.h \
+	include/compact_map.h include/compact_set.h \
+	include/rados/librados.h include/rados/rados_types.h \
+	include/rados/rados_types.hpp include/rados/librados.hpp \
+	include/rados/librgw.h include/rados/page.h \
+	include/rados/crc32c.h include/rados/buffer.h \
+	include/radosstriper/libradosstriper.h \
 	include/radosstriper/libradosstriper.hpp \
 	include/rbd/features.h include/rbd/librbd.h \
 	include/rbd/librbd.hpp include/rbd/object_map_types.h \
 	include/util.h include/stat.h include/on_exit.h \
 	include/memory.h include/rados/memory.h \
-	include/hash_namespace.h include/unordered_set.h \
-	include/unordered_map.h $(am__append_95) $(am__append_98) \
-	$(am__append_103) $(am__append_109) $(am__append_112) \
-	$(am__append_115) $(am__append_117) $(am__append_121) \
-	$(am__append_141) $(am__append_157) test/bench/backend.h \
-	test/bench/bencher.h test/bench/detailed_stat_collector.h \
-	test/bench/distribution.h test/bench/dumb_backend.h \
-	test/bench/rados_backend.h test/bench/rbd_backend.h \
-	test/bench/stat_collector.h test/bench/testfilestore_backend.h \
+	include/unordered_set.h include/unordered_map.h \
+	include/timegm.h $(am__append_102) $(am__append_105) \
+	$(am__append_109) $(am__append_115) $(am__append_118) \
+	$(am__append_121) $(am__append_122) $(am__append_128) \
+	$(am__append_150) $(am__append_166) $(am__append_172) \
+	$(am__append_184) test/bench/backend.h test/bench/bencher.h \
+	test/bench/detailed_stat_collector.h test/bench/distribution.h \
+	test/bench/dumb_backend.h test/bench/rados_backend.h \
+	test/bench/rbd_backend.h test/bench/stat_collector.h \
+	test/bench/testfilestore_backend.h \
 	test/common/ObjectContents.h test/encoding/types.h \
 	test/objectstore/DeterministicOpSequence.h \
 	test/objectstore/FileStoreDiff.h \
@@ -6881,9 +7781,9 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	test/objectstore/TestObjectStoreState.h \
 	test/objectstore/workload_generator.h test/kv_store_bench.h \
 	test/librados/test.h test/librados/TestCase.h \
-	test/libradosstriper/TestCase.h test/librbd/test_fixture.h \
-	test/librbd/test_support.h test/ObjectMap/KeyValueDBMemory.h \
-	test/omap_bench.h test/osdc/FakeWriteback.h test/osd/Object.h \
+	test/libradosstriper/TestCase.h \
+	test/ObjectMap/KeyValueDBMemory.h test/omap_bench.h \
+	test/osdc/FakeWriteback.h test/osd/Object.h \
 	test/osd/RadosModel.h test/osd/TestOpStat.h \
 	test/system/cross_process_sem.h \
 	test/system/st_rados_create_pool.h \
@@ -6896,46 +7796,51 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	tools/cephfs/JournalScanner.h tools/cephfs/JournalFilter.h \
 	tools/cephfs/EventOutput.h tools/cephfs/Resetter.h \
 	tools/cephfs/Dumper.h tools/cephfs/TableTool.h \
-	tools/cephfs/MDSUtility.h tools/rados/rados_sync.h cls_acl.cc \
-	cls_crypto.cc fetch_config logrotate.conf sample.ceph.conf \
+	tools/cephfs/MDSUtility.h tools/RadosDump.h \
+	tools/rados/RadosImport.h tools/ceph_objectstore_tool.h \
+	tools/rados/PoolDump.h tools/cephfs/DataScan.h \
+	compressor/Compressor.h compressor/AsyncCompressor.h \
+	compressor/SnappyCompressor.h cls_acl.cc cls_crypto.cc \
+	fetch_config logrotate.conf sample.ceph.conf \
 	bash_completion/ceph bash_completion/rados bash_completion/rbd \
 	bash_completion/radosgw-admin mount/canonicalize.c \
 	mount/mtab.c objclass/objclass.h
-bin_SCRIPTS = $(am__append_20) $(am__append_208) $(am__append_217) \
-	$(am__append_225)
+bin_SCRIPTS = $(am__append_20) $(am__append_219) $(am__append_230) \
+	$(am__append_238)
 sbin_SCRIPTS = 
-su_sbin_SCRIPTS = $(am__append_222)
+su_sbin_SCRIPTS = $(am__append_235)
 dist_bin_SCRIPTS = 
-lib_LTLIBRARIES = $(am__append_94) $(am__append_97) $(am__append_102) \
-	$(am__append_215) $(am__append_216)
+lib_LTLIBRARIES = $(am__append_101) $(am__append_104) \
+	$(am__append_108) $(am__append_211) $(am__append_228) \
+	$(am__append_229)
 noinst_LTLIBRARIES = libarch.la libauth.la libcrush.la libmon_types.la \
 	$(am__append_22) $(am__append_26) libos_types.la \
-	$(am__append_32) $(am__append_34) $(am__append_36) \
+	$(am__append_33) $(am__append_35) $(am__append_37) \
 	libosd_types.la $(am__append_46) liberasure_code.la libosdc.la \
-	$(am__append_61) $(am__append_63) libglobal.la \
+	$(am__append_68) $(am__append_70) libglobal.la \
 	libjson_spirit.la liblog.la libperfglue.la \
-	libcommon_internal.la libcommon_api.la libcommon_crc.la \
-	libcommon.la $(am__append_80) libmsg.la $(am__append_89) \
-	librbd_types.la $(am__append_99) $(am__append_105) \
-	$(am__append_110) $(am__append_116) $(am__append_150) \
-	$(am__append_160) $(am__append_165) $(am__append_186) \
-	$(am__append_210)
-noinst_LIBRARIES = $(am__append_38) $(am__append_111)
-radoslib_LTLIBRARIES = $(am__append_113) $(am__append_114)
+	libcommon_internal.la libcommon_crc.la $(am__append_85) \
+	libcommon.la $(am__append_88) libmsg.la $(am__append_97) \
+	librbd_types.la $(am__append_106) $(am__append_111) \
+	$(am__append_116) $(am__append_123) $(am__append_159) \
+	$(am__append_169) $(am__append_174) $(am__append_200) \
+	libcompressor.la $(am__append_221)
+noinst_LIBRARIES = $(am__append_39) $(am__append_117)
+radoslib_LTLIBRARIES = $(am__append_119) $(am__append_120)
 
 # like bin_PROGRAMS, but these targets are only built for debug builds
-bin_DEBUGPROGRAMS = $(am__append_65) $(am__append_108) \
-	$(am__append_123) $(am__append_151) $(am__append_152) \
-	$(am__append_153) $(am__append_154) $(am__append_156) \
-	$(am__append_158) $(am__append_164) $(am__append_166) \
-	$(am__append_167) $(am__append_169) $(am__append_170) \
-	$(am__append_171) $(am__append_172) $(am__append_173) \
-	$(am__append_174) $(am__append_175) $(am__append_176) \
-	$(am__append_182) ceph_test_timers ceph_test_signal_handlers \
-	ceph_test_rewrite_latency ceph_test_crypto $(am__append_185) \
+bin_DEBUGPROGRAMS = $(am__append_72) $(am__append_114) \
+	$(am__append_130) $(am__append_160) $(am__append_161) \
+	$(am__append_162) $(am__append_163) $(am__append_165) \
+	$(am__append_167) $(am__append_173) $(am__append_175) \
+	$(am__append_176) $(am__append_179) $(am__append_181) \
+	$(am__append_182) $(am__append_183) $(am__append_185) \
+	$(am__append_186) $(am__append_187) $(am__append_188) \
+	$(am__append_194) ceph_test_timers ceph_test_signal_handlers \
+	ceph_test_rewrite_latency ceph_test_crypto $(am__append_199) \
 	ceph_bench_log ceph_test_objectcacher_stress \
 	ceph_test_cfuse_cache_invalidate ceph_test_get_blkdev_size \
-	$(am__append_189) $(am__append_191) $(am__append_192) \
+	$(am__append_202) $(am__append_204) $(am__append_205) \
 	ceph_psim
 
 # like sbin_SCRIPTS but can be used to install to e.g. /usr/sbin
@@ -6944,20 +7849,90 @@ ceph_sbindir = $(sbindir)
 # certain things go straight into /sbin, though!
 su_sbindir = /sbin
 
+# C/C++ tests to build and executed will be appended to this
+check_TESTPROGRAMS = $(am__append_136) $(am__append_140) \
+	$(am__append_143) $(am__append_164) $(am__append_168) \
+	$(am__append_177) $(am__append_190) $(am__append_191) \
+	$(am__append_195) $(am__append_196) $(am__append_197) \
+	$(am__append_198) unittest_addrs $(am__append_201) \
+	unittest_bloom_filter unittest_histogram \
+	unittest_prioritized_queue unittest_str_map \
+	unittest_sharedptr_registry unittest_shared_cache \
+	unittest_sloppy_crc_map unittest_util unittest_crush_wrapper \
+	unittest_crush unittest_osdmap unittest_workqueue \
+	unittest_striper unittest_prebufferedstreambuf \
+	unittest_str_list unittest_log unittest_throttle \
+	unittest_ceph_argparse unittest_ceph_compatset \
+	unittest_mds_types unittest_osd_types unittest_lru \
+	unittest_io_priority unittest_gather unittest_signals \
+	unittest_bufferlist unittest_xlist unittest_crc32c \
+	unittest_arch unittest_crypto unittest_crypto_init \
+	unittest_perf_counters unittest_admin_socket \
+	unittest_ceph_crypto unittest_utf8 unittest_mime \
+	unittest_escape unittest_strtol unittest_confutils \
+	unittest_config unittest_context unittest_safe_io \
+	unittest_heartbeatmap unittest_formatter \
+	unittest_daemon_config unittest_ipaddr unittest_texttable \
+	unittest_on_exit unittest_readahead unittest_tableformatter \
+	unittest_bit_vector
+
 # tests scripts will be appended to this
-check_SCRIPTS = $(am__append_120) test/ceph_objectstore_tool.py \
+
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Owen Synge <osynge at suse.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see `<http://www.gnu.org/licenses/>`.
+#
+check_SCRIPTS = ceph-detect-init/run-tox.sh $(am__append_127) \
+	$(am__append_171) test/ceph_objectstore_tool.py \
 	test/test-ceph-helpers.sh test/cephtool-test-osd.sh \
 	test/cephtool-test-mon.sh test/cephtool-test-mds.sh \
-	unittest_bufferlist.sh test/encoding/check-generated.sh \
-	test/mon/osd-pool-create.sh test/mon/misc.sh \
-	test/mon/osd-crush.sh test/mon/osd-erasure-code-profile.sh \
-	test/mon/mkfs.sh test/osd/osd-scrub-repair.sh \
+	test/cephtool-test-rados.sh unittest_bufferlist.sh \
+	test/encoding/check-generated.sh test/mon/osd-pool-create.sh \
+	test/mon/misc.sh test/mon/osd-crush.sh test/mon/mon-ping.sh \
+	test/mon/osd-erasure-code-profile.sh test/mon/mkfs.sh \
+	test/mon/mon-scrub.sh test/osd/osd-scrub-repair.sh \
 	test/osd/osd-config.sh test/osd/osd-bench.sh \
 	test/osd/osd-copy-from.sh test/mon/mon-handle-forward.sh \
-	$(am__append_187) $(am__append_188) \
-	test/pybind/test_ceph_argparse.py
+	test/libradosstriper/rados-striper.sh \
+	test/test_objectstore_memstore.sh test/ceph-disk.sh \
+	test/pybind/test_ceph_argparse.py \
+	test/pybind/test_ceph_daemon.py \
+	../qa/workunits/erasure-code/encode-decode-non-regression.sh \
+	test/encoding/readable.sh
 
 ##################################
+HARDENING_CFLAGS = \
+                   -O2 \
+                   -g \
+                   -pipe \
+                   -Wall \
+                   -Wp,-D_FORTIFY_SOURCE=2 \
+                   -fexceptions \
+                   --param=ssp-buffer-size=4 \
+                   -fPIE
+
+SET_STACK_PROTECTOR_STRONG = $(shell expr `gcc -dumpversion` \>= 4.9)
+HARDENING_LDFLAGS = \
+                     -pie \
+                     -Wl,-z,relro \
+                     -Wl,-z,now
+
 AM_COMMON_CPPFLAGS = \
 	-D__CEPH__ \
 	-D_FILE_OFFSET_BITS=64 \
@@ -6967,7 +7942,7 @@ AM_COMMON_CPPFLAGS = \
 	-D_GNU_SOURCE \
 	-DCEPH_LIBDIR=\"${libdir}\" \
 	-DCEPH_PKGLIBDIR=\"${pkglibdir}\" \
-	-DGTEST_HAS_TR1_TUPLE=0
+	-DGTEST_USE_OWN_TR1_TUPLE=0
 
 AM_COMMON_CFLAGS = \
 	-Wall \
@@ -6979,8 +7954,8 @@ AM_COMMON_CFLAGS = \
 	-fno-strict-aliasing \
 	-fsigned-char
 
-AM_CFLAGS = $(AM_COMMON_CFLAGS) $(am__append_5) $(am__append_67) \
-	$(am__append_70)
+AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS) $(am__append_5) \
+	$(am__append_74) $(am__append_77)
 AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
 
 # note: this is position dependant, it affects the -l options that
@@ -6995,7 +7970,7 @@ AM_LDFLAGS = $(am__append_2) $(am__append_4)
 AM_CCASFLAGS = -f elf64
 
 #####################
-EXTRALIBS = -luuid -lm $(am__append_7) $(am__append_8) $(am__append_9) \
+EXTRALIBS = -lm $(am__append_7) $(am__append_8) $(am__append_9) \
 	$(am__append_19)
 LIBGLOBAL = libglobal.la
 LIBCOMMON = libcommon.la
@@ -7005,6 +7980,7 @@ LIBPERFGLUE = libperfglue.la $(am__append_13) $(am__append_14)
 LIBAUTH = libauth.la
 LIBMSG = libmsg.la
 LIBCRUSH = libcrush.la
+LIBCOMPRESSOR = libcompressor.la -lsnappy
 LIBJSON_SPIRIT = libjson_spirit.la
 LIBLOG = liblog.la
 
@@ -7033,10 +8009,6 @@ LIBRBD_TYPES = librbd_types.la
 LIBKRBD = libkrbd.la
 LIBCEPHFS = libcephfs.la
 LIBERASURE_CODE = liberasure_code.la
-LIBOSD_TP = tracing/libosd_tp.la
-LIBRADOS_TP = tracing/librados_tp.la
-LIBRBD_TP = tracing/librbd_tp.la
-LIBOS_TP = tracing/libos_tp.la
 
 # Use this for binaries requiring libglobal
 CEPH_GLOBAL = $(LIBGLOBAL) $(LIBCOMMON) $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
@@ -7045,17 +8017,18 @@ CEPH_GLOBAL = $(LIBGLOBAL) $(LIBCOMMON) $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXT
 
 # important; libmsg before libauth!
 LIBCOMMON_DEPS = libcommon_internal.la libcommon_crc.la \
-	$(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) $(LIBCRUSH) \
-	$(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) $(am__append_78)
-LIBRADOS_DEPS = $(am__append_90) $(am__append_91)
-LIBRGW_DEPS = $(am__append_106)
+	$(am__append_84) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
+	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
+	$(BOOST_RANDOM_LIBS) $(am__append_86)
+LIBRADOS_DEPS = $(am__append_98)
+LIBRGW_DEPS = $(am__append_112)
 
 # This is used by the dencoder test
 
 # Do not use TCMALLOC with dencoder
 DENCODER_SOURCES = $(am__append_24) perfglue/disabled_heap_profiler.cc \
-	perfglue/disabled_stubs.cc $(am__append_104)
-DENCODER_DEPS = $(am__append_25)
+	perfglue/disabled_stubs.cc $(am__append_110)
+DENCODER_DEPS = $(am__append_25) $(am__append_124)
 radoslibdir = $(libdir)/rados-classes
 libarch_la_SOURCES = \
 	arch/intel.c \
@@ -7103,7 +8076,6 @@ libmon_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/LogMonitor.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/AuthMonitor.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Elector.cc \
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonitorStore.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/HealthMonitor.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DataHealthService.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/ConfigKeyService.cc
@@ -7111,7 +8083,8 @@ libmon_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE at libmon_la_LIBADD = $(LIBAUTH) $(LIBCOMMON) $(LIBOS) $(LIBMON_TYPES)
 LIBMDS_SOURCES = \
 	mds/Capability.cc \
-	mds/MDS.cc \
+	mds/MDSDaemon.cc \
+	mds/MDSRank.cc \
 	mds/Beacon.cc \
 	mds/locks.c \
 	mds/journal.cc \
@@ -7119,6 +8092,7 @@ LIBMDS_SOURCES = \
 	mds/Mutation.cc \
 	mds/MDCache.cc \
 	mds/RecoveryQueue.cc \
+	mds/StrayManager.cc \
 	mds/Locker.cc \
 	mds/Migrator.cc \
 	mds/MDBalancer.cc \
@@ -7131,6 +8105,7 @@ LIBMDS_SOURCES = \
 	mds/JournalPointer.cc \
 	mds/MDSTableClient.cc \
 	mds/MDSTableServer.cc \
+	mds/SimpleLock.cc \
 	mds/SnapRealm.cc \
 	mds/SnapServer.cc \
 	mds/snap.cc \
@@ -7143,28 +8118,24 @@ LIBMDS_SOURCES = \
 LIBMDS_DEPS = $(LIBOSDC)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at libmds_la_SOURCES = $(LIBMDS_SOURCES)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE at libmds_la_LIBADD = $(LIBMDS_DEPS)
-libos_types_la_SOURCES = \
-	os/Transaction.cc
-
+libos_types_la_SOURCES = os/Transaction.cc $(am__append_29)
 libos_types_la_CXXFLAGS = ${AM_CXXFLAGS}
- at ENABLE_SERVER_TRUE@libos_la_SOURCES = os/chain_xattr.cc \
+ at ENABLE_SERVER_TRUE@libos_la_SOURCES = os/chain_xattr.cc os/fs/FS.cc \
 @ENABLE_SERVER_TRUE@	os/DBObjectMap.cc os/GenericObjectMap.cc \
 @ENABLE_SERVER_TRUE@	os/FileJournal.cc os/FileStore.cc \
- at ENABLE_SERVER_TRUE@	os/FlatIndex.cc \
 @ENABLE_SERVER_TRUE@	os/GenericFileStoreBackend.cc \
 @ENABLE_SERVER_TRUE@	os/HashIndex.cc os/IndexManager.cc \
 @ENABLE_SERVER_TRUE@	os/JournalingObjectStore.cc \
 @ENABLE_SERVER_TRUE@	os/LevelDBStore.cc os/LFNIndex.cc \
 @ENABLE_SERVER_TRUE@	os/MemStore.cc os/KeyValueDB.cc \
 @ENABLE_SERVER_TRUE@	os/KeyValueStore.cc os/ObjectStore.cc \
- at ENABLE_SERVER_TRUE@	os/WBThrottle.cc os/KeyValueDB.cc \
- at ENABLE_SERVER_TRUE@	common/TrackedOp.cc $(am__append_28) \
- at ENABLE_SERVER_TRUE@	$(am__append_29) $(am__append_30) \
- at ENABLE_SERVER_TRUE@	$(am__append_40)
- at ENABLE_SERVER_TRUE@libos_la_CXXFLAGS = ${AM_CXXFLAGS} \
+ at ENABLE_SERVER_TRUE@	os/WBThrottle.cc common/TrackedOp.cc \
+ at ENABLE_SERVER_TRUE@	$(am__append_28) $(am__append_30) \
+ at ENABLE_SERVER_TRUE@	$(am__append_31) $(am__append_32) \
 @ENABLE_SERVER_TRUE@	$(am__append_41)
- at ENABLE_SERVER_TRUE@libos_la_LIBADD = $(LIBOS_TYPES) $(am__append_31) \
+ at ENABLE_SERVER_TRUE@libos_la_CXXFLAGS = ${AM_CXXFLAGS} \
 @ENABLE_SERVER_TRUE@	$(am__append_42)
+ at ENABLE_SERVER_TRUE@libos_la_LIBADD = $(LIBOS_TYPES) $(am__append_43)
 @ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at libos_rocksdb_la_SOURCES = os/RocksDBStore.cc
 @ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at libos_rocksdb_la_SOURCES = os/RocksDBStore.cc
 @ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at libos_rocksdb_la_CXXFLAGS = ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11
@@ -7187,7 +8158,6 @@ libosd_types_la_CXXFLAGS = ${AM_CXXFLAGS}
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ECMsgTypes.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ECTransaction.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/PGBackend.cc \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/Ager.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/HitSet.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/OSD.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/OSDCap.cc \
@@ -7200,16 +8170,14 @@ libosd_types_la_CXXFLAGS = ${AM_CXXFLAGS}
 
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libosd_la_CXXFLAGS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${AM_CXXFLAGS} \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_44)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libosd_la_LIBADD = $(LIBOSDC) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS) $(LIBOSD_TYPES) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS_TYPES) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_45)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libosd_la_LIBADD = $(LIBOSDC) $(LIBOS) $(LIBOSD_TYPES) $(LIBOS_TYPES)
 erasure_codelibdir = $(pkglibdir)/erasure-code
 erasure_codelib_LTLIBRARIES = libec_jerasure_generic.la \
 	$(am__append_50) $(am__append_52) $(am__append_54) \
-	libec_jerasure.la libec_lrc.la libec_shec.la $(am__append_60) \
-	$(am__append_127)
+	libec_jerasure.la libec_lrc.la libec_shec_generic.la \
+	$(am__append_59) $(am__append_61) $(am__append_63) \
+	libec_shec.la $(am__append_67) $(am__append_134)
 jerasure_sources = \
   erasure-code/ErasureCode.cc \
   erasure-code/jerasure/jerasure/src/cauchy.c \
@@ -7326,12 +8294,13 @@ libec_lrc_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_lrc_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(LIBJSON_SPIRIT)
 libec_lrc_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
 	$(am__append_56)
-libec_shec_la_SOURCES = \
+
+# SHEC plugin
+shec_sources = \
 	erasure-code/ErasureCode.cc \
 	erasure-code/shec/ErasureCodePluginShec.cc \
 	erasure-code/shec/ErasureCodeShec.cc \
 	erasure-code/shec/ErasureCodeShecTableCache.cc \
-	erasure-code/shec/shec.cc \
 	erasure-code/shec/determinant.c \
 	erasure-code/jerasure/jerasure/src/cauchy.c \
 	erasure-code/jerasure/jerasure/src/galois.c \
@@ -7350,23 +8319,106 @@ libec_shec_la_SOURCES = \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
 
-libec_shec_la_CFLAGS = ${AM_CFLAGS} \
+libec_shec_generic_la_SOURCES = ${shec_sources}
+libec_shec_generic_la_CFLAGS = ${AM_CFLAGS}  \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+
+libec_shec_generic_la_CXXFLAGS = ${AM_CXXFLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+
+libec_shec_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
+	$(am__append_57)
+libec_shec_neon_la_SOURCES = ${shec_sources} \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
+
+libec_shec_neon_la_CFLAGS = ${AM_CFLAGS}  \
+	${ARM_NEON_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+
+libec_shec_neon_la_CXXFLAGS = ${AM_CXXFLAGS} \
+	${ARM_NEON_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+
+libec_shec_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
+	$(am__append_58)
+libec_shec_sse3_la_SOURCES = ${shec_sources}
+libec_shec_sse3_la_CFLAGS = ${AM_CFLAGS}  \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+
+libec_shec_sse3_la_CXXFLAGS = ${AM_CXXFLAGS} \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+
+libec_shec_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
+	$(am__append_60)
+libec_shec_sse4_la_SOURCES = ${shec_sources}
+libec_shec_sse4_la_CFLAGS = ${AM_CFLAGS}  \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	${INTEL_SSE4_1_FLAGS} \
+	${INTEL_SSE4_2_FLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
 
-libec_shec_la_CXXFLAGS = ${AM_CXXFLAGS} \
+libec_shec_sse4_la_CXXFLAGS = ${AM_CXXFLAGS} \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	${INTEL_SSE4_1_FLAGS} \
+	${INTEL_SSE4_2_FLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
 
+libec_shec_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
+	$(am__append_62)
+libec_shec_la_SOURCES = \
+	erasure-code/shec/ErasureCodePluginSelectShec.cc
+
+libec_shec_la_CFLAGS = ${AM_CFLAGS}
+libec_shec_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_shec_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-#libec_shec_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-#libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__erasure_code_.*'
 libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
-	$(am__append_57)
+	$(am__append_64)
 @WITH_BETTER_YASM_ELF64_TRUE at isa_sources = \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/ErasureCode.cc \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/ec_base.c \
@@ -7390,6 +8442,24 @@ libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s \
+ at WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/ErasureCodeIsa.cc \
@@ -7403,8 +8473,8 @@ libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
 @WITH_BETTER_YASM_ELF64_TRUE at libec_isa_la_CCASFLAGS = ${AM_CCASFLAGS} -I $(abs_srcdir)/erasure-code/isa/isa-l/include/
 @WITH_BETTER_YASM_ELF64_TRUE at libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
 @WITH_BETTER_YASM_ELF64_TRUE at libec_isa_la_LDFLAGS = ${AM_LDFLAGS} \
- at WITH_BETTER_YASM_ELF64_TRUE@	-version-info 2:10:0 \
- at WITH_BETTER_YASM_ELF64_TRUE@	$(am__append_59)
+ at WITH_BETTER_YASM_ELF64_TRUE@	-version-info 2:14:0 \
+ at WITH_BETTER_YASM_ELF64_TRUE@	$(am__append_66)
 @WITH_BETTER_YASM_ELF64_TRUE at libec_isa_la_LIBTOOLFLAGS = --tag=CC
 liberasure_code_la_SOURCES = \
 	erasure-code/ErasureCodePlugin.cc
@@ -7429,7 +8499,8 @@ libosdc_la_SOURCES = \
 
 @ENABLE_CLIENT_TRUE at libclient_la_LIBADD = $(LIBOSDC) $(LIBEDIT_LIBS)
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at libclient_fuse_la_SOURCES = client/fuse_ll.cc
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at libclient_fuse_la_LIBADD = libclient.la -lfuse
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at libclient_fuse_la_LIBADD = libclient.la $(LIBFUSE_LIBS)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE at libclient_fuse_la_CXXFLAGS = $(AM_CXXFLAGS) $(LIBFUSE_CFLAGS)
 @ENABLE_CLIENT_TRUE at ceph_test_ioctls_SOURCES = client/test_ioctls.c
 libglobal_la_SOURCES = \
 	global/global_context.cc \
@@ -7447,8 +8518,8 @@ liblog_la_SOURCES = \
 	log/Log.cc \
 	log/SubsystemMap.cc
 
-libperfglue_la_SOURCES = $(am__append_66) $(am__append_69) \
-	$(am__append_72) $(am__append_73) $(am__append_74)
+libperfglue_la_SOURCES = $(am__append_73) $(am__append_76) \
+	$(am__append_79) $(am__append_80) $(am__append_81)
 @WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE at libperfglue_la_LIBADD = -ltcmalloc_minimal
 @WITH_TCMALLOC_TRUE at libperfglue_la_LIBADD = -ltcmalloc
 
@@ -7479,22 +8550,21 @@ libcommon_internal_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
 	common/ceph_frag.cc common/addr_parsing.c common/hobject.cc \
 	common/bloom_filter.cc common/linux_version.c common/module.c \
 	common/Readahead.cc common/Cycles.cc \
-	common/ContextCompletion.cc $(am__append_75) $(am__append_76) \
-	mon/MonCap.cc mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc \
-	osd/osd_types.cc osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
+	common/ContextCompletion.cc common/TracepointProvider.cc \
+	common/blkdev.cc $(am__append_82) mon/MonCap.cc \
+	mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc \
+	osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
 	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
-libcommon_api_la_SOURCES = \
-	common/buffer.cc
-
- at LINUX_TRUE@libcommon_api_la_CXXFLAGS = -fvisibility=hidden -fvisibility-inlines-hidden
 
 # inject crc in common
 libcommon_crc_la_SOURCES = common/sctp_crc32.c common/crc32c.cc \
 	common/crc32c_intel_baseline.c common/crc32c_intel_fast.c \
-	$(am__append_77)
+	$(am__append_83)
 @WITH_GOOD_YASM_ELF64_TRUE at libcommon_crc_la_LIBTOOLFLAGS = --tag=CC
-libcommon_la_SOURCES = 
-libcommon_la_LIBADD = $(LIBCOMMON_DEPS) libcommon_api.la
+ at HAVE_ARMV8_CRC_TRUE@libcommon_crc_aarch64_la_SOURCES = common/crc32c_aarch64.c
+ at HAVE_ARMV8_CRC_TRUE@libcommon_crc_aarch64_la_CFLAGS = $(AM_CFLAGS) $(ARM_CRC_FLAGS)
+libcommon_la_SOURCES = common/buffer.cc
+libcommon_la_LIBADD = $(LIBCOMMON_DEPS)
 libsecret_la_SOURCES = common/secret.c
 libsecret_la_LIBADD = $(KEYUTILS_LIB)
 libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
@@ -7503,9 +8573,9 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 	msg/simple/SimpleMessenger.cc msg/async/AsyncConnection.cc \
 	msg/async/AsyncMessenger.cc msg/async/Event.cc \
 	msg/async/net_handler.cc msg/async/EventSelect.cc \
-	$(am__append_81) $(am__append_82) $(am__append_83) \
-	$(am__append_84) $(am__append_85) $(am__append_86) \
-	$(am__append_87)
+	$(am__append_89) $(am__append_90) $(am__append_91) \
+	$(am__append_92) $(am__append_93) $(am__append_94) \
+	$(am__append_95)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_includedir = $(includedir)/rados
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_include_DATA = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/rados/librados.h \
@@ -7548,12 +8618,12 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 # We need this to avoid basename conflicts with the librados build tests in test/Makefile.am
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at librados_la_CXXFLAGS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	${AM_CXXFLAGS} \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_92)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_99)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at librados_la_LIBADD = $(LIBRADOS_DEPS) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at librados_la_LDFLAGS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	-version-info 2:0:0 \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_93)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_100)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/libradosstriper.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/RadosStriperImpl.cc \
@@ -7562,12 +8632,12 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 
 # We need this to avoid basename conflicts with the libradosstriper build tests in test/Makefile.am
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_CXXFLAGS = ${AM_CXXFLAGS}
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at LIBRADOSSTRIPER_DEPS = $(LIBRADOS_DEPS)
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_LIBADD = $(LIBRADOSSTRIPER_DEPS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at LIBRADOSSTRIPER_DEPS = librados_internal.la libcls_lock_client.la $(LIBOSDC) $(LIBCOMMON_DEPS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_LIBADD = $(LIBRADOSSTRIPER_DEPS) $(LIBRADOS) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstriper_la_LDFLAGS = ${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	-version-info \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	1:0:0 \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__append_96)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__append_103)
 librbd_types_la_SOURCES = \
 	librbd/WatchNotifyTypes.cc
 
@@ -7581,11 +8651,13 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncResizeRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncTrimRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/CopyupRequest.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/DiffIterate.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_api_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/librbd.cc
@@ -7593,21 +8665,18 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/librbd.cc
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_la_LIBADD = librbd_internal.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBOSDC) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_la_LIBADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la $(LIBRBD_TYPES) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) $(LIBCOMMON) $(LIBOSDC) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(PTHREAD_LIBS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(EXTRALIBS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_100)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(PTHREAD_LIBS) $(EXTRALIBS)
+
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_la_LDFLAGS = ${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	-version-info \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	1:0:0 \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_101)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_107)
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_la_CXXFLAGS = -fvisibility=hidden -fvisibility-inlines-hidden
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at librgw_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw.cc \
@@ -7643,17 +8712,18 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_replica_log.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_keystone.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_quota.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_dencoder.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_dencoder.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_object_expirer_core.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at librgw_la_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS}
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at CIVETWEB_INCLUDE = --include civetweb/include/civetweb_conf.h
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at CIVETWEB_INCLUDE = --include $(srcdir)/civetweb/include/civetweb_conf.h
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at libcivetweb_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_civetweb.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_civetweb_log.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	civetweb/src/civetweb.c
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at libcivetweb_la_CXXFLAGS = ${CIVETWEB_INCLUDE} -Woverloaded-virtual ${AM_CXXFLAGS}
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at libcivetweb_la_CFLAGS = -Icivetweb/include ${CIVETWEB_INCLUDE}
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at libcivetweb_la_CFLAGS = -I$(srcdir)/civetweb/include ${CIVETWEB_INCLUDE}
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_resolve.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_rest.cc \
@@ -7678,6 +8748,8 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_LDADD = $(LIBRGW) $(LIBCIVETWEB) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_SOURCES = rgw/rgw_admin.cc rgw/rgw_orphan.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_object_expirer_SOURCES = rgw/rgw_object_expirer.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_object_expirer_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_rgw_multiparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_rgw_jsonparser_SOURCES = \
@@ -7702,6 +8774,7 @@ librbd_types_la_SOURCES = \
 
 @ENABLE_CLIENT_TRUE at libcls_log_client_a_SOURCES = cls/log/cls_log_client.cc
 @ENABLE_CLIENT_TRUE at libcls_statelog_client_a_SOURCES = cls/statelog/cls_statelog_client.cc
+ at ENABLE_CLIENT_TRUE@libcls_timeindex_client_a_SOURCES = cls/timeindex/cls_timeindex_client.cc
 @ENABLE_CLIENT_TRUE at libcls_replica_log_client_a_SOURCES = \
 @ENABLE_CLIENT_TRUE@	cls/replica_log/cls_replica_log_types.cc \
 @ENABLE_CLIENT_TRUE@	cls/replica_log/cls_replica_log_ops.cc \
@@ -7717,9 +8790,13 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_types.cc \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_ops.cc
 
+ at ENABLE_CLIENT_TRUE@libcls_cephfs_client_la_SOURCES = cls/cephfs/cls_cephfs_client.cc
+ at ENABLE_CLIENT_TRUE@libcls_numops_client_la_SOURCES = cls/numops/cls_numops_client.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_hello_la_SOURCES = cls/hello/cls_hello.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_numops_la_SOURCES = cls/numops/cls_numops.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_numops_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@@ -7742,6 +8819,9 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_statelog_la_SOURCES = cls/statelog/cls_statelog.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_statelog_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_statelog_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_timeindex_la_SOURCES = cls/timeindex/cls_timeindex.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_timeindex_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_timeindex_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_replica_log_la_SOURCES = cls/replica_log/cls_replica_log.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_replica_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@@ -7756,28 +8836,40 @@ librbd_types_la_SOURCES = \
 
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rgw_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_rgw_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_cephfs_la_SOURCES = cls/cephfs/cls_cephfs.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_cephfs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libcls_cephfs_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at libcls_kvs_la_SOURCES = key_value_store/cls_kvs.cc
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at libcls_kvs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE at libcls_kvs_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_types_la_SOURCES = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/ActionTypes.cc
+
 
 # librbd_replay_la exists only to help with unit tests
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_la_SOURCES = rbd_replay/actions.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Deser.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_la_SOURCES = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/actions.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BufferReader.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/ImageNameMap.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/PendingIO.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_loc.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Ser.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.cc
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_la_LIBADD = $(LIBRBD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_la_LIBADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL)
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_SOURCES = rbd_replay/rbd-replay.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_LDADD = $(LIBRBD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_SOURCES = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd-replay.cc
+
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_LDADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_types.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON)
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_ios_la_SOURCES = rbd_replay/ios.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_replay_ios_la_LIBADD = $(LIBRBD) \
@@ -7785,12 +8877,17 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la
 
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_prep_SOURCES = rbd_replay/rbd-replay-prep.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_prep_LDADD = $(LIBRBD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_prep_SOURCES = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd-replay-prep.cc
+
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_replay_prep_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_types.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	-lbabeltrace \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	-lbabeltrace-ctf \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	-lboost_date_time
@@ -7803,7 +8900,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_122)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_129)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_erasure_code_non_regression_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code_non_regression.cc
 
@@ -7811,7 +8908,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_124)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_131)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_erasure_code_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code.cc
 
@@ -7819,8 +8916,11 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_126)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_example_la_SOURCES = test/erasure-code/ErasureCodePluginExample.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_133)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_example_la_SOURCES = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ErasureCodePluginExample.cc
+
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_example_la_CFLAGS = ${AM_CFLAGS}
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_example_la_CXXFLAGS = ${AM_CXXFLAGS}
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_example_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
@@ -7879,7 +8979,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_128)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_135)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCode.cc
@@ -7902,7 +9002,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_130)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_137)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_jerasure_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginJerasure.cc
 
@@ -7911,7 +9011,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_131)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_138)
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_isa_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeIsa.cc
@@ -7923,7 +9023,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_132)
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_139)
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_isa_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginIsa.cc
@@ -7935,7 +9035,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
- at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_134)
+ at ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_141)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_lrc_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeLrc.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${lrc_sources}
@@ -7945,7 +9045,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_135)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_142)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_lrc_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginLrc.cc
 
@@ -7954,40 +9054,120 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_137)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_144)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec.cc \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${libec_shec_la_SOURCES}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_CFLAGS = ${libec_shec_la_CFLAGS} \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_CFLAGS = ${libec_shec_la_CFLAGS}
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_LDADD =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_138)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_145)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_all.cc \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${libec_shec_la_SOURCES}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_CFLAGS = ${libec_shec_la_CFLAGS} \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_CFLAGS = ${libec_shec_la_CFLAGS}
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_all_LDADD =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_139)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_146)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_thread.cc \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${libec_shec_la_SOURCES}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_CFLAGS = ${libec_shec_la_CFLAGS} \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
 
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_CFLAGS = ${libec_shec_la_CFLAGS}
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_thread_LDADD =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_140)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_147)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_arguments_SOURCES = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_arguments.cc \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_arguments_CFLAGS = ${libec_shec_la_CFLAGS} \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_arguments_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/jerasure \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-I$(srcdir)/erasure-code/shec
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_shec_arguments_LDADD =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_148)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_shec_SOURCES = \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@        test/erasure-code/TestErasureCodePluginShec.cc
+
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_shec_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_plugin_shec_LDADD =  \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_149)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_SOURCES = test/erasure-code/TestShecPluginNEON.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_CFLAGS = ${AM_CFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_CXXFLAGS = ${AM_CXXFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse4_la_SOURCES = test/erasure-code/TestShecPluginSSE4.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse4_la_CFLAGS = ${AM_CFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse4_la_CXXFLAGS = ${AM_CXXFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse4_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse3_la_SOURCES = test/erasure-code/TestShecPluginSSE3.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse3_la_CFLAGS = ${AM_CFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse3_la_CXXFLAGS = ${AM_CXXFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse3_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_generic_la_SOURCES = test/erasure-code/TestShecPluginGeneric.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_generic_la_CFLAGS = ${AM_CFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_generic_la_CXXFLAGS = ${AM_CXXFLAGS}
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_generic_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at libec_test_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_erasure_code_example_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeExample.cc
@@ -8005,7 +9185,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_142)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_151)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at simple_client_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_dispatcher.cc
@@ -8017,7 +9197,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_143)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_152)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at xio_server_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_server.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.cc
@@ -8029,7 +9209,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_145)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_154)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE at xio_client_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.cc
@@ -8041,7 +9221,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
- at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_146)
+ at ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_155)
 
 # This should use LIBMDS_TYPES once it exists
 @ENABLE_CLIENT_TRUE at ceph_dencoder_SOURCES = \
@@ -8059,9 +9239,9 @@ librbd_types_la_SOURCES = \
 
 # These should always use explicit _CFLAGS/_CXXFLAGS so avoid basename conflicts
 @ENABLE_CLIENT_TRUE at ceph_dencoder_CFLAGS = ${AM_CFLAGS} \
- at ENABLE_CLIENT_TRUE@	$(am__append_147)
+ at ENABLE_CLIENT_TRUE@	$(am__append_156)
 @ENABLE_CLIENT_TRUE at ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS} \
- at ENABLE_CLIENT_TRUE@	$(am__append_148)
+ at ENABLE_CLIENT_TRUE@	$(am__append_157)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at libradostest_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados/test.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados/TestCase.cc
@@ -8097,6 +9277,8 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_smalliobench_LDADD = $(LIBRADOS) $(BOOST_PROGRAM_OPTIONS_LIBS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_omapbench_SOURCES = test/omap_bench.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_omapbench_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_objectstore_bench_SOURCES = test/objectstore_bench.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_objectstore_bench_LDADD = $(LIBOS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE at ceph_kvstorebench_SOURCES = \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	test/kv_store_bench.cc \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	key_value_store/kv_flat_btree_async.cc
@@ -8140,7 +9322,8 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rbd_SOURCES = test/cls_rbd/test_cls_rbd.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rbd_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) libcls_rbd_client.la libcls_lock_client.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD) $(CRYPTO_LIBS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(EXTRALIBS)
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_refcount_SOURCES = test/cls_refcount/test_cls_refcount.cc
@@ -8173,10 +9356,16 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_hello_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_numops_SOURCES = test/cls_numops/test_cls_numops.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_numops_LDADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@    $(LIBRADOS) libcls_numops_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@    $(UNITTEST_LDADD) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_numops_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_cmd_SOURCES = test/librados/cmd.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_cmd_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) $(LIBRADOS) $(CRYPTO_LIBS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(RADOS_TEST_LDADD) -luuid
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_cmd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_io_SOURCES = test/librados/io.cc
@@ -8270,55 +9459,40 @@ librbd_types_la_SOURCES = \
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_SOURCES = \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@        test/librbd/test_main.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@        test/librbd/test_main.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_fixture.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_LDADD = librbd_test.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_test_stub.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_internal.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBOSDC) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(UNITTEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_161)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at unittest_librbd_LDADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_test_stub.la librados_internal.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBOSDC) $(UNITTEST_LDADD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@        test/librbd/test_main.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_LDADD = librbd_test.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_api.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS_DEPS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(UNITTEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_162)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_LDADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_api.la $(LIBRADOS_DEPS) $(UNITTEST_LDADD) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_api_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_support.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_librbd.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_main.cc
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_api_CXXFLAGS = $(UNITTEST_CXXFLAGS)
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_api_LDADD =  \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(UNITTEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_163)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_api_LDADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) $(LIBRADOS) $(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
+
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.cc
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_fsx_LDADD = \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBKRBD) $(LIBRBD) $(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CRYPTO_LIBS) $(PTHREAD_LIBS) -luuid
+ at ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CRYPTO_LIBS) $(PTHREAD_LIBS)
 
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_fsx_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE at libradosstripertest_la_SOURCES = test/libradosstriper/TestCase.cc
@@ -8364,19 +9538,17 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at unittest_libcephfs_config_SOURCES = test/libcephfs_config.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at unittest_libcephfs_config_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) $(UNITTEST_LDADD)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at unittest_libcephfs_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_SOURCES = \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/test.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_SOURCES = test/libcephfs/test.cc \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/readdir_r_cb.cc \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/caps.cc \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/multiclient.cc
-
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/multiclient.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_178)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_libcephfs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at unittest_encoding_SOURCES = test/encoding.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_c_headers_SOURCES = test/test_c_headers.c
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_c_headers_LDADD = $(LIBRADOS) $(LIBCEPHFS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_test_c_headers_CFLAGS = $(AM_CFLAGS) \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Werror \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wstrict-prototypes \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wredundant-decls \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wall \
@@ -8391,10 +9563,9 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wformat-y2k \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Winit-self \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wignored-qualifiers \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wold-style-declaration \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wold-style-definition \
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wtype-limits
-
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wtype-limits \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_180)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at test_build_librgw_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	test/buildtest_skeleton.cc \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(librgw_la_SOURCES)
@@ -8412,32 +9583,39 @@ librbd_types_la_SOURCES = \
 #unittest_librgw_link_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
 #unittest_librgw_link_LDADD = $(LIBRGW) ${UNITTEST_LDADD}
 #unittest_librgw_link_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-#check_PROGRAMS += unittest_librgw_link
+#check_TESTPROGRAMS += unittest_librgw_link
 
 #unittest_librgw_SOURCES = test/librgw.cc
 #unittest_librgw_LDFLAGS = -lrt $(PTHREAD_CFLAGS) -lcurl ${AM_LDFLAGS}
 #unittest_librgw_LDADD =  librgw.la $(LIBRADOS) ${UNITTEST_LDADD} -lexpat $(CEPH_GLOBAL)
 #unittest_librgw_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-#check_PROGRAMS += unittest_librgw
+#check_TESTPROGRAMS += unittest_librgw
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cors_SOURCES = test/test_cors.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cors_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -luuid -lexpat
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -lexpat
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cors_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_rgw_manifest_SOURCES = test/rgw/test_rgw_manifest.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_rgw_manifest_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -luuid -lexpat
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -lexpat
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_rgw_manifest_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_rgw_obj_SOURCES = test/rgw/test_rgw_obj.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_rgw_obj_LDADD = \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -luuid -lexpat
+
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_rgw_obj_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rgw_meta_SOURCES = test/test_rgw_admin_meta.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rgw_meta_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -luuid -lexpat \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -lexpat \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la
@@ -8447,7 +9625,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rgw_log_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -luuid -lexpat \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -lexpat \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la
@@ -8457,10 +9635,11 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rgw_opstate_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -luuid -lexpat \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a libcls_log_client.a \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lcurl -lexpat \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a libcls_log_client.a  libcls_timeindex_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a libcls_refcount_client.la \
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS)
 
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rgw_opstate_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_test_cls_rgw_SOURCES = test/cls_rgw/test_cls_rgw.cc
@@ -8487,9 +9666,24 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE at ceph_perf_objectstore_SOURCES = test/objectstore/ObjectStoreTransactionBenchmark.cc
 @ENABLE_SERVER_TRUE at ceph_perf_objectstore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 @ENABLE_SERVER_TRUE at ceph_perf_objectstore_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+ at ENABLE_SERVER_TRUE@ceph_perf_local_SOURCES = test/perf_local.cc test/perf_helper.cc
+ at ENABLE_SERVER_TRUE@ceph_perf_local_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+ at ENABLE_SERVER_TRUE@ceph_perf_local_CXXFLAGS = ${AM_CXXFLAGS} 	\
+ at ENABLE_SERVER_TRUE@	${INTEL_SSE_FLAGS} \
+ at ENABLE_SERVER_TRUE@	${INTEL_SSE2_FLAGS}
+
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_server_SOURCES = test/msgr/perf_msgr_server.cc
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_server_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_server_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_client_SOURCES = test/msgr/perf_msgr_client.cc
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_client_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ at ENABLE_SERVER_TRUE@ceph_perf_msgr_client_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_objectstore_SOURCES = test/objectstore/store_test.cc
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_objectstore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_objectstore_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_keyvaluedb_SOURCES = test/objectstore/test_kv.cc
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_keyvaluedb_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ at ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_keyvaluedb_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_filestore_SOURCES = test/filestore/TestFileStore.cc
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_filestore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE at ceph_test_filestore_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -8569,13 +9763,13 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_osdscrub_LDADD =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_180)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_192)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pglog_SOURCES = test/osd/TestPGLog.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pglog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pglog_LDADD = $(LIBOSD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_181)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_193)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_hitset_SOURCES = test/osd/hitset.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_hitset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_hitset_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -8585,12 +9779,18 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_test_snap_mapper_SOURCES = test/test_snap_mapper.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_test_snap_mapper_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_test_snap_mapper_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pageset_SOURCES = test/test_pageset.cc
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pageset_LDADD = $(UNITTEST_LDADD)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at unittest_pageset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at unittest_rocksdb_option_static_SOURCES = test/objectstore/TestRocksdbOptionParse.cc
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at unittest_rocksdb_option_static_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) rocksdb/librocksdb.la
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at unittest_rocksdb_option_static_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11 -I rocksdb/include
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at unittest_rocksdb_option_SOURCES = test/objectstore/TestRocksdbOptionParse.cc
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at unittest_rocksdb_option_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) -lrocksdb
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at unittest_rocksdb_option_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11
 @ENABLE_SERVER_TRUE at unittest_chain_xattr_SOURCES = test/objectstore/chain_xattr.cc
 @ENABLE_SERVER_TRUE at unittest_chain_xattr_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 @ENABLE_SERVER_TRUE at unittest_chain_xattr_CXXFLAGS = $(UNITTEST_CXXFLAGS)
- at ENABLE_SERVER_TRUE@unittest_flatindex_SOURCES = test/os/TestFlatIndex.cc
- at ENABLE_SERVER_TRUE@unittest_flatindex_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
- at ENABLE_SERVER_TRUE@unittest_flatindex_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 @ENABLE_SERVER_TRUE at unittest_lfnindex_SOURCES = test/os/TestLFNIndex.cc
 @ENABLE_SERVER_TRUE at unittest_lfnindex_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 @ENABLE_SERVER_TRUE at unittest_lfnindex_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -8651,6 +9851,9 @@ unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_histogram_SOURCES = test/common/histogram.cc
 unittest_histogram_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_histogram_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_prioritized_queue_SOURCES = test/common/test_prioritized_queue.cc
+unittest_prioritized_queue_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_prioritized_queue_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_str_map_SOURCES = test/common/test_str_map.cc
 unittest_str_map_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_str_map_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -8678,10 +9881,10 @@ unittest_osdmap_LDADD = $(UNITTEST_LDADD) $(LIBCOMMON) $(CEPH_GLOBAL)
 unittest_workqueue_SOURCES = test/test_workqueue.cc
 unittest_workqueue_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_workqueue_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-unittest_striper_SOURCES = test/test_striper.cc 
+unittest_striper_SOURCES = test/test_striper.cc
 unittest_striper_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_striper_LDADD = $(LIBOSDC) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-unittest_prebufferedstreambuf_SOURCES = test/test_prebufferedstreambuf.cc 
+unittest_prebufferedstreambuf_SOURCES = test/test_prebufferedstreambuf.cc
 unittest_prebufferedstreambuf_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_prebufferedstreambuf_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD) $(EXTRALIBS)
 unittest_str_list_SOURCES = test/test_str_list.cc
@@ -8704,7 +9907,7 @@ unittest_mds_types_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_mds_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_osd_types_SOURCES = test/osd/types.cc
 unittest_osd_types_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-unittest_osd_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) 
+unittest_osd_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_lru_SOURCES = test/common/test_lru.cc
 unittest_lru_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_lru_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -8718,8 +9921,11 @@ unittest_signals_SOURCES = test/signals.cc
 unittest_signals_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_signals_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_bufferlist_SOURCES = test/bufferlist.cc
-unittest_bufferlist_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) 
+unittest_bufferlist_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_bufferlist_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_xlist_SOURCES = test/test_xlist.cc
+unittest_xlist_LDADD = $(UNITTEST_LDADD) $(LIBCOMMON)
+unittest_xlist_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_crc32c_SOURCES = test/common/test_crc32c.cc
 unittest_crc32c_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_crc32c_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -8796,6 +10002,12 @@ unittest_tableformatter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_bit_vector_SOURCES = test/common/test_bit_vector.cc
 unittest_bit_vector_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_bit_vector_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_subprocess_SOURCES = test/test_subprocess.cc
+unittest_subprocess_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
+unittest_subprocess_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_async_compressor_SOURCES = test/common/test_async_compressor.cc
+unittest_async_compressor_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_async_compressor_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) $(LIBCOMPRESSOR)
 ceph_test_objectcacher_stress_SOURCES = \
 	test/osdc/object_cacher_stress.cc \
 	test/osdc/FakeWriteback.cc
@@ -8812,15 +10024,15 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_radosacl_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_SOURCES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/rados.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/rados_import.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/rados_export.cc \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/rados_sync.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/RadosDump.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/RadosImport.cc \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/rados/PoolDump.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	common/obj_bencher.cc # \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	needs cleanup so it can \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	go in libcommon.la
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(CEPH_GLOBAL)
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(LIBRADOSSTRIPER) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_client_debug_SOURCES = tools/ceph-client-debug.cc
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_client_debug_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) $(LIBCOMMON)
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at ceph_client_debug_LDADD = $(LIBCEPHFS) $(LIBCLIENT) $(CEPH_GLOBAL) $(LIBCOMMON)
 @ENABLE_SERVER_TRUE at ceph_osdomap_tool_SOURCES = tools/ceph_osdomap_tool.cc
 @ENABLE_SERVER_TRUE at ceph_osdomap_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL) $(BOOST_PROGRAM_OPTIONS_LIBS)
 @ENABLE_SERVER_TRUE at ceph_monstore_tool_SOURCES = tools/ceph_monstore_tool.cc
@@ -8828,13 +10040,14 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_SERVER_TRUE at ceph_kvstore_tool_SOURCES = tools/ceph_kvstore_tool.cc
 @ENABLE_SERVER_TRUE at ceph_kvstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL)
 @ENABLE_SERVER_TRUE at ceph_kvstore_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_objectstore_tool_SOURCES = tools/ceph_objectstore_tool.cc
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at ceph_monstore_update_crushdir = $(libdir)/ceph
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at ceph_monstore_update_crush_SCRIPTS = tools/ceph-monstore-update-crush.sh
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_objectstore_tool_SOURCES = tools/ceph_objectstore_tool.cc tools/RadosDump.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_objectstore_tool_LDADD =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBOS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBRADOS) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_193)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_206)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at cephfs_journal_tool_SOURCES = \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/cephfs-journal-tool.cc \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/JournalTool.cc \
@@ -8852,6 +10065,12 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/MDSUtility.cc
 
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at cephfs_table_tool_LDADD = $(LIBMDS) $(LIBRADOS) $(CEPH_GLOBAL)
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at cephfs_data_scan_SOURCES = \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/cephfs-data-scan.cc \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/DataScan.cc \
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/MDSUtility.cc
+
+ at ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE at cephfs_data_scan_LDADD = $(LIBMDS) libcls_cephfs_client.la $(LIBRADOS) $(CEPH_GLOBAL)
 monmaptool_SOURCES = tools/monmaptool.cc
 monmaptool_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
 crushtool_SOURCES = tools/crushtool.cc
@@ -8860,16 +10079,55 @@ osdmaptool_SOURCES = tools/osdmaptool.cc
 osdmaptool_LDADD = $(CEPH_GLOBAL)
 ceph_psim_SOURCES = tools/psim.cc
 ceph_psim_LDADD = $(CEPH_GLOBAL)
- at WITH_REST_BENCH_TRUE@rest_bench_SOURCES = tools/rest_bench.cc \
- at WITH_REST_BENCH_TRUE@	common/obj_bencher.cc # needs cleanup so \
- at WITH_REST_BENCH_TRUE@	it can go in libcommon.la
- at WITH_REST_BENCH_TRUE@rest_bench_LDADD = $(CEPH_GLOBAL) \
- at WITH_REST_BENCH_TRUE@	$(am__append_197) $(am__append_198)
- at WITH_REST_BENCH_TRUE@@WITH_SYSTEM_LIBS3_FALSE at rest_bench_CXXFLAGS = ${AM_CXXFLAGS} -I$(top_srcdir)/src/libs3/inc
 ceph_conf_SOURCES = tools/ceph_conf.cc
 ceph_conf_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
 ceph_authtool_SOURCES = tools/ceph_authtool.cc
 ceph_authtool_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
+libcompressor_la_SOURCES = \
+	compressor/Compressor.cc \
+	compressor/AsyncCompressor.cc
+
+libcompressor_la_LIBADD = $(LIBCOMMON)
+ at WITH_LTTNG_TRUE@libosd_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/oprequest.c \
+ at WITH_LTTNG_TRUE@	tracing/osd.c \
+ at WITH_LTTNG_TRUE@	tracing/pg.c
+
+ at WITH_LTTNG_TRUE@nodist_libosd_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/oprequest.h \
+ at WITH_LTTNG_TRUE@	tracing/osd.h \
+ at WITH_LTTNG_TRUE@	tracing/pg.h
+
+libosd_tp_la_LIBADD = -llttng-ust -ldl
+libosd_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+libosd_tp_la_LDFLAGS = -version-info 1:0:0
+ at WITH_LTTNG_TRUE@librados_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/librados.c
+
+ at WITH_LTTNG_TRUE@nodist_librados_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/librados.h
+
+librados_tp_la_LIBADD = -llttng-ust -ldl
+librados_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+librados_tp_la_LDFLAGS = -version-info 2:0:0
+ at WITH_LTTNG_TRUE@librbd_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/librbd.c
+
+ at WITH_LTTNG_TRUE@nodist_librbd_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/librbd.h
+
+librbd_tp_la_LIBADD = -llttng-ust -ldl
+librbd_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+librbd_tp_la_LDFLAGS = -version-info 1:0:0
+ at WITH_LTTNG_TRUE@libos_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/objectstore.c
+
+ at WITH_LTTNG_TRUE@nodist_libos_tp_la_SOURCES = \
+ at WITH_LTTNG_TRUE@	tracing/objectstore.h
+
+libos_tp_la_LIBADD = -llttng-ust -ldl
+libos_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+libos_tp_la_LDFLAGS = -version-info 1:0:0
 
 # subdirs
 
@@ -8884,7 +10142,7 @@ editpaths = sed \
 	-e 's|@@GCOV_PREFIX_STRIP[@][@]|$(GCOV_PREFIX_STRIP)|g'
 
 shell_scripts = ceph-debugpack ceph-post-file ceph-crush-location \
-	$(am__append_219)
+	$(am__append_232)
 doc_DATA = $(srcdir)/sample.ceph.conf sample.fetch_config
 
 # various scripts
@@ -8893,26 +10151,18 @@ shell_common_SCRIPTS = ceph_common.sh
 ceph_libexecdir = $(libexecdir)/ceph
 ceph_libexec_SCRIPTS = ceph-osd-prestart.sh
 
-# tests to actually run on "make check"; if you need extra, non-test,
-# executables built, you need to replace this with manual assignments
-# target by target
-TESTS = \
-	$(check_PROGRAMS) \
-	$(check_SCRIPTS)
-
-
 # TODO: If we're running the parallel test harness (the preferred harness), this should be AM_TESTS_ENVIRONMENT instead.
 # See: https://www.gnu.org/software/automake/manual/html_node/Scripts_002dbased-Testsuites.html
 # I don't see the serial-tests Automake option anywhere, but my AM_TESTS_ENVIRONMENT was being ignored.
 @WITH_LTTNG_TRUE at TESTS_ENVIRONMENT = LD_PRELOAD=liblttng-ust-fork.so; export LD_PRELOAD; echo "LD_PRELOAD=$${LD_PRELOAD}";
 
 # pybind
-python_PYTHON = $(am__append_202) $(am__append_205) $(am__append_209) \
-	$(am__append_214) $(am__append_218)
+python_PYTHON = $(am__append_213) $(am__append_216) $(am__append_220) \
+	$(am__append_226) $(am__append_231)
 @ENABLE_CLIENT_TRUE at bash_completiondir = $(sysconfdir)/bash_completion.d
 @ENABLE_CLIENT_TRUE at bash_completion_DATA =  \
 @ENABLE_CLIENT_TRUE@	$(srcdir)/bash_completion/ceph \
- at ENABLE_CLIENT_TRUE@	$(am__append_204) $(am__append_207)
+ at ENABLE_CLIENT_TRUE@	$(am__append_215) $(am__append_218)
 @ENABLE_CLIENT_TRUE at ceph_syn_SOURCES = ceph_syn.cc \
 @ENABLE_CLIENT_TRUE@	client/SyntheticClient.cc # uses g_conf.. \
 @ENABLE_CLIENT_TRUE@	needs cleanup
@@ -8927,15 +10177,21 @@ python_PYTHON = $(am__append_202) $(am__append_205) $(am__append_209) \
 # Fuse targets
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at ceph_fuse_SOURCES = ceph_fuse.cc
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at ceph_fuse_LDADD = $(LIBCLIENT_FUSE) $(CEPH_GLOBAL)
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.cc
- at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE at rbd_fuse_LDADD = -lfuse $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.cc
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_fuse_CXXFLAGS = $(AM_CXXFLAGS) $(LIBFUSE_CFLAGS)
+ at ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at rbd_fuse_LDADD = $(LIBFUSE_LIBS) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at cephfs_SOURCES = cephfs.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at cephfs_LDADD = $(LIBCOMMON)
 
 # libcephfs (this should go somewhere else in the future)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_la_SOURCES = libcephfs.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_la_LIBADD = $(LIBCLIENT) $(LIBCOMMON) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
- at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*'
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_la_LDFLAGS = ${AM_LDFLAGS} \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-version-info \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	1:0:0 \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-export-symbols-regex \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	'^ceph_.*' \
+ at ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_227)
 
 # jni library (java source is in src/java)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_SOURCES = \
@@ -8944,11 +10200,11 @@ python_PYTHON = $(am__append_202) $(am__append_205) $(am__append_209) \
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	java/native/JniConstants.cpp \
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	java/native/JniConstants.h
 
- at ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(EXTRALIBS)
+ at ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(LIBCOMMON) $(EXTRALIBS)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE at libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
 @ENABLE_SERVER_TRUE at ceph_sbin_SCRIPTS = ceph-create-keys \
- at ENABLE_SERVER_TRUE@	$(am__append_224)
+ at ENABLE_SERVER_TRUE@	$(am__append_237)
 @ENABLE_SERVER_TRUE at mount_ceph_SOURCES = mount/mount.ceph.c
 @ENABLE_SERVER_TRUE at mount_ceph_LDADD = $(LIBSECRET) $(LIBCOMMON)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE at ceph_mon_SOURCES = ceph_mon.cc
@@ -8965,7 +10221,7 @@ all: $(BUILT_SOURCES) acconfig.h
 
 .SUFFIXES:
 .SUFFIXES: .S .c .cc .cpp .lo .log .o .obj .s .test .test$(EXEEXT) .trs
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/lrc/Makefile.am $(srcdir)/erasure-code/s [...]
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/l [...]
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
 	    *$$dep*) \
@@ -8986,7 +10242,7 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
 	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
 	esac;
-$(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/lrc/Makefile.am $(srcdir)/erasure-code/shec/Makefile.am $(srcdir)/erasure-code/isa/Mak [...]
+$(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/lrc/Makefile.am $(srcdir)/erasure-code/shec/Mak [...]
 
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
@@ -9061,6 +10317,20 @@ libcls_statelog_client.a: $(libcls_statelog_client_a_OBJECTS) $(libcls_statelog_
 	$(AM_V_at)-rm -f libcls_statelog_client.a
 	$(AM_V_AR)$(libcls_statelog_client_a_AR) libcls_statelog_client.a $(libcls_statelog_client_a_OBJECTS) $(libcls_statelog_client_a_LIBADD)
 	$(AM_V_at)$(RANLIB) libcls_statelog_client.a
+cls/timeindex/$(am__dirstamp):
+	@$(MKDIR_P) cls/timeindex
+	@: > cls/timeindex/$(am__dirstamp)
+cls/timeindex/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) cls/timeindex/$(DEPDIR)
+	@: > cls/timeindex/$(DEPDIR)/$(am__dirstamp)
+cls/timeindex/cls_timeindex_client.$(OBJEXT):  \
+	cls/timeindex/$(am__dirstamp) \
+	cls/timeindex/$(DEPDIR)/$(am__dirstamp)
+
+libcls_timeindex_client.a: $(libcls_timeindex_client_a_OBJECTS) $(libcls_timeindex_client_a_DEPENDENCIES) $(EXTRA_libcls_timeindex_client_a_DEPENDENCIES) 
+	$(AM_V_at)-rm -f libcls_timeindex_client.a
+	$(AM_V_AR)$(libcls_timeindex_client_a_AR) libcls_timeindex_client.a $(libcls_timeindex_client_a_OBJECTS) $(libcls_timeindex_client_a_LIBADD)
+	$(AM_V_at)$(RANLIB) libcls_timeindex_client.a
 cls/user/$(am__dirstamp):
 	@$(MKDIR_P) cls/user
 	@: > cls/user/$(am__dirstamp)
@@ -9354,11 +10624,27 @@ client/Trace.lo: client/$(am__dirstamp) \
 
 libclient.la: $(libclient_la_OBJECTS) $(libclient_la_DEPENDENCIES) $(EXTRA_libclient_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK) $(am_libclient_la_rpath) $(libclient_la_OBJECTS) $(libclient_la_LIBADD) $(LIBS)
-client/fuse_ll.lo: client/$(am__dirstamp) \
+client/libclient_fuse_la-fuse_ll.lo: client/$(am__dirstamp) \
 	client/$(DEPDIR)/$(am__dirstamp)
 
 libclient_fuse.la: $(libclient_fuse_la_OBJECTS) $(libclient_fuse_la_DEPENDENCIES) $(EXTRA_libclient_fuse_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(CXXLINK) $(am_libclient_fuse_la_rpath) $(libclient_fuse_la_OBJECTS) $(libclient_fuse_la_LIBADD) $(LIBS)
+	$(AM_V_CXXLD)$(libclient_fuse_la_LINK) $(am_libclient_fuse_la_rpath) $(libclient_fuse_la_OBJECTS) $(libclient_fuse_la_LIBADD) $(LIBS)
+cls/cephfs/$(am__dirstamp):
+	@$(MKDIR_P) cls/cephfs
+	@: > cls/cephfs/$(am__dirstamp)
+cls/cephfs/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) cls/cephfs/$(DEPDIR)
+	@: > cls/cephfs/$(DEPDIR)/$(am__dirstamp)
+cls/cephfs/cls_cephfs.lo: cls/cephfs/$(am__dirstamp) \
+	cls/cephfs/$(DEPDIR)/$(am__dirstamp)
+
+libcls_cephfs.la: $(libcls_cephfs_la_OBJECTS) $(libcls_cephfs_la_DEPENDENCIES) $(EXTRA_libcls_cephfs_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libcls_cephfs_la_LINK) $(am_libcls_cephfs_la_rpath) $(libcls_cephfs_la_OBJECTS) $(libcls_cephfs_la_LIBADD) $(LIBS)
+cls/cephfs/cls_cephfs_client.lo: cls/cephfs/$(am__dirstamp) \
+	cls/cephfs/$(DEPDIR)/$(am__dirstamp)
+
+libcls_cephfs_client.la: $(libcls_cephfs_client_la_OBJECTS) $(libcls_cephfs_client_la_DEPENDENCIES) $(EXTRA_libcls_cephfs_client_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(CXXLINK) $(am_libcls_cephfs_client_la_rpath) $(libcls_cephfs_client_la_OBJECTS) $(libcls_cephfs_client_la_LIBADD) $(LIBS)
 cls/hello/$(am__dirstamp):
 	@$(MKDIR_P) cls/hello
 	@: > cls/hello/$(am__dirstamp)
@@ -9406,6 +10692,22 @@ cls/log/cls_log.lo: cls/log/$(am__dirstamp) \
 
 libcls_log.la: $(libcls_log_la_OBJECTS) $(libcls_log_la_DEPENDENCIES) $(EXTRA_libcls_log_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libcls_log_la_LINK) $(am_libcls_log_la_rpath) $(libcls_log_la_OBJECTS) $(libcls_log_la_LIBADD) $(LIBS)
+cls/numops/$(am__dirstamp):
+	@$(MKDIR_P) cls/numops
+	@: > cls/numops/$(am__dirstamp)
+cls/numops/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) cls/numops/$(DEPDIR)
+	@: > cls/numops/$(DEPDIR)/$(am__dirstamp)
+cls/numops/cls_numops.lo: cls/numops/$(am__dirstamp) \
+	cls/numops/$(DEPDIR)/$(am__dirstamp)
+
+libcls_numops.la: $(libcls_numops_la_OBJECTS) $(libcls_numops_la_DEPENDENCIES) $(EXTRA_libcls_numops_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libcls_numops_la_LINK) $(am_libcls_numops_la_rpath) $(libcls_numops_la_OBJECTS) $(libcls_numops_la_LIBADD) $(LIBS)
+cls/numops/cls_numops_client.lo: cls/numops/$(am__dirstamp) \
+	cls/numops/$(DEPDIR)/$(am__dirstamp)
+
+libcls_numops_client.la: $(libcls_numops_client_la_OBJECTS) $(libcls_numops_client_la_DEPENDENCIES) $(EXTRA_libcls_numops_client_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(CXXLINK) $(am_libcls_numops_client_la_rpath) $(libcls_numops_client_la_OBJECTS) $(libcls_numops_client_la_LIBADD) $(LIBS)
 cls/rbd/$(am__dirstamp):
 	@$(MKDIR_P) cls/rbd
 	@: > cls/rbd/$(am__dirstamp)
@@ -9478,6 +10780,11 @@ cls/statelog/cls_statelog.lo: cls/statelog/$(am__dirstamp) \
 
 libcls_statelog.la: $(libcls_statelog_la_OBJECTS) $(libcls_statelog_la_DEPENDENCIES) $(EXTRA_libcls_statelog_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libcls_statelog_la_LINK) $(am_libcls_statelog_la_rpath) $(libcls_statelog_la_OBJECTS) $(libcls_statelog_la_LIBADD) $(LIBS)
+cls/timeindex/cls_timeindex.lo: cls/timeindex/$(am__dirstamp) \
+	cls/timeindex/$(DEPDIR)/$(am__dirstamp)
+
+libcls_timeindex.la: $(libcls_timeindex_la_OBJECTS) $(libcls_timeindex_la_DEPENDENCIES) $(EXTRA_libcls_timeindex_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libcls_timeindex_la_LINK) $(am_libcls_timeindex_la_rpath) $(libcls_timeindex_la_OBJECTS) $(libcls_timeindex_la_LIBADD) $(LIBS)
 cls/user/cls_user.lo: cls/user/$(am__dirstamp) \
 	cls/user/$(DEPDIR)/$(am__dirstamp)
 
@@ -9488,14 +10795,11 @@ cls/version/cls_version.lo: cls/version/$(am__dirstamp) \
 
 libcls_version.la: $(libcls_version_la_OBJECTS) $(libcls_version_la_DEPENDENCIES) $(EXTRA_libcls_version_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libcls_version_la_LINK) $(am_libcls_version_la_rpath) $(libcls_version_la_OBJECTS) $(libcls_version_la_LIBADD) $(LIBS)
-
-libcommon.la: $(libcommon_la_OBJECTS) $(libcommon_la_DEPENDENCIES) $(EXTRA_libcommon_la_DEPENDENCIES) 
-	$(AM_V_CCLD)$(LINK)  $(libcommon_la_OBJECTS) $(libcommon_la_LIBADD) $(LIBS)
-common/libcommon_api_la-buffer.lo: common/$(am__dirstamp) \
+common/buffer.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 
-libcommon_api.la: $(libcommon_api_la_OBJECTS) $(libcommon_api_la_DEPENDENCIES) $(EXTRA_libcommon_api_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libcommon_api_la_LINK)  $(libcommon_api_la_OBJECTS) $(libcommon_api_la_LIBADD) $(LIBS)
+libcommon.la: $(libcommon_la_OBJECTS) $(libcommon_la_DEPENDENCIES) $(EXTRA_libcommon_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(CXXLINK)  $(libcommon_la_OBJECTS) $(libcommon_la_LIBADD) $(LIBS)
 common/libcommon_crc_la-sctp_crc32.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/libcommon_crc_la-crc32c.lo: common/$(am__dirstamp) \
@@ -9511,6 +10815,11 @@ common/libcommon_crc_la-crc32c_intel_fast_zero_asm.lo:  \
 
 libcommon_crc.la: $(libcommon_crc_la_OBJECTS) $(libcommon_crc_la_DEPENDENCIES) $(EXTRA_libcommon_crc_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libcommon_crc_la_LINK)  $(libcommon_crc_la_OBJECTS) $(libcommon_crc_la_LIBADD) $(LIBS)
+common/libcommon_crc_aarch64_la-crc32c_aarch64.lo:  \
+	common/$(am__dirstamp) common/$(DEPDIR)/$(am__dirstamp)
+
+libcommon_crc_aarch64.la: $(libcommon_crc_aarch64_la_OBJECTS) $(libcommon_crc_aarch64_la_DEPENDENCIES) $(EXTRA_libcommon_crc_aarch64_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libcommon_crc_aarch64_la_LINK) $(am_libcommon_crc_aarch64_la_rpath) $(libcommon_crc_aarch64_la_OBJECTS) $(libcommon_crc_aarch64_la_LIBADD) $(LIBS)
 common/DecayCounter.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/LogClient.lo: common/$(am__dirstamp) \
@@ -9655,6 +10964,8 @@ common/Cycles.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/ContextCompletion.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
+common/TracepointProvider.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
 common/blkdev.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/address_helper.lo: common/$(am__dirstamp) \
@@ -9692,6 +11003,19 @@ mds/flock.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 
 libcommon_internal.la: $(libcommon_internal_la_OBJECTS) $(libcommon_internal_la_DEPENDENCIES) $(EXTRA_libcommon_internal_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK)  $(libcommon_internal_la_OBJECTS) $(libcommon_internal_la_LIBADD) $(LIBS)
+compressor/$(am__dirstamp):
+	@$(MKDIR_P) compressor
+	@: > compressor/$(am__dirstamp)
+compressor/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) compressor/$(DEPDIR)
+	@: > compressor/$(DEPDIR)/$(am__dirstamp)
+compressor/Compressor.lo: compressor/$(am__dirstamp) \
+	compressor/$(DEPDIR)/$(am__dirstamp)
+compressor/AsyncCompressor.lo: compressor/$(am__dirstamp) \
+	compressor/$(DEPDIR)/$(am__dirstamp)
+
+libcompressor.la: $(libcompressor_la_OBJECTS) $(libcompressor_la_DEPENDENCIES) $(EXTRA_libcompressor_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(CXXLINK)  $(libcompressor_la_OBJECTS) $(libcompressor_la_LIBADD) $(LIBS)
 crush/$(am__dirstamp):
 	@$(MKDIR_P) crush
 	@: > crush/$(am__dirstamp)
@@ -9712,6 +11036,15 @@ crush/CrushTester.lo: crush/$(am__dirstamp) \
 
 libcrush.la: $(libcrush_la_OBJECTS) $(libcrush_la_DEPENDENCIES) $(EXTRA_libcrush_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK)  $(libcrush_la_OBJECTS) $(libcrush_la_LIBADD) $(LIBS)
+erasure-code/$(am__dirstamp):
+	@$(MKDIR_P) erasure-code
+	@: > erasure-code/$(am__dirstamp)
+erasure-code/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) erasure-code/$(DEPDIR)
+	@: > erasure-code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/libec_example_la-ErasureCode.lo:  \
+	erasure-code/$(am__dirstamp) \
+	erasure-code/$(DEPDIR)/$(am__dirstamp)
 test/erasure-code/$(am__dirstamp):
 	@$(MKDIR_P) test/erasure-code
 	@: > test/erasure-code/$(am__dirstamp)
@@ -9742,12 +11075,6 @@ test/erasure-code/libec_hangs_la-ErasureCodePluginHangs.lo:  \
 
 libec_hangs.la: $(libec_hangs_la_OBJECTS) $(libec_hangs_la_DEPENDENCIES) $(EXTRA_libec_hangs_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libec_hangs_la_LINK) $(am_libec_hangs_la_rpath) $(libec_hangs_la_OBJECTS) $(libec_hangs_la_LIBADD) $(LIBS)
-erasure-code/$(am__dirstamp):
-	@$(MKDIR_P) erasure-code
-	@: > erasure-code/$(am__dirstamp)
-erasure-code/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) erasure-code/$(DEPDIR)
-	@: > erasure-code/$(DEPDIR)/$(am__dirstamp)
 erasure-code/libec_isa_la-ErasureCode.lo:  \
 	erasure-code/$(am__dirstamp) \
 	erasure-code/$(DEPDIR)/$(am__dirstamp)
@@ -9820,6 +11147,60 @@ erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_avx.asm.lo:  \
 erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_sse.asm.lo:  \
 	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
 	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx2.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_sse.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx2.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_sse.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx2.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_sse.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx2.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_sse.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx2.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_sse.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx2.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_sse.asm.lo:  \
+	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
+	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
 erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mul_avx.asm.lo:  \
 	erasure-code/isa/isa-l/erasure_code/$(am__dirstamp) \
 	erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/$(am__dirstamp)
@@ -10164,81 +11545,297 @@ test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo:
 
 libec_missing_version.la: $(libec_missing_version_la_OBJECTS) $(libec_missing_version_la_DEPENDENCIES) $(EXTRA_libec_missing_version_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libec_missing_version_la_LINK) $(am_libec_missing_version_la_rpath) $(libec_missing_version_la_OBJECTS) $(libec_missing_version_la_LIBADD) $(LIBS)
-erasure-code/libec_shec_la-ErasureCode.lo:  \
-	erasure-code/$(am__dirstamp) \
-	erasure-code/$(DEPDIR)/$(am__dirstamp)
 erasure-code/shec/$(am__dirstamp):
 	@$(MKDIR_P) erasure-code/shec
 	@: > erasure-code/shec/$(am__dirstamp)
 erasure-code/shec/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) erasure-code/shec/$(DEPDIR)
 	@: > erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/libec_shec_la-ErasureCodePluginShec.lo:  \
+erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo:  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/libec_shec_la-ErasureCodeShec.lo:  \
+
+libec_shec.la: $(libec_shec_la_OBJECTS) $(libec_shec_la_DEPENDENCIES) $(EXTRA_libec_shec_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_shec_la_LINK) -rpath $(erasure_codelibdir) $(libec_shec_la_OBJECTS) $(libec_shec_la_LIBADD) $(LIBS)
+erasure-code/libec_shec_generic_la-ErasureCode.lo:  \
+	erasure-code/$(am__dirstamp) \
+	erasure-code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo:  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/libec_shec_la-ErasureCodeShecTableCache.lo:  \
+erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo:  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/libec_shec_la-shec.lo:  \
+erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo:  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/libec_shec_la-determinant.lo:  \
+erasure-code/shec/libec_shec_generic_la-determinant.lo:  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/jerasure/src/libec_shec_la-cauchy.lo:  \
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-cauchy.lo:  \
 	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
 	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/jerasure/src/libec_shec_la-galois.lo:  \
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-galois.lo:  \
 	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
 	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/jerasure/src/libec_shec_la-jerasure.lo:  \
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-jerasure.lo:  \
 	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
 	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/jerasure/src/libec_shec_la-liberation.lo:  \
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-liberation.lo:  \
 	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
 	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/jerasure/src/libec_shec_la-reed_sol.lo:  \
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-reed_sol.lo:  \
 	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
 	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_wgen.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_wgen.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_method.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_method.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w16.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w16.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w32.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w32.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w64.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w64.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w128.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w128.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_general.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_general.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w4.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_rand.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w8.lo:  \
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo:  \
 	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
 	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
 
-libec_shec.la: $(libec_shec_la_OBJECTS) $(libec_shec_la_DEPENDENCIES) $(EXTRA_libec_shec_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libec_shec_la_LINK) -rpath $(erasure_codelibdir) $(libec_shec_la_OBJECTS) $(libec_shec_la_LIBADD) $(LIBS)
+libec_shec_generic.la: $(libec_shec_generic_la_OBJECTS) $(libec_shec_generic_la_DEPENDENCIES) $(EXTRA_libec_shec_generic_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_shec_generic_la_LINK) -rpath $(erasure_codelibdir) $(libec_shec_generic_la_OBJECTS) $(libec_shec_generic_la_LIBADD) $(LIBS)
+erasure-code/libec_shec_neon_la-ErasureCode.lo:  \
+	erasure-code/$(am__dirstamp) \
+	erasure-code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_neon_la-determinant.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-cauchy.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-galois.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-jerasure.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-liberation.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-reed_sol.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_wgen.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_method.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w16.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w32.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w64.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w128.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_general.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo:  \
+	erasure-code/jerasure/gf-complete/src/neon/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo:  \
+	erasure-code/jerasure/gf-complete/src/neon/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo:  \
+	erasure-code/jerasure/gf-complete/src/neon/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w32_neon.lo:  \
+	erasure-code/jerasure/gf-complete/src/neon/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w64_neon.lo:  \
+	erasure-code/jerasure/gf-complete/src/neon/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/$(am__dirstamp)
+
+libec_shec_neon.la: $(libec_shec_neon_la_OBJECTS) $(libec_shec_neon_la_DEPENDENCIES) $(EXTRA_libec_shec_neon_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_shec_neon_la_LINK) $(am_libec_shec_neon_la_rpath) $(libec_shec_neon_la_OBJECTS) $(libec_shec_neon_la_LIBADD) $(LIBS)
+erasure-code/libec_shec_sse3_la-ErasureCode.lo:  \
+	erasure-code/$(am__dirstamp) \
+	erasure-code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse3_la-determinant.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-cauchy.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-galois.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-jerasure.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-liberation.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-reed_sol.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_wgen.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_method.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w16.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w32.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w64.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w128.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_general.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+
+libec_shec_sse3.la: $(libec_shec_sse3_la_OBJECTS) $(libec_shec_sse3_la_DEPENDENCIES) $(EXTRA_libec_shec_sse3_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_shec_sse3_la_LINK) $(am_libec_shec_sse3_la_rpath) $(libec_shec_sse3_la_OBJECTS) $(libec_shec_sse3_la_LIBADD) $(LIBS)
+erasure-code/libec_shec_sse4_la-ErasureCode.lo:  \
+	erasure-code/$(am__dirstamp) \
+	erasure-code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/libec_shec_sse4_la-determinant.lo:  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-cauchy.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-galois.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-jerasure.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-liberation.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-reed_sol.lo:  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_wgen.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_method.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w16.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w32.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w64.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w128.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_general.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo:  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+
+libec_shec_sse4.la: $(libec_shec_sse4_la_OBJECTS) $(libec_shec_sse4_la_DEPENDENCIES) $(EXTRA_libec_shec_sse4_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_shec_sse4_la_LINK) $(am_libec_shec_sse4_la_rpath) $(libec_shec_sse4_la_OBJECTS) $(libec_shec_sse4_la_LIBADD) $(LIBS)
 test/erasure-code/libec_test_jerasure_generic_la-TestJerasurePluginGeneric.lo:  \
 	test/erasure-code/$(am__dirstamp) \
 	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
@@ -10263,6 +11860,30 @@ test/erasure-code/libec_test_jerasure_sse4_la-TestJerasurePluginSSE4.lo:  \
 
 libec_test_jerasure_sse4.la: $(libec_test_jerasure_sse4_la_OBJECTS) $(libec_test_jerasure_sse4_la_DEPENDENCIES) $(EXTRA_libec_test_jerasure_sse4_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libec_test_jerasure_sse4_la_LINK) $(am_libec_test_jerasure_sse4_la_rpath) $(libec_test_jerasure_sse4_la_OBJECTS) $(libec_test_jerasure_sse4_la_LIBADD) $(LIBS)
+test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo:  \
+	test/erasure-code/$(am__dirstamp) \
+	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
+
+libec_test_shec_generic.la: $(libec_test_shec_generic_la_OBJECTS) $(libec_test_shec_generic_la_DEPENDENCIES) $(EXTRA_libec_test_shec_generic_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_test_shec_generic_la_LINK) $(am_libec_test_shec_generic_la_rpath) $(libec_test_shec_generic_la_OBJECTS) $(libec_test_shec_generic_la_LIBADD) $(LIBS)
+test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo:  \
+	test/erasure-code/$(am__dirstamp) \
+	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
+
+libec_test_shec_neon.la: $(libec_test_shec_neon_la_OBJECTS) $(libec_test_shec_neon_la_DEPENDENCIES) $(EXTRA_libec_test_shec_neon_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_test_shec_neon_la_LINK) $(am_libec_test_shec_neon_la_rpath) $(libec_test_shec_neon_la_OBJECTS) $(libec_test_shec_neon_la_LIBADD) $(LIBS)
+test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo:  \
+	test/erasure-code/$(am__dirstamp) \
+	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
+
+libec_test_shec_sse3.la: $(libec_test_shec_sse3_la_OBJECTS) $(libec_test_shec_sse3_la_DEPENDENCIES) $(EXTRA_libec_test_shec_sse3_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_test_shec_sse3_la_LINK) $(am_libec_test_shec_sse3_la_rpath) $(libec_test_shec_sse3_la_OBJECTS) $(libec_test_shec_sse3_la_LIBADD) $(LIBS)
+test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo:  \
+	test/erasure-code/$(am__dirstamp) \
+	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
+
+libec_test_shec_sse4.la: $(libec_test_shec_sse4_la_OBJECTS) $(libec_test_shec_sse4_la_DEPENDENCIES) $(EXTRA_libec_test_shec_sse4_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libec_test_shec_sse4_la_LINK) $(am_libec_test_shec_sse4_la_rpath) $(libec_test_shec_sse4_la_OBJECTS) $(libec_test_shec_sse4_la_LIBADD) $(LIBS)
 erasure-code/ErasureCodePlugin.lo: erasure-code/$(am__dirstamp) \
 	erasure-code/$(DEPDIR)/$(am__dirstamp)
 
@@ -10313,7 +11934,8 @@ log/SubsystemMap.lo: log/$(am__dirstamp) log/$(DEPDIR)/$(am__dirstamp)
 liblog.la: $(liblog_la_OBJECTS) $(liblog_la_DEPENDENCIES) $(EXTRA_liblog_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK)  $(liblog_la_OBJECTS) $(liblog_la_LIBADD) $(LIBS)
 mds/Capability.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
-mds/MDS.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
+mds/MDSDaemon.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
+mds/MDSRank.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/Beacon.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/locks.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/journal.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
@@ -10322,6 +11944,7 @@ mds/Mutation.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/MDCache.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/RecoveryQueue.lo: mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
+mds/StrayManager.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/Locker.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/Migrator.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/MDBalancer.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
@@ -10337,6 +11960,7 @@ mds/MDSTableClient.lo: mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/MDSTableServer.lo: mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
+mds/SimpleLock.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/SnapRealm.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/SnapServer.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/snap.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
@@ -10360,7 +11984,6 @@ mon/PGMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
 mon/LogMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
 mon/AuthMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
 mon/Elector.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/MonitorStore.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
 mon/HealthMonitor.lo: mon/$(am__dirstamp) \
 	mon/$(DEPDIR)/$(am__dirstamp)
 mon/DataHealthService.lo: mon/$(am__dirstamp) \
@@ -10442,6 +12065,14 @@ libmsg.la: $(libmsg_la_OBJECTS) $(libmsg_la_DEPENDENCIES) $(EXTRA_libmsg_la_DEPE
 	$(AM_V_CXXLD)$(CXXLINK)  $(libmsg_la_OBJECTS) $(libmsg_la_LIBADD) $(LIBS)
 os/libos_la-chain_xattr.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
+os/fs/$(am__dirstamp):
+	@$(MKDIR_P) os/fs
+	@: > os/fs/$(am__dirstamp)
+os/fs/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) os/fs/$(DEPDIR)
+	@: > os/fs/$(DEPDIR)/$(am__dirstamp)
+os/fs/libos_la-FS.lo: os/fs/$(am__dirstamp) \
+	os/fs/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-DBObjectMap.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-GenericObjectMap.lo: os/$(am__dirstamp) \
@@ -10450,8 +12081,6 @@ os/libos_la-FileJournal.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-FileStore.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-FlatIndex.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-GenericFileStoreBackend.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-HashIndex.lo: os/$(am__dirstamp) \
@@ -10478,6 +12107,16 @@ common/libos_la-TrackedOp.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-BtrfsFileStoreBackend.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
+os/newstore/$(am__dirstamp):
+	@$(MKDIR_P) os/newstore
+	@: > os/newstore/$(am__dirstamp)
+os/newstore/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) os/newstore/$(DEPDIR)
+	@: > os/newstore/$(DEPDIR)/$(am__dirstamp)
+os/newstore/libos_la-NewStore.lo: os/newstore/$(am__dirstamp) \
+	os/newstore/$(DEPDIR)/$(am__dirstamp)
+os/fs/libos_la-XFS.lo: os/fs/$(am__dirstamp) \
+	os/fs/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-XfsFileStoreBackend.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
 os/libos_la-ZFSFileStoreBackend.lo: os/$(am__dirstamp) \
@@ -10492,8 +12131,22 @@ os/libos_rocksdb_la-RocksDBStore.lo: os/$(am__dirstamp) \
 
 libos_rocksdb.la: $(libos_rocksdb_la_OBJECTS) $(libos_rocksdb_la_DEPENDENCIES) $(EXTRA_libos_rocksdb_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libos_rocksdb_la_LINK) $(am_libos_rocksdb_la_rpath) $(libos_rocksdb_la_OBJECTS) $(libos_rocksdb_la_LIBADD) $(LIBS)
+tracing/$(am__dirstamp):
+	@$(MKDIR_P) tracing
+	@: > tracing/$(am__dirstamp)
+tracing/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) tracing/$(DEPDIR)
+	@: > tracing/$(DEPDIR)/$(am__dirstamp)
+tracing/libos_tp_la-objectstore.lo: tracing/$(am__dirstamp) \
+	tracing/$(DEPDIR)/$(am__dirstamp)
+
+libos_tp.la: $(libos_tp_la_OBJECTS) $(libos_tp_la_DEPENDENCIES) $(EXTRA_libos_tp_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libos_tp_la_LINK) $(am_libos_tp_la_rpath) $(libos_tp_la_OBJECTS) $(libos_tp_la_LIBADD) $(LIBS)
 os/libos_types_la-Transaction.lo: os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
+os/newstore/libos_types_la-newstore_types.lo:  \
+	os/newstore/$(am__dirstamp) \
+	os/newstore/$(DEPDIR)/$(am__dirstamp)
 
 libos_types.la: $(libos_types_la_OBJECTS) $(libos_types_la_DEPENDENCIES) $(EXTRA_libos_types_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libos_types_la_LINK)  $(libos_types_la_OBJECTS) $(libos_types_la_LIBADD) $(LIBS)
@@ -10510,8 +12163,6 @@ osd/libosd_la-ECTransaction.lo: osd/$(am__dirstamp) \
 	osd/$(DEPDIR)/$(am__dirstamp)
 osd/libosd_la-PGBackend.lo: osd/$(am__dirstamp) \
 	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-Ager.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
 osd/libosd_la-HitSet.lo: osd/$(am__dirstamp) \
 	osd/$(DEPDIR)/$(am__dirstamp)
 osd/libosd_la-OSD.lo: osd/$(am__dirstamp) \
@@ -10539,6 +12190,15 @@ objclass/libosd_la-class_api.lo: objclass/$(am__dirstamp) \
 
 libosd.la: $(libosd_la_OBJECTS) $(libosd_la_DEPENDENCIES) $(EXTRA_libosd_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libosd_la_LINK) $(am_libosd_la_rpath) $(libosd_la_OBJECTS) $(libosd_la_LIBADD) $(LIBS)
+tracing/libosd_tp_la-oprequest.lo: tracing/$(am__dirstamp) \
+	tracing/$(DEPDIR)/$(am__dirstamp)
+tracing/libosd_tp_la-osd.lo: tracing/$(am__dirstamp) \
+	tracing/$(DEPDIR)/$(am__dirstamp)
+tracing/libosd_tp_la-pg.lo: tracing/$(am__dirstamp) \
+	tracing/$(DEPDIR)/$(am__dirstamp)
+
+libosd_tp.la: $(libosd_tp_la_OBJECTS) $(libosd_tp_la_DEPENDENCIES) $(EXTRA_libosd_tp_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libosd_tp_la_LINK) $(am_libosd_tp_la_rpath) $(libosd_tp_la_OBJECTS) $(libosd_tp_la_LIBADD) $(LIBS)
 osd/libosd_types_la-PGLog.lo: osd/$(am__dirstamp) \
 	osd/$(DEPDIR)/$(am__dirstamp)
 osd/libosd_types_la-osd_types.lo: osd/$(am__dirstamp) \
@@ -10593,8 +12253,6 @@ librados/librados_la-librados.lo: librados/$(am__dirstamp) \
 
 librados.la: $(librados_la_OBJECTS) $(librados_la_DEPENDENCIES) $(EXTRA_librados_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(librados_la_LINK) $(am_librados_la_rpath) $(librados_la_OBJECTS) $(librados_la_LIBADD) $(LIBS)
-common/buffer.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
 librados/librados.lo: librados/$(am__dirstamp) \
 	librados/$(DEPDIR)/$(am__dirstamp)
 
@@ -10641,6 +12299,11 @@ test/librados_test_stub/TestWatchNotify.lo:  \
 
 librados_test_stub.la: $(librados_test_stub_la_OBJECTS) $(librados_test_stub_la_DEPENDENCIES) $(EXTRA_librados_test_stub_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK) $(am_librados_test_stub_la_rpath) $(librados_test_stub_la_OBJECTS) $(librados_test_stub_la_LIBADD) $(LIBS)
+tracing/librados_tp_la-librados.lo: tracing/$(am__dirstamp) \
+	tracing/$(DEPDIR)/$(am__dirstamp)
+
+librados_tp.la: $(librados_tp_la_OBJECTS) $(librados_tp_la_DEPENDENCIES) $(EXTRA_librados_tp_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(librados_tp_la_LINK) $(am_librados_tp_la_rpath) $(librados_tp_la_OBJECTS) $(librados_tp_la_LIBADD) $(LIBS)
 libradosstriper/$(am__dirstamp):
 	@$(MKDIR_P) libradosstriper
 	@: > libradosstriper/$(am__dirstamp)
@@ -10719,6 +12382,8 @@ librbd/AsyncTrimRequest.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/CopyupRequest.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/DiffIterate.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ImageCtx.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ImageWatcher.lo: librbd/$(am__dirstamp) \
@@ -10729,6 +12394,8 @@ librbd/LibrbdWriteback.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ObjectMap.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/RebuildObjectMapRequest.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 
 librbd_internal.la: $(librbd_internal_la_OBJECTS) $(librbd_internal_la_DEPENDENCIES) $(EXTRA_librbd_internal_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK) $(am_librbd_internal_la_rpath) $(librbd_internal_la_OBJECTS) $(librbd_internal_la_LIBADD) $(LIBS)
@@ -10740,7 +12407,7 @@ rbd_replay/$(DEPDIR)/$(am__dirstamp):
 	@: > rbd_replay/$(DEPDIR)/$(am__dirstamp)
 rbd_replay/actions.lo: rbd_replay/$(am__dirstamp) \
 	rbd_replay/$(DEPDIR)/$(am__dirstamp)
-rbd_replay/Deser.lo: rbd_replay/$(am__dirstamp) \
+rbd_replay/BufferReader.lo: rbd_replay/$(am__dirstamp) \
 	rbd_replay/$(DEPDIR)/$(am__dirstamp)
 rbd_replay/ImageNameMap.lo: rbd_replay/$(am__dirstamp) \
 	rbd_replay/$(DEPDIR)/$(am__dirstamp)
@@ -10750,8 +12417,6 @@ rbd_replay/rbd_loc.lo: rbd_replay/$(am__dirstamp) \
 	rbd_replay/$(DEPDIR)/$(am__dirstamp)
 rbd_replay/Replayer.lo: rbd_replay/$(am__dirstamp) \
 	rbd_replay/$(DEPDIR)/$(am__dirstamp)
-rbd_replay/Ser.lo: rbd_replay/$(am__dirstamp) \
-	rbd_replay/$(DEPDIR)/$(am__dirstamp)
 
 librbd_replay.la: $(librbd_replay_la_OBJECTS) $(librbd_replay_la_DEPENDENCIES) $(EXTRA_librbd_replay_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK) $(am_librbd_replay_la_rpath) $(librbd_replay_la_OBJECTS) $(librbd_replay_la_LIBADD) $(LIBS)
@@ -10760,6 +12425,11 @@ rbd_replay/ios.lo: rbd_replay/$(am__dirstamp) \
 
 librbd_replay_ios.la: $(librbd_replay_ios_la_OBJECTS) $(librbd_replay_ios_la_DEPENDENCIES) $(EXTRA_librbd_replay_ios_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK) $(am_librbd_replay_ios_la_rpath) $(librbd_replay_ios_la_OBJECTS) $(librbd_replay_ios_la_LIBADD) $(LIBS)
+rbd_replay/ActionTypes.lo: rbd_replay/$(am__dirstamp) \
+	rbd_replay/$(DEPDIR)/$(am__dirstamp)
+
+librbd_replay_types.la: $(librbd_replay_types_la_OBJECTS) $(librbd_replay_types_la_DEPENDENCIES) $(EXTRA_librbd_replay_types_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(CXXLINK) $(am_librbd_replay_types_la_rpath) $(librbd_replay_types_la_OBJECTS) $(librbd_replay_types_la_LIBADD) $(LIBS)
 test/librbd/$(am__dirstamp):
 	@$(MKDIR_P) test/librbd
 	@: > test/librbd/$(am__dirstamp)
@@ -10787,6 +12457,11 @@ test/librbd/librbd_test_la-test_ObjectMap.lo:  \
 
 librbd_test.la: $(librbd_test_la_OBJECTS) $(librbd_test_la_DEPENDENCIES) $(EXTRA_librbd_test_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(librbd_test_la_LINK) $(am_librbd_test_la_rpath) $(librbd_test_la_OBJECTS) $(librbd_test_la_LIBADD) $(LIBS)
+tracing/librbd_tp_la-librbd.lo: tracing/$(am__dirstamp) \
+	tracing/$(DEPDIR)/$(am__dirstamp)
+
+librbd_tp.la: $(librbd_tp_la_OBJECTS) $(librbd_tp_la_DEPENDENCIES) $(EXTRA_librbd_tp_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(librbd_tp_la_LINK) $(am_librbd_tp_la_rpath) $(librbd_tp_la_OBJECTS) $(librbd_tp_la_LIBADD) $(LIBS)
 librbd/WatchNotifyTypes.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 
@@ -10860,6 +12535,8 @@ rgw/librgw_la-rgw_quota.lo: rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/librgw_la-rgw_dencoder.lo: rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/librgw_la-rgw_object_expirer_core.lo: rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
 
 librgw.la: $(librgw_la_OBJECTS) $(librgw_la_DEPENDENCIES) $(EXTRA_librgw_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(librgw_la_LINK) $(am_librgw_la_rpath) $(librgw_la_OBJECTS) $(librgw_la_LIBADD) $(LIBS)
@@ -11083,7 +12760,9 @@ test/encoding/ceph_dencoder-ceph_dencoder.$(OBJEXT):  \
 	test/encoding/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-Capability.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
-mds/ceph_dencoder-MDS.$(OBJEXT): mds/$(am__dirstamp) \
+mds/ceph_dencoder-MDSDaemon.$(OBJEXT): mds/$(am__dirstamp) \
+	mds/$(DEPDIR)/$(am__dirstamp)
+mds/ceph_dencoder-MDSRank.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-Beacon.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
@@ -11099,6 +12778,8 @@ mds/ceph_dencoder-MDCache.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-RecoveryQueue.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
+mds/ceph_dencoder-StrayManager.$(OBJEXT): mds/$(am__dirstamp) \
+	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-Locker.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-Migrator.$(OBJEXT): mds/$(am__dirstamp) \
@@ -11123,6 +12804,8 @@ mds/ceph_dencoder-MDSTableClient.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-MDSTableServer.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
+mds/ceph_dencoder-SimpleLock.$(OBJEXT): mds/$(am__dirstamp) \
+	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-SnapRealm.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-SnapServer.$(OBJEXT): mds/$(am__dirstamp) \
@@ -11183,6 +12866,8 @@ ceph-monstore-tool$(EXEEXT): $(ceph_monstore_tool_OBJECTS) $(ceph_monstore_tool_
 	$(AM_V_CXXLD)$(CXXLINK) $(ceph_monstore_tool_OBJECTS) $(ceph_monstore_tool_LDADD) $(LIBS)
 tools/ceph_objectstore_tool.$(OBJEXT): tools/$(am__dirstamp) \
 	tools/$(DEPDIR)/$(am__dirstamp)
+tools/RadosDump.$(OBJEXT): tools/$(am__dirstamp) \
+	tools/$(DEPDIR)/$(am__dirstamp)
 
 ceph-objectstore-tool$(EXEEXT): $(ceph_objectstore_tool_OBJECTS) $(ceph_objectstore_tool_DEPENDENCIES) $(EXTRA_ceph_objectstore_tool_DEPENDENCIES) 
 	@rm -f ceph-objectstore-tool$(EXEEXT)
@@ -11253,12 +12938,44 @@ test/multi_stress_watch.$(OBJEXT): test/$(am__dirstamp) \
 ceph_multi_stress_watch$(EXEEXT): $(ceph_multi_stress_watch_OBJECTS) $(ceph_multi_stress_watch_DEPENDENCIES) $(EXTRA_ceph_multi_stress_watch_DEPENDENCIES) 
 	@rm -f ceph_multi_stress_watch$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(ceph_multi_stress_watch_OBJECTS) $(ceph_multi_stress_watch_LDADD) $(LIBS)
+test/objectstore_bench.$(OBJEXT): test/$(am__dirstamp) \
+	test/$(DEPDIR)/$(am__dirstamp)
+
+ceph_objectstore_bench$(EXEEXT): $(ceph_objectstore_bench_OBJECTS) $(ceph_objectstore_bench_DEPENDENCIES) $(EXTRA_ceph_objectstore_bench_DEPENDENCIES) 
+	@rm -f ceph_objectstore_bench$(EXEEXT)
+	$(AM_V_CXXLD)$(CXXLINK) $(ceph_objectstore_bench_OBJECTS) $(ceph_objectstore_bench_LDADD) $(LIBS)
 test/omap_bench.$(OBJEXT): test/$(am__dirstamp) \
 	test/$(DEPDIR)/$(am__dirstamp)
 
 ceph_omapbench$(EXEEXT): $(ceph_omapbench_OBJECTS) $(ceph_omapbench_DEPENDENCIES) $(EXTRA_ceph_omapbench_DEPENDENCIES) 
 	@rm -f ceph_omapbench$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(ceph_omapbench_OBJECTS) $(ceph_omapbench_LDADD) $(LIBS)
+test/ceph_perf_local-perf_local.$(OBJEXT): test/$(am__dirstamp) \
+	test/$(DEPDIR)/$(am__dirstamp)
+test/ceph_perf_local-perf_helper.$(OBJEXT): test/$(am__dirstamp) \
+	test/$(DEPDIR)/$(am__dirstamp)
+
+ceph_perf_local$(EXEEXT): $(ceph_perf_local_OBJECTS) $(ceph_perf_local_DEPENDENCIES) $(EXTRA_ceph_perf_local_DEPENDENCIES) 
+	@rm -f ceph_perf_local$(EXEEXT)
+	$(AM_V_CXXLD)$(ceph_perf_local_LINK) $(ceph_perf_local_OBJECTS) $(ceph_perf_local_LDADD) $(LIBS)
+test/msgr/$(am__dirstamp):
+	@$(MKDIR_P) test/msgr
+	@: > test/msgr/$(am__dirstamp)
+test/msgr/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/msgr/$(DEPDIR)
+	@: > test/msgr/$(DEPDIR)/$(am__dirstamp)
+test/msgr/ceph_perf_msgr_client-perf_msgr_client.$(OBJEXT):  \
+	test/msgr/$(am__dirstamp) test/msgr/$(DEPDIR)/$(am__dirstamp)
+
+ceph_perf_msgr_client$(EXEEXT): $(ceph_perf_msgr_client_OBJECTS) $(ceph_perf_msgr_client_DEPENDENCIES) $(EXTRA_ceph_perf_msgr_client_DEPENDENCIES) 
+	@rm -f ceph_perf_msgr_client$(EXEEXT)
+	$(AM_V_CXXLD)$(ceph_perf_msgr_client_LINK) $(ceph_perf_msgr_client_OBJECTS) $(ceph_perf_msgr_client_LDADD) $(LIBS)
+test/msgr/ceph_perf_msgr_server-perf_msgr_server.$(OBJEXT):  \
+	test/msgr/$(am__dirstamp) test/msgr/$(DEPDIR)/$(am__dirstamp)
+
+ceph_perf_msgr_server$(EXEEXT): $(ceph_perf_msgr_server_OBJECTS) $(ceph_perf_msgr_server_DEPENDENCIES) $(EXTRA_ceph_perf_msgr_server_DEPENDENCIES) 
+	@rm -f ceph_perf_msgr_server$(EXEEXT)
+	$(AM_V_CXXLD)$(ceph_perf_msgr_server_LINK) $(ceph_perf_msgr_server_OBJECTS) $(ceph_perf_msgr_server_LDADD) $(LIBS)
 test/objectstore/$(am__dirstamp):
 	@$(MKDIR_P) test/objectstore
 	@: > test/objectstore/$(am__dirstamp)
@@ -11364,12 +13081,6 @@ test/streamtest.$(OBJEXT): test/$(am__dirstamp) \
 ceph_streamtest$(EXEEXT): $(ceph_streamtest_OBJECTS) $(ceph_streamtest_DEPENDENCIES) $(EXTRA_ceph_streamtest_DEPENDENCIES) 
 	@rm -f ceph_streamtest$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(ceph_streamtest_OBJECTS) $(ceph_streamtest_LDADD) $(LIBS)
-test/msgr/$(am__dirstamp):
-	@$(MKDIR_P) test/msgr
-	@: > test/msgr/$(am__dirstamp)
-test/msgr/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) test/msgr/$(DEPDIR)
-	@: > test/msgr/$(DEPDIR)/$(am__dirstamp)
 test/msgr/ceph_test_async_driver-test_async_driver.$(OBJEXT):  \
 	test/msgr/$(am__dirstamp) test/msgr/$(DEPDIR)/$(am__dirstamp)
 
@@ -11427,6 +13138,19 @@ test/cls_log/ceph_test_cls_log-test_cls_log.$(OBJEXT):  \
 ceph_test_cls_log$(EXEEXT): $(ceph_test_cls_log_OBJECTS) $(ceph_test_cls_log_DEPENDENCIES) $(EXTRA_ceph_test_cls_log_DEPENDENCIES) 
 	@rm -f ceph_test_cls_log$(EXEEXT)
 	$(AM_V_CXXLD)$(ceph_test_cls_log_LINK) $(ceph_test_cls_log_OBJECTS) $(ceph_test_cls_log_LDADD) $(LIBS)
+test/cls_numops/$(am__dirstamp):
+	@$(MKDIR_P) test/cls_numops
+	@: > test/cls_numops/$(am__dirstamp)
+test/cls_numops/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/cls_numops/$(DEPDIR)
+	@: > test/cls_numops/$(DEPDIR)/$(am__dirstamp)
+test/cls_numops/ceph_test_cls_numops-test_cls_numops.$(OBJEXT):  \
+	test/cls_numops/$(am__dirstamp) \
+	test/cls_numops/$(DEPDIR)/$(am__dirstamp)
+
+ceph_test_cls_numops$(EXEEXT): $(ceph_test_cls_numops_OBJECTS) $(ceph_test_cls_numops_DEPENDENCIES) $(EXTRA_ceph_test_cls_numops_DEPENDENCIES) 
+	@rm -f ceph_test_cls_numops$(EXEEXT)
+	$(AM_V_CXXLD)$(ceph_test_cls_numops_LINK) $(ceph_test_cls_numops_OBJECTS) $(ceph_test_cls_numops_LDADD) $(LIBS)
 test/cls_rbd/$(am__dirstamp):
 	@$(MKDIR_P) test/cls_rbd
 	@: > test/cls_rbd/$(am__dirstamp)
@@ -11606,6 +13330,13 @@ test/testkeys.$(OBJEXT): test/$(am__dirstamp) \
 ceph_test_keys$(EXEEXT): $(ceph_test_keys_OBJECTS) $(ceph_test_keys_DEPENDENCIES) $(EXTRA_ceph_test_keys_DEPENDENCIES) 
 	@rm -f ceph_test_keys$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(ceph_test_keys_OBJECTS) $(ceph_test_keys_LDADD) $(LIBS)
+test/objectstore/ceph_test_keyvaluedb-test_kv.$(OBJEXT):  \
+	test/objectstore/$(am__dirstamp) \
+	test/objectstore/$(DEPDIR)/$(am__dirstamp)
+
+ceph_test_keyvaluedb$(EXEEXT): $(ceph_test_keyvaluedb_OBJECTS) $(ceph_test_keyvaluedb_DEPENDENCIES) $(EXTRA_ceph_test_keyvaluedb_DEPENDENCIES) 
+	@rm -f ceph_test_keyvaluedb$(EXEEXT)
+	$(AM_V_CXXLD)$(ceph_test_keyvaluedb_LINK) $(ceph_test_keyvaluedb_OBJECTS) $(ceph_test_keyvaluedb_LDADD) $(LIBS)
 test/ObjectMap/$(am__dirstamp):
 	@$(MKDIR_P) test/ObjectMap
 	@: > test/ObjectMap/$(am__dirstamp)
@@ -11647,6 +13378,9 @@ test/libcephfs/ceph_test_libcephfs-caps.$(OBJEXT):  \
 test/libcephfs/ceph_test_libcephfs-multiclient.$(OBJEXT):  \
 	test/libcephfs/$(am__dirstamp) \
 	test/libcephfs/$(DEPDIR)/$(am__dirstamp)
+test/libcephfs/ceph_test_libcephfs-flock.$(OBJEXT):  \
+	test/libcephfs/$(am__dirstamp) \
+	test/libcephfs/$(DEPDIR)/$(am__dirstamp)
 
 ceph_test_libcephfs$(EXEEXT): $(ceph_test_libcephfs_OBJECTS) $(ceph_test_libcephfs_DEPENDENCIES) $(EXTRA_ceph_test_libcephfs_DEPENDENCIES) 
 	@rm -f ceph_test_libcephfs$(EXEEXT)
@@ -11953,6 +13687,12 @@ test/rgw/ceph_test_rgw_manifest-test_rgw_manifest.$(OBJEXT):  \
 ceph_test_rgw_manifest$(EXEEXT): $(ceph_test_rgw_manifest_OBJECTS) $(ceph_test_rgw_manifest_DEPENDENCIES) $(EXTRA_ceph_test_rgw_manifest_DEPENDENCIES) 
 	@rm -f ceph_test_rgw_manifest$(EXEEXT)
 	$(AM_V_CXXLD)$(ceph_test_rgw_manifest_LINK) $(ceph_test_rgw_manifest_OBJECTS) $(ceph_test_rgw_manifest_LDADD) $(LIBS)
+test/rgw/ceph_test_rgw_obj-test_rgw_obj.$(OBJEXT):  \
+	test/rgw/$(am__dirstamp) test/rgw/$(DEPDIR)/$(am__dirstamp)
+
+ceph_test_rgw_obj$(EXEEXT): $(ceph_test_rgw_obj_OBJECTS) $(ceph_test_rgw_obj_DEPENDENCIES) $(EXTRA_ceph_test_rgw_obj_DEPENDENCIES) 
+	@rm -f ceph_test_rgw_obj$(EXEEXT)
+	$(AM_V_CXXLD)$(ceph_test_rgw_obj_LINK) $(ceph_test_rgw_obj_OBJECTS) $(ceph_test_rgw_obj_LDADD) $(LIBS)
 test/TestSignalHandlers.$(OBJEXT): test/$(am__dirstamp) \
 	test/$(DEPDIR)/$(am__dirstamp)
 
@@ -12005,6 +13745,16 @@ tools/cephfs/$(am__dirstamp):
 tools/cephfs/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) tools/cephfs/$(DEPDIR)
 	@: > tools/cephfs/$(DEPDIR)/$(am__dirstamp)
+tools/cephfs/cephfs-data-scan.$(OBJEXT): tools/cephfs/$(am__dirstamp) \
+	tools/cephfs/$(DEPDIR)/$(am__dirstamp)
+tools/cephfs/DataScan.$(OBJEXT): tools/cephfs/$(am__dirstamp) \
+	tools/cephfs/$(DEPDIR)/$(am__dirstamp)
+tools/cephfs/MDSUtility.$(OBJEXT): tools/cephfs/$(am__dirstamp) \
+	tools/cephfs/$(DEPDIR)/$(am__dirstamp)
+
+cephfs-data-scan$(EXEEXT): $(cephfs_data_scan_OBJECTS) $(cephfs_data_scan_DEPENDENCIES) $(EXTRA_cephfs_data_scan_DEPENDENCIES) 
+	@rm -f cephfs-data-scan$(EXEEXT)
+	$(AM_V_CXXLD)$(CXXLINK) $(cephfs_data_scan_OBJECTS) $(cephfs_data_scan_LDADD) $(LIBS)
 tools/cephfs/cephfs-journal-tool.$(OBJEXT):  \
 	tools/cephfs/$(am__dirstamp) \
 	tools/cephfs/$(DEPDIR)/$(am__dirstamp)
@@ -12020,8 +13770,6 @@ tools/cephfs/Dumper.$(OBJEXT): tools/cephfs/$(am__dirstamp) \
 	tools/cephfs/$(DEPDIR)/$(am__dirstamp)
 tools/cephfs/Resetter.$(OBJEXT): tools/cephfs/$(am__dirstamp) \
 	tools/cephfs/$(DEPDIR)/$(am__dirstamp)
-tools/cephfs/MDSUtility.$(OBJEXT): tools/cephfs/$(am__dirstamp) \
-	tools/cephfs/$(DEPDIR)/$(am__dirstamp)
 
 cephfs-journal-tool$(EXEEXT): $(cephfs_journal_tool_OBJECTS) $(cephfs_journal_tool_DEPENDENCIES) $(EXTRA_cephfs_journal_tool_DEPENDENCIES) 
 	@rm -f cephfs-journal-tool$(EXEEXT)
@@ -12084,11 +13832,9 @@ tools/rados/$(DEPDIR)/$(am__dirstamp):
 	@: > tools/rados/$(DEPDIR)/$(am__dirstamp)
 tools/rados/rados.$(OBJEXT): tools/rados/$(am__dirstamp) \
 	tools/rados/$(DEPDIR)/$(am__dirstamp)
-tools/rados/rados_import.$(OBJEXT): tools/rados/$(am__dirstamp) \
-	tools/rados/$(DEPDIR)/$(am__dirstamp)
-tools/rados/rados_export.$(OBJEXT): tools/rados/$(am__dirstamp) \
+tools/rados/RadosImport.$(OBJEXT): tools/rados/$(am__dirstamp) \
 	tools/rados/$(DEPDIR)/$(am__dirstamp)
-tools/rados/rados_sync.$(OBJEXT): tools/rados/$(am__dirstamp) \
+tools/rados/PoolDump.$(OBJEXT): tools/rados/$(am__dirstamp) \
 	tools/rados/$(DEPDIR)/$(am__dirstamp)
 common/obj_bencher.$(OBJEXT): common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
@@ -12144,6 +13890,12 @@ rgw/rgw_orphan.$(OBJEXT): rgw/$(am__dirstamp) \
 radosgw-admin$(EXEEXT): $(radosgw_admin_OBJECTS) $(radosgw_admin_DEPENDENCIES) $(EXTRA_radosgw_admin_DEPENDENCIES) 
 	@rm -f radosgw-admin$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(radosgw_admin_OBJECTS) $(radosgw_admin_LDADD) $(LIBS)
+rgw/rgw_object_expirer.$(OBJEXT): rgw/$(am__dirstamp) \
+	rgw/$(DEPDIR)/$(am__dirstamp)
+
+radosgw-object-expirer$(EXEEXT): $(radosgw_object_expirer_OBJECTS) $(radosgw_object_expirer_DEPENDENCIES) $(EXTRA_radosgw_object_expirer_DEPENDENCIES) 
+	@rm -f radosgw-object-expirer$(EXEEXT)
+	$(AM_V_CXXLD)$(CXXLINK) $(radosgw_object_expirer_OBJECTS) $(radosgw_object_expirer_LDADD) $(LIBS)
 
 rbd$(EXEEXT): $(rbd_OBJECTS) $(rbd_DEPENDENCIES) $(EXTRA_rbd_DEPENDENCIES) 
 	@rm -f rbd$(EXEEXT)
@@ -12154,12 +13906,12 @@ rbd_fuse/$(am__dirstamp):
 rbd_fuse/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) rbd_fuse/$(DEPDIR)
 	@: > rbd_fuse/$(DEPDIR)/$(am__dirstamp)
-rbd_fuse/rbd-fuse.$(OBJEXT): rbd_fuse/$(am__dirstamp) \
+rbd_fuse/rbd_fuse-rbd-fuse.$(OBJEXT): rbd_fuse/$(am__dirstamp) \
 	rbd_fuse/$(DEPDIR)/$(am__dirstamp)
 
 rbd-fuse$(EXEEXT): $(rbd_fuse_OBJECTS) $(rbd_fuse_DEPENDENCIES) $(EXTRA_rbd_fuse_DEPENDENCIES) 
 	@rm -f rbd-fuse$(EXEEXT)
-	$(AM_V_CXXLD)$(CXXLINK) $(rbd_fuse_OBJECTS) $(rbd_fuse_LDADD) $(LIBS)
+	$(AM_V_CXXLD)$(rbd_fuse_LINK) $(rbd_fuse_OBJECTS) $(rbd_fuse_LDADD) $(LIBS)
 rbd_replay/rbd-replay.$(OBJEXT): rbd_replay/$(am__dirstamp) \
 	rbd_replay/$(DEPDIR)/$(am__dirstamp)
 
@@ -12172,14 +13924,6 @@ rbd_replay/rbd-replay-prep.$(OBJEXT): rbd_replay/$(am__dirstamp) \
 rbd-replay-prep$(EXEEXT): $(rbd_replay_prep_OBJECTS) $(rbd_replay_prep_DEPENDENCIES) $(EXTRA_rbd_replay_prep_DEPENDENCIES) 
 	@rm -f rbd-replay-prep$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(rbd_replay_prep_OBJECTS) $(rbd_replay_prep_LDADD) $(LIBS)
-tools/rest_bench-rest_bench.$(OBJEXT): tools/$(am__dirstamp) \
-	tools/$(DEPDIR)/$(am__dirstamp)
-common/rest_bench-obj_bencher.$(OBJEXT): common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
-
-rest-bench$(EXEEXT): $(rest_bench_OBJECTS) $(rest_bench_DEPENDENCIES) $(EXTRA_rest_bench_DEPENDENCIES) 
-	@rm -f rest-bench$(EXEEXT)
-	$(AM_V_CXXLD)$(rest_bench_LINK) $(rest_bench_OBJECTS) $(rest_bench_LDADD) $(LIBS)
 test/messenger/$(am__dirstamp):
 	@$(MKDIR_P) test/messenger
 	@: > test/messenger/$(am__dirstamp)
@@ -12224,6 +13968,8 @@ test_build_libcephfs$(EXEEXT): $(test_build_libcephfs_OBJECTS) $(test_build_libc
 	$(AM_V_CXXLD)$(test_build_libcephfs_LINK) $(test_build_libcephfs_OBJECTS) $(test_build_libcephfs_LDADD) $(LIBS)
 test/test_build_libcommon-buildtest_skeleton.$(OBJEXT):  \
 	test/$(am__dirstamp) test/$(DEPDIR)/$(am__dirstamp)
+common/test_build_libcommon-buffer.$(OBJEXT): common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
 
 test_build_libcommon$(EXEEXT): $(test_build_libcommon_OBJECTS) $(test_build_libcommon_DEPENDENCIES) $(EXTRA_test_build_libcommon_DEPENDENCIES) 
 	@rm -f test_build_libcommon$(EXEEXT)
@@ -12308,6 +14054,8 @@ rgw/test_build_librgw-rgw_quota.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
 rgw/test_build_librgw-rgw_dencoder.$(OBJEXT): rgw/$(am__dirstamp) \
 	rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/test_build_librgw-rgw_object_expirer_core.$(OBJEXT):  \
+	rgw/$(am__dirstamp) rgw/$(DEPDIR)/$(am__dirstamp)
 
 test_build_librgw$(EXEEXT): $(test_build_librgw_OBJECTS) $(test_build_librgw_DEPENDENCIES) $(EXTRA_test_build_librgw_DEPENDENCIES) 
 	@rm -f test_build_librgw$(EXEEXT)
@@ -12330,6 +14078,13 @@ test/unittest_arch-test_arch.$(OBJEXT): test/$(am__dirstamp) \
 unittest_arch$(EXEEXT): $(unittest_arch_OBJECTS) $(unittest_arch_DEPENDENCIES) $(EXTRA_unittest_arch_DEPENDENCIES) 
 	@rm -f unittest_arch$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_arch_LINK) $(unittest_arch_OBJECTS) $(unittest_arch_LDADD) $(LIBS)
+test/common/unittest_async_compressor-test_async_compressor.$(OBJEXT):  \
+	test/common/$(am__dirstamp) \
+	test/common/$(DEPDIR)/$(am__dirstamp)
+
+unittest_async_compressor$(EXEEXT): $(unittest_async_compressor_OBJECTS) $(unittest_async_compressor_DEPENDENCIES) $(EXTRA_unittest_async_compressor_DEPENDENCIES) 
+	@rm -f unittest_async_compressor$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_async_compressor_LINK) $(unittest_async_compressor_OBJECTS) $(unittest_async_compressor_LDADD) $(LIBS)
 test/unittest_base64-base64.$(OBJEXT): test/$(am__dirstamp) \
 	test/$(DEPDIR)/$(am__dirstamp)
 
@@ -12608,6 +14363,13 @@ test/erasure-code/unittest_erasure_code_plugin_lrc-TestErasureCodePluginLrc.$(OB
 unittest_erasure_code_plugin_lrc$(EXEEXT): $(unittest_erasure_code_plugin_lrc_OBJECTS) $(unittest_erasure_code_plugin_lrc_DEPENDENCIES) $(EXTRA_unittest_erasure_code_plugin_lrc_DEPENDENCIES) 
 	@rm -f unittest_erasure_code_plugin_lrc$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_erasure_code_plugin_lrc_LINK) $(unittest_erasure_code_plugin_lrc_OBJECTS) $(unittest_erasure_code_plugin_lrc_LDADD) $(LIBS)
+test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.$(OBJEXT):  \
+	test/erasure-code/$(am__dirstamp) \
+	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
+
+unittest_erasure_code_plugin_shec$(EXEEXT): $(unittest_erasure_code_plugin_shec_OBJECTS) $(unittest_erasure_code_plugin_shec_DEPENDENCIES) $(EXTRA_unittest_erasure_code_plugin_shec_DEPENDENCIES) 
+	@rm -f unittest_erasure_code_plugin_shec$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_erasure_code_plugin_shec_LINK) $(unittest_erasure_code_plugin_shec_OBJECTS) $(unittest_erasure_code_plugin_shec_LDADD) $(LIBS)
 test/erasure-code/unittest_erasure_code_shec-TestErasureCodeShec.$(OBJEXT):  \
 	test/erasure-code/$(am__dirstamp) \
 	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
@@ -12623,9 +14385,6 @@ erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShec.$(OBJEXT):  \
 erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShecTableCache.$(OBJEXT):  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/unittest_erasure_code_shec-shec.$(OBJEXT):  \
-	erasure-code/shec/$(am__dirstamp) \
-	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
 erasure-code/shec/unittest_erasure_code_shec-determinant.$(OBJEXT):  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
@@ -12696,9 +14455,6 @@ erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShec.$(OBJEXT):  \
 erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShecTableCache.$(OBJEXT):  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/unittest_erasure_code_shec_all-shec.$(OBJEXT):  \
-	erasure-code/shec/$(am__dirstamp) \
-	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
 erasure-code/shec/unittest_erasure_code_shec_all-determinant.$(OBJEXT):  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
@@ -12754,6 +14510,76 @@ erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_w8.$(OBJ
 unittest_erasure_code_shec_all$(EXEEXT): $(unittest_erasure_code_shec_all_OBJECTS) $(unittest_erasure_code_shec_all_DEPENDENCIES) $(EXTRA_unittest_erasure_code_shec_all_DEPENDENCIES) 
 	@rm -f unittest_erasure_code_shec_all$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_erasure_code_shec_all_LINK) $(unittest_erasure_code_shec_all_OBJECTS) $(unittest_erasure_code_shec_all_LDADD) $(LIBS)
+test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.$(OBJEXT):  \
+	test/erasure-code/$(am__dirstamp) \
+	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$(OBJEXT):  \
+	erasure-code/$(am__dirstamp) \
+	erasure-code/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.$(OBJEXT):  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.$(OBJEXT):  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.$(OBJEXT):  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.$(OBJEXT):  \
+	erasure-code/shec/$(am__dirstamp) \
+	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.$(OBJEXT):  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.$(OBJEXT):  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.$(OBJEXT):  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.$(OBJEXT):  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.$(OBJEXT):  \
+	erasure-code/jerasure/jerasure/src/$(am__dirstamp) \
+	erasure-code/jerasure/jerasure/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.$(OBJEXT):  \
+	erasure-code/jerasure/gf-complete/src/$(am__dirstamp) \
+	erasure-code/jerasure/gf-complete/src/$(DEPDIR)/$(am__dirstamp)
+
+unittest_erasure_code_shec_arguments$(EXEEXT): $(unittest_erasure_code_shec_arguments_OBJECTS) $(unittest_erasure_code_shec_arguments_DEPENDENCIES) $(EXTRA_unittest_erasure_code_shec_arguments_DEPENDENCIES) 
+	@rm -f unittest_erasure_code_shec_arguments$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_erasure_code_shec_arguments_LINK) $(unittest_erasure_code_shec_arguments_OBJECTS) $(unittest_erasure_code_shec_arguments_LDADD) $(LIBS)
 test/erasure-code/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.$(OBJEXT):  \
 	test/erasure-code/$(am__dirstamp) \
 	test/erasure-code/$(DEPDIR)/$(am__dirstamp)
@@ -12769,9 +14595,6 @@ erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShec.$(OBJEXT):
 erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShecTableCache.$(OBJEXT):  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
-erasure-code/shec/unittest_erasure_code_shec_thread-shec.$(OBJEXT):  \
-	erasure-code/shec/$(am__dirstamp) \
-	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
 erasure-code/shec/unittest_erasure_code_shec_thread-determinant.$(OBJEXT):  \
 	erasure-code/shec/$(am__dirstamp) \
 	erasure-code/shec/$(DEPDIR)/$(am__dirstamp)
@@ -12833,18 +14656,6 @@ test/unittest_escape-escape.$(OBJEXT): test/$(am__dirstamp) \
 unittest_escape$(EXEEXT): $(unittest_escape_OBJECTS) $(unittest_escape_DEPENDENCIES) $(EXTRA_unittest_escape_DEPENDENCIES) 
 	@rm -f unittest_escape$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_escape_LINK) $(unittest_escape_OBJECTS) $(unittest_escape_LDADD) $(LIBS)
-test/os/$(am__dirstamp):
-	@$(MKDIR_P) test/os
-	@: > test/os/$(am__dirstamp)
-test/os/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) test/os/$(DEPDIR)
-	@: > test/os/$(DEPDIR)/$(am__dirstamp)
-test/os/unittest_flatindex-TestFlatIndex.$(OBJEXT):  \
-	test/os/$(am__dirstamp) test/os/$(DEPDIR)/$(am__dirstamp)
-
-unittest_flatindex$(EXEEXT): $(unittest_flatindex_OBJECTS) $(unittest_flatindex_DEPENDENCIES) $(EXTRA_unittest_flatindex_DEPENDENCIES) 
-	@rm -f unittest_flatindex$(EXEEXT)
-	$(AM_V_CXXLD)$(unittest_flatindex_LINK) $(unittest_flatindex_OBJECTS) $(unittest_flatindex_LDADD) $(LIBS)
 test/unittest_formatter-formatter.$(OBJEXT): test/$(am__dirstamp) \
 	test/$(DEPDIR)/$(am__dirstamp)
 rgw/unittest_formatter-rgw_formats.$(OBJEXT): rgw/$(am__dirstamp) \
@@ -12891,6 +14702,12 @@ test/unittest_ipaddr-test_ipaddr.$(OBJEXT): test/$(am__dirstamp) \
 unittest_ipaddr$(EXEEXT): $(unittest_ipaddr_OBJECTS) $(unittest_ipaddr_DEPENDENCIES) $(EXTRA_unittest_ipaddr_DEPENDENCIES) 
 	@rm -f unittest_ipaddr$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_ipaddr_LINK) $(unittest_ipaddr_OBJECTS) $(unittest_ipaddr_LDADD) $(LIBS)
+test/os/$(am__dirstamp):
+	@$(MKDIR_P) test/os
+	@: > test/os/$(am__dirstamp)
+test/os/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/os/$(DEPDIR)
+	@: > test/os/$(DEPDIR)/$(am__dirstamp)
 test/os/unittest_lfnindex-TestLFNIndex.$(OBJEXT):  \
 	test/os/$(am__dirstamp) test/os/$(DEPDIR)/$(am__dirstamp)
 
@@ -12920,6 +14737,9 @@ unittest_librados_config$(EXEEXT): $(unittest_librados_config_OBJECTS) $(unittes
 test/librbd/unittest_librbd-test_main.$(OBJEXT):  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
+test/librbd/unittest_librbd-test_mock_fixture.$(OBJEXT):  \
+	test/librbd/$(am__dirstamp) \
+	test/librbd/$(DEPDIR)/$(am__dirstamp)
 
 unittest_librbd$(EXEEXT): $(unittest_librbd_OBJECTS) $(unittest_librbd_DEPENDENCIES) $(EXTRA_unittest_librbd_DEPENDENCIES) 
 	@rm -f unittest_librbd$(EXEEXT)
@@ -13009,6 +14829,12 @@ test/osd/unittest_osdscrub-TestOSDScrub.$(OBJEXT):  \
 unittest_osdscrub$(EXEEXT): $(unittest_osdscrub_OBJECTS) $(unittest_osdscrub_DEPENDENCIES) $(EXTRA_unittest_osdscrub_DEPENDENCIES) 
 	@rm -f unittest_osdscrub$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_osdscrub_LINK) $(unittest_osdscrub_OBJECTS) $(unittest_osdscrub_LDADD) $(LIBS)
+test/unittest_pageset-test_pageset.$(OBJEXT): test/$(am__dirstamp) \
+	test/$(DEPDIR)/$(am__dirstamp)
+
+unittest_pageset$(EXEEXT): $(unittest_pageset_OBJECTS) $(unittest_pageset_DEPENDENCIES) $(EXTRA_unittest_pageset_DEPENDENCIES) 
+	@rm -f unittest_pageset$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_pageset_LINK) $(unittest_pageset_OBJECTS) $(unittest_pageset_LDADD) $(LIBS)
 test/unittest_perf_counters-perf_counters.$(OBJEXT):  \
 	test/$(am__dirstamp) test/$(DEPDIR)/$(am__dirstamp)
 
@@ -13027,6 +14853,13 @@ test/unittest_prebufferedstreambuf-test_prebufferedstreambuf.$(OBJEXT):  \
 unittest_prebufferedstreambuf$(EXEEXT): $(unittest_prebufferedstreambuf_OBJECTS) $(unittest_prebufferedstreambuf_DEPENDENCIES) $(EXTRA_unittest_prebufferedstreambuf_DEPENDENCIES) 
 	@rm -f unittest_prebufferedstreambuf$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_prebufferedstreambuf_LINK) $(unittest_prebufferedstreambuf_OBJECTS) $(unittest_prebufferedstreambuf_LDADD) $(LIBS)
+test/common/unittest_prioritized_queue-test_prioritized_queue.$(OBJEXT):  \
+	test/common/$(am__dirstamp) \
+	test/common/$(DEPDIR)/$(am__dirstamp)
+
+unittest_prioritized_queue$(EXEEXT): $(unittest_prioritized_queue_OBJECTS) $(unittest_prioritized_queue_DEPENDENCIES) $(EXTRA_unittest_prioritized_queue_DEPENDENCIES) 
+	@rm -f unittest_prioritized_queue$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_prioritized_queue_LINK) $(unittest_prioritized_queue_OBJECTS) $(unittest_prioritized_queue_LDADD) $(LIBS)
 test/unittest_rbd_replay-test_rbd_replay.$(OBJEXT):  \
 	test/$(am__dirstamp) test/$(DEPDIR)/$(am__dirstamp)
 
@@ -13040,6 +14873,20 @@ test/common/unittest_readahead-Readahead.$(OBJEXT):  \
 unittest_readahead$(EXEEXT): $(unittest_readahead_OBJECTS) $(unittest_readahead_DEPENDENCIES) $(EXTRA_unittest_readahead_DEPENDENCIES) 
 	@rm -f unittest_readahead$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_readahead_LINK) $(unittest_readahead_OBJECTS) $(unittest_readahead_LDADD) $(LIBS)
+test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.$(OBJEXT):  \
+	test/objectstore/$(am__dirstamp) \
+	test/objectstore/$(DEPDIR)/$(am__dirstamp)
+
+unittest_rocksdb_option$(EXEEXT): $(unittest_rocksdb_option_OBJECTS) $(unittest_rocksdb_option_DEPENDENCIES) $(EXTRA_unittest_rocksdb_option_DEPENDENCIES) 
+	@rm -f unittest_rocksdb_option$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_rocksdb_option_LINK) $(unittest_rocksdb_option_OBJECTS) $(unittest_rocksdb_option_LDADD) $(LIBS)
+test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.$(OBJEXT):  \
+	test/objectstore/$(am__dirstamp) \
+	test/objectstore/$(DEPDIR)/$(am__dirstamp)
+
+unittest_rocksdb_option_static$(EXEEXT): $(unittest_rocksdb_option_static_OBJECTS) $(unittest_rocksdb_option_static_DEPENDENCIES) $(EXTRA_unittest_rocksdb_option_static_DEPENDENCIES) 
+	@rm -f unittest_rocksdb_option_static$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_rocksdb_option_static_LINK) $(unittest_rocksdb_option_static_OBJECTS) $(unittest_rocksdb_option_static_LDADD) $(LIBS)
 test/unittest_run_cmd-run_cmd.$(OBJEXT): test/$(am__dirstamp) \
 	test/$(DEPDIR)/$(am__dirstamp)
 
@@ -13111,6 +14958,12 @@ test/unittest_strtol-strtol.$(OBJEXT): test/$(am__dirstamp) \
 unittest_strtol$(EXEEXT): $(unittest_strtol_OBJECTS) $(unittest_strtol_DEPENDENCIES) $(EXTRA_unittest_strtol_DEPENDENCIES) 
 	@rm -f unittest_strtol$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_strtol_LINK) $(unittest_strtol_OBJECTS) $(unittest_strtol_LDADD) $(LIBS)
+test/unittest_subprocess-test_subprocess.$(OBJEXT):  \
+	test/$(am__dirstamp) test/$(DEPDIR)/$(am__dirstamp)
+
+unittest_subprocess$(EXEEXT): $(unittest_subprocess_OBJECTS) $(unittest_subprocess_DEPENDENCIES) $(EXTRA_unittest_subprocess_DEPENDENCIES) 
+	@rm -f unittest_subprocess$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_subprocess_LINK) $(unittest_subprocess_OBJECTS) $(unittest_subprocess_LDADD) $(LIBS)
 test/common/unittest_tableformatter-test_tableformatter.$(OBJEXT):  \
 	test/common/$(am__dirstamp) \
 	test/common/$(DEPDIR)/$(am__dirstamp)
@@ -13150,6 +15003,12 @@ test/unittest_workqueue-test_workqueue.$(OBJEXT):  \
 unittest_workqueue$(EXEEXT): $(unittest_workqueue_OBJECTS) $(unittest_workqueue_DEPENDENCIES) $(EXTRA_unittest_workqueue_DEPENDENCIES) 
 	@rm -f unittest_workqueue$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_workqueue_LINK) $(unittest_workqueue_OBJECTS) $(unittest_workqueue_LDADD) $(LIBS)
+test/unittest_xlist-test_xlist.$(OBJEXT): test/$(am__dirstamp) \
+	test/$(DEPDIR)/$(am__dirstamp)
+
+unittest_xlist$(EXEEXT): $(unittest_xlist_OBJECTS) $(unittest_xlist_DEPENDENCIES) $(EXTRA_unittest_xlist_DEPENDENCIES) 
+	@rm -f unittest_xlist$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_xlist_LINK) $(unittest_xlist_OBJECTS) $(unittest_xlist_LDADD) $(LIBS)
 test/messenger/xio_client-xio_client.$(OBJEXT):  \
 	test/messenger/$(am__dirstamp) \
 	test/messenger/$(DEPDIR)/$(am__dirstamp)
@@ -13240,6 +15099,41 @@ uninstall-ceph_libexecSCRIPTS:
 	files=`for p in $$list; do echo "$$p"; done | \
 	       sed -e 's,.*/,,;$(transform)'`; \
 	dir='$(DESTDIR)$(ceph_libexecdir)'; $(am__uninstall_files_from_dir)
+install-ceph_monstore_update_crushSCRIPTS: $(ceph_monstore_update_crush_SCRIPTS)
+	@$(NORMAL_INSTALL)
+	@list='$(ceph_monstore_update_crush_SCRIPTS)'; test -n "$(ceph_monstore_update_crushdir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(ceph_monstore_update_crushdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(ceph_monstore_update_crushdir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  if test -f "$$d$$p"; then echo "$$d$$p"; echo "$$p"; else :; fi; \
+	done | \
+	sed -e 'p;s,.*/,,;n' \
+	    -e 'h;s|.*|.|' \
+	    -e 'p;x;s,.*/,,;$(transform)' | sed 'N;N;N;s,\n, ,g' | \
+	$(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1; } \
+	  { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \
+	    if ($$2 == $$4) { files[d] = files[d] " " $$1; \
+	      if (++n[d] == $(am__install_max)) { \
+		print "f", d, files[d]; n[d] = 0; files[d] = "" } } \
+	    else { print "f", d "/" $$4, $$1 } } \
+	  END { for (d in files) print "f", d, files[d] }' | \
+	while read type dir files; do \
+	     if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \
+	     test -z "$$files" || { \
+	       echo " $(INSTALL_SCRIPT) $$files '$(DESTDIR)$(ceph_monstore_update_crushdir)$$dir'"; \
+	       $(INSTALL_SCRIPT) $$files "$(DESTDIR)$(ceph_monstore_update_crushdir)$$dir" || exit $$?; \
+	     } \
+	; done
+
+uninstall-ceph_monstore_update_crushSCRIPTS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(ceph_monstore_update_crush_SCRIPTS)'; test -n "$(ceph_monstore_update_crushdir)" || exit 0; \
+	files=`for p in $$list; do echo "$$p"; done | \
+	       sed -e 's,.*/,,;$(transform)'`; \
+	dir='$(DESTDIR)$(ceph_monstore_update_crushdir)'; $(am__uninstall_files_from_dir)
 install-ceph_sbinSCRIPTS: $(ceph_sbin_SCRIPTS)
 	@$(NORMAL_INSTALL)
 	@list='$(ceph_sbin_SCRIPTS)'; test -n "$(ceph_sbindir)" || list=; \
@@ -13432,12 +15326,16 @@ mostlyclean-compile:
 	-rm -f civetweb/src/*.lo
 	-rm -f client/*.$(OBJEXT)
 	-rm -f client/*.lo
+	-rm -f cls/cephfs/*.$(OBJEXT)
+	-rm -f cls/cephfs/*.lo
 	-rm -f cls/hello/*.$(OBJEXT)
 	-rm -f cls/hello/*.lo
 	-rm -f cls/lock/*.$(OBJEXT)
 	-rm -f cls/lock/*.lo
 	-rm -f cls/log/*.$(OBJEXT)
 	-rm -f cls/log/*.lo
+	-rm -f cls/numops/*.$(OBJEXT)
+	-rm -f cls/numops/*.lo
 	-rm -f cls/rbd/*.$(OBJEXT)
 	-rm -f cls/rbd/*.lo
 	-rm -f cls/refcount/*.$(OBJEXT)
@@ -13448,12 +15346,16 @@ mostlyclean-compile:
 	-rm -f cls/rgw/*.lo
 	-rm -f cls/statelog/*.$(OBJEXT)
 	-rm -f cls/statelog/*.lo
+	-rm -f cls/timeindex/*.$(OBJEXT)
+	-rm -f cls/timeindex/*.lo
 	-rm -f cls/user/*.$(OBJEXT)
 	-rm -f cls/user/*.lo
 	-rm -f cls/version/*.$(OBJEXT)
 	-rm -f cls/version/*.lo
 	-rm -f common/*.$(OBJEXT)
 	-rm -f common/*.lo
+	-rm -f compressor/*.$(OBJEXT)
+	-rm -f compressor/*.lo
 	-rm -f crush/*.$(OBJEXT)
 	-rm -f crush/*.lo
 	-rm -f erasure-code/*.$(OBJEXT)
@@ -13507,6 +15409,10 @@ mostlyclean-compile:
 	-rm -f objclass/*.lo
 	-rm -f os/*.$(OBJEXT)
 	-rm -f os/*.lo
+	-rm -f os/fs/*.$(OBJEXT)
+	-rm -f os/fs/*.lo
+	-rm -f os/newstore/*.$(OBJEXT)
+	-rm -f os/newstore/*.lo
 	-rm -f osd/*.$(OBJEXT)
 	-rm -f osd/*.lo
 	-rm -f osdc/*.$(OBJEXT)
@@ -13524,6 +15430,7 @@ mostlyclean-compile:
 	-rm -f test/cls_hello/*.$(OBJEXT)
 	-rm -f test/cls_lock/*.$(OBJEXT)
 	-rm -f test/cls_log/*.$(OBJEXT)
+	-rm -f test/cls_numops/*.$(OBJEXT)
 	-rm -f test/cls_rbd/*.$(OBJEXT)
 	-rm -f test/cls_refcount/*.$(OBJEXT)
 	-rm -f test/cls_replica_log/*.$(OBJEXT)
@@ -13560,6 +15467,8 @@ mostlyclean-compile:
 	-rm -f tools/*.$(OBJEXT)
 	-rm -f tools/cephfs/*.$(OBJEXT)
 	-rm -f tools/rados/*.$(OBJEXT)
+	-rm -f tracing/*.$(OBJEXT)
+	-rm -f tracing/*.lo
 
 distclean-compile:
 	-rm -f *.tab.c
@@ -13603,8 +15512,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at client/$(DEPDIR)/MetaSession.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at client/$(DEPDIR)/SyntheticClient.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at client/$(DEPDIR)/Trace.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at client/$(DEPDIR)/fuse_ll.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at client/$(DEPDIR)/libclient_fuse_la-fuse_ll.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at client/$(DEPDIR)/test_ioctls.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at cls/cephfs/$(DEPDIR)/cls_cephfs.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at cls/cephfs/$(DEPDIR)/cls_cephfs_client.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/hello/$(DEPDIR)/cls_hello.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/lock/$(DEPDIR)/cls_lock.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/lock/$(DEPDIR)/cls_lock_client.Plo at am__quote@
@@ -13612,6 +15523,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at cls/lock/$(DEPDIR)/cls_lock_types.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/log/$(DEPDIR)/cls_log.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/log/$(DEPDIR)/cls_log_client.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at cls/numops/$(DEPDIR)/cls_numops.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at cls/numops/$(DEPDIR)/cls_numops_client.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/rbd/$(DEPDIR)/cls_rbd.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/rbd/$(DEPDIR)/cls_rbd_client.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/refcount/$(DEPDIR)/cls_refcount.Plo at am__quote@
@@ -13627,6 +15540,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at cls/rgw/$(DEPDIR)/cls_rgw_types.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/statelog/$(DEPDIR)/cls_statelog.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/statelog/$(DEPDIR)/cls_statelog_client.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at cls/timeindex/$(DEPDIR)/cls_timeindex.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at cls/timeindex/$(DEPDIR)/cls_timeindex_client.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/user/$(DEPDIR)/cls_user.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/user/$(DEPDIR)/cls_user_client.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at cls/user/$(DEPDIR)/cls_user_ops.Po at am__quote@
@@ -13656,6 +15571,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/Thread.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/Throttle.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/Timer.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/TracepointProvider.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/TrackedOp.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/WorkQueue.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/addr_parsing.Plo at am__quote@
@@ -13692,7 +15608,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/hobject.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/io_priority.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/ipaddr.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/libcommon_api_la-buffer.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/libcommon_crc_aarch64_la-crc32c_aarch64.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/libcommon_crc_la-crc32c.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/libcommon_crc_la-crc32c_intel_baseline.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/libcommon_crc_la-crc32c_intel_fast.Plo at am__quote@
@@ -13712,7 +15628,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/perf_counters.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/pick_address.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/pipe.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/rest_bench-obj_bencher.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/run_cmd.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/safe_io.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/secret.Plo at am__quote@
@@ -13722,12 +15637,15 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/str_list.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/str_map.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/strtol.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-buffer.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_librados-buffer.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/types.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/utf8.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/util.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/version.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/xattr.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at compressor/$(DEPDIR)/AsyncCompressor.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at compressor/$(DEPDIR)/Compressor.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at crush/$(DEPDIR)/CrushCompiler.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at crush/$(DEPDIR)/CrushTester.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at crush/$(DEPDIR)/CrushWrapper.Plo at am__quote@
@@ -13737,13 +15655,17 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at crush/$(DEPDIR)/mapper.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/ErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/ErasureCodePlugin.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_example_la-ErasureCode.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_isa_la-ErasureCode.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_jerasure_generic_la-ErasureCode.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_jerasure_neon_la-ErasureCode.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_jerasure_sse3_la-ErasureCode.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_jerasure_sse4_la-ErasureCode.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_lrc_la-ErasureCode.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_shec_la-ErasureCode.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_shec_generic_la-ErasureCode.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_shec_neon_la-ErasureCode.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_shec_sse3_la-ErasureCode.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/libec_shec_sse4_la-ErasureCode.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code-ErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code_example-ErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code_isa-ErasureCode.Po at am__quote@
@@ -13753,6 +15675,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_isa-ErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code_shec-ErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code_shec_all-ErasureCode.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/$(DEPDIR)/unittest_erasure_code_shec_thread-ErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/isa/$(DEPDIR)/libec_isa_la-ErasureCodeIsa.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/isa/$(DEPDIR)/libec_isa_la-ErasureCodeIsaTableCache.Plo at am__quote@
@@ -13815,17 +15738,50 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_jerasure_sse4_la-gf_w64.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_jerasure_sse4_la-gf_w8.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_jerasure_sse4_la-gf_wgen.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_general.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_method.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_rand.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w128.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w16.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w32.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w4.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w64.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w8.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_wgen.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_general.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_method.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_rand.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w128.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w16.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w32.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w4.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w64.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w8.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_wgen.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_general.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_method.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_rand.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w128.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w16.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w32.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w4.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w64.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w8.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_wgen.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_general.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_method.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_rand.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w128.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w16.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w32.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w4.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w64.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w8.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_wgen.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_general.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_method.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_rand.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w128.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w16.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w32.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w4.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w64.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w8.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_wgen.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_jerasure-gf.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_jerasure-gf_general.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_jerasure-gf_method.Po at am__quote@
@@ -13859,6 +15815,17 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_all-gf_w64.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_all-gf_w8.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_all-gf_wgen.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_general.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_method.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_rand.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w128.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w16.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w32.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w4.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w64.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w8.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_wgen.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_thread-gf.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_thread-gf_general.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_thread-gf_method.Po at am__quote@
@@ -13875,6 +15842,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_jerasure_neon_la-gf_w4_neon.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_jerasure_neon_la-gf_w64_neon.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_jerasure_neon_la-gf_w8_neon.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w16_neon.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w32_neon.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w4_neon.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w64_neon.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w8_neon.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_jerasure_generic_la-cauchy.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_jerasure_generic_la-galois.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_jerasure_generic_la-jerasure.Plo at am__quote@
@@ -13895,11 +15867,26 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_jerasure_sse4_la-jerasure.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_jerasure_sse4_la-liberation.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_jerasure_sse4_la-reed_sol.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-cauchy.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-galois.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-jerasure.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-liberation.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-reed_sol.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-cauchy.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-galois.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-jerasure.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-liberation.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-reed_sol.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-cauchy.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-galois.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-jerasure.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-liberation.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-reed_sol.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-cauchy.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-galois.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-jerasure.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-liberation.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-reed_sol.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-cauchy.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-galois.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-jerasure.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-liberation.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-reed_sol.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_jerasure-cauchy.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_jerasure-galois.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_jerasure-jerasure.Po at am__quote@
@@ -13915,6 +15902,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_all-jerasure.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_all-liberation.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_all-reed_sol.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-cauchy.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-galois.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-jerasure.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-liberation.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-reed_sol.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_thread-cauchy.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_thread-galois.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_thread-jerasure.Po at am__quote@
@@ -13924,26 +15916,39 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/lrc/$(DEPDIR)/libec_lrc_la-ErasureCodePluginLrc.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/lrc/$(DEPDIR)/unittest_erasure_code_lrc-ErasureCodeLrc.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/lrc/$(DEPDIR)/unittest_erasure_code_lrc-ErasureCodePluginLrc.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginShec.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShec.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShecTableCache.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_la-determinant.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_la-shec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodePluginShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShecTableCache.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-determinant.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginSelectShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodePluginShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShecTableCache.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-determinant.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodePluginShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShecTableCache.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-determinant.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodePluginShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShec.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShecTableCache.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-determinant.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-ErasureCodePluginShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-ErasureCodeShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-ErasureCodeShecTableCache.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-determinant.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-shec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-ErasureCodePluginShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-ErasureCodeShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-ErasureCodeShecTableCache.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-determinant.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-shec.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShec.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-determinant.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-ErasureCodePluginShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-ErasureCodeShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-ErasureCodeShecTableCache.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-determinant.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-shec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at global/$(DEPDIR)/global_context.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at global/$(DEPDIR)/global_init.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at global/$(DEPDIR)/pidfile.Plo at am__quote@
@@ -13973,10 +15978,12 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncResizeRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/AsyncTrimRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/CopyupRequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/DiffIterate.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ImageCtx.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ImageWatcher.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/LibrbdWriteback.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/ObjectMap.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/RebuildObjectMapRequest.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/WatchNotifyTypes.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/internal.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at librbd/$(DEPDIR)/librbd.Plo at am__quote@
@@ -13996,10 +16003,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDBalancer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDCache.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDLog.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDS.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSAuthCaps.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSContext.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSDaemon.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSMap.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSRank.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSTable.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSTableClient.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/MDSTableServer.Plo at am__quote@
@@ -14008,8 +16016,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/RecoveryQueue.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/Server.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/SessionMap.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/SimpleLock.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/SnapRealm.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/SnapServer.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/StrayManager.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-Beacon.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-CDentry.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-CDir.Po at am__quote@
@@ -14022,9 +16032,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDBalancer.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDCache.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDLog.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDS.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDSAuthCaps.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDSContext.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDSDaemon.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDSRank.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDSTable.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDSTableClient.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-MDSTableServer.Po at am__quote@
@@ -14033,8 +16044,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-RecoveryQueue.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-Server.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-SessionMap.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-SnapRealm.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-SnapServer.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-StrayManager.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-journal.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-locks.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mds/$(DEPDIR)/ceph_dencoder-snap.Po at am__quote@
@@ -14055,7 +16068,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at mon/$(DEPDIR)/MonClient.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mon/$(DEPDIR)/MonMap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mon/$(DEPDIR)/Monitor.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at mon/$(DEPDIR)/MonitorStore.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mon/$(DEPDIR)/MonmapMonitor.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mon/$(DEPDIR)/OSDMonitor.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at mon/$(DEPDIR)/PGMap.Plo at am__quote@
@@ -14089,7 +16101,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_la-DBObjectMap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_la-FileJournal.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_la-FileStore.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_la-FlatIndex.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_la-GenericObjectMap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_la-HashIndex.Plo at am__quote@
@@ -14109,11 +16120,14 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_rocksdb_la-RocksDBStore.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_types_la-Transaction.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at os/$(DEPDIR)/libos_zfs_a-ZFS.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at os/fs/$(DEPDIR)/libos_la-FS.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at os/fs/$(DEPDIR)/libos_la-XFS.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at os/newstore/$(DEPDIR)/libos_la-NewStore.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/ECMsgTypes.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/HitSet.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/OSDMap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/ceph_test_rados_api_tier-HitSet.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/libosd_la-Ager.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/libosd_la-ClassHandler.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/libosd_la-ECBackend.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at osd/$(DEPDIR)/libosd_la-ECMsgTypes.Plo at am__quote@
@@ -14148,12 +16162,12 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at perfglue/$(DEPDIR)/disabled_heap_profiler.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at perfglue/$(DEPDIR)/disabled_stubs.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at perfglue/$(DEPDIR)/heap_profiler.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at rbd_fuse/$(DEPDIR)/rbd-fuse.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/Deser.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rbd_fuse/$(DEPDIR)/rbd_fuse-rbd-fuse.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/ActionTypes.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/BufferReader.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/ImageNameMap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/PendingIO.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/Replayer.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/Ser.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/actions.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/ios.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rbd_replay/$(DEPDIR)/rbd-replay-prep.Po at am__quote@
@@ -14189,6 +16203,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_metadata.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_multi.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_multi_del.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_object_expirer_core.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_op.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_policy_s3.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/librgw_la-rgw_quota.Plo at am__quote@
@@ -14209,6 +16224,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_loadgen.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_main.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_multiparser.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_object_expirer.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_orphan.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_replica_log.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_resolve.Po at am__quote@
@@ -14248,6 +16264,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_metadata.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_multi.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_multi_del.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_object_expirer_core.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_op.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_policy_s3.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/test_build_librgw-rgw_quota.Po at am__quote@
@@ -14263,6 +16280,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/TestSignalHandlers.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/TestTimers.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/bench_log.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/ceph_perf_local-perf_helper.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/ceph_perf_local-perf_local.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/ceph_test_c_headers-test_c_headers.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/ceph_test_cls_rgw_log-test_rgw_admin_log.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/ceph_test_cls_rgw_meta-test_rgw_admin_meta.Po at am__quote@
@@ -14274,6 +16293,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/ceph_xattr_bench-xattr_bench.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/kv_store_bench.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/multi_stress_watch.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/objectstore_bench.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/omap_bench.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/on_exit.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/streamtest.Po at am__quote@
@@ -14308,6 +16328,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_ipaddr-test_ipaddr.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_libcephfs_config-libcephfs_config.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_mime-mime.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_pageset-test_pageset.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_perf_counters-perf_counters.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_prebufferedstreambuf-test_prebufferedstreambuf.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_rbd_replay-test_rbd_replay.Po at am__quote@
@@ -14317,9 +16338,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_str_list-test_str_list.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_striper-test_striper.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_strtol-strtol.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_subprocess-test_subprocess.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_texttable-test_texttable.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_utf8-utf8.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_workqueue-test_workqueue.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/$(DEPDIR)/unittest_xlist-test_xlist.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/ObjectMap/$(DEPDIR)/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/ObjectMap/$(DEPDIR)/ceph_test_keyvaluedb_iterators-KeyValueDBMemory.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/ObjectMap/$(DEPDIR)/ceph_test_keyvaluedb_iterators-test_keyvaluedb_iterators.Po at am__quote@
@@ -14339,6 +16362,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/cls_hello/$(DEPDIR)/ceph_test_cls_hello-test_cls_hello.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/cls_lock/$(DEPDIR)/ceph_test_cls_lock-test_cls_lock.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/cls_log/$(DEPDIR)/ceph_test_cls_log-test_cls_log.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/cls_rbd/$(DEPDIR)/ceph_test_cls_rbd-test_cls_rbd.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/cls_refcount/$(DEPDIR)/ceph_test_cls_refcount-test_cls_refcount.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/cls_replica_log/$(DEPDIR)/ceph_test_cls_replica_log-test_cls_replica_log.Po at am__quote@
@@ -14347,6 +16371,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/cls_version/$(DEPDIR)/ceph_test_cls_version-test_cls_version.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/ObjectContents.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/get_command_descriptions.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_async_compressor-test_async_compressor.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_bit_vector-test_bit_vector.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_blkdev-test_blkdev.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_bloom_filter-test_bloom_filter.Po at am__quote@
@@ -14356,6 +16381,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_histogram-histogram.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_io_priority-test_io_priority.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_lru-test_lru.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_prioritized_queue-test_prioritized_queue.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_readahead-Readahead.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_safe_io-test_safe_io.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/common/$(DEPDIR)/unittest_shared_cache-test_shared_cache.Po at am__quote@
@@ -14381,6 +16407,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/libec_test_jerasure_neon_la-TestJerasurePluginNEON.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/libec_test_jerasure_sse3_la-TestJerasurePluginSSE3.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/libec_test_jerasure_sse4_la-TestJerasurePluginSSE4.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/libec_test_shec_generic_la-TestShecPluginGeneric.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/libec_test_shec_neon_la-TestShecPluginNEON.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/libec_test_shec_sse3_la-TestShecPluginSSE3.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/libec_test_shec_sse4_la-TestShecPluginSSE4.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code-TestErasureCode.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_example-TestErasureCodeExample.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_isa-TestErasureCodeIsa.Po at am__quote@
@@ -14390,12 +16420,15 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_isa-TestErasureCodePluginIsa.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_jerasure-TestErasureCodePluginJerasure.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_lrc-TestErasureCodePluginLrc.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec-TestErasureCodeShec.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_all-TestErasureCodeShec_all.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/filestore/$(DEPDIR)/ceph_test_filestore-TestFileStore.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/fs/$(DEPDIR)/unittest_mds_types-mds_types.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-caps.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-multiclient.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-readdir_r_cb.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-test.Po at am__quote@
@@ -14441,6 +16474,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_librbd.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/librbd_test_la-test_support.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_main.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/mds/$(DEPDIR)/unittest_mds_authcap-TestMDSAuthCaps.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/messenger/$(DEPDIR)/simple_client-simple_client.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/messenger/$(DEPDIR)/simple_client-simple_dispatcher.Po at am__quote@
@@ -14454,6 +16488,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/mon/$(DEPDIR)/test_mon_workloadgen.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/mon/$(DEPDIR)/unittest_mon_moncap-moncap.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/mon/$(DEPDIR)/unittest_mon_pgmap-PGMap.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/msgr/$(DEPDIR)/ceph_perf_msgr_client-perf_msgr_client.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/msgr/$(DEPDIR)/ceph_perf_msgr_server-perf_msgr_server.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/msgr/$(DEPDIR)/ceph_test_async_driver-test_async_driver.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/msgr/$(DEPDIR)/ceph_test_msgr-test_msgr.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/DeterministicOpSequence.Po at am__quote@
@@ -14461,12 +16497,14 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/FileStoreTracker.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/TestObjectStoreState.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/ceph_test_keyvaluedb-test_kv.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/ceph_test_objectstore-store_test.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/test_idempotent.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/test_idempotent_sequence.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/unittest_chain_xattr-chain_xattr.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/unittest_rocksdb_option-TestRocksdbOptionParse.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/unittest_rocksdb_option_static-TestRocksdbOptionParse.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/objectstore/$(DEPDIR)/workload_generator.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at test/os/$(DEPDIR)/unittest_flatindex-TestFlatIndex.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/os/$(DEPDIR)/unittest_lfnindex-TestLFNIndex.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/osd/$(DEPDIR)/Object.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/osd/$(DEPDIR)/RadosModel.Po at am__quote@
@@ -14482,6 +16520,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/osdc/$(DEPDIR)/FakeWriteback.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/osdc/$(DEPDIR)/object_cacher_stress.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/rgw/$(DEPDIR)/ceph_test_rgw_manifest-test_rgw_manifest.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at test/rgw/$(DEPDIR)/ceph_test_rgw_obj-test_rgw_obj.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/system/$(DEPDIR)/cross_process_sem.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/system/$(DEPDIR)/rados_delete_pools_parallel.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/system/$(DEPDIR)/rados_list_parallel.Po at am__quote@
@@ -14495,6 +16534,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at test/system/$(DEPDIR)/st_rados_watch.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/system/$(DEPDIR)/systest_runnable.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at test/system/$(DEPDIR)/systest_settings.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/RadosDump.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/ceph-client-debug.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/ceph_authtool.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/ceph_conf.Po at am__quote@
@@ -14507,9 +16547,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/osdmaptool.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/psim.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/radosacl.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/rest_bench-rest_bench.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/scratchtool.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/$(DEPDIR)/scratchtoolpp.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/DataScan.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/Dumper.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/EventOutput.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/JournalFilter.Po at am__quote@
@@ -14518,12 +16558,18 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/MDSUtility.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/Resetter.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/TableTool.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/cephfs-data-scan.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/cephfs-journal-tool.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/cephfs/$(DEPDIR)/cephfs-table-tool.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/rados/$(DEPDIR)/PoolDump.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tools/rados/$(DEPDIR)/RadosImport.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at tools/rados/$(DEPDIR)/rados.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at tools/rados/$(DEPDIR)/rados_export.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at tools/rados/$(DEPDIR)/rados_import.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at tools/rados/$(DEPDIR)/rados_sync.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/libos_tp_la-objectstore.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/libosd_tp_la-oprequest.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/libosd_tp_la-osd.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/libosd_tp_la-pg.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/librados_tp_la-librados.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at tracing/$(DEPDIR)/librbd_tp_la-librbd.Plo at am__quote@
 
 .S.o:
 @am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
@@ -14615,6 +16661,13 @@ common/libcommon_crc_la-crc32c_intel_fast.lo: common/crc32c_intel_fast.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libcommon_crc_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o common/libcommon_crc_la-crc32c_intel_fast.lo `test -f 'common/crc32c_intel_fast.c' || echo '$(srcdir)/'`common/crc32c_intel_fast.c
 
+common/libcommon_crc_aarch64_la-crc32c_aarch64.lo: common/crc32c_aarch64.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_crc_aarch64_la_CFLAGS) $(CFLAGS) -MT common/libcommon_crc_aarch64_la-crc32c_aarch64.lo -MD -MP -MF common/$(DEPDIR)/libcommon_crc_aarch64_la-crc32c_aarch64.Tpo -c -o common/libcommon_crc_aarch64_la-crc32c_aarch64.lo `test -f 'common/crc32c_aarch64.c' || echo '$(srcdir)/'`common/crc32c_aarch64.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/libcommon_crc_aarch64_la-crc32c_aarch64.Tpo common/$(DEPDIR)/libcommon_crc_aarch64_la-crc32c_aarch64.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='common/crc32c_aarch64.c' object='common/libcommon_crc_aarch64_la-crc32c_aarch64.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_crc_aarch64_la_CFLAGS) $(CFLAGS) -c -o common/libcommon_crc_aarch64_la-crc32c_aarch64.lo `test -f 'common/crc32c_aarch64.c' || echo '$(srcdir)/'`common/crc32c_aarch64.c
+
 erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_base.lo: erasure-code/isa/isa-l/erasure_code/ec_base.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_isa_la_CFLAGS) $(CFLAGS) -MT erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_base.lo -MD -MP -MF erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/libec_isa_la-ec_base.Tpo -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_base.lo `test -f 'erasure-code/isa/isa-l/erasure_code/ec_ba [...]
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/libec_isa_la-ec_base.Tpo erasure-code/isa/isa-l/erasure_code/$(DEPDIR)/libec_isa_la-ec_base.Plo
@@ -15112,124 +17165,558 @@ erasure-code/jerasure/gf-complete/src/libec_jerasure_sse4_la-gf_w8.lo: erasure-c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_jerasure_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_jerasure_sse4_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w8.c
 
-erasure-code/shec/libec_shec_la-determinant.lo: erasure-code/shec/determinant.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/shec/libec_shec_la-determinant.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_la-determinant.Tpo -c -o erasure-code/shec/libec_shec_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_la-determinant.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_la-determinant.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/shec/determinant.c' object='erasure-code/shec/libec_shec_la-determinant.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/shec/libec_shec_generic_la-determinant.lo: erasure-code/shec/determinant.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/shec/libec_shec_generic_la-determinant.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-determinant.Tpo -c -o erasure-code/shec/libec_shec_generic_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`era [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-determinant.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-determinant.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/shec/determinant.c' object='erasure-code/shec/libec_shec_generic_la-determinant.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/shec/libec_shec_generic_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-cauchy.lo: erasure-code/jerasure/jerasure/src/cauchy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_generic_la-cauchy.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-cauchy.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-cauchy.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-cauchy.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-cauchy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/cauchy.c' object='erasure-code/jerasure/jerasure/src/libec_shec_generic_la-cauchy.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/cauchy.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-galois.lo: erasure-code/jerasure/jerasure/src/galois.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_generic_la-galois.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-galois.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-galois.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-galois.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-galois.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/galois.c' object='erasure-code/jerasure/jerasure/src/libec_shec_generic_la-galois.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/src/galois.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/galois.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-jerasure.lo: erasure-code/jerasure/jerasure/src/jerasure.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_generic_la-jerasure.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-jerasure.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-jerasure.lo `test -f 'erasure-code/je [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-jerasure.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-jerasure.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/jerasure.c' object='erasure-code/jerasure/jerasure/src/libec_shec_generic_la-jerasure.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-jerasure.lo `test -f 'erasure-code/jerasure/jerasure/src/jerasure.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/jerasure.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-liberation.lo: erasure-code/jerasure/jerasure/src/liberation.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_generic_la-liberation.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-liberation.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-liberation.lo `test -f 'erasure-c [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-liberation.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-liberation.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/liberation.c' object='erasure-code/jerasure/jerasure/src/libec_shec_generic_la-liberation.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-liberation.lo `test -f 'erasure-code/jerasure/jerasure/src/liberation.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/liberation.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_generic_la-reed_sol.lo: erasure-code/jerasure/jerasure/src/reed_sol.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_generic_la-reed_sol.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-reed_sol.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-reed_sol.lo `test -f 'erasure-code/je [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-reed_sol.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_generic_la-reed_sol.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/reed_sol.c' object='erasure-code/jerasure/jerasure/src/libec_shec_generic_la-reed_sol.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_generic_la-reed_sol.lo `test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/reed_sol.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_wgen.lo: erasure-code/jerasure/gf-complete/src/gf_wgen.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_wgen.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_wgen.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_wgen.lo `test -f 'erasure-c [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_wgen.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_wgen.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_wgen.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_wgen.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_wgen.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_wgen.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_wgen.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_method.lo: erasure-code/jerasure/gf-complete/src/gf_method.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_method.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_method.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_method.lo `test -f 'era [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_method.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_method.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_method.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_method.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_method.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_method.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_method.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w16.lo: erasure-code/jerasure/gf-complete/src/gf_w16.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w16.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w16.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w16.lo `test -f 'erasure-code [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w16.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w16.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w16.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w16.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w16.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w16.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf.lo: erasure-code/jerasure/gf-complete/src/gf.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w32.lo: erasure-code/jerasure/gf-complete/src/gf_w32.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w32.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w32.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w32.lo `test -f 'erasure-code [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w32.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w32.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w32.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w32.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w32.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w32.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w64.lo: erasure-code/jerasure/gf-complete/src/gf_w64.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w64.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w64.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w64.lo `test -f 'erasure-code [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w64.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w64.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w64.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w64.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w64.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w64.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w128.lo: erasure-code/jerasure/gf-complete/src/gf_w128.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w128.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w128.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w128.lo `test -f 'erasure-c [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w128.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w128.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w128.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w128.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w128.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w128.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w128.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_general.lo: erasure-code/jerasure/gf-complete/src/gf_general.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_general.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_general.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_general.lo `test -f ' [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_general.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_general.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_general.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_general.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_general.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_general.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_general.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo: erasure-code/jerasure/gf-complete/src/gf_w4.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w4.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo `test -f 'erasure-code/je [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w4.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w4.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w4.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w4.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo: erasure-code/jerasure/gf-complete/src/gf_rand.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_rand.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo `test -f 'erasure-c [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_rand.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_rand.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_rand.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_rand.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_rand.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo: erasure-code/jerasure/gf-complete/src/gf_w8.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w8.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo `test -f 'erasure-code/je [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w8.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_generic_la-gf_w8.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w8.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w8.c
+
+erasure-code/shec/libec_shec_neon_la-determinant.lo: erasure-code/shec/determinant.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/shec/libec_shec_neon_la-determinant.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-determinant.Tpo -c -o erasure-code/shec/libec_shec_neon_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/sh [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-determinant.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-determinant.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/shec/determinant.c' object='erasure-code/shec/libec_shec_neon_la-determinant.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/shec/libec_shec_neon_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-cauchy.lo: erasure-code/jerasure/jerasure/src/cauchy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_neon_la-cauchy.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-cauchy.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-cauchy.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-cauchy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/cauchy.c' object='erasure-code/jerasure/jerasure/src/libec_shec_neon_la-cauchy.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/cauchy.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-galois.lo: erasure-code/jerasure/jerasure/src/galois.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_neon_la-galois.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-galois.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-galois.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-galois.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/galois.c' object='erasure-code/jerasure/jerasure/src/libec_shec_neon_la-galois.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/src/galois.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/galois.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-jerasure.lo: erasure-code/jerasure/jerasure/src/jerasure.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_neon_la-jerasure.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-jerasure.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-jerasure.lo `test -f 'erasure-code/jerasure/jeras [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-jerasure.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-jerasure.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/jerasure.c' object='erasure-code/jerasure/jerasure/src/libec_shec_neon_la-jerasure.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-jerasure.lo `test -f 'erasure-code/jerasure/jerasure/src/jerasure.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/jerasure.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-liberation.lo: erasure-code/jerasure/jerasure/src/liberation.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_neon_la-liberation.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-liberation.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-liberation.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-liberation.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-liberation.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/liberation.c' object='erasure-code/jerasure/jerasure/src/libec_shec_neon_la-liberation.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-liberation.lo `test -f 'erasure-code/jerasure/jerasure/src/liberation.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/liberation.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_neon_la-reed_sol.lo: erasure-code/jerasure/jerasure/src/reed_sol.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_neon_la-reed_sol.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-reed_sol.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-reed_sol.lo `test -f 'erasure-code/jerasure/jeras [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-reed_sol.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_neon_la-reed_sol.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/reed_sol.c' object='erasure-code/jerasure/jerasure/src/libec_shec_neon_la-reed_sol.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_neon_la-reed_sol.lo `test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/reed_sol.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_wgen.lo: erasure-code/jerasure/gf-complete/src/gf_wgen.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_wgen.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_wgen.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_wgen.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_wgen.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_wgen.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_wgen.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_wgen.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_wgen.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_wgen.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_wgen.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_method.lo: erasure-code/jerasure/gf-complete/src/gf_method.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_method.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_method.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_method.lo `test -f 'erasure-code/je [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_method.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_method.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_method.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_method.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_method.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_method.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_method.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w16.lo: erasure-code/jerasure/gf-complete/src/gf_w16.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w16.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w16.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w16.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w16.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w16.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w16.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w16.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w16.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf.lo: erasure-code/jerasure/gf-complete/src/gf.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w32.lo: erasure-code/jerasure/gf-complete/src/gf_w32.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w32.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w32.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w32.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w32.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w32.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w32.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w32.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w32.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w64.lo: erasure-code/jerasure/gf-complete/src/gf_w64.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w64.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w64.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w64.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w64.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w64.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w64.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w64.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w64.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w128.lo: erasure-code/jerasure/gf-complete/src/gf_w128.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w128.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w128.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w128.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w128.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w128.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w128.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w128.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w128.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w128.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w128.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_general.lo: erasure-code/jerasure/gf-complete/src/gf_general.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_general.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_general.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_general.lo `test -f 'erasure-code [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_general.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_general.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_general.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_general.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_general.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_general.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_general.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo: erasure-code/jerasure/gf-complete/src/gf_w4.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w4.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-co [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w4.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w4.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w4.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w4.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo: erasure-code/jerasure/gf-complete/src/gf_rand.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_rand.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_rand.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_rand.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_rand.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_rand.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_rand.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo: erasure-code/jerasure/gf-complete/src/gf_w8.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w8.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-co [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w8.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_neon_la-gf_w8.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w8.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w8.c
+
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo: erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w4_neon.Tpo -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo `test - [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w4_neon.Tpo erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w4_neon.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c' object='erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo `test -f 'erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c
+
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo: erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w8_neon.Tpo -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo `test - [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w8_neon.Tpo erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w8_neon.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c' object='erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo `test -f 'erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c
+
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo: erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w16_neon.Tpo -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo `tes [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w16_neon.Tpo erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w16_neon.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c' object='erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo `test -f 'erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
+
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w32_neon.lo: erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w32_neon.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w32_neon.Tpo -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w32_neon.lo `tes [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w32_neon.Tpo erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w32_neon.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c' object='erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w32_neon.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w32_neon.lo `test -f 'erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c
+
+erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w64_neon.lo: erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w64_neon.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w64_neon.Tpo -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w64_neon.lo `tes [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w64_neon.Tpo erasure-code/jerasure/gf-complete/src/neon/$(DEPDIR)/libec_shec_neon_la-gf_w64_neon.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c' object='erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w64_neon.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w64_neon.lo `test -f 'erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
+
+erasure-code/shec/libec_shec_sse3_la-determinant.lo: erasure-code/shec/determinant.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/shec/libec_shec_sse3_la-determinant.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-determinant.Tpo -c -o erasure-code/shec/libec_shec_sse3_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/sh [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-determinant.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-determinant.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/shec/determinant.c' object='erasure-code/shec/libec_shec_sse3_la-determinant.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/shec/libec_shec_sse3_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-cauchy.lo: erasure-code/jerasure/jerasure/src/cauchy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-cauchy.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-cauchy.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-cauchy.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-cauchy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/cauchy.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-cauchy.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/cauchy.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-galois.lo: erasure-code/jerasure/jerasure/src/galois.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-galois.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-galois.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-galois.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-galois.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/galois.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-galois.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/src/galois.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/galois.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-jerasure.lo: erasure-code/jerasure/jerasure/src/jerasure.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-jerasure.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-jerasure.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-jerasure.lo `test -f 'erasure-code/jerasure/jeras [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-jerasure.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-jerasure.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/jerasure.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-jerasure.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-jerasure.lo `test -f 'erasure-code/jerasure/jerasure/src/jerasure.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/jerasure.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-liberation.lo: erasure-code/jerasure/jerasure/src/liberation.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-liberation.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-liberation.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-liberation.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-liberation.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-liberation.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/liberation.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-liberation.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-liberation.lo `test -f 'erasure-code/jerasure/jerasure/src/liberation.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/liberation.c
+
+erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-reed_sol.lo: erasure-code/jerasure/jerasure/src/reed_sol.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-reed_sol.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-reed_sol.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-reed_sol.lo `test -f 'erasure-code/jerasure/jeras [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-reed_sol.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse3_la-reed_sol.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/reed_sol.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-reed_sol.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse3_la-reed_sol.lo `test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/reed_sol.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_wgen.lo: erasure-code/jerasure/gf-complete/src/gf_wgen.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_wgen.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_wgen.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_wgen.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_wgen.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_wgen.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_wgen.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_wgen.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_wgen.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_wgen.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_wgen.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_method.lo: erasure-code/jerasure/gf-complete/src/gf_method.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_method.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_method.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_method.lo `test -f 'erasure-code/je [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_method.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_method.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_method.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_method.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_method.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_method.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_method.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w16.lo: erasure-code/jerasure/gf-complete/src/gf_w16.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w16.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w16.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w16.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w16.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w16.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w16.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w16.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w16.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf.lo: erasure-code/jerasure/gf-complete/src/gf.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w32.lo: erasure-code/jerasure/gf-complete/src/gf_w32.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w32.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w32.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w32.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w32.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w32.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w32.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w32.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w32.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w64.lo: erasure-code/jerasure/gf-complete/src/gf_w64.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w64.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w64.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w64.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w64.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w64.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w64.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/shec/libec_shec_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w64.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w64.c
 
-erasure-code/jerasure/jerasure/src/libec_shec_la-cauchy.lo: erasure-code/jerasure/jerasure/src/cauchy.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_la-cauchy.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-cauchy.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' || echo  [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-cauchy.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-cauchy.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/cauchy.c' object='erasure-code/jerasure/jerasure/src/libec_shec_la-cauchy.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w128.lo: erasure-code/jerasure/gf-complete/src/gf_w128.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w128.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w128.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w128.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w128.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w128.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w128.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w128.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/cauchy.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w128.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w128.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w128.c
 
-erasure-code/jerasure/jerasure/src/libec_shec_la-galois.lo: erasure-code/jerasure/jerasure/src/galois.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_la-galois.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-galois.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/src/galois.c' || echo  [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-galois.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-galois.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/galois.c' object='erasure-code/jerasure/jerasure/src/libec_shec_la-galois.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_general.lo: erasure-code/jerasure/gf-complete/src/gf_general.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_general.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_general.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_general.lo `test -f 'erasure-code [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_general.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_general.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_general.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_general.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/src/galois.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/galois.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_general.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_general.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_general.c
 
-erasure-code/jerasure/jerasure/src/libec_shec_la-jerasure.lo: erasure-code/jerasure/jerasure/src/jerasure.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_la-jerasure.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-jerasure.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-jerasure.lo `test -f 'erasure-code/jerasure/jerasure/src/jerasure.c'  [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-jerasure.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-jerasure.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/jerasure.c' object='erasure-code/jerasure/jerasure/src/libec_shec_la-jerasure.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo: erasure-code/jerasure/gf-complete/src/gf_w4.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w4.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-co [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w4.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w4.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w4.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-jerasure.lo `test -f 'erasure-code/jerasure/jerasure/src/jerasure.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/jerasure.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w4.c
 
-erasure-code/jerasure/jerasure/src/libec_shec_la-liberation.lo: erasure-code/jerasure/jerasure/src/liberation.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_la-liberation.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-liberation.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-liberation.lo `test -f 'erasure-code/jerasure/jerasure/src/libera [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-liberation.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-liberation.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/liberation.c' object='erasure-code/jerasure/jerasure/src/libec_shec_la-liberation.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo: erasure-code/jerasure/gf-complete/src/gf_rand.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_rand.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_rand.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_rand.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_rand.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-liberation.lo `test -f 'erasure-code/jerasure/jerasure/src/liberation.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/liberation.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_rand.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_rand.c
 
-erasure-code/jerasure/jerasure/src/libec_shec_la-reed_sol.lo: erasure-code/jerasure/jerasure/src/reed_sol.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_la-reed_sol.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-reed_sol.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-reed_sol.lo `test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c'  [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-reed_sol.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_la-reed_sol.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/reed_sol.c' object='erasure-code/jerasure/jerasure/src/libec_shec_la-reed_sol.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo: erasure-code/jerasure/gf-complete/src/gf_w8.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w8.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-co [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w8.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse3_la-gf_w8.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w8.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_la-reed_sol.lo `test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/reed_sol.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w8.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_wgen.lo: erasure-code/jerasure/gf-complete/src/gf_wgen.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_wgen.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_wgen.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_wgen.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_ [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_wgen.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_wgen.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_wgen.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_wgen.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/shec/libec_shec_sse4_la-determinant.lo: erasure-code/shec/determinant.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/shec/libec_shec_sse4_la-determinant.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-determinant.Tpo -c -o erasure-code/shec/libec_shec_sse4_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/sh [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-determinant.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-determinant.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/shec/determinant.c' object='erasure-code/shec/libec_shec_sse4_la-determinant.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_wgen.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_wgen.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_wgen.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/shec/libec_shec_sse4_la-determinant.lo `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_method.lo: erasure-code/jerasure/gf-complete/src/gf_method.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_method.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_method.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_method.lo `test -f 'erasure-code/jerasure/gf-complete/s [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_method.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_method.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_method.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_method.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-cauchy.lo: erasure-code/jerasure/jerasure/src/cauchy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-cauchy.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-cauchy.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-cauchy.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-cauchy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/cauchy.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-cauchy.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_method.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_method.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_method.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-cauchy.lo `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/cauchy.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w16.lo: erasure-code/jerasure/gf-complete/src/gf_w16.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w16.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w16.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w16 [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w16.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w16.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w16.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w16.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-galois.lo: erasure-code/jerasure/jerasure/src/galois.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-galois.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-galois.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-galois.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-galois.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/galois.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-galois.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w16.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w16.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-galois.lo `test -f 'erasure-code/jerasure/jerasure/src/galois.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/galois.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf.lo: erasure-code/jerasure/gf-complete/src/gf.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' || echo '$(s [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-jerasure.lo: erasure-code/jerasure/jerasure/src/jerasure.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-jerasure.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-jerasure.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-jerasure.lo `test -f 'erasure-code/jerasure/jeras [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-jerasure.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-jerasure.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/jerasure.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-jerasure.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-jerasure.lo `test -f 'erasure-code/jerasure/jerasure/src/jerasure.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/jerasure.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w32.lo: erasure-code/jerasure/gf-complete/src/gf_w32.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w32.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w32.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w32 [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w32.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w32.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w32.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w32.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-liberation.lo: erasure-code/jerasure/jerasure/src/liberation.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-liberation.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-liberation.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-liberation.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-liberation.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-liberation.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/liberation.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-liberation.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w32.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w32.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-liberation.lo `test -f 'erasure-code/jerasure/jerasure/src/liberation.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/liberation.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w64.lo: erasure-code/jerasure/gf-complete/src/gf_w64.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w64.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w64.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w64 [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w64.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w64.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w64.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w64.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-reed_sol.lo: erasure-code/jerasure/jerasure/src/reed_sol.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-reed_sol.lo -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-reed_sol.Tpo -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-reed_sol.lo `test -f 'erasure-code/jerasure/jeras [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-reed_sol.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/libec_shec_sse4_la-reed_sol.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/reed_sol.c' object='erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-reed_sol.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w64.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w64.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/libec_shec_sse4_la-reed_sol.lo `test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/reed_sol.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w128.lo: erasure-code/jerasure/gf-complete/src/gf_w128.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w128.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w128.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w128.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_ [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w128.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w128.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w128.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w128.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_wgen.lo: erasure-code/jerasure/gf-complete/src/gf_wgen.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_wgen.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_wgen.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_wgen.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_wgen.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_wgen.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_wgen.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_wgen.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w128.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w128.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w128.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_wgen.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_wgen.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_wgen.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_general.lo: erasure-code/jerasure/gf-complete/src/gf_general.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_general.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_general.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_general.lo `test -f 'erasure-code/jerasure/gf-complet [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_general.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_general.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_general.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_general.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_method.lo: erasure-code/jerasure/gf-complete/src/gf_method.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_method.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_method.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_method.lo `test -f 'erasure-code/je [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_method.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_method.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_method.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_method.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_general.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_general.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_general.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_method.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_method.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_method.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w4.lo: erasure-code/jerasure/gf-complete/src/gf_w4.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w4.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w4.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c'  [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w4.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w4.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w4.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w4.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w16.lo: erasure-code/jerasure/gf-complete/src/gf_w16.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w16.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w16.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w16.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w16.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w16.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w16.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w4.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w16.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w16.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w16.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_rand.lo: erasure-code/jerasure/gf-complete/src/gf_rand.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_rand.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_rand.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_rand.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_ [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_rand.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_rand.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_rand.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_rand.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf.lo: erasure-code/jerasure/gf-complete/src/gf.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/sr [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_rand.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_rand.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_rand.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf.c
 
-erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w8.lo: erasure-code/jerasure/gf-complete/src/gf_w8.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w8.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w8.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c'  [...]
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w8.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_la-gf_w8.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w8.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w8.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w32.lo: erasure-code/jerasure/gf-complete/src/gf_w32.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w32.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w32.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w32.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w32.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w32.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w32.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w8.c
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w32.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w32.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w32.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w64.lo: erasure-code/jerasure/gf-complete/src/gf_w64.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w64.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w64.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w64.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w64.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w64.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w64.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w64.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w64.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w64.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w128.lo: erasure-code/jerasure/gf-complete/src/gf_w128.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w128.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w128.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w128.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w128.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w128.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w128.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w128.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w128.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w128.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w128.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_general.lo: erasure-code/jerasure/gf-complete/src/gf_general.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_general.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_general.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_general.lo `test -f 'erasure-code [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_general.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_general.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_general.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_general.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_general.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_general.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_general.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo: erasure-code/jerasure/gf-complete/src/gf_w4.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w4.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-co [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w4.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w4.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w4.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w4.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo: erasure-code/jerasure/gf-complete/src/gf_rand.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_rand.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo `test -f 'erasure-code/jerasure [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_rand.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_rand.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_rand.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_rand.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_rand.c
+
+erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo: erasure-code/jerasure/gf-complete/src/gf_w8.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w8.Tpo -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-co [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w8.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/libec_shec_sse4_la-gf_w8.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w8.c' object='erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w8.c
+
+tracing/libos_tp_la-objectstore.lo: tracing/objectstore.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_tp_la_CFLAGS) $(CFLAGS) -MT tracing/libos_tp_la-objectstore.lo -MD -MP -MF tracing/$(DEPDIR)/libos_tp_la-objectstore.Tpo -c -o tracing/libos_tp_la-objectstore.lo `test -f 'tracing/objectstore.c' || echo '$(srcdir)/'`tracing/objectstore.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) tracing/$(DEPDIR)/libos_tp_la-objectstore.Tpo tracing/$(DEPDIR)/libos_tp_la-objectstore.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tracing/objectstore.c' object='tracing/libos_tp_la-objectstore.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_tp_la_CFLAGS) $(CFLAGS) -c -o tracing/libos_tp_la-objectstore.lo `test -f 'tracing/objectstore.c' || echo '$(srcdir)/'`tracing/objectstore.c
+
+tracing/libosd_tp_la-oprequest.lo: tracing/oprequest.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_tp_la_CFLAGS) $(CFLAGS) -MT tracing/libosd_tp_la-oprequest.lo -MD -MP -MF tracing/$(DEPDIR)/libosd_tp_la-oprequest.Tpo -c -o tracing/libosd_tp_la-oprequest.lo `test -f 'tracing/oprequest.c' || echo '$(srcdir)/'`tracing/oprequest.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) tracing/$(DEPDIR)/libosd_tp_la-oprequest.Tpo tracing/$(DEPDIR)/libosd_tp_la-oprequest.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tracing/oprequest.c' object='tracing/libosd_tp_la-oprequest.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_tp_la_CFLAGS) $(CFLAGS) -c -o tracing/libosd_tp_la-oprequest.lo `test -f 'tracing/oprequest.c' || echo '$(srcdir)/'`tracing/oprequest.c
+
+tracing/libosd_tp_la-osd.lo: tracing/osd.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_tp_la_CFLAGS) $(CFLAGS) -MT tracing/libosd_tp_la-osd.lo -MD -MP -MF tracing/$(DEPDIR)/libosd_tp_la-osd.Tpo -c -o tracing/libosd_tp_la-osd.lo `test -f 'tracing/osd.c' || echo '$(srcdir)/'`tracing/osd.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) tracing/$(DEPDIR)/libosd_tp_la-osd.Tpo tracing/$(DEPDIR)/libosd_tp_la-osd.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tracing/osd.c' object='tracing/libosd_tp_la-osd.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_tp_la_CFLAGS) $(CFLAGS) -c -o tracing/libosd_tp_la-osd.lo `test -f 'tracing/osd.c' || echo '$(srcdir)/'`tracing/osd.c
+
+tracing/libosd_tp_la-pg.lo: tracing/pg.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_tp_la_CFLAGS) $(CFLAGS) -MT tracing/libosd_tp_la-pg.lo -MD -MP -MF tracing/$(DEPDIR)/libosd_tp_la-pg.Tpo -c -o tracing/libosd_tp_la-pg.lo `test -f 'tracing/pg.c' || echo '$(srcdir)/'`tracing/pg.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) tracing/$(DEPDIR)/libosd_tp_la-pg.Tpo tracing/$(DEPDIR)/libosd_tp_la-pg.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tracing/pg.c' object='tracing/libosd_tp_la-pg.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_tp_la_CFLAGS) $(CFLAGS) -c -o tracing/libosd_tp_la-pg.lo `test -f 'tracing/pg.c' || echo '$(srcdir)/'`tracing/pg.c
+
+tracing/librados_tp_la-librados.lo: tracing/librados.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librados_tp_la_CFLAGS) $(CFLAGS) -MT tracing/librados_tp_la-librados.lo -MD -MP -MF tracing/$(DEPDIR)/librados_tp_la-librados.Tpo -c -o tracing/librados_tp_la-librados.lo `test -f 'tracing/librados.c' || echo '$(srcdir)/'`tracing/librados.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) tracing/$(DEPDIR)/librados_tp_la-librados.Tpo tracing/$(DEPDIR)/librados_tp_la-librados.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tracing/librados.c' object='tracing/librados_tp_la-librados.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librados_tp_la_CFLAGS) $(CFLAGS) -c -o tracing/librados_tp_la-librados.lo `test -f 'tracing/librados.c' || echo '$(srcdir)/'`tracing/librados.c
+
+tracing/librbd_tp_la-librbd.lo: tracing/librbd.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_tp_la_CFLAGS) $(CFLAGS) -MT tracing/librbd_tp_la-librbd.lo -MD -MP -MF tracing/$(DEPDIR)/librbd_tp_la-librbd.Tpo -c -o tracing/librbd_tp_la-librbd.lo `test -f 'tracing/librbd.c' || echo '$(srcdir)/'`tracing/librbd.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) tracing/$(DEPDIR)/librbd_tp_la-librbd.Tpo tracing/$(DEPDIR)/librbd_tp_la-librbd.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='tracing/librbd.c' object='tracing/librbd_tp_la-librbd.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_tp_la_CFLAGS) $(CFLAGS) -c -o tracing/librbd_tp_la-librbd.lo `test -f 'tracing/librbd.c' || echo '$(srcdir)/'`tracing/librbd.c
 
 mds/ceph_dencoder-locks.o: mds/locks.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CFLAGS) $(CFLAGS) -MT mds/ceph_dencoder-locks.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-locks.Tpo -c -o mds/ceph_dencoder-locks.o `test -f 'mds/locks.c' || echo '$(srcdir)/'`mds/locks.c
@@ -15959,6 +18446,244 @@ erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_w8.obj:
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_all_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_w8.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_w8.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_w8.c'; fi`
 
+erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.o: erasure-code/shec/determinant.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-determinant.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.o `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-determinant.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-determinant.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/shec/determinant.c' object='erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.o `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
+
+erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.obj: erasure-code/shec/determinant.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.obj -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-determinant.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.obj `if test -f 'erasure-code/shec/determinant.c'; then $(CYGPATH_W) 'erasure-code/shec/de [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-determinant.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-determinant.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/shec/determinant.c' object='erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-determinant.obj `if test -f 'erasure-code/shec/determinant.c'; then $(CYGPATH_W) 'erasure-code/shec/determinant.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/determinant.c'; fi`
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.o: erasure-code/jerasure/jerasure/src/cauchy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.o -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-cauchy.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.o `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-cauchy.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-cauchy.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/cauchy.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.o `test -f 'erasure-code/jerasure/jerasure/src/cauchy.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/cauchy.c
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.obj: erasure-code/jerasure/jerasure/src/cauchy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.obj -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-cauchy.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.obj `if test -f 'erasure-code/jerasure/jerasure/src/ca [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-cauchy.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-cauchy.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/cauchy.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-cauchy.obj `if test -f 'erasure-code/jerasure/jerasure/src/cauchy.c'; then $(CYGPATH_W) 'erasure-code/jerasure/jerasure/src/cauchy.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/jerasure/src/cauchy.c'; fi`
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.o: erasure-code/jerasure/jerasure/src/galois.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.o -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-galois.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.o `test -f 'erasure-code/jerasure/jerasure/src/galois.c' [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-galois.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-galois.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/galois.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.o `test -f 'erasure-code/jerasure/jerasure/src/galois.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/galois.c
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.obj: erasure-code/jerasure/jerasure/src/galois.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.obj -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-galois.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.obj `if test -f 'erasure-code/jerasure/jerasure/src/ga [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-galois.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-galois.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/galois.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-galois.obj `if test -f 'erasure-code/jerasure/jerasure/src/galois.c'; then $(CYGPATH_W) 'erasure-code/jerasure/jerasure/src/galois.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/jerasure/src/galois.c'; fi`
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.o: erasure-code/jerasure/jerasure/src/jerasure.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.o -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-jerasure.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.o `test -f 'erasure-code/jerasure/jerasure/src/jer [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-jerasure.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-jerasure.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/jerasure.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.o `test -f 'erasure-code/jerasure/jerasure/src/jerasure.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/jerasure.c
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.obj: erasure-code/jerasure/jerasure/src/jerasure.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.obj -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-jerasure.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.obj `if test -f 'erasure-code/jerasure/jerasure/ [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-jerasure.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-jerasure.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/jerasure.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-jerasure.obj `if test -f 'erasure-code/jerasure/jerasure/src/jerasure.c'; then $(CYGPATH_W) 'erasure-code/jerasure/jerasure/src/jerasure.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/jerasure/src/jerasure.c'; fi`
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.o: erasure-code/jerasure/jerasure/src/liberation.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.o -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-liberation.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.o `test -f 'erasure-code/jerasure/jerasure/s [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-liberation.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-liberation.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/liberation.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.o `test -f 'erasure-code/jerasure/jerasure/src/liberation.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/liberation.c
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.obj: erasure-code/jerasure/jerasure/src/liberation.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.obj -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-liberation.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.obj `if test -f 'erasure-code/jerasure/jer [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-liberation.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-liberation.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/liberation.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-liberation.obj `if test -f 'erasure-code/jerasure/jerasure/src/liberation.c'; then $(CYGPATH_W) 'erasure-code/jerasure/jerasure/src/liberation.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/jerasure/src/liberation.c'; fi`
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.o: erasure-code/jerasure/jerasure/src/reed_sol.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.o -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-reed_sol.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.o `test -f 'erasure-code/jerasure/jerasure/src/ree [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-reed_sol.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-reed_sol.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/reed_sol.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.o `test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c' || echo '$(srcdir)/'`erasure-code/jerasure/jerasure/src/reed_sol.c
+
+erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.obj: erasure-code/jerasure/jerasure/src/reed_sol.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.obj -MD -MP -MF erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-reed_sol.Tpo -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.obj `if test -f 'erasure-code/jerasure/jerasure/ [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-reed_sol.Tpo erasure-code/jerasure/jerasure/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-reed_sol.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/jerasure/src/reed_sol.c' object='erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/jerasure/src/unittest_erasure_code_shec_arguments-reed_sol.obj `if test -f 'erasure-code/jerasure/jerasure/src/reed_sol.c'; then $(CYGPATH_W) 'erasure-code/jerasure/jerasure/src/reed_sol.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/jerasure/src/reed_sol.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.o: erasure-code/jerasure/gf-complete/src/gf_wgen.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_wgen.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.o `test -f 'erasure-code/jerasure/gf-complet [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_wgen.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_wgen.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_wgen.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_wgen.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_wgen.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.obj: erasure-code/jerasure/gf-complete/src/gf_wgen.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_wgen.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.obj `if test -f 'erasure-code/jerasure/gf- [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_wgen.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_wgen.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_wgen.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_wgen.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_wgen.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_wgen.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_wgen.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.o: erasure-code/jerasure/gf-complete/src/gf_method.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_method.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.o `test -f 'erasure-code/jerasure/gf-c [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_method.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_method.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_method.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_method.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_method.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.obj: erasure-code/jerasure/gf-complete/src/gf_method.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_method.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.obj `if test -f 'erasure-code/jerasu [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_method.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_method.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_method.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_method.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_method.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_method.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_method.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.o: erasure-code/jerasure/gf-complete/src/gf_w16.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w16.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.o `test -f 'erasure-code/jerasure/gf-complete/s [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w16.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w16.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w16.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_w16.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w16.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.obj: erasure-code/jerasure/gf-complete/src/gf_w16.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w16.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.obj `if test -f 'erasure-code/jerasure/gf-com [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w16.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w16.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w16.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w16.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_w16.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_w16.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_w16.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.o: erasure-code/jerasure/gf-complete/src/gf.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.o `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' ||  [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.o `test -f 'erasure-code/jerasure/gf-complete/src/gf.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.obj: erasure-code/jerasure/gf-complete/src/gf.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.o: erasure-code/jerasure/gf-complete/src/gf_w32.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w32.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.o `test -f 'erasure-code/jerasure/gf-complete/s [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w32.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w32.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w32.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_w32.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w32.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.obj: erasure-code/jerasure/gf-complete/src/gf_w32.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w32.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.obj `if test -f 'erasure-code/jerasure/gf-com [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w32.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w32.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w32.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w32.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_w32.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_w32.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_w32.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.o: erasure-code/jerasure/gf-complete/src/gf_w64.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w64.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.o `test -f 'erasure-code/jerasure/gf-complete/s [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w64.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w64.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w64.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_w64.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w64.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.obj: erasure-code/jerasure/gf-complete/src/gf_w64.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w64.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.obj `if test -f 'erasure-code/jerasure/gf-com [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w64.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w64.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w64.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w64.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_w64.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_w64.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_w64.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.o: erasure-code/jerasure/gf-complete/src/gf_w128.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w128.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.o `test -f 'erasure-code/jerasure/gf-complet [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w128.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w128.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w128.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_w128.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w128.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.obj: erasure-code/jerasure/gf-complete/src/gf_w128.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w128.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.obj `if test -f 'erasure-code/jerasure/gf- [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w128.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w128.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w128.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w128.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_w128.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_w128.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_w128.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.o: erasure-code/jerasure/gf-complete/src/gf_general.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_general.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.o `test -f 'erasure-code/jerasure/g [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_general.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_general.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_general.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_general.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_general.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.obj: erasure-code/jerasure/gf-complete/src/gf_general.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_general.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.obj `if test -f 'erasure-code/jer [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_general.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_general.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_general.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_general.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_general.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_general.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_general.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.o: erasure-code/jerasure/gf-complete/src/gf_w4.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w4.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.o `test -f 'erasure-code/jerasure/gf-complete/src/ [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w4.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w4.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w4.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w4.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.obj: erasure-code/jerasure/gf-complete/src/gf_w4.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w4.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.obj `if test -f 'erasure-code/jerasure/gf-comple [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w4.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w4.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w4.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w4.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_w4.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_w4.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_w4.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.o: erasure-code/jerasure/gf-complete/src/gf_rand.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_rand.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.o `test -f 'erasure-code/jerasure/gf-complet [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_rand.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_rand.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_rand.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_rand.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_rand.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.obj: erasure-code/jerasure/gf-complete/src/gf_rand.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_rand.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.obj `if test -f 'erasure-code/jerasure/gf- [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_rand.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_rand.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_rand.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_rand.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_rand.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_rand.c'; fi`
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.o: erasure-code/jerasure/gf-complete/src/gf_w8.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.o -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w8.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.o `test -f 'erasure-code/jerasure/gf-complete/src/ [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w8.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w8.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w8.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.o `test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c' || echo '$(srcdir)/'`erasure-code/jerasure/gf-complete/src/gf_w8.c
+
+erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.obj: erasure-code/jerasure/gf-complete/src/gf_w8.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -MT erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.obj -MD -MP -MF erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w8.Tpo -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.obj `if test -f 'erasure-code/jerasure/gf-comple [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w8.Tpo erasure-code/jerasure/gf-complete/src/$(DEPDIR)/unittest_erasure_code_shec_arguments-gf_w8.Po
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='erasure-code/jerasure/gf-complete/src/gf_w8.c' object='erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CFLAGS) $(CFLAGS) -c -o erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.obj `if test -f 'erasure-code/jerasure/gf-complete/src/gf_w8.c'; then $(CYGPATH_W) 'erasure-code/jerasure/gf-complete/src/gf_w8.c'; else $(CYGPATH_W) '$(srcdir)/erasure-code/jerasure/gf-complete/src/gf_w8.c'; fi`
+
 erasure-code/shec/unittest_erasure_code_shec_thread-determinant.o: erasure-code/shec/determinant.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_thread_CFLAGS) $(CFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_thread-determinant.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-determinant.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_thread-determinant.o `test -f 'erasure-code/shec/determinant.c' || echo '$(srcdir)/'`erasure-code/shec/determinant.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-determinant.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-determinant.Po
@@ -16263,12 +18988,12 @@ rgw/libcivetweb_la-rgw_civetweb_log.lo: rgw/rgw_civetweb_log.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcivetweb_la_CXXFLAGS) $(CXXFLAGS) -c -o rgw/libcivetweb_la-rgw_civetweb_log.lo `test -f 'rgw/rgw_civetweb_log.cc' || echo '$(srcdir)/'`rgw/rgw_civetweb_log.cc
 
-common/libcommon_api_la-buffer.lo: common/buffer.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_api_la_CXXFLAGS) $(CXXFLAGS) -MT common/libcommon_api_la-buffer.lo -MD -MP -MF common/$(DEPDIR)/libcommon_api_la-buffer.Tpo -c -o common/libcommon_api_la-buffer.lo `test -f 'common/buffer.cc' || echo '$(srcdir)/'`common/buffer.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/libcommon_api_la-buffer.Tpo common/$(DEPDIR)/libcommon_api_la-buffer.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/buffer.cc' object='common/libcommon_api_la-buffer.lo' libtool=yes @AMDEPBACKSLASH@
+client/libclient_fuse_la-fuse_ll.lo: client/fuse_ll.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclient_fuse_la_CXXFLAGS) $(CXXFLAGS) -MT client/libclient_fuse_la-fuse_ll.lo -MD -MP -MF client/$(DEPDIR)/libclient_fuse_la-fuse_ll.Tpo -c -o client/libclient_fuse_la-fuse_ll.lo `test -f 'client/fuse_ll.cc' || echo '$(srcdir)/'`client/fuse_ll.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) client/$(DEPDIR)/libclient_fuse_la-fuse_ll.Tpo client/$(DEPDIR)/libclient_fuse_la-fuse_ll.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='client/fuse_ll.cc' object='client/libclient_fuse_la-fuse_ll.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_api_la_CXXFLAGS) $(CXXFLAGS) -c -o common/libcommon_api_la-buffer.lo `test -f 'common/buffer.cc' || echo '$(srcdir)/'`common/buffer.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libclient_fuse_la_CXXFLAGS) $(CXXFLAGS) -c -o client/libclient_fuse_la-fuse_ll.lo `test -f 'client/fuse_ll.cc' || echo '$(srcdir)/'`client/fuse_ll.cc
 
 common/libcommon_crc_la-crc32c.lo: common/crc32c.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(libcommon_crc_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT common/libcommon_crc_la-crc32c.lo -MD -MP -MF common/$(DEPDIR)/libcommon_crc_la-crc32c.Tpo -c -o common/libcommon_crc_la-crc32c.lo `test -f 'common/crc32c.cc' || echo '$(srcdir)/'`common/crc32c.cc
@@ -16277,6 +19002,13 @@ common/libcommon_crc_la-crc32c.lo: common/crc32c.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(libcommon_crc_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o common/libcommon_crc_la-crc32c.lo `test -f 'common/crc32c.cc' || echo '$(srcdir)/'`common/crc32c.cc
 
+erasure-code/libec_example_la-ErasureCode.lo: erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_example_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/libec_example_la-ErasureCode.lo -MD -MP -MF erasure-code/$(DEPDIR)/libec_example_la-ErasureCode.Tpo -c -o erasure-code/libec_example_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/libec_example_la-ErasureCode.Tpo erasure-code/$(DEPDIR)/libec_example_la-ErasureCode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/libec_example_la-ErasureCode.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_example_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/libec_example_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+
 test/erasure-code/libec_example_la-ErasureCodePluginExample.lo: test/erasure-code/ErasureCodePluginExample.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_example_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_example_la-ErasureCodePluginExample.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_example_la-ErasureCodePluginExample.Tpo -c -o test/erasure-code/libec_example_la-ErasureCodePluginExample.lo `test -f 'test/erasure-code/ErasureCodeP [...]
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_example_la-ErasureCodePluginExample.Tpo test/erasure-code/$(DEPDIR)/libec_example_la-ErasureCodePluginExample.Plo
@@ -16459,54 +19191,138 @@ common/libec_lrc_la-str_map.lo: common/str_map.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_lrc_la_CXXFLAGS) $(CXXFLAGS) -c -o common/libec_lrc_la-str_map.lo `test -f 'common/str_map.cc' || echo '$(srcdir)/'`common/str_map.cc
 
-test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo: test/erasure-code/ErasureCodePluginMissingEntryPoint.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_entry_point_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.Tpo -c -o test/erasure-code/libec_missing_entry_point_la- [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.Tpo test/erasure-code/$(DEPDIR)/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/ErasureCodePluginMissingEntryPoint.cc' object='test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo' libtool=yes @AMDEPBACKSLASH@
+test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo: test/erasure-code/ErasureCodePluginMissingEntryPoint.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_entry_point_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.Tpo -c -o test/erasure-code/libec_missing_entry_point_la- [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.Tpo test/erasure-code/$(DEPDIR)/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/ErasureCodePluginMissingEntryPoint.cc' object='test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_entry_point_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo `test -f 'test/erasure-code/ErasureCodePluginMissingEntryPoint.cc' || echo '$(srcdir)/'`test/erasure-code/ErasureCodePluginMissingEntryPoint.cc
+
+test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo: test/erasure-code/ErasureCodePluginMissingVersion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_version_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_missing_version_la-ErasureCodePluginMissingVersion.Tpo -c -o test/erasure-code/libec_missing_version_la-ErasureCodePluginMissi [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_missing_version_la-ErasureCodePluginMissingVersion.Tpo test/erasure-code/$(DEPDIR)/libec_missing_version_la-ErasureCodePluginMissingVersion.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/ErasureCodePluginMissingVersion.cc' object='test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_version_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo `test -f 'test/erasure-code/ErasureCodePluginMissingVersion.cc' || echo '$(srcdir)/'`test/erasure-code/ErasureCodePluginMissingVersion.cc
+
+erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo: erasure-code/shec/ErasureCodePluginSelectShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginSelectShec.Tpo -c -o erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo `test -f 'erasure-code/shec/ErasureCodePlug [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginSelectShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginSelectShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginSelectShec.cc' object='erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo `test -f 'erasure-code/shec/ErasureCodePluginSelectShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodePluginSelectShec.cc
+
+erasure-code/libec_shec_generic_la-ErasureCode.lo: erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/libec_shec_generic_la-ErasureCode.lo -MD -MP -MF erasure-code/$(DEPDIR)/libec_shec_generic_la-ErasureCode.Tpo -c -o erasure-code/libec_shec_generic_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/E [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/libec_shec_generic_la-ErasureCode.Tpo erasure-code/$(DEPDIR)/libec_shec_generic_la-ErasureCode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/libec_shec_generic_la-ErasureCode.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/libec_shec_generic_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+
+erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo: erasure-code/shec/ErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodePluginShec.Tpo -c -o erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/E [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodePluginShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodePluginShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginShec.cc' object='erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodePluginShec.cc
+
+erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo: erasure-code/shec/ErasureCodeShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShec.Tpo -c -o erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShec.cc' object='erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShec.cc
+
+erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo: erasure-code/shec/ErasureCodeShecTableCache.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShecTableCache.Tpo -c -o erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo `test -f 'erasure [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShecTableCache.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_generic_la-ErasureCodeShecTableCache.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShecTableCache.cc' object='erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShecTableCache.cc
+
+erasure-code/libec_shec_neon_la-ErasureCode.lo: erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/libec_shec_neon_la-ErasureCode.lo -MD -MP -MF erasure-code/$(DEPDIR)/libec_shec_neon_la-ErasureCode.Tpo -c -o erasure-code/libec_shec_neon_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/libec_shec_neon_la-ErasureCode.Tpo erasure-code/$(DEPDIR)/libec_shec_neon_la-ErasureCode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/libec_shec_neon_la-ErasureCode.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/libec_shec_neon_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+
+erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo: erasure-code/shec/ErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodePluginShec.Tpo -c -o erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePl [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodePluginShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodePluginShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginShec.cc' object='erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodePluginShec.cc
+
+erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo: erasure-code/shec/ErasureCodeShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShec.Tpo -c -o erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$( [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShec.cc' object='erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShec.cc
+
+erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo: erasure-code/shec/ErasureCodeShecTableCache.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShecTableCache.Tpo -c -o erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/E [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShecTableCache.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_neon_la-ErasureCodeShecTableCache.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShecTableCache.cc' object='erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShecTableCache.cc
+
+erasure-code/libec_shec_sse3_la-ErasureCode.lo: erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/libec_shec_sse3_la-ErasureCode.lo -MD -MP -MF erasure-code/$(DEPDIR)/libec_shec_sse3_la-ErasureCode.Tpo -c -o erasure-code/libec_shec_sse3_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/libec_shec_sse3_la-ErasureCode.Tpo erasure-code/$(DEPDIR)/libec_shec_sse3_la-ErasureCode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/libec_shec_sse3_la-ErasureCode.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/libec_shec_sse3_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+
+erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo: erasure-code/shec/ErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodePluginShec.Tpo -c -o erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePl [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodePluginShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodePluginShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginShec.cc' object='erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_entry_point_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo `test -f 'test/erasure-code/ErasureCodePluginMissingEntryPoint.cc' || echo '$(srcdir)/'`test/erasure-code/ErasureCodePluginMissingEntryPoint.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodePluginShec.cc
 
-test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo: test/erasure-code/ErasureCodePluginMissingVersion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_version_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_missing_version_la-ErasureCodePluginMissingVersion.Tpo -c -o test/erasure-code/libec_missing_version_la-ErasureCodePluginMissi [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_missing_version_la-ErasureCodePluginMissingVersion.Tpo test/erasure-code/$(DEPDIR)/libec_missing_version_la-ErasureCodePluginMissingVersion.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/ErasureCodePluginMissingVersion.cc' object='test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo: erasure-code/shec/ErasureCodeShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShec.Tpo -c -o erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$( [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShec.cc' object='erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_missing_version_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo `test -f 'test/erasure-code/ErasureCodePluginMissingVersion.cc' || echo '$(srcdir)/'`test/erasure-code/ErasureCodePluginMissingVersion.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShec.cc
 
-erasure-code/libec_shec_la-ErasureCode.lo: erasure-code/ErasureCode.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/libec_shec_la-ErasureCode.lo -MD -MP -MF erasure-code/$(DEPDIR)/libec_shec_la-ErasureCode.Tpo -c -o erasure-code/libec_shec_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/libec_shec_la-ErasureCode.Tpo erasure-code/$(DEPDIR)/libec_shec_la-ErasureCode.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/libec_shec_la-ErasureCode.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo: erasure-code/shec/ErasureCodeShecTableCache.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShecTableCache.Tpo -c -o erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/E [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShecTableCache.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse3_la-ErasureCodeShecTableCache.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShecTableCache.cc' object='erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/libec_shec_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShecTableCache.cc
 
-erasure-code/shec/libec_shec_la-ErasureCodePluginShec.lo: erasure-code/shec/ErasureCodePluginShec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_la-ErasureCodePluginShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginShec.Tpo -c -o erasure-code/shec/libec_shec_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' || echo [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodePluginShec.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginShec.cc' object='erasure-code/shec/libec_shec_la-ErasureCodePluginShec.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/libec_shec_sse4_la-ErasureCode.lo: erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/libec_shec_sse4_la-ErasureCode.lo -MD -MP -MF erasure-code/$(DEPDIR)/libec_shec_sse4_la-ErasureCode.Tpo -c -o erasure-code/libec_shec_sse4_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/libec_shec_sse4_la-ErasureCode.Tpo erasure-code/$(DEPDIR)/libec_shec_sse4_la-ErasureCode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/libec_shec_sse4_la-ErasureCode.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodePluginShec.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/libec_shec_sse4_la-ErasureCode.lo `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
 
-erasure-code/shec/libec_shec_la-ErasureCodeShec.lo: erasure-code/shec/ErasureCodeShec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_la-ErasureCodeShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShec.Tpo -c -o erasure-code/shec/libec_shec_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`erasure-co [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShec.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShec.cc' object='erasure-code/shec/libec_shec_la-ErasureCodeShec.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo: erasure-code/shec/ErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodePluginShec.Tpo -c -o erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePl [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodePluginShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodePluginShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginShec.cc' object='erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShec.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodePluginShec.cc
 
-erasure-code/shec/libec_shec_la-ErasureCodeShecTableCache.lo: erasure-code/shec/ErasureCodeShecTableCache.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_la-ErasureCodeShecTableCache.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShecTableCache.Tpo -c -o erasure-code/shec/libec_shec_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/ErasureCodeShecTableC [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShecTableCache.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_la-ErasureCodeShecTableCache.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShecTableCache.cc' object='erasure-code/shec/libec_shec_la-ErasureCodeShecTableCache.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo: erasure-code/shec/ErasureCodeShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShec.Tpo -c -o erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$( [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShec.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShec.cc' object='erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShecTableCache.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShec.cc
 
-erasure-code/shec/libec_shec_la-shec.lo: erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_la-shec.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_la-shec.Tpo -c -o erasure-code/shec/libec_shec_la-shec.lo `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_la-shec.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_la-shec.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/shec.cc' object='erasure-code/shec/libec_shec_la-shec.lo' libtool=yes @AMDEPBACKSLASH@
+erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo: erasure-code/shec/ErasureCodeShecTableCache.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo -MD -MP -MF erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShecTableCache.Tpo -c -o erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/E [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShecTableCache.Tpo erasure-code/shec/$(DEPDIR)/libec_shec_sse4_la-ErasureCodeShecTableCache.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShecTableCache.cc' object='erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_la-shec.lo `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo `test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShecTableCache.cc
 
 test/erasure-code/libec_test_jerasure_generic_la-TestJerasurePluginGeneric.lo: test/erasure-code/TestJerasurePluginGeneric.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_jerasure_generic_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_test_jerasure_generic_la-TestJerasurePluginGeneric.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_test_jerasure_generic_la-TestJerasurePluginGeneric.Tpo -c -o test/erasure-code/libec_test_jerasure_generic_la-TestJerasu [...]
@@ -16536,6 +19352,34 @@ test/erasure-code/libec_test_jerasure_sse4_la-TestJerasurePluginSSE4.lo: test/er
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_jerasure_sse4_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_test_jerasure_sse4_la-TestJerasurePluginSSE4.lo `test -f 'test/erasure-code/TestJerasurePluginSSE4.cc' || echo '$(srcdir)/'`test/erasure-code/TestJerasurePluginSSE4.cc
 
+test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo: test/erasure-code/TestShecPluginGeneric.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_test_shec_generic_la-TestShecPluginGeneric.Tpo -c -o test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo `test -f  [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_test_shec_generic_la-TestShecPluginGeneric.Tpo test/erasure-code/$(DEPDIR)/libec_test_shec_generic_la-TestShecPluginGeneric.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestShecPluginGeneric.cc' object='test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_generic_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo `test -f 'test/erasure-code/TestShecPluginGeneric.cc' || echo '$(srcdir)/'`test/erasure-code/TestShecPluginGeneric.cc
+
+test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo: test/erasure-code/TestShecPluginNEON.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_test_shec_neon_la-TestShecPluginNEON.Tpo -c -o test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo `test -f 'test/erasure-code/Te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_test_shec_neon_la-TestShecPluginNEON.Tpo test/erasure-code/$(DEPDIR)/libec_test_shec_neon_la-TestShecPluginNEON.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestShecPluginNEON.cc' object='test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_neon_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo `test -f 'test/erasure-code/TestShecPluginNEON.cc' || echo '$(srcdir)/'`test/erasure-code/TestShecPluginNEON.cc
+
+test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo: test/erasure-code/TestShecPluginSSE3.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_test_shec_sse3_la-TestShecPluginSSE3.Tpo -c -o test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo `test -f 'test/erasure-code/Te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_test_shec_sse3_la-TestShecPluginSSE3.Tpo test/erasure-code/$(DEPDIR)/libec_test_shec_sse3_la-TestShecPluginSSE3.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestShecPluginSSE3.cc' object='test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_sse3_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo `test -f 'test/erasure-code/TestShecPluginSSE3.cc' || echo '$(srcdir)/'`test/erasure-code/TestShecPluginSSE3.cc
+
+test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo: test/erasure-code/TestShecPluginSSE4.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo -MD -MP -MF test/erasure-code/$(DEPDIR)/libec_test_shec_sse4_la-TestShecPluginSSE4.Tpo -c -o test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo `test -f 'test/erasure-code/Te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/libec_test_shec_sse4_la-TestShecPluginSSE4.Tpo test/erasure-code/$(DEPDIR)/libec_test_shec_sse4_la-TestShecPluginSSE4.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestShecPluginSSE4.cc' object='test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo `test -f 'test/erasure-code/TestShecPluginSSE4.cc' || echo '$(srcdir)/'`test/erasure-code/TestShecPluginSSE4.cc
+
 os/libos_la-chain_xattr.lo: os/chain_xattr.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-chain_xattr.lo -MD -MP -MF os/$(DEPDIR)/libos_la-chain_xattr.Tpo -c -o os/libos_la-chain_xattr.lo `test -f 'os/chain_xattr.cc' || echo '$(srcdir)/'`os/chain_xattr.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-chain_xattr.Tpo os/$(DEPDIR)/libos_la-chain_xattr.Plo
@@ -16543,6 +19387,13 @@ os/libos_la-chain_xattr.lo: os/chain_xattr.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-chain_xattr.lo `test -f 'os/chain_xattr.cc' || echo '$(srcdir)/'`os/chain_xattr.cc
 
+os/fs/libos_la-FS.lo: os/fs/FS.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/fs/libos_la-FS.lo -MD -MP -MF os/fs/$(DEPDIR)/libos_la-FS.Tpo -c -o os/fs/libos_la-FS.lo `test -f 'os/fs/FS.cc' || echo '$(srcdir)/'`os/fs/FS.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/fs/$(DEPDIR)/libos_la-FS.Tpo os/fs/$(DEPDIR)/libos_la-FS.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/fs/FS.cc' object='os/fs/libos_la-FS.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/fs/libos_la-FS.lo `test -f 'os/fs/FS.cc' || echo '$(srcdir)/'`os/fs/FS.cc
+
 os/libos_la-DBObjectMap.lo: os/DBObjectMap.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-DBObjectMap.lo -MD -MP -MF os/$(DEPDIR)/libos_la-DBObjectMap.Tpo -c -o os/libos_la-DBObjectMap.lo `test -f 'os/DBObjectMap.cc' || echo '$(srcdir)/'`os/DBObjectMap.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-DBObjectMap.Tpo os/$(DEPDIR)/libos_la-DBObjectMap.Plo
@@ -16571,13 +19422,6 @@ os/libos_la-FileStore.lo: os/FileStore.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-FileStore.lo `test -f 'os/FileStore.cc' || echo '$(srcdir)/'`os/FileStore.cc
 
-os/libos_la-FlatIndex.lo: os/FlatIndex.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-FlatIndex.lo -MD -MP -MF os/$(DEPDIR)/libos_la-FlatIndex.Tpo -c -o os/libos_la-FlatIndex.lo `test -f 'os/FlatIndex.cc' || echo '$(srcdir)/'`os/FlatIndex.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-FlatIndex.Tpo os/$(DEPDIR)/libos_la-FlatIndex.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/FlatIndex.cc' object='os/libos_la-FlatIndex.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-FlatIndex.lo `test -f 'os/FlatIndex.cc' || echo '$(srcdir)/'`os/FlatIndex.cc
-
 os/libos_la-GenericFileStoreBackend.lo: os/GenericFileStoreBackend.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-GenericFileStoreBackend.lo -MD -MP -MF os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Tpo -c -o os/libos_la-GenericFileStoreBackend.lo `test -f 'os/GenericFileStoreBackend.cc' || echo '$(srcdir)/'`os/GenericFileStoreBackend.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Tpo os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Plo
@@ -16669,6 +19513,20 @@ os/libos_la-BtrfsFileStoreBackend.lo: os/BtrfsFileStoreBackend.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-BtrfsFileStoreBackend.lo `test -f 'os/BtrfsFileStoreBackend.cc' || echo '$(srcdir)/'`os/BtrfsFileStoreBackend.cc
 
+os/newstore/libos_la-NewStore.lo: os/newstore/NewStore.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/newstore/libos_la-NewStore.lo -MD -MP -MF os/newstore/$(DEPDIR)/libos_la-NewStore.Tpo -c -o os/newstore/libos_la-NewStore.lo `test -f 'os/newstore/NewStore.cc' || echo '$(srcdir)/'`os/newstore/NewStore.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/newstore/$(DEPDIR)/libos_la-NewStore.Tpo os/newstore/$(DEPDIR)/libos_la-NewStore.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/newstore/NewStore.cc' object='os/newstore/libos_la-NewStore.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/newstore/libos_la-NewStore.lo `test -f 'os/newstore/NewStore.cc' || echo '$(srcdir)/'`os/newstore/NewStore.cc
+
+os/fs/libos_la-XFS.lo: os/fs/XFS.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/fs/libos_la-XFS.lo -MD -MP -MF os/fs/$(DEPDIR)/libos_la-XFS.Tpo -c -o os/fs/libos_la-XFS.lo `test -f 'os/fs/XFS.cc' || echo '$(srcdir)/'`os/fs/XFS.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/fs/$(DEPDIR)/libos_la-XFS.Tpo os/fs/$(DEPDIR)/libos_la-XFS.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/fs/XFS.cc' object='os/fs/libos_la-XFS.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/fs/libos_la-XFS.lo `test -f 'os/fs/XFS.cc' || echo '$(srcdir)/'`os/fs/XFS.cc
+
 os/libos_la-XfsFileStoreBackend.lo: os/XfsFileStoreBackend.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-XfsFileStoreBackend.lo -MD -MP -MF os/$(DEPDIR)/libos_la-XfsFileStoreBackend.Tpo -c -o os/libos_la-XfsFileStoreBackend.lo `test -f 'os/XfsFileStoreBackend.cc' || echo '$(srcdir)/'`os/XfsFileStoreBackend.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-XfsFileStoreBackend.Tpo os/$(DEPDIR)/libos_la-XfsFileStoreBackend.Plo
@@ -16704,6 +19562,13 @@ os/libos_types_la-Transaction.lo: os/Transaction.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_types_la-Transaction.lo `test -f 'os/Transaction.cc' || echo '$(srcdir)/'`os/Transaction.cc
 
+os/newstore/libos_types_la-newstore_types.lo: os/newstore/newstore_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_la_CXXFLAGS) $(CXXFLAGS) -MT os/newstore/libos_types_la-newstore_types.lo -MD -MP -MF os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Tpo -c -o os/newstore/libos_types_la-newstore_types.lo `test -f 'os/newstore/newstore_types.cc' || echo '$(srcdir)/'`os/newstore/newstore_types.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Tpo os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/newstore/newstore_types.cc' object='os/newstore/libos_types_la-newstore_types.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_la_CXXFLAGS) $(CXXFLAGS) -c -o os/newstore/libos_types_la-newstore_types.lo `test -f 'os/newstore/newstore_types.cc' || echo '$(srcdir)/'`os/newstore/newstore_types.cc
+
 osd/libosd_la-PG.lo: osd/PG.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-PG.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-PG.Tpo -c -o osd/libosd_la-PG.lo `test -f 'osd/PG.cc' || echo '$(srcdir)/'`osd/PG.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-PG.Tpo osd/$(DEPDIR)/libosd_la-PG.Plo
@@ -16753,13 +19618,6 @@ osd/libosd_la-PGBackend.lo: osd/PGBackend.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-PGBackend.lo `test -f 'osd/PGBackend.cc' || echo '$(srcdir)/'`osd/PGBackend.cc
 
-osd/libosd_la-Ager.lo: osd/Ager.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-Ager.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-Ager.Tpo -c -o osd/libosd_la-Ager.lo `test -f 'osd/Ager.cc' || echo '$(srcdir)/'`osd/Ager.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-Ager.Tpo osd/$(DEPDIR)/libosd_la-Ager.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/Ager.cc' object='osd/libosd_la-Ager.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-Ager.lo `test -f 'osd/Ager.cc' || echo '$(srcdir)/'`osd/Ager.cc
-
 osd/libosd_la-HitSet.lo: osd/HitSet.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-HitSet.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-HitSet.Tpo -c -o osd/libosd_la-HitSet.lo `test -f 'osd/HitSet.cc' || echo '$(srcdir)/'`osd/HitSet.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-HitSet.Tpo osd/$(DEPDIR)/libosd_la-HitSet.Plo
@@ -17187,6 +20045,13 @@ rgw/librgw_la-rgw_dencoder.lo: rgw/rgw_dencoder.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -c -o rgw/librgw_la-rgw_dencoder.lo `test -f 'rgw/rgw_dencoder.cc' || echo '$(srcdir)/'`rgw/rgw_dencoder.cc
 
+rgw/librgw_la-rgw_object_expirer_core.lo: rgw/rgw_object_expirer_core.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -MT rgw/librgw_la-rgw_object_expirer_core.lo -MD -MP -MF rgw/$(DEPDIR)/librgw_la-rgw_object_expirer_core.Tpo -c -o rgw/librgw_la-rgw_object_expirer_core.lo `test -f 'rgw/rgw_object_expirer_core.cc' || echo '$(srcdir)/'`rgw/rgw_object_expirer_core.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/librgw_la-rgw_object_expirer_core.Tpo rgw/$(DEPDIR)/librgw_la-rgw_object_expirer_core.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_object_expirer_core.cc' object='rgw/librgw_la-rgw_object_expirer_core.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librgw_la_CXXFLAGS) $(CXXFLAGS) -c -o rgw/librgw_la-rgw_object_expirer_core.lo `test -f 'rgw/rgw_object_expirer_core.cc' || echo '$(srcdir)/'`rgw/rgw_object_expirer_core.cc
+
 test/encoding/ceph_dencoder-ceph_dencoder.o: test/encoding/ceph_dencoder.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT test/encoding/ceph_dencoder-ceph_dencoder.o -MD -MP -MF test/encoding/$(DEPDIR)/ceph_dencoder-ceph_dencoder.Tpo -c -o test/encoding/ceph_dencoder-ceph_dencoder.o `test -f 'test/encoding/ceph_dencoder.cc' || echo '$(srcdir)/'`test/encoding/ceph_dencoder.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/encoding/$(DEPDIR)/ceph_dencoder-ceph_dencoder.Tpo test/encoding/$(DEPDIR)/ceph_dencoder-ceph_dencoder.Po
@@ -17215,19 +20080,33 @@ mds/ceph_dencoder-Capability.obj: mds/Capability.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-Capability.obj `if test -f 'mds/Capability.cc'; then $(CYGPATH_W) 'mds/Capability.cc'; else $(CYGPATH_W) '$(srcdir)/mds/Capability.cc'; fi`
 
-mds/ceph_dencoder-MDS.o: mds/MDS.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-MDS.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-MDS.Tpo -c -o mds/ceph_dencoder-MDS.o `test -f 'mds/MDS.cc' || echo '$(srcdir)/'`mds/MDS.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-MDS.Tpo mds/$(DEPDIR)/ceph_dencoder-MDS.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/MDS.cc' object='mds/ceph_dencoder-MDS.o' libtool=no @AMDEPBACKSLASH@
+mds/ceph_dencoder-MDSDaemon.o: mds/MDSDaemon.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-MDSDaemon.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-MDSDaemon.Tpo -c -o mds/ceph_dencoder-MDSDaemon.o `test -f 'mds/MDSDaemon.cc' || echo '$(srcdir)/'`mds/MDSDaemon.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-MDSDaemon.Tpo mds/$(DEPDIR)/ceph_dencoder-MDSDaemon.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/MDSDaemon.cc' object='mds/ceph_dencoder-MDSDaemon.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDS.o `test -f 'mds/MDS.cc' || echo '$(srcdir)/'`mds/MDS.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDSDaemon.o `test -f 'mds/MDSDaemon.cc' || echo '$(srcdir)/'`mds/MDSDaemon.cc
 
-mds/ceph_dencoder-MDS.obj: mds/MDS.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-MDS.obj -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-MDS.Tpo -c -o mds/ceph_dencoder-MDS.obj `if test -f 'mds/MDS.cc'; then $(CYGPATH_W) 'mds/MDS.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDS.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-MDS.Tpo mds/$(DEPDIR)/ceph_dencoder-MDS.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/MDS.cc' object='mds/ceph_dencoder-MDS.obj' libtool=no @AMDEPBACKSLASH@
+mds/ceph_dencoder-MDSDaemon.obj: mds/MDSDaemon.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-MDSDaemon.obj -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-MDSDaemon.Tpo -c -o mds/ceph_dencoder-MDSDaemon.obj `if test -f 'mds/MDSDaemon.cc'; then $(CYGPATH_W) 'mds/MDSDaemon.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDSDaemon.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-MDSDaemon.Tpo mds/$(DEPDIR)/ceph_dencoder-MDSDaemon.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/MDSDaemon.cc' object='mds/ceph_dencoder-MDSDaemon.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDS.obj `if test -f 'mds/MDS.cc'; then $(CYGPATH_W) 'mds/MDS.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDS.cc'; fi`
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDSDaemon.obj `if test -f 'mds/MDSDaemon.cc'; then $(CYGPATH_W) 'mds/MDSDaemon.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDSDaemon.cc'; fi`
+
+mds/ceph_dencoder-MDSRank.o: mds/MDSRank.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-MDSRank.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-MDSRank.Tpo -c -o mds/ceph_dencoder-MDSRank.o `test -f 'mds/MDSRank.cc' || echo '$(srcdir)/'`mds/MDSRank.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-MDSRank.Tpo mds/$(DEPDIR)/ceph_dencoder-MDSRank.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/MDSRank.cc' object='mds/ceph_dencoder-MDSRank.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDSRank.o `test -f 'mds/MDSRank.cc' || echo '$(srcdir)/'`mds/MDSRank.cc
+
+mds/ceph_dencoder-MDSRank.obj: mds/MDSRank.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-MDSRank.obj -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-MDSRank.Tpo -c -o mds/ceph_dencoder-MDSRank.obj `if test -f 'mds/MDSRank.cc'; then $(CYGPATH_W) 'mds/MDSRank.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDSRank.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-MDSRank.Tpo mds/$(DEPDIR)/ceph_dencoder-MDSRank.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/MDSRank.cc' object='mds/ceph_dencoder-MDSRank.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDSRank.obj `if test -f 'mds/MDSRank.cc'; then $(CYGPATH_W) 'mds/MDSRank.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDSRank.cc'; fi`
 
 mds/ceph_dencoder-Beacon.o: mds/Beacon.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-Beacon.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-Beacon.Tpo -c -o mds/ceph_dencoder-Beacon.o `test -f 'mds/Beacon.cc' || echo '$(srcdir)/'`mds/Beacon.cc
@@ -17313,6 +20192,20 @@ mds/ceph_dencoder-RecoveryQueue.obj: mds/RecoveryQueue.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-RecoveryQueue.obj `if test -f 'mds/RecoveryQueue.cc'; then $(CYGPATH_W) 'mds/RecoveryQueue.cc'; else $(CYGPATH_W) '$(srcdir)/mds/RecoveryQueue.cc'; fi`
 
+mds/ceph_dencoder-StrayManager.o: mds/StrayManager.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-StrayManager.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-StrayManager.Tpo -c -o mds/ceph_dencoder-StrayManager.o `test -f 'mds/StrayManager.cc' || echo '$(srcdir)/'`mds/StrayManager.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-StrayManager.Tpo mds/$(DEPDIR)/ceph_dencoder-StrayManager.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/StrayManager.cc' object='mds/ceph_dencoder-StrayManager.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-StrayManager.o `test -f 'mds/StrayManager.cc' || echo '$(srcdir)/'`mds/StrayManager.cc
+
+mds/ceph_dencoder-StrayManager.obj: mds/StrayManager.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-StrayManager.obj -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-StrayManager.Tpo -c -o mds/ceph_dencoder-StrayManager.obj `if test -f 'mds/StrayManager.cc'; then $(CYGPATH_W) 'mds/StrayManager.cc'; else $(CYGPATH_W) '$(srcdir)/mds/StrayManager.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-StrayManager.Tpo mds/$(DEPDIR)/ceph_dencoder-StrayManager.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/StrayManager.cc' object='mds/ceph_dencoder-StrayManager.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-StrayManager.obj `if test -f 'mds/StrayManager.cc'; then $(CYGPATH_W) 'mds/StrayManager.cc'; else $(CYGPATH_W) '$(srcdir)/mds/StrayManager.cc'; fi`
+
 mds/ceph_dencoder-Locker.o: mds/Locker.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-Locker.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-Locker.Tpo -c -o mds/ceph_dencoder-Locker.o `test -f 'mds/Locker.cc' || echo '$(srcdir)/'`mds/Locker.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-Locker.Tpo mds/$(DEPDIR)/ceph_dencoder-Locker.Po
@@ -17481,6 +20374,20 @@ mds/ceph_dencoder-MDSTableServer.obj: mds/MDSTableServer.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDSTableServer.obj `if test -f 'mds/MDSTableServer.cc'; then $(CYGPATH_W) 'mds/MDSTableServer.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDSTableServer.cc'; fi`
 
+mds/ceph_dencoder-SimpleLock.o: mds/SimpleLock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-SimpleLock.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Tpo -c -o mds/ceph_dencoder-SimpleLock.o `test -f 'mds/SimpleLock.cc' || echo '$(srcdir)/'`mds/SimpleLock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Tpo mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/SimpleLock.cc' object='mds/ceph_dencoder-SimpleLock.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-SimpleLock.o `test -f 'mds/SimpleLock.cc' || echo '$(srcdir)/'`mds/SimpleLock.cc
+
+mds/ceph_dencoder-SimpleLock.obj: mds/SimpleLock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-SimpleLock.obj -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Tpo -c -o mds/ceph_dencoder-SimpleLock.obj `if test -f 'mds/SimpleLock.cc'; then $(CYGPATH_W) 'mds/SimpleLock.cc'; else $(CYGPATH_W) '$(srcdir)/mds/SimpleLock.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Tpo mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/SimpleLock.cc' object='mds/ceph_dencoder-SimpleLock.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-SimpleLock.obj `if test -f 'mds/SimpleLock.cc'; then $(CYGPATH_W) 'mds/SimpleLock.cc'; else $(CYGPATH_W) '$(srcdir)/mds/SimpleLock.cc'; fi`
+
 mds/ceph_dencoder-SnapRealm.o: mds/SnapRealm.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-SnapRealm.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-SnapRealm.Tpo -c -o mds/ceph_dencoder-SnapRealm.o `test -f 'mds/SnapRealm.cc' || echo '$(srcdir)/'`mds/SnapRealm.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-SnapRealm.Tpo mds/$(DEPDIR)/ceph_dencoder-SnapRealm.Po
@@ -17705,6 +20612,62 @@ tools/ceph_kvstore_tool-ceph_kvstore_tool.obj: tools/ceph_kvstore_tool.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_kvstore_tool_CXXFLAGS) $(CXXFLAGS) -c -o tools/ceph_kvstore_tool-ceph_kvstore_tool.obj `if test -f 'tools/ceph_kvstore_tool.cc'; then $(CYGPATH_W) 'tools/ceph_kvstore_tool.cc'; else $(CYGPATH_W) '$(srcdir)/tools/ceph_kvstore_tool.cc'; fi`
 
+test/ceph_perf_local-perf_local.o: test/perf_local.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -MT test/ceph_perf_local-perf_local.o -MD -MP -MF test/$(DEPDIR)/ceph_perf_local-perf_local.Tpo -c -o test/ceph_perf_local-perf_local.o `test -f 'test/perf_local.cc' || echo '$(srcdir)/'`test/perf_local.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/ceph_perf_local-perf_local.Tpo test/$(DEPDIR)/ceph_perf_local-perf_local.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/perf_local.cc' object='test/ceph_perf_local-perf_local.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -c -o test/ceph_perf_local-perf_local.o `test -f 'test/perf_local.cc' || echo '$(srcdir)/'`test/perf_local.cc
+
+test/ceph_perf_local-perf_local.obj: test/perf_local.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -MT test/ceph_perf_local-perf_local.obj -MD -MP -MF test/$(DEPDIR)/ceph_perf_local-perf_local.Tpo -c -o test/ceph_perf_local-perf_local.obj `if test -f 'test/perf_local.cc'; then $(CYGPATH_W) 'test/perf_local.cc'; else $(CYGPATH_W) '$(srcdir)/test/perf_local.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/ceph_perf_local-perf_local.Tpo test/$(DEPDIR)/ceph_perf_local-perf_local.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/perf_local.cc' object='test/ceph_perf_local-perf_local.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -c -o test/ceph_perf_local-perf_local.obj `if test -f 'test/perf_local.cc'; then $(CYGPATH_W) 'test/perf_local.cc'; else $(CYGPATH_W) '$(srcdir)/test/perf_local.cc'; fi`
+
+test/ceph_perf_local-perf_helper.o: test/perf_helper.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -MT test/ceph_perf_local-perf_helper.o -MD -MP -MF test/$(DEPDIR)/ceph_perf_local-perf_helper.Tpo -c -o test/ceph_perf_local-perf_helper.o `test -f 'test/perf_helper.cc' || echo '$(srcdir)/'`test/perf_helper.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/ceph_perf_local-perf_helper.Tpo test/$(DEPDIR)/ceph_perf_local-perf_helper.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/perf_helper.cc' object='test/ceph_perf_local-perf_helper.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -c -o test/ceph_perf_local-perf_helper.o `test -f 'test/perf_helper.cc' || echo '$(srcdir)/'`test/perf_helper.cc
+
+test/ceph_perf_local-perf_helper.obj: test/perf_helper.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -MT test/ceph_perf_local-perf_helper.obj -MD -MP -MF test/$(DEPDIR)/ceph_perf_local-perf_helper.Tpo -c -o test/ceph_perf_local-perf_helper.obj `if test -f 'test/perf_helper.cc'; then $(CYGPATH_W) 'test/perf_helper.cc'; else $(CYGPATH_W) '$(srcdir)/test/perf_helper.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/ceph_perf_local-perf_helper.Tpo test/$(DEPDIR)/ceph_perf_local-perf_helper.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/perf_helper.cc' object='test/ceph_perf_local-perf_helper.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) -c -o test/ceph_perf_local-perf_helper.obj `if test -f 'test/perf_helper.cc'; then $(CYGPATH_W) 'test/perf_helper.cc'; else $(CYGPATH_W) '$(srcdir)/test/perf_helper.cc'; fi`
+
+test/msgr/ceph_perf_msgr_client-perf_msgr_client.o: test/msgr/perf_msgr_client.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_client_CXXFLAGS) $(CXXFLAGS) -MT test/msgr/ceph_perf_msgr_client-perf_msgr_client.o -MD -MP -MF test/msgr/$(DEPDIR)/ceph_perf_msgr_client-perf_msgr_client.Tpo -c -o test/msgr/ceph_perf_msgr_client-perf_msgr_client.o `test -f 'test/msgr/perf_msgr_client.cc' || echo '$(srcdir)/'`test/msgr/perf_msgr_client.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/msgr/$(DEPDIR)/ceph_perf_msgr_client-perf_msgr_client.Tpo test/msgr/$(DEPDIR)/ceph_perf_msgr_client-perf_msgr_client.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/msgr/perf_msgr_client.cc' object='test/msgr/ceph_perf_msgr_client-perf_msgr_client.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_client_CXXFLAGS) $(CXXFLAGS) -c -o test/msgr/ceph_perf_msgr_client-perf_msgr_client.o `test -f 'test/msgr/perf_msgr_client.cc' || echo '$(srcdir)/'`test/msgr/perf_msgr_client.cc
+
+test/msgr/ceph_perf_msgr_client-perf_msgr_client.obj: test/msgr/perf_msgr_client.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_client_CXXFLAGS) $(CXXFLAGS) -MT test/msgr/ceph_perf_msgr_client-perf_msgr_client.obj -MD -MP -MF test/msgr/$(DEPDIR)/ceph_perf_msgr_client-perf_msgr_client.Tpo -c -o test/msgr/ceph_perf_msgr_client-perf_msgr_client.obj `if test -f 'test/msgr/perf_msgr_client.cc'; then $(CYGPATH_W) 'test/msgr/perf_msgr_client.cc'; else $(CYGPATH_W) '$(srcdir)/test/msgr/perf_msgr_cli [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/msgr/$(DEPDIR)/ceph_perf_msgr_client-perf_msgr_client.Tpo test/msgr/$(DEPDIR)/ceph_perf_msgr_client-perf_msgr_client.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/msgr/perf_msgr_client.cc' object='test/msgr/ceph_perf_msgr_client-perf_msgr_client.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_client_CXXFLAGS) $(CXXFLAGS) -c -o test/msgr/ceph_perf_msgr_client-perf_msgr_client.obj `if test -f 'test/msgr/perf_msgr_client.cc'; then $(CYGPATH_W) 'test/msgr/perf_msgr_client.cc'; else $(CYGPATH_W) '$(srcdir)/test/msgr/perf_msgr_client.cc'; fi`
+
+test/msgr/ceph_perf_msgr_server-perf_msgr_server.o: test/msgr/perf_msgr_server.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_server_CXXFLAGS) $(CXXFLAGS) -MT test/msgr/ceph_perf_msgr_server-perf_msgr_server.o -MD -MP -MF test/msgr/$(DEPDIR)/ceph_perf_msgr_server-perf_msgr_server.Tpo -c -o test/msgr/ceph_perf_msgr_server-perf_msgr_server.o `test -f 'test/msgr/perf_msgr_server.cc' || echo '$(srcdir)/'`test/msgr/perf_msgr_server.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/msgr/$(DEPDIR)/ceph_perf_msgr_server-perf_msgr_server.Tpo test/msgr/$(DEPDIR)/ceph_perf_msgr_server-perf_msgr_server.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/msgr/perf_msgr_server.cc' object='test/msgr/ceph_perf_msgr_server-perf_msgr_server.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_server_CXXFLAGS) $(CXXFLAGS) -c -o test/msgr/ceph_perf_msgr_server-perf_msgr_server.o `test -f 'test/msgr/perf_msgr_server.cc' || echo '$(srcdir)/'`test/msgr/perf_msgr_server.cc
+
+test/msgr/ceph_perf_msgr_server-perf_msgr_server.obj: test/msgr/perf_msgr_server.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_server_CXXFLAGS) $(CXXFLAGS) -MT test/msgr/ceph_perf_msgr_server-perf_msgr_server.obj -MD -MP -MF test/msgr/$(DEPDIR)/ceph_perf_msgr_server-perf_msgr_server.Tpo -c -o test/msgr/ceph_perf_msgr_server-perf_msgr_server.obj `if test -f 'test/msgr/perf_msgr_server.cc'; then $(CYGPATH_W) 'test/msgr/perf_msgr_server.cc'; else $(CYGPATH_W) '$(srcdir)/test/msgr/perf_msgr_ser [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/msgr/$(DEPDIR)/ceph_perf_msgr_server-perf_msgr_server.Tpo test/msgr/$(DEPDIR)/ceph_perf_msgr_server-perf_msgr_server.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/msgr/perf_msgr_server.cc' object='test/msgr/ceph_perf_msgr_server-perf_msgr_server.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_msgr_server_CXXFLAGS) $(CXXFLAGS) -c -o test/msgr/ceph_perf_msgr_server-perf_msgr_server.obj `if test -f 'test/msgr/perf_msgr_server.cc'; then $(CYGPATH_W) 'test/msgr/perf_msgr_server.cc'; else $(CYGPATH_W) '$(srcdir)/test/msgr/perf_msgr_server.cc'; fi`
+
 test/objectstore/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.o: test/objectstore/ObjectStoreTransactionBenchmark.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_perf_objectstore_CXXFLAGS) $(CXXFLAGS) -MT test/objectstore/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.o -MD -MP -MF test/objectstore/$(DEPDIR)/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.Tpo -c -o test/objectstore/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.o `test -f 'test/objectstore/ObjectStoreTransactionBenchmark.cc' || echo '$(srcdir)/'`te [...]
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/objectstore/$(DEPDIR)/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.Tpo test/objectstore/$(DEPDIR)/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.Po
@@ -17775,6 +20738,20 @@ test/cls_log/ceph_test_cls_log-test_cls_log.obj: test/cls_log/test_cls_log.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_log_CXXFLAGS) $(CXXFLAGS) -c -o test/cls_log/ceph_test_cls_log-test_cls_log.obj `if test -f 'test/cls_log/test_cls_log.cc'; then $(CYGPATH_W) 'test/cls_log/test_cls_log.cc'; else $(CYGPATH_W) '$(srcdir)/test/cls_log/test_cls_log.cc'; fi`
 
+test/cls_numops/ceph_test_cls_numops-test_cls_numops.o: test/cls_numops/test_cls_numops.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_numops_CXXFLAGS) $(CXXFLAGS) -MT test/cls_numops/ceph_test_cls_numops-test_cls_numops.o -MD -MP -MF test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Tpo -c -o test/cls_numops/ceph_test_cls_numops-test_cls_numops.o `test -f 'test/cls_numops/test_cls_numops.cc' || echo '$(srcdir)/'`test/cls_numops/test_cls_numops.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Tpo test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/cls_numops/test_cls_numops.cc' object='test/cls_numops/ceph_test_cls_numops-test_cls_numops.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_numops_CXXFLAGS) $(CXXFLAGS) -c -o test/cls_numops/ceph_test_cls_numops-test_cls_numops.o `test -f 'test/cls_numops/test_cls_numops.cc' || echo '$(srcdir)/'`test/cls_numops/test_cls_numops.cc
+
+test/cls_numops/ceph_test_cls_numops-test_cls_numops.obj: test/cls_numops/test_cls_numops.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_numops_CXXFLAGS) $(CXXFLAGS) -MT test/cls_numops/ceph_test_cls_numops-test_cls_numops.obj -MD -MP -MF test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Tpo -c -o test/cls_numops/ceph_test_cls_numops-test_cls_numops.obj `if test -f 'test/cls_numops/test_cls_numops.cc'; then $(CYGPATH_W) 'test/cls_numops/test_cls_numops.cc'; else $(CYGPATH_W) '$(srcdir)/te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Tpo test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/cls_numops/test_cls_numops.cc' object='test/cls_numops/ceph_test_cls_numops-test_cls_numops.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_numops_CXXFLAGS) $(CXXFLAGS) -c -o test/cls_numops/ceph_test_cls_numops-test_cls_numops.obj `if test -f 'test/cls_numops/test_cls_numops.cc'; then $(CYGPATH_W) 'test/cls_numops/test_cls_numops.cc'; else $(CYGPATH_W) '$(srcdir)/test/cls_numops/test_cls_numops.cc'; fi`
+
 test/cls_rbd/ceph_test_cls_rbd-test_cls_rbd.o: test/cls_rbd/test_cls_rbd.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_rbd_CXXFLAGS) $(CXXFLAGS) -MT test/cls_rbd/ceph_test_cls_rbd-test_cls_rbd.o -MD -MP -MF test/cls_rbd/$(DEPDIR)/ceph_test_cls_rbd-test_cls_rbd.Tpo -c -o test/cls_rbd/ceph_test_cls_rbd-test_cls_rbd.o `test -f 'test/cls_rbd/test_cls_rbd.cc' || echo '$(srcdir)/'`test/cls_rbd/test_cls_rbd.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/cls_rbd/$(DEPDIR)/ceph_test_cls_rbd-test_cls_rbd.Tpo test/cls_rbd/$(DEPDIR)/ceph_test_cls_rbd-test_cls_rbd.Po
@@ -17943,6 +20920,20 @@ test/filestore/ceph_test_filestore-TestFileStore.obj: test/filestore/TestFileSto
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_filestore_CXXFLAGS) $(CXXFLAGS) -c -o test/filestore/ceph_test_filestore-TestFileStore.obj `if test -f 'test/filestore/TestFileStore.cc'; then $(CYGPATH_W) 'test/filestore/TestFileStore.cc'; else $(CYGPATH_W) '$(srcdir)/test/filestore/TestFileStore.cc'; fi`
 
+test/objectstore/ceph_test_keyvaluedb-test_kv.o: test/objectstore/test_kv.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_keyvaluedb_CXXFLAGS) $(CXXFLAGS) -MT test/objectstore/ceph_test_keyvaluedb-test_kv.o -MD -MP -MF test/objectstore/$(DEPDIR)/ceph_test_keyvaluedb-test_kv.Tpo -c -o test/objectstore/ceph_test_keyvaluedb-test_kv.o `test -f 'test/objectstore/test_kv.cc' || echo '$(srcdir)/'`test/objectstore/test_kv.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/objectstore/$(DEPDIR)/ceph_test_keyvaluedb-test_kv.Tpo test/objectstore/$(DEPDIR)/ceph_test_keyvaluedb-test_kv.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/objectstore/test_kv.cc' object='test/objectstore/ceph_test_keyvaluedb-test_kv.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_keyvaluedb_CXXFLAGS) $(CXXFLAGS) -c -o test/objectstore/ceph_test_keyvaluedb-test_kv.o `test -f 'test/objectstore/test_kv.cc' || echo '$(srcdir)/'`test/objectstore/test_kv.cc
+
+test/objectstore/ceph_test_keyvaluedb-test_kv.obj: test/objectstore/test_kv.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_keyvaluedb_CXXFLAGS) $(CXXFLAGS) -MT test/objectstore/ceph_test_keyvaluedb-test_kv.obj -MD -MP -MF test/objectstore/$(DEPDIR)/ceph_test_keyvaluedb-test_kv.Tpo -c -o test/objectstore/ceph_test_keyvaluedb-test_kv.obj `if test -f 'test/objectstore/test_kv.cc'; then $(CYGPATH_W) 'test/objectstore/test_kv.cc'; else $(CYGPATH_W) '$(srcdir)/test/objectstore/test_kv.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/objectstore/$(DEPDIR)/ceph_test_keyvaluedb-test_kv.Tpo test/objectstore/$(DEPDIR)/ceph_test_keyvaluedb-test_kv.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/objectstore/test_kv.cc' object='test/objectstore/ceph_test_keyvaluedb-test_kv.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_keyvaluedb_CXXFLAGS) $(CXXFLAGS) -c -o test/objectstore/ceph_test_keyvaluedb-test_kv.obj `if test -f 'test/objectstore/test_kv.cc'; then $(CYGPATH_W) 'test/objectstore/test_kv.cc'; else $(CYGPATH_W) '$(srcdir)/test/objectstore/test_kv.cc'; fi`
+
 test/ObjectMap/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.o: test/ObjectMap/test_keyvaluedb_atomicity.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_keyvaluedb_atomicity_CXXFLAGS) $(CXXFLAGS) -MT test/ObjectMap/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.o -MD -MP -MF test/ObjectMap/$(DEPDIR)/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.Tpo -c -o test/ObjectMap/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.o `test -f 'test/ObjectMap/test_keyvaluedb_atomicity.cc' || echo '$(srcdir)/ [...]
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/ObjectMap/$(DEPDIR)/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.Tpo test/ObjectMap/$(DEPDIR)/ceph_test_keyvaluedb_atomicity-test_keyvaluedb_atomicity.Po
@@ -18041,6 +21032,20 @@ test/libcephfs/ceph_test_libcephfs-multiclient.obj: test/libcephfs/multiclient.c
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -c -o test/libcephfs/ceph_test_libcephfs-multiclient.obj `if test -f 'test/libcephfs/multiclient.cc'; then $(CYGPATH_W) 'test/libcephfs/multiclient.cc'; else $(CYGPATH_W) '$(srcdir)/test/libcephfs/multiclient.cc'; fi`
 
+test/libcephfs/ceph_test_libcephfs-flock.o: test/libcephfs/flock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -MT test/libcephfs/ceph_test_libcephfs-flock.o -MD -MP -MF test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Tpo -c -o test/libcephfs/ceph_test_libcephfs-flock.o `test -f 'test/libcephfs/flock.cc' || echo '$(srcdir)/'`test/libcephfs/flock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Tpo test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/libcephfs/flock.cc' object='test/libcephfs/ceph_test_libcephfs-flock.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -c -o test/libcephfs/ceph_test_libcephfs-flock.o `test -f 'test/libcephfs/flock.cc' || echo '$(srcdir)/'`test/libcephfs/flock.cc
+
+test/libcephfs/ceph_test_libcephfs-flock.obj: test/libcephfs/flock.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -MT test/libcephfs/ceph_test_libcephfs-flock.obj -MD -MP -MF test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Tpo -c -o test/libcephfs/ceph_test_libcephfs-flock.obj `if test -f 'test/libcephfs/flock.cc'; then $(CYGPATH_W) 'test/libcephfs/flock.cc'; else $(CYGPATH_W) '$(srcdir)/test/libcephfs/flock.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Tpo test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/libcephfs/flock.cc' object='test/libcephfs/ceph_test_libcephfs-flock.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -c -o test/libcephfs/ceph_test_libcephfs-flock.obj `if test -f 'test/libcephfs/flock.cc'; then $(CYGPATH_W) 'test/libcephfs/flock.cc'; else $(CYGPATH_W) '$(srcdir)/test/libcephfs/flock.cc'; fi`
+
 test/librbd/ceph_test_librbd-test_main.o: test/librbd/test_main.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/ceph_test_librbd-test_main.o -MD -MP -MF test/librbd/$(DEPDIR)/ceph_test_librbd-test_main.Tpo -c -o test/librbd/ceph_test_librbd-test_main.o `test -f 'test/librbd/test_main.cc' || echo '$(srcdir)/'`test/librbd/test_main.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/ceph_test_librbd-test_main.Tpo test/librbd/$(DEPDIR)/ceph_test_librbd-test_main.Po
@@ -18461,6 +21466,20 @@ test/rgw/ceph_test_rgw_manifest-test_rgw_manifest.obj: test/rgw/test_rgw_manifes
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_rgw_manifest_CXXFLAGS) $(CXXFLAGS) -c -o test/rgw/ceph_test_rgw_manifest-test_rgw_manifest.obj `if test -f 'test/rgw/test_rgw_manifest.cc'; then $(CYGPATH_W) 'test/rgw/test_rgw_manifest.cc'; else $(CYGPATH_W) '$(srcdir)/test/rgw/test_rgw_manifest.cc'; fi`
 
+test/rgw/ceph_test_rgw_obj-test_rgw_obj.o: test/rgw/test_rgw_obj.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_rgw_obj_CXXFLAGS) $(CXXFLAGS) -MT test/rgw/ceph_test_rgw_obj-test_rgw_obj.o -MD -MP -MF test/rgw/$(DEPDIR)/ceph_test_rgw_obj-test_rgw_obj.Tpo -c -o test/rgw/ceph_test_rgw_obj-test_rgw_obj.o `test -f 'test/rgw/test_rgw_obj.cc' || echo '$(srcdir)/'`test/rgw/test_rgw_obj.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/rgw/$(DEPDIR)/ceph_test_rgw_obj-test_rgw_obj.Tpo test/rgw/$(DEPDIR)/ceph_test_rgw_obj-test_rgw_obj.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/rgw/test_rgw_obj.cc' object='test/rgw/ceph_test_rgw_obj-test_rgw_obj.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_rgw_obj_CXXFLAGS) $(CXXFLAGS) -c -o test/rgw/ceph_test_rgw_obj-test_rgw_obj.o `test -f 'test/rgw/test_rgw_obj.cc' || echo '$(srcdir)/'`test/rgw/test_rgw_obj.cc
+
+test/rgw/ceph_test_rgw_obj-test_rgw_obj.obj: test/rgw/test_rgw_obj.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_rgw_obj_CXXFLAGS) $(CXXFLAGS) -MT test/rgw/ceph_test_rgw_obj-test_rgw_obj.obj -MD -MP -MF test/rgw/$(DEPDIR)/ceph_test_rgw_obj-test_rgw_obj.Tpo -c -o test/rgw/ceph_test_rgw_obj-test_rgw_obj.obj `if test -f 'test/rgw/test_rgw_obj.cc'; then $(CYGPATH_W) 'test/rgw/test_rgw_obj.cc'; else $(CYGPATH_W) '$(srcdir)/test/rgw/test_rgw_obj.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/rgw/$(DEPDIR)/ceph_test_rgw_obj-test_rgw_obj.Tpo test/rgw/$(DEPDIR)/ceph_test_rgw_obj-test_rgw_obj.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/rgw/test_rgw_obj.cc' object='test/rgw/ceph_test_rgw_obj-test_rgw_obj.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_rgw_obj_CXXFLAGS) $(CXXFLAGS) -c -o test/rgw/ceph_test_rgw_obj-test_rgw_obj.obj `if test -f 'test/rgw/test_rgw_obj.cc'; then $(CYGPATH_W) 'test/rgw/test_rgw_obj.cc'; else $(CYGPATH_W) '$(srcdir)/test/rgw/test_rgw_obj.cc'; fi`
+
 test/ceph_test_snap_mapper-test_snap_mapper.o: test/test_snap_mapper.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_snap_mapper_CXXFLAGS) $(CXXFLAGS) -MT test/ceph_test_snap_mapper-test_snap_mapper.o -MD -MP -MF test/$(DEPDIR)/ceph_test_snap_mapper-test_snap_mapper.Tpo -c -o test/ceph_test_snap_mapper-test_snap_mapper.o `test -f 'test/test_snap_mapper.cc' || echo '$(srcdir)/'`test/test_snap_mapper.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/ceph_test_snap_mapper-test_snap_mapper.Tpo test/$(DEPDIR)/ceph_test_snap_mapper-test_snap_mapper.Po
@@ -18503,33 +21522,19 @@ test/ceph_xattr_bench-xattr_bench.obj: test/xattr_bench.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_xattr_bench_CXXFLAGS) $(CXXFLAGS) -c -o test/ceph_xattr_bench-xattr_bench.obj `if test -f 'test/xattr_bench.cc'; then $(CYGPATH_W) 'test/xattr_bench.cc'; else $(CYGPATH_W) '$(srcdir)/test/xattr_bench.cc'; fi`
 
-tools/rest_bench-rest_bench.o: tools/rest_bench.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -MT tools/rest_bench-rest_bench.o -MD -MP -MF tools/$(DEPDIR)/rest_bench-rest_bench.Tpo -c -o tools/rest_bench-rest_bench.o `test -f 'tools/rest_bench.cc' || echo '$(srcdir)/'`tools/rest_bench.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) tools/$(DEPDIR)/rest_bench-rest_bench.Tpo tools/$(DEPDIR)/rest_bench-rest_bench.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tools/rest_bench.cc' object='tools/rest_bench-rest_bench.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -c -o tools/rest_bench-rest_bench.o `test -f 'tools/rest_bench.cc' || echo '$(srcdir)/'`tools/rest_bench.cc
-
-tools/rest_bench-rest_bench.obj: tools/rest_bench.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -MT tools/rest_bench-rest_bench.obj -MD -MP -MF tools/$(DEPDIR)/rest_bench-rest_bench.Tpo -c -o tools/rest_bench-rest_bench.obj `if test -f 'tools/rest_bench.cc'; then $(CYGPATH_W) 'tools/rest_bench.cc'; else $(CYGPATH_W) '$(srcdir)/tools/rest_bench.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) tools/$(DEPDIR)/rest_bench-rest_bench.Tpo tools/$(DEPDIR)/rest_bench-rest_bench.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tools/rest_bench.cc' object='tools/rest_bench-rest_bench.obj' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -c -o tools/rest_bench-rest_bench.obj `if test -f 'tools/rest_bench.cc'; then $(CYGPATH_W) 'tools/rest_bench.cc'; else $(CYGPATH_W) '$(srcdir)/tools/rest_bench.cc'; fi`
-
-common/rest_bench-obj_bencher.o: common/obj_bencher.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -MT common/rest_bench-obj_bencher.o -MD -MP -MF common/$(DEPDIR)/rest_bench-obj_bencher.Tpo -c -o common/rest_bench-obj_bencher.o `test -f 'common/obj_bencher.cc' || echo '$(srcdir)/'`common/obj_bencher.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/rest_bench-obj_bencher.Tpo common/$(DEPDIR)/rest_bench-obj_bencher.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/obj_bencher.cc' object='common/rest_bench-obj_bencher.o' libtool=no @AMDEPBACKSLASH@
+rbd_fuse/rbd_fuse-rbd-fuse.o: rbd_fuse/rbd-fuse.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_fuse_CXXFLAGS) $(CXXFLAGS) -MT rbd_fuse/rbd_fuse-rbd-fuse.o -MD -MP -MF rbd_fuse/$(DEPDIR)/rbd_fuse-rbd-fuse.Tpo -c -o rbd_fuse/rbd_fuse-rbd-fuse.o `test -f 'rbd_fuse/rbd-fuse.cc' || echo '$(srcdir)/'`rbd_fuse/rbd-fuse.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rbd_fuse/$(DEPDIR)/rbd_fuse-rbd-fuse.Tpo rbd_fuse/$(DEPDIR)/rbd_fuse-rbd-fuse.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rbd_fuse/rbd-fuse.cc' object='rbd_fuse/rbd_fuse-rbd-fuse.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -c -o common/rest_bench-obj_bencher.o `test -f 'common/obj_bencher.cc' || echo '$(srcdir)/'`common/obj_bencher.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_fuse_CXXFLAGS) $(CXXFLAGS) -c -o rbd_fuse/rbd_fuse-rbd-fuse.o `test -f 'rbd_fuse/rbd-fuse.cc' || echo '$(srcdir)/'`rbd_fuse/rbd-fuse.cc
 
-common/rest_bench-obj_bencher.obj: common/obj_bencher.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -MT common/rest_bench-obj_bencher.obj -MD -MP -MF common/$(DEPDIR)/rest_bench-obj_bencher.Tpo -c -o common/rest_bench-obj_bencher.obj `if test -f 'common/obj_bencher.cc'; then $(CYGPATH_W) 'common/obj_bencher.cc'; else $(CYGPATH_W) '$(srcdir)/common/obj_bencher.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/rest_bench-obj_bencher.Tpo common/$(DEPDIR)/rest_bench-obj_bencher.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/obj_bencher.cc' object='common/rest_bench-obj_bencher.obj' libtool=no @AMDEPBACKSLASH@
+rbd_fuse/rbd_fuse-rbd-fuse.obj: rbd_fuse/rbd-fuse.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_fuse_CXXFLAGS) $(CXXFLAGS) -MT rbd_fuse/rbd_fuse-rbd-fuse.obj -MD -MP -MF rbd_fuse/$(DEPDIR)/rbd_fuse-rbd-fuse.Tpo -c -o rbd_fuse/rbd_fuse-rbd-fuse.obj `if test -f 'rbd_fuse/rbd-fuse.cc'; then $(CYGPATH_W) 'rbd_fuse/rbd-fuse.cc'; else $(CYGPATH_W) '$(srcdir)/rbd_fuse/rbd-fuse.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rbd_fuse/$(DEPDIR)/rbd_fuse-rbd-fuse.Tpo rbd_fuse/$(DEPDIR)/rbd_fuse-rbd-fuse.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rbd_fuse/rbd-fuse.cc' object='rbd_fuse/rbd_fuse-rbd-fuse.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rest_bench_CXXFLAGS) $(CXXFLAGS) -c -o common/rest_bench-obj_bencher.obj `if test -f 'common/obj_bencher.cc'; then $(CYGPATH_W) 'common/obj_bencher.cc'; else $(CYGPATH_W) '$(srcdir)/common/obj_bencher.cc'; fi`
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rbd_fuse_CXXFLAGS) $(CXXFLAGS) -c -o rbd_fuse/rbd_fuse-rbd-fuse.obj `if test -f 'rbd_fuse/rbd-fuse.cc'; then $(CYGPATH_W) 'rbd_fuse/rbd-fuse.cc'; else $(CYGPATH_W) '$(srcdir)/rbd_fuse/rbd-fuse.cc'; fi`
 
 test/messenger/simple_client-simple_client.o: test/messenger/simple_client.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(simple_client_CXXFLAGS) $(CXXFLAGS) -MT test/messenger/simple_client-simple_client.o -MD -MP -MF test/messenger/$(DEPDIR)/simple_client-simple_client.Tpo -c -o test/messenger/simple_client-simple_client.o `test -f 'test/messenger/simple_client.cc' || echo '$(srcdir)/'`test/messenger/simple_client.cc
@@ -18685,6 +21690,20 @@ test/test_build_libcommon-buildtest_skeleton.obj: test/buildtest_skeleton.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o test/test_build_libcommon-buildtest_skeleton.obj `if test -f 'test/buildtest_skeleton.cc'; then $(CYGPATH_W) 'test/buildtest_skeleton.cc'; else $(CYGPATH_W) '$(srcdir)/test/buildtest_skeleton.cc'; fi`
 
+common/test_build_libcommon-buffer.o: common/buffer.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-buffer.o -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-buffer.Tpo -c -o common/test_build_libcommon-buffer.o `test -f 'common/buffer.cc' || echo '$(srcdir)/'`common/buffer.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-buffer.Tpo common/$(DEPDIR)/test_build_libcommon-buffer.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/buffer.cc' object='common/test_build_libcommon-buffer.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-buffer.o `test -f 'common/buffer.cc' || echo '$(srcdir)/'`common/buffer.cc
+
+common/test_build_libcommon-buffer.obj: common/buffer.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-buffer.obj -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-buffer.Tpo -c -o common/test_build_libcommon-buffer.obj `if test -f 'common/buffer.cc'; then $(CYGPATH_W) 'common/buffer.cc'; else $(CYGPATH_W) '$(srcdir)/common/buffer.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-buffer.Tpo common/$(DEPDIR)/test_build_libcommon-buffer.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/buffer.cc' object='common/test_build_libcommon-buffer.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-buffer.obj `if test -f 'common/buffer.cc'; then $(CYGPATH_W) 'common/buffer.cc'; else $(CYGPATH_W) '$(srcdir)/common/buffer.cc'; fi`
+
 test/test_build_librados-buildtest_skeleton.o: test/buildtest_skeleton.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librados_CXXFLAGS) $(CXXFLAGS) -MT test/test_build_librados-buildtest_skeleton.o -MD -MP -MF test/$(DEPDIR)/test_build_librados-buildtest_skeleton.Tpo -c -o test/test_build_librados-buildtest_skeleton.o `test -f 'test/buildtest_skeleton.cc' || echo '$(srcdir)/'`test/buildtest_skeleton.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/test_build_librados-buildtest_skeleton.Tpo test/$(DEPDIR)/test_build_librados-buildtest_skeleton.Po
@@ -19217,6 +22236,20 @@ rgw/test_build_librgw-rgw_dencoder.obj: rgw/rgw_dencoder.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -c -o rgw/test_build_librgw-rgw_dencoder.obj `if test -f 'rgw/rgw_dencoder.cc'; then $(CYGPATH_W) 'rgw/rgw_dencoder.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_dencoder.cc'; fi`
 
+rgw/test_build_librgw-rgw_object_expirer_core.o: rgw/rgw_object_expirer_core.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -MT rgw/test_build_librgw-rgw_object_expirer_core.o -MD -MP -MF rgw/$(DEPDIR)/test_build_librgw-rgw_object_expirer_core.Tpo -c -o rgw/test_build_librgw-rgw_object_expirer_core.o `test -f 'rgw/rgw_object_expirer_core.cc' || echo '$(srcdir)/'`rgw/rgw_object_expirer_core.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/test_build_librgw-rgw_object_expirer_core.Tpo rgw/$(DEPDIR)/test_build_librgw-rgw_object_expirer_core.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_object_expirer_core.cc' object='rgw/test_build_librgw-rgw_object_expirer_core.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -c -o rgw/test_build_librgw-rgw_object_expirer_core.o `test -f 'rgw/rgw_object_expirer_core.cc' || echo '$(srcdir)/'`rgw/rgw_object_expirer_core.cc
+
+rgw/test_build_librgw-rgw_object_expirer_core.obj: rgw/rgw_object_expirer_core.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -MT rgw/test_build_librgw-rgw_object_expirer_core.obj -MD -MP -MF rgw/$(DEPDIR)/test_build_librgw-rgw_object_expirer_core.Tpo -c -o rgw/test_build_librgw-rgw_object_expirer_core.obj `if test -f 'rgw/rgw_object_expirer_core.cc'; then $(CYGPATH_W) 'rgw/rgw_object_expirer_core.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_object_expirer_core.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) rgw/$(DEPDIR)/test_build_librgw-rgw_object_expirer_core.Tpo rgw/$(DEPDIR)/test_build_librgw-rgw_object_expirer_core.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rgw/rgw_object_expirer_core.cc' object='rgw/test_build_librgw-rgw_object_expirer_core.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_librgw_CXXFLAGS) $(CXXFLAGS) -c -o rgw/test_build_librgw-rgw_object_expirer_core.obj `if test -f 'rgw/rgw_object_expirer_core.cc'; then $(CYGPATH_W) 'rgw/rgw_object_expirer_core.cc'; else $(CYGPATH_W) '$(srcdir)/rgw/rgw_object_expirer_core.cc'; fi`
+
 test/unittest_addrs-test_addrs.o: test/test_addrs.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_addrs_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_addrs-test_addrs.o -MD -MP -MF test/$(DEPDIR)/unittest_addrs-test_addrs.Tpo -c -o test/unittest_addrs-test_addrs.o `test -f 'test/test_addrs.cc' || echo '$(srcdir)/'`test/test_addrs.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_addrs-test_addrs.Tpo test/$(DEPDIR)/unittest_addrs-test_addrs.Po
@@ -19259,6 +22292,20 @@ test/unittest_arch-test_arch.obj: test/test_arch.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_arch_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_arch-test_arch.obj `if test -f 'test/test_arch.cc'; then $(CYGPATH_W) 'test/test_arch.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_arch.cc'; fi`
 
+test/common/unittest_async_compressor-test_async_compressor.o: test/common/test_async_compressor.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_async_compressor_CXXFLAGS) $(CXXFLAGS) -MT test/common/unittest_async_compressor-test_async_compressor.o -MD -MP -MF test/common/$(DEPDIR)/unittest_async_compressor-test_async_compressor.Tpo -c -o test/common/unittest_async_compressor-test_async_compressor.o `test -f 'test/common/test_async_compressor.cc' || echo '$(srcdir)/'`test/common/test_async_compressor.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/common/$(DEPDIR)/unittest_async_compressor-test_async_compressor.Tpo test/common/$(DEPDIR)/unittest_async_compressor-test_async_compressor.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/common/test_async_compressor.cc' object='test/common/unittest_async_compressor-test_async_compressor.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_async_compressor_CXXFLAGS) $(CXXFLAGS) -c -o test/common/unittest_async_compressor-test_async_compressor.o `test -f 'test/common/test_async_compressor.cc' || echo '$(srcdir)/'`test/common/test_async_compressor.cc
+
+test/common/unittest_async_compressor-test_async_compressor.obj: test/common/test_async_compressor.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_async_compressor_CXXFLAGS) $(CXXFLAGS) -MT test/common/unittest_async_compressor-test_async_compressor.obj -MD -MP -MF test/common/$(DEPDIR)/unittest_async_compressor-test_async_compressor.Tpo -c -o test/common/unittest_async_compressor-test_async_compressor.obj `if test -f 'test/common/test_async_compressor.cc'; then $(CYGPATH_W) 'test/common/test_async_compressor.cc'; e [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/common/$(DEPDIR)/unittest_async_compressor-test_async_compressor.Tpo test/common/$(DEPDIR)/unittest_async_compressor-test_async_compressor.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/common/test_async_compressor.cc' object='test/common/unittest_async_compressor-test_async_compressor.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_async_compressor_CXXFLAGS) $(CXXFLAGS) -c -o test/common/unittest_async_compressor-test_async_compressor.obj `if test -f 'test/common/test_async_compressor.cc'; then $(CYGPATH_W) 'test/common/test_async_compressor.cc'; else $(CYGPATH_W) '$(srcdir)/test/common/test_async_compressor.cc'; fi`
+
 test/unittest_base64-base64.o: test/base64.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_base64_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_base64-base64.o -MD -MP -MF test/$(DEPDIR)/unittest_base64-base64.Tpo -c -o test/unittest_base64-base64.o `test -f 'test/base64.cc' || echo '$(srcdir)/'`test/base64.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_base64-base64.Tpo test/$(DEPDIR)/unittest_base64-base64.Po
@@ -19819,6 +22866,20 @@ test/erasure-code/unittest_erasure_code_plugin_lrc-TestErasureCodePluginLrc.obj:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_plugin_lrc_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/unittest_erasure_code_plugin_lrc-TestErasureCodePluginLrc.obj `if test -f 'test/erasure-code/TestErasureCodePluginLrc.cc'; then $(CYGPATH_W) 'test/erasure-code/TestErasureCodePluginLrc.cc'; else $(CYGPATH_W) '$(srcdir)/test/erasure-code/TestErasureCodePluginLrc.cc'; fi`
 
+test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.o: test/erasure-code/TestErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_plugin_shec_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.o -MD -MP -MF test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.Tpo -c -o test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.o `test -f 'test/erasure-code/TestErasureCodePluginShec [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.Tpo test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestErasureCodePluginShec.cc' object='test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_plugin_shec_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.o `test -f 'test/erasure-code/TestErasureCodePluginShec.cc' || echo '$(srcdir)/'`test/erasure-code/TestErasureCodePluginShec.cc
+
+test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.obj: test/erasure-code/TestErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_plugin_shec_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.obj -MD -MP -MF test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.Tpo -c -o test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.obj `if test -f 'test/erasure-code/TestErasureCodePlu [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.Tpo test/erasure-code/$(DEPDIR)/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestErasureCodePluginShec.cc' object='test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_plugin_shec_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/unittest_erasure_code_plugin_shec-TestErasureCodePluginShec.obj `if test -f 'test/erasure-code/TestErasureCodePluginShec.cc'; then $(CYGPATH_W) 'test/erasure-code/TestErasureCodePluginShec.cc'; else $(CYGPATH_W) '$(srcdir)/test/erasure-code/TestErasureCodePluginShec.cc'; fi`
+
 test/erasure-code/unittest_erasure_code_shec-TestErasureCodeShec.o: test/erasure-code/TestErasureCodeShec.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/unittest_erasure_code_shec-TestErasureCodeShec.o -MD -MP -MF test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec-TestErasureCodeShec.Tpo -c -o test/erasure-code/unittest_erasure_code_shec-TestErasureCodeShec.o `test -f 'test/erasure-code/TestErasureCodeShec.cc' || echo '$(srcdir)/'`test/erasure-code/TestEras [...]
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec-TestErasureCodeShec.Tpo test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec-TestErasureCodeShec.Po
@@ -19889,20 +22950,6 @@ erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShecTableCache.obj: eras
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShecTableCache.obj `if test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc'; then $(CYGPATH_W) 'erasure-code/shec/ErasureCodeShecTableCache.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/ErasureCodeShecTableCache.cc'; fi`
 
-erasure-code/shec/unittest_erasure_code_shec-shec.o: erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec-shec.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-shec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec-shec.o `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-shec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-shec.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/shec.cc' object='erasure-code/shec/unittest_erasure_code_shec-shec.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec-shec.o `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
-
-erasure-code/shec/unittest_erasure_code_shec-shec.obj: erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec-shec.obj -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-shec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec-shec.obj `if test -f 'erasure-code/shec/shec.cc'; then $(CYGPATH_W) 'erasure-code/shec/shec.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/shec. [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-shec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec-shec.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/shec.cc' object='erasure-code/shec/unittest_erasure_code_shec-shec.obj' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec-shec.obj `if test -f 'erasure-code/shec/shec.cc'; then $(CYGPATH_W) 'erasure-code/shec/shec.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/shec.cc'; fi`
-
 test/erasure-code/unittest_erasure_code_shec_all-TestErasureCodeShec_all.o: test/erasure-code/TestErasureCodeShec_all.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_all_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/unittest_erasure_code_shec_all-TestErasureCodeShec_all.o -MD -MP -MF test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_all-TestErasureCodeShec_all.Tpo -c -o test/erasure-code/unittest_erasure_code_shec_all-TestErasureCodeShec_all.o `test -f 'test/erasure-code/TestErasureCodeShec_all.cc' || echo '$(srcd [...]
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_all-TestErasureCodeShec_all.Tpo test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_all-TestErasureCodeShec_all.Po
@@ -19973,19 +23020,75 @@ erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShecTableCache.obj:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_all_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShecTableCache.obj `if test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc'; then $(CYGPATH_W) 'erasure-code/shec/ErasureCodeShecTableCache.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/ErasureCodeShecTableCache.cc'; fi`
 
-erasure-code/shec/unittest_erasure_code_shec_all-shec.o: erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_all_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_all-shec.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-shec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_all-shec.o `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-shec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-shec.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/shec.cc' object='erasure-code/shec/unittest_erasure_code_shec_all-shec.o' libtool=no @AMDEPBACKSLASH@
+test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.o: test/erasure-code/TestErasureCodeShec_arguments.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.o -MD -MP -MF test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.Tpo -c -o test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.o `test -f 'test/erasure-code/T [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.Tpo test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestErasureCodeShec_arguments.cc' object='test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.o `test -f 'test/erasure-code/TestErasureCodeShec_arguments.cc' || echo '$(srcdir)/'`test/erasure-code/TestErasureCodeShec_arguments.cc
+
+test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.obj: test/erasure-code/TestErasureCodeShec_arguments.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.obj -MD -MP -MF test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.Tpo -c -o test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.obj `if test -f 'test/erasure [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.Tpo test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/erasure-code/TestErasureCodeShec_arguments.cc' object='test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.obj `if test -f 'test/erasure-code/TestErasureCodeShec_arguments.cc'; then $(CYGPATH_W) 'test/erasure-code/TestErasureCodeShec_arguments.cc'; else $(CYGPATH_W) '$(srcdir)/test/erasure-code/TestErasureCodeShec_arguments.cc'; fi`
+
+erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.o: erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.o -MD -MP -MF erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCode.Tpo -c -o erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.o `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCode.Tpo erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCode.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.o `test -f 'erasure-code/ErasureCode.cc' || echo '$(srcdir)/'`erasure-code/ErasureCode.cc
+
+erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.obj: erasure-code/ErasureCode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.obj -MD -MP -MF erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCode.Tpo -c -o erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.obj `if test -f 'erasure-code/ErasureCode.cc'; then $(CYGPATH_W) 'erasure-code/ErasureCode.cc'; el [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCode.Tpo erasure-code/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCode.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/ErasureCode.cc' object='erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.obj `if test -f 'erasure-code/ErasureCode.cc'; then $(CYGPATH_W) 'erasure-code/ErasureCode.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/ErasureCode.cc'; fi`
+
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.o: erasure-code/shec/ErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.o `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginShec.cc' object='erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.o `test -f 'erasure-code/shec/ErasureCodePluginShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodePluginShec.cc
+
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.obj: erasure-code/shec/ErasureCodePluginShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.obj -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.obj `if test -f 'erasure-code/shec/ErasureCodePluginS [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodePluginShec.cc' object='erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.obj `if test -f 'erasure-code/shec/ErasureCodePluginShec.cc'; then $(CYGPATH_W) 'erasure-code/shec/ErasureCodePluginShec.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/ErasureCodePluginShec.cc'; fi`
+
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.o: erasure-code/shec/ErasureCodeShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.o `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`er [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShec.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShec.cc' object='erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_all_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_all-shec.o `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.o `test -f 'erasure-code/shec/ErasureCodeShec.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShec.cc
 
-erasure-code/shec/unittest_erasure_code_shec_all-shec.obj: erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_all_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_all-shec.obj -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-shec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_all-shec.obj `if test -f 'erasure-code/shec/shec.cc'; then $(CYGPATH_W) 'erasure-code/shec/shec.cc'; else $(CYGPATH_W) '$(srcdir)/erasure [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-shec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_all-shec.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/shec.cc' object='erasure-code/shec/unittest_erasure_code_shec_all-shec.obj' libtool=no @AMDEPBACKSLASH@
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.obj: erasure-code/shec/ErasureCodeShec.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.obj -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.obj `if test -f 'erasure-code/shec/ErasureCodeShec.cc'; then $(CYGPATH_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShec.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShec.cc' object='erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_all_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_all-shec.obj `if test -f 'erasure-code/shec/shec.cc'; then $(CYGPATH_W) 'erasure-code/shec/shec.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/shec.cc'; fi`
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.obj `if test -f 'erasure-code/shec/ErasureCodeShec.cc'; then $(CYGPATH_W) 'erasure-code/shec/ErasureCodeShec.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/ErasureCodeShec.cc'; fi`
+
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.o: erasure-code/shec/ErasureCodeShecTableCache.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.o `test -f 'erasure-code/shec/ErasureCodeSh [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShecTableCache.cc' object='erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.o `test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc' || echo '$(srcdir)/'`erasure-code/shec/ErasureCodeShecTableCache.cc
+
+erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.obj: erasure-code/shec/ErasureCodeShecTableCache.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.obj -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.obj `if test -f 'erasure-code/shec/Erasur [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/ErasureCodeShecTableCache.cc' object='erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_arguments_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.obj `if test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc'; then $(CYGPATH_W) 'erasure-code/shec/ErasureCodeShecTableCache.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/ErasureCodeShecTableCache.cc'; fi`
 
 test/erasure-code/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.o: test/erasure-code/TestErasureCodeShec_thread.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_thread_CXXFLAGS) $(CXXFLAGS) -MT test/erasure-code/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.o -MD -MP -MF test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.Tpo -c -o test/erasure-code/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.o `test -f 'test/erasure-code/TestErasureCodeShec_th [...]
@@ -20057,20 +23160,6 @@ erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShecTableCache.ob
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_thread_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShecTableCache.obj `if test -f 'erasure-code/shec/ErasureCodeShecTableCache.cc'; then $(CYGPATH_W) 'erasure-code/shec/ErasureCodeShecTableCache.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/ErasureCodeShecTableCache.cc'; fi`
 
-erasure-code/shec/unittest_erasure_code_shec_thread-shec.o: erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_thread_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_thread-shec.o -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-shec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_thread-shec.o `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-shec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-shec.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/shec.cc' object='erasure-code/shec/unittest_erasure_code_shec_thread-shec.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_thread_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_thread-shec.o `test -f 'erasure-code/shec/shec.cc' || echo '$(srcdir)/'`erasure-code/shec/shec.cc
-
-erasure-code/shec/unittest_erasure_code_shec_thread-shec.obj: erasure-code/shec/shec.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_thread_CXXFLAGS) $(CXXFLAGS) -MT erasure-code/shec/unittest_erasure_code_shec_thread-shec.obj -MD -MP -MF erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-shec.Tpo -c -o erasure-code/shec/unittest_erasure_code_shec_thread-shec.obj `if test -f 'erasure-code/shec/shec.cc'; then $(CYGPATH_W) 'erasure-code/shec/shec.cc'; else $(CYGPATH_W) '$(src [...]
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-shec.Tpo erasure-code/shec/$(DEPDIR)/unittest_erasure_code_shec_thread-shec.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='erasure-code/shec/shec.cc' object='erasure-code/shec/unittest_erasure_code_shec_thread-shec.obj' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_erasure_code_shec_thread_CXXFLAGS) $(CXXFLAGS) -c -o erasure-code/shec/unittest_erasure_code_shec_thread-shec.obj `if test -f 'erasure-code/shec/shec.cc'; then $(CYGPATH_W) 'erasure-code/shec/shec.cc'; else $(CYGPATH_W) '$(srcdir)/erasure-code/shec/shec.cc'; fi`
-
 test/unittest_escape-escape.o: test/escape.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_escape_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_escape-escape.o -MD -MP -MF test/$(DEPDIR)/unittest_escape-escape.Tpo -c -o test/unittest_escape-escape.o `test -f 'test/escape.cc' || echo '$(srcdir)/'`test/escape.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_escape-escape.Tpo test/$(DEPDIR)/unittest_escape-escape.Po
@@ -20085,20 +23174,6 @@ test/unittest_escape-escape.obj: test/escape.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_escape_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_escape-escape.obj `if test -f 'test/escape.cc'; then $(CYGPATH_W) 'test/escape.cc'; else $(CYGPATH_W) '$(srcdir)/test/escape.cc'; fi`
 
-test/os/unittest_flatindex-TestFlatIndex.o: test/os/TestFlatIndex.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_flatindex_CXXFLAGS) $(CXXFLAGS) -MT test/os/unittest_flatindex-TestFlatIndex.o -MD -MP -MF test/os/$(DEPDIR)/unittest_flatindex-TestFlatIndex.Tpo -c -o test/os/unittest_flatindex-TestFlatIndex.o `test -f 'test/os/TestFlatIndex.cc' || echo '$(srcdir)/'`test/os/TestFlatIndex.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/os/$(DEPDIR)/unittest_flatindex-TestFlatIndex.Tpo test/os/$(DEPDIR)/unittest_flatindex-TestFlatIndex.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/os/TestFlatIndex.cc' object='test/os/unittest_flatindex-TestFlatIndex.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_flatindex_CXXFLAGS) $(CXXFLAGS) -c -o test/os/unittest_flatindex-TestFlatIndex.o `test -f 'test/os/TestFlatIndex.cc' || echo '$(srcdir)/'`test/os/TestFlatIndex.cc
-
-test/os/unittest_flatindex-TestFlatIndex.obj: test/os/TestFlatIndex.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_flatindex_CXXFLAGS) $(CXXFLAGS) -MT test/os/unittest_flatindex-TestFlatIndex.obj -MD -MP -MF test/os/$(DEPDIR)/unittest_flatindex-TestFlatIndex.Tpo -c -o test/os/unittest_flatindex-TestFlatIndex.obj `if test -f 'test/os/TestFlatIndex.cc'; then $(CYGPATH_W) 'test/os/TestFlatIndex.cc'; else $(CYGPATH_W) '$(srcdir)/test/os/TestFlatIndex.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/os/$(DEPDIR)/unittest_flatindex-TestFlatIndex.Tpo test/os/$(DEPDIR)/unittest_flatindex-TestFlatIndex.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/os/TestFlatIndex.cc' object='test/os/unittest_flatindex-TestFlatIndex.obj' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_flatindex_CXXFLAGS) $(CXXFLAGS) -c -o test/os/unittest_flatindex-TestFlatIndex.obj `if test -f 'test/os/TestFlatIndex.cc'; then $(CYGPATH_W) 'test/os/TestFlatIndex.cc'; else $(CYGPATH_W) '$(srcdir)/test/os/TestFlatIndex.cc'; fi`
-
 test/unittest_formatter-formatter.o: test/formatter.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_formatter_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_formatter-formatter.o -MD -MP -MF test/$(DEPDIR)/unittest_formatter-formatter.Tpo -c -o test/unittest_formatter-formatter.o `test -f 'test/formatter.cc' || echo '$(srcdir)/'`test/formatter.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_formatter-formatter.Tpo test/$(DEPDIR)/unittest_formatter-formatter.Po
@@ -20281,6 +23356,20 @@ test/librbd/unittest_librbd-test_main.obj: test/librbd/test_main.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_main.obj `if test -f 'test/librbd/test_main.cc'; then $(CYGPATH_W) 'test/librbd/test_main.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/test_main.cc'; fi`
 
+test/librbd/unittest_librbd-test_mock_fixture.o: test/librbd/test_mock_fixture.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/unittest_librbd-test_mock_fixture.o -MD -MP -MF test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Tpo -c -o test/librbd/unittest_librbd-test_mock_fixture.o `test -f 'test/librbd/test_mock_fixture.cc' || echo '$(srcdir)/'`test/librbd/test_mock_fixture.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Tpo test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_mock_fixture.cc' object='test/librbd/unittest_librbd-test_mock_fixture.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_fixture.o `test -f 'test/librbd/test_mock_fixture.cc' || echo '$(srcdir)/'`test/librbd/test_mock_fixture.cc
+
+test/librbd/unittest_librbd-test_mock_fixture.obj: test/librbd/test_mock_fixture.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/unittest_librbd-test_mock_fixture.obj -MD -MP -MF test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Tpo -c -o test/librbd/unittest_librbd-test_mock_fixture.obj `if test -f 'test/librbd/test_mock_fixture.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_fixture.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/test_mock_fixture.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Tpo test/librbd/$(DEPDIR)/unittest_librbd-test_mock_fixture.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_mock_fixture.cc' object='test/librbd/unittest_librbd-test_mock_fixture.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_librbd_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/unittest_librbd-test_mock_fixture.obj `if test -f 'test/librbd/test_mock_fixture.cc'; then $(CYGPATH_W) 'test/librbd/test_mock_fixture.cc'; else $(CYGPATH_W) '$(srcdir)/test/librbd/test_mock_fixture.cc'; fi`
+
 log/unittest_log-test.o: log/test.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_log_CXXFLAGS) $(CXXFLAGS) -MT log/unittest_log-test.o -MD -MP -MF log/$(DEPDIR)/unittest_log-test.Tpo -c -o log/unittest_log-test.o `test -f 'log/test.cc' || echo '$(srcdir)/'`log/test.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) log/$(DEPDIR)/unittest_log-test.Tpo log/$(DEPDIR)/unittest_log-test.Po
@@ -20435,6 +23524,20 @@ test/osd/unittest_osdscrub-TestOSDScrub.obj: test/osd/TestOSDScrub.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_osdscrub_CXXFLAGS) $(CXXFLAGS) -c -o test/osd/unittest_osdscrub-TestOSDScrub.obj `if test -f 'test/osd/TestOSDScrub.cc'; then $(CYGPATH_W) 'test/osd/TestOSDScrub.cc'; else $(CYGPATH_W) '$(srcdir)/test/osd/TestOSDScrub.cc'; fi`
 
+test/unittest_pageset-test_pageset.o: test/test_pageset.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_pageset_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_pageset-test_pageset.o -MD -MP -MF test/$(DEPDIR)/unittest_pageset-test_pageset.Tpo -c -o test/unittest_pageset-test_pageset.o `test -f 'test/test_pageset.cc' || echo '$(srcdir)/'`test/test_pageset.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_pageset-test_pageset.Tpo test/$(DEPDIR)/unittest_pageset-test_pageset.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/test_pageset.cc' object='test/unittest_pageset-test_pageset.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_pageset_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_pageset-test_pageset.o `test -f 'test/test_pageset.cc' || echo '$(srcdir)/'`test/test_pageset.cc
+
+test/unittest_pageset-test_pageset.obj: test/test_pageset.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_pageset_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_pageset-test_pageset.obj -MD -MP -MF test/$(DEPDIR)/unittest_pageset-test_pageset.Tpo -c -o test/unittest_pageset-test_pageset.obj `if test -f 'test/test_pageset.cc'; then $(CYGPATH_W) 'test/test_pageset.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_pageset.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_pageset-test_pageset.Tpo test/$(DEPDIR)/unittest_pageset-test_pageset.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/test_pageset.cc' object='test/unittest_pageset-test_pageset.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_pageset_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_pageset-test_pageset.obj `if test -f 'test/test_pageset.cc'; then $(CYGPATH_W) 'test/test_pageset.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_pageset.cc'; fi`
+
 test/unittest_perf_counters-perf_counters.o: test/perf_counters.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_perf_counters_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_perf_counters-perf_counters.o -MD -MP -MF test/$(DEPDIR)/unittest_perf_counters-perf_counters.Tpo -c -o test/unittest_perf_counters-perf_counters.o `test -f 'test/perf_counters.cc' || echo '$(srcdir)/'`test/perf_counters.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_perf_counters-perf_counters.Tpo test/$(DEPDIR)/unittest_perf_counters-perf_counters.Po
@@ -20477,6 +23580,20 @@ test/unittest_prebufferedstreambuf-test_prebufferedstreambuf.obj: test/test_preb
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_prebufferedstreambuf_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_prebufferedstreambuf-test_prebufferedstreambuf.obj `if test -f 'test/test_prebufferedstreambuf.cc'; then $(CYGPATH_W) 'test/test_prebufferedstreambuf.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_prebufferedstreambuf.cc'; fi`
 
+test/common/unittest_prioritized_queue-test_prioritized_queue.o: test/common/test_prioritized_queue.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_prioritized_queue_CXXFLAGS) $(CXXFLAGS) -MT test/common/unittest_prioritized_queue-test_prioritized_queue.o -MD -MP -MF test/common/$(DEPDIR)/unittest_prioritized_queue-test_prioritized_queue.Tpo -c -o test/common/unittest_prioritized_queue-test_prioritized_queue.o `test -f 'test/common/test_prioritized_queue.cc' || echo '$(srcdir)/'`test/common/test_prioritized_queue.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/common/$(DEPDIR)/unittest_prioritized_queue-test_prioritized_queue.Tpo test/common/$(DEPDIR)/unittest_prioritized_queue-test_prioritized_queue.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/common/test_prioritized_queue.cc' object='test/common/unittest_prioritized_queue-test_prioritized_queue.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_prioritized_queue_CXXFLAGS) $(CXXFLAGS) -c -o test/common/unittest_prioritized_queue-test_prioritized_queue.o `test -f 'test/common/test_prioritized_queue.cc' || echo '$(srcdir)/'`test/common/test_prioritized_queue.cc
+
+test/common/unittest_prioritized_queue-test_prioritized_queue.obj: test/common/test_prioritized_queue.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_prioritized_queue_CXXFLAGS) $(CXXFLAGS) -MT test/common/unittest_prioritized_queue-test_prioritized_queue.obj -MD -MP -MF test/common/$(DEPDIR)/unittest_prioritized_queue-test_prioritized_queue.Tpo -c -o test/common/unittest_prioritized_queue-test_prioritized_queue.obj `if test -f 'test/common/test_prioritized_queue.cc'; then $(CYGPATH_W) 'test/common/test_prioritized_que [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/common/$(DEPDIR)/unittest_prioritized_queue-test_prioritized_queue.Tpo test/common/$(DEPDIR)/unittest_prioritized_queue-test_prioritized_queue.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/common/test_prioritized_queue.cc' object='test/common/unittest_prioritized_queue-test_prioritized_queue.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_prioritized_queue_CXXFLAGS) $(CXXFLAGS) -c -o test/common/unittest_prioritized_queue-test_prioritized_queue.obj `if test -f 'test/common/test_prioritized_queue.cc'; then $(CYGPATH_W) 'test/common/test_prioritized_queue.cc'; else $(CYGPATH_W) '$(srcdir)/test/common/test_prioritized_queue.cc'; fi`
+
 test/unittest_rbd_replay-test_rbd_replay.o: test/test_rbd_replay.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rbd_replay_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_rbd_replay-test_rbd_replay.o -MD -MP -MF test/$(DEPDIR)/unittest_rbd_replay-test_rbd_replay.Tpo -c -o test/unittest_rbd_replay-test_rbd_replay.o `test -f 'test/test_rbd_replay.cc' || echo '$(srcdir)/'`test/test_rbd_replay.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_rbd_replay-test_rbd_replay.Tpo test/$(DEPDIR)/unittest_rbd_replay-test_rbd_replay.Po
@@ -20505,6 +23622,34 @@ test/common/unittest_readahead-Readahead.obj: test/common/Readahead.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_readahead_CXXFLAGS) $(CXXFLAGS) -c -o test/common/unittest_readahead-Readahead.obj `if test -f 'test/common/Readahead.cc'; then $(CYGPATH_W) 'test/common/Readahead.cc'; else $(CYGPATH_W) '$(srcdir)/test/common/Readahead.cc'; fi`
 
+test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.o: test/objectstore/TestRocksdbOptionParse.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_CXXFLAGS) $(CXXFLAGS) -MT test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.o -MD -MP -MF test/objectstore/$(DEPDIR)/unittest_rocksdb_option-TestRocksdbOptionParse.Tpo -c -o test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.o `test -f 'test/objectstore/TestRocksdbOptionParse.cc' || echo '$(srcdir)/'`test/objectstore/TestRocksdbOp [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/objectstore/$(DEPDIR)/unittest_rocksdb_option-TestRocksdbOptionParse.Tpo test/objectstore/$(DEPDIR)/unittest_rocksdb_option-TestRocksdbOptionParse.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/objectstore/TestRocksdbOptionParse.cc' object='test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_CXXFLAGS) $(CXXFLAGS) -c -o test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.o `test -f 'test/objectstore/TestRocksdbOptionParse.cc' || echo '$(srcdir)/'`test/objectstore/TestRocksdbOptionParse.cc
+
+test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.obj: test/objectstore/TestRocksdbOptionParse.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_CXXFLAGS) $(CXXFLAGS) -MT test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.obj -MD -MP -MF test/objectstore/$(DEPDIR)/unittest_rocksdb_option-TestRocksdbOptionParse.Tpo -c -o test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.obj `if test -f 'test/objectstore/TestRocksdbOptionParse.cc'; then $(CYGPATH_W) 'test/objectstore/TestRoc [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/objectstore/$(DEPDIR)/unittest_rocksdb_option-TestRocksdbOptionParse.Tpo test/objectstore/$(DEPDIR)/unittest_rocksdb_option-TestRocksdbOptionParse.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/objectstore/TestRocksdbOptionParse.cc' object='test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_CXXFLAGS) $(CXXFLAGS) -c -o test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.obj `if test -f 'test/objectstore/TestRocksdbOptionParse.cc'; then $(CYGPATH_W) 'test/objectstore/TestRocksdbOptionParse.cc'; else $(CYGPATH_W) '$(srcdir)/test/objectstore/TestRocksdbOptionParse.cc'; fi`
+
+test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.o: test/objectstore/TestRocksdbOptionParse.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_static_CXXFLAGS) $(CXXFLAGS) -MT test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.o -MD -MP -MF test/objectstore/$(DEPDIR)/unittest_rocksdb_option_static-TestRocksdbOptionParse.Tpo -c -o test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.o `test -f 'test/objectstore/TestRocksdbOptionParse.cc' || echo '$(srcdir)/'`te [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/objectstore/$(DEPDIR)/unittest_rocksdb_option_static-TestRocksdbOptionParse.Tpo test/objectstore/$(DEPDIR)/unittest_rocksdb_option_static-TestRocksdbOptionParse.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/objectstore/TestRocksdbOptionParse.cc' object='test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_static_CXXFLAGS) $(CXXFLAGS) -c -o test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.o `test -f 'test/objectstore/TestRocksdbOptionParse.cc' || echo '$(srcdir)/'`test/objectstore/TestRocksdbOptionParse.cc
+
+test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.obj: test/objectstore/TestRocksdbOptionParse.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_static_CXXFLAGS) $(CXXFLAGS) -MT test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.obj -MD -MP -MF test/objectstore/$(DEPDIR)/unittest_rocksdb_option_static-TestRocksdbOptionParse.Tpo -c -o test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.obj `if test -f 'test/objectstore/TestRocksdbOptionParse.cc'; then $(CYGPATH_ [...]
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/objectstore/$(DEPDIR)/unittest_rocksdb_option_static-TestRocksdbOptionParse.Tpo test/objectstore/$(DEPDIR)/unittest_rocksdb_option_static-TestRocksdbOptionParse.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/objectstore/TestRocksdbOptionParse.cc' object='test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_rocksdb_option_static_CXXFLAGS) $(CXXFLAGS) -c -o test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.obj `if test -f 'test/objectstore/TestRocksdbOptionParse.cc'; then $(CYGPATH_W) 'test/objectstore/TestRocksdbOptionParse.cc'; else $(CYGPATH_W) '$(srcdir)/test/objectstore/TestRocksdbOptionParse.cc'; fi`
+
 test/unittest_run_cmd-run_cmd.o: test/run_cmd.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_run_cmd_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_run_cmd-run_cmd.o -MD -MP -MF test/$(DEPDIR)/unittest_run_cmd-run_cmd.Tpo -c -o test/unittest_run_cmd-run_cmd.o `test -f 'test/run_cmd.cc' || echo '$(srcdir)/'`test/run_cmd.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_run_cmd-run_cmd.Tpo test/$(DEPDIR)/unittest_run_cmd-run_cmd.Po
@@ -20659,6 +23804,20 @@ test/unittest_strtol-strtol.obj: test/strtol.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_strtol_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_strtol-strtol.obj `if test -f 'test/strtol.cc'; then $(CYGPATH_W) 'test/strtol.cc'; else $(CYGPATH_W) '$(srcdir)/test/strtol.cc'; fi`
 
+test/unittest_subprocess-test_subprocess.o: test/test_subprocess.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_subprocess_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_subprocess-test_subprocess.o -MD -MP -MF test/$(DEPDIR)/unittest_subprocess-test_subprocess.Tpo -c -o test/unittest_subprocess-test_subprocess.o `test -f 'test/test_subprocess.cc' || echo '$(srcdir)/'`test/test_subprocess.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_subprocess-test_subprocess.Tpo test/$(DEPDIR)/unittest_subprocess-test_subprocess.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/test_subprocess.cc' object='test/unittest_subprocess-test_subprocess.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_subprocess_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_subprocess-test_subprocess.o `test -f 'test/test_subprocess.cc' || echo '$(srcdir)/'`test/test_subprocess.cc
+
+test/unittest_subprocess-test_subprocess.obj: test/test_subprocess.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_subprocess_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_subprocess-test_subprocess.obj -MD -MP -MF test/$(DEPDIR)/unittest_subprocess-test_subprocess.Tpo -c -o test/unittest_subprocess-test_subprocess.obj `if test -f 'test/test_subprocess.cc'; then $(CYGPATH_W) 'test/test_subprocess.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_subprocess.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_subprocess-test_subprocess.Tpo test/$(DEPDIR)/unittest_subprocess-test_subprocess.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/test_subprocess.cc' object='test/unittest_subprocess-test_subprocess.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_subprocess_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_subprocess-test_subprocess.obj `if test -f 'test/test_subprocess.cc'; then $(CYGPATH_W) 'test/test_subprocess.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_subprocess.cc'; fi`
+
 test/common/unittest_tableformatter-test_tableformatter.o: test/common/test_tableformatter.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_tableformatter_CXXFLAGS) $(CXXFLAGS) -MT test/common/unittest_tableformatter-test_tableformatter.o -MD -MP -MF test/common/$(DEPDIR)/unittest_tableformatter-test_tableformatter.Tpo -c -o test/common/unittest_tableformatter-test_tableformatter.o `test -f 'test/common/test_tableformatter.cc' || echo '$(srcdir)/'`test/common/test_tableformatter.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/common/$(DEPDIR)/unittest_tableformatter-test_tableformatter.Tpo test/common/$(DEPDIR)/unittest_tableformatter-test_tableformatter.Po
@@ -20743,6 +23902,20 @@ test/unittest_workqueue-test_workqueue.obj: test/test_workqueue.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_workqueue_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_workqueue-test_workqueue.obj `if test -f 'test/test_workqueue.cc'; then $(CYGPATH_W) 'test/test_workqueue.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_workqueue.cc'; fi`
 
+test/unittest_xlist-test_xlist.o: test/test_xlist.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_xlist_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_xlist-test_xlist.o -MD -MP -MF test/$(DEPDIR)/unittest_xlist-test_xlist.Tpo -c -o test/unittest_xlist-test_xlist.o `test -f 'test/test_xlist.cc' || echo '$(srcdir)/'`test/test_xlist.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_xlist-test_xlist.Tpo test/$(DEPDIR)/unittest_xlist-test_xlist.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/test_xlist.cc' object='test/unittest_xlist-test_xlist.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_xlist_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_xlist-test_xlist.o `test -f 'test/test_xlist.cc' || echo '$(srcdir)/'`test/test_xlist.cc
+
+test/unittest_xlist-test_xlist.obj: test/test_xlist.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_xlist_CXXFLAGS) $(CXXFLAGS) -MT test/unittest_xlist-test_xlist.obj -MD -MP -MF test/$(DEPDIR)/unittest_xlist-test_xlist.Tpo -c -o test/unittest_xlist-test_xlist.obj `if test -f 'test/test_xlist.cc'; then $(CYGPATH_W) 'test/test_xlist.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_xlist.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/$(DEPDIR)/unittest_xlist-test_xlist.Tpo test/$(DEPDIR)/unittest_xlist-test_xlist.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/test_xlist.cc' object='test/unittest_xlist-test_xlist.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_xlist_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_xlist-test_xlist.obj `if test -f 'test/test_xlist.cc'; then $(CYGPATH_W) 'test/test_xlist.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_xlist.cc'; fi`
+
 test/messenger/xio_client-xio_client.o: test/messenger/xio_client.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(xio_client_CXXFLAGS) $(CXXFLAGS) -MT test/messenger/xio_client-xio_client.o -MD -MP -MF test/messenger/$(DEPDIR)/xio_client-xio_client.Tpo -c -o test/messenger/xio_client-xio_client.o `test -f 'test/messenger/xio_client.cc' || echo '$(srcdir)/'`test/messenger/xio_client.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/messenger/$(DEPDIR)/xio_client-xio_client.Tpo test/messenger/$(DEPDIR)/xio_client-xio_client.Po
@@ -20889,6 +24062,60 @@ erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_avx.asm.lo: er
 erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_sse.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
 	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_dot_prod_sse.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
 
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx2.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx2.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_avx.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_sse.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_2vect_mad_sse.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx2.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx2.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_avx.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_sse.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_3vect_mad_sse.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx2.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx2.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_avx.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_sse.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_4vect_mad_sse.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx2.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx2.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_avx.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_sse.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_5vect_mad_sse.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx2.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx2.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_avx.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_sse.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_6vect_mad_sse.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx2.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx2.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_avx.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s
+
+erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_sse.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s
+	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mad_sse.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s
+
 erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mul_avx.asm.lo: erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
 	$(AM_V_CCAS)$(LIBTOOL) $(AM_V_lt) $(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(libec_isa_la_CCASFLAGS) $(CCASFLAGS) -c -o erasure-code/isa/isa-l/erasure_code/libec_isa_la-gf_vect_mul_avx.asm.lo `test -f 'erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s' || echo '$(srcdir)/'`erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
 
@@ -20907,17 +24134,21 @@ clean-libtool:
 	-rm -rf auth/unknown/.libs auth/unknown/_libs
 	-rm -rf civetweb/src/.libs civetweb/src/_libs
 	-rm -rf client/.libs client/_libs
+	-rm -rf cls/cephfs/.libs cls/cephfs/_libs
 	-rm -rf cls/hello/.libs cls/hello/_libs
 	-rm -rf cls/lock/.libs cls/lock/_libs
 	-rm -rf cls/log/.libs cls/log/_libs
+	-rm -rf cls/numops/.libs cls/numops/_libs
 	-rm -rf cls/rbd/.libs cls/rbd/_libs
 	-rm -rf cls/refcount/.libs cls/refcount/_libs
 	-rm -rf cls/replica_log/.libs cls/replica_log/_libs
 	-rm -rf cls/rgw/.libs cls/rgw/_libs
 	-rm -rf cls/statelog/.libs cls/statelog/_libs
+	-rm -rf cls/timeindex/.libs cls/timeindex/_libs
 	-rm -rf cls/user/.libs cls/user/_libs
 	-rm -rf cls/version/.libs cls/version/_libs
 	-rm -rf common/.libs common/_libs
+	-rm -rf compressor/.libs compressor/_libs
 	-rm -rf crush/.libs crush/_libs
 	-rm -rf erasure-code/.libs erasure-code/_libs
 	-rm -rf erasure-code/isa/.libs erasure-code/isa/_libs
@@ -20944,6 +24175,8 @@ clean-libtool:
 	-rm -rf msg/xio/.libs msg/xio/_libs
 	-rm -rf objclass/.libs objclass/_libs
 	-rm -rf os/.libs os/_libs
+	-rm -rf os/fs/.libs os/fs/_libs
+	-rm -rf os/newstore/.libs os/newstore/_libs
 	-rm -rf osd/.libs osd/_libs
 	-rm -rf osdc/.libs osdc/_libs
 	-rm -rf perfglue/.libs perfglue/_libs
@@ -20955,6 +24188,7 @@ clean-libtool:
 	-rm -rf test/libradosstriper/.libs test/libradosstriper/_libs
 	-rm -rf test/librbd/.libs test/librbd/_libs
 	-rm -rf test/system/.libs test/system/_libs
+	-rm -rf tracing/.libs tracing/_libs
 install-pythonPYTHON: $(python_PYTHON)
 	@$(NORMAL_INSTALL)
 	@list='$(python_PYTHON)'; dlist=; list2=; test -n "$(pythondir)" || list=; \
@@ -21447,6 +24681,20 @@ unittest_erasure_code_shec_thread.log: unittest_erasure_code_shec_thread$(EXEEXT
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+unittest_erasure_code_shec_arguments.log: unittest_erasure_code_shec_arguments$(EXEEXT)
+	@p='unittest_erasure_code_shec_arguments$(EXEEXT)'; \
+	b='unittest_erasure_code_shec_arguments'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+unittest_erasure_code_plugin_shec.log: unittest_erasure_code_plugin_shec$(EXEEXT)
+	@p='unittest_erasure_code_plugin_shec$(EXEEXT)'; \
+	b='unittest_erasure_code_plugin_shec'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 unittest_erasure_code_example.log: unittest_erasure_code_example$(EXEEXT)
 	@p='unittest_erasure_code_example$(EXEEXT)'; \
 	b='unittest_erasure_code_example'; \
@@ -21475,13 +24723,6 @@ unittest_rbd_replay.log: unittest_rbd_replay$(EXEEXT)
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
-unittest_librbd.log: unittest_librbd$(EXEEXT)
-	@p='unittest_librbd$(EXEEXT)'; \
-	b='unittest_librbd'; \
-	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
-	--log-file $$b.log --trs-file $$b.trs \
-	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
-	"$$tst" $(AM_TESTS_FD_REDIRECT)
 unittest_encoding.log: unittest_encoding$(EXEEXT)
 	@p='unittest_encoding$(EXEEXT)'; \
 	b='unittest_encoding'; \
@@ -21566,16 +24807,30 @@ unittest_osd_osdcap.log: unittest_osd_osdcap$(EXEEXT)
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
-unittest_chain_xattr.log: unittest_chain_xattr$(EXEEXT)
-	@p='unittest_chain_xattr$(EXEEXT)'; \
-	b='unittest_chain_xattr'; \
+unittest_pageset.log: unittest_pageset$(EXEEXT)
+	@p='unittest_pageset$(EXEEXT)'; \
+	b='unittest_pageset'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+unittest_rocksdb_option_static.log: unittest_rocksdb_option_static$(EXEEXT)
+	@p='unittest_rocksdb_option_static$(EXEEXT)'; \
+	b='unittest_rocksdb_option_static'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+unittest_rocksdb_option.log: unittest_rocksdb_option$(EXEEXT)
+	@p='unittest_rocksdb_option$(EXEEXT)'; \
+	b='unittest_rocksdb_option'; \
 	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
-unittest_flatindex.log: unittest_flatindex$(EXEEXT)
-	@p='unittest_flatindex$(EXEEXT)'; \
-	b='unittest_flatindex'; \
+unittest_chain_xattr.log: unittest_chain_xattr$(EXEEXT)
+	@p='unittest_chain_xattr$(EXEEXT)'; \
+	b='unittest_chain_xattr'; \
 	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
@@ -21622,6 +24877,13 @@ unittest_histogram.log: unittest_histogram$(EXEEXT)
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+unittest_prioritized_queue.log: unittest_prioritized_queue$(EXEEXT)
+	@p='unittest_prioritized_queue$(EXEEXT)'; \
+	b='unittest_prioritized_queue'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 unittest_str_map.log: unittest_str_map$(EXEEXT)
 	@p='unittest_str_map$(EXEEXT)'; \
 	b='unittest_str_map'; \
@@ -21783,6 +25045,13 @@ unittest_bufferlist.log: unittest_bufferlist$(EXEEXT)
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+unittest_xlist.log: unittest_xlist$(EXEEXT)
+	@p='unittest_xlist$(EXEEXT)'; \
+	b='unittest_xlist'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 unittest_crc32c.log: unittest_crc32c$(EXEEXT)
 	@p='unittest_crc32c$(EXEEXT)'; \
 	b='unittest_crc32c'; \
@@ -21951,6 +25220,13 @@ unittest_bit_vector.log: unittest_bit_vector$(EXEEXT)
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+ceph-detect-init/run-tox.sh.log: ceph-detect-init/run-tox.sh
+	@p='ceph-detect-init/run-tox.sh'; \
+	b='ceph-detect-init/run-tox.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 test/erasure-code/test-erasure-code.sh.log: test/erasure-code/test-erasure-code.sh
 	@p='test/erasure-code/test-erasure-code.sh'; \
 	b='test/erasure-code/test-erasure-code.sh'; \
@@ -21958,6 +25234,20 @@ test/erasure-code/test-erasure-code.sh.log: test/erasure-code/test-erasure-code.
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/erasure-code/test-erasure-eio.sh.log: test/erasure-code/test-erasure-eio.sh
+	@p='test/erasure-code/test-erasure-eio.sh'; \
+	b='test/erasure-code/test-erasure-eio.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/run-rbd-unit-tests.sh.log: test/run-rbd-unit-tests.sh
+	@p='test/run-rbd-unit-tests.sh'; \
+	b='test/run-rbd-unit-tests.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 test/ceph_objectstore_tool.py.log: test/ceph_objectstore_tool.py
 	@p='test/ceph_objectstore_tool.py'; \
 	b='test/ceph_objectstore_tool.py'; \
@@ -21993,6 +25283,13 @@ test/cephtool-test-mds.sh.log: test/cephtool-test-mds.sh
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/cephtool-test-rados.sh.log: test/cephtool-test-rados.sh
+	@p='test/cephtool-test-rados.sh'; \
+	b='test/cephtool-test-rados.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 unittest_bufferlist.sh.log: unittest_bufferlist.sh
 	@p='unittest_bufferlist.sh'; \
 	b='unittest_bufferlist.sh'; \
@@ -22028,6 +25325,13 @@ test/mon/osd-crush.sh.log: test/mon/osd-crush.sh
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/mon/mon-ping.sh.log: test/mon/mon-ping.sh
+	@p='test/mon/mon-ping.sh'; \
+	b='test/mon/mon-ping.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 test/mon/osd-erasure-code-profile.sh.log: test/mon/osd-erasure-code-profile.sh
 	@p='test/mon/osd-erasure-code-profile.sh'; \
 	b='test/mon/osd-erasure-code-profile.sh'; \
@@ -22042,6 +25346,13 @@ test/mon/mkfs.sh.log: test/mon/mkfs.sh
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/mon/mon-scrub.sh.log: test/mon/mon-scrub.sh
+	@p='test/mon/mon-scrub.sh'; \
+	b='test/mon/mon-scrub.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 test/osd/osd-scrub-repair.sh.log: test/osd/osd-scrub-repair.sh
 	@p='test/osd/osd-scrub-repair.sh'; \
 	b='test/osd/osd-scrub-repair.sh'; \
@@ -22077,9 +25388,16 @@ test/mon/mon-handle-forward.sh.log: test/mon/mon-handle-forward.sh
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
-test/ceph-disk-root.sh.log: test/ceph-disk-root.sh
-	@p='test/ceph-disk-root.sh'; \
-	b='test/ceph-disk-root.sh'; \
+test/libradosstriper/rados-striper.sh.log: test/libradosstriper/rados-striper.sh
+	@p='test/libradosstriper/rados-striper.sh'; \
+	b='test/libradosstriper/rados-striper.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/test_objectstore_memstore.sh.log: test/test_objectstore_memstore.sh
+	@p='test/test_objectstore_memstore.sh'; \
+	b='test/test_objectstore_memstore.sh'; \
 	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
@@ -22098,6 +25416,27 @@ test/pybind/test_ceph_argparse.py.log: test/pybind/test_ceph_argparse.py
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/pybind/test_ceph_daemon.py.log: test/pybind/test_ceph_daemon.py
+	@p='test/pybind/test_ceph_daemon.py'; \
+	b='test/pybind/test_ceph_daemon.py'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+../qa/workunits/erasure-code/encode-decode-non-regression.sh.log: ../qa/workunits/erasure-code/encode-decode-non-regression.sh
+	@p='../qa/workunits/erasure-code/encode-decode-non-regression.sh'; \
+	b='../qa/workunits/erasure-code/encode-decode-non-regression.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/encoding/readable.sh.log: test/encoding/readable.sh
+	@p='test/encoding/readable.sh'; \
+	b='test/encoding/readable.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 .test.log:
 	@p='$<'; \
 	$(am__set_b); \
@@ -22173,16 +25512,16 @@ distdir: $(DISTFILES)
 	  dist-hook
 check-am: all-am
 	$(MAKE) $(AM_MAKEFLAGS) $(check_PROGRAMS) $(check_SCRIPTS)
-	$(MAKE) $(AM_MAKEFLAGS) check-TESTS check-local
+	$(MAKE) $(AM_MAKEFLAGS) check-TESTS
 check: $(BUILT_SOURCES)
 	$(MAKE) $(AM_MAKEFLAGS) check-recursive
 all-am: Makefile $(LIBRARIES) $(LTLIBRARIES) $(PROGRAMS) $(SCRIPTS) \
-		$(DATA) $(HEADERS) acconfig.h
+		$(DATA) $(HEADERS) acconfig.h all-local
 install-binPROGRAMS: install-libLTLIBRARIES
 
 installdirs: installdirs-recursive
 installdirs-am:
-	for dir in "$(DESTDIR)$(erasure_codelibdir)" "$(DESTDIR)$(libdir)" "$(DESTDIR)$(radoslibdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(ceph_libexecdir)" "$(DESTDIR)$(ceph_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(shell_commondir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(pythondir)" "$(DESTDIR)$(bash_completiondir)" "$(DESTDIR)$(docdir)" "$(DESTDIR)$(libcephfs_includedir)" "$(DESTDIR)$(librbd_includ [...]
+	for dir in "$(DESTDIR)$(erasure_codelibdir)" "$(DESTDIR)$(libdir)" "$(DESTDIR)$(radoslibdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(ceph_libexecdir)" "$(DESTDIR)$(ceph_monstore_update_crushdir)" "$(DESTDIR)$(ceph_sbindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(shell_commondir)" "$(DESTDIR)$(su_sbindir)" "$(DESTDIR)$(pythondir)" "$(DESTDIR)$(bash_completiondir)" "$(DESTDIR)$(docdir)" "$(DESTDIR)$(libc [...]
 	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
 	done
 install: $(BUILT_SOURCES)
@@ -22230,12 +25569,16 @@ distclean-generic:
 	-rm -f civetweb/src/$(am__dirstamp)
 	-rm -f client/$(DEPDIR)/$(am__dirstamp)
 	-rm -f client/$(am__dirstamp)
+	-rm -f cls/cephfs/$(DEPDIR)/$(am__dirstamp)
+	-rm -f cls/cephfs/$(am__dirstamp)
 	-rm -f cls/hello/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/hello/$(am__dirstamp)
 	-rm -f cls/lock/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/lock/$(am__dirstamp)
 	-rm -f cls/log/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/log/$(am__dirstamp)
+	-rm -f cls/numops/$(DEPDIR)/$(am__dirstamp)
+	-rm -f cls/numops/$(am__dirstamp)
 	-rm -f cls/rbd/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/rbd/$(am__dirstamp)
 	-rm -f cls/refcount/$(DEPDIR)/$(am__dirstamp)
@@ -22246,12 +25589,16 @@ distclean-generic:
 	-rm -f cls/rgw/$(am__dirstamp)
 	-rm -f cls/statelog/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/statelog/$(am__dirstamp)
+	-rm -f cls/timeindex/$(DEPDIR)/$(am__dirstamp)
+	-rm -f cls/timeindex/$(am__dirstamp)
 	-rm -f cls/user/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/user/$(am__dirstamp)
 	-rm -f cls/version/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/version/$(am__dirstamp)
 	-rm -f common/$(DEPDIR)/$(am__dirstamp)
 	-rm -f common/$(am__dirstamp)
+	-rm -f compressor/$(DEPDIR)/$(am__dirstamp)
+	-rm -f compressor/$(am__dirstamp)
 	-rm -f crush/$(DEPDIR)/$(am__dirstamp)
 	-rm -f crush/$(am__dirstamp)
 	-rm -f erasure-code/$(DEPDIR)/$(am__dirstamp)
@@ -22306,6 +25653,10 @@ distclean-generic:
 	-rm -f objclass/$(am__dirstamp)
 	-rm -f os/$(DEPDIR)/$(am__dirstamp)
 	-rm -f os/$(am__dirstamp)
+	-rm -f os/fs/$(DEPDIR)/$(am__dirstamp)
+	-rm -f os/fs/$(am__dirstamp)
+	-rm -f os/newstore/$(DEPDIR)/$(am__dirstamp)
+	-rm -f os/newstore/$(am__dirstamp)
 	-rm -f osd/$(DEPDIR)/$(am__dirstamp)
 	-rm -f osd/$(am__dirstamp)
 	-rm -f osdc/$(DEPDIR)/$(am__dirstamp)
@@ -22330,6 +25681,8 @@ distclean-generic:
 	-rm -f test/cls_lock/$(am__dirstamp)
 	-rm -f test/cls_log/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/cls_log/$(am__dirstamp)
+	-rm -f test/cls_numops/$(DEPDIR)/$(am__dirstamp)
+	-rm -f test/cls_numops/$(am__dirstamp)
 	-rm -f test/cls_rbd/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/cls_rbd/$(am__dirstamp)
 	-rm -f test/cls_refcount/$(DEPDIR)/$(am__dirstamp)
@@ -22390,6 +25743,8 @@ distclean-generic:
 	-rm -f tools/cephfs/$(am__dirstamp)
 	-rm -f tools/rados/$(DEPDIR)/$(am__dirstamp)
 	-rm -f tools/rados/$(am__dirstamp)
+	-rm -f tracing/$(DEPDIR)/$(am__dirstamp)
+	-rm -f tracing/$(am__dirstamp)
 
 maintainer-clean-generic:
 	@echo "This command is intended for maintainers to use"
@@ -22405,7 +25760,7 @@ clean-am: clean-binPROGRAMS clean-checkPROGRAMS \
 	clean-sbinPROGRAMS clean-su_sbinPROGRAMS mostlyclean-am
 
 distclean: distclean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/hello/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) crush/$(DEPDIR) erasure-code/$(DEPDIR) erasure-code/isa/$(DEPDIR) erasure-code/isa/isa-l/erasure_code/$(DEPDIR) er [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush/$(DEPDIR) erasure-cod [...]
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-hdr distclean-tags
@@ -22422,8 +25777,9 @@ info: info-recursive
 
 info-am:
 
-install-data-am: install-bash_completionDATA install-ceph_sbinSCRIPTS \
-	install-data-local install-docDATA \
+install-data-am: install-bash_completionDATA \
+	install-ceph_monstore_update_crushSCRIPTS \
+	install-ceph_sbinSCRIPTS install-data-local install-docDATA \
 	install-erasure_codelibLTLIBRARIES \
 	install-libcephfs_includeDATA install-librbd_includeDATA \
 	install-pythonPYTHON install-rados_includeDATA \
@@ -22461,7 +25817,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/hello/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) crush/$(DEPDIR) erasure-code/$(DEPDIR) erasure-code/isa/$(DEPDIR) erasure-code/isa/isa-l/erasure_code/$(DEPDIR) er [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush/$(DEPDIR) erasure-cod [...]
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
@@ -22480,6 +25836,7 @@ ps-am:
 
 uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	uninstall-binSCRIPTS uninstall-ceph_libexecSCRIPTS \
+	uninstall-ceph_monstore_update_crushSCRIPTS \
 	uninstall-ceph_sbinSCRIPTS uninstall-dist_binSCRIPTS \
 	uninstall-docDATA uninstall-erasure_codelibLTLIBRARIES \
 	uninstall-libLTLIBRARIES uninstall-libcephfs_includeDATA \
@@ -22493,8 +25850,8 @@ uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 .MAKE: $(am__recursive_targets) all check check-am install install-am \
 	install-strip
 
-.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \
-	check-TESTS check-am check-local clean clean-binPROGRAMS \
+.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am all-local \
+	check check-TESTS check-am clean clean-binPROGRAMS \
 	clean-checkPROGRAMS clean-erasure_codelibLTLIBRARIES \
 	clean-generic clean-libLTLIBRARIES clean-libtool clean-local \
 	clean-noinstLIBRARIES clean-noinstLTLIBRARIES \
@@ -22505,12 +25862,13 @@ uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	distclean-tags distdir dvi dvi-am html html-am info info-am \
 	install install-am install-bash_completionDATA \
 	install-binPROGRAMS install-binSCRIPTS \
-	install-ceph_libexecSCRIPTS install-ceph_sbinSCRIPTS \
-	install-data install-data-am install-data-local \
-	install-dist_binSCRIPTS install-docDATA install-dvi \
-	install-dvi-am install-erasure_codelibLTLIBRARIES install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-libLTLIBRARIES \
+	install-ceph_libexecSCRIPTS \
+	install-ceph_monstore_update_crushSCRIPTS \
+	install-ceph_sbinSCRIPTS install-data install-data-am \
+	install-data-local install-dist_binSCRIPTS install-docDATA \
+	install-dvi install-dvi-am install-erasure_codelibLTLIBRARIES \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-libLTLIBRARIES \
 	install-libcephfs_includeDATA install-librbd_includeDATA \
 	install-man install-pdf install-pdf-am install-ps \
 	install-ps-am install-pythonPYTHON install-rados_includeDATA \
@@ -22524,6 +25882,7 @@ uninstall-am: uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	recheck tags tags-am uninstall uninstall-am \
 	uninstall-bash_completionDATA uninstall-binPROGRAMS \
 	uninstall-binSCRIPTS uninstall-ceph_libexecSCRIPTS \
+	uninstall-ceph_monstore_update_crushSCRIPTS \
 	uninstall-ceph_sbinSCRIPTS uninstall-dist_binSCRIPTS \
 	uninstall-docDATA uninstall-erasure_codelibLTLIBRARIES \
 	uninstall-libLTLIBRARIES uninstall-libcephfs_includeDATA \
@@ -22540,9 +25899,35 @@ export VERBOSE = true
 
 # python unit tests need to know where the scripts are located
 export PYTHONPATH=$(top_srcdir)/src/pybind
+
+		ifeq ($(SET_STACK_PROTECTOR_STRONG),1)
+				HARDENING_CFLAGS += -fstack-protector-strong
+		else
+				HARDENING_CFLAGS += -fstack-protector
+		endif
 @CLANG_FALSE@	AM_COMMON_CFLAGS += -rdynamic
 @CLANG_FALSE@	AM_CXXFLAGS += -Wstrict-null-sentinel
 
+ at NO_GIT_VERSION_TRUE@export NO_VERSION="yes"
+
+all-local::
+	cd $(srcdir)/ceph-detect-init ; python setup.py build
+
+clean-local::
+	cd $(srcdir)/ceph-detect-init ; python setup.py clean ; rm -fr wheelhouse .tox build .coverage *.egg-info
+
+install-data-local::
+	cd $(srcdir)/ceph-detect-init ; \
+	if test "$(DESTDIR)" ; then \
+		if lsb_release -si | grep --quiet 'Ubuntu\|Debian\|Devuan' ; then \
+			options=--install-layout=deb ; \
+		else \
+			options=--prefix=/usr ; \
+		fi ; \
+		root="--root=$(DESTDIR)" ; \
+	fi ; \
+	python setup.py install $$root $$options
+
 #crush_includedir = $(includedir)/crush
 #crush_include_DATA = \
 #	$(srcdir)/crush/hash.h \
@@ -22558,6 +25943,8 @@ erasure-code/lrc/ErasureCodePluginLrc.cc: ./ceph_ver.h
 
 erasure-code/shec/ErasureCodePluginShec.cc: ./ceph_ver.h
 
+erasure-code/shec/ErasureCodePluginSelectShec.cc: ./ceph_ver.h
+
 @WITH_BETTER_YASM_ELF64_TRUE at erasure-code/isa/ErasureCodePluginIsa.cc: ./ceph_ver.h
 erasure-code/ErasureCodePlugin.cc: ./ceph_ver.h
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/ErasureCodePluginExample.cc: ./ceph_ver.h
@@ -22569,14 +25956,16 @@ erasure-code/ErasureCodePlugin.cc: ./ceph_ver.h
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/TestJerasurePluginSSE4.cc: ./ceph_ver.h
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/TestJerasurePluginSSE3.cc: ./ceph_ver.h
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/TestJerasurePluginGeneric.cc: ./ceph_ver.h
-
-docker-check:
-	$(srcdir)/test/container-make-check-ubuntu-14.04.sh
-	$(srcdir)/test/container-make-check-centos-centos7.sh
-	$(srcdir)/test/container-make-check-centos-centos7.sh
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/TestShecPluginNEON.cc: ./ceph_ver.h
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/TestShecPluginSSE4.cc: ./ceph_ver.h
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/TestShecPluginSSE3.cc: ./ceph_ver.h
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at test/erasure-code/TestShecPluginGeneric.cc: ./ceph_ver.h
 
 # target to build but not run the unit tests
 unittests:: $(check_PROGRAMS)
+
+ at WITH_LTTNG_TRUE@tracing/%.h: tracing/%.tp
+ at WITH_LTTNG_TRUE@	$(LTTNG_GEN_TP_PROG) $< -o tracing/$*.h
 $(shell_scripts): Makefile
 $(shell_scripts): %: %.in
 	rm -f $@ $@.tmp
@@ -22589,10 +25978,6 @@ $(shell_scripts): %: %.in
 # NOTE: this won't work on suse, where docdir is /usr/share/doc/packages/$package.
 docdir ?= ${datadir}/doc/ceph
 
-check-local:
-	$(top_srcdir)/qa/workunits/erasure-code/encode-decode-non-regression.sh 
-	$(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
-
 # base targets
 
 core-daemons: ceph-mon ceph-osd ceph-mds radosgw
@@ -22607,19 +25992,17 @@ base: core-daemons admin-tools \
 
 FORCE:
 .git_version: FORCE
-	$(srcdir)/check_version $(srcdir)/.git_version
+	$(srcdir)/make_version -g $(srcdir)/.git_version
 
 # if NO_VERSION is set, only generate a new ceph_ver.h if there currently 
 # is none, and call "make_version -n" to fill it with a fixed string.
 # Otherwise, set it from the contents of .git_version.
 
-ceph_ver.h: .git_version
+ceph_ver.h: .git_version FORCE
 	if [ -n "$$NO_VERSION" ] ; then \
-	    if [ ! -f ./ceph_ver.h ] ; then \
-	        $(srcdir)/make_version -n ./ceph_ver.h ; \
-	    fi; \
-        else \
-	    $(srcdir)/make_version $(srcdir)/.git_version ./ceph_ver.h ; \
+		$(srcdir)/make_version -g $(srcdir)/.git_version -c $(srcdir)/ceph_ver.h -n ; \
+	else \
+		$(srcdir)/make_version -g $(srcdir)/.git_version -c $(srcdir)/ceph_ver.h ; \
 	fi
 
 ceph_ver.c: ./ceph_ver.h
@@ -22630,11 +26013,11 @@ sample.fetch_config: fetch_config
 	cp -f $(srcdir)/fetch_config ./sample.fetch_config
 
 dist-hook:
-	$(srcdir)/check_version $(srcdir)/.git_version
+	$(srcdir)/make_version -g $(srcdir)/.git_version
 
 # cleaning
 
-clean-local:
+clean-local::
 	rm -f *.so 
 	find . -name '*.gcno' -o -name '*.gcda' -o -name '*.lcov' -o -name "*.o" -o -name "*.lo" | xargs rm -f
 	rm -f ceph java/java/com/ceph/crush/Bucket.class
@@ -22655,12 +26038,12 @@ uninstall-coverage:
 check-coverage:
 @ENABLE_COVERAGE_TRUE@	-test/coverage.sh -d $(srcdir) -o check-coverage make check
 
-install-data-local: install-coverage
+install-data-local:: install-coverage
 	-mkdir -p $(DESTDIR)$(sysconfdir)/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/log/ceph
 	-mkdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp
 
-uninstall-local: uninstall-coverage
+uninstall-local:: uninstall-coverage
 	-rmdir -p $(DESTDIR)$(sysconfdir)/ceph/
 	-rmdir -p $(DESTDIR)$(localstatedir)/log/ceph
 	-rmdir -p $(DESTDIR)$(localstatedir)/lib/ceph/tmp
@@ -22687,11 +26070,10 @@ coverity-submit:
 
 @ENABLE_CLIENT_TRUE at ceph: ceph.in ./ceph_ver.h Makefile
 @ENABLE_CLIENT_TRUE@	rm -f $@ $@.tmp
- at ENABLE_CLIENT_TRUE@	echo "#!/usr/bin/env python" >$@.tmp
- at ENABLE_CLIENT_TRUE@	grep "#define CEPH_GIT_NICE_VER" ./ceph_ver.h | \
- at ENABLE_CLIENT_TRUE@		sed -e 's/#define \(.*VER\) /\1=/' >>$@.tmp
- at ENABLE_CLIENT_TRUE@	grep "#define CEPH_GIT_VER" ./ceph_ver.h | \
- at ENABLE_CLIENT_TRUE@	  sed -e 's/#define \(.*VER\) /\1=/' -e 's/=\(.*\)$$/="\1"/' >>$@.tmp
+ at ENABLE_CLIENT_TRUE@	cp $@.in $@.tmp
+ at ENABLE_CLIENT_TRUE@	sed -ie "s|@PYTHON_EXECUTABLE@|/usr/bin/env python|" $@.tmp
+ at ENABLE_CLIENT_TRUE@	grep CEPH_GIT_NICE_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_NICE_VER@/{}/g" $@.tmp
+ at ENABLE_CLIENT_TRUE@	grep CEPH_GIT_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_VER@/{}/g" $@.tmp
 @ENABLE_CLIENT_TRUE@	cat $(srcdir)/$@.in >>$@.tmp
 @ENABLE_CLIENT_TRUE@	chmod a+x $@.tmp
 @ENABLE_CLIENT_TRUE@	chmod a-w $@.tmp
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 2e8dbfd..8b74b6c 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -21,6 +21,9 @@
 /* FastCGI headers are in /usr/include/fastcgi */
 #undef FASTCGI_INCLUDE_DIR
 
+/* Support ARMv8 CRC instructions */
+#undef HAVE_ARMV8_CRC
+
 /* Define to 1 if you have the <arpa/inet.h> header file. */
 #undef HAVE_ARPA_INET_H
 
@@ -65,8 +68,8 @@
 /* Define if the C complier supports __func__ */
 #undef HAVE_FUNC
 
-/* Define to 1 if you have the `fuse_getgroups' function. */
-#undef HAVE_FUSE_GETGROUPS
+/* Define to 1 if you have the `getgrouplist' function. */
+#undef HAVE_GETGROUPLIST
 
 /* we have a recent yasm and are x86_64 */
 #undef HAVE_GOOD_YASM_ELF64
@@ -112,6 +115,13 @@
    (-lboost_program_options-mt). */
 #undef HAVE_LIBBOOST_PROGRAM_OPTIONS_MT
 
+/* Define to 1 if you have the `boost_random' library (-lboost_random). */
+#undef HAVE_LIBBOOST_RANDOM
+
+/* Define to 1 if you have the `boost_random-mt' library (-lboost_random-mt).
+   */
+#undef HAVE_LIBBOOST_RANDOM_MT
+
 /* Define to 1 if you have the `boost_regex' library (-lboost_regex). */
 #undef HAVE_LIBBOOST_REGEX
 
@@ -129,8 +139,8 @@
    */
 #undef HAVE_LIBBOOST_THREAD_MT
 
-/* Define if you have fuse */
-#undef HAVE_LIBFUSE
+/* Define to 1 if you have the `curl' library (-lcurl). */
+#undef HAVE_LIBCURL
 
 /* Define to 1 if you have the `ibverbs' library (-libverbs). */
 #undef HAVE_LIBIBVERBS
@@ -159,6 +169,9 @@
 /* Define to 1 if you have the `xio' library (-lxio). */
 #undef HAVE_LIBXIO
 
+/* Define to 1 if you have the `xml2' library (-lxml2). */
+#undef HAVE_LIBXML2
+
 /* Defined if you have libzfs enabled */
 #undef HAVE_LIBZFS
 
@@ -171,6 +184,9 @@
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H
 
+/* name_to_handle_at exists */
+#undef HAVE_NAME_TO_HANDLE_AT
+
 /* Define to 1 if you have the <ndir.h> header file, and it defines `DIR'. */
 #undef HAVE_NDIR_H
 
diff --git a/src/arch/arm.c b/src/arch/arm.c
index 93d079a..67c5373 100644
--- a/src/arch/arm.c
+++ b/src/arch/arm.c
@@ -2,6 +2,7 @@
 
 /* flags we export */
 int ceph_arch_neon = 0;
+int ceph_arch_aarch64_crc32 = 0;
 
 #include <stdio.h>
 
@@ -47,6 +48,11 @@ int ceph_arch_arm_probe(void)
 	ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
 #elif __aarch64__ && __linux__
 	ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
+# ifdef HWCAP_CRC32
+	ceph_arch_aarch64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
+# else
+	ceph_arch_aarch64_crc32 = 0;  // sorry!
+# endif
 #else
 	if (0)
 		get_hwcap();  // make compiler shut up
diff --git a/src/arch/arm.h b/src/arch/arm.h
index f613438..1659b2e 100644
--- a/src/arch/arm.h
+++ b/src/arch/arm.h
@@ -6,6 +6,7 @@ extern "C" {
 #endif
 
 extern int ceph_arch_neon;  /* true if we have ARM NEON or ASIMD abilities */
+extern int ceph_arch_aarch64_crc32;  /* true if we have AArch64 CRC32/CRC32C abilities */
 
 extern int ceph_arch_arm_probe(void);
 
diff --git a/src/auth/Auth.h b/src/auth/Auth.h
index 2d89a03..12d4909 100644
--- a/src/auth/Auth.h
+++ b/src/auth/Auth.h
@@ -136,7 +136,7 @@ struct AuthAuthorizer {
   bufferlist bl;
   CryptoKey session_key;
 
-  AuthAuthorizer(__u32 p) : protocol(p) {}
+  explicit AuthAuthorizer(__u32 p) : protocol(p) {}
   virtual ~AuthAuthorizer() {}
   virtual bool verify_reply(bufferlist::iterator& reply) = 0;
 };
diff --git a/src/auth/Crypto.cc b/src/auth/Crypto.cc
index e401c96..24c4bd0 100644
--- a/src/auth/Crypto.cc
+++ b/src/auth/Crypto.cc
@@ -17,7 +17,7 @@
 # include <cryptopp/modes.h>
 # include <cryptopp/aes.h>
 # include <cryptopp/filters.h>
-#elif USE_NSS
+#elif defined(USE_NSS)
 # include <nspr.h>
 # include <nss.h>
 # include <pk11pub.h>
@@ -62,160 +62,273 @@ uint64_t get_random(uint64_t min_val, uint64_t max_val)
   return r;
 }
 
+
 // ---------------------------------------------------
 
-int CryptoNone::create(bufferptr& secret)
-{
-  return 0;
-}
+class CryptoNoneKeyHandler : public CryptoKeyHandler {
+public:
+  int encrypt(const bufferlist& in,
+	       bufferlist& out, std::string *error) const {
+    out = in;
+    return 0;
+  }
+  int decrypt(const bufferlist& in,
+	      bufferlist& out, std::string *error) const {
+    out = in;
+    return 0;
+  }
+};
+
+class CryptoNone : public CryptoHandler {
+public:
+  CryptoNone() { }
+  ~CryptoNone() {}
+  int get_type() const {
+    return CEPH_CRYPTO_NONE;
+  }
+  int create(bufferptr& secret) {
+    return 0;
+  }
+  int validate_secret(const bufferptr& secret) {
+    return 0;
+  }
+  CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error) {
+    return new CryptoNoneKeyHandler;
+  }
+};
 
-int CryptoNone::validate_secret(bufferptr& secret)
-{
-  return 0;
-}
 
-void CryptoNone::encrypt(const bufferptr& secret, const bufferlist& in,
-			 bufferlist& out, std::string &error) const
-{
-  out = in;
-}
+// ---------------------------------------------------
 
-void CryptoNone::decrypt(const bufferptr& secret, const bufferlist& in,
-			 bufferlist& out, std::string &error) const
-{
-  out = in;
-}
 
+class CryptoAES : public CryptoHandler {
+public:
+  CryptoAES() { }
+  ~CryptoAES() {}
+  int get_type() const {
+    return CEPH_CRYPTO_AES;
+  }
+  int create(bufferptr& secret);
+  int validate_secret(const bufferptr& secret);
+  CryptoKeyHandler *get_key_handler(const bufferptr& secret, string& error);
+};
 
-// ---------------------------------------------------
 #ifdef USE_CRYPTOPP
 # define AES_KEY_LEN     ((size_t)CryptoPP::AES::DEFAULT_KEYLENGTH)
 # define AES_BLOCK_LEN   ((size_t)CryptoPP::AES::BLOCKSIZE)
-#elif USE_NSS
-// when we say AES, we mean AES-128
-# define AES_KEY_LEN	16
-# define AES_BLOCK_LEN   16
 
-static void nss_aes_operation(CK_ATTRIBUTE_TYPE op, const bufferptr& secret,
-			     const bufferlist& in, bufferlist& out, std::string &error)
-{
-  const CK_MECHANISM_TYPE mechanism = CKM_AES_CBC_PAD;
+class CryptoAESKeyHandler : public CryptoKeyHandler {
+public:
+  CryptoPP::AES::Encryption *enc_key;
+  CryptoPP::AES::Decryption *dec_key;
+
+  CryptoAESKeyHandler()
+    : enc_key(NULL),
+      dec_key(NULL) {}
+  ~CryptoAESKeyHandler() {
+    delete enc_key;
+    delete dec_key;
+  }
 
-  // sample source said this has to be at least size of input + 8,
-  // but i see 15 still fail with SEC_ERROR_OUTPUT_LEN
-  bufferptr out_tmp(in.length()+16);
+  int init(const bufferptr& s, ostringstream& err) {
+    secret = s;
 
-  PK11SlotInfo *slot;
+    enc_key = new CryptoPP::AES::Encryption(
+      (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH);
+    dec_key = new CryptoPP::AES::Decryption(
+      (byte*)secret.c_str(), CryptoPP::AES::DEFAULT_KEYLENGTH);
 
-  slot = PK11_GetBestSlot(mechanism, NULL);
-  if (!slot) {
-    ostringstream oss;
-    oss << "cannot find NSS slot to use: " << PR_GetError();
-    error = oss.str();
-    goto err;
+    return 0;
   }
 
-  SECItem keyItem;
-
-  keyItem.type = siBuffer;
-  keyItem.data = (unsigned char*)secret.c_str();
-  keyItem.len = secret.length();
-
-  PK11SymKey *key;
+  int encrypt(const bufferlist& in,
+	      bufferlist& out, std::string *error) const {
+    string ciphertext;
+    CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext);
+    CryptoPP::CBC_Mode_ExternalCipher::Encryption cbc(
+      *enc_key, (const byte*)CEPH_AES_IV);
+    CryptoPP::StreamTransformationFilter stfEncryptor(cbc, sink);
 
-  key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT,
-			  &keyItem, NULL);
-  if (!key) {
-    ostringstream oss;
-    oss << "cannot convert AES key for NSS: " << PR_GetError();
-    error = oss.str();
-    goto err_slot;
+    for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
+	 it != in.buffers().end(); ++it) {
+      const unsigned char *in_buf = (const unsigned char *)it->c_str();
+      stfEncryptor.Put(in_buf, it->length());
+    }
+    try {
+      stfEncryptor.MessageEnd();
+    } catch (CryptoPP::Exception& e) {
+      if (error) {
+	ostringstream oss;
+	oss << "encryptor.MessageEnd::Exception: " << e.GetWhat();
+	*error = oss.str();
+      }
+      return -1;
+    }
+    out.append((const char *)ciphertext.c_str(), ciphertext.length());
+    return 0;
   }
 
-  SECItem ivItem;
-
-  ivItem.type = siBuffer;
-  // losing constness due to SECItem.data; IV should never be
-  // modified, regardless
-  ivItem.data = (unsigned char*)CEPH_AES_IV;
-  ivItem.len = sizeof(CEPH_AES_IV);
+  int decrypt(const bufferlist& in,
+	      bufferlist& out, std::string *error) const {
+    string decryptedtext;
+    CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext);
+    CryptoPP::CBC_Mode_ExternalCipher::Decryption cbc(
+      *dec_key, (const byte*)CEPH_AES_IV );
+    CryptoPP::StreamTransformationFilter stfDecryptor(cbc, sink);
+    for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
+	 it != in.buffers().end(); ++it) {
+      const unsigned char *in_buf = (const unsigned char *)it->c_str();
+      stfDecryptor.Put(in_buf, it->length());
+    }
 
-  SECItem *param;
+    try {
+      stfDecryptor.MessageEnd();
+    } catch (CryptoPP::Exception& e) {
+      if (error) {
+	ostringstream oss;
+	oss << "decryptor.MessageEnd::Exception: " << e.GetWhat();
+	*error = oss.str();
+      }
+      return -1;
+    }
 
-  param = PK11_ParamFromIV(mechanism, &ivItem);
-  if (!param) {
-    ostringstream oss;
-    oss << "cannot set NSS IV param: " << PR_GetError();
-    error = oss.str();
-    goto err_key;
+    out.append((const char *)decryptedtext.c_str(), decryptedtext.length());
+    return 0;
   }
+};
 
-  PK11Context *ctx;
+#elif defined(USE_NSS)
+// when we say AES, we mean AES-128
+# define AES_KEY_LEN	16
+# define AES_BLOCK_LEN   16
 
-  ctx = PK11_CreateContextBySymKey(mechanism, op, key, param);
-  if (!ctx) {
-    ostringstream oss;
-    oss << "cannot create NSS context: " << PR_GetError();
-    error = oss.str();
-    goto err_param;
-  }
+static int nss_aes_operation(CK_ATTRIBUTE_TYPE op,
+			     CK_MECHANISM_TYPE mechanism,
+			     PK11SymKey *key,
+			     SECItem *param,
+			     const bufferlist& in, bufferlist& out,
+			     std::string *error)
+{
+  // sample source said this has to be at least size of input + 8,
+  // but i see 15 still fail with SEC_ERROR_OUTPUT_LEN
+  bufferptr out_tmp(in.length()+16);
+  bufferlist incopy;
 
   SECStatus ret;
   int written;
-  // in is const, and PK11_CipherOp is not; C++ makes this hard to cheat,
-  // so just copy it to a temp buffer, at least for now
-  unsigned in_len;
   unsigned char *in_buf;
-  in_len = in.length();
-  in_buf = (unsigned char*)malloc(in_len);
-  if (!in_buf)
-    throw std::bad_alloc();
-  in.copy(0, in_len, (char*)in_buf);
-  ret = PK11_CipherOp(ctx, (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(),
+
+  PK11Context *ectx;
+  ectx = PK11_CreateContextBySymKey(mechanism, op, key, param);
+  assert(ectx);
+
+  incopy = in;  // it's a shallow copy!
+  in_buf = (unsigned char*)incopy.c_str();
+  ret = PK11_CipherOp(ectx,
+		      (unsigned char*)out_tmp.c_str(), &written, out_tmp.length(),
 		      in_buf, in.length());
-  free(in_buf);
   if (ret != SECSuccess) {
-    ostringstream oss;
-    oss << "NSS AES failed: " << PR_GetError();
-    error = oss.str();
-    goto err_op;
+    PK11_DestroyContext(ectx, PR_TRUE);
+    if (error) {
+      ostringstream oss;
+      oss << "NSS AES failed: " << PR_GetError();
+      *error = oss.str();
+    }
+    return -1;
   }
 
   unsigned int written2;
-  ret = PK11_DigestFinal(ctx, (unsigned char*)out_tmp.c_str()+written, &written2,
+  ret = PK11_DigestFinal(ectx,
+			 (unsigned char*)out_tmp.c_str()+written, &written2,
 			 out_tmp.length()-written);
+  PK11_DestroyContext(ectx, PR_TRUE);
   if (ret != SECSuccess) {
-    ostringstream oss;
-    oss << "NSS AES final round failed: " << PR_GetError();
-    error = oss.str();
-    goto err_op;
+    PK11_DestroyContext(ectx, PR_TRUE);
+    if (error) {
+      ostringstream oss;
+      oss << "NSS AES final round failed: " << PR_GetError();
+      *error = oss.str();
+    }
+    return -1;
   }
 
   out_tmp.set_length(written + written2);
   out.append(out_tmp);
-
-  PK11_DestroyContext(ctx, PR_TRUE);
-  SECITEM_FreeItem(param, PR_TRUE);
-  PK11_FreeSymKey(key);
-  PK11_FreeSlot(slot);
-  return;
-
- err_op:
-  PK11_DestroyContext(ctx, PR_TRUE);
- err_param:
-  SECITEM_FreeItem(param, PR_TRUE);
- err_key:
-  PK11_FreeSymKey(key);
- err_slot:
-  PK11_FreeSlot(slot);
- err:
-  ;
+  return 0;
 }
 
+class CryptoAESKeyHandler : public CryptoKeyHandler {
+  CK_MECHANISM_TYPE mechanism;
+  PK11SlotInfo *slot;
+  PK11SymKey *key;
+  SECItem *param;
+
+public:
+  CryptoAESKeyHandler()
+    : mechanism(CKM_AES_CBC_PAD),
+      slot(NULL),
+      key(NULL),
+      param(NULL) {}
+  ~CryptoAESKeyHandler() {
+    SECITEM_FreeItem(param, PR_TRUE);
+    PK11_FreeSymKey(key);
+    PK11_FreeSlot(slot);
+  }
+
+  int init(const bufferptr& s, ostringstream& err) {
+    secret = s;
+
+    slot = PK11_GetBestSlot(mechanism, NULL);
+    if (!slot) {
+      err << "cannot find NSS slot to use: " << PR_GetError();
+      return -1;
+    }
+
+    SECItem keyItem;
+    keyItem.type = siBuffer;
+    keyItem.data = (unsigned char*)secret.c_str();
+    keyItem.len = secret.length();
+    key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT,
+			    &keyItem, NULL);
+    if (!key) {
+      err << "cannot convert AES key for NSS: " << PR_GetError();
+      return -1;
+    }
+
+    SECItem ivItem;
+    ivItem.type = siBuffer;
+    // losing constness due to SECItem.data; IV should never be
+    // modified, regardless
+    ivItem.data = (unsigned char*)CEPH_AES_IV;
+    ivItem.len = sizeof(CEPH_AES_IV);
+
+    param = PK11_ParamFromIV(mechanism, &ivItem);
+    if (!param) {
+      err << "cannot set NSS IV param: " << PR_GetError();
+      return -1;
+    }
+
+    return 0;
+  }
+
+  int encrypt(const bufferlist& in,
+	      bufferlist& out, std::string *error) const {
+    return nss_aes_operation(CKA_ENCRYPT, mechanism, key, param, in, out, error);
+  }
+  int decrypt(const bufferlist& in,
+	       bufferlist& out, std::string *error) const {
+    return nss_aes_operation(CKA_DECRYPT, mechanism, key, param, in, out, error);
+  }
+};
+
 #else
 # error "No supported crypto implementation found."
 #endif
 
+
+
+// ------------------------------------------------------------
+
 int CryptoAES::create(bufferptr& secret)
 {
   bufferlist bl;
@@ -226,7 +339,7 @@ int CryptoAES::create(bufferptr& secret)
   return 0;
 }
 
-int CryptoAES::validate_secret(bufferptr& secret)
+int CryptoAES::validate_secret(const bufferptr& secret)
 {
   if (secret.length() < (size_t)AES_KEY_LEN) {
     return -EINVAL;
@@ -235,140 +348,103 @@ int CryptoAES::validate_secret(bufferptr& secret)
   return 0;
 }
 
-void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlist& out,
-			std::string &error) const
+CryptoKeyHandler *CryptoAES::get_key_handler(const bufferptr& secret,
+					     string& error)
 {
-  if (secret.length() < AES_KEY_LEN) {
-    error = "key is too short";
-    return;
-  }
-#ifdef USE_CRYPTOPP
-  {
-    const unsigned char *key = (const unsigned char *)secret.c_str();
-
-    string ciphertext;
-    CryptoPP::AES::Encryption aesEncryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH);
-    CryptoPP::CBC_Mode_ExternalCipher::Encryption cbcEncryption( aesEncryption, (const byte*)CEPH_AES_IV );
-    CryptoPP::StringSink *sink = new CryptoPP::StringSink(ciphertext);
-    CryptoPP::StreamTransformationFilter stfEncryptor(cbcEncryption, sink);
-
-    for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
-	 it != in.buffers().end(); ++it) {
-      const unsigned char *in_buf = (const unsigned char *)it->c_str();
-      stfEncryptor.Put(in_buf, it->length());
-    }
-    try {
-      stfEncryptor.MessageEnd();
-    } catch (CryptoPP::Exception& e) {
-      ostringstream oss;
-      oss << "encryptor.MessageEnd::Exception: " << e.GetWhat();
-      error = oss.str();
-      return;
-    }
-    out.append((const char *)ciphertext.c_str(), ciphertext.length());
+  CryptoAESKeyHandler *ckh = new CryptoAESKeyHandler;
+  ostringstream oss;
+  if (ckh->init(secret, oss) < 0) {
+    error = oss.str();
+    return NULL;
   }
-#elif USE_NSS
-  nss_aes_operation(CKA_ENCRYPT, secret, in, out, error);
-#else
-# error "No supported crypto implementation found."
-#endif
+  return ckh;
 }
 
-void CryptoAES::decrypt(const bufferptr& secret, const bufferlist& in, 
-			bufferlist& out, std::string &error) const
-{
-#ifdef USE_CRYPTOPP
-  const unsigned char *key = (const unsigned char *)secret.c_str();
-
-  CryptoPP::AES::Decryption aesDecryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH);
-  CryptoPP::CBC_Mode_ExternalCipher::Decryption cbcDecryption( aesDecryption, (const byte*)CEPH_AES_IV );
 
-  string decryptedtext;
-  CryptoPP::StringSink *sink = new CryptoPP::StringSink(decryptedtext);
-  CryptoPP::StreamTransformationFilter stfDecryptor(cbcDecryption, sink);
-  for (std::list<bufferptr>::const_iterator it = in.buffers().begin(); 
-       it != in.buffers().end(); ++it) {
-      const unsigned char *in_buf = (const unsigned char *)it->c_str();
-      stfDecryptor.Put(in_buf, it->length());
-  }
 
-  try {
-    stfDecryptor.MessageEnd();
-  } catch (CryptoPP::Exception& e) {
-    ostringstream oss;
-    oss << "decryptor.MessageEnd::Exception: " << e.GetWhat();
-    error = oss.str();
-    return;
-  }
 
-  out.append((const char *)decryptedtext.c_str(), decryptedtext.length());
-#elif USE_NSS
-  nss_aes_operation(CKA_DECRYPT, secret, in, out, error);
-#else
-# error "No supported crypto implementation found."
-#endif
-}
+// --
 
 
 // ---------------------------------------------------
 
-int CryptoKey::set_secret(CephContext *cct, int type, bufferptr& s)
-{
-  this->type = type;
-  created = ceph_clock_now(cct);
-
-  CryptoHandler *h = cct->get_crypto_handler(type);
-  if (!h) {
-    lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl;
-    return -EOPNOTSUPP;
-  }
-  int ret = h->validate_secret(s);
 
-  if (ret < 0)
-    return ret;
+void CryptoKey::encode(bufferlist& bl) const
+{
+  ::encode(type, bl);
+  ::encode(created, bl);
+  __u16 len = secret.length();
+  ::encode(len, bl);
+  bl.append(secret);
+}
 
-  secret = s;
+void CryptoKey::decode(bufferlist::iterator& bl)
+{
+  ::decode(type, bl);
+  ::decode(created, bl);
+  __u16 len;
+  ::decode(len, bl);
+  bufferptr tmp;
+  bl.copy(len, tmp);
+  if (_set_secret(type, tmp) < 0)
+    throw buffer::malformed_input("malformed secret");
+}
 
+int CryptoKey::set_secret(int type, const bufferptr& s, utime_t c)
+{
+  int r = _set_secret(type, s);
+  if (r < 0)
+    return r;
+  this->created = c;
   return 0;
 }
 
-int CryptoKey::create(CephContext *cct, int t)
+int CryptoKey::_set_secret(int t, const bufferptr& s)
 {
-  type = t;
-  created = ceph_clock_now(cct);
-
-  CryptoHandler *h = cct->get_crypto_handler(type);
-  if (!h) {
-    lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << type << ") returned NULL" << dendl;
-    return -EOPNOTSUPP;
+  if (s.length() == 0) {
+    secret = s;
+    ckh.reset();
+    return 0;
   }
-  return h->create(secret);
-}
 
-void CryptoKey::encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const
-{
-  if (!ch || ch->get_type() != type) {
-    ch = cct->get_crypto_handler(type);
-    if (!ch) {
-      ostringstream oss;
-      oss << "CryptoKey::encrypt: key type " << type << " not supported.";
-      return;
+  CryptoHandler *ch = CryptoHandler::create(t);
+  if (ch) {
+    int ret = ch->validate_secret(s);
+    if (ret < 0) {
+      delete ch;
+      return ret;
+    }
+    string error;
+    ckh.reset(ch->get_key_handler(s, error));
+    delete ch;
+    if (error.length()) {
+      return -EIO;
     }
   }
-  ch->encrypt(this->secret, in, out, error);
+  type = t;
+  secret = s;
+  return 0;
 }
 
-void CryptoKey::decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const
+int CryptoKey::create(CephContext *cct, int t)
 {
-  if (!ch || ch->get_type() != type) {
-    ch = cct->get_crypto_handler(type);
-    if (!ch) {
-      ostringstream oss;
-      oss << "CryptoKey::decrypt: key type " << type << " not supported.";
-      return;
-    }
+  CryptoHandler *ch = CryptoHandler::create(t);
+  if (!ch) {
+    if (cct)
+      lderr(cct) << "ERROR: cct->get_crypto_handler(type=" << t << ") returned NULL" << dendl;
+    return -EOPNOTSUPP;
   }
-  ch->decrypt(this->secret, in, out, error);
+  bufferptr s;
+  int r = ch->create(s);
+  delete ch;
+  if (r < 0)
+    return r;
+
+  r = _set_secret(t, s);
+  if (r < 0)
+    return r;
+  created = ceph_clock_now(cct);
+  return r;
 }
 
 void CryptoKey::print(std::ostream &out) const
@@ -396,3 +472,18 @@ void CryptoKey::encode_plaintext(bufferlist &bl)
 {
   bl.append(encode_base64());
 }
+
+
+// ------------------
+
+CryptoHandler *CryptoHandler::create(int type)
+{
+  switch (type) {
+  case CEPH_CRYPTO_NONE:
+    return new CryptoNone;
+  case CEPH_CRYPTO_AES:
+    return new CryptoAES;
+  default:
+    return NULL;
+  }
+}
diff --git a/src/auth/Crypto.h b/src/auth/Crypto.h
index c811222..3bfc5aa 100644
--- a/src/auth/Crypto.h
+++ b/src/auth/Crypto.h
@@ -17,6 +17,7 @@
 
 #include "include/types.h"
 #include "include/utime.h"
+#include "include/memory.h"
 
 #include "common/Formatter.h"
 #include "include/buffer.h"
@@ -25,6 +26,22 @@
 
 class CephContext;
 class CryptoHandler;
+class CryptoKeyContext;
+
+/*
+ * some per-key context that is specific to a particular crypto backend
+ */
+class CryptoKeyHandler {
+public:
+  bufferptr secret;
+
+  virtual ~CryptoKeyHandler() {}
+
+  virtual int encrypt(const bufferlist& in,
+		       bufferlist& out, std::string *error) const = 0;
+  virtual int decrypt(const bufferlist& in,
+		       bufferlist& out, std::string *error) const = 0;
+};
 
 /*
  * match encoding of struct ceph_secret
@@ -33,38 +50,32 @@ class CryptoKey {
 protected:
   __u16 type;
   utime_t created;
-  bufferptr secret;
+  bufferptr secret;   // must set this via set_secret()!
 
-  // cache a pointer to the handler, so we don't have to look it up
-  // for each crypto operation
-  mutable CryptoHandler *ch;
+  // cache a pointer to the implementation-specific key handler, so we
+  // don't have to create it for every crypto operation.
+  mutable ceph::shared_ptr<CryptoKeyHandler> ckh;
+
+  int _set_secret(int type, const bufferptr& s);
 
 public:
-  CryptoKey() : type(0), ch(NULL) { }
-  CryptoKey(int t, utime_t c, bufferptr& s) : type(t), created(c), secret(s), ch(NULL) { }
-
-  void encode(bufferlist& bl) const {
-    ::encode(type, bl);
-    ::encode(created, bl);
-    __u16 len = secret.length();
-    ::encode(len, bl);
-    bl.append(secret);
+  CryptoKey() : type(0) { }
+  CryptoKey(int t, utime_t c, bufferptr& s)
+    : created(c) {
+    _set_secret(t, s);
   }
-  void decode(bufferlist::iterator& bl) {
-    ::decode(type, bl);
-    ::decode(created, bl);
-    __u16 len;
-    ::decode(len, bl);
-    bl.copy(len, secret);
-    secret.c_str();   // make sure it's a single buffer!
+  ~CryptoKey() {
   }
 
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& bl);
+
   int get_type() const { return type; }
   utime_t get_created() const { return created; }
   void print(std::ostream& out) const;
 
-  int set_secret(CephContext *cct, int type, bufferptr& s);
-  bufferptr& get_secret() { return secret; }
+  int set_secret(int type, const bufferptr& s, utime_t created);
+  const bufferptr& get_secret() { return secret; }
   const bufferptr& get_secret() const { return secret; }
 
   void encode_base64(string& s) const {
@@ -94,8 +105,14 @@ public:
 
   // --
   int create(CephContext *cct, int type);
-  void encrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const;
-  void decrypt(CephContext *cct, const bufferlist& in, bufferlist& out, std::string &error) const;
+  int encrypt(CephContext *cct, const bufferlist& in, bufferlist& out,
+	       std::string *error) const {
+    return ckh->encrypt(in, out, error);
+  }
+  int decrypt(CephContext *cct, const bufferlist& in, bufferlist& out,
+	       std::string *error) const {
+    return ckh->decrypt(in, out, error);
+  }
 
   void to_str(std::string& s) const;
 };
@@ -119,44 +136,14 @@ public:
   virtual ~CryptoHandler() {}
   virtual int get_type() const = 0;
   virtual int create(bufferptr& secret) = 0;
-  virtual int validate_secret(bufferptr& secret) = 0;
-  virtual void encrypt(const bufferptr& secret, const bufferlist& in,
-		      bufferlist& out, std::string &error) const = 0;
-  virtual void decrypt(const bufferptr& secret, const bufferlist& in,
-		      bufferlist& out, std::string &error) const = 0;
+  virtual int validate_secret(const bufferptr& secret) = 0;
+  virtual CryptoKeyHandler *get_key_handler(const bufferptr& secret,
+					    string& error) = 0;
+
+  static CryptoHandler *create(int type);
 };
 
 extern int get_random_bytes(char *buf, int len);
 extern uint64_t get_random(uint64_t min_val, uint64_t max_val);
 
-class CryptoNone : public CryptoHandler {
-public:
-  CryptoNone() { }
-  ~CryptoNone() {}
-  int get_type() const {
-    return CEPH_CRYPTO_NONE;
-  }
-  int create(bufferptr& secret);
-  int validate_secret(bufferptr& secret);
-  void encrypt(const bufferptr& secret, const bufferlist& in,
-	      bufferlist& out, std::string &error) const;
-  void decrypt(const bufferptr& secret, const bufferlist& in,
-	      bufferlist& out, std::string &error) const;
-};
-
-class CryptoAES : public CryptoHandler {
-public:
-  CryptoAES() { }
-  ~CryptoAES() {}
-  int get_type() const {
-    return CEPH_CRYPTO_AES;
-  }
-  int create(bufferptr& secret);
-  int validate_secret(bufferptr& secret);
-  void encrypt(const bufferptr& secret, const bufferlist& in,
-	       bufferlist& out, std::string &error) const;
-  void decrypt(const bufferptr& secret, const bufferlist& in, 
-	      bufferlist& out, std::string &error) const;
-};
-
 #endif
diff --git a/src/auth/KeyRing.cc b/src/auth/KeyRing.cc
index 7aeb9e8..f579323 100644
--- a/src/auth/KeyRing.cc
+++ b/src/auth/KeyRing.cc
@@ -33,21 +33,22 @@
 #undef dout_prefix
 #define dout_prefix *_dout << "auth: "
 
-using std::auto_ptr;
 using namespace std;
 
 int KeyRing::from_ceph_context(CephContext *cct)
 {
   const md_config_t *conf = cct->_conf;
-
-  int ret = -ENOENT;
   string filename;
 
-  if (ceph_resolve_file_search(conf->keyring, filename)) {
+  int ret = ceph_resolve_file_search(conf->keyring, filename);
+  if (!ret) {
     ret = load(cct, filename);
     if (ret < 0)
       lderr(cct) << "failed to load " << filename
 		 << ": " << cpp_strerror(ret) << dendl;
+  } else {
+    lderr(cct) << "unable to find a keyring on " << conf->keyring
+	       << ": " << cpp_strerror(ret) << dendl;
   }
 
   if (!conf->key.empty()) {
diff --git a/src/auth/cephx/CephxKeyServer.cc b/src/auth/cephx/CephxKeyServer.cc
index b2c0c67..81c0a66 100644
--- a/src/auth/cephx/CephxKeyServer.cc
+++ b/src/auth/cephx/CephxKeyServer.cc
@@ -268,7 +268,7 @@ bool KeyServer::generate_secret(CryptoKey& secret)
   if (crypto->create(bp) < 0)
     return false;
 
-  secret.set_secret(cct, CEPH_CRYPTO_AES, bp);
+  secret.set_secret(CEPH_CRYPTO_AES, bp, ceph_clock_now(NULL));
 
   return true;
 }
diff --git a/src/auth/cephx/CephxProtocol.cc b/src/auth/cephx/CephxProtocol.cc
index f57f063..f2a00dd 100644
--- a/src/auth/cephx/CephxProtocol.cc
+++ b/src/auth/cephx/CephxProtocol.cc
@@ -25,14 +25,13 @@
 
 
 void cephx_calc_client_server_challenge(CephContext *cct, CryptoKey& secret, uint64_t server_challenge, 
-		  uint64_t client_challenge, uint64_t *key, std::string &ret)
+		  uint64_t client_challenge, uint64_t *key, std::string &error)
 {
   CephXChallengeBlob b;
   b.server_challenge = server_challenge;
   b.client_challenge = client_challenge;
 
   bufferlist enc;
-  std::string error;
   if (encode_encrypt(cct, b, secret, enc, error))
     return;
 
diff --git a/src/auth/cephx/CephxProtocol.h b/src/auth/cephx/CephxProtocol.h
index d72a23d..f08f07d 100644
--- a/src/auth/cephx/CephxProtocol.h
+++ b/src/auth/cephx/CephxProtocol.h
@@ -433,8 +433,7 @@ void decode_decrypt_enc_bl(CephContext *cct, T& t, CryptoKey key, bufferlist& bl
   uint64_t magic;
   bufferlist bl;
 
-  key.decrypt(cct, bl_enc, bl, error);
-  if (!error.empty())
+  if (key.decrypt(cct, bl_enc, bl, &error) < 0)
     return;
 
   bufferlist::iterator iter2 = bl.begin();
@@ -462,7 +461,7 @@ void encode_encrypt_enc_bl(CephContext *cct, const T& t, const CryptoKey& key,
   ::encode(magic, bl);
   ::encode(t, bl);
 
-  key.encrypt(cct, bl, out, error);
+  key.encrypt(cct, bl, out, &error);
 }
 
 template <typename T>
diff --git a/src/auth/cephx/CephxServiceHandler.cc b/src/auth/cephx/CephxServiceHandler.cc
index c5d91d9..d65ac79 100644
--- a/src/auth/cephx/CephxServiceHandler.cc
+++ b/src/auth/cephx/CephxServiceHandler.cc
@@ -97,7 +97,7 @@ int CephxServiceHandler::handle_request(bufferlist::iterator& indata, bufferlist
       bool should_enc_ticket = false;
 
       EntityAuth eauth;
-      if (key_server->get_auth(entity_name, eauth) < 0) {
+      if (! key_server->get_auth(entity_name, eauth)) {
 	ret = -EPERM;
 	break;
       }
diff --git a/src/auth/cephx/CephxSessionHandler.cc b/src/auth/cephx/CephxSessionHandler.cc
index b2d402d..eaebd15 100644
--- a/src/auth/cephx/CephxSessionHandler.cc
+++ b/src/auth/cephx/CephxSessionHandler.cc
@@ -24,47 +24,65 @@
 
 #define dout_subsys ceph_subsys_auth
 
+int CephxSessionHandler::_calc_signature(Message *m, uint64_t *psig)
+{
+  const ceph_msg_header& header = m->get_header();
+  const ceph_msg_footer& footer = m->get_footer();
+
+  // optimized signature calculation
+  // - avoid temporary allocated buffers from encode_encrypt[_enc_bl]
+  // - skip the leading 4 byte wrapper from encode_encrypt
+  struct {
+    __u8 v;
+    __le64 magic;
+    __le32 len;
+    __le32 header_crc;
+    __le32 front_crc;
+    __le32 middle_crc;
+    __le32 data_crc;
+  } __attribute__ ((packed)) sigblock = {
+    1, AUTH_ENC_MAGIC, 4*4,
+    header.crc, footer.front_crc, footer.middle_crc, footer.data_crc
+  };
+  bufferlist bl_plaintext;
+  bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock));
+
+  bufferlist bl_ciphertext;
+  if (key.encrypt(cct, bl_plaintext, bl_ciphertext, NULL) < 0) {
+    lderr(cct) << __func__ << " failed to encrypt signature block" << dendl;
+    return -1;
+  }
+
+  bufferlist::iterator ci = bl_ciphertext.begin();
+  ::decode(*psig, ci);
+
+  ldout(cct, 10) << __func__ << " seq " << m->get_seq()
+		 << " front_crc_ = " << footer.front_crc
+		 << " middle_crc = " << footer.middle_crc
+		 << " data_crc = " << footer.data_crc
+		 << " sig = " << *psig
+		 << dendl;
+  return 0;
+}
+
 int CephxSessionHandler::sign_message(Message *m)
 {
   // If runtime signing option is off, just return success without signing.
   if (!cct->_conf->cephx_sign_messages) {
     return 0;
   }
-  bufferlist bl_plaintext, bl_encrypted;
-  ceph_msg_header header = m->get_header();
-  std::string error;
-
-  ceph_msg_footer& en_footer = m->get_footer();
-
-  ::encode(header.crc, bl_plaintext);
-  ::encode(en_footer.front_crc, bl_plaintext);
-  ::encode(en_footer.middle_crc, bl_plaintext);
-  ::encode(en_footer.data_crc, bl_plaintext);
-
-  ldout(cct, 10) <<  "sign_message: seq # " << header.seq << " CRCs are: header " << header.crc
-		 << " front " << en_footer.front_crc << " middle " << en_footer.middle_crc
-		 << " data " << en_footer.data_crc << dendl;
-
-  if (encode_encrypt(cct, bl_plaintext, key, bl_encrypted, error)) {
-    ldout(cct, 0) << "error encrypting message signature: " << error << dendl;
-    ldout(cct, 0) << "no signature put on message" << dendl;
-    return SESSION_SIGNATURE_FAILURE;
-  } 
-
-  bufferlist::iterator ci = bl_encrypted.begin();
-  // Skip the magic number up front. PLR
-  ci.advance(4);
-  ::decode(en_footer.sig, ci);
-
-  // There's potentially an issue with whether the encoding and decoding done here will work
-  // properly when a big endian and little endian machine are talking.  We think it's OK,
-  // but it should be tested to be sure.  PLR
-
-  // Receiver won't trust this flag to decide if msg should have been signed.  It's primarily
-  // to debug problems where sender and receiver disagree on need to sign msg.  PLR
-  en_footer.flags = (unsigned)en_footer.flags | CEPH_MSG_FOOTER_SIGNED;
+
+  uint64_t sig;
+  int r = _calc_signature(m, &sig);
+  if (r < 0)
+    return r;
+
+  ceph_msg_footer& f = m->get_footer();
+  f.sig = sig;
+  f.flags = (unsigned)f.flags | CEPH_MSG_FOOTER_SIGNED;
   messages_signed++;
-  ldout(cct, 20) << "Putting signature in client message(seq # " << header.seq << "): sig = " << en_footer.sig << dendl;
+  ldout(cct, 20) << "Putting signature in client message(seq # " << m->get_seq()
+		 << "): sig = " << sig << dendl;
   return 0;
 }
 
@@ -74,57 +92,34 @@ int CephxSessionHandler::check_message_signature(Message *m)
   if (!cct->_conf->cephx_sign_messages) {
     return 0;
   }
-
-  bufferlist bl_plaintext, bl_ciphertext;
-  std::string sig_error;
-  ceph_msg_header& header = m->get_header();
-  ceph_msg_footer& footer = m->get_footer();
-
   if ((features & CEPH_FEATURE_MSG_AUTH) == 0) {
     // it's fine, we didn't negotiate this feature.
     return 0;
   }
 
-  signatures_checked++;
+  uint64_t sig;
+  int r = _calc_signature(m, &sig);
+  if (r < 0)
+    return r;
 
-  ldout(cct, 10) << "check_message_signature: seq # = " << m->get_seq() << " front_crc_ = " << footer.front_crc
-		 << " middle_crc = " << footer.middle_crc << " data_crc = " << footer.data_crc << dendl;
-  ::encode(header.crc, bl_plaintext);
-  ::encode(footer.front_crc, bl_plaintext);
-  ::encode(footer.middle_crc, bl_plaintext);
-  ::encode(footer.data_crc, bl_plaintext);
-
-  // Encrypt the buffer containing the checksums to calculate the signature. PLR
-  if (encode_encrypt(cct, bl_plaintext, key, bl_ciphertext, sig_error)) {
-    ldout(cct, 0) << "error in encryption for checking message signature: " << sig_error << dendl;
-    return (SESSION_SIGNATURE_FAILURE);
-  } 
-
-  bufferlist::iterator ci = bl_ciphertext.begin();
-  // Skip the magic number at the front. PLR
-  ci.advance(4);
-  uint64_t sig_check;
-  ::decode(sig_check, ci);
-
-  // There's potentially an issue with whether the encoding and decoding done here will work
-  // properly when a big endian and little endian machine are talking.  We think it's OK,
-  // but it should be tested to be sure.  PLR
+  signatures_checked++;
 
-  if (sig_check != footer.sig) {
+  if (sig != m->get_footer().sig) {
     // Should have been signed, but signature check failed.  PLR
-    if (!(footer.flags & CEPH_MSG_FOOTER_SIGNED)) {
-      ldout(cct, 0) << "SIGN: MSG " << header.seq << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl;
+    if (!(m->get_footer().flags & CEPH_MSG_FOOTER_SIGNED)) {
+      ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Sender did not set CEPH_MSG_FOOTER_SIGNED." << dendl;
     }
-    ldout(cct, 0) << "SIGN: MSG " << header.seq << " Message signature does not match contents." << dendl;
-    ldout(cct, 0) << "SIGN: MSG " << header.seq << "Signature on message:" << dendl;
-    ldout(cct, 0) << "SIGN: MSG " << header.seq << "    sig: " << footer.sig << dendl;
-    ldout(cct, 0) << "SIGN: MSG " << header.seq << "Locally calculated signature:" << dendl;
-    ldout(cct, 0) << "SIGN: MSG " << header.seq << "    sig_check:" << sig_check << dendl;
-
-    // For the moment, printing an error message to the log and returning failure is sufficient.
-    // In the long term, we should probably have code parsing the log looking for this kind
-    // of security failure, particularly when there are large numbers of them, since the latter
-    // is a potential sign of an attack.  PLR
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << " Message signature does not match contents." << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "Signature on message:" << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "    sig: " << m->get_footer().sig << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "Locally calculated signature:" << dendl;
+    ldout(cct, 0) << "SIGN: MSG " << m->get_seq() << "    sig_check:" << sig << dendl;
+
+    // For the moment, printing an error message to the log and
+    // returning failure is sufficient.  In the long term, we should
+    // probably have code parsing the log looking for this kind of
+    // security failure, particularly when there are large numbers of
+    // them, since the latter is a potential sign of an attack.  PLR
 
     signatures_failed++;
     ldout(cct, 0) << "Signature failed." << dendl;
diff --git a/src/auth/cephx/CephxSessionHandler.h b/src/auth/cephx/CephxSessionHandler.h
index 52a112e..7b46e07 100644
--- a/src/auth/cephx/CephxSessionHandler.h
+++ b/src/auth/cephx/CephxSessionHandler.h
@@ -31,8 +31,9 @@ public:
     return false;
   }
 
-  int sign_message(Message *m);
+  int _calc_signature(Message *m, uint64_t *psig);
 
+  int sign_message(Message *m);
   int check_message_signature(Message *m) ;
 
   // Cephx does not currently encrypt messages, so just return 0 if called.  PLR
diff --git a/src/ceph-create-keys b/src/ceph-create-keys
index 57eaf17..f9cc219 100755
--- a/src/ceph-create-keys
+++ b/src/ceph-create-keys
@@ -7,12 +7,28 @@ import os
 import subprocess
 import sys
 import time
+import pwd
+import grp
 
 
 LOG = logging.getLogger(os.path.basename(sys.argv[0]))
 
 QUORUM_STATES = ['leader', 'peon']
 
+def get_ceph_uid():
+    try:
+        uid = pwd.getpwnam('ceph').pw_uid
+    except:
+        uid = -1
+    return uid
+
+def get_ceph_gid():
+    try:
+        gid = grp.getgrnam('ceph').gr_gid
+    except:
+        gid = -1
+    return gid
+
 def wait_for_quorum(cluster, mon_id):
     while True:
         p = subprocess.Popen(
@@ -68,10 +84,13 @@ def get_key(cluster, mon_id):
     pathdir = os.path.dirname(path)
     if not os.path.exists(pathdir):
         os.makedirs(pathdir)
+        os.chmod(pathdir, 0770)
+        os.chown(pathdir, get_ceph_uid(), get_ceph_gid())
     while True:
         try:
             with file(tmp, 'w') as f:
                 os.fchmod(f.fileno(), 0600)
+                os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid())
                 LOG.info('Talking to monitor...')
                 returncode = subprocess.call(
                     args=[
@@ -87,7 +106,7 @@ def get_key(cluster, mon_id):
                         'client.admin',
                         'mon', 'allow *',
                         'osd', 'allow *',
-                        'mds', 'allow',
+                        'mds', 'allow *',
                         ],
                     stdout=f,
                     )
@@ -137,11 +156,14 @@ def bootstrap_key(cluster, type_):
     pathdir = os.path.dirname(path)
     if not os.path.exists(pathdir):
         os.makedirs(pathdir)
+        os.chmod(pathdir, 0770)
+        os.chown(pathdir, get_ceph_uid(), get_ceph_gid())
 
     while True:
         try:
             with file(tmp, 'w') as f:
                 os.fchmod(f.fileno(), 0600)
+                os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid())
                 LOG.info('Talking to monitor...')
                 returncode = subprocess.call(
                     args=args,
diff --git a/src/ceph-detect-init/AUTHORS.rst b/src/ceph-detect-init/AUTHORS.rst
new file mode 100644
index 0000000..3818d35
--- /dev/null
+++ b/src/ceph-detect-init/AUTHORS.rst
@@ -0,0 +1,2 @@
+- Owen Synge <osynge at suse.com>
+- Loic Dachary <loic at dachary.org>
diff --git a/src/ceph-detect-init/MANIFEST.in b/src/ceph-detect-init/MANIFEST.in
new file mode 100644
index 0000000..23abe0d
--- /dev/null
+++ b/src/ceph-detect-init/MANIFEST.in
@@ -0,0 +1 @@
+include AUTHORS.rst
diff --git a/src/ceph-detect-init/Makefile.am b/src/ceph-detect-init/Makefile.am
new file mode 100644
index 0000000..a2c885a
--- /dev/null
+++ b/src/ceph-detect-init/Makefile.am
@@ -0,0 +1,72 @@
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Owen Synge <osynge at suse.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see `<http://www.gnu.org/licenses/>`.
+#
+check_SCRIPTS += ceph-detect-init/run-tox.sh
+
+EXTRA_DIST += \
+	ceph-detect-init/AUTHORS.rst \
+	ceph-detect-init/ceph_detect_init/centos/__init__.py \
+	ceph-detect-init/ceph_detect_init/exc.py \
+	ceph-detect-init/ceph_detect_init/main.py \
+	ceph-detect-init/ceph_detect_init/__init__.py \
+	ceph-detect-init/ceph_detect_init/rhel/__init__.py \
+	ceph-detect-init/ceph_detect_init/fedora/__init__.py \
+	ceph-detect-init/ceph_detect_init/debian/__init__.py \
+	ceph-detect-init/ceph_detect_init/suse/__init__.py \
+	ceph-detect-init/integration/centos-6.dockerfile \
+	ceph-detect-init/integration/debian-wheezy.dockerfile \
+	ceph-detect-init/integration/debian-sid.dockerfile \
+	ceph-detect-init/integration/debian-jessie.dockerfile \
+	ceph-detect-init/integration/opensuse-13.1.dockerfile \
+	ceph-detect-init/integration/fedora-21.dockerfile \
+	ceph-detect-init/integration/ubuntu-14.04.dockerfile \
+	ceph-detect-init/integration/test_main.py \
+	ceph-detect-init/integration/opensuse-13.2.dockerfile \
+	ceph-detect-init/integration/ubuntu-12.04.dockerfile \
+	ceph-detect-init/integration/centos-7.dockerfile \
+	ceph-detect-init/integration/ubuntu-15.04.dockerfile \
+	ceph-detect-init/integration/debian-squeeze.dockerfile \
+	ceph-detect-init/Makefile.am \
+	ceph-detect-init/MANIFEST.in \
+	ceph-detect-init/README.rst \
+	ceph-detect-init/requirements.txt \
+	ceph-detect-init/run-tox.sh \
+	ceph-detect-init/setup.py \
+	ceph-detect-init/test-requirements.txt \
+	ceph-detect-init/tests/test_all.py \
+	ceph-detect-init/tox.ini
+
+all-local::
+	cd $(srcdir)/ceph-detect-init ; python setup.py build
+
+clean-local::
+	cd $(srcdir)/ceph-detect-init ; python setup.py clean ; rm -fr wheelhouse .tox build .coverage *.egg-info
+
+install-data-local::
+	cd $(srcdir)/ceph-detect-init ; \
+	if test "$(DESTDIR)" ; then \
+		if lsb_release -si | grep --quiet 'Ubuntu\|Debian\|Devuan' ; then \
+			options=--install-layout=deb ; \
+		else \
+			options=--prefix=/usr ; \
+		fi ; \
+		root="--root=$(DESTDIR)" ; \
+	fi ; \
+	python setup.py install $$root $$options
diff --git a/src/ceph-detect-init/README.rst b/src/ceph-detect-init/README.rst
new file mode 100644
index 0000000..e40f22f
--- /dev/null
+++ b/src/ceph-detect-init/README.rst
@@ -0,0 +1,28 @@
+ceph-detect-init
+================
+
+ceph-detect-init is a command line tool that displays a normalized
+string describing the init system of the host on which it is running:
+
+Home page : https://pypi.python.org/pypi/ceph-detect-init
+
+Hacking
+=======
+
+* Get the code : git clone https://git.ceph.com/ceph.git
+* Run the unit tests : tox
+* Run the integration tests (requires docker) : tox -e integration
+* Check the documentation : rst2html < README.rst > /tmp/a.html
+* Prepare a new version
+
+ - version=1.0.0 ; perl -pi -e "s/^version.*/version='$version',/" setup.py ; do python setup.py sdist ; amend=$(git log -1 --oneline | grep --quiet "version $version" && echo --amend) ; git commit $amend -m "version $version" setup.py ; git tag -a -f -m "version $version" $version ; done
+
+* Publish a new version
+
+ - python setup.py sdist upload --sign
+ - git push ; git push --tags
+
+* pypi maintenance
+
+ - python setup.py register # if the project does not yet exist
+ - trim old versions at https://pypi.python.org/pypi/ceph-detect-init
diff --git a/src/ceph-detect-init/ceph_detect_init/__init__.py b/src/ceph-detect-init/ceph_detect_init/__init__.py
new file mode 100644
index 0000000..cc9b2c0
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/__init__.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Alfredo Deza <adeza at redhat.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+from ceph_detect_init import centos
+from ceph_detect_init import debian
+from ceph_detect_init import exc
+from ceph_detect_init import fedora
+from ceph_detect_init import rhel
+from ceph_detect_init import suse
+import logging
+import platform
+
+
+def get(use_rhceph=False):
+    distro_name, release, codename = platform_information()
+    if not codename or not _get_distro(distro_name):
+        raise exc.UnsupportedPlatform(
+            distro=distro_name,
+            codename=codename,
+            release=release)
+
+    module = _get_distro(distro_name, use_rhceph=use_rhceph)
+    module.name = distro_name
+    module.normalized_name = _normalized_distro_name(distro_name)
+    module.distro = module.normalized_name
+    module.is_el = module.normalized_name in ['redhat', 'centos',
+                                              'fedora', 'scientific']
+    module.release = release
+    module.codename = codename
+    module.init = module.choose_init()
+    return module
+
+
+def _get_distro(distro, use_rhceph=False):
+    if not distro:
+        return
+
+    distro = _normalized_distro_name(distro)
+    distributions = {
+        'debian': debian,
+        'ubuntu': debian,
+        'linuxmint': debian,
+        'centos': centos,
+        'scientific': centos,
+        'redhat': centos,
+        'fedora': fedora,
+        'suse': suse,
+    }
+
+    if distro == 'redhat' and use_rhceph:
+        return rhel
+    else:
+        return distributions.get(distro)
+
+
+def _normalized_distro_name(distro):
+    distro = distro.lower()
+    if distro.startswith(('redhat', 'red hat')):
+        return 'redhat'
+    elif distro.startswith(('scientific', 'scientific linux')):
+        return 'scientific'
+    elif distro.startswith(('suse', 'opensuse')):
+        return 'suse'
+    elif distro.startswith('centos'):
+        return 'centos'
+    return distro
+
+
+def platform_information():
+    """detect platform information from remote host."""
+    logging.debug('platform_information: linux_distribution = ' +
+                  str(platform.linux_distribution()))
+    distro, release, codename = platform.linux_distribution()
+    # this could be an empty string in Debian
+    if not codename and 'debian' in distro.lower():
+        debian_codenames = {
+            '8': 'jessie',
+            '7': 'wheezy',
+            '6': 'squeeze',
+        }
+        major_version = release.split('.')[0]
+        codename = debian_codenames.get(major_version, '')
+
+        # In order to support newer jessie/sid or wheezy/sid strings
+        # we test this if sid is buried in the minor, we should use
+        # sid anyway.
+        if not codename and '/' in release:
+            major, minor = release.split('/')
+            if minor == 'sid':
+                codename = minor
+            else:
+                codename = major
+
+    return (
+        str(distro).rstrip(),
+        str(release).rstrip(),
+        str(codename).rstrip()
+    )
diff --git a/src/ceph-detect-init/ceph_detect_init/centos/__init__.py b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py
new file mode 100644
index 0000000..b9738a7
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/centos/__init__.py
@@ -0,0 +1,13 @@
+distro = None
+release = None
+codename = None
+
+
+def choose_init():
+    """Select a init system
+
+    Returns the name of a init system (upstart, sysvinit ...).
+    """
+    if release and int(release.split('.')[0]) >= 7:
+        return 'systemd'
+    return 'sysvinit'
diff --git a/src/ceph-detect-init/ceph_detect_init/exc.py b/src/ceph-detect-init/ceph_detect_init/exc.py
new file mode 100644
index 0000000..61d9752
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/exc.py
@@ -0,0 +1,35 @@
+#
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Alfredo Deza <adeza at redhat.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see `<http://www.gnu.org/licenses/>`.
+#
+
+
+class UnsupportedPlatform(Exception):
+    """Platform is not supported."""
+    def __init__(self, distro, codename, release):
+        self.distro = distro
+        self.codename = codename
+        self.release = release
+
+    def __str__(self):
+        return '{doc}: {distro} {codename} {release}'.format(
+            doc=self.__doc__.strip(),
+            distro=self.distro,
+            codename=self.codename,
+            release=self.release,
+        )
diff --git a/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py b/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py
new file mode 100644
index 0000000..566f8e3
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/fedora/__init__.py
@@ -0,0 +1,13 @@
+distro = None
+release = None
+codename = None
+
+
+def choose_init():
+    """Select a init system
+
+    Returns the name of a init system (upstart, sysvinit ...).
+    """
+    if release and int(release.split('.')[0]) >= 22:
+        return 'systemd'
+    return 'sysvinit'
diff --git a/src/ceph-detect-init/ceph_detect_init/main.py b/src/ceph-detect-init/ceph_detect_init/main.py
new file mode 100644
index 0000000..320ae17
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/main.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2015 <contact at redhat.com>
+# Copyright (C) 2015 SUSE LINUX GmbH
+#
+# Author: Alfredo Deza <alfredo.deza at inktank.com>
+# Author: Owen Synge <osynge at suse.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+import argparse
+import logging
+
+import ceph_detect_init
+from ceph_detect_init import exc
+
+
+def parser():
+    parser = argparse.ArgumentParser(
+        'ceph-detect-init',
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        default=None,
+    )
+    parser.add_argument(
+        "--use-rhceph",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "--default",
+        default=None,
+    )
+    return parser
+
+
+def run(argv=None, namespace=None):
+    args = parser().parse_args(argv, namespace)
+
+    if args.verbose:
+        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
+                            level=logging.DEBUG)
+    try:
+        print(ceph_detect_init.get(args.use_rhceph).init)
+    except exc.UnsupportedPlatform:
+        if args.default:
+            print(args.default)
+        else:
+            raise
+
+    return 0
diff --git a/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py b/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py
new file mode 100644
index 0000000..b9738a7
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/rhel/__init__.py
@@ -0,0 +1,13 @@
+distro = None
+release = None
+codename = None
+
+
+def choose_init():
+    """Select a init system
+
+    Returns the name of a init system (upstart, sysvinit ...).
+    """
+    if release and int(release.split('.')[0]) >= 7:
+        return 'systemd'
+    return 'sysvinit'
diff --git a/src/ceph-detect-init/ceph_detect_init/suse/__init__.py b/src/ceph-detect-init/ceph_detect_init/suse/__init__.py
new file mode 100644
index 0000000..69bf7c4
--- /dev/null
+++ b/src/ceph-detect-init/ceph_detect_init/suse/__init__.py
@@ -0,0 +1,17 @@
+distro = None
+release = None
+codename = None
+
+
+def choose_init():
+    """Select a init system
+
+    Returns the name of a init system (upstart, sysvinit ...).
+    """
+    init_mapping = {
+        '11': 'sysvinit',   # SLE_11
+        '12': 'systemd',    # SLE_12
+        '13.1': 'systemd',  # openSUSE_13.1
+        '13.2': 'systemd',  # openSUSE_13.2
+    }
+    return init_mapping.get(release, 'sysvinit')
diff --git a/src/ceph-detect-init/integration/centos-6.dockerfile b/src/ceph-detect-init/integration/centos-6.dockerfile
new file mode 100644
index 0000000..7cb5095
--- /dev/null
+++ b/src/ceph-detect-init/integration/centos-6.dockerfile
@@ -0,0 +1,4 @@
+FROM centos:6
+
+RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/6/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 && rm /etc/yum.repos.d/dl.fedoraproject.org*
+RUN yum install -y python-pip python-virtualenv git
diff --git a/src/ceph-detect-init/integration/centos-7.dockerfile b/src/ceph-detect-init/integration/centos-7.dockerfile
new file mode 100644
index 0000000..59a5748
--- /dev/null
+++ b/src/ceph-detect-init/integration/centos-7.dockerfile
@@ -0,0 +1,4 @@
+FROM centos:7
+
+RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/7/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7 && rm /etc/yum.repos.d/dl.fedoraproject.org*
+RUN yum install -y python-pip python-virtualenv git
diff --git a/src/ceph-detect-init/integration/debian-jessie.dockerfile b/src/ceph-detect-init/integration/debian-jessie.dockerfile
new file mode 100644
index 0000000..bca22d5
--- /dev/null
+++ b/src/ceph-detect-init/integration/debian-jessie.dockerfile
@@ -0,0 +1,6 @@
+FROM debian:jessie
+
+RUN apt-get update
+RUN apt-get install -y python-virtualenv python-pip git
+
+
diff --git a/src/ceph-detect-init/integration/debian-sid.dockerfile b/src/ceph-detect-init/integration/debian-sid.dockerfile
new file mode 100644
index 0000000..00e4472
--- /dev/null
+++ b/src/ceph-detect-init/integration/debian-sid.dockerfile
@@ -0,0 +1,4 @@
+FROM debian:sid
+
+RUN apt-get update
+RUN apt-get install -y python-virtualenv python-pip git
diff --git a/src/ceph-detect-init/integration/debian-squeeze.dockerfile b/src/ceph-detect-init/integration/debian-squeeze.dockerfile
new file mode 100644
index 0000000..e5080f6
--- /dev/null
+++ b/src/ceph-detect-init/integration/debian-squeeze.dockerfile
@@ -0,0 +1,4 @@
+FROM debian:squeeze
+
+RUN apt-get update
+RUN apt-get install -y python-virtualenv python-pip git
diff --git a/src/ceph-detect-init/integration/debian-wheezy.dockerfile b/src/ceph-detect-init/integration/debian-wheezy.dockerfile
new file mode 100644
index 0000000..e03e30e
--- /dev/null
+++ b/src/ceph-detect-init/integration/debian-wheezy.dockerfile
@@ -0,0 +1,4 @@
+FROM debian:wheezy
+
+RUN apt-get update
+RUN apt-get install -y python-virtualenv python-pip git
diff --git a/src/ceph-detect-init/integration/fedora-21.dockerfile b/src/ceph-detect-init/integration/fedora-21.dockerfile
new file mode 100644
index 0000000..ee2ac93
--- /dev/null
+++ b/src/ceph-detect-init/integration/fedora-21.dockerfile
@@ -0,0 +1,3 @@
+FROM fedora:21
+
+RUN yum install -y python-pip python-virtualenv git
diff --git a/src/ceph-detect-init/integration/opensuse-13.1.dockerfile b/src/ceph-detect-init/integration/opensuse-13.1.dockerfile
new file mode 100644
index 0000000..00a5a28
--- /dev/null
+++ b/src/ceph-detect-init/integration/opensuse-13.1.dockerfile
@@ -0,0 +1,3 @@
+FROM opensuse:13.1
+
+RUN zypper --non-interactive --gpg-auto-import-keys install lsb python-pip python-virtualenv git
diff --git a/src/ceph-detect-init/integration/opensuse-13.2.dockerfile b/src/ceph-detect-init/integration/opensuse-13.2.dockerfile
new file mode 100644
index 0000000..26f591b
--- /dev/null
+++ b/src/ceph-detect-init/integration/opensuse-13.2.dockerfile
@@ -0,0 +1,3 @@
+FROM opensuse:13.2
+
+RUN zypper --non-interactive --gpg-auto-import-keys install python-pip python-virtualenv git
diff --git a/src/ceph-detect-init/integration/test_main.py b/src/ceph-detect-init/integration/test_main.py
new file mode 100644
index 0000000..e7a620e
--- /dev/null
+++ b/src/ceph-detect-init/integration/test_main.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Owen Synge <osynge at suse.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+import logging
+import shutil
+import subprocess
+import testtools
+
+from ceph_detect_init import main
+
+logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
+                    level=logging.DEBUG)
+
+
+def run(os):
+    name = 'ceph-detect-init-' + os
+    shutil.rmtree(name, ignore_errors=True)
+    script = """\
+docker build -t {name} --file integration/{os}.dockerfile .
+toplevel=$(git rev-parse --show-toplevel)
+mkdir {name}
+cat > {name}/try.sh <<EOF
+  virtualenv {name}
+  . {name}/bin/activate
+  pip install -r requirements.txt
+  python setup.py install
+  ceph-detect-init > {name}/init
+EOF
+
+docker run -v $toplevel:$toplevel -w $(pwd) --user $(id -u) {name} bash -x {name}/try.sh
+""".format(name=name,
+           os=os)
+    subprocess.check_call(script, shell=True)
+    init = open(name + '/init').read().strip()
+    shutil.rmtree(name)
+    return init
+    
+
+class TestCephDetectInit(testtools.TestCase):
+
+    def test_centos_6(self):
+        self.assertEqual('sysvinit', run('centos-6'))
+
+    def test_centos_7(self):
+        self.assertEqual('sysvinit', run('centos-7'))
+
+    def test_ubuntu_12_04(self):
+        self.assertEqual('upstart', run('ubuntu-12.04'))
+
+    def test_ubuntu_14_04(self):
+        self.assertEqual('upstart', run('ubuntu-14.04'))
+
+    def test_ubuntu_15_04(self):
+        self.assertEqual('upstart', run('ubuntu-15.04'))
+
+    def test_debian_squeeze(self):
+        self.assertEqual('sysvinit', run('debian-squeeze'))
+
+    def test_debian_wheezy(self):
+        self.assertEqual('sysvinit', run('debian-wheezy'))
+
+    def test_debian_jessie(self):
+        self.assertEqual('sysvinit', run('debian-jessie'))
+
+    def test_debian_sid(self):
+        self.assertEqual('sysvinit', run('debian-sid'))
+
+    def test_fedora_21(self):
+        self.assertEqual('sysvinit', run('fedora-21'))
+
+    def test_opensuse_13_1(self):
+        self.assertEqual('systemd', run('opensuse-13.1'))
+
+    def test_opensuse_13_2(self):
+        self.assertEqual('systemd', run('opensuse-13.2'))
+
+# Local Variables:
+# compile-command: "cd .. ; .tox/py27/bin/py.test integration/test_main.py"
+# End:
diff --git a/src/ceph-detect-init/integration/ubuntu-12.04.dockerfile b/src/ceph-detect-init/integration/ubuntu-12.04.dockerfile
new file mode 100644
index 0000000..dda1a62
--- /dev/null
+++ b/src/ceph-detect-init/integration/ubuntu-12.04.dockerfile
@@ -0,0 +1,4 @@
+FROM ubuntu:12.04
+
+RUN apt-get update
+RUN apt-get install -y python-virtualenv python-pip git
diff --git a/src/ceph-detect-init/integration/ubuntu-14.04.dockerfile b/src/ceph-detect-init/integration/ubuntu-14.04.dockerfile
new file mode 100644
index 0000000..4f7a698
--- /dev/null
+++ b/src/ceph-detect-init/integration/ubuntu-14.04.dockerfile
@@ -0,0 +1,6 @@
+FROM ubuntu:14.04
+
+RUN apt-get update
+# http://stackoverflow.com/questions/27341064/how-do-i-fix-importerror-cannot-import-name-incompleteread
+RUN apt-get install -y python-setuptools && easy_install -U pip
+RUN apt-get install -y python-virtualenv git
diff --git a/src/ceph-detect-init/integration/ubuntu-15.04.dockerfile b/src/ceph-detect-init/integration/ubuntu-15.04.dockerfile
new file mode 100644
index 0000000..29b5776
--- /dev/null
+++ b/src/ceph-detect-init/integration/ubuntu-15.04.dockerfile
@@ -0,0 +1,4 @@
+FROM ubuntu:15.04
+
+RUN apt-get update
+RUN apt-get install -y python-pip python-virtualenv git
diff --git a/src/ceph-detect-init/requirements.txt b/src/ceph-detect-init/requirements.txt
new file mode 100644
index 0000000..1352d5e
--- /dev/null
+++ b/src/ceph-detect-init/requirements.txt
@@ -0,0 +1 @@
+argparse
diff --git a/src/ceph-detect-init/run-tox.sh b/src/ceph-detect-init/run-tox.sh
new file mode 100755
index 0000000..206938e
--- /dev/null
+++ b/src/ceph-detect-init/run-tox.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Owen Synge <osynge at suse.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+# run from the ceph-detect-init directory or from its parent
+test -d ceph-detect-init && cd ceph-detect-init
+trap "rm -fr make-check" EXIT
+virtualenv make-check
+. make-check/bin/activate
+# older versions of pip will not install wrap_console scripts
+# when using wheel packages
+pip --log make-check/log.txt install --upgrade 'pip >= 6.1'
+if test -d wheelhouse ; then
+    export NO_INDEX=--no-index
+fi
+pip --log make-check/log.txt install $NO_INDEX --use-wheel --find-links=file://$(pwd)/wheelhouse --upgrade distribute
+pip --log make-check/log.txt install $NO_INDEX --use-wheel --find-links=file://$(pwd)/wheelhouse 'tox >=1.9' 
+tox > make-check/tox.out 2>&1 
+status=$?
+grep -v InterpreterNotFound < make-check/tox.out
+exit $status
diff --git a/src/ceph-detect-init/setup.py b/src/ceph-detect-init/setup.py
new file mode 100644
index 0000000..dea9637
--- /dev/null
+++ b/src/ceph-detect-init/setup.py
@@ -0,0 +1,79 @@
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Owen Synge <osynge at suse.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see `<http://www.gnu.org/licenses/>`.
+#
+import os
+import sys
+from setuptools import setup
+from setuptools import find_packages
+
+def read(fname):
+    path = os.path.join(os.path.dirname(__file__), fname)
+    f = open(path)
+    return f.read()
+
+
+def filter_included_modules(*m):
+    modules = sum(m, [])
+    if sys.version_info[0] == 2 and sys.version_info[1] <= 6:
+        return modules
+    included_modules = set(['argparse', 'importlib', 'sysconfig'])
+    return list(set(modules) - included_modules)
+
+
+install_requires = read('requirements.txt').split()
+tests_require = read('test-requirements.txt').split()
+
+setup(
+    name='ceph-detect-init',
+    version='1.0.1',
+    packages=find_packages(),
+
+    author='Owen Synge, Loic Dachary',
+    author_email='osynge at suse.de, loic at dachary.org',
+    description='display the normalized name of the init system',
+    long_description=read('README.rst'),
+    license='LGPLv2+',
+    keywords='ceph',
+    url="https://git.ceph.com/?p=ceph.git;a=summary",
+
+    install_requires=filter_included_modules(['setuptools'],
+                                             install_requires),
+    tests_require=filter_included_modules(tests_require),
+
+    classifiers=[
+        'Environment :: Console',
+        'Intended Audience :: Information Technology',
+        'Intended Audience :: System Administrators',
+        'Operating System :: POSIX :: Linux',
+        'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 3',
+        'Topic :: Utilities',
+    ],
+
+    entry_points={
+
+        'console_scripts': [
+            'ceph-detect-init = ceph_detect_init.main:run',
+            ],
+
+        },
+    )
diff --git a/src/ceph-detect-init/test-requirements.txt b/src/ceph-detect-init/test-requirements.txt
new file mode 100644
index 0000000..5a0761c
--- /dev/null
+++ b/src/ceph-detect-init/test-requirements.txt
@@ -0,0 +1,10 @@
+coverage>=3.6
+discover
+fixtures>=0.3.14
+python-subunit
+testrepository>=0.0.17
+testtools>=0.9.32
+mock
+pytest
+tox
+flake8
diff --git a/src/ceph-detect-init/tests/test_all.py b/src/ceph-detect-init/tests/test_all.py
new file mode 100644
index 0000000..069a0ed
--- /dev/null
+++ b/src/ceph-detect-init/tests/test_all.py
@@ -0,0 +1,171 @@
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+# Copyright (C) 2015 <contact at redhat.com>
+#
+# Author: Owen Synge <osynge at suse.com>
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see `<http://www.gnu.org/licenses/>`.
+#
+import logging
+import mock
+import testtools
+
+import ceph_detect_init
+from ceph_detect_init import centos
+from ceph_detect_init import debian
+from ceph_detect_init import exc
+from ceph_detect_init import fedora
+from ceph_detect_init import main
+from ceph_detect_init import rhel
+from ceph_detect_init import suse
+
+logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
+                    level=logging.DEBUG)
+
+
+class TestCephDetectInit(testtools.TestCase):
+
+    def test_centos(self):
+        with mock.patch('ceph_detect_init.centos.release',
+                        '7.0'):
+            self.assertEqual('systemd', centos.choose_init())
+        self.assertEqual('sysvinit', centos.choose_init())
+
+    def test_debian(self):
+        with mock.patch('ceph_detect_init.debian.distro',
+                        'debian'):
+            self.assertEqual('sysvinit', debian.choose_init())
+        with mock.patch('ceph_detect_init.debian.distro',
+                        'ubuntu'):
+            self.assertEqual('upstart', debian.choose_init())
+
+    def test_fedora(self):
+        with mock.patch('ceph_detect_init.fedora.release',
+                        '22'):
+            self.assertEqual('systemd', fedora.choose_init())
+        self.assertEqual('sysvinit', fedora.choose_init())
+
+    def test_rhel(self):
+        with mock.patch('ceph_detect_init.rhel.release',
+                        '7.0'):
+            self.assertEqual('systemd', rhel.choose_init())
+        self.assertEqual('sysvinit', rhel.choose_init())
+
+    def test_suse(self):
+        with mock.patch('ceph_detect_init.suse.release',
+                        '11'):
+            self.assertEqual('sysvinit', suse.choose_init())
+        with mock.patch('ceph_detect_init.suse.release',
+                        '12'):
+            self.assertEqual('systemd', suse.choose_init())
+        with mock.patch('ceph_detect_init.suse.release',
+                        '13.1'):
+            self.assertEqual('systemd', suse.choose_init())
+        with mock.patch('ceph_detect_init.suse.release',
+                        '13.2'):
+            self.assertEqual('systemd', suse.choose_init())
+
+    def test_get(self):
+        g = ceph_detect_init.get
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('unknown', '', ''))):
+            self.assertRaises(exc.UnsupportedPlatform, g)
+            try:
+                g()
+            except exc.UnsupportedPlatform as e:
+                self.assertIn('Platform is not supported', str(e))
+
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('debian', '6.0', ''))):
+            distro = ceph_detect_init.get()
+            self.assertEqual(debian, distro)
+            self.assertEqual('debian', distro.name)
+            self.assertEqual('debian', distro.normalized_name)
+            self.assertEqual('debian', distro.distro)
+            self.assertEqual(False, distro.is_el)
+            self.assertEqual('6.0', distro.release)
+            self.assertEqual('squeeze', distro.codename)
+            self.assertEqual('sysvinit', distro.init)
+
+    def test_get_distro(self):
+        g = ceph_detect_init._get_distro
+        self.assertEqual(None, g(None))
+        self.assertEqual(debian, g('debian'))
+        self.assertEqual(debian, g('ubuntu'))
+        self.assertEqual(centos, g('centos'))
+        self.assertEqual(centos, g('scientific'))
+        self.assertEqual(fedora, g('fedora'))
+        self.assertEqual(suse, g('suse'))
+        self.assertEqual(rhel, g('redhat', use_rhceph=True))
+
+    def test_normalized_distro_name(self):
+        n = ceph_detect_init._normalized_distro_name
+        self.assertEqual('redhat', n('RedHat'))
+        self.assertEqual('redhat', n('redhat'))
+        self.assertEqual('redhat', n('Red Hat'))
+        self.assertEqual('redhat', n('red hat'))
+        self.assertEqual('scientific', n('scientific'))
+        self.assertEqual('scientific', n('Scientific'))
+        self.assertEqual('scientific', n('Scientific Linux'))
+        self.assertEqual('scientific', n('scientific linux'))
+        self.assertEqual('suse', n('SUSE'))
+        self.assertEqual('suse', n('suse'))
+        self.assertEqual('suse', n('openSUSE'))
+        self.assertEqual('suse', n('opensuse'))
+        self.assertEqual('centos', n('CentOS'))
+        self.assertEqual('centos', n('centos'))
+        self.assertEqual('debian', n('Debian'))
+        self.assertEqual('debian', n('debian'))
+        self.assertEqual('ubuntu', n('Ubuntu'))
+        self.assertEqual('ubuntu', n('ubuntu'))
+
+    def test_platform_information(self):
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('debian', '6.0', ''))):
+            self.assertEqual(('debian', '6.0', 'squeeze'),
+                             ceph_detect_init.platform_information())
+
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('debian', '7.0', ''))):
+            self.assertEqual(('debian', '7.0', 'wheezy'),
+                             ceph_detect_init.platform_information())
+
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('debian', '8.0', ''))):
+            self.assertEqual(('debian', '8.0', 'jessie'),
+                             ceph_detect_init.platform_information())
+
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('debian', 'jessie/sid', ''))):
+            self.assertEqual(('debian', 'jessie/sid', 'sid'),
+                             ceph_detect_init.platform_information())
+
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('debian', 'sid/jessie', ''))):
+            self.assertEqual(('debian', 'sid/jessie', 'sid'),
+                             ceph_detect_init.platform_information())
+
+    def test_run(self):
+        argv = ['--use-rhceph', '--verbose']
+        self.assertEqual(0, main.run(argv))
+
+        with mock.patch('platform.linux_distribution',
+                        lambda: (('unknown', '', ''))):
+            self.assertRaises(exc.UnsupportedPlatform, main.run, argv)
+            self.assertEqual(0, main.run(argv + ['--default=sysvinit']))
+
+# Local Variables:
+# compile-command: "cd .. ; .tox/py27/bin/py.test tests/test_all.py"
+# End:
diff --git a/src/ceph-detect-init/tox.ini b/src/ceph-detect-init/tox.ini
new file mode 100644
index 0000000..3da7065
--- /dev/null
+++ b/src/ceph-detect-init/tox.ini
@@ -0,0 +1,31 @@
+[tox]
+envlist = pep8,py27,py3
+skip_missing_interpreters = True
+
+[testenv]
+basepython =
+    py27: python2.7
+    py3: python3
+setenv = VIRTUAL_ENV={envdir}
+usedevelop = true
+deps =
+  {env:NO_INDEX:}
+  --use-wheel
+  --find-links=file://{toxinidir}/wheelhouse
+  -r{toxinidir}/requirements.txt
+  -r{toxinidir}/test-requirements.txt
+
+commands = coverage run --source=ceph_detect_init {envbindir}/py.test -v tests
+           coverage report --omit=*test*,*tox* --show-missing --fail-under=100
+
+[testenv:pep8]
+basepython = python2
+commands = flake8 ceph_detect_init tests
+
+[testenv:integration]
+basepython = python2
+setenv = VIRTUAL_ENV={envdir}
+deps = -r{toxinidir}/requirements.txt
+  -r{toxinidir}/test-requirements.txt
+
+commands = {envbindir}/py.test -v integration/test_main.py
diff --git a/src/ceph-disk b/src/ceph-disk
index 4a48520..3f00951 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -20,6 +20,7 @@
 import argparse
 import errno
 import fcntl
+import json
 import logging
 import os
 import os.path
@@ -32,7 +33,8 @@ import tempfile
 import uuid
 import time
 import shlex
-import stat
+import pwd
+import grp
 
 """
 Prepare:
@@ -46,6 +48,7 @@ Prepare:
  - triggered by administrator or ceph-deploy, e.g.  'ceph-disk <data disk> [journal disk]
 
 Activate:
+ - if encrypted, map the dmcrypt volume
  - mount the volume in a temp location
  - allocate an osd id (if needed)
  - remount in the correct location /var/lib/ceph/osd/$cluster-$id
@@ -78,16 +81,20 @@ knew the GPT partition type.
 CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
 
 JOURNAL_UUID =              '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
+MPATH_JOURNAL_UUID =        '45b0969e-8ae0-4982-bf9d-5a8d867af560'
 DMCRYPT_JOURNAL_UUID =      '45b0969e-9b03-4f30-b4c6-5ec00ceff106'
 DMCRYPT_LUKS_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-35865ceff106'
 OSD_UUID =                  '4fbd7e29-9d25-41b8-afd0-062c0ceff05d'
+MPATH_OSD_UUID =            '4fbd7e29-8ae0-4982-bf9d-5a8d867af560'
 DMCRYPT_OSD_UUID =          '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d'
 DMCRYPT_LUKS_OSD_UUID =     '4fbd7e29-9d25-41b8-afd0-35865ceff05d'
 TOBE_UUID =                 '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be'
+MPATH_TOBE_UUID =           '89c57f98-8ae0-4982-bf9d-5a8d867af560'
 DMCRYPT_TOBE_UUID =         '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be'
 DMCRYPT_JOURNAL_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-35865ceff2be'
 
 DEFAULT_FS_TYPE = 'xfs'
+SYSFS = '/sys'
 
 MOUNT_OPTIONS = dict(
     btrfs='noatime,user_subvol_rm_allowed',
@@ -225,6 +232,24 @@ class ExecutableNotFound(CephDiskException):
 
 ####### utils
 
+def is_systemd():
+    """
+    Detect whether systemd is running
+    """
+    with file('/proc/1/comm', 'rb') as i:
+        for line in i:
+            if 'systemd' in line:
+                return True
+    return False
+
+def is_upstart():
+    """
+    Detect whether upstart is running
+    """
+    (out, _) = command(['init', '--version'])
+    if 'upstart' in out:
+        return True
+    return False
 
 def maybe_mkdir(*a, **kw):
     """
@@ -263,7 +288,8 @@ def which(executable):
 
     for location in locations:
         executable_path = os.path.join(location, executable)
-        if os.path.exists(executable_path):
+        if (os.path.isfile(executable_path) and
+            os.access(executable_path, os.X_OK)):
             return executable_path
 
 
@@ -358,6 +384,56 @@ def platform_information():
         str(codename).strip()
     )
 
+#
+# An alternative block_path implementation would be
+#
+#   name = basename(dev)
+#   return /sys/devices/virtual/block/$name
+#
+# It is however more fragile because it relies on the fact
+# that the basename of the device the user will use always
+# matches the one the driver will use. On Ubuntu 14.04, for
+# instance, when multipath creates a partition table on
+#
+#   /dev/mapper/353333330000007d0 -> ../dm-0
+#
+# it will create partition devices named
+#
+#   /dev/mapper/353333330000007d0-part1
+#
+# which is the same device as /dev/dm-1 but not a symbolic
+# link to it:
+#
+#   ubuntu at other:~$ ls -l /dev/mapper /dev/dm-1
+#   brw-rw---- 1 root disk 252, 1 Aug 15 17:52 /dev/dm-1
+#   lrwxrwxrwx 1 root root        7 Aug 15 17:52 353333330000007d0 -> ../dm-0
+#   brw-rw---- 1 root disk 252,   1 Aug 15 17:52 353333330000007d0-part1
+#
+# Using the basename in this case fails.
+#
+def block_path(dev):
+    path = os.path.realpath(dev)
+    rdev = os.stat(path).st_rdev
+    (M, m) = (os.major(rdev), os.minor(rdev))
+    return "{sysfs}/dev/block/{M}:{m}".format(sysfs=SYSFS, M=M, m=m)
+
+def get_dm_uuid(dev):
+    uuid_path = os.path.join(block_path(dev), 'dm', 'uuid')
+    LOG.debug("get_dm_uuid " + dev + " uuid path is " + uuid_path)
+    if not os.path.exists(uuid_path):
+        return False
+    uuid = open(uuid_path, 'r').read()
+    LOG.debug("get_dm_uuid " + dev + " uuid is " + uuid)
+    return uuid
+
+def is_mpath(dev):
+    """
+    True if the path is managed by multipath
+    """
+    uuid = get_dm_uuid(dev)
+    return (uuid and
+            (re.match('part\d+-mpath-', uuid) or
+             re.match('mpath-', uuid)))
 
 def get_dev_name(path):
     """
@@ -418,6 +494,14 @@ def get_dev_size(dev, size='megabytes'):
         os.close(fd)
 
 
+def get_partition_mpath(dev, pnum):
+    part_re = "part{pnum}-mpath-".format(pnum=pnum)
+    partitions = list_partitions_mpath(dev, part_re)
+    if partitions:
+        return partitions[0]
+    else:
+        return None
+
 def get_partition_dev(dev, pnum):
     """
     get the device name for a partition
@@ -428,40 +512,66 @@ def get_partition_dev(dev, pnum):
        sda 1 -> sda1
        cciss/c0d1 1 -> cciss!c0d1p1
     """
-    name = get_dev_name(os.path.realpath(dev))
     partname = None
-    for f in os.listdir(os.path.join('/sys/block', name)):
-        if f.startswith(name) and f.endswith(str(pnum)):
-            # we want the shortest name that starts with the base name and ends with the partition number
-            if not partname or len(f) < len(partname):
-                partname = f
+    if is_mpath(dev):
+        partname = get_partition_mpath(dev, pnum)
+    else:
+        name = get_dev_name(os.path.realpath(dev))
+        for f in os.listdir(os.path.join('/sys/block', name)):
+            if f.startswith(name) and f.endswith(str(pnum)):
+                # we want the shortest name that starts with the base name and ends with the partition number
+                if not partname or len(f) < len(partname):
+                    partname = f
     if partname:
         return get_dev_path(partname)
     else:
         raise Error('partition %d for %s does not appear to exist' % (pnum, dev))
 
 
-def list_all_partitions():
+def list_all_partitions(names):
     """
     Return a list of devices and partitions
     """
+    if names:
+        names = map(lambda x: re.sub('^/dev/', '', x), names)
+    else:
+        names = os.listdir('/sys/block')
     dev_part_list = {}
-    for name in os.listdir('/sys/block'):
+    for name in names:
+        LOG.debug("list_all_partitions: " + name)
         # /dev/fd0 may hang http://tracker.ceph.com/issues/6827
         if re.match(r'^fd\d$', name):
             continue
-        if not os.path.exists(os.path.join('/sys/block', name, 'device')):
-            continue
-        dev_part_list[name] = list_partitions(name)
+        dev_part_list[name] = list_partitions(os.path.join('/dev', name))
     return dev_part_list
 
+def list_partitions(dev):
+    dev = os.path.realpath(dev)
+    if is_mpath(dev):
+        return list_partitions_mpath(dev)
+    else:
+        return list_partitions_device(dev)
+
+def list_partitions_mpath(dev, part_re="part\d+-mpath-"):
+    p = block_path(dev)
+    partitions = []
+    holders = os.path.join(p, 'holders')
+    for holder in os.listdir(holders):
+        uuid_path = os.path.join(holders, holder, 'dm', 'uuid')
+        uuid = open(uuid_path, 'r').read()
+        LOG.debug("list_partitions_mpath: " + uuid_path + " uuid = " + uuid)
+        if re.match(part_re, uuid):
+            partitions.append(holder)
+    return partitions
+
 
-def list_partitions(basename):
+def list_partitions_device(dev):
     """
     Return a list of partitions on the given device name
     """
     partitions = []
-    for name in os.listdir(os.path.join('/sys/block', basename)):
+    basename = os.path.basename(dev)
+    for name in os.listdir(block_path(dev)):
         if name.startswith(basename):
             partitions.append(name)
     return partitions
@@ -484,12 +594,32 @@ def get_partition_base(dev):
             return '/dev/' + basename
     raise Error('no parent device for partition', dev)
 
+def is_partition_mpath(dev):
+    uuid = get_dm_uuid(dev)
+    return bool(re.match('part\d+-mpath-', uuid))
+
+def partnum_mpath(dev):
+    uuid = get_dm_uuid(dev)
+    return re.findall('part(\d+)-mpath-', uuid)[0]
+
+def get_partition_base_mpath(dev):
+    slave_path = os.path.join(block_path(dev), 'slaves')
+    slaves = os.listdir(slave_path)
+    assert slaves
+    name_path = os.path.join(slave_path, slaves[0], 'dm', 'name')
+    name = open(name_path, 'r').read().strip()
+    return os.path.join('/dev/mapper', name)
+
 def is_partition(dev):
     """
     Check whether a given device path is a partition or a full disk.
     """
+    if is_mpath(dev):
+        return is_partition_mpath(dev)
+
     dev = os.path.realpath(dev)
-    if not stat.S_ISBLK(os.lstat(dev).st_mode):
+    st = os.lstat(dev)
+    if not stat.S_ISBLK(st.st_mode):
         raise Error('not a block device', dev)
 
     name = get_dev_name(dev)
@@ -497,9 +627,10 @@ def is_partition(dev):
         return False
 
     # make sure it is a partition of something else
-    for basename in os.listdir('/sys/block'):
-        if os.path.exists(os.path.join('/sys/block', basename, name)):
-            return True
+    major = os.major(st.st_rdev)
+    minor = os.minor(st.st_rdev)
+    if os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor)):
+        return True
 
     raise Error('not a disk or partition', dev)
 
@@ -528,6 +659,9 @@ def is_held(dev):
     Check if a device is held by another device (e.g., a dm-crypt mapping)
     """
     assert os.path.exists(dev)
+    if is_mpath(dev):
+        return []
+
     dev = os.path.realpath(dev)
     base = get_dev_name(dev)
 
@@ -561,8 +695,7 @@ def verify_not_in_use(dev, check_partitions=False):
         raise Error('Device %s is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
 
     if check_partitions and not is_partition(dev):
-        basename = get_dev_name(os.path.realpath(dev))
-        for partname in list_partitions(basename):
+        for partname in list_partitions(dev):
             partition = get_dev_path(partname)
             if is_mounted(partition):
                 raise Error('Device is mounted', partition)
@@ -626,9 +759,24 @@ def write_one_line(parent, name, text):
     with file(tmp, 'wb') as tmp_file:
         tmp_file.write(text + '\n')
         os.fsync(tmp_file.fileno())
+    path_set_context(tmp)
     os.rename(tmp, path)
 
 
+def init_get():
+    """
+    Get a init system using 'ceph-detect-init'
+    """
+    init = _check_output(
+        args=[
+            'ceph-detect-init',
+            '--default', 'sysvinit',
+            ],
+        )
+    init = must_be_one_line(init)
+    return init
+
+
 def check_osd_magic(path):
     """
     Check that this path has the Ceph OSD magic.
@@ -692,6 +840,32 @@ def get_osd_id(path):
         check_osd_id(osd_id)
     return osd_id
 
+def get_ceph_user():
+    try:
+        pwd.getpwnam('ceph')
+        grp.getgrnam('ceph')
+        return 'ceph'
+    except KeyError:
+        return 'root'
+
+def path_set_context(path):
+    # restore selinux context to default policy values
+    if which('restorecon'):
+        command(
+           [
+                'restorecon', '-R',
+                path,
+                ],
+           )
+
+    # if ceph user exists, set owner to ceph
+    if get_ceph_user() == 'ceph':
+        command(
+           [
+                'chown', '-R', 'ceph:ceph',
+                path,
+                 ],
+           )
 
 def _check_output(args=None, **kwargs):
     out, ret = command(args, **kwargs)
@@ -780,23 +954,36 @@ def get_fsid(cluster):
     return fsid.lower()
 
 
-def get_or_create_dmcrypt_key(
+def get_dmcrypt_key_path(
     _uuid,
     key_dir,
-    key_size,
     luks
     ):
     """
-    Get path to dmcrypt key or create a new key file.
+    Get path to dmcrypt key file.
 
-    :return: Path to the dmcrypt key file.
+    :return: Path to the dmcrypt key file, callers should check for existence.
     """
     if luks:
         path = os.path.join(key_dir, _uuid + ".luks.key")
     else:
         path = os.path.join(key_dir, _uuid)
 
-    # already have it?
+    return path
+
+
+def get_or_create_dmcrypt_key(
+    _uuid,
+    key_dir,
+    key_size,
+    luks
+    ):
+    """
+    Get path to existing dmcrypt key or create a new key file.
+
+    :return: Path to the dmcrypt key file.
+    """
+    path = get_dmcrypt_key_path(_uuid, key_dir, luks)
     if os.path.exists(path):
         return path
 
@@ -820,7 +1007,8 @@ def dmcrypt_map(
     keypath,
     _uuid,
     cryptsetup_parameters,
-    luks
+    luks,
+    format_dev=False,
     ):
     """
     Maps a device to a dmcrypt device.
@@ -857,11 +1045,14 @@ def dmcrypt_map(
 
     try:
         if luks:
-            command_check_call(luksFormat_args)
+            if format_dev:
+		    command_check_call(luksFormat_args)
             command_check_call(luksOpen_args)
         else:
             # Plain mode has no format function, nor any validation that the key is correct.
             command_check_call(create_args)
+        # set proper ownership of mapped device
+        command_check_call(['chown', 'ceph:ceph', dev])
         return dev
 
     except subprocess.CalledProcessError as e:
@@ -923,6 +1114,13 @@ def mount(
                 path,
                 ],
             )
+        if which('restorecon'):
+            command(
+                [
+                    'restorecon',
+                    path,
+                ],
+            )
     except subprocess.CalledProcessError as e:
         try:
             os.rmdir(path)
@@ -1006,34 +1204,22 @@ def get_free_partition_index(dev):
         return 1
 
 
-def update_partition(action, dev, description):
-     # try to make sure the kernel refreshes the table.  note
-     # that if this gets ebusy, we are probably racing with
-     # udev because it already updated it.. ignore failure here.
-
-     # On RHEL and CentOS distros, calling partprobe forces a reboot of the
-     # server. Since we are not resizing partitons so we rely on calling
-     # partx
-     if platform_distro().startswith(('centos', 'red', 'scientific')):
-         LOG.info('calling partx on %s device %s', description, dev)
-         LOG.info('re-reading known partitions will display errors')
-         command(
-             [
-                 'partx',
-                 action,
-                 dev,
-             ],
-         )
-
-     else:
-         LOG.debug('Calling partprobe on %s device %s', description, dev)
-         command(
-             [
-                 'partprobe',
-                 dev,
-             ],
-         )
-
+def update_partition(dev, description):
+    """
+    Must be called after modifying a partition table so the kernel
+    know about the change and fire udev events accordingly. A side
+    effect of partprobe is to remove partitions and add them again.
+    The first udevadm settle waits for ongoing udev events to
+    complete, just in case one of them rely on an existing partition
+    on dev.  The second udevadm settle guarantees to the caller that
+    all udev events related to the partition table change have been
+    processed, i.e. the 95-ceph-osd.rules actions and mode changes,
+    group changes etc. are complete.
+    """
+    LOG.debug('Calling partprobe on %s device %s', description, dev)
+    command_check_call(['udevadm', 'settle'])
+    command_check_call(['partprobe', dev])
+    command_check_call(['udevadm', 'settle'])
 
 def zap(dev):
     """
@@ -1071,7 +1257,7 @@ def zap(dev):
             ],
         )
 
-        update_partition('-d', dev, 'zapped')
+        update_partition(dev, 'zapped')
 
     except subprocess.CalledProcessError as e:
         raise Error(e)
@@ -1095,7 +1281,7 @@ def prepare_journal_dev(
                         ' and --dmcrypt specified')
         LOG.debug('Journal %s is a partition', journal)
         LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-        if get_partition_type(journal) == JOURNAL_UUID:
+        if get_partition_type(journal) in (JOURNAL_UUID, MPATH_JOURNAL_UUID):
             LOG.debug('Journal %s was previously prepared with ceph-disk. Reusing it.', journal)
             reusing_partition = True
             # Read and reuse the partition uuid from this journal's previous life.
@@ -1127,6 +1313,9 @@ def prepare_journal_dev(
 
     ptype = JOURNAL_UUID
     ptype_tobe = JOURNAL_UUID
+    if is_mpath(journal):
+        ptype = MPATH_JOURNAL_UUID
+        ptype_tobe = MPATH_JOURNAL_UUID
     if journal_dm_keypath:
         if luks:
             ptype = DMCRYPT_LUKS_JOURNAL_UUID
@@ -1188,15 +1377,7 @@ def prepare_journal_dev(
                 ]
             )
 
-        update_partition('-a', journal, 'prepared')
-
-        # wait for udev event queue to clear
-        command(
-            [
-                'udevadm',
-                'settle',
-                ],
-            )
+        update_partition(journal, 'prepared')
 
         LOG.debug('Journal is GPT partition %s', journal_symlink)
 
@@ -1377,6 +1558,10 @@ def prepare_dev(
 
     ptype_tobe = TOBE_UUID
     ptype_osd = OSD_UUID
+    if is_mpath(data):
+        ptype_tobe = MPATH_TOBE_UUID
+        ptype_osd = MPATH_OSD_UUID
+
     if osd_dm_keypath:
         ptype_tobe = DMCRYPT_TOBE_UUID
         if luks:
@@ -1404,14 +1589,7 @@ def prepare_dev(
                     data,
                 ],
             )
-            update_partition('-a', data, 'created')
-            command(
-                [
-                    # wait for udev event queue to clear
-                    'udevadm',
-                    'settle',
-                    ],
-                )
+            update_partition(data, 'created')
         except subprocess.CalledProcessError as e:
             raise Error(e)
 
@@ -1419,7 +1597,14 @@ def prepare_dev(
 
     dev = None
     if osd_dm_keypath:
-        dev = dmcrypt_map(rawdev, osd_dm_keypath, osd_uuid, cryptsetup_parameters, luks)
+        dev = dmcrypt_map(
+                rawdev=rawdev,
+                keypath=osd_dm_keypath,
+                _uuid=osd_uuid,
+                cryptsetup_parameters=cryptsetup_parameters,
+                luks=luks,
+                format_dev=True,
+                )
     else:
         dev = rawdev
 
@@ -1461,6 +1646,7 @@ def prepare_dev(
                 journal_dmcrypt=journal_dmcrypt,
                 )
         finally:
+            path_set_context(path)
             unmount(path)
     finally:
         if rawdev != dev:
@@ -1478,13 +1664,38 @@ def prepare_dev(
             )
         except subprocess.CalledProcessError as e:
             raise Error(e)
-
+        update_partition(data, 'prepared')
+        command_check_call(['udevadm', 'trigger',
+                            '--action=add',
+                            '--sysname-match',
+                            os.path.basename(rawdev)])
+
+def check_journal_reqs(args):
+    _, allows_journal = command([
+        'ceph-osd', '--check-allows-journal',
+        '-i', '0',
+        '--cluster', args.cluster,
+    ])
+    _, wants_journal = command([
+        'ceph-osd', '--check-wants-journal',
+        '-i', '0',
+        '--cluster', args.cluster,
+    ])
+    _, needs_journal = command([
+        'ceph-osd', '--check-needs-journal',
+        '-i', '0',
+        '--cluster', args.cluster,
+    ])
+    return (not allows_journal, not wants_journal, not needs_journal)
 
 def main_prepare(args):
     journal_dm_keypath = None
     osd_dm_keypath = None
 
     try:
+        # first learn what the osd allows/wants/needs
+        (allows_journal, wants_journal, needs_journal) = check_journal_reqs(args)
+
         prepare_lock.acquire()  # noqa
         if not os.path.exists(args.data):
             if args.data_dev:
@@ -1498,6 +1709,9 @@ def main_prepare(args):
         if stat.S_ISBLK(dmode):
             verify_not_in_use(args.data, True)
 
+        if args.journal and not allows_journal:
+            raise Error('journal specified but not allowed by osd backend')
+
         if args.journal and os.path.exists(args.journal):
             jmode = os.stat(args.journal).st_mode
             if stat.S_ISBLK(jmode):
@@ -1617,31 +1831,36 @@ def main_prepare(args):
             raise Error('invalid osd_dmcrypt_type parameter (must be luks or plain): ', dmcrypt_type)
 
         # colocate journal with data?
-        if stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None:
+        if wants_journal and stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None:
             LOG.info('Will colocate journal with data on %s', args.data)
             args.journal = args.data
 
-        if args.journal_uuid is None:
+        if args.journal and args.journal_uuid is None:
             args.journal_uuid = str(uuid.uuid4())
         if args.osd_uuid is None:
             args.osd_uuid = str(uuid.uuid4())
 
         # dm-crypt keys?
         if args.dmcrypt:
-            journal_dm_keypath = get_or_create_dmcrypt_key(args.journal_uuid, args.dmcrypt_key_dir, dmcrypt_keysize, luks)
+            if args.journal:
+                journal_dm_keypath = get_or_create_dmcrypt_key(args.journal_uuid, args.dmcrypt_key_dir, dmcrypt_keysize, luks)
             osd_dm_keypath = get_or_create_dmcrypt_key(args.osd_uuid, args.dmcrypt_key_dir, dmcrypt_keysize, luks)
 
         # prepare journal
-        (journal_symlink, journal_dmcrypt, journal_uuid) = prepare_journal(
-            data=args.data,
-            journal=args.journal,
-            journal_size=journal_size,
-            journal_uuid=args.journal_uuid,
-            force_file=args.journal_file,
-            force_dev=args.journal_dev,
-            journal_dm_keypath=journal_dm_keypath,
-            cryptsetup_parameters=cryptsetup_parameters,
-            luks=luks
+        journal_symlink = None
+        journal_dmcrypt = None
+        journal_uuid = None
+        if args.journal:
+            (journal_symlink, journal_dmcrypt, journal_uuid) = prepare_journal(
+                data=args.data,
+                journal=args.journal,
+                journal_size=journal_size,
+                journal_uuid=args.journal_uuid,
+                force_file=args.journal_file,
+                force_dev=args.journal_dev,
+                journal_dm_keypath=journal_dm_keypath,
+                cryptsetup_parameters=cryptsetup_parameters,
+                luks=luks
             )
 
         # prepare data
@@ -1677,9 +1896,6 @@ def main_prepare(args):
             raise Error('not a dir or block device', args.data)
         prepare_lock.release()  # noqa
 
-        if stat.S_ISBLK(dmode):
-            update_partition('-a', args.data, 'prepared')
-
     except Error as e:
         if journal_dm_keypath:
             try:
@@ -1694,7 +1910,7 @@ def main_prepare(args):
                 if e2.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
                     raise # re-raise exception if a different error occured
         prepare_lock.release()  # noqa
-        raise e
+        raise
 
 
 ###########################
@@ -1730,6 +1946,8 @@ def mkfs(
             '--osd-journal', os.path.join(path, 'journal'),
             '--osd-uuid', fsid,
             '--keyring', os.path.join(path, 'keyring'),
+            '--setuser', get_ceph_user(),
+            '--setgroup', get_ceph_user(),
             ],
         )
     # TODO ceph-osd --mkfs removes the monmap file?
@@ -1833,9 +2051,8 @@ def start_daemon(
     path = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
         cluster=cluster, osd_id=osd_id)
 
-    # upstart?
     try:
-        if os.path.exists(os.path.join(path,'upstart')):
+        if os.path.exists(os.path.join(path, 'upstart')):
             command_check_call(
                 [
                     '/sbin/initctl',
@@ -1913,8 +2130,34 @@ def mount_activate(
     dev,
     activate_key_template,
     init,
+    dmcrypt,
+    dmcrypt_key_dir,
     ):
 
+    if dmcrypt:
+            # dev corresponds to a dmcrypt cyphertext device - map it before
+            # proceeding.
+            rawdev = dev
+            ptype = get_partition_type(rawdev)
+            if ptype in [DMCRYPT_OSD_UUID]:
+                luks = False
+                cryptsetup_parameters = ['--key-size', '256']
+            elif ptype in [DMCRYPT_LUKS_OSD_UUID]:
+                luks = True
+                cryptsetup_parameters = []
+            else:
+                raise Error('activate --dmcrypt called for invalid dev %s' % (dev))
+            part_uuid = get_partition_uuid(rawdev)
+            dmcrypt_key_path = get_dmcrypt_key_path(part_uuid, dmcrypt_key_dir, luks)
+            dev = dmcrypt_map(
+                    rawdev=rawdev,
+                    keypath=dmcrypt_key_path,
+                    _uuid=part_uuid,
+                    cryptsetup_parameters=cryptsetup_parameters,
+                    luks=luks,
+                    format_dev=False,
+                    )
+
     try:
         fstype = detect_fstype(dev=dev)
     except (subprocess.CalledProcessError,
@@ -2126,11 +2369,7 @@ def activate(
             if conf_val is not None:
                 init = conf_val
             else:
-                (distro, release, codename) = platform.dist()
-                if distro == 'Ubuntu':
-                    init = 'upstart'
-                else:
-                    init = 'sysvinit'
+                init = init_get()
 
         LOG.debug('Marking with init system %s', init)
         with file(os.path.join(path, init), 'w'):
@@ -2172,10 +2411,17 @@ def main_activate(args):
     try:
         mode = os.stat(args.path).st_mode
         if stat.S_ISBLK(mode):
+            if (is_partition(args.path) and
+                get_partition_type(args.path) == MPATH_OSD_UUID and
+                not is_mpath(args.path)):
+                raise Error('%s is not a multipath block device' %
+                            args.path)
             (cluster, osd_id) = mount_activate(
                 dev=args.path,
                 activate_key_template=args.activate_key_template,
                 init=args.mark_init,
+                dmcrypt=args.dmcrypt,
+                dmcrypt_key_dir=args.dmcrypt_key_dir,
                 )
             osd_data = get_mount_point(cluster, osd_id)
 
@@ -2190,7 +2436,7 @@ def main_activate(args):
         else:
             raise Error('%s is not a directory or block device' % args.path)
 
-        if args.mark_init == 'none':
+        if (not args.no_start_daemon and args.mark_init == 'none'):
             command_check_call(
                 [
                     'ceph-osd',
@@ -2201,7 +2447,8 @@ def main_activate(args):
                 ],
             )
 
-        if args.mark_init not in (None, 'none' ):
+        if (not args.no_start_daemon and
+            args.mark_init not in (None, 'none' )):
 
             start_daemon(
                 cluster=cluster,
@@ -2222,6 +2469,12 @@ def get_journal_osd_uuid(path):
     if not stat.S_ISBLK(mode):
         raise Error('%s is not a block device' % path)
 
+    if (is_partition(path) and
+        get_partition_type(path) == MPATH_JOURNAL_UUID and
+        not is_mpath(path)):
+        raise Error('%s is not a multipath block device' %
+                    path)
+
     try:
         out = _check_output(
             args=[
@@ -2250,15 +2503,51 @@ def main_activate_journal(args):
     cluster = None
     osd_id = None
     osd_uuid = None
+    dev = None
     activate_lock.acquire()  # noqa
     try:
-        osd_uuid = get_journal_osd_uuid(args.dev)
+        if args.dmcrypt:
+            # journal dev corresponds to a dmcrypt cyphertext device - map
+            # it before proceeding.
+            rawdev = args.dev
+            ptype = get_partition_type(rawdev)
+            if ptype in [DMCRYPT_JOURNAL_UUID]:
+                luks = False
+                cryptsetup_parameters = ['--key-size', '256']
+            elif ptype in [DMCRYPT_LUKS_JOURNAL_UUID]:
+                luks = True
+                cryptsetup_parameters = []
+            else:
+                raise Error('activate-journal --dmcrypt called for invalid dev %s' % (rawdev))
+            part_uuid = get_partition_uuid(rawdev)
+            dmcrypt_key_path = get_dmcrypt_key_path(part_uuid, args.dmcrypt_key_dir, luks)
+            dev = dmcrypt_map(
+                    rawdev=rawdev,
+                    keypath=dmcrypt_key_path,
+                    _uuid=part_uuid,
+                    cryptsetup_parameters=cryptsetup_parameters,
+                    luks=luks,
+                    format_dev=False,
+                    )
+        else:
+            dev = args.dev
+
+        # FIXME: For an encrypted journal dev, does this return the cyphertext
+        # or plaintext dev uuid!? Also, if the journal is encrypted, is the data
+        # partition also always encrypted, or are mixed pairs supported!?
+        osd_uuid = get_journal_osd_uuid(dev)
         path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower())
 
+        if is_suppressed(path):
+            LOG.info('suppressed activate request on %s', path)
+            return
+
         (cluster, osd_id) = mount_activate(
             dev=path,
             activate_key_template=args.activate_key_template,
             init=args.mark_init,
+            dmcrypt=args.dmcrypt,
+            dmcrypt_key_dir=args.dmcrypt_key_dir,
             )
 
         start_daemon(
@@ -2284,20 +2573,30 @@ def main_activate_all(args):
             continue
         (tag, uuid) = name.split('.')
 
-        if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID or tag == DMCRYPT_LUKS_OSD_UUID:
+        if tag in (OSD_UUID,
+                   MPATH_OSD_UUID,
+                   DMCRYPT_OSD_UUID,
+                   DMCRYPT_LUKS_OSD_UUID):
 
             if tag == DMCRYPT_OSD_UUID or tag == DMCRYPT_LUKS_OSD_UUID:
                 path = os.path.join('/dev/mapper', uuid)
             else:
                 path = os.path.join(dir, name)
 
+            if is_suppressed(path):
+                LOG.info('suppressed activate request on %s', path)
+                continue
+
             LOG.info('Activating %s', path)
             activate_lock.acquire()  # noqa
             try:
+                # never map dmcrypt cyphertext devices
                 (cluster, osd_id) = mount_activate(
                     dev=path,
                     activate_key_template=args.activate_key_template,
                     init=args.mark_init,
+                    dmcrypt=False,
+                    dmcrypt_key_dir='',
                     )
                 start_daemon(
                     cluster=cluster,
@@ -2357,218 +2656,203 @@ def get_dev_fs(dev):
     else:
         return None
 
-
 def split_dev_base_partnum(dev):
-    if 'loop' in dev or 'cciss' in dev or 'nvme' in dev:
-        return re.match('(.*\d+)p(\d+)', dev).group(1, 2)
+    if is_mpath(dev):
+        partnum = partnum_mpath(dev)
+        base = get_partition_base_mpath(dev)
     else:
-        return re.match('(\D+)(\d+)', dev).group(1, 2)
-
+        b = block_path(dev)
+        partnum = open(os.path.join(b, 'partition')).read().strip()
+        base = get_partition_base(dev)
+    return (base, partnum)
 
 def get_partition_type(part):
-    """
-    Get the GPT partition type UUID.  If we have an old blkid and can't
-    get it that way, use sgdisk and use the description instead (and hope
-    dmcrypt isn't being used).
-    """
-    blkid, _ = command(
-        [
-            'blkid',
-            '-p',
-            '-o', 'udev',
-            part,
-        ]
-    )
-    saw_part_entry = False
-    for line in blkid.splitlines():
-        (key, value) = line.split('=')
-        if key == 'ID_PART_ENTRY_TYPE':
-            return value
-        if key == 'ID_PART_ENTRY_SCHEME':
-            table_type = value
-        if key.startswith('ID_PART_ENTRY_'):
-            saw_part_entry = True
-
-    # hmm, is it in fact GPT?
-    table_type = None
-    base = get_partition_base(part)
-    blkid, _ = command(
-        [
-            'blkid',
-            '-p',
-            '-o', 'udev',
-            base
-        ]
-    )
-    for line in blkid.splitlines():
-        (key, value) = line.split('=')
-        if key == 'ID_PART_TABLE_TYPE':
-            table_type = value
-    if table_type != 'gpt':
-        return None    # not even GPT
-
-    if saw_part_entry:
-        return None    # GPT, and blkid appears to be new, so we're done.
-
-    # bah, fall back to sgdisk.
-    if 'blkid' not in warned_about:
-        LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
-        warned_about['blkid'] = True
-    (base, partnum) = split_dev_base_partnum(part)
-    sgdisk, _ = command(
-        [
-            'sgdisk',
-            '-p',
-            base,
-        ]
-    )
+    return get_sgdisk_partition_info(part, 'Partition GUID code: (\S+)')
 
-    for line in sgdisk.splitlines():
-        m = re.search('\s+(\d+)\s+\d+\s+\d+\s+\S+ \S+B\s+\S+\s+(.*)', line)
-        if m is not None:
-            num = m.group(1)
-            if num != partnum:
-                continue
-            desc = m.group(2)
-            # assume unencrypted ... blkid has failed us :(
-            if desc == 'ceph data':
-                return OSD_UUID
-            if desc == 'ceph journal':
-                return JOURNAL_UUID
+def get_partition_uuid(part):
+    return get_sgdisk_partition_info(part, 'Partition unique GUID: (\S+)')
 
-    return None
-
-
-def get_partition_uuid(dev):
+def get_sgdisk_partition_info(dev, regexp):
     (base, partnum) = split_dev_base_partnum(dev)
     out, _ = command(['sgdisk', '-i', partnum, base])
     for line in out.splitlines():
-        m = re.match('Partition unique GUID: (\S+)', line)
+        m = re.match(regexp, line)
         if m:
             return m.group(1).lower()
     return None
 
-
-def more_osd_info(path, uuid_map):
-    desc = []
-    ceph_fsid = get_oneliner(path, 'ceph_fsid')
-    if ceph_fsid:
-        cluster = find_cluster_by_uuid(ceph_fsid)
-        if cluster:
-            desc.append('cluster ' + cluster)
-        else:
-            desc.append('unknown cluster ' + ceph_fsid)
-
-    who = get_oneliner(path, 'whoami')
-    if who:
-        desc.append('osd.%s' % who)
-
-    journal_uuid = get_oneliner(path, 'journal_uuid')
-    if journal_uuid:
-        journal_uuid = journal_uuid.lower()
-        if journal_uuid in uuid_map:
-            desc.append('journal %s' % uuid_map[journal_uuid])
-
-    return desc
-
-def list_dev_osd(dev, uuid_map):
-    path = is_mounted(dev)
-    fs_type = get_dev_fs(dev)
-    desc = []
-    if path:
-        desc.append('active')
-        desc.extend(more_osd_info(path, uuid_map))
-    elif fs_type:
+def more_osd_info(path, uuid_map, desc):
+    desc['ceph_fsid'] = get_oneliner(path, 'ceph_fsid')
+    if desc['ceph_fsid']:
+        desc['cluster'] = find_cluster_by_uuid(desc['ceph_fsid'])
+    desc['whoami'] = get_oneliner(path, 'whoami')
+    desc['journal_uuid'] = get_oneliner(path, 'journal_uuid')
+    if desc['journal_uuid']:
+        desc['journal_uuid'] = desc['journal_uuid'].lower()
+        if desc['journal_uuid'] in uuid_map:
+            desc['journal_dev'] = uuid_map[desc['journal_uuid']]
+
+def list_dev_osd(dev, uuid_map, desc):
+    desc['mount'] = is_mounted(dev)
+    desc['fs_type'] = get_dev_fs(dev)
+    desc['state'] = 'unprepared'
+    if desc['mount']:
+        desc['state'] = 'active'
+        more_osd_info(desc['mount'], uuid_map, desc)
+    elif desc['fs_type']:
         try:
-            tpath = mount(dev=dev, fstype=fs_type, options='')
+            tpath = mount(dev=dev, fstype=desc['fs_type'], options='')
             if tpath:
                 try:
                     magic = get_oneliner(tpath, 'magic')
                     if magic is not None:
-                        desc.append('prepared')
-                        desc.extend(more_osd_info(tpath, uuid_map))
+                        desc['magic'] = magic
+                        desc['state'] = 'prepared'
+                        more_osd_info(tpath, uuid_map, desc)
                 finally:
                     unmount(tpath)
         except MountError:
             pass
-    return desc
 
-def list_dev(dev, uuid_map, journal_map):
-    ptype = 'unknown'
-    prefix = ''
-    if is_partition(dev):
-        ptype = get_partition_type(dev)
-        prefix = ' '
+def list_format_more_osd_info_plain(dev):
+    desc = []
+    if dev.get('ceph_fsid'):
+        if dev.get('cluster'):
+            desc.append('cluster ' + dev['cluster'])
+        else:
+            desc.append('unknown cluster ' + dev['ceph_fsid'])
+    if dev.get('whoami'):
+        desc.append('osd.%s' % dev['whoami'])
+    if dev.get('journal_dev'):
+        desc.append('journal %s' % dev['journal_dev'])
+    return desc
 
+def list_format_dev_plain(dev, devices=[], prefix=''):
     desc = []
-    if ptype == OSD_UUID:
-        desc = list_dev_osd(dev, uuid_map)
-        if desc:
-            desc = ['ceph data'] + desc
+    if dev['ptype'] == OSD_UUID:
+        desc = ['ceph data', dev['state']] + list_format_more_osd_info_plain(dev)
+    elif dev['ptype'] in (DMCRYPT_OSD_UUID,
+                          DMCRYPT_LUKS_OSD_UUID):
+        dmcrypt = dev['dmcrypt']
+        if not dmcrypt['holders']:
+            desc = ['ceph data (dmcrypt %s)' % dmcrypt['type'], 'not currently mapped']
+        elif len(dmcrypt['holders']) == 1:
+            holder = '/dev/' + dmcrypt['holders'][0]
+            def lookup_dev(devices, path):
+                for device in devices:
+                    if device['path'] == path:
+                        return device
+            holder_dev = lookup_dev(devices, holder)
+            desc = ['ceph data (dmcrypt %s %s)' % (dmcrypt['type'], holder)] + list_format_more_osd_info_plain(holder_dev)
+        else:
+            desc = ['ceph data (dmcrypt %s)' % dmcrypt['type'], 'holders: ' + ','.join(dmcrypt['holders'])]
+    elif dev['ptype'] == JOURNAL_UUID:
+        desc.append('ceph journal')
+        if dev.get('journal_for'):
+            desc.append('for %s' % dev['journal_for'])
+    elif dev['ptype'] in (DMCRYPT_JOURNAL_UUID,
+                          DMCRYPT_LUKS_JOURNAL_UUID):
+        dmcrypt = dev['dmcrypt']
+        if dmcrypt['holders'] and len(dmcrypt['holders']) == 1:
+            desc = ['ceph journal (dmcrypt %s /dev/%s)' % (dmcrypt['type'], dmcrypt['holders'][0])]
         else:
-            desc = ['ceph data', 'unprepared']
+            desc = ['ceph journal (dmcrypt %s)' % dmcrypt['type']]
+        if dev.get('journal_for'):
+            desc.append('for %s' % dev['journal_for'])
+    else:
+        desc.append(dev['type'])
+        if dev.get('fs_type'):
+            desc.append(dev['fs_type'])
+        elif dev.get('ptype'):
+            desc.append(dev['ptype'])
+        if dev.get('mount'):
+            desc.append('mounted on %s' % dev['mount'])
+    return '%s%s %s' % (prefix, dev['path'], ', '.join(desc))
+
+def list_format_plain(devices):
+    lines = []
+    for device in devices:
+        if device.get('partitions'):
+            lines.append('%s :' % device['path'])
+            for p in sorted(device['partitions']):
+                lines.append(list_format_dev_plain(dev=p,
+                                                   devices=devices,
+                                                   prefix=' '))
+        else:
+            lines.append(list_format_dev_plain(dev=device,
+                                               devices=devices,
+                                               prefix=''))
+    return "\n".join(lines)
+
+def list_dev(dev, uuid_map, journal_map):
+    info = {
+        'path': dev,
+        'dmcrypt': {},
+    }
+
+    info['is_partition'] = is_partition(dev)
+    if info['is_partition']:
+        ptype = get_partition_type(dev)
+        info['uuid'] = get_partition_uuid(dev)
+    else:
+        ptype = 'unknown'
+    info['ptype'] = ptype
+    LOG.info("list_dev(dev = " + dev + ", ptype = " + str(ptype) + ")")
+    if ptype in (OSD_UUID, MPATH_OSD_UUID):
+        info['type'] = 'data'
+        if ptype == MPATH_OSD_UUID:
+            info['multipath'] = True
+        list_dev_osd(dev, uuid_map, info)
     elif ptype == DMCRYPT_OSD_UUID:
         holders = is_held(dev)
-        if not holders:
-            desc = ['ceph data (dmcrypt plain)', 'not currently mapped']
-        elif len(holders) == 1:
-            holder = '/dev/' + holders[0]
-            fs_desc = list_dev_osd(holder, uuid_map)
-            desc = ['ceph data (dmcrypt plain %s)' % holder] + fs_desc
-        else:
-            desc = ['ceph data (dmcrypt plain)', 'holders: ' + ','.join(holders)]
+        info['type'] = 'data'
+        info['dmcrypt']['holders'] = holders
+        info['dmcrypt']['type'] = 'plain'
+        if len(holders) == 1:
+            list_dev_osd('/dev/' + holders[0], uuid_map, info)
     elif ptype == DMCRYPT_LUKS_OSD_UUID:
         holders = is_held(dev)
-        if not holders:
-            desc = ['ceph data (dmcrypt LUKS)', 'not currently mapped']
-        elif len(holders) == 1:
-            holder = '/dev/' + holders[0]
-            fs_desc = list_dev_osd(holder, uuid_map)
-            desc = ['ceph data (dmcrypt LUKS %s)' % holder] + fs_desc
-        else:
-            desc = ['ceph data (dmcrypt LUKS)', 'holders: ' + ','.join(holders)]
-    elif ptype == JOURNAL_UUID:
-        desc.append('ceph journal')
-        part_uuid = get_partition_uuid(dev)
-        if part_uuid and part_uuid in journal_map:
-            desc.append('for %s' % journal_map[part_uuid])
+        info['type'] = 'data'
+        info['dmcrypt']['holders'] = holders
+        info['dmcrypt']['type'] = 'LUKS'
+        if len(holders) == 1:
+            list_dev_osd('/dev/' + holders[0], uuid_map, info)
+    elif ptype in (JOURNAL_UUID, MPATH_JOURNAL_UUID):
+        info['type'] = 'journal'
+        if ptype == MPATH_JOURNAL_UUID:
+            info['multipath'] = True
+        if info.get('uuid') in journal_map:
+            info['journal_for'] = journal_map[info['uuid']]
     elif ptype == DMCRYPT_JOURNAL_UUID:
         holders = is_held(dev)
-        if len(holders) == 1:
-            desc = ['ceph journal (dmcrypt plain /dev/%s)' % holders[0]]
-        else:
-            desc = ['ceph journal (dmcrypt plain)']
-        part_uuid = get_partition_uuid(dev)
-        if part_uuid and part_uuid in journal_map:
-            desc.append('for %s' % journal_map[part_uuid])
+        info['type'] = 'journal'
+        info['dmcrypt']['type'] = 'plain'
+        info['dmcrypt']['holders'] = holders
+        if info.get('uuid') in journal_map:
+            info['journal_for'] = journal_map[info['uuid']]
     elif ptype == DMCRYPT_LUKS_JOURNAL_UUID:
         holders = is_held(dev)
-        if len(holders) == 1:
-            desc = ['ceph journal (dmcrypt LUKS /dev/%s)' % holders[0]]
-        else:
-            desc = ['ceph journal (dmcrypt LUKS)']
-        part_uuid = get_partition_uuid(dev)
-        if part_uuid and part_uuid in journal_map:
-            desc.append('for %s' % journal_map[part_uuid])
+        info['type'] = 'journal'
+        info['dmcrypt']['type'] = 'LUKS'
+        info['dmcrypt']['holders'] = holders
+        if info.get('uuid') in journal_map:
+            info['journal_for'] = journal_map[info['uuid']]
     else:
         path = is_mounted(dev)
         fs_type = get_dev_fs(dev)
         if is_swap(dev):
-            desc.append('swap')
+            info['type'] = 'swap'
         else:
-            desc.append('other')
+            info['type'] = 'other'
         if fs_type:
-            desc.append(fs_type)
-        elif ptype:
-            desc.append(ptype)
+            info['fs_type'] = fs_type
         if path:
-            desc.append('mounted on %s' % path)
-
-    print '%s%s %s' % (prefix, dev, ', '.join(desc))
+            info['mount'] = path
 
+    return info
 
-def main_list(args):
-    partmap = list_all_partitions()
+def list_devices(args):
+    partmap = list_all_partitions(args.path)
 
     uuid_map = {}
     journal_map = {}
@@ -2579,11 +2863,26 @@ def main_list(args):
             if part_uuid:
                 uuid_map[part_uuid] = dev
             ptype = get_partition_type(dev)
-            if ptype == OSD_UUID:
-                fs_type = get_dev_fs(dev)
+            LOG.debug("main_list: " + dev +
+                      " ptype = " + str(ptype) +
+                      " uuid = " + str(part_uuid))
+            if ptype in (OSD_UUID,
+                         DMCRYPT_OSD_UUID,
+                         DMCRYPT_LUKS_OSD_UUID):
+                if ptype in (DMCRYPT_OSD_UUID,
+                             DMCRYPT_LUKS_OSD_UUID):
+                    holders = is_held(dev)
+                    if len(holders) != 1:
+                        continue
+                    dev_to_mount = '/dev/' + holders[0]
+                else:
+                    dev_to_mount = dev
+
+                fs_type = get_dev_fs(dev_to_mount)
                 if fs_type is not None:
                     try:
-                        tpath = mount(dev=dev, fstype=fs_type, options='')
+                        tpath = mount(dev=dev_to_mount,
+                                      fstype=fs_type, options='')
                         try:
                             journal_uuid = get_oneliner(tpath, 'journal_uuid')
                             if journal_uuid:
@@ -2592,30 +2891,34 @@ def main_list(args):
                             unmount(tpath)
                     except MountError:
                         pass
-            if ptype == DMCRYPT_OSD_UUID or ptype == DMCRYPT_LUKS_OSD_UUID:
-                holders = is_held(dev)
-                if len(holders) == 1:
-                    holder = '/dev/' + holders[0]
-                    fs_type = get_dev_fs(holder)
-                    if fs_type is not None:
-                        try:
-                            tpath = mount(dev=holder, fstype=fs_type, options='')
-                            try:
-                                journal_uuid = get_oneliner(tpath, 'journal_uuid')
-                                if journal_uuid:
-                                    journal_map[journal_uuid.lower()] = dev
-                            finally:
-                                unmount(tpath)
-                        except MountError:
-                            pass
 
+    LOG.debug("main_list: " + str(partmap) + ", uuid_map = " +
+              str(uuid_map) + ", journal_map = " + str(journal_map))
+
+    devices = []
     for base, parts in sorted(partmap.iteritems()):
         if parts:
-            print '%s :' % get_dev_path(base)
+            disk = { 'path': get_dev_path(base) }
+            partitions = []
             for p in sorted(parts):
-                list_dev(get_dev_path(p), uuid_map, journal_map)
+                partitions.append(list_dev(get_dev_path(p), uuid_map, journal_map))
+            disk['partitions'] = partitions
+            devices.append(disk)
         else:
-            list_dev(get_dev_path(base), uuid_map, journal_map)
+            device = list_dev(get_dev_path(base), uuid_map, journal_map)
+            device['path'] = get_dev_path(base)
+            devices.append(device)
+    LOG.debug("list_devices: " + str(devices))
+    return devices
+
+def main_list(args):
+    devices = list_devices(args)
+    if args.format == 'json':
+        print json.dumps(devices)
+    else:
+        output = list_format_plain(devices)
+        if output:
+            print output
 
 
 ###########################
@@ -2633,7 +2936,7 @@ def main_list(args):
 def is_suppressed(path):
     disk = os.path.realpath(path)
     try:
-        if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode):
+        if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(disk).st_mode):
             return False
         base = get_dev_name(disk)
         while len(base):
@@ -2691,6 +2994,185 @@ def main_zap(args):
 
 ###########################
 
+def main_trigger(args):
+    LOG.debug("main_trigger: " + str(args))
+    if is_systemd() and not args.sync:
+        # http://www.freedesktop.org/software/systemd/man/systemd-escape.html
+        escaped_dev = args.dev.replace('-', '\\x2d')
+        service='ceph-disk@{dev}.service'.format(dev=escaped_dev)
+        LOG.info('systemd detected, triggering %s' % service)
+        command(
+            [
+                'systemctl',
+                '--no-block',
+                'restart',
+                service,
+            ]
+        )
+        return
+    if is_upstart() and not args.sync:
+        LOG.info('upstart detected, triggering ceph-disk task')
+        command(
+            [
+                'initctl',
+                'emit',
+                'ceph-disk',
+                'dev={dev}'.format(dev=args.dev),
+                'pid={pid}'.format(pid=os.getpid()),
+            ]
+        )
+        return
+
+    parttype = get_partition_type(args.dev)
+    partid = get_partition_uuid(args.dev)
+
+    LOG.info('trigger {dev} parttype {parttype} uuid {partid}'.format(
+        dev=args.dev,
+        parttype=parttype,
+        partid=partid,
+        )
+    )
+
+    if parttype in [OSD_UUID, MPATH_OSD_UUID]:
+        command(
+            [
+                'ceph-disk',
+                'activate',
+                args.dev,
+            ]
+        )
+    elif parttype in [JOURNAL_UUID, MPATH_JOURNAL_UUID]:
+        command(
+            [
+                'ceph-disk',
+                'activate-journal',
+                args.dev,
+            ]
+        )
+
+        # journals are easy: map, chown, activate-journal
+    elif parttype == DMCRYPT_JOURNAL_UUID:
+        command(
+            [
+                '/sbin/cryptsetup',
+                '--key-file',
+                '/etc/ceph/dmcrypt-keys/{partid}'.format(partid=partid),
+                '--key-size',
+                '256',
+                'create',
+                partid,
+                args.dev,
+            ]
+        )
+        newdev='/dev/mapper/' + partid
+        count=0
+        while not os.path.exists(newdev) and count <= 10:
+            time.sleep(1)
+            count += 1
+        command(
+            [
+                '/bin/chown',
+                'ceph:ceph',
+                newdev,
+            ]
+        )
+        command(
+            [
+                '/usr/sbin/ceph-disk',
+                'activate-journal',
+                newdev,
+            ]
+            )
+    elif parttype == DMCRYPT_LUKS_JOURNAL_UUID:
+        command(
+            [
+                '/sbin/cryptsetup',
+                '--key-file',
+                '/etc/ceph/dmcrypt-keys/{partid}.luks.key'.format(
+                    partid=partid),
+                'luksOpen',
+                args.dev,
+                partid,
+            ]
+        )
+        newdev='/dev/mapper/' + partid
+        count=0
+        while not os.path.exists(newdev) and count <= 10:
+            time.sleep(1)
+            count += 1
+        command(
+            [
+                '/bin/chown',
+                'ceph:ceph',
+                newdev,
+            ]
+        )
+        command(
+            [
+                '/usr/sbin/ceph-disk',
+                'activate-journal',
+                newdev,
+            ]
+            )
+
+        # osd data: map, activate
+    elif parttype == DMCRYPT_OSD_UUID:
+        command(
+            [
+                '/sbin/cryptsetup',
+                '--key-file',
+                '/etc/ceph/dmcrypt-keys/{partid}'.format(partid=partid),
+                '--key-size',
+                '256',
+                'create',
+                partid,
+                args.dev,
+            ]
+        )
+        newdev='/dev/mapper/' + partid
+        count=0
+        while not os.path.exists(newdev) and count <= 10:
+            time.sleep(1)
+            count += 1
+        command(
+            [
+                '/usr/sbin/ceph-disk',
+                'activate',
+                newdev,
+            ]
+        )
+
+    elif parttype == DMCRYPT_LUKS_OSD_UUID:
+        command(
+            [
+                '/sbin/cryptsetup',
+                '--key-file',
+                '/etc/ceph/dmcrypt-keys/{partid}.luks.key'.format(
+                    partid=partid),
+                'luksOpen',
+                args.dev,
+                partid,
+            ]
+        )
+        newdev='/dev/mapper/' + partid
+        count=0
+        while not os.path.exists(newdev) and count <= 10:
+            time.sleep(1)
+            count += 1
+        command(
+            [
+                '/usr/sbin/ceph-disk',
+                'activate',
+                newdev,
+            ]
+        )
+
+    else:
+        raise Error('unrecognized partition type %s' % parttype)
+
+
+
+###########################
 
 def setup_statedir(dir):
     # XXX The following use of globals makes linting
@@ -2719,7 +3201,7 @@ def setup_sysconfdir(dir):
     SYSCONFDIR = dir
 
 
-def parse_args():
+def parse_args(argv):
     parser = argparse.ArgumentParser(
         'ceph-disk',
         )
@@ -2729,6 +3211,11 @@ def parse_args():
         help='be more verbose',
         )
     parser.add_argument(
+        '--log-stdout',
+        action='store_true', default=None,
+        help='log to stdout',
+        )
+    parser.add_argument(
         '--prepend-to-path',
         metavar='PATH',
         default='/usr/bin',
@@ -2749,7 +3236,6 @@ def parse_args():
     parser.set_defaults(
         # we want to hold on to this, for later
         prog=parser.prog,
-        cluster='ceph',
         )
 
     subparsers = parser.add_subparsers(
@@ -2758,10 +3244,40 @@ def parse_args():
         help='sub-command help',
         )
 
+    make_prepare_parser(subparsers)
+    make_activate_parser(subparsers)
+    make_activate_journal_parser(subparsers)
+    make_activate_all_parser(subparsers)
+    make_list_parser(subparsers)
+    make_suppress_parser(subparsers)
+    make_zap_parser(subparsers)
+    make_trigger_parser(subparsers)
+
+    args = parser.parse_args(argv)
+    return args
+
+def make_trigger_parser(subparsers):
+    trigger_parser = subparsers.add_parser('trigger', help='Trigger an event (called by udev)')
+    trigger_parser.add_argument(
+        'dev',
+        help=('device'),
+        )
+    trigger_parser.add_argument(
+        '--sync',
+        action='store_true', default=None,
+        help=('do operation synchronously; do not trigger systemd'),
+        )
+    trigger_parser.set_defaults(
+        func=main_trigger,
+        )
+    return trigger_parser
+
+def make_prepare_parser(subparsers):
     prepare_parser = subparsers.add_parser('prepare', help='Prepare a directory or disk for a Ceph OSD')
     prepare_parser.add_argument(
         '--cluster',
         metavar='NAME',
+        default='ceph',
         help='cluster name to assign this disk to',
         )
     prepare_parser.add_argument(
@@ -2834,7 +3350,9 @@ def parse_args():
     prepare_parser.set_defaults(
         func=main_prepare,
         )
+    return prepare_parser
 
+def make_activate_parser(subparsers):
     activate_parser = subparsers.add_parser('activate', help='Activate a Ceph OSD')
     activate_parser.add_argument(
         '--mount',
@@ -2855,16 +3373,34 @@ def parse_args():
         choices=INIT_SYSTEMS,
         )
     activate_parser.add_argument(
+        '--no-start-daemon',
+        action='store_true', default=None,
+        help='do not start the daemon',
+        )
+    activate_parser.add_argument(
         'path',
         metavar='PATH',
         nargs='?',
         help='path to block device or directory',
         )
+    activate_parser.add_argument(
+        '--dmcrypt',
+        action='store_true', default=None,
+        help='map DATA and/or JOURNAL devices with dm-crypt',
+        )
+    activate_parser.add_argument(
+        '--dmcrypt-key-dir',
+        metavar='KEYDIR',
+        default='/etc/ceph/dmcrypt-keys',
+        help='directory where dm-crypt keys are stored',
+        )
     activate_parser.set_defaults(
         activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring',
         func=main_activate,
         )
+    return activate_parser
 
+def make_activate_journal_parser(subparsers):
     activate_journal_parser = subparsers.add_parser('activate-journal', help='Activate an OSD via its journal device')
     activate_journal_parser.add_argument(
         'dev',
@@ -2884,11 +3420,24 @@ def parse_args():
         default='auto',
         choices=INIT_SYSTEMS,
         )
+    activate_journal_parser.add_argument(
+        '--dmcrypt',
+        action='store_true', default=None,
+        help='map DATA and/or JOURNAL devices with dm-crypt',
+        )
+    activate_journal_parser.add_argument(
+        '--dmcrypt-key-dir',
+        metavar='KEYDIR',
+        default='/etc/ceph/dmcrypt-keys',
+        help='directory where dm-crypt keys are stored',
+        )
     activate_journal_parser.set_defaults(
         activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring',
         func=main_activate_journal,
         )
+    return activate_journal_parser
 
+def make_activate_all_parser(subparsers):
     activate_all_parser = subparsers.add_parser('activate-all', help='Activate all tagged OSD partitions')
     activate_all_parser.add_argument(
         '--activate-key',
@@ -2907,12 +3456,28 @@ def parse_args():
         activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring',
         func=main_activate_all,
         )
+    return activate_all_parser
 
+def make_list_parser(subparsers):
     list_parser = subparsers.add_parser('list', help='List disks, partitions, and Ceph OSDs')
+    list_parser.add_argument(
+        '--format',
+        help='output format',
+        default='plain',
+        choices=['json','plain'],
+        )
+    list_parser.add_argument(
+        'path',
+        metavar='PATH',
+        nargs='*',
+        help='path to block devices, relative to /sys/block',
+        )
     list_parser.set_defaults(
         func=main_list,
         )
+    return list_parser
 
+def make_suppress_parser(subparsers):
     suppress_parser = subparsers.add_parser('suppress-activate', help='Suppress activate on a device (prefix)')
     suppress_parser.add_argument(
         'path',
@@ -2934,7 +3499,9 @@ def parse_args():
     unsuppress_parser.set_defaults(
         func=main_unsuppress,
         )
+    return suppress_parser
 
+def make_zap_parser(subparsers):
     zap_parser = subparsers.add_parser('zap', help='Zap/erase/destroy a device\'s partition table (and contents)')
     zap_parser.add_argument(
         'dev',
@@ -2945,21 +3512,12 @@ def parse_args():
     zap_parser.set_defaults(
         func=main_zap,
         )
+    return zap_parser
 
-    args = parser.parse_args()
-    return args
+def main(argv):
+    args = parse_args(argv)
 
-
-def main():
-    args = parse_args()
-
-    loglevel = logging.WARNING
-    if args.verbose:
-        loglevel = logging.DEBUG
-
-    logging.basicConfig(
-        level=loglevel,
-        )
+    setup_logging(args.verbose, args.log_stdout)
 
     if args.prepend_to_path != '':
         path = os.environ.get('PATH', os.defpath)
@@ -2968,8 +3526,32 @@ def main():
     setup_statedir(args.statedir)
     setup_sysconfdir(args.sysconfdir)
 
-    try:
+    if args.verbose:
         args.func(args)
+    else:
+        main_catch(args.func, args)
+
+def setup_logging(verbose, log_stdout):
+    loglevel = logging.WARNING
+    if verbose:
+        loglevel = logging.DEBUG
+
+    if log_stdout:
+        ch = logging.StreamHandler(stream=sys.stdout)
+        ch.setLevel(loglevel)
+        formatter = logging.Formatter('%(filename)s: %(message)s')
+        ch.setFormatter(formatter)
+        LOG.addHandler(ch)
+        LOG.setLevel(loglevel)
+    else:
+        logging.basicConfig(
+            level=loglevel,
+            )
+
+def main_catch(func, args):
+
+    try:
+        func(args)
 
     except Error as e:
         raise SystemExit(
@@ -2991,5 +3573,5 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main(sys.argv[1:])
     warned_about = {}
diff --git a/src/ceph-disk-activate b/src/ceph-disk-activate
deleted file mode 100755
index 72e89f9..0000000
--- a/src/ceph-disk-activate
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-dir=`dirname $0`
-$dir/ceph-disk activate $*
diff --git a/src/ceph-disk-prepare b/src/ceph-disk-prepare
deleted file mode 100755
index f9255eb..0000000
--- a/src/ceph-disk-prepare
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-dir=`dirname $0`
-$dir/ceph-disk prepare $*
diff --git a/src/ceph-disk-udev b/src/ceph-disk-udev
index dd2ac08..8846d26 100755
--- a/src/ceph-disk-udev
+++ b/src/ceph-disk-udev
@@ -56,7 +56,7 @@ case $ID_PART_ENTRY_TYPE in
     # for dm-crypted data devices
     /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} --key-size 256 create ${ID_PART_ENTRY_UUID} /dev/${NAME}
     bash -c 'while [ ! -e /dev/mapper/${ID_PART_ENTRY_UUID} ];do sleep 1; done'
-    /usr/sbin/ceph-disk-activate /dev/mapper/${ID_PART_ENTRY_UUID}
+    /usr/sbin/ceph-disk activate /dev/mapper/${ID_PART_ENTRY_UUID}
     ;;
 
 4fbd7e29-9d25-41b8-afd0-35865ceff05d)
@@ -65,7 +65,7 @@ case $ID_PART_ENTRY_TYPE in
     # for dm-crypted data devices
     /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} luksOpen /dev/${NAME} ${ID_PART_ENTRY_UUID}
     bash -c 'while [ ! -e /dev/mapper/${ID_PART_ENTRY_UUID} ];do sleep 1; done'
-    /usr/sbin/ceph-disk-activate /dev/mapper/${ID_PART_ENTRY_UUID}
+    /usr/sbin/ceph-disk activate /dev/mapper/${ID_PART_ENTRY_UUID}
     ;;
 
 89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be)
diff --git a/src/ceph-osd-prestart.sh b/src/ceph-osd-prestart.sh
index 77153c9..cefca85 100644
--- a/src/ceph-osd-prestart.sh
+++ b/src/ceph-osd-prestart.sh
@@ -17,6 +17,9 @@ if [ -z "$id"  ]; then
     exit 1;
 fi
 
+data="/var/lib/ceph/osd/${cluster:-ceph}-$id"
+journal="$data/journal"
+
 update="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_update_on_start || :)"
 
 if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then
@@ -27,11 +30,11 @@ if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then
     fi
     location="$($hook --cluster ${cluster:-ceph} --id $id --type osd)"
     weight="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_initial_weight || :)"
-    defaultweight=`df -P -k /var/lib/ceph/osd/${cluster:-ceph}-$id/ | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.2f", d); print r }'`
+    defaultweight=`df -P -k $data/ | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.4f", d); print r }'`
     ceph \
         --cluster="${cluster:-ceph}" \
         --name="osd.$id" \
-        --keyring="/var/lib/ceph/osd/${cluster:-ceph}-$id/keyring" \
+        --keyring="$data/keyring" \
         osd crush create-or-move \
         -- \
         "$id" \
@@ -39,7 +42,6 @@ if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then
         $location
 fi
 
-journal="/var/lib/ceph/osd/${cluster:-ceph}-$id/journal"
 if [ -L "$journal" -a ! -e "$journal" ]; then
     udevadm settle --timeout=5 || :
     if [ -L "$journal" -a ! -e "$journal" ]; then
@@ -48,3 +50,14 @@ if [ -L "$journal" -a ! -e "$journal" ]; then
         exit 0
     fi
 fi
+
+
+# ensure ownership is correct
+owner=`stat -c %U $data/.`
+if [ $owner != 'ceph' -a $owner != 'root' ]; then
+    echo "ceph-osd data dir $data is not owned by 'ceph' or 'root'"
+    echo "you must 'chown -R ceph:ceph ...' or similar to fix ownership"
+    exit 1
+fi
+
+exit 0
diff --git a/src/ceph-rbdnamer b/src/ceph-rbdnamer
index efb6804..846f321 100755
--- a/src/ceph-rbdnamer
+++ b/src/ceph-rbdnamer
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 DEV=$1
-NUM=`echo $DEV | sed 's#p.*##g' | tr -d 'a-z'`
+NUM=`echo $DEV | sed 's#p.*##g; s#[a-z]##g'`
 POOL=`cat /sys/devices/rbd/$NUM/pool`
 IMAGE=`cat /sys/devices/rbd/$NUM/name`
 SNAP=`cat /sys/devices/rbd/$NUM/current_snap`
diff --git a/src/ceph.in b/src/ceph.in
index 9f857ec..c6c7c49 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -1,3 +1,4 @@
+#!@PYTHON_EXECUTABLE@
 # -*- mode:python -*-
 # vim: ts=4 sw=4 smarttab expandtab
 #
@@ -22,6 +23,9 @@ import os
 import sys
 import platform
 
+CEPH_GIT_VER="@CEPH_GIT_VER@"
+CEPH_GIT_NICE_VER="@CEPH_GIT_NICE_VER@"
+
 # Make life easier on developers:
 # If in src/, and .libs and pybind exist here, assume we're running
 # from a Ceph source dir and tweak PYTHONPATH and LD_LIBRARY_PATH
@@ -31,9 +35,10 @@ MYPATH = os.path.abspath(__file__)
 MYDIR = os.path.dirname(MYPATH)
 DEVMODEMSG = '*** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH ***'
 
-if MYDIR.endswith('src') and \
-   os.path.exists(os.path.join(MYDIR, '.libs')) and \
-   os.path.exists(os.path.join(MYDIR, 'pybind')):
+def respawn_in_path(lib_path, pybind_path):
+    execv_cmd = ['python']
+    if 'CEPH_DBG' in os.environ:
+        execv_cmd += ['-mpdb']
 
     if platform.system() == "Darwin":
         lib_path_var = "DYLD_LIBRARY_PATH"
@@ -41,32 +46,55 @@ if MYDIR.endswith('src') and \
         lib_path_var = "LD_LIBRARY_PATH"
 
     py_binary = os.environ.get("PYTHON", "python")
-    MYLIBPATH = os.path.join(MYDIR, '.libs')
-    execv_cmd = ['python']
-    if 'CEPH_DBG' in os.environ:
-        execv_cmd += ['-mpdb']
+
     if lib_path_var in os.environ:
-        if MYLIBPATH not in os.environ[lib_path_var]:
-            os.environ[lib_path_var] += ':' + MYLIBPATH
+        if lib_path not in os.environ[lib_path_var]:
+            os.environ[lib_path_var] += ':' + lib_path
             print >> sys.stderr, DEVMODEMSG
             os.execvp(py_binary, execv_cmd + sys.argv)
     else:
-        os.environ[lib_path_var] = MYLIBPATH
+        os.environ[lib_path_var] = lib_path
         print >> sys.stderr, DEVMODEMSG
         os.execvp(py_binary, execv_cmd + sys.argv)
-    sys.path.insert(0, os.path.join(MYDIR, 'pybind'))
+    sys.path.insert(0, os.path.join(MYDIR, pybind_path))
+
+if MYDIR.endswith('src') and \
+   os.path.exists(os.path.join(MYDIR, '.libs')) and \
+   os.path.exists(os.path.join(MYDIR, 'pybind')):
+
+    respawn_in_path(os.path.join(MYDIR, '.libs'), "pybind")
     if os.environ.has_key('PATH') and MYDIR not in os.environ['PATH']:
         os.environ['PATH'] += ':' + MYDIR
 
+elif os.path.exists(os.path.join(os.getcwd(), "CMakeCache.txt")) \
+     and os.path.exists(os.path.join(os.getcwd(), "init-ceph")):
+    src_path = None
+    for l in open("./CMakeCache.txt").readlines():
+        if l.startswith("Ceph_SOURCE_DIR:STATIC="):
+            src_path = l.split("=")[1].strip()
+
+    if src_path is None:
+        # Huh, maybe we're not really in a cmake environment?
+        pass
+    else:
+        # Developer mode, but in a cmake build dir instead of the src dir
+        lib_path = os.path.join(os.getcwd(), "src")
+        pybind_path = os.path.join(src_path, "src", "pybind")
+        respawn_in_path(lib_path, pybind_path)
+
+    sys.path.insert(0, os.path.join(MYDIR, pybind_path))
+
+    # Add src/ to path for e.g. ceph-conf
+    if os.environ.has_key('PATH') and lib_path not in os.environ['PATH']:
+        os.environ['PATH'] += ':' + lib_path
+
 import argparse
 import errno
 import json
 import rados
 import shlex
 import signal
-import socket
 import string
-import struct
 import subprocess
 
 from ceph_argparse import \
@@ -74,6 +102,8 @@ from ceph_argparse import \
     matchnum, validate_command, find_cmd_target, \
     send_command, json_command
 
+from ceph_daemon import DaemonWatcher, admin_socket
+
 # just a couple of globals
 
 verbose = False
@@ -209,7 +239,7 @@ def do_extended_help(parser, args):
 
     def help_for_target(target, partial=None):
         ret, outbuf, outs = json_command(cluster_handle, target=target,
-                                         prefix='get_command_descriptions', 
+                                         prefix='get_command_descriptions',
                                          timeout=10)
         if ret:
             print >> sys.stderr, \
@@ -304,60 +334,6 @@ def format_help(cmddict, partial=None):
 
     return fullusage
 
-def admin_socket(asok_path, cmd, format=''):
-    """
-    Send a daemon (--admin-daemon) command 'cmd'.  asok_path is the
-    path to the admin socket; cmd is a list of strings; format may be
-    set to one of the formatted forms to get output in that form
-    (daemon commands don't support 'plain' output).
-    """
-
-    def do_sockio(path, cmd):
-        """ helper: do all the actual low-level stream I/O """
-        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
-        sock.connect(path)
-        try:
-            sock.sendall(cmd + '\0')
-            len_str = sock.recv(4)
-            if len(len_str) < 4:
-                raise RuntimeError("no data returned from admin socket")
-            l, = struct.unpack(">I", len_str)
-            ret = ''
-
-            got = 0
-            while got < l:
-                bit = sock.recv(l - got)
-                ret += bit
-                got += len(bit)
-
-        except Exception as e:
-            raise RuntimeError('exception: ' + str(e))
-        return ret
-
-    try:
-        cmd_json = do_sockio(asok_path,
-            json.dumps({"prefix":"get_command_descriptions"}))
-    except Exception as e:
-        raise RuntimeError('exception getting command descriptions: ' + str(e))
-
-    if cmd == 'get_command_descriptions':
-        return cmd_json
-
-    sigdict = parse_json_funcsigs(cmd_json, 'cli')
-    valid_dict = validate_command(sigdict, cmd)
-    if not valid_dict:
-        raise RuntimeError('invalid command')
-
-    if format:
-        valid_dict['format'] = format
-
-    try:
-        ret = do_sockio(asok_path, json.dumps(valid_dict))
-    except Exception as e:
-        raise RuntimeError('exception: ' + str(e))
-
-    return ret
-
 
 def ceph_conf(parsed_args, field, name):
     args=['ceph-conf']
@@ -389,7 +365,7 @@ if sys.stdin.isatty():
     def read_input():
         while True:
             line = raw_input(PROMPT).rstrip()
-            if line in ['q', 'quit', 'Q']:
+            if line in ['q', 'quit', 'Q', 'exit']:
                 return None
             if line:
                 return line
@@ -434,7 +410,7 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose):
             if sys.stdin.isatty():
                 # do the command-interpreter looping
                 # for raw_input to do readline cmd editing
-                import readline
+                import readline  # noqa
 
             while True:
                 interactive_input = read_input()
@@ -475,7 +451,7 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose):
 def complete(sigdict, args, target):
     """
     Command completion.  Match as much of [args] as possible,
-    and print every possible match separated by newlines. 
+    and print every possible match separated by newlines.
     Return exitcode.
     """
     # XXX this looks a lot like the front of validate_command().  Refactor?
@@ -534,14 +510,20 @@ def complete(sigdict, args, target):
 ###
 # ping a monitor
 ###
-def ping_monitor(cluster_handle, name):
+def ping_monitor(cluster_handle, name, timeout):
     if 'mon.' not in name:
         print >> sys.stderr, '"ping" expects a monitor to ping; try "ping mon.<id>"'
         return 1
 
     mon_id = name[len('mon.'):]
-    s = cluster_handle.ping_monitor(mon_id)
-    print s
+    if (mon_id == '*') :
+        cluster_handle.connect(timeout=timeout)
+        for m in monids() :
+            s = cluster_handle.ping_monitor(m)
+            print "mon.{0}".format(m) + '\n' + s
+    else :
+            s = cluster_handle.ping_monitor(mon_id)
+            print s
     return 0
 
 ###
@@ -559,7 +541,7 @@ def main():
     parser, parsed_args, childargs = parse_cmdargs()
 
     if parsed_args.version:
-        print 'ceph version {0} ({1})'.format(CEPH_GIT_NICE_VER, CEPH_GIT_VER)
+        print 'ceph version {0} ({1})'.format(CEPH_GIT_NICE_VER, CEPH_GIT_VER)  # noqa
         return 0
 
     global verbose
@@ -589,12 +571,16 @@ def main():
 
     format = parsed_args.output_format
 
+    daemon_perf = False
     sockpath = None
     if parsed_args.admin_socket:
         sockpath = parsed_args.admin_socket
-    elif len(childargs) > 0 and childargs[0] == "daemon":
+    elif len(childargs) > 0 and childargs[0] in ["daemon", "daemonperf"]:
+        daemon_perf = (childargs[0] == "daemonperf")
         # Treat "daemon <path>" or "daemon <name>" like --admin_daemon <path>
-        if len(childargs) > 2:
+        # Handle "daemonperf <path>" the same but requires no trailing args
+        require_args = 2 if daemon_perf else 3
+        if len(childargs) >= require_args:
             if childargs[1].find('/') >= 0:
                 sockpath = childargs[1]
             else:
@@ -609,10 +595,29 @@ def main():
             # for both:
             childargs = childargs[2:]
         else:
-            print >> sys.stderr, 'daemon requires at least 3 arguments'
+            print >> sys.stderr, '{0} requires at least {1} arguments'.format(
+                childargs[0], require_args)
             return errno.EINVAL
 
-    if sockpath:
+    if sockpath and daemon_perf:
+        interval = 1
+        count = None
+        if len(childargs) > 0:
+            try:
+                interval = float(childargs[0])
+                if interval < 0:
+                    raise ValueError
+            except ValueError:
+                print >> sys.stderr, 'daemonperf: interval should be a positive number'
+                return errno.EINVAL
+        if len(childargs) > 1:
+            if not childargs[1].isdigit():
+                print >> sys.stderr, 'daemonperf: count should be a positive integer'
+                return errno.EINVAL
+            count = int(childargs[1])
+        DaemonWatcher(sockpath).run(interval, count)
+        return 0
+    elif sockpath:
         try:
             print admin_socket(sockpath, childargs, format)
         except Exception as e:
@@ -682,6 +687,9 @@ def main():
         print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}.<id> <command> [options...]" instead (id can be "*") '.format(childargs[0])
         return 1
 
+    if childargs in [['mon'], ['osd']]:
+        parsed_args.help = True
+
     if parsed_args.help:
         # short default timeout for -h
         if not timeout:
@@ -697,7 +705,7 @@ def main():
 
     try:
         if childargs and childargs[0] == 'ping':
-            return ping_monitor(cluster_handle, childargs[1])
+            return ping_monitor(cluster_handle, childargs[1], timeout)
         cluster_handle.connect(timeout=timeout)
     except KeyboardInterrupt:
         print >> sys.stderr, 'Cluster connection aborted'
@@ -741,7 +749,7 @@ def main():
 
         # this instance keeps the watch connection alive, but is
         # otherwise unused
-        logwatch = rados.MonitorLog(cluster_handle, level, watch_cb, 0)
+        rados.MonitorLog(cluster_handle, level, watch_cb, 0)
 
         # loop forever letting watch_cb print lines
         try:
diff --git a/src/ceph_common.sh b/src/ceph_common.sh
index 07faddc..0a4ac22 100644
--- a/src/ceph_common.sh
+++ b/src/ceph_common.sh
@@ -220,10 +220,10 @@ get_conf() {
 
 	if [ -z "$1" ]; then
 	    [ "$verbose" -eq 1 ] && echo "$CCONF -c $conf -n $type.$id \"$key\""
-	    eval "$var=\"`$CCONF -c $conf -n $type.$id \"$key\" || eval echo -n \"$def\"`\""
+	    eval "$var=\"`$CCONF -c $conf -n $type.$id \"$key\" || printf \"$def\"`\""
 	else
 	    [ "$verbose" -eq 1 ] && echo "$CCONF -c $conf -s $1 \"$key\""
-	    eval "$var=\"`$CCONF -c $conf -s $1 \"$key\" || eval echo -n \"$def\"`\""
+	    eval "$var=\"`$CCONF -c $conf -s $1 \"$key\" || eval printf \"$def\"`\""
 	fi
 }
 
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index 7616850..9a00c29 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -30,10 +30,11 @@ using namespace std;
 
 #include "common/Timer.h"
 #include "common/ceph_argparse.h"
+#include "common/linux_version.h"
 #include "global/global_init.h"
 #include "common/safe_io.h"
        
-#ifndef DARWIN
+#if !defined(DARWIN) && !defined(__FreeBSD__)
 #include <envz.h>
 #endif // DARWIN
 
@@ -121,20 +122,12 @@ int main(int argc, const char **argv, const char *envp[]) {
       }
       virtual ~RemountTest() {}
       virtual void *entry() {
-	struct utsname os_info;
-	int tr = uname(&os_info);
-	assert(tr == 0);
-	assert(memcmp(os_info.sysname, "Linux", 5) == 0);
-	int major, minor;	
-	char *end_num;
-	major = strtol(os_info.release, &end_num, 10);
-	assert(major > 0);
-	++end_num;
-	minor = strtol(end_num, NULL, 10);
+#if defined(__linux__)
+	int ver = get_linux_version();
+	assert(ver != 0);
 	bool can_invalidate_dentries = g_conf->client_try_dentry_invalidate &&
-	  (major < 3 ||
-	   (major == 3 && minor < 18));
-	tr = client->test_dentry_handling(can_invalidate_dentries);
+				       ver < KERNEL_VERSION(3, 18, 0);
+	int tr = client->test_dentry_handling(can_invalidate_dentries);
 	if (tr != 0) {
 	  cerr << "ceph-fuse[" << getpid()
 	       << "]: fuse failed dentry invalidate/remount test with error "
@@ -143,9 +136,25 @@ int main(int argc, const char **argv, const char *envp[]) {
 	  char buf[5050];
 	  string mountpoint = cfuse->get_mount_point();
 	  snprintf(buf, 5049, "fusermount -u -z %s", mountpoint.c_str());
-	  system(buf);
+	  int umount_r = system(buf);
+	  if (umount_r) {
+	    if (umount_r != -1) {
+	      if (WIFEXITED(umount_r)) {
+		umount_r = WEXITSTATUS(umount_r);
+		cerr << "got error " << umount_r
+		     << " when unmounting Ceph on failed remount test!" << std::endl;
+	      } else {
+		cerr << "attempt to umount on failed remount test failed (on a signal?)" << std::endl;
+	      }
+	    } else {
+	      cerr << "system() invocation failed during remount test" << std::endl;
+	    }
+	  }
 	}
 	return reinterpret_cast<void*>(tr);
+#else
+	return reinterpret_cast<void*>(0);
+#endif
       }
     } tester;
 
@@ -163,9 +172,7 @@ int main(int argc, const char **argv, const char *envp[]) {
       goto out_mc_start_failed;
 
     // start up network
-    messenger = Messenger::create(g_ceph_context, g_conf->ms_type,
-				  entity_name_t::CLIENT(), "client",
-				  getpid());
+    messenger = Messenger::create_client_messenger(g_ceph_context, "client");
     messenger->set_default_policy(Messenger::Policy::lossy_client(0, 0));
     messenger->set_policy(entity_name_t::TYPE_MDS,
 			  Messenger::Policy::lossless_client(0, 0));
@@ -201,8 +208,10 @@ int main(int argc, const char **argv, const char *envp[]) {
 
     // start up fuse
     // use my argc, argv (make sure you pass a mount point!)
-    r = client->mount(g_conf->client_mountpoint.c_str());
+    r = client->mount(g_conf->client_mountpoint.c_str(), g_ceph_context->_conf->fuse_require_active_mds);
     if (r < 0) {
+      if (r == CEPH_FUSE_NO_MDS_UP)
+        cerr << "ceph-fuse[" << getpid() << "]: probably no MDS server is up?" << std::endl;
       cerr << "ceph-fuse[" << getpid() << "]: ceph mount failed with " << cpp_strerror(-r) << std::endl;
       goto out_shutdown;
     }
diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc
index 06269fb..91ff002 100644
--- a/src/ceph_mds.cc
+++ b/src/ceph_mds.cc
@@ -26,7 +26,7 @@ using namespace std;
 #include "common/strtol.h"
 
 #include "mon/MonMap.h"
-#include "mds/MDS.h"
+#include "mds/MDSDaemon.h"
 
 #include "msg/Messenger.h"
 
@@ -78,7 +78,7 @@ static int parse_rank(const char *opt_name, const std::string &val)
 
 
 
-MDS *mds = NULL;
+MDSDaemon *mds = NULL;
 
 
 static void handle_mds_signal(int signum)
@@ -199,13 +199,13 @@ int main(int argc, const char **argv)
   msgr->start();
 
   // start mds
-  mds = new MDS(g_conf->name.get_id().c_str(), msgr, &mc);
+  mds = new MDSDaemon(g_conf->name.get_id().c_str(), msgr, &mc);
 
   // in case we have to respawn...
   mds->orig_argc = argc;
   mds->orig_argv = argv;
 
-  if (shadow)
+  if (shadow != MDSMap::STATE_NULL)
     r = mds->init(shadow);
   else
     r = mds->init();
@@ -238,8 +238,10 @@ int main(int argc, const char **argv)
 
   // only delete if it was a clean shutdown (to aid memory leak
   // detection, etc.).  don't bother if it was a suicide.
-  if (mds->is_stopped())
+  if (mds->is_clean_shutdown()) {
     delete mds;
+    delete msgr;
+  }
 
   g_ceph_context->put();
 
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index e0ca6e2..3180b65 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -188,12 +188,12 @@ void usage()
 
 int preload_erasure_code()
 {
-  string directory = g_conf->osd_pool_default_erasure_code_directory;
   string plugins = g_conf->osd_erasure_code_plugins;
   stringstream ss;
-  int r = ErasureCodePluginRegistry::instance().preload(plugins,
-							directory,
-							ss);
+  int r = ErasureCodePluginRegistry::instance().preload(
+    plugins,
+    g_conf->erasure_code_dir,
+    &ss);
   if (r)
     derr << ss.str() << dendl;
   else
@@ -497,9 +497,17 @@ int main(int argc, const char **argv)
   Preforker prefork;
   if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) {
     if (global_init_prefork(g_ceph_context, 0) >= 0) {
-      prefork.prefork();
+      string err_msg;
+      err = prefork.prefork(err_msg);
+      if (err < 0) {
+        cerr << err_msg << std::endl;
+        prefork.exit(err);
+      }
       if (prefork.is_parent()) {
-	return prefork.parent_wait();
+        err = prefork.parent_wait(err_msg);
+        if (err < 0)
+          cerr << err_msg << std::endl;
+        prefork.exit(err);
       }
       global_init_postfork_start(g_ceph_context);
     }
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 884b7ed..68d2c91 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -15,7 +15,6 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
-#include <uuid/uuid.h>
 #include <boost/scoped_ptr.hpp>
 
 #include <iostream>
@@ -34,6 +33,7 @@ using namespace std;
 #include "msg/Messenger.h"
 
 #include "common/Timer.h"
+#include "common/TracepointProvider.h"
 #include "common/ceph_argparse.h"
 
 #include "global/global_init.h"
@@ -51,6 +51,15 @@ using namespace std;
 
 #define dout_subsys ceph_subsys_osd
 
+namespace {
+
+TracepointProvider::Traits osd_tracepoint_traits("libosd_tp.so",
+                                                 "osd_tracing");
+TracepointProvider::Traits os_tracepoint_traits("libos_tp.so",
+                                                "osd_objectstore_tracing");
+
+} // anonymous namespace
+
 OSD *osd = NULL;
 
 void handle_osd_signal(int signum)
@@ -61,20 +70,35 @@ void handle_osd_signal(int signum)
 
 void usage() 
 {
-  derr << "usage: ceph-osd -i osdid [--osd-data=path] [--osd-journal=path] "
-       << "[--mkfs] [--mkjournal] [--convert-filestore]" << dendl;
-  derr << "   --debug_osd N   set debug level (e.g. 10)" << dendl;
+  cout << "usage: ceph-osd -i <osdid>\n"
+       << "  --osd-data=path   data directory\n"
+       << "  --osd-journal=path\n"
+       << "                    journal file or block device\n"
+       << "  --mkfs            create a [new] data directory\n"
+       << "  --convert-filestore\n"
+       << "                    run any pending upgrade operations\n"
+       << "  --flush-journal   flush all data out of journal\n"
+       << "  --mkjournal       initialize a new journal\n"
+       << "  --check-wants-journal\n"
+       << "                    check whether a journal is desired\n"
+       << "  --check-allows-journal\n"
+       << "                    check whether a journal is allowed\n"
+       << "  --check-needs-journal\n"
+       << "                    check whether a journal is required\n"
+       << "  --debug_osd N     set debug level (e.g. 10)"
+       << std::endl;
   generic_server_usage();
+  cout.flush();
 }
 
 int preload_erasure_code()
 {
-  string directory = g_conf->osd_pool_default_erasure_code_directory;
   string plugins = g_conf->osd_erasure_code_plugins;
   stringstream ss;
-  int r = ErasureCodePluginRegistry::instance().preload(plugins,
-							directory,
-							ss);
+  int r = ErasureCodePluginRegistry::instance().preload(
+    plugins,
+    g_conf->erasure_code_dir,
+    &ss);
   if (r)
     derr << ss.str() << dendl;
   else
@@ -99,6 +123,9 @@ int main(int argc, const char **argv)
   // osd specific args
   bool mkfs = false;
   bool mkjournal = false;
+  bool check_wants_journal = false;
+  bool check_allows_journal = false;
+  bool check_needs_journal = false;
   bool mkkey = false;
   bool flushjournal = false;
   bool dump_journal = false;
@@ -106,7 +133,6 @@ int main(int argc, const char **argv)
   bool get_journal_fsid = false;
   bool get_osd_fsid = false;
   bool get_cluster_fsid = false;
-  bool check_need_journal = false;
   std::string dump_pg_log;
 
   std::string val;
@@ -120,6 +146,12 @@ int main(int argc, const char **argv)
       mkfs = true;
     } else if (ceph_argparse_flag(args, i, "--mkjournal", (char*)NULL)) {
       mkjournal = true;
+    } else if (ceph_argparse_flag(args, i, "--check-allows-journal", (char*)NULL)) {
+      check_allows_journal = true;
+    } else if (ceph_argparse_flag(args, i, "--check-wants-journal", (char*)NULL)) {
+      check_wants_journal = true;
+    } else if (ceph_argparse_flag(args, i, "--check-needs-journal", (char*)NULL)) {
+      check_needs_journal = true;
     } else if (ceph_argparse_flag(args, i, "--mkkey", (char*)NULL)) {
       mkkey = true;
     } else if (ceph_argparse_flag(args, i, "--flush-journal", (char*)NULL)) {
@@ -136,8 +168,6 @@ int main(int argc, const char **argv)
       get_osd_fsid = true;
     } else if (ceph_argparse_flag(args, i, "--get-journal-fsid", "--get-journal-uuid", (char*)NULL)) {
       get_journal_fsid = true;
-    } else if (ceph_argparse_flag(args, i, "--check-needs-journal", (char*)NULL)) {
-      check_need_journal = true;
     } else {
       ++i;
     }
@@ -259,6 +289,33 @@ int main(int argc, const char **argv)
 	 << " for object store " << g_conf->osd_data << dendl;
     exit(0);
   }
+  if (check_wants_journal) {
+    if (store->wants_journal()) {
+      cout << "yes" << std::endl;
+      exit(0);
+    } else {
+      cout << "no" << std::endl;
+      exit(1);
+    }
+  }
+  if (check_allows_journal) {
+    if (store->allows_journal()) {
+      cout << "yes" << std::endl;
+      exit(0);
+    } else {
+      cout << "no" << std::endl;
+      exit(1);
+    }
+  }
+  if (check_needs_journal) {
+    if (store->needs_journal()) {
+      cout << "yes" << std::endl;
+      exit(0);
+    } else {
+      cout << "no" << std::endl;
+      exit(1);
+    }
+  }
   if (flushjournal) {
     common_init_finish(g_ceph_context);
     int err = store->mount();
@@ -268,7 +325,6 @@ int main(int argc, const char **argv)
 	   << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
       exit(1);
     }
-    store->sync_and_flush();
     store->umount();
     derr << "flushed journal " << g_conf->osd_journal
 	 << " for object store " << g_conf->osd_data
@@ -317,14 +373,6 @@ int main(int argc, const char **argv)
     exit(r);
   }
 
-  if (check_need_journal) {
-    if (store->need_journal())
-      cout << "yes" << std::endl;
-    else
-      cout << "no" << std::endl;
-    exit(0);
-  }
-
   string magic;
   uuid_d cluster_fsid, osd_fsid;
   int w;
@@ -373,7 +421,7 @@ int main(int argc, const char **argv)
 					   getpid());
   Messenger *ms_cluster = Messenger::create(g_ceph_context, g_conf->ms_type,
 					    entity_name_t::OSD(whoami), "cluster",
-					    getpid());
+					    getpid(), CEPH_FEATURES_ALL);
   Messenger *ms_hbclient = Messenger::create(g_ceph_context, g_conf->ms_type,
 					     entity_name_t::OSD(whoami), "hbclient",
 					     getpid());
@@ -412,11 +460,17 @@ int main(int argc, const char **argv)
     CEPH_FEATURE_MSG_AUTH |
     CEPH_FEATURE_OSD_ERASURE_CODES;
 
+  // All feature bits 0 - 34 should be present from dumpling v0.67 forward
   uint64_t osd_required =
     CEPH_FEATURE_UID |
     CEPH_FEATURE_PGID64 |
     CEPH_FEATURE_OSDENC |
-    CEPH_FEATURE_OSD_SNAPMAPPER;
+    CEPH_FEATURE_OSD_SNAPMAPPER |
+    CEPH_FEATURE_INDEP_PG_MAP |
+    CEPH_FEATURE_OSD_PACKED_RECOVERY |
+    CEPH_FEATURE_RECOVERY_RESERVATION |
+    CEPH_FEATURE_BACKFILL_RESERVATION |
+    CEPH_FEATURE_CHUNKY_SCRUB;
 
   ms_public->set_default_policy(Messenger::Policy::stateless_server(supported, 0));
   ms_public->set_policy_throttlers(entity_name_t::TYPE_CLIENT,
@@ -484,6 +538,9 @@ int main(int argc, const char **argv)
   global_init_daemonize(g_ceph_context, 0);
   common_init_finish(g_ceph_context);
 
+  TracepointProvider::initialize<osd_tracepoint_traits>(g_ceph_context);
+  TracepointProvider::initialize<os_tracepoint_traits>(g_ceph_context);
+
   MonClient mc(g_ceph_context);
   if (mc.build_initial_monmap() < 0)
     return -1;
diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc
index 71db206..1d10fa2 100644
--- a/src/ceph_syn.cc
+++ b/src/ceph_syn.cc
@@ -65,10 +65,8 @@ int main(int argc, const char **argv, char *envp[])
 
   cout << "ceph-syn: starting " << g_conf->num_client << " syn client(s)" << std::endl;
   for (int i=0; i<g_conf->num_client; i++) {
-      messengers[i] = Messenger::create(
-	g_ceph_context, g_conf->ms_type,
-	entity_name_t(entity_name_t::TYPE_CLIENT,-1), "synclient",
-	i * 1000000 + getpid());
+    messengers[i] = Messenger::create_client_messenger(g_ceph_context,
+						       "synclient");
     messengers[i]->bind(g_conf->public_addr);
     mclients[i] = new MonClient(g_ceph_context);
     mclients[i]->build_initial_monmap();
diff --git a/src/check_version b/src/check_version
deleted file mode 100755
index 8600c55..0000000
--- a/src/check_version
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/sh
-
-dname=`dirname $0`
-
-if [ ! -d $dname/../.git ]; then
-    echo "not updating .git_version (no $dname/../.git)"
-    exit 0
-fi
-
-cur=`cd $dname && git rev-parse HEAD 2>/dev/null; git describe 2>/dev/null`
-[ -e $1 ] && old=`cat $1`
-
-if [ "$cur" != "$old" ]; then
-    echo regenerating $1 with $cur
-    echo "$cur" > $1
-else
-    echo $1 is up to date.
-fi
-
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 0d85db2..546066a 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -22,6 +22,17 @@
 #include <sys/param.h>
 #include <fcntl.h>
 #include <sys/utsname.h>
+#include <sys/uio.h>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+
+#if defined(__FreeBSD__)
+#define XATTR_CREATE    0x1
+#define XATTR_REPLACE   0x2
+#else
+#include <sys/xattr.h>
+#endif
 
 #if defined(__linux__)
 #include <linux/falloc.h>
@@ -74,8 +85,8 @@ using namespace std;
 #define dout_subsys ceph_subsys_client
 
 #include "include/lru.h"
-
 #include "include/compat.h"
+#include "include/stringify.h"
 
 #include "Client.h"
 #include "Inode.h"
@@ -90,11 +101,21 @@ using namespace std;
 #include "include/assert.h"
 #include "include/stat.h"
 
+#if HAVE_GETGROUPLIST
+#include <grp.h>
+#include <pwd.h>
+#include <unistd.h>
+#endif
+
 #undef dout_prefix
 #define dout_prefix *_dout << "client." << whoami << " "
 
 #define  tout(cct)       if (!cct->_conf->client_trace.empty()) traceout
 
+// FreeBSD fails to define this
+#ifndef O_DSYNC
+#define O_DSYNC 0x0
+#endif
 // Darwin fails to define this
 #ifndef O_RSYNC
 #define O_RSYNC 0x0
@@ -148,7 +169,60 @@ dir_result_t::dir_result_t(Inode *in)
   : inode(in), offset(0), this_offset(2), next_offset(2),
     release_count(0), ordered_count(0), start_shared_gen(0),
     buffer(0) {
-  inode->get();
+}
+
+void Client::_reset_faked_inos()
+{
+  ino_t start = 1024;
+  free_faked_inos.clear();
+  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
+  last_used_faked_ino = 0;
+  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
+}
+
+void Client::_assign_faked_ino(Inode *in)
+{
+  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
+  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
+    last_used_faked_ino = 0;
+    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
+  }
+  assert(it != free_faked_inos.end());
+  if (last_used_faked_ino < it.get_start()) {
+    assert(it.get_len() > 0);
+    last_used_faked_ino = it.get_start();
+  } else {
+    ++last_used_faked_ino;
+    assert(it.get_start() + it.get_len() > last_used_faked_ino);
+  }
+  in->faked_ino = last_used_faked_ino;
+  free_faked_inos.erase(in->faked_ino);
+  faked_ino_map[in->faked_ino] = in->vino();
+}
+
+void Client::_release_faked_ino(Inode *in)
+{
+  free_faked_inos.insert(in->faked_ino);
+  faked_ino_map.erase(in->faked_ino);
+}
+
+vinodeno_t Client::_map_faked_ino(ino_t ino)
+{
+  vinodeno_t vino;
+  if (ino == 1)
+    vino = root->vino();
+  else if (faked_ino_map.count(ino))
+    vino = faked_ino_map[ino];
+  else
+    vino = vinodeno_t(0, CEPH_NOSNAP);
+  ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
+  return vino;
+}
+
+vinodeno_t Client::map_faked_ino(ino_t ino)
+{
+  Mutex::Locker lock(client_lock);
+  return _map_faked_ino(ino);
 }
 
 // cons/des
@@ -174,6 +248,7 @@ Client::Client(Messenger *m, MonClient *mc)
     tick_event(NULL),
     monclient(mc), messenger(m), whoami(m->get_myname().num()),
     cap_epoch_barrier(0),
+    last_tid(0), oldest_tid(0), last_flush_tid(1),
     initialized(false), authenticated(false),
     mounted(false), unmounting(false),
     local_osd(-1), local_osd_epoch(0),
@@ -182,11 +257,7 @@ Client::Client(Messenger *m, MonClient *mc)
 {
   monclient->set_messenger(m);
 
-  last_tid = 0;
-  last_flush_seq = 0;
-
-  cwd = NULL;
-
+  _reset_faked_inos();
   //
   root = 0;
 
@@ -241,11 +312,6 @@ Client::~Client()
   delete logger;
 }
 
-
-
-
-
-
 void Client::tear_down_cache()
 {
   // fd's
@@ -272,12 +338,10 @@ void Client::tear_down_cache()
     delete root;
     root = 0;
     root_ancestor = 0;
-    while (!root_parents.empty()) {
-      Inode *in = root_parents.begin()->second;
+    while (!root_parents.empty())
       root_parents.erase(root_parents.begin());
-      delete in;
-    }
     inode_map.clear();
+    _reset_faked_inos();
   }
 
   assert(inode_map.empty());
@@ -330,7 +394,7 @@ void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconne
 	f->close_section();
       }	
       if (it->second->inode)
-	dump_inode(f, it->second->inode, did, false);
+	dump_inode(f, it->second->inode.get(), did, false);
     }
   }
 }
@@ -422,16 +486,16 @@ int Client::init()
 
   // logger
   PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
-  plb.add_time_avg(l_c_reply, "reply");
-  plb.add_time_avg(l_c_lat, "lat");
-  plb.add_time_avg(l_c_wrlat, "wrlat");
-  plb.add_time_avg(l_c_owrlat, "owrlat");
-  plb.add_time_avg(l_c_ordlat, "ordlat");
+  plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
+  plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
+  plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
   logger = plb.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
 
   client_lock.Unlock();
 
+  cct->_conf->add_observer(this);
+
   AdminSocket* admin_socket = cct->get_admin_socket();
   int ret = admin_socket->register_command("mds_requests",
 					   "mds_requests",
@@ -486,6 +550,8 @@ void Client::shutdown()
 {
   ldout(cct, 1) << "shutdown" << dendl;
 
+  cct->_conf->remove_observer(this);
+
   AdminSocket* admin_socket = cct->get_admin_socket();
   admin_socket->unregister_command("mds_requests");
   admin_socket->unregister_command("mds_sessions");
@@ -544,7 +610,7 @@ void Client::shutdown()
 // ===================
 // metadata cache stuff
 
-void Client::trim_cache()
+void Client::trim_cache(bool trim_kernel_dcache)
 {
   ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << lru.lru_get_max() << dendl;
   unsigned last = 0;
@@ -561,18 +627,19 @@ void Client::trim_cache()
     trim_dentry(dn);
   }
 
+  if (trim_kernel_dcache && lru.lru_get_size() > lru.lru_get_max())
+    _invalidate_kernel_dcache();
+
   // hose root?
   if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
     ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
     delete root;
     root = 0;
     root_ancestor = 0;
-    while (!root_parents.empty()) {
-      Inode *in = root_parents.begin()->second;
+    while (!root_parents.empty())
       root_parents.erase(root_parents.begin());
-      delete in;
-    }
     inode_map.clear();
+    _reset_faked_inos();
   }
 }
 
@@ -611,10 +678,13 @@ void Client::trim_dentry(Dentry *dn)
   ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name 
 		 << " in dir " << hex << dn->dir->parent_inode->ino 
 		 << dendl;
-  dn->dir->release_count++;
-  if (dn->dir->parent_inode->flags & I_COMPLETE) {
-    ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *dn->dir->parent_inode << dendl;
-    dn->dir->parent_inode->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
+  if (dn->inode) {
+    dn->dir->release_count++;
+    if (dn->dir->parent_inode->flags & I_COMPLETE) {
+      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on "
+		     << *dn->dir->parent_inode << dendl;
+      dn->dir->parent_inode->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
+    }
   }
   unlink(dn, false, false);  // drop dir, drop dentry
 }
@@ -740,17 +810,19 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from,
     in = inode_map[st->vino];
     ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
   } else {
-    in = new Inode(cct, st->vino, &st->layout);
+    in = new Inode(this, st->vino, &st->layout);
     inode_map[st->vino] = in;
+
+    if (use_faked_inos())
+      _assign_faked_ino(in);
+
     if (!root) {
       root = in;
       root_ancestor = in;
       cwd = root;
-      cwd->get();
     } else if (!mounted) {
       root_parents[root_ancestor] = in;
       root_ancestor = in;
-      in->get();
     }
 
     // immutable bits
@@ -892,8 +964,8 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl
     }
   }
   
-  if (!dn || dn->inode == 0) {
-    in->get();
+  if (!dn || !dn->inode) {
+    InodeRef tmp_ref(in);
     if (old_dentry) {
       if (old_dentry->dir != dir) {
 	old_dentry->dir->ordered_count++;
@@ -911,7 +983,6 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl
 	dir->parent_inode->flags &= ~I_DIR_ORDERED;
     }
     dn = link(dir, dname, in, dn);
-    put_inode(in);
   }
 
   update_dentry_lease(dn, dlease, from, session);
@@ -1059,8 +1130,7 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session,
       dn->offset = dir_result_t::make_fpos(fg, i + readdir_offset);
 
       // add to cached result list
-      in->get();
-      request->readdir_result.push_back(pair<string,Inode*>(dname, in));
+      request->readdir_result.push_back(pair<string,InodeRef>(dname, in));
 
       ldout(cct, 15) << __func__ << "  " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
     }
@@ -1201,9 +1271,15 @@ Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
     }
   }
 
-  if (in && (reply->head.op == CEPH_MDS_OP_READDIR ||
-	     reply->head.op == CEPH_MDS_OP_LSSNAP)) {
-    insert_readdir_results(request, session, in);
+  if (in) {
+    if (reply->head.op == CEPH_MDS_OP_READDIR ||
+	reply->head.op == CEPH_MDS_OP_LSSNAP)
+      insert_readdir_results(request, session, in);
+
+    if (request->dentry() == NULL && in != request->inode()) {
+      // pin the target inode if its parent dentry is not pinned
+      request->set_other_inode(in);
+    }
   }
 
   request->target = in;
@@ -1245,7 +1321,7 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req)
     }
   } else if (de) {
     if (de->inode) {
-      in = de->inode;
+      in = de->inode.get();
       ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
     } else {
       in = de->dir->parent_inode;
@@ -1261,12 +1337,12 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req)
       ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
       while (in->snapid != CEPH_NOSNAP) {
         if (in->snapid == CEPH_SNAPDIR)
-  	in = in->snapdir_parent;
+	  in = in->snapdir_parent.get();
         else if (!in->dn_set.empty())
           /* In most cases there will only be one dentry, so getting it
            * will be the correct action. If there are multiple hard links,
            * I think the MDS should be able to redirect as needed*/
-  	in = in->get_first_parent()->dir->parent_inode;
+	  in = in->get_first_parent()->dir->parent_inode;
         else {
           ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
           break;
@@ -1353,7 +1429,7 @@ void Client::dump_mds_requests(Formatter *f)
 
 int Client::verify_reply_trace(int r,
 			       MetaRequest *request, MClientReply *reply,
-			       Inode **ptarget, bool *pcreated,
+			       InodeRef *ptarget, bool *pcreated,
 			       int uid, int gid)
 {
   // check whether this request actually did the create, and set created flag
@@ -1376,17 +1452,17 @@ int Client::verify_reply_trace(int r,
     *pcreated = got_created_ino;
 
   if (request->target) {
-    *ptarget = request->target;
-    ldout(cct, 20) << "make_request target is " << *request->target << dendl;
+    ptarget->swap(request->target);
+    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
   } else {
     if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
       (*ptarget) = p->second;
-      ldout(cct, 20) << "make_request created, target is " << **ptarget << dendl;
+      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
     } else {
       // we got a traceless reply, and need to look up what we just
       // created.  for now, do this by name.  someday, do this by the
       // ino... which we know!  FIXME.
-      Inode *target = 0;  // ptarget may be NULL
+      InodeRef target;
       Dentry *d = request->dentry();
       if (d) {
 	if (d->dir) {
@@ -1408,15 +1484,14 @@ int Client::verify_reply_trace(int r,
 	target = in;
       }
       if (r >= 0) {
-	if (ptarget)
-	  *ptarget = target;
-
 	// verify ino returned in reply and trace_dist are the same
 	if (got_created_ino &&
 	    created_ino.val != target->ino.val) {
 	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
 	  r = -EINTR;
 	}
+	if (ptarget)
+	  ptarget->swap(target);
       }
     }
   }
@@ -1447,7 +1522,7 @@ int Client::verify_reply_trace(int r,
  */
 int Client::make_request(MetaRequest *request, 
 			 int uid, int gid, 
-			 Inode **ptarget, bool *pcreated,
+			 InodeRef *ptarget, bool *pcreated,
 			 int use_mds,
 			 bufferlist *pdirbl)
 {
@@ -1462,6 +1537,9 @@ int Client::make_request(MetaRequest *request,
 
   // make note
   mds_requests[tid] = request->get();
+  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)\
+    oldest_tid = tid;
+
   if (uid < 0) {
     uid = geteuid();
     gid = getegid();
@@ -1469,10 +1547,12 @@ int Client::make_request(MetaRequest *request,
   request->set_caller_uid(uid);
   request->set_caller_gid(gid);
 
-  if (!mds_requests.empty()) 
-    request->set_oldest_client_tid(mds_requests.begin()->first);
-  else
-    request->set_oldest_client_tid(tid); // this one is the oldest.
+  if (cct->_conf->client_inject_fixed_oldest_tid) {
+    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
+    request->set_oldest_client_tid(1);
+  } else {
+    request->set_oldest_client_tid(oldest_tid);
+  }
 
   // hack target mds?
   if (use_mds >= 0)
@@ -1544,8 +1624,7 @@ int Client::make_request(MetaRequest *request,
     assert(request->aborted);
     assert(!request->got_unsafe);
     request->item.remove_myself();
-    mds_requests.erase(tid);
-    put_request(request); // request map's
+    unregister_request(request);
     put_request(request); // ours
     return -ETIMEDOUT;
   }
@@ -1554,6 +1633,8 @@ int Client::make_request(MetaRequest *request,
   MClientReply *reply = request->reply;
   request->reply = NULL;
   r = reply->get_result();
+  if (r >= 0)
+    request->success = true;
 
   // kick dispatcher (we've got it!)
   assert(request->dispatch_cond);
@@ -1580,16 +1661,41 @@ int Client::make_request(MetaRequest *request,
   return r;
 }
 
+void Client::unregister_request(MetaRequest *req)
+{
+  mds_requests.erase(req->tid);
+  if (req->tid == oldest_tid) {
+    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
+    while (true) {
+      if (p == mds_requests.end()) {
+	oldest_tid = 0;
+	break;
+      }
+      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
+	oldest_tid = p->first;
+	break;
+      }
+      ++p;
+    }
+  }
+  put_request(req);
+}
+
 void Client::put_request(MetaRequest *request)
 {
   if (request->_put()) {
-    if (request->inode())
-      put_inode(request->take_inode());
-    if (request->old_inode())
-      put_inode(request->take_old_inode());
-    if (request->other_inode())
-      put_inode(request->take_other_inode());
+    int op = -1;
+    if (request->success)
+      op = request->get_op();
+    InodeRef other_in;
+    request->take_other_inode(&other_in);
     delete request;
+
+    if (other_in) {
+      if (other_in->dir &&
+	  (op == CEPH_MDS_OP_RMDIR || op == CEPH_MDS_OP_RENAME))
+	_try_to_trim_inode(other_in.get());
+    }
   }
 }
 
@@ -1883,6 +1989,11 @@ void Client::send_request(MetaRequest *request, MetaSession *session,
       r->releases.swap(request->cap_releases);
   }
   r->set_mdsmap_epoch(mdsmap->get_epoch());
+  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
+    const OSDMap *osdmap = objecter->get_osdmap_read();
+    r->set_osdmap_epoch(osdmap->get_epoch());
+    objecter->put_osdmap_read();
+  }
 
   if (request->mds == -1) {
     request->sent_stamp = ceph_clock_now(cct);
@@ -1975,6 +2086,17 @@ void Client::handle_client_request_forward(MClientRequestForward *fwd)
   fwd->put();
 }
 
+bool Client::is_dir_operation(MetaRequest *req)
+{
+  int op = req->get_op();
+  if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
+      op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
+      op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
+      op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
+    return true;
+  return false;
+}
+
 void Client::handle_client_reply(MClientReply *reply)
 {
   mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
@@ -2040,6 +2162,11 @@ void Client::handle_client_reply(MClientReply *reply)
   if (!is_safe) {
     request->got_unsafe = true;
     session->unsafe_requests.push_back(&request->unsafe_item);
+    if (is_dir_operation(request)) {
+      Inode *dir = request->inode();
+      assert(dir);
+      dir->unsafe_dir_ops.push_back(&request->unsafe_dir_item);
+    }
   }
 
   // Only signal the caller once (on the first reply):
@@ -2064,46 +2191,88 @@ void Client::handle_client_reply(MClientReply *reply)
     // we're done, clean up
     if (request->got_unsafe) {
       request->unsafe_item.remove_myself();
+      request->unsafe_dir_item.remove_myself();
+      signal_cond_list(request->waitfor_safe);
     }
     request->item.remove_myself();
-    mds_requests.erase(tid);
-    put_request(request);
+    unregister_request(request);
   }
   if (unmounting)
     mount_cond.Signal();
 }
 
+void Client::_handle_full_flag(int64_t pool)
+{
+  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
+    << "on " << pool << dendl;
+  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
+  // to do this rather than blocking, because otherwise when we fill up we
+  // potentially lock caps forever on files with dirty pages, and we need
+  // to be able to release those caps to the MDS so that it can delete files
+  // and free up space.
+  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
+
+  // For all inodes with layouts in this pool and a pending flush write op
+  // (i.e. one of the ones we will cancel), we've got to purge_set their data
+  // from ObjectCacher so that it doesn't re-issue the write in response to
+  // the ENOSPC error.
+  // Fortunately since we're cancelling everything in a given pool, we don't
+  // need to know which ops belong to which ObjectSet, we can just blow all
+  // the un-flushed cached data away and mark any dirty inodes' async_err
+  // field with -ENOSPC as long as we're sure all the ops we cancelled were
+  // affecting this pool, and all the objectsets we're purging were also
+  // in this pool.
+  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
+       i != inode_map.end(); ++i)
+  {
+    Inode *inode = i->second;
+    if (inode->oset.dirty_or_tx
+        && (pool == -1 || inode->layout.fl_pg_pool == pool)) {
+      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
+        << " has dirty objects, purging and setting ENOSPC" << dendl;
+      objectcacher->purge_set(&inode->oset);
+      inode->async_err = -ENOSPC;
+    }
+  }
+
+  if (cancelled_epoch != (epoch_t)-1) {
+    set_cap_epoch_barrier(cancelled_epoch);
+  }
+}
+
 void Client::handle_osd_map(MOSDMap *m)
 {
   if (objecter->osdmap_full_flag()) {
-    ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations" << dendl;
-    // Cancel all outstanding ops with -ENOSPC: it is necessary to do this rather than blocking,
-    // because otherwise when we fill up we potentially lock caps forever on files with
-    // dirty pages, and we need to be able to release those caps to the MDS so that it can
-    // delete files and free up space.
-    epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC);
-
-    // For all inodes with a pending flush write op (i.e. one of the ones we
-    // will cancel), we've got to purge_set their data from ObjectCacher
-    // so that it doesn't re-issue the write in response to the ENOSPC error.
-    // Fortunately since we're cancelling *everything*, we don't need to know
-    // which ops belong to which ObjectSet, we can just blow all the un-flushed
-    // cached data away and mark any dirty inodes' async_err field with -ENOSPC
-    // (i.e. we only need to know which inodes had outstanding ops, not the exact
-    // op-to-inode relation)
-    for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
-         i != inode_map.end(); ++i)
-    {
-      Inode *inode = i->second;
-      if (inode->oset.dirty_or_tx) {
-        ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
-          << " has dirty objects, purging and setting ENOSPC" << dendl;
-        objectcacher->purge_set(&inode->oset);
-        inode->async_err = -ENOSPC;
+    _handle_full_flag(-1);
+  } else {
+    // Accumulate local list of full pools so that I can drop
+    // the objecter lock before re-entering objecter in
+    // cancel_writes
+    std::vector<int64_t> full_pools;
+
+    const OSDMap *osd_map = objecter->get_osdmap_read();
+    const map<int64_t,pg_pool_t>& pools = osd_map->get_pools();
+    for (map<int64_t,pg_pool_t>::const_iterator i = pools.begin();
+         i != pools.end(); ++i) {
+      if (i->second.has_flag(pg_pool_t::FLAG_FULL)) {
+        full_pools.push_back(i->first);
       }
     }
 
-    set_cap_epoch_barrier(cancelled_epoch);
+    objecter->put_osdmap_read();
+
+    for (std::vector<int64_t>::iterator i = full_pools.begin();
+         i != full_pools.end(); ++i) {
+      _handle_full_flag(*i);
+    }
+
+    // Subscribe to subsequent maps to watch for the full flag going
+    // away.  For the global full flag objecter does this for us, but
+    // it pays no attention to the per-pool full flag so in this branch
+    // we do it ourselves.
+    if (!full_pools.empty()) {
+      objecter->maybe_request_map();
+    }
   }
 
   m->put();
@@ -2336,6 +2505,9 @@ void Client::send_reconnect(MetaSession *session)
       }	
     }
   }
+
+  early_kick_flushing_caps(session);
+
   session->con->send_message(m);
 
   mount_cond.Signal();
@@ -2396,8 +2568,9 @@ void Client::kick_requests_closed(MetaSession *session)
       if (req->got_unsafe) {
 	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
 	req->unsafe_item.remove_myself();
-	mds_requests.erase(req->get_tid());
-	put_request(req);
+	req->unsafe_dir_item.remove_myself();
+	signal_cond_list(req->waitfor_safe);
+	unregister_request(req);
       }
     }
   }
@@ -2476,19 +2649,18 @@ void Client::put_inode(Inode *in, int n)
     bool unclean = objectcacher->release_set(&in->oset);
     assert(!unclean);
     put_qtree(in);
-    if (in->snapdir_parent)
-      put_inode(in->snapdir_parent);
     inode_map.erase(in->vino());
+    if (use_faked_inos())
+      _release_faked_ino(in);
+
     in->cap_item.remove_myself();
     in->snaprealm_item.remove_myself();
+    in->snapdir_parent.reset();
     if (in == root) {
       root = 0;
       root_ancestor = 0;
-      while (!root_parents.empty()) {
-        Inode *in = root_parents.begin()->second;
+      while (!root_parents.empty())
         root_parents.erase(root_parents.begin());
-        put_inode(in);
-      }
     }
 
     if (!in->oset.objects.empty()) {
@@ -2546,7 +2718,6 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
 
   if (in) {    // link to inode
     dn->inode = in;
-    in->get();
     if (in->is_dir()) {
       if (in->dir)
 	dn->get(); // dir -> dn pin
@@ -2573,13 +2744,14 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
 
 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
 {
-  Inode *in = dn->inode;
+  InodeRef in;
+  in.swap(dn->inode);
   ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
 		 << " inode " << dn->inode << dendl;
 
   // unlink from inode
   if (in) {
-    invalidate_quota_tree(in);
+    invalidate_quota_tree(in.get());
     if (in->is_dir()) {
       if (in->dir)
 	dn->put(); // dir -> dn pin
@@ -2590,7 +2762,6 @@ void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
     assert(in->dn_set.count(dn));
     in->dn_set.erase(dn);
     ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dn_set << dendl; 
-    put_inode(in);
   }
 
   if (keepdentry) {
@@ -2611,6 +2782,28 @@ void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
   }
 }
 
+/**
+ * For asynchronous flushes, check for errors from the IO and
+ * update the inode if necessary
+ */
+class C_Client_FlushComplete : public Context {
+private:
+  Client *client;
+  InodeRef inode;
+public:
+  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
+  void finish(int r) {
+    assert(client->client_lock.is_locked_by_me());
+    if (r != 0) {
+      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
+      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
+        << " 0x" << std::hex << inode->ino << std::dec
+        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
+      inode->async_err = r;
+    }
+  }
+};
+
 
 /****
  * caps
@@ -2684,22 +2877,46 @@ int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
     if (!in->is_any_caps())
       return -ESTALE;
 
-    if (endoff > 0 &&
-	(endoff >= (loff_t)in->max_size ||
-	 endoff > (loff_t)(in->size << 1)) &&
-	endoff > (loff_t)in->wanted_max_size) {
-      ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
-      in->wanted_max_size = endoff;
-      check_caps(in, false);
+    int implemented;
+    int have = in->caps_issued(&implemented);
+
+    bool waitfor_caps = false;
+    bool waitfor_commit = false;
+
+    if (have & need & CEPH_CAP_FILE_WR) {
+      if (endoff > 0 &&
+	  (endoff >= (loff_t)in->max_size ||
+	   endoff > (loff_t)(in->size << 1)) &&
+	  endoff > (loff_t)in->wanted_max_size) {
+	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
+	in->wanted_max_size = endoff;
+	check_caps(in, false);
+      }
+
+      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
+	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
+	waitfor_caps = true;
+      }
+      if (!in->cap_snaps.empty()) {
+	if (in->cap_snaps.rbegin()->second->writing) {
+	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
+	  waitfor_caps = true;
+	}
+	for (map<snapid_t,CapSnap*>::iterator p = in->cap_snaps.begin();
+	    p != in->cap_snaps.end();
+	    ++p)
+	  if (p->second->dirty_data) {
+	    waitfor_commit = true;
+	    break;
+	  }
+	if (waitfor_commit) {
+	  _flush(in, new C_Client_FlushComplete(this, in));
+	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
+	}
+      }
     }
 
-    if (endoff >= 0 && endoff > (loff_t)in->max_size) {
-      ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
-    } else if (!in->cap_snaps.empty() && in->cap_snaps.rbegin()->second->writing) {
-      ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
-    } else {
-      int implemented;
-      int have = in->caps_issued(&implemented);
+    if (!waitfor_caps && !waitfor_commit) {
       if ((have & need) == need) {
 	int butnot = want & ~(have & need);
 	int revoking = implemented & ~have;
@@ -2714,13 +2931,17 @@ int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
 	}
       }
       ldout(cct, 10) << "waiting for caps need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
+      waitfor_caps = true;
     }
 
     if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
 	in->auth_cap->session->readonly)
       return -EROFS;
-    
-    wait_on_list(in->waitfor_caps);
+
+    if (waitfor_caps)
+      wait_on_list(in->waitfor_caps);
+    else if (waitfor_commit)
+      wait_on_list(in->waitfor_commit);
   }
 }
 
@@ -2742,7 +2963,8 @@ void Client::cap_delay_requeue(Inode *in)
 }
 
 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
-		      int used, int want, int retain, int flush)
+		      int used, int want, int retain, int flush,
+		      ceph_tid_t flush_tid)
 {
   int held = cap->issued | cap->implemented;
   int revoking = cap->implemented & ~cap->issued;
@@ -2785,17 +3007,10 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
     cap->implemented &= cap->issued | used;
   }
 
-  uint64_t flush_tid = 0;
   snapid_t follows = 0;
 
-  if (flush) {
-    flush_tid = ++in->last_flush_tid;
-    for (int i = 0; i < CEPH_CAP_BITS; ++i) {
-      if (flush & (1<<i))
-	in->flushing_cap_tid[i] = flush_tid;
-    }
+  if (flush)
     follows = in->snaprealm->get_snap_context().seq;
-  }
   
   MClientCaps *m = new MClientCaps(op,
 				   in->ino,
@@ -2843,6 +3058,10 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
     in->requested_max_size = in->wanted_max_size;
     ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
   }
+
+  if (!session->flushing_caps_tids.empty())
+    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
+
   session->con->send_message(m);
 }
 
@@ -2853,6 +3072,13 @@ void Client::check_caps(Inode *in, bool is_delayed)
   unsigned used = get_caps_used(in);
   unsigned cap_used;
 
+  if (in->is_dir() && (in->flags & I_COMPLETE)) {
+    // we do this here because we don't want to drop to Fs (and then
+    // drop the Fs if we do a create!) if that alone makes us send lookups
+    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
+    wanted |= CEPH_CAP_FILE_EXCL;
+  }
+
   int retain = wanted | used | CEPH_CAP_PIN;
   if (!unmounting) {
     if (wanted)
@@ -2943,21 +3169,24 @@ void Client::check_caps(Inode *in, bool is_delayed)
 
   ack:
     int flushing;
-    if (in->auth_cap == cap && in->dirty_caps)
-      flushing = mark_caps_flushing(in);
-    else
+    ceph_tid_t flush_tid;
+    if (in->auth_cap == cap && in->dirty_caps) {
+      flushing = mark_caps_flushing(in, &flush_tid);
+    } else {
       flushing = 0;
+      flush_tid = 0;
+    }
 
-    send_cap(in, session, cap, cap_used, wanted, retain, flushing);
+    send_cap(in, session, cap, cap_used, wanted, retain, flushing, flush_tid);
   }
 }
 
 
-void Client::queue_cap_snap(Inode *in, snapid_t seq)
+void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
 {
   int used = get_caps_used(in);
   int dirty = in->caps_dirty();
-  ldout(cct, 10) << "queue_cap_snap " << *in << " seq " << seq << " used " << ccap_string(used) << dendl;
+  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
 
   if (in->cap_snaps.size() &&
       in->cap_snaps.rbegin()->second->writing) {
@@ -2966,12 +3195,11 @@ void Client::queue_cap_snap(Inode *in, snapid_t seq)
   } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
 	     (dirty & CEPH_CAP_ANY_WR)) {
-    in->get();
     CapSnap *capsnap = new CapSnap(in);
-    in->cap_snaps[seq] = capsnap;
-    capsnap->context = in->snaprealm->get_snap_context();
+    in->cap_snaps[old_snapc.seq] = capsnap;
+    capsnap->context = old_snapc;
     capsnap->issued = in->caps_issued();
-    capsnap->dirty = in->caps_dirty();  // a bit conservative?
+    capsnap->dirty = in->caps_dirty();
     
     capsnap->dirty_data = (used & CEPH_CAP_FILE_BUFFER);
     
@@ -3001,6 +3229,13 @@ void Client::finish_cap_snap(Inode *in, CapSnap *capsnap, int used)
   capsnap->ctime = in->ctime;
   capsnap->time_warp_seq = in->time_warp_seq;
 
+  capsnap->dirty |= in->caps_dirty();
+
+  if (capsnap->dirty & CEPH_CAP_FILE_WR) {
+    capsnap->inline_data = in->inline_data;
+    capsnap->inline_version = in->inline_version;
+  }
+
   if (used & CEPH_CAP_FILE_BUFFER) {
     ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << capsnap << " used " << used
 	     << " WRBUFFER, delaying" << dendl;
@@ -3054,7 +3289,7 @@ void Client::flush_snaps(Inode *in, bool all_again, CapSnap *again)
     
     in->auth_cap->session->flushing_capsnaps.push_back(&capsnap->flushing_item);
 
-    capsnap->flush_tid = ++in->last_flush_tid;
+    capsnap->flush_tid = ++last_flush_tid;
     MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
         cap_epoch_barrier);
     m->set_client_tid(capsnap->flush_tid);
@@ -3077,6 +3312,11 @@ void Client::flush_snaps(Inode *in, bool all_again, CapSnap *again)
     capsnap->atime.encode_timeval(&m->head.atime);
     m->head.time_warp_seq = capsnap->time_warp_seq;
 
+    if (capsnap->dirty & CEPH_CAP_FILE_WR) {
+      m->inline_version = in->inline_version;
+      m->inline_data = in->inline_data;
+    }
+
     session->con->send_message(m);
   }
 }
@@ -3130,13 +3370,12 @@ void Client::wake_inode_waiters(MetaSession *s)
 class C_Client_CacheInvalidate : public Context  {
 private:
   Client *client;
-  Inode *inode;
+  InodeRef inode;
   int64_t offset, length;
   bool keep_caps;
 public:
   C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len, bool keep) :
 			   client(c), inode(in), offset(off), length(len), keep_caps(keep) {
-    inode->get();
   }
   void finish(int r) {
     // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
@@ -3145,15 +3384,18 @@ public:
   }
 };
 
-void Client::_async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps)
+void Client::_async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps)
 {
   ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << dendl;
-  ino_invalidate_cb(callback_handle, in->vino(), off, len);
+  if (use_faked_inos())
+    ino_invalidate_cb(callback_handle, vinodeno_t(in->faked_ino, CEPH_NOSNAP), off, len);
+  else
+    ino_invalidate_cb(callback_handle, in->vino(), off, len);
 
   client_lock.Lock();
   if (!keep_caps)
-    check_caps(in, false);
-  put_inode(in);
+    check_caps(in.get(), false);
+  in.reset(); // put inode inside client_lock
   client_lock.Unlock();
   ldout(cct, 10) << "_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << " done" << dendl;
 }
@@ -3210,7 +3452,7 @@ bool Client::_flush(Inode *in, Context *onfinish)
     return true;
   }
 
-  if (objecter->osdmap_full_flag()) {
+  if (objecter->osdmap_pool_full(in->layout.fl_pg_pool)) {
     ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
     objectcacher->purge_set(&in->oset);
     if (onfinish) {
@@ -3335,7 +3577,7 @@ void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id
       if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
 	ldout(cct, 10) << "add_update_cap changing auth cap: "
 		       << "add myself to new auth MDS' flushing caps list" << dendl;
-	mds_session->flushing_caps.push_back(&in->flushing_cap_item);
+	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
       }
       in->auth_cap = cap;
     }
@@ -3434,13 +3676,16 @@ void Client::remove_session_caps(MetaSession *s)
     signal_cond_list(in->waitfor_caps);
     if (dirty_caps) {
       lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
-      if (in->flushing_caps)
+      if (in->flushing_caps) {
 	num_flushing_caps--;
+	in->flushing_cap_tids.clear();
+      }
       in->flushing_caps = 0;
       in->dirty_caps = 0;
       put_inode(in);
     }
   }
+  s->flushing_caps_tids.clear();
   sync_cond.Signal();
 }
 
@@ -3504,7 +3749,7 @@ void Client::trim_caps(MetaSession *s, int max)
       ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
       bool all = true;
       set<Dentry*>::iterator q = in->dn_set.begin();
-      in->get();
+      InodeRef tmp_ref(in);
       while (q != in->dn_set.end()) {
 	Dentry *dn = *q++;
 	if (dn->lru_is_expireable()) {
@@ -3525,8 +3770,6 @@ void Client::trim_caps(MetaSession *s, int max)
         ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
 	trimmed++;
       }
-
-      put_inode(in);
     }
 
     ++p;
@@ -3560,14 +3803,17 @@ void Client::mark_caps_dirty(Inode *in, int caps)
   in->dirty_caps |= caps;
 }
 
-int Client::mark_caps_flushing(Inode *in)
+int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
 {
   MetaSession *session = in->auth_cap->session;
 
   int flushing = in->dirty_caps;
   assert(flushing);
 
-  if (flushing && !in->flushing_caps) {
+  ceph_tid_t flush_tid = ++last_flush_tid;
+  in->flushing_cap_tids[flush_tid] = flushing;
+
+  if (!in->flushing_caps) {
     ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
     num_flushing_caps++;
   } else {
@@ -3577,13 +3823,25 @@ int Client::mark_caps_flushing(Inode *in)
   in->flushing_caps |= flushing;
   in->dirty_caps = 0;
  
-  in->flushing_cap_seq = ++last_flush_seq;
 
   session->flushing_caps.push_back(&in->flushing_cap_item);
+  session->flushing_caps_tids.insert(flush_tid);
 
+  *ptid = flush_tid;
   return flushing;
 }
 
+void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s,  MetaSession *new_s)
+{
+  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
+       it != in->flushing_cap_tids.end();
+       ++it) {
+    old_s->flushing_caps_tids.erase(it->first);
+    new_s->flushing_caps_tids.insert(it->first);
+  }
+  new_s->flushing_caps.push_back(&in->flushing_cap_item);
+}
+
 void Client::flush_caps()
 {
   ldout(cct, 10) << "flush_caps" << dendl;
@@ -3610,24 +3868,44 @@ void Client::flush_caps(Inode *in, MetaSession *session)
   Cap *cap = in->auth_cap;
   assert(cap->session == session);
 
-  send_cap(in, session, cap, get_caps_used(in), in->caps_wanted(),
-	   (cap->issued | cap->implemented), in->flushing_caps);
+  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
+       p != in->flushing_cap_tids.end();
+       ++p) {
+    send_cap(in, session, cap, (get_caps_used(in) | in->caps_dirty()),
+	     in->caps_wanted(), (cap->issued | cap->implemented),
+	     p->second, p->first);
+  }
+}
+
+void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
+{
+  while (in->flushing_caps) {
+    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
+    assert(it != in->flushing_cap_tids.end());
+    if (it->first > want)
+      break;
+    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
+		   << ccap_string(it->second) << " want " << want
+		   << " last " << it->first << dendl;
+    wait_on_list(in->waitfor_caps);
+  }
 }
 
-void Client::wait_sync_caps(uint64_t want)
+void Client::wait_sync_caps(ceph_tid_t want)
 {
  retry:
-  ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_seq << ", "
+  ldout(cct, 10) << "wait_sync_caps want " << want  << " (last is " << last_flush_tid << ", "
 	   << num_flushing_caps << " total flushing)" << dendl;
   for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
        p != mds_sessions.end();
        ++p) {
-    if (p->second->flushing_caps.empty())
+    MetaSession *s = p->second;
+    if (s->flushing_caps_tids.empty())
 	continue;
-    Inode *in = p->second->flushing_caps.front();
-    if (in->flushing_cap_seq <= want) {
-      ldout(cct, 10) << " waiting on mds." << p->first << " tid " << in->flushing_cap_seq
-	       << " (want " << want << ")" << dendl;
+    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
+    if (oldest_tid <= want) {
+      ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
+		     << " (want " << want << ")" << dendl;
       sync_cond.Wait(client_lock);
       goto retry;
     }
@@ -3641,10 +3919,10 @@ void Client::kick_flushing_caps(MetaSession *session)
 
   for (xlist<CapSnap*>::iterator p = session->flushing_capsnaps.begin(); !p.end(); ++p) {
     CapSnap *capsnap = *p;
-    Inode *in = capsnap->in;
+    InodeRef& in = capsnap->in;
     ldout(cct, 20) << " reflushing capsnap " << capsnap
 		   << " on " << *in << " to mds." << mds << dendl;
-    flush_snaps(in, false, capsnap);
+    flush_snaps(in.get(), false, capsnap);
   }
   for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
     Inode *in = *p;
@@ -3654,6 +3932,36 @@ void Client::kick_flushing_caps(MetaSession *session)
   }
 }
 
+void Client::early_kick_flushing_caps(MetaSession *session)
+{
+  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
+    Inode *in = *p;
+    if (!in->flushing_caps)
+      continue;
+    assert(in->auth_cap);
+    Cap *cap = in->auth_cap;
+
+    // if flushing caps were revoked, we re-send the cap flush in client reconnect
+    // stage. This guarantees that MDS processes the cap flush message before issuing
+    // the flushing caps to other client.
+    bool send_now = (in->flushing_caps & in->auth_cap->issued) != in->flushing_caps;
+
+    if (send_now)
+      ldout(cct, 20) << " reflushing caps (revoked) on " << *in
+		     << " to mds." << session->mds_num << dendl;
+
+    for (map<ceph_tid_t,int>::iterator q = in->flushing_cap_tids.begin();
+	 q != in->flushing_cap_tids.end();
+	 ++q) {
+      if (send_now) {
+	send_cap(in, session, cap, (get_caps_used(in) | in->caps_dirty()),
+		 in->caps_wanted(), (cap->issued | cap->implemented),
+		 q->second, q->first);
+      }
+    }
+  }
+}
+
 void Client::kick_maxsize_requests(MetaSession *session)
 {
   xlist<Cap*>::iterator iter = session->caps.begin();
@@ -3768,12 +4076,20 @@ bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
   return false;
 }
 
+static bool has_new_snaps(const SnapContext& old_snapc,
+			  const SnapContext& new_snapc)
+{
+	return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
+}
+
 
 inodeno_t Client::update_snap_trace(bufferlist& bl, bool flush)
 {
   inodeno_t first_realm = 0;
   ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
 
+  map<SnapRealm*, SnapContext> dirty_realms;
+
   bufferlist::iterator p = bl.begin();
   while (!p.end()) {
     SnapRealmInfo info;
@@ -3782,6 +4098,8 @@ inodeno_t Client::update_snap_trace(bufferlist& bl, bool flush)
       first_realm = info.ino();
     SnapRealm *realm = get_snap_realm(info.ino());
 
+    bool invalidate = false;
+
     if (info.seq() > realm->seq) {
       ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
 	       << dendl;
@@ -3794,28 +4112,19 @@ inodeno_t Client::update_snap_trace(bufferlist& bl, bool flush)
 	while (!q.empty()) {
 	  SnapRealm *realm = q.front();
 	  q.pop_front();
-	  ldout(cct, 10) << " flushing caps on " << *realm << dendl;
-
-	  xlist<Inode*>::iterator p = realm->inodes_with_caps.begin();
-	  while (!p.end()) {
-	    Inode *in = *p;
-	    ++p;
-	    queue_cap_snap(in, realm->get_snap_context().seq);
-	  }
 
 	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin(); 
-	       p != realm->pchildren.end(); 
+	       p != realm->pchildren.end();
 	       ++p)
 	    q.push_back(*p);
+
+	  if (dirty_realms.count(realm) == 0) {
+	    realm->nref++;
+	    dirty_realms[realm] = realm->get_snap_context();
+	  }
 	}
       }
 
-    }
-
-    // _always_ verify parent
-    bool invalidate = adjust_realm_parent(realm, info.parent());
-
-    if (info.seq() > realm->seq) {
       // update
       realm->seq = info.seq();
       realm->created = info.created();
@@ -3824,6 +4133,11 @@ inodeno_t Client::update_snap_trace(bufferlist& bl, bool flush)
       realm->my_snaps = info.my_snaps;
       invalidate = true;
     }
+
+    // _always_ verify parent
+    if (adjust_realm_parent(realm, info.parent()))
+      invalidate = true;
+
     if (invalidate) {
       invalidate_snaprealm_and_children(realm);
       ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
@@ -3836,6 +4150,25 @@ inodeno_t Client::update_snap_trace(bufferlist& bl, bool flush)
     put_snap_realm(realm);
   }
 
+  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
+       q != dirty_realms.end();
+       ++q) {
+    SnapRealm *realm = q->first;
+    // if there are new snaps ?
+    if (has_new_snaps(q->second, realm->get_snap_context())) { 
+      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
+      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
+      while (!r.end()) {
+	Inode *in = *r;
+	++r;
+	queue_cap_snap(in, q->second);
+      }
+    } else {
+      ldout(cct, 10) << " no new snap on " << *realm << dendl;
+    }
+    put_snap_realm(realm);
+  }
+
   return first_realm;
 }
 
@@ -3853,6 +4186,7 @@ void Client::handle_snap(MClientSnap *m)
 
   list<Inode*> to_move;
   SnapRealm *realm = 0;
+  SnapContext old_snapc;
 
   if (m->head.op == CEPH_SNAP_OP_SPLIT) {
     assert(m->head.split);
@@ -3864,6 +4198,7 @@ void Client::handle_snap(MClientSnap *m)
     // flush, then move, ino's.
     realm = get_snap_realm(info.ino());
     ldout(cct, 10) << " splitting off " << *realm << dendl;
+    old_snapc = realm->get_snap_context();
     for (vector<inodeno_t>::iterator p = m->split_inos.begin();
 	 p != m->split_inos.end();
 	 ++p) {
@@ -3879,8 +4214,6 @@ void Client::handle_snap(MClientSnap *m)
 	}
 	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
 
-	// queue for snap writeback
-	queue_cap_snap(in, in->snaprealm->get_snap_context().seq);
 
 	in->snaprealm_item.remove_myself();
 	put_snap_realm(in->snaprealm);
@@ -3904,11 +4237,15 @@ void Client::handle_snap(MClientSnap *m)
   update_snap_trace(m->bl, m->head.op != CEPH_SNAP_OP_DESTROY);
 
   if (realm) {
+    bool queue_snap = has_new_snaps(old_snapc, realm->get_snap_context());
     for (list<Inode*>::iterator p = to_move.begin(); p != to_move.end(); ++p) {
       Inode *in = *p;
       in->snaprealm = realm;
       realm->inodes_with_caps.push_back(&in->snaprealm_item);
       realm->nref++;
+      // queue for snap writeback
+      if (queue_snap)
+	queue_cap_snap(in, old_snapc);
     }
     put_snap_realm(realm);
   }
@@ -4078,7 +4415,7 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
 	  if (cap == in->auth_cap)
 	    in->auth_cap = tcap;
 	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
-	    tsession->flushing_caps.push_back(&in->flushing_cap_item);
+	    adjust_session_flushing_caps(in, session, tsession);
 	}
       } else {
 	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
@@ -4115,20 +4452,37 @@ void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
 
 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
 {
-  mds_rank_t mds = session->mds_num;
+  ceph_tid_t flush_ack_tid = m->get_client_tid();
   int dirty = m->get_dirty();
   int cleaned = 0;
-  uint16_t flush_ack_tid = static_cast<uint16_t>(m->get_client_tid());
-  for (int i = 0; i < CEPH_CAP_BITS; ++i) {
-    if ((dirty & (1 << i)) &&
-	(flush_ack_tid == in->flushing_cap_tid[i]))
-      cleaned |= 1 << i;
+  int flushed = 0;
+
+  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
+       it != in->flushing_cap_tids.end(); ) {
+    if (it->first == flush_ack_tid)
+      cleaned = it->second;
+    if (it->first <= flush_ack_tid) {
+      session->flushing_caps_tids.erase(it->first);
+      in->flushing_cap_tids.erase(it++);
+      ++flushed;
+      continue;
+    }
+    cleaned &= ~it->second;
+    if (!cleaned)
+      break;
+    ++it;
   }
 
-  ldout(cct, 5) << "handle_cap_flush_ack mds." << mds
+  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
 	  << " cleaned " << ccap_string(cleaned) << " on " << *in
 	  << " with " << ccap_string(dirty) << dendl;
 
+  if (flushed) {
+    signal_cond_list(in->waitfor_caps);
+    if (session->flushing_caps_tids.empty() ||
+	*session->flushing_caps_tids.begin() > flush_ack_tid)
+      sync_cond.Signal();
+  }
 
   if (!cleaned) {
     ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
@@ -4141,7 +4495,6 @@ void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MCl
 	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
 	in->flushing_cap_item.remove_myself();
 	num_flushing_caps--;
-	sync_cond.Signal();
       }
       if (!in->caps_dirty())
 	put_inode(in);
@@ -4165,10 +4518,9 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa
     } else {
       ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
 	      << " on " << *in << dendl;
+      in->cap_snaps.erase(follows);
       capsnap->flushing_item.remove_myself();
       delete capsnap;
-      in->cap_snaps.erase(follows);
-      put_inode(in);
     }
   } else {
     ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
@@ -4188,10 +4540,16 @@ private:
 public:
   C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
     client(c), name(dn->name) {
-      dirino = dn->dir->parent_inode->vino();
-      if (del)
-	ino = dn->inode->vino();
-      else
+      if (client->use_faked_inos()) {
+	dirino.ino = dn->dir->parent_inode->faked_ino;
+	if (del)
+	  ino.ino = dn->inode->faked_ino;
+      } else {
+	dirino = dn->dir->parent_inode->vino();
+	if (del)
+	  ino = dn->inode->vino();
+      }
+      if (!del)
 	ino.ino = inodeno_t();
   }
   void finish(int r) {
@@ -4242,34 +4600,6 @@ void Client::_try_to_trim_inode(Inode *in)
   }
 }
 
-/**
- * For asynchronous flushes, check for errors from the IO and
- * update the inode if necessary
- */
-class C_Client_FlushComplete : public Context {
-  private:
-  Client *client;
-  Inode *inode;
-
-  public:
-  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in)
-  {
-    inode->get();
-  }
-
-  void finish(int r) {
-    assert(client->client_lock.is_locked_by_me());
-    if (r != 0) {
-      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
-      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
-        << " 0x" << std::hex << inode->ino << std::dec
-        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
-      inode->async_err = r;
-    }
-    client->put_inode(inode);
-  }
-};
-
 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
 {
   mds_rank_t mds = session->mds_num;
@@ -4392,11 +4722,54 @@ int Client::check_permissions(Inode *in, int flags, int uid, int gid)
       return sgid_count;
     }
   }
+#if HAVE_GETGROUPLIST
+  else {
+    //use PAM to get the group list
+    // initial number of group entries, defaults to posix standard of 16
+    // PAM implementations may provide more than 16 groups....
+    sgid_count = 16;
+    sgids = (gid_t*)malloc(sgid_count * sizeof(gid_t));
+    if (sgids == NULL) {
+      ldout(cct, 3) << "allocating group memory failed" << dendl;
+      return -EACCES;
+    }
+    struct passwd *pw;
+    pw = getpwuid(uid);
+    if (pw == NULL) {
+      ldout(cct, 3) << "getting user entry failed" << dendl;
+      free(sgids); 
+      return -EACCES;
+    }
+    while (1) {
+#if defined(__APPLE__)
+      if (getgrouplist(pw->pw_name, gid, (int *)sgids, &sgid_count) == -1) {
+#else
+      if (getgrouplist(pw->pw_name, gid, sgids, &sgid_count) == -1) {
+#endif
+        // we need to resize the group list and try again
+	void *_realloc = NULL;
+        if ((_realloc = realloc(sgids, sgid_count * sizeof(gid_t))) == NULL) {
+          ldout(cct, 3) << "allocating group memory failed" << dendl;
+	  free(sgids);
+          return -EACCES;
+        }
+	sgids = (gid_t*)_realloc;
+        continue;
+      }
+      // list was successfully retrieved
+      break;
+    }    
+  }
+#endif
+
   // check permissions before doing anything else
+  int ret = 0;
   if (uid != 0 && !in->check_mode(uid, gid, sgids, sgid_count, flags)) {
-    return -EACCES;
+    ret = -EACCES;
   }
-  return 0;
+  if (sgids)
+    free(sgids);
+  return ret;
 }
 
 vinodeno_t Client::_get_vino(Inode *in)
@@ -4627,7 +5000,7 @@ void Client::handle_command_reply(MCommandReply *m)
 // -------------------
 // MOUNT
 
-int Client::mount(const std::string &mount_root)
+int Client::mount(const std::string &mount_root, bool require_mds)
 {
   Mutex::Locker lock(client_lock);
 
@@ -4644,6 +5017,20 @@ int Client::mount(const std::string &mount_root)
   tick(); // start tick
   
   ldout(cct, 2) << "mounted: have mdsmap " << mdsmap->get_epoch() << dendl;
+  if (require_mds) {
+    while (1) {
+      if (mdsmap->get_epoch() > 0) {
+        if (mdsmap->get_num_mds(CEPH_MDS_STATE_ACTIVE) == 0) {
+          ldout(cct, 10) << "no mds up: epoch=" << mdsmap->get_epoch() << dendl;
+          return CEPH_FUSE_NO_MDS_UP;
+        } else {
+          break;
+        }
+      } else {
+        wait_on_list(waiting_for_mdsmap);
+      }
+    }
+  }
 
   // hack: get+pin root inode.
   //  fuse assumes it's always there.
@@ -4714,9 +5101,7 @@ void Client::unmount()
     timer.cancel_event(tick_event);
   tick_event = 0;
 
-  if (cwd)
-    put_inode(cwd);
-  cwd = NULL;
+  cwd.reset();
 
   // clean up any unclosed files
   while (!fd_map.empty()) {
@@ -4747,16 +5132,15 @@ void Client::unmount()
 	assert(in);
       }
       if (!in->caps.empty()) {
-	in->get();
+	InodeRef tmp_ref(in);
 	_release(in);
 	_flush(in, new C_Client_FlushComplete(this, in));
-	put_inode(in);
       }
     }
   }
 
   flush_caps();
-  wait_sync_caps(last_flush_seq);
+  wait_sync_caps(last_flush_tid);
 
   // empty lru cache
   lru.lru_set_max(0);
@@ -4886,6 +5270,7 @@ void Client::tick()
     check_caps(in, true);
   }
 
+  trim_cache(true);
 }
 
 void Client::renew_caps()
@@ -4914,7 +5299,7 @@ void Client::renew_caps(MetaSession *session)
 // ===============================================================
 // high level (POSIXy) interface
 
-int Client::_do_lookup(Inode *dir, const string& name, Inode **target)
+int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target)
 {
   int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
   MetaRequest *req = new MetaRequest(op);
@@ -4931,7 +5316,7 @@ int Client::_do_lookup(Inode *dir, const string& name, Inode **target)
   return r;
 }
 
-int Client::_lookup(Inode *dir, const string& dname, Inode **target)
+int Client::_lookup(Inode *dir, const string& dname, InodeRef *target)
 {
   int r = 0;
   Dentry *dn = NULL;
@@ -4991,9 +5376,14 @@ int Client::_lookup(Inode *dir, const string& dname, Inode **target)
 		       << " vs lease_gen " << dn->lease_gen << dendl;
       }
       // dir lease?
-      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
-	  dn->cap_shared_gen == dir->shared_gen) {
-	goto hit_dn;
+      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
+	if (dn->cap_shared_gen == dir->shared_gen)
+	  goto hit_dn;
+	if (!dn->inode && (dir->flags & I_COMPLETE)) {
+	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
+			 << *dir << " dn '" << dname << "'" << dendl;
+	  return -ENOENT;
+	}
       }
     } else {
       ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
@@ -5058,10 +5448,10 @@ int Client::get_or_create(Inode *dir, const char* name,
   return 0;
 }
 
-int Client::path_walk(const filepath& origpath, Inode **final, bool followsym)
+int Client::path_walk(const filepath& origpath, InodeRef *end, bool followsym)
 {
   filepath path = origpath;
-  Inode *cur;
+  InodeRef cur;
   if (origpath.absolute())
     cur = root;
   else
@@ -5077,8 +5467,8 @@ int Client::path_walk(const filepath& origpath, Inode **final, bool followsym)
     const string &dname = path[i];
     ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
     ldout(cct, 20) << "  (path is " << path << ")" << dendl;
-    Inode *next;
-    int r = _lookup(cur, dname, &next);
+    InodeRef next;
+    int r = _lookup(cur.get(), dname, &next);
     if (r < 0)
       return r;
     // only follow trailing symlink if followsym.  always follow
@@ -5118,13 +5508,13 @@ int Client::path_walk(const filepath& origpath, Inode **final, bool followsym)
 	continue;
       }
     }
-    cur = next;
+    cur.swap(next);
     i++;
   }
   if (!cur)
     return -ENOENT;
-  if (final)
-    *final = cur;
+  if (end)
+    end->swap(cur);
   return 0;
 }
 
@@ -5143,18 +5533,15 @@ int Client::link(const char *relexisting, const char *relpath)
   string name = path.last_dentry();
   path.pop_dentry();
 
-  Inode *in, *dir;
+  InodeRef in, dir;
   int r;
   r = path_walk(existing, &in);
   if (r < 0)
     goto out;
-  in->get();
   r = path_walk(path, &dir);
   if (r < 0)
-    goto out_unlock;
-  r = _link(in, dir, name.c_str());
- out_unlock:
-  put_inode(in);
+    goto out;
+  r = _link(in.get(), dir.get(), name.c_str());
  out:
   return r;
 }
@@ -5168,11 +5555,11 @@ int Client::unlink(const char *relpath)
   filepath path(relpath);
   string name = path.last_dentry();
   path.pop_dentry();
-  Inode *dir;
+  InodeRef dir;
   int r = path_walk(path, &dir);
   if (r < 0)
     return r;
-  return _unlink(dir, name.c_str());
+  return _unlink(dir.get(), name.c_str());
 }
 
 int Client::rename(const char *relfrom, const char *relto)
@@ -5189,21 +5576,16 @@ int Client::rename(const char *relfrom, const char *relto)
   string toname = to.last_dentry();
   to.pop_dentry();
 
-  Inode *fromdir, *todir;
+  InodeRef fromdir, todir;
   int r;
 
   r = path_walk(from, &fromdir);
   if (r < 0)
     goto out;
-  fromdir->get();
   r = path_walk(to, &todir);
   if (r < 0)
-    goto out_unlock;
-  todir->get();
-  r = _rename(fromdir, fromname.c_str(), todir, toname.c_str());
-  put_inode(todir);
- out_unlock:
-  put_inode(fromdir);
+    goto out;
+  r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str());
  out:
   return r;
 }
@@ -5221,12 +5603,12 @@ int Client::mkdir(const char *relpath, mode_t mode)
   filepath path(relpath);
   string name = path.last_dentry();
   path.pop_dentry();
-  Inode *dir;
+  InodeRef dir;
   int r = path_walk(path, &dir);
   if (r < 0) {
     return r;
   }
-  return _mkdir(dir, name.c_str(), mode);
+  return _mkdir(dir.get(), name.c_str(), mode);
 }
 
 int Client::mkdirs(const char *relpath, mode_t mode)
@@ -5241,12 +5623,12 @@ int Client::mkdirs(const char *relpath, mode_t mode)
   filepath path(relpath);
   unsigned int i;
   int r=0;
-  Inode *cur = cwd;
-  Inode *next;
+  InodeRef cur, next;
+  cur = cwd;
   for (i=0; i<path.depth(); ++i) {
-    r=_lookup(cur, path[i].c_str(), &next);
+    r=_lookup(cur.get(), path[i].c_str(), &next);
     if (r < 0) break;
-    cur = next;
+    cur.swap(next);
   }
   //check that we have work left to do
   if (i==path.depth()) return -EEXIST;
@@ -5255,17 +5637,17 @@ int Client::mkdirs(const char *relpath, mode_t mode)
   //make new directory at each level
   for (; i<path.depth(); ++i) {
     //make new dir
-    r = _mkdir(cur, path[i].c_str(), mode);
+    r = _mkdir(cur.get(), path[i].c_str(), mode);
     //check proper creation/existence
     if (r < 0) return r;
-    r = _lookup(cur, path[i], &next);
+    r = _lookup(cur.get(), path[i], &next);
     if(r < 0) {
       ldout(cct, 0) << "mkdirs: successfully created new directory " << path[i]
 	      << " but can't _lookup it!" << dendl;
       return r;
     }
     //move to new dir and continue
-    cur = next;
+    cur.swap(next);
     ldout(cct, 20) << "mkdirs: successfully created directory "
 	     << filepath(cur->ino).get_path() << dendl;
   }
@@ -5280,11 +5662,11 @@ int Client::rmdir(const char *relpath)
   filepath path(relpath);
   string name = path.last_dentry();
   path.pop_dentry();
-  Inode *dir;
+  InodeRef dir;
   int r = path_walk(path, &dir);
   if (r < 0)
     return r;
-  return _rmdir(dir, name.c_str());
+  return _rmdir(dir.get(), name.c_str());
 }
 
 int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) 
@@ -5297,11 +5679,11 @@ int Client::mknod(const char *relpath, mode_t mode, dev_t rdev)
   filepath path(relpath);
   string name = path.last_dentry();
   path.pop_dentry();
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
-  return _mknod(in, name.c_str(), mode, rdev);
+  return _mknod(in.get(), name.c_str(), mode, rdev);
 }
 
 // symlinks
@@ -5316,11 +5698,11 @@ int Client::symlink(const char *target, const char *relpath)
   filepath path(relpath);
   string name = path.last_dentry();
   path.pop_dentry();
-  Inode *dir;
+  InodeRef dir;
   int r = path_walk(path, &dir);
   if (r < 0)
     return r;
-  return _symlink(dir, name.c_str(), target);
+  return _symlink(dir.get(), name.c_str(), target);
 }
 
 int Client::readlink(const char *relpath, char *buf, loff_t size) 
@@ -5330,12 +5712,12 @@ int Client::readlink(const char *relpath, char *buf, loff_t size)
   tout(cct) << relpath << std::endl;
 
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in, false);
   if (r < 0)
     return r;
 
-  return _readlink(in, buf, size);
+  return _readlink(in.get(), buf, size);
 }
 
 int Client::_readlink(Inode *in, char *buf, size_t size)
@@ -5375,7 +5757,7 @@ int Client::_getattr(Inode *in, int mask, int uid, int gid, bool force)
 }
 
 int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
-		     Inode **inp)
+		     InodeRef *inp)
 {
   int issued = in->caps_issued();
 
@@ -5504,11 +5886,11 @@ int Client::setattr(const char *relpath, struct stat *attr, int mask)
   tout(cct) << mask  << std::endl;
 
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
-  return _setattr(in, attr, mask); 
+  return _setattr(in, attr, mask);
 }
 
 int Client::fsetattr(int fd, struct stat *attr, int mask)
@@ -5525,7 +5907,7 @@ int Client::fsetattr(int fd, struct stat *attr, int mask)
   if (f->flags & O_PATH)
     return -EBADF;
 #endif
-  return _setattr(f->inode, attr, mask); 
+  return _setattr(f->inode, attr, mask);
 }
 
 int Client::stat(const char *relpath, struct stat *stbuf,
@@ -5536,7 +5918,7 @@ int Client::stat(const char *relpath, struct stat *stbuf,
   tout(cct) << "stat" << std::endl;
   tout(cct) << relpath << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
@@ -5558,7 +5940,7 @@ int Client::lstat(const char *relpath, struct stat *stbuf,
   tout(cct) << "lstat" << std::endl;
   tout(cct) << relpath << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   // don't follow symlinks
   int r = path_walk(path, &in, false);
   if (r < 0)
@@ -5579,7 +5961,10 @@ int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_inf
 	   << " mode 0" << oct << in->mode << dec
 	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
   memset(st, 0, sizeof(struct stat));
-  st->st_ino = in->ino;
+  if (use_faked_inos())
+    st->st_ino = in->faked_ino;
+  else
+    st->st_ino = in->ino;
   st->st_dev = in->snapid;
   st->st_mode = in->mode;
   st->st_rdev = in->rdev;
@@ -5627,7 +6012,7 @@ int Client::chmod(const char *relpath, mode_t mode)
   tout(cct) << relpath << std::endl;
   tout(cct) << mode << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
@@ -5661,7 +6046,7 @@ int Client::lchmod(const char *relpath, mode_t mode)
   tout(cct) << relpath << std::endl;
   tout(cct) << mode << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   // don't follow symlinks
   int r = path_walk(path, &in, false);
   if (r < 0)
@@ -5679,7 +6064,7 @@ int Client::chown(const char *relpath, int uid, int gid)
   tout(cct) << uid << std::endl;
   tout(cct) << gid << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
@@ -5723,7 +6108,7 @@ int Client::lchown(const char *relpath, int uid, int gid)
   tout(cct) << uid << std::endl;
   tout(cct) << gid << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   // don't follow symlinks
   int r = path_walk(path, &in, false);
   if (r < 0)
@@ -5745,7 +6130,7 @@ int Client::utime(const char *relpath, struct utimbuf *buf)
   tout(cct) << buf->modtime << std::endl;
   tout(cct) << buf->actime << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
@@ -5765,7 +6150,7 @@ int Client::lutime(const char *relpath, struct utimbuf *buf)
   tout(cct) << buf->modtime << std::endl;
   tout(cct) << buf->actime << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   // don't follow symlinks
   int r = path_walk(path, &in, false);
   if (r < 0)
@@ -5778,17 +6163,31 @@ int Client::lutime(const char *relpath, struct utimbuf *buf)
   return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
 }
 
+int Client::flock(int fd, int operation, uint64_t owner)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "flock" << std::endl;
+  tout(cct) << fd << std::endl;
+  tout(cct) << operation << std::endl;
+  tout(cct) << owner << std::endl;
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+
+  return _flock(f, operation, owner, NULL);
+}
+
 int Client::opendir(const char *relpath, dir_result_t **dirpp) 
 {
   Mutex::Locker lock(client_lock);
   tout(cct) << "opendir" << std::endl;
   tout(cct) << relpath << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
-  r = _opendir(in, dirpp);
+  r = _opendir(in.get(), dirpp);
   tout(cct) << (unsigned long)*dirpp << std::endl;
   return r;
 }
@@ -5826,8 +6225,7 @@ void Client::_closedir(dir_result_t *dirp)
   ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
   if (dirp->inode) {
     ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
-    put_inode(dirp->inode);
-    dirp->inode = 0;
+    dirp->inode.reset();
   }
   _readdir_drop_dirp_buffer(dirp);
   delete dirp;
@@ -5928,8 +6326,6 @@ void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
 {
   ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
   if (dirp->buffer) {
-    for (unsigned i = 0; i < dirp->buffer->size(); i++)
-      put_inode((*dirp->buffer)[i].second);
     delete dirp->buffer;
     dirp->buffer = NULL;
   }
@@ -5951,13 +6347,13 @@ int Client::_readdir_get_frag(dir_result_t *dirp)
   if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
     op = CEPH_MDS_OP_LSSNAP;
 
-  Inode *diri = dirp->inode;
+  InodeRef& diri = dirp->inode;
 
   MetaRequest *req = new MetaRequest(op);
   filepath path;
   diri->make_nosnap_relative_path(path);
   req->set_filepath(path); 
-  req->set_inode(diri);
+  req->set_inode(diri.get());
   req->head.args.readdir.frag = fg;
   if (dirp->last_name.length()) {
     req->path2.set_path(dirp->last_name.c_str());
@@ -5982,7 +6378,7 @@ int Client::_readdir_get_frag(dir_result_t *dirp)
 
     _readdir_drop_dirp_buffer(dirp);
 
-    dirp->buffer = new vector<pair<string,Inode*> >;
+    dirp->buffer = new vector<pair<string,InodeRef> >;
     dirp->buffer->swap(req->readdir_result);
 
     if (fg != req->readdir_reply_frag) {
@@ -6061,7 +6457,7 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p)
 
     struct stat st;
     struct dirent de;
-    int stmask = fill_stat(dn->inode, &st);  
+    int stmask = fill_stat(dn->inode, &st);
     fill_dirent(&de, dn->name.c_str(), st.st_mode, st.st_ino, dirp->offset + 1);
       
     uint64_t next_off = dn->offset + 1;
@@ -6112,7 +6508,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p)
   frag_t fg = dirp->frag();
   uint32_t off = dirp->fragpos();
 
-  Inode *diri = dirp->inode;
+  InodeRef& diri = dirp->inode;
 
   if (dirp->at_end())
     return 0;
@@ -6122,9 +6518,8 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p)
     assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
     uint64_t next_off = 1;
 
-    fill_dirent(&de, ".", S_IFDIR, diri->ino, next_off);
-
     fill_stat(diri, &st);
+    fill_dirent(&de, ".", S_IFDIR, st.st_ino, next_off);
 
     client_lock.Unlock();
     int r = cb(p, &de, &st, -1, next_off);
@@ -6140,9 +6535,9 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p)
   if (dirp->offset == 1) {
     ldout(cct, 15) << " including .." << dendl;
     if (!diri->dn_set.empty()) {
-      Inode* in = diri->get_first_parent()->inode;
-      fill_dirent(&de, "..", S_IFDIR, in->ino, 2);
+      InodeRef& in = diri->get_first_parent()->inode;
       fill_stat(in, &st);
+      fill_dirent(&de, "..", S_IFDIR, st.st_ino, 2);
     } else {
       /* must be at the root (no parent),
        * so we add the dotdot with a special inode (3) */
@@ -6201,9 +6596,9 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p)
     dirp->offset = dir_result_t::make_fpos(fg, off);
     while (off >= dirp->this_offset &&
 	   off - dirp->this_offset < dirp->buffer->size()) {
-      pair<string,Inode*>& ent = (*dirp->buffer)[off - dirp->this_offset];
+      pair<string,InodeRef>& ent = (*dirp->buffer)[off - dirp->this_offset];
 
-      int stmask = fill_stat(ent.second, &st);  
+      int stmask = fill_stat(ent.second, &st);
       fill_dirent(&de, ent.first.c_str(), st.st_mode, st.st_ino, dirp->offset + 1);
       
       client_lock.Unlock();
@@ -6455,7 +6850,7 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit,
 #endif
 
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   bool created = false;
   /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
   bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
@@ -6475,11 +6870,11 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit,
     filepath dirpath = path;
     string dname = dirpath.last_dentry();
     dirpath.pop_dentry();
-    Inode *dir;
+    InodeRef dir;
     r = path_walk(dirpath, &dir);
     if (r < 0)
       return r;
-    r = _create(dir, dname.c_str(), flags, mode, &in, &fh, stripe_unit,
+    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                 stripe_count, object_size, data_pool, &created);
   }
   if (r < 0)
@@ -6489,17 +6884,16 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit,
     // posix says we can only check permissions of existing files
     uid_t uid = geteuid();
     gid_t gid = getegid();
-    r = check_permissions(in, flags, uid, gid);
+    r = check_permissions(in.get(), flags, uid, gid);
     if (r < 0)
       goto out;
   }
 
   if (!fh)
-    r = _open(in, flags, mode, &fh);
+    r = _open(in.get(), flags, mode, &fh);
   if (r >= 0) {
     // allocate a integer file descriptor
     assert(fh);
-    assert(in);
     r = get_fd();
     assert(fd_map.count(r) == 0);
     fd_map[r] = fh;
@@ -6589,11 +6983,12 @@ int Client::lookup_parent(Inode *ino, Inode **parent)
   req->set_filepath(path);
   req->set_inode(ino);
 
-  int r = make_request(req, -1, -1, NULL, NULL, rand() % mdsmap->get_num_in_mds());
+  InodeRef target;
+  int r = make_request(req, -1, -1, &target, NULL, rand() % mdsmap->get_num_in_mds());
   // Give caller a reference to the parent ino if they provided a pointer.
   if (parent != NULL) {
     if (r == 0) {
-      *parent = req->target;
+      *parent = target.get();
       _ll_get(*parent);
       ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
     } else {
@@ -6637,7 +7032,6 @@ Fh *Client::_create_fh(Inode *in, int flags, int cmode)
   // inode
   assert(in);
   f->inode = in;
-  f->inode->get();
 
   ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
 
@@ -6671,7 +7065,7 @@ int Client::_release_fh(Fh *f)
 {
   //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
   //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
   ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
 
   if (in->snapid == CEPH_NOSNAP) {
@@ -6709,10 +7103,8 @@ int Client::_release_fh(Fh *f)
 void Client::_put_fh(Fh *f)
 {
   int left = f->put();
-  if (!left) {
-    put_inode(f->inode);
+  if (!left)
     delete f;
-  }
 }
 
 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid)
@@ -6800,7 +7192,7 @@ loff_t Client::lseek(int fd, loff_t offset, int whence)
 
 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
 {
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
   int r;
 
   switch (whence) {
@@ -6885,7 +7277,7 @@ int Client::uninline_data(Inode *in, Context *onfinish)
                         inline_version_bl);
   bufferlist inline_data = in->inline_data;
   uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
-  uninline_ops.setxattr("inline_version", inline_version_bl);
+  uninline_ops.setxattr("inline_version", stringify(in->inline_version));
 
   objecter->mutate(oid,
                    OSDMap::file_to_object_locator(in->layout),
@@ -6928,10 +7320,17 @@ int Client::read(int fd, char *buf, loff_t size, loff_t offset)
   return r;
 }
 
+int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
+{
+  if (iovcnt < 0)
+    return EINVAL;
+  return _preadv_pwritev(fd, iov, iovcnt, offset, false);
+}
+
 int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
 {
   const md_config_t *conf = cct->_conf;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
 
   //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
 
@@ -7059,31 +7458,30 @@ Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
     f->get();
 }
 
-Client::C_Readahead::~C_Readahead() {
-  client->_put_fh(f);
-}
-
 void Client::C_Readahead::finish(int r) {
   lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
-  client->put_cap_ref(f->inode, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
+  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
   f->readahead.dec_pending();
+  client->_put_fh(f);
 }
 
 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
 {
   const md_config_t *conf = cct->_conf;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
 
   ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;
 
   // trim read based on file size?
   if (off >= in->size)
     return 0;
+  if (len == 0)
+    return 0;
   if (off + len > in->size) {
     len = in->size - off;    
   }
 
-  ldout(cct, 10) << " max_byes=" << conf->client_readahead_max_bytes
+  ldout(cct, 10) << " max_bytes=" << conf->client_readahead_max_bytes
 		 << " max_periods=" << conf->client_readahead_max_periods << dendl;
 
   // read (and possibly block)
@@ -7136,7 +7534,7 @@ int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
 		       bool *checkeof)
 {
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
   uint64_t pos = off;
   int left = len;
   int read = 0;
@@ -7207,11 +7605,9 @@ int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
  */
 class C_Client_SyncCommit : public Context {
   Client *cl;
-  Inode *in;
+  InodeRef in;
 public:
-  C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) {
-    in->get();
-  }
+  C_Client_SyncCommit(Client *c, Inode *i) : cl(c), in(i) {}
   void finish(int) {
     // Called back by Filter, then Client is responsible for taking its own lock
     assert(!cl->client_lock.is_locked_by_me()); 
@@ -7219,14 +7615,14 @@ public:
   }
 };
 
-void Client::sync_write_commit(Inode *in)
+void Client::sync_write_commit(InodeRef& in)
 {
   Mutex::Locker l(client_lock);
 
   assert(unsafe_sync_write > 0);
   unsafe_sync_write--;
 
-  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
+  put_cap_ref(in.get(), CEPH_CAP_FILE_BUFFER);
 
   ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
   if (unsafe_sync_write == 0 && unmounting) {
@@ -7234,7 +7630,7 @@ void Client::sync_write_commit(Inode *in)
     mount_cond.Signal();
   }
 
-  put_inode(in);
+  in.reset(); // put inode inside client_lock
 }
 
 int Client::write(int fd, const char *buf, loff_t size, loff_t offset) 
@@ -7252,24 +7648,75 @@ int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
   if (fh->flags & O_PATH)
     return -EBADF;
 #endif
-  int r = _write(fh, offset, size, buf);
+  int r = _write(fh, offset, size, buf, NULL, 0);
   ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
   return r;
 }
 
+int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
+{
+  if (iovcnt < 0)
+    return EINVAL;
+  return _preadv_pwritev(fd, iov, iovcnt, offset, true);
+}
+
+int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
+{
+    Mutex::Locker lock(client_lock);
+    tout(cct) << fd << std::endl;
+    tout(cct) << offset << std::endl;
+
+    Fh *fh = get_filehandle(fd);
+    if (!fh)
+        return -EBADF;
+#if defined(__linux__) && defined(O_PATH)
+    if (fh->flags & O_PATH)
+        return -EBADF;
+#endif
+    loff_t totallen = 0;
+    for (unsigned i = 0; i < iovcnt; i++) {
+        totallen += iov[i].iov_len;
+    }
+    if (write) {
+        int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
+        ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
+        return w;
+    } else {
+        bufferlist bl;
+        int r = _read(fh, offset, totallen, &bl);
+        ldout(cct, 3) << "preadv(" << fd << ", " <<  offset << ") = " << r << dendl;
+        int bufoff = 0;
+        for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
+               /*
+                * This piece of code aims to handle the case that bufferlist does not have enough data 
+                * to fill in the iov 
+                */
+               if (resid < iov[j].iov_len) {
+                    bl.copy(bufoff, resid, (char *)iov[j].iov_base);
+                    break;
+               } else {
+                    bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
+               }
+               resid -= iov[j].iov_len;
+               bufoff += iov[j].iov_len;
+        }
+        return r;  
+    }
+}
 
-int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf)
+int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
+                  const struct iovec *iov, int iovcnt)
 {
   if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
     return -EFBIG;
 
-  if (objecter->osdmap_full_flag()) {
+  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
+  Inode *in = f->inode.get();
+
+  if (objecter->osdmap_pool_full(in->layout.fl_pg_pool)) {
     return -ENOSPC;
   }
 
-  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
-  Inode *in = f->inode;
-
   assert(in->snapid == CEPH_NOSNAP);
 
   // was Fh opened as writeable?
@@ -7310,10 +7757,22 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf)
   }
 
   // copy into fresh buffer (since our write may be resub, async)
-  bufferptr bp;
-  if (size > 0) bp = buffer::copy(buf, size);
   bufferlist bl;
-  bl.push_back( bp );
+  bufferptr *bparr = NULL;
+  if (buf) {
+      bufferptr bp;
+      if (size > 0) bp = buffer::copy(buf, size);
+      bl.push_back( bp );
+  } else if (iov){
+      //iov case 
+      bparr = new bufferptr[iovcnt];
+      for (int i = 0; i < iovcnt; i++) {
+        if (iov[i].iov_len > 0) {
+            bparr[i] = buffer::copy((char*)iov[i].iov_base, iov[i].iov_len);
+        }
+        bl.push_back( bparr[i] );
+      }
+  }
 
   utime_t lat;
   uint64_t totalwritten;
@@ -7461,12 +7920,13 @@ done:
   }
 
   put_cap_ref(in, CEPH_CAP_FILE_WR);
+  delete[] bparr; 
   return r;
 }
 
 int Client::_flush(Fh *f)
 {
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
   int err = in->async_err;
   if (err != 0) {
     ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
@@ -7523,37 +7983,29 @@ int Client::fsync(int fd, bool syncdataonly)
   return r;
 }
 
-int Client::_fsync(Fh *f, bool syncdataonly)
+int Client::_fsync(Inode *in, bool syncdataonly)
 {
   int r = 0;
-
-  Inode *in = f->inode;
-  ceph_tid_t wait_on_flush = 0;
   bool flushed_metadata = false;
   Mutex lock("Client::_fsync::lock");
   Cond cond;
   bool done = false;
   C_SafeCond *object_cacher_completion = NULL;
+  InodeRef tmp_ref;
 
-  ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
+  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
   
   if (cct->_conf->client_oc) {
     object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
-    in->get(); // take a reference; C_SafeCond doesn't and _flush won't either
+    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
     _flush(in, object_cacher_completion);
     ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
   }
   
   if (!syncdataonly && (in->dirty_caps & ~CEPH_CAP_ANY_FILE_WR)) {
-    for (map<mds_rank_t, Cap*>::iterator iter = in->caps.begin(); iter != in->caps.end(); ++iter) {
-      if (iter->second->implemented & ~CEPH_CAP_ANY_FILE_WR) {
-	MetaSession *session = mds_sessions[iter->first];
-	assert(session);
-        flush_caps(in, session);
-      }
-    }
-    wait_on_flush = in->last_flush_tid;
-    flushed_metadata = true;
+    check_caps(in, true);
+    if (in->flushing_caps)
+      flushed_metadata = true;
   } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
 
   if (object_cacher_completion) { // wait on a real reply instead of guessing
@@ -7564,7 +8016,6 @@ int Client::_fsync(Fh *f, bool syncdataonly)
       cond.Wait(lock);
     lock.Unlock();
     client_lock.Lock();
-    put_inode(in);
     ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
   } else {
     // FIXME: this can starve
@@ -7575,10 +8026,24 @@ int Client::_fsync(Fh *f, bool syncdataonly)
     }
   }
 
+  if (!in->unsafe_dir_ops.empty()) {
+    MetaRequest *req = in->unsafe_dir_ops.back();
+    uint64_t last_tid = req->get_tid();
+    ldout(cct, 15) << "waiting on unsafe requests, last tid " << last_tid <<  dendl;
+
+    do {
+      req->get();
+      wait_on_list(req->waitfor_safe);
+      put_request(req);
+      if (in->unsafe_dir_ops.empty())
+	break;
+      req = in->unsafe_dir_ops.front();
+    } while (req->tid < last_tid);
+  }
+
   if (!r) {
-    if (flushed_metadata) wait_sync_caps(wait_on_flush);
-    // this could wait longer than strictly necessary,
-    // but on a sync the user can put up with it
+    if (flushed_metadata)
+      wait_sync_caps(in, last_flush_tid);
 
     ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
   } else {
@@ -7595,6 +8060,12 @@ int Client::_fsync(Fh *f, bool syncdataonly)
   return r;
 }
 
+int Client::_fsync(Fh *f, bool syncdataonly)
+{
+  ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
+  return _fsync(f->inode.get(), syncdataonly);
+}
+
 int Client::fstat(int fd, struct stat *stbuf) 
 {
   Mutex::Locker lock(client_lock);
@@ -7621,15 +8092,12 @@ int Client::chdir(const char *relpath)
   tout(cct) << "chdir" << std::endl;
   tout(cct) << relpath << std::endl;
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
-  if (cwd != in) {
-    in->get();
-    put_inode(cwd);
-    cwd = in;
-  }
+  if (cwd != in)
+    cwd.swap(in);
   ldout(cct, 3) << "chdir(" << relpath << ")  cwd now " << cwd->ino << dendl;
   return 0;
 }
@@ -7639,7 +8107,7 @@ void Client::getcwd(string& dir)
   filepath path;
   ldout(cct, 10) << "getcwd " << *cwd << dendl;
 
-  Inode *in = cwd;
+  Inode *in = cwd.get();
   while (in != root) {
     assert(in->dn_set.size() < 2); // dirs can't be hard-linked
     Dentry *dn = in->get_first_parent();
@@ -7656,7 +8124,7 @@ void Client::getcwd(string& dir)
 
       // start over
       path = filepath();
-      in = cwd;
+      in = cwd.get();
       continue;
     }
     path.push_front_dentry(dn->name);
@@ -7797,8 +8265,10 @@ int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
 	if (!in->flock_locks)
 	  in->flock_locks = new ceph_lock_state_t(cct);
 	lock_state = in->flock_locks;
-      } else
+      } else {
 	assert(0);
+	return -EINVAL;
+      }
       _update_lock_state(fl, owner, lock_state);
 
       if (fh) {
@@ -7828,8 +8298,10 @@ int Client::_interrupt_filelock(MetaRequest *req)
     lock_type = CEPH_LOCK_FLOCK_INTR;
   else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
     lock_type = CEPH_LOCK_FCNTL_INTR;
-  else
+  else {
     assert(0);
+    return -EINVAL;
+  }
 
   MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
   filepath path;
@@ -7877,7 +8349,7 @@ void Client::_release_filelocks(Fh *fh)
   if (!fh->fcntl_locks && !fh->flock_locks)
     return;
 
-  Inode *in = fh->inode;
+  Inode *in = fh->inode.get();
   ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
 
   list<pair<int, ceph_filelock> > to_release;
@@ -7948,7 +8420,7 @@ void Client::_update_lock_state(struct flock *fl, uint64_t owner,
 
 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
 {
-  Inode *in = fh->inode;
+  Inode *in = fh->inode.get();
   ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
   int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
   return ret;
@@ -7956,7 +8428,7 @@ int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
 
 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req)
 {
-  Inode *in = fh->inode;
+  Inode *in = fh->inode.get();
   ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
   int ret =  _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner, fuse_req);
   ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
@@ -7965,7 +8437,7 @@ int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fu
 
 int Client::_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req)
 {
-  Inode *in = fh->inode;
+  Inode *in = fh->inode.get();
   ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
 
   int sleep = !(cmd & LOCK_NB);
@@ -8073,7 +8545,7 @@ int Client::_sync_fs()
   
   // flush caps
   flush_caps();
-  wait_sync_caps(last_flush_seq);
+  wait_sync_caps(last_flush_tid);
 
   // flush file data
   // FIXME
@@ -8119,7 +8591,7 @@ int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
   
   _fsync(f, true);
   _release(in);
@@ -8134,22 +8606,22 @@ int Client::mksnap(const char *relpath, const char *name)
 {
   Mutex::Locker l(client_lock);
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
-  Inode *snapdir = open_snapdir(in);
+  Inode *snapdir = open_snapdir(in.get());
   return _mkdir(snapdir, name, 0);
 }
 int Client::rmsnap(const char *relpath, const char *name)
 {
   Mutex::Locker l(client_lock);
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
-  Inode *snapdir = open_snapdir(in);
+  Inode *snapdir = open_snapdir(in.get());
   return _rmdir(snapdir, name);
 }
 
@@ -8171,7 +8643,7 @@ int Client::get_caps_issued(const char *path) {
 
   Mutex::Locker lock(client_lock);
   filepath p(path);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(p, &in, true);
   if (r < 0)
     return r;
@@ -8186,7 +8658,7 @@ Inode *Client::open_snapdir(Inode *diri)
   Inode *in;
   vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
   if (!inode_map.count(vino)) {
-    in = new Inode(cct, vino, &diri->layout);
+    in = new Inode(this, vino, &diri->layout);
 
     in->ino = diri->ino;
     in->snapid = CEPH_SNAPDIR;
@@ -8198,9 +8670,10 @@ Inode *Client::open_snapdir(Inode *diri)
     in->size = diri->size;
 
     in->dirfragtree.clear();
-    inode_map[vino] = in;
     in->snapdir_parent = diri;
-    diri->get();
+    inode_map[vino] = in;
+    if (use_faked_inos())
+      _assign_faked_ino(in);
     ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
   } else {
     in = inode_map[vino];
@@ -8218,7 +8691,7 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
   tout(cct) << name << std::endl;
 
   string dname(name);
-  Inode *in;
+  InodeRef in;
   int r = 0;
 
   r = _lookup(parent, dname, &in);
@@ -8229,40 +8702,38 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
 
   assert(in);
   fill_stat(in, attr);
-  _ll_get(in);
+  _ll_get(in.get());
 
  out:
   ldout(cct, 3) << "ll_lookup " << parent << " " << name
 	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
   tout(cct) << attr->st_ino << std::endl;
-  *out = in;
+  *out = in.get();
   return r;
 }
 
-int Client::ll_walk(const char* name, Inode **i, struct stat *attr)
+int Client::ll_walk(const char* name, Inode **out, struct stat *attr)
 {
   Mutex::Locker lock(client_lock);
   filepath fp(name, 0);
-  Inode *destination = NULL;
+  InodeRef in;
   int rc;
 
   ldout(cct, 3) << "ll_walk" << name << dendl;
   tout(cct) << "ll_walk" << std::endl;
   tout(cct) << name << std::endl;
 
-  rc = path_walk(fp, &destination, false);
-  if (rc < 0)
-    {
-      attr->st_ino = 0;
-      *i = NULL;
-      return rc;
-    }
-  else
-    {
-      fill_stat(destination, attr);
-      *i = destination;
-      return 0;
-    }
+  rc = path_walk(fp, &in, false);
+  if (rc < 0) {
+    attr->st_ino = 0;
+    *out = NULL;
+    return rc;
+  } else {
+    assert(in);
+    fill_stat(in, attr);
+    *out = in.get();
+    return 0;
+  }
 }
 
 
@@ -8348,6 +8819,18 @@ snapid_t Client::ll_get_snapid(Inode *in)
   return in->snapid;
 }
 
+Inode *Client::ll_get_inode(ino_t ino)
+{
+  Mutex::Locker lock(client_lock);
+  vinodeno_t vino = _map_faked_ino(ino);
+  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
+  if (p == inode_map.end())
+    return NULL;
+  Inode *in = p->second;
+  _ll_get(in);
+  return in;
+}
+
 Inode *Client::ll_get_inode(vinodeno_t vino)
 {
   Mutex::Locker lock(client_lock);
@@ -8406,12 +8889,13 @@ int Client::ll_setattr(Inode *in, struct stat *attr, int mask, int uid,
   tout(cct) << attr->st_atime << std::endl;
   tout(cct) << mask << std::endl;
 
-  Inode *target = in;
+  InodeRef target(in);
   int res = _setattr(in, attr, mask, uid, gid, &target);
   if (res == 0) {
-    assert(in == target);
+    assert(in == target.get());
     fill_stat(in, attr);
   }
+
   ldout(cct, 3) << "ll_setattr " << vino << " = " << res << dendl;
   return res;
 }
@@ -8423,81 +8907,117 @@ int Client::ll_setattr(Inode *in, struct stat *attr, int mask, int uid,
 int Client::getxattr(const char *path, const char *name, void *value, size_t size)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, true);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid());
+  return Client::_getxattr(in.get(), name, value, size, getuid(), getgid());
 }
 
 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, false);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_getxattr(ceph_inode, name, value, size, getuid(), getgid());
+  return Client::_getxattr(in.get(), name, value, size, getuid(), getgid());
+}
+
+int Client::fgetxattr(int fd, const char *name, void *value, size_t size)
+{
+  Mutex::Locker lock(client_lock);
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  return Client::_getxattr(f->inode.get(), name, value, size, getuid(), getgid());
 }
 
 int Client::listxattr(const char *path, char *list, size_t size)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, true);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_listxattr(ceph_inode, list, size, getuid(), getgid());
+  return Client::_listxattr(in.get(), list, size, getuid(), getgid());
 }
 
 int Client::llistxattr(const char *path, char *list, size_t size)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, false);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_listxattr(ceph_inode, list, size, getuid(), getgid());
+  return Client::_listxattr(in.get(), list, size, getuid(), getgid());
+}
+
+int Client::flistxattr(int fd, char *list, size_t size)
+{
+  Mutex::Locker lock(client_lock);
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  return Client::_listxattr(f->inode.get(), list, size, getuid(), getgid());
 }
 
 int Client::removexattr(const char *path, const char *name)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, true);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_removexattr(ceph_inode, name, getuid(), getgid());
+  return Client::_removexattr(in.get(), name, getuid(), getgid());
 }
 
 int Client::lremovexattr(const char *path, const char *name)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, false);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_removexattr(ceph_inode, name, getuid(), getgid());
+  return Client::_removexattr(in.get(), name, getuid(), getgid());
+}
+
+int Client::fremovexattr(int fd, const char *name)
+{
+  Mutex::Locker lock(client_lock);
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  return Client::_removexattr(f->inode.get(), name, getuid(), getgid());
 }
 
 int Client::setxattr(const char *path, const char *name, const void *value, size_t size, int flags)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, true);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid());
+  return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid());
 }
 
 int Client::lsetxattr(const char *path, const char *name, const void *value, size_t size, int flags)
 {
   Mutex::Locker lock(client_lock);
-  Inode *ceph_inode;
-  int r = Client::path_walk(path, &ceph_inode, false);
+  InodeRef in;
+  int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid());
+  return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid());
+}
+
+int Client::fsetxattr(int fd, const char *name, const void *value, size_t size, int flags)
+{
+  Mutex::Locker lock(client_lock);
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  return Client::_setxattr(f->inode.get(), name, value, size, flags, getuid(), getgid());
 }
 
 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
@@ -8635,8 +9155,13 @@ int Client::_setxattr(Inode *in, const char *name, const void *value,
   if (vxattr && vxattr->readonly)
     return -EOPNOTSUPP;
 
+  int xattr_flags = 0;
   if (!value)
-    flags |= CEPH_XATTR_REMOVE;
+    xattr_flags |= CEPH_XATTR_REMOVE;
+  if (flags & XATTR_CREATE)
+    xattr_flags |= CEPH_XATTR_CREATE;
+  if (flags & XATTR_REPLACE)
+    xattr_flags |= CEPH_XATTR_REPLACE;
 
   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
   filepath path;
@@ -8644,7 +9169,7 @@ int Client::_setxattr(Inode *in, const char *name, const void *value,
   req->set_filepath(path);
   req->set_string2(name);
   req->set_inode(in);
-  req->head.args.setxattr.flags = flags;
+  req->head.args.setxattr.flags = xattr_flags;
 
   bufferlist bl;
   bl.append((const char*)value, size);
@@ -8658,9 +9183,67 @@ int Client::_setxattr(Inode *in, const char *name, const void *value,
   return res;
 }
 
+int Client::check_data_pool_exist(string name, string value, const OSDMap *osdmap)
+{
+  string tmp;
+  if (name == "layout") {
+    string::iterator begin = value.begin();
+    string::iterator end = value.end();
+    keys_and_values<string::iterator> p;    // create instance of parser
+    std::map<string, string> m;             // map to receive results
+    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
+      return -EINVAL;
+    }
+    if (begin != end)
+      return -EINVAL;
+    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
+      if (q->first == "pool") {
+	tmp = q->second;
+	break;
+      }
+    }
+  } else if (name == "layout.pool") {
+    tmp = value;
+  }
+
+  if (tmp.length()) {
+    int64_t pool;
+    try {
+      pool = boost::lexical_cast<unsigned>(tmp);
+      if (!osdmap->have_pg_pool(pool))
+	return -ENOENT;
+    } catch (boost::bad_lexical_cast const&) {
+      pool = osdmap->lookup_pg_pool_name(tmp);
+      if (pool < 0) {
+	return -ENOENT;
+      }
+    }
+  }
+
+  return 0;
+}
+
 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
 			size_t size, int flags, int uid, int gid)
 {
+  // For setting pool of layout, MetaRequest need osdmap epoch.
+  // There is a race which create a new data pool but client and mds both don't have.
+  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
+  if (strcmp(name, "ceph.file.layout.pool") == 0 ||  strcmp(name, "ceph.dir.layout.pool") == 0 ||
+      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
+    string rest(strstr(name, "layout"));
+    string v((const char*)value);
+    const OSDMap *osdmap = objecter->get_osdmap_read();
+    int r = check_data_pool_exist(rest, v, osdmap);
+    objecter->put_osdmap_read();
+
+    if (r == -ENOENT) {
+      C_SaferCond ctx;
+      objecter->wait_for_latest_osdmap(&ctx);
+      ctx.wait();
+    }
+  }
+
   Mutex::Locker lock(client_lock);
 
   vinodeno_t vino = _get_vino(in);
@@ -8953,7 +9536,7 @@ int Client::ll_readlink(Inode *in, char *buf, size_t buflen, int uid, int gid)
 }
 
 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
-		   int uid, int gid, Inode **inp)
+		   int uid, int gid, InodeRef *inp)
 {
   ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
 		<< mode << dec << ", " << rdev << ", uid " << uid << ", gid "
@@ -9014,21 +9597,21 @@ int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
   tout(cct) << mode << std::endl;
   tout(cct) << rdev << std::endl;
 
-  Inode *in = NULL;
+  InodeRef in;
   int r = _mknod(parent, name, mode, rdev, uid, gid, &in);
   if (r == 0) {
     fill_stat(in, attr);
-    _ll_get(in);
+    _ll_get(in.get());
   }
   tout(cct) << attr->st_ino << std::endl;
   ldout(cct, 3) << "ll_mknod " << vparent << " " << name
 	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
-  *out = in;
+  *out = in.get();
   return r;
 }
 
 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
-		    Inode **inp, Fh **fhp, int stripe_unit, int stripe_count,
+		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
 		    int object_size, const char *data_pool, bool *created,
 		    int uid, int gid)
 {
@@ -9093,7 +9676,7 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
   /* If the caller passed a value in fhp, do the open */
   if(fhp) {
     (*inp)->get_open_ref(cmode);
-    *fhp = _create_fh(*inp, flags, cmode);
+    *fhp = _create_fh(inp->get(), flags, cmode);
   }
 
  reply_error:
@@ -9113,7 +9696,7 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
 
 
 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, int uid, int gid,
-		   Inode **inp)
+		   InodeRef *inp)
 {
   ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
 		<< mode << dec << ", uid " << uid << ", gid " << gid << ")"
@@ -9173,21 +9756,21 @@ int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
   tout(cct) << name << std::endl;
   tout(cct) << mode << std::endl;
 
-  Inode *in = NULL;
+  InodeRef in;
   int r = _mkdir(parent, name, mode, uid, gid, &in);
   if (r == 0) {
     fill_stat(in, attr);
-    _ll_get(in);
+    _ll_get(in.get());
   }
   tout(cct) << attr->st_ino << std::endl;
   ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
 	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
-  *out = in;
+  *out = in.get();
   return r;
 }
 
 int Client::_symlink(Inode *dir, const char *name, const char *target, int uid,
-		     int gid, Inode **inp)
+		     int gid, InodeRef *inp)
 {
   ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
 	  << ", uid " << uid << ", gid " << gid << ")" << dendl;
@@ -9245,16 +9828,16 @@ int Client::ll_symlink(Inode *parent, const char *name, const char *value,
   tout(cct) << name << std::endl;
   tout(cct) << value << std::endl;
 
-  Inode *in = NULL;
+  InodeRef in;
   int r = _symlink(parent, name, value, uid, gid, &in);
   if (r == 0) {
     fill_stat(in, attr);
-    _ll_get(in);
+    _ll_get(in.get());
   }
   tout(cct) << attr->st_ino << std::endl;
   ldout(cct, 3) << "ll_symlink " << vparent << " " << name
 	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
-  *out = in;
+  *out = in.get();
   return r;
 }
 
@@ -9273,6 +9856,8 @@ int Client::_unlink(Inode *dir, const char *name, int uid, int gid)
   path.push_dentry(name);
   req->set_filepath(path);
 
+  InodeRef otherin;
+
   Dentry *de;
   int res = get_or_create(dir, name, &de);
   if (res < 0)
@@ -9281,11 +9866,10 @@ int Client::_unlink(Inode *dir, const char *name, int uid, int gid)
   req->dentry_drop = CEPH_CAP_FILE_SHARED;
   req->dentry_unless = CEPH_CAP_FILE_EXCL;
 
-  Inode *otherin;
   res = _lookup(dir, name, &otherin);
   if (res < 0)
     goto fail;
-  req->set_other_inode(otherin);
+  req->set_other_inode(otherin.get());
   req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
 
   req->set_inode(dir);
@@ -9332,19 +9916,21 @@ int Client::_rmdir(Inode *dir, const char *name, int uid, int gid)
 
   req->dentry_drop = CEPH_CAP_FILE_SHARED;
   req->dentry_unless = CEPH_CAP_FILE_EXCL;
-  req->inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+  InodeRef in;
 
   Dentry *de;
   int res = get_or_create(dir, name, &de);
   if (res < 0)
     goto fail;
-  Inode *in;
   res = _lookup(dir, name, &in);
   if (res < 0)
     goto fail;
   if (req->get_op() == CEPH_MDS_OP_RMDIR) {
+    req->set_inode(dir);
     req->set_dentry(de);
-    req->set_inode(in);
+    req->set_other_inode(in.get());
   } else {
     unlink(de, true, true);
   }
@@ -9379,9 +9965,15 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch
   ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to " << todir->ino << " " << toname
 	  << " uid " << uid << " gid " << gid << ")" << dendl;
 
-  if (fromdir->snapid != CEPH_NOSNAP ||
-      todir->snapid != CEPH_NOSNAP) {
-    return -EROFS;
+  if (fromdir->snapid != todir->snapid)
+    return -EXDEV;
+
+  int op = CEPH_MDS_OP_RENAME;
+  if (fromdir->snapid != CEPH_NOSNAP) {
+    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
+      op = CEPH_MDS_OP_RENAMESNAP;
+    else
+      return -EROFS;
   }
   if (cct->_conf->client_quota &&
       fromdir != todir &&
@@ -9391,7 +9983,8 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch
     return -EXDEV;
   }
 
-  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RENAME);
+  InodeRef target;
+  MetaRequest *req = new MetaRequest(op);
 
   filepath from;
   fromdir->make_nosnap_relative_path(from);
@@ -9406,39 +9999,44 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch
   int res = get_or_create(fromdir, fromname, &oldde);
   if (res < 0)
     goto fail;
-  req->set_old_dentry(oldde);
-  req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
-  req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
-
   Dentry *de;
   res = get_or_create(todir, toname, &de);
   if (res < 0)
     goto fail;
-  req->set_dentry(de);
-  req->dentry_drop = CEPH_CAP_FILE_SHARED;
-  req->dentry_unless = CEPH_CAP_FILE_EXCL;
 
-  Inode *oldin;
-  res = _lookup(fromdir, fromname, &oldin);
-  if (res < 0)
-    goto fail;
-  req->set_old_inode(oldin);
-  req->old_inode_drop = CEPH_CAP_LINK_SHARED;
+  if (op == CEPH_MDS_OP_RENAME) {
+    req->set_old_dentry(oldde);
+    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
+    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
 
-  Inode *otherin;
-  res = _lookup(todir, toname, &otherin);
-  if (res != 0 && res != -ENOENT) {
-    goto fail;
-  } else if (res == 0) {
-    req->set_other_inode(otherin);
-    req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
-  }
+    req->set_dentry(de);
+    req->dentry_drop = CEPH_CAP_FILE_SHARED;
+    req->dentry_unless = CEPH_CAP_FILE_EXCL;
 
-  req->set_inode(todir);
+    InodeRef oldin, otherin;
+    res = _lookup(fromdir, fromname, &oldin);
+    if (res < 0)
+      goto fail;
+    req->set_old_inode(oldin.get());
+    req->old_inode_drop = CEPH_CAP_LINK_SHARED;
 
-  Inode *target;
-  res = make_request(req, uid, gid, &target);
+    res = _lookup(todir, toname, &otherin);
+    if (res != 0 && res != -ENOENT) {
+      goto fail;
+    } else if (res == 0) {
+      req->set_other_inode(otherin.get());
+      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+    }
+
+    req->set_inode(todir);
+  } else {
+    // renamesnap reply contains no tracedn, so we need to invalidate
+    // dentry manually
+    unlink(oldde, true, true);
+    unlink(de, true, true);
+  }
 
+  res = make_request(req, uid, gid, &target);
   ldout(cct, 10) << "rename result is " << res << dendl;
 
   // renamed item from our cache
@@ -9471,7 +10069,7 @@ int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
   return _rename(parent, name, newparent, newname, uid, gid);
 }
 
-int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, Inode **inp)
+int Client::_link(Inode *in, Inode *dir, const char *newname, int uid, int gid, InodeRef *inp)
 {
   ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
 	  << " uid " << uid << " gid " << gid << ")" << dendl;
@@ -9530,10 +10128,12 @@ int Client::ll_link(Inode *parent, Inode *newparent, const char *newname,
   tout(cct) << vnewparent << std::endl;
   tout(cct) << newname << std::endl;
 
-  int r = _link(parent, newparent, newname, uid, gid, &parent);
+  InodeRef target;
+  int r = _link(parent, newparent, newname, uid, gid, &target);
   if (r == 0) {
-    fill_stat(parent, attr);
-    _ll_get(parent);
+    assert(target);
+    fill_stat(target, attr);
+    _ll_get(target.get());
   }
   return r;
 }
@@ -9583,6 +10183,11 @@ int Client::ll_file_layout(Inode *in, ceph_file_layout *layout)
   return 0;
 }
 
+int Client::ll_file_layout(Fh *fh, ceph_file_layout *layout)
+{
+  return ll_file_layout(fh->inode.get(), layout);
+}
+
 /* Currently we cannot take advantage of redundancy in reads, since we
    would have to go through all possible placement groups (a
    potentially quite large number determined by a hash), and use CRUSH
@@ -9664,6 +10269,16 @@ int Client::ll_releasedir(dir_result_t *dirp)
   return 0;
 }
 
+int Client::ll_fsyncdir(dir_result_t *dirp)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
+  tout(cct) << "ll_fsyncdir" << std::endl;
+  tout(cct) << (unsigned long)dirp << std::endl;
+
+  return _fsync(dirp->inode.get(), false);
+}
+
 int Client::ll_open(Inode *in, int flags, Fh **fhp, int uid, int gid)
 {
   assert(!(flags & O_CREAT));
@@ -9682,9 +10297,11 @@ int Client::ll_open(Inode *in, int flags, Fh **fhp, int uid, int gid)
     uid = geteuid();
     gid = getegid();
   }
-  r = check_permissions(in, flags, uid, gid);
-  if (r < 0)
-    goto out;
+  if (!cct->_conf->fuse_default_permissions) {
+    r = check_permissions(in, flags, uid, gid);
+    if (r < 0)
+      goto out;
+  }
 
   r = _open(in, flags, 0, fhp /* may be NULL */, uid, gid);
 
@@ -9713,7 +10330,7 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode,
   tout(cct) << flags << std::endl;
 
   bool created = false;
-  Inode *in = NULL;
+  InodeRef in;
   int r = _lookup(parent, name, &in);
 
   if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
@@ -9724,9 +10341,6 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode,
 	        0, 0, 0, NULL, &created, uid, gid);
     if (r < 0)
       goto out;
-
-    if ((!in) && fhp)
-      in = (*fhp)->inode;
   }
 
   if (r < 0)
@@ -9737,16 +10351,18 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode,
 
   ldout(cct, 20) << "ll_create created = " << created << dendl;
   if (!created) {
-    r = check_permissions(in, flags, uid, gid);
-    if (r < 0) {
-      if (fhp && *fhp) {
-	int release_r = _release_fh(*fhp);
-        assert(release_r == 0);  // during create, no async data ops should have happened
+    if (!cct->_conf->fuse_default_permissions) {
+      r = check_permissions(in.get(), flags, uid, gid);
+      if (r < 0) {
+	if (fhp && *fhp) {
+	  int release_r = _release_fh(*fhp);
+	  assert(release_r == 0);  // during create, no async data ops should have happened
+	}
+	goto out;
       }
-      goto out;
     }
     if (fhp && (*fhp == NULL)) {
-      r = _open(in, flags, mode, fhp);
+      r = _open(in.get(), flags, mode, fhp);
       if (r < 0)
 	goto out;
     }
@@ -9766,8 +10382,8 @@ out:
   // passing an Inode in outp requires an additional ref
   if (outp) {
     if (in)
-      _ll_get(in);
-    *outp = in;
+      _ll_get(in.get());
+    *outp = in.get();
   }
 
   return r;
@@ -9942,7 +10558,7 @@ int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
   tout(cct) << off << std::endl;
   tout(cct) << len << std::endl;
 
-  int r = _write(fh, off, len, data);
+  int r = _write(fh, off, len, data, NULL, 0);
   ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
 		<< dendl;
   return r;
@@ -9981,10 +10597,12 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
   if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
     return -EOPNOTSUPP;
 
-  if (objecter->osdmap_full_flag() && !(mode & FALLOC_FL_PUNCH_HOLE))
-    return -ENOSPC;
+  Inode *in = fh->inode.get();
 
-  Inode *in = fh->inode;
+  if (objecter->osdmap_pool_full(in->layout.fl_pg_pool)
+      && !(mode & FALLOC_FL_PUNCH_HOLE)) {
+    return -ENOSPC;
+  }
 
   if (in->snapid != CEPH_NOSNAP)
     return -EROFS;
@@ -10216,7 +10834,7 @@ int Client::describe_layout(const char *relpath, ceph_file_layout *lp)
   Mutex::Locker lock(client_lock);
 
   filepath path(relpath);
-  Inode *in;
+  InodeRef in;
   int r = path_walk(path, &in);
   if (r < 0)
     return r;
@@ -10234,7 +10852,7 @@ int Client::fdescribe_layout(int fd, ceph_file_layout *lp)
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
 
   *lp = in->layout;
 
@@ -10285,7 +10903,7 @@ int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& o
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
 
   vector<ObjectExtent> extents;
   Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
@@ -10339,7 +10957,7 @@ int Client::get_file_stripe_address(int fd, loff_t offset, vector<entity_addr_t>
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
 
   // which object?
   vector<ObjectExtent> extents;
@@ -10385,7 +11003,7 @@ int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  Inode *in = f->inode;
+  Inode *in = f->inode.get();
 
   // map to a list of extents
   Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
@@ -10534,7 +11152,7 @@ Inode *Client::get_quota_root(Inode *in)
     if (!in->dn_set.empty())
       in = in->get_first_parent()->dir->parent_inode;
     else if (root_parents.count(in))
-      in = root_parents[in];
+      in = root_parents[in].get();
     else
       in = NULL;
   }
@@ -10572,63 +11190,68 @@ Inode *Client::get_quota_root(Inode *in)
   return ancestor->in();
 }
 
-bool Client::is_quota_files_exceeded(Inode *in)
+/**
+ * Traverse quota ancestors of the Inode, return true
+ * if any of them passes the passed function
+ */
+bool Client::check_quota_condition(
+    Inode *in, std::function<bool (const Inode &in)> test)
 {
   if (!cct->_conf->client_quota)
     return false;
 
-  while (in != root_ancestor) {
-    quota_info_t *quota = &in->quota;
-    nest_info_t *rstat = &in->rstat;
-
-    if (quota->max_files && rstat->rsize() >= quota->max_files)
+  while (true) {
+    assert(in != NULL);
+    if (test(*in)) {
       return true;
+    }
 
-    in = get_quota_root(in);
+    if (in == root_ancestor) {
+      // We're done traversing, drop out
+      return false;
+    } else {
+      // Continue up the tree
+      in = get_quota_root(in);
+    }
   }
+
   return false;
 }
 
-bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes)
+bool Client::is_quota_files_exceeded(Inode *in)
 {
-  if (!cct->_conf->client_quota)
-    return false;
-
-  while (in != root_ancestor) {
-    quota_info_t *quota = &in->quota;
-    nest_info_t *rstat = &in->rstat;
-
-    if (quota->max_bytes && (rstat->rbytes + new_bytes) > quota->max_bytes)
-      return true;
+  return check_quota_condition(in, 
+      [](const Inode &in) {
+        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
+      });
+}
 
-    in = get_quota_root(in);
-  }
-  return false;
+bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes)
+{
+  return check_quota_condition(in, 
+      [&new_bytes](const Inode &in) {
+        return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
+               > in.quota.max_bytes;
+      });
 }
 
 bool Client::is_quota_bytes_approaching(Inode *in)
 {
-  if (!cct->_conf->client_quota)
-    return false;
-
-  while (in != root_ancestor) {
-    quota_info_t *quota = &in->quota;
-    nest_info_t *rstat = &in->rstat;
-
-    if (quota->max_bytes) {
-      if (rstat->rbytes >= quota->max_bytes)
-        return true;
-
-      assert(in->size >= in->reported_size);
-      uint64_t space = quota->max_bytes - rstat->rbytes;
-      uint64_t size = in->size - in->reported_size;
-      if ((space >> 4) < size)
-        return true;
-    }
-
-    in = get_quota_root(in);
-  }
-  return false;
+  return check_quota_condition(in, 
+      [](const Inode &in) {
+        if (in.quota.max_bytes) {
+          if (in.rstat.rbytes >= in.quota.max_bytes) {
+            return true;
+          }
+
+          assert(in.size >= in.reported_size);
+          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
+          const uint64_t size = in.size - in.reported_size;
+          return (space >> 4) < size;
+        } else {
+          return false;
+        }
+      });
 }
 
 enum {
@@ -10760,3 +11383,32 @@ void Client::set_cap_epoch_barrier(epoch_t e)
   cap_epoch_barrier = e;
 }
 
+const char** Client::get_tracked_conf_keys() const
+{
+  static const char* keys[] = {
+    "client_cache_size",
+    "client_cache_mid",
+    NULL
+  };
+  return keys;
+}
+
+void Client::handle_conf_change(const struct md_config_t *conf,
+				const std::set <std::string> &changed)
+{
+  if (changed.count("client_cache_size") ||
+      changed.count("client_cache_mid")) {
+    lru.lru_set_max(cct->_conf->client_cache_size);
+    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
+  }
+}
+
+void intrusive_ptr_add_ref(Inode *in)
+{
+  in->get();
+}
+		
+void intrusive_ptr_release(Inode *in)
+{
+  in->client->put_inode(in);
+}
diff --git a/src/client/Client.h b/src/client/Client.h
index b476f5e..d92609f 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -19,6 +19,7 @@
 #include "include/types.h"
 
 // stl
+#include <functional>
 #include <string>
 #include <memory>
 #include <set>
@@ -47,12 +48,13 @@ using std::fstream;
 #include "common/Mutex.h"
 #include "common/Timer.h"
 #include "common/Finisher.h"
-
 #include "common/compiler_extensions.h"
 #include "common/cmdparse.h"
 
 #include "osdc/ObjectCacher.h"
 
+#include "InodeRef.h"
+
 class MDSMap;
 class MonClient;
 
@@ -80,8 +82,6 @@ enum {
   l_c_first = 20000,
   l_c_reply,
   l_c_lat,
-  l_c_owrlat,
-  l_c_ordlat,
   l_c_wrlat,
   l_c_last,
 };
@@ -97,6 +97,8 @@ struct CommandOp
   std::string  *outs;
 };
 
+/* error code for ceph_fuse */
+#define CEPH_FUSE_NO_MDS_UP    -(1<<2) /* no mds up deteced in ceph_fuse */
 
 // ============================================
 // types for my local metadata cache
@@ -119,7 +121,6 @@ struct DirEntry {
   DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {}
 };
 
-struct Inode;
 struct Cap;
 class Dir;
 class Dentry;
@@ -169,7 +170,7 @@ struct dir_result_t {
   }
 
 
-  Inode *inode;
+  InodeRef inode;
 
   int64_t offset;        // high bits: frag_t, low bits: an offset
 
@@ -182,7 +183,7 @@ struct dir_result_t {
   int start_shared_gen;  // dir shared_gen at start of readdir
 
   frag_t buffer_frag;
-  vector<pair<string,Inode*> > *buffer;
+  vector<pair<string,InodeRef> > *buffer;
 
   string at_cache_name;  // last entry we successfully returned
 
@@ -216,7 +217,7 @@ struct dir_result_t {
   }
 };
 
-class Client : public Dispatcher {
+class Client : public Dispatcher, public md_config_obs_t {
  public:
   using Dispatcher::cct;
 
@@ -292,20 +293,25 @@ public:
   void resend_unsafe_requests(MetaSession *s);
 
   // mds requests
-  ceph_tid_t last_tid, last_flush_seq;
+  ceph_tid_t last_tid;
+  ceph_tid_t oldest_tid; // oldest incomplete mds request, excluding setfilelock requests
   map<ceph_tid_t, MetaRequest*> mds_requests;
 
+  // cap flushing
+  ceph_tid_t last_flush_tid;
+
   void dump_mds_requests(Formatter *f);
   void dump_mds_sessions(Formatter *f);
 
   int make_request(MetaRequest *req, int uid, int gid,
 		   //MClientRequest *req, int uid, int gid,
-		   Inode **ptarget = 0, bool *pcreated = 0,
+		   InodeRef *ptarget = 0, bool *pcreated = 0,
 		   int use_mds=-1, bufferlist *pdirbl=0);
   void put_request(MetaRequest *request);
+  void unregister_request(MetaRequest *request);
 
   int verify_reply_trace(int r, MetaRequest *request, MClientReply *reply,
-			 Inode **ptarget, bool *pcreated, int uid, int gid);
+			 InodeRef *ptarget, bool *pcreated, int uid, int gid);
   void encode_cap_releases(MetaRequest *request, mds_rank_t mds);
   int encode_inode_release(Inode *in, MetaRequest *req,
 			   mds_rank_t mds, int drop,
@@ -321,6 +327,7 @@ public:
   void kick_requests_closed(MetaSession *session);
   void handle_client_request_forward(MClientRequestForward *reply);
   void handle_client_reply(MClientReply *reply);
+  bool is_dir_operation(MetaRequest *request);
 
   bool   initialized;
   bool   authenticated;
@@ -334,7 +341,7 @@ public:
 
 public:
   entity_name_t get_myname() { return messenger->get_myname(); } 
-  void sync_write_commit(Inode *in);
+  void sync_write_commit(InodeRef& in);
 
 protected:
   Filer                 *filer;     
@@ -344,8 +351,19 @@ protected:
 
   // cache
   ceph::unordered_map<vinodeno_t, Inode*> inode_map;
+
+  // fake inode number for 32-bits ino_t
+  ceph::unordered_map<ino_t, vinodeno_t> faked_ino_map;
+  interval_set<ino_t> free_faked_inos;
+  ino_t last_used_faked_ino;
+  void _assign_faked_ino(Inode *in);
+  void _release_faked_ino(Inode *in);
+  bool _use_faked_inos;
+  void _reset_faked_inos();
+  vinodeno_t _map_faked_ino(ino_t ino);
+
   Inode*                 root;
-  map<Inode*, Inode*>    root_parents;
+  map<Inode*, InodeRef>  root_parents;
   Inode*                 root_ancestor;
   LRU                    lru;    // lru list of Dentry's in our local metadata cache.
 
@@ -422,6 +440,7 @@ protected:
   friend class C_Client_SyncCommit; // Asserts on client_lock
   friend class C_Client_RequestInterrupt;
   friend class C_Client_Remount;
+  friend void intrusive_ptr_release(Inode *in);
 
   //int get_cache_size() { return lru.lru_get_size(); }
   //void set_cache_size(int m) { lru.lru_set_max(m); }
@@ -435,13 +454,16 @@ protected:
   void unlink(Dentry *dn, bool keepdir, bool keepdentry);
 
   // path traversal for high-level interface
-  Inode *cwd;
-  int path_walk(const filepath& fp, Inode **end, bool followsym=true);
+  InodeRef cwd;
+  int path_walk(const filepath& fp, InodeRef *end, bool followsym=true);
   int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0);
+  int fill_stat(InodeRef& in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0) {
+    return fill_stat(in.get(), st, dirstat, rstat);
+  }
   void touch_dn(Dentry *dn);
 
   // trim cache.
-  void trim_cache();
+  void trim_cache(bool trim_kernel_dcache=false);
   void trim_cache_for_reconnect(MetaSession *s);
   void trim_dentry(Dentry *dn);
   void trim_caps(MetaSession *s, int max);
@@ -476,6 +498,10 @@ protected:
   void put_qtree(Inode *in);
   void invalidate_quota_tree(Inode *in);
   Inode* get_quota_root(Inode *in);
+
+  bool check_quota_condition(
+      Inode *in,
+      std::function<bool (const Inode &)> test);
   bool is_quota_files_exceeded(Inode *in);
   bool is_quota_bytes_exceeded(Inode *in, int64_t new_bytes);
   bool is_quota_bytes_approaching(Inode *in);
@@ -484,6 +510,14 @@ protected:
   list<Cond*> waiting_for_pool_perm;
   int check_pool_perm(Inode *in, int need);
 
+  /**
+   * Call this when an OSDMap is seen with a full flag (global or per pool)
+   * set.
+   *
+   * @param pool the pool ID affected, or -1 if all.
+   */
+  void _handle_full_flag(int64_t pool);
+
  public:
   void set_filer_flags(int flags);
   void clear_filer_flags(int flags);
@@ -520,10 +554,12 @@ protected:
   void remove_all_caps(Inode *in);
   void remove_session_caps(MetaSession *session);
   void mark_caps_dirty(Inode *in, int caps);
-  int mark_caps_flushing(Inode *in);
+  int mark_caps_flushing(Inode *in, ceph_tid_t *ptid);
+  void adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s);
   void flush_caps();
   void flush_caps(Inode *in, MetaSession *session);
   void kick_flushing_caps(MetaSession *session);
+  void early_kick_flushing_caps(MetaSession *session);
   void kick_maxsize_requests(MetaSession *session);
   int get_caps(Inode *in, int need, int want, int *have, loff_t endoff);
   int get_caps_used(Inode *in);
@@ -542,13 +578,15 @@ protected:
   void handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, class MClientCaps *m);
   void cap_delay_requeue(Inode *in);
   void send_cap(Inode *in, MetaSession *session, Cap *cap,
-		int used, int want, int retain, int flush);
+		int used, int want, int retain, int flush,
+		ceph_tid_t flush_tid);
   void check_caps(Inode *in, bool is_delayed);
   void get_cap_ref(Inode *in, int cap);
   void put_cap_ref(Inode *in, int cap);
   void flush_snaps(Inode *in, bool all_again=false, CapSnap *again=0);
-  void wait_sync_caps(uint64_t want);
-  void queue_cap_snap(Inode *in, snapid_t seq=0);
+  void wait_sync_caps(Inode *in, ceph_tid_t want);
+  void wait_sync_caps(ceph_tid_t want);
+  void queue_cap_snap(Inode *in, SnapContext &old_snapc);
   void finish_cap_snap(Inode *in, CapSnap *capsnap, int used);
   void _flushed_cap_snap(Inode *in, snapid_t seq);
 
@@ -559,7 +597,7 @@ protected:
   void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
   void _invalidate_inode_cache(Inode *in);
   void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len);
-  void _async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps);
+  void _async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps);
   void _release(Inode *in);
   
   /**
@@ -600,6 +638,8 @@ protected:
 			      Dentry *old_dentry = NULL);
   void update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session);
 
+  bool use_faked_inos() { return _use_faked_inos; }
+  vinodeno_t map_faked_ino(ino_t ino);
 
   // ----------------------
   // fs ops.
@@ -635,7 +675,6 @@ private:
     Client *client;
     Fh *f;
     C_Readahead(Client *c, Fh *f);
-    ~C_Readahead();
     void finish(int r);
   };
 
@@ -644,32 +683,42 @@ private:
 
   // internal interface
   //   call these with client_lock held!
-  int _do_lookup(Inode *dir, const string& name, Inode **target);
-  int _lookup(Inode *dir, const string& dname, Inode **target);
+  int _do_lookup(Inode *dir, const string& name, InodeRef *target);
+  int _lookup(Inode *dir, const string& dname, InodeRef *target);
 
-  int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, Inode **inp = 0);
+  int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, InodeRef *inp = 0);
   int _unlink(Inode *dir, const char *name, int uid=-1, int gid=-1);
   int _rename(Inode *olddir, const char *oname, Inode *ndir, const char *nname, int uid=-1, int gid=-1);
-  int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, Inode **inp = 0);
+  int _mkdir(Inode *dir, const char *name, mode_t mode, int uid=-1, int gid=-1, InodeRef *inp = 0);
   int _rmdir(Inode *dir, const char *name, int uid=-1, int gid=-1);
-  int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, Inode **inp = 0);
-  int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, Inode **inp = 0);
-  int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, Inode **inp = 0);
+  int _symlink(Inode *dir, const char *name, const char *target, int uid=-1, int gid=-1, InodeRef *inp = 0);
+  int _mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, int uid=-1, int gid=-1, InodeRef *inp = 0);
+  int _setattr(Inode *in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0);
+  int _setattr(InodeRef &in, struct stat *attr, int mask, int uid=-1, int gid=-1, InodeRef *inp = 0) {
+    return _setattr(in.get(), attr, mask, uid, gid, inp);
+  }
   int _getattr(Inode *in, int mask, int uid=-1, int gid=-1, bool force=false);
+  int _getattr(InodeRef &in, int mask, int uid=-1, int gid=-1, bool force=false) {
+    return _getattr(in.get(), mask, uid, gid, force);
+  }
   int _readlink(Inode *in, char *buf, size_t size);
   int _getxattr(Inode *in, const char *name, void *value, size_t len, int uid=-1, int gid=-1);
   int _listxattr(Inode *in, char *names, size_t len, int uid=-1, int gid=-1);
   int _setxattr(Inode *in, const char *name, const void *value, size_t len, int flags, int uid=-1, int gid=-1);
   int _removexattr(Inode *in, const char *nm, int uid=-1, int gid=-1);
   int _open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid=-1, int gid=-1);
-  int _create(Inode *in, const char *name, int flags, mode_t mode, Inode **inp, Fh **fhp,
+  int _create(Inode *in, const char *name, int flags, mode_t mode, InodeRef *inp, Fh **fhp,
               int stripe_unit, int stripe_count, int object_size, const char *data_pool,
 	      bool *created = NULL, int uid=-1, int gid=-1);
+
   loff_t _lseek(Fh *fh, loff_t offset, int whence);
   int _read(Fh *fh, int64_t offset, uint64_t size, bufferlist *bl);
-  int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf);
+  int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf,
+          const struct iovec *iov, int iovcnt);
+  int _preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write);
   int _flush(Fh *fh);
   int _fsync(Fh *fh, bool syncdataonly);
+  int _fsync(Inode *in, bool syncdataonly);
   int _sync_fs();
   int _fallocate(Fh *fh, int mode, int64_t offset, int64_t length);
   int _getlk(Fh *fh, struct flock *fl, uint64_t owner);
@@ -681,6 +730,8 @@ private:
 
   int check_permissions(Inode *in, int flags, int uid, int gid);
 
+  int check_data_pool_exist(string name, string value, const OSDMap *osdmap);
+
   vinodeno_t _get_vino(Inode *in);
   inodeno_t _get_inodeno(Inode *in);
 
@@ -739,7 +790,7 @@ private:
   void _release_filelocks(Fh *fh);
   void _update_lock_state(struct flock *fl, uint64_t owner, ceph_lock_state_t *lock_state);
 public:
-  int mount(const std::string &mount_root);
+  int mount(const std::string &mount_root, bool require_mds=false);
   void unmount();
 
   int mds_command(
@@ -821,6 +872,7 @@ public:
   int lchown(const char *path, int uid, int gid);
   int utime(const char *path, struct utimbuf *buf);
   int lutime(const char *path, struct utimbuf *buf);
+  int flock(int fd, int operation, uint64_t owner);
   int truncate(const char *path, loff_t size);
 
   // file ops
@@ -834,7 +886,9 @@ public:
   int close(int fd);
   loff_t lseek(int fd, loff_t offset, int whence);
   int read(int fd, char *buf, loff_t size, loff_t offset=-1);
+  int preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset=-1);
   int write(int fd, const char *buf, loff_t size, loff_t offset=-1);
+  int pwritev(int fd, const struct iovec *iov, int iovcnt, loff_t offset=-1);
   int fake_write_size(int fd, loff_t size);
   int ftruncate(int fd, loff_t size);
   int fsync(int fd, bool syncdataonly);
@@ -844,12 +898,16 @@ public:
   // full path xattr ops
   int getxattr(const char *path, const char *name, void *value, size_t size);
   int lgetxattr(const char *path, const char *name, void *value, size_t size);
+  int fgetxattr(int fd, const char *name, void *value, size_t size);
   int listxattr(const char *path, char *list, size_t size);
   int llistxattr(const char *path, char *list, size_t size);
+  int flistxattr(int fd, char *list, size_t size);
   int removexattr(const char *path, const char *name);
   int lremovexattr(const char *path, const char *name);
+  int fremovexattr(int fd, const char *name);
   int setxattr(const char *path, const char *name, const void *value, size_t size, int flags);
   int lsetxattr(const char *path, const char *name, const void *value, size_t size, int flags);
+  int fsetxattr(int fd, const char *name, const void *value, size_t size, int flags);
 
   int sync_fs();
   int64_t drop_caches();
@@ -892,6 +950,8 @@ public:
     Mutex::Locker lock(client_lock);
     return _get_vino(in);
   }
+  // get inode from faked ino
+  Inode *ll_get_inode(ino_t ino);
   Inode *ll_get_inode(vinodeno_t vino);
   int ll_lookup(Inode *parent, const char *name, struct stat *attr,
 		Inode **out, int uid = -1, int gid = -1);
@@ -908,6 +968,7 @@ public:
   int ll_listxattr(Inode *in, char *list, size_t size, int uid=-1, int gid=-1);
   int ll_opendir(Inode *in, dir_result_t **dirpp, int uid = -1, int gid = -1);
   int ll_releasedir(dir_result_t* dirp);
+  int ll_fsyncdir(dir_result_t* dirp);
   int ll_readlink(Inode *in, char *buf, size_t bufsize, int uid = -1, int gid = -1);
   int ll_mknod(Inode *in, const char *name, mode_t mode, dev_t rdev,
 	       struct stat *attr, Inode **out, int uid = -1, int gid = -1);
@@ -952,6 +1013,7 @@ public:
   int ll_getlk(Fh *fh, struct flock *fl, uint64_t owner);
   int ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep, void *fuse_req);
   int ll_flock(Fh *fh, int cmd, uint64_t owner, void *fuse_req);
+  int ll_file_layout(Fh *fh, ceph_file_layout *layout);
   void ll_interrupt(void *d);
   int ll_get_stripe_osd(struct Inode *in, uint64_t blockno,
 			ceph_file_layout* layout);
@@ -963,6 +1025,10 @@ public:
 
   void ll_register_callbacks(struct client_callback_args *args);
   int test_dentry_handling(bool can_invalidate);
+
+  virtual const char** get_tracked_conf_keys() const;
+  virtual void handle_conf_change(const struct md_config_t *conf,
+	                          const std::set <std::string> &changed);
 };
 
 #endif
diff --git a/src/client/Dentry.h b/src/client/Dentry.h
index aad6343..198b375 100644
--- a/src/client/Dentry.h
+++ b/src/client/Dentry.h
@@ -5,17 +5,18 @@
 #include "include/xlist.h"
 
 #include "mds/mdstypes.h"
+#include "InodeRef.h"
 
 class Dir;
 struct Inode;
 
 class Dentry : public LRUObject {
  public:
-  string  name;                      // sort of lame
+  string   name;                      // sort of lame
   //const char *name;
-  Dir     *dir;
-  Inode   *inode;
-  int     ref;                       // 1 if there's a dir beneath me.
+  Dir	   *dir;
+  InodeRef inode;
+  int	   ref;                       // 1 if there's a dir beneath me.
   uint64_t offset;
   mds_rank_t lease_mds;
   utime_t lease_ttl;
@@ -47,7 +48,7 @@ class Dentry : public LRUObject {
   void dump(Formatter *f) const;
 
   Dentry() :
-    dir(0), inode(0), ref(1), offset(0),
+    dir(0), ref(1), offset(0),
     lease_mds(-1), lease_gen(0), lease_seq(0), cap_shared_gen(0),
     item_dentry_list(this)  { }
 private:
diff --git a/src/client/Fh.h b/src/client/Fh.h
index dcf70cd..db3a28c 100644
--- a/src/client/Fh.h
+++ b/src/client/Fh.h
@@ -3,16 +3,16 @@
 
 #include "common/Readahead.h"
 #include "include/types.h"
+#include "InodeRef.h"
 
-struct Inode;
 class Cond;
 class ceph_lock_state_t;
 
 // file handle for any open file state
 
 struct Fh {
+  InodeRef  inode;
   int	    _ref;
-  Inode     *inode;
   loff_t    pos;
   int       mds;        // have to talk to mds we opened with (for now)
   int       mode;       // the mode i opened the file with
@@ -27,7 +27,7 @@ struct Fh {
   ceph_lock_state_t *fcntl_locks;
   ceph_lock_state_t *flock_locks;
 
-  Fh() : _ref(1), inode(0), pos(0), mds(0), mode(0), flags(0), pos_locked(false),
+  Fh() : _ref(1), pos(0), mds(0), mode(0), flags(0), pos_locked(false),
       readahead(), fcntl_locks(NULL), flock_locks(NULL) {}
   void get() { ++_ref; }
   int put() { return --_ref; }
diff --git a/src/client/Inode.cc b/src/client/Inode.cc
index c63ba1c..03e7a07 100644
--- a/src/client/Inode.cc
+++ b/src/client/Inode.cc
@@ -1,16 +1,18 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
-#include "MetaSession.h"
+#include "Client.h"
 #include "Inode.h"
 #include "Dentry.h"
 #include "Dir.h"
+#include "MetaSession.h"
 #include "ClientSnapRealm.h"
 
 ostream& operator<<(ostream &out, Inode &in)
 {
   out << in.vino() << "("
-      << "ref=" << in._ref
+      << "faked_ino=" << in.faked_ino
+      << " ref=" << in._ref
       << " ll_ref=" << in.ll_ref
       << " cap_refs=" << in.cap_refs
       << " open=" << in.open_by_mode
@@ -126,7 +128,7 @@ int Inode::put_cap_ref(int cap)
     if (cap & 1) {
       int c = 1 << n;
       if (cap_refs[c] <= 0) {
-	lderr(cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl;
+	lderr(client->cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl;
 	assert(cap_refs[c] > 0);
       }
       if (--cap_refs[c] == 0)
@@ -151,10 +153,10 @@ bool Inode::cap_is_valid(Cap* cap)
     << "cap expire  " << cap->session->cap_ttl << std::endl
     << "cur time    " << ceph_clock_now(cct) << std::endl;*/
   if ((cap->session->cap_gen <= cap->gen)
-      && (ceph_clock_now(cct) < cap->session->cap_ttl)) {
+      && (ceph_clock_now(client->cct) < cap->session->cap_ttl)) {
     return true;
   }
-  return true;
+  return false;
 }
 
 int Inode::caps_issued(int *implemented)
@@ -268,7 +270,7 @@ Dir *Inode::open_dir()
 {
   if (!dir) {
     dir = new Dir(this);
-    lsubdout(cct, mds, 15) << "open_dir " << dir << " on " << this << dendl;
+    lsubdout(client->cct, client, 15) << "open_dir " << dir << " on " << this << dendl;
     assert(dn_set.size() < 2); // dirs can't be hard-linked
     if (!dn_set.empty())
       (*dn_set.begin())->get();      // pin dentry
@@ -307,6 +309,21 @@ bool Inode::check_mode(uid_t ruid, gid_t rgid, gid_t *sgids, int sgids_count, ui
   return (mode & fmode) == fmode;
 }
 
+void Inode::get() {
+  _ref++;
+  lsubdout(client->cct, client, 15) << "inode.get on " << this << " " <<  ino << '.' << snapid
+				    << " now " << _ref << dendl;
+}
+
+//private method to put a reference; see Client::put_inode()
+int Inode::_put(int n) {
+  _ref -= n;
+  lsubdout(client->cct, client, 15) << "inode.put on " << this << " " << ino << '.' << snapid
+				    << " now " << _ref << dendl;
+  assert(_ref >= 0);
+  return _ref;
+}
+
 
 void Inode::dump(Formatter *f) const
 {
@@ -378,13 +395,12 @@ void Inode::dump(Formatter *f) const
   f->dump_stream("dirty_caps") << ccap_string(dirty_caps);
   if (flushing_caps) {
     f->dump_stream("flushings_caps") << ccap_string(flushing_caps);
-    f->dump_unsigned("flushing_cap_seq", flushing_cap_seq);
     f->open_object_section("flushing_cap_tid");
-    for (unsigned bit = 0; bit < CEPH_CAP_BITS; bit++) {
-      if (flushing_caps & (1 << bit)) {
-	string n(ccap_string(1 << bit));
-	f->dump_unsigned(n.c_str(), flushing_cap_tid[bit]);
-      }
+    for (map<ceph_tid_t, int>::const_iterator p = flushing_cap_tids.begin();
+	 p != flushing_cap_tids.end();
+	 ++p) {
+      string n(ccap_string(p->second));
+      f->dump_unsigned(n.c_str(), p->first);
     }
     f->close_section();
   }
@@ -396,7 +412,6 @@ void Inode::dump(Formatter *f) const
   }
 
   f->dump_stream("hold_caps_until") << hold_caps_until;
-  f->dump_unsigned("last_flush_tid", last_flush_tid);
 
   if (snaprealm) {
     f->open_object_section("snaprealm");
diff --git a/src/client/Inode.h b/src/client/Inode.h
index 4a27402..07061e2 100644
--- a/src/client/Inode.h
+++ b/src/client/Inode.h
@@ -13,12 +13,16 @@
 #include "osdc/ObjectCacher.h"
 #include "include/assert.h"
 
+#include "InodeRef.h"
+
+class Client;
 struct MetaSession;
 class Dentry;
 class Dir;
 struct SnapRealm;
 struct Inode;
 class ceph_lock_state_t;
+class MetaRequest;
 
 struct Cap {
   MetaSession *session;
@@ -41,7 +45,7 @@ struct Cap {
 
 struct CapSnap {
   //snapid_t follows;  // map key
-  Inode *in;
+  InodeRef in;
   SnapContext context;
   int issued, dirty;
 
@@ -54,6 +58,9 @@ struct CapSnap {
   map<string,bufferptr> xattrs;
   version_t xattr_version;
 
+  bufferlist inline_data;
+  version_t inline_version;
+
   bool writing, dirty_data;
   uint64_t flush_tid;
   xlist<CapSnap*>::item flushing_item;
@@ -61,7 +68,7 @@ struct CapSnap {
   CapSnap(Inode *i)
     : in(i), issued(0), dirty(0),
       size(0), time_warp_seq(0), mode(0), uid(0), gid(0), xattr_version(0),
-      writing(false), dirty_data(false), flush_tid(0),
+      inline_version(0), writing(false), dirty_data(false), flush_tid(0),
       flushing_item(this)
   {}
 
@@ -147,11 +154,13 @@ public:
 #define I_DIR_ORDERED 2
 
 struct Inode {
-  CephContext *cct;
+  Client *client;
 
   // -- the actual inode --
   inodeno_t ino;
   snapid_t  snapid;
+  ino_t faked_ino;
+
   uint32_t   rdev;    // if special file
 
   // affected by any inode change...
@@ -226,17 +235,15 @@ struct Inode {
   map<mds_rank_t, Cap*> caps;            // mds -> Cap
   Cap *auth_cap;
   unsigned dirty_caps, flushing_caps;
-  uint64_t flushing_cap_seq;
-  __u16 flushing_cap_tid[CEPH_CAP_BITS];
+  std::map<ceph_tid_t, int> flushing_cap_tids;
   int shared_gen, cache_gen;
   int snap_caps, snap_cap_refs;
   utime_t hold_caps_until;
   xlist<Inode*>::item cap_item, flushing_cap_item;
-  ceph_tid_t last_flush_tid;
 
   SnapRealm *snaprealm;
   xlist<Inode*>::item snaprealm_item;
-  Inode *snapdir_parent;  // only if we are a snapdir inode
+  InodeRef snapdir_parent;  // only if we are a snapdir inode
   map<snapid_t,CapSnap*> cap_snaps;   // pending flush to mds
 
   //int open_by_mode[CEPH_FILE_MODE_NUM];
@@ -267,19 +274,8 @@ struct Inode {
   void make_long_path(filepath& p);
   void make_nosnap_relative_path(filepath& p);
 
-  void get() {
-    _ref++;
-    lsubdout(cct, mds, 15) << "inode.get on " << this << " " <<  ino << '.' << snapid
-                           << " now " << _ref << dendl;
-  }
-  /// private method to put a reference; see Client::put_inode()
-  int _put(int n=1) {
-    _ref -= n; 
-    lsubdout(cct, mds, 15) << "inode.put on " << this << " " << ino << '.' << snapid
-                           << " now " << _ref << dendl;
-    assert(_ref >= 0);
-    return _ref;
-  }
+  void get();
+  int _put(int n=1);
 
   int get_num_ref() {
     return _ref;
@@ -297,8 +293,10 @@ struct Inode {
   ceph_lock_state_t *fcntl_locks;
   ceph_lock_state_t *flock_locks;
 
-  Inode(CephContext *cct_, vinodeno_t vino, ceph_file_layout *newlayout)
-    : cct(cct_), ino(vino.ino), snapid(vino.snapid),
+  xlist<MetaRequest*> unsafe_dir_ops;
+
+  Inode(Client *c, vinodeno_t vino, ceph_file_layout *newlayout)
+    : client(c), ino(vino.ino), snapid(vino.snapid), faked_ino(0),
       rdev(0), mode(0), uid(0), gid(0), nlink(0),
       size(0), truncate_seq(1), truncate_size(-1),
       time_warp_seq(0), max_size(0), version(0), xattr_version(0),
@@ -306,10 +304,10 @@ struct Inode {
       flags(0),
       qtree(NULL),
       dir_hashed(false), dir_replicated(false), auth_cap(NULL),
-      dirty_caps(0), flushing_caps(0), flushing_cap_seq(0), shared_gen(0), cache_gen(0),
+      dirty_caps(0), flushing_caps(0), shared_gen(0), cache_gen(0),
       snap_caps(0), snap_cap_refs(0),
-      cap_item(this), flushing_cap_item(this), last_flush_tid(0),
-      snaprealm(0), snaprealm_item(this), snapdir_parent(0),
+      cap_item(this), flushing_cap_item(this),
+      snaprealm(0), snaprealm_item(this),
       oset((void *)this, newlayout->fl_pg_pool, ino),
       reported_size(0), wanted_max_size(0), requested_max_size(0),
       _ref(0), ll_ref(0), dir(0), dn_set(),
@@ -318,7 +316,6 @@ struct Inode {
   {
     memset(&dir_layout, 0, sizeof(dir_layout));
     memset(&layout, 0, sizeof(layout));
-    memset(&flushing_cap_tid, 0, sizeof(__u16)*CEPH_CAP_BITS);
     memset(&quota, 0, sizeof(quota));
   }
   ~Inode() { }
diff --git a/src/client/InodeRef.h b/src/client/InodeRef.h
new file mode 100644
index 0000000..822ec0f
--- /dev/null
+++ b/src/client/InodeRef.h
@@ -0,0 +1,12 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLIENT_INODEREF_H
+#define CEPH_CLIENT_INODEREF_H
+
+#include <boost/intrusive_ptr.hpp>
+class Inode;
+void intrusive_ptr_add_ref(Inode *in);
+void intrusive_ptr_release(Inode *in);
+typedef boost::intrusive_ptr<Inode> InodeRef;
+#endif
diff --git a/src/client/Makefile.am b/src/client/Makefile.am
index 60dd227..8e47f51 100644
--- a/src/client/Makefile.am
+++ b/src/client/Makefile.am
@@ -16,6 +16,7 @@ noinst_HEADERS += \
 	client/Dir.h \
 	client/Fh.h \
 	client/Inode.h \
+	client/InodeRef.h \
 	client/MetaRequest.h \
 	client/MetaSession.h \
 	client/ClientSnapRealm.h \
@@ -26,7 +27,8 @@ noinst_HEADERS += \
 
 if WITH_FUSE
 libclient_fuse_la_SOURCES = client/fuse_ll.cc
-libclient_fuse_la_LIBADD = libclient.la -lfuse
+libclient_fuse_la_LIBADD = libclient.la $(LIBFUSE_LIBS)
+libclient_fuse_la_CXXFLAGS = $(AM_CXXFLAGS) $(LIBFUSE_CFLAGS)
 noinst_LTLIBRARIES += libclient_fuse.la
 noinst_HEADERS += client/fuse_ll.h
 endif
diff --git a/src/client/MetaRequest.cc b/src/client/MetaRequest.cc
index c8c4552..330edde 100644
--- a/src/client/MetaRequest.cc
+++ b/src/client/MetaRequest.cc
@@ -57,9 +57,6 @@ void MetaRequest::dump(Formatter *f) const
 
 MetaRequest::~MetaRequest()
 {
-  assert(!_inode);
-  assert(!_old_inode);
-  assert(!_other_inode);
   if (_dentry)
     _dentry->put();
   if (_old_dentry)
@@ -68,33 +65,6 @@ MetaRequest::~MetaRequest()
     reply->put();
 }
 
-void MetaRequest::set_inode(Inode *in) {
-  assert(_inode == NULL);
-  _inode = in;
-  _inode->get();
-}
-Inode *MetaRequest::inode() {
-  return _inode;
-}
-
-void MetaRequest::set_old_inode(Inode *in) {
-  assert(_old_inode == NULL);
-  _old_inode = in;
-  _old_inode->get();
-}
-Inode *MetaRequest::old_inode() {
-  return _old_inode;
-}
-
-void MetaRequest::set_other_inode(Inode *in) {
-  assert(_other_inode == NULL);
-  _other_inode = in;
-  _other_inode->get();
-}
-Inode *MetaRequest::other_inode() {
-  return _other_inode;
-}
-
 void MetaRequest::set_dentry(Dentry *d) {
   assert(_dentry == NULL);
   _dentry = d;
diff --git a/src/client/MetaRequest.h b/src/client/MetaRequest.h
index e3b6bd1..660a88a 100644
--- a/src/client/MetaRequest.h
+++ b/src/client/MetaRequest.h
@@ -11,19 +11,18 @@
 #include "include/filepath.h"
 #include "include/atomic.h"
 #include "mds/mdstypes.h"
+#include "InodeRef.h"
 
 #include "common/Mutex.h"
 
 #include "messages/MClientRequest.h"
 
 class MClientReply;
-struct Inode;
 class Dentry;
 
 struct MetaRequest {
 private:
-  Inode *_inode;
-  Inode *_old_inode, *_other_inode;
+  InodeRef _inode, _old_inode, _other_inode;
   Dentry *_dentry; //associated with path
   Dentry *_old_dentry; //associated with path2
 public:
@@ -54,6 +53,7 @@ public:
   MClientReply *reply;         // the reply
   bool kick;
   bool aborted;
+  bool success;
   
   // readdir result
   frag_t readdir_frag;
@@ -61,7 +61,7 @@ public:
   uint64_t readdir_offset;
 
   frag_t readdir_reply_frag;
-  vector<pair<string,Inode*> > readdir_result;
+  vector<pair<string,InodeRef> > readdir_result;
   bool readdir_end;
   int readdir_num;
   string readdir_last_name;
@@ -71,15 +71,16 @@ public:
 
   xlist<MetaRequest*>::item item;
   xlist<MetaRequest*>::item unsafe_item;
+  xlist<MetaRequest*>::item unsafe_dir_item;
   Mutex lock; //for get/set sync
 
   Cond  *caller_cond;          // who to take up
   Cond  *dispatch_cond;        // who to kick back
+  list<Cond*> waitfor_safe;
 
-  Inode *target;
+  InodeRef target;
 
   MetaRequest(int op) :
-    _inode(NULL), _old_inode(NULL), _other_inode(NULL),
     _dentry(NULL), _old_dentry(NULL),
     tid(0),
     inode_drop(0), inode_unless(0),
@@ -91,37 +92,42 @@ public:
     mds(-1), resend_mds(-1), send_to_auth(false), sent_on_mseq(0),
     num_fwd(0), retry_attempt(0),
     ref(1), reply(0), 
-    kick(false), aborted(false),
+    kick(false), aborted(false), success(false),
     readdir_offset(0), readdir_end(false), readdir_num(0),
-    got_unsafe(false), item(this), unsafe_item(this),
+    got_unsafe(false), item(this), unsafe_item(this), unsafe_dir_item(this),
     lock("MetaRequest lock"),
-    caller_cond(0), dispatch_cond(0),
-    target(0) {
+    caller_cond(0), dispatch_cond(0) {
     memset(&head, 0, sizeof(ceph_mds_request_head));
     head.op = op;
   }
   ~MetaRequest();
 
-  void set_inode(Inode *in);
-  Inode *inode();
-  Inode *take_inode() {
-    Inode *i = _inode;
-    _inode = 0;
-    return i;
-  }
-  void set_old_inode(Inode *in);
-  Inode *old_inode();
-  Inode *take_old_inode() {
-    Inode *i = _old_inode;
-    _old_inode = NULL;
-    return i;
-  }
-  void set_other_inode(Inode *in);
-  Inode *other_inode();
-  Inode *take_other_inode() {
-    Inode *i = _other_inode;
-    _other_inode = 0;
-    return i;
+  void set_inode(Inode *in) {
+    _inode = in;
+  }
+  Inode *inode() {
+    return _inode.get();
+  }
+  void take_inode(InodeRef *out) {
+    out->swap(_inode);
+  }
+  void set_old_inode(Inode *in) {
+    _old_inode = in;
+  }
+  Inode *old_inode() {
+    return _old_inode.get();
+  }
+  void take_old_inode(InodeRef *out) {
+    out->swap(_old_inode);
+  }
+  void set_other_inode(Inode *in) {
+    _old_inode = in;
+  }
+  Inode *other_inode() {
+    return _other_inode.get();
+  }
+  void take_other_inode(InodeRef *out) {
+    out->swap(_other_inode);
   }
   void set_dentry(Dentry *d);
   Dentry *dentry();
@@ -161,21 +167,16 @@ public:
   bool is_write() {
     return
       (head.op & CEPH_MDS_OP_WRITE) || 
-      (head.op == CEPH_MDS_OP_OPEN && !(head.args.open.flags & (O_CREAT|O_TRUNC))) ||
-      (head.op == CEPH_MDS_OP_CREATE && !(head.args.open.flags & (O_CREAT|O_TRUNC)));
+      (head.op == CEPH_MDS_OP_OPEN && (head.args.open.flags & (O_CREAT|O_TRUNC)));
   }
   bool can_forward() {
-    if (is_write() ||
-	head.op == CEPH_MDS_OP_OPEN ||   // do not forward _any_ open request.
-	head.op == CEPH_MDS_OP_CREATE)   // do not forward _any_ open request.
+    if ((head.op & CEPH_MDS_OP_WRITE) ||
+	head.op == CEPH_MDS_OP_OPEN)   // do not forward _any_ open request.
       return false;
     return true;
   }
   bool auth_is_best() {
-    if (is_write()) 
-      return true;
-    if (head.op == CEPH_MDS_OP_OPEN ||
-	head.op == CEPH_MDS_OP_CREATE ||
+    if ((head.op & CEPH_MDS_OP_WRITE) || head.op == CEPH_MDS_OP_OPEN ||
 	head.op == CEPH_MDS_OP_READDIR) 
       return true;
     return false;    
diff --git a/src/client/MetaSession.h b/src/client/MetaSession.h
index 36b5814..e21be83 100644
--- a/src/client/MetaSession.h
+++ b/src/client/MetaSession.h
@@ -46,6 +46,7 @@ struct MetaSession {
   xlist<CapSnap*> flushing_capsnaps;
   xlist<MetaRequest*> requests;
   xlist<MetaRequest*> unsafe_requests;
+  std::set<ceph_tid_t> flushing_caps_tids;
 
   Cap *s_cap_iterator;
 
diff --git a/src/client/ObjecterWriteback.h b/src/client/ObjecterWriteback.h
index b9e6f9c..8c05e96 100644
--- a/src/client/ObjecterWriteback.h
+++ b/src/client/ObjecterWriteback.h
@@ -39,13 +39,6 @@ class ObjecterWriteback : public WritebackHandler {
 						    m_finisher));
   }
 
-  virtual ceph_tid_t lock(const object_t& oid, const object_locator_t& oloc, int op,
-		     int flags, Context *onack, Context *oncommit) {
-    return m_objecter->lock(oid, oloc, op, flags, onack,
-			    new C_OnFinisher(new C_Lock(m_lock, oncommit),
-					     m_finisher));
-  }
-
  private:
   Objecter *m_objecter;
   Finisher *m_finisher;
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index df4e488..fa92f62 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -601,7 +601,7 @@ int SyntheticClient::run()
         int size = iargs.front();  iargs.pop_front();
         int inflight = iargs.front();  iargs.pop_front();
         if (run_me()) {
-          dout(2) << "createobjects " << cout << " of " << size << " bytes"
+          dout(2) << "createobjects " << count << " of " << size << " bytes"
 		  << ", " << inflight << " in flight" << dendl;
           create_objects(count, size, inflight);
         }
@@ -617,7 +617,7 @@ int SyntheticClient::run()
         int rskew = iargs.front();  iargs.pop_front();
         int wskew = iargs.front();  iargs.pop_front();
         if (run_me()) {
-          dout(2) << "objectrw " << cout << " " << size << " " << wrpc 
+          dout(2) << "objectrw " << count << " " << size << " " << wrpc 
 		  << " " << overlap << " " << rskew << " " << wskew << dendl;
           object_rw(count, size, wrpc, overlap, rskew, wskew);
         }
@@ -2389,12 +2389,6 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc,
 
     utime_t lat = ceph_clock_now(client->cct);
     lat -= start;
-    if (client->logger) {
-      if (write) 
-	client->logger->tset(l_c_owrlat, lat);
-      else 
-	client->logger->tset(l_c_ordlat, lat);
-    }
   }
 
   return 0;
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index f9c380c..219cf0e 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -14,8 +14,8 @@
 
 #define FUSE_USE_VERSION 30
 
-#include <fuse/fuse.h>
-#include <fuse/fuse_lowlevel.h>
+#include <sys/types.h>
+#include <sys/wait.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -34,6 +34,8 @@
 #include "common/config.h"
 #include "include/assert.h"
 
+#include <fuse.h>
+#include <fuse_lowlevel.h>
 #include "fuse_ll.h"
 
 #define FINO_INO(x) ((x) & ((1ull<<48)-1ull))
@@ -72,9 +74,8 @@ public:
   void finalize();
 
   uint64_t fino_snap(uint64_t fino);
-  vinodeno_t fino_vino(inodeno_t fino);
   uint64_t make_fake_ino(inodeno_t ino, snapid_t snapid);
-  Inode * iget(inodeno_t fino);
+  Inode * iget(fuse_ino_t fino);
   void iput(Inode *in);
 
   int fd_on_success;
@@ -172,7 +173,12 @@ static void fuse_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
 // XATTRS
 
 static void fuse_ll_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
-			     const char *value, size_t size, int flags)
+			     const char *value, size_t size, 
+			     int flags
+#if defined(DARWIN)
+			     ,uint32_t pos
+#endif
+  )
 {
   CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
@@ -204,7 +210,11 @@ static void fuse_ll_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
 }
 
 static void fuse_ll_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
-			     size_t size)
+			     size_t size
+#if defined(DARWIN)
+			     ,uint32_t position
+#endif
+  )
 {
   CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
@@ -486,7 +496,7 @@ static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, st
       struct ceph_file_layout layout;
       struct ceph_ioctl_layout l;
       Fh *fh = (Fh*)fi->fh;
-      cfuse->client->ll_file_layout(fh->inode, &layout);
+      cfuse->client->ll_file_layout(fh, &layout);
       l.stripe_unit = layout.fl_stripe_unit;
       l.stripe_count = layout.fl_stripe_count;
       l.object_size = layout.fl_object_size;
@@ -596,6 +606,15 @@ static void fuse_ll_releasedir(fuse_req_t req, fuse_ino_t ino,
   fuse_reply_err(req, 0);
 }
 
+static void fuse_ll_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
+			     struct fuse_file_info *fi)
+{
+  CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+  dir_result_t *dirp = reinterpret_cast<dir_result_t*>(fi->fh);
+  int r = cfuse->client->ll_fsyncdir(dirp);
+  fuse_reply_err(req, -r);
+}
+
 static void fuse_ll_access(fuse_req_t req, fuse_ino_t ino, int mask)
 {
   fuse_reply_err(req, 0);
@@ -817,7 +836,7 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = {
  opendir: fuse_ll_opendir,
  readdir: fuse_ll_readdir,
  releasedir: fuse_ll_releasedir,
- fsyncdir: 0,
+ fsyncdir: fuse_ll_fsyncdir,
  statfs: fuse_ll_statfs,
  setxattr: fuse_ll_setxattr,
  getxattr: fuse_ll_getxattr,
@@ -899,6 +918,7 @@ int CephFuse::Handle::init(int argc, const char *argv[])
     newargv[newargc++] = "-o";
     newargv[newargc++] = "default_permissions";
   }
+#if defined(__linux__)
   if (client->cct->_conf->fuse_big_writes) {
     newargv[newargc++] = "-o";
     newargv[newargc++] = "big_writes";
@@ -907,7 +927,7 @@ int CephFuse::Handle::init(int argc, const char *argv[])
     newargv[newargc++] = "-o";
     newargv[newargc++] = "atomic_o_trunc";
   }
-
+#endif
   if (client->cct->_conf->fuse_debug)
     newargv[newargc++] = "-d";
 
@@ -959,7 +979,9 @@ int CephFuse::Handle::start()
     ino_cb: client->cct->_conf->fuse_use_invalidate_cb ? ino_invalidate_cb : NULL,
     dentry_cb: dentry_invalidate_cb,
     switch_intr_cb: switch_interrupt_cb,
+#if defined(__linux__)
     remount_cb: remount_cb,
+#endif
     /*
      * this is broken:
      *
@@ -988,27 +1010,27 @@ int CephFuse::Handle::loop()
 
 uint64_t CephFuse::Handle::fino_snap(uint64_t fino)
 {
-  Mutex::Locker l(stag_lock);
-  uint64_t stag = FINO_STAG(fino);
-  assert(stag_snap_map.count(stag));
-  return stag_snap_map[stag];
-}
-
-vinodeno_t CephFuse::Handle::fino_vino(inodeno_t fino)
-{
-  if (fino.val == 1) {
-    fino = inodeno_t(client->get_root_ino());
+  if (client->use_faked_inos()) {
+    vinodeno_t vino  = client->map_faked_ino(fino);
+    return vino.snapid;
+  } else {
+    Mutex::Locker l(stag_lock);
+    uint64_t stag = FINO_STAG(fino);
+    assert(stag_snap_map.count(stag));
+    return stag_snap_map[stag];
   }
-  vinodeno_t vino(FINO_INO(fino), fino_snap(fino));
-  //cout << "fino_vino " << fino << " -> " << vino << std::endl;
-  return vino;
 }
 
-Inode * CephFuse::Handle::iget(inodeno_t fino)
+Inode * CephFuse::Handle::iget(fuse_ino_t fino)
 {
-  Inode *in =
-    client->ll_get_inode(fino_vino(fino));
-  return in;
+  if (client->use_faked_inos()) {
+    return client->ll_get_inode((ino_t)fino);
+  } else {
+    if (fino == 1)
+      fino = inodeno_t(client->get_root_ino());
+    vinodeno_t vino(FINO_INO(fino), fino_snap(fino));
+    return client->ll_get_inode(vino);
+  }
 }
 
 void CephFuse::Handle::iput(Inode *in)
@@ -1018,17 +1040,22 @@ void CephFuse::Handle::iput(Inode *in)
 
 uint64_t CephFuse::Handle::make_fake_ino(inodeno_t ino, snapid_t snapid)
 {
-  Mutex::Locker l(stag_lock);
-  uint64_t stag;
-  if (snap_stag_map.count(snapid) == 0) {
-    stag = ++last_stag;
-    snap_stag_map[snapid] = stag;
-    stag_snap_map[stag] = snapid;
-  } else
-    stag = snap_stag_map[snapid];
-  inodeno_t fino = MAKE_FINO(ino, stag);
-  //cout << "make_fake_ino " << ino << "." << snapid << " -> " << fino << std::endl;
-  return fino;
+  if (client->use_faked_inos()) {
+    // already faked by libcephfs
+    return ino;
+  } else {
+    Mutex::Locker l(stag_lock);
+    uint64_t stag;
+    if (snap_stag_map.count(snapid) == 0) {
+      stag = ++last_stag;
+      snap_stag_map[snapid] = stag;
+      stag_snap_map[stag] = snapid;
+    } else
+      stag = snap_stag_map[snapid];
+    inodeno_t fino = MAKE_FINO(ino, stag);
+    //cout << "make_fake_ino " << ino << "." << snapid << " -> " << fino << std::endl;
+    return fino;
+  }
 }
 
 CephFuse::CephFuse(Client *c, int fd) : _handle(new CephFuse::Handle(c, fd))
diff --git a/src/cls/Makefile-client.am b/src/cls/Makefile-client.am
index 70a76da..134fb0c 100644
--- a/src/cls/Makefile-client.am
+++ b/src/cls/Makefile-client.am
@@ -24,6 +24,9 @@ noinst_LIBRARIES += libcls_log_client.a
 libcls_statelog_client_a_SOURCES = cls/statelog/cls_statelog_client.cc
 noinst_LIBRARIES += libcls_statelog_client.a
 
+libcls_timeindex_client_a_SOURCES = cls/timeindex/cls_timeindex_client.cc
+noinst_LIBRARIES += libcls_timeindex_client.a
+
 libcls_replica_log_client_a_SOURCES = \
 	cls/replica_log/cls_replica_log_types.cc \
 	cls/replica_log/cls_replica_log_ops.cc \
@@ -48,10 +51,18 @@ DENCODER_DEPS += libcls_user_client.a
 
 noinst_LIBRARIES += libcls_user_client.a
 
+libcls_cephfs_client_la_SOURCES = cls/cephfs/cls_cephfs_client.cc
+noinst_LTLIBRARIES += libcls_cephfs_client.la
+
+libcls_numops_client_la_SOURCES = cls/numops/cls_numops_client.cc
+noinst_LTLIBRARIES += libcls_numops_client.la
+DENCODER_DEPS += libcls_numops_client.la
+
 noinst_HEADERS += \
 	cls/lock/cls_lock_types.h \
 	cls/lock/cls_lock_ops.h \
 	cls/lock/cls_lock_client.h \
+	cls/numops/cls_numops_client.h \
 	cls/rbd/cls_rbd.h \
 	cls/rbd/cls_rbd_client.h \
 	cls/refcount/cls_refcount_ops.h \
@@ -65,6 +76,9 @@ noinst_HEADERS += \
 	cls/statelog/cls_statelog_types.h \
 	cls/statelog/cls_statelog_ops.h \
 	cls/statelog/cls_statelog_client.h \
+	cls/timeindex/cls_timeindex_types.h \
+	cls/timeindex/cls_timeindex_ops.h \
+	cls/timeindex/cls_timeindex_client.h \
 	cls/replica_log/cls_replica_log_types.h \
 	cls/replica_log/cls_replica_log_ops.h \
 	cls/replica_log/cls_replica_log_client.h \
@@ -73,4 +87,6 @@ noinst_HEADERS += \
 	cls/rgw/cls_rgw_types.h \
 	cls/user/cls_user_client.h \
 	cls/user/cls_user_ops.h \
-	cls/user/cls_user_types.h
+	cls/user/cls_user_types.h \
+	cls/cephfs/cls_cephfs.h \
+	cls/cephfs/cls_cephfs_client.h
diff --git a/src/cls/Makefile-server.am b/src/cls/Makefile-server.am
index ee4cb2b..9b719e3 100644
--- a/src/cls/Makefile-server.am
+++ b/src/cls/Makefile-server.am
@@ -6,6 +6,10 @@ libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 radoslib_LTLIBRARIES += libcls_hello.la
 
+libcls_numops_la_SOURCES = cls/numops/cls_numops.cc
+libcls_numops_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_numops.la
+
 libcls_rbd_la_SOURCES = cls/rbd/cls_rbd.cc
 libcls_rbd_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 libcls_rbd_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@@ -39,6 +43,11 @@ libcls_statelog_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 libcls_statelog_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 radoslib_LTLIBRARIES += libcls_statelog.la
 
+libcls_timeindex_la_SOURCES = cls/timeindex/cls_timeindex.cc
+libcls_timeindex_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_timeindex_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_timeindex.la
+
 libcls_replica_log_la_SOURCES = cls/replica_log/cls_replica_log.cc
 libcls_replica_log_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@@ -57,4 +66,9 @@ libcls_rgw_la_SOURCES = \
 libcls_rgw_la_LIBADD = libjson_spirit.la $(PTHREAD_LIBS) $(EXTRALIBS)
 libcls_rgw_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 radoslib_LTLIBRARIES += libcls_rgw.la
+
+libcls_cephfs_la_SOURCES = cls/cephfs/cls_cephfs.cc
+libcls_cephfs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_cephfs_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_cephfs.la
 endif # WITH_OSD
diff --git a/src/cls/cephfs/cls_cephfs.cc b/src/cls/cephfs/cls_cephfs.cc
new file mode 100644
index 0000000..f58f0de
--- /dev/null
+++ b/src/cls/cephfs/cls_cephfs.cc
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include <string>
+#include <errno.h>
+#include <sstream>
+
+#include "objclass/objclass.h"
+
+#include "cls_cephfs.h"
+
+CLS_VER(1,0)
+CLS_NAME(cephfs_size_scan)
+
+cls_handle_t h_class;
+cls_method_handle_t h_accumulate_inode_metadata;
+
+
+
+std::ostream &operator<<(std::ostream &out, ObjCeiling &in)
+{
+  out << "id: " << in.id << " size: " << in.size;
+  return out;
+}
+
+
+/**
+ * Set a named xattr to a given value, if and only if the xattr
+ * is not already set to a greater value.
+ *
+ * If the xattr is missing, then it is set to the input integer.
+ *
+ * @param xattr_name: name of xattr to compare against and set
+ * @param input_val: candidate new value, of ::encode()'able type
+ * @returns 0 on success (irrespective of whether our new value
+ *          was used) else an error code
+ */
+template <typename A>
+static int set_if_greater(cls_method_context_t hctx,
+    const std::string &xattr_name, const A input_val)
+{
+  bufferlist existing_val_bl;
+
+  bool set_val = false;
+  int r = cls_cxx_getxattr(hctx, xattr_name.c_str(), &existing_val_bl);
+  if (r == -ENOENT || existing_val_bl.length() == 0) {
+    set_val = true;
+  } else if (r >= 0) {
+    bufferlist::iterator existing_p = existing_val_bl.begin();
+    try {
+      A existing_val;
+      ::decode(existing_val, existing_p);
+      if (!existing_p.end()) {
+        // Trailing junk?  Consider it invalid and overwrite
+        set_val = true;
+      } else {
+        // Valid existing value, do comparison
+        set_val = input_val > existing_val;
+      }
+    } catch (const buffer::error &err) {
+      // Corrupt or empty existing value, overwrite it
+      set_val = true;
+    }
+  } else {
+    return r;
+  }
+
+  // Conditionally set the new xattr
+  if (set_val) {
+    bufferlist set_bl;
+    ::encode(input_val, set_bl);
+    return cls_cxx_setxattr(hctx, xattr_name.c_str(), &set_bl);
+  } else {
+    return 0;
+  }
+}
+
+static int accumulate_inode_metadata(cls_method_context_t hctx,
+    bufferlist *in, bufferlist *out)
+{
+  assert(in != NULL);
+  assert(out != NULL);
+
+  int r = 0;
+
+  // Decode `in`
+  bufferlist::iterator q = in->begin();
+  AccumulateArgs args;
+  try {
+    args.decode(q);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  ObjCeiling ceiling(args.obj_index, args.obj_size);
+  r = set_if_greater(hctx, args.obj_xattr_name, ceiling);
+  if (r < 0) {
+    return r;
+  }
+
+  r = set_if_greater(hctx, args.mtime_xattr_name, args.mtime);
+  if (r < 0) {
+    return r;
+  }
+
+  r = set_if_greater(hctx, args.obj_size_xattr_name, args.obj_size);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+/**
+ * initialize class
+ *
+ * We do two things here: we register the new class, and then register
+ * all of the class's methods.
+ */
+void __cls_init()
+{
+  // this log message, at level 0, will always appear in the ceph-osd
+  // log file.
+  CLS_LOG(0, "loading cephfs_size_scan");
+
+  cls_register("cephfs", &h_class);
+  cls_register_cxx_method(h_class, "accumulate_inode_metadata",
+			  CLS_METHOD_WR | CLS_METHOD_RD,
+			  accumulate_inode_metadata, &h_accumulate_inode_metadata);
+}
+
diff --git a/src/cls/cephfs/cls_cephfs.h b/src/cls/cephfs/cls_cephfs.h
new file mode 100644
index 0000000..d4a5f23
--- /dev/null
+++ b/src/cls/cephfs/cls_cephfs.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/encoding.h"
+
+/**
+ * Value class for the xattr we'll use to accumulate
+ * the highest object seen for a given inode
+ */
+class ObjCeiling {
+  public:
+    uint64_t id;
+    uint64_t size;
+
+    ObjCeiling()
+      : id(0), size(0)
+    {}
+
+    ObjCeiling(uint64_t id_, uint64_t size_)
+      : id(id_), size(size_)
+    {}
+
+    bool operator >(ObjCeiling const &rhs) const
+    {
+      return id > rhs.id;
+    }
+
+    void encode(bufferlist &bl) const
+    {
+      ENCODE_START(1, 1, bl);
+      ::encode(id, bl);
+      ::encode(size, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    void decode(bufferlist::iterator &p)
+    {
+      DECODE_START(1, p);
+      ::decode(id, p);
+      ::decode(size, p);
+      DECODE_FINISH(p);
+    }
+};
+WRITE_CLASS_ENCODER(ObjCeiling)
+
+class AccumulateArgs
+{
+public:
+  uint64_t obj_index;
+  uint64_t obj_size;
+  int64_t mtime;
+  std::string obj_xattr_name;
+  std::string mtime_xattr_name;
+  std::string obj_size_xattr_name;
+
+  AccumulateArgs(
+      uint64_t obj_index_,
+      uint64_t obj_size_,
+      time_t mtime_,
+      std::string obj_xattr_name_,
+      std::string mtime_xattr_name_,
+      std::string obj_size_xattr_name_)
+   : obj_index(obj_index_),
+     obj_size(obj_size_),
+     mtime(mtime_),
+     obj_xattr_name(obj_xattr_name_),
+     mtime_xattr_name(mtime_xattr_name_),
+     obj_size_xattr_name(obj_size_xattr_name_)
+  {}
+
+  AccumulateArgs()
+    : obj_index(0), obj_size(0), mtime(0)
+  {}
+
+  void encode(bufferlist &bl) const
+  {
+    ENCODE_START(1, 1, bl);
+    ::encode(obj_xattr_name, bl);
+    ::encode(mtime_xattr_name, bl);
+    ::encode(obj_size_xattr_name, bl);
+    ::encode(obj_index, bl);
+    ::encode(obj_size, bl);
+    ::encode(mtime, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator &bl)
+  {
+    DECODE_START(1, bl);
+    ::decode(obj_xattr_name, bl);
+    ::decode(mtime_xattr_name, bl);
+    ::decode(obj_size_xattr_name, bl);
+    ::decode(obj_index, bl);
+    ::decode(obj_size, bl);
+    ::decode(mtime, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+class AccumulateResult
+{
+public:
+  // Index of the highest-indexed object seen
+  uint64_t ceiling_obj_index;
+  // Size of the highest-index object seen
+  uint64_t ceiling_obj_size;
+  // Largest object seen
+  uint64_t max_obj_size;
+  // Highest mtime seen
+  int64_t   max_mtime;
+
+  AccumulateResult()
+    : ceiling_obj_index(0), ceiling_obj_size(0), max_obj_size(0), max_mtime(0)
+  {}
+};
+
diff --git a/src/cls/cephfs/cls_cephfs_client.cc b/src/cls/cephfs/cls_cephfs_client.cc
new file mode 100644
index 0000000..c471fde
--- /dev/null
+++ b/src/cls/cephfs/cls_cephfs_client.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include "cls_cephfs_client.h"
+
+#include "mds/CInode.h"
+
+#define XATTR_CEILING "scan_ceiling"
+#define XATTR_MAX_MTIME "scan_max_mtime"
+#define XATTR_MAX_SIZE "scan_max_size"
+
+int ClsCephFSClient::accumulate_inode_metadata(
+  librados::IoCtx &ctx,
+  inodeno_t inode_no,
+  const uint64_t obj_index,
+  const uint64_t obj_size,
+  const time_t mtime)
+{
+  AccumulateArgs args(
+      obj_index,
+      obj_size,
+      mtime,
+      XATTR_CEILING,
+      XATTR_MAX_MTIME,
+      XATTR_MAX_SIZE);
+
+  // Generate 0th object name, where we will accumulate sizes/mtimes
+  object_t zeroth_object = InodeStore::get_object_name(inode_no, frag_t(), "");
+
+  // Construct a librados operation invoking our class method
+  librados::ObjectReadOperation op;
+  bufferlist inbl;
+  args.encode(inbl);
+  op.exec("cephfs", "accumulate_inode_metadata", inbl);
+
+  // Execute op
+  bufferlist outbl;
+  return ctx.operate(zeroth_object.name, &op, &outbl);
+}
+
+int ClsCephFSClient::fetch_inode_accumulate_result(
+  librados::IoCtx &ctx,
+  const std::string &oid,
+  inode_backtrace_t *backtrace,
+  ceph_file_layout *layout,
+  AccumulateResult *result)
+{
+  assert(backtrace != NULL);
+  assert(result != NULL);
+
+  librados::ObjectReadOperation op;
+
+  int scan_ceiling_r = 0;
+  bufferlist scan_ceiling_bl;
+  op.getxattr(XATTR_CEILING, &scan_ceiling_bl, &scan_ceiling_r);
+
+  int scan_max_size_r = 0;
+  bufferlist scan_max_size_bl;
+  op.getxattr(XATTR_MAX_SIZE, &scan_max_size_bl, &scan_max_size_r);
+
+  int scan_max_mtime_r = 0;
+  bufferlist scan_max_mtime_bl;
+  op.getxattr(XATTR_MAX_MTIME, &scan_max_mtime_bl, &scan_max_mtime_r);
+
+  int parent_r = 0;
+  bufferlist parent_bl;
+  op.getxattr("parent", &parent_bl, &parent_r);
+  op.set_op_flags2(librados::OP_FAILOK);
+
+  int layout_r = 0;
+  bufferlist layout_bl;
+  op.getxattr("layout", &layout_bl, &layout_r);
+  op.set_op_flags2(librados::OP_FAILOK);
+
+  bufferlist op_bl;
+  int r = ctx.operate(oid, &op, &op_bl);
+  if (r < 0) {
+    return r;
+  }
+
+  // Load scan_ceiling
+  try {
+    bufferlist::iterator scan_ceiling_bl_iter = scan_ceiling_bl.begin();
+    ObjCeiling ceiling;
+    ceiling.decode(scan_ceiling_bl_iter);
+    result->ceiling_obj_index = ceiling.id;
+    result->ceiling_obj_size = ceiling.size;
+  } catch (const buffer::error &err) {
+    //dout(4) << "Invalid size attr on '" << oid << "'" << dendl;
+    return -EINVAL;
+  }
+
+  // Load scan_max_size
+  try {
+    bufferlist::iterator scan_max_size_bl_iter = scan_max_size_bl.begin();
+    ::decode(result->max_obj_size, scan_max_size_bl_iter);
+  } catch (const buffer::error &err) {
+    //dout(4) << "Invalid size attr on '" << oid << "'" << dendl;
+    return -EINVAL;
+  }
+
+  // Load scan_max_mtime
+  try {
+    bufferlist::iterator scan_max_mtime_bl_iter = scan_max_mtime_bl.begin();
+    ::decode(result->max_mtime, scan_max_mtime_bl_iter);
+  } catch (const buffer::error &err) {
+    //dout(4) << "Invalid size attr on '" << oid << "'" << dendl;
+    return -EINVAL;
+  }
+
+  // Deserialize backtrace
+  if (parent_bl.length()) {
+    try {
+      bufferlist::iterator q = parent_bl.begin();
+      backtrace->decode(q);
+    } catch (buffer::error &e) {
+      //dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl;
+      return -EINVAL;
+    }
+  }
+
+  // Deserialize layout
+  if (layout_bl.length()) {
+    try {
+      bufferlist::iterator q = layout_bl.begin();
+      ::decode(*layout, q);
+    } catch (buffer::error &e) {
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/src/cls/cephfs/cls_cephfs_client.h b/src/cls/cephfs/cls_cephfs_client.h
new file mode 100644
index 0000000..45d3c4b
--- /dev/null
+++ b/src/cls/cephfs/cls_cephfs_client.h
@@ -0,0 +1,26 @@
+
+#include "include/rados/librados.hpp"
+#include "mds/mdstypes.h"
+
+#include "cls_cephfs.h"
+
+class AccumulateArgs;
+
+class ClsCephFSClient
+{
+  public:
+  static int accumulate_inode_metadata(
+      librados::IoCtx &ctx,
+      inodeno_t inode_no,
+      const uint64_t obj_index,
+      const uint64_t obj_size,
+      const time_t mtime);
+
+  static int fetch_inode_accumulate_result(
+      librados::IoCtx &ctx,
+      const std::string &oid,
+      inode_backtrace_t *backtrace,
+      ceph_file_layout *layout,
+      AccumulateResult *result);
+};
+
diff --git a/src/cls/hello/cls_hello.cc b/src/cls/hello/cls_hello.cc
index 0d5c78b..878130f 100644
--- a/src/cls/hello/cls_hello.cc
+++ b/src/cls/hello/cls_hello.cc
@@ -258,6 +258,40 @@ static int bad_writer(cls_method_context_t hctx, bufferlist *in, bufferlist *out
 }
 
 
+class PGLSHelloFilter : public PGLSFilter {
+  string val;
+public:
+  int init(bufferlist::iterator& params) {
+    try {
+      ::decode(xattr, params);
+      ::decode(val, params);
+    } catch (buffer::error &e) {
+      return -EINVAL;
+    }
+    return 0;
+  }
+
+  virtual ~PGLSHelloFilter() {}
+  virtual bool filter(const hobject_t &obj, bufferlist& xattr_data,
+                      bufferlist& outdata)
+  {
+    if (val.size() != xattr_data.length())
+      return false;
+
+    if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
+      return false;
+
+    return true;
+  }
+};
+
+
+PGLSFilter *hello_filter()
+{
+  return new PGLSHelloFilter();
+}
+
+
 /**
  * initialize class
  *
@@ -285,7 +319,7 @@ void __cls_init()
 			  CLS_METHOD_RD,
 			  say_hello, &h_say_hello);
   cls_register_cxx_method(h_class, "record_hello",
-			  CLS_METHOD_WR,
+			  CLS_METHOD_WR | CLS_METHOD_PROMOTE,
 			  record_hello, &h_record_hello);
   cls_register_cxx_method(h_class, "writes_dont_return_data",
 			  CLS_METHOD_WR,
@@ -296,7 +330,7 @@ void __cls_init()
 
   // RD | WR is a read-modify-write method.
   cls_register_cxx_method(h_class, "turn_it_to_11",
-			  CLS_METHOD_RD | CLS_METHOD_WR,
+			  CLS_METHOD_RD | CLS_METHOD_WR | CLS_METHOD_PROMOTE,
 			  turn_it_to_11, &h_turn_it_to_11);
 
   // counter-examples
@@ -304,4 +338,7 @@ void __cls_init()
 			  bad_reader, &h_bad_reader);
   cls_register_cxx_method(h_class, "bad_writer", CLS_METHOD_RD,
 			  bad_writer, &h_bad_writer);
+
+  // A PGLS filter
+  cls_register_cxx_filter(h_class, "hello", hello_filter);
 }
diff --git a/src/cls/lock/cls_lock.cc b/src/cls/lock/cls_lock.cc
index cefb870..048ec40 100644
--- a/src/cls/lock/cls_lock.cc
+++ b/src/cls/lock/cls_lock.cc
@@ -29,6 +29,8 @@
 
 #include "global/global_context.h"
 
+#include "include/compat.h"
+
 
 using namespace rados::cls::lock;
 
@@ -517,10 +519,10 @@ void __cls_init()
 
   cls_register("lock", &h_class);
   cls_register_cxx_method(h_class, "lock",
-                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          CLS_METHOD_RD | CLS_METHOD_WR | CLS_METHOD_PROMOTE,
                           lock_op, &h_lock_op);
   cls_register_cxx_method(h_class, "unlock",
-                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          CLS_METHOD_RD | CLS_METHOD_WR | CLS_METHOD_PROMOTE,
                           unlock_op, &h_unlock_op);
   cls_register_cxx_method(h_class, "break_lock",
                           CLS_METHOD_RD | CLS_METHOD_WR,
@@ -532,7 +534,7 @@ void __cls_init()
                           CLS_METHOD_RD,
                           list_locks, &h_list_locks);
   cls_register_cxx_method(h_class, "assert_locked",
-                          CLS_METHOD_RD,
+                          CLS_METHOD_RD | CLS_METHOD_PROMOTE,
                           assert_locked, &h_assert_locked);
 
   return;
diff --git a/src/cls/log/cls_log.cc b/src/cls/log/cls_log.cc
index 7b254fd..23df866 100644
--- a/src/cls/log/cls_log.cc
+++ b/src/cls/log/cls_log.cc
@@ -15,6 +15,7 @@
 #include "cls_log_ops.h"
 
 #include "global/global_context.h"
+#include "include/compat.h"
 
 CLS_VER(1,0)
 CLS_NAME(log)
diff --git a/src/cls/log/cls_log_client.cc b/src/cls/log/cls_log_client.cc
index e5b47bf..d0c6603 100644
--- a/src/cls/log/cls_log_client.cc
+++ b/src/cls/log/cls_log_client.cc
@@ -3,6 +3,7 @@
 #include "include/types.h"
 #include "cls/log/cls_log_ops.h"
 #include "include/rados/librados.hpp"
+#include "include/compat.h"
 
 
 using namespace librados;
diff --git a/src/cls/numops/cls_numops.cc b/src/cls/numops/cls_numops.cc
new file mode 100644
index 0000000..a9823bc
--- /dev/null
+++ b/src/cls/numops/cls_numops.cc
@@ -0,0 +1,163 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 CERN
+ *
+ * Author: Joaquim Rocha <joaquim.rocha at cern.ch>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+/** \file
+ *
+ * This is an OSD class that implements methods for object numeric options on
+ * its omap values.
+ *
+ */
+
+#include "objclass/objclass.h"
+#include <errno.h>
+#include <iostream>
+#include <map>
+#include <string>
+#include <sstream>
+#include <cstdio>
+#include <include/compat.h>
+
+#define DECIMAL_PRECISION 10
+
+CLS_VER(1,0)
+CLS_NAME(numops)
+
+cls_handle_t h_class;
+cls_method_handle_t h_add;
+cls_method_handle_t h_mul;
+
+static int add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  string key, diff_str;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(key, iter);
+    ::decode(diff_str, iter);
+  } catch (const buffer::error &err) {
+    CLS_LOG(20, "add: invalid decode of input");
+    return -EINVAL;
+  }
+
+  char *end_ptr = 0;
+  double difference = strtod(diff_str.c_str(), &end_ptr);
+
+  if (end_ptr && *end_ptr != '\0') {
+    CLS_ERR("add: invalid input value: %s", diff_str.c_str());
+    return -EINVAL;
+  }
+
+  bufferlist bl;
+  int ret = cls_cxx_map_get_val(hctx, key, &bl);
+
+  double value;
+
+  if (ret == -ENODATA || bl.length() == 0) {
+    value = 0;
+  } else if (ret < 0) {
+    if (ret != -ENOENT) {
+      CLS_ERR("add: error reading omap key %s: %d", key.c_str(), ret);
+    }
+    return ret;
+  } else {
+    std::string stored_value(bl.c_str(), bl.length());
+    end_ptr = 0;
+    value = strtod(stored_value.c_str(), &end_ptr);
+
+    if (end_ptr && *end_ptr != '\0') {
+      CLS_ERR("add: invalid stored value: %s", stored_value.c_str());
+      return -EBADMSG;
+    }
+  }
+
+  value += difference;
+
+  std::stringstream stream;
+  stream << std::setprecision(DECIMAL_PRECISION) << value;
+
+  bufferlist new_value;
+  new_value.append(stream.str());
+
+  return cls_cxx_map_set_val(hctx, key, &new_value);
+}
+
+static int mul(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  string key, diff_str;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(key, iter);
+    ::decode(diff_str, iter);
+  } catch (const buffer::error &err) {
+    CLS_LOG(20, "mul: invalid decode of input");
+    return -EINVAL;
+  }
+
+  char *end_ptr = 0;
+  double difference = strtod(diff_str.c_str(), &end_ptr);
+
+  if (end_ptr && *end_ptr != '\0') {
+    CLS_ERR("mul: invalid input value: %s", diff_str.c_str());
+    return -EINVAL;
+  }
+
+  bufferlist bl;
+  int ret = cls_cxx_map_get_val(hctx, key, &bl);
+
+  double value;
+
+  if (ret == -ENODATA || bl.length() == 0) {
+    value = 0;
+  } else if (ret < 0) {
+    if (ret != -ENOENT) {
+      CLS_ERR("mul: error reading omap key %s: %d", key.c_str(), ret);
+    }
+    return ret;
+  } else {
+    std::string stored_value(bl.c_str(), bl.length());
+    end_ptr = 0;
+    value = strtod(stored_value.c_str(), &end_ptr);
+
+    if (end_ptr && *end_ptr != '\0') {
+      CLS_ERR("mul: invalid stored value: %s", stored_value.c_str());
+      return -EBADMSG;
+    }
+  }
+
+  value *= difference;
+
+  std::stringstream stream;
+  stream << std::setprecision(DECIMAL_PRECISION) << value;
+
+  bufferlist new_value;
+  new_value.append(stream.str());
+
+  return cls_cxx_map_set_val(hctx, key, &new_value);
+}
+
+void __cls_init()
+{
+  CLS_LOG(20, "loading cls_numops");
+
+  cls_register("numops", &h_class);
+
+  cls_register_cxx_method(h_class, "add",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          add, &h_add);
+
+  cls_register_cxx_method(h_class, "mul",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          mul, &h_mul);
+}
diff --git a/src/cls/numops/cls_numops_client.cc b/src/cls/numops/cls_numops_client.cc
new file mode 100644
index 0000000..6892820
--- /dev/null
+++ b/src/cls/numops/cls_numops_client.cc
@@ -0,0 +1,80 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 CERN
+ *
+ * Author: Joaquim Rocha <joaquim.rocha at cern.ch>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "objclass/objclass.h"
+#include "cls/numops/cls_numops_client.h"
+#include "include/encoding.h"
+
+#include <cstdlib>
+#include <errno.h>
+#include <sstream>
+
+namespace rados {
+  namespace cls {
+    namespace numops {
+
+      int add(librados::IoCtx *ioctx,
+              const std::string& oid,
+              const std::string& key,
+              double value_to_add)
+      {
+        bufferlist in, out;
+        ::encode(key, in);
+
+        std::stringstream stream;
+        stream << value_to_add;
+
+        ::encode(stream.str(), in);
+
+        return ioctx->exec(oid, "numops", "add", in, out);
+      }
+
+      int sub(librados::IoCtx *ioctx,
+              const std::string& oid,
+              const std::string& key,
+              double value_to_subtract)
+      {
+        return add(ioctx, oid, key, -value_to_subtract);
+      }
+
+      int mul(librados::IoCtx *ioctx,
+              const std::string& oid,
+              const std::string& key,
+              double value_to_multiply)
+      {
+        bufferlist in, out;
+        ::encode(key, in);
+
+        std::stringstream stream;
+        stream << value_to_multiply;
+
+        ::encode(stream.str(), in);
+
+        return ioctx->exec(oid, "numops", "mul", in, out);
+      }
+
+      int div(librados::IoCtx *ioctx,
+              const std::string& oid,
+              const std::string& key,
+              double value_to_divide)
+      {
+        if (value_to_divide == 0)
+          return -EINVAL;
+
+        return mul(ioctx, oid, key, 1 / value_to_divide);
+      }
+
+    } // namespace numops
+  } // namespace cls
+} // namespace rados
diff --git a/src/cls/numops/cls_numops_client.h b/src/cls/numops/cls_numops_client.h
new file mode 100644
index 0000000..8d776bf
--- /dev/null
+++ b/src/cls/numops/cls_numops_client.h
@@ -0,0 +1,49 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 CERN
+ *
+ * Author: Joaquim Rocha <joaquim.rocha at cern.ch>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_LIBRBD_CLS_NUMOPS_CLIENT_H
+#define CEPH_LIBRBD_CLS_NUMOPS_CLIENT_H
+
+#include "include/rados/librados.hpp"
+
+namespace rados {
+  namespace cls {
+    namespace numops {
+
+      extern int add(librados::IoCtx *ioctx,
+                     const std::string& oid,
+                     const std::string& key,
+                     double value_to_add);
+
+      extern int sub(librados::IoCtx *ioctx,
+                     const std::string& oid,
+                     const std::string& key,
+                     double value_to_subtract);
+
+      extern int mul(librados::IoCtx *ioctx,
+                     const std::string& oid,
+                     const std::string& key,
+                     double value_to_multiply);
+
+      extern int div(librados::IoCtx *ioctx,
+                     const std::string& oid,
+                     const std::string& key,
+                     double value_to_divide);
+
+    } // namespace numops
+  } // namespace cls
+} // namespace rados
+
+#endif // CEPH_LIBRBD_CLS_NUMOPS_CLIENT_H
+
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index ae2a432..74af0a2 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -41,6 +41,7 @@
 #include "common/errno.h"
 #include "objclass/objclass.h"
 #include "include/rbd_types.h"
+#include "include/rbd/object_map_types.h"
 
 #include "cls/rbd/cls_rbd.h"
 
@@ -64,6 +65,7 @@ CLS_NAME(rbd)
 cls_handle_t h_class;
 cls_method_handle_t h_create;
 cls_method_handle_t h_get_features;
+cls_method_handle_t h_set_features;
 cls_method_handle_t h_get_size;
 cls_method_handle_t h_set_size;
 cls_method_handle_t h_get_parent;
@@ -94,8 +96,15 @@ cls_method_handle_t h_dir_add_image;
 cls_method_handle_t h_dir_remove_image;
 cls_method_handle_t h_dir_rename_image;
 cls_method_handle_t h_object_map_load;
+cls_method_handle_t h_object_map_save;
 cls_method_handle_t h_object_map_resize;
 cls_method_handle_t h_object_map_update;
+cls_method_handle_t h_object_map_snap_add;
+cls_method_handle_t h_object_map_snap_remove;
+cls_method_handle_t h_metadata_set;
+cls_method_handle_t h_metadata_remove;
+cls_method_handle_t h_metadata_list;
+cls_method_handle_t h_metadata_get;
 cls_method_handle_t h_old_snapshots_list;
 cls_method_handle_t h_old_snapshot_add;
 cls_method_handle_t h_old_snapshot_remove;
@@ -104,6 +113,7 @@ cls_method_handle_t h_old_snapshot_remove;
 #define RBD_SNAP_KEY_PREFIX "snapshot_"
 #define RBD_DIR_ID_KEY_PREFIX "id_"
 #define RBD_DIR_NAME_KEY_PREFIX "name_"
+#define RBD_METADATA_KEY_PREFIX "metadata_"
 
 static int snap_read_header(cls_method_context_t hctx, bufferlist& bl)
 {
@@ -279,15 +289,17 @@ int create(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
 
 /**
  * Input:
- * @param snap_id which snapshot to query, or CEPH_NOSNAP (uint64_t)
+ * @param snap_id which snapshot to query, or CEPH_NOSNAP (uint64_t) (deprecated)
+ * @param read_only true if the image will be used read-only (bool)
  *
  * Output:
  * @param features list of enabled features for the given snapshot (uint64_t)
+ * @param incompatible incompatible feature bits
  * @returns 0 on success, negative error code on failure
  */
 int get_features(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
 {
-  uint64_t features, snap_id;
+  uint64_t snap_id;
   bool read_only = false;
 
   bufferlist::iterator iter = in->begin();
@@ -300,30 +312,87 @@ int get_features(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
     return -EINVAL;
   }
 
-  CLS_LOG(20, "get_features snap_id=%llu", (unsigned long long)snap_id);
+  CLS_LOG(20, "get_features snap_id=%" PRIu64 ", read_only=%d",
+          snap_id, read_only);
 
-  if (snap_id == CEPH_NOSNAP) {
-    int r = read_key(hctx, "features", &features);
-    if (r < 0) {
-      CLS_ERR("failed to read features off disk: %s", cpp_strerror(r).c_str());
-      return r;
-    }
-  } else {
+  // NOTE: keep this deprecated snapshot logic to support negative
+  // test cases in older (pre-Infernalis) releases. Remove once older
+  // releases are no longer supported.
+  if (snap_id != CEPH_NOSNAP) {
     cls_rbd_snap snap;
     string snapshot_key;
     key_from_snap_id(snap_id, &snapshot_key);
     int r = read_key(hctx, snapshot_key, &snap);
-    if (r < 0)
+    if (r < 0) {
       return r;
+    }
+  }
 
-    features = snap.features;
+  uint64_t features;
+  int r = read_key(hctx, "features", &features);
+  if (r < 0) {
+    CLS_ERR("failed to read features off disk: %s", cpp_strerror(r).c_str());
+    return r;
   }
 
   uint64_t incompatible = (read_only ? features & RBD_FEATURES_INCOMPATIBLE :
 				       features & RBD_FEATURES_RW_INCOMPATIBLE);
   ::encode(features, *out);
   ::encode(incompatible, *out);
+  return 0;
+}
+
+/**
+ * set the image features
+ *
+ * Input:
+ * @param features image features
+ * @param mask image feature mask
+ *
+ * Output:
+ * none
+ *
+ * @returns 0 on success, negative error code upon failure
+ */
+int set_features(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  uint64_t features;
+  uint64_t mask;
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(features, iter);
+    ::decode(mask, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  if ((mask & RBD_FEATURES_MUTABLE) != mask) {
+    CLS_ERR("Attempting to set immutable feature: %" PRIu64,
+            mask & ~RBD_FEATURES_MUTABLE);
+    return -EINVAL;
+  }
 
+  // check that features exists to make sure this is a header object
+  // that was created correctly
+  uint64_t orig_features = 0;
+  int r = read_key(hctx, "features", &orig_features);
+  if (r < 0 && r != -ENOENT) {
+    CLS_ERR("Could not read image's features off disk: %s",
+            cpp_strerror(r).c_str());
+    return r;
+  }
+
+  features = (orig_features & ~mask) | (features & mask);
+  CLS_LOG(10, "set_features features=%" PRIu64 " orig_features=%" PRIu64,
+          features, orig_features);
+
+  bufferlist bl;
+  ::encode(features, bl);
+  r = cls_cxx_map_set_val(hctx, "features", &bl);
+  if (r < 0) {
+    CLS_ERR("error updating features: %s", cpp_strerror(r).c_str());
+    return r;
+  }
   return 0;
 }
 
@@ -994,6 +1063,58 @@ int remove_parent(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   if (r < 0)
     return r;
 
+  uint64_t features;
+  r = read_key(hctx, "features", &features);
+  if (r < 0) {
+    return r;
+  }
+
+  // remove the parent from all snapshots
+  if ((features & RBD_FEATURE_DEEP_FLATTEN) != 0) {
+    int max_read = RBD_MAX_KEYS_READ;
+    vector<snapid_t> snap_ids;
+    string last_read = RBD_SNAP_KEY_PREFIX;
+
+    do {
+      set<string> keys;
+      r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys);
+      if (r < 0) {
+        return r;
+      }
+
+      for (std::set<string>::const_iterator it = keys.begin();
+           it != keys.end(); ++it) {
+        if ((*it).find(RBD_SNAP_KEY_PREFIX) != 0) {
+	  break;
+        }
+
+        uint64_t snap_id = snap_id_from_key(*it);
+        cls_rbd_snap snap_meta;
+        r = read_key(hctx, *it, &snap_meta);
+        if (r < 0) {
+          CLS_ERR("Could not read snapshot: snap_id=%" PRIu64 ": %s",
+                  snap_id, cpp_strerror(r).c_str());
+          return r;
+        }
+
+        snap_meta.parent = cls_rbd_parent();
+
+        bufferlist bl;
+        ::encode(snap_meta, bl);
+        r = cls_cxx_map_set_val(hctx, *it, &bl);
+        if (r < 0) {
+          CLS_ERR("Could not update snapshot: snap_id=%" PRIu64 ": %s",
+                  snap_id, cpp_strerror(r).c_str());
+          return r;
+        }
+      }
+
+      if (!keys.empty()) {
+        last_read = *(keys.rbegin());
+      }
+    } while (r == max_read);
+  }
+
   cls_rbd_parent parent;
   r = read_key(hctx, "parent", &parent);
   if (r < 0)
@@ -1004,7 +1125,6 @@ int remove_parent(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
     CLS_ERR("error removing parent: %d", r);
     return r;
   }
-
   return 0;
 }
 
@@ -1934,6 +2054,9 @@ int object_map_read(cls_method_context_t hctx, BitVector<2> &object_map)
   if (r < 0) {
     return r;
   }
+  if (size == 0) {
+    return -ENOENT;
+  }
 
   bufferlist bl;
   r = cls_cxx_read(hctx, 0, size, &bl);
@@ -1975,6 +2098,32 @@ int object_map_load(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
 }
 
 /**
+ * Save an rbd image's object map
+ *
+ * Input:
+ * @param object_map bit vector
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int object_map_save(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  BitVector<2> object_map;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(object_map, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  bufferlist bl;
+  ::encode(object_map, bl);
+  CLS_LOG(20, "object_map_save: object size=%" PRIu64 ", byte size=%u",
+	  object_map.size(), bl.length());
+  return cls_cxx_write_full(hctx, &bl);
+}
+
+/**
  * Resize an rbd image's object map
  *
  * Input:
@@ -2113,8 +2262,10 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
   bool updated = false;
   for (uint64_t object_no = start_object_no; object_no < end_object_no;
        ++object_no) {
-    if ((!current_object_state || object_map[object_no] == *current_object_state) &&
-	object_map[object_no] != new_object_state) {
+    uint8_t state = object_map[object_no];
+    if ((!current_object_state || state == *current_object_state ||
+        (*current_object_state == OBJECT_EXISTS &&
+         state == OBJECT_EXISTS_CLEAN)) && state != new_object_state) {
       object_map[object_no] = new_object_state;
       updated = true;
     }
@@ -2138,6 +2289,243 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
   return r;
 }
 
+/**
+ * Mark all _EXISTS objects as _EXISTS_CLEAN so future writes to the
+ * image HEAD can be tracked.
+ *
+ * Input:
+ * none
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int object_map_snap_add(cls_method_context_t hctx, bufferlist *in,
+                        bufferlist *out)
+{
+  BitVector<2> object_map;
+  int r = object_map_read(hctx, object_map);
+  if (r < 0) {
+    return r;
+  }
+
+  bool updated = false;
+  for (uint64_t i = 0; i < object_map.size(); ++i) {
+    if (object_map[i] == OBJECT_EXISTS) {
+      object_map[i] = OBJECT_EXISTS_CLEAN;
+      updated = true;
+    }
+  }
+
+  if (updated) {
+    bufferlist bl;
+    ::encode(object_map, bl);
+    r = cls_cxx_write_full(hctx, &bl);
+  }
+  return r;
+}
+
+/**
+ * Mark all _EXISTS_CLEAN objects as _EXISTS in the current object map
+ * if the provided snapshot object map object is marked as _EXISTS.
+ *
+ * Input:
+ * @param snapshot object map bit vector
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int object_map_snap_remove(cls_method_context_t hctx, bufferlist *in,
+                           bufferlist *out)
+{
+  BitVector<2> src_object_map;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(src_object_map, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  BitVector<2> dst_object_map;
+  int r = object_map_read(hctx, dst_object_map);
+  if (r < 0) {
+    return r;
+  }
+
+  bool updated = false;
+  for (uint64_t i = 0; i < dst_object_map.size(); ++i) {
+    if (dst_object_map[i] == OBJECT_EXISTS_CLEAN &&
+        (i >= src_object_map.size() || src_object_map[i] == OBJECT_EXISTS)) {
+      dst_object_map[i] = OBJECT_EXISTS;
+      updated = true;
+    }
+  }
+
+  if (updated) {
+    bufferlist bl;
+    ::encode(dst_object_map, bl);
+    r = cls_cxx_write_full(hctx, &bl);
+  }
+  return r;
+}
+
+static const string metadata_key_for_name(const string &name)
+{
+  return RBD_METADATA_KEY_PREFIX + name;
+}
+
+static const string metadata_name_from_key(const string &key)
+{
+  return key.substr(strlen(RBD_METADATA_KEY_PREFIX));
+}
+
+/**
+ * Input:
+ * @param start_after which name to begin listing after
+ *        (use the empty string to start at the beginning)
+ * @param max_return the maximum number of names to list (if 0 means no limit)
+ *
+ * Output:
+ * @param value
+ * @returns 0 on success, negative error code on failure
+ */
+int metadata_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  string start_after;
+  uint64_t max_return;
+
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(start_after, iter);
+    ::decode(max_return, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  map<string, bufferlist> data;
+  string last_read = metadata_key_for_name(start_after);
+  int max_read = max_return ? MIN(RBD_MAX_KEYS_READ, max_return) : RBD_MAX_KEYS_READ;
+
+  do {
+    map<string, bufferlist> raw_data;
+    int r = cls_cxx_map_get_vals(hctx, last_read, RBD_METADATA_KEY_PREFIX,
+                             max_read, &raw_data);
+    if (r < 0) {
+      CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str());
+      return r;
+    }
+    if (raw_data.empty())
+      break;
+
+    map<string, bufferlist>::iterator it = raw_data.begin();
+    if (metadata_name_from_key(it->first) == last_read)
+        ++it;
+    for (; it != raw_data.end(); ++it)
+      data[metadata_name_from_key(it->first)].swap(it->second);
+
+    last_read = raw_data.rbegin()->first;
+    if (max_return)
+      max_read = MIN(RBD_MAX_KEYS_READ, max_return-data.size());
+  } while (max_return && max_read);
+
+  ::encode(data, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * @param data <map(key, value)>
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int metadata_set(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  map<string, bufferlist> data, raw_data;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(data, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  for (map<string, bufferlist>::iterator it = data.begin();
+       it != data.end(); ++it) {
+    CLS_LOG(20, "metadata_set key=%s value=%.*s", it->first.c_str(),
+	    it->second.length(), it->second.c_str());
+    raw_data[metadata_key_for_name(it->first)].swap(it->second);
+  }
+  int r = cls_cxx_map_set_vals(hctx, &raw_data);
+  if (r < 0) {
+    CLS_ERR("error writing metadata: %d", r);
+    return r;
+  }
+
+  return 0;
+}
+
+/**
+ * Input:
+ * @param key
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int metadata_remove(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  string key;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(key, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  CLS_LOG(20, "metadata_remove key=%s", key.c_str());
+
+  int r = cls_cxx_map_remove_key(hctx, metadata_key_for_name(key));
+  if (r < 0) {
+    CLS_ERR("error removing metadata: %d", r);
+    return r;
+  }
+
+  return 0;
+}
+
+/**
+ * Input:
+ * @param key
+ *
+ * Output:
+ * @param metadata value associated with the key
+ * @returns 0 on success, negative error code on failure
+ */
+int metadata_get(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  string key;
+  bufferlist value;
+
+  bufferlist::iterator iter = in->begin();
+  try {
+    ::decode(key, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  CLS_LOG(20, "metadata_get key=%s", key.c_str());
+
+  int r = cls_cxx_map_get_val(hctx, metadata_key_for_name(key), &value);
+  if (r < 0) {
+    CLS_ERR("error getting metadata: %d", r);
+    return r;
+  }
+
+  ::encode(value, *out);
+  return 0;
+}
+
+
 /****************************** Old format *******************************/
 
 int old_snapshots_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
@@ -2345,6 +2733,9 @@ void __cls_init()
   cls_register_cxx_method(h_class, "get_features",
 			  CLS_METHOD_RD,
 			  get_features, &h_get_features);
+  cls_register_cxx_method(h_class, "set_features",
+			  CLS_METHOD_RD | CLS_METHOD_WR,
+			  set_features, &h_set_features);
   cls_register_cxx_method(h_class, "get_size",
 			  CLS_METHOD_RD,
 			  get_size, &h_get_size);
@@ -2399,6 +2790,18 @@ void __cls_init()
   cls_register_cxx_method(h_class, "set_flags",
                           CLS_METHOD_RD | CLS_METHOD_WR,
                           set_flags, &h_set_flags);
+  cls_register_cxx_method(h_class, "metadata_list",
+                          CLS_METHOD_RD,
+			  metadata_list, &h_metadata_list);
+  cls_register_cxx_method(h_class, "metadata_set",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+			  metadata_set, &h_metadata_set);
+  cls_register_cxx_method(h_class, "metadata_remove",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+			  metadata_remove, &h_metadata_remove);
+  cls_register_cxx_method(h_class, "metadata_get",
+                          CLS_METHOD_RD,
+			  metadata_get, &h_metadata_get);
 
   /* methods for the rbd_children object */
   cls_register_cxx_method(h_class, "add_child",
@@ -2443,14 +2846,23 @@ void __cls_init()
   cls_register_cxx_method(h_class, "object_map_load",
                           CLS_METHOD_RD,
 			  object_map_load, &h_object_map_load);
+  cls_register_cxx_method(h_class, "object_map_save",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+			  object_map_save, &h_object_map_save);
   cls_register_cxx_method(h_class, "object_map_resize",
                           CLS_METHOD_RD | CLS_METHOD_WR,
 			  object_map_resize, &h_object_map_resize);
   cls_register_cxx_method(h_class, "object_map_update",
                           CLS_METHOD_RD | CLS_METHOD_WR,
 			  object_map_update, &h_object_map_update);
+  cls_register_cxx_method(h_class, "object_map_snap_add",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+			  object_map_snap_add, &h_object_map_snap_add);
+  cls_register_cxx_method(h_class, "object_map_snap_remove",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+			  object_map_snap_remove, &h_object_map_snap_remove);
 
-  /* methods for the old format */
+ /* methods for the old format */
   cls_register_cxx_method(h_class, "snap_list",
 			  CLS_METHOD_RD,
 			  old_snapshots_list, &h_old_snapshots_list);
diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc
index bef87ab..0385ec9 100644
--- a/src/cls/rbd/cls_rbd_client.cc
+++ b/src/cls/rbd/cls_rbd_client.cc
@@ -154,6 +154,18 @@ namespace librbd {
       return 0;
     }
 
+    int set_features(librados::IoCtx *ioctx, const std::string &oid,
+                      uint64_t features, uint64_t mask)
+    {
+      bufferlist inbl;
+      ::encode(features, inbl);
+      ::encode(mask, inbl);
+
+      librados::ObjectWriteOperation op;
+      op.exec("rbd", "set_features", inbl);
+      return ioctx->operate(oid, &op);
+    }
+
     int get_object_prefix(librados::IoCtx *ioctx, const std::string &oid,
 			  std::string *object_prefix)
     {
@@ -406,7 +418,6 @@ namespace librbd {
 		      const std::vector<snapid_t> &ids,
 		      std::vector<string> *names,
 		      std::vector<uint64_t> *sizes,
-		      std::vector<uint64_t> *features,
 		      std::vector<parent_info> *parents,
 		      std::vector<uint8_t> *protection_statuses)
     {
@@ -414,8 +425,6 @@ namespace librbd {
       names->resize(ids.size());
       sizes->clear();
       sizes->resize(ids.size());
-      features->clear();
-      features->resize(ids.size());
       parents->clear();
       parents->resize(ids.size());
       protection_statuses->clear();
@@ -425,17 +434,15 @@ namespace librbd {
       for (vector<snapid_t>::const_iterator it = ids.begin();
 	   it != ids.end(); ++it) {
 	snapid_t snap_id = it->val;
-	bufferlist bl1, bl2, bl3, bl4, bl5;
+	bufferlist bl1, bl2, bl3, bl4;
 	::encode(snap_id, bl1);
 	op.exec("rbd", "get_snapshot_name", bl1);
 	::encode(snap_id, bl2);
 	op.exec("rbd", "get_size", bl2);
 	::encode(snap_id, bl3);
-	op.exec("rbd", "get_features", bl3);
+	op.exec("rbd", "get_parent", bl3);
 	::encode(snap_id, bl4);
-	op.exec("rbd", "get_parent", bl4);
-	::encode(snap_id, bl5);
-	op.exec("rbd", "get_protection_status", bl5);
+	op.exec("rbd", "get_protection_status", bl4);
       }
 
       bufferlist outbl;
@@ -447,15 +454,11 @@ namespace librbd {
 	bufferlist::iterator iter = outbl.begin();
 	for (size_t i = 0; i < ids.size(); ++i) {
 	  uint8_t order;
-	  uint64_t incompat_features;
 	  // get_snapshot_name
 	  ::decode((*names)[i], iter);
 	  // get_size
 	  ::decode(order, iter);
 	  ::decode((*sizes)[i], iter);
-	  // get_features
-	  ::decode((*features)[i], iter);
-	  ::decode(incompat_features, iter);
 	  // get_parent
 	  ::decode((*parents)[i].spec.pool_id, iter);
 	  ::decode((*parents)[i].spec.image_id, iter);
@@ -728,6 +731,17 @@ namespace librbd {
       return 0;
     }
 
+    void object_map_save(librados::ObjectWriteOperation *rados_op,
+                         const ceph::BitVector<2> &object_map)
+    {
+      ceph::BitVector<2> object_map_copy(object_map);
+      object_map_copy.set_crc_enabled(false);
+
+      bufferlist in;
+      ::encode(object_map_copy, in);
+      rados_op->exec("rbd", "object_map_save", in);
+    }
+
     void object_map_resize(librados::ObjectWriteOperation *rados_op,
                            uint64_t object_count, uint8_t default_state)
     {
@@ -750,5 +764,82 @@ namespace librbd {
       rados_op->exec("rbd", "object_map_update", in);
     }
 
+    void object_map_snap_add(librados::ObjectWriteOperation *rados_op)
+    {
+      bufferlist in;
+      rados_op->exec("rbd", "object_map_snap_add", in);
+    }
+
+    void object_map_snap_remove(librados::ObjectWriteOperation *rados_op,
+                                const ceph::BitVector<2> &object_map)
+    {
+      ceph::BitVector<2> object_map_copy(object_map);
+      object_map_copy.set_crc_enabled(false);
+
+      bufferlist in;
+      ::encode(object_map_copy, in);
+      rados_op->exec("rbd", "object_map_snap_remove", in);
+    }
+
+    int metadata_set(librados::IoCtx *ioctx, const std::string &oid,
+                     const map<string, bufferlist> &data)
+    {
+      bufferlist in;
+      ::encode(data, in);
+      bufferlist out;
+      return ioctx->exec(oid, "rbd", "metadata_set", in, out);
+    }
+
+    int metadata_remove(librados::IoCtx *ioctx, const std::string &oid,
+                        const std::string &key)
+    {
+      bufferlist in;
+      ::encode(key, in);
+      bufferlist out;
+      return ioctx->exec(oid, "rbd", "metadata_remove", in, out);
+    }
+
+    int metadata_list(librados::IoCtx *ioctx, const std::string &oid,
+                      const std::string &start, uint64_t max_return,
+                      map<string, bufferlist> *pairs)
+    {
+      assert(pairs);
+      bufferlist in, out;
+      ::encode(start, in);
+      ::encode(max_return, in);
+      int r = ioctx->exec(oid, "rbd", "metadata_list", in, out);
+      if (r < 0)
+        return r;
+
+      bufferlist::iterator iter = out.begin();
+      try {
+        ::decode(*pairs, iter);
+      } catch (const buffer::error &err) {
+        return -EBADMSG;
+      }
+
+      return 0;
+    }
+
+    int metadata_get(librados::IoCtx *ioctx, const std::string &oid,
+                     const std::string &key, string *s)
+    {
+      assert(s);
+      bufferlist in, out;
+      ::encode(key, in);
+      int r = ioctx->exec(oid, "rbd", "metadata_get", in, out);
+      if (r < 0)
+        return r;
+
+      bufferlist::iterator iter = out.begin();
+      try {
+        ::decode(*s, iter);
+      } catch (const buffer::error &err) {
+        return -EBADMSG;
+      }
+
+      return 0;
+    }
+
   } // namespace cls_client
 } // namespace librbd
diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h
index 419ff85..486d17f 100644
--- a/src/cls/rbd/cls_rbd_client.h
+++ b/src/cls/rbd/cls_rbd_client.h
@@ -35,6 +35,8 @@ namespace librbd {
 		     const std::string &object_prefix);
     int get_features(librados::IoCtx *ioctx, const std::string &oid,
 		     snapid_t snap_id, uint64_t *features);
+    int set_features(librados::IoCtx *ioctx, const std::string &oid,
+                     uint64_t features, uint64_t mask);
     int get_object_prefix(librados::IoCtx *ioctx, const std::string &oid,
 			  std::string *object_prefix);
     int get_size(librados::IoCtx *ioctx, const std::string &oid,
@@ -74,7 +76,6 @@ namespace librbd {
 		      const std::vector<snapid_t> &ids,
 		      std::vector<string> *names,
 		      std::vector<uint64_t> *sizes,
-		      std::vector<uint64_t> *features,
 		      std::vector<parent_info> *parents,
 		      std::vector<uint8_t> *protection_statuses);
     int copyup(librados::IoCtx *ioctx, const std::string &oid,
@@ -87,6 +88,15 @@ namespace librbd {
 			      uint64_t *stripe_unit, uint64_t *stripe_count);
     int set_stripe_unit_count(librados::IoCtx *ioctx, const std::string &oid,
 			      uint64_t stripe_unit, uint64_t stripe_count);
+    int metadata_list(librados::IoCtx *ioctx, const std::string &oid,
+                      const std::string &start, uint64_t max_return,
+                      map<string, bufferlist> *pairs);
+    int metadata_set(librados::IoCtx *ioctx, const std::string &oid,
+                     const map<std::string, bufferlist> &data);
+    int metadata_remove(librados::IoCtx *ioctx, const std::string &oid,
+                        const std::string &key);
+    int metadata_get(librados::IoCtx *ioctx, const std::string &oid,
+                     const std::string &key, string *v);
 
     // operations on rbd_id objects
     int get_id(librados::IoCtx *ioctx, const std::string &oid, std::string *id);
@@ -112,12 +122,17 @@ namespace librbd {
     // operations on the rbd_object_map.$image_id object
     int object_map_load(librados::IoCtx *ioctx, const std::string &oid,
 		        ceph::BitVector<2> *object_map);
+    void object_map_save(librados::ObjectWriteOperation *rados_op,
+                         const ceph::BitVector<2> &object_map);
     void object_map_resize(librados::ObjectWriteOperation *rados_op,
 			   uint64_t object_count, uint8_t default_state);
     void object_map_update(librados::ObjectWriteOperation *rados_op,
 			   uint64_t start_object_no, uint64_t end_object_no,
 			   uint8_t new_object_state,
 			   const boost::optional<uint8_t> &current_object_state);
+    void object_map_snap_add(librados::ObjectWriteOperation *rados_op);
+    void object_map_snap_remove(librados::ObjectWriteOperation *rados_op,
+                                const ceph::BitVector<2> &object_map);
 
     // class operations on the old format, kept for
     // backwards compatability
diff --git a/src/cls/refcount/cls_refcount.cc b/src/cls/refcount/cls_refcount.cc
index c97460f..f5598ae 100644
--- a/src/cls/refcount/cls_refcount.cc
+++ b/src/cls/refcount/cls_refcount.cc
@@ -14,6 +14,7 @@
 #include "common/Clock.h"
 
 #include "global/global_context.h"
+#include "include/compat.h"
 
 CLS_VER(1,0)
 CLS_NAME(refcount)
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index db488e4..b4892fd 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -17,6 +17,7 @@
 #include "common/escape.h"
 
 #include "global/global_context.h"
+#include "include/compat.h"
 
 CLS_VER(1,0)
 CLS_NAME(rgw)
@@ -423,54 +424,65 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   bufferlist bl;
 
   map<string, bufferlist> keys;
+  std::map<string, bufferlist>::iterator kiter;
   string start_key;
   encode_list_index_key(hctx, op.start_obj, &start_key);
-  rc = get_obj_vals(hctx, start_key, op.filter_prefix, op.num_entries + 1, &keys);
-  if (rc < 0)
-    return rc;
+  bool done = false;
+  uint32_t left_to_read = op.num_entries + 1;
 
-  std::map<string, struct rgw_bucket_dir_entry>& m = new_dir.m;
-  std::map<string, bufferlist>::iterator kiter = keys.begin();
-  uint32_t i;
+  do {
+    rc = get_obj_vals(hctx, start_key, op.filter_prefix, left_to_read, &keys);
+    if (rc < 0)
+      return rc;
 
-  bool done = false;
+    std::map<string, struct rgw_bucket_dir_entry>& m = new_dir.m;
 
-  for (i = 0; i < op.num_entries && kiter != keys.end(); ++i, ++kiter) {
-    struct rgw_bucket_dir_entry entry;
+    done = keys.empty();
 
-    if (!bi_is_objs_index(kiter->first)) {
-      done = true;
-      break;
-    }
+    for (kiter = keys.begin(); kiter != keys.end(); ++kiter) {
+      struct rgw_bucket_dir_entry entry;
 
-    bufferlist& entrybl = kiter->second;
-    bufferlist::iterator eiter = entrybl.begin();
-    try {
-      ::decode(entry, eiter);
-    } catch (buffer::error& err) {
-      CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s\n", kiter->first.c_str());
-      return -EINVAL;
-    }
+      if (!bi_is_objs_index(kiter->first)) {
+        done = true;
+        break;
+      }
 
-    cls_rgw_obj_key key;
-    uint64_t ver;
-    decode_list_index_key(kiter->first, &key, &ver);
+      bufferlist& entrybl = kiter->second;
+      bufferlist::iterator eiter = entrybl.begin();
+      try {
+        ::decode(entry, eiter);
+      } catch (buffer::error& err) {
+        CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s\n", kiter->first.c_str());
+        return -EINVAL;
+      }
 
-    if (!entry.is_valid()) {
-      CLS_LOG(20, "entry %s[%s] is not valid\n", key.name.c_str(), key.instance.c_str());
-      continue;
-    }
+      cls_rgw_obj_key key;
+      uint64_t ver;
+      decode_list_index_key(kiter->first, &key, &ver);
 
-    if (!op.list_versions && !entry.is_visible()) {
-      CLS_LOG(20, "entry %s[%s] is not visible\n", key.name.c_str(), key.instance.c_str());
-      continue;
-    }
-    m[kiter->first] = entry;
+      start_key = kiter->first;
+      CLS_LOG(20, "start_key=%s len=%d", start_key.c_str(), start_key.size());
 
-    CLS_LOG(20, "got entry %s[%s] m.size()=%d\n", key.name.c_str(), key.instance.c_str(), (int)m.size());
-  }
+      if (!entry.is_valid()) {
+        CLS_LOG(20, "entry %s[%s] is not valid\n", key.name.c_str(), key.instance.c_str());
+        continue;
+      }
+
+      if (!op.list_versions && !entry.is_visible()) {
+        CLS_LOG(20, "entry %s[%s] is not visible\n", key.name.c_str(), key.instance.c_str());
+        continue;
+      }
+      if (m.size() < op.num_entries) {
+        m[kiter->first] = entry;
+      }
+      left_to_read--;
+
+      CLS_LOG(20, "got entry %s[%s] m.size()=%d\n", key.name.c_str(), key.instance.c_str(), (int)m.size());
+    }
+  } while (left_to_read > 0 && !done);
 
-  ret.is_truncated = (kiter != keys.end() && !done);
+  ret.is_truncated = (left_to_read == 0) && /* we found more entries than we were requested, meaning response is truncated */
+                     !done;
 
   ::encode(ret, *out);
   return 0;
diff --git a/src/cls/rgw/cls_rgw_client.h b/src/cls/rgw/cls_rgw_client.h
index ecec679..37c856f 100644
--- a/src/cls/rgw/cls_rgw_client.h
+++ b/src/cls/rgw/cls_rgw_client.h
@@ -7,6 +7,7 @@
 #include "cls_rgw_types.h"
 #include "cls_rgw_ops.h"
 #include "common/RefCountedObj.h"
+#include "include/compat.h"
 
 // Forward declaration
 class BucketIndexAioManager;
diff --git a/src/cls/rgw/cls_rgw_types.cc b/src/cls/rgw/cls_rgw_types.cc
index 89c8476..faec1a3 100644
--- a/src/cls/rgw/cls_rgw_types.cc
+++ b/src/cls/rgw/cls_rgw_types.cc
@@ -263,6 +263,7 @@ void rgw_bucket_olh_log_entry::generate_test_instances(list<rgw_bucket_olh_log_e
   entry->key.name = "key.name";
   entry->key.instance = "key.instance";
   entry->delete_marker = true;
+  o.push_back(entry);
   o.push_back(new rgw_bucket_olh_log_entry);
 }
 
diff --git a/src/cls/timeindex/cls_timeindex.cc b/src/cls/timeindex/cls_timeindex.cc
new file mode 100644
index 0000000..3300362
--- /dev/null
+++ b/src/cls/timeindex/cls_timeindex.cc
@@ -0,0 +1,273 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "include/types.h"
+#include "include/utime.h"
+#include "objclass/objclass.h"
+
+#include "cls_timeindex_types.h"
+#include "cls_timeindex_ops.h"
+
+#include "global/global_context.h"
+#include "include/compat.h"
+
+CLS_VER(1,0)
+CLS_NAME(timeindex)
+
+cls_handle_t h_class;
+cls_method_handle_t h_timeindex_add;
+cls_method_handle_t h_timeindex_list;
+cls_method_handle_t h_timeindex_trim;
+
+static const size_t MAX_LIST_ENTRIES = 1000;
+static const size_t MAX_TRIM_ENTRIES = 1000;
+
+static const string TIMEINDEX_PREFIX = "1_";
+
+static void get_index_time_prefix(const utime_t& ts,
+                                  string& index)
+{
+  char buf[32];
+
+  snprintf(buf, sizeof(buf), "%s%010ld.%06ld_", TIMEINDEX_PREFIX.c_str(),
+          (long)ts.sec(), (long)ts.usec());
+  buf[sizeof(buf) - 1] = '\0';
+
+  index = buf;
+}
+
+static void get_index(cls_method_context_t hctx,
+                      const utime_t& key_ts,
+                      const string& key_ext,
+                      string& index)
+{
+  get_index_time_prefix(key_ts, index);
+  index.append(key_ext);
+}
+
+static int parse_index(const string& index,
+                       utime_t& key_ts,
+                       string& key_ext)
+{
+  int sec, usec;
+  char keyext[256];
+
+  int ret = sscanf(index.c_str(), "1_%d.%d_%255s", &sec, &usec, keyext);
+
+  key_ts  = utime_t(sec, usec);
+  key_ext = string(keyext);
+  return ret;
+}
+
+static int cls_timeindex_add(cls_method_context_t hctx,
+                             bufferlist * const in,
+                             bufferlist * const out)
+{
+  bufferlist::iterator in_iter = in->begin();
+
+  cls_timeindex_add_op op;
+  try {
+    ::decode(op, in_iter);
+  } catch (buffer::error& err) {
+    CLS_LOG(1, "ERROR: cls_timeindex_add_op(): failed to decode op");
+    return -EINVAL;
+  }
+
+  for (list<cls_timeindex_entry>::iterator iter = op.entries.begin();
+       iter != op.entries.end();
+       ++iter) {
+    cls_timeindex_entry& entry = *iter;
+
+    string index;
+    get_index(hctx, entry.key_ts, entry.key_ext, index);
+
+    CLS_LOG(20, "storing entry at %s", index.c_str());
+
+    int ret = cls_cxx_map_set_val(hctx, index, &entry.value);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  return 0;
+}
+
+static int cls_timeindex_list(cls_method_context_t hctx,
+                              bufferlist * const in,
+                              bufferlist * const out)
+{
+  bufferlist::iterator in_iter = in->begin();
+
+  cls_timeindex_list_op op;
+  try {
+    ::decode(op, in_iter);
+  } catch (buffer::error& err) {
+    CLS_LOG(1, "ERROR: cls_timeindex_list_op(): failed to decode op");
+    return -EINVAL;
+  }
+
+  map<string, bufferlist> keys;
+
+  string from_index;
+  string to_index;
+
+  if (op.marker.empty()) {
+    get_index_time_prefix(op.from_time, from_index);
+  } else {
+    from_index = op.marker;
+  }
+  const bool use_time_boundary = (op.to_time >= op.from_time);
+
+  if (use_time_boundary) {
+    get_index_time_prefix(op.to_time, to_index);
+  }
+
+  size_t max_entries = op.max_entries;
+  if (max_entries > MAX_LIST_ENTRIES) {
+    max_entries = MAX_LIST_ENTRIES;
+  }
+
+  int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX,
+          max_entries + 1, &keys);
+  if (rc < 0) {
+    return rc;
+  }
+
+  cls_timeindex_list_ret ret;
+
+  list<cls_timeindex_entry>& entries = ret.entries;
+  map<string, bufferlist>::iterator iter = keys.begin();
+
+  bool done = false;
+  string marker;
+
+  for (size_t i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) {
+    const string& index = iter->first;
+    bufferlist& bl = iter->second;
+
+    marker = index;
+    if (use_time_boundary && index.compare(0, to_index.size(), to_index) >= 0) {
+      CLS_LOG(20, "DEBUG: cls_timeindex_list: finishing on to_index=%s",
+              to_index.c_str());
+      done = true;
+      break;
+    }
+
+    cls_timeindex_entry e;
+
+    if (parse_index(index, e.key_ts, e.key_ext) < 0) {
+      CLS_LOG(0, "ERROR: cls_timeindex_list: could not parse index=%s",
+              index.c_str());
+    } else {
+      CLS_LOG(20, "DEBUG: cls_timeindex_list: index=%s, key_ext=%s, bl.len = %d",
+              index.c_str(), e.key_ext.c_str(), bl.length());
+      e.value = bl;
+      entries.push_back(e);
+    }
+  }
+
+  if (iter == keys.end()) {
+    done = true;
+  }
+
+  ret.marker = marker;
+  ret.truncated = !done;
+
+  ::encode(ret, *out);
+
+  return 0;
+}
+
+
+static int cls_timeindex_trim(cls_method_context_t hctx,
+                              bufferlist * const in,
+                              bufferlist * const out)
+{
+  bufferlist::iterator in_iter = in->begin();
+
+  cls_timeindex_trim_op op;
+  try {
+    ::decode(op, in_iter);
+  } catch (buffer::error& err) {
+    CLS_LOG(1, "ERROR: cls_timeindex_trim: failed to decode entry");
+    return -EINVAL;
+  }
+
+  map<string, bufferlist> keys;
+
+  string from_index;
+  string to_index;
+
+  if (op.from_marker.empty()) {
+    get_index_time_prefix(op.from_time, from_index);
+  } else {
+    from_index = op.from_marker;
+  }
+
+  if (op.to_marker.empty()) {
+    get_index_time_prefix(op.to_time, to_index);
+  } else {
+    to_index = op.to_marker;
+  }
+
+  int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX,
+          MAX_TRIM_ENTRIES, &keys);
+  if (rc < 0) {
+    return rc;
+  }
+
+  map<string, bufferlist>::iterator iter = keys.begin();
+
+  bool removed = false;
+  for (size_t i = 0; i < MAX_TRIM_ENTRIES && iter != keys.end(); ++i, ++iter) {
+    const string& index = iter->first;
+
+    CLS_LOG(20, "index=%s to_index=%s", index.c_str(), to_index.c_str());
+
+    if (index.compare(0, to_index.size(), to_index) > 0) {
+      CLS_LOG(20, "DEBUG: cls_timeindex_trim: finishing on to_index=%s",
+              to_index.c_str());
+      break;
+    }
+
+    CLS_LOG(20, "removing key: index=%s", index.c_str());
+
+    int rc = cls_cxx_map_remove_key(hctx, index);
+    if (rc < 0) {
+      CLS_LOG(1, "ERROR: cls_cxx_map_remove_key failed rc=%d", rc);
+      return rc;
+    }
+
+    removed = true;
+  }
+
+  if (!removed) {
+    return -ENODATA;
+  }
+
+  return 0;
+}
+
+void __cls_init()
+{
+  CLS_LOG(1, "Loaded timeindex class!");
+
+  cls_register("timeindex", &h_class);
+
+  /* timeindex */
+  cls_register_cxx_method(h_class, "add", CLS_METHOD_RD | CLS_METHOD_WR,
+          cls_timeindex_add, &h_timeindex_add);
+  cls_register_cxx_method(h_class, "list", CLS_METHOD_RD,
+          cls_timeindex_list, &h_timeindex_list);
+  cls_register_cxx_method(h_class, "trim", CLS_METHOD_RD | CLS_METHOD_WR,
+          cls_timeindex_trim, &h_timeindex_trim);
+
+  return;
+}
+
diff --git a/src/cls/timeindex/cls_timeindex_client.cc b/src/cls/timeindex/cls_timeindex_client.cc
new file mode 100644
index 0000000..6b9abce
--- /dev/null
+++ b/src/cls/timeindex/cls_timeindex_client.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <errno.h>
+
+#include "include/types.h"
+#include "cls/timeindex/cls_timeindex_ops.h"
+#include "include/rados/librados.hpp"
+#include "include/compat.h"
+
+
+using namespace librados;
+
+
+void cls_timeindex_add(librados::ObjectWriteOperation& op, list<cls_timeindex_entry>& entries)
+{
+  bufferlist in;
+  cls_timeindex_add_op call;
+
+  call.entries = entries;
+
+  ::encode(call, in);
+  op.exec("timeindex", "add", in);
+}
+
+void cls_timeindex_add(librados::ObjectWriteOperation& op, cls_timeindex_entry& entry)
+{
+  bufferlist in;
+  cls_timeindex_add_op call;
+
+  call.entries.push_back(entry);
+
+  ::encode(call, in);
+  op.exec("timeindex", "add", in);
+}
+
+void cls_timeindex_add_prepare_entry(cls_timeindex_entry& entry,
+                                     const utime_t& key_timestamp,
+                                     const string& key_ext,
+                                     const bufferlist& bl)
+{
+  entry.key_ts  = key_timestamp;
+  entry.key_ext = key_ext;
+  entry.value   = bl;
+}
+
+void cls_timeindex_add(librados::ObjectWriteOperation& op,
+                       const utime_t& key_timestamp,
+                       const string& key_ext,
+                       const bufferlist& bl)
+{
+  cls_timeindex_entry entry;
+
+  cls_timeindex_add_prepare_entry(entry, key_timestamp, key_ext, bl);
+  cls_timeindex_add(op, entry);
+}
+
+void cls_timeindex_trim(librados::ObjectWriteOperation& op,
+                        const utime_t& from_time,
+                        const utime_t& to_time,
+                        const string& from_marker,
+                        const string& to_marker)
+{
+  bufferlist in;
+  cls_timeindex_trim_op call;
+
+  call.from_time   = from_time;
+  call.to_time     = to_time;
+  call.from_marker = from_marker;
+  call.to_marker   = to_marker;
+
+  ::encode(call, in);
+
+  op.exec("timeindex", "trim", in);
+}
+
+int cls_timeindex_trim(librados::IoCtx& io_ctx,
+                       const string& oid,
+                       const utime_t& from_time,
+                       const utime_t& to_time,
+                       const string& from_marker,
+                       const string& to_marker)
+{
+  bool done = false;
+
+  do {
+    ObjectWriteOperation op;
+
+    cls_timeindex_trim(op, from_time, to_time, from_marker, to_marker);
+
+    int r = io_ctx.operate(oid, &op);
+    if (r == -ENODATA) {
+      done = true;
+    } else if (r < 0) {
+      return r;
+    }
+
+  } while (!done);
+
+  return 0;
+}
+
+class TimeindexListCtx : public ObjectOperationCompletion {
+  list<cls_timeindex_entry> *entries;
+  string *marker;
+  bool *truncated;
+
+public:
+  TimeindexListCtx(list<cls_timeindex_entry> *_entries,
+                   string *_marker,
+                   bool *_truncated)
+    : entries(_entries), marker(_marker), truncated(_truncated) {}
+
+  void handle_completion(int r, bufferlist& outbl) {
+    if (r >= 0) {
+      cls_timeindex_list_ret ret;
+      try {
+        bufferlist::iterator iter = outbl.begin();
+        ::decode(ret, iter);
+        if (entries) {
+          *entries = ret.entries;
+        }
+        if (truncated) {
+          *truncated = ret.truncated;
+        }
+        if (marker) {
+          *marker = ret.marker;
+        }
+      } catch (buffer::error& err) {
+        // nothing we can do about it atm
+      }
+    }
+  }
+};
+
+void cls_timeindex_list(librados::ObjectReadOperation& op,
+                        const utime_t& from,
+                        const utime_t& to,
+                        const string& in_marker,
+                        const int max_entries,
+                        list<cls_timeindex_entry>& entries,
+                        string *out_marker,
+                        bool *truncated)
+{
+  bufferlist inbl;
+  cls_timeindex_list_op call;
+
+  call.from_time = from;
+  call.to_time = to;
+  call.marker = in_marker;
+  call.max_entries = max_entries;
+
+  ::encode(call, inbl);
+
+  op.exec("timeindex", "list", inbl,
+          new TimeindexListCtx(&entries, out_marker, truncated));
+}
diff --git a/src/cls/timeindex/cls_timeindex_client.h b/src/cls/timeindex/cls_timeindex_client.h
new file mode 100644
index 0000000..09a420f
--- /dev/null
+++ b/src/cls/timeindex/cls_timeindex_client.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLS_TIMEINDEX_CLIENT_H
+#define CEPH_CLS_TIMEINDEX_CLIENT_H
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "cls_timeindex_types.h"
+
+/*
+ * timeindex objclass
+ */
+
+void cls_timeindex_add_prepare_entry(cls_timeindex_entry& entry,
+                                     const utime_t& key_timestamp,
+                                     const string& key_ext,
+                                     bufferlist& bl);
+
+void cls_timeindex_add(librados::ObjectWriteOperation& op,
+                       const list<cls_timeindex_entry>& entry);
+
+void cls_timeindex_add(librados::ObjectWriteOperation& op,
+                       const cls_timeindex_entry& entry);
+
+void cls_timeindex_add(librados::ObjectWriteOperation& op,
+                       const utime_t& timestamp,
+                       const string& name,
+                       const bufferlist& bl);
+
+void cls_timeindex_list(librados::ObjectReadOperation& op,
+                        const utime_t& from,
+                        const utime_t& to,
+                        const string& in_marker,
+                        const int max_entries,
+                        list<cls_timeindex_entry>& entries,
+                        string *out_marker,
+                        bool *truncated);
+
+void cls_timeindex_trim(librados::ObjectWriteOperation& op,
+                        const utime_t& from_time,
+                        const utime_t& to_time,
+                        const string& from_marker = std::string(),
+                        const string& to_marker   = std::string());
+
+int cls_timeindex_trim(librados::IoCtx& io_ctx,
+                       const string& oid,
+                       const utime_t& from_time,
+                       const utime_t& to_time,
+                       const string& from_marker  = std::string(),
+                       const string& to_marker    = std::string());
+#endif
diff --git a/src/cls/timeindex/cls_timeindex_ops.h b/src/cls/timeindex/cls_timeindex_ops.h
new file mode 100644
index 0000000..e5498f7
--- /dev/null
+++ b/src/cls/timeindex/cls_timeindex_ops.h
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLS_TIMEINDEX_OPS_H
+#define CEPH_CLS_TIMEINDEX_OPS_H
+
+#include "include/types.h"
+#include "cls_timeindex_types.h"
+
+struct cls_timeindex_add_op {
+  list<cls_timeindex_entry> entries;
+
+  cls_timeindex_add_op() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(entries, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(entries, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(cls_timeindex_add_op)
+
+struct cls_timeindex_list_op {
+  utime_t from_time;
+  string marker; /* if not empty, overrides from_time */
+  utime_t to_time; /* not inclusive */
+  int max_entries; /* upperbound to returned num of entries
+                      might return less than that and still be truncated */
+
+  cls_timeindex_list_op() : max_entries(0) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(from_time, bl);
+    ::encode(marker, bl);
+    ::encode(to_time, bl);
+    ::encode(max_entries, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(from_time, bl);
+    ::decode(marker, bl);
+    ::decode(to_time, bl);
+    ::decode(max_entries, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(cls_timeindex_list_op)
+
+struct cls_timeindex_list_ret {
+  list<cls_timeindex_entry> entries;
+  string marker;
+  bool truncated;
+
+  cls_timeindex_list_ret() : truncated(false) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(entries, bl);
+    ::encode(marker, bl);
+    ::encode(truncated, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(entries, bl);
+    ::decode(marker, bl);
+    ::decode(truncated, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(cls_timeindex_list_ret)
+
+
+/*
+ * operation will return 0 when successfully removed but not done. Will return
+ * -ENODATA when done, so caller needs to repeat sending request until that.
+ */
+struct cls_timeindex_trim_op {
+  utime_t from_time;
+  utime_t to_time; /* inclusive */
+  string from_marker;
+  string to_marker;
+
+  cls_timeindex_trim_op() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(from_time, bl);
+    ::encode(to_time, bl);
+    ::encode(from_marker, bl);
+    ::encode(to_marker, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(from_time, bl);
+    ::decode(to_time, bl);
+    ::decode(from_marker, bl);
+    ::decode(to_marker, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(cls_timeindex_trim_op)
+
+#endif /* CEPH_CLS_TIMEINDEX_OPS_H */
diff --git a/src/cls/timeindex/cls_timeindex_types.h b/src/cls/timeindex/cls_timeindex_types.h
new file mode 100644
index 0000000..afb7d07
--- /dev/null
+++ b/src/cls/timeindex/cls_timeindex_types.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLS_TIMEINDEX_TYPES_H
+#define CEPH_CLS_TIMEINDEX_TYPES_H
+
+#include "include/encoding.h"
+#include "include/types.h"
+
+#include "include/utime.h"
+
+class JSONObj;
+
+struct cls_timeindex_entry {
+  /* Mandatory timestamp. Will be part of the key. */
+  utime_t key_ts;
+  /* Not mandatory. The name_ext field, if not empty, will form second
+   * part of the key. */
+  string key_ext;
+  /* Become value of OMAP-based mapping. */
+  bufferlist value;
+
+  cls_timeindex_entry() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(key_ts, bl);
+    ::encode(key_ext, bl);
+    ::encode(value, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(key_ts, bl);
+    ::decode(key_ext, bl);
+    ::decode(value, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(cls_timeindex_entry)
+
+#endif /* CEPH_CLS_TIMEINDEX_TYPES_H */
diff --git a/src/cls/user/cls_user.cc b/src/cls/user/cls_user.cc
index 003a834..3a91112 100644
--- a/src/cls/user/cls_user.cc
+++ b/src/cls/user/cls_user.cc
@@ -293,7 +293,7 @@ static int cls_user_list_buckets(cls_method_context_t hctx, bufferlist *in, buff
 
 #define MAX_ENTRIES 1000
   size_t max_entries = op.max_entries;
-  if (!max_entries || max_entries > MAX_ENTRIES)
+  if (max_entries > MAX_ENTRIES)
     max_entries = MAX_ENTRIES;
 
   string match_prefix;
diff --git a/src/cls/version/cls_version.cc b/src/cls/version/cls_version.cc
index ea38072..307ec33 100644
--- a/src/cls/version/cls_version.cc
+++ b/src/cls/version/cls_version.cc
@@ -15,6 +15,7 @@
 #include "common/Clock.h"
 
 #include "global/global_context.h"
+#include "include/compat.h"
 
 CLS_VER(1,0)
 CLS_NAME(version)
diff --git a/src/common/ConfUtils.cc b/src/common/ConfUtils.cc
index 5efde8d..1ae5df5 100644
--- a/src/common/ConfUtils.cc
+++ b/src/common/ConfUtils.cc
@@ -24,6 +24,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <iostream>
 
 #include "include/buffer.h"
 #include "common/errno.h"
diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc
index f3f4107..7ebbe05 100644
--- a/src/common/Finisher.cc
+++ b/src/common/Finisher.cc
@@ -20,9 +20,11 @@ void Finisher::stop()
   ldout(cct, 10) << __func__ << dendl;
   finisher_lock.Lock();
   finisher_stop = true;
+  // we don't have any new work to do, but we want the worker to wake up anyway
+  // to process the stop condition.
   finisher_cond.Signal();
   finisher_lock.Unlock();
-  finisher_thread.join();
+  finisher_thread.join(); // wait until the worker exits completely
   ldout(cct, 10) << __func__ << " finish" << dendl;
 }
 
@@ -43,7 +45,10 @@ void *Finisher::finisher_thread_entry()
   ldout(cct, 10) << "finisher_thread start" << dendl;
 
   while (!finisher_stop) {
+    /// Every time we are woken up, we process the queue until it is empty.
     while (!finisher_queue.empty()) {
+      // To reduce lock contention, we swap out the queue to process.
+      // This way other threads can submit new contexts to complete while we are working.
       vector<Context*> ls;
       list<pair<Context*,int> > ls_rval;
       ls.swap(finisher_queue);
@@ -52,12 +57,17 @@ void *Finisher::finisher_thread_entry()
       finisher_lock.Unlock();
       ldout(cct, 10) << "finisher_thread doing " << ls << dendl;
 
+      // Now actually process the contexts.
       for (vector<Context*>::iterator p = ls.begin();
 	   p != ls.end();
 	   ++p) {
 	if (*p) {
 	  (*p)->complete(0);
 	} else {
+	  // When an item is NULL in the finisher_queue, it means
+	  // we should instead process an item from finisher_queue_rval,
+	  // which has a parameter for complete() other than zero.
+	  // This preserves the order while saving some storage.
 	  assert(!ls_rval.empty());
 	  Context *c = ls_rval.front().first;
 	  c->complete(ls_rval.front().second);
@@ -80,6 +90,8 @@ void *Finisher::finisher_thread_entry()
     ldout(cct, 10) << "finisher_thread sleeping" << dendl;
     finisher_cond.Wait(finisher_lock);
   }
+  // If we are exiting, we signal the thread waiting in stop(),
+  // otherwise it would never unblock
   finisher_empty_cond.Signal();
 
   ldout(cct, 10) << "finisher_thread stop" << dendl;
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
index 3a5b4e3..8767445 100644
--- a/src/common/Finisher.h
+++ b/src/common/Finisher.h
@@ -23,19 +23,36 @@
 
 class CephContext;
 
+/// Finisher queue length performance counter ID.
 enum {
   l_finisher_first = 997082,
   l_finisher_queue_len,
   l_finisher_last
 };
 
+/** @brief Asynchronous cleanup class.
+ * Finisher asynchronously completes Contexts, which are simple classes
+ * representing callbacks, in a dedicated worker thread. Enqueuing
+ * contexts to complete is thread-safe.
+ */
 class Finisher {
   CephContext *cct;
-  Mutex          finisher_lock;
-  Cond           finisher_cond, finisher_empty_cond;
-  bool           finisher_stop, finisher_running;
+  Mutex        finisher_lock; ///< Protects access to queues and finisher_running.
+  Cond         finisher_cond; ///< Signaled when there is something to process.
+  Cond         finisher_empty_cond; ///< Signaled when the finisher has nothing more to process.
+  bool         finisher_stop; ///< Set when the finisher should stop.
+  bool         finisher_running; ///< True when the finisher is currently executing contexts.
+  /// Queue for contexts for which complete(0) will be called.
+  /// NULLs in this queue indicate that an item from finisher_queue_rval
+  /// should be completed in that place instead.
   vector<Context*> finisher_queue;
+
+  /// Queue for contexts for which the complete function will be called
+  /// with a parameter other than 0.
   list<pair<Context*,int> > finisher_queue_rval;
+
+  /// Performance counter for the finisher's queue length.
+  /// Only active for named finishers.
   PerfCounters *logger;
   
   void *finisher_thread_entry();
@@ -47,6 +64,7 @@ class Finisher {
   } finisher_thread;
 
  public:
+  /// Add a context to complete, optionally specifying a parameter for the complete function.
   void queue(Context *c, int r = 0) {
     finisher_lock.Lock();
     if (finisher_queue.empty()) {
@@ -94,17 +112,32 @@ class Finisher {
     finisher_lock.Unlock();
     ls.clear();
   }
-  
+
+  /// Start the worker thread.
   void start();
+
+  /** @brief Stop the worker thread.
+   *
+   * Does not wait until all outstanding contexts are completed.
+   * To ensure that everything finishes, you should first shut down
+   * all sources that can add contexts to this finisher and call
+   * wait_for_empty() before calling stop(). */
   void stop();
 
+  /** @brief Blocks until the finisher has nothing left to process.
+   * This function will also return when a concurrent call to stop()
+   * finishes, but this class should never be used in this way. */
   void wait_for_empty();
 
+  /// Construct an anonymous Finisher.
+  /// Anonymous finishers do not log their queue length.
   Finisher(CephContext *cct_) :
     cct(cct_), finisher_lock("Finisher::finisher_lock"),
     finisher_stop(false), finisher_running(false),
     logger(0),
     finisher_thread(this) {}
+
+  /// Construct a named Finisher that logs its queue length.
   Finisher(CephContext *cct_, string name) :
     cct(cct_), finisher_lock("Finisher::finisher_lock"),
     finisher_stop(false), finisher_running(false),
@@ -126,6 +159,7 @@ class Finisher {
   }
 };
 
+/// Context that is completed asynchronously on the supplied finisher.
 class C_OnFinisher : public Context {
   Context *con;
   Finisher *fin;
diff --git a/src/common/Formatter.h b/src/common/Formatter.h
index c61138d..181a0e0 100644
--- a/src/common/Formatter.h
+++ b/src/common/Formatter.h
@@ -6,10 +6,9 @@
 #include "include/int_types.h"
 
 #include <deque>
-#include <iostream>
+#include <iosfwd>
 #include <list>
 #include <vector>
-#include <ostream>
 #include <sstream>
 #include <stdarg.h>
 #include <string>
@@ -63,6 +62,12 @@ namespace ceph {
     {
       dump_format_unquoted(name, "%s", (b ? "true" : "false"));
     }
+    template<typename T>
+    void dump_object(const char *name, const T& foo) {
+      open_object_section(name);
+      foo.dump(this);
+      close_section();
+    }
     virtual std::ostream& dump_stream(const char *name) = 0;
     virtual void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) = 0;
     virtual void dump_format(const char *name, const char *fmt, ...);
diff --git a/src/common/HeartbeatMap.cc b/src/common/HeartbeatMap.cc
index 9787f73..f2bf02d 100644
--- a/src/common/HeartbeatMap.cc
+++ b/src/common/HeartbeatMap.cc
@@ -32,7 +32,9 @@ namespace ceph {
 HeartbeatMap::HeartbeatMap(CephContext *cct)
   : m_cct(cct),
     m_rwlock("HeartbeatMap::m_rwlock"),
-    m_inject_unhealthy_until(0)
+    m_inject_unhealthy_until(0),
+    m_unhealthy_workers(0),
+    m_total_workers(0)
 {
 }
 
@@ -41,7 +43,7 @@ HeartbeatMap::~HeartbeatMap()
   assert(m_workers.empty());
 }
 
-heartbeat_handle_d *HeartbeatMap::add_worker(string name)
+heartbeat_handle_d *HeartbeatMap::add_worker(const string& name)
 {
   m_rwlock.get_write();
   ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
@@ -52,7 +54,7 @@ heartbeat_handle_d *HeartbeatMap::add_worker(string name)
   return h;
 }
 
-void HeartbeatMap::remove_worker(heartbeat_handle_d *h)
+void HeartbeatMap::remove_worker(const heartbeat_handle_d *h)
 {
   m_rwlock.get_write();
   ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl;
@@ -61,7 +63,7 @@ void HeartbeatMap::remove_worker(heartbeat_handle_d *h)
   delete h;
 }
 
-bool HeartbeatMap::_check(heartbeat_handle_d *h, const char *who, time_t now)
+bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who, time_t now)
 {
   bool healthy = true;
   time_t was;
@@ -109,6 +111,8 @@ void HeartbeatMap::clear_timeout(heartbeat_handle_d *h)
 
 bool HeartbeatMap::is_healthy()
 {
+  int unhealthy = 0;
+  int total = 0;
   m_rwlock.get_read();
   time_t now = time(NULL);
   if (m_cct->_conf->heartbeat_inject_failure) {
@@ -129,13 +133,30 @@ bool HeartbeatMap::is_healthy()
     heartbeat_handle_d *h = *p;
     if (!_check(h, "is_healthy", now)) {
       healthy = false;
+      unhealthy++;
     }
+    total++;
   }
   m_rwlock.put_read();
-  ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY") << dendl;
+
+  m_unhealthy_workers.set(unhealthy);
+  m_total_workers.set(total);
+
+  ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY")
+    << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl;
   return healthy;
 }
 
+int HeartbeatMap::get_unhealthy_workers() const
+{
+  return m_unhealthy_workers.read();
+}
+
+int HeartbeatMap::get_total_workers() const
+{
+  return m_total_workers.read();
+}
+
 void HeartbeatMap::check_touch_file()
 {
   if (is_healthy()) {
diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h
index a4aee48..61c2f90 100644
--- a/src/common/HeartbeatMap.h
+++ b/src/common/HeartbeatMap.h
@@ -41,7 +41,7 @@ namespace ceph {
  */
 
 struct heartbeat_handle_d {
-  std::string name;
+  const std::string name;
   atomic_t timeout, suicide_timeout;
   time_t grace, suicide_grace;
   std::list<heartbeat_handle_d*>::iterator list_item;
@@ -54,8 +54,8 @@ struct heartbeat_handle_d {
 class HeartbeatMap {
  public:
   // register/unregister
-  heartbeat_handle_d *add_worker(std::string name);
-  void remove_worker(heartbeat_handle_d *h);
+  heartbeat_handle_d *add_worker(const std::string& name);
+  void remove_worker(const heartbeat_handle_d *h);
 
   // reset the timeout so that it expects another touch within grace amount of time
   void reset_timeout(heartbeat_handle_d *h, time_t grace, time_t suicide_grace);
@@ -68,6 +68,12 @@ class HeartbeatMap {
   // touch cct->_conf->heartbeat_file if is_healthy()
   void check_touch_file();
 
+  // get the number of unhealthy workers
+  int get_unhealthy_workers() const;
+
+  // get the number of total workers
+  int get_total_workers() const;
+
   HeartbeatMap(CephContext *cct);
   ~HeartbeatMap();
 
@@ -76,8 +82,10 @@ class HeartbeatMap {
   RWLock m_rwlock;
   time_t m_inject_unhealthy_until;
   std::list<heartbeat_handle_d*> m_workers;
+  atomic_t m_unhealthy_workers;
+  atomic_t m_total_workers;
 
-  bool _check(heartbeat_handle_d *h, const char *who, time_t now);
+  bool _check(const heartbeat_handle_d *h, const char *who, time_t now);
 };
 
 }
diff --git a/src/common/Initialize.h b/src/common/Initialize.h
index 273a871..35414d6 100644
--- a/src/common/Initialize.h
+++ b/src/common/Initialize.h
@@ -59,8 +59,8 @@ class Initialize {
   /**
    * This form of constructor causes its function argument to be invoked
    * when the object is constructed.  When used with a static Initialize
-   * object, this will cause #func to run before main() runs, so that
-   * #func can perform once-only initialization.
+   * object, this will cause \p func to run before main() runs, so that
+   * \p func can perform once-only initialization.
    *
    * \param func
    *      This function is invoked with no arguments when the object is
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 620e550..f98cd18 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -73,12 +73,11 @@ libcommon_internal_la_SOURCES = \
 	common/module.c \
 	common/Readahead.cc \
 	common/Cycles.cc \
-	common/ContextCompletion.cc
+	common/ContextCompletion.cc \
+	common/TracepointProvider.cc
 
-if WITH_RBD
 libcommon_internal_la_SOURCES += \
 	common/blkdev.cc
-endif
 
 if ENABLE_XIO
 libcommon_internal_la_SOURCES += \
@@ -101,13 +100,6 @@ libcommon_internal_la_SOURCES += \
 LIBCOMMON_DEPS += libcommon_internal.la
 noinst_LTLIBRARIES += libcommon_internal.la
 
-libcommon_api_la_SOURCES = \
-	common/buffer.cc
-if LINUX
-libcommon_api_la_CXXFLAGS = -fvisibility=hidden -fvisibility-inlines-hidden
-endif # LINUX
-noinst_LTLIBRARIES += libcommon_api.la
-
 # inject crc in common
 libcommon_crc_la_SOURCES = \
 	common/sctp_crc32.c \
@@ -122,25 +114,34 @@ endif
 LIBCOMMON_DEPS += libcommon_crc.la
 noinst_LTLIBRARIES += libcommon_crc.la
 
+if HAVE_ARMV8_CRC
+libcommon_crc_aarch64_la_SOURCES = common/crc32c_aarch64.c
+libcommon_crc_aarch64_la_CFLAGS = $(AM_CFLAGS) $(ARM_CRC_FLAGS)
+LIBCOMMON_DEPS += libcommon_crc_aarch64.la
+noinst_LTLIBRARIES += libcommon_crc_aarch64.la
+endif
+
 noinst_HEADERS += \
 	common/bloom_filter.hpp \
 	common/sctp_crc32.h \
 	common/crc32c_intel_baseline.h \
-	common/crc32c_intel_fast.h
+	common/crc32c_intel_fast.h \
+	common/crc32c_aarch64.h
 
 
 # important; libmsg before libauth!
 LIBCOMMON_DEPS += \
 	$(LIBERASURE_CODE) \
 	$(LIBMSG) $(LIBAUTH) \
-	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH)
+	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
+	$(BOOST_RANDOM_LIBS)
 
 if LINUX
-LIBCOMMON_DEPS += -lrt
+LIBCOMMON_DEPS += -lrt -lblkid
 endif # LINUX
 
-libcommon_la_SOURCES =
-libcommon_la_LIBADD = $(LIBCOMMON_DEPS) libcommon_api.la
+libcommon_la_SOURCES = common/buffer.cc
+libcommon_la_LIBADD = $(LIBCOMMON_DEPS)
 noinst_LTLIBRARIES += libcommon.la
 
 noinst_HEADERS += \
@@ -234,7 +235,9 @@ noinst_HEADERS += \
 	common/Initialize.h \
 	common/ContextCompletion.h \
 	common/bit_vector.hpp \
-	common/valgrind.h
+	common/SubProcess.h \
+	common/valgrind.h \
+	common/TracepointProvider.h
 
 if ENABLE_XIO
 noinst_HEADERS += \
diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc
index 808513e..5e9b590 100644
--- a/src/common/Mutex.cc
+++ b/src/common/Mutex.cc
@@ -30,7 +30,7 @@ Mutex::Mutex(const std::string &n, bool r, bool ld,
   if (cct) {
     PerfCountersBuilder b(cct, string("mutex-") + name,
 			  l_mutex_first, l_mutex_last);
-    b.add_time_avg(l_mutex_wait, "wait");
+    b.add_time_avg(l_mutex_wait, "wait", "Average time of mutex in locked state");
     logger = b.create_perf_counters();
     cct->get_perfcounters_collection()->add(logger);
     logger->set(l_mutex_wait, 0);
@@ -82,21 +82,26 @@ Mutex::~Mutex() {
 }
 
 void Mutex::Lock(bool no_lockdep) {
-  utime_t start;
   int r;
 
   if (lockdep && g_lockdep && !no_lockdep) _will_lock();
 
-  if (TryLock()) {
-    goto out;
-  }
-
-  if (logger && cct && cct->_conf->mutex_perf_counter)
+  if (logger && cct && cct->_conf->mutex_perf_counter) {
+    utime_t start;
+    // instrumented mutex enabled
     start = ceph_clock_now(cct);
-  r = pthread_mutex_lock(&_m);
-  if (logger && cct && cct->_conf->mutex_perf_counter)
+    if (TryLock()) {
+      goto out;
+    }
+
+    r = pthread_mutex_lock(&_m);
+
     logger->tinc(l_mutex_wait,
 		 ceph_clock_now(cct) - start);
+  } else {
+    r = pthread_mutex_lock(&_m);
+  }
+
   assert(r == 0);
   if (lockdep && g_lockdep) _locked();
   _post_lock();
diff --git a/src/common/OutputDataSocket.cc b/src/common/OutputDataSocket.cc
index 2c4526d..e43f5cf 100644
--- a/src/common/OutputDataSocket.cc
+++ b/src/common/OutputDataSocket.cc
@@ -179,14 +179,14 @@ std::string OutputDataSocket::bind_and_listen(const std::string &sock_path, int
   address.sun_family = AF_UNIX;
   snprintf(address.sun_path, sizeof(address.sun_path),
 	   "%s", sock_path.c_str());
-  if (bind(sock_fd, (struct sockaddr*)&address,
+  if (::bind(sock_fd, (struct sockaddr*)&address,
 	   sizeof(struct sockaddr_un)) != 0) {
     int err = errno;
     if (err == EADDRINUSE) {
       // The old UNIX domain socket must still be there.
       // Let's unlink it and try again.
       VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str()));
-      if (bind(sock_fd, (struct sockaddr*)&address,
+      if (::bind(sock_fd, (struct sockaddr*)&address,
 	       sizeof(struct sockaddr_un)) == 0) {
 	err = 0;
       }
diff --git a/src/common/Preforker.h b/src/common/Preforker.h
index c28fd13..446f1c9 100644
--- a/src/common/Preforker.h
+++ b/src/common/Preforker.h
@@ -3,11 +3,17 @@
 #ifndef CEPH_COMMON_PREFORKER_H
 #define CEPH_COMMON_PREFORKER_H
 
+#include "acconfig.h"
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/wait.h>
 #include <errno.h>
+#include <stdlib.h>
 #include <unistd.h>
+#include <sstream>
+#include <string>
+
+#include "include/assert.h"
 #include "common/safe_io.h"
 #include "common/errno.h"
 
@@ -31,22 +37,31 @@ public:
       forked(false)
   {}
 
-  void prefork() {
+  int prefork(std::string &err) {
     assert(!forked);
     int r = socketpair(AF_UNIX, SOCK_STREAM, 0, fd);
+    std::ostringstream oss;
     if (r < 0) {
-      cerr << "[" << getpid() << "]: unable to create socketpair: " << cpp_strerror(errno) << std::endl;
-      exit(errno);
+      oss << "[" << getpid() << "]: unable to create socketpair: " << cpp_strerror(errno);
+      err = oss.str();
+      return r;
     }
 
     forked = true;
 
     childpid = fork();
+    if (childpid < 0) {
+      r = -errno;
+      oss << "[" << getpid() << "]: unable to fork: " << cpp_strerror(errno);
+      err = oss.str();
+      return r;
+    }
     if (childpid == 0) {
       ::close(fd[0]);
     } else {
       ::close(fd[1]);
     }
+    return 0;
   }
 
   bool is_child() {
@@ -57,10 +72,11 @@ public:
     return childpid != 0;
   }
 
-  int parent_wait() {
+  int parent_wait(std::string &err_msg) {
     assert(forked);
 
     int r = -1;
+    std::ostringstream oss;
     int err = safe_read_exact(fd[0], &r, sizeof(r));
     if (err == 0 && r == -1) {
       // daemonize
@@ -69,12 +85,25 @@ public:
       ::close(2);
       r = 0;
     } else if (err) {
-      cerr << "[" << getpid() << "]: " << cpp_strerror(err) << std::endl;
+      oss << "[" << getpid() << "]: " << cpp_strerror(err);
     } else {
       // wait for child to exit
-      waitpid(childpid, NULL, 0);
+      int status;
+      err = waitpid(childpid, &status, 0);
+      if (err < 0) {
+        oss << "[" << getpid() << "]" << " waitpid error: " << cpp_strerror(err);
+      } else if (WIFSIGNALED(status)) {
+        oss << "[" << getpid() << "]" << " exited with a signal";
+      } else if (!WIFEXITED(status)) {
+        oss << "[" << getpid() << "]" << " did not exit normally";
+      } else {
+        err = WEXITSTATUS(status);
+        if (err != 0)
+         oss << "[" << getpid() << "]" << " returned exit_status " << cpp_strerror(err);
+      }
     }
-    return r;
+    err_msg = oss.str();
+    return err;
   }
 
   int signal_exit(int r) {
diff --git a/src/common/PrioritizedQueue.h b/src/common/PrioritizedQueue.h
index ee1dc9c..5ae94a5 100644
--- a/src/common/PrioritizedQueue.h
+++ b/src/common/PrioritizedQueue.h
@@ -49,13 +49,14 @@ class PrioritizedQueue {
   int64_t max_tokens_per_subqueue;
   int64_t min_cost;
 
+  typedef std::list<std::pair<unsigned, T> > ListPairs;
   template <class F>
   static unsigned filter_list_pairs(
-    list<pair<unsigned, T> > *l, F f,
-    list<T> *out) {
+    ListPairs *l, F f,
+    std::list<T> *out) {
     unsigned ret = 0;
     if (out) {
-      for (typename list<pair<unsigned, T> >::reverse_iterator i = l->rbegin();
+      for (typename ListPairs::reverse_iterator i = l->rbegin();
 	   i != l->rend();
 	   ++i) {
 	if (f(i->second)) {
@@ -63,7 +64,7 @@ class PrioritizedQueue {
 	}
       }
     }
-    for (typename list<pair<unsigned, T> >::iterator i = l->begin();
+    for (typename ListPairs::iterator i = l->begin();
 	 i != l->end();
       ) {
       if (f(i->second)) {
@@ -78,10 +79,11 @@ class PrioritizedQueue {
 
   struct SubQueue {
   private:
-    map<K, list<pair<unsigned, T> > > q;
+    typedef std::map<K, ListPairs> Classes;
+    Classes q;
     unsigned tokens, max_tokens;
     int64_t size;
-    typename map<K, list<pair<unsigned, T> > >::iterator cur;
+    typename Classes::iterator cur;
   public:
     SubQueue(const SubQueue &other)
       : q(other.q),
@@ -114,18 +116,18 @@ class PrioritizedQueue {
 	tokens = 0;
     }
     void enqueue(K cl, unsigned cost, T item) {
-      q[cl].push_back(make_pair(cost, item));
+      q[cl].push_back(std::make_pair(cost, item));
       if (cur == q.end())
 	cur = q.begin();
       size++;
     }
     void enqueue_front(K cl, unsigned cost, T item) {
-      q[cl].push_front(make_pair(cost, item));
+      q[cl].push_front(std::make_pair(cost, item));
       if (cur == q.end())
 	cur = q.begin();
       size++;
     }
-    pair<unsigned, T> front() const {
+    std::pair<unsigned, T> front() const {
       assert(!(q.empty()));
       assert(cur != q.end());
       return cur->second.front();
@@ -150,8 +152,8 @@ class PrioritizedQueue {
       return q.empty();
     }
     template <class F>
-    void remove_by_filter(F f, list<T> *out) {
-      for (typename map<K, list<pair<unsigned, T> > >::iterator i = q.begin();
+    void remove_by_filter(F f, std::list<T> *out) {
+      for (typename Classes::iterator i = q.begin();
 	   i != q.end();
 	   ) {
 	size -= filter_list_pairs(&(i->second), f, out);
@@ -166,15 +168,15 @@ class PrioritizedQueue {
       if (cur == q.end())
 	cur = q.begin();
     }
-    void remove_by_class(K k, list<T> *out) {
-      typename map<K, list<pair<unsigned, T> > >::iterator i = q.find(k);
+    void remove_by_class(K k, std::list<T> *out) {
+      typename Classes::iterator i = q.find(k);
       if (i == q.end())
 	return;
       size -= i->second.size();
       if (i == cur)
 	++cur;
       if (out) {
-	for (typename list<pair<unsigned, T> >::reverse_iterator j =
+	for (typename ListPairs::reverse_iterator j =
 	       i->second.rbegin();
 	     j != i->second.rend();
 	     ++j) {
@@ -195,11 +197,13 @@ class PrioritizedQueue {
 	f->dump_int("first_item_cost", front().first);
     }
   };
-  map<unsigned, SubQueue> high_queue;
-  map<unsigned, SubQueue> queue;
+
+  typedef std::map<unsigned, SubQueue> SubQueues;
+  SubQueues high_queue;
+  SubQueues queue;
 
   SubQueue *create_queue(unsigned priority) {
-    typename map<unsigned, SubQueue>::iterator p = queue.find(priority);
+    typename SubQueues::iterator p = queue.find(priority);
     if (p != queue.end())
       return &p->second;
     total_priority += priority;
@@ -218,7 +222,7 @@ class PrioritizedQueue {
   void distribute_tokens(unsigned cost) {
     if (total_priority == 0)
       return;
-    for (typename map<unsigned, SubQueue>::iterator i = queue.begin();
+    for (typename SubQueues::iterator i = queue.begin();
 	 i != queue.end();
 	 ++i) {
       i->second.put_tokens(((i->first * cost) / total_priority) + 1);
@@ -234,13 +238,13 @@ public:
 
   unsigned length() const {
     unsigned total = 0;
-    for (typename map<unsigned, SubQueue>::const_iterator i = queue.begin();
+    for (typename SubQueues::const_iterator i = queue.begin();
 	 i != queue.end();
 	 ++i) {
       assert(i->second.length());
       total += i->second.length();
     }
-    for (typename map<unsigned, SubQueue>::const_iterator i = high_queue.begin();
+    for (typename SubQueues::const_iterator i = high_queue.begin();
 	 i != high_queue.end();
 	 ++i) {
       assert(i->second.length());
@@ -250,8 +254,8 @@ public:
   }
 
   template <class F>
-  void remove_by_filter(F f, list<T> *removed = 0) {
-    for (typename map<unsigned, SubQueue>::iterator i = queue.begin();
+  void remove_by_filter(F f, std::list<T> *removed = 0) {
+    for (typename SubQueues::iterator i = queue.begin();
 	 i != queue.end();
 	 ) {
       unsigned priority = i->first;
@@ -264,7 +268,7 @@ public:
 	++i;
       }
     }
-    for (typename map<unsigned, SubQueue>::iterator i = high_queue.begin();
+    for (typename SubQueues::iterator i = high_queue.begin();
 	 i != high_queue.end();
 	 ) {
       i->second.remove_by_filter(f, removed);
@@ -276,8 +280,8 @@ public:
     }
   }
 
-  void remove_by_class(K k, list<T> *out = 0) {
-    for (typename map<unsigned, SubQueue>::iterator i = queue.begin();
+  void remove_by_class(K k, std::list<T> *out = 0) {
+    for (typename SubQueues::iterator i = queue.begin();
 	 i != queue.end();
 	 ) {
       i->second.remove_by_class(k, out);
@@ -289,7 +293,7 @@ public:
 	++i;
       }
     }
-    for (typename map<unsigned, SubQueue>::iterator i = high_queue.begin();
+    for (typename SubQueues::iterator i = high_queue.begin();
 	 i != high_queue.end();
 	 ) {
       i->second.remove_by_class(k, out);
@@ -345,7 +349,7 @@ public:
     // if there are multiple buckets/subqueues with sufficient tokens,
     // we behave like a strict priority queue among all subqueues that
     // are eligible to run.
-    for (typename map<unsigned, SubQueue>::iterator i = queue.begin();
+    for (typename SubQueues::iterator i = queue.begin();
 	 i != queue.end();
 	 ++i) {
       assert(!(i->second.empty()));
@@ -377,7 +381,7 @@ public:
     f->dump_int("max_tokens_per_subqueue", max_tokens_per_subqueue);
     f->dump_int("min_cost", min_cost);
     f->open_array_section("high_queues");
-    for (typename map<unsigned, SubQueue>::const_iterator p = high_queue.begin();
+    for (typename SubQueues::const_iterator p = high_queue.begin();
 	 p != high_queue.end();
 	 ++p) {
       f->open_object_section("subqueue");
@@ -387,7 +391,7 @@ public:
     }
     f->close_section();
     f->open_array_section("queues");
-    for (typename map<unsigned, SubQueue>::const_iterator p = queue.begin();
+    for (typename SubQueues::const_iterator p = queue.begin();
 	 p != queue.end();
 	 ++p) {
       f->open_object_section("subqueue");
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
index c82a23c..47a8c87 100644
--- a/src/common/RWLock.h
+++ b/src/common/RWLock.h
@@ -29,6 +29,7 @@ class RWLock
   std::string name;
   mutable int id;
   mutable atomic_t nrlock, nwlock;
+  bool track;
 
   std::string unique_name(const char* name) const;
 
@@ -36,22 +37,25 @@ public:
   RWLock(const RWLock& other);
   const RWLock& operator=(const RWLock& other);
 
-  RWLock(const std::string &n) : name(n), id(-1), nrlock(0), nwlock(0) {
+  RWLock(const std::string &n, bool track_lock=true) : name(n), id(-1), nrlock(0), nwlock(0), track(track_lock) {
     pthread_rwlock_init(&L, NULL);
     if (g_lockdep) id = lockdep_register(name.c_str());
   }
 
   bool is_locked() const {
+    assert(track);
     return (nrlock.read() > 0) || (nwlock.read() > 0);
   }
 
   bool is_wlocked() const {
+    assert(track);
     return (nwlock.read() > 0);
   }
   virtual ~RWLock() {
     // The following check is racy but we are about to destroy
     // the object and we assume that there are no other users.
-    assert(!is_locked());
+    if (track)
+      assert(!is_locked());
     pthread_rwlock_destroy(&L);
     if (g_lockdep) {
       lockdep_unregister(id);
@@ -59,11 +63,13 @@ public:
   }
 
   void unlock(bool lockdep=true) const {
-    if (nwlock.read() > 0) {
-      nwlock.dec();
-    } else {
-      assert(nrlock.read() > 0);
-      nrlock.dec();
+    if (track) {
+      if (nwlock.read() > 0) {
+        nwlock.dec();
+      } else {
+        assert(nrlock.read() > 0);
+        nrlock.dec();
+      }
     }
     if (lockdep && g_lockdep) id = lockdep_will_unlock(name.c_str(), id);
     int r = pthread_rwlock_unlock(&L);
@@ -76,11 +82,13 @@ public:
     int r = pthread_rwlock_rdlock(&L);
     assert(r == 0);
     if (g_lockdep) id = lockdep_locked(name.c_str(), id);
-    nrlock.inc();
+    if (track)
+      nrlock.inc();
   }
   bool try_get_read() const {
     if (pthread_rwlock_tryrdlock(&L) == 0) {
-      nrlock.inc();
+      if (track)
+         nrlock.inc();
       if (g_lockdep) id = lockdep_locked(name.c_str(), id);
       return true;
     }
@@ -96,13 +104,15 @@ public:
     int r = pthread_rwlock_wrlock(&L);
     assert(r == 0);
     if (g_lockdep) id = lockdep_locked(name.c_str(), id);
-    nwlock.inc();
+    if (track)
+      nwlock.inc();
 
   }
   bool try_get_write(bool lockdep=true) {
     if (pthread_rwlock_trywrlock(&L) == 0) {
       if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id);
-      nwlock.inc();
+      if (track)
+         nwlock.inc();
       return true;
     }
     return false;
diff --git a/src/common/Readahead.cc b/src/common/Readahead.cc
index a3f5bfc..b1ee2e0 100644
--- a/src/common/Readahead.cc
+++ b/src/common/Readahead.cc
@@ -30,6 +30,10 @@ Readahead::extent_t Readahead::update(const vector<extent_t>& extents, uint64_t
   for (vector<extent_t>::const_iterator p = extents.begin(); p != extents.end(); ++p) {
     _observe_read(p->first, p->second);
   }
+  if (m_readahead_pos >= limit) {
+    m_lock.Unlock();
+    return extent_t(0, 0);
+  }
   pair<uint64_t, uint64_t> extent = _compute_readahead(limit);
   m_lock.Unlock();
   return extent;
@@ -38,6 +42,10 @@ Readahead::extent_t Readahead::update(const vector<extent_t>& extents, uint64_t
 Readahead::extent_t Readahead::update(uint64_t offset, uint64_t length, uint64_t limit) {
   m_lock.Lock();
   _observe_read(offset, length);
+  if (m_readahead_pos >= limit) {
+    m_lock.Unlock();
+    return extent_t(0, 0);
+  }
   extent_t extent = _compute_readahead(limit);
   m_lock.Unlock();
   return extent;
@@ -52,6 +60,7 @@ void Readahead::_observe_read(uint64_t offset, uint64_t length) {
     m_consec_read_bytes = 0;
     m_readahead_trigger_pos = 0;
     m_readahead_size = 0;
+    m_readahead_pos = 0;
   }
   m_last_pos = offset + length;
 }
@@ -70,6 +79,9 @@ Readahead::extent_t Readahead::_compute_readahead(uint64_t limit) {
       } else {
 	// continuing readahead trigger
 	m_readahead_size *= 2;
+	if (m_last_pos > m_readahead_pos) {
+	  m_readahead_pos = m_last_pos;
+	}
       }
       m_readahead_size = MAX(m_readahead_size, m_readahead_min_bytes);
       m_readahead_size = MIN(m_readahead_size, m_readahead_max_bytes);
diff --git a/src/common/RefCountedObj.h b/src/common/RefCountedObj.h
index 729bbb9..3755018 100644
--- a/src/common/RefCountedObj.h
+++ b/src/common/RefCountedObj.h
@@ -52,7 +52,7 @@ public:
     cct = c;
   }
 
-  uint64_t get_nref() {
+  uint64_t get_nref() const {
     return nref.read();
   }
 };
diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h
new file mode 100644
index 0000000..18d6e92
--- /dev/null
+++ b/src/common/SubProcess.h
@@ -0,0 +1,484 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 Mirantis Inc
+ *
+ * Author: Mykola Golub <mgolub at mirantis.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef SUB_PROCESS_H
+#define SUB_PROCESS_H
+
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <signal.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sstream>
+#include <vector>
+
+#include <include/assert.h>
+#include <common/errno.h>
+
+/**
+ * SubProcess:
+ * A helper class to spawn a subprocess.
+ *
+ * Example:
+ *
+ *   SubProcess cat("cat", true, true);
+ *   if (cat.spawn() != 0) {
+ *     std::cerr << "cat failed: " << cat.err() << std::endl;
+ *     return false;
+ *   }
+ *   write_to_fd(cat.stdout(), "hello world!\n");
+ *   cat.close_stdout();
+ *   read_from_fd(cat.stdin(), buf);
+ *   if (cat.join() != 0) {
+ *     std::cerr << cat.err() << std::endl;
+ *     return false;
+ *   }
+ */
+
+class SubProcess {
+public:
+  SubProcess(const char *cmd, bool pipe_stdin = false, bool pipe_stdout = false,
+	     bool pipe_stderr = false);
+  virtual ~SubProcess();
+
+  void add_cmd_args(const char *arg, ...);
+  void add_cmd_arg(const char *arg);
+
+  virtual int spawn(); // Returns 0 on success or -errno on failure.
+  virtual int join();  // Returns exit code (0 on success).
+
+  bool is_spawned() const { return pid > 0; }
+
+  int stdin() const;
+  int stdout() const;
+  int stderr() const;
+
+  void close_stdin();
+  void close_stdout();
+  void close_stderr();
+
+  void kill(int signo = SIGTERM) const;
+
+  const char* err() const;
+
+protected:
+  bool is_child() const { return pid == 0; }
+  virtual void exec();
+
+private:
+  void close(int &fd);
+
+protected:
+  std::string cmd;
+  std::vector<std::string> cmd_args;
+  bool pipe_stdin;
+  bool pipe_stdout;
+  bool pipe_stderr;
+  int stdin_pipe_out_fd;
+  int stdout_pipe_in_fd;
+  int stderr_pipe_in_fd;
+  int pid;
+  std::ostringstream errstr;
+};
+
+class SubProcessTimed : public SubProcess {
+public:
+  SubProcessTimed(const char *cmd, bool pipe_stdin = false,
+		  bool pipe_stdout = false, bool pipe_stderr = false,
+		  int timeout = 0, int sigkill = SIGKILL);
+
+protected:
+  virtual void exec();
+
+private:
+  int timeout;
+  int sigkill;
+};
+
+SubProcess::SubProcess(const char *cmd_, bool stdin, bool stdout, bool stderr) :
+  cmd(cmd_),
+  cmd_args(),
+  pipe_stdin(stdin),
+  pipe_stdout(stdout),
+  pipe_stderr(stderr),
+  stdin_pipe_out_fd(-1),
+  stdout_pipe_in_fd(-1),
+  stderr_pipe_in_fd(-1),
+  pid(-1),
+  errstr() {
+}
+
+SubProcess::~SubProcess() {
+  assert(!is_spawned());
+  assert(stdin_pipe_out_fd == -1);
+  assert(stdout_pipe_in_fd == -1);
+  assert(stderr_pipe_in_fd == -1);
+}
+
+void SubProcess::add_cmd_args(const char *arg, ...) {
+  assert(!is_spawned());
+
+  va_list ap;
+  va_start(ap, arg);
+  const char *p = arg;
+  do {
+    add_cmd_arg(p);
+    p = va_arg(ap, const char*);
+  } while (p != NULL);
+  va_end(ap);
+}
+
+void SubProcess::add_cmd_arg(const char *arg) {
+  assert(!is_spawned());
+
+  cmd_args.push_back(arg);
+}
+
+int SubProcess::stdin() const {
+  assert(is_spawned());
+  assert(pipe_stdin);
+
+  return stdin_pipe_out_fd;
+}
+
+int SubProcess::stdout() const {
+  assert(is_spawned());
+  assert(pipe_stdout);
+
+  return stdout_pipe_in_fd;
+}
+
+int SubProcess::stderr() const {
+  assert(is_spawned());
+  assert(pipe_stderr);
+
+  return stderr_pipe_in_fd;
+}
+
+void SubProcess::close(int &fd) {
+  if (fd == -1)
+    return;
+
+  ::close(fd);
+  fd = -1;
+}
+
+void SubProcess::close_stdin() {
+  assert(is_spawned());
+  assert(pipe_stdin);
+
+  close(stdin_pipe_out_fd);
+}
+
+void SubProcess::close_stdout() {
+  assert(is_spawned());
+  assert(pipe_stdout);
+
+  close(stdout_pipe_in_fd);
+}
+
+void SubProcess::close_stderr() {
+  assert(is_spawned());
+  assert(pipe_stderr);
+
+  close(stderr_pipe_in_fd);
+}
+
+void SubProcess::kill(int signo) const {
+  assert(is_spawned());
+
+  int ret = ::kill(pid, signo);
+  assert(ret == 0);
+}
+
+const char* SubProcess::err() const {
+  return errstr.str().c_str();
+}
+
+class fd_buf : public std::streambuf {
+  int fd;
+public:
+  fd_buf (int fd) : fd(fd)
+  {}
+protected:
+  int_type overflow (int_type c) override {
+    if (c == EOF) return EOF;
+    char buf = c;
+    if (write (fd, &buf, 1) != 1) {
+      return EOF;
+    }
+    return c;
+  }
+  std::streamsize xsputn (const char* s, std::streamsize count) override {
+    return write(fd, s, count);
+  }
+};
+
+int SubProcess::spawn() {
+  assert(!is_spawned());
+  assert(stdin_pipe_out_fd == -1);
+  assert(stdout_pipe_in_fd == -1);
+  assert(stderr_pipe_in_fd == -1);
+
+  enum { IN = 0, OUT = 1 };
+
+  int ipipe[2], opipe[2], epipe[2];
+
+  ipipe[0] = ipipe[1] = opipe[0] = opipe[1] = epipe[0] = epipe[1] = -1;
+
+  int ret = 0;
+
+  if ((pipe_stdin  && ::pipe(ipipe) == -1) ||
+      (pipe_stdout && ::pipe(opipe) == -1) ||
+      (pipe_stderr && ::pipe(epipe) == -1)) {
+    ret = -errno;
+    errstr << "pipe failed: " << cpp_strerror(errno);
+    goto fail;
+  }
+
+  pid = fork();
+
+  if (pid > 0) { // Parent
+    stdin_pipe_out_fd = ipipe[OUT]; close(ipipe[IN ]);
+    stdout_pipe_in_fd = opipe[IN ]; close(opipe[OUT]);
+    stderr_pipe_in_fd = epipe[IN ]; close(epipe[OUT]);
+    return 0;
+  }
+
+  if (pid == 0) { // Child
+    close(ipipe[OUT]);
+    close(opipe[IN ]);
+    close(epipe[IN ]);
+
+    if (ipipe[IN] != -1 && ipipe[IN] != STDIN_FILENO) {
+      ::dup2(ipipe[IN], STDIN_FILENO);
+      close(ipipe[IN]);
+    }
+    if (opipe[OUT] != -1 && opipe[OUT] != STDOUT_FILENO) {
+      ::dup2(opipe[OUT], STDOUT_FILENO);
+      close(opipe[OUT]);
+      static fd_buf buf(STDOUT_FILENO);
+      std::cout.rdbuf(&buf);
+    }
+    if (epipe[OUT] != -1 && epipe[OUT] != STDERR_FILENO) {
+      ::dup2(epipe[OUT], STDERR_FILENO);
+      close(epipe[OUT]);
+      static fd_buf buf(STDERR_FILENO);
+      std::cerr.rdbuf(&buf);
+    }
+
+    int maxfd = sysconf(_SC_OPEN_MAX);
+    if (maxfd == -1)
+      maxfd = 16384;
+    for (int fd = 0; fd <= maxfd; fd++) {
+      if (fd == STDIN_FILENO && pipe_stdin)
+	continue;
+      if (fd == STDOUT_FILENO && pipe_stdout)
+	continue;
+      if (fd == STDERR_FILENO && pipe_stderr)
+	continue;
+      ::close(fd);
+    }
+
+    exec();
+    assert(0); // Never reached
+  }
+
+  ret = -errno;
+  errstr << "fork failed: " << cpp_strerror(errno);
+
+fail:
+  close(ipipe[0]);
+  close(ipipe[1]);
+  close(opipe[0]);
+  close(opipe[1]);
+  close(epipe[0]);
+  close(epipe[1]);
+
+  return ret;
+}
+
+void SubProcess::exec() {
+  assert(is_child());
+
+  std::vector<const char *> args;
+  args.push_back(cmd.c_str());
+  for (std::vector<std::string>::iterator i = cmd_args.begin();
+       i != cmd_args.end();
+       i++) {
+    args.push_back(i->c_str());
+  }
+  args.push_back(NULL);
+
+  int ret = execvp(cmd.c_str(), (char * const *)&args[0]);
+  assert(ret == -1);
+
+  std::cerr << cmd << ": exec failed: " << cpp_strerror(errno) << "\n";
+  _exit(EXIT_FAILURE);
+}
+
+int SubProcess::join() {
+  assert(is_spawned());
+
+  close(stdin_pipe_out_fd);
+  close(stdout_pipe_in_fd);
+  close(stderr_pipe_in_fd);
+
+  int status;
+
+  while (waitpid(pid, &status, 0) == -1)
+    assert(errno == EINTR);
+
+  pid = -1;
+
+  if (WIFEXITED(status)) {
+    if (WEXITSTATUS(status) != EXIT_SUCCESS)
+      errstr << cmd << ": exit status: " << WEXITSTATUS(status);
+    return WEXITSTATUS(status);
+  }
+  if (WIFSIGNALED(status)) {
+    errstr << cmd << ": got signal: " << WTERMSIG(status);
+    return 128 + WTERMSIG(status);
+  }
+  errstr << cmd << ": waitpid: unknown status returned\n";
+  return EXIT_FAILURE;
+}
+
+SubProcessTimed::SubProcessTimed(const char *cmd, bool pipe_stdin,
+				 bool pipe_stdout, bool pipe_stderr,
+				 int timeout_, int sigkill_) :
+  SubProcess(cmd, pipe_stdin, pipe_stdout, pipe_stderr),
+  timeout(timeout_),
+  sigkill(sigkill_) {
+}
+
+static bool timedout = false; // only used after fork
+static void timeout_sighandler(int sig) {
+  timedout = true;
+}
+static void dummy_sighandler(int sig) {}
+
+void SubProcessTimed::exec() {
+  assert(is_child());
+
+  if (timeout <= 0) {
+    SubProcess::exec();
+    assert(0); // Never reached
+  }
+
+  sigset_t mask, oldmask;
+  int pid;
+
+  // Restore default action for SIGTERM in case the parent process decided
+  // to ignore it.
+  if (signal(SIGTERM, SIG_DFL) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Because SIGCHLD is ignored by default, setup dummy handler for it,
+  // so we can mask it.
+  if (signal(SIGCHLD, dummy_sighandler) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Setup timeout handler.
+  if (signal(SIGALRM, timeout_sighandler) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Block interesting signals.
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGINT);
+  sigaddset(&mask, SIGTERM);
+  sigaddset(&mask, SIGCHLD);
+  sigaddset(&mask, SIGALRM);
+  if (sigprocmask(SIG_SETMASK, &mask, &oldmask) == -1) {
+    std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  pid = fork();
+
+  if (pid == -1) {
+    std::cerr << cmd << ": fork failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  if (pid == 0) { // Child
+    // Restore old sigmask.
+    if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) {
+      std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    (void)setpgid(0, 0); // Become process group leader.
+    SubProcess::exec();
+    assert(0); // Never reached
+  }
+
+  // Parent
+  (void)alarm(timeout);
+
+  for (;;) {
+    int signo;
+    if (sigwait(&mask, &signo) == -1) {
+      std::cerr << cmd << ": sigwait failed: " << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    switch (signo) {
+    case SIGCHLD:
+      int status;
+      if (waitpid(pid, &status, WNOHANG) == -1) {
+	std::cerr << cmd << ": waitpid failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      if (WIFEXITED(status))
+	_exit(WEXITSTATUS(status));
+      if (WIFSIGNALED(status))
+	_exit(128 + WTERMSIG(status));
+      std::cerr << cmd << ": unknown status returned\n";
+      goto fail_exit;
+    case SIGINT:
+    case SIGTERM:
+      // Pass SIGINT and SIGTERM, which are usually used to terminate
+      // a process, to the child.
+      if (::kill(pid, signo) == -1) {
+	std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      continue;
+    case SIGALRM:
+      std::cerr << cmd << ": timed out (" << timeout << " sec)\n";
+      if (::killpg(pid, sigkill) == -1) {
+	std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      continue;
+    default:
+      std::cerr << cmd << ": sigwait: invalid signal: " << signo << "\n";
+      goto fail_exit;
+    }
+  }
+
+fail_exit:
+  _exit(EXIT_FAILURE);
+}
+
+#endif
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 584e97b..b917838 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -1,4 +1,4 @@
- // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 /*
  * Ceph - scalable distributed file system
@@ -27,13 +27,34 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
+#ifdef HAVE_SCHED
+#include <sched.h>
+#endif
 
+static int _set_affinity(int id)
+{
+#ifdef HAVE_SCHED
+  if (id >= 0 && id < CPU_SETSIZE) {
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+
+    CPU_SET(id, &cpuset);
+
+    if (sched_setaffinity(0, sizeof(cpuset), &cpuset) < 0)
+      return -errno;
+    /* guaranteed to take effect immediately */
+    sched_yield();
+  }
+#endif
+  return 0;
+}
 
 Thread::Thread()
   : thread_id(0),
     pid(0),
     ioprio_class(-1),
-    ioprio_priority(-1)
+    ioprio_priority(-1),
+    cpuid(-1)
 {
 }
 
@@ -58,10 +79,12 @@ void *Thread::entry_wrapper()
 		    pid,
 		    IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority));
   }
+  if (pid && cpuid >= 0)
+    _set_affinity(cpuid);
   return entry();
 }
 
-const pthread_t &Thread::get_thread_id()
+const pthread_t &Thread::get_thread_id() const
 {
   return thread_id;
 }
@@ -71,7 +94,7 @@ bool Thread::is_started() const
   return thread_id != 0;
 }
 
-bool Thread::am_self()
+bool Thread::am_self() const
 {
   return (pthread_self() == thread_id);
 }
@@ -87,11 +110,11 @@ int Thread::kill(int signal)
 int Thread::try_create(size_t stacksize)
 {
   pthread_attr_t *thread_attr = NULL;
+  pthread_attr_t thread_attr_loc;
+  
   stacksize &= CEPH_PAGE_MASK;  // must be multiple of page
   if (stacksize) {
-    thread_attr = (pthread_attr_t*) malloc(sizeof(pthread_attr_t));
-    if (!thread_attr)
-      return -ENOMEM;
+    thread_attr = &thread_attr_loc;
     pthread_attr_init(thread_attr);
     pthread_attr_setstacksize(thread_attr, stacksize);
   }
@@ -113,8 +136,10 @@ int Thread::try_create(size_t stacksize)
   r = pthread_create(&thread_id, thread_attr, _entry_func, (void*)this);
   restore_sigset(&old_sigset);
 
-  if (thread_attr)
-    free(thread_attr);
+  if (thread_attr) {
+    pthread_attr_destroy(thread_attr);	
+  }
+
   return r;
 }
 
@@ -138,7 +163,14 @@ int Thread::join(void **prval)
   }
 
   int status = pthread_join(thread_id, prval);
-  assert(status == 0);
+  if (status != 0) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "Thread::join(): pthread_join "
+             "failed with error %d\n", status);
+    dout_emergency(buf);
+    assert(status == 0);
+  }
+
   thread_id = 0;
   return status;
 }
@@ -159,3 +191,11 @@ int Thread::set_ioprio(int cls, int prio)
 			   IOPRIO_PRIO_VALUE(cls, prio));
   return 0;
 }
+
+int Thread::set_affinity(int id)
+{
+  cpuid = id;
+  if (pid && ceph_gettid() == pid)
+    _set_affinity(id);
+  return 0;
+}
diff --git a/src/common/Thread.h b/src/common/Thread.h
index 7889c91..e284bda 100644
--- a/src/common/Thread.h
+++ b/src/common/Thread.h
@@ -24,11 +24,12 @@ class Thread {
   pthread_t thread_id;
   pid_t pid;
   int ioprio_class, ioprio_priority;
+  int cpuid;
 
   void *entry_wrapper();
 
  public:
-  Thread(const Thread& other);
+  explicit Thread(const Thread& other);
   const Thread& operator=(const Thread& other);
 
   Thread();
@@ -41,16 +42,17 @@ class Thread {
   static void *_entry_func(void *arg);
 
  public:
-  const pthread_t &get_thread_id();
+  const pthread_t &get_thread_id() const;
   pid_t get_pid() const { return pid; }
   bool is_started() const;
-  bool am_self();
+  bool am_self() const;
   int kill(int signal);
   int try_create(size_t stacksize);
   void create(size_t stacksize = 0);
   int join(void **prval = 0);
   int detach();
   int set_ioprio(int cls, int prio);
+  int set_affinity(int cpuid);
 };
 
 #endif
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
index 5c68a1f..d117794 100644
--- a/src/common/Throttle.cc
+++ b/src/common/Throttle.cc
@@ -29,9 +29,9 @@ enum {
   l_throttle_last,
 };
 
-Throttle::Throttle(CephContext *cct, std::string n, int64_t m, bool _use_perf)
+Throttle::Throttle(CephContext *cct, const std::string& n, int64_t m, bool _use_perf)
   : cct(cct), name(n), logger(NULL),
-		max(m),
+    max(m),
     lock("Throttle::lock"),
     use_perf(_use_perf)
 {
@@ -42,17 +42,17 @@ Throttle::Throttle(CephContext *cct, std::string n, int64_t m, bool _use_perf)
 
   if (cct->_conf->throttler_perf_counter) {
     PerfCountersBuilder b(cct, string("throttle-") + name, l_throttle_first, l_throttle_last);
-    b.add_u64_counter(l_throttle_val, "val");
-    b.add_u64_counter(l_throttle_max, "max");
-    b.add_u64_counter(l_throttle_get, "get");
-    b.add_u64_counter(l_throttle_get_sum, "get_sum");
-    b.add_u64_counter(l_throttle_get_or_fail_fail, "get_or_fail_fail");
-    b.add_u64_counter(l_throttle_get_or_fail_success, "get_or_fail_success");
-    b.add_u64_counter(l_throttle_take, "take");
-    b.add_u64_counter(l_throttle_take_sum, "take_sum");
-    b.add_u64_counter(l_throttle_put, "put");
-    b.add_u64_counter(l_throttle_put_sum, "put_sum");
-    b.add_time_avg(l_throttle_wait, "wait");
+    b.add_u64_counter(l_throttle_val, "val", "Currently available throttle");
+    b.add_u64_counter(l_throttle_max, "max", "Max value for throttle");
+    b.add_u64_counter(l_throttle_get, "get", "Gets");
+    b.add_u64_counter(l_throttle_get_sum, "get_sum", "Got data");
+    b.add_u64_counter(l_throttle_get_or_fail_fail, "get_or_fail_fail", "Get blocked during get_or_fail");
+    b.add_u64_counter(l_throttle_get_or_fail_success, "get_or_fail_success", "Successful get during get_or_fail");
+    b.add_u64_counter(l_throttle_take, "take", "Takes");
+    b.add_u64_counter(l_throttle_take_sum, "take_sum", "Taken data");
+    b.add_u64_counter(l_throttle_put, "put", "Puts");
+    b.add_u64_counter(l_throttle_put_sum, "put_sum", "Put data");
+    b.add_time_avg(l_throttle_wait, "wait", "Waiting latency");
 
     logger = b.create_perf_counters();
     cct->get_perfcounters_collection()->add(logger);
@@ -80,6 +80,8 @@ Throttle::~Throttle()
 void Throttle::_reset_max(int64_t m)
 {
   assert(lock.is_locked());
+  if ((int64_t)max.read() == m)
+    return;
   if (!cond.empty())
     cond.front()->SignalOne();
   if (logger)
@@ -124,7 +126,7 @@ bool Throttle::_wait(int64_t c)
 
 bool Throttle::wait(int64_t m)
 {
-  if (0 == max.read()) {
+  if (0 == max.read() && 0 == m) {
     return false;
   }
 
@@ -158,7 +160,7 @@ int64_t Throttle::take(int64_t c)
 
 bool Throttle::get(int64_t c, int64_t m)
 {
-  if (0 == max.read()) {
+  if (0 == max.read() && 0 == m) {
     return false;
   }
 
@@ -280,3 +282,89 @@ int SimpleThrottle::wait_for_ret()
     m_cond.Wait(m_lock);
   return m_ret;
 }
+
+void C_OrderedThrottle::finish(int r) {
+  m_ordered_throttle->finish_op(m_tid, r);
+}
+
+OrderedThrottle::OrderedThrottle(uint64_t max, bool ignore_enoent)
+  : m_lock("OrderedThrottle::m_lock"), m_max(max), m_current(0), m_ret_val(0),
+    m_ignore_enoent(ignore_enoent), m_next_tid(0), m_complete_tid(0) {
+}
+
+C_OrderedThrottle *OrderedThrottle::start_op(Context *on_finish) {
+  assert(on_finish != NULL);
+
+  Mutex::Locker locker(m_lock);
+  uint64_t tid = m_next_tid++;
+  m_tid_result[tid] = Result(on_finish);
+  C_OrderedThrottle *ctx = new C_OrderedThrottle(this, tid);
+
+  complete_pending_ops();
+  while (m_max == m_current) {
+    m_cond.Wait(m_lock);
+    complete_pending_ops();
+  }
+  ++m_current;
+
+  return ctx;
+}
+
+void OrderedThrottle::end_op(int r) {
+  Mutex::Locker locker(m_lock);
+  assert(m_current > 0);
+
+  if (r < 0 && m_ret_val == 0 && (r != -ENOENT || !m_ignore_enoent)) {
+    m_ret_val = r;
+  }
+  --m_current;
+  m_cond.Signal();
+}
+
+void OrderedThrottle::finish_op(uint64_t tid, int r) {
+  Mutex::Locker locker(m_lock);
+
+  TidResult::iterator it = m_tid_result.find(tid);
+  assert(it != m_tid_result.end());
+
+  it->second.finished = true;
+  it->second.ret_val = r;
+  m_cond.Signal();
+}
+
+bool OrderedThrottle::pending_error() const {
+  Mutex::Locker locker(m_lock);
+  return (m_ret_val < 0);
+}
+
+int OrderedThrottle::wait_for_ret() {
+  Mutex::Locker locker(m_lock);
+  complete_pending_ops();
+
+  while (m_current > 0) {
+    m_cond.Wait(m_lock);
+    complete_pending_ops();
+  }
+  return m_ret_val;
+}
+
+void OrderedThrottle::complete_pending_ops() {
+  assert(m_lock.is_locked());
+
+  while (true) {
+    TidResult::iterator it = m_tid_result.begin();
+    if (it == m_tid_result.end() || it->first != m_complete_tid ||
+        !it->second.finished) {
+      break;
+    }
+
+    Result result = it->second;
+    m_tid_result.erase(it);
+
+    m_lock.Unlock();
+    result.on_finish->complete(result.ret_val);
+    m_lock.Lock();
+
+    ++m_complete_tid;
+  }
+}
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
index b171e27..c04a931 100644
--- a/src/common/Throttle.h
+++ b/src/common/Throttle.h
@@ -7,27 +7,37 @@
 #include "Mutex.h"
 #include "Cond.h"
 #include <list>
+#include <map>
 #include "include/atomic.h"
+#include "include/Context.h"
 
 class CephContext;
 class PerfCounters;
 
+/**
+ * @class Throttle
+ * Throttles the maximum number of active requests.
+ *
+ * This class defines the maximum number of slots currently taken away. The
+ * excessive requests for more of them are delayed, until some slots are put
+ * back, so @p get_current() drops below the limit after fulfills the requests.
+ */
 class Throttle {
   CephContext *cct;
-  std::string name;
+  const std::string name;
   PerfCounters *logger;
-	ceph::atomic_t count, max;
+  ceph::atomic_t count, max;
   Mutex lock;
   list<Cond*> cond;
-  bool use_perf;
-  
+  const bool use_perf;
+
 public:
-  Throttle(CephContext *cct, std::string n, int64_t m = 0, bool _use_perf = true);
+  Throttle(CephContext *cct, const std::string& n, int64_t m = 0, bool _use_perf = true);
   ~Throttle();
 
 private:
   void _reset_max(int64_t m);
-  bool _should_wait(int64_t c) {
+  bool _should_wait(int64_t c) const {
     int64_t m = max.read();
     int64_t cur = count.read();
     return
@@ -39,23 +49,66 @@ private:
   bool _wait(int64_t c);
 
 public:
-  int64_t get_current() {
+  /**
+   * gets the number of currently taken slots
+   * @returns the number of taken slots
+   */
+  int64_t get_current() const {
     return count.read();
   }
 
-  int64_t get_max() { return max.read(); }
+  /**
+   * get the max number of slots
+   * @returns the max number of slots
+   */
+  int64_t get_max() const { return max.read(); }
 
+  /**
+   * set the new max number, and wait until the number of taken slots drains
+   * and drops below this limit.
+   *
+   * @param m the new max number
+   * @returns true if this method is blocked, false it it returns immediately
+   */
   bool wait(int64_t m = 0);
 
+  /**
+   * take the specified number of slots from the stock regardless the throttling
+   * @param c number of slots to take
+   * @returns the total number of taken slots
+   */
   int64_t take(int64_t c = 1);
+
+  /**
+   * get the specified amount of slots from the stock, but will wait if the
+   * total number taken by consumer would exceed the maximum number.
+   * @param c number of slots to get
+   * @param m new maximum number to set, ignored if it is 0
+   * @returns true if this request is blocked due to the throttling, false 
+   * otherwise
+   */
   bool get(int64_t c = 1, int64_t m = 0);
 
   /**
-   * Returns true if it successfully got the requested amount,
+   * the unblocked version of @p get()
+   * @returns true if it successfully got the requested amount,
    * or false if it would block.
    */
   bool get_or_fail(int64_t c = 1);
+
+  /**
+   * put slots back to the stock
+   * @param c number of slots to return
+   * @returns number of requests being hold after this
+   */
   int64_t put(int64_t c = 1);
+  bool should_wait(int64_t c) const {
+    return _should_wait(c);
+  }
+  void reset_max(int64_t m) {
+    Mutex::Locker l(lock);
+    _reset_max(m);
+  }
 };
 
 
@@ -99,4 +152,70 @@ private:
   SimpleThrottle *m_throttle;
 };
 
+class OrderedThrottle;
+
+class C_OrderedThrottle : public Context {
+public:
+  C_OrderedThrottle(OrderedThrottle *ordered_throttle, uint64_t tid)
+    : m_ordered_throttle(ordered_throttle), m_tid(tid) {
+  }
+
+protected:
+  virtual void finish(int r);
+
+private:
+  OrderedThrottle *m_ordered_throttle;
+  uint64_t m_tid;
+};
+
+/**
+ * @class OrderedThrottle
+ * Throttles the maximum number of active requests and completes them in order
+ *
+ * Operations can complete out-of-order but their associated Context callback
+ * will completed in-order during invokation of start_op() and wait_for_ret()
+ */
+class OrderedThrottle {
+public:
+  OrderedThrottle(uint64_t max, bool ignore_enoent);
+
+  C_OrderedThrottle *start_op(Context *on_finish);
+  void end_op(int r);
+
+  bool pending_error() const;
+  int wait_for_ret();
+
+protected:
+  friend class C_OrderedThrottle;
+
+  void finish_op(uint64_t tid, int r);
+
+private:
+  struct Result {
+    bool finished;
+    int ret_val;
+    Context *on_finish;
+
+    Result(Context *_on_finish = NULL)
+      : finished(false), ret_val(0), on_finish(_on_finish) {
+    }
+  };
+
+  typedef std::map<uint64_t, Result> TidResult;
+
+  mutable Mutex m_lock;
+  Cond m_cond;
+  uint64_t m_max;
+  uint64_t m_current;
+  int m_ret_val;
+  bool m_ignore_enoent;
+
+  uint64_t m_next_tid;
+  uint64_t m_complete_tid;
+
+  TidResult m_tid_result;
+
+  void complete_pending_ops();
+};
+
 #endif
diff --git a/src/common/TracepointProvider.cc b/src/common/TracepointProvider.cc
new file mode 100644
index 0000000..7c3d453
--- /dev/null
+++ b/src/common/TracepointProvider.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/TracepointProvider.h"
+#include "common/config.h"
+
+TracepointProvider::TracepointProvider(CephContext *cct, const char *library,
+                                       const char *config_key)
+  : m_cct(cct), m_library(library), m_config_keys{config_key, NULL},
+    m_lock("TracepointProvider::m_lock"), m_enabled(false) {
+  m_cct->_conf->add_observer(this);
+  verify_config(m_cct->_conf);
+}
+
+TracepointProvider::~TracepointProvider() {
+  m_cct->_conf->remove_observer(this);
+}
+
+void TracepointProvider::handle_conf_change(
+    const struct md_config_t *conf, const std::set<std::string> &changed) {
+  if (changed.count(m_config_keys[0])) {
+    verify_config(conf);
+  }
+}
+
+void TracepointProvider::verify_config(const struct md_config_t *conf) {
+  Mutex::Locker locker(m_lock);
+  if (m_enabled) {
+    return;
+  }
+
+  char buf[10];
+  char *pbuf = buf;
+  if (conf->get_val(m_config_keys[0], &pbuf, sizeof(buf)) != 0 ||
+      strncmp(buf, "true", 5) != 0) {
+    return;
+  }
+
+  void *handle = dlopen(m_library.c_str(), RTLD_NOW);
+  if (handle != NULL) {
+    m_enabled = true;
+  }
+}
+
diff --git a/src/common/TracepointProvider.h b/src/common/TracepointProvider.h
new file mode 100644
index 0000000..e54a037
--- /dev/null
+++ b/src/common/TracepointProvider.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TRACEPOINT_PROVIDER_H
+#define CEPH_TRACEPOINT_PROVIDER_H
+
+#include "include/int_types.h"
+#include "common/ceph_context.h"
+#include "common/config_obs.h"
+#include "common/Mutex.h"
+#include <dlfcn.h>
+#include <set>
+#include <string>
+#include <boost/noncopyable.hpp>
+
+struct md_config_t;
+
+class TracepointProvider : public md_config_obs_t, boost::noncopyable {
+public:
+  struct Traits {
+    const char *library;
+    const char *config_key;
+
+    Traits(const char *library, const char *config_key)
+      : library(library), config_key(config_key) {
+    }
+  };
+
+  class Singleton {
+  public:
+    Singleton(CephContext *cct, const char *library, const char *config_key)
+      : tracepoint_provider(new TracepointProvider(cct, library, config_key)) {
+    }
+    ~Singleton() {
+      delete tracepoint_provider;
+    }
+
+    inline bool is_enabled() const {
+      return tracepoint_provider->m_enabled;
+    }
+  private:
+    TracepointProvider *tracepoint_provider;
+  };
+
+  template <const Traits &traits>
+  class TypedSingleton : public Singleton {
+  public:
+    TypedSingleton(CephContext *cct)
+      : Singleton(cct, traits.library, traits.config_key) {
+    }
+  };
+
+  TracepointProvider(CephContext *cct, const char *library,
+                     const char *config_key);
+  virtual ~TracepointProvider();
+
+  template <const Traits &traits>
+  static void initialize(CephContext *cct) {
+#if WITH_LTTNG
+    TypedSingleton<traits> *singleton;
+    cct->lookup_or_create_singleton_object(singleton, traits.library);
+#endif
+  }
+
+protected:
+  virtual const char** get_tracked_conf_keys() const {
+    return m_config_keys;
+  }
+  virtual void handle_conf_change(const struct md_config_t *conf,
+                                  const std::set <std::string> &changed);
+
+private:
+  CephContext *m_cct;
+  std::string m_library;
+  mutable const char* m_config_keys[2];
+
+  Mutex m_lock;
+  bool m_enabled;
+
+  void verify_config(const struct md_config_t *conf);
+};
+
+#endif // CEPH_TRACEPOINT_PROVIDER_H
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
index 32dbc53..f759894 100644
--- a/src/common/TrackedOp.cc
+++ b/src/common/TrackedOp.cc
@@ -152,12 +152,14 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
 
 bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
 {
+  if (!tracking_enabled)
+    return false;
+
   utime_t now = ceph_clock_now(cct);
   utime_t too_old = now;
   too_old -= complaint_time;
-  utime_t oldest_op;
+  utime_t oldest_op = now;
   uint64_t total_ops_in_flight = 0;
-  bool got_first_op = false;
 
   for (uint32_t i = 0; i < num_optracker_shards; i++) {
     ShardedTrackingData* sdata = sharded_in_flight_list[i];
@@ -165,10 +167,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
     Mutex::Locker locker(sdata->ops_in_flight_lock_sharded);
     if (!sdata->ops_in_flight_sharded.empty()) {
       utime_t oldest_op_tmp = sdata->ops_in_flight_sharded.front()->get_initiated();
-      if (!got_first_op) {
-        oldest_op = oldest_op_tmp;
-        got_first_op = true;
-      } else if (oldest_op_tmp < oldest_op) {
+      if (oldest_op_tmp < oldest_op) {
         oldest_op = oldest_op_tmp;
       }
     } 
@@ -188,10 +187,13 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
     return false;
 
   warning_vector.reserve(log_threshold + 1);
+  //store summary message
+  warning_vector.push_back("");
 
   int slow = 0;     // total slow
   int warned = 0;   // total logged
-  for (uint32_t iter = 0; iter < num_optracker_shards; iter++) {
+  for (uint32_t iter = 0;
+       iter < num_optracker_shards && warned < log_threshold; iter++) {
     ShardedTrackingData* sdata = sharded_in_flight_list[iter];
     assert(NULL != sdata);
     Mutex::Locker locker(sdata->ops_in_flight_lock_sharded);
@@ -205,8 +207,6 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
       if (((*i)->get_initiated() +
 	 (complaint_time * (*i)->warn_interval_multiplier)) < now) {
       // will warn
-        if (warning_vector.empty())
-          warning_vector.push_back("");
         warned++;
         if (warned > log_threshold)
           break;
@@ -236,7 +236,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
     warning_vector[0] = ss.str();
   }
 
-  return warning_vector.size();
+  return warned;
 }
 
 void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
@@ -294,12 +294,12 @@ void OpTracker::_mark_event(TrackedOp *op, const string &evt,
 }
 
 void OpTracker::RemoveOnDelete::operator()(TrackedOp *op) {
-  op->mark_event("done");
   if (!tracker->tracking_enabled) {
     op->_unregistered();
     delete op;
     return;
   }
+  op->mark_event("done");
   tracker->unregister_inflight_op(op);
   // Do not delete op, unregister_inflight_op took control
 }
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 42d1eaf..2f656ca 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -161,7 +161,8 @@ protected:
     warn_interval_multiplier(1)
   {
     tracker->register_inflight_op(&xitem);
-    events.push_back(make_pair(initiated_at, "initiated"));
+    if (tracker->tracking_enabled)
+      events.push_back(make_pair(initiated_at, "initiated"));
   }
 
   /// output any type-specific data you want to get when dump() is called
@@ -179,11 +180,12 @@ public:
   const utime_t& get_initiated() const {
     return initiated_at;
   }
-  // This function maybe needs some work; assumes last event is completion time
+
   double get_duration() const {
-    return events.empty() ?
-      0.0 :
-      (events.rbegin()->first - get_initiated());
+    if (!events.empty() && events.rbegin()->second.compare("done") == 0)
+      return events.rbegin()->first - get_initiated();
+    else
+      return ceph_clock_now(NULL) - get_initiated();
   }
 
   void mark_event(const string &event);
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index 300ae7d..f0754de 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -23,6 +23,7 @@
 
 class CephContext;
 
+/// Pool of threads that share work submitted to multiple work queues.
 class ThreadPool : public md_config_obs_t {
   CephContext *cct;
   string name;
@@ -54,6 +55,7 @@ public:
   };
 private:
 
+  /// Basic interface to a work queue used by the worker threads.
   struct WorkQueue_ {
     string name;
     time_t timeout_interval, suicide_interval;
@@ -61,10 +63,20 @@ private:
       : name(n), timeout_interval(ti), suicide_interval(sti)
     { }
     virtual ~WorkQueue_() {}
+    /// Remove all work items from the queue.
     virtual void _clear() = 0;
+    /// Check whether there is anything to do.
     virtual bool _empty() = 0;
+    /// Get the next work item to process.
     virtual void *_void_dequeue() = 0;
+    /** @brief Process the work item.
+     * This function will be called several times in parallel
+     * and must therefore be thread-safe. */
     virtual void _void_process(void *item, TPHandle &handle) = 0;
+    /** @brief Synchronously finish processing a work item.
+     * This function is called after _void_process with the global thread pool lock held,
+     * so at most one copy will execute simultaneously for a given thread pool.
+     * It can be used for non-thread-safe finalization. */
     virtual void _void_process_finish(void *) = 0;
   };
 
@@ -80,6 +92,9 @@ private:
 			  const std::set <std::string> &changed);
 
 public:
+  /** @brief Work queue that processes several submitted items at once.
+   * The queue will automatically add itself to the thread pool on construction
+   * and remove itself on destruction. */
   template<class T>
   class BatchWorkQueue : public WorkQueue_ {
     ThreadPool *pool;
@@ -87,12 +102,9 @@ public:
     virtual bool _enqueue(T *) = 0;
     virtual void _dequeue(T *) = 0;
     virtual void _dequeue(list<T*> *) = 0;
-    virtual void _process(const list<T*> &) { assert(0); }
-    virtual void _process(const list<T*> &items, TPHandle &handle) {
-      _process(items);
-    }
     virtual void _process_finish(const list<T*> &) {}
 
+    // virtual methods from WorkQueue_ below
     void *_void_dequeue() {
       list<T*> *out(new list<T*>);
       _dequeue(out);
@@ -111,6 +123,12 @@ public:
       delete (list<T*> *)p;
     }
 
+  protected:
+    virtual void _process(const list<T*> &) { assert(0); }
+    virtual void _process(const list<T*> &items, TPHandle &handle) {
+      _process(items);
+    }
+
   public:
     BatchWorkQueue(string n, time_t ti, time_t sti, ThreadPool* p)
       : WorkQueue_(n, ti, sti), pool(p) {
@@ -155,6 +173,12 @@ public:
     }
 
   };
+
+  /** @brief Templated by-value work queue.
+   * Skeleton implementation of a queue that processes items submitted by value.
+   * This is useful if the items are single primitive values or very small objects
+   * (a few bytes). The queue will automatically add itself to the thread pool on
+   * construction and remove itself on destruction. */
   template<typename T, typename U = T>
   class WorkQueueVal : public WorkQueue_ {
     Mutex _lock;
@@ -165,10 +189,6 @@ public:
     virtual void _enqueue_front(T) = 0;
     virtual bool _empty() = 0;
     virtual U _dequeue() = 0;
-    virtual void _process(U) { assert(0); }
-    virtual void _process(U u, TPHandle &) {
-      _process(u);
-    }
     virtual void _process_finish(U) {}
 
     void *_void_dequeue() {
@@ -235,20 +255,30 @@ public:
     void unlock() {
       pool->unlock();
     }
+    virtual void _process(U) { assert(0); }
+    virtual void _process(U u, TPHandle &) {
+      _process(u);
+    }
   };
+
+  /** @brief Template by-pointer work queue.
+   * Skeleton implementation of a queue that processes items of a given type submitted as pointers.
+   * This is useful when the work item are large or include dynamically allocated memory. The queue
+   * will automatically add itself to the thread pool on construction and remove itself on
+   * destruction. */
   template<class T>
   class WorkQueue : public WorkQueue_ {
     ThreadPool *pool;
     
+    /// Add a work item to the queue.
     virtual bool _enqueue(T *) = 0;
+    /// Dequeue a previously submitted work item.
     virtual void _dequeue(T *) = 0;
+    /// Dequeue a work item and return the original submitted pointer.
     virtual T *_dequeue() = 0;
-    virtual void _process(T *t) { assert(0); }
-    virtual void _process(T *t, TPHandle &) {
-      _process(t);
-    }
     virtual void _process_finish(T *) {}
-    
+
+    // implementation of virtual methods from WorkQueue_
     void *_void_dequeue() {
       return (void *)_dequeue();
     }
@@ -259,6 +289,13 @@ public:
       _process_finish(static_cast<T *>(p));
     }
 
+  protected:
+    /// Process a work item. Called from the worker threads.
+    virtual void _process(T *t) { assert(0); }
+    virtual void _process(T *t, TPHandle &) {
+      _process(t);
+    }
+
   public:
     WorkQueue(string n, time_t ti, time_t sti, ThreadPool* p) : WorkQueue_(n, ti, sti), pool(p) {
       pool->add_work_queue(this);
@@ -285,6 +322,10 @@ public:
       pool->_lock.Unlock();
     }
 
+    Mutex &get_lock() {
+      return pool->_lock;
+    }
+
     void lock() {
       pool->lock();
     }
@@ -299,6 +340,9 @@ public:
     void _wake() {
       pool->_wake();
     }
+    void _wait() {
+      pool->_wait();
+    }
     void drain() {
       pool->drain(this);
     }
@@ -340,10 +384,12 @@ public:
   
   /// assign a work queue to this thread pool
   void add_work_queue(WorkQueue_* wq) {
+    Mutex::Locker l(_lock);
     work_queues.push_back(wq);
   }
   /// remove a work queue from this thread pool
   void remove_work_queue(WorkQueue_* wq) {
+    Mutex::Locker l(_lock);
     unsigned i = 0;
     while (work_queues[i] != wq)
       i++;
@@ -376,6 +422,9 @@ public:
     Mutex::Locker l(_lock);
     _cond.Signal();
   }
+  void _wait() {
+    _cond.Wait(_lock);
+  }
 
   /// start thread pool thread
   void start();
@@ -387,7 +436,10 @@ public:
   void pause_new();
   /// resume work in thread pool.  must match each pause() call 1:1 to resume.
   void unpause();
-  /// wait for all work to complete
+  /** @brief Wait until work completes.
+   * If the parameter is NULL, blocks until all threads are idle.
+   * If it is not NULL, blocks until the given work queue does not have
+   * any items left to process. */
   void drain(WorkQueue_* wq = 0);
 
   /// set io priority
@@ -417,6 +469,7 @@ public:
     _queue.pop_front();
     return c;
   }
+  using ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*>::_process;
   void _process(GenContext<ThreadPool::TPHandle&> *c, ThreadPool::TPHandle &tp) {
     c->complete(tp);
   }
@@ -433,6 +486,8 @@ public:
   }
 };
 
+/// Work queue that asynchronously completes contexts (executes callbacks).
+/// @see Finisher
 class ContextWQ : public ThreadPool::WorkQueueVal<std::pair<Context *, int> > {
 public:
   ContextWQ(const string &name, time_t ti, ThreadPool *tp)
@@ -461,6 +516,7 @@ protected:
   virtual void _process(std::pair<Context *, int> item) {
     item.first->complete(item.second);
   }
+  using ThreadPool::WorkQueueVal<std::pair<Context *, int> >::_process;
 private:
   list<std::pair<Context *, int> > _queue;
 };
diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
index 95a48b3..07a2246 100644
--- a/src/common/admin_socket.cc
+++ b/src/common/admin_socket.cc
@@ -209,7 +209,7 @@ std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd)
   address.sun_family = AF_UNIX;
   snprintf(address.sun_path, sizeof(address.sun_path),
 	   "%s", sock_path.c_str());
-  if (bind(sock_fd, (struct sockaddr*)&address,
+  if (::bind(sock_fd, (struct sockaddr*)&address,
 	   sizeof(struct sockaddr_un)) != 0) {
     int err = errno;
     if (err == EADDRINUSE) {
@@ -222,7 +222,7 @@ std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd)
       } else {
 	ldout(m_cct, 20) << "unlink stale file " << sock_path << dendl;
 	VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str()));
-	if (bind(sock_fd, (struct sockaddr*)&address,
+	if (::bind(sock_fd, (struct sockaddr*)&address,
 		 sizeof(struct sockaddr_un)) == 0) {
 	  err = 0;
 	} else {
@@ -352,7 +352,7 @@ bool AdminSocket::do_accept()
   stringstream errss;
   cmdvec.push_back(cmd);
   if (!cmdmap_from_json(cmdvec, &cmdmap, errss)) {
-    ldout(m_cct, 0) << "AdminSocket: " << errss << dendl;
+    ldout(m_cct, 0) << "AdminSocket: " << errss.rdbuf() << dendl;
     return false;
   }
   cmd_getval(m_cct, cmdmap, "format", format);
diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc
index 70dde42..f013a7b 100644
--- a/src/common/blkdev.cc
+++ b/src/common/blkdev.cc
@@ -1,3 +1,14 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
 #include <errno.h>
 #include <sys/types.h>
 #include <sys/ioctl.h>
@@ -8,9 +19,13 @@
 #include <dirent.h>
 #include <stdlib.h>
 #include "include/int_types.h"
+#include "include/uuid.h"
 
 #ifdef __linux__
 #include <linux/fs.h>
+#include <blkid/blkid.h>
+
+#define UUID_LEN 36
 
 static const char *sandbox_dir = "";
 
@@ -57,7 +72,8 @@ int get_block_device_base(const char *dev, char *out, size_t out_len)
   if (strncmp(dev, "/dev/", 5) != 0)
     return -EINVAL;
 
-  strcpy(devname, dev + 5);
+  strncpy(devname, dev + 5, PATH_MAX-1);
+  devname[PATH_MAX-1] = '\0';
   for (p = devname; *p; ++p)
     if (*p == '/')
       *p = '!';
@@ -161,6 +177,44 @@ int block_device_discard(int fd, int64_t offset, int64_t len)
   return ioctl(fd, BLKDISCARD, range);
 }
 
+int get_device_by_uuid(uuid_d dev_uuid, const char* label, char* partition,
+	char* device)
+{
+  char uuid_str[UUID_LEN+1];
+  char basename[PATH_MAX];
+  const char* temp_partition_ptr = NULL;
+  blkid_cache cache = NULL;
+  blkid_dev dev = NULL;
+  int rc = 0;
+
+  dev_uuid.print(uuid_str);
+
+  if (blkid_get_cache(&cache, NULL) >= 0)
+    dev = blkid_find_dev_with_tag(cache, label, (const char*)uuid_str);
+  else
+    rc = -EINVAL;
+
+  if (dev) {
+    temp_partition_ptr = blkid_dev_devname(dev);
+    strncpy(partition, temp_partition_ptr, PATH_MAX);
+    rc = get_block_device_base(partition, basename,
+      sizeof(basename));
+    if (rc >= 0) {
+      strncpy(device, basename, sizeof(basename));
+      rc = 0;
+    } else {
+      rc = -ENODEV;
+    }
+  } else {
+    rc = -EINVAL;
+  }
+
+  /* From what I can tell, blkid_put_cache cleans up dev, which
+   * appears to be a pointer into cache, as well */
+  if (cache)
+    blkid_put_cache(cache);
+  return rc;
+}
 #elif defined(__APPLE__)
 #include <sys/disk.h>
 
@@ -188,6 +242,12 @@ int block_device_discard(int fd, int64_t offset, int64_t len)
 {
   return -EOPNOTSUPP;
 }
+
+int get_device_by_uuid(uuid_d dev_uuid, const char* label, char* partition,
+	char* device)
+{
+  return -EOPNOTSUPP;
+}
 #elif defined(__FreeBSD__)
 #include <sys/disk.h>
 
@@ -208,6 +268,12 @@ int block_device_discard(int fd, int64_t offset, int64_t len)
 {
   return -EOPNOTSUPP;
 }
+
+int get_device_by_uuid(uuid_d dev_uuid, const char* label, char* partition,
+	char* device)
+{
+  return -EOPNOTSUPP;
+}
 #else
 # error "Unable to query block device size: unsupported platform, please report."
 #endif
diff --git a/src/common/blkdev.h b/src/common/blkdev.h
index 5c63ac9..697e8a8 100644
--- a/src/common/blkdev.h
+++ b/src/common/blkdev.h
@@ -9,4 +9,6 @@ extern int get_block_device_size(int fd, int64_t *psize);
 extern int64_t get_block_device_int_property(const char *devname, const char *property);
 extern bool block_device_support_discard(const char *devname);
 extern int block_device_discard(int fd, int64_t offset, int64_t len);
+extern int get_device_by_uuid(uuid_d dev_uuid, const char* label,
+		char* partition, char* device);
 #endif
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 502163b..bca14d1 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -21,9 +21,10 @@
 #include "common/strtol.h"
 #include "common/likely.h"
 #include "include/atomic.h"
-#include "common/Mutex.h"
+#include "common/RWLock.h"
 #include "include/types.h"
 #include "include/compat.h"
+#include "include/inline_memory.h"
 #if defined(HAVE_XIO)
 #include "msg/xio/XioMsg.h"
 #endif
@@ -34,6 +35,7 @@
 #include <sys/uio.h>
 #include <limits.h>
 
+#include <ostream>
 namespace ceph {
 
 #ifdef BUFFER_DEBUG
@@ -45,8 +47,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 # define bendl std::endl; }
 #endif
 
-  atomic_t buffer_total_alloc;
-  bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK");
+  static atomic_t buffer_total_alloc;
+  const bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK");
 
   void buffer::inc_total_alloc(unsigned len) {
     if (buffer_track_alloc)
@@ -60,9 +62,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return buffer_total_alloc.read();
   }
 
-  atomic_t buffer_cached_crc;
-  atomic_t buffer_cached_crc_adjusted;
-  bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK");
+  static atomic_t buffer_cached_crc;
+  static atomic_t buffer_cached_crc_adjusted;
+  static bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK");
 
   void buffer::track_cached_crc(bool b) {
     buffer_track_crc = b;
@@ -74,8 +76,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return buffer_cached_crc_adjusted.read();
   }
 
-  atomic_t buffer_c_str_accesses;
-  bool buffer_track_c_str = get_env_bool("CEPH_BUFFER_TRACK");
+  static atomic_t buffer_c_str_accesses;
+  static bool buffer_track_c_str = get_env_bool("CEPH_BUFFER_TRACK");
 
   void buffer::track_c_str(bool b) {
     buffer_track_c_str = b;
@@ -84,7 +86,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return buffer_c_str_accesses.read();
   }
 
-  atomic_t buffer_max_pipe_size;
+  static atomic_t buffer_max_pipe_size;
   int update_max_pipe_size() {
 #ifdef CEPH_HAVE_SETPIPE_SZ
     char buf[32];
@@ -118,6 +120,18 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return 65536;
   }
 
+  const char * buffer::error::what() const throw () {
+    return "buffer::exception";
+  }
+  const char * buffer::bad_alloc::what() const throw () {
+    return "buffer::bad_alloc";
+  }
+  const char * buffer::end_of_buffer::what() const throw () {
+    return "buffer::end_of_buffer";
+  }
+  const char * buffer::malformed_input::what() const throw () {
+    return buf;
+  }
   buffer::error_code::error_code(int error) :
     buffer::malformed_input(cpp_strerror(error).c_str()), code(error) {}
 
@@ -127,16 +141,16 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     unsigned len;
     atomic_t nref;
 
-    mutable Mutex crc_lock;
+    mutable RWLock crc_lock;
     map<pair<size_t, size_t>, pair<uint32_t, uint32_t> > crc_map;
 
     raw(unsigned l)
       : data(NULL), len(l), nref(0),
-	crc_lock("buffer::raw::crc_lock", false, false)
+	crc_lock("buffer::raw::crc_lock", false)
     { }
     raw(char *c, unsigned l)
       : data(c), len(l), nref(0),
-	crc_lock("buffer::raw::crc_lock", false, false)
+	crc_lock("buffer::raw::crc_lock", false)
     { }
     virtual ~raw() {}
 
@@ -172,23 +186,33 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       return true;
     }
     bool get_crc(const pair<size_t, size_t> &fromto,
-		 pair<uint32_t, uint32_t> *crc) const {
-      Mutex::Locker l(crc_lock);
+         pair<uint32_t, uint32_t> *crc) const {
+      crc_lock.get_read();
       map<pair<size_t, size_t>, pair<uint32_t, uint32_t> >::const_iterator i =
-	crc_map.find(fromto);
-      if (i == crc_map.end())
-	return false;
+      crc_map.find(fromto);
+      if (i == crc_map.end()) {
+          crc_lock.unlock();
+          return false;
+      }
       *crc = i->second;
+      crc_lock.unlock();
       return true;
     }
     void set_crc(const pair<size_t, size_t> &fromto,
-		 const pair<uint32_t, uint32_t> &crc) {
-      Mutex::Locker l(crc_lock);
+         const pair<uint32_t, uint32_t> &crc) {
+      crc_lock.get_write();
       crc_map[fromto] = crc;
+      crc_lock.unlock();
     }
     void invalidate_crc() {
-      Mutex::Locker l(crc_lock);
-      crc_map.clear();
+      // avoid taking the write lock when the map is empty
+      crc_lock.get_read();
+      if (crc_map.size() != 0) {
+        crc_lock.unlock();
+        crc_lock.get_write();
+        crc_map.clear();
+      }
+      crc_lock.unlock();
     }
   };
 
@@ -556,8 +580,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 
   class buffer::xio_mempool : public buffer::raw {
   public:
-    struct xio_mempool_obj *mp;
-    xio_mempool(struct xio_mempool_obj *_mp, unsigned l) :
+    struct xio_reg_mem *mp;
+    xio_mempool(struct xio_reg_mem *_mp, unsigned l) :
       raw((char*)mp->addr, l), mp(_mp)
     { }
     ~xio_mempool() {}
@@ -566,7 +590,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   };
 
-  struct xio_mempool_obj* get_xio_mp(const buffer::ptr& bp)
+  struct xio_reg_mem* get_xio_mp(const buffer::ptr& bp)
   {
     buffer::xio_mempool *mb = dynamic_cast<buffer::xio_mempool*>(bp.get_raw());
     if (mb) {
@@ -766,6 +790,14 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
   unsigned buffer::ptr::raw_length() const { assert(_raw); return _raw->len; }
   int buffer::ptr::raw_nref() const { assert(_raw); return _raw->nref.read(); }
 
+  void buffer::ptr::copy_out(unsigned o, unsigned l, char *dest) const {
+    assert(_raw);
+    if (o+l > _len)
+        throw end_of_buffer();
+    char* src =  _raw->data + _off + o;
+    maybe_inline_memcpy(dest, src, l, 8);
+  }
+
   unsigned buffer::ptr::wasted()
   {
     assert(_raw);
@@ -789,53 +821,69 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 
   bool buffer::ptr::is_zero() const
   {
-    const char *data = c_str();
-    for (size_t p = 0; p < _len; p++) {
-      if (data[p] != 0) {
-	return false;
-      }
-    }
-    return true;
+    return mem_is_zero(c_str(), _len);
   }
 
-  void buffer::ptr::append(char c)
+  unsigned buffer::ptr::append(char c)
   {
     assert(_raw);
     assert(1 <= unused_tail_length());
-    (c_str())[_len] = c;
+    char* ptr = _raw->data + _off + _len;
+    *ptr = c;
     _len++;
+    return _len + _off;
   }
-  
-  void buffer::ptr::append(const char *p, unsigned l)
+
+  unsigned buffer::ptr::append(const char *p, unsigned l)
   {
     assert(_raw);
     assert(l <= unused_tail_length());
-    memcpy(c_str() + _len, p, l);
+    char* c = _raw->data + _off + _len;
+    maybe_inline_memcpy(c, p, l, 32);
     _len += l;
+    return _len + _off;
   }
-    
+
   void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src)
   {
+    copy_in(o, l, src, true);
+  }
+
+  void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src, bool crc_reset)
+  {
     assert(_raw);
     assert(o <= _len);
     assert(o+l <= _len);
-    _raw->invalidate_crc();
-    memcpy(c_str()+o, src, l);
+    char* dest = _raw->data + _off + o;
+    if (crc_reset)
+        _raw->invalidate_crc();
+    maybe_inline_memcpy(dest, src, l, 64);
   }
 
   void buffer::ptr::zero()
   {
-    _raw->invalidate_crc();
+    zero(true);
+  }
+
+  void buffer::ptr::zero(bool crc_reset)
+  {
+    if (crc_reset)
+        _raw->invalidate_crc();
     memset(c_str(), 0, _len);
   }
 
   void buffer::ptr::zero(unsigned o, unsigned l)
   {
+    zero(o, l, true);
+  }
+
+  void buffer::ptr::zero(unsigned o, unsigned l, bool crc_reset)
+  {
     assert(o+l <= _len);
-    _raw->invalidate_crc();
+    if (crc_reset)
+        _raw->invalidate_crc();
     memset(c_str()+o, 0, l);
   }
-
   bool buffer::ptr::can_zero_copy() const
   {
     return _raw->can_zero_copy();
@@ -860,7 +908,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return *this;
     }*/
 
-  void buffer::list::iterator::advance(int o)
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::advance(int o)
   {
     //cout << this << " advance " << o << " from " << off << " (p_off " << p_off << " in " << p->length() << ")" << std::endl;
     if (o > 0) {
@@ -898,22 +947,31 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   }
 
-  void buffer::list::iterator::seek(unsigned o)
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::seek(unsigned o)
   {
-    //cout << this << " seek " << o << std::endl;
     p = ls->begin();
     off = p_off = 0;
     advance(o);
   }
 
-  char buffer::list::iterator::operator*()
+  template<bool is_const>
+  bool buffer::list::iterator_impl<is_const>::operator!=(const buffer::list::iterator_impl<is_const>& rhs) const
+  {
+    return bl == rhs.bl && off == rhs.off;
+  }
+
+  template<bool is_const>
+  char buffer::list::iterator_impl<is_const>::operator*() const
   {
     if (p == ls->end())
       throw end_of_buffer();
     return (*p)[p_off];
   }
-  
-  buffer::list::iterator& buffer::list::iterator::operator++()
+
+  template<bool is_const>
+  buffer::list::iterator_impl<is_const>&
+  buffer::list::iterator_impl<is_const>::operator++()
   {
     if (p == ls->end())
       throw end_of_buffer();
@@ -921,24 +979,25 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return *this;
   }
 
-  buffer::ptr buffer::list::iterator::get_current_ptr()
+  template<bool is_const>
+  buffer::ptr buffer::list::iterator_impl<is_const>::get_current_ptr() const
   {
     if (p == ls->end())
       throw end_of_buffer();
     return ptr(*p, p_off, p->length() - p_off);
   }
-  
+
   // copy data out.
   // note that these all _append_ to dest!
-  
-  void buffer::list::iterator::copy(unsigned len, char *dest)
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, char *dest)
   {
     if (p == ls->end()) seek(off);
     while (len > 0) {
       if (p == ls->end())
 	throw end_of_buffer();
-      assert(p->length() > 0); 
-      
+      assert(p->length() > 0);
+
       unsigned howmuch = p->length() - p_off;
       if (len < howmuch) howmuch = len;
       p->copy_out(p_off, howmuch, dest);
@@ -948,39 +1007,42 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       advance(howmuch);
     }
   }
-  
-  void buffer::list::iterator::copy(unsigned len, ptr &dest)
+
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, ptr &dest)
   {
     dest = create(len);
     copy(len, dest.c_str());
   }
 
-  void buffer::list::iterator::copy(unsigned len, list &dest)
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, list &dest)
   {
     if (p == ls->end())
       seek(off);
     while (len > 0) {
       if (p == ls->end())
 	throw end_of_buffer();
-      
+
       unsigned howmuch = p->length() - p_off;
       if (len < howmuch)
 	howmuch = len;
       dest.append(*p, p_off, howmuch);
-      
+
       len -= howmuch;
       advance(howmuch);
     }
   }
 
-  void buffer::list::iterator::copy(unsigned len, std::string &dest)
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, std::string &dest)
   {
     if (p == ls->end())
       seek(off);
     while (len > 0) {
       if (p == ls->end())
 	throw end_of_buffer();
-      
+
       unsigned howmuch = p->length() - p_off;
       const char *c_str = p->c_str();
       if (len < howmuch)
@@ -992,7 +1054,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   }
 
-  void buffer::list::iterator::copy_all(list &dest)
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy_all(list &dest)
   {
     if (p == ls->end())
       seek(off);
@@ -1000,19 +1063,86 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       if (p == ls->end())
 	return;
       assert(p->length() > 0);
-      
+
       unsigned howmuch = p->length() - p_off;
       const char *c_str = p->c_str();
       dest.append(c_str + p_off, howmuch);
-      
+
       advance(howmuch);
     }
   }
-  
-  // copy data in
+
+  // explicitly instantiate only the iterator types we need, so we can hide the
+  // details in this compilation unit without introducing unnecessary link time
+  // dependencies.
+  template class buffer::list::iterator_impl<true>;
+  template class buffer::list::iterator_impl<false>;
+
+  void buffer::list::iterator::advance(int o)
+  {
+    buffer::list::iterator_impl<false>::advance(o);
+  }
+
+  void buffer::list::iterator::seek(unsigned o)
+  {
+    buffer::list::iterator_impl<false>::seek(o);
+  }
+
+  char buffer::list::iterator::operator*()
+  {
+    if (p == ls->end()) {
+      throw end_of_buffer();
+    }
+    return (*p)[p_off];
+  }
+
+  buffer::list::iterator& buffer::list::iterator::operator++()
+  {
+    buffer::list::iterator_impl<false>::operator++();
+    return *this;
+  }
+
+  buffer::ptr buffer::list::iterator::get_current_ptr()
+  {
+    if (p == ls->end()) {
+      throw end_of_buffer();
+    }
+    return ptr(*p, p_off, p->length() - p_off);
+  }
+
+  void buffer::list::iterator::copy(unsigned len, char *dest)
+  {
+    return buffer::list::iterator_impl<false>::copy(len, dest);
+  }
+
+  void buffer::list::iterator::copy(unsigned len, ptr &dest)
+  {
+    buffer::list::iterator_impl<false>::copy(len, dest);
+  }
+
+  void buffer::list::iterator::copy(unsigned len, list &dest)
+  {
+    buffer::list::iterator_impl<false>::copy(len, dest);
+  }
+
+  void buffer::list::iterator::copy(unsigned len, std::string &dest)
+  {
+    buffer::list::iterator_impl<false>::copy(len, dest);
+  }
+
+  void buffer::list::iterator::copy_all(list &dest)
+  {
+    buffer::list::iterator_impl<false>::copy_all(dest);
+  }
 
   void buffer::list::iterator::copy_in(unsigned len, const char *src)
   {
+    copy_in(len, src, true);
+  }
+
+  // copy data in
+  void buffer::list::iterator::copy_in(unsigned len, const char *src, bool crc_reset)
+  {
     // copy
     if (p == ls->end())
       seek(off);
@@ -1023,7 +1153,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       unsigned howmuch = p->length() - p_off;
       if (len < howmuch)
 	howmuch = len;
-      p->copy_in(p_off, howmuch, src);
+      p->copy_in(p_off, howmuch, src, crc_reset);
 	
       src += howmuch;
       len -= howmuch;
@@ -1052,6 +1182,15 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 
   // -- buffer::list --
 
+  buffer::list::list(list&& other)
+    : _buffers(std::move(other._buffers)),
+      _len(other._len),
+      _memcopy_count(other._memcopy_count),
+      last_p(this) {
+    append_buffer.swap(other.append_buffer);
+    other.clear();
+  }
+
   void buffer::list::swap(list& other)
   {
     std::swap(_len, other._len);
@@ -1063,7 +1202,12 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     other.last_p = other.begin();
   }
 
-  bool buffer::list::contents_equal(ceph::buffer::list& other)
+  bool buffer::list::contents_equal(buffer::list& other)
+  {
+    return static_cast<const buffer::list*>(this)->contents_equal(other);
+  }
+
+  bool buffer::list::contents_equal(const ceph::buffer::list& other) const
   {
     if (length() != other.length())
       return false;
@@ -1096,8 +1240,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 
     // byte-wise comparison
     if (false) {
-      bufferlist::iterator me = begin();
-      bufferlist::iterator him = other.begin();
+      bufferlist::const_iterator me = begin();
+      bufferlist::const_iterator him = other.begin();
       while (!me.end()) {
 	if (*me != *him)
 	  return false;
@@ -1224,13 +1368,14 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     for (std::list<ptr>::iterator it = _buffers.begin();
 	 it != _buffers.end();
 	 ++it) {
-      nb.copy_in(pos, it->length(), it->c_str());
+      nb.copy_in(pos, it->length(), it->c_str(), false);
       pos += it->length();
     }
     _memcopy_count += pos;
     _buffers.clear();
     if (nb.length())
       _buffers.push_back(nb);
+    invalidate_crc();
   }
 
   void buffer::list::rebuild_aligned(unsigned align)
@@ -1342,12 +1487,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     
   void buffer::list::copy_in(unsigned off, unsigned len, const char *src)
   {
+    copy_in(off, len, src, true);
+  }
+
+  void buffer::list::copy_in(unsigned off, unsigned len, const char *src, bool crc_reset)
+  {
     if (off + len > length())
       throw end_of_buffer();
     
     if (last_p.get_off() != off) 
       last_p.seek(off);
-    last_p.copy_in(len, src);
+    last_p.copy_in(len, src, crc_reset);
   }
 
   void buffer::list::copy_in(unsigned off, unsigned len, const list& src)
@@ -1363,33 +1513,31 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     unsigned gap = append_buffer.unused_tail_length();
     if (!gap) {
       // make a new append_buffer!
-      unsigned alen = CEPH_PAGE_SIZE;
-      append_buffer = create_page_aligned(alen);
+      append_buffer = create_aligned(CEPH_BUFFER_APPEND_SIZE, CEPH_BUFFER_APPEND_SIZE);
       append_buffer.set_length(0);   // unused, so far.
     }
-    append_buffer.append(c);
-    append(append_buffer, append_buffer.end() - 1, 1);	// add segment to the list
+    append(append_buffer, append_buffer.append(c) - 1, 1);	// add segment to the list
   }
-  
+
   void buffer::list::append(const char *data, unsigned len)
   {
     while (len > 0) {
       // put what we can into the existing append_buffer.
       unsigned gap = append_buffer.unused_tail_length();
       if (gap > 0) {
-	if (gap > len) gap = len;
-	//cout << "append first char is " << data[0] << ", last char is " << data[len-1] << std::endl;
-	append_buffer.append(data, gap);
-	append(append_buffer, append_buffer.end() - gap, gap);	// add segment to the list
-	len -= gap;
-	data += gap;
+        if (gap > len) gap = len;
+    //cout << "append first char is " << data[0] << ", last char is " << data[len-1] << std::endl;
+        append_buffer.append(data, gap);
+        append(append_buffer, append_buffer.end() - gap, gap);	// add segment to the list
+        len -= gap;
+        data += gap;
       }
       if (len == 0)
-	break;  // done!
+        break;  // done!
       
       // make a new append_buffer!
-      unsigned alen = CEPH_PAGE_SIZE * (((len-1) / CEPH_PAGE_SIZE) + 1);
-      append_buffer = create_page_aligned(alen);
+      unsigned alen = CEPH_BUFFER_APPEND_SIZE * (((len-1) / CEPH_BUFFER_APPEND_SIZE) + 1);
+      append_buffer = create_aligned(alen, CEPH_BUFFER_APPEND_SIZE);
       append_buffer.set_length(0);   // unused, so far.
     }
   }
@@ -1499,9 +1647,24 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
 
     if (off + len > curbuf->length()) {
-      // FIXME we'll just rebuild the whole list for now.
-      rebuild();
-      return c_str() + orig_off;
+      bufferlist tmp;
+      unsigned l = off + len;
+
+      do {
+	if (l >= curbuf->length())
+	  l -= curbuf->length();
+	else
+	  l = 0;
+	tmp.append(*curbuf);
+	curbuf = _buffers.erase(curbuf);
+
+      } while (curbuf != _buffers.end() && l > 0);
+
+      assert(l == 0);
+
+      tmp.rebuild();
+      _buffers.insert(curbuf, tmp._buffers.front());
+      return tmp.c_str() + off;
     }
 
     return curbuf->c_str() + off;
@@ -1698,8 +1861,8 @@ ssize_t buffer::list::read_fd(int fd, size_t len)
     // available for raw_pipe until we actually inspect the data
     return 0;
   }
-  int s = ROUND_UP_TO(len, CEPH_PAGE_SIZE);
-  bufferptr bp = buffer::create_page_aligned(s);
+  int s = ROUND_UP_TO(len, CEPH_BUFFER_APPEND_SIZE);
+  bufferptr bp = buffer::create_aligned(s, CEPH_BUFFER_APPEND_SIZE);
   ssize_t ret = safe_read(fd, (void*)bp.c_str(), len);
   if (ret >= 0) {
     bp.set_length(ret);
@@ -1805,6 +1968,18 @@ int buffer::list::write_fd(int fd) const
   return 0;
 }
 
+void buffer::list::prepare_iov(std::vector<iovec> *piov) const
+{
+  piov->resize(_buffers.size());
+  unsigned n = 0;
+  for (std::list<buffer::ptr>::const_iterator p = _buffers.begin();
+       p != _buffers.end();
+       ++p, ++n) {
+    (*piov)[n].iov_base = (void *)p->c_str();
+    (*piov)[n].iov_len = p->length();
+  }
+}
+
 int buffer::list::write_fd_zero_copy(int fd) const
 {
   if (!can_zero_copy())
@@ -1867,6 +2042,15 @@ __u32 buffer::list::crc32c(__u32 crc) const
   return crc;
 }
 
+void buffer::list::invalidate_crc()
+{
+  for (std::list<ptr>::const_iterator p = _buffers.begin(); p != _buffers.end(); ++p) {
+    raw *r = p->get_raw();
+    if (r) {
+      r->invalidate_crc();
+    }
+  }
+}
 
 /**
  * Binary write all contents to a C++ stream
@@ -1918,5 +2102,34 @@ std::ostream& operator<<(std::ostream& out, const buffer::raw &r) {
   return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref.read() << ")";
 }
 
+std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
+  if (bp.have_raw())
+    out << "buffer::ptr(" << bp.offset() << "~" << bp.length()
+	<< " " << (void*)bp.c_str()
+	<< " in raw " << (void*)bp.raw_c_str()
+	<< " len " << bp.raw_length()
+	<< " nref " << bp.raw_nref() << ")";
+  else
+    out << "buffer:ptr(" << bp.offset() << "~" << bp.length() << " no raw)";
+  return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
+  out << "buffer::list(len=" << bl.length() << "," << std::endl;
+
+  std::list<buffer::ptr>::const_iterator it = bl.buffers().begin();
+  while (it != bl.buffers().end()) {
+    out << "\t" << *it;
+    if (++it == bl.buffers().end()) break;
+    out << "," << std::endl;
+  }
+  out << std::endl << ")";
+  return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const buffer::error& e)
+{
+  return out << e.what();
+}
 
 }
diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc
index a76c424..1a60f2e 100644
--- a/src/common/ceph_argparse.cc
+++ b/src/common/ceph_argparse.cc
@@ -45,6 +45,26 @@
 #undef generic_dout
 #undef dendl
 
+struct strict_str_convert {
+  const char *str;
+  std::string *err;
+  strict_str_convert(const char *str,  std::string *err)
+    : str(str), err(err) {}
+
+  inline operator float() const
+  {
+    return strict_strtof(str, err);
+  }
+  inline operator int() const
+  {
+    return strict_strtol(str, 10, err);
+  }
+  inline operator long long() const
+  {
+    return  strict_strtoll(str, 10, err);
+  }
+};
+
 void string_to_vec(std::vector<std::string>& args, std::string argstr)
 {
   istringstream iss(argstr);
@@ -131,6 +151,50 @@ void vec_to_argv(const char *argv0, std::vector<const char*>& args,
     (*argv)[(*argc)++] = args[i];
 }
 
+void ceph_arg_value_type(const char * nextargstr, bool *bool_option, bool *bool_numeric)
+{
+  bool is_numeric = true;
+  bool is_float = false;
+  bool is_option;
+
+  if (nextargstr == NULL) {
+    return;
+  }
+
+  if (strlen(nextargstr) < 2) {
+    is_option = false;
+  } else {
+    is_option = (nextargstr[0] == '-') && (nextargstr[1] == '-');
+  }
+
+  for (unsigned int i = 0; i < strlen(nextargstr); i++) {
+    if (!(nextargstr[i] >= '0' && nextargstr[i] <= '9')) {
+      // May be negative numeral value
+      if ((i == 0) && (strlen(nextargstr) >= 2))  {
+	if (nextargstr[0] == '-')
+	  continue;
+      }
+      if ( (nextargstr[i] == '.') && (is_float == false) ) {
+        is_float = true;
+        continue;
+      }
+        
+      is_numeric = false;
+      break;
+    }
+  }
+
+  // -<option>
+  if (nextargstr[0] == '-' && is_numeric == false) {
+    is_option = true;
+  }
+
+  *bool_option = is_option;
+  *bool_numeric = is_numeric;
+
+  return;
+}
+
 bool parse_ip_port_vec(const char *s, vector<entity_addr_t>& vec)
 {
   const char *p = s;
@@ -277,8 +341,9 @@ bool ceph_argparse_binary_flag(std::vector<const char*> &args,
   return r;
 }
 
-static bool va_ceph_argparse_witharg(std::vector<const char*> &args,
-	std::vector<const char*>::iterator &i, std::string *ret, va_list ap)
+static int va_ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, va_list ap)
 {
   const char *first = *i;
   char tmp[strlen(first)+1];
@@ -289,7 +354,7 @@ static bool va_ceph_argparse_witharg(std::vector<const char*> &args,
   while (1) {
     const char *a = va_arg(ap, char*);
     if (a == NULL)
-      return false;
+      return 0;
     int strlen_a = strlen(a);
     char a2[strlen_a+1];
     dashes_to_underscores(a, a2);
@@ -297,101 +362,98 @@ static bool va_ceph_argparse_witharg(std::vector<const char*> &args,
       if (first[strlen_a] == '=') {
 	*ret = first + strlen_a + 1;
 	i = args.erase(i);
-	return true;
+	return 1;
       }
       else if (first[strlen_a] == '\0') {
 	// find second part (or not)
 	if (i+1 == args.end()) {
-	  cerr << "Option " << *i << " requires an argument." << std::endl;
-	  _exit(1);
+	  oss << "Option " << *i << " requires an argument." << std::endl;
+	  i = args.erase(i);
+	  return -EINVAL;
 	}
 	i = args.erase(i);
 	*ret = *i;
 	i = args.erase(i);
-	return true;
+	return 1;
       }
     }
   }
 }
 
+template<class T>
 bool ceph_argparse_witharg(std::vector<const char*> &args,
-	std::vector<const char*>::iterator &i, std::string *ret, ...)
+	std::vector<const char*>::iterator &i, T *ret,
+	std::ostream &oss, ...)
 {
-  bool r;
-  va_list ap;
-  va_start(ap, ret);
-  r = va_ceph_argparse_witharg(args, i, ret, ap);
-  va_end(ap);
-  return r;
-}
-
-bool ceph_argparse_withint(std::vector<const char*> &args,
-	std::vector<const char*>::iterator &i, int *ret,
-	std::ostream *oss, ...)
-{
-  bool r;
+  int r;
   va_list ap;
+  bool is_option = false;
+  bool is_numeric = true;
   std::string str;
   va_start(ap, oss);
-  r = va_ceph_argparse_witharg(args, i, &str, ap);
+  r = va_ceph_argparse_witharg(args, i, &str, oss, ap);
   va_end(ap);
-  if (!r) {
+  if (r == 0) {
     return false;
+  } else if (r < 0) {
+    return true;
+  }
+
+  ceph_arg_value_type(str.c_str(), &is_option, &is_numeric);
+  if ((is_option == true) || (is_numeric == false)) {
+    *ret = EXIT_FAILURE;
+    if (is_option == true) {
+      oss << "Missing option value";
+    } else {
+      oss << "The option value '" << str << "' is invalid";
+    }
+    return true;
   }
 
   std::string err;
-  int myret = strict_strtol(str.c_str(), 10, &err);
+  T myret = strict_str_convert(str.c_str(), &err);
   *ret = myret;
   if (!err.empty()) {
-    *oss << err;
+    oss << err;
   }
   return true;
 }
 
-bool ceph_argparse_withlonglong(std::vector<const char*> &args,
+template bool ceph_argparse_witharg<int>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream &oss, ...);
+
+template bool ceph_argparse_witharg<long long>(std::vector<const char*> &args,
 	std::vector<const char*>::iterator &i, long long *ret,
-	std::ostream *oss, ...)
+	std::ostream &oss, ...);
+
+template bool ceph_argparse_witharg<float>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, float *ret,
+	std::ostream &oss, ...);
+
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, ...)
 {
-  bool r;
+  int r;
   va_list ap;
-  std::string str;
   va_start(ap, oss);
-  r = va_ceph_argparse_witharg(args, i, &str, ap);
+  r = va_ceph_argparse_witharg(args, i, ret, oss, ap);
   va_end(ap);
-  if (!r) {
-    return false;
-  }
-
-  std::string err;
-  long long myret = strict_strtoll(str.c_str(), 10, &err);
-  *ret = myret;
-  if (!err.empty()) {
-    *oss << err;
-  }
-  return true;
+  return r != 0;
 }
 
-bool ceph_argparse_withfloat(std::vector<const char*> &args,
-	std::vector<const char*>::iterator &i, float *ret,
-	std::ostream *oss, ...)
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret, ...)
 {
-  bool r;
+  int r;
   va_list ap;
-  std::string str;
-  va_start(ap, oss);
-  r = va_ceph_argparse_witharg(args, i, &str, ap);
+  va_start(ap, ret);
+  r = va_ceph_argparse_witharg(args, i, ret, cerr, ap);
   va_end(ap);
-  if (!r) {
-    return false;
-  }
-
-  std::string err;
-  float myret = strict_strtof(str.c_str(), &err);
-  *ret = myret;
-  if (!err.empty()) {
-    *oss << err;
-  }
-  return true;
+  if (r < 0)
+    _exit(1);
+  return r != 0;
 }
 
 CephInitParameters ceph_argparse_early_args
@@ -459,6 +521,8 @@ static void generic_usage(bool is_server)
   --id/-i ID        set ID portion of my name\n\
   --name/-n TYPE.ID set name\n\
   --cluster NAME    set cluster name (default: ceph)\n\
+  --setuser USER    set uid to user or uid (and gid to user's gid)\n\
+  --setgroup GROUP  set gid to group or gid\n\
   --version         show version and quit\n\
 " << std::endl;
 
diff --git a/src/common/ceph_argparse.h b/src/common/ceph_argparse.h
index 3ef0251..6ad0234 100644
--- a/src/common/ceph_argparse.h
+++ b/src/common/ceph_argparse.h
@@ -56,22 +56,20 @@ bool ceph_argparse_double_dash(std::vector<const char*> &args,
 bool ceph_argparse_flag(std::vector<const char*> &args,
 	std::vector<const char*>::iterator &i, ...);
 bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, ...);
+bool ceph_argparse_witharg(std::vector<const char*> &args,
 	std::vector<const char*>::iterator &i, std::string *ret, ...);
+template<class T>
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, T *ret,
+	std::ostream &oss, ...);
 bool ceph_argparse_binary_flag(std::vector<const char*> &args,
 	std::vector<const char*>::iterator &i, int *ret,
 	std::ostream *oss, ...);
 extern CephInitParameters ceph_argparse_early_args
 	    (std::vector<const char*>& args, uint32_t module_type, int flags,
 	     std::string *cluster, std::string *conf_file_list);
-extern bool ceph_argparse_withint(std::vector<const char*> &args,
-	std::vector<const char*>::iterator &i, int *ret,
-	std::ostream *oss, ...);
-extern bool ceph_argparse_withfloat(std::vector<const char*> &args,
-	std::vector<const char*>::iterator &i, float *ret,
-	std::ostream *oss, ...);
-extern bool ceph_argparse_withlonglong(std::vector<const char*> &args,
-	std::vector<const char*>::iterator &i, long long *ret,
-	std::ostream *oss, ...);
 extern void generic_server_usage();
 extern void generic_client_usage();
 
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 50346ed..7383ed7 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -106,6 +106,9 @@ public:
         _reopen_logs = false;
       }
       _cct->_heartbeat_map->check_touch_file();
+
+      // refresh the perf counters
+      _cct->refresh_perf_values();
     }
     return NULL;
   }
@@ -230,7 +233,8 @@ bool CephContext::check_experimental_feature_enabled(const std::string& feat,
 						     std::ostream *message)
 {
   ceph_spin_lock(&_feature_lock);
-  bool enabled = _experimental_features.count(feat);
+  bool enabled = (_experimental_features.count(feat) ||
+		  _experimental_features.count("*"));
   ceph_spin_unlock(&_feature_lock);
 
   if (enabled) {
@@ -408,11 +412,13 @@ CephContext::CephContext(uint32_t module_type_)
     _heartbeat_map(NULL),
     _crypto_none(NULL),
     _crypto_aes(NULL),
-    _lockdep_obs(NULL)
+    _lockdep_obs(NULL),
+    _cct_perf(NULL)
 {
   ceph_spin_init(&_service_thread_lock);
   ceph_spin_init(&_associated_objs_lock);
   ceph_spin_init(&_feature_lock);
+  ceph_spin_init(&_cct_perf_lock);
 
   _log = new ceph::log::Log(&_conf->subsys);
   _log->start();
@@ -427,6 +433,7 @@ CephContext::CephContext(uint32_t module_type_)
   _conf->add_observer(_lockdep_obs);
 
   _perf_counters_collection = new PerfCountersCollection(this);
+ 
   _admin_socket = new AdminSocket(this);
   _heartbeat_map = new HeartbeatMap(this);
 
@@ -448,18 +455,24 @@ CephContext::CephContext(uint32_t module_type_)
   _admin_socket->register_command("log dump", "log dump", _admin_hook, "dump recent log entries to log file");
   _admin_socket->register_command("log reopen", "log reopen", _admin_hook, "reopen log file");
 
-  _crypto_none = new CryptoNone;
-  _crypto_aes = new CryptoAES;
+  _crypto_none = CryptoHandler::create(CEPH_CRYPTO_NONE);
+  _crypto_aes = CryptoHandler::create(CEPH_CRYPTO_AES);
 }
 
 CephContext::~CephContext()
 {
   join_service_thread();
 
-  for (map<string, AssociatedSingletonObject*>::iterator it = _associated_objs.begin();
+  for (map<string, SingletonWrapper*>::iterator it = _associated_objs.begin();
        it != _associated_objs.end(); ++it)
     delete it->second;
 
+  if (_cct_perf) {
+    _perf_counters_collection->remove(_cct_perf);
+    delete _cct_perf;
+    _cct_perf = NULL;
+  }
+
   _admin_socket->unregister_command("perfcounters_dump");
   _admin_socket->unregister_command("perf dump");
   _admin_socket->unregister_command("1");
@@ -505,6 +518,7 @@ CephContext::~CephContext()
   ceph_spin_destroy(&_service_thread_lock);
   ceph_spin_destroy(&_associated_objs_lock);
   ceph_spin_destroy(&_feature_lock);
+  ceph_spin_destroy(&_cct_perf_lock);
 
   delete _crypto_none;
   delete _crypto_aes;
@@ -577,6 +591,41 @@ PerfCountersCollection *CephContext::get_perfcounters_collection()
   return _perf_counters_collection;
 }
 
+void CephContext::enable_perf_counter()
+{
+  PerfCountersBuilder plb(this, "cct", l_cct_first, l_cct_last);
+  plb.add_u64_counter(l_cct_total_workers, "total_workers", "Total workers");
+  plb.add_u64_counter(l_cct_unhealthy_workers, "unhealthy_workers", "Unhealthy workers");
+  PerfCounters *perf_tmp = plb.create_perf_counters();
+
+  ceph_spin_lock(&_cct_perf_lock);
+  assert(_cct_perf == NULL);
+  _cct_perf = perf_tmp;
+  ceph_spin_unlock(&_cct_perf_lock);
+
+  _perf_counters_collection->add(_cct_perf);
+}
+
+void CephContext::disable_perf_counter()
+{
+  _perf_counters_collection->remove(_cct_perf);
+
+  ceph_spin_lock(&_cct_perf_lock);
+  delete _cct_perf;
+  _cct_perf = NULL;
+  ceph_spin_unlock(&_cct_perf_lock);
+}
+
+void CephContext::refresh_perf_values()
+{
+  ceph_spin_lock(&_cct_perf_lock);
+  if (_cct_perf) {
+    _cct_perf->set(l_cct_total_workers, _heartbeat_map->get_total_workers());
+    _cct_perf->set(l_cct_unhealthy_workers, _heartbeat_map->get_unhealthy_workers());
+  }
+  ceph_spin_unlock(&_cct_perf_lock);
+}
+
 AdminSocket *CephContext::get_admin_socket()
 {
   return _admin_socket;
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index a9ffde0..037f2d8 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -15,25 +15,26 @@
 #ifndef CEPH_CEPHCONTEXT_H
 #define CEPH_CEPHCONTEXT_H
 
-#include <iostream>
+#include <iosfwd>
 #include <stdint.h>
 #include <string>
 #include <set>
 
+#include "include/assert.h"
 #include "include/buffer.h"
 #include "include/atomic.h"
 #include "common/cmdparse.h"
 #include "include/Spinlock.h"
+#include <boost/noncopyable.hpp>
 
 class AdminSocket;
 class CephContextServiceThread;
 class PerfCountersCollection;
+class PerfCounters;
 class md_config_obs_t;
 struct md_config_t;
 class CephContextHook;
 class CephContextObs;
-class CryptoNone;
-class CryptoAES;
 class CryptoHandler;
 
 namespace ceph {
@@ -61,10 +62,6 @@ private:
   ~CephContext();
   atomic_t nref;
 public:
-  class AssociatedSingletonObject {
-   public:
-    virtual ~AssociatedSingletonObject() {}
-  };
   CephContext *get() {
     nref.inc();
     return this;
@@ -97,6 +94,22 @@ public:
   }
 
   /**
+   * Enable the performance counter, currently we only have counter for the
+   * number of total/unhealthy workers.
+   */
+  void enable_perf_counter();
+
+  /**
+   * Disable the performance counter.
+   */
+  void disable_perf_counter();
+
+  /**
+   * Refresh perf counter values.
+   */
+  void refresh_perf_values();
+
+  /**
    * Get the admin socket associated with this CephContext.
    *
    * Currently there is always an admin socket object,
@@ -117,9 +130,12 @@ public:
     ceph_spin_lock(&_associated_objs_lock);
     if (!_associated_objs.count(name)) {
       p = new T(this);
-      _associated_objs[name] = reinterpret_cast<AssociatedSingletonObject*>(p);
+      _associated_objs[name] = new TypedSingletonWrapper<T>(p);
     } else {
-      p = reinterpret_cast<T*>(_associated_objs[name]);
+      TypedSingletonWrapper<T> *wrapper =
+        dynamic_cast<TypedSingletonWrapper<T> *>(_associated_objs[name]);
+      assert(wrapper != NULL);
+      p = wrapper->singleton;
     }
     ceph_spin_unlock(&_associated_objs_lock);
   }
@@ -134,6 +150,21 @@ public:
 					  std::ostream *message);
 
 private:
+  struct SingletonWrapper : boost::noncopyable {
+    virtual ~SingletonWrapper() {}
+  };
+
+  template <typename T>
+  struct TypedSingletonWrapper : public SingletonWrapper {
+    TypedSingletonWrapper(T *p) : singleton(p) {
+    }
+    virtual ~TypedSingletonWrapper() {
+      delete singleton;
+    }
+
+    T *singleton;
+  };
+
   CephContext(const CephContext &rhs);
   CephContext &operator=(const CephContext &rhs);
 
@@ -167,11 +198,11 @@ private:
   ceph::HeartbeatMap *_heartbeat_map;
 
   ceph_spinlock_t _associated_objs_lock;
-  std::map<std::string, AssociatedSingletonObject*> _associated_objs;
+  std::map<std::string, SingletonWrapper*> _associated_objs;
 
   // crypto
-  CryptoNone *_crypto_none;
-  CryptoAES *_crypto_aes;
+  CryptoHandler *_crypto_none;
+  CryptoHandler *_crypto_aes;
 
   // experimental
   CephContextObs *_cct_obs;
@@ -180,6 +211,15 @@ private:
 
   md_config_obs_t *_lockdep_obs;
 
+  enum {
+    l_cct_first,
+    l_cct_total_workers,
+    l_cct_unhealthy_workers,
+    l_cct_last
+  };
+  PerfCounters *_cct_perf;
+  ceph_spinlock_t _cct_perf_lock;
+
   friend class CephContextObs;
 };
 
diff --git a/src/common/ceph_crypto.cc b/src/common/ceph_crypto.cc
index f15ef09..de5a03b 100644
--- a/src/common/ceph_crypto.cc
+++ b/src/common/ceph_crypto.cc
@@ -36,7 +36,7 @@ ceph::crypto::HMACSHA1::~HMACSHA1()
 {
 }
 
-#elif USE_NSS
+#elif defined(USE_NSS)
 
 // for SECMOD_RestartModules()
 #include <secmod.h>
diff --git a/src/common/ceph_crypto.h b/src/common/ceph_crypto.h
index 686efb4..10055f6 100644
--- a/src/common/ceph_crypto.h
+++ b/src/common/ceph_crypto.h
@@ -11,9 +11,9 @@
 #ifdef USE_CRYPTOPP
 # define CRYPTOPP_ENABLE_NAMESPACE_WEAK 1
 #include <string.h>
-# include <cryptopp/md5.h>
-# include <cryptopp/sha.h>
-# include <cryptopp/hmac.h>
+#include <cryptopp/md5.h>
+#include <cryptopp/sha.h>
+#include <cryptopp/hmac.h>
 
 // reinclude our assert to clobber the system one
 # include "include/assert.h"
@@ -38,7 +38,7 @@ namespace ceph {
     };
   }
 }
-#elif USE_NSS
+#elif defined(USE_NSS)
 // you *must* use CRYPTO_CXXFLAGS in Makefile.am for including this include
 # include <nss.h>
 # include <pk11pub.h>
diff --git a/src/common/ceph_fs.cc b/src/common/ceph_fs.cc
index 3172c57..6b69e26 100644
--- a/src/common/ceph_fs.cc
+++ b/src/common/ceph_fs.cc
@@ -37,7 +37,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
 
 int ceph_flags_to_mode(int flags)
 {
-	int mode = 0;
+	/* because CEPH_FILE_MODE_PIN is zero, so mode = -1 is error */
+	int mode = -1;
 
 #ifdef O_DIRECTORY  /* fixme */
 	if ((flags & O_DIRECTORY) == O_DIRECTORY)
diff --git a/src/common/ceph_json.h b/src/common/ceph_json.h
index 604230a..e69055b 100644
--- a/src/common/ceph_json.h
+++ b/src/common/ceph_json.h
@@ -1,7 +1,7 @@
 #ifndef CEPH_JSON_H
 #define CEPH_JSON_H
 
-#include <iostream>
+#include <iosfwd>
 #include <include/types.h>
 #include <list>
 
@@ -198,7 +198,7 @@ void decode_json_obj(multimap<K, V>& m, JSONObj *obj)
     JSONObj *o = *iter;
     JSONDecoder::decode_json("key", key, o);
     JSONDecoder::decode_json("val", val, o);
-    m.insert(make_pair<K, V>(key, val));
+    m.insert(make_pair(key, val));
   }
 }
 
diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
index 41203bc..d1a418d 100644
--- a/src/common/ceph_strings.cc
+++ b/src/common/ceph_strings.cc
@@ -64,6 +64,7 @@ const char *ceph_mds_state_name(int s)
 		/* down and out */
 	case CEPH_MDS_STATE_DNE:        return "down:dne";
 	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+	case CEPH_MDS_STATE_DAMAGED:   return "down:damaged";
 		/* up and out */
 	case CEPH_MDS_STATE_BOOT:       return "up:boot";
 	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
@@ -128,6 +129,7 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_LSSNAP: return "lssnap";
 	case CEPH_MDS_OP_MKSNAP: return "mksnap";
 	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
 	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
 	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
 	case CEPH_MDS_OP_FRAGMENTDIR: return "fragmentdir";
diff --git a/src/common/cmdparse.cc b/src/common/cmdparse.cc
index 3ca3bbd..6c1aae8 100644
--- a/src/common/cmdparse.cc
+++ b/src/common/cmdparse.cc
@@ -224,7 +224,7 @@ handle_bad_get(CephContext *cct, string k, const char *tname)
   BackTrace bt(1);
   ostringstream oss;
   bt.print(oss);
-  lderr(cct) << oss << dendl;
+  lderr(cct) << oss.rdbuf() << dendl;
   if (status == 0)
     free((char *)typestr);
 }
diff --git a/src/common/config.cc b/src/common/config.cc
index 3b0ed62..c26b826 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -98,24 +98,26 @@ struct config_option config_optionsp[] = {
 
 const int NUM_CONFIG_OPTIONS = sizeof(config_optionsp) / sizeof(config_option);
 
-bool ceph_resolve_file_search(const std::string& filename_list,
-			      std::string& result)
+int ceph_resolve_file_search(const std::string& filename_list,
+			     std::string& result)
 {
   list<string> ls;
   get_str_list(filename_list, ls);
 
+  int ret = -ENOENT;
   list<string>::iterator iter;
   for (iter = ls.begin(); iter != ls.end(); ++iter) {
     int fd = ::open(iter->c_str(), O_RDONLY);
-    if (fd < 0)
+    if (fd < 0) {
+      ret = -errno;
       continue;
-
+    }
     close(fd);
     result = *iter;
-    return true;
+    return 0;
   }
 
-  return false;
+  return ret;
 }
 
 md_config_t::md_config_t()
@@ -484,6 +486,7 @@ int md_config_t::parse_option(std::vector<const char*>& args,
   }
 
   for (o = 0; o < NUM_CONFIG_OPTIONS; ++o) {
+    ostringstream err;
     const config_option *opt = config_optionsp + o;
     std::string as_option("--");
     as_option += opt->name;
@@ -507,8 +510,13 @@ int md_config_t::parse_option(std::vector<const char*>& args,
 	}
       }
     }
-    else if (ceph_argparse_witharg(args, i, &val,
+    else if (ceph_argparse_witharg(args, i, &val, err,
 				   as_option.c_str(), (char*)NULL)) {
+      if (!err.str().empty()) {
+	*oss << err.str();
+	ret = -EINVAL;
+	break;
+      }
       if (oss && (
 		  ((opt->type == OPT_STR) || (opt->type == OPT_ADDR) ||
 		   (opt->type == OPT_UUID)) &&
@@ -1142,8 +1150,3 @@ void md_config_t::diff(
       diff->insert(make_pair(opt->name, make_pair(local_val, other_val)));
   }
 }
-
-md_config_obs_t::
-~md_config_obs_t()
-{
-}
diff --git a/src/common/config.h b/src/common/config.h
index d4a84be..41c999d 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -255,8 +255,8 @@ typedef enum {
 	OPT_ADDR, OPT_U32, OPT_U64, OPT_UUID
 } opt_type_t;
 
-bool ceph_resolve_file_search(const std::string& filename_list,
-			      std::string& result);
+int ceph_resolve_file_search(const std::string& filename_list,
+			     std::string& result);
 
 struct config_option {
   const char *name;
diff --git a/src/common/config_obs.h b/src/common/config_obs.h
index f15d934..723f739 100644
--- a/src/common/config_obs.h
+++ b/src/common/config_obs.h
@@ -20,12 +20,24 @@
 
 struct md_config_t;
 
+/** @brief Base class for configuration observers.
+ * Use this as a base class for your object if it has to respond to configuration changes,
+ * for example by updating some values or modifying its behavior.
+ * Subscribe for configuration changes by calling the md_config_t::add_observer() method
+ * and unsubscribe using md_config_t::remove_observer().
+ */
 class md_config_obs_t {
 public:
-  virtual ~md_config_obs_t();
+  virtual ~md_config_obs_t() {}
+  /** @brief Get a table of strings specifying the configuration keys in which the object is interested.
+   * This is called when the object is subscribed to configuration changes with add_observer().
+   * The returned table should not be freed until the observer is removed with remove_observer().
+   * Note that it is not possible to change the set of tracked keys without re-subscribing. */
   virtual const char** get_tracked_conf_keys() const = 0;
+  /// React to a configuration change.
   virtual void handle_conf_change(const struct md_config_t *conf,
 				  const std::set <std::string> &changed) = 0;
+  /// Unused for now
   virtual void handle_subsys_change(const struct md_config_t *conf,
 				    const std::set<int>& changed) { }
 };
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 95d3a4b..ab52520 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -26,14 +26,19 @@ OPTION(lockdep, OPT_BOOL, false)
 OPTION(lockdep_force_backtrace, OPT_BOOL, false) // always gather current backtrace at every lock
 OPTION(run_dir, OPT_STR, "/var/run/ceph")       // the "/var/run/ceph" dir, created on daemon startup
 OPTION(admin_socket, OPT_STR, "$run_dir/$cluster-$name.asok") // default changed by common_preinit()
+OPTION(crushtool, OPT_STR, "crushtool") // crushtool utility path
 
 OPTION(daemonize, OPT_BOOL, false) // default changed by common_preinit()
+OPTION(setuser, OPT_STR, "")        // uid or user name
+OPTION(setgroup, OPT_STR, "")        // gid or group name
+OPTION(setuser_match_path, OPT_STR, "")  // make setuser/group conditional on this patch matching ownership
 OPTION(pid_file, OPT_STR, "") // default changed by common_preinit()
 OPTION(chdir, OPT_STR, "/")
 OPTION(max_open_files, OPT_LONGLONG, 0)
 OPTION(restapi_log_level, OPT_STR, "") 	// default set by Python code
 OPTION(restapi_base_url, OPT_STR, "")	// "
 OPTION(fatal_signal_handlers, OPT_BOOL, true)
+OPTION(erasure_code_dir, OPT_STR, CEPH_PKGLIBDIR"/erasure-code") // default location for erasure-code plugins
 
 OPTION(log_file, OPT_STR, "/var/log/ceph/$cluster-$name.log") // default changed by common_preinit()
 OPTION(log_max_new, OPT_INT, 1000) // default changed by common_preinit()
@@ -67,7 +72,7 @@ OPTION(enable_experimental_unrecoverable_data_corrupting_features, OPT_STR, "")
 OPTION(xio_trace_mempool, OPT_BOOL, false) // mempool allocation counters
 OPTION(xio_trace_msgcnt, OPT_BOOL, false) // incoming/outgoing msg counters
 OPTION(xio_trace_xcon, OPT_BOOL, false) // Xio message encode/decode trace
-OPTION(xio_queue_depth, OPT_INT, 512) // depth of Accelio msg queue
+OPTION(xio_queue_depth, OPT_INT, 128) // depth of Accelio msg queue
 OPTION(xio_mp_min, OPT_INT, 128) // default min mempool size
 OPTION(xio_mp_max_64, OPT_INT, 65536) // max 64-byte chunks (buffer is 40)
 OPTION(xio_mp_max_256, OPT_INT, 8192) // max 256-byte chunks
@@ -75,6 +80,14 @@ OPTION(xio_mp_max_1k, OPT_INT, 8192) // max 1K chunks
 OPTION(xio_mp_max_page, OPT_INT, 4096) // max 1K chunks
 OPTION(xio_mp_max_hint, OPT_INT, 4096) // max size-hint chunks
 OPTION(xio_portal_threads, OPT_INT, 2) // xio portal threads per messenger
+OPTION(xio_transport_type, OPT_STR, "rdma") // xio transport type: {rdma or tcp}
+OPTION(xio_max_send_inline, OPT_INT, 512) // xio maximum threshold to send inline
+
+OPTION(async_compressor_enabled, OPT_BOOL, false)
+OPTION(async_compressor_type, OPT_STR, "snappy")
+OPTION(async_compressor_threads, OPT_INT, 2)
+OPTION(async_compressor_thread_timeout, OPT_INT, 5)
+OPTION(async_compressor_thread_suicide_timeout, OPT_INT, 30)
 
 DEFAULT_SUBSYS(0, 5)
 SUBSYS(lockdep, 0, 1)
@@ -120,6 +133,8 @@ SUBSYS(asok, 1, 5)
 SUBSYS(throttle, 1, 1)
 SUBSYS(refs, 0, 0)
 SUBSYS(xio, 1, 5)
+SUBSYS(compressor, 1, 5)
+SUBSYS(newstore, 1, 5)
 
 OPTION(key, OPT_STR, "")
 OPTION(keyfile, OPT_STR, "")
@@ -196,14 +211,18 @@ OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32)     // max op age before we get conce
 OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
 OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false)  // allow primary_temp to be set in the osdmap
 OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false)  // allow primary_affinity to be set in the osdmap
+OPTION(mon_osd_prime_pg_temp, OPT_BOOL, false)  // prime osdmap with pg mapping changes
+OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT, .5)  // max time to spend priming
+OPTION(mon_osd_pool_ec_fast_read, OPT_BOOL, false) // whether turn on fast read on the pool or not
 OPTION(mon_stat_smooth_intervals, OPT_INT, 2)  // smooth stats over last N PGMap maps
 OPTION(mon_lease, OPT_FLOAT, 5)       // lease interval
-OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease
-OPTION(mon_lease_ack_timeout, OPT_FLOAT, 10.0) // on leader, if lease isn't acked by all peons
+OPTION(mon_lease_renew_interval_factor, OPT_FLOAT, .6) // on leader, to renew the lease
+OPTION(mon_lease_ack_timeout_factor, OPT_FLOAT, 2.0) // on leader, if lease isn't acked by all peons
+OPTION(mon_accept_timeout_factor, OPT_FLOAT, 2.0)    // on leader, if paxos update isn't accepted
+
 OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between monitors
 OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings
 OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
-OPTION(mon_accept_timeout, OPT_FLOAT, 10.0)    // on leader, if paxos update isn't accepted
 OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
 OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
 OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30)  // min # pgs per (in) osd before we warn the admin
@@ -242,6 +261,11 @@ OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE, 60.0)
 OPTION(mon_data_avail_crit, OPT_INT, 5)
 OPTION(mon_data_avail_warn, OPT_INT, 30)
 OPTION(mon_data_size_warn, OPT_U64, 15*1024*1024*1024) // issue a warning when the monitor's data store goes over 15GB (in bytes)
+OPTION(mon_scrub_interval, OPT_INT, 3600*24) // once a day
+OPTION(mon_scrub_timeout, OPT_INT, 60*5) // let's give it 5 minutes; why not.
+OPTION(mon_scrub_max_keys, OPT_INT, 100) // max number of keys to scrub each time
+OPTION(mon_scrub_inject_crc_mismatch, OPT_DOUBLE, 0.0) // probability of injected crc mismatch [0.0, 1.0]
+OPTION(mon_scrub_inject_missing_keys, OPT_DOUBLE, 0.0) // probability of injected missing keys [0.0, 1.0]
 OPTION(mon_config_key_max_entry_size, OPT_INT, 4096) // max num bytes per config-key entry
 OPTION(mon_sync_timeout, OPT_DOUBLE, 60.0)
 OPTION(mon_sync_max_payload_size, OPT_U32, 1048576) // max size for a sync chunk payload (say, 1MB)
@@ -254,10 +278,13 @@ OPTION(mon_osd_min_down_reporters, OPT_INT, 1)   // number of OSDs who need to r
 OPTION(mon_osd_min_down_reports, OPT_INT, 3)     // number of times a down OSD must be reported for it to count
 OPTION(mon_osd_force_trim_to, OPT_INT, 0)   // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care)
 OPTION(mon_mds_force_trim_to, OPT_INT, 0)   // force mon to trim mdsmaps to this point (dangerous, use with care)
-OPTION(crushtool, OPT_STR, "crushtool")
+
+// monitor debug options
+OPTION(mon_debug_deprecated_as_obsolete, OPT_BOOL, false) // consider deprecated commands as obsolete
 
 // dump transactions
 OPTION(mon_debug_dump_transactions, OPT_BOOL, false)
+OPTION(mon_debug_dump_json, OPT_BOOL, false)
 OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump")
 OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE, 10.0)      // seconds
 OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE, 0) // range [0, 1]
@@ -326,6 +353,7 @@ OPTION(client_debug_force_sync_read, OPT_BOOL, false)     // always read synchro
 OPTION(client_debug_inject_tick_delay, OPT_INT, 0) // delay the client tick for a number of seconds
 OPTION(client_max_inline_size, OPT_U64, 4096)
 OPTION(client_inject_release_failure, OPT_BOOL, false)  // synthetic client bug for testing
+OPTION(client_inject_fixed_oldest_tid, OPT_BOOL, false)  // synthetic client bug for testing
 // note: the max amount of "in flight" dirty data is roughly (max - target)
 OPTION(fuse_use_invalidate_cb, OPT_BOOL, false) // use fuse 2.8+ invalidate callback to keep page cache consistent
 OPTION(fuse_allow_other, OPT_BOOL, true)
@@ -334,9 +362,11 @@ OPTION(fuse_big_writes, OPT_BOOL, true)
 OPTION(fuse_atomic_o_trunc, OPT_BOOL, true)
 OPTION(fuse_debug, OPT_BOOL, false)
 OPTION(fuse_multithreaded, OPT_BOOL, true)
+OPTION(fuse_require_active_mds, OPT_BOOL, true) // if ceph_fuse requires active mds server
 OPTION(client_try_dentry_invalidate, OPT_BOOL, true) // the client should try to use dentry invaldation instead of remounting, on kernels it believes that will work for
 OPTION(client_die_on_failed_remount, OPT_BOOL, true)
 OPTION(client_check_pool_perm, OPT_BOOL, true)
+OPTION(client_use_faked_inos, OPT_BOOL, false)
 
 OPTION(crush_location, OPT_STR, "")       // whitespace-separated list of key=value pairs describing crush location
 
@@ -347,6 +377,9 @@ OPTION(objecter_inflight_ops, OPT_U64, 1024)               // max in-flight ios
 OPTION(objecter_completion_locks_per_session, OPT_U64, 32) // num of completion locks per each session, for serializing same object responses
 OPTION(objecter_inject_no_watch_ping, OPT_BOOL, false)   // suppress watch pings
 
+// Max number of deletes at once in a single Filer::purge call
+OPTION(filer_max_purge_ops, OPT_U32, 10)
+
 OPTION(journaler_allow_split_entries, OPT_BOOL, true)
 OPTION(journaler_write_head_interval, OPT_INT, 15)
 OPTION(journaler_prefetch_periods, OPT_INT, 10)   // * journal object size
@@ -354,11 +387,10 @@ OPTION(journaler_prezero_periods, OPT_INT, 5)     // * journal object size
 OPTION(journaler_batch_interval, OPT_DOUBLE, .001)   // seconds.. max add latency we artificially incur
 OPTION(journaler_batch_max, OPT_U64, 0)  // max bytes we'll delay flushing; disable, for now....
 OPTION(mds_data, OPT_STR, "/var/lib/ceph/mds/$cluster-$id")
-OPTION(mds_max_file_size, OPT_U64, 1ULL << 40)
+OPTION(mds_max_file_size, OPT_U64, 1ULL << 40) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
 OPTION(mds_cache_size, OPT_INT, 100000)
 OPTION(mds_cache_mid, OPT_FLOAT, .7)
 OPTION(mds_max_file_recover, OPT_U32, 32)
-OPTION(mds_mem_max, OPT_INT, 1048576)        // KB
 OPTION(mds_dir_max_commit_size, OPT_INT, 10) // MB
 OPTION(mds_decay_halflife, OPT_FLOAT, 5)
 OPTION(mds_beacon_interval, OPT_FLOAT, 4)
@@ -366,6 +398,7 @@ OPTION(mds_beacon_grace, OPT_FLOAT, 15)
 OPTION(mds_enforce_unique_name, OPT_BOOL, true)
 OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0)  // how long to blacklist failed nodes
 OPTION(mds_session_timeout, OPT_FLOAT, 60)    // cap bits and leases time out if client idle
+OPTION(mds_sessionmap_keys_per_op, OPT_U32, 1024)    // how many sessions should I try to load/store in a single OMAP operation?
 OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60)    // detect clients which aren't revoking caps
 OPTION(mds_recall_state_timeout, OPT_FLOAT, 60)    // detect clients which aren't trimming caps
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30)    // detecting freeze tree deadlock
@@ -385,7 +418,7 @@ OPTION(mds_log_max_events, OPT_INT, -1)
 OPTION(mds_log_events_per_segment, OPT_INT, 1024)
 OPTION(mds_log_segment_size, OPT_INT, 0)  // segment size for mds log,
 	      // defaults to g_default_file_layout.fl_object_size (4MB)
-OPTION(mds_log_max_segments, OPT_INT, 30)
+OPTION(mds_log_max_segments, OPT_U32, 30)
 OPTION(mds_log_max_expiring, OPT_INT, 20)
 OPTION(mds_bal_sample_interval, OPT_FLOAT, 3.0)  // every 5 seconds
 OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000)
@@ -451,15 +484,30 @@ OPTION(mds_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op
 OPTION(mds_op_log_threshold, OPT_INT, 5) // how many op log messages to show in one go
 OPTION(mds_snap_min_uid, OPT_U32, 0) // The minimum UID required to create a snapshot
 OPTION(mds_snap_max_uid, OPT_U32, 65536) // The maximum UID allowed to create a snapshot
+OPTION(mds_snap_rstat, OPT_BOOL, false) // enable/disbale nested stat for snapshot
 OPTION(mds_verify_backtrace, OPT_U32, 1)
+// detect clients which aren't trimming completed requests
+OPTION(mds_max_completed_flushes, OPT_U32, 100000)
+OPTION(mds_max_completed_requests, OPT_U32, 100000)
 
 OPTION(mds_action_on_write_error, OPT_U32, 1) // 0: ignore; 1: force readonly; 2: crash
+OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE, 5)
+
+// Maximum number of concurrent stray files to purge
+OPTION(mds_max_purge_files, OPT_U32, 64)
+// Maximum number of concurrent RADOS ops to issue in purging
+OPTION(mds_max_purge_ops, OPT_U32, 8192)
+// Maximum number of concurrent RADOS ops to issue in purging, scaled by PG count
+OPTION(mds_max_purge_ops_per_pg, OPT_FLOAT, 0.5)
+
+OPTION(mds_root_ino_uid, OPT_INT, 0) // The UID of / on new filesystems
+OPTION(mds_root_ino_gid, OPT_INT, 0) // The GID of / on new filesystems
 
 // If true, compact leveldb store on mount
 OPTION(osd_compact_leveldb_on_mount, OPT_BOOL, false)
 
 // Maximum number of backfills to or from a single osd
-OPTION(osd_max_backfills, OPT_U64, 10)
+OPTION(osd_max_backfills, OPT_U64, 1)
 
 // Minimum recovery priority (255 = max, smaller = lower)
 OPTION(osd_min_recovery_priority, OPT_INT, 0)
@@ -472,6 +520,7 @@ OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 10.0)
 
 // max agent flush ops
 OPTION(osd_agent_max_ops, OPT_INT, 4)
+OPTION(osd_agent_max_low_ops, OPT_INT, 2)
 OPTION(osd_agent_min_evict_effort, OPT_FLOAT, .1)
 OPTION(osd_agent_quantize_effort, OPT_FLOAT, .1)
 OPTION(osd_agent_delay_time, OPT_FLOAT, 5.0)
@@ -497,6 +546,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100)              // num client messages
 OPTION(osd_pg_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_pgp_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
 OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
 OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
 OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
@@ -504,7 +554,6 @@ OPTION(osd_pool_default_size, OPT_INT, 3)
 OPTION(osd_pool_default_min_size, OPT_INT, 0)  // 0 means no specific default; ceph will use size-size/2
 OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf
 OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num
-OPTION(osd_pool_default_erasure_code_directory, OPT_STR, CEPH_PKGLIBDIR"/erasure-code") // default for the erasure-code-directory=XXX property of osd pool create
 OPTION(osd_pool_default_erasure_code_profile,
        OPT_STR,
        "plugin=jerasure "
@@ -530,6 +579,7 @@ OPTION(osd_pool_default_flag_nopgchange, OPT_BOOL, false) // pool's pg and pgp n
 OPTION(osd_pool_default_flag_nosizechange, OPT_BOOL, false) // pool's size and min size can't be changed
 OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
 OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT, .4)
+OPTION(osd_pool_default_cache_target_dirty_high_ratio, OPT_FLOAT, .6)
 OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT, .8)
 OPTION(osd_pool_default_cache_min_flush_age, OPT_INT, 0)  // seconds
 OPTION(osd_pool_default_cache_min_evict_age, OPT_INT, 0)  // seconds
@@ -542,10 +592,11 @@ OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4)
 OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
 OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
 OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read)
+OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on write)
 
 OPTION(osd_map_dedup, OPT_BOOL, true)
-OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
-OPTION(osd_map_cache_size, OPT_INT, 500)
+OPTION(osd_map_max_advance, OPT_INT, 150) // make this < cache_size!
+OPTION(osd_map_cache_size, OPT_INT, 200)
 OPTION(osd_map_message_max, OPT_INT, 100)  // max maps per MOSDMap message
 OPTION(osd_map_share_max_epochs, OPT_INT, 100)  // cap on # of inc maps we send to peers, clients
 OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT, 0)
@@ -562,8 +613,6 @@ OPTION(osd_recover_clone_overlap, OPT_BOOL, true)   // preserve clone_overlap du
 OPTION(osd_op_num_threads_per_shard, OPT_INT, 2)
 OPTION(osd_op_num_shards, OPT_INT, 5)
 
-OPTION(osd_read_eio_on_bad_digest, OPT_BOOL, true) // return EIO if object digest is bad
-
 // Only use clone_overlap for recovery if there are fewer than
 // osd_recover_clone_overlap_limit entries in the overlap set
 OPTION(osd_recover_clone_overlap_limit, OPT_INT, 10)
@@ -574,18 +623,12 @@ OPTION(osd_op_thread_timeout, OPT_INT, 15)
 OPTION(osd_op_thread_suicide_timeout, OPT_INT, 150)
 OPTION(osd_recovery_thread_timeout, OPT_INT, 30)
 OPTION(osd_recovery_thread_suicide_timeout, OPT_INT, 300)
-OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
-OPTION(osd_snap_trim_thread_suicide_timeout, OPT_INT, 60*60*10)
+OPTION(osd_recovery_sleep, OPT_FLOAT, 0)         // seconds to sleep between recovery ops
 OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
-OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
-OPTION(osd_scrub_thread_suicide_timeout, OPT_INT, 60)
-OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
 OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
 OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
 OPTION(osd_remove_thread_suicide_timeout, OPT_INT, 10*60*60)
 OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
-OPTION(osd_age, OPT_FLOAT, .8)
-OPTION(osd_age_time, OPT_INT, 0)
 OPTION(osd_command_thread_suicide_timeout, OPT_INT, 15*60)
 OPTION(osd_heartbeat_addr, OPT_ADDR, entity_addr_t())
 OPTION(osd_heartbeat_interval, OPT_INT, 6)       // (seconds) how often we ping peers
@@ -609,8 +652,8 @@ OPTION(osd_default_data_pool_replay_window, OPT_INT, 45)
 OPTION(osd_preserve_trimmed_log, OPT_BOOL, false)
 OPTION(osd_auto_mark_unfound_lost, OPT_BOOL, false)
 OPTION(osd_recovery_delay_start, OPT_FLOAT, 0)
-OPTION(osd_recovery_max_active, OPT_INT, 15)
-OPTION(osd_recovery_max_single_start, OPT_INT, 5)
+OPTION(osd_recovery_max_active, OPT_INT, 3)
+OPTION(osd_recovery_max_single_start, OPT_INT, 1)
 OPTION(osd_recovery_max_chunk, OPT_U64, 8<<20)  // max size of push chunk
 OPTION(osd_copyfrom_max_chunk, OPT_U64, 8<<20)   // max size of a COPYFROM chunk
 OPTION(osd_push_per_object_cost, OPT_U64, 1000)  // push cost per object
@@ -623,6 +666,7 @@ OPTION(osd_scrub_end_hour, OPT_INT, 24)
 OPTION(osd_scrub_load_threshold, OPT_FLOAT, 0.5)
 OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24)    // if load is low
 OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24)  // regardless of load
+OPTION(osd_scrub_interval_randomize_ratio, OPT_FLOAT, 0.5) // randomize the scheduled scrub in the span of [min,min*(1+randomize_radio))
 OPTION(osd_scrub_chunk_min, OPT_INT, 5)
 OPTION(osd_scrub_chunk_max, OPT_INT, 25)
 OPTION(osd_scrub_sleep, OPT_FLOAT, 0)   // sleep between [deep]scrub ops
@@ -630,7 +674,6 @@ OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
 OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
 OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT, 2*60*60)   // objects must be this old (seconds) before we update the whole-object digest on scrub
 OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
-OPTION(osd_auto_weight, OPT_BOOL, false)
 OPTION(osd_class_dir, OPT_STR, CEPH_LIBDIR "/rados-classes") // where rados plugins are stored
 OPTION(osd_open_classes_on_start, OPT_BOOL, true)
 OPTION(osd_check_for_log_corruption, OPT_BOOL, false)
@@ -640,7 +683,7 @@ OPTION(osd_default_notify_timeout, OPT_U32, 30) // default notify timeout in sec
 OPTION(osd_kill_backfill_at, OPT_INT, 0)
 
 // Bounds how infrequently a new map epoch will be persisted for a pg
-OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32, 200)
+OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32, 150) // make this < map_cache_size!
 
 OPTION(osd_min_pg_log_entries, OPT_U32, 3000)  // number of entries to keep in the pg log when trimming it
 OPTION(osd_max_pg_log_entries, OPT_U32, 10000) // max entries, say when degraded, before we trim
@@ -662,6 +705,7 @@ OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL, false)
 OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL, false)
 OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE, 0)
 OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL, false)  // inject failure during copyfrom completion
+OPTION(osd_debug_randomize_hobject_sort_order, OPT_BOOL, false)
 OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking
 OPTION(osd_num_op_tracker_shard, OPT_U32, 32) // The number of shards for holding the ops
 OPTION(osd_op_history_size, OPT_U32, 20)    // Max number of completed ops to track
@@ -671,6 +715,7 @@ OPTION(osd_failsafe_full_ratio, OPT_FLOAT, .97) // what % full makes an OSD "ful
 OPTION(osd_failsafe_nearfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD near full (failsafe)
 
 OPTION(osd_pg_object_context_cache_count, OPT_INT, 64)
+OPTION(osd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
 
 // determines whether PGLog::check() compares written out log to stored log
 OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false)
@@ -696,41 +741,36 @@ OPTION(kinetic_user_id, OPT_INT, 1) // kinetic user to authenticate as
 OPTION(kinetic_hmac_key, OPT_STR, "asdfasdf") // kinetic key to authenticate with
 OPTION(kinetic_use_ssl, OPT_BOOL, false) // whether to secure kinetic traffic with TLS
 
-OPTION(rocksdb_compact_on_mount, OPT_BOOL, false)
-OPTION(rocksdb_write_buffer_size, OPT_U64, 0) // rocksdb write buffer size
-OPTION(rocksdb_target_file_size_base, OPT_U64, 0) // target file size for compaction
-OPTION(rocksdb_cache_size, OPT_U64, 0) // rocksdb cache size
-OPTION(rocksdb_block_size, OPT_U64, 0) // rocksdb block size
-OPTION(rocksdb_bloom_size, OPT_INT, 0) // rocksdb bloom bits per entry
-OPTION(rocksdb_write_buffer_num, OPT_INT, 0) // rocksdb bloom bits per entry
-OPTION(rocksdb_background_compactions, OPT_INT, 0) // number for background compaction jobs
-OPTION(rocksdb_background_flushes, OPT_INT, 0) // number for background flush jobs
-OPTION(rocksdb_max_open_files, OPT_INT, 0) // rocksdb max open files
-OPTION(rocksdb_compression, OPT_STR, "") // rocksdb uses compression : none, snappy, zlib, bzip2
-OPTION(rocksdb_paranoid, OPT_BOOL, false) // rocksdb paranoid flag
-OPTION(rocksdb_log, OPT_STR, "/dev/null")  // enable rocksdb log file
-OPTION(rocksdb_level0_file_num_compaction_trigger, OPT_U64, 0) // Number of files to trigger level-0 compaction
-OPTION(rocksdb_level0_slowdown_writes_trigger, OPT_U64, 0)  // number of level-0 files at which we start slowing down write.
-OPTION(rocksdb_level0_stop_writes_trigger, OPT_U64, 0)  // number of level-0 files at which we stop writes
-OPTION(rocksdb_disableDataSync, OPT_BOOL, true) // if true, data files are not synced to stable storage
-OPTION(rocksdb_disableWAL, OPT_BOOL, false)  // diable write ahead log
-OPTION(rocksdb_num_levels, OPT_INT, 0) // number of levels for this database
-OPTION(rocksdb_wal_dir, OPT_STR, "")  //  rocksdb write ahead log file
-OPTION(rocksdb_info_log_level, OPT_STR, "info")  // info log level : debug , info , warn, error, fatal
+
+// rocksdb options that will be used for keyvaluestore(if backend is rocksdb)
+OPTION(keyvaluestore_rocksdb_options, OPT_STR, "")
+// rocksdb options that will be used for omap(if omap_backend is rocksdb)
+OPTION(filestore_rocksdb_options, OPT_STR, "")
+// rocksdb options that will be used in monstore
+OPTION(mon_rocksdb_options, OPT_STR, "")
 
 /**
- * osd_client_op_priority and osd_recovery_op_priority adjust the relative
- * priority of client io vs recovery io.
+ * osd_*_priority adjust the relative priority of client io, recovery io,
+ * snaptrim io, etc
  *
- * osd_client_op_priority/osd_recovery_op_priority determines the ratio of
- * available io between client and recovery.  Each option may be set between
+ * osd_*_priority determines the ratio of available io between client and
+ * recovery.  Each option may be set between
  * 1..63.
- *
+ */
+OPTION(osd_client_op_priority, OPT_U32, 63)
+OPTION(osd_recovery_op_priority, OPT_U32, 3)
+
+OPTION(osd_snap_trim_priority, OPT_U32, 5)
+OPTION(osd_snap_trim_cost, OPT_U32, 1<<20) // set default cost equal to 1MB io
+
+OPTION(osd_scrub_priority, OPT_U32, 5)
+// set default cost equal to 50MB io
+OPTION(osd_scrub_cost, OPT_U32, 50<<20) 
+
+/**
  * osd_recovery_op_warn_multiple scales the normal warning threshhold,
  * osd_op_complaint_time, so that slow recovery ops won't cause noise
  */
-OPTION(osd_client_op_priority, OPT_U32, 63)
-OPTION(osd_recovery_op_priority, OPT_U32, 10)
 OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
 
 // Max time to wait between notifying mon of shutdown and shutting down
@@ -742,6 +782,7 @@ OPTION(osd_max_attr_name_len, OPT_U32, 100)    // max rados attr name len; canno
 OPTION(osd_max_attr_size, OPT_U64, 0)
 
 OPTION(osd_objectstore, OPT_STR, "filestore")  // ObjectStore backend type
+OPTION(osd_objectstore_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
 // Override maintaining compatibility with older OSDs
 // Set to true for testing.  Users should NOT set this.
 OPTION(osd_debug_override_acting_compat, OPT_BOOL, false)
@@ -752,6 +793,38 @@ OPTION(osd_bench_max_block_size, OPT_U64, 64 << 20) // cap the block size at 64M
 OPTION(osd_bench_duration, OPT_U32, 30) // duration of 'osd bench', capped at 30s to avoid triggering timeouts
 
 OPTION(memstore_device_bytes, OPT_U64, 1024*1024*1024)
+OPTION(memstore_page_set, OPT_BOOL, true)
+OPTION(memstore_page_size, OPT_U64, 64 << 10)
+
+OPTION(newstore_max_dir_size, OPT_U32, 1000000)
+OPTION(newstore_onode_map_size, OPT_U32, 1024)   // onodes per collection
+OPTION(newstore_backend, OPT_STR, "rocksdb")
+OPTION(newstore_backend_options, OPT_STR, "")
+OPTION(newstore_fail_eio, OPT_BOOL, true)
+OPTION(newstore_sync_io, OPT_BOOL, false)  // perform initial io synchronously
+OPTION(newstore_sync_transaction, OPT_BOOL, false)  // perform kv txn synchronously
+OPTION(newstore_sync_submit_transaction, OPT_BOOL, false)
+OPTION(newstore_sync_wal_apply, OPT_BOOL, true)     // perform initial wal work synchronously (possibly in combination with aio so we only *queue* ios)
+OPTION(newstore_fsync_threads, OPT_INT, 16)  // num threads calling fsync
+OPTION(newstore_fsync_thread_timeout, OPT_INT, 30) // thread timeout value
+OPTION(newstore_fsync_thread_suicide_timeout, OPT_INT, 120) // suicide timeout value
+OPTION(newstore_wal_threads, OPT_INT, 4)
+OPTION(newstore_wal_thread_timeout, OPT_INT, 30)
+OPTION(newstore_wal_thread_suicide_timeout, OPT_INT, 120)
+OPTION(newstore_max_ops, OPT_U64, 512)
+OPTION(newstore_max_bytes, OPT_U64, 64*1024*1024)
+OPTION(newstore_wal_max_ops, OPT_U64, 512)
+OPTION(newstore_wal_max_bytes, OPT_U64, 128*1024*1024)
+OPTION(newstore_fid_prealloc, OPT_INT, 1024)
+OPTION(newstore_nid_prealloc, OPT_INT, 1024)
+OPTION(newstore_overlay_max_length, OPT_INT, 65536)
+OPTION(newstore_overlay_max, OPT_INT, 32)
+OPTION(newstore_open_by_handle, OPT_BOOL, true)
+OPTION(newstore_o_direct, OPT_BOOL, true)
+OPTION(newstore_db_path, OPT_STR, "")
+OPTION(newstore_aio, OPT_BOOL, true)
+OPTION(newstore_aio_poll_ms, OPT_INT, 250)  // milliseconds
+OPTION(newstore_aio_max_queue_depth, OPT_INT, 4096)
 
 OPTION(filestore_omap_backend, OPT_STR, "leveldb")
 
@@ -808,6 +881,7 @@ OPTION(filestore_btrfs_clone_range, OPT_BOOL, true)
 OPTION(filestore_zfs_snap, OPT_BOOL, false) // zfsonlinux is still unstable
 OPTION(filestore_fsync_flushes_journal_data, OPT_BOOL, false)
 OPTION(filestore_fiemap, OPT_BOOL, false)     // (try to) use fiemap
+OPTION(filestore_seek_data_hole, OPT_BOOL, false)     // (try to) use seek_data/hole
 OPTION(filestore_fadvise, OPT_BOOL, true)
 
 // (try to) use extsize for alloc hint NOTE: extsize seems to trigger
@@ -853,6 +927,7 @@ OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new objec
 OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes
 OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096)    // Header cache size
 OPTION(keyvaluestore_backend, OPT_STR, "leveldb")
+OPTION(keyvaluestore_dump_file, OPT_STR, "")         // file onto which store transaction dumps
 
 // max bytes to search ahead in journal searching for corruption
 OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
@@ -870,6 +945,7 @@ OPTION(journal_discard, OPT_BOOL, false) //using ssd disk as journal, whether su
 
 OPTION(rados_mon_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means on limit.
 OPTION(rados_osd_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
+OPTION(rados_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
 
 OPTION(rbd_op_threads, OPT_INT, 1)
 OPTION(rbd_op_thread_timeout, OPT_INT, 60)
@@ -894,6 +970,9 @@ OPTION(rbd_clone_copy_on_read, OPT_BOOL, false)
 OPTION(rbd_blacklist_on_break_lock, OPT_BOOL, true) // whether to blacklist clients whose lock was broken
 OPTION(rbd_blacklist_expire_seconds, OPT_INT, 0) // number of seconds to blacklist - set to 0 for OSD default
 OPTION(rbd_request_timed_out_seconds, OPT_INT, 30) // number of seconds before maint request times out
+OPTION(rbd_skip_partial_discard, OPT_BOOL, false) // when trying to discard a range inside an object, set to true to skip zeroing the range.
+OPTION(rbd_enable_alloc_hint, OPT_BOOL, true) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need
+OPTION(rbd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
 
 /*
  * The following options change the behavior for librbd's image creation methods that
@@ -911,7 +990,7 @@ OPTION(rbd_request_timed_out_seconds, OPT_INT, 30) // number of seconds before m
  * rbd_create3()/RBD::create3() and rbd_clone2/RBD::clone2() are only
  * affected by rbd_default_order.
  */
-OPTION(rbd_default_format, OPT_INT, 1)
+OPTION(rbd_default_format, OPT_INT, 2)
 OPTION(rbd_default_order, OPT_INT, 22)
 OPTION(rbd_default_stripe_count, OPT_U64, 0) // changing requires stripingv2 feature
 OPTION(rbd_default_stripe_unit, OPT_U64, 0) // changing to non-object size requires stripingv2 feature
@@ -919,10 +998,13 @@ OPTION(rbd_default_features, OPT_INT, 3) // only applies to format 2 images
 					 // +1 for layering, +2 for stripingv2,
 					 // +4 for exclusive lock, +8 for object map
 
+OPTION(rbd_default_map_options, OPT_STR, "") // default rbd map -o / --options
+
 OPTION(nss_db_path, OPT_STR, "") // path to nss db
 
 
 OPTION(rgw_max_chunk_size, OPT_INT, 512 * 1024)
+OPTION(rgw_max_put_size, OPT_U64, 5ULL*1024*1024*1024)
 
 /**
  * override max bucket index shards in zone configuration (if not zero)
@@ -1046,13 +1128,21 @@ OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL, false) // whether stats for idl
 OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between two full stats sync for non-idle users
 
 OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload
+OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multipart upload
 
 OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change
 OPTION(rgw_user_max_buckets, OPT_U32, 1000) // global option to set max buckets count for all user
 
+OPTION(rgw_objexp_gc_interval, OPT_U32, 60 * 10) // maximum time between round of expired objects garbage collecting
+OPTION(rgw_objexp_time_step, OPT_U32, 4096) // number of seconds for rounding the timestamps
+OPTION(rgw_objexp_hints_num_shards, OPT_U32, 127) // maximum number of parts in which the hint index is stored in
+OPTION(rgw_objexp_chunk_size, OPT_U32, 100) // maximum number of entries in a single operation when processing objexp data
+
 OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter
 OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter
 
 // This will be set to true when it is safe to start threads.
 // Once it is true, it will never change.
 OPTION(internal_safe_to_start_threads, OPT_BOOL, false)
+
+OPTION(debug_deliberately_leak_memory, OPT_BOOL, false)
diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
index e2e81a4..45432f5 100644
--- a/src/common/crc32c.cc
+++ b/src/common/crc32c.cc
@@ -5,9 +5,11 @@
 
 #include "arch/probe.h"
 #include "arch/intel.h"
+#include "arch/arm.h"
 #include "common/sctp_crc32.h"
 #include "common/crc32c_intel_baseline.h"
 #include "common/crc32c_intel_fast.h"
+#include "common/crc32c_aarch64.h"
 
 /*
  * choose best implementation based on the CPU architecture.
@@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void)
     return ceph_crc32c_intel_fast;
   }
 
+  if (ceph_arch_aarch64_crc32){
+    return ceph_crc32c_aarch64;
+  }
+
   // default
   return ceph_crc32c_sctp;
 }
diff --git a/src/common/crc32c_aarch64.c b/src/common/crc32c_aarch64.c
new file mode 100644
index 0000000..d33827d
--- /dev/null
+++ b/src/common/crc32c_aarch64.c
@@ -0,0 +1,47 @@
+#include "acconfig.h"
+#include "include/int_types.h"
+#include "common/crc32c_aarch64.h"
+
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+
+uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	int64_t length = len;
+
+	if (!buffer) {
+
+		while ((length -= sizeof(uint64_t)) >= 0)
+			CRC32CX(crc, 0);
+
+		/* The following is more efficient than the straight loop */
+		if (length & sizeof(uint32_t))
+			CRC32CW(crc, 0);
+
+		if (length & sizeof(uint16_t))
+			CRC32CH(crc, 0);
+
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, 0);
+	} else {
+		while ((length -= sizeof(uint64_t)) >= 0) {
+			CRC32CX(crc, *(uint64_t *)buffer);
+			buffer += sizeof(uint64_t);
+		}
+
+		/* The following is more efficient than the straight loop */
+		if (length & sizeof(uint32_t)) {
+			CRC32CW(crc, *(uint32_t *)buffer);
+			buffer += sizeof(uint32_t);
+		}
+		if (length & sizeof(uint16_t)) {
+			CRC32CH(crc, *(uint16_t *)buffer);
+			buffer += sizeof(uint16_t);
+		}
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, *buffer);
+	}
+	return crc;
+}
diff --git a/src/common/crc32c_aarch64.h b/src/common/crc32c_aarch64.h
new file mode 100644
index 0000000..3727f54
--- /dev/null
+++ b/src/common/crc32c_aarch64.h
@@ -0,0 +1,27 @@
+#ifndef CEPH_COMMON_CRC32C_AARCH64_H
+#define CEPH_COMMON_CRC32C_AARCH64_H
+
+#include "arch/arm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_ARMV8_CRC
+
+extern uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#else
+
+static inline uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	return 0;
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
index 866c992..e651047 100644
--- a/src/common/hobject.cc
+++ b/src/common/hobject.cc
@@ -80,7 +80,7 @@ string hobject_t::to_str() const
   uint64_t poolid(pool);
   t += snprintf(t, end - t, "%.*llX", 16, (long long unsigned)poolid);
 
-  uint32_t revhash(get_filestore_key_u32());
+  uint32_t revhash(get_nibblewise_key_u32());
   t += snprintf(t, end - t, ".%.*X", 8, revhash);
 
   if (snap == CEPH_NOSNAP)
@@ -130,18 +130,21 @@ void hobject_t::decode(bufferlist::iterator& bl)
   if (struct_v >= 4) {
     ::decode(nspace, bl);
     ::decode(pool, bl);
-    // newer OSDs have a different hobject_t::get_min(); decode it properly.
-    if (pool == INT64_MIN &&
-	hash == 0 &&
+    // for compat with hammer, which did not handle the transition
+    // from pool -1 -> pool INT64_MIN for MIN properly.  this object
+    // name looks a bit like a pgmeta object for the meta collection,
+    // but those do not ever exist (and is_pgmeta() pool >= 0).
+    if (pool == -1 &&
 	snap == 0 &&
+	hash == 0 &&
 	!max &&
 	oid.name.empty()) {
-      pool = -1;
+      pool = INT64_MIN;
       assert(is_min());
     }
   }
   DECODE_FINISH(bl);
-  build_filestore_key_cache();
+  build_hash_cache();
 }
 
 void hobject_t::decode(json_spirit::Value& v)
@@ -165,7 +168,7 @@ void hobject_t::decode(json_spirit::Value& v)
     else if (p.name_ == "namespace")
       nspace = p.value_.get_str();
   }
-  build_filestore_key_cache();
+  build_hash_cache();
 }
 
 void hobject_t::dump(Formatter *f) const
@@ -193,21 +196,99 @@ void hobject_t::generate_test_instances(list<hobject_t*>& o)
 
 ostream& operator<<(ostream& out, const hobject_t& o)
 {
+  if (o == hobject_t())
+    return out << "MIN";
   if (o.is_max())
     return out << "MAX";
-  out << std::hex << o.get_hash() << std::dec;
+  out << o.pool << '/';
+  out << std::hex;
+  out.width(8);
+  out.fill('0');
+  out << o.get_hash();
+  out.width(0);
+  out.fill(' ');
+  out << std::dec;
+  if (o.nspace.length())
+    out << ":" << o.nspace;
   if (o.get_key().length())
     out << "." << o.get_key();
   out << "/" << o.oid << "/" << o.snap;
-  out << "/" << o.nspace << "/" << o.pool;
   return out;
 }
 
+int cmp_nibblewise(const hobject_t& l, const hobject_t& r)
+{
+  if (l.max < r.max)
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.pool < r.pool)
+    return -1;
+  if (l.pool > r.pool)
+    return 1;
+  if (l.get_nibblewise_key() < r.get_nibblewise_key())
+    return -1;
+  if (l.get_nibblewise_key() > r.get_nibblewise_key())
+    return 1;
+  if (l.nspace < r.nspace)
+    return -1;
+  if (l.nspace > r.nspace)
+    return 1;
+  if (l.get_effective_key() < r.get_effective_key())
+    return -1;
+  if (l.get_effective_key() > r.get_effective_key())
+    return 1;
+  if (l.oid < r.oid)
+    return -1;
+  if (l.oid > r.oid)
+    return 1;
+  if (l.snap < r.snap)
+    return -1;
+  if (l.snap > r.snap)
+    return 1;
+  return 0;
+}
+
+int cmp_bitwise(const hobject_t& l, const hobject_t& r)
+{
+  if (l.max < r.max)
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.pool < r.pool)
+    return -1;
+  if (l.pool > r.pool)
+    return 1;
+  if (l.get_bitwise_key() < r.get_bitwise_key())
+    return -1;
+  if (l.get_bitwise_key() > r.get_bitwise_key())
+    return 1;
+  if (l.nspace < r.nspace)
+    return -1;
+  if (l.nspace > r.nspace)
+    return 1;
+  if (l.get_effective_key() < r.get_effective_key())
+    return -1;
+  if (l.get_effective_key() > r.get_effective_key())
+    return 1;
+  if (l.oid < r.oid)
+    return -1;
+  if (l.oid > r.oid)
+    return 1;
+  if (l.snap < r.snap)
+    return -1;
+  if (l.snap > r.snap)
+    return 1;
+  return 0;
+}
+
+
+
 // This is compatible with decode for hobject_t prior to
 // version 5.
 void ghobject_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(5, 3, bl);
+  ENCODE_START(6, 3, bl);
   ::encode(hobj.key, bl);
   ::encode(hobj.oid, bl);
   ::encode(hobj.snap, bl);
@@ -217,12 +298,13 @@ void ghobject_t::encode(bufferlist& bl) const
   ::encode(hobj.pool, bl);
   ::encode(generation, bl);
   ::encode(shard_id, bl);
+  ::encode(max, bl);
   ENCODE_FINISH(bl);
 }
 
 void ghobject_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
   if (struct_v >= 1)
     ::decode(hobj.key, bl);
   ::decode(hobj.oid, bl);
@@ -235,13 +317,14 @@ void ghobject_t::decode(bufferlist::iterator& bl)
   if (struct_v >= 4) {
     ::decode(hobj.nspace, bl);
     ::decode(hobj.pool, bl);
-    // newer OSDs have a different hobject_t::get_min(); decode it properly.
-    if (hobj.pool == INT64_MIN &&
-	hobj.hash == 0 &&
+    // for compat with hammer, which did not handle the transition from
+    // pool -1 -> pool INT64_MIN for MIN properly (see hobject_t::decode()).
+    if (hobj.pool == -1 &&
 	hobj.snap == 0 &&
+	hobj.hash == 0 &&
 	!hobj.max &&
 	hobj.oid.name.empty()) {
-      hobj.pool = -1;
+      hobj.pool = INT64_MIN;
       assert(hobj.is_min());
     }
   }
@@ -252,8 +335,13 @@ void ghobject_t::decode(bufferlist::iterator& bl)
     generation = ghobject_t::NO_GEN;
     shard_id = shard_id_t::NO_SHARD;
   }
+  if (struct_v >= 6) {
+    ::decode(max, bl);
+  } else {
+    max = false;
+  }
   DECODE_FINISH(bl);
-  hobj.set_hash(hobj.get_hash()); //to call build_filestore_key_cache();
+  hobj.build_hash_cache();
 }
 
 void ghobject_t::decode(json_spirit::Value& v)
@@ -267,6 +355,8 @@ void ghobject_t::decode(json_spirit::Value& v)
       generation = p.value_.get_uint64();
     else if (p.name_ == "shard_id")
       shard_id.id = p.value_.get_int();
+    else if (p.name_ == "max")
+      max = p.value_.get_int();
   }
 }
 
@@ -277,6 +367,7 @@ void ghobject_t::dump(Formatter *f) const
     f->dump_int("generation", generation);
   if (shard_id != shard_id_t::NO_SHARD)
     f->dump_int("shard_id", shard_id);
+  f->dump_int("max", (int)max);
 }
 
 void ghobject_t::generate_test_instances(list<ghobject_t*>& o)
@@ -306,11 +397,55 @@ void ghobject_t::generate_test_instances(list<ghobject_t*>& o)
 
 ostream& operator<<(ostream& out, const ghobject_t& o)
 {
+  if (o == ghobject_t())
+    return out << "GHMIN";
+  if (o.is_max())
+    return out << "GHMAX";
+  if (o.shard_id != shard_id_t::NO_SHARD)
+    out << std::hex << o.shard_id << std::dec << ":";
   out << o.hobj;
-  if (o.generation != ghobject_t::NO_GEN ||
-      o.shard_id != shard_id_t::NO_SHARD) {
-    assert(o.shard_id != shard_id_t::NO_SHARD);
-    out << "/" << o.generation << "/" << (unsigned)(o.shard_id);
+  if (o.generation != ghobject_t::NO_GEN) {
+    out << "/" << std::hex << (unsigned)(o.generation) << std::dec;
   }
   return out;
 }
+
+int cmp_nibblewise(const ghobject_t& l, const ghobject_t& r)
+{
+  if (l.max < r.max)
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.shard_id < r.shard_id)
+    return -1;
+  if (l.shard_id > r.shard_id)
+    return 1;
+  int ret = cmp_nibblewise(l.hobj, r.hobj);
+  if (ret != 0)
+    return ret;
+  if (l.generation < r.generation)
+    return -1;
+  if (l.generation > r.generation)
+    return 1;
+  return 0;
+}
+
+int cmp_bitwise(const ghobject_t& l, const ghobject_t& r)
+{
+  if (l.max < r.max)
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.shard_id < r.shard_id)
+    return -1;
+  if (l.shard_id > r.shard_id)
+    return 1;
+  int ret = cmp_bitwise(l.hobj, r.hobj);
+  if (ret != 0)
+    return ret;
+  if (l.generation < r.generation)
+    return -1;
+  if (l.generation > r.generation)
+    return 1;
+  return 0;
+}
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 7495cc1..4698756 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -23,8 +23,6 @@
 #include "json_spirit/json_spirit_value.h"
 #include "include/assert.h"   // spirit clobbers it!
 
-typedef uint64_t filestore_hobject_key_t;
-
 namespace ceph {
   class Formatter;
 }
@@ -42,8 +40,11 @@ struct hobject_t {
 private:
   uint32_t hash;
   bool max;
-  filestore_hobject_key_t filestore_key_cache;
-  static const int64_t POOL_IS_TEMP = -1;
+  uint32_t nibblewise_key_cache;
+  uint32_t hash_reverse_bits;
+  static const int64_t POOL_META = -1;
+  static const int64_t POOL_TEMP_START = -2; // and then negative
+  friend class spg_t;  // for POOL_TEMP_START
 public:
   int64_t pool;
   string nspace;
@@ -56,6 +57,13 @@ public:
     return key;
   }
 
+  void set_key(const std::string &key_) {
+    if (key_ == oid.name)
+      key.clear();
+    else
+      key = key_;
+  }
+
   string to_str() const;
   
   uint32_t get_hash() const { 
@@ -63,7 +71,7 @@ public:
   }
   void set_hash(uint32_t value) { 
     hash = value;
-    build_filestore_key_cache();
+    build_hash_cache();
   }
 
   static bool match_hash(uint32_t to_check, uint32_t bits, uint32_t match) {
@@ -73,15 +81,15 @@ public:
     return match_hash(hash, bits, match);
   }
 
-  static hobject_t make_temp(const string &name) {
-    return hobject_t(object_t(name), "", CEPH_NOSNAP, 0, POOL_IS_TEMP, "");
-  }
   bool is_temp() const {
-    return pool == POOL_IS_TEMP;
+    return pool <= POOL_TEMP_START && pool != INT64_MIN;
+  }
+  bool is_meta() const {
+    return pool == POOL_META;
   }
 
-  hobject_t() : snap(0), hash(0), max(false), pool(-1) {
-    build_filestore_key_cache();
+  hobject_t() : snap(0), hash(0), max(false), pool(INT64_MIN) {
+    build_hash_cache();
   }
 
   hobject_t(object_t oid, const string& key, snapid_t snap, uint64_t hash,
@@ -89,7 +97,7 @@ public:
     : oid(oid), snap(snap), hash(hash), max(false),
       pool(pool), nspace(nspace),
       key(oid.name == key ? string() : key) {
-    build_filestore_key_cache();
+    build_hash_cache();
   }
 
   hobject_t(const sobject_t &soid, const string &key, uint32_t hash,
@@ -97,7 +105,7 @@ public:
     : oid(soid.oid), snap(soid.snap), hash(hash), max(false),
       pool(pool), nspace(nspace),
       key(soid.oid.name == key ? string() : key) {
-    build_filestore_key_cache();
+    build_hash_cache();
   }
 
   /// @return min hobject_t ret s.t. ret.hash == this->hash
@@ -146,8 +154,8 @@ public:
 
   /* Do not use when a particular hash function is needed */
   explicit hobject_t(const sobject_t &o) :
-    oid(o.oid), snap(o.snap), max(false), pool(-1) {
-    set_hash(CEPH_HASH_NAMESPACE::hash<sobject_t>()(o));
+    oid(o.oid), snap(o.snap), max(false), pool(POOL_META) {
+    set_hash(std::hash<sobject_t>()(o));
   }
 
   // maximum sorted value.
@@ -164,9 +172,23 @@ public:
     return snap == 0 &&
 	   hash == 0 &&
 	   !max &&
-	   pool == -1;
+	   pool == INT64_MIN;
+  }
+
+  static uint32_t _reverse_bits(uint32_t v) {
+    // reverse bits
+    // swap odd and even bits
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+    // swap consecutive pairs
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+    // swap nibbles ...
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+    // swap bytes
+    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+    // swap 2-byte long pairs
+    v = ( v >> 16             ) | ( v               << 16);
+    return v;
   }
-
   static uint32_t _reverse_nibbles(uint32_t retval) {
     // reverse nibbles
     retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4);
@@ -178,8 +200,8 @@ public:
   /**
    * Returns set S of strings such that for any object
    * h where h.match(bits, mask), there is some string
-   * s \in S such that s is a prefix of h.to_str().
-   * Furthermore, for any s \in S, s is a prefix of
+   * s \f$\in\f$ S such that s is a prefix of h.to_str().
+   * Furthermore, for any s $f\in\f$ S, s is a prefix of
    * h.str() implies that h.match(bits, mask).
    */
   static set<string> get_prefixes(
@@ -187,15 +209,35 @@ public:
     uint32_t mask,
     int64_t pool);
 
-  filestore_hobject_key_t get_filestore_key_u32() const {
+  // filestore nibble-based key
+  uint32_t get_nibblewise_key_u32() const {
     assert(!max);
-    return _reverse_nibbles(hash);
+    return nibblewise_key_cache;
   }
-  filestore_hobject_key_t get_filestore_key() const {
-    return max ? 0x100000000ull : filestore_key_cache;
+  uint64_t get_nibblewise_key() const {
+    return max ? 0x100000000ull : nibblewise_key_cache;
   }
-  void build_filestore_key_cache() {    
-    filestore_key_cache = _reverse_nibbles(hash);
+
+  // newer bit-reversed key
+  uint32_t get_bitwise_key_u32() const {
+    assert(!max);
+    return hash_reverse_bits;
+  }
+  uint64_t get_bitwise_key() const {
+    return max ? 0x100000000ull : hash_reverse_bits;
+  }
+
+  void build_hash_cache() {
+    nibblewise_key_cache = _reverse_nibbles(hash);
+    hash_reverse_bits = _reverse_bits(hash);
+  }
+  void set_nibblewise_key_u32(uint32_t value) {
+    hash = _reverse_nibbles(value);
+    build_hash_cache();
+  }
+  void set_bitwise_key_u32(uint32_t value) {
+    hash = _reverse_bits(value);
+    build_hash_cache();
   }
 
   const string& get_effective_key() const {
@@ -219,17 +261,48 @@ public:
   void decode(json_spirit::Value& v);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<hobject_t*>& o);
-  friend bool operator<(const hobject_t&, const hobject_t&);
-  friend bool operator>(const hobject_t&, const hobject_t&);
-  friend bool operator<=(const hobject_t&, const hobject_t&);
-  friend bool operator>=(const hobject_t&, const hobject_t&);
+  friend int cmp_nibblewise(const hobject_t& l, const hobject_t& r);
+  friend int cmp_bitwise(const hobject_t& l, const hobject_t& r);
   friend bool operator==(const hobject_t&, const hobject_t&);
   friend bool operator!=(const hobject_t&, const hobject_t&);
   friend struct ghobject_t;
+
+  struct NibblewiseComparator {
+    bool operator()(const hobject_t& l, const hobject_t& r) const {
+      return cmp_nibblewise(l, r) < 0;
+    }
+  };
+
+  struct BitwiseComparator {
+    bool operator()(const hobject_t& l, const hobject_t& r) const {
+      return cmp_bitwise(l, r) < 0;
+    }
+  };
+
+  struct Comparator {
+    bool bitwise;
+    Comparator(bool b) : bitwise(b) {}
+    bool operator()(const hobject_t& l, const hobject_t& r) const {
+      if (bitwise)
+	return cmp_bitwise(l, r) < 0;
+      else
+	return cmp_nibblewise(l, r) < 0;
+    }
+  };
+  struct ComparatorWithDefault {
+    bool bitwise;
+    ComparatorWithDefault(bool b=true) : bitwise(b) {}
+    bool operator()(const hobject_t& l, const hobject_t& r) const {
+      if (bitwise)
+	return cmp_bitwise(l, r) < 0;
+      else
+	return cmp_nibblewise(l, r) < 0;
+    }
+  };
 };
 WRITE_CLASS_ENCODER(hobject_t)
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<hobject_t> {
     size_t operator()(const hobject_t &r) const {
       static hash<object_t> H;
@@ -237,47 +310,70 @@ CEPH_HASH_NAMESPACE_START
       return H(r.oid) ^ I(r.snap);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 ostream& operator<<(ostream& out, const hobject_t& o);
 
-WRITE_EQ_OPERATORS_7(hobject_t, oid, get_key(), snap, hash, max, pool, nspace)
-// sort hobject_t's by <max, get_filestore_key(hash), key, oid, snapid>
-WRITE_CMP_OPERATORS_7(hobject_t,
-		      max,
-		      get_filestore_key(),
-		      nspace,
-		      pool,
-		      get_effective_key(),
-		      oid,
-		      snap)
+WRITE_EQ_OPERATORS_7(hobject_t, hash, oid, get_key(), snap, pool, max, nspace)
 
-typedef version_t gen_t;
+extern int cmp_nibblewise(const hobject_t& l, const hobject_t& r);
+extern int cmp_bitwise(const hobject_t& l, const hobject_t& r);
+static inline int cmp(const hobject_t& l, const hobject_t& r, bool sort_bitwise) {
+  if (sort_bitwise)
+    return cmp_bitwise(l, r);
+  else
+    return cmp_nibblewise(l, r);
+}
 
-#ifndef UINT64_MAX
-#define UINT64_MAX (18446744073709551615ULL)
-#endif
+// these are convenient
+static inline hobject_t MAX_HOBJ(const hobject_t& l, const hobject_t& r, bool bitwise) {
+  if (cmp(l, r, bitwise) >= 0)
+    return l;
+  else
+    return r;
+}
+
+static inline hobject_t MIN_HOBJ(const hobject_t& l, const hobject_t& r, bool bitwise) {
+  if (cmp(l, r, bitwise) <= 0)
+    return l;
+  else
+    return r;
+}
+
+typedef version_t gen_t;
 
 struct ghobject_t {
   hobject_t hobj;
   gen_t generation;
   shard_id_t shard_id;
+  bool max;
 
 public:
   static const gen_t NO_GEN = UINT64_MAX;
 
-  ghobject_t() : generation(NO_GEN), shard_id(shard_id_t::NO_SHARD) {}
+  ghobject_t()
+    : generation(NO_GEN),
+      shard_id(shard_id_t::NO_SHARD),
+      max(false) {}
 
-  ghobject_t(const hobject_t &obj) : hobj(obj), generation(NO_GEN), shard_id(shard_id_t::NO_SHARD) {}
+  explicit ghobject_t(const hobject_t &obj)
+    : hobj(obj),
+      generation(NO_GEN),
+      shard_id(shard_id_t::NO_SHARD),
+      max(false) {}
 
-  ghobject_t(const hobject_t &obj, gen_t gen, shard_id_t shard) : hobj(obj), generation(gen), shard_id(shard) {}
+  ghobject_t(const hobject_t &obj, gen_t gen, shard_id_t shard)
+    : hobj(obj),
+      generation(gen),
+      shard_id(shard),
+      max(false) {}
 
   static ghobject_t make_pgmeta(int64_t pool, uint32_t hash, shard_id_t shard) {
     hobject_t h(object_t(), string(), CEPH_NOSNAP, hash, pool, string());
     return ghobject_t(h, NO_GEN, shard);
   }
   bool is_pgmeta() const {
-    // make sure we are distinct from hobject_t(), which has pool -1
+    // make sure we are distinct from hobject_t(), which has pool INT64_MIN
     return hobj.pool >= 0 && hobj.oid.name.empty();
   }
 
@@ -294,11 +390,11 @@ public:
     ret.hobj.pool = hobj.pool;
     return ret;
   }
-  filestore_hobject_key_t get_filestore_key_u32() const {
-    return hobj.get_filestore_key_u32();
+  uint32_t get_nibblewise_key_u32() const {
+    return hobj.get_nibblewise_key_u32();
   }
-  filestore_hobject_key_t get_filestore_key() const {
-    return hobj.get_filestore_key();
+  uint32_t get_nibblewise_key() const {
+    return hobj.get_nibblewise_key();
   }
 
   bool is_degenerate() const {
@@ -313,13 +409,22 @@ public:
     return shard_id == shard_id_t::NO_SHARD;
   }
 
+  void set_shard(shard_id_t s) {
+    shard_id = s;
+  }
+
   // maximum sorted value.
   static ghobject_t get_max() {
-    ghobject_t h(hobject_t::get_max());
+    ghobject_t h;
+    h.max = true;
+    h.hobj = hobject_t::get_max();  // so that is_max() => hobj.is_max()
     return h;
   }
   bool is_max() const {
-    return hobj.is_max();
+    return max;
+  }
+  bool is_min() const {
+    return *this == ghobject_t();
   }
 
   void swap(ghobject_t &o) {
@@ -333,16 +438,37 @@ public:
   void decode(json_spirit::Value& v);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<ghobject_t*>& o);
-  friend bool operator<(const ghobject_t&, const ghobject_t&);
-  friend bool operator>(const ghobject_t&, const ghobject_t&);
-  friend bool operator<=(const ghobject_t&, const ghobject_t&);
-  friend bool operator>=(const ghobject_t&, const ghobject_t&);
+  friend int cmp_nibblewise(const ghobject_t& l, const ghobject_t& r);
+  friend int cmp_bitwise(const ghobject_t& l, const ghobject_t& r);
   friend bool operator==(const ghobject_t&, const ghobject_t&);
   friend bool operator!=(const ghobject_t&, const ghobject_t&);
+
+  struct NibblewiseComparator {
+    bool operator()(const ghobject_t& l, const ghobject_t& r) const {
+      return cmp_nibblewise(l, r) < 0;
+    }
+  };
+
+  struct BitwiseComparator {
+    bool operator()(const ghobject_t& l, const ghobject_t& r) const {
+      return cmp_bitwise(l, r) < 0;
+    }
+  };
+
+  struct Comparator {
+    bool bitwise;
+    Comparator(bool b) : bitwise(b) {}
+    bool operator()(const ghobject_t& l, const ghobject_t& r) const {
+         if (bitwise)
+	return cmp_bitwise(l, r) < 0;
+      else
+	return cmp_nibblewise(l, r) < 0;
+    }
+  };
 };
 WRITE_CLASS_ENCODER(ghobject_t)
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<ghobject_t> {
     size_t operator()(const ghobject_t &r) const {
       static hash<object_t> H;
@@ -350,18 +476,37 @@ CEPH_HASH_NAMESPACE_START
       return H(r.hobj.oid) ^ I(r.hobj.snap);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 ostream& operator<<(ostream& out, const ghobject_t& o);
 
-WRITE_EQ_OPERATORS_3(ghobject_t, hobj, shard_id, generation)
-// sort ghobject_t's by <hobj, shard_id, generation> 
-// 
-// Two objects which differ by generation are more related than
-// two objects of the same generation which differ by shard.
-// 
-WRITE_CMP_OPERATORS_3(ghobject_t,
-		      hobj,
-		      shard_id,
-		      generation)
+WRITE_EQ_OPERATORS_4(ghobject_t, max, shard_id, hobj, generation)
+
+extern int cmp_nibblewise(const ghobject_t& l, const ghobject_t& r);
+extern int cmp_bitwise(const ghobject_t& l, const ghobject_t& r);
+static inline int cmp(const ghobject_t& l, const ghobject_t& r,
+		      bool sort_bitwise) {
+  if (sort_bitwise)
+    return cmp_bitwise(l, r);
+  else
+    return cmp_nibblewise(l, r);
+}
+
+// these are convenient
+static inline ghobject_t MAX_GHOBJ(const ghobject_t& l, const ghobject_t& r,
+				   bool bitwise) {
+  if (cmp(l, r, bitwise) >= 0)
+    return l;
+  else
+    return r;
+}
+
+static inline ghobject_t MIN_GHOBJ(const ghobject_t& l, const ghobject_t& r,
+				   bool bitwise) {
+  if (cmp(l, r, bitwise) <= 0)
+    return l;
+  else
+    return r;
+}
+
 #endif
diff --git a/src/common/ipaddr.cc b/src/common/ipaddr.cc
index 3147d37..abf09b0 100644
--- a/src/common/ipaddr.cc
+++ b/src/common/ipaddr.cc
@@ -1,5 +1,6 @@
 #include "include/ipaddr.h"
 
+#include <sys/socket.h>
 #include <arpa/inet.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/src/common/lockdep.cc b/src/common/lockdep.cc
index 5f9fa19..79fd56c 100644
--- a/src/common/lockdep.cc
+++ b/src/common/lockdep.cc
@@ -19,10 +19,9 @@
 #include "lockdep.h"
 
 #include "include/unordered_map.h"
-#include "include/hash_namespace.h"
 
 #if defined(__FreeBSD__) && defined(__LP64__)	// On FreeBSD pthread_t is a pointer.
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<>
     struct hash<pthread_t>
     {
@@ -30,7 +29,7 @@ CEPH_HASH_NAMESPACE_START
       operator()(pthread_t __x) const
       { return (uintptr_t)__x; }
     };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 #endif
 
 /******* Constants **********/
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
index 81abe09..3772eb8 100644
--- a/src/common/obj_bencher.cc
+++ b/src/common/obj_bencher.cc
@@ -61,7 +61,7 @@ ostream& ObjBencher::out(ostream& os, utime_t& t)
   if (show_time)
     return t.localtime(os) << " ";
   else
-    return os << " ";
+    return os;
 }
 
 ostream& ObjBencher::out(ostream& os)
@@ -73,42 +73,47 @@ ostream& ObjBencher::out(ostream& os)
 void *ObjBencher::status_printer(void *_bencher) {
   ObjBencher *bencher = static_cast<ObjBencher *>(_bencher);
   bench_data& data = bencher->data;
+  Formatter *formatter = bencher->formatter;
+  ostream *outstream = bencher->outstream;
   Cond cond;
   int i = 0;
   int previous_writes = 0;
   int cycleSinceChange = 0;
   double bandwidth;
+  int iops;
   utime_t ONE_SECOND;
   ONE_SECOND.set_from_double(1.0);
   bencher->lock.Lock();
+  if (formatter)
+    formatter->open_array_section("datas");
   while(!data.done) {
     utime_t cur_time = ceph_clock_now(bencher->cct);
 
-    if (i % 20 == 0) {
+    if (i % 20 == 0 && !formatter) {
       if (i > 0)
-	cur_time.localtime(cout) << "min lat: " << data.min_latency
-	     << " max lat: " << data.max_latency
-	     << " avg lat: " << data.avg_latency << std::endl;
+        cur_time.localtime(cout) << " min lat: " << data.min_latency
+          << " max lat: " << data.max_latency
+          << " avg lat: " << data.avg_latency << std::endl;
       //I'm naughty and don't reset the fill
       bencher->out(cout, cur_time) << setfill(' ')
-	   << setw(5) << "sec"
-	   << setw(8) << "Cur ops"
-	   << setw(10) << "started"
-	   << setw(10) << "finished"
-	   << setw(10) << "avg MB/s"
-	   << setw(10) << "cur MB/s"
-	   << setw(10) << "last lat"
-	   << setw(10) << "avg lat" << std::endl;
+          << setw(5) << "sec"
+          << setw(8) << "Cur ops"
+          << setw(10) << "started"
+          << setw(10) << "finished"
+          << setw(10) << "avg MB/s"
+          << setw(10) << "cur MB/s"
+          << setw(10) << "last lat"
+          << setw(10) << "avg lat" << std::endl;
     }
     if (cycleSinceChange)
       bandwidth = (double)(data.finished - previous_writes)
-	* (data.trans_size)
-	/ (1024*1024)
-	/ cycleSinceChange;
+        * (data.object_size)
+        / (1024*1024)
+        / cycleSinceChange;
     else
-      bandwidth = 0;
+      bandwidth = -1;
 
-    if (!isnan(bandwidth)) {
+    if (!std::isnan(bandwidth) && bandwidth > -1) {
       if (bandwidth > data.idata.max_bandwidth)
         data.idata.max_bandwidth = bandwidth;
       if (bandwidth < data.idata.min_bandwidth)
@@ -117,97 +122,139 @@ void *ObjBencher::status_printer(void *_bencher) {
       data.history.bandwidth.push_back(bandwidth);
     }
 
-    double avg_bandwidth = (double) (data.trans_size) * (data.finished)
+    if (cycleSinceChange)
+      iops = (double)(data.finished - previous_writes)
+        / cycleSinceChange;
+    else
+      iops = -1;
+
+    if (!std::isnan(iops) && iops > -1) {
+      if (iops > data.idata.max_iops)
+        data.idata.max_iops = iops;
+      if (iops < data.idata.min_iops)
+        data.idata.min_iops = iops;
+
+      data.history.iops.push_back(iops);
+    }
+    
+    if (formatter)
+      formatter->open_object_section("data");
+
+    double avg_bandwidth = (double) (data.object_size) * (data.finished)
       / (double)(cur_time - data.start_time) / (1024*1024);
     if (previous_writes != data.finished) {
       previous_writes = data.finished;
       cycleSinceChange = 0;
-      bencher->out(cout, cur_time) << setfill(' ')
-	   << setw(5) << i
-	   << setw(8) << data.in_flight
-	   << setw(10) << data.started
-	   << setw(10) << data.finished
-	   << setw(10) << avg_bandwidth
-	   << setw(10) << bandwidth
-	   << setw(10) << (double)data.cur_latency
-	   << setw(10) << data.avg_latency << std::endl;
+      if (!formatter) {
+        bencher->out(cout, cur_time) << setfill(' ')
+          << setw(5) << i
+          << setw(8) << data.in_flight
+          << setw(10) << data.started
+          << setw(10) << data.finished
+          << setw(10) << avg_bandwidth
+          << setw(10) << bandwidth
+          << setw(10) << (double)data.cur_latency
+          << setw(10) << data.avg_latency << std::endl;
+      } else {
+        formatter->dump_format("sec", "%d", i);
+        formatter->dump_format("cur_ops", "%d", data.in_flight);
+        formatter->dump_format("started", "%d", data.started);
+        formatter->dump_format("finished", "%d", data.finished);
+        formatter->dump_format("avg_bw", "%f", avg_bandwidth);
+        formatter->dump_format("cur_bw", "%f", bandwidth);
+        formatter->dump_format("last_lat", "%f", (double)data.cur_latency);
+        formatter->dump_format("avg_lat", "%f", data.avg_latency);
+      }
     }
     else {
-      bencher->out(cout, cur_time) << setfill(' ')
-	   << setw(5) << i
-	   << setw(8) << data.in_flight
-	   << setw(10) << data.started
-	   << setw(10) << data.finished
-	   << setw(10) << avg_bandwidth
-	   << setw(10) << '0'
-	   << setw(10) << '-'
-	   << setw(10) << data.avg_latency << std::endl;
+      if (!formatter) {
+        bencher->out(cout, cur_time) << setfill(' ')
+          << setw(5) << i
+          << setw(8) << data.in_flight
+          << setw(10) << data.started
+          << setw(10) << data.finished
+          << setw(10) << avg_bandwidth
+          << setw(10) << '0'
+          << setw(10) << '-'
+          << setw(10) << data.avg_latency << std::endl;
+      } else {
+        formatter->dump_format("sec", "%d", i);
+        formatter->dump_format("cur_ops", "%d", data.in_flight);
+        formatter->dump_format("started", "%d", data.started);
+        formatter->dump_format("finished", "%d", data.finished);
+        formatter->dump_format("avg_bw", "%f", avg_bandwidth);
+        formatter->dump_format("cur_bw", "%f", 0);
+        formatter->dump_format("last_lat", "%f", 0);
+        formatter->dump_format("avg_lat", "%f", data.avg_latency);
+      }
+    }
+    if (formatter) {
+      formatter->close_section(); // data
+      formatter->flush(*outstream);
     }
     ++i;
     ++cycleSinceChange;
     cond.WaitInterval(bencher->cct, bencher->lock, ONE_SECOND);
   }
+  if (formatter)
+    formatter->close_section(); //datas
   bencher->lock.Unlock();
   return NULL;
 }
 
 int ObjBencher::aio_bench(
   int operation, int secondsToRun,
-  int maxObjectsToCreate,
-  int concurrentios, int op_size, bool cleanup, const char* run_name) {
+  int concurrentios, int object_size, bool cleanup, const std::string& run_name, bool no_verify) {
 
   if (concurrentios <= 0) 
     return -EINVAL;
 
-  int object_size = op_size;
   int num_objects = 0;
   int r = 0;
   int prevPid = 0;
 
   // default metadata object is used if user does not specify one
-  const std::string run_name_meta = (run_name == NULL ? BENCH_LASTRUN_METADATA : std::string(run_name));
+  const std::string run_name_meta = (run_name.empty() ? BENCH_LASTRUN_METADATA : run_name);
 
   //get data from previous write run, if available
   if (operation != OP_WRITE) {
     r = fetch_bench_metadata(run_name_meta, &object_size, &num_objects, &prevPid);
     if (r < 0) {
       if (r == -ENOENT)
-	cerr << "Must write data before running a read benchmark!" << std::endl;
+        cerr << "Must write data before running a read benchmark!" << std::endl;
       return r;
     }
-  } else {
-    object_size = op_size;
   }
 
   char* contentsChars = new char[object_size];
   lock.Lock();
   data.done = false;
   data.object_size = object_size;
-  data.trans_size = op_size;
   data.in_flight = 0;
   data.started = 0;
-  data.finished = num_objects;
+  data.finished = 0;
   data.min_latency = 9999.0; // this better be higher than initial latency!
   data.max_latency = 0;
   data.avg_latency = 0;
-  data.idata.min_bandwidth = 99999999.0;
-  data.idata.max_bandwidth = 0;
   data.object_contents = contentsChars;
   lock.Unlock();
 
   //fill in contentsChars deterministically so we can check returns
   sanitize_object_contents(&data, data.object_size);
 
+  if (formatter)
+    formatter->open_object_section("bench");
+
   if (OP_WRITE == operation) {
-    r = write_bench(secondsToRun, maxObjectsToCreate, concurrentios, run_name_meta);
+    r = write_bench(secondsToRun, concurrentios, run_name_meta);
     if (r != 0) goto out;
   }
   else if (OP_SEQ_READ == operation) {
-    r = seq_read_bench(secondsToRun, num_objects, concurrentios, prevPid);
+    r = seq_read_bench(secondsToRun, num_objects, concurrentios, prevPid, no_verify);
     if (r != 0) goto out;
   }
   else if (OP_RAND_READ == operation) {
-    r = rand_read_bench(secondsToRun, num_objects, concurrentios, prevPid);
+    r = rand_read_bench(secondsToRun, num_objects, concurrentios, prevPid, no_verify);
     if (r != 0) goto out;
   }
 
@@ -215,10 +262,10 @@ int ObjBencher::aio_bench(
     r = fetch_bench_metadata(run_name_meta, &object_size, &num_objects, &prevPid);
     if (r < 0) {
       if (r == -ENOENT)
-	cerr << "Should never happen: bench metadata missing for current run!" << std::endl;
+        cerr << "Should never happen: bench metadata missing for current run!" << std::endl;
       goto out;
     }
- 
+
     r = clean_up(num_objects, prevPid, concurrentios);
     if (r != 0) goto out;
 
@@ -228,6 +275,11 @@ int ObjBencher::aio_bench(
   }
 
  out:
+  if (formatter) {
+    formatter->close_section(); // bench
+    formatter->flush(*outstream);
+    *outstream << std::endl;
+  }
   delete[] contentsChars;
   return r;
 }
@@ -245,23 +297,24 @@ void _aio_cb(void *cb, void *arg) {
   lc->lock->Unlock();
 }
 
-static double vec_stddev(vector<double>& v)
+template<class T>
+static T vec_stddev(vector<T>& v)
 {
-  double mean = 0;
+  T mean = 0;
 
   if (v.size() < 2)
     return 0;
 
-  vector<double>::iterator iter;
+  typename vector<T>::iterator iter;
   for (iter = v.begin(); iter != v.end(); ++iter) {
     mean += *iter;
   }
 
   mean /= v.size();
 
-  double stddev = 0;
+  T stddev = 0;
   for (iter = v.begin(); iter != v.end(); ++iter) {
-    double dev = *iter - mean;
+    T dev = *iter - mean;
     dev *= dev;
     stddev += dev;
   }
@@ -289,22 +342,28 @@ int ObjBencher::fetch_bench_metadata(const std::string& metadata_file, int* obje
   return 0;
 }
 
-int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
+int ObjBencher::write_bench(int secondsToRun,
 			    int concurrentios, const string& run_name_meta) {
   if (concurrentios <= 0) 
     return -EINVAL;
-
-  if (maxObjectsToCreate > 0 && concurrentios > maxObjectsToCreate)
-    concurrentios = maxObjectsToCreate;
-  out(cout) << "Maintaining " << concurrentios << " concurrent writes of "
-	    << data.object_size << " bytes for up to "
-	    << secondsToRun << " seconds or "
-	    << maxObjectsToCreate << " objects"
-	    << std::endl;
+  
+  if (!formatter) {
+    out(cout) << "Maintaining " << concurrentios << " concurrent writes of "
+           << data.object_size << " bytes for up to "
+           << secondsToRun << " seconds"
+           << std::endl;
+  } else {
+    formatter->dump_format("concurrent_ios", "%d", concurrentios);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("seconds_to_run", "%d", secondsToRun);
+  }
   bufferlist* newContents = 0;
 
   std::string prefix = generate_object_prefix();
-  out(cout) << "Object prefix: " << prefix << std::endl;
+  if (!formatter)
+    out(cout) << "Object prefix: " << prefix << std::endl;
+  else
+    formatter->dump_string("object_prefix", prefix);
 
   std::vector<string> name(concurrentios);
   std::string newName;
@@ -332,6 +391,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
 
   pthread_create(&print_thread, NULL, ObjBencher::status_printer, (void *)this);
   lock.Lock();
+  data.finished = 0;
   data.start_time = ceph_clock_now(cct);
   lock.Unlock();
   for (int i = 0; i<concurrentios; ++i) {
@@ -358,16 +418,15 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
   stopTime = data.start_time + runtime;
   slot = 0;
   lock.Lock();
-  while( ceph_clock_now(cct) < stopTime &&
-	 (!maxObjectsToCreate || data.started < maxObjectsToCreate)) {
+  while(ceph_clock_now(cct) < stopTime) {
     bool found = false;
     while (1) {
       int old_slot = slot;
       do {
-	if (completion_is_done(slot)) {
-          found = true;
-	  break;
-	}
+        if (completion_is_done(slot)) {
+            found = true;
+            break;
+        }
         slot++;
         if (slot == concurrentios) {
           slot = 0;
@@ -379,10 +438,12 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
     }
     lock.Unlock();
     //create new contents and name on the heap, and fill them
-    newContents = new bufferlist();
     newName = generate_object_name(data.started);
-    snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", data.started);
-    newContents->append(data.object_contents, data.object_size);
+    newContents = contents[slot];
+    snprintf(newContents->c_str(), data.object_size, "I'm the %16dth object!", data.started);
+    // we wrote to buffer, going around internal crc cache, so invalidate it now.
+    newContents->invalidate_crc();
+
     completion_wait(slot);
     lock.Lock();
     r = completion_ret(slot);
@@ -402,8 +463,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
     release_completion(slot);
     timePassed = ceph_clock_now(cct) - data.start_time;
 
-    //write new stuff to backend, then delete old stuff
-    //and save locations of new stuff for later deletion
+    //write new stuff to backend
     start_times[slot] = ceph_clock_now(cct);
     r = create_completion(slot, _aio_cb, &lc);
     if (r < 0)
@@ -412,10 +472,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
     if (r < 0) {//naughty; doesn't clean up heap space.
       goto ERR;
     }
-    delete contents[slot];
     name[slot] = newName;
-    contents[slot] = newContents;
-    newContents = 0;
     lock.Lock();
     ++data.started;
     ++data.in_flight;
@@ -442,6 +499,7 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
     lock.Unlock();
     release_completion(slot);
     delete contents[slot];
+    contents[slot] = 0;
   }
 
   timePassed = ceph_clock_now(cct) - data.start_time;
@@ -454,21 +512,40 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
   double bandwidth;
   bandwidth = ((double)data.finished)*((double)data.object_size)/(double)timePassed;
   bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
-  char bw[20];
-  snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth);
 
-  out(cout) << "Total time run:         " << timePassed << std::endl
+  if (!formatter) {
+    out(cout) << "Total time run:         " << timePassed << std::endl
        << "Total writes made:      " << data.finished << std::endl
        << "Write size:             " << data.object_size << std::endl
-       << "Bandwidth (MB/sec):     " << bw << std::endl
+       << "Bandwidth (MB/sec):     " << setprecision(3) << bandwidth << std::endl
        << "Stddev Bandwidth:       " << vec_stddev(data.history.bandwidth) << std::endl
        << "Max bandwidth (MB/sec): " << data.idata.max_bandwidth << std::endl
        << "Min bandwidth (MB/sec): " << data.idata.min_bandwidth << std::endl
+       << "Average IOPS:           " << (int)(data.finished/timePassed) << std::endl
+       << "Stddev IOPS:            " << vec_stddev(data.history.iops) << std::endl
+       << "Max IOPS:               " << data.idata.max_iops << std::endl
+       << "Min IOPS:               " << data.idata.min_iops << std::endl
        << "Average Latency:        " << data.avg_latency << std::endl
        << "Stddev Latency:         " << vec_stddev(data.history.latency) << std::endl
        << "Max latency:            " << data.max_latency << std::endl
        << "Min latency:            " << data.min_latency << std::endl;
-
+  } else {
+    formatter->dump_format("total_time_run", "%f", (double)timePassed);
+    formatter->dump_format("total_writes_made", "%d", data.finished);
+    formatter->dump_format("write_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("stddev_bandwidth", "%f", vec_stddev(data.history.bandwidth));
+    formatter->dump_format("max_bandwidth", "%f", data.idata.max_bandwidth);
+    formatter->dump_format("min_bandwidth", "%f", data.idata.min_bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed));
+    formatter->dump_format("stddev_iops", "%d", vec_stddev(data.history.iops));
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("stddev_latency", "%f", vec_stddev(data.history.latency));
+    formatter->dump_format("max_latency:", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
   //write object size/number data for read benchmarks
   ::encode(data.object_size, b_write);
   ::encode(data.finished, b_write);
@@ -478,6 +555,9 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
   sync_write(run_name_meta, b_write, sizeof(int)*3);
 
   completions_done();
+  for (int i = 0; i < concurrentios; i++)
+      if (contents[i])
+          delete contents[i];
 
   return 0;
 
@@ -486,11 +566,13 @@ int ObjBencher::write_bench(int secondsToRun, int maxObjectsToCreate,
   data.done = 1;
   lock.Unlock();
   pthread_join(print_thread, NULL);
-  delete newContents;
-  return -5;
+  for (int i = 0; i < concurrentios; i++)
+      if (contents[i])
+          delete contents[i];
+  return r;
 }
 
-int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurrentios, int pid) {
+int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurrentios, int pid, bool no_verify) {
   lock_cond lc(&lock);
 
   if (concurrentios <= 0) 
@@ -558,46 +640,60 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
     bool found = false;
     while (1) {
       do {
-	if (completion_is_done(slot)) {
+        if (completion_is_done(slot)) {
           found = true;
-	  break;
-	}
+          break;
+        }
         slot++;
         if (slot == concurrentios) {
           slot = 0;
         }
       } while (slot != old_slot);
       if (found) {
-	break;
+        break;
       }
       lc.cond.Wait(lock);
     }
-    lock.Unlock();
-    newName = generate_object_name(data.started, pid);
+
+    // calculate latency here, so memcmp doesn't inflate it
+    data.cur_latency = ceph_clock_now(cct) - start_times[slot];
+
+    cur_contents = contents[slot];
     int current_index = index[slot];
+    
+    // invalidate internal crc cache
+    cur_contents->invalidate_crc();
+  
+    if (!no_verify) {
+      snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", current_index);
+      if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    }
+
+    newName = generate_object_name(data.started, pid);
     index[slot] = data.started;
+    lock.Unlock();
     completion_wait(slot);
-    lock.Lock();
     r = completion_ret(slot);
     if (r < 0) {
       cerr << "read got " << r << std::endl;
       lock.Unlock();
       goto ERR;
     }
-    data.cur_latency = ceph_clock_now(cct) - start_times[slot];
+    lock.Lock();
     total_latency += data.cur_latency;
-    if( data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
+    if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
     if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency;
     ++data.finished;
     data.avg_latency = total_latency / data.finished;
     --data.in_flight;
     lock.Unlock();
     release_completion(slot);
-    cur_contents = contents[slot];
 
     //start new read and check data if requested
     start_times[slot] = ceph_clock_now(cct);
-    contents[slot] = new bufferlist();
     create_completion(slot, _aio_cb, (void *)&lc);
     r = aio_read(newName, slot, contents[slot], data.object_size);
     if (r < 0) {
@@ -606,14 +702,8 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
     lock.Lock();
     ++data.started;
     ++data.in_flight;
-    snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", current_index);
     lock.Unlock();
-    if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
-      cerr << name[slot] << " is not correct!" << std::endl;
-      ++errors;
-    }
     name[slot] = newName;
-    delete cur_contents;
   }
 
   //wait for final reads to complete
@@ -635,11 +725,15 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
     data.avg_latency = total_latency / data.finished;
     --data.in_flight;
     release_completion(slot);
-    snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", index[slot]);
-    lock.Unlock();
-    if (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0) {
-      cerr << name[slot] << " is not correct!" << std::endl;
-      ++errors;
+    if (!no_verify) {
+      snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", index[slot]);
+      lock.Unlock();
+      if (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    } else {
+        lock.Unlock();
     }
     delete contents[slot];
   }
@@ -654,16 +748,32 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
   double bandwidth;
   bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime;
   bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
-  char bw[20];
-  snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth);
 
-  out(cout) << "Total time run:        " << runtime << std::endl
+  if (!formatter) {
+    out(cout) << "Total time run:       " << runtime << std::endl
        << "Total reads made:     " << data.finished << std::endl
        << "Read size:            " << data.object_size << std::endl
-       << "Bandwidth (MB/sec):    " << bw << std::endl
-       << "Average Latency:       " << data.avg_latency << std::endl
-       << "Max latency:           " << data.max_latency << std::endl
-       << "Min latency:           " << data.min_latency << std::endl;
+       << "Bandwidth (MB/sec):   " << setprecision(3) << bandwidth << std::endl
+       << "Average IOPS          " << (int)(data.finished/runtime) << std::endl
+       << "Stddev IOPS:          " << vec_stddev(data.history.iops) << std::endl
+       << "Max IOPS:             " << data.idata.max_iops << std::endl
+       << "Min IOPS:             " << data.idata.min_iops << std::endl
+       << "Average Latency:      " << data.avg_latency << std::endl
+       << "Max latency:          " << data.max_latency << std::endl
+       << "Min latency:          " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", (double)runtime);
+    formatter->dump_format("total_reads_made", "%d", data.finished);
+    formatter->dump_format("read_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/runtime));
+    formatter->dump_format("stddev_iops", "%d", vec_stddev(data.history.iops));
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
 
   completions_done();
 
@@ -674,16 +784,16 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
   data.done = 1;
   lock.Unlock();
   pthread_join(print_thread, NULL);
-  return -5;
+  return r;
 }
 
-int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurrentios, int pid)
+int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurrentios, int pid, bool no_verify)
 {
   lock_cond lc(&lock);
 
-  if (concurrentios <= 0) 
+  if (concurrentios <= 0)
     return -EINVAL;
- 
+
   std::vector<string> name(concurrentios);
   std::string newName;
   bufferlist* contents[concurrentios];
@@ -762,11 +872,14 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
       }
       lc.cond.Wait(lock);
     }
+
+    // calculate latency here, so memcmp doesn't inflate it
+    data.cur_latency = ceph_clock_now(cct) - start_times[slot];
+
     lock.Unlock();
-    rand_id = rand() % num_objects;
-    newName = generate_object_name(rand_id, pid);
+
     int current_index = index[slot];
-    index[slot] = rand_id;
+    cur_contents = contents[slot];
     completion_wait(slot);
     lock.Lock();
     r = completion_ret(slot);
@@ -775,20 +888,33 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
       lock.Unlock();
       goto ERR;
     }
-    data.cur_latency = ceph_clock_now(g_ceph_context) - start_times[slot];
+
     total_latency += data.cur_latency;
-    if( data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
+    if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
     if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency;
     ++data.finished;
     data.avg_latency = total_latency / data.finished;
     --data.in_flight;
     lock.Unlock();
+    
+    if (!no_verify) {
+      snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", current_index);
+      if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    } 
+
+    rand_id = rand() % num_objects;
+    newName = generate_object_name(rand_id, pid);
+    index[slot] = rand_id;
     release_completion(slot);
-    cur_contents = contents[slot];
+
+    // invalidate internal crc cache
+    cur_contents->invalidate_crc();
 
     //start new read and check data if requested
     start_times[slot] = ceph_clock_now(g_ceph_context);
-    contents[slot] = new bufferlist();
     create_completion(slot, _aio_cb, (void *)&lc);
     r = aio_read(newName, slot, contents[slot], data.object_size);
     if (r < 0) {
@@ -797,16 +923,11 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
     lock.Lock();
     ++data.started;
     ++data.in_flight;
-    snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", current_index);
     lock.Unlock();
-    if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
-      cerr << name[slot] << " is not correct!" << std::endl;
-      ++errors;
-    }
     name[slot] = newName;
-    delete cur_contents;
   }
 
+
   //wait for final reads to complete
   while (data.finished < data.started) {
     slot = data.finished % concurrentios;
@@ -826,11 +947,15 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
     data.avg_latency = total_latency / data.finished;
     --data.in_flight;
     release_completion(slot);
-    snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", index[slot]);
-    lock.Unlock();
-    if (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0) {
-      cerr << name[slot] << " is not correct!" << std::endl;
-      ++errors;
+    if (!no_verify) {
+      snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", index[slot]);
+      lock.Unlock();
+      if (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    } else {
+        lock.Unlock();
     }
     delete contents[slot];
   }
@@ -845,17 +970,32 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
   double bandwidth;
   bandwidth = ((double)data.finished)*((double)data.object_size)/(double)runtime;
   bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
-  char bw[20];
-  snprintf(bw, sizeof(bw), "%.3lf \n", bandwidth);
 
-  out(cout) << "Total time run:        " << runtime << std::endl
+  if (!formatter) {
+    out(cout) << "Total time run:       " << runtime << std::endl
        << "Total reads made:     " << data.finished << std::endl
        << "Read size:            " << data.object_size << std::endl
-       << "Bandwidth (MB/sec):    " << bw << std::endl
-       << "Average Latency:       " << data.avg_latency << std::endl
-       << "Max latency:           " << data.max_latency << std::endl
-       << "Min latency:           " << data.min_latency << std::endl;
-
+       << "Bandwidth (MB/sec):   " << setprecision(3) << bandwidth << std::endl
+       << "Average IOPS:         " << (int)(data.finished/runtime) << std::endl
+       << "Stddev IOPS:          " << vec_stddev(data.history.iops) << std::endl
+       << "Max IOPS:             " << data.idata.max_iops << std::endl
+       << "Min IOPS:             " << data.idata.min_iops << std::endl
+       << "Average Latency:      " << data.avg_latency << std::endl
+       << "Max latency:          " << data.max_latency << std::endl
+       << "Min latency:          " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", (double)runtime);
+    formatter->dump_format("total_reads_made", "%d", data.finished);
+    formatter->dump_format("read_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/runtime));
+    formatter->dump_format("stddev_iops", "%d", vec_stddev(data.history.iops));
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
   completions_done();
 
   return 0;
@@ -865,22 +1005,22 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
   data.done = 1;
   lock.Unlock();
   pthread_join(print_thread, NULL);
-  return -5;
+  return r;
 }
 
-int ObjBencher::clean_up(const char* prefix, int concurrentios, const char* run_name) {
+int ObjBencher::clean_up(const std::string& prefix, int concurrentios, const std::string& run_name) {
   int r = 0;
   int object_size;
   int num_objects;
   int prevPid;
 
   // default meta object if user does not specify one
-  const std::string run_name_meta = (run_name == NULL ? BENCH_LASTRUN_METADATA : std::string(run_name));
+  const std::string run_name_meta = (run_name.empty() ? BENCH_LASTRUN_METADATA : run_name);
 
   r = fetch_bench_metadata(run_name_meta, &object_size, &num_objects, &prevPid);
   if (r < 0) {
     // if the metadata file is not found we should try to do a linear search on the prefix
-    if (r == -ENOENT && prefix != NULL) {
+    if (r == -ENOENT && prefix != "") {
       return clean_up_slow(prefix, concurrentios);
     }
     else {
@@ -951,17 +1091,17 @@ int ObjBencher::clean_up(int num_objects, int prevPid, int concurrentios) {
     bool found = false;
     while (1) {
       do {
-	if (completion_is_done(slot)) {
+        if (completion_is_done(slot)) {
           found = true;
-	  break;
-	}
+          break;
+        }
         slot++;
         if (slot == concurrentios) {
           slot = 0;
         }
       } while (slot != old_slot);
       if (found) {
-	break;
+        break;
       }
       lc.cond.Wait(lock);
     }
@@ -1022,7 +1162,7 @@ int ObjBencher::clean_up(int num_objects, int prevPid, int concurrentios) {
   lock.Lock();
   data.done = 1;
   lock.Unlock();
-  return -5;
+  return r;
 }
 
 /**
@@ -1032,14 +1172,13 @@ int ObjBencher::clean_up(int num_objects, int prevPid, int concurrentios) {
  * prefix. The list is guaranteed to have at least one item when the
  * function returns true.
  *
- * @in
  * @param prefix the prefix to match against
- * @param objects return list of objects
+ * @param objects [out] return list of objects
  * @returns true if there are any objects in the store which match
  * the prefix, false if there are no more
  */
-bool ObjBencher::more_objects_matching_prefix(const std::string& prefix, std::list<std::string>* objects) {
-  std::list<std::string> unfiltered_objects;
+bool ObjBencher::more_objects_matching_prefix(const std::string& prefix, std::list<Object>* objects) {
+  std::list<Object> unfiltered_objects;
 
   objects->clear();
 
@@ -1048,12 +1187,10 @@ bool ObjBencher::more_objects_matching_prefix(const std::string& prefix, std::li
     if (!objects_remain)
       return false;
 
-    std::list<std::string>::const_iterator i = unfiltered_objects.begin();
+    std::list<Object>::const_iterator i = unfiltered_objects.begin();
     for ( ; i != unfiltered_objects.end(); ++i) {
-      const std::string& next = *i;
-
-      if (next.substr(0, prefix.length()) == prefix) {
-        objects->push_back(next);
+      if (i->first.substr(0, prefix.length()) == prefix) {
+        objects->push_back(*i);
       }
     }
   }
@@ -1067,12 +1204,12 @@ int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) {
   if (concurrentios <= 0) 
     return -EINVAL;
 
-  std::vector<string> name(concurrentios);
-  std::string newName;
+  std::vector<Object> name(concurrentios);
+  Object newName;
   int r = 0;
   utime_t runtime;
   int slot = 0;
-  std::list<std::string> objects;
+  std::list<Object> objects;
   bool objects_remain = true;
 
   lock.Lock();
@@ -1107,7 +1244,8 @@ int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) {
   //start initial removes
   for (int i = 0; i < concurrentios; ++i) {
     create_completion(i, _aio_cb, (void *)&lc);
-    r = aio_remove(name[i], i);
+    set_namespace(name[i].second);
+    r = aio_remove(name[i].first, i);
     if (r < 0) { //naughty, doesn't clean up heap
       cerr << "r = " << r << std::endl;
       goto ERR;
@@ -1125,17 +1263,17 @@ int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) {
     bool found = false;
     while (1) {
       do {
-	if (completion_is_done(slot)) {
+        if (completion_is_done(slot)) {
           found = true;
-	  break;
-	}
+          break;
+        }
         slot++;
         if (slot == concurrentios) {
           slot = 0;
         }
       } while (slot != old_slot);
       if (found) {
-	break;
+        break;
       }
       lc.cond.Wait(lock);
     }
@@ -1169,7 +1307,8 @@ int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) {
 
     //start new remove and check data if requested
     create_completion(slot, _aio_cb, (void *)&lc);
-    r = aio_remove(newName, slot);
+    set_namespace(newName.second);
+    r = aio_remove(newName.first, slot);
     if (r < 0) {
       goto ERR;
     }
diff --git a/src/common/obj_bencher.h b/src/common/obj_bencher.h
index 4d89f41..34e22c2 100644
--- a/src/common/obj_bencher.h
+++ b/src/common/obj_bencher.h
@@ -18,21 +18,25 @@
 #include "common/config.h"
 #include "common/Cond.h"
 #include "common/ceph_context.h"
+#include "common/Formatter.h"
+#include <cfloat>
 
 struct bench_interval_data {
-  double min_bandwidth;
-  double max_bandwidth;
+  double min_bandwidth = DBL_MAX;
+  double max_bandwidth = 0;
+  int min_iops = INT_MAX;
+  int max_iops = 0;
 };
 
 struct bench_history {
   vector<double> bandwidth;
   vector<double> latency;
+  vector<long> iops;
 };
 
 struct bench_data {
   bool done; //is the benchmark is done
   int object_size; //the size of the objects
-  int trans_size; //size of the write/read to perform
   // same as object_size for write tests
   int in_flight; //number of reads/writes being waited on
   int started;
@@ -51,8 +55,13 @@ const int OP_WRITE     = 1;
 const int OP_SEQ_READ  = 2;
 const int OP_RAND_READ = 3;
 
+// Object is composed of <oid,namespace>
+typedef std::pair<std::string, std::string> Object;
+
 class ObjBencher {
   bool show_time;
+  Formatter *formatter = NULL;
+  ostream *outstream = NULL;
 public:
   CephContext *cct;
 protected:
@@ -64,13 +73,12 @@ protected:
 
   int fetch_bench_metadata(const std::string& metadata_file, int* object_size, int* num_objects, int* prevPid);
 
-  int write_bench(int secondsToRun, int maxObjects, int concurrentios, const string& run_name_meta);
-  int seq_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid);
-  int rand_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid);
+  int write_bench(int secondsToRun, int concurrentios, const string& run_name_meta);
+  int seq_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid, bool no_verify=false);
+  int rand_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid, bool no_verify=false);
 
   int clean_up(int num_objects, int prevPid, int concurrentios);
-  int clean_up_slow(const std::string& prefix, int concurrentios);
-  bool more_objects_matching_prefix(const std::string& prefix, std::list<std::string>* name);
+  bool more_objects_matching_prefix(const std::string& prefix, std::list<Object>* name);
 
   virtual int completions_init(int concurrentios) = 0;
   virtual void completions_done() = 0;
@@ -89,7 +97,8 @@ protected:
   virtual int sync_write(const std::string& oid, bufferlist& bl, size_t len) = 0;
   virtual int sync_remove(const std::string& oid) = 0;
 
-  virtual bool get_objects(std::list<std::string>* objects, int num) = 0;
+  virtual bool get_objects(std::list< std::pair<std::string, std::string> >* objects, int num) = 0;
+  virtual void set_namespace(const std::string&) {}
 
   ostream& out(ostream& os);
   ostream& out(ostream& os, utime_t& t);
@@ -97,13 +106,20 @@ public:
   ObjBencher(CephContext *cct_) : show_time(false), cct(cct_), lock("ObjBencher::lock") {}
   virtual ~ObjBencher() {}
   int aio_bench(
-    int operation, int secondsToRun, int maxObjectsToCreate,
-    int concurrentios, int op_size, bool cleanup, const char* run_name);
-  int clean_up(const char* prefix, int concurrentios, const char* run_name);
+    int operation, int secondsToRun,
+    int concurrentios, int op_size, bool cleanup, const std::string& run_name, bool no_verify=false);
+  int clean_up(const std::string& prefix, int concurrentios, const std::string& run_name);
 
   void set_show_time(bool dt) {
     show_time = dt;
   }
+  void set_formatter(Formatter *f) {
+    formatter = f;
+  }
+  void set_outstream(ostream& os) {
+    outstream = &os;
+  }
+  int clean_up_slow(const std::string& prefix, int concurrentios);
 };
 
 
diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc
index 6124272..0c86f13 100644
--- a/src/common/perf_counters.cc
+++ b/src/common/perf_counters.cc
@@ -292,6 +292,18 @@ void PerfCounters::dump_formatted(Formatter *f, bool schema,
     if (schema) {
       f->open_object_section(d->name);
       f->dump_int("type", d->type);
+
+      if (d->description) {
+        f->dump_string("description", d->description);
+      } else {
+        f->dump_string("description", "");
+      }
+
+      if (d->nick != NULL) {
+        f->dump_string("nick", d->nick);
+      } else {
+        f->dump_string("nick", "");
+      }
       f->close_section();
     } else {
       if (d->type & PERFCOUNTER_LONGRUNAVG) {
@@ -356,32 +368,38 @@ PerfCountersBuilder::~PerfCountersBuilder()
   m_perf_counters = NULL;
 }
 
-void PerfCountersBuilder::add_u64_counter(int idx, const char *name)
+void PerfCountersBuilder::add_u64_counter(int idx, const char *name,
+    const char *description, const char *nick)
 {
-  add_impl(idx, name, PERFCOUNTER_U64 | PERFCOUNTER_COUNTER);
+  add_impl(idx, name, description, nick, PERFCOUNTER_U64 | PERFCOUNTER_COUNTER);
 }
 
-void PerfCountersBuilder::add_u64(int idx, const char *name)
+void PerfCountersBuilder::add_u64(int idx, const char *name,
+    const char *description, const char *nick)
 {
-  add_impl(idx, name, PERFCOUNTER_U64);
+  add_impl(idx, name, description, nick, PERFCOUNTER_U64);
 }
 
-void PerfCountersBuilder::add_u64_avg(int idx, const char *name)
+void PerfCountersBuilder::add_u64_avg(int idx, const char *name,
+    const char *description, const char *nick)
 {
-  add_impl(idx, name, PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG);
+  add_impl(idx, name, description, nick, PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG);
 }
 
-void PerfCountersBuilder::add_time(int idx, const char *name)
+void PerfCountersBuilder::add_time(int idx, const char *name,
+    const char *description, const char *nick)
 {
-  add_impl(idx, name, PERFCOUNTER_TIME);
+  add_impl(idx, name, description, nick, PERFCOUNTER_TIME);
 }
 
-void PerfCountersBuilder::add_time_avg(int idx, const char *name)
+void PerfCountersBuilder::add_time_avg(int idx, const char *name,
+    const char *description, const char *nick)
 {
-  add_impl(idx, name, PERFCOUNTER_TIME | PERFCOUNTER_LONGRUNAVG);
+  add_impl(idx, name, description, nick, PERFCOUNTER_TIME | PERFCOUNTER_LONGRUNAVG);
 }
 
-void PerfCountersBuilder::add_impl(int idx, const char *name, int ty)
+void PerfCountersBuilder::add_impl(int idx, const char *name,
+    const char *description, const char *nick, int ty)
 {
   assert(idx > m_perf_counters->m_lower_bound);
   assert(idx < m_perf_counters->m_upper_bound);
@@ -390,6 +408,8 @@ void PerfCountersBuilder::add_impl(int idx, const char *name, int ty)
     &data(vec[idx - m_perf_counters->m_lower_bound - 1]);
   assert(data.type == PERFCOUNTER_NONE);
   data.name = name;
+  data.description = description;
+  data.nick = nick;
   data.type = (enum perfcounter_type_d)ty;
 }
 
diff --git a/src/common/perf_counters.h b/src/common/perf_counters.h
index 8850138..34f4067 100644
--- a/src/common/perf_counters.h
+++ b/src/common/perf_counters.h
@@ -114,6 +114,8 @@ private:
   struct perf_counter_data_any_d {
     perf_counter_data_any_d()
       : name(NULL),
+        description(NULL),
+        nick(NULL),
 	type(PERFCOUNTER_NONE),
 	u64(0),
 	avgcount(0),
@@ -121,6 +123,8 @@ private:
     {}
     perf_counter_data_any_d(const perf_counter_data_any_d& other)
       : name(other.name),
+        description(other.description),
+        nick(other.nick),
 	type(other.type),
 	u64(other.u64.read()) {
       pair<uint64_t,uint64_t> a = other.read_avg();
@@ -129,10 +133,9 @@ private:
       avgcount2.set(a.second);
     }
 
-    void write_schema_json(char *buf, size_t buf_sz) const;
-    void  write_json(char *buf, size_t buf_sz) const;
-
     const char *name;
+    const char *description;
+    const char *nick;
     enum perfcounter_type_d type;
     atomic64_t u64;
     atomic64_t avgcount;
@@ -149,6 +152,8 @@ private:
 
     perf_counter_data_any_d& operator=(const perf_counter_data_any_d& other) {
       name = other.name;
+      description = other.description;
+      nick = other.nick;
       type = other.type;
       pair<uint64_t,uint64_t> a = other.read_avg();
       u64.set(a.first);
@@ -234,16 +239,22 @@ public:
   PerfCountersBuilder(CephContext *cct, const std::string &name,
 		    int first, int last);
   ~PerfCountersBuilder();
-  void add_u64(int key, const char *name);
-  void add_u64_counter(int key, const char *name);
-  void add_u64_avg(int key, const char *name);
-  void add_time(int key, const char *name);
-  void add_time_avg(int key, const char *name);
+  void add_u64(int key, const char *name,
+      const char *description=NULL, const char *nick = NULL);
+  void add_u64_counter(int key, const char *name,
+      const char *description=NULL, const char *nick = NULL);
+  void add_u64_avg(int key, const char *name,
+      const char *description=NULL, const char *nick = NULL);
+  void add_time(int key, const char *name,
+      const char *description=NULL, const char *nick = NULL);
+  void add_time_avg(int key, const char *name,
+      const char *description=NULL, const char *nick = NULL);
   PerfCounters* create_perf_counters();
 private:
   PerfCountersBuilder(const PerfCountersBuilder &rhs);
   PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
-  void add_impl(int idx, const char *name, int ty);
+  void add_impl(int idx, const char *name,
+                const char *description, const char *nick, int ty);
 
   PerfCounters *m_perf_counters;
 };
diff --git a/src/common/safe_io.c b/src/common/safe_io.c
index 76fd25a..9367a9a 100644
--- a/src/common/safe_io.c
+++ b/src/common/safe_io.c
@@ -133,6 +133,8 @@ ssize_t safe_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out,
       }
       if (errno == EINTR)
 	continue;
+      if (errno == EAGAIN)
+	break;
       return -errno;
     }
     cnt += r;
diff --git a/src/common/shared_cache.hpp b/src/common/shared_cache.hpp
index aaa47f0..c55119c 100644
--- a/src/common/shared_cache.hpp
+++ b/src/common/shared_cache.hpp
@@ -22,7 +22,7 @@
 #include "common/Mutex.h"
 #include "common/Cond.h"
 
-template <class K, class V>
+template <class K, class V, class C = std::less<K> >
 class SharedLRU {
   CephContext *cct;
   typedef ceph::shared_ptr<V> VPtr;
@@ -34,10 +34,10 @@ class SharedLRU {
 public:
   int waiting;
 private:
-  map<K, typename list<pair<K, VPtr> >::iterator > contents;
+  map<K, typename list<pair<K, VPtr> >::iterator, C> contents;
   list<pair<K, VPtr> > lru;
 
-  map<K, pair<WeakVPtr, V*> > weak_refs;
+  map<K, pair<WeakVPtr, V*>, C> weak_refs;
 
   void trim_cache(list<VPtr> *to_release) {
     while (size > max_size) {
@@ -47,7 +47,7 @@ private:
   }
 
   void lru_remove(const K& key) {
-    typename map<K, typename list<pair<K, VPtr> >::iterator>::iterator i =
+    typename map<K, typename list<pair<K, VPtr> >::iterator, C>::iterator i =
       contents.find(key);
     if (i == contents.end())
       return;
@@ -57,7 +57,7 @@ private:
   }
 
   void lru_add(const K& key, const VPtr& val, list<VPtr> *to_release) {
-    typename map<K, typename list<pair<K, VPtr> >::iterator>::iterator i =
+    typename map<K, typename list<pair<K, VPtr> >::iterator, C>::iterator i =
       contents.find(key);
     if (i != contents.end()) {
       lru.splice(lru.begin(), lru, i->second);
@@ -71,7 +71,7 @@ private:
 
   void remove(const K& key, V *valptr) {
     Mutex::Locker l(lock);
-    typename map<K, pair<WeakVPtr, V*> >::iterator i = weak_refs.find(key);
+    typename map<K, pair<WeakVPtr, V*>, C>::iterator i = weak_refs.find(key);
     if (i != weak_refs.end() && i->second.second == valptr) {
       weak_refs.erase(i);
     }
@@ -80,9 +80,9 @@ private:
 
   class Cleanup {
   public:
-    SharedLRU<K, V> *cache;
+    SharedLRU<K, V, C> *cache;
     K key;
-    Cleanup(SharedLRU<K, V> *cache, K key) : cache(cache), key(key) {}
+    Cleanup(SharedLRU<K, V, C> *cache, K key) : cache(cache), key(key) {}
     void operator()(V *ptr) {
       cache->remove(key, ptr);
       delete ptr;
@@ -105,6 +105,24 @@ public:
     }
   }
 
+  /// adjust container comparator (for purposes of get_next sort order)
+  void reset_comparator(C comp) {
+    // get_next uses weak_refs; that's the only container we need to
+    // reorder.
+    map<K, pair<WeakVPtr, V*>, C> temp;
+
+    Mutex::Locker l(lock);
+    temp.swap(weak_refs);
+
+    // reconstruct with new comparator
+    weak_refs = map<K, pair<WeakVPtr, V*>, C>(comp);
+    weak_refs.insert(temp.begin(), temp.end());
+  }
+
+  C get_comparator() {
+    return weak_refs.key_comp();
+  }
+
   void set_cct(CephContext *c) {
     cct = c;
   }
@@ -116,7 +134,7 @@ public:
   }
 
   void dump_weak_refs(ostream& out) {
-    for (typename map<K, pair<WeakVPtr, V*> >::iterator p = weak_refs.begin();
+    for (typename map<K, pair<WeakVPtr, V*>, C>::iterator p = weak_refs.begin();
 	 p != weak_refs.end();
 	 ++p) {
       out << __func__ << " " << this << " weak_refs: "
@@ -188,7 +206,7 @@ public:
 	retry = false;
 	if (weak_refs.empty())
 	  break;
-	typename map<K, pair<WeakVPtr, V*> >::iterator i =
+	typename map<K, pair<WeakVPtr, V*>, C>::iterator i =
 	  weak_refs.lower_bound(key);
 	if (i == weak_refs.end())
 	  --i;
@@ -210,7 +228,7 @@ public:
     {
       Mutex::Locker l(lock);
       VPtr next_val;
-      typename map<K, pair<WeakVPtr, V*> >::iterator i = weak_refs.upper_bound(key);
+      typename map<K, pair<WeakVPtr, V*>, C>::iterator i = weak_refs.upper_bound(key);
 
       while (i != weak_refs.end() &&
 	     !(next_val = i->second.first.lock()))
@@ -246,7 +264,7 @@ public:
       bool retry = false;
       do {
 	retry = false;
-	typename map<K, pair<WeakVPtr, V*> >::iterator i = weak_refs.find(key);
+	typename map<K, pair<WeakVPtr, V*>, C>::iterator i = weak_refs.find(key);
 	if (i != weak_refs.end()) {
 	  val = i->second.first.lock();
 	  if (val) {
@@ -270,7 +288,7 @@ public:
       bool retry = false;
       do {
 	retry = false;
-	typename map<K, pair<WeakVPtr, V*> >::iterator i = weak_refs.find(key);
+	typename map<K, pair<WeakVPtr, V*>, C>::iterator i = weak_refs.find(key);
 	if (i != weak_refs.end()) {
 	  val = i->second.first.lock();
 	  if (val) {
@@ -320,7 +338,7 @@ public:
     list<VPtr> to_release;
     {
       Mutex::Locker l(lock);
-      typename map<K, pair<WeakVPtr, V*> >::iterator actual =
+      typename map<K, pair<WeakVPtr, V*>, C>::iterator actual =
 	weak_refs.lower_bound(key);
       if (actual != weak_refs.end() && actual->first == key) {
         if (existed) 
diff --git a/src/common/sharedptr_registry.hpp b/src/common/sharedptr_registry.hpp
index b8c26ce..bb0fcad 100644
--- a/src/common/sharedptr_registry.hpp
+++ b/src/common/sharedptr_registry.hpp
@@ -24,7 +24,7 @@
  * Provides a registry of shared_ptr<V> indexed by K while
  * the references are alive.
  */
-template <class K, class V>
+template <class K, class V, class C = std::less<K> >
 class SharedPtrRegistry {
 public:
   typedef ceph::shared_ptr<V> VPtr;
@@ -33,18 +33,18 @@ public:
 private:
   Mutex lock;
   Cond cond;
-  map<K, pair<WeakVPtr, V*> > contents;
+  map<K, pair<WeakVPtr, V*>, C> contents;
 
   class OnRemoval {
-    SharedPtrRegistry<K,V> *parent;
+    SharedPtrRegistry<K,V,C> *parent;
     K key;
   public:
-    OnRemoval(SharedPtrRegistry<K,V> *parent, K key) :
+    OnRemoval(SharedPtrRegistry<K,V,C> *parent, K key) :
       parent(parent), key(key) {}
     void operator()(V *to_remove) {
       {
 	Mutex::Locker l(parent->lock);
-	typename map<K, pair<WeakVPtr, V*> >::iterator i =
+	typename map<K, pair<WeakVPtr, V*>, C>::iterator i =
 	  parent->contents.find(key);
 	if (i != parent->contents.end() &&
 	    i->second.second == to_remove) {
@@ -73,7 +73,7 @@ public:
     {
       Mutex::Locker l(lock);
       VPtr next_val;
-      typename map<K, pair<WeakVPtr, V*> >::iterator i =
+      typename map<K, pair<WeakVPtr, V*>, C>::iterator i =
 	contents.upper_bound(key);
       while (i != contents.end() &&
 	     !(next_val = i->second.first.lock()))
@@ -92,7 +92,7 @@ public:
   bool get_next(const K &key, pair<K, V> *next) {
     VPtr next_val;
     Mutex::Locker l(lock);
-    typename map<K, pair<WeakVPtr, V*> >::iterator i =
+    typename map<K, pair<WeakVPtr, V*>, C>::iterator i =
       contents.upper_bound(key);
     while (i != contents.end() &&
 	   !(next_val = i->second.first.lock()))
@@ -108,7 +108,7 @@ public:
     Mutex::Locker l(lock);
     waiting++;
     while (1) {
-      typename map<K, pair<WeakVPtr, V*> >::iterator i =
+      typename map<K, pair<WeakVPtr, V*>, C>::iterator i =
 	contents.find(key);
       if (i != contents.end()) {
 	VPtr retval = i->second.first.lock();
@@ -129,7 +129,7 @@ public:
     Mutex::Locker l(lock);
     waiting++;
     while (1) {
-      typename map<K, pair<WeakVPtr, V*> >::iterator i =
+      typename map<K, pair<WeakVPtr, V*>, C>::iterator i =
 	contents.find(key);
       if (i != contents.end()) {
 	VPtr retval = i->second.first.lock();
@@ -165,7 +165,7 @@ public:
     Mutex::Locker l(lock);
     waiting++;
     while (1) {
-      typename map<K, pair<WeakVPtr, V*> >::iterator i =
+      typename map<K, pair<WeakVPtr, V*>, C>::iterator i =
 	contents.find(key);
       if (i != contents.end()) {
 	VPtr retval = i->second.first.lock();
diff --git a/src/common/simple_cache.hpp b/src/common/simple_cache.hpp
index 30abd0a..8038306 100644
--- a/src/common/simple_cache.hpp
+++ b/src/common/simple_cache.hpp
@@ -21,13 +21,13 @@
 #include "common/Mutex.h"
 #include "common/Cond.h"
 
-template <class K, class V>
+template <class K, class V, class C = std::less<K> >
 class SimpleLRU {
   Mutex lock;
   size_t max_size;
-  map<K, typename list<pair<K, V> >::iterator> contents;
+  map<K, typename list<pair<K, V> >::iterator, C> contents;
   list<pair<K, V> > lru;
-  map<K, V> pinned;
+  map<K, V, C> pinned;
 
   void trim_cache() {
     while (lru.size() > max_size) {
@@ -52,7 +52,7 @@ public:
 
   void clear_pinned(K e) {
     Mutex::Locker l(lock);
-    for (typename map<K, V>::iterator i = pinned.begin();
+    for (typename map<K, V, C>::iterator i = pinned.begin();
 	 i != pinned.end() && i->first <= e;
 	 pinned.erase(i++)) {
       if (!contents.count(i->first))
@@ -64,7 +64,7 @@ public:
 
   void clear(K key) {
     Mutex::Locker l(lock);
-    typename map<K, typename list<pair<K, V> >::iterator>::iterator i =
+    typename map<K, typename list<pair<K, V> >::iterator, C>::iterator i =
       contents.find(key);
     if (i == contents.end())
       return;
diff --git a/src/common/str_map.cc b/src/common/str_map.cc
index b731af3..bd68612 100644
--- a/src/common/str_map.cc
+++ b/src/common/str_map.cc
@@ -58,6 +58,20 @@ int get_json_str_map(
   }
   return 0;
 }
+string trim(const string& str) {
+  size_t start = 0;
+  size_t end = str.size() - 1;
+  while (isspace(str[start]) != 0 && start <= end) {
+    ++start;
+  }
+  while (isspace(str[end]) != 0 && start <= end) {
+    --end;
+  }
+  if (start <= end) {
+    return str.substr(start, end - start + 1);
+  }
+  return string();
+}
 
 int get_str_map(
     const string &str,
@@ -71,9 +85,9 @@ int get_str_map(
     if (equal == string::npos)
       (*str_map)[*i] = string();
     else {
-      const string key = i->substr(0, equal);
+      const string key = trim(i->substr(0, equal));
       equal++;
-      const string value = i->substr(equal);
+      const string value = trim(i->substr(equal));
       (*str_map)[key] = value;
     }
   }
diff --git a/src/common/strtol.cc b/src/common/strtol.cc
index 8a43eb5..ea39ba0 100644
--- a/src/common/strtol.cc
+++ b/src/common/strtol.cc
@@ -24,26 +24,32 @@ using std::ostringstream;
 long long strict_strtoll(const char *str, int base, std::string *err)
 {
   char *endptr;
+  std::string errStr;
   errno = 0; /* To distinguish success/failure after call (see man page) */
   long long ret = strtoll(str, &endptr, base);
 
   if ((errno == ERANGE && (ret == LLONG_MAX || ret == LLONG_MIN))
       || (errno != 0 && ret == 0)) {
-    ostringstream oss;
-    oss << "strict_strtoll: integer underflow or overflow parsing '" << str << "'";
-    *err = oss.str();
+    errStr = "The option value '";
+    errStr.append(str);
+    errStr.append("'");
+    errStr.append(" seems to be invalid");
+    *err = errStr;
     return 0;
   }
   if (endptr == str) {
-    ostringstream oss;
-    oss << "strict_strtoll: expected integer, got: '" << str << "'";
-    *err = oss.str();
+    errStr = "Expected option value to be integer, got '";
+    errStr.append(str);
+    errStr.append("'");
+    *err =  errStr;
     return 0;
   }
   if (*endptr != '\0') {
-    ostringstream oss;
-    oss << "strict_strtoll: garbage at end of string. got: '" << str << "'";
-    *err = oss.str();
+    errStr = "The option value '";
+    errStr.append(str);
+    errStr.append("'");
+    errStr.append(" seems to be invalid");
+    *err =  errStr;
     return 0;
   }
   *err = "";
@@ -52,19 +58,16 @@ long long strict_strtoll(const char *str, int base, std::string *err)
 
 int strict_strtol(const char *str, int base, std::string *err)
 {
+  std::string errStr;
   long long ret = strict_strtoll(str, base, err);
   if (!err->empty())
     return 0;
-  if (ret <= INT_MIN) {
-    ostringstream oss;
-    oss << "strict_strtol: integer underflow parsing '" << str << "'";
-    *err = oss.str();
-    return 0;
-  }
-  if (ret >= INT_MAX) {
-    ostringstream oss;
-    oss << "strict_strtol: integer overflow parsing '" << str << "'";
-    *err = oss.str();
+  if ((ret <= INT_MIN) || (ret >= INT_MAX)) {
+    errStr = "The option value '";
+    errStr.append(str);
+    errStr.append("'");
+    errStr.append(" seems to be invalid");
+    *err = errStr;
     return 0;
   }
   return static_cast<int>(ret);
diff --git a/src/common/sync_filesystem.h b/src/common/sync_filesystem.h
index b7eaea7..a664a86 100644
--- a/src/common/sync_filesystem.h
+++ b/src/common/sync_filesystem.h
@@ -17,12 +17,10 @@
 
 #include <unistd.h>
 
-#ifndef __CYGWIN__
-# ifndef DARWIN
-#  include <sys/ioctl.h>
-#  include <syscall.h>
-#  include "../os/btrfs_ioctl.h"
-# endif
+#if defined(__linux__)
+#include <sys/ioctl.h>
+#include <syscall.h>
+#include "../os/btrfs_ioctl.h"
 #endif
 
 inline int sync_filesystem(int fd)
@@ -34,22 +32,25 @@ inline int sync_filesystem(int fd)
 #ifdef HAVE_SYS_SYNCFS
   if (syncfs(fd) == 0)
     return 0;
-  else
-    return -errno;
 #elif defined(SYS_syncfs)
   if (syscall(SYS_syncfs, fd) == 0)
     return 0;
-  else
-    return -errno;
 #elif defined(__NR_syncfs)
   if (syscall(__NR_syncfs, fd) == 0)
     return 0;
-  else
-    return -errno;
 #endif
 
+#if defined(HAVE_SYS_SYNCFS) || defined(SYS_syncfs) || defined(__NR_syncfs)
+  else if (errno == ENOSYS) {
+    sync();
+    return 0;
+  } else {
+    return -errno;
+  }
+#else
   sync();
   return 0;
+#endif
 }
 
 #endif
diff --git a/src/common/tracked_int_ptr.hpp b/src/common/tracked_int_ptr.hpp
index ba0900d..e0e4238 100644
--- a/src/common/tracked_int_ptr.hpp
+++ b/src/common/tracked_int_ptr.hpp
@@ -50,12 +50,21 @@ public:
     TrackedIntPtr o(rhs.ptr);
     swap(o);
   }
+  const T &operator*() const {
+    return *ptr;
+  }
   T &operator*() {
     return *ptr;
   }
+  const T *operator->() const {
+    return ptr;
+  }
   T *operator->() {
     return ptr;
   }
+  operator bool() const {
+    return ptr != NULL;
+  }
   bool operator<(const TrackedIntPtr &lhs) const {
     return ptr < lhs.ptr;
   }
diff --git a/src/common/types.cc b/src/common/types.cc
index 1ad2ccc..3686132 100644
--- a/src/common/types.cc
+++ b/src/common/types.cc
@@ -22,11 +22,11 @@
 #define UINT8_MAX (255)
 #endif
 
-const shard_id_t shard_id_t::NO_SHARD(UINT8_MAX);
+const shard_id_t shard_id_t::NO_SHARD(-1);
 
 ostream &operator<<(ostream &lhs, const shard_id_t &rhs)
 {
-  return lhs << (unsigned)rhs.id;
+  return lhs << (unsigned)(uint8_t)rhs.id;
 }
 
 #endif
diff --git a/src/common/util.cc b/src/common/util.cc
index 212384b..f9262f3 100644
--- a/src/common/util.cc
+++ b/src/common/util.cc
@@ -13,8 +13,11 @@
  */
 
 #include <errno.h>
+#include <sys/utsname.h>
+#include <boost/lexical_cast.hpp>
 
 #include "include/util.h"
+#include "common/debug.h"
 #include "common/errno.h"
 #include "common/strtol.h"
 
@@ -22,6 +25,11 @@
 #include <sys/vfs.h>
 #endif
 
+#if defined(DARWIN) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
 // test if an entire buf is zero in 8-byte chunks
 bool buf_is_zero(const char *buf, size_t len)
 {
@@ -126,3 +134,127 @@ int get_fs_stats(ceph_data_stats_t &stats, const char *path)
   stats.avail_percent = (((float)stats.byte_avail/stats.byte_total)*100);
   return 0;
 }
+
+static bool lsb_release_set(char *buf, const char *prefix,
+			    map<string, string> *pm, const char *key)
+{
+  if (strncmp(buf, prefix, strlen(prefix))) {
+    return false;
+  }
+
+  if (buf[strlen(buf)-1] == '\n')
+    buf[strlen(buf)-1] = '\0';
+
+  char *value = buf + strlen(prefix) + 1;
+  (*pm)[key] = value;
+  return true;
+}
+
+static void lsb_release_parse(map<string, string> *m, CephContext *cct)
+{
+  FILE *fp = popen("lsb_release -idrc", "r");
+  if (!fp) {
+    int ret = -errno;
+    lderr(cct) << "lsb_release_parse - failed to call lsb_release binary with error: " << cpp_strerror(ret) << dendl;
+    return;
+  }
+
+  char buf[512];
+  while (fgets(buf, sizeof(buf) - 1, fp) != NULL) {
+    if (lsb_release_set(buf, "Distributor ID:", m, "distro"))
+      continue;
+    if (lsb_release_set(buf, "Description:", m, "distro_description"))
+      continue;
+    if (lsb_release_set(buf, "Release:", m, "distro_version"))
+      continue;
+    if (lsb_release_set(buf, "Codename:", m, "distro_codename"))
+      continue;
+
+    lderr(cct) << "unhandled output: " << buf << dendl;
+  }
+
+  if (pclose(fp)) {
+    int ret = -errno;
+    lderr(cct) << "lsb_release_parse - pclose failed: " << cpp_strerror(ret) << dendl;
+  }
+}
+
+void collect_sys_info(map<string, string> *m, CephContext *cct)
+{
+  // kernel info
+  struct utsname u;
+  int r = uname(&u);
+  if (r >= 0) {
+    (*m)["os"] = u.sysname;
+    (*m)["kernel_version"] = u.release;
+    (*m)["kernel_description"] = u.version;
+    (*m)["hostname"] = u.nodename;
+    (*m)["arch"] = u.machine;
+  }
+
+  // memory
+  FILE *f = fopen("/proc/meminfo", "r");
+  if (f) {
+    char buf[100];
+    while (!feof(f)) {
+      char *line = fgets(buf, sizeof(buf), f);
+      if (!line)
+	break;
+      char key[40];
+      long long value;
+      int r = sscanf(line, "%s %lld", key, &value);
+      if (r == 2) {
+	if (strcmp(key, "MemTotal:") == 0)
+	  (*m)["mem_total_kb"] = boost::lexical_cast<string>(value);
+	else if (strcmp(key, "SwapTotal:") == 0)
+	  (*m)["mem_swap_kb"] = boost::lexical_cast<string>(value);
+      }
+    }
+    fclose(f);
+  }
+
+  // processor
+  f = fopen("/proc/cpuinfo", "r");
+  if (f) {
+    char buf[100];
+    while (!feof(f)) {
+      char *line = fgets(buf, sizeof(buf), f);
+      if (!line)
+	break;
+      if (strncmp(line, "model name", 10) == 0) {
+	char *c = strchr(buf, ':');
+	c++;
+	while (*c == ' ')
+	  ++c;
+	char *nl = c;
+	while (*nl != '\n')
+	  ++nl;
+	*nl = '\0';
+	(*m)["cpu"] = c;
+	break;
+      }
+    }
+    fclose(f);
+  }
+
+  // distro info
+  lsb_release_parse(m, cct);
+}
+
+void dump_services(Formatter* f, const map<string, list<int> >& services, const char* type)
+{
+  assert(f);
+
+  f->open_object_section(type);
+  for (map<string, list<int> >::const_iterator host = services.begin();
+       host != services.end(); ++host) {
+    f->open_array_section(host->first.c_str());
+    const list<int>& hosted = host->second;
+    for (list<int>::const_iterator s = hosted.begin();
+	 s != hosted.end(); ++s) {
+      f->dump_int(type, *s);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
diff --git a/src/common/xattr.c b/src/common/xattr.c
index 239ee02..caa31d5 100644
--- a/src/common/xattr.c
+++ b/src/common/xattr.c
@@ -9,6 +9,7 @@
  * Foundation.  See file COPYING.
  */
 
+#include "acconfig.h"
 #if defined(__FreeBSD__)
 #include <errno.h>
 #include <stdint.h>
@@ -42,8 +43,10 @@ ceph_os_setxattr(const char *path, const char *name,
 	    size);
 	if (error > 0)
 		error = 0;
-#elif defined(__linux__) || defined(DARWIN)
+#elif defined(__linux__) 
 	error = setxattr(path, name, value, size, 0);
+#elif defined(DARWIN)
+	error = setxattr(path, name, value, size, 0 /* position */, 0);
 #endif
 
 	return (error);
@@ -56,12 +59,13 @@ ceph_os_fsetxattr(int fd, const char *name, const void *value,
 	int error = -1;
 
 #if defined(__FreeBSD__)
-	error = extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, name, value,
-	    size);
+	error = extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, name, value, size);
 	if (error > 0)
 		error = 0;
-#elif defined(__linux__) || defined(DARWIN)
+#elif defined(__linux__)
 	error = fsetxattr(fd, name, value, size, 0);
+#elif defined(DARWIN)
+	error = fsetxattr(fd, name, value, size, 0, 0 /* no options, should be identical to Linux */ );
 #endif
 
 	return (error);
@@ -93,7 +97,10 @@ void *value, size_t size)
 #elif defined(__linux__)
 	error = getxattr(path, name, value, size);
 #elif defined(DARWIN)
-	error = getxattr(path, name, value, size, 0);
+	error = getxattr(path, name, value, size, 0 /* position  */, 0);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
 #endif
 
 	return (error);
@@ -125,7 +132,10 @@ ceph_os_fgetxattr(int fd, const char *name, void *value,
 #elif defined(__linux__)
 	error = fgetxattr(fd, name, value, size);
 #elif defined(DARWIN)
-	error = fgetxattr(fd, name, value, size, 0);
+	error = fgetxattr(fd, name, value, size, 0, 0 /* no options */);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
 #endif
 
 	return (error);
@@ -240,6 +250,9 @@ ceph_os_removexattr(const char *path, const char *name)
 	error = removexattr(path, name);
 #elif defined(DARWIN)
 	error = removexattr(path, name, 0);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
 #endif
 
 	return (error);
@@ -256,6 +269,9 @@ ceph_os_fremovexattr(int fd, const char *name)
 	error = fremovexattr(fd, name);
 #elif defined(DARWIN)
 	error = fremovexattr(fd, name, 0);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
 #endif
 
 	return (error);
diff --git a/src/common/xattr.h b/src/common/xattr.h
index 30b0485..147a23c 100644
--- a/src/common/xattr.h
+++ b/src/common/xattr.h
@@ -13,11 +13,19 @@
 #define CEPH_EXTATTR_H
 
 #include <sys/types.h>
+#include <errno.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+// Almost everyone defines ENOATTR, except for Linux,
+// which does #define ENOATTR ENODATA.  It seems that occasionally that
+// isn't defined, though, so let's make sure.
+#ifndef ENOATTR
+# define ENOATTR ENODATA
+#endif
+
 int ceph_os_setxattr(const char *path, const char *name,
                   const void *value, size_t size);
 int ceph_os_fsetxattr(int fd, const char *name, const void *value,
diff --git a/src/compressor/AsyncCompressor.cc b/src/compressor/AsyncCompressor.cc
new file mode 100644
index 0000000..564d614
--- /dev/null
+++ b/src/compressor/AsyncCompressor.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "AsyncCompressor.h"
+
+#define dout_subsys ceph_subsys_compressor
+#undef dout_prefix
+#define dout_prefix *_dout << "compressor "
+
+AsyncCompressor::AsyncCompressor(CephContext *c):
+  compressor(Compressor::create(c->_conf->async_compressor_type)), cct(c),
+  job_id(0),
+  compress_tp(g_ceph_context, "AsyncCompressor::compressor_tp", cct->_conf->async_compressor_threads, "async_compressor_threads"),
+  job_lock("AsyncCompressor::job_lock"),
+  compress_wq(this, c->_conf->async_compressor_thread_timeout, c->_conf->async_compressor_thread_suicide_timeout, &compress_tp) {
+}
+
+void AsyncCompressor::init()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  compress_tp.start();
+}
+
+void AsyncCompressor::terminate()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  compress_tp.stop();
+}
+
+uint64_t AsyncCompressor::async_compress(bufferlist &data)
+{
+  uint64_t id = job_id.inc();
+  pair<unordered_map<uint64_t, Job>::iterator, bool> it;
+  {
+    Mutex::Locker l(job_lock);
+    it = jobs.insert(make_pair(id, Job(id, true)));
+    it.first->second.data = data;
+  }
+  compress_wq.queue(&it.first->second);
+  ldout(cct, 10) << __func__ << " insert async compress job id=" << id << dendl;
+  return id;
+}
+
+uint64_t AsyncCompressor::async_decompress(bufferlist &data)
+{
+  uint64_t id = job_id.inc();
+  pair<unordered_map<uint64_t, Job>::iterator, bool> it;
+  {
+    Mutex::Locker l(job_lock);
+    it = jobs.insert(make_pair(id, Job(id, false)));
+    it.first->second.data = data;
+  }
+  compress_wq.queue(&it.first->second);
+  ldout(cct, 10) << __func__ << " insert async decompress job id=" << id << dendl;
+  return id;
+}
+
+int AsyncCompressor::get_compress_data(uint64_t compress_id, bufferlist &data, bool blocking, bool *finished)
+{
+  assert(finished);
+  Mutex::Locker l(job_lock);
+  unordered_map<uint64_t, Job>::iterator it = jobs.find(compress_id);
+  if (it == jobs.end() || !it->second.is_compress) {
+    ldout(cct, 10) << __func__ << " missing to get compress job id=" << compress_id << dendl;
+    return -ENOENT;
+  }
+  int status;
+
+ retry:
+  status = it->second.status.read();
+  if (status == DONE) {
+    ldout(cct, 20) << __func__ << " successfully getting compressed data, job id=" << compress_id << dendl;
+    *finished = true;
+    data.swap(it->second.data);
+    jobs.erase(it);
+  } else if (status == ERROR) {
+    ldout(cct, 20) << __func__ << " compressed data failed, job id=" << compress_id << dendl;
+    jobs.erase(it);
+    return -EIO;
+  } else if (blocking) {
+    if (it->second.status.compare_and_swap(WAIT, DONE)) {
+      ldout(cct, 10) << __func__ << " compress job id=" << compress_id << " hasn't finished, abort!"<< dendl;
+      if (compressor->compress(it->second.data, data)) {
+        ldout(cct, 1) << __func__ << " compress job id=" << compress_id << " failed!"<< dendl;
+        it->second.status.set(ERROR);
+        return -EIO;
+      }
+      *finished = true;
+    } else {
+      job_lock.Unlock();
+      usleep(1000);
+      job_lock.Lock();
+      goto retry;
+    }
+  } else {
+    ldout(cct, 10) << __func__ << " compress job id=" << compress_id << " hasn't finished."<< dendl;
+    *finished = false;
+  }
+  return 0;
+}
+
+int AsyncCompressor::get_decompress_data(uint64_t decompress_id, bufferlist &data, bool blocking, bool *finished)
+{
+  assert(finished);
+  Mutex::Locker l(job_lock);
+  unordered_map<uint64_t, Job>::iterator it = jobs.find(decompress_id);
+  if (it == jobs.end() || it->second.is_compress) {
+    ldout(cct, 10) << __func__ << " missing to get decompress job id=" << decompress_id << dendl;
+    return -ENOENT;
+  }
+  int status;
+
+ retry:
+  status = it->second.status.read();
+  if (status == DONE) {
+    ldout(cct, 20) << __func__ << " successfully getting decompressed data, job id=" << decompress_id << dendl;
+    *finished = true;
+    data.swap(it->second.data);
+    jobs.erase(it);
+  } else if (status == ERROR) {
+    ldout(cct, 20) << __func__ << " compressed data failed, job id=" << decompress_id << dendl;
+    jobs.erase(it);
+    return -EIO;
+  } else if (blocking) {
+    if (it->second.status.compare_and_swap(WAIT, DONE)) {
+      ldout(cct, 10) << __func__ << " decompress job id=" << decompress_id << " hasn't started, abort!"<< dendl;
+      if (compressor->decompress(it->second.data, data)) {
+        ldout(cct, 1) << __func__ << " decompress job id=" << decompress_id << " failed!"<< dendl;
+        it->second.status.set(ERROR);
+        return -EIO;
+      }
+      *finished = true;
+    } else {
+      job_lock.Unlock();
+      usleep(1000);
+      job_lock.Lock();
+      goto retry;
+    }
+  } else {
+    ldout(cct, 10) << __func__ << " decompress job id=" << decompress_id << " hasn't finished."<< dendl;
+    *finished = false;
+  }
+  return 0;
+}
diff --git a/src/compressor/AsyncCompressor.h b/src/compressor/AsyncCompressor.h
new file mode 100644
index 0000000..15af92b
--- /dev/null
+++ b/src/compressor/AsyncCompressor.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNCCOMPRESSOR_H
+#define CEPH_ASYNCCOMPRESSOR_H
+
+#include <deque>
+
+#include "include/atomic.h"
+#include "include/str_list.h"
+#include "Compressor.h"
+#include "common/WorkQueue.h"
+
+
+class AsyncCompressor {
+ private:
+  Compressor *compressor;
+  CephContext *cct;
+  atomic_t job_id;
+  vector<int> coreids;
+  ThreadPool compress_tp;
+
+  enum {
+    WAIT,
+    WORKING,
+    DONE,
+    ERROR
+  } status;
+  struct Job {
+    uint64_t id;
+    atomic_t status;
+    bool is_compress;
+    bufferlist data;
+    Job(uint64_t i, bool compress): id(i), status(WAIT), is_compress(compress) {}
+    Job(const Job &j): id(j.id), status(j.status.read()), is_compress(j.is_compress), data(j.data) {}
+  };
+  Mutex job_lock;
+  // only when job.status == DONE && with job_lock holding, we can insert/erase element in jobs
+  // only when job.status == WAIT && with pool_lock holding, you can change its status and modify element's info later
+  unordered_map<uint64_t, Job> jobs;
+
+  struct CompressWQ : public ThreadPool::WorkQueue<Job> {
+    typedef AsyncCompressor::Job Job;
+    AsyncCompressor *async_compressor;
+    deque<Job*> job_queue;
+
+    CompressWQ(AsyncCompressor *ac, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
+      : ThreadPool::WorkQueue<Job>("AsyncCompressor::CompressWQ", timeout, suicide_timeout, tp), async_compressor(ac) {}
+
+    bool _enqueue(Job *item) {
+      job_queue.push_back(item);
+      return true;
+    }
+    void _dequeue(Job *item) {
+      assert(0);
+    }
+    bool _empty() {
+      return job_queue.empty();
+    }
+    Job* _dequeue() {
+      if (job_queue.empty())
+        return NULL;
+      Job *item = NULL;
+      while (!job_queue.empty()) {
+        item = job_queue.front();
+        job_queue.pop_front();
+        if (item->status.compare_and_swap(WAIT, WORKING)) {
+          break;
+        } else {
+          Mutex::Locker (async_compressor->job_lock);
+          async_compressor->jobs.erase(item->id);
+          item = NULL;
+        }
+      }
+      return item;
+    }
+    void _process(Job *item, ThreadPool::TPHandle &handle) {
+      assert(item->status.read() == WORKING);
+      bufferlist out;
+      int r;
+      if (item->is_compress)
+        r = async_compressor->compressor->compress(item->data, out);
+      else
+        r = async_compressor->compressor->decompress(item->data, out);
+      if (!r) {
+        item->data.swap(out);
+        assert(item->status.compare_and_swap(WORKING, DONE));
+      } else {
+        item->status.set(ERROR);
+      }
+    }
+    void _process_finish(Job *item) {}
+    void _clear() {}
+  } compress_wq;
+  friend class CompressWQ;
+  void _compress(bufferlist &in, bufferlist &out);
+  void _decompress(bufferlist &in, bufferlist &out);
+
+ public:
+  AsyncCompressor(CephContext *c);
+  virtual ~AsyncCompressor() {}
+
+  int get_cpuid(int id) {
+    if (coreids.empty())
+      return -1;
+    return coreids[id % coreids.size()];
+  }
+
+  void init();
+  void terminate();
+  uint64_t async_compress(bufferlist &data);
+  uint64_t async_decompress(bufferlist &data);
+  int get_compress_data(uint64_t compress_id, bufferlist &data, bool blocking, bool *finished);
+  int get_decompress_data(uint64_t decompress_id, bufferlist &data, bool blocking, bool *finished);
+};
+
+#endif
diff --git a/src/compressor/Compressor.cc b/src/compressor/Compressor.cc
new file mode 100644
index 0000000..0d11e74
--- /dev/null
+++ b/src/compressor/Compressor.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "Compressor.h"
+#include "SnappyCompressor.h"
+
+
+Compressor* Compressor::create(const string &type)
+{
+  if (type == "snappy")
+    return new SnappyCompressor();
+
+  assert(0);
+}
diff --git a/src/compressor/Compressor.h b/src/compressor/Compressor.h
new file mode 100644
index 0000000..3eb71aa
--- /dev/null
+++ b/src/compressor/Compressor.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMPRESSOR_H
+#define CEPH_COMPRESSOR_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+
+class Compressor {
+ public:
+  virtual ~Compressor() {}
+  virtual int compress(bufferlist &in, bufferlist &out) = 0;
+  virtual int decompress(bufferlist &in, bufferlist &out) = 0;
+
+  static Compressor *create(const string &type);
+};
+
+#endif
diff --git a/src/compressor/Makefile.am b/src/compressor/Makefile.am
new file mode 100644
index 0000000..bd2a2d7
--- /dev/null
+++ b/src/compressor/Makefile.am
@@ -0,0 +1,11 @@
+libcompressor_la_SOURCES = \
+	compressor/Compressor.cc \
+	compressor/AsyncCompressor.cc
+noinst_LTLIBRARIES += libcompressor.la
+
+libcompressor_la_LIBADD = $(LIBCOMMON)
+
+noinst_HEADERS += \
+	compressor/Compressor.h \
+	compressor/AsyncCompressor.h \
+	compressor/SnappyCompressor.h
diff --git a/src/compressor/SnappyCompressor.h b/src/compressor/SnappyCompressor.h
new file mode 100644
index 0000000..ba58b46
--- /dev/null
+++ b/src/compressor/SnappyCompressor.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SNAPPYCOMPRESSOR_H
+#define CEPH_SNAPPYCOMPRESSOR_H
+
+#include <snappy.h>
+#include <snappy-sinksource.h>
+#include "include/buffer.h"
+#include "Compressor.h"
+
+class BufferlistSource : public snappy::Source {
+  list<bufferptr>::const_iterator pb;
+  size_t pb_off;
+  size_t left;
+
+ public:
+  BufferlistSource(bufferlist &data): pb(data.buffers().begin()), pb_off(0), left(data.length()) {}
+  virtual ~BufferlistSource() {}
+  virtual size_t Available() const { return left; }
+  virtual const char* Peek(size_t* len) {
+    if (left) {
+      *len = pb->length() - pb_off;
+      return pb->c_str() + pb_off;
+    } else {
+      *len = 0;
+      return NULL;
+    }
+  }
+  virtual void Skip(size_t n) {
+    if (n + pb_off == pb->length()) {
+      ++pb;
+      pb_off = 0;
+    } else {
+      pb_off += n;
+    }
+    left -= n;
+  }
+};
+
+class SnappyCompressor : public Compressor {
+ public:
+  virtual ~SnappyCompressor() {}
+  virtual int compress(bufferlist &src, bufferlist &dst) {
+    BufferlistSource source(src);
+    bufferptr ptr(snappy::MaxCompressedLength(src.length()));
+    snappy::UncheckedByteArraySink sink(ptr.c_str());
+    snappy::Compress(&source, &sink);
+    dst.append(ptr, 0, sink.CurrentDestination()-ptr.c_str());
+    return 0;
+  }
+  virtual int decompress(bufferlist &src, bufferlist &dst) {
+    BufferlistSource source(src);
+    size_t res_len = 0;
+    // Trick, decompress only need first 32bits buffer
+    if (!snappy::GetUncompressedLength(src.get_contiguous(0, 8), 8, &res_len))
+      return -1;
+    bufferptr ptr(res_len);
+    if (snappy::RawUncompress(&source, ptr.c_str())) {
+      dst.append(ptr);
+      return 0;
+    }
+    return -1;
+  }
+};
+
+#endif
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
index d2be1f0..0863833 100644
--- a/src/crush/CrushTester.cc
+++ b/src/crush/CrushTester.cc
@@ -1,3 +1,5 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
 
 #include "include/stringify.h"
 #include "CrushTester.h"
@@ -5,12 +7,8 @@
 
 #include <algorithm>
 #include <stdlib.h>
-/* fork */
-#include <unistd.h>
-/* waitpid */
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <common/errno.h>
+#include <boost/lexical_cast.hpp>
+#include <common/SubProcess.h>
 
 void CrushTester::set_device_weight(int dev, float f)
 {
@@ -357,94 +355,47 @@ void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst,
   dst.push_back( data_buffer.str() );
 }
 
-int CrushTester::test_with_crushtool(const string& crushtool,
-                                     int max_id,
-                                     int timeout,
+int CrushTester::test_with_crushtool(const char *crushtool_cmd,
+				     int max_id, int timeout,
 				     int ruleset)
 {
-  string timeout_string = stringify(timeout);
-  string opt_max_id = stringify(max_id);
-  vector<const char *> cmd_args;
-  cmd_args.push_back("timeout");
-  cmd_args.push_back(timeout_string.c_str());
-  cmd_args.push_back(crushtool.c_str());
-  cmd_args.push_back("-i");
-  cmd_args.push_back("-");
-  cmd_args.push_back("--test");
-  cmd_args.push_back("--check");
-  cmd_args.push_back(opt_max_id.c_str());
-  cmd_args.push_back("--min-x");
-  cmd_args.push_back("1");
-  cmd_args.push_back("--max-x");
-  cmd_args.push_back("50");
+  SubProcessTimed crushtool(crushtool_cmd, true, false, true, timeout);
+  string opt_max_id = boost::lexical_cast<string>(max_id);
+  crushtool.add_cmd_args(
+    "-i", "-",
+    "--test", "--check", opt_max_id.c_str(),
+    "--min-x", "1",
+    "--max-x", "50",
+    NULL);
   if (ruleset >= 0) {
-    cmd_args.push_back("--ruleset");
-    cmd_args.push_back(stringify(ruleset).c_str());
+    crushtool.add_cmd_args(
+      "--ruleset",
+      stringify(ruleset).c_str(),
+      NULL);
   }
-  cmd_args.push_back(NULL);
-
-  int pipefds[2];
-  if (::pipe(pipefds) == -1) {
-    int r = errno;
-    err << "error creating pipe: " << cpp_strerror(r) << "\n";
-    return -r;
-  }
-
-  int fpid = fork();
-  if (fpid < 0) {
-    int r = errno;
-    err << "unable to fork(): " << cpp_strerror(r);
-    ::close(pipefds[0]);
-    ::close(pipefds[1]);
-    return -r;
-  } else if (fpid == 0) {
-    ::close(pipefds[1]);
-    ::dup2(pipefds[0], STDIN_FILENO);
-    ::close(pipefds[0]);
-    ::close(1);
-    ::close(2);
-    int r = execvp(cmd_args[0], (char * const *)&cmd_args[0]);
-    if (r < 0)
-      exit(errno);
-    // we should never reach this
-    exit(EINVAL);
+  int ret = crushtool.spawn();
+  if (ret != 0) {
+    err << "failed run crushtool: " << crushtool.err();
+    return ret;
   }
-  ::close(pipefds[0]);
 
   bufferlist bl;
   ::encode(crush, bl);
-  bl.write_fd(pipefds[1]);
-  ::close(pipefds[1]);
-
-  int status;
-  int r = waitpid(fpid, &status, 0);
-  assert(r == fpid);
-
-  if (!WIFEXITED(status)) {
-    assert(WIFSIGNALED(status));
-    err << "error testing crush map\n";
-    return -EINVAL;
+  bl.write_fd(crushtool.stdin());
+  crushtool.close_stdin();
+  bl.clear();
+  ret = bl.read_fd(crushtool.stderr(), 100 * 1024);
+  if (ret < 0) {
+    err << "failed read from crushtool: " << cpp_strerror(-ret);
+    return ret;
   }
-
-  r = WEXITSTATUS(status);
-  if (r == 0) {
-    // major success!
-    return 0;
-  }
-  if (r == 124) {
-    // the test takes longer than timeout and was interrupted
-    return -EINTR;
-  }
-
-  if (r == ENOENT) {
-    err << "unable to find " << cmd_args << " to test the map";
-    return -ENOENT;
+  bl.write_stream(err);
+  if (crushtool.join() != 0) {
+    err << crushtool.err();
+    return -EINVAL;
   }
 
-  // something else entirely happened
-  // log it and consider an invalid crush map
-  err << "error running crushmap through crushtool: " << cpp_strerror(r);
-  return -EINVAL;
+  return 0;
 }
 
 namespace {
@@ -458,7 +409,7 @@ namespace {
   class CrushWalker : public CrushTreeDumper::Dumper<void> {
     typedef void DumbFormatter;
     typedef CrushTreeDumper::Dumper<DumbFormatter> Parent;
-    unsigned max_id;
+    int max_id;
   public:
     CrushWalker(const CrushWrapper *crush, unsigned max_id)
       : Parent(crush), max_id(max_id) {}
@@ -470,7 +421,7 @@ namespace {
 	}
 	type = crush->get_bucket_type(qi.id);
       } else {
-	if (max_id > 0 && qi.id >= (int)max_id) {
+	if (max_id > 0 && qi.id >= max_id) {
 	  throw BadCrushMap("item id too large", qi.id);
 	}
 	type = 0;
diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h
index a9221c7..2f1a2c6 100644
--- a/src/crush/CrushTester.h
+++ b/src/crush/CrushTester.h
@@ -348,7 +348,7 @@ public:
    */
   bool check_name_maps(unsigned max_id = 0) const;
   int test();
-  int test_with_crushtool(const string& crushtool,
+  int test_with_crushtool(const char *crushtool_cmd = "crushtool",
 			  int max_id = -1,
 			  int timeout = 0,
 			  int ruleset = -1);
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 0dac389..ba365dd 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -933,17 +933,33 @@ void CrushWrapper::reweight(CephContext *cct)
   }
 }
 
-int CrushWrapper::add_simple_ruleset(string name, string root_name,
-                                     string failure_domain_name,
-                                     string mode,
-                                     int rule_type,
-                                     ostream *err)
+int CrushWrapper::add_simple_ruleset_at(string name, string root_name,
+                                        string failure_domain_name,
+                                        string mode, int rule_type,
+                                        int rno, ostream *err)
 {
   if (rule_exists(name)) {
     if (err)
       *err << "rule " << name << " exists";
     return -EEXIST;
   }
+  if (rno >= 0) {
+    if (rule_exists(rno)) {
+      if (err)
+        *err << "rule with ruleno " << rno << " exists";
+      return -EEXIST;
+    }
+    if (ruleset_exists(rno)) {
+      if (err)
+        *err << "ruleset " << rno << " exists";
+      return -EEXIST;
+    }
+  } else {
+    for (rno = 0; rno < get_max_rules(); rno++) {
+      if (!rule_exists(rno) && !ruleset_exists(rno))
+        break;
+    }
+  }
   if (!name_exists(root_name)) {
     if (err)
       *err << "root item " << root_name << " does not exist";
@@ -965,11 +981,6 @@ int CrushWrapper::add_simple_ruleset(string name, string root_name,
     return -EINVAL;
   }
 
-  int rno = -1;
-  for (rno = 0; rno < get_max_rules(); rno++) {
-    if (!rule_exists(rno) && !ruleset_exists(rno))
-       break;
-  }
   int steps = 3;
   if (mode == "indep")
     steps = 5;
@@ -1008,6 +1019,15 @@ int CrushWrapper::add_simple_ruleset(string name, string root_name,
   return rno;
 }
 
+int CrushWrapper::add_simple_ruleset(string name, string root_name,
+                                     string failure_domain_name,
+                                     string mode, int rule_type,
+                                     ostream *err)
+{
+  return add_simple_ruleset_at(name, root_name, failure_domain_name, mode,
+                               rule_type, -1, err);
+}
+
 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
 {
   if (ruleno >= crush->max_rules)
@@ -1676,16 +1696,13 @@ void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
   // fixme
 }
 
-/**
- * Determine the default CRUSH ruleset ID to be used with
- * newly created replicated pools.
- *
- * @returns a ruleset ID (>=0) or an error (<0)
- */
-int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
+int CrushWrapper::_get_osd_pool_default_crush_replicated_ruleset(CephContext *cct,
+                                                                 bool quiet)
 {
-  int crush_ruleset = cct->_conf->osd_pool_default_crush_replicated_ruleset;
-  if (cct->_conf->osd_pool_default_crush_rule != -1) {
+  int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
+  if (crush_ruleset == -1) {
+    crush_ruleset = cct->_conf->osd_pool_default_crush_replicated_ruleset;
+  } else if (!quiet) {
     ldout(cct, 0) << "osd_pool_default_crush_rule is deprecated "
                   << "use osd_pool_default_crush_replicated_ruleset instead"
                   << dendl;
@@ -1694,11 +1711,25 @@ int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct
                   << "osd_pool_default_crush_replicated_ruleset = "
                   << cct->_conf->osd_pool_default_crush_replicated_ruleset
                   << dendl;
-    crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
   }
 
+  return crush_ruleset;
+}
+
+/**
+ * Determine the default CRUSH ruleset ID to be used with
+ * newly created replicated pools.
+ *
+ * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
+ */
+int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
+{
+  int crush_ruleset = _get_osd_pool_default_crush_replicated_ruleset(cct,
+                                                                     false);
   if (crush_ruleset == CEPH_DEFAULT_CRUSH_REPLICATED_RULESET) {
     crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
+  } else if (!ruleset_exists(crush_ruleset)) {
+    crush_ruleset = -1; // match find_first_ruleset() retval
   }
 
   return crush_ruleset;
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index c76fac3..e3b4c48 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -9,7 +9,7 @@
 #include <set>
 #include <string>
 
-#include <iostream> //for testing, remove
+#include <iosfwd>
 
 #include "include/types.h"
 
@@ -622,7 +622,6 @@ public:
    *
    * Will return the weight for the first instance it finds.
    *
-   * @param cct cct
    * @param id item id to check
    * @return weight of item
    */
@@ -798,6 +797,12 @@ public:
 
   int add_simple_ruleset(string name, string root_name, string failure_domain_type,
 			 string mode, int rule_type, ostream *err = 0);
+  /**
+   * @param rno ruleset id to use, -1 to pick the lowest available
+   */
+  int add_simple_ruleset_at(string name, string root_name,
+                            string failure_domain_type, string mode,
+                            int rule_type, int rno, ostream *err = 0);
 
   int remove_rule(int ruleno);
 
@@ -1058,6 +1063,8 @@ public:
   void dump_tree(Formatter *f) const;
   static void generate_test_instances(list<CrushWrapper*>& o);
 
+  int _get_osd_pool_default_crush_replicated_ruleset(CephContext *cct,
+                                                     bool quiet);
   int get_osd_pool_default_crush_replicated_ruleset(CephContext *cct);
 
   static bool is_valid_crush_name(const string& s);
diff --git a/src/crush/Makefile.am b/src/crush/Makefile.am
index 616a00a..5da6f17 100644
--- a/src/crush/Makefile.am
+++ b/src/crush/Makefile.am
@@ -16,9 +16,10 @@ noinst_HEADERS += \
 	crush/CrushWrapper.i \
 	crush/builder.h \
 	crush/crush.h \
+	crush/crush_compat.h \
+	crush/crush_ln_table.h \
 	crush/grammar.h \
 	crush/hash.h \
-	crush/crush_ln_table.h \
 	crush/mapper.h \
 	crush/sample.txt \
 	crush/types.h
diff --git a/src/crush/builder.c b/src/crush/builder.c
index 28d957d..1212e4b 100644
--- a/src/crush/builder.c
+++ b/src/crush/builder.c
@@ -1465,3 +1465,23 @@ int crush_reweight_bucket(struct crush_map *crush, struct crush_bucket *b)
 
 /***************************/
 
+/* methods to check for safe arithmetic operations */
+
+int crush_addition_is_unsafe(__u32 a, __u32 b)
+{
+	if ((((__u32)(-1)) - b) < a)
+		return 1;
+	else
+		return 0;
+}
+
+int crush_multiplication_is_unsafe(__u32  a, __u32 b)
+{
+	/* prevent division by zero */
+	if (!b)
+		return 1;
+	if ((((__u32)(-1)) / b) < a)
+		return 1;
+	else
+		return 0;
+}
diff --git a/src/crush/builder.h b/src/crush/builder.h
index efd7c8a..b436e3e 100644
--- a/src/crush/builder.h
+++ b/src/crush/builder.h
@@ -41,4 +41,7 @@ crush_make_straw_bucket(struct crush_map *map,
 			int *items,
 			int *weights);
 
+extern int crush_addition_is_unsafe(__u32 a, __u32 b);
+extern int crush_multiplication_is_unsafe(__u32  a, __u32 b);
+
 #endif
diff --git a/src/crush/crush.c b/src/crush/crush.c
index c0bcdb7..80d7c3a 100644
--- a/src/crush/crush.c
+++ b/src/crush/crush.c
@@ -1,16 +1,11 @@
-
 #ifdef __KERNEL__
 # include <linux/slab.h>
+# include <linux/crush/crush.h>
 #else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
-# include "include/int_types.h"
+# include "crush_compat.h"
+# include "crush.h"
 #endif
 
-#include "crush.h"
-
 const char *crush_bucket_alg_name(int alg)
 {
 	switch (alg) {
@@ -135,7 +130,9 @@ void crush_destroy(struct crush_map *map)
 		kfree(map->rules);
 	}
 
+#ifndef __KERNEL__
 	kfree(map->choose_tries);
+#endif
 	kfree(map);
 }
 
@@ -143,23 +140,3 @@ void crush_destroy_rule(struct crush_rule *rule)
 {
 	kfree(rule);
 }
-
-// methods to check for safe arithmetic operations
-int crush_addition_is_unsafe(__u32 a, __u32 b)
-{
-  if ((((__u32)(-1)) - b) < a)
-    return 1;
-  else
-    return 0;
-}
-
-int crush_multiplication_is_unsafe(__u32  a, __u32 b)
-{
-  // prevent division by zero 
-  if (!b)
-    return 1;
-  if ((((__u32)(-1)) / b) < a)
-    return 1;
-  else
-    return 0;
-}
diff --git a/src/crush/crush.h b/src/crush/crush.h
index 5082c03..48b4930 100644
--- a/src/crush/crush.h
+++ b/src/crush/crush.h
@@ -1,7 +1,11 @@
 #ifndef CEPH_CRUSH_CRUSH_H
 #define CEPH_CRUSH_CRUSH_H
 
-#include "include/int_types.h"
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
 /*
  * CRUSH is a pseudo-random data distribution algorithm that
@@ -20,8 +24,8 @@
 #define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
 
 #define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
-#define CRUSH_MAX_RULESET (1<<8) /*max crush ruleset number*/
-#define CRUSH_MAX_RULES	CRUSH_MAX_RULESET /*max crush rules, shold be the same as max rulesets*/
+#define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
+#define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */
 
 #define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
 #define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
@@ -187,7 +191,7 @@ struct crush_map {
 	/* choose local attempts using a fallback permutation before
 	 * re-descent */
 	__u32 choose_local_fallback_tries;
-	/* choose attempts before giving up */ 
+	/* choose attempts before giving up */
 	__u32 choose_total_tries;
 	/* attempt chooseleaf inner descent once for firstn mode; on
 	 * reject retry outer descent.  Note that this does *not*
@@ -201,6 +205,7 @@ struct crush_map {
 	 * mappings line up a bit better with previous mappings. */
 	__u8 chooseleaf_vary_r;
 
+#ifndef __KERNEL__
 	/*
 	 * version 0 (original) of straw_calc has various flaws.  version 1
 	 * fixes a few of them.
@@ -217,13 +222,12 @@ struct crush_map {
 	__u32 allowed_bucket_algs;
 
 	__u32 *choose_tries;
+#endif
 };
 
 
 /* crush.c */
 extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
-extern int crush_addition_is_unsafe(__u32 a, __u32 b);
-extern int crush_multiplication_is_unsafe(__u32  a, __u32 b);
 extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
 extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
 extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
diff --git a/src/crush/crush_compat.h b/src/crush/crush_compat.h
new file mode 100644
index 0000000..08eb4ea
--- /dev/null
+++ b/src/crush/crush_compat.h
@@ -0,0 +1,39 @@
+#ifndef CEPH_CRUSH_COMPAT_H
+#define CEPH_CRUSH_COMPAT_H
+
+#include "include/int_types.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* asm-generic/bug.h */
+
+#define BUG_ON(x) assert(!(x))
+
+/* linux/kernel.h */
+
+#define U8_MAX		((__u8)~0U)
+#define S8_MAX		((__s8)(U8_MAX>>1))
+#define S8_MIN		((__s8)(-S8_MAX - 1))
+#define U16_MAX		((__u16)~0U)
+#define S16_MAX		((__s16)(U16_MAX>>1))
+#define S16_MIN		((__s16)(-S16_MAX - 1))
+#define U32_MAX		((__u32)~0U)
+#define S32_MAX		((__s32)(U32_MAX>>1))
+#define S32_MIN		((__s32)(-S32_MAX - 1))
+#define U64_MAX		((__u64)~0ULL)
+#define S64_MAX		((__s64)(U64_MAX>>1))
+#define S64_MIN		((__s64)(-S64_MAX - 1))
+
+/* linux/math64.h */
+
+#define div64_s64(dividend, divisor) ((dividend) / (divisor))
+
+/* linux/slab.h */
+
+#define kmalloc(size, flags) malloc(size)
+#define kfree(x) do { if (x) free(x); } while (0)
+
+#endif /* CEPH_CRUSH_COMPAT_H */
diff --git a/src/crush/crush_ln_table.h b/src/crush/crush_ln_table.h
index 80aeeb0..aae534c 100644
--- a/src/crush/crush_ln_table.h
+++ b/src/crush/crush_ln_table.h
@@ -1,5 +1,3 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
 /*
  * Ceph - scalable distributed file system
  *
@@ -12,22 +10,20 @@
  *
  */
 
-#include "include/int_types.h"
-
-#if defined(__linux__)
-#include <linux/types.h>
-#elif defined(__FreeBSD__)
-#include <sys/types.h>
-#endif
-
 #ifndef CEPH_CRUSH_LN_H
 #define CEPH_CRUSH_LN_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
-// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
-// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
-
-static int64_t __RH_LH_tbl[128*2+2] = {
+/*
+ * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
   0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
   0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
   0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
@@ -93,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = {
   0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
   0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
   0x0000800000000000ll, 0x0000ffff00000000ll,
-  };
-
+};
 
-    // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
-static int64_t __LL_tbl[256] = {
+/*
+ * LL_tbl[k] = 2^48*log2(1.0+k/2^15)
+ */
+static __s64 __LL_tbl[256] = {
   0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
   0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
   0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
@@ -164,7 +161,4 @@ static int64_t __LL_tbl[256] = {
   0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
 };
 
-
-
-
 #endif
diff --git a/src/crush/hash.c b/src/crush/hash.c
index 9b15321..ed123af 100644
--- a/src/crush/hash.c
+++ b/src/crush/hash.c
@@ -1,13 +1,9 @@
-#include "include/int_types.h"
-
-#if defined(__linux__)
-#include <linux/types.h>
-#elif defined(__FreeBSD__)
-#include <sys/types.h>
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
 #endif
 
-#include "hash.h"
-
 /*
  * Robert Jenkins' function for mixing 32-bit values
  * http://burtleburtle.net/bob/hash/evahash.html
diff --git a/src/crush/hash.h b/src/crush/hash.h
index 91e8842..d1d9025 100644
--- a/src/crush/hash.h
+++ b/src/crush/hash.h
@@ -1,6 +1,12 @@
 #ifndef CEPH_CRUSH_HASH_H
 #define CEPH_CRUSH_HASH_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
 #define CRUSH_HASH_RJENKINS1   0
 
 #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
index 916790d..393bfb2 100644
--- a/src/crush/mapper.c
+++ b/src/crush/mapper.c
@@ -1,5 +1,3 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
 /*
  * Ceph - scalable distributed file system
  *
@@ -17,26 +15,17 @@
 # include <linux/slab.h>
 # include <linux/bug.h>
 # include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
 #else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
-/*# define DEBUG_INDEP*/
-# include "include/int_types.h"
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
 #endif
-
-#include "crush.h"
-#include "hash.h"
 #include "crush_ln_table.h"
 
+#define dprintk(args...) /* printf(args) */
+
 /*
  * Implement the core CRUSH mapping algorithm.
  */
@@ -154,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
 	int i;
 
 	for (i = bucket->h.size-1; i >= 0; i--) {
-		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
 					 r, bucket->h.id);
 		w &= 0xffff;
 		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
@@ -253,43 +242,46 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 	return bucket->h.items[high];
 }
 
-// compute 2^44*log2(input+1)
-uint64_t crush_ln(unsigned xin)
+/* compute 2^44*log2(input+1) */
+static __u64 crush_ln(unsigned int xin)
 {
-    unsigned x=xin, x1;
-    int iexpon, index1, index2;
-    uint64_t RH, LH, LL, xl64, result;
+	unsigned int x = xin, x1;
+	int iexpon, index1, index2;
+	__u64 RH, LH, LL, xl64, result;
 
-    x++;
+	x++;
 
-    // normalize input
-    iexpon = 15;
-    while(!(x&0x18000)) { x<<=1; iexpon--; }
+	/* normalize input */
+	iexpon = 15;
+	while (!(x & 0x18000)) {
+		x <<= 1;
+		iexpon--;
+	}
 
-    index1 = (x>>8)<<1;
-    // RH ~ 2^56/index1
-    RH = __RH_LH_tbl[index1 - 256];
-    // LH ~ 2^48 * log2(index1/256)
-    LH = __RH_LH_tbl[index1 + 1 - 256];
+	index1 = (x >> 8) << 1;
+	/* RH ~ 2^56/index1 */
+	RH = __RH_LH_tbl[index1 - 256];
+	/* LH ~ 2^48 * log2(index1/256) */
+	LH = __RH_LH_tbl[index1 + 1 - 256];
 
-    // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
-    xl64 = (int64_t)x * RH;
-    xl64 >>= 48;
-    x1 = xl64;
+	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+	xl64 = (__s64)x * RH;
+	xl64 >>= 48;
+	x1 = xl64;
 
-    result = iexpon;
-    result <<= (12 + 32);
+	result = iexpon;
+	result <<= (12 + 32);
 
-    index2 = x1 & 0xff;
-    // LL ~ 2^48*log2(1.0+index2/2^15)
-    LL = __LL_tbl[index2];
+	index2 = x1 & 0xff;
+	/* LL ~ 2^48*log2(1.0+index2/2^15) */
+	LL = __LL_tbl[index2];
 
-    LH = LH + LL;
+	LH = LH + LL;
 
-    LH >>= (48-12 - 32);
-    result += LH;
+	LH >>= (48 - 12 - 32);
+	result += LH;
 
-    return result;
+	return result;
 }
 
 
@@ -305,9 +297,9 @@ uint64_t crush_ln(unsigned xin)
 static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
 				int x, int r)
 {
-	unsigned i, high = 0;
-	unsigned u;
-	unsigned w;
+	unsigned int i, high = 0;
+	unsigned int u;
+	unsigned int w;
 	__s64 ln, draw, high_draw = 0;
 
 	for (i = 0; i < bucket->h.size; i++) {
@@ -335,9 +327,9 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
 			 * weight means a larger (less negative) value
 			 * for draw.
 			 */
-			draw = ln / w;
+			draw = div64_s64(ln, w);
 		} else {
-			draw = INT64_MIN;
+			draw = S64_MIN;
 		}
 
 		if (i == 0 || draw > high_draw) {
@@ -582,9 +574,10 @@ reject:
 		out[outpos] = item;
 		outpos++;
 		count--;
-
+#ifndef __KERNEL__
 		if (map->choose_tries && ftotal <= map->choose_total_tries)
 			map->choose_tries[ftotal]++;
+#endif
 	}
 
 	dprintk("CHOOSE returns %d\n", outpos);
@@ -630,16 +623,16 @@ static void crush_choose_indep(const struct crush_map *map,
 	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
 #ifdef DEBUG_INDEP
 		if (out2 && ftotal) {
-			printf("%u %d a: ", ftotal, left);
+			dprintk("%u %d a: ", ftotal, left);
 			for (rep = outpos; rep < endpos; rep++) {
-				printf(" %d", out[rep]);
+				dprintk(" %d", out[rep]);
 			}
-			printf("\n");
-			printf("%u %d b: ", ftotal, left);
+			dprintk("\n");
+			dprintk("%u %d b: ", ftotal, left);
 			for (rep = outpos; rep < endpos; rep++) {
-				printf(" %d", out2[rep]);
+				dprintk(" %d", out2[rep]);
 			}
-			printf("\n");
+			dprintk("\n");
 		}
 #endif
 		for (rep = outpos; rep < endpos; rep++) {
@@ -758,21 +751,22 @@ static void crush_choose_indep(const struct crush_map *map,
 			out2[rep] = CRUSH_ITEM_NONE;
 		}
 	}
-        if (map->choose_tries && ftotal <= map->choose_total_tries)
-          map->choose_tries[ftotal]++;
-
+#ifndef __KERNEL__
+	if (map->choose_tries && ftotal <= map->choose_total_tries)
+		map->choose_tries[ftotal]++;
+#endif
 #ifdef DEBUG_INDEP
 	if (out2) {
-		printf("%u %d a: ", ftotal, left);
+		dprintk("%u %d a: ", ftotal, left);
 		for (rep = outpos; rep < endpos; rep++) {
-			printf(" %d", out[rep]);
+			dprintk(" %d", out[rep]);
 		}
-		printf("\n");
-		printf("%u %d b: ", ftotal, left);
+		dprintk("\n");
+		dprintk("%u %d b: ", ftotal, left);
 		for (rep = outpos; rep < endpos; rep++) {
-			printf(" %d", out2[rep]);
+			dprintk(" %d", out2[rep]);
 		}
-		printf("\n");
+		dprintk("\n");
 	}
 #endif
 }
@@ -933,7 +927,7 @@ int crush_do_rule(const struct crush_map *map,
 						0);
 				} else {
 					out_size = ((numrep < (result_max-osize)) ?
-                                                    numrep : (result_max-osize));
+						    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
 						map->buckets[-1-w[i]],
@@ -979,5 +973,3 @@ int crush_do_rule(const struct crush_map *map,
 	}
 	return result_len;
 }
-
-
diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index 30dc186..d8d5490 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -18,12 +18,23 @@
 #include <errno.h>
 #include <vector>
 #include <algorithm>
+#include <ostream>
 
 #include "common/strtol.h"
 #include "ErasureCode.h"
 
 const unsigned ErasureCode::SIMD_ALIGN = 32;
 
+int ErasureCode::sanity_check_k(int k, ostream *ss)
+{
+  if (k < 2) {
+    *ss << "k=" << k << " must be >= 2" << std::endl;
+    return -EINVAL;
+  } else {
+    return 0;
+  }
+}
+
 int ErasureCode::chunk_index(unsigned int i) const
 {
   return chunk_mapping.size() > i ? chunk_mapping[i] : i;
@@ -163,21 +174,21 @@ int ErasureCode::decode_chunks(const set<int> &want_to_read,
   assert("ErasureCode::decode_chunks not implemented" == 0);
 }
 
-int ErasureCode::parse(const map<std::string,std::string> &parameters,
+int ErasureCode::parse(const ErasureCodeProfile &profile,
 		       ostream *ss)
 {
-  return to_mapping(parameters, ss);
+  return to_mapping(profile, ss);
 }
 
 const vector<int> &ErasureCode::get_chunk_mapping() const {
   return chunk_mapping;
 }
 
-int ErasureCode::to_mapping(const map<std::string,std::string> &parameters,
+int ErasureCode::to_mapping(const ErasureCodeProfile &profile,
 			    ostream *ss)
 {
-  if (parameters.find("mapping") != parameters.end()) {
-    std::string mapping = parameters.find("mapping")->second;
+  if (profile.find("mapping") != profile.end()) {
+    std::string mapping = profile.find("mapping")->second;
     int position = 0;
     vector<int> coding_chunk_mapping;
     for(std::string::iterator it = mapping.begin(); it != mapping.end(); ++it) {
@@ -195,24 +206,22 @@ int ErasureCode::to_mapping(const map<std::string,std::string> &parameters,
 }
 
 int ErasureCode::to_int(const std::string &name,
-			const map<std::string,std::string> &parameters,
+			ErasureCodeProfile &profile,
 			int *value,
-			int default_value,
+			const std::string &default_value,
 			ostream *ss)
 {
-  if (parameters.find(name) == parameters.end() ||
-      parameters.find(name)->second.size() == 0) {
-    *value = default_value;
-    return 0;
-  }
-  std::string p = parameters.find(name)->second;
+  if (profile.find(name) == profile.end() ||
+      profile.find(name)->second.size() == 0)
+    profile[name] = default_value;
+  std::string p = profile.find(name)->second;
   std::string err;
   int r = strict_strtol(p.c_str(), 10, &err);
   if (!err.empty()) {
     *ss << "could not convert " << name << "=" << p
 	<< " to int because " << err
 	<< ", set to default " << default_value << std::endl;
-    *value = default_value;
+    *value = strict_strtol(default_value.c_str(), 10, &err);
     return -EINVAL;
   }
   *value = r;
@@ -220,21 +229,32 @@ int ErasureCode::to_int(const std::string &name,
 }
 
 int ErasureCode::to_bool(const std::string &name,
-			 const map<std::string,std::string> &parameters,
+			 ErasureCodeProfile &profile,
 			 bool *value,
-			 bool default_value,
+			 const std::string &default_value,
 			 ostream *ss)
 {
-  if (parameters.find(name) == parameters.end() ||
-      parameters.find(name)->second.size() == 0) {
-    *value = default_value;
-    return 0;
-  }
-  const std::string p = parameters.find(name)->second;
+  if (profile.find(name) == profile.end() ||
+      profile.find(name)->second.size() == 0)
+    profile[name] = default_value;
+  const std::string p = profile.find(name)->second;
   *value = (p == "yes") || (p == "true");
   return 0;
 }
 
+int ErasureCode::to_string(const std::string &name,
+			   ErasureCodeProfile &profile,
+			   std::string *value,
+			   const std::string &default_value,
+			   ostream *ss)
+{
+  if (profile.find(name) == profile.end() ||
+      profile.find(name)->second.size() == 0)
+    profile[name] = default_value;
+  *value = profile[name];
+  return 0;
+}
+
 int ErasureCode::decode_concat(const map<int, bufferlist> &chunks,
 			       bufferlist *decoded)
 {
diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
index b135ade..bad6d81 100644
--- a/src/erasure-code/ErasureCode.h
+++ b/src/erasure-code/ErasureCode.h
@@ -33,9 +33,21 @@ namespace ceph {
     static const unsigned SIMD_ALIGN;
 
     vector<int> chunk_mapping;
+    ErasureCodeProfile _profile;
 
     virtual ~ErasureCode() {}
 
+    virtual int init(ErasureCodeProfile &profile, ostream *ss) {
+      _profile = profile;
+      return 0;
+    }
+
+    virtual const ErasureCodeProfile &get_profile() const {
+      return _profile;
+    }
+
+    int sanity_check_k(int k, ostream *ss);
+
     virtual unsigned int get_coding_chunk_count() const {
       return get_chunk_count() - get_data_chunk_count();
     }
@@ -66,29 +78,36 @@ namespace ceph {
                               const map<int, bufferlist> &chunks,
                               map<int, bufferlist> *decoded);
 
-    virtual int parse(const map<std::string,std::string> &parameters,
-		      ostream *ss);
-
     virtual const vector<int> &get_chunk_mapping() const;
 
-    int to_mapping(const map<std::string,std::string> &parameters,
+    int to_mapping(const ErasureCodeProfile &profile,
 		   ostream *ss);
 
     static int to_int(const std::string &name,
-		      const map<std::string,std::string> &parameters,
+		      ErasureCodeProfile &profile,
 		      int *value,
-		      int default_value,
+		      const std::string &default_value,
 		      ostream *ss);
 
     static int to_bool(const std::string &name,
-		       const map<std::string,std::string> &parameters,
+		       ErasureCodeProfile &profile,
 		       bool *value,
-		       bool default_value,
+		       const std::string &default_value,
 		       ostream *ss);
 
+    static int to_string(const std::string &name,
+			 ErasureCodeProfile &profile,
+			 std::string *value,
+			 const std::string &default_value,
+			 ostream *ss);
+
     virtual int decode_concat(const map<int, bufferlist> &chunks,
 			      bufferlist *decoded);
 
+  protected:
+    int parse(const ErasureCodeProfile &profile,
+	      ostream *ss);
+
   private:
     int chunk_index(unsigned int i) const;
   };
diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h
index 56b5265..5eb5571 100644
--- a/src/erasure-code/ErasureCodeInterface.h
+++ b/src/erasure-code/ErasureCodeInterface.h
@@ -143,6 +143,7 @@
 #include <map>
 #include <set>
 #include <vector>
+#include <iostream>
 #include "include/memory.h"
 #include "include/buffer.h"
 
@@ -152,11 +153,50 @@ using namespace std;
 
 namespace ceph {
 
+  typedef map<std::string,std::string> ErasureCodeProfile;
+
+  inline ostream& operator<<(ostream& out, const ErasureCodeProfile& profile) {
+    out << "{";
+    for (ErasureCodeProfile::const_iterator it = profile.begin();
+	 it != profile.end();
+	 ++it) {
+      if (it != profile.begin()) out << ",";
+      out << it->first << "=" << it->second;
+    }
+    out << "}";
+    return out;
+  }
+
+
   class ErasureCodeInterface {
   public:
     virtual ~ErasureCodeInterface() {}
 
     /**
+     * Initialize the instance according to the content of
+     * **profile**. The **ss** stream is set with debug messages or
+     * error messages, the content of which depend on the
+     * implementation.
+     *
+     * Return 0 on success or a negative errno on error. When
+     * returning on error, the implementation is expected to
+     * provide a human readable explanation in **ss**.
+     *
+     * @param [in] profile a key/value map
+     * @param [out] ss contains informative messages when an error occurs
+     * @return 0 on success or a negative errno on error.
+     */
+    virtual int init(ErasureCodeProfile &profile, ostream *ss) = 0;
+
+    /**
+     * Return the profile that was used to initialize the instance
+     * with the **init** method.
+     *
+     * @return the profile in use by the instance
+     */
+    virtual const ErasureCodeProfile &get_profile() const = 0;
+
+    /**
      * Create a new ruleset in **crush** under the name **name**,
      * unless it already exists.
      *
diff --git a/src/erasure-code/ErasureCodePlugin.cc b/src/erasure-code/ErasureCodePlugin.cc
index 1c5db04..b120eda 100644
--- a/src/erasure-code/ErasureCodePlugin.cc
+++ b/src/erasure-code/ErasureCodePlugin.cc
@@ -24,7 +24,11 @@
 #include "include/str_list.h"
 
 #define PLUGIN_PREFIX "libec_"
+#if defined(DARWIN)
+#define PLUGIN_SUFFIX ".dylib"
+#else
 #define PLUGIN_SUFFIX ".so"
+#endif
 #define PLUGIN_INIT_FUNCTION "__erasure_code_init"
 #define PLUGIN_VERSION_FUNCTION "__erasure_code_version"
 
@@ -84,9 +88,10 @@ ErasureCodePlugin *ErasureCodePluginRegistry::get(const std::string &name)
 }
 
 int ErasureCodePluginRegistry::factory(const std::string &plugin_name,
-				       const map<std::string,std::string> &parameters,
+				       const std::string &directory,
+				       ErasureCodeProfile &profile,
 				       ErasureCodeInterfaceRef *erasure_code,
-				       ostream &ss)
+				       ostream *ss)
 {
   ErasureCodePlugin *plugin;
   {
@@ -94,15 +99,22 @@ int ErasureCodePluginRegistry::factory(const std::string &plugin_name,
     plugin = get(plugin_name);
     if (plugin == 0) {
       loading = true;
-      assert(parameters.count("directory") != 0);
-      int r = load(plugin_name, parameters.find("directory")->second, &plugin, ss);
+      int r = load(plugin_name, directory, &plugin, ss);
       loading = false;
       if (r != 0)
 	return r;
     }
   }
 
-  return plugin->factory(parameters, erasure_code);
+  int r = plugin->factory(directory, profile, erasure_code, ss);
+  if (r)
+    return r;
+  if (profile != (*erasure_code)->get_profile()) {
+    *ss << __func__ << " profile " << profile << " != get_profile() "
+	<< (*erasure_code)->get_profile() << std::endl;
+    return -EINVAL;
+  }
+  return 0;
 }
 
 static const char *an_older_version() {
@@ -112,14 +124,14 @@ static const char *an_older_version() {
 int ErasureCodePluginRegistry::load(const std::string &plugin_name,
 				    const std::string &directory,
 				    ErasureCodePlugin **plugin,
-				    ostream &ss)
+				    ostream *ss)
 {
   assert(lock.is_locked());
   std::string fname = directory + "/" PLUGIN_PREFIX
     + plugin_name + PLUGIN_SUFFIX;
   void *library = dlopen(fname.c_str(), RTLD_NOW);
   if (!library) {
-    ss << "load dlopen(" << fname << "): " << dlerror();
+    *ss << "load dlopen(" << fname << "): " << dlerror();
     return -EIO;
   }
 
@@ -128,8 +140,8 @@ int ErasureCodePluginRegistry::load(const std::string &plugin_name,
   if (erasure_code_version == NULL)
     erasure_code_version = an_older_version;
   if (erasure_code_version() != string(CEPH_GIT_NICE_VER)) {
-    ss << "expected plugin " << fname << " version " << CEPH_GIT_NICE_VER
-       << " but it claims to be " << erasure_code_version() << " instead";
+    *ss << "expected plugin " << fname << " version " << CEPH_GIT_NICE_VER
+	<< " but it claims to be " << erasure_code_version() << " instead";
     dlclose(library);
     return -EXDEV;
   }
@@ -140,38 +152,38 @@ int ErasureCodePluginRegistry::load(const std::string &plugin_name,
     std::string name = plugin_name;
     int r = erasure_code_init(name.c_str(), directory.c_str());
     if (r != 0) {
-      ss << "erasure_code_init(" << plugin_name
-	 << "," << directory 
-	 << "): " << cpp_strerror(r);
+      *ss << "erasure_code_init(" << plugin_name
+	  << "," << directory
+	  << "): " << cpp_strerror(r);
       dlclose(library);
       return r;
     }
   } else {
-    ss << "load dlsym(" << fname
-       << ", " << PLUGIN_INIT_FUNCTION
-       << "): " << dlerror();
+    *ss << "load dlsym(" << fname
+	<< ", " << PLUGIN_INIT_FUNCTION
+	<< "): " << dlerror();
     dlclose(library);
     return -ENOENT;
   }
 
   *plugin = get(plugin_name);
   if (*plugin == 0) {
-    ss << "load " << PLUGIN_INIT_FUNCTION << "()"
-       << "did not register " << plugin_name;
+    *ss << "load " << PLUGIN_INIT_FUNCTION << "()"
+	<< "did not register " << plugin_name;
     dlclose(library);
     return -EBADF;
   }
 
   (*plugin)->library = library;
 
-  ss << __func__ << ": " << plugin_name << " ";
+  *ss << __func__ << ": " << plugin_name << " ";
 
   return 0;
 }
 
 int ErasureCodePluginRegistry::preload(const std::string &plugins,
 				       const std::string &directory,
-				       ostream &ss)
+				       ostream *ss)
 {
   Mutex::Locker l(lock);
   list<string> plugins_list;
diff --git a/src/erasure-code/ErasureCodePlugin.h b/src/erasure-code/ErasureCodePlugin.h
index 035bf2e..72c187b 100644
--- a/src/erasure-code/ErasureCodePlugin.h
+++ b/src/erasure-code/ErasureCodePlugin.h
@@ -36,8 +36,10 @@ namespace ceph {
       library(0) {}
     virtual ~ErasureCodePlugin() {}
 
-    virtual int factory(const map<std::string,std::string> &parameters,
-                        ErasureCodeInterfaceRef *erasure_code) = 0;
+    virtual int factory(const std::string &directory,
+			ErasureCodeProfile &profile,
+                        ErasureCodeInterfaceRef *erasure_code,
+			ostream *ss) = 0;
   };
 
   class ErasureCodePluginRegistry {
@@ -57,9 +59,10 @@ namespace ceph {
     }
 
     int factory(const std::string &plugin,
-		const map<std::string,std::string> &parameters,
+		const std::string &directory,
+		ErasureCodeProfile &profile,
 		ErasureCodeInterfaceRef *erasure_code,
-		ostream &ss);
+		ostream *ss);
 
     int add(const std::string &name, ErasureCodePlugin *plugin);
     int remove(const std::string &name);
@@ -68,11 +71,11 @@ namespace ceph {
     int load(const std::string &plugin_name,
 	     const std::string &directory,
 	     ErasureCodePlugin **plugin,
-	     ostream &ss);
+	     ostream *ss);
 
     int preload(const std::string &plugins,
 		const std::string &directory,
-		ostream &ss);
+		ostream *ss);
   };
 }
 
diff --git a/src/erasure-code/isa/ErasureCodeIsa.cc b/src/erasure-code/isa/ErasureCodeIsa.cc
index f9e0793..acf63d8 100644
--- a/src/erasure-code/isa/ErasureCodeIsa.cc
+++ b/src/erasure-code/isa/ErasureCodeIsa.cc
@@ -41,6 +41,9 @@ _prefix(std::ostream* _dout)
 }
 // -----------------------------------------------------------------------------
 
+const std::string ErasureCodeIsaDefault::DEFAULT_K("7");
+const std::string ErasureCodeIsaDefault::DEFAULT_M("3");
+
 int
 ErasureCodeIsa::create_ruleset(const string &name,
                                CrushWrapper &crush,
@@ -63,21 +66,22 @@ ErasureCodeIsa::create_ruleset(const string &name,
 
 // -----------------------------------------------------------------------------
 
-void
-ErasureCodeIsa::init(const map<string, string> &parameters)
+int
+ErasureCodeIsa::init(ErasureCodeProfile &profile, ostream *ss)
 {
-  dout(10) << "technique=" << technique << dendl;
-  map<string, string>::const_iterator parameter;
-  parameter = parameters.find("ruleset-root");
-  if (parameter != parameters.end())
-    ruleset_root = parameter->second;
-  parameter = parameters.find("ruleset-failure-domain");
-  if (parameter != parameters.end())
-    ruleset_failure_domain = parameter->second;
-  ostringstream ss;
-  if (parse(parameters, &ss))
-    derr << ss.str() << dendl;
+  int err = 0;
+  err |= to_string("ruleset-root", profile,
+		   &ruleset_root,
+		   DEFAULT_RULESET_ROOT, ss);
+  err |= to_string("ruleset-failure-domain", profile,
+		   &ruleset_failure_domain,
+		   DEFAULT_RULESET_FAILURE_DOMAIN, ss);
+  err |= parse(profile, ss);
+  if (err)
+    return err;
   prepare();
+  ErasureCode::init(profile, ss);
+  return err;
 }
 
 // -----------------------------------------------------------------------------
@@ -340,13 +344,13 @@ ErasureCodeIsaDefault::get_alignment() const
 
 // -----------------------------------------------------------------------------
 
-int ErasureCodeIsaDefault::parse(const map<std::string,
-                                 std::string> &parameters,
+int ErasureCodeIsaDefault::parse(ErasureCodeProfile &profile,
                                  ostream *ss)
 {
-  int err = ErasureCode::parse(parameters, ss);
-  err |= to_int("k", parameters, &k, DEFAULT_K, ss);
-  err |= to_int("m", parameters, &m, DEFAULT_M, ss);
+  int err = ErasureCode::parse(profile, ss);
+  err |= to_int("k", profile, &k, DEFAULT_K, ss);
+  err |= to_int("m", profile, &m, DEFAULT_M, ss);
+  err |= sanity_check_k(k, ss);
 
   if (matrixtype == kVandermonde) {
     // these are verified safe values evaluated using the
diff --git a/src/erasure-code/isa/ErasureCodeIsa.h b/src/erasure-code/isa/ErasureCodeIsa.h
index fe71c7a..63aa5f3 100644
--- a/src/erasure-code/isa/ErasureCodeIsa.h
+++ b/src/erasure-code/isa/ErasureCodeIsa.h
@@ -33,6 +33,9 @@
 #include <list>
 // -----------------------------------------------------------------------------
 
+#define DEFAULT_RULESET_ROOT "default"
+#define DEFAULT_RULESET_FAILURE_DOMAIN "host"
+
 class ErasureCodeIsa : public ErasureCode {
 public:
 
@@ -56,8 +59,8 @@ public:
   w(0),
   tcache(_tcache),
   technique(_technique),
-  ruleset_root("default"),
-  ruleset_failure_domain("host")
+  ruleset_root(DEFAULT_RULESET_ROOT),
+  ruleset_failure_domain(DEFAULT_RULESET_FAILURE_DOMAIN)
   {
   }
 
@@ -91,7 +94,7 @@ public:
                             const map<int, bufferlist> &chunks,
                             map<int, bufferlist> *decoded);
 
-  void init(const map<std::string, std::string> &parameters);
+  virtual int init(ErasureCodeProfile &profile, ostream *ss);
 
   virtual void isa_encode(char **data,
                           char **coding,
@@ -105,12 +108,11 @@ public:
 
   virtual unsigned get_alignment() const = 0;
 
-  virtual int parse(const map<std::string,
-                    std::string> &parameters,
-                    ostream *ss) = 0;
-
   virtual void prepare() = 0;
 
+ private:
+  virtual int parse(ErasureCodeProfile &profile,
+                    ostream *ss) = 0;
 };
 
 // -----------------------------------------------------------------------------
@@ -121,8 +123,8 @@ private:
 
 public:
 
-  static const int DEFAULT_K = 7;
-  static const int DEFAULT_M = 3;
+  static const std::string DEFAULT_K;
+  static const std::string DEFAULT_M;
 
   unsigned char* encode_coeff; // encoding coefficient
   unsigned char* encode_tbls; // encoding table
@@ -155,13 +157,11 @@ public:
 
   virtual unsigned get_alignment() const;
 
-  virtual int parse(const map<std::string,
-                    std::string> &parameters,
-                    ostream *ss);
-
   virtual void prepare();
 
-
+ private:
+  virtual int parse(ErasureCodeProfile &profile,
+                    ostream *ss);
 };
 
 #endif
diff --git a/src/erasure-code/isa/ErasureCodeIsaTableCache.cc b/src/erasure-code/isa/ErasureCodeIsaTableCache.cc
index 003ff22..fa834ab 100644
--- a/src/erasure-code/isa/ErasureCodeIsaTableCache.cc
+++ b/src/erasure-code/isa/ErasureCodeIsaTableCache.cc
@@ -260,7 +260,7 @@ ErasureCodeIsaTableCache::getDecodingTableFromCache(std::string &signature,
     memcpy(table, (*decode_tbls_map)[signature].second.c_str(), k * (m + k)*32);
     // find item in LRU queue and push back
     dout(12) << "[ cache size   ] = " << decode_tbls_lru->size() << dendl;
-    decode_tbls_lru->splice((*decode_tbls_map)[signature].first, *decode_tbls_lru, decode_tbls_lru->end());
+    decode_tbls_lru->splice( (decode_tbls_lru->begin()), *decode_tbls_lru, (*decode_tbls_map)[signature].first);
     found = true;
   }
 
@@ -298,26 +298,26 @@ ErasureCodeIsaTableCache::putDecodingTableToCache(std::string &signature,
   if ((int) decode_tbls_lru->size() >= ErasureCodeIsaTableCache::decoding_tables_lru_length) {
     dout(12) << "[ shrink lru   ] = " << signature << dendl;
     // reuse old buffer
-    cachetable = (*decode_tbls_map)[decode_tbls_lru->front()].second;
+    cachetable = (*decode_tbls_map)[decode_tbls_lru->back()].second;
+
     if ((int) cachetable.length() != (k * (m + k)*32)) {
       // we need to replace this with a different size buffer
       cachetable = buffer::create(k * (m + k)*32);
-      (*decode_tbls_map)[signature] = std::make_pair(decode_tbls_lru->begin(), cachetable);
     }
 
     // remove from map
-    decode_tbls_map->erase(decode_tbls_lru->front());
+    decode_tbls_map->erase(decode_tbls_lru->back());
     // remove from lru
-    decode_tbls_lru->pop_front();
+    decode_tbls_lru->pop_back();
+    // add to the head of lru
+    decode_tbls_lru->push_front(signature);
     // add the new to the map
     (*decode_tbls_map)[signature] = std::make_pair(decode_tbls_lru->begin(), cachetable);
-    // add to the end of lru
-    decode_tbls_lru->push_back(signature);
   } else {
     dout(12) << "[ store table  ] = " << signature << dendl;
     // allocate a new buffer
     cachetable = buffer::create(k * (m + k)*32);
-    decode_tbls_lru->push_back(signature);
+    decode_tbls_lru->push_front(signature);
     (*decode_tbls_map)[signature] = std::make_pair(decode_tbls_lru->begin(), cachetable);
     dout(12) << "[ cache size   ] = " << decode_tbls_lru->size() << dendl;
   }
diff --git a/src/erasure-code/isa/ErasureCodePluginIsa.cc b/src/erasure-code/isa/ErasureCodePluginIsa.cc
index 4000fd1..611d76b 100644
--- a/src/erasure-code/isa/ErasureCodePluginIsa.cc
+++ b/src/erasure-code/isa/ErasureCodePluginIsa.cc
@@ -35,13 +35,16 @@ class ErasureCodePluginIsa : public ErasureCodePlugin {
 public:
   ErasureCodeIsaTableCache tcache;
 
-  virtual int factory(const map<std::string, std::string> &parameters,
-                      ErasureCodeInterfaceRef *erasure_code)
+  virtual int factory(const std::string &directory,
+		      ErasureCodeProfile &profile,
+                      ErasureCodeInterfaceRef *erasure_code,
+                      ostream *ss)
   {
     ErasureCodeIsa *interface;
-    std::string t = "reed_sol_van";
-    if (parameters.find("technique") != parameters.end())
-      t = parameters.find("technique")->second;
+    std::string t;
+    if (profile.find("technique") == profile.end())
+      profile["technique"] = "reed_sol_van";
+    t = profile.find("technique")->second;
     if ((t == "reed_sol_van")) {
       interface = new ErasureCodeIsaDefault(tcache,
                                             ErasureCodeIsaDefault::kVandermonde);
@@ -50,15 +53,19 @@ public:
         interface = new ErasureCodeIsaDefault(tcache,
                                               ErasureCodeIsaDefault::kCauchy);
       } else {
-        derr << "technique=" << t << " is not a valid coding technique. "
+        *ss << "technique=" << t << " is not a valid coding technique. "
           << " Choose one of the following: "
           << "reed_sol_van,"
-          << "cauchy" << dendl;
+          << "cauchy" << std::endl;
         return -ENOENT;
       }
     }
 
-    interface->init(parameters);
+    int r = interface->init(profile, ss);
+    if (r) {
+      delete interface;
+      return r;
+    }
     *erasure_code = ErasureCodeInterfaceRef(interface);
     return 0;
   }
diff --git a/src/erasure-code/isa/Makefile.am b/src/erasure-code/isa/Makefile.am
index 649ddaa..67725dd 100644
--- a/src/erasure-code/isa/Makefile.am
+++ b/src/erasure-code/isa/Makefile.am
@@ -33,6 +33,24 @@ isa_sources = \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s \
+	erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s \
 	erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s \
 	erasure-code/isa/ErasureCodeIsa.cc \
@@ -49,7 +67,7 @@ libec_isa_la_CXXFLAGS = ${AM_CXXFLAGS} -I $(srcdir)/erasure-code/isa/isa-l/inclu
 libec_isa_la_CCASFLAGS = ${AM_CCASFLAGS} -I $(abs_srcdir)/erasure-code/isa/isa-l/include/
 
 libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:10:0
+libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:14:0
 if LINUX
 libec_isa_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_base.c b/src/erasure-code/isa/isa-l/erasure_code/ec_base.c
index 5e93cb6..3c7e838 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_base.c
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_base.c
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -275,6 +275,18 @@ void gf_vect_dot_prod_base(int len, int vlen, unsigned char *v,
 	}
 }
 
+void gf_vect_mad_base(int len, int vec, int vec_i,
+		      unsigned char *v, unsigned char *src, unsigned char *dest)
+{
+	int i;
+	unsigned char s;
+	for (i = 0; i < len; i++) {
+		s = dest[i];
+		s ^= gf_mul(src[i], v[vec_i * 32 + 1]);
+		dest[i] = s;
+	}
+}
+
 void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v,
 			 unsigned char **src, unsigned char **dest)
 {
@@ -292,6 +304,22 @@ void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v,
 	}
 }
 
+void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
+				unsigned char *data, unsigned char **dest)
+{
+	int i, l;
+	unsigned char s;
+
+	for (l = 0; l < rows; l++) {
+		for (i = 0; i < len; i++) {
+			s = dest[l][i];
+			s ^= gf_mul(data[i], v[vec_i * 32 + l * k * 32 + 1]);
+
+			dest[l][i] = s;
+		}
+	}
+}
+
 void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest)
 {
 	//2nd element of table array is ref value used to fill it in
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_base.h b/src/erasure-code/isa/isa-l/erasure_code/ec_base.h
index 519ac7a..d69a92d 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_base.h
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_base.h
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c b/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c
index 9cea61e..fe2cdc9 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -42,7 +42,6 @@ void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
 	}
 }
 
-#if __WORDSIZE == 64 || _WIN64 || __x86_64__
 void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
 			unsigned char **coding)
 {
@@ -77,7 +76,6 @@ void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigne
 void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
 			unsigned char **coding)
 {
-
 	if (len < 16) {
 		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
 		return;
@@ -136,6 +134,123 @@ void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsign
 
 }
 
+#if __WORDSIZE == 64 || _WIN64 || __x86_64__
+
+void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding)
+{
+	if (len < 16) {
+		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+		return;
+	}
+
+	while (rows > 6) {
+		gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		rows -= 6;
+	}
+	switch (rows) {
+	case 6:
+		gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 5:
+		gf_5vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 4:
+		gf_4vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 3:
+		gf_3vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 2:
+		gf_2vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 1:
+		gf_vect_mad_sse(len, k, vec_i, g_tbls, data, *coding);
+		break;
+	case 0:
+		break;
+	}
+
+}
+
+void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding)
+{
+	if (len < 16) {
+		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+		return;
+	}
+	while (rows > 6) {
+		gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		rows -= 6;
+	}
+	switch (rows) {
+	case 6:
+		gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 5:
+		gf_5vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 4:
+		gf_4vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 3:
+		gf_3vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 2:
+		gf_2vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 1:
+		gf_vect_mad_avx(len, k, vec_i, g_tbls, data, *coding);
+		break;
+	case 0:
+		break;
+	}
+
+}
+
+void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+				unsigned char *data, unsigned char **coding)
+{
+	if (len < 32) {
+		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+		return;
+	}
+	while (rows > 6) {
+		gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		rows -= 6;
+	}
+	switch (rows) {
+	case 6:
+		gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 5:
+		gf_5vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 4:
+		gf_4vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 3:
+		gf_3vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 2:
+		gf_2vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+		break;
+	case 1:
+		gf_vect_mad_avx2(len, k, vec_i, g_tbls, data, *coding);
+		break;
+	case 0:
+		break;
+	}
+
+}
+
 #endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
 
 struct slver {
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s
index 54f7301..03f501a 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -28,42 +28,63 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 %ifidn __OUTPUT_FORMAT__, elf64
-%define WRT_OPT		wrt ..plt
+ %define WRT_OPT		wrt ..plt
 %else
-%define WRT_OPT
+ %define WRT_OPT
 %endif
 
+%include "reg_sizes.asm"
+
 %ifidn __OUTPUT_FORMAT__, elf32
 
 [bits 32]
 
-%define def_wrd		dd
-%define wrd_sz  	dword
-%define arg1		esi
+ %define def_wrd		dd
+ %define wrd_sz  	dword
+ %define arg1		esi
+ %define arg2		eax
+ %define arg3		ebx
+ %define arg4		ecx
+ %define arg5		edx
 
 %else
 
-%include "reg_sizes.asm"
-default rel
-[bits 64]
+ default rel
+ [bits 64]
 
-%define def_wrd 	dq
-%define wrd_sz  	qword
-%define arg1		rsi
+ %define def_wrd 	dq
+ %define wrd_sz  	qword
+ %define arg1		rsi
+ %define arg2		rax
+ %define arg3		rbx
+ %define arg4		rcx
+ %define arg5		rdx
 
-extern ec_encode_data_sse
-extern ec_encode_data_avx
-extern ec_encode_data_avx2
-extern gf_vect_mul_sse
-extern gf_vect_mul_avx
-extern gf_vect_dot_prod_sse
-extern gf_vect_dot_prod_avx
-extern gf_vect_dot_prod_avx2
+
+ extern ec_encode_data_update_sse
+ extern ec_encode_data_update_avx
+ extern ec_encode_data_update_avx2
+ extern gf_vect_mul_sse
+ extern gf_vect_mul_avx
+
+ extern gf_vect_mad_sse
+ extern gf_vect_mad_avx
+ extern gf_vect_mad_avx2
 %endif
 
 extern gf_vect_mul_base
 extern ec_encode_data_base
+extern ec_encode_data_update_base
 extern gf_vect_dot_prod_base
+extern gf_vect_mad_base
+
+extern gf_vect_dot_prod_sse
+extern gf_vect_dot_prod_avx
+extern gf_vect_dot_prod_avx2
+extern ec_encode_data_sse
+extern ec_encode_data_avx
+extern ec_encode_data_avx2
+
 
 section .data
 ;;; *_mbinit are initial values for *_dispatched; is updated on first call.
@@ -78,6 +99,12 @@ gf_vect_mul_dispatched:
 gf_vect_dot_prod_dispatched:
 	def_wrd      gf_vect_dot_prod_mbinit
 
+ec_encode_data_update_dispatched:
+	def_wrd      ec_encode_data_update_mbinit
+
+gf_vect_mad_dispatched:
+	def_wrd      gf_vect_mad_mbinit
+
 section .text
 ;;;;
 ; ec_encode_data multibinary function
@@ -91,50 +118,45 @@ ec_encode_data:
 
 ec_encode_data_dispatch_init:
 	push    arg1
-%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
-	lea     arg1, [ec_encode_data_base]
-%else
-	push    rax
-	push    rbx
-	push    rcx
-	push    rdx
+	push    arg2
+	push    arg3
+	push    arg4
+	push    arg5
 	lea     arg1, [ec_encode_data_base WRT_OPT] ; Default
 
 	mov     eax, 1
 	cpuid
-	lea     rbx, [ec_encode_data_sse WRT_OPT]
+	lea     arg3, [ec_encode_data_sse WRT_OPT]
 	test    ecx, FLAG_CPUID1_ECX_SSE4_1
-	cmovne  arg1, rbx
+	cmovne  arg1, arg3
 
 	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
 	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	lea	rbx, [ec_encode_data_avx WRT_OPT]
+	lea	arg3, [ec_encode_data_avx WRT_OPT]
 
 	jne	_done_ec_encode_data_init
-	mov	rsi, rbx
+	mov	arg1, arg3
 
 	;; Try for AVX2
 	xor	ecx, ecx
 	mov	eax, 7
 	cpuid
 	test	ebx, FLAG_CPUID1_EBX_AVX2
-	lea     rbx, [ec_encode_data_avx2 WRT_OPT]
-	cmovne	rsi, rbx
-
+	lea     arg3, [ec_encode_data_avx2 WRT_OPT]
+	cmovne	arg1, arg3
 	;; Does it have xmm and ymm support
 	xor	ecx, ecx
 	xgetbv
 	and	eax, FLAG_XGETBV_EAX_XMM_YMM
 	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
 	je	_done_ec_encode_data_init
-	lea     rsi, [ec_encode_data_sse WRT_OPT]
+	lea     arg1, [ec_encode_data_sse WRT_OPT]
 
 _done_ec_encode_data_init:
-	pop     rdx
-	pop     rcx
-	pop     rbx
-	pop     rax
-%endif			;; END 32-bit check
+	pop     arg5
+	pop     arg4
+	pop     arg3
+	pop     arg2
 	mov     [ec_encode_data_dispatched], arg1
 	pop     arg1
 	ret
@@ -190,6 +212,65 @@ _done_gf_vect_mul_dispatch_init:
 	pop     arg1
 	ret
 
+;;;;
+; ec_encode_data_update multibinary function
+;;;;
+global ec_encode_data_update:function
+ec_encode_data_update_mbinit:
+	call	ec_encode_data_update_dispatch_init
+
+ec_encode_data_update:
+	jmp	wrd_sz [ec_encode_data_update_dispatched]
+
+ec_encode_data_update_dispatch_init:
+	push    arg1
+%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
+	lea     arg1, [ec_encode_data_update_base]
+%else
+	push    rax
+	push    rbx
+	push    rcx
+	push    rdx
+	lea     arg1, [ec_encode_data_update_base WRT_OPT] ; Default
+
+	mov     eax, 1
+	cpuid
+	lea     rbx, [ec_encode_data_update_sse WRT_OPT]
+	test    ecx, FLAG_CPUID1_ECX_SSE4_1
+	cmovne  arg1, rbx
+
+	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+	lea	rbx, [ec_encode_data_update_avx WRT_OPT]
+
+	jne	_done_ec_encode_data_update_init
+	mov	rsi, rbx
+
+	;; Try for AVX2
+	xor	ecx, ecx
+	mov	eax, 7
+	cpuid
+	test	ebx, FLAG_CPUID1_EBX_AVX2
+	lea     rbx, [ec_encode_data_update_avx2 WRT_OPT]
+	cmovne	rsi, rbx
+
+	;; Does it have xmm and ymm support
+	xor	ecx, ecx
+	xgetbv
+	and	eax, FLAG_XGETBV_EAX_XMM_YMM
+	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+	je	_done_ec_encode_data_update_init
+	lea     rsi, [ec_encode_data_update_sse WRT_OPT]
+
+_done_ec_encode_data_update_init:
+	pop     rdx
+	pop     rcx
+	pop     rbx
+	pop     rax
+%endif			;; END 32-bit check
+	mov     [ec_encode_data_update_dispatched], arg1
+	pop     arg1
+	ret
 
 ;;;;
 ; gf_vect_dot_prod multibinary function
@@ -203,26 +284,81 @@ gf_vect_dot_prod:
 
 gf_vect_dot_prod_dispatch_init:
 	push    arg1
+	push    arg2
+	push    arg3
+	push    arg4
+	push    arg5
+	lea     arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default
+
+	mov     eax, 1
+	cpuid
+	lea     arg3, [gf_vect_dot_prod_sse WRT_OPT]
+	test    ecx, FLAG_CPUID1_ECX_SSE4_1
+	cmovne  arg1, arg3
+
+	and		ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+	cmp		ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+	lea     arg3, [gf_vect_dot_prod_avx WRT_OPT]
+
+	jne     _done_gf_vect_dot_prod_init
+	mov		arg1, arg3
+
+	;; Try for AVX2
+	xor		ecx, ecx
+	mov		eax, 7
+	cpuid
+	test	ebx, FLAG_CPUID1_EBX_AVX2
+	lea     arg3, [gf_vect_dot_prod_avx2 WRT_OPT]
+	cmovne	arg1, arg3
+	;; Does it have xmm and ymm support
+	xor	ecx, ecx
+	xgetbv
+	and	eax, FLAG_XGETBV_EAX_XMM_YMM
+	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+	je	_done_gf_vect_dot_prod_init
+	lea     arg1, [gf_vect_dot_prod_sse WRT_OPT]
+
+_done_gf_vect_dot_prod_init:
+	pop     arg5
+	pop     arg4
+	pop     arg3
+	pop     arg2
+	mov     [gf_vect_dot_prod_dispatched], arg1
+	pop	arg1
+	ret
+
+;;;;
+; gf_vect_mad multibinary function
+;;;;
+global gf_vect_mad:function
+gf_vect_mad_mbinit:
+	call    gf_vect_mad_dispatch_init
+
+gf_vect_mad:
+	jmp     wrd_sz [gf_vect_mad_dispatched]
+
+gf_vect_mad_dispatch_init:
+	push    arg1
 %ifidn __OUTPUT_FORMAT__, elf32         ;; 32-bit check
-	lea     arg1, [gf_vect_dot_prod_base]
+	lea     arg1, [gf_vect_mad_base]
 %else
 	push	rax
 	push	rbx
 	push	rcx
 	push	rdx
-	lea     arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default
+	lea     arg1, [gf_vect_mad_base WRT_OPT] ; Default
 
 	mov     eax, 1
 	cpuid
-	lea     rbx, [gf_vect_dot_prod_sse WRT_OPT]
+	lea     rbx, [gf_vect_mad_sse WRT_OPT]
 	test    ecx, FLAG_CPUID1_ECX_SSE4_1
 	cmovne  arg1, rbx
 
 	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
 	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	lea     rbx, [gf_vect_dot_prod_avx WRT_OPT]
+	lea     rbx, [gf_vect_mad_avx WRT_OPT]
 
-	jne     _done_gf_vect_dot_prod_init
+	jne     _done_gf_vect_mad_init
 	mov	rsi, rbx
 
 	;; Try for AVX2
@@ -230,7 +366,7 @@ gf_vect_dot_prod_dispatch_init:
 	mov	eax, 7
 	cpuid
 	test	ebx, FLAG_CPUID1_EBX_AVX2
-	lea     rbx, [gf_vect_dot_prod_avx2 WRT_OPT]
+	lea     rbx, [gf_vect_mad_avx2 WRT_OPT]
 	cmovne	rsi, rbx
 
 	;; Does it have xmm and ymm support
@@ -238,31 +374,22 @@ gf_vect_dot_prod_dispatch_init:
 	xgetbv
 	and	eax, FLAG_XGETBV_EAX_XMM_YMM
 	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
-	je	_done_gf_vect_dot_prod_init
-	lea     rsi, [gf_vect_dot_prod_sse WRT_OPT]
+	je	_done_gf_vect_mad_init
+	lea     rsi, [gf_vect_mad_sse WRT_OPT]
 
-_done_gf_vect_dot_prod_init:
+_done_gf_vect_mad_init:
 	pop     rdx
 	pop     rcx
 	pop     rbx
 	pop     rax
 %endif			;; END 32-bit check
-	mov     [gf_vect_dot_prod_dispatched], arg1
+	mov     [gf_vect_mad_dispatched], arg1
 	pop	arg1
 	ret
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
-
-;;;       func                  core, ver, snum
-slversion ec_encode_data,	00,   02,  0133
-slversion gf_vect_mul,		00,   02,  0134
-slversion gf_vect_dot_prod,	00,   01,  0138
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+;;;       func                 		core, ver, snum
+slversion ec_encode_data,		00,   04,  0133
+slversion gf_vect_mul,			00,   03,  0134
+slversion ec_encode_data_update,	00,   03,  0212
+slversion gf_vect_dot_prod,		00,   03,  0138
+slversion gf_vect_mad,			00,   02,  0213
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s
index db8064a..db1e1c5 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -46,6 +46,9 @@
  %define tmp3  r9
  %define tmp4  r12		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define LOG_PS 3
 
@@ -70,6 +73,9 @@
  %define tmp3   r13		; must be saved and restored
  %define tmp4   r14		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  3*16 + 3*8 	; must be an odd multiple of 8
@@ -99,17 +105,92 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans   ecx
+ %define trans2  esi
+ %define arg0    trans		;trans and trans2 are for the variables in stack
+ %define arg0_m  arg(0)
+ %define arg1    ebx
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    trans
+ %define arg3_m  arg(3)
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define tmp	 edx
+ %define tmp2    edi
+ %define tmp3    trans2
+ %define tmp4    trans2
+ %define tmp4_m  var(0)
+ %define return  eax
+ %macro SLDR 2	;; stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*1	;1 local variable
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*1	;1 local variable
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
 %define len   arg0
 %define vec   arg1
 %define mul_array arg2
 %define	src   arg3
-%define dest1 arg4
+%define dest1  arg4
 
 %define vec_i tmp2
 %define ptr   tmp3
 %define dest2 tmp4
 %define pos   return
 
+ %ifidn PS,4				;32-bit code
+	%define  len_m 	arg0_m
+	%define  src_m 	arg3_m
+	%define  dest1_m arg4_m
+	%define  dest2_m tmp4_m
+ %endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR vmovdqu
@@ -125,35 +206,54 @@
  %endif
 %endif
 
+%ifidn PS,8			; 64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
-
-[bits 64]
 section .text
 
-%define xmask0f   xmm8
-%define xgft1_lo  xmm7
-%define xgft1_hi  xmm6
-%define xgft2_lo  xmm5
-%define xgft2_hi  xmm4
-
-%define x0     xmm0
-%define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
+%ifidn PS,8			;64-bit code
+ %define xmask0f   xmm8
+ %define xgft1_lo  xmm7
+ %define xgft1_hi  xmm6
+ %define xgft2_lo  xmm5
+ %define xgft2_hi  xmm4
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+%else				;32-bit code
+ %define xmask0f   xmm4
+ %define xgft1_lo  xmm7
+ %define xgft1_hi  xmm6
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+%endif
 
 align 16
 global gf_2vect_dot_prod_avx:function
 
 func(gf_2vect_dot_prod_avx)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 16
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR	dest2_m, dest2
 	mov	dest1, [dest1]
+	SSTR	dest1_m, dest1
 
 .loop16
 	vpxor	xp1, xp1
@@ -162,16 +262,18 @@ func(gf_2vect_dot_prod_avx)
 	xor	vec_i, vec_i
 
 .next_vect
+	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
 
 	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 	vmovdqu	xgft1_hi, [tmp+16]	;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8				; 64-bit code
 	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-
-	XLDR	x0, [ptr+pos]		;Get next source vector
 	add	tmp, 32
 	add	vec_i, PS
+ %endif
+	XLDR	x0, [ptr+pos]		;Get next source vector
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
@@ -182,6 +284,12 @@ func(gf_2vect_dot_prod_avx)
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				; 32-bit code
+	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
@@ -190,9 +298,12 @@ func(gf_2vect_dot_prod_avx)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
 
+	SLDR	len, len_m
 	add	pos, 16			;Loop on 16 bytes at a time
 	cmp	pos, len
 	jle	.loop16
@@ -222,15 +333,5 @@ section .data
 align 16
 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_2vect_dot_prod_avx, 02,  03,  0191
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_2vect_dot_prod_avx, 02,  05,  0191
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s
index 5d75d81..0387893 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -48,7 +48,10 @@
  %define tmp3  r9
  %define tmp4  r12		; must be saved and restored
  %define return rax
- %define PS 8
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
+ %define PS     8
  %define LOG_PS 3
 
  %define func(x) x:
@@ -74,6 +77,9 @@
  %define tmp3   r13		; must be saved and restored
  %define tmp4   r14		; must be saved and restored
  %define return rax
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  3*16 + 3*8 	; must be an odd multiple of 8
@@ -103,6 +109,76 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans   ecx
+ %define trans2  esi
+ %define arg0    trans			;trans and trans2 are for the variables in stack
+ %define arg0_m  arg(0)
+ %define arg1    ebx
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    trans
+ %define arg3_m  arg(3)
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define tmp	 edx
+ %define tmp.w   edx
+ %define tmp.b   dl
+ %define tmp2    edi
+ %define tmp3    trans2
+ %define tmp4    trans2
+ %define tmp4_m  var(0)
+ %define return  eax
+ %macro SLDR 	 2			;stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*1		;1 local variable
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*1		;1 local variable
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
 %define len   arg0
 %define vec   arg1
 %define mul_array arg2
@@ -114,6 +190,13 @@
 %define dest2 tmp4
 %define pos   return
 
+%ifidn PS,4				;32-bit code
+ %define  len_m   arg0_m
+ %define  src_m   arg3_m
+ %define  dest1_m arg4_m
+ %define  dest2_m tmp4_m
+%endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR vmovdqu
@@ -130,30 +213,48 @@
  %endif
 %endif
 
+%ifidn PS,8				;64-bit code
+ default rel
+ [bits 64]
+%endif
 
-default rel
-
-[bits 64]
 section .text
 
-%define xmask0f   ymm8
-%define xmask0fx  xmm8
-%define xgft1_lo  ymm7
-%define xgft1_hi  ymm6
-%define xgft2_lo  ymm5
-%define xgft2_hi  ymm4
+%ifidn PS,8				;64-bit code
+ %define xmask0f   ymm8
+ %define xmask0fx  xmm8
+ %define xgft1_lo  ymm7
+ %define xgft1_hi  ymm6
+ %define xgft2_lo  ymm5
+ %define xgft2_hi  ymm4
+
+ %define x0     ymm0
+ %define xtmpa  ymm1
+ %define xp1    ymm2
+ %define xp2    ymm3
+%else					;32-bit code
+ %define xmask0f   ymm7
+ %define xmask0fx  xmm7
+ %define xgft1_lo  ymm5
+ %define xgft1_hi  ymm4
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+
+ %define x0     ymm0
+ %define xtmpa  ymm1
+ %define xp1    ymm2
+ %define xp2    ymm3
 
-%define x0     ymm0
-%define xtmpa  ymm1
-%define xp1    ymm2
-%define xp2    ymm3
+%endif
 
 align 16
 global gf_2vect_dot_prod_avx2:function
 
 func(gf_2vect_dot_prod_avx2)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 32
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	mov	tmp.b, 0x0f
@@ -161,8 +262,11 @@ func(gf_2vect_dot_prod_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR	dest2_m, dest2
 	mov	dest1, [dest1]
+	SSTR	dest1_m, dest1
 
 .loop32
 	vpxor	xp1, xp1
@@ -171,22 +275,25 @@ func(gf_2vect_dot_prod_avx2)
 	xor	vec_i, vec_i
 
 .next_vect
+	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
 
 	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
 	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-
+ %ifidn PS,8				; 64-bit code
 	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
 	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
 
-
 	XLDR	x0, [ptr+pos]		;Get next source vector
 	add	tmp, 32
 	add	vec_i, PS
+ %else
+	XLDR	x0, [ptr+pos]		;Get next source vector
+ %endif
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
@@ -197,6 +304,14 @@ func(gf_2vect_dot_prod_avx2)
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				; 32-bit code
+	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
@@ -205,9 +320,12 @@ func(gf_2vect_dot_prod_avx2)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
 
+	SLDR	len, len_m
 	add	pos, 32			;Loop on 32 bytes at a time
 	cmp	pos, len
 	jle	.loop32
@@ -234,15 +352,5 @@ endproc_frame
 
 section .data
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                   core, ver, snum
-slversion gf_2vect_dot_prod_avx2, 04,  03,  0196
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_2vect_dot_prod_avx2, 04,  05,  0196
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s
index 4f324ae..95dd92a 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -46,6 +46,9 @@
  %define tmp3  r9
  %define tmp4  r12		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define LOG_PS 3
 
@@ -70,6 +73,9 @@
  %define tmp3   r13		; must be saved and restored
  %define tmp4   r14		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  3*16 + 3*8 	; must be an odd multiple of 8
@@ -99,23 +105,97 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans   ecx
+ %define trans2  esi
+ %define arg0    trans			;trans and trans2 are for the variables in stack
+ %define arg0_m  arg(0)
+ %define arg1    ebx
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    trans
+ %define arg3_m  arg(3)
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define tmp	 edx
+ %define tmp2    edi
+ %define tmp3    trans2
+ %define tmp4    trans2
+ %define tmp4_m  var(0)
+ %define return  eax
+ %macro SLDR 2	;; stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub 	esp, PS*1		;1 local variable
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*1		;1 local variable
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
 %define len   arg0
 %define vec   arg1
 %define mul_array arg2
 %define	src   arg3
-%define dest1 arg4
+%define dest1  arg4
 
 %define vec_i tmp2
 %define ptr   tmp3
 %define dest2 tmp4
 %define pos   return
 
+ %ifidn PS,4				;32-bit code
+	%define  len_m 	 arg0_m
+	%define  src_m 	 arg3_m
+	%define  dest1_m arg4_m
+	%define  dest2_m tmp4_m
+ %endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR movdqu
  %define XSTR movdqu
 %else
-
 ;;; Use Non-temporal load/stor
  %ifdef NO_NT_LDST
   %define XLDR movdqa
@@ -126,35 +206,54 @@
  %endif
 %endif
 
+%ifidn PS,8				;64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
-
-[bits 64]
 section .text
 
-%define xmask0f   xmm8
-%define xgft1_lo  xmm7
-%define xgft1_hi  xmm6
-%define xgft2_lo  xmm5
-%define xgft2_hi  xmm4
-
-%define x0     xmm0
-%define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
+%ifidn PS,8				;64-bit code
+ %define xmask0f   xmm8
+ %define xgft1_lo  xmm7
+ %define xgft1_hi  xmm6
+ %define xgft2_lo  xmm5
+ %define xgft2_hi  xmm4
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+%else					;32-bit code
+ %define xmask0f   xmm4
+ %define xgft1_lo  xmm7
+ %define xgft1_hi  xmm6
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+%endif
 
 align 16
 global gf_2vect_dot_prod_sse:function
 
 func(gf_2vect_dot_prod_sse)
 	FUNC_SAVE
+	SLDR 	len, len_m
 	sub	len, 16
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR 	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR 	dest2_m, dest2
 	mov	dest1, [dest1]
+	SSTR 	dest1_m, dest1
 
 .loop16
 	pxor	xp1, xp1
@@ -163,16 +262,18 @@ func(gf_2vect_dot_prod_sse)
 	xor	vec_i, vec_i
 
 .next_vect
+	SLDR 	src, src_m
 	mov	ptr, [src+vec_i]
 
 	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 	movdqu	xgft1_hi, [tmp+16]	;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8				;64-bit code
 	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-
-	XLDR	x0, [ptr+pos]		;Get next source vector
 	add	tmp, 32
 	add	vec_i, PS
+ %endif
+	XLDR	x0, [ptr+pos]		;Get next source vector
 
 	movdqa	xtmpa, x0		;Keep unshifted copy of src
 	psraw	x0, 4			;Shift to put high nibble into bits 4-0
@@ -184,6 +285,13 @@ func(gf_2vect_dot_prod_sse)
 	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	pxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				;32-bit code
+	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
@@ -192,9 +300,12 @@ func(gf_2vect_dot_prod_sse)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR 	dest1, dest1_m
+	SLDR 	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
 
+	SLDR 	len, len_m
 	add	pos, 16			;Loop on 16 bytes at a time
 	cmp	pos, len
 	jle	.loop16
@@ -224,15 +335,5 @@ section .data
 align 16
 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_2vect_dot_prod_sse, 00,  02,  0062
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_2vect_dot_prod_sse, 00,  04,  0062
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s
new file mode 100644
index 0000000..e182381
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s
@@ -0,0 +1,236 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*9 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	save_reg	r12,  9*16 + 0*8
+	save_reg	r15,  9*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	mov	r12,  [rsp + 9*16 + 0*8]
+	mov	r15,  [rsp + 9*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1 arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp2
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm14
+%define xgft1_lo  xmm13
+%define xgft1_hi  xmm12
+%define xgft2_lo  xmm11
+%define xgft2_hi  xmm10
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xd1     xmm6
+%define xd2     xmm7
+%define xtmpd1  xmm8
+%define xtmpd2  xmm9
+
+
+align 16
+global gf_2vect_mad_avx:function
+
+func(gf_2vect_mad_avx)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+
+	xor	pos, pos
+	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5
+	lea	tmp, [mul_array + vec_i]
+	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+
+	mov	dest2, [dest1+PS]
+	mov	dest1, [dest1]
+
+	XLDR	xtmpd1, [dest1+len]	;backup the last 16 bytes in dest
+	XLDR	xtmpd2, [dest2+len]	;backup the last 16 bytes in dest
+
+.loop16
+	XLDR	xd1, [dest1+pos]		;Get next dest vector
+	XLDR	xd2, [dest2+pos]		;Get next dest vector
+.loop16_overlap:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vpshufb	xtmph1, xgft1_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1	;xd1 += partial
+
+	vpshufb	xtmph2, xgft2_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2	;xd2 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-16
+	vmovdqa	xd1, xtmpd1	;Restore xd1
+	vmovdqa	xd2, xtmpd2	;Restore xd2
+	jmp	.loop16_overlap	;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+;;;       func             core, ver, snum
+slversion gf_2vect_mad_avx, 02,  01,  0204
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s
new file mode 100644
index 0000000..03902f4
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+ %define stack_size  16*9 + 3*8 	; must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	vmovdqa	[rsp+16*0],xmm6
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	vmovdqa	[rsp+16*3],xmm9
+	vmovdqa	[rsp+16*4],xmm10
+	vmovdqa	[rsp+16*5],xmm11
+	vmovdqa	[rsp+16*6],xmm12
+	vmovdqa	[rsp+16*7],xmm13
+	vmovdqa	[rsp+16*8],xmm14
+	save_reg	r12,  9*16 + 0*8
+	save_reg	r15,  9*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	vmovdqa	xmm9, [rsp+16*3]
+	vmovdqa	xmm10, [rsp+16*4]
+	vmovdqa	xmm11, [rsp+16*5]
+	vmovdqa	xmm12, [rsp+16*6]
+	vmovdqa	xmm13, [rsp+16*7]
+	vmovdqa	xmm14, [rsp+16*8]
+	mov	r12,  [rsp + 9*16 + 0*8]
+	mov	r15,  [rsp + 9*16 + 1*8]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1 arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp2
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f   ymm14
+%define xmask0fx  xmm14
+%define xgft1_lo  ymm13
+%define xgft1_hi  ymm12
+%define xgft2_lo  ymm11
+%define xgft2_hi  ymm10
+
+%define x0      ymm0
+%define xtmpa   ymm1
+%define xtmph1  ymm2
+%define xtmpl1  ymm3
+%define xtmph2  ymm4
+%define xtmpl2  ymm5
+%define xd1     ymm6
+%define xd2     ymm7
+%define xtmpd1  ymm8
+%define xtmpd2  ymm9
+
+align 16
+global gf_2vect_mad_avx2:function
+
+func(gf_2vect_mad_avx2)
+	FUNC_SAVE
+	sub	len, 32
+	jl	.return_fail
+	xor	pos, pos
+	mov	tmp.b, 0x0f
+	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
+	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
+
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5
+	lea	tmp, [mul_array + vec_i]
+	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+
+	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
+	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+	mov	dest2, [dest1+PS]	; reuse mul_array
+	mov	dest1, [dest1]
+
+	XLDR	xtmpd1, [dest1+len]	;backup the last 16 bytes in dest
+	XLDR	xtmpd2, [dest2+len]	;backup the last 16 bytes in dest
+
+.loop32
+	XLDR	xd1, [dest1+pos]		;Get next dest vector
+	XLDR	xd2, [dest2+pos]		;Get next dest vector
+.loop32_overlap:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vpshufb	xtmph1, xgft1_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1	;xd1 += partial
+
+	vpshufb	xtmph2, xgft2_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2	;xd2 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+
+	add	pos, 32			;Loop on 32 bytes at a time
+	cmp	pos, len
+	jle	.loop32
+
+	lea	tmp, [len + 32]
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-32
+	vmovdqa	xd1, xtmpd1	;Restore xd1
+	vmovdqa	xd2, xtmpd2	;Restore xd2
+	jmp	.loop32_overlap	;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;;       func              core, ver, snum
+slversion gf_2vect_mad_avx2, 04,  01,  0205
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s
new file mode 100644
index 0000000..2e82c5a
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s
@@ -0,0 +1,239 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*9 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	save_reg	r12,  9*16 + 0*8
+	save_reg	r15,  9*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	mov	r12,  [rsp + 9*16 + 0*8]
+	mov	r15,  [rsp + 9*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp2
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR movdqa
+  %define XSTR movdqa
+ %else
+  %define XLDR movntdqa
+  %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm14
+%define xgft1_lo  xmm13
+%define xgft1_hi  xmm12
+%define xgft2_lo  xmm11
+%define xgft2_hi  xmm10
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xd1     xmm6
+%define xd2     xmm7
+%define xtmpd1  xmm8
+%define xtmpd2  xmm9
+
+
+align 16
+global gf_2vect_mad_sse:function
+func(gf_2vect_mad_sse)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+
+	xor	pos, pos
+	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5
+	lea	tmp, [mul_array + vec_i]
+	movdqu	xgft1_lo,[tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	mov	dest2, [dest1+PS]
+	mov	dest1, [dest1]
+
+	XLDR	xtmpd1, [dest1+len]	;backup the last 16 bytes in dest
+	XLDR	xtmpd2, [dest2+len]	;backup the last 16 bytes in dest
+
+.loop16:
+	XLDR	xd1, [dest1+pos]		;Get next dest vector
+	XLDR	xd2, [dest2+pos]		;Get next dest vector
+.loop16_overlap:
+	XLDR	x0, [src+pos]		;Get next source vector
+	movdqa	xtmph1, xgft1_hi		;Reload const array registers
+	movdqa	xtmpl1, xgft1_lo
+	movdqa	xtmph2, xgft2_hi		;Reload const array registers
+	movdqa	xtmpl2, xgft2_lo
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pxor	xd1, xtmph1
+
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pxor	xd2, xtmph2
+
+	XSTR	[dest1+pos], xd1	;Store result
+	XSTR	[dest2+pos], xd2	;Store result
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-16
+	movdqa	xd1, xtmpd1	;Restore xd1
+	movdqa	xd2, xtmpd2	;Restore xd2
+	jmp	.loop16_overlap	;Do one more overlap pass
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+;;;       func             core, ver, snum
+slversion gf_2vect_mad_sse, 00,  01,  0203
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s
index 6935cb1..33fc198 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -46,6 +46,9 @@
  %define tmp3  r13		; must be saved and restored
  %define tmp4  r12		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define LOG_PS 3
 
@@ -73,6 +76,9 @@
  %define tmp3   r13		; must be saved and restored
  %define tmp4   r14		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  6*16 + 5*8 	; must be an odd multiple of 8
@@ -110,17 +116,97 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	var1
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans   ecx
+ %define trans2  esi
+ %define arg0    trans		;trans and trans2 are for the variables in stack
+ %define arg0_m  arg(0)
+ %define arg1    ebx
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    trans
+ %define arg3_m  arg(3)
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define arg5	 trans2
+ %define tmp	 edx
+ %define tmp2    edi
+ %define tmp3    trans2
+ %define tmp3_m  var(0)
+ %define tmp4    trans2
+ %define tmp4_m  var(1)
+ %define return  eax
+ %macro SLDR 2	;; stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*2		;2 local variables
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*2		;2 local variables
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
 %define len   arg0
 %define vec   arg1
 %define mul_array arg2
 %define	src   arg3
-%define dest1 arg4
+%define dest1  arg4
 %define ptr   arg5
+
 %define vec_i tmp2
 %define dest2 tmp3
 %define dest3 tmp4
 %define pos   return
 
+ %ifidn PS,4				;32-bit code
+	%define  len_m 	arg0_m
+	%define  src_m 	arg3_m
+	%define  dest1_m arg4_m
+	%define  dest2_m tmp3_m
+	%define  dest3_m tmp4_m
+ %endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR vmovdqu
@@ -136,39 +222,62 @@
  %endif
 %endif
 
+%ifidn PS,8			; 64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
 
-[bits 64]
 section .text
 
-%define xmask0f   xmm11
-%define xgft1_lo  xmm10
-%define xgft1_hi  xmm9
-%define xgft2_lo  xmm8
-%define xgft2_hi  xmm7
-%define xgft3_lo  xmm6
-%define xgft3_hi  xmm5
-
-%define x0     xmm0
-%define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
-%define xp3    xmm4
+%ifidn PS,8			;64-bit code
+ %define xmask0f   xmm11
+ %define xgft1_lo  xmm10
+ %define xgft1_hi  xmm9
+ %define xgft2_lo  xmm8
+ %define xgft2_hi  xmm7
+ %define xgft3_lo  xmm6
+ %define xgft3_hi  xmm5
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+ %define xp3    xmm4
+%else
+ %define xmask0f   xmm7
+ %define xgft1_lo  xmm6
+ %define xgft1_hi  xmm5
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+ %define xgft3_lo  xgft1_lo
+ %define xgft3_hi  xgft1_hi
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+ %define xp3    xmm4
+%endif
 
 align 16
 global gf_3vect_dot_prod_avx:function
 func(gf_3vect_dot_prod_avx)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 16
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR	dest2_m, dest2
 	mov	dest3, [dest1+2*PS]
+	SSTR	dest3_m, dest3
 	mov	dest1, [dest1]
-
+	SSTR	dest1_m, dest1
 
 .loop16:
 	vpxor	xp1, xp1
@@ -178,17 +287,19 @@ func(gf_3vect_dot_prod_avx)
 	xor	vec_i, vec_i
 
 .next_vect:
+	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
 
 	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 	vmovdqu	xgft1_hi, [tmp+16]	;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8				; 64-bit code
 	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 	vmovdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-
 	add	tmp, 32
 	add	vec_i, PS
+ %endif
 	XLDR	x0, [ptr+pos]		;Get next source vector
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
@@ -200,11 +311,23 @@ func(gf_3vect_dot_prod_avx)
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				; 32-bit code
+	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	vpxor	xp2, xgft2_hi		;xp2 += partial
 
+ %ifidn PS,4				; 32-bit code
+	sal	vec, 1
+	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+	vmovdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	sar	vec, 1
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
@@ -213,10 +336,14 @@ func(gf_3vect_dot_prod_avx)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
+	SLDR	dest3, dest3_m
 	XSTR	[dest3+pos], xp3
 
+	SLDR	len, len_m
 	add	pos, 16			;Loop on 16 bytes at a time
 	cmp	pos, len
 	jle	.loop16
@@ -246,15 +373,5 @@ section .data
 align 16
 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
-slversion gf_3vect_dot_prod_avx, 02,  03,  0192
+slversion gf_3vect_dot_prod_avx, 02,  05,  0192
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s
index 4ad0153..23c46a7 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -48,7 +48,10 @@
  %define tmp3  r13		; must be saved and restored
  %define tmp4  r12		; must be saved and restored
  %define return rax
- %define PS 8
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
+ %define PS     8
  %define LOG_PS 3
 
  %define func(x) x:
@@ -77,6 +80,9 @@
  %define tmp3   r13		; must be saved and restored
  %define tmp4   r14		; must be saved and restored
  %define return rax
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  6*16 + 5*8 	; must be an odd multiple of 8
@@ -114,17 +120,99 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	var1
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans   ecx
+ %define trans2  esi
+ %define arg0    trans			;trans and trans2 are for the variables in stack
+ %define arg0_m  arg(0)
+ %define arg1    ebx
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    trans
+ %define arg3_m  arg(3)
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define arg5	 trans2
+ %define tmp	 edx
+ %define tmp.w   edx
+ %define tmp.b   dl
+ %define tmp2    edi
+ %define tmp3    trans2
+ %define tmp3_m  var(0)
+ %define tmp4    trans2
+ %define tmp4_m  var(1)
+ %define return  eax
+ %macro SLDR     2			;stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*2		;2 local variables
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*2		;2 local variables
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
 %define len   arg0
 %define vec   arg1
 %define mul_array arg2
 %define	src   arg3
 %define dest1 arg4
 %define ptr   arg5
+
 %define vec_i tmp2
 %define dest2 tmp3
 %define dest3 tmp4
 %define pos   return
 
+%ifidn PS,4				;32-bit code
+ %define  len_m   arg0_m
+ %define  src_m   arg3_m
+ %define  dest1_m arg4_m
+ %define  dest2_m tmp3_m
+ %define  dest3_m tmp4_m
+%endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR vmovdqu
@@ -140,32 +228,53 @@
  %endif
 %endif
 
+%ifidn PS,8				;64-bit code
+ default rel
+ [bits 64]
+%endif
 
-default rel
-
-[bits 64]
 section .text
 
-%define xmask0f   ymm11
-%define xmask0fx  xmm11
-%define xgft1_lo  ymm10
-%define xgft1_hi  ymm9
-%define xgft2_lo  ymm8
-%define xgft2_hi  ymm7
-%define xgft3_lo  ymm6
-%define xgft3_hi  ymm5
-
-%define x0     ymm0
-%define xtmpa  ymm1
-%define xp1    ymm2
-%define xp2    ymm3
-%define xp3    ymm4
+%ifidn PS,8				;64-bit code
+ %define xmask0f   ymm11
+ %define xmask0fx  xmm11
+ %define xgft1_lo  ymm10
+ %define xgft1_hi  ymm9
+ %define xgft2_lo  ymm8
+ %define xgft2_hi  ymm7
+ %define xgft3_lo  ymm6
+ %define xgft3_hi  ymm5
+
+ %define x0     ymm0
+ %define xtmpa  ymm1
+ %define xp1    ymm2
+ %define xp2    ymm3
+ %define xp3    ymm4
+%else
+ %define xmask0f   ymm7
+ %define xmask0fx  xmm7
+ %define xgft1_lo  ymm6
+ %define xgft1_hi  ymm5
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+ %define xgft3_lo  xgft1_lo
+ %define xgft3_hi  xgft1_hi
+
+ %define x0     ymm0
+ %define xtmpa  ymm1
+ %define xp1    ymm2
+ %define xp2    ymm3
+ %define xp3    ymm4
+
+%endif
 
 align 16
 global gf_3vect_dot_prod_avx2:function
 func(gf_3vect_dot_prod_avx2)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 32
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	mov	tmp.b, 0x0f
@@ -173,10 +282,13 @@ func(gf_3vect_dot_prod_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR	dest2_m, dest2
 	mov	dest3, [dest1+2*PS]
+	SSTR	dest3_m, dest3
 	mov	dest1, [dest1]
-
+	SSTR	dest1_m, dest1
 
 .loop32:
 	vpxor	xp1, xp1
@@ -186,25 +298,27 @@ func(gf_3vect_dot_prod_avx2)
 	xor	vec_i, vec_i
 
 .next_vect:
+	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
 
 	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
 	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-
-	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ %ifidn PS,8				; 64-bit code
+	vmovdqu	   xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
 	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
 
-	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+	vmovdqu	   xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
 	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
 
 	add	tmp, 32
 	add	vec_i, PS
+ %endif
 	XLDR	x0, [ptr+pos]		;Get next source vector
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
@@ -216,11 +330,27 @@ func(gf_3vect_dot_prod_avx2)
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
-	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
-	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
-	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
-	vpxor	xp2, xgft2_hi		;xp2 += partial
-
+ %ifidn PS,4				; 32-bit code
+	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+ %endif
+	vpshufb	   xgft2_hi, x0		;Lookup mul table of high nibble
+	vpshufb	   xgft2_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	   xgft2_hi, xgft2_lo	;GF add high and low partials
+	vpxor	   xp2, xgft2_hi		;xp2 += partial
+
+ %ifidn PS,4				; 32-bit code
+	sal     vec, 1
+	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+	sar	vec, 1
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
@@ -229,10 +359,14 @@ func(gf_3vect_dot_prod_avx2)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
+	SLDR	dest3, dest3_m
 	XSTR	[dest3+pos], xp3
 
+	SLDR	len, len_m
 	add	pos, 32			;Loop on 32 bytes at a time
 	cmp	pos, len
 	jle	.loop32
@@ -259,15 +393,5 @@ endproc_frame
 
 section .data
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                   core, ver, snum
-slversion gf_3vect_dot_prod_avx2, 04,  03,  0197
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_3vect_dot_prod_avx2, 04,  05,  0197
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s
index 925fd34..a082fb8 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -46,6 +46,9 @@
  %define tmp3  r13		; must be saved and restored
  %define tmp4  r12		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define LOG_PS 3
 
@@ -73,6 +76,9 @@
  %define tmp3   r13		; must be saved and restored
  %define tmp4   r14		; must be saved and restored
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  6*16 + 5*8 	; must be an odd multiple of 8
@@ -110,17 +116,97 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	var1
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans	 ecx
+ %define trans2  esi
+ %define arg0	 trans		;trans and trans2 are for the variables in stack
+ %define arg0_m	 arg(0)
+ %define arg1	 ebx
+ %define arg2	 arg2_m
+ %define arg2_m	 arg(2)
+ %define arg3	 trans
+ %define arg3_m	 arg(3)
+ %define arg4	 trans
+ %define arg4_m	 arg(4)
+ %define arg5	 trans2
+ %define tmp	 edx
+ %define tmp2	 edi
+ %define tmp3	 trans2
+ %define tmp3_m	 var(0)
+ %define tmp4	 trans2
+ %define tmp4_m	 var(1)
+ %define return	 eax
+ %macro SLDR 2	;; stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*2		;2 local variables
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*2		;2 local variables
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
 %define len   arg0
 %define vec   arg1
 %define mul_array arg2
 %define	src   arg3
-%define dest1 arg4
+%define dest1  arg4
 %define ptr   arg5
+
 %define vec_i tmp2
 %define dest2 tmp3
 %define dest3 tmp4
 %define pos   return
 
+ %ifidn PS,4				;32-bit code
+	%define  len_m 	arg0_m
+	%define  src_m 	arg3_m
+	%define  dest1_m arg4_m
+	%define  dest2_m tmp3_m
+	%define  dest3_m tmp4_m
+ %endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR movdqu
@@ -136,39 +222,62 @@
  %endif
 %endif
 
+%ifidn PS,8				; 64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
 
-[bits 64]
 section .text
 
-%define xmask0f   xmm11
-%define xgft1_lo  xmm10
-%define xgft1_hi  xmm9
-%define xgft2_lo  xmm8
-%define xgft2_hi  xmm7
-%define xgft3_lo  xmm6
-%define xgft3_hi  xmm5
-
-%define x0     xmm0
-%define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
-%define xp3    xmm4
+%ifidn PS,8				;64-bit code
+ %define xmask0f   xmm11
+ %define xgft1_lo  xmm2
+ %define xgft1_hi  xmm3
+ %define xgft2_lo  xmm4
+ %define xgft2_hi  xmm7
+ %define xgft3_lo  xmm6
+ %define xgft3_hi  xmm5
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm10
+ %define xp2    xmm9
+ %define xp3    xmm8
+%else
+ %define xmask0f   xmm7
+ %define xgft1_lo  xmm6
+ %define xgft1_hi  xmm5
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+ %define xgft3_lo  xgft1_lo
+ %define xgft3_hi  xgft1_hi
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+ %define xp3    xmm4
+%endif
 
 align 16
 global gf_3vect_dot_prod_sse:function
 func(gf_3vect_dot_prod_sse)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 16
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR	dest2_m, dest2
 	mov	dest3, [dest1+2*PS]
+	SSTR	dest3_m, dest3
 	mov	dest1, [dest1]
-
+	SSTR	dest1_m, dest1
 
 .loop16:
 	pxor	xp1, xp1
@@ -178,17 +287,19 @@ func(gf_3vect_dot_prod_sse)
 	xor	vec_i, vec_i
 
 .next_vect:
+	SLDR src, src_m
 	mov	ptr, [src+vec_i]
 
 	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 	movdqu	xgft1_hi, [tmp+16]	;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8				;64-bit code
 	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 	movdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 	movdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-
 	add	tmp, 32
 	add	vec_i, PS
+ %endif
 	XLDR	x0, [ptr+pos]		;Get next source vector
 
 	movdqa	xtmpa, x0		;Keep unshifted copy of src
@@ -201,11 +312,23 @@ func(gf_3vect_dot_prod_sse)
 	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	pxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				;32-bit code
+	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
 	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	pxor	xp2, xgft2_hi		;xp2 += partial
 
+ %ifidn PS,4				;32-bit code
+	sal	vec, 1
+	movdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+	movdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	sar 	vec, 1
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
@@ -214,10 +337,14 @@ func(gf_3vect_dot_prod_sse)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
+	SLDR	dest3, dest3_m
 	XSTR	[dest3+pos], xp3
 
+	SLDR	len, len_m
 	add	pos, 16			;Loop on 16 bytes at a time
 	cmp	pos, len
 	jle	.loop16
@@ -247,15 +374,5 @@ section .data
 align 16
 mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_3vect_dot_prod_sse, 00,  03,  0063
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_3vect_dot_prod_sse, 00,  06,  0063
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s
new file mode 100644
index 0000000..ed25d6a
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s
@@ -0,0 +1,288 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	vmovdqa	[rsp+16*0],xmm6
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	vmovdqa	[rsp+16*3],xmm9
+	vmovdqa	[rsp+16*4],xmm10
+	vmovdqa	[rsp+16*5],xmm11
+	vmovdqa	[rsp+16*6],xmm12
+	vmovdqa	[rsp+16*7],xmm13
+	vmovdqa	[rsp+16*8],xmm14
+	vmovdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r15,  10*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	vmovdqa	xmm9, [rsp+16*3]
+	vmovdqa	xmm10, [rsp+16*4]
+	vmovdqa	xmm11, [rsp+16*5]
+	vmovdqa	xmm12, [rsp+16*6]
+	vmovdqa	xmm13, [rsp+16*7]
+	vmovdqa	xmm14, [rsp+16*8]
+	vmovdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r15,  [rsp + 10*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1 arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft1_lo  xmm14
+%define xgft1_hi  xmm13
+%define xgft2_lo  xmm12
+%define xgft2_hi  xmm11
+%define xgft3_lo  xmm10
+%define xgft3_hi  xmm9
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xtmph3  xmm6
+%define xtmpl3  xmm7
+%define xd1     xmm8
+%define xd2     xtmpl1
+%define xd3     xtmph1
+
+align 16
+global gf_3vect_mad_avx:function
+func(gf_3vect_mad_avx)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+	xor	pos, pos
+	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5
+	lea	tmp, [mul_array + vec_i]
+	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	vmovdqu	xgft3_hi, [tmp+2*vec+16]; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	mov	dest2, [dest1+PS]	; reuse mul_array
+	mov	dest3, [dest1+2*PS]	; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+	XLDR	xd1, [dest1+pos]		;Get next dest vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xgft1_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1		;xd1 += partial
+
+	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest vector
+
+	; dest2
+	vpshufb	xtmph2, xgft2_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xgft2_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2		;xd2 += partial
+
+	; dest3
+	vpshufb	xtmph3, xgft3_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xgft3_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xtmpl3		;GF add high and low partials
+	vpxor	xd3, xd3, xtmph3		;xd3 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+	XSTR	[dest3+pos], xd3
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp, len		;Overlapped offset length-16
+	XLDR	x0, [src+tmp]		;Get next source vector
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest vector
+
+	sub	len, pos
+
+	movdqa	xtmph3, [constip16]	;Load const of i + 16
+	vpinsrb	xtmpl3, xtmpl3, len.w, 15
+	vpshufb	xtmpl3, xtmpl3, xmask0f		;Broadcast len to all bytes
+	vpcmpgtb	xtmpl3, xtmpl3, xtmph3
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
+	vpand	xgft1_hi, xgft1_hi, xtmpl3
+	vpxor	xd1, xd1, xgft1_hi
+
+	; dest2
+	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
+	vpand	xgft2_hi, xgft2_hi, xtmpl3
+	vpxor	xd2, xd2, xgft2_hi
+
+	; dest3
+	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xgft3_hi, xgft3_hi, xgft3_lo	;GF add high and low partials
+	vpand	xgft3_hi, xgft3_hi, xtmpl3
+	vpxor	xd3, xd3, xgft3_hi
+
+	XSTR	[dest1+tmp], xd1
+	XSTR	[dest2+tmp], xd2
+	XSTR	[dest3+tmp], xd3
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_3vect_mad_avx, 02,  01,  0207
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s
new file mode 100644
index 0000000..d0b9272
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s
@@ -0,0 +1,317 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg0.w ecx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+ %define arg4   r12 		; must be saved, loaded and restored
+ %define arg5   r15 		; must be saved and restored
+
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	vmovdqa	[rsp+16*0],xmm6
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	vmovdqa	[rsp+16*3],xmm9
+	vmovdqa	[rsp+16*4],xmm10
+	vmovdqa	[rsp+16*5],xmm11
+	vmovdqa	[rsp+16*6],xmm12
+	vmovdqa	[rsp+16*7],xmm13
+	vmovdqa	[rsp+16*8],xmm14
+	vmovdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r15,  10*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	vmovdqa	xmm9, [rsp+16*3]
+	vmovdqa	xmm10, [rsp+16*4]
+	vmovdqa	xmm11, [rsp+16*5]
+	vmovdqa	xmm12, [rsp+16*6]
+	vmovdqa	xmm13, [rsp+16*7]
+	vmovdqa	xmm14, [rsp+16*8]
+	vmovdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r15,  [rsp + 10*16 + 1*8]
+	add	rsp, stack_size
+ %endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1 arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f   ymm15
+%define xmask0fx  xmm15
+%define xgft1_lo  ymm14
+%define xgft1_hi  ymm13
+%define xgft2_lo  ymm12
+%define xgft3_lo  ymm11
+
+%define x0      ymm0
+%define xtmpa   ymm1
+%define xtmph1  ymm2
+%define xtmpl1  ymm3
+%define xtmph2  ymm4
+%define xtmpl2  ymm5
+%define xtmpl2x xmm5
+%define xtmph3  ymm6
+%define xtmpl3  ymm7
+%define xtmpl3x xmm7
+%define xd1     ymm8
+%define xd2     ymm9
+%define xd3     ymm10
+
+align 16
+global gf_3vect_mad_avx2:function
+func(gf_3vect_mad_avx2)
+	FUNC_SAVE
+	sub	len, 32
+	jl	.return_fail
+	xor	pos, pos
+	mov	tmp.b, 0x0f
+	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
+	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
+
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5
+	lea	tmp, [mul_array + vec_i]
+
+	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
+
+	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+					; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	mov	dest2, [dest1+PS]	; reuse mul_array
+	mov	dest3, [dest1+2*PS]	; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop32:
+	XLDR	x0, [src+pos]		;Get next source vector
+	XLDR	xd1, [dest1+pos]		;Get next dest vector
+	XLDR	xd2, [dest2+pos]		;Get next dest vector
+	XLDR	xd3, [dest3+pos]		;Get next dest vector
+	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+
+	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xgft1_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1		;xd1 += partial
+
+	; dest2
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmpl2		;GF add high and low partials
+	vpxor	xd2, xtmph2		;xd2 += partial
+
+	; dest3
+	vpshufb	xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmpl3		;GF add high and low partials
+	vpxor	xd3, xtmph3		;xd3 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+	XSTR	[dest3+pos], xd3
+
+	add	pos, 32			;Loop on 32 bytes at a time
+	cmp	pos, len
+	jle	.loop32
+
+	lea	tmp, [len + 32]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan32:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp.b, 0x1f
+	vpinsrb	xtmpl2x, xtmpl2x, tmp.w, 0
+	vpbroadcastb xtmpl2, xtmpl2x	;Construct mask 0x1f1f1f...
+
+	mov	tmp, len		;Overlapped offset length-32
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;Get next dest vector
+
+	sub	len, pos
+
+	vmovdqa	xtmph3, [constip32]	;Load const of i + 32
+	vpinsrb	xtmpl3x, xtmpl3x, len.w, 15
+	vinserti128	xtmpl3, xtmpl3, xtmpl3x, 1 ;swapped to xtmpl3x | xtmpl3x
+	vpshufb	xtmpl3, xtmpl3, xtmpl2	;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
+	vpcmpgtb	xtmpl3, xtmpl3, xtmph3
+
+	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+
+	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xgft1_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpand	xtmph1, xtmph1, xtmpl3
+	vpxor	xd1, xd1, xtmph1		;xd1 += partial
+
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xgft2_lo	;GF add high and low partials
+	vpand	xtmph2, xtmph2, xtmpl3
+	vpxor	xd2, xd2, xtmph2		;xd2 += partial
+
+	; dest3
+	vpshufb	xtmph3, xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xgft3_lo	;GF add high and low partials
+	vpand	xtmph3, xtmph3, xtmpl3
+	vpxor	xd3, xd3, xtmph3		;xd3 += partial
+
+	XSTR	[dest1+tmp], xd1
+	XSTR	[dest2+tmp], xd2
+	XSTR	[dest3+tmp], xd3
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 32
+constip32:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+	ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+;;;       func              core, ver, snum
+slversion gf_3vect_mad_avx2, 04,  01,  0208
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s
new file mode 100644
index 0000000..a06eb3d
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s
@@ -0,0 +1,298 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r15,  10*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r15,  [rsp + 10*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR movdqa
+  %define XSTR movdqa
+ %else
+  %define XLDR movntdqa
+  %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft1_lo  xmm14
+%define xgft1_hi  xmm13
+%define xgft2_lo  xmm12
+%define xgft2_hi  xmm11
+%define xgft3_lo  xmm10
+%define xgft3_hi  xmm9
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xtmph3  xmm6
+%define xtmpl3  xmm7
+%define xd1     xmm8
+%define xd2     xtmpl1
+%define xd3     xtmph1
+
+align 16
+global gf_3vect_mad_sse:function
+func(gf_3vect_mad_sse)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+	xor	pos, pos
+	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5
+	lea	tmp, [mul_array + vec_i]
+
+	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	movdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	movdqu	xgft3_hi, [tmp+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	mov	dest2, [dest1+PS]	; reuse mul_array
+	mov	dest3, [dest1+2*PS]	; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+	movdqa	xtmph1, xgft1_hi	;Reload const array registers
+	movdqa	xtmpl1, xgft1_lo
+	movdqa	xtmph2, xgft2_hi	;Reload const array registers
+	movdqa	xtmpl2, xgft2_lo
+	movdqa	xtmph3, xgft3_hi	;Reload const array registers
+	movdqa	xtmpl3, xgft3_lo
+
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	; dest1
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pxor	xd1, xtmph1
+
+	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest vector
+
+	; dest2
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pxor	xd2, xtmph2
+
+	; dest3
+	pshufb	xtmph3, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph3, xtmpl3		;GF add high and low partials
+	pxor	xd3, xtmph3
+
+	XSTR	[dest1+pos], xd1	;Store result
+	XSTR	[dest2+pos], xd2	;Store result
+	XSTR	[dest3+pos], xd3	;Store result
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp, len		;Overlapped offset length-16
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest vector
+
+	sub	len, pos
+
+	movdqa	xtmph3, [constip16]	;Load const of i + 16
+	pinsrb	xtmpl3, len.w, 15
+	pshufb	xtmpl3, xmask0f		;Broadcast len to all bytes
+	pcmpgtb	xtmpl3, xtmph3
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	; dest1
+	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
+	pand	xgft1_hi, xtmpl3
+	pxor	xd1, xgft1_hi
+
+	; dest2
+	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
+	pand	xgft2_hi, xtmpl3
+	pxor	xd2, xgft2_hi
+
+	; dest3
+	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
+	pand	xgft3_hi, xtmpl3
+	pxor	xd3, xgft3_hi
+
+	XSTR	[dest1+tmp], xd1	;Store result
+	XSTR	[dest2+tmp], xd2	;Store result
+	XSTR	[dest3+tmp], xd3	;Store result
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_3vect_mad_sse, 00,  01,  0206
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s
index 6197f01..9863012 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_4vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -48,7 +48,10 @@
  %define tmp5  r14		; must be saved and restored
  %define tmp6  r15		; must be saved and restored
  %define return rax
- %define PS 8
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
+ %define PS     8
  %define LOG_PS 3
 
  %define func(x) x:
@@ -81,6 +84,9 @@
  %define tmp5   rdi		; must be saved and restored
  %define tmp6   rsi		; must be saved and restored
  %define return rax
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  9*16 + 7*8		; must be an odd multiple of 8
@@ -128,6 +134,82 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	var1
+;;;	var2
+;;;	var3
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS     4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans	 ecx
+ %define trans2  esi
+ %define arg0	 trans		;trans and trans2 are for the variables in stack
+ %define arg0_m	 arg(0)
+ %define arg1	 ebx
+ %define arg2	 arg2_m
+ %define arg2_m	 arg(2)
+ %define arg3	 trans
+ %define arg3_m	 arg(3)
+ %define arg4	 trans
+ %define arg4_m	 arg(4)
+ %define arg5	 trans2
+ %define tmp	 edx
+ %define tmp2	 edi
+ %define tmp3	 trans2
+ %define tmp3_m	 var(0)
+ %define tmp4	 trans2
+ %define tmp4_m	 var(1)
+ %define tmp5	 trans2
+ %define tmp5_m	 var(2)
+ %define tmp6	 trans2
+ %define tmp6_m	 var(3)
+ %define return	 eax
+ %macro SLDR 2				;stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*4		;4 local variables
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*4		;4 local variables
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
 
 %define len    arg0
 %define vec    arg1
@@ -140,7 +222,17 @@
 %define dest3  tmp4
 %define dest4  tmp5
 %define vskip3 tmp6
-%define pos   return
+%define pos    return
+
+ %ifidn PS,4				;32-bit code
+	%define  len_m 	arg0_m
+	%define  src_m 	arg3_m
+	%define  dest1_m arg4_m
+	%define  dest2_m tmp3_m
+	%define  dest3_m tmp4_m
+	%define  dest4_m tmp5_m
+	%define  vskip3_m tmp6_m
+ %endif
 
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
@@ -157,46 +249,73 @@
  %endif
 %endif
 
+%ifidn PS,8				; 64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
 
-[bits 64]
 section .text
 
-%define xmask0f   xmm14
-%define xgft1_lo  xmm13
-%define xgft1_hi  xmm12
-%define xgft2_lo  xmm11
-%define xgft2_hi  xmm10
-%define xgft3_lo  xmm9
-%define xgft3_hi  xmm8
-%define xgft4_lo  xmm7
-%define xgft4_hi  xmm6
-
-
-%define x0     xmm0
-%define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
-%define xp3    xmm4
-%define xp4    xmm5
-
+%ifidn PS,8				;64-bit code
+ %define xmask0f   xmm14
+ %define xgft1_lo  xmm13
+ %define xgft1_hi  xmm12
+ %define xgft2_lo  xmm11
+ %define xgft2_hi  xmm10
+ %define xgft3_lo  xmm9
+ %define xgft3_hi  xmm8
+ %define xgft4_lo  xmm7
+ %define xgft4_hi  xmm6
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+ %define xp3    xmm4
+ %define xp4    xmm5
+%else
+ %define xmm_trans xmm7			;reuse xmask0f and xgft1_lo
+ %define xmask0f   xmm_trans
+ %define xgft1_lo  xmm_trans
+ %define xgft1_hi  xmm6
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+ %define xgft3_lo  xgft1_lo
+ %define xgft3_hi  xgft1_hi
+ %define xgft4_lo  xgft1_lo
+ %define xgft4_hi  xgft1_hi
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+ %define xp3    xmm4
+ %define xp4    xmm5
+%endif
 align 16
 global gf_4vect_dot_prod_avx:function
 func(gf_4vect_dot_prod_avx)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 16
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
 	mov	vskip3, vec
 	imul	vskip3, 96
+	SSTR	vskip3_m, vskip3
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR	dest2_m, dest2
 	mov	dest3, [dest1+2*PS]
+	SSTR	dest3_m, dest3
 	mov	dest4, [dest1+3*PS]
+	SSTR	dest4_m, dest4
 	mov	dest1, [dest1]
-
+	SSTR	dest1_m, dest1
 
 .loop16:
 	vpxor	xp1, xp1
@@ -207,41 +326,70 @@ func(gf_4vect_dot_prod_avx)
 	xor	vec_i, vec_i
 
 .next_vect:
+	SLDR 	src, src_m
 	mov	ptr, [src+vec_i]
 
+ %ifidn PS,8				;64-bit code
 	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 	vmovdqu	xgft1_hi, [tmp+16]		;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 	vmovdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-	vmovdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+	vmovdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Dx{00}, Dx{10}, ..., Dx{f0}
 
-	XLDR	x0, [ptr+pos]		;Get next source vector
+	XLDR	x0, 	[ptr+pos]	;Get next source vector
 	add	tmp, 32
 	add	vec_i, PS
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+ %else					;32-bit code
+	XLDR	x0, [ptr+pos]		;Get next source vector
+	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
 
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+	vmovdqu	xgft1_hi, [tmp+16]		;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+ %endif
 
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				;32-bit code
+	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	vpxor	xp2, xgft2_hi		;xp2 += partial
 
+ %ifidn PS,4				;32-bit code
+	sal	vec, 1
+	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+	vmovdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	sar 	vec, 1
+ %endif
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
 	vpxor	xp3, xgft3_hi		;xp3 += partial
 
+ %ifidn PS,4				;32-bit code
+	SLDR	vskip3, vskip3_m
+	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+	vmovdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Dx{00}, Dx{10}, ..., Dx{f0}
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	vpshufb	xgft4_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft4_hi, xgft4_lo	;GF add high and low partials
@@ -250,11 +398,16 @@ func(gf_4vect_dot_prod_avx)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
+	SLDR	dest3, dest3_m
 	XSTR	[dest3+pos], xp3
+	SLDR	dest4, dest4_m
 	XSTR	[dest4+pos], xp4
 
+	SLDR	len, len_m
 	add	pos, 16			;Loop on 16 bytes at a time
 	cmp	pos, len
 	jle	.loop16
@@ -284,15 +437,5 @@ section .data
 align 16
 mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_4vect_dot_prod_avx, 00,  02,  0064
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_4vect_dot_prod_avx, 02,  05,  0193
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s
index e4267e2..95aa8eb 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -50,7 +50,10 @@
  %define tmp5  r14		; must be saved and restored
  %define tmp6  r15		; must be saved and restored
  %define return rax
- %define PS 8
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
+ %define PS     8
  %define LOG_PS 3
 
  %define func(x) x:
@@ -85,6 +88,9 @@
  %define tmp5   rdi		; must be saved and restored
  %define tmp6   rsi		; must be saved and restored
  %define return rax
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  9*16 + 7*8		; must be an odd multiple of 8
@@ -132,6 +138,84 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	var1
+;;;	var2
+;;;	var3
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS     4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans	 ecx
+ %define trans2  esi
+ %define arg0	 trans		;trans and trans2 are for the variables in stack
+ %define arg0_m	 arg(0)
+ %define arg1	 ebx
+ %define arg2	 arg2_m
+ %define arg2_m	 arg(2)
+ %define arg3	 trans
+ %define arg3_m	 arg(3)
+ %define arg4	 trans
+ %define arg4_m	 arg(4)
+ %define arg5	 trans2
+ %define tmp	 edx
+ %define tmp.w   edx
+ %define tmp.b   dl
+ %define tmp2	 edi
+ %define tmp3	 trans2
+ %define tmp3_m	 var(0)
+ %define tmp4	 trans2
+ %define tmp4_m	 var(1)
+ %define tmp5	 trans2
+ %define tmp5_m	 var(2)
+ %define tmp6	 trans2
+ %define tmp6_m	 var(3)
+ %define return	 eax
+ %macro SLDR 2				;stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*4		;4 local variables
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*4		;4 local variables
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
 
 %define len    arg0
 %define vec    arg1
@@ -144,7 +228,17 @@
 %define dest3  tmp4
 %define dest4  tmp5
 %define vskip3 tmp6
-%define pos   return
+%define pos    return
+
+ %ifidn PS,4				;32-bit code
+	%define  len_m 	arg0_m
+	%define  src_m 	arg3_m
+	%define  dest1_m arg4_m
+	%define  dest2_m tmp3_m
+	%define  dest3_m tmp4_m
+	%define  dest4_m tmp5_m
+	%define  vskip3_m tmp6_m
+ %endif
 
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
@@ -161,36 +255,59 @@
  %endif
 %endif
 
+%ifidn PS,8				;64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
 
-[bits 64]
 section .text
 
-%define xmask0f   ymm14
-%define xmask0fx  xmm14
-%define xgft1_lo  ymm13
-%define xgft1_hi  ymm12
-%define xgft2_lo  ymm11
-%define xgft2_hi  ymm10
-%define xgft3_lo  ymm9
-%define xgft3_hi  ymm8
-%define xgft4_lo  ymm7
-%define xgft4_hi  ymm6
-
-
-%define x0     ymm0
-%define xtmpa  ymm1
-%define xp1    ymm2
-%define xp2    ymm3
-%define xp3    ymm4
-%define xp4    ymm5
-
+%ifidn PS,8				;64-bit code
+ %define xmask0f   ymm14
+ %define xmask0fx  xmm14
+ %define xgft1_lo  ymm13
+ %define xgft1_hi  ymm12
+ %define xgft2_lo  ymm11
+ %define xgft2_hi  ymm10
+ %define xgft3_lo  ymm9
+ %define xgft3_hi  ymm8
+ %define xgft4_lo  ymm7
+ %define xgft4_hi  ymm6
+
+ %define x0     ymm0
+ %define xtmpa  ymm1
+ %define xp1    ymm2
+ %define xp2    ymm3
+ %define xp3    ymm4
+ %define xp4    ymm5
+%else
+ %define ymm_trans ymm7			;reuse xmask0f and xgft1_hi
+ %define xmask0f   ymm_trans
+ %define xmask0fx  xmm7
+ %define xgft1_lo  ymm6
+ %define xgft1_hi  ymm_trans
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+ %define xgft3_lo  xgft1_lo
+ %define xgft3_hi  xgft1_hi
+ %define xgft4_lo  xgft1_lo
+ %define xgft4_hi  xgft1_hi
+
+ %define x0     ymm0
+ %define xtmpa  ymm1
+ %define xp1    ymm2
+ %define xp2    ymm3
+ %define xp3    ymm4
+ %define xp4    ymm5
+%endif
 align 16
 global gf_4vect_dot_prod_avx2:function
 func(gf_4vect_dot_prod_avx2)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 32
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	mov	tmp.b, 0x0f
@@ -198,12 +315,17 @@ func(gf_4vect_dot_prod_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	mov	vskip3, vec
 	imul	vskip3, 96
+	SSTR	vskip3_m, vskip3
 	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, dest1_m
 	mov	dest2, [dest1+PS]
+	SSTR	dest2_m, dest2
 	mov	dest3, [dest1+2*PS]
+	SSTR	dest3_m, dest3
 	mov	dest4, [dest1+3*PS]
+	SSTR	dest4_m, dest4
 	mov	dest1, [dest1]
-
+	SSTR	dest1_m, dest1
 
 .loop32:
 	vpxor	xp1, xp1
@@ -214,10 +336,12 @@ func(gf_4vect_dot_prod_avx2)
 	xor	vec_i, vec_i
 
 .next_vect:
+	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	vec_i, PS
 
+	add	vec_i, PS
+ %ifidn PS,8				;64-bit code
 	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
@@ -230,30 +354,64 @@ func(gf_4vect_dot_prod_avx2)
 						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+						;     "     Dx{00}, Dx{10}, ..., Dx{f0}
 
 	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
 	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
 	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
 	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
 	add	tmp, 32
+ %else					;32-bit code
+	mov	cl, 0x0f		;use ecx as a temp variable
+	vpinsrb	xmask0fx, xmask0fx, ecx, 0
+	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
+
+	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+	vperm2i128 xtmpa, xgft4_lo, x0, 0x30 	;swap xtmpa from 1lo|2lo to 1lo|2hi
+	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from    1hi|2hi to 1hi|2lo
+
+	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+						;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ %endif
 
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				; 32-bit code
+	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+ %endif
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	vpxor	xp2, xgft2_hi		;xp2 += partial
 
+ %ifidn PS,4				; 32-bit code
+	sal     vec, 1
+	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	sar	vec, 1
+ %endif
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
 	vpxor	xp3, xgft3_hi		;xp3 += partial
 
+ %ifidn PS,4				; 32-bit code
+	SLDR	vskip3, vskip3_m
+	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+						;     "     DX{00}, Dx{10}, ..., Dx{f0}
+	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	add	tmp, 32
+ %endif
 	vpshufb	xgft4_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft4_hi, xgft4_lo	;GF add high and low partials
@@ -262,11 +420,16 @@ func(gf_4vect_dot_prod_avx2)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
+	SLDR	dest3, dest3_m
 	XSTR	[dest3+pos], xp3
+	SLDR	dest4, dest4_m
 	XSTR	[dest4+pos], xp4
 
+	SLDR	len, len_m
 	add	pos, 32			;Loop on 32 bytes at a time
 	cmp	pos, len
 	jle	.loop32
@@ -293,15 +456,5 @@ endproc_frame
 
 section .data
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                   core, ver, snum
-slversion gf_4vect_dot_prod_avx2, 04,  03,  0064
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_4vect_dot_prod_avx2, 04,  05,  0198
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s
index 920a8da..2867cca 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -48,7 +48,10 @@
  %define tmp5  r14		; must be saved and restored
  %define tmp6  r15		; must be saved and restored
  %define return rax
- %define PS 8
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
+ %define PS     8
  %define LOG_PS 3
 
  %define func(x) x:
@@ -81,6 +84,9 @@
  %define tmp5   rdi		; must be saved and restored
  %define tmp6   rsi		; must be saved and restored
  %define return rax
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
  %define PS     8
  %define LOG_PS 3
  %define stack_size  9*16 + 7*8		; must be an odd multiple of 8
@@ -128,6 +134,82 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	var0
+;;;	var1
+;;;	var2
+;;;	var3
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS     4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans	 ecx
+ %define trans2  esi
+ %define arg0	 trans		;trans and trans2 are for the variables in stack
+ %define arg0_m	 arg(0)
+ %define arg1	 ebx
+ %define arg2	 arg2_m
+ %define arg2_m	 arg(2)
+ %define arg3	 trans
+ %define arg3_m	 arg(3)
+ %define arg4	 trans
+ %define arg4_m	 arg(4)
+ %define arg5	 trans2
+ %define tmp	 edx
+ %define tmp2	 edi
+ %define tmp3	 trans2
+ %define tmp3_m	 var(0)
+ %define tmp4	 trans2
+ %define tmp4_m	 var(1)
+ %define tmp5	 trans2
+ %define tmp5_m	 var(2)
+ %define tmp6	 trans2
+ %define tmp6_m	 var(3)
+ %define return	 eax
+ %macro SLDR 2				;stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	sub	esp, PS*4		;4 local variables
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	add	esp, PS*4		;4 local variables
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
 
 %define len    arg0
 %define vec    arg1
@@ -140,7 +222,17 @@
 %define dest3  tmp4
 %define dest4  tmp5
 %define vskip3 tmp6
-%define pos   return
+%define pos    return
+
+ %ifidn PS,4				;32-bit code
+	%define  len_m 	arg0_m
+	%define  src_m 	arg3_m
+	%define  dest1_m arg4_m
+	%define  dest2_m tmp3_m
+	%define  dest3_m tmp4_m
+	%define  dest4_m tmp5_m
+	%define  vskip3_m tmp6_m
+ %endif
 
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
@@ -157,46 +249,73 @@
  %endif
 %endif
 
+%ifidn PS,8				; 64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
 
-[bits 64]
 section .text
 
-%define xmask0f   xmm14
-%define xgft1_lo  xmm13
-%define xgft1_hi  xmm12
-%define xgft2_lo  xmm11
-%define xgft2_hi  xmm10
-%define xgft3_lo  xmm9
-%define xgft3_hi  xmm8
-%define xgft4_lo  xmm7
-%define xgft4_hi  xmm6
-
-
-%define x0     xmm0
-%define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
-%define xp3    xmm4
-%define xp4    xmm5
-
+%ifidn PS,8				;64-bit code
+ %define xmask0f   xmm14
+ %define xgft1_lo  xmm2
+ %define xgft1_hi  xmm3
+ %define xgft2_lo  xmm11
+ %define xgft2_hi  xmm4
+ %define xgft3_lo  xmm9
+ %define xgft3_hi  xmm5
+ %define xgft4_lo  xmm7
+ %define xgft4_hi  xmm6
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm8
+ %define xp2    xmm10
+ %define xp3    xmm12
+ %define xp4    xmm13
+%else
+ %define xmm_trans xmm7			;reuse xmask0f and xgft1_lo
+ %define xmask0f   xmm_trans
+ %define xgft1_lo  xmm_trans
+ %define xgft1_hi  xmm6
+ %define xgft2_lo  xgft1_lo
+ %define xgft2_hi  xgft1_hi
+ %define xgft3_lo  xgft1_lo
+ %define xgft3_hi  xgft1_hi
+ %define xgft4_lo  xgft1_lo
+ %define xgft4_hi  xgft1_hi
+
+ %define x0     xmm0
+ %define xtmpa  xmm1
+ %define xp1    xmm2
+ %define xp2    xmm3
+ %define xp3    xmm4
+ %define xp4    xmm5
+%endif
 align 16
 global gf_4vect_dot_prod_sse:function
 func(gf_4vect_dot_prod_sse)
 	FUNC_SAVE
+	SLDR	len, len_m
 	sub	len, 16
+	SSTR	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
-	mov	vskip3, vec
-	imul	vskip3, 96
-	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
-	mov	dest2, [dest1+PS]
-	mov	dest3, [dest1+2*PS]
-	mov	dest4, [dest1+3*PS]
-	mov	dest1, [dest1]
-
+	mov	vskip3,  vec
+	imul	vskip3,  96
+	SSTR	vskip3_m, vskip3
+	sal	vec, 	 LOG_PS		;vec *= PS. Make vec_i count by PS
+	SLDR	dest1, 	 dest1_m
+	mov	dest2, 	 [dest1+PS]
+	SSTR	dest2_m, dest2
+	mov	dest3, 	 [dest1+2*PS]
+	SSTR	dest3_m, dest3
+	mov	dest4, 	 [dest1+3*PS]
+	SSTR	dest4_m, dest4
+	mov	dest1, 	 [dest1]
+	SSTR	dest1_m, dest1
 
 .loop16:
 	pxor	xp1, xp1
@@ -207,41 +326,72 @@ func(gf_4vect_dot_prod_sse)
 	xor	vec_i, vec_i
 
 .next_vect:
+	SLDR 	src, src_m
 	mov	ptr, [src+vec_i]
 
+ %ifidn PS,8				;64-bit code
 	movdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
 	movdqu	xgft1_hi, [tmp+16]		;     "     Ax{00}, Ax{10}, ..., Ax{f0}
 	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
 	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
 	movdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 	movdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-	movdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+	movdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Dx{00}, Dx{10}, ..., Dx{f0}
 
-	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	tmp, 32
-	add	vec_i, PS
+	XLDR	x0, 	[ptr+pos]	;Get next source vector
+	add	tmp, 	32
+	add	vec_i, 	PS
 
 	movdqa	xtmpa, x0		;Keep unshifted copy of src
 	psraw	x0, 4			;Shift to put high nibble into bits 4-0
 	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, 	xmask0f		;Mask low src nibble in bits 4-0
+ %else					;32-bit code
+	XLDR	x0, 	 [ptr+pos]	;Get next source vector
+	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+
+	movdqa	xtmpa, 	x0		;Keep unshifted copy of src
+	psraw	x0, 	4		;Shift to put high nibble into bits 4-0
+	pand	x0, 	xmask0f		;Mask high src nibble in bits 4-0
 	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
 
+	movdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+	movdqu	xgft1_hi, [tmp+16]		;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+ %endif
+
 	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	pxor	xp1, xgft1_hi		;xp1 += partial
 
+ %ifidn PS,4				;32-bit code
+	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
 	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	pxor	xp2, xgft2_hi		;xp2 += partial
 
+ %ifidn PS,4				;32-bit code
+	sal	vec, 1
+	movdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+	movdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	sar 	vec, 1
+ %endif
 	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
 	pxor	xp3, xgft3_hi		;xp3 += partial
 
+ %ifidn PS,4				;32-bit code
+	SLDR	vskip3, vskip3_m
+	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+	movdqu	xgft4_hi, [tmp+vskip3+16]	;     "     Dx{00}, Dx{10}, ..., Dx{f0}
+	add	tmp, 32
+	add	vec_i, PS
+ %endif
 	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
@@ -250,11 +400,16 @@ func(gf_4vect_dot_prod_sse)
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR	dest1, dest1_m
+	SLDR	dest2, dest2_m
 	XSTR	[dest1+pos], xp1
 	XSTR	[dest2+pos], xp2
+	SLDR	dest3, dest3_m
 	XSTR	[dest3+pos], xp3
+	SLDR	dest4, dest4_m
 	XSTR	[dest4+pos], xp4
 
+	SLDR	len, len_m
 	add	pos, 16			;Loop on 16 bytes at a time
 	cmp	pos, len
 	jle	.loop16
@@ -284,15 +439,5 @@ section .data
 align 16
 mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_4vect_dot_prod_sse, 00,  03,  0064
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_4vect_dot_prod_sse, 00,  06,  0064
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s
new file mode 100644
index 0000000..5b28916
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s
@@ -0,0 +1,336 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r13
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r13,  10*16 + 1*8
+	save_reg	r15,  10*16 + 2*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r13,  [rsp + 10*16 + 1*8]
+	mov	r15,  [rsp + 10*16 + 2*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r12
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r12
+ %endmacro
+%endif
+
+;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 tmp2
+%define dest4 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft3_hi xmm14
+%define xgft4_hi xmm13
+%define xgft4_lo xmm12
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xtmph3  xmm6
+%define xtmpl3  xmm7
+%define xtmph4  xmm8
+%define xtmpl4  xmm9
+%define xd1     xmm10
+%define xd2     xmm11
+%define xd3     xtmph1
+%define xd4     xtmpl1
+
+align 16
+global gf_4vect_mad_avx:function
+func(gf_4vect_mad_avx)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+	xor	pos, pos
+	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+
+	mov	tmp, vec
+
+	sal	vec_i, 5		;Multiply by 32
+	lea	tmp3, [mul_array + vec_i]
+
+	sal	tmp, 6			;Multiply by 64
+	vmovdqu	xgft3_hi, [tmp3+tmp+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	sal	vec, 5			;Multiply by 32
+	add	tmp, vec
+	vmovdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
+	vmovdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+
+	mov	dest2, [dest1+PS]		; reuse mul_array
+	mov	dest3, [dest1+2*PS]
+	mov	dest4, [dest1+3*PS]		; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xtmpl1, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1
+
+	XLDR	xd3, [dest3+pos]	;Reuse xtmph1, Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Reuse xtmpl1, Get next dest vector
+
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xtmpl2, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2
+
+	; dest3
+	vpshufb	xtmph3, xgft3_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xtmpl3, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xtmpl3		;GF add high and low partials
+	vpxor	xd3, xd3, xtmph3
+
+	; dest4
+	vpshufb	xtmph4, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl4, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph4, xtmph4, xtmpl4		;GF add high and low partials
+	vpxor	xd4, xd4, xtmph4
+
+	XSTR	[dest1+pos], xd1	;Store result
+	XSTR	[dest2+pos], xd2	;Store result
+	XSTR	[dest3+pos], xd3	;Store result
+	XSTR	[dest4+pos], xd4	;Store result
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+
+	mov	tmp, len	;Overlapped offset length-16
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+
+	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+	XLDR	xtmph4, [dest3+tmp]	;Get next dest vector
+
+	sub	len, pos
+
+	vmovdqa	xtmpl4, [constip16]	;Load const of i + 16
+	vpinsrb	xtmph3, xtmph3, len.w, 15
+	vpshufb	xtmph3, xtmph3, xmask0f		;Broadcast len to all bytes
+	vpcmpgtb	xtmph3, xtmph3, xtmpl4
+
+	XLDR	xtmpl4, [dest4+tmp]	;Get next dest vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xtmpl1, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1	;GF add high and low partials
+	vpand	xtmph1, xtmph1, xtmph3
+	vpxor	xd1, xd1, xtmph1
+
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xtmpl2, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
+	vpand	xtmph2, xtmph2, xtmph3
+	vpxor	xd2, xd2, xtmph2
+
+	; dest3
+	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xtmpl3, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft3_hi, xgft3_hi, xtmpl3	;GF add high and low partials
+	vpand	xgft3_hi, xgft3_hi, xtmph3
+	vpxor	xtmph4, xtmph4, xgft3_hi
+
+	; dest4
+	vpshufb	xgft4_hi, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xgft4_hi, xgft4_hi, xgft4_lo	;GF add high and low partials
+	vpand	xgft4_hi, xgft4_hi, xtmph3
+	vpxor	xtmpl4, xtmpl4, xgft4_hi
+
+	XSTR	[dest1+tmp], xd1	;Store result
+	XSTR	[dest2+tmp], xd2	;Store result
+	XSTR	[dest3+tmp], xtmph4	;Store result
+	XSTR	[dest4+tmp], xtmpl4	;Store result
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_4vect_mad_avx, 02,  01,  020a
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s
new file mode 100644
index 0000000..5df1f83
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s
@@ -0,0 +1,342 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r15,  10*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r15,  [rsp + 10*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+
+;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec
+%define dest4 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f   ymm15
+%define xmask0fx  xmm15
+%define xgft1_lo  ymm14
+%define xgft2_lo  ymm13
+%define xgft3_lo  ymm12
+%define xgft4_lo  ymm11
+
+%define x0      ymm0
+%define xtmpa   ymm1
+%define xtmpl   ymm2
+%define xtmplx  xmm2
+%define xtmph1  ymm3
+%define xtmph1x xmm3
+%define xtmph2  ymm4
+%define xtmph3  ymm5
+%define xtmph4  ymm6
+%define xd1     ymm7
+%define xd2     ymm8
+%define xd3     ymm9
+%define xd4     ymm10
+
+align 16
+global gf_4vect_mad_avx2:function
+func(gf_4vect_mad_avx2)
+	FUNC_SAVE
+	sub	len, 32
+	jl	.return_fail
+	xor	pos, pos
+	mov	tmp.b, 0x0f
+	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
+	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
+
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5			;Multiply by 32
+	lea	tmp, [mul_array + vec_i]
+
+	vmovdqu	xgft1_lo, [tmp]	;Load array Ax{00}, Ax{01}, Ax{02}, ...
+					; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+					; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	add	tmp, vec
+	vmovdqu	xgft4_lo, [tmp+2*vec]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
+					; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+
+	mov	dest2, [dest1+PS]		; reuse mul_array
+	mov	dest3, [dest1+2*PS]		; reuse vec
+	mov	dest4, [dest1+3*PS]		; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop32:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+	XLDR	xd3, [dest3+pos]	;Get next dest vector
+	XLDR	xd4, [dest4+pos]	;reuse xtmpl1. Get next dest vector
+
+	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vperm2i128 xtmpa, xtmpl, x0, 0x30 	;swap xtmpa from 1lo|2lo to 1lo|2hi
+	vperm2i128 x0, xtmpl, x0, 0x12		;swap x0 from    1hi|2hi to 1hi|2lo
+
+	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft1_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1		;xd1 += partial
+
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft2_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl		;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2		;xd2 += partial
+
+	; dest3
+	vpshufb	xtmph3, xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft3_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xtmpl		;GF add high and low partials
+	vpxor	xd3, xd3, xtmph3		;xd3 += partial
+
+	; dest4
+	vpshufb	xtmph4, xtmph4, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph4, xtmph4, xtmpl		;GF add high and low partials
+	vpxor	xd4, xd4, xtmph4		;xd4 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+	XSTR	[dest3+pos], xd3
+	XSTR	[dest4+pos], xd4
+
+	add	pos, 32			;Loop on 32 bytes at a time
+	cmp	pos, len
+	jle	.loop32
+
+	lea	tmp, [len + 32]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan32:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp.b, 0x1f
+	vpinsrb	xtmph1x, xtmph1x, tmp.w, 0
+	vpbroadcastb xtmph1, xtmph1x	;Construct mask 0x1f1f1f...
+
+	mov	tmp, len		;Overlapped offset length-32
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;Get next dest vector
+	XLDR	xd4, [dest4+tmp]	;Get next dest vector
+
+	sub	len, pos
+
+	vmovdqa	xtmph2, [constip32]	;Load const of i + 32
+	vpinsrb	xtmplx, xtmplx, len.w, 15
+	vinserti128	xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
+	vpshufb	xtmpl, xtmpl, xtmph1	;Broadcast len to all bytes. xtmph1=0x1f1f1f...
+	vpcmpgtb	xtmpl, xtmpl, xtmph2
+
+	vpand	xtmph1, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vperm2i128 xtmpa, xtmph1, x0, 0x30 	;swap xtmpa from 1lo|2lo to 1lo|2hi
+	vperm2i128 x0, xtmph1, x0, 0x12		;swap x0 from    1hi|2hi to 1hi|2lo
+
+	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xgft1_lo	;GF add high and low partials
+	vpand	xtmph1, xtmph1, xtmpl
+	vpxor	xd1, xd1, xtmph1		;xd1 += partial
+
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xgft2_lo	;GF add high and low partials
+	vpand	xtmph2, xtmph2, xtmpl
+	vpxor	xd2, xd2, xtmph2		;xd2 += partial
+
+	; dest3
+	vpshufb	xtmph3, xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xgft3_lo	;GF add high and low partials
+	vpand	xtmph3, xtmph3, xtmpl
+	vpxor	xd3, xd3, xtmph3		;xd3 += partial
+
+	; dest4
+	vpshufb	xtmph4, xtmph4, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph4, xtmph4, xgft4_lo	;GF add high and low partials
+	vpand	xtmph4, xtmph4, xtmpl
+	vpxor	xd4, xd4, xtmph4		;xd4 += partial
+
+	XSTR	[dest1+tmp], xd1
+	XSTR	[dest2+tmp], xd2
+	XSTR	[dest3+tmp], xd3
+	XSTR	[dest4+tmp], xd4
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+align 32
+constip32:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+	ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+;;;       func              core, ver, snum
+slversion gf_4vect_mad_avx2, 04,  01,  020b
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s
new file mode 100644
index 0000000..f753c13
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s
@@ -0,0 +1,342 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r13
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r13,  10*16 + 1*8
+	save_reg	r15,  10*16 + 2*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r13,  [rsp + 10*16 + 1*8]
+	mov	r15,  [rsp + 10*16 + 2*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r12
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r12
+ %endmacro
+%endif
+
+;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 tmp2
+%define dest4 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR movdqa
+  %define XSTR movdqa
+ %else
+  %define XLDR movntdqa
+  %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft3_hi xmm14
+%define xgft4_hi xmm13
+%define xgft4_lo xmm12
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xtmph3  xmm6
+%define xtmpl3  xmm7
+%define xtmph4  xmm8
+%define xtmpl4  xmm9
+%define xd1     xmm10
+%define xd2     xmm11
+%define xd3     xtmph1
+%define xd4     xtmpl1
+
+align 16
+global gf_4vect_mad_sse:function
+func(gf_4vect_mad_sse)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+	xor	pos, pos
+	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	mov	tmp, vec
+
+	sal	vec_i, 5		;Multiply by 32
+	lea	tmp3, [mul_array + vec_i]
+
+	sal	tmp, 6			;Multiply by 64
+
+	movdqu	xgft3_hi, [tmp3+tmp+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	sal	vec, 5			;Multiply by 32
+	add	tmp, vec
+	movdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
+	movdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+
+	mov	dest2, [dest1+PS]		; reuse mul_array
+	mov	dest3, [dest1+2*PS]
+	mov	dest4, [dest1+3*PS]		; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+	movdqa	xtmph3, xgft3_hi
+	movdqa	xtmpl4, xgft4_lo
+	movdqa	xtmph4, xgft4_hi
+
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	; dest1
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pxor	xd1, xtmph1
+
+	XLDR	xd3, [dest3+pos]	;Reuse xtmph1, Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Reuse xtmpl1, Get next dest vector
+
+	; dest2
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pxor	xd2, xtmph2
+
+	; dest3
+	pshufb	xtmph3, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph3, xtmpl3		;GF add high and low partials
+	pxor	xd3, xtmph3
+
+	; dest4
+	pshufb	xtmph4, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl4, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph4, xtmpl4		;GF add high and low partials
+	pxor	xd4, xtmph4
+
+	XSTR	[dest1+pos], xd1	;Store result
+	XSTR	[dest2+pos], xd2	;Store result
+	XSTR	[dest3+pos], xd3	;Store result
+	XSTR	[dest4+pos], xd4	;Store result
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp, len	;Overlapped offset length-16
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+
+	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+	XLDR	xtmph4, [dest3+tmp]	;Reuse xtmph1. Get next dest vector
+
+	sub	len, pos
+
+	movdqa	xtmpl4, [constip16]	;Load const of i + 16
+	pinsrb	xtmph3, len.w, 15
+	pshufb	xtmph3, xmask0f		;Broadcast len to all bytes
+	pcmpgtb	xtmph3, xtmpl4
+
+	XLDR	xtmpl4, [dest4+tmp]	;Get next dest vector
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	; dest1
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pand	xtmph1, xtmph3
+	pxor	xd1, xtmph1
+
+	; dest2
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pand	xtmph2, xtmph3
+	pxor	xd2, xtmph2
+
+	; dest3
+	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft3_hi, xtmpl3	;GF add high and low partials
+	pand	xgft3_hi, xtmph3
+	pxor	xtmph4, xgft3_hi
+
+	; dest4
+	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
+	pand	xgft4_hi, xtmph3
+	pxor	xtmpl4, xgft4_hi
+
+	XSTR	[dest1+tmp], xd1	;Store result
+	XSTR	[dest2+tmp], xd2	;Store result
+	XSTR	[dest3+tmp], xtmph4	;Store result
+	XSTR	[dest4+tmp], xtmpl4	;Store result
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_4vect_mad_sse, 00,  01,  0209
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s
index a562565..41fd301 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_5vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -299,15 +299,5 @@ section .data
 align 16
 mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_5vect_dot_prod_avx, 02,  03,  0194
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_5vect_dot_prod_avx, 02,  04,  0194
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s
index 7f25c16..2698add 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -311,15 +311,5 @@ endproc_frame
 
 section .data
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_5vect_dot_prod_avx2, 04,  03,  0199
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_5vect_dot_prod_avx2, 04,  04,  0199
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s
index 003ad26..5c8c903 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -165,23 +165,23 @@ default rel
 section .text
 
 %define xmask0f   xmm15
-%define xgft1_lo  xmm14
-%define xgft1_hi  xmm13
-%define xgft2_lo  xmm12
-%define xgft2_hi  xmm11
+%define xgft1_lo  xmm2
+%define xgft1_hi  xmm3
+%define xgft2_lo  xmm4
+%define xgft2_hi  xmm5
 %define xgft3_lo  xmm10
-%define xgft3_hi  xmm9
+%define xgft3_hi  xmm6
 %define xgft4_lo  xmm8
 %define xgft4_hi  xmm7
 
 
 %define x0     xmm0
 %define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
-%define xp3    xmm4
-%define xp4    xmm5
-%define xp5    xmm6
+%define xp1    xmm9
+%define xp2    xmm11
+%define xp3    xmm12
+%define xp4    xmm13
+%define xp5    xmm14
 
 align 16
 global gf_5vect_dot_prod_sse:function
@@ -300,15 +300,5 @@ section .data
 align 16
 mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
-slversion gf_5vect_dot_prod_sse, 00,  03,  0065
+slversion gf_5vect_dot_prod_sse, 00,  05,  0065
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s
new file mode 100644
index 0000000..6b534a3
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s
@@ -0,0 +1,365 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r13
+ %define tmp4   r14
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r13,  10*16 + 1*8
+	save_reg	r14,  10*16 + 2*8
+	save_reg	r15,  10*16 + 3*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r13,  [rsp + 10*16 + 1*8]
+	mov	r14,  [rsp + 10*16 + 2*8]
+	mov	r15,  [rsp + 10*16 + 3*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r12
+ %define tmp4   r13
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r13
+	pop	r12
+ %endmacro
+%endif
+
+;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp4
+%define dest3 mul_array
+%define dest4 tmp2
+%define dest5 vec_i
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft5_hi xmm14
+%define xgft4_lo xmm13
+%define xgft4_hi xmm12
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xtmph3  xmm6
+%define xtmpl3  xmm7
+%define xtmph5  xmm8
+%define xtmpl5  xmm9
+%define xd1     xmm10
+%define xd2     xmm11
+%define xd3     xtmpl1
+%define xd4     xtmph1
+%define xd5     xtmpl2
+
+
+align 16
+global gf_5vect_mad_avx:function
+func(gf_5vect_mad_avx)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+	xor	pos, pos
+	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	mov	tmp, vec
+	sal	vec_i, 5		;Multiply by 32
+	lea	tmp3, [mul_array + vec_i]
+	sal	tmp, 6			;Multiply by 64
+	vmovdqu	xgft5_hi, [tmp3+2*tmp+16]	;     "     Ex{00}, Ex{10}, ..., Ex{f0}
+	sal	vec, 5			;Multiply by 32
+	add	tmp, vec
+	vmovdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+	vmovdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
+
+	mov	dest3, [dest1+2*PS]	; reuse mul_array
+	mov	dest4, [dest1+3*PS]
+	mov	dest5, [dest1+4*PS]	; reuse vec_i
+	mov	dest2, [dest1+PS]
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	vmovdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xtmpl1, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1
+
+	XLDR	xd3, [dest3+pos]	;Reuse xtmpl1, Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Reuse xtmph1, Get next dest vector
+
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xtmpl2, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2
+
+	XLDR	xd5, [dest5+pos]	;Reuse xtmpl2. Get next dest vector
+
+	; dest3
+	vpshufb	xtmph3, xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xtmpl3, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xtmpl3		;GF add high and low partials
+	vpxor	xd3, xd3, xtmph3
+
+	; dest4
+	vpshufb	xtmph2, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl3		;GF add high and low partials
+	vpxor	xd4, xd4, xtmph2
+
+	; dest5
+	vpshufb	xtmph5, xgft5_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl5, xtmpl5, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph5, xtmph5, xtmpl5	;GF add high and low partials
+	vpxor	xd5, xd5, xtmph5
+
+	XSTR	[dest1+pos], xd1	;Store result into dest1
+	XSTR	[dest2+pos], xd2	;Store result into dest2
+	XSTR	[dest3+pos], xd3	;Store result into dest3
+	XSTR	[dest4+pos], xd4	;Store result into dest4
+	XSTR	[dest5+pos], xd5	;Store result into dest5
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp, len	;Overlapped offset length-16
+	XLDR	x0, [src+tmp]		;Get next source vector
+
+	sub	len, pos
+
+	vmovdqa	xtmph1, [constip16]	;Load const of i + 16
+	vpinsrb	xtmph5, len.w, 15
+	vpshufb	xtmph5, xmask0f		;Broadcast len to all bytes
+	vpcmpgtb	xtmph5, xtmph5, xtmph1
+
+	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	vmovdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xtmpl1, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpand	xtmph1, xtmph1, xtmph5
+	vpxor	xd1, xd1, xtmph1
+
+	XLDR	xd3, [dest3+tmp]	;Reuse xtmpl1, Get next dest vector
+	XLDR	xd4, [dest4+tmp]	;Reuse xtmph1, Get next dest vector
+
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xtmpl2, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpand	xtmph2, xtmph2, xtmph5
+	vpxor	xd2, xd2, xtmph2
+
+	XLDR	xd5, [dest5+tmp]	;Reuse xtmpl2. Get next dest vector
+
+	; dest3
+	vpshufb	xtmph3, xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xtmpl3, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xtmpl3		;GF add high and low partials
+	vpand	xtmph3, xtmph3, xtmph5
+	vpxor	xd3, xd3, xtmph3
+
+	; dest4
+	vpshufb	xgft4_hi, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft4_hi, xgft4_hi, xgft4_lo		;GF add high and low partials
+	vpand	xgft4_hi, xgft4_hi, xtmph5
+	vpxor	xd4, xd4, xgft4_hi
+
+	; dest5
+	vpshufb	xgft5_hi, xgft5_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl5, xtmpl5, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft5_hi, xgft5_hi, xtmpl5	;GF add high and low partials
+	vpand	xgft5_hi, xgft5_hi, xtmph5
+	vpxor	xd5, xd5, xgft5_hi
+
+	XSTR	[dest1+tmp], xd1	;Store result into dest1
+	XSTR	[dest2+tmp], xd2	;Store result into dest2
+	XSTR	[dest3+tmp], xd3	;Store result into dest3
+	XSTR	[dest4+tmp], xd4	;Store result into dest4
+	XSTR	[dest5+tmp], xd5	;Store result into dest5
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_5vect_mad_avx, 02,  01,  020d
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s
new file mode 100644
index 0000000..b495c21
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s
@@ -0,0 +1,363 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r15,  10*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r15,  [rsp + 10*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define tmp2   r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp2
+%define dest3 mul_array
+%define dest4 vec
+%define dest5 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f   ymm15
+%define xmask0fx  xmm15
+%define xgft1_lo  ymm14
+%define xgft2_lo  ymm13
+%define xgft3_lo  ymm12
+%define xgft4_lo  ymm11
+%define xgft5_lo  ymm10
+
+%define x0      ymm0
+%define xtmpa   ymm1
+%define xtmpl   ymm2
+%define xtmplx  xmm2
+%define xtmph1  ymm3
+%define xtmph1x xmm3
+%define xtmph2  ymm4
+%define xd1     ymm5
+%define xd2     ymm6
+%define xd3     ymm7
+%define xd4     ymm8
+%define xd5     ymm9
+
+align 16
+global gf_5vect_mad_avx2:function
+func(gf_5vect_mad_avx2)
+	FUNC_SAVE
+	sub	len, 32
+	jl	.return_fail
+	xor	pos, pos
+	mov	tmp.b, 0x0f
+	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
+	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
+
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5			;Multiply by 32
+	lea	tmp, [mul_array + vec_i]
+
+	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+						;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+	vmovdqu	xgft2_lo, [tmp+vec]		;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+	vmovdqu	xgft3_lo, [tmp+2*vec]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	vmovdqu	xgft5_lo, [tmp+4*vec]		;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+						;     "     Ex{00}, Ex{10}, ..., Ex{f0}
+	add	tmp, vec
+	vmovdqu	xgft4_lo, [tmp+2*vec]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+						;     "     Dx{00}, Dx{10}, ..., Dx{f0}
+
+	mov	dest3, [dest1+2*PS]	; reuse mul_array
+	mov	dest4, [dest1+3*PS]	; reuse vec
+	mov	dest5, [dest1+4*PS]	; reuse vec_i
+	mov	dest2, [dest1+PS]
+	mov	dest1, [dest1]
+
+.loop32:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+	XLDR	xd3, [dest3+pos]	;Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Get next dest vector
+	XLDR	xd5, [dest5+pos]	;Get next dest vector
+
+	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+	vperm2i128 xtmpa, xtmpl, x0, 0x30 	;swap xtmpa from 1lo|2lo to 1lo|2hi
+	vperm2i128 x0, xtmpl, x0, 0x12	;swap x0 from    1hi|2hi to 1hi|2lo
+
+	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft1_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1		;xd1 += partial
+
+	vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft2_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl		;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2		;xd2 += partial
+
+	vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	; dest3
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft3_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl		;GF add high and low partials
+	vpxor	xd3, xd3, xtmph1		;xd3 += partial
+
+	vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	; dest4
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl		;GF add high and low partials
+	vpxor	xd4, xd4, xtmph2		;xd4 += partial
+
+	; dest5
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft5_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl		;GF add high and low partials
+	vpxor	xd5, xd5, xtmph1		;xd5 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+	XSTR	[dest3+pos], xd3
+	XSTR	[dest4+pos], xd4
+	XSTR	[dest5+pos], xd5
+
+	add	pos, 32			;Loop on 32 bytes at a time
+	cmp	pos, len
+	jle	.loop32
+
+	lea	tmp, [len + 32]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan32:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp.b, 0x1f
+	vpinsrb	xtmph1x, xtmph1x, tmp.w, 0
+	vpbroadcastb xtmph1, xtmph1x	;Construct mask 0x1f1f1f...
+
+	mov	tmp, len		;Overlapped offset length-32
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;Get next dest vector
+	XLDR	xd4, [dest4+tmp]	;Get next dest vector
+	XLDR	xd5, [dest5+tmp]	;Get next dest vector
+
+	sub	len, pos
+
+	vmovdqa	xtmph2, [constip32]	;Load const of i + 32
+	vpinsrb	xtmplx, xtmplx, len.w, 15
+	vinserti128	xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
+	vpshufb	xtmpl, xtmpl, xtmph1	;Broadcast len to all bytes. xtmph1=0x1f1f1f...
+	vpcmpgtb	xtmpl, xtmpl, xtmph2
+
+	vpand	xtmph1, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+	vperm2i128 xtmpa, xtmph1, x0, 0x30 	;swap xtmpa from 1lo|2lo to 1lo|2hi
+	vperm2i128 x0, xtmph1, x0, 0x12	;swap x0 from    1hi|2hi to 1hi|2lo
+
+	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+
+	; dest1
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xgft1_lo	;GF add high and low partials
+	vpand	xtmph1, xtmph1, xtmpl
+	vpxor	xd1, xd1, xtmph1		;xd1 += partial
+
+	vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	; dest2
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xgft2_lo	;GF add high and low partials
+	vpand	xtmph2, xtmph2, xtmpl
+	vpxor	xd2, xd2, xtmph2		;xd2 += partial
+
+	vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	; dest3
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xgft3_lo	;GF add high and low partials
+	vpand	xtmph1, xtmph1, xtmpl
+	vpxor	xd3, xd3, xtmph1		;xd3 += partial
+
+	vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	; dest4
+	vpshufb	xtmph2, xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xgft4_lo	;GF add high and low partials
+	vpand	xtmph2, xtmph2, xtmpl
+	vpxor	xd4, xd4, xtmph2		;xd4 += partial
+
+	; dest5
+	vpshufb	xtmph1, xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xgft5_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xgft5_lo	;GF add high and low partials
+	vpand	xtmph1, xtmph1, xtmpl
+	vpxor	xd5, xd5, xtmph1		;xd5 += partial
+
+	XSTR	[dest1+tmp], xd1
+	XSTR	[dest2+tmp], xd2
+	XSTR	[dest3+tmp], xd3
+	XSTR	[dest4+tmp], xd4
+	XSTR	[dest5+tmp], xd5
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+align 32
+constip32:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+	ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+;;;       func             core, ver, snum
+slversion gf_5vect_mad_avx2, 04,  01,  020e
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s
new file mode 100644
index 0000000..b26d4bc
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s
@@ -0,0 +1,373 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r13
+ %define tmp4   r14
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r13,  10*16 + 1*8
+	save_reg	r14,  10*16 + 2*8
+	save_reg	r15,  10*16 + 3*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r13,  [rsp + 10*16 + 1*8]
+	mov	r14,  [rsp + 10*16 + 2*8]
+	mov	r15,  [rsp + 10*16 + 3*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r12
+ %define tmp4   r13
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r13
+	pop	r12
+ %endmacro
+%endif
+
+;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp4
+%define dest3 mul_array
+%define dest4 tmp2
+%define dest5 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+  %define XLDR movdqa
+  %define XSTR movdqa
+ %else
+  %define XLDR movntdqa
+  %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft5_hi xmm14
+%define xgft4_lo xmm13
+%define xgft4_hi xmm12
+
+%define x0      xmm0
+%define xtmpa   xmm1
+%define xtmph1  xmm2
+%define xtmpl1  xmm3
+%define xtmph2  xmm4
+%define xtmpl2  xmm5
+%define xtmph3  xmm6
+%define xtmpl3  xmm7
+%define xtmph5  xmm8
+%define xtmpl5  xmm9
+%define xd1     xmm10
+%define xd2     xmm11
+%define xd3     xtmpl1
+%define xd4     xtmph1
+%define xd5     xtmpl2
+
+
+align 16
+global gf_5vect_mad_sse:function
+func(gf_5vect_mad_sse)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+	xor	pos, pos
+
+	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	mov	tmp, vec
+	sal	vec_i, 5		;Multiply by 32
+	lea	tmp3, [mul_array + vec_i]
+	sal	tmp, 6			;Multiply by 64
+	movdqu	xgft5_hi, [tmp3+2*tmp+16]	;     "     Ex{00}, Ex{10}, ..., Ex{f0}
+	sal	vec, 5			;Multiply by 32
+	add	tmp, vec
+	movdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+	movdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
+
+	mov	dest3, [dest1+2*PS]	; reuse mul_array
+	mov	dest4, [dest1+3*PS]
+	mov	dest5, [dest1+4*PS]	; reuse vec_i
+	mov	dest2, [dest1+PS]
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	movdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+	movdqa	xtmph5, xgft5_hi		;Reload const array registers
+
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	; dest1
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pxor	xd1, xtmph1
+
+	XLDR	xd3, [dest3+pos]	;Reuse xtmpl1, Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Reuse xtmph1. Get next dest vector
+
+	; dest2
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pxor	xd2, xtmph2
+
+	XLDR	xd5, [dest5+pos]	;Reuse xtmpl2. Get next dest vector
+
+	; dest3
+	pshufb	xtmph3, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph3, xtmpl3		;GF add high and low partials
+	pxor	xd3, xtmph3
+
+	movdqa	xtmph2, xgft4_hi		;Reload const array registers
+	movdqa	xtmpl3, xgft4_lo		;Reload const array registers
+
+	; dest5
+	pshufb	xtmph5, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl5, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph5, xtmpl5		;GF add high and low partials
+	pxor	xd5, xtmph5
+
+	; dest4
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl3		;GF add high and low partials
+	pxor	xd4, xtmph2
+
+	XSTR	[dest1+pos], xd1	;Store result into dest1
+	XSTR	[dest2+pos], xd2	;Store result into dest2
+	XSTR	[dest3+pos], xd3	;Store result into dest3
+	XSTR	[dest4+pos], xd4	;Store result into dest4
+	XSTR	[dest5+pos], xd5	;Store result into dest5
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp, len	;Overlapped offset length-16
+	XLDR	x0, [src+tmp]		;Get next source vector
+
+	sub	len, pos
+
+	movdqa	xtmpl1, [constip16]	;Load const of i + 16
+	pinsrb	xtmph5, len.w, 15
+	pshufb	xtmph5, xmask0f		;Broadcast len to all bytes
+	pcmpgtb	xtmph5, xtmpl1
+
+	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	movdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	; dest1
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pand	xtmph1, xtmph5
+	pxor	xd1, xtmph1
+
+	XLDR	xd3, [dest3+tmp]	;Reuse xtmpl1, Get next dest vector
+	XLDR	xd4, [dest4+tmp]	;Reuse xtmph1. Get next dest vector
+
+	; dest2
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pand	xtmph2, xtmph5
+	pxor	xd2, xtmph2
+
+	XLDR	xd5, [dest5+tmp]	;Reuse xtmpl2. Get next dest vector
+
+	; dest3
+	pshufb	xtmph3, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph3, xtmpl3		;GF add high and low partials
+	pand	xtmph3, xtmph5
+	pxor	xd3, xtmph3
+
+	; dest4
+	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft4_hi, xgft4_lo		;GF add high and low partials
+	pand	xgft4_hi, xtmph5
+	pxor	xd4, xgft4_hi
+
+	; dest5
+	pshufb	xgft5_hi, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl5, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft5_hi, xtmpl5		;GF add high and low partials
+	pand	xgft5_hi, xtmph5
+	pxor	xd5, xgft5_hi
+
+	XSTR	[dest1+tmp], xd1	;Store result into dest1
+	XSTR	[dest2+tmp], xd2	;Store result into dest2
+	XSTR	[dest3+tmp], xd3	;Store result into dest3
+	XSTR	[dest4+tmp], xd4	;Store result into dest4
+	XSTR	[dest5+tmp], xd5	;Store result into dest5
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_5vect_mad_sse, 00,  01,  020c
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s
index 28ca861..fb29f76 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -311,15 +311,5 @@ section .data
 align 16
 mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_6vect_dot_prod_avx, 02,  03,  0195
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_6vect_dot_prod_avx, 02,  04,  0195
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s
index a957c9e..85bb78a 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -322,15 +322,5 @@ endproc_frame
 
 section .data
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                   core, ver, snum
-slversion gf_6vect_dot_prod_avx2, 04,  03,  019a
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_6vect_dot_prod_avx2, 04,  04,  019a
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s
index 4910ddd..34f7b87 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -166,20 +166,20 @@ default rel
 section .text
 
 %define xmask0f   xmm15
-%define xgft1_lo  xmm14
-%define xgft1_hi  xmm13
-%define xgft2_lo  xmm12
-%define xgft2_hi  xmm11
-%define xgft3_lo  xmm10
-%define xgft3_hi  xmm9
+%define xgft1_lo  xmm2
+%define xgft1_hi  xmm3
+%define xgft2_lo  xmm4
+%define xgft2_hi  xmm5
+%define xgft3_lo  xmm6
+%define xgft3_hi  xmm7
 %define x0     xmm0
 %define xtmpa  xmm1
-%define xp1    xmm2
-%define xp2    xmm3
-%define xp3    xmm4
-%define xp4    xmm5
-%define xp5    xmm6
-%define xp6    xmm7
+%define xp1    xmm8
+%define xp2    xmm9
+%define xp3    xmm10
+%define xp4    xmm11
+%define xp5    xmm12
+%define xp6    xmm13
 
 align 16
 global gf_6vect_dot_prod_sse:function
@@ -311,15 +311,5 @@ section .data
 align 16
 mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_6vect_dot_prod_sse, 00,  03,  0066
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_6vect_dot_prod_sse, 00,  05,  0066
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s
new file mode 100644
index 0000000..3c60d0a
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s
@@ -0,0 +1,394 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r13
+ %define tmp4   r14
+ %define tmp5   rdi
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r13,  10*16 + 1*8
+	save_reg	r14,  10*16 + 2*8
+	save_reg	r15,  10*16 + 3*8
+	save_reg	rdi,  10*16 + 4*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r13,  [rsp + 10*16 + 1*8]
+	mov	r14,  [rsp + 10*16 + 2*8]
+	mov	r15,  [rsp + 10*16 + 3*8]
+	mov	rdi,  [rsp + 10*16 + 4*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2   r10
+ %define tmp3   r12
+ %define tmp4   r13
+ %define tmp5   r14
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+	push	r14
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r14
+	pop	r13
+	pop	r12
+ %endmacro
+%endif
+
+;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp4
+%define dest3 tmp2
+%define dest4 mul_array
+%define dest5 tmp5
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft4_lo  xmm14
+%define xgft4_hi  xmm13
+%define xgft5_lo  xmm12
+%define xgft5_hi  xmm11
+%define xgft6_lo  xmm10
+%define xgft6_hi  xmm9
+
+%define x0         xmm0
+%define xtmpa      xmm1
+%define xtmph1     xmm2
+%define xtmpl1     xmm3
+%define xtmph2     xmm4
+%define xtmpl2     xmm5
+%define xtmph3     xmm6
+%define xtmpl3     xmm7
+%define xd1        xmm8
+%define xd2        xtmpl1
+%define xd3        xtmph1
+
+
+align 16
+global gf_6vect_mad_avx:function
+func(gf_6vect_mad_avx)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+	xor	pos, pos
+	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	mov	tmp, vec
+	sal	vec_i, 5		;Multiply by 32
+	lea	tmp3, [mul_array + vec_i]
+	sal	tmp, 6			;Multiply by 64
+
+	sal	vec, 5			;Multiply by 32
+	lea	vec_i, [tmp + vec]	;vec_i = vec*96
+	lea	mul_array, [tmp + vec_i]	;mul_array = vec*160
+
+	vmovdqu	xgft5_lo, [tmp3+2*tmp]		;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+	vmovdqu	xgft5_hi, [tmp3+2*tmp+16]	;     "     Ex{00}, Ex{10}, ..., Ex{f0}
+	vmovdqu	xgft4_lo, [tmp3+vec_i]		;Load array Dx{00}, Dx{01}, Dx{02}, ...
+	vmovdqu	xgft4_hi, [tmp3+vec_i+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+	vmovdqu	xgft6_lo, [tmp3+mul_array]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
+	vmovdqu	xgft6_hi, [tmp3+mul_array+16]	;     "     Fx{00}, Fx{10}, ..., Fx{f0}
+
+	mov	dest2, [dest1+PS]
+	mov	dest3, [dest1+2*PS]
+	mov	dest4, [dest1+3*PS]  ; reuse mul_array
+	mov	dest5, [dest1+4*PS]
+	mov	dest6, [dest1+5*PS]  ; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vmovdqu	xtmpl1, [tmp3]	;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	vmovdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	XLDR	xd1, [dest1+pos]		;Get next dest vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+
+	;dest1
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmpl1		;GF add high and low partials
+	vpxor	xd1, xtmph1
+
+	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest vector
+
+	;dest2
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmpl2		;GF add high and low partials
+	vpxor	xd2, xtmph2
+
+	;dest3
+	vpshufb	xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmpl3		;GF add high and low partials
+	vpxor	xd3, xtmph3
+
+	XSTR	[dest1+pos], xd1	;Store result into dest1
+	XSTR	[dest2+pos], xd2	;Store result into dest2
+	XSTR	[dest3+pos], xd3	;Store result into dest3
+
+	;dest4
+	XLDR	xd1, [dest4+pos]		;Get next dest vector
+	vpshufb	xtmph1, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl1, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph1
+
+	XLDR	xd2, [dest5+pos]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest6+pos]	;reuse xtmph1. Get next dest vector
+
+	;dest5
+	vpshufb	xtmph2, xgft5_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xgft5_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2
+
+	;dest6
+	vpshufb	xtmph3, xgft6_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl3, xgft6_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph3, xtmph3, xtmpl3		;GF add high and low partials
+	vpxor	xd3, xd3, xtmph3
+
+	XSTR	[dest4+pos], xd1	;Store result into dest4
+	XSTR	[dest5+pos], xd2	;Store result into dest5
+	XSTR	[dest6+pos], xd3	;Store result into dest6
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+	;; Overlapped offset length-16
+	mov	tmp, len		;Backup len as len=rdi
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+	XLDR	xd1, [dest4+tmp]	;Get next dest vector
+	XLDR	xd2, [dest5+tmp]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest6+tmp]	;reuse xtmph1. Get next dest vector
+
+	sub	len, pos
+
+	vmovdqa	xtmph3, [constip16]	;Load const of i + 16
+	vpinsrb	xtmpl3, len.w, 15
+	vpshufb	xtmpl3, xmask0f		;Broadcast len to all bytes
+	vpcmpgtb	xtmpl3, xtmpl3, xtmph3
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	;dest4
+	vpshufb	xgft4_hi, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft4_hi, xgft4_hi, xgft4_lo	;GF add high and low partials
+	vpand	xgft4_hi, xgft4_hi, xtmpl3
+	vpxor	xd1, xd1, xgft4_hi
+
+	;dest5
+	vpshufb	xgft5_hi, xgft5_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xgft5_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft5_hi, xgft5_hi, xgft5_lo	;GF add high and low partials
+	vpand	xgft5_hi, xgft5_hi, xtmpl3
+	vpxor	xd2, xd2, xgft5_hi
+
+	;dest6
+	vpshufb	xgft6_hi, xgft6_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft6_lo, xgft6_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft6_hi, xgft6_hi, xgft6_lo	;GF add high and low partials
+	vpand	xgft6_hi, xgft6_hi, xtmpl3
+	vpxor	xd3, xd3, xgft6_hi
+
+	XSTR	[dest4+tmp], xd1	;Store result into dest4
+	XSTR	[dest5+tmp], xd2	;Store result into dest5
+	XSTR	[dest6+tmp], xd3	;Store result into dest6
+
+	vmovdqu	xgft4_lo, [tmp3]	;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	vmovdqu	xgft4_hi, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	vmovdqu	xgft5_lo, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	vmovdqu	xgft5_hi, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	vmovdqu	xgft6_lo, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	vmovdqu	xgft6_hi, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest3 vector
+
+	;dest1
+	vpshufb	xgft4_hi, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft4_hi, xgft4_hi, xgft4_lo		;GF add high and low partials
+	vpand	xgft4_hi, xgft4_hi, xtmpl3
+	vpxor	xd1, xd1, xgft4_hi
+
+	;dest2
+	vpshufb	xgft5_hi, xgft5_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xgft5_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft5_hi, xgft5_hi, xgft5_lo	;GF add high and low partials
+	vpand	xgft5_hi, xgft5_hi, xtmpl3
+	vpxor	xd2, xd2, xgft5_hi
+
+	;dest3
+	vpshufb	xgft6_hi, xgft6_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft6_lo, xgft6_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft6_hi, xgft6_hi, xgft6_lo	;GF add high and low partials
+	vpand	xgft6_hi, xgft6_hi, xtmpl3
+	vpxor	xd3, xd3, xgft6_hi
+
+	XSTR	[dest1+tmp], xd1	;Store result into dest1
+	XSTR	[dest2+tmp], xd2	;Store result into dest2
+	XSTR	[dest3+tmp], xd3	;Store result into dest3
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_6vect_mad_avx, 02,  01,  0210
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s
new file mode 100644
index 0000000..e180457
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s
@@ -0,0 +1,400 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define tmp2   r10
+ %define tmp3   r13
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	; Win64 prologue: this routine clobbers XMM6-XMM15 and r12/r13/r15,
+	; all callee-saved under the Microsoft x64 ABI, so spill them to a
+	; local frame, then fetch stack-passed args 4 and 5.
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	; GP save slots: r12 -> +0*8, r13 -> +1*8, r15 -> +2*8.
+	; FUNC_RESTORE must reload each register from these same offsets.
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r13,  10*16 + 1*8
+	save_reg	r15,  10*16 + 2*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	; Win64 epilogue: reload callee-saved XMM and GP registers from the
+	; frame laid down by FUNC_SAVE, then release the frame.
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r13,  [rsp + 10*16 + 1*8]
+	; BUGFIX: r15 was saved by FUNC_SAVE at offset 10*16 + 2*8.  The
+	; previous code reloaded from 10*16 + 3*8, which equals stack_size
+	; (16*10 + 3*8) and therefore aliases the caller's return address,
+	; corrupting r15 on every Windows call.
+	mov	r15,  [rsp + 10*16 + 2*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2   r10
+ %define tmp3   r12
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r12
+ %endmacro
+%endif
+
+;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 tmp3
+%define dest3 tmp2
+%define dest4 mul_array
+%define dest5 vec
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use non-temporal load/store
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  ymm15
+%define xmask0fx  xmm15
+%define xgft1_lo  ymm14
+%define xgft2_lo  ymm13
+%define xgft3_lo  ymm12
+%define xgft4_lo  ymm11
+%define xgft5_lo  ymm10
+%define xgft6_lo  ymm9
+
+%define x0         ymm0
+%define xtmpa      ymm1
+%define xtmpl      ymm2
+%define xtmplx     xmm2
+%define xtmph      ymm3
+%define xtmphx     xmm3
+%define xd1        ymm4
+%define xd2        ymm5
+%define xd3        ymm6
+%define xd4        ymm7
+%define xd5        ymm8
+%define xd6        xd1
+
+;;; gf_6vect_mad_avx2: multiply-accumulate one source buffer into six
+;;; destination (parity) buffers, 32 bytes per iteration, using GF(2^8)
+;;; nibble lookup tables applied with vpshufb.  A trailing partial chunk
+;;; is handled by one overlapped, byte-masked pass at offset len-32.
+align 16
+global gf_6vect_mad_avx2:function
+func(gf_6vect_mad_avx2)
+	FUNC_SAVE
+	sub	len, 32
+	jl	.return_fail
+	xor	pos, pos
+	mov	tmp.b, 0x0f
+	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
+	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
+
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5			;Multiply by 32
+	lea	tmp, [mul_array + vec_i]
+	mov	vec_i, vec
+	mov	mul_array, vec
+	sal	vec_i, 1
+	sal	mul_array, 1
+	add	vec_i, vec		;vec_i=vec*96
+	add	mul_array, vec_i	;mul_array=vec*160
+
+	;; Each output j has its 32-byte table at [tmp + j*vec] (vec is
+	;; already scaled by 32), so the six tables are A..F in order.
+	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+						;     "     Ax{00}, Ax{10}, ..., Ax{f0}
+	vmovdqu	xgft2_lo, [tmp+vec]		;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
+	vmovdqu	xgft3_lo, [tmp+2*vec]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
+	vmovdqu	xgft4_lo, [tmp+vec_i]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+						;     "     Dx{00}, Dx{10}, ..., Dx{f0}
+	vmovdqu	xgft5_lo, [tmp+4*vec]		;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+						;     "     Ex{00}, Ex{10}, ..., Ex{f0}
+	vmovdqu	xgft6_lo, [tmp+mul_array]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
+						;     "     Fx{00}, Fx{10}, ..., Fx{f0}
+
+	mov	dest2, [dest1+PS]    ; reuse tmp3
+	mov	dest3, [dest1+2*PS]  ; reuse tmp2
+	mov	dest4, [dest1+3*PS]  ; reuse mul_array
+	mov	dest5, [dest1+4*PS]  ; reuse vec
+	mov	dest6, [dest1+5*PS]  ; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop32:
+	XLDR	x0, [src+pos]		;Get next source vector
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+	XLDR	xd3, [dest3+pos]	;Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Get next dest vector
+	XLDR	xd5, [dest5+pos]	;Get next dest vector
+
+	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+	vperm2i128 xtmpa, xtmpl, x0, 0x30 	;swap xtmpa from 1lo|2lo to 1lo|2hi
+	vperm2i128 x0, xtmpl, x0, 0x12	;swap x0 from    1hi|2hi to 1hi|2lo
+
+	;dest1
+	vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft1_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl		;GF add high and low partials
+	vpxor	xd1, xd1, xtmph		;xd1 += partial
+
+	;; dest1 must be stored before the dest6 pass: xd6 aliases xd1.
+	XSTR	[dest1+pos], xd1	;Store result into dest1
+
+	;dest2
+	vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft2_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl		;GF add high and low partials
+	vpxor	xd2, xd2, xtmph		;xd2 += partial
+
+	;dest3
+	vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft3_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl		;GF add high and low partials
+	vpxor	xd3, xd3, xtmph		;xd3 += partial
+
+	XLDR	xd6, [dest6+pos]	;reuse xd1. Get next dest vector
+
+	;dest4
+	vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl		;GF add high and low partials
+	vpxor	xd4, xd4, xtmph		;xd4 += partial
+
+	;dest5
+	vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft5_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl		;GF add high and low partials
+	vpxor	xd5, xd5, xtmph		;xd5 += partial
+
+	;dest6
+	vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft6_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl		;GF add high and low partials
+	vpxor	xd6, xd6, xtmph		;xd6 += partial
+
+	XSTR	[dest2+pos], xd2	;Store result into dest2
+	XSTR	[dest3+pos], xd3	;Store result into dest3
+	XSTR	[dest4+pos], xd4	;Store result into dest4
+	XSTR	[dest5+pos], xd5	;Store result into dest5
+	XSTR	[dest6+pos], xd6	;Store result into dest6
+
+	add	pos, 32			;Loop on 32 bytes at a time
+	cmp	pos, len
+	jle	.loop32
+
+	lea	tmp, [len + 32]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan32:
+	;; Tail len
+	;; Do one more overlap pass
+	mov	tmp.b, 0x1f
+	vpinsrb	xtmphx, xtmphx, tmp.w, 0
+	vpbroadcastb xtmph, xtmphx	;Construct mask 0x1f1f1f...
+
+	mov	tmp, len		;Overlapped offset length-32
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;Get next dest vector
+	XLDR	xd4, [dest4+tmp]	;Get next dest vector
+	XLDR	xd5, [dest5+tmp]	;Get next dest vector
+
+	sub	len, pos
+
+	;; Build a per-byte mask selecting only the bytes that were not
+	;; already produced by the last full 32-byte iteration.
+	vpinsrb	xtmplx, xtmplx, len.w, 15
+	vinserti128	xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
+	vpshufb	xtmpl, xtmpl, xtmph	;Broadcast len to all bytes. xtmph=0x1f1f1f...
+	vpcmpgtb	xtmpl, xtmpl, [constip32]
+
+	vpand	xtmph, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+	vperm2i128 xtmpa, xtmph, x0, 0x30 	;swap xtmpa from 1lo|2lo to 1lo|2hi
+	vperm2i128 x0, xtmph, x0, 0x12	;swap x0 from    1hi|2hi to 1hi|2lo
+
+	;dest1
+	vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xgft1_lo		;GF add high and low partials
+	vpand	xtmph, xtmph, xtmpl
+	vpxor	xd1, xd1, xtmph		;xd1 += partial
+
+	XSTR	[dest1+tmp], xd1	;Store result into dest1
+
+	;dest2
+	vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xgft2_lo		;GF add high and low partials
+	vpand	xtmph, xtmph, xtmpl
+	vpxor	xd2, xd2, xtmph		;xd2 += partial
+
+	;dest3
+	vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xgft3_lo		;GF add high and low partials
+	vpand	xtmph, xtmph, xtmpl
+	vpxor	xd3, xd3, xtmph		;xd3 += partial
+
+	XLDR	xd6, [dest6+tmp]	;reuse xd1. Get next dest vector
+
+	;dest4
+	vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xgft4_lo		;GF add high and low partials
+	vpand	xtmph, xtmph, xtmpl
+	vpxor	xd4, xd4, xtmph		;xd4 += partial
+
+	;dest5
+	vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xgft5_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xgft5_lo		;GF add high and low partials
+	vpand	xtmph, xtmph, xtmpl
+	vpxor	xd5, xd5, xtmph		;xd5 += partial
+
+	;dest6
+	vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph, xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft6_lo, xgft6_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xgft6_lo		;GF add high and low partials
+	vpand	xtmph, xtmph, xtmpl
+	vpxor	xd6, xd6, xtmph		;xd6 += partial
+
+	XSTR	[dest2+tmp], xd2	;Store result into dest2
+	XSTR	[dest3+tmp], xd3	;Store result into dest3
+	XSTR	[dest4+tmp], xd4	;Store result into dest4
+	XSTR	[dest5+tmp], xd5	;Store result into dest5
+	XSTR	[dest6+tmp], xd6	;Store result into dest6
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+align 32
+constip32:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+	ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+;;;       func              core, ver, snum
+slversion gf_6vect_mad_avx2, 04,  01,  0211
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s
new file mode 100644
index 0000000..574c8e5
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s
@@ -0,0 +1,406 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp2   r10
+ %define tmp3   r13
+ %define tmp4   r14
+ %define tmp5   rdi
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r13,  10*16 + 1*8
+	save_reg	r14,  10*16 + 2*8
+	save_reg	r15,  10*16 + 3*8
+	save_reg	rdi,  10*16 + 4*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r13,  [rsp + 10*16 + 1*8]
+	mov	r14,  [rsp + 10*16 + 2*8]
+	mov	r15,  [rsp + 10*16 + 3*8]
+	mov	rdi,  [rsp + 10*16 + 4*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp2   r10
+ %define tmp3   r12
+ %define tmp4   r13
+ %define tmp5   r14
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+	push	r14
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r14
+	pop	r13
+	pop	r12
+ %endmacro
+%endif
+
+;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest1  arg5
+%define pos   return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 tmp2
+%define dest4 tmp4
+%define dest5 tmp5
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use non-temporal load/store
+ %ifdef NO_NT_LDST
+  %define XLDR movdqa
+  %define XSTR movdqa
+ %else
+  %define XLDR movntdqa
+  %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm15
+%define xgft4_lo  xmm14
+%define xgft4_hi  xmm13
+%define xgft5_lo  xmm12
+%define xgft5_hi  xmm11
+%define xgft6_lo  xmm10
+%define xgft6_hi  xmm9
+
+%define x0         xmm0
+%define xtmpa      xmm1
+%define xtmph1     xmm2
+%define xtmpl1     xmm3
+%define xtmph2     xmm4
+%define xtmpl2     xmm5
+%define xtmph3     xmm6
+%define xtmpl3     xmm7
+%define xd1        xmm8
+%define xd2        xtmpl1
+%define xd3        xtmph1
+
+
+;;; gf_6vect_mad_sse: multiply-accumulate one source buffer into six
+;;; destination (parity) buffers, 16 bytes per iteration, using GF(2^8)
+;;; nibble lookup tables applied with pshufb.  A trailing partial chunk
+;;; is handled by one overlapped, byte-masked pass at offset len-16.
+align 16
+global gf_6vect_mad_sse:function
+func(gf_6vect_mad_sse)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+
+	xor	pos, pos
+	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+
+	mov	tmp, vec
+	sal	vec_i, 5		;Multiply by 32
+	lea	tmp3, [mul_array + vec_i]
+	sal	tmp, 6			;Multiply by 64
+
+	sal	vec, 5			;Multiply by 32
+	lea	vec_i, [tmp + vec]	;vec_i = vec*96
+	lea	mul_array, [tmp + vec_i]	;mul_array = vec*160
+
+	;; Keep only the D/E/F tables resident; A/B/C are reloaded from
+	;; tmp3 each iteration because only xmm8-xmm15 slots remain free.
+	movdqu	xgft5_lo, [tmp3+2*tmp]		;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+	movdqu	xgft5_hi, [tmp3+2*tmp+16]	;     "     Ex{00}, Ex{10}, ..., Ex{f0}
+	movdqu	xgft4_lo, [tmp3+vec_i]		;Load array Dx{00}, Dx{01}, Dx{02}, ...
+	movdqu	xgft4_hi, [tmp3+vec_i+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+	movdqu	xgft6_lo, [tmp3+mul_array]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
+	movdqu	xgft6_hi, [tmp3+mul_array+16]	;     "     Fx{00}, Fx{10}, ..., Fx{f0}
+
+	mov	dest2, [dest1+PS]
+	mov	dest3, [dest1+2*PS]
+	mov	dest4, [dest1+3*PS]  ; reuse mul_array
+	mov	dest5, [dest1+4*PS]
+	mov	dest6, [dest1+5*PS]  ; reuse vec_i
+	mov	dest1, [dest1]
+
+.loop16:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	movdqu	xtmpl1, [tmp3]	;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	movdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	;dest1
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pxor	xd1, xtmph1
+
+	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest3 vector
+
+	;dest2
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pxor	xd2, xtmph2
+
+	;dest3
+	pshufb	xtmph3, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph3, xtmpl3		;GF add high and low partials
+	pxor	xd3, xtmph3
+
+	XSTR	[dest1+pos], xd1	;Store result into dest1
+	XSTR	[dest2+pos], xd2	;Store result into dest2
+	XSTR	[dest3+pos], xd3	;Store result into dest3
+
+	movdqa	xtmph1, xgft4_hi	;Reload const array registers
+	movdqa	xtmpl1, xgft4_lo	;Reload const array registers
+	movdqa	xtmph2, xgft5_hi	;Reload const array registers
+	movdqa	xtmpl2, xgft5_lo	;Reload const array registers
+	movdqa	xtmph3, xgft6_hi	;Reload const array registers
+	movdqa	xtmpl3, xgft6_lo	;Reload const array registers
+
+	;dest4
+	XLDR	xd1, [dest4+pos]	;Get next dest vector
+	pshufb	xtmph1, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph1, xtmpl1		;GF add high and low partials
+	pxor	xd1, xtmph1
+
+	XLDR	xd2, [dest5+pos]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest6+pos]	;reuse xtmph1. Get next dest vector
+
+	;dest5
+	pshufb	xtmph2, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph2, xtmpl2		;GF add high and low partials
+	pxor	xd2, xtmph2
+
+	;dest6
+	pshufb	xtmph3, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph3, xtmpl3		;GF add high and low partials
+	pxor	xd3, xtmph3
+
+	XSTR	[dest4+pos], xd1	;Store result into dest4
+	XSTR	[dest5+pos], xd2	;Store result into dest5
+	XSTR	[dest6+pos], xd3	;Store result into dest6
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+.lessthan16:
+	;; Tail len
+	;; Do one more overlap pass
+	;; Overlapped offset length-16
+	mov	tmp, len		;Backup len as len=rdi
+
+	XLDR	x0, [src+tmp]		;Get next source vector
+	XLDR	xd1, [dest4+tmp]	;Get next dest vector
+	XLDR	xd2, [dest5+tmp]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest6+tmp]	;reuse xtmph1. Get next dest vector
+
+	sub	len, pos
+
+	;; Build a per-byte mask selecting only the bytes not already
+	;; produced by the last full 16-byte iteration.
+	movdqa	xtmph3, [constip16]	;Load const of i + 16
+	pinsrb	xtmpl3, len.w, 15
+	pshufb	xtmpl3, xmask0f		;Broadcast len to all bytes
+	pcmpgtb	xtmpl3, xtmph3
+
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
+	;dest4
+	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
+	pand	xgft4_hi, xtmpl3
+	pxor	xd1, xgft4_hi
+
+	;dest5
+	pshufb	xgft5_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft5_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft5_hi, xgft5_lo	;GF add high and low partials
+	pand	xgft5_hi, xtmpl3
+	pxor	xd2, xgft5_hi
+
+	;dest6
+	pshufb	xgft6_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft6_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft6_hi, xgft6_lo	;GF add high and low partials
+	pand	xgft6_hi, xtmpl3
+	pxor	xd3, xgft6_hi
+
+	XSTR	[dest4+tmp], xd1	;Store result into dest4
+	XSTR	[dest5+tmp], xd2	;Store result into dest5
+	XSTR	[dest6+tmp], xd3	;Store result into dest6
+
+	;; D/E/F register copies are dead now; reuse them for the A/B/C
+	;; tables to finish dest1-dest3 of the overlapped tail pass.
+	movdqu	xgft4_lo, [tmp3]	;Load array Ax{00}, Ax{01}, Ax{02}, ...
+	movdqu	xgft4_hi, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+	movdqu	xgft5_lo, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
+	movdqu	xgft5_hi, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+	movdqu	xgft6_lo, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	movdqu	xgft6_hi, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	XLDR	xd1, [dest1+tmp]	;Get next dest vector
+	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
+	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest3 vector
+
+	;dest1
+	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft4_hi, xgft4_lo		;GF add high and low partials
+	pand	xgft4_hi, xtmpl3
+	pxor	xd1, xgft4_hi
+
+	;dest2
+	pshufb	xgft5_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft5_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft5_hi, xgft5_lo	;GF add high and low partials
+	pand	xgft5_hi, xtmpl3
+	pxor	xd2, xgft5_hi
+
+	;dest3
+	pshufb	xgft6_hi, x0		;Lookup mul table of high nibble
+	pshufb	xgft6_lo, xtmpa		;Lookup mul table of low nibble
+	pxor	xgft6_hi, xgft6_lo	;GF add high and low partials
+	pand	xgft6_hi, xtmpl3
+	pxor	xd3, xgft6_hi
+
+	XSTR	[dest1+tmp], xd1	;Store result into dest1
+	XSTR	[dest2+tmp], xd2	;Store result into dest2
+	XSTR	[dest3+tmp], xd3	;Store result into dest3
+
+.return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+.return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+;;;       func             core, ver, snum
+slversion gf_6vect_mad_sse, 00,  01,  020f
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s
index 894783f..4f06b12 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -39,12 +39,14 @@
  %define arg2  rdx
  %define arg3  rcx
  %define arg4  r8
- %define arg5  r9
 
  %define tmp   r11
  %define tmp2  r10
  %define tmp3  r9
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define func(x) x:
  %define FUNC_SAVE
@@ -62,6 +64,9 @@
  %define tmp2   r10
  %define tmp3   rdi 		; must be saved and loaded
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define frame_size 2*8
  %define arg(x)      [rsp + frame_size + PS + PS*x]
@@ -80,6 +85,67 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+
+ %define trans   ecx			;trans is for the variables in stack
+ %define arg0    trans
+ %define arg0_m  arg(0)
+ %define arg1    trans
+ %define arg1_m  arg(1)
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    ebx
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define tmp	 edx
+ %define tmp2    edi
+ %define tmp3    esi
+ %define return  eax
+ %macro SLDR 2	;; stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg3, arg(3)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
 
 %define len   arg0
 %define vec   arg1
@@ -91,6 +157,12 @@
 %define ptr   tmp3
 %define pos   return
 
+ %ifidn PS,4				;32-bit code
+	%define  vec_m 	arg1_m
+	%define  len_m 	arg0_m
+	%define  dest_m arg4_m
+ %endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR vmovdqu
@@ -106,10 +178,11 @@
  %endif
 %endif
 
+%ifidn PS,8			; 64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
-
-[bits 64]
 section .text
 
 %define xmask0f  xmm5
@@ -124,7 +197,9 @@ align 16
 global gf_vect_dot_prod_avx:function
 func(gf_vect_dot_prod_avx)
 	FUNC_SAVE
+	SLDR 	len, len_m
 	sub	len, 16
+	SSTR 	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
@@ -135,10 +210,12 @@ func(gf_vect_dot_prod_avx)
 	xor	vec_i, vec_i
 
 .next_vect:
+
 	mov	ptr, [src+vec_i*PS]
 	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 	vmovdqu	xgft_hi, [tmp+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 	XLDR	x0, [ptr+pos]		;Get next source vector
+
 	add	tmp, 32
 	add	vec_i, 1
 
@@ -150,11 +227,16 @@ func(gf_vect_dot_prod_avx)
 	vpshufb	xgft_lo, xgft_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
 	vpxor	xp, xp, xgft_hi		;xp += partial
+
+	SLDR	vec, vec_m
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR 	dest, dest_m
 	XSTR	[dest+pos], xp
+
 	add	pos, 16			;Loop on 16 bytes at a time
+	SLDR 	len, len_m
 	cmp	pos, len
 	jle	.loop16
 
@@ -182,19 +264,8 @@ section .data
 
 align 16
 
-poly:
 mask0f:
 ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                 core, ver, snum
-slversion gf_vect_dot_prod_avx, 02,  03,  0061
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_dot_prod_avx, 02,  05,  0061
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s
index f5f9287..47bb38c 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -47,7 +47,10 @@
  %define tmp2  r10
  %define tmp3  r9
  %define return rax
- %define PS 8
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
+ %define PS     8
  %define func(x) x:
  %define FUNC_SAVE
  %define FUNC_RESTORE
@@ -66,7 +69,10 @@
  %define tmp2   r10
  %define tmp3   rdi 		; must be saved and loaded
  %define return rax
- %define PS 8
+ %macro  SLDR   2
+ %endmacro
+ %define SSTR   SLDR
+ %define PS     8
  %define frame_size 2*8
  %define arg(x)      [rsp + frame_size + PS + PS*x]
 
@@ -84,6 +90,69 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+
+ %define trans   ecx			;trans is for the variables in stack
+ %define arg0    trans
+ %define arg0_m  arg(0)
+ %define arg1    trans
+ %define arg1_m  arg(1)
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    ebx
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define tmp	 edx
+ %define tmp.w   edx
+ %define tmp.b   dl
+ %define tmp2    edi
+ %define tmp3    esi
+ %define return  eax
+ %macro SLDR     2			;stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg3, arg(3)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
 
 %define len   arg0
 %define vec   arg1
@@ -95,6 +164,12 @@
 %define ptr   tmp3
 %define pos   return
 
+%ifidn PS,4				;32-bit code
+ %define  vec_m  arg1_m
+ %define  len_m  arg0_m
+ %define  dest_m arg4_m
+%endif
+
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
  %define XLDR vmovdqu
@@ -110,10 +185,11 @@
  %endif
 %endif
 
+%ifidn PS,8				;64-bit code
+ default rel
+ [bits 64]
+%endif
 
-default rel
-
-[bits 64]
 section .text
 
 %define xmask0f  ymm3
@@ -129,7 +205,9 @@ align 16
 global gf_vect_dot_prod_avx2:function
 func(gf_vect_dot_prod_avx2)
 	FUNC_SAVE
+	SLDR 	len, len_m
 	sub	len, 32
+	SSTR 	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	mov	tmp.b, 0x0f
@@ -142,6 +220,7 @@ func(gf_vect_dot_prod_avx2)
 	xor	vec_i, vec_i
 
 .next_vect:
+
 	mov	ptr, [src+vec_i*PS]
 
 	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, Cx{02}, ...
@@ -150,6 +229,7 @@ func(gf_vect_dot_prod_avx2)
 	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
 
 	XLDR	x0, [ptr+pos]		;Get next source vector
+
 	add	tmp, 32
 	add	vec_i, 1
 
@@ -161,11 +241,16 @@ func(gf_vect_dot_prod_avx2)
 	vpshufb	xgft_lo, xgft_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
 	vpxor	xp, xp, xgft_hi		;xp += partial
+
+	SLDR	vec, vec_m
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR 	dest, dest_m
 	XSTR	[dest+pos], xp
+
 	add	pos, 32			;Loop on 32 bytes at a time
+	SLDR 	len, len_m
 	cmp	pos, len
 	jle	.loop32
 
@@ -191,15 +276,5 @@ endproc_frame
 
 section .data
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                  core, ver, snum
-slversion gf_vect_dot_prod_avx2, 04,  03,  0190
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_dot_prod_avx2, 04,  05,  0190
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
index 2e13c18..f7699c1 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_vect_dot_prod_sse(len, vec, *g_tbls, **buffs, *dest);
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -44,6 +44,9 @@
  %define tmp2  r10
  %define tmp3  r9
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define func(x) x:
  %define FUNC_SAVE
@@ -61,6 +64,9 @@
  %define tmp2   r10
  %define tmp3   rdi 		; must be saved and loaded
  %define return rax
+ %macro  SLDR 2
+ %endmacro
+ %define SSTR SLDR
  %define PS 8
  %define frame_size 2*8
  %define arg(x)      [rsp + frame_size + PS + PS*x]
@@ -79,6 +85,67 @@
  %endmacro
 %endif
 
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;;	arg4
+;;;	arg3
+;;;	arg2
+;;;	arg1
+;;;	arg0
+;;;	return
+;;;<================= esp of caller
+;;;	ebp
+;;;<================= ebp = esp
+;;;	esi
+;;;	edi
+;;;	ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+
+ %define trans   ecx			;trans is for the variables in stack
+ %define arg0    trans
+ %define arg0_m  arg(0)
+ %define arg1    trans
+ %define arg1_m  arg(1)
+ %define arg2    arg2_m
+ %define arg2_m  arg(2)
+ %define arg3    ebx
+ %define arg4    trans
+ %define arg4_m  arg(4)
+ %define tmp	 edx
+ %define tmp2    edi
+ %define tmp3    esi
+ %define return  eax
+ %macro SLDR 2	;; stack load/restore
+	mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg3, arg(3)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
 
 %define len   arg0
 %define vec   arg1
@@ -90,6 +157,11 @@
 %define ptr   tmp3
 %define pos   return
 
+ %ifidn PS,4				;32-bit code
+	%define  vec_m 	arg1_m
+	%define  len_m 	arg0_m
+	%define  dest_m arg4_m
+ %endif
 
 %ifndef EC_ALIGNED_ADDR
 ;;; Use Un-aligned load/store
@@ -106,10 +178,11 @@
  %endif
 %endif
 
+%ifidn PS,8				;64-bit code
+ default rel
+  [bits 64]
+%endif
 
-default rel
-
-[bits 64]
 section .text
 
 %define xmask0f  xmm5
@@ -124,7 +197,9 @@ align 16
 global gf_vect_dot_prod_sse:function
 func(gf_vect_dot_prod_sse)
 	FUNC_SAVE
+	SLDR 	len, len_m
 	sub	len, 16
+	SSTR 	len_m, len
 	jl	.return_fail
 	xor	pos, pos
 	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
@@ -135,26 +210,34 @@ func(gf_vect_dot_prod_sse)
 	xor	vec_i, vec_i
 
 .next_vect:
+
 	mov	ptr, [src+vec_i*PS]
 	movdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
 	movdqu	xgft_hi, [tmp+16]	;     "     Cx{00}, Cx{10}, ..., Cx{f0}
 	XLDR	x0, [ptr+pos]		;Get next source vector
+
 	add	tmp, 32
 	add	vec_i, 1
+
 	movdqa	xtmpa, x0		;Keep unshifted copy of src
 	psraw	x0, 4			;Shift to put high nibble into bits 4-0
 	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
 	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+
 	pshufb	xgft_hi, x0		;Lookup mul table of high nibble
 	pshufb	xgft_lo, xtmpa		;Lookup mul table of low nibble
 	pxor	xgft_hi, xgft_lo	;GF add high and low partials
 	pxor	xp, xgft_hi		;xp += partial
+
+	SLDR 	vec, vec_m
 	cmp	vec_i, vec
 	jl	.next_vect
 
+	SLDR 	dest, dest_m
 	XSTR	[dest+pos], xp
 
 	add	pos, 16			;Loop on 16 bytes at a time
+	SLDR 	len, len_m
 	cmp	pos, len
 	jle	.loop16
 
@@ -181,17 +264,8 @@ endproc_frame
 section .data
 
 align 16
+
 mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func                 core, ver, snum
-slversion gf_vect_dot_prod_sse, 00,  03,  0060
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_dot_prod_sse, 00,  05,  0060
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s
new file mode 100644
index 0000000..f0fd91a
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s
@@ -0,0 +1,196 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+ %define PS 8
+ %define stack_size 16*3 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	vmovdqa	[rsp+16*0],xmm6
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	save_reg	r12,  3*16 + 0*8
+	save_reg	r15,  3*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	mov	r12,  [rsp + 3*16 + 0*8]
+	mov	r15,  [rsp + 3*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest  arg5
+%define pos   return
+%define pos.w return.w
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm8
+%define xgft_lo  xmm7
+%define xgft_hi  xmm6
+
+%define x0     xmm0
+%define xtmpa  xmm1
+%define xtmph  xmm2
+%define xtmpl  xmm3
+%define xd     xmm4
+%define xtmpd  xmm5
+
+align 16
+global gf_vect_mad_avx:function
+func(gf_vect_mad_avx)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+
+	xor	pos, pos
+	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+
+	sal	vec_i, 5		;Multiply by 32
+	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	vmovdqu	xgft_hi, [vec_i+mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+
+	XLDR	xtmpd, [dest+len]	;backup the last 16 bytes in dest
+
+.loop16:
+	XLDR	xd, [dest+pos]		;Get next dest vector
+.loop16_overlap:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vpshufb	xtmph, xgft_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl ;GF add high and low partials
+	vpxor	xd, xd, xtmph		;xd += partial
+
+	XSTR	[dest+pos], xd
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-16
+	vmovdqa	xd, xtmpd	;Restore xd
+	jmp	.loop16_overlap	;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+;;;       func            core, ver, snum
+slversion gf_vect_mad_avx, 02,  01,  0201
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s
new file mode 100644
index 0000000..5fa5da4
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s
@@ -0,0 +1,203 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg0.w ecx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+ %define arg4   r12 		; must be saved and loaded
+ %define arg5   r15
+
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define return rax
+ %define return.w eax
+ %define PS 8
+ %define stack_size 16*3 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	vmovdqa	[rsp+16*0],xmm6
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	save_reg	r12,  3*16 + 0*8
+	save_reg	r15,  3*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	mov	r12,  [rsp + 3*16 + 0*8]
+	mov	r15,  [rsp + 3*16 + 1*8]
+	add	rsp, stack_size
+ %endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+
+ %define tmp      r11
+ %define tmp.w    r11d
+ %define tmp.b    r11b
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+
+;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec   arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest  arg5
+%define pos   return
+%define pos.w return.w
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa
+  %define XSTR vmovdqa
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  ymm8
+%define xmask0fx xmm8
+%define xgft_lo  ymm7
+%define xgft_hi  ymm6
+
+%define x0     ymm0
+%define xtmpa  ymm1
+%define xtmph  ymm2
+%define xtmpl  ymm3
+%define xd     ymm4
+%define xtmpd  ymm5
+
+align 16
+global gf_vect_mad_avx2:function
+func(gf_vect_mad_avx2)
+	FUNC_SAVE
+	sub	len, 32
+	jl	.return_fail
+	xor	pos, pos
+	mov	tmp.b, 0x0f
+	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
+	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
+
+	sal	vec_i, 5		;Multiply by 32
+	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+						; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
+	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
+
+	XLDR	xtmpd, [dest+len]	;backup the last 32 bytes in dest
+
+.loop32:
+	XLDR	xd, [dest+pos]		;Get next dest vector
+.loop32_overlap:
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vpshufb	xtmph, xgft_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl, xgft_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph, xtmph, xtmpl ;GF add high and low partials
+	vpxor	xd, xd, xtmph		;xd += partial
+
+	XSTR	[dest+pos], xd
+	add	pos, 32			;Loop on 32 bytes at a time
+	cmp	pos, len
+	jle	.loop32
+
+	lea	tmp, [len + 32]
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-32
+	vmovdqa	xd, xtmpd	;Restore xd
+	jmp	.loop32_overlap	;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;;       func             core, ver, snum
+slversion gf_vect_mad_avx2, 04,  01,  0202
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s
new file mode 100644
index 0000000..b3ebc97
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s
@@ -0,0 +1,197 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg0.w ecx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define arg4  r12
+ %define arg5  r15
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+ %define PS 8
+ %define stack_size 16*3 + 3*8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	save_reg	r12,  3*16 + 0*8
+	save_reg	r15,  3*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)
+	mov	arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	mov	r12,  [rsp + 3*16 + 0*8]
+	mov	r15,  [rsp + 3*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg0.w edi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0
+%define len.w arg0.w
+%define vec    arg1
+%define vec_i    arg2
+%define mul_array arg3
+%define	src   arg4
+%define dest  arg5
+%define pos   return
+%define pos.w return.w
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+  %define XLDR movdqa
+  %define XSTR movdqa
+ %else
+  %define XLDR movntdqa
+  %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f  xmm8
+%define xgft_lo  xmm7
+%define xgft_hi  xmm6
+
+%define x0     xmm0
+%define xtmpa  xmm1
+%define xtmph  xmm2
+%define xtmpl  xmm3
+%define xd     xmm4
+%define xtmpd  xmm5
+
+
+align 16
+global gf_vect_mad_sse:function
+func(gf_vect_mad_sse)
+	FUNC_SAVE
+	sub	len, 16
+	jl	.return_fail
+
+	xor	pos, pos
+	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
+	sal	vec_i, 5		;Multiply by 32
+	movdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
+	movdqu	xgft_hi, [vec_i+mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+
+	XLDR	xtmpd, [dest+len]	;backup the last 16 bytes in dest
+
+.loop16:
+	XLDR	xd, [dest+pos]		;Get next dest vector
+.loop16_overlap:
+	XLDR	x0, [src+pos]		;Get next source vector
+	movdqa	xtmph, xgft_hi		;Reload const array registers
+	movdqa	xtmpl, xgft_lo
+	movdqa	xtmpa, x0		;Keep unshifted copy of src
+	psraw	x0, 4			;Shift to put high nibble into bits 4-0
+	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
+	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
+	pshufb	xtmph, x0		;Lookup mul table of high nibble
+	pshufb	xtmpl, xtmpa		;Lookup mul table of low nibble
+	pxor	xtmph, xtmpl		;GF add high and low partials
+
+	pxor	xd, xtmph
+	XSTR	[dest+pos], xd		;Store result
+
+	add	pos, 16			;Loop on 16 bytes at a time
+	cmp	pos, len
+	jle	.loop16
+
+	lea	tmp, [len + 16]
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-16
+	movdqa	xd, xtmpd	;Restore xd
+	jmp	.loop16_overlap	;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+;;;       func            core, ver, snum
+slversion gf_vect_mad_sse, 00,  01,  0200
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
index 0536ed7..c9438b1 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_vect_mul_avx(len, mul_array, src, dest)
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -160,15 +160,5 @@ align 16
 mask0f:
 ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func             core, ver, snum
-slversion gf_vect_mul_avx, 01,   02,  0036
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_mul_avx, 01,   03,  0036
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s
index c6d7d58..2a14cc9 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -30,8 +30,8 @@
 ;;;
 ;;; gf_vect_mul_sse(len, mul_array, src, dest)
 ;;;
-;;; Author: Gregory Tucker
 
+%include "reg_sizes.asm"
 
 %ifidn __OUTPUT_FORMAT__, elf64
  %define arg0  rdi
@@ -166,15 +166,5 @@ align 16
 mask0f:
 ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 
-%macro slversion 4
-global %1_slver_%2%3%4
-global %1_slver
-%1_slver:
-%1_slver_%2%3%4:
-	dw 0x%4
-	db 0x%3, 0x%2
-%endmacro
 ;;;       func        core, ver, snum
-slversion gf_vect_mul_sse, 00,   02,  0034
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_mul_sse, 00,   03,  0034
diff --git a/src/erasure-code/isa/isa-l/include/erasure_code.h b/src/erasure-code/isa/isa-l/include/erasure_code.h
index 0f3b6db..53e480f 100644
--- a/src/erasure-code/isa/isa-l/include/erasure_code.h
+++ b/src/erasure-code/isa/isa-l/include/erasure_code.h
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -74,73 +74,128 @@ extern "C" {
 void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
 
 /**
- * @brief Generate or decode erasure codes on blocks of data.
+ * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
  *
  * Given a list of source data blocks, generate one or multiple blocks of
  * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
  * suitable set of coefficients, this function will perform the fast generation
  * or decoding of Reed-Solomon type erasure codes.
  *
- * @requires SSE4.1
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
  * @param len    Length of each block of data (vector) of source or dest data.
  * @param k      The number of vector sources or rows in the generator matrix
  * 		 for coding.
  * @param rows   The number of output vectors to concurrently encode/decode.
  * @param gftbls Pointer to array of input tables generated from coding
- *               coefficients in ec_init_tables(). Must be of size 32*k*rows
+ * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
  * @param data   Array of pointers to source input buffers.
  * @param coding Array of pointers to coded output buffers.
  * @returns none
  */
 
-void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding);
+void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+		    unsigned char **coding);
 
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires SSE4.1
+ */
+void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			unsigned char **coding);
 
 /**
- * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
+ * @brief Generate or decode erasure codes on blocks of data.
  *
- * Given a list of source data blocks, generate one or multiple blocks of
- * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
- * suitable set of coefficients, this function will perform the fast generation
- * or decoding of Reed-Solomon type erasure codes.
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX
+ */
+void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
  *
- * This function determines what instruction sets are enabled and
- * selects the appropriate version at runtime.
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX2
+ */
+void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			 unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ *
+ * Baseline version of ec_encode_data() with same parameters.
+ */
+void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
+			 unsigned char **dest);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
+ *
+ * Given one source data block, update one or multiple blocks of encoded data as
+ * specified by a matrix of GF(2^8) coefficients. When given a suitable set of
+ * coefficients, this function will perform the fast generation or decoding of
+ * Reed-Solomon type erasure codes from one input source at a time.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
  *
  * @param len    Length of each block of data (vector) of source or dest data.
  * @param k      The number of vector sources or rows in the generator matrix
  * 		 for coding.
  * @param rows   The number of output vectors to concurrently encode/decode.
- * @param gftbls Pointer to array of input tables generated from coding
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param g_tbls Pointer to array of input tables generated from coding
  * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
- * @param data   Array of pointers to source input buffers.
+ * @param data   Pointer to single input source used to update output parity.
  * @param coding Array of pointers to coded output buffers.
  * @returns none
  */
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			   unsigned char *data, unsigned char **coding);
 
-void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding);
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires SSE4.1
+ */
 
+void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding);
 
 /**
- * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ * @brief Generate update for encode or decode of erasure codes from single source.
  *
- * Given a list of source data blocks, generate one or multiple blocks of
- * encoded data as specified by a matrix of GF(2^8) coefficients.  When given a
- * suitable set of coefficients, this function will perform the fast generation
- * or decoding of Reed-Solomon type erasure codes.
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX
+ */
+
+void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
  *
- * @param len    Length of each block of data (vector) of source or dest data.
- * @param srcs   The number of vector sources or rows in the generator matrix
- * 		 for coding.
- * @param dests  The number of output vectors to concurrently encode/decode.
- * @param v      Pointer to array of input tables generated from coding
- * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
- * @param src    Array of pointers to source input buffers.
- * @param dest   Array of pointers to coded output buffers.
- * @returns none
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX2
+ */
+
+void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+				unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Baseline version of ec_encode_data_update().
  */
 
-void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, unsigned char **dest);
+void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
+				unsigned char *data, unsigned char **dest);
 
 
 /**
@@ -150,8 +205,8 @@ void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigne
  * set of coefficients to produce each byte of the output. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 32*vlen byte constant array based on the input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -171,8 +226,8 @@ void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * set of coefficients to produce each byte of the output. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 32*vlen byte constant array based on the input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -192,8 +247,8 @@ void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * set of coefficients to produce each byte of the output. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 32*vlen byte constant array based on the input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -214,8 +269,8 @@ void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -236,8 +291,8 @@ void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -258,8 +313,8 @@ void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -280,8 +335,8 @@ void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -302,8 +357,8 @@ void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -324,8 +379,8 @@ void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -346,8 +401,8 @@ void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -368,8 +423,8 @@ void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -390,8 +445,8 @@ void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -412,8 +467,8 @@ void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -434,8 +489,8 @@ void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -456,8 +511,8 @@ void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -478,8 +533,8 @@ void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -500,8 +555,8 @@ void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -522,8 +577,8 @@ void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -582,6 +637,224 @@ void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
 void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
                         unsigned char **src, unsigned char *dest);
 
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, runs appropriate version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		 unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires SSE4.1
+ */
+
+void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		     unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX
+ */
+
+void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		     unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX2
+ */
+
+void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, baseline version.
+ *
+ * Baseline version of gf_vect_mad() with same parameters.
+ */
+
+void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
+		      unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate.  SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
+ * @requires AVX
+ */
+void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
+ * @requires AVX
+ */
+void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
+ * @requires AVX
+ */
+void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+
 /**********************************************************************
  * The remaining are lib support functions used in GF(2^8) operations.
  */
@@ -650,6 +923,7 @@ void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
 
 int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
 
+
 /*************************************************************/
 
 #ifdef __cplusplus
diff --git a/src/erasure-code/isa/isa-l/include/gf_vect_mul.h b/src/erasure-code/isa/isa-l/include/gf_vect_mul.h
index ef19845..bf4fd01 100644
--- a/src/erasure-code/isa/isa-l/include/gf_vect_mul.h
+++ b/src/erasure-code/isa/isa-l/include/gf_vect_mul.h
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -52,8 +52,8 @@ extern "C" {
  * 32-element constant array based on constant C. gftbl(C) = {C{00},
  * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
  * and src must be aligned to 32B.
-
  * @requires SSE4.1
+ *
  * @param len   Length of vector in bytes. Must be aligned to 32B.
  * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
  * @param src   Pointer to src data array. Must be aligned to 32B.
@@ -73,8 +73,8 @@ int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest);
  * 32-element constant array based on constant C. gftbl(C) = {C{00},
  * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
  * and src must be aligned to 32B.
-
  * @requires AVX
+ *
  * @param len   Length of vector in bytes. Must be aligned to 32B.
  * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
  * @param src   Pointer to src data array. Must be aligned to 32B.
diff --git a/src/erasure-code/isa/isa-l/include/reg_sizes.asm b/src/erasure-code/isa/isa-l/include/reg_sizes.asm
index ed21252..650c1fe 100644
--- a/src/erasure-code/isa/isa-l/include/reg_sizes.asm
+++ b/src/erasure-code/isa/isa-l/include/reg_sizes.asm
@@ -1,5 +1,5 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
@@ -27,6 +27,9 @@
 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+%ifndef _REG_SIZES_ASM_
+%define _REG_SIZES_ASM_
+
 %define EFLAGS_HAS_CPUID        (1<<21)
 %define FLAG_CPUID1_ECX_CLMUL   (1<<1)
 %define FLAG_CPUID1_EDX_SSE2    (1<<26)
@@ -94,3 +97,27 @@
 %define BYTE(reg)  reg %+ b
 
 %define XWORD(reg) reg %+ x
+
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__, macho64
+%define elf64 macho64
+%endif
+
+%macro slversion 4
+	section .text
+	global %1_slver_%2%3%4
+	global %1_slver
+	%1_slver:
+	%1_slver_%2%3%4:
+		dw 0x%4
+		db 0x%3, 0x%2
+%endmacro
+
+%endif ; ifndef _REG_SIZES_ASM_
diff --git a/src/erasure-code/isa/isa-l/include/types.h b/src/erasure-code/isa/isa-l/include/types.h
index 0feed47..f5775ef 100644
--- a/src/erasure-code/isa/isa-l/include/types.h
+++ b/src/erasure-code/isa/isa-l/include/types.h
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@
 extern "C" {
 #endif
 
-#ifndef __unix__
+#ifdef __WIN32__
 #ifdef __MINGW32__
 # include <_mingw.h>
 #endif
@@ -59,12 +59,20 @@ typedef unsigned char      UINT8;
 #endif
 
 
-#ifdef __unix__
+#if defined  __unix__ || defined __APPLE__
 # define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
 # define __forceinline static inline
+# define aligned_free(x) free(x)
 #else
-# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
-# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# ifdef __MINGW32__
+#   define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+#   define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+#   define aligned_free(x) _aligned_free(x)
+# else
+#   define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
+#   define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+#   define aligned_free(x) _aligned_free(x)
+# endif
 #endif
 
 #ifdef DEBUG
diff --git a/src/erasure-code/jerasure/ErasureCodeJerasure.cc b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
index 4720d21..408f51f 100644
--- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc
+++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
@@ -52,36 +52,40 @@ int ErasureCodeJerasure::create_ruleset(const string &name,
   }
 }
 
-void ErasureCodeJerasure::init(const map<string,string> &parameters)
+int ErasureCodeJerasure::init(ErasureCodeProfile& profile, ostream *ss)
 {
+  int err = 0;
   dout(10) << "technique=" << technique << dendl;
-  map<string,string>::const_iterator parameter;
-  parameter = parameters.find("ruleset-root");
-  if (parameter != parameters.end())
-    ruleset_root = parameter->second;
-  parameter = parameters.find("ruleset-failure-domain");
-  if (parameter != parameters.end())
-    ruleset_failure_domain = parameter->second;
-  ostringstream ss;
-  if (parse(parameters, &ss))
-    derr << ss.str() << dendl;
+  profile["technique"] = technique;
+  err |= to_string("ruleset-root", profile,
+		   &ruleset_root,
+		   DEFAULT_RULESET_ROOT, ss);
+  err |= to_string("ruleset-failure-domain", profile,
+		   &ruleset_failure_domain,
+		   DEFAULT_RULESET_FAILURE_DOMAIN, ss);
+  err |= parse(profile, ss);
+  if (err)
+    return err;
   prepare();
+  ErasureCode::init(profile, ss);
+  return err;
 }
 
-int ErasureCodeJerasure::parse(const map<std::string,std::string> &parameters,
+int ErasureCodeJerasure::parse(ErasureCodeProfile &profile,
 			       ostream *ss)
 {
-  int err = ErasureCode::parse(parameters, ss);
-  err |= to_int("k", parameters, &k, DEFAULT_K, ss);
-  err |= to_int("m", parameters, &m, DEFAULT_M, ss);
-  err |= to_int("w", parameters, &w, DEFAULT_W, ss);
+  int err = ErasureCode::parse(profile, ss);
+  err |= to_int("k", profile, &k, DEFAULT_K, ss);
+  err |= to_int("m", profile, &m, DEFAULT_M, ss);
+  err |= to_int("w", profile, &w, DEFAULT_W, ss);
   if (chunk_mapping.size() > 0 && (int)chunk_mapping.size() != k + m) {
-    *ss << "mapping " << parameters.find("mapping")->second
+    *ss << "mapping " << profile.find("mapping")->second
 	<< " maps " << chunk_mapping.size() << " chunks instead of"
 	<< " the expected " << k + m << " and will be ignored" << std::endl;
     chunk_mapping.clear();
     err = -EINVAL;
   }
+  err |= sanity_check_k(k, ss);
   return err;
 }
 
@@ -191,18 +195,20 @@ unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment() const
   }
 }
 
-int ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters,
-						      ostream *ss)
+int ErasureCodeJerasureReedSolomonVandermonde::parse(ErasureCodeProfile &profile,
+						     ostream *ss)
 {
-  int err = ErasureCodeJerasure::parse(parameters, ss);
+  int err = 0;
+  err |= ErasureCodeJerasure::parse(profile, ss);
   if (w != 8 && w != 16 && w != 32) {
     *ss << "ReedSolomonVandermonde: w=" << w
-	<< " must be one of {8, 16, 32} : revert to DEFAULT_W " << std::endl;
-    w = DEFAULT_W;
+	<< " must be one of {8, 16, 32} : revert to " << DEFAULT_W << std::endl;
+    profile["w"] = "8";
+    err |= to_int("w", profile, &w, DEFAULT_W, ss);
     err = -EINVAL;
   }
-  err |= to_bool("jerasure-per-chunk-alignment", parameters,
-		 &per_chunk_alignment, false, ss);
+  err |= to_bool("jerasure-per-chunk-alignment", profile,
+		 &per_chunk_alignment, "false", ss);
   return err;
 }
 
@@ -241,15 +247,17 @@ unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment() const
   }
 }
 
-int ErasureCodeJerasureReedSolomonRAID6::parse(const map<std::string,std::string> &parameters,
+int ErasureCodeJerasureReedSolomonRAID6::parse(ErasureCodeProfile &profile,
 					       ostream *ss)
 {
-  int err = ErasureCodeJerasure::parse(parameters, ss);
+  int err = ErasureCodeJerasure::parse(profile, ss);
+  profile.erase("m");
   m = 2;
   if (w != 8 && w != 16 && w != 32) {
     *ss << "ReedSolomonRAID6: w=" << w
 	<< " must be one of {8, 16, 32} : revert to 8 " << std::endl;
-    w = 8;
+    profile["w"] = "8";
+    err |= to_int("w", profile, &w, DEFAULT_W, ss);
     err = -EINVAL;
   }
   return err;
@@ -296,13 +304,13 @@ unsigned ErasureCodeJerasureCauchy::get_alignment() const
   }  
 }
 
-int ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parameters,
+int ErasureCodeJerasureCauchy::parse(ErasureCodeProfile &profile,
 				     ostream *ss)
 {
-  int err = ErasureCodeJerasure::parse(parameters, ss);
-  err |= to_int("packetsize", parameters, &packetsize, DEFAULT_PACKETSIZE, ss);
-  err |= to_bool("jerasure-per-chunk-alignment", parameters,
-		 &per_chunk_alignment, false, ss);
+  int err = ErasureCodeJerasure::parse(profile, ss);
+  err |= to_int("packetsize", profile, &packetsize, DEFAULT_PACKETSIZE, ss);
+  err |= to_bool("jerasure-per-chunk-alignment", profile,
+		 &per_chunk_alignment, "false", ss);
   return err;
 }
 
@@ -409,20 +417,26 @@ bool ErasureCodeJerasureLiberation::check_packetsize(ostream *ss) const
   }
 }
 
-void ErasureCodeJerasureLiberation::revert_to_default(ostream *ss)
+int ErasureCodeJerasureLiberation::revert_to_default(ErasureCodeProfile &profile,
+						     ostream *ss)
 {
+  int err = 0;
   *ss << "reverting to k=" << DEFAULT_K << ", w="
       << DEFAULT_W << ", packetsize=" << DEFAULT_PACKETSIZE << std::endl;
-  k = DEFAULT_K;
-  w = DEFAULT_W;
-  packetsize = DEFAULT_PACKETSIZE;
+  profile["k"] = DEFAULT_K;
+  err |= to_int("k", profile, &k, DEFAULT_K, ss);
+  profile["w"] = DEFAULT_W;
+  err |= to_int("w", profile, &w, DEFAULT_W, ss);
+  profile["packetsize"] = DEFAULT_PACKETSIZE;
+  err |= to_int("packetsize", profile, &packetsize, DEFAULT_PACKETSIZE, ss);
+  return err;
 }
 
-int ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &parameters,
+int ErasureCodeJerasureLiberation::parse(ErasureCodeProfile &profile,
 					 ostream *ss)
 {
-  int err = ErasureCodeJerasure::parse(parameters, ss);
-  err |= to_int("packetsize", parameters, &packetsize, DEFAULT_PACKETSIZE, ss);
+  int err = ErasureCodeJerasure::parse(profile, ss);
+  err |= to_int("packetsize", profile, &packetsize, DEFAULT_PACKETSIZE, ss);
 
   bool error = false;
   if (!check_k(ss))
@@ -432,7 +446,7 @@ int ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &par
   if (!check_packetsize_set(ss) || !check_packetsize(ss))
     error = true;
   if (error) {
-    revert_to_default(ss);
+    revert_to_default(profile, ss);
     err = -EINVAL;
   }
   return err;
@@ -471,13 +485,15 @@ void ErasureCodeJerasureBlaumRoth::prepare()
 // 
 // ErasureCodeJerasureLiber8tion
 //
-int ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &parameters,
+int ErasureCodeJerasureLiber8tion::parse(ErasureCodeProfile &profile,
 					 ostream *ss)
 {
-  int err = ErasureCodeJerasure::parse(parameters, ss);
-  m = DEFAULT_M;
-  w = DEFAULT_W;
-  err |= to_int("packetsize", parameters, &packetsize, DEFAULT_PACKETSIZE, ss);
+  int err = ErasureCodeJerasure::parse(profile, ss);
+  profile.erase("m");
+  err |= to_int("m", profile, &m, DEFAULT_M, ss);
+  profile.erase("w");
+  err |= to_int("w", profile, &w, DEFAULT_W, ss);
+  err |= to_int("packetsize", profile, &packetsize, DEFAULT_PACKETSIZE, ss);
 
   bool error = false;
   if (!check_k(ss))
@@ -485,7 +501,7 @@ int ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &par
   if (!check_packetsize_set(ss))
     error = true;
   if (error) {
-    revert_to_default(ss);
+    revert_to_default(profile, ss);
     err = -EINVAL;
   }
   return err;
diff --git a/src/erasure-code/jerasure/ErasureCodeJerasure.h b/src/erasure-code/jerasure/ErasureCodeJerasure.h
index b7a2358..df60dac 100644
--- a/src/erasure-code/jerasure/ErasureCodeJerasure.h
+++ b/src/erasure-code/jerasure/ErasureCodeJerasure.h
@@ -20,14 +20,17 @@
 
 #include "erasure-code/ErasureCode.h"
 
+#define DEFAULT_RULESET_ROOT "default"
+#define DEFAULT_RULESET_FAILURE_DOMAIN "host"
+
 class ErasureCodeJerasure : public ErasureCode {
 public:
   int k;
-  int DEFAULT_K;
+  std::string DEFAULT_K;
   int m;
-  int DEFAULT_M;
+  std::string DEFAULT_M;
   int w;
-  int DEFAULT_W;
+  std::string DEFAULT_W;
   const char *technique;
   string ruleset_root;
   string ruleset_failure_domain;
@@ -35,22 +38,19 @@ public:
 
   ErasureCodeJerasure(const char *_technique) :
     k(0),
-    DEFAULT_K(2),
+    DEFAULT_K("2"),
     m(0),
-    DEFAULT_M(1),
+    DEFAULT_M("1"),
     w(0),
-    DEFAULT_W(8),
+    DEFAULT_W("8"),
     technique(_technique),
-    ruleset_root("default"),
-    ruleset_failure_domain("host"),
+    ruleset_root(DEFAULT_RULESET_ROOT),
+    ruleset_failure_domain(DEFAULT_RULESET_FAILURE_DOMAIN),
     per_chunk_alignment(false)
   {}
 
   virtual ~ErasureCodeJerasure() {}
   
-  virtual int parse(const map<std::string,std::string> &parameters,
-		    ostream *ss);
-
   virtual int create_ruleset(const string &name,
 			     CrushWrapper &crush,
 			     ostream *ss) const;
@@ -72,7 +72,8 @@ public:
 			    const map<int, bufferlist> &chunks,
 			    map<int, bufferlist> *decoded);
 
-  void init(const map<std::string,std::string> &parameters);
+  virtual int init(ErasureCodeProfile &profile, ostream *ss);
+
   virtual void jerasure_encode(char **data,
                                char **coding,
                                int blocksize) = 0;
@@ -83,6 +84,8 @@ public:
   virtual unsigned get_alignment() const = 0;
   virtual void prepare() = 0;
   static bool is_prime(int value);
+protected:
+  virtual int parse(ErasureCodeProfile &profile, ostream *ss);
 };
 
 class ErasureCodeJerasureReedSolomonVandermonde : public ErasureCodeJerasure {
@@ -93,9 +96,9 @@ public:
     ErasureCodeJerasure("reed_sol_van"),
     matrix(0)
   {
-    DEFAULT_K = 7;
-    DEFAULT_M = 3;
-    DEFAULT_W = 8;
+    DEFAULT_K = "7";
+    DEFAULT_M = "3";
+    DEFAULT_W = "8";
   }
   virtual ~ErasureCodeJerasureReedSolomonVandermonde() {
     if (matrix)
@@ -110,9 +113,9 @@ public:
                                char **coding,
                                int blocksize);
   virtual unsigned get_alignment() const;
-  virtual int parse(const map<std::string,std::string> &parameters,
-		    ostream *ss);
   virtual void prepare();
+private:
+  virtual int parse(ErasureCodeProfile &profile, ostream *ss);
 };
 
 class ErasureCodeJerasureReedSolomonRAID6 : public ErasureCodeJerasure {
@@ -123,8 +126,8 @@ public:
     ErasureCodeJerasure("reed_sol_r6_op"),
     matrix(0)
   {
-    DEFAULT_K = 7;
-    DEFAULT_W = 8;
+    DEFAULT_K = "7";
+    DEFAULT_W = "8";
   }
   virtual ~ErasureCodeJerasureReedSolomonRAID6() {
     if (matrix)
@@ -139,14 +142,15 @@ public:
                                char **coding,
                                int blocksize);
   virtual unsigned get_alignment() const;
-  virtual int parse(const map<std::string,std::string> &parameters,
-		    ostream *ss);
   virtual void prepare();
+private:
+  virtual int parse(ErasureCodeProfile &profile, ostream *ss);
 };
 
+#define DEFAULT_PACKETSIZE "2048"
+
 class ErasureCodeJerasureCauchy : public ErasureCodeJerasure {
 public:
-  static const int DEFAULT_PACKETSIZE = 2048;
   int *bitmatrix;
   int **schedule;
   int packetsize;
@@ -156,9 +160,9 @@ public:
     bitmatrix(0),
     schedule(0)
   {
-    DEFAULT_K = 7;
-    DEFAULT_M = 3;
-    DEFAULT_W = 8;
+    DEFAULT_K = "7";
+    DEFAULT_M = "3";
+    DEFAULT_W = "8";
   }
   virtual ~ErasureCodeJerasureCauchy() {
     if (bitmatrix)
@@ -175,9 +179,9 @@ public:
                                char **coding,
                                int blocksize);
   virtual unsigned get_alignment() const;
-  virtual int parse(const map<std::string,std::string> &parameters,
-		    ostream *ss);
   void prepare_schedule(int *matrix);
+private:
+  virtual int parse(ErasureCodeProfile &profile, ostream *ss);
 };
 
 class ErasureCodeJerasureCauchyOrig : public ErasureCodeJerasureCauchy {
@@ -200,7 +204,6 @@ public:
 
 class ErasureCodeJerasureLiberation : public ErasureCodeJerasure {
 public:
-  static const int DEFAULT_PACKETSIZE = 2048;
   int *bitmatrix;
   int **schedule;
   int packetsize;
@@ -210,9 +213,9 @@ public:
     bitmatrix(0),
     schedule(0)
   {
-    DEFAULT_K = 2;
-    DEFAULT_M = 2;
-    DEFAULT_W = 7;
+    DEFAULT_K = "2";
+    DEFAULT_M = "2";
+    DEFAULT_W = "7";
   }
   virtual ~ErasureCodeJerasureLiberation();
 
@@ -228,10 +231,11 @@ public:
   virtual bool check_w(ostream *ss) const;
   virtual bool check_packetsize_set(ostream *ss) const;
   virtual bool check_packetsize(ostream *ss) const;
-  virtual void revert_to_default(ostream *ss);
-  virtual int parse(const map<std::string,std::string> &parameters,
-		    ostream *ss);
+  virtual int revert_to_default(ErasureCodeProfile &profile,
+				ostream *ss);
   virtual void prepare();
+private:
+  virtual int parse(ErasureCodeProfile &profile, ostream *ss);
 };
 
 class ErasureCodeJerasureBlaumRoth : public ErasureCodeJerasureLiberation {
@@ -250,14 +254,14 @@ public:
   ErasureCodeJerasureLiber8tion() :
     ErasureCodeJerasureLiberation("liber8tion")
   {
-    DEFAULT_K = 2;
-    DEFAULT_M = 2;
-    DEFAULT_W = 8;
+    DEFAULT_K = "2";
+    DEFAULT_M = "2";
+    DEFAULT_W = "8";
   }
 
-  virtual int parse(const map<std::string,std::string> &parameters,
-		    ostream *ss);
   virtual void prepare();
+private:
+  virtual int parse(ErasureCodeProfile &profile, ostream *ss);
 };
 
 #endif
diff --git a/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc b/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc
index e5f8b83..b3d03b5 100644
--- a/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc
+++ b/src/erasure-code/jerasure/ErasureCodePluginJerasure.cc
@@ -31,12 +31,14 @@ static ostream& _prefix(std::ostream* _dout)
 
 class ErasureCodePluginJerasure : public ErasureCodePlugin {
 public:
-  virtual int factory(const map<std::string,std::string> &parameters,
-		      ErasureCodeInterfaceRef *erasure_code) {
+  virtual int factory(const std::string& directory,
+		      ErasureCodeProfile &profile,
+		      ErasureCodeInterfaceRef *erasure_code,
+		      ostream *ss) {
     ErasureCodeJerasure *interface;
     std::string t;
-    if (parameters.find("technique") != parameters.end())
-      t = parameters.find("technique")->second;
+    if (profile.find("technique") != profile.end())
+      t = profile.find("technique")->second;
     if (t == "reed_sol_van") {
       interface = new ErasureCodeJerasureReedSolomonVandermonde();
     } else if (t == "reed_sol_r6_op") {
@@ -59,7 +61,12 @@ public:
 	   << dendl;
       return -ENOENT;
     }
-    interface->init(parameters);
+    dout(20) << __func__ << ": " << profile << dendl;
+    int r = interface->init(profile, ss);
+    if (r) {
+      delete interface;
+      return r;
+    }
     *erasure_code = ErasureCodeInterfaceRef(interface);
     return 0;
   }
diff --git a/src/erasure-code/jerasure/ErasureCodePluginSelectJerasure.cc b/src/erasure-code/jerasure/ErasureCodePluginSelectJerasure.cc
index 808cf01..96c2280 100644
--- a/src/erasure-code/jerasure/ErasureCodePluginSelectJerasure.cc
+++ b/src/erasure-code/jerasure/ErasureCodePluginSelectJerasure.cc
@@ -54,26 +54,27 @@ static string get_variant() {
 
 class ErasureCodePluginSelectJerasure : public ErasureCodePlugin {
 public:
-  virtual int factory(const map<string,string> &parameters,
-		      ErasureCodeInterfaceRef *erasure_code) {
+  virtual int factory(const std::string &directory,
+		      ErasureCodeProfile &profile,
+		      ErasureCodeInterfaceRef *erasure_code,
+		      ostream *ss) {
     ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-    stringstream ss;
     int ret;
     string name = "jerasure";
-    if (parameters.count("jerasure-name"))
-      name = parameters.find("jerasure-name")->second;
-    if (parameters.count("jerasure-variant")) {
+    if (profile.count("jerasure-name"))
+      name = profile.find("jerasure-name")->second;
+    if (profile.count("jerasure-variant")) {
       dout(10) << "jerasure-variant " 
-	       << parameters.find("jerasure-variant")->second << dendl;
-      ret = instance.factory(name + "_" + parameters.find("jerasure-variant")->second,
-			     parameters, erasure_code, ss);
+	       << profile.find("jerasure-variant")->second << dendl;
+      ret = instance.factory(name + "_" + profile.find("jerasure-variant")->second,
+			     directory,
+			     profile, erasure_code, ss);
     } else {
       string variant = get_variant();
       dout(10) << variant << " plugin" << dendl;
-      ret = instance.factory(name + "_" + variant, parameters, erasure_code, ss);
+      ret = instance.factory(name + "_" + variant, directory,
+			     profile, erasure_code, ss);
     }
-    if (ret)
-      derr << ss.str() << dendl;
     return ret;
   }
 };
@@ -87,7 +88,7 @@ int __erasure_code_init(char *plugin_name, char *directory)
   ErasureCodePlugin *plugin;
   stringstream ss;
   int r = instance.load(plugin_name + string("_") + variant,
-			directory, &plugin, ss);
+			directory, &plugin, &ss);
   if (r) {
     derr << ss.str() << dendl;
     return r;
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf.c b/src/erasure-code/jerasure/gf-complete/src/gf.c
index 01a385b..835fb12 100644
--- a/src/erasure-code/jerasure/gf-complete/src/gf.c
+++ b/src/erasure-code/jerasure/gf-complete/src/gf.c
@@ -119,7 +119,7 @@ void gf_error()
 uint64_t gf_composite_get_default_poly(gf_t *base) 
 {
   gf_internal_t *h;
-  int rv;
+  uint64_t rv;
 
   h = (gf_internal_t *) base->scratch;
   if (h->w == 4) {
@@ -584,7 +584,7 @@ uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
   uint32_t mat[32], inv[32], mask;
   int i;
 
-  mask = (w == 32) ? 0xffffffff : (1 << w) - 1;
+  mask = (w == 32) ? 0xffffffff : ((uint32_t)1 << w) - 1;
   for (i = 0; i < w; i++) {
     mat[i] = y;
 
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c
index 190f6b0..b6cfeba 100644
--- a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c
@@ -49,7 +49,7 @@ void
 gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
 int xor)
 {
-    int i;
+    uint32_t i;
     gf_val_128_t s128;
     gf_val_128_t d128;
     uint64_t c128[2];
@@ -87,7 +87,7 @@ void
 gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
 int xor)
 {
-    int i;
+    uint32_t i;
     gf_val_128_t s128;
     gf_val_128_t d128;
     gf_region_data rd;
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c
index ce47849..4e026b2 100644
--- a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c
@@ -1218,7 +1218,7 @@ int gf_w16_split_init(gf_t *gf)
   struct gf_w16_split_8_8_data *d8;
   int i, j, exp, issse3;
   int isneon = 0;
-  uint32_t p, basep;
+  uint32_t p, basep, tmp;
 
   h = (gf_internal_t *) gf->scratch;
 
@@ -1253,7 +1253,8 @@ int gf_w16_split_init(gf_t *gf)
           if (j&1) {
             d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
           } else {
-            d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
+            tmp = d8->tables[exp][i][j>>1];
+            d8->tables[exp][i][j] = GF_MULTBY_TWO(tmp);
           }
         }
       }
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c
index 2e187fd..854a6e4 100644
--- a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c
@@ -50,7 +50,7 @@ void
 gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int 
 xor)
 {
-  int i;
+  uint32_t i;
   uint32_t *s32;
   uint32_t *d32;
    
@@ -75,7 +75,7 @@ void
 gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
 {
 
-  int i;
+  uint32_t i;
   uint32_t *s32;
   uint32_t *d32;
   
@@ -125,7 +125,7 @@ void
 gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
 {
 
-  int i;
+  uint32_t i;
   uint32_t *s32;
   uint32_t *d32;
   
@@ -178,7 +178,7 @@ static
 void
 gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
 {
-  int i;
+  uint32_t i;
   uint32_t *s32;
   uint32_t *d32;
   
@@ -389,7 +389,7 @@ void
 gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
 {
 
-  int i;
+  uint32_t i;
   uint32_t *s32;
   uint32_t *d32;
   
@@ -666,12 +666,12 @@ static
   void
 gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
 {
-  int i;
+  uint32_t i;
   uint32_t j;
 
   shift[0] = 0;
 
-  for (i = 1; i < (1 << h->arg1); i <<= 1) {
+  for (i = 1; i < ((uint32_t)1 << h->arg1); i <<= 1) {
     for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
     if (val & GF_FIRST_BIT) {
       val <<= 1;
@@ -2375,7 +2375,7 @@ int gf_w32_group_init(gf_t *gf)
   uint32_t i, j, p, index;
   struct gf_w32_group_data *gd;
   gf_internal_t *h = (gf_internal_t *) gf->scratch;
-  int g_r, g_s;
+  uint32_t g_r, g_s;
 
   g_s = h->arg1;
   g_r = h->arg2;
@@ -2393,7 +2393,7 @@ int gf_w32_group_init(gf_t *gf)
   gd->tshift = ((gd->tshift-1)/g_r) * g_r;
 
   gd->reduce[0] = 0;
-  for (i = 0; i < (1 << g_r); i++) {
+  for (i = 0; i < ((uint32_t)1 << g_r); i++) {
     p = 0;
     index = 0;
     for (j = 0; j < g_r; j++) {
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c
index 6e75f5e..ba75d8c 100644
--- a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c
@@ -35,7 +35,7 @@ void
 gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
 xor)
 {
-  int i;
+  uint32_t i;
   gf_val_64_t *s64;
   gf_val_64_t *d64;
 
@@ -733,7 +733,7 @@ static
 void
 gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
 {
-  int i;
+  uint64_t i;
   uint64_t j;
   uint64_t one = 1;
   int g_s;
@@ -741,7 +741,7 @@ gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
   g_s = h->arg1;
   shift[0] = 0;
  
-  for (i = 1; i < (1 << g_s); i <<= 1) {
+  for (i = 1; i < ((uint64_t)1 << g_s); i <<= 1) {
     for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
     if (val & (one << 63)) {
       val <<= 1;
@@ -767,7 +767,7 @@ gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
   gd = (struct gf_w64_group_data *) h->private;
   gf_w64_group_set_shift_tables(gd->shift, b, h);
 
-  mask = ((1 << g_s) - 1);
+  mask = (((uint64_t)1 << g_s) - 1);
   top = 0;
   bot = gd->shift[a&mask];
   a >>= g_s; 
@@ -791,7 +791,7 @@ gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
      
   lshift = ((lshift-1) / g_r) * g_r;
   rshift = 64 - lshift;
-  mask = (1 << g_r) - 1;
+  mask = ((uint64_t)1 << g_r) - 1;
   while (lshift >= 0) {
     tp = gd->reduce[(top >> lshift) & mask];
     top ^= (tp >> rshift);
@@ -840,8 +840,8 @@ void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t v
   d64 = (uint64_t *) rd.d_start;
   dtop = (uint64_t *) rd.d_top;
 
-  smask = (1 << g_s) - 1;
-  rmask = (1 << g_r) - 1;
+  smask = ((uint64_t)1 << g_s) - 1;
+  rmask = ((uint64_t)1 << g_r) - 1;
 
   while (d64 < dtop) {
     a64 = *s64;
@@ -984,7 +984,7 @@ int gf_w64_group_init(gf_t *gf)
   uint64_t i, j, p, index;
   struct gf_w64_group_data *gd;
   gf_internal_t *h = (gf_internal_t *) gf->scratch;
-  int g_r, g_s;
+  uint64_t g_r, g_s;
 
   g_s = h->arg1;
   g_r = h->arg2;
@@ -994,7 +994,7 @@ int gf_w64_group_init(gf_t *gf)
   gd->reduce = gd->shift + (1 << g_s);
 
   gd->reduce[0] = 0;
-  for (i = 0; i < (1 << g_r); i++) {
+  for (i = 0; i < ((uint64_t)1 << g_r); i++) {
     p = 0;
     index = 0;
     for (j = 0; j < g_r; j++) {
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
index 06f7993..ebc50a5 100644
--- a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
@@ -166,10 +166,10 @@ gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
 
   product = 0;
 
-  for (i = 0; i < h->w; i++) {
+  for (i = 0; i < (uint64_t)h->w; i++) {
     if (a & (one << i)) product ^= (b << i);
   }
-  for (i = h->w*2-1; i >= h->w; i--) {
+  for (i = h->w*2-1; i >= (uint64_t)h->w; i--) {
     if (product & (one << i)) product ^= (pp << (i-h->w));
   }
   return product;
@@ -256,7 +256,7 @@ static
 void
 gf_wgen_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
 {
-  int i;
+  uint32_t i;
   uint32_t j;
   int g_s;
 
@@ -268,7 +268,7 @@ gf_wgen_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
 
   shift[0] = 0;
 
-  for (i = 1; i < (1 << g_s); i <<= 1) {
+  for (i = 1; i < ((uint32_t)1 << g_s); i <<= 1) {
     for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
     if (val & (1 << (h->w-1))) {
       val <<= 1;
@@ -417,7 +417,7 @@ int gf_wgen_group_init(gf_t *gf)
   uint32_t i, j, p, index;
   struct gf_wgen_group_data *gd;
   gf_internal_t *h = (gf_internal_t *) gf->scratch;
-  int g_s, g_r;
+  uint32_t g_s, g_r;
 
   if (h->mult_type == GF_MULT_DEFAULT) {
     g_s = 2;
@@ -440,7 +440,7 @@ int gf_wgen_group_init(gf_t *gf)
   gd->tshift = ((gd->tshift-1)/g_r) * g_r;
 
   gd->reduce[0] = 0;
-  for (i = 0; i < (1 << g_r); i++) {
+  for (i = 0; i < ((uint32_t)1 << g_r); i++) {
     p = 0;
     index = 0;
     for (j = 0; j < g_r; j++) {
@@ -504,15 +504,15 @@ int gf_wgen_table_8_init(gf_t *gf)
   std->mult = &(std->base);
   std->div = std->mult + ((1<<h->w)*(1<<h->w));
   
-  for (a = 0; a < (1 << w); a++) {
+  for (a = 0; a < ((uint32_t)1 << w); a++) {
     std->mult[a] = 0;
     std->mult[a<<w] = 0;
     std->div[a] = 0;
     std->div[a<<w] = 0;
   }
     
-  for (a = 1; a < (1 << w); a++) {
-    for (b = 1; b < (1 << w); b++) {
+  for (a = 1; a < ((uint32_t)1 << w); a++) {
+    for (b = 1; b < ((uint32_t)1 << w); b++) {
       p = gf_wgen_shift_multiply(gf, a, b);
       std->mult[(a<<w)|b] = p;
       std->div[(p<<w)|a] = b;
@@ -565,15 +565,15 @@ int gf_wgen_table_16_init(gf_t *gf)
   std->mult = &(std->base);
   std->div = std->mult + ((1<<h->w)*(1<<h->w));
   
-  for (a = 0; a < (1 << w); a++) {
+  for (a = 0; a < ((uint32_t)1 << w); a++) {
     std->mult[a] = 0;
     std->mult[a<<w] = 0;
     std->div[a] = 0;
     std->div[a<<w] = 0;
   }
   
-  for (a = 1; a < (1 << w); a++) {
-    for (b = 1; b < (1 << w); b++) {
+  for (a = 1; a < ((uint32_t)1 << w); a++) {
+    for (b = 1; b < ((uint32_t)1 << w); b++) {
       p = gf_wgen_shift_multiply(gf, a, b);
       std->mult[(a<<w)|b] = p;
       std->div[(p<<w)|a] = b;
@@ -649,11 +649,11 @@ int gf_wgen_log_8_init(gf_t *gf)
   std->anti = std->log + (1<<h->w);
   std->danti = std->anti + (1<<h->w)-1;
   
-  for (i = 0; i < (1 << w); i++)
+  for (i = 0; i < ((uint32_t)1 << w); i++)
     std->log[i] = 0;
 
   a = 1;
-  for(i=0; i < (1<<w)-1; i++)
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
   {
     if (std->log[a] != 0) check = 1;
     std->log[a] = i;
@@ -724,11 +724,11 @@ int gf_wgen_log_16_init(gf_t *gf)
   std->anti = std->log + (1<<h->w);
   std->danti = std->anti + (1<<h->w)-1;
  
-  for (i = 0; i < (1 << w); i++)
+  for (i = 0; i < ((uint32_t)1 << w); i++)
     std->log[i] = 0;
 
   a = 1;
-  for(i=0; i < (1<<w)-1; i++)
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
   {
     if (std->log[a] != 0) check = 1;
     std->log[a] = i;
@@ -800,11 +800,11 @@ int gf_wgen_log_32_init(gf_t *gf)
   std->anti = std->log + (1<<h->w);
   std->danti = std->anti + (1<<h->w)-1;
   
-  for (i = 0; i < (1 << w); i++)
+  for (i = 0; i < ((uint32_t)1 << w); i++)
     std->log[i] = 0;
 
   a = 1;
-  for(i=0; i < (1<<w)-1; i++)
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
   {
     if (std->log[a] != 0) check = 1;
     std->log[a] = i;
diff --git a/src/erasure-code/lrc/ErasureCodeLrc.cc b/src/erasure-code/lrc/ErasureCodeLrc.cc
index 2f90ea8..2a17ef8 100644
--- a/src/erasure-code/lrc/ErasureCodeLrc.cc
+++ b/src/erasure-code/lrc/ErasureCodeLrc.cc
@@ -98,15 +98,15 @@ int ErasureCodeLrc::create_ruleset(const string &name,
   return ruleset;
 }
 
-int ErasureCodeLrc::layers_description(const map<string,string> &parameters,
+int ErasureCodeLrc::layers_description(const ErasureCodeProfile &profile,
 				       json_spirit::mArray *description,
 				       ostream *ss) const
 {
-  if (parameters.count("layers") == 0) {
-    *ss << "could not find 'layers' in " << parameters << std::endl;
+  if (profile.count("layers") == 0) {
+    *ss << "could not find 'layers' in " << profile << std::endl;
     return ERROR_LRC_DESCRIPTION;
   }
-  string str = parameters.find("layers")->second;
+  string str = profile.find("layers")->second;
   try {
     json_spirit::mValue json;
     json_spirit::read_or_throw(str, json);
@@ -145,7 +145,7 @@ int ErasureCodeLrc::layers_parse(string description_string,
       return ERROR_LRC_ARRAY;
     }
     json_spirit::mArray layer_json = i->get_array();
-    map<string, string> parameters;
+    ErasureCodeProfile profile;
     int index = 0;
     for (vector<json_spirit::mValue>::iterator j = layer_json.begin();
 	 j != layer_json.end();
@@ -177,7 +177,7 @@ int ErasureCodeLrc::layers_parse(string description_string,
 	  return ERROR_LRC_CONFIG_OPTIONS;
 	}
 	if (j->type() == json_spirit::str_type) {
-	  int err = get_json_str_map(j->get_str(), *ss, &layer.parameters);
+	  int err = get_json_str_map(j->get_str(), *ss, &layer.profile);
 	  if (err)
 	    return err;
 	} else if (j->type() == json_spirit::obj_type) {
@@ -186,7 +186,7 @@ int ErasureCodeLrc::layers_parse(string description_string,
 	  for (map<string, json_spirit::mValue>::iterator i = o.begin();
 	       i != o.end();
 	       ++i) {
-	    layer.parameters[i->first] = i->second.get_str();
+	    layer.profile[i->first] = i->second.get_str();
 	  }
 	}
       } else {
@@ -197,7 +197,7 @@ int ErasureCodeLrc::layers_parse(string description_string,
   return 0;
 }
 
-int ErasureCodeLrc::layers_init()
+int ErasureCodeLrc::layers_init(ostream *ss)
 {
   ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
   for (unsigned int i = 0; i < layers.size(); i++) {
@@ -217,25 +217,21 @@ int ErasureCodeLrc::layers_init()
     layer.chunks = layer.data;
     layer.chunks.insert(layer.chunks.end(),
 			layer.coding.begin(), layer.coding.end());
-    if (layer.parameters.find("k") == layer.parameters.end())
-      layer.parameters["k"] = stringify(layer.data.size());
-    if (layer.parameters.find("m") == layer.parameters.end())
-      layer.parameters["m"] = stringify(layer.coding.size());
-    if (layer.parameters.find("plugin") == layer.parameters.end())
-      layer.parameters["plugin"] = "jerasure";
-    if (layer.parameters.find("technique") == layer.parameters.end())
-      layer.parameters["technique"] = "reed_sol_van";
-    if (layer.parameters.find("directory") == layer.parameters.end())
-      layer.parameters["directory"] = directory;
-    stringstream ss;
-    int err = registry.factory(layer.parameters["plugin"],
-			       layer.parameters,
+    if (layer.profile.find("k") == layer.profile.end())
+      layer.profile["k"] = stringify(layer.data.size());
+    if (layer.profile.find("m") == layer.profile.end())
+      layer.profile["m"] = stringify(layer.coding.size());
+    if (layer.profile.find("plugin") == layer.profile.end())
+      layer.profile["plugin"] = "jerasure";
+    if (layer.profile.find("technique") == layer.profile.end())
+      layer.profile["technique"] = "reed_sol_van";
+    int err = registry.factory(layer.profile["plugin"],
+			       directory,
+			       layer.profile,
 			       &layer.erasure_code,
 			       ss);
-    if (err) {
-      derr << ss.str() << dendl;
+    if (err)
       return err;
-    }
   }
   return 0;
 }
@@ -269,36 +265,35 @@ int ErasureCodeLrc::layers_sanity_checks(string description_string,
   return 0;
 }
 
-int ErasureCodeLrc::parse(const map<string,string> &parameters,
+int ErasureCodeLrc::parse(ErasureCodeProfile &profile,
 			  ostream *ss)
 {
-  int r = ErasureCode::parse(parameters, ss);
+  int r = ErasureCode::parse(profile, ss);
   if (r)
     return r;
 
-  if (parameters.count("directory") != 0)
-    directory = parameters.find("directory")->second;
-
-  return parse_ruleset(parameters, ss);
+  return parse_ruleset(profile, ss);
 }
 
-int ErasureCodeLrc::parse_kml(map<string,string> &parameters,
+const string ErasureCodeLrc::DEFAULT_KML("-1");
+
+int ErasureCodeLrc::parse_kml(ErasureCodeProfile &profile,
 			      ostream *ss)
 {
-  int err = ErasureCode::parse(parameters, ss);
-  const int DEFAULT = -1;
+  int err = ErasureCode::parse(profile, ss);
+  const int DEFAULT_INT = -1;
   int k, m, l;
-  err |= to_int("k", parameters, &k, DEFAULT, ss);
-  err |= to_int("m", parameters, &m, DEFAULT, ss);
-  err |= to_int("l", parameters, &l, DEFAULT, ss);
+  err |= to_int("k", profile, &k, DEFAULT_KML, ss);
+  err |= to_int("m", profile, &m, DEFAULT_KML, ss);
+  err |= to_int("l", profile, &l, DEFAULT_KML, ss);
 
-  if (k == DEFAULT && m == DEFAULT && l == DEFAULT)
-    return 0;
+  if (k == DEFAULT_INT && m == DEFAULT_INT && l == DEFAULT_INT)
+    return err;
 
-  if ((k != DEFAULT || m != DEFAULT || l != DEFAULT) &&
-      (k == DEFAULT || m == DEFAULT || l == DEFAULT)) {
+  if ((k != DEFAULT_INT || m != DEFAULT_INT || l != DEFAULT_INT) &&
+      (k == DEFAULT_INT || m == DEFAULT_INT || l == DEFAULT_INT)) {
     *ss << "All of k, m, l must be set or none of them in "
-	<< parameters << std::endl;
+	<< profile << std::endl;
     return ERROR_LRC_ALL_OR_NOTHING;
   }
 
@@ -307,16 +302,16 @@ int ErasureCodeLrc::parse_kml(map<string,string> &parameters,
 			      "ruleset-steps" };
 
   for (int i = 0; i < 3; i++) {
-    if (parameters.count(generated[i])) {
+    if (profile.count(generated[i])) {
       *ss << "The " << generated[i] << " parameter cannot be set "
-	  << "when k, m, l are set in " << parameters << std::endl;
+	  << "when k, m, l are set in " << profile << std::endl;
       return ERROR_LRC_GENERATED;
     }
   }
 
   if ((k + m) % l) {
     *ss << "k + m must be a multiple of l in "
-	<< parameters << std::endl;
+	<< profile << std::endl;
     return ERROR_LRC_K_M_MODULO;
   }
 
@@ -324,13 +319,13 @@ int ErasureCodeLrc::parse_kml(map<string,string> &parameters,
 
   if (k % local_group_count) {
     *ss << "k must be a multiple of (k + m) / l in "
-	<< parameters << std::endl;
+	<< profile << std::endl;
     return ERROR_LRC_K_MODULO;
   }
 
   if (m % local_group_count) {
     *ss << "m must be a multiple of (k + m) / l in "
-	<< parameters << std::endl;
+	<< profile << std::endl;
     return ERROR_LRC_M_MODULO;
   }
 
@@ -339,7 +334,7 @@ int ErasureCodeLrc::parse_kml(map<string,string> &parameters,
     mapping += string(k / local_group_count, 'D') +
       string(m / local_group_count, '_') + "_";
   }
-  parameters["mapping"] = mapping;
+  profile["mapping"] = mapping;
 
   string layers = "[ ";
 
@@ -362,16 +357,16 @@ int ErasureCodeLrc::parse_kml(map<string,string> &parameters,
     }
     layers += "\", \"\" ],";
   }
-  parameters["layers"] = layers + "]";
+  profile["layers"] = layers + "]";
 
-  map<string,string>::const_iterator parameter;
+  ErasureCodeProfile::const_iterator parameter;
   string ruleset_locality;
-  parameter = parameters.find("ruleset-locality");
-  if (parameter != parameters.end())
+  parameter = profile.find("ruleset-locality");
+  if (parameter != profile.end())
     ruleset_locality = parameter->second;
   string ruleset_failure_domain = "host";
-  parameter = parameters.find("ruleset-failure-domain");
-  if (parameter != parameters.end())
+  parameter = profile.find("ruleset-failure-domain");
+  if (parameter != profile.end())
     ruleset_failure_domain = parameter->second;
 
   if (ruleset_locality != "") {
@@ -385,20 +380,20 @@ int ErasureCodeLrc::parse_kml(map<string,string> &parameters,
     ruleset_steps.push_back(Step("chooseleaf", ruleset_failure_domain, 0));
   }
 
-  return 0;
+  return err;
 }
 
-int ErasureCodeLrc::parse_ruleset(const map<string,string> &parameters,
+int ErasureCodeLrc::parse_ruleset(ErasureCodeProfile &profile,
 				  ostream *ss)
 {
-  map<string,string>::const_iterator parameter;
-  parameter = parameters.find("ruleset-root");
-  if (parameter != parameters.end())
-    ruleset_root = parameter->second;
+  int err = 0;
+  err |= to_string("ruleset-root", profile,
+		   &ruleset_root,
+		   "default", ss);
 
-  if (parameters.count("ruleset-steps") != 0) {
+  if (profile.count("ruleset-steps") != 0) {
     ruleset_steps.clear();
-    string str = parameters.find("ruleset-steps")->second;
+    string str = profile.find("ruleset-steps")->second;
     json_spirit::mArray description;
     try {
       json_spirit::mValue json;
@@ -479,26 +474,25 @@ int ErasureCodeLrc::parse_ruleset_step(string description_string,
   return 0;
 }
 
-int ErasureCodeLrc::init(const map<string,string> &parameters,
+int ErasureCodeLrc::init(ErasureCodeProfile &profile,
 			 ostream *ss)
 {
   int r;
 
-  map<string,string> parameters_rw = parameters;
-  r = parse_kml(parameters_rw, ss);
+  r = parse_kml(profile, ss);
   if (r)
     return r;
 
-  r = parse(parameters_rw, ss);
+  r = parse(profile, ss);
   if (r)
     return r;
 
   json_spirit::mArray description;
-  r = layers_description(parameters_rw, &description, ss);
+  r = layers_description(profile, &description, ss);
   if (r)
     return r;
 
-  string description_string = parameters_rw.find("layers")->second;
+  string description_string = profile.find("layers")->second;
 
   dout(10) << "init(" << description_string << ")" << dendl;
 
@@ -506,15 +500,15 @@ int ErasureCodeLrc::init(const map<string,string> &parameters,
   if (r)
     return r;
 
-  r = layers_init();
+  r = layers_init(ss);
   if (r)
     return r;
 
-  if (parameters_rw.count("mapping") == 0) {
-    *ss << "the 'mapping' parameter is missing from " << parameters_rw;
+  if (profile.count("mapping") == 0) {
+    *ss << "the 'mapping' profile is missing from " << profile;
     return ERROR_LRC_MAPPING;
   }
-  string mapping = parameters_rw.find("mapping")->second;
+  string mapping = profile.find("mapping")->second;
   data_chunk_count = 0;
   for(std::string::iterator it = mapping.begin(); it != mapping.end(); ++it) {
     if (*it == 'D')
@@ -522,7 +516,22 @@ int ErasureCodeLrc::init(const map<string,string> &parameters,
   }
   chunk_count = mapping.length();
 
-  return layers_sanity_checks(description_string, ss);
+  r = layers_sanity_checks(description_string, ss);
+  if (r)
+    return r;
+
+  //
+  // When initialized with kml, the profile parameters
+  // that were generated should not be stored because
+  // they would otherwise be exposed to the caller.
+  //
+  if (profile.find("l") != profile.end() &&
+      profile.find("l")->second != DEFAULT_KML) {
+    profile.erase("mapping");
+    profile.erase("layers");
+  }
+  ErasureCode::init(profile, ss);
+  return 0;
 }
 
 set<int> ErasureCodeLrc::get_erasures(const set<int> &want,
diff --git a/src/erasure-code/lrc/ErasureCodeLrc.h b/src/erasure-code/lrc/ErasureCodeLrc.h
index 1c73273..ffc7748 100644
--- a/src/erasure-code/lrc/ErasureCodeLrc.h
+++ b/src/erasure-code/lrc/ErasureCodeLrc.h
@@ -46,6 +46,8 @@
 
 class ErasureCodeLrc : public ErasureCode {
 public:
+  static const string DEFAULT_KML;
+
   struct Layer {
     Layer(string _chunks_map) : chunks_map(_chunks_map) { }
     ErasureCodeInterfaceRef erasure_code;
@@ -54,7 +56,7 @@ public:
     vector<int> chunks;
     set<int> chunks_as_set;
     string chunks_map;
-    map<string,string> parameters;
+    ErasureCodeProfile profile;
   };
   vector<Layer> layers;
   string directory;
@@ -72,8 +74,9 @@ public:
   };
   vector<Step> ruleset_steps;
 
-  ErasureCodeLrc() :
-    chunk_count(0), data_chunk_count(0), ruleset_root("default")
+  ErasureCodeLrc(const std::string &dir)
+    : directory(dir),
+      chunk_count(0), data_chunk_count(0), ruleset_root("default")
   {
     ruleset_steps.push_back(Step("chooseleaf", "host", 0));
   }
@@ -108,25 +111,25 @@ public:
 			    const map<int, bufferlist> &chunks,
 			    map<int, bufferlist> *decoded);
 
-  int init(const map<string,string> &parameters, ostream *ss);
+  virtual int init(ErasureCodeProfile &profile, ostream *ss);
 
-  virtual int parse(const map<string,string> &parameters, ostream *ss);
+  virtual int parse(ErasureCodeProfile &profile, ostream *ss);
 
-  int parse_kml(map<string,string> &parameters, ostream *ss);
+  int parse_kml(ErasureCodeProfile &profile, ostream *ss);
 
-  int parse_ruleset(const map<string,string> &parameters, ostream *ss);
+  int parse_ruleset(ErasureCodeProfile &profile, ostream *ss);
 
   int parse_ruleset_step(string description_string,
 			 json_spirit::mArray description,
 			 ostream *ss);
 
-  int layers_description(const map<string,string> &parameters,
+  int layers_description(const ErasureCodeProfile &profile,
 			 json_spirit::mArray *description,
 			 ostream *ss) const;
   int layers_parse(string description_string,
 		   json_spirit::mArray description,
 		   ostream *ss);
-  int layers_init();
+  int layers_init(ostream *ss);
   int layers_sanity_checks(string description_string,
 			   ostream *ss) const;
 };
diff --git a/src/erasure-code/lrc/ErasureCodePluginLrc.cc b/src/erasure-code/lrc/ErasureCodePluginLrc.cc
index dfb680a..5f22cc6 100644
--- a/src/erasure-code/lrc/ErasureCodePluginLrc.cc
+++ b/src/erasure-code/lrc/ErasureCodePluginLrc.cc
@@ -27,22 +27,16 @@
 #undef dout_prefix
 #define dout_prefix _prefix(_dout)
 
-static ostream& _prefix(std::ostream* _dout)
-{
-  return *_dout << "ErasureCodePluginLrc: ";
-}
-
 class ErasureCodePluginLrc : public ErasureCodePlugin {
 public:
-  virtual int factory(const map<std::string,std::string> &parameters,
-		      ErasureCodeInterfaceRef *erasure_code) {
+  virtual int factory(const std::string &directory,
+		      ErasureCodeProfile &profile,
+		      ErasureCodeInterfaceRef *erasure_code,
+		      ostream *ss) {
     ErasureCodeLrc *interface;
-    interface = new ErasureCodeLrc();
-    stringstream ss;
-    assert(parameters.count("directory") != 0);
-    int r = interface->init(parameters, &ss);
+    interface = new ErasureCodeLrc(directory);
+    int r = interface->init(profile, ss);
     if (r) {
-      derr << ss.str() << dendl;
       delete interface;
       return r;
     }
diff --git a/src/erasure-code/shec/ErasureCodePluginSelectShec.cc b/src/erasure-code/shec/ErasureCodePluginSelectShec.cc
new file mode 100644
index 0000000..3a4f74e
--- /dev/null
+++ b/src/erasure-code/shec/ErasureCodePluginSelectShec.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ * Copyright (C) 2014,2015 FUJITSU LIMITED
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ * Author: Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
+ * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
+ * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "ceph_ver.h"
+#include "common/debug.h"
+#include "arch/probe.h"
+#include "arch/intel.h"
+#include "arch/arm.h"
+#include "erasure-code/ErasureCodePlugin.h"
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static ostream& _prefix(std::ostream* _dout)
+{
+  return *_dout << "ErasureCodePluginSelectShec: ";
+}
+
+static string get_variant() {
+  ceph_arch_probe();
+
+  if (ceph_arch_intel_pclmul &&
+      ceph_arch_intel_sse42 &&
+      ceph_arch_intel_sse41 &&
+      ceph_arch_intel_ssse3 &&
+      ceph_arch_intel_sse3 &&
+      ceph_arch_intel_sse2) {
+    return "sse4";
+  } else if (ceph_arch_intel_ssse3 &&
+	     ceph_arch_intel_sse3 &&
+	     ceph_arch_intel_sse2) {
+    return "sse3";
+  } else if (ceph_arch_neon) {
+    return "neon";
+  } else {
+    return "generic";
+  }
+}
+
+class ErasureCodePluginSelectShec : public ErasureCodePlugin {
+public:
+  virtual int factory(const std::string &directory,
+		      ErasureCodeProfile &profile,
+		      ErasureCodeInterfaceRef *erasure_code,
+		      ostream *ss) {
+    ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+    int ret;
+    string name = "shec";
+    if (profile.count("shec-name"))
+      name = profile.find("shec-name")->second;
+    if (profile.count("shec-variant")) {
+      dout(10) << "shec-variant "
+	       << profile.find("shec-variant")->second << dendl;
+      ret = instance.factory(name + "_" + profile.find("shec-variant")->second,
+			     directory,
+			     profile, erasure_code, ss);
+    } else {
+      string variant = get_variant();
+      dout(10) << variant << " plugin" << dendl;
+      ret = instance.factory(name + "_" + variant, directory,
+			     profile, erasure_code, ss);
+    }
+    return ret;
+  }
+};
+
+const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
+
+int __erasure_code_init(char *plugin_name, char *directory)
+{
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  string variant = get_variant();
+  ErasureCodePlugin *plugin;
+  stringstream ss;
+  int r = instance.load(plugin_name + string("_") + variant,
+			directory, &plugin, &ss);
+  if (r) {
+    derr << ss.str() << dendl;
+    return r;
+  }
+  dout(10) << ss.str() << dendl;
+  return instance.add(plugin_name, new ErasureCodePluginSelectShec());
+}
diff --git a/src/erasure-code/shec/ErasureCodePluginShec.cc b/src/erasure-code/shec/ErasureCodePluginShec.cc
index 75a64a7..d2b72f5 100644
--- a/src/erasure-code/shec/ErasureCodePluginShec.cc
+++ b/src/erasure-code/shec/ErasureCodePluginShec.cc
@@ -37,29 +37,30 @@ class ErasureCodePluginShec : public ErasureCodePlugin {
 public:
   ErasureCodeShecTableCache tcache;
 
-  virtual int factory(const map<std::string,std::string> &parameters,
-		      ErasureCodeInterfaceRef *erasure_code) {
+  virtual int factory(const std::string &directory,
+		      ErasureCodeProfile &profile,
+		      ErasureCodeInterfaceRef *erasure_code,
+		      ostream *ss) {
     ErasureCodeShec *interface;
-    std::string t = "multiple";
 
-    if (parameters.find("technique") != parameters.end()){
-      t = parameters.find("technique")->second;
-    }
+    if (profile.find("technique") == profile.end())
+      profile["technique"] = "multiple";
+    std::string t = profile.find("technique")->second;
 
     if (t == "single"){
       interface = new ErasureCodeShecReedSolomonVandermonde(tcache, ErasureCodeShec::SINGLE);
     } else if (t == "multiple"){
       interface = new ErasureCodeShecReedSolomonVandermonde(tcache, ErasureCodeShec::MULTIPLE);
     } else {
-      derr << "technique=" << t << " is not a valid coding technique. "
-	   << " Choose one of the following: "
-	   << "single, multiple"
-	   << dendl;
+      *ss << "technique=" << t << " is not a valid coding technique. "
+	  << "Choose one of the following: "
+	  << "single, multiple ";
       return -ENOENT;
     }
-    int err = interface->init(parameters);
-    if (err) {
-      return err;
+    int r = interface->init(profile, ss);
+    if (r) {
+      delete interface;
+      return r;
     }
     *erasure_code = ErasureCodeInterfaceRef(interface);
 
diff --git a/src/erasure-code/shec/ErasureCodeShec.cc b/src/erasure-code/shec/ErasureCodeShec.cc
index b0437a5..2180328 100644
--- a/src/erasure-code/shec/ErasureCodeShec.cc
+++ b/src/erasure-code/shec/ErasureCodeShec.cc
@@ -18,24 +18,27 @@
  *
  */
 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include <errno.h>
 #include <algorithm>
 #include "common/debug.h"
 #include "ErasureCodeShec.h"
 #include "crush/CrushWrapper.h"
 #include "osd/osd_types.h"
-#include "shec.h"
 extern "C" {
 #include "jerasure/include/jerasure.h"
 #include "jerasure/include/galois.h"
+
+extern int calc_determinant(int *matrix, int dim);
+extern int* reed_sol_vandermonde_coding_matrix(int k, int m, int w);
 }
 
 #define dout_subsys ceph_subsys_osd
 #undef dout_prefix
 #define dout_prefix _prefix(_dout)
 
-#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
-
 static ostream& _prefix(std::ostream* _dout)
 {
   return *_dout << "ErasureCodeShec: ";
@@ -55,22 +58,22 @@ int ErasureCodeShec::create_ruleset(const string &name,
   }
 }
 
-int ErasureCodeShec::init(const map<std::string,std::string> &parameters)
+int ErasureCodeShec::init(ErasureCodeProfile &profile,
+			  ostream *ss)
 {
-  dout(10) << "technique=" << technique << dendl;
-  map<string,string>::const_iterator parameter;
-  parameter = parameters.find("ruleset-root");
-  if (parameter != parameters.end())
-    ruleset_root = parameter->second;
-  parameter = parameters.find("ruleset-failure-domain");
-  if (parameter != parameters.end())
-    ruleset_failure_domain = parameter->second;
-  int err = parse(parameters);
-  if (err) {
+  int err = 0;
+  err |= to_string("ruleset-root", profile,
+		   &ruleset_root,
+		   DEFAULT_RULESET_ROOT, ss);
+  err |= to_string("ruleset-failure-domain", profile,
+		   &ruleset_failure_domain,
+		   DEFAULT_RULESET_FAILURE_DOMAIN, ss);
+  err |= parse(profile);
+  if (err)
     return err;
-  }
   prepare();
-  return 0;
+  ErasureCode::init(profile, ss);
+  return err;
 }
 
 unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size) const
@@ -83,51 +86,61 @@ unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size) const
   return padded_length / k;
 }
 
-int ErasureCodeShec::minimum_to_decode(const set<int> &want_to_decode,
+int ErasureCodeShec::minimum_to_decode(const set<int> &want_to_read,
 				       const set<int> &available_chunks,
 				       set<int> *minimum_chunks)
 {
-  int erased[k + m];
+  if (!minimum_chunks) return -EINVAL;
+
+  for (set<int>::iterator it = available_chunks.begin(); it != available_chunks.end(); ++it){
+    if (*it < 0 || k+m <= *it) return -EINVAL;
+  }
+
+  for (set<int>::iterator it = want_to_read.begin(); it != want_to_read.end(); ++it){
+    if (*it < 0 || k+m <= *it) return -EINVAL;
+  }
+
+  int want[k + m];
   int avails[k + m];
   int minimum[k + m];
-  int dm_ids[k];
 
-  if (!minimum_chunks) return -EINVAL;
+  memset(want, 0, sizeof(want));
+  memset(avails, 0, sizeof(avails));
+  memset(minimum, 0, sizeof(minimum));
+  (*minimum_chunks).clear();
 
-  for (set<int>::iterator it = available_chunks.begin(); it != available_chunks.end(); it++){
-    if (*it < 0 || k+m <= *it) return -EINVAL;
+  for (set<int>::const_iterator i = want_to_read.begin();
+       i != want_to_read.end();
+       ++i) {
+    want[*i] = 1;
   }
 
-  if (includes(available_chunks.begin(), available_chunks.end(),
-	       want_to_decode.begin(), want_to_decode.end())) {
-    *minimum_chunks = want_to_decode;
-  } else {
-    for (int i = 0; i < k + m; i++) {
-      erased[i] = 0;
-      if (available_chunks.find(i) == available_chunks.end()) {
-	if (want_to_decode.count(i) > 0) {
-	  erased[i] = 1;
-	}
-	avails[i] = 0;
-      } else {
-	avails[i] = 1;
-      }
-    }
+  for (set<int>::const_iterator i = available_chunks.begin();
+       i != available_chunks.end();
+       ++i) {
+    avails[*i] = 1;
+  }
 
-    if (shec_make_decoding_matrix(true, k, m, w, matrix, erased,
-				  avails, 0, dm_ids, minimum) < 0) {
+  {
+    int decoding_matrix[k*k];
+    int dm_row[k];
+    int dm_column[k];
+    memset(decoding_matrix, 0, sizeof(decoding_matrix));
+    memset(dm_row, 0, sizeof(dm_row));
+    memset(dm_column, 0, sizeof(dm_column));
+    if (shec_make_decoding_matrix(true, want, avails, decoding_matrix, dm_row, dm_column, minimum) < 0) {
       return -EIO;
     }
+  }
 
-    for (int i = 0; i < k + m; i++) {
-      if (minimum[i] == 1) minimum_chunks->insert(i);
-    }
+  for (int i = 0; i < k + m; i++) {
+    if (minimum[i] == 1) minimum_chunks->insert(i);
   }
 
   return 0;
 }
 
-int ErasureCodeShec::minimum_to_decode_with_cost(const set<int> &want_to_decode,
+int ErasureCodeShec::minimum_to_decode_with_cost(const set<int> &want_to_read,
 						 const map<int, int> &available,
 						 set<int> *minimum_chunks)
 {
@@ -138,7 +151,7 @@ int ErasureCodeShec::minimum_to_decode_with_cost(const set<int> &want_to_decode,
        ++i)
     available_chunks.insert(i->first);
 
-  return minimum_to_decode(want_to_decode, available_chunks, minimum_chunks);
+  return minimum_to_decode(want_to_read, available_chunks, minimum_chunks);
 }
 
 int ErasureCodeShec::encode(const set<int> &want_to_encode,
@@ -235,7 +248,6 @@ int ErasureCodeShec::decode_chunks(const set<int> &want_to_read,
       }
       avails[i] = 0;
     } else {
-      (*decoded)[i] = chunks.find(i)->second;
       avails[i] = 1;
     }
     if (i < k)
@@ -268,8 +280,7 @@ int ErasureCodeShecReedSolomonVandermonde::shec_decode(int *erased,
 					    char **coding,
 					    int blocksize)
 {
-  return shec_matrix_decode(k, m, w, matrix,
-			    erased, avails, data, coding, blocksize);
+  return shec_matrix_decode(erased, avails, data, coding, blocksize);
 }
 
 unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const
@@ -277,26 +288,26 @@ unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const
   return k*w*sizeof(int);
 }
 
-int ErasureCodeShecReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters)
+int ErasureCodeShecReedSolomonVandermonde::parse(const ErasureCodeProfile &profile)
 {
   int err = 0;
   // k, m, c
-  if (parameters.find("k") == parameters.end() &&
-      parameters.find("m") == parameters.end() &&
-      parameters.find("c") == parameters.end()){
+  if (profile.find("k") == profile.end() &&
+      profile.find("m") == profile.end() &&
+      profile.find("c") == profile.end()){
     dout(10) << "(k, m, c) default to " << "(" << DEFAULT_K
 	     << ", " << DEFAULT_M << ", " << DEFAULT_C << ")" << dendl;
     k = DEFAULT_K; m = DEFAULT_M; c = DEFAULT_C;
-  } else if (parameters.find("k") == parameters.end() ||
-	     parameters.find("m") == parameters.end() ||
-	     parameters.find("c") == parameters.end()){
+  } else if (profile.find("k") == profile.end() ||
+	     profile.find("m") == profile.end() ||
+	     profile.find("c") == profile.end()){
     dout(10) << "(k, m, c) must be choosed" << dendl;
     err = -EINVAL;
   } else {
     std::string err_k, err_m, err_c, value_k, value_m, value_c;
-    value_k = parameters.find("k")->second;
-    value_m = parameters.find("m")->second;
-    value_c = parameters.find("c")->second;
+    value_k = profile.find("k")->second;
+    value_m = profile.find("m")->second;
+    value_c = profile.find("c")->second;
     k = strict_strtol(value_k.c_str(), 10, &err_k);
     m = strict_strtol(value_m.c_str(), 10, &err_m);
     c = strict_strtol(value_c.c_str(), 10, &err_c);
@@ -351,12 +362,12 @@ int ErasureCodeShecReedSolomonVandermonde::parse(const map<std::string,std::stri
 	   << c << ")"<< dendl;
 
   // w
-  if (parameters.find("w") == parameters.end()){
+  if (profile.find("w") == profile.end()){
     dout(10) << "w default to " << DEFAULT_W << dendl;
     w = DEFAULT_W;
   } else {
     std::string err_w, value_w;
-    value_w = parameters.find("w")->second;
+    value_w = profile.find("w")->second;
     w = strict_strtol(value_w.c_str(), 10, &err_w);
 
     if (!err_w.empty()){
@@ -387,12 +398,26 @@ void ErasureCodeShecReedSolomonVandermonde::prepare()
     dout(10) << "[ cache tables ] creating coeff for k=" <<
       k << " m=" << m << " c=" << c << " w=" << w << dendl;
 
-    matrix = shec_reedsolomon_coding_matrix(k, m, c, w, technique);
+    matrix = shec_reedsolomon_coding_matrix(technique);
 
     // either our new created table is stored or if it has been
     // created in the meanwhile the locally allocated table will be
     // freed by setEncodingTable
     matrix = tcache.setEncodingTable(technique, k, m, c, w, matrix);
+
+    dout(10) << "matrix = " << dendl;
+    for (int i=0; i<m; i++) {
+      char mat[k+1];
+      for (int j=0; j<k; j++) {
+        if (matrix[i*k+j] > 0) {
+          mat[j] = '1';
+        } else {
+          mat[j] = '0';
+        }
+      }
+      mat[k] = '\0';
+      dout(10) << mat << dendl;
+    }
   } else {
     matrix = *p_enc_table;
   }
@@ -403,3 +428,396 @@ void ErasureCodeShecReedSolomonVandermonde::prepare()
   assert((technique == SINGLE) || (technique == MULTIPLE));
 
 }
+
+// ErasureCodeShec::
+// Mearged from shec.cc.
+
+double ErasureCodeShec::shec_calc_recovery_efficiency1(int k, int m1, int m2, int c1, int c2){
+  int r_eff_k[k];
+  double r_e1;
+  int i, rr, cc, start, end;
+  int first_flag;
+
+  if (m1 < c1 || m2 < c2) return -1;
+  if ((m1 == 0 && c1 != 0) || (m2 == 0 && c2 != 0)) return -1;
+
+  for (i=0; i<k; i++) r_eff_k[i] = 100000000;
+  r_e1 = 0;
+
+  for (rr=0; rr<m1; rr++){
+    start = ((rr*k)/m1) % k;
+    end = (((rr+c1)*k)/m1) % k;
+    for (cc=start, first_flag=1; first_flag || cc!=end; cc=(cc+1)%k){
+      first_flag = 0;
+      r_eff_k[cc] = std::min(r_eff_k[cc], ((rr+c1)*k)/m1 - (rr*k)/m1);
+    }
+    r_e1 += ((rr+c1)*k)/m1 - (rr*k)/m1;
+  }
+
+  for (rr=0; rr<m2; rr++){
+    start = ((rr*k)/m2) % k;
+    end = (((rr+c2)*k)/m2) % k;
+    for (cc=start, first_flag=1; first_flag || cc!=end; cc=(cc+1)%k){
+      first_flag = 0;
+      r_eff_k[cc] = std::min(r_eff_k[cc], ((rr+c2)*k)/m2 - (rr*k)/m2);
+    }
+    r_e1 += ((rr+c2)*k)/m2 - (rr*k)/m2;
+  }
+
+  for (i=0; i<k; i++){
+    r_e1 += r_eff_k[i];
+  }
+
+  r_e1 /= (k+m1+m2);
+
+  return r_e1;
+}
+
+int* ErasureCodeShec::shec_reedsolomon_coding_matrix(int is_single)
+{
+  int *matrix;
+  int rr, cc, start, end;
+  int m1, m2, c1, c2;
+
+  if (w != 8 && w != 16 && w != 32) return NULL;
+
+  if (!is_single){
+    int c1_best = -1, m1_best = -1;
+    double min_r_e1 = 100.0;
+
+    // create all multiple shec pattern and choose best.
+
+    for (c1=0; c1 <= c/2; c1++){
+      for (m1=0; m1 <= m; m1++){
+        c2 = c-c1;
+        m2 = m-m1;
+
+        if (m1 < c1 || m2 < c2) continue;
+        if ((m1 == 0 && c1 != 0) || (m2 == 0 && c2 != 0)) continue;
+        if ((m1 != 0 && c1 == 0) || (m2 != 0 && c2 == 0)) continue;
+
+        // minimize r_e1
+
+        if (true) {
+          double r_e1;
+          r_e1 = shec_calc_recovery_efficiency1(k, m1, m2, c1, c2);
+          if (min_r_e1 - r_e1 > std::numeric_limits<double>::epsilon() &&
+	      r_e1 < min_r_e1) {
+            min_r_e1 = r_e1;
+            c1_best = c1;
+            m1_best = m1;
+          }
+        }
+      }
+    }
+    m1 = m1_best;
+    c1 = c1_best;
+    m2 = m - m1_best;
+    c2 = c - c1_best;
+  } else {
+    m1 = 0;
+    c1 = 0;
+    m2 = m;
+    c2 = c;
+  }
+
+  // create matrix
+  matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
+
+  for (rr=0; rr<m1; rr++){
+    end = ((rr*k)/m1) % k;
+    start = (((rr+c1)*k)/m1) % k;
+    for (cc=start; cc!=end; cc=(cc+1)%k){
+      matrix[cc + rr*k] = 0;
+    }
+  }
+
+  for (rr=0; rr<m2; rr++){
+    end = ((rr*k)/m2) % k;
+    start = (((rr+c2)*k)/m2) % k;
+    for (cc=start; cc!=end; cc=(cc+1)%k){
+      matrix[cc + (rr+m1)*k] = 0;
+    }
+  }
+
+  return matrix;
+}
+
+int ErasureCodeShec::shec_make_decoding_matrix(bool prepare, int *want_, int *avails,
+                                               int *decoding_matrix, int *dm_row, int *dm_column,
+                                               int *minimum)
+{
+  int mindup = k+1, minp = k+1;
+  int want[k + m];
+  for (int i = 0; i < k + m; ++i) {
+    want[i] = want_[i];
+  }
+
+  for (int i = 0; i < m; ++i) {
+    if (want[i + k] && !avails[i + k]) {
+      for (int j=0; j < k; ++j) {
+        if (matrix[i * k + j] > 0) {
+          want[j] = 1;
+        }
+      }
+    }
+  }
+
+  if (tcache.getDecodingTableFromCache(decoding_matrix,
+                                       dm_row, dm_column, minimum,
+                                       technique,
+                                       k, m, c, w,
+                                       want, avails)) {
+    return 0;
+  }
+
+  for (unsigned long long pp = 0; pp < (1ull << m); ++pp) {
+
+    // select parity chunks
+    int ek = 0;
+    int p[m];
+    for (int i=0; i < m; ++i) {
+      if (pp & (1ull << i)) {
+        p[ek++] = i;
+      }
+    }
+    if (ek > minp) {
+      continue;
+    }
+
+    // Are selected parity chunks avail?
+    bool ok = true;
+    for (int i = 0; i < ek && ok; i++) {
+      if (!avails[k+p[i]]) {
+        ok = false;
+        break;
+      }
+    }
+
+    if (!ok) {
+      continue;
+    }
+
+    int tmprow[k + m];
+    int tmpcolumn[k];
+    for (int i = 0; i < k + m; i++) {
+      tmprow[i] = 0;
+    }
+    for (int i = 0; i < k; i++) {
+      tmpcolumn[i] = 0;
+    }
+
+    for (int i=0; i < k; i++) {
+      if (want[i] && !avails[i]) {
+        tmpcolumn[i] = 1;
+      }
+    }
+
+    // Parity chunks which are used to recovery erased data chunks, are added to tmprow.
+    for (int i = 0; i < ek; i++) {
+      tmprow[k + p[i]] = 1;
+      for (int j = 0; j < k; j++) {
+        int element = matrix[(p[i]) * k + j];
+        if (element != 0) {
+          tmpcolumn[j] = 1;
+        }
+        if (element != 0 && avails[j] == 1) {
+          tmprow[j] = 1;
+        }
+      }
+    }
+
+    int dup_row = 0, dup_column = 0, dup = 0;
+    for (int i = 0; i < k + m; i++) {
+      if (tmprow[i]) {
+        dup_row++;
+      }
+    }
+
+    for (int i = 0; i < k; i++) {
+      if (tmpcolumn[i]) {
+        dup_column++;
+      }
+    }
+
+    if (dup_row != dup_column) {
+      continue;
+    }
+    dup = dup_row;
+    if (dup == 0) {
+      mindup = dup;
+      for (int i = 0; i < k; i++) {
+        dm_row[i] = -1;
+      }
+      for (int i = 0; i < k; i++) {
+        dm_column[i] = -1;
+      }
+      break;
+    }
+
+    // minimum is updated.
+    if (dup < mindup) {
+      int tmpmat[dup * dup];
+      {
+        for (int i = 0, row = 0; i < k + m; i++) {
+          if (tmprow[i]) {
+            for (int j = 0, column = 0; j < k; j++) {
+              if (tmpcolumn[j]) {
+                if (i < k) {
+                  tmpmat[row * dup + column] = (i == j ? 1 : 0);
+                } else {
+                  tmpmat[row * dup + column] = matrix[(i - k) * k + j];
+                }
+                column++;
+              }
+            }
+            row++;
+          }
+        }
+      }
+      int det = calc_determinant(tmpmat, dup);
+
+      if (det != 0) {
+        int row_id = 0;
+        int column_id = 0;
+        for (int i = 0; i < k; i++) {
+          dm_row[i] = -1;
+        }
+        for (int i = 0; i < k; i++) {
+          dm_column[i] = -1;
+        }
+
+        mindup = dup;
+        for (int i=0; i < k + m; i++) {
+          if (tmprow[i]) {
+            dm_row[row_id++] = i;
+          }
+        }
+        for (int i=0; i < k; i++) {
+          if (tmpcolumn[i]) {
+            dm_column[column_id++] = i;
+          }
+        }
+        minp = ek;
+      }
+    }
+  }
+
+
+  if (mindup == k+1) {
+    fprintf(stderr, "shec_make_decoding_matrix(): can't find recover matrix.\n");
+    return -1;
+  }
+
+  for (int i = 0; i < k + m; i++) {
+    minimum[i] = 0;
+  }
+
+  for (int i=0; i < k && dm_row[i] != -1; i++) {
+    minimum[dm_row[i]] = 1;
+  }
+
+  for (int i = 0; i < k; ++i) {
+    if (want[i] && avails[i]) {
+      minimum[i] = 1;
+    }
+  }
+
+  for (int i = 0; i < m; ++i) {
+    if (want[k + i] && avails[k + i] && !minimum[k + i]) {
+      for (int j = 0; j < k; ++j) {
+        if (matrix[i * k + j] > 0 && !want[j]) {
+          minimum[k + i] = 1;
+          break;
+        }
+      }
+    }
+  }
+
+  if (mindup == 0) {
+    return 0;
+  }
+
+  int tmpmat[mindup * mindup];
+  for (int i=0; i < mindup; i++) {
+    for (int j=0; j < mindup; j++) {
+      if (dm_row[i] < k) {
+        tmpmat[i * mindup + j] = (dm_row[i] == dm_column[j] ? 1 : 0);
+      } else {
+        tmpmat[i * mindup + j] = matrix[(dm_row[i] - k) * k + dm_column[j]];
+      }
+    }
+    if (dm_row[i] < k) {
+      for (int j = 0; j < mindup; j++) {
+        if (dm_row[i] == dm_column[j]) {
+          dm_row[i] = j;
+        }
+      }
+    } else {
+      dm_row[i] -= (k - mindup);
+    }
+  }
+
+  if (prepare) {
+    return 0;
+  }
+
+  int ret = jerasure_invert_matrix(tmpmat, decoding_matrix, mindup, w);
+
+  tcache.putDecodingTableToCache(decoding_matrix, dm_row, dm_column, minimum, technique,
+                                 k, m, c, w, want, avails);
+
+  return ret;
+}
+
+int ErasureCodeShec::shec_matrix_decode(int *want, int *avails, char **data_ptrs,
+                                        char **coding_ptrs, int size)
+{
+  int decoding_matrix[k*k];
+  int dm_row[k], dm_column[k];
+  int minimum[k + m];
+
+  memset(decoding_matrix, 0, sizeof(decoding_matrix));
+  memset(dm_row, -1, sizeof(dm_row));
+  memset(dm_column, -1, sizeof(dm_column));
+  memset(minimum, -1, sizeof(minimum));
+
+  if (w != 8 && w != 16 && w != 32) return -1;
+
+  if (shec_make_decoding_matrix(false, want, avails, decoding_matrix,
+                                dm_row, dm_column, minimum) < 0) {
+    return -1;
+  }
+
+  // Get decoding matrix size
+  int dm_size = 0;
+  for (int i = 0; i < k; i++) {
+    if (dm_row[i] == -1) {
+      break;
+    }
+    dm_size++;
+  }
+
+  char *dm_data_ptrs[dm_size];
+  for (int i = 0; i < dm_size; i++) {
+    dm_data_ptrs[i] = data_ptrs[dm_column[i]];
+  }
+
+  // Decode the data drives
+  for (int i = 0; i < dm_size; i++) {
+    if (!avails[dm_column[i]]) {
+      jerasure_matrix_dotprod(dm_size, w, decoding_matrix + (i * dm_size),
+                              dm_row, i, dm_data_ptrs, coding_ptrs, size);
+    }
+  }
+
+  // Re-encode any erased coding devices
+  for (int i = 0; i < m; i++) {
+    if (want[k+i] && !avails[k+i]) {
+      jerasure_matrix_dotprod(k, w, matrix + (i * k), NULL, i+k,
+                              data_ptrs, coding_ptrs, size);
+    }
+  }
+
+  return 0;
+}
diff --git a/src/erasure-code/shec/ErasureCodeShec.h b/src/erasure-code/shec/ErasureCodeShec.h
index 7eaccf5..bf07b80 100644
--- a/src/erasure-code/shec/ErasureCodeShec.h
+++ b/src/erasure-code/shec/ErasureCodeShec.h
@@ -26,6 +26,9 @@
 #include "ErasureCodeShecTableCache.h"
 #include <list>
 
+#define DEFAULT_RULESET_ROOT "default"
+#define DEFAULT_RULESET_FAILURE_DOMAIN "host"
+
 class ErasureCodeShec : public ErasureCode {
 
 public:
@@ -60,8 +63,8 @@ public:
     w(0),
     DEFAULT_W(8),
     technique(_technique),
-    ruleset_root("default"),
-    ruleset_failure_domain("host"),
+    ruleset_root(DEFAULT_RULESET_ROOT),
+    ruleset_failure_domain(DEFAULT_RULESET_FAILURE_DOMAIN),
     matrix(0)
   {}
 
@@ -81,11 +84,11 @@ public:
 
   virtual unsigned int get_chunk_size(unsigned int object_size) const;
 
-  virtual int minimum_to_decode(const set<int> &want_to_decode,
+  virtual int minimum_to_decode(const set<int> &want_to_read,
 				const set<int> &available_chunks,
 				set<int> *minimum);
 
-  virtual int minimum_to_decode_with_cost(const set<int> &want_to_decode,
+  virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
 					  const map<int, int> &available,
 					  set<int> *minimum);
 
@@ -102,7 +105,7 @@ public:
 			    const map<int, bufferlist> &chunks,
 			    map<int, bufferlist> *decoded);
 
-  int init(const map<std::string,std::string> &parameters);
+  virtual int init(ErasureCodeProfile &profile, ostream *ss);
   virtual void shec_encode(char **data,
 			   char **coding,
 			   int blocksize) = 0;
@@ -112,8 +115,21 @@ public:
 			  char **coding,
 			  int blocksize) = 0;
   virtual unsigned get_alignment() const = 0;
-  virtual int parse(const map<std::string,std::string> &parameters) = 0;
   virtual void prepare() = 0;
+
+  virtual int shec_matrix_decode(int *erased, int *avails,
+                                 char **data_ptrs, char **coding_ptrs, int size);
+  virtual int* shec_reedsolomon_coding_matrix(int is_single);
+
+private:
+  virtual int parse(const ErasureCodeProfile &profile) = 0;
+
+  virtual double shec_calc_recovery_efficiency1(int k, int m1, int m2, int c1, int c2);
+  virtual int shec_make_decoding_matrix(bool prepare,
+                                        int *want, int *avails,
+                                        int *decoding_matrix,
+                                        int *dm_row, int *dm_column,
+                                        int *minimum);
 };
 
 class ErasureCodeShecReedSolomonVandermonde : public ErasureCodeShec {
@@ -136,8 +152,9 @@ public:
 			  char **coding,
 			  int blocksize);
   virtual unsigned get_alignment() const;
-  virtual int parse(const map<std::string,std::string> &parameters);
   virtual void prepare();
+private:
+  virtual int parse(const ErasureCodeProfile &profile);
 };
 
 #endif
diff --git a/src/erasure-code/shec/ErasureCodeShecTableCache.cc b/src/erasure-code/shec/ErasureCodeShecTableCache.cc
index 8fb64b2..ebd32db 100644
--- a/src/erasure-code/shec/ErasureCodeShecTableCache.cc
+++ b/src/erasure-code/shec/ErasureCodeShecTableCache.cc
@@ -23,34 +23,95 @@
 #include "common/debug.h"
 // -----------------------------------------------------------------------------
 
+// -----------------------------------------------------------------------------
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _tc_prefix(_dout)
+// -----------------------------------------------------------------------------
+
+// -----------------------------------------------------------------------------
+
+static ostream&
+_tc_prefix(std::ostream* _dout) {
+  return *_dout << "ErasureCodeShecTableCache: ";
+}
+
+// -----------------------------------------------------------------------------
+
 ErasureCodeShecTableCache::~ErasureCodeShecTableCache()
 {
   Mutex::Locker lock(codec_tables_guard);
 
-  codec_technique_tables_t::const_iterator ttables_it;
-  codec_tables_t::const_iterator tables_it;
-  codec_tables_t_::const_iterator tables_it_;
-  codec_tables_t__::const_iterator tables_it__;
-  codec_table_t::const_iterator table_it;
-
   // clean-up all allocated tables
+  {
+    codec_technique_tables_t::const_iterator ttables_it;
+    codec_tables_t::const_iterator tables_it;
+    codec_tables_t_::const_iterator tables_it_;
+    codec_tables_t__::const_iterator tables_it__;
+    codec_table_t::const_iterator table_it;
 
-  for (ttables_it = encoding_table.begin(); ttables_it != encoding_table.end(); ++ttables_it) {
-    for (tables_it = ttables_it->second.begin(); tables_it != ttables_it->second.end(); ++tables_it) {
-      for (tables_it_ = tables_it->second.begin(); tables_it_ != tables_it->second.end(); ++tables_it_) {
-	for (tables_it__ = tables_it_->second.begin(); tables_it__ != tables_it_->second.end(); ++tables_it__) {
-	  for (table_it = tables_it__->second.begin(); table_it != tables_it__->second.end(); ++table_it) {
-	    if (table_it->second) {
-	      if (*(table_it->second)) {
-		delete *(table_it->second);
-	      }
-	      delete table_it->second;
-	    }
-	  }
+    for (ttables_it = encoding_table.begin(); ttables_it != encoding_table.end(); ++ttables_it) {
+      for (tables_it = ttables_it->second.begin(); tables_it != ttables_it->second.end(); ++tables_it) {
+        for (tables_it_ = tables_it->second.begin(); tables_it_ != tables_it->second.end(); ++tables_it_) {
+          for (tables_it__ = tables_it_->second.begin(); tables_it__ != tables_it_->second.end(); ++tables_it__) {
+            for (table_it = tables_it__->second.begin(); table_it != tables_it__->second.end(); ++table_it) {
+              if (table_it->second) {
+                if (*(table_it->second)) {
+                  delete *(table_it->second);
+                }
+                delete table_it->second;
+              }
+            }
+          }
         }
       }
     }
   }
+
+  {
+    std::map<int, lru_map_t*>::const_iterator lru_map_it;
+    std::map<int, lru_list_t*>::const_iterator lru_list_it;
+
+    for (lru_map_it = decoding_tables.begin();
+         lru_map_it != decoding_tables.end();
+         ++lru_map_it) {
+      if (lru_map_it->second) {
+        delete lru_map_it->second;
+      }
+    }
+
+    for (lru_list_it = decoding_tables_lru.begin();
+         lru_list_it != decoding_tables_lru.end();
+         ++lru_list_it) {
+      if (lru_list_it->second) {
+        delete lru_list_it->second;
+      }
+    }
+  }
+}
+
+ErasureCodeShecTableCache::lru_map_t*
+ErasureCodeShecTableCache::getDecodingTables(int technique) {
+  // the caller must hold the guard mutex:
+  // => Mutex::Locker lock(codec_tables_guard);
+
+  // create an lru_map if not yet allocated
+  if (!decoding_tables[technique]) {
+    decoding_tables[technique] = new lru_map_t;
+  }
+  return decoding_tables[technique];
+}
+
+ErasureCodeShecTableCache::lru_list_t*
+ErasureCodeShecTableCache::getDecodingTablesLru(int technique) {
+  // the caller must hold the guard mutex:
+  // => Mutex::Locker lock(codec_tables_guard);
+
+  // create an lru_list if not yet allocated
+  if (!decoding_tables_lru[technique]) {
+    decoding_tables_lru[technique] = new lru_list_t;
+  }
+  return decoding_tables_lru[technique];
 }
 
 int**
@@ -95,3 +156,162 @@ ErasureCodeShecTableCache::getLock()
 {
   return &codec_tables_guard;
 }
+
+uint64_t
+ErasureCodeShecTableCache::getDecodingCacheSignature(int k, int m, int c, int w,
+                                                     int *erased, int *avails) {
+  uint64_t signature = 0;
+  signature = (uint64_t)k;
+  signature |= ((uint64_t)m << 6);
+  signature |= ((uint64_t)c << 12);
+  signature |= ((uint64_t)w << 18);
+
+  for (int i=0; i < k+m; i++) {
+    signature |= ((uint64_t)(avails[i] ? 1 : 0) << (24+i));
+  }
+  for (int i=0; i < k+m; i++) {
+    signature |= ((uint64_t)(erased[i] ? 1 : 0) << (44+i));
+  }
+  return signature;
+}
+
+bool
+ErasureCodeShecTableCache::getDecodingTableFromCache(int* decoding_matrix,
+                                                     int* dm_row,
+                                                     int* dm_column,
+                                                     int* minimum,
+                                                     int technique,
+                                                     int k,
+                                                     int m,
+                                                     int c,
+                                                     int w,
+                                                     int* erased,
+                                                     int* avails) {
+  // --------------------------------------------------------------------------
+  // LRU decoding matrix cache
+  // --------------------------------------------------------------------------
+
+  uint64_t signature = getDecodingCacheSignature(k, m, c, w, erased, avails);
+  Mutex::Locker lock(codec_tables_guard);
+
+  dout(20) << "[ get table    ] = " << signature << dendl;
+
+  // we try to fetch a decoding table from an LRU cache
+  lru_map_t* decode_tbls_map =
+    getDecodingTables(technique);
+
+  lru_list_t* decode_tbls_lru =
+    getDecodingTablesLru(technique);
+
+  lru_map_t::iterator decode_tbls_map_it = decode_tbls_map->find(signature);
+  if (decode_tbls_map_it == decode_tbls_map->end()) {
+    return false;
+  }
+
+  dout(20) << "[ cached table ] = " << signature << dendl;
+  // copy parameters out of the cache
+
+  memcpy(decoding_matrix,
+         decode_tbls_map_it->second.second.decoding_matrix,
+         k * k * sizeof(int));
+  memcpy(dm_row,
+         decode_tbls_map_it->second.second.dm_row,
+         k * sizeof(int));
+  memcpy(dm_column,
+         decode_tbls_map_it->second.second.dm_column,
+         k * sizeof(int));
+  memcpy(minimum,
+         decode_tbls_map_it->second.second.minimum,
+         (k+m) * sizeof(int));
+
+  // find item in LRU queue and push back
+  decode_tbls_lru->splice(decode_tbls_lru->end(),
+                          *decode_tbls_lru,
+                          decode_tbls_map_it->second.first);
+  return true;
+}
+
+void
+ErasureCodeShecTableCache::putDecodingTableToCache(int* decoding_matrix,
+                                                   int* dm_row,
+                                                   int* dm_column,
+                                                   int* minimum,
+                                                   int technique,
+                                                   int k,
+                                                   int m,
+                                                   int c,
+                                                   int w,
+                                                   int* erased,
+                                                   int* avails) {
+  // --------------------------------------------------------------------------
+  // LRU decoding matrix cache
+  // --------------------------------------------------------------------------
+
+  Mutex::Locker lock(codec_tables_guard);
+
+  uint64_t signature = getDecodingCacheSignature(k, m, c, w, erased, avails);
+  dout(20) << "[ put table    ] = " << signature << dendl;
+
+  // we store a new table to the cache
+
+  //  bufferptr cachetable;
+
+  lru_map_t* decode_tbls_map =
+    getDecodingTables(technique);
+
+  lru_list_t* decode_tbls_lru =
+    getDecodingTablesLru(technique);
+
+  if (decode_tbls_map->count(signature)) {
+    dout(20) << "[ already on table ] = " << signature << dendl;
+
+    // find item in LRU queue and push back
+    decode_tbls_lru->splice(decode_tbls_lru->end(),
+                            *decode_tbls_lru,
+                            (*decode_tbls_map)[signature].first);
+    return;
+  }
+
+  // evt. shrink the LRU queue/map
+  if ((int)decode_tbls_lru->size() >=
+      ErasureCodeShecTableCache::decoding_tables_lru_length) {
+    dout(20) << "[ shrink lru   ] = " << signature << dendl;
+    // remove from map
+    decode_tbls_map->erase(decode_tbls_lru->front());
+    // remove from lru
+    decode_tbls_lru->pop_front();
+  }
+
+  {
+    dout(20) << "[ store table  ] = " << signature << dendl;
+
+    decode_tbls_lru->push_back(signature);
+
+    // allocate a new buffer
+    lru_list_t::iterator it_end = decode_tbls_lru->end();
+    --it_end;
+
+    lru_entry_t &map_value =
+      (*decode_tbls_map)[signature] =
+      std::make_pair(it_end, DecodingCacheParameter());
+    map_value.second.decoding_matrix = new int[k*k];
+    map_value.second.dm_row = new int[k];
+    map_value.second.dm_column = new int[k];
+    map_value.second.minimum = new int[k+m];
+
+    memcpy(map_value.second.decoding_matrix,
+           decoding_matrix,
+           k * k * sizeof(int));
+    memcpy(map_value.second.dm_row,
+           dm_row,
+           k * sizeof(int));
+    memcpy(map_value.second.dm_column,
+           dm_column,
+           k * sizeof(int));
+    memcpy(map_value.second.minimum,
+           minimum,
+           (k+m) * sizeof(int));
+
+    dout(20) << "[ cache size   ] = " << decode_tbls_lru->size() << dendl;
+  }
+}
diff --git a/src/erasure-code/shec/ErasureCodeShecTableCache.h b/src/erasure-code/shec/ErasureCodeShecTableCache.h
index 21f65bd..e4eaf0f 100644
--- a/src/erasure-code/shec/ErasureCodeShecTableCache.h
+++ b/src/erasure-code/shec/ErasureCodeShecTableCache.h
@@ -31,11 +31,43 @@ class ErasureCodeShecTableCache {
   // ---------------------------------------------------------------------------
   // This class implements a table cache for encoding and decoding matrices.
   // Encoding matrices are shared for the same (k,m,c,w) combination.
+  // It supplies a decoding matrix lru cache which is shared for identical
+  // matrix types e.g. there is one cache (lru-list + lru-map)
   // ---------------------------------------------------------------------------
 
+  class DecodingCacheParameter {
+   public:
+    int* decoding_matrix;  // size: k*k
+    int* dm_row;  // size: k
+    int* dm_column;  // size: k
+    int* minimum;  // size: k+m
+    DecodingCacheParameter() {
+      decoding_matrix = 0;
+      dm_row = 0;
+      dm_column = 0;
+      minimum = 0;
+    }
+    ~DecodingCacheParameter() {
+      if (decoding_matrix) {
+        delete[] decoding_matrix;
+      }
+      if (dm_row) {
+        delete[] dm_row;
+      }
+      if (dm_column) {
+        delete[] dm_column;
+      }
+      if (minimum) {
+        delete[] minimum;
+      }
+    }
+  };
+
  public:
 
-  typedef std::pair<std::list<std::string>::iterator, bufferptr> lru_entry_t;
+  static const int decoding_tables_lru_length = 10000;
+  typedef std::pair<std::list<uint64_t>::iterator,
+                    DecodingCacheParameter> lru_entry_t;
   typedef std::map< int, int** > codec_table_t;
   typedef std::map< int, codec_table_t > codec_tables_t__;
   typedef std::map< int, codec_tables_t__ > codec_tables_t_;
@@ -43,6 +75,9 @@ class ErasureCodeShecTableCache {
   typedef std::map< int, codec_tables_t > codec_technique_tables_t;
   // int** matrix = codec_technique_tables_t[technique][k][m][c][w]
   
+  typedef std::map< uint64_t, lru_entry_t > lru_map_t;
+  typedef std::list< uint64_t > lru_list_t;
+
  ErasureCodeShecTableCache() :
   codec_tables_guard("shec-lru-cache")
     {
@@ -52,15 +87,38 @@ class ErasureCodeShecTableCache {
   
   Mutex codec_tables_guard; // mutex used to protect modifications in encoding/decoding table maps
   
+  bool getDecodingTableFromCache(int* matrix,
+                                 int* dm_row, int* dm_column,
+                                 int* minimum,
+                                 int technique,
+                                 int k, int m, int c, int w,
+                                 int* want, int* avails);
+
+  void putDecodingTableToCache(int* matrix,
+                               int* dm_row, int* dm_column,
+                               int* minimum,
+                               int technique,
+                               int k, int m, int c, int w,
+                               int* want, int* avails);
+
   int** getEncodingTable(int technique, int k, int m, int c, int w);
   int** getEncodingTableNoLock(int technique, int k, int m, int c, int w);
   int* setEncodingTable(int technique, int k, int m, int c, int w, int*);
   
  private:
-  codec_technique_tables_t encoding_table; // encoding coefficients accessed via table[technique][k][m]
-  
+  // encoding table accessed via table[matrix][k][m][c][w]
+  // decoding table cache accessed via map[matrixtype]
+  // decoding table lru list accessed via list[matrixtype]
+  codec_technique_tables_t encoding_table;
+  std::map<int, lru_map_t*> decoding_tables;
+  std::map<int, lru_list_t*> decoding_tables_lru;
+
+  lru_map_t* getDecodingTables(int technique);
+  lru_list_t* getDecodingTablesLru(int technique);
+  uint64_t getDecodingCacheSignature(int k, int m, int c, int w,
+                                     int *want, int *avails);
+
   Mutex* getLock();
-  
 };
 
 #endif
diff --git a/src/erasure-code/shec/Makefile.am b/src/erasure-code/shec/Makefile.am
index 148f1c3..1449a41 100644
--- a/src/erasure-code/shec/Makefile.am
+++ b/src/erasure-code/shec/Makefile.am
@@ -1,11 +1,9 @@
 # SHEC plugin
-
-libec_shec_la_SOURCES = \
+shec_sources = \
 	erasure-code/ErasureCode.cc \
 	erasure-code/shec/ErasureCodePluginShec.cc \
 	erasure-code/shec/ErasureCodeShec.cc \
 	erasure-code/shec/ErasureCodeShecTableCache.cc \
-	erasure-code/shec/shec.cc \
 	erasure-code/shec/determinant.c \
 	erasure-code/jerasure/jerasure/src/cauchy.c \
 	erasure-code/jerasure/jerasure/src/galois.c \
@@ -23,10 +21,10 @@ libec_shec_la_SOURCES = \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
+
 noinst_HEADERS += \
 	erasure-code/shec/ErasureCodeShec.h \
 	erasure-code/shec/ErasureCodeShecTableCache.h \
-	erasure-code/shec/shec.h \
 	erasure-code/jerasure/jerasure/include/cauchy.h \
 	erasure-code/jerasure/jerasure/include/galois.h \
 	erasure-code/jerasure/jerasure/include/jerasure.h \
@@ -40,22 +38,125 @@ noinst_HEADERS += \
 
 erasure-code/shec/ErasureCodePluginShec.cc: ./ceph_ver.h
 
-libec_shec_la_CFLAGS = ${AM_CFLAGS} \
+libec_shec_generic_la_SOURCES = ${shec_sources}
+libec_shec_generic_la_CFLAGS = ${AM_CFLAGS}  \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+libec_shec_generic_la_CXXFLAGS= ${AM_CXXFLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+libec_shec_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+if LINUX
+libec_shec_generic_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
+endif
+
+erasure_codelib_LTLIBRARIES += libec_shec_generic.la
+
+libec_shec_neon_la_SOURCES = ${shec_sources} \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c \
+	erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
+libec_shec_neon_la_CFLAGS = ${AM_CFLAGS}  \
+	${ARM_NEON_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+libec_shec_neon_la_CXXFLAGS= ${AM_CXXFLAGS} \
+	${ARM_NEON_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+libec_shec_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+if LINUX
+libec_shec_neon_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
+endif
+
+if HAVE_NEON
+erasure_codelib_LTLIBRARIES += libec_shec_neon.la
+endif
+
+libec_shec_sse3_la_SOURCES = ${shec_sources}
+libec_shec_sse3_la_CFLAGS = ${AM_CFLAGS}  \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+libec_shec_sse3_la_CXXFLAGS= ${AM_CXXFLAGS} \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+libec_shec_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+if LINUX
+libec_shec_sse3_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
+endif
+
+if HAVE_SSSE3
+erasure_codelib_LTLIBRARIES += libec_shec_sse3.la
+endif
+
+libec_shec_sse4_la_SOURCES = ${shec_sources}
+libec_shec_sse4_la_CFLAGS = ${AM_CFLAGS}  \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	${INTEL_SSE4_1_FLAGS} \
+	${INTEL_SSE4_2_FLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
-libec_shec_la_CXXFLAGS= ${AM_CXXFLAGS} \
+libec_shec_sse4_la_CXXFLAGS= ${AM_CXXFLAGS} \
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS} \
+	${INTEL_SSE3_FLAGS} \
+	${INTEL_SSSE3_FLAGS} \
+	${INTEL_SSE4_1_FLAGS} \
+	${INTEL_SSE4_2_FLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
+libec_shec_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+if LINUX
+libec_shec_sse4_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
+endif
+
+if HAVE_SSE4_PCLMUL
+erasure_codelib_LTLIBRARIES += libec_shec_sse4.la
+endif
+
+libec_shec_la_SOURCES = \
+	erasure-code/shec/ErasureCodePluginSelectShec.cc
+libec_shec_la_CFLAGS = ${AM_CFLAGS}
+libec_shec_la_CXXFLAGS= ${AM_CXXFLAGS}
 libec_shec_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-#libec_shec_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-#libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__erasure_code_.*'
 libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
 if LINUX
 libec_shec_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
 
+erasure-code/shec/ErasureCodePluginSelectShec.cc: ./ceph_ver.h
+
 erasure_codelib_LTLIBRARIES += libec_shec.la
diff --git a/src/erasure-code/shec/shec.cc b/src/erasure-code/shec/shec.cc
deleted file mode 100644
index 9522555..0000000
--- a/src/erasure-code/shec/shec.cc
+++ /dev/null
@@ -1,329 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 FUJITSU LIMITED
- * Copyright (C) 2014, James S. Plank and Kevin Greenan
- *
- * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
- * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
- *
- *  This library is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU Lesser General Public
- *  License as published by the Free Software Foundation; either
- *  version 2.1 of the License, or (at your option) any later version.
- *
- */
-
-/* Jerasure's authors:
-
-   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
-   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
-   Revision 1.0 - 2007: James S. Plank
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <algorithm>
-
-#include "shec.h"
-
-extern "C"{
-#include "jerasure/include/jerasure.h"
-#include "jerasure/include/reed_sol.h"
-
-#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
-
-extern int calc_determinant(int *matrix, int dim);
-}
-
-double shec_calc_recovery_efficiency1(int k, int m1, int m2, int c1, int c2){
-  int r_eff_k[k];
-  double r_e1;
-  int i, rr, cc, start, end;
-  int first_flag;
-
-  if (m1 < c1 || m2 < c2) return -1;
-  if ((m1 == 0 && c1 != 0) || (m2 == 0 && c2 != 0)) return -1;
-
-  for (i=0; i<k; i++) r_eff_k[i] = 100000000;
-  r_e1 = 0;
-
-  for (rr=0; rr<m1; rr++){
-    start = ((rr*k)/m1) % k;
-    end = (((rr+c1)*k)/m1) % k;
-    for (cc=start, first_flag=1; first_flag || cc!=end; cc=(cc+1)%k){
-      first_flag = 0;
-      r_eff_k[cc] = std::min(r_eff_k[cc], ((rr+c1)*k)/m1 - (rr*k)/m1);
-    }
-    r_e1 += ((rr+c1)*k)/m1 - (rr*k)/m1;
-  }
-
-  for (rr=0; rr<m2; rr++){
-    start = ((rr*k)/m2) % k;
-    end = (((rr+c2)*k)/m2) % k;
-    for (cc=start, first_flag=1; first_flag || cc!=end; cc=(cc+1)%k){
-      first_flag = 0;
-      r_eff_k[cc] = std::min(r_eff_k[cc], ((rr+c2)*k)/m2 - (rr*k)/m2);
-    }
-    r_e1 += ((rr+c2)*k)/m2 - (rr*k)/m2;
-  }
-
-  for (i=0; i<k; i++){
-    r_e1 += r_eff_k[i];
-  }
-
-  r_e1 /= (k+m1+m2);
-
-  return r_e1;
-}
-
-int *shec_reedsolomon_coding_matrix(int k, int m, int c, int w, int is_single)
-{
-  int *matrix;
-  int rr, cc, start, end;
-  int m1, m2, c1, c2, c1_best = -1, m1_best = -1;
-  double min_r_e1;
-
-  if (w != 8 && w != 16 && w != 32) return NULL;
-
-  if (!is_single){
-
-    min_r_e1 = 100.0;
-
-    // create all multiple shec pattern and choose best.
-
-    for (c1=0; c1 <= c/2; c1++){
-      for (m1=0; m1 <= m; m1++){
-	c2 = c-c1;
-	m2 = m-m1;
-
-	if (m1 < c1 || m2 < c2) continue;
-	if ((m1 == 0 && c1 != 0) || (m2 == 0 && c2 != 0)) continue;
-	if ((m1 != 0 && c1 == 0) || (m2 != 0 && c2 == 0)) continue;
-
-	// minimize r_e1
-
-	if (true) {
-	  double r_e1;
-	  r_e1 = shec_calc_recovery_efficiency1(k, m1, m2, c1, c2);
-	  if (r_e1 < min_r_e1){
-	    min_r_e1 = r_e1;
-	    c1_best = c1;
-	    m1_best = m1;
-	  }
-	}
-      }
-    }
-    m1 = m1_best;
-    c1 = c1_best;
-    m2 = m - m1_best;
-    c2 = c - c1_best;
-  } else {
-    m1 = 0;
-    c1 = 0;
-    m2 = m;
-    c2 = c;
-  }
-
-  // create matrix
-  matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
-
-  for (rr=0; rr<m1; rr++){
-    end = ((rr*k)/m1) % k;
-    start = (((rr+c1)*k)/m1) % k;
-    for (cc=start; cc!=end; cc=(cc+1)%k){
-      matrix[cc + rr*k] = 0;
-    }
-  }
-
-  for (rr=0; rr<m2; rr++){
-    end = ((rr*k)/m2) % k;
-    start = (((rr+c2)*k)/m2) % k;
-    for (cc=start; cc!=end; cc=(cc+1)%k){
-      matrix[cc + (rr+m1)*k] = 0;
-    }
-  }
-
-  return matrix;
-}
-
-int shec_make_decoding_matrix(bool prepare, int k, int m, int w, int *matrix, int *erased, int *avails, int *decoding_matrix, int *dm_ids, int *minimum)
-{
-  int i, j, det = 0;
-  int ek;
-  int *tmpmat = NULL, tmprow[k+m], element, dup, mindup;
-
-  for (i = 0, j = 0, ek = 0; i < k; i++) {
-    if (erased[i] == 1) {
-      ek++;
-    } else {
-      dm_ids[j] = i;
-      j++;
-    }
-  }
-
-  tmpmat = talloc(int, k*k);
-  if (tmpmat == NULL) { return -1; }
-  for (i = 0; i < k-ek; i++) {
-    for (j = 0; j < k; j++) tmpmat[i*k+j] = 0;
-    tmpmat[i*k+dm_ids[i]] = 1;
-  }
-
-  if (ek > m){
-    return -1;
-  }
-
-  mindup = k+1;
-  int minc[ek];
-  for (i=0; i<ek; i++){
-    minc[i] = -1;
-  }
-  int p[ek];
-  int pp[k+m];
-  for (i=0; i<ek; i++){
-    pp[i] = 1;
-  }
-  for (i=ek; i<m; i++){
-    pp[i] = 0;
-  }
-
-  do {
-    i=0;
-    for (j=0; j<m; j++){
-      if (pp[j]){
-	p[i++] = j;
-      }
-    }
-
-    bool ok = true;
-    for (i = 0; i < ek; i++) {
-      if (erased[k+p[i]] == 1 || avails[k+p[i]] == 0) ok = false;
-      for (j = 0; j < k; j++) {
-	element = matrix[(p[i])*k+j];
-	if (element != 0) {
-	  if (erased[j] == 0 && avails[j] == 0) ok = false;
-	}
-      }
-    }
-    if (ok == false) continue;
-
-    for (i = 0; i < k+m; i++) tmprow[i] = 0;
-    for (i = 0; i < m; i++) {
-      if (erased[k+i] == 1) {
-	for (j = 0; j < k; j++) {
-	  if (matrix[i*k+j] != 0 && erased[j] == 0) tmprow[j] = 1;
-	}
-      }
-    }
-    for (i = 0; i < ek; i++) {
-      tmprow[k+p[i]] = 1;
-      for (j = 0; j < k; j++) {
-	element = matrix[(p[i])*k+j];
-	tmpmat[(k-ek+i)*k+j] = element;
-	if (element != 0 && erased[j] == 0) tmprow[j] = 1;
-      }
-    }
-    dup = 0;
-    for (j = 0; j < k; j++) {
-      if (tmprow[j] > 0) dup++;
-    }
-    if (dup < mindup) {
-      det = calc_determinant(tmpmat, k);
-      if (det != 0) {
-	mindup = dup;
-	for (int i=0; i<ek; i++){
-	  minc[i] = p[i];
-	}
-      }
-    }
-  } while (std::prev_permutation(pp, pp+m));
-
-  if (minc[0] == -1 && mindup == k+1) {
-    fprintf(stderr, "shec_make_decoding_matrix(): can't find recover matrix.\n");
-    free(tmpmat);
-    return -1;
-  }
-
-  for (i = 0; i < k+m; i++) minimum[i] = 0;
-  for (i = 0; i < m; i++) {
-    if (erased[k+i] == 1) {
-      for (j = 0; j < k; j++) {
-	if (matrix[i*k+j] != 0 && erased[j] == 0) minimum[j] = 1;
-      }
-    }
-  }
-  for (i = 0; i < ek; i++) {
-    dm_ids[k-ek+i] = k+minc[i];
-    minimum[k+minc[i]] = 1;
-    for (j = 0; j < k; j++) {
-      element = matrix[(minc[i])*k+j];
-      tmpmat[(k-ek+i)*k+j] = element;
-      if (element != 0 && erased[j] == 0) minimum[j] = 1;
-    }
-  }
-
-  if (prepare == true) {
-    free(tmpmat);
-    return 0;
-  }
-
-  i = jerasure_invert_matrix(tmpmat, decoding_matrix, k, w);
-
-  free(tmpmat);
-
-  return i;
-}
-
-int shec_matrix_decode(int k, int m, int w, int *matrix,
-		       int *erased, int *avails, char **data_ptrs, char **coding_ptrs, int size)
-{
-  int i, edd;
-  int *decoding_matrix = NULL, dm_ids[k];
-  int minimum[k + m];
-
-  if (w != 8 && w != 16 && w != 32) return -1;
-
-  /* Find the number of data drives failed */
-
-  edd = 0;
-  for (i = 0; i < k; i++) {
-    if (erased[i]) {
-      edd++;
-    }
-  }
-
-  decoding_matrix = talloc(int, k*k);
-  if (decoding_matrix == NULL) { return -1; }
-
-  if (shec_make_decoding_matrix(false, k, m, w, matrix, erased,
-				avails, decoding_matrix, dm_ids, minimum) < 0) {
-    free(decoding_matrix);
-    return -1;
-  }
-
-  /* Decode the data drives */
-
-  for (i = 0; edd > 0 && i < k; i++) {
-    if (erased[i]) {
-      jerasure_matrix_dotprod(k, w, decoding_matrix+(i*k),
-			      dm_ids, i, data_ptrs, coding_ptrs, size);
-      edd--;
-    }
-  }
-
-  /* Re-encode any erased coding devices */
-
-  for (i = 0; i < m; i++) {
-    if (erased[k+i]) {
-      jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, i+k,
-			      data_ptrs, coding_ptrs, size);
-    }
-  }
-
-  if (decoding_matrix != NULL) free(decoding_matrix);
-
-  return 0;
-}
diff --git a/src/erasure-code/shec/shec.h b/src/erasure-code/shec/shec.h
deleted file mode 100755
index fe4471f..0000000
--- a/src/erasure-code/shec/shec.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 FUJITSU LIMITED
- * Copyright (C) 2014, James S. Plank and Kevin Greenan
- *
- * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
- * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
- *
- *  This library is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU Lesser General Public
- *  License as published by the Free Software Foundation; either
- *  version 2.1 of the License, or (at your option) any later version.
- *
- */
-
-/* Jerasure's authors:
-
-   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
-   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
-   Revision 1.0 - 2007: James S. Plank
- */
-
-#ifndef SHEC_H
-#define SHEC_H
-
-int *shec_reedsolomon_coding_matrix(int k, int m, int c, int w, int is_single);
-int shec_make_decoding_matrix(bool prepare, int k, int m, int w, int *matrix,
-    int *erased, int *avails, int *decoding_matrix, int *dm_ids, int *minimum);
-int shec_matrix_decode(int k, int m, int w, int *matrix,
-    int *erased, int *avails, char **data_ptrs, char **coding_ptrs, int size);
-
-#endif
diff --git a/src/global/global_context.h b/src/global/global_context.h
index 6586b5c..bf59c78 100644
--- a/src/global/global_context.h
+++ b/src/global/global_context.h
@@ -17,7 +17,6 @@
 
 #include "common/ceph_context.h"
 
-#include <iostream>
 #include <stdint.h>
 
 struct md_config_t;
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index 3464b0a..ed5d186 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -29,6 +29,9 @@
 #include "include/compat.h"
 #include "include/color.h"
 
+#include <pwd.h>
+#include <grp.h>
+
 #include <errno.h>
 #include <deque>
 
@@ -126,13 +129,104 @@ void global_init(std::vector < const char * > *alt_def_args,
   if (g_conf->log_flush_on_exit)
     g_ceph_context->_log->set_flush_on_exit();
 
+  // consider --setuser root a no-op, even if we're not root
+  if (getuid() != 0) {
+    if (g_conf->setuser.length()) {
+      cerr << "ignoring --setuser " << g_conf->setuser << " since I am not root"
+	   << std::endl;
+      g_conf->set_val("setuser", "", false, false);
+    }
+    if (g_conf->setgroup.length()) {
+      cerr << "ignoring --setgroup " << g_conf->setgroup
+	   << " since I am not root" << std::endl;
+      g_conf->set_val("setgroup", "", false, false);
+    }
+  }
+
+  // drop privileges?
+  if (g_conf->setgroup.length() ||
+      g_conf->setuser.length()) {
+    uid_t uid = 0;  // zero means no change; we can only drop privs here.
+    gid_t gid = 0;
+    if (g_conf->setuser.length()) {
+      uid = atoi(g_conf->setuser.c_str());
+      if (!uid) {
+	char buf[4096];
+	struct passwd pa;
+	struct passwd *p = 0;
+	getpwnam_r(g_conf->setuser.c_str(), &pa, buf, sizeof(buf), &p);
+	if (!p) {
+	  cerr << "unable to look up user '" << g_conf->setuser << "'"
+	       << std::endl;
+	  exit(1);
+	}
+	uid = p->pw_uid;
+	gid = p->pw_gid;
+      }
+    }
+    if (g_conf->setgroup.length() > 0) {
+      gid = atoi(g_conf->setgroup.c_str());
+      if (!gid) {
+	char buf[4096];
+	struct group gr;
+	struct group *g = 0;
+	getgrnam_r(g_conf->setgroup.c_str(), &gr, buf, sizeof(buf), &g);
+	if (!g) {
+	  cerr << "unable to look up group '" << g_conf->setgroup << "'"
+	       << std::endl;
+	  exit(1);
+	}
+	gid = g->gr_gid;
+      }
+    }
+    if ((uid || gid) &&
+	g_conf->setuser_match_path.length()) {
+      struct stat st;
+      int r = ::stat(g_conf->setuser_match_path.c_str(), &st);
+      if (r < 0) {
+	r = -errno;
+	cerr << "unable to stat setuser_match_path "
+	     << g_conf->setuser_match_path
+	     << ": " << cpp_strerror(r) << std::endl;
+	exit(1);
+      }
+      if ((uid && uid != st.st_uid) ||
+	  (gid && gid != st.st_gid)) {
+	cerr << "WARNING: will not setuid/gid: " << g_conf->setuser_match_path
+	     << " owned by " << st.st_uid << ":" << st.st_gid
+	     << " and not requested " << uid << ":" << gid
+	     << std::endl;
+	uid = 0;
+	gid = 0;
+      } else {
+	dout(10) << "setuser_match_path "
+		 << g_conf->setuser_match_path << " owned by "
+		 << st.st_uid << ":" << st.st_gid << ", doing setuid/gid"
+		 << dendl;
+      }
+    }
+    if (setgid(gid) != 0) {
+      int r = errno;
+      cerr << "unable to setgid " << gid << ": " << cpp_strerror(r)
+	   << std::endl;
+      exit(1);
+    }
+    if (setuid(uid) != 0) {
+      int r = errno;
+      cerr << "unable to setuid " << uid << ": " << cpp_strerror(r)
+	   << std::endl;
+      exit(1);
+    }
+    dout(0) << "set uid:gid to " << uid << ":" << gid << dendl;
+  }
+
   if (g_conf->run_dir.length() &&
       code_env == CODE_ENVIRONMENT_DAEMON &&
       !(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) {
     int r = ::mkdir(g_conf->run_dir.c_str(), 0755);
     if (r < 0 && errno != EEXIST) {
       r = -errno;
-      derr << "warning: unable to create " << g_conf->run_dir << ": " << cpp_strerror(r) << dendl;
+      cerr << "warning: unable to create " << g_conf->run_dir << ": " << cpp_strerror(r) << std::endl;
     }
   }
 
@@ -142,6 +236,13 @@ void global_init(std::vector < const char * > *alt_def_args,
   // and opening the log file immediately.
   g_conf->call_all_observers();
 
+  // test leak checking
+  if (g_conf->debug_deliberately_leak_memory) {
+    derr << "deliberately leaking some memory" << dendl;
+    char *s = new char[1234567];
+    (void)s;
+  }
+
   if (code_env == CODE_ENVIRONMENT_DAEMON && !(flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
     output_ceph_version();
 }
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
index 03bf54d..d3afd40 100644
--- a/src/include/CompatSet.h
+++ b/src/include/CompatSet.h
@@ -29,19 +29,26 @@ struct CompatSet {
     Feature(uint64_t _id, const string& _name) : id(_id), name(_name) {}
   };
 
-  struct FeatureSet {
+  class FeatureSet {
     uint64_t mask;
     map <uint64_t,string> names;
 
+  public:
+    friend struct CompatSet;
+    friend class CephCompatSet_AllSet_Test;
+    friend class CephCompatSet_other_Test;
+    friend class CephCompatSet_merge_Test;
+    friend ostream& operator<<(ostream& out, const CompatSet::FeatureSet& fs);
+    friend ostream& operator<<(ostream& out, const CompatSet& compat);
     FeatureSet() : mask(1), names() {}
-    void insert(Feature f) {
+    void insert(const Feature& f) {
       assert(f.id > 0);
       assert(f.id < 64);
       mask |= ((uint64_t)1<<f.id);
       names[f.id] = f.name;
     }
 
-    bool contains(Feature f) const {
+    bool contains(const Feature& f) const {
       return names.count(f.id);
     }
     bool contains(uint64_t f) const {
@@ -55,13 +62,14 @@ struct CompatSet {
       assert(i != names.end());
       return i->second;
     }
+
     void remove(uint64_t f) {
       if (names.count(f)) {
 	names.erase(f);
 	mask &= ~((uint64_t)1<<f);
       }
     }
-    void remove(Feature f) {
+    void remove(const Feature& f) {
       remove(f.id);
     }
 
@@ -103,7 +111,7 @@ struct CompatSet {
       for (map<uint64_t,string>::const_iterator p = names.begin();
 	   p != names.end();
 	   ++p) {
-	char s[10];
+	char s[18];
 	snprintf(s, sizeof(s), "feature_%lld", (unsigned long long)p->first);
 	f->dump_string(s, p->second);
       }
diff --git a/src/include/Context.h b/src/include/Context.h
index 2ae9221..16c7c43 100644
--- a/src/include/Context.h
+++ b/src/include/Context.h
@@ -22,7 +22,6 @@
 #include <list>
 #include <set>
 
-#include <iostream>
 #include "include/assert.h"
 #include "include/memory.h"
 
@@ -191,7 +190,7 @@ public:
   }
   void complete(int r) {
     // Neuter any ContextInstanceType custom complete(), because although
-    // I want to look like him, I don't actually want to run his code.
+    // I want to look like it, I don't actually want to run its code.
     Context::complete(r);
   }
   void finish(int r) {
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index ef2e0ae..a364b29 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -61,6 +61,7 @@ noinst_HEADERS += \
 	include/cmp.h \
 	include/color.h \
 	include/compat.h \
+	include/sock_compat.h \
 	include/crc32c.h \
 	include/encoding.h \
 	include/err.h \
@@ -68,6 +69,7 @@ noinst_HEADERS += \
 	include/filepath.h \
 	include/frag.h \
 	include/hash.h \
+	include/inline_memory.h \
 	include/intarith.h \
 	include/interval_set.h \
 	include/int_types.h \
@@ -90,6 +92,8 @@ noinst_HEADERS += \
 	include/elist.h \
 	include/uuid.h \
 	include/xlist.h \
+	include/compact_map.h \
+	include/compact_set.h \
 	include/rados/librados.h \
 	include/rados/rados_types.h \
 	include/rados/rados_types.hpp \
@@ -109,6 +113,6 @@ noinst_HEADERS += \
 	include/on_exit.h \
 	include/memory.h \
 	include/rados/memory.h \
-	include/hash_namespace.h \
 	include/unordered_set.h \
-	include/unordered_map.h
+	include/unordered_map.h \
+	include/timegm.h
diff --git a/src/include/atomic.h b/src/include/atomic.h
index 960123e..d5bc1c7 100644
--- a/src/include/atomic.h
+++ b/src/include/atomic.h
@@ -70,6 +70,17 @@ namespace ceph {
       ceph_spin_unlock(&lock);
       return ret;
     }
+    bool compare_and_swap(T o, T n) {
+      bool success = false;
+      ceph_spin_lock(&lock);
+      if (val == o) {
+        success = true;
+        val = n;
+      }
+      ceph_spin_unlock(&lock);
+      return success;
+    }
+
   private:
     // forbid copying
     atomic_spinlock_t(const atomic_spinlock_t<T> &other);
@@ -113,6 +124,10 @@ namespace ceph {
       // at some point.  this hack can go away someday...
       return AO_load_full((AO_t *)&val);
     }
+    bool compare_and_swap(AO_t o, AO_t n) {
+      return AO_compare_and_swap(&val, o, n);
+    }
+
   private:
     // forbid copying
     atomic_t(const atomic_t &other);
diff --git a/src/include/buffer.h b/src/include/buffer.h
index d243d6e..f28bc5e 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -23,6 +23,7 @@
 #endif
 
 #include <stdio.h>
+#include <sys/uio.h>
 
 #if defined(__linux__)	// For malloc(2).
 #include <malloc.h>
@@ -36,12 +37,13 @@
 # include <sys/mman.h>
 #endif
 
-#include <iostream>
-#include <istream>
+#include <iosfwd>
 #include <iomanip>
 #include <list>
+#include <vector>
 #include <string>
 #include <exception>
+#include <type_traits>
 
 #include "page.h"
 #include "crc32c.h"
@@ -59,12 +61,14 @@
 #endif
 
 #if defined(HAVE_XIO)
-struct xio_mempool_obj;
+struct xio_reg_mem;
 class XioDispatchHook;
 #endif
 
 namespace ceph {
 
+const static int CEPH_BUFFER_APPEND_SIZE(4096);
+
 class CEPH_BUFFER_API buffer {
   /*
    * exceptions
@@ -72,27 +76,19 @@ class CEPH_BUFFER_API buffer {
 
 public:
   struct error : public std::exception{
-    const char *what() const throw () {
-      return "buffer::exception";
-    }
+    const char *what() const throw ();
   };
   struct bad_alloc : public error {
-    const char *what() const throw () {
-      return "buffer::bad_alloc";
-    }
+    const char *what() const throw ();
   };
   struct end_of_buffer : public error {
-    const char *what() const throw () {
-      return "buffer::end_of_buffer";
-    }
+    const char *what() const throw ();
   };
   struct malformed_input : public error {
-    explicit malformed_input(const char *w) {
-      snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", w);
-    }
-    const char *what() const throw () {
-      return buf;
+    explicit malformed_input(const std::string& w) {
+      snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", w.c_str());
     }
+    const char *what() const throw ();
   private:
     char buf[256];
   };
@@ -220,12 +216,7 @@ public:
     unsigned raw_length() const;
     int raw_nref() const;
 
-    void copy_out(unsigned o, unsigned l, char *dest) const {
-      assert(_raw);
-      if (!((o <= _len) && (o+l <= _len)))
-	throw end_of_buffer();
-      memcpy(dest, c_str()+o, l);
-    }
+    void copy_out(unsigned o, unsigned l, char *dest) const;
 
     bool can_zero_copy() const;
     int zero_copy_to_fd(int fd, int64_t *offset) const;
@@ -236,14 +227,23 @@ public:
     bool is_zero() const;
 
     // modifiers
-    void set_offset(unsigned o) { _off = o; }
-    void set_length(unsigned l) { _len = l; }
+    void set_offset(unsigned o) {
+      assert(raw_length() >= o);
+      _off = o;
+    }
+    void set_length(unsigned l) {
+      assert(raw_length() >= l);
+      _len = l;
+    }
 
-    void append(char c);
-    void append(const char *p, unsigned l);
+    unsigned append(char c);
+    unsigned append(const char *p, unsigned l);
     void copy_in(unsigned o, unsigned l, const char *src);
+    void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset);
     void zero();
+    void zero(bool crc_reset);
     void zero(unsigned o, unsigned l);
+    void zero(unsigned o, unsigned l, bool crc_reset);
 
   };
 
@@ -260,60 +260,55 @@ public:
     unsigned _memcopy_count; //the total of memcopy using rebuild().
     ptr append_buffer;  // where i put small appends.
 
-  public:
-    class CEPH_BUFFER_API iterator {
-      list *bl;
-      std::list<ptr> *ls; // meh.. just here to avoid an extra pointer dereference..
-      unsigned off;  // in bl
-      std::list<ptr>::iterator p;
-      unsigned p_off; // in *p
+    template <bool is_const>
+    class iterator_impl: public std::iterator<std::forward_iterator_tag, char> {
+    protected:
+      typedef typename std::conditional<is_const,
+					const list,
+					list>::type bl_t;
+      typedef typename std::conditional<is_const,
+					const std::list<ptr>,
+					std::list<ptr> >::type list_t;
+      typedef typename std::conditional<is_const,
+					typename std::list<ptr>::const_iterator,
+					typename std::list<ptr>::iterator>::type list_iter_t;
+      bl_t* bl;
+      list_t* ls;  // meh.. just here to avoid an extra pointer dereference..
+      unsigned off; // in bl
+      list_iter_t p;
+      unsigned p_off;   // in *p
+
     public:
       // constructor.  position.
-      iterator() :
-	bl(0), ls(0), off(0), p_off(0) {}
-      iterator(list *l, unsigned o=0) : 
-	bl(l), ls(&bl->_buffers), off(0), p(ls->begin()), p_off(0) {
+      iterator_impl()
+	: bl(0), ls(0), off(0), p_off(0) {}
+      iterator_impl(bl_t *l, unsigned o=0)
+	: bl(l), ls(&bl->_buffers), off(0), p(ls->begin()), p_off(0) {
 	advance(o);
       }
-      iterator(list *l, unsigned o, std::list<ptr>::iterator ip, unsigned po) : 
-	bl(l), ls(&bl->_buffers), off(o), p(ip), p_off(po) { }
-
-      iterator(const iterator& other) : bl(other.bl),
-					ls(other.ls),
-					off(other.off),
-					p(other.p),
-					p_off(other.p_off) {}
-
-      iterator& operator=(const iterator& other) {
-	if (this != &other) {
-	  bl = other.bl;
-	  ls = other.ls;
-	  off = other.off;
-	  p = other.p;
-	  p_off = other.p_off;
-	}
-	return *this;
-      }
+      iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+	: bl(l), ls(&bl->_buffers), off(o), p(ip), p_off(po) {}
 
       /// get current iterator offset in buffer::list
-      unsigned get_off() { return off; }
+      unsigned get_off() const { return off; }
       
       /// get number of bytes remaining from iterator position to the end of the buffer::list
-      unsigned get_remaining() { return bl->length() - off; }
+      unsigned get_remaining() const { return bl->length() - off; }
 
       /// true if iterator is at the end of the buffer::list
-      bool end() {
+      bool end() const {
 	return p == ls->end();
 	//return off == bl->length();
       }
 
       void advance(int o);
       void seek(unsigned o);
-      char operator*();
-      iterator& operator++();
-      ptr get_current_ptr();
+      bool operator!=(const iterator_impl& rhs) const;
+      char operator*() const;
+      iterator_impl& operator++();
+      ptr get_current_ptr() const;
 
-      list& get_bl() { return *bl; }
+      bl_t& get_bl() { return *bl; }
 
       // copy data out.
       // note that these all _append_ to dest!
@@ -322,11 +317,36 @@ public:
       void copy(unsigned len, list &dest);
       void copy(unsigned len, std::string &dest);
       void copy_all(list &dest);
+    };
+
+  public:
+    typedef iterator_impl<true> const_iterator;
+
+    class CEPH_BUFFER_API iterator : public iterator_impl<false> {
+    public:
+      iterator(): iterator_impl() {}
+      iterator(bl_t *l, unsigned o=0) :
+	iterator_impl(l, o) {}
+      iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po) :
+	iterator_impl(l, o, ip, po) {}
+
+      void advance(int o);
+      void seek(unsigned o);
+      char operator*();
+      iterator& operator++();
+      ptr get_current_ptr();
+
+      // copy data out
+      void copy(unsigned len, char *dest);
+      void copy(unsigned len, ptr &dest);
+      void copy(unsigned len, list &dest);
+      void copy(unsigned len, std::string &dest);
+      void copy_all(list &dest);
 
       // copy data in
       void copy_in(unsigned len, const char *src);
+      void copy_in(unsigned len, const char *src, bool crc_reset);
       void copy_in(unsigned len, const list& otherl);
-
     };
 
   private:
@@ -345,6 +365,7 @@ public:
 			      _memcopy_count(other._memcopy_count), last_p(this) {
       make_shareable();
     }
+    list(list&& other);
     list& operator= (const list& other) {
       if (this != &other) {
         _buffers = other._buffers;
@@ -370,7 +391,9 @@ public:
 #endif
       return _len;
     }
+
     bool contents_equal(buffer::list& other);
+    bool contents_equal(const buffer::list& other) const;
 
     bool can_zero_copy() const;
     bool is_aligned(unsigned align) const;
@@ -454,12 +477,20 @@ public:
       return iterator(this, _len, _buffers.end(), 0);
     }
 
+    const_iterator begin() const {
+      return const_iterator(this, 0);
+    }
+    const_iterator end() const {
+      return const_iterator(this, _len, _buffers.end(), 0);
+    }
+
     // crope lookalikes.
     // **** WARNING: this are horribly inefficient for large bufferlists. ****
     void copy(unsigned off, unsigned len, char *dest) const;
     void copy(unsigned off, unsigned len, list &dest) const;
     void copy(unsigned off, unsigned len, std::string& dest) const;
     void copy_in(unsigned off, unsigned len, const char *src);
+    void copy_in(unsigned off, unsigned len, const char *src, bool crc_reset);
     void copy_in(unsigned off, unsigned len, const list& src);
 
     void append(char c);
@@ -500,7 +531,9 @@ public:
     int write_file(const char *fn, int mode=0644);
     int write_fd(int fd) const;
     int write_fd_zero_copy(int fd) const;
+    void prepare_iov(std::vector<iovec> *piov) const;
     uint32_t crc32c(uint32_t crc) const;
+	void invalidate_crc();
   };
 
   /*
@@ -525,7 +558,7 @@ public:
 };
 
 #if defined(HAVE_XIO)
-xio_mempool_obj* get_xio_mp(const buffer::ptr& bp);
+xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
 #endif
 
 typedef buffer::ptr bufferptr;
@@ -568,41 +601,17 @@ inline bool operator<=(bufferlist& l, bufferlist& r) {
 }
 
 
-inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
-  if (bp.have_raw())
-    out << "buffer::ptr(" << bp.offset() << "~" << bp.length()
-	<< " " << (void*)bp.c_str() 
-	<< " in raw " << (void*)bp.raw_c_str()
-	<< " len " << bp.raw_length()
-	<< " nref " << bp.raw_nref() << ")";
-  else
-    out << "buffer:ptr(" << bp.offset() << "~" << bp.length() << " no raw)";
-  return out;
-}
+std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
 
-inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
-  out << "buffer::list(len=" << bl.length() << "," << std::endl;
 
-  std::list<buffer::ptr>::const_iterator it = bl.buffers().begin();
-  while (it != bl.buffers().end()) {
-    out << "\t" << *it;
-    if (++it == bl.buffers().end()) break;
-    out << "," << std::endl;
-  }
-  out << std::endl << ")";
-  return out;
-}
+std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
 
-inline std::ostream& operator<<(std::ostream& out, buffer::error& e)
-{
-  return out << e.what();
-}
+std::ostream& operator<<(std::ostream& out, const buffer::error& e);
 
 inline bufferhash& operator<<(bufferhash& l, bufferlist &r) {
   l.update(r);
   return l;
 }
-
 }
 
 #endif
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index 781df1b..4857b0a 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -64,7 +64,10 @@
 // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
 #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
 #define CEPH_FEATURE_MON_METADATA (1ULL<<50)
-/* ... */
+#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
+#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
+#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
 #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
 
 #define CEPH_FEATURE_RESERVED2 (1ULL<<61)  /* slow down, we are almost out... */
@@ -151,6 +154,11 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
 	 CEPH_FEATURE_MDS_QUOTA | \
          CEPH_FEATURE_CRUSH_V4 |	     \
          CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY |		 \
+	 CEPH_FEATURE_MON_METADATA |			 \
+	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT |		 \
+         CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 |   \
+         CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES |         \
+	 CEPH_FEATURE_OSD_HITSET_GMT |			 \
 	 CEPH_FEATURE_HAMMER_0_94_4 |		 \
 	 0ULL)
 
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index d366032..08ef460 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -29,6 +29,7 @@
 #define CEPH_INO_ROOT   1
 #define CEPH_INO_CEPH   2       /* hidden .ceph dir */
 #define CEPH_INO_DOTDOT 3	/* used by ceph fuse for parent (..) */
+#define CEPH_INO_LOST_AND_FOUND 4	/* reserved ino for use in recovery */
 
 /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
 #define CEPH_MAX_MON   31
@@ -94,6 +95,7 @@ struct ceph_dir_layout {
 #define CEPH_MSG_MON_MAP                4
 #define CEPH_MSG_MON_GET_MAP            5
 #define CEPH_MSG_MON_GET_OSDMAP         6
+#define CEPH_MSG_MON_METADATA           7
 #define CEPH_MSG_STATFS                 13
 #define CEPH_MSG_STATFS_REPLY           14
 #define CEPH_MSG_MON_SUBSCRIBE          15
@@ -244,6 +246,7 @@ struct ceph_mon_subscribe_ack {
 #define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
 #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
 #define CEPH_MDS_STATE_REPLAYONCE   -9 /* up, replaying an active node's journal */
+#define CEPH_MDS_STATE_NULL         -10
 
 #define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
 #define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
@@ -253,6 +256,7 @@ struct ceph_mon_subscribe_ack {
 #define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
 #define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
 #define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
+#define CEPH_MDS_STATE_DAMAGED      15 /* rank not replayable, need repair */
 
 extern const char *ceph_mds_state_name(int s);
 
@@ -341,6 +345,7 @@ enum {
 	CEPH_MDS_OP_MKSNAP     = 0x01400,
 	CEPH_MDS_OP_RMSNAP     = 0x01401,
 	CEPH_MDS_OP_LSSNAP     = 0x00402,
+	CEPH_MDS_OP_RENAMESNAP = 0x01403,
 
 	// internal op
 	CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
@@ -404,6 +409,7 @@ union ceph_mds_request_args {
 	} __attribute__ ((packed)) open;
 	struct {
 		__le32 flags;
+		__le32 osdmap_epoch; 	    /* use for set file/dir layout */
 	} __attribute__ ((packed)) setxattr;
 	struct {
 		struct ceph_file_layout layout;
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
index f27f879..2083093 100644
--- a/src/include/cephfs/libcephfs.h
+++ b/src/include/cephfs/libcephfs.h
@@ -24,7 +24,7 @@
 #include <stdbool.h>
 
 // FreeBSD compatibility
-#ifdef __FreeBSD__
+#if defined(__FreeBSD__) || defined(__APPLE__)
 typedef off_t loff_t;
 typedef off_t off64_t;
 #endif
@@ -33,6 +33,13 @@ typedef off_t off64_t;
 extern "C" {
 #endif
 
+#define LIBCEPHFS_VER_MAJOR 0
+#define LIBCEPHFS_VER_MINOR 94
+#define LIBCEPHFS_VER_EXTRA 0
+
+#define LIBCEPHFS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
+#define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA)
+
 /*
  * On FreeBSD and Apple the offset is 64 bit, but libc doesn't announce it in
  * the way glibc does.
@@ -69,7 +76,7 @@ struct ceph_file_layout {
 } __attribute__ ((packed));
 
 
-typedef struct _inodeno_t {
+typedef struct inodeno_t {
   uint64_t val;
 } inodeno_t;
 
@@ -83,15 +90,17 @@ typedef struct vinodeno_t {
 } vinodeno_t;
 
 typedef struct Fh Fh;
+#else /* _cplusplus */
+
+struct inodeno_t;
+struct vinodeno_t;
+typedef struct vinodeno_t vinodeno;
 
 #endif /* ! __cplusplus */
 
-struct inodeno_t;
 struct Inode;
 typedef struct Inode Inode;
 
-struct vinodeno_t;
-typedef struct vinodeno_t vinodeno;
 struct ceph_mount_info;
 struct ceph_dir_result;
 struct CephContext;
@@ -127,7 +136,7 @@ struct CephContext;
  *
  * @param major where to store the major version number
  * @param minor where to store the minor version number
- * @param extra where to store the extra version number
+ * @param patch where to store the extra version number
  */
 const char *ceph_version(int *major, int *minor, int *patch);
 
@@ -363,7 +372,7 @@ const char* ceph_getcwd(struct ceph_mount_info *cmount);
  * @param path the path to the working directory to change into.
  * @returns 0 on success, negative error code otherwise.
  */
-int ceph_chdir(struct ceph_mount_info *cmount, const char *s);
+int ceph_chdir(struct ceph_mount_info *cmount, const char *path);
 
 /** @} fsops */
 
@@ -655,7 +664,7 @@ int ceph_chown(struct ceph_mount_info *cmount, const char *path, int uid, int gi
  * Change the ownership of a file from an open file descriptor.
  *
  * @param cmount the ceph mount handle to use for performing the chown.
- * @param path the path of the file/directory to change the ownership of.
+ * @param fd the fd of the open file/directory to change the ownership of.
  * @param uid the user id to set on the file/directory.
  * @param gid the group id to set on the file/directory.
  * @returns 0 on success or negative error code on failure.
@@ -684,6 +693,21 @@ int ceph_lchown(struct ceph_mount_info *cmount, const char *path, int uid, int g
 int ceph_utime(struct ceph_mount_info *cmount, const char *path, struct utimbuf *buf);
 
 /**
+ * Apply or remove an advisory lock.
+ *
+ * @param cmount the ceph mount handle to use for performing the lock.
+ * @param fd the open file descriptor to change advisory lock.
+ * @param operation the advisory lock operation to be performed on the file
+ * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock),
+ * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a
+ * non-blocking operation.
+ * @param owner the user-supplied owner identifier (an arbitrary integer)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation,
+	       uint64_t owner);
+
+/**
  * Truncate the file to the given size.  If this operation causes the
  * file to expand, the empty bytes will be filled in with zeros.
  *
@@ -757,8 +781,8 @@ int ceph_close(struct ceph_mount_info *cmount, int fd);
  * @param offset the offset to set the stream to
  * @param whence the flag to indicate what type of seeking to perform:
  *	SEEK_SET: the offset is set to the given offset in the file.
- *      SEEK_CUR: the offset is set to the current location plus @ref offset bytes.
- *      SEEK_END: the offset is set to the end of the file plus @ref offset bytes.
+ *      SEEK_CUR: the offset is set to the current location plus @e offset bytes.
+ *      SEEK_END: the offset is set to the end of the file plus @e offset bytes.
  * @returns 0 on success or a negative error code on failure.
  */
 int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int whence);
@@ -768,7 +792,7 @@ int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int w
  * @param cmount the ceph mount handle to use for performing the read.
  * @param fd the file descriptor of the open file to read from.
  * @param buf the buffer to read data into
- * @param the initial size of the buffer
+ * @param size the initial size of the buffer
  * @param offset the offset in the file to read from.  If this value is negative, the
  *        function reads from the current offset of the file descriptor.
  * @returns the number of bytes read into buf, or a negative error code on failure.
@@ -776,6 +800,19 @@ int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int w
 int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset);
 
 /**
+ * Read data from the file.
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param iov the iov structure to read data into
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to read from.  If this value is negative, the
+ *        function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into buf, or a negative error code on failure.
+ */
+int ceph_preadv(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+           int64_t offset);
+
+/**
  * Write data to a file.
  *
  * @param cmount the ceph mount handle to use for performing the write.
@@ -790,6 +827,20 @@ int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf, int64_t
 	       int64_t offset);
 
 /**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param iov the iov structure to read data into
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset of the file write into.  If this value is negative, the
+ *        function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_pwritev(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+           int64_t offset);
+
+/**
  * Truncate a file to the given size.
  *
  * @param cmount the ceph mount handle to use for performing the ftruncate.
@@ -862,6 +913,19 @@ int ceph_getxattr(struct ceph_mount_info *cmount, const char *path, const char *
 	void *value, size_t size);
 
 /**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param fd the open file descriptor referring to the file to get extended attribute from.
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+	void *value, size_t size);
+
+/**
  * Get an extended attribute wihtout following symbolic links.  This function is
  * identical to ceph_getxattr, but if the path refers to a symbolic link,
  * we get the extended attributes of the symlink rather than the attributes
@@ -889,6 +953,17 @@ int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path, const char
 int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
 
 /**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param fd the open file descriptor referring to the file to list extended attributes on.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size);
+
+/**
  * Get the list of extended attribute keys on a file, but do not follow symbolic links.
  *
  * @param cmount the ceph mount handle to use for performing the llistxattr.
@@ -910,6 +985,16 @@ int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path, char *list
 int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
 
 /**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param fd the open file descriptor referring to the file to remove extended attribute from.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name);
+
+/**
  * Remove the extended attribute from a file, do not follow symbolic links.
  *
  * @param cmount the ceph mount handle to use for performing the lremovexattr.
@@ -936,6 +1021,22 @@ int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *
 	const void *value, size_t size, int flags);
 
 /**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param fd the open file descriptor referring to the file to set extended attribute on.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ *	CEPH_XATTR_CREATE: create the extended attribute.  Must not exist.
+ *      CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+	const void *value, size_t size, int flags);
+
+/**
  * Set an extended attribute on a file, do not follow symbolic links.
  *
  * @param cmount the ceph mount handle to use for performing the lsetxattr.
@@ -1144,7 +1245,7 @@ int ceph_get_pool_replication(struct ceph_mount_info *cmount, int pool_id);
  *	anywhere within the stripe unit.
  * @param addr the address of the OSD holding that stripe
  * @param naddr the capacity of the address passed in.
- * @returns the size of the addressed filled into the @ref addr parameter, or a negative
+ * @returns the size of the addressed filled into the @e addr parameter, or a negative
  *	error code on failure.
  */
 int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t offset,
@@ -1163,7 +1264,7 @@ int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t
  * @returns the number of items stored in the output array, or -ERANGE if the
  * array is not large enough.
  */
-int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fh,
+int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fd,
                               int64_t offset, int64_t *length, int *osds, int nosds);
 
 /**
@@ -1248,7 +1349,7 @@ int ceph_debug_get_fd_caps(struct ceph_mount_info *cmount, int fd);
  * Get the capabilities currently issued to the client.
  *
  * @param cmount the ceph mount handle to use.
- * @param the path to the file
+ * @param path the path to the file
  * @returns the current capabilities issued to this client
  *       for the file
  */
diff --git a/src/include/cmp.h b/src/include/cmp.h
index 93365bf..79372fd 100644
--- a/src/include/cmp.h
+++ b/src/include/cmp.h
@@ -84,6 +84,40 @@
 		      (l.b == r.b && (l.c <= r.c))));			\
   }
 
+#define WRITE_EQ_OPERATORS_4(type, a, b, c, d)				\
+  inline bool operator==(const type &l, const type &r) {		\
+    return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d;	\
+  }									\
+  inline bool operator!=(const type &l, const type &r) {		\
+    return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d;	\
+  }
+
+#define WRITE_CMP_OPERATORS_4(type, a, b, c, d)				\
+  inline bool operator>(const type &l, const type &r) {			\
+    return l.a > r.a ||							\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d > r.d))))));	\
+  }									\
+  inline bool operator<(const type &l, const type &r) {			\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d < r.d))))));	\
+  }									\
+  inline bool operator>=(const type &l, const type &r) {		\
+    return l.a > r.a ||							\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d >= r.d))))));	\
+  }									\
+  inline bool operator<=(const type &l, const type &r) {		\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d <= r.d)))))); \
+  }
+
 
 
 #define WRITE_EQ_OPERATORS_5(type, a, b, c, d, e)			\
diff --git a/src/include/compact_map.h b/src/include/compact_map.h
new file mode 100644
index 0000000..a530c27
--- /dev/null
+++ b/src/include/compact_map.h
@@ -0,0 +1,347 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_MAP_H
+#define CEPH_COMPACT_MAP_H
+
+#include <map>
+
+template <class Key, class T, class Map>
+class compact_map_base {
+protected:
+  Map *map;
+  void alloc_internal() {
+    if (!map)
+      map = new Map;
+  }
+  void free_internal() {
+    if (map) {
+      delete map;
+      map = 0;
+    }
+  }
+  template <class It>
+  class const_iterator_base {
+    const compact_map_base *map;
+    It it;
+    const_iterator_base() : map(0) { }
+    const_iterator_base(const compact_map_base* m) : map(m) { }
+    const_iterator_base(const compact_map_base *m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+    friend class iterator_base;
+  public:
+    const_iterator_base(const const_iterator_base& o) {
+      map = o.map;
+      it = o.it;
+    }
+    bool operator==(const const_iterator_base& o) const {
+      return (map == o.map) && (!map->map || it == o.it);
+    }
+    bool operator!=(const const_iterator_base& o) const {
+      return !(*this == o);
+    }
+    const_iterator_base& operator=(const const_iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    const_iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    const_iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+  };
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_map_base* map;
+    It it;
+    iterator_base() : map(0) { }
+    iterator_base(compact_map_base* m) : map(m) { }
+    iterator_base(compact_map_base* m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+  public:
+    iterator_base(const iterator_base& o) {
+      map = o.map;
+      it = o.it;
+    }
+    bool operator==(const iterator_base& o) const {
+      return (map == o.map) && (!map->map || it == o.it);
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+    operator const_iterator_base<It>() const {
+      return const_iterator_base<It>(map, it);
+    }
+  };
+
+public:
+  class iterator : public iterator_base<typename Map::iterator> {
+    public:
+      iterator() { }
+      iterator(const iterator_base<typename Map::iterator>& o)
+	: iterator_base<typename Map::iterator>(o) { }
+      iterator(compact_map_base* m) : iterator_base<typename Map::iterator>(m) { }
+      iterator(compact_map_base* m, const typename Map::iterator& i)
+	: iterator_base<typename Map::iterator>(m, i) { }
+  };
+  class const_iterator : public const_iterator_base<typename Map::const_iterator> {
+    public:
+      const_iterator() { }
+      const_iterator(const iterator_base<typename Map::const_iterator>& o)
+	: const_iterator_base<typename Map::const_iterator>(o) { }
+      const_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_iterator>(m) { }
+      const_iterator(const compact_map_base* m, const typename Map::const_iterator& i)
+	: const_iterator_base<typename Map::const_iterator>(m, i) { }
+  };
+  class reverse_iterator : public iterator_base<typename Map::reverse_iterator> {
+    public:
+      reverse_iterator() { }
+      reverse_iterator(const iterator_base<typename Map::reverse_iterator>& o)
+	: iterator_base<typename Map::reverse_iterator>(o) { }
+      reverse_iterator(compact_map_base* m) : iterator_base<typename Map::reverse_iterator>(m) { }
+      reverse_iterator(compact_map_base* m, const typename Map::reverse_iterator& i)
+	: iterator_base<typename Map::reverse_iterator>(m, i) { }
+  };
+  class const_reverse_iterator : public const_iterator_base<typename Map::const_reverse_iterator> {
+    public:
+      const_reverse_iterator() { }
+      const_reverse_iterator(const iterator_base<typename Map::const_reverse_iterator>& o)
+	: const_iterator_base<typename Map::const_reverse_iterator>(o) { }
+      const_reverse_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_reverse_iterator>(m) { }
+      const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i)
+	: const_iterator_base<typename Map::const_reverse_iterator>(m, i) { }
+  };
+  compact_map_base() : map(0) {}
+  compact_map_base(const compact_map_base& o) : map(0) {
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    }
+  }
+  ~compact_map_base() { delete map; }
+
+  bool empty() const {
+    return !map || map->empty();
+  }
+  size_t size() const {
+    return map ? map->size() : 0;
+  }
+  bool operator==(const compact_map_base& o) const {
+    return (empty() && o.empty()) || (map && o.map && *map == *o.map);
+  }
+  bool operator!=(const compact_map_base& o) const {
+    return !(*this == o);
+  }
+  size_t count (const Key& k) const {
+    return map ? map->count(k) : 0;
+  }
+  void erase (iterator p) {
+    if (map) {
+      assert(this == p.map);
+      map->erase(p.it);
+      if (map->empty())
+	free_internal();
+    }
+  }
+  size_t erase (const Key& k) {
+    if (!map)
+      return 0;
+    size_t r = map->erase(k);
+    if (map->empty())
+	free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_map_base& o) {
+    Map *tmp = map;
+    map = o.map;
+    o.map = tmp;
+  }
+  compact_map_base& operator=(const compact_map_base& o) {
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    } else
+      free_internal();
+    return *this;
+  }
+  iterator insert(const std::pair<const Key, T>& val) {
+    alloc_internal();
+    return iterator(this, map->insert(val));
+  }
+  iterator begin() {
+   if (!map)
+     return iterator(this);
+   return iterator(this, map->begin());
+  }
+  iterator end() {
+   if (!map)
+     return iterator(this);
+   return iterator(this, map->end());
+  }
+  reverse_iterator rbegin() {
+   if (!map)
+     return reverse_iterator(this);
+   return reverse_iterator(this, map->rbegin());
+  }
+  reverse_iterator rend() {
+   if (!map)
+     return reverse_iterator(this);
+   return reverse_iterator(this, map->rend());
+  }
+  iterator find(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->find(k));
+  }
+  iterator lower_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->lower_bound(k));
+  }
+  iterator upper_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->upper_bound(k));
+  }
+  const_iterator begin() const {
+   if (!map)
+     return const_iterator(this);
+   return const_iterator(this, map->begin());
+  }
+  const_iterator end() const {
+   if (!map)
+     return const_iterator(this);
+   return const_iterator(this, map->end());
+  }
+  const_reverse_iterator rbegin() const {
+   if (!map)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, map->rbegin());
+  }
+  const_reverse_iterator rend() const {
+   if (!map)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, map->rend());
+  }
+  const_iterator find(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->find(k));
+  }
+  const_iterator lower_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->lower_bound(k));
+  }
+  const_iterator upper_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->upper_bound(k));
+  }
+  void encode(bufferlist &bl) const {
+    if (map)
+      ::encode(*map, bl);
+    else
+      ::encode((uint32_t)0, bl);
+  }
+  void decode(bufferlist::iterator& p) {
+    uint32_t n;
+    ::decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      ::decode_nohead(n, *map, p);
+    } else
+      free_internal();
+  }
+};
+
+template<class Key, class T, class Map>
+inline void encode(const compact_map_base<Key, T, Map>& m, bufferlist& bl) {
+  m.encode(bl);
+}
+template<class Key, class T, class Map>
+inline void decode(compact_map_base<Key, T, Map>& m, bufferlist::iterator& p) {
+  m.decode(p);
+}
+
+template <class Key, class T>
+class compact_map : public compact_map_base<Key, T, std::map<Key,T> > {
+public:
+  T& operator[](const Key& k) {
+    this->alloc_internal();
+    return (*(this->map))[k];
+  }
+};
+
+template <class Key, class T>
+inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T>& m)
+{
+  out << "{";
+  for (typename compact_map<Key, T>::const_iterator it = m.begin();
+       it != m.end();
+       ++it) {
+    if (it != m.begin())
+      out << ",";
+    out << it->first << "=" << it->second;
+  }
+  out << "}";
+  return out;
+}
+
+template <class Key, class T>
+class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T> > {
+};
+
+template <class Key, class T>
+inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T>& m)
+{
+  out << "{{";
+  for (typename compact_multimap<Key, T>::const_iterator it = m.begin(); it != m.end(); ++it) {
+    if (it != m.begin())
+      out << ",";
+    out << it->first << "=" << it->second;
+  }
+  out << "}}";
+  return out;
+}
+#endif
diff --git a/src/include/compact_set.h b/src/include/compact_set.h
new file mode 100644
index 0000000..fcd847b
--- /dev/null
+++ b/src/include/compact_set.h
@@ -0,0 +1,290 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_SET_H
+#define CEPH_COMPACT_SET_H
+
+#include <set>
+
+template <class T, class Set>
+class compact_set_base {
+protected:
+  Set *set;
+  void alloc_internal() {
+    if (!set)
+      set = new Set;
+  }
+  void free_internal() {
+    if (set) {
+      delete set;
+      set = 0;
+    }
+  }
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_set_base* set;
+    It it;
+    iterator_base() : set(0) { }
+    iterator_base(const compact_set_base* s) : set(s) { }
+    iterator_base(const compact_set_base* s, const It& i) : set(s), it(i) { }
+    friend class compact_set_base;
+  public:
+    iterator_base(const iterator_base& o) {
+      set = o.set;
+      it = o.it;
+    }
+    bool operator==(const iterator_base& o) const {
+      return (set == o.set) && (!set->set || it == o.it);
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      set = o.set;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const T& operator*() {
+      return *it;
+    }
+  };
+public:
+  class const_iterator : public iterator_base<typename Set::const_iterator> {
+    public:
+      const_iterator() { }
+      const_iterator(const iterator_base<typename Set::const_iterator>& o)
+	: iterator_base<typename Set::const_iterator>(o) { }
+      const_iterator(const compact_set_base* s) : iterator_base<typename Set::const_iterator>(s) { }
+      const_iterator(const compact_set_base* s, const typename Set::const_iterator& i)
+	: iterator_base<typename Set::const_iterator>(s, i) { }
+  };
+  class iterator : public iterator_base<typename Set::iterator> {
+    public:
+      iterator() { }
+      iterator(const iterator_base<typename Set::iterator>& o)
+	: iterator_base<typename Set::iterator>(o) { }
+      iterator(compact_set_base* s) : iterator_base<typename Set::iterator>(s) { }
+      iterator(compact_set_base* s, const typename Set::iterator& i)
+	: iterator_base<typename Set::iterator>(s, i) { }
+      operator const_iterator() const {
+	return const_iterator(this->set, this->it);
+      }
+  };
+  class const_reverse_iterator : public iterator_base<typename Set::const_reverse_iterator> {
+    public:
+      const_reverse_iterator() { }
+      const_reverse_iterator(const iterator_base<typename Set::const_reverse_iterator>& o)
+	: iterator_base<typename Set::const_reverse_iterator>(o) { }
+      const_reverse_iterator(const compact_set_base* s) : iterator_base<typename Set::const_reverse_iterator>(s) { }
+      const_reverse_iterator(const compact_set_base* s, const typename Set::const_reverse_iterator& i)
+	: iterator_base<typename Set::const_reverse_iterator>(s, i) { }
+  };
+  class reverse_iterator : public iterator_base<typename Set::reverse_iterator> {
+    public:
+      reverse_iterator() { }
+      reverse_iterator(const iterator_base<typename Set::reverse_iterator>& o)
+	: iterator_base<typename Set::reverse_iterator>(o) { }
+      reverse_iterator(compact_set_base* s) : iterator_base<typename Set::reverse_iterator>(s) { }
+      reverse_iterator(compact_set_base* s, const typename Set::reverse_iterator& i)
+	: iterator_base<typename Set::reverse_iterator>(s, i) { }
+      operator const_iterator() const {
+	return const_iterator(this->set, this->it);
+      }
+  };
+
+  compact_set_base() : set(0) {}
+  compact_set_base(const compact_set_base& o) : set(0) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    }
+  }
+  ~compact_set_base() { delete set; }
+
+
+  bool empty() const {
+    return !set || set->empty();
+  }
+  size_t size() const {
+    return set ? set->size() : 0;
+  }
+  bool operator==(const compact_set_base& o) const {
+    return (empty() && o.empty()) || (set && o.set && *set == *o.set);
+  }
+  bool operator!=(const compact_set_base& o) const {
+    return !(*this == o);
+  }
+  size_t count(const T& t) const {
+    return set ? set->count(t) : 0;
+  }
+  void erase (iterator p) {
+    if (set) {
+      assert(this == p.set);
+      set->erase(p.it);
+      if (set->empty())
+	free_internal();
+    }
+  }
+  size_t erase (const T& t) {
+    if (!set)
+      return 0;
+    size_t r = set->erase(t);
+    if (set->empty())
+	free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_set_base& o) {
+    Set *tmp = set;
+    set = o.set;
+    o.set = tmp;
+  }
+  compact_set_base& operator=(const compact_set_base& o) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    } else
+      free_internal();
+    return *this;
+  }
+  std::pair<iterator,bool> insert(const T& t) {
+    alloc_internal();
+    std::pair<typename Set::iterator,bool> r = set->insert(t);
+    return std::make_pair(iterator(this, r.first), r.second);
+  }
+  iterator begin() {
+   if (!set)
+     return iterator(this);
+   return iterator(this, set->begin());
+  }
+  iterator end() {
+   if (!set)
+     return iterator(this);
+   return iterator(this, set->end());
+  }
+  reverse_iterator rbegin() {
+   if (!set)
+     return reverse_iterator(this);
+   return reverse_iterator(this, set->rbegin());
+  }
+  reverse_iterator rend() {
+   if (!set)
+     return reverse_iterator(this);
+   return reverse_iterator(this, set->rend());
+  }
+  iterator find(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->find(t));
+  }
+  iterator lower_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->lower_bound(t));
+  }
+  iterator upper_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->upper_bound(t));
+  }
+  const_iterator begin() const {
+   if (!set)
+     return const_iterator(this);
+   return const_iterator(this, set->begin());
+  }
+  const_iterator end() const {
+   if (!set)
+     return const_iterator(this);
+   return const_iterator(this, set->end());
+  }
+  const_reverse_iterator rbegin() const {
+   if (!set)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, set->rbegin());
+  }
+  const_reverse_iterator rend() const {
+   if (!set)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, set->rend());
+  }
+  const_iterator find(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->find(t));
+  }
+  const_iterator lower_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->lower_bound(t));
+  }
+  const_iterator upper_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->upper_bound(t));
+  }
+  void encode(bufferlist &bl) const {
+    if (set)
+      ::encode(*set, bl);
+    else
+      ::encode((uint32_t)0, bl);
+  }
+  void decode(bufferlist::iterator& p) {
+    uint32_t n;
+    ::decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      ::decode_nohead(n, *set, p);
+    } else
+      free_internal();
+  }
+};
+
+template<class T, class Set>
+inline void encode(const compact_set_base<T, Set>& m, bufferlist& bl) {
+  m.encode(bl);
+}
+template<class T, class Set>
+inline void decode(compact_set_base<T, Set>& m, bufferlist::iterator& p) {
+  m.decode(p);
+}
+
+template <class T>
+class compact_set : public compact_set_base<T, std::set<T> > {
+};
+
+template <class T>
+inline std::ostream& operator<<(std::ostream& out, const compact_set<T>& s)
+{
+  for (typename compact_set<T>::const_iterator it = s.begin();
+       it != s.end(); ++it) {
+    if (it != s.begin())
+      out << ",";
+    out << *it;
+  }
+  return out;
+}
+#endif
diff --git a/src/include/compat.h b/src/include/compat.h
index 25d3d76..885b9c1 100644
--- a/src/include/compat.h
+++ b/src/include/compat.h
@@ -13,10 +13,25 @@
 #define CEPH_COMPAT_H
 
 #if defined(__FreeBSD__)
-#define	ENODATA	61
+#define	ENODATA	ENOATTR
 #define	MSG_MORE 0
 #endif /* !__FreeBSD__ */
 
+#if defined(__APPLE__)
+/* PATH_MAX */
+#include <limits.h>
+#endif /* __APPLE__ */
+
+/* O_LARGEFILE is not defined/required on OSX/FreeBSD */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/* Could be relevant for other platforms */
+#ifndef ERESTART
+#define ERESTART EINTR
+#endif
+
 #ifndef TEMP_FAILURE_RETRY
 #define TEMP_FAILURE_RETRY(expression) ({     \
   typeof(expression) __result;                \
diff --git a/src/include/encoding.h b/src/include/encoding.h
index 7b976f2..6fa12f9 100644
--- a/src/include/encoding.h
+++ b/src/include/encoding.h
@@ -164,7 +164,8 @@ inline void encode(const std::string& s, bufferlist& bl, uint64_t features=0)
 {
   __u32 len = s.length();
   encode(len, bl);
-  bl.append(s.data(), len);
+  if (len)
+    bl.append(s.data(), len);
 }
 inline void decode(std::string& s, bufferlist::iterator& p)
 {
@@ -189,7 +190,8 @@ inline void encode(const char *s, bufferlist& bl)
 {
   __u32 len = strlen(s);
   encode(len, bl);
-  bl.append(s, len);
+  if (len)
+    bl.append(s, len);
 }
 
 
@@ -453,6 +455,43 @@ inline void decode(std::set<T>& s, bufferlist::iterator& p)
   }
 }
 
+template<class T, class C>
+inline void encode(const std::set<T, C>& s, bufferlist& bl)
+{
+  __u32 n = (__u32)(s.size());
+  encode(n, bl);
+  for (typename std::set<T, C>::const_iterator p = s.begin(); p != s.end(); ++p)
+    encode(*p, bl);
+}
+template<class T, class C>
+inline void decode(std::set<T, C>& s, bufferlist::iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  while (n--) {
+    T v;
+    decode(v, p);
+    s.insert(v);
+  }
+}
+
+template<class T>
+inline void encode_nohead(const std::set<T>& s, bufferlist& bl)
+{
+  for (typename std::set<T>::const_iterator p = s.begin(); p != s.end(); ++p)
+    encode(*p, bl);
+}
+template<class T>
+inline void decode_nohead(int len, std::set<T>& s, bufferlist::iterator& p)
+{
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    s.insert(v);
+  }
+}
+
 // multiset
 template<class T>
 inline void encode(const std::multiset<T>& s, bufferlist& bl)
@@ -595,6 +634,16 @@ inline void encode(const std::map<T,U>& m, bufferlist& bl)
     encode(p->second, bl);
   }
 }
+template<class T, class U, class C>
+inline void encode(const std::map<T,U,C>& m, bufferlist& bl)
+{
+  __u32 n = (__u32)(m.size());
+  encode(n, bl);
+  for (typename std::map<T,U,C>::const_iterator p = m.begin(); p != m.end(); ++p) {
+    encode(p->first, bl);
+    encode(p->second, bl);
+  }
+}
 template<class T, class U>
 inline void encode(const std::map<T,U>& m, bufferlist& bl, uint64_t features)
 {
@@ -617,6 +666,29 @@ inline void decode(std::map<T,U>& m, bufferlist::iterator& p)
     decode(m[k], p);
   }
 }
+template<class T, class U, class C>
+inline void decode(std::map<T,U,C>& m, bufferlist::iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    decode(m[k], p);
+  }
+}
+template<class T, class U, class C>
+inline void decode_noclear(std::map<T,U,C>& m, bufferlist::iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  while (n--) {
+    T k;
+    decode(k, p);
+    decode(m[k], p);
+  }
+}
 template<class T, class U>
 inline void decode_noclear(std::map<T,U>& m, bufferlist::iterator& p)
 {
@@ -791,13 +863,13 @@ inline void decode(std::deque<T>& ls, bufferlist::iterator& p)
 #define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0)
 
 #define DECODE_ERR_VERSION(func, v)			\
-  "" #func " unknown encoding version > " #v
+  (std::string(func) + " unknown encoding version > " #v)
 
 #define DECODE_ERR_OLDVERSION(func, v)			\
-  "" #func " no longer understand old encoding version < " #v
+  (std::string(func) + " no longer understand old encoding version < " #v)
 
 #define DECODE_ERR_PAST(func) \
-  "" #func " decode past end of struct encoding"
+  (std::string(func) + " decode past end of struct encoding")
 
 /**
  * check for very old encoding
diff --git a/src/include/filepath.h b/src/include/filepath.h
index 6d2128b..f731847 100644
--- a/src/include/filepath.h
+++ b/src/include/filepath.h
@@ -23,7 +23,7 @@
  */
 
 
-#include <iostream>
+#include <iosfwd>
 #include <string>
 #include <vector>
 using namespace std;
diff --git a/src/include/frag.h b/src/include/frag.h
index 60bb0cd..7e0d4a8 100644
--- a/src/include/frag.h
+++ b/src/include/frag.h
@@ -16,12 +16,12 @@
 #define CEPH_FRAG_H
 
 #include <stdint.h>
-#include <map>
 #include <list>
 #include <iostream>
 #include <stdio.h>
 
 #include "buffer.h"
+#include "compact_map.h"
 
 #include "ceph_frag.h"
 #include "include/assert.h"
@@ -177,7 +177,7 @@ class fragtree_t {
   //  frag_t f is split by b bits.
   //  if child frag_t does not appear, it is not split.
 public:
-  std::map<frag_t,int32_t> _splits;  
+  compact_map<frag_t,int32_t> _splits;
 
 public:
   // -------------
@@ -195,7 +195,7 @@ public:
     return _splits.empty();
   }
   int get_split(const frag_t hb) const {
-    std::map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
+    compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
     if (p == _splits.end())
       return 0;
     else
@@ -459,6 +459,22 @@ public:
   void decode(bufferlist::iterator& p) {
     ::decode(_splits, p);
   }
+  void encode_nohead(bufferlist& bl) const {
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+	 p != _splits.end();
+	 ++p) {
+      ::encode(p->first, bl);
+      ::encode(p->second, bl);
+    }
+  }
+  void decode_nohead(int n, bufferlist::iterator& p) {
+    _splits.clear();
+    while (n-- > 0) {
+      frag_t f;
+      ::decode(f, p);
+      ::decode(_splits[f], p);
+    }
+  }
 
   void print(std::ostream& out) {
     out << "fragtree_t(";
@@ -482,6 +498,21 @@ public:
     }
     out << ")";
   }
+
+  void dump(Formatter *f) const {
+    f->open_array_section("splits");
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+         p != _splits.end();
+         ++p) {
+      f->open_object_section("split");
+      std::ostringstream frag_str;
+      frag_str << p->first;
+      f->dump_string("frag", frag_str.str());
+      f->dump_int("children", p->second);
+      f->close_section(); // split
+    }
+    f->close_section(); // splits
+  }
 };
 WRITE_CLASS_ENCODER(fragtree_t)
 
@@ -496,9 +527,9 @@ inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
 {
   out << "fragtree_t(";
   
-  for (std::map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
+  for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
        p != ft._splits.end();
-       p++) {
+       ++p) {
     if (p != ft._splits.begin())
       out << " ";
     out << p->first << "^" << p->second;
diff --git a/src/include/hash_namespace.h b/src/include/hash_namespace.h
deleted file mode 100644
index 59a4dff..0000000
--- a/src/include/hash_namespace.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef CEPH_HASH_NAMESPACE_H
-#define CEPH_HASH_NAMESPACE_H
-
-#include <ciso646>
-
-#ifdef _LIBCPP_VERSION
-
-#include <functional>
-
-#define CEPH_HASH_NAMESPACE_START namespace std {
-#define CEPH_HASH_NAMESPACE_END }
-#define CEPH_HASH_NAMESPACE std
-
-#else
-
-#include <tr1/functional>
-
-#define CEPH_HASH_NAMESPACE_START namespace std { namespace tr1 {
-#define CEPH_HASH_NAMESPACE_END }}
-#define CEPH_HASH_NAMESPACE std::tr1
-
-#endif
-
-#endif
diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h
new file mode 100644
index 0000000..f216682
--- /dev/null
+++ b/src/include/inline_memory.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef CEPH_INLINE_MEMORY_H
+#define CEPH_INLINE_MEMORY_H
+
+#if defined(__GNUC__)
+
+// optimize for the common case, which is very small copies
+static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+				       size_t inline_len)
+  __attribute__((always_inline));
+
+void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+			 size_t inline_len)
+{
+  if (l > inline_len) {
+    return memcpy(dest, src, l);
+  }
+  switch (l) {
+  case 8:
+    return __builtin_memcpy(dest, src, 8);
+  case 4:
+    return __builtin_memcpy(dest, src, 4);
+  case 3:
+    return __builtin_memcpy(dest, src, 3);
+  case 2:
+    return __builtin_memcpy(dest, src, 2);
+  case 1:
+    return __builtin_memcpy(dest, src, 1);
+  default:
+    int cursor = 0;
+    while (l >= sizeof(uint64_t)) {
+      __builtin_memcpy((char*)dest + cursor, (char*)src + cursor,
+		       sizeof(uint64_t));
+      cursor += sizeof(uint64_t);
+      l -= sizeof(uint64_t);
+    }
+    while (l >= sizeof(uint32_t)) {
+      __builtin_memcpy((char*)dest + cursor, (char*)src + cursor,
+		       sizeof(uint32_t));
+      cursor += sizeof(uint32_t);
+      l -= sizeof(uint32_t);
+    }
+    while (l > 0) {
+      *((char*)dest + cursor) = *((char*)src + cursor);
+      cursor++;
+      l--;
+    }
+  }
+  return dest;
+}
+
+#else
+
+#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l)
+
+#endif
+
+
+#if defined(__GNUC__) && defined(__x86_64__)
+
+typedef unsigned uint128_t __attribute__ ((mode (TI)));
+
+static inline bool mem_is_zero(const char *data, size_t len)
+  __attribute__((always_inline));
+
+bool mem_is_zero(const char *data, size_t len)
+{
+  // we do have XMM registers in x86-64, so if we need to check at least
+  // 16 bytes, make use of them
+  if (len / sizeof(uint128_t) > 0) {
+    // align data pointer to 16 bytes, otherwise it'll segfault due to bug
+    // in (at least some) GCC versions (using MOVAPS instead of MOVUPS).
+    // check up to 15 first bytes while at it.
+    while (((unsigned long long)data) & 15) {
+      if (*(uint8_t*)data != 0) {
+	return false;
+      }
+      data += sizeof(uint8_t);
+      --len;
+    }
+
+    const char* data_start = data;
+    const char* max128 = data + (len / sizeof(uint128_t))*sizeof(uint128_t);
+
+    while (data < max128) {
+      if (*(uint128_t*)data != 0) {
+	return false;
+      }
+      data += sizeof(uint128_t);
+    }
+    len -= (data - data_start);
+  }
+
+  const char* max = data + len;
+  const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t);
+  while (data < max32) {
+    if (*(uint32_t*)data != 0) {
+      return false;
+    }
+    data += sizeof(uint32_t);
+  }
+  while (data < max) {
+    if (*(uint8_t*)data != 0) {
+      return false;
+    }
+    data += sizeof(uint8_t);
+  }
+  return true;
+}
+
+#else  // gcc and x86_64
+
+static inline bool mem_is_zero(const char *data, size_t len) {
+  const char *end = data + len;
+  while (data < end) {
+    if (*data != 0) {
+      return false;
+    }
+    ++data;
+  }
+  return true;
+}
+
+#endif  // !x86_64
+
+#endif
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
index 9a7d2f2..00ead9d 100644
--- a/src/include/interval_set.h
+++ b/src/include/interval_set.h
@@ -164,6 +164,10 @@ class interval_set {
     return typename interval_set<T>::iterator(m.begin());
   }
 
+  typename interval_set<T>::iterator lower_bound(T start) {
+    return typename interval_set<T>::iterator(find_inc_m(start));
+  }
+
   typename interval_set<T>::iterator end() {
     return typename interval_set<T>::iterator(m.end());
   }
@@ -172,6 +176,10 @@ class interval_set {
     return typename interval_set<T>::const_iterator(m.begin());
   }
 
+  typename interval_set<T>::const_iterator lower_bound(T start) const {
+    return typename interval_set<T>::const_iterator(find_inc(start));
+  }
+
   typename interval_set<T>::const_iterator end() const {
     return typename interval_set<T>::const_iterator(m.end());
   }
diff --git a/src/include/krbd.h b/src/include/krbd.h
index d7e868c..75206cd 100644
--- a/src/include/krbd.h
+++ b/src/include/krbd.h
@@ -28,6 +28,8 @@ int krbd_map(struct krbd_ctx *ctx, const char *pool, const char *image,
              const char *snap, const char *options, char **pdevnode);
 
 int krbd_unmap(struct krbd_ctx *ctx, const char *devnode);
+int krbd_unmap_by_spec(struct krbd_ctx *ctx, const char *pool,
+                       const char *image, const char *snap);
 
 #ifdef __cplusplus
 }
diff --git a/src/include/memory.h b/src/include/memory.h
index 596627c..409b49f 100644
--- a/src/include/memory.h
+++ b/src/include/memory.h
@@ -1,28 +1,13 @@
 #ifndef CEPH_MEMORY_H
 #define CEPH_MEMORY_H
 
-#include <ciso646>
-
-#ifdef _LIBCPP_VERSION
-
 #include <memory>
 
 namespace ceph {
   using std::shared_ptr;
   using std::weak_ptr;
+  using std::unique_ptr;
   using std::static_pointer_cast;
 }
 
-#else
-
-#include <tr1/memory>
-
-namespace ceph {
-  using std::tr1::shared_ptr;
-  using std::tr1::weak_ptr;
-  using std::tr1::static_pointer_cast;
-}
-
-#endif
-
 #endif
diff --git a/src/include/object.h b/src/include/object.h
index b2a4e85..0f51143 100644
--- a/src/include/object.h
+++ b/src/include/object.h
@@ -18,12 +18,11 @@
 #include <stdint.h>
 #include <stdio.h>
 
-#include <iostream>
+#include <iosfwd>
 #include <iomanip>
 using namespace std;
 
 #include "include/unordered_map.h"
-#include "include/hash_namespace.h"
 
 #include "hash.h"
 #include "encoding.h"
@@ -75,7 +74,7 @@ inline ostream& operator<<(ostream& out, const object_t& o) {
   return out << o.name;
 }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<object_t> {
     size_t operator()(const object_t& r) const { 
       //static hash<string> H;
@@ -83,7 +82,7 @@ CEPH_HASH_NAMESPACE_START
       return ceph_str_hash_linux(r.name.c_str(), r.name.length());
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 struct file_object_t {
@@ -176,7 +175,7 @@ inline bool operator<=(const sobject_t &l, const sobject_t &r) {
 inline ostream& operator<<(ostream& out, const sobject_t &o) {
   return out << o.oid << "/" << o.snap;
 }
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<sobject_t> {
     size_t operator()(const sobject_t &r) const {
       static hash<object_t> H;
@@ -184,6 +183,6 @@ CEPH_HASH_NAMESPACE_START
       return H(r.oid) ^ I(r.snap);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 #endif
diff --git a/src/include/rados.h b/src/include/rados.h
index 3691a2c..59d3225 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -143,6 +143,7 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
 #define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
 #define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
 
 /*
  * The error code to return when an OSD can't handle a write
@@ -165,7 +166,6 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSD_OP_MODE_CACHE 0x8000
 
 #define CEPH_OSD_OP_TYPE       0x0f00
-#define CEPH_OSD_OP_TYPE_LOCK  0x0100
 #define CEPH_OSD_OP_TYPE_DATA  0x0200
 #define CEPH_OSD_OP_TYPE_ATTR  0x0300
 #define CEPH_OSD_OP_TYPE_EXEC  0x0400
@@ -279,14 +279,6 @@ extern const char *ceph_osd_state_name(int s);
 	f(SCRUB_STOP,	__CEPH_OSD_OP1(SUB, 8),		"scrub-stop")	    \
 	f(SCRUB_MAP,	__CEPH_OSD_OP1(SUB, 9),		"scrub-map")	    \
 									    \
-	/** lock **/							    \
-	f(WRLOCK,	__CEPH_OSD_OP(WR, LOCK, 1),	"wrlock")	    \
-	f(WRUNLOCK,	__CEPH_OSD_OP(WR, LOCK, 2),	"wrunlock")	    \
-	f(RDLOCK,	__CEPH_OSD_OP(WR, LOCK, 3),	"rdlock")	    \
-	f(RDUNLOCK,	__CEPH_OSD_OP(WR, LOCK, 4),	"rdunlock")	    \
-	f(UPLOCK,	__CEPH_OSD_OP(WR, LOCK, 5),	"uplock")	    \
-	f(DNLOCK,	__CEPH_OSD_OP(WR, LOCK, 6),	"dnlock")	    \
-									    \
 	/** exec **/							    \
 	/* note: the RD bit here is wrong; see special-case below in helper */ \
 	f(CALL,		__CEPH_OSD_OP(RD, EXEC, 1),	"call")		    \
@@ -305,10 +297,6 @@ __CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
 #undef GENERATE_ENUM_ENTRY
 };
 
-static inline int ceph_osd_op_type_lock(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
-}
 static inline int ceph_osd_op_type_data(int op)
 {
 	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
@@ -410,6 +398,8 @@ enum {
 						      pool uses pool snaps */
 	CEPH_OSD_FLAG_REDIRECTED   = 0x200000,  /* op has been redirected */
 	CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000,  /* redirect bit is authoritative */
+	CEPH_OSD_FLAG_FULL_TRY =    0x800000,  /* try op despite full flag */
+	CEPH_OSD_FLAG_FULL_FORCE = 0x1000000,  /* force op despite full flag */
 };
 
 enum {
@@ -446,6 +436,11 @@ enum {
 	CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
 	CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
 						     * cloneid */
+	CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+};
+
+enum {
+	CEPH_OSD_COPY_GET_FLAG_NOTSUPP_OMAP = 1, /* mean dest pool don't support omap*/
 };
 
 enum {
@@ -515,11 +510,17 @@ struct ceph_osd_op {
 		} __attribute__ ((packed)) clonerange;
 		struct {
 			__le64 max;     /* max data in reply */
+			__le32 flags;
 		} __attribute__ ((packed)) copy_get;
 		struct {
 			__le64 snapid;
 			__le64 src_version;
 			__u8 flags;
+			/*
+			 * __le32 flags: CEPH_OSD_OP_FLAG_FADVISE_: mean the fadvise flags for dest object
+			 * src_fadvise_flags mean the fadvise flags for src object
+			 */
+			__le32 src_fadvise_flags;
 		} __attribute__ ((packed)) copy_from;
 		struct {
 			struct ceph_timespec stamp;
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
index d243d6e..f28bc5e 100644
--- a/src/include/rados/buffer.h
+++ b/src/include/rados/buffer.h
@@ -23,6 +23,7 @@
 #endif
 
 #include <stdio.h>
+#include <sys/uio.h>
 
 #if defined(__linux__)	// For malloc(2).
 #include <malloc.h>
@@ -36,12 +37,13 @@
 # include <sys/mman.h>
 #endif
 
-#include <iostream>
-#include <istream>
+#include <iosfwd>
 #include <iomanip>
 #include <list>
+#include <vector>
 #include <string>
 #include <exception>
+#include <type_traits>
 
 #include "page.h"
 #include "crc32c.h"
@@ -59,12 +61,14 @@
 #endif
 
 #if defined(HAVE_XIO)
-struct xio_mempool_obj;
+struct xio_reg_mem;
 class XioDispatchHook;
 #endif
 
 namespace ceph {
 
+const static int CEPH_BUFFER_APPEND_SIZE(4096);
+
 class CEPH_BUFFER_API buffer {
   /*
    * exceptions
@@ -72,27 +76,19 @@ class CEPH_BUFFER_API buffer {
 
 public:
   struct error : public std::exception{
-    const char *what() const throw () {
-      return "buffer::exception";
-    }
+    const char *what() const throw ();
   };
   struct bad_alloc : public error {
-    const char *what() const throw () {
-      return "buffer::bad_alloc";
-    }
+    const char *what() const throw ();
   };
   struct end_of_buffer : public error {
-    const char *what() const throw () {
-      return "buffer::end_of_buffer";
-    }
+    const char *what() const throw ();
   };
   struct malformed_input : public error {
-    explicit malformed_input(const char *w) {
-      snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", w);
-    }
-    const char *what() const throw () {
-      return buf;
+    explicit malformed_input(const std::string& w) {
+      snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", w.c_str());
     }
+    const char *what() const throw ();
   private:
     char buf[256];
   };
@@ -220,12 +216,7 @@ public:
     unsigned raw_length() const;
     int raw_nref() const;
 
-    void copy_out(unsigned o, unsigned l, char *dest) const {
-      assert(_raw);
-      if (!((o <= _len) && (o+l <= _len)))
-	throw end_of_buffer();
-      memcpy(dest, c_str()+o, l);
-    }
+    void copy_out(unsigned o, unsigned l, char *dest) const;
 
     bool can_zero_copy() const;
     int zero_copy_to_fd(int fd, int64_t *offset) const;
@@ -236,14 +227,23 @@ public:
     bool is_zero() const;
 
     // modifiers
-    void set_offset(unsigned o) { _off = o; }
-    void set_length(unsigned l) { _len = l; }
+    void set_offset(unsigned o) {
+      assert(raw_length() >= o);
+      _off = o;
+    }
+    void set_length(unsigned l) {
+      assert(raw_length() >= l);
+      _len = l;
+    }
 
-    void append(char c);
-    void append(const char *p, unsigned l);
+    unsigned append(char c);
+    unsigned append(const char *p, unsigned l);
     void copy_in(unsigned o, unsigned l, const char *src);
+    void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset);
     void zero();
+    void zero(bool crc_reset);
     void zero(unsigned o, unsigned l);
+    void zero(unsigned o, unsigned l, bool crc_reset);
 
   };
 
@@ -260,60 +260,55 @@ public:
     unsigned _memcopy_count; //the total of memcopy using rebuild().
     ptr append_buffer;  // where i put small appends.
 
-  public:
-    class CEPH_BUFFER_API iterator {
-      list *bl;
-      std::list<ptr> *ls; // meh.. just here to avoid an extra pointer dereference..
-      unsigned off;  // in bl
-      std::list<ptr>::iterator p;
-      unsigned p_off; // in *p
+    template <bool is_const>
+    class iterator_impl: public std::iterator<std::forward_iterator_tag, char> {
+    protected:
+      typedef typename std::conditional<is_const,
+					const list,
+					list>::type bl_t;
+      typedef typename std::conditional<is_const,
+					const std::list<ptr>,
+					std::list<ptr> >::type list_t;
+      typedef typename std::conditional<is_const,
+					typename std::list<ptr>::const_iterator,
+					typename std::list<ptr>::iterator>::type list_iter_t;
+      bl_t* bl;
+      list_t* ls;  // meh.. just here to avoid an extra pointer dereference..
+      unsigned off; // in bl
+      list_iter_t p;
+      unsigned p_off;   // in *p
+
     public:
       // constructor.  position.
-      iterator() :
-	bl(0), ls(0), off(0), p_off(0) {}
-      iterator(list *l, unsigned o=0) : 
-	bl(l), ls(&bl->_buffers), off(0), p(ls->begin()), p_off(0) {
+      iterator_impl()
+	: bl(0), ls(0), off(0), p_off(0) {}
+      iterator_impl(bl_t *l, unsigned o=0)
+	: bl(l), ls(&bl->_buffers), off(0), p(ls->begin()), p_off(0) {
 	advance(o);
       }
-      iterator(list *l, unsigned o, std::list<ptr>::iterator ip, unsigned po) : 
-	bl(l), ls(&bl->_buffers), off(o), p(ip), p_off(po) { }
-
-      iterator(const iterator& other) : bl(other.bl),
-					ls(other.ls),
-					off(other.off),
-					p(other.p),
-					p_off(other.p_off) {}
-
-      iterator& operator=(const iterator& other) {
-	if (this != &other) {
-	  bl = other.bl;
-	  ls = other.ls;
-	  off = other.off;
-	  p = other.p;
-	  p_off = other.p_off;
-	}
-	return *this;
-      }
+      iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+	: bl(l), ls(&bl->_buffers), off(o), p(ip), p_off(po) {}
 
       /// get current iterator offset in buffer::list
-      unsigned get_off() { return off; }
+      unsigned get_off() const { return off; }
       
       /// get number of bytes remaining from iterator position to the end of the buffer::list
-      unsigned get_remaining() { return bl->length() - off; }
+      unsigned get_remaining() const { return bl->length() - off; }
 
       /// true if iterator is at the end of the buffer::list
-      bool end() {
+      bool end() const {
 	return p == ls->end();
 	//return off == bl->length();
       }
 
       void advance(int o);
       void seek(unsigned o);
-      char operator*();
-      iterator& operator++();
-      ptr get_current_ptr();
+      bool operator!=(const iterator_impl& rhs) const;
+      char operator*() const;
+      iterator_impl& operator++();
+      ptr get_current_ptr() const;
 
-      list& get_bl() { return *bl; }
+      bl_t& get_bl() { return *bl; }
 
       // copy data out.
       // note that these all _append_ to dest!
@@ -322,11 +317,36 @@ public:
       void copy(unsigned len, list &dest);
       void copy(unsigned len, std::string &dest);
       void copy_all(list &dest);
+    };
+
+  public:
+    typedef iterator_impl<true> const_iterator;
+
+    class CEPH_BUFFER_API iterator : public iterator_impl<false> {
+    public:
+      iterator(): iterator_impl() {}
+      iterator(bl_t *l, unsigned o=0) :
+	iterator_impl(l, o) {}
+      iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po) :
+	iterator_impl(l, o, ip, po) {}
+
+      void advance(int o);
+      void seek(unsigned o);
+      char operator*();
+      iterator& operator++();
+      ptr get_current_ptr();
+
+      // copy data out
+      void copy(unsigned len, char *dest);
+      void copy(unsigned len, ptr &dest);
+      void copy(unsigned len, list &dest);
+      void copy(unsigned len, std::string &dest);
+      void copy_all(list &dest);
 
       // copy data in
       void copy_in(unsigned len, const char *src);
+      void copy_in(unsigned len, const char *src, bool crc_reset);
       void copy_in(unsigned len, const list& otherl);
-
     };
 
   private:
@@ -345,6 +365,7 @@ public:
 			      _memcopy_count(other._memcopy_count), last_p(this) {
       make_shareable();
     }
+    list(list&& other);
     list& operator= (const list& other) {
       if (this != &other) {
         _buffers = other._buffers;
@@ -370,7 +391,9 @@ public:
 #endif
       return _len;
     }
+
     bool contents_equal(buffer::list& other);
+    bool contents_equal(const buffer::list& other) const;
 
     bool can_zero_copy() const;
     bool is_aligned(unsigned align) const;
@@ -454,12 +477,20 @@ public:
       return iterator(this, _len, _buffers.end(), 0);
     }
 
+    const_iterator begin() const {
+      return const_iterator(this, 0);
+    }
+    const_iterator end() const {
+      return const_iterator(this, _len, _buffers.end(), 0);
+    }
+
     // crope lookalikes.
     // **** WARNING: this are horribly inefficient for large bufferlists. ****
     void copy(unsigned off, unsigned len, char *dest) const;
     void copy(unsigned off, unsigned len, list &dest) const;
     void copy(unsigned off, unsigned len, std::string& dest) const;
     void copy_in(unsigned off, unsigned len, const char *src);
+    void copy_in(unsigned off, unsigned len, const char *src, bool crc_reset);
     void copy_in(unsigned off, unsigned len, const list& src);
 
     void append(char c);
@@ -500,7 +531,9 @@ public:
     int write_file(const char *fn, int mode=0644);
     int write_fd(int fd) const;
     int write_fd_zero_copy(int fd) const;
+    void prepare_iov(std::vector<iovec> *piov) const;
     uint32_t crc32c(uint32_t crc) const;
+	void invalidate_crc();
   };
 
   /*
@@ -525,7 +558,7 @@ public:
 };
 
 #if defined(HAVE_XIO)
-xio_mempool_obj* get_xio_mp(const buffer::ptr& bp);
+xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
 #endif
 
 typedef buffer::ptr bufferptr;
@@ -568,41 +601,17 @@ inline bool operator<=(bufferlist& l, bufferlist& r) {
 }
 
 
-inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
-  if (bp.have_raw())
-    out << "buffer::ptr(" << bp.offset() << "~" << bp.length()
-	<< " " << (void*)bp.c_str() 
-	<< " in raw " << (void*)bp.raw_c_str()
-	<< " len " << bp.raw_length()
-	<< " nref " << bp.raw_nref() << ")";
-  else
-    out << "buffer:ptr(" << bp.offset() << "~" << bp.length() << " no raw)";
-  return out;
-}
+std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
 
-inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
-  out << "buffer::list(len=" << bl.length() << "," << std::endl;
 
-  std::list<buffer::ptr>::const_iterator it = bl.buffers().begin();
-  while (it != bl.buffers().end()) {
-    out << "\t" << *it;
-    if (++it == bl.buffers().end()) break;
-    out << "," << std::endl;
-  }
-  out << std::endl << ")";
-  return out;
-}
+std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
 
-inline std::ostream& operator<<(std::ostream& out, buffer::error& e)
-{
-  return out << e.what();
-}
+std::ostream& operator<<(std::ostream& out, const buffer::error& e);
 
 inline bufferhash& operator<<(bufferhash& l, bufferlist &r) {
   l.update(r);
   return l;
 }
-
 }
 
 #endif
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index 5c56191..3aebf20 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -64,7 +64,6 @@ extern "C" {
  * Flags that can be set on a per-op basis via
  * rados_read_op_set_flags() and rados_write_op_set_flags().
  */
-/** @cond TODO_enums_not_yet_in_asphyxiate */
 enum {
   // fail a create operation if the object already exists
   LIBRADOS_OP_FLAG_EXCL               =  0x1,
@@ -81,7 +80,6 @@ enum {
   // indicate read/write data will not accessed again (by *this* client)
   LIBRADOS_OP_FLAG_FADVISE_NOCACHE    = 0x40,
 };
-/** @endcond */
 
 #if __GNUC__ >= 4
   #define CEPH_RADOS_API  __attribute__ ((visibility ("default")))
@@ -90,14 +88,13 @@ enum {
 #endif
 
 /**
- * @defgroup librados_h_xattr_comp xattr comparison operations
+ * @name xattr comparison operations
  * Operators for comparing xattrs on objects, and aborting the
  * rados_read_op or rados_write_op transaction if the comparison
  * fails.
  *
  * @{
  */
-/** @cond TODO_enums_not_yet_in_asphyxiate */
 enum {
 	LIBRADOS_CMPXATTR_OP_EQ  = 1,
 	LIBRADOS_CMPXATTR_OP_NE  = 2,
@@ -106,17 +103,15 @@ enum {
 	LIBRADOS_CMPXATTR_OP_LT  = 5,
 	LIBRADOS_CMPXATTR_OP_LTE = 6
 };
-/** @endcond */
 /** @} */
 
 /**
- * @defgroup librados_h_operation_flags
+ * @name Operation Flags
  * Flags for rados_read_op_opeprate(), rados_write_op_operate(),
  * rados_aio_read_op_operate(), and rados_aio_write_op_operate().
  * See librados.hpp for details.
  * @{
  */
-/** @cond TODO_enums_not_yet_in_asphyxiate */
 enum {
   LIBRADOS_OPERATION_NOFLAG             = 0,
   LIBRADOS_OPERATION_BALANCE_READS      = 1,
@@ -125,8 +120,11 @@ enum {
   LIBRADOS_OPERATION_IGNORE_CACHE       = 8,
   LIBRADOS_OPERATION_SKIPRWLOCKS        = 16,
   LIBRADOS_OPERATION_IGNORE_OVERLAY     = 32,
+  /* send requests to cluster despite the cluster or pool being marked
+     full; ops will either succeed (e.g., delete) or return EDQUOT or
+     ENOSPC. */
+  LIBRADOS_OPERATION_FULL_TRY           = 64,
 };
-/** @endcond */
 /** @} */
 
 /*
@@ -312,7 +310,7 @@ typedef void *rados_read_op_t;
 CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra);
 
 /**
- * @defgroup librados_h_init Setup and Teardown
+ * @name Setup and Teardown
  * These are the first and last functions to that should be called
  * when using librados.
  *
@@ -350,7 +348,7 @@ CEPH_RADOS_API int rados_create2(rados_t *pcluster,
  * Share configuration state with another rados_t instance.
  *
  * @param cluster where to store the handle
- * @param cct_ the existing configuration to use
+ * @param cct the existing configuration to use
  * @returns 0 on success, negative error code on failure
  */
 CEPH_RADOS_API int rados_create_with_context(rados_t *cluster,
@@ -413,7 +411,7 @@ CEPH_RADOS_API void rados_shutdown(rados_t cluster);
 /** @} init */
 
 /**
- * @defgroup librados_h_config Configuration
+ * @name Configuration
  * These functions read and update Ceph configuration for a cluster
  * handle. Any configuration changes must be done before connecting to
  * the cluster.
@@ -569,7 +567,7 @@ CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len);
 CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
 
 /**
- * @defgroup librados_h_pools Pools
+ * @name Pools
  *
  * RADOS pools are separate namespaces for objects. Pools may have
  * different crush rules associated with them, so they could have
@@ -625,7 +623,6 @@ CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster);
  *
  * @param cluster which cluster the pool is in
  * @param pool_name name of the pool
- * @param pool_id unique id of the pool
  * @param ioctx where to store the io context
  * @returns 0 on success, negative error code on failure
  */
@@ -837,7 +834,7 @@ CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf,
 /** @} pools */
 
 /**
- * @defgroup librados_h_obj_loc Object Locators
+ * @name Object Locators
  *
  * @{
  */
@@ -876,7 +873,7 @@ CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io,
 /** @} obj_loc */
 
 /**
- * @defgroup librados_h_list_nobj New Listing Objects
+ * @name New Listing Objects
  * @{
  */
 /**
@@ -937,7 +934,7 @@ CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx);
 /** @} New Listing Objects */
 
 /**
- * @defgroup librados_h_list_obj Deprecated Listing Objects
+ * @name Deprecated Listing Objects
  *
  * Older listing objects interface.  Please use the new interface.
  * @{
@@ -974,7 +971,7 @@ CEPH_RADOS_API void rados_objects_list_close(rados_list_ctx_t ctx);
 /** @} Listing Objects */
 
 /**
- * @defgroup librados_h_snaps Snapshots
+ * @name Snapshots
  *
  * RADOS snapshots are based upon sequence numbers that form a
  * snapshot context. They are pool-specific. The snapshot context
@@ -1169,7 +1166,7 @@ CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id,
 /** @} Snapshots */
 
 /**
- * @defgroup librados_h_synch_io Synchronous I/O
+ * @name Synchronous I/O
  * Writes are replicated to a number of OSDs based on the
  * configuration of the pool they are in. These write functions block
  * until data is in memory on all replicas of the object they're
@@ -1301,7 +1298,7 @@ CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid,
                                uint64_t size);
 
 /**
- * @defgroup librados_h_xattrs Xattrs
+ * @name Xattrs
  * Extended attributes are stored as extended attributes on the files
  * representing an object on the OSDs. Thus, they have the same
  * limitations as the underlying filesystem. On ext4, this means that
@@ -1557,7 +1554,7 @@ CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
 /** @} Synchronous I/O */
 
 /**
- * @defgroup librados_h_asynch_io Asynchronous I/O
+ * @name Asynchronous I/O
  * Read and write to objects without blocking.
  *
  * @{
@@ -1873,7 +1870,7 @@ CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io,
 /** @} Asynchronous I/O */
 
 /**
- * @defgroup librados_h_watch_notify Watch/Notify
+ * @name Watch/Notify
  *
  * Watch/notify is a protocol to help communicate among clients. It
  * can be used to sychronize client state. All that's needed is a
@@ -1986,7 +1983,7 @@ CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver,
  * @param io the pool the object is in
  * @param o the object to watch
  * @param cookie where to store the internal id assigned to this watch
- * @param watchcb2 what to do when a notify is received on this object
+ * @param watchcb what to do when a notify is received on this object
  * @param watcherrcb what to do when the watch session encounters an error
  * @param arg opaque value to pass to the callback
  * @returns 0 on success, negative error code on failure
@@ -2140,7 +2137,7 @@ CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
 /** @} Watch/Notify */
 
 /**
- * @defgroup librados_h_hints Hints
+ * @name Hints
  *
  * @{
  */
@@ -2165,7 +2162,7 @@ CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o,
 /** @} Hints */
 
 /**
- * @defgroup librados_h_obj_op Object Operations
+ * @name Object Operations
  *
  * A single rados operation can do multiple operations on one object
  * atomicly. The whole operation will suceed or fail, and no partial
@@ -2332,7 +2329,7 @@ CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op);
 /**
  * Truncate an object
  * @param write_op operation to add this action to
- * @offset Offset to truncate to
+ * @param offset Offset to truncate to
  */
 CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op,
                                             uint64_t offset);
@@ -2340,8 +2337,8 @@ CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op,
 /**
  * Zero part of an object
  * @param write_op operation to add this action to
- * @offset Offset to zero
- * @len length to zero
+ * @param offset Offset to zero
+ * @param len length to zero
  */
 CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op,
 			                uint64_t offset,
@@ -2480,7 +2477,7 @@ CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
  * @param read_op operation to add this action to
  * @param ver object version number
  */
-CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t write_op, uint64_t ver);
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
 
 /**
  * Ensure that the an xattr satisfies a comparison
@@ -2551,15 +2548,15 @@ CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
  *
  * @param read_op operation to add this action to
  * @param offset offset to read from
- * @param buffer where to put the data
  * @param len length of buffer
- * @param prval where to store the return value of this action
+ * @param buffer where to put the data
  * @param bytes_read where to store the number of bytes read by this action
+ * @param prval where to store the return value of this action
  */
 CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op,
 			               uint64_t offset,
 			               size_t len,
-			               char *buf,
+			               char *buffer,
 			               size_t *bytes_read,
 			               int *prval);
 
@@ -2625,7 +2622,7 @@ CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op,
  * @param read_op operation to add this action to
  * @param start_after list keys starting after start_after
  * @param filter_prefix list only keys beginning with filter_prefix
- * @parem max_return list no more than max_return key/value pairs
+ * @param max_return list no more than max_return key/value pairs
  * @param iter where to store the iterator
  * @param prval where to store the return value from this action
  */
@@ -2644,7 +2641,7 @@ CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op,
  *
  * @param read_op operation to add this action to
  * @param start_after list keys starting after start_after
- * @parem max_return list no more than max_return keys
+ * @param max_return list no more than max_return keys
  * @param iter where to store the iterator
  * @param prval where to store the return value from this action
  */
@@ -2674,9 +2671,9 @@ CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op,
 /**
  * Perform a read operation synchronously
  * @param read_op operation to perform
- * @io the ioctx that the object is in
- * @oid the object id
- * @flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
  */
 CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op,
 			                 rados_ioctx_t io,
@@ -2686,10 +2683,10 @@ CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op,
 /**
  * Perform a read operation asynchronously
  * @param read_op operation to perform
- * @io the ioctx that the object is in
+ * @param io the ioctx that the object is in
  * @param completion what to do when operation has been attempted
- * @oid the object id
- * @flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
  */
 CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op,
 			                     rados_ioctx_t io,
@@ -2713,7 +2710,7 @@ CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op,
  * @returns -EBUSY if the lock is already held by another (client, cookie) pair
  * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
  */
-CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * o,
+CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid,
                                         const char * name, const char * cookie,
                                         const char * desc,
                                         struct timeval * duration,
@@ -2812,7 +2809,7 @@ CEPH_RADOS_API int rados_blacklist_add(rados_t cluster,
 				       uint32_t expire_seconds);
 
 /**
- * @defgroup librados_h_commands Mon/OSD/PG Commands
+ * @name Mon/OSD/PG Commands
  *
  * These interfaces send commands relating to the monitor, OSD, or PGs.
  *
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index e7ea598..b92a94f 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -30,8 +30,8 @@ namespace librados
   struct ObjListCtx;
   struct PoolAsyncCompletionImpl;
   class RadosClient;
-  class ListObjectImpl;
-  struct NObjectIteratorImpl;
+  struct ListObjectImpl;
+  class NObjectIteratorImpl;
 
   typedef void *list_ctx_t;
   typedef uint64_t auid_t;
@@ -109,6 +109,12 @@ namespace librados
     /// move the iterator to a given hash position.  this may (will!) be rounded to the nearest pg.
     uint32_t seek(uint32_t pos);
 
+    /**
+     * Configure PGLS filter to be applied OSD-side (requires caller
+     * to know/understand the format expected by the OSD)
+     */
+    void set_filter(const bufferlist &bl);
+
   private:
     NObjectIterator(ObjListCtx *ctx_);
     void get_next();
@@ -252,13 +258,17 @@ namespace librados
    * for CACHE_FLUSH and CACHE_EVICT operations.
    */
   enum ObjectOperationGlobalFlags {
-    OPERATION_NOFLAG         = 0,
-    OPERATION_BALANCE_READS  = 1,
-    OPERATION_LOCALIZE_READS = 2,
-    OPERATION_ORDER_READS_WRITES = 4,
-    OPERATION_IGNORE_CACHE = 8,
-    OPERATION_SKIPRWLOCKS = 16,
-    OPERATION_IGNORE_OVERLAY = 32,
+    OPERATION_NOFLAG             = LIBRADOS_OPERATION_NOFLAG,
+    OPERATION_BALANCE_READS      = LIBRADOS_OPERATION_BALANCE_READS,
+    OPERATION_LOCALIZE_READS     = LIBRADOS_OPERATION_LOCALIZE_READS,
+    OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES,
+    OPERATION_IGNORE_CACHE       = LIBRADOS_OPERATION_IGNORE_CACHE,
+    OPERATION_SKIPRWLOCKS        = LIBRADOS_OPERATION_SKIPRWLOCKS,
+    OPERATION_IGNORE_OVERLAY     = LIBRADOS_OPERATION_IGNORE_OVERLAY,
+    // send requests to cluster despite the cluster or pool being
+    // marked full; ops will either succeed (e.g., delete) or return
+    // EDQUOT or ENOSPC
+    OPERATION_FULL_TRY           = LIBRADOS_OPERATION_FULL_TRY,
   };
 
   /*
@@ -408,10 +418,13 @@ namespace librados
      *
      * @param src source object name
      * @param src_ioctx ioctx for the source object
-     * @param version current version of the source object
+     * @param src_version current version of the source object
+     * @param src_fadvise_flags the fadvise flags for source object
      */
     void copy_from(const std::string& src, const IoCtx& src_ioctx,
 		   uint64_t src_version);
+    void copy_from2(const std::string& src, const IoCtx& src_ioctx,
+                    uint64_t src_version, uint32_t src_fadvise_flags);
 
     /**
      * undirty an object
@@ -460,7 +473,7 @@ namespace librados
      * Get up to max_return keys and values beginning after start_after
      *
      * @param start_after [in] list no keys smaller than start_after
-     * @parem max_return [in] list no more than max_return key/value pairs
+     * @param max_return [in] list no more than max_return key/value pairs
      * @param out_vals [out] place returned values in out_vals on completion
      * @param prval [out] place error code in prval upon completion
      */
@@ -477,7 +490,7 @@ namespace librados
      *
      * @param start_after [in] list keys starting after start_after
      * @param filter_prefix [in] list only keys beginning with filter_prefix
-     * @parem max_return [in] list no more than max_return key/value pairs
+     * @param max_return [in] list no more than max_return key/value pairs
      * @param out_vals [out] place returned values in out_vals on completion
      * @param prval [out] place error code in prval upon completion
      */
@@ -495,7 +508,7 @@ namespace librados
      * Get up to max_return keys beginning after start_after
      *
      * @param start_after [in] list keys starting after start_after
-     * @parem max_return [in] list no more than max_return keys
+     * @param max_return [in] list no more than max_return keys
      * @param out_keys [out] place returned values in out_keys on completion
      * @param prval [out] place error code in prval upon completion
      */
@@ -515,8 +528,8 @@ namespace librados
     /**
      * get key/value pairs for specified keys
      *
-     * @param to_get [in] keys to get
-     * @param out_vals [out] place key/value pairs found here on completion
+     * @param keys [in] keys to get
+     * @param map [out] place key/value pairs found here on completion
      * @param prval [out] place error code in prval upon completion
      */
     void omap_get_vals_by_keys(const std::set<std::string> &keys,
@@ -550,7 +563,7 @@ namespace librados
     /**
      * query dirty state of an object
      *
-     * @param out_dirty [out] pointer to resulting bool
+     * @param isdirty [out] pointer to resulting bool
      * @param prval [out] place error code in prval upon completion
      */
     void is_dirty(bool *isdirty, int *prval);
@@ -765,8 +778,11 @@ namespace librados
 
     /// Start enumerating objects for a pool
     NObjectIterator nobjects_begin();
+    NObjectIterator nobjects_begin(const bufferlist &filter);
     /// Start enumerating objects for a pool starting from a hash position
     NObjectIterator nobjects_begin(uint32_t start_hash_position);
+    NObjectIterator nobjects_begin(uint32_t start_hash_position,
+                                   const bufferlist &filter);
     /// Iterator indicating the end of a pool
     const NObjectIterator& nobjects_end() const;
 
@@ -790,7 +806,7 @@ namespace librados
     /**
      * Retrieve hit set for a given hash, and time
      *
-     * @param uint32_t [in] hash position
+     * @param hash [in] hash position
      * @param c [in] completion
      * @param stamp [in] time interval that falls within the hit set's interval
      * @param pbl [out] buffer to store the result in
@@ -864,9 +880,8 @@ namespace librados
      * The return value of the completion will be 0 on success, negative
      * error code on failure.
      *
-     * @param io the context to operate in
      * @param oid the name of the object
-     * @param completion what to do when the remove is safe and complete
+     * @param c what to do when the remove is safe and complete
      * @returns 0 on success, -EROFS if the io context specifies a snap_seq
      * other than SNAP_HEAD
      */
diff --git a/src/include/rados/memory.h b/src/include/rados/memory.h
index 596627c..409b49f 100644
--- a/src/include/rados/memory.h
+++ b/src/include/rados/memory.h
@@ -1,28 +1,13 @@
 #ifndef CEPH_MEMORY_H
 #define CEPH_MEMORY_H
 
-#include <ciso646>
-
-#ifdef _LIBCPP_VERSION
-
 #include <memory>
 
 namespace ceph {
   using std::shared_ptr;
   using std::weak_ptr;
+  using std::unique_ptr;
   using std::static_pointer_cast;
 }
 
-#else
-
-#include <tr1/memory>
-
-namespace ceph {
-  using std::tr1::shared_ptr;
-  using std::tr1::weak_ptr;
-  using std::tr1::static_pointer_cast;
-}
-
-#endif
-
 #endif
diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h
index 972d210..70595c9 100644
--- a/src/include/radosstriper/libradosstriper.h
+++ b/src/include/radosstriper/libradosstriper.h
@@ -57,7 +57,7 @@ void rados_striper_destroy(rados_striper_t striper);
  * Already existing objects will be opened with their own layout.
  *
  * @param striper the targetted striper
- * @param stiper_unit the stripe_unit value of the new object layout
+ * @param stripe_unit the stripe_unit value of the new object layout
  * @returns 0 on success, negative error code on failure
  */
 int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper,
@@ -202,7 +202,7 @@ int rados_striper_remove(rados_striper_t striper,
  * @note the truncation can not happen if any I/O is ongoing (it
  * will return EBUSY). Identically, no I/O will be able to start
  * during truncation (same EBUSY return code)
- * @param striper the striper in which the truncation will occur
+ * @param io the rados context to use
  * @param soid the name of the striped object
  * @param size the new size of the object in bytes
  * @returns 0 on success, negative error code on failure
@@ -225,7 +225,7 @@ int rados_striper_trunc(rados_ioctx_t io, const char *soid, uint64_t size);
  * Get the value of an extended attribute on a striped object.
  *
  * @param striper the striper in which the getxattr will occur
- * @param o name of the striped object
+ * @param oid name of the striped object
  * @param name which extended attribute to read
  * @param buf where to store the result
  * @param len size of buf in bytes
@@ -241,7 +241,7 @@ int rados_striper_getxattr(rados_striper_t striper,
  * Set an extended attribute on a striped object.
  *
  * @param striper the striper in which the setxattr will occur
- * @param o name of the object
+ * @param oid name of the object
  * @param name which extended attribute to set
  * @param buf what to store in the xattr
  * @param len the number of bytes in buf
@@ -257,7 +257,7 @@ int rados_striper_setxattr(rados_striper_t striper,
  * Delete an extended attribute from a striped object.
  *
  * @param striper the striper in which the rmxattr will occur
- * @param o the name of the object
+ * @param oid name of the object
  * @param name which xattr to delete
  * @returns 0 on success, negative error code on failure
  */
diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp
index 2da201c..05179ff 100644
--- a/src/include/radosstriper/libradosstriper.hpp
+++ b/src/include/radosstriper/libradosstriper.hpp
@@ -142,25 +142,25 @@ namespace libradosstriper
 
     /**
      * synchronously append data to the striped object
-     * NOTE: this call steals the contents of @param bl.
+     * NOTE: this call steals the contents of @p bl.
      */
     int append(const std::string& soid, const ceph::bufferlist& bl, size_t len);
 
     /**
      * asynchronously write to the striped object at the specified offset.
-     * NOTE: this call steals the contents of @param bl.
+     * NOTE: this call steals the contents of @p bl.
      */
     int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off);
 
     /**
      * asynchronously fill the striped object with the specified data
-     * NOTE: this call steals the contents of @param bl.
+     * NOTE: this call steals the contents of @p bl.
      */
     int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl);
 
     /**
      * asynchronously append data to the striped object
-     * NOTE: this call steals the contents of @param bl.
+     * NOTE: this call steals the contents of @p bl.
      */
     int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len);
 
diff --git a/src/include/rangeset.h b/src/include/rangeset.h
index 9eb0b70..547af26 100644
--- a/src/include/rangeset.h
+++ b/src/include/rangeset.h
@@ -23,7 +23,6 @@
  */
 
 #include <map>
-#include <iostream>
 using namespace std;
 
 //typedef int T;
diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h
index 1d17067..ac7b558 100644
--- a/src/include/rbd/features.h
+++ b/src/include/rbd/features.h
@@ -5,17 +5,37 @@
 #define RBD_FEATURE_STRIPINGV2		(1<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1<<2)
 #define RBD_FEATURE_OBJECT_MAP		(1<<3)
+#define RBD_FEATURE_FAST_DIFF           (1<<4)
+#define RBD_FEATURE_DEEP_FLATTEN        (1<<5)
 
+/// features that make an image inaccessible for read or write by
+/// clients that don't understand them
 #define RBD_FEATURES_INCOMPATIBLE 	(RBD_FEATURE_LAYERING |       \
 					 RBD_FEATURE_STRIPINGV2)
 
-#define RBD_FEATURES_RW_INCOMPATIBLE	(RBD_FEATURES_INCOMPATIBLE |  \
+/// features that make an image unwritable by clients that don't understand them
+#define RBD_FEATURES_RW_INCOMPATIBLE	(RBD_FEATURES_INCOMPATIBLE  | \
 					 RBD_FEATURE_EXCLUSIVE_LOCK | \
-					 RBD_FEATURE_OBJECT_MAP)
+					 RBD_FEATURE_OBJECT_MAP     | \
+                                         RBD_FEATURE_FAST_DIFF      | \
+                                         RBD_FEATURE_DEEP_FLATTEN)
 
-#define RBD_FEATURES_ALL          	(RBD_FEATURE_LAYERING |       \
-					 RBD_FEATURE_STRIPINGV2 |     \
+#define RBD_FEATURES_ALL          	(RBD_FEATURE_LAYERING       | \
+					 RBD_FEATURE_STRIPINGV2     | \
                                    	 RBD_FEATURE_EXCLUSIVE_LOCK | \
-                                         RBD_FEATURE_OBJECT_MAP)
+                                         RBD_FEATURE_OBJECT_MAP     | \
+                                         RBD_FEATURE_FAST_DIFF      | \
+                                         RBD_FEATURE_DEEP_FLATTEN)
+
+/// features that may be dynamically enabled or disabled
+#define RBD_FEATURES_MUTABLE            (RBD_FEATURE_EXCLUSIVE_LOCK | \
+                                         RBD_FEATURE_OBJECT_MAP     | \
+                                         RBD_FEATURE_FAST_DIFF)
+
+/// features that only work when used with a single client
+/// using the image for writes
+#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \
+                                    RBD_FEATURE_OBJECT_MAP     | \
+                                    RBD_FEATURE_FAST_DIFF)
 
 #endif
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index 625d472..690dbbd 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -31,7 +31,7 @@ extern "C" {
 
 #define LIBRBD_VER_MAJOR 0
 #define LIBRBD_VER_MINOR 1
-#define LIBRBD_VER_EXTRA 9
+#define LIBRBD_VER_EXTRA 10
 
 #define LIBRBD_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
 
@@ -48,6 +48,7 @@ extern "C" {
 #endif
 
 #define RBD_FLAG_OBJECT_MAP_INVALID   (1<<0)
+#define RBD_FLAG_FAST_DIFF_INVALID    (1<<1)
 
 typedef void *rbd_snap_t;
 typedef void *rbd_image_t;
@@ -148,6 +149,8 @@ CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info,
 CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old);
 CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size);
 CEPH_RBD_API int rbd_get_features(rbd_image_t image, uint64_t *features);
+CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features,
+                                     uint8_t enabled);
 CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit);
 CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image,
                                       uint64_t *stripe_count);
@@ -162,6 +165,10 @@ CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags);
 /* exclusive lock feature */
 CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner);
 
+/* object map feature */
+CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image,
+                                        librbd_progress_fn_t cb, void *cbdata);
+
 CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx,
                           const char *destname);
 CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest);
@@ -383,6 +390,8 @@ CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len
  * @param fromsnapname start snapshot name, or NULL
  * @param ofs start offset
  * @param len len in bytes of region to report on
+ * @param include_parent 1 if full history diff should include parent
+ * @param whole_object 1 if diff extents should cover whole object
  * @param cb callback to call for each allocated region
  * @param arg argument to pass to the callback
  * @returns 0 on success, or negative error code on error
@@ -392,6 +401,12 @@ CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image,
 		                  uint64_t ofs, uint64_t len,
 		                  int (*cb)(uint64_t, size_t, int, void *),
                                   void *arg);
+CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image,
+		                   const char *fromsnapname,
+		                   uint64_t ofs, uint64_t len,
+                                   uint8_t include_parent, uint8_t whole_object,
+		                   int (*cb)(uint64_t, size_t, int, void *),
+                                   void *arg);
 CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
                                const char *buf);
 /*
@@ -443,6 +458,35 @@ CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
  */
 CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image);
 
+CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len);
+CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value);
+CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key);
+/**
+ * List all metadatas associated with this image.
+ *
+ * This iterates over all metadatas, key_len and val_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the keys and values
+ * of the image, with a '\0' after each.
+ *
+ * @param image which image (and implicitly snapshot) to list clones of
+ * @param start_after which name to begin listing after
+ *        (use the empty string to start at the beginning)
+ * @param max the maximum number of names to lis(if 0 means no limit)
+ * @param keys buffer in which to store pool names
+ * @param keys_len number of bytes in pools buffer
+ * @param values buffer in which to store image names
+ * @param vals_len number of bytes in images buffer
+ * @returns number of children on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+    char *keys, size_t *key_len, char *values, size_t *vals_len);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index 569fc77..e587916 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -108,6 +108,8 @@ public:
   Image();
   ~Image();
 
+  int close();
+
   int resize(uint64_t size);
   int resize_with_progress(uint64_t size, ProgressContext& pctx);
   int stat(image_info_t &info, size_t infosize);
@@ -116,12 +118,16 @@ public:
   int old_format(uint8_t *old);
   int size(uint64_t *size);
   int features(uint64_t *features);
+  int update_features(uint64_t features, bool enabled);
   int overlap(uint64_t *overlap);
   int get_flags(uint64_t *flags);
 
   /* exclusive lock feature */
   int is_exclusive_lock_owner(bool *is_owner);
 
+  /* object map feature */
+  int rebuild_object_map(ProgressContext &prog_ctx);
+
   int copy(IoCtx& dest_io_ctx, const char *destname);
   int copy2(Image& dest);
   int copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
@@ -183,6 +189,8 @@ public:
    * @param fromsnapname start snapshot name, or NULL
    * @param ofs start offset
    * @param len len in bytes of region to report on
+   * @param include_parent true if full history diff should include parent
+   * @param whole_object 1 if diff extents should cover whole object
    * @param cb callback to call for each allocated region
    * @param arg argument to pass to the callback
    * @returns 0 on success, or negative error code on error
@@ -190,6 +198,11 @@ public:
   int diff_iterate(const char *fromsnapname,
 		   uint64_t ofs, uint64_t len,
 		   int (*cb)(uint64_t, size_t, int, void *), void *arg);
+  int diff_iterate2(const char *fromsnapname,
+		    uint64_t ofs, uint64_t len,
+                    bool include_parent, bool whole_object,
+		    int (*cb)(uint64_t, size_t, int, void *), void *arg);
+
   ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
   /* @parmam op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
   ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
@@ -234,13 +247,20 @@ public:
   int aio_flush(RBD::AioCompletion *c);
 
   /**
-   * Drop any cached data for an image
+   * Drop any cached data for this image
    *
-   * @param image the image to invalidate cached data for
    * @returns 0 on success, negative error code on failure
    */
   int invalidate_cache();
 
+  int metadata_get(const std::string &key, std::string *value);
+  int metadata_set(const std::string &key, const std::string &value);
+  int metadata_remove(const std::string &key);
+  /**
+   * Returns a pair of key/value for this image
+   */
+  int metadata_list(const std::string &start, uint64_t max, std::map<std::string, ceph::bufferlist> *pairs);
+
 private:
   friend class RBD;
 
diff --git a/src/include/rbd/object_map_types.h b/src/include/rbd/object_map_types.h
index 7776232..54852ca 100644
--- a/src/include/rbd/object_map_types.h
+++ b/src/include/rbd/object_map_types.h
@@ -8,5 +8,6 @@
 static const uint8_t OBJECT_NONEXISTENT  = 0;
 static const uint8_t OBJECT_EXISTS       = 1;
 static const uint8_t OBJECT_PENDING      = 2;
+static const uint8_t OBJECT_EXISTS_CLEAN = 3;
 
 #endif // CEPH_RBD_OBJECT_MAP_TYPES_H
diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h
new file mode 100644
index 0000000..5faacc3
--- /dev/null
+++ b/src/include/sock_compat.h
@@ -0,0 +1,26 @@
+#ifndef CEPH_SOCK_COMPAT_H
+#define CEPH_SOCK_COMPAT_H
+
+#include "include/compat.h"
+
+/*
+ * This optimization may not be available on all platforms (e.g. OSX).
+ * Apparently a similar approach based on TCP_CORK can be used.
+ */
+#ifndef MSG_MORE
+# define MSG_MORE 0
+#endif
+
+/*
+ * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
+ */
+#ifndef MSG_NOSIGNAL
+# define MSG_NOSIGNAL 0
+# ifdef SO_NOSIGPIPE
+#  define CEPH_USE_SO_NOSIGPIPE
+# else
+#  error "Cannot block SIGPIPE!"
+# endif
+#endif
+
+#endif
diff --git a/src/include/str_list.h b/src/include/str_list.h
index 83a0e64..4ba0cad 100644
--- a/src/include/str_list.h
+++ b/src/include/str_list.h
@@ -7,22 +7,77 @@
 #include <string>
 #include <vector>
 
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as list
+ * @param [out] str_list List modified containing str after it has been split
+**/
 extern void get_str_list(const std::string& str,
 			 std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as list
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list List modified containing str after it has been split
+**/
 extern void get_str_list(const std::string& str,
                          const char *delims,
 			 std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_vec**.
+ * 
+ * @param [in] str String to split and save as Vector
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
 extern void get_str_vec(const std::string& str,
 			 std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_vec**.
+ * 
+ * @param [in] str String to split and save as Vector
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
 extern void get_str_vec(const std::string& str,
                          const char *delims,
 			 std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as Set
+ * @param [out] str_list Set modified containing str after it has been split
+**/
 extern void get_str_set(const std::string& str,
 			std::set<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as Set
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list Set modified containing str after it has been split
+**/
 extern void get_str_set(const std::string& str,
                         const char *delims,
 			std::set<std::string>& str_list);
 
+/**
+ * Return a String containing the vector **v** joined with **sep**
+ * 
+ * If **v** is empty, the function returns an empty string
+ * For each element in **v**,
+ * it will concatenate this element and **sep** with result
+ * 
+ * @param [in] v Vector to join as a String
+ * @param [in] sep String used to join each element from **v**
+ * @return empty string if **v** is empty or concatenated string
+**/
 inline std::string str_join(const std::vector<std::string>& v, std::string sep)
 {
   if (v.empty())
diff --git a/src/include/str_map.h b/src/include/str_map.h
index 4b739ef..0bd9de3 100644
--- a/src/include/str_map.h
+++ b/src/include/str_map.h
@@ -83,8 +83,8 @@ extern int get_json_str_map(
  * Always returns 0, as there is no condition for failure.
  *
  * @param [in] str plain text key/value pairs
+ * @param [in] delims field delimiters to be used for parsing str
  * @param [out] str_map key/value pairs parsed from str
- * @param [in] delim field delimiters to be used for parsing str
  * @return **0**
  */
 extern int get_str_map(
diff --git a/src/include/timegm.h b/src/include/timegm.h
new file mode 100644
index 0000000..fb97043
--- /dev/null
+++ b/src/include/timegm.h
@@ -0,0 +1,79 @@
+//  (C) Copyright Howard Hinnant
+//  (C) Copyright 2010-2011 Vicente J. Botet Escriba
+//  Use, modification and distribution are subject to the Boost Software License,
+//  Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt).
+
+//===-------------------------- locale ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// This code was adapted by Vicente from Howard Hinnant's experimental work
+// on chrono i/o to Boost and some functions from libc++/locale to emulate the missing time_get::get()
+
+#ifndef BOOST_CHRONO_IO_TIME_POINT_IO_H
+#define BOOST_CHRONO_IO_TIME_POINT_IO_H
+
+#include <time.h>
+
+static int32_t is_leap(int32_t year) {
+  if(year % 400 == 0)
+    return 1;
+  if(year % 100 == 0)
+    return 0;
+  if(year % 4 == 0)
+    return 1;
+  return 0;
+}
+
+static int32_t days_from_0(int32_t year) {
+  year--;
+  return 365 * year + (year / 400) - (year/100) + (year / 4);
+}
+
+int32_t static days_from_1970(int32_t year) {
+  static const int days_from_0_to_1970 = days_from_0(1970);
+  return days_from_0(year) - days_from_0_to_1970;
+}
+
+static int32_t days_from_1jan(int32_t year,int32_t month,int32_t day) {
+  static const int32_t days[2][12] =
+  {
+    { 0,31,59,90,120,151,181,212,243,273,304,334},
+    { 0,31,60,91,121,152,182,213,244,274,305,335}
+  };
+
+  return days[is_leap(year)][month-1] + day - 1;
+}
+
+static  time_t internal_timegm(tm const *t) {
+  int year = t->tm_year + 1900;
+  int month = t->tm_mon;
+  if(month > 11)
+  {
+    year += month/12;
+    month %= 12;
+  }
+  else if(month < 0)
+  {
+    int years_diff = (-month + 11)/12;
+    year -= years_diff;
+    month+=12 * years_diff;
+  }
+  month++;
+  int day = t->tm_mday;
+  int day_of_year = days_from_1jan(year,month,day);
+  int days_since_epoch = days_from_1970(year) + day_of_year ;
+
+  time_t seconds_in_day = 3600 * 24;
+  time_t result = seconds_in_day * days_since_epoch + 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec;
+
+  return result;
+}
+
+#endif
diff --git a/src/include/types.h b/src/include/types.h
index 7c10b1e..f913af6 100644
--- a/src/include/types.h
+++ b/src/include/types.h
@@ -62,7 +62,6 @@ extern "C" {
 using namespace std;
 
 #include "include/unordered_map.h"
-#include "include/hash_namespace.h"
 
 #include "object.h"
 #include "intarith.h"
@@ -112,6 +111,12 @@ inline ostream& operator<<(ostream& out, const deque<A>& v) {
   return out;
 }
 
+template<class A, class B, class C>
+inline ostream& operator<<(ostream&out, const boost::tuple<A, B, C> &t) {
+  out << boost::get<0>(t) <<"," << boost::get<1>(t) << "," << boost::get<2>(t);
+  return out;
+}
+
 template<class A>
 inline ostream& operator<<(ostream& out, const list<A>& ilist) {
   for (typename list<A>::const_iterator it = ilist.begin();
@@ -123,12 +128,6 @@ inline ostream& operator<<(ostream& out, const list<A>& ilist) {
   return out;
 }
 
-template<class A, class B, class C>
-inline ostream& operator<<(ostream&out, const boost::tuple<A, B, C> &t) {
-  out << boost::get<0>(t) <<"," << boost::get<1>(t) << "," << boost::get<2>(t);
-  return out;
-}
-
 template<class A>
 inline ostream& operator<<(ostream& out, const set<A>& iset) {
   for (typename set<A>::const_iterator it = iset.begin();
@@ -140,6 +139,17 @@ inline ostream& operator<<(ostream& out, const set<A>& iset) {
   return out;
 }
 
+template<class A, class C>
+inline ostream& operator<<(ostream& out, const set<A, C>& iset) {
+  for (typename set<A, C>::const_iterator it = iset.begin();
+       it != iset.end();
+       ++it) {
+    if (it != iset.begin()) out << ",";
+    out << *it;
+  }
+  return out;
+}
+
 template<class A>
 inline ostream& operator<<(ostream& out, const multiset<A>& iset) {
   for (typename multiset<A>::const_iterator it = iset.begin();
@@ -165,6 +175,20 @@ inline ostream& operator<<(ostream& out, const map<A,B>& m)
   return out;
 }
 
+template<class A,class B, class C>
+inline ostream& operator<<(ostream& out, const map<A,B,C>& m)
+{
+  out << "{";
+  for (typename map<A,B,C>::const_iterator it = m.begin();
+       it != m.end();
+       ++it) {
+    if (it != m.begin()) out << ",";
+    out << it->first << "=" << it->second;
+  }
+  out << "}";
+  return out;
+}
+
 template<class A,class B>
 inline ostream& operator<<(ostream& out, const multimap<A,B>& m) 
 {
@@ -303,7 +327,7 @@ inline ostream& operator<<(ostream& out, inodeno_t ino) {
   return out << hex << ino.val << dec;
 }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash< inodeno_t >
   {
     size_t operator()( const inodeno_t& x ) const
@@ -312,7 +336,7 @@ CEPH_HASH_NAMESPACE_START
       return H(x.val);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 // file modes
@@ -470,12 +494,12 @@ inline ostream& operator<<(ostream& out, const weightf_t& w)
 }
 
 struct shard_id_t {
-  uint8_t id;
+  int8_t id;
 
   shard_id_t() : id(0) {}
-  explicit shard_id_t(uint8_t _id) : id(_id) {}
+  explicit shard_id_t(int8_t _id) : id(_id) {}
 
-  operator uint8_t() const { return id; }
+  operator int8_t() const { return id; }
 
   const static shard_id_t NO_SHARD;
 
diff --git a/src/include/unordered_map.h b/src/include/unordered_map.h
index 30b0914..0e48cfb 100644
--- a/src/include/unordered_map.h
+++ b/src/include/unordered_map.h
@@ -3,8 +3,6 @@
 
 #include <ciso646>
 
-#ifdef _LIBCPP_VERSION
-
 #include <unordered_map>
 
 namespace ceph {
@@ -12,15 +10,4 @@ namespace ceph {
   using std::unordered_multimap;
 }
 
-#else
-
-#include <tr1/unordered_map>
-
-namespace ceph {
-  using std::tr1::unordered_map;
-  using std::tr1::unordered_multimap;
-}
-
-#endif
-
 #endif
diff --git a/src/include/unordered_set.h b/src/include/unordered_set.h
index 9b3b70b..41a6c72 100644
--- a/src/include/unordered_set.h
+++ b/src/include/unordered_set.h
@@ -3,22 +3,10 @@
 
 #include <ciso646>
 
-#ifdef _LIBCPP_VERSION
-
 #include <unordered_set>
 
 namespace ceph {
   using std::unordered_set;
 }
 
-#else
-
-#include <tr1/unordered_set>
-
-namespace ceph {
-  using std::tr1::unordered_set;
-}
-
-#endif
-
 #endif
diff --git a/src/include/util.h b/src/include/util.h
index 87f6499..c453440 100644
--- a/src/include/util.h
+++ b/src/include/util.h
@@ -75,4 +75,14 @@ struct ceph_data_stats
 typedef struct ceph_data_stats ceph_data_stats_t;
 
 int get_fs_stats(ceph_data_stats_t &stats, const char *path);
+
+/// collect info from @p uname(2), @p /proc/meminfo and @p /proc/cpuinfo
+void collect_sys_info(map<string, string> *m, CephContext *cct);
+
+/// dump service ids grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service id hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(Formatter* f, const map<string, list<int> >& services, const char* type);
+
 #endif /* CEPH_UTIL_H */
diff --git a/src/include/utime.h b/src/include/utime.h
index 9ec0c48..30780d1 100644
--- a/src/include/utime.h
+++ b/src/include/utime.h
@@ -21,6 +21,7 @@
 #include <errno.h>
 
 #include "include/types.h"
+#include "include/timegm.h"
 #include "common/strtol.h"
 
 
@@ -288,10 +289,10 @@ public:
       gmtime_r(&tt, &tm);
 
       if (nsec) {
-        *nsec = usec * 1000;
+        *nsec = (uint64_t)usec * 1000;
       }
     }
-    time_t t = timegm(&tm);
+    time_t t = internal_timegm(&tm);
     if (epoch)
       *epoch = (uint64_t)t;
 
diff --git a/src/include/uuid.h b/src/include/uuid.h
index 942b807..03e6b5a 100644
--- a/src/include/uuid.h
+++ b/src/include/uuid.h
@@ -8,36 +8,50 @@
 #include "encoding.h"
 #include <ostream>
 
-extern "C" {
-#include <uuid/uuid.h>
-#include <unistd.h>
-}
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+#include <boost/random/random_device.hpp>
 
 struct uuid_d {
-  uuid_t uuid;
+  boost::uuids::uuid uuid;
 
   uuid_d() {
-    memset(&uuid, 0, sizeof(uuid));
+    boost::uuids::nil_generator gen;
+    uuid = gen();
   }
 
   bool is_zero() const {
-    return uuid_is_null(uuid);
+    return uuid.is_nil();
   }
 
   void generate_random() {
-    uuid_generate(uuid);
+    boost::random::random_device rng("/dev/urandom");
+    boost::uuids::basic_random_generator<boost::random::random_device> gen(&rng);
+    uuid = gen();
   }
   
   bool parse(const char *s) {
-    return uuid_parse(s, uuid) == 0;
+    try {
+      boost::uuids::string_generator gen;
+      uuid = gen(s);
+      return true;
+    } catch (std::runtime_error& e) {
+      return false;
+    }
+  }
+  void print(char *s) const {
+    memcpy(s, boost::uuids::to_string(uuid).c_str(), 37);
   }
-  void print(char *s) {
-    return uuid_unparse(uuid, s);
+
+  char *bytes() const {
+    return (char*)uuid.data;
   }
   
   void encode(bufferlist& bl) const {
     ::encode_raw(uuid, bl);
   }
+
   void decode(bufferlist::iterator& p) const {
     ::decode_raw(uuid, p);
   }
@@ -46,15 +60,15 @@ WRITE_CLASS_ENCODER(uuid_d)
 
 inline std::ostream& operator<<(std::ostream& out, const uuid_d& u) {
   char b[37];
-  uuid_unparse(u.uuid, b);
+  u.print(b);
   return out << b;
 }
 
 inline bool operator==(const uuid_d& l, const uuid_d& r) {
-  return uuid_compare(l.uuid, r.uuid) == 0;
+  return l.uuid == r.uuid;
 }
 inline bool operator!=(const uuid_d& l, const uuid_d& r) {
-  return uuid_compare(l.uuid, r.uuid) != 0;
+  return l.uuid != r.uuid;
 }
 
 
diff --git a/src/include/xlist.h b/src/include/xlist.h
index dcde225..9c36e17 100644
--- a/src/include/xlist.h
+++ b/src/include/xlist.h
@@ -16,6 +16,7 @@
 #define CEPH_XLIST_H
 
 #include "include/assert.h"
+#include <iterator>
 #include <cstdlib>
 
 template<typename T>
@@ -57,6 +58,9 @@ public:
     }
   };
 
+  typedef item* value_type;
+  typedef item* const_reference;
+
 private:
   item *_front, *_back;
   int _size;
@@ -151,7 +155,7 @@ public:
     remove(_back);
   }
 
-  class iterator {
+  class iterator: std::iterator<std::forward_iterator_tag, T> {
   private:
     item *cur;
   public:
@@ -164,12 +168,18 @@ public:
       return *this;
     }
     bool end() const { return cur == 0; }
+    bool operator==(const iterator& rhs) const {
+      return cur == rhs.cur;
+    }
+    bool operator!=(const iterator& rhs) const {
+      return cur != rhs.cur;
+    }
   };
 
   iterator begin() { return iterator(_front); }
   iterator end() { return iterator(NULL); }
 
-  class const_iterator {
+  class const_iterator: std::iterator<std::forward_iterator_tag, T> {
   private:
     item *cur;
   public:
diff --git a/src/init-ceph.in b/src/init-ceph.in
old mode 100644
new mode 100755
index 2ff98c7..faeb7bd
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -12,7 +12,10 @@
 # Description:       Enable Ceph distributed file system services.
 ### END INIT INFO
 
-. /lib/lsb/init-functions
+# TODO: on FreeBSD/OSX, use equivalent script file
+if [ -e /lib/lsb/init-functions ]; then
+    . /lib/lsb/init-functions
+fi
 
 # detect systemd, also check whether the systemd-run binary exists
 SYSTEMD_RUN=$(which systemd-run 2>/dev/null)
@@ -59,12 +62,12 @@ signal_daemon() {
     signal=$4
     action=$5
     [ -z "$action" ] && action="Stopping"
-    echo -n "$action Ceph $name on $host..."
+    printf "$action Ceph $name on $host..."
     do_cmd "if [ -e $pidfile ]; then
-        pid=`cat $pidfile`
-        if [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; then
+        pid=\`cat $pidfile\`
+        if ps -p \$pid -o args= | grep -q $daemon; then
 	    cmd=\"kill $signal \$pid\"
-	    echo -n \$cmd...
+	    printf \"\$cmd...\"
 	    \$cmd
         fi
     fi"
@@ -78,7 +81,7 @@ daemon_is_running() {
     pidfile=$4
     do_cmd "[ -e $pidfile ] || exit 1   # no pid, presumably not running
 	pid=\`cat $pidfile\`
-	[ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline && grep -qwe -i.$daemon_id /proc/\$pid/cmdline && exit 0 # running
+	ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
         exit 1  # pid is something else" "" "okfail"
 }
 
@@ -89,12 +92,12 @@ stop_daemon() {
     signal=$4
     action=$5
     [ -z "$action" ] && action="Stopping"
-    echo -n "$action Ceph $name on $host..."
+    printf "$action Ceph $name on $host..."
     do_cmd "if [ -e $pidfile ] ; then 
 	pid=\`cat $pidfile\`
-	while [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; do
+	while ps -p \$pid -o args= | grep -q $daemon; do
 	    cmd=\"kill $signal \$pid\"
-	    echo -n \$cmd...
+	    printf \"\$cmd...\"
 	    \$cmd
 	    sleep 1
 	    continue
@@ -304,9 +307,9 @@ for name in $what; do
 	    [ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"
 
 	    if [ -n "$SYSTEMD_RUN" ]; then
-		cmd="$SYSTEMD_RUN -r bash -c '$files $cmd --cluster $cluster -f'"
+		cmd="$SYSTEMD_RUN -r bash -c '$files $cmd --cluster $cluster --setuser ceph --setgroup ceph -f'"
 	    else
-		cmd="$files $wrap $cmd --cluster $cluster $runmode"
+		cmd="$files $wrap $cmd --cluster $cluster --setuser ceph --setgroup ceph $runmode"
 	    fi
 
 	    if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then
@@ -344,6 +347,7 @@ for name in $what; do
 		[ -n "$fs_opt" ] && fs_opt="-o $fs_opt"
 		[ -n "$pre_mount" ] && do_cmd "$pre_mount"
 
+		do_root_cmd_okfail "mkdir -p $fs_path"
 		if [ "$fs_type" = "btrfs" ]; then
 		    echo Mounting Btrfs on $host:$fs_path
 		    do_root_cmd_okfail "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts || mount -t btrfs $fs_opt $first_dev $fs_path"
@@ -364,7 +368,7 @@ for name in $what; do
 		    get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
 		    osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
 		    get_conf osd_weight "" "osd crush initial weight"
-		    defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
+		    defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.4f",$2/1073741824) }')"
 		    get_conf osd_keyring "$osd_data/keyring" "keyring"
 		    do_cmd_okfail "timeout 30 $BINDIR/ceph -c $conf --name=osd.$id --keyring=$osd_keyring osd crush create-or-move -- $id ${osd_weight:-${defaultweight:-1}} $osd_location"
 		    if [ "$ERR" != "0" ]; then
@@ -375,7 +379,10 @@ for name in $what; do
 	    fi
 
 	    echo Starting Ceph $name on $host...
-	    mkdir -p $run_dir
+	    if [ ! -d $run_dir ]; then
+		# assume /var/run exists
+		install -d -m0770 -o ceph -g ceph /var/run/ceph
+	    fi
 	    get_conf pre_start_eval "" "pre start eval"
 	    [ -n "$pre_start_eval" ] && $pre_start_eval
 	    get_conf pre_start "" "pre start command"
@@ -424,7 +431,7 @@ for name in $what; do
 
 	status)
 	    if daemon_is_running $name ceph-$type $id $pid_file; then
-		echo -n "$name: running "
+		printf "$name: running "
 		do_cmd "$BINDIR/ceph --admin-daemon $asok version 2>/dev/null" || echo unknown
             elif [ -e "$pid_file" ]; then
                 # daemon is dead, but pid file still exists
diff --git a/src/init-rbdmap b/src/init-rbdmap
index a9d6826..bad2754 100755
--- a/src/init-rbdmap
+++ b/src/init-rbdmap
@@ -18,14 +18,15 @@
 # Description:       Ceph RBD Mapping
 ### END INIT INFO
 
-DESC="RBD Mapping:"
 RBDMAPFILE="/etc/ceph/rbdmap"
 
-. /lib/lsb/init-functions
+if [ -e /lib/lsb/init-functions ]; then
+    . /lib/lsb/init-functions
+fi
 
 do_map() {
 	if [ ! -f "$RBDMAPFILE" ]; then
-		log_warning_msg "$DESC : No $RBDMAPFILE found."
+		logger -p "daemon.warning" -t init-rbdmap "No $RBDMAPFILE found."
 		exit 0
 	fi
 
@@ -42,37 +43,39 @@ do_map() {
 			DEV=rbd/$DEV
 			;;
 		esac
-		log_action_begin_msg "${DESC} '${DEV}'"
+		logger -p "daemon.debug" -t init-rbdmap "Mapping '${DEV}'"
 		newrbd=""
 		MAP_RV=""
-		RET_OP=0
 		OIFS=$IFS
 		IFS=','
 		for PARAM in ${PARAMS[@]}; do
 			CMDPARAMS="$CMDPARAMS --$(echo $PARAM | tr '=' ' ')"
 		done
 		IFS=$OIFS
-		if [ ! -b /dev/rbd/$DEV ]; then
-			MAP_RV=$(rbd map $DEV $CMDPARAMS 2>&1)
+		if [ -b /dev/rbd/$DEV ]; then
+			MAP_RV="$(readlink -f /dev/rbd/$DEV)"
+		else
+			MAP_RV="$(rbd map $DEV $CMDPARAMS 2>&1)"
 			if [ $? -eq 0 ]; then
 			    newrbd="yes"
 			else
 			    RET=$((${RET}+$?))
-			    RET_OP=1
+			    logger -p "daemon.warning" -t init-rbdmap "Failed to map '${DEV}"
+			    continue
 			fi
 		fi
-		log_action_end_msg ${RET_OP} "${MAP_RV}"
+		logger -p "daemon.debug" -t init-rbdmap "Mapped '${DEV}' to '${MAP_RV}'"
 
 		if [ "$newrbd" ]; then
 			## Mount new rbd
 			MNT_RV=""
 			mount --fake /dev/rbd/$DEV >>/dev/null 2>&1 \
 			&& MNT_RV=$(mount -vn /dev/rbd/$DEV 2>&1)
-			[ -n "${MNT_RV}" ] && log_action_msg "mount: ${MNT_RV}"
+			[ -n "${MNT_RV}" ] && logger -p "daemon.debug" -t init-rbdmap "Mounted '${MAP_RV}' to '${MNT_RV}'"
 
 			## post-mapping
 			if [ -x "/etc/ceph/rbd.d/${DEV}" ]; then
-			    log_action_msg "RBD Running post-map hook '/etc/ceph/rbd.d/${DEV}'"
+			    logger -p "daemon.debug" -t init-rbdmap "Running post-map hook '/etc/ceph/rbd.d/${DEV}'"
 			    /etc/ceph/rbd.d/${DEV} map "/dev/rbd/${DEV}"
 			fi
 		fi
@@ -91,35 +94,32 @@ do_unmap() {
 			    LL="${L##/dev/rbd/}"
 			    if [ "$(readlink -f $L)" = "${DEV}" ] \
 			    && [ -x "/etc/ceph/rbd.d/${LL}" ]; then
-			        log_action_msg "RBD pre-unmap:  '${DEV}' hook '/etc/ceph/rbd.d/${LL}'"
+			        logger -p "daemon.debug" -t init-rbdmap "Running pre-unmap hook for '${DEV}': '/etc/ceph/rbd.d/${LL}'"
 			        /etc/ceph/rbd.d/${LL} unmap "$L"
 			        break
 			    fi
 			done
 
-			log_action_begin_msg "RBD un-mapping: '${DEV}'"
-			UMNT_RV=""
-			UMAP_RV=""
-			RET_OP=0
+			logger -p "daemon.debug" -t init-rbdmap "Unmapping '${DEV}'"
 			MNT=$(findmnt --mtab --source ${DEV} --noheadings | awk '{print $1'})
 			if [ -n "${MNT}" ]; then
-			    log_action_cont_msg "un-mounting '${MNT}'"
-			    UMNT_RV=$(umount "${MNT}" 2>&1)
+			    logger -p "daemon.debug" -t init-rbdmap "Unmounting '${MNT}'"
+			    umount "${MNT}" >>/dev/null 2>&1
 			fi
 			if mountpoint -q "${MNT}"; then
 			    ## Un-mounting failed.
-			    RET_OP=1
+			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmount '${MNT}'"
 			    RET=$((${RET}+1))
-			else
-			    ## Un-mapping.
-			    UMAP_RV=$(rbd unmap $DEV 2>&1)
-			    if [ $? -ne 0 ]; then
-			        RET=$((${RET}+$?))
-			        RET_OP=1
-			    fi
+			    continue
+			fi
+			## Un-mapping.
+			rbd unmap $DEV >>/dev/null 2>&1
+			if [ $? -ne 0 ]; then
+			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmap '${MNT}'"
+			    RET=$((${RET}+$?))
+			    continue
 			fi
-			log_action_end_msg ${RET_OP} "${UMAP_RV}"
-			[ -n "${UMNT_RV}" ] && log_action_msg "${UMNT_RV}"
+			logger -p "daemon.debug" -t init-rbdmap "Unmapped '${DEV}'"
 		done
 	fi
 	exit ${RET}
@@ -149,7 +149,7 @@ case "$1" in
 	;;
 
   *)
-	log_success_msg "Usage: rbdmap {start|stop|restart|force-reload|reload|status}"
+	echo "Usage: rbdmap {start|stop|restart|force-reload|reload|status}"
 	exit 1
 	;;
 esac
diff --git a/src/java/Makefile.in b/src/java/Makefile.in
index b888f48..39e879b 100644
--- a/src/java/Makefile.in
+++ b/src/java/Makefile.in
@@ -162,6 +162,7 @@ AMTAR = @AMTAR@
 AM_CXXFLAGS = @AM_CXXFLAGS@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
 ARM_FLAGS = @ARM_FLAGS@
 ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
 AUTOCONF = @AUTOCONF@
@@ -169,6 +170,7 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
@@ -226,7 +228,8 @@ LD = @LD@
 LDFLAGS = @LDFLAGS@
 LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
 LIBEDIT_LIBS = @LIBEDIT_LIBS@
-LIBFUSE = @LIBFUSE@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
 LIBJEMALLOC = @LIBJEMALLOC@
 LIBOBJS = @LIBOBJS@
 LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
@@ -277,6 +280,7 @@ RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
 STRIP = @STRIP@
 VERSION = @VERSION@
 WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
@@ -310,6 +314,7 @@ datarootdir = @datarootdir@
 docdir = @docdir@
 dvidir = @dvidir@
 exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
 host = @host@
 host_alias = @host_alias@
 host_cpu = @host_cpu@
@@ -339,6 +344,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 subdirs = @subdirs@
 sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
 target = @target@
 target_alias = @target_alias@
 target_cpu = @target_cpu@
@@ -347,6 +354,7 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
 JAVA_SRC = \
 	java/com/ceph/fs/CephMount.java \
 	java/com/ceph/fs/CephStat.java \
diff --git a/src/java/java/com/ceph/fs/CephMount.java b/src/java/java/com/ceph/fs/CephMount.java
index 79a48b3..a286795 100644
--- a/src/java/java/com/ceph/fs/CephMount.java
+++ b/src/java/java/com/ceph/fs/CephMount.java
@@ -79,13 +79,26 @@ public class CephMount {
   public static final int XATTR_REPLACE = 2;
   public static final int XATTR_NONE    = 3;
 
+  /*
+   * Flags for flock();
+   *
+   * Must be synchronized with JNI if changed.
+   */
+  public static final int LOCK_SH       = 1;
+  public static final int LOCK_EX       = 2;
+  public static final int LOCK_NB       = 4;
+  public static final int LOCK_UN       = 8;
 
   /*
    * This is run by the class loader and will report early any problems
    * finding or linking in the shared JNI library.
    */
   static {
-    CephNativeLoader.checkLoaded();
+    loadLibrary();
+  }
+
+  static synchronized void loadLibrary() {
+    CephNativeLoader.getInstance().loadLibrary();
   }
 
   /*
@@ -686,6 +699,27 @@ public class CephMount {
   private static native int native_ceph_fsync(long mountp, int fd, boolean dataonly);
 
   /**
+   * Apply or remove an advisory lock.
+   *
+   * @param fd File descriptor to lock or unlock.
+   * @param operation the advisory lock operation to be performed on the file
+   * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock),
+   * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a
+   * non-blocking operation.
+   * @param owner the user-supplied owner identifier (an arbitrary integer)
+   */
+  public void flock(int fd, int operation, long owner) throws IOException {
+    rlock.lock();
+    try {
+      native_ceph_flock(instance_ptr, fd, operation, owner);
+    } finally {
+      rlock.unlock();
+    }
+  }
+
+  private static native int native_ceph_flock(long mountp, int fd, int operation, long owner);
+
+  /**
    * Get file status.
    *
    * @param fd The file descriptor.
diff --git a/src/java/java/com/ceph/fs/CephNativeLoader.java b/src/java/java/com/ceph/fs/CephNativeLoader.java
index 21a54c3..358ca24 100644
--- a/src/java/java/com/ceph/fs/CephNativeLoader.java
+++ b/src/java/java/com/ceph/fs/CephNativeLoader.java
@@ -20,16 +20,74 @@
 package com.ceph.fs;
 
 class CephNativeLoader {
+  private static final CephNativeLoader instance = new CephNativeLoader();
+  private static boolean initialized = false;
 
-  private static boolean loaded = false;
+  private static final String JNI_PATH_ENV_VAR = "CEPH_JNI_PATH";
+  private static final String LIBRARY_NAME = "cephfs_jni";
+  private static final String LIBRARY_FILE = "libcephfs_jni.so";
 
-  static {
-    if (!loaded) {
-      System.loadLibrary("cephfs_jni");
-      CephMount.native_initialize();
-      loaded = true;
+  private CephNativeLoader() {}
+
+  public static CephNativeLoader getInstance() {
+    return instance;
+  }
+
+  public synchronized void loadLibrary() {
+    if (initialized)
+      return;
+
+    boolean success = false;
+
+    /*
+     * Allow a Ceph specific environment variable to force
+     * the loading path.
+     */
+    String path = System.getenv(JNI_PATH_ENV_VAR);
+    try {
+      if (path != null) {
+        System.out.println("Loading libcephfs-jni: " + path);
+        System.load(path);
+        success = true;
+      } else {
+        try {
+          /*
+           * Try default Java loading path(s)
+           */
+          System.out.println("Loading libcephfs-jni from default path: " +
+              System.getProperty("java.library.path"));
+          System.loadLibrary(LIBRARY_NAME);
+          success = true;
+        } catch (final UnsatisfiedLinkError ule1) {
+          try {
+            /*
+             * Try RHEL/CentOS default path
+             */
+            path = "/usr/lib64/" + LIBRARY_FILE;
+            System.out.println("Loading libcephfs-jni: " + path);
+            System.load(path);
+            success = true;
+          } catch (final UnsatisfiedLinkError ule2) {
+            /*
+             * Try Ubuntu default path
+             */
+            path = "/usr/lib/jni/" + LIBRARY_FILE;
+            System.out.println("Loading libcephfs-jni: " + path);
+            System.load(path);
+            success = true;
+          }
+        }
+      }
+    } finally {
+      System.out.println("Loading libcephfs-jni: " +
+          (success ? "Success!" : "Failure!"));
     }
+
+    /*
+     * Finish initialization
+     */
+    CephMount.native_initialize();
+    initialized = true;
   }
 
-  static void checkLoaded() { assert(loaded); }
 }
diff --git a/src/java/native/libcephfs_jni.cc b/src/java/native/libcephfs_jni.cc
index 99ab3f4..050acc3 100644
--- a/src/java/native/libcephfs_jni.cc
+++ b/src/java/native/libcephfs_jni.cc
@@ -87,6 +87,14 @@
 #define JAVA_XATTR_REPLACE  2
 #define JAVA_XATTR_NONE     3
 
+/*
+ * flock flags. sync with CephMount.java if changed.
+ */
+#define JAVA_LOCK_SH 1
+#define JAVA_LOCK_EX 2
+#define JAVA_LOCK_NB 4
+#define JAVA_LOCK_UN 8
+
 /* Map JAVA_O_* open flags to values in libc */
 static inline int fixup_open_flags(jint jflags)
 {
@@ -1767,6 +1775,49 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1fsync
 
 /*
  * Class:     com_ceph_fs_CephMount
+ * Method:    native_ceph_flock
+ * Signature: (JIZ)I
+ */
+JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1flock
+	(JNIEnv *env, jclass clz, jlong j_mntp, jint j_fd, jint j_operation, jlong j_owner)
+{
+	struct ceph_mount_info *cmount = get_ceph_mount(j_mntp);
+	CephContext *cct = ceph_get_mount_context(cmount);
+	int ret;
+
+	ldout(cct, 10) << "jni: flock: fd " << (int)j_fd <<
+		" operation " << j_operation << " owner " << j_owner << dendl;
+
+	int operation = 0;
+
+#define MAP_FLOCK_FLAG(JNI_MASK, NATIVE_MASK) do {	\
+	if ((j_operation & JNI_MASK) != 0) {		\
+		operation |= NATIVE_MASK; 		\
+		j_operation &= ~JNI_MASK;		\
+	} 						\
+	} while(0)
+	MAP_FLOCK_FLAG(JAVA_LOCK_SH, LOCK_SH);
+	MAP_FLOCK_FLAG(JAVA_LOCK_EX, LOCK_EX);
+	MAP_FLOCK_FLAG(JAVA_LOCK_NB, LOCK_NB);
+	MAP_FLOCK_FLAG(JAVA_LOCK_UN, LOCK_UN);
+	if (j_operation != 0) {
+		cephThrowIllegalArg(env, "flock flags");
+		return -EINVAL;
+	}
+#undef MAP_FLOCK_FLAG
+
+	ret = ceph_flock(cmount, (int)j_fd, operation, (uint64_t) j_owner);
+
+	ldout(cct, 10) << "jni: flock: exit ret " << ret << dendl;
+
+	if (ret)
+		handle_error(env, ret);
+
+	return ret;
+}
+
+/*
+ * Class:     com_ceph_fs_CephMount
  * Method:    native_ceph_fstat
  * Signature: (JILcom/ceph/fs/CephStat;)I
  */
diff --git a/src/java/test/com/ceph/fs/CephMountTest.java b/src/java/test/com/ceph/fs/CephMountTest.java
index 82c3f1b..1a3e568 100644
--- a/src/java/test/com/ceph/fs/CephMountTest.java
+++ b/src/java/test/com/ceph/fs/CephMountTest.java
@@ -764,6 +764,10 @@ public class CephMountTest {
     int crop_size = 333333;
     mount.ftruncate(fd, crop_size);
     mount.fstat(fd, st);
+    if (st.size != crop_size) {
+      System.err.println("ftruncate error: st.size=" + st.size + " crop_size=" + crop_size);
+      assertTrue(false);
+    }
     assertTrue(st.size == crop_size);
     mount.close(fd);
 
@@ -791,6 +795,35 @@ public class CephMountTest {
   }
 
   /*
+   * flock
+   */
+
+  @Test
+  public void test_flock() throws Exception {
+    String path = makePath();
+    int fd = createFile(path, 123);
+    mount.flock(fd, CephMount.LOCK_SH | CephMount.LOCK_NB, 42);
+    mount.flock(fd, CephMount.LOCK_SH | CephMount.LOCK_NB, 43);
+    mount.flock(fd, CephMount.LOCK_UN, 42);
+    mount.flock(fd, CephMount.LOCK_UN, 43);
+    mount.flock(fd, CephMount.LOCK_EX | CephMount.LOCK_NB, 42);
+    try {
+      mount.flock(fd, CephMount.LOCK_SH | CephMount.LOCK_NB, 43);
+      assertTrue(false);
+    } catch(IOException io) {}
+    try {
+      mount.flock(fd, CephMount.LOCK_EX | CephMount.LOCK_NB, 43);
+      assertTrue(false);
+    } catch(IOException io) {}
+    mount.flock(fd, CephMount.LOCK_SH, 42);  // downgrade
+    mount.flock(fd, CephMount.LOCK_SH, 43);
+    mount.flock(fd, CephMount.LOCK_UN, 42);
+    mount.flock(fd, CephMount.LOCK_UN, 43);
+    mount.close(fd);
+    mount.unlink(path);
+  }
+
+  /*
    * fstat
    *
    * success case is handled in test_stat along with lstat.
diff --git a/src/key_value_store/kv_flat_btree_async.h b/src/key_value_store/kv_flat_btree_async.h
index 6713bf6..727afe4 100644
--- a/src/key_value_store/kv_flat_btree_async.h
+++ b/src/key_value_store/kv_flat_btree_async.h
@@ -132,17 +132,26 @@ struct object_data {
   uint64_t version; //the version at time of read
   uint64_t size; //the number of elements in the omap
 
-  object_data()
+  object_data() 
+  : unwritable(false),
+    version(0),
+    size(0) 
   {}
 
   object_data(string the_name)
-  : name(the_name)
+  : name(the_name),
+    unwritable(false),
+    version(0),
+    size(0) 
   {}
 
   object_data(key_data min, key_data kdat, string the_name)
   : min_kdata(min),
     max_kdata(kdat),
-    name(the_name)
+    name(the_name),
+    unwritable(false),
+    version(0),
+    size(0) 
   {}
 
   object_data(key_data min, key_data kdat, string the_name,
@@ -150,14 +159,19 @@ struct object_data {
   : min_kdata(min),
     max_kdata(kdat),
     name(the_name),
-    omap(the_omap)
+    omap(the_omap),
+    unwritable(false),
+    version(0),
+    size(0) 
   {}
 
   object_data(key_data min, key_data kdat, string the_name, int the_version)
   : min_kdata(min),
     max_kdata(kdat),
     name(the_name),
-    version(the_version)
+    unwritable(false),
+    version(the_version),
+    size(0) 
   {}
 
   void encode(bufferlist &bl) const {
@@ -244,6 +258,7 @@ struct delete_data {
   uint64_t version;
 
   delete_data()
+  : version(0)
   {}
 
   delete_data(key_data n, key_data x, string o, uint64_t v)
@@ -689,7 +704,7 @@ protected:
    *
    * @param idata: the index data parsed from the index entry left by the dead
    * client.
-   * @param errno: the error that caused the client to realize the other client
+   * @param error: the error that caused the client to realize the other client
    * died (should be -ENOENT or -ETIMEDOUT)
    * @post: rolls forward if -ENOENT, otherwise rolls back.
    */
@@ -752,6 +767,7 @@ KvFlatBtreeAsync(int k_val, string name, int cache, double cache_r,
     client_name(string(name).append(".")),
     pool_name("rbd"),
     interrupt(&KeyValueStructure::nothing),
+    wait_ms(0),
     timeout(100000,0),
     cache_size(cache),
     cache_refresh(cache_r),
diff --git a/src/krbd.cc b/src/krbd.cc
index 3fb64fd..9901edb 100644
--- a/src/krbd.cc
+++ b/src/krbd.cc
@@ -94,12 +94,15 @@ static int sysfs_write_rbd_remove(const string& buf)
   return sysfs_write_rbd("remove", buf);
 }
 
-static int should_match_minor(void)
+static int have_minor_attr(void)
 {
   /*
    * 'minor' attribute was added as part of single_major merge, which
    * exposed the 'single_major' parameter.  'minor' is always present,
    * regardless of whether single-major scheme is turned on or not.
+   *
+   * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
+   * this has to work with rbd.ko backported to various kernels.)
    */
   return access("/sys/module/rbd/parameters/single_major", F_OK) == 0;
 }
@@ -218,7 +221,7 @@ static int wait_for_udev_add(struct udev_monitor *mon, const char *pool,
         const char *this_major = udev_device_get_property_value(dev, "MAJOR");
         const char *this_minor = udev_device_get_property_value(dev, "MINOR");
 
-        assert(!minor ^ should_match_minor());
+        assert(!minor ^ have_minor_attr());
 
         if (strcmp(this_major, major) == 0 &&
             (!minor || strcmp(this_minor, minor) == 0)) {
@@ -339,7 +342,7 @@ static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
   if (r < 0)
     goto out_enm;
 
-  if (should_match_minor()) {
+  if (have_minor_attr()) {
     r = udev_enumerate_add_match_sysattr(enm, "minor",
                                          stringify(minor(devno)).c_str());
     if (r < 0)
@@ -373,6 +376,89 @@ out_enm:
   return r;
 }
 
+static int spec_to_devno_and_krbd_id(struct udev *udev, const char *pool,
+                                     const char *image, const char *snap,
+                                     dev_t *pdevno, string *pid)
+{
+  struct udev_enumerate *enm;
+  struct udev_list_entry *l;
+  struct udev_device *dev;
+  unsigned int maj, min = 0;
+  string err;
+  int r;
+
+  enm = udev_enumerate_new(udev);
+  if (!enm)
+    return -ENOMEM;
+
+  r = udev_enumerate_add_match_subsystem(enm, "rbd");
+  if (r < 0)
+    goto out_enm;
+
+  r = udev_enumerate_add_match_sysattr(enm, "pool", pool);
+  if (r < 0)
+    goto out_enm;
+
+  r = udev_enumerate_add_match_sysattr(enm, "name", image);
+  if (r < 0)
+    goto out_enm;
+
+  r = udev_enumerate_add_match_sysattr(enm, "current_snap", snap);
+  if (r < 0)
+    goto out_enm;
+
+  r = udev_enumerate_scan_devices(enm);
+  if (r < 0)
+    goto out_enm;
+
+  l = udev_enumerate_get_list_entry(enm);
+  if (!l) {
+    r = -ENOENT;
+    goto out_enm;
+  }
+
+  dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
+  if (!dev) {
+    r = -ENOMEM;
+    goto out_enm;
+  }
+
+  maj = strict_strtoll(udev_device_get_sysattr_value(dev, "major"), 10, &err);
+  if (!err.empty()) {
+    cerr << "rbd: couldn't parse major: " << err << std::endl;
+    r = -EINVAL;
+    goto out_dev;
+  }
+  if (have_minor_attr()) {
+    min = strict_strtoll(udev_device_get_sysattr_value(dev, "minor"), 10, &err);
+    if (!err.empty()) {
+      cerr << "rbd: couldn't parse minor: " << err << std::endl;
+      r = -EINVAL;
+      goto out_dev;
+    }
+  }
+
+  /*
+   * If an image is mapped more than once don't bother trying to unmap
+   * all devices - let users run unmap the same number of times they
+   * ran map.
+   */
+  if (udev_list_entry_get_next(l))
+    cerr << "rbd: " << pool << "/" << image << "@" << snap
+         << ": mapped more than once, unmapping "
+         << get_kernel_rbd_name(udev_device_get_sysname(dev))
+         << " only" << std::endl;
+
+  *pdevno = makedev(maj, min);
+  *pid = udev_device_get_sysname(dev);
+
+out_dev:
+  udev_device_unref(dev);
+out_enm:
+  udev_enumerate_unref(enm);
+  return r;
+}
+
 static int wait_for_udev_remove(struct udev_monitor *mon, dev_t devno)
 {
   for (;;) {
@@ -492,6 +578,30 @@ static int unmap_image(struct krbd_ctx *ctx, const char *devnode)
   return do_unmap(ctx->udev, wholedevno, id);
 }
 
+static int unmap_image(struct krbd_ctx *ctx, const char *pool,
+                       const char *image, const char *snap)
+
+{
+  dev_t devno;
+  string id;
+  int r;
+
+  if (!snap)
+    snap = "-";
+
+  r = spec_to_devno_and_krbd_id(ctx->udev, pool, image, snap, &devno, &id);
+  if (r < 0) {
+    if (r == -ENOENT) {
+      cerr << "rbd: " << pool << "/" << image << "@" << snap
+           << ": not a mapped image or snapshot" << std::endl;
+      r = -EINVAL;
+    }
+    return r;
+  }
+
+  return do_unmap(ctx->udev, devno, id);
+}
+
 static void dump_one_image(Formatter *f, TextTable *tbl,
                            const char *id, const char *pool,
                            const char *image, const char *snap)
@@ -634,6 +744,12 @@ extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode)
   return unmap_image(ctx, devnode);
 }
 
+extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx, const char *pool,
+                                  const char *image, const char *snap)
+{
+  return unmap_image(ctx, pool, image, snap);
+}
+
 int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
 {
   return dump_images(ctx, f);
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index e3281de..40cd028 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -34,9 +34,8 @@
 struct ceph_mount_info
 {
 public:
-  ceph_mount_info(uint64_t msgr_nonce_, CephContext *cct_)
-    : msgr_nonce(msgr_nonce_),
-      mounted(false),
+  ceph_mount_info(CephContext *cct_)
+    : mounted(false),
       inited(false),
       client(NULL),
       monclient(NULL),
@@ -77,7 +76,7 @@ public:
       goto fail;
 
     //network connection
-    messenger = Messenger::create(cct, cct->_conf->ms_type, entity_name_t::CLIENT(), "client", msgr_nonce);
+    messenger = Messenger::create_client_messenger(cct, "client");
 
     //at last the client
     ret = -CEPHFS_ERROR_NEW_CLIENT; //defined in libcephfs.h;
@@ -108,9 +107,11 @@ public:
     if (mounted)
       return -EISCONN;
 
-    ret = init();
-    if (ret != 0) {
-      return ret;
+    if (!inited) {
+      ret = init();
+      if (ret != 0) {
+        return ret;
+      }
     }
 
     ret = client->mount(mount_root);
@@ -233,7 +234,6 @@ public:
   }
 
 private:
-  uint64_t msgr_nonce;
   bool mounted;
   bool inited;
   Client *client;
@@ -288,14 +288,7 @@ extern "C" const char *ceph_version(int *pmajor, int *pminor, int *ppatch)
 
 extern "C" int ceph_create_with_context(struct ceph_mount_info **cmount, CephContext *cct)
 {
-  uint64_t nonce = 0;
-
-  // 6 bytes of random and 2 bytes of pid
-  get_random_bytes((char*)&nonce, sizeof(nonce));
-  nonce &= ~0xffff;
-  nonce |= (uint64_t)getpid();
-
-  *cmount = new struct ceph_mount_info(nonce, cct);
+  *cmount = new struct ceph_mount_info(cct);
   return 0;
 }
 
@@ -632,6 +625,14 @@ extern "C" int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path,
   return cmount->get_client()->lgetxattr(path, name, value, size);
 }
 
+extern "C" int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name, void *value, size_t size)
+{
+  if (!cmount->is_mounted())
+    return -ENOTCONN;
+  return cmount->get_client()->fgetxattr(fd, name, value, size);
+}
+
+
 extern "C" int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size)
 {
   if (!cmount->is_mounted())
@@ -646,6 +647,13 @@ extern "C" int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path,
   return cmount->get_client()->llistxattr(path, list, size);
 }
 
+extern "C" int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size)
+{
+  if (!cmount->is_mounted())
+    return -ENOTCONN;
+  return cmount->get_client()->flistxattr(fd, list, size);
+}
+
 extern "C" int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name)
 {
   if (!cmount->is_mounted())
@@ -660,6 +668,13 @@ extern "C" int ceph_lremovexattr(struct ceph_mount_info *cmount, const char *pat
   return cmount->get_client()->lremovexattr(path, name);
 }
 
+extern "C" int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name)
+{
+  if (!cmount->is_mounted())
+    return -ENOTCONN;
+  return cmount->get_client()->fremovexattr(fd, name);
+}
+
 extern "C" int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *name, const void *value, size_t size, int flags)
 {
   if (!cmount->is_mounted())
@@ -673,6 +688,13 @@ extern "C" int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path,
     return -ENOTCONN;
   return cmount->get_client()->lsetxattr(path, name, value, size, flags);
 }
+
+extern "C" int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name, const void *value, size_t size, int flags)
+{
+  if (!cmount->is_mounted())
+    return -ENOTCONN;
+  return cmount->get_client()->fsetxattr(fd, name, value, size, flags);
+}
 /* end xattr support */
 
 extern "C" int ceph_chmod(struct ceph_mount_info *cmount, const char *path, mode_t mode)
@@ -718,6 +740,14 @@ extern "C" int ceph_utime(struct ceph_mount_info *cmount, const char *path,
   return cmount->get_client()->utime(path, buf);
 }
 
+extern "C" int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation,
+			  uint64_t owner)
+{
+  if (!cmount->is_mounted())
+    return -ENOTCONN;
+  return cmount->get_client()->flock(fd, operation, owner);
+}
+
 extern "C" int ceph_truncate(struct ceph_mount_info *cmount, const char *path,
 			     int64_t size)
 {
@@ -775,6 +805,14 @@ extern "C" int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf,
   return cmount->get_client()->read(fd, buf, size, offset);
 }
 
+extern "C" int ceph_preadv(struct ceph_mount_info *cmount, int fd,
+              const struct iovec *iov, int iovcnt, int64_t offset)
+{
+  if (!cmount->is_mounted())
+      return -ENOTCONN;
+  return cmount->get_client()->preadv(fd, iov, iovcnt, offset);
+}
+
 extern "C" int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf,
 			  int64_t size, int64_t offset)
 {
@@ -783,6 +821,14 @@ extern "C" int ceph_write(struct ceph_mount_info *cmount, int fd, const char *bu
   return cmount->get_client()->write(fd, buf, size, offset);
 }
 
+extern "C" int ceph_pwritev(struct ceph_mount_info *cmount, int fd,
+              const struct iovec *iov, int iovcnt, int64_t offset)
+{
+  if (!cmount->is_mounted())
+    return -ENOTCONN;
+  return cmount->get_client()->pwritev(fd, iov, iovcnt, offset);
+}
+
 extern "C" int ceph_ftruncate(struct ceph_mount_info *cmount, int fd, int64_t size)
 {
   if (!cmount->is_mounted())
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index 5ef56c0..945dbec 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -468,6 +468,8 @@ int librados::IoCtxImpl::append(const object_t& oid, bufferlist& bl, size_t len)
 
 int librados::IoCtxImpl::write_full(const object_t& oid, bufferlist& bl)
 {
+  if (bl.length() > UINT_MAX/2)
+    return -E2BIG;
   ::ObjectOperation op;
   prepare_assert_ops(&op);
   op.write_full(bl);
@@ -692,12 +694,12 @@ int librados::IoCtxImpl::aio_write(const object_t &oid, AioCompletionImpl *c,
   if (snap_seq != CEPH_NOSNAP)
     return -EROFS;
 
-  c->io = this;
-  queue_aio_write(c);
-
   Context *onack = new C_aio_Ack(c);
   Context *onsafe = new C_aio_Safe(c);
 
+  c->io = this;
+  queue_aio_write(c);
+
   c->tid = objecter->write(oid, oloc,
 		  off, len, snapc, bl, ut, 0,
 		  onack, onsafe, &c->objver);
@@ -716,12 +718,12 @@ int librados::IoCtxImpl::aio_append(const object_t &oid, AioCompletionImpl *c,
   if (snap_seq != CEPH_NOSNAP)
     return -EROFS;
 
-  c->io = this;
-  queue_aio_write(c);
-
   Context *onack = new C_aio_Ack(c);
   Context *onsafe = new C_aio_Safe(c);
 
+  c->io = this;
+  queue_aio_write(c);
+
   c->tid = objecter->append(oid, oloc,
 		   len, snapc, bl, ut, 0,
 		   onack, onsafe, &c->objver);
@@ -741,12 +743,12 @@ int librados::IoCtxImpl::aio_write_full(const object_t &oid,
   if (snap_seq != CEPH_NOSNAP)
     return -EROFS;
 
-  c->io = this;
-  queue_aio_write(c);
-
   Context *onack = new C_aio_Ack(c);
   Context *onsafe = new C_aio_Safe(c);
 
+  c->io = this;
+  queue_aio_write(c);
+
   c->tid = objecter->write_full(oid, oloc,
 		       snapc, bl, ut, 0,
 		       onack, onsafe, &c->objver);
@@ -762,12 +764,12 @@ int librados::IoCtxImpl::aio_remove(const object_t &oid, AioCompletionImpl *c)
   if (snap_seq != CEPH_NOSNAP)
     return -EROFS;
 
-  c->io = this;
-  queue_aio_write(c);
-
   Context *onack = new C_aio_Ack(c);
   Context *onsafe = new C_aio_Safe(c);
 
+  c->io = this;
+  queue_aio_write(c);
+
   c->tid = objecter->remove(oid, oloc,
 		   snapc, ut, 0,
 		   onack, onsafe, &c->objver);
@@ -779,9 +781,9 @@ int librados::IoCtxImpl::aio_remove(const object_t &oid, AioCompletionImpl *c)
 int librados::IoCtxImpl::aio_stat(const object_t& oid, AioCompletionImpl *c,
 				  uint64_t *psize, time_t *pmtime)
 {
-  c->io = this;
   C_aio_stat_Ack *onack = new C_aio_stat_Ack(c, pmtime);
 
+  c->io = this;
   c->tid = objecter->stat(oid, oloc,
 		 snap_seq, psize, &onack->mtime, 0,
 		 onack, &c->objver);
@@ -1297,6 +1299,7 @@ void librados::IoCtxImpl::set_notify_timeout(uint32_t timeout)
 
 librados::IoCtxImpl::C_aio_Ack::C_aio_Ack(AioCompletionImpl *_c) : c(_c)
 {
+  assert(!c->io);
   c->get();
 }
 
@@ -1329,6 +1332,7 @@ librados::IoCtxImpl::C_aio_stat_Ack::C_aio_stat_Ack(AioCompletionImpl *_c,
 						    time_t *pm)
    : c(_c), pmtime(pm)
 {
+  assert(!c->io);
   c->get();
 }
 
diff --git a/src/librados/ListObjectImpl.h b/src/librados/ListObjectImpl.h
index 08754df..bda275f 100644
--- a/src/librados/ListObjectImpl.h
+++ b/src/librados/ListObjectImpl.h
@@ -28,9 +28,9 @@ struct ListObjectImpl {
   ListObjectImpl(std::string n, std::string o, std::string l):
       nspace(n), oid(o), locator(l) {}
 
-  const std::string& get_nspace() { return nspace; }
-  const std::string& get_oid() { return oid; }
-  const std::string& get_locator() { return locator; }
+  const std::string& get_nspace() const { return nspace; }
+  const std::string& get_oid() const { return oid; }
+  const std::string& get_locator() const { return locator; }
 };
 WRITE_EQ_OPERATORS_3(ListObjectImpl, nspace, oid, locator)
 WRITE_CMP_OPERATORS_3(ListObjectImpl, nspace, oid, locator)
@@ -40,9 +40,9 @@ inline std::ostream& operator<<(std::ostream& out, const struct ListObjectImpl&
   return out;
 }
 
-class ObjListCtx;
+struct ObjListCtx;
 
-struct NObjectIteratorImpl {
+class NObjectIteratorImpl {
   public:
     NObjectIteratorImpl() {}
     ~NObjectIteratorImpl();
@@ -57,7 +57,7 @@ struct NObjectIteratorImpl {
     NObjectIteratorImpl operator++(int); // Postincrement
     const ListObject *get_listobjectp() { return &cur_obj; }
     friend class IoCtx;
-    friend class ListObjectImpl;
+    friend struct ListObjectImpl;
     //friend class ListObject;
     friend class NObjectIterator;
 
@@ -67,6 +67,8 @@ struct NObjectIteratorImpl {
     /// move the iterator to a given hash position.  this may (will!) be rounded to the nearest pg.
     uint32_t seek(uint32_t pos);
 
+    void set_filter(const bufferlist &bl);
+
   private:
     NObjectIteratorImpl(ObjListCtx *ctx_);
     void get_next();
diff --git a/src/librados/Makefile.am b/src/librados/Makefile.am
index 103ffd8..bf2c6da 100644
--- a/src/librados/Makefile.am
+++ b/src/librados/Makefile.am
@@ -23,9 +23,6 @@ librados_la_CXXFLAGS = ${AM_CXXFLAGS}
 LIBRADOS_DEPS += \
 	librados_internal.la libcls_lock_client.la \
 	$(LIBOSDC) $(LIBCOMMON_DEPS)
-if WITH_LTTNG
-LIBRADOS_DEPS += $(LIBRADOS_TP)
-endif
 
 librados_la_LIBADD = $(LIBRADOS_DEPS) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
 librados_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 3886b1e..08ed909 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -52,8 +52,6 @@
 #undef dout_prefix
 #define dout_prefix *_dout << "librados: "
 
-static atomic_t rados_instance;
-
 bool librados::RadosClient::ms_get_authorizer(int dest_type,
 					      AuthAuthorizer **authorizer,
 					      bool force_new) {
@@ -191,7 +189,6 @@ int librados::RadosClient::connect()
   common_init_finish(cct);
 
   int err;
-  uint64_t nonce;
 
   // already connected?
   if (state == CONNECTING)
@@ -206,9 +203,7 @@ int librados::RadosClient::connect()
     goto out;
 
   err = -ENOMEM;
-  nonce = getpid() + (1000000 * (uint64_t)rados_instance.inc());
-  messenger = Messenger::create(cct, cct->_conf->ms_type, entity_name_t::CLIENT(-1),
-				"radosclient", nonce);
+  messenger = Messenger::create_client_messenger(cct, "radosclient");
   if (!messenger)
     goto out;
 
@@ -271,6 +266,8 @@ int librados::RadosClient::connect()
 
   lock.Unlock();
 
+  cct->_conf->call_all_observers();
+
   ldout(cct, 1) << "init done" << dendl;
   err = 0;
 
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index cbefe0b..7a193a3 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -18,6 +18,7 @@
 #include "common/errno.h"
 #include "common/ceph_argparse.h"
 #include "common/common_init.h"
+#include "common/TracepointProvider.h"
 #include "include/rados/librados.h"
 #include "include/rados/librados.hpp"
 #include "include/types.h"
@@ -39,7 +40,11 @@
 #include <stdexcept>
 
 #ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
 #include "tracing/librados.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
 #else
 #define tracepoint(...)
 #endif
@@ -57,6 +62,12 @@ using std::runtime_error;
 
 #define RADOS_LIST_MAX_ENTRIES 1024
 
+namespace {
+
+TracepointProvider::Traits tracepoint_traits("librados_tp.so", "rados_tracing");
+
+} // anonymous namespace
+
 /*
  * Structure of this file
  *
@@ -419,12 +430,20 @@ void librados::ObjectWriteOperation::omap_rm_keys(
 }
 
 void librados::ObjectWriteOperation::copy_from(const std::string& src,
-					       const IoCtx& src_ioctx,
-					       uint64_t src_version)
+                                               const IoCtx& src_ioctx,
+                                               uint64_t src_version)
+{
+  copy_from2(src, src_ioctx, src_version, 0);
+}
+
+void librados::ObjectWriteOperation::copy_from2(const std::string& src,
+					        const IoCtx& src_ioctx,
+					        uint64_t src_version,
+					        uint32_t src_fadvise_flags)
 {
   ::ObjectOperation *o = (::ObjectOperation *)impl;
   o->copy_from(object_t(src), src_ioctx.io_ctx_impl->snap_seq,
-	       src_ioctx.io_ctx_impl->oloc, src_version, 0);
+	       src_ioctx.io_ctx_impl->oloc, src_version, 0, src_fadvise_flags);
 }
 
 void librados::ObjectWriteOperation::undirty()
@@ -623,6 +642,18 @@ uint32_t librados::NObjectIteratorImpl::seek(uint32_t pos)
   return r;
 }
 
+void librados::NObjectIteratorImpl::set_filter(const bufferlist &bl)
+{
+  assert(ctx);
+  if (ctx->nlc) {
+    ctx->nlc->filter = bl;
+  }
+
+  if (ctx->lc) {
+    ctx->lc->filter = bl;
+  }
+}
+
 void librados::NObjectIteratorImpl::get_next()
 {
   const char *entry, *key, *nspace;
@@ -694,23 +725,31 @@ librados::NObjectIterator& librados::NObjectIterator::operator=(const librados::
 
 bool librados::NObjectIterator::operator==(const librados::NObjectIterator& rhs) const 
 {
-  return *impl == *(rhs.impl);
+  if (impl && rhs.impl) {
+    return *impl == *(rhs.impl);
+  } else {
+    return impl == rhs.impl;
+  }
 }
 
-bool librados::NObjectIterator::operator!=(const librados::NObjectIterator& rhs) const {
-  return !(*impl == *(rhs.impl));
+bool librados::NObjectIterator::operator!=(const librados::NObjectIterator& rhs) const
+{
+  return !(*this == rhs);
 }
 
 const librados::ListObject& librados::NObjectIterator::operator*() const {
+  assert(impl);
   return *(impl->get_listobjectp());
 }
 
 const librados::ListObject* librados::NObjectIterator::operator->() const {
+  assert(impl);
   return impl->get_listobjectp();
 }
 
 librados::NObjectIterator& librados::NObjectIterator::operator++()
 {
+  assert(impl);
   impl->get_next();
   return *this;
 }
@@ -724,16 +763,24 @@ librados::NObjectIterator librados::NObjectIterator::operator++(int)
 
 uint32_t librados::NObjectIterator::seek(uint32_t pos)
 {
+  assert(impl);
   return impl->seek(pos);
 }
 
+void librados::NObjectIterator::set_filter(const bufferlist &bl)
+{
+  impl->set_filter(bl);
+}
+
 void librados::NObjectIterator::get_next()
 {
+  assert(impl);
   impl->get_next();
 }
 
 uint32_t librados::NObjectIterator::get_pg_hash_position() const
 {
+  assert(impl);
   return impl->get_pg_hash_position();
 }
 
@@ -1290,6 +1337,8 @@ static int translate_flags(int flags)
     op_flags |= CEPH_OSD_FLAG_SKIPRWLOCKS;
   if (flags & librados::OPERATION_IGNORE_OVERLAY)
     op_flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
+  if (flags & librados::OPERATION_FULL_TRY)
+    op_flags |= CEPH_OSD_FLAG_FULL_TRY;
 
   return op_flags;
 }
@@ -1525,18 +1574,38 @@ int librados::IoCtx::list_lockers(const std::string &oid, const std::string &nam
 
 librados::NObjectIterator librados::IoCtx::nobjects_begin()
 {
+  bufferlist bl;
+  return nobjects_begin(bl);
+}
+
+librados::NObjectIterator librados::IoCtx::nobjects_begin(
+    const bufferlist &filter)
+{
   rados_list_ctx_t listh;
   rados_nobjects_list_open(io_ctx_impl, &listh);
   NObjectIterator iter((ObjListCtx*)listh);
+  if (filter.length() > 0) {
+    iter.set_filter(filter);
+  }
   iter.get_next();
   return iter;
 }
 
 librados::NObjectIterator librados::IoCtx::nobjects_begin(uint32_t pos)
 {
+  bufferlist bl;
+  return nobjects_begin(pos, bl);
+}
+
+librados::NObjectIterator librados::IoCtx::nobjects_begin(
+  uint32_t pos, const bufferlist &filter)
+{
   rados_list_ctx_t listh;
   rados_nobjects_list_open(io_ctx_impl, &listh);
   NObjectIterator iter((ObjListCtx*)listh);
+  if (filter.length() > 0) {
+    iter.set_filter(filter);
+  }
   iter.seek(pos);
   return iter;
 }
@@ -2167,10 +2236,9 @@ librados::ObjectOperation::~ObjectOperation()
 }
 
 ///////////////////////////// C API //////////////////////////////
-static
-int rados_create_common(rados_t *pcluster,
-			const char * const clustername,
-			CephInitParameters *iparams)
+
+static CephContext *rados_create_cct(const char * const clustername,
+                                     CephInitParameters *iparams)
 {
   // missing things compared to global_init:
   // g_ceph_context, g_conf, g_lockdep, signal handlers
@@ -2180,26 +2248,27 @@ int rados_create_common(rados_t *pcluster,
   cct->_conf->parse_env(); // environment variables override
   cct->_conf->apply_changes(NULL);
 
-  librados::RadosClient *radosp = new librados::RadosClient(cct);
-  *pcluster = (void *)radosp;
-
-  cct->put();
-  return 0;
+  TracepointProvider::initialize<tracepoint_traits>(cct);
+  return cct;
 }
 
 extern "C" int rados_create(rados_t *pcluster, const char * const id)
 {
-  tracepoint(librados, rados_create_enter, id);
   CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
   if (id) {
     iparams.name.set(CEPH_ENTITY_TYPE_CLIENT, id);
   }
-  int retval = rados_create_common(pcluster, "ceph", &iparams);
-  tracepoint(librados, rados_create_exit, retval, *pcluster);
-  return retval;
+  CephContext *cct = rados_create_cct("ceph", &iparams);
+
+  tracepoint(librados, rados_create_enter, id);
+  *pcluster = reinterpret_cast<rados_t>(new librados::RadosClient(cct));
+  tracepoint(librados, rados_create_exit, 0, *pcluster);
+
+  cct->put();
+  return 0;
 }
 
-// as above, but 
+// as above, but
 // 1) don't assume 'client.'; name is a full type.id namestr
 // 2) allow setting clustername
 // 3) flags is for future expansion (maybe some of the global_init()
@@ -2208,16 +2277,21 @@ extern "C" int rados_create(rados_t *pcluster, const char * const id)
 extern "C" int rados_create2(rados_t *pcluster, const char *const clustername,
 			     const char * const name, uint64_t flags)
 {
-  tracepoint(librados, rados_create2_enter, clustername, name, flags);
   // client is assumed, but from_str will override
+  int retval = 0;
   CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
   if (!name || !iparams.name.from_str(name)) {
-    tracepoint(librados, rados_create2_exit, -EINVAL, *pcluster);
-    return -EINVAL;
+    retval = -EINVAL;
   }
 
-  int retval = rados_create_common(pcluster, clustername, &iparams);
+  CephContext *cct = rados_create_cct(clustername, &iparams);
+  tracepoint(librados, rados_create2_enter, clustername, name, flags);
+  if (retval == 0) {
+    *pcluster = reinterpret_cast<rados_t>(new librados::RadosClient(cct));
+  }
   tracepoint(librados, rados_create2_exit, retval, *pcluster);
+
+  cct->put();
   return retval;
 }
 
@@ -2227,8 +2301,10 @@ extern "C" int rados_create2(rados_t *pcluster, const char *const clustername,
  */
 extern "C" int rados_create_with_context(rados_t *pcluster, rados_config_t cct_)
 {
-  tracepoint(librados, rados_create_with_context_enter, cct_);
   CephContext *cct = (CephContext *)cct_;
+  TracepointProvider::initialize<tracepoint_traits>(cct);
+
+  tracepoint(librados, rados_create_with_context_enter, cct_);
   librados::RadosClient *radosp = new librados::RadosClient(cct);
   *pcluster = (void *)radosp;
   tracepoint(librados, rados_create_with_context_exit, 0, *pcluster);
diff --git a/src/libradosstriper/Makefile.am b/src/libradosstriper/Makefile.am
index bd4c1d1..13b8b28 100644
--- a/src/libradosstriper/Makefile.am
+++ b/src/libradosstriper/Makefile.am
@@ -10,8 +10,8 @@ libradosstriper_la_SOURCES = \
 # We need this to avoid basename conflicts with the libradosstriper build tests in test/Makefile.am
 libradosstriper_la_CXXFLAGS = ${AM_CXXFLAGS}
 
-LIBRADOSSTRIPER_DEPS = $(LIBRADOS_DEPS)
-libradosstriper_la_LIBADD = $(LIBRADOSSTRIPER_DEPS)
+LIBRADOSSTRIPER_DEPS = librados_internal.la libcls_lock_client.la $(LIBOSDC) $(LIBCOMMON_DEPS)
+libradosstriper_la_LIBADD = $(LIBRADOSSTRIPER_DEPS) $(LIBRADOS) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
 libradosstriper_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
 if LINUX
 libradosstriper_la_LDFLAGS += -export-symbols-regex '^radosstriper_.*'
diff --git a/src/libradosstriper/RadosStriperImpl.cc b/src/libradosstriper/RadosStriperImpl.cc
index 0886f8b..3544cac 100644
--- a/src/libradosstriper/RadosStriperImpl.cc
+++ b/src/libradosstriper/RadosStriperImpl.cc
@@ -463,10 +463,10 @@ int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
   
   // create a completion object and transfer ownership of extents and resultbl
   vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size());
-  c->is_read = true;
-  c->io = m_ioCtxImpl;
   ReadCompletionData *cdata = new ReadCompletionData(this, soid, lockCookie, c,
 						     bl, extents, resultbl);
+  c->is_read = true;
+  c->io = m_ioCtxImpl;
   libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl;
   nc->set_complete_callback(cdata, striper_read_aio_req_complete);
   // go through the extents
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index 6222531..ec7f684 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -6,7 +6,6 @@
 #include "common/ceph_context.h"
 #include "common/dout.h"
 #include "common/errno.h"
-#include "common/WorkQueue.h"
 
 #include "librbd/AioRequest.h"
 #include "librbd/internal.h"
@@ -72,11 +71,11 @@ namespace librbd {
     elapsed = ceph_clock_now(cct) - start_time;
     switch (aio_type) {
     case AIO_TYPE_READ:
-      ictx->perfcounter->tinc(l_librbd_aio_rd_latency, elapsed); break;
+      ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); break;
     case AIO_TYPE_WRITE:
-      ictx->perfcounter->tinc(l_librbd_aio_wr_latency, elapsed); break;
+      ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed); break;
     case AIO_TYPE_DISCARD:
-      ictx->perfcounter->tinc(l_librbd_aio_discard_latency, elapsed); break;
+      ictx->perfcounter->tinc(l_librbd_discard_latency, elapsed); break;
     case AIO_TYPE_FLUSH:
       ictx->perfcounter->tinc(l_librbd_aio_flush_latency, elapsed); break;
     default:
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index 4fe53eb..fddf5fb 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -93,8 +93,12 @@ namespace librbd {
         ictx = i;
         aio_type = t;
         start_time = ceph_clock_now(ictx->cct);
-
-	async_op.start_op(*ictx);
+      }
+    }
+    void start_op(ImageCtx *i, aio_type_t t) {
+      init_time(i, t);
+      if (!async_op.started()) {
+        async_op.start_op(*ictx);
       }
     }
 
diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc
index 7dbec4a..b6fc1f9 100644
--- a/src/librbd/AioRequest.cc
+++ b/src/librbd/AioRequest.cc
@@ -82,7 +82,7 @@ namespace librbd {
 
   static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
     assert(ictx->snap_lock.is_locked());
-    return (ictx->cct->_conf->rbd_clone_copy_on_read) &&
+    return (ictx->clone_copy_on_read) &&
            (!ictx->read_only) && (snap_id == CEPH_NOSNAP);
   }
 
@@ -299,8 +299,8 @@ namespace librbd {
 
   bool AbstractWrite::should_complete(int r)
   {
-    ldout(m_ictx->cct, 20) << "write " << this << " " << m_oid << " "
-                           << m_object_off << "~" << m_object_len
+    ldout(m_ictx->cct, 20) << get_write_type() << " " << this << " " << m_oid
+                           << " " << m_object_off << "~" << m_object_len
 			   << " should_complete: r = " << r << dendl;
 
     bool finished = true;
@@ -386,41 +386,50 @@ namespace librbd {
 
   void AbstractWrite::send() {
     assert(m_ictx->owner_lock.is_locked());
-    ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
-			   << m_object_off << "~" << m_object_len << dendl;
+    ldout(m_ictx->cct, 20) << "send " << get_write_type() << " " << this <<" "
+                           << m_oid << " " << m_object_off << "~"
+                           << m_object_len << dendl;
     send_pre();
   }
 
   void AbstractWrite::send_pre() {
     assert(m_ictx->owner_lock.is_locked());
-    RWLock::RLocker snap_lock(m_ictx->snap_lock);
-    if (!m_ictx->object_map.enabled()) {
-      send_write();
-      return;
-    }
-
-    // should have been flushed prior to releasing lock
-    assert(m_ictx->image_watcher->is_lock_owner());
-
-    ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
-			   << m_object_off << "~" << m_object_len << dendl;
-    m_state = LIBRBD_AIO_WRITE_PRE;
 
-    uint8_t new_state;
-    boost::optional<uint8_t> current_state;
-    pre_object_map_update(&new_state);
+    bool write = false;
+    {
+      RWLock::RLocker snap_lock(m_ictx->snap_lock);
+      if (!m_ictx->object_map.enabled()) {
+        write = true;
+      } else {
+        // should have been flushed prior to releasing lock
+        assert(m_ictx->image_watcher->is_lock_owner());
+
+        ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
+          		       << m_object_off << "~" << m_object_len << dendl;
+        m_state = LIBRBD_AIO_WRITE_PRE;
+
+        uint8_t new_state;
+        boost::optional<uint8_t> current_state;
+        pre_object_map_update(&new_state);
+
+        RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+        if (m_ictx->object_map[m_object_no] != new_state) {
+          FunctionContext *ctx = new FunctionContext(
+            boost::bind(&AioRequest::complete, this, _1));
+          bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
+                                                       current_state, ctx);
+          assert(updated);
+        } else {
+          write = true;
+        }
+      }
+    }
 
-    RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
-    if (m_ictx->object_map[m_object_no] == new_state) {
+    // avoid possible recursive lock attempts
+    if (write) {
+      // no object map update required
       send_write();
-      return;
     }
-
-    FunctionContext *ctx = new FunctionContext(
-      boost::bind(&AioRequest::complete, this, _1));
-    bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
-                                                 current_state, ctx);
-    assert(updated);
   }
 
   bool AbstractWrite::send_post() {
@@ -497,8 +506,21 @@ namespace librbd {
   }
 
   void AioWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
-    wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
-    wr->write(m_object_off, m_write_data);
+    if (m_ictx->enable_alloc_hint && !m_ictx->object_map.object_may_exist(m_object_no))
+      wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
+    if (m_object_off == 0 && m_object_len == m_ictx->get_object_size()) {
+      wr->write_full(m_write_data);
+    } else {
+      wr->write(m_object_off, m_write_data);
+    }
     wr->set_op_flags2(m_op_flags);
   }
+
+  void AioRemove::guard_write() {
+    // do nothing to disable write guard only if deep-copyup not required
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    if (!m_ictx->snaps.empty()) {
+      AbstractWrite::guard_write();
+    }
+  }
 }
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
index 4fff5ef..885cbce 100644
--- a/src/librbd/AioRequest.h
+++ b/src/librbd/AioRequest.h
@@ -179,6 +179,7 @@ namespace librbd {
     std::vector<librados::snap_t> m_snaps;
 
     virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
+    virtual const char* get_write_type() const = 0;
     virtual void guard_write();
     virtual void pre_object_map_update(uint8_t *new_state) = 0;
     virtual bool post_object_map_update() {
@@ -208,6 +209,11 @@ namespace librbd {
     }
   protected:
     virtual void add_write_ops(librados::ObjectWriteOperation *wr);
+
+    virtual const char* get_write_type() const {
+      return "write";
+    }
+
     virtual void pre_object_map_update(uint8_t *new_state) {
       *new_state = OBJECT_EXISTS;
     }
@@ -235,6 +241,12 @@ namespace librbd {
       }
     }
 
+    virtual const char* get_write_type() const {
+      if (has_parent()) {
+        return "remove (trunc)";
+      }
+      return "remove";
+    }
     virtual void pre_object_map_update(uint8_t *new_state) {
       if (has_parent()) {
 	m_object_state = OBJECT_EXISTS;
@@ -251,14 +263,37 @@ namespace librbd {
       return true;
     }
 
-    virtual void guard_write() {
-      // do nothing to disable write guard
-    }
+    virtual void guard_write();
 
   private:
     uint8_t m_object_state;
   };
 
+  class AioTrim : public AbstractWrite {
+  public:
+    AioTrim(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+            const ::SnapContext &snapc, Context *completion)
+      : AbstractWrite(ictx, oid, object_no, 0, 0, snapc, completion, true) {
+    }
+
+  protected:
+    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+      wr->remove();
+    }
+
+    virtual const char* get_write_type() const {
+      return "remove (trim)";
+    }
+
+    virtual void pre_object_map_update(uint8_t *new_state) {
+      *new_state = OBJECT_PENDING;
+    }
+
+    virtual bool post_object_map_update() {
+      return true;
+    }
+  };
+
   class AioTruncate : public AbstractWrite {
   public:
     AioTruncate(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
@@ -274,6 +309,10 @@ namespace librbd {
       wr->truncate(m_object_off);
     }
 
+    virtual const char* get_write_type() const {
+      return "truncate";
+    }
+
     virtual void pre_object_map_update(uint8_t *new_state) {
       *new_state = OBJECT_EXISTS;
     }
@@ -294,6 +333,10 @@ namespace librbd {
       wr->zero(m_object_off, m_object_len);
     }
 
+    virtual const char* get_write_type() const {
+      return "zero";
+    }
+
     virtual void pre_object_map_update(uint8_t *new_state) {
       *new_state = OBJECT_EXISTS;
     }
diff --git a/src/librbd/AsyncFlattenRequest.cc b/src/librbd/AsyncFlattenRequest.cc
index bd1875c..9136220 100644
--- a/src/librbd/AsyncFlattenRequest.cc
+++ b/src/librbd/AsyncFlattenRequest.cc
@@ -18,11 +18,11 @@
 
 namespace librbd {
 
-class AsyncFlattenObjectContext : public C_AsyncObjectThrottle {
+class AsyncFlattenObjectContext : public C_AsyncObjectThrottle<> {
 public:
-  AsyncFlattenObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx,
-                            uint64_t object_size, ::SnapContext snapc,
-                            uint64_t object_no)
+  AsyncFlattenObjectContext(AsyncObjectThrottle<> &throttle,
+                            ImageCtx *image_ctx, uint64_t object_size,
+                            ::SnapContext snapc, uint64_t object_no)
     : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_size(object_size),
       m_snapc(snapc), m_object_no(object_no)
   {
@@ -94,14 +94,14 @@ void AsyncFlattenRequest::send() {
   ldout(cct, 5) << this << " send" << dendl;
 
   m_state = STATE_FLATTEN_OBJECTS;
-  AsyncObjectThrottle::ContextFactory context_factory(
+  AsyncObjectThrottle<>::ContextFactory context_factory(
     boost::lambda::bind(boost::lambda::new_ptr<AsyncFlattenObjectContext>(),
       boost::lambda::_1, &m_image_ctx, m_object_size, m_snapc,
       boost::lambda::_2));
-  AsyncObjectThrottle *throttle = new AsyncObjectThrottle(
-    this, m_image_ctx, context_factory, create_callback_context(), m_prog_ctx,
+  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
+    this, m_image_ctx, context_factory, create_callback_context(), &m_prog_ctx,
     0, m_overlap_objects);
-  throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
+  throttle->start_ops(m_image_ctx.concurrent_management_ops);
 }
 
 bool AsyncFlattenRequest::send_update_header() {
@@ -155,7 +155,8 @@ bool AsyncFlattenRequest::send_update_children() {
   // (if snapshots remain, they have their own parent info, and the child
   // will be removed when the last snap goes away)
   RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
-  if (!m_image_ctx.snaps.empty()) {
+  if ((m_image_ctx.features & RBD_FEATURE_DEEP_FLATTEN) == 0 &&
+      !m_image_ctx.snaps.empty()) {
     return true;
   }
 
diff --git a/src/librbd/AsyncFlattenRequest.h b/src/librbd/AsyncFlattenRequest.h
index 3f0c612..01f1667 100644
--- a/src/librbd/AsyncFlattenRequest.h
+++ b/src/librbd/AsyncFlattenRequest.h
@@ -12,7 +12,7 @@ namespace librbd {
 class ImageCtx;
 class ProgressContext;
 
-class AsyncFlattenRequest : public AsyncRequest
+class AsyncFlattenRequest : public AsyncRequest<>
 {
 public:
   AsyncFlattenRequest(ImageCtx &image_ctx, Context *on_finish,
@@ -34,6 +34,8 @@ private:
    * Flatten goes through the following state machine to copyup objects
    * from the parent image:
    *
+   * @verbatim
+   *
    * <start>
    *    |
    *    v
@@ -49,6 +51,8 @@ private:
    *           .                                   .
    *           . . . . . . . . . . . . . . . . . . .
    *
+   * @endverbatim
+   *
    * The _UPDATE_CHILDREN state will be skipped if the image has one or
    * more snapshots. The _UPDATE_HEADER state will be skipped if the
    * image was concurrently flattened by another client.
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
index 2c7ccd1..59b3a1f 100644
--- a/src/librbd/AsyncObjectThrottle.cc
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -3,6 +3,7 @@
 #include "librbd/AsyncObjectThrottle.h"
 #include "include/rbd/librbd.hpp"
 #include "common/RWLock.h"
+#include "common/WorkQueue.h"
 #include "librbd/AsyncRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
@@ -10,17 +11,11 @@
 namespace librbd
 {
 
-void C_AsyncObjectThrottle::finish(int r) {
-  RWLock::RLocker l(m_image_ctx.owner_lock);
-  m_finisher.finish_op(r);
-}
-
-AsyncObjectThrottle::AsyncObjectThrottle(const AsyncRequest* async_request,
-                                         ImageCtx &image_ctx,
-                                         const ContextFactory& context_factory,
-				 	 Context *ctx, ProgressContext &prog_ctx,
-					 uint64_t object_no,
-					 uint64_t end_object_no)
+template <typename T>
+AsyncObjectThrottle<T>::AsyncObjectThrottle(
+    const AsyncRequest<T>* async_request, T &image_ctx,
+    const ContextFactory& context_factory, Context *ctx,
+    ProgressContext *prog_ctx, uint64_t object_no, uint64_t end_object_no)
   : m_lock(unique_lock_name("librbd::AsyncThrottle::m_lock", this)),
     m_async_request(async_request), m_image_ctx(image_ctx),
     m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx),
@@ -29,7 +24,8 @@ AsyncObjectThrottle::AsyncObjectThrottle(const AsyncRequest* async_request,
 {
 }
 
-void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) {
+template <typename T>
+void AsyncObjectThrottle<T>::start_ops(uint64_t max_concurrent) {
   assert(m_image_ctx.owner_lock.is_locked());
   bool complete;
   {
@@ -48,11 +44,12 @@ void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) {
   }
 }
 
-void AsyncObjectThrottle::finish_op(int r) {
+template <typename T>
+void AsyncObjectThrottle<T>::finish_op(int r) {
   assert(m_image_ctx.owner_lock.is_locked());
   bool complete;
   {
-    Mutex::Locker l(m_lock);
+    Mutex::Locker locker(m_lock);
     --m_current_ops;
     if (r < 0 && r != -ENOENT && m_ret == 0) {
       m_ret = r;
@@ -67,10 +64,12 @@ void AsyncObjectThrottle::finish_op(int r) {
   }
 }
 
-void AsyncObjectThrottle::start_next_op() {
+template <typename T>
+void AsyncObjectThrottle<T>::start_next_op() {
   bool done = false;
   while (!done) {
-    if (m_async_request->is_canceled() && m_ret == 0) {
+    if (m_async_request != NULL && m_async_request->is_canceled() &&
+        m_ret == 0) {
       // allow in-flight ops to complete, but don't start new ops
       m_ret = -ERESTART;
       return;
@@ -79,7 +78,7 @@ void AsyncObjectThrottle::start_next_op() {
     }
 
     uint64_t ono = m_object_no++;
-    C_AsyncObjectThrottle *ctx = m_context_factory(*this, ono);
+    C_AsyncObjectThrottle<T> *ctx = m_context_factory(*this, ono);
 
     int r = ctx->send();
     if (r < 0) {
@@ -93,8 +92,12 @@ void AsyncObjectThrottle::start_next_op() {
       ++m_current_ops;
       done = true;
     }
-    m_prog_ctx.update_progress(ono, m_end_object_no);
+    if (m_prog_ctx != NULL) {
+      m_prog_ctx->update_progress(ono, m_end_object_no);
+    }
   }
 }
 
 } // namespace librbd
+
+template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
index f7f254fb..a831051 100644
--- a/src/librbd/AsyncObjectThrottle.h
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -5,13 +5,14 @@
 
 #include "include/int_types.h"
 #include "include/Context.h"
+#include "common/RWLock.h"
 
 #include <boost/function.hpp>
 #include "include/assert.h"
 
 namespace librbd
 {
-class AsyncRequest;
+template <typename ImageCtxT> class AsyncRequest;
 class ProgressContext;
 struct ImageCtx;
 
@@ -21,33 +22,39 @@ public:
   virtual void finish_op(int r) = 0;
 };
 
+template <typename ImageCtxT = ImageCtx>
 class C_AsyncObjectThrottle : public Context {
 public:
   C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher,
-                        ImageCtx &image_ctx)
-    : m_image_ctx(image_ctx), m_finisher(finisher)
-  {
+                        ImageCtxT &image_ctx)
+    : m_image_ctx(image_ctx), m_finisher(finisher) {
   }
 
   virtual int send() = 0;
 
 protected:
-  ImageCtx &m_image_ctx;
+  ImageCtxT &m_image_ctx;
 
-  virtual void finish(int r);
+  virtual void finish(int r) {
+    RWLock::RLocker locker(m_image_ctx.owner_lock);
+    m_finisher.finish_op(r);
+  }
 
 private:
   AsyncObjectThrottleFinisher &m_finisher;
 };
 
+template <typename ImageCtxT = ImageCtx>
 class AsyncObjectThrottle : public AsyncObjectThrottleFinisher {
 public:
-  typedef boost::function<C_AsyncObjectThrottle*(AsyncObjectThrottle&,
-      					   uint64_t)> ContextFactory;
+  typedef boost::function<
+    C_AsyncObjectThrottle<ImageCtxT>* (AsyncObjectThrottle&,
+                                       uint64_t)> ContextFactory;
 
-  AsyncObjectThrottle(const AsyncRequest *async_request, ImageCtx &image_ctx,
+  AsyncObjectThrottle(const AsyncRequest<ImageCtxT> *async_request,
+                      ImageCtxT &image_ctx,
                       const ContextFactory& context_factory, Context *ctx,
-		      ProgressContext &prog_ctx, uint64_t object_no,
+		      ProgressContext *prog_ctx, uint64_t object_no,
 		      uint64_t end_object_no);
 
   void start_ops(uint64_t max_concurrent);
@@ -55,11 +62,11 @@ public:
 
 private:
   Mutex m_lock;
-  const AsyncRequest *m_async_request;
-  ImageCtx &m_image_ctx;
+  const AsyncRequest<ImageCtxT> *m_async_request;
+  ImageCtxT &m_image_ctx;
   ContextFactory m_context_factory;
   Context *m_ctx;
-  ProgressContext &m_prog_ctx;
+  ProgressContext *m_prog_ctx;
   uint64_t m_object_no;
   uint64_t m_end_object_no;
   uint64_t m_current_ops;
@@ -70,4 +77,6 @@ private:
 
 } // namespace librbd
 
+extern template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+
 #endif // CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
index 2f0c2d9..b6e41eb 100644
--- a/src/librbd/AsyncRequest.cc
+++ b/src/librbd/AsyncRequest.cc
@@ -1,43 +1,52 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 #include "librbd/AsyncRequest.h"
-#include "common/WorkQueue.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
+#include "common/WorkQueue.h"
 #include <boost/bind.hpp>
 
 namespace librbd
 {
 
-AsyncRequest::AsyncRequest(ImageCtx &image_ctx, Context *on_finish)
+template <typename T>
+AsyncRequest<T>::AsyncRequest(T &image_ctx, Context *on_finish)
   : m_image_ctx(image_ctx), m_on_finish(on_finish), m_canceled(false),
     m_xlist_item(this) {
+  assert(m_on_finish != NULL);
   Mutex::Locker l(m_image_ctx.async_ops_lock);
   m_image_ctx.async_requests.push_back(&m_xlist_item);
 }
 
-AsyncRequest::~AsyncRequest() {
+template <typename T>
+AsyncRequest<T>::~AsyncRequest() {
   Mutex::Locker l(m_image_ctx.async_ops_lock);
   assert(m_xlist_item.remove_myself());
   m_image_ctx.async_requests_cond.Signal();
 }
 
-void AsyncRequest::async_complete(int r) {
+template <typename T>
+void AsyncRequest<T>::async_complete(int r) {
   m_image_ctx.op_work_queue->queue(create_callback_context(), r);
 }
 
-librados::AioCompletion *AsyncRequest::create_callback_completion() {
+template <typename T>
+librados::AioCompletion *AsyncRequest<T>::create_callback_completion() {
   return librados::Rados::aio_create_completion(create_callback_context(),
 						NULL, rados_ctx_cb);
 }
 
-Context *AsyncRequest::create_callback_context() {
-  return new FunctionContext(boost::bind(&AsyncRequest::complete, this, _1));
+template <typename T>
+Context *AsyncRequest<T>::create_callback_context() {
+  return new FunctionContext(boost::bind(&AsyncRequest<T>::complete, this, _1));
 }
 
-Context *AsyncRequest::create_async_callback_context() {
-  return new FunctionContext(boost::bind(&AsyncRequest::async_complete, this,
+template <typename T>
+Context *AsyncRequest<T>::create_async_callback_context() {
+  return new FunctionContext(boost::bind(&AsyncRequest<T>::async_complete, this,
                                          _1));;
 }
 
 } // namespace librbd
+
+template class librbd::AsyncRequest<librbd::ImageCtx>;
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
index 7324a22..241c90e 100644
--- a/src/librbd/AsyncRequest.h
+++ b/src/librbd/AsyncRequest.h
@@ -7,15 +7,17 @@
 #include "include/Context.h"
 #include "include/rados/librados.hpp"
 #include "include/xlist.h"
+#include "include/compat.h"
 
 namespace librbd {
 
 class ImageCtx;
 
+template <typename ImageCtxT = ImageCtx>
 class AsyncRequest
 {
 public:
-  AsyncRequest(ImageCtx &image_ctx, Context *on_finish);
+  AsyncRequest(ImageCtxT &image_ctx, Context *on_finish);
   virtual ~AsyncRequest();
 
   void complete(int r) {
@@ -38,7 +40,7 @@ public:
   }
 
 protected:
-  ImageCtx &m_image_ctx;
+  ImageCtxT &m_image_ctx;
   Context *m_on_finish;
 
   librados::AioCompletion *create_callback_completion();
@@ -56,26 +58,11 @@ protected:
   }
 private:
   bool m_canceled;
-  xlist<AsyncRequest *>::item m_xlist_item;
-};
-
-class C_AsyncRequest : public Context
-{
-public:
-  C_AsyncRequest(AsyncRequest *req)
-    : m_req(req)
-  {
-  }
-
-protected:
-  virtual void finish(int r) {
-    m_req->complete(r);
-  }
-
-private:
-  AsyncRequest *m_req;
+  typename xlist<AsyncRequest<ImageCtxT> *>::item m_xlist_item;
 };
 
 } // namespace librbd
 
+extern template class librbd::AsyncRequest<librbd::ImageCtx>;
+
 #endif //CEPH_LIBRBD_ASYNC_REQUEST_H
diff --git a/src/librbd/AsyncResizeRequest.cc b/src/librbd/AsyncResizeRequest.cc
index 8ddf967..732e3f7 100644
--- a/src/librbd/AsyncResizeRequest.cc
+++ b/src/librbd/AsyncResizeRequest.cc
@@ -98,7 +98,6 @@ bool AsyncResizeRequest::should_complete(int r) {
     ldout(cct, 5) << "UPDATE_HEADER" << dendl;
     if (send_shrink_object_map()) {
       update_size_and_overlap();
-      increment_refresh_seq();
       return true;
     }
     break;
@@ -106,7 +105,6 @@ bool AsyncResizeRequest::should_complete(int r) {
   case STATE_SHRINK_OBJECT_MAP:
     ldout(cct, 5) << "SHRINK_OBJECT_MAP" << dendl;
     update_size_and_overlap();
-    increment_refresh_seq();
     return true;
 
   default:
@@ -273,12 +271,6 @@ void AsyncResizeRequest::compute_parent_overlap() {
   }
 }
 
-void AsyncResizeRequest::increment_refresh_seq() {
-  m_image_ctx.refresh_lock.Lock();
-  ++m_image_ctx.refresh_seq;
-  m_image_ctx.refresh_lock.Unlock();
-}
-
 void AsyncResizeRequest::update_size_and_overlap() {
   RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
   m_image_ctx.size = m_new_size;
diff --git a/src/librbd/AsyncResizeRequest.h b/src/librbd/AsyncResizeRequest.h
index a719c9d..0acad6f 100644
--- a/src/librbd/AsyncResizeRequest.h
+++ b/src/librbd/AsyncResizeRequest.h
@@ -12,7 +12,7 @@ namespace librbd
 class ImageCtx;
 class ProgressContext;
 
-class AsyncResizeRequest : public AsyncRequest
+class AsyncResizeRequest : public AsyncRequest<>
 {
 public:
   AsyncResizeRequest(ImageCtx &image_ctx, Context *on_finish, uint64_t new_size,
@@ -29,15 +29,13 @@ public:
     return m_new_size;
   }
 
-  inline uint64_t get_parent_overlap() const {
-    return m_new_parent_overlap;
-  }
-
 private:
   /**
    * Resize goes through the following state machine to resize the image
    * and update the object map:
    *
+   * @verbatim
+   *
    * <start> -------------> STATE_FINISHED -----------------------------\
    *  |  .    (no change)                                               |
    *  |  .                                                              |
@@ -60,6 +58,8 @@ private:
    *                                             v                   v  v
    *                                  STATE_SHRINK_OBJECT_MAP ---> <finish>
    *
+   * @endverbatim
+   *
    * The _OBJECT_MAP states are skipped if the object map isn't enabled.
    * The state machine will immediately transition to _FINISHED if there
    * are no objects to trim.
@@ -93,7 +93,6 @@ private:
   void send_update_header();
 
   void compute_parent_overlap();
-  void increment_refresh_seq();
   void update_size_and_overlap();
 
 };
diff --git a/src/librbd/AsyncTrimRequest.cc b/src/librbd/AsyncTrimRequest.cc
index 20f7102..90668ce 100644
--- a/src/librbd/AsyncTrimRequest.cc
+++ b/src/librbd/AsyncTrimRequest.cc
@@ -24,10 +24,37 @@
 namespace librbd
 {
 
-class AsyncTrimObjectContext : public C_AsyncObjectThrottle {
+class C_CopyupObject : public C_AsyncObjectThrottle<> {
 public:
-  AsyncTrimObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx,
-			 uint64_t object_no)
+  C_CopyupObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
+                 ::SnapContext snapc, uint64_t object_no)
+    : C_AsyncObjectThrottle(throttle, *image_ctx), m_snapc(snapc),
+      m_object_no(object_no)
+  {
+  }
+
+  virtual int send() {
+    assert(m_image_ctx.owner_lock.is_locked());
+    assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+           m_image_ctx.image_watcher->is_lock_owner());
+
+    string oid = m_image_ctx.get_object_name(m_object_no);
+    ldout(m_image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
+
+    AbstractWrite *req = new AioTrim(&m_image_ctx, oid, m_object_no, m_snapc,
+                                     this);
+    req->send();
+    return 0;
+  }
+private:
+  ::SnapContext m_snapc;
+  uint64_t m_object_no;
+};
+
+class C_RemoveObject : public C_AsyncObjectThrottle<> {
+public:
+  C_RemoveObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
+                 uint64_t object_no)
     : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no)
   {
   }
@@ -58,7 +85,8 @@ private:
 AsyncTrimRequest::AsyncTrimRequest(ImageCtx &image_ctx, Context *on_finish,
 				   uint64_t original_size, uint64_t new_size,
 				   ProgressContext &prog_ctx)
-  : AsyncRequest(image_ctx, on_finish), m_new_size(new_size), m_prog_ctx(prog_ctx)
+  : AsyncRequest(image_ctx, on_finish), m_new_size(new_size),
+    m_prog_ctx(prog_ctx)
 {
   uint64_t period = m_image_ctx.get_stripe_period();
   uint64_t new_num_periods = ((m_new_size + period - 1) / period);
@@ -86,6 +114,11 @@ bool AsyncTrimRequest::should_complete(int r)
   }
 
   switch (m_state) {
+  case STATE_COPYUP_OBJECTS:
+    ldout(cct, 5) << " COPYUP_OBJECTS" << dendl;
+    send_pre_remove();
+    break;
+
   case STATE_PRE_REMOVE:
     ldout(cct, 5) << " PRE_REMOVE" << dendl;
     {
@@ -109,7 +142,7 @@ bool AsyncTrimRequest::should_complete(int r)
 
   case STATE_CLEAN_BOUNDARY:
     ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl;
-    finish();
+    finish(0);
     break;
 
   case STATE_FINISHED:
@@ -125,17 +158,62 @@ bool AsyncTrimRequest::should_complete(int r)
 }
 
 void AsyncTrimRequest::send() {
+  send_copyup_objects();
+}
+
+void AsyncTrimRequest::send_copyup_objects() {
   assert(m_image_ctx.owner_lock.is_locked());
-  if (m_delete_start < m_num_objects) {
-    send_pre_remove();
-  } else {
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+
+  if (m_delete_start >= m_num_objects) {
     send_clean_boundary();
+    return;
   }
+
+  ::SnapContext snapc;
+  bool has_snapshots;
+  uint64_t parent_overlap;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+
+    snapc = m_image_ctx.snapc;
+    has_snapshots = !m_image_ctx.snaps.empty();
+    int r = m_image_ctx.get_parent_overlap(m_image_ctx.get_copyup_snap_id(),
+                                           &parent_overlap);
+    assert(r == 0);
+  }
+
+  // copyup is only required for portion of image that overlaps parent
+  uint64_t copyup_end = Striper::get_num_objects(m_image_ctx.layout,
+                                                 parent_overlap);
+  // TODO: protect against concurrent shrink and snap create?
+  if (copyup_end <= m_delete_start || !has_snapshots) {
+    send_pre_remove();
+    return;
+  }
+
+  uint64_t copyup_start = m_delete_start;
+  m_delete_start = copyup_end;
+
+  ldout(m_image_ctx.cct, 5) << this << " send_copyup_objects: "
+			    << " start object=" << copyup_start << ", "
+			    << " end object=" << copyup_end << dendl;
+  m_state = STATE_COPYUP_OBJECTS;
+
+  Context *ctx = create_callback_context();
+  AsyncObjectThrottle<>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_CopyupObject>(),
+      boost::lambda::_1, &m_image_ctx, snapc, boost::lambda::_2));
+  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
+    this, m_image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start,
+    copyup_end);
+  throttle->start_ops(m_image_ctx.concurrent_management_ops);
 }
 
 void AsyncTrimRequest::send_remove_objects() {
   assert(m_image_ctx.owner_lock.is_locked());
-  CephContext *cct = m_image_ctx.cct;
 
   ldout(m_image_ctx.cct, 5) << this << " send_remove_objects: "
 			    << " delete_start=" << m_delete_start
@@ -143,17 +221,21 @@ void AsyncTrimRequest::send_remove_objects() {
   m_state = STATE_REMOVE_OBJECTS;
 
   Context *ctx = create_callback_context();
-  AsyncObjectThrottle::ContextFactory context_factory(
-    boost::lambda::bind(boost::lambda::new_ptr<AsyncTrimObjectContext>(),
+  AsyncObjectThrottle<>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_RemoveObject>(),
       boost::lambda::_1, &m_image_ctx, boost::lambda::_2));
-  AsyncObjectThrottle *throttle = new AsyncObjectThrottle(
-    this, m_image_ctx, context_factory, ctx, m_prog_ctx, m_delete_start,
+  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
+    this, m_image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start,
     m_num_objects);
-  throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
+  throttle->start_ops(m_image_ctx.concurrent_management_ops);
 }
 
 void AsyncTrimRequest::send_pre_remove() {
   assert(m_image_ctx.owner_lock.is_locked());
+  if (m_delete_start >= m_num_objects) {
+    send_clean_boundary();
+    return;
+  }
 
   bool remove_objects = false;
   {
@@ -226,16 +308,17 @@ void AsyncTrimRequest::send_clean_boundary() {
   assert(m_image_ctx.owner_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
   if (m_delete_off <= m_new_size) {
-    finish();
+    finish(0);
     return;
   }
 
   // should have been canceled prior to releasing lock
   assert(!m_image_ctx.image_watcher->is_lock_supported() ||
          m_image_ctx.image_watcher->is_lock_owner());
+  uint64_t delete_len = m_delete_off - m_new_size;
   ldout(m_image_ctx.cct, 5) << this << " send_clean_boundary: "
-			    << " delete_start=" << m_delete_start
-			    << " num_objects=" << m_num_objects << dendl;
+			    << " delete_off=" << m_delete_off
+			    << " length=" << delete_len << dendl;
   m_state = STATE_CLEAN_BOUNDARY;
 
   ::SnapContext snapc;
@@ -247,8 +330,8 @@ void AsyncTrimRequest::send_clean_boundary() {
   // discard the weird boundary
   std::vector<ObjectExtent> extents;
   Striper::file_to_extents(cct, m_image_ctx.format_string,
-			   &m_image_ctx.layout, m_new_size,
-			   m_delete_off - m_new_size, 0, extents);
+			   &m_image_ctx.layout, m_new_size, delete_len, 0,
+                           extents);
 
   ContextCompletion *completion =
     new ContextCompletion(create_callback_context(), true);
@@ -259,8 +342,8 @@ void AsyncTrimRequest::send_clean_boundary() {
 
     AbstractWrite *req;
     if (p->offset == 0) {
-      req = new AioRemove(&m_image_ctx, p->oid.name, p->objectno, snapc,
-                          req_comp);
+      req = new AioTrim(&m_image_ctx, p->oid.name, p->objectno, snapc,
+                        req_comp);
     } else {
       req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno,
                             p->offset, snapc, req_comp);
@@ -270,9 +353,9 @@ void AsyncTrimRequest::send_clean_boundary() {
   completion->finish_adding_requests();
 }
 
-void AsyncTrimRequest::finish() {
+void AsyncTrimRequest::finish(int r) {
   m_state = STATE_FINISHED;
-  async_complete(0);
+  async_complete(r);
 }
 
 } // namespace librbd
diff --git a/src/librbd/AsyncTrimRequest.h b/src/librbd/AsyncTrimRequest.h
index d4d6af9..2160c40 100644
--- a/src/librbd/AsyncTrimRequest.h
+++ b/src/librbd/AsyncTrimRequest.h
@@ -11,7 +11,7 @@ namespace librbd
 class ImageCtx;
 class ProgressContext;
 
-class AsyncTrimRequest : public AsyncRequest
+class AsyncTrimRequest : public AsyncRequest<>
 {
 public:
   AsyncTrimRequest(ImageCtx &image_ctx, Context *on_finish,
@@ -25,11 +25,17 @@ protected:
    * Trim goes through the following state machine to remove whole objects,
    * clean partially trimmed objects, and update the object map:
    *
+   * @verbatim
+   *
    *     <start> . . . . > STATE_FINISHED . . . . . . . . .
    *      |   .                                           .
    *      |   . . . . . . . . . . . .                     .
    *      |                         .                     .
-   *      v                         v                     .
+   *      v                         .                     .
+   * STATE_COPYUP_OBJECTS . . .     .                     .
+   *      |                   .     .                     .
+   *      |                   .     .                     .
+   *      v                   v     v                     .
    * STATE_PRE_REMOVE ---> STATE_REMOVE_OBJECTS           .
    *                                |   .   .             .
    *        /-----------------------/   .   . . . . . .   .
@@ -40,6 +46,10 @@ protected:
    *        .                                           .
    *        . . . . . . . . . . . . . . . . . . . . . . .
    *
+   * @endverbatim
+   *
+   * The _COPYUP_OBJECTS state is skipped if there is no parent overlap
+   * within the new image size and the image does not have any snapshots.
    * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map
    * isn't enabled. The _REMOVE_OBJECTS state is skipped if no whole objects
    * are removed.  The _CLEAN_BOUNDARY state is skipped if no boundary
@@ -48,6 +58,7 @@ protected:
    */ 
 
   enum State {
+    STATE_COPYUP_OBJECTS,
     STATE_PRE_REMOVE,
     STATE_REMOVE_OBJECTS,
     STATE_POST_REMOVE,
@@ -66,11 +77,12 @@ private:
   uint64_t m_new_size;
   ProgressContext &m_prog_ctx;
 
+  void send_copyup_objects();
   void send_remove_objects();
   void send_pre_remove();
   void send_post_remove();
   void send_clean_boundary();
-  void finish();
+  void finish(int r);
 };
 
 } // namespace librbd
diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc
index 1535cde..667d19d 100644
--- a/src/librbd/CopyupRequest.cc
+++ b/src/librbd/CopyupRequest.cc
@@ -8,12 +8,15 @@
 
 #include "librbd/AioCompletion.h"
 #include "librbd/AioRequest.h"
+#include "librbd/AsyncObjectThrottle.h"
 #include "librbd/CopyupRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/ObjectMap.h"
 
 #include <boost/bind.hpp>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
 
 #define dout_subsys ceph_subsys_rbd
 #undef dout_prefix
@@ -21,6 +24,52 @@
 
 namespace librbd {
 
+namespace {
+
+class UpdateObjectMap : public C_AsyncObjectThrottle<> {
+public:
+  UpdateObjectMap(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
+                  uint64_t object_no, const std::vector<uint64_t> *snap_ids,
+                  size_t snap_id_idx)
+    : C_AsyncObjectThrottle(throttle, *image_ctx),
+      m_object_no(object_no), m_snap_ids(*snap_ids), m_snap_id_idx(snap_id_idx)
+  {
+  }
+
+  virtual int send() {
+    assert(m_image_ctx.owner_lock.is_locked());
+    uint64_t snap_id = m_snap_ids[m_snap_id_idx];
+    if (snap_id == CEPH_NOSNAP) {
+      RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+      RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+      assert(m_image_ctx.image_watcher->is_lock_owner());
+      bool sent = m_image_ctx.object_map.aio_update(m_object_no, OBJECT_EXISTS,
+                                                    boost::optional<uint8_t>(),
+                                                    this);
+      return (sent ? 0 : 1);
+    }
+
+    uint8_t state = OBJECT_EXISTS;
+    if (m_image_ctx.test_features(RBD_FEATURE_FAST_DIFF) &&
+        m_snap_id_idx + 1 < m_snap_ids.size()) {
+      state = OBJECT_EXISTS_CLEAN;
+    }
+
+    RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock);
+    m_image_ctx.object_map.aio_update(snap_id, m_object_no, m_object_no + 1,
+                                      state, boost::optional<uint8_t>(), this);
+    return 0;
+  }
+
+private:
+  uint64_t m_object_no;
+  const std::vector<uint64_t> &m_snap_ids;
+  size_t m_snap_id_idx;
+};
+
+} // anonymous namespace
+
+
   CopyupRequest::CopyupRequest(ImageCtx *ictx, const std::string &oid,
                                uint64_t objectno,
 			       vector<pair<uint64_t,uint64_t> >& image_extents)
@@ -52,40 +101,76 @@ namespace librbd {
   }
 
   bool CopyupRequest::send_copyup() {
+    bool add_copyup_op = !m_copyup_data.is_zero();
+    bool copy_on_read = m_pending_requests.empty();
+    if (!add_copyup_op && copy_on_read) {
+      // copyup empty object to prevent future CoR attempts
+      m_copyup_data.clear();
+      add_copyup_op = true;
+    }
+
+    ldout(m_ictx->cct, 20) << __func__ << " " << this
+			   << ": oid " << m_oid << dendl;
+    m_state = STATE_COPYUP;
+
     m_ictx->snap_lock.get_read();
     ::SnapContext snapc = m_ictx->snapc;
     m_ictx->snap_lock.put_read();
 
     std::vector<librados::snap_t> snaps;
-    snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
 
-    librados::ObjectWriteOperation copyup_op;
-    if (!m_copyup_data.is_zero()) {
-      copyup_op.exec("rbd", "copyup", m_copyup_data);
+    if (!copy_on_read) {
+      m_pending_copyups.inc();
     }
 
-    // merge all pending write ops into this single RADOS op
-    for (size_t i=0; i<m_pending_requests.size(); ++i) {
-      AioRequest *req = m_pending_requests[i];
-      ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req << dendl;
-      req->add_copyup_ops(&copyup_op);
-    }
+    int r;
+    if (copy_on_read || (!snapc.snaps.empty() && add_copyup_op)) {
+      assert(add_copyup_op);
+      add_copyup_op = false;
+
+      librados::ObjectWriteOperation copyup_op;
+      copyup_op.exec("rbd", "copyup", m_copyup_data);
 
-    if (copyup_op.size() == 0) {
-      return true;
+      // send only the copyup request with a blank snapshot context so that
+      // all snapshots are detected from the parent for this object.  If
+      // this is a CoW request, a second request will be created for the
+      // actual modification.
+      m_pending_copyups.inc();
+
+      ldout(m_ictx->cct, 20) << __func__ << " " << this << " copyup with "
+                             << "empty snapshot context" << dendl;
+      librados::AioCompletion *comp =
+        librados::Rados::aio_create_completion(create_callback_context(), NULL,
+                                               rados_ctx_cb);
+      r = m_ictx->md_ctx.aio_operate(m_oid, comp, &copyup_op, 0, snaps);
+      assert(r == 0);
+      comp->release();
     }
 
-    ldout(m_ictx->cct, 20) << __func__ << " " << this
-			   << ": oid " << m_oid << dendl;
-    m_state = STATE_COPYUP;
+    if (!copy_on_read) {
+      librados::ObjectWriteOperation write_op;
+      if (add_copyup_op) {
+        // CoW did not need to handle existing snapshots
+        write_op.exec("rbd", "copyup", m_copyup_data);
+      }
 
-    librados::AioCompletion *comp =
-      librados::Rados::aio_create_completion(create_callback_context(), NULL,
-                                             rados_ctx_cb);
-    int r = m_ictx->md_ctx.aio_operate(m_oid, comp, &copyup_op, snapc.seq.val,
-                                       snaps);
-    assert(r == 0);
-    comp->release();
+      // merge all pending write ops into this single RADOS op
+      for (size_t i=0; i<m_pending_requests.size(); ++i) {
+        AioRequest *req = m_pending_requests[i];
+        ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req
+                               << dendl;
+        req->add_copyup_ops(&write_op);
+      }
+      assert(write_op.size() != 0);
+
+      snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
+      librados::AioCompletion *comp =
+        librados::Rados::aio_create_completion(create_callback_context(), NULL,
+                                               rados_ctx_cb);
+      r = m_ictx->data_ctx.aio_operate(m_oid, comp, &write_op);
+      assert(r == 0);
+      comp->release();
+    }
     return false;
   }
 
@@ -118,6 +203,7 @@ namespace librbd {
   void CopyupRequest::complete(int r)
   {
     if (should_complete(r)) {
+      complete_requests(r);
       delete this;
     }
   }
@@ -130,40 +216,42 @@ namespace librbd {
 		   << ", extents " << m_image_extents
 		   << ", r " << r << dendl;
 
+    uint64_t pending_copyups;
     switch (m_state) {
     case STATE_READ_FROM_PARENT:
       ldout(cct, 20) << "READ_FROM_PARENT" << dendl;
       remove_from_list();
-      if (r >= 0) {
+      if (r >= 0 || r == -ENOENT) {
         return send_object_map();
-      } else if (r == -ENOENT) {
-        return send_copyup();
       }
       break;
 
     case STATE_OBJECT_MAP:
       ldout(cct, 20) << "OBJECT_MAP" << dendl;
-      if (r == 0) {
-	return send_copyup();
-      }
-      break;
+      assert(r == 0);
+      return send_copyup();
 
     case STATE_COPYUP:
-      ldout(cct, 20) << "COPYUP" << dendl;
-      complete_requests(r);
-      return true;
+      // invoked via a finisher in librados, so thread safe
+      pending_copyups = m_pending_copyups.dec();
+      ldout(cct, 20) << "COPYUP (" << pending_copyups << " pending)"
+                     << dendl;
+      if (r == -ENOENT) {
+        // hide the -ENOENT error if this is the last op
+        if (pending_copyups == 0) {
+          complete_requests(0);
+        }
+      } else if (r < 0) {
+        complete_requests(r);
+      }
+      return (pending_copyups == 0);
 
     default:
       lderr(cct) << "invalid state: " << m_state << dendl;
       assert(false);
       break;
     }
-
-    if (r < 0) {
-      complete_requests(r);
-      return true;
-    }
-    return false;
+    return (r < 0);
   }
 
   void CopyupRequest::remove_from_list()
@@ -177,40 +265,50 @@ namespace librbd {
   }
 
   bool CopyupRequest::send_object_map() {
-    bool copyup = true;
     {
       RWLock::RLocker owner_locker(m_ictx->owner_lock);
       RWLock::RLocker snap_locker(m_ictx->snap_lock);
       if (m_ictx->object_map.enabled()) {
+        bool copy_on_read = m_pending_requests.empty();
         if (!m_ictx->image_watcher->is_lock_owner()) {
-         ldout(m_ictx->cct, 20) << "exclusive lock not held for copyup request"
-                                << dendl;
-          assert(m_pending_requests.empty());
+          ldout(m_ictx->cct, 20) << "exclusive lock not held for copyup request"
+                                 << dendl;
+          assert(copy_on_read);
           return true;
         }
 
         RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
-        if (m_ictx->object_map[m_object_no] != OBJECT_EXISTS) {
-          ldout(m_ictx->cct, 20) << __func__ << " " << this
-			         << ": oid " << m_oid
-                                 << ", extents " << m_image_extents
-                                 << dendl;
-          m_state = STATE_OBJECT_MAP;
-
-          Context *ctx = create_callback_context();
-          bool sent = m_ictx->object_map.aio_update(m_object_no, OBJECT_EXISTS,
-                                                    boost::optional<uint8_t>(),
-                                                    ctx);
-          assert(sent);
-          copyup = false;
+        if (copy_on_read && m_ictx->object_map[m_object_no] != OBJECT_EXISTS) {
+          // CoW already updates the HEAD object map
+          m_snap_ids.push_back(CEPH_NOSNAP);
+        }
+        if (!m_ictx->snaps.empty()) {
+          m_snap_ids.insert(m_snap_ids.end(), m_ictx->snaps.begin(),
+                            m_ictx->snaps.end());
         }
       }
     }
 
     // avoid possible recursive lock attempts
-    if (copyup) {
+    if (m_snap_ids.empty()) {
       // no object map update required
       return send_copyup();
+    } else {
+      // update object maps for HEAD and all existing snapshots
+      ldout(m_ictx->cct, 20) << __func__ << " " << this
+      	                     << ": oid " << m_oid
+                             << dendl;
+      m_state = STATE_OBJECT_MAP;
+
+      RWLock::RLocker owner_locker(m_ictx->owner_lock);
+      AsyncObjectThrottle<>::ContextFactory context_factory(
+        boost::lambda::bind(boost::lambda::new_ptr<UpdateObjectMap>(),
+        boost::lambda::_1, m_ictx, m_object_no, &m_snap_ids,
+        boost::lambda::_2));
+      AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
+        NULL, *m_ictx, context_factory, create_callback_context(), NULL, 0,
+        m_snap_ids.size());
+      throttle->start_ops(m_ictx->concurrent_management_ops);
     }
     return false;
   }
diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h
index f8d2e6b..fd1fd87 100644
--- a/src/librbd/CopyupRequest.h
+++ b/src/librbd/CopyupRequest.h
@@ -30,16 +30,22 @@ namespace librbd {
      * Copyup requests go through the following state machine to read from the
      * parent image, update the object map, and copyup the object:
      *
+     *
+     * @verbatim
+     *
      * <start>
      *    |
      *    v
-     * STATE_READ_FROM_PARENT ----> STATE_OBJECT_MAP . . .
-     *    .               .            |                 .
-     *    .               .            v                 .
-     *    .               . . . . > STATE_COPYUP         .
-     *    .                            |                 .
-     *    .                            v                 .
-     *    . . . . . . . . . . . . > <finish> < . . . . . .
+     *  STATE_READ_FROM_PARENT
+     *    .   .        |
+     *    .   .        v
+     *    .   .     STATE_OBJECT_MAP . .
+     *    .   .        |               .
+     *    .   .        v               .
+     *    .   . . > STATE_COPYUP       .
+     *    .            |               .
+     *    .            v               .
+     *    . . . . > <finish> < . . . . .
      *
      * @endverbatim
      *
@@ -60,9 +66,12 @@ namespace librbd {
     State m_state;
     ceph::bufferlist m_copyup_data;
     vector<AioRequest *> m_pending_requests;
+    atomic_t m_pending_copyups;
 
     AsyncOperation m_async_op;
 
+    std::vector<uint64_t> m_snap_ids;
+
     void complete_requests(int r);
 
     void complete(int r);
diff --git a/src/librbd/DiffIterate.cc b/src/librbd/DiffIterate.cc
new file mode 100644
index 0000000..ae52db3
--- /dev/null
+++ b/src/librbd/DiffIterate.cc
@@ -0,0 +1,457 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/DiffIterate.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "include/rados/librados.hpp"
+#include "include/interval_set.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/Throttle.h"
+#include "librados/snap_set_diff.h"
+#include <boost/tuple/tuple.hpp>
+#include <list>
+#include <map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DiffIterate: "
+
+namespace librbd {
+
+namespace {
+
+enum ObjectDiffState {
+  OBJECT_DIFF_STATE_NONE    = 0,
+  OBJECT_DIFF_STATE_UPDATED = 1,
+  OBJECT_DIFF_STATE_HOLE    = 2
+};
+
+struct DiffContext {
+  DiffIterate::Callback callback;
+  void *callback_arg;
+  bool whole_object;
+  uint64_t from_snap_id;
+  uint64_t end_snap_id;
+  interval_set<uint64_t> parent_diff;
+  OrderedThrottle throttle;
+
+  DiffContext(ImageCtx &image_ctx, DiffIterate::Callback callback,
+              void *callback_arg, bool _whole_object, uint64_t _from_snap_id,
+              uint64_t _end_snap_id)
+    : callback(callback), callback_arg(callback_arg),
+      whole_object(_whole_object), from_snap_id(_from_snap_id),
+      end_snap_id(_end_snap_id),
+      throttle(image_ctx.concurrent_management_ops, true) {
+  }
+};
+
+class C_DiffObject : public Context {
+public:
+  C_DiffObject(ImageCtx &image_ctx, librados::IoCtx &head_ctx,
+               DiffContext &diff_context, const std::string &oid,
+               uint64_t offset, const std::vector<ObjectExtent> &object_extents)
+    : m_image_ctx(image_ctx), m_head_ctx(head_ctx),
+      m_diff_context(diff_context), m_oid(oid), m_offset(offset),
+      m_object_extents(object_extents), m_snap_ret(0) {
+  }
+
+  void send() {
+    C_OrderedThrottle *ctx = m_diff_context.throttle.start_op(this);
+    librados::AioCompletion *rados_completion =
+      librados::Rados::aio_create_completion(ctx, NULL, rados_ctx_cb);
+
+    librados::ObjectReadOperation op;
+    op.list_snaps(&m_snap_set, &m_snap_ret);
+
+    int r = m_head_ctx.aio_operate(m_oid, rados_completion, &op, NULL);
+    assert(r == 0);
+    rados_completion->release();
+  }
+
+protected:
+  typedef boost::tuple<uint64_t, size_t, bool> Diff;
+  typedef std::list<Diff> Diffs;
+
+  virtual void finish(int r) {
+    CephContext *cct = m_image_ctx.cct;
+    if (r == 0 && m_snap_ret < 0) {
+      r = m_snap_ret;
+    }
+
+    Diffs diffs;
+    if (r == 0) {
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps complete" << dendl;
+      compute_diffs(&diffs);
+    } else if (r == -ENOENT) {
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps (not found)"
+                     << dendl;
+      r = 0;
+      compute_parent_overlap(&diffs);
+    } else {
+      ldout(cct, 20) << "object " << m_oid << ": list_snaps failed: "
+                     << cpp_strerror(r) << dendl;
+    }
+
+    if (r == 0) {
+      for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) {
+        r = m_diff_context.callback(d->get<0>(), d->get<1>(), d->get<2>(),
+                                    m_diff_context.callback_arg);
+        if (r < 0) {
+          break;
+        }
+      }
+    }
+    m_diff_context.throttle.end_op(r);
+  }
+
+private:
+  ImageCtx &m_image_ctx;
+  librados::IoCtx &m_head_ctx;
+  DiffContext &m_diff_context;
+  uint64_t m_request_num;
+  std::string m_oid;
+  uint64_t m_offset;
+  std::vector<ObjectExtent> m_object_extents;
+
+  librados::snap_set_t m_snap_set;
+  int m_snap_ret;
+
+  void compute_diffs(Diffs *diffs) {
+    CephContext *cct = m_image_ctx.cct;
+
+    // calc diff from from_snap_id -> to_snap_id
+    interval_set<uint64_t> diff;
+    bool end_exists;
+    calc_snap_set_diff(cct, m_snap_set, m_diff_context.from_snap_id,
+                       m_diff_context.end_snap_id, &diff, &end_exists);
+    ldout(cct, 20) << "  diff " << diff << " end_exists=" << end_exists
+                   << dendl;
+    if (diff.empty()) {
+      return;
+    } else if (m_diff_context.whole_object) {
+      // provide the full object extents to the callback
+      for (vector<ObjectExtent>::iterator q = m_object_extents.begin();
+           q != m_object_extents.end(); ++q) {
+        diffs->push_back(boost::make_tuple(m_offset + q->offset, q->length,
+                                           end_exists));
+      }
+      return;
+    }
+
+    for (vector<ObjectExtent>::iterator q = m_object_extents.begin();
+         q != m_object_extents.end(); ++q) {
+      ldout(cct, 20) << "diff_iterate object " << m_oid << " extent "
+                     << q->offset << "~" << q->length << " from "
+                     << q->buffer_extents << dendl;
+      uint64_t opos = q->offset;
+      for (vector<pair<uint64_t,uint64_t> >::iterator r =
+             q->buffer_extents.begin();
+           r != q->buffer_extents.end(); ++r) {
+        interval_set<uint64_t> overlap;  // object extents
+        overlap.insert(opos, r->second);
+        overlap.intersection_of(diff);
+        ldout(m_image_ctx.cct, 20) << " opos " << opos
+    			             << " buf " << r->first << "~" << r->second
+    			             << " overlap " << overlap << dendl;
+        for (interval_set<uint64_t>::iterator s = overlap.begin();
+    	       s != overlap.end(); ++s) {
+          uint64_t su_off = s.get_start() - opos;
+          uint64_t logical_off = m_offset + r->first + su_off;
+          ldout(cct, 20) << "   overlap extent " << s.get_start() << "~"
+                         << s.get_len() << " logical " << logical_off << "~"
+                         << s.get_len() << dendl;
+          diffs->push_back(boost::make_tuple(logical_off, s.get_len(),
+                           end_exists));
+        }
+        opos += r->second;
+      }
+      assert(opos == q->offset + q->length);
+    }
+  }
+
+  void compute_parent_overlap(Diffs *diffs) {
+    if (m_diff_context.from_snap_id == 0 &&
+        !m_diff_context.parent_diff.empty()) {
+      // report parent diff instead
+      for (vector<ObjectExtent>::iterator q = m_object_extents.begin();
+           q != m_object_extents.end(); ++q) {
+        for (vector<pair<uint64_t,uint64_t> >::iterator r =
+               q->buffer_extents.begin();
+             r != q->buffer_extents.end(); ++r) {
+          interval_set<uint64_t> o;
+          o.insert(m_offset + r->first, r->second);
+          o.intersection_of(m_diff_context.parent_diff);
+          ldout(m_image_ctx.cct, 20) << " reporting parent overlap " << o
+                                     << dendl;
+          for (interval_set<uint64_t>::iterator s = o.begin(); s != o.end();
+               ++s) {
+            diffs->push_back(boost::make_tuple(s.get_start(), s.get_len(),
+                             true));
+          }
+        }
+      }
+    }
+  }
+};
+
+} // anonymous namespace
+
+int DiffIterate::execute() {
+  CephContext* cct = m_image_ctx.cct;
+
+  librados::IoCtx head_ctx;
+  librados::snap_t from_snap_id = 0;
+  librados::snap_t end_snap_id;
+  uint64_t from_size = 0;
+  uint64_t end_size;
+  {
+    RWLock::RLocker md_locker(m_image_ctx.md_lock);
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    head_ctx.dup(m_image_ctx.data_ctx);
+    if (m_from_snap_name) {
+      from_snap_id = m_image_ctx.get_snap_id(m_from_snap_name);
+      from_size = m_image_ctx.get_image_size(from_snap_id);
+    }
+    end_snap_id = m_image_ctx.snap_id;
+    end_size = m_image_ctx.get_image_size(end_snap_id);
+  }
+
+  if (from_snap_id == CEPH_NOSNAP) {
+    return -ENOENT;
+  }
+  if (from_snap_id == end_snap_id) {
+    // no diff.
+    return 0;
+  }
+  if (from_snap_id >= end_snap_id) {
+    return -EINVAL;
+  }
+
+  int r;
+  bool fast_diff_enabled = false;
+  BitVector<2> object_diff_state;
+  {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    if (m_whole_object && (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+      r = diff_object_map(from_snap_id, end_snap_id, &object_diff_state);
+      if (r < 0) {
+        ldout(cct, 5) << "fast diff disabled" << dendl;
+      } else {
+        ldout(cct, 5) << "fast diff enabled" << dendl;
+        fast_diff_enabled = true;
+      }
+    }
+  }
+
+  // we must list snaps via the head, not end snap
+  head_ctx.snap_set_read(CEPH_SNAPDIR);
+
+  ldout(cct, 5) << "diff_iterate from " << from_snap_id << " to "
+                << end_snap_id << " size from " << from_size
+                << " to " << end_size << dendl;
+
+  // check parent overlap only if we are comparing to the beginning of time
+  DiffContext diff_context(m_image_ctx, m_callback, m_callback_arg,
+                           m_whole_object, from_snap_id, end_snap_id);
+  if (m_include_parent && from_snap_id == 0) {
+    RWLock::RLocker l(m_image_ctx.snap_lock);
+    RWLock::RLocker l2(m_image_ctx.parent_lock);
+    uint64_t overlap = end_size;
+    m_image_ctx.get_parent_overlap(from_snap_id, &overlap);
+    r = 0;
+    if (m_image_ctx.parent && overlap > 0) {
+      ldout(cct, 10) << " first getting parent diff" << dendl;
+      DiffIterate diff_parent(*m_image_ctx.parent, NULL, 0, overlap,
+                              m_include_parent, m_whole_object,
+                              &DiffIterate::simple_diff_cb,
+                              &diff_context.parent_diff);
+      r = diff_parent.execute();
+    }
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  uint64_t period = m_image_ctx.get_stripe_period();
+  uint64_t off = m_offset;
+  uint64_t left = m_length;
+
+  while (left > 0) {
+    uint64_t period_off = off - (off % period);
+    uint64_t read_len = min(period_off + period - off, left);
+
+    // map to extents
+    map<object_t,vector<ObjectExtent> > object_extents;
+    Striper::file_to_extents(cct, m_image_ctx.format_string,
+                             &m_image_ctx.layout, off, read_len, 0,
+                             object_extents, 0);
+
+    // get snap info for each object
+    for (map<object_t,vector<ObjectExtent> >::iterator p =
+           object_extents.begin();
+         p != object_extents.end(); ++p) {
+      ldout(cct, 20) << "object " << p->first << dendl;
+
+      if (fast_diff_enabled) {
+        const uint64_t object_no = p->second.front().objectno;
+        if (object_diff_state[object_no] != OBJECT_DIFF_STATE_NONE) {
+          bool updated = (object_diff_state[object_no] ==
+                            OBJECT_DIFF_STATE_UPDATED);
+          for (std::vector<ObjectExtent>::iterator q = p->second.begin();
+               q != p->second.end(); ++q) {
+            r = m_callback(off + q->offset, q->length, updated, m_callback_arg);
+            if (r < 0) {
+              return r;
+            }
+          }
+        }
+      } else {
+        C_DiffObject *diff_object = new C_DiffObject(m_image_ctx, head_ctx,
+                                                     diff_context,
+                                                     p->first.name, off,
+                                                     p->second);
+        diff_object->send();
+
+        if (diff_context.throttle.pending_error()) {
+          r = diff_context.throttle.wait_for_ret();
+          return r;
+        }
+      }
+    }
+
+    left -= read_len;
+    off += read_len;
+  }
+
+  r = diff_context.throttle.wait_for_ret();
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+int DiffIterate::diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,
+                                 BitVector<2>* object_diff_state) {
+  assert(m_image_ctx.snap_lock.is_locked());
+  CephContext* cct = m_image_ctx.cct;
+
+  bool diff_from_start = (from_snap_id == 0);
+  if (from_snap_id == 0) {
+    if (!m_image_ctx.snaps.empty()) {
+      from_snap_id = m_image_ctx.snaps.back();
+    } else {
+      from_snap_id = CEPH_NOSNAP;
+    }
+  }
+
+  object_diff_state->clear();
+  int r;
+  uint64_t current_snap_id = from_snap_id;
+  uint64_t next_snap_id = to_snap_id;
+  BitVector<2> prev_object_map;
+  bool prev_object_map_valid = false;
+  while (true) {
+    uint64_t current_size = m_image_ctx.size;
+    if (current_snap_id != CEPH_NOSNAP) {
+      std::map<librados::snap_t, SnapInfo>::const_iterator snap_it =
+        m_image_ctx.snap_info.find(current_snap_id);
+      assert(snap_it != m_image_ctx.snap_info.end());
+      current_size = snap_it->second.size;
+
+      ++snap_it;
+      if (snap_it != m_image_ctx.snap_info.end()) {
+        next_snap_id = snap_it->first;
+      } else {
+        next_snap_id = CEPH_NOSNAP;
+      }
+    }
+
+    uint64_t flags;
+    r = m_image_ctx.get_flags(from_snap_id, &flags);
+    if (r < 0) {
+      lderr(cct) << "diff_object_map: failed to retrieve image flags" << dendl;
+      return r;
+    }
+    if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
+      ldout(cct, 1) << "diff_object_map: cannot perform fast diff on invalid "
+                    << "object map" << dendl;
+      return -EINVAL;
+    }
+
+    BitVector<2> object_map;
+    std::string oid(ObjectMap::object_map_name(m_image_ctx.id,
+                                               current_snap_id));
+    r = cls_client::object_map_load(&m_image_ctx.md_ctx, oid, &object_map);
+    if (r < 0) {
+      lderr(cct) << "diff_object_map: failed to load object map " << oid
+                 << dendl;
+      return r;
+    }
+    ldout(cct, 20) << "diff_object_map: loaded object map " << oid << dendl;
+
+    uint64_t num_objs = Striper::get_num_objects(m_image_ctx.layout,
+                                                 current_size);
+    if (object_map.size() < num_objs) {
+      ldout(cct, 1) << "diff_object_map: object map too small: "
+                    << object_map.size() << " < " << num_objs << dendl;
+      return -EINVAL;
+    }
+    object_map.resize(num_objs);
+
+    uint64_t overlap = MIN(object_map.size(), prev_object_map.size());
+    for (uint64_t i = 0; i < overlap; ++i) {
+      ldout(cct, 20) << __func__ << ": object state: " << i << " "
+                     << static_cast<uint32_t>(prev_object_map[i])
+                     << "->" << static_cast<uint32_t>(object_map[i]) << dendl;
+      if (object_map[i] == OBJECT_NONEXISTENT) {
+        if (prev_object_map[i] != OBJECT_NONEXISTENT) {
+          (*object_diff_state)[i] = OBJECT_DIFF_STATE_HOLE;
+        }
+      } else if (object_map[i] == OBJECT_EXISTS ||
+                 (prev_object_map[i] != object_map[i] &&
+                  !(prev_object_map[i] == OBJECT_EXISTS &&
+                    object_map[i] == OBJECT_EXISTS_CLEAN))) {
+        (*object_diff_state)[i] = OBJECT_DIFF_STATE_UPDATED;
+      }
+    }
+    ldout(cct, 20) << "diff_object_map: computed overlap diffs" << dendl;
+
+    object_diff_state->resize(object_map.size());
+    if (object_map.size() > prev_object_map.size() &&
+        (diff_from_start || prev_object_map_valid)) {
+      for (uint64_t i = overlap; i < object_diff_state->size(); ++i) {
+        ldout(cct, 20) << __func__ << ": object state: " << i << " "
+                       << "->" << static_cast<uint32_t>(object_map[i]) << dendl;
+        if (object_map[i] == OBJECT_NONEXISTENT) {
+          (*object_diff_state)[i] = OBJECT_DIFF_STATE_NONE;
+        } else {
+          (*object_diff_state)[i] = OBJECT_DIFF_STATE_UPDATED;
+        }
+      }
+    }
+    ldout(cct, 20) << "diff_object_map: computed resize diffs" << dendl;
+
+    if (current_snap_id == next_snap_id || next_snap_id > to_snap_id) {
+      break;
+    }
+    current_snap_id = next_snap_id;
+    prev_object_map = object_map;
+    prev_object_map_valid = true;
+  }
+  return 0;
+}
+
+int DiffIterate::simple_diff_cb(uint64_t off, size_t len, int exists,
+                                void *arg) {
+  // it's possible for a discard to create a hole in the parent image -- ignore
+  if (exists) {
+    interval_set<uint64_t> *diff = static_cast<interval_set<uint64_t> *>(arg);
+    diff->insert(off, len);
+  }
+  return 0;
+}
+
+} // namespace librbd
diff --git a/src/librbd/DiffIterate.h b/src/librbd/DiffIterate.h
new file mode 100644
index 0000000..6b80af3
--- /dev/null
+++ b/src/librbd/DiffIterate.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_DIFF_ITERATE_H
+#define CEPH_LIBRBD_DIFF_ITERATE_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+
+namespace librbd {
+
+class ImageCtx;
+
+class DiffIterate {
+public:
+  typedef int (*Callback)(uint64_t, size_t, int, void *);
+
+  DiffIterate(ImageCtx &image_ctx, const char *from_snap_name, uint64_t off,
+              uint64_t len, bool include_parent, bool whole_object,
+              Callback callback, void *callback_arg)
+    : m_image_ctx(image_ctx), m_from_snap_name(from_snap_name), m_offset(off),
+      m_length(len), m_include_parent(include_parent),
+      m_whole_object(whole_object), m_callback(callback),
+      m_callback_arg(callback_arg)
+  {
+  }
+
+  int execute();
+
+private:
+  ImageCtx &m_image_ctx;
+  const char* m_from_snap_name;
+  uint64_t m_offset;
+  uint64_t m_length;
+  bool m_include_parent;
+  bool m_whole_object;
+  Callback m_callback;
+  void *m_callback_arg;
+
+  int diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,
+                      BitVector<2>* object_diff_state);
+
+  static int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg);
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_DIFF_ITERATE_H
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 0f5d46a..c68b45c 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -1,12 +1,13 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 #include <errno.h>
+#include <boost/assign/list_of.hpp>
+#include <stddef.h>
 
 #include "common/ceph_context.h"
 #include "common/dout.h"
 #include "common/errno.h"
 #include "common/perf_counters.h"
-#include "common/WorkQueue.h"
 
 #include "librbd/AsyncOperation.h"
 #include "librbd/AsyncRequest.h"
@@ -50,6 +51,8 @@ public:
 
 } // anonymous namespace
 
+  const string ImageCtx::METADATA_CONF_PREFIX = "conf_";
+
   ImageCtx::ImageCtx(const string &image_name, const string &image_id,
 		     const char *snap, IoCtx& p, bool ro)
     : cct((CephContext*)p.cct()),
@@ -85,53 +88,12 @@ public:
   {
     md_ctx.dup(p);
     data_ctx.dup(p);
+    if (snap)
+      snap_name = snap;
 
     memset(&header, 0, sizeof(header));
     memset(&layout, 0, sizeof(layout));
 
-    string pname = string("librbd-") + id + string("-") +
-      data_ctx.get_pool_name() + string("/") + name;
-    if (snap) {
-      snap_name = snap;
-      pname += "@";
-      pname += snap_name;
-    }
-    perf_start(pname);
-
-    if (cct->_conf->rbd_cache) {
-      Mutex::Locker l(cache_lock);
-      ldout(cct, 20) << "enabling caching..." << dendl;
-      writeback_handler = new LibrbdWriteback(this, cache_lock);
-
-      uint64_t init_max_dirty = cct->_conf->rbd_cache_max_dirty;
-      if (cct->_conf->rbd_cache_writethrough_until_flush)
-	init_max_dirty = 0;
-      ldout(cct, 20) << "Initial cache settings:"
-		     << " size=" << cct->_conf->rbd_cache_size
-		     << " num_objects=" << 10
-		     << " max_dirty=" << init_max_dirty
-		     << " target_dirty=" << cct->_conf->rbd_cache_target_dirty
-		     << " max_dirty_age="
-		     << cct->_conf->rbd_cache_max_dirty_age << dendl;
-
-      object_cacher = new ObjectCacher(cct, pname, *writeback_handler, cache_lock,
-				       NULL, NULL,
-				       cct->_conf->rbd_cache_size,
-				       10,  /* reset this in init */
-				       init_max_dirty,
-				       cct->_conf->rbd_cache_target_dirty,
-				       cct->_conf->rbd_cache_max_dirty_age,
-				       cct->_conf->rbd_cache_block_writes_upfront);
-      object_set = new ObjectCacher::ObjectSet(NULL, data_ctx.get_id(), 0);
-      object_set->return_enoent = true;
-      object_cacher->start();
-    }
-
-    if (cct->_conf->rbd_clone_copy_on_read) {
-      copyup_finisher = new Finisher(cct);
-      copyup_finisher->start();
-    }
-
     ThreadPoolSingleton *thread_pool_singleton;
     cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
       thread_pool_singleton, "librbd::thread_pool");
@@ -169,6 +131,15 @@ public:
 
   int ImageCtx::init() {
     int r;
+    string pname = string("librbd-") + id + string("-") +
+      data_ctx.get_pool_name() + string("/") + name;
+    if (!snap_name.empty()) {
+      pname += "@";
+      pname += snap_name;
+    }
+
+    perf_start(pname);
+
     if (id.length()) {
       old_format = false;
     } else {
@@ -190,6 +161,7 @@ public:
       }
 
       header_oid = header_name(id);
+      apply_metadata_confs();
       r = cls_client::get_immutable_metadata(&md_ctx, header_oid,
 					     &object_prefix, &order);
       if (r < 0) {
@@ -208,12 +180,57 @@ public:
 
       init_layout();
     } else {
+      apply_metadata_confs();
       header_oid = old_header_name(name);
     }
 
-    md_config_t *conf = cct->_conf;
-    readahead.set_trigger_requests(conf->rbd_readahead_trigger_requests);
-    readahead.set_max_readahead_size(conf->rbd_readahead_max_bytes);
+    if (cache) {
+      Mutex::Locker l(cache_lock);
+      ldout(cct, 20) << "enabling caching..." << dendl;
+      writeback_handler = new LibrbdWriteback(this, cache_lock);
+
+      uint64_t init_max_dirty = cache_max_dirty;
+      if (cache_writethrough_until_flush)
+	init_max_dirty = 0;
+      ldout(cct, 20) << "Initial cache settings:"
+		     << " size=" << cache_size
+		     << " num_objects=" << 10
+		     << " max_dirty=" << init_max_dirty
+		     << " target_dirty=" << cache_target_dirty
+		     << " max_dirty_age="
+		     << cache_max_dirty_age << dendl;
+
+      object_cacher = new ObjectCacher(cct, pname, *writeback_handler, cache_lock,
+				       NULL, NULL,
+				       cache_size,
+				       10,  /* reset this in init */
+				       init_max_dirty,
+				       cache_target_dirty,
+				       cache_max_dirty_age,
+				       cache_block_writes_upfront);
+
+      // size object cache appropriately
+      uint64_t obj = cache_max_dirty_object;
+      if (!obj) {
+	obj = MIN(2000, MAX(10, cache_size / 100 / sizeof(ObjectCacher::Object)));
+      }
+      ldout(cct, 10) << " cache bytes " << cache_size
+	<< " -> about " << obj << " objects" << dendl;
+      object_cacher->set_max_objects(obj);
+
+      object_set = new ObjectCacher::ObjectSet(NULL, data_ctx.get_id(), 0);
+      object_set->return_enoent = true;
+      object_cacher->start();
+    }
+
+    if (clone_copy_on_read) {
+      copyup_finisher = new Finisher(cct);
+      copyup_finisher->start();
+    }
+
+    readahead.set_trigger_requests(readahead_trigger_requests);
+    readahead.set_max_readahead_size(readahead_max_bytes);
+
     return 0;
   }
 
@@ -245,17 +262,6 @@ public:
       snprintf(format_string, len, "%s.%%016llx", object_prefix.c_str());
     }
 
-    // size object cache appropriately
-    if (object_cacher) {
-      uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
-      if (!obj) {
-        obj = MIN(2000, MAX(10, cct->_conf->rbd_cache_size / 100 / sizeof(ObjectCacher::Object)));
-      }
-      ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size
-		     << " -> about " << obj << " objects" << dendl;
-      object_cacher->set_max_objects(obj);
-    }
-
     ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
 		   << " stripe_count " << stripe_count
 		   << " object_size " << layout.fl_object_size
@@ -267,34 +273,25 @@ public:
   void ImageCtx::perf_start(string name) {
     PerfCountersBuilder plb(cct, name, l_librbd_first, l_librbd_last);
 
-    plb.add_u64_counter(l_librbd_rd, "rd");
-    plb.add_u64_counter(l_librbd_rd_bytes, "rd_bytes");
-    plb.add_time_avg(l_librbd_rd_latency, "rd_latency");
-    plb.add_u64_counter(l_librbd_wr, "wr");
-    plb.add_u64_counter(l_librbd_wr_bytes, "wr_bytes");
-    plb.add_time_avg(l_librbd_wr_latency, "wr_latency");
-    plb.add_u64_counter(l_librbd_discard, "discard");
-    plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes");
-    plb.add_time_avg(l_librbd_discard_latency, "discard_latency");
-    plb.add_u64_counter(l_librbd_flush, "flush");
-    plb.add_u64_counter(l_librbd_aio_rd, "aio_rd");
-    plb.add_u64_counter(l_librbd_aio_rd_bytes, "aio_rd_bytes");
-    plb.add_time_avg(l_librbd_aio_rd_latency, "aio_rd_latency");
-    plb.add_u64_counter(l_librbd_aio_wr, "aio_wr");
-    plb.add_u64_counter(l_librbd_aio_wr_bytes, "aio_wr_bytes");
-    plb.add_time_avg(l_librbd_aio_wr_latency, "aio_wr_latency");
-    plb.add_u64_counter(l_librbd_aio_discard, "aio_discard");
-    plb.add_u64_counter(l_librbd_aio_discard_bytes, "aio_discard_bytes");
-    plb.add_time_avg(l_librbd_aio_discard_latency, "aio_discard_latency");
-    plb.add_u64_counter(l_librbd_aio_flush, "aio_flush");
-    plb.add_time_avg(l_librbd_aio_flush_latency, "aio_flush_latency");
-    plb.add_u64_counter(l_librbd_snap_create, "snap_create");
-    plb.add_u64_counter(l_librbd_snap_remove, "snap_remove");
-    plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback");
-    plb.add_u64_counter(l_librbd_notify, "notify");
-    plb.add_u64_counter(l_librbd_resize, "resize");
-    plb.add_u64_counter(l_librbd_readahead, "readahead");
-    plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes");
+    plb.add_u64_counter(l_librbd_rd, "rd", "Reads");
+    plb.add_u64_counter(l_librbd_rd_bytes, "rd_bytes", "Data size in reads");
+    plb.add_time_avg(l_librbd_rd_latency, "rd_latency", "Latency of reads");
+    plb.add_u64_counter(l_librbd_wr, "wr", "Writes");
+    plb.add_u64_counter(l_librbd_wr_bytes, "wr_bytes", "Written data");
+    plb.add_time_avg(l_librbd_wr_latency, "wr_latency", "Write latency");
+    plb.add_u64_counter(l_librbd_discard, "discard", "Discards");
+    plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes", "Discarded data");
+    plb.add_time_avg(l_librbd_discard_latency, "discard_latency", "Discard latency");
+    plb.add_u64_counter(l_librbd_flush, "flush", "Flushes");
+    plb.add_u64_counter(l_librbd_aio_flush, "aio_flush", "Async flushes");
+    plb.add_time_avg(l_librbd_aio_flush_latency, "aio_flush_latency", "Latency of async flushes");
+    plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations");
+    plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals");
+    plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback", "Snap rollbacks");
+    plb.add_u64_counter(l_librbd_notify, "notify", "Updated header notifications");
+    plb.add_u64_counter(l_librbd_resize, "resize", "Resizes");
+    plb.add_u64_counter(l_librbd_readahead, "readahead", "Read ahead");
+    plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead");
 
     perfcounter = plb.create_perf_counters();
     cct->get_perfcounters_collection()->add(perfcounter);
@@ -315,9 +312,9 @@ public:
     if (snap_id == LIBRADOS_SNAP_HEAD)
       return flags;
 
-    if (cct->_conf->rbd_balance_snap_reads)
+    if (balance_snap_reads)
       flags |= librados::OPERATION_BALANCE_READS;
-    else if (cct->_conf->rbd_localize_snap_reads)
+    else if (localize_snap_reads)
       flags |= librados::OPERATION_LOCALIZE_READS;
     return flags;
   }
@@ -449,17 +446,24 @@ public:
   }
 
   void ImageCtx::add_snap(string in_snap_name, snap_t id, uint64_t in_size,
-			  uint64_t features, parent_info parent,
-			  uint8_t protection_status, uint64_t flags)
+			  parent_info parent, uint8_t protection_status,
+                          uint64_t flags)
   {
     assert(snap_lock.is_wlocked());
     snaps.push_back(id);
-    SnapInfo info(in_snap_name, in_size, features, parent, protection_status,
-		  flags);
+    SnapInfo info(in_snap_name, in_size, parent, protection_status, flags);
     snap_info.insert(pair<snap_t, SnapInfo>(id, info));
     snap_ids.insert(pair<string, snap_t>(in_snap_name, id));
   }
 
+  void ImageCtx::rm_snap(string in_snap_name, snap_t id)
+  {
+    assert(snap_lock.is_wlocked());
+    snaps.erase(std::remove(snaps.begin(), snaps.end(), id), snaps.end());
+    snap_info.erase(id);
+    snap_ids.erase(in_snap_name);
+  }
+
   uint64_t ImageCtx::get_image_size(snap_t in_snap_id) const
   {
     assert(snap_lock.is_locked());
@@ -478,27 +482,10 @@ public:
     return 0;
   }
 
-  int ImageCtx::get_features(snap_t in_snap_id, uint64_t *out_features) const
-  {
-    assert(snap_lock.is_locked());
-    if (in_snap_id == CEPH_NOSNAP) {
-      *out_features = features;
-      return 0;
-    }
-    const SnapInfo *info = get_snap_info(in_snap_id);
-    if (info) {
-      *out_features = info->features;
-      return 0;
-    }
-    return -ENOENT;
-  }
-
   bool ImageCtx::test_features(uint64_t test_features) const
   {
     RWLock::RLocker l(snap_lock);
-    uint64_t snap_features = 0;
-    get_features(snap_id, &snap_features);
-    return ((snap_features & test_features) == test_features);
+    return ((features & test_features) == test_features);
   }
 
   int ImageCtx::get_flags(librados::snap_t _snap_id, uint64_t *_flags) const
@@ -585,12 +572,6 @@ public:
   int ImageCtx::get_parent_overlap(snap_t in_snap_id, uint64_t *overlap) const
   {
     assert(snap_lock.is_locked());
-    if (in_snap_id == CEPH_NOSNAP && !async_resize_reqs.empty() &&
-        async_resize_reqs.front()->shrinking()) {
-      *overlap = async_resize_reqs.front()->get_parent_overlap();
-      return 0;
-    }
-
     const parent_info *info = get_parent_info(in_snap_id);
     if (info) {
       *overlap = info->overlap;
@@ -599,6 +580,17 @@ public:
     return -ENOENT;
   }
 
+  uint64_t ImageCtx::get_copyup_snap_id() const
+  {
+    assert(snap_lock.is_locked());
+    // copyup requires the largest possible parent overlap,
+    // which is always the oldest snapshot (if any).
+    if (!snaps.empty()) {
+      return snaps.back();
+    }
+    return CEPH_NOSNAP;
+  }
+
   void ImageCtx::aio_read_from_cache(object_t o, uint64_t object_no,
 				     bufferlist *bl, size_t len,
 				     uint64_t off, Context *onfinish,
@@ -637,12 +629,12 @@ public:
   }
 
   void ImageCtx::user_flushed() {
-    if (object_cacher && cct->_conf->rbd_cache_writethrough_until_flush) {
+    if (object_cacher && cache_writethrough_until_flush) {
       md_lock.get_read();
       bool flushed_before = flush_encountered;
       md_lock.put_read();
 
-      uint64_t max_dirty = cct->_conf->rbd_cache_max_dirty;
+      uint64_t max_dirty = cache_max_dirty;
       if (!flushed_before && max_dirty > 0) {
 	md_lock.get_write();
 	flush_encountered = true;
@@ -679,12 +671,13 @@ public:
     return r;
   }
 
-  void ImageCtx::shutdown_cache() {
+  int ImageCtx::shutdown_cache() {
     flush_async_operations();
 
     RWLock::RLocker owner_locker(owner_lock);
-    invalidate_cache(true);
+    int r = invalidate_cache(true);
     object_cacher->stop();
+    return r;
   }
 
   int ImageCtx::invalidate_cache(bool purge_on_error) {
@@ -760,22 +753,6 @@ public:
     image_watcher = NULL;
   }
 
-  size_t ImageCtx::parent_io_len(uint64_t offset, size_t length,
-				 snap_t in_snap_id)
-  {
-    uint64_t overlap = 0;
-    get_parent_overlap(in_snap_id, &overlap);
-
-    size_t parent_len = 0;
-    if (get_parent_pool_id(in_snap_id) != -1 && offset <= overlap)
-      parent_len = min(overlap, offset + length) - offset;
-
-    ldout(cct, 20) << __func__ << " off = " << offset << " len = " << length
-		   << " overlap = " << overlap << " parent_io_len = "
-		   << parent_len << dendl;
-    return parent_len;
-  }
-
   uint64_t ImageCtx::prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
 					  uint64_t overlap)
   {
@@ -821,7 +798,7 @@ public:
     ldout(cct, 10) << "canceling async requests: count="
                    << async_requests.size() << dendl;
 
-    for (xlist<AsyncRequest*>::iterator it = async_requests.begin();
+    for (xlist<AsyncRequest<>*>::iterator it = async_requests.begin();
          !it.end(); ++it) {
       ldout(cct, 10) << "canceling async request: " << *it << dendl;
       (*it)->cancel();
@@ -831,4 +808,122 @@ public:
       async_requests_cond.Wait(async_ops_lock);
     }
   }
+
+  bool ImageCtx::_filter_metadata_confs(const string &prefix, map<string, bool> &configs,
+                                        map<string, bufferlist> &pairs, map<string, bufferlist> *res) {
+    size_t conf_prefix_len = prefix.size();
+
+    string start = prefix;
+    for (map<string, bufferlist>::iterator it = pairs.begin(); it != pairs.end(); ++it) {
+      if (it->first.compare(0, MIN(conf_prefix_len, it->first.size()), prefix) > 0)
+        return false;
+
+      if (it->first.size() <= conf_prefix_len)
+        continue;
+
+      string key = it->first.substr(conf_prefix_len, it->first.size() - conf_prefix_len);
+      map<string, bool>::iterator cit = configs.find(key);
+      if ( cit != configs.end()) {
+        cit->second = true;
+        res->insert(make_pair(key, it->second));
+      }
+    }
+    return true;
+  }
+
+  void ImageCtx::apply_metadata_confs() {
+    ldout(cct, 20) << __func__ << dendl;
+    static uint64_t max_conf_items = 128;
+    std::map<string, bool> configs = boost::assign::map_list_of(
+        "rbd_non_blocking_aio", false)(
+        "rbd_cache", false)(
+        "rbd_cache_writethrough_until_flush", false)(
+        "rbd_cache_size", false)(
+        "rbd_cache_max_dirty", false)(
+        "rbd_cache_target_dirty", false)(
+        "rbd_cache_max_dirty_age", false)(
+        "rbd_cache_max_dirty_object", false)(
+        "rbd_cache_block_writes_upfront", false)(
+        "rbd_concurrent_management_ops", false)(
+        "rbd_balance_snap_reads", false)(
+        "rbd_localize_snap_reads", false)(
+        "rbd_balance_parent_reads", false)(
+        "rbd_localize_parent_reads", false)(
+        "rbd_readahead_trigger_requests", false)(
+        "rbd_readahead_max_bytes", false)(
+        "rbd_readahead_disable_after_bytes", false)(
+        "rbd_clone_copy_on_read", false)(
+        "rbd_blacklist_on_break_lock", false)(
+        "rbd_blacklist_expire_seconds", false)(
+        "rbd_request_timed_out_seconds", false);
+
+    string start = METADATA_CONF_PREFIX;
+    int r = 0, j = 0;
+    md_config_t local_config_t;
+
+    bool retrieve_metadata = !old_format;
+    while (retrieve_metadata) {
+      map<string, bufferlist> pairs, res;
+      r = cls_client::metadata_list(&md_ctx, header_oid, start, max_conf_items,
+                                    &pairs);
+      if (r == -EOPNOTSUPP || r == -EIO) {
+        ldout(cct, 10) << "config metadata not supported by OSD" << dendl;
+        break;
+      } else if (r < 0) {
+        lderr(cct) << __func__ << " couldn't list config metadata: " << r
+                   << dendl;
+        break;
+      }
+      if (pairs.empty()) {
+        break;
+      }
+
+      retrieve_metadata = _filter_metadata_confs(METADATA_CONF_PREFIX, configs,
+                                                 pairs, &res);
+      for (map<string, bufferlist>::iterator it = res.begin();
+           it != res.end(); ++it) {
+        string val(it->second.c_str(), it->second.length());
+        j = local_config_t.set_val(it->first.c_str(), val);
+        if (j < 0) {
+          lderr(cct) << __func__ << " failed to set config " << it->first
+                     << " with value " << it->second.c_str() << ": " << j
+                     << dendl;
+        }
+      }
+      start = pairs.rbegin()->first;
+    }
+
+#define ASSIGN_OPTION(config)                                                  \
+    do {                                                                       \
+      string key = "rbd_";						       \
+      key = key + #config;					      	       \
+      if (configs[key])                                                        \
+        config = local_config_t.rbd_##config;                                  \
+      else                                                                     \
+        config = cct->_conf->rbd_##config;                                     \
+    } while (0);
+
+    ASSIGN_OPTION(non_blocking_aio);
+    ASSIGN_OPTION(cache);
+    ASSIGN_OPTION(cache_writethrough_until_flush);
+    ASSIGN_OPTION(cache_size);
+    ASSIGN_OPTION(cache_max_dirty);
+    ASSIGN_OPTION(cache_target_dirty);
+    ASSIGN_OPTION(cache_max_dirty_age);
+    ASSIGN_OPTION(cache_max_dirty_object);
+    ASSIGN_OPTION(cache_block_writes_upfront);
+    ASSIGN_OPTION(concurrent_management_ops);
+    ASSIGN_OPTION(balance_snap_reads);
+    ASSIGN_OPTION(localize_snap_reads);
+    ASSIGN_OPTION(balance_parent_reads);
+    ASSIGN_OPTION(localize_parent_reads);
+    ASSIGN_OPTION(readahead_trigger_requests);
+    ASSIGN_OPTION(readahead_max_bytes);
+    ASSIGN_OPTION(readahead_disable_after_bytes);
+    ASSIGN_OPTION(clone_copy_on_read);
+    ASSIGN_OPTION(blacklist_on_break_lock);
+    ASSIGN_OPTION(blacklist_expire_seconds);
+    ASSIGN_OPTION(request_timed_out_seconds);
+    ASSIGN_OPTION(enable_alloc_hint);
+  }
 }
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 238b0ab..3c7f170 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -16,6 +16,7 @@
 #include "common/Readahead.h"
 #include "common/RWLock.h"
 #include "common/snap_types.h"
+#include "common/WorkQueue.h"
 #include "include/atomic.h"
 #include "include/buffer.h"
 #include "include/rbd/librbd.hpp"
@@ -31,14 +32,13 @@
 #include "librbd/parent_types.h"
 
 class CephContext;
-class ContextWQ;
 class Finisher;
 class PerfCounters;
 
 namespace librbd {
 
   class AsyncOperation;
-  class AsyncRequest;
+  template <typename ImageCtxT> class AsyncRequest;
   class AsyncResizeRequest;
   class CopyupRequest;
   class ImageWatcher;
@@ -122,7 +122,7 @@ namespace librbd {
     std::map<uint64_t, CopyupRequest*> copyup_list;
 
     xlist<AsyncOperation*> async_ops;
-    xlist<AsyncRequest*> async_requests;
+    xlist<AsyncRequest<>*> async_requests;
     Cond async_requests_cond;
 
     ObjectMap object_map;
@@ -134,6 +134,33 @@ namespace librbd {
     ContextWQ *aio_work_queue;
     ContextWQ *op_work_queue;
 
+    // Configuration
+    static const string METADATA_CONF_PREFIX;
+    bool non_blocking_aio;
+    bool cache;
+    bool cache_writethrough_until_flush;
+    uint64_t cache_size;
+    uint64_t cache_max_dirty;
+    uint64_t cache_target_dirty;
+    double cache_max_dirty_age;
+    uint32_t cache_max_dirty_object;
+    bool cache_block_writes_upfront;
+    uint32_t concurrent_management_ops;
+    bool balance_snap_reads;
+    bool localize_snap_reads;
+    bool balance_parent_reads;
+    bool localize_parent_reads;
+    uint32_t readahead_trigger_requests;
+    uint64_t readahead_max_bytes;
+    uint64_t readahead_disable_after_bytes;
+    bool clone_copy_on_read;
+    bool blacklist_on_break_lock;
+    uint32_t blacklist_expire_seconds;
+    uint32_t request_timed_out_seconds;
+    bool enable_alloc_hint;
+    static bool _filter_metadata_confs(const string &prefix, std::map<string, bool> &configs,
+                                       map<string, bufferlist> &pairs, map<string, bufferlist> *res);
+
     /**
      * Either image_name or image_id must be set.
      * If id is not known, pass the empty std::string,
@@ -169,12 +196,10 @@ namespace librbd {
     uint64_t get_stripe_period() const;
 
     void add_snap(std::string in_snap_name, librados::snap_t id,
-		  uint64_t in_size, uint64_t features,
-		  parent_info parent, uint8_t protection_status,
-		  uint64_t flags);
+		  uint64_t in_size, parent_info parent,
+                  uint8_t protection_status, uint64_t flags);
+    void rm_snap(std::string in_snap_name, librados::snap_t id);
     uint64_t get_image_size(librados::snap_t in_snap_id) const;
-    int get_features(librados::snap_t in_snap_id,
-		     uint64_t *out_features) const;
     bool test_features(uint64_t test_features) const;
     int get_flags(librados::snap_t in_snap_id, uint64_t *flags) const;
     bool test_flags(uint64_t test_flags) const;
@@ -186,6 +211,7 @@ namespace librbd {
     uint64_t get_parent_snap_id(librados::snap_t in_snap_id) const;
     int get_parent_overlap(librados::snap_t in_snap_id,
 			   uint64_t *overlap) const;
+    uint64_t get_copyup_snap_id() const;
     void aio_read_from_cache(object_t o, uint64_t object_no, bufferlist *bl,
 			     size_t len, uint64_t off, Context *onfinish,
 			     int fadvise_flags);
@@ -194,15 +220,13 @@ namespace librbd {
     void user_flushed();
     void flush_cache_aio(Context *onfinish);
     int flush_cache();
-    void shutdown_cache();
+    int shutdown_cache();
     int invalidate_cache(bool purge_on_error=false);
     void invalidate_cache(Context *on_finish);
     void invalidate_cache_completion(int r, Context *on_finish);
     void clear_nonexistence_cache();
     int register_watch();
     void unregister_watch();
-    size_t parent_io_len(uint64_t offset, size_t length,
-			 librados::snap_t in_snap_id);
     uint64_t prune_parent_extents(vector<pair<uint64_t,uint64_t> >& objectx,
 				  uint64_t overlap);
 
@@ -210,6 +234,7 @@ namespace librbd {
     void flush_async_operations(Context *on_finish);
 
     void cancel_async_requests();
+    void apply_metadata_confs();
   };
 }
 
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
index 71b4c86..43644f8 100644
--- a/src/librbd/ImageWatcher.cc
+++ b/src/librbd/ImageWatcher.cc
@@ -157,13 +157,12 @@ int ImageWatcher::try_lock() {
       }
     }
 
-    md_config_t *conf = m_image_ctx.cct->_conf;
-    if (conf->rbd_blacklist_on_break_lock) {
+    if (m_image_ctx.blacklist_on_break_lock) {
       ldout(m_image_ctx.cct, 1) << this << " blacklisting client: " << locker
                                 << "@" << locker_address << dendl;
       librados::Rados rados(m_image_ctx.md_ctx);
       r = rados.blacklist_add(locker_address,
-			      conf->rbd_blacklist_expire_seconds);
+			      m_image_ctx.blacklist_expire_seconds);
       if (r < 0) {
         lderr(m_image_ctx.cct) << this << " unable to blacklist client: "
 			       << cpp_strerror(r) << dendl;
@@ -325,7 +324,7 @@ int ImageWatcher::lock() {
   // send the notification when we aren't holding locks
   FunctionContext *ctx = new FunctionContext(
     boost::bind(&IoCtx::notify2, &m_image_ctx.md_ctx, m_image_ctx.header_oid,
-		bl, NOTIFY_TIMEOUT, reinterpret_cast<bufferlist *>(NULL)));
+		bl, NOTIFY_TIMEOUT, reinterpret_cast<bufferlist *>(0)));
   m_task_finisher->queue(TASK_CODE_ACQUIRED_LOCK, ctx);
   return 0;
 }
@@ -443,8 +442,10 @@ int ImageWatcher::notify_async_complete(const AsyncRequestId &request,
   bufferlist bl;
   ::encode(NotifyMessage(AsyncCompletePayload(request, r)), bl);
 
-  librbd::notify_change(m_image_ctx.md_ctx, m_image_ctx.header_oid,
-			&m_image_ctx);
+  if (r >= 0) {
+    librbd::notify_change(m_image_ctx.md_ctx, m_image_ctx.header_oid,
+			  &m_image_ctx);
+  }
   int ret = m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl,
 				       NOTIFY_TIMEOUT, NULL);
   if (ret < 0) {
@@ -495,6 +496,29 @@ int ImageWatcher::notify_snap_create(const std::string &snap_name) {
   return notify_lock_owner(bl);
 }
 
+int ImageWatcher::notify_snap_remove(const std::string &snap_name) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(!is_lock_owner());
+
+  bufferlist bl;
+  ::encode(NotifyMessage(SnapRemovePayload(snap_name)), bl);
+
+  return notify_lock_owner(bl);
+}
+
+int ImageWatcher::notify_rebuild_object_map(uint64_t request_id,
+                                            ProgressContext &prog_ctx) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(!is_lock_owner());
+
+  AsyncRequestId async_request_id(get_client_id(), request_id);
+
+  bufferlist bl;
+  ::encode(NotifyMessage(RebuildObjectMapPayload(async_request_id)), bl);
+
+  return notify_async_request(async_request_id, bl, prog_ctx);
+}
+
 void ImageWatcher::notify_header_update(librados::IoCtx &io_ctx,
 				        const std::string &oid)
 {
@@ -695,9 +719,7 @@ void ImageWatcher::schedule_async_request_timed_out(const AsyncRequestId &id) {
   Task task(TASK_CODE_ASYNC_REQUEST, id);
   m_task_finisher->cancel(task);
 
-  md_config_t *conf = m_image_ctx.cct->_conf;
-  m_task_finisher->add_event_after(task, conf->rbd_request_timed_out_seconds,
-                                   ctx);
+  m_task_finisher->add_event_after(task, m_image_ctx.request_timed_out_seconds, ctx);
 }
 
 void ImageWatcher::async_request_timed_out(const AsyncRequestId &id) {
@@ -741,6 +763,33 @@ int ImageWatcher::notify_async_request(const AsyncRequestId &async_request_id,
   return ctx.wait();
 }
 
+int ImageWatcher::prepare_async_request(const AsyncRequestId& async_request_id,
+                                        bool* new_request, Context** ctx,
+                                        ProgressContext** prog_ctx) {
+  if (async_request_id.client_id == get_client_id()) {
+    return -ERESTART;
+  } else {
+    RWLock::WLocker l(m_async_request_lock);
+    if (m_async_pending.count(async_request_id) == 0) {
+      m_async_pending.insert(async_request_id);
+      *new_request = true;
+      *prog_ctx = new RemoteProgressContext(*this, async_request_id);
+      *ctx = new RemoteContext(*this, async_request_id, *prog_ctx);
+    } else {
+      *new_request = false;
+    }
+  }
+  return 0;
+}
+
+void ImageWatcher::cleanup_async_request(const AsyncRequestId& async_request_id,
+                                         Context *ctx) {
+  delete ctx;
+
+  RWLock::WLocker l(m_async_request_lock);
+  m_async_pending.erase(async_request_id);
+}
+
 void ImageWatcher::handle_payload(const HeaderUpdatePayload &payload,
 				  bufferlist *out) {
   ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl;
@@ -851,34 +900,19 @@ void ImageWatcher::handle_payload(const FlattenPayload &payload,
 
   RWLock::RLocker l(m_image_ctx.owner_lock);
   if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
-    int r = 0;
-    bool new_request = false;
-    if (payload.async_request_id.client_id == get_client_id()) {
-      r = -ERESTART;
-    } else {
-      RWLock::WLocker l(m_async_request_lock);
-      if (m_async_pending.count(payload.async_request_id) == 0) {
-	m_async_pending.insert(payload.async_request_id);
-	new_request = true;
-      }
-    }
-
+    bool new_request;
+    Context *ctx;
+    ProgressContext *prog_ctx;
+    int r = prepare_async_request(payload.async_request_id, &new_request,
+                                  &ctx, &prog_ctx);
     if (new_request) {
-      RemoteProgressContext *prog_ctx =
-	new RemoteProgressContext(*this, payload.async_request_id);
-      RemoteContext *ctx = new RemoteContext(*this, payload.async_request_id,
-					     prog_ctx);
-
       ldout(m_image_ctx.cct, 10) << this << " remote flatten request: "
 				 << payload.async_request_id << dendl;
       r = librbd::async_flatten(&m_image_ctx, ctx, *prog_ctx);
       if (r < 0) {
-	delete ctx;
 	lderr(m_image_ctx.cct) << this << " remove flatten request failed: "
 			       << cpp_strerror(r) << dendl;
-
-	RWLock::WLocker l(m_async_request_lock);
-	m_async_pending.erase(payload.async_request_id);
+        cleanup_async_request(payload.async_request_id, ctx);
       }
     }
 
@@ -890,24 +924,12 @@ void ImageWatcher::handle_payload(const ResizePayload &payload,
 				  bufferlist *out) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
   if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
-    int r = 0;
-    bool new_request = false;
-    if (payload.async_request_id.client_id == get_client_id()) {
-      r = -ERESTART;
-    } else {
-      RWLock::WLocker l(m_async_request_lock);
-      if (m_async_pending.count(payload.async_request_id) == 0) {
-	m_async_pending.insert(payload.async_request_id);
-	new_request = true;
-      }
-    }
-
+    bool new_request;
+    Context *ctx;
+    ProgressContext *prog_ctx;
+    int r = prepare_async_request(payload.async_request_id, &new_request,
+                                  &ctx, &prog_ctx);
     if (new_request) {
-      RemoteProgressContext *prog_ctx =
-	new RemoteProgressContext(*this, payload.async_request_id);
-      RemoteContext *ctx = new RemoteContext(*this, payload.async_request_id,
-					     prog_ctx);
-
       ldout(m_image_ctx.cct, 10) << this << " remote resize request: "
 				 << payload.async_request_id << " "
 				 << payload.size << dendl;
@@ -915,10 +937,7 @@ void ImageWatcher::handle_payload(const ResizePayload &payload,
       if (r < 0) {
 	lderr(m_image_ctx.cct) << this << " remove resize request failed: "
 			       << cpp_strerror(r) << dendl;
-	delete ctx;
-
-	RWLock::WLocker l(m_async_request_lock);
-	m_async_pending.erase(payload.async_request_id);
+        cleanup_async_request(payload.async_request_id, ctx);
       }
     }
 
@@ -939,6 +958,45 @@ void ImageWatcher::handle_payload(const SnapCreatePayload &payload,
   }
 }
 
+void ImageWatcher::handle_payload(const SnapRemovePayload &payload,
+				  bufferlist *out) {
+  RWLock::RLocker l(m_image_ctx.owner_lock);
+  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+    ldout(m_image_ctx.cct, 10) << this << " remote snap_remove request: "
+			       << payload.snap_name << dendl;
+    int r = librbd::snap_remove_helper(&m_image_ctx, NULL,
+                                       payload.snap_name.c_str());
+
+    ::encode(ResponseMessage(r), *out);
+  }
+}
+
+void ImageWatcher::handle_payload(const RebuildObjectMapPayload& payload,
+                                  bufferlist *out) {
+  RWLock::RLocker l(m_image_ctx.owner_lock);
+  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+    bool new_request;
+    Context *ctx;
+    ProgressContext *prog_ctx;
+    int r = prepare_async_request(payload.async_request_id, &new_request,
+                                  &ctx, &prog_ctx);
+    if (new_request) {
+      ldout(m_image_ctx.cct, 10) << this
+                                 << " remote rebuild object map request: "
+                                 << payload.async_request_id << dendl;
+      r = librbd::async_rebuild_object_map(&m_image_ctx, ctx, *prog_ctx);
+      if (r < 0) {
+        lderr(m_image_ctx.cct) << this
+                               << " remove rebuild object map request failed: "
+                               << cpp_strerror(r) << dendl;
+        cleanup_async_request(payload.async_request_id, ctx);
+      }
+    }
+
+    ::encode(ResponseMessage(0), *out);
+  }
+}
+
 void ImageWatcher::handle_payload(const UnknownPayload &payload,
 				  bufferlist *out) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h
index 760a698..6ebeb9d 100644
--- a/src/librbd/ImageWatcher.h
+++ b/src/librbd/ImageWatcher.h
@@ -50,6 +50,9 @@ namespace librbd {
     int notify_resize(uint64_t request_id, uint64_t size,
 		      ProgressContext &prog_ctx);
     int notify_snap_create(const std::string &snap_name);
+    int notify_snap_remove(const std::string &snap_name);
+    int notify_rebuild_object_map(uint64_t request_id,
+                                  ProgressContext &prog_ctx);
 
     static void notify_header_update(librados::IoCtx &io_ctx,
 				     const std::string &oid);
@@ -140,7 +143,7 @@ namespace librbd {
     public:
       RemoteContext(ImageWatcher &image_watcher,
 		    const WatchNotify::AsyncRequestId &id,
-		    RemoteProgressContext *prog_ctx)
+		    ProgressContext *prog_ctx)
         : m_image_watcher(image_watcher), m_async_request_id(id),
 	  m_prog_ctx(prog_ctx)
       {
@@ -155,7 +158,7 @@ namespace librbd {
     private:
       ImageWatcher &m_image_watcher;
       WatchNotify::AsyncRequestId m_async_request_id;
-      RemoteProgressContext *m_prog_ctx;
+      ProgressContext *m_prog_ctx;
     };
 
     struct HandlePayloadVisitor : public boost::static_visitor<void> {
@@ -242,6 +245,12 @@ namespace librbd {
     int notify_async_complete(const WatchNotify::AsyncRequestId &id,
 			      int r);
 
+    int prepare_async_request(const WatchNotify::AsyncRequestId& id,
+                              bool* new_request, Context** ctx,
+                              ProgressContext** prog_ctx);
+    void cleanup_async_request(const WatchNotify::AsyncRequestId& id,
+                               Context *ctx);
+
     void handle_payload(const WatchNotify::HeaderUpdatePayload& payload,
 		        bufferlist *out);
     void handle_payload(const WatchNotify::AcquiredLockPayload& payload,
@@ -260,6 +269,10 @@ namespace librbd {
 		        bufferlist *out);
     void handle_payload(const WatchNotify::SnapCreatePayload& payload,
 		        bufferlist *out);
+    void handle_payload(const WatchNotify::SnapRemovePayload& payload,
+		        bufferlist *out);
+    void handle_payload(const WatchNotify::RebuildObjectMapPayload& payload,
+                        bufferlist *out);
     void handle_payload(const WatchNotify::UnknownPayload& payload,
 		        bufferlist *out);
 
diff --git a/src/librbd/Makefile.am b/src/librbd/Makefile.am
index 96d0a00..4360497 100644
--- a/src/librbd/Makefile.am
+++ b/src/librbd/Makefile.am
@@ -16,11 +16,13 @@ librbd_internal_la_SOURCES = \
 	librbd/AsyncResizeRequest.cc \
 	librbd/AsyncTrimRequest.cc \
 	librbd/CopyupRequest.cc \
+	librbd/DiffIterate.cc \
 	librbd/ImageCtx.cc \
 	librbd/ImageWatcher.cc \
 	librbd/internal.cc \
 	librbd/LibrbdWriteback.cc \
-	librbd/ObjectMap.cc
+	librbd/ObjectMap.cc \
+	librbd/RebuildObjectMapRequest.cc
 noinst_LTLIBRARIES += librbd_internal.la
 
 librbd_api_la_SOURCES = \
@@ -37,10 +39,6 @@ librbd_la_LIBADD = \
 	libcls_lock_client.la \
 	$(PTHREAD_LIBS) $(EXTRALIBS)
 
-if WITH_LTTNG
-librbd_la_LIBADD += $(LIBRBD_TP)
-endif
-
 librbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
 if LINUX
 librbd_la_CXXFLAGS = -fvisibility=hidden -fvisibility-inlines-hidden
@@ -58,12 +56,14 @@ noinst_HEADERS += \
 	librbd/AsyncResizeRequest.h \
 	librbd/AsyncTrimRequest.h \
 	librbd/CopyupRequest.h \
+	librbd/DiffIterate.h \
 	librbd/ImageCtx.h \
 	librbd/ImageWatcher.h \
 	librbd/internal.h \
 	librbd/LibrbdWriteback.h \
 	librbd/ObjectMap.h \
 	librbd/parent_types.h \
+	librbd/RebuildObjectMapRequest.h \
 	librbd/SnapInfo.h \
 	librbd/TaskFinisher.h \
 	librbd/WatchNotifyTypes.h
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
index 9e7aae2..d947807 100644
--- a/src/librbd/ObjectMap.cc
+++ b/src/librbd/ObjectMap.cc
@@ -17,7 +17,7 @@
 namespace librbd {
 
 ObjectMap::ObjectMap(ImageCtx &image_ctx)
-  : m_image_ctx(image_ctx), m_enabled(false)
+  : m_image_ctx(image_ctx), m_snap_id(CEPH_NOSNAP), m_enabled(false)
 {
 }
 
@@ -33,6 +33,13 @@ std::string ObjectMap::object_map_name(const std::string &image_id,
   return oid;
 }
 
+ceph::BitVector<2u>::Reference ObjectMap::operator[](uint64_t object_no)
+{
+  assert(m_image_ctx.object_map_lock.is_wlocked());
+  assert(object_no < m_object_map.size());
+  return m_object_map[object_no];
+}
+
 uint8_t ObjectMap::operator[](uint64_t object_no) const
 {
   assert(m_image_ctx.object_map_lock.is_locked());
@@ -59,11 +66,11 @@ int ObjectMap::lock()
     }
   }
 
-  int r;
   bool broke_lock = false;
   CephContext *cct = m_image_ctx.cct;
   std::string oid(object_map_name(m_image_ctx.id, CEPH_NOSNAP));
   while (true) {
+    int r;
     ldout(cct, 10) << &m_image_ctx << " locking object map" << dendl;
     r = rados::cls::lock::lock(&m_image_ctx.md_ctx, oid,
 			       RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "", "",
@@ -80,9 +87,9 @@ int ObjectMap::lock()
     lockers_t lockers;
     ClsLockType lock_type;
     std::string lock_tag;
-    int r = rados::cls::lock::get_lock_info(&m_image_ctx.md_ctx, oid,
-                                            RBD_LOCK_NAME, &lockers,
-                                            &lock_type, &lock_tag);
+    r = rados::cls::lock::get_lock_info(&m_image_ctx.md_ctx, oid,
+                                        RBD_LOCK_NAME, &lockers,
+                                        &lock_type, &lock_tag);
     if (r == -ENOENT) {
       continue;
     } else if (r < 0) {
@@ -142,10 +149,8 @@ bool ObjectMap::object_may_exist(uint64_t object_no) const
   if (!m_enabled) {
     return true;
   }
-  assert(object_no < m_object_map.size());
-
   uint8_t state = (*this)[object_no];
-  bool exists = (state == OBJECT_EXISTS || state == OBJECT_PENDING);
+  bool exists = (state != OBJECT_NONEXISTENT);
   ldout(m_image_ctx.cct, 20) << &m_image_ctx << " object_may_exist: "
 			     << "object_no=" << object_no << " r=" << exists
 			     << dendl;
@@ -156,10 +161,9 @@ void ObjectMap::refresh(uint64_t snap_id)
 {
   assert(m_image_ctx.snap_lock.is_wlocked());
   RWLock::WLocker l(m_image_ctx.object_map_lock);
+  m_snap_id = snap_id;
 
-  uint64_t features;
-  m_image_ctx.get_features(snap_id, &features);
-  if ((features & RBD_FEATURE_OBJECT_MAP) == 0 ||
+  if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0 ||
       (m_image_ctx.snap_id == snap_id && !m_image_ctx.snap_exists)) {
     m_object_map.clear();
     m_enabled = false;
@@ -170,13 +174,35 @@ void ObjectMap::refresh(uint64_t snap_id)
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 10) << &m_image_ctx << " refreshing object map" << dendl;
 
+  uint64_t num_objs = Striper::get_num_objects(
+    m_image_ctx.layout, m_image_ctx.get_image_size(snap_id));
+
   std::string oid(object_map_name(m_image_ctx.id, snap_id));
   int r = cls_client::object_map_load(&m_image_ctx.md_ctx, oid,
                                       &m_object_map);
+  if (r == -EINVAL) {
+    // object map is corrupt on-disk -- clear it and properly size it
+    // so future IO can keep the object map in sync
+    invalidate(snap_id, false);
+
+    librados::ObjectWriteOperation op;
+    if (snap_id == CEPH_NOSNAP) {
+      rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "",
+                                      "");
+    }
+    op.truncate(0);
+    cls_client::object_map_resize(&op, num_objs, OBJECT_NONEXISTENT);
+
+    r = m_image_ctx.md_ctx.operate(oid, &op);
+    if (r == 0) {
+      m_object_map.clear();
+      resize(num_objs, OBJECT_NONEXISTENT);
+    }
+  }
   if (r < 0) {
     lderr(cct) << "error refreshing object map: " << cpp_strerror(r)
                << dendl;
-    invalidate();
+    invalidate(snap_id, false);
     m_object_map.clear();
     return;
   }
@@ -184,12 +210,23 @@ void ObjectMap::refresh(uint64_t snap_id)
   ldout(cct, 20) << "refreshed object map: " << m_object_map.size()
                  << dendl;
 
-  uint64_t num_objs = Striper::get_num_objects(
-    m_image_ctx.layout, m_image_ctx.get_image_size(snap_id));
   if (m_object_map.size() < num_objs) {
     lderr(cct) << "object map smaller than current object count: "
                << m_object_map.size() << " != " << num_objs << dendl;
-    invalidate();
+    invalidate(snap_id, false);
+
+    // correct the size issue so future IO can keep the object map in sync
+    librados::ObjectWriteOperation op;
+    if (snap_id == CEPH_NOSNAP) {
+      rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "",
+                                      "");
+    }
+    cls_client::object_map_resize(&op, num_objs, OBJECT_NONEXISTENT);
+
+    r = m_image_ctx.md_ctx.operate(oid, &op);
+    if (r == 0) {
+      resize(num_objs, OBJECT_NONEXISTENT);
+    }
   } else if (m_object_map.size() > num_objs) {
     // resize op might have been interrupted
     ldout(cct, 1) << "object map larger than current object count: "
@@ -205,9 +242,7 @@ void ObjectMap::rollback(uint64_t snap_id) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 10) << &m_image_ctx << " rollback object map" << dendl;
 
-  uint64_t features;
-  m_image_ctx.get_features(snap_id, &features);
-  if ((features & RBD_FEATURE_OBJECT_MAP) == 0) {
+  if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0) {
     r = m_image_ctx.md_ctx.remove(oid);
     if (r < 0 && r != -ENOENT) {
       lderr(cct) << "unable to remove object map: " << cpp_strerror(r)
@@ -227,7 +262,7 @@ void ObjectMap::rollback(uint64_t snap_id) {
   if (r < 0) {
     lderr(cct) << "unable to load snapshot object map '" << snap_oid << "': "
 	       << cpp_strerror(r) << dendl;
-    invalidate();
+    invalidate(snap_id, false);
     return;
   }
 
@@ -239,15 +274,13 @@ void ObjectMap::rollback(uint64_t snap_id) {
   if (r < 0) {
     lderr(cct) << "unable to rollback object map: " << cpp_strerror(r)
 	       << dendl;
-    invalidate();
+    invalidate(CEPH_NOSNAP, true);
   }
 }
 
-void ObjectMap::snapshot(uint64_t snap_id) {
+void ObjectMap::snapshot_add(uint64_t snap_id) {
   assert(m_image_ctx.snap_lock.is_wlocked());
-  uint64_t features;
-  m_image_ctx.get_features(CEPH_NOSNAP, &features);
-  if ((features & RBD_FEATURE_OBJECT_MAP) == 0) {
+  if ((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) == 0) {
     return;
   }
 
@@ -265,7 +298,8 @@ void ObjectMap::snapshot(uint64_t snap_id) {
   if (r < 0) {
     lderr(cct) << "unable to load object map: " << cpp_strerror(r)
 	       << dendl;
-    invalidate();
+    invalidate(CEPH_NOSNAP, false);
+    return;
   }
 
   std::string snap_oid(object_map_name(m_image_ctx.id, snap_id));
@@ -273,19 +307,134 @@ void ObjectMap::snapshot(uint64_t snap_id) {
   if (r < 0) {
     lderr(cct) << "unable to snapshot object map '" << snap_oid << "': "
 	       << cpp_strerror(r) << dendl;
-    invalidate();
+    invalidate(snap_id, false);
+    return;
+  }
+
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    librados::ObjectWriteOperation op;
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+    cls_client::object_map_snap_add(&op);
+    r = m_image_ctx.md_ctx.operate(oid, &op);
+    if (r < 0) {
+      lderr(cct) << "unable to snapshot object map: " << cpp_strerror(r)
+                 << dendl;
+      invalidate(CEPH_NOSNAP, true);
+      return;
+    }
+
+    for (uint64_t i = 0; i < m_object_map.size(); ++i) {
+      if (m_object_map[i] == OBJECT_EXISTS) {
+        m_object_map[i] = OBJECT_EXISTS_CLEAN;
+      }
+    }
   }
 }
 
+int ObjectMap::snapshot_remove(uint64_t snap_id) {
+  assert(m_image_ctx.snap_lock.is_wlocked());
+  assert(snap_id != CEPH_NOSNAP);
+  CephContext *cct = m_image_ctx.cct;
+
+  int r;
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    RWLock::WLocker l(m_image_ctx.object_map_lock);
+
+    uint64_t next_snap_id = CEPH_NOSNAP;
+    std::map<librados::snap_t, SnapInfo>::const_iterator it =
+      m_image_ctx.snap_info.find(snap_id);
+    assert(it != m_image_ctx.snap_info.end());
+
+    ++it;
+    if (it != m_image_ctx.snap_info.end()) {
+      next_snap_id = it->first;
+    }
+
+    ceph::BitVector<2> snap_object_map;
+    std::string snap_oid(object_map_name(m_image_ctx.id, snap_id));
+    r = cls_client::object_map_load(&m_image_ctx.md_ctx, snap_oid,
+                                    &snap_object_map);
+    if (r < 0) {
+      lderr(cct) << "error loading snapshot object map: " << cpp_strerror(r)
+                 << dendl;
+    }
+
+    if (r == 0) {
+      uint64_t flags;
+      m_image_ctx.get_flags(snap_id, &flags);
+      if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
+        invalidate(next_snap_id, true);
+        r = -EINVAL;
+      }
+    }
+
+    if (r == 0) {
+      std::string oid(object_map_name(m_image_ctx.id, next_snap_id));
+      librados::ObjectWriteOperation op;
+      if (next_snap_id == CEPH_NOSNAP) {
+        rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "",
+                                        "");
+      }
+      cls_client::object_map_snap_remove(&op, snap_object_map);
+
+      r = m_image_ctx.md_ctx.operate(oid, &op);
+      if (r < 0) {
+        lderr(cct) << "unable to remove object map snapshot: "
+                   << cpp_strerror(r) << dendl;
+        invalidate(next_snap_id, true);
+      }
+    }
+
+    if (r == 0 && next_snap_id == CEPH_NOSNAP) {
+      for (uint64_t i = 0; i < m_object_map.size(); ++i) {
+        if (m_object_map[i] == OBJECT_EXISTS_CLEAN &&
+            (i >= snap_object_map.size() ||
+             snap_object_map[i] == OBJECT_EXISTS)) {
+          m_object_map[i] = OBJECT_EXISTS;
+        }
+      }
+    }
+  }
+
+  std::string oid(object_map_name(m_image_ctx.id, snap_id));
+  r = m_image_ctx.md_ctx.remove(oid);
+  if (r < 0 && r != -ENOENT) {
+    return r;
+  }
+  return 0;
+}
+
+void ObjectMap::aio_save(Context *on_finish)
+{
+  assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP));
+  assert(m_image_ctx.owner_lock.is_locked());
+  RWLock::RLocker object_map_locker(m_image_ctx.object_map_lock);
+
+  librados::ObjectWriteOperation op;
+  if (m_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
+  cls_client::object_map_save(&op, m_object_map);
+
+  std::string oid(object_map_name(m_image_ctx.id, m_snap_id));
+  librados::AioCompletion *comp = librados::Rados::aio_create_completion(
+    on_finish, NULL, rados_ctx_cb);
+
+  int r = m_image_ctx.md_ctx.aio_operate(oid, comp, &op);
+  assert(r == 0);
+  comp->release();
+}
+
 void ObjectMap::aio_resize(uint64_t new_size, uint8_t default_object_state,
 			   Context *on_finish) {
   assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP));
   assert(m_image_ctx.owner_lock.is_locked());
   assert(m_image_ctx.image_watcher != NULL);
-  assert(m_image_ctx.image_watcher->is_lock_owner());
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
 
   ResizeRequest *req = new ResizeRequest(
-    m_image_ctx, new_size, default_object_state, on_finish);
+    m_image_ctx, m_snap_id, new_size, default_object_state, on_finish);
   req->send();
 }
 
@@ -306,46 +455,69 @@ bool ObjectMap::aio_update(uint64_t start_object_no, uint64_t end_object_no,
   assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
   assert(m_image_ctx.owner_lock.is_locked());
   assert(m_image_ctx.image_watcher != NULL);
-  assert(m_image_ctx.image_watcher->is_lock_owner());
+  assert(!m_image_ctx.image_watcher->is_lock_supported(m_image_ctx.snap_lock) ||
+         m_image_ctx.image_watcher->is_lock_owner());
   assert(m_image_ctx.object_map_lock.is_wlocked());
   assert(start_object_no < end_object_no);
-  
+
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << &m_image_ctx << " aio_update: start=" << start_object_no
-		 << ", end=" << end_object_no << ", new_state="
-		 << static_cast<uint32_t>(new_state) << dendl;
+		 << ", end=" << end_object_no << ", "
+                 << (current_state ?
+                       stringify(static_cast<uint32_t>(*current_state)) : "")
+		 << "->" << static_cast<uint32_t>(new_state) << dendl;
   if (end_object_no > m_object_map.size()) {
     ldout(cct, 20) << "skipping update of invalid object map" << dendl;
     return false;
   }
-  
+
   for (uint64_t object_no = start_object_no; object_no < end_object_no;
        ++object_no) {
-    if ((!current_state || m_object_map[object_no] == *current_state) &&
-        m_object_map[object_no] != new_state) {
-      UpdateRequest *req = new UpdateRequest(m_image_ctx, start_object_no,
-					     end_object_no, new_state,
-					     current_state, on_finish);
-      req->send();
+    uint8_t state = m_object_map[object_no];
+    if ((!current_state || state == *current_state ||
+          (*current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) &&
+        state != new_state) {
+      aio_update(m_snap_id, start_object_no, end_object_no, new_state,
+                 current_state, on_finish);
       return true;
     }
   }
   return false;
 }
 
-void ObjectMap::invalidate() {
+void ObjectMap::aio_update(uint64_t snap_id, uint64_t start_object_no,
+                           uint64_t end_object_no, uint8_t new_state,
+                           const boost::optional<uint8_t> &current_state,
+                           Context *on_finish) {
+  UpdateRequest *req = new UpdateRequest(m_image_ctx, snap_id,
+                                         start_object_no, end_object_no,
+                                         new_state, current_state,
+                                         on_finish);
+  req->send();
+}
+
+void ObjectMap::invalidate(uint64_t snap_id, bool force) {
   assert(m_image_ctx.snap_lock.is_wlocked());
   assert(m_image_ctx.object_map_lock.is_wlocked());
   uint64_t flags;
-  m_image_ctx.get_flags(m_image_ctx.snap_id, &flags);
+  m_image_ctx.get_flags(snap_id, &flags);
   if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
     return;
   }
 
+  flags = RBD_FLAG_OBJECT_MAP_INVALID;
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    flags |= RBD_FLAG_FAST_DIFF_INVALID;
+  }
+
   CephContext *cct = m_image_ctx.cct;
   lderr(cct) << &m_image_ctx << " invalidating object map" << dendl;
-  m_image_ctx.update_flags(m_image_ctx.snap_id, RBD_FLAG_OBJECT_MAP_INVALID,
-                           true);
+  int r = m_image_ctx.update_flags(snap_id, flags, true);
+  if (r < 0) {
+    lderr(cct) << "failed to invalidate in-memory object map: "
+               << cpp_strerror(r) << dendl;
+    return;
+  }
 
   // do not update on-disk flags if not image owner
   if (m_image_ctx.image_watcher == NULL ||
@@ -355,13 +527,12 @@ void ObjectMap::invalidate() {
   }
 
   librados::ObjectWriteOperation op;
-  if (m_image_ctx.snap_id == CEPH_NOSNAP) {
+  if (snap_id == CEPH_NOSNAP && !force) {
     m_image_ctx.image_watcher->assert_header_locked(&op);
   }
-  cls_client::set_flags(&op, m_image_ctx.snap_id, m_image_ctx.flags,
-                        RBD_FLAG_OBJECT_MAP_INVALID);
+  cls_client::set_flags(&op, snap_id, flags, flags);
 
-  int r = m_image_ctx.md_ctx.operate(m_image_ctx.header_oid, &op);
+  r = m_image_ctx.md_ctx.operate(m_image_ctx.header_oid, &op);
   if (r == -EBUSY) {
     ldout(cct, 5) << "skipping on-disk object map invalidation: "
                   << "image not locked by client" << dendl;
@@ -371,6 +542,15 @@ void ObjectMap::invalidate() {
   }
 }
 
+void ObjectMap::resize(uint64_t num_objs, uint8_t defualt_state) {
+  size_t orig_object_map_size = m_object_map.size();
+  m_object_map.resize(num_objs);
+  for (uint64_t i = orig_object_map_size;
+       i < m_object_map.size(); ++i) {
+    m_object_map[i] = defualt_state;
+  }
+}
+
 bool ObjectMap::Request::should_complete(int r) {
   CephContext *cct = m_image_ctx.cct;
   ldout(cct, 20) << &m_image_ctx << " should_complete: r=" << r << dendl;
@@ -380,7 +560,7 @@ bool ObjectMap::Request::should_complete(int r) {
   case STATE_REQUEST:
     if (r == -EBUSY) {
       lderr(cct) << "object map lock not owned by client" << dendl;
-      return true;
+      return invalidate();
     } else if (r < 0) {
       lderr(cct) << "failed to update object map: " << cpp_strerror(r)
 		 << dendl;
@@ -420,14 +600,17 @@ bool ObjectMap::Request::invalidate() {
   // requests shouldn't be running while using snapshots
   assert(m_image_ctx.snap_id == CEPH_NOSNAP);
 
+  uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID;
+  if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) {
+    flags |= RBD_FLAG_FAST_DIFF_INVALID;
+  }
+
   lderr(cct) << &m_image_ctx << " invalidating object map" << dendl;
   m_state = STATE_INVALIDATE;
-  m_image_ctx.flags |= RBD_FLAG_OBJECT_MAP_INVALID;
+  m_image_ctx.flags |= flags;
 
   librados::ObjectWriteOperation op;
-  m_image_ctx.image_watcher->assert_header_locked(&op);
-  cls_client::set_flags(&op, CEPH_NOSNAP, m_image_ctx.flags,
-                        RBD_FLAG_OBJECT_MAP_INVALID);
+  cls_client::set_flags(&op, CEPH_NOSNAP, flags, flags);
 
   librados::AioCompletion *rados_completion = create_callback_completion();
   int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
@@ -447,11 +630,13 @@ void ObjectMap::ResizeRequest::send() {
 		<< m_num_objs << dendl;
 
   librados::ObjectWriteOperation op;
-  rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  if (m_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
   cls_client::object_map_resize(&op, m_num_objs, m_default_object_state);
 
   librados::AioCompletion *rados_completion = create_callback_completion();
-  std::string oid(object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  std::string oid(object_map_name(m_image_ctx.id, m_snap_id));
   int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
   assert(r == 0);
   rados_completion->release();
@@ -462,48 +647,56 @@ void ObjectMap::ResizeRequest::finish(ObjectMap *object_map) {
 
   ldout(cct, 5) << &m_image_ctx << " resizing in-memory object map: "
 		<< m_num_objs << dendl;
-  size_t orig_object_map_size = object_map->m_object_map.size();
-  object_map->m_object_map.resize(m_num_objs);
-  for (uint64_t i = orig_object_map_size;
-       i < object_map->m_object_map.size(); ++i) {
-    object_map->m_object_map[i] = m_default_object_state;
-  }
+  object_map->resize(m_num_objs, m_default_object_state);
 }
 
 void ObjectMap::UpdateRequest::send() {
+  assert(m_image_ctx.object_map_lock.is_locked());
   CephContext *cct = m_image_ctx.cct;
 
-  ldout(cct, 20) << &m_image_ctx << " updating on-disk object map: ["
+  // safe to update in-memory state first without handling rollback since any
+  // failures will invalidate the object map
+  ldout(cct, 20) << &m_image_ctx << " updating object map"
+                 << (m_snap_id != CEPH_NOSNAP ?
+                       " snap " + stringify(m_snap_id) : std::string())
+                 << ": ["
 		 << m_start_object_no << "," << m_end_object_no << ") = "
 		 << (m_current_state ?
 		       stringify(static_cast<uint32_t>(*m_current_state)) : "")
 		 << "->" << static_cast<uint32_t>(m_new_state)
 		 << dendl;
-  
+
+  ObjectMap& object_map = m_image_ctx.object_map;
+  if (m_snap_id == object_map.m_snap_id) {
+    assert(m_image_ctx.object_map_lock.is_wlocked());
+    for (uint64_t object_no = m_start_object_no;
+         object_no < MIN(m_end_object_no, object_map.m_object_map.size());
+         ++object_no) {
+      uint8_t state = object_map.m_object_map[object_no];
+      if (!m_current_state || state == *m_current_state ||
+          (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) {
+        object_map.m_object_map[object_no] = m_new_state;
+      }
+    }
+  }
+
   librados::ObjectWriteOperation op;
-  rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  if (m_snap_id == CEPH_NOSNAP) {
+    rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, LOCK_EXCLUSIVE, "", "");
+  }
   cls_client::object_map_update(&op, m_start_object_no, m_end_object_no,
 				m_new_state, m_current_state);
 
   librados::AioCompletion *rados_completion = create_callback_completion();
-  std::string oid(object_map_name(m_image_ctx.id, CEPH_NOSNAP));
+  std::string oid(object_map_name(m_image_ctx.id, m_snap_id));
   int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
   assert(r == 0);
   rados_completion->release();
 }
 
 void ObjectMap::UpdateRequest::finish(ObjectMap *object_map) {
-  CephContext *cct = m_image_ctx.cct;
-
-  ldout(cct, 20) << &m_image_ctx << " updating in-memory object map" << dendl;
-  for (uint64_t object_no = m_start_object_no;
-       object_no < MIN(m_end_object_no, object_map->m_object_map.size());
-       ++object_no) {
-    if (!m_current_state ||
-	object_map->m_object_map[object_no] == *m_current_state) {
-      object_map->m_object_map[object_no] = m_new_state;
-    }
-  }
+  ldout(m_image_ctx.cct, 20) << &m_image_ctx << " on-disk object map updated"
+                             << dendl;
 }
 
 } // namespace librbd
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
index 4104636..797307f 100644
--- a/src/librbd/ObjectMap.h
+++ b/src/librbd/ObjectMap.h
@@ -24,13 +24,18 @@ public:
   static std::string object_map_name(const std::string &image_id,
 				     uint64_t snap_id);
 
+  ceph::BitVector<2u>::Reference operator[](uint64_t object_no);
   uint8_t operator[](uint64_t object_no) const;
+  inline uint64_t size() const {
+    return m_object_map.size();
+  }
 
   int lock();
   int unlock();
 
   bool object_may_exist(uint64_t object_no) const;
 
+  void aio_save(Context *on_finish);
   void aio_resize(uint64_t new_size, uint8_t default_object_state,
 		  Context *on_finish);
   bool aio_update(uint64_t object_no, uint8_t new_state,
@@ -41,22 +46,30 @@ public:
 		  const boost::optional<uint8_t> &current_state,
 		  Context *on_finish);
 
+  void aio_update(uint64_t snap_id, uint64_t start_object_no,
+                  uint64_t end_object_no, uint8_t new_state,
+                  const boost::optional<uint8_t> &current_state,
+                  Context *on_finish);
+
   void refresh(uint64_t snap_id);
   void rollback(uint64_t snap_id);
-  void snapshot(uint64_t snap_id);
+  void snapshot_add(uint64_t snap_id);
+  int snapshot_remove(uint64_t snap_id);
 
   bool enabled() const;
 
 private:
 
-  class Request : public AsyncRequest {
+  class Request : public AsyncRequest<> {
   public:
-    Request(ImageCtx &image_ctx, Context *on_finish)
-      : AsyncRequest(image_ctx, on_finish), m_state(STATE_REQUEST)
+    Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish)
+      : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id),
+        m_state(STATE_REQUEST)
     {
     }
 
   protected:
+    const uint64_t m_snap_id;
 
     virtual bool safely_cancel(int r) {
       return false;
@@ -67,7 +80,6 @@ private:
       return 0;
     }
     virtual void finish(ObjectMap *object_map) = 0;
-
   private:
     /**
      * <start> ---> STATE_REQUEST ---> <finish>
@@ -87,10 +99,10 @@ private:
 
   class ResizeRequest : public Request {
   public:
-    ResizeRequest(ImageCtx &image_ctx, uint64_t new_size,
+    ResizeRequest(ImageCtx &image_ctx, uint64_t snap_id, uint64_t new_size,
 		  uint8_t default_object_state, Context *on_finish)
-      : Request(image_ctx, on_finish), m_num_objs(0), m_new_size(new_size),
-        m_default_object_state(default_object_state)
+      : Request(image_ctx, snap_id, on_finish), m_num_objs(0),
+        m_new_size(new_size), m_default_object_state(default_object_state)
     {
     }
 
@@ -105,13 +117,14 @@ private:
 
   class UpdateRequest : public Request {
   public:
-    UpdateRequest(ImageCtx &image_ctx, uint64_t start_object_no,
-		  uint64_t end_object_no, uint8_t new_state,
+    UpdateRequest(ImageCtx &image_ctx, uint64_t snap_id,
+                  uint64_t start_object_no, uint64_t end_object_no,
+                  uint8_t new_state,
                   const boost::optional<uint8_t> &current_state,
 		  Context *on_finish)
-      : Request(image_ctx, on_finish), m_start_object_no(start_object_no),
-	m_end_object_no(end_object_no), m_new_state(new_state),
-	m_current_state(current_state)
+      : Request(image_ctx, snap_id, on_finish),
+        m_start_object_no(start_object_no), m_end_object_no(end_object_no),
+        m_new_state(new_state), m_current_state(current_state)
     {
     }
 
@@ -126,13 +139,12 @@ private:
   };
 
   ImageCtx &m_image_ctx;
-
   ceph::BitVector<2> m_object_map;
-
+  uint64_t m_snap_id;
   bool m_enabled;
 
-  void invalidate();
-
+  void invalidate(uint64_t snap_id, bool force);
+  void resize(uint64_t num_objs, uint8_t default_state);
 };
 
 } // namespace librbd
diff --git a/src/librbd/RebuildObjectMapRequest.cc b/src/librbd/RebuildObjectMapRequest.cc
new file mode 100644
index 0000000..8a3b29f
--- /dev/null
+++ b/src/librbd/RebuildObjectMapRequest.cc
@@ -0,0 +1,361 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/RebuildObjectMapRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/AsyncResizeRequest.h"
+#include "librbd/AsyncTrimRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::RebuildObjectMapRequest: "
+
+namespace librbd {
+
+namespace {
+
+class C_VerifyObject : public C_AsyncObjectThrottle<> {
+public:
+  C_VerifyObject(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
+                 uint64_t snap_id, uint64_t object_no)
+    : C_AsyncObjectThrottle(throttle, *image_ctx), m_snap_id(snap_id),
+      m_object_no(object_no), m_oid(m_image_ctx.get_object_name(m_object_no))
+  {
+    m_io_ctx.dup(m_image_ctx.md_ctx);
+    m_io_ctx.snap_set_read(CEPH_SNAPDIR);
+  }
+
+  virtual void complete(int r) {
+    if (should_complete(r)) {
+      ldout(m_image_ctx.cct, 20) << m_oid << " C_VerifyObject completed "
+                                 << dendl;
+      finish(r);
+      delete this;
+    }
+  }
+
+  virtual int send() {
+    send_list_snaps();
+    return 0;
+  }
+
+private:
+  librados::IoCtx m_io_ctx;
+  uint64_t m_snap_id;
+  uint64_t m_object_no;
+  std::string m_oid;
+
+  librados::snap_set_t m_snap_set;
+  int m_snap_list_ret;
+
+  bool should_complete(int r) {
+    CephContext *cct = m_image_ctx.cct;
+    if (r == 0) {
+      r = m_snap_list_ret;
+    }
+    if (r < 0 && r != -ENOENT) {
+      lderr(cct) << m_oid << " C_VerifyObject::should_complete: "
+                 << "encountered an error: " << cpp_strerror(r) << dendl;
+      return true;
+    }
+
+    ldout(cct, 20) << m_oid << " C_VerifyObject::should_complete: " << " r="
+                   << r << dendl;
+    return update_object_map(get_object_state());
+  }
+
+  void send_list_snaps() {
+    assert(m_image_ctx.owner_lock.is_locked());
+    ldout(m_image_ctx.cct, 5) << m_oid << " C_VerifyObject::send_list_snaps"
+                              << dendl;
+
+    librados::AioCompletion *comp = librados::Rados::aio_create_completion(
+      this, NULL, rados_ctx_cb);
+
+    librados::ObjectReadOperation op;
+    op.list_snaps(&m_snap_set, &m_snap_list_ret);
+
+    int r = m_io_ctx.aio_operate(m_oid, comp, &op, NULL);
+    assert(r == 0);
+    comp->release();
+  }
+
+  uint8_t get_object_state() {
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    for (std::vector<librados::clone_info_t>::const_iterator r =
+           m_snap_set.clones.begin(); r != m_snap_set.clones.end(); ++r) {
+      librados::snap_t from_snap_id;
+      librados::snap_t to_snap_id;
+      if (r->cloneid == librados::SNAP_HEAD) {
+        from_snap_id = next_valid_snap_id(m_snap_set.seq + 1);
+        to_snap_id = librados::SNAP_HEAD;
+      } else {
+        from_snap_id = next_valid_snap_id(r->snaps[0]);
+        to_snap_id = r->snaps[r->snaps.size()-1];
+      }
+
+      if (to_snap_id < m_snap_id) {
+        continue;
+      } else if (m_snap_id < from_snap_id) {
+        break;
+      }
+
+      if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 &&
+          from_snap_id != m_snap_id) {
+        return OBJECT_EXISTS_CLEAN;
+      }
+      return OBJECT_EXISTS;
+    }
+    return OBJECT_NONEXISTENT;
+  }
+
+  uint64_t next_valid_snap_id(uint64_t snap_id) {
+    assert(m_image_ctx.snap_lock.is_locked());
+
+    std::map<librados::snap_t, SnapInfo>::iterator it =
+      m_image_ctx.snap_info.lower_bound(snap_id);
+    if (it == m_image_ctx.snap_info.end()) {
+      return CEPH_NOSNAP;
+    }
+    return it->first;
+  }
+
+  bool update_object_map(uint8_t new_state) {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    CephContext *cct = m_image_ctx.cct;
+
+    // should have been canceled prior to releasing lock
+    assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+           m_image_ctx.image_watcher->is_lock_owner());
+
+    RWLock::WLocker l(m_image_ctx.object_map_lock);
+    uint8_t state = m_image_ctx.object_map[m_object_no];
+    if (state == OBJECT_EXISTS && new_state == OBJECT_NONEXISTENT &&
+        m_snap_id == CEPH_NOSNAP) {
+      // might be writing object to OSD concurrently
+      new_state = state;
+    }
+
+    if (new_state != state) {
+      ldout(cct, 15) << m_oid << " C_VerifyObject::update_object_map "
+                     << static_cast<uint32_t>(state) << "->"
+                     << static_cast<uint32_t>(new_state) << dendl;
+      m_image_ctx.object_map[m_object_no] = new_state;
+    }
+    return true;
+  }
+};
+
+} // anonymous namespace
+
+
+void RebuildObjectMapRequest::send() {
+  send_resize_object_map();
+}
+
+bool RebuildObjectMapRequest::should_complete(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl;
+
+  switch (m_state) {
+  case STATE_RESIZE_OBJECT_MAP:
+    ldout(cct, 5) << "RESIZE_OBJECT_MAP" << dendl;
+    if (r == -ESTALE && !m_attempted_trim) {
+      // objects are still flagged as in-use -- delete them
+      m_attempted_trim = true;
+      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+      send_trim_image();
+      return false;
+    } else if (r == 0) {
+      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+      send_verify_objects();
+    }
+    break;
+
+  case STATE_TRIM_IMAGE:
+    ldout(cct, 5) << "TRIM_IMAGE" << dendl;
+    if (r == 0) {
+      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+      send_resize_object_map();
+    }
+    break;
+
+  case STATE_VERIFY_OBJECTS:
+    ldout(cct, 5) << "VERIFY_OBJECTS" << dendl;
+    if (r == 0) {
+      assert(m_image_ctx.owner_lock.is_locked());
+      send_save_object_map();
+    }
+    break;
+
+  case STATE_SAVE_OBJECT_MAP:
+    ldout(cct, 5) << "SAVE_OBJECT_MAP" << dendl;
+    if (r == 0) {
+      RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+      send_update_header();
+    }
+    break;
+  case STATE_UPDATE_HEADER:
+    ldout(cct, 5) << "UPDATE_HEADER" << dendl;
+    if (r == 0) {
+      return true;
+    }
+    break;
+
+  default:
+    assert(false);
+    break;
+  }
+
+  if (r < 0) {
+    lderr(cct) << "rebuild object map encountered an error: " << cpp_strerror(r)
+               << dendl;
+    return true;
+  }
+  return false;
+}
+
+void RebuildObjectMapRequest::send_resize_object_map() {
+  assert(m_image_ctx.owner_lock.is_locked());
+  CephContext *cct = m_image_ctx.cct;
+
+  uint64_t num_objects;
+  uint64_t size;
+  {
+    RWLock::RLocker l(m_image_ctx.snap_lock);
+    size = get_image_size();
+    num_objects = Striper::get_num_objects(m_image_ctx.layout, size);
+  }
+
+  if (m_image_ctx.object_map.size() == num_objects) {
+    send_verify_objects();
+    return;
+  }
+
+  ldout(cct, 5) << this << " send_resize_object_map" << dendl;
+  m_state = STATE_RESIZE_OBJECT_MAP;
+
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+  m_image_ctx.object_map.aio_resize(size, OBJECT_NONEXISTENT,
+                                    create_callback_context());
+}
+
+void RebuildObjectMapRequest::send_trim_image() {
+  CephContext *cct = m_image_ctx.cct;
+
+  RWLock::RLocker l(m_image_ctx.owner_lock);
+
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+  ldout(cct, 5) << this << " send_trim_image" << dendl;
+  m_state = STATE_TRIM_IMAGE;
+
+  uint64_t new_size;
+  uint64_t orig_size;
+  {
+    RWLock::RLocker l(m_image_ctx.snap_lock);
+    new_size = get_image_size();
+    orig_size = m_image_ctx.get_object_size() *
+                m_image_ctx.object_map.size();
+  }
+  AsyncTrimRequest *req = new AsyncTrimRequest(m_image_ctx,
+                                               create_callback_context(),
+                                               orig_size, new_size,
+                                               m_prog_ctx);
+  req->send();
+}
+
+void RebuildObjectMapRequest::send_verify_objects() {
+  assert(m_image_ctx.owner_lock.is_locked());
+  CephContext *cct = m_image_ctx.cct;
+
+  uint64_t snap_id;
+  uint64_t num_objects;
+  {
+    RWLock::RLocker l(m_image_ctx.snap_lock);
+    snap_id = m_image_ctx.snap_id;
+    num_objects = Striper::get_num_objects(m_image_ctx.layout,
+                                           m_image_ctx.get_image_size(snap_id));
+  }
+
+  if (num_objects == 0) {
+    send_save_object_map();
+    return;
+  }
+
+  m_state = STATE_VERIFY_OBJECTS;
+  ldout(cct, 5) << this << " send_verify_objects" << dendl;
+
+  AsyncObjectThrottle<>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_VerifyObject>(),
+      boost::lambda::_1, &m_image_ctx, snap_id, boost::lambda::_2));
+  AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
+    this, m_image_ctx, context_factory, create_callback_context(), &m_prog_ctx,
+    0, num_objects);
+  throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
+}
+
+void RebuildObjectMapRequest::send_save_object_map() {
+  assert(m_image_ctx.owner_lock.is_locked());
+  CephContext *cct = m_image_ctx.cct;
+
+  ldout(cct, 5) << this << " send_save_object_map" << dendl;
+  m_state = STATE_SAVE_OBJECT_MAP;
+
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+  m_image_ctx.object_map.aio_save(create_callback_context());
+}
+
+void RebuildObjectMapRequest::send_update_header() {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  // should have been canceled prior to releasing lock
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+         m_image_ctx.image_watcher->is_lock_owner());
+
+  ldout(m_image_ctx.cct, 5) << this << " send_update_header" << dendl;
+  m_state = STATE_UPDATE_HEADER;
+
+  librados::ObjectWriteOperation op;
+  if (m_image_ctx.image_watcher->is_lock_supported()) {
+    m_image_ctx.image_watcher->assert_header_locked(&op);
+  }
+
+  uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID | RBD_FLAG_FAST_DIFF_INVALID;
+  cls_client::set_flags(&op, m_image_ctx.snap_id, 0, flags);
+
+  librados::AioCompletion *comp = create_callback_completion();
+  int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op);
+  assert(r == 0);
+  comp->release();
+
+  RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+  m_image_ctx.update_flags(m_image_ctx.snap_id, flags, false);
+}
+
+uint64_t RebuildObjectMapRequest::get_image_size() const {
+  assert(m_image_ctx.snap_lock.is_locked());
+  if (m_image_ctx.snap_id == CEPH_NOSNAP) {
+    if (!m_image_ctx.async_resize_reqs.empty()) {
+      return m_image_ctx.async_resize_reqs.front()->get_image_size();
+    } else {
+      return m_image_ctx.size;
+    }
+  }
+  return  m_image_ctx.get_image_size(m_image_ctx.snap_id);
+}
+
+} // namespace librbd
diff --git a/src/librbd/RebuildObjectMapRequest.h b/src/librbd/RebuildObjectMapRequest.h
new file mode 100644
index 0000000..02a41ef
--- /dev/null
+++ b/src/librbd/RebuildObjectMapRequest.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_REBUILD_OBJECT_MAP_REQUEST_H
+#define CEPH_LIBRBD_REBUILD_OBJECT_MAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+class RebuildObjectMapRequest : public AsyncRequest<> {
+public:
+
+  RebuildObjectMapRequest(ImageCtx &image_ctx, Context *on_finish,
+                          ProgressContext &prog_ctx)
+    : AsyncRequest(image_ctx, on_finish), m_image_ctx(image_ctx),
+      m_prog_ctx(prog_ctx), m_attempted_trim(false)
+  {
+  }
+
+  virtual void send();
+
+protected:
+  virtual bool should_complete(int r);
+
+private:
+  /**
+   * Rebuild object map goes through the following state machine to
+   * verify per-object state:
+   *
+   * <start>
+   *  .   |               . . . . . . . . . .
+   *  .   |               .                 .
+   *  .   v               v                 .
+   *  . STATE_RESIZE_OBJECT_MAP . . . > STATE_TRIM_IMAGE
+   *  .          |
+   *  .          v
+   *  . . . > STATE_VERIFY_OBJECTS
+   *             |
+   *             v
+   *          STATE_SAVE_OBJECT_MAP
+   *             |
+   *             v
+   *          STATE_UPDATE_HEADER
+   *
+   * The _RESIZE_OBJECT_MAP state will be skipped if the object map
+   * is appropriately sized for the image. The _TRIM_IMAGE state will
+   * only be hit if the resize failed due to an in-use object.
+   */
+  enum State {
+    STATE_RESIZE_OBJECT_MAP,
+    STATE_TRIM_IMAGE,
+    STATE_VERIFY_OBJECTS,
+    STATE_SAVE_OBJECT_MAP,
+    STATE_UPDATE_HEADER
+  };
+
+  ImageCtx &m_image_ctx;
+  ProgressContext &m_prog_ctx;
+  State m_state;
+  bool m_attempted_trim;
+
+  void send_resize_object_map();
+  void send_trim_image();
+  void send_verify_objects();
+  void send_save_object_map();
+  void send_update_header();
+
+  uint64_t get_image_size() const;
+
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_REBUILD_OBJECT_MAP_REQUEST_H
diff --git a/src/librbd/SnapInfo.h b/src/librbd/SnapInfo.h
index a3307ff..4a225a4 100644
--- a/src/librbd/SnapInfo.h
+++ b/src/librbd/SnapInfo.h
@@ -15,13 +15,12 @@ namespace librbd {
   struct SnapInfo {
     std::string name;
     uint64_t size;
-    uint64_t features;
     parent_info parent;
     uint8_t protection_status;
     uint64_t flags;
-    SnapInfo(std::string _name, uint64_t _size, uint64_t _features,
-	     parent_info _parent, uint8_t _protection_status, uint64_t _flags)
-      : name(_name), size(_size), features(_features), parent(_parent),
+    SnapInfo(std::string _name, uint64_t _size, parent_info _parent,
+             uint8_t _protection_status, uint64_t _flags)
+      : name(_name), size(_size), parent(_parent),
 	protection_status(_protection_status), flags(_flags) {}
   };
 }
diff --git a/src/librbd/TaskFinisher.h b/src/librbd/TaskFinisher.h
index 4942a38..43ec517 100644
--- a/src/librbd/TaskFinisher.h
+++ b/src/librbd/TaskFinisher.h
@@ -89,7 +89,7 @@ public:
         return false;
       }
     }
-    m_task_contexts[task] = std::make_pair(ctx, reinterpret_cast<Context *>(NULL));
+    m_task_contexts[task] = std::make_pair(ctx, reinterpret_cast<Context *>(0));
 
     m_finisher->queue(new C_Task(this, task));
     return true;
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
index e7dde46..6785864 100644
--- a/src/librbd/WatchNotifyTypes.cc
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -241,6 +241,36 @@ void SnapCreatePayload::dump(Formatter *f) const {
   f->dump_string("snap_name", snap_name);
 }
 
+void SnapRemovePayload::encode(bufferlist &bl) const {
+  ::encode(static_cast<uint32_t>(NOTIFY_OP_SNAP_REMOVE), bl);
+  ::encode(snap_name, bl);
+}
+
+void SnapRemovePayload::decode(__u8 version, bufferlist::iterator &iter) {
+  ::decode(snap_name, iter);
+}
+
+void SnapRemovePayload::dump(Formatter *f) const {
+  f->dump_string("notify_op", stringify(NOTIFY_OP_SNAP_REMOVE));
+  f->dump_string("snap_name", snap_name);
+}
+
+void RebuildObjectMapPayload::encode(bufferlist &bl) const {
+  ::encode(static_cast<uint32_t>(NOTIFY_OP_REBUILD_OBJECT_MAP), bl);
+  ::encode(async_request_id, bl);
+}
+
+void RebuildObjectMapPayload::decode(__u8 version, bufferlist::iterator &iter) {
+  ::decode(async_request_id, iter);
+}
+
+void RebuildObjectMapPayload::dump(Formatter *f) const {
+  f->dump_string("notify_op", stringify(NOTIFY_OP_REBUILD_OBJECT_MAP));
+  f->open_object_section("async_request_id");
+  async_request_id.dump(f);
+  f->close_section();
+}
+
 void UnknownPayload::encode(bufferlist &bl) const {
   assert(false);
 }
@@ -292,6 +322,12 @@ void NotifyMessage::decode(bufferlist::iterator& iter) {
   case NOTIFY_OP_SNAP_CREATE:
     payload = SnapCreatePayload();
     break;
+  case NOTIFY_OP_SNAP_REMOVE:
+    payload = SnapRemovePayload();
+    break;
+  case NOTIFY_OP_REBUILD_OBJECT_MAP:
+    payload = RebuildObjectMapPayload();
+    break;
   default:
     payload = UnknownPayload();
     break;
@@ -315,6 +351,8 @@ void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
   o.push_back(new NotifyMessage(FlattenPayload(AsyncRequestId(ClientId(0, 1), 2))));
   o.push_back(new NotifyMessage(ResizePayload(123, AsyncRequestId(ClientId(0, 1), 2))));
   o.push_back(new NotifyMessage(SnapCreatePayload("foo")));
+  o.push_back(new NotifyMessage(SnapRemovePayload("foo")));
+  o.push_back(new NotifyMessage(RebuildObjectMapPayload(AsyncRequestId(ClientId(0, 1), 2))));
 }
 
 void ResponseMessage::encode(bufferlist& bl) const {
@@ -372,6 +410,12 @@ std::ostream &operator<<(std::ostream &out,
   case NOTIFY_OP_SNAP_CREATE:
     out << "SnapCreate";
     break;
+  case NOTIFY_OP_SNAP_REMOVE:
+    out << "SnapRemove";
+    break;
+  case NOTIFY_OP_REBUILD_OBJECT_MAP:
+    out << "RebuildObjectMap";
+    break;
   default:
     out << "Unknown (" << static_cast<uint32_t>(op) << ")";
     break;
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
index 270f25d..94a2836 100644
--- a/src/librbd/WatchNotifyTypes.h
+++ b/src/librbd/WatchNotifyTypes.h
@@ -6,7 +6,7 @@
 #include "include/int_types.h"
 #include "include/buffer.h"
 #include "include/encoding.h"
-#include <iostream>
+#include <iosfwd>
 #include <list>
 #include <string>
 #include <boost/variant.hpp>
@@ -73,15 +73,17 @@ struct AsyncRequestId {
 };
 
 enum NotifyOp {
-  NOTIFY_OP_ACQUIRED_LOCK  = 0,
-  NOTIFY_OP_RELEASED_LOCK  = 1,
-  NOTIFY_OP_REQUEST_LOCK   = 2,
-  NOTIFY_OP_HEADER_UPDATE  = 3,
-  NOTIFY_OP_ASYNC_PROGRESS = 4,
-  NOTIFY_OP_ASYNC_COMPLETE = 5,
-  NOTIFY_OP_FLATTEN        = 6,
-  NOTIFY_OP_RESIZE         = 7,
-  NOTIFY_OP_SNAP_CREATE    = 8
+  NOTIFY_OP_ACQUIRED_LOCK      = 0,
+  NOTIFY_OP_RELEASED_LOCK      = 1,
+  NOTIFY_OP_REQUEST_LOCK       = 2,
+  NOTIFY_OP_HEADER_UPDATE      = 3,
+  NOTIFY_OP_ASYNC_PROGRESS     = 4,
+  NOTIFY_OP_ASYNC_COMPLETE     = 5,
+  NOTIFY_OP_FLATTEN            = 6,
+  NOTIFY_OP_RESIZE             = 7,
+  NOTIFY_OP_SNAP_CREATE        = 8,
+  NOTIFY_OP_SNAP_REMOVE        = 9,
+  NOTIFY_OP_REBUILD_OBJECT_MAP = 10
 };
 
 struct AcquiredLockPayload {
@@ -179,7 +181,29 @@ struct SnapCreatePayload {
   SnapCreatePayload(const std::string &name) : snap_name(name) {}
 
   std::string snap_name;
- 
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &iter);
+  void dump(Formatter *f) const;
+};
+
+struct SnapRemovePayload {
+  SnapRemovePayload() {}
+  SnapRemovePayload(const std::string &name) : snap_name(name) {}
+
+  std::string snap_name;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &iter);
+  void dump(Formatter *f) const;
+};
+
+struct RebuildObjectMapPayload {
+  RebuildObjectMapPayload() {}
+  RebuildObjectMapPayload(const AsyncRequestId &id) : async_request_id(id) {}
+
+  AsyncRequestId async_request_id;
+
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
   void dump(Formatter *f) const;
@@ -200,6 +224,8 @@ typedef boost::variant<AcquiredLockPayload,
                  FlattenPayload,
                  ResizePayload,
                  SnapCreatePayload,
+                 SnapRemovePayload,
+                 RebuildObjectMapPayload,
                  UnknownPayload> Payload;
 
 struct NotifyMessage {
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 7364e6c..3c6f740 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -10,11 +10,11 @@
 #include "common/errno.h"
 #include "common/ContextCompletion.h"
 #include "common/Throttle.h"
-#include "common/WorkQueue.h"
 #include "cls/lock/cls_lock_client.h"
 #include "include/stringify.h"
 
 #include "cls/rbd/cls_rbd.h"
+#include "cls/rbd/cls_rbd_client.h"
 
 #include "librbd/AioCompletion.h"
 #include "librbd/AioRequest.h"
@@ -22,16 +22,15 @@
 #include "librbd/AsyncResizeRequest.h"
 #include "librbd/AsyncTrimRequest.h"
 #include "librbd/CopyupRequest.h"
+#include "librbd/DiffIterate.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
-
 #include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/parent_types.h"
+#include "librbd/RebuildObjectMapRequest.h"
 #include "include/util.h"
 
-#include "librados/snap_set_diff.h"
-
 #include <boost/bind.hpp>
 #include <boost/scope_exit.hpp>
 #include "include/assert.h"
@@ -58,6 +57,53 @@ namespace librbd {
 
 namespace {
 
+int remove_object_map(ImageCtx *ictx) {
+  assert(ictx->snap_lock.is_locked());
+  CephContext *cct = ictx->cct;
+
+  int r;
+  for (std::map<snap_t, SnapInfo>::iterator it = ictx->snap_info.begin();
+       it != ictx->snap_info.end(); ++it) {
+    std::string oid(ObjectMap::object_map_name(ictx->id, it->first));
+    r = ictx->md_ctx.remove(oid);
+    if (r < 0 && r != -ENOENT) {
+      lderr(cct) << "failed to remove object map " << oid << ": "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  r = ictx->md_ctx.remove(ObjectMap::object_map_name(ictx->id, CEPH_NOSNAP));
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to remove object map: " << cpp_strerror(r) << dendl;
+  }
+  return 0;
+}
+
+int update_all_flags(ImageCtx *ictx, uint64_t flags, uint64_t mask) {
+  assert(ictx->snap_lock.is_locked());
+  CephContext *cct = ictx->cct;
+
+  std::vector<uint64_t> snap_ids;
+  snap_ids.push_back(CEPH_NOSNAP);
+  for (std::map<snap_t, SnapInfo>::iterator it = ictx->snap_info.begin();
+       it != ictx->snap_info.end(); ++it) {
+    snap_ids.push_back(it->first);
+  }
+
+  for (size_t i=0; i<snap_ids.size(); ++i) {
+    librados::ObjectWriteOperation op;
+    cls_client::set_flags(&op, snap_ids[i], flags, mask);
+    int r = ictx->md_ctx.operate(ictx->header_oid, &op);
+    if (r < 0) {
+      lderr(cct) << "failed to update image flags: " << cpp_strerror(r)
+	         << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
 int prepare_image_update(ImageCtx *ictx) {
   assert(ictx->owner_lock.is_locked() && !ictx->owner_lock.is_wlocked());
   if (ictx->image_watcher == NULL) {
@@ -163,12 +209,14 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     if (old_format)
       *old_format = true;
     int r = io_ctx.stat(old_header_name(name), size, NULL);
-    if (r < 0) {
+    if (r == -ENOENT) {
       if (old_format)
 	*old_format = false;
       r = io_ctx.stat(id_obj_name(name), size, NULL);
       if (r < 0)
 	return r;
+    } else if (r < 0) {
+      return r;
     }
 
     ldout(cct, 20) << "detect format of " << name << " : "
@@ -390,7 +438,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
 
     int r;
     CephContext *cct = ictx->cct;
-    SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, true);
+    SimpleThrottle throttle(ictx->concurrent_management_ops, true);
 
     for (uint64_t i = 0; i < numseg; i++) {
       string oid = ictx->get_object_name(i);
@@ -626,55 +674,109 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     if (r < 0)
       return r;
 
-    RWLock::RLocker owner_locker(ictx->owner_lock);
+    bool fast_diff_enabled = false;
+    {
+      RWLock::RLocker snap_locker(ictx->snap_lock);
+      if (ictx->get_snap_id(snap_name) == CEPH_NOSNAP) {
+        return -ENOENT;
+      }
+      fast_diff_enabled = ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0);
+    }
+
+    if (fast_diff_enabled) {
+      r = invoke_async_request(ictx, "snap_remove", true,
+                               boost::bind(&snap_remove_helper, ictx, _1,
+                                           snap_name),
+                               boost::bind(&ImageWatcher::notify_snap_remove,
+                                           ictx->image_watcher, snap_name));
+      if (r < 0 && r != -EEXIST) {
+        return r;
+      }
+    } else {
+      RWLock::RLocker owner_lock(ictx->owner_lock);
+      r = snap_remove_helper(ictx, NULL, snap_name);
+      if (r < 0) {
+        return r;
+      }
+    }
+
+    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
+
+    ictx->perfcounter->inc(l_librbd_snap_remove);
+    return 0;
+  }
+
+  int snap_remove_helper(ImageCtx *ictx, Context *ctx, const char *snap_name)
+  {
+    assert(ictx->owner_lock.is_locked());
+    {
+      if ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0) {
+        assert(!ictx->image_watcher->is_lock_supported() ||
+               ictx->image_watcher->is_lock_owner());
+      }
+    }
+
+    ldout(ictx->cct, 20) << "snap_remove_helper " << ictx << " " << snap_name
+                         << dendl;
+
+    int r = ictx_check(ictx, true);
+    if (r < 0) {
+      return r;
+    }
+
     RWLock::RLocker md_locker(ictx->md_lock);
     snap_t snap_id;
-
     {
-      // block for purposes of auto-destruction of l2 on early return
-      RWLock::RLocker l2(ictx->snap_lock);
+      RWLock::WLocker snap_locker(ictx->snap_lock);
       snap_id = ictx->get_snap_id(snap_name);
-      if (snap_id == CEPH_NOSNAP)
-	return -ENOENT;
+      if (snap_id == CEPH_NOSNAP) {
+        return -ENOENT;
+      }
 
-      parent_spec our_pspec;
-      RWLock::RLocker l3(ictx->parent_lock);
-      r = ictx->get_parent_spec(snap_id, &our_pspec);
+      r = ictx->object_map.snapshot_remove(snap_id);
       if (r < 0) {
-	lderr(ictx->cct) << "snap_remove: can't get parent spec" << dendl;
-	return r;
+        lderr(ictx->cct) << "snap_remove: failed to remove snapshot object map"
+		         << dendl;
+        return r;
       }
 
-      if (ictx->parent_md.spec != our_pspec &&
-	  (scan_for_parents(ictx, our_pspec, snap_id) == -ENOENT)) {
-	  r = cls_client::remove_child(&ictx->md_ctx, RBD_CHILDREN,
+      {
+        parent_spec our_pspec;
+        RWLock::RLocker parent_locker(ictx->parent_lock);
+        r = ictx->get_parent_spec(snap_id, &our_pspec);
+        if (r < 0) {
+	  lderr(ictx->cct) << "snap_remove: can't get parent spec" << dendl;
+	  return r;
+        }
+
+        if (ictx->parent_md.spec != our_pspec &&
+	    (scan_for_parents(ictx, our_pspec, snap_id) == -ENOENT)) {
+          r = cls_client::remove_child(&ictx->md_ctx, RBD_CHILDREN,
 				       our_pspec, ictx->id);
 	  if (r < 0 && r != -ENOENT) {
             lderr(ictx->cct) << "snap_remove: failed to deregister from parent "
-                                "image" << dendl;
+                             << "image" << dendl;
 	    return r;
           }
+        }
       }
-    }
 
-    r = ictx->md_ctx.remove(ObjectMap::object_map_name(ictx->id, snap_id));
-    if (r < 0 && r != -ENOENT) {
-      lderr(ictx->cct) << "snap_remove: failed to remove snapshot object map"
-		       << dendl;
-      return 0;
+      r = rm_snap(ictx, snap_name, snap_id);
+      if (r < 0) {
+        return r;
+      }
     }
 
-    r = rm_snap(ictx, snap_name);
-    if (r < 0)
-      return r;
-
     r = ictx->data_ctx.selfmanaged_snap_remove(snap_id);
-    if (r < 0)
+    if (r < 0) {
+      lderr(ictx->cct) << "snap_remove: failed to remove RADOS snapshot"
+                       << dendl;
       return r;
+    }
 
-    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
-
-    ictx->perfcounter->inc(l_librbd_snap_remove);
+    if (ctx != NULL) {
+      ctx->complete(0);
+    }
     return 0;
   }
 
@@ -692,9 +794,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
 
     RWLock::RLocker l(ictx->md_lock);
     RWLock::RLocker l2(ictx->snap_lock);
-    uint64_t features;
-    ictx->get_features(ictx->snap_id, &features);
-    if ((features & RBD_FEATURE_LAYERING) == 0) {
+    if ((ictx->features & RBD_FEATURE_LAYERING) == 0) {
       lderr(ictx->cct) << "snap_protect: image must support layering"
 		       << dendl;
       return -ENOSYS;
@@ -735,9 +835,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
 
     RWLock::RLocker l(ictx->md_lock);
     RWLock::RLocker l2(ictx->snap_lock);
-    uint64_t features;
-    ictx->get_features(ictx->snap_id, &features);
-    if ((features & RBD_FEATURE_LAYERING) == 0) {
+    if ((ictx->features & RBD_FEATURE_LAYERING) == 0) {
       lderr(ictx->cct) << "snap_unprotect: image must support layering"
 		       << dendl;
       return -ENOSYS;
@@ -964,7 +1062,11 @@ reprotect_and_return_err:
       }
     }
 
-    if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
+    if ((features & RBD_FEATURE_FAST_DIFF) != 0 &&
+        (features & RBD_FEATURE_OBJECT_MAP) == 0) {
+      lderr(cct) << "cannot use fast diff without object map" << dendl;
+      goto err_remove_header;
+    } else if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
       if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
         lderr(cct) << "cannot use object map without exclusive lock" << dendl;
         goto err_remove_header;
@@ -1062,8 +1164,8 @@ reprotect_and_return_err:
     if (!*order)
       *order = RBD_DEFAULT_OBJ_ORDER;
 
-    if (*order && (*order > 64 || *order < 12)) {
-      lderr(cct) << "order must be in the range [12, 64]" << dendl;
+    if (*order > 25 || *order < 12) {
+      lderr(cct) << "order must be in the range [12, 25]" << dendl;
       return -EDOM;
     }
 
@@ -1142,9 +1244,10 @@ reprotect_and_return_err:
     int order;
     uint64_t size;
     uint64_t p_features;
-    int remove_r;
+    int partial_r;
     librbd::NoOpProgressContext no_op;
     ImageCtx *c_imctx = NULL;
+    map<string, bufferlist> pairs;
     // make sure parent snapshot exists
     ImageCtx *p_imctx = new ImageCtx(p_name, "", p_snap_name, p_ioctx, true);
     r = open_image(p_imctx);
@@ -1164,7 +1267,7 @@ reprotect_and_return_err:
     }
 
     p_imctx->snap_lock.get_read();
-    p_imctx->get_features(p_imctx->snap_id, &p_features);
+    p_features = p_imctx->features;
     size = p_imctx->get_image_size(p_imctx->snap_id);
     p_imctx->is_snap_protected(p_imctx->snap_id, &snap_protected);
     p_imctx->snap_lock.put_read();
@@ -1211,6 +1314,18 @@ reprotect_and_return_err:
       goto err_close_child;
     }
 
+    r = cls_client::metadata_list(&p_ioctx, p_imctx->header_oid, "", 0, &pairs);
+    if (r < 0 && r != -EOPNOTSUPP && r != -EIO) {
+      lderr(cct) << "couldn't list metadata: " << r << dendl;
+      goto err_close_child;
+    } else if (r == 0 && !pairs.empty()) {
+      r = cls_client::metadata_set(&c_ioctx, c_imctx->header_oid, pairs);
+      if (r < 0) {
+        lderr(cct) << "couldn't set metadata: " << r << dendl;
+        goto err_close_child;
+      }
+    }
+
     {
       RWLock::RLocker owner_locker(p_imctx->owner_lock);
       r = ictx_refresh(p_imctx);
@@ -1227,24 +1342,27 @@ reprotect_and_return_err:
     }
 
     ldout(cct, 2) << "done." << dendl;
-    close_image(c_imctx);
-    close_image(p_imctx);
-    return 0;
+    r = close_image(c_imctx);
+    partial_r = close_image(p_imctx);
+    if (r == 0 && partial_r < 0) {
+      r = partial_r;
+    }
+    return r;
 
   err_remove_child:
-    remove_r = cls_client::remove_child(&c_ioctx, RBD_CHILDREN, pspec,
-					c_imctx->id);
-    if (remove_r < 0) {
+    partial_r = cls_client::remove_child(&c_ioctx, RBD_CHILDREN, pspec,
+                                         c_imctx->id);
+    if (partial_r < 0) {
      lderr(cct) << "Error removing failed clone from list of children: "
-		 << cpp_strerror(remove_r) << dendl;
+                << cpp_strerror(partial_r) << dendl;
     }
   err_close_child:
     close_image(c_imctx);
   err_remove:
-    remove_r = remove(c_ioctx, c_name, no_op);
-    if (remove_r < 0) {
+    partial_r = remove(c_ioctx, c_name, no_op);
+    if (partial_r < 0) {
       lderr(cct) << "Error removing failed clone: "
-		 << cpp_strerror(remove_r) << dendl;
+		 << cpp_strerror(partial_r) << dendl;
     }
   err_close_parent:
     close_image(p_imctx);
@@ -1265,6 +1383,17 @@ reprotect_and_return_err:
       return r;
     }
 
+    r = detect_format(io_ctx, dstname, NULL, NULL);
+    if (r < 0 && r != -ENOENT) {
+      lderr(cct) << "error checking for existing image called "
+		 << dstname << ":" << cpp_strerror(r) << dendl;
+      return r;
+    }
+    if (r == 0) {
+      lderr(cct) << "rbd image " << dstname << " already exists" << dendl;
+      return -EEXIST;
+    }
+
     string src_oid =
       old_format ? old_header_name(srcname) : id_obj_name(srcname);
     string dst_oid =
@@ -1303,17 +1432,6 @@ reprotect_and_return_err:
 	last_read = outbl.rbegin()->first;
     } while (r == MAX_READ);
 
-    r = detect_format(io_ctx, dstname, NULL, NULL);
-    if (r < 0 && r != -ENOENT) {
-      lderr(cct) << "error checking for existing image called "
-		 << dstname << ":" << cpp_strerror(r) << dendl;
-      return r;
-    }
-    if (r == 0) {
-      lderr(cct) << "rbd image " << dstname << " already exists" << dendl;
-      return -EEXIST;
-    }
-
     librados::ObjectWriteOperation op;
     op.create(true);
     op.write_full(databl);
@@ -1399,7 +1517,115 @@ reprotect_and_return_err:
     if (r < 0)
       return r;
     RWLock::RLocker l(ictx->snap_lock);
-    return ictx->get_features(ictx->snap_id, features);
+    *features = ictx->features;
+    return 0;
+  }
+
+  int update_features(ImageCtx *ictx, uint64_t features, bool enabled)
+  {
+    int r = ictx_check(ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    CephContext *cct = ictx->cct;
+    if (ictx->read_only) {
+      return -EROFS;
+    } else if (ictx->old_format) {
+      lderr(cct) << "old-format images do not support features" << dendl;
+      return -EINVAL;
+    }
+
+    if ((features & RBD_FEATURES_MUTABLE) != features) {
+      lderr(cct) << "cannot update immutable features" << dendl;
+      return -EINVAL;
+    } else if (features == 0) {
+      lderr(cct) << "update requires at least one feature" << dendl;
+      return -EINVAL;
+    }
+
+    RWLock::RLocker l(ictx->snap_lock);
+    uint64_t new_features = ictx->features | features;
+    if (!enabled) {
+      new_features = ictx->features & ~features;
+    }
+
+    if (ictx->features == new_features) {
+      return 0;
+    }
+
+    uint64_t features_mask = features;
+    uint64_t disable_flags = 0;
+    if (enabled) {
+      uint64_t enable_flags = 0;
+
+      if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
+        if ((new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+          lderr(cct) << "cannot enable object map" << dendl;
+          return -EINVAL;
+        }
+        enable_flags |= RBD_FLAG_OBJECT_MAP_INVALID;
+        features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
+      }
+      if ((features & RBD_FEATURE_FAST_DIFF) != 0) {
+        if ((new_features & RBD_FEATURE_OBJECT_MAP) == 0) {
+          lderr(cct) << "cannot enable fast diff" << dendl;
+          return -EINVAL;
+        }
+        enable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
+        features_mask |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_EXCLUSIVE_LOCK);
+      }
+
+      if (enable_flags != 0) {
+        r = update_all_flags(ictx, enable_flags, enable_flags);
+        if (r < 0) {
+          return r;
+        }
+      }
+    } else {
+      if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) {
+        if ((new_features & RBD_FEATURE_OBJECT_MAP) != 0) {
+          lderr(cct) << "cannot disable exclusive lock" << dendl;
+          return -EINVAL;
+        }
+        features_mask |= RBD_FEATURE_OBJECT_MAP;
+      }
+      if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
+        if ((new_features & RBD_FEATURE_FAST_DIFF) != 0) {
+          lderr(cct) << "cannot disable object map" << dendl;
+          return -EINVAL;
+        }
+
+        disable_flags = RBD_FLAG_OBJECT_MAP_INVALID;
+        r = remove_object_map(ictx);
+        if (r < 0) {
+          lderr(cct) << "failed to remove object map" << dendl;
+          return r;
+        }
+      }
+      if ((features & RBD_FEATURE_FAST_DIFF) != 0) {
+        disable_flags = RBD_FLAG_FAST_DIFF_INVALID;
+      }
+    }
+
+    ldout(cct, 10) << "update_features: features=" << new_features << ", mask="
+                   << features_mask << dendl;
+    r = librbd::cls_client::set_features(&ictx->md_ctx, ictx->header_oid,
+                                         new_features, features_mask);
+    if (r < 0) {
+      lderr(cct) << "failed to update features: " << cpp_strerror(r)
+                 << dendl;
+    }
+
+    if (disable_flags != 0) {
+      r = update_all_flags(ictx, 0, disable_flags);
+      if (r < 0) {
+        return r;
+      }
+    }
+
+    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
+    return 0;
   }
 
   int get_overlap(ImageCtx *ictx, uint64_t *overlap)
@@ -1448,9 +1674,9 @@ reprotect_and_return_err:
     ictx->parent = new ImageCtx("", parent_image_id, NULL, p_ioctx, true);
 
     // set rados flags for reading the parent image
-    if (ictx->cct->_conf->rbd_balance_parent_reads)
+    if (ictx->balance_parent_reads)
       ictx->parent->set_read_flag(librados::OPERATION_BALANCE_READS);
-    else if (ictx->cct->_conf->rbd_localize_parent_reads)
+    else if (ictx->localize_parent_reads)
       ictx->parent->set_read_flag(librados::OPERATION_LOCALIZE_READS);
 
     r = open_image(ictx->parent);
@@ -1835,7 +2061,7 @@ reprotect_and_return_err:
 
     RWLock::WLocker l(ictx->snap_lock);
     if (!ictx->old_format) {
-      ictx->object_map.snapshot(snap_id);
+      ictx->object_map.snapshot_add(snap_id);
       if (lock_owner) {
 	// immediately start using the new snap context if we
 	// own the exclusive lock
@@ -1853,17 +2079,19 @@ reprotect_and_return_err:
     return 0;
   }
 
-  int rm_snap(ImageCtx *ictx, const char *snap_name)
+  int rm_snap(ImageCtx *ictx, const char *snap_name, uint64_t snap_id)
   {
+    assert(ictx->snap_lock.is_wlocked());
+
     int r;
     if (ictx->old_format) {
       r = cls_client::old_snapshot_remove(&ictx->md_ctx,
 					  ictx->header_oid, snap_name);
     } else {
-      RWLock::RLocker l(ictx->snap_lock);
-      r = cls_client::snapshot_remove(&ictx->md_ctx,
-				      ictx->header_oid,
-				      ictx->get_snap_id(snap_name));
+      r = cls_client::snapshot_remove(&ictx->md_ctx, ictx->header_oid, snap_id);
+      if (r == 0) {
+        ictx->rm_snap(snap_name, snap_id);
+      }
     }
 
     if (r < 0) {
@@ -1912,11 +2140,12 @@ reprotect_and_return_err:
     if (ictx->parent) {
       uint64_t overlap;
       r = ictx->get_parent_overlap(ictx->snap_id, &overlap);
-      if (r < 0)
+      if (r < 0 && r != -ENOENT) {
 	return r;
-      if (!overlap ||
+      }
+      if (r == -ENOENT || overlap == 0 ||
 	  ictx->parent->md_ctx.get_id() !=
-	  ictx->get_parent_pool_id(ictx->snap_id) ||
+            ictx->get_parent_pool_id(ictx->snap_id) ||
 	  ictx->parent->id != ictx->get_parent_image_id(ictx->snap_id) ||
 	  ictx->parent->snap_id != ictx->get_parent_snap_id(ictx->snap_id)) {
 	ictx->clear_nonexistence_cache();
@@ -1955,7 +2184,6 @@ reprotect_and_return_err:
     bool new_snap = false;
     vector<string> snap_names;
     vector<uint64_t> snap_sizes;
-    vector<uint64_t> snap_features;
     vector<parent_info> snap_parents;
     vector<uint8_t> snap_protection;
     vector<uint64_t> snap_flags;
@@ -2036,6 +2264,9 @@ reprotect_and_return_err:
 				   << "disabling object map optimizations"
 				   << dendl;
 	      ictx->flags = RBD_FLAG_OBJECT_MAP_INVALID;
+              if ((ictx->features & RBD_FEATURE_FAST_DIFF) != 0) {
+                ictx->flags |= RBD_FLAG_FAST_DIFF_INVALID;
+              }
 
 	      vector<uint64_t> default_flags(new_snapc.snaps.size(), ictx->flags);
 	      snap_flags.swap(default_flags);
@@ -2049,8 +2280,8 @@ reprotect_and_return_err:
 
 	    r = cls_client::snapshot_list(&(ictx->md_ctx), ictx->header_oid,
 					  new_snapc.snaps, &snap_names,
-					  &snap_sizes, &snap_features,
-					  &snap_parents, &snap_protection);
+                                          &snap_sizes, &snap_parents,
+                                          &snap_protection);
 	    // -ENOENT here means we raced with snapshot deletion
 	    if (r < 0 && r != -ENOENT) {
 	      lderr(ictx->cct) << "snapc = " << new_snapc << dendl;
@@ -2062,7 +2293,6 @@ reprotect_and_return_err:
 	}
 
 	for (size_t i = 0; i < new_snapc.snaps.size(); ++i) {
-	  uint64_t features = ictx->old_format ? 0 : snap_features[i];
 	  parent_info parent;
 	  if (!ictx->old_format)
 	    parent = snap_parents[i];
@@ -2073,7 +2303,6 @@ reprotect_and_return_err:
 	    ldout(cct, 20) << "new snapshot id=" << new_snapc.snaps[i].val
 			   << " name=" << snap_names[i]
 			   << " size=" << snap_sizes[i]
-			   << " features=" << features
 			   << dendl;
 	  }
 	}
@@ -2082,7 +2311,6 @@ reprotect_and_return_err:
 	ictx->snap_info.clear();
 	ictx->snap_ids.clear();
 	for (size_t i = 0; i < new_snapc.snaps.size(); ++i) {
-	  uint64_t features = ictx->old_format ? 0 : snap_features[i];
 	  uint64_t flags = ictx->old_format ? 0 : snap_flags[i];
 	  uint8_t protection_status = ictx->old_format ?
 	    (uint8_t)RBD_PROTECTION_STATUS_UNPROTECTED : snap_protection[i];
@@ -2090,7 +2318,7 @@ reprotect_and_return_err:
 	  if (!ictx->old_format)
 	    parent = snap_parents[i];
 	  ictx->add_snap(snap_names[i], new_snapc.snaps[i].val, snap_sizes[i],
-			 features, parent, protection_status, flags);
+			 parent, protection_status, flags);
 	}
 
 	r = refresh_parent(ictx);
@@ -2240,8 +2468,7 @@ reprotect_and_return_err:
     int order = src->order;
 
     src->snap_lock.get_read();
-    uint64_t src_features;
-    src->get_features(src->snap_id, &src_features);
+    uint64_t src_features = src->features;
     uint64_t src_size = src->get_image_size(src->snap_id);
     src->snap_lock.put_read();
 
@@ -2261,7 +2488,10 @@ reprotect_and_return_err:
     }
 
     r = copy(src, dest, prog_ctx);
-    close_image(dest);
+    int close_r = close_image(dest);
+    if (r == 0 && close_r < 0) {
+      r = close_r;
+    }
     return r;
   }
 
@@ -2303,7 +2533,7 @@ reprotect_and_return_err:
 
       Context *ctx = new C_CopyWrite(m_throttle, m_bl);
       AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      aio_write(m_dest, m_offset, m_bl->length(), m_bl->c_str(), comp, 0);
+      aio_write(m_dest, m_offset, m_bl->length(), m_bl->c_str(), comp, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
     }
   private:
     SimpleThrottle *m_throttle;
@@ -2329,8 +2559,23 @@ reprotect_and_return_err:
       return -EINVAL;
     }
     int r;
-    SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false);
+    map<string, bufferlist> pairs;
+
+    r = cls_client::metadata_list(&src->md_ctx, src->header_oid, "", 0, &pairs);
+    if (r < 0 && r != -EOPNOTSUPP && r != -EIO) {
+      lderr(cct) << "couldn't list metadata: " << r << dendl;
+      return r;
+    } else if (r == 0 && !pairs.empty()) {
+      r = cls_client::metadata_set(&dest->md_ctx, dest->header_oid, pairs);
+      if (r < 0) {
+        lderr(cct) << "couldn't set metadata: " << r << dendl;
+        return r;
+      }
+    }
+
+    SimpleThrottle throttle(src->concurrent_management_ops, false);
     uint64_t period = src->get_stripe_period();
+    unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
     for (uint64_t offset = 0; offset < src_size; offset += period) {
       if (throttle.pending_error()) {
         return throttle.wait_for_ret();
@@ -2340,7 +2585,7 @@ reprotect_and_return_err:
       bufferlist *bl = new bufferlist();
       Context *ctx = new C_CopyRead(&throttle, dest, offset, bl);
       AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      aio_read(src, offset, len, NULL, bl, comp, 0);
+      aio_read(src, offset, len, NULL, bl, comp, fadvise_flags);
       prog_ctx.update_progress(offset, src_size);
     }
 
@@ -2461,7 +2706,7 @@ reprotect_and_return_err:
     return r;
   }
 
-  void close_image(ImageCtx *ictx)
+  int close_image(ImageCtx *ictx)
   {
     ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
 
@@ -2478,10 +2723,15 @@ reprotect_and_return_err:
     ictx->flush_async_operations();
     ictx->readahead.wait_for_pending();
 
+    int r;
     if (ictx->object_cacher) {
-      ictx->shutdown_cache(); // implicitly flushes
+      r = ictx->shutdown_cache(); // implicitly flushes
     } else {
-      flush(ictx);
+      r = flush(ictx);
+    }
+    if (r < 0) {
+      lderr(ictx->cct) << "error flushing IO: " << cpp_strerror(r)
+                       << dendl;
     }
 
     ictx->op_work_queue->drain();
@@ -2492,7 +2742,10 @@ reprotect_and_return_err:
     }
 
     if (ictx->parent) {
-      close_image(ictx->parent);
+      int close_r = close_image(ictx->parent);
+      if (r == 0 && close_r < 0) {
+        r = close_r;
+      }
       ictx->parent = NULL;
     }
 
@@ -2500,10 +2753,13 @@ reprotect_and_return_err:
       {
 	RWLock::WLocker l(ictx->owner_lock);
 	if (ictx->image_watcher->is_lock_owner()) {
-	  int r = ictx->image_watcher->unlock();
-	  if (r < 0) {
-	    lderr(ictx->cct) << "error unlocking image: " << cpp_strerror(r)
-			     << dendl;
+	  int unlock_r = ictx->image_watcher->unlock();
+	  if (unlock_r < 0) {
+	    lderr(ictx->cct) << "error unlocking image: "
+                             << cpp_strerror(unlock_r) << dendl;
+            if (r == 0) {
+              r = unlock_r;
+            }
 	  }
 	}
       }
@@ -2511,6 +2767,7 @@ reprotect_and_return_err:
     }
 
     delete ictx;
+    return r;
   }
 
   // 'flatten' child image by copying all parent's blocks
@@ -2570,15 +2827,15 @@ reprotect_and_return_err:
     }
 
     uint64_t object_size;
-    uint64_t overlap;
     uint64_t overlap_objects;
     ::SnapContext snapc;
 
     {
+      uint64_t overlap;
       RWLock::RLocker l(ictx->snap_lock);
       RWLock::RLocker l2(ictx->parent_lock);
 
-      if (ictx->read_only || ictx->snap_id != CEPH_NOSNAP) {
+      if (ictx->read_only) {
         return -EROFS;
       }
 
@@ -2609,6 +2866,57 @@ reprotect_and_return_err:
     return 0;
   }
 
+  int rebuild_object_map(ImageCtx *ictx, ProgressContext &prog_ctx) {
+    CephContext *cct = ictx->cct;
+    ldout(cct, 10) << "rebuild_object_map" << dendl;
+
+    int r = ictx_check(ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    uint64_t request_id = ictx->async_request_seq.inc();
+    r = invoke_async_request(ictx, "rebuild object map", true,
+                             boost::bind(&async_rebuild_object_map, ictx, _1,
+                                         boost::ref(prog_ctx)),
+                             boost::bind(&ImageWatcher::notify_rebuild_object_map,
+                                         ictx->image_watcher, request_id,
+                                         boost::ref(prog_ctx)));
+
+    ldout(cct, 10) << "rebuild object map finished" << dendl;
+    if (r < 0) {
+      notify_change(ictx->md_ctx, ictx->header_oid, ictx);
+    }
+    return r;
+  }
+
+  int async_rebuild_object_map(ImageCtx *ictx, Context *ctx,
+                               ProgressContext &prog_ctx) {
+    assert(ictx->owner_lock.is_locked());
+    assert(!ictx->image_watcher->is_lock_supported() ||
+	   ictx->image_watcher->is_lock_owner());
+
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "async_rebuild_object_map " << ictx << dendl;
+
+    if (ictx->read_only) {
+      return -EROFS;
+    }
+    if (!ictx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+      return -EINVAL;
+    }
+
+    int r = ictx_check(ictx, true);
+    if (r < 0) {
+      return r;
+    }
+
+    RebuildObjectMapRequest *req = new RebuildObjectMapRequest(*ictx, ctx,
+                                                               prog_ctx);
+    req->send();
+    return 0;
+  }
+
   int list_lockers(ImageCtx *ictx,
 		   std::list<locker_t> *lockers,
 		   bool *exclusive,
@@ -2703,8 +3011,7 @@ reprotect_and_return_err:
       return -EINVAL;
     }
 
-    md_config_t *conf = ictx->cct->_conf;
-    if (conf->rbd_blacklist_on_break_lock) {
+    if (ictx->blacklist_on_break_lock) {
       typedef std::map<rados::cls::lock::locker_id_t,
 		       rados::cls::lock::locker_info_t> Lockers;
       Lockers lockers;
@@ -2734,7 +3041,7 @@ reprotect_and_return_err:
       RWLock::RLocker locker(ictx->md_lock);
       librados::Rados rados(ictx->md_ctx);
       r = rados.blacklist_add(client_address,
-			      conf->rbd_blacklist_expire_seconds);
+			      ictx->blacklist_expire_seconds);
       if (r < 0) {
         lderr(ictx->cct) << "unable to blacklist client: " << cpp_strerror(r)
           	       << dendl;
@@ -2822,25 +3129,10 @@ reprotect_and_return_err:
     return total_read;
   }
 
-  int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg)
-  {
-    // This reads the existing extents in a parent from the beginning
-    // of time.  Since images are thin-provisioned, the extents will
-    // always represent data, not holes.
-    assert(exists);
-    interval_set<uint64_t> *diff = static_cast<interval_set<uint64_t> *>(arg);
-    diff->insert(off, len);
-    return 0;
-  }
-
-
-  int diff_iterate(ImageCtx *ictx, const char *fromsnapname,
-		   uint64_t off, uint64_t len,
-		   int (*cb)(uint64_t, size_t, int, void *),
-		   void *arg)
+  int diff_iterate(ImageCtx *ictx, const char *fromsnapname, uint64_t off,
+                   uint64_t len, bool include_parent, bool whole_object,
+		   int (*cb)(uint64_t, size_t, int, void *), void *arg)
   {
-    utime_t start_time, elapsed;
-
     ldout(ictx->cct, 20) << "diff_iterate " << ictx << " off = " << off
 			 << " len = " << len << dendl;
 
@@ -2851,157 +3143,21 @@ reprotect_and_return_err:
     }
 
     int r = ictx_check(ictx);
-    if (r < 0)
+    if (r < 0) {
       return r;
+    }
 
     ictx->snap_lock.get_read();
     r = clip_io(ictx, off, &len);
     ictx->snap_lock.put_read();
-    if (r < 0)
+    if (r < 0) {
       return r;
-
-    librados::IoCtx head_ctx;
-
-    ictx->md_lock.get_read();
-    ictx->snap_lock.get_read();
-    head_ctx.dup(ictx->data_ctx);
-    snap_t from_snap_id = 0;
-    uint64_t from_size = 0;
-    if (fromsnapname) {
-      from_snap_id = ictx->get_snap_id(fromsnapname);
-      from_size = ictx->get_image_size(from_snap_id);
-    }
-    snap_t end_snap_id = ictx->snap_id;
-    uint64_t end_size = ictx->get_image_size(end_snap_id);
-    ictx->snap_lock.put_read();
-    ictx->md_lock.put_read();
-    if (from_snap_id == CEPH_NOSNAP) {
-      return -ENOENT;
-    }
-    if (from_snap_id == end_snap_id) {
-      // no diff.
-      return 0;
-    }
-    if (from_snap_id >= end_snap_id) {
-      return -EINVAL;
-    }
-
-    // we must list snaps via the head, not end snap
-    head_ctx.snap_set_read(CEPH_SNAPDIR);
-
-    ldout(ictx->cct, 20) << "diff_iterate from " << from_snap_id << " to " << end_snap_id
-			 << " size from " << from_size << " to " << end_size << dendl;
-
-    // FIXME: if end_size > from_size, we could read_iterate for the
-    // final part, and skip the listsnaps op.
-
-    // check parent overlap only if we are comparing to the beginning of time
-    interval_set<uint64_t> parent_diff;
-    if (from_snap_id == 0) {
-      RWLock::RLocker l(ictx->snap_lock);
-      RWLock::RLocker l2(ictx->parent_lock);
-      uint64_t overlap = end_size;
-      ictx->get_parent_overlap(from_snap_id, &overlap);
-      r = 0;
-      if (ictx->parent && overlap > 0) {
-	ldout(ictx->cct, 10) << " first getting parent diff" << dendl;
-	r = diff_iterate(ictx->parent, NULL, 0, overlap, simple_diff_cb, &parent_diff);
-      }
-      if (r < 0)
-	return r;
-    }
-
-    uint64_t period = ictx->get_stripe_period();
-    uint64_t left = len;
-
-    while (left > 0) {
-      uint64_t period_off = off - (off % period);
-      uint64_t read_len = min(period_off + period - off, left);
-
-      // map to extents
-      map<object_t,vector<ObjectExtent> > object_extents;
-      Striper::file_to_extents(ictx->cct, ictx->format_string, &ictx->layout,
-			       off, read_len, 0, object_extents, 0);
-
-      // get snap info for each object
-      for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin();
-	   p != object_extents.end();
-	   ++p) {
-	ldout(ictx->cct, 20) << "diff_iterate object " << p->first << dendl;
-
-	librados::snap_set_t snap_set;
-	int r = head_ctx.list_snaps(p->first.name, &snap_set);
-	if (r == -ENOENT) {
-	  if (from_snap_id == 0 && !parent_diff.empty()) {
-	    // report parent diff instead
-	    for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
-	      for (vector<pair<uint64_t,uint64_t> >::iterator r = q->buffer_extents.begin();
-		   r != q->buffer_extents.end();
-		   ++r) {
-		interval_set<uint64_t> o;
-		o.insert(off + r->first, r->second);
-		o.intersection_of(parent_diff);
-		ldout(ictx->cct, 20) << " reporting parent overlap " << o << dendl;
-		for (interval_set<uint64_t>::iterator s = o.begin(); s != o.end(); ++s) {
-		  cb(s.get_start(), s.get_len(), true, arg);
-		}
-	      }
-	    }
-	  }
-	  continue;
-	}
-	if (r < 0)
-	  return r;
-
-	// calc diff from from_snap_id -> to_snap_id
-	interval_set<uint64_t> diff;
-	bool end_exists;
-	calc_snap_set_diff(ictx->cct, snap_set,
-			   from_snap_id,
-			   end_snap_id,
-			   &diff, &end_exists);
-	ldout(ictx->cct, 20) << "  diff " << diff << " end_exists=" << end_exists << dendl;
-	if (diff.empty())
-	  continue;
-
-	for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
-	  ldout(ictx->cct, 20) << "diff_iterate object " << p->first
-			       << " extent " << q->offset << "~" << q->length
-			       << " from " << q->buffer_extents
-			       << dendl;
-	  uint64_t opos = q->offset;
-	  for (vector<pair<uint64_t,uint64_t> >::iterator r = q->buffer_extents.begin();
-	       r != q->buffer_extents.end();
-	       ++r) {
-	    interval_set<uint64_t> overlap;  // object extents
-	    overlap.insert(opos, r->second);
-	    overlap.intersection_of(diff);
-	    ldout(ictx->cct, 20) << " opos " << opos
-				 << " buf " << r->first << "~" << r->second
-				 << " overlap " << overlap
-				 << dendl;
-	    for (interval_set<uint64_t>::iterator s = overlap.begin();
-		 s != overlap.end();
-		 ++s) {
-	      uint64_t su_off = s.get_start() - opos;
-	      uint64_t logical_off = off + r->first + su_off;
-	      ldout(ictx->cct, 20) << "   overlap extent " << s.get_start() << "~" << s.get_len()
-				   << " logical "
-				   << logical_off << "~" << s.get_len()
-				   << dendl;
-	      cb(logical_off, s.get_len(), end_exists, arg);
-	    }
-	    opos += r->second;
-	  }
-	  assert(opos == q->offset + q->length);
-	}
-      }
-
-      left -= read_len;
-      off += read_len;
     }
 
-    return 0;
+    DiffIterate command(*ictx, fromsnapname, off, len, include_parent,
+                        whole_object, cb, arg);
+    r = command.execute();
+    return r;
   }
 
   int simple_read_cb(uint64_t ofs, size_t len, const char *buf, void *arg)
@@ -3017,15 +3173,23 @@ reprotect_and_return_err:
 
   ssize_t read(ImageCtx *ictx, uint64_t ofs, size_t len, char *buf, int op_flags)
   {
+    ssize_t ret;
+    ldout(ictx->cct, 20) << "read " << ictx << " off = " << ofs << " len = "
+			 << len << dendl;
+
     vector<pair<uint64_t,uint64_t> > extents;
     extents.push_back(make_pair(ofs, len));
-    return read(ictx, extents, buf, NULL, op_flags);
+    ret = read(ictx, extents, buf, NULL, op_flags);
+    if (ret < 0)
+      return ret;
+
+    return ret;
   }
 
   ssize_t read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
 		char *buf, bufferlist *pbl, int op_flags)
   {
-    Mutex mylock("IoCtxImpl::write::mylock");
+    Mutex mylock("librbd::read::mylock");
     Cond cond;
     bool done;
     int ret;
@@ -3044,11 +3208,9 @@ reprotect_and_return_err:
 
   ssize_t write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf, int op_flags)
   {
-    utime_t start_time, elapsed;
     ldout(ictx->cct, 20) << "write " << ictx << " off = " << off << " len = "
 			 << len << dendl;
 
-    start_time = ceph_clock_now(ictx->cct);
     Mutex mylock("librbd::write::mylock");
     Cond cond;
     bool done;
@@ -3075,28 +3237,30 @@ reprotect_and_return_err:
       return ret;
     }
 
-    elapsed = ceph_clock_now(ictx->cct) - start_time;
-    ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed);
-    ictx->perfcounter->inc(l_librbd_wr);
-    ictx->perfcounter->inc(l_librbd_wr_bytes, mylen);
     return mylen;
   }
 
   int discard(ImageCtx *ictx, uint64_t off, uint64_t len)
   {
-    utime_t start_time, elapsed;
     ldout(ictx->cct, 20) << "discard " << ictx << " off = " << off << " len = "
 			 << len << dendl;
 
-    start_time = ceph_clock_now(ictx->cct);
     Mutex mylock("librbd::discard::mylock");
     Cond cond;
     bool done;
     int ret;
 
+    uint64_t mylen = len;
+    ictx->snap_lock.get_read();
+    int r = clip_io(ictx, off, &mylen);
+    ictx->snap_lock.put_read();
+    if (r < 0) {
+      return r;
+    }
+
     Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
     AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-    aio_discard(ictx, off, len, c);
+    aio_discard(ictx, off, mylen, c);
 
     mylock.Lock();
     while (!done)
@@ -3107,11 +3271,7 @@ reprotect_and_return_err:
       return ret;
     }
 
-    elapsed = ceph_clock_now(ictx->cct) - start_time;
-    ictx->perfcounter->inc(l_librbd_discard_latency, elapsed);
-    ictx->perfcounter->inc(l_librbd_discard);
-    ictx->perfcounter->inc(l_librbd_discard_bytes, len);
-    return len;
+    return mylen;
   }
 
   ssize_t handle_sparse_read(CephContext *cct,
@@ -3231,7 +3391,7 @@ reprotect_and_return_err:
     c->add_request();
     ictx->flush_async_operations(flush_ctx);
 
-    c->init_time(ictx, AIO_TYPE_FLUSH);
+    c->start_op(ictx, AIO_TYPE_FLUSH);
     C_AioWrite *req_comp = new C_AioWrite(cct, c);
     c->add_request();
     if (ictx->object_cacher) {
@@ -3338,8 +3498,7 @@ reprotect_and_return_err:
       }
 
       snapc = ictx->snapc;
-
-      c->init_time(ictx, AIO_TYPE_WRITE);
+      c->start_op(ictx, AIO_TYPE_WRITE);
     }
 
     if (ictx->image_watcher->is_lock_supported() &&
@@ -3385,8 +3544,62 @@ reprotect_and_return_err:
     c->finish_adding_requests(ictx->cct);
     c->put();
 
-    ictx->perfcounter->inc(l_librbd_aio_wr);
-    ictx->perfcounter->inc(l_librbd_aio_wr_bytes, clip_len);
+    ictx->perfcounter->inc(l_librbd_wr);
+    ictx->perfcounter->inc(l_librbd_wr_bytes, clip_len);
+  }
+
+  int metadata_get(ImageCtx *ictx, const string &key, string *value)
+  {
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl;
+
+    int r = ictx_check(ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value);
+  }
+
+  int metadata_set(ImageCtx *ictx, const string &key, const string &value)
+  {
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "metadata_set " << ictx << " key=" << key << " value=" << value << dendl;
+
+    int r = ictx_check(ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    map<string, bufferlist> data;
+    data[key].append(value);
+    return cls_client::metadata_set(&ictx->md_ctx, ictx->header_oid, data);
+  }
+
+  int metadata_remove(ImageCtx *ictx, const string &key)
+  {
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "metadata_remove " << ictx << " key=" << key << dendl;
+
+    int r = ictx_check(ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    return cls_client::metadata_remove(&ictx->md_ctx, ictx->header_oid, key);
+  }
+
+  int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map<string, bufferlist> *pairs)
+  {
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "metadata_list " << ictx << dendl;
+
+    int r = ictx_check(ictx);
+    if (r < 0) {
+      return r;
+    }
+
+    return cls_client::metadata_list(&ictx->md_ctx, ictx->header_oid, start, max, pairs);
   }
 
   void aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c)
@@ -3424,8 +3637,7 @@ reprotect_and_return_err:
 
       // TODO: check for snap
       snapc = ictx->snapc;
-
-      c->init_time(ictx, AIO_TYPE_DISCARD);
+      c->start_op(ictx, AIO_TYPE_DISCARD);
     }
 
     if (ictx->image_watcher->is_lock_supported() &&
@@ -3450,12 +3662,16 @@ reprotect_and_return_err:
       AbstractWrite *req;
       c->add_request();
 
-      if (p->offset == 0 && p->length == ictx->layout.fl_object_size) {
+      if (p->length == ictx->layout.fl_object_size) {
 	req = new AioRemove(ictx, p->oid.name, p->objectno, snapc, req_comp);
       } else if (p->offset + p->length == ictx->layout.fl_object_size) {
 	req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, snapc,
                               req_comp);
       } else {
+	if(ictx->cct->_conf->rbd_skip_partial_discard) {
+	  delete req_comp;
+	  continue;
+	}
 	req = new AioZero(ictx, p->oid.name, p->objectno, p->offset, p->length,
 			  snapc, req_comp);
       }
@@ -3471,8 +3687,8 @@ reprotect_and_return_err:
     c->finish_adding_requests(ictx->cct);
     c->put();
 
-    ictx->perfcounter->inc(l_librbd_aio_discard);
-    ictx->perfcounter->inc(l_librbd_aio_discard_bytes, clip_len);
+    ictx->perfcounter->inc(l_librbd_discard);
+    ictx->perfcounter->inc(l_librbd_discard_bytes, clip_len);
   }
 
   void rbd_req_cb(completion_t cb, void *arg)
@@ -3505,8 +3721,7 @@ reprotect_and_return_err:
   };
 
   static void readahead(ImageCtx *ictx,
-			const vector<pair<uint64_t,uint64_t> >& image_extents,
-			const md_config_t *conf)
+			const vector<pair<uint64_t,uint64_t> >& image_extents)
   {
     uint64_t total_bytes = 0;
     for (vector<pair<uint64_t,uint64_t> >::const_iterator p = image_extents.begin();
@@ -3515,8 +3730,8 @@ reprotect_and_return_err:
       total_bytes += p->second;
     }
     ictx->md_lock.get_write();
-    bool abort = conf->rbd_readahead_disable_after_bytes != 0 &&
-      ictx->total_bytes_read > (uint64_t)conf->rbd_readahead_disable_after_bytes;
+    bool abort = ictx->readahead_disable_after_bytes != 0 &&
+      ictx->total_bytes_read > ictx->readahead_disable_after_bytes;
     ictx->total_bytes_read += total_bytes;
     ictx->snap_lock.get_read();
     uint64_t image_size = ictx->get_image_size(ictx->snap_id);
@@ -3566,10 +3781,9 @@ reprotect_and_return_err:
     RWLock::RLocker owner_locker(ictx->owner_lock);
 
     // readahead
-    const md_config_t *conf = ictx->cct->_conf;
-    if (ictx->object_cacher && conf->rbd_readahead_max_bytes > 0 &&
+    if (ictx->object_cacher && ictx->readahead_max_bytes > 0 &&
 	!(op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) {
-      readahead(ictx, image_extents, conf);
+      readahead(ictx, image_extents);
     }
 
     snap_t snap_id;
@@ -3599,8 +3813,7 @@ reprotect_and_return_err:
 			         p->first, len, 0, object_extents, buffer_ofs);
         buffer_ofs += len;
       }
-
-      c->init_time(ictx, AIO_TYPE_READ);
+      c->start_op(ictx, AIO_TYPE_READ);
     }
 
     c->read_buf = buf;
@@ -3636,8 +3849,8 @@ reprotect_and_return_err:
     c->finish_adding_requests(cct);
     c->put();
 
-    ictx->perfcounter->inc(l_librbd_aio_rd);
-    ictx->perfcounter->inc(l_librbd_aio_rd_bytes, buffer_ofs);
+    ictx->perfcounter->inc(l_librbd_rd);
+    ictx->perfcounter->inc(l_librbd_rd_bytes, buffer_ofs);
   }
 
   AioCompletion *aio_create_completion() {
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index a633c9d..7eaa3c5 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -28,15 +28,6 @@ enum {
   l_librbd_discard_latency,
   l_librbd_flush,
 
-  l_librbd_aio_rd,               // read ops
-  l_librbd_aio_rd_bytes,         // bytes read
-  l_librbd_aio_rd_latency,
-  l_librbd_aio_wr,
-  l_librbd_aio_wr_bytes,
-  l_librbd_aio_wr_latency,
-  l_librbd_aio_discard,
-  l_librbd_aio_discard_bytes,
-  l_librbd_aio_discard_latency,
   l_librbd_aio_flush,
   l_librbd_aio_flush_latency,
 
@@ -101,6 +92,7 @@ namespace librbd {
   int get_old_format(ImageCtx *ictx, uint8_t *old);
   int get_size(ImageCtx *ictx, uint64_t *size);
   int get_features(ImageCtx *ictx, uint64_t *features);
+  int update_features(ImageCtx *ictx, uint64_t features, bool enabled);
   int get_overlap(ImageCtx *ictx, uint64_t *overlap);
   int get_parent_info(ImageCtx *ictx, std::string *parent_pool_name,
 		      std::string *parent_name, std::string *parent_snap_name);
@@ -117,12 +109,13 @@ namespace librbd {
   int snap_rollback(ImageCtx *ictx, const char *snap_name,
 		    ProgressContext& prog_ctx);
   int snap_remove(ImageCtx *ictx, const char *snap_name);
+  int snap_remove_helper(ImageCtx *ictx, Context* ctx, const char *snap_name);
   int snap_protect(ImageCtx *ictx, const char *snap_name);
   int snap_unprotect(ImageCtx *ictx, const char *snap_name);
   int snap_is_protected(ImageCtx *ictx, const char *snap_name,
 			bool *is_protected);
   int add_snap(ImageCtx *ictx, const char *snap_name);
-  int rm_snap(ImageCtx *ictx, const char *snap_name);
+  int rm_snap(ImageCtx *ictx, const char *snap_name, uint64_t snap_id);
   int refresh_parent(ImageCtx *ictx);
   int ictx_check(ImageCtx *ictx, bool owner_locked=false);
   int ictx_refresh(ImageCtx *ictx);
@@ -132,12 +125,14 @@ namespace librbd {
 
   int open_parent(ImageCtx *ictx);
   int open_image(ImageCtx *ictx);
-  void close_image(ImageCtx *ictx);
+  int close_image(ImageCtx *ictx);
 
   int copyup_block(ImageCtx *ictx, uint64_t offset, size_t len,
 		   const char *buf);
   int flatten(ImageCtx *ictx, ProgressContext &prog_ctx);
 
+  int rebuild_object_map(ImageCtx *ictx, ProgressContext &prog_ctx);
+
   /* cooperative locking */
   int list_lockers(ImageCtx *ictx,
 		   std::list<locker_t> *locks,
@@ -182,8 +177,8 @@ namespace librbd {
   int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len,
 		       int (*cb)(uint64_t, size_t, const char *, void *),
 		       void *arg);
-  int diff_iterate(ImageCtx *ictx, const char *fromsnapname,
-		   uint64_t off, uint64_t len,
+  int diff_iterate(ImageCtx *ictx, const char *fromsnapname, uint64_t off,
+                   uint64_t len, bool include_parent, bool whole_object,
 		   int (*cb)(uint64_t, size_t, int, void *),
 		   void *arg);
   ssize_t read(ImageCtx *ictx, uint64_t off, size_t len, char *buf, int op_flags);
@@ -197,6 +192,8 @@ namespace librbd {
 		   ProgressContext &prog_ctx);
   void async_resize_helper(ImageCtx *ictx, Context *ctx, uint64_t new_size,
                            ProgressContext& prog_ctx);
+  int async_rebuild_object_map(ImageCtx *ictx, Context *ctx,
+                               ProgressContext &prog_ctx);
 
   void aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
 		 AioCompletion *c, int op_flags);
@@ -209,6 +206,10 @@ namespace librbd {
   int flush(ImageCtx *ictx);
   int _flush(ImageCtx *ictx);
   int invalidate_cache(ImageCtx *ictx);
+  int metadata_list(ImageCtx *ictx, const string &last, uint64_t max, map<string, bufferlist> *pairs);
+  int metadata_get(ImageCtx *ictx, const std::string &key, std::string *value);
+  int metadata_set(ImageCtx *ictx, const std::string &key, const std::string &value);
+  int metadata_remove(ImageCtx *ictx, const std::string &key);
 
   ssize_t handle_sparse_read(CephContext *cct,
 			     ceph::bufferlist data_bl,
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index a99f7d4..251b96e 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -20,7 +20,7 @@
 #include "common/errno.h"
 #include "common/snap_types.h"
 #include "common/perf_counters.h"
-#include "common/WorkQueue.h"
+#include "common/TracepointProvider.h"
 #include "include/Context.h"
 #include "include/rbd/librbd.hpp"
 #include "osdc/ObjectCacher.h"
@@ -36,7 +36,11 @@
 #include <vector>
 
 #ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
 #include "tracing/librbd.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
 #else
 #define tracepoint(...)
 #endif
@@ -54,6 +58,8 @@ using librados::IoCtx;
 
 namespace {
 
+TracepointProvider::Traits tracepoint_traits("librbd_tp.so", "rbd_tracing");
+
 class C_AioReadWQ : public Context {
 public:
   C_AioReadWQ(librbd::ImageCtx *ictx, uint64_t off, size_t len,
@@ -130,7 +136,8 @@ private:
 void submit_aio_read(librbd::ImageCtx *ictx, uint64_t off, size_t len,
                      char *buf, bufferlist *pbl, librbd::AioCompletion *c,
                      int op_flags) {
-  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+  c->init_time(ictx, librbd::AIO_TYPE_READ);
+  if (ictx->non_blocking_aio) {
     ictx->aio_work_queue->queue(new C_AioReadWQ(ictx, off, len, buf, pbl, c,
                                                 op_flags));
   } else {
@@ -140,7 +147,8 @@ void submit_aio_read(librbd::ImageCtx *ictx, uint64_t off, size_t len,
 
 void submit_aio_write(librbd::ImageCtx *ictx, uint64_t off, size_t len,
                       const char *buf, librbd::AioCompletion *c, int op_flags) {
-  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+  c->init_time(ictx, librbd::AIO_TYPE_WRITE);
+  if (ictx->non_blocking_aio) {
     ictx->aio_work_queue->queue(new C_AioWriteWQ(ictx, off, len, buf, c,
                                                  op_flags));
   } else {
@@ -150,7 +158,8 @@ void submit_aio_write(librbd::ImageCtx *ictx, uint64_t off, size_t len,
 
 void submit_aio_discard(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
                         librbd::AioCompletion *c) {
-  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+  c->init_time(ictx, librbd::AIO_TYPE_DISCARD);
+  if (ictx->non_blocking_aio) {
     ictx->aio_work_queue->queue(new C_AioDiscardWQ(ictx, off, len, c));
   } else {
     librbd::aio_discard(ictx, off, len, c);
@@ -158,13 +167,18 @@ void submit_aio_discard(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
 }
 
 void submit_aio_flush(librbd::ImageCtx *ictx, librbd::AioCompletion *c) {
-  if (ictx->cct->_conf->rbd_non_blocking_aio) {
+  c->init_time(ictx, librbd::AIO_TYPE_FLUSH);
+  if (ictx->non_blocking_aio) {
     ictx->aio_work_queue->queue(new C_AioFlushWQ(ictx, c));
   } else {
     librbd::aio_flush(ictx, c);
   }
 }
 
+CephContext* get_cct(IoCtx &io_ctx) {
+  return reinterpret_cast<CephContext*>(io_ctx.cct());
+}
+
 librbd::AioCompletion* get_aio_completion(librbd::RBD::AioCompletion *comp) {
   return reinterpret_cast<librbd::AioCompletion *>(comp->pc);
 }
@@ -217,8 +231,14 @@ namespace librbd {
 		const char *snap_name)
   {
     ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false);
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
 
+    if (image.ctx != NULL) {
+      close_image(reinterpret_cast<ImageCtx*>(image.ctx));
+      image.ctx = NULL;
+    }
+
     int r = librbd::open_image(ictx);
     if (r < 0) {
       tracepoint(librbd, open_image_exit, r);
@@ -234,8 +254,14 @@ namespace librbd {
 			  const char *snap_name)
   {
     ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true);
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
 
+    if (image.ctx != NULL) {
+      close_image(reinterpret_cast<ImageCtx*>(image.ctx));
+      image.ctx = NULL;
+    }
+
     int r = librbd::open_image(ictx);
     if (r < 0) {
       tracepoint(librbd, open_image_exit, r);
@@ -249,6 +275,7 @@ namespace librbd {
 
   int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order);
     int r = librbd::create(io_ctx, name, size, order);
     tracepoint(librbd, create_exit, r, *order);
@@ -258,6 +285,7 @@ namespace librbd {
   int RBD::create2(IoCtx& io_ctx, const char *name, uint64_t size,
 		   uint64_t features, int *order)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order);
     int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0);
     tracepoint(librbd, create2_exit, r, *order);
@@ -268,6 +296,7 @@ namespace librbd {
 		   uint64_t features, int *order, uint64_t stripe_unit,
 		   uint64_t stripe_count)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count);
     int r = librbd::create(io_ctx, name, size, false, features, order,
 			  stripe_unit, stripe_count);
@@ -279,6 +308,7 @@ namespace librbd {
 		 IoCtx& c_ioctx, const char *c_name, uint64_t features,
 		 int *c_order)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
     tracepoint(librbd, clone_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features);
     int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name,
 			 features, c_order, 0, 0);
@@ -290,6 +320,7 @@ namespace librbd {
 		  IoCtx& c_ioctx, const char *c_name, uint64_t features,
 		  int *c_order, uint64_t stripe_unit, int stripe_count)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
     tracepoint(librbd, clone2_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features, stripe_unit, stripe_count);
     int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name,
 			 features, c_order, stripe_unit, stripe_count);
@@ -299,6 +330,7 @@ namespace librbd {
 
   int RBD::remove(IoCtx& io_ctx, const char *name)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
     librbd::NoOpProgressContext prog_ctx;
     int r = librbd::remove(io_ctx, name, prog_ctx);
@@ -309,6 +341,7 @@ namespace librbd {
   int RBD::remove_with_progress(IoCtx& io_ctx, const char *name,
 				ProgressContext& pctx)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
     int r = librbd::remove(io_ctx, name, pctx);
     tracepoint(librbd, remove_exit, r);
@@ -317,6 +350,7 @@ namespace librbd {
 
   int RBD::list(IoCtx& io_ctx, vector<string>& names)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
     tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id());
     int r = librbd::list(io_ctx, names);
     if (r >= 0) {
@@ -330,6 +364,7 @@ namespace librbd {
 
   int RBD::rename(IoCtx& src_io_ctx, const char *srcname, const char *destname)
   {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx));
     tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname);
     int r = librbd::rename(src_io_ctx, srcname, destname);
     tracepoint(librbd, rename_exit, r);
@@ -379,12 +414,20 @@ namespace librbd {
 
   Image::~Image()
   {
+    close();
+  }
+
+  int Image::close()
+  {
+    int r = 0;
     if (ctx) {
       ImageCtx *ictx = (ImageCtx *)ctx;
       tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
-      close_image(ictx);
-      tracepoint(librbd, close_image_exit);
+      r = close_image(ictx);
+      ctx = NULL;
+      tracepoint(librbd, close_image_exit, r);
     }
+    return r;
   }
 
   int Image::resize(uint64_t size)
@@ -442,6 +485,15 @@ namespace librbd {
     return r;
   }
 
+  int Image::update_features(uint64_t features, bool enabled)
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx *>(ctx);
+    tracepoint(librbd, update_features_enter, ictx, features, enabled);
+    int r = librbd::update_features(ictx, features, enabled);
+    tracepoint(librbd, update_features_exit, r);
+    return r;
+  }
+
   uint64_t Image::get_stripe_unit() const
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
@@ -498,6 +550,12 @@ namespace librbd {
     return r;
   }
 
+  int Image::rebuild_object_map(ProgressContext &prog_ctx)
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+    return librbd::rebuild_object_map(ictx, prog_ctx);
+  }
+
   int Image::copy(IoCtx& dest_io_ctx, const char *destname)
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
@@ -778,8 +836,25 @@ namespace librbd {
 			  void *arg)
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
-    tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len);
-    int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, cb, arg);
+    tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+               true, true);
+    int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, true, false, cb,
+                                 arg);
+    tracepoint(librbd, diff_iterate_exit, r);
+    return r;
+  }
+
+  int Image::diff_iterate2(const char *fromsnapname, uint64_t ofs, uint64_t len,
+                           bool include_parent, bool whole_object,
+                           int (*cb)(uint64_t, size_t, int, void *), void *arg)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+              ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+              include_parent, whole_object);
+    int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, include_parent,
+                                whole_object, cb, arg);
     tracepoint(librbd, diff_iterate_exit, r);
     return r;
   }
@@ -911,6 +986,52 @@ namespace librbd {
     return r;
   }
 
+  int Image::metadata_get(const std::string &key, std::string *value)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_get_enter, ictx, key.c_str());
+    int r = librbd::metadata_get(ictx, key, value);
+    if (r < 0) {
+      tracepoint(librbd, metadata_get_exit, r, key.c_str(), NULL);
+    } else {
+      tracepoint(librbd, metadata_get_exit, r, key.c_str(), value->c_str());
+    }
+    return r;
+  }
+
+  int Image::metadata_set(const std::string &key, const std::string &value)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_set_enter, ictx, key.c_str(), value.c_str());
+    int r = librbd::metadata_set(ictx, key, value);
+    tracepoint(librbd, metadata_set_exit, r);
+    return r;
+  }
+
+  int Image::metadata_remove(const std::string &key)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_remove_enter, ictx, key.c_str());
+    int r = librbd::metadata_remove(ictx, key);
+    tracepoint(librbd, metadata_remove_exit, r);
+    return r;
+  }
+
+  int Image::metadata_list(const std::string &start, uint64_t max, map<string, bufferlist> *pairs)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_list_enter, ictx);
+    int r = librbd::metadata_list(ictx, start, max, pairs);
+    if (r >= 0) {
+      for (map<string, bufferlist>::iterator it = pairs->begin();
+           it != pairs->end(); ++it) {
+        tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str());
+      }
+    }
+    tracepoint(librbd, metadata_list_exit, r);
+    return r;
+  }
+
 } // namespace librbd
 
 extern "C" void rbd_version(int *major, int *minor, int *extra)
@@ -928,6 +1049,7 @@ extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size)
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id());
   vector<string> cpp_names;
   int r = librbd::list(io_ctx, cpp_names);
@@ -969,6 +1091,7 @@ extern "C" int rbd_create(rados_ioctx_t p, const char *name, uint64_t size, int
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order);
   int r = librbd::create(io_ctx, name, size, order);
   tracepoint(librbd, create_exit, r, *order);
@@ -981,6 +1104,7 @@ extern "C" int rbd_create2(rados_ioctx_t p, const char *name,
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order);
   int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0);
   tracepoint(librbd, create2_exit, r, *order);
@@ -994,6 +1118,7 @@ extern "C" int rbd_create3(rados_ioctx_t p, const char *name,
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   tracepoint(librbd, create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count);
   int r = librbd::create(io_ctx, name, size, false, features, order,
 			stripe_unit, stripe_count);
@@ -1008,6 +1133,7 @@ extern "C" int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
   librados::IoCtx p_ioc, c_ioc;
   librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc);
   librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc));
   tracepoint(librbd, clone_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features);
   int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name,
 		       features, c_order, 0, 0);
@@ -1023,6 +1149,7 @@ extern "C" int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
   librados::IoCtx p_ioc, c_ioc;
   librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc);
   librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc));
   tracepoint(librbd, clone2_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features, stripe_unit, stripe_count);
   int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name,
 		       features, c_order, stripe_unit, stripe_count);
@@ -1034,6 +1161,7 @@ extern "C" int rbd_remove(rados_ioctx_t p, const char *name)
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
   librbd::NoOpProgressContext prog_ctx;
   int r = librbd::remove(io_ctx, name, prog_ctx);
@@ -1046,6 +1174,7 @@ extern "C" int rbd_remove_with_progress(rados_ioctx_t p, const char *name,
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name);
   librbd::CProgressContext prog_ctx(cb, cbdata);
   int r = librbd::remove(io_ctx, name, prog_ctx);
@@ -1129,6 +1258,7 @@ extern "C" int rbd_rename(rados_ioctx_t src_p, const char *srcname,
 {
   librados::IoCtx src_io_ctx;
   librados::IoCtx::from_rados_ioctx_t(src_p, src_io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx));
   tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname);
   int r = librbd::rename(src_io_ctx, srcname, destname);
   tracepoint(librbd, rename_exit, r);
@@ -1140,6 +1270,7 @@ extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image,
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
 						false);
   tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
@@ -1155,6 +1286,7 @@ extern "C" int rbd_open_read_only(rados_ioctx_t p, const char *name,
 {
   librados::IoCtx io_ctx;
   librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
   librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx,
 						true);
   tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
@@ -1169,9 +1301,9 @@ extern "C" int rbd_close(rbd_image_t image)
 {
   librbd::ImageCtx *ctx = (librbd::ImageCtx *)image;
   tracepoint(librbd, close_image_enter, ctx, ctx->name.c_str(), ctx->id.c_str());
-  librbd::close_image(ctx);
-  tracepoint(librbd, close_image_exit);
-  return 0;
+  int r = librbd::close_image(ctx);
+  tracepoint(librbd, close_image_exit, r);
+  return r;
 }
 
 extern "C" int rbd_resize(rbd_image_t image, uint64_t size)
@@ -1232,6 +1364,17 @@ extern "C" int rbd_get_features(rbd_image_t image, uint64_t *features)
   return r;
 }
 
+extern "C" int rbd_update_features(rbd_image_t image, uint64_t features,
+                                  uint8_t enabled)
+{
+  librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx *>(image);
+  bool features_enabled = enabled != 0;
+  tracepoint(librbd, update_features_enter, ictx, features, features_enabled);
+  int r = librbd::update_features(ictx, features, features_enabled);
+  tracepoint(librbd, update_features_exit, r);
+  return r;
+}
+
 extern "C" int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
@@ -1322,6 +1465,14 @@ extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner)
   return r;
 }
 
+extern "C" int rbd_rebuild_object_map(rbd_image_t image,
+                                      librbd_progress_fn_t cb, void *cbdata)
+{
+  librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  librbd::CProgressContext prog_ctx(cb, cbdata);
+  return librbd::rebuild_object_map(ictx, prog_ctx);
+}
+
 /* snapshots */
 extern "C" int rbd_snap_create(rbd_image_t image, const char *snap_name)
 {
@@ -1678,8 +1829,27 @@ extern "C" int rbd_diff_iterate(rbd_image_t image,
 				void *arg)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
-  tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len);
-  int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, cb, arg);
+  tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+             true, true);
+  int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, true, false, cb,
+                               arg);
+  tracepoint(librbd, diff_iterate_exit, r);
+  return r;
+}
+
+extern "C" int rbd_diff_iterate2(rbd_image_t image, const char *fromsnapname,
+                                uint64_t ofs, uint64_t len,
+                                uint8_t include_parent, uint8_t whole_object,
+                                int (*cb)(uint64_t, size_t, int, void *),
+                                void *arg)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+            ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+            include_parent != 0, whole_object != 0);
+  int r = librbd::diff_iterate(ictx, fromsnapname, ofs, len, include_parent,
+                              whole_object, cb, arg);
   tracepoint(librbd, diff_iterate_exit, r);
   return r;
 }
@@ -1812,6 +1982,82 @@ extern "C" int rbd_invalidate_cache(rbd_image_t image)
   return r;
 }
 
+extern "C" int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *vallen)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  string val_s;
+  tracepoint(librbd, metadata_get_enter, ictx, key);
+  int r = librbd::metadata_get(ictx, key, &val_s);
+  if (r < 0) {
+    tracepoint(librbd, metadata_get_exit, r, key, NULL);
+    return r;
+  }
+  if (*vallen < val_s.size()) {
+    r = -ERANGE;
+    *vallen = val_s.size();
+    tracepoint(librbd, metadata_get_exit, r, key, NULL);
+  } else {
+    strncpy(value, val_s.c_str(), val_s.size());
+    tracepoint(librbd, metadata_get_exit, r, key, value);
+  }
+  return r;
+}
+
+extern "C" int rbd_metadata_set(rbd_image_t image, const char *key, const char *value)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, metadata_set_enter, ictx, key, value);
+  int r = librbd::metadata_set(ictx, key, value);
+  tracepoint(librbd, metadata_set_exit, r);
+  return r;
+}
+
+extern "C" int rbd_metadata_remove(rbd_image_t image, const char *key)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, metadata_remove_enter, ictx, key);
+  int r = librbd::metadata_remove(ictx, key);
+  tracepoint(librbd, metadata_remove_exit, r);
+  return r;
+}
+
+extern "C" int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+                                 char *key, size_t *key_len, char *value, size_t *val_len)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, metadata_list_enter, ictx);
+  map<string, bufferlist> pairs;
+  int r = librbd::metadata_list(ictx, start, max, &pairs);
+  size_t key_total_len = 0, val_total_len = 0;
+  bool too_short = false;
+  for (map<string, bufferlist>::iterator it = pairs.begin();
+       it != pairs.end(); ++it) {
+    key_total_len += it->first.size() + 1;
+    val_total_len += it->second.length() + 1;
+  }
+  if (*key_len < key_total_len || *val_len < key_total_len)
+    too_short = true;
+  *key_len = key_total_len;
+  *val_len = val_total_len;
+  if (too_short) {
+    tracepoint(librbd, metadata_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  char *key_p = key, *value_p = value;
+
+  for (map<string, bufferlist>::iterator it = pairs.begin();
+       it != pairs.end(); ++it) {
+    strncpy(key_p, it->first.c_str(), it->first.size());
+    key_p += it->first.size() + 1;
+    strncpy(value_p, it->second.c_str(), it->second.length());
+    value_p += it->second.length() + 1;
+    tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str());
+  }
+  tracepoint(librbd, metadata_list_exit, r);
+  return r;
+}
+
 extern "C" int rbd_aio_is_complete(rbd_completion_t c)
 {
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
diff --git a/src/librbd/parent_types.h b/src/librbd/parent_types.h
index 4dcc452..5c54953 100644
--- a/src/librbd/parent_types.h
+++ b/src/librbd/parent_types.h
@@ -3,10 +3,10 @@
 #ifndef CEPH_LIBRBD_PARENT_TYPES_H
 #define CEPH_LIBRBD_PARENT_TYPES_H
 
-// parent_spec uniquely identifies a parent in the clone relationship
-// (clone(parent) creates child, then parent_spec <-> child_imageid)
-
 namespace librbd {
+  /** @brief Unique identification of a parent in clone relationship.
+   * Cloning an image creates a child image that keeps a reference
+   * to its parent. This allows copy-on-write images. */
   struct parent_spec {
     int64_t pool_id;
     string image_id;
@@ -24,9 +24,18 @@ namespace librbd {
     }
   };
 
+  /// Full information about an image's parent.
   struct parent_info {
+    /// Identification of the parent.
     parent_spec spec;
+
+    /** @brief Where the portion of data shared with the child image ends.
+     * Since images can be resized multiple times, the portion of data shared
+     * with the child image is not necessarily min(parent size, child size).
+     * If the child image is first shrunk and then enlarged, the common portion
+     * will be shorter. */
     uint64_t overlap;
+
     parent_info() : overlap(0) {}
   };
 }
diff --git a/src/libs3/COPYING b/src/libs3/COPYING
deleted file mode 100644
index 94a9ed0..0000000
--- a/src/libs3/COPYING
+++ /dev/null
@@ -1,674 +0,0 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
-
-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
-  13. Use with the GNU Affero General Public License.
-
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
-  14. Revised Versions of this License.
-
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
-  15. Disclaimer of Warranty.
-
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<http://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/src/libs3/ChangeLog b/src/libs3/ChangeLog
deleted file mode 100644
index fc8797a..0000000
--- a/src/libs3/ChangeLog
+++ /dev/null
@@ -1,16 +0,0 @@
-Thu Sep 18 10:03:02 NZST 2008   bryan at ischo.com
-	* This file is no longer maintained, sorry
-
-Sat Aug  9 13:44:21 NZST 2008   bryan at ischo.com
-	* Fixed bug wherein keys with non-URI-safe characters did not work
-	  correctly because they were not being URI-encoded in the request UR
-	* Split RPM and DEB packages into normal and devel packages
-
-Fri Aug  8 22:40:19 NZST 2008	bryan at ischo.com
-	* Branched 0.4
-	* Created RPM and Debian packaging
-
-Tue Aug  5 08:52:33 NZST 2008	bryan at ischo.com
-	* Bumped version number to 0.3
-	* Moved Makefile to GNUmakefile, added shared library build
-	* Added a bunch of GNU standard files (README, INSTALL, ChangeLog, etc)
diff --git a/src/libs3/GNUmakefile b/src/libs3/GNUmakefile
deleted file mode 100644
index f387e30..0000000
--- a/src/libs3/GNUmakefile
+++ /dev/null
@@ -1,419 +0,0 @@
-# GNUmakefile
-# 
-# Copyright 2008 Bryan Ischo <bryan at ischo.com>
-# 
-# This file is part of libs3.
-# 
-# libs3 is free software: you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation, version 3 of the License.
-#
-# In addition, as a special exception, the copyright holders give
-# permission to link the code of this library and its programs with the
-# OpenSSL library, and distribute linked combinations including the two.
-#
-# libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License version 3
-# along with libs3, in a file named COPYING.  If not, see
-# <http://www.gnu.org/licenses/>.
-
-# I tried to use the autoconf/automake/autolocal/etc (i.e. autohell) tools
-# but I just couldn't stomach them.  Since this is a Makefile for POSIX
-# systems, I will simply do away with autohell completely and use a GNU
-# Makefile.  GNU make ought to be available pretty much everywhere, so I
-# don't see this being a significant issue for portability.
-
-# All commands assume a GNU compiler.  For systems which do not use a GNU
-# compiler, write scripts with the same names as these commands, and taking
-# the same arguments, and translate the arguments and commands into the
-# appropriate non-POSIX ones as needed.  libs3 assumes a GNU toolchain as
-# the most portable way to build software possible.  Non-POSIX, non-GNU
-# systems can do the work of supporting this build infrastructure.
-
-
-# --------------------------------------------------------------------------
-# Set libs3 version number, unless it is already set.
-# This is trunk0.trunk0 on the libs3 git master branch; release branches
-# are created with this set to specific version numbers when releases are
-# made.
-
-LIBS3_VER_MAJOR ?= trunk0
-LIBS3_VER_MINOR ?= trunk0
-LIBS3_VER := $(LIBS3_VER_MAJOR).$(LIBS3_VER_MINOR)
-
-
-# -----------------------------------------------------------------------------
-# Determine verbosity.  VERBOSE_SHOW should be prepended to every command which
-# should only be displayed if VERBOSE is set.  QUIET_ECHO may be used to
-# echo text only if VERBOSE is not set.  Typically, a VERBOSE_SHOW command will
-# be paired with a QUIET_ECHO command, to provide a command which is displayed
-# in VERBOSE mode, along with text which is displayed in non-VERBOSE mode to
-# describe the command.
-#
-# No matter what VERBOSE is defined to, it ends up as true if it's defined.
-# This will be weird if you defined VERBOSE=false in the environment, and we
-# switch it to true here; but the meaning of VERBOSE is, "if it's defined to
-# any value, then verbosity is turned on".  So don't define VERBOSE if you
-# don't want verbosity in the build process.
-# -----------------------------------------------------------------------------
-
-ifdef VERBOSE
-        VERBOSE = true
-        VERBOSE_ECHO = @ echo
-        VERBOSE_SHOW =
-        QUIET_ECHO = @ echo > /dev/null
-else
-        VERBOSE = false
-        VERBOSE_ECHO = @ echo > /dev/null
-        VERBOSE_SHOW = @
-        QUIET_ECHO = @ echo
-endif
-
-
-# --------------------------------------------------------------------------
-# BUILD directory
-ifndef BUILD
-    ifdef DEBUG
-        BUILD := build-debug
-    else
-        BUILD := build
-    endif
-endif
-
-
-# --------------------------------------------------------------------------
-# DESTDIR directory
-ifndef DESTDIR
-    DESTDIR := /usr
-endif
-
-# --------------------------------------------------------------------------
-# Compiler CC handling
-ifndef CC
-    CC := gcc
-endif
-
-# --------------------------------------------------------------------------
-# Acquire configuration information for libraries that libs3 depends upon
-
-ifndef CURL_LIBS
-    CURL_LIBS := $(shell curl-config --libs)
-endif
-
-ifndef CURL_CFLAGS
-    CURL_CFLAGS := $(shell curl-config --cflags)
-endif
-
-ifndef LIBXML2_LIBS
-    LIBXML2_LIBS := $(shell xml2-config --libs)
-endif
-
-ifndef LIBXML2_CFLAGS
-    LIBXML2_CFLAGS := $(shell xml2-config --cflags)
-endif
-
-
-# --------------------------------------------------------------------------
-# These CFLAGS assume a GNU compiler.  For other compilers, write a script
-# which converts these arguments into their equivalent for that particular
-# compiler.
-
-ifndef CFLAGS
-    ifdef DEBUG
-        CFLAGS := -g
-    else
-        CFLAGS := -O3
-    endif
-endif
-
-CFLAGS += -Wall -Werror -Wshadow -Wextra -Iinc \
-          $(CURL_CFLAGS) $(LIBXML2_CFLAGS) \
-          -DLIBS3_VER_MAJOR=\"$(LIBS3_VER_MAJOR)\" \
-          -DLIBS3_VER_MINOR=\"$(LIBS3_VER_MINOR)\" \
-          -DLIBS3_VER=\"$(LIBS3_VER)\" \
-          -D__STRICT_ANSI__ \
-          -D_ISOC99_SOURCE \
-          -D_POSIX_C_SOURCE=200112L
-
-LDFLAGS = $(CURL_LIBS) $(LIBXML2_LIBS) -lpthread
-
-
-# --------------------------------------------------------------------------
-# Default targets are everything
-
-.PHONY: all
-all: exported test
-
-
-# --------------------------------------------------------------------------
-# Exported targets are the library and driver program
-
-.PHONY: exported
-exported: libs3 s3 headers
-
-
-# --------------------------------------------------------------------------
-# Install target
-
-# adding empty install target, don't want to install anything when integrated
-# with ceph
-.PHONY: install
-install:
-
-# this is the original install target
-.PHONY: install-all
-install-all: exported
-	$(QUIET_ECHO) $(DESTDIR)/bin/s3: Installing executable
-	$(VERBOSE_SHOW) install -Dps -m u+rwx,go+rx $(BUILD)/bin/s3 \
-                    $(DESTDIR)/bin/s3
-	$(QUIET_ECHO) \
-        $(DESTDIR)/lib/libs3.so.$(LIBS3_VER): Installing shared library
-	$(VERBOSE_SHOW) install -Dps -m u+rw,go+r \
-               $(BUILD)/lib/libs3.so.$(LIBS3_VER_MAJOR) \
-               $(DESTDIR)/lib/libs3.so.$(LIBS3_VER)
-	$(QUIET_ECHO) \
-        $(DESTDIR)/lib/libs3.so.$(LIBS3_VER_MAJOR): Linking shared library
-	$(VERBOSE_SHOW) ln -sf libs3.so.$(LIBS3_VER) \
-               $(DESTDIR)/lib/libs3.so.$(LIBS3_VER_MAJOR)
-	$(QUIET_ECHO) $(DESTDIR)/lib/libs3.so: Linking shared library
-	$(VERBOSE_SHOW) ln -sf libs3.so.$(LIBS3_VER_MAJOR) $(DESTDIR)/lib/libs3.so
-	$(QUIET_ECHO) $(DESTDIR)/lib/libs3.a: Installing static library
-	$(VERBOSE_SHOW) install -Dp -m u+rw,go+r $(BUILD)/lib/libs3.a \
-                    $(DESTDIR)/lib/libs3.a
-	$(QUIET_ECHO) $(DESTDIR)/include/libs3.h: Installing header
-	$(VERBOSE_SHOW) install -Dp -m u+rw,go+r $(BUILD)/include/libs3.h \
-                    $(DESTDIR)/include/libs3.h
-
-
-# --------------------------------------------------------------------------
-# Uninstall target
-
-.PHONY: uninstall
-uninstall:
-	$(QUIET_ECHO) Installed files: Uninstalling
-	$(VERBOSE_SHOW) \
-	    rm -f $(DESTDIR)/bin/s3 \
-              $(DESTDIR)/include/libs3.h \
-              $(DESTDIR)/lib/libs3.a \
-              $(DESTDIR)/lib/libs3.so \
-              $(DESTDIR)/lib/libs3.so.$(LIBS3_VER_MAJOR) \
-              $(DESTDIR)/lib/libs3.so.$(LIBS3_VER)
-
-
-# --------------------------------------------------------------------------
-# Compile target patterns
-
-$(BUILD)/obj/%.o: src/%.c
-	$(QUIET_ECHO) $@: Compiling object
-	@ mkdir -p $(dir $(BUILD)/dep/$<)
-	@ $(CC) $(CFLAGS) -M -MG -MQ $@ -DCOMPILINGDEPENDENCIES \
-        -o $(BUILD)/dep/$(<:%.c=%.d) -c $<
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) $(CC) $(CFLAGS) -o $@ -c $<
-
-$(BUILD)/obj/%.do: src/%.c
-	$(QUIET_ECHO) $@: Compiling dynamic object
-	@ mkdir -p $(dir $(BUILD)/dep/$<)
-	@ $(CC) $(CFLAGS) -M -MG -MQ $@ -DCOMPILINGDEPENDENCIES \
-        -o $(BUILD)/dep/$(<:%.c=%.dd) -c $<
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) $(CC) $(CFLAGS) -fpic -fPIC -o $@ -c $< 
-
-
-# --------------------------------------------------------------------------
-# libs3 library targets
-
-LIBS3_SHARED = $(BUILD)/lib/libs3.so.$(LIBS3_VER_MAJOR)
-LIBS3_STATIC = $(BUILD)/lib/libs3.a
-
-.PHONY: libs3
-libs3: $(LIBS3_SHARED) $(LIBS3_STATIC)
-
-LIBS3_SOURCES := acl.c bucket.c error_parser.c general.c \
-                 object.c request.c request_context.c \
-                 response_headers_handler.c service_access_logging.c \
-                 service.c simplexml.c util.c
-
-$(LIBS3_SHARED): $(LIBS3_SOURCES:%.c=$(BUILD)/obj/%.do)
-	$(QUIET_ECHO) $@: Building shared library
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) $(CC) -shared -Wl,-soname,libs3.so.$(LIBS3_VER_MAJOR) \
-        -o $@ $^ $(LDFLAGS)
-
-$(LIBS3_STATIC): $(LIBS3_SOURCES:%.c=$(BUILD)/obj/%.o)
-	$(QUIET_ECHO) $@: Building static library
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) $(AR) cr $@ $^
-
-
-# --------------------------------------------------------------------------
-# Driver program targets
-
-.PHONY: s3
-s3: $(BUILD)/bin/s3
-
-$(BUILD)/bin/s3: $(BUILD)/obj/s3.o $(LIBS3_SHARED)
-	$(QUIET_ECHO) $@: Building executable
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) $(CC) -o $@ $^ $(LDFLAGS)
-
-
-# --------------------------------------------------------------------------
-# libs3 header targets
-
-.PHONY: headers
-headers: $(BUILD)/include/libs3.h
-
-$(BUILD)/include/libs3.h: inc/libs3.h
-	$(QUIET_ECHO) $@: Linking header
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) ln -sf $(abspath $<) $@
-
-
-# --------------------------------------------------------------------------
-# Test targets
-
-.PHONY: test
-test: $(BUILD)/bin/testsimplexml
-
-$(BUILD)/bin/testsimplexml: $(BUILD)/obj/testsimplexml.o $(LIBS3_STATIC)
-	$(QUIET_ECHO) $@: Building executable
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) $(CC) -o $@ $^ $(LIBXML2_LIBS)
-
-
-# --------------------------------------------------------------------------
-# Check target
-
-check:
-distdir:
-dist:
-
-# --------------------------------------------------------------------------
-# Clean target
-
-.PHONY: clean
-clean:
-	$(QUIET_ECHO) $(BUILD): Cleaning
-	$(VERBOSE_SHOW) rm -rf $(BUILD)
-
-.PHONY: distclean
-distclean:
-	$(QUIET_ECHO) $(BUILD): Cleaning
-	$(VERBOSE_SHOW) rm -rf $(BUILD)
-
-
-# --------------------------------------------------------------------------
-# Clean dependencies target
-
-.PHONY: cleandeps
-cleandeps:
-	$(QUIET_ECHO) $(BUILD)/dep: Cleaning dependencies
-	$(VERBOSE_SHOW) rm -rf $(BUILD)/dep
-
-
-# --------------------------------------------------------------------------
-# Dependencies
-
-ALL_SOURCES := $(LIBS3_SOURCES) s3.c testsimplexml.c
-
-$(foreach i, $(ALL_SOURCES), $(eval -include $(BUILD)/dep/src/$(i:%.c=%.d)))
-$(foreach i, $(ALL_SOURCES), $(eval -include $(BUILD)/dep/src/$(i:%.c=%.dd)))
-
-
-# --------------------------------------------------------------------------
-# Debian package target
-
-DEBPKG = $(BUILD)/pkg/libs3_$(LIBS3_VER).deb
-DEBDEVPKG = $(BUILD)/pkg/libs3-dev_$(LIBS3_VER).deb
-
-.PHONY: deb
-deb: $(DEBPKG) $(DEBDEVPKG)
-
-$(DEBPKG): DEBARCH = $(shell dpkg-architecture | grep ^DEB_BUILD_ARCH= | \
-                       cut -d '=' -f 2)
-$(DEBPKG): exported $(BUILD)/deb/DEBIAN/control $(BUILD)/deb/DEBIAN/shlibs \
-           $(BUILD)/deb/DEBIAN/postinst \
-           $(BUILD)/deb/usr/share/doc/libs3/changelog.gz \
-           $(BUILD)/deb/usr/share/doc/libs3/changelog.Debian.gz \
-           $(BUILD)/deb/usr/share/doc/libs3/copyright
-	DESTDIR=$(BUILD)/deb/usr $(MAKE) install
-	rm -rf $(BUILD)/deb/usr/include
-	rm -f $(BUILD)/deb/usr/lib/libs3.a
-	@mkdir -p $(dir $@)
-	fakeroot dpkg-deb -b $(BUILD)/deb $@
-	mv $@ $(BUILD)/pkg/libs3_$(LIBS3_VER)_$(DEBARCH).deb
-
-$(DEBDEVPKG): DEBARCH = $(shell dpkg-architecture | grep ^DEB_BUILD_ARCH= | \
-                          cut -d '=' -f 2)
-$(DEBDEVPKG): exported $(BUILD)/deb-dev/DEBIAN/control \
-           $(BUILD)/deb-dev/usr/share/doc/libs3-dev/changelog.gz \
-           $(BUILD)/deb-dev/usr/share/doc/libs3-dev/changelog.Debian.gz \
-           $(BUILD)/deb-dev/usr/share/doc/libs3-dev/copyright
-	DESTDIR=$(BUILD)/deb-dev/usr $(MAKE) install
-	rm -rf $(BUILD)/deb-dev/usr/bin
-	rm -f $(BUILD)/deb-dev/usr/lib/libs3.so*
-	@mkdir -p $(dir $@)
-	fakeroot dpkg-deb -b $(BUILD)/deb-dev $@
-	mv $@ $(BUILD)/pkg/libs3-dev_$(LIBS3_VER)_$(DEBARCH).deb
-
-$(BUILD)/deb/DEBIAN/control: debian/control
-	@mkdir -p $(dir $@)
-	echo -n "Depends: " > $@
-	dpkg-shlibdeps -Sbuild -O $(BUILD)/lib/libs3.so.$(LIBS3_VER_MAJOR) | \
-            cut -d '=' -f 2- >> $@
-	sed -e 's/LIBS3_VERSION/$(LIBS3_VER)/' \
-            < $< | sed -e 's/DEBIAN_ARCHITECTURE/$(DEBARCH)/' | \
-            grep -v ^Source: >> $@
-
-$(BUILD)/deb-dev/DEBIAN/control: debian/control.dev
-	@mkdir -p $(dir $@)
-	sed -e 's/LIBS3_VERSION/$(LIBS3_VER)/' \
-            < $< | sed -e 's/DEBIAN_ARCHITECTURE/$(DEBARCH)/' > $@
-
-$(BUILD)/deb/DEBIAN/shlibs:
-	echo -n "libs3 $(LIBS3_VER_MAJOR) libs3 " > $@
-	echo "(>= $(LIBS3_VER))" >> $@
-
-$(BUILD)/deb/DEBIAN/postinst: debian/postinst
-	@mkdir -p $(dir $@)
-	cp $< $@
-
-$(BUILD)/deb/usr/share/doc/libs3/copyright: LICENSE
-	@mkdir -p $(dir $@)
-	cp $< $@
-	@echo >> $@
-	@echo -n "An alternate location for the GNU General Public " >> $@
-	@echo "License version 3 on Debian" >> $@
-	@echo "systems is /usr/share/common-licenses/GPL-3." >> $@
-
-$(BUILD)/deb-dev/usr/share/doc/libs3-dev/copyright: LICENSE
-	@mkdir -p $(dir $@)
-	cp $< $@
-	@echo >> $@
-	@echo -n "An alternate location for the GNU General Public " >> $@
-	@echo "License version 3 on Debian" >> $@
-	@echo "systems is /usr/share/common-licenses/GPL-3." >> $@
-
-$(BUILD)/deb/usr/share/doc/libs3/changelog.gz: debian/changelog
-	@mkdir -p $(dir $@)
-	gzip --best -c $< > $@
-
-$(BUILD)/deb-dev/usr/share/doc/libs3-dev/changelog.gz: debian/changelog
-	@mkdir -p $(dir $@)
-	gzip --best -c $< > $@
-
-$(BUILD)/deb/usr/share/doc/libs3/changelog.Debian.gz: debian/changelog.Debian
-	@mkdir -p $(dir $@)
-	gzip --best -c $< > $@
-
-$(BUILD)/deb-dev/usr/share/doc/libs3-dev/changelog.Debian.gz: \
-    debian/changelog.Debian
-	@mkdir -p $(dir $@)
-	gzip --best -c $< > $@
-
-
diff --git a/src/libs3/GNUmakefile.mingw b/src/libs3/GNUmakefile.mingw
deleted file mode 100644
index 175d1e9..0000000
--- a/src/libs3/GNUmakefile.mingw
+++ /dev/null
@@ -1,296 +0,0 @@
-# GNUmakefile.mingw
-# 
-# Copyright 2008 Bryan Ischo <bryan at ischo.com>
-# 
-# This file is part of libs3.
-# 
-# libs3 is free software: you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation, version 3 of the License.
-#
-# In addition, as a special exception, the copyright holders give
-# permission to link the code of this library and its programs with the
-# OpenSSL library, and distribute linked combinations including the two.
-#
-# libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License version 3
-# along with libs3, in a file named COPYING.  If not, see
-# <http://www.gnu.org/licenses/>.
-
-# I tried to use the autoconf/automake/autolocal/etc (i.e. autohell) tools
-# but I just couldn't stomach them.  Since this is a Makefile for POSIX
-# systems, I will simply do away with autohell completely and use a GNU
-# Makefile.  GNU make ought to be available pretty much everywhere, so I
-# don't see this being a significant issue for portability.
-
-# All commands assume a GNU compiler.  For systems which do not use a GNU
-# compiler, write scripts with the same names as these commands, and taking
-# the same arguments, and translate the arguments and commands into the
-# appropriate non-POSIX ones as needed.  libs3 assumes a GNU toolchain as
-# the most portable way to build software possible.  Non-POSIX, non-GNU
-# systems can do the work of supporting this build infrastructure.
-
-
-# --------------------------------------------------------------------------
-# Set libs3 version number, unless it is already set.
-# This is trunk0.trunk0 on the libs3 git master branch; release branches
-# are created with this set to specific version numbers when releases are
-# made.
-
-LIBS3_VER_MAJOR ?= trunk0
-LIBS3_VER_MINOR ?= trunk0
-LIBS3_VER := $(LIBS3_VER_MAJOR).$(LIBS3_VER_MINOR)
-
-
-# -----------------------------------------------------------------------------
-# Determine verbosity.  VERBOSE_SHOW should be prepended to every command which
-# should only be displayed if VERBOSE is set.  QUIET_ECHO may be used to
-# echo text only if VERBOSE is not set.  Typically, a VERBOSE_SHOW command will
-# be paired with a QUIET_ECHO command, to provide a command which is displayed
-# in VERBOSE mode, along with text which is displayed in non-VERBOSE mode to
-# describe the command.
-#
-# No matter what VERBOSE is defined to, it ends up as true if it's defined.
-# This will be weird if you defined VERBOSE=false in the environment, and we
-# switch it to true here; but the meaning of VERBOSE is, "if it's defined to
-# any value, then verbosity is turned on".  So don't define VERBOSE if you
-# don't want verbosity in the build process.
-# -----------------------------------------------------------------------------
-
-ifdef VERBOSE
-        VERBOSE = true
-        VERBOSE_ECHO = @ echo
-        VERBOSE_SHOW =
-        QUIET_ECHO = @ echo >nul
-else
-        VERBOSE = false
-        VERBOSE_ECHO = @ echo >nul
-        VERBOSE_SHOW = @
-        QUIET_ECHO = @ echo
-endif
-
-
-# --------------------------------------------------------------------------
-# BUILD directory
-ifndef BUILD
-    ifdef DEBUG
-        BUILD := build-debug
-    else
-        BUILD := build
-    endif
-endif
-
-
-# --------------------------------------------------------------------------
-# DESTDIR directory
-ifndef DESTDIR
-    DESTDIR := libs3-$(LIBS3_VER)
-endif
-
-
-# --------------------------------------------------------------------------
-# Acquire configuration information for libraries that libs3 depends upon
-
-ifndef CURL_LIBS
-    CURL_LIBS := -Lc:\libs3-libs\bin -lcurl
-endif
-
-ifndef CURL_CFLAGS
-    CURL_CFLAGS := -Ic:\libs3-libs\include
-endif
-
-ifndef LIBXML2_LIBS
-    LIBXML2_LIBS := -Lc:\libs3-libs\bin -lxml2
-endif
-
-ifndef LIBXML2_CFLAGS
-    LIBXML2_CFLAGS := -Ic:\libs3-libs\include
-endif
-
-
-# --------------------------------------------------------------------------
-# These CFLAGS assume a GNU compiler.  For other compilers, write a script
-# which converts these arguments into their equivalent for that particular
-# compiler.
-
-ifndef CFLAGS
-    ifdef DEBUG
-        CFLAGS := -g
-    else
-        CFLAGS := -O3
-    endif
-endif
-
-CFLAGS += -Wall -Werror -Wshadow -Wextra -Iinc \
-          $(CURL_CFLAGS) $(LIBXML2_CFLAGS) \
-          -DLIBS3_VER_MAJOR=\"$(LIBS3_VER_MAJOR)\" \
-          -DLIBS3_VER_MINOR=\"$(LIBS3_VER_MINOR)\" \
-          -DLIBS3_VER=\"$(LIBS3_VER)\" \
-          -D__STRICT_ANSI__ \
-          -D_ISOC99_SOURCE \
-          -D_POSIX_C_SOURCE=200112L \
-          -Dsleep=Sleep -DSLEEP_UNITS_PER_SECOND=1000 \
-          -DFOPEN_EXTRA_FLAGS=\"b\" \
-          -Iinc/mingw -include windows.h
-
-LDFLAGS = $(CURL_LIBS) $(LIBXML2_LIBS)
-
-# --------------------------------------------------------------------------
-# Default targets are everything
-
-.PHONY: all
-all: exported test
-
-
-# --------------------------------------------------------------------------
-# Exported targets are the library and driver program
-
-.PHONY: exported
-exported: libs3 s3 headers
-
-
-# --------------------------------------------------------------------------
-# Install target
-
-.PHONY: install
-install: exported
-	$(QUIET_ECHO) $(DESTDIR)/bin/s3.exe: Installing executable
-	- @ mkdir $(DESTDIR)\bin 2>&1 | echo >nul
-	$(VERBOSE_SHOW) copy $(BUILD)\bin\s3.exe $(DESTDIR)\bin\s3.exe >nul
-	$(QUIET_ECHO) $(DESTDIR)/bin/libs3/dll: Installing dynamic library
-	$(VERBOSE_SHOW) copy $(BUILD)\bin\libs3.dll $(DESTDIR)\bin\libs3.dll >nul
-	$(QUIET_ECHO) $(DESTDIR)/lib/libs3.a: Installing static library
-	- @ mkdir $(DESTDIR)\lib 2>&1 | echo >nul
-	$(VERBOSE_SHOW) copy $(BUILD)\lib\libs3.a $(DESTDIR)\lib\libs3.a >nul
-	$(QUIET_ECHO) $(DESTDIR)/lib/libs3.def: Installing def file
-	$(VERBOSE_SHOW) copy mswin\libs3.def $(DESTDIR)\lib\libs3.def >nul
-	- @ mkdir $(DESTDIR)\include 2>&1 | echo >nul
-	$(QUIET_ECHO) $(DESTDIR)/include/libs3.h: Copying header
-	$(VERBOSE_SHOW) copy $(BUILD)\include\libs3.h \
-                    $(DESTDIR)\include\libs3.h >nul
-	$(QUIET_ECHO) $(DESTDIR)/LICENSE: Copying license
-	$(VERBOSE_SHOW) copy LICENSE $(DESTDIR)\LICENSE >nul
-	$(QUIET_ECHO) $(DESTDIR)/COPYING: Copying license
-	$(VERBOSE_SHOW) copy COPYING $(DESTDIR)\COPYING >nul
-
-
-# --------------------------------------------------------------------------
-# Uninstall target
-
-.PHONY: uninstall
-uninstall:
-	$(QUIET_ECHO) Installed files: Uninstalling
-	$(VERBOSE_SHOW) \
-	    del $(DESTDIR)\bin\s3.exe \
-            $(DESTDIR)\bin\libs3.dll \
-            $(DESTDIR)\lib\libs3.a \
-            $(DESTDIR)\lib\libs3.def \
-            $(DESTDIR)\include\libs3.h \
-            $(DESTDIR)\LICENSE \
-            $(DESTDIR)\COPYING
-
-
-# --------------------------------------------------------------------------
-# Compile target patterns
-
-$(BUILD)/obj/%.o: src/%.c
-	$(QUIET_ECHO) $@: Compiling object
-	- @ mkdir $(subst /,\,$(dir $(BUILD)/dep/$<)) 2>&1 | echo >nul
-	@ gcc $(CFLAGS) -M -MG -MQ $@ -DCOMPILINGDEPENDENCIES \
-        -o $(BUILD)/dep/$(<:%.c=%.d) -c $<
-	- @ mkdir $(subst /,\,$(dir $@)) 2>&1 | echo >nul
-	$(VERBOSE_SHOW) gcc $(CFLAGS) -o $@ -c $<
-
-
-# --------------------------------------------------------------------------
-# libs3 library targets
-
-LIBS3_SHARED = $(BUILD)/bin/libs3.dll
-LIBS3_STATIC = $(BUILD)/lib/libs3.a
-
-.PHONY: libs3
-libs3: $(LIBS3_SHARED) $(BUILD)/lib/libs3.a
-
-LIBS3_SOURCES := src/acl.c src/bucket.c src/error_parser.c src/general.c \
-                 src/object.c src/request.c src/request_context.c \
-                 src/response_headers_handler.c src/service_access_logging.c \
-                 src/service.c src/simplexml.c src/util.c src/mingw_functions.c
-
-$(LIBS3_SHARED): $(LIBS3_SOURCES:src/%.c=$(BUILD)/obj/%.o)
-	$(QUIET_ECHO) $@: Building dynamic library
-	- @ mkdir $(subst /,\,$(dir $@)) 2>&1 | echo >nul
-	$(VERBOSE_SHOW) gcc -shared -o $@ $^ $(LDFLAGS) -lws2_32
-
-$(LIBS3_STATIC): $(LIBS3_SHARED)
-	$(QUIET_ECHO) $@: Building static library
-	- @ mkdir $(subst /,\,$(dir $@)) 2>&1 | echo >nul
-	$(VERBOSE_SHOW) dlltool --def mswin\libs3.def --dllname $(subst /,\,$<) \
-            --output-lib $(subst /,\,$@)
-
-
-# --------------------------------------------------------------------------
-# Driver program targets
-
-.PHONY: s3
-s3: $(BUILD)/bin/s3.exe
-
-$(BUILD)/bin/s3.exe: $(BUILD)/obj/s3.o $(BUILD)/obj/mingw_s3_functions.o \
-                     $(BUILD)/lib/libs3.a
-	$(QUIET_ECHO) $@: Building executable
-	- @ mkdir $(subst /,\,$(dir $@)) 2>&1 | echo >nul
-	$(VERBOSE_SHOW) gcc -o $@ $^ $(LDFLAGS) -lws2_32
-
-
-# --------------------------------------------------------------------------
-# libs3 header targets
-
-.PHONY: headers
-headers: $(BUILD)\include\libs3.h
-
-$(BUILD)\include\libs3.h: inc\libs3.h
-	$(QUIET_ECHO) $@: Copying header
-	- @ mkdir $(subst /,\,$(dir $@)) 2>&1 | echo >nul
-	$(VERBOSE_SHOW) copy $< $@
-
-
-# --------------------------------------------------------------------------
-# Test targets
-
-.PHONY: test
-test: $(BUILD)/bin/testsimplexml
-
-$(BUILD)/bin/testsimplexml: $(BUILD)/obj/testsimplexml.o \
-                            $(BUILD)/obj/simplexml.o
-	$(QUIET_ECHO) $@: Building executable
-	- @ mkdir $(subst /,\,$(dir $@)) 2>&1 | echo >nul
-	$(VERBOSE_SHOW) gcc -o $@ $^ $(LIBXML2_LIBS)
-
-
-# --------------------------------------------------------------------------
-# Clean target
-
-.PHONY: clean
-clean:
-	$(QUIET_ECHO) $(BUILD): Cleaning
-	$(VERBOSE_SHOW) mswin\rmrf.bat $(BUILD)
-
-
-# --------------------------------------------------------------------------
-# Clean dependencies target
-
-.PHONY: cleandeps
-cleandeps:
-	$(QUIET_ECHO) $(BUILD)/dep: Cleaning dependencies
-	$(VERBOSE_SHOW) mswin\rmrf.bat $(BUILD)\dep
-
-
-# --------------------------------------------------------------------------
-# Dependencies
-
-ALL_SOURCES := $(LIBS3_SOURCES) s3.c testsimplexml.c
-
-$(foreach i, $(ALL_SOURCES), $(eval -include $(BUILD)/dep/src/$(i:%.c=%.d)))
diff --git a/src/libs3/GNUmakefile.osx b/src/libs3/GNUmakefile.osx
deleted file mode 100644
index d4de562..0000000
--- a/src/libs3/GNUmakefile.osx
+++ /dev/null
@@ -1,305 +0,0 @@
-# GNUmakefile.osx
-# 
-# Copyright 2008 Bryan Ischo <bryan at ischo.com>
-# 
-# This file is part of libs3.
-# 
-# libs3 is free software: you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation, version 3 of the License.
-#
-# In addition, as a special exception, the copyright holders give
-# permission to link the code of this library and its programs with the
-# OpenSSL library, and distribute linked combinations including the two.
-#
-# libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License version 3
-# along with libs3, in a file named COPYING.  If not, see
-# <http://www.gnu.org/licenses/>.
-
-# I tried to use the autoconf/automake/autolocal/etc (i.e. autohell) tools
-# but I just couldn't stomach them.  Since this is a Makefile for POSIX
-# systems, I will simply do away with autohell completely and use a GNU
-# Makefile.  GNU make ought to be available pretty much everywhere, so I
-# don't see this being a significant issue for portability.
-
-# All commands assume a GNU compiler.  For systems which do not use a GNU
-# compiler, write scripts with the same names as these commands, and taking
-# the same arguments, and translate the arguments and commands into the
-# appropriate non-POSIX ones as needed.  libs3 assumes a GNU toolchain as
-# the most portable way to build software possible.  Non-POSIX, non-GNU
-# systems can do the work of supporting this build infrastructure.
-
-
-# --------------------------------------------------------------------------
-# Set libs3 version number, unless it is already set.
-# This is trunk0.trunk0 on the libs3 git master branch; release branches
-# are created with this set to specific version numbers when releases are
-# made.
-
-LIBS3_VER_MAJOR ?= trunk0
-LIBS3_VER_MINOR ?= trunk0
-LIBS3_VER := $(LIBS3_VER_MAJOR).$(LIBS3_VER_MINOR)
-
-
-# -----------------------------------------------------------------------------
-# Determine verbosity.  VERBOSE_SHOW should be prepended to every command which
-# should only be displayed if VERBOSE is set.  QUIET_ECHO may be used to
-# echo text only if VERBOSE is not set.  Typically, a VERBOSE_SHOW command will
-# be paired with a QUIET_ECHO command, to provide a command which is displayed
-# in VERBOSE mode, along with text which is displayed in non-VERBOSE mode to
-# describe the command.
-#
-# No matter what VERBOSE is defined to, it ends up as true if it's defined.
-# This will be weird if you defined VERBOSE=false in the environment, and we
-# switch it to true here; but the meaning of VERBOSE is, "if it's defined to
-# any value, then verbosity is turned on".  So don't define VERBOSE if you
-# don't want verbosity in the build process.
-# -----------------------------------------------------------------------------
-
-ifdef VERBOSE
-        VERBOSE = true
-        VERBOSE_ECHO = @ echo
-        VERBOSE_SHOW =
-        QUIET_ECHO = @ echo > /dev/null
-else
-        VERBOSE = false
-        VERBOSE_ECHO = @ echo > /dev/null
-        VERBOSE_SHOW = @
-        QUIET_ECHO = @ echo
-endif
-
-
-# --------------------------------------------------------------------------
-# BUILD directory
-ifndef BUILD
-    ifdef DEBUG
-        BUILD := build-debug
-    else
-        BUILD := build
-    endif
-endif
-
-
-# --------------------------------------------------------------------------
-# DESTDIR directory
-ifndef DESTDIR
-    DESTDIR := /usr
-endif
-
-
-# --------------------------------------------------------------------------
-# Acquire configuration information for libraries that libs3 depends upon
-
-ifndef CURL_LIBS
-    CURL_LIBS := $(shell curl-config --libs)
-endif
-
-ifndef CURL_CFLAGS
-    CURL_CFLAGS := $(shell curl-config --cflags)
-endif
-
-ifndef LIBXML2_LIBS
-    LIBXML2_LIBS := $(shell xml2-config --libs)
-endif
-
-ifndef LIBXML2_CFLAGS
-    LIBXML2_CFLAGS := $(shell xml2-config --cflags)
-endif
-
-
-# --------------------------------------------------------------------------
-# These CFLAGS assume a GNU compiler.  For other compilers, write a script
-# which converts these arguments into their equivalent for that particular
-# compiler.
-
-ifndef CFLAGS
-    ifdef DEBUG
-        CFLAGS := -g
-    else
-        CFLAGS := -O3
-    endif
-endif
-
-CFLAGS += -Wall -Werror -Wshadow -Wextra -Iinc \
-          $(CURL_CFLAGS) $(LIBXML2_CFLAGS) \
-          -DLIBS3_VER_MAJOR=\"$(LIBS3_VER_MAJOR)\" \
-          -DLIBS3_VER_MINOR=\"$(LIBS3_VER_MINOR)\" \
-          -DLIBS3_VER=\"$(LIBS3_VER)\" \
-          -D__STRICT_ANSI__ \
-          -D_ISOC99_SOURCE \
-          -fno-common
-
-LDFLAGS = $(CURL_LIBS) $(LIBXML2_LIBS) -lpthread
-
-
-# --------------------------------------------------------------------------
-# Default targets are everything
-
-.PHONY: all
-all: exported test
-
-
-# --------------------------------------------------------------------------
-# Exported targets are the library and driver program
-
-.PHONY: exported
-exported: libs3 s3 headers
-
-
-# --------------------------------------------------------------------------
-# Install target
-
-.PHONY: install
-install: exported
-	$(QUIET_ECHO) $(DESTDIR)/bin/s3: Installing executable
-	$(VERBOSE_SHOW) install -ps -m u+rwx,go+rx $(BUILD)/bin/s3 \
-                    $(DESTDIR)/bin/s3
-	$(QUIET_ECHO) \
-        $(DESTDIR)/lib/libs3.$(LIBS3_VER).dylib: Installing dynamic library
-	$(VERBOSE_SHOW) install -p -m u+rw,go+r \
-                    $(BUILD)/lib/libs3.$(LIBS3_VER_MAJOR).dylib \
-                    $(DESTDIR)/lib/libs3.$(LIBS3_VER).dylib
-	$(QUIET_ECHO) \
-        $(DESTDIR)/lib/libs3.$(LIBS3_VER_MAJOR).dylib: Linking dynamic library
-	$(VERBOSE_SHOW) ln -sf libs3.$(LIBS3_VER).dylib \
-                    $(DESTDIR)/lib/libs3.$(LIBS3_VER_MAJOR).dylib
-	$(QUIET_ECHO) $(DESTDIR)/lib/libs3.dylib: Linking dynamic library
-	$(VERBOSE_SHOW) ln -sf libs3.$(LIBS3_VER_MAJOR).dylib \
-                    $(DESTDIR)/lib/libs3.dylib
-	$(QUIET_ECHO) $(DESTDIR)/lib/libs3.a: Installing static library
-	$(VERBOSE_SHOW) install -p -m u+rw,go+r $(BUILD)/lib/libs3.a \
-                    $(DESTDIR)/lib/libs3.a
-	$(QUIET_ECHO) $(DESTDIR)/include/libs3.h: Installing header
-	$(VERBOSE_SHOW) install -p -m u+rw,go+r $(BUILD)/include/libs3.h \
-                    $(DESTDIR)/include/libs3.h
-
-
-# --------------------------------------------------------------------------
-# Uninstall target
-
-.PHONY: uninstall
-uninstall:
-	$(QUIET_ECHO) Installed files: Uninstalling
-	$(VERBOSE_SHOW) \
-        rm -f $(DESTDIR)/bin/s3 \
-              $(DESTDIR)/lib/libs3.dylib \
-              $(DESTDIR)/lib/libs3.$(LIBS3_VER_MAJOR).dylib \
-              $(DESTDIR)/lib/libs3.$(LIBS3_VER).dylib \
-              $(DESTDIR)/lib/libs3.a \
-              $(DESTDIR)/include/libs3.h
-
-
-# --------------------------------------------------------------------------
-# Compile target patterns
-
-$(BUILD)/obj/%.o: src/%.c
-	$(QUIET_ECHO) $@: Compiling object
-	@ mkdir -p $(dir $(BUILD)/dep/$<)
-	@ gcc $(CFLAGS) -M -MG -MQ $@ -DCOMPILINGDEPENDENCIES \
-        -o $(BUILD)/dep/$(<:%.c=%.d) -c $<
-	@ mkdir -p $(dir $@)
-	@(VERBOSE_SHOW) gcc $(CFLAGS) -o $@ -c $<
-
-$(BUILD)/obj/%.do: src/%.c
-	$(QUIET_ECHO) $@: Compiling dynamic object
-	@ mkdir -p $(dir $(BUILD)/dep/$<)
-	@ gcc $(CFLAGS) -M -MG -MQ $@ -DCOMPILINGDEPENDENCIES \
-        -o $(BUILD)/dep/$(<:%.c=%.dd) -c $<
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) gcc $(CFLAGS) -fpic -fPIC -o $@ -c $< 
-
-
-# --------------------------------------------------------------------------
-# libs3 library targets
-
-LIBS3_SHARED = $(BUILD)/lib/libs3.$(LIBS3_VER_MAJOR).dylib
-LIBS3_STATIC = $(BUILD)/lib/libs3.a
-
-.PHONY: libs3
-libs3: $(LIBS3_SHARED) $(LIBS3_SHARED_MAJOR) $(BUILD)/lib/libs3.a
-
-LIBS3_SOURCES := src/acl.c src/bucket.c src/error_parser.c src/general.c \
-                 src/object.c src/request.c src/request_context.c \
-                 src/response_headers_handler.c src/service_access_logging.c \
-                 src/service.c src/simplexml.c src/util.c
-
-$(LIBS3_SHARED): $(LIBS3_SOURCES:src/%.c=$(BUILD)/obj/%.do)
-	$(QUIET_ECHO) $@: Building shared library
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) gcc -dynamiclib -install_name \
-        libs3.$(LIBS3_VER_MAJOR).dylib \
-        -compatibility_version $(LIBS3_VER_MAJOR) \
-        -current_version $(LIBS3_VER) -o $@ $^ $(LDFLAGS)
-
-$(LIBS3_STATIC): $(LIBS3_SOURCES:src/%.c=$(BUILD)/obj/%.o)
-	$(QUIET_ECHO) $@: Building static library
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) $(AR) cr $@ $^
-
-
-# --------------------------------------------------------------------------
-# Driver program targets
-
-.PHONY: s3
-s3: $(BUILD)/bin/s3
-
-$(BUILD)/bin/s3: $(BUILD)/obj/s3.o $(LIBS3_SHARED)
-	$(QUIET_ECHO) $@: Building executable
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) gcc -o $@ $^ $(LDFLAGS)
-
-
-# --------------------------------------------------------------------------
-# libs3 header targets
-
-.PHONY: headers
-headers: $(BUILD)/include/libs3.h
-
-$(BUILD)/include/libs3.h: inc/libs3.h
-	$(QUIET_ECHO) $@: Linking header
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) ln -sf $(abspath $<) $@
-
-
-# --------------------------------------------------------------------------
-# Test targets
-
-.PHONY: test
-test: $(BUILD)/bin/testsimplexml
-
-$(BUILD)/bin/testsimplexml: $(BUILD)/obj/testsimplexml.o $(LIBS3_STATIC)
-	$(QUIET_ECHO) $@: Building executable
-	@ mkdir -p $(dir $@)
-	$(VERBOSE_SHOW) gcc -o $@ $^ $(LIBXML2_LIBS)
-
-
-# --------------------------------------------------------------------------
-# Clean target
-
-.PHONY: clean
-clean:
-	$(QUIET_ECHO) $(BUILD): Cleaning
-	$(VERBOSE_SHOW) rm -rf $(BUILD)
-
-
-# --------------------------------------------------------------------------
-# Clean dependencies target
-
-.PHONY: cleandeps
-cleandeps:
-	$(QUIET_ECHO) $(BUILD)/dep: Cleaning dependencies
-	$(VERBOSE_SHOW) rm -rf $(BUILD)/dep
-
-
-# --------------------------------------------------------------------------
-# Dependencies
-
-ALL_SOURCES := $(LIBS3_SOURCES) s3.c testsimplexml.c
-
-$(foreach i, $(ALL_SOURCES), $(eval -include $(BUILD)/dep/src/$(i:%.c=%.d)))
-$(foreach i, $(ALL_SOURCES), $(eval -include $(BUILD)/dep/src/$(i:%.c=%.dd)))
diff --git a/src/libs3/INSTALL b/src/libs3/INSTALL
deleted file mode 100644
index 54431fc..0000000
--- a/src/libs3/INSTALL
+++ /dev/null
@@ -1,73 +0,0 @@
-
-To install libs3 on a POSIX system (except Microsoft Windows):
---------------------------------------------------------------
-
-Note that all POSIX builds have prerequisites, such as development libraries
-that libs3 requires and that must be installed at the time that libs3 is
-built.  The easiest way to find out what those are, is to run the build
-command and then observe the results.
-
-*** For RPM-based systems (Fedora Core, Mandrake, etc) ***
-
-* rpmbuild -ta <libs3 archive>
-
-for example:
-
-rpmbuild -ta libs3-0.3.tar.gz
-
-
-*** For dpkg-based systems (Debian, Ubuntu, etc) ***
-
-* make deb
-
-This will produce a Debian package in the build/pkg directory.
-
-
-*** For all other systems ***
-
-* make [DESTDIR=destination root] install
-
-DESTDIR defaults to /usr
-
-
-To install libs3 on a Microsoft Windows system:
------------------------------------------------
-
-*** Using MingW ***
-
-* libs3 can be built on Windows using the MingW compiler.  No other tool
-  is needed.  However, the following libraries are needed to build libs3:
-
-  - curl development libraries
-  - libxml2 development libraries, and the libraries that it requires:
-    - iconv
-    - zlib
-
-  These projects are independent of libs3, and their release schedule and
-  means of distribution would make it very difficult to provide links to
-  the files to download and keep them up-to-date in this file, so no attempt
-  is made here.
-
-  Development libraries and other files can be placed in:
-  c:\libs3-libs\bin
-  c:\libs3-libs\include
-
-  If the above locations are used, then the GNUmakefile.mingw will work with
-  no special caveats.  If the above locations are not used, then the following
-  environment variables should be set:
-  CURL_LIBS should be set to the MingW compiler flags needed to locate and
-      link in the curl libraries
-  CURL_CFLAGS should be set to the MingW compiler flags needed to locate and
-      include the curl headers
-  LIBXML2_LIBS should be set to the MingW compiler flags needed to locate and
-      link in the libxml2 libraries
-  LIBXML2_CFLAGS should be set to the MingW compiler flags needed to locate and
-      include the libxml2 headers
-
-* mingw32-make [DESTDIR=destination] -f GNUmakefile.mingw install
-
-DESTDIR defaults to libs3-<version>
-
-* DESTDIR can be zipped up into a .zip file for distribution.  For best
-  results, the dependent libraries (curl, openssl, etc) should be included,
-  along with their licenses.
diff --git a/src/libs3/LICENSE b/src/libs3/LICENSE
deleted file mode 100644
index db90987..0000000
--- a/src/libs3/LICENSE
+++ /dev/null
@@ -1,20 +0,0 @@
-Copyright 2008 Bryan Ischo <bryan at ischo.com>
-
-libs3 is free software: you can redistribute it and/or modify it under the
-terms of the GNU General Public License as published by the Free Software
-Foundation, version 3 of the License.
-
-In addition, as a special exception, the copyright holders give
-permission to link the code of this library and its programs with the
-OpenSSL library, and distribute linked combinations including the two.
-
-libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
-details.
-
-You should have received a copy of the GNU General Public License version 3
-along with libs3, in a file named COPYING.  If not, see
-<http://www.gnu.org/licenses/>.
-
-
diff --git a/src/libs3/README b/src/libs3/README
deleted file mode 100644
index c881a77..0000000
--- a/src/libs3/README
+++ /dev/null
@@ -1,4 +0,0 @@
-This directory contains the libs3 library.
-
-The libs3 library is free software.  See the file LICENSE for copying
-permission.
diff --git a/src/libs3/TODO b/src/libs3/TODO
deleted file mode 100644
index d8821f4..0000000
--- a/src/libs3/TODO
+++ /dev/null
@@ -1,3 +0,0 @@
-* Implement functions for generating form stuff for posting to s3
-
-* Write s3 man page
diff --git a/src/libs3/archlinux/PKGBUILD b/src/libs3/archlinux/PKGBUILD
deleted file mode 100644
index 9256dee..0000000
--- a/src/libs3/archlinux/PKGBUILD
+++ /dev/null
@@ -1,28 +0,0 @@
-# Contributor: Bryan Ischo <bryan at ischo.com>
-pkgname=libs3
-pkgver=trunk
-pkgrel=1
-pkgdesc="C Library and Tools for Amazon S3 Access"
-arch=('i686' 'x86_64')
-url="http://libs3.ischo.com/index.html"
-license=('GPL')
-groups=()
-depends=('libxml2' 'openssl' 'curl')
-makedepends=('make' 'libxml2' 'openssl' 'curl')
-provides=()
-conflicts=()
-replaces=()
-backup=()
-options=()
-install=
-source=(http://libs3.ischo.com/$pkgname-$pkgver.tar.gz)
-noextract=()
-md5sums=('source md5') #generate with 'makepkg -g'
-
-build() {
-  cd "$srcdir/$pkgname-$pkgver"
-
-  DESTDIR=$pkgdir/usr make install || return 1
-}
-
-# vim:set ts=2 sw=2 et:
diff --git a/src/libs3/doxyfile b/src/libs3/doxyfile
deleted file mode 100644
index 0c7aedc..0000000
--- a/src/libs3/doxyfile
+++ /dev/null
@@ -1,886 +0,0 @@
-# Doxyfile 1.2.14
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project
-#
-# All text after a hash (#) is considered a comment and will be ignored
-# The format is:
-#       TAG = value [value, ...]
-# For lists items can also be appended using:
-#       TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ")
-
-#---------------------------------------------------------------------------
-# General configuration options
-#---------------------------------------------------------------------------
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
-# by quotes) that should identify the project.
-
-PROJECT_NAME           = libs3
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
-# This could be handy for archiving the generated documentation or 
-# if some version control system is used.
-
-PROJECT_NUMBER         = trunk
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
-# base path where the generated documentation will be put. 
-# If a relative path is entered, it will be relative to the location 
-# where doxygen was started. If left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = dox
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
-# documentation generated by doxygen is written. Doxygen will use this 
-# information to generate all constant output in the proper language. 
-# The default language is English, other supported languages are: 
-# Brazilian, Chinese, Croatian, Czech, Danish, Dutch, Finnish, French, 
-# German, Greek, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, 
-# Portuguese, Romanian, Russian, Slovak, Slovene, Spanish and Swedish.
-
-OUTPUT_LANGUAGE        = English
-
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
-# documentation are documented, even if no documentation was available. 
-# Private class members and static file members will be hidden unless 
-# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
-
-EXTRACT_ALL            = YES
-
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
-# will be included in the documentation.
-
-EXTRACT_PRIVATE        = YES
-
-# If the EXTRACT_STATIC tag is set to YES all static members of a file 
-# will be included in the documentation.
-
-EXTRACT_STATIC         = YES
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
-# defined locally in source files will be included in the documentation. 
-# If set to NO only classes defined in header files are included.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
-# undocumented members of documented classes, files or namespaces. 
-# If set to NO (the default) these members will be included in the 
-# various overviews, but no documentation section is generated. 
-# This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
-# undocumented classes that are normally visible in the class hierarchy. 
-# If set to NO (the default) these class will be included in the various 
-# overviews. This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
-# include brief member descriptions after the members that are listed in 
-# the file and class documentation (similar to JavaDoc). 
-# Set to NO to disable this.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
-# the brief description of a member or function before the detailed description. 
-# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
-# brief descriptions will be completely suppressed.
-
-REPEAT_BRIEF           = YES
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
-# Doxygen will generate a detailed section even if there is only a brief 
-# description.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all inherited 
-# members of a class in the documentation of that class as if those members were 
-# ordinary class members. Constructors, destructors and assignment operators of 
-# the base classes will not be shown.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
-# path before files name in the file list and in the header files. If set 
-# to NO the shortest path that makes the file name unique will be used.
-
-FULL_PATH_NAMES        = NO
-
-# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
-# can be used to strip a user defined part of the path. Stripping is 
-# only done if one of the specified strings matches the left-hand part of 
-# the path. It is allowed to use relative paths in the argument list.
-
-STRIP_FROM_PATH        = 
-
-# The INTERNAL_DOCS tag determines if documentation 
-# that is typed after a \internal command is included. If the tag is set 
-# to NO (the default) then the documentation will be excluded. 
-# Set it to YES to include the internal documentation.
-
-INTERNAL_DOCS          = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
-# doxygen to hide any special comment blocks from generated source code 
-# fragments. Normal C and C++ comments will always remain visible.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
-# file names in lower case letters. If set to YES upper case letters are also 
-# allowed. This is useful if you have classes or files whose names only differ 
-# in case and if your file system supports case sensitive file names. Windows 
-# users are adviced to set this option to NO.
-
-CASE_SENSE_NAMES       = YES
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
-# (but less readable) file names. This can be useful is your file systems 
-# doesn't support long names like on DOS, Mac, or CD-ROM.
-
-SHORT_NAMES            = NO
-
-# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
-# will show members with their full class and namespace scopes in the 
-# documentation. If set to YES the scope will be hidden.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
-# will generate a verbatim copy of the header file for each class for 
-# which an include is specified. Set to NO to disable this.
-
-VERBATIM_HEADERS       = YES
-
-# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
-# will put list of the files that are included by a file in the documentation 
-# of that file.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
-# will interpret the first line (until the first dot) of a JavaDoc-style 
-# comment as the brief description. If set to NO, the JavaDoc 
-# comments  will behave just like the Qt-style comments (thus requiring an 
-# explict @brief command for a brief description.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
-# member inherits the documentation from any documented member that it 
-# reimplements.
-
-INHERIT_DOCS           = YES
-
-# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
-# is inserted in the documentation for inline members.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
-# will sort the (detailed) documentation of file and class members 
-# alphabetically by member name. If set to NO the members will appear in 
-# declaration order.
-
-SORT_MEMBER_DOCS       = NO
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
-# tag is set to YES, then doxygen will reuse the documentation of the first 
-# member in the group (if any) for the other members of the group. By default 
-# all members of a group must be documented explicitly.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
-# Doxygen uses this value to replace tabs by spaces in code fragments.
-
-TAB_SIZE               = 8
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or 
-# disable (NO) the todo list. This list is created by putting \todo 
-# commands in the documentation.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or 
-# disable (NO) the test list. This list is created by putting \test 
-# commands in the documentation.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or 
-# disable (NO) the bug list. This list is created by putting \bug 
-# commands in the documentation.
-
-GENERATE_BUGLIST       = YES
-
-# This tag can be used to specify a number of aliases that acts 
-# as commands in the documentation. An alias has the form "name=value". 
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
-# put the command \sideeffect (or @sideeffect) in the documentation, which 
-# will result in a user defined paragraph with heading "Side Effects:". 
-# You can put \n's in the value part of an alias to insert newlines.
-
-ALIASES                = 
-
-# The ENABLED_SECTIONS tag can be used to enable conditional 
-# documentation sections, marked by \if sectionname ... \endif.
-
-ENABLED_SECTIONS       = 
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
-# the initial value of a variable or define consist of for it to appear in 
-# the documentation. If the initializer consists of more lines than specified 
-# here it will be hidden. Use a value of 0 to hide initializers completely. 
-# The appearance of the initializer of individual variables and defines in the 
-# documentation can be controlled using \showinitializer or \hideinitializer 
-# command in the documentation regardless of this setting.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources 
-# only. Doxygen will then generate output that is more tailored for C. 
-# For instance some of the names that are used will be different. The list 
-# of all members will be omitted, etc.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
-# at the bottom of the documentation of classes and structs. If set to YES the 
-# list will mention the files that were used to generate the documentation.
-
-SHOW_USED_FILES        = YES
-
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated 
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are 
-# generated by doxygen. Possible values are YES and NO. If left blank 
-# NO is used.
-
-WARNINGS               = YES
-
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
-# automatically be disabled.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# The WARN_FORMAT tag determines the format of the warning messages that 
-# doxygen can produce. The string should contain the $file, $line, and $text 
-# tags, which will be replaced by the file and line number from which the 
-# warning originated and the warning text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning 
-# and error messages should be written. If left blank the output is written 
-# to stderr.
-
-WARN_LOGFILE           = 
-
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag can be used to specify the files and/or directories that contain 
-# documented source files. You may enter file names like "myfile.cpp" or 
-# directories like "/usr/src/myproject". Separate the files or directories 
-# with spaces.
-
-INPUT                  = inc/libs3.h
-
-# If the value of the INPUT tag contains directories, you can use the 
-# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
-# and *.h) to filter out the source-files in the directories. If left 
-# blank the following patterns are tested: 
-# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx *.hpp 
-# *.h++ *.idl *.odl
-
-FILE_PATTERNS          =
-
-# The RECURSIVE tag can be used to turn specify whether or not subdirectories 
-# should be searched for input files as well. Possible values are YES and NO. 
-# If left blank NO is used.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should 
-# excluded from the INPUT source files. This way you can easily exclude a 
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-
-EXCLUDE                =
-
-# The EXCLUDE_SYMLINKS tag can be used select whether or not files or directories 
-# that are symbolic links (a Unix filesystem feature) are excluded from the input.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the 
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
-# certain files from those directories.
-
-EXCLUDE_PATTERNS       = 
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or 
-# directories that contain example code fragments that are included (see 
-# the \include command).
-
-EXAMPLE_PATH           = 
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
-# and *.h) to filter out the source-files in the directories. If left 
-# blank all files are included.
-
-EXAMPLE_PATTERNS       = 
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
-# searched for input files to be used with the \include or \dontinclude 
-# commands irrespective of the value of the RECURSIVE tag. 
-# Possible values are YES and NO. If left blank NO is used.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or 
-# directories that contain image that are included in the documentation (see 
-# the \image command).
-
-IMAGE_PATH             = 
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should 
-# invoke to filter for each input file. Doxygen will invoke the filter program 
-# by executing (via popen()) the command <filter> <input-file>, where <filter> 
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
-# input file. Doxygen will then use the output that the filter program writes 
-# to standard output.
-
-INPUT_FILTER           = 
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
-# INPUT_FILTER) will be used to filter the input files when producing source 
-# files to browse.
-
-FILTER_SOURCE_FILES    = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
-# be generated. Documented entities will be cross-referenced with these sources.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body 
-# of functions and classes directly in the documentation.
-
-INLINE_SOURCES         = NO
-
-# If the REFERENCED_BY_RELATION tag is set to YES (the default) 
-# then for each documented function all documented 
-# functions referencing it will be listed.
-
-REFERENCED_BY_RELATION = YES
-
-# If the REFERENCES_RELATION tag is set to YES (the default) 
-# then for each documented function all documented entities 
-# called/used by that function will be listed.
-
-REFERENCES_RELATION    = YES
-
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
-# of all compounds will be generated. Enable this if the project 
-# contains a lot of classes, structs, unions or interfaces.
-
-ALPHABETICAL_INDEX     = NO
-
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all 
-# classes will be put under the same header in the alphabetical index. 
-# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
-# should be ignored while generating the index headers.
-
-IGNORE_PREFIX          = 
-
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
-# generate HTML output.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `html' will be used as the default path.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
-# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
-# doxygen will generate files with .html extension.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a personal HTML header for 
-# each generated HTML page. If it is left blank doxygen will generate a 
-# standard header.
-
-HTML_HEADER            = 
-
-# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
-# each generated HTML page. If it is left blank doxygen will generate a 
-# standard footer.
-
-HTML_FOOTER            = 
-
-# The HTML_STYLESHEET tag can be used to specify a user defined cascading 
-# style sheet that is used by each HTML page. It can be used to 
-# fine-tune the look of the HTML output. If the tag is left blank doxygen 
-# will generate a default style sheet
-
-HTML_STYLESHEET        = 
-
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
-# files or namespaces will be aligned in HTML using tables. If set to 
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS     = YES
-
-# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
-# will be generated that can be used as input for tools like the 
-# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) 
-# of the generated HTML documentation.
-
-GENERATE_HTMLHELP      = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
-# controls if a separate .chi index file is generated (YES) or that 
-# it should be included in the master .chm file (NO).
-
-GENERATE_CHI           = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
-# controls whether a binary table of contents is generated (YES) or a 
-# normal table of contents (NO) in the .chm file.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members 
-# to the contents of the Html help documentation and to the tree view.
-
-TOC_EXPAND             = NO
-
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
-# top of each HTML page. The value NO (the default) enables the index and 
-# the value YES disables it.
-
-DISABLE_INDEX          = NO
-
-# This tag can be used to set the number of enum values (range [1..20]) 
-# that doxygen will group on one line in the generated HTML documentation.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
-# generated containing a tree-like index structure (just like the one that 
-# is generated for HTML Help). For this to work a browser that supports 
-# JavaScript and frames is required (for instance Mozilla, Netscape 4.0+, 
-# or Internet explorer 4.0+). Note that for large projects the tree generation 
-# can take a very long time. In such cases it is better to disable this feature. 
-# Windows users are probably better off using the HTML help feature.
-
-GENERATE_TREEVIEW      = YES
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
-# used to set the initial width (in pixels) of the frame in which the tree 
-# is shown.
-
-TREEVIEW_WIDTH         = 250
-
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
-# generate Latex output.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `latex' will be used as the default path.
-
-LATEX_OUTPUT           = latex
-
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
-# LaTeX documents. This may be useful for small projects and may help to 
-# save some trees in general.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used 
-# by the printer. Possible values are: a4, a4wide, letter, legal and 
-# executive. If left blank a4wide will be used.
-
-PAPER_TYPE             = a4wide
-
-# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX 
-# packages that should be included in the LaTeX output.
-
-EXTRA_PACKAGES         = 
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
-# the generated latex document. The header should contain everything until 
-# the first chapter. If it is left blank doxygen will generate a 
-# standard header. Notice: only use this tag if you know what you are doing!
-
-LATEX_HEADER           = 
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
-# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
-# contain links (just like the HTML output) instead of page references 
-# This makes the output suitable for online browsing using a pdf viewer.
-
-PDF_HYPERLINKS         = NO
-
-# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
-# plain latex in the generated Makefile. Set this option to YES to get a 
-# higher quality PDF documentation.
-
-USE_PDFLATEX           = NO
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. 
-# command to the generated LaTeX files. This will instruct LaTeX to keep 
-# running if errors occur, instead of asking the user for help. 
-# This option is also used when generating formulas in HTML.
-
-LATEX_BATCHMODE        = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output 
-# The RTF output is optimised for Word 97 and may not look very pretty with 
-# other RTF readers or editors.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `rtf' will be used as the default path.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
-# RTF documents. This may be useful for small projects and may help to 
-# save some trees in general.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
-# will contain hyperlink fields. The RTF file will 
-# contain links (just like the HTML output) instead of page references. 
-# This makes the output suitable for online browsing using WORD or other 
-# programs which support those fields. 
-# Note: wordpad (write) and others do not support links.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's 
-# config file, i.e. a series of assigments. You only have to provide 
-# replacements, missing definitions are set to their default value.
-
-RTF_STYLESHEET_FILE    = 
-
-# Set optional variables used in the generation of an rtf document. 
-# Syntax is similar to doxygen's config file.
-
-RTF_EXTENSIONS_FILE    = 
-
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
-# generate man pages
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `man' will be used as the default path.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to 
-# the generated man pages (default is the subroutine's section .3)
-
-MAN_EXTENSION          = .3
-
-# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
-# then it will generate one additional man file for each entity 
-# documented in the real man page(s). These additional files 
-# only source the real man page, but without them the man command 
-# would be unable to find the correct page. The default is NO.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES Doxygen will 
-# generate an XML file that captures the structure of 
-# the code including all documentation. Note that this 
-# feature is still experimental and incomplete at the 
-# moment.
-
-GENERATE_XML           = NO
-
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
-# generate an AutoGen Definitions (see autogen.sf.net) file 
-# that captures the structure of the code including all 
-# documentation. Note that this feature is still experimental 
-# and incomplete at the moment.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor   
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
-# evaluate all C-preprocessor directives found in the sources and include 
-# files.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
-# names in the source code. If set to NO (the default) only conditional 
-# compilation will be performed. Macro expansion can be done in a controlled 
-# way by setting EXPAND_ONLY_PREDEF to YES.
-
-MACRO_EXPANSION        = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
-# then the macro expansion is limited to the macros specified with the 
-# PREDEFINED and EXPAND_AS_PREDEFINED tags.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files 
-# in the INCLUDE_PATH (see below) will be search if a #include is found.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that 
-# contain include files that are not input files but should be processed by 
-# the preprocessor.
-
-INCLUDE_PATH           = 
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
-# patterns (like *.h and *.hpp) to filter out the header-files in the 
-# directories. If left blank, the patterns specified with FILE_PATTERNS will 
-# be used.
-
-INCLUDE_FILE_PATTERNS  = 
-
-# The PREDEFINED tag can be used to specify one or more macro names that 
-# are defined before the preprocessor is started (similar to the -D option of 
-# gcc). The argument of the tag is a list of macros of the form: name 
-# or name=definition (no spaces). If the definition and the = are 
-# omitted =1 is assumed.
-
-PREDEFINED             =  DOXYGEN
-
-# If the MACRO_EXPANSION and EXPAND_PREDEF_ONLY tags are set to YES then 
-# this tag can be used to specify a list of macro names that should be expanded. 
-# The macro definition that is found in the sources will be used. 
-# Use the PREDEFINED tag if you want to use a different macro definition.
-
-EXPAND_AS_DEFINED      = 
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
-# doxygen's preprocessor will remove all function-like macros that are alone 
-# on a line and do not end with a semicolon. Such function macros are typically 
-# used for boiler-plate code, and will confuse the parser if not removed.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration::addtions related to external references   
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tagfiles.
-
-TAGFILES               = 
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
-# a tag file that is based on the input files it reads.
-
-GENERATE_TAGFILE       = 
-
-# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
-# in the class index. If set to NO only the inherited external classes 
-# will be listed.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
-# in the modules index. If set to NO, only the current project's groups will 
-# be listed.
-
-EXTERNAL_GROUPS        = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script 
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool   
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
-# generate a inheritance diagram (in Html, RTF and LaTeX) for classes with base or 
-# super classes. Setting the tag to NO turns the diagrams off. Note that this 
-# option is superceded by the HAVE_DOT option below. This is only a fallback. It is 
-# recommended to install and use dot, since it yield more powerful graphs.
-
-CLASS_DIAGRAMS         = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
-# available from the path. This tool is part of Graphviz, a graph visualization 
-# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
-# have no effect if this option is set to NO (the default)
-
-HAVE_DOT               = NO
-
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
-# will generate a graph for each documented class showing the direct and 
-# indirect inheritance relations. Setting this tag to YES will force the 
-# the CLASS_DIAGRAMS tag to NO.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
-# will generate a graph for each documented class showing the direct and 
-# indirect implementation dependencies (inheritance, containment, and 
-# class references variables) of the class with other documented classes.
-
-COLLABORATION_GRAPH    = YES
-
-# If set to YES, the inheritance and collaboration graphs will show the 
-# relations between templates and their instances.
-
-TEMPLATE_RELATIONS     = YES
-
-# If set to YES, the inheritance and collaboration graphs will hide 
-# inheritance and usage relations if the target is undocumented 
-# or is not a class.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
-# tags are set to YES then doxygen will generate a graph for each documented 
-# file showing the direct and indirect include dependencies of the file with 
-# other documented files.
-
-INCLUDE_GRAPH          = YES
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
-# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
-# documented header file showing the documented files that directly or 
-# indirectly include this file.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
-# will graphical hierarchy of all classes instead of a textual one.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
-# generated by dot. Possible values are gif, jpg, and png
-# If left blank gif will be used.
-
-DOT_IMAGE_FORMAT       = gif
-
-# The tag DOT_PATH can be used to specify the path where the dot tool can be 
-# found. If left blank, it is assumed the dot tool can be found on the path.
-
-DOT_PATH               = 
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that 
-# contain dot files that are included in the documentation (see the 
-# \dotfile command).
-
-DOTFILE_DIRS           = 
-
-# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width 
-# (in pixels) of the graphs generated by dot. If a graph becomes larger than 
-# this value, doxygen will try to truncate the graph, so that it fits within 
-# the specified constraint. Beware that most browsers cannot cope with very 
-# large images.
-
-MAX_DOT_GRAPH_WIDTH    = 1024
-
-# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allows height 
-# (in pixels) of the graphs generated by dot. If a graph becomes larger than 
-# this value, doxygen will try to truncate the graph, so that it fits within 
-# the specified constraint. Beware that most browsers cannot cope with very 
-# large images.
-
-MAX_DOT_GRAPH_HEIGHT   = 1024
-
-# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
-# generate a legend page explaining the meaning of the various boxes and 
-# arrows in the dot generated graphs.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
-# remove the intermedate dot files that are used to generate 
-# the various graphs.
-
-DOT_CLEANUP            = YES
-
-#---------------------------------------------------------------------------
-# Configuration::addtions related to the search engine   
-#---------------------------------------------------------------------------
-
-# The SEARCHENGINE tag specifies whether or not a search engine should be 
-# used. If set to NO the values of all tags below this one will be ignored.
-
-SEARCHENGINE           = NO
diff --git a/src/libs3/inc/error_parser.h b/src/libs3/inc/error_parser.h
deleted file mode 100644
index 8785201..0000000
--- a/src/libs3/inc/error_parser.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/** **************************************************************************
- * error_parser.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef ERROR_PARSER_H
-#define ERROR_PARSER_H
-
-#include "libs3.h"
-#include "simplexml.h"
-#include "string_buffer.h"
-
-
-#define EXTRA_DETAILS_SIZE 8
-
-typedef struct ErrorParser
-{
-    // This is the S3ErrorDetails that this ErrorParser fills in from the
-    // data that it parses
-    S3ErrorDetails s3ErrorDetails;
-
-    // This is the error XML parser
-    SimpleXml errorXmlParser;
-
-    // Set to 1 after the first call to add
-    int errorXmlParserInitialized;
-
-    // Used to buffer the S3 Error Code as it is read in
-    string_buffer(code, 1024);
-
-    // Used to buffer the S3 Error Message as it is read in
-    string_buffer(message, 1024);
-
-    // Used to buffer the S3 Error Resource as it is read in
-    string_buffer(resource, 1024);
-
-    // Used to buffer the S3 Error Further Details as it is read in
-    string_buffer(furtherDetails, 1024);
-    
-    // The extra details; we support up to EXTRA_DETAILS_SIZE of them
-    S3NameValue extraDetails[EXTRA_DETAILS_SIZE];
-
-    // This is the buffer from which the names and values used in extraDetails
-    // are allocated
-    string_multibuffer(extraDetailsNamesValues, EXTRA_DETAILS_SIZE * 1024);
-} ErrorParser;
-
-
-// Always call this
-void error_parser_initialize(ErrorParser *errorParser);
-
-S3Status error_parser_add(ErrorParser *errorParser, char *buffer,
-                          int bufferSize);
-
-void error_parser_convert_status(ErrorParser *errorParser, S3Status *status);
-
-// Always call this
-void error_parser_deinitialize(ErrorParser *errorParser);
-
-
-#endif /* ERROR_PARSER_H */
diff --git a/src/libs3/inc/libs3.h b/src/libs3/inc/libs3.h
deleted file mode 100644
index baa200c..0000000
--- a/src/libs3/inc/libs3.h
+++ /dev/null
@@ -1,1892 +0,0 @@
-/** **************************************************************************
- * libs3.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef LIBS3_H
-#define LIBS3_H
-
-#include <stdint.h>
-#include <sys/select.h>
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/** **************************************************************************
- * Overview
- * --------
- *
- * This library provides an API for using Amazon's S3 service (see
- * http://s3.amazonaws.com).  Its design goals are:
- *
- * - To provide a simple and straightforward API for accessing all of S3's
- *   functionality
- * - To not require the developer using libs3 to need to know anything about:
- *     - HTTP
- *     - XML
- *     - SSL
- *   In other words, this API is meant to stand on its own, without requiring
- *   any implicit knowledge of how S3 services are accessed using HTTP
- *   protocols.
- * - To be usable from multithreaded code
- * - To be usable by code which wants to process multiple S3 requests
- *   simultaneously from a single thread
- * - To be usable in the simple, straightforward way using sequentialized
- *   blocking requests
- *
- * The general usage pattern of libs3 is:
- *
- * - Initialize libs3 once per program by calling S3_initialize() at program
- *   start up time
- * - Make any number of requests to S3 for getting, putting, or listing
- *   S3 buckets or objects, or modifying the ACLs associated with buckets
- *   or objects, using one of three general approaches:
- *   1. Simple blocking requests, one at a time
- *   2. Multiple threads each making simple blocking requests
- *   3. From a single thread, managing multiple S3 requests simultaneously
- *      using file descriptors and a select()/poll() loop
- * - Shut down libs3 at program exit time by calling S3_deinitialize()
- *
- * All functions which send requests to S3 return their results via a set of
- * callback functions which must be supplied to libs3 at the time that the
- * request is initiated.  libs3 will call these functions back in the thread
- * calling the libs3 function if blocking requests are made (i.e., if the
- * S3RequestContext for the function invocation is passed in as NULL).
- * If an S3RequestContext is used to drive multiple S3 requests
- * simultaneously, then the callbacks will be made from the thread which
- * calls S3_runall_request_context() or S3_runonce_request_context(), or
- * possibly from the thread which calls S3_destroy_request_context(), if
- * S3 requests are in progress at the time that this function is called.
- *
- * NOTE: Response headers from Amazon S3 are limited to 4K (2K of metas is all
- * that Amazon supports, and libs3 allows Amazon an additional 2K of headers).
- *
- * NOTE: Because HTTP and the S3 REST protocol are highly under-specified,
- * libs3 must make some assumptions about the maximum length of certain HTTP
- * elements (such as headers) that it will accept.  While efforts have been
- * made to enforce maximums which are beyond that expected to be needed by any
- * user of S3, it is always possible that these maximums may be too low in
- * some rare circumstances.  Bug reports should this unlikely situation occur
- * would be most appreciated.
- * 
- * Threading Rules
- * ---------------
- * 
- * 1. All arguments passed to any function must not be modified directly until
- *    the function returns.
- * 2. All S3RequestContext and S3Request arguments passed to all functions may
- *    not be passed to any other libs3 function by any other thread until the
- *    function returns.
- * 3. All functions may be called simultaneously by multiple threads as long
- *    as (1) and (2) are observed, EXCEPT for S3_initialize(), which must be
- *    called from one thread at a time only.
- * 4. All callbacks will be made in the thread of the caller of the function
- *    which invoked them, so the caller of all libs3 functions should not hold
- *    locks that it would try to re-acquire in a callback, as this may
- *    deadlock.
- ************************************************************************** **/
-
-
-/** **************************************************************************
- * Constants
- ************************************************************************** **/
-
-/**
- * S3_MAX_HOSTNAME_SIZE is the maximum size we allow for a host name
- **/
-#define S3_MAX_HOSTNAME_SIZE               255
-
-/**
- * This is the default hostname that is being used for the S3 requests
- **/
-#define S3_DEFAULT_HOSTNAME                "s3.amazonaws.com"
-
-
-/**
- * S3_MAX_BUCKET_NAME_SIZE is the maximum size of a bucket name.
- **/
-
-#define S3_MAX_BUCKET_NAME_SIZE            255
-
-/**
- * S3_MAX_KEY_SIZE is the maximum size of keys that Amazon S3 supports.
- **/
-#define S3_MAX_KEY_SIZE                    1024
-
-
-/**
- * S3_MAX_METADATA_SIZE is the maximum number of bytes allowed for
- * x-amz-meta header names and values in any request passed to Amazon S3
- **/
-#define S3_MAX_METADATA_SIZE               2048
-
-
-/**
- * S3_METADATA_HEADER_NAME_PREFIX is the prefix of an S3 "meta header"
- **/
-#define S3_METADATA_HEADER_NAME_PREFIX     "x-amz-meta-"
-
-
-/**
- * S3_MAX_METADATA_COUNT is the maximum number of x-amz-meta- headers that
- * could be included in a request to S3.  The smallest meta header is
- * "x-amz-meta-n: v".  Since S3 doesn't count the ": " against the total, the
- * smallest amount of data to count for a header would be the length of
- * "x-amz-meta-nv".
- **/
-#define S3_MAX_METADATA_COUNT \
-    (S3_MAX_METADATA_SIZE / (sizeof(S3_METADATA_HEADER_NAME_PREFIX "nv") - 1))
-
-
-/**
- * S3_MAX_ACL_GRANT_COUNT is the maximum number of ACL grants that may be
- * set on a bucket or object at one time.  It is also the maximum number of
- * ACL grants that the XML ACL parsing routine will parse.
- **/
-#define S3_MAX_ACL_GRANT_COUNT             100
-
-
-/**
- * This is the maximum number of characters (including terminating \0) that
- * libs3 supports in an ACL grantee email address.
- **/
-#define S3_MAX_GRANTEE_EMAIL_ADDRESS_SIZE  128
-
-
-/**
- * This is the maximum number of characters (including terminating \0) that
- * libs3 supports in an ACL grantee user id.
- **/
-#define S3_MAX_GRANTEE_USER_ID_SIZE        128
-
-
-/**
- * This is the maximum number of characters (including terminating \0) that
- * libs3 supports in an ACL grantee user display name.
- **/
-#define S3_MAX_GRANTEE_DISPLAY_NAME_SIZE   128
-
-
-/**
- * This is the maximum number of characters that will be stored in the
- * return buffer for the utility function which computes an HTTP authenticated
- * query string
- **/
-#define S3_MAX_AUTHENTICATED_QUERY_STRING_SIZE \
-    (sizeof("https:///") + S3_MAX_HOSTNAME_SIZE + (S3_MAX_KEY_SIZE * 3) + \
-     sizeof("?AWSAccessKeyId=") + 32 + sizeof("&Expires=") + 32 + \
-     sizeof("&Signature=") + 28 + 1)
-
-
-/**
- * This constant is used by the S3_initialize() function, to specify that
- * the winsock library should be initialized by libs3; only relevent on 
- * Microsoft Windows platforms.
- **/
-#define S3_INIT_WINSOCK                    1
-
-
-/**
- * This convenience constant is used by the S3_initialize() function to
- * indicate that all libraries required by libs3 should be initialized.
- **/
-#define S3_INIT_ALL                        (S3_INIT_WINSOCK)
-
-
-/** **************************************************************************
- * Enumerations
- ************************************************************************** **/
-
-/**
- * S3Status is a status code as returned by a libs3 function.  The meaning of
- * each status code is defined in the comments for each function which returns
- * that status.
- **/
-typedef enum
-{
-    S3StatusOK                                              ,
-
-    /**
-     * Errors that prevent the S3 request from being issued or response from
-     * being read
-     **/
-    S3StatusInternalError                                   ,
-    S3StatusOutOfMemory                                     ,
-    S3StatusInterrupted                                     ,
-    S3StatusInvalidBucketNameTooLong                        ,
-    S3StatusInvalidBucketNameFirstCharacter                 ,
-    S3StatusInvalidBucketNameCharacter                      ,
-    S3StatusInvalidBucketNameCharacterSequence              ,
-    S3StatusInvalidBucketNameTooShort                       ,
-    S3StatusInvalidBucketNameDotQuadNotation                ,
-    S3StatusQueryParamsTooLong                              ,
-    S3StatusFailedToInitializeRequest                       ,
-    S3StatusMetaDataHeadersTooLong                          ,
-    S3StatusBadMetaData                                     ,
-    S3StatusBadContentType                                  ,
-    S3StatusContentTypeTooLong                              ,
-    S3StatusBadMD5                                          ,
-    S3StatusMD5TooLong                                      ,
-    S3StatusBadCacheControl                                 ,
-    S3StatusCacheControlTooLong                             ,
-    S3StatusBadContentDispositionFilename                   ,
-    S3StatusContentDispositionFilenameTooLong               ,
-    S3StatusBadContentEncoding                              ,
-    S3StatusContentEncodingTooLong                          ,
-    S3StatusBadIfMatchETag                                  ,
-    S3StatusIfMatchETagTooLong                              ,
-    S3StatusBadIfNotMatchETag                               ,
-    S3StatusIfNotMatchETagTooLong                           ,
-    S3StatusHeadersTooLong                                  ,
-    S3StatusKeyTooLong                                      ,
-    S3StatusUriTooLong                                      ,
-    S3StatusXmlParseFailure                                 ,
-    S3StatusEmailAddressTooLong                             ,
-    S3StatusUserIdTooLong                                   ,
-    S3StatusUserDisplayNameTooLong                          ,
-    S3StatusGroupUriTooLong                                 ,
-    S3StatusPermissionTooLong                               ,
-    S3StatusTargetBucketTooLong                             ,
-    S3StatusTargetPrefixTooLong                             ,
-    S3StatusTooManyGrants                                   ,
-    S3StatusBadGrantee                                      ,
-    S3StatusBadPermission                                   ,
-    S3StatusXmlDocumentTooLarge                             ,
-    S3StatusNameLookupError                                 ,
-    S3StatusFailedToConnect                                 ,
-    S3StatusServerFailedVerification                        ,
-    S3StatusConnectionFailed                                ,
-    S3StatusAbortedByCallback                               ,
-    
-    /**
-     * Errors from the S3 service
-     **/
-    S3StatusErrorAccessDenied                               ,
-    S3StatusErrorAccountProblem                             ,
-    S3StatusErrorAmbiguousGrantByEmailAddress               ,
-    S3StatusErrorBadDigest                                  ,
-    S3StatusErrorBucketAlreadyExists                        ,
-    S3StatusErrorBucketAlreadyOwnedByYou                    ,
-    S3StatusErrorBucketNotEmpty                             ,
-    S3StatusErrorCredentialsNotSupported                    ,
-    S3StatusErrorCrossLocationLoggingProhibited             ,
-    S3StatusErrorEntityTooSmall                             ,
-    S3StatusErrorEntityTooLarge                             ,
-    S3StatusErrorExpiredToken                               ,
-    S3StatusErrorIncompleteBody                             ,
-    S3StatusErrorIncorrectNumberOfFilesInPostRequest        ,
-    S3StatusErrorInlineDataTooLarge                         ,
-    S3StatusErrorInternalError                              ,
-    S3StatusErrorInvalidAccessKeyId                         ,
-    S3StatusErrorInvalidAddressingHeader                    ,
-    S3StatusErrorInvalidArgument                            ,
-    S3StatusErrorInvalidBucketName                          ,
-    S3StatusErrorInvalidDigest                              ,
-    S3StatusErrorInvalidLocationConstraint                  ,
-    S3StatusErrorInvalidPayer                               ,
-    S3StatusErrorInvalidPolicyDocument                      ,
-    S3StatusErrorInvalidRange                               ,
-    S3StatusErrorInvalidSecurity                            ,
-    S3StatusErrorInvalidSOAPRequest                         ,
-    S3StatusErrorInvalidStorageClass                        ,
-    S3StatusErrorInvalidTargetBucketForLogging              ,
-    S3StatusErrorInvalidToken                               ,
-    S3StatusErrorInvalidURI                                 ,
-    S3StatusErrorKeyTooLong                                 ,
-    S3StatusErrorMalformedACLError                          ,
-    S3StatusErrorMalformedXML                               ,
-    S3StatusErrorMaxMessageLengthExceeded                   ,
-    S3StatusErrorMaxPostPreDataLengthExceededError          ,
-    S3StatusErrorMetadataTooLarge                           ,
-    S3StatusErrorMethodNotAllowed                           ,
-    S3StatusErrorMissingAttachment                          ,
-    S3StatusErrorMissingContentLength                       ,
-    S3StatusErrorMissingSecurityElement                     ,
-    S3StatusErrorMissingSecurityHeader                      ,
-    S3StatusErrorNoLoggingStatusForKey                      ,
-    S3StatusErrorNoSuchBucket                               ,
-    S3StatusErrorNoSuchKey                                  ,
-    S3StatusErrorNotImplemented                             ,
-    S3StatusErrorNotSignedUp                                ,
-    S3StatusErrorOperationAborted                           ,
-    S3StatusErrorPermanentRedirect                          ,
-    S3StatusErrorPreconditionFailed                         ,
-    S3StatusErrorRedirect                                   ,
-    S3StatusErrorRequestIsNotMultiPartContent               ,
-    S3StatusErrorRequestTimeout                             ,
-    S3StatusErrorRequestTimeTooSkewed                       ,
-    S3StatusErrorRequestTorrentOfBucketError                ,
-    S3StatusErrorSignatureDoesNotMatch                      ,
-    S3StatusErrorSlowDown                                   ,
-    S3StatusErrorTemporaryRedirect                          ,
-    S3StatusErrorTokenRefreshRequired                       ,
-    S3StatusErrorTooManyBuckets                             ,
-    S3StatusErrorUnexpectedContent                          ,
-    S3StatusErrorUnresolvableGrantByEmailAddress            ,
-    S3StatusErrorUserKeyMustBeSpecified                     ,
-    S3StatusErrorUnknown                                    ,
-
-    /**
-     * The following are HTTP errors returned by S3 without enough detail to
-     * distinguish any of the above S3StatusError conditions
-     **/
-    S3StatusHttpErrorMovedTemporarily                       ,
-    S3StatusHttpErrorBadRequest                             ,
-    S3StatusHttpErrorForbidden                              ,
-    S3StatusHttpErrorNotFound                               ,
-    S3StatusHttpErrorConflict                               ,
-    S3StatusHttpErrorUnknown
-} S3Status;
-
-
-/**
- * S3Protocol represents a protocol that may be used for communicating a
- * request to the Amazon S3 service.
- *
- * In general, HTTPS is greatly preferred (and should be the default of any
- * application using libs3) because it protects any data being sent to or
- * from S3 using strong encryption.  However, HTTPS is much more CPU intensive
- * than HTTP, and if the caller is absolutely certain that it is OK for the
- * data to be viewable by anyone in transit, then HTTP can be used.
- **/
-typedef enum
-{
-    S3ProtocolHTTPS                     = 0,
-    S3ProtocolHTTP                      = 1
-} S3Protocol;
-
-
-/**
- * S3UriStyle defines the form that an Amazon S3 URI identifying a bucket or
- * object can take.  They are of these forms:
- *
- * Virtual Host: ${protocol}://${bucket}.s3.amazonaws.com/[${key}]
- * Path: ${protocol}://s3.amazonaws.com/${bucket}/[${key}]
- *
- * It is generally better to use the Virual Host URI form, because it ensures
- * that the bucket name used is compatible with normal HTTP GETs and POSTs of
- * data to/from the bucket.  However, if DNS lookups for the bucket are too
- * slow or unreliable for some reason, Path URI form may be used.
- **/
-typedef enum
-{
-    S3UriStyleVirtualHost               = 0,
-    S3UriStylePath                      = 1
-} S3UriStyle;
-
-
-/**
- * S3GranteeType defines the type of Grantee used in an S3 ACL Grant.
- * Amazon Customer By Email - identifies the Grantee using their Amazon S3
- *     account email address
- * Canonical User - identifies the Grantee by S3 User ID and Display Name,
- *     which can only be obtained by making requests to S3, for example, by
- *     listing owned buckets
- * All AWS Users - identifies all authenticated AWS users
- * All Users - identifies all users
- * Log Delivery - identifies the Amazon group responsible for writing
- *                server access logs into buckets
- **/
-typedef enum
-{
-    S3GranteeTypeAmazonCustomerByEmail  = 0,
-    S3GranteeTypeCanonicalUser          = 1,
-    S3GranteeTypeAllAwsUsers            = 2,
-    S3GranteeTypeAllUsers               = 3,
-    S3GranteeTypeLogDelivery            = 4
-} S3GranteeType;
-
-
-/**
- * This is an individual permission granted to a grantee in an S3 ACL Grant.
- * Read permission gives the Grantee the permission to list the bucket, or
- *     read the object or its metadata
- * Write permission gives the Grantee the permission to create, overwrite, or
- *     delete any object in the bucket, and is not supported for objects
- * ReadACP permission gives the Grantee the permission to read the ACP for
- *     the bucket or object; the owner of the bucket or object always has
- *     this permission implicitly
- * WriteACP permission gives the Grantee the permission to overwrite the ACP
- *     for the bucket or object; the owner of the bucket or object always has
- *     this permission implicitly
- * FullControl permission gives the Grantee all permissions specified by the
- *     Read, Write, ReadACP, and WriteACP permissions
- **/
-typedef enum
-{
-    S3PermissionRead                    = 0,
-    S3PermissionWrite                   = 1,
-    S3PermissionReadACP                 = 2,
-    S3PermissionWriteACP                = 3,
-    S3PermissionFullControl             = 4
-} S3Permission;
-
-
-/**
- * S3CannedAcl is an ACL that can be specified when an object is created or
- * updated.  Each canned ACL has a predefined value when expanded to a full
- * set of S3 ACL Grants.
- * Private canned ACL gives the owner FULL_CONTROL and no other permissions
- *     are issued
- * Public Read canned ACL gives the owner FULL_CONTROL and all users Read
- *     permission 
- * Public Read Write canned ACL gives the owner FULL_CONTROL and all users
- *     Read and Write permission
- * AuthenticatedRead canned ACL gives the owner FULL_CONTROL and authenticated
- *     S3 users Read permission
- **/
-typedef enum
-{
-    S3CannedAclPrivate                  = 0, /* private */
-    S3CannedAclPublicRead               = 1, /* public-read */
-    S3CannedAclPublicReadWrite          = 2, /* public-read-write */
-    S3CannedAclAuthenticatedRead        = 3  /* authenticated-read */
-} S3CannedAcl;
-
-
-/** **************************************************************************
- * Data Types
- ************************************************************************** **/
-
-/**
- * An S3RequestContext manages multiple S3 requests simultaneously; see the
- * S3_XXX_request_context functions below for details
- **/
-typedef struct S3RequestContext S3RequestContext;
-
-
-/**
- * S3NameValue represents a single Name - Value pair, used to represent either
- * S3 metadata associated with a key, or S3 error details.
- **/
-typedef struct S3NameValue
-{
-    /**
-     * The name part of the Name - Value pair
-     **/
-    const char *name;
-
-    /**
-     * The value part of the Name - Value pair
-     **/
-    const char *value;
-} S3NameValue;
-
-
-/**
- * S3ResponseProperties is passed to the properties callback function which is
- * called when the complete response properties have been received.  Some of
- * the fields of this structure are optional and may not be provided in the
- * response, and some will always be provided in the response.
- **/
-typedef struct S3ResponseProperties
-{
-    /**
-     * This optional field identifies the request ID and may be used when
-     * reporting problems to Amazon.
-     **/
-    const char *requestId;
-
-    /**
-     * This optional field identifies the request ID and may be used when
-     * reporting problems to Amazon.
-     **/
-    const char *requestId2;
-
-    /**
-     * This optional field is the content type of the data which is returned
-     * by the request.  If not provided, the default can be assumed to be
-     * "binary/octet-stream".
-     **/
-    const char *contentType;
-
-    /**
-     * This optional field is the content length of the data which is returned
-     * in the response.  A negative value means that this value was not
-     * provided in the response.  A value of 0 means that there is no content
-     * provided.  A positive value gives the number of bytes in the content of
-     * the response.
-     **/
-    uint64_t contentLength;
-
-    /**
-     * This optional field names the server which serviced the request.
-     **/
-    const char *server;
-
-    /**
-     * This optional field provides a string identifying the unique contents
-     * of the resource identified by the request, such that the contents can
-     * be assumed not to be changed if the same eTag is returned at a later
-     * time decribing the same resource.  This is an MD5 sum of the contents.
-     **/
-    const char *eTag;
-
-    /**
-     * This optional field provides the last modified time, relative to the
-     * Unix epoch, of the contents.  If this value is < 0, then the last
-     * modified time was not provided in the response.  If this value is >= 0,
-     * then the last modified date of the contents are available as a number
-     * of seconds since the UNIX epoch.
-     * 
-     **/
-    int64_t lastModified;
-
-    /**
-     * This is the number of user-provided meta data associated with the
-     * resource.
-     **/
-    int metaDataCount;
-
-    /**
-     * These are the meta data associated with the resource.  In each case,
-     * the name will not include any S3-specific header prefixes
-     * (i.e. x-amz-meta- will have been removed from the beginning), and
-     * leading and trailing whitespace will have been stripped from the value.
-     **/
-    const S3NameValue *metaData;
-} S3ResponseProperties;
-
-
-/**
- * S3AclGrant identifies a single grant in the ACL for a bucket or object.  An
- * ACL is composed of any number of grants, which specify a grantee and the
- * permissions given to that grantee.  S3 does not normalize ACLs in any way,
- * so a redundant ACL specification will lead to a redundant ACL stored in S3.
- **/
-typedef struct S3AclGrant
-{
-    /**
-     * The granteeType gives the type of grantee specified by this grant.
-     **/
-    S3GranteeType granteeType;
-    /**
-     * The identifier of the grantee that is set is determined by the
-     * granteeType:
-     *
-     * S3GranteeTypeAmazonCustomerByEmail - amazonCustomerByEmail.emailAddress
-     * S3GranteeTypeCanonicalUser - canonicalUser.id, canonicalUser.displayName
-     * S3GranteeTypeAllAwsUsers - none
-     * S3GranteeTypeAllUsers - none
-     **/
-    union
-    {
-        /**
-         * This structure is used iff the granteeType is 
-         * S3GranteeTypeAmazonCustomerByEmail.
-         **/
-        struct
-        {
-            /**
-             * This is the email address of the Amazon Customer being granted
-             * permissions by this S3AclGrant.
-             **/
-            char emailAddress[S3_MAX_GRANTEE_EMAIL_ADDRESS_SIZE];
-        } amazonCustomerByEmail;
-        /**
-         * This structure is used iff the granteeType is
-         * S3GranteeTypeCanonicalUser.
-         **/
-        struct
-        {
-            /**
-             * This is the CanonicalUser ID of the grantee
-             **/
-            char id[S3_MAX_GRANTEE_USER_ID_SIZE];
-            /**
-             * This is the display name of the grantee
-             **/
-            char displayName[S3_MAX_GRANTEE_DISPLAY_NAME_SIZE];
-        } canonicalUser;
-    } grantee;
-    /**
-     * This is the S3Permission to be granted to the grantee
-     **/
-    S3Permission permission;
-} S3AclGrant;
-
-
-/**
- * A context for working with objects within a bucket.  A bucket context holds
- * all information necessary for working with a bucket, and may be used
- * repeatedly over many consecutive (or simultaneous) calls into libs3 bucket
- * operation functions.
- **/
-typedef struct S3BucketContext
-{
-    /**
-     * The name of the host to connect to when making S3 requests.  If set to
-     * NULL, the default S3 hostname passed in to S3_initialize will be used.
-     **/
-    const char *hostName;
-
-    /**
-     * The name of the bucket to use in the bucket context
-     **/
-    const char *bucketName;
-
-    /**
-     * The protocol to use when accessing the bucket
-     **/
-    S3Protocol protocol;
-
-    /**
-     * The URI style to use for all URIs sent to Amazon S3 while working with
-     * this bucket context
-     **/
-    S3UriStyle uriStyle;
-
-    /**
-     * The Amazon Access Key ID to use for access to the bucket
-     **/
-    const char *accessKeyId;
-
-    /**
-     *  The Amazon Secret Access Key to use for access to the bucket
-     **/
-    const char *secretAccessKey;
-} S3BucketContext;
-
-
-/**
- * This is a single entry supplied to the list bucket callback by a call to
- * S3_list_bucket.  It identifies a single matching key from the list
- * operation.
- **/
-typedef struct S3ListBucketContent
-{
-    /**
-     * This is the next key in the list bucket results.
-     **/
-    const char *key;
-
-    /**
-     * This is the number of seconds since UNIX epoch of the last modified
-     * date of the object identified by the key. 
-     **/
-    int64_t lastModified;
-
-    /**
-     * This gives a tag which gives a signature of the contents of the object,
-     * which is the MD5 of the contents of the object.
-     **/
-    const char *eTag;
-
-    /**
-     * This is the size of the object in bytes.
-     **/
-    uint64_t size;
-
-    /**
-     * This is the ID of the owner of the key; it is present only if access
-     * permissions allow it to be viewed.
-     **/
-    const char *ownerId;
-
-    /**
-     * This is the display name of the owner of the key; it is present only if
-     * access permissions allow it to be viewed.
-     **/
-    const char *ownerDisplayName;
-} S3ListBucketContent;
-
-
-/**
- * S3PutProperties is the set of properties that may optionally be set by the
- * user when putting objects to S3.  Each field of this structure is optional
- * and may or may not be present.
- **/
-typedef struct S3PutProperties
-{
-    /**
-     * If present, this is the Content-Type that should be associated with the
-     * object.  If not provided, S3 defaults to "binary/octet-stream".
-     **/
-    const char *contentType;
-
-    /**
-     * If present, this provides the MD5 signature of the contents, and is
-     * used to validate the contents.  This is highly recommended by Amazon
-     * but not required.  Its format is as a base64-encoded MD5 sum.
-     **/
-    const char *md5;
-
-    /**
-     * If present, this gives a Cache-Control header string to be supplied to
-     * HTTP clients which download this
-     **/
-    const char *cacheControl;
-
-    /**
-     * If present, this gives the filename to save the downloaded file to,
-     * whenever the object is downloaded via a web browser.  This is only
-     * relevant for objects which are intended to be shared to users via web
-     * browsers and which is additionally intended to be downloaded rather
-     * than viewed.
-     **/
-    const char *contentDispositionFilename;
-
-    /**
-     * If present, this identifies the content encoding of the object.  This
-     * is only applicable to encoded (usually, compressed) content, and only
-     * relevant if the object is intended to be downloaded via a browser.
-     **/
-    const char *contentEncoding;
-
-    /**
-     * If >= 0, this gives an expiration date for the content.  This
-     * information is typically only delivered to users who download the
-     * content via a web browser.
-     **/
-    int64_t expires;
-
-    /**
-     * This identifies the "canned ACL" that should be used for this object.
-     * The default (0) gives only the owner of the object access to it.
-     **/
-    S3CannedAcl cannedAcl;
-
-    /**
-     * This is the number of values in the metaData field.
-     **/
-    int metaDataCount;
-
-    /**
-     * These are the meta data to pass to S3.  In each case, the name part of
-     * the Name - Value pair should not include any special S3 HTTP header
-     * prefix (i.e., should be of the form 'foo', NOT 'x-amz-meta-foo').
-     **/
-    const S3NameValue *metaData;
-} S3PutProperties;
-
-
-/**
- * S3GetConditions is used for the get_object operation, and specifies
- * conditions which the object must meet in order to be successfully returned.
- **/
-typedef struct S3GetConditions
-{
-    /**
-     * The request will be processed if the Last-Modification header of the
-     * object is greater than or equal to this value, specified as a number of
-     * seconds since Unix epoch.  If this value is less than zero, it will not
-     * be used in the conditional.
-     **/
-    int64_t ifModifiedSince;
-
-    /**
-     * The request will be processed if the Last-Modification header of the
-     * object is less than this value, specified as a number of seconds since
-     * Unix epoch.  If this value is less than zero, it will not be used in
-     * the conditional.
-     **/
-    int64_t ifNotModifiedSince;
-
-    /**
-     * If non-NULL, this gives an eTag header value which the object must
-     * match in order to be returned.  Note that although the eTag is simply an
-     * MD5, this must be presented in the S3 eTag form, which typically
-     * includes double-quotes.
-     **/
-    const char *ifMatchETag;
-
-    /**
-     * If non-NULL, this gives an eTag header value which the object must not
-     * match in order to be returned.  Note that although the eTag is simply an
-     * MD5, this must be presented in the S3 eTag form, which typically
-     * includes double-quotes.
-     **/
-    const char *ifNotMatchETag;
-} S3GetConditions;
-
-
-/**
- * S3ErrorDetails provides detailed information describing an S3 error.  This
- * is only presented when the error is an S3-generated error (i.e. one of the
- * S3StatusErrorXXX values).
- **/
-typedef struct S3ErrorDetails
-{
-    /**
-     * This is the human-readable message that Amazon supplied describing the
-     * error
-     **/
-    const char *message;
-
-    /**
-     * This identifies the resource for which the error occurred
-     **/
-    const char *resource;
-
-    /**
-     * This gives human-readable further details describing the specifics of
-     * this error
-     **/
-    const char *furtherDetails;
-
-    /**
-     * This gives the number of S3NameValue pairs present in the extraDetails
-     * array
-     **/
-    int extraDetailsCount;
-
-    /**
-     * S3 can provide extra details in a freeform Name - Value pair format.
-     * Each error can have any number of these, and this array provides these
-     * additional extra details.
-     **/
-    S3NameValue *extraDetails;
-} S3ErrorDetails;
-
-
-/** **************************************************************************
- * Callback Signatures
- ************************************************************************** **/
-
-/**
- * This callback is made whenever the response properties become available for
- * any request.
- *
- * @param properties are the properties that are available from the response
- * @param callbackData is the callback data as specified when the request
- *        was issued.
- * @return S3StatusOK to continue processing the request, anything else to
- *         immediately abort the request with a status which will be
- *         passed to the S3ResponseCompleteCallback for this request.
- *         Typically, this will return either S3StatusOK or
- *         S3StatusAbortedByCallback.
- **/
-typedef S3Status (S3ResponsePropertiesCallback)
-    (const S3ResponseProperties *properties, void *callbackData);
-
-
-/**
- * This callback is made when the response has been completely received, or an
- * error has occurred which has prematurely aborted the request, or one of the
- * other user-supplied callbacks returned a value intended to abort the
- * request.  This callback is always made for every request, as the very last
- * callback made for that request.
- *
- * @param status gives the overall status of the response, indicating success
- *        or failure; use S3_status_is_retryable() as a simple way to detect
- *        whether or not the status indicates that the request failed but may
- *        be retried.
- * @param errorDetails if non-NULL, gives details as returned by the S3
- *        service, describing the error
- * @param callbackData is the callback data as specified when the request
- *        was issued.
- **/
-typedef void (S3ResponseCompleteCallback)(S3Status status,
-                                          const S3ErrorDetails *errorDetails,
-                                          void *callbackData);
-
-                                    
-/**
- * This callback is made for each bucket resulting from a list service
- * operation.
- *
- * @param ownerId is the ID of the owner of the bucket
- * @param ownerDisplayName is the owner display name of the owner of the bucket
- * @param bucketName is the name of the bucket
- * @param creationDateSeconds if < 0 indicates that no creation date was
- *        supplied for the bucket; if >= 0 indicates the number of seconds
- *        since UNIX Epoch of the creation date of the bucket
- * @param callbackData is the callback data as specified when the request
- *        was issued.
- * @return S3StatusOK to continue processing the request, anything else to
- *         immediately abort the request with a status which will be
- *         passed to the S3ResponseCompleteCallback for this request.
- *         Typically, this will return either S3StatusOK or
- *         S3StatusAbortedByCallback.
- **/
-typedef S3Status (S3ListServiceCallback)(const char *ownerId, 
-                                         const char *ownerDisplayName,
-                                         const char *bucketName,
-                                         int64_t creationDateSeconds,
-                                         void *callbackData);
-
-
-/**
- * This callback is made repeatedly as a list bucket operation progresses.
- * The contents reported via this callback are only reported once per list
- * bucket operation, but multiple calls to this callback may be necessary to
- * report all items resulting from the list bucket operation.
- *
- * @param isTruncated is true if the list bucket request was truncated by the
- *        S3 service, in which case the remainder of the list may be obtained
- *        by querying again using the Marker parameter to start the query
- *        after this set of results
- * @param nextMarker if present, gives the largest (alphabetically) key
- *        returned in the response, which, if isTruncated is true, may be used
- *        as the marker in a subsequent list buckets operation to continue
- *        listing
- * @param contentsCount is the number of ListBucketContent structures in the
- *        contents parameter
- * @param contents is an array of ListBucketContent structures, each one
- *        describing an object in the bucket
- * @param commonPrefixesCount is the number of common prefixes strings in the
- *        commonPrefixes parameter
- * @param commonPrefixes is an array of strings, each specifying one of the
- *        common prefixes as returned by S3
- * @param callbackData is the callback data as specified when the request
- *        was issued.
- * @return S3StatusOK to continue processing the request, anything else to
- *         immediately abort the request with a status which will be
- *         passed to the S3ResponseCompleteCallback for this request.
- *         Typically, this will return either S3StatusOK or
- *         S3StatusAbortedByCallback.
- **/
-typedef S3Status (S3ListBucketCallback)(int isTruncated,
-                                        const char *nextMarker,
-                                        int contentsCount, 
-                                        const S3ListBucketContent *contents,
-                                        int commonPrefixesCount,
-                                        const char **commonPrefixes,
-                                        void *callbackData);
-                                       
-
-/**
- * This callback is made during a put object operation, to obtain the next
- * chunk of data to put to the S3 service as the contents of the object.  This
- * callback is made repeatedly, each time acquiring the next chunk of data to
- * write to the service, until a negative or 0 value is returned.
- *
- * @param bufferSize gives the maximum number of bytes that may be written
- *        into the buffer parameter by this callback
- * @param buffer gives the buffer to fill with at most bufferSize bytes of
- *        data as the next chunk of data to send to S3 as the contents of this
- *        object
- * @param callbackData is the callback data as specified when the request
- *        was issued.
- * @return < 0 to abort the request with the S3StatusAbortedByCallback, which
- *        will be passed to the response complete callback for this request, or
- *        0 to indicate the end of data, or > 0 to identify the number of
- *        bytes that were written into the buffer by this callback
- **/
-typedef int (S3PutObjectDataCallback)(int bufferSize, char *buffer,
-                                      void *callbackData);
-
-
-/**
- * This callback is made during a get object operation, to provide the next
- * chunk of data available from the S3 service constituting the contents of
- * the object being fetched.  This callback is made repeatedly, each time
- * providing the next chunk of data read, until the complete object contents
- * have been passed through the callback in this way, or the callback
- * returns an error status.
- *
- * @param bufferSize gives the number of bytes in buffer
- * @param buffer is the data being passed into the callback
- * @param callbackData is the callback data as specified when the request
- *        was issued.
- * @return S3StatusOK to continue processing the request, anything else to
- *         immediately abort the request with a status which will be
- *         passed to the S3ResponseCompleteCallback for this request.
- *         Typically, this will return either S3StatusOK or
- *         S3StatusAbortedByCallback.
- **/
-typedef S3Status (S3GetObjectDataCallback)(int bufferSize, const char *buffer,
-                                           void *callbackData);
-                                       
-
-/** **************************************************************************
- * Callback Structures
- ************************************************************************** **/
-
-
-/**
- * An S3ResponseHandler defines the callbacks which are made for any
- * request.
- **/
-typedef struct S3ResponseHandler
-{
-    /**
-     * The propertiesCallback is made when the response properties have
-     * successfully been returned from S3.  This function may not be called
-     * if the response properties were not successfully returned from S3.
-     **/
-    S3ResponsePropertiesCallback *propertiesCallback;
-    
-    /**
-     * The completeCallback is always called for every request made to S3,
-     * regardless of the outcome of the request.  It provides the status of
-     * the request upon its completion, as well as extra error details in the
-     * event of an S3 error.
-     **/
-    S3ResponseCompleteCallback *completeCallback;
-} S3ResponseHandler;
-
-
-/**
- * An S3ListServiceHandler defines the callbacks which are made for
- * list_service requests.
- **/
-typedef struct S3ListServiceHandler
-{
-    /**
-     * responseHandler provides the properties and complete callback
-     **/
-    S3ResponseHandler responseHandler;
-
-    /**
-     * The listServiceCallback is called as items are reported back from S3 as
-     * responses to the request
-     **/
-    S3ListServiceCallback *listServiceCallback;
-} S3ListServiceHandler;
-
-
-/**
- * An S3ListBucketHandler defines the callbacks which are made for
- * list_bucket requests.
- **/
-typedef struct S3ListBucketHandler
-{
-    /**
-     * responseHandler provides the properties and complete callback
-     **/
-    S3ResponseHandler responseHandler;
-
-    /**
-     * The listBucketCallback is called as items are reported back from S3 as
-     * responses to the request.  This may be called more than one time per
-     * list bucket request, each time providing more items from the list
-     * operation.
-     **/
-    S3ListBucketCallback *listBucketCallback;
-} S3ListBucketHandler;
-
-
-/**
- * An S3PutObjectHandler defines the callbacks which are made for
- * put_object requests.
- **/
-typedef struct S3PutObjectHandler
-{
-    /**
-     * responseHandler provides the properties and complete callback
-     **/
-    S3ResponseHandler responseHandler;
-
-    /**
-     * The putObjectDataCallback is called to acquire data to send to S3 as
-     * the contents of the put_object request.  It is made repeatedly until it
-     * returns a negative number (indicating that the request should be
-     * aborted), or 0 (indicating that all data has been supplied).
-     **/
-    S3PutObjectDataCallback *putObjectDataCallback;
-} S3PutObjectHandler;
-
-
-/**
- * An S3GetObjectHandler defines the callbacks which are made for
- * get_object requests.
- **/
-typedef struct S3GetObjectHandler
-{
-    /**
-     * responseHandler provides the properties and complete callback
-     **/
-    S3ResponseHandler responseHandler;
-
-    /**
-     * The getObjectDataCallback is called as data is read from S3 as the
-     * contents of the object being read in the get_object request.  It is
-     * called repeatedly until there is no more data provided in the request,
-     * or until the callback returns an error status indicating that the
-     * request should be aborted.
-     **/
-    S3GetObjectDataCallback *getObjectDataCallback;
-} S3GetObjectHandler;
-
-
-/** **************************************************************************
- * General Library Functions
- ************************************************************************** **/
-
-/**
- * Initializes libs3 for use.  This function must be called before any other
- * libs3 function is called.  It may be called multiple times, with the same
- * effect as calling it once, as long as S3_deinitialize() is called an
- * equal number of times when the program has finished.  This function is NOT
- * thread-safe and must only be called by one thread at a time.
- *
- * @param userAgentInfo is a string that will be included in the User-Agent
- *        header of every request made to the S3 service.  You may provide
- *        NULL or the empty string if you don't care about this.  The value
- *        will not be copied by this function and must remain unaltered by the
- *        caller until S3_deinitialize() is called.
- * @param flags is a bitmask of some combination of S3_INIT_XXX flag, or
- *        S3_INIT_ALL, indicating which of the libraries that libs3 depends
- *        upon should be initialized by S3_initialize().  Only if your program
- *        initializes one of these dependency libraries itself should anything
- *        other than S3_INIT_ALL be passed in for this bitmask.
- *
- *        You should pass S3_INIT_WINSOCK if and only if your application does
- *        not initialize winsock elsewhere.  On non-Microsoft Windows
- *        platforms it has no effect.
- *
- *        As a convenience, the macro S3_INIT_ALL is provided, which will do
- *        all necessary initialization; however, be warned that things may
- *        break if your application re-initializes the dependent libraries
- *        later.
- * @param defaultS3Hostname is a string that specifies the default S3 server
- *        hostname to use when making S3 requests; this value is used
- *        whenever the hostName of an S3BucketContext is NULL.  If NULL is
- *        passed here then the default of S3_DEFAULT_HOSTNAME will be used.
- * @return One of:
- *         S3StatusOK on success
- *         S3StatusUriTooLong if the defaultS3HostName is longer than
- *             S3_MAX_HOSTNAME_SIZE
- *         S3StatusInternalError if dependent libraries could not be
- *             initialized
- *         S3StatusOutOfMemory on failure due to out of memory
- **/
-S3Status S3_initialize(const char *userAgentInfo, int flags,
-                       const char *defaultS3HostName);
-
-
-/**
- * Must be called once per program for each call to S3_initialize().  After
- * this call is complete, no libs3 function may be called except
- * S3_initialize().
- **/
-void S3_deinitialize();
-
-
-/**
- * Returns a string with the textual name of an S3Status code
- *
- * @param status is S3Status code for which the textual name will be returned
- * @return a string with the textual name of an S3Status code
- **/
-const char *S3_get_status_name(S3Status status);
-
-
-/**
- * This function may be used to validate an S3 bucket name as being in the
- * correct form for use with the S3 service.  Amazon S3 limits the allowed
- * characters in S3 bucket names, as well as imposing some additional rules on
- * the length of bucket names and their structure.  There are actually two
- * limits; one for bucket names used only in path-style URIs, and a more
- * strict limit used for bucket names used in virtual-host-style URIs.  It is
- * advisable to use only bucket names which meet the more strict requirements
- * regardless of how the bucket expected to be used.
- *
- * This method does NOT validate that the bucket is available for use in the
- * S3 service, so the return value of this function cannot be used to decide
- * whether or not a bucket with the given name already exists in Amazon S3 or
- * is accessible by the caller.  It merely validates that the bucket name is
- * valid for use with S3.
- *
- * @param bucketName is the bucket name to validate
- * @param uriStyle gives the URI style to validate the bucket name against.
- *        It is advisable to always use S3UriStyleVirtualHost.
- * @return One of:
- *         S3StatusOK if the bucket name was validated successfully
- *         S3StatusInvalidBucketNameTooLong if the bucket name exceeded the
- *             length limitation for the URI style, which is 255 bytes for
- *             path style URIs and 63 bytes for virtual host type URIs
- *         S3StatusInvalidBucketNameTooShort if the bucket name is less than
- *             3 characters
- *         S3StatusInvalidBucketNameFirstCharacter if the bucket name has an
- *             invalid first character, which is anything other than
- *             an alphanumeric character
- *         S3StatusInvalidBucketNameCharacterSequence if the bucket name
- *             includes an invalid character sequence, which for virtual host
- *             style buckets is ".-" or "-."
- *         S3StatusInvalidBucketNameCharacter if the bucket name includes an
- *             invalid character, which is anything other than alphanumeric,
- *             '-', '.', or for path style URIs only, '_'.
- *         S3StatusInvalidBucketNameDotQuadNotation if the bucket name is in
- *             dot-quad notation, i.e. the form of an IP address, which is
- *             not allowed by Amazon S3.
- **/
-S3Status S3_validate_bucket_name(const char *bucketName, S3UriStyle uriStyle);
-
-
-/**
- * Converts an XML representation of an ACL to a libs3 structured
- * representation.  This method is not strictly necessary for working with
- * ACLs using libs3, but may be convenient for users of the library who read
- * ACLs from elsewhere in XML format and need to use these ACLs with libs3.
- *
- * @param aclXml is the XML representation of the ACL.  This must be a
- *        zero-terminated character string.
- * @param ownerId will be filled in with the Owner ID specified in the XML.
- *        At most MAX_GRANTEE_USER_ID_SIZE bytes will be stored at this
- *        location.
- * @param ownerDisplayName will be filled in with the Owner Display Name
- *        specified in the XML.  At most MAX_GRANTEE_DISPLAY_NAME_SIZE bytes
- *        will be stored at this location.
- * @param aclGrantCountReturn returns the number of S3AclGrant structures
- *        returned in the aclGrantsReturned array
- * @param aclGrants must be passed in as an array of at least S3_ACL_MAXCOUNT
- *        structures, and on return from this function, the first
- *        aclGrantCountReturn structures will be filled in with the ACLs
- *        represented by the input XML.
- * @return One of:
- *         S3StatusOK on successful conversion of the ACL
- *         S3StatusInternalError on internal error representing a bug in the
- *             libs3 library
- *         S3StatusXmlParseFailure if the XML document was malformed
- **/
-S3Status S3_convert_acl(char *aclXml, char *ownerId, char *ownerDisplayName,
-                        int *aclGrantCountReturn, S3AclGrant *aclGrants);
-                        
-
-/**
- * Returns nonzero if the status indicates that the request should be
- * immediately retried, because the status indicates an error of a nature that
- * is likely due to transient conditions on the local system or S3, such as
- * network failures, or internal retryable errors reported by S3.  Returns
- * zero otherwise.
- *
- * @param status is the status to evaluate
- * @return nonzero if the status indicates a retryable error, 0 otherwise
- **/
-int S3_status_is_retryable(S3Status status);
-
-
-/** **************************************************************************
- * Request Context Management Functions
- ************************************************************************** **/
-
-/**
- * An S3RequestContext allows multiple requests to be serviced by the same
- * thread simultaneously.  It is an optional parameter to all libs3 request
- * functions, and if provided, the request is managed by the S3RequestContext;
- * if not, the request is handled synchronously and is complete when the libs3
- * request function has returned.
- *
- * @param requestContextReturn returns the newly-created S3RequestContext
- *        structure, which if successfully returned, must be destroyed via a
- *        call to S3_destroy_request_context when it is no longer needed.  If
- *        an error status is returned from this function, then
- *        requestContextReturn will not have been filled in, and
- *        S3_destroy_request_context should not be called on it
- * @return One of:
- *         S3StatusOK if the request context was successfully created
- *         S3StatusOutOfMemory if the request context could not be created due
- *             to an out of memory error
- **/
-S3Status S3_create_request_context(S3RequestContext **requestContextReturn);
-
-
-/**
- * Destroys an S3RequestContext which was created with
- * S3_create_request_context.  Any requests which are currently being
- * processed by the S3RequestContext will immediately be aborted and their
- * request completed callbacks made with the status S3StatusInterrupted.
- *
- * @param requestContext is the S3RequestContext to destroy
- **/
-void S3_destroy_request_context(S3RequestContext *requestContext);
-
-
-/**
- * Runs the S3RequestContext until all requests within it have completed,
- * or until an error occurs.
- *
- * @param requestContext is the S3RequestContext to run until all requests
- *            within it have completed or until an error occurs
- * @return One of:
- *         S3Status if all requests were successfully run to completion
- *         S3StatusInternalError if an internal error prevented the
- *             S3RequestContext from running one or more requests
- *         S3StatusOutOfMemory if requests could not be run to completion
- *             due to an out of memory error
- **/
-S3Status S3_runall_request_context(S3RequestContext *requestContext);
-
-
-/**
- * Does some processing of requests within the S3RequestContext.  One or more
- * requests may have callbacks made on them and may complete.  This function
- * processes any requests which have immediately available I/O, and will not
- * block waiting for I/O on any request.  This function would normally be used
- * with S3_get_request_context_fdsets.
- *
- * @param requestContext is the S3RequestContext to process
- * @param requestsRemainingReturn returns the number of requests remaining
- *            and not yet completed within the S3RequestContext after this
- *            function returns.
- * @return One of:
- *         S3StatusOK if request processing proceeded without error
- *         S3StatusInternalError if an internal error prevented the
- *             S3RequestContext from running one or more requests
- *         S3StatusOutOfMemory if requests could not be processed due to
- *             an out of memory error
- **/
-S3Status S3_runonce_request_context(S3RequestContext *requestContext, 
-                                    int *requestsRemainingReturn);
-
-
-/**
- * This function, in conjunction with S3_runonce_request_context, allows
- * callers to manually manage a set of requests using an S3RequestContext.
- * This function returns the set of file
- * descriptors which the caller can watch (typically using select()), along
- * with any other file descriptors of interest to the caller, and using
- * whatever timeout (if any) the caller wishes, until one or more file
- * descriptors in the returned sets become ready for I/O, at which point
- * S3_runonce_request_context can be called to process requests with available
- * I/O.
- *
- * @param requestContext is the S3RequestContext to get fd_sets from
- * @param readFdSet is a pointer to an fd_set which will have all file
- *        descriptors to watch for read events for the requests in the
- *        S3RequestContext set into it upon return.  Should be zero'd out
- *        (using FD_ZERO) before being passed into this function.
- * @param writeFdSet is a pointer to an fd_set which will have all file
- *        descriptors to watch for write events for the requests in the
- *        S3RequestContext set into it upon return.  Should be zero'd out
- *        (using FD_ZERO) before being passed into this function.
- * @param exceptFdSet is a pointer to an fd_set which will have all file
- *        descriptors to watch for exception events for the requests in the
- *        S3RequestContext set into it upon return.  Should be zero'd out
- *        (using FD_ZERO) before being passed into this function.
- * @param maxFd returns the highest file descriptor set into any of the
- *        fd_sets, or -1 if no file descriptors were set
- * @return One of:
- *         S3StatusOK if all fd_sets were successfully set
- *         S3StatusInternalError if an internal error prevented this function
- *             from completing successfully
- **/
-S3Status S3_get_request_context_fdsets(S3RequestContext *requestContext,
-                                       fd_set *readFdSet, fd_set *writeFdSet,
-                                       fd_set *exceptFdSet, int *maxFd);
-
-
-/**
- * This function returns the maximum number of milliseconds that the caller of
- * S3_runonce_request_context should wait on the fdsets obtained via a call to
- * S3_get_request_context_fdsets.  In other words, this is essentially the
- * select() timeout that needs to be used (shorter values are OK, but no
- * longer than this) to ensure that internal timeout code of libs3 can work
- * properly.  This function should be called right before select() each time
- * select() on the request_context fdsets are to be performed by the libs3
- * user.
- *
- * @param requestContext is the S3RequestContext to get the timeout from
- * @return the maximum number of milliseconds to select() on fdsets.  Callers
- *         could wait a shorter time if they wish, but not longer.
- **/
-int64_t S3_get_request_context_timeout(S3RequestContext *requestContext);
-
-
-/** **************************************************************************
- * S3 Utility Functions
- ************************************************************************** **/
-
-/**
- * Generates an HTTP authenticated query string, which may then be used by
- * a browser (or other web client) to issue the request.  The request is
- * implicitly a GET request; Amazon S3 is documented to only support this type
- * of authenticated query string request.
- *
- * @param buffer is the output buffer for the authenticated query string.
- *        It must be at least S3_MAX_AUTHENTICATED_QUERY_STRING_SIZE bytes in 
- *        length.
- * @param bucketContext gives the bucket and associated parameters for the
- *        request to generate.
- * @param key gives the key which the authenticated request will GET.
- * @param expires gives the number of seconds since Unix epoch for the
- *        expiration date of the request; after this time, the request will
- *        no longer be valid.  If this value is negative, the largest
- *        expiration date possible is used (currently, Jan 19, 2038).
- * @param resource gives a sub-resource to be fetched for the request, or NULL
- *        for none.  This should be of the form "?<resource>", i.e. 
- *        "?torrent".
- * @return One of:
- *         S3StatusUriTooLong if, due to an internal error, the generated URI
- *             is longer than S3_MAX_AUTHENTICATED_QUERY_STRING_SIZE bytes in
- *             length and thus will not fit into the supplied buffer
- *         S3StatusOK on success
- **/
-S3Status S3_generate_authenticated_query_string
-    (char *buffer, const S3BucketContext *bucketContext,
-     const char *key, int64_t expires, const char *resource);
-
-
-/** **************************************************************************
- * Service Functions
- ************************************************************************** **/
-
-/**
- * Lists all S3 buckets belonging to the access key id.
- *
- * @param protocol gives the protocol to use for this request
- * @param accessKeyId gives the Amazon Access Key ID for which to list owned
- *        buckets
- * @param secretAccessKey gives the Amazon Secret Access Key for which to list
- *        owned buckets
- * @param hostName is the S3 host name to use; if NULL is passed in, the
- *        default S3 host as provided to S3_initialize() will be used.
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_list_service(S3Protocol protocol, const char *accessKeyId,
-                     const char *secretAccessKey, const char *hostName,
-                     S3RequestContext *requestContext,
-                     const S3ListServiceHandler *handler,
-                     void *callbackData);
-                         
-
-/** **************************************************************************
- * Bucket Functions
- ************************************************************************** **/
-
-/**
- * Tests the existence of an S3 bucket, additionally returning the bucket's
- * location if it exists and is accessible.
- *
- * @param protocol gives the protocol to use for this request
- * @param uriStyle gives the URI style to use for this request
- * @param accessKeyId gives the Amazon Access Key ID for which to list owned
- *        buckets
- * @param secretAccessKey gives the Amazon Secret Access Key for which to list
- *        owned buckets
- * @param hostName is the S3 host name to use; if NULL is passed in, the
- *        default S3 host as provided to S3_initialize() will be used.
- * @param bucketName is the bucket name to test
- * @param locationConstraintReturnSize gives the number of bytes in the
- *        locationConstraintReturn parameter
- * @param locationConstraintReturn provides the location into which to write
- *        the name of the location constraint naming the geographic location
- *        of the S3 bucket.  This must have at least as many characters in it
- *        as specified by locationConstraintReturn, and should start out
- *        NULL-terminated.  On successful completion of this request, this
- *        will be set to the name of the geographic location of S3 bucket, or
- *        will be left as a zero-length string if no location was available.
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_test_bucket(S3Protocol protocol, S3UriStyle uriStyle,
-                    const char *accessKeyId, const char *secretAccessKey,
-                    const char *hostName, const char *bucketName,
-                    int locationConstraintReturnSize,
-                    char *locationConstraintReturn,
-                    S3RequestContext *requestContext,
-                    const S3ResponseHandler *handler, void *callbackData);
-
-                           
-/**
- * Creates a new bucket.
- *
- * @param protocol gives the protocol to use for this request
- * @param accessKeyId gives the Amazon Access Key ID for which to list owned
- *        buckets
- * @param secretAccessKey gives the Amazon Secret Access Key for which to list
- *        owned buckets
- * @param hostName is the S3 host name to use; if NULL is passed in, the
- *        default S3 host as provided to S3_initialize() will be used.
- * @param bucketName is the name of the bucket to be created
- * @param cannedAcl gives the "REST canned ACL" to use for the created bucket
- * @param locationConstraint if non-NULL, gives the geographic location for
- *        the bucket to create.
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_create_bucket(S3Protocol protocol, const char *accessKeyId,
-                      const char *secretAccessKey, const char *hostName,
-                      const char *bucketName, S3CannedAcl cannedAcl,
-                      const char *locationConstraint,
-                      S3RequestContext *requestContext,
-                      const S3ResponseHandler *handler, void *callbackData);
-
-
-/**
- * Deletes a bucket.  The bucket must be empty, or the status
- * S3StatusErrorBucketNotEmpty will result.
- *
- * @param protocol gives the protocol to use for this request
- * @param uriStyle gives the URI style to use for this request
- * @param accessKeyId gives the Amazon Access Key ID for which to list owned
- *        buckets
- * @param secretAccessKey gives the Amazon Secret Access Key for which to list
- *        owned buckets
- * @param hostName is the S3 host name to use; if NULL is passed in, the
- *        default S3 host as provided to S3_initialize() will be used.
- * @param bucketName is the name of the bucket to be deleted
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_delete_bucket(S3Protocol protocol, S3UriStyle uriStyle,
-                      const char *accessKeyId, const char *secretAccessKey,
-                      const char *hostName, const char *bucketName,
-                      S3RequestContext *requestContext,
-                      const S3ResponseHandler *handler, void *callbackData);
-
-
-/**
- * Lists keys within a bucket.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request
- * @param prefix if present, gives a prefix for matching keys
- * @param marker if present, only keys occuring after this value will be
- *        listed
- * @param delimiter if present, causes keys that contain the same string
- *        between the prefix and the first occurrence of the delimiter to be
- *        rolled up into a single result element
- * @param maxkeys is the maximum number of keys to return
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_list_bucket(const S3BucketContext *bucketContext,
-                    const char *prefix, const char *marker, 
-                    const char *delimiter, int maxkeys,
-                    S3RequestContext *requestContext,
-                    const S3ListBucketHandler *handler, void *callbackData);
-
-
-/** **************************************************************************
- * Object Functions
- ************************************************************************** **/
-
-/**
- * Puts object data to S3.  This overwrites any existing object at that key;
- * note that S3 currently only supports full-object upload.  The data to
- * upload will be acquired by calling the handler's putObjectDataCallback.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request
- * @param key is the key of the object to put to
- * @param contentLength is required and gives the total number of bytes that
- *        will be put
- * @param putProperties optionally provides additional properties to apply to
- *        the object that is being put to
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_put_object(const S3BucketContext *bucketContext, const char *key,
-                   uint64_t contentLength,
-                   const S3PutProperties *putProperties,
-                   S3RequestContext *requestContext,
-                   const S3PutObjectHandler *handler, void *callbackData);
-                        
-
-/**
- * Copies an object from one location to another.  The object may be copied
- * back to itself, which is useful for replacing metadata without changing
- * the object.
- *
- * @param bucketContext gives the source bucket and associated parameters for
- *        this request
- * @param key is the source key
- * @param destinationBucket gives the destination bucket into which to copy
- *        the object.  If NULL, the source bucket will be used.
- * @param destinationKey gives the destination key into which to copy the
- *        object.  If NULL, the source key will be used.
- * @param putProperties optionally provides properties to apply to the object
- *        that is being put to.  If not supplied (i.e. NULL is passed in),
- *        then the copied object will retain the metadata of the copied
- *        object.
- * @param lastModifiedReturn returns the last modified date of the copied
- *        object
- * @param eTagReturnSize specifies the number of bytes provided in the
- *        eTagReturn buffer
- * @param eTagReturn is a buffer into which the resulting eTag of the copied
- *        object will be written
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_copy_object(const S3BucketContext *bucketContext,
-                    const char *key, const char *destinationBucket,
-                    const char *destinationKey,
-                    const S3PutProperties *putProperties,
-                    int64_t *lastModifiedReturn, int eTagReturnSize,
-                    char *eTagReturn, S3RequestContext *requestContext,
-                    const S3ResponseHandler *handler, void *callbackData);
-
-
-/**
- * Gets an object from S3.  The contents of the object are returned in the
- * handler's getObjectDataCallback.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request
- * @param key is the key of the object to get
- * @param getConditions if non-NULL, gives a set of conditions which must be
- *        met in order for the request to succeed
- * @param startByte gives the start byte for the byte range of the contents
- *        to be returned
- * @param byteCount gives the number of bytes to return; a value of 0
- *        indicates that the contents up to the end should be returned
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_get_object(const S3BucketContext *bucketContext, const char *key,
-                   const S3GetConditions *getConditions,
-                   uint64_t startByte, uint64_t byteCount,
-                   S3RequestContext *requestContext,
-                   const S3GetObjectHandler *handler, void *callbackData);
-
-
-/**
- * Gets the response properties for the object, but not the object contents.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request
- * @param key is the key of the object to get the properties of
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_head_object(const S3BucketContext *bucketContext, const char *key,
-                    S3RequestContext *requestContext,
-                    const S3ResponseHandler *handler, void *callbackData);
-                         
-/**
- * Deletes an object from S3.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request
- * @param key is the key of the object to delete
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_delete_object(const S3BucketContext *bucketContext, const char *key,
-                      S3RequestContext *requestContext,
-                      const S3ResponseHandler *handler, void *callbackData);
-
-
-/** **************************************************************************
- * Access Control List Functions
- ************************************************************************** **/
-
-/**
- * Gets the ACL for the given bucket or object.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request
- * @param key is the key of the object to get the ACL of; or NULL to get the
- *        ACL of the bucket
- * @param ownerId must be supplied as a buffer of at least
- *        S3_MAX_GRANTEE_USER_ID_SIZE bytes, and will be filled in with the
- *        owner ID of the object/bucket
- * @param ownerDisplayName must be supplied as a buffer of at least
- *        S3_MAX_GRANTEE_DISPLAY_NAME_SIZE bytes, and will be filled in with
- *        the display name of the object/bucket
- * @param aclGrantCountReturn returns the number of S3AclGrant structures
- *        returned in the aclGrants parameter
- * @param aclGrants must be passed in as an array of at least
- *        S3_MAX_ACL_GRANT_COUNT S3AclGrant structures, which will be filled
- *        in with the grant information for the ACL
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_get_acl(const S3BucketContext *bucketContext, const char *key, 
-                char *ownerId, char *ownerDisplayName,
-                int *aclGrantCountReturn, S3AclGrant *aclGrants, 
-                S3RequestContext *requestContext,
-                const S3ResponseHandler *handler, void *callbackData);
-
-
-/**
- * Sets the ACL for the given bucket or object.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request
- * @param key is the key of the object to set the ACL for; or NULL to set the
- *        ACL for the bucket
- * @param ownerId is the owner ID of the object/bucket.  Unfortunately, S3
- *        requires this to be valid and thus it must have been fetched by a
- *        previous S3 request, such as a list_buckets request.
- * @param ownerDisplayName is the owner display name of the object/bucket.
- *        Unfortunately, S3 requires this to be valid and thus it must have
- *        been fetched by a previous S3 request, such as a list_buckets
- *        request.
- * @param aclGrantCount is the number of ACL grants to set for the
- *        object/bucket
- * @param aclGrants are the ACL grants to set for the object/bucket
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_set_acl(const S3BucketContext *bucketContext, const char *key, 
-                const char *ownerId, const char *ownerDisplayName,
-                int aclGrantCount, const S3AclGrant *aclGrants, 
-                S3RequestContext *requestContext,
-                const S3ResponseHandler *handler, void *callbackData);
-
-
-/** **************************************************************************
- * Server Access Log Functions
- ************************************************************************** **/
-
-/**
- * Gets the service access logging settings for a bucket.  The service access
- * logging settings specify whether or not the S3 service will write service
- * access logs for requests made for the given bucket, and if so, several
- * settings controlling how these logs will be written.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request; this is the bucket for which service access logging is
- *        being requested
- * @param targetBucketReturn must be passed in as a buffer of at least
- *        (S3_MAX_BUCKET_NAME_SIZE + 1) bytes in length, and will be filled
- *        in with the target bucket name for access logging for the given
- *        bucket, which is the bucket into which access logs for the specified
- *        bucket will be written.  This is returned as an empty string if
- *        service access logging is not enabled for the given bucket.
- * @param targetPrefixReturn must be passed in as a buffer of at least
- *        (S3_MAX_KEY_SIZE + 1) bytes in length, and will be filled in
- *        with the key prefix for server access logs for the given bucket,
- *        or the empty string if no such prefix is specified.
- * @param aclGrantCountReturn returns the number of ACL grants that are
- *        associated with the server access logging for the given bucket.
- * @param aclGrants must be passed in as an array of at least
- *        S3_MAX_ACL_GRANT_COUNT S3AclGrant structures, and these will be
- *        filled in with the target grants associated with the server access
- *        logging for the given bucket, whose number is returned in the
- *        aclGrantCountReturn parameter.  These grants will be applied to the
- *        ACL of any server access logging log files generated by the S3
- *        service for the given bucket.
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_get_server_access_logging(const S3BucketContext *bucketContext,
-                                  char *targetBucketReturn,
-                                  char *targetPrefixReturn,
-                                  int *aclGrantCountReturn, 
-                                  S3AclGrant *aclGrants,
-                                  S3RequestContext *requestContext,
-                                  const S3ResponseHandler *handler,
-                                  void *callbackData);
-                                  
-
-/**
- * Sets the service access logging settings for a bucket.  The service access
- * logging settings specify whether or not the S3 service will write service
- * access logs for requests made for the given bucket, and if so, several
- * settings controlling how these logs will be written.
- *
- * @param bucketContext gives the bucket and associated parameters for this
- *        request; this is the bucket for which service access logging is
- *        being set
- * @param targetBucket gives the target bucket name for access logging for the
- *        given bucket, which is the bucket into which access logs for the
- *        specified bucket will be written.
- * @param targetPrefix is an option parameter which specifies the key prefix
- *        for server access logs for the given bucket, or NULL if no such
- *        prefix is to be used.
- * @param aclGrantCount specifies the number of ACL grants that are to be
- *        associated with the server access logging for the given bucket.
- * @param aclGrants is as an array of S3AclGrant structures, whose number is
- *        given by the aclGrantCount parameter.  These grants will be applied
- *        to the ACL of any server access logging log files generated by the
- *        S3 service for the given bucket.
- * @param requestContext if non-NULL, gives the S3RequestContext to add this
- *        request to, and does not perform the request immediately.  If NULL,
- *        performs the request immediately and synchronously.
- * @param handler gives the callbacks to call as the request is processed and
- *        completed 
- * @param callbackData will be passed in as the callbackData parameter to
- *        all callbacks for this request
- **/
-void S3_set_server_access_logging(const S3BucketContext *bucketContext,
-                                  const char *targetBucket, 
-                                  const char *targetPrefix, int aclGrantCount, 
-                                  const S3AclGrant *aclGrants, 
-                                  S3RequestContext *requestContext,
-                                  const S3ResponseHandler *handler,
-                                  void *callbackData);
-                                  
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* LIBS3_H */
diff --git a/src/libs3/inc/mingw/pthread.h b/src/libs3/inc/mingw/pthread.h
deleted file mode 100644
index 674a62a..0000000
--- a/src/libs3/inc/mingw/pthread.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/** **************************************************************************
- * pthread.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef PTHREAD_H
-#define PTHREAD_H
-
-// This is a minimal implementation of pthreads on Windows, implementing just
-// the APIs needed by libs3
-
-unsigned long pthread_self();
-
-typedef struct
-{
-    CRITICAL_SECTION criticalSection;
-} pthread_mutex_t;
-
-int pthread_mutex_init(pthread_mutex_t *mutex, void *);
-int pthread_mutex_lock(pthread_mutex_t *mutex);
-int pthread_mutex_unlock(pthread_mutex_t *mutex);
-int pthread_mutex_destroy(pthread_mutex_t *mutex);
-
-#endif /* PTHREAD_H */
diff --git a/src/libs3/inc/mingw/sys/select.h b/src/libs3/inc/mingw/sys/select.h
deleted file mode 100644
index 0981da2..0000000
--- a/src/libs3/inc/mingw/sys/select.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/** **************************************************************************
- * select.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-// This file is used only on a MingW build, and converts an include of
-// sys/select.h to its Windows equivalent
-
-#include <winsock2.h>
diff --git a/src/libs3/inc/mingw/sys/utsname.h b/src/libs3/inc/mingw/sys/utsname.h
deleted file mode 100644
index 1e6b470..0000000
--- a/src/libs3/inc/mingw/sys/utsname.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/** **************************************************************************
- * utsname.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-// This file is used only on a MingW build, and provides an implementation
-// of POSIX sys/utsname.h
-
-#ifndef UTSNAME_H
-#define UTSNAME_H
-
-struct utsname
-{
-    const char *sysname;
-    const char *machine;
-};
-
-int uname(struct utsname *);
-
-#endif /* UTSNAME_H */
diff --git a/src/libs3/inc/request.h b/src/libs3/inc/request.h
deleted file mode 100644
index 9e3a477..0000000
--- a/src/libs3/inc/request.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/** **************************************************************************
- * request.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef REQUEST_H
-#define REQUEST_H
-
-#include "libs3.h"
-#include "error_parser.h"
-#include "response_headers_handler.h"
-#include "util.h"
-
-// Describes a type of HTTP request (these are our supported HTTP "verbs")
-typedef enum
-{
-    HttpRequestTypeGET,
-    HttpRequestTypeHEAD,
-    HttpRequestTypePUT,
-    HttpRequestTypeCOPY,
-    HttpRequestTypeDELETE
-} HttpRequestType;
-
-
-// This completely describes a request.  A RequestParams is not required to be
-// allocated from the heap and its lifetime is not assumed to extend beyond
-// the lifetime of the function to which it has been passed.
-typedef struct RequestParams
-{
-    // Request type, affects the HTTP verb used
-    HttpRequestType httpRequestType;
-
-    // Bucket context for request
-    S3BucketContext bucketContext;
-
-    // Key, if any
-    const char *key;
-
-    // Query params - ready to append to URI (i.e. ?p1=v1?p2=v2)
-    const char *queryParams;
-
-    // sub resource, like ?acl, ?location, ?torrent, ?logging
-    const char *subResource;
-
-    // If this is a copy operation, this gives the source bucket
-    const char *copySourceBucketName;
-
-    // If this is a copy operation, this gives the source key
-    const char *copySourceKey;
-
-    // Get conditions
-    const S3GetConditions *getConditions;
-
-    // Start byte
-    uint64_t startByte;
-
-    // Byte count
-    uint64_t byteCount;
-
-    // Put properties
-    const S3PutProperties *putProperties;
-
-    // Callback to be made when headers are available.  Might not be called.
-    S3ResponsePropertiesCallback *propertiesCallback;
-
-    // Callback to be made to supply data to send to S3.  Might not be called.
-    S3PutObjectDataCallback *toS3Callback;
-
-    // Number of bytes total that readCallback will supply
-    int64_t toS3CallbackTotalSize;
-
-    // Callback to be made that supplies data read from S3.
-    // Might not be called.
-    S3GetObjectDataCallback *fromS3Callback;
-
-    // Callback to be made when request is complete.  This will *always* be
-    // called.
-    S3ResponseCompleteCallback *completeCallback;
-
-    // Data passed to the callbacks
-    void *callbackData;
-} RequestParams;
-
-
-// This is the stuff associated with a request that needs to be on the heap
-// (and thus live while a curl_multi is in use).
-typedef struct Request
-{
-    // These put the request on a doubly-linked list of requests in a
-    // request context, *if* the request is in a request context (else these
-    // will both be 0)
-    struct Request *prev, *next;
-
-    // The status of this Request, as will be reported to the user via the
-    // complete callback
-    S3Status status;
-
-    // The HTTP code returned by the S3 server, if it is known.  Would rather
-    // not have to keep track of this but S3 doesn't always indicate its
-    // errors the same way
-    int httpResponseCode;
-
-    // The HTTP headers to use for the curl request
-    struct curl_slist *headers;
-
-    // The CURL structure driving the request
-    CURL *curl;
-
-    // libcurl requires that the uri be stored outside of the curl handle
-    char uri[MAX_URI_SIZE + 1];
-
-    // Callback to be made when headers are available.  Might not be called.
-    S3ResponsePropertiesCallback *propertiesCallback;
-
-    // Callback to be made to supply data to send to S3.  Might not be called.
-    S3PutObjectDataCallback *toS3Callback;
-
-    // Number of bytes total that readCallback has left to supply
-    int64_t toS3CallbackBytesRemaining;
-
-    // Callback to be made that supplies data read from S3.
-    // Might not be called.
-    S3GetObjectDataCallback *fromS3Callback;
-
-    // Callback to be made when request is complete.  This will *always* be
-    // called.
-    S3ResponseCompleteCallback *completeCallback;
-
-    // Data passed to the callbacks
-    void *callbackData;
-
-    // Handler of response headers
-    ResponseHeadersHandler responseHeadersHandler;
-
-    // This is set to nonzero after the properties callback has been made
-    int propertiesCallbackMade;
-
-    // Parser of errors
-    ErrorParser errorParser;
-} Request;
-
-
-// Request functions
-// ----------------------------------------------------------------------------
-
-// Initialize the API
-S3Status request_api_initialize(const char *userAgentInfo, int flags,
-                                const char *hostName);
-
-// Deinitialize the API
-void request_api_deinitialize();
-
-// Perform a request; if context is 0, performs the request immediately;
-// otherwise, sets it up to be performed by context.
-void request_perform(const RequestParams *params, S3RequestContext *context);
-
-// Called by the internal request code or internal request context code when a
-// curl has finished the request
-void request_finish(Request *request);
-
-// Convert a CURLE code to an S3Status
-S3Status request_curl_code_to_status(CURLcode code);
-
-
-#endif /* REQUEST_H */
diff --git a/src/libs3/inc/request_context.h b/src/libs3/inc/request_context.h
deleted file mode 100644
index 8074c50..0000000
--- a/src/libs3/inc/request_context.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/** **************************************************************************
- * request_context.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef REQUEST_CONTEXT_H
-#define REQUEST_CONTEXT_H
-
-#include "libs3.h"
-
-struct S3RequestContext
-{
-    CURLM *curlm;
-
-    struct Request *requests;
-};
-
-
-#endif /* REQUEST_CONTEXT_H */
diff --git a/src/libs3/inc/response_headers_handler.h b/src/libs3/inc/response_headers_handler.h
deleted file mode 100644
index 2813e9a..0000000
--- a/src/libs3/inc/response_headers_handler.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/** **************************************************************************
- * response_headers_handler.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef RESPONSE_HEADERS_HANDLER_H
-#define RESPONSE_HEADERS_HANDLER_H
-
-#include "libs3.h"
-#include "string_buffer.h"
-#include "util.h"
-
-
-typedef struct ResponseHeadersHandler
-{
-    // The structure to pass to the headers callback.  This is filled in by
-    // the ResponseHeadersHandler from the headers added to it.
-    S3ResponseProperties responseProperties;
-
-    // Set to 1 after the done call has been made
-    int done;
-
-    // copied into here.  We allow 128 bytes for each header, plus \0 term.
-    string_multibuffer(responsePropertyStrings, 5 * 129);
-
-    // responseproperties.metaHeaders strings get copied into here
-    string_multibuffer(responseMetaDataStrings, 
-                       COMPACTED_METADATA_BUFFER_SIZE);
-
-    // Response meta data
-    S3NameValue responseMetaData[S3_MAX_METADATA_COUNT];
-} ResponseHeadersHandler;
-
-
-void response_headers_handler_initialize(ResponseHeadersHandler *handler);
-
-void response_headers_handler_add(ResponseHeadersHandler *handler,
-                                  char *data, int dataLen);
-
-void response_headers_handler_done(ResponseHeadersHandler *handler, 
-                                   CURL *curl);
-
-#endif /* RESPONSE_HEADERS_HANDLER_H */
diff --git a/src/libs3/inc/simplexml.h b/src/libs3/inc/simplexml.h
deleted file mode 100644
index 704db07..0000000
--- a/src/libs3/inc/simplexml.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/** **************************************************************************
- * simplexml.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef SIMPLEXML_H
-#define SIMPLEXML_H
-
-#include "libs3.h"
-
-
-// Simple XML callback.
-//
-// elementPath: is the full "path" of the element; i.e.
-// <foo><bar><baz>data</baz></bar></foo> would have 'data' in the element
-// foo/bar/baz.
-// 
-// Return of anything other than S3StatusOK causes the calling
-// simplexml_add() function to immediately stop and return the status.
-//
-// data is passed in as 0 on end of element
-typedef S3Status (SimpleXmlCallback)(const char *elementPath, const char *data,
-                                     int dataLen, void *callbackData);
-
-typedef struct SimpleXml
-{
-    void *xmlParser;
-
-    SimpleXmlCallback *callback;
-
-    void *callbackData;
-
-    char elementPath[512];
-
-    int elementPathLen;
-
-    S3Status status;
-} SimpleXml;
-
-
-// Simple XML parsing
-// ----------------------------------------------------------------------------
-
-// Always call this, even if the simplexml doesn't end up being used
-void simplexml_initialize(SimpleXml *simpleXml, SimpleXmlCallback *callback,
-                          void *callbackData);
-
-S3Status simplexml_add(SimpleXml *simpleXml, const char *data, int dataLen);
-
-
-// Always call this
-void simplexml_deinitialize(SimpleXml *simpleXml);
-
-
-#endif /* SIMPLEXML_H */
diff --git a/src/libs3/inc/string_buffer.h b/src/libs3/inc/string_buffer.h
deleted file mode 100644
index eed9bd4..0000000
--- a/src/libs3/inc/string_buffer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/** **************************************************************************
- * string_buffer.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef STRING_BUFFER_H
-#define STRING_BUFFER_H
-
-#include <stdio.h>
-
-
-// Declare a string_buffer with the given name of the given maximum length
-#define string_buffer(name, len)                                        \
-    char name[len + 1];                                                 \
-    int name##Len
-
-
-// Initialize a string_buffer
-#define string_buffer_initialize(sb)                                    \
-    do {                                                                \
-        sb[0] = 0;                                                      \
-        sb##Len = 0;                                                    \
-    } while (0)
-
-
-// Append [len] bytes of [str] to [sb], setting [all_fit] to 1 if it fit, and
-// 0 if it did not
-#define string_buffer_append(sb, str, len, all_fit)                     \
-    do {                                                                \
-        sb##Len += snprintf(&(sb[sb##Len]), sizeof(sb) - sb##Len - 1,   \
-                            "%.*s", (int) (len), str);                  \
-        if (sb##Len > (int) (sizeof(sb) - 1)) {                         \
-            sb##Len = sizeof(sb) - 1;                                   \
-            all_fit = 0;                                                \
-        }                                                               \
-        else {                                                          \
-            all_fit = 1;                                                \
-        }                                                               \
-    } while (0)
-
-
-// Declare a string multibuffer with the given name of the given maximum size
-#define string_multibuffer(name, size)                                  \
-    char name[size];                                                    \
-    int name##Size
-
-
-// Initialize a string_multibuffer
-#define string_multibuffer_initialize(smb)                              \
-    do {                                                                \
-        smb##Size = 0;                                                  \
-    } while (0)
-
-
-// Evaluates to the current string within the string_multibuffer
-#define string_multibuffer_current(smb)                                  \
-    &(smb[smb##Size])
-
-
-// Adds a new string to the string_multibuffer
-#define string_multibuffer_add(smb, str, len, all_fit)                  \
-    do {                                                                \
-        smb##Size += (snprintf(&(smb[smb##Size]),                       \
-                               sizeof(smb) - smb##Size,                 \
-                               "%.*s", (int) (len), str) + 1);          \
-        if (smb##Size > (int) sizeof(smb)) {                            \
-            smb##Size = sizeof(smb);                                    \
-            all_fit = 0;                                                \
-        }                                                               \
-        else {                                                          \
-            all_fit = 1;                                                \
-        }                                                               \
-    } while (0)
-
-
-// Appends to the current string in the string_multibuffer.  There must be a
-// current string, meaning that string_multibuffer_add must have been called
-// at least once for this string_multibuffer.
-#define string_multibuffer_append(smb, str, len, all_fit)               \
-    do {                                                                \
-        smb##Size--;                                                    \
-        string_multibuffer_add(smb, str, len, all_fit);                 \
-    } while (0)
-
-
-#endif /* STRING_BUFFER_H */
diff --git a/src/libs3/inc/util.h b/src/libs3/inc/util.h
deleted file mode 100644
index 94ed0e7..0000000
--- a/src/libs3/inc/util.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/** **************************************************************************
- * util.h
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#ifndef UTIL_H
-#define UTIL_H
-
-#include <curl/curl.h>
-#include <curl/multi.h>
-#include <stdint.h>
-#include "libs3.h"
-
-// acl groups
-#define ACS_URL "http://acs.amazonaws.com/groups/"
-
-#define ACS_GROUP_ALL_USERS     ACS_URL "global/AllUsers"
-#define ACS_GROUP_AWS_USERS     ACS_URL "global/AuthenticatedUsers"
-#define ACS_GROUP_LOG_DELIVERY  ACS_URL "s3/LogDelivery"
-
-
-
-// Derived from S3 documentation
-
-// This is the maximum number of bytes needed in a "compacted meta header"
-// buffer, which is a buffer storing all of the compacted meta headers.
-#define COMPACTED_METADATA_BUFFER_SIZE \
-    (S3_MAX_METADATA_COUNT * sizeof(S3_METADATA_HEADER_NAME_PREFIX "n: v"))
-
-// Maximum url encoded key size; since every single character could require
-// URL encoding, it's 3 times the size of a key (since each url encoded
-// character takes 3 characters: %NN)
-#define MAX_URLENCODED_KEY_SIZE (3 * S3_MAX_KEY_SIZE)
-
-// This is the maximum size of a URI that could be passed to S3:
-// https://s3.amazonaws.com/${BUCKET}/${KEY}?acl
-// 255 is the maximum bucket length
-#define MAX_URI_SIZE \
-    ((sizeof("https:///") - 1) + S3_MAX_HOSTNAME_SIZE + 255 + 1 +       \
-     MAX_URLENCODED_KEY_SIZE + (sizeof("?torrent") - 1) + 1)
-
-// Maximum size of a canonicalized resource
-#define MAX_CANONICALIZED_RESOURCE_SIZE \
-    (1 + 255 + 1 + MAX_URLENCODED_KEY_SIZE + (sizeof("?torrent") - 1) + 1)
-
-
-// Utilities -----------------------------------------------------------------
-
-// URL-encodes a string from [src] into [dest].  [dest] must have at least
-// 3x the number of characters that [source] has.   At most [maxSrcSize] bytes
-// from [src] are encoded; if more are present in [src], 0 is returned from
-// urlEncode, else nonzero is returned.
-int urlEncode(char *dest, const char *src, int maxSrcSize);
-
-// Returns < 0 on failure >= 0 on success
-int64_t parseIso8601Time(const char *str);
-
-uint64_t parseUnsignedInt(const char *str);
-
-// base64 encode bytes.  The output buffer must have at least
-// ((4 * (inLen + 1)) / 3) bytes in it.  Returns the number of bytes written
-// to [out].
-int base64Encode(const unsigned char *in, int inLen, char *out);
-
-// Compute HMAC-SHA-1 with key [key] and message [message], storing result
-// in [hmac]
-void HMAC_SHA1(unsigned char hmac[20], const unsigned char *key, int key_len,
-               const unsigned char *message, int message_len);
-
-// Compute a 64-bit hash values given a set of bytes
-uint64_t hash(const unsigned char *k, int length);
-
-// Because Windows seems to be missing isblank(), use our own; it's a very
-// easy function to write in any case
-int is_blank(char c);
-
-#endif /* UTIL_H */
diff --git a/src/libs3/libs3.spec b/src/libs3/libs3.spec
deleted file mode 100644
index 95c3678..0000000
--- a/src/libs3/libs3.spec
+++ /dev/null
@@ -1,81 +0,0 @@
-Summary: C Library and Tools for Amazon S3 Access
-Name: libs3
-Version: trunk
-Release: 1
-License: GPL
-Group: Networking/Utilities
-URL: http://sourceforge.net/projects/reallibs3
-Source0: libs3-trunk.tar.gz
-Buildroot: %{_tmppath}/%{name}-%{version}-%{release}-root
-# Want to include curl dependencies, but older Fedora Core uses curl-devel,
-# and newer Fedora Core uses libcurl-devel ... have to figure out how to
-# handle this problem, but for now, just don't check for any curl libraries
-# Buildrequires: curl-devel
-Buildrequires: libxml2-devel
-Buildrequires: openssl-devel
-Buildrequires: make
-# Requires: libcurl
-Requires: libxml2
-Requires: openssl
-
-%define debug_package %{nil}
-
-%description
-This package includes the libs3 shared object library, needed to run
-applications compiled against libs3, and additionally contains the s3
-utility for accessing Amazon S3.
-
-%package devel
-Summary: Headers and documentation for libs3
-Group: Development/Libraries
-Requires: %{name} = %{version}-%{release}
-
-%description devel
-This library provides an API for using Amazon's S3 service (see
-http://s3.amazonaws.com).  Its design goals are:
-
- - To provide a simple and straightforward API for accessing all of S3's
-   functionality
- - To not require the developer using libs3 to need to know anything about:
-     - HTTP
-     - XML
-     - SSL
-   In other words, this API is meant to stand on its own, without requiring
-   any implicit knowledge of how S3 services are accessed using HTTP
-   protocols.
- - To be usable from multithreaded code
- - To be usable by code which wants to process multiple S3 requests
-   simultaneously from a single thread
- - To be usable in the simple, straightforward way using sequentialized
-   blocking requests
-
-
-%prep
-%setup -q
-
-%build
-BUILD=$RPM_BUILD_ROOT/build make exported
-
-%install
-BUILD=$RPM_BUILD_ROOT/build DESTDIR=$RPM_BUILD_ROOT/usr make install
-rm -rf $RPM_BUILD_ROOT/build
-
-%clean
-rm -rf $RPM_BUILD_ROOT
-
-%files
-%defattr(-,root,root,-)
-/usr/bin/s3
-/usr/lib/libs3.so*
-
-%files devel
-%defattr(-,root,root,-)
-/usr/include/libs3.h
-/usr/lib/libs3.a
-
-%changelog
-* Sat Aug 09 2008  <bryan at ischo,com> Bryan Ischo
-- Split into regular and devel packages.
-
-* Tue Aug 05 2008  <bryan at ischo,com> Bryan Ischo
-- Initial build.
diff --git a/src/libs3/mswin/libs3.def b/src/libs3/mswin/libs3.def
deleted file mode 100644
index c5bd6d8..0000000
--- a/src/libs3/mswin/libs3.def
+++ /dev/null
@@ -1,27 +0,0 @@
-EXPORTS
-S3_convert_acl
-S3_copy_object
-S3_create_bucket
-S3_create_request_context
-S3_deinitialize
-S3_delete_bucket
-S3_delete_object
-S3_destroy_request_context
-S3_generate_authenticated_query_string
-S3_get_acl
-S3_get_object
-S3_get_request_context_fdsets
-S3_get_server_access_logging
-S3_get_status_name
-S3_head_object
-S3_initialize
-S3_list_bucket
-S3_list_service
-S3_put_object
-S3_runall_request_context
-S3_runonce_request_context
-S3_set_acl
-S3_set_server_access_logging
-S3_status_is_retryable
-S3_test_bucket
-S3_validate_bucket_name
diff --git a/src/libs3/mswin/rmrf.bat b/src/libs3/mswin/rmrf.bat
deleted file mode 100644
index 204efd9..0000000
--- a/src/libs3/mswin/rmrf.bat
+++ /dev/null
@@ -1,9 +0,0 @@
- at echo off
-
-if exist "%1". (
-   rmdir /S /Q "%1"
-)
-
-if exist "%1". (
-   del /Q "%1"
-)
diff --git a/src/libs3/src/acl.c b/src/libs3/src/acl.c
deleted file mode 100644
index 25e4058..0000000
--- a/src/libs3/src/acl.c
+++ /dev/null
@@ -1,348 +0,0 @@
-/** **************************************************************************
- * acl.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <stdlib.h>
-#include <string.h>
-#include "libs3.h"
-#include "request.h"
-
-// Use a rather arbitrary max size for the document of 64K
-#define ACL_XML_DOC_MAXSIZE (64 * 1024)
-
-
-// get acl -------------------------------------------------------------------
-
-typedef struct GetAclData
-{
-    SimpleXml simpleXml;
-
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    int *aclGrantCountReturn;
-    S3AclGrant *aclGrants;
-    char *ownerId;
-    char *ownerDisplayName;
-    string_buffer(aclXmlDocument, ACL_XML_DOC_MAXSIZE);
-} GetAclData;
-
-
-static S3Status getAclPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    GetAclData *gaData = (GetAclData *) callbackData;
-    
-    return (*(gaData->responsePropertiesCallback))
-        (responseProperties, gaData->callbackData);
-}
-
-
-static S3Status getAclDataCallback(int bufferSize, const char *buffer,
-                                   void *callbackData)
-{
-    GetAclData *gaData = (GetAclData *) callbackData;
-
-    int fit;
-
-    string_buffer_append(gaData->aclXmlDocument, buffer, bufferSize, fit);
-    
-    return fit ? S3StatusOK : S3StatusXmlDocumentTooLarge;
-}
-
-
-static void getAclCompleteCallback(S3Status requestStatus, 
-                                   const S3ErrorDetails *s3ErrorDetails,
-                                   void *callbackData)
-{
-    GetAclData *gaData = (GetAclData *) callbackData;
-
-    if (requestStatus == S3StatusOK) {
-        // Parse the document
-        requestStatus = S3_convert_acl
-            (gaData->aclXmlDocument, gaData->ownerId, gaData->ownerDisplayName,
-             gaData->aclGrantCountReturn, gaData->aclGrants);
-    }
-
-    (*(gaData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, gaData->callbackData);
-
-    free(gaData);
-}
-
-
-void S3_get_acl(const S3BucketContext *bucketContext, const char *key, 
-                char *ownerId, char *ownerDisplayName,
-                int *aclGrantCountReturn, S3AclGrant *aclGrants, 
-                S3RequestContext *requestContext,
-                const S3ResponseHandler *handler, void *callbackData)
-{
-    // Create the callback data
-    GetAclData *gaData = (GetAclData *) malloc(sizeof(GetAclData));
-    if (!gaData) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    gaData->responsePropertiesCallback = handler->propertiesCallback;
-    gaData->responseCompleteCallback = handler->completeCallback;
-    gaData->callbackData = callbackData;
-
-    gaData->aclGrantCountReturn = aclGrantCountReturn;
-    gaData->aclGrants = aclGrants;
-    gaData->ownerId = ownerId;
-    gaData->ownerDisplayName = ownerDisplayName;
-    string_buffer_initialize(gaData->aclXmlDocument);
-    *aclGrantCountReturn = 0;
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeGET,                           // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        key,                                          // key
-        0,                                            // queryParams
-        "acl",                                        // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        &getAclPropertiesCallback,                    // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        &getAclDataCallback,                          // fromS3Callback
-        &getAclCompleteCallback,                      // completeCallback
-        gaData                                        // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
-// set acl -------------------------------------------------------------------
-
-static S3Status generateAclXmlDocument(const char *ownerId, 
-                                       const char *ownerDisplayName,
-                                       int aclGrantCount, 
-                                       const S3AclGrant *aclGrants,
-                                       int *xmlDocumentLenReturn,
-                                       char *xmlDocument,
-                                       int xmlDocumentBufferSize)
-{
-    *xmlDocumentLenReturn = 0;
-
-#define append(fmt, ...)                                        \
-    do {                                                        \
-        *xmlDocumentLenReturn += snprintf                       \
-            (&(xmlDocument[*xmlDocumentLenReturn]),             \
-             xmlDocumentBufferSize - *xmlDocumentLenReturn - 1, \
-             fmt, __VA_ARGS__);                                 \
-        if (*xmlDocumentLenReturn >= xmlDocumentBufferSize) {   \
-            return S3StatusXmlDocumentTooLarge;                 \
-        } \
-    } while (0)
-
-    append("<AccessControlPolicy><Owner><ID>%s</ID><DisplayName>%s"
-           "</DisplayName></Owner><AccessControlList>", ownerId,
-           ownerDisplayName);
-
-    int i;
-    for (i = 0; i < aclGrantCount; i++) {
-        append("%s", "<Grant><Grantee xmlns:xsi=\"http://www.w3.org/2001/"
-               "XMLSchema-instance\" xsi:type=\"");
-        const S3AclGrant *grant = &(aclGrants[i]);
-        switch (grant->granteeType) {
-        case S3GranteeTypeAmazonCustomerByEmail:
-            append("AmazonCustomerByEmail\"><EmailAddress>%s</EmailAddress>",
-                   grant->grantee.amazonCustomerByEmail.emailAddress);
-            break;
-        case S3GranteeTypeCanonicalUser:
-            append("CanonicalUser\"><ID>%s</ID><DisplayName>%s</DisplayName>",
-                   grant->grantee.canonicalUser.id, 
-                   grant->grantee.canonicalUser.displayName);
-            break;
-        default: { // case S3GranteeTypeAllAwsUsers/S3GranteeTypeAllUsers:
-            const char *grantee;
-            switch (grant->granteeType) {
-            case S3GranteeTypeAllAwsUsers:
-                grantee = ACS_GROUP_AWS_USERS;
-                break;
-            case S3GranteeTypeAllUsers:
-                grantee = ACS_GROUP_ALL_USERS;
-                break;
-            default:
-                grantee = ACS_GROUP_LOG_DELIVERY;
-                break;
-            }
-            append("Group\"><URI>%s</URI>", grantee);
-        }
-            break;
-        }
-        append("</Grantee><Permission>%s</Permission></Grant>",
-               ((grant->permission == S3PermissionRead) ? "READ" :
-                (grant->permission == S3PermissionWrite) ? "WRITE" :
-                (grant->permission == S3PermissionReadACP) ? "READ_ACP" :
-                (grant->permission == S3PermissionWriteACP) ? "WRITE_ACP" :
-                "FULL_CONTROL"));
-    }
-
-    append("%s", "</AccessControlList></AccessControlPolicy>");
-
-    return S3StatusOK;
-}
-
-
-typedef struct SetAclData
-{
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    int aclXmlDocumentLen;
-    char aclXmlDocument[ACL_XML_DOC_MAXSIZE];
-    int aclXmlDocumentBytesWritten;
-
-} SetAclData;
-
-
-static S3Status setAclPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    SetAclData *paData = (SetAclData *) callbackData;
-    
-    return (*(paData->responsePropertiesCallback))
-        (responseProperties, paData->callbackData);
-}
-
-
-static int setAclDataCallback(int bufferSize, char *buffer, void *callbackData)
-{
-    SetAclData *paData = (SetAclData *) callbackData;
-
-    int remaining = (paData->aclXmlDocumentLen - 
-                     paData->aclXmlDocumentBytesWritten);
-
-    int toCopy = bufferSize > remaining ? remaining : bufferSize;
-    
-    if (!toCopy) {
-        return 0;
-    }
-
-    memcpy(buffer, &(paData->aclXmlDocument
-                     [paData->aclXmlDocumentBytesWritten]), toCopy);
-
-    paData->aclXmlDocumentBytesWritten += toCopy;
-
-    return toCopy;
-}
-
-
-static void setAclCompleteCallback(S3Status requestStatus, 
-                                   const S3ErrorDetails *s3ErrorDetails,
-                                   void *callbackData)
-{
-    SetAclData *paData = (SetAclData *) callbackData;
-
-    (*(paData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, paData->callbackData);
-
-    free(paData);
-}
-
-
-void S3_set_acl(const S3BucketContext *bucketContext, const char *key,
-                const char *ownerId, const char *ownerDisplayName,
-                int aclGrantCount, const S3AclGrant *aclGrants,
-                S3RequestContext *requestContext,
-                const S3ResponseHandler *handler, void *callbackData)
-{
-    if (aclGrantCount > S3_MAX_ACL_GRANT_COUNT) {
-        (*(handler->completeCallback))
-            (S3StatusTooManyGrants, 0, callbackData);
-        return;
-    }
-
-    SetAclData *data = (SetAclData *) malloc(sizeof(SetAclData));
-    if (!data) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-    
-    // Convert aclGrants to XML document
-    S3Status status = generateAclXmlDocument
-        (ownerId, ownerDisplayName, aclGrantCount, aclGrants,
-         &(data->aclXmlDocumentLen), data->aclXmlDocument, 
-         sizeof(data->aclXmlDocument));
-    if (status != S3StatusOK) {
-        free(data);
-        (*(handler->completeCallback))(status, 0, callbackData);
-        return;
-    }
-
-    data->responsePropertiesCallback = handler->propertiesCallback;
-    data->responseCompleteCallback = handler->completeCallback;
-    data->callbackData = callbackData;
-
-    data->aclXmlDocumentBytesWritten = 0;
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypePUT,                           // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        key,                                          // key
-        0,                                            // queryParams
-        "acl",                                        // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        &setAclPropertiesCallback,                    // propertiesCallback
-        &setAclDataCallback,                          // toS3Callback
-        data->aclXmlDocumentLen,                      // toS3CallbackTotalSize
-        0,                                            // fromS3Callback
-        &setAclCompleteCallback,                      // completeCallback
-        data                                          // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
diff --git a/src/libs3/src/bucket.c b/src/libs3/src/bucket.c
deleted file mode 100644
index f4f1987..0000000
--- a/src/libs3/src/bucket.c
+++ /dev/null
@@ -1,743 +0,0 @@
-/** **************************************************************************
- * bucket.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <string.h>
-#include <stdlib.h>
-#include "libs3.h"
-#include "request.h"
-#include "simplexml.h"
-
-// test bucket ---------------------------------------------------------------
-
-typedef struct TestBucketData
-{
-    SimpleXml simpleXml;
-
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    int locationConstraintReturnSize;
-    char *locationConstraintReturn;
-
-    string_buffer(locationConstraint, 256);
-} TestBucketData;
-
-
-static S3Status testBucketXmlCallback(const char *elementPath,
-                                      const char *data, int dataLen,
-                                      void *callbackData)
-{
-    TestBucketData *tbData = (TestBucketData *) callbackData;
-
-    int fit;
-
-    if (data && !strcmp(elementPath, "LocationConstraint")) {
-        string_buffer_append(tbData->locationConstraint, data, dataLen, fit);
-    }
-
-    /* Avoid compiler error about variable set but not used */
-    (void) fit;
-
-    return S3StatusOK;
-}
-
-
-static S3Status testBucketPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    TestBucketData *tbData = (TestBucketData *) callbackData;
-    
-    return (*(tbData->responsePropertiesCallback))
-        (responseProperties, tbData->callbackData);
-}
-
-
-static S3Status testBucketDataCallback(int bufferSize, const char *buffer,
-                                       void *callbackData)
-{
-    TestBucketData *tbData = (TestBucketData *) callbackData;
-
-    return simplexml_add(&(tbData->simpleXml), buffer, bufferSize);
-}
-
-
-static void testBucketCompleteCallback(S3Status requestStatus, 
-                                       const S3ErrorDetails *s3ErrorDetails,
-                                       void *callbackData)
-{
-    TestBucketData *tbData = (TestBucketData *) callbackData;
-
-    // Copy the location constraint into the return buffer
-    snprintf(tbData->locationConstraintReturn, 
-             tbData->locationConstraintReturnSize, "%s", 
-             tbData->locationConstraint);
-
-    (*(tbData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, tbData->callbackData);
-
-    simplexml_deinitialize(&(tbData->simpleXml));
-
-    free(tbData);
-}
-
-
-void S3_test_bucket(S3Protocol protocol, S3UriStyle uriStyle,
-                    const char *accessKeyId, const char *secretAccessKey,
-                    const char *hostName, const char *bucketName,
-                    int locationConstraintReturnSize,
-                    char *locationConstraintReturn,
-                    S3RequestContext *requestContext,
-                    const S3ResponseHandler *handler, void *callbackData)
-{
-    // Create the callback data
-    TestBucketData *tbData = 
-        (TestBucketData *) malloc(sizeof(TestBucketData));
-    if (!tbData) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    simplexml_initialize(&(tbData->simpleXml), &testBucketXmlCallback, tbData);
-
-    tbData->responsePropertiesCallback = handler->propertiesCallback;
-    tbData->responseCompleteCallback = handler->completeCallback;
-    tbData->callbackData = callbackData;
-
-    tbData->locationConstraintReturnSize = locationConstraintReturnSize;
-    tbData->locationConstraintReturn = locationConstraintReturn;
-    string_buffer_initialize(tbData->locationConstraint);
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeGET,                           // httpRequestType
-        { hostName,                                   // hostName
-          bucketName,                                 // bucketName
-          protocol,                                   // protocol
-          uriStyle,                                   // uriStyle
-          accessKeyId,                                // accessKeyId
-          secretAccessKey },                          // secretAccessKey
-        0,                                            // key
-        0,                                            // queryParams
-        "location",                                   // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        &testBucketPropertiesCallback,                // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        &testBucketDataCallback,                      // fromS3Callback
-        &testBucketCompleteCallback,                  // completeCallback
-        tbData                                        // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
-// create bucket -------------------------------------------------------------
-
-typedef struct CreateBucketData
-{
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    char doc[1024];
-    int docLen, docBytesWritten;
-} CreateBucketData;                         
-                            
-
-static S3Status createBucketPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    CreateBucketData *cbData = (CreateBucketData *) callbackData;
-    
-    return (*(cbData->responsePropertiesCallback))
-        (responseProperties, cbData->callbackData);
-}
-
-
-static int createBucketDataCallback(int bufferSize, char *buffer, 
-                                    void *callbackData)
-{
-    CreateBucketData *cbData = (CreateBucketData *) callbackData;
-
-    if (!cbData->docLen) {
-        return 0;
-    }
-
-    int remaining = (cbData->docLen - cbData->docBytesWritten);
-
-    int toCopy = bufferSize > remaining ? remaining : bufferSize;
-    
-    if (!toCopy) {
-        return 0;
-    }
-
-    memcpy(buffer, &(cbData->doc[cbData->docBytesWritten]), toCopy);
-
-    cbData->docBytesWritten += toCopy;
-
-    return toCopy;
-}
-
-
-static void createBucketCompleteCallback(S3Status requestStatus, 
-                                         const S3ErrorDetails *s3ErrorDetails,
-                                         void *callbackData)
-{
-    CreateBucketData *cbData = (CreateBucketData *) callbackData;
-
-    (*(cbData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, cbData->callbackData);
-
-    free(cbData);
-}
-
-
-void S3_create_bucket(S3Protocol protocol, const char *accessKeyId,
-                      const char *secretAccessKey, const char *hostName,
-                      const char *bucketName, S3CannedAcl cannedAcl,
-                      const char *locationConstraint,
-                      S3RequestContext *requestContext,
-                      const S3ResponseHandler *handler, void *callbackData)
-{
-    // Create the callback data
-    CreateBucketData *cbData = 
-        (CreateBucketData *) malloc(sizeof(CreateBucketData));
-    if (!cbData) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    cbData->responsePropertiesCallback = handler->propertiesCallback;
-    cbData->responseCompleteCallback = handler->completeCallback;
-    cbData->callbackData = callbackData;
-
-    if (locationConstraint) {
-        cbData->docLen =
-            snprintf(cbData->doc, sizeof(cbData->doc),
-                     "<CreateBucketConfiguration><LocationConstraint>"
-                     "%s</LocationConstraint></CreateBucketConfiguration>",
-                     locationConstraint);
-        cbData->docBytesWritten = 0;
-    }
-    else {
-        cbData->docLen = 0;
-    }
-    
-    // Set up S3PutProperties
-    S3PutProperties properties =
-    {
-        0,                                       // contentType
-        0,                                       // md5
-        0,                                       // cacheControl
-        0,                                       // contentDispositionFilename
-        0,                                       // contentEncoding
-        0,                                       // expires
-        cannedAcl,                               // cannedAcl
-        0,                                       // metaDataCount
-        0                                        // metaData
-    };
-    
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypePUT,                           // httpRequestType
-        { hostName,                                   // hostName
-          bucketName,                                 // bucketName
-          protocol,                                   // protocol
-          S3UriStylePath,                             // uriStyle
-          accessKeyId,                                // accessKeyId
-          secretAccessKey },                          // secretAccessKey
-        0,                                            // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        &properties,                                  // putProperties
-        &createBucketPropertiesCallback,              // propertiesCallback
-        &createBucketDataCallback,                    // toS3Callback
-        cbData->docLen,                               // toS3CallbackTotalSize
-        0,                                            // fromS3Callback
-        &createBucketCompleteCallback,                // completeCallback
-        cbData                                        // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-                           
-// delete bucket -------------------------------------------------------------
-
-typedef struct DeleteBucketData
-{
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-} DeleteBucketData;
-
-
-static S3Status deleteBucketPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    DeleteBucketData *dbData = (DeleteBucketData *) callbackData;
-    
-    return (*(dbData->responsePropertiesCallback))
-        (responseProperties, dbData->callbackData);
-}
-
-
-static void deleteBucketCompleteCallback(S3Status requestStatus, 
-                                         const S3ErrorDetails *s3ErrorDetails,
-                                         void *callbackData)
-{
-    DeleteBucketData *dbData = (DeleteBucketData *) callbackData;
-
-    (*(dbData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, dbData->callbackData);
-
-    free(dbData);
-}
-
-
-void S3_delete_bucket(S3Protocol protocol, S3UriStyle uriStyle,
-                      const char *accessKeyId, const char *secretAccessKey,
-                      const char *hostName, const char *bucketName,
-                      S3RequestContext *requestContext,
-                      const S3ResponseHandler *handler, void *callbackData)
-{
-    // Create the callback data
-    DeleteBucketData *dbData = 
-        (DeleteBucketData *) malloc(sizeof(DeleteBucketData));
-    if (!dbData) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    dbData->responsePropertiesCallback = handler->propertiesCallback;
-    dbData->responseCompleteCallback = handler->completeCallback;
-    dbData->callbackData = callbackData;
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeDELETE,                        // httpRequestType
-        { hostName,                                   // hostName
-          bucketName,                                 // bucketName
-          protocol,                                   // protocol
-          uriStyle,                                   // uriStyle
-          accessKeyId,                                // accessKeyId
-          secretAccessKey },                          // secretAccessKey
-        0,                                            // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        &deleteBucketPropertiesCallback,              // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        0,                                            // fromS3Callback
-        &deleteBucketCompleteCallback,                // completeCallback
-        dbData                                        // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
-// list bucket ----------------------------------------------------------------
-
-typedef struct ListBucketContents
-{
-    string_buffer(key, 1024);
-    string_buffer(lastModified, 256);
-    string_buffer(eTag, 256);
-    string_buffer(size, 24);
-    string_buffer(ownerId, 256);
-    string_buffer(ownerDisplayName, 256);
-} ListBucketContents;
-
-
-static void initialize_list_bucket_contents(ListBucketContents *contents)
-{
-    string_buffer_initialize(contents->key);
-    string_buffer_initialize(contents->lastModified);
-    string_buffer_initialize(contents->eTag);
-    string_buffer_initialize(contents->size);
-    string_buffer_initialize(contents->ownerId);
-    string_buffer_initialize(contents->ownerDisplayName);
-}
-
-// We read up to 32 Contents at a time
-#define MAX_CONTENTS 32
-// We read up to 8 CommonPrefixes at a time
-#define MAX_COMMON_PREFIXES 8
-
-typedef struct ListBucketData
-{
-    SimpleXml simpleXml;
-
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ListBucketCallback *listBucketCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    string_buffer(isTruncated, 64);
-    string_buffer(nextMarker, 1024);
-
-    int contentsCount;
-    ListBucketContents contents[MAX_CONTENTS];
-
-    int commonPrefixesCount;
-    char commonPrefixes[MAX_COMMON_PREFIXES][1024];
-    int commonPrefixLens[MAX_COMMON_PREFIXES];
-} ListBucketData;
-
-
-static void initialize_list_bucket_data(ListBucketData *lbData)
-{
-    lbData->contentsCount = 0;
-    initialize_list_bucket_contents(lbData->contents);
-    lbData->commonPrefixesCount = 0;
-    lbData->commonPrefixes[0][0] = 0;
-    lbData->commonPrefixLens[0] = 0;
-}
-
-
-static S3Status make_list_bucket_callback(ListBucketData *lbData)
-{
-    int i;
-
-    // Convert IsTruncated
-    int isTruncated = (!strcmp(lbData->isTruncated, "true") ||
-                       !strcmp(lbData->isTruncated, "1")) ? 1 : 0;
-
-    // Convert the contents
-    S3ListBucketContent contents[lbData->contentsCount];
-
-    int contentsCount = lbData->contentsCount;
-    for (i = 0; i < contentsCount; i++) {
-        S3ListBucketContent *contentDest = &(contents[i]);
-        ListBucketContents *contentSrc = &(lbData->contents[i]);
-        contentDest->key = contentSrc->key;
-        contentDest->lastModified = 
-            parseIso8601Time(contentSrc->lastModified);
-        contentDest->eTag = contentSrc->eTag;
-        contentDest->size = parseUnsignedInt(contentSrc->size);
-        contentDest->ownerId =
-            contentSrc->ownerId[0] ?contentSrc->ownerId : 0;
-        contentDest->ownerDisplayName = (contentSrc->ownerDisplayName[0] ?
-                                         contentSrc->ownerDisplayName : 0);
-    }
-
-    // Make the common prefixes array
-    int commonPrefixesCount = lbData->commonPrefixesCount;
-    char *commonPrefixes[commonPrefixesCount];
-    for (i = 0; i < commonPrefixesCount; i++) {
-        commonPrefixes[i] = lbData->commonPrefixes[i];
-    }
-
-    return (*(lbData->listBucketCallback))
-        (isTruncated, lbData->nextMarker,
-         contentsCount, contents, commonPrefixesCount, 
-         (const char **) commonPrefixes, lbData->callbackData);
-}
-
-
-static S3Status listBucketXmlCallback(const char *elementPath,
-                                      const char *data, int dataLen,
-                                      void *callbackData)
-{
-    ListBucketData *lbData = (ListBucketData *) callbackData;
-
-    int fit;
-
-    if (data) {
-        if (!strcmp(elementPath, "ListBucketResult/IsTruncated")) {
-            string_buffer_append(lbData->isTruncated, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, "ListBucketResult/NextMarker")) {
-            string_buffer_append(lbData->nextMarker, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, "ListBucketResult/Contents/Key")) {
-            ListBucketContents *contents = 
-                &(lbData->contents[lbData->contentsCount]);
-            string_buffer_append(contents->key, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, 
-                         "ListBucketResult/Contents/LastModified")) {
-            ListBucketContents *contents = 
-                &(lbData->contents[lbData->contentsCount]);
-            string_buffer_append(contents->lastModified, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, "ListBucketResult/Contents/ETag")) {
-            ListBucketContents *contents = 
-                &(lbData->contents[lbData->contentsCount]);
-            string_buffer_append(contents->eTag, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, "ListBucketResult/Contents/Size")) {
-            ListBucketContents *contents = 
-                &(lbData->contents[lbData->contentsCount]);
-            string_buffer_append(contents->size, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, "ListBucketResult/Contents/Owner/ID")) {
-            ListBucketContents *contents = 
-                &(lbData->contents[lbData->contentsCount]);
-            string_buffer_append(contents->ownerId, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, 
-                         "ListBucketResult/Contents/Owner/DisplayName")) {
-            ListBucketContents *contents = 
-                &(lbData->contents[lbData->contentsCount]);
-            string_buffer_append
-                (contents->ownerDisplayName, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, 
-                         "ListBucketResult/CommonPrefixes/Prefix")) {
-            int which = lbData->commonPrefixesCount;
-            lbData->commonPrefixLens[which] +=
-                snprintf(lbData->commonPrefixes[which],
-                         sizeof(lbData->commonPrefixes[which]) -
-                         lbData->commonPrefixLens[which] - 1,
-                         "%.*s", dataLen, data);
-            if (lbData->commonPrefixLens[which] >=
-                (int) sizeof(lbData->commonPrefixes[which])) {
-                return S3StatusXmlParseFailure;
-            }
-        }
-    }
-    else {
-        if (!strcmp(elementPath, "ListBucketResult/Contents")) {
-            // Finished a Contents
-            lbData->contentsCount++;
-            if (lbData->contentsCount == MAX_CONTENTS) {
-                // Make the callback
-                S3Status status = make_list_bucket_callback(lbData);
-                if (status != S3StatusOK) {
-                    return status;
-                }
-                initialize_list_bucket_data(lbData);
-            }
-            else {
-                // Initialize the next one
-                initialize_list_bucket_contents
-                    (&(lbData->contents[lbData->contentsCount]));
-            }
-        }
-        else if (!strcmp(elementPath,
-                         "ListBucketResult/CommonPrefixes/Prefix")) {
-            // Finished a Prefix
-            lbData->commonPrefixesCount++;
-            if (lbData->commonPrefixesCount == MAX_COMMON_PREFIXES) {
-                // Make the callback
-                S3Status status = make_list_bucket_callback(lbData);
-                if (status != S3StatusOK) {
-                    return status;
-                }
-                initialize_list_bucket_data(lbData);
-            }
-            else {
-                // Initialize the next one
-                lbData->commonPrefixes[lbData->commonPrefixesCount][0] = 0;
-                lbData->commonPrefixLens[lbData->commonPrefixesCount] = 0;
-            }
-        }
-    }
-
-    /* Avoid compiler error about variable set but not used */
-    (void) fit;
-
-    return S3StatusOK;
-}
-
-
-static S3Status listBucketPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    ListBucketData *lbData = (ListBucketData *) callbackData;
-    
-    return (*(lbData->responsePropertiesCallback))
-        (responseProperties, lbData->callbackData);
-}
-
-
-static S3Status listBucketDataCallback(int bufferSize, const char *buffer, 
-                                       void *callbackData)
-{
-    ListBucketData *lbData = (ListBucketData *) callbackData;
-    
-    return simplexml_add(&(lbData->simpleXml), buffer, bufferSize);
-}
-
-
-static void listBucketCompleteCallback(S3Status requestStatus, 
-                                       const S3ErrorDetails *s3ErrorDetails,
-                                       void *callbackData)
-{
-    ListBucketData *lbData = (ListBucketData *) callbackData;
-
-    // Make the callback if there is anything
-    if (lbData->contentsCount || lbData->commonPrefixesCount) {
-        make_list_bucket_callback(lbData);
-    }
-
-    (*(lbData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, lbData->callbackData);
-
-    simplexml_deinitialize(&(lbData->simpleXml));
-
-    free(lbData);
-}
-
-
-void S3_list_bucket(const S3BucketContext *bucketContext, const char *prefix,
-                    const char *marker, const char *delimiter, int maxkeys,
-                    S3RequestContext *requestContext,
-                    const S3ListBucketHandler *handler, void *callbackData)
-{
-    // Compose the query params
-    string_buffer(queryParams, 4096);
-    string_buffer_initialize(queryParams);
-    
-#define safe_append(name, value)                                        \
-    do {                                                                \
-        int fit;                                                        \
-        if (amp) {                                                      \
-            string_buffer_append(queryParams, "&", 1, fit);             \
-            if (!fit) {                                                 \
-                (*(handler->responseHandler.completeCallback))          \
-                    (S3StatusQueryParamsTooLong, 0, callbackData);      \
-                return;                                                 \
-            }                                                           \
-        }                                                               \
-        string_buffer_append(queryParams, name "=",                     \
-                             sizeof(name "=") - 1, fit);                \
-        if (!fit) {                                                     \
-            (*(handler->responseHandler.completeCallback))              \
-                (S3StatusQueryParamsTooLong, 0, callbackData);          \
-            return;                                                     \
-        }                                                               \
-        amp = 1;                                                        \
-        char encoded[3 * 1024];                                         \
-        if (!urlEncode(encoded, value, 1024)) {                         \
-            (*(handler->responseHandler.completeCallback))              \
-                (S3StatusQueryParamsTooLong, 0, callbackData);          \
-            return;                                                     \
-        }                                                               \
-        string_buffer_append(queryParams, encoded, strlen(encoded),     \
-                             fit);                                      \
-        if (!fit) {                                                     \
-            (*(handler->responseHandler.completeCallback))              \
-                (S3StatusQueryParamsTooLong, 0, callbackData);          \
-            return;                                                     \
-        }                                                               \
-    } while (0)
-
-
-    int amp = 0;
-    if (prefix) {
-        safe_append("prefix", prefix);
-    }
-    if (marker) {
-        safe_append("marker", marker);
-    }
-    if (delimiter) {
-        safe_append("delimiter", delimiter);
-    }
-    if (maxkeys) {
-        char maxKeysString[64];
-        snprintf(maxKeysString, sizeof(maxKeysString), "%d", maxkeys);
-        safe_append("max-keys", maxKeysString);
-    }
-
-    ListBucketData *lbData =
-        (ListBucketData *) malloc(sizeof(ListBucketData));
-
-    if (!lbData) {
-        (*(handler->responseHandler.completeCallback))
-            (S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    simplexml_initialize(&(lbData->simpleXml), &listBucketXmlCallback, lbData);
-    
-    lbData->responsePropertiesCallback = 
-        handler->responseHandler.propertiesCallback;
-    lbData->listBucketCallback = handler->listBucketCallback;
-    lbData->responseCompleteCallback = 
-        handler->responseHandler.completeCallback;
-    lbData->callbackData = callbackData;
-
-    string_buffer_initialize(lbData->isTruncated);
-    string_buffer_initialize(lbData->nextMarker);
-    initialize_list_bucket_data(lbData);
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeGET,                           // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        0,                                            // key
-        queryParams[0] ? queryParams : 0,             // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        &listBucketPropertiesCallback,                // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        &listBucketDataCallback,                      // fromS3Callback
-        &listBucketCompleteCallback,                  // completeCallback
-        lbData                                        // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
diff --git a/src/libs3/src/error_parser.c b/src/libs3/src/error_parser.c
deleted file mode 100644
index baa206e..0000000
--- a/src/libs3/src/error_parser.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/** **************************************************************************
- * error_parser.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <string.h>
-#include "error_parser.h"
-
-
-static S3Status errorXmlCallback(const char *elementPath, const char *data,
-                                 int dataLen, void *callbackData)
-{
-    // We ignore end of element callbacks because we don't care about them
-    if (!data) {
-        return S3StatusOK;
-    }
-
-    ErrorParser *errorParser = (ErrorParser *) callbackData;
-
-    int fit;
-
-    if (!strcmp(elementPath, "Error")) {
-        // Ignore, this is the Error element itself, we only care about subs
-    }
-    else if (!strcmp(elementPath, "Error/Code")) {
-        string_buffer_append(errorParser->code, data, dataLen, fit);
-    }
-    else if (!strcmp(elementPath, "Error/Message")) {
-        string_buffer_append(errorParser->message, data, dataLen, fit);
-        errorParser->s3ErrorDetails.message = errorParser->message;
-    }
-    else if (!strcmp(elementPath, "Error/Resource")) {
-        string_buffer_append(errorParser->resource, data, dataLen, fit);
-        errorParser->s3ErrorDetails.resource = errorParser->resource;
-    }
-    else if (!strcmp(elementPath, "Error/FurtherDetails")) {
-        string_buffer_append(errorParser->furtherDetails, data, dataLen, fit);
-        errorParser->s3ErrorDetails.furtherDetails = 
-            errorParser->furtherDetails;
-    }
-    else {
-        if (strncmp(elementPath, "Error/", sizeof("Error/") - 1)) {
-            // If for some weird reason it's not within the Error element,
-            // ignore it
-            return S3StatusOK;
-        }
-        // It's an unknown error element.  See if it matches the most
-        // recent error element.
-        const char *elementName = &(elementPath[sizeof("Error/") - 1]);
-        if (errorParser->s3ErrorDetails.extraDetailsCount && 
-            !strcmp(elementName, errorParser->s3ErrorDetails.extraDetails
-                    [errorParser->s3ErrorDetails.extraDetailsCount - 1].name)) {
-            // Append the value
-            string_multibuffer_append(errorParser->extraDetailsNamesValues,
-                                      data, dataLen, fit);
-            // If it didn't fit, remove this extra
-            if (!fit) {
-                errorParser->s3ErrorDetails.extraDetailsCount--;
-            }
-            return S3StatusOK;
-        }
-        // OK, must add another unknown error element, if it will fit.
-        if (errorParser->s3ErrorDetails.extraDetailsCount ==
-            sizeof(errorParser->extraDetails)) {
-            // Won't fit.  Ignore this one.
-            return S3StatusOK;
-        }
-        // Copy in the name and value
-        char *name = string_multibuffer_current
-            (errorParser->extraDetailsNamesValues);
-        int nameLen = strlen(elementName);
-        string_multibuffer_add(errorParser->extraDetailsNamesValues,
-                               elementName, nameLen, fit);
-        if (!fit) {
-            // Name didn't fit; ignore this one.
-            return S3StatusOK;
-        }
-        char *value = string_multibuffer_current
-            (errorParser->extraDetailsNamesValues);
-        string_multibuffer_add(errorParser->extraDetailsNamesValues,
-                               data, dataLen, fit);
-        if (!fit) {
-            // Value didn't fit; ignore this one.
-            return S3StatusOK;
-        }
-        S3NameValue *nv = 
-            &(errorParser->extraDetails
-              [errorParser->s3ErrorDetails.extraDetailsCount++]);
-        nv->name = name;
-        nv->value = value;
-    }
-
-    return S3StatusOK;
-}
-
-
-void error_parser_initialize(ErrorParser *errorParser)
-{
-    errorParser->s3ErrorDetails.message = 0;
-    errorParser->s3ErrorDetails.resource = 0;
-    errorParser->s3ErrorDetails.furtherDetails = 0;
-    errorParser->s3ErrorDetails.extraDetailsCount = 0;
-    errorParser->s3ErrorDetails.extraDetails = errorParser->extraDetails;
-    errorParser->errorXmlParserInitialized = 0;
-    string_buffer_initialize(errorParser->code);
-    string_buffer_initialize(errorParser->message);
-    string_buffer_initialize(errorParser->resource);
-    string_buffer_initialize(errorParser->furtherDetails);
-    string_multibuffer_initialize(errorParser->extraDetailsNamesValues);
-}
-
-
-S3Status error_parser_add(ErrorParser *errorParser, char *buffer,
-                          int bufferSize)
-{
-    if (!errorParser->errorXmlParserInitialized) {
-        simplexml_initialize(&(errorParser->errorXmlParser), &errorXmlCallback,
-                             errorParser);
-        errorParser->errorXmlParserInitialized = 1;
-    }
-
-    return simplexml_add(&(errorParser->errorXmlParser), buffer, bufferSize);
-}
-
-
-void error_parser_convert_status(ErrorParser *errorParser, S3Status *status)
-{
-    // Convert the error status string into a code
-    if (!errorParser->codeLen) {
-        return;
-    }
-
-#define HANDLE_CODE(name)                                       \
-    do {                                                        \
-        if (!strcmp(errorParser->code, #name)) {                \
-            *status = S3StatusError##name;                      \
-            goto code_set;                                      \
-        }                                                       \
-    } while (0)
-    
-    HANDLE_CODE(AccessDenied);
-    HANDLE_CODE(AccountProblem);
-    HANDLE_CODE(AmbiguousGrantByEmailAddress);
-    HANDLE_CODE(BadDigest);
-    HANDLE_CODE(BucketAlreadyExists);
-    HANDLE_CODE(BucketAlreadyOwnedByYou);
-    HANDLE_CODE(BucketNotEmpty);
-    HANDLE_CODE(CredentialsNotSupported);
-    HANDLE_CODE(CrossLocationLoggingProhibited);
-    HANDLE_CODE(EntityTooSmall);
-    HANDLE_CODE(EntityTooLarge);
-    HANDLE_CODE(ExpiredToken);
-    HANDLE_CODE(IncompleteBody);
-    HANDLE_CODE(IncorrectNumberOfFilesInPostRequest);
-    HANDLE_CODE(InlineDataTooLarge);
-    HANDLE_CODE(InternalError);
-    HANDLE_CODE(InvalidAccessKeyId);
-    HANDLE_CODE(InvalidAddressingHeader);
-    HANDLE_CODE(InvalidArgument);
-    HANDLE_CODE(InvalidBucketName);
-    HANDLE_CODE(InvalidDigest);
-    HANDLE_CODE(InvalidLocationConstraint);
-    HANDLE_CODE(InvalidPayer);
-    HANDLE_CODE(InvalidPolicyDocument);
-    HANDLE_CODE(InvalidRange);
-    HANDLE_CODE(InvalidSecurity);
-    HANDLE_CODE(InvalidSOAPRequest);
-    HANDLE_CODE(InvalidStorageClass);
-    HANDLE_CODE(InvalidTargetBucketForLogging);
-    HANDLE_CODE(InvalidToken);
-    HANDLE_CODE(InvalidURI);
-    HANDLE_CODE(KeyTooLong);
-    HANDLE_CODE(MalformedACLError);
-    HANDLE_CODE(MalformedXML);
-    HANDLE_CODE(MaxMessageLengthExceeded);
-    HANDLE_CODE(MaxPostPreDataLengthExceededError);
-    HANDLE_CODE(MetadataTooLarge);
-    HANDLE_CODE(MethodNotAllowed);
-    HANDLE_CODE(MissingAttachment);
-    HANDLE_CODE(MissingContentLength);
-    HANDLE_CODE(MissingSecurityElement);
-    HANDLE_CODE(MissingSecurityHeader);
-    HANDLE_CODE(NoLoggingStatusForKey);
-    HANDLE_CODE(NoSuchBucket);
-    HANDLE_CODE(NoSuchKey);
-    HANDLE_CODE(NotImplemented);
-    HANDLE_CODE(NotSignedUp);
-    HANDLE_CODE(OperationAborted);
-    HANDLE_CODE(PermanentRedirect);
-    HANDLE_CODE(PreconditionFailed);
-    HANDLE_CODE(Redirect);
-    HANDLE_CODE(RequestIsNotMultiPartContent);
-    HANDLE_CODE(RequestTimeout);
-    HANDLE_CODE(RequestTimeTooSkewed);
-    HANDLE_CODE(RequestTorrentOfBucketError);
-    HANDLE_CODE(SignatureDoesNotMatch);
-    HANDLE_CODE(SlowDown);
-    HANDLE_CODE(TemporaryRedirect);
-    HANDLE_CODE(TokenRefreshRequired);
-    HANDLE_CODE(TooManyBuckets);
-    HANDLE_CODE(UnexpectedContent);
-    HANDLE_CODE(UnresolvableGrantByEmailAddress);
-    HANDLE_CODE(UserKeyMustBeSpecified);
-    *status = S3StatusErrorUnknown;
-
- code_set:
-
-    return;
-}
-
-
-// Always call this
-void error_parser_deinitialize(ErrorParser *errorParser)
-{
-    if (errorParser->errorXmlParserInitialized) {
-        simplexml_deinitialize(&(errorParser->errorXmlParser));
-    }
-}
diff --git a/src/libs3/src/general.c b/src/libs3/src/general.c
deleted file mode 100644
index 867ae5d..0000000
--- a/src/libs3/src/general.c
+++ /dev/null
@@ -1,473 +0,0 @@
-/** **************************************************************************
- * general.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <ctype.h>
-#include <string.h>
-#include "request.h"
-#include "simplexml.h"
-#include "util.h"
-
-static int initializeCountG = 0;
-
-S3Status S3_initialize(const char *userAgentInfo, int flags,
-                       const char *defaultS3HostName)
-{
-    if (initializeCountG++) {
-        return S3StatusOK;
-    }
-
-    return request_api_initialize(userAgentInfo, flags, defaultS3HostName);
-}
-
-
-void S3_deinitialize(void)
-{
-    if (--initializeCountG) {
-        return;
-    }
-
-    request_api_deinitialize();
-}
-
-const char *S3_get_status_name(S3Status status)
-{
-    switch (status) {
-#define handlecase(s)                           \
-        case S3Status##s:                       \
-            return #s
-
-        handlecase(OK);
-        handlecase(InternalError);
-        handlecase(OutOfMemory);
-        handlecase(Interrupted);
-        handlecase(InvalidBucketNameTooLong);
-        handlecase(InvalidBucketNameFirstCharacter);
-        handlecase(InvalidBucketNameCharacter);
-        handlecase(InvalidBucketNameCharacterSequence);
-        handlecase(InvalidBucketNameTooShort);
-        handlecase(InvalidBucketNameDotQuadNotation);
-        handlecase(QueryParamsTooLong);
-        handlecase(FailedToInitializeRequest);
-        handlecase(MetaDataHeadersTooLong);
-        handlecase(BadMetaData);
-        handlecase(BadContentType);
-        handlecase(ContentTypeTooLong);
-        handlecase(BadMD5);
-        handlecase(MD5TooLong);
-        handlecase(BadCacheControl);
-        handlecase(CacheControlTooLong);
-        handlecase(BadContentDispositionFilename);
-        handlecase(ContentDispositionFilenameTooLong);
-        handlecase(BadContentEncoding);
-        handlecase(ContentEncodingTooLong);
-        handlecase(BadIfMatchETag);
-        handlecase(IfMatchETagTooLong);
-        handlecase(BadIfNotMatchETag);
-        handlecase(IfNotMatchETagTooLong);
-        handlecase(HeadersTooLong);
-        handlecase(KeyTooLong);
-        handlecase(UriTooLong);
-        handlecase(XmlParseFailure);
-        handlecase(EmailAddressTooLong);
-        handlecase(UserIdTooLong);
-        handlecase(UserDisplayNameTooLong);
-        handlecase(GroupUriTooLong);
-        handlecase(PermissionTooLong);
-        handlecase(TargetBucketTooLong);
-        handlecase(TargetPrefixTooLong);
-        handlecase(TooManyGrants);
-        handlecase(BadGrantee);
-        handlecase(BadPermission);
-        handlecase(XmlDocumentTooLarge);
-        handlecase(NameLookupError);
-        handlecase(FailedToConnect);
-        handlecase(ServerFailedVerification);
-        handlecase(ConnectionFailed);
-        handlecase(AbortedByCallback);
-        handlecase(ErrorAccessDenied);
-        handlecase(ErrorAccountProblem);
-        handlecase(ErrorAmbiguousGrantByEmailAddress);
-        handlecase(ErrorBadDigest);
-        handlecase(ErrorBucketAlreadyExists);
-        handlecase(ErrorBucketAlreadyOwnedByYou);
-        handlecase(ErrorBucketNotEmpty);
-        handlecase(ErrorCredentialsNotSupported);
-        handlecase(ErrorCrossLocationLoggingProhibited);
-        handlecase(ErrorEntityTooSmall);
-        handlecase(ErrorEntityTooLarge);
-        handlecase(ErrorExpiredToken);
-        handlecase(ErrorIncompleteBody);
-        handlecase(ErrorIncorrectNumberOfFilesInPostRequest);
-        handlecase(ErrorInlineDataTooLarge);
-        handlecase(ErrorInternalError);
-        handlecase(ErrorInvalidAccessKeyId);
-        handlecase(ErrorInvalidAddressingHeader);
-        handlecase(ErrorInvalidArgument);
-        handlecase(ErrorInvalidBucketName);
-        handlecase(ErrorInvalidDigest);
-        handlecase(ErrorInvalidLocationConstraint);
-        handlecase(ErrorInvalidPayer);
-        handlecase(ErrorInvalidPolicyDocument);
-        handlecase(ErrorInvalidRange);
-        handlecase(ErrorInvalidSecurity);
-        handlecase(ErrorInvalidSOAPRequest);
-        handlecase(ErrorInvalidStorageClass);
-        handlecase(ErrorInvalidTargetBucketForLogging);
-        handlecase(ErrorInvalidToken);
-        handlecase(ErrorInvalidURI);
-        handlecase(ErrorKeyTooLong);
-        handlecase(ErrorMalformedACLError);
-        handlecase(ErrorMalformedXML);
-        handlecase(ErrorMaxMessageLengthExceeded);
-        handlecase(ErrorMaxPostPreDataLengthExceededError);
-        handlecase(ErrorMetadataTooLarge);
-        handlecase(ErrorMethodNotAllowed);
-        handlecase(ErrorMissingAttachment);
-        handlecase(ErrorMissingContentLength);
-        handlecase(ErrorMissingSecurityElement);
-        handlecase(ErrorMissingSecurityHeader);
-        handlecase(ErrorNoLoggingStatusForKey);
-        handlecase(ErrorNoSuchBucket);
-        handlecase(ErrorNoSuchKey);
-        handlecase(ErrorNotImplemented);
-        handlecase(ErrorNotSignedUp);
-        handlecase(ErrorOperationAborted);
-        handlecase(ErrorPermanentRedirect);
-        handlecase(ErrorPreconditionFailed);
-        handlecase(ErrorRedirect);
-        handlecase(ErrorRequestIsNotMultiPartContent);
-        handlecase(ErrorRequestTimeout);
-        handlecase(ErrorRequestTimeTooSkewed);
-        handlecase(ErrorRequestTorrentOfBucketError);
-        handlecase(ErrorSignatureDoesNotMatch);
-        handlecase(ErrorSlowDown);
-        handlecase(ErrorTemporaryRedirect);
-        handlecase(ErrorTokenRefreshRequired);
-        handlecase(ErrorTooManyBuckets);
-        handlecase(ErrorUnexpectedContent);
-        handlecase(ErrorUnresolvableGrantByEmailAddress);
-        handlecase(ErrorUserKeyMustBeSpecified);
-        handlecase(ErrorUnknown);    
-        handlecase(HttpErrorMovedTemporarily);
-        handlecase(HttpErrorBadRequest);
-        handlecase(HttpErrorForbidden);
-        handlecase(HttpErrorNotFound);
-        handlecase(HttpErrorConflict);
-        handlecase(HttpErrorUnknown);
-    }
-
-    return "Unknown";
-}
-
-
-S3Status S3_validate_bucket_name(const char *bucketName, S3UriStyle uriStyle)
-{
-    int virtualHostStyle = (uriStyle == S3UriStyleVirtualHost);
-    int len = 0, maxlen = virtualHostStyle ? 63 : 255;
-    const char *b = bucketName;
-
-    int hasDot = 0;
-    int hasNonDigit = 0;
-
-    while (*b) {
-        if (len == maxlen) {
-            return S3StatusInvalidBucketNameTooLong;
-        }
-        else if (isalpha(*b)) {
-            len++, b++;
-            hasNonDigit = 1;
-        }
-        else if (isdigit(*b)) {
-            len++, b++;
-        }
-        else if (len == 0) {
-            return S3StatusInvalidBucketNameFirstCharacter;
-        }
-        else if (*b == '_') {
-            /* Virtual host style bucket names cannot have underscores */
-            if (virtualHostStyle) {
-                return S3StatusInvalidBucketNameCharacter;
-            }
-            len++, b++;
-            hasNonDigit = 1;
-        }
-        else if (*b == '-') {
-            /* Virtual host style bucket names cannot have .- */
-            if (virtualHostStyle && (b > bucketName) && (*(b - 1) == '.')) {
-                return S3StatusInvalidBucketNameCharacterSequence;
-            }
-            len++, b++;
-            hasNonDigit = 1;
-        }
-        else if (*b == '.') {
-            /* Virtual host style bucket names cannot have -. */
-            if (virtualHostStyle && (b > bucketName) && (*(b - 1) == '-')) {
-                return S3StatusInvalidBucketNameCharacterSequence;
-            }
-            len++, b++;
-            hasDot = 1;
-        }
-        else {
-            return S3StatusInvalidBucketNameCharacter;
-        }
-    }
-
-    if (len < 3) {
-        return S3StatusInvalidBucketNameTooShort;
-    }
-
-    /* It's not clear from Amazon's documentation exactly what 'IP address
-       style' means.  In its strictest sense, it could mean 'could be a valid
-       IP address', which would mean that 255.255.255.255 would be invalid,
-       wherase 256.256.256.256 would be valid.  Or it could mean 'has 4 sets
-       of digits separated by dots'.  Who knows.  Let's just be really
-       conservative here: if it has any dots, and no non-digit characters,
-       then we reject it */
-    if (hasDot && !hasNonDigit) {
-        return S3StatusInvalidBucketNameDotQuadNotation;
-    }
-
-    return S3StatusOK;
-}
-
-
-typedef struct ConvertAclData
-{
-    char *ownerId;
-    int ownerIdLen;
-    char *ownerDisplayName;
-    int ownerDisplayNameLen;
-    int *aclGrantCountReturn;
-    S3AclGrant *aclGrants;
-
-    string_buffer(emailAddress, S3_MAX_GRANTEE_EMAIL_ADDRESS_SIZE);
-    string_buffer(userId, S3_MAX_GRANTEE_USER_ID_SIZE);
-    string_buffer(userDisplayName, S3_MAX_GRANTEE_DISPLAY_NAME_SIZE);
-    string_buffer(groupUri, 128);
-    string_buffer(permission, 32);
-} ConvertAclData;
-
-
-static S3Status convertAclXmlCallback(const char *elementPath,
-                                      const char *data, int dataLen,
-                                      void *callbackData)
-{
-    ConvertAclData *caData = (ConvertAclData *) callbackData;
-
-    int fit;
-
-    if (data) {
-        if (!strcmp(elementPath, "AccessControlPolicy/Owner/ID")) {
-            caData->ownerIdLen += 
-                snprintf(&(caData->ownerId[caData->ownerIdLen]),
-                         S3_MAX_GRANTEE_USER_ID_SIZE - caData->ownerIdLen - 1,
-                         "%.*s", dataLen, data);
-            if (caData->ownerIdLen >= S3_MAX_GRANTEE_USER_ID_SIZE) {
-                return S3StatusUserIdTooLong;
-            }
-        }
-        else if (!strcmp(elementPath, "AccessControlPolicy/Owner/"
-                         "DisplayName")) {
-            caData->ownerDisplayNameLen += 
-                snprintf(&(caData->ownerDisplayName
-                           [caData->ownerDisplayNameLen]),
-                         S3_MAX_GRANTEE_DISPLAY_NAME_SIZE -
-                         caData->ownerDisplayNameLen - 1, 
-                         "%.*s", dataLen, data);
-            if (caData->ownerDisplayNameLen >= 
-                S3_MAX_GRANTEE_DISPLAY_NAME_SIZE) {
-                return S3StatusUserDisplayNameTooLong;
-            }
-        }
-        else if (!strcmp(elementPath, 
-                    "AccessControlPolicy/AccessControlList/Grant/"
-                    "Grantee/EmailAddress")) {
-            // AmazonCustomerByEmail
-            string_buffer_append(caData->emailAddress, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusEmailAddressTooLong;
-            }
-        }
-        else if (!strcmp(elementPath,
-                         "AccessControlPolicy/AccessControlList/Grant/"
-                         "Grantee/ID")) {
-            // CanonicalUser
-            string_buffer_append(caData->userId, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusUserIdTooLong;
-            }
-        }
-        else if (!strcmp(elementPath,
-                         "AccessControlPolicy/AccessControlList/Grant/"
-                         "Grantee/DisplayName")) {
-            // CanonicalUser
-            string_buffer_append(caData->userDisplayName, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusUserDisplayNameTooLong;
-            }
-        }
-        else if (!strcmp(elementPath,
-                         "AccessControlPolicy/AccessControlList/Grant/"
-                         "Grantee/URI")) {
-            // Group
-            string_buffer_append(caData->groupUri, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusGroupUriTooLong;
-            }
-        }
-        else if (!strcmp(elementPath,
-                         "AccessControlPolicy/AccessControlList/Grant/"
-                         "Permission")) {
-            // Permission
-            string_buffer_append(caData->permission, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusPermissionTooLong;
-            }
-        }
-    }
-    else {
-        if (!strcmp(elementPath, "AccessControlPolicy/AccessControlList/"
-                    "Grant")) {
-            // A grant has just been completed; so add the next S3AclGrant
-            // based on the values read
-            if (*(caData->aclGrantCountReturn) == S3_MAX_ACL_GRANT_COUNT) {
-                return S3StatusTooManyGrants;
-            }
-
-            S3AclGrant *grant = &(caData->aclGrants
-                                  [*(caData->aclGrantCountReturn)]);
-
-            if (caData->emailAddress[0]) {
-                grant->granteeType = S3GranteeTypeAmazonCustomerByEmail;
-                strcpy(grant->grantee.amazonCustomerByEmail.emailAddress,
-                       caData->emailAddress);
-            }
-            else if (caData->userId[0] && caData->userDisplayName[0]) {
-                grant->granteeType = S3GranteeTypeCanonicalUser;
-                strcpy(grant->grantee.canonicalUser.id, caData->userId);
-                strcpy(grant->grantee.canonicalUser.displayName, 
-                       caData->userDisplayName);
-            }
-            else if (caData->groupUri[0]) {
-                if (!strcmp(caData->groupUri,
-                            ACS_GROUP_AWS_USERS)) {
-                    grant->granteeType = S3GranteeTypeAllAwsUsers;
-                }
-                else if (!strcmp(caData->groupUri,
-                            ACS_GROUP_ALL_USERS)) {
-                    grant->granteeType = S3GranteeTypeAllUsers;
-                }
-                else if (!strcmp(caData->groupUri,
-                                 ACS_GROUP_LOG_DELIVERY)) {
-                    grant->granteeType = S3GranteeTypeLogDelivery;
-                }
-                else {
-                    return S3StatusBadGrantee;
-                }
-            }
-            else {
-                return S3StatusBadGrantee;
-            }
-
-            if (!strcmp(caData->permission, "READ")) {
-                grant->permission = S3PermissionRead;
-            }
-            else if (!strcmp(caData->permission, "WRITE")) {
-                grant->permission = S3PermissionWrite;
-            }
-            else if (!strcmp(caData->permission, "READ_ACP")) {
-                grant->permission = S3PermissionReadACP;
-            }
-            else if (!strcmp(caData->permission, "WRITE_ACP")) {
-                grant->permission = S3PermissionWriteACP;
-            }
-            else if (!strcmp(caData->permission, "FULL_CONTROL")) {
-                grant->permission = S3PermissionFullControl;
-            }
-            else {
-                return S3StatusBadPermission;
-            }
-
-            (*(caData->aclGrantCountReturn))++;
-
-            string_buffer_initialize(caData->emailAddress);
-            string_buffer_initialize(caData->userId);
-            string_buffer_initialize(caData->userDisplayName);
-            string_buffer_initialize(caData->groupUri);
-            string_buffer_initialize(caData->permission);
-        }
-    }
-
-    return S3StatusOK;
-}
-
-
-S3Status S3_convert_acl(char *aclXml, char *ownerId, char *ownerDisplayName,
-                        int *aclGrantCountReturn, S3AclGrant *aclGrants)
-{
-    ConvertAclData data;
-
-    data.ownerId = ownerId;
-    data.ownerIdLen = 0;
-    data.ownerId[0] = 0;
-    data.ownerDisplayName = ownerDisplayName;
-    data.ownerDisplayNameLen = 0;
-    data.ownerDisplayName[0] = 0;
-    data.aclGrantCountReturn = aclGrantCountReturn;
-    data.aclGrants = aclGrants;
-    *aclGrantCountReturn = 0;
-    string_buffer_initialize(data.emailAddress);
-    string_buffer_initialize(data.userId);
-    string_buffer_initialize(data.userDisplayName);
-    string_buffer_initialize(data.groupUri);
-    string_buffer_initialize(data.permission);
-
-    // Use a simplexml parser
-    SimpleXml simpleXml;
-    simplexml_initialize(&simpleXml, &convertAclXmlCallback, &data);
-
-    S3Status status = simplexml_add(&simpleXml, aclXml, strlen(aclXml));
-
-    simplexml_deinitialize(&simpleXml);
-                                          
-    return status;
-}
-
-
-int S3_status_is_retryable(S3Status status)
-{
-    switch (status) {
-    case S3StatusNameLookupError:
-    case S3StatusFailedToConnect:
-    case S3StatusConnectionFailed:
-    case S3StatusErrorInternalError:
-    case S3StatusErrorOperationAborted:
-    case S3StatusErrorRequestTimeout:
-        return 1;
-    default:
-        return 0;
-    }
-}
diff --git a/src/libs3/src/mingw_functions.c b/src/libs3/src/mingw_functions.c
deleted file mode 100644
index 0e2b7b2..0000000
--- a/src/libs3/src/mingw_functions.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/** **************************************************************************
- * mingw_functions.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <pthread.h>
-#include <sys/utsname.h>
-
-unsigned long pthread_self()
-{
-    return (unsigned long) GetCurrentThreadId();
-}
-
-
-int pthread_mutex_init(pthread_mutex_t *mutex, void *v)
-{
-    (void) v;
-
-    InitializeCriticalSection(&(mutex->criticalSection));
-
-    return 0;
-}
-
-
-int pthread_mutex_lock(pthread_mutex_t *mutex)
-{
-    EnterCriticalSection(&(mutex->criticalSection));
-
-    return 0;
-}
-
-
-int pthread_mutex_unlock(pthread_mutex_t *mutex)
-{
-    LeaveCriticalSection(&(mutex->criticalSection));
-
-    return 0;
-}
-
-
-int pthread_mutex_destroy(pthread_mutex_t *mutex)
-{
-    DeleteCriticalSection(&(mutex->criticalSection));
-
-    return 0;
-}
-
-
-int uname(struct utsname *u)
-{
-    OSVERSIONINFO info;
-    info.dwOSVersionInfoSize = sizeof(info);
-
-    if (!GetVersionEx(&info)) {
-        return -1;
-    }
-
-    u->machine = "";
-
-    switch (info.dwMajorVersion) {
-    case 4:
-        switch (info.dwMinorVersion) {
-        case 0:
-            u->sysname = "Microsoft Windows NT 4.0";
-            break;
-        case 10:
-            u->sysname = "Microsoft Windows 98";
-            break;
-        case 90:
-            u->sysname = "Microsoft Windows Me";
-            break;
-        default:
-            return -1;
-        }
-        break;
-
-    case 5:
-        switch (info.dwMinorVersion) {
-        case 0:
-            u->sysname = "Microsoft Windows 2000";
-            break;
-        case 1:
-            u->sysname = "Microsoft Windows XP";
-            break;
-        case 2:
-            u->sysname = "Microsoft Server 2003";
-            break;
-        default:
-            return -1;
-        }
-        break;
-
-    default:
-        return -1;
-    }
-
-    return 0;
-}
diff --git a/src/libs3/src/mingw_s3_functions.c b/src/libs3/src/mingw_s3_functions.c
deleted file mode 100644
index 142569d..0000000
--- a/src/libs3/src/mingw_s3_functions.c
+++ /dev/null
@@ -1,37 +0,0 @@
-/** **************************************************************************
- * mingw_s3_functions.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-int setenv(const char *a, const char *b, int c)
-{
-    (void) c;
-
-    return SetEnvironmentVariable(a, b);
-}
-
-int unsetenv(const char *a)
-{
-    return SetEnvironmentVariable(a, 0);
-}
diff --git a/src/libs3/src/object.c b/src/libs3/src/object.c
deleted file mode 100644
index d7c7f80..0000000
--- a/src/libs3/src/object.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/** **************************************************************************
- * object.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <stdlib.h>
-#include <string.h>
-#include "libs3.h"
-#include "request.h"
-
-
-// put object ----------------------------------------------------------------
-
-void S3_put_object(const S3BucketContext *bucketContext, const char *key,
-                   uint64_t contentLength,
-                   const S3PutProperties *putProperties,
-                   S3RequestContext *requestContext,
-                   const S3PutObjectHandler *handler, void *callbackData)
-{
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypePUT,                           // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        key,                                          // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        putProperties,                                // putProperties
-        handler->responseHandler.propertiesCallback,  // propertiesCallback
-        handler->putObjectDataCallback,               // toS3Callback
-        contentLength,                                // toS3CallbackTotalSize
-        0,                                            // fromS3Callback
-        handler->responseHandler.completeCallback,    // completeCallback
-        callbackData                                  // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
-// copy object ---------------------------------------------------------------
-
-
-typedef struct CopyObjectData
-{
-    SimpleXml simpleXml;
-
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    int64_t *lastModifiedReturn;
-    int eTagReturnSize;
-    char *eTagReturn;
-    int eTagReturnLen;
-    
-    string_buffer(lastModified, 256);
-} CopyObjectData;
-
-
-static S3Status copyObjectXmlCallback(const char *elementPath,
-                                      const char *data, int dataLen,
-                                      void *callbackData)
-{
-    CopyObjectData *coData = (CopyObjectData *) callbackData;
-
-    int fit;
-
-    if (data) {
-        if (!strcmp(elementPath, "CopyObjectResult/LastModified")) {
-            string_buffer_append(coData->lastModified, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, "CopyObjectResult/ETag")) {
-            if (coData->eTagReturnSize && coData->eTagReturn) {
-                coData->eTagReturnLen +=
-                    snprintf(&(coData->eTagReturn[coData->eTagReturnLen]),
-                             coData->eTagReturnSize - 
-                             coData->eTagReturnLen - 1,
-                             "%.*s", dataLen, data);
-                if (coData->eTagReturnLen >= coData->eTagReturnSize) {
-                    return S3StatusXmlParseFailure;
-                }
-            }
-        }
-    }
-
-    /* Avoid compiler error about variable set but not used */
-    (void) fit;
-
-    return S3StatusOK;
-}
-
-
-static S3Status copyObjectPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    CopyObjectData *coData = (CopyObjectData *) callbackData;
-    
-    return (*(coData->responsePropertiesCallback))
-        (responseProperties, coData->callbackData);
-}
-
-
-static S3Status copyObjectDataCallback(int bufferSize, const char *buffer,
-                                       void *callbackData)
-{
-    CopyObjectData *coData = (CopyObjectData *) callbackData;
-
-    return simplexml_add(&(coData->simpleXml), buffer, bufferSize);
-}
-
-
-static void copyObjectCompleteCallback(S3Status requestStatus, 
-                                       const S3ErrorDetails *s3ErrorDetails,
-                                       void *callbackData)
-{
-    CopyObjectData *coData = (CopyObjectData *) callbackData;
-
-    if (coData->lastModifiedReturn) {
-        time_t lastModified = -1;
-        if (coData->lastModifiedLen) {
-            lastModified = parseIso8601Time(coData->lastModified);
-        }
-
-        *(coData->lastModifiedReturn) = lastModified;
-    }
-
-    (*(coData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, coData->callbackData);
-
-    simplexml_deinitialize(&(coData->simpleXml));
-
-    free(coData);
-}
-
-
-void S3_copy_object(const S3BucketContext *bucketContext, const char *key,
-                    const char *destinationBucket, const char *destinationKey,
-                    const S3PutProperties *putProperties,
-                    int64_t *lastModifiedReturn, int eTagReturnSize,
-                    char *eTagReturn, S3RequestContext *requestContext,
-                    const S3ResponseHandler *handler, void *callbackData)
-{
-    // Create the callback data
-    CopyObjectData *data = 
-        (CopyObjectData *) malloc(sizeof(CopyObjectData));
-    if (!data) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    simplexml_initialize(&(data->simpleXml), &copyObjectXmlCallback, data);
-
-    data->responsePropertiesCallback = handler->propertiesCallback;
-    data->responseCompleteCallback = handler->completeCallback;
-    data->callbackData = callbackData;
-
-    data->lastModifiedReturn = lastModifiedReturn;
-    data->eTagReturnSize = eTagReturnSize;
-    data->eTagReturn = eTagReturn;
-    if (data->eTagReturnSize && data->eTagReturn) {
-        data->eTagReturn[0] = 0;
-    }
-    data->eTagReturnLen = 0;
-    string_buffer_initialize(data->lastModified);
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeCOPY,                          // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          destinationBucket ? destinationBucket : 
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        destinationKey ? destinationKey : key,        // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        bucketContext->bucketName,                    // copySourceBucketName
-        key,                                          // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        putProperties,                                // putProperties
-        &copyObjectPropertiesCallback,                // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        &copyObjectDataCallback,                      // fromS3Callback
-        &copyObjectCompleteCallback,                  // completeCallback
-        data                                          // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
-// get object ----------------------------------------------------------------
-
-void S3_get_object(const S3BucketContext *bucketContext, const char *key,
-                   const S3GetConditions *getConditions,
-                   uint64_t startByte, uint64_t byteCount,
-                   S3RequestContext *requestContext,
-                   const S3GetObjectHandler *handler, void *callbackData)
-{
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeGET,                           // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        key,                                          // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        getConditions,                                // getConditions
-        startByte,                                    // startByte
-        byteCount,                                    // byteCount
-        0,                                            // putProperties
-        handler->responseHandler.propertiesCallback,  // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        handler->getObjectDataCallback,               // fromS3Callback
-        handler->responseHandler.completeCallback,    // completeCallback
-        callbackData                                  // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
-// head object ---------------------------------------------------------------
-
-void S3_head_object(const S3BucketContext *bucketContext, const char *key,
-                    S3RequestContext *requestContext,
-                    const S3ResponseHandler *handler, void *callbackData)
-{
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeHEAD,                          // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        key,                                          // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        handler->propertiesCallback,                  // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        0,                                            // fromS3Callback
-        handler->completeCallback,                    // completeCallback
-        callbackData                                  // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-                         
-
-// delete object --------------------------------------------------------------
-
-void S3_delete_object(const S3BucketContext *bucketContext, const char *key,
-                      S3RequestContext *requestContext,
-                      const S3ResponseHandler *handler, void *callbackData)
-{
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeDELETE,                        // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        key,                                          // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        handler->propertiesCallback,                  // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        0,                                            // fromS3Callback
-        handler->completeCallback,                    // completeCallback
-        callbackData                                  // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
diff --git a/src/libs3/src/request.c b/src/libs3/src/request.c
deleted file mode 100644
index 53bda41..0000000
--- a/src/libs3/src/request.c
+++ /dev/null
@@ -1,1392 +0,0 @@
-/** **************************************************************************
- * request.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <ctype.h>
-#include <pthread.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/utsname.h>
-#include "request.h"
-#include "request_context.h"
-#include "response_headers_handler.h"
-#include "util.h"
-
-
-#define USER_AGENT_SIZE 256
-#define REQUEST_STACK_SIZE 32
-
-static char userAgentG[USER_AGENT_SIZE];
-
-static pthread_mutex_t requestStackMutexG;
-
-static Request *requestStackG[REQUEST_STACK_SIZE];
-
-static int requestStackCountG;
-
-char defaultHostNameG[S3_MAX_HOSTNAME_SIZE];
-
-
-typedef struct RequestComputedValues
-{
-    // All x-amz- headers, in normalized form (i.e. NAME: VALUE, no other ws)
-    char *amzHeaders[S3_MAX_METADATA_COUNT + 2]; // + 2 for acl and date
-
-    // The number of x-amz- headers
-    int amzHeadersCount;
-
-    // Storage for amzHeaders (the +256 is for x-amz-acl and x-amz-date)
-    char amzHeadersRaw[COMPACTED_METADATA_BUFFER_SIZE + 256 + 1];
-
-    // Canonicalized x-amz- headers
-    string_multibuffer(canonicalizedAmzHeaders,
-                       COMPACTED_METADATA_BUFFER_SIZE + 256 + 1);
-
-    // URL-Encoded key
-    char urlEncodedKey[MAX_URLENCODED_KEY_SIZE + 1];
-
-    // Canonicalized resource
-    char canonicalizedResource[MAX_CANONICALIZED_RESOURCE_SIZE + 1];
-
-    // Cache-Control header (or empty)
-    char cacheControlHeader[128];
-
-    // Content-Type header (or empty)
-    char contentTypeHeader[128];
-
-    // Content-MD5 header (or empty)
-    char md5Header[128];
-
-    // Content-Disposition header (or empty)
-    char contentDispositionHeader[128];
-
-    // Content-Encoding header (or empty)
-    char contentEncodingHeader[128];
-
-    // Expires header (or empty)
-    char expiresHeader[128];
-
-    // If-Modified-Since header
-    char ifModifiedSinceHeader[128];
-
-    // If-Unmodified-Since header
-    char ifUnmodifiedSinceHeader[128];
-
-    // If-Match header
-    char ifMatchHeader[128];
-
-    // If-None-Match header
-    char ifNoneMatchHeader[128];
-
-    // Range header
-    char rangeHeader[128];
-
-    // Authorization header
-    char authorizationHeader[128];
-} RequestComputedValues;
-
-
-// Called whenever we detect that the request headers have been completely
-// processed; which happens either when we get our first read/write callback,
-// or the request is finished being procesed.  Returns nonzero on success,
-// zero on failure.
-static void request_headers_done(Request *request)
-{
-    if (request->propertiesCallbackMade) {
-        return;
-    }
-
-    request->propertiesCallbackMade = 1;
-
-    // Get the http response code
-    long httpResponseCode;
-    request->httpResponseCode = 0;
-    if (curl_easy_getinfo(request->curl, CURLINFO_RESPONSE_CODE, 
-                          &httpResponseCode) != CURLE_OK) {
-        // Not able to get the HTTP response code - error
-        request->status = S3StatusInternalError;
-        return;
-    }
-    else {
-        request->httpResponseCode = httpResponseCode;
-    }
-
-    response_headers_handler_done(&(request->responseHeadersHandler), 
-                                  request->curl);
-
-    // Only make the callback if it was a successful request; otherwise we're
-    // returning information about the error response itself
-    if (request->propertiesCallback &&
-        (request->httpResponseCode >= 200) &&
-        (request->httpResponseCode <= 299)) {
-        request->status = (*(request->propertiesCallback))
-            (&(request->responseHeadersHandler.responseProperties), 
-             request->callbackData);
-    }
-}
-
-
-static size_t curl_header_func(void *ptr, size_t size, size_t nmemb,
-                               void *data)
-{
-    Request *request = (Request *) data;
-
-    int len = size * nmemb;
-
-    response_headers_handler_add
-        (&(request->responseHeadersHandler), (char *) ptr, len);
-
-    return len;
-}
-
-
-static size_t curl_read_func(void *ptr, size_t size, size_t nmemb, void *data)
-{
-    Request *request = (Request *) data;
-
-    int len = size * nmemb;
-
-    request_headers_done(request);
-
-    if (request->status != S3StatusOK) {
-        return CURL_READFUNC_ABORT;
-    }
-
-    // If there is no data callback, or the data callback has already returned
-    // contentLength bytes, return 0;
-    if (!request->toS3Callback || !request->toS3CallbackBytesRemaining) {
-        return 0;
-    }
-    
-    // Don't tell the callback that we are willing to accept more data than we
-    // really are
-    if (len > request->toS3CallbackBytesRemaining) {
-        len = request->toS3CallbackBytesRemaining;
-    }
-
-    // Otherwise, make the data callback
-    int ret = (*(request->toS3Callback))
-        (len, (char *) ptr, request->callbackData);
-    if (ret < 0) {
-        request->status = S3StatusAbortedByCallback;
-        return CURL_READFUNC_ABORT;
-    }
-    else {
-        if (ret > request->toS3CallbackBytesRemaining) {
-            ret = request->toS3CallbackBytesRemaining;
-        }
-        request->toS3CallbackBytesRemaining -= ret;
-        return ret;
-    }
-}
-
-
-static size_t curl_write_func(void *ptr, size_t size, size_t nmemb,
-                              void *data)
-{
-    Request *request = (Request *) data;
-
-    int len = size * nmemb;
-
-    request_headers_done(request);
-
-    if (request->status != S3StatusOK) {
-        return 0;
-    }
-
-    // On HTTP error, we expect to parse an HTTP error response
-    if ((request->httpResponseCode < 200) || 
-        (request->httpResponseCode > 299)) {
-        request->status = error_parser_add
-            (&(request->errorParser), (char *) ptr, len);
-    }
-    // If there was a callback registered, make it
-    else if (request->fromS3Callback) {
-        request->status = (*(request->fromS3Callback))
-            (len, (char *) ptr, request->callbackData);
-    }
-    // Else, consider this an error - S3 has sent back data when it was not
-    // expected
-    else {
-        request->status = S3StatusInternalError;
-    }
-
-    return ((request->status == S3StatusOK) ? len : 0);
-}
-
-
-// This function 'normalizes' all x-amz-meta headers provided in
-// params->requestHeaders, which means it removes all whitespace from
-// them such that they all look exactly like this:
-// x-amz-meta-${NAME}: ${VALUE}
-// It also adds the x-amz-acl, x-amz-copy-source, and x-amz-metadata-directive
-// headers if necessary, and always adds the x-amz-date header.  It copies the
-// raw string values into params->amzHeadersRaw, and creates an array of
-// string pointers representing these headers in params->amzHeaders (and also
-// sets params->amzHeadersCount to be the count of the total number of x-amz-
-// headers thus created).
-static S3Status compose_amz_headers(const RequestParams *params,
-                                    RequestComputedValues *values)
-{
-    const S3PutProperties *properties = params->putProperties;
-
-    values->amzHeadersCount = 0;
-    values->amzHeadersRaw[0] = 0;
-    int len = 0;
-
-    // Append a header to amzHeaders, trimming whitespace from the end.
-    // Does NOT trim whitespace from the beginning.
-#define headers_append(isNewHeader, format, ...)                        \
-    do {                                                                \
-        if (isNewHeader) {                                              \
-            values->amzHeaders[values->amzHeadersCount++] =             \
-                &(values->amzHeadersRaw[len]);                          \
-        }                                                               \
-        len += snprintf(&(values->amzHeadersRaw[len]),                  \
-                        sizeof(values->amzHeadersRaw) - len,            \
-                        format, __VA_ARGS__);                           \
-        if (len >= (int) sizeof(values->amzHeadersRaw)) {               \
-            return S3StatusMetaDataHeadersTooLong;                      \
-        }                                                               \
-        while ((len > 0) && (values->amzHeadersRaw[len - 1] == ' ')) {  \
-            len--;                                                      \
-        }                                                               \
-        values->amzHeadersRaw[len++] = 0;                               \
-    } while (0)
-
-#define header_name_tolower_copy(str, l)                                \
-    do {                                                                \
-        values->amzHeaders[values->amzHeadersCount++] =                 \
-            &(values->amzHeadersRaw[len]);                              \
-        if ((len + l) >= (int) sizeof(values->amzHeadersRaw)) {         \
-            return S3StatusMetaDataHeadersTooLong;                      \
-        }                                                               \
-        int todo = l;                                                   \
-        while (todo--) {                                                \
-            if ((*(str) >= 'A') && (*(str) <= 'Z')) {                   \
-                values->amzHeadersRaw[len++] = 'a' + (*(str) - 'A');    \
-            }                                                           \
-            else {                                                      \
-                values->amzHeadersRaw[len++] = *(str);                  \
-            }                                                           \
-            (str)++;                                                    \
-        }                                                               \
-    } while (0)
-
-    // Check and copy in the x-amz-meta headers
-    if (properties) {
-        int i;
-        for (i = 0; i < properties->metaDataCount; i++) {
-            const S3NameValue *property = &(properties->metaData[i]);
-            char headerName[S3_MAX_METADATA_SIZE - sizeof(": v")];
-            int l = snprintf(headerName, sizeof(headerName),
-                             S3_METADATA_HEADER_NAME_PREFIX "%s",
-                             property->name);
-            char *hn = headerName;
-            header_name_tolower_copy(hn, l);
-            // Copy in the value
-            headers_append(0, ": %s", property->value);
-        }
-
-        // Add the x-amz-acl header, if necessary
-        const char *cannedAclString;
-        switch (params->putProperties->cannedAcl) {
-        case S3CannedAclPrivate:
-            cannedAclString = 0;
-            break;
-        case S3CannedAclPublicRead:
-            cannedAclString = "public-read";
-            break;
-        case S3CannedAclPublicReadWrite:
-            cannedAclString = "public-read-write";
-            break;
-        default: // S3CannedAclAuthenticatedRead
-            cannedAclString = "authenticated-read";
-            break;
-        }
-        if (cannedAclString) {
-            headers_append(1, "x-amz-acl: %s", cannedAclString);
-        }
-    }
-
-    // Add the x-amz-date header
-    time_t now = time(NULL);
-    char date[64];
-    strftime(date, sizeof(date), "%a, %d %b %Y %H:%M:%S GMT", gmtime(&now));
-    headers_append(1, "x-amz-date: %s", date);
-
-    if (params->httpRequestType == HttpRequestTypeCOPY) {
-        // Add the x-amz-copy-source header
-        if (params->copySourceBucketName && params->copySourceBucketName[0] &&
-            params->copySourceKey && params->copySourceKey[0]) {
-            headers_append(1, "x-amz-copy-source: /%s/%s",
-                           params->copySourceBucketName,
-                           params->copySourceKey);
-        }
-        // And the x-amz-metadata-directive header
-        if (params->putProperties) {
-            headers_append(1, "%s", "x-amz-metadata-directive: REPLACE");
-        }
-    }
-
-    return S3StatusOK;
-}
-
-
-// Composes the other headers
-static S3Status compose_standard_headers(const RequestParams *params,
-                                         RequestComputedValues *values)
-{
-
-#define do_put_header(fmt, sourceField, destField, badError, tooLongError)  \
-    do {                                                                    \
-        if (params->putProperties &&                                        \
-            params->putProperties-> sourceField &&                          \
-            params->putProperties-> sourceField[0]) {                       \
-            /* Skip whitespace at beginning of val */                       \
-            const char *val = params->putProperties-> sourceField;          \
-            while (*val && is_blank(*val)) {                                \
-                val++;                                                      \
-            }                                                               \
-            if (!*val) {                                                    \
-                return badError;                                            \
-            }                                                               \
-            /* Compose header, make sure it all fit */                      \
-            int len = snprintf(values-> destField,                          \
-                               sizeof(values-> destField), fmt, val);       \
-            if (len >= (int) sizeof(values-> destField)) {                  \
-                return tooLongError;                                        \
-            }                                                               \
-            /* Now remove the whitespace at the end */                      \
-            while (is_blank(values-> destField[len])) {                     \
-                len--;                                                      \
-            }                                                               \
-            values-> destField[len] = 0;                                    \
-        }                                                                   \
-        else {                                                              \
-            values-> destField[0] = 0;                                      \
-        }                                                                   \
-    } while (0)
-
-#define do_get_header(fmt, sourceField, destField, badError, tooLongError)  \
-    do {                                                                    \
-        if (params->getConditions &&                                        \
-            params->getConditions-> sourceField &&                          \
-            params->getConditions-> sourceField[0]) {                       \
-            /* Skip whitespace at beginning of val */                       \
-            const char *val = params->getConditions-> sourceField;          \
-            while (*val && is_blank(*val)) {                                \
-                val++;                                                      \
-            }                                                               \
-            if (!*val) {                                                    \
-                return badError;                                            \
-            }                                                               \
-            /* Compose header, make sure it all fit */                      \
-            int len = snprintf(values-> destField,                          \
-                               sizeof(values-> destField), fmt, val);       \
-            if (len >= (int) sizeof(values-> destField)) {                  \
-                return tooLongError;                                        \
-            }                                                               \
-            /* Now remove the whitespace at the end */                      \
-            while (is_blank(values-> destField[len])) {                     \
-                len--;                                                      \
-            }                                                               \
-            values-> destField[len] = 0;                                    \
-        }                                                                   \
-        else {                                                              \
-            values-> destField[0] = 0;                                      \
-        }                                                                   \
-    } while (0)
-
-    // Cache-Control
-    do_put_header("Cache-Control: %s", cacheControl, cacheControlHeader,
-                  S3StatusBadCacheControl, S3StatusCacheControlTooLong);
-    
-    // ContentType
-    do_put_header("Content-Type: %s", contentType, contentTypeHeader,
-                  S3StatusBadContentType, S3StatusContentTypeTooLong);
-
-    // MD5
-    do_put_header("Content-MD5: %s", md5, md5Header, S3StatusBadMD5,
-                  S3StatusMD5TooLong);
-
-    // Content-Disposition
-    do_put_header("Content-Disposition: attachment; filename=\"%s\"",
-                  contentDispositionFilename, contentDispositionHeader,
-                  S3StatusBadContentDispositionFilename,
-                  S3StatusContentDispositionFilenameTooLong);
-    
-    // ContentEncoding
-    do_put_header("Content-Encoding: %s", contentEncoding, 
-                  contentEncodingHeader, S3StatusBadContentEncoding,
-                  S3StatusContentEncodingTooLong);
-    
-    // Expires
-    if (params->putProperties && (params->putProperties->expires >= 0)) {
-        time_t t = (time_t) params->putProperties->expires;
-        strftime(values->expiresHeader, sizeof(values->expiresHeader),
-                 "Expires: %a, %d %b %Y %H:%M:%S UTC", gmtime(&t));
-    }
-    else {
-        values->expiresHeader[0] = 0;
-    }
-
-    // If-Modified-Since
-    if (params->getConditions &&
-        (params->getConditions->ifModifiedSince >= 0)) {
-        time_t t = (time_t) params->getConditions->ifModifiedSince;
-        strftime(values->ifModifiedSinceHeader,
-                 sizeof(values->ifModifiedSinceHeader),
-                 "If-Modified-Since: %a, %d %b %Y %H:%M:%S UTC", gmtime(&t));
-    }
-    else {
-        values->ifModifiedSinceHeader[0] = 0;
-    }
-
-    // If-Unmodified-Since header
-    if (params->getConditions &&
-        (params->getConditions->ifNotModifiedSince >= 0)) {
-        time_t t = (time_t) params->getConditions->ifNotModifiedSince;
-        strftime(values->ifUnmodifiedSinceHeader,
-                 sizeof(values->ifUnmodifiedSinceHeader),
-                 "If-Unmodified-Since: %a, %d %b %Y %H:%M:%S UTC", gmtime(&t));
-    }
-    else {
-        values->ifUnmodifiedSinceHeader[0] = 0;
-    }
-    
-    // If-Match header
-    do_get_header("If-Match: %s", ifMatchETag, ifMatchHeader,
-                  S3StatusBadIfMatchETag, S3StatusIfMatchETagTooLong);
-    
-    // If-None-Match header
-    do_get_header("If-None-Match: %s", ifNotMatchETag, ifNoneMatchHeader,
-                  S3StatusBadIfNotMatchETag, 
-                  S3StatusIfNotMatchETagTooLong);
-    
-    // Range header
-    if (params->startByte || params->byteCount) {
-        if (params->byteCount) {
-            snprintf(values->rangeHeader, sizeof(values->rangeHeader),
-                     "Range: bytes=%llu-%llu", 
-                     (unsigned long long) params->startByte,
-                     (unsigned long long) (params->startByte + 
-                                           params->byteCount - 1));
-        }
-        else {
-            snprintf(values->rangeHeader, sizeof(values->rangeHeader),
-                     "Range: bytes=%llu-", 
-                     (unsigned long long) params->startByte);
-        }
-    }
-    else {
-        values->rangeHeader[0] = 0;
-    }
-
-    return S3StatusOK;
-}
-
-
-// URL encodes the params->key value into params->urlEncodedKey
-static S3Status encode_key(const RequestParams *params,
-                           RequestComputedValues *values)
-{
-    return (urlEncode(values->urlEncodedKey, params->key, S3_MAX_KEY_SIZE) ?
-            S3StatusOK : S3StatusUriTooLong);
-}
-
-
-// Simple comparison function for comparing two HTTP header names that are
-// embedded within an HTTP header line, returning true if header1 comes
-// before header2 alphabetically, false if not
-static int headerle(const char *header1, const char *header2)
-{
-    while (1) {
-        if (*header1 == ':') {
-            return (*header2 != ':');
-        }
-        else if (*header2 == ':') {
-            return 0;
-        }
-        else if (*header2 < *header1) {
-            return 0;
-        }
-        else if (*header2 > *header1) {
-            return 1;
-        }
-        header1++, header2++;
-    }
-}
-
-
-// Replace this with merge sort eventually, it's the best stable sort.  But
-// since typically the number of elements being sorted is small, it doesn't
-// matter that much which sort is used, and gnome sort is the world's simplest
-// stable sort.  Added a slight twist to the standard gnome_sort - don't go
-// forward +1, go forward to the last highest index considered.  This saves
-// all the string comparisons that would be done "going forward", and thus
-// only does the necessary string comparisons to move values back into their
-// sorted position.
-static void header_gnome_sort(const char **headers, int size)
-{
-    int i = 0, last_highest = 0;
-
-    while (i < size) {
-        if ((i == 0) || headerle(headers[i - 1], headers[i])) {
-            i = ++last_highest;
-        }
-        else {
-            const char *tmp = headers[i];
-            headers[i] = headers[i - 1];
-            headers[--i] = tmp;
-        }
-    }
-}
-
-
-// Canonicalizes the x-amz- headers into the canonicalizedAmzHeaders buffer
-static void canonicalize_amz_headers(RequestComputedValues *values)
-{
-    // Make a copy of the headers that will be sorted
-    const char *sortedHeaders[S3_MAX_METADATA_COUNT];
-
-    memcpy(sortedHeaders, values->amzHeaders,
-           (values->amzHeadersCount * sizeof(sortedHeaders[0])));
-
-    // Now sort these
-    header_gnome_sort(sortedHeaders, values->amzHeadersCount);
-
-    // Now copy this sorted list into the buffer, all the while:
-    // - folding repeated headers into single lines, and
-    // - folding multiple lines
-    // - removing the space after the colon
-    int lastHeaderLen = 0, i;
-    char *buffer = values->canonicalizedAmzHeaders;
-    for (i = 0; i < values->amzHeadersCount; i++) {
-        const char *header = sortedHeaders[i];
-        const char *c = header;
-        // If the header names are the same, append the next value
-        if ((i > 0) && 
-            !strncmp(header, sortedHeaders[i - 1], lastHeaderLen)) {
-            // Replacing the previous newline with a comma
-            *(buffer - 1) = ',';
-            // Skip the header name and space
-            c += (lastHeaderLen + 1);
-        }
-        // Else this is a new header
-        else {
-            // Copy in everything up to the space in the ": "
-            while (*c != ' ') {
-                *buffer++ = *c++;
-            }
-            // Save the header len since it's a new header
-            lastHeaderLen = c - header;
-            // Skip the space
-            c++;
-        }
-        // Now copy in the value, folding the lines
-        while (*c) {
-            // If c points to a \r\n[whitespace] sequence, then fold
-            // this newline out
-            if ((*c == '\r') && (*(c + 1) == '\n') && is_blank(*(c + 2))) {
-                c += 3;
-                while (is_blank(*c)) {
-                    c++;
-                }
-                // Also, what has most recently been copied into buffer amy
-                // have been whitespace, and since we're folding whitespace
-                // out around this newline sequence, back buffer up over
-                // any whitespace it contains
-                while (is_blank(*(buffer - 1))) {
-                    buffer--;
-                }
-                continue;
-            }
-            *buffer++ = *c++;
-        }
-        // Finally, add the newline
-        *buffer++ = '\n';
-    }
-
-    // Terminate the buffer
-    *buffer = 0;
-}
-
-
-// Canonicalizes the resource into params->canonicalizedResource
-static void canonicalize_resource(const char *bucketName,
-                                  const char *subResource,
-                                  const char *urlEncodedKey,
-                                  char *buffer)
-{
-    int len = 0;
-
-    *buffer = 0;
-
-#define append(str) len += sprintf(&(buffer[len]), "%s", str)
-
-    if (bucketName && bucketName[0]) {
-        buffer[len++] = '/';
-        append(bucketName);
-    }
-
-    append("/");
-
-    if (urlEncodedKey && urlEncodedKey[0]) {
-        append(urlEncodedKey);
-    }
-
-    if (subResource && subResource[0]) {
-        append("?");
-        append(subResource);
-    }
-}
-
-
-// Convert an HttpRequestType to an HTTP Verb string
-static const char *http_request_type_to_verb(HttpRequestType requestType)
-{
-    switch (requestType) {
-    case HttpRequestTypeGET:
-        return "GET";
-    case HttpRequestTypeHEAD:
-        return "HEAD";
-    case HttpRequestTypePUT:
-    case HttpRequestTypeCOPY:
-        return "PUT";
-    default: // HttpRequestTypeDELETE
-        return "DELETE";
-    }
-}
-
-
-// Composes the Authorization header for the request
-static S3Status compose_auth_header(const RequestParams *params,
-                                    RequestComputedValues *values)
-{
-    // We allow for:
-    // 17 bytes for HTTP-Verb + \n
-    // 129 bytes for Content-MD5 + \n
-    // 129 bytes for Content-Type + \n
-    // 1 byte for empty Date + \n
-    // CanonicalizedAmzHeaders & CanonicalizedResource
-    char signbuf[17 + 129 + 129 + 1 + 
-                 (sizeof(values->canonicalizedAmzHeaders) - 1) +
-                 (sizeof(values->canonicalizedResource) - 1) + 1];
-    int len = 0;
-
-#define signbuf_append(format, ...)                             \
-    len += snprintf(&(signbuf[len]), sizeof(signbuf) - len,     \
-                    format, __VA_ARGS__)
-
-    signbuf_append
-        ("%s\n", http_request_type_to_verb(params->httpRequestType));
-
-    // For MD5 and Content-Type, use the value in the actual header, because
-    // it's already been trimmed
-    signbuf_append("%s\n", values->md5Header[0] ? 
-                   &(values->md5Header[sizeof("Content-MD5: ") - 1]) : "");
-
-    signbuf_append
-        ("%s\n", values->contentTypeHeader[0] ? 
-         &(values->contentTypeHeader[sizeof("Content-Type: ") - 1]) : "");
-
-    signbuf_append("%s", "\n"); // Date - we always use x-amz-date
-
-    signbuf_append("%s", values->canonicalizedAmzHeaders);
-
-    signbuf_append("%s", values->canonicalizedResource);
-
-    // Generate an HMAC-SHA-1 of the signbuf
-    unsigned char hmac[20];
-
-    HMAC_SHA1(hmac, (unsigned char *) params->bucketContext.secretAccessKey,
-              strlen(params->bucketContext.secretAccessKey),
-              (unsigned char *) signbuf, len);
-
-    // Now base-64 encode the results
-    char b64[((20 + 1) * 4) / 3];
-    int b64Len = base64Encode(hmac, 20, b64);
-    
-    snprintf(values->authorizationHeader, sizeof(values->authorizationHeader),
-             "Authorization: AWS %s:%.*s", params->bucketContext.accessKeyId,
-             b64Len, b64);
-
-    return S3StatusOK;
-}
-
-
-// Compose the URI to use for the request given the request parameters
-static S3Status compose_uri(char *buffer, int bufferSize,
-                            const S3BucketContext *bucketContext,
-                            const char *urlEncodedKey,
-                            const char *subResource, const char *queryParams)
-{
-    int len = 0;
-    
-#define uri_append(fmt, ...)                                                 \
-    do {                                                                     \
-        len += snprintf(&(buffer[len]), bufferSize - len, fmt, __VA_ARGS__); \
-        if (len >= bufferSize) {                                             \
-            return S3StatusUriTooLong;                                       \
-        }                                                                    \
-    } while (0)
-
-    uri_append("http%s://", 
-               (bucketContext->protocol == S3ProtocolHTTP) ? "" : "s");
-
-    const char *hostName = 
-        bucketContext->hostName ? bucketContext->hostName : defaultHostNameG;
-
-    if (bucketContext->bucketName && 
-        bucketContext->bucketName[0]) {
-        if (bucketContext->uriStyle == S3UriStyleVirtualHost) {
-            uri_append("%s.%s", bucketContext->bucketName, hostName);
-        }
-        else {
-            uri_append("%s/%s", hostName, bucketContext->bucketName);
-        }
-    }
-    else {
-        uri_append("%s", hostName);
-    }
-
-    uri_append("%s", "/");
-
-    uri_append("%s", urlEncodedKey);
-    
-    if (subResource && subResource[0]) {
-        uri_append("?%s", subResource);
-    }
-    
-    if (queryParams) {
-        uri_append("%s%s", (subResource && subResource[0]) ? "&" : "?",
-                   queryParams);
-    }
-    
-    return S3StatusOK;
-}
-
-
-// Sets up the curl handle given the completely computed RequestParams
-static S3Status setup_curl(Request *request,
-                           const RequestParams *params,
-                           const RequestComputedValues *values)
-{
-    CURLcode status;
-
-#define curl_easy_setopt_safe(opt, val)                                 \
-    if ((status = curl_easy_setopt                                      \
-         (request->curl, opt, val)) != CURLE_OK) {                      \
-        return S3StatusFailedToInitializeRequest;                       \
-    }
-
-    // Debugging only
-    // curl_easy_setopt_safe(CURLOPT_VERBOSE, 1);
-    
-    // Set private data to request for the benefit of S3RequestContext
-    curl_easy_setopt_safe(CURLOPT_PRIVATE, request);
-    
-    // Set header callback and data
-    curl_easy_setopt_safe(CURLOPT_HEADERDATA, request);
-    curl_easy_setopt_safe(CURLOPT_HEADERFUNCTION, &curl_header_func);
-    
-    // Set read callback, data, and readSize
-    curl_easy_setopt_safe(CURLOPT_READFUNCTION, &curl_read_func);
-    curl_easy_setopt_safe(CURLOPT_READDATA, request);
-    
-    // Set write callback and data
-    curl_easy_setopt_safe(CURLOPT_WRITEFUNCTION, &curl_write_func);
-    curl_easy_setopt_safe(CURLOPT_WRITEDATA, request);
-
-    // Ask curl to parse the Last-Modified header.  This is easier than
-    // parsing it ourselves.
-    curl_easy_setopt_safe(CURLOPT_FILETIME, 1);
-
-    // Curl docs suggest that this is necessary for multithreaded code.
-    // However, it also points out that DNS timeouts will not be honored
-    // during DNS lookup, which can be worked around by using the c-ares
-    // library, which we do not do yet.
-    curl_easy_setopt_safe(CURLOPT_NOSIGNAL, 1);
-
-    // Turn off Curl's built-in progress meter
-    curl_easy_setopt_safe(CURLOPT_NOPROGRESS, 1);
-
-    // xxx todo - support setting the proxy for Curl to use (can't use https
-    // for proxies though)
-
-    // xxx todo - support setting the network interface for Curl to use
-
-    // I think this is useful - we don't need interactive performance, we need
-    // to complete large operations quickly
-    curl_easy_setopt_safe(CURLOPT_TCP_NODELAY, 1);
-    
-    // Don't use Curl's 'netrc' feature
-    curl_easy_setopt_safe(CURLOPT_NETRC, CURL_NETRC_IGNORED);
-
-    // Don't verify S3's certificate, there are known to be issues with
-    // them sometimes
-    // xxx todo - support an option for verifying the S3 CA (default false)
-    curl_easy_setopt_safe(CURLOPT_SSL_VERIFYPEER, 0);
-
-    // Follow any redirection directives that S3 sends
-    curl_easy_setopt_safe(CURLOPT_FOLLOWLOCATION, 1);
-
-    // A safety valve in case S3 goes bananas with redirects
-    curl_easy_setopt_safe(CURLOPT_MAXREDIRS, 10);
-
-    // Set the User-Agent; maybe Amazon will track these?
-    curl_easy_setopt_safe(CURLOPT_USERAGENT, userAgentG);
-
-    // Set the low speed limit and time; we abort transfers that stay at
-    // less than 1K per second for more than 15 seconds.
-    // xxx todo - make these configurable
-    // xxx todo - allow configurable max send and receive speed
-    curl_easy_setopt_safe(CURLOPT_LOW_SPEED_LIMIT, 1024);
-    curl_easy_setopt_safe(CURLOPT_LOW_SPEED_TIME, 15);
-
-    // Append standard headers
-#define append_standard_header(fieldName)                               \
-    if (values-> fieldName [0]) {                                       \
-        request->headers = curl_slist_append(request->headers,          \
-                                             values-> fieldName);       \
-    }
-
-    // Would use CURLOPT_INFILESIZE_LARGE, but it is buggy in libcurl
-    if (params->httpRequestType == HttpRequestTypePUT) {
-        char header[256];
-        snprintf(header, sizeof(header), "Content-Length: %llu",
-                 (unsigned long long) params->toS3CallbackTotalSize);
-        request->headers = curl_slist_append(request->headers, header);
-        request->headers = curl_slist_append(request->headers, 
-                                             "Transfer-Encoding:");
-    }
-    else if (params->httpRequestType == HttpRequestTypeCOPY) {
-        request->headers = curl_slist_append(request->headers, 
-                                             "Transfer-Encoding:");
-    }
-    
-    append_standard_header(cacheControlHeader);
-    append_standard_header(contentTypeHeader);
-    append_standard_header(md5Header);
-    append_standard_header(contentDispositionHeader);
-    append_standard_header(contentEncodingHeader);
-    append_standard_header(expiresHeader);
-    append_standard_header(ifModifiedSinceHeader);
-    append_standard_header(ifUnmodifiedSinceHeader);
-    append_standard_header(ifMatchHeader);
-    append_standard_header(ifNoneMatchHeader);
-    append_standard_header(rangeHeader);
-    append_standard_header(authorizationHeader);
-
-    // Append x-amz- headers
-    int i;
-    for (i = 0; i < values->amzHeadersCount; i++) {
-        request->headers = 
-            curl_slist_append(request->headers, values->amzHeaders[i]);
-    }
-
-    // Set the HTTP headers
-    curl_easy_setopt_safe(CURLOPT_HTTPHEADER, request->headers);
-
-    // Set URI
-    curl_easy_setopt_safe(CURLOPT_URL, request->uri);
-
-    // Set request type.
-    switch (params->httpRequestType) {
-    case HttpRequestTypeHEAD:
-    curl_easy_setopt_safe(CURLOPT_NOBODY, 1);
-        break;
-    case HttpRequestTypePUT:
-    case HttpRequestTypeCOPY:
-        curl_easy_setopt_safe(CURLOPT_UPLOAD, 1);
-        break;
-    case HttpRequestTypeDELETE:
-    curl_easy_setopt_safe(CURLOPT_CUSTOMREQUEST, "DELETE");
-        break;
-    default: // HttpRequestTypeGET
-        break;
-    }
-    
-    return S3StatusOK;
-}
-
-
-static void request_deinitialize(Request *request)
-{
-    if (request->headers) {
-        curl_slist_free_all(request->headers);
-    }
-    
-    error_parser_deinitialize(&(request->errorParser));
-
-    // curl_easy_reset prevents connections from being re-used for some
-    // reason.  This makes HTTP Keep-Alive meaningless and is very bad for
-    // performance.  But it is necessary to allow curl to work properly.
-    // xxx todo figure out why
-    curl_easy_reset(request->curl);
-}
-
-
-static S3Status request_get(const RequestParams *params, 
-                            const RequestComputedValues *values,
-                            Request **reqReturn)
-{
-    Request *request = 0;
-    
-    // Try to get one from the request stack.  We hold the lock for the
-    // shortest time possible here.
-    pthread_mutex_lock(&requestStackMutexG);
-
-    if (requestStackCountG) {
-        request = requestStackG[--requestStackCountG];
-    }
-    
-    pthread_mutex_unlock(&requestStackMutexG);
-
-    // If we got one, deinitialize it for re-use
-    if (request) {
-        request_deinitialize(request);
-    }
-    // Else there wasn't one available in the request stack, so create one
-    else {
-        if (!(request = (Request *) malloc(sizeof(Request)))) {
-            return S3StatusOutOfMemory;
-        }
-        if (!(request->curl = curl_easy_init())) {
-            free(request);
-            return S3StatusFailedToInitializeRequest;
-        }
-    }
-
-    // Initialize the request
-    request->prev = 0;
-    request->next = 0;
-
-    // Request status is initialized to no error, will be updated whenever
-    // an error occurs
-    request->status = S3StatusOK;
-
-    S3Status status;
-                        
-    // Start out with no headers
-    request->headers = 0;
-
-    // Compute the URL
-    if ((status = compose_uri
-         (request->uri, sizeof(request->uri), 
-          &(params->bucketContext), values->urlEncodedKey,
-          params->subResource, params->queryParams)) != S3StatusOK) {
-        curl_easy_cleanup(request->curl);
-        free(request);
-        return status;
-    }
-
-    // Set all of the curl handle options
-    if ((status = setup_curl(request, params, values)) != S3StatusOK) {
-        curl_easy_cleanup(request->curl);
-        free(request);
-        return status;
-    }
-
-    request->propertiesCallback = params->propertiesCallback;
-
-    request->toS3Callback = params->toS3Callback;
-
-    request->toS3CallbackBytesRemaining = params->toS3CallbackTotalSize;
-
-    request->fromS3Callback = params->fromS3Callback;
-
-    request->completeCallback = params->completeCallback;
-
-    request->callbackData = params->callbackData;
-
-    response_headers_handler_initialize(&(request->responseHeadersHandler));
-
-    request->propertiesCallbackMade = 0;
-    
-    error_parser_initialize(&(request->errorParser));
-
-    *reqReturn = request;
-    
-    return S3StatusOK;
-}
-
-
-static void request_destroy(Request *request)
-{
-    request_deinitialize(request);
-    curl_easy_cleanup(request->curl);
-    free(request);
-}
-
-
-static void request_release(Request *request)
-{
-    pthread_mutex_lock(&requestStackMutexG);
-
-    // If the request stack is full, destroy this one
-    if (requestStackCountG == REQUEST_STACK_SIZE) {
-        pthread_mutex_unlock(&requestStackMutexG);
-        request_destroy(request);
-    }
-    // Else put this one at the front of the request stack; we do this because
-    // we want the most-recently-used curl handle to be re-used on the next
-    // request, to maximize our chances of re-using a TCP connection before it
-    // times out
-    else {
-        requestStackG[requestStackCountG++] = request;
-        pthread_mutex_unlock(&requestStackMutexG);
-    }
-}
-
-
-S3Status request_api_initialize(const char *userAgentInfo, int flags,
-                                const char *defaultHostName)
-{
-    if (curl_global_init(CURL_GLOBAL_ALL & 
-                         ~((flags & S3_INIT_WINSOCK) ? 0 : CURL_GLOBAL_WIN32))
-        != CURLE_OK) {
-        return S3StatusInternalError;
-    }
-
-    if (!defaultHostName) {
-        defaultHostName = S3_DEFAULT_HOSTNAME;
-    }
-
-    if (snprintf(defaultHostNameG, S3_MAX_HOSTNAME_SIZE, 
-                 "%s", defaultHostName) >= S3_MAX_HOSTNAME_SIZE) {
-        return S3StatusUriTooLong;
-    }
-
-    pthread_mutex_init(&requestStackMutexG, 0);
-
-    requestStackCountG = 0;
-
-    if (!userAgentInfo || !*userAgentInfo) {
-        userAgentInfo = "Unknown";
-    }
-
-    char platform[96];
-    struct utsname utsn;
-    if (uname(&utsn)) {
-        strncpy(platform, "Unknown", sizeof(platform));
-        // Because strncpy doesn't always zero terminate
-        platform[sizeof(platform) - 1] = 0;
-    }
-    else {
-        snprintf(platform, sizeof(platform), "%s%s%s", utsn.sysname, 
-                 utsn.machine[0] ? " " : "", utsn.machine);
-    }
-
-    snprintf(userAgentG, sizeof(userAgentG), 
-             "Mozilla/4.0 (Compatible; %s; libs3 %s.%s; %s)",
-             userAgentInfo, LIBS3_VER_MAJOR, LIBS3_VER_MINOR, platform);
-    
-    return S3StatusOK;
-}
-
-
-void request_api_deinitialize(void)
-{
-    pthread_mutex_destroy(&requestStackMutexG);
-
-    while (requestStackCountG--) {
-        request_destroy(requestStackG[requestStackCountG]);
-    }
-}
-
-
-void request_perform(const RequestParams *params, S3RequestContext *context)
-{
-    Request *request;
-    S3Status status;
-
-#define return_status(status)                                           \
-    (*(params->completeCallback))(status, 0, params->callbackData);     \
-    return
-
-    // These will hold the computed values
-    RequestComputedValues computed;
-
-    // Validate the bucket name
-    if (params->bucketContext.bucketName && 
-        ((status = S3_validate_bucket_name
-          (params->bucketContext.bucketName, 
-           params->bucketContext.uriStyle)) != S3StatusOK)) {
-        return_status(status);
-    }
-
-    // Compose the amz headers
-    if ((status = compose_amz_headers(params, &computed)) != S3StatusOK) {
-        return_status(status);
-    }
-
-    // Compose standard headers
-    if ((status = compose_standard_headers
-         (params, &computed)) != S3StatusOK) {
-        return_status(status);
-    }
-
-    // URL encode the key
-    if ((status = encode_key(params, &computed)) != S3StatusOK) {
-        return_status(status);
-    }
-
-    // Compute the canonicalized amz headers
-    canonicalize_amz_headers(&computed);
-
-    // Compute the canonicalized resource
-    canonicalize_resource(params->bucketContext.bucketName,
-                          params->subResource, computed.urlEncodedKey,
-                          computed.canonicalizedResource);
-
-    // Compose Authorization header
-    if ((status = compose_auth_header(params, &computed)) != S3StatusOK) {
-        return_status(status);
-    }
-    
-    // Get an initialized Request structure now
-    if ((status = request_get(params, &computed, &request)) != S3StatusOK) {
-        return_status(status);
-    }
-
-    // If a RequestContext was provided, add the request to the curl multi
-    if (context) {
-        CURLMcode code = curl_multi_add_handle(context->curlm, request->curl);
-        if (code == CURLM_OK) {
-            if (context->requests) {
-                request->prev = context->requests->prev;
-                request->next = context->requests;
-                context->requests->prev->next = request;
-                context->requests->prev = request;
-            }
-            else {
-                context->requests = request->next = request->prev = request;
-            }
-        }
-        else {
-            if (request->status == S3StatusOK) {
-                request->status = (code == CURLM_OUT_OF_MEMORY) ?
-                    S3StatusOutOfMemory : S3StatusInternalError;
-            }
-            request_finish(request);
-        }
-    }
-    // Else, perform the request immediately
-    else {
-        CURLcode code = curl_easy_perform(request->curl);
-        if ((code != CURLE_OK) && (request->status == S3StatusOK)) {
-            request->status = request_curl_code_to_status(code);
-        }
-        // Finish the request, ensuring that all callbacks have been made, and
-        // also releases the request
-        request_finish(request);
-    }
-}
-
-
-void request_finish(Request *request)
-{
-    // If we haven't detected this already, we now know that the headers are
-    // definitely done being read in
-    request_headers_done(request);
-    
-    // If there was no error processing the request, then possibly there was
-    // an S3 error parsed, which should be converted into the request status
-    if (request->status == S3StatusOK) {
-        error_parser_convert_status(&(request->errorParser), 
-                                    &(request->status));
-        // If there still was no error recorded, then it is possible that
-        // there was in fact an error but that there was no error XML
-        // detailing the error
-        if ((request->status == S3StatusOK) &&
-            ((request->httpResponseCode < 200) ||
-             (request->httpResponseCode > 299))) {
-            switch (request->httpResponseCode) {
-            case 0:
-                // This happens if the request never got any HTTP response
-                // headers at all, we call this a ConnectionFailed error
-                request->status = S3StatusConnectionFailed;
-                break;
-            case 100: // Some versions of libcurl erroneously set HTTP
-                      // status to this
-                break;
-            case 301:
-                request->status = S3StatusErrorPermanentRedirect;
-                break;
-            case 307:
-                request->status = S3StatusHttpErrorMovedTemporarily;
-                break;
-            case 400:
-                request->status = S3StatusHttpErrorBadRequest;
-                break;
-            case 403: 
-                request->status = S3StatusHttpErrorForbidden;
-                break;
-            case 404:
-                request->status = S3StatusHttpErrorNotFound;
-                break;
-            case 405:
-                request->status = S3StatusErrorMethodNotAllowed;
-                break;
-            case 409:
-                request->status = S3StatusHttpErrorConflict;
-                break;
-            case 411:
-                request->status = S3StatusErrorMissingContentLength;
-                break;
-            case 412:
-                request->status = S3StatusErrorPreconditionFailed;
-                break;
-            case 416:
-                request->status = S3StatusErrorInvalidRange;
-                break;
-            case 500:
-                request->status = S3StatusErrorInternalError;
-                break;
-            case 501:
-                request->status = S3StatusErrorNotImplemented;
-                break;
-            case 503:
-                request->status = S3StatusErrorSlowDown;
-                break;
-            default:
-                request->status = S3StatusHttpErrorUnknown;
-                break;
-            }
-        }
-    }
-
-    (*(request->completeCallback))
-        (request->status, &(request->errorParser.s3ErrorDetails),
-         request->callbackData);
-
-    request_release(request);
-}
-
-
-S3Status request_curl_code_to_status(CURLcode code)
-{
-    switch (code) {
-    case CURLE_OUT_OF_MEMORY:
-        return S3StatusOutOfMemory;
-    case CURLE_COULDNT_RESOLVE_PROXY:
-    case CURLE_COULDNT_RESOLVE_HOST:
-        return S3StatusNameLookupError;
-    case CURLE_COULDNT_CONNECT:
-        return S3StatusFailedToConnect;
-    case CURLE_WRITE_ERROR:
-    case CURLE_OPERATION_TIMEDOUT:
-        return S3StatusConnectionFailed;
-    case CURLE_PARTIAL_FILE:
-        return S3StatusOK;
-    case CURLE_SSL_CACERT:
-        return S3StatusServerFailedVerification;
-    default:
-        return S3StatusInternalError;
-    }
-}
-
-
-S3Status S3_generate_authenticated_query_string
-    (char *buffer, const S3BucketContext *bucketContext,
-     const char *key, int64_t expires, const char *resource)
-{
-#define MAX_EXPIRES (((int64_t) 1 << 31) - 1)
-    // S3 seems to only accept expiration dates up to the number of seconds
-    // representably by a signed 32-bit integer
-    if (expires < 0) {
-        expires = MAX_EXPIRES;
-    }
-    else if (expires > MAX_EXPIRES) {
-        expires = MAX_EXPIRES;
-    }
-
-    // xxx todo: rework this so that it can be incorporated into shared code
-    // with request_perform().  It's really unfortunate that this code is not
-    // shared with request_perform().
-
-    // URL encode the key
-    char urlEncodedKey[S3_MAX_KEY_SIZE * 3];
-    if (key) {
-        urlEncode(urlEncodedKey, key, strlen(key));
-    }
-    else {
-        urlEncodedKey[0] = 0;
-    }
-
-    // Compute canonicalized resource
-    char canonicalizedResource[MAX_CANONICALIZED_RESOURCE_SIZE];
-    canonicalize_resource(bucketContext->bucketName, resource, urlEncodedKey,
-                          canonicalizedResource);
-                          
-    // We allow for:
-    // 17 bytes for HTTP-Verb + \n
-    // 1 byte for empty Content-MD5 + \n
-    // 1 byte for empty Content-Type + \n
-    // 20 bytes for Expires + \n
-    // 0 bytes for CanonicalizedAmzHeaders
-    // CanonicalizedResource
-    char signbuf[17 + 1 + 1 + 1 + 20 + sizeof(canonicalizedResource) + 1];
-    int len = 0;
-
-#define signbuf_append(format, ...)                             \
-    len += snprintf(&(signbuf[len]), sizeof(signbuf) - len,     \
-                    format, __VA_ARGS__)
-
-    signbuf_append("%s\n", "GET"); // HTTP-Verb
-    signbuf_append("%s\n", ""); // Content-MD5
-    signbuf_append("%s\n", ""); // Content-Type
-    signbuf_append("%llu\n", (unsigned long long) expires);
-    signbuf_append("%s", canonicalizedResource);
-
-    // Generate an HMAC-SHA-1 of the signbuf
-    unsigned char hmac[20];
-
-    HMAC_SHA1(hmac, (unsigned char *) bucketContext->secretAccessKey,
-              strlen(bucketContext->secretAccessKey),
-              (unsigned char *) signbuf, len);
-
-    // Now base-64 encode the results
-    char b64[((20 + 1) * 4) / 3];
-    int b64Len = base64Encode(hmac, 20, b64);
-
-    // Now urlEncode that
-    char signature[sizeof(b64) * 3];
-    urlEncode(signature, b64, b64Len);
-
-    // Finally, compose the uri, with params:
-    // ?AWSAccessKeyId=xxx[&Expires=]&Signature=xxx
-    char queryParams[sizeof("AWSAccessKeyId=") + 20 + 
-                     sizeof("&Expires=") + 20 + 
-                     sizeof("&Signature=") + sizeof(signature) + 1];
-
-    sprintf(queryParams, "AWSAccessKeyId=%s&Expires=%ld&Signature=%s",
-            bucketContext->accessKeyId, (long) expires, signature);
-
-    return compose_uri(buffer, S3_MAX_AUTHENTICATED_QUERY_STRING_SIZE,
-                       bucketContext, urlEncodedKey, resource, queryParams);
-}
diff --git a/src/libs3/src/request_context.c b/src/libs3/src/request_context.c
deleted file mode 100644
index ae48e55..0000000
--- a/src/libs3/src/request_context.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/** **************************************************************************
- * request_context.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <curl/curl.h>
-#include <stdlib.h>
-#include <sys/select.h>
-#include "request.h"
-#include "request_context.h"
-
-
-S3Status S3_create_request_context(S3RequestContext **requestContextReturn)
-{
-    *requestContextReturn = 
-        (S3RequestContext *) malloc(sizeof(S3RequestContext));
-    
-    if (!*requestContextReturn) {
-        return S3StatusOutOfMemory;
-    }
-    
-    if (!((*requestContextReturn)->curlm = curl_multi_init())) {
-        free(*requestContextReturn);
-        return S3StatusOutOfMemory;
-    }
-
-    (*requestContextReturn)->requests = 0;
-
-    return S3StatusOK;
-}
-
-
-void S3_destroy_request_context(S3RequestContext *requestContext)
-{
-    curl_multi_cleanup(requestContext->curlm);
-
-    // For each request in the context, call back its done method with
-    // 'interrupted' status
-    Request *r = requestContext->requests, *rFirst = r;
-    
-    if (r) do {
-        r->status = S3StatusInterrupted;
-        Request *rNext = r->next;
-        request_finish(r);
-        r = rNext;
-    } while (r != rFirst);
-
-    free(requestContext);
-}
-
-
-S3Status S3_runall_request_context(S3RequestContext *requestContext)
-{
-    int requestsRemaining;
-    do {
-        fd_set readfds, writefds, exceptfds;
-        FD_ZERO(&readfds);
-        FD_ZERO(&writefds);
-        FD_ZERO(&exceptfds);
-        int maxfd;
-        S3Status status = S3_get_request_context_fdsets
-            (requestContext, &readfds, &writefds, &exceptfds, &maxfd);
-        if (status != S3StatusOK) {
-            return status;
-        }
-        // curl will return -1 if it hasn't even created any fds yet because
-        // none of the connections have started yet.  In this case, don't
-        // do the select at all, because it will wait forever; instead, just
-        // skip it and go straight to running the underlying CURL handles
-        if (maxfd != -1) {
-            int64_t timeout = S3_get_request_context_timeout(requestContext);
-            struct timeval tv = { timeout / 1000, (timeout % 1000) * 1000 };
-            select(maxfd + 1, &readfds, &writefds, &exceptfds,
-                   (timeout == -1) ? 0 : &tv);
-        }
-        status = S3_runonce_request_context(requestContext,
-                                            &requestsRemaining);
-        if (status != S3StatusOK) {
-            return status;
-        }
-    } while (requestsRemaining);
-    
-    return S3StatusOK;
-}
-
-
-S3Status S3_runonce_request_context(S3RequestContext *requestContext, 
-                                    int *requestsRemainingReturn)
-{
-    CURLMcode status;
-
-    do {
-        status = curl_multi_perform(requestContext->curlm,
-                                    requestsRemainingReturn);
-
-        switch (status) {
-        case CURLM_OK:
-        case CURLM_CALL_MULTI_PERFORM:
-            break;
-        case CURLM_OUT_OF_MEMORY:
-            return S3StatusOutOfMemory;
-        default:
-            return S3StatusInternalError;
-        }
-
-        CURLMsg *msg;
-        int junk;
-        while ((msg = curl_multi_info_read(requestContext->curlm, &junk))) {
-            if (msg->msg != CURLMSG_DONE) {
-                return S3StatusInternalError;
-            }
-            Request *request;
-            if (curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, 
-                                  (char **) (char *) &request) != CURLE_OK) {
-                return S3StatusInternalError;
-            }
-            // Remove the request from the list of requests
-            if (request->prev == request->next) {
-                // It was the only one on the list
-                requestContext->requests = 0;
-            }
-            else {
-                // It doesn't matter what the order of them are, so just in
-                // case request was at the head of the list, put the one after
-                // request to the head of the list
-                requestContext->requests = request->next;
-                request->prev->next = request->next;
-                request->next->prev = request->prev;
-            }
-            if ((msg->data.result != CURLE_OK) &&
-                (request->status == S3StatusOK)) {
-                request->status = request_curl_code_to_status
-                    (msg->data.result);
-            }
-            if (curl_multi_remove_handle(requestContext->curlm, 
-                                         msg->easy_handle) != CURLM_OK) {
-                return S3StatusInternalError;
-            }
-            // Finish the request, ensuring that all callbacks have been made,
-            // and also releases the request
-            request_finish(request);
-            // Now, since a callback was made, there may be new requests 
-            // queued up to be performed immediately, so do so
-            status = CURLM_CALL_MULTI_PERFORM;
-        }
-    } while (status == CURLM_CALL_MULTI_PERFORM);
-
-    return S3StatusOK;
-}
-
-S3Status S3_get_request_context_fdsets(S3RequestContext *requestContext,
-                                       fd_set *readFdSet, fd_set *writeFdSet,
-                                       fd_set *exceptFdSet, int *maxFd)
-{
-    return ((curl_multi_fdset(requestContext->curlm, readFdSet, writeFdSet,
-                              exceptFdSet, maxFd) == CURLM_OK) ?
-            S3StatusOK : S3StatusInternalError);
-}
-
-int64_t S3_get_request_context_timeout(S3RequestContext *requestContext)
-{
-    long timeout;
-
-    if (curl_multi_timeout(requestContext->curlm, &timeout) != CURLM_OK) {
-        timeout = 0;
-    }
-    
-    return timeout;
-}
diff --git a/src/libs3/src/response_headers_handler.c b/src/libs3/src/response_headers_handler.c
deleted file mode 100644
index e506ea4..0000000
--- a/src/libs3/src/response_headers_handler.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/** **************************************************************************
- * response_headers_handler.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <ctype.h>
-#include <string.h>
-#include "response_headers_handler.h"
-
-
-void response_headers_handler_initialize(ResponseHeadersHandler *handler)
-{
-    handler->responseProperties.requestId = 0;
-    handler->responseProperties.requestId2 = 0;
-    handler->responseProperties.contentType = 0;
-    handler->responseProperties.contentLength = 0;
-    handler->responseProperties.server = 0;
-    handler->responseProperties.eTag = 0;
-    handler->responseProperties.lastModified = -1;
-    handler->responseProperties.metaDataCount = 0;
-    handler->responseProperties.metaData = 0;
-    handler->done = 0;
-    string_multibuffer_initialize(handler->responsePropertyStrings);
-    string_multibuffer_initialize(handler->responseMetaDataStrings);
-}
-
-
-void response_headers_handler_add(ResponseHeadersHandler *handler,
-                                  char *header, int len)
-{
-    S3ResponseProperties *responseProperties = &(handler->responseProperties);
-    char *end = &(header[len]);
-    
-    // Curl might call back the header function after the body has been
-    // received, for 'chunked encoded' contents.  We don't handle this as of
-    // yet, and it's not clear that it would ever be useful.
-    if (handler->done) {
-        return;
-    }
-
-    // If we've already filled up the response headers, ignore this data.
-    // This sucks, but it shouldn't happen - S3 should not be sending back
-    // really long headers.
-    if (handler->responsePropertyStringsSize == 
-        (sizeof(handler->responsePropertyStrings) - 1)) {
-        return;
-    }
-
-    // It should not be possible to have a header line less than 3 long
-    if (len < 3) {
-        return;
-    }
-
-    // Skip whitespace at beginning of header; there never should be any,
-    // but just to be safe
-    while (is_blank(*header)) {
-        header++;
-    }
-
-    // The header must end in \r\n, so skip back over it, and also over any
-    // trailing whitespace
-    end -= 3;
-    while ((end > header) && is_blank(*end)) {
-        end--;
-    }
-    if (!is_blank(*end)) {
-        end++;
-    }
-
-    if (end == header) {
-        // totally bogus
-        return;
-    }
-
-    *end = 0;
-    
-    // Find the colon to split the header up
-    char *c = header;
-    while (*c && (*c != ':')) {
-        c++;
-    }
-    
-    int namelen = c - header;
-
-    // Now walk c past the colon
-    c++;
-    // Now skip whitespace to the beginning of the value
-    while (is_blank(*c)) {
-        c++;
-    }
-
-    int valuelen = (end - c) + 1, fit;
-
-    if (!strncmp(header, "x-amz-request-id", namelen)) {
-        responseProperties->requestId = 
-            string_multibuffer_current(handler->responsePropertyStrings);
-        string_multibuffer_add(handler->responsePropertyStrings, c, 
-                               valuelen, fit);
-    }
-    else if (!strncmp(header, "x-amz-id-2", namelen)) {
-        responseProperties->requestId2 = 
-            string_multibuffer_current(handler->responsePropertyStrings);
-        string_multibuffer_add(handler->responsePropertyStrings, c, 
-                               valuelen, fit);
-    }
-    else if (!strncmp(header, "Content-Type", namelen)) {
-        responseProperties->contentType = 
-            string_multibuffer_current(handler->responsePropertyStrings);
-        string_multibuffer_add(handler->responsePropertyStrings, c, 
-                               valuelen, fit);
-    }
-    else if (!strncmp(header, "Content-Length", namelen)) {
-        handler->responseProperties.contentLength = 0;
-        while (*c) {
-            handler->responseProperties.contentLength *= 10;
-            handler->responseProperties.contentLength += (*c++ - '0');
-        }
-    }
-    else if (!strncmp(header, "Server", namelen)) {
-        responseProperties->server = 
-            string_multibuffer_current(handler->responsePropertyStrings);
-        string_multibuffer_add(handler->responsePropertyStrings, c, 
-                               valuelen, fit);
-    }
-    else if (!strncmp(header, "ETag", namelen)) {
-        responseProperties->eTag = 
-            string_multibuffer_current(handler->responsePropertyStrings);
-        string_multibuffer_add(handler->responsePropertyStrings, c, 
-                               valuelen, fit);
-    }
-    else if (!strncmp(header, S3_METADATA_HEADER_NAME_PREFIX, 
-                      sizeof(S3_METADATA_HEADER_NAME_PREFIX) - 1)) {
-        // Make sure there is room for another x-amz-meta header
-        if (handler->responseProperties.metaDataCount ==
-            sizeof(handler->responseMetaData)) {
-            return;
-        }
-        // Copy the name in
-        char *metaName = &(header[sizeof(S3_METADATA_HEADER_NAME_PREFIX) - 1]);
-        int metaNameLen = 
-            (namelen - (sizeof(S3_METADATA_HEADER_NAME_PREFIX) - 1));
-        char *copiedName = 
-            string_multibuffer_current(handler->responseMetaDataStrings);
-        string_multibuffer_add(handler->responseMetaDataStrings, metaName,
-                               metaNameLen, fit);
-        if (!fit) {
-            return;
-        }
-
-        // Copy the value in
-        char *copiedValue = 
-            string_multibuffer_current(handler->responseMetaDataStrings);
-        string_multibuffer_add(handler->responseMetaDataStrings,
-                               c, valuelen, fit);
-        if (!fit) {
-            return;
-        }
-
-        if (!handler->responseProperties.metaDataCount) {
-            handler->responseProperties.metaData = 
-                handler->responseMetaData;
-        }
-
-        S3NameValue *metaHeader = 
-            &(handler->responseMetaData
-              [handler->responseProperties.metaDataCount++]);
-        metaHeader->name = copiedName;
-        metaHeader->value = copiedValue;
-    }
-}
-
-
-void response_headers_handler_done(ResponseHeadersHandler *handler, CURL *curl)
-{
-    // Now get the last modification time from curl, since it's easiest to let
-    // curl parse it
-    time_t lastModified;
-    if (curl_easy_getinfo
-        (curl, CURLINFO_FILETIME, &lastModified) == CURLE_OK) {
-        handler->responseProperties.lastModified = lastModified;
-    }
-    
-    handler->done = 1;
-}
diff --git a/src/libs3/src/s3.c b/src/libs3/src/s3.c
deleted file mode 100644
index 11f54af..0000000
--- a/src/libs3/src/s3.c
+++ /dev/null
@@ -1,2787 +0,0 @@
-/** **************************************************************************
- * s3.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-/**
- * This is a 'driver' program that simply converts command-line input into
- * calls to libs3 functions, and prints the results.
- **/
-
-#include <ctype.h>
-#include <getopt.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-#include "libs3.h"
-
-// Some Windows stuff
-#ifndef FOPEN_EXTRA_FLAGS
-#define FOPEN_EXTRA_FLAGS ""
-#endif
-
-// Some Unix stuff (to work around Windows issues)
-#ifndef SLEEP_UNITS_PER_SECOND
-#define SLEEP_UNITS_PER_SECOND 1
-#endif
-
-// Also needed for Windows, because somehow MinGW doesn't define this
-extern int putenv(char *);
-
-
-// Command-line options, saved as globals ------------------------------------
-
-static int forceG = 0;
-static int showResponsePropertiesG = 0;
-static S3Protocol protocolG = S3ProtocolHTTPS;
-static S3UriStyle uriStyleG = S3UriStylePath;
-static int retriesG = 5;
-
-
-// Environment variables, saved as globals ----------------------------------
-
-static const char *accessKeyIdG = 0;
-static const char *secretAccessKeyG = 0;
-
-
-// Request results, saved as globals -----------------------------------------
-
-static int statusG = 0;
-static char errorDetailsG[4096] = { 0 };
-
-
-// Other globals -------------------------------------------------------------
-
-static char putenvBufG[256];
-
-
-// Option prefixes -----------------------------------------------------------
-
-#define LOCATION_PREFIX "location="
-#define LOCATION_PREFIX_LEN (sizeof(LOCATION_PREFIX) - 1)
-#define CANNED_ACL_PREFIX "cannedAcl="
-#define CANNED_ACL_PREFIX_LEN (sizeof(CANNED_ACL_PREFIX) - 1)
-#define PREFIX_PREFIX "prefix="
-#define PREFIX_PREFIX_LEN (sizeof(PREFIX_PREFIX) - 1)
-#define MARKER_PREFIX "marker="
-#define MARKER_PREFIX_LEN (sizeof(MARKER_PREFIX) - 1)
-#define DELIMITER_PREFIX "delimiter="
-#define DELIMITER_PREFIX_LEN (sizeof(DELIMITER_PREFIX) - 1)
-#define MAXKEYS_PREFIX "maxkeys="
-#define MAXKEYS_PREFIX_LEN (sizeof(MAXKEYS_PREFIX) - 1)
-#define FILENAME_PREFIX "filename="
-#define FILENAME_PREFIX_LEN (sizeof(FILENAME_PREFIX) - 1)
-#define CONTENT_LENGTH_PREFIX "contentLength="
-#define CONTENT_LENGTH_PREFIX_LEN (sizeof(CONTENT_LENGTH_PREFIX) - 1)
-#define CACHE_CONTROL_PREFIX "cacheControl="
-#define CACHE_CONTROL_PREFIX_LEN (sizeof(CACHE_CONTROL_PREFIX) - 1)
-#define CONTENT_TYPE_PREFIX "contentType="
-#define CONTENT_TYPE_PREFIX_LEN (sizeof(CONTENT_TYPE_PREFIX) - 1)
-#define MD5_PREFIX "md5="
-#define MD5_PREFIX_LEN (sizeof(MD5_PREFIX) - 1)
-#define CONTENT_DISPOSITION_FILENAME_PREFIX "contentDispositionFilename="
-#define CONTENT_DISPOSITION_FILENAME_PREFIX_LEN \
-    (sizeof(CONTENT_DISPOSITION_FILENAME_PREFIX) - 1)
-#define CONTENT_ENCODING_PREFIX "contentEncoding="
-#define CONTENT_ENCODING_PREFIX_LEN (sizeof(CONTENT_ENCODING_PREFIX) - 1)
-#define EXPIRES_PREFIX "expires="
-#define EXPIRES_PREFIX_LEN (sizeof(EXPIRES_PREFIX) - 1)
-#define X_AMZ_META_PREFIX "x-amz-meta-"
-#define X_AMZ_META_PREFIX_LEN (sizeof(X_AMZ_META_PREFIX) - 1)
-#define IF_MODIFIED_SINCE_PREFIX "ifModifiedSince="
-#define IF_MODIFIED_SINCE_PREFIX_LEN (sizeof(IF_MODIFIED_SINCE_PREFIX) - 1)
-#define IF_NOT_MODIFIED_SINCE_PREFIX "ifNotmodifiedSince="
-#define IF_NOT_MODIFIED_SINCE_PREFIX_LEN \
-    (sizeof(IF_NOT_MODIFIED_SINCE_PREFIX) - 1)
-#define IF_MATCH_PREFIX "ifMatch="
-#define IF_MATCH_PREFIX_LEN (sizeof(IF_MATCH_PREFIX) - 1)
-#define IF_NOT_MATCH_PREFIX "ifNotMatch="
-#define IF_NOT_MATCH_PREFIX_LEN (sizeof(IF_NOT_MATCH_PREFIX) - 1)
-#define START_BYTE_PREFIX "startByte="
-#define START_BYTE_PREFIX_LEN (sizeof(START_BYTE_PREFIX) - 1)
-#define BYTE_COUNT_PREFIX "byteCount="
-#define BYTE_COUNT_PREFIX_LEN (sizeof(BYTE_COUNT_PREFIX) - 1)
-#define ALL_DETAILS_PREFIX "allDetails="
-#define ALL_DETAILS_PREFIX_LEN (sizeof(ALL_DETAILS_PREFIX) - 1)
-#define NO_STATUS_PREFIX "noStatus="
-#define NO_STATUS_PREFIX_LEN (sizeof(NO_STATUS_PREFIX) - 1)
-#define RESOURCE_PREFIX "resource="
-#define RESOURCE_PREFIX_LEN (sizeof(RESOURCE_PREFIX) - 1)
-#define TARGET_BUCKET_PREFIX "targetBucket="
-#define TARGET_BUCKET_PREFIX_LEN (sizeof(TARGET_BUCKET_PREFIX) - 1)
-#define TARGET_PREFIX_PREFIX "targetPrefix="
-#define TARGET_PREFIX_PREFIX_LEN (sizeof(TARGET_PREFIX_PREFIX) - 1)
-
-
-// util ----------------------------------------------------------------------
-
-static void S3_init(void)
-{
-    S3Status status;
-    const char *hostname = getenv("S3_HOSTNAME");
-    
-    if ((status = S3_initialize("s3", S3_INIT_ALL, hostname))
-        != S3StatusOK) {
-        fprintf(stderr, "Failed to initialize libs3: %s\n", 
-                S3_get_status_name(status));
-        exit(-1);
-    }
-}
-
-
-static void printError(void)
-{
-    if (statusG < S3StatusErrorAccessDenied) {
-        fprintf(stderr, "\nERROR: %s\n", S3_get_status_name(statusG));
-    }
-    else {
-        fprintf(stderr, "\nERROR: %s\n", S3_get_status_name(statusG));
-        fprintf(stderr, "%s\n", errorDetailsG);
-    }
-}
-
-
-static void usageExit(FILE *out)
-{
-    fprintf(out,
-"\n Options:\n"
-"\n"
-"   Command Line:\n"
-"\n"
-"   -f/--force           : force operation despite warnings\n"
-"   -h/--vhost-style     : use virtual-host-style URIs (default is "
-                          "path-style)\n"
-"   -u/--unencrypted     : unencrypted (use HTTP instead of HTTPS)\n"
-"   -s/--show-properties : show response properties on stdout\n"
-"   -r/--retries         : retry retryable failures this number of times\n"
-"                          (default is 5)\n"
-"\n"
-"   Environment:\n"
-"\n"
-"   S3_ACCESS_KEY_ID     : S3 access key ID (required)\n"
-"   S3_SECRET_ACCESS_KEY : S3 secret access key (required)\n"
-"   S3_HOSTNAME          : specify alternative S3 host (optional)\n"
-"\n" 
-" Commands (with <required parameters> and [optional parameters]) :\n"
-"\n"
-"   (NOTE: all command parameters take a value and are specified using the\n"
-"          pattern parameter=value)\n"
-"\n"
-"   help                 : Prints this help text\n"
-"\n"
-"   list                 : Lists owned buckets\n"
-"     [allDetails]       : Show full details\n"
-"\n"
-"   test                 : Tests a bucket for existence and accessibility\n"
-"     <bucket>           : Bucket to test\n"
-"\n"
-"   create               : Create a new bucket\n"
-"     <bucket>           : Bucket to create\n"
-"     [cannedAcl]        : Canned ACL for the bucket (see Canned ACLs)\n"
-"     [location]         : Location for bucket (for example, EU)\n"
-"\n"
-"   delete               : Delete a bucket or key\n"
-"     <bucket>[/<key>]   : Bucket or bucket/key to delete\n"
-"\n"
-"   list                 : List bucket contents\n"
-"     <bucket>           : Bucket to list\n"
-"     [prefix]           : Prefix for results set\n"
-"     [marker]           : Where in results set to start listing\n"
-"     [delimiter]        : Delimiter for rolling up results set\n"
-"     [maxkeys]          : Maximum number of keys to return in results set\n"
-"     [allDetails]       : Show full details for each key\n"
-"\n"
-"   getacl               : Get the ACL of a bucket or key\n"
-"     <bucket>[/<key>]   : Bucket or bucket/key to get the ACL of\n"
-"     [filename]         : Output filename for ACL (default is stdout)\n"
-"\n"
-"   setacl               : Set the ACL of a bucket or key\n"
-"     <bucket>[/<key>]   : Bucket or bucket/key to set the ACL of\n"
-"     [filename]         : Input filename for ACL (default is stdin)\n"
-"\n"
-"   getlogging           : Get the logging status of a bucket\n"
-"     <bucket>           : Bucket to get the logging status of\n"
-"     [filename]         : Output filename for ACL (default is stdout)\n"
-"\n"
-"   setlogging           : Set the logging status of a bucket\n"
-"     <bucket>           : Bucket to set the logging status of\n"
-"     [targetBucket]     : Target bucket to log to; if not present, disables\n"
-"                          logging\n"
-"     [targetPrefix]     : Key prefix to use for logs\n"
-"     [filename]         : Input filename for ACL (default is stdin)\n"
-"\n"
-"   put                  : Puts an object\n"
-"     <bucket>/<key>     : Bucket/key to put object to\n"
-"     [filename]         : Filename to read source data from "
-                          "(default is stdin)\n"
-"     [contentLength]    : How many bytes of source data to put (required if\n"
-"                          source file is stdin)\n"
-"     [cacheControl]     : Cache-Control HTTP header string to associate with\n"
-"                          object\n"
-"     [contentType]      : Content-Type HTTP header string to associate with\n"
-"                          object\n"
-"     [md5]              : MD5 for validating source data\n"
-"     [contentDispositionFilename] : Content-Disposition filename string to\n"
-"                          associate with object\n"
-"     [contentEncoding]  : Content-Encoding HTTP header string to associate\n"
-"                          with object\n"
-"     [expires]          : Expiration date to associate with object\n"
-"     [cannedAcl]        : Canned ACL for the object (see Canned ACLs)\n"
-"     [x-amz-meta-...]]  : Metadata headers to associate with the object\n"
-"\n"
-"   copy                 : Copies an object; if any options are set, the "
-                          "entire\n"
-"                          metadata of the object is replaced\n"
-"     <sourcebucket>/<sourcekey> : Source bucket/key\n"
-"     <destbucket>/<destkey> : Destination bucket/key\n"
-"     [cacheControl]     : Cache-Control HTTP header string to associate with\n"
-"                          object\n"
-"     [contentType]      : Content-Type HTTP header string to associate with\n"
-"                          object\n"
-"     [contentDispositionFilename] : Content-Disposition filename string to\n"
-"                          associate with object\n"
-"     [contentEncoding]  : Content-Encoding HTTP header string to associate\n"
-"                          with object\n"
-"     [expires]          : Expiration date to associate with object\n"
-"     [cannedAcl]        : Canned ACL for the object (see Canned ACLs)\n"
-"     [x-amz-meta-...]]  : Metadata headers to associate with the object\n"
-"\n"
-"   get                  : Gets an object\n"
-"     <buckey>/<key>     : Bucket/key of object to get\n"
-"     [filename]         : Filename to write object data to (required if -s\n"
-"                          command line parameter was used)\n"
-"     [ifModifiedSince]  : Only return the object if it has been modified "
-                          "since\n"
-"                          this date\n"
-"     [ifNotmodifiedSince] : Only return the object if it has not been "
-                          "modified\n"
-"                          since this date\n"
-"     [ifMatch]          : Only return the object if its ETag header matches\n"
-"                          this string\n"
-"     [ifNotMatch]       : Only return the object if its ETag header does "
-                          "not\n"
-"                          match this string\n"
-"     [startByte]        : First byte of byte range to return\n"
-"     [byteCount]        : Number of bytes of byte range to return\n"
-"\n"
-"   head                 : Gets only the headers of an object, implies -s\n"
-"     <bucket>/<key>     : Bucket/key of object to get headers of\n"
-"\n"
-"   gqs                  : Generates an authenticated query string\n"
-"     <bucket>[/<key>]   : Bucket or bucket/key to generate query string for\n"
-"     [expires]          : Expiration date for query string\n"
-"     [resource]         : Sub-resource of key for query string, without a\n"
-"                          leading '?', for example, \"torrent\"\n"
-"\n"
-" Canned ACLs:\n"
-"\n"
-"  The following canned ACLs are supported:\n"
-"    private (default), public-read, public-read-write, authenticated-read\n"
-"\n"
-" ACL Format:\n"
-"\n"
-"  For the getacl and setacl commands, the format of the ACL list is:\n"
-"  1) An initial line giving the owner id in this format:\n"
-"       OwnerID <Owner ID> <Owner Display Name>\n"
-"  2) Optional header lines, giving column headers, starting with the\n"
-"     word \"Type\", or with some number of dashes\n"
-"  3) Grant lines, of the form:\n"
-"       <Grant Type> (whitespace) <Grantee> (whitespace) <Permission>\n"
-"     where Grant Type is one of: Email, UserID, or Group, and\n"
-"     Grantee is the identification of the grantee based on this type,\n"
-"     and Permission is one of: READ, WRITE, READ_ACP, or FULL_CONTROL.\n"
-"\n"
-"  Note that the easiest way to modify an ACL is to first get it, saving it\n"
-"  into a file, then modifying the file, and then setting the modified file\n"
-"  back as the new ACL for the bucket/object.\n"
-"\n"
-" Date Format:\n"
-"\n"
-"  The format for dates used in parameters is as ISO 8601 dates, i.e.\n"
-"  YYYY-MM-DDTHH:MM:SS[+/-dd:dd].  Examples:\n"
-"      2008-07-29T20:36:14\n"
-"      2008-07-29T20:36:14-06:00\n"
-"      2008-07-29T20:36:14+11:30\n"
-"\n");
-
-    exit(-1);
-}
-
-
-static uint64_t convertInt(const char *str, const char *paramName)
-{
-    uint64_t ret = 0;
-
-    while (*str) {
-        if (!isdigit(*str)) {
-            fprintf(stderr, "\nERROR: Nondigit in %s parameter: %c\n", 
-                    paramName, *str);
-            usageExit(stderr);
-        }
-        ret *= 10;
-        ret += (*str++ - '0');
-    }
-
-    return ret;
-}
-
-
-typedef struct growbuffer
-{
-    // The total number of bytes, and the start byte
-    int size;
-    // The start byte
-    int start;
-    // The blocks
-    char data[64 * 1024];
-    struct growbuffer *prev, *next;
-} growbuffer;
-
-
-// returns nonzero on success, zero on out of memory
-static int growbuffer_append(growbuffer **gb, const char *data, int dataLen)
-{
-    while (dataLen) {
-        growbuffer *buf = *gb ? (*gb)->prev : 0;
-        if (!buf || (buf->size == sizeof(buf->data))) {
-            buf = (growbuffer *) malloc(sizeof(growbuffer));
-            if (!buf) {
-                return 0;
-            }
-            buf->size = 0;
-            buf->start = 0;
-            if (*gb && (*gb)->prev) {
-                buf->prev = (*gb)->prev;
-                buf->next = *gb;
-                (*gb)->prev->next = buf;
-                (*gb)->prev = buf;
-            }
-            else {
-                buf->prev = buf->next = buf;
-                *gb = buf;
-            }
-        }
-
-        int toCopy = (sizeof(buf->data) - buf->size);
-        if (toCopy > dataLen) {
-            toCopy = dataLen;
-        }
-
-        memcpy(&(buf->data[buf->size]), data, toCopy);
-        
-        buf->size += toCopy, data += toCopy, dataLen -= toCopy;
-    }
-
-    return 1;
-}
-
-
-static void growbuffer_read(growbuffer **gb, int amt, int *amtReturn, 
-                            char *buffer)
-{
-    *amtReturn = 0;
-
-    growbuffer *buf = *gb;
-
-    if (!buf) {
-        return;
-    }
-
-    *amtReturn = (buf->size > amt) ? amt : buf->size;
-
-    memcpy(buffer, &(buf->data[buf->start]), *amtReturn);
-    
-    buf->start += *amtReturn, buf->size -= *amtReturn;
-
-    if (buf->size == 0) {
-        if (buf->next == buf) {
-            *gb = 0;
-        }
-        else {
-            *gb = buf->next;
-            buf->prev->next = buf->next;
-            buf->next->prev = buf->prev;
-        }
-        free(buf);
-    }
-}
-
-
-static void growbuffer_destroy(growbuffer *gb)
-{
-    growbuffer *start = gb;
-
-    while (gb) {
-        growbuffer *next = gb->next;
-        free(gb);
-        gb = (next == start) ? 0 : next;
-    }
-}
-
-
-// Convenience utility for making the code look nicer.  Tests a string
-// against a format; only the characters specified in the format are
-// checked (i.e. if the string is longer than the format, the string still
-// checks out ok).  Format characters are:
-// d - is a digit
-// anything else - is that character
-// Returns nonzero the string checks out, zero if it does not.
-static int checkString(const char *str, const char *format)
-{
-    while (*format) {
-        if (*format == 'd') {
-            if (!isdigit(*str)) {
-                return 0;
-            }
-        }
-        else if (*str != *format) {
-            return 0;
-        }
-        str++, format++;
-    }
-
-    return 1;
-}
-
-
-static int64_t parseIso8601Time(const char *str)
-{
-    // Check to make sure that it has a valid format
-    if (!checkString(str, "dddd-dd-ddTdd:dd:dd")) {
-        return -1;
-    }
-
-#define nextnum() (((*str - '0') * 10) + (*(str + 1) - '0'))
-
-    // Convert it
-    struct tm stm;
-    memset(&stm, 0, sizeof(stm));
-
-    stm.tm_year = (nextnum() - 19) * 100;
-    str += 2;
-    stm.tm_year += nextnum();
-    str += 3;
-
-    stm.tm_mon = nextnum() - 1;
-    str += 3;
-
-    stm.tm_mday = nextnum();
-    str += 3;
-
-    stm.tm_hour = nextnum();
-    str += 3;
-
-    stm.tm_min = nextnum();
-    str += 3;
-
-    stm.tm_sec = nextnum();
-    str += 2;
-
-    stm.tm_isdst = -1;
-
-    // This is hokey but it's the recommended way ...
-    char *tz = getenv("TZ");
-    snprintf(putenvBufG, sizeof(putenvBufG), "TZ=UTC");
-    putenv(putenvBufG);
-
-    int64_t ret = mktime(&stm);
-
-    snprintf(putenvBufG, sizeof(putenvBufG), "TZ=%s", tz ? tz : "");
-    putenv(putenvBufG);
-
-    // Skip the millis
-
-    if (*str == '.') {
-        str++;
-        while (isdigit(*str)) {
-            str++;
-        }
-    }
-    
-    if (checkString(str, "-dd:dd") || checkString(str, "+dd:dd")) {
-        int sign = (*str++ == '-') ? -1 : 1;
-        int hours = nextnum();
-        str += 3;
-        int minutes = nextnum();
-        ret += (-sign * (((hours * 60) + minutes) * 60));
-    }
-    // Else it should be Z to be a conformant time string, but we just assume
-    // that it is rather than enforcing that
-
-    return ret;
-}
-
-
-// Simple ACL format:  Lines of this format:
-// Type - ignored
-// Starting with a dash - ignored
-// Email email_address permission
-// UserID user_id (display_name) permission
-// Group Authenticated AWS Users permission
-// Group All Users  permission
-// permission is one of READ, WRITE, READ_ACP, WRITE_ACP, FULL_CONTROL
-static int convert_simple_acl(char *aclXml, char *ownerId,
-                              char *ownerDisplayName,
-                              int *aclGrantCountReturn,
-                              S3AclGrant *aclGrants)
-{
-    *aclGrantCountReturn = 0;
-    *ownerId = 0;
-    *ownerDisplayName = 0;
-
-#define SKIP_SPACE(require_more)                \
-    do {                                        \
-        while (isspace(*aclXml)) {              \
-            aclXml++;                           \
-        }                                       \
-        if (require_more && !*aclXml) {         \
-            return 0;                           \
-        }                                       \
-    } while (0)
-    
-#define COPY_STRING_MAXLEN(field, maxlen)               \
-    do {                                                \
-        SKIP_SPACE(1);                                  \
-        int len = 0;                                    \
-        while ((len < maxlen) && !isspace(*aclXml)) {   \
-            field[len++] = *aclXml++;                   \
-        }                                               \
-        field[len] = 0;                                 \
-    } while (0)
-
-#define COPY_STRING(field)                              \
-    COPY_STRING_MAXLEN(field, (int) (sizeof(field) - 1))
-
-    while (1) {
-        SKIP_SPACE(0);
-
-        if (!*aclXml) {
-            break;
-        }
-        
-        // Skip Type lines and dash lines
-        if (!strncmp(aclXml, "Type", sizeof("Type") - 1) ||
-            (*aclXml == '-')) {
-            while (*aclXml && ((*aclXml != '\n') && (*aclXml != '\r'))) {
-                aclXml++;
-            }
-            continue;
-        }
-        
-        if (!strncmp(aclXml, "OwnerID", sizeof("OwnerID") - 1)) {
-            aclXml += sizeof("OwnerID") - 1;
-            COPY_STRING_MAXLEN(ownerId, S3_MAX_GRANTEE_USER_ID_SIZE);
-            SKIP_SPACE(1);
-            COPY_STRING_MAXLEN(ownerDisplayName,
-                               S3_MAX_GRANTEE_DISPLAY_NAME_SIZE);
-            continue;
-        }
-
-        if (*aclGrantCountReturn == S3_MAX_ACL_GRANT_COUNT) {
-            return 0;
-        }
-
-        S3AclGrant *grant = &(aclGrants[(*aclGrantCountReturn)++]);
-
-        if (!strncmp(aclXml, "Email", sizeof("Email") - 1)) {
-            grant->granteeType = S3GranteeTypeAmazonCustomerByEmail;
-            aclXml += sizeof("Email") - 1;
-            COPY_STRING(grant->grantee.amazonCustomerByEmail.emailAddress);
-        }
-        else if (!strncmp(aclXml, "UserID", sizeof("UserID") - 1)) {
-            grant->granteeType = S3GranteeTypeCanonicalUser;
-            aclXml += sizeof("UserID") - 1;
-            COPY_STRING(grant->grantee.canonicalUser.id);
-            SKIP_SPACE(1);
-            // Now do display name
-            COPY_STRING(grant->grantee.canonicalUser.displayName);
-        }
-        else if (!strncmp(aclXml, "Group", sizeof("Group") - 1)) {
-            aclXml += sizeof("Group") - 1;
-            SKIP_SPACE(1);
-            if (!strncmp(aclXml, "Authenticated AWS Users",
-                         sizeof("Authenticated AWS Users") - 1)) {
-                grant->granteeType = S3GranteeTypeAllAwsUsers;
-                aclXml += (sizeof("Authenticated AWS Users") - 1);
-            }
-            else if (!strncmp(aclXml, "All Users", sizeof("All Users") - 1)) {
-                grant->granteeType = S3GranteeTypeAllUsers;
-                aclXml += (sizeof("All Users") - 1);
-            }
-            else if (!strncmp(aclXml, "Log Delivery", 
-                              sizeof("Log Delivery") - 1)) {
-                grant->granteeType = S3GranteeTypeLogDelivery;
-                aclXml += (sizeof("Log Delivery") - 1);
-            }
-            else {
-                return 0;
-            }
-        }
-        else {
-            return 0;
-        }
-
-        SKIP_SPACE(1);
-        
-        if (!strncmp(aclXml, "READ_ACP", sizeof("READ_ACP") - 1)) {
-            grant->permission = S3PermissionReadACP;
-            aclXml += (sizeof("READ_ACP") - 1);
-        }
-        else if (!strncmp(aclXml, "READ", sizeof("READ") - 1)) {
-            grant->permission = S3PermissionRead;
-            aclXml += (sizeof("READ") - 1);
-        }
-        else if (!strncmp(aclXml, "WRITE_ACP", sizeof("WRITE_ACP") - 1)) {
-            grant->permission = S3PermissionWriteACP;
-            aclXml += (sizeof("WRITE_ACP") - 1);
-        }
-        else if (!strncmp(aclXml, "WRITE", sizeof("WRITE") - 1)) {
-            grant->permission = S3PermissionWrite;
-            aclXml += (sizeof("WRITE") - 1);
-        }
-        else if (!strncmp(aclXml, "FULL_CONTROL", 
-                          sizeof("FULL_CONTROL") - 1)) {
-            grant->permission = S3PermissionFullControl;
-            aclXml += (sizeof("FULL_CONTROL") - 1);
-        }
-    }
-
-    return 1;
-}
-
-static int should_retry(void)
-{
-    if (retriesG--) {
-        // Sleep before next retry; start out with a 1 second sleep
-        static int retrySleepInterval = 1 * SLEEP_UNITS_PER_SECOND;
-        sleep(retrySleepInterval);
-        // Next sleep 1 second longer
-        retrySleepInterval++;
-        return 1;
-    }
-
-    return 0;
-}
-
-
-static struct option longOptionsG[] =
-{
-    { "force",                no_argument,        0,  'f' },
-    { "vhost-style",          no_argument,        0,  'h' },
-    { "unencrypted",          no_argument,        0,  'u' },
-    { "show-properties",      no_argument,        0,  's' },
-    { "retries",              required_argument,  0,  'r' },
-    { 0,                      0,                  0,   0  }
-};
-
-
-// response properties callback ----------------------------------------------
-
-// This callback does the same thing for every request type: prints out the
-// properties if the user has requested them to be so
-static S3Status responsePropertiesCallback
-    (const S3ResponseProperties *properties, void *callbackData)
-{
-    (void) callbackData;
-
-    if (!showResponsePropertiesG) {
-        return S3StatusOK;
-    }
-
-#define print_nonnull(name, field)                                 \
-    do {                                                           \
-        if (properties-> field) {                                  \
-            printf("%s: %s\n", name, properties-> field);          \
-        }                                                          \
-    } while (0)
-    
-    print_nonnull("Content-Type", contentType);
-    print_nonnull("Request-Id", requestId);
-    print_nonnull("Request-Id-2", requestId2);
-    if (properties->contentLength > 0) {
-        printf("Content-Length: %lld\n", 
-               (unsigned long long) properties->contentLength);
-    }
-    print_nonnull("Server", server);
-    print_nonnull("ETag", eTag);
-    if (properties->lastModified > 0) {
-        char timebuf[256];
-        time_t t = (time_t) properties->lastModified;
-        // gmtime is not thread-safe but we don't care here.
-        strftime(timebuf, sizeof(timebuf), "%Y-%m-%dT%H:%M:%SZ", gmtime(&t));
-        printf("Last-Modified: %s\n", timebuf);
-    }
-    int i;
-    for (i = 0; i < properties->metaDataCount; i++) {
-        printf("x-amz-meta-%s: %s\n", properties->metaData[i].name,
-               properties->metaData[i].value);
-    }
-
-    return S3StatusOK;
-}
-
-
-// response complete callback ------------------------------------------------
-
-// This callback does the same thing for every request type: saves the status
-// and error stuff in global variables
-static void responseCompleteCallback(S3Status status,
-                                     const S3ErrorDetails *error, 
-                                     void *callbackData)
-{
-    (void) callbackData;
-
-    statusG = status;
-    // Compose the error details message now, although we might not use it.
-    // Can't just save a pointer to [error] since it's not guaranteed to last
-    // beyond this callback
-    int len = 0;
-    if (error && error->message) {
-        len += snprintf(&(errorDetailsG[len]), sizeof(errorDetailsG) - len,
-                        "  Message: %s\n", error->message);
-    }
-    if (error && error->resource) {
-        len += snprintf(&(errorDetailsG[len]), sizeof(errorDetailsG) - len,
-                        "  Resource: %s\n", error->resource);
-    }
-    if (error && error->furtherDetails) {
-        len += snprintf(&(errorDetailsG[len]), sizeof(errorDetailsG) - len,
-                        "  Further Details: %s\n", error->furtherDetails);
-    }
-    if (error && error->extraDetailsCount) {
-        len += snprintf(&(errorDetailsG[len]), sizeof(errorDetailsG) - len,
-                        "%s", "  Extra Details:\n");
-        int i;
-        for (i = 0; i < error->extraDetailsCount; i++) {
-            len += snprintf(&(errorDetailsG[len]), 
-                            sizeof(errorDetailsG) - len, "    %s: %s\n", 
-                            error->extraDetails[i].name,
-                            error->extraDetails[i].value);
-        }
-    }
-}
-
-
-// list service --------------------------------------------------------------
-
-typedef struct list_service_data
-{
-    int headerPrinted;
-    int allDetails;
-} list_service_data;
-
-
-static void printListServiceHeader(int allDetails)
-{
-    printf("%-56s  %-20s", "                         Bucket",
-           "      Created");
-    if (allDetails) {
-        printf("  %-64s  %-12s", 
-               "                            Owner ID",
-               "Display Name");
-    }
-    printf("\n");
-    printf("--------------------------------------------------------  "
-           "--------------------");
-    if (allDetails) {
-        printf("  -------------------------------------------------"
-               "---------------  ------------");
-    }
-    printf("\n");
-}
-
-
-static S3Status listServiceCallback(const char *ownerId, 
-                                    const char *ownerDisplayName,
-                                    const char *bucketName,
-                                    int64_t creationDate, void *callbackData)
-{
-    list_service_data *data = (list_service_data *) callbackData;
-
-    if (!data->headerPrinted) {
-        data->headerPrinted = 1;
-        printListServiceHeader(data->allDetails);
-    }
-
-    char timebuf[256];
-    if (creationDate >= 0) {
-        time_t t = (time_t) creationDate;
-        strftime(timebuf, sizeof(timebuf), "%Y-%m-%dT%H:%M:%SZ", gmtime(&t));
-    }
-    else {
-        timebuf[0] = 0;
-    }
-
-    printf("%-56s  %-20s", bucketName, timebuf);
-    if (data->allDetails) {
-        printf("  %-64s  %-12s", ownerId ? ownerId : "", 
-               ownerDisplayName ? ownerDisplayName : "");
-    }
-    printf("\n");
-
-    return S3StatusOK;
-}
-
-
-static void list_service(int allDetails)
-{
-    list_service_data data;
-
-    data.headerPrinted = 0;
-    data.allDetails = allDetails;
-
-    S3_init();
-
-    S3ListServiceHandler listServiceHandler =
-    {
-        { &responsePropertiesCallback, &responseCompleteCallback },
-        &listServiceCallback
-    };
-
-    do {
-        S3_list_service(protocolG, accessKeyIdG, secretAccessKeyG, 0, 0, 
-                        &listServiceHandler, &data);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (statusG == S3StatusOK) {
-        if (!data.headerPrinted) {
-            printListServiceHeader(allDetails);
-        }
-    }
-    else {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-// test bucket ---------------------------------------------------------------
-
-static void test_bucket(int argc, char **argv, int optindex)
-{
-    // test bucket
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex++];
-
-    if (optindex != argc) {
-        fprintf(stderr, "\nERROR: Extraneous parameter: %s\n", argv[optindex]);
-        usageExit(stderr);
-    }
-
-    S3_init();
-
-    S3ResponseHandler responseHandler =
-    {
-        &responsePropertiesCallback, &responseCompleteCallback
-    };
-
-    char locationConstraint[64];
-    do {
-        S3_test_bucket(protocolG, uriStyleG, accessKeyIdG, secretAccessKeyG,
-                       0, bucketName, sizeof(locationConstraint),
-                       locationConstraint, 0, &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    const char *result;
-
-    switch (statusG) {
-    case S3StatusOK:
-        // bucket exists
-        result = locationConstraint[0] ? locationConstraint : "USA";
-        break;
-    case S3StatusErrorNoSuchBucket:
-        result = "Does Not Exist";
-        break;
-    case S3StatusErrorAccessDenied:
-        result = "Access Denied";
-        break;
-    default:
-        result = 0;
-        break;
-    }
-
-    if (result) {
-        printf("%-56s  %-20s\n", "                         Bucket",
-               "       Status");
-        printf("--------------------------------------------------------  "
-               "--------------------\n");
-        printf("%-56s  %-20s\n", bucketName, result);
-    }
-    else {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-// create bucket -------------------------------------------------------------
-
-static void create_bucket(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex++];
-
-    if (!forceG && (S3_validate_bucket_name
-                    (bucketName, S3UriStyleVirtualHost) != S3StatusOK)) {
-        fprintf(stderr, "\nWARNING: Bucket name is not valid for "
-                "virtual-host style URI access.\n");
-        fprintf(stderr, "Bucket not created.  Use -f option to force the "
-                "bucket to be created despite\n");
-        fprintf(stderr, "this warning.\n\n");
-        exit(-1);
-    }
-
-    const char *locationConstraint = 0;
-    S3CannedAcl cannedAcl = S3CannedAclPrivate;
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, LOCATION_PREFIX, LOCATION_PREFIX_LEN)) {
-            locationConstraint = &(param[LOCATION_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, CANNED_ACL_PREFIX, CANNED_ACL_PREFIX_LEN)) {
-            char *val = &(param[CANNED_ACL_PREFIX_LEN]);
-            if (!strcmp(val, "private")) {
-                cannedAcl = S3CannedAclPrivate;
-            }
-            else if (!strcmp(val, "public-read")) {
-                cannedAcl = S3CannedAclPublicRead;
-            }
-            else if (!strcmp(val, "public-read-write")) {
-                cannedAcl = S3CannedAclPublicReadWrite;
-            }
-            else if (!strcmp(val, "authenticated-read")) {
-                cannedAcl = S3CannedAclAuthenticatedRead;
-            }
-            else {
-                fprintf(stderr, "\nERROR: Unknown canned ACL: %s\n", val);
-                usageExit(stderr);
-            }
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    S3_init();
-
-    S3ResponseHandler responseHandler =
-    {
-        &responsePropertiesCallback, &responseCompleteCallback
-    };
-
-    do {
-        S3_create_bucket(protocolG, accessKeyIdG, secretAccessKeyG,
-                         0, bucketName, cannedAcl, locationConstraint, 0,
-                         &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (statusG == S3StatusOK) {
-        printf("Bucket successfully created.\n");
-    }
-    else {
-        printError();
-    }
-    
-    S3_deinitialize();
-}
-
-
-// delete bucket -------------------------------------------------------------
-
-static void delete_bucket(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex++];
-
-    if (optindex != argc) {
-        fprintf(stderr, "\nERROR: Extraneous parameter: %s\n", argv[optindex]);
-        usageExit(stderr);
-    }
-
-    S3_init();
-
-    S3ResponseHandler responseHandler =
-    {
-        &responsePropertiesCallback, &responseCompleteCallback
-    };
-
-    do {
-        S3_delete_bucket(protocolG, uriStyleG, accessKeyIdG, secretAccessKeyG,
-                         0, bucketName, 0, &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (statusG != S3StatusOK) {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-// list bucket ---------------------------------------------------------------
-
-typedef struct list_bucket_callback_data
-{
-    int isTruncated;
-    char nextMarker[1024];
-    int keyCount;
-    int allDetails;
-} list_bucket_callback_data;
-
-
-static void printListBucketHeader(int allDetails)
-{
-    printf("%-50s  %-20s  %-5s", 
-           "                       Key", 
-           "   Last Modified", "Size");
-    if (allDetails) {
-        printf("  %-34s  %-64s  %-12s", 
-               "               ETag", 
-               "                            Owner ID",
-               "Display Name");
-    }
-    printf("\n");
-    printf("--------------------------------------------------  "
-           "--------------------  -----");
-    if (allDetails) {
-        printf("  ----------------------------------  "
-               "-------------------------------------------------"
-               "---------------  ------------");
-    }
-    printf("\n");
-}
-
-
-static S3Status listBucketCallback(int isTruncated, const char *nextMarker,
-                                   int contentsCount, 
-                                   const S3ListBucketContent *contents,
-                                   int commonPrefixesCount,
-                                   const char **commonPrefixes,
-                                   void *callbackData)
-{
-    list_bucket_callback_data *data = 
-        (list_bucket_callback_data *) callbackData;
-
-    data->isTruncated = isTruncated;
-    // This is tricky.  S3 doesn't return the NextMarker if there is no
-    // delimiter.  Why, I don't know, since it's still useful for paging
-    // through results.  We want NextMarker to be the last content in the
-    // list, so set it to that if necessary.
-    if ((!nextMarker || !nextMarker[0]) && contentsCount) {
-        nextMarker = contents[contentsCount - 1].key;
-    }
-    if (nextMarker) {
-        snprintf(data->nextMarker, sizeof(data->nextMarker), "%s", 
-                 nextMarker);
-    }
-    else {
-        data->nextMarker[0] = 0;
-    }
-    
-    if (contentsCount && !data->keyCount) {
-        printListBucketHeader(data->allDetails);
-    }
-
-    int i;
-    for (i = 0; i < contentsCount; i++) {
-        const S3ListBucketContent *content = &(contents[i]);
-        char timebuf[256];
-        if (0) {
-            time_t t = (time_t) content->lastModified;
-            strftime(timebuf, sizeof(timebuf), "%Y-%m-%dT%H:%M:%SZ",
-                     gmtime(&t));
-            printf("\nKey: %s\n", content->key);
-            printf("Last Modified: %s\n", timebuf);
-            printf("ETag: %s\n", content->eTag);
-            printf("Size: %llu\n", (unsigned long long) content->size);
-            if (content->ownerId) {
-                printf("Owner ID: %s\n", content->ownerId);
-            }
-            if (content->ownerDisplayName) {
-                printf("Owner Display Name: %s\n", content->ownerDisplayName);
-            }
-        }
-        else {
-            time_t t = (time_t) content->lastModified;
-            strftime(timebuf, sizeof(timebuf), "%Y-%m-%dT%H:%M:%SZ", 
-                     gmtime(&t));
-            char sizebuf[16];
-            if (content->size < 100000) {
-                sprintf(sizebuf, "%5llu", (unsigned long long) content->size);
-            }
-            else if (content->size < (1024 * 1024)) {
-                sprintf(sizebuf, "%4lluK", 
-                        ((unsigned long long) content->size) / 1024ULL);
-            }
-            else if (content->size < (10 * 1024 * 1024)) {
-                float f = content->size;
-                f /= (1024 * 1024);
-                sprintf(sizebuf, "%1.2fM", f);
-            }
-            else if (content->size < (1024 * 1024 * 1024)) {
-                sprintf(sizebuf, "%4lluM", 
-                        ((unsigned long long) content->size) / 
-                        (1024ULL * 1024ULL));
-            }
-            else {
-                float f = (content->size / 1024);
-                f /= (1024 * 1024);
-                sprintf(sizebuf, "%1.2fG", f);
-            }
-            printf("%-50s  %s  %s", content->key, timebuf, sizebuf);
-            if (data->allDetails) {
-                printf("  %-34s  %-64s  %-12s",
-                       content->eTag, 
-                       content->ownerId ? content->ownerId : "",
-                       content->ownerDisplayName ? 
-                       content->ownerDisplayName : "");
-            }
-            printf("\n");
-        }
-    }
-
-    data->keyCount += contentsCount;
-
-    for (i = 0; i < commonPrefixesCount; i++) {
-        printf("\nCommon Prefix: %s\n", commonPrefixes[i]);
-    }
-
-    return S3StatusOK;
-}
-
-
-static void list_bucket(const char *bucketName, const char *prefix,
-                        const char *marker, const char *delimiter,
-                        int maxkeys, int allDetails)
-{
-    S3_init();
-    
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3ListBucketHandler listBucketHandler =
-    {
-        { &responsePropertiesCallback, &responseCompleteCallback },
-        &listBucketCallback
-    };
-
-    list_bucket_callback_data data;
-
-    snprintf(data.nextMarker, sizeof(data.nextMarker), "%s", marker);
-    data.keyCount = 0;
-    data.allDetails = allDetails;
-
-    do {
-        data.isTruncated = 0;
-        do {
-            S3_list_bucket(&bucketContext, prefix, data.nextMarker,
-                           delimiter, maxkeys, 0, &listBucketHandler, &data);
-        } while (S3_status_is_retryable(statusG) && should_retry());
-        if (statusG != S3StatusOK) {
-            break;
-        }
-    } while (data.isTruncated && (!maxkeys || (data.keyCount < maxkeys)));
-
-    if (statusG == S3StatusOK) {
-        if (!data.keyCount) {
-            printListBucketHeader(allDetails);
-        }
-    }
-    else {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-static void list(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        list_service(0);
-        return;
-    }
-
-    const char *bucketName = 0;
-
-    const char *prefix = 0, *marker = 0, *delimiter = 0;
-    int maxkeys = 0, allDetails = 0;
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, PREFIX_PREFIX, PREFIX_PREFIX_LEN)) {
-            prefix = &(param[PREFIX_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, MARKER_PREFIX, MARKER_PREFIX_LEN)) {
-            marker = &(param[MARKER_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, DELIMITER_PREFIX, DELIMITER_PREFIX_LEN)) {
-            delimiter = &(param[DELIMITER_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, MAXKEYS_PREFIX, MAXKEYS_PREFIX_LEN)) {
-            maxkeys = convertInt(&(param[MAXKEYS_PREFIX_LEN]), "maxkeys");
-        }
-        else if (!strncmp(param, ALL_DETAILS_PREFIX,
-                          ALL_DETAILS_PREFIX_LEN)) {
-            const char *ad = &(param[ALL_DETAILS_PREFIX_LEN]);
-            if (!strcmp(ad, "true") || !strcmp(ad, "TRUE") || 
-                !strcmp(ad, "yes") || !strcmp(ad, "YES") ||
-                !strcmp(ad, "1")) {
-                allDetails = 1;
-            }
-        }
-        else if (!bucketName) {
-            bucketName = param;
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    if (bucketName) {
-        list_bucket(bucketName, prefix, marker, delimiter, maxkeys, 
-                    allDetails);
-    }
-    else {
-        list_service(allDetails);
-    }
-}
-
-    
-
-// delete object -------------------------------------------------------------
-
-static void delete_object(int argc, char **argv, int optindex)
-{
-    (void) argc;
-
-    // Split bucket/key
-    char *slash = argv[optindex];
-
-    // We know there is a slash in there, put_object is only called if so
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    *slash++ = 0;
-
-    const char *bucketName = argv[optindex++];
-    const char *key = slash;
-
-    S3_init();
-    
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3ResponseHandler responseHandler =
-    { 
-        0,
-        &responseCompleteCallback
-    };
-
-    do {
-        S3_delete_object(&bucketContext, key, 0, &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if ((statusG != S3StatusOK) &&
-        (statusG != S3StatusErrorPreconditionFailed)) {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-// put object ----------------------------------------------------------------
-
-typedef struct put_object_callback_data
-{
-    FILE *infile;
-    growbuffer *gb;
-    uint64_t contentLength, originalContentLength;
-    int noStatus;
-} put_object_callback_data;
-
-
-static int putObjectDataCallback(int bufferSize, char *buffer,
-                                 void *callbackData)
-{
-    put_object_callback_data *data = 
-        (put_object_callback_data *) callbackData;
-    
-    int ret = 0;
-
-    if (data->contentLength) {
-        int toRead = ((data->contentLength > (unsigned) bufferSize) ?
-                      (unsigned) bufferSize : data->contentLength);
-        if (data->gb) {
-            growbuffer_read(&(data->gb), toRead, &ret, buffer);
-        }
-        else if (data->infile) {
-            ret = fread(buffer, 1, toRead, data->infile);
-        }
-    }
-
-    data->contentLength -= ret;
-
-    if (data->contentLength && !data->noStatus) {
-        // Avoid a weird bug in MingW, which won't print the second integer
-        // value properly when it's in the same call, so print separately
-        printf("%llu bytes remaining ", 
-               (unsigned long long) data->contentLength);
-        printf("(%d%% complete) ...\n",
-               (int) (((data->originalContentLength - 
-                        data->contentLength) * 100) /
-                      data->originalContentLength));
-    }
-
-    return ret;
-}
-
-
-static void put_object(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket/key\n");
-        usageExit(stderr);
-    }
-
-    // Split bucket/key
-    char *slash = argv[optindex];
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (!*slash || !*(slash + 1)) {
-        fprintf(stderr, "\nERROR: Invalid bucket/key name: %s\n",
-                argv[optindex]);
-        usageExit(stderr);
-    }
-    *slash++ = 0;
-
-    const char *bucketName = argv[optindex++];
-    const char *key = slash;
-
-    const char *filename = 0;
-    uint64_t contentLength = 0;
-    const char *cacheControl = 0, *contentType = 0, *md5 = 0;
-    const char *contentDispositionFilename = 0, *contentEncoding = 0;
-    int64_t expires = -1;
-    S3CannedAcl cannedAcl = S3CannedAclPrivate;
-    int metaPropertiesCount = 0;
-    S3NameValue metaProperties[S3_MAX_METADATA_COUNT];
-    int noStatus = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, FILENAME_PREFIX, FILENAME_PREFIX_LEN)) {
-            filename = &(param[FILENAME_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, CONTENT_LENGTH_PREFIX, 
-                          CONTENT_LENGTH_PREFIX_LEN)) {
-            contentLength = convertInt(&(param[CONTENT_LENGTH_PREFIX_LEN]),
-                                       "contentLength");
-            if (contentLength > (5LL * 1024 * 1024 * 1024)) {
-                fprintf(stderr, "\nERROR: contentLength must be no greater "
-                        "than 5 GB\n");
-                usageExit(stderr);
-            }
-        }
-        else if (!strncmp(param, CACHE_CONTROL_PREFIX, 
-                          CACHE_CONTROL_PREFIX_LEN)) {
-            cacheControl = &(param[CACHE_CONTROL_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, CONTENT_TYPE_PREFIX, 
-                          CONTENT_TYPE_PREFIX_LEN)) {
-            contentType = &(param[CONTENT_TYPE_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, MD5_PREFIX, MD5_PREFIX_LEN)) {
-            md5 = &(param[MD5_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, CONTENT_DISPOSITION_FILENAME_PREFIX, 
-                          CONTENT_DISPOSITION_FILENAME_PREFIX_LEN)) {
-            contentDispositionFilename = 
-                &(param[CONTENT_DISPOSITION_FILENAME_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, CONTENT_ENCODING_PREFIX, 
-                          CONTENT_ENCODING_PREFIX_LEN)) {
-            contentEncoding = &(param[CONTENT_ENCODING_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, EXPIRES_PREFIX, EXPIRES_PREFIX_LEN)) {
-            expires = parseIso8601Time(&(param[EXPIRES_PREFIX_LEN]));
-            if (expires < 0) {
-                fprintf(stderr, "\nERROR: Invalid expires time "
-                        "value; ISO 8601 time format required\n");
-                usageExit(stderr);
-            }
-        }
-        else if (!strncmp(param, X_AMZ_META_PREFIX, X_AMZ_META_PREFIX_LEN)) {
-            if (metaPropertiesCount == S3_MAX_METADATA_COUNT) {
-                fprintf(stderr, "\nERROR: Too many x-amz-meta- properties, "
-                        "limit %lu: %s\n", 
-                        (unsigned long) S3_MAX_METADATA_COUNT, param);
-                usageExit(stderr);
-            }
-            char *name = &(param[X_AMZ_META_PREFIX_LEN]);
-            char *value = name;
-            while (*value && (*value != '=')) {
-                value++;
-            }
-            if (!*value || !*(value + 1)) {
-                fprintf(stderr, "\nERROR: Invalid parameter: %s\n", param);
-                usageExit(stderr);
-            }
-            *value++ = 0;
-            metaProperties[metaPropertiesCount].name = name;
-            metaProperties[metaPropertiesCount++].value = value;
-        }
-        else if (!strncmp(param, CANNED_ACL_PREFIX, CANNED_ACL_PREFIX_LEN)) {
-            char *val = &(param[CANNED_ACL_PREFIX_LEN]);
-            if (!strcmp(val, "private")) {
-                cannedAcl = S3CannedAclPrivate;
-            }
-            else if (!strcmp(val, "public-read")) {
-                cannedAcl = S3CannedAclPublicRead;
-            }
-            else if (!strcmp(val, "public-read-write")) {
-                cannedAcl = S3CannedAclPublicReadWrite;
-            }
-            else if (!strcmp(val, "authenticated-read")) {
-                cannedAcl = S3CannedAclAuthenticatedRead;
-            }
-            else {
-                fprintf(stderr, "\nERROR: Unknown canned ACL: %s\n", val);
-                usageExit(stderr);
-            }
-        }
-        else if (!strncmp(param, NO_STATUS_PREFIX, NO_STATUS_PREFIX_LEN)) {
-            const char *ns = &(param[NO_STATUS_PREFIX_LEN]);
-            if (!strcmp(ns, "true") || !strcmp(ns, "TRUE") || 
-                !strcmp(ns, "yes") || !strcmp(ns, "YES") ||
-                !strcmp(ns, "1")) {
-                noStatus = 1;
-            }
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    put_object_callback_data data;
-
-    data.infile = 0;
-    data.gb = 0;
-    data.noStatus = noStatus;
-
-    if (filename) {
-        if (!contentLength) {
-            struct stat statbuf;
-            // Stat the file to get its length
-            if (stat(filename, &statbuf) == -1) {
-                fprintf(stderr, "\nERROR: Failed to stat file %s: ",
-                        filename);
-                perror(0);
-                exit(-1);
-            }
-            contentLength = statbuf.st_size;
-        }
-        // Open the file
-        if (!(data.infile = fopen(filename, "r" FOPEN_EXTRA_FLAGS))) {
-            fprintf(stderr, "\nERROR: Failed to open input file %s: ",
-                    filename);
-            perror(0);
-            exit(-1);
-        }
-    }
-    else {
-        // Read from stdin.  If contentLength is not provided, we have
-        // to read it all in to get contentLength.
-        if (!contentLength) {
-            // Read all if stdin to get the data
-            char buffer[64 * 1024];
-            while (1) {
-                int amtRead = fread(buffer, 1, sizeof(buffer), stdin);
-                if (amtRead == 0) {
-                    break;
-                }
-                if (!growbuffer_append(&(data.gb), buffer, amtRead)) {
-                    fprintf(stderr, "\nERROR: Out of memory while reading "
-                            "stdin\n");
-                    exit(-1);
-                }
-                contentLength += amtRead;
-                if (amtRead < (int) sizeof(buffer)) {
-                    break;
-                }
-            }
-        }
-        else {
-            data.infile = stdin;
-        }
-    }
-
-    data.contentLength = data.originalContentLength = contentLength;
-
-    S3_init();
-    
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3PutProperties putProperties =
-    {
-        contentType,
-        md5,
-        cacheControl,
-        contentDispositionFilename,
-        contentEncoding,
-        expires,
-        cannedAcl,
-        metaPropertiesCount,
-        metaProperties
-    };
-
-    S3PutObjectHandler putObjectHandler =
-    {
-        { &responsePropertiesCallback, &responseCompleteCallback },
-        &putObjectDataCallback
-    };
-
-    do {
-        S3_put_object(&bucketContext, key, contentLength, &putProperties, 0,
-                      &putObjectHandler, &data);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (data.infile) {
-        fclose(data.infile);
-    }
-    else if (data.gb) {
-        growbuffer_destroy(data.gb);
-    }
-
-    if (statusG != S3StatusOK) {
-        printError();
-    }
-    else if (data.contentLength) {
-        fprintf(stderr, "\nERROR: Failed to read remaining %llu bytes from "
-                "input\n", (unsigned long long) data.contentLength);
-    }
-
-    S3_deinitialize();
-}
-
-
-// copy object ---------------------------------------------------------------
-
-static void copy_object(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: source bucket/key\n");
-        usageExit(stderr);
-    }
-
-    // Split bucket/key
-    char *slash = argv[optindex];
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (!*slash || !*(slash + 1)) {
-        fprintf(stderr, "\nERROR: Invalid source bucket/key name: %s\n",
-                argv[optindex]);
-        usageExit(stderr);
-    }
-    *slash++ = 0;
-
-    const char *sourceBucketName = argv[optindex++];
-    const char *sourceKey = slash;
-
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: "
-                "destination bucket/key\n");
-        usageExit(stderr);
-    }
-
-    // Split bucket/key
-    slash = argv[optindex];
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (!*slash || !*(slash + 1)) {
-        fprintf(stderr, "\nERROR: Invalid destination bucket/key name: %s\n",
-                argv[optindex]);
-        usageExit(stderr);
-    }
-    *slash++ = 0;
-
-    const char *destinationBucketName = argv[optindex++];
-    const char *destinationKey = slash;
-
-    const char *cacheControl = 0, *contentType = 0;
-    const char *contentDispositionFilename = 0, *contentEncoding = 0;
-    int64_t expires = -1;
-    S3CannedAcl cannedAcl = S3CannedAclPrivate;
-    int metaPropertiesCount = 0;
-    S3NameValue metaProperties[S3_MAX_METADATA_COUNT];
-    int anyPropertiesSet = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, CACHE_CONTROL_PREFIX, 
-                          CACHE_CONTROL_PREFIX_LEN)) {
-            cacheControl = &(param[CACHE_CONTROL_PREFIX_LEN]);
-            anyPropertiesSet = 1;
-        }
-        else if (!strncmp(param, CONTENT_TYPE_PREFIX, 
-                          CONTENT_TYPE_PREFIX_LEN)) {
-            contentType = &(param[CONTENT_TYPE_PREFIX_LEN]);
-            anyPropertiesSet = 1;
-        }
-        else if (!strncmp(param, CONTENT_DISPOSITION_FILENAME_PREFIX, 
-                          CONTENT_DISPOSITION_FILENAME_PREFIX_LEN)) {
-            contentDispositionFilename = 
-                &(param[CONTENT_DISPOSITION_FILENAME_PREFIX_LEN]);
-            anyPropertiesSet = 1;
-        }
-        else if (!strncmp(param, CONTENT_ENCODING_PREFIX, 
-                          CONTENT_ENCODING_PREFIX_LEN)) {
-            contentEncoding = &(param[CONTENT_ENCODING_PREFIX_LEN]);
-            anyPropertiesSet = 1;
-        }
-        else if (!strncmp(param, EXPIRES_PREFIX, EXPIRES_PREFIX_LEN)) {
-            expires = parseIso8601Time(&(param[EXPIRES_PREFIX_LEN]));
-            if (expires < 0) {
-                fprintf(stderr, "\nERROR: Invalid expires time "
-                        "value; ISO 8601 time format required\n");
-                usageExit(stderr);
-            }
-            anyPropertiesSet = 1;
-        }
-        else if (!strncmp(param, X_AMZ_META_PREFIX, X_AMZ_META_PREFIX_LEN)) {
-            if (metaPropertiesCount == S3_MAX_METADATA_COUNT) {
-                fprintf(stderr, "\nERROR: Too many x-amz-meta- properties, "
-                        "limit %lu: %s\n", 
-                        (unsigned long) S3_MAX_METADATA_COUNT, param);
-                usageExit(stderr);
-            }
-            char *name = &(param[X_AMZ_META_PREFIX_LEN]);
-            char *value = name;
-            while (*value && (*value != '=')) {
-                value++;
-            }
-            if (!*value || !*(value + 1)) {
-                fprintf(stderr, "\nERROR: Invalid parameter: %s\n", param);
-                usageExit(stderr);
-            }
-            *value++ = 0;
-            metaProperties[metaPropertiesCount].name = name;
-            metaProperties[metaPropertiesCount++].value = value;
-            anyPropertiesSet = 1;
-        }
-        else if (!strncmp(param, CANNED_ACL_PREFIX, CANNED_ACL_PREFIX_LEN)) {
-            char *val = &(param[CANNED_ACL_PREFIX_LEN]);
-            if (!strcmp(val, "private")) {
-                cannedAcl = S3CannedAclPrivate;
-            }
-            else if (!strcmp(val, "public-read")) {
-                cannedAcl = S3CannedAclPublicRead;
-            }
-            else if (!strcmp(val, "public-read-write")) {
-                cannedAcl = S3CannedAclPublicReadWrite;
-            }
-            else if (!strcmp(val, "authenticated-read")) {
-                cannedAcl = S3CannedAclAuthenticatedRead;
-            }
-            else {
-                fprintf(stderr, "\nERROR: Unknown canned ACL: %s\n", val);
-                usageExit(stderr);
-            }
-            anyPropertiesSet = 1;
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    S3_init();
-    
-    S3BucketContext bucketContext =
-    {
-        0,
-        sourceBucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3PutProperties putProperties =
-    {
-        contentType,
-        0,
-        cacheControl,
-        contentDispositionFilename,
-        contentEncoding,
-        expires,
-        cannedAcl,
-        metaPropertiesCount,
-        metaProperties
-    };
-
-    S3ResponseHandler responseHandler =
-    { 
-        &responsePropertiesCallback,
-        &responseCompleteCallback
-    };
-
-    int64_t lastModified;
-    char eTag[256];
-
-    do {
-        S3_copy_object(&bucketContext, sourceKey, destinationBucketName,
-                       destinationKey, anyPropertiesSet ? &putProperties : 0,
-                       &lastModified, sizeof(eTag), eTag, 0,
-                       &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (statusG == S3StatusOK) {
-        if (lastModified >= 0) {
-            char timebuf[256];
-            time_t t = (time_t) lastModified;
-            strftime(timebuf, sizeof(timebuf), "%Y-%m-%dT%H:%M:%SZ",
-                     gmtime(&t));
-            printf("Last-Modified: %s\n", timebuf);
-        }
-        if (eTag[0]) {
-            printf("ETag: %s\n", eTag);
-        }
-    }
-    else {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-// get object ----------------------------------------------------------------
-
-static S3Status getObjectDataCallback(int bufferSize, const char *buffer,
-                                      void *callbackData)
-{
-    FILE *outfile = (FILE *) callbackData;
-
-    size_t wrote = fwrite(buffer, 1, bufferSize, outfile);
-    
-    return ((wrote < (size_t) bufferSize) ? 
-            S3StatusAbortedByCallback : S3StatusOK);
-}
-
-
-static void get_object(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket/key\n");
-        usageExit(stderr);
-    }
-
-    // Split bucket/key
-    char *slash = argv[optindex];
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (!*slash || !*(slash + 1)) {
-        fprintf(stderr, "\nERROR: Invalid bucket/key name: %s\n",
-                argv[optindex]);
-        usageExit(stderr);
-    }
-    *slash++ = 0;
-
-    const char *bucketName = argv[optindex++];
-    const char *key = slash;
-
-    const char *filename = 0;
-    int64_t ifModifiedSince = -1, ifNotModifiedSince = -1;
-    const char *ifMatch = 0, *ifNotMatch = 0;
-    uint64_t startByte = 0, byteCount = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, FILENAME_PREFIX, FILENAME_PREFIX_LEN)) {
-            filename = &(param[FILENAME_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, IF_MODIFIED_SINCE_PREFIX, 
-                     IF_MODIFIED_SINCE_PREFIX_LEN)) {
-            // Parse ifModifiedSince
-            ifModifiedSince = parseIso8601Time
-                (&(param[IF_MODIFIED_SINCE_PREFIX_LEN]));
-            if (ifModifiedSince < 0) {
-                fprintf(stderr, "\nERROR: Invalid ifModifiedSince time "
-                        "value; ISO 8601 time format required\n");
-                usageExit(stderr);
-            }
-        }
-        else if (!strncmp(param, IF_NOT_MODIFIED_SINCE_PREFIX, 
-                          IF_NOT_MODIFIED_SINCE_PREFIX_LEN)) {
-            // Parse ifModifiedSince
-            ifNotModifiedSince = parseIso8601Time
-                (&(param[IF_NOT_MODIFIED_SINCE_PREFIX_LEN]));
-            if (ifNotModifiedSince < 0) {
-                fprintf(stderr, "\nERROR: Invalid ifNotModifiedSince time "
-                        "value; ISO 8601 time format required\n");
-                usageExit(stderr);
-            }
-        }
-        else if (!strncmp(param, IF_MATCH_PREFIX, IF_MATCH_PREFIX_LEN)) {
-            ifMatch = &(param[IF_MATCH_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, IF_NOT_MATCH_PREFIX,
-                          IF_NOT_MATCH_PREFIX_LEN)) {
-            ifNotMatch = &(param[IF_NOT_MATCH_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, START_BYTE_PREFIX, START_BYTE_PREFIX_LEN)) {
-            startByte = convertInt
-                (&(param[START_BYTE_PREFIX_LEN]), "startByte");
-        }
-        else if (!strncmp(param, BYTE_COUNT_PREFIX, BYTE_COUNT_PREFIX_LEN)) {
-            byteCount = convertInt
-                (&(param[BYTE_COUNT_PREFIX_LEN]), "byteCount");
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    FILE *outfile = 0;
-
-    if (filename) {
-        // Stat the file, and if it doesn't exist, open it in w mode
-        struct stat buf;
-        if (stat(filename, &buf) == -1) {
-            outfile = fopen(filename, "w" FOPEN_EXTRA_FLAGS);
-        }
-        else {
-            // Open in r+ so that we don't truncate the file, just in case
-            // there is an error and we write no bytes, we leave the file
-            // unmodified
-            outfile = fopen(filename, "r+" FOPEN_EXTRA_FLAGS);
-        }
-        
-        if (!outfile) {
-            fprintf(stderr, "\nERROR: Failed to open output file %s: ",
-                    filename);
-            perror(0);
-            exit(-1);
-        }
-    }
-    else if (showResponsePropertiesG) {
-        fprintf(stderr, "\nERROR: get -s requires a filename parameter\n");
-        usageExit(stderr);
-    }
-    else {
-        outfile = stdout;
-    }
-
-    S3_init();
-    
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3GetConditions getConditions =
-    {
-        ifModifiedSince,
-        ifNotModifiedSince,
-        ifMatch,
-        ifNotMatch
-    };
-
-    S3GetObjectHandler getObjectHandler =
-    {
-        { &responsePropertiesCallback, &responseCompleteCallback },
-        &getObjectDataCallback
-    };
-
-    do {
-        S3_get_object(&bucketContext, key, &getConditions, startByte,
-                      byteCount, 0, &getObjectHandler, outfile);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (statusG != S3StatusOK) {
-        printError();
-    }
-
-    fclose(outfile);
-
-    S3_deinitialize();
-}
-
-
-// head object ---------------------------------------------------------------
-
-static void head_object(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket/key\n");
-        usageExit(stderr);
-    }
-    
-    // Head implies showing response properties
-    showResponsePropertiesG = 1;
-
-    // Split bucket/key
-    char *slash = argv[optindex];
-
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (!*slash || !*(slash + 1)) {
-        fprintf(stderr, "\nERROR: Invalid bucket/key name: %s\n",
-                argv[optindex]);
-        usageExit(stderr);
-    }
-    *slash++ = 0;
-
-    const char *bucketName = argv[optindex++];
-    const char *key = slash;
-
-    if (optindex != argc) {
-        fprintf(stderr, "\nERROR: Extraneous parameter: %s\n", argv[optindex]);
-        usageExit(stderr);
-    }
-
-    S3_init();
-    
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3ResponseHandler responseHandler =
-    { 
-        &responsePropertiesCallback,
-        &responseCompleteCallback
-    };
-
-    do {
-        S3_head_object(&bucketContext, key, 0, &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if ((statusG != S3StatusOK) &&
-        (statusG != S3StatusErrorPreconditionFailed)) {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-// generate query string ------------------------------------------------------
-
-static void generate_query_string(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket[/key]\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex];
-    const char *key = 0;
-
-    // Split bucket/key
-    char *slash = argv[optindex++];
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (*slash) {
-        *slash++ = 0;
-        key = slash;
-    }
-    else {
-        key = 0;
-    }
-
-    int64_t expires = -1;
-
-    const char *resource = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, EXPIRES_PREFIX, EXPIRES_PREFIX_LEN)) {
-            expires = parseIso8601Time(&(param[EXPIRES_PREFIX_LEN]));
-            if (expires < 0) {
-                fprintf(stderr, "\nERROR: Invalid expires time "
-                        "value; ISO 8601 time format required\n");
-                usageExit(stderr);
-            }
-        }
-        else if (!strncmp(param, RESOURCE_PREFIX, RESOURCE_PREFIX_LEN)) {
-            resource = &(param[RESOURCE_PREFIX_LEN]);
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    S3_init();
-    
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    char buffer[S3_MAX_AUTHENTICATED_QUERY_STRING_SIZE];
-
-    S3Status status = S3_generate_authenticated_query_string
-        (buffer, &bucketContext, key, expires, resource);
-    
-    if (status != S3StatusOK) {
-        printf("Failed to generate authenticated query string: %s\n",
-               S3_get_status_name(status));
-    }
-    else {
-        printf("%s\n", buffer);
-    }
-
-    S3_deinitialize();
-}
-
-
-// get acl -------------------------------------------------------------------
-
-void get_acl(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket[/key]\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex];
-    const char *key = 0;
-
-    // Split bucket/key
-    char *slash = argv[optindex++];
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (*slash) {
-        *slash++ = 0;
-        key = slash;
-    }
-    else {
-        key = 0;
-    }
-
-    const char *filename = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, FILENAME_PREFIX, FILENAME_PREFIX_LEN)) {
-            filename = &(param[FILENAME_PREFIX_LEN]);
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    FILE *outfile = 0;
-
-    if (filename) {
-        // Stat the file, and if it doesn't exist, open it in w mode
-        struct stat buf;
-        if (stat(filename, &buf) == -1) {
-            outfile = fopen(filename, "w" FOPEN_EXTRA_FLAGS);
-        }
-        else {
-            // Open in r+ so that we don't truncate the file, just in case
-            // there is an error and we write no bytes, we leave the file
-            // unmodified
-            outfile = fopen(filename, "r+" FOPEN_EXTRA_FLAGS);
-        }
-        
-        if (!outfile) {
-            fprintf(stderr, "\nERROR: Failed to open output file %s: ",
-                    filename);
-            perror(0);
-            exit(-1);
-        }
-    }
-    else if (showResponsePropertiesG) {
-        fprintf(stderr, "\nERROR: getacl -s requires a filename parameter\n");
-        usageExit(stderr);
-    }
-    else {
-        outfile = stdout;
-    }
-
-    int aclGrantCount;
-    S3AclGrant aclGrants[S3_MAX_ACL_GRANT_COUNT];
-    char ownerId[S3_MAX_GRANTEE_USER_ID_SIZE];
-    char ownerDisplayName[S3_MAX_GRANTEE_DISPLAY_NAME_SIZE];
-
-    S3_init();
-
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3ResponseHandler responseHandler =
-    {
-        &responsePropertiesCallback,
-        &responseCompleteCallback
-    };
-
-    do {
-        S3_get_acl(&bucketContext, key, ownerId, ownerDisplayName, 
-                   &aclGrantCount, aclGrants, 0, &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (statusG == S3StatusOK) {
-        fprintf(outfile, "OwnerID %s %s\n", ownerId, ownerDisplayName);
-        fprintf(outfile, "%-6s  %-90s  %-12s\n", " Type", 
-                "                                   User Identifier",
-                " Permission");
-        fprintf(outfile, "------  "
-                "------------------------------------------------------------"
-                "------------------------------  ------------\n");
-        int i;
-        for (i = 0; i < aclGrantCount; i++) {
-            S3AclGrant *grant = &(aclGrants[i]);
-            const char *type;
-            char composedId[S3_MAX_GRANTEE_USER_ID_SIZE + 
-                            S3_MAX_GRANTEE_DISPLAY_NAME_SIZE + 16];
-            const char *id;
-
-            switch (grant->granteeType) {
-            case S3GranteeTypeAmazonCustomerByEmail:
-                type = "Email";
-                id = grant->grantee.amazonCustomerByEmail.emailAddress;
-                break;
-            case S3GranteeTypeCanonicalUser:
-                type = "UserID";
-                snprintf(composedId, sizeof(composedId),
-                         "%s (%s)", grant->grantee.canonicalUser.id,
-                         grant->grantee.canonicalUser.displayName);
-                id = composedId;
-                break;
-            case S3GranteeTypeAllAwsUsers:
-                type = "Group";
-                id = "Authenticated AWS Users";
-                break;
-            case S3GranteeTypeAllUsers:
-                type = "Group";
-                id = "All Users";
-                break;
-            default:
-                type = "Group";
-                id = "Log Delivery";
-                break;
-            }
-            const char *perm;
-            switch (grant->permission) {
-            case S3PermissionRead:
-                perm = "READ";
-                break;
-            case S3PermissionWrite:
-                perm = "WRITE";
-                break;
-            case S3PermissionReadACP:
-                perm = "READ_ACP";
-                break;
-            case S3PermissionWriteACP:
-                perm = "WRITE_ACP";
-                break;
-            default:
-                perm = "FULL_CONTROL";
-                break;
-            }
-            fprintf(outfile, "%-6s  %-90s  %-12s\n", type, id, perm);
-        }
-    }
-    else {
-        printError();
-    }
-
-    fclose(outfile);
-
-    S3_deinitialize();
-}
-
-
-// set acl -------------------------------------------------------------------
-
-void set_acl(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket[/key]\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex];
-    const char *key = 0;
-
-    // Split bucket/key
-    char *slash = argv[optindex++];
-    while (*slash && (*slash != '/')) {
-        slash++;
-    }
-    if (*slash) {
-        *slash++ = 0;
-        key = slash;
-    }
-    else {
-        key = 0;
-    }
-
-    const char *filename = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, FILENAME_PREFIX, FILENAME_PREFIX_LEN)) {
-            filename = &(param[FILENAME_PREFIX_LEN]);
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    FILE *infile;
-
-    if (filename) {
-        if (!(infile = fopen(filename, "r" FOPEN_EXTRA_FLAGS))) {
-            fprintf(stderr, "\nERROR: Failed to open input file %s: ",
-                    filename);
-            perror(0);
-            exit(-1);
-        }
-    }
-    else {
-        infile = stdin;
-    }
-
-    // Read in the complete ACL
-    char aclBuf[65536];
-    aclBuf[fread(aclBuf, 1, sizeof(aclBuf), infile)] = 0;
-    char ownerId[S3_MAX_GRANTEE_USER_ID_SIZE];
-    char ownerDisplayName[S3_MAX_GRANTEE_DISPLAY_NAME_SIZE];
-    
-    // Parse it
-    int aclGrantCount;
-    S3AclGrant aclGrants[S3_MAX_ACL_GRANT_COUNT];
-    if (!convert_simple_acl(aclBuf, ownerId, ownerDisplayName,
-                            &aclGrantCount, aclGrants)) {
-        fprintf(stderr, "\nERROR: Failed to parse ACLs\n");
-        fclose(infile);
-        exit(-1);
-    }
-
-    S3_init();
-
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3ResponseHandler responseHandler =
-    {
-        &responsePropertiesCallback,
-        &responseCompleteCallback
-    };
-
-    do {
-        S3_set_acl(&bucketContext, key, ownerId, ownerDisplayName,
-                   aclGrantCount, aclGrants, 0, &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-    
-    if (statusG != S3StatusOK) {
-        printError();
-    }
-
-    fclose(infile);
-
-    S3_deinitialize();
-}
-
-
-// get logging ----------------------------------------------------------------
-
-void get_logging(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex++];
-    const char *filename = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, FILENAME_PREFIX, FILENAME_PREFIX_LEN)) {
-            filename = &(param[FILENAME_PREFIX_LEN]);
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    FILE *outfile = 0;
-
-    if (filename) {
-        // Stat the file, and if it doesn't exist, open it in w mode
-        struct stat buf;
-        if (stat(filename, &buf) == -1) {
-            outfile = fopen(filename, "w" FOPEN_EXTRA_FLAGS);
-        }
-        else {
-            // Open in r+ so that we don't truncate the file, just in case
-            // there is an error and we write no bytes, we leave the file
-            // unmodified
-            outfile = fopen(filename, "r+" FOPEN_EXTRA_FLAGS);
-        }
-        
-        if (!outfile) {
-            fprintf(stderr, "\nERROR: Failed to open output file %s: ",
-                    filename);
-            perror(0);
-            exit(-1);
-        }
-    }
-    else if (showResponsePropertiesG) {
-        fprintf(stderr, "\nERROR: getlogging -s requires a filename "
-                "parameter\n");
-        usageExit(stderr);
-    }
-    else {
-        outfile = stdout;
-    }
-
-    int aclGrantCount;
-    S3AclGrant aclGrants[S3_MAX_ACL_GRANT_COUNT];
-    char targetBucket[S3_MAX_BUCKET_NAME_SIZE];
-    char targetPrefix[S3_MAX_KEY_SIZE];
-
-    S3_init();
-
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3ResponseHandler responseHandler =
-    {
-        &responsePropertiesCallback,
-        &responseCompleteCallback
-    };
-
-    do {
-        S3_get_server_access_logging(&bucketContext, targetBucket, targetPrefix,
-                                     &aclGrantCount, aclGrants, 0, 
-                                     &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-
-    if (statusG == S3StatusOK) {
-        if (targetBucket[0]) {
-            printf("Target Bucket: %s\n", targetBucket);
-            if (targetPrefix[0]) {
-                printf("Target Prefix: %s\n", targetPrefix);
-            }
-            fprintf(outfile, "%-6s  %-90s  %-12s\n", " Type", 
-                    "                                   User Identifier",
-                    " Permission");
-            fprintf(outfile, "------  "
-                    "---------------------------------------------------------"
-                    "---------------------------------  ------------\n");
-            int i;
-            for (i = 0; i < aclGrantCount; i++) {
-                S3AclGrant *grant = &(aclGrants[i]);
-                const char *type;
-                char composedId[S3_MAX_GRANTEE_USER_ID_SIZE + 
-                                S3_MAX_GRANTEE_DISPLAY_NAME_SIZE + 16];
-                const char *id;
-                
-                switch (grant->granteeType) {
-                case S3GranteeTypeAmazonCustomerByEmail:
-                    type = "Email";
-                    id = grant->grantee.amazonCustomerByEmail.emailAddress;
-                    break;
-                case S3GranteeTypeCanonicalUser:
-                    type = "UserID";
-                    snprintf(composedId, sizeof(composedId),
-                             "%s (%s)", grant->grantee.canonicalUser.id,
-                             grant->grantee.canonicalUser.displayName);
-                    id = composedId;
-                    break;
-                case S3GranteeTypeAllAwsUsers:
-                    type = "Group";
-                    id = "Authenticated AWS Users";
-                    break;
-                default:
-                    type = "Group";
-                    id = "All Users";
-                    break;
-                }
-                const char *perm;
-                switch (grant->permission) {
-                case S3PermissionRead:
-                    perm = "READ";
-                    break;
-                case S3PermissionWrite:
-                    perm = "WRITE";
-                    break;
-                case S3PermissionReadACP:
-                    perm = "READ_ACP";
-                    break;
-                case S3PermissionWriteACP:
-                    perm = "WRITE_ACP";
-                    break;
-                default:
-                    perm = "FULL_CONTROL";
-                    break;
-                }
-                fprintf(outfile, "%-6s  %-90s  %-12s\n", type, id, perm);
-            }
-        }
-        else {
-            printf("Service logging is not enabled for this bucket.\n");
-        }
-    }
-    else {
-        printError();
-    }
-
-    fclose(outfile);
-
-    S3_deinitialize();
-}
-
-
-// set logging ----------------------------------------------------------------
-
-void set_logging(int argc, char **argv, int optindex)
-{
-    if (optindex == argc) {
-        fprintf(stderr, "\nERROR: Missing parameter: bucket\n");
-        usageExit(stderr);
-    }
-
-    const char *bucketName = argv[optindex++];
-
-    const char *targetBucket = 0, *targetPrefix = 0, *filename = 0;
-
-    while (optindex < argc) {
-        char *param = argv[optindex++];
-        if (!strncmp(param, TARGET_BUCKET_PREFIX, TARGET_BUCKET_PREFIX_LEN)) {
-            targetBucket = &(param[TARGET_BUCKET_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, TARGET_PREFIX_PREFIX, 
-                          TARGET_PREFIX_PREFIX_LEN)) {
-            targetPrefix = &(param[TARGET_PREFIX_PREFIX_LEN]);
-        }
-        else if (!strncmp(param, FILENAME_PREFIX, FILENAME_PREFIX_LEN)) {
-            filename = &(param[FILENAME_PREFIX_LEN]);
-        }
-        else {
-            fprintf(stderr, "\nERROR: Unknown param: %s\n", param);
-            usageExit(stderr);
-        }
-    }
-
-    int aclGrantCount = 0;
-    S3AclGrant aclGrants[S3_MAX_ACL_GRANT_COUNT];
-
-    if (targetBucket) {
-        FILE *infile;
-        
-        if (filename) {
-            if (!(infile = fopen(filename, "r" FOPEN_EXTRA_FLAGS))) {
-                fprintf(stderr, "\nERROR: Failed to open input file %s: ",
-                        filename);
-                perror(0);
-                exit(-1);
-            }
-        }
-        else {
-            infile = stdin;
-        }
-
-        // Read in the complete ACL
-        char aclBuf[65536];
-        aclBuf[fread(aclBuf, 1, sizeof(aclBuf), infile)] = 0;
-        char ownerId[S3_MAX_GRANTEE_USER_ID_SIZE];
-        char ownerDisplayName[S3_MAX_GRANTEE_DISPLAY_NAME_SIZE];
-        
-        // Parse it
-        if (!convert_simple_acl(aclBuf, ownerId, ownerDisplayName,
-                                &aclGrantCount, aclGrants)) {
-            fprintf(stderr, "\nERROR: Failed to parse ACLs\n");
-            fclose(infile);
-            exit(-1);
-        }
-
-        fclose(infile);
-    }
-
-    S3_init();
-
-    S3BucketContext bucketContext =
-    {
-        0,
-        bucketName,
-        protocolG,
-        uriStyleG,
-        accessKeyIdG,
-        secretAccessKeyG
-    };
-
-    S3ResponseHandler responseHandler =
-    {
-        &responsePropertiesCallback,
-        &responseCompleteCallback
-    };
-
-    do {
-        S3_set_server_access_logging(&bucketContext, targetBucket, 
-                                     targetPrefix, aclGrantCount, aclGrants, 
-                                     0, &responseHandler, 0);
-    } while (S3_status_is_retryable(statusG) && should_retry());
-    
-    if (statusG != S3StatusOK) {
-        printError();
-    }
-
-    S3_deinitialize();
-}
-
-
-// main ----------------------------------------------------------------------
-
-int main(int argc, char **argv)
-{
-    // Parse args
-    while (1) {
-        int idx = 0;
-        int c = getopt_long(argc, argv, "fhusr:", longOptionsG, &idx);
-
-        if (c == -1) {
-            // End of options
-            break;
-        }
-
-        switch (c) {
-        case 'f':
-            forceG = 1;
-            break;
-        case 'h':
-            uriStyleG = S3UriStyleVirtualHost;
-            break;
-        case 'u':
-            protocolG = S3ProtocolHTTP;
-            break;
-        case 's':
-            showResponsePropertiesG = 1;
-            break;
-        case 'r': {
-            const char *v = optarg;
-            retriesG = 0;
-            while (*v) {
-                retriesG *= 10;
-                retriesG += *v - '0';
-                v++;
-            }
-            break;
-        }
-        default:
-            fprintf(stderr, "\nERROR: Unknown option: -%c\n", c);
-            // Usage exit
-            usageExit(stderr);
-        }
-    }
-
-    // The first non-option argument gives the operation to perform
-    if (optind == argc) {
-        fprintf(stderr, "\n\nERROR: Missing argument: command\n\n");
-        usageExit(stderr);
-    }
-
-    const char *command = argv[optind++];
-    
-    if (!strcmp(command, "help")) {
-        fprintf(stdout, "\ns3 is a program for performing single requests "
-                "to Amazon S3.\n");
-        usageExit(stdout);
-    }
-
-    accessKeyIdG = getenv("S3_ACCESS_KEY_ID");
-    if (!accessKeyIdG) {
-        fprintf(stderr, "Missing environment variable: S3_ACCESS_KEY_ID\n");
-        return -1;
-    }
-    secretAccessKeyG = getenv("S3_SECRET_ACCESS_KEY");
-    if (!secretAccessKeyG) {
-        fprintf(stderr, 
-                "Missing environment variable: S3_SECRET_ACCESS_KEY\n");
-        return -1;
-    }
-
-    if (!strcmp(command, "list")) {
-        list(argc, argv, optind);
-    }
-    else if (!strcmp(command, "test")) {
-        test_bucket(argc, argv, optind);
-    }
-    else if (!strcmp(command, "create")) {
-        create_bucket(argc, argv, optind);
-    }
-    else if (!strcmp(command, "delete")) {
-        if (optind == argc) {
-            fprintf(stderr, 
-                    "\nERROR: Missing parameter: bucket or bucket/key\n");
-            usageExit(stderr);
-        }
-        char *val = argv[optind];
-        int hasSlash = 0;
-        while (*val) {
-            if (*val++ == '/') {
-                hasSlash = 1;
-                break;
-            }
-        }
-        if (hasSlash) {
-            delete_object(argc, argv, optind);
-        }
-        else {
-            delete_bucket(argc, argv, optind);
-        }
-    }
-    else if (!strcmp(command, "put")) {
-        put_object(argc, argv, optind);
-    }
-    else if (!strcmp(command, "copy")) {
-        copy_object(argc, argv, optind);
-    }
-    else if (!strcmp(command, "get")) {
-        get_object(argc, argv, optind);
-    }
-    else if (!strcmp(command, "head")) {
-        head_object(argc, argv, optind);
-    }
-    else if (!strcmp(command, "gqs")) {
-        generate_query_string(argc, argv, optind);
-    }
-    else if (!strcmp(command, "getacl")) {
-        get_acl(argc, argv, optind);
-    }
-    else if (!strcmp(command, "setacl")) {
-        set_acl(argc, argv, optind);
-    }
-    else if (!strcmp(command, "getlogging")) {
-        get_logging(argc, argv, optind);
-    }
-    else if (!strcmp(command, "setlogging")) {
-        set_logging(argc, argv, optind);
-    }
-    else {
-        fprintf(stderr, "Unknown command: %s\n", command);
-        return -1;
-    }
-
-    return 0;
-}
diff --git a/src/libs3/src/service.c b/src/libs3/src/service.c
deleted file mode 100644
index 2d1e038..0000000
--- a/src/libs3/src/service.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/** **************************************************************************
- * service.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <ctype.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include "request.h"
-
-
-typedef struct XmlCallbackData
-{
-    SimpleXml simpleXml;
-    
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ListServiceCallback *listServiceCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    string_buffer(ownerId, 256);
-    string_buffer(ownerDisplayName, 256);
-    string_buffer(bucketName, 256);
-    string_buffer(creationDate, 128);
-} XmlCallbackData;
-
-
-static S3Status xmlCallback(const char *elementPath, const char *data,
-                            int dataLen, void *callbackData)
-{
-    XmlCallbackData *cbData = (XmlCallbackData *) callbackData;
-
-    int fit;
-
-    if (data) {
-        if (!strcmp(elementPath, "ListAllMyBucketsResult/Owner/ID")) {
-            string_buffer_append(cbData->ownerId, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, 
-                         "ListAllMyBucketsResult/Owner/DisplayName")) {
-            string_buffer_append(cbData->ownerDisplayName, data, dataLen, fit);
-        }
-        else if (!strcmp(elementPath, 
-                         "ListAllMyBucketsResult/Buckets/Bucket/Name")) {
-            string_buffer_append(cbData->bucketName, data, dataLen, fit);
-        }
-        else if (!strcmp
-                 (elementPath, 
-                  "ListAllMyBucketsResult/Buckets/Bucket/CreationDate")) {
-            string_buffer_append(cbData->creationDate, data, dataLen, fit);
-        }
-    }
-    else {
-        if (!strcmp(elementPath, "ListAllMyBucketsResult/Buckets/Bucket")) {
-            // Parse date.  Assume ISO-8601 date format.
-            time_t creationDate = parseIso8601Time(cbData->creationDate);
-
-            // Make the callback - a bucket just finished
-            S3Status status = (*(cbData->listServiceCallback))
-                (cbData->ownerId, cbData->ownerDisplayName,
-                 cbData->bucketName, creationDate, cbData->callbackData);
-
-            string_buffer_initialize(cbData->bucketName);
-            string_buffer_initialize(cbData->creationDate);
-
-            return status;
-        }
-    }
-
-    /* Avoid compiler error about variable set but not used */
-    (void) fit;
-
-    return S3StatusOK;
-}
-
-
-static S3Status propertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    XmlCallbackData *cbData = (XmlCallbackData *) callbackData;
-    
-    return (*(cbData->responsePropertiesCallback))
-        (responseProperties, cbData->callbackData);
-}
-
-
-static S3Status dataCallback(int bufferSize, const char *buffer,
-                             void *callbackData)
-{
-    XmlCallbackData *cbData = (XmlCallbackData *) callbackData;
-
-    return simplexml_add(&(cbData->simpleXml), buffer, bufferSize);
-}
-
-
-static void completeCallback(S3Status requestStatus,
-                             const S3ErrorDetails *s3ErrorDetails,
-                             void *callbackData)
-{
-    XmlCallbackData *cbData = (XmlCallbackData *) callbackData;
-
-    (*(cbData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, cbData->callbackData);
-
-    simplexml_deinitialize(&(cbData->simpleXml));
-
-    free(cbData);
-}
-
-
-void S3_list_service(S3Protocol protocol, const char *accessKeyId,
-                     const char *secretAccessKey, const char *hostName,
-                     S3RequestContext *requestContext,
-                     const S3ListServiceHandler *handler, void *callbackData)
-{
-    // Create and set up the callback data
-    XmlCallbackData *data = 
-        (XmlCallbackData *) malloc(sizeof(XmlCallbackData));
-    if (!data) {
-        (*(handler->responseHandler.completeCallback))
-            (S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    simplexml_initialize(&(data->simpleXml), &xmlCallback, data);
-
-    data->responsePropertiesCallback =
-        handler->responseHandler.propertiesCallback;
-    data->listServiceCallback = handler->listServiceCallback;
-    data->responseCompleteCallback = handler->responseHandler.completeCallback;
-    data->callbackData = callbackData;
-
-    string_buffer_initialize(data->ownerId);
-    string_buffer_initialize(data->ownerDisplayName);
-    string_buffer_initialize(data->bucketName);
-    string_buffer_initialize(data->creationDate);
-    
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeGET,                           // httpRequestType
-        { hostName,                                   // hostName
-          0,                                          // bucketName
-          protocol,                                   // protocol
-          S3UriStylePath,                             // uriStyle
-          accessKeyId,                                // accessKeyId
-          secretAccessKey },                          // secretAccessKey
-        0,                                            // key
-        0,                                            // queryParams
-        0,                                            // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // requestProperties
-        &propertiesCallback,                          // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        &dataCallback,                                // fromS3Callback
-        &completeCallback,                            // completeCallback
-        data                                          // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
diff --git a/src/libs3/src/service_access_logging.c b/src/libs3/src/service_access_logging.c
deleted file mode 100644
index 22c6e40..0000000
--- a/src/libs3/src/service_access_logging.c
+++ /dev/null
@@ -1,555 +0,0 @@
-/** **************************************************************************
- * server_access_logging.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <stdlib.h>
-#include <string.h>
-#include "libs3.h"
-#include "request.h"
-
-
-// get server access logging---------------------------------------------------
-
-typedef struct ConvertBlsData
-{
-    char *targetBucketReturn;
-    int targetBucketReturnLen;
-    char *targetPrefixReturn;
-    int targetPrefixReturnLen;
-    int *aclGrantCountReturn;
-    S3AclGrant *aclGrants;
-
-    string_buffer(emailAddress, S3_MAX_GRANTEE_EMAIL_ADDRESS_SIZE);
-    string_buffer(userId, S3_MAX_GRANTEE_USER_ID_SIZE);
-    string_buffer(userDisplayName, S3_MAX_GRANTEE_DISPLAY_NAME_SIZE);
-    string_buffer(groupUri, 128);
-    string_buffer(permission, 32);
-} ConvertBlsData;
-
-
-static S3Status convertBlsXmlCallback(const char *elementPath,
-                                      const char *data, int dataLen,
-                                      void *callbackData)
-{
-    ConvertBlsData *caData = (ConvertBlsData *) callbackData;
-
-    int fit;
-
-    if (data) {
-        if (!strcmp(elementPath, "BucketLoggingStatus/LoggingEnabled/"
-                    "TargetBucket")) {
-            caData->targetBucketReturnLen += 
-                snprintf(&(caData->targetBucketReturn
-                           [caData->targetBucketReturnLen]),
-                         255 - caData->targetBucketReturnLen - 1, 
-                         "%.*s", dataLen, data);
-            if (caData->targetBucketReturnLen >= 255) {
-                return S3StatusTargetBucketTooLong;
-            }
-        }
-        else if (!strcmp(elementPath, "BucketLoggingStatus/LoggingEnabled/"
-                    "TargetPrefix")) {
-            caData->targetPrefixReturnLen += 
-                snprintf(&(caData->targetPrefixReturn
-                           [caData->targetPrefixReturnLen]),
-                         255 - caData->targetPrefixReturnLen - 1, 
-                         "%.*s", dataLen, data);
-            if (caData->targetPrefixReturnLen >= 255) {
-                return S3StatusTargetPrefixTooLong;
-            }
-        }
-        else if (!strcmp(elementPath, "BucketLoggingStatus/LoggingEnabled/"
-                         "TargetGrants/Grant/Grantee/EmailAddress")) {
-            // AmazonCustomerByEmail
-            string_buffer_append(caData->emailAddress, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusEmailAddressTooLong;
-            }
-        }
-        else if (!strcmp(elementPath,
-                         "AccessControlPolicy/AccessControlList/Grant/"
-                         "Grantee/ID")) {
-            // CanonicalUser
-            string_buffer_append(caData->userId, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusUserIdTooLong;
-            }
-        }
-        else if (!strcmp(elementPath, "BucketLoggingStatus/LoggingEnabled/"
-                         "TargetGrants/Grant/Grantee/DisplayName")) {
-            // CanonicalUser
-            string_buffer_append(caData->userDisplayName, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusUserDisplayNameTooLong;
-            }
-        }
-        else if (!strcmp(elementPath, "BucketLoggingStatus/LoggingEnabled/"
-                         "TargetGrants/Grant/Grantee/URI")) {
-            // Group
-            string_buffer_append(caData->groupUri, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusGroupUriTooLong;
-            }
-        }
-        else if (!strcmp(elementPath, "BucketLoggingStatus/LoggingEnabled/"
-                         "TargetGrants/Grant/Permission")) {
-            // Permission
-            string_buffer_append(caData->permission, data, dataLen, fit);
-            if (!fit) {
-                return S3StatusPermissionTooLong;
-            }
-        }
-    }
-    else {
-        if (!strcmp(elementPath, "BucketLoggingStatus/LoggingEnabled/"
-                    "TargetGrants/Grant")) {
-            // A grant has just been completed; so add the next S3AclGrant
-            // based on the values read
-            if (*(caData->aclGrantCountReturn) == S3_MAX_ACL_GRANT_COUNT) {
-                return S3StatusTooManyGrants;
-            }
-
-            S3AclGrant *grant = &(caData->aclGrants
-                                  [*(caData->aclGrantCountReturn)]);
-
-            if (caData->emailAddress[0]) {
-                grant->granteeType = S3GranteeTypeAmazonCustomerByEmail;
-                strcpy(grant->grantee.amazonCustomerByEmail.emailAddress,
-                       caData->emailAddress);
-            }
-            else if (caData->userId[0] && caData->userDisplayName[0]) {
-                grant->granteeType = S3GranteeTypeCanonicalUser;
-                strcpy(grant->grantee.canonicalUser.id, caData->userId);
-                strcpy(grant->grantee.canonicalUser.displayName, 
-                       caData->userDisplayName);
-            }
-            else if (caData->groupUri[0]) {
-                if (!strcmp(caData->groupUri,
-                            ACS_GROUP_AWS_USERS)) {
-                    grant->granteeType = S3GranteeTypeAllAwsUsers;
-                }
-                else if (!strcmp(caData->groupUri,
-                                 ACS_GROUP_ALL_USERS)) {
-                    grant->granteeType = S3GranteeTypeAllUsers;
-                }
-                else {
-                    return S3StatusBadGrantee;
-                }
-            }
-            else {
-                return S3StatusBadGrantee;
-            }
-
-            if (!strcmp(caData->permission, "READ")) {
-                grant->permission = S3PermissionRead;
-            }
-            else if (!strcmp(caData->permission, "WRITE")) {
-                grant->permission = S3PermissionWrite;
-            }
-            else if (!strcmp(caData->permission, "READ_ACP")) {
-                grant->permission = S3PermissionReadACP;
-            }
-            else if (!strcmp(caData->permission, "WRITE_ACP")) {
-                grant->permission = S3PermissionWriteACP;
-            }
-            else if (!strcmp(caData->permission, "FULL_CONTROL")) {
-                grant->permission = S3PermissionFullControl;
-            }
-            else {
-                return S3StatusBadPermission;
-            }
-
-            (*(caData->aclGrantCountReturn))++;
-
-            string_buffer_initialize(caData->emailAddress);
-            string_buffer_initialize(caData->userId);
-            string_buffer_initialize(caData->userDisplayName);
-            string_buffer_initialize(caData->groupUri);
-            string_buffer_initialize(caData->permission);
-        }
-    }
-
-    return S3StatusOK;
-}
-
-
-static S3Status convert_bls(char *blsXml, char *targetBucketReturn,
-                            char *targetPrefixReturn, int *aclGrantCountReturn,
-                            S3AclGrant *aclGrants)
-{
-    ConvertBlsData data;
-
-    data.targetBucketReturn = targetBucketReturn;
-    data.targetBucketReturn[0] = 0;
-    data.targetBucketReturnLen = 0;
-    data.targetPrefixReturn = targetPrefixReturn;
-    data.targetPrefixReturn[0] = 0;
-    data.targetPrefixReturnLen = 0;
-    data.aclGrantCountReturn = aclGrantCountReturn;
-    data.aclGrants = aclGrants;
-    *aclGrantCountReturn = 0;
-    string_buffer_initialize(data.emailAddress);
-    string_buffer_initialize(data.userId);
-    string_buffer_initialize(data.userDisplayName);
-    string_buffer_initialize(data.groupUri);
-    string_buffer_initialize(data.permission);
-
-    // Use a simplexml parser
-    SimpleXml simpleXml;
-    simplexml_initialize(&simpleXml, &convertBlsXmlCallback, &data);
-
-    S3Status status = simplexml_add(&simpleXml, blsXml, strlen(blsXml));
-
-    simplexml_deinitialize(&simpleXml);
-                                          
-    return status;
-}
-
-
-// Use a rather arbitrary max size for the document of 64K
-#define BLS_XML_DOC_MAXSIZE (64 * 1024)
-
-
-typedef struct GetBlsData
-{
-    SimpleXml simpleXml;
-
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    char *targetBucketReturn;
-    char *targetPrefixReturn;
-    int *aclGrantCountReturn;
-    S3AclGrant *aclGrants;
-    string_buffer(blsXmlDocument, BLS_XML_DOC_MAXSIZE);
-} GetBlsData;
-
-
-static S3Status getBlsPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    GetBlsData *gsData = (GetBlsData *) callbackData;
-    
-    return (*(gsData->responsePropertiesCallback))
-        (responseProperties, gsData->callbackData);
-}
-
-
-static S3Status getBlsDataCallback(int bufferSize, const char *buffer,
-                                   void *callbackData)
-{
-    GetBlsData *gsData = (GetBlsData *) callbackData;
-
-    int fit;
-
-    string_buffer_append(gsData->blsXmlDocument, buffer, bufferSize, fit);
-    
-    return fit ? S3StatusOK : S3StatusXmlDocumentTooLarge;
-}
-
-
-static void getBlsCompleteCallback(S3Status requestStatus, 
-                                   const S3ErrorDetails *s3ErrorDetails,
-                                   void *callbackData)
-{
-    GetBlsData *gsData = (GetBlsData *) callbackData;
-
-    if (requestStatus == S3StatusOK) {
-        // Parse the document
-        requestStatus = convert_bls
-            (gsData->blsXmlDocument, gsData->targetBucketReturn,
-             gsData->targetPrefixReturn, gsData->aclGrantCountReturn, 
-             gsData->aclGrants);
-    }
-
-    (*(gsData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, gsData->callbackData);
-
-    free(gsData);
-}
-
-
-void S3_get_server_access_logging(const S3BucketContext *bucketContext,
-                                  char *targetBucketReturn,
-                                  char *targetPrefixReturn,
-                                  int *aclGrantCountReturn, 
-                                  S3AclGrant *aclGrants,
-                                  S3RequestContext *requestContext,
-                                  const S3ResponseHandler *handler,
-                                  void *callbackData)
-{
-    // Create the callback data
-    GetBlsData *gsData = (GetBlsData *) malloc(sizeof(GetBlsData));
-    if (!gsData) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-
-    gsData->responsePropertiesCallback = handler->propertiesCallback;
-    gsData->responseCompleteCallback = handler->completeCallback;
-    gsData->callbackData = callbackData;
-
-    gsData->targetBucketReturn = targetBucketReturn;
-    gsData->targetPrefixReturn = targetPrefixReturn;
-    gsData->aclGrantCountReturn = aclGrantCountReturn;
-    gsData->aclGrants = aclGrants;
-    string_buffer_initialize(gsData->blsXmlDocument);
-    *aclGrantCountReturn = 0;
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypeGET,                           // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        0,                                            // key
-        0,                                            // queryParams
-        "logging",                                    // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        &getBlsPropertiesCallback,                    // propertiesCallback
-        0,                                            // toS3Callback
-        0,                                            // toS3CallbackTotalSize
-        &getBlsDataCallback,                          // fromS3Callback
-        &getBlsCompleteCallback,                      // completeCallback
-        gsData                                        // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
-
-
-
-// set server access logging---------------------------------------------------
-
-static S3Status generateSalXmlDocument(const char *targetBucket,
-                                       const char *targetPrefix,
-                                       int aclGrantCount, 
-                                       const S3AclGrant *aclGrants,
-                                       int *xmlDocumentLenReturn,
-                                       char *xmlDocument,
-                                       int xmlDocumentBufferSize)
-{
-    *xmlDocumentLenReturn = 0;
-
-#define append(fmt, ...)                                        \
-    do {                                                        \
-        *xmlDocumentLenReturn += snprintf                       \
-            (&(xmlDocument[*xmlDocumentLenReturn]),             \
-             xmlDocumentBufferSize - *xmlDocumentLenReturn - 1, \
-             fmt, __VA_ARGS__);                                 \
-        if (*xmlDocumentLenReturn >= xmlDocumentBufferSize) {   \
-            return S3StatusXmlDocumentTooLarge;                 \
-        } \
-    } while (0)
-
-    append("%s", "<BucketLoggingStatus "
-           "xmlns=\"http://doc.s3.amazonaws.com/2006-03-01\">");
-
-    if (targetBucket && targetBucket[0]) {
-        append("<LoggingEnabled><TargetBucket>%s</TargetBucket>", targetBucket);
-        append("<TargetPrefix>%s</TargetPrefix>", 
-               targetPrefix ? targetPrefix : "");
-
-        if (aclGrantCount) {
-            append("%s", "<TargetGrants>");
-            int i;
-            for (i = 0; i < aclGrantCount; i++) {
-                append("%s", "<Grant><Grantee "
-                       "xmlns:xsi=\"http://www.w3.org/2001/"
-                       "XMLSchema-instance\" xsi:type=\"");
-                const S3AclGrant *grant = &(aclGrants[i]);
-                switch (grant->granteeType) {
-                case S3GranteeTypeAmazonCustomerByEmail:
-                    append("AmazonCustomerByEmail\"><EmailAddress>%s"
-                           "</EmailAddress>",
-                           grant->grantee.amazonCustomerByEmail.emailAddress);
-                    break;
-                case S3GranteeTypeCanonicalUser:
-                    append("CanonicalUser\"><ID>%s</ID><DisplayName>%s"
-                           "</DisplayName>",
-                           grant->grantee.canonicalUser.id, 
-                           grant->grantee.canonicalUser.displayName);
-                    break;
-                default: // case S3GranteeTypeAllAwsUsers/S3GranteeTypeAllUsers:
-                    append("Group\"><URI>%s</URI>",
-                           (grant->granteeType == S3GranteeTypeAllAwsUsers) ?
-                           ACS_GROUP_AWS_USERS : ACS_GROUP_ALL_USERS);
-                    break;
-                }
-                append("</Grantee><Permission>%s</Permission></Grant>",
-                       ((grant->permission == S3PermissionRead) ? "READ" :
-                        (grant->permission == S3PermissionWrite) ? "WRITE" :
-                        (grant->permission == 
-                         S3PermissionReadACP) ? "READ_ACP" :
-                        (grant->permission == 
-                         S3PermissionWriteACP) ? "WRITE_ACP" : "FULL_CONTROL"));
-            }
-            append("%s", "</TargetGrants>");
-        }
-        append("%s", "</LoggingEnabled>");
-    }
-
-    append("%s", "</BucketLoggingStatus>");
-
-    return S3StatusOK;
-}
-
-
-typedef struct SetSalData
-{
-    S3ResponsePropertiesCallback *responsePropertiesCallback;
-    S3ResponseCompleteCallback *responseCompleteCallback;
-    void *callbackData;
-
-    int salXmlDocumentLen;
-    char salXmlDocument[BLS_XML_DOC_MAXSIZE];
-    int salXmlDocumentBytesWritten;
-
-} SetSalData;
-
-
-static S3Status setSalPropertiesCallback
-    (const S3ResponseProperties *responseProperties, void *callbackData)
-{
-    SetSalData *paData = (SetSalData *) callbackData;
-    
-    return (*(paData->responsePropertiesCallback))
-        (responseProperties, paData->callbackData);
-}
-
-
-static int setSalDataCallback(int bufferSize, char *buffer, void *callbackData)
-{
-    SetSalData *paData = (SetSalData *) callbackData;
-
-    int remaining = (paData->salXmlDocumentLen - 
-                     paData->salXmlDocumentBytesWritten);
-
-    int toCopy = bufferSize > remaining ? remaining : bufferSize;
-    
-    if (!toCopy) {
-        return 0;
-    }
-
-    memcpy(buffer, &(paData->salXmlDocument
-                     [paData->salXmlDocumentBytesWritten]), toCopy);
-
-    paData->salXmlDocumentBytesWritten += toCopy;
-
-    return toCopy;
-}
-
-
-static void setSalCompleteCallback(S3Status requestStatus, 
-                                   const S3ErrorDetails *s3ErrorDetails,
-                                   void *callbackData)
-{
-    SetSalData *paData = (SetSalData *) callbackData;
-
-    (*(paData->responseCompleteCallback))
-        (requestStatus, s3ErrorDetails, paData->callbackData);
-
-    free(paData);
-}
-
-
-void S3_set_server_access_logging(const S3BucketContext *bucketContext,
-                                  const char *targetBucket, 
-                                  const char *targetPrefix, int aclGrantCount, 
-                                  const S3AclGrant *aclGrants, 
-                                  S3RequestContext *requestContext,
-                                  const S3ResponseHandler *handler,
-                                  void *callbackData)
-{
-    if (aclGrantCount > S3_MAX_ACL_GRANT_COUNT) {
-        (*(handler->completeCallback))
-            (S3StatusTooManyGrants, 0, callbackData);
-        return;
-    }
-
-    SetSalData *data = (SetSalData *) malloc(sizeof(SetSalData));
-    if (!data) {
-        (*(handler->completeCallback))(S3StatusOutOfMemory, 0, callbackData);
-        return;
-    }
-    
-    // Convert aclGrants to XML document
-    S3Status status = generateSalXmlDocument
-        (targetBucket, targetPrefix, aclGrantCount, aclGrants,
-         &(data->salXmlDocumentLen), data->salXmlDocument, 
-         sizeof(data->salXmlDocument));
-    if (status != S3StatusOK) {
-        free(data);
-        (*(handler->completeCallback))(status, 0, callbackData);
-        return;
-    }
-
-    data->responsePropertiesCallback = handler->propertiesCallback;
-    data->responseCompleteCallback = handler->completeCallback;
-    data->callbackData = callbackData;
-
-    data->salXmlDocumentBytesWritten = 0;
-
-    // Set up the RequestParams
-    RequestParams params =
-    {
-        HttpRequestTypePUT,                           // httpRequestType
-        { bucketContext->hostName,                    // hostName
-          bucketContext->bucketName,                  // bucketName
-          bucketContext->protocol,                    // protocol
-          bucketContext->uriStyle,                    // uriStyle
-          bucketContext->accessKeyId,                 // accessKeyId
-          bucketContext->secretAccessKey },           // secretAccessKey
-        0,                                            // key
-        0,                                            // queryParams
-        "logging",                                    // subResource
-        0,                                            // copySourceBucketName
-        0,                                            // copySourceKey
-        0,                                            // getConditions
-        0,                                            // startByte
-        0,                                            // byteCount
-        0,                                            // putProperties
-        &setSalPropertiesCallback,                    // propertiesCallback
-        &setSalDataCallback,                          // toS3Callback
-        data->salXmlDocumentLen,                      // toS3CallbackTotalSize
-        0,                                            // fromS3Callback
-        &setSalCompleteCallback,                      // completeCallback
-        data                                          // callbackData
-    };
-
-    // Perform the request
-    request_perform(&params, requestContext);
-}
diff --git a/src/libs3/src/simplexml.c b/src/libs3/src/simplexml.c
deleted file mode 100644
index bd8616b..0000000
--- a/src/libs3/src/simplexml.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/** **************************************************************************
- * simplexml.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <libxml/parser.h>
-#include <string.h>
-#include "simplexml.h"
-
-// Use libxml2 for parsing XML.  XML is severely overused in modern
-// computing.  It is useful for only a very small subset of tasks, but
-// software developers who don't know better and are afraid to go against the
-// grain use it for everything, and in most cases, it is completely
-// inappropriate.  Usually, the document structure is severely under-specified
-// as well, as is the case with S3.  We do our best by just caring about the
-// most important aspects of the S3 "XML document" responses: the elements and
-// their values.  The SAX API (just about the lamest API ever devised and
-// proof that XML sucks - well, the real proof is how crappy all of the XML
-// parsing libraries are, including libxml2 - but I digress) is used here
-// because we don't need much from the parser and SAX is fast and low memory.
-//
-// Note that for simplicity we assume all ASCII here.  No attempts are made to
-// detect non-ASCII sequences in utf-8 and convert them into ASCII in any way.
-// S3 appears to only use ASCII anyway.
-
-
-static xmlEntityPtr saxGetEntity(void *user_data, const xmlChar *name)
-{
-    (void) user_data;
-
-    return xmlGetPredefinedEntity(name);
-}
-
-
-static void saxStartElement(void *user_data, const xmlChar *nameUtf8,
-                            const xmlChar **attr)
-{
-    (void) attr;
-
-    SimpleXml *simpleXml = (SimpleXml *) user_data;
-
-    if (simpleXml->status != S3StatusOK) {
-        return;
-    }
-    
-    // Assume that name has no non-ASCII in it
-    char *name = (char *) nameUtf8;
-
-    // Append the element to the element path
-    int len = strlen(name);
-
-    if ((simpleXml->elementPathLen + len + 1) >= 
-        (int) sizeof(simpleXml->elementPath)) {
-        // Cannot handle this element, stop!
-        simpleXml->status = S3StatusXmlParseFailure;
-        return;
-    }
-
-    if (simpleXml->elementPathLen) {
-        simpleXml->elementPath[simpleXml->elementPathLen++] = '/';
-    }
-    strcpy(&(simpleXml->elementPath[simpleXml->elementPathLen]), name);
-    simpleXml->elementPathLen += len;
-}
-
-
-static void saxEndElement(void *user_data, const xmlChar *name)
-{
-    (void) name;
-
-    SimpleXml *simpleXml = (SimpleXml *) user_data;
-
-    if (simpleXml->status != S3StatusOK) {
-        return;
-    }
-
-    // Call back with 0 data
-    simpleXml->status = (*(simpleXml->callback))
-        (simpleXml->elementPath, 0, 0, simpleXml->callbackData);
-
-    while ((simpleXml->elementPathLen > 0) &&
-           (simpleXml->elementPath[simpleXml->elementPathLen] != '/')) {
-        simpleXml->elementPathLen--;
-    }
-
-    simpleXml->elementPath[simpleXml->elementPathLen] = 0;
-}
-
-
-static void saxCharacters(void *user_data, const xmlChar *ch, int len)
-{
-    SimpleXml *simpleXml = (SimpleXml *) user_data;
-
-    if (simpleXml->status != S3StatusOK) {
-        return;
-    }
-
-    simpleXml->status = (*(simpleXml->callback))
-        (simpleXml->elementPath, (char *) ch, len, simpleXml->callbackData);
-}
-
-
-static void saxError(void *user_data, const char *msg, ...)
-{
-    (void) msg;
-
-    SimpleXml *simpleXml = (SimpleXml *) user_data;
-
-    if (simpleXml->status != S3StatusOK) {
-        return;
-    }
-
-    simpleXml->status = S3StatusXmlParseFailure;
-}
-
-
-static struct _xmlSAXHandler saxHandlerG =
-{
-    0, // internalSubsetSAXFunc
-    0, // isStandaloneSAXFunc
-    0, // hasInternalSubsetSAXFunc
-    0, // hasExternalSubsetSAXFunc
-    0, // resolveEntitySAXFunc
-    &saxGetEntity, // getEntitySAXFunc
-    0, // entityDeclSAXFunc
-    0, // notationDeclSAXFunc
-    0, // attributeDeclSAXFunc
-    0, // elementDeclSAXFunc
-    0, // unparsedEntityDeclSAXFunc
-    0, // setDocumentLocatorSAXFunc
-    0, // startDocumentSAXFunc
-    0, // endDocumentSAXFunc
-    &saxStartElement, // startElementSAXFunc
-    &saxEndElement, // endElementSAXFunc
-    0, // referenceSAXFunc
-    &saxCharacters, // charactersSAXFunc
-    0, // ignorableWhitespaceSAXFunc
-    0, // processingInstructionSAXFunc
-    0, // commentSAXFunc
-    0, // warningSAXFunc
-    &saxError, // errorSAXFunc
-    &saxError, // fatalErrorSAXFunc
-    0, // getParameterEntitySAXFunc
-    &saxCharacters, // cdataBlockSAXFunc
-    0, // externalSubsetSAXFunc
-    0, // initialized
-    0, // _private
-    0, // startElementNsSAX2Func
-    0, // endElementNsSAX2Func
-    0 // xmlStructuredErrorFunc serror;
-};
-
-void simplexml_initialize(SimpleXml *simpleXml, 
-                          SimpleXmlCallback *callback, void *callbackData)
-{
-    simpleXml->callback = callback;
-    simpleXml->callbackData = callbackData;
-    simpleXml->elementPathLen = 0;
-    simpleXml->status = S3StatusOK;
-    simpleXml->xmlParser = 0;
-}
-
-
-void simplexml_deinitialize(SimpleXml *simpleXml)
-{
-    if (simpleXml->xmlParser) {
-        xmlFreeParserCtxt(simpleXml->xmlParser);
-    }
-}
-
-
-S3Status simplexml_add(SimpleXml *simpleXml, const char *data, int dataLen)
-{
-    if (!simpleXml->xmlParser &&
-        (!(simpleXml->xmlParser = xmlCreatePushParserCtxt
-           (&saxHandlerG, simpleXml, 0, 0, 0)))) {
-        return S3StatusInternalError;
-    }
-
-    if (xmlParseChunk((xmlParserCtxtPtr) simpleXml->xmlParser, 
-                      data, dataLen, 0)) {
-        return S3StatusXmlParseFailure;
-    }
-
-    return simpleXml->status;
-}
diff --git a/src/libs3/src/testsimplexml.c b/src/libs3/src/testsimplexml.c
deleted file mode 100644
index 57fba7d..0000000
--- a/src/libs3/src/testsimplexml.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/** **************************************************************************
- * testsimplexml.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <time.h>
-#include "simplexml.h"
-
-static S3Status simpleXmlCallback(const char *elementPath, const char *data,
-                                  int dataLen, void *callbackData)
-{
-    (void) callbackData;
-
-    printf("[%s]: [%.*s]\n", elementPath, dataLen, data);
-
-    return S3StatusOK;
-}
-
-
-// The only argument allowed is a specification of the random seed to use
-int main(int argc, char **argv)
-{
-    if (argc > 1) {
-        char *arg = argv[1];
-        int seed = 0;
-        while (*arg) {
-            seed *= 10;
-            seed += (*arg++ - '0');
-        }
-        
-        srand(seed);
-    }
-    else {
-        srand(time(0));
-    }
-
-    SimpleXml simpleXml;
-
-    simplexml_initialize(&simpleXml, &simpleXmlCallback, 0);
-
-    // Read chunks of 10K from stdin, and then feed them in random chunks
-    // to simplexml_add
-    char inbuf[10000];
-
-    int amt_read;
-    while ((amt_read = fread(inbuf, 1, sizeof(inbuf), stdin)) > 0) {
-        char *buf = inbuf;
-        while (amt_read) {
-            int amt = (rand() % amt_read) + 1;
-            S3Status status = simplexml_add(&simpleXml, buf, amt);
-            if (status != S3StatusOK) {
-                fprintf(stderr, "ERROR: Parse failure: %d\n", status);
-                simplexml_deinitialize(&simpleXml);
-                return -1;
-            }
-            buf += amt, amt_read -= amt;
-        }
-    }
-
-    simplexml_deinitialize(&simpleXml);
-
-    return 0;
-}
diff --git a/src/libs3/src/util.c b/src/libs3/src/util.c
deleted file mode 100644
index 0737084..0000000
--- a/src/libs3/src/util.c
+++ /dev/null
@@ -1,560 +0,0 @@
-/** **************************************************************************
- * util.c
- * 
- * Copyright 2008 Bryan Ischo <bryan at ischo.com>
- * 
- * This file is part of libs3.
- * 
- * libs3 is free software: you can redistribute it and/or modify it under the
- * terms of the GNU General Public License as published by the Free Software
- * Foundation, version 3 of the License.
- *
- * In addition, as a special exception, the copyright holders give
- * permission to link the code of this library and its programs with the
- * OpenSSL library, and distribute linked combinations including the two.
- *
- * libs3 is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License version 3
- * along with libs3, in a file named COPYING.  If not, see
- * <http://www.gnu.org/licenses/>.
- *
- ************************************************************************** **/
-
-#include <ctype.h>
-#include <string.h>
-#include "util.h"
-
-
-// Convenience utility for making the code look nicer.  Tests a string
-// against a format; only the characters specified in the format are
-// checked (i.e. if the string is longer than the format, the string still
-// checks out ok).  Format characters are:
-// d - is a digit
-// anything else - is that character
-// Returns nonzero the string checks out, zero if it does not.
-static int checkString(const char *str, const char *format)
-{
-    while (*format) {
-        if (*format == 'd') {
-            if (!isdigit(*str)) {
-                return 0;
-            }
-        }
-        else if (*str != *format) {
-            return 0;
-        }
-        str++, format++;
-    }
-
-    return 1;
-}
-
-
-int urlEncode(char *dest, const char *src, int maxSrcSize)
-{
-    static const char *hex = "0123456789ABCDEF";
-
-    int len = 0;
-
-    if (src) while (*src) {
-        if (++len > maxSrcSize) {
-            *dest = 0;
-            return 0;
-        }
-        unsigned char c = *src;
-        if (isalnum(c) ||
-            (c == '-') || (c == '_') || (c == '.') || (c == '!') || 
-            (c == '~') || (c == '*') || (c == '\'') || (c == '(') ||
-            (c == ')') || (c == '/')) {
-            *dest++ = c;
-        }
-        else if (*src == ' ') {
-            *dest++ = '+';
-        }
-        else {
-            *dest++ = '%';
-            *dest++ = hex[c >> 4];
-            *dest++ = hex[c & 15];
-        }
-        src++;
-    }
-
-    *dest = 0;
-
-    return 1;
-}
-
-
-int64_t parseIso8601Time(const char *str)
-{
-    // Check to make sure that it has a valid format
-    if (!checkString(str, "dddd-dd-ddTdd:dd:dd")) {
-        return -1;
-    }
-
-#define nextnum() (((*str - '0') * 10) + (*(str + 1) - '0'))
-
-    // Convert it
-    struct tm stm;
-    memset(&stm, 0, sizeof(stm));
-
-    stm.tm_year = (nextnum() - 19) * 100;
-    str += 2;
-    stm.tm_year += nextnum();
-    str += 3;
-
-    stm.tm_mon = nextnum() - 1;
-    str += 3;
-
-    stm.tm_mday = nextnum();
-    str += 3;
-
-    stm.tm_hour = nextnum();
-    str += 3;
-
-    stm.tm_min = nextnum();
-    str += 3;
-
-    stm.tm_sec = nextnum();
-    str += 2;
-
-    stm.tm_isdst = -1;
-    
-    int64_t ret = mktime(&stm);
-
-    // Skip the millis
-
-    if (*str == '.') {
-        str++;
-        while (isdigit(*str)) {
-            str++;
-        }
-    }
-    
-    if (checkString(str, "-dd:dd") || checkString(str, "+dd:dd")) {
-        int sign = (*str++ == '-') ? -1 : 1;
-        int hours = nextnum();
-        str += 3;
-        int minutes = nextnum();
-        ret += (-sign * (((hours * 60) + minutes) * 60));
-    }
-    // Else it should be Z to be a conformant time string, but we just assume
-    // that it is rather than enforcing that
-
-    return ret;
-}
-
-
-uint64_t parseUnsignedInt(const char *str)
-{
-    // Skip whitespace
-    while (is_blank(*str)) {
-        str++;
-    }
-
-    uint64_t ret = 0;
-
-    while (isdigit(*str)) {
-        ret *= 10;
-        ret += (*str++ - '0');
-    }
-
-    return ret;
-}
-
-
-int base64Encode(const unsigned char *in, int inLen, char *out)
-{
-    static const char *ENC = 
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-    char *original_out = out;
-
-    while (inLen) {
-        // first 6 bits of char 1
-        *out++ = ENC[*in >> 2];
-        if (!--inLen) {
-            // last 2 bits of char 1, 4 bits of 0
-            *out++ = ENC[(*in & 0x3) << 4];
-            *out++ = '=';
-            *out++ = '=';
-            break;
-        }
-        // last 2 bits of char 1, first 4 bits of char 2
-        *out++ = ENC[((*in & 0x3) << 4) | (*(in + 1) >> 4)];
-        in++;
-        if (!--inLen) {
-            // last 4 bits of char 2, 2 bits of 0
-            *out++ = ENC[(*in & 0xF) << 2];
-            *out++ = '=';
-            break;
-        }
-        // last 4 bits of char 2, first 2 bits of char 3
-        *out++ = ENC[((*in & 0xF) << 2) | (*(in + 1) >> 6)];
-        in++;
-        // last 6 bits of char 3
-        *out++ = ENC[*in & 0x3F];
-        in++, inLen--;
-    }
-
-    return (out - original_out);
-}
-
-
-#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
-
-#define blk0L(i) (block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00)     \
-                  | (rol(block->l[i], 8) & 0x00FF00FF))
-
-#define blk0B(i) (block->l[i])
-
-#define blk(i) (block->l[i & 15] = rol(block->l[(i + 13) & 15] ^        \
-                                       block->l[(i + 8) & 15] ^         \
-                                       block->l[(i + 2) & 15] ^         \
-                                       block->l[i & 15], 1))
-
-#define R0_L(v, w, x, y, z, i)                                          \
-    z += ((w & (x ^ y)) ^ y) + blk0L(i) + 0x5A827999 + rol(v, 5);       \
-    w = rol(w, 30);
-#define R0_B(v, w, x, y, z, i)                                          \
-    z += ((w & (x ^ y)) ^ y) + blk0B(i) + 0x5A827999 + rol(v, 5);       \
-    w = rol(w, 30);
-#define R1(v, w, x, y, z, i)                                            \
-    z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5);         \
-    w = rol(w, 30);
-#define R2(v, w, x, y, z, i)                                            \
-    z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5);                 \
-    w = rol(w, 30);
-#define R3(v, w, x, y, z, i)                                            \
-    z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5);   \
-    w = rol(w, 30);
-#define R4(v, w, x, y, z, i)                                            \
-    z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5);                 \
-    w = rol(w, 30);
-
-#define R0A_L(i) R0_L(a, b, c, d, e, i)
-#define R0B_L(i) R0_L(b, c, d, e, a, i)
-#define R0C_L(i) R0_L(c, d, e, a, b, i)
-#define R0D_L(i) R0_L(d, e, a, b, c, i)
-#define R0E_L(i) R0_L(e, a, b, c, d, i)
-
-#define R0A_B(i) R0_B(a, b, c, d, e, i)
-#define R0B_B(i) R0_B(b, c, d, e, a, i)
-#define R0C_B(i) R0_B(c, d, e, a, b, i)
-#define R0D_B(i) R0_B(d, e, a, b, c, i)
-#define R0E_B(i) R0_B(e, a, b, c, d, i)
-
-#define R1A(i) R1(a, b, c, d, e, i)
-#define R1B(i) R1(b, c, d, e, a, i)
-#define R1C(i) R1(c, d, e, a, b, i)
-#define R1D(i) R1(d, e, a, b, c, i)
-#define R1E(i) R1(e, a, b, c, d, i)
-
-#define R2A(i) R2(a, b, c, d, e, i)
-#define R2B(i) R2(b, c, d, e, a, i)
-#define R2C(i) R2(c, d, e, a, b, i)
-#define R2D(i) R2(d, e, a, b, c, i)
-#define R2E(i) R2(e, a, b, c, d, i)
-
-#define R3A(i) R3(a, b, c, d, e, i)
-#define R3B(i) R3(b, c, d, e, a, i)
-#define R3C(i) R3(c, d, e, a, b, i)
-#define R3D(i) R3(d, e, a, b, c, i)
-#define R3E(i) R3(e, a, b, c, d, i)
-
-#define R4A(i) R4(a, b, c, d, e, i)
-#define R4B(i) R4(b, c, d, e, a, i)
-#define R4C(i) R4(c, d, e, a, b, i)
-#define R4D(i) R4(d, e, a, b, c, i)
-#define R4E(i) R4(e, a, b, c, d, i)
-
-
-static void SHA1_transform(uint32_t state[5], const unsigned char buffer[64])
-{
-    uint32_t a, b, c, d, e;
-
-    typedef union {
-        unsigned char c[64];
-        uint32_t l[16];
-    } u;
-
-    unsigned char w[64];
-    u *block = (u *) w;
-
-    memcpy(block, buffer, 64);
-
-    a = state[0];
-    b = state[1];
-    c = state[2];
-    d = state[3];
-    e = state[4];
-
-    static uint32_t endianness_indicator = 0x1;
-    if (((unsigned char *) &endianness_indicator)[0]) {
-        R0A_L( 0);
-        R0E_L( 1); R0D_L( 2); R0C_L( 3); R0B_L( 4); R0A_L( 5);
-        R0E_L( 6); R0D_L( 7); R0C_L( 8); R0B_L( 9); R0A_L(10);
-        R0E_L(11); R0D_L(12); R0C_L(13); R0B_L(14); R0A_L(15);
-    }
-    else {
-        R0A_B( 0);
-        R0E_B( 1); R0D_B( 2); R0C_B( 3); R0B_B( 4); R0A_B( 5);
-        R0E_B( 6); R0D_B( 7); R0C_B( 8); R0B_B( 9); R0A_B(10);
-        R0E_B(11); R0D_B(12); R0C_B(13); R0B_B(14); R0A_B(15);
-    }
-    R1E(16); R1D(17); R1C(18); R1B(19); R2A(20);
-    R2E(21); R2D(22); R2C(23); R2B(24); R2A(25);
-    R2E(26); R2D(27); R2C(28); R2B(29); R2A(30);
-    R2E(31); R2D(32); R2C(33); R2B(34); R2A(35);
-    R2E(36); R2D(37); R2C(38); R2B(39); R3A(40);
-    R3E(41); R3D(42); R3C(43); R3B(44); R3A(45);
-    R3E(46); R3D(47); R3C(48); R3B(49); R3A(50);
-    R3E(51); R3D(52); R3C(53); R3B(54); R3A(55);
-    R3E(56); R3D(57); R3C(58); R3B(59); R4A(60);
-    R4E(61); R4D(62); R4C(63); R4B(64); R4A(65);
-    R4E(66); R4D(67); R4C(68); R4B(69); R4A(70);
-    R4E(71); R4D(72); R4C(73); R4B(74); R4A(75);
-    R4E(76); R4D(77); R4C(78); R4B(79);
-
-    state[0] += a;
-    state[1] += b;
-    state[2] += c;
-    state[3] += d;
-    state[4] += e;
-}
-
-
-typedef struct
-{
-    uint32_t state[5];
-    uint32_t count[2];
-    unsigned char buffer[64];
-} SHA1Context;
-
-
-static void SHA1_init(SHA1Context *context)
-{
-    context->state[0] = 0x67452301;
-    context->state[1] = 0xEFCDAB89;
-    context->state[2] = 0x98BADCFE;
-    context->state[3] = 0x10325476;
-    context->state[4] = 0xC3D2E1F0;
-    context->count[0] = context->count[1] = 0;
-}
-
-
-static void SHA1_update(SHA1Context *context, const unsigned char *data,
-                        unsigned int len)
-{
-    uint32_t i, j;
-
-    j = (context->count[0] >> 3) & 63;
-
-    if ((context->count[0] += len << 3) < (len << 3)) {
-        context->count[1]++;
-    }
-
-    context->count[1] += (len >> 29);
-
-    if ((j + len) > 63) {
-        memcpy(&(context->buffer[j]), data, (i = 64 - j));
-        SHA1_transform(context->state, context->buffer);
-        for ( ; (i + 63) < len; i += 64) {
-            SHA1_transform(context->state, &(data[i]));
-        }
-        j = 0;
-    }
-    else {
-        i = 0;
-    }
-
-    memcpy(&(context->buffer[j]), &(data[i]), len - i);
-}
-
-
-static void SHA1_final(unsigned char digest[20], SHA1Context *context)
-{
-    uint32_t i;
-    unsigned char finalcount[8];
-
-    for (i = 0; i < 8; i++) {
-        finalcount[i] = (unsigned char)
-            ((context->count[(i >= 4 ? 0 : 1)] >>
-              ((3 - (i & 3)) * 8)) & 255);
-    }
-
-    SHA1_update(context, (unsigned char *) "\200", 1);
-
-    while ((context->count[0] & 504) != 448) {
-        SHA1_update(context, (unsigned char *) "\0", 1);
-    }
-
-    SHA1_update(context, finalcount, 8);
-
-    for (i = 0; i < 20; i++) {
-        digest[i] = (unsigned char)
-            ((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
-    }
-
-    memset(context->buffer, 0, 64);
-    memset(context->state, 0, 20);
-    memset(context->count, 0, 8);
-    memset(&finalcount, 0, 8);
-
-    SHA1_transform(context->state, context->buffer);
-}
-
-
-// HMAC-SHA-1:
-//
-// K - is key padded with zeros to 512 bits
-// m - is message
-// OPAD - 0x5c5c5c...
-// IPAD - 0x363636...
-//
-// HMAC(K,m) = SHA1((K ^ OPAD) . SHA1((K ^ IPAD) . m))
-void HMAC_SHA1(unsigned char hmac[20], const unsigned char *key, int key_len,
-               const unsigned char *message, int message_len)
-{
-    unsigned char kopad[64], kipad[64];
-    int i;
-    
-    if (key_len > 64) {
-        key_len = 64;
-    }
-
-    for (i = 0; i < key_len; i++) {
-        kopad[i] = key[i] ^ 0x5c;
-        kipad[i] = key[i] ^ 0x36;
-    }
-
-    for ( ; i < 64; i++) {
-        kopad[i] = 0 ^ 0x5c;
-        kipad[i] = 0 ^ 0x36;
-    }
-
-    unsigned char digest[20];
-
-    SHA1Context context;
-    
-    SHA1_init(&context);
-    SHA1_update(&context, kipad, 64);
-    SHA1_update(&context, message, message_len);
-    SHA1_final(digest, &context);
-
-    SHA1_init(&context);
-    SHA1_update(&context, kopad, 64);
-    SHA1_update(&context, digest, 20);
-    SHA1_final(hmac, &context);
-}
-
-#define rot(x,k) (((x) << (k)) | ((x) >> (32 - (k))))
-
-uint64_t hash(const unsigned char *k, int length)
-{
-    uint32_t a, b, c;
-
-    a = b = c = 0xdeadbeef + ((uint32_t) length);
-
-    static uint32_t endianness_indicator = 0x1;
-    if (((unsigned char *) &endianness_indicator)[0]) {
-        while (length > 12) {
-            a += k[0];
-            a += ((uint32_t) k[1]) << 8;
-            a += ((uint32_t) k[2]) << 16;
-            a += ((uint32_t) k[3]) << 24;
-            b += k[4];
-            b += ((uint32_t) k[5]) << 8;
-            b += ((uint32_t) k[6]) << 16;
-            b += ((uint32_t) k[7]) << 24;
-            c += k[8];
-            c += ((uint32_t) k[9]) << 8;
-            c += ((uint32_t) k[10]) << 16;
-            c += ((uint32_t) k[11]) << 24;
-            a -= c; a ^= rot(c, 4);  c += b;
-            b -= a; b ^= rot(a, 6);  a += c;
-            c -= b; c ^= rot(b, 8);  b += a;
-            a -= c; a ^= rot(c, 16);  c += b;
-            b -= a; b ^= rot(a, 19);  a += c;
-            c -= b; c ^= rot(b, 4);  b += a;
-            length -= 12;
-            k += 12;
-        }
-        
-        switch(length) {
-        case 12: c += ((uint32_t) k[11]) << 24;
-        case 11: c += ((uint32_t) k[10]) << 16;
-        case 10: c += ((uint32_t) k[9]) << 8;
-        case 9 : c += k[8];
-        case 8 : b += ((uint32_t) k[7]) << 24;
-        case 7 : b += ((uint32_t) k[6]) << 16;
-        case 6 : b += ((uint32_t) k[5]) << 8;
-        case 5 : b += k[4];
-        case 4 : a += ((uint32_t) k[3]) << 24;
-        case 3 : a += ((uint32_t) k[2]) << 16;
-        case 2 : a += ((uint32_t) k[1]) << 8;
-        case 1 : a += k[0]; break;
-        case 0 : goto end;
-        }
-    }
-    else {
-        while (length > 12) {
-            a += ((uint32_t) k[0]) << 24;
-            a += ((uint32_t) k[1]) << 16;
-            a += ((uint32_t) k[2]) << 8;
-            a += ((uint32_t) k[3]);
-            b += ((uint32_t) k[4]) << 24;
-            b += ((uint32_t) k[5]) << 16;
-            b += ((uint32_t) k[6]) << 8;
-            b += ((uint32_t) k[7]);
-            c += ((uint32_t) k[8]) << 24;
-            c += ((uint32_t) k[9]) << 16;
-            c += ((uint32_t) k[10]) << 8;
-            c += ((uint32_t) k[11]);
-            a -= c; a ^= rot(c, 4);  c += b;
-            b -= a; b ^= rot(a, 6);  a += c;
-            c -= b; c ^= rot(b, 8);  b += a;
-            a -= c; a ^= rot(c, 16);  c += b;
-            b -= a; b ^= rot(a, 19);  a += c;
-            c -= b; c ^= rot(b, 4);  b += a;
-            length -= 12;
-            k += 12;
-        }
-
-        switch(length) {
-        case 12: c += k[11];
-        case 11: c += ((uint32_t) k[10]) << 8;
-        case 10: c += ((uint32_t) k[9]) << 16;
-        case 9 : c += ((uint32_t) k[8]) << 24;
-        case 8 : b += k[7];
-        case 7 : b += ((uint32_t) k[6]) << 8;
-        case 6 : b += ((uint32_t) k[5]) << 16;
-        case 5 : b += ((uint32_t) k[4]) << 24;
-        case 4 : a += k[3];
-        case 3 : a += ((uint32_t) k[2]) << 8;
-        case 2 : a += ((uint32_t) k[1]) << 16;
-        case 1 : a += ((uint32_t) k[0]) << 24; break;
-        case 0 : goto end;
-        }
-    }
-    
-    c ^= b; c -= rot(b, 14);
-    a ^= c; a -= rot(c, 11);
-    b ^= a; b -= rot(a, 25);
-    c ^= b; c -= rot(b, 16);
-    a ^= c; a -= rot(c, 4);
-    b ^= a; b -= rot(a, 14);
-    c ^= b; c -= rot(b, 24);
-
- end:
-    return ((((uint64_t) c) << 32) | b);
-}
-
-int is_blank(char c)
-{
-    return ((c == ' ') || (c == '\t'));
-}
diff --git a/src/libs3/test/badxml_01.xml b/src/libs3/test/badxml_01.xml
deleted file mode 100644
index e9917023..0000000
--- a/src/libs3/test/badxml_01.xml
+++ /dev/null
@@ -1,105 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- each elementxx is 9 characters long, + slash gives 10 characters -->
-<element00>
-<element01>
-<element02>
-<element03>
-<element04>
-<element05>
-<element06>
-<element07>
-<element08>
-<element09>
-<element10>
-<element11>
-<element12>
-<element13>
-<element14>
-<element15>
-<element16>
-<element17>
-<element18>
-<element19>
-<element20>
-<element21>
-<element22>
-<element23>
-<element24>
-<element25>
-<element26>
-<element27>
-<element28>
-<element29>
-<element30>
-<element31>
-<element32>
-<element33>
-<element34>
-<element35>
-<element36>
-<element37>
-<element38>
-<element39>
-<element40>
-<element41>
-<element42>
-<element43>
-<element44>
-<element45>
-<element46>
-<element47>
-<element48>
-<element49>
-<element50xxx>
-Data
-</element50xxx>
-</element49>
-</element48>
-</element47>
-</element46>
-</element45>
-</element44>
-</element43>
-</element42>
-</element41>
-</element40>
-</element39>
-</element38>
-</element37>
-</element36>
-</element35>
-</element34>
-</element33>
-</element32>
-</element31>
-</element30>
-</element29>
-</element28>
-</element27>
-</element26>
-</element25>
-</element24>
-</element23>
-</element22>
-</element21>
-</element20>
-</element19>
-</element18>
-</element17>
-</element16>
-</element15>
-</element14>
-</element13>
-</element12>
-</element11>
-</element10>
-</element09>
-</element08>
-</element07>
-</element06>
-</element05>
-</element04>
-</element03>
-</element02>
-</element01>
-</element00>
diff --git a/src/libs3/test/goodxml_01.xml b/src/libs3/test/goodxml_01.xml
deleted file mode 100644
index 687214e..0000000
--- a/src/libs3/test/goodxml_01.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<Error>
-  <Code>NoSuchKey</Code>
-  <Message> The resource <![CDATA[<now> & then]]> you requested does not exist & so there  </Message>
-  <Resource>/mybucket/myfoto.jpg</Resource> 
-  <RequestId>4442587FB7D0A2F9</RequestId>
-</Error>
diff --git a/src/libs3/test/goodxml_02.xml b/src/libs3/test/goodxml_02.xml
deleted file mode 100644
index d713bc6..0000000
--- a/src/libs3/test/goodxml_02.xml
+++ /dev/null
@@ -1,105 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- each elementxx is 9 characters long, + slash gives 10 characters -->
-<element00>
-<element01>
-<element02>
-<element03>
-<element04>
-<element05>
-<element06>
-<element07>
-<element08>
-<element09>
-<element10>
-<element11>
-<element12>
-<element13>
-<element14>
-<element15>
-<element16>
-<element17>
-<element18>
-<element19>
-<element20>
-<element21>
-<element22>
-<element23>
-<element24>
-<element25>
-<element26>
-<element27>
-<element28>
-<element29>
-<element30>
-<element31>
-<element32>
-<element33>
-<element34>
-<element35>
-<element36>
-<element37>
-<element38>
-<element39>
-<element40>
-<element41>
-<element42>
-<element43>
-<element44>
-<element45>
-<element46>
-<element47>
-<element48>
-<element49>
-<element50xx>
-Data
-</element50xx>
-</element49>
-</element48>
-</element47>
-</element46>
-</element45>
-</element44>
-</element43>
-</element42>
-</element41>
-</element40>
-</element39>
-</element38>
-</element37>
-</element36>
-</element35>
-</element34>
-</element33>
-</element32>
-</element31>
-</element30>
-</element29>
-</element28>
-</element27>
-</element26>
-</element25>
-</element24>
-</element23>
-</element22>
-</element21>
-</element20>
-</element19>
-</element18>
-</element17>
-</element16>
-</element15>
-</element14>
-</element13>
-</element12>
-</element11>
-</element10>
-</element09>
-</element08>
-</element07>
-</element06>
-</element05>
-</element04>
-</element03>
-</element02>
-</element01>
-</element00>
diff --git a/src/libs3/test/goodxml_03.xml b/src/libs3/test/goodxml_03.xml
deleted file mode 100644
index 9a85166..0000000
--- a/src/libs3/test/goodxml_03.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<longdata>
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 [...]
-</longdata>
-
diff --git a/src/libs3/test/test.sh b/src/libs3/test/test.sh
deleted file mode 100755
index 3acfc86..0000000
--- a/src/libs3/test/test.sh
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/bin/sh
-
-# Environment:
-# S3_ACCESS_KEY_ID - must be set to S3 Access Key ID
-# S3_SECRET_ACCESS_KEY - must be set to S3 Secret Access Key
-# TEST_BUCKET_PREFIX - must be set to the test bucket prefix to use
-# S3_COMMAND - may be set to s3 command to use (i.e. valgrind s3); defaults
-#              to "s3"
-
-if [ -z "$S3_ACCESS_KEY_ID" ]; then
-    echo "S3_ACCESS_KEY_ID required"
-    exit -1;
-fi
-
-if [ -z "$S3_SECRET_ACCESS_KEY" ]; then
-    echo "S3_SECRET_ACCESS_KEY required"
-    exit -1;
-fi
-
-if [ -z "$TEST_BUCKET_PREFIX" ]; then
-    echo "TEST_BUCKET_PREFIX required"
-    exit -1;
-fi
-
-if [ -z "$S3_COMMAND" ]; then
-    S3_COMMAND=s3
-fi
-
-TEST_BUCKET=${TEST_BUCKET_PREFIX}.testbucket
-
-# Create the test bucket in EU
-echo "$S3_COMMAND create $TEST_BUCKET locationConstraint=EU"
-$S3_COMMAND create $TEST_BUCKET
-
-# List to find it
-echo "$S3_COMMAND list | grep $TEST_BUCKET"
-$S3_COMMAND list | grep $TEST_BUCKET
-
-# Test it
-echo "$S3_COMMAND test $TEST_BUCKET"
-$S3_COMMAND test $TEST_BUCKET
-
-# List to ensure that it is empty
-echo "$S3_COMMAND list $TEST_BUCKET"
-$S3_COMMAND list $TEST_BUCKET
-
-# Put some data
-rm -f seqdata
-seq 1 10000 > seqdata
-echo "$S3_COMMAND put $TEST_BUCKET/testkey filename=seqdata noStatus=1"
-$S3_COMMAND put $TEST_BUCKET/testkey filename=seqdata noStatus=1
-
-rm -f testkey
-# Get the data and make sure that it matches
-echo "$S3_COMMAND get $TEST_BUCKET/testkey filename=testkey"
-$S3_COMMAND get $TEST_BUCKET/testkey filename=testkey
-diff seqdata testkey
-rm -f seqdata testkey
-
-# Delete the file
-echo "$S3_COMMAND delete $TEST_BUCKET/testkey"
-$S3_COMMAND delete $TEST_BUCKET/testkey
-
-# Remove the test bucket
-echo "$S3_COMMAND delete $TEST_BUCKET"
-$S3_COMMAND delete $TEST_BUCKET
-
-# Make sure it's not there
-echo "$S3_COMMAND list | grep $TEST_BUCKET"
-$S3_COMMAND list | grep $TEST_BUCKET
-
-# Now create it again
-echo "$S3_COMMAND create $TEST_BUCKET"
-$S3_COMMAND create $TEST_BUCKET
-
-# Put 10 files in it
-for i in `seq 0 9`; do
-    echo "echo \"Hello\" | $S3_COMMAND put $TEST_BUCKET/key_$i"
-    echo "Hello" | $S3_COMMAND put $TEST_BUCKET/key_$i
-done
-
-# List with all details
-echo "$S3_COMMAND list $TEST_BUCKET allDetails=1"
-$S3_COMMAND list $TEST_BUCKET allDetails=1
-
-COPY_BUCKET=${TEST_BUCKET_PREFIX}.copybucket
-
-# Create another test bucket and copy a file into it
-echo "$S3_COMMAND create $COPY_BUCKET"
-$S3_COMMAND create $COPY_BUCKET
-echo <<EOF
-$S3_COMMAND copy $TEST_BUCKET/key_5 $COPY_BUCKET/copykey
-EOF
-$S3_COMMAND copy $TEST_BUCKET/key_5 $COPY_BUCKET/copykey
-
-# List the copy bucket
-echo "$S3_COMMAND list $COPY_BUCKET allDetails=1"
-$S3_COMMAND list $COPY_BUCKET allDetails=1
-
-# Compare the files
-rm -f key_5 copykey
-echo "$S3_COMMAND get $TEST_BUCKET/key_5 filename=key_5"
-$S3_COMMAND get $TEST_BUCKET/key_5 filename=key_5
-echo "$S3_COMMAND get $COPY_BUCKET/copykey filename=copykey"
-$S3_COMMAND get $COPY_BUCKET/copykey filename=copykey
-diff key_5 copykey
-rm -f key_5 copykey
-
-# Delete the files
-for i in `seq 0 9`; do
-    echo "$S3_COMMAND delete $TEST_BUCKET/key_$i"
-    $S3_COMMAND delete $TEST_BUCKET/key_$i
-done
-echo "$S3_COMMAND delete $COPY_BUCKET/copykey"
-$S3_COMMAND delete $COPY_BUCKET/copykey
-
-# Delete the copy bucket
-echo "$S3_COMMAND delete $COPY_BUCKET"
-$S3_COMMAND delete $COPY_BUCKET
-
-# Now create a new zero-length file
-echo "$S3_COMMAND put $TEST_BUCKET/aclkey < /dev/null"
-$S3_COMMAND put $TEST_BUCKET/aclkey < /dev/null
-
-# Get the bucket acl
-rm -f acl
-echo "$S3_COMMAND getacl $TEST_BUCKET filename=acl allDetails=1"
-$S3_COMMAND getacl $TEST_BUCKET filename=acl allDetails=1
-
-# Add READ for all AWS users, and READ_ACP for everyone
-echo <<EOF >> acl
-Group   Authenticated AWS Users                                   READ
-EOF
-echo <<EOF >> acl
-Group   All Users                                                 READ_ACP
-EOF
-echo "$S3_COMMAND setacl $TEST_BUCKET filename=acl"
-$S3_COMMAND setacl $TEST_BUCKET filename=acl
-
-# Test to make sure that it worked
-rm -f acl_new
-echo "$S3_COMMAND getacl $TEST_BUCKET filename=acl_new allDetails=1"
-$S3_COMMAND getacl $TEST_BUCKET filename=acl_new allDetails=1
-diff acl acl_new
-rm -f acl acl_new
-
-# Get the key acl
-rm -f acl
-echo "$S3_COMMAND getacl $TEST_BUCKET/aclkey filename=acl allDetails=1"
-$S3_COMMAND getacl $TEST_BUCKET/aclkey filename=acl allDetails=1
-
-# Add READ for all AWS users, and READ_ACP for everyone
-echo <<EOF >> acl
-Group   Authenticated AWS Users                                   READ
-EOF
-echo <<EOF >> acl
-Group   All Users                                                 READ_ACP
-EOF
-echo "$S3_COMMAND setacl $TEST_BUCKET/aclkey filename=acl"
-$S3_COMMAND setacl $TEST_BUCKET/aclkey filename=acl
-
-# Test to make sure that it worked
-rm -f acl_new
-echo "$S3_COMMAND getacl $TEST_BUCKET/aclkey filename=acl_new allDetails=1"
-$S3_COMMAND getacl $TEST_BUCKET/aclkey filename=acl_new allDetails=1
-diff acl acl_new
-rm -f acl acl_new
-
-# Remove the test file
-echo "$S3_COMMAND delete $TEST_BUCKET/aclkey"
-$S3_COMMAND delete $TEST_BUCKET/aclkey
-echo "$S3_COMMAND delete $TEST_BUCKET"
-$S3_COMMAND delete $TEST_BUCKET
diff --git a/src/logrotate.conf b/src/logrotate.conf
index 1833d55..08ad4b4 100644
--- a/src/logrotate.conf
+++ b/src/logrotate.conf
@@ -4,26 +4,9 @@
     compress
     sharedscripts
     postrotate
-        if which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then
-            invoke-rc.d ceph reload >/dev/null
-        elif which service > /dev/null 2>&1 && [ -x `which service` ]; then
-            service ceph reload >/dev/null
-        fi
-        # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op
-        if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then
-            for daemon in osd mon mds ; do
-              find -L /var/lib/ceph/$daemon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -printf '%P\n' \
-                | while read f; do
-                    if [ -e "/var/lib/ceph/$daemon/$f/done" -o -e "/var/lib/ceph/$daemon/$f/ready" ] && [ -e "/var/lib/ceph/$daemon/$f/upstart" ] && [ ! -e "/var/lib/ceph/$daemon/$f/sysvinit" ]; then
-                      cluster="${f%%-*}"
-                      id="${f#*-}"
-
-                      initctl reload ceph-$daemon cluster="$cluster" id="$id" 2>/dev/null || :
-                    fi
-                  done
-            done
-        fi
+        killall -q -1 ceph-mon ceph-mds ceph-osd radosgw || true
     endscript
     missingok
     notifempty
+    su ceph ceph
 }
diff --git a/src/make_version b/src/make_version
index 3ed2bac..4834579 100755
--- a/src/make_version
+++ b/src/make_version
@@ -1,23 +1,112 @@
 #!/bin/sh
 
-echo '$1: '$1
+GIT_VERSION_FILE=
+CEPH_VER_HEADER=
+NO_VERSION=0
 
-if [ "$1" =  "-n" ] ; then
-   cur="no_version"
-   v="Development"
-else
-   cur=`head -1 $1`
-   v=`tail -1 $1 | cut -c 2-`
-fi
+is_git() {
+    type git > /dev/null 2>&1
+    if [ $? -ne 0 ]; then
+        echo "Could not find git command. Assuming this is not a git repository, not updating .git_version"
+        return 1
+    fi
+    git status > /dev/zero 2>&1;
+    if [ $? -ne 0 ]; then
+        echo "This is no git repository, not updating .git_version"
+        return 1
+    else
+        return 0
+    fi
+}
+
+check_gitversion() {
+    if is_git; then
+        current=`git rev-parse HEAD 2> /dev/null; git describe 2> /dev/null`
+        if [ -f $GIT_VERSION_FILE ] ; then
+            old=`cat $GIT_VERSION_FILE`
+
+            if [ "$current" != "$old" ]; then
+                echo "$current" > $GIT_VERSION_FILE
+            fi
+        else
+            echo "$current" > $GIT_VERSION_FILE
+        fi
+    fi
+}
+
+print_ceph_ver() {
+    # print the content of the ceph_ver.h file
+    if [ $NO_VERSION -eq 1 ]; then
+        ver="no_version"
+        ver_nice="Development"
+    else
+        ver=`head -1 $GIT_VERSION_FILE`
+        ver_nice=`tail -1 $GIT_VERSION_FILE | cut -c 2-`
+    fi
 
-print_all() {
-	echo "#ifndef CEPH_VERSION_H"
-	echo "#define CEPH_VERSION_H"
-	echo
-	echo "#define CEPH_GIT_VER $cur"
-	echo "#define CEPH_GIT_NICE_VER \"$v\""
-	echo
-	echo "#endif"
+    echo "#ifndef CEPH_VERSION_H"
+    echo "#define CEPH_VERSION_H"
+    echo
+    echo "#define CEPH_GIT_VER $ver"
+    echo "#define CEPH_GIT_NICE_VER \"$ver_nice\""
+    echo
+    echo "#endif"
 }
 
-print_all > $2
+set_ceph_ver() {
+    # compare new and old CEPH_VER_HEADER
+    if [ -f $CEPH_VER_HEADER ]; then
+	tmpfile=$(mktemp -t "ceph_ver_h.XXXXXXXXXXXXX")
+        print_ceph_ver > $tmpfile
+        cur_ver=`cat $CEPH_VER_HEADER`
+        new_ver=`cat $tmpfile`
+        if [ "$cur_ver" != "$new_ver" ]; then
+            mv $tmpfile $CEPH_VER_HEADER
+	else
+	    rm $tmpfile
+        fi
+    else
+        print_ceph_ver > $CEPH_VER_HEADER
+    fi
+}
+
+usage() {
+    printf "usage: $0 -g FILEPATH [options]\n"
+    printf "\t-g|--git-version-file\tFILEPATH for git version file (e.g. ./src/.git_version)\n"
+    printf "\t-c|--ceph-ver-header\tFILEPATH for ceph version header (e.g. ./src/ceph_ver.h)\n"
+    printf "\t-n|--no-version\t\tdon't generate version from git\n"
+    printf "\t-h|--help\t\tprint this usage instructions\n"
+}
+
+until [ -z "$1" ]; do
+case $1 in
+    -n|--no-version)
+        NO_VERSION=1;
+        ;;
+    -g|--git-version-file)
+        GIT_VERSION_FILE=$2
+        shift;
+        ;;
+    -c|--ceph-ver-header)
+        CEPH_VER_HEADER=$2
+        shift;
+        ;;
+    -h|--help)
+        usage;
+        ;;
+    *)
+        ;;
+esac
+shift
+done;
+
+if [ -n "$GIT_VERSION_FILE" ] ; then
+    if [ -z "$CEPH_VER_HEADER" ] ; then
+        check_gitversion
+    else
+        check_gitversion
+        set_ceph_ver
+    fi
+else
+    usage
+fi
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 6a6a5c8..c28f9aa 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -16,11 +16,12 @@
 #include "common/dout.h"
 #include "common/HeartbeatMap.h"
 #include "include/stringify.h"
+#include "include/util.h"
 
 #include "messages/MMDSBeacon.h"
 #include "mon/MonClient.h"
 #include "mds/MDLog.h"
-#include "mds/MDS.h"
+#include "mds/MDSRank.h"
 #include "mds/MDSMap.h"
 #include "mds/Locker.h"
 
@@ -32,10 +33,10 @@
 
 
 Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
-  Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock), name(name_)
+  Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock),
+  name(name_), awaiting_seq(-1)
 {
   want_state = MDSMap::STATE_NULL;
-  last_send = 0;
   last_seq = 0;
   sender = NULL;
   was_laggy = false;
@@ -56,7 +57,6 @@ void Beacon::init(MDSMap const *mdsmap, MDSMap::DaemonState want_state_,
   Mutex::Locker l(lock);
   assert(mdsmap != NULL);
 
-  // Initialize copies of MDS state
   want_state = want_state_;
   _notify_mdsmap(mdsmap);
   standby_for_rank = standby_rank_;
@@ -106,25 +106,37 @@ void Beacon::handle_mds_beacon(MMDSBeacon *m)
 
   // update lab
   if (seq_stamp.count(seq)) {
-    assert(seq_stamp[seq] > last_acked_stamp);
-    last_acked_stamp = seq_stamp[seq];
     utime_t now = ceph_clock_now(g_ceph_context);
-    utime_t rtt = now - last_acked_stamp;
+    if (seq_stamp[seq] > last_acked_stamp) {
+      last_acked_stamp = seq_stamp[seq];
+      utime_t rtt = now - last_acked_stamp;
 
-    dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
-	     << " seq " << m->get_seq() 
-	     << " rtt " << rtt << dendl;
+      dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
+	       << " seq " << m->get_seq() << " rtt " << rtt << dendl;
 
-    if (was_laggy && rtt < g_conf->mds_beacon_grace) {
-      dout(0) << "handle_mds_beacon no longer laggy" << dendl;
-      was_laggy = false;
-      laggy_until = now;
+      if (was_laggy && rtt < g_conf->mds_beacon_grace) {
+	dout(0) << "handle_mds_beacon no longer laggy" << dendl;
+	was_laggy = false;
+	laggy_until = now;
+      }
+    } else {
+      // Mark myself laggy if system clock goes backwards. Hopping
+      // later beacons will clear it.
+      dout(1) << "handle_mds_beacon system clock goes backwards, "
+	      << "mark myself laggy" << dendl;
+      last_acked_stamp = now - utime_t(g_conf->mds_beacon_grace + 1, 0);
+      was_laggy = true;
     }
 
     // clean up seq_stamp map
     while (!seq_stamp.empty() &&
 	   seq_stamp.begin()->first <= seq)
       seq_stamp.erase(seq_stamp.begin());
+
+    // Wake a waiter up if present
+    if (awaiting_seq == seq) {
+      waiting_cond.Signal();
+    }
   } else {
     dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
 	     << " seq " << m->get_seq() << " dne" << dendl;
@@ -139,6 +151,25 @@ void Beacon::send()
 }
 
 
+void Beacon::send_and_wait(const double duration)
+{
+  Mutex::Locker l(lock);
+  _send();
+  awaiting_seq = last_seq;
+  dout(20) << __func__ << ": awaiting " << awaiting_seq
+           << " for up to " << duration << "s" << dendl;
+
+  utime_t timeout;
+  timeout.set_from_double(ceph_clock_now(cct) + duration);
+  while ((!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq)
+         && ceph_clock_now(cct) < timeout) {
+    waiting_cond.WaitUntil(lock, timeout);
+  }
+
+  awaiting_seq = -1;
+}
+
+
 /**
  * Call periodically, or when you have updated the desired state
  */
@@ -163,6 +194,8 @@ void Beacon::_send()
 	   << dendl;
 
   seq_stamp[last_seq] = ceph_clock_now(g_ceph_context);
+
+  assert(want_state != MDSMap::STATE_NULL);
   
   MMDSBeacon *beacon = new MMDSBeacon(
       monc->get_fsid(), mds_gid_t(monc->get_global_id()),
@@ -175,7 +208,12 @@ void Beacon::_send()
   beacon->set_standby_for_name(standby_for_name);
   beacon->set_health(health);
   beacon->set_compat(compat);
-
+  // piggyback the sys info on beacon msg
+  if (want_state == MDSMap::STATE_BOOT) {
+    map<string, string> sys_info;
+    collect_sys_info(&sys_info, cct);
+    beacon->set_sys_info(sys_info);
+  }
   monc->send_mon_message(beacon);
 }
 
@@ -193,10 +231,13 @@ void Beacon::notify_mdsmap(MDSMap const *mdsmap)
 void Beacon::_notify_mdsmap(MDSMap const *mdsmap)
 {
   assert(mdsmap != NULL);
+  assert(mdsmap->get_epoch() >= epoch);
 
-  epoch = mdsmap->get_epoch();
-  compat = get_mdsmap_compat_set_default();
-  compat.merge(mdsmap->compat);
+  if (mdsmap->get_epoch() != epoch) {
+    epoch = mdsmap->get_epoch();
+    compat = get_mdsmap_compat_set_default();
+    compat.merge(mdsmap->compat);
+  }
 }
 
 
@@ -233,11 +274,23 @@ utime_t Beacon::get_laggy_until() const
   return laggy_until;
 }
 
-void Beacon::notify_want_state(MDSMap::DaemonState const newstate)
+void Beacon::set_want_state(MDSMap const *mdsmap, MDSMap::DaemonState const newstate)
 {
   Mutex::Locker l(lock);
 
-  want_state = newstate;
+  // Update mdsmap epoch atomically with updating want_state, so that when
+  // we send a beacon with the new want state it has the latest epoch, and
+  // once we have updated to the latest epoch, we are not sending out
+  // a stale want_state (i.e. one from before making it through MDSMap
+  // handling)
+  _notify_mdsmap(mdsmap);
+
+  if (want_state != newstate) {
+    dout(10) << __func__ << ": "
+      << ceph_mds_state_name(want_state) << " -> "
+      << ceph_mds_state_name(newstate) << dendl;
+    want_state = newstate;
+  }
 }
 
 
@@ -246,9 +299,13 @@ void Beacon::notify_want_state(MDSMap::DaemonState const newstate)
  * some health metrics that we will send in the next
  * beacon.
  */
-void Beacon::notify_health(MDS const *mds)
+void Beacon::notify_health(MDSRank const *mds)
 {
   Mutex::Locker l(lock);
+  if (!mds) {
+    // No MDS rank held
+    return;
+  }
 
   // I'm going to touch this MDS, so it must be locked
   assert(mds->mds_lock.is_locked_by_me());
@@ -264,8 +321,8 @@ void Beacon::notify_health(MDS const *mds)
         << "/" << g_conf->mds_log_max_segments << ")";
 
       MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
-      m.metadata["num_segments"] = mds->mdlog->get_num_segments();
-      m.metadata["max_segments"] = g_conf->mds_log_max_segments;
+      m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
+      m.metadata["max_segments"] = stringify(g_conf->mds_log_max_segments);
       health.metrics.push_back(m);
     }
   }
@@ -304,7 +361,7 @@ void Beacon::notify_health(MDS const *mds)
       oss << "Many clients (" << late_cap_metrics.size()
           << ") failing to respond to capability release";
       MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str());
-      m.metadata["client_count"] = late_cap_metrics.size();
+      m.metadata["client_count"] = stringify(late_cap_metrics.size());
       health.metrics.push_back(m);
       late_cap_metrics.clear();
     }
@@ -312,6 +369,8 @@ void Beacon::notify_health(MDS const *mds)
 
   // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
   // messages. May be due to buggy client or resource-hogging application.
+  //
+  // Detect clients failing to advance their old_client_tid
   {
     set<Session*> sessions;
     mds->sessionmap.get_client_session_set(sessions);
@@ -319,6 +378,7 @@ void Beacon::notify_health(MDS const *mds)
     cutoff -= g_conf->mds_recall_state_timeout;
 
     std::list<MDSHealthMetric> late_recall_metrics;
+    std::list<MDSHealthMetric> large_completed_requests_metrics;
     for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
       Session *session = *i;
       if (!session->recalled_at.is_zero()) {
@@ -328,14 +388,24 @@ void Beacon::notify_health(MDS const *mds)
         if (session->recalled_at < cutoff) {
           dout(20) << "  exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
           std::ostringstream oss;
-        oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+	  oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
           MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
-          m.metadata["client_id"] = session->info.inst.name.num();
+          m.metadata["client_id"] = stringify(session->info.inst.name.num());
           late_recall_metrics.push_back(m);
         } else {
           dout(20) << "  within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
         }
       }
+      if ((session->get_num_trim_requests_warnings() > 0 &&
+	   session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) ||
+	  (session->get_num_trim_flushes_warnings() > 0 &&
+	   session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) {
+	std::ostringstream oss;
+	oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid";
+	MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
+	m.metadata["client_id"] = stringify(session->info.inst.name.num());
+	large_completed_requests_metrics.push_back(m);
+      }
     }
 
     if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
@@ -345,10 +415,28 @@ void Beacon::notify_health(MDS const *mds)
       oss << "Many clients (" << late_recall_metrics.size()
           << ") failing to respond to cache pressure";
       MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str());
-      m.metadata["client_count"] = late_recall_metrics.size();
+      m.metadata["client_count"] = stringify(late_recall_metrics.size());
       health.metrics.push_back(m);
       late_recall_metrics.clear();
     }
+
+    if (large_completed_requests_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
+      health.metrics.splice(health.metrics.end(), large_completed_requests_metrics);
+    } else {
+      std::ostringstream oss;
+      oss << "Many clients (" << large_completed_requests_metrics.size()
+	<< ") failing to advance their oldest client/flush tid";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str());
+      m.metadata["client_count"] = stringify(large_completed_requests_metrics.size());
+      health.metrics.push_back(m);
+      large_completed_requests_metrics.clear();
+    }
   }
 }
 
+MDSMap::DaemonState Beacon::get_want_state() const
+{
+  Mutex::Locker l(lock);
+  return want_state;
+}
+
diff --git a/src/mds/Beacon.h b/src/mds/Beacon.h
index 4336d1a..a63daff 100644
--- a/src/mds/Beacon.h
+++ b/src/mds/Beacon.h
@@ -25,7 +25,7 @@
 class MonClient;
 class MMDSBeacon;
 class Message;
-class MDS;
+class MDSRank;
 
 
 /**
@@ -53,7 +53,6 @@ class Beacon : public Dispatcher
   MDSMap::DaemonState want_state;
 
   // Internal beacon state
-  version_t last_send;
   version_t last_seq;          // last seq sent to monitor
   std::map<version_t,utime_t>  seq_stamp;    // seq # -> time sent
   utime_t last_acked_stamp;  // last time we sent a beacon that got acked
@@ -79,6 +78,9 @@ class Beacon : public Dispatcher
   void _notify_mdsmap(MDSMap const *mdsmap);
   void _send();
 
+  version_t awaiting_seq;
+  Cond waiting_cond;
+
 public:
   Beacon(CephContext *cct_, MonClient *monc_, std::string name);
   ~Beacon();
@@ -92,14 +94,23 @@ public:
   void ms_handle_remote_reset(Connection *c) {}
 
   void notify_mdsmap(MDSMap const *mdsmap);
-  void notify_want_state(MDSMap::DaemonState const newstate);
-  void notify_health(MDS const *mds);
+  void notify_health(MDSRank const *mds);
 
   void set_standby_for(mds_rank_t rank_, std::string const &name_);
 
   void handle_mds_beacon(MMDSBeacon *m);
   void send();
 
+  void set_want_state(MDSMap const *mdsmap, MDSMap::DaemonState const newstate);
+  MDSMap::DaemonState get_want_state() const;
+
+  /**
+   * Send a beacon, and block until the ack is received from the mon
+   * or `duration` seconds pass, whichever happens sooner.  Useful
+   * for emitting a last message on shutdown.
+   */
+  void send_and_wait(const double duration);
+
   bool is_laggy();
   utime_t get_laggy_until() const;
 };
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
index 37d1d3e..e56c66a 100644
--- a/src/mds/CDentry.cc
+++ b/src/mds/CDentry.cc
@@ -18,7 +18,7 @@
 #include "CInode.h"
 #include "CDir.h"
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDCache.h"
 #include "Locker.h"
 #include "LogSegment.h"
@@ -71,16 +71,7 @@ ostream& operator<<(ostream& out, const CDentry& dn)
   if (dn.get_linkage()->is_null()) out << " NULL";
   if (dn.get_linkage()->is_remote()) {
     out << " REMOTE(";
-    switch (DTTOIF(dn.get_linkage()->get_remote_d_type())) {
-    case S_IFSOCK: out << "sock"; break;
-    case S_IFLNK: out << "lnk"; break;
-    case S_IFREG: out << "reg"; break;
-    case S_IFBLK: out << "blk"; break;
-    case S_IFDIR: out << "dir"; break;
-    case S_IFCHR: out << "chr"; break;
-    case S_IFIFO: out << "fifo"; break;
-    default: assert(0);
-    }
+    out << dn.get_linkage()->get_remote_d_type_string();
     out << ")";
   }
 
@@ -559,3 +550,75 @@ void CDentry::_put()
     }
   }
 }
+
+void CDentry::dump(Formatter *f) const
+{
+  assert(f != NULL);
+
+  filepath path;
+  make_path(path);
+
+  f->dump_string("path", path.get_path());
+  f->dump_int("snap_first", first);
+  f->dump_int("snap_last", last);
+
+  f->dump_bool("is_null", get_linkage()->is_null());
+  f->dump_bool("is_remote", get_linkage()->is_remote());
+  f->dump_bool("is_new", is_new());
+  if (get_linkage()->get_inode()) {
+    f->dump_int("inode", get_linkage()->get_inode()->ino());
+  } else {
+    f->dump_int("inode", 0);
+  }
+
+  if (linkage.is_remote()) {
+    f->dump_string("remote_type", linkage.get_remote_d_type_string());
+  } else {
+    f->dump_string("remote_type", "");
+  }
+
+  f->dump_int("version", get_version());
+  f->dump_int("projected_version", get_projected_version());
+
+  f->dump_int("auth_pins", auth_pins);
+  f->dump_int("nested_auth_pins", nested_auth_pins);
+
+  MDSCacheObject::dump(f);
+
+  f->open_object_section("lock");
+  lock.dump(f);
+  f->close_section();
+
+  f->open_object_section("versionlock");
+  versionlock.dump(f);
+  f->close_section();
+
+  f->open_array_section("states");
+  MDSCacheObject::dump_states(f);
+  if (state_test(STATE_NEW))
+    f->dump_string("state", "new");
+  if (state_test(STATE_FRAGMENTING))
+    f->dump_string("state", "fragmenting");
+  if (state_test(STATE_PURGING))
+    f->dump_string("state", "purging");
+  if (state_test(STATE_BADREMOTEINO))
+    f->dump_string("state", "badremoteino");
+  if (state_test(STATE_STRAY))
+    f->dump_string("state", "stray");
+  f->close_section();
+}
+
+std::string CDentry::linkage_t::get_remote_d_type_string() const
+{
+  switch (DTTOIF(remote_d_type)) {
+    case S_IFSOCK: return "sock";
+    case S_IFLNK: return "lnk";
+    case S_IFREG: return "reg";
+    case S_IFBLK: return "blk";
+    case S_IFDIR: return "dir";
+    case S_IFCHR: return "chr";
+    case S_IFIFO: return "fifo";
+    default: assert(0); return "";
+  }
+}
+
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index c590598..40a4723 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -72,6 +72,7 @@ public:
   static const int STATE_FRAGMENTING =  (1<<1);
   static const int STATE_PURGING =      (1<<2);
   static const int STATE_BADREMOTEINO = (1<<3);
+  static const int STATE_EVALUATINGSTRAY = (1<<4);
   // stray dentry needs notification of releasing reference
   static const int STATE_STRAY =	STATE_NOTIFYREF;
 
@@ -126,6 +127,7 @@ public:
     const CInode *get_inode() const { return inode; }
     inodeno_t get_remote_ino() const { return remote_ino; }
     unsigned char get_remote_d_type() const { return remote_d_type; }
+    std::string get_remote_d_type_string() const;
 
     void set_remote(inodeno_t ino, unsigned char d_type) { 
       remote_ino = ino;
@@ -148,13 +150,10 @@ public:
   elist<CDentry*>::item item_stray;
 
 protected:
-  int auth_pins, nested_auth_pins;
-#ifdef MDS_AUTHPIN_SET
-  multiset<void*> auth_pin_set;
-#endif
   friend class Migrator;
   friend class Locker;
   friend class MDCache;
+  friend class StrayManager;
   friend class CInode;
   friend class C_MDC_XlockRequest;
 
@@ -176,7 +175,6 @@ public:
     dir(0),
     version(0), projected_version(0),
     item_dirty(this),
-    auth_pins(0), nested_auth_pins(0),
     lock(this, &lock_type),
     versionlock(this, &versionlock_type) {
     g_num_dn++;
@@ -189,7 +187,6 @@ public:
     dir(0),
     version(0), projected_version(0),
     item_dirty(this),
-    auth_pins(0), nested_auth_pins(0),
     lock(this, &lock_type),
     versionlock(this, &versionlock_type) {
     g_num_dn++;
@@ -263,10 +260,7 @@ public:
   void adjust_nested_auth_pins(int adjustment, int diradj, void *by);
   bool is_frozen() const;
   bool is_freezing() const;
-  bool is_auth_pinned() const { return auth_pins || nested_auth_pins; }
-  int get_num_auth_pins() const { return auth_pins; }
   int get_num_dir_auth_pins() const;
-  int get_num_nested_auth_pins() const { return nested_auth_pins; }
   
   // remote links
   void link_remote(linkage_t *dnl, CInode *in);
@@ -397,6 +391,7 @@ public:
   
   ostream& print_db_line_prefix(ostream& out);
   void print(ostream& out);
+  void dump(Formatter *f) const;
 
   friend class CDir;
 };
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 5b72d51..9ecbf01 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -21,7 +21,7 @@
 #include "Mutation.h"
 
 #include "MDSMap.h"
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDCache.h"
 #include "Locker.h"
 #include "MDLog.h"
@@ -35,18 +35,20 @@
 
 #include "common/config.h"
 #include "include/assert.h"
+#include "include/compat.h"
 
 #define dout_subsys ceph_subsys_mds
 #undef dout_prefix
 #define dout_prefix *_dout << "mds." << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "
 
-
+int CDir::num_frozen_trees = 0;
+int CDir::num_freezing_trees = 0;
 
 class CDirContext : public MDSInternalContextBase
 {
 protected:
   CDir *dir;
-  MDS* get_mds() {return dir->cache->mds;}
+  MDSRank* get_mds() {return dir->cache->mds;}
 
 public:
   CDirContext(CDir *d) : dir(d) {
@@ -59,7 +61,7 @@ class CDirIOContext : public MDSIOContextBase
 {
 protected:
   CDir *dir;
-  MDS* get_mds() {return dir->cache->mds;}
+  MDSRank* get_mds() {return dir->cache->mds;}
 
 public:
   CDirIOContext(CDir *d) : dir(d) {
@@ -220,8 +222,6 @@ CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) :
   if (auth) 
     state |= STATE_AUTH;
  
-  auth_pins = 0;
-  nested_auth_pins = 0;
   dir_auth_pins = 0;
   request_pins = 0;
 
@@ -630,6 +630,7 @@ void CDir::unlink_inode_work( CDentry *dn )
 
 void CDir::add_to_bloom(CDentry *dn)
 {
+  assert(dn->last == CEPH_NOSNAP);
   if (!bloom) {
     /* not create bloom filter for incomplete dir that was added by log replay */
     if (!is_complete())
@@ -671,8 +672,13 @@ void CDir::remove_null_dentries() {
   assert(get_num_any() == items.size());
 }
 
-// remove dirty null dentries for deleted directory. the dirfrag will be
-// deleted soon, so it's safe to not commit dirty dentries.
+/** remove dirty null dentries for deleted directory. the dirfrag will be
+ *  deleted soon, so it's safe to not commit dirty dentries.
+ *
+ *  This is called when a directory is being deleted, a prerequisite
+ *  of which is that its children have been unlinked: we expect to only see
+ *  null, unprojected dentries here.
+ */
 void CDir::try_remove_dentries_for_stray()
 {
   dout(10) << __func__ << dendl;
@@ -686,8 +692,8 @@ void CDir::try_remove_dentries_for_stray()
     CDentry *dn = p->second;
     ++p;
     if (dn->last == CEPH_NOSNAP) {
-      if (!dn->get_linkage()->is_null() || dn->is_projected())
-	continue; // shouldn't happen
+      assert(!dn->is_projected());
+      assert(dn->get_linkage()->is_null());
       if (clear_dirty && dn->is_dirty())
 	dn->mark_clean();
       // It's OK to remove lease prematurely because we will never link
@@ -697,8 +703,7 @@ void CDir::try_remove_dentries_for_stray()
       if (dn->get_num_ref() == 0)
 	remove_dentry(dn);
     } else {
-      if (dn->is_projected())
-	continue; // shouldn't happen
+      assert(!dn->is_projected());
       CDentry::linkage_t *dnl= dn->get_linkage();
       CInode *in = NULL;
       if (dnl->is_primary()) {
@@ -819,8 +824,14 @@ void CDir::steal_dentry(CDentry *dn)
       else
 	fnode.fragstat.nfiles++;
     }
-  } else
-      num_snap_items++;
+  } else {
+    num_snap_items++;
+    if (dn->get_linkage()->is_primary()) {
+      CInode *in = dn->get_linkage()->get_inode();
+      if (in->is_dirty_rstat())
+	dirty_rstat_inodes.push_back(&in->dirty_rstat_item);
+    }
+  }
 
   if (dn->auth_pins || dn->nested_auth_pins) {
     // use the helpers here to maintain the auth_pin invariants on the dir inode
@@ -1029,8 +1040,8 @@ void CDir::merge(list<CDir*>& subs, list<MDSInternalContextBase*>& waiters, bool
       steal_dentry(dir->items.begin()->second);
     
     // merge replica map
-    for (map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
-	 p != dir->replica_map.end();
+    for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
+	 p != dir->replicas_end();
 	 ++p) {
       unsigned cur = replica_map[p->first];
       if (p->second > cur)
@@ -1166,7 +1177,7 @@ void CDir::take_dentry_waiting(const string& dname, snapid_t first, snapid_t las
   
   string_snap_t lb(dname, first);
   string_snap_t ub(dname, last);
-  map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.lower_bound(lb);
+  compact_map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.lower_bound(lb);
   while (p != waiting_on_dentry.end() &&
 	 !(ub < p->first)) {
     dout(10) << "take_dentry_waiting dentry " << dname
@@ -1185,7 +1196,7 @@ void CDir::take_sub_waiting(list<MDSInternalContextBase*>& ls)
 {
   dout(10) << "take_sub_waiting" << dendl;
   if (!waiting_on_dentry.empty()) {
-    for (map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.begin(); 
+    for (compact_map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.begin();
 	 p != waiting_on_dentry.end();
 	 ++p) 
       ls.splice(ls.end(), p->second);
@@ -1232,7 +1243,7 @@ void CDir::take_waiting(uint64_t mask, list<MDSInternalContextBase*>& ls)
   if ((mask & WAIT_DENTRY) && !waiting_on_dentry.empty()) {
     // take all dentry waiters
     while (!waiting_on_dentry.empty()) {
-      map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.begin(); 
+      compact_map<string_snap_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dentry.begin();
       dout(10) << "take_waiting dentry " << p->first.name
 	       << " snap " << p->first.snapid << " on " << *this << dendl;
       ls.splice(ls.end(), p->second);
@@ -1343,9 +1354,11 @@ struct C_Dir_Dirty : public CDirContext {
   C_Dir_Dirty(CDir *d, version_t p, LogSegment *l) : CDirContext(d), pv(p), ls(l) {}
   void finish(int r) {
     dir->mark_dirty(pv, ls);
+    dir->auth_unpin(dir);
   }
 };
 
+// caller should hold auth pin of this
 void CDir::log_mark_dirty()
 {
   MDLog *mdlog = inode->mdcache->mds->mdlog;
@@ -1455,7 +1468,7 @@ void CDir::_tmap_fetch(const string& want_dn)
   ObjectOperation rd;
   rd.tmap_get(&fin->bl, NULL);
   cache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0,
-			     new C_OnFinisher(fin, &cache->mds->finisher));
+			     new C_OnFinisher(fin, cache->mds->finisher));
 }
 
 void CDir::_tmap_fetched(bufferlist& bl, const string& want_dn, int r)
@@ -1497,7 +1510,9 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext {
   bufferlist btbl;
   int ret1, ret2, ret3;
 
-  C_IO_Dir_OMAP_Fetched(CDir *d, const string& w) : CDirIOContext(d), want_dn(w) { }
+  C_IO_Dir_OMAP_Fetched(CDir *d, const string& w) : 
+    CDirIOContext(d), want_dn(w),
+    ret1(0), ret2(0), ret3(0) {}
   void finish(int r) {
     // check the correctness of backtrace
     if (r >= 0 && ret3 != -ECANCELED)
@@ -1525,7 +1540,183 @@ void CDir::_omap_fetch(const string& want_dn)
   }
 
   cache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0,
-			     new C_OnFinisher(fin, &cache->mds->finisher));
+			     new C_OnFinisher(fin, cache->mds->finisher));
+}
+
+CDentry *CDir::_load_dentry(
+    const std::string &key,
+    const std::string &dname,
+    const snapid_t last,
+    bufferlist &bl,
+    const int pos,
+    const std::set<snapid_t> *snaps,
+    bool *force_dirty,
+    list<CInode*> *undef_inodes)
+{
+  bufferlist::iterator q = bl.begin();
+
+  snapid_t first;
+  ::decode(first, q);
+
+  // marker
+  char type;
+  ::decode(type, q);
+
+  dout(20) << "_fetched pos " << pos << " marker '" << type << "' dname '" << dname
+           << " [" << first << "," << last << "]"
+           << dendl;
+
+  bool stale = false;
+  if (snaps && last != CEPH_NOSNAP) {
+    set<snapid_t>::const_iterator p = snaps->lower_bound(first);
+    if (p == snaps->end() || *p > last) {
+      dout(10) << " skipping stale dentry on [" << first << "," << last << "]" << dendl;
+      stale = true;
+    }
+  }
+  
+  /*
+   * look for existing dentry for _last_ snap, because unlink +
+   * create may leave a "hole" (epochs during which the dentry
+   * doesn't exist) but for which no explicit negative dentry is in
+   * the cache.
+   */
+  CDentry *dn;
+  if (stale)
+    dn = lookup_exact_snap(dname, last);
+  else
+    dn = lookup(dname, last);
+
+  if (type == 'L') {
+    // hard link
+    inodeno_t ino;
+    unsigned char d_type;
+    ::decode(ino, q);
+    ::decode(d_type, q);
+
+    if (stale) {
+      if (!dn) {
+        stale_items.insert(key);
+        *force_dirty = true;
+      }
+      return dn;
+    }
+
+    if (dn) {
+      if (dn->get_linkage()->get_inode() == 0) {
+        dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
+      } else {
+        dout(12) << "_fetched  had dentry " << *dn << dendl;
+      }
+    } else {
+      // (remote) link
+      dn = add_remote_dentry(dname, ino, d_type, first, last);
+      
+      // link to inode?
+      CInode *in = cache->get_inode(ino);   // we may or may not have it.
+      if (in) {
+        dn->link_remote(dn->get_linkage(), in);
+        dout(12) << "_fetched  got remote link " << ino << " which we have " << *in << dendl;
+      } else {
+        dout(12) << "_fetched  got remote link " << ino << " (dont' have it)" << dendl;
+      }
+    }
+  } 
+  else if (type == 'I') {
+    // inode
+    
+    // Load inode data before looking up or constructing CInode
+    InodeStore inode_data;
+    inode_data.decode_bare(q);
+    
+    if (stale) {
+      if (!dn) {
+        stale_items.insert(key);
+        *force_dirty = true;
+      }
+      return dn;
+    }
+
+    bool undef_inode = false;
+    if (dn) {
+      CInode *in = dn->get_linkage()->get_inode();
+      if (in) {
+        dout(12) << "_fetched  had dentry " << *dn << dendl;
+        if (in->state_test(CInode::STATE_REJOINUNDEF)) {
+          undef_inodes->push_back(in);
+          undef_inode = true;
+        }
+      } else
+        dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
+    }
+
+    if (!dn || undef_inode) {
+      // add inode
+      CInode *in = cache->get_inode(inode_data.inode.ino, last);
+      if (!in || undef_inode) {
+        if (undef_inode && in)
+          in->first = first;
+        else
+          in = new CInode(cache, true, first, last);
+        
+        in->inode = inode_data.inode;
+        // symlink?
+        if (in->is_symlink()) 
+          in->symlink = inode_data.symlink;
+        
+        in->dirfragtree.swap(inode_data.dirfragtree);
+        in->xattrs.swap(inode_data.xattrs);
+        in->old_inodes.swap(inode_data.old_inodes);
+        in->oldest_snap = inode_data.oldest_snap;
+        in->decode_snap_blob(inode_data.snap_blob);
+        if (snaps && !in->snaprealm)
+          in->purge_stale_snap_data(*snaps);
+
+        if (!undef_inode) {
+          cache->add_inode(in); // add
+          dn = add_primary_dentry(dname, in, first, last); // link
+        }
+        dout(12) << "_fetched  got " << *dn << " " << *in << dendl;
+
+        if (in->inode.is_dirty_rstat())
+          in->mark_dirty_rstat();
+
+        if (inode->is_stray()) {
+          dn->state_set(CDentry::STATE_STRAY);
+          if (in->inode.nlink == 0)
+            in->state_set(CInode::STATE_ORPHAN);
+        }
+
+        //in->hack_accessed = false;
+        //in->hack_load_stamp = ceph_clock_now(g_ceph_context);
+        //num_new_inodes_loaded++;
+      } else {
+        dout(0) << "_fetched  badness: got (but i already had) " << *in
+                << " mode " << in->inode.mode
+                << " mtime " << in->inode.mtime << dendl;
+        string dirpath, inopath;
+        this->inode->make_path_string(dirpath);
+        in->make_path_string(inopath);
+        cache->mds->clog->error() << "loaded dup inode " << inode_data.inode.ino
+          << " [" << first << "," << last << "] v" << inode_data.inode.version
+          << " at " << dirpath << "/" << dname
+          << ", but inode " << in->vino() << " v" << in->inode.version
+          << " already exists at " << inopath << "\n";
+        return dn;
+      }
+    }
+  } else {
+    dout(1) << "corrupt directory, i got tag char '" << type << "' pos "
+      << pos << dendl;
+    cache->mds->clog->error() << "Corrupt directory entry '" << key
+      << "' in dirfrag " << *this;
+    // TODO: add a mechanism for selectively marking a path
+    // damaged, rather than marking the whole rank damaged.
+    cache->mds->damaged();
+    assert(0);  // Unreachable: damaged() respawns us
+  }
+
+  return dn;
 }
 
 void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
@@ -1550,24 +1741,28 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
     dout(0) << "_fetched missing object for " << *this << dendl;
     clog->error() << "dir " << dirfrag() << " object missing on disk; some files may be lost\n";
 
-    state_set(STATE_BADFRAG);
-    // mark complete, !fetching
-    mark_complete();
-    state_clear(STATE_FETCHING);
-    auth_unpin(this);
-    
-    // kick waiters
-    finish_waiting(WAIT_COMPLETE, 0);
+    go_bad();
     return;
   }
 
   fnode_t got_fnode;
   {
     bufferlist::iterator p = hdrbl.begin();
-    ::decode(got_fnode, p);
+    try {
+      ::decode(got_fnode, p);
+    } catch (const buffer::error &err) {
+      derr << "Corrupt fnode in dirfrag " << dirfrag()
+        << ": " << err << dendl;
+      clog->warn() << "Corrupt fnode header in " << dirfrag() << ": "
+		  << err;
+      go_bad();
+      return;
+    }
     if (!p.end()) {
       clog->warn() << "header buffer of dir " << dirfrag() << " has "
 		  << hdrbl.length() - p.get_off() << " extra bytes\n";
+      go_bad();
+      return;
     }
   }
 
@@ -1592,6 +1787,7 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
 
   // purge stale snaps?
   // only if we have past_parents open!
+  bool force_dirty = false;
   const set<snapid_t> *snaps = NULL;
   SnapRealm *realm = inode->find_snaprealm();
   if (!realm->have_past_parents_open()) {
@@ -1601,163 +1797,33 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
     dout(10) << " snap_purged_thru " << fnode.snap_purged_thru
 	     << " < " << realm->get_last_destroyed()
 	     << ", snap purge based on " << *snaps << dendl;
-    fnode.snap_purged_thru = realm->get_last_destroyed();
+    if (get_num_snap_items() == 0) {
+      fnode.snap_purged_thru = realm->get_last_destroyed();
+      force_dirty = true;
+    }
   }
 
-  bool stray = inode->is_stray();
-
   unsigned pos = omap.size() - 1;
   for (map<string, bufferlist>::reverse_iterator p = omap.rbegin();
        p != omap.rend();
        ++p, --pos) {
-    // dname
     string dname;
-    snapid_t first, last;
+    snapid_t last;
     dentry_key_t::decode_helper(p->first, dname, last);
-    
-    bufferlist::iterator q = p->second.begin();
-    ::decode(first, q);
-
-    // marker
-    char type;
-    ::decode(type, q);
-
-    dout(20) << "_fetched pos " << pos << " marker '" << type << "' dname '" << dname
-	     << " [" << first << "," << last << "]"
-	     << dendl;
-
-    bool stale = false;
-    if (snaps && last != CEPH_NOSNAP) {
-      set<snapid_t>::const_iterator p = snaps->lower_bound(first);
-      if (p == snaps->end() || *p > last) {
-	dout(10) << " skipping stale dentry on [" << first << "," << last << "]" << dendl;
-	stale = true;
-      }
-    }
-    
-    /*
-     * look for existing dentry for _last_ snap, because unlink +
-     * create may leave a "hole" (epochs during which the dentry
-     * doesn't exist) but for which no explicit negative dentry is in
-     * the cache.
-     */
-    CDentry *dn = NULL;
-    if (!stale)
-      dn = lookup(dname, last);
-
-    if (type == 'L') {
-      // hard link
-      inodeno_t ino;
-      unsigned char d_type;
-      ::decode(ino, q);
-      ::decode(d_type, q);
-
-      if (stale)
-	continue;
-
-      if (dn) {
-        if (dn->get_linkage()->get_inode() == 0) {
-          dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
-        } else {
-          dout(12) << "_fetched  had dentry " << *dn << dendl;
-        }
-      } else {
-	// (remote) link
-	dn = add_remote_dentry(dname, ino, d_type, first, last);
-	
-	// link to inode?
-	CInode *in = cache->get_inode(ino);   // we may or may not have it.
-	if (in) {
-	  dn->link_remote(dn->get_linkage(), in);
-	  dout(12) << "_fetched  got remote link " << ino << " which we have " << *in << dendl;
-	} else {
-	  dout(12) << "_fetched  got remote link " << ino << " (dont' have it)" << dendl;
-	}
-      }
-    } 
-    else if (type == 'I') {
-      // inode
-      
-      // Load inode data before looking up or constructing CInode
-      InodeStore inode_data;
-      inode_data.decode_bare(q);
-      
-      if (stale)
-	continue;
-
-      bool undef_inode = false;
-      if (dn) {
-	CInode *in = dn->get_linkage()->get_inode();
-	if (in) {
-	  dout(12) << "_fetched  had dentry " << *dn << dendl;
-	  if (in->state_test(CInode::STATE_REJOINUNDEF)) {
-	    undef_inodes.push_back(in);
-	    undef_inode = true;
-	  }
-	} else
-	  dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
-      }
 
-      if (!dn || undef_inode) {
-	// add inode
-	CInode *in = cache->get_inode(inode_data.inode.ino, last);
-	if (!in || undef_inode) {
-	  if (undef_inode && in)
-	    in->first = first;
-	  else
-	    in = new CInode(cache, true, first, last);
-	  
-	  in->inode = inode_data.inode;
-	  // symlink?
-	  if (in->is_symlink()) 
-	    in->symlink = inode_data.symlink;
-	  
-	  in->dirfragtree.swap(inode_data.dirfragtree);
-	  in->xattrs.swap(inode_data.xattrs);
-	  in->old_inodes.swap(inode_data.old_inodes);
-	  in->decode_snap_blob(inode_data.snap_blob);
-	  in->oldest_snap = inode_data.oldest_snap;
-	  if (snaps && !in->snaprealm)
-	    in->purge_stale_snap_data(*snaps);
-
-	  if (!undef_inode) {
-	    cache->add_inode(in); // add
-	    dn = add_primary_dentry(dname, in, first, last); // link
-	  }
-	  dout(12) << "_fetched  got " << *dn << " " << *in << dendl;
-
-	  if (in->inode.is_dirty_rstat())
-	    in->mark_dirty_rstat();
-
-	  if (stray) {
-	    dn->state_set(CDentry::STATE_STRAY);
-	    if (in->inode.nlink == 0)
-	      in->state_set(CInode::STATE_ORPHAN);
-	  }
-
-	  //in->hack_accessed = false;
-	  //in->hack_load_stamp = ceph_clock_now(g_ceph_context);
-	  //num_new_inodes_loaded++;
-	} else {
-	  dout(0) << "_fetched  badness: got (but i already had) " << *in
-		  << " mode " << in->inode.mode
-		  << " mtime " << in->inode.mtime << dendl;
-	  string dirpath, inopath;
-	  this->inode->make_path_string(dirpath);
-	  in->make_path_string(inopath);
-	  clog->error() << "loaded dup inode " << inode_data.inode.ino
-	    << " [" << first << "," << last << "] v" << inode_data.inode.version
-	    << " at " << dirpath << "/" << dname
-	    << ", but inode " << in->vino() << " v" << in->inode.version
-	    << " already exists at " << inopath << "\n";
-	  continue;
-	}
-      }
-    } else {
-      dout(1) << "corrupt directory, i got tag char '" << type << "' pos " << pos << dendl;
-      assert(0);
+    CDentry *dn = NULL;
+    try {
+      dn = _load_dentry(
+            p->first, dname, last, p->second, pos, snaps,
+            &force_dirty, &undef_inodes);
+    } catch (const buffer::error &err) {
+      cache->mds->clog->warn() << "Corrupt dentry '" << dname << "' in "
+                                  "dir frag " << dirfrag() << ": "
+                               << err;
+      go_bad();
+      return;
     }
-    
+
     if (dn && want_dn.length() && want_dn == dname) {
       dout(10) << " touching wanted dn " << *dn << dendl;
       inode->mdcache->touch_dentry(dn);
@@ -1804,13 +1870,27 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
     cache->opened_undef_inode(in);
   }
 
-  auth_unpin(this);
+  // dirty myself to remove stale snap dentries
+  if (force_dirty && !is_dirty() && !inode->mdcache->is_readonly())
+    log_mark_dirty();
+  else
+    auth_unpin(this);
 
   // kick waiters
   finish_waiting(WAIT_COMPLETE, 0);
 }
 
-
+void CDir::go_bad()
+{
+  state_set(STATE_BADFRAG);
+  // mark complete, !fetching
+  mark_complete();
+  state_clear(STATE_FETCHING);
+  auth_unpin(this);
+  
+  // kick waiters
+  finish_waiting(WAIT_COMPLETE, 0);
+}
 
 // -----------------------
 // COMMIT
@@ -1887,6 +1967,7 @@ void CDir::_omap_commit(int op_prio)
     dout(10) << " snap_purged_thru " << fnode.snap_purged_thru
 	     << " < " << realm->get_last_destroyed()
 	     << ", snap purge based on " << *snaps << dendl;
+    // fnode.snap_purged_thru = realm->get_last_destroyed();
   }
 
   set<string> to_remove;
@@ -1895,12 +1976,22 @@ void CDir::_omap_commit(int op_prio)
   C_GatherBuilder gather(g_ceph_context,
 			 new C_OnFinisher(new C_IO_Dir_Committed(this,
 								 get_version()),
-					  &cache->mds->finisher));
+					  cache->mds->finisher));
 
   SnapContext snapc;
   object_t oid = get_ondisk_object();
   object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool());
 
+  if (!stale_items.empty()) {
+    for (compact_set<string>::iterator p = stale_items.begin();
+	 p != stale_items.end();
+	 ++p) {
+      to_remove.insert(*p);
+      write_size += (*p).length();
+    }
+    stale_items.clear();
+  }
+
   for (map_t::iterator p = items.begin();
       p != items.end(); ) {
     CDentry *dn = p->second;
@@ -2017,12 +2108,18 @@ void CDir::_encode_dentry(CDentry *dn, bufferlist& bl,
     // marker, name, inode, [symlink string]
     bl.append('I');         // inode
 
-    if (in->is_multiversion() && snaps && !in->snaprealm)
-      in->purge_stale_snap_data(*snaps);
+    if (in->is_multiversion()) {
+      if (!in->snaprealm) {
+	if (snaps)
+	  in->purge_stale_snap_data(*snaps);
+      } else if (in->snaprealm->have_past_parents_open()) {
+	in->purge_stale_snap_data(in->snaprealm->get_snaps());
+      }
+    }
 
-    in->encode_snap_blob(in->snap_blob);
-    in->encode_bare(bl);
-    in->snap_blob.clear();
+    bufferlist snap_blob;
+    in->encode_snap_blob(snap_blob);
+    in->encode_bare(bl, &snap_blob);
   }
 }
 
@@ -2150,9 +2247,9 @@ void CDir::_committed(int r, version_t v)
   // finishers?
   bool were_waiters = !waiting_for_commit.empty();
   
-  map<version_t, list<MDSInternalContextBase*> >::iterator p = waiting_for_commit.begin();
+  compact_map<version_t, list<MDSInternalContextBase*> >::iterator p = waiting_for_commit.begin();
   while (p != waiting_for_commit.end()) {
-    map<version_t, list<MDSInternalContextBase*> >::iterator n = p;
+    compact_map<version_t, list<MDSInternalContextBase*> >::iterator n = p;
     ++n;
     if (p->first > committed_version) {
       dout(10) << " there are waiters for " << p->first << ", committing again" << dendl;
@@ -2519,6 +2616,7 @@ bool CDir::freeze_tree()
     return true;
   } else {
     state_set(STATE_FREEZINGTREE);
+    ++num_freezing_trees;
     dout(10) << "freeze_tree waiting " << *this << dendl;
     return false;
   }
@@ -2530,8 +2628,12 @@ void CDir::_freeze_tree()
   assert(is_freezeable(true));
 
   // twiddle state
-  state_clear(STATE_FREEZINGTREE);   // actually, this may get set again by next context?
+  if (state_test(STATE_FREEZINGTREE)) {
+    state_clear(STATE_FREEZINGTREE);   // actually, this may get set again by next context?
+    --num_freezing_trees;
+  }
   state_set(STATE_FROZENTREE);
+  ++num_frozen_trees;
   get(PIN_FROZEN);
 
   // auth_pin inode for duration of freeze, if we are not a subtree root.
@@ -2546,6 +2648,8 @@ void CDir::unfreeze_tree()
   if (state_test(STATE_FROZENTREE)) {
     // frozen.  unfreeze.
     state_clear(STATE_FROZENTREE);
+    --num_frozen_trees;
+
     put(PIN_FROZEN);
 
     // unpin  (may => FREEZEABLE)   FIXME: is this order good?
@@ -2560,6 +2664,7 @@ void CDir::unfreeze_tree()
     // freezing.  stop it.
     assert(state_test(STATE_FREEZINGTREE));
     state_clear(STATE_FREEZINGTREE);
+    --num_freezing_trees;
     auth_unpin(this);
     
     finish_waiting(WAIT_UNFREEZE);
@@ -2568,6 +2673,8 @@ void CDir::unfreeze_tree()
 
 bool CDir::is_freezing_tree() const
 {
+  if (num_freezing_trees == 0)
+    return false;
   const CDir *dir = this;
   while (1) {
     if (dir->is_freezing_tree_root()) return true;
@@ -2581,6 +2688,8 @@ bool CDir::is_freezing_tree() const
 
 bool CDir::is_frozen_tree() const
 {
+  if (num_frozen_trees == 0)
+    return false;
   const CDir *dir = this;
   while (1) {
     if (dir->is_frozen_tree_root()) return true;
@@ -2718,39 +2827,16 @@ void CDir::dump(Formatter *f) const
 
   string path;
   get_inode()->make_path_string_projected(path);
+  f->dump_stream("path") << path;
 
   f->dump_stream("dirfrag") << dirfrag();
-  f->dump_stream("path") << path;
   f->dump_int("snapid_first", first);
-  f->dump_bool("auth", is_auth());
 
-  // Fields only meaningful for auth
-  f->open_object_section("auth_state");
-  {
-    f->open_object_section("replica_map");
-    for (std::map<mds_rank_t, unsigned>::const_iterator i = replica_map.begin();
-         i != replica_map.end(); ++i) {
-      std::ostringstream rank_str;
-      rank_str << i->first;
-      f->dump_int(rank_str.str().c_str(), i->second);
-    }
-    f->close_section();
-    f->dump_stream("projected_version") << get_projected_version();
-    f->dump_stream("version") << get_version();
-    f->dump_stream("comitting_version") << get_committing_version();
-    f->dump_stream("comitted_version") << get_committed_version();
-  }
-  f->close_section();
+  f->dump_stream("projected_version") << get_projected_version();
+  f->dump_stream("version") << get_version();
+  f->dump_stream("committing_version") << get_committing_version();
+  f->dump_stream("committed_version") << get_committed_version();
 
-  // Fields only meaningful for replica
-  f->open_object_section("replica_state");
-  {
-    f->dump_stream("authority_first") << authority().first;
-    f->dump_stream("authority_second") << authority().second;
-    f->dump_stream("replica_nonce") << get_replica_nonce();
-  }
-  f->close_section();
-  
   f->dump_bool("is_rep", is_rep());
 
   if (get_dir_auth() != CDIR_AUTH_DEFAULT) {
@@ -2764,6 +2850,7 @@ void CDir::dump(Formatter *f) const
   }
 
   f->open_array_section("states");
+  MDSCacheObject::dump_states(f);
   if (state_test(CDir::STATE_COMPLETE)) f->dump_string("state", "complete");
   if (state_test(CDir::STATE_FREEZINGTREE)) f->dump_string("state", "freezingtree");
   if (state_test(CDir::STATE_FROZENTREE)) f->dump_string("state", "frozentree");
@@ -2773,5 +2860,7 @@ void CDir::dump(Formatter *f) const
   if (state_test(CDir::STATE_IMPORTBOUND)) f->dump_string("state", "importbound");
   if (state_test(CDir::STATE_BADFRAG)) f->dump_string("state", "badfrag");
   f->close_section();
+
+  MDSCacheObject::dump(f);
 }
 
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index f72a249..be0f10a 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -23,7 +23,7 @@
 #include "common/config.h"
 #include "common/DecayCounter.h"
 
-#include <iostream>
+#include <iosfwd>
 
 #include <list>
 #include <set>
@@ -170,7 +170,7 @@ public:
 
   fnode_t fnode;
   snapid_t first;
-  std::map<snapid_t,old_rstat_t> dirty_old_rstat;  // [value.first,key]
+  compact_map<snapid_t,old_rstat_t> dirty_old_rstat;  // [value.first,key]
 
   // my inodes with dirty rstat data
   elist<CInode*> dirty_rstat_inodes;     
@@ -222,13 +222,14 @@ public:
     }
   }
   void mark_dirty(version_t pv, LogSegment *ls);
-  void log_mark_dirty();
   void mark_clean();
 
   bool is_new() { return item_new.is_on_list(); }
   void mark_new(LogSegment *ls);
 
   bool is_bad() { return state_test(STATE_BADFRAG); }
+private:
+  void log_mark_dirty();
 
 public:
   typedef std::map<dentry_key_t, CDentry*> map_t;
@@ -247,18 +248,18 @@ protected:
   version_t committing_version;
   version_t committed_version;
 
+  compact_set<string> stale_items;
 
   // lock nesting, freeze
-  int auth_pins;
-#ifdef MDS_AUTHPIN_SET
-  multiset<void*> auth_pin_set;
-#endif
-  int nested_auth_pins, dir_auth_pins;
+  static int num_frozen_trees;
+  static int num_freezing_trees;
+
+  int dir_auth_pins;
   int request_pins;
 
   // cache control  (defined for authority; hints for replicas)
   __s32      dir_rep;
-  std::set<__s32> dir_rep_by;      // if dir_rep == REP_LIST
+  compact_set<__s32> dir_rep_by;      // if dir_rep == REP_LIST
 
   // popularity
   dirfrag_load_vec_t pop_me;
@@ -428,10 +429,7 @@ private:
   // for giving to clients
   void get_dist_spec(std::set<mds_rank_t>& ls, mds_rank_t auth) {
     if (is_rep()) {
-      for (std::map<mds_rank_t,unsigned>::iterator p = replicas_begin();
-	   p != replicas_end(); 
-	   ++p)
-	ls.insert(p->first);
+      list_replicas(ls);
       if (!ls.empty()) 
 	ls.insert(auth);
     }
@@ -499,13 +497,28 @@ private:
   void fetch(MDSInternalContextBase *c, const std::string& want_dn, bool ignore_authpinnability=false);
 protected:
   void _omap_fetch(const std::string& want_dn);
+  CDentry *_load_dentry(
+      const std::string &key,
+      const std::string &dname,
+      snapid_t last,
+      bufferlist &bl,
+      int pos,
+      const std::set<snapid_t> *snaps,
+      bool *force_dirty,
+      list<CInode*> *undef_inodes);
+
+  /**
+   * Mark this fragment as BADFRAG
+   */
+  void go_bad();
+
   void _omap_fetched(bufferlist& hdrbl, std::map<std::string, bufferlist>& omap,
 		     const std::string& want_dn, int r);
   void _tmap_fetch(const std::string& want_dn);
   void _tmap_fetched(bufferlist &bl, const std::string& want_dn, int r);
 
   // -- commit --
-  std::map<version_t, std::list<MDSInternalContextBase*> > waiting_for_commit;
+  compact_map<version_t, std::list<MDSInternalContextBase*> > waiting_for_commit;
   void _commit(version_t want, int op_prio);
   void _omap_commit(int op_prio);
   void _encode_dentry(CDentry *dn, bufferlist& bl, const std::set<snapid_t> *snaps);
@@ -539,10 +552,9 @@ public:
     if (request_pins == 0) put(PIN_REQUEST);
   }
 
-    
   // -- waiters --
 protected:
-  std::map< string_snap_t, std::list<MDSInternalContextBase*> > waiting_on_dentry;
+  compact_map< string_snap_t, std::list<MDSInternalContextBase*> > waiting_on_dentry;
 
 public:
   bool is_waiting_for_dentry(const std::string& dname, snapid_t snap) {
@@ -565,7 +577,6 @@ public:
   }
   void decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls);
 
-
   // -- auth pins --
   bool can_auth_pin() const { return is_auth() && !(is_frozen() || is_freezing()); }
   int get_cum_auth_pins() const { return auth_pins + nested_auth_pins; }
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 779efbf..cb5d8f2 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -13,6 +13,7 @@
  */
 
 #include "include/int_types.h"
+#include "common/errno.h"
 
 #include <string>
 #include <stdio.h>
@@ -21,7 +22,7 @@
 #include "CDir.h"
 #include "CDentry.h"
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDCache.h"
 #include "MDLog.h"
 #include "Locker.h"
@@ -55,7 +56,7 @@ class CInodeIOContext : public MDSIOContextBase
 {
 protected:
   CInode *in;
-  MDS *get_mds() {return in->mdcache->mds;}
+  MDSRank *get_mds() {return in->mdcache->mds;}
 public:
   CInodeIOContext(CInode *in_) : in(in_) {
     assert(in != NULL);
@@ -229,8 +230,9 @@ ostream& operator<<(ostream& out, const CInode& in)
   }
   if (!in.get_mds_caps_wanted().empty()) {
     out << " mcw={";
-    for (map<int,int>::const_iterator p = in.get_mds_caps_wanted().begin();
-	 p != in.get_mds_caps_wanted().end(); ++p) {
+    for (compact_map<int,int>::const_iterator p = in.get_mds_caps_wanted().begin();
+	 p != in.get_mds_caps_wanted().end();
+	 ++p) {
       if (p != in.get_mds_caps_wanted().begin())
 	out << ',';
       out << p->first << '=' << ccap_string(p->second);
@@ -278,7 +280,7 @@ void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client
 void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
 {
   dout(10) << "remove_need_snapflush client." << client << " snapid " << snapid << " on " << snapin << dendl;
-  map<snapid_t, std::set<client_t> >::iterator p = client_need_snapflush.find(snapid);
+  compact_map<snapid_t, std::set<client_t> >::iterator p = client_need_snapflush.find(snapid);
   if (p == client_need_snapflush.end()) {
     dout(10) << " snapid not found" << dendl;
     return;
@@ -302,11 +304,15 @@ void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t cli
 void CInode::split_need_snapflush(CInode *cowin, CInode *in)
 {
   dout(10) << "split_need_snapflush [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
-  for (map<snapid_t, set<client_t> >::iterator p = client_need_snapflush.lower_bound(cowin->first);
-       p != client_need_snapflush.end() && p->first <= cowin->last;
-       ++p) {
-    assert(!p->second.empty());
-    cowin->auth_pin(this);
+  for (compact_map<snapid_t, set<client_t> >::iterator p = client_need_snapflush.lower_bound(cowin->first);
+       p != client_need_snapflush.end() && p->first < in->first; ) {
+    compact_map<snapid_t, set<client_t> >::iterator q = p;
+    ++p;
+    assert(!q->second.empty());
+    if (cowin->last >= q->first)
+      cowin->auth_pin(this);
+    else
+      client_need_snapflush.erase(q);
     in->auth_unpin(this);
   }
 }
@@ -346,7 +352,10 @@ inode_t *CInode::project_inode(map<string,bufferptr> *px)
     if (px)
       *px = *get_projected_xattrs();
   }
-  projected_nodes.back()->xattrs = px;
+  if (px) {
+    projected_nodes.back()->xattrs = px;
+    ++num_projected_xattrs;
+  }
   dout(15) << "project_inode " << projected_nodes.back()->inode << dendl;
   return projected_nodes.back()->inode;
 }
@@ -366,12 +375,15 @@ void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
 
   map<string,bufferptr> *px = projected_nodes.front()->xattrs;
   if (px) {
+    --num_projected_xattrs;
     xattrs = *px;
     delete px;
   }
 
-  if (projected_nodes.front()->snapnode)
+  if (projected_nodes.front()->snapnode) {
     pop_projected_snaprealm(projected_nodes.front()->snapnode);
+    --num_projected_srnodes;
+  }
 
   delete projected_nodes.front()->inode;
   delete projected_nodes.front();
@@ -393,6 +405,7 @@ sr_t *CInode::project_snaprealm(snapid_t snapid)
   }
   dout(10) << "project_snaprealm " << new_srnode << dendl;
   projected_nodes.back()->snapnode = new_srnode;
+  ++num_projected_srnodes;
   return new_srnode;
 }
 
@@ -430,20 +443,11 @@ void CInode::pop_projected_snaprealm(sr_t *next_snaprealm)
   } else if (next_snaprealm->past_parents.size() !=
 	     snaprealm->srnode.past_parents.size()) {
     invalidate_cached_snaps = true;
+    // re-open past parents
+    snaprealm->_close_parents();
 
-    // update parent pointer
-    assert(snaprealm->open);
-    assert(snaprealm->parent);   // had a parent before
-    SnapRealm *new_parent = get_parent_inode()->find_snaprealm();
-    assert(new_parent);
-    CInode *parenti = new_parent->inode;
-    assert(parenti);
-    assert(parenti->snaprealm);
-    snaprealm->parent = new_parent;
-    snaprealm->add_open_past_parent(new_parent);
     dout(10) << " realm " << *snaprealm << " past_parents " << snaprealm->srnode.past_parents
 	     << " -> " << next_snaprealm->past_parents << dendl;
-    dout(10) << " pinning new parent " << *parenti << dendl;
   }
   snaprealm->srnode = *next_snaprealm;
   delete next_snaprealm;
@@ -464,7 +468,7 @@ void CInode::pop_projected_snaprealm(sr_t *next_snaprealm)
 
 // dirfrags
 
-__u32 CInode::hash_dentry_name(const string &dn)
+__u32 InodeStoreBase::hash_dentry_name(const string &dn)
 {
   int which = inode.dir_layout.dl_dir_hash;
   if (!which)
@@ -472,7 +476,7 @@ __u32 CInode::hash_dentry_name(const string &dn)
   return ceph_str_hash(which, dn.data(), dn.length());
 }
 
-frag_t CInode::pick_dirfrag(const string& dn)
+frag_t InodeStoreBase::pick_dirfrag(const string& dn)
 {
   if (dirfragtree.empty())
     return frag_t();          // avoid the string hash if we can.
@@ -497,7 +501,7 @@ bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls)
 
   fragtree_t tmpdft;
   tmpdft.force_to_leaf(g_ceph_context, fg);
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
     tmpdft.force_to_leaf(g_ceph_context, p->first);
     if (fg.contains(p->first) && !dirfragtree.is_leaf(p->first))
       ls.push_back(p->second);
@@ -517,7 +521,7 @@ bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls)
 void CInode::verify_dirfrags()
 {
   bool bad = false;
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
     if (!dirfragtree.is_leaf(p->first)) {
       dout(0) << "have open dirfrag " << p->first << " but not leaf in " << dirfragtree
 	      << ": " << *p->second << dendl;
@@ -530,7 +534,7 @@ void CInode::verify_dirfrags()
 void CInode::force_dirfrags()
 {
   bool bad = false;
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
     if (!dirfragtree.is_leaf(p->first)) {
       dout(0) << "have open dirfrag " << p->first << " but not leaf in " << dirfragtree
 	      << ": " << *p->second << dendl;
@@ -571,7 +575,7 @@ CDir *CInode::get_approx_dirfrag(frag_t fg)
 void CInode::get_dirfrags(list<CDir*>& ls) 
 {
   // all dirfrags
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p)
     ls.push_back(p->second);
@@ -579,7 +583,7 @@ void CInode::get_dirfrags(list<CDir*>& ls)
 void CInode::get_nested_dirfrags(list<CDir*>& ls) 
 {  
   // dirfrags in same subtree
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p)
     if (!p->second->is_subtree_root())
@@ -588,7 +592,7 @@ void CInode::get_nested_dirfrags(list<CDir*>& ls)
 void CInode::get_subtree_dirfrags(list<CDir*>& ls) 
 { 
   // dirfrags that are roots of new subtrees
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p)
     if (p->second->is_subtree_root())
@@ -660,7 +664,7 @@ void CInode::close_dirfrags()
 
 bool CInode::has_subtree_root_dirfrag(int auth)
 {
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p)
     if (p->second->is_subtree_root() &&
@@ -671,7 +675,7 @@ bool CInode::has_subtree_root_dirfrag(int auth)
 
 bool CInode::has_subtree_or_exporting_dirfrag()
 {
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p)
     if (p->second->is_subtree_root() ||
@@ -684,7 +688,7 @@ void CInode::get_stickydirs()
 {
   if (stickydir_ref == 0) {
     get(PIN_STICKYDIRS);
-    for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+    for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	 p != dirfrags.end();
 	 ++p) {
       p->second->state_set(CDir::STATE_STICKY);
@@ -700,7 +704,7 @@ void CInode::put_stickydirs()
   stickydir_ref--;
   if (stickydir_ref == 0) {
     put(PIN_STICKYDIRS);
-    for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+    for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	 p != dirfrags.end();
 	 ++p) {
       p->second->state_clear(CDir::STATE_STICKY);
@@ -845,9 +849,10 @@ void CInode::name_stray_dentry(string& dname)
 
 version_t CInode::pre_dirty()
 {
-  version_t pv; 
-  if (parent || !projected_parent.empty()) {
-    pv = get_projected_parent_dn()->pre_dirty(get_projected_version());
+  version_t pv;
+  CDentry* _cdentry = get_projected_parent_dn(); 
+  if (_cdentry) {
+    pv = _cdentry->pre_dirty(get_projected_version());
     dout(10) << "pre_dirty " << pv << " (current v " << inode.version << ")" << dendl;
   } else {
     assert(is_base());
@@ -926,7 +931,7 @@ struct C_IO_Inode_Stored : public CInodeIOContext {
   }
 };
 
-object_t InodeStore::get_object_name(inodeno_t ino, frag_t fg, const char *suffix)
+object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, const char *suffix)
 {
   char n[60];
   snprintf(n, sizeof(n), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)fg, suffix ? suffix : "");
@@ -938,6 +943,9 @@ void CInode::store(MDSInternalContextBase *fin)
   dout(10) << "store " << get_version() << dendl;
   assert(is_base());
 
+  if (snaprealm)
+    purge_stale_snap_data(snaprealm->get_snaps());
+
   // encode
   bufferlist bl;
   string magic = CEPH_FS_ONDISK_MAGIC;
@@ -954,7 +962,7 @@ void CInode::store(MDSInternalContextBase *fin)
 
   Context *newfin =
     new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
-		     &mdcache->mds->finisher);
+		     mdcache->mds->finisher);
   mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
 				 ceph_clock_now(g_ceph_context), 0,
 				 NULL, newfin);
@@ -1008,6 +1016,7 @@ struct C_IO_Inode_Fetched : public CInodeIOContext {
   Context *fin;
   C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
   void finish(int r) {
+    // Ignore 'r', because we fetch from two places, so r is usually ENOENT
     in->_fetched(bl, bl2, fin);
   }
 };
@@ -1017,17 +1026,17 @@ void CInode::fetch(MDSInternalContextBase *fin)
   dout(10) << "fetch" << dendl;
 
   C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
-  C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, &mdcache->mds->finisher));
+  C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));
 
   object_t oid = CInode::get_object_name(ino(), frag_t(), "");
   object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
 
+  // Old on-disk format: inode stored in xattr of a dirfrag
   ObjectOperation rd;
   rd.getxattr("inode", &c->bl, NULL);
-
   mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());
 
-  // read from separate object too
+  // Current on-disk format: inode stored in a .inode object
   object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
   mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());
 
@@ -1038,21 +1047,37 @@ void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
 {
   dout(10) << "_fetched got " << bl.length() << " and " << bl2.length() << dendl;
   bufferlist::iterator p;
-  if (bl2.length())
+  if (bl2.length()) {
     p = bl2.begin();
-  else
+  } else if (bl.length()) {
     p = bl.begin();
-  string magic;
-  ::decode(magic, p);
-  dout(10) << " magic is '" << magic << "' (expecting '" << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
-  if (magic != CEPH_FS_ONDISK_MAGIC) {
-    dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
-	    << "'" << dendl;
-    fin->complete(-EINVAL);
   } else {
-    decode_store(p);
-    dout(10) << "_fetched " << *this << dendl;
-    fin->complete(0);
+    derr << "No data while reading inode 0x" << std::hex << ino()
+      << std::dec << dendl;
+    fin->complete(-ENOENT);
+    return;
+  }
+
+  // Attempt decode
+  try {
+    string magic;
+    ::decode(magic, p);
+    dout(10) << " magic is '" << magic << "' (expecting '"
+             << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
+    if (magic != CEPH_FS_ONDISK_MAGIC) {
+      dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
+              << "'" << dendl;
+      fin->complete(-EINVAL);
+    } else {
+      decode_store(p);
+      dout(10) << "_fetched " << *this << dendl;
+      fin->complete(0);
+    }
+  } catch (buffer::error &err) {
+    derr << "Corrupt inode 0x" << std::hex << ino() << std::dec
+      << ": " << err << dendl;
+    fin->complete(-EINVAL);
+    return;
   }
 }
 
@@ -1070,15 +1095,12 @@ void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
     in = diri;
     pdn = in->get_parent_dn();
   }
-  vector<int64_t>::iterator i = inode.old_pools.begin();
-  while(i != inode.old_pools.end()) {
+  for (compact_set<int64_t>::iterator i = inode.old_pools.begin();
+       i != inode.old_pools.end();
+       ++i) {
     // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
-    if (*i == pool) {
-      ++i;
-      continue;
-    }
-    bt.old_pools.insert(*i);
-    ++i;
+    if (*i != pool)
+      bt.old_pools.insert(*i);
   }
 }
 
@@ -1102,29 +1124,35 @@ void CInode::store_backtrace(MDSInternalContextBase *fin, int op_prio)
   auth_pin(this);
 
   int64_t pool;
-  if (is_dir())
+  if (is_dir()) {
     pool = mdcache->mds->mdsmap->get_metadata_pool();
-  else
+  } else {
     pool = inode.layout.fl_pg_pool;
+  }
 
   inode_backtrace_t bt;
   build_backtrace(pool, bt);
-  bufferlist bl;
-  ::encode(bt, bl);
+  bufferlist parent_bl;
+  ::encode(bt, parent_bl);
 
   ObjectOperation op;
   op.priority = op_prio;
   op.create(false);
-  op.setxattr("parent", bl);
+  op.setxattr("parent", parent_bl);
+
+  bufferlist layout_bl;
+  ::encode(inode.layout, layout_bl);
+  op.setxattr("layout", layout_bl);
 
   SnapContext snapc;
   object_t oid = get_object_name(ino(), frag_t(), "");
   object_locator_t oloc(pool);
   Context *fin2 = new C_OnFinisher(
     new C_IO_Inode_StoredBacktrace(this, inode.backtrace_version, fin),
-    &mdcache->mds->finisher);
+    mdcache->mds->finisher);
 
   if (!state_test(STATE_DIRTYPOOL) || inode.old_pools.empty()) {
+    dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
     mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
 				   0, NULL, fin2);
     return;
@@ -1134,22 +1162,25 @@ void CInode::store_backtrace(MDSInternalContextBase *fin, int op_prio)
   mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
 				 0, NULL, gather.new_sub());
 
-  set<int64_t> old_pools;
-  for (vector<int64_t>::iterator p = inode.old_pools.begin();
-      p != inode.old_pools.end();
-      ++p) {
-    if (*p == pool || old_pools.count(*p))
+  // In the case where DIRTYPOOL is set, we update all old pools backtraces
+  // such that anyone reading them will see the new pool ID in
+  // inode_backtrace_t::pool and go read everything else from there.
+  for (compact_set<int64_t>::iterator p = inode.old_pools.begin();
+       p != inode.old_pools.end();
+       ++p) {
+    if (*p == pool)
       continue;
 
+    dout(20) << __func__ << ": updating old pool " << *p << dendl;
+
     ObjectOperation op;
     op.priority = op_prio;
     op.create(false);
-    op.setxattr("parent", bl);
+    op.setxattr("parent", parent_bl);
 
     object_locator_t oloc(*p);
     mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
 				   0, NULL, gather.new_sub());
-    old_pools.insert(*p);
   }
   gather.activate();
 }
@@ -1227,7 +1258,7 @@ void CInode::verify_diri_backtrace(bufferlist &bl, int err)
   }
 
   if (err) {
-    MDS *mds = mdcache->mds;
+    MDSRank *mds = mdcache->mds;
     mds->clog->error() << "bad backtrace on dir ino " << ino() << "\n";
     assert(!"bad backtrace" == (g_conf->mds_verify_backtrace > 1));
 
@@ -1240,33 +1271,38 @@ void CInode::verify_diri_backtrace(bufferlist &bl, int err)
 // parent dir
 
 
-void InodeStore::encode_bare(bufferlist &bl) const
+void InodeStoreBase::encode_bare(bufferlist &bl, const bufferlist *snap_blob) const
 {
   ::encode(inode, bl);
   if (is_symlink())
     ::encode(symlink, bl);
   ::encode(dirfragtree, bl);
   ::encode(xattrs, bl);
-  ::encode(snap_blob, bl);
+  if (snap_blob)
+    ::encode(*snap_blob, bl);
+  else
+    ::encode(bufferlist(), bl);
   ::encode(old_inodes, bl);
   ::encode(oldest_snap, bl);
+  ::encode(damage_flags, bl);
 }
 
-void InodeStore::encode(bufferlist &bl) const
+void InodeStoreBase::encode(bufferlist &bl, const bufferlist *snap_blob) const
 {
-  ENCODE_START(5, 4, bl);
-  encode_bare(bl);
+  ENCODE_START(6, 4, bl);
+  encode_bare(bl, snap_blob);
   ENCODE_FINISH(bl);
 }
 
 void CInode::encode_store(bufferlist& bl)
 {
+  bufferlist snap_blob;
   encode_snap_blob(snap_blob);
-  InodeStore::encode(bl);
-  snap_blob.clear();
+  InodeStoreBase::encode(bl, &snap_blob);
 }
 
-void InodeStore::decode_bare(bufferlist::iterator &bl, __u8 struct_v)
+void InodeStoreBase::decode_bare(bufferlist::iterator &bl,
+			      bufferlist& snap_blob, __u8 struct_v)
 {
   ::decode(inode, bl);
   if (is_symlink())
@@ -1274,6 +1310,7 @@ void InodeStore::decode_bare(bufferlist::iterator &bl, __u8 struct_v)
   ::decode(dirfragtree, bl);
   ::decode(xattrs, bl);
   ::decode(snap_blob, bl);
+
   ::decode(old_inodes, bl);
   if (struct_v == 2 && inode.is_dir()) {
     bool default_layout_exists;
@@ -1283,23 +1320,33 @@ void InodeStore::decode_bare(bufferlist::iterator &bl, __u8 struct_v)
       ::decode(inode.layout, bl); // but we only care about the layout portion
     }
   }
-  if (struct_v >= 5 && !bl.end())
-    ::decode(oldest_snap, bl);
+
+  if (struct_v >= 5) {
+    // InodeStore is embedded in dentries without proper versioning, so
+    // we consume up to the end of the buffer
+    if (!bl.end()) {
+      ::decode(oldest_snap, bl);
+    }
+
+    if (!bl.end()) {
+      ::decode(damage_flags, bl);
+    }
+  }
 }
 
 
-void InodeStore::decode(bufferlist::iterator &bl)
+void InodeStoreBase::decode(bufferlist::iterator &bl, bufferlist& snap_blob)
 {
   DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
-  decode_bare(bl, struct_v);
+  decode_bare(bl, snap_blob, struct_v);
   DECODE_FINISH(bl);
 }
 
 void CInode::decode_store(bufferlist::iterator& bl)
 {
-  InodeStore::decode(bl);
+  bufferlist snap_blob;
+  InodeStoreBase::decode(bl, snap_blob);
   decode_snap_blob(snap_blob);
-  snap_blob.clear();
 }
 
 // ------------------
@@ -1368,7 +1415,6 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
 	::encode(inode.truncate_size, bl);
 	::encode(inode.client_ranges, bl);
 	::encode(inode.inline_data, bl);
-	::encode(inode.inline_version, bl);
       }
     } else {
       // treat flushing as dirty when rejoining cache
@@ -1381,7 +1427,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
       ::encode(inode.dirstat, bl);  // only meaningful if i am auth.
       bufferlist tmp;
       __u32 n = 0;
-      for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+      for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	   p != dirfrags.end();
 	   ++p) {
 	frag_t fg = p->first;
@@ -1416,7 +1462,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
       ::encode(inode.rstat, bl);  // only meaningful if i am auth.
       bufferlist tmp;
       __u32 n = 0;
-      for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+      for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	   p != dirfrags.end();
 	   ++p) {
 	frag_t fg = p->first;
@@ -1452,8 +1498,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
 
   case CEPH_LOCK_IFLOCK:
     ::encode(inode.version, bl);
-    ::encode(fcntl_locks, bl);
-    ::encode(flock_locks, bl);
+    _encode_file_locks(bl);
     break;
 
   case CEPH_LOCK_IPOLICY:
@@ -1537,7 +1582,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 	//  dft was scattered, or we may still be be waiting on the
 	//  notify from the auth)
 	dirfragtree.swap(temp);
-	for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+	for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	     p != dirfrags.end();
 	     ++p) {
 	  if (!dirfragtree.is_leaf(p->first)) {
@@ -1566,7 +1611,6 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 	::decode(inode.truncate_size, p);
 	::decode(inode.client_ranges, p);
 	::decode(inode.inline_data, p);
-	::decode(inode.inline_version, p);
       }
     } else {
       bool replica_dirty;
@@ -1651,7 +1695,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 	snapid_t fgfirst;
 	nest_info_t rstat;
 	nest_info_t accounted_rstat;
-	map<snapid_t,old_rstat_t> dirty_old_rstat;
+	compact_map<snapid_t,old_rstat_t> dirty_old_rstat;
 	::decode(fg, p);
 	::decode(fgfirst, p);
 	::decode(rstat, p);
@@ -1708,8 +1752,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
 
   case CEPH_LOCK_IFLOCK:
     ::decode(inode.version, p);
-    ::decode(fcntl_locks, p);
-    ::decode(flock_locks, p);
+    _decode_file_locks(p);
     break;
 
   case CEPH_LOCK_IPOLICY:
@@ -1774,7 +1817,7 @@ void CInode::start_scatter(ScatterLock *lock)
   assert(is_auth());
   inode_t *pi = get_projected_inode();
 
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p) {
     frag_t fg = p->first;
@@ -1807,7 +1850,7 @@ protected:
   CInode *in;
   CDir *dir;
   MutationRef mut;
-  MDS *get_mds() {return in->mdcache->mds;}
+  MDSRank *get_mds() {return in->mdcache->mds;}
   void finish(int r) {
     in->_finish_frag_update(dir, mut);
   }    
@@ -1918,7 +1961,7 @@ void CInode::finish_scatter_gather_update(int type)
       bool touched_mtime = false;
       dout(20) << "  orig dirstat " << pi->dirstat << dendl;
       pi->dirstat.version++;
-      for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+      for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	   p != dirfrags.end();
 	   ++p) {
 	frag_t fg = p->first;
@@ -2010,7 +2053,7 @@ void CInode::finish_scatter_gather_update(int type)
       inode_t *pi = get_projected_inode();
       dout(20) << "  orig rstat " << pi->rstat << dendl;
       pi->rstat.version++;
-      for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+      for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	   p != dirfrags.end();
 	   ++p) {
 	frag_t fg = p->first;
@@ -2035,7 +2078,7 @@ void CInode::finish_scatter_gather_update(int type)
 	  dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
 	  mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
 					       dir->first, CEPH_NOSNAP, this, true);
-	  for (map<snapid_t,old_rstat_t>::iterator q = dir->dirty_old_rstat.begin();
+	  for (compact_map<snapid_t,old_rstat_t>::iterator q = dir->dirty_old_rstat.begin();
 	       q != dir->dirty_old_rstat.end();
 	       ++q)
 	    mdcache->project_rstat_frag_to_inode(q->second.rstat, q->second.accounted_rstat,
@@ -2097,7 +2140,7 @@ void CInode::finish_scatter_gather_update_accounted(int type, MutationRef& mut,
   dout(10) << "finish_scatter_gather_update_accounted " << type << " on " << *this << dendl;
   assert(is_auth());
 
-  for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p) {
     CDir *dir = p->second;
@@ -2155,7 +2198,7 @@ void CInode::take_dir_waiting(frag_t fg, list<MDSInternalContextBase*>& ls)
   if (waiting_on_dir.empty())
     return;
 
-  map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.find(fg);
+  compact_map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.find(fg);
   if (p != waiting_on_dir.end()) {
     dout(10) << "take_dir_waiting frag " << fg << " on " << *this << dendl;
     ls.splice(ls.end(), p->second);
@@ -2191,7 +2234,7 @@ void CInode::take_waiting(uint64_t mask, list<MDSInternalContextBase*>& ls)
   if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
     // take all dentry waiters
     while (!waiting_on_dir.empty()) {
-      map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.begin();
+      compact_map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.begin();
       dout(10) << "take_waiting dirfrag " << p->first << " on " << *this << dendl;
       ls.splice(ls.end(), p->second);
       waiting_on_dir.erase(p);
@@ -2347,7 +2390,7 @@ void CInode::adjust_nested_auth_pins(int a, void *by)
   if (g_conf->mds_debug_auth_pins) {
     // audit
     int s = 0;
-    for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+    for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	 p != dirfrags.end();
 	 ++p) {
       CDir *dir = p->second;
@@ -2410,7 +2453,8 @@ old_inode_t& CInode::cow_old_inode(snapid_t follows, bool cow_head)
 
   old.inode.trim_client_ranges(follows);
 
-  if (!(old.inode.rstat == old.inode.accounted_rstat))
+  if (g_conf->mds_snap_rstat &&
+      !(old.inode.rstat == old.inode.accounted_rstat))
     dirty_old_rstats.insert(follows);
   
   first = follows+1;
@@ -2424,7 +2468,7 @@ old_inode_t& CInode::cow_old_inode(snapid_t follows, bool cow_head)
 
 void CInode::split_old_inode(snapid_t snap)
 {
-  map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap);
+  compact_map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap);
   assert(p != old_inodes.end() && p->second.first < snap);
 
   old_inode_t &old = old_inodes[snap - 1];
@@ -2449,7 +2493,7 @@ void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
   if (old_inodes.empty())
     return;
 
-  map<snapid_t,old_inode_t>::iterator p = old_inodes.begin();
+  compact_map<snapid_t,old_inode_t>::iterator p = old_inodes.begin();
   while (p != old_inodes.end()) {
     set<snapid_t>::const_iterator q = snaps.lower_bound(p->second.first);
     if (q == snaps.end() || *q > p->first) {
@@ -2465,7 +2509,7 @@ void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
  */
 old_inode_t * CInode::pick_old_inode(snapid_t snap)
 {
-  map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap);  // p is first key >= to snap
+  compact_map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap);  // p is first key >= to snap
   if (p != old_inodes.end() && p->second.first <= snap) {
     dout(10) << "pick_old_inode snap " << snap << " -> [" << p->second.first << "," << p->first << "]" << dendl;
     return &p->second;
@@ -2533,6 +2577,10 @@ void CInode::decode_snap_blob(bufferlist& snapbl)
     open_snaprealm();
     bufferlist::iterator p = snapbl.begin();
     ::decode(snaprealm->srnode, p);
+    if (is_base()) {
+      bool ok = snaprealm->_open_parents(NULL);
+      assert(ok);
+    }
     dout(20) << "decode_snap_blob " << *snaprealm << dendl;
   }
 }
@@ -2542,12 +2590,14 @@ void CInode::encode_snap(bufferlist& bl)
   bufferlist snapbl;
   encode_snap_blob(snapbl);
   ::encode(snapbl, bl);
+  ::encode(oldest_snap, bl);
 }    
 
 void CInode::decode_snap(bufferlist::iterator& p)
 {
   bufferlist snapbl;
   ::decode(snapbl, p);
+  ::decode(oldest_snap, p);
   decode_snap_blob(snapbl);
 }
 
@@ -2715,8 +2765,8 @@ void CInode::remove_client_cap(client_t client)
   mdcache->num_caps--;
 
   //clean up advisory locks
-  bool fcntl_removed = fcntl_locks.remove_all_from(client);
-  bool flock_removed = flock_locks.remove_all_from(client);
+  bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
+  bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false; 
   if (fcntl_removed || flock_removed) {
     list<MDSInternalContextBase*> waiters;
     take_waiting(CInode::WAIT_FLOCK, waiters);
@@ -2837,7 +2887,7 @@ int CInode::get_caps_allowed_for_client(client_t client) const
   } else {
     allowed = get_caps_allowed_by_type(CAP_ANY);
   }
-  if (inode.inline_version != CEPH_INLINE_NONE &&
+  if (inode.inline_data.version != CEPH_INLINE_NONE &&
       !mdcache->mds->get_session(client)->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
     allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
   return allowed;
@@ -2898,7 +2948,7 @@ int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
     //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
   }
   if (is_auth())
-    for (map<int,int>::const_iterator it = mds_caps_wanted.begin();
+    for (compact_map<int,int>::const_iterator it = mds_caps_wanted.begin();
 	 it != mds_caps_wanted.end();
 	 ++it) {
       w |= it->second;
@@ -2973,26 +3023,36 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session,
 
   map<string, bufferptr> *pxattrs = 0;
 
-  if (snapid != CEPH_NOSNAP && is_multiversion()) {
+  if (snapid != CEPH_NOSNAP) {
 
     // for now at least, old_inodes is only defined/valid on the auth
     if (!is_auth())
       valid = false;
 
-    map<snapid_t,old_inode_t>::iterator p = old_inodes.lower_bound(snapid);
-    if (p != old_inodes.end()) {
-      if (p->second.first > snapid) {
-        if  (p != old_inodes.begin())
-          --p;
-        else dout(0) << "old_inode lower_bound starts after snapid!" << dendl;
+    if (is_multiversion()) {
+      compact_map<snapid_t,old_inode_t>::iterator p = old_inodes.lower_bound(snapid);
+      if (p != old_inodes.end()) {
+	if (p->second.first > snapid) {
+	  if  (p != old_inodes.begin())
+	    --p;
+	}
+	if (p->second.first <= snapid && snapid <= p->first) {
+	  dout(15) << "encode_inodestat snapid " << snapid
+		   << " to old_inode [" << p->second.first << "," << p->first << "]"
+		   << " " << p->second.inode.rstat
+		   << dendl;
+	  pi = oi = &p->second.inode;
+	  pxattrs = &p->second.xattrs;
+	} else {
+	  // snapshoted remote dentry can result this
+	  dout(0) << "encode_inodestat old_inode for snapid " << snapid
+		  << " not found" << dendl;
+	}
       }
-      dout(15) << "encode_inodestat snapid " << snapid
-	       << " to old_inode [" << p->second.first << "," << p->first << "]" 
-	       << " " << p->second.inode.rstat
-	       << dendl;
-      assert(p->second.first <= snapid && snapid <= p->first);
-      pi = oi = &p->second.inode;
-      pxattrs = &p->second.xattrs;
+    } else if (snapid < first || snapid > last) {
+      // snapshoted remote dentry can result this
+      dout(0) << "encode_inodestat [" << first << "," << last << "]"
+	      << " not match snapid " << snapid << dendl;
     }
   }
   
@@ -3051,13 +3111,14 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session,
   // inline data
   version_t inline_version = 0;
   bufferlist inline_data;
-  if (i->inline_version == CEPH_INLINE_NONE) {
+  if (i->inline_data.version == CEPH_INLINE_NONE) {
     inline_version = CEPH_INLINE_NONE;
   } else if ((!cap && !no_caps) ||
-	     (cap && cap->client_inline_version < i->inline_version) ||
+	     (cap && cap->client_inline_version < i->inline_data.version) ||
 	     (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
-    inline_version = i->inline_version;
-    inline_data = i->inline_data;
+    inline_version = i->inline_data.version;
+    if (i->inline_data.length() > 0)
+      inline_data = i->inline_data.get_data();
   }
 
   // nest (do same as file... :/)
@@ -3156,7 +3217,7 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session,
       e.cap.wanted = cap->wanted();
       e.cap.cap_id = cap->get_cap_id();
       e.cap.seq = cap->get_last_seq();
-      dout(10) << "encode_inodestat issueing " << ccap_string(issue) << " seq " << cap->get_last_seq() << dendl;
+      dout(10) << "encode_inodestat issuing " << ccap_string(issue) << " seq " << cap->get_last_seq() << dendl;
       e.cap.mseq = cap->get_mseq();
       e.cap.realm = realm->inode->ino();
     } else {
@@ -3202,12 +3263,9 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session,
   // encode
   e.fragtree.nsplits = dirfragtree._splits.size();
   ::encode(e, bl);
-  for (map<frag_t,int32_t>::iterator p = dirfragtree._splits.begin();
-       p != dirfragtree._splits.end();
-       ++p) {
-    ::encode(p->first, bl);
-    ::encode(p->second, bl);
-  }
+
+  dirfragtree.encode_nohead(bl);
+
   ::encode(symlink, bl);
   if (session->connection->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
     i = pfile ? pi : oi;
@@ -3255,9 +3313,10 @@ void CInode::encode_cap_message(MClientCaps *m, Capability *cap)
   i->atime.encode_timeval(&m->head.atime);
   m->head.time_warp_seq = i->time_warp_seq;
 
-  if (cap->client_inline_version < i->inline_version) {
-    m->inline_version = cap->client_inline_version = i->inline_version;
-    m->inline_data = i->inline_data;
+  if (cap->client_inline_version < i->inline_data.version) {
+    m->inline_version = cap->client_inline_version = i->inline_data.version;
+    if (i->inline_data.length() > 0)
+      m->inline_data = i->inline_data.get_data();
   } else {
     m->inline_version = 0;
   }
@@ -3296,6 +3355,7 @@ void CInode::_encode_base(bufferlist& bl)
   ::encode(dirfragtree, bl);
   ::encode(xattrs, bl);
   ::encode(old_inodes, bl);
+  ::encode(damage_flags, bl);
   encode_snap(bl);
 }
 void CInode::_decode_base(bufferlist::iterator& p)
@@ -3306,6 +3366,7 @@ void CInode::_decode_base(bufferlist::iterator& p)
   ::decode(dirfragtree, p);
   ::decode(xattrs, p);
   ::decode(old_inodes, p);
+  ::decode(damage_flags, p);
   decode_snap(p);
 }
 
@@ -3414,7 +3475,7 @@ void CInode::encode_export(bufferlist& bl)
   // include scatterlock info for any bounding CDirs
   bufferlist bounding;
   if (inode.is_dir())
-    for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
+    for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
 	 p != dirfrags.end();
 	 ++p) {
       CDir *dir = p->second;
@@ -3431,8 +3492,8 @@ void CInode::encode_export(bufferlist& bl)
 
   _encode_locks_full(bl);
 
-  ::encode(fcntl_locks, bl);
-  ::encode(flock_locks, bl);
+  _encode_file_locks(bl);
+
   ENCODE_FINISH(bl);
 
   get(PIN_TEMPEXPORTING);
@@ -3523,36 +3584,32 @@ void CInode::decode_import(bufferlist::iterator& p,
 
   _decode_locks_full(p);
 
-  if (struct_v >= 5) {
-    ::decode(fcntl_locks, p);
-    ::decode(flock_locks, p);
-  }
+  _decode_file_locks(p);
 
   DECODE_FINISH(p);
 }
 
 
-void InodeStore::dump(Formatter *f) const
+void InodeStoreBase::dump(Formatter *f) const
 {
-  f->open_object_section("inode_store");
-  {
-    inode.dump(f);
-    f->dump_string("symlink", symlink);
-    // FIXME: dirfragtree: dump methods for fragtree_t
-    // FIXME: xattrs: JSON-safe versions of binary xattrs
-    f->open_array_section("old_inodes");
-    for (std::map<snapid_t, old_inode_t>::const_iterator i = old_inodes.begin(); i != old_inodes.end(); ++i) {
-      f->open_object_section("old_inode");
-      {
-        // The key is the last snapid, the first is in the old_inode_t
-        f->dump_int("last", i->first);
-        i->second.dump(f);
-      }
-      f->close_section();  // old_inode
+  inode.dump(f);
+  f->dump_string("symlink", symlink);
+  f->open_array_section("old_inodes");
+  for (compact_map<snapid_t, old_inode_t>::const_iterator i = old_inodes.begin();
+      i != old_inodes.end(); ++i) {
+    f->open_object_section("old_inode");
+    {
+      // The key is the last snapid, the first is in the old_inode_t
+      f->dump_int("last", i->first);
+      i->second.dump(f);
     }
-    f->close_section();  // old_inodes
+    f->close_section();  // old_inode
   }
-  f->close_section();  // inode_store
+  f->close_section();  // old_inodes
+
+  f->open_object_section("dirfragtree");
+  dirfragtree.dump(f);
+  f->close_section(); // dirfragtree
 }
 
 
@@ -3614,7 +3671,7 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       results->passed_validation = false; // we haven't finished it yet
 
       C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
-                                            &in->mdcache->mds->finisher);
+                                            in->mdcache->mds->finisher);
 
       in->fetch_backtrace(conf, &bl);
       return false;
@@ -3746,9 +3803,9 @@ void CInode::validate_disk_state(CInode::validated_data *results,
 
       // check each dirfrag...
       nest_info_t& sub_info = results->raw_rstats.ondisk_value;
-      for (map<frag_t,CDir*>::iterator p = in->dirfrags.begin();
-          p != in->dirfrags.end();
-          ++p) {
+      for (compact_map<frag_t,CDir*>::iterator p = in->dirfrags.begin();
+	   p != in->dirfrags.end();
+	   ++p) {
         if (!p->second->is_complete()) {
           results->raw_rstats.error_str << "dirfrag is INCOMPLETE despite fetching; probably too large compared to MDS cache size?\n";
           return true;
@@ -3791,7 +3848,7 @@ void CInode::validated_data::dump(Formatter *f) const
       f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
       f->dump_stream("ondisk_value") << backtrace.ondisk_value;
       f->dump_stream("memoryvalue") << backtrace.memory_value;
-      f->dump_stream("error_str") << backtrace.error_str;
+      f->dump_string("error_str", backtrace.error_str.str());
     }
     f->close_section(); // backtrace
     f->open_object_section("raw_rstats");
@@ -3801,7 +3858,7 @@ void CInode::validated_data::dump(Formatter *f) const
       f->dump_int("read_ret_val", raw_rstats.ondisk_read_retval);
       f->dump_stream("ondisk_value") << raw_rstats.ondisk_value;
       f->dump_stream("memory_value") << raw_rstats.memory_value;
-      f->dump_stream("error_str") << raw_rstats.error_str;
+      f->dump_string("error_str", raw_rstats.error_str.str());
     }
     f->close_section(); // raw_rstats
     // dump failure return code
@@ -3816,3 +3873,109 @@ void CInode::validated_data::dump(Formatter *f) const
   }
   f->close_section(); // results
 }
+
+void CInode::dump(Formatter *f) const
+{
+  InodeStoreBase::dump(f);
+
+  MDSCacheObject::dump(f);
+
+  f->open_object_section("versionlock");
+  versionlock.dump(f);
+  f->close_section();
+
+  f->open_object_section("authlock");
+  authlock.dump(f);
+  f->close_section();
+
+  f->open_object_section("linklock");
+  linklock.dump(f);
+  f->close_section();
+
+  f->open_object_section("dirfragtreelock");
+  dirfragtreelock.dump(f);
+  f->close_section();
+
+  f->open_object_section("filelock");
+  filelock.dump(f);
+  f->close_section();
+
+  f->open_object_section("xattrlock");
+  xattrlock.dump(f);
+  f->close_section();
+
+  f->open_object_section("snaplock");
+  snaplock.dump(f);
+  f->close_section();
+
+  f->open_object_section("nestlock");
+  nestlock.dump(f);
+  f->close_section();
+
+  f->open_object_section("flocklock");
+  flocklock.dump(f);
+  f->close_section();
+
+  f->open_object_section("policylock");
+  policylock.dump(f);
+  f->close_section();
+
+  f->open_array_section("states");
+  MDSCacheObject::dump_states(f);
+  if (state_test(STATE_EXPORTING))
+    f->dump_string("state", "exporting");
+  if (state_test(STATE_OPENINGDIR))
+    f->dump_string("state", "openingdir");
+  if (state_test(STATE_FREEZING))
+    f->dump_string("state", "freezing");
+  if (state_test(STATE_FROZEN))
+    f->dump_string("state", "frozen");
+  if (state_test(STATE_AMBIGUOUSAUTH))
+    f->dump_string("state", "ambiguousauth");
+  if (state_test(STATE_EXPORTINGCAPS))
+    f->dump_string("state", "exportingcaps");
+  if (state_test(STATE_NEEDSRECOVER))
+    f->dump_string("state", "needsrecover");
+  if (state_test(STATE_PURGING))
+    f->dump_string("state", "purging");
+  if (state_test(STATE_DIRTYPARENT))
+    f->dump_string("state", "dirtyparent");
+  if (state_test(STATE_DIRTYRSTAT))
+    f->dump_string("state", "dirtyrstat");
+  if (state_test(STATE_STRAYPINNED))
+    f->dump_string("state", "straypinned");
+  if (state_test(STATE_FROZENAUTHPIN))
+    f->dump_string("state", "frozenauthpin");
+  if (state_test(STATE_DIRTYPOOL))
+    f->dump_string("state", "dirtypool");
+  if (state_test(STATE_ORPHAN))
+    f->dump_string("state", "orphan");
+  f->close_section();
+
+  f->open_array_section("client_caps");
+  for (map<client_t,Capability*>::const_iterator it = client_caps.begin();
+       it != client_caps.end(); ++it) {
+    f->open_object_section("client_cap");
+    f->dump_int("client_id", it->first.v);
+    f->dump_string("pending", ccap_string(it->second->pending()));
+    f->dump_string("issued", ccap_string(it->second->issued()));
+    f->dump_string("wanted", ccap_string(it->second->wanted()));
+    f->dump_string("last_sent", ccap_string(it->second->get_last_sent()));
+    f->close_section();
+  }
+  f->close_section();
+
+  f->dump_int("loner", loner_cap.v);
+  f->dump_int("want_loner", want_loner_cap.v);
+
+  f->open_array_section("mds_caps_wanted");
+  for (compact_map<int,int>::const_iterator p = mds_caps_wanted.begin();
+       p != mds_caps_wanted.end(); ++p) {
+    f->open_object_section("mds_cap_wanted");
+    f->dump_int("rank", p->first);
+    f->dump_string("cap", ccap_string(p->second));
+    f->close_section();
+  }
+  f->close_section();
+}
+
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 5a88097..a7abba4 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -21,6 +21,7 @@
 #include "include/elist.h"
 #include "include/types.h"
 #include "include/lru.h"
+#include "include/compact_set.h"
 
 #include "mdstypes.h"
 #include "flock.h"
@@ -35,7 +36,6 @@
 #include <list>
 #include <set>
 #include <map>
-//#include <iostream>
 
 class Context;
 class CDentry;
@@ -70,18 +70,17 @@ extern int num_cinode_locks;
  * handle CInodes from the backing store without hitting all
  * the business logic in CInode proper.
  */
-class InodeStore {
+class InodeStoreBase {
 public:
   inode_t                    inode;        // the inode itself
   std::string                symlink;      // symlink dest, if symlink
   std::map<std::string, bufferptr> xattrs;
   fragtree_t                 dirfragtree;  // dir frag tree, if any.  always consistent with our dirfrag map.
-  std::map<snapid_t, old_inode_t> old_inodes;   // key = last, value.first = first
-  bufferlist		     snap_blob;    // Encoded copy of SnapRealm, because we can't
-                                           // rehydrate it without full MDCache
+  compact_map<snapid_t, old_inode_t> old_inodes;   // key = last, value.first = first
   snapid_t                  oldest_snap;
+  damage_flags_t            damage_flags;
 
-  InodeStore() : oldest_snap(CEPH_NOSNAP) { }
+  InodeStoreBase() : oldest_snap(CEPH_NOSNAP), damage_flags(0) { }
 
   /* Helpers */
   bool is_file() const    { return inode.is_file(); }
@@ -90,20 +89,43 @@ public:
   static object_t get_object_name(inodeno_t ino, frag_t fg, const char *suffix);
 
   /* Full serialization for use in ".inode" root inode objects */
-  void encode(bufferlist &bl) const;
-  void decode(bufferlist::iterator &bl);
+  void encode(bufferlist &bl, const bufferlist *snap_blob=NULL) const;
+  void decode(bufferlist::iterator &bl, bufferlist& snap_blob);
 
   /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
-  void encode_bare(bufferlist &bl) const;
-  void decode_bare(bufferlist::iterator &bl, __u8 struct_v=5);
+  void encode_bare(bufferlist &bl, const bufferlist *snap_blob=NULL) const;
+  void decode_bare(bufferlist::iterator &bl, bufferlist &snap_blob, __u8 struct_v=5);
 
-  /* For use in debug and ceph-dencoder */
+  /* For test/debug output */
   void dump(Formatter *f) const;
+
+  /* For use by offline tools */
+  __u32 hash_dentry_name(const std::string &dn);
+  frag_t pick_dirfrag(const std::string &dn);
+};
+
+class InodeStore : public InodeStoreBase {
+public:
+  bufferlist snap_blob;  // Encoded copy of SnapRealm, because we can't
+			 // rehydrate it without full MDCache
+  void encode(bufferlist &bl) const {
+    InodeStoreBase::encode(bl, &snap_blob);
+  }
+  void decode(bufferlist::iterator &bl) {
+    InodeStoreBase::decode(bl, snap_blob);
+  }
+  void encode_bare(bufferlist &bl) const {
+    InodeStoreBase::encode_bare(bl, &snap_blob);
+  }
+  void decode_bare(bufferlist::iterator &bl) {
+    InodeStoreBase::decode_bare(bl, snap_blob);
+  }
+
   static void generate_test_instances(std::list<InodeStore*>& ls);
 };
 
 // cached inode wrapper
-class CInode : public MDSCacheObject, public InodeStore {
+class CInode : public MDSCacheObject, public InodeStoreBase {
   /*
    * This class uses a boost::pool to handle allocation. This is *not*
    * thread-safe, so don't do allocations from multiple threads!
@@ -218,7 +240,7 @@ public:
   SnapRealm        *snaprealm;
   SnapRealm        *containing_realm;
   snapid_t          first, last;
-  std::set<snapid_t> dirty_old_rstats;
+  compact_set<snapid_t> dirty_old_rstats;
 
   bool is_multiversion() const {
     return snaprealm ||  // other snaprealms will link to me
@@ -271,6 +293,8 @@ public:
       : inode(in), xattrs(xp), snapnode(sn) {}
   };
   std::list<projected_inode_t*> projected_nodes;   // projected values (only defined while dirty)
+  int num_projected_xattrs;
+  int num_projected_srnodes;
   
   inode_t *project_inode(std::map<std::string,bufferptr> *px=0);
   void pop_and_dirty_projected_inode(LogSegment *ls);
@@ -315,11 +339,13 @@ public:
   }
 
   std::map<std::string,bufferptr> *get_projected_xattrs() {
-    for (std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
-	 p != projected_nodes.rend();
-	 ++p)
-      if ((*p)->xattrs)
-	return (*p)->xattrs;
+    if (num_projected_xattrs > 0) {
+      for (std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
+	   p != projected_nodes.rend();
+	   ++p)
+	if ((*p)->xattrs)
+	  return (*p)->xattrs;
+    }
     return &xattrs;
   }
   std::map<std::string,bufferptr> *get_previous_projected_xattrs() {
@@ -334,34 +360,30 @@ public:
 
   sr_t *project_snaprealm(snapid_t snapid=0);
   const sr_t *get_projected_srnode() const {
-    if (projected_nodes.empty()) {
-      if (snaprealm)
-	return &snaprealm->srnode;
-      else
-	return NULL;
-    } else {
+    if (num_projected_srnodes > 0) {
       for (std::list<projected_inode_t*>::const_reverse_iterator p = projected_nodes.rbegin();
-          p != projected_nodes.rend();
-          ++p)
-        if ((*p)->snapnode)
-          return (*p)->snapnode;
+	  p != projected_nodes.rend();
+	  ++p)
+	if ((*p)->snapnode)
+	  return (*p)->snapnode;
     }
-    return &snaprealm->srnode;
+    if (snaprealm)
+      return &snaprealm->srnode;
+    else
+      return NULL;
   }
   sr_t *get_projected_srnode() {
-    if (projected_nodes.empty()) {
-      if (snaprealm)
-	return &snaprealm->srnode;
-      else
-	return NULL;
-    } else {
+    if (num_projected_srnodes > 0) {
       for (std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
-          p != projected_nodes.rend();
-          ++p)
-        if ((*p)->snapnode)
-          return (*p)->snapnode;
+	   p != projected_nodes.rend();
+	   ++p)
+	if ((*p)->snapnode)
+	  return (*p)->snapnode;
     }
-    return &snaprealm->srnode;
+    if (snaprealm)
+      return &snaprealm->srnode;
+    else
+      return NULL;
   }
   void project_past_snaprealm_parent(SnapRealm *newparent);
 
@@ -377,12 +399,10 @@ public:
 
   // -- cache infrastructure --
 private:
-  std::map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
+  compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
   int stickydir_ref;
 
 public:
-  __u32 hash_dentry_name(const std::string &dn);
-  frag_t pick_dirfrag(const std::string &dn);
   bool has_dirfrags() { return !dirfrags.empty(); }
   CDir* get_dirfrag(frag_t fg) {
     if (dirfrags.count(fg)) {
@@ -412,7 +432,7 @@ public:
  protected:
   // parent dentries in cache
   CDentry         *parent;             // primary link
-  std::set<CDentry*>    remote_parents;     // if hard linked
+  compact_set<CDentry*>    remote_parents;     // if hard linked
 
   std::list<CDentry*>   projected_parent;   // for in-progress rename, (un)link, etc.
 
@@ -422,12 +442,12 @@ public:
 protected:
   // file capabilities
   std::map<client_t, Capability*> client_caps;         // client -> caps
-  std::map<int32_t, int32_t>      mds_caps_wanted;     // [auth] mds -> caps wanted
+  compact_map<int32_t, int32_t>      mds_caps_wanted;     // [auth] mds -> caps wanted
   int                   replica_caps_wanted; // [replica] what i've requested from auth
 
-  std::map<int, std::set<client_t> > client_snap_caps;     // [auth] [snap] dirty metadata we still need from the head
+  compact_map<int, std::set<client_t> > client_snap_caps;     // [auth] [snap] dirty metadata we still need from the head
 public:
-  std::map<snapid_t, std::set<client_t> > client_need_snapflush;
+  compact_map<snapid_t, std::set<client_t> > client_need_snapflush;
 
   void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
   void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
@@ -435,12 +455,54 @@ public:
 
 protected:
 
-  ceph_lock_state_t fcntl_locks;
-  ceph_lock_state_t flock_locks;
+  ceph_lock_state_t *fcntl_locks;
+  ceph_lock_state_t *flock_locks;
 
+  ceph_lock_state_t *get_fcntl_lock_state() {
+    if (!fcntl_locks)
+      fcntl_locks = new ceph_lock_state_t(g_ceph_context);
+    return fcntl_locks;
+  }
+  void clear_fcntl_lock_state() {
+    delete fcntl_locks;
+    fcntl_locks = NULL;
+  }
+  ceph_lock_state_t *get_flock_lock_state() {
+    if (!flock_locks)
+      flock_locks = new ceph_lock_state_t(g_ceph_context);
+    return flock_locks;
+  }
+  void clear_flock_lock_state() {
+    delete flock_locks;
+    flock_locks = NULL;
+  }
   void clear_file_locks() {
-    fcntl_locks.clear();
-    flock_locks.clear();
+    clear_fcntl_lock_state();
+    clear_flock_lock_state();
+  }
+  void _encode_file_locks(bufferlist& bl) const {
+    bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty();
+    ::encode(has_fcntl_locks, bl);
+    if (has_fcntl_locks)
+      ::encode(*fcntl_locks, bl);
+    bool has_flock_locks = flock_locks && !flock_locks->empty();
+    ::encode(has_flock_locks, bl);
+    if (has_flock_locks)
+      ::encode(*flock_locks, bl);
+  }
+  void _decode_file_locks(bufferlist::iterator& p) {
+    bool has_fcntl_locks;
+    ::decode(has_fcntl_locks, p);
+    if (has_fcntl_locks)
+      ::decode(*get_fcntl_lock_state(), p);
+    else
+      clear_fcntl_lock_state();
+    bool has_flock_locks;
+    ::decode(has_flock_locks, p);
+    if (has_flock_locks)
+      ::decode(*get_flock_lock_state(), p);
+    else
+      clear_flock_lock_state();
   }
 
   // LogSegment lists i (may) belong to
@@ -453,14 +515,7 @@ public:
   elist<CInode*>::item item_dirty_dirfrag_nest;
   elist<CInode*>::item item_dirty_dirfrag_dirfragtree;
 
-private:
-  // auth pin
-  int auth_pins;
-  int nested_auth_pins;
 public:
-#ifdef MDS_AUTHPIN_SET
-  multiset<void*> auth_pin_set;
-#endif
   int auth_pin_freeze_allowance;
 
   inode_load_vec_t pop;
@@ -470,6 +525,7 @@ public:
   friend class Locker;
   friend class Migrator;
   friend class MDCache;
+  friend class StrayManager;
   friend class CDir;
   friend class CInodeExport;
 
@@ -480,16 +536,17 @@ public:
     first(f), last(l),
     last_journaled(0), //last_open_journaled(0), 
     //hack_accessed(true),
+    num_projected_xattrs(0),
+    num_projected_srnodes(0),
     stickydir_ref(0),
     parent(0),
     inode_auth(CDIR_AUTH_DEFAULT),
     replica_caps_wanted(0),
-    fcntl_locks(g_ceph_context), flock_locks(g_ceph_context),
+    fcntl_locks(0), flock_locks(0),
     item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this),
     item_dirty_dirfrag_dir(this), 
     item_dirty_dirfrag_nest(this), 
     item_dirty_dirfrag_dirfragtree(this), 
-    auth_pins(0), nested_auth_pins(0),
     auth_pin_freeze_allowance(0),
     pop(ceph_clock_now(g_ceph_context)),
     versionlock(this, &versionlock_type),
@@ -514,12 +571,18 @@ public:
     g_num_inos++;
     close_dirfrags();
     close_snaprealm();
+    clear_file_locks();
+    assert(num_projected_xattrs == 0);
+    assert(num_projected_srnodes == 0);
   }
   
 
   // -- accessors --
   bool is_root() const { return inode.ino == MDS_INO_ROOT; }
   bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); }
+  mds_rank_t get_stray_owner() const {
+    return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino);
+  }
   bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); }
   bool is_base() const { return is_root() || is_mdsdir(); }
   bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; }
@@ -624,7 +687,7 @@ public:
 
   // -- waiting --
 protected:
-  std::map<frag_t, std::list<MDSInternalContextBase*> > waiting_on_dir;
+  compact_map<frag_t, std::list<MDSInternalContextBase*> > waiting_on_dir;
 public:
   void add_dir_waiter(frag_t fg, MDSInternalContextBase *c);
   void take_dir_waiting(frag_t fg, std::list<MDSInternalContextBase*>& ls);
@@ -776,8 +839,8 @@ public:
   bool is_any_caps() { return !client_caps.empty(); }
   bool is_any_nonstale_caps() { return count_nonstale_caps(); }
 
-  const std::map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
-  std::map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
+  const compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
+  compact_map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
 
   const std::map<client_t,Capability*>& get_client_caps() const { return client_caps; }
   Capability *get_client_cap(client_t client) {
@@ -818,15 +881,10 @@ public:
   bool issued_caps_need_gather(SimpleLock *lock);
   void replicate_relax_locks();
 
-
   // -- authority --
   mds_authority_t authority() const;
 
-
   // -- auth pins --
-  bool is_auth_pinned() const { return auth_pins || nested_auth_pins; }
-  int get_num_auth_pins() const { return auth_pins; }
-  int get_num_nested_auth_pins() const { return nested_auth_pins; }
   void adjust_nested_auth_pins(int a, void *by);
   bool can_auth_pin() const;
   void auth_pin(void *by);
@@ -902,6 +960,7 @@ public:
   }
 
   void print(ostream& out);
+  void dump(Formatter *f) const;
 
   /**
    * @defgroup Scrubbing and fsck
@@ -952,8 +1011,8 @@ public:
    *
    * @param results A freshly-created validated_data struct, with values set
    * as described in the struct documentation.
-   * @param Context The callback to activate once the validation has
-   * been completed.
+   * @param mdr The request to be responeded upon the completion of the
+   * validation.
    */
   void validate_disk_state(validated_data *results,
                            MDRequestRef& mdr);
diff --git a/src/mds/InoTable.cc b/src/mds/InoTable.cc
index a83c5ff..fd26a11 100644
--- a/src/mds/InoTable.cc
+++ b/src/mds/InoTable.cc
@@ -13,7 +13,7 @@
  */
 
 #include "InoTable.h"
-#include "MDS.h"
+#include "MDSRank.h"
 
 #include "include/types.h"
 
diff --git a/src/mds/InoTable.h b/src/mds/InoTable.h
index e07a6b7..e63f3f6 100644
--- a/src/mds/InoTable.h
+++ b/src/mds/InoTable.h
@@ -19,14 +19,14 @@
 #include "MDSTable.h"
 #include "include/interval_set.h"
 
-class MDS;
+class MDSRank;
 
 class InoTable : public MDSTable {
   interval_set<inodeno_t> free;   // unused ids
   interval_set<inodeno_t> projected_free;
 
  public:
-  InoTable(MDS *m) : MDSTable(m, "inotable", true) { }
+  InoTable(MDSRank *m) : MDSTable(m, "inotable", true) { }
 
   inodeno_t project_alloc_id(inodeno_t id=0);
   void apply_alloc_id(inodeno_t id);
diff --git a/src/mds/JournalPointer.cc b/src/mds/JournalPointer.cc
index 0ceb758..68e8aa4 100644
--- a/src/mds/JournalPointer.cc
+++ b/src/mds/JournalPointer.cc
@@ -56,7 +56,11 @@ int JournalPointer::load(Objecter *objecter)
   // Construct JournalPointer result, null or decoded data
   if (r == 0) {
     bufferlist::iterator q = data.begin();
-    decode(q);
+    try {
+      decode(q);
+    } catch (const buffer::error &e) {
+      return -EINVAL;
+    }
   } else {
     dout(1) << "Journal pointer '" << object_id << "' read failed: " << cpp_strerror(r) << dendl;
   }
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 0284486..fb548a5 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -13,7 +13,7 @@
  */
 
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDCache.h"
 #include "Locker.h"
 #include "CInode.h"
@@ -25,27 +25,15 @@
 #include "MDLog.h"
 #include "MDSMap.h"
 
-#include "include/filepath.h"
-
 #include "events/EUpdate.h"
 #include "events/EOpen.h"
 
 #include "msg/Messenger.h"
 #include "osdc/Objecter.h"
 
-#include "messages/MGenericMessage.h"
-#include "messages/MDiscover.h"
-#include "messages/MDiscoverReply.h"
-
-#include "messages/MDirUpdate.h"
-
 #include "messages/MInodeFileCaps.h"
-
 #include "messages/MLock.h"
 #include "messages/MClientLease.h"
-#include "messages/MDentryUnlink.h"
-
-#include "messages/MClientRequest.h"
 #include "messages/MClientReply.h"
 #include "messages/MClientCaps.h"
 #include "messages/MClientCapRelease.h"
@@ -62,7 +50,7 @@
 #undef DOUT_COND
 #define DOUT_COND(cct, l) l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_locker
 #define dout_prefix _prefix(_dout, mds)
-static ostream& _prefix(std::ostream *_dout, MDS *mds) {
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
   return *_dout << "mds." << mds->get_nodeid() << ".locker ";
 }
 
@@ -70,7 +58,7 @@ static ostream& _prefix(std::ostream *_dout, MDS *mds) {
 class LockerContext : public MDSInternalContextBase {
   protected:
   Locker *locker;
-  MDS *get_mds()
+  MDSRank *get_mds()
   {
     return locker->mds;
   }
@@ -129,8 +117,8 @@ void Locker::tick()
 
 void Locker::send_lock_message(SimpleLock *lock, int msg)
 {
-  for (map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
-       it != lock->get_parent()->replicas_end(); 
+  for (compact_map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
+       it != lock->get_parent()->replicas_end();
        ++it) {
     if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) 
       continue;
@@ -141,8 +129,8 @@ void Locker::send_lock_message(SimpleLock *lock, int msg)
 
 void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data)
 {
-  for (map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
-       it != lock->get_parent()->replicas_end(); 
+  for (compact_map<mds_rank_t,unsigned>::iterator it = lock->get_parent()->replicas_begin();
+       it != lock->get_parent()->replicas_end();
        ++it) {
     if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) 
       continue;
@@ -1173,7 +1161,7 @@ bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
 	CInode *in = static_cast<CInode*>(lock->get_parent());
 	if (lock->get_state() == LOCK_EXCL &&
 	    in->get_target_loner() >= 0 &&
-	    !as_anon)   // as_anon => caller wants SYNC, not XSYN
+	    !in->is_dir() && !as_anon)   // as_anon => caller wants SYNC, not XSYN
 	  file_xsyn(lock);
 	else
 	  simple_sync(lock);
@@ -1531,8 +1519,9 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequestRef& mut)
 	}
       }
 
-      if (!lock->is_stable() && !(lock->get_state() == LOCK_XLOCKDONE &&
-				  lock->get_xlock_by_client() == client))
+      if (!lock->is_stable() && (lock->get_state() != LOCK_XLOCKDONE ||
+				 lock->get_xlock_by_client() != client ||
+				 lock->is_waiter_for(SimpleLock::WAIT_STABLE)))
 	break;
 
       if (lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_XLOCKDONE) {
@@ -1726,8 +1715,18 @@ void Locker::file_update_finish(CInode *in, MutationRef& mut, bool share, client
 
   mut->apply();
   
-  if (ack)
-    mds->send_message_client_counted(ack, client);
+  if (ack) {
+    Session *session = mds->get_session(client);
+    if (session) {
+      // "oldest flush tid" > 0 means client uses unique TID for each flush
+      if (ack->get_oldest_flush_tid() > 0)
+	session->add_completed_flush(ack->get_client_tid());
+      mds->send_message_client_counted(ack, session);
+    } else {
+      dout(10) << " no session for client." << client << " " << *ack << dendl;
+      ack->put();
+    }
+  }
 
   set<CInode*> need_issue;
   drop_locks(mut.get(), &need_issue);
@@ -1736,7 +1735,7 @@ void Locker::file_update_finish(CInode *in, MutationRef& mut, bool share, client
     dout(10) << " client_snap_caps " << in->client_snap_caps << dendl;
     // check for snap writeback completion
     bool gather = false;
-    map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
+    compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
     while (p != in->client_snap_caps.end()) {
       SimpleLock *lock = in->get_lock(p->first);
       assert(lock);
@@ -1888,7 +1887,7 @@ bool Locker::issue_caps(CInode *in, Capability *only_cap)
     allowed |= xlocker_allowed & in->get_xlocker_mask(it->first);
 
     Session *session = mds->get_session(it->first);
-    if (in->inode.inline_version != CEPH_INLINE_NONE &&
+    if (in->inode.inline_data.version != CEPH_INLINE_NONE &&
 	!(session && session->connection &&
 	  session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)))
       allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
@@ -2394,18 +2393,15 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
 
 
 
-void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t follows)
+void Locker::_do_null_snapflush(CInode *head_in, client_t client)
 {
-  dout(10) << "_do_null_snapflish client." << client << " follows " << follows << " on " << *head_in << dendl;
-  map<snapid_t, set<client_t> >::iterator p = head_in->client_need_snapflush.begin();
+  dout(10) << "_do_null_snapflush client." << client << " on " << *head_in << dendl;
+  compact_map<snapid_t, set<client_t> >::iterator p = head_in->client_need_snapflush.begin();
   while (p != head_in->client_need_snapflush.end()) {
     snapid_t snapid = p->first;
     set<client_t>& clients = p->second;
     ++p;  // be careful, q loop below depends on this
 
-    // snapid is the snap inode's ->last
-    if (follows > snapid)
-      break;
     if (clients.count(client)) {
       dout(10) << " doing async NULL snapflush on " << snapid << " from client." << client << dendl;
       CInode *sin = mdcache->get_inode(head_in->ino(), snapid);
@@ -2413,7 +2409,7 @@ void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t follo
 	// hrm, look forward until we find the inode. 
 	//  (we can only look it up by the last snapid it is valid for)
 	dout(10) << " didn't have " << head_in->ino() << " snapid " << snapid << dendl;
-	for (map<snapid_t, set<client_t> >::iterator q = p;  // p is already at next entry
+	for (compact_map<snapid_t, set<client_t> >::iterator q = p;  // p is already at next entry
 	     q != head_in->client_need_snapflush.end();
 	     ++q) {
 	  dout(10) << " trying snapid " << q->first << dendl;
@@ -2458,11 +2454,12 @@ bool Locker::should_defer_client_cap_frozen(CInode *in)
  */
 void Locker::handle_client_caps(MClientCaps *m)
 {
+  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
   client_t client = m->get_source().num();
 
   snapid_t follows = m->get_snap_follows();
   dout(7) << "handle_client_caps on " << m->get_ino()
-	  << " follows " << follows 
+	  << " tid " << m->get_client_tid() << " follows " << follows
 	  << " op " << ceph_cap_op_name(m->get_op()) << dendl;
 
   if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
@@ -2470,6 +2467,49 @@ void Locker::handle_client_caps(MClientCaps *m)
     return;
   }
 
+  if (m->get_client_tid() > 0 &&
+      session->have_completed_flush(m->get_client_tid())) {
+    dout(7) << "handle_client_caps already flushed tid " << m->get_client_tid()
+	    << " for client." << client << dendl;
+    MClientCaps *ack;
+    if (m->get_op() == CEPH_CAP_OP_FLUSHSNAP) {
+      ack = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0,
+			    m->get_dirty(), 0, mds->get_osd_epoch_barrier());
+    } else {
+      ack = new MClientCaps(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(),
+			    m->get_seq(), m->get_caps(), 0, m->get_dirty(), 0,
+			    mds->get_osd_epoch_barrier());
+    }
+    ack->set_snap_follows(follows);
+    ack->set_client_tid(m->get_client_tid());
+    mds->send_message_client_counted(ack, m->get_connection());
+    m->put();
+    return;
+  }
+
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (m->get_oldest_flush_tid() > 0) {
+    if (session->trim_completed_flushes(m->get_oldest_flush_tid())) {
+      mds->mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+      if (session->get_num_trim_flushes_warnings() > 0 &&
+	  session->get_num_completed_flushes() * 2 < g_conf->mds_max_completed_flushes)
+	session->reset_num_trim_flushes_warnings();
+    } else {
+      if (session->get_num_completed_flushes() >=
+	  (g_conf->mds_max_completed_flushes << session->get_num_trim_flushes_warnings())) {
+	session->inc_num_trim_flushes_warnings();
+	stringstream ss;
+	ss << "client." << session->get_client() << " does not advance its oldest_flush_tid ("
+	   << m->get_oldest_flush_tid() << "), "
+	   << session->get_num_completed_flushes()
+	   << " completed flushes recorded in session\n";
+	mds->clog->warn() << ss.str();
+	dout(20) << __func__ << " " << ss.str() << dendl;
+      }
+    }
+  }
+
   CInode *head_in = mdcache->get_inode(m->get_ino());
   if (!head_in) {
     dout(7) << "handle_client_caps on unknown ino " << m->get_ino() << ", dropping" << dendl;
@@ -2487,9 +2527,12 @@ void Locker::handle_client_caps(MClientCaps *m)
     mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
   }
 
-  CInode *in = mdcache->pick_inode_snap(head_in, follows);
-  if (in != head_in)
-    dout(10) << " head inode " << *head_in << dendl;
+  CInode *in = head_in;
+  if (follows > 0) {
+    in = mdcache->pick_inode_snap(head_in, follows);
+    if (in != head_in)
+      dout(10) << " head inode " << *head_in << dendl;
+  }
   dout(10) << "  cap inode " << *in << dendl;
 
   Capability *cap = 0;
@@ -2529,6 +2572,17 @@ void Locker::handle_client_caps(MClientCaps *m)
     snapid_t snap = realm->get_snap_following(follows);
     dout(10) << "  flushsnap follows " << follows << " -> snap " << snap << dendl;
 
+    // we can prepare the ack now, since this FLUSHEDSNAP is independent of any
+    // other cap ops.  (except possibly duplicate FLUSHSNAP requests, but worst
+    // case we get a dup response, so whatever.)
+    MClientCaps *ack = 0;
+    if (m->get_dirty()) {
+      ack = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, m->get_dirty(), 0, mds->get_osd_epoch_barrier());
+      ack->set_snap_follows(follows);
+      ack->set_client_tid(m->get_client_tid());
+      ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
+    }
+
     if (in == head_in ||
 	(head_in->client_need_snapflush.count(snap) &&
 	 head_in->client_need_snapflush[snap].count(client))) {
@@ -2536,25 +2590,19 @@ void Locker::handle_client_caps(MClientCaps *m)
 	      << " client." << client << " on " << *in << dendl;
 
       // this cap now follows a later snap (i.e. the one initiating this flush, or later)
-      cap->client_follows = MAX(snap, (snapid_t)(in->first + 1));
+      if (in == head_in)
+	cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq();
    
-      // we can prepare the ack now, since this FLUSHEDSNAP is independent of any
-      // other cap ops.  (except possibly duplicate FLUSHSNAP requests, but worst
-      // case we get a dup response, so whatever.)
-      MClientCaps *ack = 0;
-      if (m->get_dirty()) {
-	ack = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, m->get_dirty(), 0, mds->get_osd_epoch_barrier());
-	ack->set_snap_follows(follows);
-	ack->set_client_tid(m->get_client_tid());
-      }
-
       _do_snap_update(in, snap, m->get_dirty(), follows, client, m, ack);
 
       if (in != head_in)
 	head_in->remove_need_snapflush(in, snap, client);
       
-    } else
+    } else {
       dout(7) << " not expecting flushsnap " << snap << " from client." << client << " on " << *in << dendl;
+      if (ack)
+	mds->send_message_client_counted(ack, m->get_connection());
+    }
     goto out;
   }
 
@@ -2596,7 +2644,7 @@ void Locker::handle_client_caps(MClientCaps *m)
     //  update/release).
     if (!head_in->client_need_snapflush.empty()) {
       if ((cap->issued() & CEPH_CAP_ANY_FILE_WR) == 0) {
-	_do_null_snapflush(head_in, client, follows);
+	_do_null_snapflush(head_in, client);
       } else {
 	dout(10) << " revocation in progress, not making any conclusions about null snapflushes" << dendl;
       }
@@ -2608,6 +2656,7 @@ void Locker::handle_client_caps(MClientCaps *m)
       ack = new MClientCaps(CEPH_CAP_OP_FLUSH_ACK, in->ino(), 0, cap->get_cap_id(), m->get_seq(),
 			    m->get_caps(), 0, m->get_dirty(), 0, mds->get_osd_epoch_barrier());
       ack->set_client_tid(m->get_client_tid());
+      ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
     }
 
     // filter wanted based on what we could ever give out (given auth/replica status)
@@ -2730,6 +2779,12 @@ void Locker::process_request_cap_release(MDRequestRef& mdr, client_t client, con
     caps &= cap->issued();
   }
   cap->confirm_receipt(seq, caps);
+
+  if (!in->client_need_snapflush.empty() &&
+      (cap->issued() & CEPH_CAP_ANY_FILE_WR) == 0) {
+    _do_null_snapflush(in, client);
+  }
+
   adjust_cap_wanted(cap, wanted, issue_seq);
   
   if (mdr)
@@ -2878,6 +2933,11 @@ void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t foll
   mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
   mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
 
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (ack && ack->get_oldest_flush_tid() > 0)
+    le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
+				  ack->get_oldest_flush_tid());
+
   mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
 								 false,
 								 client, NULL,
@@ -2918,9 +2978,12 @@ void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *
     }
     if (in->inode.is_file() &&
         (dirty & CEPH_CAP_FILE_WR) &&
-        inline_version > pi->inline_version) {
-      pi->inline_version = inline_version;
-      pi->inline_data = m->inline_data;
+        inline_version > pi->inline_data.version) {
+      pi->inline_data.version = inline_version;
+      if (inline_version != CEPH_INLINE_NONE && m->inline_data.length() > 0)
+	pi->inline_data.get_data() = m->inline_data;
+      else
+	pi->inline_data.free_data();
     }
     if ((dirty & CEPH_CAP_FILE_EXCL) && atime != pi->atime) {
       dout(7) << "  atime " << pi->atime << " -> " << atime
@@ -3051,17 +3114,17 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
     for ( int i=0; i < num_locks; ++i) {
       ceph_filelock decoded_lock;
       ::decode(decoded_lock, bli);
-      in->fcntl_locks.held_locks.
+      in->get_fcntl_lock_state()->held_locks.
 	insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
-      ++in->fcntl_locks.client_held_lock_counts[(client_t)(decoded_lock.client)];
+      ++in->get_fcntl_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
     }
     ::decode(num_locks, bli);
     for ( int i=0; i < num_locks; ++i) {
       ceph_filelock decoded_lock;
       ::decode(decoded_lock, bli);
-      in->flock_locks.held_locks.
+      in->get_flock_lock_state()->held_locks.
 	insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
-      ++in->flock_locks.client_held_lock_counts[(client_t)(decoded_lock.client)];
+      ++in->get_flock_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
     }
   }
 
@@ -3120,6 +3183,11 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
   mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
   mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
 
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (ack && ack->get_oldest_flush_tid() > 0)
+    le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
+				  ack->get_oldest_flush_tid());
+
   mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
 								 change_max,
 								 client, cap,
@@ -3231,7 +3299,7 @@ void Locker::remove_client_cap(CInode *in, client_t client)
 {
   // clean out any pending snapflush state
   if (!in->client_need_snapflush.empty())
-    _do_null_snapflush(in, client, 0);
+    _do_null_snapflush(in, client);
 
   in->remove_client_cap(client);
 
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index 6322f65..8a6581f 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -24,27 +24,17 @@ using std::map;
 using std::list;
 using std::set;
 
-class MDS;
+class MDSRank;
 class Session;
-class CDir;
 class CInode;
 class CDentry;
-class EMetaBlob;
 struct SnapRealm;
 
 class Message;
 
-class MDiscover;
-class MDiscoverReply;
-class MCacheExpire;
-class MDirUpdate;
-class MDentryUnlink;
 class MLock;
 
-class MClientRequest;
-
 class Capability;
-class LogSegment;
 
 class SimpleLock;
 class ScatterLock;
@@ -57,11 +47,11 @@ typedef ceph::shared_ptr<MDRequestImpl> MDRequestRef;
 
 class Locker {
 private:
-  MDS *mds;
+  MDSRank *mds;
   MDCache *mdcache;
  
  public:
-  Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {}  
+  Locker(MDSRank *m, MDCache *c) : mds(m), mdcache(c) {}  
 
   SimpleLock *get_lock(int lock_type, MDSCacheObjectInfo &info);
   
@@ -203,7 +193,7 @@ public:
   void handle_client_caps(class MClientCaps *m);
   void _update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi);
   void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, MClientCaps *m, MClientCaps *ack);
-  void _do_null_snapflush(CInode *head_in, client_t client, snapid_t follows);
+  void _do_null_snapflush(CInode *head_in, client_t client);
   bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, MClientCaps *m,
 		      MClientCaps *ack=0);
   void handle_client_cap_release(class MClientCapRelease *m);
diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc
index 2928a61..c933261 100644
--- a/src/mds/LogEvent.cc
+++ b/src/mds/LogEvent.cc
@@ -15,7 +15,7 @@
 #include "common/config.h"
 #include "LogEvent.h"
 
-#include "MDS.h"
+#include "MDSRank.h"
 
 // events i know of
 #include "events/ESubtreeMap.h"
diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h
index 145c69a..b3fabf4 100644
--- a/src/mds/LogEvent.h
+++ b/src/mds/LogEvent.h
@@ -46,7 +46,7 @@
 #include "include/Context.h"
 #include "include/utime.h"
 
-class MDS;
+class MDSRank;
 class LogSegment;
 class EMetaBlob;
 
@@ -109,7 +109,7 @@ public:
   /*** recovery ***/
   /* replay() - replay given event.  this is idempotent.
    */
-  virtual void replay(MDS *m) { assert(0); }
+  virtual void replay(MDSRank *m) { assert(0); }
 
   /**
    * If the subclass embeds a MetaBlob, return it here so that
diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h
index 6626afe..f5ef207 100644
--- a/src/mds/LogSegment.h
+++ b/src/mds/LogSegment.h
@@ -30,7 +30,7 @@ using ceph::unordered_set;
 class CDir;
 class CInode;
 class CDentry;
-class MDS;
+class MDSRank;
 struct MDSlaveUpdate;
 
 typedef uint64_t log_segment_seq_t;
@@ -63,13 +63,16 @@ class LogSegment {
   // client request ids
   map<int, ceph_tid_t> last_client_tids;
 
+  // potentially dirty sessions
+  std::set<entity_name_t> touched_sessions;
+
   // table version
   version_t inotablev;
   version_t sessionmapv;
   map<int,version_t> tablev;
 
   // try to expire
-  void try_to_expire(MDS *mds, MDSGatherBuilder &gather_bld, int op_prio);
+  void try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio);
 
   std::list<MDSInternalContextBase*> expiry_waiters;
 
diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc
index 9219691..263c21a 100644
--- a/src/mds/MDBalancer.cc
+++ b/src/mds/MDBalancer.cc
@@ -15,7 +15,7 @@
 #include "mdstypes.h"
 
 #include "MDBalancer.h"
-#include "MDS.h"
+#include "MDSRank.h"
 #include "mon/MonClient.h"
 #include "MDSMap.h"
 #include "CInode.h"
@@ -113,7 +113,7 @@ void MDBalancer::tick()
 
 class C_Bal_SendHeartbeat : public MDSInternalContext {
 public:
-  C_Bal_SendHeartbeat(MDS *mds_) : MDSInternalContext(mds_) { }
+  C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
   virtual void finish(int f) {
     mds->balancer->send_heartbeat();
   }
@@ -159,7 +159,7 @@ mds_load_t MDBalancer::get_load(utime_t now)
   }
 
   load.req_rate = mds->get_req_rate();
-  load.queue_len = mds->messenger->get_dispatch_queue_len();
+  load.queue_len = messenger->get_dispatch_queue_len();
 
   ifstream cpu("/proc/loadavg");
   if (cpu.is_open())
@@ -218,14 +218,14 @@ void MDBalancer::send_heartbeat()
 
 
   set<mds_rank_t> up;
-  mds->get_mds_map()->get_mds_set(up);
+  mds->get_mds_map()->get_up_mds_set(up);
   for (set<mds_rank_t>::iterator p = up.begin(); p != up.end(); ++p) {
     if (*p == mds->get_nodeid())
       continue;
     MHeartbeat *hb = new MHeartbeat(load, beat_epoch);
     hb->get_import_map() = import_map;
-    mds->messenger->send_message(hb,
-                                 mds->mdsmap->get_inst(*p));
+    messenger->send_message(hb,
+                            mds->mdsmap->get_inst(*p));
   }
 }
 
@@ -758,7 +758,7 @@ void MDBalancer::try_rebalance()
 bool MDBalancer::check_targets()
 {
   // get MonMap's idea of my_targets
-  const set<mds_rank_t>& map_targets = mds->mdsmap->get_mds_info(mds->whoami).export_targets;
+  const set<mds_rank_t>& map_targets = mds->mdsmap->get_mds_info(mds->get_nodeid()).export_targets;
 
   bool send = false;
   bool ok = true;
@@ -805,8 +805,8 @@ bool MDBalancer::check_targets()
   dout(10) << "check_targets have " << map_targets << " need " << need_targets << " want " << want_targets << dendl;
 
   if (send) {
-    MMDSLoadTargets* m = new MMDSLoadTargets(mds_gid_t(mds->monc->get_global_id()), want_targets);
-    mds->monc->send_mon_message(m);
+    MMDSLoadTargets* m = new MMDSLoadTargets(mds_gid_t(mon_client->get_global_id()), want_targets);
+    mon_client->send_mon_message(m);
   }
   return ok;
 }
diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h
index 696e2ca..68546db 100644
--- a/src/mds/MDBalancer.h
+++ b/src/mds/MDBalancer.h
@@ -27,15 +27,19 @@ using std::map;
 #include "CInode.h"
 
 
-class MDS;
+class MDSRank;
 class Message;
 class MHeartbeat;
 class CInode;
 class CDir;
+class Messenger;
+class MonClient;
 
 class MDBalancer {
  protected:
-  MDS *mds;
+  MDSRank *mds;
+  Messenger *messenger;
+  MonClient *mon_client;
   int beat_epoch;
 
   int last_epoch_under;  
@@ -73,8 +77,10 @@ class MDBalancer {
   }
 
 public:
-  MDBalancer(MDS *m) : 
+  MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) : 
     mds(m),
+    messenger(msgr),
+    mon_client(monc),
     beat_epoch(0),
     last_epoch_under(0), last_epoch_over(0), my_load(0.0), target_load(0.0) { }
   
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index f62afae..3ec852a 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -20,7 +20,7 @@
 #include <map>
 
 #include "MDCache.h"
-#include "MDS.h"
+#include "MDSRank.h"
 #include "Server.h"
 #include "Locker.h"
 #include "MDLog.h"
@@ -107,7 +107,7 @@ extern struct ceph_file_layout g_default_file_layout;
 #define dout_subsys ceph_subsys_mds
 #undef dout_prefix
 #define dout_prefix _prefix(_dout, mds)
-static ostream& _prefix(std::ostream *_dout, MDS *mds) {
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
   return *_dout << "mds." << mds->get_nodeid() << ".cache ";
 }
 
@@ -136,7 +136,7 @@ set<int> SimpleLock::empty_gather_set;
 class MDCacheContext : public virtual MDSInternalContextBase {
 protected:
   MDCache *mdcache;
-  virtual MDS *get_mds()
+  virtual MDSRank *get_mds()
   {
     assert(mdcache != NULL);
     return mdcache->mds;
@@ -156,7 +156,7 @@ public:
 class MDCacheIOContext : public virtual MDSIOContextBase {
 protected:
   MDCache *mdcache;
-  virtual MDS *get_mds()
+  virtual MDSRank *get_mds()
   {
     assert(mdcache != NULL);
     return mdcache->mds;
@@ -166,15 +166,15 @@ public:
 };
 
 
-MDCache::MDCache(MDS *m) :
+MDCache::MDCache(MDSRank *m) :
+  mds(m),
   logger(0),
-  num_strays(0),
-  num_strays_purging(0),
-  num_strays_delayed(0),
+  filer(m->objecter, m->finisher),
+  rejoin_done(NULL),
+  resolve_done(NULL),
   recovery_queue(m),
-  delayed_eval_stray(member_offset(CDentry, item_stray))
+  stray_manager(m)
 {
-  mds = m;
   migrator = new Migrator(mds, this);
   root = NULL;
   myin = NULL;
@@ -212,6 +212,7 @@ MDCache::MDCache(MDS *m) :
 
   decayrate.set_halflife(g_conf->mds_decay_halflife);
 
+  memset(&default_file_layout, 0, sizeof(default_file_layout));
   memset(&default_log_layout, 0, sizeof(default_log_layout));
 
   did_shutdown_log_cap = false;
@@ -225,7 +226,9 @@ MDCache::~MDCache()
     delete logger;
     logger = 0;
   }
-  //delete renamer;
+
+  delete rejoin_done; rejoin_done = NULL;
+  delete resolve_done; resolve_done = NULL;
 }
 
 
@@ -323,19 +326,31 @@ void MDCache::remove_inode(CInode *o)
   delete o; 
 }
 
+ceph_file_layout MDCache::gen_default_file_layout(const MDSMap &mdsmap)
+{
+  ceph_file_layout result = g_default_file_layout;
+  result.fl_pg_pool = mdsmap.get_first_data_pool();
 
+  return result;
+}
 
-void MDCache::init_layouts()
+ceph_file_layout MDCache::gen_default_log_layout(const MDSMap &mdsmap)
 {
-  default_file_layout = g_default_file_layout;
-  default_file_layout.fl_pg_pool = mds->mdsmap->get_first_data_pool();
-
-  default_log_layout = g_default_file_layout;
-  default_log_layout.fl_pg_pool = mds->mdsmap->get_metadata_pool();
+  ceph_file_layout result = g_default_file_layout;
+  result.fl_pg_pool = mdsmap.get_metadata_pool();
   if (g_conf->mds_log_segment_size > 0) {
-    default_log_layout.fl_object_size = g_conf->mds_log_segment_size;
-    default_log_layout.fl_stripe_unit = g_conf->mds_log_segment_size;
+    result.fl_object_size = g_conf->mds_log_segment_size;
+    result.fl_stripe_unit = g_conf->mds_log_segment_size;
   }
+
+  return result;
+}
+
+void MDCache::init_layouts()
+{
+  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
+  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
+
 }
 
 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
@@ -364,7 +379,7 @@ void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
 
   if (in->is_base()) {
     if (in->is_root())
-      in->inode_auth = mds_authority_t(mds->whoami, CDIR_AUTH_UNKNOWN);
+      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
     else
       in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
     in->open_snaprealm();  // empty snaprealm
@@ -385,6 +400,8 @@ CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
 CInode *MDCache::create_root_inode()
 {
   CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
+  i->inode.uid = g_conf->mds_root_ino_uid;
+  i->inode.gid = g_conf->mds_root_ino_gid;
   i->inode.layout = default_file_layout;
   i->inode.layout.fl_pg_pool = mds->mdsmap->get_first_data_pool();
   return i;
@@ -397,7 +414,7 @@ void MDCache::create_empty_hierarchy(MDSGather *gather)
 
   // force empty root dir
   CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
-  adjust_subtree_auth(rootdir, mds->whoami);   
+  adjust_subtree_auth(rootdir, mds->get_nodeid());   
   rootdir->dir_rep = CDir::REP_ALL;   //NONE;
 
   rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
@@ -419,17 +436,17 @@ void MDCache::create_mydir_hierarchy(MDSGather *gather)
 {
   // create mds dir
   char myname[10];
-  snprintf(myname, sizeof(myname), "mds%d", int(mds->whoami));
-  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->whoami), S_IFDIR);
+  snprintf(myname, sizeof(myname), "mds%d", int(mds->get_nodeid()));
+  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
 
   CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
-  adjust_subtree_auth(mydir, mds->whoami);   
+  adjust_subtree_auth(mydir, mds->get_nodeid());   
 
   LogSegment *ls = mds->mdlog->get_current_segment();
 
   // stray dir
   for (int i = 0; i < NUM_STRAY; ++i) {
-    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->whoami, i), S_IFDIR);
+    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
     CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
     stringstream name;
     name << "stray" << i;
@@ -448,13 +465,6 @@ void MDCache::create_mydir_hierarchy(MDSGather *gather)
     stray->store_backtrace(gather->new_sub());
   }
 
-  CInode *journal = create_system_inode(MDS_INO_LOG_OFFSET + mds->whoami, S_IFREG);
-  string name = "journal";
-  CDentry *jdn = mydir->add_primary_dentry(name, journal);
-  jdn->_mark_dirty(mds->mdlog->get_current_segment());
-
-  mydir->fnode.fragstat.nfiles++;
-  mydir->fnode.rstat.rfiles++;
   mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
   mydir->fnode.accounted_rstat = mydir->fnode.rstat;
 
@@ -463,7 +473,6 @@ void MDCache::create_mydir_hierarchy(MDSGather *gather)
   ++myin->inode.rstat.rsubdirs;
   myin->inode.accounted_rstat = myin->inode.rstat;
 
-
   mydir->mark_complete();
   mydir->mark_dirty(mydir->pre_dirty(), ls);
   mydir->commit(0, gather->new_sub());
@@ -575,7 +584,7 @@ struct C_MDS_RetryOpenRoot : public MDSInternalContext {
 
 void MDCache::open_root_inode(MDSInternalContextBase *c)
 {
-  if (mds->whoami == mds->mdsmap->get_root()) {
+  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
     CInode *in;
     in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);  // initially inaccurate!
     in->fetch(c);
@@ -586,7 +595,7 @@ void MDCache::open_root_inode(MDSInternalContextBase *c)
 
 void MDCache::open_mydir_inode(MDSInternalContextBase *c)
 {
-  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->whoami), S_IFDIR|0755);  // initially inaccurate!
+  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
   in->fetch(c);
 }
 
@@ -598,12 +607,12 @@ void MDCache::open_root()
     open_root_inode(new C_MDS_RetryOpenRoot(this));
     return;
   }
-  if (mds->whoami == mds->mdsmap->get_root()) {
+  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
     assert(root->is_auth());  
     CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
     assert(rootdir);
     if (!rootdir->is_subtree_root())
-      adjust_subtree_auth(rootdir, mds->whoami);   
+      adjust_subtree_auth(rootdir, mds->get_nodeid());   
     if (!rootdir->is_complete()) {
       rootdir->fetch(new C_MDS_RetryOpenRoot(this));
       return;
@@ -618,13 +627,13 @@ void MDCache::open_root()
   }
 
   if (!myin) {
-    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->whoami), S_IFDIR|0755);  // initially inaccurate!
+    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
     in->fetch(new C_MDS_RetryOpenRoot(this));
     return;
   }
   CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
   assert(mydir);
-  adjust_subtree_auth(mydir, mds->whoami);
+  adjust_subtree_auth(mydir, mds->get_nodeid());
 
   populate_mydir();
 }
@@ -640,7 +649,18 @@ void MDCache::populate_mydir()
   if (!mydir->is_complete()) {
     mydir->fetch(new C_MDS_RetryOpenRoot(this));
     return;
-  }    
+  }
+
+  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
+    // A missing dirfrag, we will recreate it.  Before that, we must dirty
+    // it before dirtying any of the strays we create within it.
+    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
+      "recreating it now";
+    LogSegment *ls = mds->mdlog->get_current_segment();
+    mydir->state_clear(CDir::STATE_BADFRAG);
+    mydir->mark_complete();
+    mydir->mark_dirty(mydir->pre_dirty(), ls);
+  }
 
   // open or create stray
   for (int i = 0; i < NUM_STRAY; ++i) {
@@ -653,7 +673,7 @@ void MDCache::populate_mydir()
       straydn = mydir->lookup("stray");
 
     if (!straydn || !straydn->get_linkage()->get_inode()) {
-      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->whoami, i), S_IFDIR),
+      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
 			  new C_MDS_RetryOpenRoot(this));
       return;
     }
@@ -673,22 +693,18 @@ void MDCache::populate_mydir()
     for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
       frag_t fg = *p;
       CDir *dir = strays[i]->get_dirfrag(fg);
-      if (!dir)
+      if (!dir) {
 	dir = strays[i]->get_or_open_dirfrag(this, fg);
-      if (dir->get_version() == 0) {
-	dir->fetch(new C_MDS_RetryOpenRoot(this));
-	return;
       }
-    }
-  }
 
-  // open or create journal file
-  string jname("journal");
-  CDentry *jdn = mydir->lookup(jname);
-  if (!jdn || !jdn->get_linkage()->get_inode()) {
-    _create_system_file(mydir, jname.c_str(), create_system_inode(MDS_INO_LOG_OFFSET + mds->whoami, S_IFREG),
-			new C_MDS_RetryOpenRoot(this));
-    return;
+      if (dir->state_test(CDir::STATE_BADFRAG)) {
+        mds->damaged();
+        assert(0);
+      } else if (dir->get_version() == 0) {
+        dir->fetch(new C_MDS_RetryOpenRoot(this));
+        return;
+      }
+    }
   }
 
   // okay!
@@ -719,14 +735,15 @@ CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
   if (!straydn) {
     straydn = straydir->add_null_dentry(straydname);
     straydn->mark_new();
-
-    num_strays++;
-    logger->set(l_mdc_num_strays, num_strays);
-    logger->inc(l_mdc_strays_created);
   } else {
     assert(straydn->get_projected_linkage()->is_null());
   }
 
+  // Notify even if a null dentry already existed, because
+  // StrayManager counts the number of stray inodes, not the
+  // number of dentries in the directory.
+  stray_manager.notify_stray_created();
+
   straydn->state_set(CDentry::STATE_STRAY);
   return straydn;
 }
@@ -1492,11 +1509,25 @@ CInode *MDCache::cow_inode(CInode *in, snapid_t last)
 {
   assert(last >= in->first);
 
-  CInode *oldin = new CInode(this, true, in->first, last);
+  SnapRealm *realm = in->find_snaprealm();
+  const set<snapid_t>& snaps = realm->get_snaps();
+
+  // make sure snap inode's last match existing snapshots.
+  // MDCache::pick_inode_snap() requires this.
+  snapid_t last_snap = last;
+  if (snaps.count(last) == 0) {
+    set<snapid_t>::const_iterator p = snaps.upper_bound(last);
+    if (p != snaps.begin()) {
+      --p;
+      if (*p >= in->first)
+	last_snap = *p;
+    }
+  }
+
+  CInode *oldin = new CInode(this, true, in->first, last_snap);
   oldin->inode = *in->get_previous_projected_inode();
   oldin->symlink = in->symlink;
   oldin->xattrs = *in->get_previous_projected_xattrs();
-
   oldin->inode.trim_client_ranges(last);
 
   if (in->first < in->oldest_snap)
@@ -1506,9 +1537,6 @@ CInode *MDCache::cow_inode(CInode *in, snapid_t last)
 
   dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
   add_inode(oldin);
-  
-  SnapRealm *realm = in->find_snaprealm();
-  const set<snapid_t>& snaps = realm->get_snaps();
 
   if (in->last != CEPH_NOSNAP) {
     CInode *head_in = get_inode(in->ino());
@@ -1575,32 +1603,39 @@ void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
   if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
     // multiversion inode.
     CInode *in = dnl->get_inode();
+    SnapRealm *realm = NULL;
 
     if (in->get_projected_parent_dn() != dn) {
       assert(follows == CEPH_NOSNAP);
-      snapid_t dir_follows = dn->dir->inode->find_snaprealm()->get_newest_seq();
+      realm = dn->dir->inode->find_snaprealm();
+      snapid_t dir_follows = realm->get_newest_snap();
 
       if (dir_follows+1 > dn->first) {
 	snapid_t oldfirst = dn->first;
 	dn->first = dir_follows+1;
-	CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(),  in->d_type(),
-						    oldfirst, dir_follows);
-	olddn->pre_dirty();
-	dout(10) << " olddn " << *olddn << dendl;
-	metablob->add_remote_dentry(olddn, true);
-	mut->add_cow_dentry(olddn);
-	// FIXME: adjust link count here?  hmm.
-
-	if (dir_follows+1 > in->first)
-	  in->cow_old_inode(dir_follows, false);
+	if (realm->has_snaps_in_range(oldfirst, dn->last)) {
+	  CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(),  in->d_type(),
+						      oldfirst, dir_follows);
+	  olddn->pre_dirty();
+	  dout(10) << " olddn " << *olddn << dendl;
+	  metablob->add_remote_dentry(olddn, true);
+	  mut->add_cow_dentry(olddn);
+	  // FIXME: adjust link count here?  hmm.
+
+	  if (dir_follows+1 > in->first)
+	    in->cow_old_inode(dir_follows, false);
+	}
       }
 
-      if (in->snaprealm)
-	follows = in->snaprealm->get_newest_seq();
-      else
+      if (in->snaprealm) {
+	realm = in->snaprealm;
+	follows = realm->get_newest_seq();
+      } else
 	follows = dir_follows;
-    } else if (follows == CEPH_NOSNAP) {
-      follows = in->find_snaprealm()->get_newest_seq();
+    } else {
+      realm = in->find_snaprealm();
+      if (follows == CEPH_NOSNAP)
+	follows = realm->get_newest_seq();
     }
 
     // already cloned?
@@ -1609,26 +1644,41 @@ void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
       return;
     }
 
+    if (!realm->has_snaps_in_range(in->first, in->last)) {
+      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
+      in->first = follows + 1;
+      return;
+    }
+
     in->cow_old_inode(follows, false);
 
   } else {
+    SnapRealm *realm = dn->dir->inode->find_snaprealm();
     if (follows == CEPH_NOSNAP)
-      follows = dn->dir->inode->find_snaprealm()->get_newest_seq();
-    
+      follows = realm->get_newest_seq();
+
     // already cloned?
     if (follows < dn->first) {
       dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
       return;
     }
-       
+
     // update dn.first before adding old dentry to cdir's map
     snapid_t oldfirst = dn->first;
     dn->first = follows+1;
+
+    CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
+
+    if (!realm->has_snaps_in_range(oldfirst, dn->last)) {
+      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
+      if (in)
+	in->first = follows+1;
+      return;
+    }
     
     dout(10) << "    dn " << *dn << dendl;
-    if (dnl->is_primary()) {
-      assert(oldfirst == dnl->get_inode()->first);
-      CInode *oldin = cow_inode(dnl->get_inode(), follows);
+    if (in) {
+      CInode *oldin = cow_inode(in, follows);
       mut->add_cow_inode(oldin);
       if (pcow_inode)
 	*pcow_inode = oldin;
@@ -1718,17 +1768,19 @@ void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t fi
 
   if (cur->last >= floor)
     _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent, linkunlink);
-      
-  for (set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
-       p != cur->dirty_old_rstats.end();
-       ++p) {
-    old_inode_t& old = cur->old_inodes[*p];
-    snapid_t ofirst = MAX(old.first, floor);
-    set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
-    if (q == snaps.end() || *q > *p)
-      continue;
-    if (*p >= floor)
-      _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0);
+
+  if (g_conf->mds_snap_rstat) {
+    for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
+	 p != cur->dirty_old_rstats.end();
+	 ++p) {
+      old_inode_t& old = cur->old_inodes[*p];
+      snapid_t ofirst = MAX(old.first, floor);
+      set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
+      if (q == snaps.end() || *q > *p)
+	continue;
+      if (*p >= floor)
+	_project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0);
+    }
   }
   cur->dirty_old_rstats.clear();
 }
@@ -1765,7 +1817,10 @@ void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snap
     snapid_t first;
     fnode_t *pf = parent->get_projected_fnode();
     if (last == CEPH_NOSNAP) {
-      first = MAX(ofirst, parent->first);
+      if (g_conf->mds_snap_rstat)
+	first = MAX(ofirst, parent->first);
+      else
+	first = parent->first;
       prstat = &pf->rstat;
       dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
 
@@ -1780,6 +1835,9 @@ void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snap
 	parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
       }
       parent->first = first;
+    } else if (!g_conf->mds_snap_rstat) {
+      // drop snapshots' rstats
+      break;
     } else if (last >= parent->first) {
       first = parent->first;
       parent->dirty_old_rstat[last].first = first;
@@ -1794,7 +1852,7 @@ void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snap
       first = ofirst;
 
       // find any intersection with last
-      map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
+      compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
       if (p == parent->dirty_old_rstat.end()) {
 	dout(20) << "  no dirty_old_rstat with last >= last " << last << dendl;
 	if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
@@ -1873,7 +1931,7 @@ void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accou
       } else {
 	// our life is easier here because old_inodes is not sparse
 	// (although it may not begin at snapid 1)
-	map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
+	compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
 	if (p == pin->old_inodes.end()) {
 	  dout(10) << " no old_inode <= " << last << ", done." << dendl;
 	  break;
@@ -1963,8 +2021,9 @@ update:
     msg->quota = i->quota;
     mds->send_message_client_counted(msg, session->connection);
   }
-  for (map<const mds_rank_t, unsigned>::iterator it = in->replicas_begin();
-       it != in->replicas_end(); ++it) {
+  for (compact_map<mds_rank_t, unsigned>::iterator it = in->replicas_begin();
+       it != in->replicas_end();
+       ++it) {
     MGatherCaps *msg = new MGatherCaps;
     msg->ino = in->ino();
     mds->send_message_mds(msg, it->first);
@@ -2240,10 +2299,13 @@ void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
       // first, if the frag is stale, bring it back in sync.
       parent->resync_accounted_rstat();
 
-      for (map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
-	   p != parent->dirty_old_rstat.end();
-	   ++p)
-	project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first, p->first, pin, true);//false);
+      if (g_conf->mds_snap_rstat) {
+	for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
+	     p != parent->dirty_old_rstat.end();
+	     ++p)
+	  project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
+				      p->first, pin, true);//false);
+      }
       parent->dirty_old_rstat.clear();
       project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
 
@@ -2452,7 +2514,7 @@ ESubtreeMap *MDCache::create_subtree_map()
 
     // so not
     //   !me, *
-    if (dir->get_dir_auth().first != mds->whoami)
+    if (dir->get_dir_auth().first != mds->get_nodeid())
       continue;
 
     if (migrator->is_ambiguous_import(dir->dirfrag()) ||
@@ -2590,11 +2652,13 @@ ESubtreeMap *MDCache::create_subtree_map()
 }
 
 
-void MDCache::resolve_start()
+void MDCache::resolve_start(MDSInternalContext *resolve_done_)
 {
   dout(10) << "resolve_start" << dendl;
+  assert(resolve_done == NULL);
+  resolve_done = resolve_done_;
 
-  if (mds->mdsmap->get_root() != mds->whoami) {
+  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
     // if we don't have the root dir, adjust it to UNKNOWN.  during
     // resolve we want mds0 to explicit claim the portion of it that
     // it owns, so that anything beyond its bounds get left as
@@ -2695,7 +2759,7 @@ void MDCache::send_subtree_resolves()
   for (set<mds_rank_t>::iterator p = recovery_set.begin();
        p != recovery_set.end();
        ++p) {
-    if (*p == mds->whoami)
+    if (*p == mds->get_nodeid())
       continue;
     if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
       resolves[*p] = new MMDSResolve;
@@ -2946,7 +3010,7 @@ void MDCache::handle_mds_recovery(mds_rank_t who)
     CDir *dir = p->first;
 
     if (dir->authority().first != who ||
-	dir->authority().second == mds->whoami)
+	dir->authority().second == mds->get_nodeid())
       continue;
     assert(!dir->is_auth());
    
@@ -3212,7 +3276,9 @@ void MDCache::maybe_resolve_finish()
   if (mds->is_resolve()) {
     trim_unlinked_inodes();
     recalc_auth_bits(false);
-    mds->resolve_done();
+    assert(resolve_done != NULL);
+    resolve_done->complete(0);
+    resolve_done = NULL;
   } else {
     maybe_send_pending_rejoins();
   }
@@ -3426,7 +3492,7 @@ void MDCache::disambiguate_imports()
   other_ambiguous_imports.clear();
 
   // my ambiguous imports
-  mds_authority_t me_ambig(mds->whoami, mds->whoami);
+  mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
   while (!my_ambiguous_imports.empty()) {
     map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
 
@@ -3442,7 +3508,7 @@ void MDCache::disambiguate_imports()
       CDir *root = get_subtree_root(dir);
       if (root != dir)
 	dout(10) << "  subtree root is " << *root << dendl;
-      assert(root->dir_auth.first != mds->whoami);  // no us!
+      assert(root->dir_auth.first != mds->get_nodeid());  // no us!
       try_trim_non_auth_subtree(root);
 
       mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
@@ -3555,6 +3621,60 @@ void MDCache::remove_inode_recursive(CInode *in)
   remove_inode(in);
 }
 
+bool MDCache::expire_recursive(
+  CInode *in,
+  map<mds_rank_t, MCacheExpire*>& expiremap,
+  CDir *subtree)
+{
+  assert(!in->is_auth());
+
+  dout(10) << __func__ << ":" << *in << dendl;
+
+  mds_rank_t owner = subtree->dir_auth.first;
+  MCacheExpire *expire_msg = expiremap[owner];
+  assert(expire_msg);
+
+  // Recurse into any dirfrags beneath this inode
+  list<CDir*> ls;
+  in->get_dirfrags(ls);
+  for (std::list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+    CDir *subdir = *p;
+
+    dout(10) << __func__ << ": entering dirfrag " << subdir << dendl;
+    for (CDir::map_t::iterator q = subdir->items.begin();
+         q != subdir->items.end(); ++q) {
+      CDentry *dn = q->second;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (dnl->is_primary()) {
+	CInode *tin = dnl->get_inode();
+        dout(10) << __func__ << ": tin="
+          << *tin << dendl;
+
+        /* Remote strays with linkage (i.e. hardlinks) should not be
+         * expired, because they may be the target of
+         * a rename() as the owning MDS shuts down */
+        if (!tin->is_dir() && tin->inode.nlink) {
+          dout(10) << __func__ << ": child still has linkage" << dendl;
+          return true;
+        }
+
+	const bool abort = expire_recursive(tin, expiremap, subtree);
+        if (abort) {
+          return true;
+        }
+      }
+      if (dn->lru_is_expireable()) {
+        trim_dentry(dn, expiremap);
+      } else {
+        dout(10) << __func__ << ": dn still has linkage " << *dn << dendl;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 void MDCache::trim_unlinked_inodes()
 {
   dout(7) << "trim_unlinked_inodes" << dendl;
@@ -3582,7 +3702,7 @@ void MDCache::recalc_auth_bits(bool replay)
 
   if (root) {
     root->inode_auth.first = mds->mdsmap->get_root();
-    bool auth = mds->whoami == root->inode_auth.first;
+    bool auth = mds->get_nodeid() == root->inode_auth.first;
     if (auth) {
       root->state_set(CInode::STATE_AUTH);
     } else {
@@ -3728,9 +3848,11 @@ void MDCache::recalc_auth_bits(bool replay)
  *   after recovery.
  */
 
-void MDCache::rejoin_start()
+void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
 {
   dout(10) << "rejoin_start" << dendl;
+  assert(rejoin_done == NULL);
+  rejoin_done = rejoin_done_;
 
   rejoin_gather = recovery_set;
   // need finish opening cap inodes before sending cache rejoins
@@ -3959,7 +4081,7 @@ void MDCache::rejoin_send_rejoins()
     rejoin_ack_gather.insert(p->first);
     mds->send_message_mds(p->second, p->first);
   }
-  rejoin_ack_gather.insert(mds->whoami);   // we need to complete rejoin_gather_finish, too
+  rejoin_ack_gather.insert(mds->get_nodeid());   // we need to complete rejoin_gather_finish, too
   rejoins_pending = false;
 
   // nothing?
@@ -5023,8 +5145,8 @@ void MDCache::rejoin_gather_finish()
   rejoin_send_acks();
   
   // signal completion of fetches, rejoin_gather_finish, etc.
-  assert(rejoin_ack_gather.count(mds->whoami));
-  rejoin_ack_gather.erase(mds->whoami);
+  assert(rejoin_ack_gather.count(mds->get_nodeid()));
+  rejoin_ack_gather.erase(mds->get_nodeid());
 
   // did we already get our acks too?
   if (rejoin_ack_gather.empty()) {
@@ -5548,7 +5670,9 @@ void MDCache::open_snap_parents()
     do_delayed_cap_imports();
 
     start_files_to_recover(rejoin_recover_q, rejoin_check_q);
-    mds->rejoin_done();
+    assert(rejoin_done != NULL);
+    rejoin_done->complete(0);
+    rejoin_done = NULL;
   }
 }
 
@@ -5688,7 +5812,7 @@ void MDCache::rejoin_send_acks()
       dq.pop_front();
       
       // dir
-      for (map<mds_rank_t,unsigned>::iterator r = dir->replicas_begin();
+      for (compact_map<mds_rank_t,unsigned>::iterator r = dir->replicas_begin();
 	   r != dir->replicas_end();
 	   ++r) {
 	ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
@@ -5707,7 +5831,7 @@ void MDCache::rejoin_send_acks()
 	  in = dnl->get_inode();
 
 	// dentry
-	for (map<mds_rank_t,unsigned>::iterator r = dn->replicas_begin();
+	for (compact_map<mds_rank_t,unsigned>::iterator r = dn->replicas_begin();
 	     r != dn->replicas_end();
 	     ++r) {
 	  ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
@@ -5724,7 +5848,7 @@ void MDCache::rejoin_send_acks()
 	if (!in)
 	  continue;
 
-	for (map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
+	for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
 	     r != in->replicas_end();
 	     ++r) {
 	  ack[r->first]->add_inode_base(in);
@@ -5741,7 +5865,7 @@ void MDCache::rejoin_send_acks()
 
   // base inodes too
   if (root && root->is_auth()) 
-    for (map<mds_rank_t,unsigned>::iterator r = root->replicas_begin();
+    for (compact_map<mds_rank_t,unsigned>::iterator r = root->replicas_begin();
 	 r != root->replicas_end();
 	 ++r) {
       ack[r->first]->add_inode_base(root);
@@ -5750,7 +5874,7 @@ void MDCache::rejoin_send_acks()
       ack[r->first]->add_inode_locks(root, ++r->second, bl);
     }
   if (myin)
-    for (map<mds_rank_t,unsigned>::iterator r = myin->replicas_begin();
+    for (compact_map<mds_rank_t,unsigned>::iterator r = myin->replicas_begin();
 	 r != myin->replicas_end();
 	 ++r) {
       ack[r->first]->add_inode_base(myin);
@@ -5764,7 +5888,7 @@ void MDCache::rejoin_send_acks()
        p != rejoin_potential_updated_scatterlocks.end();
        ++p) {
     CInode *in = *p;
-    for (map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
+    for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
 	 r != in->replicas_end();
 	 ++r)
       ack[r->first]->add_inode_base(in);
@@ -5978,12 +6102,12 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
     assert(in->last == CEPH_NOSNAP);
   }
   dout(10) << "_truncate_inode  snapc " << snapc << " on " << *in << dendl;
-  mds->filer->truncate(in->inode.ino, &in->inode.layout, *snapc,
-		       pi->truncate_size, pi->truncate_from-pi->truncate_size,
-		       pi->truncate_seq, utime_t(), 0,
-		       0, new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in,
+  filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
+		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
+		 pi->truncate_seq, utime_t(), 0,
+		 0, new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in,
 								       ls),
-					   &mds->finisher));
+					   mds->finisher));
 }
 
 struct C_MDC_TruncateLogged : public MDCacheContext {
@@ -6107,14 +6231,7 @@ bool MDCache::trim(int max, int count)
   dout(7) << "trim max=" << max << "  cur=" << lru.lru_get_size() << dendl;
 
   // process delayed eval_stray()
-  for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
-    CDentry *dn = *p;
-    ++p;
-    dn->item_stray.remove_myself();
-    num_strays_delayed--;
-    eval_stray(dn);
-  }
-  logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+  stray_manager.advance_delayed();
 
   map<mds_rank_t, MCacheExpire*> expiremap;
   bool is_standby_replay = mds->is_standby_replay();
@@ -6165,6 +6282,61 @@ bool MDCache::trim(int max, int count)
       trim_inode(0, root, 0, expiremap);
   }
 
+  // Trim remote stray dirs for stopping MDS ranks
+  std::list<CDir*> subtree_list;
+  list_subtrees(subtree_list);  // Take copy because will modify in loop
+  for (std::list<CDir*>::iterator s = subtree_list.begin();
+       s != subtree_list.end(); ++s) {
+    CDir *subtree = *s;
+    if (subtree->inode->is_mdsdir()) {
+      mds_rank_t owner = mds_rank_t(MDS_INO_MDSDIR_OWNER(subtree->inode->ino()));
+      if (owner == mds->get_nodeid() || !mds->mdsmap->is_up(owner)) {
+        continue;
+      }
+
+      dout(20) << __func__ << ": checking remote MDS dir " << *(subtree) << dendl;
+
+      const MDSMap::mds_info_t &owner_info = mds->mdsmap->get_mds_info(owner);
+      if (owner_info.state == MDSMap::STATE_STOPPING) {
+        dout(20) << __func__ << ": it's stopping, remove it" << dendl;
+        if (expiremap.count(owner) == 0)  {
+          expiremap[owner] = new MCacheExpire(mds->get_nodeid());
+        }
+
+	const bool aborted = expire_recursive(
+            subtree->inode, expiremap, subtree);
+        if (!aborted) {
+          dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
+          CInode *subtree_in = subtree->inode;
+          list<CDir*> ls;
+          subtree->inode->get_dirfrags(ls);
+          for (std::list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+            CDir *frag = *p;
+            trim_dirfrag(frag, subtree, expiremap);
+          }
+          trim_inode(NULL, subtree_in, NULL, expiremap);
+        } else {
+          dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
+        }
+      } else {
+        dout(20) << __func__ << ": not stopping, leaving it alone" << dendl;
+      }
+    }
+  }
+
+  // Other rank's base inodes (when I'm stopping)
+  if (max == 0) {
+    for (set<CInode*>::iterator p = base_inodes.begin();
+         p != base_inodes.end(); ++p) {
+      if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
+        dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
+        if ((*p)->get_num_ref() == 0) {
+          trim_inode(NULL, *p, NULL, expiremap);
+        }
+      }
+    }
+  }
+
   // send any expire messages
   send_expire_messages(expiremap);
 
@@ -6260,7 +6432,7 @@ bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap
   }
 
   // remove dentry
-  if (dir->is_auth())
+  if (dn->last == CEPH_NOSNAP && dir->is_auth())
     dir->add_to_bloom(dn);
   dir->remove_dentry(dn);
 
@@ -6321,6 +6493,11 @@ void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>&
   in->close_dirfrag(dir->dirfrag().frag);
 }
 
+/**
+ * Try trimming an inode from the cache
+ *
+ * @return true if the inode is still in cache, else false if it was trimmed
+ */
 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
 {
   dout(15) << "trim_inode " << *in << dendl;
@@ -6350,10 +6527,13 @@ bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCa
   // INODE
   if (in->is_auth()) {
     // eval stray after closing dirfrags
-    if (dn && !mds->is_standby_replay()) {
+    if (dn) {
       maybe_eval_stray(in);
-      if (dn->get_num_ref() > 0)
+      if (dn->get_num_ref() > 0) {
+        // Independent of whether we passed this on to the purge queue,
+        // if it still has refs then don't count it as trimmed
 	return true;
+      }
     }
   } else {
     mds_authority_t auth = in->authority();
@@ -6616,7 +6796,7 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir)
   get_subtree_bounds(dir, bounds);
   for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
     CDir *bd = *p;
-    if (bd->get_dir_auth().first != mds->whoami &&  // we are not auth
+    if (bd->get_dir_auth().first != mds->get_nodeid() &&  // we are not auth
 	bd->get_num_any() == 0 && // and empty
 	can_trim_non_auth_dirfrag(bd)) {
       CInode *bi = bd->get_inode();
@@ -6635,7 +6815,7 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir)
     while (true) {
       CInode *diri = dir->get_inode();
       if (diri->is_base()) {
-	if (!diri->is_root() && diri->authority().first != mds->whoami) {
+	if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
 	  dout(10) << " closing empty non-auth subtree " << *dir << dendl;
 	  remove_subtree(dir);
 	  dir->mark_clean();
@@ -6651,7 +6831,7 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir)
 
       CDir *psub = get_subtree_root(diri->get_parent_dir());
       dout(10) << " parent subtree is " << *psub << dendl;
-      if (psub->get_dir_auth().first == mds->whoami)
+      if (psub->get_dir_auth().first == mds->get_nodeid())
 	break;  // we are auth, keep.
 
       dout(10) << " closing empty non-auth subtree " << *dir << dendl;
@@ -6777,6 +6957,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
 	assert(in);
       }        
       assert(in->is_auth());
+      dout(20) << __func__ << ": expiring inode " << *in << dendl;
       
       // check nonce
       if (nonce == in->get_replica_nonce(from)) {
@@ -6805,7 +6986,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
 	CInode *diri = get_inode(it->first.ino);
 	if (diri) {
 	  if (mds->is_rejoin() &&
-	      rejoin_ack_gather.count(mds->whoami) && // haven't sent rejoin ack yet
+	      rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
 	      !diri->is_replica(from)) {
 	    list<CDir*> ls;
 	    diri->get_nested_dirfrags(ls);
@@ -6831,8 +7012,10 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
 		<< ", don't have it" << dendl;
 	assert(dir);
       }
+      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
+
       assert(dir->is_auth());
-      
+
       // check nonce
       if (nonce == dir->get_replica_nonce(from)) {
 	// remove from our cached_by
@@ -6960,6 +7143,12 @@ void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock
   // fix lock
   if (dn->lock.remove_replica(from))
     gather_locks.insert(&dn->lock);
+
+  // Replicated strays might now be elegible for purge
+  CDentry::linkage_t *dnl = dn->get_linkage();
+  if (dnl->is_primary()) {
+    maybe_eval_stray(dnl->get_inode());
+  }
 }
 
 void MDCache::trim_client_leases()
@@ -7006,7 +7195,6 @@ void MDCache::check_memory_usage()
 	   << ", malloc " << last.malloc << " mmap " << last.mmap
 	   << ", baseline " << baseline.get_heap()
 	   << ", buffers " << (buffer::get_total_alloc() >> 10)
-	   << ", max " << g_conf->mds_mem_max
 	   << ", " << num_inodes_with_caps << " / " << inode_map.size() << " inodes have caps"
 	   << ", " << num_caps << " caps, " << caps_per_inode << " caps per inode"
 	   << dendl;
@@ -7015,13 +7203,6 @@ void MDCache::check_memory_usage()
   mds->mlogger->set(l_mdm_heap, last.get_heap());
   mds->mlogger->set(l_mdm_malloc, last.malloc);
 
-  /*int size = last.get_total();
-  if (size > g_conf->mds_mem_max * .9) {
-    float ratio = (float)g_conf->mds_mem_max * .9 / (float)size;
-    if (ratio < 1.0)
-      mds->server->recall_client_state(ratio);
-  } else 
-    */
   if (num_inodes_with_caps > g_conf->mds_cache_size) {
     float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
     if (ratio < 1.0)
@@ -7191,6 +7372,14 @@ bool MDCache::shutdown_pass()
   }
   assert(subtrees.empty());
 
+  // Still replicas of mydir?
+  if (mydir->inode->is_replicated()) {
+    // We do this because otherwise acks to locks could come in after
+    // we cap the log.
+    dout(7) << "waiting for mydir replicas to release: " << *mydir << dendl;
+    return false;
+  }
+
   // (only do this once!)
   if (!mds->mdlog->is_capped()) {
     dout(7) << "capping the log" << dendl;
@@ -7250,10 +7439,13 @@ bool MDCache::shutdown_export_strays()
     }
     strays[i]->get_dirfrags(dfs);
   }
+
+  stray_manager.abort_queue();
   
-  while (!dfs.empty()) {
-    CDir *dir = dfs.front();
-    dfs.pop_front();
+  for (std::list<CDir*>::iterator dfs_i = dfs.begin();
+       dfs_i != dfs.end(); ++dfs_i)
+  {
+    CDir *dir = *dfs_i;
 
     if (!dir->is_complete()) {
       dir->fetch(0);
@@ -7268,10 +7460,16 @@ bool MDCache::shutdown_export_strays()
       if (dnl->is_null()) continue;
       done = false;
       
+      if (dn->state_test(CDentry::STATE_PURGING)) {
+        // Don't try to migrate anything that is actually
+        // being purged right now
+        continue;
+      }
+
       // FIXME: we'll deadlock if a rename fails.
       if (exported_strays.count(dnl->get_inode()->ino()) == 0) {
 	exported_strays.insert(dnl->get_inode()->ino());
-	migrate_stray(dn, mds_rank_t(0));  // send to root!
+	stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!
       } else {
 	dout(10) << "already exporting " << *dn << dendl;
       }
@@ -7445,6 +7643,12 @@ int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBa
   if (cur->state_test(CInode::STATE_PURGING))
     return -ESTALE;
 
+  // make sure snaprealm are open...
+  if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
+      !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
+    return 1;
+  }
+
   // start trace
   if (pdnvec)
     pdnvec->clear();
@@ -7525,13 +7729,6 @@ int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBa
     }
     */
 
-    // make sure snaprealm parents are open...
-    if (cur->snaprealm && !cur->snaprealm->open && mdr &&
-	!cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
-      return 1;
-    }
-
-
     // dentry
     CDentry *dn = curdir->lookup(path[depth], snapid);
     CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
@@ -7598,8 +7795,14 @@ int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBa
         }        
       }
 
-      // add to trace, continue.
       cur = in;
+      // make sure snaprealm are open...
+      if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
+	  !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
+	return 1;
+      }
+
+      // add to trace, continue.
       touch_inode(cur);
       if (pdnvec)
 	pdnvec->push_back(dn);
@@ -7615,8 +7818,10 @@ int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBa
 
     if (curdir->is_auth()) {
       // dentry is mine.
-      if (curdir->is_complete() || (curdir->has_bloom() &&
-          !curdir->is_in_bloom(path[depth]))){
+      if (curdir->is_complete() ||
+	  (snapid == CEPH_NOSNAP &&
+	   curdir->has_bloom() &&
+	   !curdir->is_in_bloom(path[depth]))){
         // file not found
 	if (pdnvec) {
 	  // instantiate a null dn?
@@ -7904,7 +8109,7 @@ void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err
       C_IO_MDC_OpenInoBacktraceFetched *fin =
 	new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
       fetch_backtrace(ino, info.pool, fin->bl,
-		      new C_OnFinisher(fin, &mds->finisher));
+		      new C_OnFinisher(fin, mds->finisher));
       return;
     }
   } else if (err == -ENOENT) {
@@ -7916,7 +8121,7 @@ void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err
       C_IO_MDC_OpenInoBacktraceFetched *fin =
 	new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
       fetch_backtrace(ino, info.pool, fin->bl,
-		      new C_OnFinisher(fin, &mds->finisher));
+		      new C_OnFinisher(fin, mds->finisher));
       return;
     }
   }
@@ -7962,7 +8167,7 @@ void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
     _open_ino_traverse_dir(ino, info, 0);
   } else {
     if (ret >= 0) {
-      mds_rank_t checked_rank;
+      mds_rank_t checked_rank = mds_rank_t(ret);
       info.check_peers = true;
       info.auth_hint = checked_rank;
       info.checked.erase(checked_rank);
@@ -8144,7 +8349,7 @@ void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
     C_IO_MDC_OpenInoBacktraceFetched *fin =
       new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
     fetch_backtrace(ino, info.pool, fin->bl,
-		    new C_OnFinisher(fin, &mds->finisher));
+		    new C_OnFinisher(fin, mds->finisher));
   } else {
     assert(!info.ancestors.empty());
     info.checking = mds->get_nodeid();
@@ -8347,7 +8552,7 @@ void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_
   fip.tid = tid;
   fip.fin = c;
   fip.hint = hint;
-  fip.checked.insert(mds->whoami);
+  fip.checked.insert(mds->get_nodeid());
   _do_find_ino_peer(fip);
 }
 
@@ -8368,7 +8573,7 @@ void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
     fip.hint = MDS_RANK_NONE;
   } else {
     for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
-      if (*p != mds->whoami &&
+      if (*p != mds->get_nodeid() &&
 	  fip.checked.count(*p) == 0) {
 	m = *p;
 	break;
@@ -8829,6 +9034,7 @@ void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool
   bufferlist snapbl;
   in->snaprealm->build_snap_trace(snapbl);
 
+  set<SnapRealm*> past_children;
   map<client_t, MClientSnap*> updates;
   list<SnapRealm*> q;
   q.push_back(in->snaprealm);
@@ -8853,6 +9059,13 @@ void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool
       }
     }
 
+    if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
+      for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
+	   p != realm->open_past_children.end();
+	   ++p)
+	past_children.insert(*p);
+    }
+
     // notify for active children, too.
     dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
     for (set<SnapRealm*>::iterator p = realm->open_children.begin();
@@ -8863,6 +9076,43 @@ void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool
 
   if (!nosend)
     send_snaps(updates);
+
+  // notify past children and their descendants if we update/delete old snapshots
+  for (set<SnapRealm*>::iterator p = past_children.begin();
+       p !=  past_children.end();
+       ++p)
+    q.push_back(*p);
+
+  while (!q.empty()) {
+    SnapRealm *realm = q.front();
+    q.pop_front();
+
+    realm->invalidate_cached_snaps();
+
+    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
+	 p != realm->open_children.end();
+	 ++p) {
+      if (past_children.count(*p) == 0)
+	q.push_back(*p);
+    }
+
+    for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
+	 p != realm->open_past_children.end();
+	 ++p) {
+      if (past_children.count(*p) == 0) {
+	q.push_back(*p);
+	past_children.insert(*p);
+      }
+    }
+  }
+
+  if (snapop == CEPH_SNAP_OP_DESTROY) {
+    // eval stray inodes if we delete snapshot from their past ancestor snaprealm
+    for (set<SnapRealm*>::iterator p = past_children.begin();
+	p != past_children.end();
+	++p)
+      maybe_eval_stray((*p)->inode, true);
+  }
 }
 
 void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
@@ -8884,9 +9134,10 @@ void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CIno
   ::decode(seq, p);
 
   in->open_snaprealm();
-  in->snaprealm->open = true;
   in->snaprealm->srnode.seq = seq;
   in->snaprealm->srnode.created = seq;
+  bool ok = in->snaprealm->_open_parents(NULL);
+  assert(ok);
 
   do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
 
@@ -8935,488 +9186,73 @@ void MDCache::scan_stray_dir(dirfrag_t next)
     for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
       CDentry *dn = q->second;
       CDentry::linkage_t *dnl = dn->get_projected_linkage();
-      num_strays++;
-      if (dnl->is_primary())
+      stray_manager.notify_stray_created();
+      if (dnl->is_primary()) {
 	maybe_eval_stray(dnl->get_inode());
-    }
-  }
-}
-
-struct C_MDC_EvalStray : public MDCacheContext {
-  CDentry *dn;
-  C_MDC_EvalStray(MDCache *c, CDentry *d) : MDCacheContext(c), dn(d) {}
-  void finish(int r) {
-    mdcache->eval_stray(dn);
-  }
-};
-
-void MDCache::eval_stray(CDentry *dn, bool delay)
-{
-  dout(10) << "eval_stray " << *dn << dendl;
-  CDentry::linkage_t *dnl = dn->get_projected_linkage();
-  dout(10) << " inode is " << *dnl->get_inode() << dendl;
-  assert(dnl->is_primary());
-  CInode *in = dnl->get_inode();
-  assert(in);
-
-  assert(dn->get_dir()->get_inode()->is_stray());
-
-  if (!dn->is_auth()) {
-    // has to be mine
-    // move to bottom of lru so that we trim quickly!
-    touch_dentry_bottom(dn);
-    return;
-  }
-
-  // purge?
-  if (in->inode.nlink == 0) {
-      // past snaprealm parents imply snapped dentry remote links.
-      // only important for directories.  normal file data snaps are handled
-      // by the object store.
-    if (in->snaprealm && in->snaprealm->has_past_parents()) {
-      if (!in->snaprealm->have_past_parents_open() &&
-	  !in->snaprealm->open_parents(new C_MDC_EvalStray(this, dn)))
-	return;
-      in->snaprealm->prune_past_parents();
-    }
-    if (in->is_dir()) {
-      if (in->snaprealm && in->snaprealm->has_past_parents()) {
-	dout(20) << "  directory has past parents " << in->snaprealm->srnode.past_parents << dendl;
-	return;  // not until some snaps are deleted.
       }
-      if (in->has_dirfrags()) {
-	list<CDir*> ls;
-	in->get_nested_dirfrags(ls);
-	for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
-	  (*p)->try_remove_dentries_for_stray();
-      }
-    }
-    if (dn->is_replicated()) {
-      dout(20) << " replicated" << dendl;
-      return;
-    }
-    if (dn->is_any_leases() || in->is_any_caps()) {
-      dout(20) << " caps | leases" << dendl;
-      return;  // wait
-    }
-    if (dn->state_test(CDentry::STATE_PURGING)) {
-      dout(20) << " already purging" << dendl;
-      return;  // already purging
     }
-    if (in->state_test(CInode::STATE_NEEDSRECOVER) ||
-	in->state_test(CInode::STATE_RECOVERING)) {
-      dout(20) << " pending recovery" << dendl;
-      return;  // don't mess with file size probing
-    }
-    if (in->get_num_ref() > (int)in->is_dirty() + (int)in->is_dirty_parent()) {
-      dout(20) << " too many inode refs" << dendl;
-      return;
-    }
-    if (dn->get_num_ref() > (int)dn->is_dirty() + !!in->get_num_ref()) {
-      dout(20) << " too many dn refs" << dendl;
-      return;
-    }
-    if (delay) {
-      if (!dn->item_stray.is_on_list()) {
-	delayed_eval_stray.push_back(&dn->item_stray);
-	num_strays_delayed++;
-	logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
-      }
-    } else if (in->snaprealm && in->snaprealm->has_past_parents()) {
-      assert(!in->is_dir());
-      dout(20) << " file has past parents " << in->snaprealm->srnode.past_parents << dendl;
-      if (in->is_file() && in->get_projected_inode()->size > 0)
-	truncate_stray(dn); // truncate head objects
-    } else {
-      if (in->is_dir())
-	in->close_dirfrags();
-      purge_stray(dn);
-    }
-  }
-  else if (in->inode.nlink >= 1) {
-    // trivial reintegrate?
-    if (!in->remote_parents.empty()) {
-      CDentry *rlink = *in->remote_parents.begin();
-      
-      // don't do anything if the remote parent is projected, or we may
-      // break user-visible semantics!
-      // NOTE: we repeat this check in _rename(), since our submission path is racey.
-      if (!rlink->is_projected()) {
-	if (rlink->is_auth() && rlink->dir->can_auth_pin())
-	  reintegrate_stray(dn, rlink);
-	
-	if (!rlink->is_auth() && dn->is_auth())
-	  migrate_stray(dn, rlink->authority().first);
-      }
-    }
-  } else {
-    // wait for next use.
   }
 }
 
-void MDCache::eval_remote(CDentry *dn)
-{
-  dout(10) << "eval_remote " << *dn << dendl;
-  CDentry::linkage_t *dnl = dn->get_projected_linkage();
-  assert(dnl->is_remote());
-  CInode *in = dnl->get_inode();
-  if (!in) return;
-
-  // refers to stray?
-  if (in->get_parent_dn()->get_dir()->get_inode()->is_stray()) {
-    if (in->is_auth())
-      eval_stray(in->get_parent_dn());
-    else
-      migrate_stray(in->get_parent_dn(), mds->get_nodeid());
-  }
-}
-
-void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
-{
-  object_t oid = CInode::get_object_name(ino, frag_t(), "");
-  mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
-}
-
-class C_IO_MDC_PurgeStrayPurged : public MDCacheIOContext {
-  CDentry *dn;
-  bool only_head;
-public:
-  C_IO_MDC_PurgeStrayPurged(MDCache *c, CDentry *d, bool oh) :
-    MDCacheIOContext(c), dn(d), only_head(oh) { }
-  void finish(int r) {
-    assert(r == 0 || r == -ENOENT);
-    mdcache->_purge_stray_purged(dn, only_head);
-  }
-};
-
-void MDCache::truncate_stray(CDentry *dn)
+/**
+ * If a remote dentry refers to an inode whose primary
+ * dentry is a stray, then evaluate the inode for purging if
+ * we have the auth copy, or migrate the stray to use if we
+ * do not.
+ */
+void MDCache::eval_remote(CDentry *remote_dn)
 {
-  CDentry::linkage_t *dnl = dn->get_projected_linkage();
-  CInode *in = dnl->get_inode();
-  dout(10) << "truncate_stray " << *dn << " " << *in << dendl;
-  assert(!dn->is_replicated());
+  assert(remote_dn);
+  dout(10) << __func__ << " " << *remote_dn << dendl;
 
-  dn->state_set(CDentry::STATE_PURGING);
-  dn->get(CDentry::PIN_PURGING);
-  in->state_set(CInode::STATE_PURGING);
-
-  if (dn->item_stray.is_on_list())
-    dn->item_stray.remove_myself();
-
-  C_GatherBuilder gather(
-    g_ceph_context,
-    new C_OnFinisher(new C_IO_MDC_PurgeStrayPurged(this, dn, true),
-		     &mds->finisher));
-
-  SnapRealm *realm = in->find_snaprealm();
-  assert(realm);
-  dout(10) << " realm " << *realm << dendl;
-  const SnapContext *snapc = &realm->get_snap_context();
-
-  uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
-		    (uint64_t)in->inode.layout.fl_stripe_count;
-  uint64_t to = in->inode.get_max_size();
-  to = MAX(in->inode.size, to);
-  // when truncating a file, the filer does not delete stripe objects that are
-  // truncated to zero. so we need to purge stripe objects up to the max size
-  // the file has ever been.
-  to = MAX(in->inode.max_size_ever, to);
-  if (period && to > period) {
-    uint64_t num = (to - 1) / period;
-    dout(10) << "purge_stray 0~" << to << " objects 0~" << num
-      << " snapc " << snapc << " on " << *in << dendl;
-    mds->filer->purge_range(in->ino(), &in->inode.layout, *snapc,
-			    1, num, ceph_clock_now(g_ceph_context),
-			    0, gather.new_sub());
-  }
-
-  // keep backtrace object
-  if (period && to > 0) {
-    mds->filer->zero(in->ino(), &in->inode.layout, *snapc,
-		     0, period, ceph_clock_now(g_ceph_context),
-		     0, true, NULL, gather.new_sub());
-  }
-
-  assert(gather.has_subs());
-  gather.activate();
-}
-
-void MDCache::purge_stray(CDentry *dn)
-{
-  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  CDentry::linkage_t *dnl = remote_dn->get_projected_linkage();
+  assert(dnl->is_remote());
   CInode *in = dnl->get_inode();
-  dout(10) << "purge_stray " << *dn << " " << *in << dendl;
-  assert(!dn->is_replicated());
-
-  dn->state_set(CDentry::STATE_PURGING);
-  dn->get(CDentry::PIN_PURGING);
-  in->state_set(CInode::STATE_PURGING);
 
-  num_strays_purging++;
-  logger->set(l_mdc_num_strays_purging, num_strays_purging);
-
-  if (dn->item_stray.is_on_list()) {
-    dn->item_stray.remove_myself();
-    num_strays_delayed--;
-    logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+  if (!in) {
+    dout(20) << __func__ << ": no inode, cannot evaluate" << dendl;
+    return;
   }
 
-  if (in->is_dirty_parent())
-    in->clear_dirty_parent();
-
-  // CHEAT.  there's no real need to journal our intent to purge, since
-  // that is implicit in the dentry's presence and non-use in the stray
-  // dir.  on recovery, we'll need to re-eval all strays anyway.
-  
-  SnapContext nullsnapc;
-  C_GatherBuilder gather(
-    g_ceph_context,
-    new C_OnFinisher(new C_IO_MDC_PurgeStrayPurged(this, dn, false),
-		     &mds->finisher));
-
-  if (in->is_dir()) {
-    object_locator_t oloc(mds->mdsmap->get_metadata_pool());
-    list<frag_t> ls;
-    if (!in->dirfragtree.is_leaf(frag_t()))
-      in->dirfragtree.get_leaves(ls);
-    ls.push_back(frag_t());
-    for (list<frag_t>::iterator p = ls.begin();
-         p != ls.end();
-         ++p) {
-      object_t oid = CInode::get_object_name(in->inode.ino, *p, "");
-      dout(10) << "purge_stray remove dirfrag " << oid << dendl;
-      mds->objecter->remove(oid, oloc, nullsnapc, ceph_clock_now(g_ceph_context),
-                            0, NULL, gather.new_sub());
-    }
-    assert(gather.has_subs());
-    gather.activate();
+  if (remote_dn->last != CEPH_NOSNAP) {
+    dout(20) << __func__ << ": snap dentry, cannot evaluate" << dendl;
     return;
   }
 
-  const SnapContext *snapc;
-  SnapRealm *realm = in->find_snaprealm();
-  if (realm) {
-    dout(10) << " realm " << *realm << dendl;
-    snapc = &realm->get_snap_context();
-  } else {
-    dout(10) << " NO realm, using null context" << dendl;
-    snapc = &nullsnapc;
-    assert(in->last == CEPH_NOSNAP);
-  }
+  // refers to stray?
+  CDentry *primary_dn = in->get_projected_parent_dn();
+  assert(primary_dn != NULL);
+  if (primary_dn->get_dir()->get_inode()->is_stray()) {
+    if (in->is_auth()) {
+      dout(20) << __func__ << ": have auth for inode, evaluating" << dendl;
 
-  if (in->is_file()) {
-    uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
-		      (uint64_t)in->inode.layout.fl_stripe_count;
-    uint64_t to = in->inode.get_max_size();
-    to = MAX(in->inode.size, to);
-    // when truncating a file, the filer does not delete stripe objects that are
-    // truncated to zero. so we need to purge stripe objects up to the max size
-    // the file has ever been.
-    to = MAX(in->inode.max_size_ever, to);
-    if (to && period) {
-      uint64_t num = (to + period - 1) / period;
-      dout(10) << "purge_stray 0~" << to << " objects 0~" << num
-	       << " snapc " << snapc << " on " << *in << dendl;
-      mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
-			      0, num, ceph_clock_now(g_ceph_context), 0,
-			      gather.new_sub());
+      stray_manager.eval_remote_stray(primary_dn, remote_dn);
+    } else {
+      dout(20) << __func__ << ": do not have auth for inode, migrating " << dendl;
+      /*
+       * Inodes get filed into a stray dentry when a client unlinks
+       * the primary DN for them.  However, that doesn't mean there
+       * isn't a remote DN still in the world.  The remote DN just
+       * ends up pointing at a stray.  Strays can pretty much live
+       * forever in this scenario.
+       *
+       * Therefore, we have a special behaviour here: migrate a stray
+       * to <me> when <I> handle a client request with a trace referring
+       * to a stray inode on another MDS.
+       */
+      stray_manager.migrate_stray(primary_dn, mds->get_nodeid());
     }
-  }
-
-  inode_t *pi = in->get_projected_inode();
-  object_t oid = CInode::get_object_name(pi->ino, frag_t(), "");
-  // remove the backtrace object if it was not purged
-  if (!gather.has_subs()) {
-    object_locator_t oloc(pi->layout.fl_pg_pool);
-    dout(10) << "purge_stray remove backtrace object " << oid
-	     << " pool " << oloc.pool << " snapc " << snapc << dendl;
-    mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
-			  NULL, gather.new_sub());
-  }
-  // remove old backtrace objects
-  for (vector<int64_t>::iterator p = pi->old_pools.begin();
-       p != pi->old_pools.end();
-       ++p) {
-    object_locator_t oloc(*p);
-    dout(10) << "purge_stray remove backtrace object " << oid
-	     << " old pool " << *p << " snapc " << snapc << dendl;
-    mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
-			  NULL, gather.new_sub());
-  }
-  assert(gather.has_subs());
-  gather.activate();
-}
-
-class C_MDC_PurgeStrayLogged : public MDCacheContext {
-  CDentry *dn;
-  version_t pdv;
-  LogSegment *ls;
-public:
-  C_MDC_PurgeStrayLogged(MDCache *c, CDentry *d, version_t v, LogSegment *s) : 
-    MDCacheContext(c), dn(d), pdv(v), ls(s) { }
-  void finish(int r) {
-    mdcache->_purge_stray_logged(dn, pdv, ls);
-  }
-};
-class C_MDC_PurgeStrayLoggedTruncate : public MDCacheContext {
-  CDentry *dn;
-  LogSegment *ls;
-public:
-  C_MDC_PurgeStrayLoggedTruncate(MDCache *c, CDentry *d, LogSegment *s) : 
-    MDCacheContext(c), dn(d), ls(s) { }
-  void finish(int r) {
-    mdcache->_purge_stray_logged_truncate(dn, ls);
-  }
-};
-
-void MDCache::_purge_stray_purged(CDentry *dn, bool only_head)
-{
-  CInode *in = dn->get_projected_linkage()->get_inode();
-  dout(10) << "_purge_stray_purged " << *dn << " " << *in << dendl;
-
-  if (!only_head &&
-      in->get_num_ref() == (int)in->is_dirty() &&
-      dn->get_num_ref() == (int)dn->is_dirty() + !!in->get_num_ref() + 1/*PIN_PURGING*/) {
-    // kill dentry.
-    version_t pdv = dn->pre_dirty();
-    dn->push_projected_linkage(); // NULL
-
-    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
-    mds->mdlog->start_entry(le);
-
-    // update dirfrag fragstat, rstat
-    CDir *dir = dn->get_dir();
-    fnode_t *pf = dir->project_fnode();
-    pf->version = dir->pre_dirty();
-    if (in->is_dir())
-      pf->fragstat.nsubdirs--;
-    else
-      pf->fragstat.nfiles--;
-    pf->rstat.sub(in->inode.accounted_rstat);
-
-    le->metablob.add_dir_context(dn->dir);
-    EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true);
-    le->metablob.add_null_dentry(dl, dn, true);
-    le->metablob.add_destroyed_inode(in->ino());
-
-    mds->mdlog->submit_entry(le, new C_MDC_PurgeStrayLogged(this, dn, pdv, mds->mdlog->get_current_segment()));
-
-    num_strays_purging--;
-    num_strays--;
-    logger->set(l_mdc_num_strays, num_strays);
-    logger->set(l_mdc_num_strays_purging, num_strays_purging);
-    logger->inc(l_mdc_strays_purged);
   } else {
-    // new refs.. just truncate to 0
-    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate");
-    mds->mdlog->start_entry(le);
-    
-    inode_t *pi = in->project_inode();
-    pi->size = 0;
-    pi->max_size_ever = 0;
-    pi->client_ranges.clear();
-    pi->truncate_size = 0;
-    pi->truncate_from = 0;
-    pi->version = in->pre_dirty();
-
-    le->metablob.add_dir_context(dn->dir);
-    le->metablob.add_primary_dentry(dn, in, true);
-
-    mds->mdlog->submit_entry(le, new C_MDC_PurgeStrayLoggedTruncate(this, dn, mds->mdlog->get_current_segment()));
+    dout(20) << __func__ << ": inode's primary dn not stray" << dendl;
   }
 }
 
-void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
-{
-  CInode *in = dn->get_linkage()->get_inode();
-  dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
-
-  assert(!in->state_test(CInode::STATE_RECOVERING));
-
-  // unlink
-  assert(dn->get_projected_linkage()->is_null());
-  dn->dir->unlink_inode(dn);
-  dn->pop_projected_linkage();
-  dn->mark_dirty(pdv, ls);
-
-  dn->dir->pop_and_dirty_projected_fnode(ls);
-
-  in->state_clear(CInode::STATE_ORPHAN);
-  dn->state_clear(CDentry::STATE_PURGING);
-  dn->put(CDentry::PIN_PURGING);
-
-  // drop inode
-  if (in->is_dirty())
-    in->mark_clean();
-
-  remove_inode(in);
-
-  // drop dentry?
-  if (dn->is_new()) {
-    dout(20) << " dn is new, removing" << dendl;
-    dn->mark_clean();
-    dn->dir->remove_dentry(dn);
-  } else
-    touch_dentry_bottom(dn);  // drop dn as quickly as possible.
-}
-
-void MDCache::_purge_stray_logged_truncate(CDentry *dn, LogSegment *ls)
-{
-  CInode *in = dn->get_projected_linkage()->get_inode();
-  dout(10) << "_purge_stray_logged_truncate " << *dn << " " << *in << dendl;
-
-  dn->state_clear(CDentry::STATE_PURGING);
-  dn->put(CDentry::PIN_PURGING);
-
-  in->pop_and_dirty_projected_inode(ls);
-
-  eval_stray(dn);
-}
-
-void MDCache::reintegrate_stray(CDentry *straydn, CDentry *rdn)
+void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
 {
-  dout(10) << "reintegrate_stray " << *straydn << " into " << *rdn << dendl;
-  
-  // rename it to another mds.
-  filepath src;
-  straydn->make_path(src);
-  filepath dst;
-  rdn->make_path(dst);
-
-  MClientRequest *req = new MClientRequest(CEPH_MDS_OP_RENAME);
-  req->set_filepath(dst);
-  req->set_filepath2(src);
-  req->set_tid(mds->issue_tid());
-
-  mds->send_message_mds(req, rdn->authority().first);
+  object_t oid = CInode::get_object_name(ino, frag_t(), "");
+  mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
 }
- 
-
-void MDCache::migrate_stray(CDentry *dn, mds_rank_t to)
-{
-  CInode *in = dn->get_linkage()->get_inode();
-  assert(in);
-  CInode *diri = dn->dir->get_inode();
-  assert(diri->is_stray());
-  dout(10) << "migrate_stray from mds." << MDS_INO_STRAY_OWNER(diri->inode.ino)
-	   << " to mds." << to
-	   << " " << *dn << " " << *in << dendl;
-
-  // rename it to another mds.
-  filepath src;
-  dn->make_path(src);
 
-  string dname;
-  in->name_stray_dentry(dname);
-  filepath dst(dname, MDS_INO_STRAY(to, 0));
-
-  MClientRequest *req = new MClientRequest(CEPH_MDS_OP_RENAME);
-  req->set_filepath(dst);
-  req->set_filepath2(src);
-  req->set_tid(mds->issue_tid());
-
-  mds->send_message_mds(req, to);
-}
 
 
 
@@ -10178,7 +10014,7 @@ int MDCache::send_dir_updates(CDir *dir, bool bcast)
   if (bcast) {
     mds->get_mds_map()->get_active_mds_set(who);
   } else {
-    for (map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
+    for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
 	 p != dir->replicas_end();
 	 ++p)
       who.insert(p->first);
@@ -10259,8 +10095,8 @@ void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
   dout(7) << "send_dentry_link " << *dn << dendl;
 
   CDir *subtree = get_subtree_root(dn->get_dir());
-  for (map<mds_rank_t,unsigned>::iterator p = dn->replicas_begin();
-       p != dn->replicas_end(); 
+  for (compact_map<mds_rank_t,unsigned>::iterator p = dn->replicas_begin();
+       p != dn->replicas_end();
        ++p) {
     // don't tell (rename) witnesses; they already know
     if (mdr.get() && mdr->more()->witnessed.count(p->first))
@@ -10340,26 +10176,26 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& md
 {
   dout(10) << "send_dentry_unlink " << *dn << dendl;
   // share unlink news with replicas
-  map<mds_rank_t,unsigned> replicas;
-  replicas.insert(dn->replicas_begin(), dn->replicas_end());
+  set<mds_rank_t> replicas;
+  dn->list_replicas(replicas);
   if (straydn)
-    replicas.insert(straydn->replicas_begin(), straydn->replicas_end());
-  for (map<mds_rank_t,unsigned>::iterator it = replicas.begin();
+    straydn->list_replicas(replicas);
+  for (set<mds_rank_t>::iterator it = replicas.begin();
        it != replicas.end();
        ++it) {
     // don't tell (rmdir) witnesses; they already know
-    if (mdr.get() && mdr->more()->witnessed.count(it->first))
+    if (mdr.get() && mdr->more()->witnessed.count(*it))
       continue;
 
-    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
-	(mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
-	 rejoin_gather.count(it->first)))
+    if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
+	(mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
+	 rejoin_gather.count(*it)))
       continue;
 
     MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
     if (straydn)
-      replicate_stray(straydn, it->first, unlink->straybl);
-    mds->send_message_mds(unlink, it->first);
+      replicate_stray(straydn, *it, unlink->straybl);
+    mds->send_message_mds(unlink, *it);
   }
 }
 
@@ -11174,8 +11010,8 @@ void MDCache::_fragment_stored(MDRequestRef& mdr)
 
   // tell peers
   CDir *first = *info.resultfrags.begin();
-  for (map<mds_rank_t,unsigned>::iterator p = first->replicas_begin();
-       p != first->replica_map.end();
+  for (compact_map<mds_rank_t,unsigned>::iterator p = first->replicas_begin();
+       p != first->replicas_end();
        ++p) {
     if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
 	(mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
@@ -11236,7 +11072,7 @@ void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrag
     g_ceph_context,
     new C_OnFinisher(
       new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
-      &mds->finisher));
+      mds->finisher));
 
   SnapContext nullsnapc;
   object_locator_t oloc(mds->mdsmap->get_metadata_pool());
@@ -11672,63 +11508,124 @@ void MDCache::show_cache()
   }
 }
 
+void MDCache::dump_cache(std::string const &file_name)
+{
+  dump_cache(file_name.c_str(), NULL);
+}
+
+void MDCache::dump_cache(Formatter *f)
+{
+  dump_cache(NULL, f);
+}
 
-void MDCache::dump_cache(const char *fn)
+/**
+ * Dump the metadata cache, either to a Formatter, if
+ * provided, else to a plain text file.
+ */
+void MDCache::dump_cache(const char *fn, Formatter *f)
 {
-  int r;
-  char deffn[200];
-  if (!fn) {
-    snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
-    fn = deffn;
-  }
+  int r = 0;
+  int fd = -1;
 
-  dout(1) << "dump_cache to " << fn << dendl;
+  if (f) {
+    f->open_array_section("inodes");
+  } else {
+    char deffn[200];
+    if (!fn) {
+      snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
+      fn = deffn;
+    }
 
-  int fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
-  if (fd < 0) {
-    derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
-    return;
+    dout(1) << "dump_cache to " << fn << dendl;
+
+    fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
+    if (fd < 0) {
+      derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
+      return;
+    }
   }
   
   for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
        it != inode_map.end();
        ++it) {
     CInode *in = it->second;
-    ostringstream ss;
-    ss << *in << std::endl;
-    std::string s = ss.str();
-    r = safe_write(fd, s.c_str(), s.length());
-    if (r < 0)
-      goto out;
+    if (f) {
+      f->open_object_section("inode");
+      in->dump(f);
+    } else {
+      ostringstream ss;
+      ss << *in << std::endl;
+      std::string s = ss.str();
+      r = safe_write(fd, s.c_str(), s.length());
+      if (r < 0) {
+        goto out;
+      }
+    }
 
     list<CDir*> dfs;
     in->get_dirfrags(dfs);
+    if (f) {
+      f->open_array_section("dirfrags");
+    }
     for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
       CDir *dir = *p;
-      ostringstream tt;
-      tt << " " << *dir << std::endl;
-      string t = tt.str();
-      r = safe_write(fd, t.c_str(), t.length());
-      if (r < 0)
-	goto out;
+      if (f) {
+        f->open_object_section("dir");
+        dir->dump(f);
+      } else {
+        ostringstream tt;
+        tt << " " << *dir << std::endl;
+        string t = tt.str();
+        r = safe_write(fd, t.c_str(), t.length());
+        if (r < 0) {
+          goto out;
+        }
+      }
       
+      if (f) {
+        f->open_array_section("dentries");
+      }
       for (CDir::map_t::iterator q = dir->items.begin();
 	   q != dir->items.end();
 	   ++q) {
 	CDentry *dn = q->second;
-	ostringstream uu;
-	uu << "  " << *dn << std::endl;
-	string u = uu.str();
-	r = safe_write(fd, u.c_str(), u.length());
-	if (r < 0)
-	  goto out;
+        if (f) {
+	  f->open_object_section("dentry");
+          dn->dump(f);
+          f->close_section();
+        } else {
+          ostringstream uu;
+          uu << "  " << *dn << std::endl;
+          string u = uu.str();
+          r = safe_write(fd, u.c_str(), u.length());
+          if (r < 0) {
+            goto out;
+          }
+        }
+      }
+      if (f) {
+	f->close_section();  //dentries
       }
       dir->check_rstats();
+      if (f) {
+	f->close_section();  //dir
+      }
+    }
+    if (f) {
+      f->close_section();  // dirfrags
+    }
+
+    if (f) {
+      f->close_section();  // inode
     }
   }
 
  out:
-  ::close(fd);
+  if (f) {
+    f->close_section();  // inodes
+  } else {
+    ::close(fd);
+  }
 }
 
 
@@ -11815,11 +11712,11 @@ void MDCache::flush_dentry(const string& path, Context *fin)
 
 class C_FinishIOMDR : public MDSInternalContextBase {
 protected:
-  MDS *mds;
+  MDSRank *mds;
   MDRequestRef mdr;
-  MDS *get_mds() { return mds; }
+  MDSRank *get_mds() { return mds; }
 public:
-  C_FinishIOMDR(MDS *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
+  C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
   void finish(int r) { mds->server->respond_to_request(mdr, r); }
 };
 
@@ -11849,21 +11746,69 @@ void MDCache::register_perfcounters()
             "mds_cache", l_mdc_first, l_mdc_last);
 
     /* Stray/purge statistics */
-    pcb.add_u64(l_mdc_num_strays, "num_strays");
-    pcb.add_u64(l_mdc_num_strays_purging, "num_strays_purging");
-    pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed");
-    pcb.add_u64_counter(l_mdc_strays_created, "strays_created");
-    pcb.add_u64_counter(l_mdc_strays_purged, "strays_purged");
+    pcb.add_u64(l_mdc_num_strays, "num_strays",
+        "Stray dentries", "stry");
+    pcb.add_u64(l_mdc_num_strays_purging, "num_strays_purging", "Stray dentries purging");
+    pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
+    pcb.add_u64(l_mdc_num_purge_ops, "num_purge_ops", "Purge operations");
+    pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
+    pcb.add_u64_counter(l_mdc_strays_purged, "strays_purged",
+        "Stray dentries purged", "purg");
+    pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
+    pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
 
     /* Recovery queue statistics */
-    pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing");
-    pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued");
-    pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized");
-    pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started");
-    pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed");
+    pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
+    pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
+        "Files waiting for recovery", "recy");
+    pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
+    pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
+    pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
+        "File recoveries completed", "recd");
 
     logger = pcb.create_perf_counters();
     g_ceph_context->get_perfcounters_collection()->add(logger);
     recovery_queue.set_logger(logger);
+    stray_manager.set_logger(logger);
+}
+
+/**
+ * Call this when putting references to an inode/dentry or
+ * when attempting to trim it.
+ *
+ * If this inode is no longer linked by anyone, and this MDS
+ * rank holds the primary dentry, and that dentry is in a stray
+ * directory, then give up the dentry to the StrayManager, never
+ * to be seen again by MDCache.
+ *
+ * @param delay if true, then purgeable inodes are stashed til
+ *              the next trim(), rather than being purged right
+ *              away.
+ */
+void MDCache::maybe_eval_stray(CInode *in, bool delay) {
+  if (in->inode.nlink > 0 || in->is_base() || is_readonly() || mds->is_standby_replay())
+    return;
+  CDentry *dn = in->get_projected_parent_dn();
+
+  if (dn->state_test(CDentry::STATE_PURGING)) {
+    /* We have already entered the purging process, no need
+     * to re-evaluate me ! */
+    return;
+  }
+
+  if (dn->get_projected_linkage()->is_primary() &&
+      dn->get_dir()->get_inode()->is_stray()) {
+    stray_manager.eval_stray(dn, delay);
+  }
+}
+
+void MDCache::notify_mdsmap_changed()
+{
+  stray_manager.update_op_limit();
+}
+
+void MDCache::notify_osdmap_changed()
+{
+  stray_manager.update_op_limit();
 }
 
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index c31ffdb..c369acd 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -21,20 +21,23 @@
 #include "include/filepath.h"
 #include "include/elist.h"
 
+#include "osdc/Filer.h"
 #include "CInode.h"
 #include "CDentry.h"
 #include "CDir.h"
 #include "include/Context.h"
 #include "events/EMetaBlob.h"
 #include "RecoveryQueue.h"
+#include "StrayManager.h"
 #include "MDSContext.h"
+#include "MDSMap.h"
 
 #include "messages/MClientRequest.h"
 #include "messages/MMDSSlaveRequest.h"
 
 class PerfCounters;
 
-class MDS;
+class MDSRank;
 class Session;
 class Migrator;
 
@@ -74,16 +77,22 @@ struct MDSlaveUpdate;
 
 enum {
   l_mdc_first = 3000,
-  // How many dentries are currently in stray dirs
+  // How many inodes currently in stray dentries
   l_mdc_num_strays,
   // How many stray dentries are currently being purged
   l_mdc_num_strays_purging,
   // How many stray dentries are currently delayed for purge due to refs
   l_mdc_num_strays_delayed,
+  // How many purge RADOS ops might currently be in flight?
+  l_mdc_num_purge_ops,
   // How many dentries have ever been added to stray dir
   l_mdc_strays_created,
   // How many dentries have ever finished purging from stray dir
   l_mdc_strays_purged,
+  // How many strays have been reintegrated?
+  l_mdc_strays_reintegrated,
+  // How many strays have been migrated?
+  l_mdc_strays_migrated,
 
   // How many inode sizes currently being recovered
   l_mdc_num_recovering_processing,
@@ -108,7 +117,7 @@ static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easie
 class MDCache {
  public:
   // my master
-  MDS *mds;
+  MDSRank *mds;
 
   // -- my cache --
   LRU lru;   // dentry lru for expiring items from cache
@@ -131,10 +140,23 @@ class MDCache {
 
   PerfCounters *logger;
 
+  Filer filer;
+
 public:
   void advance_stray() {
     stray_index = (stray_index+1)%NUM_STRAY;
   }
+
+  /**
+   * Call this when you know that a CDentry is ready to be passed
+   * on to StrayManager (i.e. this is a stray you've just created)
+   */
+  void notify_stray(CDentry *dn) {
+    assert(dn->get_dir()->get_inode()->is_stray());
+    stray_manager.eval_stray(dn);
+  }
+
+  void maybe_eval_stray(CInode *in, bool delay=false);
   bool is_readonly() { return readonly; }
   void force_readonly();
 
@@ -143,12 +165,11 @@ public:
   int num_inodes_with_caps;
   int num_caps;
 
-  uint64_t num_strays;
-  uint64_t num_strays_purging;
-  uint64_t num_strays_delayed;
-
   unsigned max_dir_commit_size;
 
+  static ceph_file_layout gen_default_file_layout(const MDSMap &mdsmap);
+  static ceph_file_layout gen_default_log_layout(const MDSMap &mdsmap);
+
   ceph_file_layout default_file_layout;
   ceph_file_layout default_log_layout;
 
@@ -166,6 +187,16 @@ public:
     r->ttl = ttl;
   }
 
+  void notify_stray_removed()
+  {
+    stray_manager.notify_stray_removed();
+  }
+
+  void notify_stray_created()
+  {
+    stray_manager.notify_stray_created();
+  }
+
   // -- client caps --
   uint64_t              last_cap_id;
   
@@ -430,7 +461,7 @@ public:
   }
   void cancel_ambiguous_import(CDir *);
   void finish_ambiguous_import(dirfrag_t dirino);
-  void resolve_start();
+  void resolve_start(MDSInternalContext *resolve_done_);
   void send_resolves();
   void send_slave_resolves();
   void send_subtree_resolves();
@@ -489,8 +520,10 @@ protected:
     if (rejoins_pending)
       rejoin_send_rejoins();
   }
+  MDSInternalContext *rejoin_done;
+  MDSInternalContext *resolve_done;
 public:
-  void rejoin_start();
+  void rejoin_start(MDSInternalContext *rejoin_done_);
   void rejoin_gather_finish();
   void rejoin_send_rejoins();
   void rejoin_export_caps(inodeno_t ino, client_t client, ceph_mds_cap_reconnect& capinfo,
@@ -569,6 +602,9 @@ public:
   friend class Migrator;
   friend class MDBalancer;
 
+  // StrayManager needs to be able to remove_inode() from us
+  // when it is done purging
+  friend class StrayManager;
 
   // File size recovery
 private:
@@ -584,7 +620,7 @@ public:
   Migrator *migrator;
 
  public:
-  MDCache(MDS *m);
+  MDCache(MDSRank *m);
   ~MDCache();
   
   // debug
@@ -615,6 +651,25 @@ public:
 	   uncommitted_slave_rename_olddir.count(dir->inode) == 0;
   }
 
+  /**
+   * For all unreferenced inodes, dirs, dentries below an inode, compose
+   * expiry messages.  This is used when giving up all replicas of entities
+   * for an MDS peer in the 'stopping' state, such that the peer can
+   * empty its cache and finish shutting down.
+   *
+   * We have to make sure we're only expiring un-referenced items to
+   * avoid interfering with ongoing stray-movement (we can't distinguish
+   * between the "moving my strays" and "waiting for my cache to empty"
+   * phases within 'stopping')
+   *
+   * @return false if we completed cleanly, true if caller should stop
+   *         expiring because we hit something with refs.
+   */
+  bool expire_recursive(
+    CInode *in,
+    std::map<mds_rank_t, MCacheExpire*>& expiremap,
+    CDir *subtree);
+
   void trim_client_leases();
   void check_memory_usage();
 
@@ -839,7 +894,8 @@ protected:
     int64_t pool;
     list<MDSInternalContextBase*> waiters;
     open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
-      check_peers(true), fetch_backtrace(true), discover(false) {}
+      check_peers(true), fetch_backtrace(true), discover(false),
+      want_replica(false), want_xlocked(false), tid(0), pool(-1) {}
   };
   ceph_tid_t open_ino_last_tid;
   map<inodeno_t,open_ino_info_t> opening_inodes;
@@ -894,38 +950,14 @@ public:
 
   // -- stray --
 public:
-  elist<CDentry*> delayed_eval_stray;
-
-  void eval_stray(CDentry *dn, bool delay=false);
   void eval_remote(CDentry *dn);
-
-  void maybe_eval_stray(CInode *in, bool delay=false) {
-    if (in->inode.nlink > 0 || in->is_base() || is_readonly())
-      return;
-    CDentry *dn = in->get_projected_parent_dn();
-    if (!dn->state_test(CDentry::STATE_PURGING) &&
-	dn->get_projected_linkage()->is_primary() &&
-	dn->get_dir()->get_inode()->is_stray())
-      eval_stray(dn, delay);
-  }
-
   void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
 
 protected:
   void scan_stray_dir(dirfrag_t next=dirfrag_t());
-  void truncate_stray(CDentry *dn);
-  void purge_stray(CDentry *dn);
-  void _purge_stray_purged(CDentry *dn, bool only_head);
-  void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
-  void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls);
+  StrayManager stray_manager;
   friend struct C_MDC_RetryScanStray;
   friend class C_IO_MDC_FetchedBacktrace;
-  friend class C_MDC_PurgeStrayLogged;
-  friend class C_MDC_PurgeStrayLoggedTruncate;
-  friend class C_IO_MDC_PurgeStrayPurged;
-  void reintegrate_stray(CDentry *dn, CDentry *rlink);
-  void migrate_stray(CDentry *dn, mds_rank_t dest);
-
 
   // == messages ==
  public:
@@ -994,7 +1026,7 @@ private:
     utime_t last_cum_auth_pins_change;
     int last_cum_auth_pins;
     int num_remote_waiters;	// number of remote authpin waiters
-    fragment_info_t() : all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {}
+    fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {}
     bool is_fragmenting() { return !resultfrags.empty(); }
   };
   map<dirfrag_t,fragment_info_t> fragments;
@@ -1061,11 +1093,19 @@ public:
   void process_delayed_expire(CDir *dir);
   void discard_delayed_expire(CDir *dir);
 
+  void notify_mdsmap_changed();
+  void notify_osdmap_changed();
+
+protected:
+  void dump_cache(const char *fn, Formatter *f);
+public:
+  void dump_cache() {dump_cache(NULL, NULL);}
+  void dump_cache(const std::string &filename);
+  void dump_cache(Formatter *f);
 
   // == crap fns ==
  public:
   void show_cache();
-  void dump_cache(const char *fn=0);
   void show_subtrees(int dbl=10);
 
   CInode *hack_pick_random_inode() {
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index d8c634e..43cdc48 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -12,8 +12,8 @@
  * 
  */
 
+#include "MDSRank.h"
 #include "MDLog.h"
-#include "MDS.h"
 #include "MDCache.h"
 #include "LogEvent.h"
 #include "MDSContext.h"
@@ -53,37 +53,45 @@ void MDLog::create_logger()
 {
   PerfCountersBuilder plb(g_ceph_context, "mds_log", l_mdl_first, l_mdl_last);
 
-  plb.add_u64_counter(l_mdl_evadd, "evadd");
-  plb.add_u64_counter(l_mdl_evex, "evex");
-  plb.add_u64_counter(l_mdl_evtrm, "evtrm");
-  plb.add_u64(l_mdl_ev, "ev");
-  plb.add_u64(l_mdl_evexg, "evexg");
-  plb.add_u64(l_mdl_evexd, "evexd");
-
-  plb.add_u64_counter(l_mdl_segadd, "segadd");
-  plb.add_u64_counter(l_mdl_segex, "segex");
-  plb.add_u64_counter(l_mdl_segtrm, "segtrm");
-  plb.add_u64(l_mdl_seg, "seg");
-  plb.add_u64(l_mdl_segexg, "segexg");
-  plb.add_u64(l_mdl_segexd, "segexd");
-
-  plb.add_u64(l_mdl_expos, "expos");
-  plb.add_u64(l_mdl_wrpos, "wrpos");
-  plb.add_u64(l_mdl_rdpos, "rdpos");
-  plb.add_u64(l_mdl_jlat, "jlat");
+  plb.add_u64_counter(l_mdl_evadd, "evadd",
+      "Events submitted", "subm");
+  plb.add_u64_counter(l_mdl_evex, "evex", "Total expired events");
+  plb.add_u64_counter(l_mdl_evtrm, "evtrm", "Trimmed events");
+  plb.add_u64(l_mdl_ev, "ev",
+      "Events", "evts");
+  plb.add_u64(l_mdl_evexg, "evexg", "Expiring events");
+  plb.add_u64(l_mdl_evexd, "evexd", "Current expired events");
+
+  plb.add_u64_counter(l_mdl_segadd, "segadd", "Segments added");
+  plb.add_u64_counter(l_mdl_segex, "segex", "Total expired segments");
+  plb.add_u64_counter(l_mdl_segtrm, "segtrm", "Trimmed segments");
+  plb.add_u64(l_mdl_seg, "seg",
+      "Segments", "segs");
+  plb.add_u64(l_mdl_segexg, "segexg", "Expiring segments");
+  plb.add_u64(l_mdl_segexd, "segexd", "Current expired segments");
+
+  plb.add_u64(l_mdl_expos, "expos", "Journaler xpire position");
+  plb.add_u64(l_mdl_wrpos, "wrpos", "Journaler  write position");
+  plb.add_u64(l_mdl_rdpos, "rdpos", "Journaler  read position");
+  plb.add_u64(l_mdl_jlat, "jlat", "Journaler flush latency");
 
   // logger
   logger = plb.create_perf_counters();
   g_ceph_context->get_perfcounters_collection()->add(logger);
 }
 
+void MDLog::set_write_iohint(unsigned iohint_flags)
+{
+  journaler->set_write_iohint(iohint_flags);
+}
+
 class C_MDL_WriteError : public MDSIOContextBase {
   protected:
   MDLog *mdlog;
-  MDS *get_mds() {return mdlog->mds;}
+  MDSRank *get_mds() {return mdlog->mds;}
 
   void finish(int r) {
-    MDS *mds = get_mds();
+    MDSRank *mds = get_mds();
     // assume journal is reliable, so don't choose action based on
     // g_conf->mds_action_on_write_error.
     if (r == -EBLACKLISTED) {
@@ -104,7 +112,7 @@ void MDLog::write_head(MDSInternalContextBase *c)
 {
   C_OnFinisher *fin = NULL;
   if (c != NULL) {
-    fin = new C_OnFinisher(new C_IO_Wrapper(mds, c), &(mds->finisher));
+    fin = new C_OnFinisher(new C_IO_Wrapper(mds, c), mds->finisher);
   }
   journaler->write_head(fin);
 }
@@ -133,7 +141,7 @@ void MDLog::create(MDSInternalContextBase *c)
   C_GatherBuilder gather(g_ceph_context);
   // This requires an OnFinisher wrapper because Journaler will call back the completion for write_head inside its own lock
   // XXX but should maybe that be handled inside Journaler?
-  gather.set_finisher(new C_OnFinisher(new C_IO_Wrapper(mds, c), &(mds->finisher)));
+  gather.set_finisher(new C_OnFinisher(new C_IO_Wrapper(mds, c), mds->finisher));
 
   // The inode of the default Journaler we will create
   ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
@@ -143,7 +151,7 @@ void MDLog::create(MDSInternalContextBase *c)
   journaler = new Journaler(ino, mds->mdsmap->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, mds->objecter,
 			    logger, l_mdl_jlat,
 			    &mds->timer,
-                            &mds->finisher);
+                            mds->finisher);
   assert(journaler->is_readonly());
   journaler->set_write_error_handler(new C_MDL_WriteError(this));
   journaler->set_writeable();
@@ -170,7 +178,6 @@ void MDLog::open(MDSInternalContextBase *c)
 
   recovery_thread.set_completion(c);
   recovery_thread.create();
-  recovery_thread.detach();
 
   submit_thread.create();
   // either append() or replay() will follow.
@@ -210,7 +217,6 @@ void MDLog::reopen(MDSInternalContextBase *c)
 
   recovery_thread.set_completion(new C_ReopenComplete(this, c));
   recovery_thread.create();
-  recovery_thread.detach();
 }
 
 void MDLog::append()
@@ -234,7 +240,6 @@ void MDLog::_start_entry(LogEvent *e)
 
   assert(cur_event == NULL);
   cur_event = e;
-  e->set_start_off(get_write_pos());
 
   event_seq++;
 
@@ -312,13 +317,39 @@ void MDLog::_submit_entry(LogEvent *le, MDSInternalContextBase *c)
   }
 }
 
+/**
+ * Invoked on the flush after each entry submitted
+ */
+class C_MDL_Flushed : public MDSIOContextBase {
+  protected:
+  MDLog *mdlog;
+  MDSRank *get_mds() {return mdlog->mds;}
+  uint64_t flushed_to;
+  MDSInternalContextBase *wrapped;
+
+  void finish(int r) {
+    if (wrapped) {
+      wrapped->complete(r);
+    }
+
+    mdlog->submit_mutex.Lock();
+    assert(mdlog->safe_pos <= flushed_to);
+    mdlog->safe_pos = flushed_to;
+    mdlog->submit_mutex.Unlock();
+  }
+
+  public:
+  C_MDL_Flushed(MDLog *m, uint64_t ft, MDSInternalContextBase *w)
+    : mdlog(m), flushed_to(ft), wrapped(w) {}
+};
+
 void MDLog::_submit_thread()
 {
   dout(10) << "_submit_thread start" << dendl;
 
   submit_mutex.Lock();
 
-  while (!stopping) {
+  while (!mds->is_daemon_stopping()) {
     map<uint64_t,list<PendingEvent> >::iterator it = pending_events.begin();
     if (it == pending_events.end()) {
       submit_cond.Wait(submit_mutex);
@@ -352,11 +383,12 @@ void MDLog::_submit_thread()
 	      << " : " << *le << dendl;
 
       // journal it.
-      journaler->append_entry(bl);  // bl is destroyed.
-      ls->end = journaler->get_write_pos();
+      const uint64_t new_write_pos = journaler->append_entry(bl);  // bl is destroyed.
+      ls->end = new_write_pos;
+
+      journaler->wait_for_flush(new C_MDL_Flushed(
+            this, new_write_pos, data.fin));
 
-      if (data.fin)
-	journaler->wait_for_flush(new C_IO_Wrapper(mds, data.fin));
       if (data.flush)
 	journaler->flush();
 
@@ -365,8 +397,8 @@ void MDLog::_submit_thread()
 
       delete le;
     } else {
-      if (data.fin)
-	journaler->wait_for_flush(new C_IO_Wrapper(mds, data.fin));
+      journaler->wait_for_flush(new C_MDL_Flushed(
+            this, journaler->get_write_pos(), data.fin));
       if (data.flush)
 	journaler->flush();
     }
@@ -430,19 +462,48 @@ void MDLog::cap()
 
 void MDLog::shutdown()
 {
+  assert(mds->mds_lock.is_locked_by_me());
+
   dout(5) << "shutdown" << dendl;
-  if (!submit_thread.is_started())
-    return;
+  if (submit_thread.is_started()) {
+    assert(mds->is_daemon_stopping());
 
-  assert(mds->mds_lock.is_locked_by_me());
-  mds->mds_lock.Unlock();
+    if (submit_thread.am_self()) {
+      // Called suicide from the thread: trust it to do no work after
+      // returning from suicide, and subsequently respect mds->is_daemon_stopping()
+      // and fall out of its loop.
+    } else {
+      mds->mds_lock.Unlock();
+      // Because MDS::stopping is true, it's safe to drop mds_lock: nobody else
+      // picking it up will do anything with it.
+   
+      submit_mutex.Lock();
+      submit_cond.Signal();
+      submit_mutex.Unlock();
 
-  submit_mutex.Lock();
-  stopping = true;
-  submit_cond.Signal();
-  submit_mutex.Unlock();
+      mds->mds_lock.Lock();
+
+      submit_thread.join();
+    }
+  }
+
+  // Replay thread can be stuck inside e.g. Journaler::wait_for_readable,
+  // so we need to shutdown the journaler first.
+  if (journaler) {
+    journaler->shutdown();
+  }
+
+  if (replay_thread.is_started() && !replay_thread.am_self()) {
+    mds->mds_lock.Unlock();
+    replay_thread.join();
+    mds->mds_lock.Lock();
+  }
 
-  mds->mds_lock.Lock();
+  if (recovery_thread.is_started() && !recovery_thread.am_self()) {
+    mds->mds_lock.Unlock();
+    recovery_thread.join();
+    mds->mds_lock.Lock();
+  }
 }
 
 
@@ -485,7 +546,7 @@ void MDLog::_journal_segment_subtree_map(MDSInternalContextBase *onsync)
 
 void MDLog::trim(int m)
 {
-  int max_segments = g_conf->mds_log_max_segments;
+  unsigned max_segments = g_conf->mds_log_max_segments;
   int max_events = g_conf->mds_log_max_events;
   if (m >= 0)
     max_events = m;
@@ -495,6 +556,11 @@ void MDLog::trim(int m)
     return;
   }
 
+  // Clamp max_events to not be smaller than events per segment
+  if (max_events > 0 && max_events <= g_conf->mds_log_events_per_segment) {
+    max_events = g_conf->mds_log_events_per_segment + 1;
+  }
+
   submit_mutex.Lock();
 
   // trim!
@@ -515,11 +581,10 @@ void MDLog::trim(int m)
   stop += 2.0;
 
   map<uint64_t,LogSegment*>::iterator p = segments.begin();
-  while (p != segments.end() && 
+  while (p != segments.end() &&
 	 ((max_events >= 0 &&
 	   num_events - expiring_events - expired_events > max_events) ||
-	  (max_segments >= 0 &&
-	   segments.size() - expiring_segments.size() - expired_segments.size() > (unsigned)max_segments))) {
+	  (segments.size() - expiring_segments.size() - expired_segments.size() > max_segments))) {
     
     if (stop < ceph_clock_now(g_ceph_context))
       break;
@@ -538,7 +603,7 @@ void MDLog::trim(int m)
     ++p;
     
     if (pending_events.count(ls->seq) ||
-	ls->end > journaler->get_write_safe_pos()) {
+	ls->end > safe_pos) {
       dout(5) << "trim segment " << ls->seq << "/" << ls->offset << ", not fully flushed yet, safe "
 	      << journaler->get_write_safe_pos() << " < end " << ls->end << dendl;
       break;
@@ -582,7 +647,7 @@ class C_MaybeExpiredSegment : public MDSInternalContext {
 };
 
 /**
- * Like ::trim, but instead of trimming to max_segments, trim all but the latest
+ * Like MDLog::trim, but instead of trimming to max_segments, trim all but the latest
  * segment.
  */
 int MDLog::trim_all()
@@ -594,7 +659,6 @@ int MDLog::trim_all()
            << "/" << expiring_segments.size()
            << "/" << expired_segments.size() << dendl;
 
-  uint64_t safe_pos = journaler->get_write_safe_pos();
   uint64_t last_seq = 0;
   if (!segments.empty())
     last_seq = get_last_segment_seq();
@@ -779,7 +843,6 @@ void MDLog::replay(MDSInternalContextBase *c)
   already_replayed = true;
 
   replay_thread.create();
-  replay_thread.detach();
 }
 
 
@@ -816,9 +879,10 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
     // Nothing graceful we can do for this
     assert(write_result >= 0);
   } else if (read_result != 0) {
-    // No graceful way of handling this: give up and leave it for support
-    // to work out why RADOS preventing access.
-    assert(0);
+    mds->clog->error() << "failed to read JournalPointer: " << read_result
+                       << " (" << cpp_strerror(read_result) << ")";
+    mds->damaged_unlocked();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
   }
 
   // If the back pointer is non-null, that means that a journal
@@ -834,16 +898,21 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
     dout(1) << "Erasing journal " << jp.back << dendl;
     C_SaferCond erase_waiter;
     Journaler back(jp.back, mds->mdsmap->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC,
-        mds->objecter, logger, l_mdl_jlat, &mds->timer, &mds->finisher);
+        mds->objecter, logger, l_mdl_jlat, &mds->timer, mds->finisher);
 
     // Read all about this journal (header + extents)
     C_SaferCond recover_wait;
     back.recover(&recover_wait);
     int recovery_result = recover_wait.wait();
+    if (recovery_result != 0) {
+      // Journaler.recover succeeds if no journal objects are present: an error
+      // means something worse like a corrupt header, which we can't handle here.
+      mds->clog->error() << "Error recovering journal " << jp.front << ": "
+        << cpp_strerror(recovery_result);
+      mds->damaged_unlocked();
+      assert(recovery_result == 0); // Unreachable because damaged() calls respawn()
+    }
 
-    // Journaler.recover succeeds if no journal objects are present: an error
-    // means something worse like a corrupt header, which we can't handle here.
-    assert(recovery_result == 0);
     // We could read journal, so we can erase it.
     back.erase(&erase_waiter);
     int erase_result = erase_waiter.wait();
@@ -863,7 +932,15 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
 
   /* Read the header from the front journal */
   Journaler *front_journal = new Journaler(jp.front, mds->mdsmap->get_metadata_pool(),
-      CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, &mds->timer, &mds->finisher);
+      CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, &mds->timer, mds->finisher);
+
+  // Assign to ::journaler so that we can be aborted by ::shutdown while
+  // waiting for journaler recovery
+  {
+    Mutex::Locker l(mds->mds_lock);
+    journaler = front_journal;
+  }
+
   C_SaferCond recover_wait;
   front_journal->recover(&recover_wait);
   dout(4) << "Waiting for journal " << jp.front << " to recover..." << dendl;
@@ -871,36 +948,43 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
   dout(4) << "Journal " << jp.front << " recovered." << dendl;
 
   if (recovery_result != 0) {
-    derr << "Error recovering journal " << jp.front << ": " << cpp_strerror(recovery_result) << dendl;
-    mds->mds_lock.Lock();
-    completion->complete(recovery_result);
-    mds->mds_lock.Unlock();
-    return;
+    mds->clog->error() << "Error recovering journal " << jp.front << ": "
+      << cpp_strerror(recovery_result);
+    mds->damaged_unlocked();
+    assert(recovery_result == 0); // Unreachable because damaged() calls respawn()
   }
 
   /* Check whether the front journal format is acceptable or needs re-write */
   if (front_journal->get_stream_format() > JOURNAL_FORMAT_MAX) {
     dout(0) << "Journal " << jp.front << " is in unknown format " << front_journal->get_stream_format()
             << ", does this MDS daemon require upgrade?" << dendl;
-    mds->mds_lock.Lock();
-    completion->complete(-EINVAL);
-    mds->mds_lock.Unlock();
+    {
+      Mutex::Locker l(mds->mds_lock);
+      if (mds->is_daemon_stopping()) {
+        journaler = NULL;
+        delete front_journal;
+        return;
+      }
+      completion->complete(-EINVAL);
+    }
   } else if (mds->is_standby_replay() || front_journal->get_stream_format() >= g_conf->mds_journal_format) {
     /* The journal is of configured format, or we are in standbyreplay and will
      * tolerate replaying old journals until we have to go active. Use front_journal as
      * our journaler attribute and complete */
     dout(4) << "Recovered journal " << jp.front << " in format " << front_journal->get_stream_format() << dendl;
-    journaler = front_journal;
     journaler->set_write_error_handler(new C_MDL_WriteError(this));
-    mds->mds_lock.Lock();
-    completion->complete(0);
-    mds->mds_lock.Unlock();
+    {
+      Mutex::Locker l(mds->mds_lock);
+      if (mds->is_daemon_stopping()) {
+        return;
+      }
+      completion->complete(0);
+    }
   } else {
     /* Hand off to reformat routine, which will ultimately set the
      * completion when it has done its thing */
     dout(1) << "Journal " << jp.front << " has old format "
       << front_journal->get_stream_format() << ", it will now be updated" << dendl;
-
     _reformat_journal(jp, front_journal, completion);
   }
 }
@@ -930,7 +1014,7 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa
 
   /* Create the new Journaler file */
   Journaler *new_journal = new Journaler(jp.back, mds->mdsmap->get_metadata_pool(),
-      CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, &mds->timer, &mds->finisher);
+      CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, &mds->timer, mds->finisher);
   dout(4) << "Writing new journal header " << jp.back << dendl;
   ceph_file_layout new_layout = old_journal->get_layout();
   new_journal->set_writeable();
@@ -1016,9 +1100,11 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa
 
       // Zero-out expire_pos in subtreemap because offsets have changed
       // (expire_pos is just an optimization so it's safe to eliminate it)
-      if (le->get_type() == EVENT_SUBTREEMAP) {
-        dout(20) << __func__ << " zeroing expire_pos in subtreemap event at " << le_pos << dendl;
+      if (le->get_type() == EVENT_SUBTREEMAP
+          || le->get_type() == EVENT_SUBTREEMAP_TEST) {
         ESubtreeMap *sle = dynamic_cast<ESubtreeMap*>(le);
+        dout(20) << __func__ << " zeroing expire_pos in subtreemap event at "
+          << le_pos << " seq=" << sle->event_seq << dendl;
         assert(sle != NULL);
         sle->expire_pos = 0;
         modified = true;
@@ -1065,7 +1151,16 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa
   old_journal->erase(&erase_waiter);
   int erase_result = erase_waiter.wait();
   assert(erase_result == 0);
-  delete old_journal;
+  {
+    Mutex::Locker l(mds->mds_lock);
+    if (mds->is_daemon_stopping()) {
+      delete new_journal;
+      return;
+    }
+    assert(journaler == old_journal);
+    journaler = NULL;
+    delete old_journal;
+  }
 
   /* Update the pointer to reflect we're back in clean single journal state. */
   jp.back = 0;
@@ -1074,14 +1169,25 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa
 
   /* Reset the Journaler object to its default state */
   dout(1) << "Journal rewrite complete, continuing with normal startup" << dendl;
-  journaler = new_journal;
-  journaler->set_readonly();
-  journaler->set_write_error_handler(new C_MDL_WriteError(this));
+  {
+    Mutex::Locker l(mds->mds_lock);
+    if (mds->is_daemon_stopping()) {
+      delete new_journal;
+      return;
+    }
+    journaler = new_journal;
+    journaler->set_readonly();
+    journaler->set_write_error_handler(new C_MDL_WriteError(this));
+  }
 
   /* Trigger completion */
-  mds->mds_lock.Lock();
-  completion->complete(0);
-  mds->mds_lock.Unlock();
+  {
+    Mutex::Locker l(mds->mds_lock);
+    if (mds->is_daemon_stopping()) {
+      return;
+    }
+    completion->complete(0);
+  }
 }
 
 
@@ -1099,21 +1205,31 @@ void MDLog::_replay_thread()
 	   !journaler->get_error()) {
       C_SaferCond readable_waiter;
       journaler->wait_for_readable(&readable_waiter);
-      readable_waiter.wait();
+      r = readable_waiter.wait();
     }
     if (journaler->get_error()) {
       r = journaler->get_error();
       dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
       if (r == -ENOENT) {
-	// journal has been trimmed by somebody else?
-	assert(journaler->is_readonly());
-	r = -EAGAIN;
+        if (mds->is_standby_replay()) {
+          // journal has been trimmed by somebody else
+          r = -EAGAIN;
+        } else {
+          mds->clog->error() << "missing journal object";
+          mds->damaged_unlocked();
+          assert(0);  // Should be unreachable because damaged() calls respawn()
+        }
       } else if (r == -EINVAL) {
         if (journaler->get_read_pos() < journaler->get_expire_pos()) {
           // this should only happen if you're following somebody else
-          assert(journaler->is_readonly());
-          dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl;
-          r = -EAGAIN;
+          if(journaler->is_readonly()) {
+            dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl;
+            r = -EAGAIN;
+          } else {
+            mds->clog->error() << "invalid journaler offsets";
+            mds->damaged_unlocked();
+            assert(0);  // Should be unreachable because damaged() calls respawn()
+          }
         } else {
           /* re-read head and check it
            * Given that replay happens in a separate thread and
@@ -1132,7 +1248,11 @@ void MDLog::_replay_thread()
             } else {
                 dout(0) << "got error while reading head: " << cpp_strerror(err)
                         << dendl;
-                mds->suicide();
+
+                mds->clog->error() << "error reading journal header";
+                mds->damaged_unlocked();
+                assert(0);  // Should be unreachable because damaged() calls
+                            // respawn()
             }
           }
 	  standby_trim_segments();
@@ -1144,12 +1264,12 @@ void MDLog::_replay_thread()
       }
       break;
     }
-    
+
     if (!journaler->is_readable() &&
 	journaler->get_read_pos() == journaler->get_write_pos())
       break;
     
-    assert(journaler->is_readable());
+    assert(journaler->is_readable() || mds->is_daemon_stopping());
     
     // read it
     uint64_t pos = journaler->get_read_pos();
@@ -1168,8 +1288,17 @@ void MDLog::_replay_thread()
       bl.hexdump(*_dout);
       *_dout << dendl;
 
-      assert(!!"corrupt log event" == g_conf->mds_log_skip_corrupt_events);
-      continue;
+      mds->clog->error() << "corrupt journal event at " << pos << "~"
+                         << bl.length() << " / "
+                         << journaler->get_write_pos();
+      if (g_conf->mds_log_skip_corrupt_events) {
+        continue;
+      } else {
+        mds->damaged_unlocked();
+        assert(0);  // Should be unreachable because damaged() calls
+                    // respawn()
+      }
+
     }
     le->set_start_off(pos);
 
@@ -1199,9 +1328,13 @@ void MDLog::_replay_thread()
       le->_segment->end = journaler->get_read_pos();
       num_events++;
 
-      mds->mds_lock.Lock();
-      le->replay(mds);
-      mds->mds_lock.Unlock();
+      {
+        Mutex::Locker l(mds->mds_lock);
+        if (mds->is_daemon_stopping()) {
+          return;
+        }
+        le->replay(mds);
+      }
     }
     delete le;
 
@@ -1217,10 +1350,16 @@ void MDLog::_replay_thread()
     logger->set(l_mdl_expos, journaler->get_expire_pos());
   }
 
+  safe_pos = journaler->get_write_safe_pos();
+
   dout(10) << "_replay_thread kicking waiters" << dendl;
-  mds->mds_lock.Lock();
-  finish_contexts(g_ceph_context, waitfor_replay, r);  
-  mds->mds_lock.Unlock();
+  {
+    Mutex::Locker l(mds->mds_lock);
+    if (mds->is_daemon_stopping()) {
+      return;
+    }
+    finish_contexts(g_ceph_context, waitfor_replay, r);  
+  }
 
   dout(10) << "_replay_thread finish" << dendl;
 }
diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h
index 7b21ff2..c8f9e70 100644
--- a/src/mds/MDLog.h
+++ b/src/mds/MDLog.h
@@ -50,7 +50,7 @@ enum {
 class Journaler;
 class JournalPointer;
 class LogEvent;
-class MDS;
+class MDSRank;
 class LogSegment;
 class ESubtreeMap;
 
@@ -64,7 +64,7 @@ using std::map;
 
 class MDLog {
 public:
-  MDS *mds;
+  MDSRank *mds;
 protected:
   int num_events; // in events
 
@@ -72,7 +72,10 @@ protected:
 
   bool capped;
 
-  bool stopping;
+  // Log position which is persistent *and* for which
+  // submit_entry wait_for_safe callbacks have already
+  // been called.
+  uint64_t safe_pos;
 
   inodeno_t ino;
   Journaler *journaler;
@@ -176,23 +179,23 @@ public:
   // replay state
   map<inodeno_t, set<inodeno_t> >   pending_exports;
 
-
+  void set_write_iohint(unsigned iohint_flags);
 
 public:
-  MDLog(MDS *m) : mds(m),
-		  num_events(0), 
-		  unflushed(0),
-		  capped(false),
-		  stopping(false),
-		  journaler(0),
-		  logger(0),
-		  replay_thread(this),
-		  already_replayed(false),
-		  recovery_thread(this),
-		  event_seq(0), expiring_events(0), expired_events(0),
-		  submit_mutex("MDLog::submit_mutex"),
-		  submit_thread(this),
-		  cur_event(NULL) { }		  
+  MDLog(MDSRank *m) : mds(m),
+                      num_events(0), 
+                      unflushed(0),
+                      capped(false),
+                      safe_pos(0),
+                      journaler(0),
+                      logger(0),
+                      replay_thread(this),
+                      already_replayed(false),
+                      recovery_thread(this),
+                      event_seq(0), expiring_events(0), expired_events(0),
+                      submit_mutex("MDLog::submit_mutex"),
+                      submit_thread(this),
+                      cur_event(NULL) { }		  
   ~MDLog();
 
 
@@ -290,6 +293,7 @@ private:
   void _trim_expired_segments();
 
   friend class C_MaybeExpiredSegment;
+  friend class C_MDL_Flushed;
 
 public:
   void trim_expired_segments();
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
deleted file mode 100644
index 69d6be0..0000000
--- a/src/mds/MDS.cc
+++ /dev/null
@@ -1,3066 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
- * Foundation.  See file COPYING.
- * 
- */
-
-#include <unistd.h>
-
-#include "global/signal_handler.h"
-
-#include "include/types.h"
-#include "include/str_list.h"
-#include "common/entity_name.h"
-#include "common/Clock.h"
-#include "common/signal.h"
-#include "common/ceph_argparse.h"
-#include "common/errno.h"
-
-#include "msg/Messenger.h"
-#include "mon/MonClient.h"
-
-#include "osdc/Objecter.h"
-#include "osdc/Filer.h"
-#include "osdc/Journaler.h"
-
-#include "MDSMap.h"
-
-#include "MDS.h"
-#include "Server.h"
-#include "Locker.h"
-#include "MDCache.h"
-#include "MDLog.h"
-#include "MDBalancer.h"
-#include "Migrator.h"
-
-#include "SnapServer.h"
-#include "SnapClient.h"
-
-#include "InoTable.h"
-
-#include "common/HeartbeatMap.h"
-
-#include "common/perf_counters.h"
-
-#include "common/Timer.h"
-
-#include "events/ESession.h"
-#include "events/ESubtreeMap.h"
-
-#include "messages/MMDSMap.h"
-#include "messages/MMDSBeacon.h"
-
-#include "messages/MGenericMessage.h"
-
-#include "messages/MClientRequest.h"
-#include "messages/MClientRequestForward.h"
-
-#include "messages/MMDSTableRequest.h"
-
-#include "messages/MMonCommand.h"
-#include "messages/MCommand.h"
-#include "messages/MCommandReply.h"
-
-#include "auth/AuthAuthorizeHandler.h"
-#include "auth/KeyRing.h"
-
-#include "common/config.h"
-
-#include "perfglue/cpu_profiler.h"
-#include "perfglue/heap_profiler.h"
-
-
-#define dout_subsys ceph_subsys_mds
-#undef dout_prefix
-#define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
-
-
-// cons/des
-MDS::MDS(const std::string &n, Messenger *m, MonClient *mc) : 
-  Dispatcher(m->cct),
-  mds_lock("MDS::mds_lock"),
-  timer(m->cct, mds_lock),
-  hb(NULL),
-  beacon(m->cct, mc, n),
-  authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(m->cct,
-								      m->cct->_conf->auth_supported.empty() ?
-								      m->cct->_conf->auth_cluster_required :
-								      m->cct->_conf->auth_supported)),
-  authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(m->cct,
-								      m->cct->_conf->auth_supported.empty() ?
-								      m->cct->_conf->auth_service_required :
-								      m->cct->_conf->auth_supported)),
-  name(n),
-  whoami(MDS_RANK_NONE), incarnation(0),
-  standby_for_rank(MDSMap::MDS_NO_STANDBY_PREF),
-  standby_type(MDSMap::STATE_NULL),
-  standby_replaying(false),
-  messenger(m),
-  monc(mc),
-  log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
-  op_tracker(cct, m->cct->_conf->mds_enable_op_tracker, 
-                     m->cct->_conf->osd_num_op_tracker_shard),
-  finisher(cct),
-  osd_epoch_barrier(0),
-  sessionmap(this),
-  progress_thread(this),
-  asok_hook(NULL)
-{
-
-  hb = cct->get_heartbeat_map()->add_worker("MDS");
-
-  orig_argc = 0;
-  orig_argv = NULL;
-
-  last_tid = 0;
-
-  clog = log_client.create_channel();
-
-  monc->set_messenger(messenger);
-
-  mdsmap = new MDSMap;
-
-  objecter = new Objecter(m->cct, messenger, monc, NULL, 0, 0);
-  objecter->unset_honor_osdmap_full();
-
-  filer = new Filer(objecter, &finisher);
-
-  mdcache = new MDCache(this);
-  mdlog = new MDLog(this);
-  balancer = new MDBalancer(this);
-
-  inotable = new InoTable(this);
-  snapserver = new SnapServer(this);
-  snapclient = new SnapClient(this);
-
-  server = new Server(this);
-  locker = new Locker(this, mdcache);
-
-  dispatch_depth = 0;
-
-  // clients
-  last_client_mdsmap_bcast = 0;
-  
-  // tick
-  tick_event = 0;
-
-  req_rate = 0;
-
-  last_state = want_state = state = MDSMap::STATE_BOOT;
-
-  logger = 0;
-  mlogger = 0;
-  op_tracker.set_complaint_and_threshold(m->cct->_conf->mds_op_complaint_time,
-                                         m->cct->_conf->mds_op_log_threshold);
-  op_tracker.set_history_size_and_duration(m->cct->_conf->mds_op_history_size,
-                                           m->cct->_conf->mds_op_history_duration);
-}
-
-MDS::~MDS() {
-  Mutex::Locker lock(mds_lock);
-
-  delete authorize_handler_service_registry;
-  delete authorize_handler_cluster_registry;
-
-  if (mdcache) { delete mdcache; mdcache = NULL; }
-  if (mdlog) { delete mdlog; mdlog = NULL; }
-  if (balancer) { delete balancer; balancer = NULL; }
-  if (inotable) { delete inotable; inotable = NULL; }
-  if (snapserver) { delete snapserver; snapserver = NULL; }
-  if (snapclient) { delete snapclient; snapclient = NULL; }
-  if (mdsmap) { delete mdsmap; mdsmap = 0; }
-
-  if (server) { delete server; server = 0; }
-  if (locker) { delete locker; locker = 0; }
-
-  if (filer) { delete filer; filer = 0; }
-  if (objecter) { delete objecter; objecter = 0; }
-
-  if (logger) {
-    g_ceph_context->get_perfcounters_collection()->remove(logger);
-    delete logger;
-    logger = 0;
-  }
-  if (mlogger) {
-    g_ceph_context->get_perfcounters_collection()->remove(mlogger);
-    delete mlogger;
-    mlogger = 0;
-  }
-  
-  if (messenger)
-    delete messenger;
-
-  if (hb) {
-    cct->get_heartbeat_map()->remove_worker(hb);
-  }
-}
-
-class MDSSocketHook : public AdminSocketHook {
-  MDS *mds;
-public:
-  MDSSocketHook(MDS *m) : mds(m) {}
-  bool call(std::string command, cmdmap_t& cmdmap, std::string format,
-	    bufferlist& out) {
-    stringstream ss;
-    bool r = mds->asok_command(command, cmdmap, format, ss);
-    out.append(ss);
-    return r;
-  }
-};
-
-bool MDS::asok_command(string command, cmdmap_t& cmdmap, string format,
-		    ostream& ss)
-{
-  dout(1) << "asok_command: " << command << " (starting...)" << dendl;
-
-  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
-  if (command == "status") {
-
-    const OSDMap *osdmap = objecter->get_osdmap_read();
-    const epoch_t osd_epoch = osdmap->get_epoch();
-    objecter->put_osdmap_read();
-
-    f->open_object_section("status");
-    f->dump_stream("cluster_fsid") << monc->get_fsid();
-    f->dump_unsigned("whoami", whoami);
-    f->dump_string("state", ceph_mds_state_name(get_state()));
-    f->dump_unsigned("mdsmap_epoch", mdsmap->get_epoch());
-    f->dump_unsigned("osdmap_epoch", osd_epoch);
-    f->dump_unsigned("osdmap_epoch_barrier", get_osd_epoch_barrier());
-    f->close_section(); // status
-  } else {
-    if (whoami < 0) {
-      dout(1) << "Can't run that command on an inactive MDS!" << dendl;
-      f->dump_string("error", "mds_not_active");
-    } else if (command == "dump_ops_in_flight" ||
-	       command == "ops") {
-      op_tracker.dump_ops_in_flight(f);
-    } else if (command == "dump_historic_ops") {
-      op_tracker.dump_historic_ops(f);
-    } else if (command == "osdmap barrier") {
-      int64_t target_epoch = 0;
-      bool got_val = cmd_getval(g_ceph_context, cmdmap, "target_epoch", target_epoch);
-      
-      if (!got_val) {
-	ss << "no target epoch given";
-	delete f;
-	return true;
-      }
-      
-      mds_lock.Lock();
-      set_osd_epoch_barrier(target_epoch);
-      mds_lock.Unlock();
-      
-      C_SaferCond cond;
-      bool already_got = objecter->wait_for_map(target_epoch, &cond);
-      if (!already_got) {
-	dout(4) << __func__ << ": waiting for OSD epoch " << target_epoch << dendl;
-	cond.wait();
-      }
-    } else if (command == "session ls") {
-      mds_lock.Lock();
-      
-      heartbeat_reset();
-      
-      // Dump sessions, decorated with recovery/replay status
-      f->open_array_section("sessions");
-      const ceph::unordered_map<entity_name_t, Session*> session_map = sessionmap.get_sessions();
-      for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
-	   p != session_map.end();
-	   ++p)  {
-	if (!p->first.is_client()) {
-	  continue;
-	}
-	
-	Session *s = p->second;
-	
-	f->open_object_section("session");
-	f->dump_int("id", p->first.num());
-	
-	f->dump_int("num_leases", s->leases.size());
-	f->dump_int("num_caps", s->caps.size());
-	
-	f->dump_string("state", s->get_state_name());
-	f->dump_int("replay_requests", is_clientreplay() ? s->get_request_count() : 0);
-	f->dump_bool("reconnecting", server->waiting_for_reconnect(p->first.num()));
-	f->dump_stream("inst") << s->info.inst;
-	f->open_object_section("client_metadata");
-	for (map<string, string>::const_iterator i = s->info.client_metadata.begin();
-	     i != s->info.client_metadata.end(); ++i) {
-	  f->dump_string(i->first.c_str(), i->second);
-	}
-	f->close_section(); // client_metadata
-	f->close_section(); //session
-      }
-      f->close_section(); //sessions
-      
-      mds_lock.Unlock();
-    } else if (command == "session evict") {
-      std::string client_id;
-      const bool got_arg = cmd_getval(g_ceph_context, cmdmap, "client_id", client_id);
-      assert(got_arg == true);
-      
-      mds_lock.Lock();
-      Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
-							      strtol(client_id.c_str(), 0, 10)));
-      if (session) {
-	C_SaferCond on_safe;
-	server->kill_session(session, &on_safe);
-	
-	mds_lock.Unlock();
-	on_safe.wait();
-      } else {
-	dout(15) << "session " << session << " not in sessionmap!" << dendl;
-	mds_lock.Unlock();
-      }
-    } else if (command == "scrub_path") {
-      string path;
-      cmd_getval(g_ceph_context, cmdmap, "path", path);
-      command_scrub_path(f, path);
-    } else if (command == "flush_path") {
-      string path;
-      cmd_getval(g_ceph_context, cmdmap, "path", path);
-      command_flush_path(f, path);
-    } else if (command == "flush journal") {
-      command_flush_journal(f);
-    } else if (command == "get subtrees") {
-      command_get_subtrees(f);
-    } else if (command == "export dir") {
-      string path;
-      if(!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
-	ss << "malformed path";
-        delete f;
-        return true;
-      }
-      int64_t rank;
-      if(!cmd_getval(g_ceph_context, cmdmap, "rank", rank)) {
-	ss << "malformed rank";
-        delete f;
-        return true;
-      }
-      command_export_dir(f, path, (mds_rank_t)rank);
-    } else if (command == "force_readonly") {
-      mds_lock.Lock();
-      mdcache->force_readonly();
-      mds_lock.Unlock();
-    }
-  }
-  f->flush(ss);
-  delete f;
-  
-  dout(1) << "asok_command: " << command << " (complete)" << dendl;
-  
-  return true;
-}
-
-void MDS::command_scrub_path(Formatter *f, const string& path)
-{
-  C_SaferCond scond;
-  {
-    Mutex::Locker l(mds_lock);
-    mdcache->scrub_dentry(path, f, &scond);
-  }
-  scond.wait();
-  // scrub_dentry() finishers will dump the data for us; we're done!
-}
-
-void MDS::command_flush_path(Formatter *f, const string& path)
-{
-  C_SaferCond scond;
-  {
-    Mutex::Locker l(mds_lock);
-    mdcache->flush_dentry(path, &scond);
-  }
-  int r = scond.wait();
-  f->open_object_section("results");
-  f->dump_int("return_code", r);
-  f->close_section(); // results
-}
-
-/**
- * Wrapper around _command_flush_journal that
- * handles serialization of result
- */
-void MDS::command_flush_journal(Formatter *f)
-{
-  assert(f != NULL);
-
-  std::stringstream ss;
-  const int r = _command_flush_journal(&ss);
-  f->open_object_section("result");
-  f->dump_string("message", ss.str());
-  f->dump_int("return_code", r);
-  f->close_section();
-}
-
-/**
- * Implementation of "flush journal" asok command.
- *
- * @param ss
- * Optionally populate with a human readable string describing the
- * reason for any unexpected return status.
- */
-int MDS::_command_flush_journal(std::stringstream *ss)
-{
-  assert(ss != NULL);
-
-  Mutex::Locker l(mds_lock);
-
-  if (mdcache->is_readonly()) {
-    dout(5) << __func__ << ": read-only FS" << dendl;
-    return -EROFS;
-  }
-
-  // I need to seal off the current segment, and then mark all previous segments
-  // for expiry
-  mdlog->start_new_segment();
-  int r = 0;
-
-  // Flush initially so that all the segments older than our new one
-  // will be elegible for expiry
-  C_SaferCond mdlog_flushed;
-  mdlog->flush();
-  mdlog->wait_for_safe(new MDSInternalContextWrapper(this, &mdlog_flushed));
-  mds_lock.Unlock();
-  r = mdlog_flushed.wait();
-  mds_lock.Lock();
-  if (r != 0) {
-    *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
-    return r;
-  }
-
-  // Put all the old log segments into expiring or expired state
-  dout(5) << __func__ << ": beginning segment expiry" << dendl;
-  r = mdlog->trim_all();
-  if (r != 0) {
-    *ss << "Error " << r << " (" << cpp_strerror(r) << ") while trimming log";
-    return r;
-  }
-
-  // Attach contexts to wait for all expiring segments to expire
-  MDSGatherBuilder expiry_gather(g_ceph_context);
-
-  const std::set<LogSegment*> &expiring_segments = mdlog->get_expiring_segments();
-  for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
-       i != expiring_segments.end(); ++i) {
-    (*i)->wait_for_expiry(expiry_gather.new_sub());
-  }
-  dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created()
-          << " segments to expire" << dendl;
-
-  if (expiry_gather.has_subs()) {
-    C_SaferCond cond;
-    expiry_gather.set_finisher(new MDSInternalContextWrapper(this, &cond));
-    expiry_gather.activate();
-
-    // Drop mds_lock to allow progress until expiry is complete
-    mds_lock.Unlock();
-    int r = cond.wait();
-    mds_lock.Lock();
-
-    assert(r == 0);  // MDLog is not allowed to raise errors via wait_for_expiry
-  }
-
-  dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now " << std::hex <<
-    mdlog->get_journaler()->get_expire_pos() << "/" <<
-    mdlog->get_journaler()->get_trimmed_pos() << dendl;
-
-  // Now everyone I'm interested in is expired
-  mdlog->trim_expired_segments();
-
-  dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now " << std::hex <<
-    mdlog->get_journaler()->get_expire_pos() << "/" <<
-    mdlog->get_journaler()->get_trimmed_pos() << dendl;
-
-  // Flush the journal header so that readers will start from after the flushed region
-  C_SaferCond wrote_head;
-  mdlog->get_journaler()->write_head(&wrote_head);
-  mds_lock.Unlock();  // Drop lock to allow messenger dispatch progress
-  r = wrote_head.wait();
-  mds_lock.Lock();
-  if (r != 0) {
-      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
-      return r;
-  }
-
-  dout(5) << __func__ << ": write_head complete, all done!" << dendl;
-
-  return 0;
-}
-
-
-void MDS::command_get_subtrees(Formatter *f)
-{
-  assert(f != NULL);
-
-  std::list<CDir*> subtrees;
-  mdcache->list_subtrees(subtrees);
-
-  f->open_array_section("subtrees");
-  for (std::list<CDir*>::iterator i = subtrees.begin(); i != subtrees.end(); ++i) {
-    const CDir *dir = *i;
-
-    f->open_object_section("subtree");
-    {
-      f->dump_bool("is_auth", dir->is_auth());
-      f->dump_int("auth_first", dir->get_dir_auth().first);
-      f->dump_int("auth_second", dir->get_dir_auth().second);
-      f->open_object_section("dir");
-      dir->dump(f);
-      f->close_section();
-    }
-    f->close_section();
-  }
-  f->close_section();
-}
-
-
-void MDS::command_export_dir(Formatter *f,
-    const std::string &path,
-    mds_rank_t target)
-{
-  int r = _command_export_dir(path, target);
-  f->open_object_section("results");
-  f->dump_int("return_code", r);
-  f->close_section(); // results
-}
-
-int MDS::_command_export_dir(
-    const std::string &path,
-    mds_rank_t target)
-{
-  filepath fp(path.c_str());
-
-  if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
-    derr << "bad MDS target " << target << dendl;
-    return -ENOENT;
-  }
-
-  CInode *in = mdcache->cache_traverse(fp);
-  if (!in) {
-    derr << "Bath path '" << path << "'" << dendl;
-    return -ENOENT;
-  }
-  CDir *dir = in->get_dirfrag(frag_t());
-  if (!dir || !(dir->is_auth())) {
-    derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
-    return -EINVAL;
-  }
-
-  mdcache->migrator->export_dir(dir, target);
-  return 0;
-}
-
-
-void MDS::set_up_admin_socket()
-{
-  int r;
-  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
-  asok_hook = new MDSSocketHook(this);
-  r = admin_socket->register_command("status", "status", asok_hook,
-				     "high-level status of MDS");
-  assert(0 == r);
-  r = admin_socket->register_command("dump_ops_in_flight",
-				     "dump_ops_in_flight", asok_hook,
-				     "show the ops currently in flight");
-  assert(0 == r);
-  r = admin_socket->register_command("ops",
-				     "ops", asok_hook,
-				     "show the ops currently in flight");
-  assert(0 == r);
-  r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
-				     asok_hook,
-				     "show slowest recent ops");
-  r = admin_socket->register_command("scrub_path",
-                                     "scrub_path name=path,type=CephString",
-                                     asok_hook,
-                                     "scrub an inode and output results");
-  r = admin_socket->register_command("flush_path",
-                                     "flush_path name=path,type=CephString",
-                                     asok_hook,
-                                     "flush an inode (and its dirfrags)");
-  r = admin_socket->register_command("export dir",
-                                     "export dir "
-                                     "name=path,type=CephString "
-                                     "name=rank,type=CephInt",
-                                     asok_hook,
-                                     "migrate a subtree to named MDS");
-  assert(0 == r);
-  r = admin_socket->register_command("session evict",
-				     "session evict name=client_id,type=CephString",
-				     asok_hook,
-				     "Evict a CephFS client");
-  assert(0 == r);
-  r = admin_socket->register_command("osdmap barrier",
-				     "osdmap barrier name=target_epoch,type=CephInt",
-				     asok_hook,
-				     "Wait until the MDS has this OSD map epoch");
-  assert(0 == r);
-  r = admin_socket->register_command("session ls",
-				     "session ls",
-				     asok_hook,
-				     "Enumerate connected CephFS clients");
-  assert(0 == r);
-  r = admin_socket->register_command("flush journal",
-				     "flush journal",
-				     asok_hook,
-				     "Flush the journal to the backing store");
-  assert(0 == r);
-  r = admin_socket->register_command("force_readonly",
-				     "force_readonly",
-				     asok_hook,
-				     "Force MDS to read-only mode");
-  assert(0 == r);
-  r = admin_socket->register_command("get subtrees",
-				     "get subtrees",
-				     asok_hook,
-				     "Return the subtree map");
-  assert(0 == r);
-}
-
-void MDS::clean_up_admin_socket()
-{
-  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
-  admin_socket->unregister_command("status");
-  admin_socket->unregister_command("dump_ops_in_flight");
-  admin_socket->unregister_command("ops");
-  admin_socket->unregister_command("dump_historic_ops");
-  admin_socket->unregister_command("scrub_path");
-  admin_socket->unregister_command("flush_path");
-  admin_socket->unregister_command("session evict");
-  admin_socket->unregister_command("session ls");
-  admin_socket->unregister_command("flush journal");
-  admin_socket->unregister_command("force_readonly");
-  delete asok_hook;
-  asok_hook = NULL;
-}
-
-const char** MDS::get_tracked_conf_keys() const
-{
-  static const char* KEYS[] = {
-    "mds_op_complaint_time", "mds_op_log_threshold",
-    "mds_op_history_size", "mds_op_history_duration",
-    // clog & admin clog
-    "clog_to_monitors",
-    "clog_to_syslog",
-    "clog_to_syslog_facility",
-    "clog_to_syslog_level",
-    NULL
-  };
-  return KEYS;
-}
-
-void MDS::handle_conf_change(const struct md_config_t *conf,
-			     const std::set <std::string> &changed)
-{
-  if (changed.count("mds_op_complaint_time") ||
-      changed.count("mds_op_log_threshold")) {
-    op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time,
-                                           conf->mds_op_log_threshold);
-  }
-  if (changed.count("mds_op_history_size") ||
-      changed.count("mds_op_history_duration")) {
-    op_tracker.set_history_size_and_duration(conf->mds_op_history_size,
-                                             conf->mds_op_history_duration);
-  }
-  if (changed.count("clog_to_monitors") ||
-      changed.count("clog_to_syslog") ||
-      changed.count("clog_to_syslog_level") ||
-      changed.count("clog_to_syslog_facility")) {
-    update_log_config();
-  }
-}
-
-void MDS::update_log_config()
-{
-  map<string,string> log_to_monitors;
-  map<string,string> log_to_syslog;
-  map<string,string> log_channel;
-  map<string,string> log_prio;
-  if (parse_log_client_options(g_ceph_context, log_to_monitors, log_to_syslog,
-			       log_channel, log_prio) == 0)
-    clog->update_config(log_to_monitors, log_to_syslog,
-			log_channel, log_prio);
-  derr << "log_to_monitors " << log_to_monitors << dendl;
-}
-
-void MDS::create_logger()
-{
-  dout(10) << "create_logger" << dendl;
-  {
-    PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
-
-    mds_plb.add_u64_counter(l_mds_request, "request");
-    mds_plb.add_u64_counter(l_mds_reply, "reply");
-    mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency");
-    mds_plb.add_u64_counter(l_mds_forward, "forward");
-    
-    mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch");
-    mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit");
-    mds_plb.add_u64_counter(l_mds_dir_split, "dir_split");
-
-    mds_plb.add_u64(l_mds_inode_max, "inode_max");
-    mds_plb.add_u64(l_mds_inodes, "inodes");
-    mds_plb.add_u64(l_mds_inodes_top, "inodes_top");
-    mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom");
-    mds_plb.add_u64(l_mds_inodes_pin_tail, "inodes_pin_tail");  
-    mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned");
-    mds_plb.add_u64_counter(l_mds_inodes_expired, "inodes_expired");
-    mds_plb.add_u64_counter(l_mds_inodes_with_caps, "inodes_with_caps");
-    mds_plb.add_u64_counter(l_mds_caps, "caps");
-    mds_plb.add_u64(l_mds_subtrees, "subtrees");
-    
-    mds_plb.add_u64_counter(l_mds_traverse, "traverse"); 
-    mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit");
-    mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward");
-    mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover");
-    mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch");
-    mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino");
-    mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock");
-    
-    mds_plb.add_u64(l_mds_load_cent, "load_cent");
-    mds_plb.add_u64(l_mds_dispatch_queue_len, "q");
-    
-    mds_plb.add_u64_counter(l_mds_exported, "exported");
-    mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes");
-    mds_plb.add_u64_counter(l_mds_imported, "imported");
-    mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes");
-    logger = mds_plb.create_perf_counters();
-    g_ceph_context->get_perfcounters_collection()->add(logger);
-  }
-
-  {
-    PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last);
-    mdm_plb.add_u64(l_mdm_ino, "ino");
-    mdm_plb.add_u64_counter(l_mdm_inoa, "ino+");
-    mdm_plb.add_u64_counter(l_mdm_inos, "ino-");
-    mdm_plb.add_u64(l_mdm_dir, "dir");
-    mdm_plb.add_u64_counter(l_mdm_dira, "dir+");
-    mdm_plb.add_u64_counter(l_mdm_dirs, "dir-");
-    mdm_plb.add_u64(l_mdm_dn, "dn");
-    mdm_plb.add_u64_counter(l_mdm_dna, "dn+");
-    mdm_plb.add_u64_counter(l_mdm_dns, "dn-");
-    mdm_plb.add_u64(l_mdm_cap, "cap");
-    mdm_plb.add_u64_counter(l_mdm_capa, "cap+");
-    mdm_plb.add_u64_counter(l_mdm_caps, "cap-");
-    mdm_plb.add_u64(l_mdm_rss, "rss");
-    mdm_plb.add_u64(l_mdm_heap, "heap");
-    mdm_plb.add_u64(l_mdm_malloc, "malloc");
-    mdm_plb.add_u64(l_mdm_buf, "buf");
-    mlogger = mdm_plb.create_perf_counters();
-    g_ceph_context->get_perfcounters_collection()->add(mlogger);
-  }
-
-  mdlog->create_logger();
-  server->create_logger();
-  mdcache->register_perfcounters();
-}
-
-
-
-MDSTableClient *MDS::get_table_client(int t)
-{
-  switch (t) {
-  case TABLE_ANCHOR: return NULL;
-  case TABLE_SNAP: return snapclient;
-  default: assert(0);
-  }
-}
-
-MDSTableServer *MDS::get_table_server(int t)
-{
-  switch (t) {
-  case TABLE_ANCHOR: return NULL;
-  case TABLE_SNAP: return snapserver;
-  default: assert(0);
-  }
-}
-
-
-
-
-
-
-
-
-void MDS::send_message(Message *m, Connection *c)
-{ 
-  assert(c);
-  c->send_message(m);
-}
-
-
-void MDS::send_message_mds(Message *m, mds_rank_t mds)
-{
-  if (!mdsmap->is_up(mds)) {
-    dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl;
-    m->put();
-    return;
-  }
-
-  // send mdsmap first?
-  if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
-    messenger->send_message(new MMDSMap(monc->get_fsid(), mdsmap), 
-			    mdsmap->get_inst(mds));
-    peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
-  }
-
-  // send message
-  messenger->send_message(m, mdsmap->get_inst(mds));
-}
-
-void MDS::forward_message_mds(Message *m, mds_rank_t mds)
-{
-  assert(mds != whoami);
-
-  // client request?
-  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
-      (static_cast<MClientRequest*>(m))->get_source().is_client()) {
-    MClientRequest *creq = static_cast<MClientRequest*>(m);
-    creq->inc_num_fwd();    // inc forward counter
-
-    /*
-     * don't actually forward if non-idempotent!
-     * client has to do it.  although the MDS will ignore duplicate requests,
-     * the affected metadata may migrate, in which case the new authority
-     * won't have the metareq_id in the completed request map.
-     */
-    // NEW: always make the client resend!  
-    bool client_must_resend = true;  //!creq->can_forward();
-
-    // tell the client where it should go
-    messenger->send_message(new MClientRequestForward(creq->get_tid(), mds, creq->get_num_fwd(),
-						      client_must_resend),
-			    creq->get_source_inst());
-    
-    if (client_must_resend) {
-      m->put();
-      return; 
-    }
-  }
-
-  // these are the only types of messages we should be 'forwarding'; they
-  // explicitly encode their source mds, which gets clobbered when we resend
-  // them here.
-  assert(m->get_type() == MSG_MDS_DIRUPDATE ||
-	 m->get_type() == MSG_MDS_EXPORTDIRDISCOVER);
-
-  // send mdsmap first?
-  if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
-    messenger->send_message(new MMDSMap(monc->get_fsid(), mdsmap), 
-			    mdsmap->get_inst(mds));
-    peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
-  }
-
-  messenger->send_message(m, mdsmap->get_inst(mds));
-}
-
-
-
-void MDS::send_message_client_counted(Message *m, client_t client)
-{
-  Session *session =  sessionmap.get_session(entity_name_t::CLIENT(client.v));
-  if (session) {
-    send_message_client_counted(m, session);
-  } else {
-    dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
-  }
-}
-
-void MDS::send_message_client_counted(Message *m, Connection *connection)
-{
-  Session *session = static_cast<Session *>(connection->get_priv());
-  if (session) {
-    session->put();  // do not carry ref
-    send_message_client_counted(m, session);
-  } else {
-    dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl;
-    // another Connection took over the Session
-  }
-}
-
-void MDS::send_message_client_counted(Message *m, Session *session)
-{
-  version_t seq = session->inc_push_seq();
-  dout(10) << "send_message_client_counted " << session->info.inst.name << " seq "
-	   << seq << " " << *m << dendl;
-  if (session->connection) {
-    session->connection->send_message(m);
-  } else {
-    session->preopen_out_queue.push_back(m);
-  }
-}
-
-void MDS::send_message_client(Message *m, Session *session)
-{
-  dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl;
-  if (session->connection) {
-    session->connection->send_message(m);
-  } else {
-    session->preopen_out_queue.push_back(m);
-  }
-}
-
-int MDS::init(MDSMap::DaemonState wanted_state)
-{
-  dout(10) << sizeof(MDSCacheObject) << "\tMDSCacheObject" << dendl;
-  dout(10) << sizeof(CInode) << "\tCInode" << dendl;
-  dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item   *7=" << 7*sizeof(elist<void*>::item) << dendl;
-  dout(10) << sizeof(inode_t) << "\t inode_t " << dendl;
-  dout(10) << sizeof(nest_info_t) << "\t  nest_info_t " << dendl;
-  dout(10) << sizeof(frag_info_t) << "\t  frag_info_t " << dendl;
-  dout(10) << sizeof(SimpleLock) << "\t SimpleLock   *5=" << 5*sizeof(SimpleLock) << dendl;
-  dout(10) << sizeof(ScatterLock) << "\t ScatterLock  *3=" << 3*sizeof(ScatterLock) << dendl;
-  dout(10) << sizeof(CDentry) << "\tCDentry" << dendl;
-  dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item" << dendl;
-  dout(10) << sizeof(SimpleLock) << "\t SimpleLock" << dendl;
-  dout(10) << sizeof(CDir) << "\tCDir " << dendl;
-  dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item   *2=" << 2*sizeof(elist<void*>::item) << dendl;
-  dout(10) << sizeof(fnode_t) << "\t fnode_t " << dendl;
-  dout(10) << sizeof(nest_info_t) << "\t  nest_info_t *2" << dendl;
-  dout(10) << sizeof(frag_info_t) << "\t  frag_info_t *2" << dendl;
-  dout(10) << sizeof(Capability) << "\tCapability " << dendl;
-  dout(10) << sizeof(xlist<void*>::item) << "\t xlist<>::item   *2=" << 2*sizeof(xlist<void*>::item) << dendl;
-
-  objecter->init();
-
-  messenger->add_dispatcher_tail(objecter);
-  messenger->add_dispatcher_tail(&beacon);
-  messenger->add_dispatcher_tail(this);
-
-  // get monmap
-  monc->set_messenger(messenger);
-
-  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS);
-  monc->init();
-
-  finisher.start();
-
-  // tell monc about log_client so it will know about mon session resets
-  monc->set_log_client(&log_client);
-  update_log_config();
-  
-  int r = monc->authenticate();
-  if (r < 0) {
-    derr << "ERROR: failed to authenticate: " << cpp_strerror(-r) << dendl;
-    mds_lock.Lock();
-    suicide();
-    mds_lock.Unlock();
-    return r;
-  }
-  while (monc->wait_auth_rotating(30.0) < 0) {
-    derr << "unable to obtain rotating service keys; retrying" << dendl;
-  }
-  objecter->start();
-
-  mds_lock.Lock();
-  if (want_state == CEPH_MDS_STATE_DNE) {
-    mds_lock.Unlock();
-    return 0;
-  }
-
-  monc->sub_want("mdsmap", 0, 0);
-  monc->renew_subs();
-
-  mds_lock.Unlock();
-
-  // verify that osds support tmap2omap
-  while (true) {
-    objecter->maybe_request_map();
-    objecter->wait_for_osd_map();
-    const OSDMap *osdmap = objecter->get_osdmap_read();
-    uint64_t osd_features = osdmap->get_up_osd_features();
-    if (osd_features & CEPH_FEATURE_OSD_TMAP2OMAP) {
-      objecter->put_osdmap_read();
-      break;
-    }
-    if (osdmap->get_num_up_osds() > 0) {
-        derr << "*** one or more OSDs do not support TMAP2OMAP; upgrade OSDs before starting MDS (or downgrade MDS) ***" << dendl;
-    } else {
-        derr << "*** no OSDs are up as of epoch " << osdmap->get_epoch() << ", waiting" << dendl;
-    }
-    objecter->put_osdmap_read();
-    sleep(10);
-  }
-
-  mds_lock.Lock();
-  if (want_state == MDSMap::STATE_DNE) {
-    suicide();  // we could do something more graceful here
-  }
-
-  timer.init();
-
-  if (wanted_state==MDSMap::STATE_BOOT && g_conf->mds_standby_replay) {
-    wanted_state = MDSMap::STATE_STANDBY_REPLAY;
-  }
-
-  // starting beacon.  this will induce an MDSMap from the monitor
-  want_state = wanted_state;
-  if (wanted_state==MDSMap::STATE_STANDBY_REPLAY ||
-      wanted_state==MDSMap::STATE_ONESHOT_REPLAY) {
-    g_conf->set_val_or_die("mds_standby_replay", "true");
-    g_conf->apply_changes(NULL);
-    if ( wanted_state == MDSMap::STATE_ONESHOT_REPLAY &&
-        (g_conf->mds_standby_for_rank == -1) &&
-        g_conf->mds_standby_for_name.empty()) {
-      // uh-oh, must specify one or the other!
-      dout(0) << "Specified oneshot replay mode but not an MDS!" << dendl;
-      suicide();
-    }
-    want_state = MDSMap::STATE_BOOT;
-    standby_type = wanted_state;
-  }
-
-  standby_for_rank = mds_rank_t(g_conf->mds_standby_for_rank);
-  standby_for_name.assign(g_conf->mds_standby_for_name);
-
-  if (wanted_state == MDSMap::STATE_STANDBY_REPLAY &&
-      standby_for_rank == -1) {
-    if (standby_for_name.empty())
-      standby_for_rank = MDSMap::MDS_STANDBY_ANY;
-    else
-      standby_for_rank = MDSMap::MDS_STANDBY_NAME;
-  } else if (standby_type == MDSMap::STATE_NULL && !standby_for_name.empty())
-    standby_for_rank = MDSMap::MDS_MATCHED_ACTIVE;
-
-  beacon.init(mdsmap, want_state, standby_for_rank, standby_for_name);
-  whoami = -1;
-  messenger->set_myname(entity_name_t::MDS(whoami));
-  
-  // schedule tick
-  reset_tick();
-
-  // Start handler for finished_queue
-  progress_thread.create();
-
-  create_logger();
-  set_up_admin_socket();
-  g_conf->add_observer(this);
-
-  mds_lock.Unlock();
-
-  return 0;
-}
-
-void MDS::reset_tick()
-{
-  // cancel old
-  if (tick_event) timer.cancel_event(tick_event);
-
-  // schedule
-  tick_event = new C_MDS_Tick(this);
-  timer.add_event_after(g_conf->mds_tick_interval, tick_event);
-}
-
-void MDS::tick()
-{
-  heartbeat_reset();
-
-  tick_event = 0;
-
-  // reschedule
-  reset_tick();
-
-  if (beacon.is_laggy()) {
-    dout(5) << "tick bailing out since we seem laggy" << dendl;
-    return;
-  } else {
-    // Wake up thread in case we use to be laggy and have waiting_for_nolaggy
-    // messages to progress.
-    progress_thread.signal();
-  }
-
-  // make sure mds log flushes, trims periodically
-  mdlog->flush();
-
-  if (is_active() || is_stopping()) {
-    mdcache->trim();
-    mdcache->trim_client_leases();
-    mdcache->check_memory_usage();
-    mdlog->trim();  // NOT during recovery!
-  }
-
-  // log
-  utime_t now = ceph_clock_now(g_ceph_context);
-  mds_load_t load = balancer->get_load(now);
-  
-  if (logger) {
-    req_rate = logger->get(l_mds_request);
-    
-    logger->set(l_mds_load_cent, 100 * load.mds_load());
-    logger->set(l_mds_dispatch_queue_len, messenger->get_dispatch_queue_len());
-    logger->set(l_mds_subtrees, mdcache->num_subtrees());
-
-    mdcache->log_stat();
-  }
-
-  // ...
-  if (is_clientreplay() || is_active() || is_stopping()) {
-    locker->tick();
-    server->find_idle_sessions();
-  }
-  
-  if (is_reconnect())
-    server->reconnect_tick();
-  
-  if (is_active()) {
-    balancer->tick();
-    mdcache->find_stale_fragment_freeze();
-    mdcache->migrator->find_stale_export_freeze();
-    if (snapserver)
-      snapserver->check_osd_map(false);
-  }
-
-  // Expose ourselves to Beacon to update health indicators
-  beacon.notify_health(this);
-
-  check_ops_in_flight();
-}
-
-void MDS::check_ops_in_flight()
-{
-  vector<string> warnings;
-  if (op_tracker.check_ops_in_flight(warnings)) {
-    for (vector<string>::iterator i = warnings.begin();
-        i != warnings.end();
-        ++i) {
-      clog->warn() << *i;
-    }
-  }
-  return;
-}
-
-/* This function DOES put the passed message before returning*/
-void MDS::handle_command(MCommand *m)
-{
-  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
-  assert(session != NULL);
-
-  int r = 0;
-  cmdmap_t cmdmap;
-  std::stringstream ss;
-  std::string outs;
-  bufferlist outbl;
-  Context *run_after = NULL;
-
-
-  if (!session->auth_caps.allow_all()) {
-    dout(1) << __func__
-      << ": received command from client without `tell` capability: "
-      << m->get_connection()->peer_addr << dendl;
-
-    ss << "permission denied";
-    r = -EPERM;
-  } else if (m->cmd.empty()) {
-    ss << "no command given";
-    outs = ss.str();
-  } else if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
-    r = -EINVAL;
-    outs = ss.str();
-  } else {
-    r = _handle_command(cmdmap, m->get_data(), &outbl, &outs, &run_after);
-  }
-
-  MCommandReply *reply = new MCommandReply(r, outs);
-  reply->set_tid(m->get_tid());
-  reply->set_data(outbl);
-  m->get_connection()->send_message(reply);
-
-  if (run_after) {
-    run_after->complete(0);
-  }
-
-  m->put();
-}
-
-
-struct MDSCommand {
-  string cmdstring;
-  string helpstring;
-  string module;
-  string perm;
-  string availability;
-} mds_commands[] = {
-
-#define COMMAND(parsesig, helptext, module, perm, availability) \
-  {parsesig, helptext, module, perm, availability},
-
-COMMAND("injectargs " \
-	"name=injected_args,type=CephString,n=N",
-	"inject configuration arguments into running MDS",
-	"mds", "*", "cli,rest")
-COMMAND("exit",
-	"Terminate this MDS",
-	"mds", "*", "cli,rest")
-COMMAND("respawn",
-	"Restart this MDS",
-	"mds", "*", "cli,rest")
-COMMAND("session kill " \
-        "name=session_id,type=CephInt",
-	"End a client session",
-	"mds", "*", "cli,rest")
-COMMAND("cpu_profiler " \
-	"name=arg,type=CephChoices,strings=status|flush",
-	"run cpu profiling on daemon", "mds", "rw", "cli,rest")
-COMMAND("heap " \
-	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
-	"show heap usage info (available only if compiled with tcmalloc)", \
-	"mds", "*", "cli,rest")
-};
-
-// FIXME: reinstate dumpcache as an admin socket command
-//  -- it makes no sense for it to be a remote command when
-//     the output is a local file
-// FIXME: reinstate issue_caps, try_eval, fragment_dir, merge_dir
-//  *if* it makes sense to do so (or should these be admin socket things?)
-
-/* This function DOES put the passed message before returning*/
-void MDS::handle_command(MMonCommand *m)
-{
-  bufferlist outbl;
-  _handle_command_legacy(m->cmd);
-  m->put();
-}
-
-int MDS::_handle_command(
-    const cmdmap_t &cmdmap,
-    bufferlist const &inbl,
-    bufferlist *outbl,
-    std::string *outs,
-    Context **run_later)
-{
-  assert(outbl != NULL);
-  assert(outs != NULL);
-
-  class SuicideLater : public MDSInternalContext
-  {
-    public:
-
-    SuicideLater(MDS *mds) : MDSInternalContext(mds) {}
-    void finish(int r) {
-      // Wait a little to improve chances of caller getting
-      // our response before seeing us disappear from mdsmap
-      sleep(1);
-
-      mds->suicide();
-    }
-  };
-
-
-  class RespawnLater : public MDSInternalContext
-  {
-    public:
-
-    RespawnLater(MDS *mds) : MDSInternalContext(mds) {}
-    void finish(int r) {
-      // Wait a little to improve chances of caller getting
-      // our response before seeing us disappear from mdsmap
-      sleep(1);
-
-      mds->respawn();
-    }
-  };
-
-  std::stringstream ds;
-  std::stringstream ss;
-  std::string prefix;
-  cmd_getval(cct, cmdmap, "prefix", prefix);
-
-  int r = 0;
-
-  if (prefix == "get_command_descriptions") {
-    int cmdnum = 0;
-    JSONFormatter *f = new JSONFormatter();
-    f->open_object_section("command_descriptions");
-    for (MDSCommand *cp = mds_commands;
-	 cp < &mds_commands[ARRAY_SIZE(mds_commands)]; cp++) {
-
-      ostringstream secname;
-      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
-      dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
-			   cp->module, cp->perm, cp->availability);
-      cmdnum++;
-    }
-    f->close_section();	// command_descriptions
-
-    f->flush(ds);
-    delete f;
-  } else if (prefix == "injectargs") {
-    vector<string> argsvec;
-    cmd_getval(cct, cmdmap, "injected_args", argsvec);
-
-    if (argsvec.empty()) {
-      r = -EINVAL;
-      ss << "ignoring empty injectargs";
-      goto out;
-    }
-    string args = argsvec.front();
-    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
-      args += " " + *a;
-    cct->_conf->injectargs(args, &ss);
-  } else if (prefix == "exit") {
-    // We will send response before executing
-    ss << "Exiting...";
-    *run_later = new SuicideLater(this);
-  }
-  else if (prefix == "respawn") {
-    // We will send response before executing
-    ss << "Respawning...";
-    *run_later = new RespawnLater(this);
-  } else if (prefix == "session kill") {
-    // FIXME harmonize `session kill` with admin socket session evict
-    int64_t session_id = 0;
-    bool got = cmd_getval(cct, cmdmap, "session_id", session_id);
-    assert(got);
-    Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
-
-    if (session) {
-      server->kill_session(session, NULL);
-    } else {
-      r = -ENOENT;
-      ss << "session '" << session_id << "' not found";
-    }
-  } else if (prefix == "heap") {
-    if (!ceph_using_tcmalloc()) {
-      r = -EOPNOTSUPP;
-      ss << "could not issue heap profiler command -- not using tcmalloc!";
-    } else {
-      string heapcmd;
-      cmd_getval(cct, cmdmap, "heapcmd", heapcmd);
-      vector<string> heapcmd_vec;
-      get_str_vec(heapcmd, heapcmd_vec);
-      ceph_heap_profiler_handle_command(heapcmd_vec, ds);
-    }
-  } else if (prefix == "cpu_profiler") {
-    string arg;
-    cmd_getval(cct, cmdmap, "arg", arg);
-    vector<string> argvec;
-    get_str_vec(arg, argvec);
-    cpu_profiler_handle_command(argvec, ds);
-  } else {
-    std::ostringstream ss;
-    ss << "unrecognized command! " << prefix;
-    r = -EINVAL;
-  }
-
-out:
-  *outs = ss.str();
-  outbl->append(ds);
-  return r;
-}
-
-/**
- * Legacy "mds tell", takes a simple array of args
- */
-int MDS::_handle_command_legacy(std::vector<std::string> args)
-{
-  dout(10) << "handle_command args: " << args << dendl;
-  if (args[0] == "injectargs") {
-    if (args.size() < 2) {
-      derr << "Ignoring empty injectargs!" << dendl;
-    }
-    else {
-      std::ostringstream oss;
-      mds_lock.Unlock();
-      g_conf->injectargs(args[1], &oss);
-      mds_lock.Lock();
-      derr << "injectargs:" << dendl;
-      derr << oss.str() << dendl;
-    }
-  }
-  else if (args[0] == "dumpcache") {
-    if (args.size() > 1)
-      mdcache->dump_cache(args[1].c_str());
-    else
-      mdcache->dump_cache();
-  }
-  else if (args[0] == "exit") {
-    suicide();
-  }
-  else if (args[0] == "respawn") {
-    respawn();
-  }
-  else if (args[0] == "session" && args[1] == "kill") {
-    Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
-							    strtol(args[2].c_str(), 0, 10)));
-    if (session)
-      server->kill_session(session, NULL);
-    else
-      dout(15) << "session " << session << " not in sessionmap!" << dendl;
-  } else if (args[0] == "issue_caps") {
-    long inum = strtol(args[1].c_str(), 0, 10);
-    CInode *in = mdcache->get_inode(inodeno_t(inum));
-    if (in) {
-      bool r = locker->issue_caps(in);
-      dout(20) << "called issue_caps on inode "  << inum
-	       << " with result " << r << dendl;
-    } else dout(15) << "inode " << inum << " not in mdcache!" << dendl;
-  } else if (args[0] == "try_eval") {
-    long inum = strtol(args[1].c_str(), 0, 10);
-    int mask = strtol(args[2].c_str(), 0, 10);
-    CInode * ino = mdcache->get_inode(inodeno_t(inum));
-    if (ino) {
-      locker->try_eval(ino, mask);
-      dout(20) << "try_eval(" << inum << ", " << mask << ")" << dendl;
-    } else dout(15) << "inode " << inum << " not in mdcache!" << dendl;
-  } else if (args[0] == "fragment_dir") {
-    if (args.size() == 4) {
-      filepath fp(args[1].c_str());
-      CInode *in = mdcache->cache_traverse(fp);
-      if (in) {
-	frag_t fg;
-	if (fg.parse(args[2].c_str())) {
-	  CDir *dir = in->get_dirfrag(fg);
-	  if (dir) {
-	    if (dir->is_auth()) {
-	      int by = atoi(args[3].c_str());
-	      if (by)
-		mdcache->split_dir(dir, by);
-	      else
-		dout(0) << "need to split by >0 bits" << dendl;
-	    } else dout(0) << "dir " << dir->dirfrag() << " not auth" << dendl;
-	  } else dout(0) << "dir " << in->ino() << " " << fg << " dne" << dendl;
-	} else dout(0) << " frag " << args[2] << " does not parse" << dendl;
-      } else dout(0) << "path " << fp << " not found" << dendl;
-    } else dout(0) << "bad syntax" << dendl;
-  } else if (args[0] == "merge_dir") {
-    if (args.size() == 3) {
-      filepath fp(args[1].c_str());
-      CInode *in = mdcache->cache_traverse(fp);
-      if (in) {
-	frag_t fg;
-	if (fg.parse(args[2].c_str())) {
-	  mdcache->merge_dir(in, fg);
-	} else dout(0) << " frag " << args[2] << " does not parse" << dendl;
-      } else dout(0) << "path " << fp << " not found" << dendl;
-    } else dout(0) << "bad syntax" << dendl;
-  } else if (args[0] == "export_dir") {
-    if (args.size() == 3) {
-      filepath fp(args[1].c_str());
-      mds_rank_t target = mds_rank_t(atoi(args[2].c_str()));
-      if (target != whoami && mdsmap->is_up(target) && mdsmap->is_in(target)) {
-	CInode *in = mdcache->cache_traverse(fp);
-	if (in) {
-	  CDir *dir = in->get_dirfrag(frag_t());
-	  if (dir && dir->is_auth()) {
-	    mdcache->migrator->export_dir(dir, target);
-	  } else dout(0) << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
-	} else dout(0) << "bad export_dir path" << dendl;
-      } else dout(0) << "bad export_dir target syntax" << dendl;
-    } else dout(0) << "bad export_dir syntax" << dendl;
-  } 
-  else if (args[0] == "cpu_profiler") {
-    ostringstream ss;
-    cpu_profiler_handle_command(args, ss);
-    clog->info() << ss.str();
-  }
-  else if (args[0] == "heap") {
-    if (!ceph_using_tcmalloc())
-      clog->info() << "tcmalloc not enabled, can't use heap profiler commands\n";
-    else {
-      ostringstream ss;
-      vector<std::string> cmdargs;
-      cmdargs.insert(cmdargs.begin(), args.begin()+1, args.end());
-      ceph_heap_profiler_handle_command(cmdargs, ss);
-      clog->info() << ss.str();
-    }
-  } else {
-    dout(0) << "unrecognized command! " << args << dendl;
-  }
-
-  return 0;
-}
-
-/* This function deletes the passed message before returning. */
-void MDS::handle_mds_map(MMDSMap *m)
-{
-  version_t epoch = m->get_epoch();
-  dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << dendl;
-
-  // note source's map version
-  if (m->get_source().is_mds() && 
-      peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) {
-    dout(15) << " peer " << m->get_source()
-	     << " has mdsmap epoch >= " << epoch
-	     << dendl;
-    peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch;
-  }
-
-  // is it new?
-  if (epoch <= mdsmap->get_epoch()) {
-    dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() 
-	    << ", discarding" << dendl;
-    m->put();
-    return;
-  }
-
-  // keep old map, for a moment
-  MDSMap *oldmap = mdsmap;
-  int oldwhoami = whoami;
-  MDSMap::DaemonState oldstate = state;
-  entity_addr_t addr;
-
-  // decode and process
-  mdsmap = new MDSMap;
-  mdsmap->decode(m->get_encoded());
-
-  monc->sub_got("mdsmap", mdsmap->get_epoch());
-
-  // verify compatset
-  CompatSet mdsmap_compat(get_mdsmap_compat_set_all());
-  dout(10) << "     my compat " << mdsmap_compat << dendl;
-  dout(10) << " mdsmap compat " << mdsmap->compat << dendl;
-  if (!mdsmap_compat.writeable(mdsmap->compat)) {
-    dout(0) << "handle_mds_map mdsmap compatset " << mdsmap->compat
-	    << " not writeable with daemon features " << mdsmap_compat
-	    << ", killing myself" << dendl;
-    suicide();
-    goto out;
-  }
-
-  // see who i am
-  addr = messenger->get_myaddr();
-  whoami = mdsmap->get_rank_gid(mds_gid_t(monc->get_global_id()));
-  state = mdsmap->get_state_gid(mds_gid_t(monc->get_global_id()));
-  incarnation = mdsmap->get_inc_gid(mds_gid_t(monc->get_global_id()));
-  dout(10) << "map says i am " << addr << " mds." << whoami << "." << incarnation
-	   << " state " << ceph_mds_state_name(state) << dendl;
-
-  // mark down any failed peers
-  for (map<mds_gid_t,MDSMap::mds_info_t>::const_iterator p = oldmap->get_mds_info().begin();
-       p != oldmap->get_mds_info().end();
-       ++p) {
-    if (mdsmap->get_mds_info().count(p->first) == 0) {
-      dout(10) << " peer mds gid " << p->first << " removed from map" << dendl;
-      messenger->mark_down(p->second.addr);
-    }
-  }
-
-  if (state != oldstate)
-    last_state = oldstate;
-
-  if (state == MDSMap::STATE_STANDBY) {
-    state = MDSMap::STATE_STANDBY;
-    set_want_state(state);
-    dout(1) << "handle_mds_map standby" << dendl;
-
-    if (standby_type) // we want to be in standby_replay or oneshot_replay!
-      request_state(standby_type);
-
-    goto out;
-  } else if (state == MDSMap::STATE_STANDBY_REPLAY) {
-    if (standby_type != MDSMap::STATE_NULL && standby_type != MDSMap::STATE_STANDBY_REPLAY) {
-      set_want_state(standby_type);
-      beacon.send();
-      state = oldstate;
-      goto out;
-    }
-  }
-
-  if (whoami < 0) {
-    if (state == MDSMap::STATE_STANDBY_REPLAY ||
-        state == MDSMap::STATE_ONESHOT_REPLAY) {
-      // fill in whoami from standby-for-rank. If we let this be changed
-      // the logic used to set it here will need to be adjusted.
-      whoami = mdsmap->get_mds_info_gid(mds_gid_t(monc->get_global_id())).standby_for_rank;
-    } else {
-      if (want_state == MDSMap::STATE_STANDBY) {
-        dout(10) << "dropped out of mdsmap, try to re-add myself" << dendl;
-        state = MDSMap::STATE_BOOT;
-        set_want_state(state);
-        goto out;
-      }
-      if (want_state == MDSMap::STATE_BOOT) {
-        dout(10) << "not in map yet" << dendl;
-      } else {
-	// did i get kicked by someone else?
-	if (g_conf->mds_enforce_unique_name) {
-	  if (mds_gid_t existing = mdsmap->find_mds_gid_by_name(name)) {
-	    MDSMap::mds_info_t& i = mdsmap->get_info_gid(existing);
-	    if (i.global_id > monc->get_global_id()) {
-	      dout(1) << "handle_mds_map i (" << addr
-		      << ") dne in the mdsmap, new instance has larger gid " << i.global_id
-		      << ", suicide" << dendl;
-	      suicide();
-	      goto out;
-	    }
-	  }
-	}
-
-        dout(1) << "handle_mds_map i (" << addr
-            << ") dne in the mdsmap, respawning myself" << dendl;
-        respawn();
-      }
-      goto out;
-    }
-  }
-
-  // ??
-
-  if (oldwhoami != whoami || oldstate != state) {
-    // update messenger.
-    if (state == MDSMap::STATE_STANDBY_REPLAY || state == MDSMap::STATE_ONESHOT_REPLAY) {
-      dout(1) << "handle_mds_map i am now mds." << monc->get_global_id() << "." << incarnation
-	      << "replaying mds." << whoami << "." << incarnation << dendl;
-      messenger->set_myname(entity_name_t::MDS(monc->get_global_id()));
-    } else {
-      dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl;
-      messenger->set_myname(entity_name_t::MDS(whoami));
-    }
-  }
-
-  // tell objecter my incarnation
-  if (objecter->get_client_incarnation() != incarnation)
-    objecter->set_client_incarnation(incarnation);
-
-  // for debug
-  if (g_conf->mds_dump_cache_on_map)
-    mdcache->dump_cache();
-
-  // did it change?
-  if (oldstate != state) {
-    dout(1) << "handle_mds_map state change "
-	    << ceph_mds_state_name(oldstate) << " --> "
-	    << ceph_mds_state_name(state) << dendl;
-    set_want_state(state);
-
-    if (oldstate == MDSMap::STATE_STANDBY_REPLAY) {
-        dout(10) << "Monitor activated us! Deactivating replay loop" << dendl;
-        assert (state == MDSMap::STATE_REPLAY);
-    } else {
-      // did i just recover?
-      if ((is_active() || is_clientreplay()) &&
-          (oldstate == MDSMap::STATE_CREATING ||
-	   oldstate == MDSMap::STATE_REJOIN ||
-	   oldstate == MDSMap::STATE_RECONNECT))
-        recovery_done(oldstate);
-
-      if (is_active()) {
-        active_start();
-      } else if (is_any_replay()) {
-        replay_start();
-      } else if (is_resolve()) {
-        resolve_start();
-      } else if (is_reconnect()) {
-        reconnect_start();
-      } else if (is_rejoin()) {
-	rejoin_start();
-      } else if (is_clientreplay()) {
-        clientreplay_start();
-      } else if (is_creating()) {
-        boot_create();
-      } else if (is_starting()) {
-        boot_start();
-      } else if (is_stopping()) {
-        assert(oldstate == MDSMap::STATE_ACTIVE);
-        stopping_start();
-      }
-    }
-  }
-  
-  // RESOLVE
-  // is someone else newly resolving?
-  if (is_resolve() || is_reconnect() || is_rejoin() ||
-      is_clientreplay() || is_active() || is_stopping()) {
-    if (!oldmap->is_resolving() && mdsmap->is_resolving()) {
-      set<mds_rank_t> resolve;
-      mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
-      dout(10) << " resolve set is " << resolve << dendl;
-      calc_recovery_set();
-      mdcache->send_resolves();
-    }
-  }
-  
-  // REJOIN
-  // is everybody finally rejoining?
-  if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
-    // did we start?
-    if (!oldmap->is_rejoining() && mdsmap->is_rejoining())
-      rejoin_joint_start();
-
-    // did we finish?
-    if (g_conf->mds_dump_cache_after_rejoin &&
-	oldmap->is_rejoining() && !mdsmap->is_rejoining()) 
-      mdcache->dump_cache();      // for DEBUG only
-
-    if (oldstate >= MDSMap::STATE_REJOIN) {
-      // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
-      set<mds_rank_t> olddis, dis;
-      oldmap->get_mds_set(olddis, MDSMap::STATE_ACTIVE);
-      oldmap->get_mds_set(olddis, MDSMap::STATE_CLIENTREPLAY);
-      oldmap->get_mds_set(olddis, MDSMap::STATE_REJOIN);
-      mdsmap->get_mds_set(dis, MDSMap::STATE_ACTIVE);
-      mdsmap->get_mds_set(dis, MDSMap::STATE_CLIENTREPLAY);
-      mdsmap->get_mds_set(dis, MDSMap::STATE_REJOIN);
-      for (set<mds_rank_t>::iterator p = dis.begin(); p != dis.end(); ++p)
-	if (*p != whoami &&            // not me
-	    olddis.count(*p) == 0) {  // newly so?
-	  mdcache->kick_discovers(*p);
-	  mdcache->kick_open_ino_peers(*p);
-	}
-    }
-  }
-
-  if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE)
-    dout(1) << "cluster recovered." << dendl;
-
-  // did someone go active?
-  if (oldstate >= MDSMap::STATE_CLIENTREPLAY &&
-      (is_clientreplay() || is_active() || is_stopping())) {
-    set<mds_rank_t> oldactive, active;
-    oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
-    oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
-    mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
-    mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
-    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p) 
-      if (*p != whoami &&            // not me
-	  oldactive.count(*p) == 0)  // newly so?
-	handle_mds_recovery(*p);
-  }
-
-  // did someone fail?
-  if (true) {
-    // new failed?
-    set<mds_rank_t> oldfailed, failed;
-    oldmap->get_failed_mds_set(oldfailed);
-    mdsmap->get_failed_mds_set(failed);
-    for (set<mds_rank_t>::iterator p = failed.begin(); p != failed.end(); ++p)
-      if (oldfailed.count(*p) == 0) {
-	messenger->mark_down(oldmap->get_inst(*p).addr);
-	handle_mds_failure(*p);
-      }
-    
-    // or down then up?
-    //  did their addr/inst change?
-    set<mds_rank_t> up;
-    mdsmap->get_up_mds_set(up);
-    for (set<mds_rank_t>::iterator p = up.begin(); p != up.end(); ++p) 
-      if (oldmap->have_inst(*p) &&
-	  oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
-	messenger->mark_down(oldmap->get_inst(*p).addr);
-	handle_mds_failure(*p);
-      }
-  }
-  if (is_clientreplay() || is_active() || is_stopping()) {
-    // did anyone stop?
-    set<mds_rank_t> oldstopped, stopped;
-    oldmap->get_stopped_mds_set(oldstopped);
-    mdsmap->get_stopped_mds_set(stopped);
-    for (set<mds_rank_t>::iterator p = stopped.begin(); p != stopped.end(); ++p) 
-      if (oldstopped.count(*p) == 0)      // newly so?
-	mdcache->migrator->handle_mds_failure_or_stop(*p);
-  }
-
-  if (!is_any_replay())
-    balancer->try_rebalance();
-
-  {
-    map<epoch_t,list<MDSInternalContextBase*> >::iterator p = waiting_for_mdsmap.begin();
-    while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
-      list<MDSInternalContextBase*> ls;
-      ls.swap(p->second);
-      waiting_for_mdsmap.erase(p++);
-      finish_contexts(g_ceph_context, ls);
-    }
-  }
-
-  if (is_active()) {
-    // Before going active, set OSD epoch barrier to latest (so that
-    // we don't risk handing out caps to clients with old OSD maps that
-    // might not include barriers from the previous incarnation of this MDS)
-    const OSDMap *osdmap = objecter->get_osdmap_read();
-    const epoch_t osd_epoch = osdmap->get_epoch();
-    objecter->put_osdmap_read();
-    set_osd_epoch_barrier(osd_epoch);
-  }
-
- out:
-  beacon.notify_mdsmap(mdsmap);
-
-  m->put();
-  delete oldmap;
-}
-
-void MDS::bcast_mds_map()
-{
-  dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl;
-
-  // share the map with mounted clients
-  set<Session*> clients;
-  sessionmap.get_client_session_set(clients);
-  for (set<Session*>::const_iterator p = clients.begin();
-       p != clients.end();
-       ++p) 
-    (*p)->connection->send_message(new MMDSMap(monc->get_fsid(), mdsmap));
-  last_client_mdsmap_bcast = mdsmap->get_epoch();
-}
-
-
-void MDS::request_state(MDSMap::DaemonState s)
-{
-  dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
-  set_want_state(s);
-  beacon.send();
-}
-
-
-class C_MDS_CreateFinish : public MDSInternalContext {
-public:
-  C_MDS_CreateFinish(MDS *m) : MDSInternalContext(m) {}
-  void finish(int r) { mds->creating_done(); }
-};
-
-void MDS::boot_create()
-{
-  dout(3) << "boot_create" << dendl;
-
-  MDSGatherBuilder fin(g_ceph_context, new C_MDS_CreateFinish(this));
-
-  mdcache->init_layouts();
-
-  snapserver->set_rank(whoami);
-  inotable->set_rank(whoami);
-  sessionmap.set_rank(whoami);
-
-  // start with a fresh journal
-  dout(10) << "boot_create creating fresh journal" << dendl;
-  mdlog->create(fin.new_sub());
-
-  // open new journal segment, but do not journal subtree map (yet)
-  mdlog->prepare_new_segment();
-
-  if (whoami == mdsmap->get_root()) {
-    dout(3) << "boot_create creating fresh hierarchy" << dendl;
-    mdcache->create_empty_hierarchy(fin.get());
-  }
-
-  dout(3) << "boot_create creating mydir hierarchy" << dendl;
-  mdcache->create_mydir_hierarchy(fin.get());
-
-  // fixme: fake out inotable (reset, pretend loaded)
-  dout(10) << "boot_create creating fresh inotable table" << dendl;
-  inotable->reset();
-  inotable->save(fin.new_sub());
-
-  // write empty sessionmap
-  sessionmap.save(fin.new_sub());
-
-  // initialize tables
-  if (mdsmap->get_tableserver() == whoami) {
-    dout(10) << "boot_create creating fresh snaptable" << dendl;
-    snapserver->reset();
-    snapserver->save(fin.new_sub());
-  }
-
-  assert(g_conf->mds_kill_create_at != 1);
-
-  // ok now journal it
-  mdlog->journal_segment_subtree_map(fin.new_sub());
-  mdlog->flush();
-
-  fin.activate();
-}
-
-void MDS::creating_done()
-{
-  dout(1)<< "creating_done" << dendl;
-  request_state(MDSMap::STATE_ACTIVE);
-}
-
-
-class C_MDS_BootStart : public MDSInternalContext {
-  MDS::BootStep nextstep;
-public:
-  C_MDS_BootStart(MDS *m, MDS::BootStep n) : MDSInternalContext(m), nextstep(n) {}
-  void finish(int r) {
-    mds->boot_start(nextstep, r);
-  }
-};
-
-
-void MDS::boot_start(BootStep step, int r)
-{
-  // Handle errors from previous step
-  if (r < 0) {
-    if (is_standby_replay() && (r == -EAGAIN)) {
-      dout(0) << "boot_start encountered an error EAGAIN"
-              << ", respawning since we fell behind journal" << dendl;
-      respawn();
-    } else {
-      dout(0) << "boot_start encountered an error, failing" << dendl;
-      suicide();
-      return;
-    }
-  }
-
-  assert(is_starting() || is_any_replay());
-
-  switch(step) {
-    case MDS_BOOT_INITIAL:
-      {
-        mdcache->init_layouts();
-
-        MDSGatherBuilder gather(g_ceph_context,
-            new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT));
-        dout(2) << "boot_start " << step << ": opening inotable" << dendl;
-        inotable->set_rank(whoami);
-        inotable->load(gather.new_sub());
-
-        dout(2) << "boot_start " << step << ": opening sessionmap" << dendl;
-        sessionmap.set_rank(whoami);
-        sessionmap.load(gather.new_sub());
-
-        dout(2) << "boot_start " << step << ": opening mds log" << dendl;
-        mdlog->open(gather.new_sub());
-
-        if (mdsmap->get_tableserver() == whoami) {
-          dout(2) << "boot_start " << step << ": opening snap table" << dendl;
-          snapserver->set_rank(whoami);
-          snapserver->load(gather.new_sub());
-        }
-
-        gather.activate();
-      }
-      break;
-    case MDS_BOOT_OPEN_ROOT:
-      {
-        dout(2) << "boot_start " << step << ": loading/discovering base inodes" << dendl;
-
-        MDSGatherBuilder gather(g_ceph_context,
-            new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
-
-        mdcache->open_mydir_inode(gather.new_sub());
-
-        if (is_starting() ||
-            whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
-          mdcache->open_root_inode(gather.new_sub());
-        } else {
-          // replay.  make up fake root inode to start with
-          mdcache->create_root_inode();
-        }
-        gather.activate();
-      }
-      break;
-    case MDS_BOOT_PREPARE_LOG:
-      if (is_any_replay()) {
-        dout(2) << "boot_start " << step << ": replaying mds log" << dendl;
-        mdlog->replay(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
-      } else {
-        dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl;
-        mdlog->append();
-        starting_done();
-      }
-      break;
-    case MDS_BOOT_REPLAY_DONE:
-      assert(is_any_replay());
-      replay_done();
-      break;
-  }
-}
-
-void MDS::starting_done()
-{
-  dout(3) << "starting_done" << dendl;
-  assert(is_starting());
-  request_state(MDSMap::STATE_ACTIVE);
-
-  mdcache->open_root();
-
-  // start new segment
-  mdlog->start_new_segment();
-}
-
-
-void MDS::calc_recovery_set()
-{
-  // initialize gather sets
-  set<mds_rank_t> rs;
-  mdsmap->get_recovery_mds_set(rs);
-  rs.erase(whoami);
-  mdcache->set_recovery_set(rs);
-
-  dout(1) << " recovery set is " << rs << dendl;
-}
-
-
-void MDS::replay_start()
-{
-  dout(1) << "replay_start" << dendl;
-
-  if (is_standby_replay())
-    standby_replaying = true;
-  
-  standby_type = MDSMap::STATE_NULL;
-
-  calc_recovery_set();
-
-  // Check if we need to wait for a newer OSD map before starting
-  Context *fin = new C_OnFinisher(new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL)), &finisher);
-  bool const ready = objecter->wait_for_map(
-      mdsmap->get_last_failure_osd_epoch(),
-      fin);
-
-  if (ready) {
-    delete fin;
-    boot_start();
-  } else {
-    dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch() 
-	    << " (which blacklists prior instance)" << dendl;
-  }
-}
-
-
-class MDS::C_MDS_StandbyReplayRestartFinish : public MDSIOContext {
-  uint64_t old_read_pos;
-public:
-  C_MDS_StandbyReplayRestartFinish(MDS *mds_, uint64_t old_read_pos_) :
-    MDSIOContext(mds_), old_read_pos(old_read_pos_) {}
-  void finish(int r) {
-    mds->_standby_replay_restart_finish(r, old_read_pos);
-  }
-};
-
-void MDS::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
-{
-  if (old_read_pos < mdlog->get_journaler()->get_trimmed_pos()) {
-    dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl;
-    respawn(); /* we're too far back, and this is easier than
-		  trying to reset everything in the cache, etc */
-  } else {
-    mdlog->standby_trim_segments();
-    boot_start(MDS_BOOT_PREPARE_LOG, r);
-  }
-}
-
-inline void MDS::standby_replay_restart()
-{
-  dout(1) << "standby_replay_restart"
-	  << (standby_replaying ? " (as standby)":" (final takeover pass)")
-	  << dendl;
-  if (standby_replaying) {
-    /* Go around for another pass of replaying in standby */
-    mdlog->get_journaler()->reread_head_and_probe(
-      new C_MDS_StandbyReplayRestartFinish(
-        this,
-	mdlog->get_journaler()->get_read_pos()));
-  } else {
-    /* We are transitioning out of standby: wait for OSD map update
-       before making final pass */
-    Context *fin = new C_OnFinisher(new C_IO_Wrapper(this,
-          new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG)),
-      &finisher);
-    bool const ready =
-      objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
-    if (ready) {
-      delete fin;
-      mdlog->get_journaler()->reread_head_and_probe(
-        new C_MDS_StandbyReplayRestartFinish(
-          this,
-	  mdlog->get_journaler()->get_read_pos()));
-    } else {
-      dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch() 
-              << " (which blacklists prior instance)" << dendl;
-    }
-  }
-}
-
-class MDS::C_MDS_StandbyReplayRestart : public MDSInternalContext {
-public:
-  C_MDS_StandbyReplayRestart(MDS *m) : MDSInternalContext(m) {}
-  void finish(int r) {
-    assert(!r);
-    mds->standby_replay_restart();
-  }
-};
-
-void MDS::replay_done()
-{
-  dout(1) << "replay_done" << (standby_replaying ? " (as standby)" : "") << dendl;
-
-  if (is_oneshot_replay()) {
-    dout(2) << "hack.  journal looks ok.  shutting down." << dendl;
-    suicide();
-    return;
-  }
-
-  if (is_standby_replay()) {
-    // The replay was done in standby state, and we are still in that state
-    assert(standby_replaying);
-    dout(10) << "setting replay timer" << dendl;
-    timer.add_event_after(g_conf->mds_replay_interval,
-                          new C_MDS_StandbyReplayRestart(this));
-    return;
-  } else if (standby_replaying) {
-    // The replay was done in standby state, we have now _left_ that state
-    dout(10) << " last replay pass was as a standby; making final pass" << dendl;
-    standby_replaying = false;
-    standby_replay_restart();
-    return;
-  } else {
-    // Replay is complete, journal read should be up to date
-    assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos());
-    assert(!is_standby_replay());
-
-    // Reformat and come back here
-    if (mdlog->get_journaler()->get_stream_format() < g_conf->mds_journal_format) {
-        dout(4) << "reformatting journal on standbyreplay->replay transition" << dendl;
-        mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
-        return;
-    }
-  }
-
-  dout(1) << "making mds journal writeable" << dendl;
-  mdlog->get_journaler()->set_writeable();
-  mdlog->get_journaler()->trim_tail();
-
-  if (g_conf->mds_wipe_sessions) {
-    dout(1) << "wiping out client sessions" << dendl;
-    sessionmap.wipe();
-    sessionmap.save(new C_MDSInternalNoop);
-  }
-  if (g_conf->mds_wipe_ino_prealloc) {
-    dout(1) << "wiping out ino prealloc from sessions" << dendl;
-    sessionmap.wipe_ino_prealloc();
-    sessionmap.save(new C_MDSInternalNoop);
-  }
-  if (g_conf->mds_skip_ino) {
-    inodeno_t i = g_conf->mds_skip_ino;
-    dout(1) << "skipping " << i << " inodes" << dendl;
-    inotable->skip_inos(i);
-    inotable->save(new C_MDSInternalNoop);
-  }
-
-  if (mdsmap->get_num_in_mds() == 1 &&
-      mdsmap->get_num_failed_mds() == 0) { // just me!
-    dout(2) << "i am alone, moving to state reconnect" << dendl;      
-    request_state(MDSMap::STATE_RECONNECT);
-  } else {
-    dout(2) << "i am not alone, moving to state resolve" << dendl;
-    request_state(MDSMap::STATE_RESOLVE);
-  }
-}
-
-void MDS::reopen_log()
-{
-  dout(1) << "reopen_log" << dendl;
-  mdcache->rollback_uncommitted_fragments();
-}
-
-
-void MDS::resolve_start()
-{
-  dout(1) << "resolve_start" << dendl;
-
-  reopen_log();
-
-  mdcache->resolve_start();
-  finish_contexts(g_ceph_context, waiting_for_resolve);
-}
-void MDS::resolve_done()
-{
-  dout(1) << "resolve_done" << dendl;
-  request_state(MDSMap::STATE_RECONNECT);
-}
-
-void MDS::reconnect_start()
-{
-  dout(1) << "reconnect_start" << dendl;
-
-  if (last_state == MDSMap::STATE_REPLAY)
-    reopen_log();
-
-  server->reconnect_clients();
-  finish_contexts(g_ceph_context, waiting_for_reconnect);
-}
-void MDS::reconnect_done()
-{
-  dout(1) << "reconnect_done" << dendl;
-  request_state(MDSMap::STATE_REJOIN);    // move to rejoin state
-}
-
-void MDS::rejoin_joint_start()
-{
-  dout(1) << "rejoin_joint_start" << dendl;
-  mdcache->rejoin_send_rejoins();
-}
-void MDS::rejoin_start()
-{
-  dout(1) << "rejoin_start" << dendl;
-  mdcache->rejoin_start();
-}
-void MDS::rejoin_done()
-{
-  dout(1) << "rejoin_done" << dendl;
-  mdcache->show_subtrees();
-  mdcache->show_cache();
-
-  // funny case: is our cache empty?  no subtrees?
-  if (!mdcache->is_subtrees()) {
-    dout(1) << " empty cache, no subtrees, leaving cluster" << dendl;
-    request_state(MDSMap::STATE_STOPPED);
-    return;
-  }
-
-  if (replay_queue.empty())
-    request_state(MDSMap::STATE_ACTIVE);
-  else
-    request_state(MDSMap::STATE_CLIENTREPLAY);
-}
-
-void MDS::clientreplay_start()
-{
-  dout(1) << "clientreplay_start" << dendl;
-  finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
-  queue_one_replay();
-}
-
-void MDS::clientreplay_done()
-{
-  dout(1) << "clientreplay_done" << dendl;
-  request_state(MDSMap::STATE_ACTIVE);
-}
-
-void MDS::active_start()
-{
-  dout(1) << "active_start" << dendl;
-
-  if (last_state == MDSMap::STATE_CREATING)
-    mdcache->open_root();
-
-  mdcache->clean_open_file_lists();
-  mdcache->export_remaining_imported_caps();
-  finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
-  finish_contexts(g_ceph_context, waiting_for_active);  // kick waiters
-}
-
-void MDS::recovery_done(int oldstate)
-{
-  dout(1) << "recovery_done -- successful recovery!" << dendl;
-  assert(is_clientreplay() || is_active());
-  
-  // kick snaptable (resent AGREEs)
-  if (mdsmap->get_tableserver() == whoami) {
-    set<mds_rank_t> active;
-    mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
-    snapserver->finish_recovery(active);
-  }
-
-  if (oldstate == MDSMap::STATE_CREATING)
-    return;
-
-  mdcache->start_recovered_truncates();
-  mdcache->do_file_recover();
-
-  mdcache->reissue_all_caps();
-  
-  // tell connected clients
-  //bcast_mds_map();     // not anymore, they get this from the monitor
-
-  mdcache->populate_mydir();
-}
-
-void MDS::handle_mds_recovery(mds_rank_t who) 
-{
-  dout(5) << "handle_mds_recovery mds." << who << dendl;
-  
-  mdcache->handle_mds_recovery(who);
-
-  if (mdsmap->get_tableserver() == whoami) {
-    snapserver->handle_mds_recovery(who);
-  }
-
-  queue_waiters(waiting_for_active_peer[who]);
-  waiting_for_active_peer.erase(who);
-}
-
-void MDS::handle_mds_failure(mds_rank_t who)
-{
-  if (who == whoami) {
-    dout(5) << "handle_mds_failure for myself; not doing anything" << dendl;
-    return;
-  }
-  dout(5) << "handle_mds_failure mds." << who << dendl;
-
-  mdcache->handle_mds_failure(who);
-
-  snapclient->handle_mds_failure(who);
-}
-
-void MDS::stopping_start()
-{
-  dout(2) << "stopping_start" << dendl;
-
-  if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) {
-    // we're the only mds up!
-    dout(0) << "we are the last MDS, and have mounted clients: we cannot flush our journal.  suicide!" << dendl;
-    suicide();
-  }
-
-  mdcache->shutdown_start();
-}
-
-void MDS::stopping_done()
-{
-  dout(2) << "stopping_done" << dendl;
-
-  // tell monitor we shut down cleanly.
-  request_state(MDSMap::STATE_STOPPED);
-}
-
-void MDS::handle_signal(int signum)
-{
-  assert(signum == SIGINT || signum == SIGTERM);
-  derr << "*** got signal " << sys_siglist[signum] << " ***" << dendl;
-  mds_lock.Lock();
-  suicide();
-  mds_lock.Unlock();
-}
-
-void MDS::suicide()
-{
-  assert(mds_lock.is_locked());
-  set_want_state(MDSMap::STATE_DNE); // whatever.
-
-  dout(1) << "suicide.  wanted " << ceph_mds_state_name(want_state)
-	  << ", now " << ceph_mds_state_name(state) << dendl;
-
-  mdlog->shutdown();
-
-  finisher.stop(); // no flushing
-
-  // stop timers
-  beacon.shutdown();
-  if (tick_event) {
-    timer.cancel_event(tick_event);
-    tick_event = 0;
-  }
-  timer.cancel_all_events();
-  //timer.join();
-  timer.shutdown();
-  
-  clean_up_admin_socket();
-
-  // shut down cache
-  mdcache->shutdown();
-
-  if (objecter->initialized.read())
-    objecter->shutdown();
-
-  monc->shutdown();
-
-  op_tracker.on_shutdown();
-
-  progress_thread.shutdown();
-
-  // shut down messenger
-  messenger->shutdown();
-
-  // Workaround unclean shutdown: HeartbeatMap will assert if
-  // worker is not removed (as we do in ~MDS), but ~MDS is not
-  // always called after suicide.
-  if (hb) {
-    cct->get_heartbeat_map()->remove_worker(hb);
-    hb = NULL;
-  }
-}
-
-void MDS::respawn()
-{
-  dout(1) << "respawn" << dendl;
-
-  char *new_argv[orig_argc+1];
-  dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
-  for (int i=0; i<orig_argc; i++) {
-    new_argv[i] = (char *)orig_argv[i];
-    dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
-  }
-  new_argv[orig_argc] = NULL;
-
-  /* Determine the path to our executable, try to read
-   * linux-specific /proc/ path first */
-  char exe_path[PATH_MAX];
-  ssize_t exe_path_bytes = readlink("/proc/self/exe", exe_path,
-				    sizeof(exe_path) - 1);
-  if (exe_path_bytes < 0) {
-    /* Print CWD for the user's interest */
-    char buf[PATH_MAX];
-    char *cwd = getcwd(buf, sizeof(buf));
-    assert(cwd);
-    dout(1) << " cwd " << cwd << dendl;
-
-    /* Fall back to a best-effort: just running in our CWD */
-    strncpy(exe_path, orig_argv[0], sizeof(exe_path) - 1);
-  } else {
-    exe_path[exe_path_bytes] = '\0';
-  }
-
-  dout(1) << " exe_path " << exe_path << dendl;
-
-  unblock_all_signals(NULL);
-  execv(exe_path, new_argv);
-
-  dout(0) << "respawn execv " << orig_argv[0]
-	  << " failed with " << cpp_strerror(errno) << dendl;
-  suicide();
-}
-
-void MDS::handle_write_error(int err)
-{
-  if (err == -EBLACKLISTED) {
-    derr << "we have been blacklisted (fenced), respawning..." << dendl;
-    respawn();
-    return;
-  }
-
-  if (g_conf->mds_action_on_write_error >= 2) {
-    derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl;
-    suicide();
-  } else if (g_conf->mds_action_on_write_error == 1) {
-    derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl;
-    mdcache->force_readonly();
-  } else {
-    // ignore;
-    derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl;
-  }
-}
-
-bool MDS::ms_dispatch(Message *m)
-{
-  bool ret;
-  mds_lock.Lock();
-
-  heartbeat_reset();
-
-  if (want_state == CEPH_MDS_STATE_DNE) {
-    dout(10) << " stopping, discarding " << *m << dendl;
-    m->put();
-    ret = true;
-  } else {
-    inc_dispatch_depth();
-    ret = _dispatch(m);
-    dec_dispatch_depth();
-  }
-  mds_lock.Unlock();
-  return ret;
-}
-
-bool MDS::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
-{
-  dout(10) << "MDS::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
-
-  /* monitor authorization is being handled on different layer */
-  if (dest_type == CEPH_ENTITY_TYPE_MON)
-    return true;
-
-  if (force_new) {
-    if (monc->wait_auth_rotating(10) < 0)
-      return false;
-  }
-
-  *authorizer = monc->auth->build_authorizer(dest_type);
-  return *authorizer != NULL;
-}
-
-
-#define ALLOW_MESSAGES_FROM(peers) \
-do { \
-  if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
-    dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
-           << " allowing=" << #peers << " message=" << *m << dendl; \
-    m->put();							    \
-    return true; \
-  } \
-} while (0)
-
-
-/*
- * high priority messages we always process
- */
-bool MDS::handle_core_message(Message *m)
-{
-  switch (m->get_type()) {
-  case CEPH_MSG_MON_MAP:
-    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
-    m->put();
-    break;
-
-    // MDS
-  case CEPH_MSG_MDS_MAP:
-    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
-    handle_mds_map(static_cast<MMDSMap*>(m));
-    break;
-
-    // misc
-  case MSG_MON_COMMAND:
-    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
-    handle_command(static_cast<MMonCommand*>(m));
-    break;    
-
-    // OSD
-  case MSG_COMMAND:
-    handle_command(static_cast<MCommand*>(m));
-    break;
-  case CEPH_MSG_OSD_MAP:
-    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
-
-    if (is_active() && snapserver) {
-      snapserver->check_osd_map(true);
-    }
-
-    server->handle_osd_map();
-
-    // By default the objecter only requests OSDMap updates on use,
-    // we would like to always receive the latest maps in order to
-    // apply policy based on the FULL flag.
-    objecter->maybe_request_map();
-
-    break;
-
-  default:
-    return false;
-  }
-  return true;
-}
-
-/*
- * lower priority messages we defer if we seem laggy
- */
-bool MDS::handle_deferrable_message(Message *m)
-{
-  int port = m->get_type() & 0xff00;
-
-  switch (port) {
-  case MDS_PORT_CACHE:
-    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
-    mdcache->dispatch(m);
-    break;
-    
-  case MDS_PORT_MIGRATOR:
-    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
-    mdcache->migrator->dispatch(m);
-    break;
-    
-  default:
-    switch (m->get_type()) {
-      // SERVER
-    case CEPH_MSG_CLIENT_SESSION:
-    case CEPH_MSG_CLIENT_RECONNECT:
-      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
-      // fall-thru
-    case CEPH_MSG_CLIENT_REQUEST:
-      server->dispatch(m);
-      break;
-    case MSG_MDS_SLAVE_REQUEST:
-      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
-      server->dispatch(m);
-      break;
-      
-    case MSG_MDS_HEARTBEAT:
-      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
-      balancer->proc_message(m);
-      break;
-	  
-    case MSG_MDS_TABLE_REQUEST:
-      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
-      {
-	MMDSTableRequest *req = static_cast<MMDSTableRequest*>(m);
-	if (req->op < 0) {
-	  MDSTableClient *client = get_table_client(req->table);
-	      client->handle_request(req);
-	} else {
-	  MDSTableServer *server = get_table_server(req->table);
-	  server->handle_request(req);
-	}
-      }
-      break;
-
-    case MSG_MDS_LOCK:
-    case MSG_MDS_INODEFILECAPS:
-      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
-      locker->dispatch(m);
-      break;
-      
-    case CEPH_MSG_CLIENT_CAPS:
-    case CEPH_MSG_CLIENT_CAPRELEASE:
-    case CEPH_MSG_CLIENT_LEASE:
-      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
-      locker->dispatch(m);
-      break;
-      
-    default:
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool MDS::is_stale_message(Message *m)
-{
-  // from bad mds?
-  if (m->get_source().is_mds()) {
-    mds_rank_t from = mds_rank_t(m->get_source().num());
-    if (!mdsmap->have_inst(from) ||
-	mdsmap->get_inst(from) != m->get_source_inst() ||
-	mdsmap->is_down(from)) {
-      // bogus mds?
-      if (m->get_type() == CEPH_MSG_MDS_MAP) {
-	dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
-		<< ", but it's an mdsmap, looking at it" << dendl;
-      } else if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
-		 mdsmap->get_inst(from) == m->get_source_inst()) {
-	dout(5) << "got " << *m << " from down mds " << m->get_source()
-		<< ", but it's a cache_expire, looking at it" << dendl;
-      } else {
-	dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
-		<< ", dropping" << dendl;
-	return true;
-      }
-    }
-  }
-  return false;
-}
-
-/**
- * Advance finished_queue and waiting_for_nolaggy.
- *
- * Usually drain both queues, but may not drain waiting_for_nolaggy
- * if beacon is currently laggy.
- */
-void MDS::_advance_queues()
-{
-  assert(mds_lock.is_locked_by_me());
-
-  while (!finished_queue.empty()) {
-    dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl;
-    dout(10) << finished_queue << dendl;
-    list<MDSInternalContextBase*> ls;
-    ls.swap(finished_queue);
-    while (!ls.empty()) {
-      dout(10) << " finish " << ls.front() << dendl;
-      ls.front()->complete(0);
-      ls.pop_front();
-
-      heartbeat_reset();
-    }
-  }
-
-  while (!waiting_for_nolaggy.empty()) {
-    // stop if we're laggy now!
-    if (beacon.is_laggy())
-      break;
-
-    Message *old = waiting_for_nolaggy.front();
-    waiting_for_nolaggy.pop_front();
-
-    if (is_stale_message(old)) {
-      old->put();
-    } else {
-      dout(7) << " processing laggy deferred " << *old << dendl;
-      handle_deferrable_message(old);
-    }
-
-    heartbeat_reset();
-  }
-}
-
-/* If this function returns true, it has put the message. If it returns false,
- * it has not put the message. */
-bool MDS::_dispatch(Message *m)
-{
-  if (is_stale_message(m)) {
-    m->put();
-    return true;
-  }
-
-  // core
-  if (!handle_core_message(m)) {
-    if (beacon.is_laggy()) {
-      dout(10) << " laggy, deferring " << *m << dendl;
-      waiting_for_nolaggy.push_back(m);
-    } else {
-      if (!handle_deferrable_message(m)) {
-	dout(0) << "unrecognized message " << *m << dendl;
-	m->put();
-	return false;
-      }
-    }
-  }
-
-  if (dispatch_depth > 1)
-    return true;
-
-  // finish any triggered contexts
-  _advance_queues();
-
-  if (beacon.is_laggy()) {
-    // We've gone laggy during dispatch, don't do any
-    // more housekeeping
-    return true;
-  }
-
-  // done with all client replayed requests?
-  if (is_clientreplay() &&
-      mdcache->is_open() &&
-      replay_queue.empty() &&
-      want_state == MDSMap::STATE_CLIENTREPLAY) {
-    int num_requests = mdcache->get_num_client_requests();
-    dout(10) << " still have " << num_requests << " active replay requests" << dendl;
-    if (num_requests == 0)
-      clientreplay_done();
-  }
-
-  // hack: thrash exports
-  static utime_t start;
-  utime_t now = ceph_clock_now(g_ceph_context);
-  if (start == utime_t()) 
-    start = now;
-  /*double el = now - start;
-  if (el > 30.0 &&
-    el < 60.0)*/
-  for (int i=0; i<g_conf->mds_thrash_exports; i++) {
-    set<mds_rank_t> s;
-    if (!is_active()) break;
-    mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
-    if (s.size() < 2 || mdcache->get_num_inodes() < 10) 
-      break;  // need peers for this to work.
-
-    dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf->mds_thrash_exports << dendl;
-    
-    // pick a random dir inode
-    CInode *in = mdcache->hack_pick_random_inode();
-
-    list<CDir*> ls;
-    in->get_dirfrags(ls);
-    if (ls.empty())
-      continue;                // must be an open dir.
-    list<CDir*>::iterator p = ls.begin();
-    int n = rand() % ls.size();
-    while (n--)
-      ++p;
-    CDir *dir = *p;
-    if (!dir->get_parent_dir()) continue;    // must be linked.
-    if (!dir->is_auth()) continue;           // must be auth.
-
-    mds_rank_t dest;
-    do {
-      int k = rand() % s.size();
-      set<mds_rank_t>::iterator p = s.begin();
-      while (k--) ++p;
-      dest = *p;
-    } while (dest == whoami);
-    mdcache->migrator->export_dir_nicely(dir,dest);
-  }
-  // hack: thrash fragments
-  for (int i=0; i<g_conf->mds_thrash_fragments; i++) {
-    if (!is_active()) break;
-    if (mdcache->get_num_fragmenting_dirs() > 5) break;
-    dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf->mds_thrash_fragments << dendl;
-    
-    // pick a random dir inode
-    CInode *in = mdcache->hack_pick_random_inode();
-
-    list<CDir*> ls;
-    in->get_dirfrags(ls);
-    if (ls.empty()) continue;                // must be an open dir.
-    CDir *dir = ls.front();
-    if (!dir->get_parent_dir()) continue;    // must be linked.
-    if (!dir->is_auth()) continue;           // must be auth.
-    frag_t fg = dir->get_frag();
-    if (fg == frag_t() || (rand() % (1 << fg.bits()) == 0))
-      mdcache->split_dir(dir, 1);
-    else
-      balancer->queue_merge(dir);
-  }
-
-  // hack: force hash root?
-  /*
-  if (false &&
-      mdcache->get_root() &&
-      mdcache->get_root()->dir &&
-      !(mdcache->get_root()->dir->is_hashed() || 
-        mdcache->get_root()->dir->is_hashing())) {
-    dout(0) << "hashing root" << dendl;
-    mdcache->migrator->hash_dir(mdcache->get_root()->dir);
-  }
-  */
-
-  if (mlogger) {
-    mlogger->set(l_mdm_ino, g_num_ino);
-    mlogger->set(l_mdm_dir, g_num_dir);
-    mlogger->set(l_mdm_dn, g_num_dn);
-    mlogger->set(l_mdm_cap, g_num_cap);
-
-    mlogger->inc(l_mdm_inoa, g_num_inoa);  g_num_inoa = 0;
-    mlogger->inc(l_mdm_inos, g_num_inos);  g_num_inos = 0;
-    mlogger->inc(l_mdm_dira, g_num_dira);  g_num_dira = 0;
-    mlogger->inc(l_mdm_dirs, g_num_dirs);  g_num_dirs = 0;
-    mlogger->inc(l_mdm_dna, g_num_dna);  g_num_dna = 0;
-    mlogger->inc(l_mdm_dns, g_num_dns);  g_num_dns = 0;
-    mlogger->inc(l_mdm_capa, g_num_capa);  g_num_capa = 0;
-    mlogger->inc(l_mdm_caps, g_num_caps);  g_num_caps = 0;
-
-    mlogger->set(l_mdm_buf, buffer::get_total_alloc());
-
-  }
-
-  // shut down?
-  if (is_stopping()) {
-    mdlog->trim();
-    if (mdcache->shutdown_pass()) {
-      dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to down:stopped" << dendl;
-      stopping_done();
-    }
-    else {
-      dout(7) << "shutdown_pass=false" << dendl;
-    }
-  }
-  return true;
-}
-
-
-
-
-void MDS::ms_handle_connect(Connection *con) 
-{
-}
-
-bool MDS::ms_handle_reset(Connection *con) 
-{
-  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT)
-    return false;
-
-  Mutex::Locker l(mds_lock);
-  dout(5) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
-  if (want_state == CEPH_MDS_STATE_DNE)
-    return false;
-
-  Session *session = static_cast<Session *>(con->get_priv());
-  if (session) {
-    if (session->is_closed()) {
-      dout(3) << "ms_handle_reset closing connection for session " << session->info.inst << dendl;
-      con->mark_down();
-      con->set_priv(NULL);
-    }
-    session->put();
-  } else {
-    con->mark_down();
-  }
-  return false;
-}
-
-
-void MDS::ms_handle_remote_reset(Connection *con) 
-{
-  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT)
-    return;
-
-  Mutex::Locker l(mds_lock);
-  dout(5) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
-  if (want_state == CEPH_MDS_STATE_DNE)
-    return;
-
-  Session *session = static_cast<Session *>(con->get_priv());
-  if (session) {
-    if (session->is_closed()) {
-      dout(3) << "ms_handle_remote_reset closing connection for session " << session->info.inst << dendl;
-      con->mark_down();
-      con->set_priv(NULL);
-    }
-    session->put();
-  }
-}
-
-bool MDS::ms_verify_authorizer(Connection *con, int peer_type,
-			       int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
-			       bool& is_valid, CryptoKey& session_key)
-{
-  Mutex::Locker l(mds_lock);
-  if (want_state == CEPH_MDS_STATE_DNE)
-    return false;
-
-  AuthAuthorizeHandler *authorize_handler = 0;
-  switch (peer_type) {
-  case CEPH_ENTITY_TYPE_MDS:
-    authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
-    break;
-  default:
-    authorize_handler = authorize_handler_service_registry->get_handler(protocol);
-  }
-  if (!authorize_handler) {
-    dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
-    is_valid = false;
-    return true;
-  }
-
-  AuthCapsInfo caps_info;
-  EntityName name;
-  uint64_t global_id;
-
-  is_valid = authorize_handler->verify_authorizer(cct, monc->rotating_secrets,
-						  authorizer_data, authorizer_reply, name, global_id, caps_info, session_key);
-
-  if (is_valid) {
-    // wire up a Session* to this connection, and add it to the session map
-    entity_name_t n(con->get_peer_type(), global_id);
-    Session *s = sessionmap.get_session(n);
-    if (!s) {
-      s = new Session;
-      s->info.inst.addr = con->get_peer_addr();
-      s->info.inst.name = n;
-      dout(10) << " new session " << s << " for " << s->info.inst << " con " << con << dendl;
-      con->set_priv(s);
-      s->connection = con;
-    } else {
-      dout(10) << " existing session " << s << " for " << s->info.inst << " existing con " << s->connection
-	       << ", new/authorizing con " << con << dendl;
-      con->set_priv(s->get());
-
-
-
-      // Wait until we fully accept the connection before setting
-      // s->connection.  In particular, if there are multiple incoming
-      // connection attempts, they will all get their authorizer
-      // validated, but some of them may "lose the race" and get
-      // dropped.  We only want to consider the winner(s).  See
-      // ms_handle_accept().  This is important for Sessions we replay
-      // from the journal on recovery that don't have established
-      // messenger state; we want the con from only the winning
-      // connect attempt(s).  (Normal reconnects that don't follow MDS
-      // recovery are reconnected to the existing con by the
-      // messenger.)
-    }
-
-    if (caps_info.allow_all) {
-        // Flag for auth providers that don't provide cap strings
-        s->auth_caps.set_allow_all();
-    }
-
-    bufferlist::iterator p = caps_info.caps.begin();
-    string auth_cap_str;
-    try {
-      ::decode(auth_cap_str, p);
-
-      dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
-      std::ostringstream errstr;
-      if (!s->auth_caps.parse(auth_cap_str, &errstr)) {
-        dout(1) << __func__ << ": auth cap parse error: " << errstr.str()
-          << " parsing '" << auth_cap_str << "'" << dendl;
-      }
-    } catch (buffer::error& e) {
-      // Assume legacy auth, defaults to:
-      //  * permit all filesystem ops
-      //  * permit no `tell` ops
-      dout(1) << __func__ << ": cannot decode auth caps bl of length " << caps_info.caps.length() << dendl;
-    }
-  }
-
-  return true;  // we made a decision (see is_valid)
-}
-
-
-void MDS::ms_handle_accept(Connection *con)
-{
-  Mutex::Locker l(mds_lock);
-  Session *s = static_cast<Session *>(con->get_priv());
-  dout(10) << "ms_handle_accept " << con->get_peer_addr() << " con " << con << " session " << s << dendl;
-  if (s) {
-    if (s->connection != con) {
-      dout(10) << " session connection " << s->connection << " -> " << con << dendl;
-      s->connection = con;
-
-      // send out any queued messages
-      while (!s->preopen_out_queue.empty()) {
-	con->send_message(s->preopen_out_queue.front());
-	s->preopen_out_queue.pop_front();
-      }
-    }
-    s->put();
-  }
-}
-
-void MDS::set_want_state(MDSMap::DaemonState newstate)
-{
-  if (want_state != newstate) {
-    dout(10) << __func__ << " "
-      << ceph_mds_state_name(want_state) << " -> "
-      << ceph_mds_state_name(newstate) << dendl;
-    want_state = newstate;
-    beacon.notify_want_state(newstate);
-  }
-}
-
-/**
- * Call this when you take mds_lock, or periodically if you're going to
- * hold the lock for a long time (e.g. iterating over clients/inodes)
- */
-void MDS::heartbeat_reset()
-{
-  // Any thread might jump into mds_lock and call us immediately
-  // after a call to suicide() completes, in which case MDS::hb
-  // has been freed and we are a no-op.
-  if (!hb) {
-      assert(state == CEPH_MDS_STATE_DNE);
-      return;
-  }
-
-  // NB not enabling suicide grace, because the mon takes care of killing us
-  // (by blacklisting us) when we fail to send beacons, and it's simpler to
-  // only have one way of dying.
-  cct->get_heartbeat_map()->reset_timeout(hb, g_conf->mds_beacon_grace, 0);
-}
-
-
-void *MDS::ProgressThread::entry()
-{
-  Mutex::Locker l(mds->mds_lock);
-  while (true) {
-    while (!stopping &&
-	   mds->finished_queue.empty() &&
-	   (mds->waiting_for_nolaggy.empty() || mds->beacon.is_laggy())) {
-      cond.Wait(mds->mds_lock);
-    }
-
-    if (stopping) {
-      break;
-    }
-
-    mds->_advance_queues();
-  }
-
-  return NULL;
-}
-
-
-void MDS::ProgressThread::shutdown()
-{
-  assert(mds->mds_lock.is_locked_by_me());
-
-  stopping = true;
-  cond.Signal();
-  mds->mds_lock.Unlock();
-  if (is_started())
-    join();
-  mds->mds_lock.Lock();
-}
-
-/**
- * This is used whenever a RADOS operation has been cancelled
- * or a RADOS client has been blacklisted, to cause the MDS and
- * any clients to wait for this OSD epoch before using any new caps.
- *
- * See doc/cephfs/eviction
- */
-void MDS::set_osd_epoch_barrier(epoch_t e)
-{
-  dout(4) << __func__ << ": epoch=" << e << dendl;
-  osd_epoch_barrier = e;
-}
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
deleted file mode 100644
index 5da3a8b..0000000
--- a/src/mds/MDS.h
+++ /dev/null
@@ -1,496 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
- * Foundation.  See file COPYING.
- * 
- */
-
-
-
-#ifndef CEPH_MDS_H
-#define CEPH_MDS_H
-
-#include "mdstypes.h"
-
-#include "msg/Dispatcher.h"
-#include "include/CompatSet.h"
-#include "include/types.h"
-#include "include/Context.h"
-#include "common/DecayCounter.h"
-#include "common/perf_counters.h"
-#include "common/Mutex.h"
-#include "common/Cond.h"
-#include "common/Timer.h"
-#include "common/LogClient.h"
-#include "common/TrackedOp.h"
-#include "common/Finisher.h"
-#include "common/cmdparse.h"
-
-#include "MDSMap.h"
-
-#include "SessionMap.h"
-#include "Beacon.h"
-
-
-#define CEPH_MDS_PROTOCOL    24 /* cluster internal */
-
-enum {
-  l_mds_first = 2000,
-  l_mds_request,
-  l_mds_reply,
-  l_mds_reply_latency,
-  l_mds_forward,
-  l_mds_dir_fetch,
-  l_mds_dir_commit,
-  l_mds_dir_split,
-  l_mds_inode_max,
-  l_mds_inodes,
-  l_mds_inodes_top,
-  l_mds_inodes_bottom,
-  l_mds_inodes_pin_tail,
-  l_mds_inodes_pinned,
-  l_mds_inodes_expired,
-  l_mds_inodes_with_caps,
-  l_mds_caps,
-  l_mds_subtrees,
-  l_mds_traverse,
-  l_mds_traverse_hit,
-  l_mds_traverse_forward,
-  l_mds_traverse_discover,
-  l_mds_traverse_dir_fetch,
-  l_mds_traverse_remote_ino,
-  l_mds_traverse_lock,
-  l_mds_load_cent,
-  l_mds_dispatch_queue_len,
-  l_mds_exported,
-  l_mds_exported_inodes,
-  l_mds_imported,
-  l_mds_imported_inodes,
-  l_mds_last,
-};
-
-// memory utilization
-enum {
-  l_mdm_first = 2500,
-  l_mdm_ino,
-  l_mdm_inoa,
-  l_mdm_inos,
-  l_mdm_dir,
-  l_mdm_dira,
-  l_mdm_dirs,
-  l_mdm_dn,
-  l_mdm_dna,
-  l_mdm_dns,
-  l_mdm_cap,
-  l_mdm_capa,
-  l_mdm_caps,
-  l_mdm_rss,
-  l_mdm_heap,
-  l_mdm_malloc,
-  l_mdm_buf,
-  l_mdm_last,
-};
-
-
-
-namespace ceph {
-  struct heartbeat_handle_d;
-}
-class filepath;
-
-class MonClient;
-
-class Objecter;
-class Filer;
-
-class Server;
-class Locker;
-class MDCache;
-class MDLog;
-class MDBalancer;
-class MDSInternalContextBase;
-
-class CInode;
-class CDir;
-class CDentry;
-
-class Messenger;
-class Message;
-
-class MClientRequest;
-class MClientReply;
-
-class MMDSBeacon;
-
-class InoTable;
-class SnapServer;
-class SnapClient;
-
-class MDSTableServer;
-class MDSTableClient;
-
-class AuthAuthorizeHandlerRegistry;
-
-class MDS : public Dispatcher, public md_config_obs_t {
- public:
-  Mutex        mds_lock;
-  SafeTimer    timer;
-
- private:
-  ceph::heartbeat_handle_d *hb;  // Heartbeat for threads using mds_lock
-  void heartbeat_reset();
-  Beacon  beacon;
-  void set_want_state(MDSMap::DaemonState newstate);
- public:
-  utime_t get_laggy_until() {return beacon.get_laggy_until();}
-
-  AuthAuthorizeHandlerRegistry *authorize_handler_cluster_registry;
-  AuthAuthorizeHandlerRegistry *authorize_handler_service_registry;
-
-  string name;
-  mds_rank_t whoami;
-  int incarnation;
-
-  mds_rank_t standby_for_rank;
-  MDSMap::DaemonState standby_type;  // one of STANDBY_REPLAY, ONESHOT_REPLAY
-  string standby_for_name;
-  bool standby_replaying;  // true if current replay pass is in standby-replay mode
-
-  Messenger    *messenger;
-  MonClient    *monc;
-  MDSMap       *mdsmap;
-  Objecter     *objecter;
-  Filer        *filer;       // for reading/writing to/from osds
-  LogClient    log_client;
-  LogChannelRef clog;
-
-  // sub systems
-  Server       *server;
-  MDCache      *mdcache;
-  Locker       *locker;
-  MDLog        *mdlog;
-  MDBalancer   *balancer;
-
-  InoTable     *inotable;
-
-  SnapServer   *snapserver;
-  SnapClient   *snapclient;
-
-  MDSTableClient *get_table_client(int t);
-  MDSTableServer *get_table_server(int t);
-
-  PerfCounters       *logger, *mlogger;
-  OpTracker    op_tracker;
-
-  Finisher finisher;
-
-  int orig_argc;
-  const char **orig_argv;
-
- protected:
-  // -- MDS state --
-  MDSMap::DaemonState last_state;
-  MDSMap::DaemonState state;         // my confirmed state
-  MDSMap::DaemonState want_state;    // the state i want
-
-  list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
-  list<MDSInternalContextBase*> replay_queue;
-  map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
-  list<Message*> waiting_for_nolaggy;
-  map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
-
-  map<mds_rank_t, version_t> peer_mdsmap_epoch;
-
-  ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)
-
-  epoch_t osd_epoch_barrier;
-
- public:
-  void set_osd_epoch_barrier(epoch_t e);
-  epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
-
-  void wait_for_active(MDSInternalContextBase *c) { 
-    waiting_for_active.push_back(c); 
-  }
-  void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) { 
-    waiting_for_active_peer[who].push_back(c);
-  }
-  void wait_for_replay(MDSInternalContextBase *c) { 
-    waiting_for_replay.push_back(c); 
-  }
-  void wait_for_reconnect(MDSInternalContextBase *c) {
-    waiting_for_reconnect.push_back(c);
-  }
-  void wait_for_resolve(MDSInternalContextBase *c) {
-    waiting_for_resolve.push_back(c);
-  }
-  void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
-    waiting_for_mdsmap[e].push_back(c);
-  }
-  void enqueue_replay(MDSInternalContextBase *c) {
-    replay_queue.push_back(c);
-  }
-
-  MDSMap::DaemonState get_state() { return state; } 
-  MDSMap::DaemonState get_want_state() { return want_state; } 
-  bool is_creating() { return state == MDSMap::STATE_CREATING; }
-  bool is_starting() { return state == MDSMap::STATE_STARTING; }
-  bool is_standby()  { return state == MDSMap::STATE_STANDBY; }
-  bool is_replay()   { return state == MDSMap::STATE_REPLAY; }
-  bool is_standby_replay() { return state == MDSMap::STATE_STANDBY_REPLAY; }
-  bool is_resolve()  { return state == MDSMap::STATE_RESOLVE; }
-  bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; }
-  bool is_rejoin()   { return state == MDSMap::STATE_REJOIN; }
-  bool is_clientreplay()   { return state == MDSMap::STATE_CLIENTREPLAY; }
-  bool is_active()   { return state == MDSMap::STATE_ACTIVE; }
-  bool is_stopping() { return state == MDSMap::STATE_STOPPING; }
-
-  bool is_oneshot_replay()   { return state == MDSMap::STATE_ONESHOT_REPLAY; }
-  bool is_any_replay() { return (is_replay() || is_standby_replay() ||
-                                 is_oneshot_replay()); }
-
-  bool is_stopped()  { return mdsmap->is_stopped(whoami); }
-
-  void request_state(MDSMap::DaemonState s);
-
-  ceph_tid_t issue_tid() { return ++last_tid; }
-    
-
-  // -- waiters --
-private:
-  list<MDSInternalContextBase*> finished_queue;
-  void _advance_queues();
-public:
-
-  void queue_waiter(MDSInternalContextBase *c) {
-    finished_queue.push_back(c);
-    progress_thread.signal();
-  }
-  void queue_waiters(list<MDSInternalContextBase*>& ls) {
-    finished_queue.splice( finished_queue.end(), ls );
-    progress_thread.signal();
-  }
-  bool queue_one_replay() {
-    if (replay_queue.empty())
-      return false;
-    queue_waiter(replay_queue.front());
-    replay_queue.pop_front();
-    return true;
-  }
-  
-  // tick and other timer fun
-  class C_MDS_Tick : public MDSInternalContext {
-  public:
-    C_MDS_Tick(MDS *m) : MDSInternalContext(m) {}
-    void finish(int r) {
-      mds->tick_event = 0;
-      mds->tick();
-    }
-  } *tick_event;
-  void     reset_tick();
-
-  // -- client map --
-  SessionMap   sessionmap;
-  epoch_t      last_client_mdsmap_bcast;
-  //void log_clientmap(Context *c);
-
-
-  // shutdown crap
-  int req_rate;
-
-  // ino's and fh's
- public:
-
-  int get_req_rate() { return req_rate; }
-  Session *get_session(client_t client) {
-    return sessionmap.get_session(entity_name_t::CLIENT(client.v));
-  }
-
- private:
-  int dispatch_depth;
-  bool ms_dispatch(Message *m);
-  bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
-  bool ms_verify_authorizer(Connection *con, int peer_type,
-			       int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
-			       bool& isvalid, CryptoKey& session_key);
-  void ms_handle_accept(Connection *con);
-  void ms_handle_connect(Connection *con);
-  bool ms_handle_reset(Connection *con);
-  void ms_handle_remote_reset(Connection *con);
-
-private:
-  class ProgressThread : public Thread {
-    MDS *mds;
-    bool stopping;
-    Cond cond;
-  public:
-    ProgressThread(MDS *mds_) : mds(mds_), stopping(false) {}
-    void * entry(); 
-    void shutdown();
-    void signal() {cond.Signal();}
-  } progress_thread;
-  void _progress_thread();
-
- public:
-  MDS(const std::string &n, Messenger *m, MonClient *mc);
-  ~MDS();
-
-  // handle a signal (e.g., SIGTERM)
-  void handle_signal(int signum);
-
-  // who am i etc
-  mds_rank_t get_nodeid() const { return whoami; }
-  uint64_t get_metadata_pool() { return mdsmap->get_metadata_pool(); }
-  MDSMap *get_mds_map() { return mdsmap; }
-
-  void send_message_mds(Message *m, mds_rank_t mds);
-  void forward_message_mds(Message *req, mds_rank_t mds);
-
-  void send_message_client_counted(Message *m, client_t client);
-  void send_message_client_counted(Message *m, Session *session);
-  void send_message_client_counted(Message *m, Connection *connection);
-  void send_message_client_counted(Message *m, const ConnectionRef& con) {
-    send_message_client_counted(m, con.get());
-  }
-  void send_message_client(Message *m, Session *session);
-  void send_message(Message *m, Connection *c);
-  void send_message(Message *m, const ConnectionRef& c) {
-    send_message(m, c.get());
-  }
-
-  // start up, shutdown
-  int init(MDSMap::DaemonState wanted_state=MDSMap::STATE_BOOT);
-
-  // admin socket handling
-  friend class MDSSocketHook;
-  class MDSSocketHook *asok_hook;
-  bool asok_command(string command, cmdmap_t& cmdmap, string format,
-		    ostream& ss);
-  void set_up_admin_socket();
-  void clean_up_admin_socket();
-  void check_ops_in_flight(); // send off any slow ops to monitor
-  void command_scrub_path(Formatter *f, const string& path);
-  void command_flush_path(Formatter *f, const string& path);
-  void command_flush_journal(Formatter *f);
-  void command_get_subtrees(Formatter *f);
-  void command_export_dir(Formatter *f,
-      const std::string &path, mds_rank_t dest);
- private:
-  int _command_export_dir(const std::string &path, mds_rank_t dest);
-  int _command_flush_journal(std::stringstream *ss);
- public:
-    // config observer bits
-  virtual const char** get_tracked_conf_keys() const;
-  virtual void handle_conf_change(const struct md_config_t *conf,
-				  const std::set <std::string> &changed);
-  void create_logger();
-  void update_log_config();
-
-  void bcast_mds_map();  // to mounted clients
-
-  void boot_create();             // i am new mds.
-
- private:
-  typedef enum {
-    // The MDSMap is available, configure default layouts and structures
-    MDS_BOOT_INITIAL = 0,
-    // We are ready to open some inodes
-    MDS_BOOT_OPEN_ROOT,
-    // We are ready to do a replay if needed
-    MDS_BOOT_PREPARE_LOG,
-    // Replay is complete
-    MDS_BOOT_REPLAY_DONE
-  } BootStep;
-
-  friend class C_MDS_BootStart;
-  friend class C_MDS_InternalBootStart;
-  void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);    // starting|replay
-  void calc_recovery_set();
- public:
-
-  void replay_start();
-  void creating_done();
-  void starting_done();
-  void replay_done();
-  void standby_replay_restart();
-  void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
-  class C_MDS_StandbyReplayRestart;
-  class C_MDS_StandbyReplayRestartFinish;
-
-  void reopen_log();
-
-  void resolve_start();
-  void resolve_done();
-  void reconnect_start();
-  void reconnect_done();
-  void rejoin_joint_start();
-  void rejoin_start();
-  void rejoin_done();
-  void recovery_done(int oldstate);
-  void clientreplay_start();
-  void clientreplay_done();
-  void active_start();
-  void stopping_start();
-  void stopping_done();
-
-  void handle_mds_recovery(mds_rank_t who);
-  void handle_mds_failure(mds_rank_t who);
-
-  void suicide();
-  void respawn();
-  void handle_write_error(int err);
-
-  void tick();
-  
-
-  void inc_dispatch_depth() { ++dispatch_depth; }
-  void dec_dispatch_depth() { --dispatch_depth; }
-
-  // messages
-  bool _dispatch(Message *m);
-
-  protected:
-  bool is_stale_message(Message *m);
-
-  bool handle_core_message(Message *m);
-  bool handle_deferrable_message(Message *m);
-  
-  // special message types
-  int _handle_command_legacy(std::vector<std::string> args);
-  int _handle_command(
-      const cmdmap_t &cmdmap,
-      bufferlist const &inbl,
-      bufferlist *outbl,
-      std::string *outs,
-      Context **run_later);
-  void handle_command(class MMonCommand *m);
-  void handle_command(class MCommand *m);
-  void handle_mds_map(class MMDSMap *m);
-};
-
-
-/* This expects to be given a reference which it is responsible for.
- * The finish function calls functions which
- * will put the Message exactly once.*/
-class C_MDS_RetryMessage : public MDSInternalContext {
-  Message *m;
-public:
-  C_MDS_RetryMessage(MDS *mds, Message *m) : MDSInternalContext(mds) {
-    assert(m);
-    this->m = m;
-  }
-  virtual void finish(int r) {
-    mds->inc_dispatch_depth();
-    mds->_dispatch(m);
-    mds->dec_dispatch_depth();
-  }
-};
-
-#endif
diff --git a/src/mds/MDSContext.cc b/src/mds/MDSContext.cc
index 9a9a6f1..d1c3276 100644
--- a/src/mds/MDSContext.cc
+++ b/src/mds/MDSContext.cc
@@ -13,7 +13,7 @@
  */
 
 
-#include "MDS.h"
+#include "MDSRank.h"
 
 #include "MDSContext.h"
 
@@ -22,7 +22,7 @@
 
 
 void MDSInternalContextBase::complete(int r) {
-  MDS *mds = get_mds();
+  MDSRank *mds = get_mds();
 
   dout(10) << "MDSInternalContextBase::complete: " << typeid(*this).name() << dendl;
   assert(mds != NULL);
@@ -31,11 +31,11 @@ void MDSInternalContextBase::complete(int r) {
 }
 
 
-MDS *MDSInternalContext::get_mds() {
+MDSRank *MDSInternalContext::get_mds() {
   return mds;
 }
 
-MDS *MDSInternalContextWrapper::get_mds()
+MDSRank *MDSInternalContextWrapper::get_mds()
 {
   return mds;
 }
@@ -47,11 +47,17 @@ void MDSInternalContextWrapper::finish(int r)
 
 
 void MDSIOContextBase::complete(int r) {
-  MDS *mds = get_mds();
+  MDSRank *mds = get_mds();
 
   dout(10) << "MDSIOContextBase::complete: " << typeid(*this).name() << dendl;
   assert(mds != NULL);
   Mutex::Locker l(mds->mds_lock);
+  if (mds->is_daemon_stopping()) {
+    dout(4) << "MDSIOContextBase::complete: dropping for stopping "
+            << typeid(*this).name() << dendl;
+    return;
+  }
+
   if (r == -EBLACKLISTED) {
     derr << "MDSIOContextBase: blacklisted!  Restarting..." << dendl;
     mds->respawn();
@@ -60,11 +66,11 @@ void MDSIOContextBase::complete(int r) {
   }
 }
 
-MDS *MDSIOContext::get_mds() {
+MDSRank *MDSIOContext::get_mds() {
   return mds;
 }
 
-MDS *MDSIOContextWrapper::get_mds() {
+MDSRank *MDSIOContextWrapper::get_mds() {
   return mds;
 }
 
@@ -73,7 +79,7 @@ void MDSIOContextWrapper::finish(int r)
   fin->complete(r);
 }
 
-MDS *MDSInternalContextGather::get_mds()
+MDSRank *MDSInternalContextGather::get_mds()
 {
   derr << "Forbidden call to MDSInternalContextGather::get_mds by " << typeid(*this).name() << dendl;
   assert(0);
diff --git a/src/mds/MDSContext.h b/src/mds/MDSContext.h
index a5e8458..cd49f3e 100644
--- a/src/mds/MDSContext.h
+++ b/src/mds/MDSContext.h
@@ -18,7 +18,7 @@
 
 #include "include/Context.h"
 
-class MDS;
+class MDSRank;
 
 
 /**
@@ -31,7 +31,7 @@ class MDS;
 class MDSContext : public Context
 {
 protected:
-  virtual MDS *get_mds() = 0;
+  virtual MDSRank *get_mds() = 0;
 };
 
 
@@ -51,11 +51,11 @@ public:
 class MDSInternalContext : public MDSInternalContextBase
 {
 protected:
-  MDS *mds;
-  virtual MDS* get_mds();
+  MDSRank *mds;
+  virtual MDSRank* get_mds();
 
 public:
-  MDSInternalContext(MDS *mds_) : mds(mds_) {
+  MDSInternalContext(MDSRank *mds_) : mds(mds_) {
     assert(mds != NULL);
   }
 };
@@ -67,11 +67,11 @@ public:
 class MDSInternalContextWrapper : public MDSInternalContextBase
 {
 protected:
-  MDS *mds;
+  MDSRank *mds;
   Context *fin;
-  MDS *get_mds();
+  MDSRank *get_mds();
 public:
-  MDSInternalContextWrapper(MDS *m, Context *c) : mds(m), fin(c) {}
+  MDSInternalContextWrapper(MDSRank *m, Context *c) : mds(m), fin(c) {}
   void finish(int r);
 };
 
@@ -81,17 +81,17 @@ class MDSIOContextBase : public MDSContext
 };
 
 /**
- * Completion for an I/O operation, takes big MDS lock
+ * Completion for an I/O operation, takes big MDSRank lock
  * before executing finish function.
  */
 class MDSIOContext : public MDSIOContextBase
 {
 protected:
-  MDS *mds;
-  virtual MDS* get_mds();
+  MDSRank *mds;
+  virtual MDSRank* get_mds();
 
 public:
-  MDSIOContext(MDS *mds_) : mds(mds_) {
+  MDSIOContext(MDSRank *mds_) : mds(mds_) {
     assert(mds != NULL);
   }
 };
@@ -103,11 +103,11 @@ public:
 class MDSIOContextWrapper : public MDSIOContextBase
 {
 protected:
-  MDS *mds;
+  MDSRank *mds;
   Context *fin;
-  MDS *get_mds();
+  MDSRank *get_mds();
 public:
-  MDSIOContextWrapper(MDS *m, Context *c) : mds(m), fin(c) {}
+  MDSIOContextWrapper(MDSRank *m, Context *c) : mds(m), fin(c) {}
   void finish(int r);
 };
 
@@ -116,7 +116,7 @@ public:
  */
 class C_MDSInternalNoop : public MDSInternalContextBase
 {
-  virtual MDS* get_mds() {assert(0);}
+  virtual MDSRank* get_mds() {assert(0);}
 public:
   void finish(int r) {}
   void complete(int r) {}
@@ -132,7 +132,7 @@ class C_IO_Wrapper : public MDSIOContext
 private:
   MDSInternalContextBase *wrapped;
 public:
-  C_IO_Wrapper(MDS *mds_, MDSInternalContextBase *wrapped_) : MDSIOContext(mds_), wrapped(wrapped_) {
+  C_IO_Wrapper(MDSRank *mds_, MDSInternalContextBase *wrapped_) : MDSIOContext(mds_), wrapped(wrapped_) {
     assert(wrapped != NULL);
   }
   virtual void finish(int r) {
@@ -147,7 +147,7 @@ public:
 class MDSInternalContextGather : public MDSInternalContextBase
 {
 protected:
-  MDS *get_mds();
+  MDSRank *get_mds();
 };
 
 
@@ -156,7 +156,7 @@ class MDSGather : public C_GatherBase<MDSInternalContextBase, MDSInternalContext
 public:
   MDSGather(CephContext *cct, MDSInternalContextBase *onfinish) : C_GatherBase<MDSInternalContextBase, MDSInternalContextGather>(cct, onfinish) {}
 protected:
-  virtual MDS *get_mds() {return NULL;}
+  virtual MDSRank *get_mds() {return NULL;}
 };
 
 
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
new file mode 100644
index 0000000..cf54e82
--- /dev/null
+++ b/src/mds/MDSDaemon.cc
@@ -0,0 +1,1348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <unistd.h>
+
+#include "global/signal_handler.h"
+
+#include "include/types.h"
+#include "include/str_list.h"
+#include "common/entity_name.h"
+#include "common/Clock.h"
+#include "common/signal.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+
+#include "msg/Messenger.h"
+#include "mon/MonClient.h"
+
+#include "osdc/Objecter.h"
+
+#include "MDSMap.h"
+
+#include "MDSDaemon.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "MDBalancer.h"
+#include "Migrator.h"
+
+#include "SnapServer.h"
+#include "SnapClient.h"
+
+#include "InoTable.h"
+
+#include "common/HeartbeatMap.h"
+
+#include "common/perf_counters.h"
+
+#include "common/Timer.h"
+
+#include "events/ESession.h"
+#include "events/ESubtreeMap.h"
+
+#include "messages/MMDSMap.h"
+#include "messages/MMDSBeacon.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MMonCommand.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+
+#include "auth/AuthAuthorizeHandler.h"
+#include "auth/KeyRing.h"
+
+#include "common/config.h"
+
+#include "perfglue/cpu_profiler.h"
+#include "perfglue/heap_profiler.h"
+
+
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << name << ' '
+
+/**
+ * Helper for simple callbacks that call a void fn with no args.
+ */
+class C_VoidFn : public Context
+{
+  typedef void (MDSDaemon::*fn_ptr)();
+  protected:
+   MDSDaemon *mds;
+   fn_ptr fn;
+  public:
+  C_VoidFn(MDSDaemon *mds_, fn_ptr fn_)
+    : mds(mds_), fn(fn_)
+  {
+    assert(mds_);
+    assert(fn_);
+  }
+
+  void finish(int r)
+  {
+    (mds->*fn)();
+  }
+};
+
+// cons/des
+MDSDaemon::MDSDaemon(const std::string &n, Messenger *m, MonClient *mc) : 
+  Dispatcher(m->cct),
+  mds_lock("MDSDaemon::mds_lock"),
+  stopping(false),
+  timer(m->cct, mds_lock),
+  beacon(m->cct, mc, n),
+  authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(m->cct,
+								      m->cct->_conf->auth_supported.empty() ?
+								      m->cct->_conf->auth_cluster_required :
+								      m->cct->_conf->auth_supported)),
+  authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(m->cct,
+								      m->cct->_conf->auth_supported.empty() ?
+								      m->cct->_conf->auth_service_required :
+								      m->cct->_conf->auth_supported)),
+  name(n),
+  messenger(m),
+  monc(mc),
+  objecter(new Objecter(m->cct, m, mc, NULL, 0, 0)),
+  log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
+  mds_rank(NULL),
+  tick_event(0),
+  standby_for_rank(MDSMap::MDS_NO_STANDBY_PREF),
+  standby_type(MDSMap::STATE_NULL),
+  asok_hook(NULL)
+{
+  orig_argc = 0;
+  orig_argv = NULL;
+
+  clog = log_client.create_channel();
+
+  monc->set_messenger(messenger);
+
+  mdsmap = new MDSMap;
+
+  objecter->unset_honor_osdmap_full();
+}
+
+MDSDaemon::~MDSDaemon() {
+  Mutex::Locker lock(mds_lock);
+
+  delete mds_rank; 
+  mds_rank = NULL; 
+  delete objecter; 
+  objecter = NULL;
+  delete mdsmap;
+  mdsmap = NULL;
+
+  delete authorize_handler_service_registry;
+  delete authorize_handler_cluster_registry;
+}
+
+class MDSSocketHook : public AdminSocketHook {
+  MDSDaemon *mds;
+public:
+  MDSSocketHook(MDSDaemon *m) : mds(m) {}
+  bool call(std::string command, cmdmap_t& cmdmap, std::string format,
+	    bufferlist& out) {
+    stringstream ss;
+    bool r = mds->asok_command(command, cmdmap, format, ss);
+    out.append(ss);
+    return r;
+  }
+};
+
+bool MDSDaemon::asok_command(string command, cmdmap_t& cmdmap, string format,
+		    ostream& ss)
+{
+  dout(1) << "asok_command: " << command << " (starting...)" << dendl;
+
+  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
+  bool handled = false;
+  if (command == "status") {
+    const OSDMap *osdmap = objecter->get_osdmap_read();
+    const epoch_t osd_epoch = osdmap->get_epoch();
+    objecter->put_osdmap_read();
+
+    f->open_object_section("status");
+    f->dump_stream("cluster_fsid") << monc->get_fsid();
+    if (mds_rank) {
+      f->dump_unsigned("whoami", mds_rank->get_nodeid());
+    } else {
+      f->dump_unsigned("whoami", MDS_RANK_NONE);
+    }
+
+    f->dump_string("state", ceph_mds_state_name(mdsmap->get_state_gid(mds_gid_t(
+        monc->get_global_id()))));
+    f->dump_unsigned("mdsmap_epoch", mdsmap->get_epoch());
+    f->dump_unsigned("osdmap_epoch", osd_epoch);
+    if (mds_rank) {
+      f->dump_unsigned("osdmap_epoch_barrier", mds_rank->get_osd_epoch_barrier());
+    } else {
+      f->dump_unsigned("osdmap_epoch_barrier", 0);
+    }
+    f->close_section(); // status
+    handled = true;
+  } else {
+    if (mds_rank == NULL) {
+      dout(1) << "Can't run that command on an inactive MDS!" << dendl;
+      f->dump_string("error", "mds_not_active");
+    } else {
+      handled =  mds_rank->handle_asok_command(command, cmdmap, f, ss);
+    }
+
+  }
+  f->flush(ss);
+  delete f;
+  
+  dout(1) << "asok_command: " << command << " (complete)" << dendl;
+  
+  return handled;
+}
+
+void MDSDaemon::set_up_admin_socket()
+{
+  int r;
+  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+  asok_hook = new MDSSocketHook(this);
+  r = admin_socket->register_command("status", "status", asok_hook,
+				     "high-level status of MDS");
+  assert(r == 0);
+  r = admin_socket->register_command("dump_ops_in_flight",
+				     "dump_ops_in_flight", asok_hook,
+				     "show the ops currently in flight");
+  assert(r == 0);
+  r = admin_socket->register_command("ops",
+				     "ops", asok_hook,
+				     "show the ops currently in flight");
+  assert(r == 0);
+  r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
+				     asok_hook,
+				     "show slowest recent ops");
+  assert(r == 0);
+  r = admin_socket->register_command("scrub_path",
+                                     "scrub_path name=path,type=CephString",
+                                     asok_hook,
+                                     "scrub an inode and output results");
+  assert(r == 0);
+  r = admin_socket->register_command("flush_path",
+                                     "flush_path name=path,type=CephString",
+                                     asok_hook,
+                                     "flush an inode (and its dirfrags)");
+  assert(r == 0);
+  r = admin_socket->register_command("export dir",
+                                     "export dir "
+                                     "name=path,type=CephString "
+                                     "name=rank,type=CephInt",
+                                     asok_hook,
+                                     "migrate a subtree to named MDS");
+  assert(r == 0);
+  r = admin_socket->register_command("dump cache",
+                                     "dump cache name=path,type=CephString,req=false",
+                                     asok_hook,
+                                     "dump metadata cache (optionally to a file)");
+  assert(r == 0);
+  r = admin_socket->register_command("session evict",
+				     "session evict name=client_id,type=CephString",
+				     asok_hook,
+				     "Evict a CephFS client");
+  assert(r == 0);
+  r = admin_socket->register_command("osdmap barrier",
+				     "osdmap barrier name=target_epoch,type=CephInt",
+				     asok_hook,
+				     "Wait until the MDS has this OSD map epoch");
+  assert(r == 0);
+  r = admin_socket->register_command("session ls",
+				     "session ls",
+				     asok_hook,
+				     "Enumerate connected CephFS clients");
+  assert(r == 0);
+  r = admin_socket->register_command("flush journal",
+				     "flush journal",
+				     asok_hook,
+				     "Flush the journal to the backing store");
+  assert(r == 0);
+  r = admin_socket->register_command("force_readonly",
+				     "force_readonly",
+				     asok_hook,
+				     "Force MDS to read-only mode");
+  assert(r == 0);
+  r = admin_socket->register_command("get subtrees",
+				     "get subtrees",
+				     asok_hook,
+				     "Return the subtree map");
+  assert(r == 0);
+  r = admin_socket->register_command("dirfrag split",
+				     "dirfrag split "
+                                     "name=path,type=CephString,req=true "
+                                     "name=frag,type=CephString,req=true "
+                                     "name=bits,type=CephInt,req=true ",
+				     asok_hook,
+				     "Fragment directory by path");
+  assert(r == 0);
+  r = admin_socket->register_command("dirfrag merge",
+				     "dirfrag merge "
+                                     "name=path,type=CephString,req=true "
+                                     "name=frag,type=CephString,req=true",
+				     asok_hook,
+				     "De-fragment directory by path");
+  assert(r == 0);
+  r = admin_socket->register_command("dirfrag ls",
+				     "dirfrag ls "
+                                     "name=path,type=CephString,req=true",
+				     asok_hook,
+				     "List fragments in directory");
+  assert(r == 0);
+}
+
+void MDSDaemon::clean_up_admin_socket()
+{
+  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+  admin_socket->unregister_command("status");
+  admin_socket->unregister_command("dump_ops_in_flight");
+  admin_socket->unregister_command("ops");
+  admin_socket->unregister_command("dump_historic_ops");
+  admin_socket->unregister_command("scrub_path");
+  admin_socket->unregister_command("flush_path");
+  admin_socket->unregister_command("session evict");
+  admin_socket->unregister_command("session ls");
+  admin_socket->unregister_command("flush journal");
+  admin_socket->unregister_command("force_readonly");
+  delete asok_hook;
+  asok_hook = NULL;
+}
+
+const char** MDSDaemon::get_tracked_conf_keys() const
+{
+  static const char* KEYS[] = {
+    "mds_op_complaint_time", "mds_op_log_threshold",
+    "mds_op_history_size", "mds_op_history_duration",
+    // clog & admin clog
+    "clog_to_monitors",
+    "clog_to_syslog",
+    "clog_to_syslog_facility",
+    "clog_to_syslog_level",
+    NULL
+  };
+  return KEYS;
+}
+
+void MDSDaemon::handle_conf_change(const struct md_config_t *conf,
+			     const std::set <std::string> &changed)
+{
+  Mutex::Locker l(mds_lock);
+
+  if (changed.count("mds_op_complaint_time") ||
+      changed.count("mds_op_log_threshold")) {
+    if (mds_rank) {
+      mds_rank->op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time,
+                                             conf->mds_op_log_threshold);
+    }
+  }
+  if (changed.count("mds_op_history_size") ||
+      changed.count("mds_op_history_duration")) {
+    if (mds_rank) {
+      mds_rank->op_tracker.set_history_size_and_duration(conf->mds_op_history_size,
+                                               conf->mds_op_history_duration);
+    }
+  }
+  if (changed.count("clog_to_monitors") ||
+      changed.count("clog_to_syslog") ||
+      changed.count("clog_to_syslog_level") ||
+      changed.count("clog_to_syslog_facility")) {
+    if (mds_rank) {
+      mds_rank->update_log_config();
+    }
+  }
+}
+
+
+int MDSDaemon::init(MDSMap::DaemonState wanted_state)
+{
+  dout(10) << sizeof(MDSCacheObject) << "\tMDSCacheObject" << dendl;
+  dout(10) << sizeof(CInode) << "\tCInode" << dendl;
+  dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item   *7=" << 7*sizeof(elist<void*>::item) << dendl;
+  dout(10) << sizeof(inode_t) << "\t inode_t " << dendl;
+  dout(10) << sizeof(nest_info_t) << "\t  nest_info_t " << dendl;
+  dout(10) << sizeof(frag_info_t) << "\t  frag_info_t " << dendl;
+  dout(10) << sizeof(SimpleLock) << "\t SimpleLock   *5=" << 5*sizeof(SimpleLock) << dendl;
+  dout(10) << sizeof(ScatterLock) << "\t ScatterLock  *3=" << 3*sizeof(ScatterLock) << dendl;
+  dout(10) << sizeof(CDentry) << "\tCDentry" << dendl;
+  dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item" << dendl;
+  dout(10) << sizeof(SimpleLock) << "\t SimpleLock" << dendl;
+  dout(10) << sizeof(CDir) << "\tCDir " << dendl;
+  dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item   *2=" << 2*sizeof(elist<void*>::item) << dendl;
+  dout(10) << sizeof(fnode_t) << "\t fnode_t " << dendl;
+  dout(10) << sizeof(nest_info_t) << "\t  nest_info_t *2" << dendl;
+  dout(10) << sizeof(frag_info_t) << "\t  frag_info_t *2" << dendl;
+  dout(10) << sizeof(Capability) << "\tCapability " << dendl;
+  dout(10) << sizeof(xlist<void*>::item) << "\t xlist<>::item   *2=" << 2*sizeof(xlist<void*>::item) << dendl;
+
+  objecter->init();
+  messenger->add_dispatcher_tail(objecter);
+
+  messenger->add_dispatcher_tail(&beacon);
+  messenger->add_dispatcher_tail(this);
+
+  // get monmap
+  monc->set_messenger(messenger);
+
+  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS);
+  monc->init();
+
+  // tell monc about log_client so it will know about mon session resets
+  monc->set_log_client(&log_client);
+  
+  int r = monc->authenticate();
+  if (r < 0) {
+    derr << "ERROR: failed to authenticate: " << cpp_strerror(-r) << dendl;
+    mds_lock.Lock();
+    suicide();
+    mds_lock.Unlock();
+    return r;
+  }
+  while (monc->wait_auth_rotating(30.0) < 0) {
+    derr << "unable to obtain rotating service keys; retrying" << dendl;
+  }
+
+  objecter->start();
+
+  mds_lock.Lock();
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) {
+    dout(4) << __func__ << ": terminated already, dropping out" << dendl;
+    mds_lock.Unlock();
+    return 0;
+  }
+
+  monc->sub_want("mdsmap", 0, 0);
+  monc->renew_subs();
+
+  mds_lock.Unlock();
+
+  // verify that osds support tmap2omap
+  while (true) {
+    objecter->maybe_request_map();
+    objecter->wait_for_osd_map();
+    const OSDMap *osdmap = objecter->get_osdmap_read();
+    uint64_t osd_features = osdmap->get_up_osd_features();
+    if (osd_features & CEPH_FEATURE_OSD_TMAP2OMAP) {
+      objecter->put_osdmap_read();
+      break;
+    }
+    if (osdmap->get_num_up_osds() > 0) {
+        derr << "*** one or more OSDs do not support TMAP2OMAP; upgrade OSDs before starting MDS (or downgrade MDS) ***" << dendl;
+    } else {
+        derr << "*** no OSDs are up as of epoch " << osdmap->get_epoch() << ", waiting" << dendl;
+    }
+    objecter->put_osdmap_read();
+    sleep(10);
+  }
+
+  mds_lock.Lock();
+  if (beacon.get_want_state() == MDSMap::STATE_DNE) {
+    suicide();  // we could do something more graceful here
+  }
+
+  timer.init();
+
+  if (wanted_state==MDSMap::STATE_BOOT && g_conf->mds_standby_replay) {
+    wanted_state = MDSMap::STATE_STANDBY_REPLAY;
+  }
+
+  // starting beacon.  this will induce an MDSMap from the monitor
+  if (wanted_state==MDSMap::STATE_STANDBY_REPLAY ||
+      wanted_state==MDSMap::STATE_ONESHOT_REPLAY) {
+    g_conf->set_val_or_die("mds_standby_replay", "true");
+    g_conf->apply_changes(NULL);
+    if ( wanted_state == MDSMap::STATE_ONESHOT_REPLAY &&
+        (g_conf->mds_standby_for_rank == -1) &&
+        g_conf->mds_standby_for_name.empty()) {
+      // uh-oh, must specify one or the other!
+      dout(0) << "Specified oneshot replay mode but not an MDS!" << dendl;
+      suicide();
+    }
+    standby_type = wanted_state;
+    wanted_state = MDSMap::STATE_BOOT;
+  }
+
+  standby_for_rank = mds_rank_t(g_conf->mds_standby_for_rank);
+  standby_for_name.assign(g_conf->mds_standby_for_name);
+
+  if (standby_type == MDSMap::STATE_STANDBY_REPLAY &&
+      standby_for_rank == -1) {
+    if (standby_for_name.empty())
+      standby_for_rank = MDSMap::MDS_STANDBY_ANY;
+    else
+      standby_for_rank = MDSMap::MDS_STANDBY_NAME;
+  } else if (standby_type == MDSMap::STATE_NULL && !standby_for_name.empty())
+    standby_for_rank = MDSMap::MDS_MATCHED_ACTIVE;
+
+  if (wanted_state == MDSMap::STATE_NULL) {
+    wanted_state = MDSMap::STATE_BOOT;
+  }
+  beacon.init(mdsmap, wanted_state, standby_for_rank, standby_for_name);
+  messenger->set_myname(entity_name_t::MDS(MDS_RANK_NONE));
+  
+  // schedule tick
+  reset_tick();
+
+  set_up_admin_socket();
+  g_conf->add_observer(this);
+
+  mds_lock.Unlock();
+
+  return 0;
+}
+
+void MDSDaemon::reset_tick()
+{
+  // cancel old
+  if (tick_event) timer.cancel_event(tick_event);
+
+  // schedule
+  tick_event = new C_MDS_Tick(this);
+  timer.add_event_after(g_conf->mds_tick_interval, tick_event);
+}
+
+void MDSDaemon::tick()
+{
+  tick_event = 0;
+
+  // reschedule
+  reset_tick();
+
+  // Call through to subsystems' tick functions
+  if (mds_rank) {
+    mds_rank->tick();
+  }
+}
+
+/* This function DOES put the passed message before returning*/
+void MDSDaemon::handle_command(MCommand *m)
+{
+  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
+  assert(session != NULL);
+
+  int r = 0;
+  cmdmap_t cmdmap;
+  std::stringstream ss;
+  std::string outs;
+  bufferlist outbl;
+  Context *run_after = NULL;
+
+
+  if (!session->auth_caps.allow_all()) {
+    dout(1) << __func__
+      << ": received command from client without `tell` capability: "
+      << m->get_connection()->peer_addr << dendl;
+
+    ss << "permission denied";
+    r = -EPERM;
+  } else if (m->cmd.empty()) {
+    ss << "no command given";
+    outs = ss.str();
+  } else if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    r = -EINVAL;
+    outs = ss.str();
+  } else {
+    r = _handle_command(cmdmap, m->get_data(), &outbl, &outs, &run_after);
+  }
+
+  MCommandReply *reply = new MCommandReply(r, outs);
+  reply->set_tid(m->get_tid());
+  reply->set_data(outbl);
+  m->get_connection()->send_message(reply);
+
+  if (run_after) {
+    run_after->complete(0);
+  }
+
+  m->put();
+}
+
+
+struct MDSCommand {
+  string cmdstring;
+  string helpstring;
+  string module;
+  string perm;
+  string availability;
+} mds_commands[] = {
+
+#define COMMAND(parsesig, helptext, module, perm, availability) \
+  {parsesig, helptext, module, perm, availability},
+
+COMMAND("injectargs " \
+	"name=injected_args,type=CephString,n=N",
+	"inject configuration arguments into running MDS",
+	"mds", "*", "cli,rest")
+COMMAND("exit",
+	"Terminate this MDS",
+	"mds", "*", "cli,rest")
+COMMAND("respawn",
+	"Restart this MDS",
+	"mds", "*", "cli,rest")
+COMMAND("session kill " \
+        "name=session_id,type=CephInt",
+	"End a client session",
+	"mds", "*", "cli,rest")
+COMMAND("cpu_profiler " \
+	"name=arg,type=CephChoices,strings=status|flush",
+	"run cpu profiling on daemon", "mds", "rw", "cli,rest")
+COMMAND("heap " \
+	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
+	"show heap usage info (available only if compiled with tcmalloc)", \
+	"mds", "*", "cli,rest")
+};
+
+// FIXME: reinstate issue_caps, try_eval,
+//  *if* it makes sense to do so (or should these be admin socket things?)
+
+/* This function DOES put the passed message before returning*/
+void MDSDaemon::handle_command(MMonCommand *m)
+{
+  bufferlist outbl;
+  _handle_command_legacy(m->cmd);
+  m->put();
+}
+
+int MDSDaemon::_handle_command(
+    const cmdmap_t &cmdmap,
+    bufferlist const &inbl,
+    bufferlist *outbl,
+    std::string *outs,
+    Context **run_later)
+{
+  assert(outbl != NULL);
+  assert(outs != NULL);
+
+  class SuicideLater : public Context
+  {
+    MDSDaemon *mds;
+
+    public:
+    SuicideLater(MDSDaemon *mds_) : mds(mds_) {}
+    void finish(int r) {
+      // Wait a little to improve chances of caller getting
+      // our response before seeing us disappear from mdsmap
+      sleep(1);
+
+      mds->suicide();
+    }
+  };
+
+
+  class RespawnLater : public Context
+  {
+    MDSDaemon *mds;
+
+    public:
+
+    RespawnLater(MDSDaemon *mds_) : mds(mds_) {}
+    void finish(int r) {
+      // Wait a little to improve chances of caller getting
+      // our response before seeing us disappear from mdsmap
+      sleep(1);
+
+      mds->respawn();
+    }
+  };
+
+  std::stringstream ds;
+  std::stringstream ss;
+  std::string prefix;
+  cmd_getval(cct, cmdmap, "prefix", prefix);
+
+  int r = 0;
+
+  if (prefix == "get_command_descriptions") {
+    int cmdnum = 0;
+    JSONFormatter *f = new JSONFormatter();
+    f->open_object_section("command_descriptions");
+    for (MDSCommand *cp = mds_commands;
+	 cp < &mds_commands[ARRAY_SIZE(mds_commands)]; cp++) {
+
+      ostringstream secname;
+      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
+      dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
+			   cp->module, cp->perm, cp->availability);
+      cmdnum++;
+    }
+    f->close_section();	// command_descriptions
+
+    f->flush(ds);
+    delete f;
+  } else if (prefix == "injectargs") {
+    vector<string> argsvec;
+    cmd_getval(cct, cmdmap, "injected_args", argsvec);
+
+    if (argsvec.empty()) {
+      r = -EINVAL;
+      ss << "ignoring empty injectargs";
+      goto out;
+    }
+    string args = argsvec.front();
+    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
+      args += " " + *a;
+    cct->_conf->injectargs(args, &ss);
+  } else if (prefix == "exit") {
+    // We will send response before executing
+    ss << "Exiting...";
+    *run_later = new SuicideLater(this);
+  }
+  else if (prefix == "respawn") {
+    // We will send response before executing
+    ss << "Respawning...";
+    *run_later = new RespawnLater(this);
+  } else if (prefix == "session kill") {
+    if (mds_rank == NULL) {
+      r = -EINVAL;
+      ss << "MDS not active";
+    }
+    // FIXME harmonize `session kill` with admin socket session evict
+    int64_t session_id = 0;
+    bool got = cmd_getval(cct, cmdmap, "session_id", session_id);
+    assert(got);
+    const bool killed = mds_rank->kill_session(session_id);
+    if (!killed) {
+      r = -ENOENT;
+      ss << "session '" << session_id << "' not found";
+    }
+  } else if (prefix == "heap") {
+    if (!ceph_using_tcmalloc()) {
+      r = -EOPNOTSUPP;
+      ss << "could not issue heap profiler command -- not using tcmalloc!";
+    } else {
+      string heapcmd;
+      cmd_getval(cct, cmdmap, "heapcmd", heapcmd);
+      vector<string> heapcmd_vec;
+      get_str_vec(heapcmd, heapcmd_vec);
+      ceph_heap_profiler_handle_command(heapcmd_vec, ds);
+    }
+  } else if (prefix == "cpu_profiler") {
+    string arg;
+    cmd_getval(cct, cmdmap, "arg", arg);
+    vector<string> argvec;
+    get_str_vec(arg, argvec);
+    cpu_profiler_handle_command(argvec, ds);
+  } else {
+    std::ostringstream ss;
+    ss << "unrecognized command! " << prefix;
+    r = -EINVAL;
+  }
+
+out:
+  *outs = ss.str();
+  outbl->append(ds);
+  return r;
+}
+
+/**
+ * Legacy "mds tell", takes a simple array of args
+ */
+int MDSDaemon::_handle_command_legacy(std::vector<std::string> args)
+{
+  dout(10) << "handle_command args: " << args << dendl;
+  if (args[0] == "injectargs") {
+    if (args.size() < 2) {
+      derr << "Ignoring empty injectargs!" << dendl;
+    }
+    else {
+      std::ostringstream oss;
+      mds_lock.Unlock();
+      g_conf->injectargs(args[1], &oss);
+      mds_lock.Lock();
+      derr << "injectargs:" << dendl;
+      derr << oss.str() << dendl;
+    }
+  }
+  else if (args[0] == "exit") {
+    suicide();
+  }
+  else if (args[0] == "respawn") {
+    respawn();
+  }
+  else if (args[0] == "cpu_profiler") {
+    ostringstream ss;
+    cpu_profiler_handle_command(args, ss);
+    clog->info() << ss.str();
+  }
+  else if (args[0] == "heap") {
+    if (!ceph_using_tcmalloc())
+      clog->info() << "tcmalloc not enabled, can't use heap profiler commands\n";
+    else {
+      ostringstream ss;
+      vector<std::string> cmdargs;
+      cmdargs.insert(cmdargs.begin(), args.begin()+1, args.end());
+      ceph_heap_profiler_handle_command(cmdargs, ss);
+      clog->info() << ss.str();
+    }
+  } else {
+    if (!(mds_rank && mds_rank->handle_command_legacy(args))) {
+      dout(0) << "unrecognized command! " << args << dendl;
+    }
+  }
+
+  return 0;
+}
+
+/* This function deletes the passed message before returning. */
+
+void MDSDaemon::handle_mds_map(MMDSMap *m)
+{
+  version_t epoch = m->get_epoch();
+  dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << dendl;
+
+  // is it new?
+  if (epoch <= mdsmap->get_epoch()) {
+    dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() 
+	    << ", discarding" << dendl;
+    m->put();
+    return;
+  }
+
+  entity_addr_t addr;
+
+  // keep old map, for a moment
+  MDSMap *oldmap = mdsmap;
+
+  // decode and process
+  mdsmap = new MDSMap;
+  mdsmap->decode(m->get_encoded());
+  const MDSMap::DaemonState new_state = mdsmap->get_state_gid(mds_gid_t(monc->get_global_id()));
+  const int incarnation = mdsmap->get_inc_gid(mds_gid_t(monc->get_global_id()));
+
+  monc->sub_got("mdsmap", mdsmap->get_epoch());
+
+  // Calculate my effective rank (either my owned rank or my
+  // standby_for_rank if in standby replay)
+  mds_rank_t whoami = mdsmap->get_rank_gid(mds_gid_t(monc->get_global_id()));
+
+  // verify compatset
+  CompatSet mdsmap_compat(get_mdsmap_compat_set_all());
+  dout(10) << "     my compat " << mdsmap_compat << dendl;
+  dout(10) << " mdsmap compat " << mdsmap->compat << dendl;
+  if (!mdsmap_compat.writeable(mdsmap->compat)) {
+    dout(0) << "handle_mds_map mdsmap compatset " << mdsmap->compat
+	    << " not writeable with daemon features " << mdsmap_compat
+	    << ", killing myself" << dendl;
+    suicide();
+    goto out;
+  }
+
+  // mark down any failed peers
+  for (map<mds_gid_t,MDSMap::mds_info_t>::const_iterator p = oldmap->get_mds_info().begin();
+       p != oldmap->get_mds_info().end();
+       ++p) {
+    if (mdsmap->get_mds_info().count(p->first) == 0) {
+      dout(10) << " peer mds gid " << p->first << " removed from map" << dendl;
+      messenger->mark_down(p->second.addr);
+    }
+  }
+
+  // If I was put into standby replay, but I am configured for a different standby
+  // type, ignore the map's state and request my standby type (only used
+  // for oneshot replay?)
+  if (new_state == MDSMap::STATE_STANDBY_REPLAY) {
+    if (standby_type != MDSMap::STATE_NULL && standby_type != MDSMap::STATE_STANDBY_REPLAY) {
+      beacon.set_want_state(mdsmap, standby_type);
+      beacon.send();
+      goto out;
+    }
+  }
+
+  if (whoami == MDS_RANK_NONE && (
+      new_state == MDSMap::STATE_STANDBY_REPLAY || new_state == MDSMap::STATE_ONESHOT_REPLAY)) {
+    whoami = mdsmap->get_mds_info_gid(mds_gid_t(monc->get_global_id())).standby_for_rank;
+  }
+
+  // see who i am
+  addr = messenger->get_myaddr();
+  dout(10) << "map says i am " << addr << " mds." << whoami << "." << incarnation
+	   << " state " << ceph_mds_state_name(new_state) << dendl;
+
+  if (whoami == MDS_RANK_NONE) {
+    if (mds_rank != NULL) {
+      // We have entered a rank-holding state, we shouldn't be back
+      // here!
+      if (g_conf->mds_enforce_unique_name) {
+        if (mds_gid_t existing = mdsmap->find_mds_gid_by_name(name)) {
+          MDSMap::mds_info_t& i = mdsmap->get_info_gid(existing);
+          if (i.global_id > monc->get_global_id()) {
+            dout(1) << "handle_mds_map i (" << addr
+                    << ") dne in the mdsmap, new instance has larger gid " << i.global_id
+                    << ", suicide" << dendl;
+            // Call suicide() rather than respawn() because if someone else
+            // has taken our ID, we don't want to keep restarting and
+            // fighting them for the ID.
+            suicide();
+            return;
+          }
+        }
+      }
+
+      dout(1) << "handle_mds_map i (" << addr
+          << ") dne in the mdsmap, respawning myself" << dendl;
+      respawn();
+    }
+    // MDSRank not active: process the map here to see if we have
+    // been assigned a rank.
+    dout(10) <<  __func__ << ": handling map in rankless mode" << dendl;
+    _handle_mds_map(oldmap);
+  } else {
+
+    // Did we already hold a different rank?  MDSMonitor shouldn't try
+    // to change that out from under me!
+    if (mds_rank && whoami != mds_rank->get_nodeid()) {
+      derr << "Invalid rank transition " << mds_rank->get_nodeid() << "->"
+           << whoami << dendl;
+      respawn();
+    }
+
+    // Did I previously not hold a rank?  Initialize!
+    if (mds_rank == NULL) {
+      mds_rank = new MDSRankDispatcher(whoami, mds_lock, clog,
+          timer, beacon, mdsmap, messenger, monc, objecter,
+          new C_VoidFn(this, &MDSDaemon::respawn),
+          new C_VoidFn(this, &MDSDaemon::suicide));
+      dout(10) <<  __func__ << ": initializing MDS rank "
+               << mds_rank->get_nodeid() << dendl;
+      mds_rank->init();
+    }
+
+    // MDSRank is active: let him process the map, we have no say.
+    dout(10) <<  __func__ << ": handling map as rank " 
+             << mds_rank->get_nodeid() << dendl;
+    mds_rank->handle_mds_map(m, oldmap);
+  }
+
+out:
+  beacon.notify_mdsmap(mdsmap);
+  m->put();
+  delete oldmap;
+}
+
+void MDSDaemon::_handle_mds_map(MDSMap *oldmap)
+{
+  MDSMap::DaemonState new_state = mdsmap->get_state_gid(mds_gid_t(monc->get_global_id()));
+
+  // Normal rankless case, we're marked as standby
+  if (new_state == MDSMap::STATE_STANDBY) {
+    beacon.set_want_state(mdsmap, new_state);
+    dout(1) << "handle_mds_map standby" << dendl;
+
+    if (standby_type != MDSMap::STATE_NULL) {// we want to be in standby_replay or oneshot_replay!
+      beacon.set_want_state(mdsmap, standby_type);
+      beacon.send();
+    }
+    return;
+  }
+
+  // Case where we thought we were standby, but MDSMap disagrees
+  if (beacon.get_want_state() == MDSMap::STATE_STANDBY) {
+    dout(10) << "dropped out of mdsmap, try to re-add myself" << dendl;
+    new_state = MDSMap::STATE_BOOT;
+    beacon.set_want_state(mdsmap, new_state);
+    return;
+  }
+  
+  // Case where we have sent a boot beacon that isn't reflected yet
+  if (beacon.get_want_state() == MDSMap::STATE_BOOT) {
+    dout(10) << "not in map yet" << dendl;
+  }
+}
+
+void MDSDaemon::handle_signal(int signum)
+{
+  assert(signum == SIGINT || signum == SIGTERM);
+  derr << "*** got signal " << sys_siglist[signum] << " ***" << dendl;
+  {
+    Mutex::Locker l(mds_lock);
+    if (stopping) {
+      return;
+    }
+    suicide();
+  }
+}
+
+void MDSDaemon::suicide()
+{
+  assert(mds_lock.is_locked());
+
+  dout(1) << "suicide.  wanted state "
+          << ceph_mds_state_name(beacon.get_want_state()) << dendl;
+
+  if (tick_event) {
+    timer.cancel_event(tick_event);
+    tick_event = 0;
+  }
+
+  //because add_observer is called after set_up_admin_socket
+  //so we can use asok_hook to avoid assert in the remove_observer
+  if (asok_hook != NULL) 
+    g_conf->remove_observer(this);
+
+  clean_up_admin_socket();
+
+  // Inform MDS we are going away, then shut down beacon
+  beacon.set_want_state(mdsmap, MDSMap::STATE_DNE);
+  if (!mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) {
+    // Notify the MDSMonitor that we're dying, so that it doesn't have to
+    // wait for us to go laggy.  Only do this if we're actually in the
+    // MDSMap, because otherwise the MDSMonitor will drop our message.
+    beacon.send_and_wait(1);
+  }
+  beacon.shutdown();
+
+  timer.shutdown();
+
+  if (mds_rank) {
+    mds_rank->shutdown();
+  } else {
+
+    if (objecter->initialized.read()) {
+      objecter->shutdown();
+    }
+
+    monc->shutdown();
+    messenger->shutdown();
+  }
+}
+
+void MDSDaemon::respawn()
+{
+  dout(1) << "respawn" << dendl;
+
+  char *new_argv[orig_argc+1];
+  dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
+  for (int i=0; i<orig_argc; i++) {
+    new_argv[i] = (char *)orig_argv[i];
+    dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
+  }
+  new_argv[orig_argc] = NULL;
+
+  /* Determine the path to our executable, try to read
+   * linux-specific /proc/ path first */
+  char exe_path[PATH_MAX];
+  ssize_t exe_path_bytes = readlink("/proc/self/exe", exe_path,
+				    sizeof(exe_path) - 1);
+  if (exe_path_bytes < 0) {
+    /* Print CWD for the user's interest */
+    char buf[PATH_MAX];
+    char *cwd = getcwd(buf, sizeof(buf));
+    assert(cwd);
+    dout(1) << " cwd " << cwd << dendl;
+
+    /* Fall back to a best-effort: just running in our CWD */
+    strncpy(exe_path, orig_argv[0], sizeof(exe_path) - 1);
+  } else {
+    exe_path[exe_path_bytes] = '\0';
+  }
+
+  dout(1) << " exe_path " << exe_path << dendl;
+
+  unblock_all_signals(NULL);
+  execv(exe_path, new_argv);
+
+  dout(0) << "respawn execv " << orig_argv[0]
+	  << " failed with " << cpp_strerror(errno) << dendl;
+
+  // We have to assert out here, because suicide() returns, and callers
+  // to respawn expect it never to return.
+  assert(0);
+}
+
+
+
+bool MDSDaemon::ms_dispatch(Message *m)
+{
+  Mutex::Locker l(mds_lock);
+  if (stopping) {
+    return false;
+  }
+
+  // Drop out early if shutting down
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) {
+    dout(10) << " stopping, discarding " << *m << dendl;
+    m->put();
+    return true;
+  }
+
+  // First see if it's a daemon message
+  const bool handled_core = handle_core_message(m); 
+  if (handled_core) {
+    return true;
+  }
+
+  // Not core, try it as a rank message
+  if (mds_rank) {
+    return mds_rank->ms_dispatch(m);
+  } else {
+    return false;
+  }
+}
+
+bool MDSDaemon::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
+{
+  dout(10) << "MDSDaemon::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
+
+  /* monitor authorization is being handled on different layer */
+  if (dest_type == CEPH_ENTITY_TYPE_MON)
+    return true;
+
+  if (force_new) {
+    if (monc->wait_auth_rotating(10) < 0)
+      return false;
+  }
+
+  *authorizer = monc->auth->build_authorizer(dest_type);
+  return *authorizer != NULL;
+}
+
+
+/*
+ * high priority messages we always process
+ */
+bool MDSDaemon::handle_core_message(Message *m)
+{
+  switch (m->get_type()) {
+  case CEPH_MSG_MON_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    m->put();
+    break;
+
+    // MDS
+  case CEPH_MSG_MDS_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
+    handle_mds_map(static_cast<MMDSMap*>(m));
+    break;
+
+    // misc
+  case MSG_MON_COMMAND:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    handle_command(static_cast<MMonCommand*>(m));
+    break;    
+
+    // OSD
+  case MSG_COMMAND:
+    handle_command(static_cast<MCommand*>(m));
+    break;
+  case CEPH_MSG_OSD_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
+
+    if (mds_rank) {
+      mds_rank->handle_osd_map();
+    }
+    break;
+
+  default:
+    return false;
+  }
+  return true;
+}
+
+void MDSDaemon::ms_handle_connect(Connection *con) 
+{
+}
+
+bool MDSDaemon::ms_handle_reset(Connection *con) 
+{
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT)
+    return false;
+
+  Mutex::Locker l(mds_lock);
+  if (stopping) {
+    return false;
+  }
+  dout(5) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE)
+    return false;
+
+  Session *session = static_cast<Session *>(con->get_priv());
+  if (session) {
+    if (session->is_closed()) {
+      dout(3) << "ms_handle_reset closing connection for session " << session->info.inst << dendl;
+      con->mark_down();
+      con->set_priv(NULL);
+    }
+    session->put();
+  } else {
+    con->mark_down();
+  }
+  return false;
+}
+
+
+void MDSDaemon::ms_handle_remote_reset(Connection *con) 
+{
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT)
+    return;
+
+  Mutex::Locker l(mds_lock);
+  if (stopping) {
+    return;
+  }
+
+  dout(5) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE)
+    return;
+
+  Session *session = static_cast<Session *>(con->get_priv());
+  if (session) {
+    if (session->is_closed()) {
+      dout(3) << "ms_handle_remote_reset closing connection for session " << session->info.inst << dendl;
+      con->mark_down();
+      con->set_priv(NULL);
+    }
+    session->put();
+  }
+}
+
+bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
+			       int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
+			       bool& is_valid, CryptoKey& session_key)
+{
+  Mutex::Locker l(mds_lock);
+  if (stopping) {
+    return false;
+  }
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE)
+    return false;
+
+  AuthAuthorizeHandler *authorize_handler = 0;
+  switch (peer_type) {
+  case CEPH_ENTITY_TYPE_MDS:
+    authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
+    break;
+  default:
+    authorize_handler = authorize_handler_service_registry->get_handler(protocol);
+  }
+  if (!authorize_handler) {
+    dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
+    is_valid = false;
+    return true;
+  }
+
+  AuthCapsInfo caps_info;
+  EntityName name;
+  uint64_t global_id;
+
+  is_valid = authorize_handler->verify_authorizer(cct, monc->rotating_secrets,
+						  authorizer_data, authorizer_reply, name, global_id, caps_info, session_key);
+
+  if (is_valid) {
+    entity_name_t n(con->get_peer_type(), global_id);
+
+    // We allow connections and assign Session instances to connections
+    // even if we have not been assigned a rank, because clients with
+    // "allow *" are allowed to connect and do 'tell' operations before
+    // we have a rank.
+    Session *s = NULL;
+    if (mds_rank) {
+      // If we do hold a rank, see if this is an existing client establishing
+      // a new connection, rather than a new client
+      s = mds_rank->sessionmap.get_session(n);
+    }
+    
+    // Wire up a Session* to this connection
+    // It doesn't go into a SessionMap instance until it sends an explicit
+    // request to open a session (initial state of Session is `closed`)
+    if (!s) {
+      s = new Session;
+      s->info.inst.addr = con->get_peer_addr();
+      s->info.inst.name = n;
+      dout(10) << " new session " << s << " for " << s->info.inst << " con " << con << dendl;
+      con->set_priv(s);
+      s->connection = con;
+    } else {
+      dout(10) << " existing session " << s << " for " << s->info.inst << " existing con " << s->connection
+	       << ", new/authorizing con " << con << dendl;
+      con->set_priv(s->get());
+
+
+
+      // Wait until we fully accept the connection before setting
+      // s->connection.  In particular, if there are multiple incoming
+      // connection attempts, they will all get their authorizer
+      // validated, but some of them may "lose the race" and get
+      // dropped.  We only want to consider the winner(s).  See
+      // ms_handle_accept().  This is important for Sessions we replay
+      // from the journal on recovery that don't have established
+      // messenger state; we want the con from only the winning
+      // connect attempt(s).  (Normal reconnects that don't follow MDS
+      // recovery are reconnected to the existing con by the
+      // messenger.)
+    }
+
+    if (caps_info.allow_all) {
+        // Flag for auth providers that don't provide cap strings
+        s->auth_caps.set_allow_all();
+    }
+
+    bufferlist::iterator p = caps_info.caps.begin();
+    string auth_cap_str;
+    try {
+      ::decode(auth_cap_str, p);
+
+      dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
+      std::ostringstream errstr;
+      if (!s->auth_caps.parse(auth_cap_str, &errstr)) {
+        dout(1) << __func__ << ": auth cap parse error: " << errstr.str()
+          << " parsing '" << auth_cap_str << "'" << dendl;
+      }
+    } catch (buffer::error& e) {
+      // Assume legacy auth, defaults to:
+      //  * permit all filesystem ops
+      //  * permit no `tell` ops
+      dout(1) << __func__ << ": cannot decode auth caps bl of length " << caps_info.caps.length() << dendl;
+    }
+  }
+
+  return true;  // we made a decision (see is_valid)
+}
+
+
+void MDSDaemon::ms_handle_accept(Connection *con)
+{
+  Mutex::Locker l(mds_lock);
+  if (stopping) {
+    return;
+  }
+
+  Session *s = static_cast<Session *>(con->get_priv());
+  dout(10) << "ms_handle_accept " << con->get_peer_addr() << " con " << con << " session " << s << dendl;
+  if (s) {
+    if (s->connection != con) {
+      dout(10) << " session connection " << s->connection << " -> " << con << dendl;
+      s->connection = con;
+
+      // send out any queued messages
+      while (!s->preopen_out_queue.empty()) {
+	con->send_message(s->preopen_out_queue.front());
+	s->preopen_out_queue.pop_front();
+      }
+    }
+    s->put();
+  }
+}
+
+bool MDSDaemon::is_clean_shutdown()
+{
+  if (mds_rank) {
+    return mds_rank->is_stopped();
+  } else {
+    return true;
+  }
+}
+
diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h
new file mode 100644
index 0000000..ba3f456
--- /dev/null
+++ b/src/mds/MDSDaemon.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef CEPH_MDS_H
+#define CEPH_MDS_H
+
+#include "mdstypes.h"
+
+#include "msg/Dispatcher.h"
+#include "include/CompatSet.h"
+#include "include/types.h"
+#include "include/Context.h"
+#include "common/DecayCounter.h"
+#include "common/perf_counters.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Timer.h"
+#include "common/LogClient.h"
+#include "common/TrackedOp.h"
+#include "common/Finisher.h"
+#include "common/cmdparse.h"
+
+#include "MDSRank.h"
+#include "MDSMap.h"
+
+#include "Beacon.h"
+
+
+#define CEPH_MDS_PROTOCOL    27 /* cluster internal */
+
+class filepath;
+
+class MonClient;
+
+class Objecter;
+class Filer;
+
+class Server;
+class Locker;
+class MDCache;
+class MDBalancer;
+class MDSInternalContextBase;
+
+class CInode;
+class CDir;
+class CDentry;
+
+class Messenger;
+class Message;
+
+class MMDSBeacon;
+
+class InoTable;
+class SnapServer;
+class SnapClient;
+
+class MDSTableServer;
+class MDSTableClient;
+
+class AuthAuthorizeHandlerRegistry;
+
+class MDSDaemon : public Dispatcher, public md_config_obs_t {
+ public:
+  /* Global MDS lock: every time someone takes this, they must
+   * also check the `stopping` flag.  If stopping is true, you
+   * must either do nothing and immediately drop the lock, or
+   * never drop the lock again (i.e. call respawn()) */
+  Mutex        mds_lock;
+  bool         stopping;
+
+  SafeTimer    timer;
+
+ protected:
+  Beacon  beacon;
+
+  AuthAuthorizeHandlerRegistry *authorize_handler_cluster_registry;
+  AuthAuthorizeHandlerRegistry *authorize_handler_service_registry;
+
+  std::string name;
+
+  Messenger    *messenger;
+  MonClient    *monc;
+  MDSMap       *mdsmap;
+  Objecter     *objecter;
+  LogClient    log_client;
+  LogChannelRef clog;
+
+  MDSRankDispatcher *mds_rank;
+
+ public:
+  MDSDaemon(const std::string &n, Messenger *m, MonClient *mc);
+  ~MDSDaemon();
+  int orig_argc;
+  const char **orig_argv;
+
+  // handle a signal (e.g., SIGTERM)
+  void handle_signal(int signum);
+
+  // start up, shutdown
+  int init(MDSMap::DaemonState wanted_state=MDSMap::STATE_BOOT);
+
+  /**
+   * Hint at whether we were shutdown gracefully (i.e. we were only
+   * in standby, or our rank was stopped).  Should be removed once
+   * we handle shutdown properly (e.g. clear out all message queues)
+   * such that deleting xlists doesn't assert.
+   */
+  bool is_clean_shutdown();
+
+  // config observer bits
+  virtual const char** get_tracked_conf_keys() const;
+  virtual void handle_conf_change(const struct md_config_t *conf,
+				  const std::set <std::string> &changed);
+ protected:
+  // tick and other timer fun
+  class C_MDS_Tick : public Context {
+    protected:
+      MDSDaemon *mds_daemon;
+  public:
+    C_MDS_Tick(MDSDaemon *m) : mds_daemon(m) {}
+    void finish(int r) {
+      assert(mds_daemon->mds_lock.is_locked_by_me());
+
+      mds_daemon->tick_event = 0;
+      mds_daemon->tick();
+    }
+  } *tick_event;
+  void     reset_tick();
+
+  void wait_for_omap_osds();
+
+  mds_rank_t standby_for_rank;
+  string standby_for_name;
+  MDSMap::DaemonState standby_type;  // one of STANDBY_REPLAY, ONESHOT_REPLAY
+
+ private:
+  bool ms_dispatch(Message *m);
+  bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
+  bool ms_verify_authorizer(Connection *con, int peer_type,
+			       int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
+			       bool& isvalid, CryptoKey& session_key);
+  void ms_handle_accept(Connection *con);
+  void ms_handle_connect(Connection *con);
+  bool ms_handle_reset(Connection *con);
+  void ms_handle_remote_reset(Connection *con);
+
+ protected:
+  // admin socket handling
+  friend class MDSSocketHook;
+  class MDSSocketHook *asok_hook;
+  bool asok_command(string command, cmdmap_t& cmdmap, string format,
+		    ostream& ss);
+  void set_up_admin_socket();
+  void clean_up_admin_socket();
+  void check_ops_in_flight(); // send off any slow ops to monitor
+
+  /**
+   * Terminate this daemon process.
+   *
+   * This function will return, but once it does so the calling thread
+   * must do no more work as all subsystems will have been shut down.
+   */
+  void suicide();
+
+  /**
+   * Start a new daemon process with the same command line parameters that
+   * this process was run with, then terminate this process
+   */
+  void respawn();
+
+  void tick();
+  
+  // messages
+  bool _dispatch(Message *m, bool new_msg);
+
+protected:
+  bool handle_core_message(Message *m);
+  
+  // special message types
+  int _handle_command_legacy(std::vector<std::string> args);
+  int _handle_command(
+      const cmdmap_t &cmdmap,
+      bufferlist const &inbl,
+      bufferlist *outbl,
+      std::string *outs,
+      Context **run_later);
+  void handle_command(class MMonCommand *m);
+  void handle_command(class MCommand *m);
+  void handle_mds_map(class MMDSMap *m);
+  void _handle_mds_map(MDSMap *oldmap);
+};
+
+
+#endif
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 831e236..796aad3 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -63,7 +63,7 @@ CompatSet get_mdsmap_compat_set_base() {
   feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE);
   CompatSet::FeatureSet feature_ro_compat_base;
 
-  return CompatSet(feature_compat_base, feature_incompat_base, feature_ro_compat_base);
+  return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base);
 }
 
 void MDSMap::mds_info_t::dump(Formatter *f) const
@@ -122,7 +122,7 @@ void MDSMap::dump(Formatter *f) const
   f->close_section();
   f->open_object_section("up");
   for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) {
-    char s[10];
+    char s[14];
     sprintf(s, "mds_%d", int(p->first));
     f->dump_int(s, p->second);
   }
@@ -131,6 +131,10 @@ void MDSMap::dump(Formatter *f) const
   for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p)
     f->dump_int("mds", *p);
   f->close_section();
+  f->open_array_section("damaged");
+  for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p)
+    f->dump_int("mds", *p);
+  f->close_section();
   f->open_array_section("stopped");
   for (set<mds_rank_t>::const_iterator p = stopped.begin(); p != stopped.end(); ++p)
     f->dump_int("mds", *p);
@@ -187,6 +191,7 @@ void MDSMap::print(ostream& out)
   out << "in\t" << in << "\n"
       << "up\t" << up << "\n"
       << "failed\t" << failed << "\n"
+      << "damaged\t" << damaged << "\n"
       << "stopped\t" << stopped << "\n";
   out << "data_pools\t" << data_pools << "\n";
   out << "metadata_pool\t" << metadata_pool << "\n";
@@ -288,6 +293,14 @@ void MDSMap::print_summary(Formatter *f, ostream *out)
       *out << ", " << failed.size() << " failed";
     }
   }
+
+  if (!damaged.empty()) {
+    if (f) {
+      f->dump_unsigned("damaged", damaged.size());
+    } else {
+      *out << ", " << damaged.size() << " damaged";
+    }
+  }
   //if (stopped.size())
   //out << ", " << stopped.size() << " stopped";
 }
@@ -312,6 +325,23 @@ void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
     }
   }
 
+  if (!damaged.empty()) {
+    std::ostringstream oss;
+    oss << "mds rank"
+	<< ((damaged.size() > 1) ? "s ":" ")
+	<< damaged
+	<< ((damaged.size() > 1) ? " are":" is")
+	<< " damaged";
+    summary.push_back(make_pair(HEALTH_ERR, oss.str()));
+    if (detail) {
+      for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) {
+	std::ostringstream oss;
+	oss << "mds." << *p << " is damaged";
+	detail->push_back(make_pair(HEALTH_ERR, oss.str()));
+      }
+    }
+  }
+
   if (is_degraded()) {
     summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
     if (detail) {
@@ -342,6 +372,9 @@ void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
   set<string> laggy;
   for (; u != u_end; ++u) {
     map<mds_gid_t, mds_info_t>::const_iterator m = mds_info.find(u->second);
+    if (m == m_end) {
+      std::cerr << "Up rank " << u->first << " GID " << u->second << " not found!" << std::endl;
+    }
     assert(m != m_end);
     const mds_info_t &mds_info(m->second);
     if (mds_info.laggy()) {
@@ -499,7 +532,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
   ::encode(cas_pool, bl);
 
   // kclient ignores everything from here
-  __u16 ev = 8;
+  __u16 ev = 9;
   ::encode(ev, bl);
   ::encode(compat, bl);
   ::encode(metadata_pool, bl);
@@ -517,6 +550,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
   ::encode(inline_data_enabled, bl);
   ::encode(enabled, bl);
   ::encode(fs_name, bl);
+  ::encode(damaged, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -598,5 +632,9 @@ void MDSMap::decode(bufferlist::iterator& p)
       enabled = false;
     }
   }
+
+  if (ev >= 9) {
+    ::decode(damaged, p);
+  }
   DECODE_FINISH(p);
 }
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 0e36c14..f4b369b 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -80,7 +80,7 @@ public:
   typedef enum {
     // States of an MDS daemon not currently holding a rank
     // ====================================================
-    STATE_NULL     =   0,                                  // null value for fns returning this type.
+    STATE_NULL     =   CEPH_MDS_STATE_NULL,                                  // null value for fns returning this type.
     STATE_BOOT     =   CEPH_MDS_STATE_BOOT,                // up, boot announcement.  destiny unknown.
     STATE_STANDBY  =   CEPH_MDS_STATE_STANDBY,             // up, idle.  waiting for assignment by monitor.
     STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY,  // up, replaying active node, ready to take over.
@@ -100,7 +100,12 @@ public:
     STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY, // up, active
     STATE_ACTIVE =     CEPH_MDS_STATE_ACTIVE,         // up, active
     STATE_STOPPING  =  CEPH_MDS_STATE_STOPPING,       // up, exporting metadata (-> standby or out)
-    STATE_DNE       =  CEPH_MDS_STATE_DNE             // down, rank does not exist
+    STATE_DNE       =  CEPH_MDS_STATE_DNE,             // down, rank does not exist
+
+    // State which a daemon may send to MDSMonitor in its beacon
+    // to indicate that offline repair is required.  Daemon must stop
+    // immediately after indicating this state.
+    STATE_DAMAGED   = CEPH_MDS_STATE_DAMAGED
 
     /*
      * In addition to explicit states, an MDS rank implicitly in state:
@@ -136,12 +141,8 @@ public:
     std::string standby_for_name;
     std::set<mds_rank_t> export_targets;
 
-#if 1
     mds_info_t() : global_id(MDS_GID_NONE), rank(MDS_RANK_NONE), inc(0), state(STATE_STANDBY), state_seq(0),
 		   standby_for_rank(MDS_NO_STANDBY_PREF) { }
-#else
-    mds_info_t();
-#endif
 
     bool laggy() const { return !(laggy_since == utime_t()); }
     void clear_laggy() { laggy_since = utime_t(); }
@@ -197,7 +198,8 @@ protected:
 
   std::set<mds_rank_t> in;              // currently defined cluster
   std::map<mds_rank_t,int32_t> inc;     // most recent incarnation.
-  std::set<mds_rank_t> failed, stopped; // which roles are failed or stopped
+  // which ranks are failed, stopped, damaged (i.e. not held by a daemon)
+  std::set<mds_rank_t> failed, stopped, damaged;
   std::map<mds_rank_t, mds_gid_t> up;        // who is in those roles
   std::map<mds_gid_t, mds_info_t> mds_info;
 
@@ -347,6 +349,17 @@ public:
   void get_failed_mds_set(std::set<mds_rank_t>& s) {
     s = failed;
   }
+
+  /**
+   * Get MDS ranks which are in but not up.
+   */
+  void get_down_mds_set(std::set<mds_rank_t> *s)
+  {
+    assert(s != NULL);
+    s->insert(failed.begin(), failed.end());
+    s->insert(damaged.begin(), damaged.end());
+  }
+
   int get_failed() {
     if (!failed.empty()) return *failed.begin();
     return -1;
@@ -518,7 +531,7 @@ public:
     return mds_rank_t(in.size()) >= max_mds;
   }
   bool is_degraded() const {   // degraded = some recovery in process.  fixes active membership and recovery_set.
-    if (!failed.empty())
+    if (!failed.empty() || !damaged.empty())
       return true;
     for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
 	 p != mds_info.end();
@@ -549,10 +562,19 @@ public:
     return up.empty();
   }
 
-  // inst
+  /**
+   * Get whether a rank is 'up', i.e. has
+   * an MDS daemon's entity_inst_t associated
+   * with it.
+   */
   bool have_inst(mds_rank_t m) {
     return up.count(m);
   }
+
+  /**
+   * Get the MDS daemon entity_inst_t for a rank
+   * known to be up.
+   */
   const entity_inst_t get_inst(mds_rank_t m) {
     assert(up.count(m));
     return mds_info[up[m]].get_inst();
@@ -561,6 +583,14 @@ public:
     assert(up.count(m));
     return mds_info[up[m]].addr;
   }
+
+  /**
+   * Get the MDS daemon entity_inst_t for a rank,
+   * if it is up.
+   * 
+   * @return true if the rank was up and the inst
+   *         was populated, else false.
+   */
   bool get_inst(mds_rank_t m, entity_inst_t& inst) {
     if (up.count(m)) {
       inst = get_inst(m);
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
new file mode 100644
index 0000000..7e2048d
--- /dev/null
+++ b/src/mds/MDSRank.cc
@@ -0,0 +1,2405 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "messages/MClientRequestForward.h"
+#include "messages/MMDSMap.h"
+
+#include "MDSMap.h"
+//#include "MDS.h"
+#include "mds_table_types.h"
+#include "SnapClient.h"
+#include "SnapServer.h"
+#include "MDBalancer.h"
+#include "messages/MMDSTableRequest.h"
+#include "Locker.h"
+#include "Server.h"
+#include "InoTable.h"
+#include "mon/MonClient.h"
+#include "common/HeartbeatMap.h"
+
+
+#include "MDSRank.h"
+
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
+
+MDSRank::MDSRank(
+    mds_rank_t whoami_,
+    Mutex &mds_lock_,
+    LogChannelRef &clog_,
+    SafeTimer &timer_,
+    Beacon &beacon_,
+    MDSMap *& mdsmap_,
+    Messenger *msgr,
+    MonClient *monc_,
+    Objecter *objecter_,
+    Context *respawn_hook_,
+    Context *suicide_hook_)
+  :
+    whoami(whoami_), incarnation(0),
+    mds_lock(mds_lock_), clog(clog_), timer(timer_),
+    mdsmap(mdsmap_),
+    objecter(objecter_),
+    server(NULL), mdcache(NULL), locker(NULL), mdlog(NULL),
+    balancer(NULL), inotable(NULL), snapserver(NULL), snapclient(NULL),
+    sessionmap(this), logger(NULL), mlogger(NULL),
+    op_tracker(g_ceph_context, g_conf->mds_enable_op_tracker, 
+               g_conf->osd_num_op_tracker_shard),
+    last_state(MDSMap::STATE_BOOT),
+    state(MDSMap::STATE_BOOT),
+    stopping(false),
+    progress_thread(this), dispatch_depth(0),
+    hb(NULL), last_tid(0), osd_epoch_barrier(0), beacon(beacon_),
+    last_client_mdsmap_bcast(0),
+    messenger(msgr), monc(monc_),
+    respawn_hook(respawn_hook_),
+    suicide_hook(suicide_hook_),
+    standby_replaying(false)
+{
+  hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank");
+
+  finisher = new Finisher(msgr->cct);
+
+  mdcache = new MDCache(this);
+  mdlog = new MDLog(this);
+  balancer = new MDBalancer(this, messenger, monc);
+
+  inotable = new InoTable(this);
+  snapserver = new SnapServer(this, monc);
+  snapclient = new SnapClient(this);
+
+  server = new Server(this);
+  locker = new Locker(this, mdcache);
+
+  op_tracker.set_complaint_and_threshold(msgr->cct->_conf->mds_op_complaint_time,
+                                         msgr->cct->_conf->mds_op_log_threshold);
+  op_tracker.set_history_size_and_duration(msgr->cct->_conf->mds_op_history_size,
+                                           msgr->cct->_conf->mds_op_history_duration);
+}
+
+MDSRank::~MDSRank()
+{
+  if (hb) {
+    g_ceph_context->get_heartbeat_map()->remove_worker(hb);
+  }
+
+  if (mdcache) { delete mdcache; mdcache = NULL; }
+  if (mdlog) { delete mdlog; mdlog = NULL; }
+  if (balancer) { delete balancer; balancer = NULL; }
+  if (inotable) { delete inotable; inotable = NULL; }
+  if (snapserver) { delete snapserver; snapserver = NULL; }
+  if (snapclient) { delete snapclient; snapclient = NULL; }
+  if (mdsmap) { delete mdsmap; mdsmap = 0; }
+
+  if (server) { delete server; server = 0; }
+  if (locker) { delete locker; locker = 0; }
+
+  if (logger) {
+    g_ceph_context->get_perfcounters_collection()->remove(logger);
+    delete logger;
+    logger = 0;
+  }
+  if (mlogger) {
+    g_ceph_context->get_perfcounters_collection()->remove(mlogger);
+    delete mlogger;
+    mlogger = 0;
+  }
+
+  delete finisher;
+  finisher = NULL;
+
+  delete suicide_hook;
+  suicide_hook = NULL;
+
+  delete respawn_hook;
+  respawn_hook = NULL;
+}
+
+void MDSRankDispatcher::init()
+{
+  update_log_config();
+  create_logger();
+
+  // Expose the OSDMap (already populated during MDS::init) to anyone
+  // who is interested in it.
+  handle_osd_map();
+
+  progress_thread.create();
+
+  finisher->start();
+}
+
+void MDSRankDispatcher::tick()
+{
+  heartbeat_reset();
+
+  if (beacon.is_laggy()) {
+    dout(5) << "tick bailing out since we seem laggy" << dendl;
+    return;
+  }
+
+  check_ops_in_flight();
+
+  // Wake up thread in case we use to be laggy and have waiting_for_nolaggy
+  // messages to progress.
+  progress_thread.signal();
+
+  // make sure mds log flushes, trims periodically
+  mdlog->flush();
+
+  if (is_active() || is_stopping()) {
+    mdcache->trim();
+    mdcache->trim_client_leases();
+    mdcache->check_memory_usage();
+    mdlog->trim();  // NOT during recovery!
+  }
+
+  // log
+  utime_t now = ceph_clock_now(g_ceph_context);
+  mds_load_t load = balancer->get_load(now);
+  
+  if (logger) {
+    logger->set(l_mds_load_cent, 100 * load.mds_load());
+    logger->set(l_mds_dispatch_queue_len, messenger->get_dispatch_queue_len());
+    logger->set(l_mds_subtrees, mdcache->num_subtrees());
+
+    mdcache->log_stat();
+  }
+
+  // ...
+  if (is_clientreplay() || is_active() || is_stopping()) {
+    server->find_idle_sessions();
+    locker->tick();
+  }
+  
+  if (is_reconnect())
+    server->reconnect_tick();
+  
+  if (is_active()) {
+    balancer->tick();
+    mdcache->find_stale_fragment_freeze();
+    mdcache->migrator->find_stale_export_freeze();
+    if (snapserver)
+      snapserver->check_osd_map(false);
+  }
+
+  // Expose ourselves to Beacon to update health indicators
+  beacon.notify_health(this);
+}
+
+void MDSRankDispatcher::shutdown()
+{
+  // It should never be possible for shutdown to get called twice, because
+  // anyone picking up mds_lock checks if stopping is true and drops
+  // out if it is.
+  assert(stopping == false);
+  stopping = true;
+
+  dout(1) << __func__ << ": shutting down rank " << whoami << dendl;
+
+  timer.shutdown();
+
+  // MDLog has to shut down before the finisher, because some of its
+  // threads block on IOs that require finisher to complete.
+  mdlog->shutdown();
+
+  finisher->stop(); // no flushing
+
+  // shut down cache
+  mdcache->shutdown();
+
+  if (objecter->initialized.read())
+    objecter->shutdown();
+
+  monc->shutdown();
+
+  op_tracker.on_shutdown();
+
+  progress_thread.shutdown();
+
+  // shut down messenger
+  messenger->shutdown();
+
+  // Workaround unclean shutdown: HeartbeatMap will assert if
+  // worker is not removed (as we do in ~MDS), but ~MDS is not
+  // always called after suicide.
+  if (hb) {
+    g_ceph_context->get_heartbeat_map()->remove_worker(hb);
+    hb = NULL;
+  }
+}
+
+/**
+ * Helper for simple callbacks that call a void fn with no args.
+ */
+class C_VoidFn : public MDSInternalContext
+{
+  typedef void (MDSRank::*fn_ptr)();
+  protected:
+   fn_ptr fn; 
+  public:
+  C_VoidFn(MDSRank *mds_, fn_ptr fn_)
+    : MDSInternalContext(mds_), fn(fn_)
+  {
+    assert(mds_);
+    assert(fn_);
+  }
+
+  void finish(int r)
+  {
+    (mds->*fn)();
+  }
+};
+
+uint64_t MDSRank::get_metadata_pool()
+{
+    return mdsmap->get_metadata_pool();
+}
+
+MDSTableClient *MDSRank::get_table_client(int t)
+{
+  switch (t) {
+  case TABLE_ANCHOR: return NULL;
+  case TABLE_SNAP: return snapclient;
+  default: assert(0);
+  }
+}
+
+MDSTableServer *MDSRank::get_table_server(int t)
+{
+  switch (t) {
+  case TABLE_ANCHOR: return NULL;
+  case TABLE_SNAP: return snapserver;
+  default: assert(0);
+  }
+}
+
+void MDSRank::suicide()
+{
+  if (suicide_hook) {
+    suicide_hook->complete(0);
+    suicide_hook = NULL;
+  }
+}
+
+void MDSRank::respawn()
+{
+  if (respawn_hook) {
+    respawn_hook->complete(0);
+    respawn_hook = NULL;
+  }
+}
+
+void MDSRank::damaged()
+{
+  assert(whoami != MDS_RANK_NONE);
+  assert(mds_lock.is_locked_by_me());
+
+  beacon.set_want_state(mdsmap, MDSMap::STATE_DAMAGED);
+  monc->flush_log();  // Flush any clog error from before we were called
+  beacon.notify_health(this);  // Include latest status in our swan song
+  beacon.send_and_wait(g_conf->mds_mon_shutdown_timeout);
+
+  // It's okay if we timed out and the mon didn't get our beacon, because
+  // another daemon (or ourselves after respawn) will eventually take the
+  // rank and report DAMAGED again when it hits same problem we did.
+
+  respawn();  // Respawn into standby in case mon has other work for us
+}
+
+void MDSRank::damaged_unlocked()
+{
+  Mutex::Locker l(mds_lock);
+  damaged();
+}
+
+/**
+ * React to a failed metadata write.
+ *
+ * -EBLACKLISTED always respawns (we have been fenced).  Otherwise the
+ * reaction is chosen by mds_action_on_write_error:
+ *   >= 2 : restart the daemon
+ *   == 1 : force the cache read-only
+ *   else : log and continue
+ *
+ * @param err negative errno from the failed write
+ */
+void MDSRank::handle_write_error(int err)
+{
+  if (err == -EBLACKLISTED) {
+    derr << "we have been blacklisted (fenced), respawning..." << dendl;
+    respawn();
+    return;
+  }
+
+  if (g_conf->mds_action_on_write_error >= 2) {
+    // NOTE(review): the message says "suicide" but respawn() is what
+    // runs here — confirm whether the wording or the call is intended.
+    derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl;
+    respawn();
+  } else if (g_conf->mds_action_on_write_error == 1) {
+    derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl;
+    mdcache->force_readonly();
+  } else {
+    // ignore;
+    derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl;
+  }
+}
+
+/**
+ * Body of the progress thread.  Under mds_lock, sleep until either
+ * finished contexts are queued, or deferred messages can be processed
+ * again (waiting_for_nolaggy is non-empty and the beacon is no longer
+ * laggy); then drain via _advance_queues().  Exits when mds->stopping
+ * is set.
+ */
+void *MDSRank::ProgressThread::entry()
+{
+  Mutex::Locker l(mds->mds_lock);
+  while (true) {
+    while (!mds->stopping &&
+	   mds->finished_queue.empty() &&
+	   (mds->waiting_for_nolaggy.empty() || mds->beacon.is_laggy())) {
+      cond.Wait(mds->mds_lock);
+    }
+
+    if (mds->stopping) {
+      break;
+    }
+
+    mds->_advance_queues();
+  }
+
+  return NULL;
+}
+
+
+/**
+ * Stop the progress thread.  Caller must hold mds_lock with
+ * mds->stopping already set.  When invoked from the progress thread
+ * itself, the main loop exits on its own; otherwise we wake it and
+ * join, dropping mds_lock around the join so the thread can finish.
+ */
+void MDSRank::ProgressThread::shutdown()
+{
+  assert(mds->mds_lock.is_locked_by_me());
+  assert(mds->stopping);
+
+  if (am_self()) {
+    // Stopping is set, we will fall out of our main loop naturally
+  } else {
+    // Kick the thread to notice mds->stopping, and join it
+    cond.Signal();
+    mds->mds_lock.Unlock();
+    if (is_started())
+      join();
+    mds->mds_lock.Lock();
+  }
+}
+
+// Messenger entry point: wrap _dispatch in a re-entrancy counter so
+// nested dispatches can skip the outermost call's housekeeping.
+bool MDSRankDispatcher::ms_dispatch(Message *m)
+{
+  inc_dispatch_depth();
+  const bool handled = _dispatch(m, true);
+  dec_dispatch_depth();
+  return handled;
+}
+
+/* Core dispatch.  Returns false only for unrecognized message types
+ * (the message is put before returning false).  Returning true means
+ * the message was consumed: routed to a subsystem, dropped as stale,
+ * or queued on waiting_for_nolaggy for later processing. */
+bool MDSRank::_dispatch(Message *m, bool new_msg)
+{
+  // Drop messages from instances the current map no longer recognizes.
+  if (is_stale_message(m)) {
+    m->put();
+    return true;
+  }
+
+  if (beacon.is_laggy()) {
+    dout(10) << " laggy, deferring " << *m << dendl;
+    waiting_for_nolaggy.push_back(m);
+  } else if (new_msg && !waiting_for_nolaggy.empty()) {
+    // Preserve ordering: new messages queue behind already-deferred ones.
+    dout(10) << " there are deferred messages, deferring " << *m << dendl;
+    waiting_for_nolaggy.push_back(m);
+  } else {
+    if (!handle_deferrable_message(m)) {
+      dout(0) << "unrecognized message " << *m << dendl;
+      m->put();
+      return false;
+    }
+  }
+
+  // Nested dispatch: only the outermost invocation does the
+  // housekeeping below.
+  if (dispatch_depth > 1)
+    return true;
+
+  // finish any triggered contexts
+  _advance_queues();
+
+  if (beacon.is_laggy()) {
+    // We've gone laggy during dispatch, don't do any
+    // more housekeeping
+    return true;
+  }
+
+  // done with all client replayed requests?
+  if (is_clientreplay() &&
+      mdcache->is_open() &&
+      replay_queue.empty() &&
+      beacon.get_want_state() == MDSMap::STATE_CLIENTREPLAY) {
+    int num_requests = mdcache->get_num_client_requests();
+    dout(10) << " still have " << num_requests << " active replay requests" << dendl;
+    if (num_requests == 0)
+      clientreplay_done();
+  }
+
+  // hack: thrash exports (testing aid, enabled by mds_thrash_exports)
+  static utime_t start;
+  utime_t now = ceph_clock_now(g_ceph_context);
+  if (start == utime_t()) 
+    start = now;
+  /*double el = now - start;
+  if (el > 30.0 &&
+    el < 60.0)*/
+  for (int i=0; i<g_conf->mds_thrash_exports; i++) {
+    set<mds_rank_t> s;
+    if (!is_active()) break;
+    mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
+    if (s.size() < 2 || mdcache->get_num_inodes() < 10) 
+      break;  // need peers for this to work.
+
+    dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf->mds_thrash_exports << dendl;
+    
+    // pick a random dir inode
+    CInode *in = mdcache->hack_pick_random_inode();
+
+    list<CDir*> ls;
+    in->get_dirfrags(ls);
+    if (!ls.empty()) {	// must be an open dir.
+      list<CDir*>::iterator p = ls.begin();
+      int n = rand() % ls.size();
+      while (n--)
+        ++p;
+      CDir *dir = *p;
+      if (!dir->get_parent_dir()) continue;    // must be linked.
+      if (!dir->is_auth()) continue;           // must be auth.
+  
+      // pick a random active peer other than ourselves as destination
+      mds_rank_t dest;
+      do {
+        int k = rand() % s.size();
+        set<mds_rank_t>::iterator p = s.begin();
+        while (k--) ++p;
+        dest = *p;
+      } while (dest == whoami);
+      mdcache->migrator->export_dir_nicely(dir,dest);
+    }
+  }
+  // hack: thrash fragments (testing aid, enabled by mds_thrash_fragments)
+  for (int i=0; i<g_conf->mds_thrash_fragments; i++) {
+    if (!is_active()) break;
+    if (mdcache->get_num_fragmenting_dirs() > 5) break;
+    dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf->mds_thrash_fragments << dendl;
+    
+    // pick a random dir inode
+    CInode *in = mdcache->hack_pick_random_inode();
+
+    list<CDir*> ls;
+    in->get_dirfrags(ls);
+    if (ls.empty()) continue;                // must be an open dir.
+    CDir *dir = ls.front();
+    if (!dir->get_parent_dir()) continue;    // must be linked.
+    if (!dir->is_auth()) continue;           // must be auth.
+    frag_t fg = dir->get_frag();
+    if (fg == frag_t() || (rand() % (1 << fg.bits()) == 0))
+      mdcache->split_dir(dir, 1);
+    else
+      balancer->queue_merge(dir);
+  }
+
+  // hack: force hash root?
+  /*
+  if (false &&
+      mdcache->get_root() &&
+      mdcache->get_root()->dir &&
+      !(mdcache->get_root()->dir->is_hashed() || 
+        mdcache->get_root()->dir->is_hashing())) {
+    dout(0) << "hashing root" << dendl;
+    mdcache->migrator->hash_dir(mdcache->get_root()->dir);
+  }
+  */
+
+  // Publish cache object counters to the memory perfcounters, and
+  // fold the per-interval allocation/free deltas into them.
+  if (mlogger) {
+    mlogger->set(l_mdm_ino, g_num_ino);
+    mlogger->set(l_mdm_dir, g_num_dir);
+    mlogger->set(l_mdm_dn, g_num_dn);
+    mlogger->set(l_mdm_cap, g_num_cap);
+
+    mlogger->inc(l_mdm_inoa, g_num_inoa);  g_num_inoa = 0;
+    mlogger->inc(l_mdm_inos, g_num_inos);  g_num_inos = 0;
+    mlogger->inc(l_mdm_dira, g_num_dira);  g_num_dira = 0;
+    mlogger->inc(l_mdm_dirs, g_num_dirs);  g_num_dirs = 0;
+    mlogger->inc(l_mdm_dna, g_num_dna);  g_num_dna = 0;
+    mlogger->inc(l_mdm_dns, g_num_dns);  g_num_dns = 0;
+    mlogger->inc(l_mdm_capa, g_num_capa);  g_num_capa = 0;
+    mlogger->inc(l_mdm_caps, g_num_caps);  g_num_caps = 0;
+
+    mlogger->set(l_mdm_buf, buffer::get_total_alloc());
+  }
+
+  // shut down?
+  if (is_stopping()) {
+    mdlog->trim();
+    if (mdcache->shutdown_pass()) {
+      dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to down:stopped" << dendl;
+      stopping_done();
+    }
+    else {
+      dout(7) << "shutdown_pass=false" << dendl;
+    }
+  }
+  return true;
+}
+
+/*
+ * Lower priority messages we defer if we seem laggy.
+ *
+ * Route a message to the owning subsystem based on its port/type,
+ * enforcing the expected source entity type via ALLOW_MESSAGES_FROM.
+ * Returns false for unrecognized types (without putting the message).
+ */
+bool MDSRank::handle_deferrable_message(Message *m)
+{
+  // The high byte of the message type selects the subsystem "port".
+  int port = m->get_type() & 0xff00;
+
+  switch (port) {
+  case MDS_PORT_CACHE:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+    mdcache->dispatch(m);
+    break;
+    
+  case MDS_PORT_MIGRATOR:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+    mdcache->migrator->dispatch(m);
+    break;
+    
+  default:
+    switch (m->get_type()) {
+      // SERVER
+    case CEPH_MSG_CLIENT_SESSION:
+    case CEPH_MSG_CLIENT_RECONNECT:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
+      // fall-thru
+    case CEPH_MSG_CLIENT_REQUEST:
+      server->dispatch(m);
+      break;
+    case MSG_MDS_SLAVE_REQUEST:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      server->dispatch(m);
+      break;
+      
+    case MSG_MDS_HEARTBEAT:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      balancer->proc_message(m);
+      break;
+	  
+    case MSG_MDS_TABLE_REQUEST:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      {
+	// Negative ops are client-side table ops, others server-side.
+	MMDSTableRequest *req = static_cast<MMDSTableRequest*>(m);
+	if (req->op < 0) {
+	  MDSTableClient *client = get_table_client(req->table);
+	      client->handle_request(req);
+	} else {
+	  MDSTableServer *server = get_table_server(req->table);
+	  server->handle_request(req);
+	}
+      }
+      break;
+
+    case MSG_MDS_LOCK:
+    case MSG_MDS_INODEFILECAPS:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      locker->dispatch(m);
+      break;
+      
+    case CEPH_MSG_CLIENT_CAPS:
+    case CEPH_MSG_CLIENT_CAPRELEASE:
+    case CEPH_MSG_CLIENT_LEASE:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
+      locker->dispatch(m);
+      break;
+      
+    default:
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Advance finished_queue and waiting_for_nolaggy.
+ *
+ * Usually drain both queues, but may not drain waiting_for_nolaggy
+ * if beacon is currently laggy.  Caller must hold mds_lock (asserted).
+ */
+void MDSRank::_advance_queues()
+{
+  assert(mds_lock.is_locked_by_me());
+
+  while (!finished_queue.empty()) {
+    dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl;
+    dout(10) << finished_queue << dendl;
+    // Swap the queue out so contexts that queue more work don't make
+    // this loop iterate over the new entries mid-drain.
+    list<MDSInternalContextBase*> ls;
+    ls.swap(finished_queue);
+    while (!ls.empty()) {
+      dout(10) << " finish " << ls.front() << dendl;
+      ls.front()->complete(0);
+      ls.pop_front();
+
+      heartbeat_reset();
+    }
+  }
+
+  while (!waiting_for_nolaggy.empty()) {
+    // stop if we're laggy now!
+    if (beacon.is_laggy())
+      break;
+
+    Message *old = waiting_for_nolaggy.front();
+    waiting_for_nolaggy.pop_front();
+
+    // Re-check staleness: the mdsmap may have changed while queued.
+    if (is_stale_message(old)) {
+      old->put();
+    } else {
+      dout(7) << " processing laggy deferred " << *old << dendl;
+      handle_deferrable_message(old);
+    }
+
+    heartbeat_reset();
+  }
+}
+
+/**
+ * Call this when you take mds_lock, or periodically if you're going to
+ * hold the lock for a long time (e.g. iterating over clients/inodes)
+ */
+void MDSRank::heartbeat_reset()
+{
+  // After suicide() completes, the heartbeat handle (hb) has been
+  // freed; a thread grabbing mds_lock right after must see a no-op.
+  if (hb == NULL) {
+    assert(stopping);
+    return;
+  }
+
+  // Suicide grace stays disabled (0): the mon kills us (by
+  // blacklisting) when beacons stop, and one kill path is simpler.
+  g_ceph_context->get_heartbeat_map()->reset_timeout(hb, g_conf->mds_beacon_grace, 0);
+}
+
+/**
+ * Return true if this message came from an MDS instance that the
+ * current map says is down, gone, or replaced, and should be dropped.
+ * MDSMap messages and cache-expires from the same instance are still
+ * accepted (return false).  Non-MDS sources are never stale.
+ */
+bool MDSRank::is_stale_message(Message *m)
+{
+  // from bad mds?
+  if (m->get_source().is_mds()) {
+    mds_rank_t from = mds_rank_t(m->get_source().num());
+    if (!mdsmap->have_inst(from) ||
+	mdsmap->get_inst(from) != m->get_source_inst() ||
+	mdsmap->is_down(from)) {
+      // bogus mds?
+      if (m->get_type() == CEPH_MSG_MDS_MAP) {
+	dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
+		<< ", but it's an mdsmap, looking at it" << dendl;
+      } else if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
+		 mdsmap->get_inst(from) == m->get_source_inst()) {
+	dout(5) << "got " << *m << " from down mds " << m->get_source()
+		<< ", but it's a cache_expire, looking at it" << dendl;
+      } else {
+	dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
+		<< ", dropping" << dendl;
+	return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+// Deliver a message directly on the given connection; a null
+// connection is a caller bug.
+void MDSRank::send_message(Message *m, Connection *c)
+{
+  assert(c != NULL);
+  c->send_message(m);
+}
+
+
+/**
+ * Send a message to another MDS rank, dropping (put) it if that rank
+ * is not up.  If our mdsmap epoch is newer than the last one we sent
+ * this peer, ship the map ahead of the message.
+ */
+void MDSRank::send_message_mds(Message *m, mds_rank_t mds)
+{
+  if (!mdsmap->is_up(mds)) {
+    dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl;
+    m->put();
+    return;
+  }
+
+  // send mdsmap first?
+  if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
+    messenger->send_message(new MMDSMap(monc->get_fsid(), mdsmap), 
+			    mdsmap->get_inst(mds));
+    peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
+  }
+
+  // send message
+  messenger->send_message(m, mdsmap->get_inst(mds));
+}
+
+/**
+ * Forward a message to another rank.  For client requests we instead
+ * tell the client to resend to the new rank (client_must_resend is
+ * always true here) and put the original message.  Only DIRUPDATE and
+ * EXPORTDIRDISCOVER messages are actually re-sent (asserted), since
+ * they encode their source mds explicitly.
+ */
+void MDSRank::forward_message_mds(Message *m, mds_rank_t mds)
+{
+  assert(mds != whoami);
+
+  // client request?
+  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
+      (static_cast<MClientRequest*>(m))->get_source().is_client()) {
+    MClientRequest *creq = static_cast<MClientRequest*>(m);
+    creq->inc_num_fwd();    // inc forward counter
+
+    /*
+     * don't actually forward if non-idempotent!
+     * client has to do it.  although the MDS will ignore duplicate requests,
+     * the affected metadata may migrate, in which case the new authority
+     * won't have the metareq_id in the completed request map.
+     */
+    // NEW: always make the client resend!  
+    bool client_must_resend = true;  //!creq->can_forward();
+
+    // tell the client where it should go
+    messenger->send_message(new MClientRequestForward(creq->get_tid(), mds, creq->get_num_fwd(),
+						      client_must_resend),
+			    creq->get_source_inst());
+    
+    if (client_must_resend) {
+      m->put();
+      return; 
+    }
+  }
+
+  // these are the only types of messages we should be 'forwarding'; they
+  // explicitly encode their source mds, which gets clobbered when we resend
+  // them here.
+  assert(m->get_type() == MSG_MDS_DIRUPDATE ||
+	 m->get_type() == MSG_MDS_EXPORTDIRDISCOVER);
+
+  // send mdsmap first?
+  if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
+    messenger->send_message(new MMDSMap(monc->get_fsid(), mdsmap), 
+			    mdsmap->get_inst(mds));
+    peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
+  }
+
+  messenger->send_message(m, mdsmap->get_inst(mds));
+}
+
+
+
+// Counted send keyed by client id: resolve to a live Session, or log
+// and do nothing when no session exists for that client.
+void MDSRank::send_message_client_counted(Message *m, client_t client)
+{
+  Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
+  if (!session) {
+    dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
+    return;
+  }
+  send_message_client_counted(m, session);
+}
+
+/**
+ * Counted send keyed by the Session stored in a Connection's private
+ * data.  If the Connection no longer owns a Session (another
+ * Connection took it over), just log and do nothing.
+ */
+void MDSRank::send_message_client_counted(Message *m, Connection *connection)
+{
+  Session *session = static_cast<Session *>(connection->get_priv());
+  if (session) {
+    session->put();  // do not carry ref
+    send_message_client_counted(m, session);
+  } else {
+    dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl;
+    // another Connection took over the Session
+  }
+}
+
+/**
+ * Send to a client session, bumping the session's push sequence
+ * number.  If the session has no live connection yet, queue the
+ * message on preopen_out_queue for delivery when one is attached.
+ */
+void MDSRank::send_message_client_counted(Message *m, Session *session)
+{
+  version_t seq = session->inc_push_seq();
+  dout(10) << "send_message_client_counted " << session->info.inst.name << " seq "
+	   << seq << " " << *m << dendl;
+  if (session->connection) {
+    session->connection->send_message(m);
+  } else {
+    session->preopen_out_queue.push_back(m);
+  }
+}
+
+// Uncounted variant: send to a client session without bumping the
+// session's push sequence number; queue when no connection is open.
+void MDSRank::send_message_client(Message *m, Session *session)
+{
+  dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl;
+  if (!session->connection) {
+    session->preopen_out_queue.push_back(m);
+    return;
+  }
+  session->connection->send_message(m);
+}
+
+/**
+ * Record the OSD epoch that the MDS and its clients must reach before
+ * using any new caps — used when a RADOS operation was cancelled or a
+ * RADOS client was blacklisted.  See doc/cephfs/eviction.
+ */
+void MDSRank::set_osd_epoch_barrier(epoch_t e)
+{
+  dout(4) << __func__ << ": epoch=" << e << dendl;
+  osd_epoch_barrier = e;
+}
+
+/**
+ * FIXME ugly call up to MDS daemon until the dispatching is separated out
+ *
+ * Re-run dispatch for a previously deferred message (new_msg=false so
+ * it is not re-queued behind other deferred messages).
+ */
+void MDSRank::retry_dispatch(Message *m)
+{
+  inc_dispatch_depth();
+  _dispatch(m, false);
+  dec_dispatch_depth();
+}
+
+// Expose the beacon's laggy-until timestamp.
+utime_t MDSRank::get_laggy_until() const
+{
+  return beacon.get_laggy_until();
+}
+
+// True once this rank has begun stopping.
+bool MDSRank::is_daemon_stopping() const
+{
+  return stopping;
+}
+
+// FIXME>> these functions are state-machine logic, not dispatch logic
+// >>>>>
+
+// Ask the mon, via our beacon, to move this rank to state s.
+void MDSRank::request_state(MDSMap::DaemonState s)
+{
+  dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
+  beacon.set_want_state(mdsmap, s);
+  beacon.send();
+}
+
+
+/**
+ * Completion that re-enters MDSRank::boot_start() at a given step,
+ * forwarding the result code of the step that just finished.
+ */
+class C_MDS_BootStart : public MDSInternalContext {
+  MDSRank::BootStep next_step;  // step to enter when this fires
+public:
+  C_MDS_BootStart(MDSRank *m, MDSRank::BootStep n)
+    : MDSInternalContext(m), next_step(n) {}
+  void finish(int r) {
+    mds->boot_start(next_step, r);
+  }
+};
+
+
+/**
+ * Drive the boot/replay sequence.  Each step kicks off asynchronous
+ * I/O whose completion re-enters this function at the next step via
+ * C_MDS_BootStart.  A negative r from the previous step is handled
+ * first: -EAGAIN in standby-replay respawns (we fell behind the
+ * journal), -EINVAL/-ENOENT marks the rank damaged, anything else is
+ * fatal (suicide).
+ *
+ * @param step the boot step whose work should be started now
+ * @param r result of the previous step (0 on the first call)
+ */
+void MDSRank::boot_start(BootStep step, int r)
+{
+  // Handle errors from previous step
+  if (r < 0) {
+    if (is_standby_replay() && (r == -EAGAIN)) {
+      dout(0) << "boot_start encountered an error EAGAIN"
+              << ", respawning since we fell behind journal" << dendl;
+      respawn();
+    } else if (r == -EINVAL || r == -ENOENT) {
+      // Invalid or absent data, indicates damaged on-disk structures
+      clog->error() << "Error loading MDS rank " << whoami << ": "
+        << cpp_strerror(r);
+      damaged();
+      assert(r == 0);  // Unreachable, damaged() calls respawn()
+    } else {
+      // Completely unexpected error, give up and die
+      dout(0) << "boot_start encountered an error, failing" << dendl;
+      suicide();
+      return;
+    }
+  }
+
+  assert(is_starting() || is_any_replay());
+
+  switch(step) {
+    case MDS_BOOT_INITIAL:
+      {
+        mdcache->init_layouts();
+
+        // Load inotable, sessionmap, log (and snaptable when we are
+        // the table server) in parallel; continue at OPEN_ROOT.
+        MDSGatherBuilder gather(g_ceph_context,
+            new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT));
+        dout(2) << "boot_start " << step << ": opening inotable" << dendl;
+        inotable->set_rank(whoami);
+        inotable->load(gather.new_sub());
+
+        dout(2) << "boot_start " << step << ": opening sessionmap" << dendl;
+        sessionmap.set_rank(whoami);
+        sessionmap.load(gather.new_sub());
+
+        dout(2) << "boot_start " << step << ": opening mds log" << dendl;
+        mdlog->open(gather.new_sub());
+
+        if (mdsmap->get_tableserver() == whoami) {
+          dout(2) << "boot_start " << step << ": opening snap table" << dendl;
+          snapserver->set_rank(whoami);
+          snapserver->load(gather.new_sub());
+        }
+
+        gather.activate();
+      }
+      break;
+    case MDS_BOOT_OPEN_ROOT:
+      {
+        dout(2) << "boot_start " << step << ": loading/discovering base inodes" << dendl;
+
+        MDSGatherBuilder gather(g_ceph_context,
+            new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
+
+        mdcache->open_mydir_inode(gather.new_sub());
+
+        if (is_starting() ||
+            whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
+          mdcache->open_root_inode(gather.new_sub());
+        } else {
+          // replay.  make up fake root inode to start with
+          mdcache->create_root_inode();
+        }
+        gather.activate();
+      }
+      break;
+    case MDS_BOOT_PREPARE_LOG:
+      if (is_any_replay()) {
+        dout(2) << "boot_start " << step << ": replaying mds log" << dendl;
+        mdlog->replay(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+      } else {
+        dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl;
+        mdlog->append();
+        starting_done();
+      }
+      break;
+    case MDS_BOOT_REPLAY_DONE:
+      assert(is_any_replay());
+      replay_done();
+      break;
+  }
+}
+
+// A freshly-started (non-replay) rank has positioned its log:
+// request ACTIVE, open the root, and begin a new log segment.
+void MDSRank::starting_done()
+{
+  dout(3) << "starting_done" << dendl;
+  assert(is_starting());
+  request_state(MDSMap::STATE_ACTIVE);
+
+  mdcache->open_root();
+
+  // start new segment
+  mdlog->start_new_segment();
+}
+
+
+// The recovery set is every rank the map reports as needing recovery,
+// excluding ourselves; hand it to mdcache.
+void MDSRank::calc_recovery_set()
+{
+  set<mds_rank_t> recovery_ranks;
+  mdsmap->get_recovery_mds_set(recovery_ranks);
+  recovery_ranks.erase(whoami);
+  mdcache->set_recovery_set(recovery_ranks);
+
+  dout(1) << " recovery set is " << recovery_ranks << dendl;
+}
+
+
+/**
+ * Enter replay: compute the recovery set, then begin the boot
+ * sequence once our osdmap has reached the epoch that blacklisted the
+ * prior instance of this rank (so its writes are fenced).  If the
+ * objecter already has a new enough map, start immediately.
+ */
+void MDSRank::replay_start()
+{
+  dout(1) << "replay_start" << dendl;
+
+  if (is_standby_replay())
+    standby_replaying = true;
+  
+  calc_recovery_set();
+
+  // Check if we need to wait for a newer OSD map before starting
+  Context *fin = new C_OnFinisher(new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL)), finisher);
+  bool const ready = objecter->wait_for_map(
+      mdsmap->get_last_failure_osd_epoch(),
+      fin);
+
+  if (ready) {
+    delete fin;
+    boot_start();
+  } else {
+    dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch() 
+	    << " (which blacklists prior instance)" << dendl;
+  }
+}
+
+
+/**
+ * IO completion for reread_head_and_probe during standby replay.
+ * Captures the journal read position from before the reread so the
+ * finish logic can detect that the active MDS trimmed past us.
+ */
+class MDSRank::C_MDS_StandbyReplayRestartFinish : public MDSIOContext {
+  uint64_t old_read_pos;
+public:
+  C_MDS_StandbyReplayRestartFinish(MDSRank *mds_, uint64_t old_read_pos_) :
+    MDSIOContext(mds_), old_read_pos(old_read_pos_) {}
+  void finish(int r) {
+    mds->_standby_replay_restart_finish(r, old_read_pos);
+  }
+};
+
+// If the active MDS trimmed the journal past where we had read to, we
+// cannot catch up incrementally: respawn.  Otherwise trim our standby
+// segments and resume the boot sequence at MDS_BOOT_PREPARE_LOG.
+void MDSRank::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
+{
+  if (old_read_pos < mdlog->get_journaler()->get_trimmed_pos()) {
+    dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl;
+    respawn(); /* we're too far back, and this is easier than
+		  trying to reset everything in the cache, etc */
+  } else {
+    mdlog->standby_trim_segments();
+    boot_start(MDS_BOOT_PREPARE_LOG, r);
+  }
+}
+
+/**
+ * (Re)start a pass of journal replay.  While still in standby we just
+ * reread the journal head and probe for new data.  On the final pass
+ * (leaving standby) we first wait for the osdmap epoch that fenced
+ * the previous incarnation, mirroring replay_start().
+ */
+inline void MDSRank::standby_replay_restart()
+{
+  dout(1) << "standby_replay_restart"
+	  << (standby_replaying ? " (as standby)":" (final takeover pass)")
+	  << dendl;
+  if (standby_replaying) {
+    /* Go around for another pass of replaying in standby */
+    mdlog->get_journaler()->reread_head_and_probe(
+      new C_MDS_StandbyReplayRestartFinish(
+        this,
+	mdlog->get_journaler()->get_read_pos()));
+  } else {
+    /* We are transitioning out of standby: wait for OSD map update
+       before making final pass */
+    Context *fin = new C_OnFinisher(new C_IO_Wrapper(this,
+          new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG)),
+      finisher);
+    bool const ready =
+      objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
+    if (ready) {
+      delete fin;
+      mdlog->get_journaler()->reread_head_and_probe(
+        new C_MDS_StandbyReplayRestartFinish(
+          this,
+	  mdlog->get_journaler()->get_read_pos()));
+    } else {
+      dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch() 
+              << " (which blacklists prior instance)" << dendl;
+    }
+  }
+}
+
+// Timer callback that triggers another standby replay pass.
+class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
+public:
+  C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
+  void finish(int r) {
+    assert(!r);
+    mds->standby_replay_restart();
+  }
+};
+
+/**
+ * Called when a journal replay pass completes.  Oneshot replay shuts
+ * down; standby replay schedules the next pass; leaving standby runs
+ * one final pass.  A completed real replay reformats the journal if
+ * its on-disk format predates mds_journal_format, then makes the
+ * journal writeable, applies the mds_wipe_* / mds_skip_ino debug
+ * hooks, and requests RECONNECT (alone) or RESOLVE (with peers).
+ */
+void MDSRank::replay_done()
+{
+  dout(1) << "replay_done" << (standby_replaying ? " (as standby)" : "") << dendl;
+
+  if (is_oneshot_replay()) {
+    dout(2) << "hack.  journal looks ok.  shutting down." << dendl;
+    suicide();
+    return;
+  }
+
+  if (is_standby_replay()) {
+    // The replay was done in standby state, and we are still in that state
+    assert(standby_replaying);
+    dout(10) << "setting replay timer" << dendl;
+    timer.add_event_after(g_conf->mds_replay_interval,
+                          new C_MDS_StandbyReplayRestart(this));
+    return;
+  } else if (standby_replaying) {
+    // The replay was done in standby state, we have now _left_ that state
+    dout(10) << " last replay pass was as a standby; making final pass" << dendl;
+    standby_replaying = false;
+    standby_replay_restart();
+    return;
+  } else {
+    // Replay is complete, journal read should be up to date
+    assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos());
+    assert(!is_standby_replay());
+
+    // Reformat and come back here
+    if (mdlog->get_journaler()->get_stream_format() < g_conf->mds_journal_format) {
+        dout(4) << "reformatting journal on standbyreplay->replay transition" << dendl;
+        mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+        return;
+    }
+  }
+
+  dout(1) << "making mds journal writeable" << dendl;
+  mdlog->get_journaler()->set_writeable();
+  mdlog->get_journaler()->trim_tail();
+
+  if (g_conf->mds_wipe_sessions) {
+    dout(1) << "wiping out client sessions" << dendl;
+    sessionmap.wipe();
+    sessionmap.save(new C_MDSInternalNoop);
+  }
+  if (g_conf->mds_wipe_ino_prealloc) {
+    dout(1) << "wiping out ino prealloc from sessions" << dendl;
+    sessionmap.wipe_ino_prealloc();
+    sessionmap.save(new C_MDSInternalNoop);
+  }
+  if (g_conf->mds_skip_ino) {
+    inodeno_t i = g_conf->mds_skip_ino;
+    dout(1) << "skipping " << i << " inodes" << dendl;
+    inotable->skip_inos(i);
+    inotable->save(new C_MDSInternalNoop);
+  }
+
+  if (mdsmap->get_num_in_mds() == 1 &&
+      mdsmap->get_num_failed_mds() == 0) { // just me!
+    dout(2) << "i am alone, moving to state reconnect" << dendl;      
+    request_state(MDSMap::STATE_RECONNECT);
+  } else {
+    dout(2) << "i am not alone, moving to state resolve" << dendl;
+    request_state(MDSMap::STATE_RESOLVE);
+  }
+}
+
+// Reopen the log for writing: discard any fragment operations that
+// never committed before we took over.
+void MDSRank::reopen_log()
+{
+  dout(1) << "reopen_log" << dendl;
+  mdcache->rollback_uncommitted_fragments();
+}
+
+
+// Enter up:resolve: reopen the log, start the mdcache resolve phase
+// (resolve_done fires on completion), and wake state waiters.
+void MDSRank::resolve_start()
+{
+  dout(1) << "resolve_start" << dendl;
+
+  reopen_log();
+
+  mdcache->resolve_start(new C_VoidFn(this, &MDSRank::resolve_done));
+  finish_contexts(g_ceph_context, waiting_for_resolve);
+}
+// Resolve phase finished; move on to reconnect.
+void MDSRank::resolve_done()
+{
+  dout(1) << "resolve_done" << dendl;
+  request_state(MDSMap::STATE_RECONNECT);
+}
+
+// Enter up:reconnect.  If we came here directly from REPLAY (no
+// resolve phase ran), the log has not been reopened yet — do it now.
+void MDSRank::reconnect_start()
+{
+  dout(1) << "reconnect_start" << dendl;
+
+  if (last_state == MDSMap::STATE_REPLAY) {
+    reopen_log();
+  }
+
+  server->reconnect_clients(new C_VoidFn(this, &MDSRank::reconnect_done));
+  finish_contexts(g_ceph_context, waiting_for_reconnect);
+}
+// Client reconnect phase finished; move to rejoin.
+void MDSRank::reconnect_done()
+{
+  dout(1) << "reconnect_done" << dendl;
+  request_state(MDSMap::STATE_REJOIN);
+}
+
+// Every rank has reached rejoin; send our rejoin messages to peers.
+void MDSRank::rejoin_joint_start()
+{
+  dout(1) << "rejoin_joint_start" << dendl;
+  mdcache->rejoin_send_rejoins();
+}
+// Enter up:rejoin; mdcache drives the phase and rejoin_done() runs on
+// completion.
+void MDSRank::rejoin_start()
+{
+  dout(1) << "rejoin_start" << dendl;
+  mdcache->rejoin_start(new C_VoidFn(this, &MDSRank::rejoin_done));
+}
+/**
+ * Rejoin finished.  A rank with no subtrees either leaves the cluster
+ * (non-root) or is damaged (rank 0 must own a subtree).  Otherwise go
+ * to CLIENTREPLAY if there are queued replay requests, else ACTIVE.
+ */
+void MDSRank::rejoin_done()
+{
+  dout(1) << "rejoin_done" << dendl;
+  mdcache->show_subtrees();
+  mdcache->show_cache();
+
+  // funny case: is our cache empty?  no subtrees?
+  if (!mdcache->is_subtrees()) {
+    if (whoami == 0) {
+      // The root should always have a subtree!
+      clog->error() << "No subtrees found for root MDS rank!";
+      damaged();
+      assert(mdcache->is_subtrees());
+    } else {
+      dout(1) << " empty cache, no subtrees, leaving cluster" << dendl;
+      request_state(MDSMap::STATE_STOPPED);
+    }
+    return;
+  }
+
+  if (replay_queue.empty())
+    request_state(MDSMap::STATE_ACTIVE);
+  else
+    request_state(MDSMap::STATE_CLIENTREPLAY);
+}
+
+// Enter up:clientreplay: wake waiters for this state, then begin
+// replaying queued client requests one at a time.
+void MDSRank::clientreplay_start()
+{
+  dout(1) << "clientreplay_start" << dendl;
+  finish_contexts(g_ceph_context, waiting_for_replay);
+  queue_one_replay();
+}
+
+// All replayed client requests completed; request ACTIVE.
+void MDSRank::clientreplay_done()
+{
+  dout(1) << "clientreplay_done" << dendl;
+  request_state(MDSMap::STATE_ACTIVE);
+}
+
+// Enter up:active: open the root if we arrived from CREATING, tidy
+// open-file lists, export caps imported during recovery, reissue all
+// caps, and wake waiters for the replay and active states.
+void MDSRank::active_start()
+{
+  dout(1) << "active_start" << dendl;
+
+  if (last_state == MDSMap::STATE_CREATING) {
+    mdcache->open_root();
+  }
+
+  mdcache->clean_open_file_lists();
+  mdcache->export_remaining_imported_caps();
+  finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
+
+  mdcache->reissue_all_caps();
+
+  finish_contexts(g_ceph_context, waiting_for_active);  // kick waiters
+}
+
+/**
+ * Post-recovery work once we reach clientreplay or active: kick the
+ * snap table server (if we host it), then — unless we arrived from
+ * CREATING — restart recovered truncates, resume file recovery, and
+ * populate ~mds.
+ *
+ * @param oldstate the state we transitioned from
+ */
+void MDSRank::recovery_done(int oldstate)
+{
+  dout(1) << "recovery_done -- successful recovery!" << dendl;
+  assert(is_clientreplay() || is_active());
+  
+  // kick snaptable (resent AGREEs)
+  if (mdsmap->get_tableserver() == whoami) {
+    set<mds_rank_t> active;
+    mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
+    snapserver->finish_recovery(active);
+  }
+
+  if (oldstate == MDSMap::STATE_CREATING)
+    return;
+
+  mdcache->start_recovered_truncates();
+  mdcache->do_file_recover();
+
+  // tell connected clients
+  //bcast_mds_map();     // not anymore, they get this from the monitor
+
+  mdcache->populate_mydir();
+}
+
+// Filesystem creation finished; request ACTIVE.
+void MDSRank::creating_done()
+{
+  dout(1) << "creating_done" << dendl;
+  request_state(MDSMap::STATE_ACTIVE);
+}
+
+/**
+ * Create a brand-new rank from scratch: fresh journal, the empty
+ * hierarchy (when we are the root rank), the mydir hierarchy, a reset
+ * inotable and sessionmap, and a fresh snaptable when we are the
+ * table server; finally journal the first subtree map.
+ * creating_done() fires when every sub-task completes.
+ */
+void MDSRank::boot_create()
+{
+  dout(3) << "boot_create" << dendl;
+
+  MDSGatherBuilder fin(g_ceph_context, new C_VoidFn(this, &MDSRank::creating_done));
+
+  mdcache->init_layouts();
+
+  snapserver->set_rank(whoami);
+  inotable->set_rank(whoami);
+  sessionmap.set_rank(whoami);
+
+  // start with a fresh journal
+  dout(10) << "boot_create creating fresh journal" << dendl;
+  mdlog->create(fin.new_sub());
+
+  // open new journal segment, but do not journal subtree map (yet)
+  mdlog->prepare_new_segment();
+
+  if (whoami == mdsmap->get_root()) {
+    dout(3) << "boot_create creating fresh hierarchy" << dendl;
+    mdcache->create_empty_hierarchy(fin.get());
+  }
+
+  dout(3) << "boot_create creating mydir hierarchy" << dendl;
+  mdcache->create_mydir_hierarchy(fin.get());
+
+  // fixme: fake out inotable (reset, pretend loaded)
+  dout(10) << "boot_create creating fresh inotable table" << dendl;
+  inotable->reset();
+  inotable->save(fin.new_sub());
+
+  // write empty sessionmap
+  sessionmap.save(fin.new_sub());
+
+  // initialize tables
+  if (mdsmap->get_tableserver() == whoami) {
+    dout(10) << "boot_create creating fresh snaptable" << dendl;
+    snapserver->reset();
+    snapserver->save(fin.new_sub());
+  }
+
+  assert(g_conf->mds_kill_create_at != 1);
+
+  // ok now journal it
+  mdlog->journal_segment_subtree_map(fin.new_sub());
+  mdlog->flush();
+
+  fin.activate();
+}
+
+// Enter up:stopping and ask mdcache to begin its shutdown passes.
+void MDSRank::stopping_start()
+{
+  dout(2) << "stopping_start" << dendl;
+
+  if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) {
+    // we're the only mds up!
+    dout(0) << "we are the last MDS, and have mounted clients: we cannot flush our journal.  suicide!" << dendl;
+    // NOTE(review): suicide() returns here, so shutdown_start() below
+    // still runs afterwards — confirm this is intentional.
+    suicide();
+  }
+
+  mdcache->shutdown_start();
+}
+
+// Shutdown passes are complete; tell the mon we stopped cleanly.
+void MDSRank::stopping_done()
+{
+  dout(2) << "stopping_done" << dendl;
+
+  request_state(MDSMap::STATE_STOPPED);
+}
+
+// <<<<<<<<
+
+void MDSRankDispatcher::handle_mds_map(
+    MMDSMap *m,
+    MDSMap *oldmap)
+{
+  // I am only to be passed MDSMaps in which I hold a rank
+  assert(whoami != MDS_RANK_NONE);
+
+  MDSMap::DaemonState oldstate = state;
+  mds_gid_t mds_gid = mds_gid_t(monc->get_global_id());
+  state = mdsmap->get_state_gid(mds_gid);
+  if (state != oldstate) {
+    last_state = oldstate;
+    incarnation = mdsmap->get_inc_gid(mds_gid);
+  }
+
+  version_t epoch = m->get_epoch();
+
+  // note source's map version
+  if (m->get_source().is_mds() && 
+      peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) {
+    dout(15) << " peer " << m->get_source()
+	     << " has mdsmap epoch >= " << epoch
+	     << dendl;
+    peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch;
+  }
+
+  // Validate state transitions while I hold a rank
+  bool state_valid = true;
+  if (state != oldstate) {
+    if (oldstate == MDSMap::STATE_REPLAY) {
+      if (state != MDSMap::STATE_RESOLVE && state != MDSMap::STATE_RECONNECT) {
+        state_valid = false;
+      }
+    } else if (oldstate == MDSMap::STATE_REJOIN) {
+      if (state != MDSMap::STATE_ACTIVE
+          && state != MDSMap::STATE_CLIENTREPLAY
+          && state != MDSMap::STATE_STOPPED) {
+        state_valid = false;
+      }
+    } else if (oldstate >= MDSMap::STATE_RECONNECT && oldstate < MDSMap::STATE_ACTIVE) {
+      // Once I have entered replay, the only allowable transitions are to
+      // the next state along in the sequence.
+      if (state != oldstate + 1) {
+        state_valid = false;
+      }
+    }
+  }
+
+  if (!state_valid) {
+    derr << "Invalid state transition " << ceph_mds_state_name(oldstate)
+      << "->" << ceph_mds_state_name(state) << dendl;
+    respawn();
+  }
+
+  if (oldstate != state) {
+    // update messenger.
+    if (state == MDSMap::STATE_STANDBY_REPLAY || state == MDSMap::STATE_ONESHOT_REPLAY) {
+      dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation
+	      << " replaying mds." << whoami << "." << incarnation << dendl;
+      messenger->set_myname(entity_name_t::MDS(mds_gid));
+    } else {
+      dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl;
+      messenger->set_myname(entity_name_t::MDS(whoami));
+    }
+  }
+
+  // tell objecter my incarnation
+  if (objecter->get_client_incarnation() != incarnation)
+    objecter->set_client_incarnation(incarnation);
+
+  // for debug
+  if (g_conf->mds_dump_cache_on_map)
+    mdcache->dump_cache();
+
+  // did it change?
+  if (oldstate != state) {
+    dout(1) << "handle_mds_map state change "
+	    << ceph_mds_state_name(oldstate) << " --> "
+	    << ceph_mds_state_name(state) << dendl;
+    beacon.set_want_state(mdsmap, state);
+
+    if (oldstate == MDSMap::STATE_STANDBY_REPLAY) {
+        dout(10) << "Monitor activated us! Deactivating replay loop" << dendl;
+        assert (state == MDSMap::STATE_REPLAY);
+    } else {
+      // did i just recover?
+      if ((is_active() || is_clientreplay()) &&
+          (oldstate == MDSMap::STATE_CREATING ||
+	   oldstate == MDSMap::STATE_REJOIN ||
+	   oldstate == MDSMap::STATE_RECONNECT))
+        recovery_done(oldstate);
+
+      if (is_active()) {
+        active_start();
+      } else if (is_any_replay()) {
+        replay_start();
+      } else if (is_resolve()) {
+        resolve_start();
+      } else if (is_reconnect()) {
+        reconnect_start();
+      } else if (is_rejoin()) {
+	rejoin_start();
+      } else if (is_clientreplay()) {
+        clientreplay_start();
+      } else if (is_creating()) {
+        boot_create();
+      } else if (is_starting()) {
+        boot_start();
+      } else if (is_stopping()) {
+        assert(oldstate == MDSMap::STATE_ACTIVE);
+        stopping_start();
+      }
+    }
+  }
+  
+  // RESOLVE
+  // is someone else newly resolving?
+  if (is_resolve() || is_reconnect() || is_rejoin() ||
+      is_clientreplay() || is_active() || is_stopping()) {
+    if (!oldmap->is_resolving() && mdsmap->is_resolving()) {
+      set<mds_rank_t> resolve;
+      mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
+      dout(10) << " resolve set is " << resolve << dendl;
+      calc_recovery_set();
+      mdcache->send_resolves();
+    }
+  }
+  
+  // REJOIN
+  // is everybody finally rejoining?
+  if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
+    // did we start?
+    if (!oldmap->is_rejoining() && mdsmap->is_rejoining())
+      rejoin_joint_start();
+
+    // did we finish?
+    if (g_conf->mds_dump_cache_after_rejoin &&
+	oldmap->is_rejoining() && !mdsmap->is_rejoining()) 
+      mdcache->dump_cache();      // for DEBUG only
+
+    if (oldstate >= MDSMap::STATE_REJOIN) {
+      // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
+      set<mds_rank_t> olddis, dis;
+      oldmap->get_mds_set(olddis, MDSMap::STATE_ACTIVE);
+      oldmap->get_mds_set(olddis, MDSMap::STATE_CLIENTREPLAY);
+      oldmap->get_mds_set(olddis, MDSMap::STATE_REJOIN);
+      mdsmap->get_mds_set(dis, MDSMap::STATE_ACTIVE);
+      mdsmap->get_mds_set(dis, MDSMap::STATE_CLIENTREPLAY);
+      mdsmap->get_mds_set(dis, MDSMap::STATE_REJOIN);
+      for (set<mds_rank_t>::iterator p = dis.begin(); p != dis.end(); ++p)
+	if (*p != whoami &&            // not me
+	    olddis.count(*p) == 0) {  // newly so?
+	  mdcache->kick_discovers(*p);
+	  mdcache->kick_open_ino_peers(*p);
+	}
+    }
+  }
+
+  if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE)
+    dout(1) << "cluster recovered." << dendl;
+
+  // did someone go active?
+  if (oldstate >= MDSMap::STATE_CLIENTREPLAY &&
+      (is_clientreplay() || is_active() || is_stopping())) {
+    set<mds_rank_t> oldactive, active;
+    oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
+    oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
+    mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
+    mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
+    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p) 
+      if (*p != whoami &&            // not me
+	  oldactive.count(*p) == 0)  // newly so?
+	handle_mds_recovery(*p);
+  }
+
+  // did someone fail?
+  //   new down?
+  {
+    set<mds_rank_t> olddown, down;
+    oldmap->get_down_mds_set(&olddown);
+    mdsmap->get_down_mds_set(&down);
+    for (set<mds_rank_t>::iterator p = down.begin(); p != down.end(); ++p) {
+      if (olddown.count(*p) == 0) {
+        messenger->mark_down(oldmap->get_inst(*p).addr);
+        handle_mds_failure(*p);
+      }
+    }
+  }
+
+  // did someone fail?
+  //   did their addr/inst change?
+  {
+    set<mds_rank_t> up;
+    mdsmap->get_up_mds_set(up);
+    for (set<mds_rank_t>::iterator p = up.begin(); p != up.end(); ++p) {
+      if (oldmap->have_inst(*p) &&
+         oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
+        messenger->mark_down(oldmap->get_inst(*p).addr);
+        handle_mds_failure(*p);
+      }
+    }
+  }
+
+  if (is_clientreplay() || is_active() || is_stopping()) {
+    // did anyone stop?
+    set<mds_rank_t> oldstopped, stopped;
+    oldmap->get_stopped_mds_set(oldstopped);
+    mdsmap->get_stopped_mds_set(stopped);
+    for (set<mds_rank_t>::iterator p = stopped.begin(); p != stopped.end(); ++p) 
+      if (oldstopped.count(*p) == 0)      // newly so?
+	mdcache->migrator->handle_mds_failure_or_stop(*p);
+  }
+
+  if (!is_any_replay())
+    balancer->try_rebalance();
+
+  {
+    map<epoch_t,list<MDSInternalContextBase*> >::iterator p = waiting_for_mdsmap.begin();
+    while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
+      list<MDSInternalContextBase*> ls;
+      ls.swap(p->second);
+      waiting_for_mdsmap.erase(p++);
+      finish_contexts(g_ceph_context, ls);
+    }
+  }
+
+  if (is_active()) {
+    // Before going active, set OSD epoch barrier to latest (so that
+    // we don't risk handing out caps to clients with old OSD maps that
+    // might not include barriers from the previous incarnation of this MDS)
+    const OSDMap *osdmap = objecter->get_osdmap_read();
+    const epoch_t osd_epoch = osdmap->get_epoch();
+    objecter->put_osdmap_read();
+    set_osd_epoch_barrier(osd_epoch);
+  }
+
+  if (is_active()) {
+    bool found = false;
+    MDSMap::mds_info_t info = mdsmap->get_info(whoami);
+
+    for (map<mds_gid_t,MDSMap::mds_info_t>::const_iterator p = mdsmap->get_mds_info().begin();
+       p != mdsmap->get_mds_info().end();
+       ++p) {
+      if (p->second.state == MDSMap::STATE_STANDBY_REPLAY &&
+	  (p->second.standby_for_rank == whoami ||(info.name.length() && p->second.standby_for_name == info.name))) {
+	found = true;
+	break;
+      }
+      if (found)
+	mdlog->set_write_iohint(0);
+      else
+	mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+    }
+  }
+
+  mdcache->notify_mdsmap_changed();
+}
+
+/**
+ * React to a peer MDS rank becoming active: notify the cache, the snap
+ * table server (only if this rank hosts the table server), and wake any
+ * contexts that were queued waiting for that peer to become active.
+ */
+void MDSRank::handle_mds_recovery(mds_rank_t who) 
+{
+  dout(5) << "handle_mds_recovery mds." << who << dendl;
+  
+  mdcache->handle_mds_recovery(who);
+
+  // Only the rank acting as table server drives snapserver recovery.
+  if (mdsmap->get_tableserver() == whoami) {
+    snapserver->handle_mds_recovery(who);
+  }
+
+  // Queue the waiters before erasing our record of them.
+  queue_waiters(waiting_for_active_peer[who]);
+  waiting_for_active_peer.erase(who);
+}
+
+/**
+ * React to a peer MDS rank being marked failed: notify the cache and the
+ * snap table client so state tied to that rank can be cleaned up.
+ * A failure notification naming our own rank is ignored.
+ */
+void MDSRank::handle_mds_failure(mds_rank_t who)
+{
+  if (who == whoami) {
+    dout(5) << "handle_mds_failure for myself; not doing anything" << dendl;
+    return;
+  }
+  dout(5) << "handle_mds_failure mds." << who << dendl;
+
+  mdcache->handle_mds_failure(who);
+
+  snapclient->handle_mds_failure(who);
+}
+
+/**
+ * Dispatch an admin-socket command.
+ *
+ * Locking varies per command: some branches take mds_lock themselves and
+ * some deliberately drop it while waiting on conditions, so the caller is
+ * expected to invoke this without holding mds_lock.
+ *
+ * @param command  the asok command name
+ * @param cmdmap   parsed command arguments
+ * @param f        output formatter (some error branches delete it and return)
+ * @param ss       human-readable error/status output
+ * @return true if the command was recognized here, false to let another
+ *         handler try it.
+ */
+bool MDSRankDispatcher::handle_asok_command(
+    std::string command, cmdmap_t& cmdmap, Formatter *f,
+		    std::ostream& ss)
+{
+  if (command == "dump_ops_in_flight" ||
+             command == "ops") {
+    if (!op_tracker.tracking_enabled) {
+      ss << "op_tracker tracking is not enabled";
+    } else {
+      op_tracker.dump_ops_in_flight(f);
+    }
+  } else if (command == "dump_historic_ops") {
+    if (!op_tracker.tracking_enabled) {
+      ss << "op_tracker tracking is not enabled";
+    } else {
+      op_tracker.dump_historic_ops(f);
+    }
+  } else if (command == "osdmap barrier") {
+    int64_t target_epoch = 0;
+    bool got_val = cmd_getval(g_ceph_context, cmdmap, "target_epoch", target_epoch);
+    
+    if (!got_val) {
+      ss << "no target epoch given";
+      delete f;
+      return true;
+    }
+    
+    // Set the barrier under mds_lock, then wait for the OSD map without
+    // holding it so dispatch can make progress.
+    mds_lock.Lock();
+    set_osd_epoch_barrier(target_epoch);
+    mds_lock.Unlock();
+    
+    C_SaferCond cond;
+    bool already_got = objecter->wait_for_map(target_epoch, &cond);
+    if (!already_got) {
+      dout(4) << __func__ << ": waiting for OSD epoch " << target_epoch << dendl;
+      cond.wait();
+    }
+  } else if (command == "session ls") {
+    mds_lock.Lock();
+    
+    // Dumping many sessions can take a while; keep the heartbeat happy.
+    heartbeat_reset();
+    
+    // Dump sessions, decorated with recovery/replay status
+    f->open_array_section("sessions");
+    const ceph::unordered_map<entity_name_t, Session*> session_map = sessionmap.get_sessions();
+    for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
+         p != session_map.end();
+         ++p)  {
+      if (!p->first.is_client()) {
+        continue;
+      }
+      
+      Session *s = p->second;
+      
+      f->open_object_section("session");
+      f->dump_int("id", p->first.num());
+      
+      f->dump_int("num_leases", s->leases.size());
+      f->dump_int("num_caps", s->caps.size());
+      
+      f->dump_string("state", s->get_state_name());
+      f->dump_int("replay_requests", is_clientreplay() ? s->get_request_count() : 0);
+      f->dump_unsigned("completed_requests", s->get_num_completed_requests());
+      f->dump_bool("reconnecting", server->waiting_for_reconnect(p->first.num()));
+      f->dump_stream("inst") << s->info.inst;
+      f->open_object_section("client_metadata");
+      for (map<string, string>::const_iterator i = s->info.client_metadata.begin();
+           i != s->info.client_metadata.end(); ++i) {
+        f->dump_string(i->first.c_str(), i->second);
+      }
+      f->close_section(); // client_metadata
+      f->close_section(); //session
+    }
+    f->close_section(); //sessions
+    
+    mds_lock.Unlock();
+  } else if (command == "session evict") {
+    std::string client_id;
+    const bool got_arg = cmd_getval(g_ceph_context, cmdmap, "client_id", client_id);
+    assert(got_arg == true);
+    
+    mds_lock.Lock();
+    Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
+                                                            strtol(client_id.c_str(), 0, 10)));
+    if (session) {
+      C_SaferCond on_safe;
+      server->kill_session(session, &on_safe);
+      
+      // Drop mds_lock so the kill can complete while we wait.
+      mds_lock.Unlock();
+      on_safe.wait();
+    } else {
+      // NOTE(review): 'session' is NULL on this path, so this logs a null
+      // pointer; logging the requested client_id would be more informative.
+      dout(15) << "session " << session << " not in sessionmap!" << dendl;
+      mds_lock.Unlock();
+    }
+  } else if (command == "scrub_path") {
+    string path;
+    cmd_getval(g_ceph_context, cmdmap, "path", path);
+    command_scrub_path(f, path);
+  } else if (command == "flush_path") {
+    string path;
+    cmd_getval(g_ceph_context, cmdmap, "path", path);
+    command_flush_path(f, path);
+  } else if (command == "flush journal") {
+    command_flush_journal(f);
+  } else if (command == "get subtrees") {
+    command_get_subtrees(f);
+  } else if (command == "export dir") {
+    string path;
+    if(!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
+      ss << "malformed path";
+      delete f;
+      return true;
+    }
+    int64_t rank;
+    if(!cmd_getval(g_ceph_context, cmdmap, "rank", rank)) {
+      ss << "malformed rank";
+      delete f;
+      return true;
+    }
+    command_export_dir(f, path, (mds_rank_t)rank);
+  } else if (command == "dump cache") {
+    string path;
+    // With no path argument, dump to the formatter; otherwise to a file.
+    if(!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
+      mdcache->dump_cache(f);
+    } else {
+      mdcache->dump_cache(path);
+    }
+  } else if (command == "force_readonly") {
+    mds_lock.Lock();
+    mdcache->force_readonly();
+    mds_lock.Unlock();
+  } else if (command == "dirfrag split") {
+    Mutex::Locker l(mds_lock);
+    command_dirfrag_split(cmdmap, ss);
+  } else if (command == "dirfrag merge") {
+    Mutex::Locker l(mds_lock);
+    command_dirfrag_merge(cmdmap, ss);
+  } else if (command == "dirfrag ls") {
+    Mutex::Locker l(mds_lock);
+    command_dirfrag_ls(cmdmap, ss, f);
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+
+
+/**
+ * Asok "scrub_path": scrub the dentry at @path.  The scrub completion
+ * machinery writes its results into @f, so nothing is dumped here.
+ */
+void MDSRank::command_scrub_path(Formatter *f, const string& path)
+{
+  C_SaferCond scond;
+  {
+    // Hold mds_lock only while submitting the scrub.
+    Mutex::Locker l(mds_lock);
+    mdcache->scrub_dentry(path, f, &scond);
+  }
+  scond.wait();
+  // scrub_dentry() finishers will dump the data for us; we're done!
+}
+
+/**
+ * Asok "flush_path": flush the dentry at @path to the journal and report
+ * the resulting return code through @f.
+ */
+void MDSRank::command_flush_path(Formatter *f, const string& path)
+{
+  C_SaferCond flush_cond;
+  {
+    // Take mds_lock only for the duration of the flush submission.
+    Mutex::Locker locker(mds_lock);
+    mdcache->flush_dentry(path, &flush_cond);
+  }
+  const int rc = flush_cond.wait();
+
+  f->open_object_section("results");
+  f->dump_int("return_code", rc);
+  f->close_section(); // results
+}
+
+/**
+ * Wrapper around _command_flush_journal that handles serialization of
+ * the result: the return code plus any human-readable message are
+ * written into @f as a "result" object.
+ */
+void MDSRank::command_flush_journal(Formatter *f)
+{
+  assert(f != NULL);
+
+  std::stringstream ss;
+  const int r = _command_flush_journal(&ss);
+  f->open_object_section("result");
+  f->dump_string("message", ss.str());
+  f->dump_int("return_code", r);
+  f->close_section();
+}
+
+/**
+ * Implementation of "flush journal" asok command.
+ *
+ * Called with mds_lock NOT held; this function takes it and temporarily
+ * releases it around each blocking wait so messenger dispatch can proceed.
+ *
+ * @param ss
+ * Optionally populate with a human readable string describing the
+ * reason for any unexpected return status.
+ */
+int MDSRank::_command_flush_journal(std::stringstream *ss)
+{
+  assert(ss != NULL);
+
+  Mutex::Locker l(mds_lock);
+
+  if (mdcache->is_readonly()) {
+    dout(5) << __func__ << ": read-only FS" << dendl;
+    return -EROFS;
+  }
+
+  if (!is_active()) {
+    dout(5) << __func__ << ": MDS not active, no-op" << dendl;
+    return 0;
+  }
+
+  // I need to seal off the current segment, and then mark all previous segments
+  // for expiry
+  mdlog->start_new_segment();
+  int r = 0;
+
+  // Flush initially so that all the segments older than our new one
+  // will be elegible for expiry
+  {
+    C_SaferCond mdlog_flushed;
+    mdlog->flush();
+    mdlog->wait_for_safe(new MDSInternalContextWrapper(this, &mdlog_flushed));
+    // Drop mds_lock while blocked on the flush completing.
+    mds_lock.Unlock();
+    r = mdlog_flushed.wait();
+    mds_lock.Lock();
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+      return r;
+    }
+  }
+
+  // Because we may not be the last wait_for_safe context on MDLog, and
+  // subsequent contexts might wake up in the middle of our later trim_all
+  // and interfere with expiry (by e.g. marking dirs/dentries dirty
+  // on previous log segments), we run a second wait_for_safe here.
+  // See #10368
+  {
+    C_SaferCond mdlog_cleared;
+    mdlog->wait_for_safe(new MDSInternalContextWrapper(this, &mdlog_cleared));
+    mds_lock.Unlock();
+    r = mdlog_cleared.wait();
+    mds_lock.Lock();
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+      return r;
+    }
+  }
+
+  // Put all the old log segments into expiring or expired state
+  dout(5) << __func__ << ": beginning segment expiry" << dendl;
+  r = mdlog->trim_all();
+  if (r != 0) {
+    *ss << "Error " << r << " (" << cpp_strerror(r) << ") while trimming log";
+    return r;
+  }
+
+  // Attach contexts to wait for all expiring segments to expire
+  MDSGatherBuilder expiry_gather(g_ceph_context);
+
+  const std::set<LogSegment*> &expiring_segments = mdlog->get_expiring_segments();
+  for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
+       i != expiring_segments.end(); ++i) {
+    (*i)->wait_for_expiry(expiry_gather.new_sub());
+  }
+  dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created()
+          << " segments to expire" << dendl;
+
+  if (expiry_gather.has_subs()) {
+    C_SaferCond cond;
+    expiry_gather.set_finisher(new MDSInternalContextWrapper(this, &cond));
+    expiry_gather.activate();
+
+    // Drop mds_lock to allow progress until expiry is complete
+    mds_lock.Unlock();
+    // NB: this r shadows the outer one; expiry must not fail (see assert).
+    int r = cond.wait();
+    mds_lock.Lock();
+
+    assert(r == 0);  // MDLog is not allowed to raise errors via wait_for_expiry
+  }
+
+  dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now " << std::hex <<
+    mdlog->get_journaler()->get_expire_pos() << "/" <<
+    mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+  // Now everyone I'm interested in is expired
+  mdlog->trim_expired_segments();
+
+  dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now " << std::hex <<
+    mdlog->get_journaler()->get_expire_pos() << "/" <<
+    mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+  // Flush the journal header so that readers will start from after the flushed region
+  C_SaferCond wrote_head;
+  mdlog->get_journaler()->write_head(&wrote_head);
+  mds_lock.Unlock();  // Drop lock to allow messenger dispatch progress
+  r = wrote_head.wait();
+  mds_lock.Lock();
+  if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
+      return r;
+  }
+
+  dout(5) << __func__ << ": write_head complete, all done!" << dendl;
+
+  return 0;
+}
+
+
+/**
+ * Asok "get subtrees": dump every subtree root known to the cache,
+ * including its auth state and full dirfrag details.
+ */
+void MDSRank::command_get_subtrees(Formatter *f)
+{
+  assert(f != NULL);
+
+  std::list<CDir*> subtree_roots;
+  mdcache->list_subtrees(subtree_roots);
+
+  f->open_array_section("subtrees");
+  for (std::list<CDir*>::const_iterator it = subtree_roots.begin();
+       it != subtree_roots.end(); ++it) {
+    const CDir *subtree = *it;
+
+    f->open_object_section("subtree");
+    f->dump_bool("is_auth", subtree->is_auth());
+    f->dump_int("auth_first", subtree->get_dir_auth().first);
+    f->dump_int("auth_second", subtree->get_dir_auth().second);
+    f->open_object_section("dir");
+    subtree->dump(f);
+    f->close_section(); // dir
+    f->close_section(); // subtree
+  }
+  f->close_section(); // subtrees
+}
+
+
+/**
+ * Asok "export dir": thin wrapper around _command_export_dir that
+ * serializes the return code into @f.
+ */
+void MDSRank::command_export_dir(Formatter *f,
+    const std::string &path,
+    mds_rank_t target)
+{
+  const int rc = _command_export_dir(path, target);
+  f->open_object_section("results");
+  f->dump_int("return_code", rc);
+  f->close_section(); // results
+}
+
+/**
+ * Implementation of "export dir": migrate the root dirfrag of @path to
+ * another active MDS rank.
+ *
+ * @param path   filesystem path of the directory to export
+ * @param target rank that should receive the subtree
+ * @return 0 on success, -ENOENT for a bad target or a path not in cache,
+ *         -EINVAL if the root dirfrag is missing or not auth here.
+ */
+int MDSRank::_command_export_dir(
+    const std::string &path,
+    mds_rank_t target)
+{
+  filepath fp(path.c_str());
+
+  // Refuse to export to ourselves or to a rank that is not up and in.
+  if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
+    derr << "bad MDS target " << target << dendl;
+    return -ENOENT;
+  }
+
+  CInode *in = mdcache->cache_traverse(fp);
+  if (!in) {
+    // (fixed typo: was "Bath path")
+    derr << "Bad path '" << path << "'" << dendl;
+    return -ENOENT;
+  }
+  // Only the whole (root) dirfrag can be exported, and only if we are auth.
+  CDir *dir = in->get_dirfrag(frag_t());
+  if (!dir || !(dir->is_auth())) {
+    derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
+    return -EINVAL;
+  }
+
+  mdcache->migrator->export_dir(dir, target);
+  return 0;
+}
+
+/**
+ * Resolve the "path" and "frag" arguments in @cmdmap to a dirfrag that is
+ * present in cache and auth on this rank.
+ *
+ * @return the CDir, or NULL with a human-readable reason written to @ss.
+ */
+CDir *MDSRank::_command_dirfrag_get(
+    const cmdmap_t &cmdmap,
+    std::ostream &ss)
+{
+  std::string path;
+  bool got = cmd_getval(g_ceph_context, cmdmap, "path", path);
+  if (!got) {
+    ss << "missing path argument";
+    return NULL;
+  }
+
+  std::string frag_str;
+  if (!cmd_getval(g_ceph_context, cmdmap, "frag", frag_str)) {
+    ss << "missing frag argument";
+    return NULL;
+  }
+
+  CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
+  if (!in) {
+    // TODO really we should load something in if it's not in cache,
+    // but the infrastructure is harder, and we might still be unable
+    // to act on it if someone else is auth.
+    ss << "directory '" << path << "' inode not in cache";
+    return NULL;
+  }
+
+  frag_t fg;
+
+  if (!fg.parse(frag_str.c_str())) {
+    ss << "frag " << frag_str << " failed to parse";
+    return NULL;
+  }
+
+  CDir *dir = in->get_dirfrag(fg);
+  if (!dir) {
+    ss << "frag 0x" << std::hex << in->ino() << "/" << fg << " not in cache ("
+          "use `dirfrag ls` to see if it should exist)";
+    return NULL;
+  }
+
+  if (!dir->is_auth()) {
+    ss << "frag " << dir->dirfrag() << " not auth (auth = "
+       << dir->authority() << ")";
+    return NULL;
+  }
+
+  return dir;
+}
+
+/**
+ * Asok "dirfrag split": split the dirfrag named by the "path"/"frag"
+ * arguments by "bits" bits.
+ *
+ * @return false (with a reason in @ss) on bad arguments or if the frag
+ *         cannot be resolved; true once the split has been requested.
+ */
+bool MDSRank::command_dirfrag_split(
+    cmdmap_t cmdmap,
+    std::ostream &ss)
+{
+  int64_t by = 0;
+  if (!cmd_getval(g_ceph_context, cmdmap, "bits", by)) {
+    ss << "missing bits argument";
+    return false;
+  }
+
+  if (by <= 0) {
+    ss << "must split by >0 bits";
+    return false;
+  }
+
+  CDir *dir = _command_dirfrag_get(cmdmap, ss);
+  if (!dir) {
+    return false;
+  }
+
+  mdcache->split_dir(dir, by);
+
+  return true;
+}
+
+/**
+ * Asok "dirfrag merge": merge the children of the frag named by the
+ * "path"/"frag" arguments back into it.
+ *
+ * @return false (with a reason in @ss) on bad arguments or an uncached
+ *         path; true once the merge has been requested.
+ */
+bool MDSRank::command_dirfrag_merge(
+    cmdmap_t cmdmap,
+    std::ostream &ss)
+{
+  std::string path;
+  bool got = cmd_getval(g_ceph_context, cmdmap, "path", path);
+  if (!got) {
+    ss << "missing path argument";
+    return false;
+  }
+
+  std::string frag_str;
+  if (!cmd_getval(g_ceph_context, cmdmap, "frag", frag_str)) {
+    ss << "missing frag argument";
+    return false;
+  }
+
+  CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
+  if (!in) {
+    ss << "directory '" << path << "' inode not in cache";
+    return false;
+  }
+
+  frag_t fg;
+  if (!fg.parse(frag_str.c_str())) {
+    ss << "frag " << frag_str << " failed to parse";
+    return false;
+  }
+
+  mdcache->merge_dir(in, fg);
+
+  return true;
+}
+
+/**
+ * Asok "dirfrag ls": list the dirfrags that the inode's fragtree says
+ * should exist for "path" (whether or not they are currently in cache).
+ *
+ * @return false (with a reason in @ss) on bad arguments or uncached path.
+ */
+bool MDSRank::command_dirfrag_ls(
+    cmdmap_t cmdmap,
+    std::ostream &ss,
+    Formatter *f)
+{
+  std::string path;
+  bool got = cmd_getval(g_ceph_context, cmdmap, "path", path);
+  if (!got) {
+    ss << "missing path argument";
+    return false;
+  }
+
+  CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
+  if (!in) {
+    ss << "directory inode not in cache";
+    return false;
+  }
+
+  f->open_array_section("frags");
+  std::list<frag_t> frags;
+  // NB using get_leaves_under instead of get_dirfrags to give
+  // you the list of what dirfrags may exist, not which are in cache
+  in->dirfragtree.get_leaves_under(frag_t(), frags);
+  for (std::list<frag_t>::iterator i = frags.begin();
+       i != frags.end(); ++i) {
+    f->open_object_section("frag");
+    f->dump_int("value", i->value());
+    f->dump_int("bits", i->bits());
+    std::ostringstream frag_str;
+    frag_str << std::hex << i->value() << "/" << std::dec << i->bits();
+    f->dump_string("str", frag_str.str());
+    f->close_section();
+  }
+  f->close_section();
+
+  return true;
+}
+
+/**
+ * Re-read the log client options from the global config and, if they
+ * parse successfully, push the new settings into our cluster log channel.
+ */
+void MDSRankDispatcher::update_log_config()
+{
+  map<string,string> log_to_monitors;
+  map<string,string> log_to_syslog;
+  map<string,string> log_channel;
+  map<string,string> log_prio;
+  // Only apply the settings when parsing succeeded (returns 0).
+  if (parse_log_client_options(g_ceph_context, log_to_monitors, log_to_syslog,
+			       log_channel, log_prio) == 0)
+    clog->update_config(log_to_monitors, log_to_syslog,
+			log_channel, log_prio);
+  dout(10) << __func__ << " log_to_monitors " << log_to_monitors << dendl;
+}
+
+/**
+ * Build and register the MDS perf counters: the "mds" logger (request and
+ * cache activity) and the "mds_mem" logger (cache object / memory
+ * accounting), then let subsystems register their own counters.
+ */
+void MDSRank::create_logger()
+{
+  dout(10) << "create_logger" << dendl;
+  {
+    // Primary "mds" counters (indices l_mds_first..l_mds_last).
+    PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
+
+    mds_plb.add_u64_counter(l_mds_request, "request", "Requests");
+    mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies");
+    mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency",
+        "Reply latency", "rlat");
+    mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request");
+    
+    mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch", "Directory fetch");
+    mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit");
+    mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split");
+
+    mds_plb.add_u64(l_mds_inode_max, "inode_max", "Max inodes, cache size");
+    mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos");
+    mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top");
+    mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom");
+    mds_plb.add_u64(l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");  
+    mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned");
+    mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired");
+    mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps", "Inodes with capabilities");
+    mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps");
+    mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees");
+    
+    mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses"); 
+    mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits");
+    mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward", "Traverse forwards");
+    mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover", "Traverse directory discovers");
+    mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch", "Traverse incomplete directory content fetchings");
+    mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino", "Traverse remote dentries");
+    mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock", "Traverse locks");
+    
+    mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent");
+    mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length");
+    
+    mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports");
+    mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes");
+    mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports");
+    mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes");
+    logger = mds_plb.create_perf_counters();
+    g_ceph_context->get_perfcounters_collection()->add(logger);
+  }
+
+  {
+    // Memory / cache object counters (indices l_mdm_first..l_mdm_last).
+    PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last);
+    mdm_plb.add_u64(l_mdm_ino, "ino", "Inodes");
+    mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened");
+    mdm_plb.add_u64_counter(l_mdm_inos, "ino-", "Inodes closed");
+    mdm_plb.add_u64(l_mdm_dir, "dir", "Directories");
+    mdm_plb.add_u64_counter(l_mdm_dira, "dir+", "Directories opened");
+    mdm_plb.add_u64_counter(l_mdm_dirs, "dir-", "Directories closed");
+    mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries");
+    mdm_plb.add_u64_counter(l_mdm_dna, "dn+", "Dentries opened");
+    mdm_plb.add_u64_counter(l_mdm_dns, "dn-", "Dentries closed");
+    mdm_plb.add_u64(l_mdm_cap, "cap", "Capabilities");
+    mdm_plb.add_u64_counter(l_mdm_capa, "cap+", "Capabilities added");
+    mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed");
+    mdm_plb.add_u64(l_mdm_rss, "rss", "RSS");
+    mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size");
+    mdm_plb.add_u64(l_mdm_malloc, "malloc", "Malloc size");
+    mdm_plb.add_u64(l_mdm_buf, "buf", "Buffer size");
+    mlogger = mdm_plb.create_perf_counters();
+    g_ceph_context->get_perfcounters_collection()->add(mlogger);
+  }
+
+  // Subsystems register their own counters.
+  mdlog->create_logger();
+  server->create_logger();
+  mdcache->register_perfcounters();
+}
+
+/**
+ * Periodic slow-op check: any warnings produced by the op tracker are
+ * forwarded to the cluster log.
+ */
+void MDSRank::check_ops_in_flight()
+{
+  vector<string> slow_op_warnings;
+  if (!op_tracker.check_ops_in_flight(slow_op_warnings))
+    return;
+
+  for (vector<string>::const_iterator w = slow_op_warnings.begin();
+       w != slow_op_warnings.end();
+       ++w) {
+    clog->warn() << *w;
+  }
+}
+
+/**
+ * React to a new OSDMap: let the snap server re-check OSDs (when active),
+ * notify the server and cache, and keep requesting map updates so we
+ * always see the latest map.
+ */
+void MDSRankDispatcher::handle_osd_map()
+{
+  if (is_active() && snapserver) {
+    snapserver->check_osd_map(true);
+  }
+
+  server->handle_osd_map();
+
+  mdcache->notify_osdmap_changed();
+
+  // By default the objecter only requests OSDMap updates on use,
+  // we would like to always receive the latest maps in order to
+  // apply policy based on the FULL flag.
+  objecter->maybe_request_map();
+}
+
+/**
+ * Kill the client session with the given id, if one exists.
+ *
+ * @return true if a matching session was found and its teardown started.
+ */
+bool MDSRankDispatcher::kill_session(int64_t session_id)
+{
+  const entity_name_t client_name(CEPH_ENTITY_TYPE_CLIENT, session_id);
+  Session *session = sessionmap.get_session(client_name);
+  if (session == NULL) {
+    return false;
+  }
+
+  server->kill_session(session, NULL);
+  return true;
+}
+
+/**
+ * Send the current MDSMap to every mounted client session and record the
+ * epoch of the last client broadcast.
+ */
+void MDSRank::bcast_mds_map()
+{
+  dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl;
+
+  // share the map with mounted clients
+  set<Session*> clients;
+  sessionmap.get_client_session_set(clients);
+  for (set<Session*>::const_iterator p = clients.begin();
+       p != clients.end();
+       ++p) 
+    (*p)->connection->send_message(new MMDSMap(monc->get_fsid(), mdsmap));
+  last_client_mdsmap_bcast = mdsmap->get_epoch();
+}
+
+
+/**
+ * Handle legacy "ceph mds tell ..." style commands.
+ *
+ * Fix: the original indexed args[1]/args[2] without checking args.size(),
+ * which is undefined behaviour for short argument vectors (e.g. a bare
+ * "session" or "issue_caps" command).  Bounds are now checked before any
+ * access; behaviour for well-formed input is unchanged.
+ *
+ * @param args tokenized command; args[0] selects the operation.
+ * @return true if the command was recognized here (even if it then logged
+ *         an error), false if it should be handled elsewhere.
+ */
+bool MDSRankDispatcher::handle_command_legacy(std::vector<std::string> args)
+{
+  // Guard against an empty vector before reading args[0].
+  if (args.empty())
+    return false;
+
+  if (args[0] == "dumpcache") {
+    if (args.size() > 1)
+      mdcache->dump_cache(args[1].c_str());
+    else
+      mdcache->dump_cache();
+  }
+  else if (args.size() >= 3 && args[0] == "session" && args[1] == "kill") {
+    Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
+							    strtol(args[2].c_str(), 0, 10)));
+    if (session)
+      server->kill_session(session, NULL);
+    else
+      dout(15) << "session " << session << " not in sessionmap!" << dendl;
+  } else if (args[0] == "issue_caps") {
+    if (args.size() < 2) {
+      dout(0) << "bad syntax" << dendl;
+      return true;
+    }
+    long inum = strtol(args[1].c_str(), 0, 10);
+    CInode *in = mdcache->get_inode(inodeno_t(inum));
+    if (in) {
+      bool r = locker->issue_caps(in);
+      dout(20) << "called issue_caps on inode "  << inum
+	       << " with result " << r << dendl;
+    } else dout(15) << "inode " << inum << " not in mdcache!" << dendl;
+  } else if (args[0] == "try_eval") {
+    if (args.size() < 3) {
+      dout(0) << "bad syntax" << dendl;
+      return true;
+    }
+    long inum = strtol(args[1].c_str(), 0, 10);
+    int mask = strtol(args[2].c_str(), 0, 10);
+    CInode * ino = mdcache->get_inode(inodeno_t(inum));
+    if (ino) {
+      locker->try_eval(ino, mask);
+      dout(20) << "try_eval(" << inum << ", " << mask << ")" << dendl;
+    } else dout(15) << "inode " << inum << " not in mdcache!" << dendl;
+  } else if (args[0] == "fragment_dir") {
+    // args: fragment_dir <path> <frag> <bits>
+    if (args.size() == 4) {
+      filepath fp(args[1].c_str());
+      CInode *in = mdcache->cache_traverse(fp);
+      if (in) {
+	frag_t fg;
+	if (fg.parse(args[2].c_str())) {
+	  CDir *dir = in->get_dirfrag(fg);
+	  if (dir) {
+	    if (dir->is_auth()) {
+	      int by = atoi(args[3].c_str());
+	      if (by)
+		mdcache->split_dir(dir, by);
+	      else
+		dout(0) << "need to split by >0 bits" << dendl;
+	    } else dout(0) << "dir " << dir->dirfrag() << " not auth" << dendl;
+	  } else dout(0) << "dir " << in->ino() << " " << fg << " dne" << dendl;
+	} else dout(0) << " frag " << args[2] << " does not parse" << dendl;
+      } else dout(0) << "path " << fp << " not found" << dendl;
+    } else dout(0) << "bad syntax" << dendl;
+  } else if (args[0] == "merge_dir") {
+    // args: merge_dir <path> <frag>
+    if (args.size() == 3) {
+      filepath fp(args[1].c_str());
+      CInode *in = mdcache->cache_traverse(fp);
+      if (in) {
+	frag_t fg;
+	if (fg.parse(args[2].c_str())) {
+	  mdcache->merge_dir(in, fg);
+	} else dout(0) << " frag " << args[2] << " does not parse" << dendl;
+      } else dout(0) << "path " << fp << " not found" << dendl;
+    } else dout(0) << "bad syntax" << dendl;
+  } else if (args[0] == "export_dir") {
+    // args: export_dir <path> <target rank>
+    if (args.size() == 3) {
+      filepath fp(args[1].c_str());
+      mds_rank_t target = mds_rank_t(atoi(args[2].c_str()));
+      if (target != whoami && mdsmap->is_up(target) && mdsmap->is_in(target)) {
+	CInode *in = mdcache->cache_traverse(fp);
+	if (in) {
+	  CDir *dir = in->get_dirfrag(frag_t());
+	  if (dir && dir->is_auth()) {
+	    mdcache->migrator->export_dir(dir, target);
+	  } else dout(0) << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
+	} else dout(0) << "bad export_dir path" << dendl;
+      } else dout(0) << "bad export_dir target syntax" << dendl;
+    } else dout(0) << "bad export_dir syntax" << dendl;
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+// Forwards all construction arguments to the MDSRank base class; the
+// dispatcher adds no state of its own here.
+MDSRankDispatcher::MDSRankDispatcher(
+    mds_rank_t whoami_,
+    Mutex &mds_lock_,
+    LogChannelRef &clog_,
+    SafeTimer &timer_,
+    Beacon &beacon_,
+    MDSMap *& mdsmap_,
+    Messenger *msgr,
+    MonClient *monc_,
+    Objecter *objecter_,
+    Context *respawn_hook_,
+    Context *suicide_hook_)
+  : MDSRank(whoami_, mds_lock_, clog_, timer_, beacon_, mdsmap_,
+      msgr, monc_, objecter_, respawn_hook_, suicide_hook_)
+{}
+
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
new file mode 100644
index 0000000..b999657
--- /dev/null
+++ b/src/mds/MDSRank.h
@@ -0,0 +1,522 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef MDS_RANK_H_
+#define MDS_RANK_H_
+
+#include "common/TrackedOp.h"
+#include "common/LogClient.h"
+#include "common/Timer.h"
+
+#include "Beacon.h"
+#include "MDSMap.h"
+#include "SessionMap.h"
+#include "MDCache.h"
+#include "Migrator.h"
+#include "MDLog.h"
+#include "osdc/Journaler.h"
+
+// Full .h import instead of forward declaration for PerfCounter, for the
+// benefit of those including this header and using MDSRank::logger
+#include "common/perf_counters.h"
+
+enum {
+  l_mds_first = 2000,
+  l_mds_request,
+  l_mds_reply,
+  l_mds_reply_latency,
+  l_mds_forward,
+  l_mds_dir_fetch,
+  l_mds_dir_commit,
+  l_mds_dir_split,
+  l_mds_inode_max,
+  l_mds_inodes,
+  l_mds_inodes_top,
+  l_mds_inodes_bottom,
+  l_mds_inodes_pin_tail,
+  l_mds_inodes_pinned,
+  l_mds_inodes_expired,
+  l_mds_inodes_with_caps,
+  l_mds_caps,
+  l_mds_subtrees,
+  l_mds_traverse,
+  l_mds_traverse_hit,
+  l_mds_traverse_forward,
+  l_mds_traverse_discover,
+  l_mds_traverse_dir_fetch,
+  l_mds_traverse_remote_ino,
+  l_mds_traverse_lock,
+  l_mds_load_cent,
+  l_mds_dispatch_queue_len,
+  l_mds_exported,
+  l_mds_exported_inodes,
+  l_mds_imported,
+  l_mds_imported_inodes,
+  l_mds_last,
+};
+
+// memory utilization
+enum {
+  l_mdm_first = 2500,
+  l_mdm_ino,
+  l_mdm_inoa,
+  l_mdm_inos,
+  l_mdm_dir,
+  l_mdm_dira,
+  l_mdm_dirs,
+  l_mdm_dn,
+  l_mdm_dna,
+  l_mdm_dns,
+  l_mdm_cap,
+  l_mdm_capa,
+  l_mdm_caps,
+  l_mdm_rss,
+  l_mdm_heap,
+  l_mdm_malloc,
+  l_mdm_buf,
+  l_mdm_last,
+};
+
+namespace ceph {
+  struct heartbeat_handle_d;
+}
+
+class Server;
+class Locker;
+class MDCache;
+class MDLog;
+class MDBalancer;
+class InoTable;
+class SnapServer;
+class SnapClient;
+class MDSTableServer;
+class MDSTableClient;
+class Messenger;
+class Objecter;
+class MonClient;
+class Finisher;
+class MMDSMap;
+
+/**
+ * The public part of this class's interface is what's exposed to all
+ * the various subsystems (server, mdcache, etc), such as pointers
+ * to the other subsystems, and message-sending calls.
+ */
+class MDSRank {
+  protected:
+    const mds_rank_t whoami;
+
+    // Incarnation as seen in MDSMap at the point where a rank is
+    // assigned.
+    int incarnation;
+
+  public:
+    mds_rank_t get_nodeid() const { return whoami; }
+    uint64_t get_metadata_pool();
+
+    // Reference to global MDS::mds_lock, so that users of MDSRank don't
+    // carry around references to the outer MDS, and we can substitute
+    // a separate lock here in future potentially.
+    Mutex &mds_lock;
+
+    bool is_daemon_stopping() const;
+
+    // Reference to global cluster log client, just to avoid initialising
+    // a separate one here.
+    LogChannelRef &clog;
+
+    // Reference to global timer utility, because MDSRank and MDSDaemon
+    // currently both use the same mds_lock, so it makes sense for them
+    // to share a timer.
+    SafeTimer &timer;
+
+    MDSMap *&mdsmap;
+
+    Objecter     *objecter;
+
+    // sub systems
+    Server       *server;
+    MDCache      *mdcache;
+    Locker       *locker;
+    MDLog        *mdlog;
+    MDBalancer   *balancer;
+
+    InoTable     *inotable;
+
+    SnapServer   *snapserver;
+    SnapClient   *snapclient;
+
+    MDSTableClient *get_table_client(int t);
+    MDSTableServer *get_table_server(int t);
+
+    SessionMap   sessionmap;
+    Session *get_session(client_t client) {
+      return sessionmap.get_session(entity_name_t::CLIENT(client.v));
+    }
+
+    PerfCounters       *logger, *mlogger;
+    OpTracker    op_tracker;
+
+    // The last different state I held before current
+    MDSMap::DaemonState last_state;
+    // The state assigned to me by the MDSMap
+    MDSMap::DaemonState state;
+
+    MDSMap::DaemonState get_state() const { return state; } 
+    MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); } 
+
+    bool is_creating() { return state == MDSMap::STATE_CREATING; }
+    bool is_starting() { return state == MDSMap::STATE_STARTING; }
+    bool is_standby()  { return state == MDSMap::STATE_STANDBY; }
+    bool is_replay()   { return state == MDSMap::STATE_REPLAY; }
+    bool is_standby_replay() { return state == MDSMap::STATE_STANDBY_REPLAY; }
+    bool is_resolve()  { return state == MDSMap::STATE_RESOLVE; }
+    bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; }
+    bool is_rejoin()   { return state == MDSMap::STATE_REJOIN; }
+    bool is_clientreplay()   { return state == MDSMap::STATE_CLIENTREPLAY; }
+    bool is_active()   { return state == MDSMap::STATE_ACTIVE; }
+    bool is_stopping() { return state == MDSMap::STATE_STOPPING; }
+    bool is_oneshot_replay()   { return state == MDSMap::STATE_ONESHOT_REPLAY; }
+    bool is_any_replay() { return (is_replay() || is_standby_replay() ||
+        is_oneshot_replay()); }
+    bool is_stopped()  { return mdsmap->is_stopped(whoami); }
+
+    void handle_write_error(int err);
+
+  protected:
+    // Flag to indicate we entered shutdown: anyone seeing this to be true
+    // after taking mds_lock must drop out.
+    bool stopping;
+
+    class ProgressThread : public Thread {
+      MDSRank *mds;
+      Cond cond;
+      public:
+      ProgressThread(MDSRank *mds_) : mds(mds_) {}
+      void * entry(); 
+      void shutdown();
+      void signal() {cond.Signal();}
+    } progress_thread;
+
+    list<Message*> waiting_for_nolaggy;
+    list<MDSInternalContextBase*> finished_queue;
+    // Dispatch, retry, queues
+    int dispatch_depth;
+    void inc_dispatch_depth() { ++dispatch_depth; }
+    void dec_dispatch_depth() { --dispatch_depth; }
+    void retry_dispatch(Message *m);
+    bool handle_deferrable_message(Message *m);
+    void _advance_queues();
+    bool _dispatch(Message *m, bool new_msg);
+
+    ceph::heartbeat_handle_d *hb;  // Heartbeat for threads using mds_lock
+    void heartbeat_reset();
+
+    bool is_stale_message(Message *m);
+
+    map<mds_rank_t, version_t> peer_mdsmap_epoch;
+
+    ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)
+
+    list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
+    list<MDSInternalContextBase*> replay_queue;
+    map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
+    map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
+
+    epoch_t osd_epoch_barrier;
+
+    // Const reference to the beacon so that we can behave differently
+    // when it's laggy.
+    Beacon &beacon;
+
+    /**
+     * Emit clog warnings for any ops reported as warnings by optracker
+     */
+    void check_ops_in_flight();
+
+    /**
+     * Share MDSMap with clients
+     */
+    void bcast_mds_map();  // to mounted clients
+    epoch_t      last_client_mdsmap_bcast;
+
+    void create_logger();
+  public:
+
+    void queue_waiter(MDSInternalContextBase *c) {
+      finished_queue.push_back(c);
+      progress_thread.signal();
+    }
+    void queue_waiters(list<MDSInternalContextBase*>& ls) {
+      finished_queue.splice( finished_queue.end(), ls );
+      progress_thread.signal();
+    }
+
+    MDSRank(
+        mds_rank_t whoami_,
+        Mutex &mds_lock_,
+        LogChannelRef &clog_,
+        SafeTimer &timer_,
+        Beacon &beacon_,
+        MDSMap *& mdsmap_,
+        Messenger *msgr,
+        MonClient *monc_,
+        Objecter *objecter_,
+        Context *respawn_hook_,
+        Context *suicide_hook_);
+    ~MDSRank();
+
+    // Daemon lifetime functions: these guys break the abstraction
+    // and call up into the parent MDSDaemon instance.  It's kind
+    // of unavoidable: if we want any depth into our calls 
+    // to be able to e.g. tear down the whole process, we have to
+    // have a reference going all the way down.
+    // >>>
+    void suicide();
+    void respawn();
+    // <<<
+
+    /**
+     * Report state DAMAGED to the mon, and then pass on to respawn().  Call
+     * this when an unrecoverable error is encountered while attempting
+     * to load an MDS rank's data structures.  This is *not* for use with
+     * errors affecting normal dirfrag/inode objects -- they should be handled
+     * through cleaner scrub/repair mechanisms.
+     *
+     * Callers must already hold mds_lock.
+     */
+    void damaged();
+
+    /**
+     * Wrapper around `damaged` for users who are not
+     * already holding mds_lock.
+     *
+     * Callers must not already hold mds_lock.
+     */
+    void damaged_unlocked();
+
+    utime_t get_laggy_until() const;
+
+    void send_message_mds(Message *m, mds_rank_t mds);
+    void forward_message_mds(Message *req, mds_rank_t mds);
+
+    void send_message_client_counted(Message *m, client_t client);
+    void send_message_client_counted(Message *m, Session *session);
+    void send_message_client_counted(Message *m, Connection *connection);
+    void send_message_client_counted(Message *m, const ConnectionRef& con) {
+      send_message_client_counted(m, con.get());
+    }
+    void send_message_client(Message *m, Session *session);
+    void send_message(Message *m, Connection *c);
+    void send_message(Message *m, const ConnectionRef& c) {
+      send_message(m, c.get());
+    }
+
+    void wait_for_active(MDSInternalContextBase *c) { 
+      waiting_for_active.push_back(c); 
+    }
+    void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) { 
+      waiting_for_active_peer[who].push_back(c);
+    }
+    void wait_for_replay(MDSInternalContextBase *c) { 
+      waiting_for_replay.push_back(c); 
+    }
+    void wait_for_reconnect(MDSInternalContextBase *c) {
+      waiting_for_reconnect.push_back(c);
+    }
+    void wait_for_resolve(MDSInternalContextBase *c) {
+      waiting_for_resolve.push_back(c);
+    }
+    void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
+      waiting_for_mdsmap[e].push_back(c);
+    }
+    void enqueue_replay(MDSInternalContextBase *c) {
+      replay_queue.push_back(c);
+    }
+
+    bool queue_one_replay() {
+      if (replay_queue.empty())
+        return false;
+      queue_waiter(replay_queue.front());
+      replay_queue.pop_front();
+      return true;
+    }
+
+    void set_osd_epoch_barrier(epoch_t e);
+    epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
+
+    ceph_tid_t issue_tid() { return ++last_tid; }
+
+    Finisher     *finisher;
+
+    MDSMap *get_mds_map() { return mdsmap; }
+
+    int get_req_rate() { return logger->get(l_mds_request); }
+
+  protected:
+    void command_scrub_path(Formatter *f, const string& path);
+    void command_flush_path(Formatter *f, const string& path);
+    void command_flush_journal(Formatter *f);
+    void command_get_subtrees(Formatter *f);
+    void command_export_dir(Formatter *f,
+        const std::string &path, mds_rank_t dest);
+    bool command_dirfrag_split(
+        cmdmap_t cmdmap,
+        std::ostream &ss);
+    bool command_dirfrag_merge(
+        cmdmap_t cmdmap,
+        std::ostream &ss);
+    bool command_dirfrag_ls(
+        cmdmap_t cmdmap,
+        std::ostream &ss,
+        Formatter *f);
+    int _command_export_dir(const std::string &path, mds_rank_t dest);
+    int _command_flush_journal(std::stringstream *ss);
+    CDir *_command_dirfrag_get(
+        const cmdmap_t &cmdmap,
+        std::ostream &ss);
+    // <<<
+
+  protected:
+    Messenger    *messenger;
+    MonClient    *monc;
+
+    Context *respawn_hook;
+    Context *suicide_hook;
+
+    // Friended to access retry_dispatch
+    friend class C_MDS_RetryMessage;
+
+    // FIXME the state machine logic should be separable from the dispatch
+    // logic that calls it.
+    // >>>
+    void calc_recovery_set();
+    void request_state(MDSMap::DaemonState s);
+
+    bool standby_replaying;  // true if current replay pass is in standby-replay mode
+
+    typedef enum {
+      // The MDSMap is available, configure default layouts and structures
+      MDS_BOOT_INITIAL = 0,
+      // We are ready to open some inodes
+      MDS_BOOT_OPEN_ROOT,
+      // We are ready to do a replay if needed
+      MDS_BOOT_PREPARE_LOG,
+      // Replay is complete
+      MDS_BOOT_REPLAY_DONE
+    } BootStep;
+    friend class C_MDS_BootStart;
+    friend class C_MDS_InternalBootStart;
+    void boot_create();             // i am new mds.
+    void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);    // starting|replay
+
+    void replay_start();
+    void creating_done();
+    void starting_done();
+    void replay_done();
+    void standby_replay_restart();
+    void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
+    class C_MDS_StandbyReplayRestart;
+    class C_MDS_StandbyReplayRestartFinish;
+
+    void reopen_log();
+
+    void resolve_start();
+    void resolve_done();
+    void reconnect_start();
+    void reconnect_done();
+    void rejoin_joint_start();
+    void rejoin_start();
+    void rejoin_done();
+    void recovery_done(int oldstate);
+    void clientreplay_start();
+    void clientreplay_done();
+    void active_start();
+    void stopping_start();
+    void stopping_done();
+    // <<<
+    
+    // >>>
+    void handle_mds_recovery(mds_rank_t who);
+    void handle_mds_failure(mds_rank_t who);
+    // <<<
+};
+
+/* This expects to be given a reference which it is responsible for.
+ * The finish function calls functions which
+ * will put the Message exactly once.*/
+class C_MDS_RetryMessage : public MDSInternalContext {
+protected:
+  Message *m;
+public:
+  C_MDS_RetryMessage(MDSRank *mds, Message *m)
+    : MDSInternalContext(mds)
+  {
+    assert(m);
+    this->m = m;
+  }
+  virtual void finish(int r) {
+    mds->retry_dispatch(m);
+  }
+};
+
+/**
+ * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
+ * the service/dispatcher stuff like init/shutdown that subsystems should
+ * never touch.
+ */
+class MDSRankDispatcher : public MDSRank
+{
+public:
+  void init();
+  void tick();
+  void shutdown();
+  bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
+                           Formatter *f, std::ostream& ss);
+  void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
+  void handle_osd_map();
+  bool kill_session(int64_t session_id);
+  void update_log_config();
+  bool handle_command_legacy(std::vector<std::string> args);
+
+  // Call into me from MDS::ms_dispatch
+  bool ms_dispatch(Message *m);
+
+  MDSRankDispatcher(
+      mds_rank_t whoami_,
+      Mutex &mds_lock_,
+      LogChannelRef &clog_,
+      SafeTimer &timer_,
+      Beacon &beacon_,
+      MDSMap *& mdsmap_,
+      Messenger *msgr,
+      MonClient *monc_,
+      Objecter *objecter_,
+      Context *respawn_hook_,
+      Context *suicide_hook_);
+};
+
+// This utility for MDS and MDSRank dispatchers.
+#define ALLOW_MESSAGES_FROM(peers) \
+do { \
+  if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
+    dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
+           << " allowing=" << #peers << " message=" << *m << dendl; \
+    m->put();							    \
+    return true; \
+  } \
+} while (0)
+
+#endif // MDS_RANK_H_
+
diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc
index a1712dc..1d0c764 100644
--- a/src/mds/MDSTable.cc
+++ b/src/mds/MDSTable.cc
@@ -14,7 +14,7 @@
 
 #include "MDSTable.h"
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDLog.h"
 
 #include "osdc/Filer.h"
@@ -22,6 +22,7 @@
 #include "include/types.h"
 
 #include "common/config.h"
+#include "common/errno.h"
 #include "common/Finisher.h"
 
 #include "include/assert.h"
@@ -36,7 +37,7 @@ class MDSTableIOContext : public MDSIOContextBase
 {
   protected:
     MDSTable *ida;
-    MDS *get_mds() {return ida->mds;}
+    MDSRank *get_mds() {return ida->mds;}
   public:
     MDSTableIOContext(MDSTable *ida_) : ida(ida_) {
       assert(ida != NULL);
@@ -83,7 +84,7 @@ void MDSTable::save(MDSInternalContextBase *onfinish, version_t v)
 			    bl, ceph_clock_now(g_ceph_context), 0,
 			    NULL,
 			    new C_OnFinisher(new C_IO_MT_Save(this, version),
-					     &mds->finisher));
+					     mds->finisher));
 }
 
 void MDSTable::save_2(int r, version_t v)
@@ -133,7 +134,7 @@ object_t MDSTable::get_object_name()
 {
   char n[50];
   if (per_mds)
-    snprintf(n, sizeof(n), "mds%d_%s", int(mds->whoami), table_name);
+    snprintf(n, sizeof(n), "mds%d_%s", int(mds->get_nodeid()), table_name);
   else
     snprintf(n, sizeof(n), "mds_%s", table_name);
   return object_t(n);
@@ -150,7 +151,7 @@ void MDSTable::load(MDSInternalContextBase *onfinish)
   object_t oid = get_object_name();
   object_locator_t oloc(mds->mdsmap->get_metadata_pool());
   mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &c->bl, 0,
-			   new C_OnFinisher(c, &mds->finisher));
+			   new C_OnFinisher(c, mds->finisher));
 }
 
 void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish)
@@ -158,20 +159,31 @@ void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish)
   assert(is_opening());
   state = STATE_ACTIVE;
   if (r == -EBLACKLISTED) {
-    mds->suicide();
+    mds->respawn();
     return;
   }
   if (r < 0) {
     derr << "load_2 could not read table: " << r << dendl;
-    assert(r >= 0);
+    mds->clog->error() << "error reading table object '" << get_object_name()
+                       << "' " << r << " (" << cpp_strerror(r) << ")";
+    mds->damaged();
+    assert(r >= 0);  // Should be unreachable because damaged() calls respawn()
   }
 
   dout(10) << "load_2 got " << bl.length() << " bytes" << dendl;
   bufferlist::iterator p = bl.begin();
-  ::decode(version, p);
-  projected_version = committed_version = version;
-  dout(10) << "load_2 loaded v" << version << dendl;
-  decode_state(p);
+
+  try {
+    ::decode(version, p);
+    projected_version = committed_version = version;
+    dout(10) << "load_2 loaded v" << version << dendl;
+    decode_state(p);
+  } catch (buffer::error &e) {
+    mds->clog->error() << "error decoding table object '" << get_object_name()
+                       << "': " << e.what();
+    mds->damaged();
+    assert(r >= 0);  // Should be unreachable because damaged() calls respawn()
+  }
 
   if (onfinish) {
     onfinish->complete(0);
diff --git a/src/mds/MDSTable.h b/src/mds/MDSTable.h
index 392f4fd..3eeb8a2 100644
--- a/src/mds/MDSTable.h
+++ b/src/mds/MDSTable.h
@@ -19,13 +19,13 @@
 #include "mds_table_types.h"
 #include "include/buffer.h"
 
-class MDS;
+class MDSRank;
 class Context;
 class MDSInternalContextBase;
 
 class MDSTable {
 public:
-  MDS *mds;
+  MDSRank *mds;
 protected:
   const char *table_name;
   bool per_mds;
@@ -44,7 +44,7 @@ protected:
   map<version_t, list<MDSInternalContextBase*> > waitfor_save;
   
 public:
-  MDSTable(MDS *m, const char *n, bool is_per_mds) :
+  MDSTable(MDSRank *m, const char *n, bool is_per_mds) :
     mds(m), table_name(n), per_mds(is_per_mds), rank(MDS_RANK_NONE),
     state(STATE_UNDEF),
     version(0), committing_version(0), committed_version(0), projected_version(0) {}
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index e06f92b..81b1b8a 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -19,7 +19,7 @@
 #include "MDSContext.h"
 #include "msg/Messenger.h"
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDLog.h"
 #include "LogSegment.h"
 
@@ -91,7 +91,7 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
 	       << ", sending ROLLBACK" << dendl;
       assert(!server_ready);
       MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_ROLLBACK, 0, tid);
-      mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+      mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
     }
     break;
 
@@ -169,7 +169,7 @@ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist
     // send message
     MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
     req->bl = mutation;
-    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
   } else
     dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
@@ -190,7 +190,7 @@ void MDSTableClient::commit(version_t tid, LogSegment *ls)
   if (server_ready) {
     // send message
     MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
-    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
   } else
     dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
@@ -222,7 +222,7 @@ void MDSTableClient::resend_commits()
        ++p) {
     dout(10) << "resending commit on " << p->first << dendl;
     MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, p->first);
-    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
   }
 }
 
@@ -239,13 +239,13 @@ void MDSTableClient::resend_prepares()
     dout(10) << "resending prepare on " << p->first << dendl;
     MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, p->first);
     req->bl = p->second.mutation;
-    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
   }
 }
 
 void MDSTableClient::handle_mds_failure(mds_rank_t who)
 {
-  if (who != mds->mdsmap->get_tableserver())
+  if (who != mds->get_mds_map()->get_tableserver())
     return; // do nothing.
 
   dout(7) << "tableserver mds." << who << " fails" << dendl;
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index a5671d0..b311428 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -19,13 +19,13 @@
 #include "MDSContext.h"
 #include "mds_table_types.h"
 
-class MDS;
+class MDSRank;
 class LogSegment;
 class MMDSTableRequest;
 
 class MDSTableClient {
 protected:
-  MDS *mds;
+  MDSRank *mds;
   int table;
 
   uint64_t last_reqid;
@@ -57,7 +57,7 @@ protected:
   friend class C_LoggedAck;
 
 public:
-  MDSTableClient(MDS *m, int tab) :
+  MDSTableClient(MDSRank *m, int tab) :
     mds(m), table(tab), last_reqid(~0ULL), server_ready(false) {}
   virtual ~MDSTableClient() {}
 
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
index 7c91a88..eb5103d 100644
--- a/src/mds/MDSTableServer.cc
+++ b/src/mds/MDSTableServer.cc
@@ -13,7 +13,7 @@
  */
 
 #include "MDSTableServer.h"
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDLog.h"
 #include "msg/Messenger.h"
 
diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h
index b70c260..d103d3e 100644
--- a/src/mds/MDSTableServer.h
+++ b/src/mds/MDSTableServer.h
@@ -57,7 +57,7 @@ private:
   }
   
 
-  MDSTableServer(MDS *m, int tab) : MDSTable(m, get_mdstable_name(tab), false), table(tab) {}
+  MDSTableServer(MDSRank *m, int tab) : MDSTable(m, get_mdstable_name(tab), false), table(tab) {}
   virtual ~MDSTableServer() {}
 
   void handle_request(MMDSTableRequest *m);
diff --git a/src/mds/Makefile-server.am b/src/mds/Makefile-server.am
index 2584797..ee3daed 100644
--- a/src/mds/Makefile-server.am
+++ b/src/mds/Makefile-server.am
@@ -22,8 +22,10 @@ noinst_HEADERS += \
 	mds/MDBalancer.h \
 	mds/MDCache.h \
 	mds/RecoveryQueue.h \
+	mds/StrayManager.h \
 	mds/MDLog.h \
-	mds/MDS.h \
+	mds/MDSRank.h \
+	mds/MDSDaemon.h \
 	mds/Beacon.h \
 	mds/MDSContext.h \
 	mds/MDSAuthCaps.h \
diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index 37c73f0..c7b0307 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -1,6 +1,7 @@
 LIBMDS_SOURCES = \
 	mds/Capability.cc \
-	mds/MDS.cc \
+	mds/MDSDaemon.cc \
+	mds/MDSRank.cc \
 	mds/Beacon.cc \
 	mds/locks.c \
 	mds/journal.cc \
@@ -8,6 +9,7 @@ LIBMDS_SOURCES = \
 	mds/Mutation.cc \
 	mds/MDCache.cc \
 	mds/RecoveryQueue.cc \
+	mds/StrayManager.cc \
 	mds/Locker.cc \
 	mds/Migrator.cc \
 	mds/MDBalancer.cc \
@@ -20,6 +22,7 @@ LIBMDS_SOURCES = \
 	mds/JournalPointer.cc \
 	mds/MDSTableClient.cc \
 	mds/MDSTableServer.cc \
+	mds/SimpleLock.cc \
 	mds/SnapRealm.cc \
 	mds/SnapServer.cc \
 	mds/snap.cc \
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index a2c4710..d807fb6 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -12,7 +12,7 @@
  * 
  */
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDCache.h"
 #include "CInode.h"
 #include "CDir.h"
@@ -86,7 +86,7 @@
 class MigratorContext : public MDSInternalContextBase {
 protected:
   Migrator *mig;
-  MDS *get_mds() {
+  MDSRank *get_mds() {
     return mig->mds;
   }
 public:
@@ -966,7 +966,7 @@ void Migrator::export_frozen(CDir *dir, uint64_t tid)
   MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);
 
   // include list of bystanders
-  for (map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
+  for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
        p != dir->replicas_end();
        ++p) {
     if (p->first != it->second.peer) {
@@ -1042,12 +1042,12 @@ void Migrator::export_frozen(CDir *dir, uint64_t tid)
 
       start = 'f';  // start with dirfrag
     }
-    bufferlist final;
+    bufferlist final_bl;
     dirfrag_t df = cur->dirfrag();
-    ::encode(df, final);
-    ::encode(start, final);
-    final.claim_append(tracebl);
-    prep->add_trace(final);
+    ::encode(df, final_bl);
+    ::encode(start, final_bl);
+    final_bl.claim_append(tracebl);
+    prep->add_trace(final_bl);
   }
 
   // send.
@@ -1146,7 +1146,7 @@ void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
 	  it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
   assert(it->second.notify_ack_waiting.empty());
 
-  for (map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
+  for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
        p != dir->replicas_end();
        ++p) {
     if (p->first == it->second.peer) continue;
@@ -1905,7 +1905,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m)
     if (r > 0) return;
     if (r < 0) {
       dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
-      assert(0);    // this shouldn't happen if the auth pins his path properly!!!! 
+      assert(0);    // this shouldn't happen if the auth pins its path properly!!!!
     }
 
     assert(0); // this shouldn't happen; the get_inode above would have succeeded.
@@ -2566,7 +2566,7 @@ void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
   dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
 
   // test surviving observer of a failed migration that did not complete
-  //assert(dir->replica_map.size() < 2 || mds->whoami != 0);
+  //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
 
   MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
   ::encode(imported_caps, ack->imported_caps);
@@ -2696,7 +2696,7 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last)
   mds->mdcache->maybe_send_pending_resolves();
 
   // did i just import mydir?
-  if (dir->ino() == MDS_INO_MDSDIR(mds->whoami))
+  if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
     cache->populate_mydir();
 
   // is it empty?
@@ -2932,7 +2932,8 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp,
     else if (icode == 'I') {
       // inode
       assert(le);
-      decode_import_inode(dn, blp, oldauth, ls, le->get_start_off(), peer_exports, updated_scatterlocks);
+      decode_import_inode(dn, blp, oldauth, ls, le->get_metablob()->event_seq,
+          peer_exports, updated_scatterlocks);
     }
     
     // add dentry to journal entry
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index a947486..5872933 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -27,7 +27,7 @@ using std::list;
 using std::set;
 
 
-class MDS;
+class MDSRank;
 class CDir;
 class CInode;
 class CDentry;
@@ -51,7 +51,7 @@ class EImportStart;
 
 class Migrator {
 private:
-  MDS *mds;
+  MDSRank *mds;
   MDCache *cache;
 
   // -- exports --
@@ -147,7 +147,7 @@ protected:
 
 public:
   // -- cons --
-  Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {}
+  Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {}
 
   void dispatch(Message*);
 
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc
index 3fb54e9..7dbc6e7 100644
--- a/src/mds/Mutation.cc
+++ b/src/mds/Mutation.cc
@@ -313,6 +313,11 @@ void MDRequestImpl::print(ostream &out)
   out << ")";
 }
 
+void MDRequestImpl::dump(Formatter *f) const
+{
+  _dump(ceph_clock_now(g_ceph_context), f);
+}
+
 void MDRequestImpl::_dump(utime_t now, Formatter *f) const
 {
   f->dump_string("flag_point", state_string());
@@ -324,8 +329,7 @@ void MDRequestImpl::_dump(utime_t now, Formatter *f) const
       f->dump_stream("client") << client_request->get_orig_source();
       f->dump_int("tid", client_request->get_tid());
       f->close_section(); // client_info
-    } else if (slave_request) {
-      assert(!slave_request->is_reply()); // replies go to an existing mdr
+    } else if (is_slave() && slave_request) { // replies go to an existing mdr
       f->dump_string("op_type", "slave_request");
       f->open_object_section("master_info");
       f->dump_stream("master") << slave_request->get_orig_source();
@@ -348,12 +352,14 @@ void MDRequestImpl::_dump(utime_t now, Formatter *f) const
       f->dump_stream("op_stamp") << slave_request->op_stamp;
       f->close_section(); // request_info
     }
-    else { // internal request
-      assert(internal_op != -1);
+    else if (internal_op != -1) { // internal request
       f->dump_string("op_type", "internal_op");
       f->dump_int("internal_op", internal_op);
       f->dump_string("op_name", ceph_mds_op_name(internal_op));
     }
+    else {
+      f->dump_string("op_type", "no_available_op_found");
+    }
   }
   {
     f->open_array_section("events");
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index bf63744..f43d660 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -110,8 +110,8 @@ public:
     assert(remote_wrlocks.empty());
   }
 
-  bool is_master() { return slave_to_mds == MDS_RANK_NONE; }
-  bool is_slave() { return slave_to_mds != MDS_RANK_NONE; }
+  bool is_master() const { return slave_to_mds == MDS_RANK_NONE; }
+  bool is_slave() const { return slave_to_mds != MDS_RANK_NONE; }
 
   client_t get_client() {
     if (reqid.name.is_client())
@@ -161,6 +161,8 @@ public:
   virtual void print(ostream &out) {
     out << "mutation(" << this << ")";
   }
+
+  virtual void dump(Formatter *f) const {}
 };
 
 inline ostream& operator<<(ostream& out, MutationImpl &mut)
@@ -336,6 +338,7 @@ struct MDRequestImpl : public MutationImpl, public TrackedOp {
   void set_filepath2(const filepath& fp);
 
   void print(ostream &out);
+  void dump(Formatter *f) const;
 
   // TrackedOp stuff
   typedef ceph::shared_ptr<MDRequestImpl> Ref;
diff --git a/src/mds/RecoveryQueue.cc b/src/mds/RecoveryQueue.cc
index 46a7e8c..33aabe4 100644
--- a/src/mds/RecoveryQueue.cc
+++ b/src/mds/RecoveryQueue.cc
@@ -14,7 +14,7 @@
 
 #include "CInode.h"
 #include "MDCache.h"
-#include "MDS.h"
+#include "MDSRank.h"
 #include "Locker.h"
 #include "osdc/Filer.h"
 
@@ -33,7 +33,7 @@ protected:
     rq->_recovered(in, r, size, mtime);
   }
 
-  MDS *get_mds() {
+  MDSRank *get_mds() {
     return rq->mds;
   }
 
@@ -47,6 +47,11 @@ public:
 };
 
 
+RecoveryQueue::RecoveryQueue(MDSRank *mds_)
+  : mds(mds_), logger(NULL), filer(mds_->objecter, mds_->finisher)
+{}
+
+
 /**
  * Progress the queue.  Call this after enqueuing something or on
  * completion of something.
@@ -93,7 +98,7 @@ void RecoveryQueue::_start(CInode *in)
     file_recovering.insert(in);
 
     C_MDC_Recover *fin = new C_MDC_Recover(this, in);
-    mds->filer->probe(in->inode.ino, &in->inode.layout, in->last,
+    filer.probe(in->inode.ino, &in->inode.layout, in->last,
 		      pi->get_max_size(), &fin->size, &fin->mtime, false,
 		      0, fin);
   } else {
@@ -154,7 +159,7 @@ void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
   if (r != 0) {
     dout(0) << "recovery error! " << r << dendl;
     if (r == -EBLACKLISTED) {
-      mds->suicide();
+      mds->respawn();
       return;
     }
     assert(0 == "unexpected error from osd during recovery");
diff --git a/src/mds/RecoveryQueue.h b/src/mds/RecoveryQueue.h
index 3fe04ee..c0e2f7a 100644
--- a/src/mds/RecoveryQueue.h
+++ b/src/mds/RecoveryQueue.h
@@ -19,8 +19,10 @@
 
 #include <set>
 
+#include "osdc/Filer.h"
+
 class CInode;
-class MDS;
+class MDSRank;
 class PerfCounters;
 
 class RecoveryQueue {
@@ -28,7 +30,7 @@ public:
   void enqueue(CInode *in);
   void advance();
   void prioritize(CInode *in);   ///< do this inode now/soon
-  RecoveryQueue(MDS *mds_) : mds(mds_), logger(NULL) {}
+  RecoveryQueue(MDSRank *mds_);
 
   void set_logger(PerfCounters *p) {logger=p;}
 
@@ -39,8 +41,9 @@ private:
   std::set<CInode*> file_recover_queue_front;  ///< elevated priority items
   std::set<CInode*> file_recovering;
   void _recovered(CInode *in, int r, uint64_t size, utime_t mtime);
-  MDS *mds;
+  MDSRank *mds;
   PerfCounters *logger;
+  Filer filer;
 
   friend class C_MDC_Recover;
 };
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index d4ed072..eebb836 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -16,10 +16,9 @@
 #include "include/assert.h"  // lexical_cast includes system assert.h
 
 #include <boost/config/warning_disable.hpp>
-#include <boost/spirit/include/qi.hpp>
 #include <boost/fusion/include/std_pair.hpp>
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "Server.h"
 #include "Locker.h"
 #include "MDCache.h"
@@ -45,8 +44,6 @@
 
 #include "messages/MLock.h"
 
-#include "messages/MDentryUnlink.h"
-
 #include "events/EUpdate.h"
 #include "events/ESlaveUpdate.h"
 #include "events/ESession.h"
@@ -77,7 +74,7 @@ using namespace std;
 class ServerContext : public MDSInternalContextBase {
   protected:
   Server *server;
-  MDS *get_mds()
+  MDSRank *get_mds()
   {
     return server->mds;
   }
@@ -92,15 +89,29 @@ class ServerContext : public MDSInternalContextBase {
 void Server::create_logger()
 {
   PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
-  plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request");
-  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request");
-  plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session");
-  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request");
-  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request");
+  plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
+      "Client requests", "hcr");
+  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
+      "Slave requests", "hsr");
+  plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
+      "Client session messages", "hcs");
+  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
+  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
   logger = plb.create_perf_counters();
   g_ceph_context->get_perfcounters_collection()->add(logger);
 }
 
+Server::Server(MDSRank *m) : 
+  mds(m), 
+  mdcache(mds->mdcache), mdlog(mds->mdlog),
+  logger(0),
+  is_full(false),
+  reconnect_done(NULL),
+  failed_reconnects(0),
+  terminating_sessions(false)
+{
+}
+
 
 /* This function DOES put the passed message before returning*/
 void Server::dispatch(Message *m) 
@@ -117,6 +128,12 @@ void Server::dispatch(Message *m)
     if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
 	(mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
       MClientRequest *req = static_cast<MClientRequest*>(m);
+      Session *session = get_session(req);
+      if (!session || session->is_closed()) {
+	dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
+	req->put();
+	return;
+      }
       bool queue_replay = false;
       if (req->is_replay()) {
 	dout(3) << "queuing replayed op" << dendl;
@@ -125,8 +142,7 @@ void Server::dispatch(Message *m)
 	// process completed request in clientreplay stage. The completed request
 	// might have created new file/directorie. This guarantees MDS sends a reply
 	// to client before other request modifies the new file/directorie.
-	Session *session = get_session(req);
-	if (session && session->have_completed_request(req->get_reqid().tid, NULL)) {
+	if (session->have_completed_request(req->get_reqid().tid, NULL)) {
 	  dout(3) << "queuing completed op" << dendl;
 	  queue_replay = true;
 	}
@@ -198,9 +214,9 @@ class C_MDS_session_finish : public MDSInternalContext {
   version_t inotablev;
   Context *fin;
 public:
-  C_MDS_session_finish(MDS *m, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
+  C_MDS_session_finish(MDSRank *m, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
     MDSInternalContext(m), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
-  C_MDS_session_finish(MDS *m, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
+  C_MDS_session_finish(MDSRank *m, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
     MDSInternalContext(m), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
   void finish(int r) {
     assert(r == 0);
@@ -267,9 +283,9 @@ void Server::handle_client_session(MClientSession *m)
     if (session->is_closed())
       mds->sessionmap.add_session(session);
 
+    pv = mds->sessionmap.mark_projected(session);
     sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
     mds->sessionmap.touch_session(session);
-    pv = ++mds->sessionmap.projected;
     mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
 			      new C_MDS_session_finish(mds, session, sseq, true, pv));
     mdlog->flush();
@@ -359,6 +375,8 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve
   dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
 	   << " " << pv << dendl;
 
+  mds->sessionmap.mark_dirty(session);
+
   if (piv) {
     mds->inotable->apply_release_ids(inos);
     assert(mds->inotable->get_version() == piv);
@@ -432,18 +450,29 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve
   } else {
     assert(0);
   }
-  mds->sessionmap.version++;  // noop
 }
 
+/**
+ * Inject sessions from some source other than actual connections.
+ *
+ * For example:
+ *  - sessions inferred from journal replay
+ *  - sessions learned from other MDSs during rejoin
+ *  - sessions learned from other MDSs during dir/caps migration
+ *  - sessions learned from other MDSs during a cross-MDS rename
+ */
 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
 					      map<client_t,uint64_t>& sseqmap)
 {
-  version_t pv = ++mds->sessionmap.projected;
+  version_t pv = mds->sessionmap.get_projected();
+
   dout(10) << "prepare_force_open_sessions " << pv 
 	   << " on " << cm.size() << " clients"
 	   << dendl;
   for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
+
     Session *session = mds->sessionmap.get_or_add_session(p->second);
+    pv = mds->sessionmap.mark_projected(session);
     if (session->is_closed() || 
 	session->is_closing() ||
 	session->is_killing())
@@ -453,7 +482,6 @@ version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
 	     session->is_opening() ||
 	     session->is_stale());
     session->inc_importing();
-//  mds->sessionmap.touch_session(session);
   }
   return pv;
 }
@@ -468,8 +496,13 @@ void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
    * trying to force open a session...  
    */
   dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
-	   << " v " << mds->sessionmap.version << " -> " << (mds->sessionmap.version+1) << dendl;
+	   << " initial v " << mds->sessionmap.get_version() << dendl;
+  
+
+  int sessions_inserted = 0;
   for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
+    sessions_inserted++;
+
     Session *session = mds->sessionmap.get_session(p->second.name);
     assert(session);
     
@@ -489,10 +522,15 @@ void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
       dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
       assert(session->is_open() || session->is_stale());
     }
-    if (dec_import)
+
+    if (dec_import) {
       session->dec_importing();
+    }
+
+    mds->sessionmap.mark_dirty(session);
   }
-  mds->sessionmap.version++;
+
+  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
 }
 
 class C_MDS_TerminatedSessions : public ServerContext {
@@ -614,7 +652,7 @@ void Server::kill_session(Session *session, Context *on_safe)
 void Server::journal_close_session(Session *session, int state, Context *on_safe)
 {
   uint64_t sseq = mds->sessionmap.set_state(session, state);
-  version_t pv = ++mds->sessionmap.projected;
+  version_t pv = mds->sessionmap.mark_projected(session);
   version_t piv = 0;
 
   // release alloc and pending-alloc inos for this session
@@ -646,8 +684,9 @@ void Server::journal_close_session(Session *session, int state, Context *on_safe
   finish_flush_session(session, session->get_push_seq());
 }
 
-void Server::reconnect_clients()
+void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
 {
+  reconnect_done = reconnect_done_;
   mds->sessionmap.get_client_set(client_reconnect_gather);
 
   if (client_reconnect_gather.empty()) {
@@ -667,7 +706,7 @@ void Server::reconnect_clients()
 void Server::handle_client_reconnect(MClientReconnect *m)
 {
   dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
-  int from = m->get_source().num();
+  client_t from = m->get_source().num();
   Session *session = get_session(m);
   assert(session);
 
@@ -751,7 +790,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
       dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
 	       << " on " << *in << dendl;
       in->reconnect_cap(from, p->second.capinfo, session);
-      mds->mdcache->add_reconnected_cap(in, from, inodeno_t(p->second.capinfo.snaprealm));
+      mdcache->add_reconnected_cap(in, from, inodeno_t(p->second.capinfo.snaprealm));
       recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
       continue;
     }
@@ -782,7 +821,9 @@ void Server::handle_client_reconnect(MClientReconnect *m)
 void Server::reconnect_gather_finish()
 {
   dout(7) << "reconnect_gather_finish.  failed on " << failed_reconnects << " clients" << dendl;
-  mds->reconnect_done();
+  assert(reconnect_done);
+  reconnect_done->complete(0);
+  reconnect_done = NULL;
 }
 
 void Server::reconnect_tick()
@@ -816,17 +857,15 @@ void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
   for (int i = 0; i < numlocks; ++i) {
     ::decode(lock, p);
     lock.client = client;
-    in->fcntl_locks.held_locks.insert(pair<uint64_t, ceph_filelock>
-				      (lock.start, lock));
-    ++in->fcntl_locks.client_held_lock_counts[client];
+    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
+    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
   }
   ::decode(numlocks, p);
   for (int i = 0; i < numlocks; ++i) {
     ::decode(lock, p);
     lock.client = client;
-    in->flock_locks.held_locks.insert(pair<uint64_t, ceph_filelock>
-				      (lock.start, lock));
-    ++in->flock_locks.client_held_lock_counts[client];
+    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
+    ++in->get_flock_lock_state()->client_held_lock_counts[client];
   }
 }
 
@@ -928,7 +967,7 @@ class C_MarkEvent : public MDSInternalContext
   MDRequestRef mdr;
   string event_str;
 public:
-  C_MarkEvent(MDS *mds_, Context *f, MDRequestRef& _mdr,
+  C_MarkEvent(MDSRank *mds_, Context *f, MDRequestRef& _mdr,
 		 const char *evt)
     : MDSInternalContext(mds_), true_finisher(f), mdr(_mdr),
       event_str("journal_committed: ") {
@@ -1052,9 +1091,18 @@ void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
   mdr->mark_event("replying");
 
   // note successful request in session map?
-  if (req->may_write() && mdr->session && reply->get_result() == 0) {
+  //
+  // setfilelock requests are special, they only modify states in MDS memory.
+  // The states get lost when MDS fails. If Client re-send a completed
+  // setfilelock request, it means that client did not receive corresponding
+  // setfilelock reply.  So MDS should re-execute the setfilelock request.
+  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
+      reply->get_result() == 0 && mdr->session) {
     inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
     mdr->session->add_completed_request(mdr->reqid.tid, created);
+    if (mdr->ls) {
+      mdr->ls->touched_sessions.insert(mdr->session->info.inst.name);
+    }
   }
 
   // give any preallocated inos to the session
@@ -1125,9 +1173,10 @@ void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
 
   // take a closer look at tracei, if it happens to be a remote link
   if (tracei && 
-      tracei->get_parent_dn() &&
-      tracei->get_parent_dn()->get_projected_linkage()->is_remote())
-    mdcache->eval_remote(tracei->get_parent_dn());
+      tracedn &&
+      tracedn->get_projected_linkage()->is_remote()) {
+    mdcache->eval_remote(tracedn);
+  }
 }
 
 
@@ -1322,7 +1371,27 @@ void Server::handle_client_request(MClientRequest *req)
   if (req->get_oldest_client_tid() > 0) {
     dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
     assert(session);
-    session->trim_completed_requests(req->get_oldest_client_tid());
+    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
+      // Sessions 'completed_requests' was dirtied, mark it to be
+      // potentially flushed at segment expiry.
+      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+      if (session->get_num_trim_requests_warnings() > 0 &&
+	  session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
+	session->reset_num_trim_requests_warnings();
+    } else {
+      if (session->get_num_completed_requests() >=
+	  (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
+	session->inc_num_trim_requests_warnings();
+	stringstream ss;
+	ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
+	   << req->get_oldest_client_tid() << "), "
+	   << session->get_num_completed_requests()
+	   << " completed requests recorded in session\n";
+	mds->clog->warn() << ss.str();
+	dout(20) << __func__ << " " << ss.str() << dendl;
+      }
+    }
   }
 
   // register + dispatch
@@ -1504,7 +1573,9 @@ void Server::dispatch_client_request(MDRequestRef& mdr)
   case CEPH_MDS_OP_RMSNAP:
     handle_client_rmsnap(mdr);
     break;
-
+  case CEPH_MDS_OP_RENAMESNAP:
+    handle_client_renamesnap(mdr);
+    break;
 
   default:
     dout(1) << " unknown client op " << req->get_op() << dendl;
@@ -1619,7 +1690,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
 
   if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
     metareqid_t r = m->get_reqid();
-    mds->mdcache->committed_master_slave(r, from);
+    mdcache->committed_master_slave(r, from);
     m->put();
     return;
   }
@@ -2086,6 +2157,8 @@ CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string&
         respond_to_request(mdr, -EEXIST);
         return 0;
       }
+    } else {
+      dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
     }
 
     return dn;
@@ -2099,7 +2172,7 @@ CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string&
   }
   
   // create
-  dn = dir->add_null_dentry(dname);
+  dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
   dn->mark_new();
   dout(10) << "prepare_null_dentry added " << *dn << dendl;
   return dn;
@@ -2137,7 +2210,8 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
   if (mdr->session->info.prealloc_inos.size()) {
     mdr->used_prealloc_ino = 
       in->inode.ino = mdr->session->take_ino(useino);  // prealloc -> used
-    mds->sessionmap.projected++;
+    mds->sessionmap.mark_projected(mdr->session);
+
     dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
 	     << " (" << mdr->session->info.prealloc_inos
 	     << ", " << mdr->session->info.prealloc_inos.size() << " left)"
@@ -2152,7 +2226,7 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
     dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
     mds->clog->error() << mdr->client_request->get_source()
        << " specified ino " << useino
-       << " but mds." << mds->whoami << " allocated " << in->inode.ino << "\n";
+       << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino << "\n";
     //assert(0); // just for now.
   }
     
@@ -2161,7 +2235,7 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
     mds->inotable->project_alloc_ids(mdr->prealloc_inos, got);
     assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
     mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
-    mds->sessionmap.projected++;
+    mds->sessionmap.mark_projected(mdr->session);
     dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
   }
 
@@ -2178,7 +2252,7 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
   } else if (layout) {
     in->inode.layout = *layout;
   } else {
-    in->inode.layout = mds->mdcache->default_file_layout;
+    in->inode.layout = mdcache->default_file_layout;
   }
 
   in->inode.truncate_size = -1ull;  // not truncated, yet!
@@ -2217,7 +2291,7 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
 
   if (!mds->mdsmap->get_inline_data_enabled() ||
       !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
-    in->inode.inline_version = CEPH_INLINE_NONE;
+    in->inode.inline_data.version = CEPH_INLINE_NONE;
 
   mdcache->add_inode(in);  // add
   dout(10) << "prepare_new_inode " << *in << dendl;
@@ -2226,14 +2300,14 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
 
 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
 {
-  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.projected
+  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
 	   << " inotablev " << mds->inotable->get_projected_version()
 	   << dendl;
   blob->set_ino_alloc(mdr->alloc_ino,
 		      mdr->used_prealloc_ino,
 		      mdr->prealloc_inos,
 		      mdr->client_request->get_source(),
-		      mds->sessionmap.projected,
+		      mds->sessionmap.get_projected(),
 		      mds->inotable->get_projected_version());
 }
 
@@ -2249,15 +2323,17 @@ void Server::apply_allocated_inos(MDRequestRef& mdr)
   }
   if (mdr->prealloc_inos.size()) {
     assert(session);
-    session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
-    session->info.prealloc_inos.insert(mdr->prealloc_inos);
-    mds->sessionmap.version++;
+    if (!mdr->killed) {
+      session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
+      session->info.prealloc_inos.insert(mdr->prealloc_inos);
+    }
+    mds->sessionmap.mark_dirty(session);
     mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
   }
   if (mdr->used_prealloc_ino) {
     assert(session);
     session->info.used_inos.erase(mdr->used_prealloc_ino);
-    mds->sessionmap.version++;
+    mds->sessionmap.mark_dirty(session);
   }
 }
 
@@ -2530,7 +2606,7 @@ CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
 
   // invent?
   if (!dir) 
-    dir = diri->get_or_open_dirfrag(mds->mdcache, fg);
+    dir = diri->get_or_open_dirfrag(mdcache, fg);
  
   // am i auth for the dirfrag?
   if (!dir->is_auth()) {
@@ -2630,6 +2706,11 @@ void Server::handle_client_lookup_ino(MDRequestRef& mdr,
     return;
   }
 
+  if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
+      !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
+    return;
+  }
+
   CDentry *dn = in->get_projected_parent_dn();
   CInode *diri = dn ? dn->get_dir()->inode : NULL;
   if (dn && (want_parent || want_dentry)) {
@@ -2698,7 +2779,7 @@ void Server::handle_client_open(MDRequestRef& mdr)
   MClientRequest *req = mdr->client_request;
 
   int flags = req->head.args.open.flags;
-  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
+  int cmode = ceph_flags_to_mode(flags);
 
   bool need_auth = !file_mode_is_readonly(cmode) || (flags & O_TRUNC);
 
@@ -2727,7 +2808,7 @@ void Server::handle_client_open(MDRequestRef& mdr)
       return;
   }
 
-  if (mdr->snapid != CEPH_NOSNAP && mdr->client_request->may_write()) {
+  if (mdr->snapid != CEPH_NOSNAP && req->may_write()) {
     respond_to_request(mdr, -EROFS);
     return;
   }
@@ -2747,13 +2828,13 @@ void Server::handle_client_open(MDRequestRef& mdr)
     respond_to_request(mdr, -ENXIO);                 // FIXME what error do we want?
     return;
     }*/
-  if ((req->head.args.open.flags & O_DIRECTORY) && !cur->inode.is_dir()) {
+  if ((flags & O_DIRECTORY) && !cur->inode.is_dir()) {
     dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
     respond_to_request(mdr, -EINVAL);
     return;
   }
 
-  if (cur->inode.inline_version != CEPH_INLINE_NONE &&
+  if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
       !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
     dout(7) << "old client cannot open inline data file " << *cur << dendl;
     respond_to_request(mdr, -EPERM);
@@ -2859,7 +2940,7 @@ class C_MDS_openc_finish : public MDSInternalContext {
   CInode *newi;
   snapid_t follows;
 public:
-  C_MDS_openc_finish(MDS *m, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
+  C_MDS_openc_finish(MDSRank *m, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
     MDSInternalContext(m), mdr(r), dn(d), newi(ni), follows(f) {}
   void finish(int r) {
     assert(r == 0);
@@ -2938,7 +3019,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
   if (dir_layout)
     layout = *dir_layout;
   else
-    layout = mds->mdcache->default_file_layout;
+    layout = mdcache->default_file_layout;
 
   // fill in any special params from client
   if (req->head.args.open.stripe_unit)
@@ -3010,8 +3091,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
   }
   in->inode.rstat.rfiles = 1;
 
-  if (follows >= dn->first)
-    dn->first = follows+1;
+  assert(dn->first == follows+1);
   in->first = dn->first;
   
   // prepare finisher
@@ -3112,16 +3192,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
   snapid_t snapid = mdr->snapid;
   dout(10) << "snapid " << snapid << dendl;
 
-  // purge stale snap data?
-  const set<snapid_t> *snaps = 0;
   SnapRealm *realm = diri->find_snaprealm();
-  if (realm->get_last_destroyed() > dir->fnode.snap_purged_thru) {
-    snaps = &realm->get_snaps();
-    dout(10) << " last_destroyed " << realm->get_last_destroyed() << " > " << dir->fnode.snap_purged_thru
-	     << ", doing snap purge with " << *snaps << dendl;
-    dir->fnode.snap_purged_thru = realm->get_last_destroyed();
-    assert(snapid == CEPH_NOSNAP || snaps->count(snapid));  // just checkin'! 
-  }
 
   unsigned max = req->head.args.readdir.max_entries;
   if (!max)
@@ -3158,9 +3229,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
 
     if (dnl->is_null())
       continue;
-    if (snaps && dn->last != CEPH_NOSNAP)
-      if (dir->try_trim_snap_dentry(dn, *snaps))
-	continue;
+
     if (dn->last < snapid || dn->first > snapid) {
       dout(20) << "skipping non-overlapping snap " << *dn << dendl;
       continue;
@@ -3243,9 +3312,6 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
   ::encode(complete, dirbl);
   dirbl.claim_append(dnbl);
   
-  if (snaps)
-    dir->log_mark_dirty();
-
   // yay, reply
   dout(10) << "reply to " << *req << " readdir num=" << numfiles
 	   << " bytes=" << dirbl.length()
@@ -3278,7 +3344,7 @@ class C_MDS_inode_update_finish : public MDSInternalContext {
   CInode *in;
   bool truncating_smaller, changed_ranges;
 public:
-  C_MDS_inode_update_finish(MDS *m, MDRequestRef& r, CInode *i,
+  C_MDS_inode_update_finish(MDSRank *m, MDRequestRef& r, CInode *i,
 			    bool sm=false, bool cr=false) :
     MDSInternalContext(m), mdr(r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
   void finish(int r) {
@@ -3343,14 +3409,14 @@ void Server::handle_client_file_setlock(MDRequestRef& mdr)
     interrupt = true;
     // fall-thru
   case CEPH_LOCK_FLOCK:
-    lock_state = &cur->flock_locks;
+    lock_state = cur->get_flock_lock_state();
     break;
 
   case CEPH_LOCK_FCNTL_INTR:
     interrupt = true;
     // fall-thru
   case CEPH_LOCK_FCNTL:
-    lock_state = &cur->fcntl_locks;
+    lock_state = cur->get_fcntl_lock_state();
     break;
 
   default:
@@ -3433,11 +3499,11 @@ void Server::handle_client_file_readlock(MDRequestRef& mdr)
   ceph_lock_state_t *lock_state = NULL;
   switch (req->head.args.filelock_change.rule) {
   case CEPH_LOCK_FLOCK:
-    lock_state = &cur->flock_locks;
+    lock_state = cur->get_flock_lock_state();
     break;
 
   case CEPH_LOCK_FCNTL:
-    lock_state = &cur->fcntl_locks;
+    lock_state = cur->get_fcntl_lock_state();
     break;
 
   default:
@@ -3620,6 +3686,10 @@ void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
 
   journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(mds, mdr, in, old_size > 0,
 								   changed_ranges));
+  // Although the `open` part can give an early reply, the truncation won't
+  // happen until our EUpdate is persistent, to give the client a prompt
+  // response we must also flush that event.
+  mdlog->flush();
 }
 
 
@@ -3733,7 +3803,7 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
   else if (dir_layout)
     layout = *dir_layout;
   else
-    layout = mds->mdcache->default_file_layout;
+    layout = mdcache->default_file_layout;
 
   if (req->head.args.setlayout.layout.fl_object_size > 0)
     layout.fl_object_size = req->head.args.setlayout.layout.fl_object_size;
@@ -3779,33 +3849,10 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur));
 }
 
-
-
-
 // XATTRS
 
-// parse a map of keys/values.
-namespace qi = boost::spirit::qi;
-
-template <typename Iterator>
-struct keys_and_values
-  : qi::grammar<Iterator, std::map<string, string>()>
-{
-    keys_and_values()
-      : keys_and_values::base_type(query)
-    {
-      query =  pair >> *(qi::lit(' ') >> pair);
-      pair  =  key >> '=' >> value;
-      key   =  qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
-      value = +qi::char_("a-zA-Z_0-9");
-    }
-    qi::rule<Iterator, std::map<string, string>()> query;
-    qi::rule<Iterator, std::pair<string, string>()> pair;
-    qi::rule<Iterator, string()> key, value;
-};
-
 int Server::parse_layout_vxattr(string name, string value, const OSDMap *osdmap,
-				ceph_file_layout *layout)
+				ceph_file_layout *layout, bool validate)
 {
   dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
   try {
@@ -3822,7 +3869,10 @@ int Server::parse_layout_vxattr(string name, string value, const OSDMap *osdmap,
       if (begin != end)
 	return -EINVAL;
       for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
-	int r = parse_layout_vxattr(string("layout.") + q->first, q->second, osdmap, layout);
+        // Skip validation on each attr, we do it once at the end (avoid
+        // rejecting intermediate states if the overall result is ok)
+	int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
+                                    osdmap, layout, false);
 	if (r < 0)
 	  return r;
       }
@@ -3852,7 +3902,7 @@ int Server::parse_layout_vxattr(string name, string value, const OSDMap *osdmap,
     return -EINVAL;
   }
 
-  if (!ceph_file_layout_is_valid(layout)) {
+  if (validate && !ceph_file_layout_is_valid(layout)) {
     dout(10) << "bad layout" << dendl;
     return -EINVAL;
   }
@@ -3940,21 +3990,26 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
       else if (dir_layout)
 	layout = *dir_layout;
       else
-	layout = mds->mdcache->default_file_layout;
+	layout = mdcache->default_file_layout;
 
       rest = name.substr(name.find("layout"));
       const OSDMap *osdmap = mds->objecter->get_osdmap_read();
       int r = parse_layout_vxattr(rest, value, osdmap, &layout);
+      epoch_t epoch = osdmap->get_epoch();
       mds->objecter->put_osdmap_read();
       if (r < 0) {
 	if (r == -ENOENT) {
-	  if (!mdr->waited_for_osdmap) {
-	    // make sure we have the latest map.
-	    // FIXME: we should get the client's osdmap epoch and just
-	    // make sure we have *that*.
+	  epoch_t req_epoch = req->get_osdmap_epoch();
+	  if (req_epoch > epoch) {
+	    if (!mds->objecter->wait_for_map(req_epoch,
+		  new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher)))
+	    return;
+	  } else  if (req_epoch == 0 && !mdr->waited_for_osdmap) {
+	    // For compatibility with client w/ old code, we still need get the latest map. 
+	    // One day if COMPACT_VERSION of MClientRequest >=3, we can remove those code.
 	    mdr->waited_for_osdmap = true;
 	    mds->objecter->wait_for_latest_osdmap(
-	      new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), &mds->finisher));
+		new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher));
 	    return;
 	  }
 	  r = -EINVAL;
@@ -3983,16 +4038,21 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
       rest = name.substr(name.find("layout"));
       const OSDMap *osdmap = mds->objecter->get_osdmap_read();
       int r = parse_layout_vxattr(rest, value, osdmap, &layout);
+      epoch_t epoch = osdmap->get_epoch();
       mds->objecter->put_osdmap_read();
       if (r < 0) {
 	if (r == -ENOENT) {
-	  if (!mdr->waited_for_osdmap) {
-	    // make sure we have the latest map.
-	    // FIXME: we should get the client's osdmap epoch and just
-	    // make sure we have *that*.
+	  epoch_t req_epoch = req->get_osdmap_epoch();
+	  if (req_epoch > epoch) {
+	    if (!mds->objecter->wait_for_map(req_epoch,
+		new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher)))
+	    return;
+	  } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
+	    // For compatibility with client w/ old code, we still need get the latest map. 
+	    // One day if COMPACT_VERSION of MClientRequest >=3, we can remove those code.
 	    mdr->waited_for_osdmap = true;
 	    mds->objecter->wait_for_latest_osdmap(
-	      new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), &mds->finisher));
+	      new C_OnFinisher(new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)), mds->finisher));
 	    return;
 	  }
 	  r = -EINVAL;
@@ -4105,7 +4165,7 @@ class C_MDS_inode_xattr_update_finish : public MDSInternalContext {
   CInode *in;
 public:
 
-  C_MDS_inode_xattr_update_finish(MDS *m, MDRequestRef& r, CInode *i) :
+  C_MDS_inode_xattr_update_finish(MDSRank *m, MDRequestRef& r, CInode *i) :
     MDSInternalContext(m), mdr(r), in(i) { }
   void finish(int r) {
     assert(r == 0);
@@ -4261,10 +4321,9 @@ class C_MDS_mknod_finish : public MDSInternalContext {
   MDRequestRef mdr;
   CDentry *dn;
   CInode *newi;
-  snapid_t follows;
 public:
-  C_MDS_mknod_finish(MDS *m, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
-    MDSInternalContext(m), mdr(r), dn(d), newi(ni), follows(f) {}
+  C_MDS_mknod_finish(MDSRank *m, MDRequestRef& r, CDentry *d, CInode *ni) :
+    MDSInternalContext(m), mdr(r), dn(d), newi(ni) {}
   void finish(int r) {
     assert(r == 0);
 
@@ -4331,7 +4390,7 @@ void Server::handle_client_mknod(MDRequestRef& mdr)
   if (dir_layout && S_ISREG(mode))
     layout = *dir_layout;
   else
-    layout = mds->mdcache->default_file_layout;
+    layout = mdcache->default_file_layout;
 
   SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
   snapid_t follows = realm->get_newest_seq();
@@ -4372,8 +4431,7 @@ void Server::handle_client_mknod(MDRequestRef& mdr)
     }
   }
 
-  if (follows >= dn->first)
-    dn->first = follows + 1;
+  assert(dn->first == follows + 1);
   newi->first = dn->first;
     
   dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
@@ -4389,7 +4447,7 @@ void Server::handle_client_mknod(MDRequestRef& mdr)
 				    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
   le->metablob.add_primary_dentry(dn, newi, true, true, true);
 
-  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows));
+  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi));
 }
 
 
@@ -4429,12 +4487,11 @@ void Server::handle_client_mkdir(MDRequestRef& mdr)
   newi->inode.update_backtrace();
 
   dout(12) << " follows " << follows << dendl;
-  if (follows >= dn->first)
-    dn->first = follows + 1;
+  assert(dn->first == follows + 1);
   newi->first = dn->first;
 
   // ...and that new dir is empty.
-  CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t());
+  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
   newdir->mark_complete();
   newdir->fnode.version = newdir->pre_dirty();
 
@@ -4467,7 +4524,7 @@ void Server::handle_client_mkdir(MDRequestRef& mdr)
   LogSegment *ls = mds->mdlog->get_current_segment();
   ls->open_files.push_back(&newi->item_open_file);
 
-  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows));
+  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi));
 }
 
 
@@ -4488,8 +4545,6 @@ void Server::handle_client_symlink(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
-  snapid_t follows = dn->get_dir()->inode->find_snaprealm()->get_newest_seq();
-
   unsigned mode = S_IFLNK | 0777;
   CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
   assert(newi);
@@ -4504,8 +4559,6 @@ void Server::handle_client_symlink(MDRequestRef& mdr)
   newi->inode.version = dn->pre_dirty();
   newi->inode.update_backtrace();
 
-  if (follows >= dn->first)
-    dn->first = follows + 1;
   newi->first = dn->first;
 
   // prepare finisher
@@ -4517,7 +4570,7 @@ void Server::handle_client_symlink(MDRequestRef& mdr)
   mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
   le->metablob.add_primary_dentry(dn, newi, true, true);
 
-  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows));
+  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi));
 }
 
 
@@ -4577,7 +4630,7 @@ class C_MDS_link_local_finish : public MDSInternalContext {
   version_t dnpv;
   version_t tipv;
 public:
-  C_MDS_link_local_finish(MDS *m, MDRequestRef& r, CDentry *d, CInode *ti,
+  C_MDS_link_local_finish(MDSRank *m, MDRequestRef& r, CDentry *d, CInode *ti,
 			  version_t dnpv_, version_t tipv_) :
     MDSInternalContext(m), mdr(r), dn(d), targeti(ti),
     dnpv(dnpv_), tipv(tipv_) { }
@@ -4604,10 +4657,6 @@ void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
   pi->ctime = mdr->get_op_stamp();
   pi->version = tipv;
 
-  snapid_t follows = dn->get_dir()->inode->find_snaprealm()->get_newest_seq();
-  if (follows >= dn->first)
-    dn->first = follows;
-
   // log + wait
   EUpdate *le = new EUpdate(mdlog, "link_local");
   mdlog->start_entry(le);
@@ -4638,7 +4687,7 @@ void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
   mdr->apply();
 
   MDRequestRef null_ref;
-  mds->mdcache->send_dentry_link(dn, null_ref);
+  mdcache->send_dentry_link(dn, null_ref);
 
   // bump target popularity
   mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
@@ -4658,7 +4707,7 @@ class C_MDS_link_remote_finish : public MDSInternalContext {
   CInode *targeti;
   version_t dpv;
 public:
-  C_MDS_link_remote_finish(MDS *m, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
+  C_MDS_link_remote_finish(MDSRank *m, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
     MDSInternalContext(m), mdr(r), inc(i), dn(d), targeti(ti),
     dpv(d->get_projected_version()) {}
   void finish(int r) {
@@ -4713,7 +4762,7 @@ void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targ
     dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
     le->reqid = mdr->reqid;
     le->had_slaves = true;
-    mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
   }
 
   if (inc) {
@@ -4758,9 +4807,9 @@ void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
 
   MDRequestRef null_ref;
   if (inc)
-    mds->mdcache->send_dentry_link(dn, null_ref);
+    mdcache->send_dentry_link(dn, null_ref);
   else
-    mds->mdcache->send_dentry_unlink(dn, NULL, null_ref);
+    mdcache->send_dentry_unlink(dn, NULL, null_ref);
   
   // bump target popularity
   mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
@@ -4939,7 +4988,7 @@ void Server::_committed_slave(MDRequestRef& mdr)
   MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, 
 					       MMDSSlaveRequest::OP_COMMITTED);
   mds->send_message_mds(req, mdr->slave_to_mds);
-  mds->mdcache->request_finish(mdr);
+  mdcache->request_finish(mdr);
 }
 
 struct C_MDS_LoggedLinkRollback : public ServerContext {
@@ -4964,13 +5013,13 @@ void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef&
 
   assert(g_conf->mds_kill_link_at != 9);
 
-  mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
+  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
   assert(mdr || mds->is_resolve());
 
   MutationRef mut(new MutationImpl(rollback.reqid));
   mut->ls = mds->mdlog->get_current_segment();
 
-  CInode *in = mds->mdcache->get_inode(rollback.ino);
+  CInode *in = mdcache->get_inode(rollback.ino);
   assert(in);
   dout(10) << " target is " << *in << dendl;
   assert(!in->is_projected());  // live slave request hold versionlock xlock.
@@ -5020,9 +5069,9 @@ void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
 
   mut->apply();
   if (mdr)
-    mds->mdcache->request_finish(mdr);
+    mdcache->request_finish(mdr);
 
-  mds->mdcache->finish_rollback(mut->reqid);
+  mdcache->finish_rollback(mut->reqid);
 
   mut->cleanup();
 }
@@ -5201,7 +5250,7 @@ class C_MDS_unlink_local_finish : public MDSInternalContext {
   CDentry *straydn;
   version_t dnpv;  // deleted dentry
 public:
-  C_MDS_unlink_local_finish(MDS *m, MDRequestRef& r, CDentry *d, CDentry *sd) :
+  C_MDS_unlink_local_finish(MDSRank *m, MDRequestRef& r, CDentry *d, CDentry *sd) :
     MDSInternalContext(m), mdr(r), dn(d), straydn(sd),
     dnpv(d->get_projected_version()) {}
   void finish(int r) {
@@ -5231,7 +5280,7 @@ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
     dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
     le->reqid = mdr->reqid;
     le->had_slaves = true;
-    mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
   }
 
   if (straydn) {
@@ -5282,7 +5331,7 @@ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
 
   if (in->is_dir()) {
     assert(straydn);
-    mds->mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+    mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
   }
 
   journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn));
@@ -5319,7 +5368,7 @@ void Server::_unlink_local_finish(MDRequestRef& mdr,
   if (snap_is_new) //only new if strayin exists
     mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
   
-  mds->mdcache->send_dentry_unlink(dn, straydn, mdr);
+  mdcache->send_dentry_unlink(dn, straydn, mdr);
   
   // update subtree map?
   if (straydn && strayin->is_dir())
@@ -5335,8 +5384,11 @@ void Server::_unlink_local_finish(MDRequestRef& mdr,
   dn->get_dir()->try_remove_unlinked_dn(dn);
 
   // clean up?
-  if (straydn)
-    mdcache->eval_stray(straydn);
+  if (straydn) {
+    // Tip off the MDCache that this dentry is a stray that
+    // might be elegible for purge.
+    mdcache->notify_stray(straydn);
+  }
 }
 
 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, CDentry *dn, CDentry *straydn)
@@ -5460,7 +5512,7 @@ void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
   dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
   le->commit.renamed_dirino = in->ino();
 
-  mds->mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+  mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
 
   mdr->more()->slave_update_journaled = true;
   submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
@@ -5565,17 +5617,17 @@ void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef&
   ::decode(rollback, p);
   
   dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
-  mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
+  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
   assert(mdr || mds->is_resolve());
 
-  CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir);
+  CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
   if (!dir)
-    dir = mds->mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
+    dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
   assert(dir);
   CDentry *dn = dir->lookup(rollback.src_dname);
   assert(dn);
   dout(10) << " dn " << *dn << dendl;
-  dir = mds->mdcache->get_dirfrag(rollback.dest_dir);
+  dir = mdcache->get_dirfrag(rollback.dest_dir);
   assert(dir);
   CDentry *straydn = dir->lookup(rollback.dest_dname);
   assert(straydn);
@@ -5590,8 +5642,8 @@ void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef&
 
     mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
 
-    mds->mdcache->request_finish(mdr);
-    mds->mdcache->finish_rollback(rollback.reqid);
+    mdcache->request_finish(mdr);
+    mdcache->finish_rollback(rollback.reqid);
     return;
   }
 
@@ -5634,9 +5686,9 @@ void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentr
   }
 
   if (mdr)
-    mds->mdcache->request_finish(mdr);
+    mdcache->request_finish(mdr);
 
-  mds->mdcache->finish_rollback(reqid);
+  mdcache->finish_rollback(reqid);
 }
 
 
@@ -5711,7 +5763,7 @@ class C_MDS_rename_finish : public MDSInternalContext {
   CDentry *destdn;
   CDentry *straydn;
 public:
-  C_MDS_rename_finish(MDS *m, MDRequestRef& r,
+  C_MDS_rename_finish(MDSRank *m, MDRequestRef& r,
 		      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
     MDSInternalContext(m), mdr(r),
     srcdn(sdn), destdn(ddn), straydn(stdn) { }
@@ -5875,7 +5927,7 @@ void Server::handle_client_rename(MDRequestRef& mdr)
 
   // dest a child of src?
   // e.g. mv /usr /usr/foo
-  CDentry *pdn = destdir->inode->parent;
+  CDentry *pdn = destdir->inode->get_projected_parent_dn();
   while (pdn) {
     if (pdn == srcdn) {
       dout(7) << "cannot rename item to be a child of itself" << dendl;
@@ -5943,7 +5995,7 @@ void Server::handle_client_rename(MDRequestRef& mdr)
     rdlocks.insert(&srctrace[i]->lock);
   xlocks.insert(&srcdn->lock);
   mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
-  if (srcdirauth != mds->whoami) {
+  if (srcdirauth != mds->get_nodeid()) {
     dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
     remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
     remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
@@ -6025,7 +6077,7 @@ void Server::handle_client_rename(MDRequestRef& mdr)
 	(srcrealm->get_newest_seq() + 1 > srcdn->first ||
 	 destrealm->get_newest_seq() + 1 > srcdn->first)) {
       dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
-      mds->mdcache->snaprealm_create(mdr, srci);
+      mdcache->snaprealm_create(mdr, srci);
       return;
     }
   }
@@ -6116,14 +6168,14 @@ void Server::handle_client_rename(MDRequestRef& mdr)
     le->reqid = mdr->reqid;
     le->had_slaves = true;
     
-    mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
     // no need to send frozen auth pin to recovring auth MDS of srci
     mdr->more()->is_remote_frozen_authpin = false;
   }
   
   _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
   if (le->client_map.length())
-    le->cmapv = mds->sessionmap.projected;
+    le->cmapv = mds->sessionmap.get_projected();
 
   // -- commit locally --
   C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn);
@@ -6142,7 +6194,7 @@ void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn,
   // apply
   _rename_apply(mdr, srcdn, destdn, straydn);
 
-  mds->mdcache->send_dentry_link(destdn, mdr);
+  mdcache->send_dentry_link(destdn, mdr);
 
   CDentry::linkage_t *destdnl = destdn->get_linkage();
   CInode *in = destdnl->get_inode();
@@ -6170,8 +6222,9 @@ void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn,
     mds->locker->eval(in, CEPH_CAP_LOCKS, true);
 
   // clean up?
-  if (straydn) 
-    mdcache->eval_stray(straydn);
+  if (straydn) {
+    mdcache->notify_stray(straydn);
+  }
 }
 
 
@@ -6245,7 +6298,7 @@ bool Server::_need_force_journal(CInode *diri, bool empty)
   bool force_journal = false;
   if (empty) {
     for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
-      if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->whoami) {
+      if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
 	dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
 	force_journal = true;
 	break;
@@ -6255,13 +6308,13 @@ bool Server::_need_force_journal(CInode *diri, bool empty)
   } else {
     // see if any children of our frags are auth subtrees.
     list<CDir*> subtrees;
-    mds->mdcache->list_subtrees(subtrees);
+    mdcache->list_subtrees(subtrees);
     dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
     for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
       CDir *dir = *p;
       for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
 	if (dir->contains(*q)) {
-	  if ((*q)->get_dir_auth().first == mds->whoami) {
+	  if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
 	    dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
 		     << **q << dendl;
 	    force_journal = true;
@@ -6647,7 +6700,7 @@ void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, C
       // finish cap imports
       finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
       if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
-	mds->mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
+	mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
 							 mdr->more()->srcdn_auth_mds, true,
 							 mdr->more()->cap_imports[destdnl->get_inode()],
 							 imported_caps);
@@ -6681,6 +6734,20 @@ void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, C
     }
   }
 
+  if (srcdn->get_dir()->inode->is_stray() &&
+      srcdn->get_dir()->inode->get_stray_owner() == mds->get_nodeid()) {
+    // A reintegration event or a migration away from me
+    dout(20) << __func__ << ": src dentry was a stray, updating stats" << dendl;
+    mdcache->notify_stray_removed();
+  }
+
+  if (destdn->get_dir()->inode->is_stray() &&
+      destdn->get_dir()->inode->get_stray_owner() == mds->get_nodeid()) {
+    // A stray migration (to me)
+    dout(20) << __func__ << ": dst dentry was a stray, updating stats" << dendl;
+    mdcache->notify_stray_created();
+  }
+
   // src
   if (srcdn->is_auth())
     srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
@@ -7111,7 +7178,7 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
 	mdr->more()->is_ambiguous_auth = false;
       }
       mds->queue_waiters(finished);
-      mds->mdcache->request_finish(mdr);
+      mdcache->request_finish(mdr);
     }
   }
 }
@@ -7172,15 +7239,15 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef
 
   dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
   // need to finish this update before sending resolve to claim the subtree
-  mds->mdcache->add_rollback(rollback.reqid, master);
+  mdcache->add_rollback(rollback.reqid, master);
 
   MutationRef mut(new MutationImpl(rollback.reqid));
   mut->ls = mds->mdlog->get_current_segment();
 
   CDentry *srcdn = NULL;
-  CDir *srcdir = mds->mdcache->get_dirfrag(rollback.orig_src.dirfrag);
+  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
   if (!srcdir)
-    srcdir = mds->mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
+    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
   if (srcdir) {
     dout(10) << "  srcdir " << *srcdir << dendl;
     srcdn = srcdir->lookup(rollback.orig_src.dname);
@@ -7193,9 +7260,9 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef
     dout(10) << "  srcdir not found" << dendl;
 
   CDentry *destdn = NULL;
-  CDir *destdir = mds->mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
+  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
   if (!destdir)
-    destdir = mds->mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
+    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
   if (destdir) {
     dout(10) << " destdir " << *destdir << dendl;
     destdn = destdir->lookup(rollback.orig_dest.dname);
@@ -7208,16 +7275,16 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef
 
   CInode *in = NULL;
   if (rollback.orig_src.ino) {
-    in = mds->mdcache->get_inode(rollback.orig_src.ino);
+    in = mdcache->get_inode(rollback.orig_src.ino);
     if (in && in->is_dir())
       assert(srcdn && destdn);
   } else
-    in = mds->mdcache->get_inode(rollback.orig_src.remote_ino);
+    in = mdcache->get_inode(rollback.orig_src.remote_ino);
 
   CDir *straydir = NULL;
   CDentry *straydn = NULL;
   if (rollback.stray.dirfrag.ino) {
-    straydir = mds->mdcache->get_dirfrag(rollback.stray.dirfrag);
+    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
     if (straydir) {
       dout(10) << "straydir " << *straydir << dendl;
       straydn = straydir->lookup(rollback.stray.dname);
@@ -7232,11 +7299,11 @@ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef
 
   CInode *target = NULL;
   if (rollback.orig_dest.ino) {
-    target = mds->mdcache->get_inode(rollback.orig_dest.ino);
+    target = mdcache->get_inode(rollback.orig_dest.ino);
     if (target)
       assert(destdn && straydn);
   } else if (rollback.orig_dest.remote_ino)
-    target = mds->mdcache->get_inode(rollback.orig_dest.remote_ino);
+    target = mdcache->get_inode(rollback.orig_dest.remote_ino);
 
   // can't use is_auth() in the resolve stage
   mds_rank_t whoami = mds->get_nodeid();
@@ -7459,10 +7526,10 @@ void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentr
     }
     mds->queue_waiters(finished);
     if (finish_mdr)
-      mds->mdcache->request_finish(mdr);
+      mdcache->request_finish(mdr);
   }
 
-  mds->mdcache->finish_rollback(mut->reqid);
+  mdcache->finish_rollback(mut->reqid);
 
   mut->cleanup();
 }
@@ -7574,29 +7641,58 @@ void Server::handle_client_lssnap(MDRequestRef& mdr)
   map<snapid_t,SnapInfo*> infomap;
   realm->get_snap_info(infomap, diri->get_oldest_snap());
 
+  unsigned max_entries = req->head.args.readdir.max_entries;
+  if (!max_entries)
+    max_entries = infomap.size();
+  int max_bytes = req->head.args.readdir.max_bytes;
+  if (!max_bytes)
+    max_bytes = 512 << 10;
+
+  __u64 last_snapid = 0;
+  string offset_str = req->get_path2();
+  if (!offset_str.empty())
+    last_snapid = realm->resolve_snapname(offset_str, diri->ino());
+
+  bufferlist dirbl;
+  encode_empty_dirstat(dirbl);
+
+  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
+
   __u32 num = 0;
   bufferlist dnbl;
-  for (map<snapid_t,SnapInfo*>::iterator p = infomap.begin();
-       p != infomap.end();
-       ++p) {
+  map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
+  for (; p != infomap.end() && num < max_entries; ++p) {
     dout(10) << p->first << " -> " << *p->second << dendl;
 
     // actual
+    string snap_name;
     if (p->second->ino == diri->ino())
-      ::encode(p->second->name, dnbl);
+      snap_name = p->second->name;
     else
-      ::encode(p->second->get_long_name(), dnbl);
+      snap_name = p->second->get_long_name();
+
+    unsigned start_len = dnbl.length();
+    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
+      break;
+
+    ::encode(snap_name, dnbl);
     encode_infinite_lease(dnbl);
-    diri->encode_inodestat(dnbl, mdr->session, realm, p->first);
-    num++;
+
+    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
+    if (r < 0) {
+      bufferlist keep;
+      keep.substr_of(dnbl, 0, start_len);
+      dnbl.swap(keep);
+      break;
+    }
+    ++num;
   }
 
-  bufferlist dirbl;
-  encode_empty_dirstat(dirbl);
   ::encode(num, dirbl);
-  __u8 t = 1;
-  ::encode(t, dirbl);  // end
-  ::encode(t, dirbl);  // complete
+  __u8 end = (p == infomap.end());
+  ::encode(end, dirbl);  // end
+  __u8 complete = end && last_snapid == 0;
+  ::encode(complete, dirbl);  // complete
   dirbl.claim_append(dnbl);
   
   mdr->reply_extra_bl = dirbl;
@@ -7611,7 +7707,7 @@ struct C_MDS_mksnap_finish : public MDSInternalContext {
   MDRequestRef mdr;
   CInode *diri;
   SnapInfo info;
-  C_MDS_mksnap_finish(MDS *m, MDRequestRef& r, CInode *di, SnapInfo &i) :
+  C_MDS_mksnap_finish(MDSRank *m, MDRequestRef& r, CInode *di, SnapInfo &i) :
     MDSInternalContext(m), mdr(r), diri(di), info(i) {}
   void finish(int r) {
     mds->server->_mksnap_finish(mdr, diri, info);
@@ -7688,7 +7784,7 @@ void Server::handle_client_mksnap(MDRequestRef& mdr)
     mds->snapclient->prepare_create(diri->ino(), snapname,
 				    mdr->get_mds_stamp(),
 				    &mdr->more()->stid, &mdr->more()->snapidbl,
-				    new C_MDS_RetryRequest(mds->mdcache, mdr));
+				    new C_MDS_RetryRequest(mdcache, mdr));
     return;
   }
 
@@ -7761,7 +7857,7 @@ struct C_MDS_rmsnap_finish : public MDSInternalContext {
   MDRequestRef mdr;
   CInode *diri;
   snapid_t snapid;
-  C_MDS_rmsnap_finish(MDS *m, MDRequestRef& r, CInode *di, snapid_t sn) :
+  C_MDS_rmsnap_finish(MDSRank *m, MDRequestRef& r, CInode *di, snapid_t sn) :
     MDSInternalContext(m), mdr(r), diri(di), snapid(sn) {}
   void finish(int r) {
     mds->server->_rmsnap_finish(mdr, diri, snapid);
@@ -7821,7 +7917,7 @@ void Server::handle_client_rmsnap(MDRequestRef& mdr)
   if (!mdr->more()->stid) {
     mds->snapclient->prepare_destroy(diri->ino(), snapid,
 				     &mdr->more()->stid, &mdr->more()->snapidbl,
-				     new C_MDS_RetryRequest(mds->mdcache, mdr));
+				     new C_MDS_RetryRequest(mdcache, mdr));
     return;
   }
   version_t stid = mdr->more()->stid;
@@ -7875,8 +7971,147 @@ void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
   // yay
   mdr->in[0] = diri;
   respond_to_request(mdr, 0);
+
+  // purge snapshot data
+  if (diri->snaprealm->have_past_parents_open())
+    diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
 }
 
+struct C_MDS_renamesnap_finish : public MDSInternalContext {
+  MDRequestRef mdr;
+  CInode *diri;
+  snapid_t snapid;
+  C_MDS_renamesnap_finish(MDSRank *m, MDRequestRef& r, CInode *di, snapid_t sn) :
+    MDSInternalContext(m), mdr(r), diri(di), snapid(sn) {}
+  void finish(int r) {
+    mds->server->_renamesnap_finish(mdr, diri, snapid);
+  }
+};
+
+/* This function takes responsibility for the passed mdr*/
+void Server::handle_client_renamesnap(MDRequestRef& mdr)
+{
+  MClientRequest *req = mdr->client_request;
+  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+
+  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
+  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
+    respond_to_request(mdr, -ESTALE);
+    return;
+  }
+
+  if (!diri->is_auth()) {    // fw to auth?
+    mdcache->request_forward(mdr, diri->authority().first);
+    return;
+  }
+
+  if (!diri->is_dir()) { // dir only
+    respond_to_request(mdr, -ENOTDIR);
+    return;
+  }
+
+  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
+      mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
+    respond_to_request(mdr, -EPERM);
+    return;
+  }
+
+  const string &dstname = req->get_filepath().last_dentry();
+  const string &srcname = req->get_filepath2().last_dentry();
+  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
+
+  if (srcname.length() == 0 || srcname[0] == '_') {
+    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
+    return;
+  }
+  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
+    respond_to_request(mdr, -ENOENT);
+    return;
+  }
+  if (dstname.length() == 0 || dstname[0] == '_') {
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+  if (diri->snaprealm->exists(dstname)) {
+    respond_to_request(mdr, -EEXIST);
+    return;
+  }
+
+  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
+  dout(10) << " snapname " << srcname << " is " << snapid << dendl;
+
+  // lock snap
+  set<SimpleLock*> rdlocks, wrlocks, xlocks;
+
+  mds->locker->include_snap_rdlocks(rdlocks, diri);
+  rdlocks.erase(&diri->snaplock);
+  xlocks.insert(&diri->snaplock);
+
+  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+    return;
+
+    // prepare
+  if (!mdr->more()->stid) {
+    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
+				    &mdr->more()->stid, &mdr->more()->snapidbl,
+				    new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+
+  version_t stid = mdr->more()->stid;
+  bufferlist::iterator p = mdr->more()->snapidbl.begin();
+  snapid_t seq;
+  ::decode(seq, p);
+  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
+
+  // journal
+  inode_t *pi = diri->project_inode();
+  pi->ctime = mdr->get_op_stamp();
+  pi->version = diri->pre_dirty();
+
+  // project the snaprealm
+  sr_t *newsnap = diri->project_snaprealm();
+  assert(newsnap->snaps.count(snapid));
+  newsnap->snaps[snapid].name = dstname;
+
+  // journal the inode changes
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "renamesnap");
+  mdlog->start_entry(le);
+
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  le->metablob.add_table_transaction(TABLE_SNAP, stid);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+  // journal the snaprealm changes
+  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(mds, mdr, diri, snapid),
+                     mdr, __func__);
+  mdlog->flush();
+}
+
+void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
+{
+  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
+
+  diri->pop_and_dirty_projected_inode(mdr->ls);
+  mdr->apply();
+
+  mds->snapclient->commit(mdr->more()->stid, mdr->ls);
+
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);
+
+  // yay
+  mdr->in[0] = diri;
+  mdr->tracei = diri;
+  mdr->snapid = snapid;
+  respond_to_request(mdr, 0);
+}
 
 /**
  * Return true if server is in state RECONNECT and this
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 17dcd94..c99f7ae 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -15,7 +15,7 @@
 #ifndef CEPH_MDS_SERVER_H
 #define CEPH_MDS_SERVER_H
 
-#include "MDS.h"
+#include "MDSRank.h"
 
 class OSDMap;
 class PerfCounters;
@@ -24,6 +24,9 @@ class EMetaBlob;
 class EUpdate;
 class MMDSSlaveRequest;
 struct SnapInfo;
+class MClientRequest;
+class MClientReply;
+class MDLog;
 
 struct MutationImpl;
 struct MDRequestImpl;
@@ -41,35 +44,30 @@ enum {
 };
 
 class Server {
-public:
-  // XXX FIXME: can probably friend enough contexts to make this not need to be public
-  MDS *mds;
 private:
+  MDSRank *mds;
   MDCache *mdcache;
   MDLog *mdlog;
-  Messenger *messenger;
   PerfCounters *logger;
 
   // OSDMap full status, used to generate ENOSPC on some operations
   bool is_full;
 
-public:
+  // State for while in reconnect
+  MDSInternalContext *reconnect_done;
   int failed_reconnects;
 
+  friend class MDSContinuation;
+  friend class ServerContext;
+
+public:
   bool terminating_sessions;
 
-  Server(MDS *m) : 
-    mds(m), 
-    mdcache(mds->mdcache), mdlog(mds->mdlog),
-    messenger(mds->messenger),
-    logger(0),
-    is_full(false),
-    failed_reconnects(0),
-    terminating_sessions(false) {
-  }
+  Server(MDSRank *m);
   ~Server() {
     g_ceph_context->get_perfcounters_collection()->remove(logger);
     delete logger;
+    delete reconnect_done;
   }
 
   void create_logger();
@@ -99,7 +97,7 @@ public:
   void find_idle_sessions();
   void kill_session(Session *session, Context *on_safe);
   void journal_close_session(Session *session, int state, Context *on_safe);
-  void reconnect_clients();
+  void reconnect_clients(MDSInternalContext *reconnect_done_);
   void handle_client_reconnect(class MClientReconnect *m);
   //void process_reconnect_cap(CInode *in, int from, ceph_mds_cap_reconnect& capinfo);
   void reconnect_gather_finish();
@@ -172,7 +170,7 @@ public:
   void handle_client_setdirlayout(MDRequestRef& mdr);
 
   int parse_layout_vxattr(string name, string value, const OSDMap *osdmap,
-			  ceph_file_layout *layout);
+			  ceph_file_layout *layout, bool validate=true);
   int parse_quota_vxattr(string name, string value, quota_info_t *quota);
   void handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
 			 ceph_file_layout *dir_layout,
@@ -243,6 +241,9 @@ public:
   void _mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info);
   void handle_client_rmsnap(MDRequestRef& mdr);
   void _rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
+  void handle_client_renamesnap(MDRequestRef& mdr);
+  void _renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
+
 
   // helpers
   bool _rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
index 4fa43d9..dc0ba71 100644
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -12,11 +12,12 @@
  * 
  */
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDCache.h"
 #include "Mutation.h"
 #include "SessionMap.h"
 #include "osdc/Filer.h"
+#include "common/Finisher.h"
 
 #include "common/config.h"
 #include "common/errno.h"
@@ -32,7 +33,7 @@ class SessionMapIOContext : public MDSIOContextBase
 {
   protected:
     SessionMap *sessionmap;
-    MDS *get_mds() {return sessionmap->mds;}
+    MDSRank *get_mds() {return sessionmap->mds;}
   public:
     SessionMapIOContext(SessionMap *sessionmap_) : sessionmap(sessionmap_) {
       assert(sessionmap != NULL);
@@ -63,19 +64,173 @@ void SessionMap::dump()
 object_t SessionMap::get_object_name()
 {
   char s[30];
-  snprintf(s, sizeof(s), "mds%d_sessionmap", int(mds->whoami));
+  snprintf(s, sizeof(s), "mds%d_sessionmap", int(mds->get_nodeid()));
   return object_t(s);
 }
 
 class C_IO_SM_Load : public SessionMapIOContext {
 public:
-  bufferlist bl;
-  C_IO_SM_Load(SessionMap *cm) : SessionMapIOContext(cm) {}
+  const bool first;  //< Am I the initial (header) load?
+  int header_r;  //< Return value from OMAP header read
+  int values_r;  //< Return value from OMAP value read
+  bufferlist header_bl;
+  std::map<std::string, bufferlist> session_vals;
+
+  C_IO_SM_Load(SessionMap *cm, const bool f)
+    : SessionMapIOContext(cm), first(f), header_r(0), values_r(0) {}
+
   void finish(int r) {
-    sessionmap->_load_finish(r, bl);
+    sessionmap->_load_finish(r, header_r, values_r, first, header_bl, session_vals);
   }
 };
 
+
+/**
+ * Decode OMAP header.  Call this once when loading.
+ */
+void SessionMapStore::decode_header(
+      bufferlist &header_bl)
+{
+  bufferlist::iterator q = header_bl.begin();
+  DECODE_START(1, q)
+  ::decode(version, q);
+  DECODE_FINISH(q);
+}
+
+void SessionMapStore::encode_header(
+    bufferlist *header_bl)
+{
+  ENCODE_START(1, 1, *header_bl);
+  ::encode(version, *header_bl);
+  ENCODE_FINISH(*header_bl);
+}
+
+/**
+ * Decode and insert some serialized OMAP values.  Call this
+ * repeatedly to insert batched loads.
+ */
+void SessionMapStore::decode_values(std::map<std::string, bufferlist> &session_vals)
+{
+  for (std::map<std::string, bufferlist>::iterator i = session_vals.begin();
+       i != session_vals.end(); ++i) {
+
+    entity_inst_t inst;
+
+    bool parsed = inst.name.parse(i->first);
+    if (!parsed) {
+      derr << "Corrupt entity name '" << i->first << "' in sessionmap" << dendl;
+      throw buffer::malformed_input("Corrupt entity name in sessionmap");
+    }
+
+    Session *s = get_or_add_session(inst);
+    if (s->is_closed())
+      s->set_state(Session::STATE_OPEN);
+    bufferlist::iterator q = i->second.begin();
+    s->decode(q);
+  }
+}
+
+/**
+ * An OMAP read finished.
+ */
+void SessionMap::_load_finish(
+    int operation_r,
+    int header_r,
+    int values_r,
+    bool first,
+    bufferlist &header_bl,
+    std::map<std::string, bufferlist> &session_vals)
+{
+  if (operation_r < 0) {
+    derr << "_load_finish got " << cpp_strerror(operation_r) << dendl;
+    mds->clog->error() << "error reading sessionmap '" << get_object_name()
+                       << "' " << operation_r << " ("
+                       << cpp_strerror(operation_r) << ")";
+    mds->damaged();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
+  }
+
+  // Decode header
+  if (first) {
+    if (header_r != 0) {
+      derr << __func__ << ": header error: " << cpp_strerror(header_r) << dendl;
+      mds->clog->error() << "error reading sessionmap header "
+                         << header_r << " (" << cpp_strerror(header_r) << ")";
+      mds->damaged();
+      assert(0);  // Should be unreachable because damaged() calls respawn()
+    }
+
+    if(header_bl.length() == 0) {
+      dout(4) << __func__ << ": header missing, loading legacy..." << dendl;
+      load_legacy();
+      return;
+    }
+
+    try {
+      decode_header(header_bl);
+    } catch (buffer::error &e) {
+      mds->clog->error() << "corrupt sessionmap header: " << e.what();
+      mds->damaged();
+      assert(0);  // Should be unreachable because damaged() calls respawn()
+    }
+    dout(10) << __func__ << " loaded version " << version << dendl;
+  }
+
+  if (values_r != 0) {
+    derr << __func__ << ": error reading values: "
+      << cpp_strerror(values_r) << dendl;
+    mds->clog->error() << "error reading sessionmap values: " 
+                       << values_r << " (" << cpp_strerror(values_r) << ")";
+    mds->damaged();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
+  }
+
+  // Decode session_vals
+  try {
+    decode_values(session_vals);
+  } catch (buffer::error &e) {
+    mds->clog->error() << "corrupt sessionmap values: " << e.what();
+    mds->damaged();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
+  }
+
+  if (session_vals.size() == g_conf->mds_sessionmap_keys_per_op) {
+    // Issue another read if we're not at the end of the omap
+    const std::string last_key = session_vals.rbegin()->first;
+    dout(10) << __func__ << ": continue omap load from '"
+             << last_key << "'" << dendl;
+    object_t oid = get_object_name();
+    object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+    C_IO_SM_Load *c = new C_IO_SM_Load(this, false);
+    ObjectOperation op;
+    op.omap_get_vals(last_key, "", g_conf->mds_sessionmap_keys_per_op,
+        &c->session_vals, &c->values_r);
+    mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0,
+        new C_OnFinisher(c, mds->finisher));
+  } else {
+    // I/O is complete.  Update `by_state`
+    dout(10) << __func__ << ": omap load complete" << dendl;
+    for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin();
+         i != session_map.end(); ++i) {
+      Session *s = i->second;
+      if (by_state.count(s->get_state()) == 0)
+        by_state[s->get_state()] = new xlist<Session*>;
+      by_state[s->get_state()]->push_back(&s->item_session_list);
+    }
+
+    // Population is complete.  Trigger load waiters.
+    dout(10) << __func__ << ": v " << version 
+	   << ", " << session_map.size() << " sessions" << dendl;
+    projected = committing = committed = version;
+    dump();
+    finish_contexts(g_ceph_context, waiting_for_load);
+  }
+}
+
+/**
+ * Populate session state from OMAP records in this
+ * rank's sessionmap object.
+ */
 void SessionMap::load(MDSInternalContextBase *onload)
 {
   dout(10) << "load" << dendl;
@@ -83,14 +238,47 @@ void SessionMap::load(MDSInternalContextBase *onload)
   if (onload)
     waiting_for_load.push_back(onload);
   
-  C_IO_SM_Load *c = new C_IO_SM_Load(this);
+  C_IO_SM_Load *c = new C_IO_SM_Load(this, true);
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+
+  ObjectOperation op;
+  op.omap_get_header(&c->header_bl, &c->header_r);
+  op.omap_get_vals("", "", g_conf->mds_sessionmap_keys_per_op,
+      &c->session_vals, &c->values_r);
+
+  mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0, new C_OnFinisher(c, mds->finisher));
+}
+
+class C_IO_SM_LoadLegacy : public SessionMapIOContext {
+public:
+  bufferlist bl;
+  C_IO_SM_LoadLegacy(SessionMap *cm) : SessionMapIOContext(cm) {}
+  void finish(int r) {
+    sessionmap->_load_legacy_finish(r, bl);
+  }
+};
+
+
+/**
+ * Load legacy (object data blob) SessionMap format, assuming
+ * that waiting_for_load has already been populated with
+ * the relevant completion.  This is the fallback if we do not
+ * find an OMAP header when attempting to load normally.
+ */
+void SessionMap::load_legacy()
+{
+  dout(10) << __func__ << dendl;
+
+  C_IO_SM_LoadLegacy *c = new C_IO_SM_LoadLegacy(this);
   object_t oid = get_object_name();
   object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+
   mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &c->bl, 0,
-			   new C_OnFinisher(c, &mds->finisher));
+			   new C_OnFinisher(c, mds->finisher));
 }
 
-void SessionMap::_load_finish(int r, bufferlist &bl)
+void SessionMap::_load_legacy_finish(int r, bufferlist &bl)
 { 
   bufferlist::iterator blp = bl.begin();
   if (r < 0) {
@@ -98,13 +286,24 @@ void SessionMap::_load_finish(int r, bufferlist &bl)
     assert(0 == "failed to load sessionmap");
   }
   dump();
-  decode(blp);  // note: this sets last_cap_renew = now()
+  decode_legacy(blp);  // note: this sets last_cap_renew = now()
   dout(10) << "_load_finish v " << version 
 	   << ", " << session_map.size() << " sessions, "
 	   << bl.length() << " bytes"
 	   << dendl;
   projected = committing = committed = version;
   dump();
+
+  // Mark all sessions dirty, so that on next save() we will write
+  // a complete OMAP version of the data loaded from the legacy format
+  for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin();
+       i != session_map.end(); ++i) {
+    // Don't use mark_dirty because on this occasion we want to ignore the
+    // keys_per_op limit and do one big write (upgrade must be atomic)
+    dirty_sessions.insert(i->first);
+  }
+  loaded_legacy = true;
+
   finish_contexts(g_ceph_context, waiting_for_load);
 }
 
@@ -124,7 +323,7 @@ public:
 
 void SessionMap::save(MDSInternalContextBase *onsave, version_t needv)
 {
-  dout(10) << "save needv " << needv << ", v " << version << dendl;
+  dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
  
   if (needv && committing >= needv) {
     assert(committing > committed);
@@ -133,21 +332,78 @@ void SessionMap::save(MDSInternalContextBase *onsave, version_t needv)
   }
 
   commit_waiters[version].push_back(onsave);
-  
-  bufferlist bl;
-  
-  encode(bl);
+
   committing = version;
   SnapContext snapc;
   object_t oid = get_object_name();
   object_locator_t oloc(mds->mdsmap->get_metadata_pool());
 
-  mds->objecter->write_full(oid, oloc,
-			    snapc,
-			    bl, ceph_clock_now(g_ceph_context), 0,
-			    NULL,
-			    new C_OnFinisher(new C_IO_SM_Save(this, version),
-					     &mds->finisher));
+  ObjectOperation op;
+
+  /* Compose OSD OMAP transaction for full write */
+  bufferlist header_bl;
+  encode_header(&header_bl);
+  op.omap_set_header(header_bl);
+
+  /* If we loaded a legacy sessionmap, then erase the old data.  If
+   * an old-versioned MDS tries to read it, it'll fail out safely
+   * with an end_of_buffer exception */
+  if (loaded_legacy) {
+    dout(4) << __func__ << " erasing legacy sessionmap" << dendl;
+    op.truncate(0);
+    loaded_legacy = false;  // only need to truncate once.
+  }
+
+  dout(20) << " updating keys:" << dendl;
+  map<string, bufferlist> to_set;
+  for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
+      i != dirty_sessions.end(); ++i) {
+    const entity_name_t name = *i;
+    Session *session = session_map[name];
+
+    if (session->is_open() ||
+	session->is_closing() ||
+	session->is_stale() ||
+	session->is_killing()) {
+      dout(20) << "  " << name << dendl;
+      // Serialize K
+      std::ostringstream k;
+      k << name;
+
+      // Serialize V
+      bufferlist bl;
+      session->info.encode(bl);
+
+      // Add to RADOS op
+      to_set[k.str()] = bl;
+
+      session->clear_dirty_completed_requests();
+    } else {
+      dout(20) << "  " << name << " (ignoring)" << dendl;
+    }
+  }
+  if (!to_set.empty()) {
+    op.omap_set(to_set);
+  }
+
+  dout(20) << " removing keys:" << dendl;
+  set<string> to_remove;
+  for(std::set<entity_name_t>::const_iterator i = null_sessions.begin();
+      i != null_sessions.end(); ++i) {
+    dout(20) << "  " << *i << dendl;
+    std::ostringstream k;
+    k << *i;
+    to_remove.insert(k.str());
+  }
+  if (!to_remove.empty()) {
+    op.omap_rm_keys(to_remove);
+  }
+
+  dirty_sessions.clear();
+  null_sessions.clear();
+
+  mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
+      0, NULL, new C_OnFinisher(new C_IO_SM_Save(this, version), mds->finisher));
 }
 
 void SessionMap::_save_finish(version_t v)
@@ -160,37 +416,13 @@ void SessionMap::_save_finish(version_t v)
 }
 
 
-// -------------------
-
-void SessionMapStore::encode(bufferlist& bl) const
-{
-  uint64_t pre = -1;     // for 0.19 compatibility; we forgot an encoding prefix.
-  ::encode(pre, bl);
-
-  ENCODE_START(3, 3, bl);
-  ::encode(version, bl);
-
-  for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin(); 
-       p != session_map.end(); 
-       ++p) {
-    if (p->second->is_open() ||
-	p->second->is_closing() ||
-	p->second->is_stale() ||
-	p->second->is_killing()) {
-      ::encode(p->first, bl);
-      p->second->info.encode(bl);
-    }
-  }
-  ENCODE_FINISH(bl);
-}
-
 /**
  * Deserialize sessions, and update by_state index
  */
-void SessionMap::decode(bufferlist::iterator &p)
+void SessionMap::decode_legacy(bufferlist::iterator &p)
 {
   // Populate `sessions`
-  SessionMapStore::decode(p);
+  SessionMapStore::decode_legacy(p);
 
   // Update `by_state`
   for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin();
@@ -212,7 +444,7 @@ uint64_t SessionMap::set_state(Session *session, int s) {
   return session->get_state_seq();
 }
 
-void SessionMapStore::decode(bufferlist::iterator& p)
+void SessionMapStore::decode_legacy(bufferlist::iterator& p)
 {
   utime_t now = ceph_clock_now(g_ceph_context);
   uint64_t pre;
@@ -353,6 +585,10 @@ void SessionMap::remove_session(Session *s)
   s->trim_completed_requests(0);
   s->item_session_list.remove_myself();
   session_map.erase(s->info.inst.name);
+  if (dirty_sessions.count(s->info.inst.name)) {
+    dirty_sessions.erase(s->info.inst.name);
+  }
+  null_sessions.insert(s->info.inst.name);
   s->put();
 }
 
@@ -449,3 +685,148 @@ void Session::decode(bufferlist::iterator &p)
   _update_human_name();
 }
 
+void SessionMap::_mark_dirty(Session *s)
+{
+  if (dirty_sessions.size() >= g_conf->mds_sessionmap_keys_per_op) {
+    // Pre-empt the usual save() call from journal segment trim, in
+    // order to avoid building up an oversized OMAP update operation
+    // from too many sessions modified at once
+    save(new C_MDSInternalNoop, version);
+  }
+
+  dirty_sessions.insert(s->info.inst.name);
+}
+
+void SessionMap::mark_dirty(Session *s)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+    << " v=" << version << dendl;
+
+  _mark_dirty(s);
+  version++;
+  s->pop_pv(version);
+}
+
+void SessionMap::replay_dirty_session(Session *s)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+    << " v=" << version << dendl;
+
+  _mark_dirty(s);
+
+  replay_advance_version();
+}
+
+void SessionMap::replay_advance_version()
+{
+  version++;
+  projected = version;
+}
+
+version_t SessionMap::mark_projected(Session *s)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+    << " pv=" << projected << " -> " << projected + 1 << dendl;
+  ++projected;
+  s->push_pv(projected);
+  return projected;
+}
+
+
+class C_IO_SM_Save_One : public SessionMapIOContext {
+  MDSInternalContextBase *on_safe;
+public:
+  C_IO_SM_Save_One(SessionMap *cm, MDSInternalContextBase *on_safe_)
+    : SessionMapIOContext(cm), on_safe(on_safe_) {}
+  void finish(int r) {
+    if (r != 0) {
+      get_mds()->handle_write_error(r);
+    } else {
+      on_safe->complete(r);
+    }
+  }
+};
+
+
+void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
+                               MDSGatherBuilder *gather_bld)
+{
+  assert(gather_bld != NULL);
+
+  std::vector<entity_name_t> write_sessions;
+
+  // Decide which sessions require a write
+  for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
+       i != tgt_sessions.end(); ++i) {
+    const entity_name_t &session_id = *i;
+
+    if (session_map.count(session_id) == 0) {
+      // Session isn't around any more, never mind.
+      continue;
+    }
+
+    Session *session = session_map[session_id];
+    if (!session->has_dirty_completed_requests()) {
+      // Session hasn't had completed_requests
+      // modified since last write, no need to
+      // write it now.
+      continue;
+    }
+
+    if (dirty_sessions.count(session_id) > 0) {
+      // Session is already dirtied, will be written, no
+      // need to pre-empt that.
+      continue;
+    }
+    // Okay, passed all our checks, now we write
+    // this session out.  The version we write
+    // into the OMAP may now be higher-versioned
+    // than the version in the header, but that's
+    // okay because it's never a problem to have
+    // an overly-fresh copy of a session.
+    write_sessions.push_back(*i);
+  }
+
+  dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
+
+  // Batch writes into mds_sessionmap_keys_per_op
+  const uint32_t kpo = g_conf->mds_sessionmap_keys_per_op;
+  map<string, bufferlist> to_set;
+  for (uint32_t i = 0; i < write_sessions.size(); ++i) {
+    // Start a new write transaction?
+    if (i % g_conf->mds_sessionmap_keys_per_op == 0) {
+      to_set.clear();
+    }
+
+    const entity_name_t &session_id = write_sessions[i];
+    Session *session = session_map[session_id];
+    session->clear_dirty_completed_requests();
+
+    // Serialize K
+    std::ostringstream k;
+    k << session_id;
+
+    // Serialize V
+    bufferlist bl;
+    session->info.encode(bl);
+
+    // Add to RADOS op
+    to_set[k.str()] = bl;
+
+    // Complete this write transaction?
+    if (i == write_sessions.size() - 1
+        || i % kpo == kpo - 1) {
+      ObjectOperation op;
+      op.omap_set(to_set);
+
+      SnapContext snapc;
+      object_t oid = get_object_name();
+      object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+      MDSInternalContextBase *on_safe = gather_bld->new_sub();
+      mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
+          0, NULL, new C_OnFinisher(new C_IO_SM_Save_One(this, on_safe), mds->finisher));
+    }
+  }
+}
+
+
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
index c990ada..cfcfa04 100644
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -86,8 +86,27 @@ private:
   void _update_human_name();
   std::string human_name;
 
+  // Versions in this this session was projected: used to verify
+  // that appropriate mark_dirty calls follow.
+  std::deque<version_t> projected;
+
 public:
 
+  void push_pv(version_t pv)
+  {
+    if (!projected.empty()) {
+      assert(projected.back() != pv);
+    }
+    projected.push_back(pv);
+  }
+
+  void pop_pv(version_t v)
+  {
+    assert(!projected.empty());
+    assert(projected.front() == v);
+    projected.pop_front();
+  }
+
   inline int get_state() const {return state;}
   void set_state(int new_state)
   {
@@ -154,12 +173,12 @@ public:
   int get_state() { return state; }
   const char *get_state_name() { return get_state_name(state); }
   uint64_t get_state_seq() { return state_seq; }
-  bool is_closed() { return state == STATE_CLOSED; }
-  bool is_opening() { return state == STATE_OPENING; }
-  bool is_open() { return state == STATE_OPEN; }
-  bool is_closing() { return state == STATE_CLOSING; }
-  bool is_stale() { return state == STATE_STALE; }
-  bool is_killing() { return state == STATE_KILLING; }
+  bool is_closed() const { return state == STATE_CLOSED; }
+  bool is_opening() const { return state == STATE_OPENING; }
+  bool is_open() const { return state == STATE_OPEN; }
+  bool is_closing() const { return state == STATE_CLOSING; }
+  bool is_stale() const { return state == STATE_STALE; }
+  bool is_killing() const { return state == STATE_KILLING; }
 
   void inc_importing() {
     ++importing_count;
@@ -174,6 +193,7 @@ public:
 private:
   version_t cap_push_seq;        // cap push seq #
   map<version_t, list<MDSInternalContextBase*> > waitfor_flush; // flush session messages
+
 public:
   xlist<Capability*> caps;     // inodes with caps; front=most recently used
   xlist<ClientLease*> leases;  // metadata leases to clients
@@ -208,17 +228,30 @@ public:
 
   // -- completed requests --
 private:
+  // Has completed_requests been modified since the last time we
+  // wrote this session out?
+  bool completed_requests_dirty;
 
-
+  unsigned num_trim_flushes_warnings;
+  unsigned num_trim_requests_warnings;
 public:
   void add_completed_request(ceph_tid_t t, inodeno_t created) {
     info.completed_requests[t] = created;
+    completed_requests_dirty = true;
   }
-  void trim_completed_requests(ceph_tid_t mintid) {
+  bool trim_completed_requests(ceph_tid_t mintid) {
     // trim
+    bool erased_any = false;
     while (!info.completed_requests.empty() && 
-	   (mintid == 0 || info.completed_requests.begin()->first < mintid))
+	   (mintid == 0 || info.completed_requests.begin()->first < mintid)) {
       info.completed_requests.erase(info.completed_requests.begin());
+      erased_any = true;
+    }
+
+    if (erased_any) {
+      completed_requests_dirty = true;
+    }
+    return erased_any;
   }
   bool have_completed_request(ceph_tid_t tid, inodeno_t *pcreated) const {
     map<ceph_tid_t,inodeno_t>::const_iterator p = info.completed_requests.find(tid);
@@ -229,6 +262,45 @@ public:
     return true;
   }
 
+  void add_completed_flush(ceph_tid_t tid) {
+    info.completed_flushes.insert(tid);
+  }
+  bool trim_completed_flushes(ceph_tid_t mintid) {
+    bool erased_any = false;
+    while (!info.completed_flushes.empty() &&
+	(mintid == 0 || *info.completed_flushes.begin() < mintid)) {
+      info.completed_flushes.erase(info.completed_flushes.begin());
+      erased_any = true;
+    }
+    if (erased_any) {
+      completed_requests_dirty = true;
+    }
+    return erased_any;
+  }
+  bool have_completed_flush(ceph_tid_t tid) const {
+    return info.completed_flushes.count(tid);
+  }
+
+  unsigned get_num_completed_flushes() const { return info.completed_flushes.size(); }
+  unsigned get_num_trim_flushes_warnings() { return num_trim_flushes_warnings; }
+  void inc_num_trim_flushes_warnings() { ++num_trim_flushes_warnings; }
+  void reset_num_trim_flushes_warnings() { num_trim_flushes_warnings = 0; }
+
+  unsigned get_num_completed_requests() const { return info.completed_requests.size(); }
+  unsigned get_num_trim_requests_warnings() { return num_trim_requests_warnings; }
+  void inc_num_trim_requests_warnings() { ++num_trim_requests_warnings; }
+  void reset_num_trim_requests_warnings() { num_trim_requests_warnings = 0; }
+
+  bool has_dirty_completed_requests() const
+  {
+    return completed_requests_dirty;
+  }
+
+  void clear_dirty_completed_requests()
+  {
+    completed_requests_dirty = false;
+  }
+
 
   Session() : 
     state(STATE_CLOSED), state_seq(0), importing_count(0),
@@ -236,7 +308,10 @@ public:
     connection(NULL), item_session_list(this),
     requests(0),  // member_offset passed to front() manually
     cap_push_seq(0),
-    lease_seq(0) { }
+    lease_seq(0),
+    completed_requests_dirty(false),
+    num_trim_flushes_warnings(0),
+    num_trim_requests_warnings(0) { }
   ~Session() {
     assert(!item_session_list.is_on_list());
     while (!preopen_out_queue.empty()) {
@@ -253,27 +328,31 @@ public:
     last_cap_renew = utime_t();
 
   }
-
 };
 
 /*
  * session map
  */
 
-class MDS;
+class MDSRank;
 
 /**
  * Encapsulate the serialized state associated with SessionMap.  Allows
  * encode/decode outside of live MDS instance.
  */
 class SessionMapStore {
+protected:
+  version_t version;
 public:
   ceph::unordered_map<entity_name_t, Session*> session_map;
-  version_t version;
   mds_rank_t rank;
 
-  virtual void encode(bufferlist& bl) const;
-  virtual void decode(bufferlist::iterator& blp);
+  version_t get_version() const {return version;}
+
+  virtual void encode_header(bufferlist *header_bl);
+  virtual void decode_header(bufferlist &header_bl);
+  virtual void decode_values(std::map<std::string, bufferlist> &session_vals);
+  virtual void decode_legacy(bufferlist::iterator& blp);
   void dump(Formatter *f) const;
 
   void set_rank(mds_rank_t r)
@@ -307,20 +386,47 @@ public:
 
 class SessionMap : public SessionMapStore {
 public:
-  MDS *mds;
+  MDSRank *mds;
 
-public:  // i am lazy
+protected:
   version_t projected, committing, committed;
+public:
   map<int,xlist<Session*>* > by_state;
   uint64_t set_state(Session *session, int state);
   map<version_t, list<MDSInternalContextBase*> > commit_waiters;
 
-  SessionMap(MDS *m) : mds(m),
-		       projected(0), committing(0), committed(0) 
+  SessionMap(MDSRank *m) : mds(m),
+		       projected(0), committing(0), committed(0),
+                       loaded_legacy(false)
   { }
 
+  void set_version(const version_t v)
+  {
+    version = projected = v;
+  }
+
+  void set_projected(const version_t v)
+  {
+    projected = v;
+  }
+
+  version_t get_projected() const
+  {
+    return projected;
+  }
+
+  version_t get_committed() const
+  {
+    return committed;
+  }
+
+  version_t get_committing() const
+  {
+    return committed;
+  }
+
   // sessions
-  void decode(bufferlist::iterator& blp);
+  void decode_legacy(bufferlist::iterator& blp);
   bool empty() { return session_map.empty(); }
   const ceph::unordered_map<entity_name_t, Session*> &get_sessions() const
   {
@@ -428,10 +534,71 @@ public:  // i am lazy
   object_t get_object_name();
 
   void load(MDSInternalContextBase *onload);
-  void _load_finish(int r, bufferlist &bl);
+  void _load_finish(
+      int operation_r,
+      int header_r,
+      int values_r,
+      bool first,
+      bufferlist &header_bl,
+      std::map<std::string, bufferlist> &session_vals);
+
+  void load_legacy();
+  void _load_legacy_finish(int r, bufferlist &bl);
+
   void save(MDSInternalContextBase *onsave, version_t needv=0);
   void _save_finish(version_t v);
- 
+
+protected:
+  std::set<entity_name_t> dirty_sessions;
+  std::set<entity_name_t> null_sessions;
+  bool loaded_legacy;
+  void _mark_dirty(Session *session);
+public:
+
+  /**
+   * Advance the version, and mark this session
+   * as dirty within the new version.
+   *
+   * Dirty means journalled but needing writeback
+   * to the backing store.  Must have called
+   * mark_projected previously for this session.
+   */
+  void mark_dirty(Session *session);
+
+  /**
+   * Advance the projected version, and mark this
+   * session as projected within the new version
+   *
+   * Projected means the session is updated in memory
+   * but we're waiting for the journal write of the update
+   * to finish.  Must subsequently call mark_dirty
+   * for sessions in the same global order as calls
+   * to mark_projected.
+   */
+  version_t mark_projected(Session *session);
+
+  /**
+   * During replay, advance versions to account
+   * for a session modification, and mark the
+   * session dirty.
+   */
+  void replay_dirty_session(Session *session);
+
+  /**
+   * During replay, if a session no longer present
+   * would have consumed a version, advance `version`
+   * and `projected` to account for that.
+   */
+  void replay_advance_version();
+
+  /**
+   * For these session IDs, if a session exists with this ID, and it has
+   * dirty completed_requests, then persist it immediately
+   * (ahead of usual project/dirty versioned writes
+   *  of the map).
+   */
+  void save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
+                     MDSGatherBuilder *gather_bld);
 };
 
 
diff --git a/src/mds/SimpleLock.cc b/src/mds/SimpleLock.cc
new file mode 100644
index 0000000..4e5bbec
--- /dev/null
+++ b/src/mds/SimpleLock.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "SimpleLock.h"
+#include "Mutation.h"
+
+void SimpleLock::dump(Formatter *f) const {
+  assert(f != NULL);
+  if (is_sync_and_unlocked()) {
+    return;
+  }
+
+  f->open_array_section("gather_set");
+  if (have_more()) {
+    for(std::set<int32_t>::iterator i = more()->gather_set.begin();
+        i != more()->gather_set.end(); ++i) {
+      f->dump_int("rank", *i);
+    }
+  }
+  f->close_section();
+
+  f->dump_int("num_client_lease", num_client_lease);
+  f->dump_int("num_rdlocks", get_num_rdlocks());
+  f->dump_int("num_wrlocks", get_num_wrlocks());
+  f->dump_int("num_xlocks", get_num_xlocks());
+  f->open_object_section("xlock_by");
+  if (get_xlock_by()) {
+    get_xlock_by()->dump(f);
+  }
+  f->close_section();
+}
diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
index 1c78a8b..085aec7 100644
--- a/src/mds/SimpleLock.h
+++ b/src/mds/SimpleLock.h
@@ -16,6 +16,7 @@
 #ifndef CEPH_SIMPLELOCK_H
 #define CEPH_SIMPLELOCK_H
 
+#include "mdstypes.h"
 #include "MDSContext.h"
 
 // -- lock types --
@@ -367,8 +368,8 @@ public:
   }
 
   void init_gather() {
-    for (map<mds_rank_t,unsigned>::const_iterator p = parent->replicas_begin();
-	 p != parent->replicas_end(); 
+    for (compact_map<mds_rank_t,unsigned>::iterator p = parent->replicas_begin();
+	 p != parent->replicas_end();
 	 ++p)
       more()->gather_set.insert(p->first);
   }
@@ -669,6 +670,12 @@ public:
     */
   }
 
+  /**
+   * Write bare values (caller must be in an object section)
+   * to formatter, or nothing if is_sync_and_unlocked.
+   */
+  void dump(Formatter *f) const;
+
   virtual void print(ostream& out) const {
     out << "(";
     _print(out);
diff --git a/src/mds/SnapClient.h b/src/mds/SnapClient.h
index 3e8d6a2..883699f 100644
--- a/src/mds/SnapClient.h
+++ b/src/mds/SnapClient.h
@@ -19,12 +19,12 @@
 #include "snap.h"
 
 class MDSInternalContextBase;
-class MDS;
+class MDSRank;
 class LogSegment;
 
 class SnapClient : public MDSTableClient {
 public:
-  SnapClient(MDS *m) : MDSTableClient(m, TABLE_SNAP) {}
+  SnapClient(MDSRank *m) : MDSTableClient(m, TABLE_SNAP) {}
 
   void resend_queries() {}
   void handle_query_result(MMDSTableRequest *m) {}
@@ -56,6 +56,18 @@ public:
     ::encode(snapid, bl);
     _prepare(bl, pstid, pbl, onfinish);
   }
+
+  void prepare_update(inodeno_t ino, snapid_t snapid, const string& name, utime_t stamp,
+		      version_t *pstid, bufferlist *pbl, MDSInternalContextBase *onfinish) {
+    bufferlist bl;
+    __u32 op = TABLE_OP_UPDATE;
+    ::encode(op, bl);
+    ::encode(ino, bl);
+    ::encode(snapid, bl);
+    ::encode(name, bl);
+    ::encode(stamp, bl);
+    _prepare(bl, pstid, pbl, onfinish);
+  }
 };
 
 #endif
diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc
index 6278924..12dd67b 100644
--- a/src/mds/SnapRealm.cc
+++ b/src/mds/SnapRealm.cc
@@ -14,7 +14,7 @@
 
 #include "SnapRealm.h"
 #include "MDCache.h"
-#include "MDS.h"
+#include "MDSRank.h"
 
 #include "messages/MClientSnap.h"
 
@@ -61,9 +61,20 @@ ostream& operator<<(ostream& out, const SnapRealm& realm)
 void SnapRealm::add_open_past_parent(SnapRealm *parent)
 {
   open_past_parents[parent->inode->ino()] = parent;
+  parent->open_past_children.insert(this);
   parent->inode->get(CInode::PIN_PASTSNAPPARENT);
 }
 
+void SnapRealm::remove_open_past_parent(inodeno_t ino)
+{
+  map<inodeno_t,SnapRealm*>::iterator p = open_past_parents.find(ino);
+  assert(p != open_past_parents.end());
+  SnapRealm *parent = p->second;
+  open_past_parents.erase(p);
+  parent->open_past_children.erase(this);
+  parent->inode->put(CInode::PIN_PASTSNAPPARENT);
+}
+
 struct C_SR_RetryOpenParents : public MDSInternalContextBase {
   SnapRealm *sr;
   snapid_t first, last, parent_last;
@@ -71,13 +82,16 @@ struct C_SR_RetryOpenParents : public MDSInternalContextBase {
   MDSInternalContextBase* fin;
   C_SR_RetryOpenParents(SnapRealm *s, snapid_t f, snapid_t l, snapid_t pl,
 			inodeno_t p, MDSInternalContextBase *c) :
-    sr(s), first(f), last(l), parent_last(pl),  parent(p), fin(c) {}
-  MDS *get_mds() { return sr->mdcache->mds; }
+    sr(s), first(f), last(l), parent_last(pl),  parent(p), fin(c) {
+    sr->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
+  }
+  MDSRank *get_mds() { return sr->mdcache->mds; }
   void finish(int r) {
     if (r < 0)
       sr->_remove_missing_parent(parent_last, parent, r);
     if (sr->_open_parents(fin, first, last))
       fin->complete(0);
+    sr->inode->put(CInode::PIN_OPENINGSNAPPARENTS);
   }
 };
 
@@ -112,8 +126,7 @@ bool SnapRealm::_open_parents(MDSInternalContextBase *finish, snapid_t first, sn
   assert(srnode.past_parents.size() >= open_past_parents.size());
   if (srnode.past_parents.size() > open_past_parents.size()) {
     for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin();
-	 p != srnode.past_parents.end();
-	 ++p) {    
+	 p != srnode.past_parents.end(); ) {
       dout(10) << " past_parent [" << p->second.first << "," << p->first << "] is "
 	       << p->second.ino << dendl;
       CInode *parent = mdcache->get_inode(p->second.ino);
@@ -123,12 +136,18 @@ bool SnapRealm::_open_parents(MDSInternalContextBase *finish, snapid_t first, sn
 	mdcache->open_ino(p->second.ino, mdcache->mds->mdsmap->get_metadata_pool(), fin);
 	return false;
       }
+      if (parent->state_test(CInode::STATE_PURGING)) {
+	dout(10) << " skip purging past_parent " << *parent << dendl;
+	srnode.past_parents.erase(p++);
+	continue;
+      }
       assert(parent->snaprealm);  // hmm!
       if (!parent->snaprealm->_open_parents(finish, p->second.first, p->first))
 	return false;
       if (!open_past_parents.count(p->second.ino)) {
 	add_open_past_parent(parent->snaprealm);
       }
+      ++p;
     }
   }
 
@@ -166,8 +185,10 @@ void SnapRealm::close_parents()
 {
   for (map<inodeno_t,SnapRealm*>::iterator p = open_past_parents.begin();
        p != open_past_parents.end();
-       ++p)
+       ++p) {
     p->second->inode->put(CInode::PIN_PASTSNAPPARENT);
+    p->second->open_past_children.erase(this);
+  }
   open_past_parents.clear();
 }
 
@@ -214,6 +235,7 @@ void SnapRealm::build_snap_set(set<snapid_t> &s,
 
 void SnapRealm::check_cache()
 {
+  assert(open);
   if (cached_seq >= srnode.seq)
     return;
 
@@ -506,6 +528,7 @@ void SnapRealm::prune_past_parents()
 	*q > p->first) {
       dout(10) << "prune_past_parents pruning [" << p->second.first << "," << p->first 
 	       << "] " << p->second.ino << dendl;
+      remove_open_past_parent(p->second.ino);
       srnode.past_parents.erase(p++);
     } else {
       dout(10) << "prune_past_parents keeping [" << p->second.first << "," << p->first 
diff --git a/src/mds/SnapRealm.h b/src/mds/SnapRealm.h
index 2888624..92f3de9 100644
--- a/src/mds/SnapRealm.h
+++ b/src/mds/SnapRealm.h
@@ -35,6 +35,7 @@ struct SnapRealm {
   SnapRealm *parent;
   set<SnapRealm*> open_children;    // active children that are currently open
   map<inodeno_t,SnapRealm*> open_past_parents;  // these are explicitly pinned.
+  set<SnapRealm*> open_past_children;  // past children who have pinned me
 
   // cache
   snapid_t cached_seq;           // max seq over self and all past+present parents.
@@ -65,6 +66,8 @@ struct SnapRealm {
     return false;
   }
 
+  bool is_open() { return open; }
+  void _close_parents() { open = false; }
   bool _open_parents(MDSInternalContextBase *retryorfinish, snapid_t first=1, snapid_t last=CEPH_NOSNAP);
   void _remove_missing_parent(snapid_t snapid, inodeno_t parent, int err);
   bool open_parents(MDSInternalContextBase *retryorfinish) {
@@ -75,6 +78,7 @@ struct SnapRealm {
   }
   bool have_past_parents_open(snapid_t first=1, snapid_t last=CEPH_NOSNAP);
   void add_open_past_parent(SnapRealm *parent);
+  void remove_open_past_parent(inodeno_t ino);
   void close_parents();
 
   void prune_past_parents();
@@ -119,13 +123,20 @@ struct SnapRealm {
 
   snapid_t get_snap_following(snapid_t follows) {
     check_cache();
-    set<snapid_t> s = get_snaps();
-    set<snapid_t>::iterator p = s.upper_bound(follows);
+    const set<snapid_t>& s = get_snaps();
+    set<snapid_t>::const_iterator p = s.upper_bound(follows);
     if (p != s.end())
       return *p;
     return CEPH_NOSNAP;
   }
 
+  bool has_snaps_in_range(snapid_t first, snapid_t last) {
+    check_cache();
+    const set<snapid_t>& s = get_snaps();
+    set<snapid_t>::const_iterator p = s.lower_bound(first);
+    return (p != s.end() && *p <= last);
+  }
+
   void adjust_parent();
 
   void split_at(SnapRealm *child);
diff --git a/src/mds/SnapServer.cc b/src/mds/SnapServer.cc
index 04ab36a..7006909 100644
--- a/src/mds/SnapServer.cc
+++ b/src/mds/SnapServer.cc
@@ -13,7 +13,7 @@
  */
 
 #include "SnapServer.h"
-#include "MDS.h"
+#include "MDSRank.h"
 #include "osd/OSDMap.h"
 #include "osdc/Objecter.h"
 #include "mon/MonClient.h"
@@ -39,19 +39,26 @@ void SnapServer::reset_state()
   need_to_purge.clear();
 
   // find any removed snapshot in data pools
-  snapid_t first_free = 0;
-  const OSDMap *osdmap = mds->objecter->get_osdmap_read();
-  for (set<int64_t>::const_iterator p = mds->mdsmap->get_data_pools().begin();
-       p != mds->mdsmap->get_data_pools().end();
-       ++p) {
-    const pg_pool_t *pi = osdmap->get_pg_pool(*p);
-    if (!pi->removed_snaps.empty() &&
-        pi->removed_snaps.range_end() > first_free)
-      first_free = pi->removed_snaps.range_end();
+  if (mds) {  // only if I'm running in a live MDS
+    snapid_t first_free = 0;
+    const OSDMap *osdmap = mds->objecter->get_osdmap_read();
+    for (set<int64_t>::const_iterator p = mds->mdsmap->get_data_pools().begin();
+         p != mds->mdsmap->get_data_pools().end();
+         ++p) {
+      const pg_pool_t *pi = osdmap->get_pg_pool(*p);
+      if (!pi) {
+        // If pool isn't in OSDMap yet then can't have any snaps needing
+        // removal, skip.
+        continue;
+      }
+      if (!pi->removed_snaps.empty() &&
+          pi->removed_snaps.range_end() > first_free)
+        first_free = pi->removed_snaps.range_end();
+    }
+    mds->objecter->put_osdmap_read();
+    if (first_free > last_snap)
+      last_snap = first_free;
   }
-  mds->objecter->put_osdmap_read();
-  if (first_free > last_snap)
-    last_snap = first_free;
 }
 
 
@@ -74,7 +81,8 @@ void SnapServer::_prepare(bufferlist &bl, uint64_t reqid, mds_rank_t bymds)
 	::decode(info.name, p);
 	::decode(info.stamp, p);
 	info.snapid = ++last_snap;
-	pending_create[version] = info;
+	info.long_name = "create";
+	pending_update[version] = info;
 	dout(10) << "prepare v" << version << " create " << info << dendl;
       } else {
 	pending_noop.insert(version);
@@ -104,6 +112,26 @@ void SnapServer::_prepare(bufferlist &bl, uint64_t reqid, mds_rank_t bymds)
     }
     break;
 
+  case TABLE_OP_UPDATE:
+    {
+      SnapInfo info;
+      ::decode(info.ino, p);
+      ::decode(info.snapid, p);
+      ::decode(info.name, p);
+      ::decode(info.stamp, p);
+      info.long_name = "update";
+
+      version++;
+      // bump last_snap... we use it as a version value on the snaprealm.
+      ++last_snap;
+      pending_update[version] = info;
+      dout(10) << "prepare v" << version << " update " << info << dendl;
+
+      bl.clear();
+      ::encode(last_snap, bl);
+    }
+    break;
+
   default:
     assert(0);
   }
@@ -113,17 +141,25 @@ void SnapServer::_prepare(bufferlist &bl, uint64_t reqid, mds_rank_t bymds)
 bool SnapServer::_is_prepared(version_t tid)
 {
   return 
-    pending_create.count(tid) ||
+    pending_update.count(tid) ||
     pending_destroy.count(tid);
 }
 
 bool SnapServer::_commit(version_t tid, MMDSTableRequest *req)
 {
-  if (pending_create.count(tid)) {
-    dout(7) << "commit " << tid << " create " << pending_create[tid] << dendl;
-    snaps[pending_create[tid].snapid] = pending_create[tid];
-    pending_create.erase(tid);
-  } 
+  if (pending_update.count(tid)) {
+    SnapInfo &info = pending_update[tid];
+    string opname;
+    if (info.long_name.empty())
+      opname = "create";
+    else
+      opname.swap(info.long_name);
+    if (info.stamp == utime_t() && snaps.count(info.snapid))
+      info.stamp = snaps[info.snapid].stamp;
+    dout(7) << "commit " << tid << " " << opname << " " << info << dendl;
+    snaps[info.snapid] = info;
+    pending_update.erase(tid);
+  }
 
   else if (pending_destroy.count(tid)) {
     snapid_t sn = pending_destroy[tid].first;
@@ -155,9 +191,15 @@ bool SnapServer::_commit(version_t tid, MMDSTableRequest *req)
 
 void SnapServer::_rollback(version_t tid) 
 {
-  if (pending_create.count(tid)) {
-    dout(7) << "rollback " << tid << " create " << pending_create[tid] << dendl;
-    pending_create.erase(tid);
+  if (pending_update.count(tid)) {
+    SnapInfo &info = pending_update[tid];
+    string opname;
+    if (info.long_name.empty())
+      opname = "create";
+    else
+      opname.swap(info.long_name);
+    dout(7) << "rollback " << tid << " " << opname << " " << info << dendl;
+    pending_update.erase(tid);
   } 
 
   else if (pending_destroy.count(tid)) {
@@ -223,6 +265,12 @@ void SnapServer::check_osd_map(bool force)
        ++p) {
     int id = p->first;
     const pg_pool_t *pi = osdmap->get_pg_pool(id);
+    if (pi == NULL) {
+      // The pool is gone.  So are the snapshots.
+      all_purged[id] = std::vector<snapid_t>(p->second.begin(), p->second.end());
+      continue;
+    }
+
     for (set<snapid_t>::iterator q = p->second.begin();
 	 q != p->second.end();
 	 ++q) {
@@ -246,7 +294,7 @@ void SnapServer::check_osd_map(bool force)
   if (!all_purge.empty()) {
     dout(10) << "requesting removal of " << all_purge << dendl;
     MRemoveSnaps *m = new MRemoveSnaps(all_purge);
-    mds->monc->send_mon_message(m);
+    mon_client->send_mon_message(m);
   }
 
   last_checked_osdmap = version;
@@ -285,8 +333,8 @@ void SnapServer::dump(Formatter *f) const
   }
   f->close_section();
 
-  f->open_array_section("pending_create");
-  for(map<version_t, SnapInfo>::const_iterator i = pending_create.begin(); i != pending_create.end(); ++i) {
+  f->open_array_section("pending_update");
+  for(map<version_t, SnapInfo>::const_iterator i = pending_update.begin(); i != pending_update.end(); ++i) {
     f->open_object_section("snap");
     f->dump_unsigned("version", i->first);
     f->open_object_section("snapinfo");
@@ -324,7 +372,7 @@ void SnapServer::generate_test_instances(list<SnapServer*>& ls)
   populated->last_snap = 123;
   populated->snaps[456] = populated_snapinfo;
   populated->need_to_purge[2].insert(012);
-  populated->pending_create[234] = populated_snapinfo;
+  populated->pending_update[234] = populated_snapinfo;
   populated->pending_destroy[345].first = 567;
   populated->pending_destroy[345].second = 768;
   populated->pending_noop.insert(890);
diff --git a/src/mds/SnapServer.h b/src/mds/SnapServer.h
index 13669fc..65177be 100644
--- a/src/mds/SnapServer.h
+++ b/src/mds/SnapServer.h
@@ -18,25 +18,26 @@
 #include "MDSTableServer.h"
 #include "snap.h"
 
-class MDS;
+class MDSRank;
+class MonClient;
 
 class SnapServer : public MDSTableServer {
-public:
-  
 protected:
+  MonClient *mon_client;
   snapid_t last_snap;
   map<snapid_t, SnapInfo> snaps;
   map<int, set<snapid_t> > need_to_purge;
   
-  map<version_t, SnapInfo> pending_create;
+  map<version_t, SnapInfo> pending_update;
   map<version_t, pair<snapid_t,snapid_t> > pending_destroy; // (removed_snap, seq)
   set<version_t>           pending_noop;
 
   version_t last_checked_osdmap;
 
 public:
-  SnapServer(MDS *m) : MDSTableServer(m, TABLE_SNAP),
-		       last_checked_osdmap(0) { }
+  SnapServer(MDSRank *m, MonClient *monc)
+    : MDSTableServer(m, TABLE_SNAP), mon_client(monc), last_checked_osdmap(0)
+  {}
     
   void reset_state();
   void encode_server_state(bufferlist& bl) const {
@@ -44,7 +45,7 @@ public:
     ::encode(last_snap, bl);
     ::encode(snaps, bl);
     ::encode(need_to_purge, bl);
-    ::encode(pending_create, bl);
+    ::encode(pending_update, bl);
     ::encode(pending_destroy, bl);
     ::encode(pending_noop, bl);
     ENCODE_FINISH(bl);
@@ -54,7 +55,7 @@ public:
     ::decode(last_snap, bl);
     ::decode(snaps, bl);
     ::decode(need_to_purge, bl);
-    ::decode(pending_create, bl);
+    ::decode(pending_update, bl);
     if (struct_v >= 2)
       ::decode(pending_destroy, bl);
     else {
diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
new file mode 100644
index 0000000..a052d4e
--- /dev/null
+++ b/src/mds/StrayManager.cc
@@ -0,0 +1,901 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "common/perf_counters.h"
+
+#include "osdc/Objecter.h"
+#include "osdc/Filer.h"
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDLog.h"
+#include "mds/CDir.h"
+#include "mds/CDentry.h"
+#include "events/EUpdate.h"
+#include "messages/MClientRequest.h"
+
+#include "StrayManager.h"
+
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  return *_dout << "mds." << mds->get_nodeid() << ".cache.strays ";
+}
+
+class StrayManagerIOContext : public virtual MDSIOContextBase {
+protected:
+  StrayManager *sm;
+  virtual MDSRank *get_mds()
+  {
+    return sm->mds;
+  }
+public:
+  StrayManagerIOContext(StrayManager *sm_) : sm(sm_) {}
+};
+
+
+class StrayManagerContext : public virtual MDSInternalContextBase {
+protected:
+  StrayManager *sm;
+  virtual MDSRank *get_mds()
+  {
+    return sm->mds;
+  }
+public:
+  StrayManagerContext(StrayManager *sm_) : sm(sm_) {}
+};
+
+
+/**
+ * Context wrapper for _purge_stray_purged completion
+ */
+class C_IO_PurgeStrayPurged : public StrayManagerIOContext {
+  CDentry *dn;
+  bool only_head;
+  // How many ops_in_flight were allocated to this purge?
+  uint32_t ops_allowance;
+public:
+  C_IO_PurgeStrayPurged(StrayManager *sm_, CDentry *d, bool oh, uint32_t ops) : 
+    StrayManagerIOContext(sm_), dn(d), only_head(oh), ops_allowance(ops) { }
+  void finish(int r) {
+    assert(r == 0 || r == -ENOENT);
+    sm->_purge_stray_purged(dn, ops_allowance, only_head);
+  }
+};
+
+void StrayManager::purge(CDentry *dn, uint32_t op_allowance)
+{
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  CInode *in = dnl->get_inode();
+  dout(10) << __func__ << " " << *dn << " " << *in << dendl;
+  assert(!dn->is_replicated());
+
+  num_strays_purging++;
+  logger->set(l_mdc_num_strays_purging, num_strays_purging);
+
+
+  // CHEAT.  there's no real need to journal our intent to purge, since
+  // that is implicit in the dentry's presence and non-use in the stray
+  // dir.  on recovery, we'll need to re-eval all strays anyway.
+  
+  SnapContext nullsnapc;
+  C_GatherBuilder gather(
+    g_ceph_context,
+    new C_OnFinisher(new C_IO_PurgeStrayPurged(
+        this, dn, false, op_allowance), mds->finisher));
+
+  if (in->is_dir()) {
+    object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+    std::list<frag_t> ls;
+    if (!in->dirfragtree.is_leaf(frag_t()))
+      in->dirfragtree.get_leaves(ls);
+    ls.push_back(frag_t());
+    for (std::list<frag_t>::iterator p = ls.begin();
+         p != ls.end();
+         ++p) {
+      object_t oid = CInode::get_object_name(in->inode.ino, *p, "");
+      dout(10) << __func__ << " remove dirfrag " << oid << dendl;
+      mds->objecter->remove(oid, oloc, nullsnapc, ceph_clock_now(g_ceph_context),
+                            0, NULL, gather.new_sub());
+    }
+    assert(gather.has_subs());
+    gather.activate();
+    return;
+  }
+
+  const SnapContext *snapc;
+  SnapRealm *realm = in->find_snaprealm();
+  if (realm) {
+    dout(10) << " realm " << *realm << dendl;
+    snapc = &realm->get_snap_context();
+  } else {
+    dout(10) << " NO realm, using null context" << dendl;
+    snapc = &nullsnapc;
+    assert(in->last == CEPH_NOSNAP);
+  }
+
+  if (in->is_file()) {
+    uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
+		      (uint64_t)in->inode.layout.fl_stripe_count;
+    uint64_t to = in->inode.get_max_size();
+    to = MAX(in->inode.size, to);
+    // when truncating a file, the filer does not delete stripe objects that are
+    // truncated to zero. so we need to purge stripe objects up to the max size
+    // the file has ever been.
+    to = MAX(in->inode.max_size_ever, to);
+    if (to && period) {
+      uint64_t num = (to + period - 1) / period;
+      dout(10) << __func__ << " 0~" << to << " objects 0~" << num
+	       << " snapc " << snapc << " on " << *in << dendl;
+      filer.purge_range(in->inode.ino, &in->inode.layout, *snapc,
+			      0, num, ceph_clock_now(g_ceph_context), 0,
+			      gather.new_sub());
+    }
+  }
+
+  inode_t *pi = in->get_projected_inode();
+  object_t oid = CInode::get_object_name(pi->ino, frag_t(), "");
+  // remove the backtrace object if it was not purged
+  if (!gather.has_subs()) {
+    object_locator_t oloc(pi->layout.fl_pg_pool);
+    dout(10) << __func__ << " remove backtrace object " << oid
+	     << " pool " << oloc.pool << " snapc " << snapc << dendl;
+    mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+			  NULL, gather.new_sub());
+  }
+  // remove old backtrace objects
+  for (compact_set<int64_t>::iterator p = pi->old_pools.begin();
+       p != pi->old_pools.end();
+       ++p) {
+    object_locator_t oloc(*p);
+    dout(10) << __func__ << " remove backtrace object " << oid
+	     << " old pool " << *p << " snapc " << snapc << dendl;
+    mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+			  NULL, gather.new_sub());
+  }
+  assert(gather.has_subs());
+  gather.activate();
+}
+
+class C_PurgeStrayLogged : public StrayManagerContext {
+  CDentry *dn;
+  version_t pdv;
+  LogSegment *ls;
+public:
+  C_PurgeStrayLogged(StrayManager *sm_, CDentry *d, version_t v, LogSegment *s) : 
+    StrayManagerContext(sm_), dn(d), pdv(v), ls(s) { }
+  void finish(int r) {
+    sm->_purge_stray_logged(dn, pdv, ls);
+  }
+};
+
+class C_TruncateStrayLogged : public StrayManagerContext {
+  CDentry *dn;
+  LogSegment *ls;
+public:
+  C_TruncateStrayLogged(StrayManager *sm, CDentry *d, LogSegment *s) :
+    StrayManagerContext(sm), dn(d), ls(s) { }
+  void finish(int r) {
+    sm->_truncate_stray_logged(dn, ls);
+  }
+};
+
+void StrayManager::_purge_stray_purged(
+    CDentry *dn, uint32_t ops_allowance, bool only_head)
+{
+  CInode *in = dn->get_projected_linkage()->get_inode();
+  dout(10) << "_purge_stray_purged " << *dn << " " << *in << dendl;
+
+  if (only_head) {
+    /* This was a ::truncate */
+    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate");
+    mds->mdlog->start_entry(le);
+    
+    inode_t *pi = in->project_inode();
+    pi->size = 0;
+    pi->max_size_ever = 0;
+    pi->client_ranges.clear();
+    pi->truncate_size = 0;
+    pi->truncate_from = 0;
+    pi->version = in->pre_dirty();
+
+    le->metablob.add_dir_context(dn->dir);
+    le->metablob.add_primary_dentry(dn, in, true);
+
+    mds->mdlog->submit_entry(le,
+        new C_TruncateStrayLogged(
+          this, dn, mds->mdlog->get_current_segment()));
+  } else {
+    if (in->get_num_ref() != (int)in->is_dirty() ||
+        dn->get_num_ref() != (int)dn->is_dirty() + !!in->get_num_ref() + 1/*PIN_PURGING*/) {
+      // Nobody should be taking new references to an inode when it
+      // is being purged (aside from if it were 
+
+      derr << "Rogue reference after purge to " << *dn << dendl;
+      assert(0 == "rogue reference to purging inode");
+    }
+
+    // kill dentry.
+    version_t pdv = dn->pre_dirty();
+    dn->push_projected_linkage(); // NULL
+
+    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
+    mds->mdlog->start_entry(le);
+
+    // update dirfrag fragstat, rstat
+    CDir *dir = dn->get_dir();
+    fnode_t *pf = dir->project_fnode();
+    pf->version = dir->pre_dirty();
+    if (in->is_dir())
+      pf->fragstat.nsubdirs--;
+    else
+      pf->fragstat.nfiles--;
+    pf->rstat.sub(in->inode.accounted_rstat);
+
+    le->metablob.add_dir_context(dn->dir);
+    EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true);
+    le->metablob.add_null_dentry(dl, dn, true);
+    le->metablob.add_destroyed_inode(in->ino());
+
+    mds->mdlog->submit_entry(le, new C_PurgeStrayLogged(this, dn, pdv,
+          mds->mdlog->get_current_segment()));
+
+    num_strays--;
+    logger->set(l_mdc_num_strays, num_strays);
+    logger->inc(l_mdc_strays_purged);
+  }
+
+  num_strays_purging--;
+  logger->set(l_mdc_num_strays_purging, num_strays_purging);
+
+  // Release resources
+  dout(10) << __func__ << ": decrementing op allowance "
+    << ops_allowance << " from " << ops_in_flight << " in flight" << dendl;
+  assert(ops_in_flight >= ops_allowance);
+  ops_in_flight -= ops_allowance;
+  logger->set(l_mdc_num_purge_ops, ops_in_flight);
+  files_purging -= 1;
+  _advance();
+}
+
+void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
+{
+  CInode *in = dn->get_linkage()->get_inode();
+  dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
+
+  assert(!in->state_test(CInode::STATE_RECOVERING));
+
+  // unlink
+  assert(dn->get_projected_linkage()->is_null());
+  dn->dir->unlink_inode(dn);
+  dn->pop_projected_linkage();
+  dn->mark_dirty(pdv, ls);
+
+  dn->dir->pop_and_dirty_projected_fnode(ls);
+
+  in->state_clear(CInode::STATE_ORPHAN);
+  dn->state_clear(CDentry::STATE_PURGING);
+  dn->put(CDentry::PIN_PURGING);
+
+  // drop inode
+  if (in->is_dirty())
+    in->mark_clean();
+  in->mdcache->remove_inode(in);
+
+  // drop dentry?
+  if (dn->is_new()) {
+    dout(20) << " dn is new, removing" << dendl;
+    dn->mark_clean();
+    dn->dir->remove_dentry(dn);
+  } else {
+    in->mdcache->touch_dentry_bottom(dn);  // drop dn as quickly as possible.
+  }
+}
+
+void StrayManager::enqueue(CDentry *dn, bool trunc)
+{
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  assert(dnl);
+  CInode *in = dnl->get_inode();
+  assert(in);
+
+  /* We consider a stray to be purging as soon as it is enqueued, to avoid
+   * enqueuing it twice */
+  dn->state_set(CDentry::STATE_PURGING);
+  dn->get(CDentry::PIN_PURGING);
+  in->state_set(CInode::STATE_PURGING);
+
+  if (dn->item_stray.is_on_list()) {
+    dn->item_stray.remove_myself();
+    num_strays_delayed--;
+    logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+  }
+
+  /* We must clear this as soon as enqueuing it, to prevent the journal
+   * expiry code from seeing a dirty parent and trying to write a backtrace */
+  if (!trunc) {
+    if (in->is_dirty_parent()) {
+      in->clear_dirty_parent();
+    }
+  }
+
+  const uint32_t ops_required = _calculate_ops_required(in, trunc);
+
+  // Try to purge immediately if there is nothing in the queue, otherwise
+  // we will go to the back of the queue (even if there is allowance available
+  // to run us immediately) in order to be fair to others.
+  bool consumed = false;
+  if (ready_for_purge.empty()) {
+    consumed = _consume(dn, trunc, ops_required);
+  }
+
+  if (consumed) {
+    dout(10) << __func__ << ": purging this dentry immediately: "
+      << *dn << dendl;
+  } else {
+    dout(10) << __func__ << ": enqueuing this dentry for later purge: "
+      << *dn << dendl;
+    ready_for_purge.push_back(QueuedStray(dn, trunc, ops_required));
+  }
+}
+
+void StrayManager::_advance()
+{
+  std::list<QueuedStray>::iterator i;
+  for (i = ready_for_purge.begin();
+       i != ready_for_purge.end(); ++i) {
+    const QueuedStray &qs = *i;
+    const bool consumed = _consume(qs.dn, qs.trunc, qs.ops_required);
+    if (!consumed) {
+      break;
+    }
+  }
+
+  // Erase all the ones that returned true from _consume
+  ready_for_purge.erase(ready_for_purge.begin(), i);
+}
+
+/*
+ * Note that there are compromises to how throttling
+ * is implemented here, in the interests of simplicity:
+ *  * If insufficient ops are available to execute
+ *    the next item on the queue, we block even if
+ *    there are items further down the queue requiring
+ *    fewer ops which might be executable
+ *  * The ops considered "in use" by a purge will be
+ *    an overestimate over the period of execution, as
+ *    we count filer_max_purge_ops and ops for old backtraces
+ *    as in use throughout, even though towards the end
+ *    of the purge the actual ops in flight will be
+ *    lower.
+ *  * The ops limit may be exceeded if the number of ops
+ *    required by a single inode is greater than the
+ *    limit, for example directories with very many
+ *    fragments.
+ */
+bool StrayManager::_consume(CDentry *dn, bool trunc, uint32_t ops_required)
+{
+  const int files_avail = g_conf->mds_max_purge_files - files_purging;
+
+  if (files_avail <= 0) {
+    dout(20) << __func__ << ": throttling on max files" << dendl;
+    return false;
+  } else {
+    dout(20) << __func__ << ": purging dn: " << *dn << dendl;
+  }
+
+  // Calculate how much of the ops allowance is available, allowing
+  // for the case where the limit is currently being exceeded.
+  uint32_t ops_avail;
+  if (ops_in_flight <= max_purge_ops) {
+    ops_avail = max_purge_ops - ops_in_flight;
+  } else {
+    ops_avail = 0;
+  }
+
+  /* The ops_in_flight > 0 condition here handles the case where the
+   * ops required by this inode would never fit in the limit: we wait
+   * instead until nothing else is running */
+  if (ops_in_flight > 0 && ops_avail < ops_required) {
+    dout(20) << __func__ << ": throttling on max ops (require "
+             << ops_required << ", " << ops_in_flight << " in flight" << dendl;
+    return false;
+  }
+
+  // Resources are available, acquire them and execute the purge
+  files_purging += 1;
+  dout(10) << __func__ << ": allocating allowance "
+    << ops_required << " to " << ops_in_flight << " in flight" << dendl;
+  ops_in_flight += ops_required;
+  logger->set(l_mdc_num_purge_ops, ops_in_flight);
+  if (trunc) {
+    truncate(dn, ops_required);
+  } else {
+    purge(dn, ops_required);
+  }
+  return true;
+}
+
+uint32_t StrayManager::_calculate_ops_required(CInode *in, bool trunc)
+{
+  uint32_t ops_required = 0;
+  if (in->is_dir()) {
+    // Directory, count dirfrags to be deleted
+    std::list<frag_t> ls;
+    if (!in->dirfragtree.is_leaf(frag_t())) {
+      in->dirfragtree.get_leaves(ls);
+    }
+    // One for the root, plus any leaves
+    ops_required = 1 + ls.size();
+  } else {
+    // File, work out concurrent Filer::purge deletes
+    const uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
+		      (uint64_t)in->inode.layout.fl_stripe_count;
+    const uint64_t to = MAX(in->inode.max_size_ever,
+            MAX(in->inode.size, in->inode.get_max_size()));
+
+    const uint64_t num = MAX(1, (to + period - 1) / period);
+    ops_required = MIN(num, g_conf->filer_max_purge_ops);
+
+    // Account for removing (or zeroing) backtrace
+    ops_required += 1;
+
+    // Account for deletions for old pools
+    if (!trunc) {
+      ops_required += in->get_projected_inode()->old_pools.size();
+    }
+  }
+
+  return ops_required;
+}
+
+void StrayManager::advance_delayed()
+{
+  for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
+    CDentry *dn = *p;
+    ++p;
+    dn->item_stray.remove_myself();
+    num_strays_delayed--;
+
+    if (dn->get_projected_linkage()->is_null()) {
+      /* A special case: a stray dentry can go null if its inode is being
+       * re-linked into another MDS's stray dir during a shutdown migration. */
+      dout(4) << __func__ << ": delayed dentry is now null: " << *dn << dendl;
+      continue;
+    }
+
+    const bool purging = eval_stray(dn);
+    if (!purging) {
+      derr << "Dentry " << *dn << " was purgeable but no longer is!" << dendl;
+      /*
+       * This can happen if a stray is purgeable, but has gained an extra
+       * reference by virtue of having its backtrace updated.
+       * FIXME perhaps we could simplify this further by
+       * avoiding writing the backtrace of purge-ready strays, so
+       * that this code could be more rigid?
+       */
+    }
+  }
+  logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+}
+
+void StrayManager::notify_stray_created()
+{
+  num_strays++;
+  logger->set(l_mdc_num_strays, num_strays);
+  logger->inc(l_mdc_strays_created);
+}
+
+void StrayManager::notify_stray_removed()
+{
+  num_strays--;
+  logger->set(l_mdc_num_strays, num_strays);
+}
+
+struct C_EvalStray : public StrayManagerContext {
+  CDentry *dn;
+  C_EvalStray(StrayManager *sm_, CDentry *d) : StrayManagerContext(sm_), dn(d) {}
+  void finish(int r) {
+    sm->eval_stray(dn);
+  }
+};
+
+struct C_MDC_EvalStray : public StrayManagerContext {
+  CDentry *dn;
+  C_MDC_EvalStray(StrayManager *sm_, CDentry *d) : StrayManagerContext(sm_), dn(d) {}
+  void finish(int r) {
+    sm->eval_stray(dn);
+  }
+};
+
+bool StrayManager::__eval_stray(CDentry *dn, bool delay)
+{
+  dout(10) << "eval_stray " << *dn << dendl;
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  assert(dnl->is_primary());
+  dout(10) << " inode is " << *dnl->get_inode() << dendl;
+  CInode *in = dnl->get_inode();
+  assert(in);
+
+  // The only dentries eligible for purging are those
+  // in the stray directories
+  assert(dn->get_dir()->get_inode()->is_stray());
+
+  // Inode may not pass through this function if it
+  // was already identified for purging (i.e. cannot
+  // call eval_stray() after purge()
+  assert(!dn->state_test(CDentry::STATE_PURGING));
+
+  if (!dn->is_auth()) {
+    // has to be mine
+    // move to bottom of lru so that we trim quickly!
+
+    in->mdcache->touch_dentry_bottom(dn);
+    return false;
+  }
+
+  // purge?
+  if (in->inode.nlink == 0) {
+    // past snaprealm parents imply snapped dentry remote links.
+    // only important for directories.  normal file data snaps are handled
+    // by the object store.
+    if (in->snaprealm) {
+      if (!in->snaprealm->have_past_parents_open() &&
+          !in->snaprealm->open_parents(new C_MDC_EvalStray(this, dn))) {
+        return false;
+      }
+      in->snaprealm->prune_past_parents();
+      in->purge_stale_snap_data(in->snaprealm->get_snaps());
+    }
+    if (in->is_dir()) {
+      if (in->snaprealm && in->snaprealm->has_past_parents()) {
+	dout(20) << "  directory has past parents "
+          << in->snaprealm->srnode.past_parents << dendl;
+	return false;  // not until some snaps are deleted.
+      }
+
+      if (in->has_dirfrags()) {
+        list<CDir*> ls;
+        in->get_nested_dirfrags(ls);
+        for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+          (*p)->try_remove_dentries_for_stray();
+        }
+      }
+
+      if (!in->remote_parents.empty()) {
+	// unlink any stale remote snap dentry.
+	for (compact_set<CDentry*>::iterator p = in->remote_parents.begin();
+	     p != in->remote_parents.end(); ) {
+	  CDentry *remote_dn = *p;
+	  ++p;
+	  assert(remote_dn->last != CEPH_NOSNAP);
+	  remote_dn->unlink_remote(remote_dn->get_linkage());
+	}
+      }
+    }
+    if (dn->is_replicated()) {
+      dout(20) << " replicated" << dendl;
+      return false;
+    }
+    if (dn->is_any_leases() || in->is_any_caps()) {
+      dout(20) << " caps | leases" << dendl;
+      return false;  // wait
+    }
+    if (in->state_test(CInode::STATE_NEEDSRECOVER) ||
+	in->state_test(CInode::STATE_RECOVERING)) {
+      dout(20) << " pending recovery" << dendl;
+      return false;  // don't mess with file size probing
+    }
+    if (in->get_num_ref() > (int)in->is_dirty() + (int)in->is_dirty_parent()) {
+      dout(20) << " too many inode refs" << dendl;
+      return false;
+    }
+    if (dn->get_num_ref() > (int)dn->is_dirty() + !!in->get_num_ref()) {
+      dout(20) << " too many dn refs" << dendl;
+      return false;
+    }
+    if (delay) {
+      if (!dn->item_stray.is_on_list()) {
+	delayed_eval_stray.push_back(&dn->item_stray);
+	num_strays_delayed++;
+	logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+      }
+    // don't purge multiversion inode with snap data
+    } else if (in->snaprealm && in->snaprealm->has_past_parents() &&
+              !in->old_inodes.empty()) {
+      // A file with snapshots: we will truncate the HEAD revision
+      // but leave the metadata intact.
+      assert(!in->is_dir());
+      dout(20) << " file has past parents "
+        << in->snaprealm->srnode.past_parents << dendl;
+      if (in->is_file() && in->get_projected_inode()->size > 0) {
+	enqueue(dn, true); // truncate head objects    
+      }
+    } else {
+      // A straightforward file, ready to be purged.  Enqueue it.
+      if (in->is_dir()) {
+	in->close_dirfrags();
+      }
+
+      enqueue(dn, false);
+    }
+
+    return true;
+  } else {
+    /*
+     * Where a stray has some links, they should be remotes, check
+     * if we can do anything with them if we happen to have them in
+     * cache.
+     */
+    eval_remote_stray(dn, NULL);
+    return false;
+  }
+}
+
+bool StrayManager::eval_stray(CDentry *dn, bool delay)
+{
+  // avoid nested eval_stray
+  if (dn->state_test(CDentry::STATE_EVALUATINGSTRAY))
+      return false;
+
+  dn->state_set(CDentry::STATE_EVALUATINGSTRAY);
+  bool ret = __eval_stray(dn, delay);
+  dn->state_clear(CDentry::STATE_EVALUATINGSTRAY);
+  return ret;
+}
+
+void StrayManager::eval_remote_stray(CDentry *stray_dn, CDentry *remote_dn)
+{
+  assert(stray_dn != NULL);
+  assert(stray_dn->get_dir()->get_inode()->is_stray());
+  CDentry::linkage_t *stray_dnl = stray_dn->get_projected_linkage();
+  assert(stray_dnl->is_primary());
+  CInode *stray_in = stray_dnl->get_inode();
+  assert(stray_in->inode.nlink >= 1);
+  assert(stray_in->last == CEPH_NOSNAP);
+
+  /* If no remote_dn hinted, pick one arbitrarily */
+  if (remote_dn == NULL) {
+    if (!stray_in->remote_parents.empty()) {
+      for (compact_set<CDentry*>::iterator p = stray_in->remote_parents.begin();
+	   p != stray_in->remote_parents.end();
+	   ++p)
+	if ((*p)->last == CEPH_NOSNAP) {
+	  remote_dn = *p;
+	  break;
+	}
+    }
+    if (!remote_dn) {
+      dout(20) << __func__ << ": not reintegrating (no remote parents in cache)" << dendl;
+      return;
+    }
+  }
+  assert(remote_dn->last == CEPH_NOSNAP);
+    // NOTE: we repeat this check in _rename(), since our submission path is racy.
+    if (!remote_dn->is_projected()) {
+      if (remote_dn->is_auth() && remote_dn->dir->can_auth_pin()) {
+        reintegrate_stray(stray_dn, remote_dn);
+      } else if (!remote_dn->is_auth() && stray_dn->is_auth()) {
+        migrate_stray(stray_dn, remote_dn->authority().first);
+      } else {
+        dout(20) << __func__ << ": not reintegrating" << dendl;
+      }
+    } else {
+      // don't do anything if the remote parent is projected, or we may
+      // break user-visible semantics!
+      dout(20) << __func__ << ": not reintegrating (projected)" << dendl;
+    }
+}
+
+void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn)
+{
+  dout(10) << __func__ << " " << *straydn << " into " << *rdn << dendl;
+
+  logger->inc(l_mdc_strays_reintegrated);
+  
+  // rename it to another mds.
+  filepath src;
+  straydn->make_path(src);
+  filepath dst;
+  rdn->make_path(dst);
+
+  MClientRequest *req = new MClientRequest(CEPH_MDS_OP_RENAME);
+  req->set_filepath(dst);
+  req->set_filepath2(src);
+  req->set_tid(mds->issue_tid());
+
+  mds->send_message_mds(req, rdn->authority().first);
+}
+ 
+void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to)
+{
+  CInode *in = dn->get_linkage()->get_inode();
+  assert(in);
+  CInode *diri = dn->dir->get_inode();
+  assert(diri->is_stray());
+  dout(10) << "migrate_stray from mds." << MDS_INO_STRAY_OWNER(diri->inode.ino)
+	   << " to mds." << to
+	   << " " << *dn << " " << *in << dendl;
+
+  logger->inc(l_mdc_strays_migrated);
+
+  // rename it to another mds.
+  filepath src;
+  dn->make_path(src);
+
+  string dname;
+  in->name_stray_dentry(dname);
+  filepath dst(dname, MDS_INO_STRAY(to, 0));
+
+  MClientRequest *req = new MClientRequest(CEPH_MDS_OP_RENAME);
+  req->set_filepath(dst);
+  req->set_filepath2(src);
+  req->set_tid(mds->issue_tid());
+
+  mds->send_message_mds(req, to);
+}
+
+StrayManager::StrayManager(MDSRank *mds)
+  : delayed_eval_stray(member_offset(CDentry, item_stray)),
+    mds(mds), logger(NULL),
+    ops_in_flight(0), files_purging(0),
+    max_purge_ops(0), 
+    num_strays(0), num_strays_purging(0), num_strays_delayed(0),
+    filer(mds->objecter, mds->finisher)
+{
+  assert(mds != NULL);
+}
+
+void StrayManager::abort_queue()
+{
+  for (std::list<QueuedStray>::iterator i = ready_for_purge.begin();
+       i != ready_for_purge.end(); ++i)
+  {
+    const QueuedStray &qs = *i;
+    CDentry *dn = qs.dn;
+    dout(10) << __func__ << ": aborting enqueued purge " << *dn << dendl;
+
+    CDentry::linkage_t *dnl = dn->get_projected_linkage();
+    assert(dnl);
+    CInode *in = dnl->get_inode();
+    assert(in);
+
+    // Clear flags set in enqueue
+    dn->state_clear(CDentry::STATE_PURGING);
+    dn->put(CDentry::PIN_PURGING);
+    in->state_clear(CInode::STATE_PURGING);
+  }
+  ready_for_purge.clear();
+}
+
+void StrayManager::truncate(CDentry *dn, uint32_t op_allowance)
+{
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  CInode *in = dnl->get_inode();
+  assert(in);
+  dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
+  assert(!dn->is_replicated());
+
+  num_strays_purging++;
+  logger->set(l_mdc_num_strays_purging, num_strays_purging);
+
+  C_GatherBuilder gather(
+    g_ceph_context,
+    new C_OnFinisher(new C_IO_PurgeStrayPurged(this, dn, true, 0),
+		     mds->finisher));
+
+  SnapRealm *realm = in->find_snaprealm();
+  assert(realm);
+  dout(10) << " realm " << *realm << dendl;
+  const SnapContext *snapc = &realm->get_snap_context();
+
+  uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
+		    (uint64_t)in->inode.layout.fl_stripe_count;
+  uint64_t to = in->inode.get_max_size();
+  to = MAX(in->inode.size, to);
+  // when truncating a file, the filer does not delete stripe objects that are
+  // truncated to zero. so we need to purge stripe objects up to the max size
+  // the file has ever been.
+  to = MAX(in->inode.max_size_ever, to);
+  if (period && to > period) {
+    uint64_t num = (to - 1) / period;
+    dout(10) << __func__ << " 0~" << to << " objects 0~" << num
+      << " snapc " << snapc << " on " << *in << dendl;
+    filer.purge_range(in->ino(), &in->inode.layout, *snapc,
+			    1, num, ceph_clock_now(g_ceph_context),
+			    0, gather.new_sub());
+  }
+
+  // keep backtrace object
+  if (period && to > 0) {
+    filer.zero(in->ino(), &in->inode.layout, *snapc,
+		     0, period, ceph_clock_now(g_ceph_context),
+		     0, true, NULL, gather.new_sub());
+  }
+
+  assert(gather.has_subs());
+  gather.activate();
+}
+
+void StrayManager::_truncate_stray_logged(CDentry *dn, LogSegment *ls)
+{
+  CInode *in = dn->get_projected_linkage()->get_inode();
+
+  dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
+
+  dn->state_clear(CDentry::STATE_PURGING);
+  dn->put(CDentry::PIN_PURGING);
+
+  in->pop_and_dirty_projected_inode(ls);
+
+  eval_stray(dn);
+}
+
+
+const char** StrayManager::get_tracked_conf_keys() const
+{
+  static const char* KEYS[] = {
+    "mds_max_purge_ops",
+    "mds_max_purge_ops_per_pg",
+    NULL
+  };
+  return KEYS;
+}
+
+void StrayManager::handle_conf_change(const struct md_config_t *conf,
+			  const std::set <std::string> &changed)
+{
+  if (changed.count("mds_max_purge_ops")
+      || changed.count("mds_max_purge_ops_per_pg")) {
+    update_op_limit();
+  }
+}
+
+
+void StrayManager::update_op_limit()
+{
+  const OSDMap *osdmap = mds->objecter->get_osdmap_read();
+  assert(osdmap != NULL);
+
+  // Number of PGs across all data pools
+  uint64_t pg_count = 0;
+  const std::set<int64_t> &data_pools = mds->mdsmap->get_data_pools();
+  for (std::set<int64_t>::iterator i = data_pools.begin();
+       i != data_pools.end(); ++i) {
+    if (osdmap->get_pg_pool(*i) == NULL) {
+      // It is possible that we have an older OSDMap than MDSMap, because
+      // we don't start watching every OSDMap until after MDSRank is
+      // initialized
+      dout(4) << __func__ << " data pool " << *i
+              << " not found in OSDMap" << dendl;
+      continue;
+    }
+    pg_count += osdmap->get_pg_num(*i);
+  }
+
+  mds->objecter->put_osdmap_read();
+
+  uint64_t mds_count = mds->mdsmap->get_max_mds();
+
+  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
+  // preference for how many ops per PG
+  max_purge_ops = uint64_t(((double)pg_count / (double)mds_count) * g_conf->mds_max_purge_ops_per_pg);
+
+  // User may also specify a hard limit, apply this if so.
+  if (g_conf->mds_max_purge_ops) {
+    max_purge_ops = MIN(max_purge_ops, g_conf->mds_max_purge_ops);
+  }
+}
+
diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h
new file mode 100644
index 0000000..1db88b1
--- /dev/null
+++ b/src/mds/StrayManager.h
@@ -0,0 +1,252 @@
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef STRAY_MANAGER_H
+#define STRAY_MANAGER_H
+
+#include "include/elist.h"
+#include <list>
+#include "osdc/Filer.h"
+
+class MDSRank;
+class PerfCounters;
+class CInode;
+class CDentry;
+
+class StrayManager : public md_config_obs_t
+{
+  protected:
+  class QueuedStray {
+    public:
+    CDentry *dn;
+    bool trunc;
+    uint32_t ops_required;
+    QueuedStray(CDentry *dn_, bool t, uint32_t ops)
+      : dn(dn_), trunc(t), ops_required(ops) {}
+  };
+
+  // Has passed through eval_stray and still has refs
+  elist<CDentry*> delayed_eval_stray;
+
+  // No more refs, can purge these
+  std::list<QueuedStray> ready_for_purge;
+
+  // Global references for doing I/O
+  MDSRank *mds;
+  PerfCounters *logger;
+
+  // Throttled allowances
+  uint64_t ops_in_flight;
+  uint64_t files_purging;
+
+  // Dynamic op limit per MDS based on PG count
+  uint64_t max_purge_ops;
+
+  // Statistics
+  uint64_t num_strays;
+  uint64_t num_strays_purging;
+  uint64_t num_strays_delayed;
+
+  Filer filer;
+
+  void truncate(CDentry *dn, uint32_t op_allowance);
+
+  /**
+   * Purge a dentry from a stray directory.  This function
+   * is called once eval_stray is satisfied and StrayManager
+   * throttling is also satisfied.  There is no going back
+   * at this stage!
+   */
+  void purge(CDentry *dn, uint32_t op_allowance);
+
+  /**
+   * Completion handler for a Filer::purge on a stray inode.
+   */
+  void _purge_stray_purged(CDentry *dn, uint32_t ops, bool only_head);
+
+  void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
+
+  /**
+   * Callback: we have logged the update to an inode's metadata
+   * reflecting its newly-zeroed length.
+   */
+  void _truncate_stray_logged(CDentry *dn, LogSegment *ls);
+
+  friend class StrayManagerIOContext;
+  friend class StrayManagerContext;
+
+  friend class C_PurgeStrayLogged;
+  friend class C_TruncateStrayLogged;
+  friend class C_IO_PurgeStrayPurged;
+
+  /**
+   * Enqueue a purge operation on a dentry that has passed the tests
+   * in eval_stray.  This may start the operation inline if the throttle
+   * allowances are already available.
+   *
+   * @param trunc false to purge dentry (normal), true to just truncate
+   *                 inode (snapshots)
+   */
+  void enqueue(CDentry *dn, bool trunc);
+
+  /**
+   * Iteratively call _consume on items from the ready_for_purge
+   * list until it returns false (throttle limit reached)
+   */
+  void _advance();
+
+  /**
+   * Attempt to purge an inode, if throttling permits
+   * it.
+   *
+   * Return true if we successfully consumed resource,
+   * false if insufficient resource was available.
+   */
+  bool _consume(CDentry *dn, bool trunc, uint32_t ops_required);
+
+  /**
+   * Return the maximum number of concurrent RADOS ops that
+   * may be executed while purging this inode.
+   *
+   * @param trunc true if it's a truncate, false if it's a purge
+   */
+  uint32_t _calculate_ops_required(CInode *in, bool trunc);
+
+  /**
+   * When hard links exist to an inode whose primary dentry
+   * is unlinked, the inode gets a stray primary dentry.
+   *
+   * We may later "reintegrate" the inode into a remaining
+   * non-stray dentry (one of what was previously a remote
+   * dentry) by issuing a rename from the stray to the other
+   * dentry.
+   */
+  void reintegrate_stray(CDentry *dn, CDentry *rlink);
+
+  /**
+   * Evaluate a stray dentry for purging or reintegration.
+   *
+   * purging: If the inode has no linkage, and no more references, then
+   *          we may decide to purge it.
+   *
+   * reintegration: If the inode still has linkage, then it means someone else
+   *                (a hard link) is still referring to it, and we should
+   *                think about reintegrating that inode into the remote dentry.
+   *
+   * @returns true if the dentry will be purged (caller should never
+   *          take more refs after this happens), else false.
+   */
+  bool __eval_stray(CDentry *dn, bool delay=false);
+
+  // My public interface is for consumption by MDCache
+  public:
+  StrayManager(MDSRank *mds);
+  void set_logger(PerfCounters *l) {logger = l;}
+
+  bool eval_stray(CDentry *dn, bool delay=false);
+
+  /**
+   * Where eval_stray was previously invoked with delay=true, call
+   * eval_stray again for any dentries that were put on the
+   * delayed_eval_stray list as a result of the original call.
+   *
+   * Used so that various places can call eval_stray(delay=true) during
+   * an operation to identify dentries of interest, and then call
+   * this function later during trim in order to do the final
+   * evaluation (and resulting actions) while not in the middle of another
+   * metadata operation.
+   */
+  void advance_delayed();
+
+  /**
+   * When a metadata op touches a remote dentry that points to
+   * a stray, call in here to evaluate it for migration (move
+   * a stray residing on another MDS to this MDS) or reintegration
+   * (move a stray dentry's inode into a non-stray hardlink dentry and
+   * clean up the stray).
+   *
+   * @param stray_dn a stray dentry whose inode has been referenced
+   *                 by a remote dentry
+   * @param remote_dn (optional) which remote dentry was touched
+   *                  in an operation that led us here: this is used
+   *                  as a hint for which remote to reintegrate into
+   *                  if there are multiple remotes.
+   */
+  void eval_remote_stray(CDentry *stray_dn, CDentry *remote_dn=NULL);
+
+  /**
+   * Given a dentry within one of my stray directories,
+   * send it off to a stray directory in another MDS.
+   *
+   * This is for use:
+   *  * Case A: when shutting down a rank, we migrate strays
+   *    away from ourselves rather than waiting for purge
+   *  * Case B: when a client request has a trace that refers to
+   *    a stray inode on another MDS, we migrate that inode from
+   *    there to here, in order that we can later re-integrate it
+   *    here.
+   *
+   * In case B, the receiver should be calling into eval_stray
+   * on completion of mv (i.e. inode put), resulting in a subsequent
+   * reintegration.
+   */
+  void migrate_stray(CDentry *dn, mds_rank_t dest);
+
+  /**
+   * Update stats to reflect a newly created stray dentry.  Needed
+   * because stats on strays live here, but creation happens
+   * in Server or MDCache.  For our purposes "creation" includes
+   * loading a stray from a dirfrag and migrating a stray from
+   * another MDS, in addition to creations per-se.
+   */
+  void notify_stray_created();
+
+  /**
+   * Update stats to reflect a removed stray dentry.  Needed because
+   * stats on strays live here, but removal happens in Server or
+   * MDCache.  Also includes migration (rename) of strays from
+   * this MDS to another MDS.
+   */
+  void notify_stray_removed();
+
+  /**
+   * For any strays that are enqueued for purge, but
+   * currently blocked on throttling, clear their
+   * purging status.  Used during MDS rank shutdown
+   * so that it can migrate these strays instead
+   * of waiting for them to trickle through the
+   * queue.
+   */
+  void abort_queue();
+
+  /*
+   * Calculate our local RADOS op throttle limit based on
+   * (mds_max_purge_ops_per_pg / number_of_mds) * number_of_pg
+   *
+   * Call this whenever one of those operands changes.
+   */
+  void update_op_limit();
+
+  /**
+   * Subscribe to changes on mds_max_purge_ops
+   */
+  virtual const char** get_tracked_conf_keys() const;
+
+  /**
+   * Call update_op_limit if mds_max_purge_ops changes
+   */
+  virtual void handle_conf_change(const struct md_config_t *conf,
+			  const std::set <std::string> &changed);
+};
+
+#endif  // STRAY_MANAGER_H
diff --git a/src/mds/events/ECommitted.h b/src/mds/events/ECommitted.h
index 2889a3b..48de756 100644
--- a/src/mds/events/ECommitted.h
+++ b/src/mds/events/ECommitted.h
@@ -36,7 +36,7 @@ public:
   static void generate_test_instances(list<ECommitted*>& ls);
 
   void update_segment() {}
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 };
 
 #endif
diff --git a/src/mds/events/EExport.h b/src/mds/events/EExport.h
index 1cab674..64a3373 100644
--- a/src/mds/events/EExport.h
+++ b/src/mds/events/EExport.h
@@ -18,7 +18,7 @@
 #include "common/config.h"
 #include "include/types.h"
 
-#include "../MDS.h"
+#include "../MDSRank.h"
 
 #include "EMetaBlob.h"
 #include "../LogEvent.h"
@@ -48,7 +48,7 @@ public:
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<EExport*>& ls);
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 
 };
 
diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h
index 56f5485..fac7d36 100644
--- a/src/mds/events/EFragment.h
+++ b/src/mds/events/EFragment.h
@@ -74,7 +74,7 @@ public:
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<EFragment*>& ls);
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 };
 
 #endif
diff --git a/src/mds/events/EImportFinish.h b/src/mds/events/EImportFinish.h
index 7ed25e1..4129a4c 100644
--- a/src/mds/events/EImportFinish.h
+++ b/src/mds/events/EImportFinish.h
@@ -18,7 +18,7 @@
 #include "common/config.h"
 #include "include/types.h"
 
-#include "../MDS.h"
+#include "../MDSRank.h"
 #include "../LogEvent.h"
 
 class EImportFinish : public LogEvent {
@@ -45,7 +45,7 @@ class EImportFinish : public LogEvent {
   void dump(Formatter *f) const;
   static void generate_test_instances(list<EImportFinish*>& ls);
   
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 
 };
 
diff --git a/src/mds/events/EImportStart.h b/src/mds/events/EImportStart.h
index 17d0bdd..9f47d49 100644
--- a/src/mds/events/EImportStart.h
+++ b/src/mds/events/EImportStart.h
@@ -18,7 +18,8 @@
 #include "common/config.h"
 #include "include/types.h"
 
-#include "../MDS.h"
+class MDLog;
+class MDSRank;
 
 #include "EMetaBlob.h"
 #include "../LogEvent.h"
@@ -52,7 +53,7 @@ protected:
   static void generate_test_instances(list<EImportStart*>& ls);
   
   void update_segment();
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 
 };
 
diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h
index 60f64f8..7d219e9 100644
--- a/src/mds/events/EMetaBlob.h
+++ b/src/mds/events/EMetaBlob.h
@@ -24,7 +24,7 @@
 
 #include "include/interval_set.h"
 
-class MDS;
+class MDSRank;
 class MDLog;
 class LogSegment;
 struct MDSlaveUpdate;
@@ -62,6 +62,7 @@ public:
     static const int STATE_DIRTY =	 (1<<0);
     static const int STATE_DIRTYPARENT = (1<<1);
     static const int STATE_DIRTYPOOL   = (1<<2);
+    typedef compact_map<snapid_t, old_inode_t> old_inodes_t;
     string  dn;         // dentry
     snapid_t dnfirst, dnlast;
     version_t dnv;
@@ -72,7 +73,6 @@ public:
     snapid_t oldest_snap;
     bufferlist snapbl;
     __u8 state;
-    typedef map<snapid_t, old_inode_t> old_inodes_t;
     old_inodes_t old_inodes;
 
     fullbit(const fullbit& o);
@@ -105,7 +105,7 @@ public:
     void dump(Formatter *f) const;
     static void generate_test_instances(list<EMetaBlob::fullbit*>& ls);
 
-    void update_inode(MDS *mds, CInode *in);
+    void update_inode(MDSRank *mds, CInode *in);
     bool is_dirty() const { return (state & STATE_DIRTY); }
     bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); }
     bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); }
@@ -314,6 +314,7 @@ private:
 
   // idempotent op(s)
   list<pair<metareqid_t,uint64_t> > client_reqs;
+  list<pair<metareqid_t,uint64_t> > client_flushes;
 
  public:
   void encode(bufferlist& bl) const;
@@ -346,6 +347,9 @@ private:
   void add_client_req(metareqid_t r, uint64_t tid=0) {
     client_reqs.push_back(pair<metareqid_t,uint64_t>(r, tid));
   }
+  void add_client_flush(metareqid_t r, uint64_t tid=0) {
+    client_flushes.push_back(pair<metareqid_t,uint64_t>(r, tid));
+  }
 
   void add_table_transaction(int table, version_t tid) {
     table_tids.push_back(pair<__u8, version_t>(table, tid));
@@ -376,7 +380,7 @@ private:
     truncate_finish[ino] = segoff;
   }
   
-  bool rewrite_truncate_finish(MDS const *mds, std::map<uint64_t, uint64_t> const &old_to_new);
+  bool rewrite_truncate_finish(MDSRank const *mds, std::map<uint64_t, uint64_t> const &old_to_new);
 
   void add_destroyed_inode(inodeno_t ino) {
     destroyed_inodes.push_back(ino);
@@ -574,7 +578,7 @@ private:
   }
 
   void update_segment(LogSegment *ls);
-  void replay(MDS *mds, LogSegment *ls, MDSlaveUpdate *su=NULL);
+  void replay(MDSRank *mds, LogSegment *ls, MDSlaveUpdate *su=NULL);
 };
 WRITE_CLASS_ENCODER(EMetaBlob)
 WRITE_CLASS_ENCODER(EMetaBlob::fullbit)
diff --git a/src/mds/events/ENoOp.h b/src/mds/events/ENoOp.h
index a66d540..9a585c2 100644
--- a/src/mds/events/ENoOp.h
+++ b/src/mds/events/ENoOp.h
@@ -28,7 +28,7 @@ public:
   void decode(bufferlist::iterator& bl);
   void dump(Formatter *f) const {}
 
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 };
 
 #endif
diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h
index 207b32b..c22a585 100644
--- a/src/mds/events/EOpen.h
+++ b/src/mds/events/EOpen.h
@@ -50,7 +50,7 @@ public:
   static void generate_test_instances(list<EOpen*>& ls);
 
   void update_segment();
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 };
 
 #endif
diff --git a/src/mds/events/EResetJournal.h b/src/mds/events/EResetJournal.h
index c782f29..5f7e9a3 100644
--- a/src/mds/events/EResetJournal.h
+++ b/src/mds/events/EResetJournal.h
@@ -32,7 +32,7 @@ class EResetJournal : public LogEvent {
     out << "EResetJournal";
   }
 
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 };
 
 #endif
diff --git a/src/mds/events/ESession.h b/src/mds/events/ESession.h
index 3d55c83..c3b2fc0 100644
--- a/src/mds/events/ESession.h
+++ b/src/mds/events/ESession.h
@@ -66,7 +66,7 @@ class ESession : public LogEvent {
   }
   
   void update_segment();
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
   entity_inst_t get_client_inst() const {return client_inst;}
 };
 
diff --git a/src/mds/events/ESessions.h b/src/mds/events/ESessions.h
index fe943a8..5b5b594 100644
--- a/src/mds/events/ESessions.h
+++ b/src/mds/events/ESessions.h
@@ -53,7 +53,7 @@ public:
   }
   
   void update_segment();
-  void replay(MDS *mds);  
+  void replay(MDSRank *mds);  
 };
 
 #endif
diff --git a/src/mds/events/ESlaveUpdate.h b/src/mds/events/ESlaveUpdate.h
index bfac121..41e1bb0 100644
--- a/src/mds/events/ESlaveUpdate.h
+++ b/src/mds/events/ESlaveUpdate.h
@@ -143,7 +143,7 @@ public:
   void dump(Formatter *f) const;
   static void generate_test_instances(list<ESlaveUpdate*>& ls);
 
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 };
 
 #endif
diff --git a/src/mds/events/ESubtreeMap.h b/src/mds/events/ESubtreeMap.h
index 4dea7ec..9a5f193 100644
--- a/src/mds/events/ESubtreeMap.h
+++ b/src/mds/events/ESubtreeMap.h
@@ -41,7 +41,7 @@ public:
   void dump(Formatter *f) const;
   static void generate_test_instances(list<ESubtreeMap*>& ls);
 
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
 };
 
 #endif
diff --git a/src/mds/events/ETableClient.h b/src/mds/events/ETableClient.h
index e415e60..aaa1664 100644
--- a/src/mds/events/ETableClient.h
+++ b/src/mds/events/ETableClient.h
@@ -42,7 +42,7 @@ struct ETableClient : public LogEvent {
   }  
 
   //void update_segment();
-  void replay(MDS *mds);  
+  void replay(MDSRank *mds);  
 };
 
 #endif
diff --git a/src/mds/events/ETableServer.h b/src/mds/events/ETableServer.h
index 6dff2eb..826d097 100644
--- a/src/mds/events/ETableServer.h
+++ b/src/mds/events/ETableServer.h
@@ -52,7 +52,7 @@ struct ETableServer : public LogEvent {
   }  
 
   void update_segment();
-  void replay(MDS *mds);  
+  void replay(MDSRank *mds);  
 };
 
 #endif
diff --git a/src/mds/events/EUpdate.h b/src/mds/events/EUpdate.h
index d3455e1..672af27 100644
--- a/src/mds/events/EUpdate.h
+++ b/src/mds/events/EUpdate.h
@@ -46,7 +46,7 @@ public:
   static void generate_test_instances(list<EUpdate*>& ls);
 
   void update_segment();
-  void replay(MDS *mds);
+  void replay(MDSRank *mds);
   EMetaBlob const *get_metablob() const {return &metablob;}
 };
 
diff --git a/src/mds/flock.h b/src/mds/flock.h
index bf3980d..37149eb 100644
--- a/src/mds/flock.h
+++ b/src/mds/flock.h
@@ -215,6 +215,11 @@ public:
     client_held_lock_counts.clear();
     client_waiting_lock_counts.clear();
   }
+  bool empty() const {
+    return held_locks.empty() && waiting_locks.empty() &&
+	   client_held_lock_counts.empty() &&
+	   client_waiting_lock_counts.empty();
+  }
 };
 WRITE_CLASS_ENCODER(ceph_lock_state_t)
 
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 1870ec7..406a929 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -39,7 +39,7 @@
 
 #include "LogSegment.h"
 
-#include "MDS.h"
+#include "MDSRank.h"
 #include "MDLog.h"
 #include "MDCache.h"
 #include "Server.h"
@@ -63,7 +63,7 @@
 // -----------------------
 // LogSegment
 
-void LogSegment::try_to_expire(MDS *mds, MDSGatherBuilder &gather_bld, int op_prio)
+void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
 {
   set<CDir*> commit;
 
@@ -229,14 +229,18 @@ void LogSegment::try_to_expire(MDS *mds, MDSGatherBuilder &gather_bld, int op_pr
   }
 
   // sessionmap
-  if (sessionmapv > mds->sessionmap.committed) {
+  if (sessionmapv > mds->sessionmap.get_committed()) {
     dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv 
-	      << ", committed is " << mds->sessionmap.committed
-	      << " (" << mds->sessionmap.committing << ")"
+	      << ", committed is " << mds->sessionmap.get_committed()
+	      << " (" << mds->sessionmap.get_committing() << ")"
 	      << dendl;
     mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
   }
 
+  // updates to sessions for completed_requests
+  mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
+  touched_sessions.clear();
+
   // pending commit atids
   for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
        p != pending_commit_tids.end();
@@ -297,7 +301,7 @@ EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0),
 
 void EMetaBlob::add_dir_context(CDir *dir, int mode)
 {
-  MDS *mds = dir->cache->mds;
+  MDSRank *mds = dir->cache->mds;
 
   list<CDentry*> parents;
 
@@ -505,7 +509,8 @@ void EMetaBlob::fullbit::dump(Formatter *f) const
   if (!old_inodes.empty()) {
     f->open_array_section("old inodes");
     for (old_inodes_t::const_iterator iter = old_inodes.begin();
-	iter != old_inodes.end(); ++iter) {
+	 iter != old_inodes.end();
+	 ++iter) {
       f->open_object_section("inode");
       f->dump_int("snapid", iter->first);
       iter->second.dump(f);
@@ -527,7 +532,7 @@ void EMetaBlob::fullbit::generate_test_instances(list<EMetaBlob::fullbit*>& ls)
   ls.push_back(sample);
 }
 
-void EMetaBlob::fullbit::update_inode(MDS *mds, CInode *in)
+void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
 {
   in->inode = inode;
   in->xattrs = xattrs;
@@ -748,7 +753,7 @@ void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls)
  */
 void EMetaBlob::encode(bufferlist& bl) const
 {
-  ENCODE_START(7, 5, bl);
+  ENCODE_START(8, 5, bl);
   ::encode(lump_order, bl);
   ::encode(lump_map, bl);
   ::encode(roots, bl);
@@ -767,12 +772,13 @@ void EMetaBlob::encode(bufferlist& bl) const
   ::encode(renamed_dirino, bl);
   ::encode(renamed_dir_frags, bl);
   {
-    // make MDS use v6 format happy
+    // make MDSRank use v6 format happy
     int64_t i = -1;
     bool b = false;
     ::encode(i, bl);
     ::encode(b, bl);
   }
+  ::encode(client_flushes, bl);
   ENCODE_FINISH(bl);
 }
 void EMetaBlob::decode(bufferlist::iterator &bl)
@@ -822,6 +828,9 @@ void EMetaBlob::decode(bufferlist::iterator &bl)
     ::decode(i, bl);
     ::decode(b, bl);
   }
+  if (struct_v >= 8) {
+    ::decode(client_flushes, bl);
+  }
   DECODE_FINISH(bl);
 }
 
@@ -1103,7 +1112,7 @@ void EMetaBlob::generate_test_instances(list<EMetaBlob*>& ls)
   ls.push_back(new EMetaBlob());
 }
 
-void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
+void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
 {
   dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
 
@@ -1168,7 +1177,9 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
 	  dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
 	} else {
 	  dout(0) << "EMetaBlob.replay missing dir ino  " << (*lp).ino << dendl;
-	  assert(0);
+          mds->clog->error() << "failure replaying journal (EMetaBlob)";
+          mds->damaged();
+          assert(0);  // Should be unreachable because damaged() calls respawn()
 	}
       }
 
@@ -1498,12 +1509,12 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
     }
   }
   if (sessionmapv) {
-    if (mds->sessionmap.version >= sessionmapv) {
+    if (mds->sessionmap.get_version() >= sessionmapv) {
       dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
-	       << " <= table " << mds->sessionmap.version << dendl;
-    } else if (mds->sessionmap.version + 2 >= sessionmapv) {
+	       << " <= table " << mds->sessionmap.get_version() << dendl;
+    } else if (mds->sessionmap.get_version() + 2 >= sessionmapv) {
       dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
-	       << " -(1|2) == table " << mds->sessionmap.version
+	       << " -(1|2) == table " << mds->sessionmap.get_version()
 	       << " prealloc " << preallocated_inos
 	       << " used " << used_preallocated_ino
 	       << dendl;
@@ -1524,26 +1535,28 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
 	    assert(i == used_preallocated_ino);
 	    session->info.used_inos.clear();
 	  }
-	  mds->sessionmap.projected = ++mds->sessionmap.version;
+          mds->sessionmap.replay_dirty_session(session);
 	}
 	if (!preallocated_inos.empty()) {
 	  session->info.prealloc_inos.insert(preallocated_inos);
-	  mds->sessionmap.projected = ++mds->sessionmap.version;
+          mds->sessionmap.replay_dirty_session(session);
 	}
+
       } else {
 	dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
-	if (used_preallocated_ino)
-	  mds->sessionmap.projected = ++mds->sessionmap.version;
+	if (used_preallocated_ino) {
+	  mds->sessionmap.replay_advance_version();
+        }
 	if (!preallocated_inos.empty())
-	  mds->sessionmap.projected = ++mds->sessionmap.version;
+	  mds->sessionmap.replay_advance_version();
       }
-      assert(sessionmapv == mds->sessionmap.version);
+      assert(sessionmapv == mds->sessionmap.get_version());
     } else {
       mds->clog->error() << "journal replay sessionmap v " << sessionmapv
-			<< " -(1|2) > table " << mds->sessionmap.version << "\n";
+			<< " -(1|2) > table " << mds->sessionmap.get_version() << "\n";
       assert(g_conf->mds_wipe_sessions);
       mds->sessionmap.wipe();
-      mds->sessionmap.version = mds->sessionmap.projected = sessionmapv;
+      mds->sessionmap.set_version(sessionmapv);
     }
   }
 
@@ -1598,6 +1611,21 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
     }
   }
 
+  // client flushes
+  for (list<pair<metareqid_t, uint64_t> >::iterator p = client_flushes.begin();
+       p != client_flushes.end();
+       ++p) {
+    if (p->first.name.is_client()) {
+      dout(10) << "EMetaBlob.replay flush " << p->first << " trim_to " << p->second << dendl;
+      Session *session = mds->sessionmap.get_session(p->first.name);
+      if (session) {
+	session->add_completed_flush(p->first.tid);
+	if (p->second)
+	  session->trim_completed_flushes(p->second);
+      }
+    }
+  }
+
   // update segment
   update_segment(logseg);
 
@@ -1614,16 +1642,14 @@ void ESession::update_segment()
     _segment->inotablev = inotablev;
 }
 
-void ESession::replay(MDS *mds)
+void ESession::replay(MDSRank *mds)
 {
-  if (mds->sessionmap.version >= cmapv) {
-    dout(10) << "ESession.replay sessionmap " << mds->sessionmap.version 
+  if (mds->sessionmap.get_version() >= cmapv) {
+    dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version() 
 	     << " >= " << cmapv << ", noop" << dendl;
   } else {
-    dout(10) << "ESession.replay sessionmap " << mds->sessionmap.version
+    dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
 	     << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
-    mds->sessionmap.projected = ++mds->sessionmap.version;
-    assert(mds->sessionmap.version == cmapv);
     Session *session;
     if (open) {
       session = mds->sessionmap.get_or_add_session(client_inst);
@@ -1636,6 +1662,7 @@ void ESession::replay(MDS *mds)
 	if (session->connection == NULL) {
 	  dout(10) << " removed session " << session->info.inst << dendl;
 	  mds->sessionmap.remove_session(session);
+          session = NULL;
 	} else {
 	  session->clear();    // the client has reconnected; keep the Session, but reset
 	  dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
@@ -1645,6 +1672,12 @@ void ESession::replay(MDS *mds)
 			  << " from time " << stamp << ", ignoring";
       }
     }
+    if (session) {
+      mds->sessionmap.replay_dirty_session(session);
+    } else {
+      mds->sessionmap.replay_advance_version();
+    }
+    assert(mds->sessionmap.get_version() == cmapv);
   }
   
   if (inos.size() && inotablev) {
@@ -1767,17 +1800,17 @@ void ESessions::update_segment()
   _segment->sessionmapv = cmapv;
 }
 
-void ESessions::replay(MDS *mds)
+void ESessions::replay(MDSRank *mds)
 {
-  if (mds->sessionmap.version >= cmapv) {
-    dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.version
+  if (mds->sessionmap.get_version() >= cmapv) {
+    dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
 	     << " >= " << cmapv << ", noop" << dendl;
   } else {
-    dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.version
+    dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
 	     << " < " << cmapv << dendl;
     mds->sessionmap.open_sessions(client_map);
-    assert(mds->sessionmap.version == cmapv);
-    mds->sessionmap.projected = mds->sessionmap.version;
+    assert(mds->sessionmap.get_version() == cmapv);
+    mds->sessionmap.set_projected(mds->sessionmap.get_version());
   }
   update_segment();
 }
@@ -1836,7 +1869,7 @@ void ETableServer::update_segment()
   _segment->tablev[table] = version;
 }
 
-void ETableServer::replay(MDS *mds)
+void ETableServer::replay(MDSRank *mds)
 {
   MDSTableServer *server = mds->get_table_server(table);
   if (!server)
@@ -1872,7 +1905,9 @@ void ETableServer::replay(MDS *mds)
     server->_server_update(mutation);
     break;
   default:
-    assert(0);
+    mds->clog->error() << "invalid tableserver op in ETableServer";
+    mds->damaged();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
   }
   
   assert(version == server->get_version());
@@ -1916,7 +1951,7 @@ void ETableClient::generate_test_instances(list<ETableClient*>& ls)
   ls.push_back(new ETableClient());
 }
 
-void ETableClient::replay(MDS *mds)
+void ETableClient::replay(MDSRank *mds)
 {
   dout(10) << " ETableClient.replay " << get_mdstable_name(table)
 	   << " op " << get_mdstableserver_opname(op)
@@ -1939,7 +1974,7 @@ void ESnap::update_segment()
   _segment->tablev[TABLE_SNAP] = version;
 }
 
-void ESnap::replay(MDS *mds)
+void ESnap::replay(MDSRank *mds)
 {
   if (mds->snaptable->get_version() >= version) {
     dout(10) << "ESnap.replay event " << version
@@ -2023,7 +2058,7 @@ void EUpdate::update_segment()
     _segment->uncommitted_masters.insert(reqid);
 }
 
-void EUpdate::replay(MDS *mds)
+void EUpdate::replay(MDSRank *mds)
 {
   metablob.replay(mds, _segment);
   
@@ -2035,11 +2070,11 @@ void EUpdate::replay(MDS *mds)
   }
   
   if (client_map.length()) {
-    if (mds->sessionmap.version >= cmapv) {
+    if (mds->sessionmap.get_version() >= cmapv) {
       dout(10) << "EUpdate.replay sessionmap v " << cmapv
-	       << " <= table " << mds->sessionmap.version << dendl;
+	       << " <= table " << mds->sessionmap.get_version() << dendl;
     } else {
-      dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.version
+      dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
 	       << " < " << cmapv << dendl;
       // open client sessions?
       map<client_t,entity_inst_t> cm;
@@ -2049,8 +2084,8 @@ void EUpdate::replay(MDS *mds)
       mds->server->prepare_force_open_sessions(cm, seqm);
       mds->server->finish_force_open_sessions(cm, seqm);
 
-      assert(mds->sessionmap.version == cmapv);
-      mds->sessionmap.projected = mds->sessionmap.version;
+      assert(mds->sessionmap.get_version() == cmapv);
+      mds->sessionmap.set_projected(mds->sessionmap.get_version());
     }
   }
 }
@@ -2101,7 +2136,7 @@ void EOpen::update_segment()
   // ??
 }
 
-void EOpen::replay(MDS *mds)
+void EOpen::replay(MDSRank *mds)
 {
   dout(10) << "EOpen.replay " << dendl;
   metablob.replay(mds, _segment);
@@ -2123,7 +2158,7 @@ void EOpen::replay(MDS *mds)
 // -----------------------
 // ECommitted
 
-void ECommitted::replay(MDS *mds)
+void ECommitted::replay(MDSRank *mds)
 {
   if (mds->mdcache->uncommitted_masters.count(reqid)) {
     dout(10) << "ECommitted.replay " << reqid << dendl;
@@ -2394,7 +2429,7 @@ void ESlaveUpdate::generate_test_instances(list<ESlaveUpdate*>& ls)
 }
 
 
-void ESlaveUpdate::replay(MDS *mds)
+void ESlaveUpdate::replay(MDSRank *mds)
 {
   MDSlaveUpdate *su;
   switch (op) {
@@ -2427,7 +2462,9 @@ void ESlaveUpdate::replay(MDS *mds)
     break;
 
   default:
-    assert(0);
+    mds->clog->error() << "invalid op in ESlaveUpdate";
+    mds->damaged();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
   }
 }
 
@@ -2497,7 +2534,7 @@ void ESubtreeMap::generate_test_instances(list<ESubtreeMap*>& ls)
   ls.push_back(new ESubtreeMap());
 }
 
-void ESubtreeMap::replay(MDS *mds) 
+void ESubtreeMap::replay(MDSRank *mds) 
 {
   if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
     mds->mdlog->journaler->set_expire_pos(expire_pos);
@@ -2524,7 +2561,7 @@ void ESubtreeMap::replay(MDS *mds)
 	++errors;
 	continue;
       }
-      if (dir->get_dir_auth().first != mds->whoami) {
+      if (dir->get_dir_auth().first != mds->get_nodeid()) {
 	mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
 			  << " subtree root " << p->first
 			  << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
@@ -2578,7 +2615,7 @@ void ESubtreeMap::replay(MDS *mds)
     mds->mdcache->list_subtrees(subs);
     for (list<CDir*>::iterator p = subs.begin(); p != subs.end(); ++p) {
       CDir *dir = *p;
-      if (dir->get_dir_auth().first != mds->whoami)
+      if (dir->get_dir_auth().first != mds->get_nodeid())
 	continue;
       if (subtrees.count(dir->dirfrag()) == 0) {
 	mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
@@ -2629,7 +2666,7 @@ void ESubtreeMap::replay(MDS *mds)
 // -----------------------
 // EFragment
 
-void EFragment::replay(MDS *mds)
+void EFragment::replay(MDSRank *mds)
 {
   dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
 
@@ -2752,7 +2789,7 @@ void dirfrag_rollback::decode(bufferlist::iterator &bl)
 // -----------------------
 // EExport
 
-void EExport::replay(MDS *mds)
+void EExport::replay(MDSRank *mds)
 {
   dout(10) << "EExport.replay " << base << dendl;
   metablob.replay(mds, _segment);
@@ -2826,7 +2863,7 @@ void EImportStart::update_segment()
   _segment->sessionmapv = cmapv;
 }
 
-void EImportStart::replay(MDS *mds)
+void EImportStart::replay(MDSRank *mds)
 {
   dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
   //metablob.print(*_dout);
@@ -2854,18 +2891,18 @@ void EImportStart::replay(MDS *mds)
 					    mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
 
   // open client sessions?
-  if (mds->sessionmap.version >= cmapv) {
-    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.version 
+  if (mds->sessionmap.get_version() >= cmapv) {
+    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version() 
 	     << " >= " << cmapv << ", noop" << dendl;
   } else {
-    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.version 
+    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version() 
 	     << " < " << cmapv << dendl;
     map<client_t,entity_inst_t> cm;
     bufferlist::iterator blp = client_map.begin();
     ::decode(cm, blp);
     mds->sessionmap.open_sessions(cm);
-    assert(mds->sessionmap.version == cmapv);
-    mds->sessionmap.projected = mds->sessionmap.version;
+    assert(mds->sessionmap.get_version() == cmapv);
+    mds->sessionmap.set_projected(mds->sessionmap.get_version());
   }
   update_segment();
 }
@@ -2912,7 +2949,7 @@ void EImportStart::generate_test_instances(list<EImportStart*>& ls)
 // -----------------------
 // EImportFinish
 
-void EImportFinish::replay(MDS *mds)
+void EImportFinish::replay(MDSRank *mds)
 {
   if (mds->mdcache->have_ambiguous_import(base)) {
     dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
@@ -2928,10 +2965,13 @@ void EImportFinish::replay(MDS *mds)
       mds->mdcache->try_trim_non_auth_subtree(dir);
    }
   } else {
+    // this shouldn't happen unless this is an old journal
     dout(10) << "EImportFinish.replay " << base << " success=" << success
 	     << " on subtree not marked as ambiguous" 
 	     << dendl;
-    assert(0 == "this shouldn't happen unless this is an old journal");
+    mds->clog->error() << "failure replaying journal (EImportFinish)";
+    mds->damaged();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
   }
 }
 
@@ -2994,20 +3034,20 @@ void EResetJournal::generate_test_instances(list<EResetJournal*>& ls)
   ls.push_back(new EResetJournal());
 }
 
-void EResetJournal::replay(MDS *mds)
+void EResetJournal::replay(MDSRank *mds)
 {
   dout(1) << "EResetJournal" << dendl;
 
   mds->sessionmap.wipe();
   mds->inotable->replay_reset();
 
-  if (mds->mdsmap->get_root() == mds->whoami) {
+  if (mds->mdsmap->get_root() == mds->get_nodeid()) {
     CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
-    mds->mdcache->adjust_subtree_auth(rootdir, mds->whoami);   
+    mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());   
   }
 
   CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
-  mds->mdcache->adjust_subtree_auth(mydir, mds->whoami);   
+  mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());   
 
   mds->mdcache->recalc_auth_bits(true);
 
@@ -3042,7 +3082,7 @@ void ENoOp::decode(bufferlist::iterator &bl)
 }
 
 
-void ENoOp::replay(MDS *mds)
+void ENoOp::replay(MDSRank *mds)
 {
   dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
 }
@@ -3053,14 +3093,14 @@ void ENoOp::replay(MDS *mds)
  * it.
  *
  * @param mds
- * MDS instance, just used for logging
+ * MDSRank instance, just used for logging
  * @param old_to_new
  * Map of old journal segment segment sequence numbers to new journal segment sequence numbers
  *
  * @return
  * True if the event was modified.
  */
-bool EMetaBlob::rewrite_truncate_finish(MDS const *mds,
+bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
     std::map<log_segment_seq_t, log_segment_seq_t> const &old_to_new)
 {
   bool modified = false;
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index ffb5086..d0218ec 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -229,6 +229,27 @@ ostream& operator<<(ostream& out, const client_writeable_range_t& r)
   return out << r.range.first << '-' << r.range.last << "@" << r.follows;
 }
 
+/*
+ * inline_data_t
+ */
+void inline_data_t::encode(bufferlist &bl) const
+{
+  ::encode(version, bl);
+  if (blp)
+    ::encode(*blp, bl);
+  else
+    ::encode(bufferlist(), bl);
+}
+void inline_data_t::decode(bufferlist::iterator &p)
+{
+  ::decode(version, p);
+  uint32_t inline_len;
+  ::decode(inline_len, p);
+  if (inline_len > 0)
+    ::decode_nohead(inline_len, get_data(), p);
+  else
+    free_data();
+}
 
 /*
  * inode_t
@@ -274,9 +295,7 @@ void inode_t::encode(bufferlist &bl) const
   ::encode(backtrace_version, bl);
   ::encode(old_pools, bl);
   ::encode(max_size_ever, bl);
-  ::encode(inline_version, bl);
   ::encode(inline_data, bl);
-
   ::encode(quota, bl);
 
   ENCODE_FINISH(bl);
@@ -340,10 +359,9 @@ void inode_t::decode(bufferlist::iterator &p)
   if (struct_v >= 8)
     ::decode(max_size_ever, p);
   if (struct_v >= 9) {
-    ::decode(inline_version, p);
     ::decode(inline_data, p);
   } else {
-    inline_version = CEPH_INLINE_NONE;
+    inline_data.version = CEPH_INLINE_NONE;
   }
   if (struct_v < 10)
     backtrace_version = 0; // force update backtrace
@@ -372,10 +390,10 @@ void inode_t::dump(Formatter *f) const
   f->close_section();
 
   f->open_array_section("old_pools");
-  vector<int64_t>::const_iterator i = old_pools.begin();
-  while(i != old_pools.end()) {
+  for (compact_set<int64_t>::const_iterator i = old_pools.begin();
+       i != old_pools.end();
+       ++i)
     f->dump_int("pool", *i);
-  }
   f->close_section();
 
   f->dump_unsigned("size", size);
@@ -425,6 +443,7 @@ void inode_t::generate_test_instances(list<inode_t*>& ls)
 int inode_t::compare(const inode_t &other, bool *divergent) const
 {
   assert(ino == other.ino);
+  *divergent = false;
   if (version == other.version) {
     if (rdev != other.rdev ||
         ctime != other.ctime ||
@@ -444,9 +463,7 @@ int inode_t::compare(const inode_t &other, bool *divergent) const
         mtime != other.mtime ||
         atime != other.atime ||
         time_warp_seq != other.time_warp_seq ||
-        !(*const_cast<bufferlist*>(&inline_data) ==
-            *const_cast<bufferlist*>(&other.inline_data)) ||
-        inline_version != other.inline_version ||
+        inline_data != other.inline_data ||
         client_ranges != other.client_ranges ||
         !(dirstat == other.dirstat) ||
         !(rstat == other.rstat) ||
@@ -472,7 +489,7 @@ bool inode_t::older_is_consistent(const inode_t &other) const
   if (max_size_ever < other.max_size_ever ||
       truncate_seq < other.truncate_seq ||
       time_warp_seq < other.time_warp_seq ||
-      inline_version < other.inline_version ||
+      inline_data.version < other.inline_data.version ||
       dirstat.version < other.dirstat.version ||
       rstat.version < other.rstat.version ||
       accounted_rstat.version < other.accounted_rstat.version ||
@@ -535,25 +552,29 @@ void old_inode_t::generate_test_instances(list<old_inode_t*>& ls)
  */
 void fnode_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(2, 2, bl);
+  ENCODE_START(3, 3, bl);
   ::encode(version, bl);
   ::encode(snap_purged_thru, bl);
   ::encode(fragstat, bl);
   ::encode(accounted_fragstat, bl);
   ::encode(rstat, bl);
   ::encode(accounted_rstat, bl);
+  ::encode(damage_flags, bl);
   ENCODE_FINISH(bl);
 }
 
 void fnode_t::decode(bufferlist::iterator &bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
   ::decode(version, bl);
   ::decode(snap_purged_thru, bl);
   ::decode(fragstat, bl);
   ::decode(accounted_fragstat, bl);
   ::decode(rstat, bl);
   ::decode(accounted_rstat, bl);
+  if (struct_v >= 3) {
+    ::decode(damage_flags, bl);
+  }
   DECODE_FINISH(bl);
 }
 
@@ -644,12 +665,13 @@ void old_rstat_t::generate_test_instances(list<old_rstat_t*>& ls)
  */
 void session_info_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(4, 3, bl);
+  ENCODE_START(5, 3, bl);
   ::encode(inst, bl);
   ::encode(completed_requests, bl);
   ::encode(prealloc_inos, bl);   // hacky, see below.
   ::encode(used_inos, bl);
   ::encode(client_metadata, bl);
+  ::encode(completed_flushes, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -674,6 +696,9 @@ void session_info_t::decode(bufferlist::iterator& p)
   if (struct_v >= 4) {
     ::decode(client_metadata, p);
   }
+  if (struct_v >= 5) {
+    ::decode(completed_flushes, p);
+  }
   DECODE_FINISH(p);
 }
 
@@ -1002,3 +1027,69 @@ void cap_reconnect_t::generate_test_instances(list<cap_reconnect_t*>& ls)
   ls.back()->path = "/test/path";
   ls.back()->capinfo.cap_id = 1;
 }
+
+void MDSCacheObject::dump(Formatter *f) const
+{
+  f->dump_bool("is_auth", is_auth());
+
+  // Fields only meaningful for auth
+  f->open_object_section("auth_state");
+  {
+    f->open_object_section("replicas");
+    const compact_map<mds_rank_t,unsigned>& replicas = get_replicas();
+    for (compact_map<mds_rank_t,unsigned>::const_iterator i = replicas.begin();
+         i != replicas.end(); ++i) {
+      std::ostringstream rank_str;
+      rank_str << i->first;
+      f->dump_int(rank_str.str().c_str(), i->second);
+    }
+    f->close_section();
+  }
+  f->close_section(); // auth_state
+
+  // Fields only meaningful for replica
+  f->open_object_section("replica_state");
+  {
+    f->open_array_section("authority");
+    f->dump_int("first", authority().first);
+    f->dump_int("second", authority().second);
+    f->close_section();
+    f->dump_int("replica_nonce", get_replica_nonce());
+  }
+  f->close_section();  // replica_state
+
+  f->dump_int("auth_pins", auth_pins);
+  f->dump_int("nested_auth_pins", nested_auth_pins);
+  f->dump_bool("is_frozen", is_frozen());
+  f->dump_bool("is_freezing", is_freezing());
+
+#ifdef MDS_REF_SET
+    f->open_object_section("pins");
+    for(std::map<int, int>::const_iterator it = ref_map.begin();
+        it != ref_map.end(); ++it) {
+      f->dump_int(pin_name(it->first), it->second);
+    }
+    f->close_section();
+#endif
+    f->dump_int("nref", ref);
+}
+
+/*
+ * Use this in subclasses when printing their specialized
+ * states too.
+ */
+void MDSCacheObject::dump_states(Formatter *f) const
+{
+  if (state_test(STATE_AUTH)) f->dump_string("state", "auth");
+  if (state_test(STATE_DIRTY)) f->dump_string("state", "dirty");
+  if (state_test(STATE_NOTIFYREF)) f->dump_string("state", "notifyref");
+  if (state_test(STATE_REJOINING)) f->dump_string("state", "rejoining");
+  if (state_test(STATE_REJOINUNDEF))
+    f->dump_string("state", "rejoinundef");
+}
+
+void ceph_file_layout_wrapper::dump(Formatter *f) const
+{
+  ::dump(static_cast<const ceph_file_layout&>(*this), f);
+}
+
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 95ebf49..72ff4fc 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -18,15 +18,16 @@
 #include "include/frag.h"
 #include "include/xlist.h"
 #include "include/interval_set.h"
+#include "include/compact_map.h"
+#include "include/compact_set.h"
 
 #include "inode_backtrace.h"
 
+#include <boost/spirit/include/qi.hpp>
 #include <boost/pool/pool.hpp>
 #include "include/assert.h"
-#include "include/hash_namespace.h"
 #include <boost/serialization/strong_typedef.hpp>
 
-
 #define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"
 
 
@@ -62,6 +63,7 @@
 
 #define MDS_INO_IS_STRAY(i)  ((i) >= MDS_INO_STRAY_OFFSET  && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
 #define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
+#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
 #define MDS_INO_IS_BASE(i)   (MDS_INO_ROOT == (i) || MDS_INO_IS_MDSDIR(i))
 #define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
 #define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
@@ -71,7 +73,7 @@
 #define MDS_TRAVERSE_DISCOVERXLOCK 3    // succeeds on (foreign?) null, xlocked dentries.
 
 
-BOOST_STRONG_TYPEDEF(int32_t, mds_rank_t)
+typedef int32_t mds_rank_t;
 BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
 extern const mds_gid_t MDS_GID_NONE;
 extern const mds_rank_t MDS_RANK_NONE;
@@ -298,7 +300,7 @@ inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
 
 ostream& operator<<(ostream &out, const quota_info_t &n);
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<vinodeno_t> {
     size_t operator()(const vinodeno_t &vino) const { 
       hash<inodeno_t> H;
@@ -306,7 +308,7 @@ CEPH_HASH_NAMESPACE_START
       return H(vino.ino) ^ I(vino.snapid);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 
@@ -356,6 +358,58 @@ inline bool operator==(const client_writeable_range_t& l,
     l.follows == r.follows;
 }
 
+struct inline_data_t {
+private:
+  bufferlist *blp;
+public:
+  version_t version;
+
+  void free_data() {
+    delete blp;
+    blp = NULL;
+  }
+  bufferlist& get_data() {
+    if (!blp)
+      blp = new bufferlist;
+    return *blp;
+  }
+  size_t length() const { return blp ? blp->length() : 0; }
+
+  inline_data_t() : blp(0), version(1) {}
+  inline_data_t(const inline_data_t& o) : blp(0), version(o.version) {
+    if (o.blp)
+      get_data() = *o.blp;
+  }
+  ~inline_data_t() {
+    free_data();
+  }
+  inline_data_t& operator=(const inline_data_t& o) {
+    version = o.version;
+    if (o.blp)
+      get_data() = *o.blp;
+    else
+      free_data();
+    return *this;
+  }
+  bool operator==(const inline_data_t& o) const {
+   return length() == o.length() &&
+	  (length() == 0 ||
+	   (*const_cast<bufferlist*>(blp) == *const_cast<bufferlist*>(o.blp)));
+  }
+  bool operator!=(const inline_data_t& o) const {
+    return !(*this == o);
+  }
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::iterator& bl);
+};
+WRITE_CLASS_ENCODER(inline_data_t)
+
+enum {
+  DAMAGE_STATS,     // statistics (dirstat, size, etc)
+  DAMAGE_RSTATS,    // recursive statistics (rstat, accounted_rstat)
+  DAMAGE_FRAGTREE   // fragtree -- repair by searching
+};
+typedef uint32_t damage_flags_t;
 
 /*
  * inode_t
@@ -384,7 +438,7 @@ struct inode_t {
   // file (data access)
   ceph_dir_layout  dir_layout;    // [dir only]
   ceph_file_layout layout;
-  vector <int64_t> old_pools;
+  compact_set <int64_t> old_pools;
   uint64_t   size;        // on directory, # dentries
   uint64_t   max_size_ever; // max size the file has ever been
   uint32_t   truncate_seq;
@@ -393,8 +447,7 @@ struct inode_t {
   utime_t    mtime;   // file data modify time.
   utime_t    atime;   // file data access time.
   uint32_t   time_warp_seq;  // count of (potential) mtime/atime timewarps (i.e., utimes())
-  bufferlist inline_data;
-  version_t  inline_version;
+  inline_data_t inline_data;
 
   std::map<client_t,client_writeable_range_t> client_ranges;  // client(s) can write to these ranges
 
@@ -420,7 +473,6 @@ struct inode_t {
 	      truncate_seq(0), truncate_size(0), truncate_from(0),
 	      truncate_pending(0),
 	      time_warp_seq(0),
-	      inline_version(1),
 	      version(0), file_data_version(0), xattr_version(0), backtrace_version(0) {
     clear_layout();
     memset(&dir_layout, 0, sizeof(dir_layout));
@@ -503,7 +555,7 @@ struct inode_t {
 
   void add_old_pool(int64_t l) {
     backtrace_version = version;
-    old_pools.push_back(l);
+    old_pools.insert(l);
   }
 
   void encode(bufferlist &bl) const;
@@ -553,6 +605,7 @@ struct fnode_t {
   snapid_t snap_purged_thru;   // the max_last_destroy snapid we've been purged thru
   frag_info_t fragstat, accounted_fragstat;
   nest_info_t rstat, accounted_rstat;
+  damage_flags_t damage_flags;
 
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator& bl);
@@ -589,6 +642,7 @@ struct session_info_t {
   interval_set<inodeno_t> prealloc_inos;   // preallocated, ready to use.
   interval_set<inodeno_t> used_inos;       // journaling use
   std::map<std::string, std::string> client_metadata;
+  std::set<ceph_tid_t> completed_flushes;
 
   client_t get_client() const { return client_t(inst.name.num()); }
 
@@ -596,6 +650,7 @@ struct session_info_t {
     prealloc_inos.clear();
     used_inos.clear();
     completed_requests.clear();
+    completed_flushes.clear();
   }
 
   void encode(bufferlist& bl) const;
@@ -757,14 +812,14 @@ inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
 inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
 inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<metareqid_t> {
     size_t operator()(const metareqid_t &r) const { 
       hash<uint64_t> H;
       return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 // cap info for client reconnect
@@ -882,7 +937,7 @@ inline bool operator==(dirfrag_t l, dirfrag_t r) {
   return l.ino == r.ino && l.frag == r.frag;
 }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<dirfrag_t> {
     size_t operator()(const dirfrag_t &df) const { 
       static rjhash<uint64_t> H;
@@ -890,7 +945,7 @@ CEPH_HASH_NAMESPACE_START
       return H(df.ino) ^ I(df.frag);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 
@@ -1235,7 +1290,9 @@ class MDSCacheObject {
   MDSCacheObject() :
     state(0), 
     ref(0),
-    replica_nonce(0) {}
+    auth_pins(0), nested_auth_pins(0),
+    replica_nonce(0)
+  {}
   virtual ~MDSCacheObject() {}
 
   // printing
@@ -1289,15 +1346,6 @@ protected:
 #endif
     return ref;
   }
-#ifdef MDS_REF_SET
-  int get_pin_totals() {
-    int total = 0;
-    for(std::map<int,int>::iterator i = ref_map.begin(); i != ref_map.end(); ++i) {
-      total += i->second;
-    }
-    return total;
-  }
-#endif
   virtual const char *pin_name(int by) const = 0;
   //bool is_pinned_by(int by) { return ref_set.count(by); }
   //multiset<int>& get_ref_set() { return ref_set; }
@@ -1321,7 +1369,6 @@ protected:
       ref--;
 #ifdef MDS_REF_SET
       ref_map[by]--;
-      assert(ref == get_pin_totals());
 #endif
       if (ref == 0)
 	last_put();
@@ -1345,7 +1392,6 @@ protected:
     if (ref_map.find(by) == ref_map.end())
       ref_map[by] = 0;
     ref_map[by]++;
-    assert(ref == get_pin_totals());
 #endif
   }
 
@@ -1361,6 +1407,20 @@ protected:
 #endif
   }
 
+  protected:
+  int auth_pins;
+  int nested_auth_pins;
+#ifdef MDS_AUTHPIN_SET
+  multiset<void*> auth_pin_set;
+#endif
+
+  public:
+  bool is_auth_pinned() const { return auth_pins || nested_auth_pins; }
+  int get_num_auth_pins() const { return auth_pins; }
+  int get_num_nested_auth_pins() const { return nested_auth_pins; }
+
+  void dump_states(Formatter *f) const;
+  void dump(Formatter *f) const;
 
   // --------------------------------------------
   // auth pins
@@ -1378,7 +1438,7 @@ protected:
   // replication (across mds cluster)
  protected:
   unsigned		replica_nonce; // [replica] defined on replica
-  std::map<mds_rank_t,unsigned>	replica_map;   // [auth] mds -> nonce
+  compact_map<mds_rank_t,unsigned>	replica_map;   // [auth] mds -> nonce
 
  public:
   bool is_replicated() const { return !replica_map.empty(); }
@@ -1411,13 +1471,13 @@ protected:
       put(PIN_REPLICATED);
     replica_map.clear();
   }
-  std::map<mds_rank_t,unsigned>::iterator replicas_begin() { return replica_map.begin(); }
-  std::map<mds_rank_t,unsigned>::iterator replicas_end() { return replica_map.end(); }
-  const std::map<mds_rank_t,unsigned>& get_replicas() const { return replica_map; }
+  compact_map<mds_rank_t,unsigned>::iterator replicas_begin() { return replica_map.begin(); }
+  compact_map<mds_rank_t,unsigned>::iterator replicas_end() { return replica_map.end(); }
+  const compact_map<mds_rank_t,unsigned>& get_replicas() const { return replica_map; }
   void list_replicas(std::set<mds_rank_t>& ls) const {
-    for (std::map<mds_rank_t,unsigned>::const_iterator p = replica_map.begin();
+    for (compact_map<mds_rank_t,unsigned>::const_iterator p = replica_map.begin();
 	 p != replica_map.end();
-	 ++p) 
+	 ++p)
       ls.insert(p->first);
   }
 
@@ -1428,7 +1488,7 @@ protected:
   // ---------------------------------------------
   // waiting
  protected:
-  multimap<uint64_t, MDSInternalContextBase*>  waiting;
+  compact_multimap<uint64_t, MDSInternalContextBase*>  waiting;
 
  public:
   bool is_waiter_for(uint64_t mask, uint64_t min=0) {
@@ -1437,7 +1497,7 @@ protected:
       while (min & (min-1))  // if more than one bit is set
 	min &= min-1;        //  clear LSB
     }
-    for (multimap<uint64_t,MDSInternalContextBase*>::iterator p = waiting.lower_bound(min);
+    for (compact_multimap<uint64_t,MDSInternalContextBase*>::iterator p = waiting.lower_bound(min);
 	 p != waiting.end();
 	 ++p) {
       if (p->first & mask) return true;
@@ -1457,7 +1517,7 @@ protected:
   }
   virtual void take_waiting(uint64_t mask, list<MDSInternalContextBase*>& ls) {
     if (waiting.empty()) return;
-    multimap<uint64_t,MDSInternalContextBase*>::iterator it = waiting.begin();
+    compact_multimap<uint64_t,MDSInternalContextBase*>::iterator it = waiting.begin();
     while (it != waiting.end()) {
       if (it->first & mask) {
 	ls.push_back(it->second);
@@ -1526,8 +1586,44 @@ inline std::ostream& operator<<(std::ostream& out, mdsco_db_line_prefix o) {
   return out;
 }
 
+class ceph_file_layout_wrapper : public ceph_file_layout
+{
+public:
+  void encode(bufferlist &bl) const
+  {
+    ::encode(static_cast<const ceph_file_layout&>(*this), bl);
+  }
 
+  void decode(bufferlist::iterator &p)
+  {
+    ::decode(static_cast<ceph_file_layout&>(*this), p);
+  }
+
+  static void generate_test_instances(std::list<ceph_file_layout_wrapper*>& ls)
+  {
+  }
+
+  void dump(Formatter *f) const;
+};
 
+// parse a map of keys/values.
+namespace qi = boost::spirit::qi;
 
+template <typename Iterator>
+struct keys_and_values
+  : qi::grammar<Iterator, std::map<string, string>()>
+{
+    keys_and_values()
+      : keys_and_values::base_type(query)
+    {
+      query =  pair >> *(qi::lit(' ') >> pair);
+      pair  =  key >> '=' >> value;
+      key   =  qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
+      value = +qi::char_("a-zA-Z_0-9");
+    }
+    qi::rule<Iterator, std::map<string, string>()> query;
+    qi::rule<Iterator, std::pair<string, string>()> pair;
+    qi::rule<Iterator, string()> key, value;
+};
 
 #endif
diff --git a/src/messages/MClientCaps.h b/src/messages/MClientCaps.h
index 6aa89c8..609495b 100644
--- a/src/messages/MClientCaps.h
+++ b/src/messages/MClientCaps.h
@@ -20,7 +20,7 @@
 
 
 class MClientCaps : public Message {
-  static const int HEAD_VERSION = 5;
+  static const int HEAD_VERSION = 6;
   static const int COMPAT_VERSION = 1;
 
  public:
@@ -34,6 +34,7 @@ class MClientCaps : public Message {
 
   // Receivers may not use their new caps until they have this OSD map
   epoch_t osd_epoch_barrier;
+  ceph_tid_t oldest_flush_tid;
 
   int      get_caps() { return head.caps; }
   int      get_wanted() { return head.wanted; }
@@ -86,9 +87,12 @@ class MClientCaps : public Message {
     peer.flags = flags;
   }
 
+  void set_oldest_flush_tid(ceph_tid_t tid) { oldest_flush_tid = tid; }
+  ceph_tid_t get_oldest_flush_tid() { return oldest_flush_tid; }
+
   MClientCaps()
     : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      osd_epoch_barrier(0) {
+      osd_epoch_barrier(0), oldest_flush_tid(0) {
     inline_version = 0;
   }
   MClientCaps(int op,
@@ -102,7 +106,7 @@ class MClientCaps : public Message {
 	      int mseq,
               epoch_t oeb)
     : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      osd_epoch_barrier(oeb) {
+      osd_epoch_barrier(oeb), oldest_flush_tid(0) {
     memset(&head, 0, sizeof(head));
     head.op = op;
     head.ino = ino;
@@ -192,8 +196,12 @@ public:
     if (header.version >= 5) {
       ::decode(osd_epoch_barrier, p);
     }
+    if (header.version >= 6) {
+      ::decode(oldest_flush_tid, p);
+    }
   }
   void encode_payload(uint64_t features) {
+    header.version = HEAD_VERSION;
     head.snap_trace_len = snapbl.length();
     head.xattr_len = xattrbl.length();
 
@@ -231,6 +239,7 @@ public:
     }
 
     ::encode(osd_epoch_barrier, payload);
+    ::encode(oldest_flush_tid, payload);
   }
 };
 
diff --git a/src/messages/MClientReconnect.h b/src/messages/MClientReconnect.h
index 1b072a3..8a9ff42 100644
--- a/src/messages/MClientReconnect.h
+++ b/src/messages/MClientReconnect.h
@@ -56,6 +56,7 @@ public:
     data.clear();
     if (features & CEPH_FEATURE_MDSENC) {
       ::encode(caps, data);
+      header.version = HEAD_VERSION;
     } else if (features & CEPH_FEATURE_FLOCK) {
       // encode with old cap_reconnect_t encoding
       __u32 n = caps.size();
diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h
index 0a28a63..d3ee05e 100644
--- a/src/messages/MClientReply.h
+++ b/src/messages/MClientReply.h
@@ -159,13 +159,8 @@ struct InodeStat {
     rstat.rfiles = e.rfiles;
     rstat.rsubdirs = e.rsubdirs;
 
-    int n = e.fragtree.nsplits;
-    while (n) {
-      ceph_frag_tree_split s;
-      ::decode(s, p);
-      dirfragtree._splits[(__u32)s.frag] = s.by;
-      n--;
-    }
+    dirfragtree.decode_nohead(e.fragtree.nsplits, p);
+
     ::decode(symlink, p);
     
     if (features & CEPH_FEATURE_DIRLAYOUTHASH)
diff --git a/src/messages/MClientRequest.h b/src/messages/MClientRequest.h
index 35dbb17..1c37459 100644
--- a/src/messages/MClientRequest.h
+++ b/src/messages/MClientRequest.h
@@ -46,7 +46,7 @@
 // metadata ops.
 
 class MClientRequest : public Message {
-  static const int HEAD_VERSION = 2;
+  static const int HEAD_VERSION = 3;
   static const int COMPAT_VERSION = 1;
 
 public:
@@ -93,6 +93,17 @@ private:
 public:
   void set_mdsmap_epoch(epoch_t e) { head.mdsmap_epoch = e; }
   epoch_t get_mdsmap_epoch() { return head.mdsmap_epoch; }
+  epoch_t get_osdmap_epoch() const {
+    assert(head.op == CEPH_MDS_OP_SETXATTR);
+    if (header.version >= 3)
+      return head.args.setxattr.osdmap_epoch;
+    else
+      return 0;
+  }
+  void set_osdmap_epoch(epoch_t e) {
+    assert(head.op == CEPH_MDS_OP_SETXATTR);
+    head.args.setxattr.osdmap_epoch = e;
+  }
 
   metareqid_t get_reqid() {
     // FIXME: for now, assume clients always have 1 incarnation
diff --git a/src/messages/MClientRequestForward.h b/src/messages/MClientRequestForward.h
index 976d9dc..5770361 100644
--- a/src/messages/MClientRequestForward.h
+++ b/src/messages/MClientRequestForward.h
@@ -16,6 +16,8 @@
 #ifndef CEPH_MCLIENTREQUESTFORWARD_H
 #define CEPH_MCLIENTREQUESTFORWARD_H
 
+#include "msg/Message.h"
+
 class MClientRequestForward : public Message {
   int32_t dest_mds;
   int32_t num_fwd;
diff --git a/src/messages/MClientSession.h b/src/messages/MClientSession.h
index 3ef28e9..8975b18 100644
--- a/src/messages/MClientSession.h
+++ b/src/messages/MClientSession.h
@@ -76,6 +76,7 @@ public:
       header.version = 1;
     } else {
       ::encode(client_meta, payload);
+      header.version = HEAD_VERSION;
     }
 
   }
diff --git a/src/messages/MCommand.h b/src/messages/MCommand.h
index 802ef69..d5f0b22 100644
--- a/src/messages/MCommand.h
+++ b/src/messages/MCommand.h
@@ -16,7 +16,6 @@
 #define CEPH_MCOMMAND_H
 
 #include <vector>
-#include <uuid/uuid.h>
 
 #include "msg/Message.h"
 
diff --git a/src/messages/MDataPing.h b/src/messages/MDataPing.h
index 16bf4d2..f713dac 100644
--- a/src/messages/MDataPing.h
+++ b/src/messages/MDataPing.h
@@ -23,10 +23,10 @@ extern "C" {
 #include "libxio.h"
 }
 #else
-struct xio_mempool_obj {};
+struct xio_reg_mem {};
 #endif /* HAVE_XIO */
 
-typedef void (*mdata_hook_func)(struct xio_mempool_obj *mp);
+typedef void (*mdata_hook_func)(struct xio_reg_mem *mp);
 
 class MDataPing : public Message {
 
@@ -38,7 +38,7 @@ class MDataPing : public Message {
   std::string tag;
   uint32_t counter;
   mdata_hook_func mdata_hook;
-  struct xio_mempool_obj mp;
+  struct xio_reg_mem mp;
   bool free_data;
 
   MDataPing()
@@ -47,7 +47,7 @@ class MDataPing : public Message {
       free_data(false)
   {}
 
-  struct xio_mempool_obj *get_mp()
+  struct xio_reg_mem *get_mp()
     {
       return ∓
     }
diff --git a/src/messages/MDirUpdate.h b/src/messages/MDirUpdate.h
index 39ea23b..727db23 100644
--- a/src/messages/MDirUpdate.h
+++ b/src/messages/MDirUpdate.h
@@ -23,14 +23,14 @@ class MDirUpdate : public Message {
   dirfrag_t dirfrag;
   int32_t dir_rep;
   int32_t discover;
-  set<int32_t> dir_rep_by;
+  compact_set<int32_t> dir_rep_by;
   filepath path;
 
  public:
   mds_rank_t get_source_mds() const { return from_mds; }
   dirfrag_t get_dirfrag() const { return dirfrag; }
   int get_dir_rep() const { return dir_rep; }
-  const set<int>& get_dir_rep_by() const { return dir_rep_by; } 
+  const compact_set<int>& get_dir_rep_by() const { return dir_rep_by; }
   bool should_discover() const { return discover > 0; }
   const filepath& get_path() const { return path; }
 
@@ -42,7 +42,7 @@ class MDirUpdate : public Message {
   MDirUpdate(mds_rank_t f, 
 	     dirfrag_t dirfrag,
              int dir_rep,
-             set<int>& dir_rep_by,
+             compact_set<int>& dir_rep_by,
              filepath& path,
              bool discover = false) :
     Message(MSG_MDS_DIRUPDATE) {
diff --git a/src/messages/MExportDirFinish.h b/src/messages/MExportDirFinish.h
index dd78dda..397abc5 100644
--- a/src/messages/MExportDirFinish.h
+++ b/src/messages/MExportDirFinish.h
@@ -25,7 +25,7 @@ class MExportDirFinish : public Message {
   dirfrag_t get_dirfrag() { return dirfrag; }
   bool is_last() { return last; }
   
-  MExportDirFinish() {}
+  MExportDirFinish() : last(false) {}
   MExportDirFinish(dirfrag_t df, bool l, uint64_t tid) :
     Message(MSG_MDS_EXPORTDIRFINISH), dirfrag(df), last(l) {
     set_tid(tid);
diff --git a/src/messages/MForward.h b/src/messages/MForward.h
index 92a7393..efe6085 100644
--- a/src/messages/MForward.h
+++ b/src/messages/MForward.h
@@ -25,35 +25,48 @@
 
 struct MForward : public Message {
   uint64_t tid;
-  PaxosServiceMessage *msg;
   entity_inst_t client;
   MonCap client_caps;
   uint64_t con_features;
   EntityName entity_name;
+  PaxosServiceMessage *msg;   // incoming message
+  bufferlist msg_bl;          // outgoing message
 
   static const int HEAD_VERSION = 3;
   static const int COMPAT_VERSION = 1;
 
   MForward() : Message(MSG_FORWARD, HEAD_VERSION, COMPAT_VERSION),
-               tid(0), msg(NULL), con_features(0) {}
+               tid(0), con_features(0), msg(NULL) {}
   //the message needs to have caps filled in!
   MForward(uint64_t t, PaxosServiceMessage *m, uint64_t feat) :
     Message(MSG_FORWARD, HEAD_VERSION, COMPAT_VERSION),
-    tid(t), msg(m) {
+    tid(t), msg(NULL) {
     client = m->get_source_inst();
     client_caps = m->get_session()->caps;
     con_features = feat;
+    set_message(m, feat);
   }
   MForward(uint64_t t, PaxosServiceMessage *m, uint64_t feat,
-	   const MonCap& caps) :
+           const MonCap& caps) :
     Message(MSG_FORWARD, HEAD_VERSION, COMPAT_VERSION),
-    tid(t), msg(m), client_caps(caps) {
+    tid(t), client_caps(caps), msg(NULL) {
     client = m->get_source_inst();
     con_features = feat;
+    set_message(m, feat);
   }
 private:
   ~MForward() {
-    if (msg) msg->put();
+    if (msg) {
+      // message was unclaimed
+      msg->put();
+      msg = NULL;
+    }
+  }
+
+  PaxosServiceMessage *get_msg_from_bl() {
+    bufferlist::iterator p = msg_bl.begin();
+    return (msg_bl.length() ?
+        (PaxosServiceMessage*)decode_message(NULL, 0, p) : NULL);
   }
 
 public:
@@ -61,7 +74,7 @@ public:
     ::encode(tid, payload);
     ::encode(client, payload);
     ::encode(client_caps, payload, features);
-    encode_message(msg, features, payload);
+    payload.append(msg_bl);
     ::encode(con_features, payload);
     ::encode(entity_name, payload);
   }
@@ -88,13 +101,35 @@ public:
 
   }
 
+  void set_message(PaxosServiceMessage *m, uint64_t features) {
+    // get a reference to the message.  We will not use it except for print(),
+    // and we will put it in the dtor if it is not claimed.
+    // we could avoid doing this if only we had a const bufferlist iterator :)
+    msg = (PaxosServiceMessage*)m->get();
+
+    encode_message(m, features, msg_bl);
+  }
+
+  PaxosServiceMessage *claim_message() {
+    if (!msg) {
+      return get_msg_from_bl();
+    }
+
+    // let whoever is claiming the message deal with putting it.
+    PaxosServiceMessage *m = msg;
+    msg = NULL;
+    return m;
+  }
+
   const char *get_type_name() const { return "forward"; }
   void print(ostream& o) const {
-    if (msg)
+    if (msg) {
       o << "forward(" << *msg << " caps " << client_caps
 	<< " tid " << tid
         << " con_features " << con_features << ") to leader";
-    else o << "forward(??? ) to leader";
+    } else {
+      o << "forward(??? ) to leader";
+    }
   }
 };
   
diff --git a/src/messages/MGetPoolStats.h b/src/messages/MGetPoolStats.h
index d004e29..b66a5b8 100644
--- a/src/messages/MGetPoolStats.h
+++ b/src/messages/MGetPoolStats.h
@@ -16,8 +16,6 @@
 #ifndef CEPH_MGETPOOLSTATS_H
 #define CEPH_MGETPOOLSTATS_H
 
-#include <uuid/uuid.h>
-
 #include "messages/PaxosServiceMessage.h"
 
 class MGetPoolStats : public PaxosServiceMessage {
diff --git a/src/messages/MGetPoolStatsReply.h b/src/messages/MGetPoolStatsReply.h
index 07b99cf..9dc2cbe 100644
--- a/src/messages/MGetPoolStatsReply.h
+++ b/src/messages/MGetPoolStatsReply.h
@@ -16,8 +16,6 @@
 #ifndef CEPH_MGETPOOLSTATSREPLY_H
 #define CEPH_MGETPOOLSTATSREPLY_H
 
-#include <uuid/uuid.h>
-
 class MGetPoolStatsReply : public PaxosServiceMessage {
 public:
   uuid_d fsid;
diff --git a/src/messages/MLog.h b/src/messages/MLog.h
index c71f56b..8b64e07 100644
--- a/src/messages/MLog.h
+++ b/src/messages/MLog.h
@@ -19,7 +19,6 @@
 #include "messages/PaxosServiceMessage.h"
 
 #include <deque>
-#include <uuid/uuid.h>
 
 class MLog : public PaxosServiceMessage {
 public:
diff --git a/src/messages/MLogAck.h b/src/messages/MLogAck.h
index 713cee0..414008c 100644
--- a/src/messages/MLogAck.h
+++ b/src/messages/MLogAck.h
@@ -15,8 +15,6 @@
 #ifndef CEPH_MLOGACK_H
 #define CEPH_MLOGACK_H
 
-#include <uuid/uuid.h>
-
 class MLogAck : public Message {
 public:
   uuid_d fsid;
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index 48d5d04..e04fe37 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -21,8 +21,6 @@
 
 #include "mds/MDSMap.h"
 
-#include <uuid/uuid.h>
-
 
 
 /**
@@ -36,7 +34,9 @@ enum mds_metric_t {
   MDS_HEALTH_CLIENT_RECALL,
   MDS_HEALTH_CLIENT_LATE_RELEASE,
   MDS_HEALTH_CLIENT_RECALL_MANY,
-  MDS_HEALTH_CLIENT_LATE_RELEASE_MANY
+  MDS_HEALTH_CLIENT_LATE_RELEASE_MANY,
+  MDS_HEALTH_CLIENT_OLDEST_TID,
+  MDS_HEALTH_CLIENT_OLDEST_TID_MANY,
 };
 
 /**
@@ -120,7 +120,7 @@ WRITE_CLASS_ENCODER(MDSHealth)
 
 class MMDSBeacon : public PaxosServiceMessage {
 
-  static const int HEAD_VERSION = 3;
+  static const int HEAD_VERSION = 4;
   static const int COMPAT_VERSION = 2;
 
   uuid_d fsid;
@@ -136,6 +136,8 @@ class MMDSBeacon : public PaxosServiceMessage {
 
   MDSHealth health;
 
+  map<string, string> sys_info;
+
  public:
   MMDSBeacon() : PaxosServiceMessage(MSG_MDS_BEACON, 0, HEAD_VERSION, COMPAT_VERSION) { }
   MMDSBeacon(const uuid_d &f, mds_gid_t g, string& n, epoch_t les, MDSMap::DaemonState st, version_t se) : 
@@ -167,6 +169,9 @@ public:
   void set_standby_for_name(string& n) { standby_for_name = n; }
   void set_standby_for_name(const char* c) { standby_for_name.assign(c); }
 
+  const map<string, string>& get_sys_info() const { return sys_info; }
+  void set_sys_info(const map<string, string>& i) { sys_info = i; }
+
   void print(ostream& out) const {
     out << "mdsbeacon(" << global_id << "/" << name << " " << ceph_mds_state_name(state) 
 	<< " seq " << seq << " v" << version << ")";
@@ -183,6 +188,9 @@ public:
     ::encode(standby_for_name, payload);
     ::encode(compat, payload);
     ::encode(health, payload);
+    if (state == MDSMap::STATE_BOOT) {
+      ::encode(sys_info, payload);
+    }
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
@@ -199,6 +207,10 @@ public:
     if (header.version >= 3) {
       ::decode(health, p);
     }
+    if (state == MDSMap::STATE_BOOT &&
+	header.version >= 4) {
+      ::decode(sys_info, p);
+    }
   }
 };
 
diff --git a/src/messages/MMDSMap.h b/src/messages/MMDSMap.h
index 66566d0..36b9e95 100644
--- a/src/messages/MMDSMap.h
+++ b/src/messages/MMDSMap.h
@@ -20,8 +20,6 @@
 #include "mds/MDSMap.h"
 #include "include/ceph_features.h"
 
-#include <uuid/uuid.h>
-
 class MMDSMap : public Message {
  public:
   /*
diff --git a/src/messages/MMDSOpenInoReply.h b/src/messages/MMDSOpenInoReply.h
index 1bf41b8..76cc55c 100644
--- a/src/messages/MMDSOpenInoReply.h
+++ b/src/messages/MMDSOpenInoReply.h
@@ -23,7 +23,7 @@ struct MMDSOpenInoReply : public Message {
   mds_rank_t hint;
   int32_t error;
 
-  MMDSOpenInoReply() : Message(MSG_MDS_OPENINOREPLY) {}
+  MMDSOpenInoReply() : Message(MSG_MDS_OPENINOREPLY), error(0) {}
   MMDSOpenInoReply(ceph_tid_t t, inodeno_t i, mds_rank_t h=MDS_RANK_NONE, int e=0) :
     Message(MSG_MDS_OPENINOREPLY), ino(i), hint(h), error(e) {
     header.tid = t;
diff --git a/src/messages/MMonMetadata.h b/src/messages/MMonMetadata.h
new file mode 100644
index 0000000..ae56611
--- /dev/null
+++ b/src/messages/MMonMetadata.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MMONMETADATA_H
+#define CEPH_MMONMETADATA_H
+
+#include "mon/mon_types.h"
+#include "msg/Message.h"
+
+class MMonMetadata : public Message {
+public:
+  Metadata data;
+
+private:
+  static const int HEAD_VERSION = 1;
+  ~MMonMetadata() {}
+
+public:
+  MMonMetadata() :
+    Message(CEPH_MSG_MON_METADATA)
+  {}
+  MMonMetadata(const Metadata& metadata) :
+    Message(CEPH_MSG_MON_METADATA, HEAD_VERSION),
+    data(metadata)
+  {}
+
+  virtual const char *get_type_name() const {
+    return "mon_metadata";
+  }
+
+  virtual void encode_payload(uint64_t features) {
+    ::encode(data, payload);
+  }
+
+  virtual void decode_payload() {
+    bufferlist::iterator p = payload.begin();
+    ::decode(data, p);
+  }
+};
+
+#endif
diff --git a/src/messages/MMonPaxos.h b/src/messages/MMonPaxos.h
index 206586c..b8a9d37 100644
--- a/src/messages/MMonPaxos.h
+++ b/src/messages/MMonPaxos.h
@@ -92,6 +92,8 @@ public:
   void encode_payload(uint64_t features) {
     if ((features & CEPH_FEATURE_MONCLOCKCHECK) == 0)
       header.version = 0;
+    else
+      header.version = HEAD_VERSION;
     ::encode(epoch, payload);
     ::encode(op, payload);
     ::encode(first_committed, payload);
diff --git a/src/messages/MMonScrub.h b/src/messages/MMonScrub.h
index b16728b..9bfa61a 100644
--- a/src/messages/MMonScrub.h
+++ b/src/messages/MMonScrub.h
@@ -18,7 +18,7 @@
 
 class MMonScrub : public Message
 {
-  static const int HEAD_VERSION = 1;
+  static const int HEAD_VERSION = 2;
   static const int COMPAT_VERSION = 1;
 
 public:
@@ -38,14 +38,17 @@ public:
   op_type_t op;
   version_t version;
   ScrubResult result;
+  int32_t num_keys;
+  pair<string,string> key;
 
   MMonScrub()
-    : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION)
+    : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION),
+      num_keys(-1)
   { }
 
-  MMonScrub(op_type_t op, version_t v)
+  MMonScrub(op_type_t op, version_t v, int32_t num_keys)
     : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION),
-      op(op), version(v)
+      op(op), version(v), num_keys(num_keys)
   { }
 
   const char *get_type_name() const { return "mon_scrub"; }
@@ -55,6 +58,8 @@ public:
     out << " v " << version;
     if (op == OP_RESULT)
       out << " " << result;
+    out << " num_keys " << num_keys;
+    out << " key (" << key << ")";
     out << ")";
   }
 
@@ -63,6 +68,8 @@ public:
     ::encode(o, payload);
     ::encode(version, payload);
     ::encode(result, payload);
+    ::encode(num_keys, payload);
+    ::encode(key, payload);
   }
 
   void decode_payload() {
@@ -72,6 +79,10 @@ public:
     op = (op_type_t)o;
     ::decode(version, p);
     ::decode(result, p);
+    if (header.version >= 2) {
+      ::decode(num_keys, p);
+      ::decode(key, p);
+    }
   }
 };
 
diff --git a/src/messages/MMonSubscribe.h b/src/messages/MMonSubscribe.h
index 0233010..2363775 100644
--- a/src/messages/MMonSubscribe.h
+++ b/src/messages/MMonSubscribe.h
@@ -74,6 +74,7 @@ public:
   void encode_payload(uint64_t features) {
     if (features & CEPH_FEATURE_SUBSCRIBE2) {
       ::encode(what, payload);
+      header.version = HEAD_VERSION;
     } else {
       header.version = 0;
       map<string, ceph_mon_subscribe_item_old> oldwhat;
diff --git a/src/messages/MOSDECSubOpWrite.h b/src/messages/MOSDECSubOpWrite.h
index a47bcef..b3a8e3c 100644
--- a/src/messages/MOSDECSubOpWrite.h
+++ b/src/messages/MOSDECSubOpWrite.h
@@ -35,9 +35,10 @@ public:
   MOSDECSubOpWrite()
     : Message(MSG_OSD_EC_WRITE, HEAD_VERSION, COMPAT_VERSION)
     {}
-  MOSDECSubOpWrite(ECSubWrite &op)
-  : Message(MSG_OSD_EC_WRITE, HEAD_VERSION, COMPAT_VERSION),
-    op(op) {}
+  MOSDECSubOpWrite(ECSubWrite &in_op)
+    : Message(MSG_OSD_EC_WRITE, HEAD_VERSION, COMPAT_VERSION) {
+    op.claim(in_op);
+  }
 
   virtual void decode_payload() {
     bufferlist::iterator p = payload.begin();
diff --git a/src/messages/MOSDMap.h b/src/messages/MOSDMap.h
index 9c15290..06e79de 100644
--- a/src/messages/MOSDMap.h
+++ b/src/messages/MOSDMap.h
@@ -80,6 +80,7 @@ public:
     }
   }
   void encode_payload(uint64_t features) {
+    header.version = HEAD_VERSION;
     ::encode(fsid, payload);
     if ((features & CEPH_FEATURE_PGID64) == 0 ||
 	(features & CEPH_FEATURE_PGPOOL3) == 0 ||
diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h
index 5b88f31..5d4ab62 100644
--- a/src/messages/MOSDOp.h
+++ b/src/messages/MOSDOp.h
@@ -32,7 +32,7 @@ class OSD;
 
 class MOSDOp : public Message {
 
-  static const int HEAD_VERSION = 5;
+  static const int HEAD_VERSION = 6;
   static const int COMPAT_VERSION = 3;
 
 private:
@@ -56,6 +56,8 @@ private:
 
   uint64_t features;
 
+  osd_reqid_t reqid; // reqid explicitly set by sender
+
 public:
   friend class MOSDOpReply;
 
@@ -69,11 +71,17 @@ public:
     snaps = i;
   }
   void set_snap_seq(const snapid_t& s) { snap_seq = s; }
+  void set_reqid(const osd_reqid_t rid) {
+    reqid = rid;
+  }
 
   osd_reqid_t get_reqid() const {
-    return osd_reqid_t(get_orig_source(),
-		       client_inc,
-		       header.tid);
+    if (reqid != osd_reqid_t())
+      return reqid;
+    else
+      return osd_reqid_t(get_orig_source(),
+                         client_inc,
+			 header.tid);
   }
   int get_client_inc() { return client_inc; }
   ceph_tid_t get_client_tid() { return header.tid; }
@@ -154,6 +162,7 @@ public:
 
   // flags
   int get_flags() const { return flags; }
+  bool has_flag(__u32 flag) { return flags & flag; };
 
   bool wants_ack() const { return flags & CEPH_OSD_FLAG_ACK; }
   bool wants_ondisk() const { return flags & CEPH_OSD_FLAG_ONDISK; }
@@ -240,6 +249,7 @@ struct ceph_osd_request_head {
       ::encode_nohead(oid.name, payload);
       ::encode_nohead(snaps, payload);
     } else {
+      header.version = HEAD_VERSION;
       ::encode(client_inc, payload);
       ::encode(osdmap_epoch, payload);
       ::encode(flags, payload);
@@ -261,6 +271,7 @@ struct ceph_osd_request_head {
 
       ::encode(retry_attempt, payload);
       ::encode(features, payload);
+      ::encode(reqid, payload);
     }
   }
 
@@ -308,6 +319,7 @@ struct ceph_osd_request_head {
 
       retry_attempt = -1;
       features = 0;
+      reqid = osd_reqid_t();
     } else {
       // new decode 
       ::decode(client_inc, p);
@@ -348,6 +360,11 @@ struct ceph_osd_request_head {
 	::decode(features, p);
       else
 	features = 0;
+
+      if (header.version >= 6)
+	::decode(reqid, p);
+      else
+	reqid = osd_reqid_t();
     }
 
     OSDOp::split_osd_op_vector_in_data(ops, data);
diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h
index b2d5155..45ec3d0 100644
--- a/src/messages/MOSDOpReply.h
+++ b/src/messages/MOSDOpReply.h
@@ -169,6 +169,7 @@ public:
       }
       ::encode_nohead(oid.name, payload);
     } else {
+      header.version = HEAD_VERSION;
       ::encode(oid, payload);
       ::encode(pgid, payload);
       ::encode(flags, payload);
diff --git a/src/messages/MOSDRepOp.h b/src/messages/MOSDRepOp.h
index 25cd09d..a4b0883 100644
--- a/src/messages/MOSDRepOp.h
+++ b/src/messages/MOSDRepOp.h
@@ -104,7 +104,8 @@ public:
   }
 
   MOSDRepOp()
-    : Message(MSG_OSD_REPOP, HEAD_VERSION, COMPAT_VERSION) { }
+    : Message(MSG_OSD_REPOP, HEAD_VERSION, COMPAT_VERSION),
+      map_epoch(0), acks_wanted (0) {}
   MOSDRepOp(osd_reqid_t r, pg_shard_t from,
 	    spg_t p, const hobject_t& po, int aw,
 	    epoch_t mape, ceph_tid_t rtid, eversion_t v)
diff --git a/src/messages/MOSDRepOpReply.h b/src/messages/MOSDRepOpReply.h
index 957502d..f0faa4c 100644
--- a/src/messages/MOSDRepOpReply.h
+++ b/src/messages/MOSDRepOpReply.h
@@ -94,7 +94,9 @@ public:
     result(result_) {
     set_tid(req->get_tid());
   }
-  MOSDRepOpReply() : Message(MSG_OSD_REPOPREPLY) {}
+  MOSDRepOpReply() 
+    : Message(MSG_OSD_REPOPREPLY), map_epoch(0),  
+      ack_type(0), result(0) {}
 private:
   ~MOSDRepOpReply() {}
 
diff --git a/src/messages/MOSDSubOp.h b/src/messages/MOSDSubOp.h
index 544dfcf..f746568 100644
--- a/src/messages/MOSDSubOp.h
+++ b/src/messages/MOSDSubOp.h
@@ -69,7 +69,7 @@ public:
   map<string,bufferlist> attrset;
 
   interval_set<uint64_t> data_subset;
-  map<hobject_t, interval_set<uint64_t> > clone_subsets;
+  map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator> clone_subsets;
 
   bool first, complete;
 
diff --git a/src/messages/MRoute.h b/src/messages/MRoute.h
index d7a826e..5282d39 100644
--- a/src/messages/MRoute.h
+++ b/src/messages/MRoute.h
@@ -60,6 +60,8 @@ public:
     ::encode(session_mon_tid, payload);
     ::encode(dest, payload);
     if (features & CEPH_FEATURE_MON_NULLROUTE) {
+      header.version = HEAD_VERSION;
+      header.compat_version = COMPAT_VERSION;
       bool m = msg ? true : false;
       ::encode(m, payload);
       if (msg)
diff --git a/src/messages/MStatfsReply.h b/src/messages/MStatfsReply.h
index 6bf42d8..59312ab 100644
--- a/src/messages/MStatfsReply.h
+++ b/src/messages/MStatfsReply.h
@@ -22,7 +22,7 @@ public:
 
   MStatfsReply() : Message(CEPH_MSG_STATFS_REPLY) {}
   MStatfsReply(uuid_d &f, ceph_tid_t t, epoch_t epoch) : Message(CEPH_MSG_STATFS_REPLY) {
-    memcpy(&h.fsid, f.uuid, sizeof(h.fsid));
+    memcpy(&h.fsid, f.bytes(), sizeof(h.fsid));
     header.tid = t;
     h.version = epoch;
   }
diff --git a/src/messages/Makefile.am b/src/messages/Makefile.am
index 5e962ce..d3f0251 100644
--- a/src/messages/Makefile.am
+++ b/src/messages/Makefile.am
@@ -65,6 +65,7 @@ noinst_HEADERS += \
 	messages/MMonHealth.h \
 	messages/MMonJoin.h \
 	messages/MMonMap.h \
+	messages/MMonMetadata.h \
 	messages/MMonPaxos.h \
 	messages/MMonProbe.h \
 	messages/MMonScrub.h \
diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc
index ee83f9d..730410e 100644
--- a/src/mon/AuthMonitor.cc
+++ b/src/mon/AuthMonitor.cc
@@ -277,45 +277,46 @@ version_t AuthMonitor::get_trim_to()
   return 0;
 }
 
-bool AuthMonitor::preprocess_query(PaxosServiceMessage *m)
+bool AuthMonitor::preprocess_query(MonOpRequestRef op)
 {
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return preprocess_command((MMonCommand*)m);
+    return preprocess_command(op);
 
   case CEPH_MSG_AUTH:
-    return prep_auth((MAuth *)m, false);
+    return prep_auth(op, false);
 
   case MSG_MON_GLOBAL_ID:
     return false;
 
   default:
     assert(0);
-    m->put();
     return true;
   }
 }
 
-bool AuthMonitor::prepare_update(PaxosServiceMessage *m)
+bool AuthMonitor::prepare_update(MonOpRequestRef op)
 {
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return prepare_command((MMonCommand*)m);
+    return prepare_command(op);
   case MSG_MON_GLOBAL_ID:
-    return prepare_global_id((MMonGlobalID*)m); 
+    return prepare_global_id(op);
   case CEPH_MSG_AUTH:
-    return prep_auth((MAuth *)m, true);
+    return prep_auth(op, true);
   default:
     assert(0);
-    m->put();
     return false;
   }
 }
 
-uint64_t AuthMonitor::assign_global_id(MAuth *m, bool should_increase_max)
+uint64_t AuthMonitor::assign_global_id(MonOpRequestRef op, bool should_increase_max)
 {
+  MAuth *m = static_cast<MAuth*>(op->get_req());
   int total_mon = mon->monmap->size();
   dout(10) << "AuthMonitor::assign_global_id m=" << *m << " mon=" << mon->rank << "/" << total_mon
 	   << " last_allocated=" << last_allocated_id << " max_global_id=" <<  max_global_id << dendl;
@@ -352,14 +353,14 @@ uint64_t AuthMonitor::assign_global_id(MAuth *m, bool should_increase_max)
 }
 
 
-bool AuthMonitor::prep_auth(MAuth *m, bool paxos_writable)
+bool AuthMonitor::prep_auth(MonOpRequestRef op, bool paxos_writable)
 {
+  MAuth *m = static_cast<MAuth*>(op->get_req());
   dout(10) << "prep_auth() blob_size=" << m->get_auth_payload().length() << dendl;
 
-  MonSession *s = (MonSession *)m->get_connection()->get_priv();
+  MonSession *s = op->get_session();
   if (!s) {
     dout(10) << "no session, dropping" << dendl;
-    m->put();
     return true;
   }
 
@@ -439,7 +440,7 @@ bool AuthMonitor::prep_auth(MAuth *m, bool paxos_writable)
      request. If a client tries to send it later, it'll screw up its auth
      session */
   if (!s->global_id) {
-    s->global_id = assign_global_id(m, paxos_writable);
+    s->global_id = assign_global_id(op, paxos_writable);
     if (!s->global_id) {
 
       delete s->auth_handler;
@@ -447,19 +448,17 @@ bool AuthMonitor::prep_auth(MAuth *m, bool paxos_writable)
 
       if (mon->is_leader() && paxos_writable) {
         dout(10) << "increasing global id, waitlisting message" << dendl;
-        wait_for_active(new C_RetryMessage(this, m));
+        wait_for_active(op, new C_RetryMessage(this, op));
         goto done;
       }
 
-      s->put();
-
       if (!mon->is_leader()) {
 	dout(10) << "not the leader, requesting more ids from leader" << dendl;
 	int leader = mon->get_leader();
 	MMonGlobalID *req = new MMonGlobalID();
 	req->old_max_id = max_global_id;
 	mon->messenger->send_message(req, mon->monmap->get_inst(leader));
-	wait_for_finished_proposal(new C_RetryMessage(this, m));
+	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
 	return true;
       }
 
@@ -486,7 +485,7 @@ bool AuthMonitor::prep_auth(MAuth *m, bool paxos_writable)
       ret = s->auth_handler->handle_request(indata, response_bl, s->global_id, caps_info, &auid);
     }
     if (ret == -EIO) {
-      wait_for_active(new C_RetryMessage(this,m));
+      wait_for_active(op, new C_RetryMessage(this,op));
       goto done;
     }
     if (caps_info.caps.length()) {
@@ -508,15 +507,14 @@ bool AuthMonitor::prep_auth(MAuth *m, bool paxos_writable)
 
 reply:
   reply = new MAuthReply(proto, &response_bl, ret, s->global_id);
-  mon->send_reply(m, reply);
-  m->put();
+  mon->send_reply(op, reply);
 done:
-  s->put();
   return true;
 }
 
-bool AuthMonitor::preprocess_command(MMonCommand *m)
+bool AuthMonitor::preprocess_command(MonOpRequestRef op)
 {
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = -1;
   bufferlist rdata;
   stringstream ss, ds;
@@ -525,7 +523,7 @@ bool AuthMonitor::preprocess_command(MMonCommand *m)
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     // ss has reason for failure
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, rdata, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
     return true;
   }
 
@@ -542,7 +540,7 @@ bool AuthMonitor::preprocess_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", rdata, get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
     return true;
   }
 
@@ -552,7 +550,7 @@ bool AuthMonitor::preprocess_command(MMonCommand *m)
   EntityName entity;
   if (!entity_name.empty() && !entity.from_str(entity_name)) {
     ss << "invalid entity_auth " << entity_name;
-    mon->reply_command(m, -EINVAL, ss.str(), get_last_committed());
+    mon->reply_command(op, -EINVAL, ss.str(), get_last_committed());
     return true;
   }
 
@@ -638,7 +636,7 @@ bool AuthMonitor::preprocess_command(MMonCommand *m)
   rdata.append(ds);
   string rs;
   getline(ss, rs, '\0');
-  mon->reply_command(m, r, rs, rdata, get_last_committed());
+  mon->reply_command(op, r, rs, rdata, get_last_committed());
   return true;
 }
 
@@ -662,8 +660,9 @@ void AuthMonitor::import_keyring(KeyRing& keyring)
   }
 }
 
-bool AuthMonitor::prepare_command(MMonCommand *m)
+bool AuthMonitor::prepare_command(MonOpRequestRef op)
 {
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   stringstream ss, ds;
   bufferlist rdata;
   string rs;
@@ -673,7 +672,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     // ss has reason for failure
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, rdata, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
     return true;
   }
 
@@ -690,7 +689,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", rdata, get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
     return true;
   }
 
@@ -713,7 +712,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
     if (bl.length() == 0) {
       ss << "auth import: no data supplied";
       getline(ss, rs);
-      mon->reply_command(m, -EINVAL, rs, get_last_committed());
+      mon->reply_command(op, -EINVAL, rs, get_last_committed());
       return true;
     }
     bufferlist::iterator iter = bl.begin();
@@ -729,7 +728,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
     ss << "imported keyring";
     getline(ss, rs);
     err = 0;
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "auth add" && !entity_name.empty()) {
@@ -766,8 +765,8 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
         ::decode(inc, q);
         if (inc.op == KeyServerData::AUTH_INC_ADD &&
             inc.name == entity) {
-          wait_for_finished_proposal(
-              new Monitor::C_Command(mon, m, 0, rs, get_last_committed() + 1));
+          wait_for_finished_proposal(op,
+              new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
           return true;
         }
       }
@@ -852,7 +851,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 
     ss << "added key for " << auth_inc.name;
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						   get_last_committed() + 1));
     return true;
   } else if ((prefix == "auth get-or-create-key" ||
@@ -865,17 +864,25 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
       goto done;
     }
 
+    // Parse the list of caps into a map
+    std::map<std::string, bufferlist> wanted_caps;
+    for (vector<string>::const_iterator it = caps_vec.begin();
+	 it != caps_vec.end() && (it + 1) != caps_vec.end();
+	 it += 2) {
+      const std::string &sys = *it;
+      bufferlist cap;
+      ::encode(*(it+1), cap);
+      wanted_caps[sys] = cap;
+    }
+
     // do we have it?
     EntityAuth entity_auth;
     if (mon->key_server.get_auth(entity, entity_auth)) {
-      for (vector<string>::iterator it = caps_vec.begin();
-	   it != caps_vec.end(); it += 2) {
-	string sys = *it;
-	bufferlist cap;
-	::encode(*(it+1), cap);
-	if (entity_auth.caps.count(sys) == 0 ||
-	    !entity_auth.caps[sys].contents_equal(cap)) {
-	  ss << "key for " << entity << " exists but cap " << sys << " does not match";
+      for (const auto &sys_cap : wanted_caps) {
+	if (entity_auth.caps.count(sys_cap.first) == 0 ||
+	    !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) {
+	  ss << "key for " << entity << " exists but cap " << sys_cap.first
+            << " does not match";
 	  err = -EINVAL;
 	  goto done;
 	}
@@ -891,6 +898,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 	KeyRing kr;
 	kr.add(entity, entity_auth.key);
         if (f) {
+          kr.set_caps(entity, entity_auth.caps);
           kr.encode_formatted("auth", f.get(), rdata);
         } else {
           kr.encode_plaintext(rdata);
@@ -910,7 +918,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 	::decode(auth_inc, q);
 	if (auth_inc.op == KeyServerData::AUTH_INC_ADD &&
 	    auth_inc.name == entity) {
-	  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+	  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						get_last_committed() + 1));
 	  return true;
 	}
@@ -922,9 +930,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
     auth_inc.op = KeyServerData::AUTH_INC_ADD;
     auth_inc.name = entity;
     auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
-    for (vector<string>::iterator it = caps_vec.begin();
-	 it != caps_vec.end(); it += 2)
-      ::encode(*(it+1), auth_inc.auth.caps[*it]);
+    auth_inc.auth.caps = wanted_caps;
 
     push_cephx_inc(auth_inc);
 
@@ -938,6 +944,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
       KeyRing kr;
       kr.add(entity, auth_inc.auth.key);
       if (f) {
+        kr.set_caps(entity, wanted_caps);
         kr.encode_formatted("auth", f.get(), rdata);
       } else {
         kr.encode_plaintext(rdata);
@@ -946,7 +953,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 
     rdata.append(ds);
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, rdata,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, rdata,
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "auth caps" && !entity_name.empty()) {
@@ -974,7 +981,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 
     ss << "updated caps for " << auth_inc.name;
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "auth del" && !entity_name.empty()) {
@@ -990,7 +997,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 
     ss << "updated";
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
   }
@@ -998,16 +1005,15 @@ bool AuthMonitor::prepare_command(MMonCommand *m)
 done:
   rdata.append(ds);
   getline(ss, rs, '\0');
-  mon->reply_command(m, err, rs, rdata, get_last_committed());
+  mon->reply_command(op, err, rs, rdata, get_last_committed());
   return false;
 }
 
-bool AuthMonitor::prepare_global_id(MMonGlobalID *m)
+bool AuthMonitor::prepare_global_id(MonOpRequestRef op)
 {
   dout(10) << "AuthMonitor::prepare_global_id" << dendl;
   increase_max_global_id();
 
-  m->put();
   return true;
 }
 
diff --git a/src/mon/AuthMonitor.h b/src/mon/AuthMonitor.h
index d66de24..4420290 100644
--- a/src/mon/AuthMonitor.h
+++ b/src/mon/AuthMonitor.h
@@ -143,21 +143,21 @@ private:
   void create_initial();
   void update_from_paxos(bool *need_bootstrap);
   void create_pending();  // prepare a new pending
-  bool prepare_global_id(MMonGlobalID *m);
+  bool prepare_global_id(MonOpRequestRef op);
   void increase_max_global_id();
-  uint64_t assign_global_id(MAuth *m, bool should_increase_max);
+  uint64_t assign_global_id(MonOpRequestRef op, bool should_increase_max);
   // propose pending update to peers
   void encode_pending(MonitorDBStore::TransactionRef t);
   virtual void encode_full(MonitorDBStore::TransactionRef t);
   version_t get_trim_to();
 
-  bool preprocess_query(PaxosServiceMessage *m);  // true if processed.
-  bool prepare_update(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op);  // true if processed.
+  bool prepare_update(MonOpRequestRef op);
 
-  bool prep_auth(MAuth *m, bool paxos_writable);
+  bool prep_auth(MonOpRequestRef op, bool paxos_writable);
 
-  bool preprocess_command(MMonCommand *m);
-  bool prepare_command(MMonCommand *m);
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
 
   bool check_rotate();
  public:
diff --git a/src/mon/ConfigKeyService.cc b/src/mon/ConfigKeyService.cc
index 97126ed..64ed42f 100644
--- a/src/mon/ConfigKeyService.cc
+++ b/src/mon/ConfigKeyService.cc
@@ -88,16 +88,17 @@ void ConfigKeyService::store_list(stringstream &ss)
 }
 
 
-bool ConfigKeyService::service_dispatch(Message *m)
+bool ConfigKeyService::service_dispatch(MonOpRequestRef op)
 {
+  Message *m = op->get_req();
+  assert(m != NULL);
   dout(10) << __func__ << " " << *m << dendl;
+
   if (!in_quorum()) {
     dout(1) << __func__ << " not in quorum -- ignore message" << dendl;
-    m->put();
     return false;
   }
 
-  assert(m != NULL);
   assert(m->get_type() == MSG_MON_COMMAND);
 
   MMonCommand *cmd = static_cast<MMonCommand*>(m);
@@ -131,7 +132,7 @@ bool ConfigKeyService::service_dispatch(Message *m)
 
   } else if (prefix == "config-key put") {
     if (!mon->is_leader()) {
-      mon->forward_request_leader(cmd);
+      mon->forward_request_leader(op);
       // we forward the message; so return now.
       return true;
     }
@@ -154,13 +155,13 @@ bool ConfigKeyService::service_dispatch(Message *m)
     }
     // we'll reply to the message once the proposal has been handled
     store_put(key, data,
-        new Monitor::C_Command(mon, cmd, 0, "value stored", 0));
+        new Monitor::C_Command(mon, op, 0, "value stored", 0));
     // return for now; we'll put the message once it's done.
     return true;
 
   } else if (prefix == "config-key del") {
     if (!mon->is_leader()) {
-      mon->forward_request_leader(cmd);
+      mon->forward_request_leader(op);
       return true;
     }
 
@@ -169,7 +170,7 @@ bool ConfigKeyService::service_dispatch(Message *m)
       ss << "no such key '" << key << "'";
       goto out;
     }
-    store_delete(key, new Monitor::C_Command(mon, cmd, 0, "key deleted", 0));
+    store_delete(key, new Monitor::C_Command(mon, op, 0, "key deleted", 0));
     // return for now; we'll put the message once it's done
     return true;
 
@@ -194,9 +195,7 @@ bool ConfigKeyService::service_dispatch(Message *m)
 out:
   if (!cmd->get_source().is_mon()) {
     string rs = ss.str();
-    mon->reply_command(cmd, ret, rs, rdata, 0);
-  } else {
-    cmd->put();
+    mon->reply_command(op, ret, rs, rdata, 0);
   }
 
   return (ret == 0);
diff --git a/src/mon/ConfigKeyService.h b/src/mon/ConfigKeyService.h
index e33070b..0ceface 100644
--- a/src/mon/ConfigKeyService.h
+++ b/src/mon/ConfigKeyService.h
@@ -55,7 +55,7 @@ public:
   virtual void get_health(Formatter *f,
 			  list<pair<health_status_t,string> >& summary,
                           list<pair<health_status_t,string> > *detail) { }
-  virtual bool service_dispatch(Message *m);
+  virtual bool service_dispatch(MonOpRequestRef op);
 
   virtual void start_epoch() { }
   virtual void finish_epoch() { }
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index ef1e0e5..3056a5e 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -230,34 +230,36 @@ void DataHealthService::service_tick()
   }
 }
 
-void DataHealthService::handle_tell(MMonHealth *m)
+void DataHealthService::handle_tell(MonOpRequestRef op)
 {
+  op->mark_event("datahealth:handle_tell");
+  MMonHealth *m = static_cast<MMonHealth*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   assert(m->get_service_op() == MMonHealth::OP_TELL);
 
   stats[m->get_source_inst()] = m->data_stats;
 }
 
-bool DataHealthService::service_dispatch(MMonHealth *m)
+bool DataHealthService::service_dispatch_op(MonOpRequestRef op)
 {
+  op->mark_event("datahealth:service_dispatch_op");
+  MMonHealth *m = static_cast<MMonHealth*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   assert(m->get_service_type() == get_type());
   if (!in_quorum()) {
     dout(1) << __func__ << " not in quorum -- drop message" << dendl;
-    m->put();
     return false;
   }
 
   switch (m->service_op) {
     case MMonHealth::OP_TELL:
       // someone is telling us their stats
-      handle_tell(m);
+      handle_tell(op);
       break;
     default:
       dout(0) << __func__ << " unknown op " << m->service_op << dendl;
       assert(0 == "Unknown service op");
       break;
   }
-  m->put();
   return true;
 }
diff --git a/src/mon/DataHealthService.h b/src/mon/DataHealthService.h
index 221e179..a986d18 100644
--- a/src/mon/DataHealthService.h
+++ b/src/mon/DataHealthService.h
@@ -33,7 +33,7 @@ class DataHealthService :
   map<entity_inst_t,DataStats> stats;
   int last_warned_percent;
 
-  void handle_tell(MMonHealth *m);
+  void handle_tell(MonOpRequestRef op);
   int update_store_stats(DataStats &ours);
   int update_stats();
   void share_stats();
@@ -45,11 +45,7 @@ class DataHealthService :
 
 protected:
   virtual void service_tick();
-  virtual bool service_dispatch(Message *m) {
-    assert(0 == "We should never reach this; only the function below");
-    return false;
-  }
-  virtual bool service_dispatch(MMonHealth *m);
+  virtual bool service_dispatch_op(MonOpRequestRef op);
   virtual void service_shutdown() { }
 
   virtual void start_epoch();
diff --git a/src/mon/DumplingMonCommands.h b/src/mon/DumplingMonCommands.h
index 8e9c2bb..7cce6ab 100644
--- a/src/mon/DumplingMonCommands.h
+++ b/src/mon/DumplingMonCommands.h
@@ -332,8 +332,9 @@ COMMAND("osd find " \
 	"osd", "r", "cli,rest")
 COMMAND("osd map " \
 	"name=pool,type=CephPoolname " \
-	"name=object,type=CephObjectname", \
-	"find pg for <object> in <pool>", "osd", "r", "cli,rest")
+	"name=object,type=CephObjectname " \
+	"name=nspace,type=CephString,req=false", \
+	"find pg for <object> in <pool> with [namespace]", "osd", "r", "cli,rest")
 COMMAND("osd scrub " \
 	"name=who,type=CephString", \
 	"initiate scrub on osd <who>", "osd", "rw", "cli,rest")
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index 6f54cb4..3bec0ef 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -78,8 +78,15 @@ void Elector::start()
   init();
   
   // start by trying to elect me
-  if (epoch % 2 == 0) 
+  if (epoch % 2 == 0) {
     bump_epoch(epoch+1);  // odd == election cycle
+  } else {
+    // do a trivial db write just to ensure it is writeable.
+    MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
+    t->put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
+    int r = mon->store->apply_transaction(t);
+    assert(r >= 0);
+  }
   start_stamp = ceph_clock_now(g_ceph_context);
   electing_me = true;
   acked_me[mon->rank] = CEPH_FEATURES_ALL;
@@ -207,8 +214,10 @@ void Elector::victory()
 }
 
 
-void Elector::handle_propose(MMonElection *m)
+void Elector::handle_propose(MonOpRequestRef op)
 {
+  op->mark_event("elector:handle_propose");
+  MMonElection *m = static_cast<MMonElection*>(op->get_req());
   dout(5) << "handle_propose from " << m->get_source() << dendl;
   int from = m->get_source().num();
 
@@ -221,7 +230,7 @@ void Elector::handle_propose(MMonElection *m)
       required_features) {
     dout(5) << " ignoring propose from mon" << from
 	    << " without required features" << dendl;
-    nak_old_peer(m);
+    nak_old_peer(op);
     return;
   } else if (m->epoch > epoch) {
     bump_epoch(m->epoch);
@@ -236,7 +245,6 @@ void Elector::handle_propose(MMonElection *m)
       mon->start_election();
     } else {
       dout(5) << " ignoring old propose" << dendl;
-      m->put();
       return;
     }
   }
@@ -263,12 +271,12 @@ void Elector::handle_propose(MMonElection *m)
       dout(5) << "no, we already acked " << leader_acked << dendl;
     }
   }
-  
-  m->put();
 }
  
-void Elector::handle_ack(MMonElection *m)
+void Elector::handle_ack(MonOpRequestRef op)
 {
+  op->mark_event("elector:handle_ack");
+  MMonElection *m = static_cast<MMonElection*>(op->get_req());
   dout(5) << "handle_ack from " << m->get_source() << dendl;
   int from = m->get_source().num();
 
@@ -277,7 +285,6 @@ void Elector::handle_ack(MMonElection *m)
     dout(5) << "woah, that's a newer epoch, i must have rebooted.  bumping and re-starting!" << dendl;
     bump_epoch(m->epoch);
     start();
-    m->put();
     return;
   }
   assert(m->epoch == epoch);
@@ -286,7 +293,6 @@ void Elector::handle_ack(MMonElection *m)
       required_features) {
     dout(5) << " ignoring ack from mon" << from
 	    << " without required features" << dendl;
-    m->put();
     return;
   }
   
@@ -306,13 +312,13 @@ void Elector::handle_ack(MMonElection *m)
     // ignore, i'm deferring already.
     assert(leader_acked >= 0);
   }
-  
-  m->put();
 }
 
 
-void Elector::handle_victory(MMonElection *m)
+void Elector::handle_victory(MonOpRequestRef op)
 {
+  op->mark_event("elector:handle_victory");
+  MMonElection *m = static_cast<MMonElection*>(op->get_req());
   dout(5) << "handle_victory from " << m->get_source() << " quorum_features " << m->quorum_features << dendl;
   int from = m->get_source().num();
 
@@ -326,7 +332,6 @@ void Elector::handle_victory(MMonElection *m)
     dout(5) << "woah, that's a funny epoch, i must have rebooted.  bumping and re-starting!" << dendl;
     bump_epoch(m->epoch);
     start();
-    m->put();
     return;
   }
 
@@ -351,12 +356,12 @@ void Elector::handle_victory(MMonElection *m)
     mon->get_classic_monitor_commands(&new_cmds, &cmdsize);
     mon->set_leader_supported_commands(new_cmds, cmdsize);
   }
-
-  m->put();
 }
 
-void Elector::nak_old_peer(MMonElection *m)
+void Elector::nak_old_peer(MonOpRequestRef op)
 {
+  op->mark_event("elector:nak_old_peer");
+  MMonElection *m = static_cast<MMonElection*>(op->get_req());
   uint64_t supported_features = m->get_connection()->get_features();
 
   if (supported_features & CEPH_FEATURE_OSDMAP_ENC) {
@@ -371,11 +376,12 @@ void Elector::nak_old_peer(MMonElection *m)
     mon->features.encode(reply->sharing_bl);
     m->get_connection()->send_message(reply);
   }
-  m->put();
 }
 
-void Elector::handle_nak(MMonElection *m)
+void Elector::handle_nak(MonOpRequestRef op)
 {
+  op->mark_event("elector:handle_nak");
+  MMonElection *m = static_cast<MMonElection*>(op->get_req());
   dout(1) << "handle_nak from " << m->get_source()
 	  << " quorum_features " << m->quorum_features << dendl;
 
@@ -391,44 +397,43 @@ void Elector::handle_nak(MMonElection *m)
   // the end!
 }
 
-void Elector::dispatch(Message *m)
+void Elector::dispatch(MonOpRequestRef op)
 {
-  switch (m->get_type()) {
+  op->mark_event("elector:dispatch");
+  assert(op->is_type_election());
+
+  switch (op->get_req()->get_type()) {
     
   case MSG_MON_ELECTION:
     {
       if (!participating) {
-        m->put();
         return;
       }
-      if (m->get_source().num() >= mon->monmap->size()) {
+      if (op->get_req()->get_source().num() >= mon->monmap->size()) {
 	dout(5) << " ignoring bogus election message with bad mon rank " 
-		<< m->get_source() << dendl;
-	m->put();
+		<< op->get_req()->get_source() << dendl;
 	return;
       }
 
-      MMonElection *em = static_cast<MMonElection*>(m);
+      MMonElection *em = static_cast<MMonElection*>(op->get_req());
 
       // assume an old message encoding would have matched
       if (em->fsid != mon->monmap->fsid) {
 	dout(0) << " ignoring election msg fsid " 
 		<< em->fsid << " != " << mon->monmap->fsid << dendl;
-	m->put();
 	return;
       }
 
-      if (!mon->monmap->contains(m->get_source_addr())) {
-	dout(1) << "discarding election message: " << m->get_source_addr()
+      if (!mon->monmap->contains(em->get_source_addr())) {
+	dout(1) << "discarding election message: " << em->get_source_addr()
 		<< " not in my monmap " << *mon->monmap << dendl;
-	m->put();
 	return;
       }
 
       MonMap *peermap = new MonMap;
       peermap->decode(em->monmap_bl);
       if (peermap->epoch > mon->monmap->epoch) {
-	dout(0) << m->get_source_inst() << " has newer monmap epoch " << peermap->epoch
+	dout(0) << em->get_source_inst() << " has newer monmap epoch " << peermap->epoch
 		<< " > my epoch " << mon->monmap->epoch 
 		<< ", taking it"
 		<< dendl;
@@ -440,12 +445,11 @@ void Elector::dispatch(Message *m)
 	//mon->monmon()->paxos->stash_latest(mon->monmap->epoch, em->monmap_bl);
 	cancel_timer();
 	mon->bootstrap();
-	m->put();
 	delete peermap;
 	return;
       }
       if (peermap->epoch < mon->monmap->epoch) {
-	dout(0) << m->get_source_inst() << " has older monmap epoch " << peermap->epoch
+	dout(0) << em->get_source_inst() << " has older monmap epoch " << peermap->epoch
 		<< " < my epoch " << mon->monmap->epoch 
 		<< dendl;
       } 
@@ -453,25 +457,24 @@ void Elector::dispatch(Message *m)
 
       switch (em->op) {
       case MMonElection::OP_PROPOSE:
-	handle_propose(em);
+	handle_propose(op);
 	return;
       }
 
       if (em->epoch < epoch) {
 	dout(5) << "old epoch, dropping" << dendl;
-	em->put();
 	break;
       }
 
       switch (em->op) {
       case MMonElection::OP_ACK:
-	handle_ack(em);
+	handle_ack(op);
 	return;
       case MMonElection::OP_VICTORY:
-	handle_victory(em);
+	handle_victory(op);
 	return;
       case MMonElection::OP_NAK:
-	handle_nak(em);
+	handle_nak(op);
 	return;
       default:
 	assert(0);
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
index 007fc11..ab84d0b 100644
--- a/src/mon/Elector.h
+++ b/src/mon/Elector.h
@@ -25,6 +25,7 @@ using namespace std;
 #include "include/Context.h"
 
 #include "common/Timer.h"
+#include "mon/MonOpRequest.h"
 
 class Monitor;
 
@@ -56,7 +57,7 @@ class Elector {
    *
    * @remarks This function assumes as a default firing value the duration of
    *	      the monitor's lease interval, and adds to it the value specified
-   *	      in @plus
+   *	      in @e plus
    *
    * @post expire_event is set
    *
@@ -127,7 +128,7 @@ class Elector {
    */
   int	    leader_acked;
   /**
-   * Indicates when we have acked him
+   * Indicates when we have acked it
    */
   utime_t   ack_stamp;
   /**
@@ -245,7 +246,7 @@ class Elector {
   void victory();
 
   /**
-   * Handle a message from some other node proposing himself to become him
+   * Handle a message from some other node proposing itself to become it
    * the Leader.
    *
    * If the message appears to be old (i.e., its epoch is lower than our epoch),
@@ -253,16 +254,16 @@ class Elector {
    *
    *  @li Ignore it because it's nothing more than an old proposal
    *  @li Start new elections if we verify that it was sent by a monitor from
-   *	  outside the quorum; given its old state, it's fair to assume he just
-   *	  started, so we should start new elections so he may rejoin
+   *	  outside the quorum; given its old state, it's fair to assume it just
+   *	  started, so we should start new elections so it may rejoin
    *
    * If we did not ignore the received message, then we know that this message
-   * was sent by some other node proposing himself to become the Leader. So, we
+   * was sent by some other node proposing itself to become the Leader. So, we
    * will take one of the following actions:
    *
-   *  @li Ignore him because we already acked another node with higher rank
-   *  @li Ignore him and start a new election because we outrank him
-   *  @li Defer to him because he outranks us and the node we previously
+   *  @li Ignore it because we already acked another node with higher rank
+   *  @li Ignore it and start a new election because we outrank it
+   *  @li Defer to it because it outranks us and the node we previously
    *	  acked, if any
    *
    *
@@ -270,7 +271,7 @@ class Elector {
    *
    * @param m A message sent by another participant in the quorum.
    */
-  void handle_propose(class MMonElection *m);
+  void handle_propose(MonOpRequestRef op);
   /**
    * Handle a message from some other participant Acking us as the Leader.
    *
@@ -293,7 +294,7 @@ class Elector {
    *
    * @param m A message with an operation type of OP_ACK
    */
-  void handle_ack(class MMonElection *m);
+  void handle_ack(MonOpRequestRef op);
   /**
    * Handle a message from some other participant declaring Victory.
    *
@@ -314,7 +315,7 @@ class Elector {
    *
    * @param m A message with an operation type of OP_VICTORY
    */
-  void handle_victory(class MMonElection *m);
+  void handle_victory(MonOpRequestRef op);
   /**
    * Send a nak to a peer who's out of date, containing information about why.
    *
@@ -326,7 +327,7 @@ class Elector {
    * @param m A message from a monitor not supporting required features. We
    * take ownership of the reference.
    */
-  void nak_old_peer(class MMonElection *m);
+  void nak_old_peer(MonOpRequestRef op);
   /**
    * Handle a message from some other participant declaring
    * we cannot join the quorum.
@@ -339,7 +340,7 @@ class Elector {
    *
    * @param m A message with an operation type of OP_NAK
    */
-  void handle_nak(class MMonElection *m);
+  void handle_nak(MonOpRequestRef op);
   
  public:
   /**
@@ -398,7 +399,7 @@ class Elector {
    *
    * @param m A received message
    */
-  void dispatch(Message *m);
+  void dispatch(MonOpRequestRef op);
 
   /**
    * Call an election.
diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc
index 7cba39b..24ac84c 100644
--- a/src/mon/HealthMonitor.cc
+++ b/src/mon/HealthMonitor.cc
@@ -53,18 +53,17 @@ void HealthMonitor::init()
   }
 }
 
-bool HealthMonitor::service_dispatch(Message *m)
+bool HealthMonitor::service_dispatch(MonOpRequestRef op)
 {
-  assert(m->get_type() == MSG_MON_HEALTH);
-  MMonHealth *hm = (MMonHealth*)m;
+  assert(op->get_req()->get_type() == MSG_MON_HEALTH);
+  MMonHealth *hm = static_cast<MMonHealth*>(op->get_req());
   int service_type = hm->get_service_type();
   if (services.count(service_type) == 0) {
     dout(1) << __func__ << " service type " << service_type
             << " not registered -- drop message!" << dendl;
-    m->put();
     return false;
   }
-  return services[service_type]->service_dispatch(hm);
+  return services[service_type]->service_dispatch(op);
 }
 
 void HealthMonitor::service_shutdown()
diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h
index 3d84261..4389876 100644
--- a/src/mon/HealthMonitor.h
+++ b/src/mon/HealthMonitor.h
@@ -45,7 +45,7 @@ public:
   virtual void get_health(Formatter *f,
 		     list<pair<health_status_t,string> >& summary,
 		     list<pair<health_status_t,string> > *detail);
-  virtual bool service_dispatch(Message *m);
+  virtual bool service_dispatch(MonOpRequestRef op);
 
   virtual void start_epoch() {
     for (map<int,HealthService*>::iterator it = services.begin();
diff --git a/src/mon/HealthService.h b/src/mon/HealthService.h
index 2a46f88..7b3d7ac 100644
--- a/src/mon/HealthService.h
+++ b/src/mon/HealthService.h
@@ -30,11 +30,11 @@ struct HealthService : public QuorumService
   HealthService(Monitor *m) : QuorumService(m) { }
   virtual ~HealthService() { }
 
-  virtual bool service_dispatch(Message *m) {
-    return service_dispatch(static_cast<MMonHealth*>(m));
+  virtual bool service_dispatch(MonOpRequestRef op) {
+    return service_dispatch_op(op);
   }
 
-  virtual bool service_dispatch(MMonHealth *m) = 0;
+  virtual bool service_dispatch_op(MonOpRequestRef op) = 0;
 
 public:
   virtual void get_health(Formatter *f,
diff --git a/src/mon/LogMonitor.cc b/src/mon/LogMonitor.cc
index aa1f674..b5ab447 100644
--- a/src/mon/LogMonitor.cc
+++ b/src/mon/LogMonitor.cc
@@ -12,6 +12,8 @@
  * 
  */
 
+#include <boost/algorithm/string/predicate.hpp>
+
 #include <sstream>
 #include <syslog.h>
 
@@ -28,6 +30,7 @@
 #include "osd/osd_types.h"
 #include "common/errno.h"
 #include "common/config.h"
+#include "common/strtol.h"
 #include "include/assert.h"
 #include "include/str_list.h"
 #include "include/str_map.h"
@@ -138,7 +141,7 @@ void LogMonitor::update_from_paxos(bool *need_bootstrap)
 
       if (channels.do_log_to_syslog(channel)) {
         string level = channels.get_level(channel);
-        string facility = channels.get_facility(facility);
+        string facility = channels.get_facility(channel);
         if (level.empty() || facility.empty()) {
           derr << __func__ << " unable to log to syslog -- level or facility"
                << " not defined (level: " << level << ", facility: "
@@ -264,40 +267,44 @@ version_t LogMonitor::get_trim_to()
   return 0;
 }
 
-bool LogMonitor::preprocess_query(PaxosServiceMessage *m)
+bool LogMonitor::preprocess_query(MonOpRequestRef op)
 {
+  op->mark_logmon_event("preprocess_query");
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return preprocess_command(static_cast<MMonCommand*>(m));
+    return preprocess_command(op);
 
   case MSG_LOG:
-    return preprocess_log((MLog*)m);
+    return preprocess_log(op);
 
   default:
     assert(0);
-    m->put();
     return true;
   }
 }
 
-bool LogMonitor::prepare_update(PaxosServiceMessage *m)
+bool LogMonitor::prepare_update(MonOpRequestRef op)
 {
+  op->mark_logmon_event("prepare_update");
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return prepare_command(static_cast<MMonCommand*>(m));
+    return prepare_command(op);
   case MSG_LOG:
-    return prepare_log((MLog*)m);
+    return prepare_log(op);
   default:
     assert(0);
-    m->put();
     return false;
   }
 }
 
-bool LogMonitor::preprocess_log(MLog *m)
+bool LogMonitor::preprocess_log(MonOpRequestRef op)
 {
+  op->mark_logmon_event("preprocess_log");
+  MLog *m = static_cast<MLog*>(op->get_req());
   dout(10) << "preprocess_log " << *m << " from " << m->get_orig_source() << dendl;
   int num_new = 0;
 
@@ -324,18 +331,18 @@ bool LogMonitor::preprocess_log(MLog *m)
   return false;
 
  done:
-  m->put();
   return true;
 }
 
-bool LogMonitor::prepare_log(MLog *m) 
+bool LogMonitor::prepare_log(MonOpRequestRef op) 
 {
+  op->mark_logmon_event("prepare_log");
+  MLog *m = static_cast<MLog*>(op->get_req());
   dout(10) << "prepare_log " << *m << " from " << m->get_orig_source() << dendl;
 
   if (m->fsid != mon->monmap->fsid) {
     dout(0) << "handle_log on fsid " << m->fsid << " != " << mon->monmap->fsid 
 	    << dendl;
-    m->put();
     return false;
   }
 
@@ -348,16 +355,15 @@ bool LogMonitor::prepare_log(MLog *m)
       pending_log.insert(pair<utime_t,LogEntry>(p->stamp, *p));
     }
   }
-  wait_for_finished_proposal(new C_Log(this, m));
+  wait_for_finished_proposal(op, new C_Log(this, op));
   return true;
 }
 
-void LogMonitor::_updated_log(MLog *m)
+void LogMonitor::_updated_log(MonOpRequestRef op)
 {
+  MLog *m = static_cast<MLog*>(op->get_req());
   dout(7) << "_updated_log for " << m->get_orig_source_inst() << dendl;
-  mon->send_reply(m, new MLogAck(m->fsid, m->entries.rbegin()->seq));
-
-  m->put();
+  mon->send_reply(op, new MLogAck(m->fsid, m->entries.rbegin()->seq));
 }
 
 bool LogMonitor::should_propose(double& delay)
@@ -372,8 +378,9 @@ bool LogMonitor::should_propose(double& delay)
 }
 
 
-bool LogMonitor::preprocess_command(MMonCommand *m)
+bool LogMonitor::preprocess_command(MonOpRequestRef op)
 {
+  op->mark_logmon_event("preprocess_command");
   int r = -1;
   bufferlist rdata;
   stringstream ss;
@@ -381,15 +388,17 @@ bool LogMonitor::preprocess_command(MMonCommand *m)
   if (r != -1) {
     string rs;
     getline(ss, rs);
-    mon->reply_command(m, r, rs, rdata, get_last_committed());
+    mon->reply_command(op, r, rs, rdata, get_last_committed());
     return true;
   } else
     return false;
 }
 
 
-bool LogMonitor::prepare_command(MMonCommand *m)
+bool LogMonitor::prepare_command(MonOpRequestRef op)
 {
+  op->mark_logmon_event("prepare_command");
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   stringstream ss;
   string rs;
   int err = -EINVAL;
@@ -398,7 +407,7 @@ bool LogMonitor::prepare_command(MMonCommand *m)
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     // ss has reason for failure
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, get_last_committed());
     return true;
   }
 
@@ -407,7 +416,7 @@ bool LogMonitor::prepare_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
     return true;
   }
 
@@ -422,13 +431,13 @@ bool LogMonitor::prepare_command(MMonCommand *m)
     le.msg = str_join(logtext, " ");
     pending_summary.add(le);
     pending_log.insert(pair<utime_t,LogEntry>(le.stamp, le));
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, string(),
-					      get_last_committed() + 1));
+    wait_for_finished_proposal(op, new Monitor::C_Command(
+          mon, op, 0, string(), get_last_committed() + 1));
     return true;
   }
 
   getline(ss, rs);
-  mon->reply_command(m, err, rs, get_last_committed());
+  mon->reply_command(op, err, rs, get_last_committed());
   return false;
 }
 
@@ -535,10 +544,10 @@ bool LogMonitor::_create_sub_summary(MLog *mlog, int level)
 }
 
 /**
- * Create an incremental log message from version @sv to @summary.version
+ * Create an incremental log message from version \p sv to \p summary.version
  *
  * @param mlog	Log message we'll send to the client with the messages received
- *		since version @sv, inclusive.
+ *		since version \p sv, inclusive.
  * @param level	The max log level of the messages the client is interested in.
  * @param sv	The version the client is looking for.
  */
@@ -667,6 +676,33 @@ string LogMonitor::log_channel_info::expand_channel_meta(
   return s;
 }
 
+bool LogMonitor::log_channel_info::do_log_to_syslog(const string &channel) {
+  string v = get_str_map_key(log_to_syslog, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+  // We expect booleans, but they are in k/v pairs, kept
+  // as strings, in 'log_to_syslog'. We must ensure
+  // compatibility with existing boolean handling, and so
+  // we are here using a modified version of how
+  // md_config_t::set_val_raw() handles booleans. We will
+  // accept both 'true' and 'false', but will also check for
+  // '1' and '0'. The main distiction between this and the
+  // original code is that we will assume everything not '1',
+  // '0', 'true' or 'false' to be 'false'.
+  bool ret = false;
+
+  if (boost::iequals(v, "false")) {
+    ret = false;
+  } else if (boost::iequals(v, "true")) {
+    ret = true;
+  } else {
+    std::string err;
+    int b = strict_strtol(v.c_str(), 10, &err);
+    ret = (err.empty() && b == 1);
+  }
+
+  return ret;
+}
+
 void LogMonitor::handle_conf_change(const struct md_config_t *conf,
                                     const std::set<std::string> &changed)
 {
diff --git a/src/mon/LogMonitor.h b/src/mon/LogMonitor.h
index 4cbeb6e..4d31b66 100644
--- a/src/mon/LogMonitor.h
+++ b/src/mon/LogMonitor.h
@@ -69,10 +69,7 @@ private:
     string expand_channel_meta(const string &input,
                                const string &change_to);
 
-    bool do_log_to_syslog(const string &channel) {
-      return (get_str_map_key(log_to_syslog, channel,
-                              &CLOG_CONFIG_DEFAULT_KEY) == "true");
-    }
+    bool do_log_to_syslog(const string &channel);
 
     string get_facility(const string &channel) {
       return get_str_map_key(syslog_facility, channel,
@@ -116,12 +113,12 @@ private:
   void encode_pending(MonitorDBStore::TransactionRef t);
   virtual void encode_full(MonitorDBStore::TransactionRef t);
   version_t get_trim_to();
-  bool preprocess_query(PaxosServiceMessage *m);  // true if processed.
-  bool prepare_update(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op);  // true if processed.
+  bool prepare_update(MonOpRequestRef op);
 
-  bool preprocess_log(MLog *m);
-  bool prepare_log(MLog *m);
-  void _updated_log(MLog *m);
+  bool preprocess_log(MonOpRequestRef op);
+  bool prepare_log(MonOpRequestRef op);
+  void _updated_log(MonOpRequestRef op);
 
   bool should_propose(double& delay);
 
@@ -130,22 +127,20 @@ private:
     return true;
   }
 
-  struct C_Log : public Context {
+  struct C_Log : public C_MonOp {
     LogMonitor *logmon;
-    MLog *ack;
-    C_Log(LogMonitor *p, MLog *a) : logmon(p), ack(a) {}
-    void finish(int r) {
+    C_Log(LogMonitor *p, MonOpRequestRef o) : 
+      C_MonOp(o), logmon(p) {}
+    void _finish(int r) {
       if (r == -ECANCELED) {
-	if (ack)
-	  ack->put();
 	return;
       }
-      logmon->_updated_log(ack);
+      logmon->_updated_log(op);
     }    
   };
 
-  bool preprocess_command(MMonCommand *m);
-  bool prepare_command(MMonCommand *m);
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
 
   bool _create_sub_summary(MLog *mlog, int level);
   void _create_sub_incremental(MLog *mlog, int level, version_t sv);
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 4cf26cd..2530628 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -70,7 +70,7 @@ template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
   return cmd_getval(cct, cmdmap, k, (int64_t&)val);
 }
 
-
+static const string MDS_METADATA_PREFIX("mds_metadata");
 
 
 // my methods
@@ -138,6 +138,11 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap)
   update_logger();
 }
 
+void MDSMonitor::init()
+{
+  (void)load_metadata(pending_metadata);
+}
+
 void MDSMonitor::create_pending()
 {
   pending_mdsmap = mdsmap;
@@ -170,11 +175,13 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     i->second.encode(bl);
     t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
   }
+
   for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
       i != pending_daemon_health_rm.end(); ++i) {
     t->erase(MDS_HEALTH_PREFIX, stringify(*i));
   }
   pending_daemon_health_rm.clear();
+  remove_from_metadata(t);
 }
 
 version_t MDSMonitor::get_trim_to()
@@ -205,24 +212,25 @@ void MDSMonitor::update_logger()
   mon->cluster_logger->set(l_cluster_mds_epoch, mdsmap.get_epoch());
 }
 
-bool MDSMonitor::preprocess_query(PaxosServiceMessage *m)
+bool MDSMonitor::preprocess_query(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
 
   switch (m->get_type()) {
     
   case MSG_MDS_BEACON:
-    return preprocess_beacon(static_cast<MMDSBeacon*>(m));
+    return preprocess_beacon(op);
     
   case MSG_MON_COMMAND:
-    return preprocess_command(static_cast<MMonCommand*>(m));
+    return preprocess_command(op);
 
   case MSG_MDS_OFFLOAD_TARGETS:
-    return preprocess_offload_targets(static_cast<MMDSLoadTargets*>(m));
+    return preprocess_offload_targets(op);
 
   default:
     assert(0);
-    m->put();
     return true;
   }
 }
@@ -237,8 +245,10 @@ void MDSMonitor::_note_beacon(MMDSBeacon *m)
   last_beacon[gid].seq = seq;
 }
 
-bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
+bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
   MDSMap::DaemonState state = m->get_state();
   mds_gid_t gid = m->get_global_id();
   version_t seq = m->get_seq();
@@ -287,9 +297,9 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
   // booted, but not in map?
   if (pending_mdsmap.is_dne_gid(gid)) {
     if (state != MDSMap::STATE_BOOT) {
-      dout(7) << "mds_beacon " << *m << " is not in mdsmap" << dendl;
-      mon->send_reply(m, new MMDSMap(mon->monmap->fsid, &mdsmap));
-      m->put();
+      dout(7) << "mds_beacon " << *m << " is not in mdsmap (state "
+              << ceph_mds_state_name(state) << ")" << dendl;
+      mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &mdsmap));
       return true;
     } else {
       return false;  // not booted yet.
@@ -327,6 +337,15 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
 	       << " -> " << ceph_mds_state_name(state) << ")" << dendl;
       goto reply;
     }
+
+    if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
+        && info.rank != MDS_RANK_NONE)
+    {
+      dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
+                 "held rank " << info.rank << " while requesting state "
+              << ceph_mds_state_name(state) << dendl;
+      goto reply;
+    }
     
     if (info.state == MDSMap::STATE_STANDBY &&
 	(state == MDSMap::STATE_STANDBY_REPLAY ||
@@ -354,21 +373,21 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
  reply:
   // note time and reply
   _note_beacon(m);
-  mon->send_reply(m,
+  mon->send_reply(op,
 		  new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
 				 mdsmap.get_epoch(), state, seq));
-  m->put();
   return true;
 
  ignore:
   // I won't reply this beacon, drop it.
-  mon->no_reply(m);
-  m->put();
+  mon->no_reply(op);
   return true;
 }
 
-bool MDSMonitor::preprocess_offload_targets(MMDSLoadTargets* m)
+bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
   dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
   mds_gid_t gid;
   
@@ -390,29 +409,29 @@ bool MDSMonitor::preprocess_offload_targets(MMDSLoadTargets* m)
   return false;
 
  done:
-  m->put();
   return true;
 }
 
 
-bool MDSMonitor::prepare_update(PaxosServiceMessage *m)
+bool MDSMonitor::prepare_update(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(7) << "prepare_update " << *m << dendl;
 
   switch (m->get_type()) {
     
   case MSG_MDS_BEACON:
-    return prepare_beacon(static_cast<MMDSBeacon*>(m));
+    return prepare_beacon(op);
 
   case MSG_MON_COMMAND:
-    return prepare_command(static_cast<MMonCommand*>(m));
+    return prepare_command(op);
 
   case MSG_MDS_OFFLOAD_TARGETS:
-    return prepare_offload_targets(static_cast<MMDSLoadTargets*>(m));
+    return prepare_offload_targets(op);
   
   default:
     assert(0);
-    m->put();
   }
 
   return true;
@@ -420,8 +439,10 @@ bool MDSMonitor::prepare_update(PaxosServiceMessage *m)
 
 
 
-bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
+bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
   // -- this is an update --
   dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
   entity_addr_t addr = m->get_orig_source_inst().addr;
@@ -446,7 +467,7 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
       bool failed_mds = false;
       while (mds_gid_t existing = pending_mdsmap.find_mds_gid_by_name(m->get_name())) {
         if (!mon->osdmon()->is_writeable()) {
-          mon->osdmon()->wait_for_writeable(new C_RetryMessage(this, m));
+          mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
           return false;
         }
 	fail_mds_gid(existing);
@@ -492,6 +513,7 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
       pending_mdsmap.compat = m->get_compat();
     }
 
+    update_metadata(m->get_global_id(), m->get_sys_info());
   } else {
     // state change
     MDSMap::mds_info_t& info = pending_mdsmap.get_info_gid(gid);
@@ -536,7 +558,6 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
           info.state = MDSMap::STATE_STANDBY_REPLAY;
           info.state_seq = seq;
         } else {
-          m->put();
           return false;
         }
       } else if (m->get_standby_for_rank() >= 0 &&
@@ -547,9 +568,55 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
         info.standby_for_rank = m->get_standby_for_rank();
       } else { //it's a standby for anybody, and is already in the list
         assert(pending_mdsmap.get_mds_info().count(info.global_id));
-        m->put();
         return false;
       }
+    } else if (state == MDSMap::STATE_DAMAGED) {
+      if (!mon->osdmon()->is_writeable()) {
+        dout(4) << __func__ << ": DAMAGED from rank " << info.rank
+                << " waiting for osdmon writeable to blacklist it" << dendl;
+        mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+        return false;
+      }
+
+      // Record this MDS rank as damaged, so that other daemons
+      // won't try to run it.
+      dout(4) << __func__ << ": marking rank "
+              << info.rank << " damaged" << dendl;
+
+
+      // Blacklist this MDS daemon
+      const utime_t until = ceph_clock_now(g_ceph_context);
+      pending_mdsmap.last_failure_osd_epoch = mon->osdmon()->blacklist(
+          info.addr, until);
+      request_proposal(mon->osdmon());
+
+      // Clear out daemon state and add rank to damaged list
+      pending_mdsmap.up.erase(info.rank);
+      pending_mdsmap.damaged.insert(info.rank);
+      last_beacon.erase(gid);
+
+      // Call erase() last because the `info` reference becomes invalid
+      // after we remove the instance from the map.
+      pending_mdsmap.mds_info.erase(gid);
+
+      // Respond to MDS, so that it knows it can continue to shut down
+      mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid, m->get_global_id(),
+                    m->get_name(), mdsmap.get_epoch(), state, seq));
+    } else if (state == MDSMap::STATE_DNE) {
+      if (!mon->osdmon()->is_writeable()) {
+        dout(4) << __func__ << ": DNE from rank " << info.rank
+                << " waiting for osdmon writeable to blacklist it" << dendl;
+        mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+        return false;
+      }
+
+      fail_mds_gid(gid);
+      assert(mon->osdmon()->is_writeable());
+      request_proposal(mon->osdmon());
+
+      // Respond to MDS, so that it knows it can continue to shut down
+      mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid, m->get_global_id(),
+                    m->get_name(), mdsmap.get_epoch(), state, seq));
     } else {
       info.state = state;
       info.state_seq = seq;
@@ -559,13 +626,15 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
   dout(7) << "prepare_beacon pending map now:" << dendl;
   print_map(pending_mdsmap);
   
-  wait_for_finished_proposal(new C_Updated(this, m));
+  wait_for_finished_proposal(op, new C_Updated(this, op));
 
   return true;
 }
 
-bool MDSMonitor::prepare_offload_targets(MMDSLoadTargets *m)
+bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
   mds_gid_t gid = m->global_id;
   if (pending_mdsmap.mds_info.count(gid)) {
     dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
@@ -573,7 +642,6 @@ bool MDSMonitor::prepare_offload_targets(MMDSLoadTargets *m)
   } else {
     dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
   }
-  m->put();
   return true;
 }
 
@@ -583,24 +651,25 @@ bool MDSMonitor::should_propose(double& delay)
   return PaxosService::should_propose(delay);
 }
 
-void MDSMonitor::_updated(MMDSBeacon *m)
+void MDSMonitor::_updated(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
   dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
   mon->clog->info() << m->get_orig_source_inst() << " "
 	  << ceph_mds_state_name(m->get_state()) << "\n";
 
   if (m->get_state() == MDSMap::STATE_STOPPED) {
     // send the map manually (they're out of the map, so they won't get it automatic)
-    mon->send_reply(m, new MMDSMap(mon->monmap->fsid, &mdsmap));
+    mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &mdsmap));
   } else {
-    mon->send_reply(m, new MMDSBeacon(mon->monmap->fsid,
+    mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
 				      m->get_global_id(),
 				      m->get_name(),
 				      mdsmap.get_epoch(),
 				      m->get_state(),
 				      m->get_seq()));
   }
-  m->put();
 }
 
 void MDSMonitor::on_active()
@@ -670,8 +739,10 @@ void MDSMonitor::dump_info(Formatter *f)
   f->dump_unsigned("mdsmap_last_committed", get_last_committed());
 }
 
-bool MDSMonitor::preprocess_command(MMonCommand *m)
+bool MDSMonitor::preprocess_command(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = -1;
   bufferlist rdata;
   stringstream ss, ds;
@@ -680,7 +751,7 @@ bool MDSMonitor::preprocess_command(MMonCommand *m)
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     // ss has reason for failure
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, rdata, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
     return true;
   }
 
@@ -692,7 +763,7 @@ bool MDSMonitor::preprocess_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", rdata, get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
     return true;
   }
 
@@ -744,6 +815,15 @@ bool MDSMonitor::preprocess_command(MMonCommand *m)
       if (p != &mdsmap)
 	delete p;
     }
+  } else if (prefix == "mds metadata") {
+    string who;
+    cmd_getval(g_ceph_context, cmdmap, "who", who);
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    f->open_object_section("mds_metadata");
+    r = dump_metadata(who, f.get(), ss);
+    f->close_section();
+    f->flush(ds);
   } else if (prefix == "mds getmap") {
     epoch_t e;
     int64_t epocharg;
@@ -754,18 +834,19 @@ bool MDSMonitor::preprocess_command(MMonCommand *m)
       if (err == -ENOENT) {
 	r = -ENOENT;
       } else {
-	assert(r == 0);
+	assert(err == 0);
 	assert(b.length());
 	MDSMap mm;
 	mm.decode(b);
 	mm.encode(rdata, m->get_connection()->get_features());
 	ss << "got mdsmap epoch " << mm.get_epoch();
+	r = 0;
       }
     } else {
       mdsmap.encode(rdata, m->get_connection()->get_features());
       ss << "got mdsmap epoch " << mdsmap.get_epoch();
+      r = 0;
     }
-    r = 0;
   } else if (prefix == "mds tell") {
     string whostr;
     cmd_getval(g_ceph_context, cmdmap, "who", whostr);
@@ -873,7 +954,7 @@ bool MDSMonitor::preprocess_command(MMonCommand *m)
     rdata.append(ds);
     string rs;
     getline(ss, rs);
-    mon->reply_command(m, r, rs, rdata, get_last_committed());
+    mon->reply_command(op, r, rs, rdata, get_last_committed());
     return true;
   } else
     return false;
@@ -905,9 +986,11 @@ void MDSMonitor::fail_mds_gid(mds_gid_t gid)
   }
 
   pending_mdsmap.mds_info.erase(gid);
+
+  last_beacon.erase(gid);
 }
 
-int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
+mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
 {
   std::string err;
   unsigned long long rank_or_gid = strict_strtoll(arg.c_str(), 10, &err);
@@ -916,8 +999,8 @@ int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
     const MDSMap::mds_info_t *mds_info = mdsmap.find_by_name(arg);
     if (!mds_info) {
       ss << "MDS named '" << arg
-         << "' does not exist, or is not up";
-      return 0;
+	 << "' does not exist, or is not up";
+      return MDS_GID_NONE;
     }
     if (mds_info->rank >= 0) {
       dout(10) << __func__ << ": resolved MDS name '" << arg << "' to rank " << rank_or_gid << dendl;
@@ -927,40 +1010,60 @@ int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
       rank_or_gid = mds_info->global_id;
     }
   } else {
-    dout(10) << __func__ << ": treating MDS reference '" << arg << "' as an integer " << rank_or_gid << dendl;
+    dout(10) << __func__ << ": treating MDS reference '" << arg
+	     << "' as an integer " << rank_or_gid << dendl;
   }
 
-  if (!mon->osdmon()->is_writeable()) {
-    return -EAGAIN;
- }
-
-  bool failed_mds_gid = false;
-  if (pending_mdsmap.up.count(mds_rank_t(rank_or_gid))) {
-    dout(10) << __func__ << ": validated rank/GID " << rank_or_gid << " as a rank" << dendl;
-    mds_gid_t gid = pending_mdsmap.up[mds_rank_t(rank_or_gid)];
-    if (pending_mdsmap.mds_info.count(gid)) {
-      fail_mds_gid(gid);
-      failed_mds_gid = true;
-    }
-    ss << "failed mds." << rank_or_gid;
-  } else if (pending_mdsmap.mds_info.count(mds_gid_t(rank_or_gid))) {
-    dout(10) << __func__ << ": validated rank/GID " << rank_or_gid << " as a GID" << dendl;
-    fail_mds_gid(mds_gid_t(rank_or_gid));
-    failed_mds_gid = true;
-    ss << "failed mds gid " << rank_or_gid;
+  if (mon->is_leader()) {
+    if (pending_mdsmap.up.count(mds_rank_t(rank_or_gid))) {
+      dout(10) << __func__ << ": validated rank/GID " << rank_or_gid
+	       << " as a rank" << dendl;
+      mds_gid_t gid = pending_mdsmap.up[mds_rank_t(rank_or_gid)];
+      if (pending_mdsmap.mds_info.count(gid)) {
+	return gid;
+      } else {
+	dout(10) << __func__ << ": GID " << rank_or_gid << " was removed." << dendl;
+	return MDS_GID_NONE;
+      }
+    } else if (pending_mdsmap.mds_info.count(mds_gid_t(rank_or_gid))) {
+      dout(10) << __func__ << ": validated rank/GID " << rank_or_gid
+	       << " as a GID" << dendl;
+      return mds_gid_t(rank_or_gid);
+    }
   } else {
-    dout(1) << __func__ << ": rank/GID " << rank_or_gid << " not a existent rank or GID" << dendl;
+    // mon is a peon
+    if (mdsmap.have_inst(mds_rank_t(rank_or_gid))) {
+      return mdsmap.get_info(mds_rank_t(rank_or_gid)).global_id;
+    } else if (mdsmap.get_state_gid(mds_gid_t(rank_or_gid))) {
+      return mds_gid_t(rank_or_gid);
+    }
   }
 
-  if (failed_mds_gid) {
-    assert(mon->osdmon()->is_writeable());
-    request_proposal(mon->osdmon());
+  dout(1) << __func__ << ": rank/GID " << rank_or_gid
+	  << " not a existent rank or GID" << dendl;
+  return MDS_GID_NONE;
+}
+
+int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
+{
+  mds_gid_t gid = gid_from_arg(arg, ss);
+  if (gid == MDS_GID_NONE) {
+    return 0;
+  }
+  if (!mon->osdmon()->is_writeable()) {
+    return -EAGAIN;
   }
+  fail_mds_gid(gid);
+  ss << "failed mds gid " << gid;
+  assert(mon->osdmon()->is_writeable());
+  request_proposal(mon->osdmon());
   return 0;
 }
 
-bool MDSMonitor::prepare_command(MMonCommand *m)
+bool MDSMonitor::prepare_command(MonOpRequestRef op)
 {
+  op->mark_mdsmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = -EINVAL;
   stringstream ss;
   bufferlist rdata;
@@ -968,7 +1071,7 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
   map<string, cmd_vartype> cmdmap;
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, rdata, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
     return true;
   }
 
@@ -978,12 +1081,12 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
   /* Refuse access if message not associated with a valid session */
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", rdata, get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
     return true;
   }
 
   /* Execute filesystem add/remove, or pass through to filesystem_command */
-  r = management_command(m, prefix, cmdmap, ss);
+  r = management_command(op, prefix, cmdmap, ss);
   if (r >= 0)
     goto out;
   
@@ -1003,7 +1106,7 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
     ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
     r = -ENOENT;
   } else {
-    r = filesystem_command(m, prefix, cmdmap, ss);
+    r = filesystem_command(op, prefix, cmdmap, ss);
     if (r < 0 && r == -EAGAIN) {
       // Do not reply, the message has been enqueued for retry
       return false;
@@ -1017,12 +1120,12 @@ out:
 
   if (r >= 0) {
     // success.. delay reply
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, r, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
 					      get_last_committed() + 1));
     return true;
   } else {
     // reply immediately
-    mon->reply_command(m, r, rs, rdata, get_last_committed());
+    mon->reply_command(op, r, rs, rdata, get_last_committed());
     return false;
   }
 }
@@ -1054,6 +1157,21 @@ int MDSMonitor::_check_pool(
          << " is an erasure-code pool";
       return -EINVAL;
     }
+
+    // That cache tier overlay must be writeback, not readonly (it's the
+    // write operations like modify+truncate we care about support for)
+    const pg_pool_t *write_tier = mon->osdmon()->osdmap.get_pg_pool(
+        pool->write_tier);
+    assert(write_tier != NULL);  // OSDMonitor shouldn't allow DNE tier
+    if (write_tier->cache_mode == pg_pool_t::CACHEMODE_FORWARD
+        || write_tier->cache_mode == pg_pool_t::CACHEMODE_READONLY) {
+      *ss << "EC pool '" << pool_name << "' has a write tier ("
+          << mon->osdmon()->osdmap.get_pool_name(pool->write_tier)
+          << ") that is configured "
+             "to forward writes.  Use a cache mode such as 'writeback' for "
+             "CephFS";
+      return -EINVAL;
+    }
   }
 
   if (pool->is_tier()) {
@@ -1076,11 +1194,12 @@ int MDSMonitor::_check_pool(
  * @retval < 0      An error has occurred; **ss** may have been set.
  */
 int MDSMonitor::management_command(
-    MMonCommand *m,
+    MonOpRequestRef op,
     std::string const &prefix,
     map<string, cmd_vartype> &cmdmap,
     std::stringstream &ss)
 {
+  op->mark_mdsmon_event(__func__);
   if (prefix == "mds newfs") {
     /* Legacy `newfs` command, takes pool numbers instead of
      * names, assumes fs name to be MDS_FS_NAME_DEFAULT, and
@@ -1199,7 +1318,7 @@ int MDSMonitor::management_command(
       // propose.  We thus need to make sure the osdmon is writeable before
       // we do this, waiting if it's not.
       if (!mon->osdmon()->is_writeable()) {
-        mon->osdmon()->wait_for_writeable(new C_RetryMessage(this, m));
+        mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
         return -EAGAIN;
       }
 
@@ -1296,6 +1415,10 @@ int MDSMonitor::management_command(
     newmap.inc = mdsmap.inc;
     newmap.enabled = mdsmap.enabled;
     newmap.inline_data_enabled = mdsmap.inline_data_enabled;
+    newmap.compat = get_mdsmap_compat_set_default();
+    newmap.session_timeout = g_conf->mds_session_timeout;
+    newmap.session_autoclose = g_conf->mds_session_autoclose;
+    newmap.max_file_size = g_conf->mds_max_file_size;
 
     // Persist the new MDSMap
     pending_mdsmap = newmap;
@@ -1317,11 +1440,13 @@ int MDSMonitor::management_command(
  * @retval < 0      An error has occurred; **ss** may have been set.
  */
 int MDSMonitor::filesystem_command(
-    MMonCommand *m,
+    MonOpRequestRef op,
     std::string const &prefix,
     map<string, cmd_vartype> &cmdmap,
     std::stringstream &ss)
 {
+  op->mark_mdsmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = 0;
   string whostr;
   cmd_getval(g_ceph_context, cmdmap, "who", whostr);
@@ -1359,6 +1484,10 @@ int MDSMonitor::filesystem_command(
     if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds < 0) {
       return -EINVAL;
     }
+    if (maxmds > MAX_MDS) {
+      ss << "may not have more than " << MAX_MDS << " MDS ranks";
+      return -EINVAL;
+    }
     pending_mdsmap.max_mds = maxmds;
     r = 0;
     ss << "max_mds = " << pending_mdsmap.max_mds;
@@ -1381,6 +1510,10 @@ int MDSMonitor::filesystem_command(
       if (interr.length()) {
 	return -EINVAL;
       }
+      if (n > MAX_MDS) {
+        ss << "may not have more than " << MAX_MDS << " MDS ranks";
+        return -EINVAL;
+      }
       pending_mdsmap.max_mds = n;
     } else if (var == "inline_data") {
       if (val == "true" || val == "yes" || (!interr.length() && n == 1)) {
@@ -1476,10 +1609,21 @@ int MDSMonitor::filesystem_command(
     cmd_getval(g_ceph_context, cmdmap, "who", who);
     r = fail_mds(ss, who);
     if (r < 0 && r == -EAGAIN) {
-      mon->osdmon()->wait_for_writeable(new C_RetryMessage(this, m));
+      mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
       return -EAGAIN; // don't propose yet; wait for message to be retried
     }
 
+  } else if (prefix == "mds repaired") {
+    mds_rank_t rank;
+    cmd_getval(g_ceph_context, cmdmap, "rank", rank);
+    if (pending_mdsmap.damaged.count(rank)) {
+      dout(4) << "repaired: restoring rank " << rank << dendl;
+      pending_mdsmap.damaged.erase(rank);
+      pending_mdsmap.failed.insert(rank);
+    } else {
+      dout(4) << "repaired: no-op on rank " << rank << dendl;
+    }
+    r = 0;
   } else if (prefix == "mds rm") {
     mds_gid_t gid;
     if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
@@ -1570,22 +1714,14 @@ int MDSMonitor::filesystem_command(
 	return -ENOENT;
       }
     }
-    const pg_pool_t *p = mon->osdmon()->osdmap.get_pg_pool(poolid);
-    if (!p) {
-      ss << "pool '" << poolname << "' does not exist";
-      return -ENOENT;
-    }
-    if (p->is_erasure()) {
-      // I'm sorry Dave, I'm afraid I can't do that
-      poolid = -1;
-      ss << "can't use pool '" << poolname << "' as it's an erasure-code pool";
-      return -EINVAL;
-    }
-    if (poolid >= 0) {
-      pending_mdsmap.add_data_pool(poolid);
-      ss << "added data pool " << poolid << " to mdsmap";
-      r = 0;
+
+    r = _check_pool(poolid, &ss);
+    if (r != 0) {
+      return r;
     }
+
+    pending_mdsmap.add_data_pool(poolid);
+    ss << "added data pool " << poolid << " to mdsmap";
   } else if (prefix == "mds remove_data_pool") {
     string poolname;
     cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
@@ -1646,6 +1782,108 @@ void MDSMonitor::check_sub(Subscription *sub)
   }
 }
 
+void MDSMonitor::update_metadata(mds_gid_t gid,
+				 const map<string, string>& metadata)
+{
+  if (metadata.empty()) {
+    return;
+  }
+  pending_metadata[gid] = metadata;
+
+  MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
+  bufferlist bl;
+  ::encode(pending_metadata, bl);
+  t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
+  paxos->trigger_propose();
+}
+
+void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
+{
+  bool update = false;
+  for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
+       i != pending_metadata.end(); ) {
+    if (pending_mdsmap.get_state_gid(i->first) == MDSMap::STATE_NULL) {
+      pending_metadata.erase(i++);
+      update = true;
+    } else {
+      ++i;
+    }
+  }
+  if (!update)
+    return;
+  bufferlist bl;
+  ::encode(pending_metadata, bl);
+  t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
+}
+
+int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
+{
+  bufferlist bl;
+  int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
+  if (r)
+    return r;
+
+  bufferlist::iterator it = bl.begin();
+  ::decode(m, it);
+  return 0;
+}
+
+int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
+{
+  assert(f);
+
+  mds_gid_t gid = gid_from_arg(who, err);
+  if (gid == MDS_GID_NONE) {
+    return -EINVAL;
+  }
+
+  map<mds_gid_t, Metadata> metadata;
+  if (int r = load_metadata(metadata)) {
+    err << "Unable to load 'last_metadata'";
+    return r;
+  }
+
+  if (!metadata.count(gid)) {
+    return -ENOENT;
+  }
+  const Metadata& m = metadata[gid];
+  for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
+    f->dump_string(p->first.c_str(), p->second);
+  }
+  return 0;
+}
+
+int MDSMonitor::print_nodes(Formatter *f)
+{
+  assert(f);
+
+  map<mds_gid_t, Metadata> metadata;
+  if (int r = load_metadata(metadata)) {
+    return r;
+  }
+
+  map<string, list<int> > mdses; // hostname => rank
+  for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
+       it != metadata.end(); ++it) {
+    const Metadata& m = it->second;
+    Metadata::const_iterator hostname = m.find("hostname");
+    if (hostname == m.end()) {
+      // not likely though
+      continue;
+    }
+    const mds_gid_t gid = it->first;
+    if (mdsmap.get_state_gid(gid) == MDSMap::STATE_NULL) {
+      dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
+      continue;
+    }
+    const MDSMap::mds_info_t& mds_info = mdsmap.get_info_gid(gid);
+    mdses[hostname->second].push_back(mds_info.rank);
+  }
+
+  dump_services(f, mdses, "mds");
+  return 0;
+}
+
 void MDSMonitor::tick()
 {
   // make sure mds's are still alive
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index 5b858ee..03a2276 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -51,18 +51,17 @@ class MDSMonitor : public PaxosService {
 
   class C_Updated : public Context {
     MDSMonitor *mm;
-    MMDSBeacon *m;
+    MonOpRequestRef op;
   public:
-    C_Updated(MDSMonitor *a, MMDSBeacon *c) :
-      mm(a), m(c) {}
+    C_Updated(MDSMonitor *a, MonOpRequestRef c) :
+      mm(a), op(c) {}
     void finish(int r) {
       if (r >= 0)
-	mm->_updated(m);   // success
+	mm->_updated(op);   // success
       else if (r == -ECANCELED) {
-	mm->mon->no_reply(m);
-	m->put();
+	mm->mon->no_reply(op);
       } else {
-	mm->dispatch((PaxosServiceMessage*)m);        // try again
+	mm->dispatch(op);        // try again
       }
     }
   };
@@ -74,6 +73,7 @@ class MDSMonitor : public PaxosService {
   // service methods
   void create_initial();
   void update_from_paxos(bool *need_bootstrap);
+  void init();
   void create_pending(); 
   void encode_pending(MonitorDBStore::TransactionRef t);
   // we don't require full versions; don't encode any.
@@ -81,35 +81,35 @@ class MDSMonitor : public PaxosService {
 
   void update_logger();
 
-  void _updated(MMDSBeacon *m);
+  void _updated(MonOpRequestRef op);
  
-  bool preprocess_query(PaxosServiceMessage *m);  // true if processed.
-  bool prepare_update(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op);  // true if processed.
+  bool prepare_update(MonOpRequestRef op);
   bool should_propose(double& delay);
 
   void on_active();
 
   void _note_beacon(class MMDSBeacon *m);
-  bool preprocess_beacon(class MMDSBeacon *m);
-  bool prepare_beacon(class MMDSBeacon *m);
+  bool preprocess_beacon(MonOpRequestRef op);
+  bool prepare_beacon(MonOpRequestRef op);
 
-  bool preprocess_offload_targets(MMDSLoadTargets *m);
-  bool prepare_offload_targets(MMDSLoadTargets *m);
+  bool preprocess_offload_targets(MonOpRequestRef op);
+  bool prepare_offload_targets(MonOpRequestRef op);
 
   void get_health(list<pair<health_status_t,string> >& summary,
 		  list<pair<health_status_t,string> > *detail) const;
   int fail_mds(std::ostream &ss, const std::string &arg);
   void fail_mds_gid(mds_gid_t gid);
 
-  bool preprocess_command(MMonCommand *m);
-  bool prepare_command(MMonCommand *m);
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
   int management_command(
-      MMonCommand *m,
+      MonOpRequestRef op,
       std::string const &prefix,
       map<string, cmd_vartype> &cmdmap,
       std::stringstream &ss);
   int filesystem_command(
-      MMonCommand *m,
+      MonOpRequestRef op,
       std::string const &prefix,
       map<string, cmd_vartype> &cmdmap,
       std::stringstream &ss);
@@ -132,16 +132,25 @@ public:
   void tick();     // check state, take actions
 
   void dump_info(Formatter *f);
+  int dump_metadata(const string& who, Formatter *f, ostream& err);
+  int print_nodes(Formatter *f);
 
   void check_subs();
   void check_sub(Subscription *sub);
 
 private:
+  void update_metadata(mds_gid_t gid, const Metadata& metadata);
+  void remove_from_metadata(MonitorDBStore::TransactionRef t);
+  int load_metadata(map<mds_gid_t, Metadata>& m);
+
   // MDS daemon GID to latest health state from that GID
   std::map<uint64_t, MDSHealth> pending_daemon_health;
   std::set<uint64_t> pending_daemon_health_rm;
 
+  map<mds_gid_t, Metadata> pending_metadata;
+
   int _check_pool(const int64_t pool_id, std::stringstream *ss) const;
+  mds_gid_t gid_from_arg(const std::string& arg, std::ostream& err);
 };
 
 #endif
diff --git a/src/mon/Makefile.am b/src/mon/Makefile.am
index 8c96944..ee6542d 100644
--- a/src/mon/Makefile.am
+++ b/src/mon/Makefile.am
@@ -16,7 +16,6 @@ libmon_la_SOURCES = \
 	mon/LogMonitor.cc \
 	mon/AuthMonitor.cc \
 	mon/Elector.cc \
-	mon/MonitorStore.cc \
 	mon/HealthMonitor.cc \
 	mon/DataHealthService.cc \
 	mon/ConfigKeyService.cc
@@ -39,8 +38,8 @@ noinst_HEADERS += \
 	mon/DumplingMonCommands.h \
 	mon/MonMap.h \
 	mon/Monitor.h \
-	mon/MonitorStore.h \
 	mon/MonitorDBStore.h \
+	mon/MonOpRequest.h \
 	mon/OSDMonitor.h \
 	mon/PGMap.h \
 	mon/PGMonitor.h \
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index 04de7e2..6e9843b 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -116,9 +116,7 @@ int MonClient::get_monmap_privately()
   bool temp_msgr = false;
   Messenger* smessenger = NULL;
   if (!messenger) {
-    messenger = smessenger = Messenger::create(cct, cct->_conf->ms_type,
-					       entity_name_t::CLIENT(-1),
-					       "temp_mon_client", getpid());
+    messenger = smessenger = Messenger::create_client_messenger(cct, "temp_mon_client");
     messenger->add_dispatcher_head(this);
     smessenger->start();
     temp_msgr = true;
@@ -207,25 +205,30 @@ int MonClient::ping_monitor(const string &mon_id, string *result_reply)
 {
   ldout(cct, 10) << __func__ << dendl;
 
-  if (mon_id.empty()) {
+  string new_mon_id;
+  if (monmap.contains("noname-"+mon_id)) {
+    new_mon_id = "noname-"+mon_id;
+  } else {
+    new_mon_id = mon_id;
+  }
+
+  if (new_mon_id.empty()) {
     ldout(cct, 10) << __func__ << " specified mon id is empty!" << dendl;
     return -EINVAL;
-  } else if (!monmap.contains(mon_id)) {
-    ldout(cct, 10) << __func__ << " no such monitor 'mon." << mon_id << "'"
+  } else if (!monmap.contains(new_mon_id)) {
+    ldout(cct, 10) << __func__ << " no such monitor 'mon." << new_mon_id << "'"
                    << dendl;
     return -ENOENT;
   }
 
   MonClientPinger *pinger = new MonClientPinger(cct, result_reply);
 
-  Messenger *smsgr = Messenger::create(cct, cct->_conf->ms_type,
-				       entity_name_t::CLIENT(-1),
-				       "temp_ping_client", getpid());
+  Messenger *smsgr = Messenger::create_client_messenger(cct, "temp_ping_client");
   smsgr->add_dispatcher_head(pinger);
   smsgr->start();
 
-  ConnectionRef con = smsgr->get_connection(monmap.get_inst(mon_id));
-  ldout(cct, 10) << __func__ << " ping mon." << mon_id
+  ConnectionRef con = smsgr->get_connection(monmap.get_inst(new_mon_id));
+  ldout(cct, 10) << __func__ << " ping mon." << new_mon_id
                  << " " << con->get_peer_addr() << dendl;
   con->send_message(new MPing);
 
@@ -314,6 +317,12 @@ void MonClient::send_log()
   }
 }
 
+void MonClient::flush_log()
+{
+  Mutex::Locker l(monc_lock);
+  send_log();
+}
+
 void MonClient::handle_monmap(MMonMap *m)
 {
   ldout(cct, 10) << "handle_monmap " << *m << dendl;
diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h
index 239d91b..a9761d1 100644
--- a/src/mon/MonClient.h
+++ b/src/mon/MonClient.h
@@ -200,6 +200,13 @@ public:
 
   int authenticate(double timeout=0.0);
 
+  /**
+   * Try to flush as many log messages as we can in a single
+   * message.  Use this before shutting down to transmit your
+   * last message.
+   */
+  void flush_log();
+
   // mon subscriptions
 private:
   map<string,ceph_mon_subscribe_item> sub_have;  // my subs, and current versions
@@ -288,7 +295,7 @@ public:
    * reply in @p result_reply.
    *
    * @param[in]  mon_id Target monitor's ID
-   * @param[out] Resulting reply from mon.ID, if param != NULL
+   * @param[out] result_reply reply from mon.ID, if param != NULL
    * @returns    0 in case of success; < 0 in case of error,
    *             -ETIMEDOUT if monitor didn't reply before timeout
    *             expired (default: conf->client_mount_timeout).
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index a75b067..0286b83 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -104,9 +104,17 @@
  * type, so the monitor is expected to know the type of each argument.
  * See cmdparse.cc/h for more details.
  *
- * The flag parameter for COMMAND_WITH_FLAGS macro may be:
+ * The flag parameter for COMMAND_WITH_FLAGS macro must be passed using
+ * FLAG(f), where 'f' may be one of the following:
  *
+ *  NONE      - no flag assigned
  *  NOFORWARD - command may not be forwarded
+ *  OBSOLETE  - command is considered obsolete
+ *  DEPRECATED - command is considered deprecated
+ *
+ * A command should always be first considered DEPRECATED before being
+ * considered OBSOLETE, giving due consideration to users and conforming
+ * to any guidelines regarding deprecating commands.
  */
 
 /*
@@ -215,16 +223,19 @@ COMMAND("auth del " \
 /*
  * Monitor commands (Monitor.cc)
  */
-COMMAND_WITH_FLAG("compact", "cause compaction of monitor's leveldb storage", \
-	     "mon", "rw", "cli,rest", NOFORWARD)
-COMMAND("scrub", "scrub the monitor stores", "mon", "rw", "cli,rest")
+COMMAND_WITH_FLAG("compact", "cause compaction of monitor's leveldb storage (DEPRECATED)", \
+	     "mon", "rw", "cli,rest", \
+             FLAG(NOFORWARD)|FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("scrub", "scrub the monitor stores (DEPRECATED)", \
+             "mon", "rw", "cli,rest", \
+             FLAG(DEPRECATED))
 COMMAND("fsid", "show cluster FSID/UUID", "mon", "r", "cli,rest")
 COMMAND("log name=logtext,type=CephString,n=N", \
 	"log supplied text to the monitor log", "mon", "rw", "cli,rest")
 COMMAND_WITH_FLAG("injectargs " \
 	     "name=injected_args,type=CephString,n=N",			\
 	     "inject config arguments into monitor", "mon", "rw", "cli,rest",
-	     NOFORWARD)
+	     FLAG(NOFORWARD))
 COMMAND("status", "show cluster status", "mon", "r", "cli,rest")
 COMMAND("health name=detail,type=CephChoices,strings=detail,req=false", \
 	"show cluster health", "mon", "r", "cli,rest")
@@ -235,16 +246,19 @@ COMMAND("report name=tags,type=CephString,n=N,req=false", \
 	"mon", "r", "cli,rest")
 COMMAND("quorum_status", "report status of monitor quorum", \
 	"mon", "r", "cli,rest")
+
 COMMAND_WITH_FLAG("mon_status", "report status of monitors", "mon", "r", "cli,rest",
-	     NOFORWARD)
-COMMAND("sync force " \
+	     FLAG(NOFORWARD))
+COMMAND_WITH_FLAG("sync force " \
 	"name=validate1,type=CephChoices,strings=--yes-i-really-mean-it,req=false " \
 	"name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing,req=false", \
-	"force sync of and clear monitor store", "mon", "rw", "cli,rest")
+	"force sync of and clear monitor store (DEPRECATED)", \
+        "mon", "rw", "cli,rest", \
+        FLAG(NOFORWARD)|FLAG(DEPRECATED))
 COMMAND_WITH_FLAG("heap " \
 	     "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
 	     "show heap usage info (available only if compiled with tcmalloc)", \
-	     "mon", "rw", "cli,rest", NOFORWARD)
+	     "mon", "rw", "cli,rest", FLAG(NOFORWARD))
 COMMAND("quorum name=quorumcmd,type=CephChoices,strings=enter|exit,n=1", \
 	"enter or exit quorum", "mon", "rw", "cli,rest")
 COMMAND("tell " \
@@ -252,7 +266,32 @@ COMMAND("tell " \
 	"name=args,type=CephString,n=N", \
 	"send a command to a specific daemon", "mon", "rw", "cli,rest")
 COMMAND_WITH_FLAG("version", "show mon daemon version", "mon", "r", "cli,rest",
-	     NOFORWARD)
+                  FLAG(NOFORWARD))
+
+COMMAND("node ls " \
+	"name=type,type=CephChoices,strings=all|osd|mon|mds,req=false",
+	"list all nodes in cluster [type]", "mon", "r", "cli,rest")
+/*
+ * Monitor-specific commands under module 'mon'
+ */
+COMMAND_WITH_FLAG("mon compact", \
+    "cause compaction of monitor's leveldb storage", \
+    "mon", "rw", "cli,rest", \
+    FLAG(NOFORWARD))
+COMMAND_WITH_FLAG("mon scrub",
+    "scrub the monitor stores", \
+    "mon", "rw", "cli,rest", \
+    FLAG(NONE))
+COMMAND_WITH_FLAG("mon sync force " \
+    "name=validate1,type=CephChoices,strings=--yes-i-really-mean-it,req=false " \
+    "name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing,req=false", \
+    "force sync of and clear monitor store", \
+    "mon", "rw", "cli,rest", \
+    FLAG(NOFORWARD))
+COMMAND("mon metadata name=id,type=CephString",
+	"fetch metadata for mon <id>",
+	"mon", "r", "cli,rest")
+
 
 /*
  * MDS commands (MDSMonitor.cc)
@@ -265,6 +304,9 @@ COMMAND("mds dump "
 COMMAND("mds getmap " \
 	"name=epoch,type=CephInt,req=false,range=0", \
 	"get MDS map, optionally from epoch", "mds", "r", "cli,rest")
+COMMAND("mds metadata name=who,type=CephString",
+	"fetch metadata for mds <who>",
+	"mds", "r", "cli,rest")
 COMMAND("mds tell " \
 	"name=who,type=CephString " \
 	"name=args,type=CephString,n=N", \
@@ -295,9 +337,10 @@ COMMAND("mds set_state " \
 	"set mds state of <gid> to <numeric-state>", "mds", "rw", "cli,rest")
 COMMAND("mds fail name=who,type=CephString", \
 	"force mds to status failed", "mds", "rw", "cli,rest")
+COMMAND("mds repaired name=rank,type=CephInt", \
+	"mark a damaged MDS rank as no longer damaged", "mds", "rw", "cli,rest")
 COMMAND("mds rm " \
-	"name=gid,type=CephInt,range=0 " \
-	"name=who,type=CephName", \
+	"name=gid,type=CephInt,range=0", \
 	"remove nonactive mds", "mds", "rw", "cli,rest")
 COMMAND("mds rmfailed name=who,type=CephInt,range=0", "remove failed mds", \
 	"mds", "rw", "cli,rest")
@@ -359,7 +402,6 @@ COMMAND("mon remove " \
 	"name=name,type=CephString", \
 	"remove monitor named <name>", "mon", "rw", "cli,rest")
 
-
 /*
  * OSD commands
  */
@@ -393,13 +435,14 @@ COMMAND("osd find " \
 	"find osd <id> in the CRUSH map and show its location", \
 	"osd", "r", "cli,rest")
 COMMAND("osd metadata " \
-	"name=id,type=CephInt,range=0", \
-	"fetch metadata for osd <id>", \
+	"name=id,type=CephInt,range=0,req=false", \
+	"fetch metadata for osd {id} (default all)", \
 	"osd", "r", "cli,rest")
 COMMAND("osd map " \
 	"name=pool,type=CephPoolname " \
-	"name=object,type=CephObjectname", \
-	"find pg for <object> in <pool>", "osd", "r", "cli,rest")
+	"name=object,type=CephObjectname " \
+	"name=nspace,type=CephString,req=false", \
+	"find pg for <object> in <pool> with [namespace]", "osd", "r", "cli,rest")
 COMMAND("osd scrub " \
 	"name=who,type=CephString", \
 	"initiate scrub on osd <who>", "osd", "rw", "cli,rest")
@@ -546,10 +589,10 @@ COMMAND("osd erasure-code-profile ls", \
 	"list all erasure code profiles", \
 	"osd", "r", "cli,rest")
 COMMAND("osd set " \
-	"name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent", \
+	"name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise", \
 	"set <key>", "osd", "rw", "cli,rest")
 COMMAND("osd unset " \
-	"name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent", \
+	"name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise", \
 	"unset <key>", "osd", "rw", "cli,rest")
 COMMAND("osd cluster_snap", "take cluster snapshot (disabled)", \
 	"osd", "r", "")
@@ -590,8 +633,9 @@ COMMAND("osd lost " \
 	"mark osd as permanently lost. THIS DESTROYS DATA IF NO MORE REPLICAS EXIST, BE CAREFUL", \
 	"osd", "rw", "cli,rest")
 COMMAND("osd create " \
-	"name=uuid,type=CephUUID,req=false", \
-	"create new osd (with optional UUID)", "osd", "rw", "cli,rest")
+	"name=uuid,type=CephUUID,req=false " \
+	"name=id,type=CephInt,range=0,req=false", \
+	"create new osd (with optional UUID and ID)", "osd", "rw", "cli,rest")
 COMMAND("osd blacklist " \
 	"name=blacklistop,type=CephChoices,strings=add|rm " \
 	"name=addr,type=CephEntityAddr " \
@@ -630,11 +674,11 @@ COMMAND("osd pool rename " \
 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
 COMMAND("osd pool get " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|write_fadvise_dontneed", \
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_ [...]
 	"get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_prom [...]
 	"name=val,type=CephString " \
 	"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
diff --git a/src/mon/MonMap.h b/src/mon/MonMap.h
index ae81b07..390d88a 100644
--- a/src/mon/MonMap.h
+++ b/src/mon/MonMap.h
@@ -159,6 +159,7 @@ class MonMap {
   entity_inst_t get_inst(const string& n) {
     assert(mon_addr.count(n));
     int m = get_rank(n);
+    assert(m >= 0); // vector can't take negative indicies
     entity_inst_t i;
     i.addr = rank_addr[m];
     i.name = entity_name_t::MON(m);
diff --git a/src/mon/MonOpRequest.h b/src/mon/MonOpRequest.h
new file mode 100644
index 0000000..e8339f6
--- /dev/null
+++ b/src/mon/MonOpRequest.h
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat <contact at redhat.com>
+ * Copyright (C) 2015 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MON_OPREQUEST_H_
+#define MON_OPREQUEST_H_
+#include <iosfwd>
+#include <stdint.h>
+
+#include "common/TrackedOp.h"
+#include "include/memory.h"
+#include "mon/Session.h"
+#include "msg/Message.h"
+
+struct MonOpRequest : public TrackedOp {
+  friend class OpTracker;
+
+  void mark_dispatch() {
+    mark_event("monitor_dispatch");
+  }
+  void mark_wait_for_quorum() {
+    mark_event("wait_for_quorum");
+  }
+  void mark_zap() {
+    mark_event("monitor_zap");
+  }
+  void mark_forwarded() {
+    mark_event("forwarded");
+    forwarded_to_leader = true;
+  }
+
+  void mark_svc_event(const string &service, const string &event) {
+    string s = service;
+    s.append(":").append(event);
+    mark_event(s);
+  }
+
+  void mark_logmon_event(const string &event) {
+    mark_svc_event("logm", event);
+  }
+  void mark_osdmon_event(const string &event) {
+    mark_svc_event("osdmap", event);
+  }
+  void mark_pgmon_event(const string &event) {
+    mark_svc_event("pgmap", event);
+  }
+  void mark_mdsmon_event(const string &event) {
+    mark_svc_event("mdsmap", event);
+  }
+  void mark_authmon_event(const string &event) {
+    mark_svc_event("auth", event);
+  }
+  void mark_paxos_event(const string &event) {
+    mark_svc_event("paxos", event);
+  }
+
+
+  enum op_type_t {
+    OP_TYPE_NONE    = 0,      ///< no type defined (default)
+    OP_TYPE_SERVICE,          ///< belongs to a Paxos Service or similar
+    OP_TYPE_MONITOR,          ///< belongs to the Monitor class
+    OP_TYPE_ELECTION,         ///< belongs to the Elector class
+    OP_TYPE_PAXOS,            ///< refers to Paxos messages
+    OP_TYPE_COMMAND,          ///< is a command
+  };
+
+private:
+  Message *request;
+  utime_t dequeued_time;
+  MonSession *session;
+  ConnectionRef con;
+  bool forwarded_to_leader;
+  op_type_t op_type;
+
+  MonOpRequest(Message *req, OpTracker *tracker) :
+    TrackedOp(tracker, req->get_recv_stamp()),
+    request(req),
+    session(NULL),
+    con(NULL),
+    forwarded_to_leader(false),
+    op_type(OP_TYPE_NONE)
+  {
+    tracker->mark_event(this, "header_read", request->get_recv_stamp());
+    tracker->mark_event(this, "throttled", request->get_throttle_stamp());
+    tracker->mark_event(this, "all_read", request->get_recv_complete_stamp());
+    tracker->mark_event(this, "dispatched", request->get_dispatch_stamp());
+
+    if (req) {
+      con = req->get_connection();
+      if (con) {
+        session = static_cast<MonSession*>(con->get_priv());
+      }
+    }
+  }
+
+  void _dump(utime_t now, Formatter *f) const {
+    {
+      f->open_array_section("events");
+      Mutex::Locker l(lock);
+      for (list<pair<utime_t,string> >::const_iterator i = events.begin();
+           i != events.end(); ++i) {
+        f->open_object_section("event");
+        f->dump_stream("time") << i->first;
+        f->dump_string("event", i->second);
+        f->close_section();
+      }
+      f->close_section();
+      f->open_object_section("info");
+      f->dump_int("seq", seq);
+      f->dump_bool("src_is_mon", is_src_mon());
+      f->dump_stream("source") << request->get_source_inst();
+      f->dump_bool("forwarded_to_leader", forwarded_to_leader);
+      f->close_section();
+    }
+  }
+
+protected:
+  void _dump_op_descriptor_unlocked(ostream& stream) const {
+    get_req()->print(stream);
+  }
+
+public:
+  ~MonOpRequest() {
+    request->put();
+    // certain ops may not have a session (e.g., AUTH or PING)
+    if (session)
+      session->put();
+  }
+
+  MonSession *get_session() const {
+    if (!session)
+      return NULL;
+    return session;
+  }
+
+  template<class T>
+  T *get_req() const { return static_cast<T*>(request); }
+
+  Message *get_req() const { return get_req<Message>(); }
+
+  int get_req_type() const {
+    if (!request)
+      return 0;
+    return request->get_type();
+  }
+
+  ConnectionRef get_connection() { return con; }
+
+  void set_session(MonSession *s) {
+    if (session) {
+      // we will be rewriting the existing session; drop the ref.
+      session->put();
+    }
+
+    if (s == NULL) {
+      session = NULL;
+    } else {
+      session = static_cast<MonSession*>(s->get());
+    }
+  }
+
+  bool is_src_mon() const {
+    return (con && con->get_peer_type() & CEPH_ENTITY_TYPE_MON);
+  }
+
+  typedef ceph::shared_ptr<MonOpRequest> Ref;
+
+  void set_op_type(op_type_t t) {
+    op_type = t;
+  }
+  void set_type_service() {
+    set_op_type(OP_TYPE_SERVICE);
+  }
+  void set_type_monitor() {
+    set_op_type(OP_TYPE_MONITOR);
+  }
+  void set_type_paxos() {
+    set_op_type(OP_TYPE_PAXOS);
+  }
+  void set_type_election() {
+    set_op_type(OP_TYPE_ELECTION);
+  }
+  void set_type_command() {
+    set_op_type(OP_TYPE_COMMAND);
+  }
+
+  op_type_t get_op_type() {
+    return op_type;
+  }
+
+  bool is_type_service() {
+    return (get_op_type() == OP_TYPE_SERVICE);
+  }
+  bool is_type_monitor() {
+    return (get_op_type() == OP_TYPE_MONITOR);
+  }
+  bool is_type_paxos() {
+    return (get_op_type() == OP_TYPE_PAXOS);
+  }
+  bool is_type_election() {
+    return (get_op_type() == OP_TYPE_ELECTION);
+  }
+  bool is_type_command() {
+    return (get_op_type() == OP_TYPE_COMMAND);
+  }
+};
+
+typedef MonOpRequest::Ref MonOpRequestRef;
+
+#endif /* MON_OPREQUEST_H_ */
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index ba2aecf..dcfd512 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -24,7 +24,6 @@
 
 #include "osd/OSDMap.h"
 
-#include "MonitorStore.h"
 #include "MonitorDBStore.h"
 
 #include "msg/Messenger.h"
@@ -37,6 +36,7 @@
 #include "messages/MGenericMessage.h"
 #include "messages/MMonCommand.h"
 #include "messages/MMonCommandAck.h"
+#include "messages/MMonMetadata.h"
 #include "messages/MMonSync.h"
 #include "messages/MMonScrub.h"
 #include "messages/MMonProbe.h"
@@ -96,13 +96,15 @@ const string Monitor::MONITOR_NAME = "monitor";
 const string Monitor::MONITOR_STORE_PREFIX = "monitor_store";
 
 
+#undef FLAG
 #undef COMMAND
 #undef COMMAND_WITH_FLAG
 MonCommand mon_commands[] = {
+#define FLAG(f) (MonCommand::FLAG_##f)
 #define COMMAND(parsesig, helptext, modulename, req_perms, avail)	\
-  {parsesig, helptext, modulename, req_perms, avail, 0},
-#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, avail, flag) \
-  {parsesig, helptext, modulename, req_perms, avail, MonCommand::FLAG_##flag},
+  {parsesig, helptext, modulename, req_perms, avail, FLAG(NONE)},
+#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, avail, flags) \
+  {parsesig, helptext, modulename, req_perms, avail, flags},
 #include <mon/MonCommands.h>
 };
 #undef COMMAND
@@ -166,7 +168,10 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
   required_features(0),
   leader(0),
   quorum_features(0),
+  // scrub
   scrub_version(0),
+  scrub_event(NULL),
+  scrub_timeout_event(NULL),
 
   // sync state
   sync_provider_count(0),
@@ -186,7 +191,8 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
   admin_hook(NULL),
   health_tick_event(NULL),
   health_interval_event(NULL),
-  routed_request_tid(0)
+  routed_request_tid(0),
+  op_tracker(cct, true, 1)
 {
   rank = -1;
 
@@ -199,7 +205,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
 
   paxos_service[PAXOS_MDSMAP] = new MDSMonitor(this, paxos, "mdsmap");
   paxos_service[PAXOS_MONMAP] = new MonmapMonitor(this, paxos, "monmap");
-  paxos_service[PAXOS_OSDMAP] = new OSDMonitor(this, paxos, "osdmap");
+  paxos_service[PAXOS_OSDMAP] = new OSDMonitor(cct, this, paxos, "osdmap");
   paxos_service[PAXOS_PGMAP] = new PGMonitor(this, paxos, "pgmap");
   paxos_service[PAXOS_LOG] = new LogMonitor(this, paxos, "logm");
   paxos_service[PAXOS_AUTH] = new AuthMonitor(this, paxos, "auth");
@@ -288,10 +294,10 @@ void Monitor::do_admin_command(string command, cmdmap_t& cmdmap, string format,
   }
   args = "[" + args + "]";
  
-  bool read_only = false;
-  if (command == "mon_status" || command == "quorum_status") {
-    read_only = true;
-  }
+  bool read_only = (command == "mon_status" ||
+                    command == "mon metadata" ||
+                    command == "quorum_status" ||
+                    command == "ops");
 
   (read_only ? audit_clog->debug() : audit_clog->info())
     << "from='admin socket' entity='admin socket' "
@@ -324,6 +330,11 @@ void Monitor::do_admin_command(string command, cmdmap_t& cmdmap, string format,
     start_election();
     elector.stop_participating();
     ss << "stopped responding to quorum, initiated new election";
+  } else if (command == "ops") {
+    op_tracker.dump_ops_in_flight(f.get());
+    if (f) {
+      f->flush(ss);
+    }
   } else {
     assert(0 == "bad AdminSocket command binding");
   }
@@ -366,6 +377,7 @@ CompatSet Monitor::get_supported_features()
   compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES);
   compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC);
   compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2);
+  compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3);
   return compat;
 }
 
@@ -439,8 +451,9 @@ const char** Monitor::get_tracked_conf_keys() const
   static const char* KEYS[] = {
     "crushtool", // helpful for testing
     "mon_lease",
-    "mon_lease_renew_interval",
-    "mon_lease_ack_timeout",
+    "mon_lease_renew_interval_factor",
+    "mon_lease_ack_timeout_factor",
+    "mon_accept_timeout_factor",
     // clog & admin clog
     "clog_to_monitors",
     "clog_to_syslog",
@@ -450,6 +463,8 @@ const char** Monitor::get_tracked_conf_keys() const
     "mon_health_to_clog",
     "mon_health_to_clog_interval",
     "mon_health_to_clog_tick_interval",
+    // scrub interval
+    "mon_scrub_interval",
     NULL
   };
   return KEYS;
@@ -474,6 +489,10 @@ void Monitor::handle_conf_change(const struct md_config_t *conf,
       changed.count("mon_health_to_clog_tick_interval")) {
     health_to_clog_update_conf(changed);
   }
+
+  if (changed.count("mon_scrub_interval")) {
+    scrub_update_interval(conf->mon_scrub_interval);
+  }
 }
 
 void Monitor::update_log_clients()
@@ -499,11 +518,10 @@ int Monitor::sanitize_options()
 
   // mon_lease must be greater than mon_lease_renewal; otherwise we
   // may incur in leases expiring before they are renewed.
-  if (g_conf->mon_lease <= g_conf->mon_lease_renew_interval) {
-    clog->error() << "mon_lease (" << g_conf->mon_lease
-                 << ") must be greater "
-                 << "than mon_lease_renew_interval ("
-                 << g_conf->mon_lease_renew_interval << ")";
+  if (g_conf->mon_lease_renew_interval_factor >= 1.0) {
+    clog->error() << "mon_lease_renew_interval_factor ("
+		  << g_conf->mon_lease_renew_interval_factor
+		  << ") must be less than 1.0";
     r = -EINVAL;
   }
 
@@ -512,13 +530,13 @@ int Monitor::sanitize_options()
   // with the same value, for a given small vale, could mean timing out if
   // the monitors happened to be overloaded -- or even under normal load for
   // a small enough value.
-  if (g_conf->mon_lease_ack_timeout <= g_conf->mon_lease) {
-    clog->error() << "mon_lease_ack_timeout ("
-                 << g_conf->mon_lease_ack_timeout
-                 << ") must be greater than mon_lease ("
-                 << g_conf->mon_lease << ")";
+  if (g_conf->mon_lease_ack_timeout_factor <= 1.0) {
+    clog->error() << "mon_lease_ack_timeout_factor ("
+		  << g_conf->mon_lease_ack_timeout_factor
+		  << ") must be greater than 1.0";
     r = -EINVAL;
   }
+
   return r;
 }
 
@@ -537,14 +555,14 @@ int Monitor::preinit()
   assert(!logger);
   {
     PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last);
-    pcb.add_u64(l_mon_num_sessions, "num_sessions");
-    pcb.add_u64_counter(l_mon_session_add, "session_add");
-    pcb.add_u64_counter(l_mon_session_rm, "session_rm");
-    pcb.add_u64_counter(l_mon_session_trim, "session_trim");
-    pcb.add_u64_counter(l_mon_num_elections, "num_elections");
-    pcb.add_u64_counter(l_mon_election_call, "election_call");
-    pcb.add_u64_counter(l_mon_election_win, "election_win");
-    pcb.add_u64_counter(l_mon_election_lose, "election_lose");
+    pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess");
+    pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions", "sadd");
+    pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions", "srm");
+    pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions");
+    pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in");
+    pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started");
+    pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won");
+    pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost");
     logger = pcb.create_perf_counters();
     cct->get_perfcounters_collection()->add(logger);
   }
@@ -552,29 +570,29 @@ int Monitor::preinit()
   assert(!cluster_logger);
   {
     PerfCountersBuilder pcb(g_ceph_context, "cluster", l_cluster_first, l_cluster_last);
-    pcb.add_u64(l_cluster_num_mon, "num_mon");
-    pcb.add_u64(l_cluster_num_mon_quorum, "num_mon_quorum");
-    pcb.add_u64(l_cluster_num_osd, "num_osd");
-    pcb.add_u64(l_cluster_num_osd_up, "num_osd_up");
-    pcb.add_u64(l_cluster_num_osd_in, "num_osd_in");
-    pcb.add_u64(l_cluster_osd_epoch, "osd_epoch");
-    pcb.add_u64(l_cluster_osd_bytes, "osd_bytes");
-    pcb.add_u64(l_cluster_osd_bytes_used, "osd_bytes_used");
-    pcb.add_u64(l_cluster_osd_bytes_avail, "osd_bytes_avail");
-    pcb.add_u64(l_cluster_num_pool, "num_pool");
-    pcb.add_u64(l_cluster_num_pg, "num_pg");
-    pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean");
-    pcb.add_u64(l_cluster_num_pg_active, "num_pg_active");
-    pcb.add_u64(l_cluster_num_pg_peering, "num_pg_peering");
-    pcb.add_u64(l_cluster_num_object, "num_object");
-    pcb.add_u64(l_cluster_num_object_degraded, "num_object_degraded");
-    pcb.add_u64(l_cluster_num_object_misplaced, "num_object_misplaced");
-    pcb.add_u64(l_cluster_num_object_unfound, "num_object_unfound");
-    pcb.add_u64(l_cluster_num_bytes, "num_bytes");
-    pcb.add_u64(l_cluster_num_mds_up, "num_mds_up");
-    pcb.add_u64(l_cluster_num_mds_in, "num_mds_in");
-    pcb.add_u64(l_cluster_num_mds_failed, "num_mds_failed");
-    pcb.add_u64(l_cluster_mds_epoch, "mds_epoch");
+    pcb.add_u64(l_cluster_num_mon, "num_mon", "Monitors");
+    pcb.add_u64(l_cluster_num_mon_quorum, "num_mon_quorum", "Monitors in quorum");
+    pcb.add_u64(l_cluster_num_osd, "num_osd", "OSDs");
+    pcb.add_u64(l_cluster_num_osd_up, "num_osd_up", "OSDs that are up");
+    pcb.add_u64(l_cluster_num_osd_in, "num_osd_in", "OSD in state \"in\" (they are in cluster)");
+    pcb.add_u64(l_cluster_osd_epoch, "osd_epoch", "Current epoch of OSD map");
+    pcb.add_u64(l_cluster_osd_bytes, "osd_bytes", "Total capacity of cluster");
+    pcb.add_u64(l_cluster_osd_bytes_used, "osd_bytes_used", "Used space");
+    pcb.add_u64(l_cluster_osd_bytes_avail, "osd_bytes_avail", "Available space");
+    pcb.add_u64(l_cluster_num_pool, "num_pool", "Pools");
+    pcb.add_u64(l_cluster_num_pg, "num_pg", "Placement groups");
+    pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean", "Placement groups in active+clean state");
+    pcb.add_u64(l_cluster_num_pg_active, "num_pg_active", "Placement groups in active state");
+    pcb.add_u64(l_cluster_num_pg_peering, "num_pg_peering", "Placement groups in peering state");
+    pcb.add_u64(l_cluster_num_object, "num_object", "Objects");
+    pcb.add_u64(l_cluster_num_object_degraded, "num_object_degraded", "Degraded (missing replicas) objects");
+    pcb.add_u64(l_cluster_num_object_misplaced, "num_object_misplaced", "Misplaced (wrong location in the cluster) objects");
+    pcb.add_u64(l_cluster_num_object_unfound, "num_object_unfound", "Unfound objects");
+    pcb.add_u64(l_cluster_num_bytes, "num_bytes", "Size of all objects");
+    pcb.add_u64(l_cluster_num_mds_up, "num_mds_up", "MDSs that are up");
+    pcb.add_u64(l_cluster_num_mds_in, "num_mds_in", "MDS in state \"in\" (they are in cluster)");
+    pcb.add_u64(l_cluster_num_mds_failed, "num_mds_failed", "Failed MDS");
+    pcb.add_u64(l_cluster_mds_epoch, "mds_epoch", "Current epoch of MDS map");
     cluster_logger = pcb.create_perf_counters();
   }
 
@@ -718,6 +736,11 @@ int Monitor::preinit()
                                      admin_hook,
                                      "force monitor out of the quorum");
   assert(r == 0);
+  r = admin_socket->register_command("ops",
+                                     "ops",
+                                     admin_hook,
+                                     "show the ops currently in flight");
+  assert(r == 0);
   lock.Lock();
 
   // add ourselves as a conf observer
@@ -739,6 +762,7 @@ int Monitor::init()
   // i'm ready!
   messenger->add_dispatcher_tail(this);
 
+
   bootstrap();
 
   // encode command sets
@@ -829,12 +853,15 @@ void Monitor::shutdown()
 
   state = STATE_SHUTDOWN;
 
+  g_conf->remove_observer(this);
+
   if (admin_hook) {
     AdminSocket* admin_socket = cct->get_admin_socket();
     admin_socket->unregister_command("mon_status");
     admin_socket->unregister_command("quorum_status");
     admin_socket->unregister_command("sync_force");
     admin_socket->unregister_command("add_bootstrap_peer_hint");
+    admin_socket->unregister_command("ops");
     delete admin_hook;
     admin_hook = NULL;
   }
@@ -994,6 +1021,7 @@ void Monitor::_reset()
   cancel_probe_timeout();
   timecheck_finish();
   health_events_cleanup();
+  scrub_event_cancel();
 
   leader_since = utime_t();
   if (!quorum.empty()) {
@@ -1214,8 +1242,9 @@ void Monitor::sync_finish(version_t last_committed)
   bootstrap();
 }
 
-void Monitor::handle_sync(MMonSync *m)
+void Monitor::handle_sync(MonOpRequestRef op)
 {
+  MMonSync *m = static_cast<MMonSync*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   switch (m->op) {
 
@@ -1223,45 +1252,46 @@ void Monitor::handle_sync(MMonSync *m)
 
   case MMonSync::OP_GET_COOKIE_FULL:
   case MMonSync::OP_GET_COOKIE_RECENT:
-    handle_sync_get_cookie(m);
+    handle_sync_get_cookie(op);
     break;
   case MMonSync::OP_GET_CHUNK:
-    handle_sync_get_chunk(m);
+    handle_sync_get_chunk(op);
     break;
 
     // client -----------
 
   case MMonSync::OP_COOKIE:
-    handle_sync_cookie(m);
+    handle_sync_cookie(op);
     break;
 
   case MMonSync::OP_CHUNK:
   case MMonSync::OP_LAST_CHUNK:
-    handle_sync_chunk(m);
+    handle_sync_chunk(op);
     break;
   case MMonSync::OP_NO_COOKIE:
-    handle_sync_no_cookie(m);
+    handle_sync_no_cookie(op);
     break;
 
   default:
     dout(0) << __func__ << " unknown op " << m->op << dendl;
     assert(0 == "unknown op");
   }
-  m->put();
 }
 
 // leader
 
-void Monitor::_sync_reply_no_cookie(MMonSync *m)
+void Monitor::_sync_reply_no_cookie(MonOpRequestRef op)
 {
+  MMonSync *m = static_cast<MMonSync*>(op->get_req());
   MMonSync *reply = new MMonSync(MMonSync::OP_NO_COOKIE, m->cookie);
   m->get_connection()->send_message(reply);
 }
 
-void Monitor::handle_sync_get_cookie(MMonSync *m)
+void Monitor::handle_sync_get_cookie(MonOpRequestRef op)
 {
+  MMonSync *m = static_cast<MMonSync*>(op->get_req());
   if (is_synchronizing()) {
-    _sync_reply_no_cookie(m);
+    _sync_reply_no_cookie(op);
     return;
   }
 
@@ -1310,13 +1340,14 @@ void Monitor::handle_sync_get_cookie(MMonSync *m)
   m->get_connection()->send_message(reply);
 }
 
-void Monitor::handle_sync_get_chunk(MMonSync *m)
+void Monitor::handle_sync_get_chunk(MonOpRequestRef op)
 {
+  MMonSync *m = static_cast<MMonSync*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
 
   if (sync_providers.count(m->cookie) == 0) {
     dout(10) << __func__ << " no cookie " << m->cookie << dendl;
-    _sync_reply_no_cookie(m);
+    _sync_reply_no_cookie(op);
     return;
   }
 
@@ -1330,7 +1361,7 @@ void Monitor::handle_sync_get_chunk(MMonSync *m)
     dout(10) << __func__ << " sync requester fell behind paxos, their lc " << sp.last_committed
 	     << " < our fc " << paxos->get_first_committed() << dendl;
     sync_providers.erase(m->cookie);
-    _sync_reply_no_cookie(m);
+    _sync_reply_no_cookie(op);
     return;
   }
 
@@ -1377,8 +1408,9 @@ void Monitor::handle_sync_get_chunk(MMonSync *m)
 
 // requester
 
-void Monitor::handle_sync_cookie(MMonSync *m)
+void Monitor::handle_sync_cookie(MonOpRequestRef op)
 {
+  MMonSync *m = static_cast<MMonSync*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   if (sync_cookie) {
     dout(10) << __func__ << " already have a cookie, ignoring" << dendl;
@@ -1410,8 +1442,9 @@ void Monitor::sync_get_next_chunk()
   assert(g_conf->mon_sync_requester_kill_at != 4);
 }
 
-void Monitor::handle_sync_chunk(MMonSync *m)
+void Monitor::handle_sync_chunk(MonOpRequestRef op)
 {
+  MMonSync *m = static_cast<MMonSync*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
 
   if (m->cookie != sync_cookie) {
@@ -1464,7 +1497,7 @@ void Monitor::handle_sync_chunk(MMonSync *m)
   }
 }
 
-void Monitor::handle_sync_no_cookie(MMonSync *m)
+void Monitor::handle_sync_no_cookie(MonOpRequestRef op)
 {
   dout(10) << __func__ << dendl;
   bootstrap();
@@ -1518,23 +1551,23 @@ void Monitor::probe_timeout(int r)
   bootstrap();
 }
 
-void Monitor::handle_probe(MMonProbe *m)
+void Monitor::handle_probe(MonOpRequestRef op)
 {
+  MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
   dout(10) << "handle_probe " << *m << dendl;
 
   if (m->fsid != monmap->fsid) {
     dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl;
-    m->put();
     return;
   }
 
   switch (m->op) {
   case MMonProbe::OP_PROBE:
-    handle_probe_probe(m);
+    handle_probe_probe(op);
     break;
 
   case MMonProbe::OP_REPLY:
-    handle_probe_reply(m);
+    handle_probe_reply(op);
     break;
 
   case MMonProbe::OP_MISSING_FEATURES:
@@ -1543,17 +1576,15 @@ void Monitor::handle_probe(MMonProbe *m)
 	 << ", missing " << (required_features & ~CEPH_FEATURES_ALL)
 	 << dendl;
     break;
-
-  default:
-    m->put();
   }
 }
 
 /**
  * @todo fix this. This is going to cause trouble.
  */
-void Monitor::handle_probe_probe(MMonProbe *m)
+void Monitor::handle_probe_probe(MonOpRequestRef op)
 {
+  MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
   MMonProbe *r;
 
   dout(10) << "handle_probe_probe " << m->get_source_inst() << *m
@@ -1603,17 +1634,17 @@ void Monitor::handle_probe_probe(MMonProbe *m)
   }
 
  out:
-  m->put();
+  return;
 }
 
-void Monitor::handle_probe_reply(MMonProbe *m)
+void Monitor::handle_probe_reply(MonOpRequestRef op)
 {
+  MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
   dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl;
   dout(10) << " monmap is " << *monmap << dendl;
 
   // discover name and addrs during probing or electing states.
   if (!is_probing() && !is_electing()) {
-    m->put();
     return;
   }
 
@@ -1631,7 +1662,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
 	       << ", mine was " << monmap->get_epoch() << dendl;
       delete newmap;
       monmap->decode(m->monmap_bl);
-      m->put();
 
       bootstrap();
       return;
@@ -1648,7 +1678,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
     monmap->rename(peer_name, m->name);
 
     if (is_electing()) {
-      m->put();
       bootstrap();
       return;
     }
@@ -1662,7 +1691,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
       monmap->get_addr(m->name).is_blank_ip()) {
     dout(1) << " learned initial mon " << m->name << " addr " << m->get_source_addr() << dendl;
     monmap->set_addr(m->name, m->get_source_addr());
-    m->put();
 
     bootstrap();
     return;
@@ -1670,7 +1698,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
 
   // end discover phase
   if (!is_probing()) {
-    m->put();
     return;
   }
 
@@ -1678,7 +1705,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
 
   if (is_synchronizing()) {
     dout(10) << " currently syncing" << dendl;
-    m->put();
     return;
   }
 
@@ -1699,7 +1725,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
 	       << dendl;
       cancel_probe_timeout();
       sync_start(other, true);
-      m->put();
       return;
     }
     if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) {
@@ -1709,7 +1734,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
 	       << dendl;
       cancel_probe_timeout();
       sync_start(other, false);
-      m->put();
       return;
     }
   }
@@ -1738,7 +1762,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
       outside_quorum.insert(m->name);
     } else {
       dout(10) << " mostly ignoring mon." << m->name << ", not part of monmap" << dendl;
-      m->put();
       return;
     }
 
@@ -1755,7 +1778,6 @@ void Monitor::handle_probe_reply(MMonProbe *m)
       dout(10) << " that's not yet enough for a new quorum, waiting" << dendl;
     }
   }
-  m->put();
 }
 
 void Monitor::join_election()
@@ -1857,7 +1879,9 @@ void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,
     timecheck_start();
     health_tick_start();
     do_health_to_clog_interval();
+    scrub_event_start();
   }
+  collect_sys_info(&metadata[rank], g_ceph_context);
 }
 
 void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, uint64_t features) 
@@ -1876,9 +1900,16 @@ void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, uint64_t features
     (*p)->election_finished();
   health_monitor->start(epoch);
 
-  logger->inc(l_mon_election_win);
+  logger->inc(l_mon_election_lose);
 
   finish_election();
+
+  if (quorum_features & CEPH_FEATURE_MON_METADATA) {
+    Metadata sys_info;
+    collect_sys_info(&sys_info, g_ceph_context);
+    messenger->send_message(new MMonMetadata(sys_info),
+			    monmap->get_inst(get_leader()));
+  }
 }
 
 void Monitor::finish_election()
@@ -1913,6 +1944,9 @@ void Monitor::apply_quorum_to_compatset_features()
   if (quorum_features & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) {
     new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2);
   }
+  if (quorum_features & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) {
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3);
+  }
   if (new_features.compare(features) != 0) {
     CompatSet diff = features.unsupported(new_features);
     dout(1) << __func__ << " enabling new quorum features: " << diff << dendl;
@@ -1938,6 +1972,9 @@ void Monitor::apply_compatset_features_to_quorum_requirements()
   if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2)) {
     required_features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
   }
+  if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3)) {
+    required_features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3;
+  }
   dout(10) << __func__ << " required_features " << required_features << dendl;
 }
 
@@ -2521,24 +2558,22 @@ bool Monitor::is_keyring_required()
     auth_cluster_required == "cephx";
 }
 
-void Monitor::handle_command(MMonCommand *m)
+void Monitor::handle_command(MonOpRequestRef op)
 {
+  assert(op->is_type_command());
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   if (m->fsid != monmap->fsid) {
     dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid << dendl;
-    reply_command(m, -EPERM, "wrong fsid", 0);
+    reply_command(op, -EPERM, "wrong fsid", 0);
     return;
   }
 
   MonSession *session = m->get_session();
-  if (!session) {
-    string rs = "Access denied";
-    reply_command(m, -EACCES, rs, 0);
-    return;
-  }
+  assert(session);
 
   if (m->cmd.empty()) {
     string rs = "No command supplied";
-    reply_command(m, -EINVAL, rs, 0);
+    reply_command(op, -EINVAL, rs, 0);
     return;
   }
 
@@ -2556,9 +2591,7 @@ void Monitor::handle_command(MMonCommand *m)
     r = -EINVAL;
     rs = ss.str();
     if (!m->get_source().is_mon())  // don't reply to mon->mon commands
-      reply_command(m, r, rs, 0);
-    else
-      m->put();
+      reply_command(op, r, rs, 0);
     return;
   }
 
@@ -2569,7 +2602,7 @@ void Monitor::handle_command(MMonCommand *m)
     format_command_descriptions(leader_supported_mon_commands,
 				leader_supported_mon_commands_size, f, &rdata);
     delete f;
-    reply_command(m, 0, "", rdata, 0);
+    reply_command(op, 0, "", rdata, 0);
     return;
   }
 
@@ -2593,7 +2626,7 @@ void Monitor::handle_command(MMonCommand *m)
                                const_cast<MonCommand*>(leader_supported_mon_commands),
                                leader_supported_mon_commands_size);
   if (!leader_cmd) {
-    reply_command(m, -EINVAL, "command not known", 0);
+    reply_command(op, -EINVAL, "command not known", 0);
     return;
   }
   // validate command is in our map & matches, or forward if it is allowed
@@ -2601,33 +2634,42 @@ void Monitor::handle_command(MMonCommand *m)
                                               ARRAY_SIZE(mon_commands));
   if (!is_leader()) {
     if (!mon_cmd) {
-      if (leader_cmd->has_flag(MonCommand::FLAG_NOFORWARD)) {
-	reply_command(m, -EINVAL,
+      if (leader_cmd->is_noforward()) {
+	reply_command(op, -EINVAL,
 		      "command not locally supported and not allowed to forward",
 		      0);
 	return;
       }
       dout(10) << "Command not locally supported, forwarding request "
 	       << m << dendl;
-      forward_request_leader(m);
+      forward_request_leader(op);
       return;
     } else if (!mon_cmd->is_compat(leader_cmd)) {
-      if (mon_cmd->has_flag(MonCommand::FLAG_NOFORWARD)) {
-	reply_command(m, -EINVAL,
+      if (mon_cmd->is_noforward()) {
+	reply_command(op, -EINVAL,
 		      "command not compatible with leader and not allowed to forward",
 		      0);
 	return;
       }
       dout(10) << "Command not compatible with leader, forwarding request "
 	       << m << dendl;
-      forward_request_leader(m);
+      forward_request_leader(op);
       return;
     }
   }
 
-  if (session->proxy_con && mon_cmd->has_flag(MonCommand::FLAG_NOFORWARD)) {
+  if (mon_cmd->is_obsolete() ||
+      (cct->_conf->mon_debug_deprecated_as_obsolete
+       && mon_cmd->is_deprecated())) {
+    reply_command(op, -ENOTSUP,
+                  "command is obsolete; please check usage and/or man page",
+                  0);
+    return;
+  }
+
+  if (session->proxy_con && mon_cmd->is_noforward()) {
     dout(10) << "Got forward for noforward command " << m << dendl;
-    reply_command(m, -EINVAL, "forward for noforward command", rdata, 0);
+    reply_command(op, -EINVAL, "forward for noforward command", rdata, 0);
     return;
   }
 
@@ -2651,7 +2693,7 @@ void Monitor::handle_command(MMonCommand *m)
       << "from='" << session->inst << "' "
       << "entity='" << session->entity_name << "' "
       << "cmd=" << m->cmd << ":  access denied";
-    reply_command(m, -EACCES, "access denied", 0);
+    reply_command(op, -EACCES, "access denied", 0);
     return;
   }
 
@@ -2661,33 +2703,42 @@ void Monitor::handle_command(MMonCommand *m)
     << "cmd=" << m->cmd << ": dispatch";
 
   if (module == "mds" || module == "fs") {
-    mdsmon()->dispatch(m);
+    mdsmon()->dispatch(op);
     return;
   }
   if (module == "osd") {
-    osdmon()->dispatch(m);
+    osdmon()->dispatch(op);
     return;
   }
 
   if (module == "pg") {
-    pgmon()->dispatch(m);
+    pgmon()->dispatch(op);
     return;
   }
-  if (module == "mon") {
-    monmon()->dispatch(m);
+  if (module == "mon" &&
+      /* Let the Monitor class handle the following commands:
+       *  'mon compact'
+       *  'mon scrub'
+       *  'mon sync force'
+       */
+      prefix != "mon compact" &&
+      prefix != "mon scrub" &&
+      prefix != "mon sync force" &&
+      prefix != "mon metadata") {
+    monmon()->dispatch(op);
     return;
   }
   if (module == "auth") {
-    authmon()->dispatch(m);
+    authmon()->dispatch(op);
     return;
   }
   if (module == "log") {
-    logmon()->dispatch(m);
+    logmon()->dispatch(op);
     return;
   }
 
   if (module == "config-key") {
-    config_key_service->dispatch(m);
+    config_key_service->dispatch(op);
     return;
   }
 
@@ -2701,24 +2752,24 @@ void Monitor::handle_command(MMonCommand *m)
       ds << monmap->fsid;
       rdata.append(ds);
     }
-    reply_command(m, 0, "", rdata, 0);
+    reply_command(op, 0, "", rdata, 0);
     return;
   }
 
-  if (prefix == "scrub") {
+  if (prefix == "scrub" || prefix == "mon scrub") {
     wait_for_paxos_write();
     if (is_leader()) {
-      int r = scrub();
-      reply_command(m, r, "", rdata, 0);
+      int r = scrub_start();
+      reply_command(op, r, "", rdata, 0);
     } else if (is_peon()) {
-      forward_request_leader(m);
+      forward_request_leader(op);
     } else {
-      reply_command(m, -EAGAIN, "no quorum", rdata, 0);
+      reply_command(op, -EAGAIN, "no quorum", rdata, 0);
     }
     return;
   }
 
-  if (prefix == "compact") {
+  if (prefix == "compact" || prefix == "mon compact") {
     dout(1) << "triggering manual compaction" << dendl;
     utime_t start = ceph_clock_now(g_ceph_context);
     store->compact();
@@ -2838,11 +2889,50 @@ void Monitor::handle_command(MMonCommand *m)
     ss2 << "report " << rdata.crc32c(6789);
     rs = ss2.str();
     r = 0;
+  } else if (prefix == "node ls") {
+    string node_type("all");
+    cmd_getval(g_ceph_context, cmdmap, "type", node_type);
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    if (node_type == "all") {
+      f->open_object_section("nodes");
+      print_nodes(f.get(), ds);
+      osdmon()->print_nodes(f.get());
+      mdsmon()->print_nodes(f.get());
+      f->close_section();
+    } else if (node_type == "mon") {
+      print_nodes(f.get(), ds);
+    } else if (node_type == "osd") {
+      osdmon()->print_nodes(f.get());
+    } else if (node_type == "mds") {
+      mdsmon()->print_nodes(f.get());
+    }
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon metadata") {
+    string name;
+    cmd_getval(g_ceph_context, cmdmap, "id", name);
+    int mon = monmap->get_rank(name);
+    if (mon < 0) {
+      rs = "requested mon not found";
+      r = -ENOENT;
+      goto out;
+    }
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    f->open_object_section("mon_metadata");
+    r = get_mon_metadata(mon, f.get(), ds);
+    f->close_section();
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
   } else if (prefix == "quorum_status") {
     // make sure our map is readable and up to date
     if (!is_leader() && !is_peon()) {
       dout(10) << " waiting for quorum" << dendl;
-      waitfor_quorum.push_back(new C_RetryMessage(this, m));
+      waitfor_quorum.push_back(new C_RetryMessage(this, op));
       return;
     }
     _quorum_status(f.get(), ds);
@@ -2856,7 +2946,8 @@ void Monitor::handle_command(MMonCommand *m)
     rdata.append(ds);
     rs = "";
     r = 0;
-  } else if (prefix == "sync force") {
+  } else if (prefix == "sync force" ||
+             prefix == "mon sync force") {
     string validate1, validate2;
     cmd_getval(g_ceph_context, cmdmap, "validate1", validate1);
     cmd_getval(g_ceph_context, cmdmap, "validate2", validate2);
@@ -2918,24 +3009,24 @@ void Monitor::handle_command(MMonCommand *m)
 
  out:
   if (!m->get_source().is_mon())  // don't reply to mon->mon commands
-    reply_command(m, r, rs, rdata, 0);
-  else
-    m->put();
+    reply_command(op, r, rs, rdata, 0);
 }
 
-void Monitor::reply_command(MMonCommand *m, int rc, const string &rs, version_t version)
+void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version)
 {
   bufferlist rdata;
-  reply_command(m, rc, rs, rdata, version);
+  reply_command(op, rc, rs, rdata, version);
 }
 
-void Monitor::reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version)
+void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs,
+                            bufferlist& rdata, version_t version)
 {
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
+  assert(m->get_type() == MSG_MON_COMMAND);
   MMonCommandAck *reply = new MMonCommandAck(m->cmd, rc, rs, version);
   reply->set_tid(m->get_tid());
   reply->set_data(rdata);
-  send_reply(m, reply);
-  m->put();
+  send_reply(op, reply);
 }
 
 
@@ -2947,19 +3038,19 @@ void Monitor::reply_command(MMonCommand *m, int rc, const string &rs, bufferlist
 // back via the correct monitor and back to them.  (the monitor will not
 // initiate any connections.)
 
-void Monitor::forward_request_leader(PaxosServiceMessage *req)
+void Monitor::forward_request_leader(MonOpRequestRef op)
 {
+  op->mark_event(__func__);
+
   int mon = get_leader();
-  MonSession *session = 0;
-  if (req->get_connection())
-    session = static_cast<MonSession *>(req->get_connection()->get_priv());
+  MonSession *session = op->get_session();
+  PaxosServiceMessage *req = op->get_req<PaxosServiceMessage>();
+  
   if (req->get_source().is_mon() && req->get_source_addr() != messenger->get_myaddr()) {
     dout(10) << "forward_request won't forward (non-local) mon request " << *req << dendl;
-    req->put();
-  } else if (session && session->proxy_con) {
+  } else if (session->proxy_con) {
     dout(10) << "forward_request won't double fwd request " << *req << dendl;
-    req->put();
-  } else if (session && !session->closed) {
+  } else if (!session->closed) {
     RoutedRequest *rr = new RoutedRequest;
     rr->tid = ++routed_request_tid;
     rr->client_inst = req->get_source_inst();
@@ -2967,13 +3058,15 @@ void Monitor::forward_request_leader(PaxosServiceMessage *req)
     rr->con_features = rr->con->get_features();
     encode_message(req, CEPH_FEATURES_ALL, rr->request_bl);   // for my use only; use all features
     rr->session = static_cast<MonSession *>(session->get());
+    rr->op = op;
     routed_requests[rr->tid] = rr;
     session->routed_request_tids.insert(rr->tid);
     
     dout(10) << "forward_request " << rr->tid << " request " << *req
 	     << " features " << rr->con_features << dendl;
 
-    MForward *forward = new MForward(rr->tid, req,
+    MForward *forward = new MForward(rr->tid,
+                                     req,
 				     rr->con_features,
 				     rr->session->caps);
     forward->set_priority(req->get_priority());
@@ -2983,12 +3076,11 @@ void Monitor::forward_request_leader(PaxosServiceMessage *req)
       forward->entity_name.set_type(CEPH_ENTITY_TYPE_MON);
     }
     messenger->send_message(forward, monmap->get_inst(mon));
+    op->mark_forwarded();
+    assert(op->get_req()->get_type() != 0);
   } else {
     dout(10) << "forward_request no session for request " << *req << dendl;
-    req->put();
   }
-  if (session)
-    session->put();
 }
 
 // fake connection attached to forwarded messages
@@ -3011,11 +3103,12 @@ struct AnonConnection : public Connection {
 };
 
 //extract the original message and put it into the regular dispatch function
-void Monitor::handle_forward(MForward *m)
+void Monitor::handle_forward(MonOpRequestRef op)
 {
+  MForward *m = static_cast<MForward*>(op->get_req());
   dout(10) << "received forwarded message from " << m->client
 	   << " via " << m->get_source_inst() << dendl;
-  MonSession *session = static_cast<MonSession *>(m->get_connection()->get_priv());
+  MonSession *session = op->get_session();
   assert(session);
 
   if (!session->is_capable("mon", MON_CAP_X)) {
@@ -3024,8 +3117,11 @@ void Monitor::handle_forward(MForward *m)
   } else {
     // see PaxosService::dispatch(); we rely on this being anon
     // (c->msgr == NULL)
+    PaxosServiceMessage *req = m->claim_message();
+    assert(req != NULL);
+
     ConnectionRef c(new AnonConnection(cct));
-    MonSession *s = new MonSession(m->msg->get_source_inst(),
+    MonSession *s = new MonSession(req->get_source_inst(),
 				   static_cast<Connection*>(c.get()));
     c->set_priv(s->get());
     c->set_peer_addr(m->client.addr);
@@ -3040,8 +3136,6 @@ void Monitor::handle_forward(MForward *m)
     s->proxy_con = m->get_connection();
     s->proxy_tid = m->tid;
 
-    PaxosServiceMessage *req = m->msg;
-    m->msg = NULL;  // so ~MForward doesn't delete it
     req->set_connection(c);
 
     // not super accurate, but better than nothing.
@@ -3068,8 +3162,6 @@ void Monitor::handle_forward(MForward *m)
     _ms_dispatch(req);
     s->put();
   }
-  session->put();
-  m->put();
 }
 
 void Monitor::try_send_message(Message *m, const entity_inst_t& to)
@@ -3087,66 +3179,80 @@ void Monitor::try_send_message(Message *m, const entity_inst_t& to)
   }
 }
 
-void Monitor::send_reply(PaxosServiceMessage *req, Message *reply)
+void Monitor::send_reply(MonOpRequestRef op, Message *reply)
 {
-  ConnectionRef connection = req->get_connection();
-  if (!connection) {
+  op->mark_event(__func__);
+
+  MonSession *session = op->get_session();
+  assert(session);
+  Message *req = op->get_req();
+  ConnectionRef con = op->get_connection();
+
+  reply->set_cct(g_ceph_context);
+  dout(2) << __func__ << " " << op << " " << reply << " " << *reply << dendl;
+
+  if (!con) {
     dout(2) << "send_reply no connection, dropping reply " << *reply
 	    << " to " << req << " " << *req << dendl;
     reply->put();
+    op->mark_event("reply: no connection");
     return;
   }
-  MonSession *session = static_cast<MonSession*>(connection->get_priv());
-  if (!session) {
-    dout(2) << "send_reply no session, dropping reply " << *reply
+
+  if (!session->con && !session->proxy_con) {
+    dout(2) << "send_reply no connection, dropping reply " << *reply
 	    << " to " << req << " " << *req << dendl;
     reply->put();
+    op->mark_event("reply: no connection");
     return;
   }
+
   if (session->proxy_con) {
-    dout(15) << "send_reply routing reply to " << req->get_connection()->get_peer_addr()
+    dout(15) << "send_reply routing reply to " << con->get_peer_addr()
 	     << " via " << session->proxy_con->get_peer_addr()
 	     << " for request " << *req << dendl;
     session->proxy_con->send_message(new MRoute(session->proxy_tid, reply));
+    op->mark_event("reply: send routed request");
   } else {
     session->con->send_message(reply);
+    op->mark_event("reply: send");
   }
-  session->put();
 }
 
-void Monitor::no_reply(PaxosServiceMessage *req)
+void Monitor::no_reply(MonOpRequestRef op)
 {
-  MonSession *session = static_cast<MonSession*>(req->get_connection()->get_priv());
-  if (!session) {
-    dout(2) << "no_reply no session, dropping non-reply to " << req << " " << *req << dendl;
-    return;
-  }
+  MonSession *session = op->get_session();
+  Message *req = op->get_req();
+
   if (session->proxy_con) {
     if (get_quorum_features() & CEPH_FEATURE_MON_NULLROUTE) {
       dout(10) << "no_reply to " << req->get_source_inst()
 	       << " via " << session->proxy_con->get_peer_addr()
 	       << " for request " << *req << dendl;
       session->proxy_con->send_message(new MRoute(session->proxy_tid, NULL));
+      op->mark_event("no_reply: send routed request");
     } else {
-      dout(10) << "no_reply no quorum nullroute feature for " << req->get_source_inst()
+      dout(10) << "no_reply no quorum nullroute feature for "
+               << req->get_source_inst()
 	       << " via " << session->proxy_con->get_peer_addr()
 	       << " for request " << *req << dendl;
+      op->mark_event("no_reply: no quorum support");
     }
   } else {
-    dout(10) << "no_reply to " << req->get_source_inst() << " " << *req << dendl;
+    dout(10) << "no_reply to " << req->get_source_inst()
+             << " " << *req << dendl;
+    op->mark_event("no_reply");
   }
-  session->put();
 }
 
-void Monitor::handle_route(MRoute *m)
+void Monitor::handle_route(MonOpRequestRef op)
 {
-  MonSession *session = static_cast<MonSession *>(m->get_connection()->get_priv());
+  MRoute *m = static_cast<MRoute*>(op->get_req());
+  MonSession *session = op->get_session();
   //check privileges
-  if (session && !session->is_capable("mon", MON_CAP_X)) {
+  if (!session->is_capable("mon", MON_CAP_X)) {
     dout(0) << "MRoute received from entity without appropriate perms! "
 	    << dendl;
-    session->put();
-    m->put();
     return;
   }
   if (m->msg)
@@ -3178,9 +3284,6 @@ void Monitor::handle_route(MRoute *m)
       m->msg = NULL;
     }
   }
-  m->put();
-  if (session)
-    session->put();
 }
 
 void Monitor::resend_routed_requests()
@@ -3193,18 +3296,19 @@ void Monitor::resend_routed_requests()
        ++p) {
     RoutedRequest *rr = p->second;
 
-    bufferlist::iterator q = rr->request_bl.begin();
-    PaxosServiceMessage *req = (PaxosServiceMessage *)decode_message(cct, 0, q);
-
     if (mon == rank) {
-      dout(10) << " requeue for self tid " << rr->tid << " " << *req << dendl;
-      req->set_connection(rr->con);
-      retry.push_back(new C_RetryMessage(this, req));
+      dout(10) << " requeue for self tid " << rr->tid << dendl;
+      rr->op->mark_event("retry routed request");
+      retry.push_back(new C_RetryMessage(this, rr->op));
       delete rr;
     } else {
+      bufferlist::iterator q = rr->request_bl.begin();
+      PaxosServiceMessage *req = (PaxosServiceMessage *)decode_message(cct, 0, q);
+      rr->op->mark_event("resend forwarded message to leader");
       dout(10) << " resend to mon." << mon << " tid " << rr->tid << " " << *req << dendl;
       MForward *forward = new MForward(rr->tid, req, rr->con_features,
 				       rr->session->caps);
+      req->put();  // forward takes its own ref; drop ours.
       forward->client = rr->client_inst;
       forward->set_priority(req->get_priority());
       messenger->send_message(forward, monmap->get_inst(mon));
@@ -3219,6 +3323,7 @@ void Monitor::resend_routed_requests()
 void Monitor::remove_session(MonSession *s)
 {
   dout(10) << "remove_session " << s << " " << s->inst << dendl;
+  assert(s->con);
   assert(!s->closed);
   for (set<uint64_t>::iterator p = s->routed_request_tids.begin();
        p != s->routed_request_tids.end();
@@ -3241,9 +3346,11 @@ void Monitor::remove_all_sessions()
   while (!session_map.sessions.empty()) {
     MonSession *s = session_map.sessions.front();
     remove_session(s);
-    logger->inc(l_mon_session_rm);
+    if (logger)
+      logger->inc(l_mon_session_rm);
   }
-  logger->set(l_mon_num_sessions, session_map.get_size());
+  if (logger)
+    logger->set(l_mon_num_sessions, session_map.get_size());
 }
 
 void Monitor::send_command(const entity_inst_t& inst,
@@ -3255,7 +3362,7 @@ void Monitor::send_command(const entity_inst_t& inst,
   try_send_message(c, inst);
 }
 
-void Monitor::waitlist_or_zap_client(Message *m)
+void Monitor::waitlist_or_zap_client(MonOpRequestRef op)
 {
   /**
    * Wait list the new session until we're in the quorum, assuming it's
@@ -3270,17 +3377,24 @@ void Monitor::waitlist_or_zap_client(Message *m)
    * 3) command messages. We want to accept these under all possible
    * circumstances.
    */
-  ConnectionRef con = m->get_connection();
+  Message *m = op->get_req();
+  MonSession *s = op->get_session();
+  ConnectionRef con = op->get_connection();
   utime_t too_old = ceph_clock_now(g_ceph_context);
   too_old -= g_ceph_context->_conf->mon_lease;
   if (m->get_recv_stamp() > too_old &&
       con->is_connected()) {
     dout(5) << "waitlisting message " << *m << dendl;
-    maybe_wait_for_quorum.push_back(new C_RetryMessage(this, m));
+    maybe_wait_for_quorum.push_back(new C_RetryMessage(this, op));
+    op->mark_wait_for_quorum();
   } else {
     dout(5) << "discarding message " << *m << " and sending client elsewhere" << dendl;
     con->mark_down();
-    m->put();
+    // proxied sessions aren't registered and don't have a con; don't remove
+    // those.
+    if (!s->proxy_con)
+      remove_session(s);
+    op->mark_zap();
   }
 }
 
@@ -3291,26 +3405,12 @@ void Monitor::_ms_dispatch(Message *m)
     return;
   }
 
-  ConnectionRef connection = m->get_connection();
-  MonSession *s = NULL;
-  MonCap caps;
-  bool src_is_mon;
-
-  // regardless of who we are or who the sender is, the message must
-  // have a connection associated.  If it doesn't then something fishy
-  // is going on.
-  assert(connection);
-
-  src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
-
-  bool reuse_caps = false;
-  dout(20) << "have connection" << dendl;
-  s = static_cast<MonSession *>(connection->get_priv());
+  MonOpRequestRef op = op_tracker.create_request<MonOpRequest>(m);
+  bool src_is_mon = op->is_src_mon();
+  op->mark_event("mon:_ms_dispatch");
+  MonSession *s = op->get_session();
   if (s && s->closed) {
-    caps = s->caps;
-    reuse_caps = true;
-    s->put();
-    s = NULL;
+    return;
   }
   if (!s) {
     // if the sender is not a monitor, make sure their first message for a
@@ -3319,47 +3419,38 @@ void Monitor::_ms_dispatch(Message *m)
     // assume that the sender hasn't authenticated yet, so we have no way
     // of assessing whether we should handle it or not.
     if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
-			m->get_type() != CEPH_MSG_MON_GET_MAP)) {
-      if (m->get_type() == CEPH_MSG_PING) {
-        // let it go through and be dispatched immediately!
-        return dispatch(s, m, false);
-      }
+			m->get_type() != CEPH_MSG_MON_GET_MAP &&
+			m->get_type() != CEPH_MSG_PING)) {
       dout(1) << __func__ << " dropping stray message " << *m
 	      << " from " << m->get_source_inst() << dendl;
-      m->put();
       return;
     }
 
-    if (!exited_quorum.is_zero() && !src_is_mon) {
-      waitlist_or_zap_client(m);
-      return;
-    }
-
-    dout(10) << "do not have session, making new one" << dendl;
     s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
+    assert(s);
     m->get_connection()->set_priv(s->get());
-    dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
+    dout(10) << __func__ << " new session " << s << " " << *s << dendl;
+    op->set_session(s);
 
     logger->set(l_mon_num_sessions, session_map.get_size());
     logger->inc(l_mon_session_add);
 
     if (!src_is_mon) {
-      dout(10) << "setting timeout on session" << dendl;
-      // set an initial timeout here, so we will trim this session even if they don't
-      // do anything.
+      dout(30) << __func__ << "  setting timeout on session" << dendl;
+      // set an initial timeout here, so we will trim this session
+      // even if they don't do anything.
       s->until = ceph_clock_now(g_ceph_context);
       s->until += g_conf->mon_subscribe_interval;
     } else {
-      //give it monitor caps; the peer type has been authenticated
-      reuse_caps = false;
-      dout(5) << "setting monitor caps on this connection" << dendl;
-      if (!s->caps.is_allow_all()) //but no need to repeatedly copy
+      // give it monitor caps; the peer type has been authenticated
+      dout(5) << __func__ << " setting monitor caps on this connection" << dendl;
+      if (!s->caps.is_allow_all()) // but no need to repeatedly copy
         s->caps = *mon_caps;
     }
-    if (reuse_caps)
-      s->caps = caps;
+    s->put();
   } else {
-    dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+    dout(20) << __func__ << " existing session " << s << " for " << s->inst
+	     << dendl;
   }
 
   assert(s);
@@ -3368,32 +3459,42 @@ void Monitor::_ms_dispatch(Message *m)
   }
   dout(20) << " caps " << s->caps.get_str() << dendl;
 
-  if (is_synchronizing() && !src_is_mon) {
-    waitlist_or_zap_client(m);
-    return;
+  if ((is_synchronizing() ||
+       (s->global_id == 0 && !exited_quorum.is_zero())) &&
+      !src_is_mon &&
+      m->get_type() != CEPH_MSG_PING) {
+    waitlist_or_zap_client(op);
+  } else {
+    dispatch_op(op);
   }
-
-  dispatch(s, m, src_is_mon);
-  s->put();
   return;
 }
 
-void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
+void Monitor::dispatch_op(MonOpRequestRef op)
 {
-  assert(m != NULL);
+  op->mark_event("mon:dispatch_op");
+  MonSession *s = op->get_session();
+  assert(s);
+  if (s->closed) {
+    dout(10) << " session closed, dropping " << op->get_req() << dendl;
+    return;
+  }
 
+  /* we will consider the default type as being 'monitor' until proven wrong */
+  op->set_type_monitor();
   /* deal with all messages that do not necessarily need caps */
   bool dealt_with = true;
-  switch (m->get_type()) {
+  switch (op->get_req()->get_type()) {
     // auth
     case MSG_MON_GLOBAL_ID:
     case CEPH_MSG_AUTH:
+      op->set_type_service();
       /* no need to check caps here */
-      paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m);
+      paxos_service[PAXOS_AUTH]->dispatch(op);
       break;
 
     case CEPH_MSG_PING:
-      handle_ping(static_cast<MPing*>(m));
+      handle_ping(op);
       break;
 
     /* MMonGetMap may be used by clients to obtain a monmap *before*
@@ -3404,9 +3505,12 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
      * not authenticate when obtaining a monmap.
      */
     case CEPH_MSG_MON_GET_MAP:
-      handle_mon_get_map(static_cast<MMonGetMap*>(m));
+      handle_mon_get_map(op);
       break;
 
+    case CEPH_MSG_MON_METADATA:
+      return handle_mon_metadata(op);
+
     default:
       dealt_with = false;
       break;
@@ -3414,9 +3518,11 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
   if (dealt_with)
     return;
 
+  /* well, maybe the op belongs to a service... */
+  op->set_type_service();
   /* deal with all messages which caps should be checked somewhere else */
   dealt_with = true;
-  switch (m->get_type()) {
+  switch (op->get_req()->get_type()) {
 
     // OSDs
     case CEPH_MSG_MON_GET_OSDMAP:
@@ -3426,13 +3532,13 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
     case MSG_OSD_ALIVE:
     case MSG_OSD_PGTEMP:
     case MSG_REMOVE_SNAPS:
-      paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
+      paxos_service[PAXOS_OSDMAP]->dispatch(op);
       break;
 
     // MDSs
     case MSG_MDS_BEACON:
     case MSG_MDS_OFFLOAD_TARGETS:
-      paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m);
+      paxos_service[PAXOS_MDSMAP]->dispatch(op);
       break;
 
 
@@ -3440,21 +3546,22 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
     case CEPH_MSG_STATFS:
     case MSG_PGSTATS:
     case MSG_GETPOOLSTATS:
-      paxos_service[PAXOS_PGMAP]->dispatch((PaxosServiceMessage*)m);
+      paxos_service[PAXOS_PGMAP]->dispatch(op);
       break;
 
     case CEPH_MSG_POOLOP:
-      paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
+      paxos_service[PAXOS_OSDMAP]->dispatch(op);
       break;
 
     // log
     case MSG_LOG:
-      paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m);
+      paxos_service[PAXOS_LOG]->dispatch(op);
       break;
 
     // handle_command() does its own caps checking
     case MSG_MON_COMMAND:
-      handle_command(static_cast<MMonCommand*>(m));
+      op->set_type_command();
+      handle_command(op);
       break;
 
     default:
@@ -3464,27 +3571,30 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
   if (dealt_with)
     return;
 
+  /* nop, looks like it's not a service message; revert back to monitor */
+  op->set_type_monitor();
+
   /* messages we, the Monitor class, need to deal with
    * but may be sent by clients. */
 
-  if (!s->is_capable("mon", MON_CAP_R)) {
-    dout(5) << __func__ << " " << m->get_source_inst()
-            << " not enough caps for " << *m << " -- dropping"
+  if (!op->get_session()->is_capable("mon", MON_CAP_R)) {
+    dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+            << " not enough caps for " << *(op->get_req()) << " -- dropping"
             << dendl;
     goto drop;
   }
 
   dealt_with = true;
-  switch (m->get_type()) {
+  switch (op->get_req()->get_type()) {
 
     // misc
     case CEPH_MSG_MON_GET_VERSION:
-      handle_get_version(static_cast<MMonGetVersion*>(m));
+      handle_get_version(op);
       break;
 
     case CEPH_MSG_MON_SUBSCRIBE:
       /* FIXME: check what's being subscribed, filter accordingly */
-      handle_subscribe(static_cast<MMonSubscribe*>(m));
+      handle_subscribe(op);
       break;
 
     default:
@@ -3494,53 +3604,53 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
   if (dealt_with)
     return;
 
-  if (!src_is_mon) {
+  if (!op->is_src_mon()) {
     dout(1) << __func__ << " unexpected monitor message from"
-            << " non-monitor entity " << m->get_source_inst()
-            << " " << *m << " -- dropping" << dendl;
+            << " non-monitor entity " << op->get_req()->get_source_inst()
+            << " " << *(op->get_req()) << " -- dropping" << dendl;
     goto drop;
   }
 
   /* messages that should only be sent by another monitor */
   dealt_with = true;
-  switch (m->get_type()) {
+  switch (op->get_req()->get_type()) {
 
     case MSG_ROUTE:
-      handle_route(static_cast<MRoute*>(m));
+      handle_route(op);
       break;
 
     case MSG_MON_PROBE:
-      handle_probe(static_cast<MMonProbe*>(m));
+      handle_probe(op);
       break;
 
     // Sync (i.e., the new slurp, but on steroids)
     case MSG_MON_SYNC:
-      handle_sync(static_cast<MMonSync*>(m));
+      handle_sync(op);
       break;
     case MSG_MON_SCRUB:
-      handle_scrub(static_cast<MMonScrub*>(m));
+      handle_scrub(op);
       break;
 
     /* log acks are sent from a monitor we sent the MLog to, and are
        never sent by clients to us. */
     case MSG_LOGACK:
-      log_client.handle_log_ack((MLogAck*)m);
-      m->put();
+      log_client.handle_log_ack((MLogAck*)op->get_req());
       break;
 
     // monmap
     case MSG_MON_JOIN:
-      paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m);
+      op->set_type_service();
+      paxos_service[PAXOS_MONMAP]->dispatch(op);
       break;
 
     // paxos
     case MSG_MON_PAXOS:
       {
-        MMonPaxos *pm = static_cast<MMonPaxos*>(m);
-        if (!src_is_mon ||
-            !s->is_capable("mon", MON_CAP_X)) {
+        op->set_type_paxos();
+        MMonPaxos *pm = static_cast<MMonPaxos*>(op->get_req());
+        if (!op->is_src_mon() ||
+            !op->get_session()->is_capable("mon", MON_CAP_X)) {
           //can't send these!
-          pm->put();
           break;
         }
 
@@ -3549,52 +3659,46 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
           // good, thus just drop them and ignore them.
           dout(10) << __func__ << " ignore paxos msg from "
             << pm->get_source_inst() << dendl;
-          pm->put();
           break;
         }
 
         // sanitize
         if (pm->epoch > get_epoch()) {
           bootstrap();
-          pm->put();
           break;
         }
         if (pm->epoch != get_epoch()) {
-          pm->put();
           break;
         }
 
-        paxos->dispatch((PaxosServiceMessage*)m);
+        paxos->dispatch(op);
       }
       break;
 
     // elector messages
     case MSG_MON_ELECTION:
+      op->set_type_election();
       //check privileges here for simplicity
-      if (s &&
-          !s->is_capable("mon", MON_CAP_X)) {
+      if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
         dout(0) << "MMonElection received from entity without enough caps!"
-          << s->caps << dendl;
-        m->put();
+          << op->get_session()->caps << dendl;
         break;
       }
       if (!is_probing() && !is_synchronizing()) {
-        elector.dispatch(m);
-      } else {
-        m->put();
+        elector.dispatch(op);
       }
       break;
 
     case MSG_FORWARD:
-      handle_forward(static_cast<MForward *>(m));
+      handle_forward(op);
       break;
 
     case MSG_TIMECHECK:
-      handle_timecheck(static_cast<MTimeCheck *>(m));
+      handle_timecheck(op);
       break;
 
     case MSG_MON_HEALTH:
-      health_monitor->dispatch(static_cast<MMonHealth *>(m));
+      health_monitor->dispatch(op);
       break;
 
     default:
@@ -3602,17 +3706,18 @@ void Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
       break;
   }
   if (!dealt_with) {
-    dout(1) << "dropping unexpected " << *m << dendl;
+    dout(1) << "dropping unexpected " << *(op->get_req()) << dendl;
     goto drop;
   }
   return;
 
 drop:
-  m->put();
+  return;
 }
 
-void Monitor::handle_ping(MPing *m)
+void Monitor::handle_ping(MonOpRequestRef op)
 {
+  MPing *m = static_cast<MPing*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   MPing *reply = new MPing;
   entity_inst_t inst = m->get_source_inst();
@@ -3634,7 +3739,6 @@ void Monitor::handle_ping(MPing *m)
   reply->set_payload(payload);
   dout(10) << __func__ << " reply payload len " << reply->get_payload().length() << dendl;
   messenger->send_message(reply, inst);
-  m->put();
 }
 
 void Monitor::timecheck_start()
@@ -3824,8 +3928,9 @@ health_status_t Monitor::timecheck_status(ostringstream &ss,
   return status;
 }
 
-void Monitor::handle_timecheck_leader(MTimeCheck *m)
+void Monitor::handle_timecheck_leader(MonOpRequestRef op)
 {
+  MTimeCheck *m = static_cast<MTimeCheck*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   /* handles PONG's */
   assert(m->op == MTimeCheck::OP_PONG);
@@ -3946,8 +4051,9 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m)
   }
 }
 
-void Monitor::handle_timecheck_peon(MTimeCheck *m)
+void Monitor::handle_timecheck_peon(MonOpRequestRef op)
 {
+  MTimeCheck *m = static_cast<MTimeCheck*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
 
   assert(is_peon());
@@ -3987,40 +4093,37 @@ void Monitor::handle_timecheck_peon(MTimeCheck *m)
   m->get_connection()->send_message(reply);
 }
 
-void Monitor::handle_timecheck(MTimeCheck *m)
+void Monitor::handle_timecheck(MonOpRequestRef op)
 {
+  MTimeCheck *m = static_cast<MTimeCheck*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
 
   if (is_leader()) {
     if (m->op != MTimeCheck::OP_PONG) {
       dout(1) << __func__ << " drop unexpected msg (not pong)" << dendl;
     } else {
-      handle_timecheck_leader(m);
+      handle_timecheck_leader(op);
     }
   } else if (is_peon()) {
     if (m->op != MTimeCheck::OP_PING && m->op != MTimeCheck::OP_REPORT) {
       dout(1) << __func__ << " drop unexpected msg (not ping or report)" << dendl;
     } else {
-      handle_timecheck_peon(m);
+      handle_timecheck_peon(op);
     }
   } else {
     dout(1) << __func__ << " drop unexpected msg" << dendl;
   }
-  m->put();
 }
 
-void Monitor::handle_subscribe(MMonSubscribe *m)
+void Monitor::handle_subscribe(MonOpRequestRef op)
 {
+  MMonSubscribe *m = static_cast<MMonSubscribe*>(op->get_req());
   dout(10) << "handle_subscribe " << *m << dendl;
   
   bool reply = false;
 
-  MonSession *s = static_cast<MonSession *>(m->get_connection()->get_priv());
-  if (!s) {
-    dout(10) << " no session, dropping" << dendl;
-    m->put();
-    return;
-  }
+  MonSession *s = op->get_session();
+  assert(s);
 
   s->until = ceph_clock_now(g_ceph_context);
   s->until += g_conf->mon_subscribe_interval;
@@ -4041,6 +4144,10 @@ void Monitor::handle_subscribe(MMonSubscribe *m)
       }
     } else if (p->first == "osdmap") {
       if ((int)s->is_capable("osd", MON_CAP_R)) {
+	if (s->osd_epoch > p->second.start) {
+	  // client needs earlier osdmaps on purpose, so reset the sent epoch
+	  s->osd_epoch = 0;
+	}
         osdmon()->check_sub(s->sub_map["osdmap"]);
       }
     } else if (p->first == "osd_pg_creates") {
@@ -4059,25 +4166,20 @@ void Monitor::handle_subscribe(MMonSubscribe *m)
   if (reply)
     m->get_connection()->send_message(new MMonSubscribeAck(monmap->get_fsid(), (int)g_conf->mon_subscribe_interval));
 
-  s->put();
-  m->put();
 }
 
-void Monitor::handle_get_version(MMonGetVersion *m)
+void Monitor::handle_get_version(MonOpRequestRef op)
 {
+  MMonGetVersion *m = static_cast<MMonGetVersion*>(op->get_req());
   dout(10) << "handle_get_version " << *m << dendl;
   PaxosService *svc = NULL;
 
-  MonSession *s = static_cast<MonSession *>(m->get_connection()->get_priv());
-  if (!s) {
-    dout(10) << " no session, dropping" << dendl;
-    m->put();
-    return;
-  }
+  MonSession *s = op->get_session();
+  assert(s);
 
   if (!is_leader() && !is_peon()) {
     dout(10) << " waiting for quorum" << dendl;
-    waitfor_quorum.push_back(new C_RetryMessage(this, m));
+    waitfor_quorum.push_back(new C_RetryMessage(this, op));
     goto out;
   }
 
@@ -4093,7 +4195,7 @@ void Monitor::handle_get_version(MMonGetVersion *m)
 
   if (svc) {
     if (!svc->is_readable()) {
-      svc->wait_for_readable(new C_RetryMessage(this, m));
+      svc->wait_for_readable(op, new C_RetryMessage(this, op));
       goto out;
     }
 
@@ -4105,11 +4207,8 @@ void Monitor::handle_get_version(MMonGetVersion *m)
 
     m->get_connection()->send_message(reply);
   }
-
-  m->put();
-
  out:
-  s->put();
+  return;
 }
 
 bool Monitor::ms_handle_reset(Connection *con)
@@ -4174,19 +4273,100 @@ void Monitor::send_latest_monmap(Connection *con)
   con->send_message(new MMonMap(bl));
 }
 
-void Monitor::handle_mon_get_map(MMonGetMap *m)
+void Monitor::handle_mon_get_map(MonOpRequestRef op)
 {
+  MMonGetMap *m = static_cast<MMonGetMap*>(op->get_req());
   dout(10) << "handle_mon_get_map" << dendl;
   send_latest_monmap(m->get_connection().get());
-  m->put();
 }
 
+void Monitor::handle_mon_metadata(MonOpRequestRef op)
+{
+  MMonMetadata *m = static_cast<MMonMetadata*>(op->get_req());
+  if (is_leader()) {
+    dout(10) << __func__ << dendl;
+    update_mon_metadata(m->get_source().num(), m->data);
+  }
+}
+
+void Monitor::update_mon_metadata(int from, const Metadata& m)
+{
+  metadata[from] = m;
+
+  bufferlist bl;
+  int err = store->get(MONITOR_STORE_PREFIX, "last_metadata", bl);
+  map<int, Metadata> last_metadata;
+  if (!err) {
+    bufferlist::iterator iter = bl.begin();
+    ::decode(last_metadata, iter);
+    metadata.insert(last_metadata.begin(), last_metadata.end());
+  }
+
+  MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
+  bl.clear();
+  ::encode(metadata, bl);
+  t->put(MONITOR_STORE_PREFIX, "last_metadata", bl);
+  paxos->trigger_propose();
+}
+
+int Monitor::load_metadata(map<int, Metadata>& metadata)
+{
+  bufferlist bl;
+  int r = store->get(MONITOR_STORE_PREFIX, "last_metadata", bl);
+  if (r)
+    return r;
+  bufferlist::iterator it = bl.begin();
+  ::decode(metadata, it);
+  return 0;
+}
+
+int Monitor::get_mon_metadata(int mon, Formatter *f, ostream& err)
+{
+  assert(f);
+  map<int, Metadata> last_metadata;
+  if (int r = load_metadata(last_metadata)) {
+    err << "Unable to load metadata";
+    return r;
+  }
+  if (!last_metadata.count(mon)) {
+    err << "mon." << mon << " not found";
+    return -EINVAL;
+  }
+  const Metadata& m = last_metadata[mon];
+  for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
+    f->dump_string(p->first.c_str(), p->second);
+  }
+  return 0;
+}
+
+int Monitor::print_nodes(Formatter *f, ostream& err)
+{
+  map<int, Metadata> metadata;
+  if (int r = load_metadata(metadata)) {
+    err << "Unable to load metadata.\n";
+    return r;
+  }
+
+  map<string, list<int> > mons;	// hostname => mon
+  for (map<int, Metadata>::iterator it = metadata.begin();
+       it != metadata.end(); ++it) {
+    const Metadata& m = it->second;
+    Metadata::const_iterator hostname = m.find("hostname");
+    if (hostname == m.end()) {
+      // not likely though
+      continue;
+    }
+    mons[hostname->second].push_back(it->first);
+  }
 
+  dump_services(f, mons, "mon");
+  return 0;
+}
 
 // ----------------------------------------------
 // scrub
 
-int Monitor::scrub()
+int Monitor::scrub_start()
 {
   dout(10) << __func__ << dendl;
   assert(is_leader());
@@ -4201,39 +4381,80 @@ int Monitor::scrub()
     return -EBUSY;
   }
 
+  scrub_event_cancel();
   scrub_result.clear();
+  scrub_state.reset(new ScrubState);
+
+  scrub();
+  return 0;
+}
+
+int Monitor::scrub()
+{
+  assert(is_leader());
+  assert(scrub_state);
+
+  scrub_cancel_timeout();
+  wait_for_paxos_write();
   scrub_version = paxos->get_version();
 
+
+  // scrub all keys if we're the only monitor in the quorum
+  int32_t num_keys =
+    (quorum.size() == 1 ? -1 : cct->_conf->mon_scrub_max_keys);
+
   for (set<int>::iterator p = quorum.begin();
        p != quorum.end();
        ++p) {
     if (*p == rank)
       continue;
-    MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version);
+    MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version,
+                                 num_keys);
+    r->key = scrub_state->last_key;
     messenger->send_message(r, monmap->get_inst(*p));
   }
 
   // scrub my keys
-  _scrub(&scrub_result[rank]);
+  bool r = _scrub(&scrub_result[rank],
+                  &scrub_state->last_key,
+                  &num_keys);
 
-  if (scrub_result.size() == quorum.size())
-    scrub_finish();
+  scrub_state->finished = !r;
+
+  // only after we got our scrub results do we really care whether the
+  // other monitors are late on their results.  Also, this way we avoid
+  // triggering the timeout if we end up getting stuck in _scrub() for
+  // longer than the duration of the timeout.
+  scrub_reset_timeout();
 
+  if (quorum.size() == 1) {
+    assert(scrub_state->finished == true);
+    scrub_finish();
+  }
   return 0;
 }
 
-void Monitor::handle_scrub(MMonScrub *m)
+void Monitor::handle_scrub(MonOpRequestRef op)
 {
+  MMonScrub *m = static_cast<MMonScrub*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   switch (m->op) {
   case MMonScrub::OP_SCRUB:
     {
       if (!is_peon())
 	break;
+
+      wait_for_paxos_write();
+
       if (m->version != paxos->get_version())
 	break;
-      MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, m->version);
-      _scrub(&reply->result);
+
+      MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT,
+                                       m->version,
+                                       m->num_keys);
+
+      reply->key = m->key;
+      _scrub(&reply->result, &reply->key, &reply->num_keys);
       m->get_connection()->send_message(reply);
     }
     break;
@@ -4244,41 +4465,92 @@ void Monitor::handle_scrub(MMonScrub *m)
 	break;
       if (m->version != scrub_version)
 	break;
+      // reset the timeout each time we get a result
+      scrub_reset_timeout();
+
       int from = m->get_source().num();
       assert(scrub_result.count(from) == 0);
       scrub_result[from] = m->result;
 
-      if (scrub_result.size() == quorum.size())
-	scrub_finish();
+      if (scrub_result.size() == quorum.size()) {
+        scrub_check_results();
+        scrub_result.clear();
+        if (scrub_state->finished)
+          scrub_finish();
+        else
+          scrub();
+      }
     }
     break;
   }
-  m->put();
 }
 
-void Monitor::_scrub(ScrubResult *r)
+bool Monitor::_scrub(ScrubResult *r,
+                     pair<string,string> *start,
+                     int *num_keys)
 {
+  assert(r != NULL);
+  assert(start != NULL);
+  assert(num_keys != NULL);
+
   set<string> prefixes = get_sync_targets_names();
   prefixes.erase("paxos");  // exclude paxos, as this one may have extra states for proposals, etc.
 
-  dout(10) << __func__ << " prefixes " << prefixes << dendl;
+  dout(10) << __func__ << " start (" << *start << ")"
+           << " num_keys " << *num_keys << dendl;
+
+  MonitorDBStore::Synchronizer it = store->get_synchronizer(*start, prefixes);
+
+  int scrubbed_keys = 0;
+  pair<string,string> last_key;
+
+  while (it->has_next_chunk()) {
+
+    if (*num_keys > 0 && scrubbed_keys == *num_keys)
+      break;
+
+    pair<string,string> k = it->get_next_key();
+    if (prefixes.count(k.first) == 0)
+      continue;
 
-  pair<string,string> start;
-  MonitorDBStore::Synchronizer synchronizer = store->get_synchronizer(start, prefixes);
+    if (cct->_conf->mon_scrub_inject_missing_keys > 0.0 &&
+        (rand() % 10000 < cct->_conf->mon_scrub_inject_missing_keys*10000.0)) {
+      dout(10) << __func__ << " inject missing key, skipping (" << k << ")"
+               << dendl;
+      continue;
+    }
 
-  while (synchronizer->has_next_chunk()) {
-    pair<string,string> k = synchronizer->get_next_key();
     bufferlist bl;
     store->get(k.first, k.second, bl);
-    dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes crc " << bl.crc32c(0) << dendl;
+    uint32_t key_crc = bl.crc32c(0);
+    dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes"
+                                     << " crc " << key_crc << dendl;
     r->prefix_keys[k.first]++;
     if (r->prefix_crc.count(k.first) == 0)
       r->prefix_crc[k.first] = 0;
     r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]);
+
+    if (cct->_conf->mon_scrub_inject_crc_mismatch > 0.0 &&
+        (rand() % 10000 < cct->_conf->mon_scrub_inject_crc_mismatch*10000.0)) {
+      dout(10) << __func__ << " inject failure at (" << k << ")" << dendl;
+      r->prefix_crc[k.first] += 1;
+    }
+
+    ++scrubbed_keys;
+    last_key = k;
   }
+
+  dout(20) << __func__ << " last_key (" << last_key << ")"
+                       << " scrubbed_keys " << scrubbed_keys
+                       << " has_next " << it->has_next_chunk() << dendl;
+
+  *start = last_key;
+  *num_keys = scrubbed_keys;
+
+  return it->has_next_chunk();
 }
 
-void Monitor::scrub_finish()
+void Monitor::scrub_check_results()
 {
   dout(10) << __func__ << dendl;
 
@@ -4299,18 +4571,91 @@ void Monitor::scrub_finish()
   }
   if (!errors)
     clog->info() << "scrub ok on " << quorum << ": " << mine << "\n";
+}
 
+inline void Monitor::scrub_timeout()
+{
+  dout(1) << __func__ << " restarting scrub" << dendl;
   scrub_reset();
+  scrub_start();
+}
+
+void Monitor::scrub_finish()
+{
+  dout(10) << __func__ << dendl;
+  scrub_reset();
+  scrub_event_start();
 }
 
 void Monitor::scrub_reset()
 {
   dout(10) << __func__ << dendl;
+  scrub_cancel_timeout();
   scrub_version = 0;
   scrub_result.clear();
+  scrub_state.reset();
+}
+
+inline void Monitor::scrub_update_interval(int secs)
+{
+  // we don't care about changes if we are not the leader.
+  // changes will be visible if we become the leader.
+  if (!is_leader())
+    return;
+
+  dout(1) << __func__ << " new interval = " << secs << dendl;
+
+  // if scrub already in progress, all changes will already be visible during
+  // the next round.  Nothing to do.
+  if (scrub_state != NULL)
+    return;
+
+  scrub_event_cancel();
+  scrub_event_start();
 }
 
+void Monitor::scrub_event_start()
+{
+  dout(10) << __func__ << dendl;
 
+  if (scrub_event)
+    scrub_event_cancel();
+
+  if (cct->_conf->mon_scrub_interval <= 0) {
+    dout(1) << __func__ << " scrub event is disabled"
+            << " (mon_scrub_interval = " << cct->_conf->mon_scrub_interval
+            << ")" << dendl;
+    return;
+  }
+
+  scrub_event = new C_Scrub(this);
+  timer.add_event_after(cct->_conf->mon_scrub_interval, scrub_event);
+}
+
+void Monitor::scrub_event_cancel()
+{
+  dout(10) << __func__ << dendl;
+  if (scrub_event) {
+    timer.cancel_event(scrub_event);
+    scrub_event = NULL;
+  }
+}
+
+inline void Monitor::scrub_cancel_timeout()
+{
+  if (scrub_timeout_event) {
+    timer.cancel_event(scrub_timeout_event);
+    scrub_timeout_event = NULL;
+  }
+}
+
+void Monitor::scrub_reset_timeout()
+{
+  dout(15) << __func__ << " reset timeout event" << dendl;
+  scrub_cancel_timeout();
+  scrub_timeout_event = new C_ScrubTimeout(this);
+  timer.add_event_after(g_conf->mon_scrub_timeout, scrub_timeout_event);
+}
 
 /************ TICK ***************/
 
@@ -4494,8 +4839,11 @@ int Monitor::mkfs(bufferlist& osdmapbl)
   if (is_keyring_required()) {
     KeyRing keyring;
     string keyring_filename;
-    if (!ceph_resolve_file_search(g_conf->keyring, keyring_filename)) {
-      derr << "unable to find a keyring file on " << g_conf->keyring << dendl;
+
+    r = ceph_resolve_file_search(g_conf->keyring, keyring_filename);
+    if (r) {
+      derr << "unable to find a keyring file on " << g_conf->keyring
+	   << ": " << cpp_strerror(r) << dendl;
       if (g_conf->key != "") {
 	string keyring_plaintext = "[mon.]\n\tkey = " + g_conf->key +
 	  "\n\tcaps mon = \"allow *\"\n";
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 0d3978a..9c2ced0 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -47,7 +47,6 @@
 
 #include "messages/MMonCommand.h"
 #include "messages/MPing.h"
-#include "mon/MonitorStore.h"
 #include "mon/MonitorDBStore.h"
 
 #include <memory>
@@ -55,6 +54,9 @@
 #include "include/str_map.h"
 #include <errno.h>
 
+#include "common/TrackedOp.h"
+#include "mon/MonOpRequest.h"
+
 
 #define CEPH_MON_PROTOCOL     13 /* cluster internal */
 
@@ -108,6 +110,7 @@ class AdminSocketHook;
 
 class MMonGetMap;
 class MMonGetVersion;
+class MMonMetadata;
 class MMonSync;
 class MMonScrub;
 class MMonProbe;
@@ -232,7 +235,7 @@ private:
   set<string> outside_quorum;
 
   /**
-   * @defgroup scrub
+   * @defgroup Monitor_h_scrub
    * @{
    */
   version_t scrub_version;            ///< paxos version we are scrubbing
@@ -243,14 +246,50 @@ private:
    *
    * Verify all mons are storing identical content
    */
+  int scrub_start();
   int scrub();
-  void handle_scrub(MMonScrub *m);
-  void _scrub(ScrubResult *r);
+  void handle_scrub(MonOpRequestRef op);
+  bool _scrub(ScrubResult *r,
+              pair<string,string> *start,
+              int *num_keys);
+  void scrub_check_results();
+  void scrub_timeout();
   void scrub_finish();
   void scrub_reset();
+  void scrub_update_interval(int secs);
+
+  struct C_Scrub : public Context {
+    Monitor *mon;
+    C_Scrub(Monitor *m) : mon(m) { }
+    void finish(int r) {
+      mon->scrub_start();
+    }
+  };
+  struct C_ScrubTimeout : public Context {
+    Monitor *mon;
+    C_ScrubTimeout(Monitor *m) : mon(m) { }
+    void finish(int r) {
+      mon->scrub_timeout();
+    }
+  };
+  Context *scrub_event;       ///< periodic event to trigger scrub (leader)
+  Context *scrub_timeout_event;  ///< scrub round timeout (leader)
+  void scrub_event_start();
+  void scrub_event_cancel();
+  void scrub_reset_timeout();
+  void scrub_cancel_timeout();
+
+  struct ScrubState {
+    pair<string,string> last_key; ///< last scrubbed key
+    bool finished;
+
+    ScrubState() : finished(false) { }
+    virtual ~ScrubState() { }
+  };
+  ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
 
   /**
-   * @defgroup Synchronization
+   * @defgroup Monitor_h_sync Synchronization
    * @{
    */
   /**
@@ -411,18 +450,18 @@ private:
    *
    * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
    */
-  void handle_sync(MMonSync *m);
+  void handle_sync(MonOpRequestRef op);
 
-  void _sync_reply_no_cookie(MMonSync *m);
+  void _sync_reply_no_cookie(MonOpRequestRef op);
 
-  void handle_sync_get_cookie(MMonSync *m);
-  void handle_sync_get_chunk(MMonSync *m);
-  void handle_sync_finish(MMonSync *m);
+  void handle_sync_get_cookie(MonOpRequestRef op);
+  void handle_sync_get_chunk(MonOpRequestRef op);
+  void handle_sync_finish(MonOpRequestRef op);
 
-  void handle_sync_cookie(MMonSync *m);
-  void handle_sync_forward(MMonSync *m);
-  void handle_sync_chunk(MMonSync *m);
-  void handle_sync_no_cookie(MMonSync *m);
+  void handle_sync_cookie(MonOpRequestRef op);
+  void handle_sync_forward(MonOpRequestRef op);
+  void handle_sync_chunk(MonOpRequestRef op);
+  void handle_sync_no_cookie(MonOpRequestRef op);
 
   /**
    * @} // Synchronization
@@ -487,9 +526,9 @@ private:
   health_status_t timecheck_status(ostringstream &ss,
                                    const double skew_bound,
                                    const double latency);
-  void handle_timecheck_leader(MTimeCheck *m);
-  void handle_timecheck_peon(MTimeCheck *m);
-  void handle_timecheck(MTimeCheck *m);
+  void handle_timecheck_leader(MonOpRequestRef op);
+  void handle_timecheck_peon(MonOpRequestRef op);
+  void handle_timecheck(MonOpRequestRef op);
   /**
    * @}
    */
@@ -520,7 +559,7 @@ private:
   /**
    * Handle ping messages from others.
    */
-  void handle_ping(MPing *m);
+  void handle_ping(MonOpRequestRef op);
 
   Context *probe_timeout_event;  // for probing
 
@@ -635,9 +674,10 @@ public:
   void send_latest_monmap(Connection *con);
 
   // messages
-  void handle_get_version(MMonGetVersion *m);
-  void handle_subscribe(MMonSubscribe *m);
-  void handle_mon_get_map(MMonGetMap *m);
+  void handle_get_version(MonOpRequestRef op);
+  void handle_subscribe(MonOpRequestRef op);
+  void handle_mon_get_map(MonOpRequestRef op);
+
   static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
                                     map<string,string> &param_str_map);
   static const MonCommand *_get_moncommand(const string &cmd_prefix,
@@ -649,8 +689,13 @@ public:
   void get_mon_status(Formatter *f, ostream& ss);
   void _quorum_status(Formatter *f, ostream& ss);
   bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
-  void handle_command(class MMonCommand *m);
-  void handle_route(MRoute *m);
+  void handle_command(MonOpRequestRef op);
+  void handle_route(MonOpRequestRef op);
+
+  void handle_mon_metadata(MonOpRequestRef op);
+  int get_mon_metadata(int mon, Formatter *f, ostream& err);
+  int print_nodes(Formatter *f, ostream& err);
+  map<int, Metadata> metadata;
 
   /**
    *
@@ -714,11 +759,11 @@ public:
                              Formatter *f);
   void get_cluster_status(stringstream &ss, Formatter *f);
 
-  void reply_command(MMonCommand *m, int rc, const string &rs, version_t version);
-  void reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version);
+  void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
+  void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
 
 
-  void handle_probe(MMonProbe *m);
+  void handle_probe(MonOpRequestRef op);
   /**
    * Handle a Probe Operation, replying with our name, quorum and known versions.
    *
@@ -733,8 +778,8 @@ public:
    *
    * @param m A Probe message, with an operation of type Probe.
    */
-  void handle_probe_probe(MMonProbe *m);
-  void handle_probe_reply(MMonProbe *m);
+  void handle_probe_probe(MonOpRequestRef op);
+  void handle_probe_reply(MonOpRequestRef op);
 
   // request routing
   struct RoutedRequest {
@@ -744,6 +789,7 @@ public:
     ConnectionRef con;
     uint64_t con_features;
     entity_inst_t client_inst;
+    MonOpRequestRef op;
 
     RoutedRequest() : tid(0), session(NULL), con_features(0) {}
     ~RoutedRequest() {
@@ -754,38 +800,39 @@ public:
   uint64_t routed_request_tid;
   map<uint64_t, RoutedRequest*> routed_requests;
   
-  void forward_request_leader(PaxosServiceMessage *req);
-  void handle_forward(MForward *m);
+  void forward_request_leader(MonOpRequestRef op);
+  void handle_forward(MonOpRequestRef op);
   void try_send_message(Message *m, const entity_inst_t& to);
-  void send_reply(PaxosServiceMessage *req, Message *reply);
-  void no_reply(PaxosServiceMessage *req);
+  void send_reply(MonOpRequestRef op, Message *reply);
+  void no_reply(MonOpRequestRef op);
   void resend_routed_requests();
   void remove_session(MonSession *s);
   void remove_all_sessions();
-  void waitlist_or_zap_client(Message *m);
+  void waitlist_or_zap_client(MonOpRequestRef op);
 
   void send_command(const entity_inst_t& inst,
 		    const vector<string>& com);
 
 public:
-  struct C_Command : public Context {
+  struct C_Command : public C_MonOp {
     Monitor *mon;
-    MMonCommand *m;
     int rc;
     string rs;
     bufferlist rdata;
     version_t version;
-    C_Command(Monitor *_mm, MMonCommand *_m, int r, string s, version_t v) :
-      mon(_mm), m(_m), rc(r), rs(s), version(v){}
-    C_Command(Monitor *_mm, MMonCommand *_m, int r, string s, bufferlist rd, version_t v) :
-      mon(_mm), m(_m), rc(r), rs(s), rdata(rd), version(v){}
-    void finish(int r) {
+    C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
+      C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
+    C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
+      C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
+
+    virtual void _finish(int r) {
+      MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
       if (r >= 0) {
         ostringstream ss;
-        if (!m->get_connection()) {
+        if (!op->get_req()->get_connection()) {
           ss << "connection dropped for command ";
         } else {
-          MonSession *s = m->get_session();
+          MonSession *s = op->get_session();
 
           // if client drops we may not have a session to draw information from.
           if (s) {
@@ -798,28 +845,29 @@ public:
         ss << "cmd='" << m->cmd << "': finished";
 
         mon->audit_clog->info() << ss.str();
-	mon->reply_command(m, rc, rs, rdata, version);
+	mon->reply_command(op, rc, rs, rdata, version);
       }
       else if (r == -ECANCELED)
-	m->put();
+        return;
       else if (r == -EAGAIN)
-	mon->_ms_dispatch(m);
+	mon->dispatch_op(op);
       else
 	assert(0 == "bad C_Command return value");
     }
   };
 
  private:
-  class C_RetryMessage : public Context {
+  class C_RetryMessage : public C_MonOp {
     Monitor *mon;
-    Message *msg;
   public:
-    C_RetryMessage(Monitor *m, Message *ms) : mon(m), msg(ms) {}
-    void finish(int r) {
+    C_RetryMessage(Monitor *m, MonOpRequestRef op) :
+      C_MonOp(op), mon(m) { }
+
+    virtual void _finish(int r) {
       if (r == -EAGAIN || r >= 0)
-	mon->_ms_dispatch(msg);
+        mon->dispatch_op(op);
       else if (r == -ECANCELED)
-	msg->put();
+        return;
       else
 	assert(0 == "bad C_RetryMessage return value");
     }
@@ -834,8 +882,7 @@ public:
     lock.Unlock();
     return true;
   }
-  // dissociate message handling from session and connection logic
-  void dispatch(MonSession *s, Message *m, const bool src_is_mon);
+  void dispatch_op(MonOpRequestRef op);
   //mon_caps is used for un-connected messages from monitors
   MonCap * mon_caps;
   bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
@@ -848,6 +895,9 @@ public:
   int write_default_keyring(bufferlist& bl);
   void extract_save_mon_key(KeyRing& keyring);
 
+  void update_mon_metadata(int from, const Metadata& m);
+  int load_metadata(map<int, Metadata>& m);
+
   // features
   static CompatSet get_initial_supported_features();
   static CompatSet get_supported_features();
@@ -857,6 +907,8 @@ public:
   void read_features();
   void write_features(MonitorDBStore::TransactionRef t);
 
+  OpTracker op_tracker;
+
  public:
   Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
 	  Messenger *m, MonMap *map);
@@ -924,6 +976,7 @@ public:
 #define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
 #define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
+#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
 // make sure you add your feature to Monitor::get_supported_features
 
 long parse_pos_long(const char *s, ostream *pss = NULL);
@@ -937,10 +990,11 @@ struct MonCommand {
   uint64_t flags;
 
   // MonCommand flags
-  enum {
-    FLAG_NOFORWARD = (1 << 0),
-  };
-
+  static const uint64_t FLAG_NONE       = 0;
+  static const uint64_t FLAG_NOFORWARD  = 1 << 0;
+  static const uint64_t FLAG_OBSOLETE   = 1 << 1;
+  static const uint64_t FLAG_DEPRECATED = 1 << 2;
+  
   bool has_flag(uint64_t flag) const { return (flags & flag) != 0; }
   void set_flag(uint64_t flag) { flags |= flag; }
   void unset_flag(uint64_t flag) { flags &= ~flag; }
@@ -965,11 +1019,23 @@ struct MonCommand {
     ::decode(availability, bl);
   }
   bool is_compat(const MonCommand* o) const {
-    return cmdstring == o->cmdstring && helpstring == o->helpstring &&
+    return cmdstring == o->cmdstring &&
 	module == o->module && req_perms == o->req_perms &&
 	availability == o->availability;
   }
 
+  bool is_noforward() const {
+    return has_flag(MonCommand::FLAG_NOFORWARD);
+  }
+
+  bool is_obsolete() const {
+    return has_flag(MonCommand::FLAG_OBSOLETE);
+  }
+
+  bool is_deprecated() const {
+    return has_flag(MonCommand::FLAG_DEPRECATED);
+  }
+
   static void encode_array(const MonCommand *cmds, int size, bufferlist &bl) {
     ENCODE_START(2, 1, bl);
     uint16_t s = size;
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index e37b95d..6b1d6c8 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <boost/scoped_ptr.hpp>
 #include <sstream>
+#include <fstream>
 #include "os/KeyValueDB.h"
 
 #include "include/assert.h"
@@ -31,7 +32,10 @@ class MonitorDBStore
 {
   boost::scoped_ptr<KeyValueDB> db;
   bool do_dump;
-  int dump_fd;
+  int dump_fd_binary;
+  std::ofstream dump_fd_json;
+  JSONFormatter dump_fmt;
+  
 
   Finisher io_work;
 
@@ -187,7 +191,7 @@ class MonitorDBStore
       return (size() == 0);
     }
 
-    bool size() {
+    size_t size() const {
       return ops.size();
     }
     uint64_t get_keys() const {
@@ -255,9 +259,15 @@ class MonitorDBStore
     KeyValueDB::Transaction dbt = db->get_transaction();
 
     if (do_dump) {
-      bufferlist bl;
-      t->encode(bl);
-      bl.write_fd(dump_fd);
+      if (!g_conf->mon_debug_dump_json) {
+        bufferlist bl;
+        t->encode(bl);
+        bl.write_fd(dump_fd_binary);
+      } else {
+        t->dump(&dump_fmt, true);
+        dump_fmt.flush(dump_fd_json);
+        dump_fd_json.flush();
+      }
     }
 
     list<pair<string, pair<string,string> > > compact;
@@ -291,6 +301,8 @@ class MonitorDBStore
 	  db->compact_range_async(compact.front().first, compact.front().second.first, compact.front().second.second);
 	compact.pop_front();
       }
+    } else {
+      assert(0 == "failed to write to db");
     }
     return r;
   }
@@ -448,11 +460,15 @@ class MonitorDBStore
 
     virtual pair<string,string> get_next_key() {
       assert(iter->valid());
-      pair<string,string> r = iter->raw_key();
-      do {
-	iter->next();
-      } while (iter->valid() && sync_prefixes.count(iter->raw_key().first) == 0);
-      return r;
+
+      for (; iter->valid(); iter->next()) {
+        pair<string,string> r = iter->raw_key();
+        if (sync_prefixes.count(r.first) > 0) {
+          iter->next();
+          return r;
+        }
+      }
+      return pair<string,string>();
     }
 
     virtual bool _is_valid() {
@@ -564,11 +580,15 @@ class MonitorDBStore
     for (iter = prefixes.begin(); iter != prefixes.end(); ++iter) {
       dbt->rmkeys_by_prefix((*iter));
     }
-    db->submit_transaction_sync(dbt);
+    int r = db->submit_transaction_sync(dbt);
+    assert(r >= 0);
   }
 
   int open(ostream &out) {
-    db->init();
+    if (g_conf->mon_keyvaluedb == "rocksdb")
+      db->init(g_conf->mon_rocksdb_options);
+    else
+      db->init();
     int r = db->open(out);
     if (r < 0)
       return r;
@@ -578,7 +598,10 @@ class MonitorDBStore
   }
 
   int create_and_open(ostream &out) {
-    db->init();
+    if (g_conf->mon_keyvaluedb == "rocksdb")
+      db->init(g_conf->mon_rocksdb_options);
+    else
+      db->init();
     int r = db->create_and_open(out);
     if (r < 0)
       return r;
@@ -608,7 +631,8 @@ class MonitorDBStore
   MonitorDBStore(const string& path)
     : db(0),
       do_dump(false),
-      dump_fd(-1),
+      dump_fd_binary(-1),
+      dump_fmt(true),
       io_work(g_ceph_context, "monstore"),
       is_open(false) {
     string::const_reverse_iterator rit;
@@ -633,21 +657,35 @@ class MonitorDBStore
     db.reset(db_ptr);
 
     if (g_conf->mon_debug_dump_transactions) {
-      do_dump = true;
-      dump_fd = ::open(
-	g_conf->mon_debug_dump_location.c_str(),
-	O_CREAT|O_APPEND|O_WRONLY, 0644);
-      if (!dump_fd) {
-	dump_fd = -errno;
-	derr << "Could not open log file, got "
-	     << cpp_strerror(dump_fd) << dendl;
+      if (!g_conf->mon_debug_dump_json) {
+        dump_fd_binary = ::open(
+          g_conf->mon_debug_dump_location.c_str(),
+          O_CREAT|O_APPEND|O_WRONLY, 0644);
+        if (!dump_fd_binary) {
+          dump_fd_binary = -errno;
+          derr << "Could not open log file, got "
+               << cpp_strerror(dump_fd_binary) << dendl;
+        }
+      } else {
+        dump_fmt.reset();
+        dump_fmt.open_array_section("dump");
+        dump_fd_json.open(g_conf->mon_debug_dump_location.c_str());
       }
+      do_dump = true;
     }
   }
   ~MonitorDBStore() {
     assert(!is_open);
-    if (do_dump)
-      ::close(dump_fd);
+    if (do_dump) {
+      if (!g_conf->mon_debug_dump_json) {
+        ::close(dump_fd_binary);
+      } else {
+        dump_fmt.close_section();
+        dump_fmt.flush(dump_fd_json);
+        dump_fd_json.flush();
+        dump_fd_json.close();
+      }
+    }
   }
 
 };
diff --git a/src/mon/MonitorStore.cc b/src/mon/MonitorStore.cc
deleted file mode 100644
index afaddab..0000000
--- a/src/mon/MonitorStore.cc
+++ /dev/null
@@ -1,499 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
- * Foundation.  See file COPYING.
- * 
- */
-
-#include "MonitorStore.h"
-#include "common/Clock.h"
-#include "common/debug.h"
-#include "common/entity_name.h"
-#include "common/errno.h"
-#include "common/run_cmd.h"
-#include "common/safe_io.h"
-#include "common/config.h"
-#include "common/sync_filesystem.h"
-
-#if defined(__FreeBSD__)
-#include <sys/param.h>
-#endif
-
-#include "include/compat.h"
-
-#define dout_subsys ceph_subsys_mon
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, dir)
-static ostream& _prefix(std::ostream *_dout, const string& dir) {
-  return *_dout << "store(" << dir << ") ";
-}
-
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <unistd.h>
-#include <sstream>
-#include <sys/file.h>
-
-int MonitorStore::mount()
-{
-  char t[1024];
-
-  dout(1) << "mount" << dendl;
-  // verify dir exists
-  DIR *d = ::opendir(dir.c_str());
-  if (!d) {
-    dout(1) << "basedir " << dir << " dne" << dendl;
-    return -ENOENT;
-  }
-  ::closedir(d);
-
-  // open lockfile
-  snprintf(t, sizeof(t), "%s/lock", dir.c_str());
-  lock_fd = ::open(t, O_CREAT|O_RDWR, 0600);
-  if (lock_fd < 0)
-    return -errno;
-  struct flock l;
-  memset(&l, 0, sizeof(l));
-  l.l_type = F_WRLCK;
-  l.l_whence = SEEK_SET;
-  l.l_start = 0;
-  l.l_len = 0;
-  int r = ::fcntl(lock_fd, F_SETLK, &l);
-  if (r < 0) {
-    dout(0) << "failed to lock " << t << ", is another ceph-mon still running?" << dendl;
-    return -errno;
-  }
-
-  if ((!g_conf->chdir.empty()) && (dir[0] != '/')) {
-    // combine it with the cwd, in case fuse screws things up (i.e. fakefuse)
-    string old = dir;
-    char cwd[PATH_MAX];
-    char *p = getcwd(cwd, sizeof(cwd));
-    dir = p;
-    dir += "/";
-    dir += old;
-  }
-  return 0;
-}
-
-int MonitorStore::umount()
-{
-  int close_err = TEMP_FAILURE_RETRY(::close(lock_fd));
-  assert (0 == close_err);
-  return 0;
-}
-
-int MonitorStore::mkfs()
-{
-  int err;
-
-  err = ::mkdir(dir.c_str(), 0700);
-  if (err < 0 && errno != EEXIST) {
-    err = -errno;
-    derr << "MonitorStore::mkfs: unable to create " << dir << ": " << cpp_strerror(err) << dendl;
-    return err;
-  }
-
-  int fd = ::open(dir.c_str(), O_RDONLY);
-  if (fd < 0) {
-    err = -errno;
-    derr << "MonitorStore::mkfs: unable to open " << dir << ": " << cpp_strerror(err) << dendl;
-    return err;
-  }
-  int close_err = TEMP_FAILURE_RETRY(::close(fd));
-  assert (0 == close_err);
-
-  dout(0) << "created monfs at " << dir << " for "
-	  << g_conf->name.get_id() << dendl;
-  return 0;
-}
-
-version_t MonitorStore::get_int(const char *a, const char *b)
-{
-  char fn[1024];
-  if (b)
-    snprintf(fn, sizeof(fn), "%s/%s/%s", dir.c_str(), a, b);
-  else
-    snprintf(fn, sizeof(fn), "%s/%s", dir.c_str(), a);
-  
-  int fd = ::open(fn, O_RDONLY);
-  if (fd < 0) {
-    int err = errno;
-    if (err == ENOENT) {
-      // Non-existent files are treated as containing 0.
-      return 0;
-    }
-    derr << "MonitorStore::get_int: failed to open '" << fn << "': "
-	 << cpp_strerror(err) << dendl;
-    assert(0 == "failed to open");
-    return 0;
-  }
-  
-  char buf[20];
-  memset(buf, 0, sizeof(buf));
-  int r = safe_read(fd, buf, sizeof(buf) - 1);
-  if (r < 0) {
-    derr << "MonitorStore::get_int: failed to read '" << fn << "': "
-	 << cpp_strerror(r) << dendl;
-    int close_err = TEMP_FAILURE_RETRY(::close(fd));
-    assert(0 == close_err);
-    assert(0); // the file exists; so this is a different failure
-    return 0;
-  }
-  int close_err = TEMP_FAILURE_RETRY(::close(fd));
-  assert (0 == close_err);
-  
-  version_t val = atoi(buf);
-  
-  if (b) {
-    dout(15) << "get_int " << a << "/" << b << " = " << val << dendl;
-  } else {
-    dout(15) << "get_int " << a << " = " << val << dendl;
-  }
-  return val;
-}
-
-
-void MonitorStore::put_int(version_t val, const char *a, const char *b)
-{
-  char fn[1024];
-  snprintf(fn, sizeof(fn), "%s/%s", dir.c_str(), a);
-  if (b) {
-    int r = ::mkdir(fn, 0755);
-    if ((r < 0) && (errno != EEXIST)) {
-      int err = -errno;
-      derr << __func__ << " failed to create dir " << fn << ": "
-	   << cpp_strerror(err) << dendl;
-      ceph_abort();
-    }
-    dout(15) << "set_int " << a << "/" << b << " = " << val << dendl;
-    snprintf(fn, sizeof(fn), "%s/%s/%s", dir.c_str(), a, b);
-  } else {
-    dout(15) << "set_int " << a << " = " << val << dendl;
-  }
-  
-  char vs[30];
-  snprintf(vs, sizeof(vs), "%lld\n", (unsigned long long)val);
-
-  char tfn[1024];
-  snprintf(tfn, sizeof(tfn), "%s.new", fn);
-
-  int fd = TEMP_FAILURE_RETRY(::open(tfn, O_WRONLY|O_CREAT|O_TRUNC, 0600));
-  if (fd < 0) {
-    int err = errno;
-    derr << "MonitorStore::put_int: failed to open '" << tfn << "': "
-	 << cpp_strerror(err) << dendl;
-    ceph_abort();
-  }
-  int r = safe_write(fd, vs, strlen(vs));
-  if (r) {
-    derr << "MonitorStore::put_int: failed to write to '" << tfn << "': "
-	 << cpp_strerror(r) << dendl;
-    ceph_abort();
-  }
-  r = ::fsync(fd);
-  if (r) {
-    derr << "Monitor::put_int: failed to fsync fd for '" << tfn << "': "
-	 << cpp_strerror(r) << dendl;
-    ceph_abort();
-  }
-  if (TEMP_FAILURE_RETRY(::close(fd))) {
-    derr << "MonitorStore::put_int: failed to close fd for '" << tfn << "': "
-	 << cpp_strerror(r) << dendl;
-    ceph_abort();
-  }
-  if (::rename(tfn, fn)) {
-    int err = errno;
-    derr << "MonitorStore::put_int: failed to rename '" << tfn << "' to "
-	 << "'" << fn << "': " << cpp_strerror(err) << dendl;
-    ceph_abort();
-  }
-}
-
-// kludge to associate a global version number with each per-machine paxos state
-version_t MonitorStore::get_global_version(const char *a, version_t b)
-{
-  char fn[1024], fn2[1024];
-  snprintf(fn, sizeof(fn), "%s_gv", a);
-  snprintf(fn2, sizeof(fn2), "%llu", (long long unsigned)b);
-  return get_int(fn, fn2);
-}
-
-// ----------------------------------------
-// buffers
-
-bool MonitorStore::exists_bl_ss(const char *a, const char *b)
-{
-  char fn[1024];
-  if (b) {
-    dout(15) << "exists_bl " << a << "/" << b << dendl;
-    snprintf(fn, sizeof(fn), "%s/%s/%s", dir.c_str(), a, b);
-  } else {
-    dout(15) << "exists_bl " << a << dendl;
-    snprintf(fn, sizeof(fn), "%s/%s", dir.c_str(), a);
-  }
-  
-  struct stat st;
-  int r = ::stat(fn, &st);
-  //char buf[80];
-  //dout(15) << "exists_bl stat " << fn << " r=" << r << " " << cpp_strerror(errno) << dendl;
-  if (r) {
-    assert (errno == ENOENT);
-  }
-  return r == 0;
-}
-
-void MonitorStore::erase_ss(const char *a, const char *b)
-{
-  char fn[1024];
-  char dr[1024];
-  snprintf(dr, sizeof(dr), "%s/%s", dir.c_str(), a);
-  if (b) {
-    dout(15) << "erase_ss " << a << "/" << b << dendl;
-    snprintf(fn, sizeof(fn), "%s/%s/%s", dir.c_str(), a, b);
-  } else {
-    dout(15) << "erase_ss " << a << dendl;
-    strcpy(fn, dr);
-  }
-  int r = ::unlink(fn);
-  assert(0 == r || ENOENT == errno); // callers don't check for existence first
-
-  ::rmdir(dr);  // sloppy attempt to clean up empty dirs
-}
-
-int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b)
-{
-  char fn[1024];
-  if (b) {
-    snprintf(fn, sizeof(fn), "%s/%s/%s", dir.c_str(), a, b);
-  } else {
-    snprintf(fn, sizeof(fn), "%s/%s", dir.c_str(), a);
-  }
-  
-  int fd = ::open(fn, O_RDONLY);
-  if (fd < 0) {
-    if (b) {
-      dout(15) << "get_bl " << a << "/" << b << " " << cpp_strerror(errno) << dendl;
-    } else {
-      dout(15) << "get_bl " << a << " " << cpp_strerror(errno) << dendl;
-    }
-    return -errno;
-  }
-
-  // get size
-  struct stat st;
-  int rc = ::fstat(fd, &st);
-  assert(rc == 0);
-  __int32_t len = st.st_size;
- 
-  // read buffer
-  bl.clear();
-  bufferptr bp(len);
-  int off = 0;
-  while (off < len) {
-    dout(20) << "reading at off " << off << " of " << len << dendl;
-    int r = ::read(fd, bp.c_str()+off, len-off);
-    if (r < 0)
-      dout(0) << "errno on read " << cpp_strerror(errno) << dendl;
-    assert(r>0);
-    off += r;
-  }
-  bl.append(bp);
-  int close_err = TEMP_FAILURE_RETRY(::close(fd));
-  assert (0 == close_err);
-
-  if (b) {
-    dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl;
-  } else {
-    dout(15) << "get_bl " << a << " = " << bl.length() << " bytes" << dendl;
-  }
-
-  return len;
-}
-
-void MonitorStore::write_bl_ss(bufferlist& bl, const char *a, const char *b, bool append)
-{
-  int err = 0;
-  char fn[1024];
-  snprintf(fn, sizeof(fn), "%s/%s", dir.c_str(), a);
-  if (b) {
-    int r = ::mkdir(fn, 0755);
-    if ((r < 0) && (errno != EEXIST)) {
-      err = -errno;
-      derr << __func__ << " failed to create dir " << fn
-	   << ": " << cpp_strerror(err) << dendl;
-      assert(0 == "failed to create dir");
-    }
-    dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl;
-    snprintf(fn, sizeof(fn), "%s/%s/%s", dir.c_str(), a, b);
-  } else {
-    dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << dendl;
-  }
-  
-  char tfn[1024];
-  int fd;
-  if (append) {
-    fd = ::open(fn, O_WRONLY|O_CREAT|O_APPEND, 0600);
-    if (fd < 0) {
-      err = -errno;
-      derr << "failed to open " << fn << "for append: "
-	   << cpp_strerror(err) << dendl;
-      assert(0 == "failed to open for append");
-    }
-  } else {
-    snprintf(tfn, sizeof(tfn), "%s.new", fn);
-    fd = ::open(tfn, O_WRONLY|O_CREAT|O_TRUNC, 0600);
-    if (fd < 0) {
-      err = -errno;
-      derr << "failed to open " << tfn << ": " << cpp_strerror(err) << dendl;
-      assert(0 == "failed to open");
-    }
-  }
-  
-  err = bl.write_fd(fd);
-  assert(!err);
-  err = ::fsync(fd);
-  assert(!err);
-  err = TEMP_FAILURE_RETRY(::close(fd));
-  assert (!err); // this really can't fail, right? right?...
-  if (!append) {
-    err = ::rename(tfn, fn);
-    if (err < 0) {
-      err = -errno;
-      derr << __func__ << " failed to rename '" << tfn << "' -> '"
-	   << fn << "': " << cpp_strerror(err) << dendl;
-      assert(0 == "failed to rename");
-    }
-  }
-}
-
-void MonitorStore::put_bl_sn_map(const char *a,
-				map<version_t,bufferlist>::iterator start,
-				map<version_t,bufferlist>::iterator end)
-{
-  int err = 0;
-  int close_err = 0;
-  version_t first = start->first;
-  map<version_t,bufferlist>::iterator lastp = end;
-  --lastp;
-  version_t last = lastp->first;
-  dout(15) <<  "put_bl_sn_map " << a << "/[" << first << ".." << last << "]" << dendl;
-
-  // only do a big sync if there are several values, or if the feature is disabled.
-  if (g_conf->mon_sync_fs_threshold <= 0 ||
-      last - first < (unsigned)g_conf->mon_sync_fs_threshold) {
-    // just do them individually
-    for (map<version_t,bufferlist>::iterator p = start; p != end; ++p) {
-      put_bl_sn(p->second, a, p->first);
-    }
-    return;
-  }
-
-  // make sure dir exists
-  char dfn[1024];
-  snprintf(dfn, sizeof(dfn), "%s/%s", dir.c_str(), a);
-  int r = ::mkdir(dfn, 0755);
-  if ((r < 0) && (errno != EEXIST)) {
-    err = -errno;
-    derr << __func__ << " failed to create dir " << dfn << ": "
-	 << cpp_strerror(err) << dendl;
-    assert(0 == "failed to create dir");
-  }
-
-  for (map<version_t,bufferlist>::iterator p = start; p != end; ++p) {
-    char tfn[1024], fn[1024];
-
-    snprintf(fn, sizeof(fn), "%s/%llu", dfn, (long long unsigned)p->first);
-    snprintf(tfn, sizeof(tfn), "%s.new", fn);
-
-    int fd = ::open(tfn, O_WRONLY|O_CREAT|O_TRUNC, 0600);
-    if (fd < 0) {
-      int err = -errno;
-      derr << "failed to open " << tfn << ": " << cpp_strerror(err) << dendl;
-      assert(0 == "failed to open");
-    }
-
-    err = p->second.write_fd(fd);
-    close_err = TEMP_FAILURE_RETRY(::close(fd));
-    assert (0 == close_err);
-    if (err < 0)
-      assert(0 == "failed to write");
-  }
-
-  // sync them all
-  int dirfd = ::open(dir.c_str(), O_RDONLY);
-  if (dirfd < 0) {
-    err = -errno;
-    derr << "failed to open " << dir << ": " << cpp_strerror(err) << dendl;
-    assert(0 == "failed to open temp file");
-  }
-
-  err = sync_filesystem(dirfd);
-  if (err < 0) {
-    derr << "sync_filesystem error " << cpp_strerror(err) << dendl;
-    assert(0 == "failed to sync_filesystem");
-  }
-
-  close_err = TEMP_FAILURE_RETRY(::close(dirfd));
-  assert (0 == close_err);
-    
-  // rename them all into place
-  for (map<version_t,bufferlist>::iterator p = start; p != end; ++p) {
-    char tfn[1024], fn[1024];
-    
-    snprintf(fn, sizeof(fn), "%s/%llu", dfn, (long long unsigned)p->first);
-    snprintf(tfn, sizeof(tfn), "%s.new", fn);
-    
-    err = ::rename(tfn, fn);
-    if (err < 0)
-      assert(0 == "failed to rename");
-  }
-    
-  // fsync the dir (to commit the renames)
-  dirfd = ::open(dir.c_str(), O_RDONLY);
-  if (dirfd < 0) {
-    err = -errno;
-    derr << __func__ << " failed to open " << dir
-	 << ": " << cpp_strerror(err) << dendl;
-    assert(0 == "failed to open dir");
-  }
-  err = ::fsync(dirfd);
-  if (err < 0) {
-    err = -errno;
-    derr << __func__ << " failed to fsync " << dir
-	 << ": " << cpp_strerror(err) << dendl;
-    assert(0 == "failed to fsync");
-  }
-  close_err = TEMP_FAILURE_RETRY(::close(dirfd));
-  assert (0 == close_err);
-}
-
-void MonitorStore::sync()
-{
-  int dirfd = ::open(dir.c_str(), O_RDONLY);
-  if (dirfd < 0) {
-    int err = -errno;
-    derr << __func__ << " failed to open " << dir
-	 << ": " << cpp_strerror(err) << dendl;
-    assert(0 == "failed to open dir for syncing");
-  }
-
-  int ret = sync_filesystem(dirfd);
-  if (ret < 0) {
-    derr << __func__ << " sync_filesystem error " << cpp_strerror(ret) << dendl;
-    assert(0 == "failed to sync_filesystem");
-  }
-
-  int close_err = TEMP_FAILURE_RETRY(::close(dirfd));
-  assert (0 == close_err);
-}
diff --git a/src/mon/MonitorStore.h b/src/mon/MonitorStore.h
deleted file mode 100644
index 76b8363..0000000
--- a/src/mon/MonitorStore.h
+++ /dev/null
@@ -1,109 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
- * Foundation.  See file COPYING.
- * 
- */
-
-#ifndef CEPH_MON_MONITORSTORE_H
-#define CEPH_MON_MONITORSTORE_H
-
-#include "include/types.h"
-#include "include/buffer.h"
-
-#include "common/compiler_extensions.h"
-
-#include <iosfwd>
-#include <string.h>
-#include <errno.h>
-
-class MonitorStore {
-  string dir;
-  int lock_fd;
-
-  void write_bl_ss(bufferlist& bl, const char *a, const char *b,
-		  bool append);
-public:
-  MonitorStore(const std::string &d) : dir(d), lock_fd(-1) { }
-  ~MonitorStore() { }
-
-  int mkfs();  // wipe
-  int mount();
-  int umount();
-  void sync();
-
-  // ints (stored as ascii)
-  version_t get_int(const char *a, const char *b=0) WARN_UNUSED_RESULT;
-  void put_int(version_t v, const char *a, const char *b=0);
-
-  version_t get_global_version(const char *a, version_t b) WARN_UNUSED_RESULT;
-
-  // buffers
-  // ss and sn varieties.
-  bool exists_bl_ss(const char *a, const char *b=0);
-  int get_bl_ss(bufferlist& bl, const char *a, const char *b) WARN_UNUSED_RESULT;
-  void get_bl_ss_safe(bufferlist& bl, const char *a, const char *b) {
-    int ret = get_bl_ss(bl, a, b);
-    assert (ret >= 0 || ret == -ENOENT);
-  }
-  void put_bl_ss(bufferlist& bl, const char *a, const char *b) {
-    write_bl_ss(bl, a, b, false);
-  }
-  void append_bl_ss(bufferlist& bl, const char *a, const char *b) {
-    write_bl_ss(bl, a, b, true);
-  }
-  bool exists_bl_sn(const char *a, version_t b) {
-    char bs[20];
-    snprintf(bs, sizeof(bs), "%llu", (unsigned long long)b);
-    return exists_bl_ss(a, bs);
-  }
-  int get_bl_sn(bufferlist& bl, const char *a, version_t b) WARN_UNUSED_RESULT {
-    char bs[20];
-    snprintf(bs, sizeof(bs), "%llu", (unsigned long long)b);
-    return get_bl_ss(bl, a, bs);
-  }
-  void get_bl_sn_safe(bufferlist& bl, const char *a, version_t b) {
-    int ret = get_bl_sn(bl, a, b);
-    assert(ret >= 0 || ret == -ENOENT);
-  }
-  void put_bl_sn(bufferlist& bl, const char *a, version_t b) {
-    char bs[20];
-    snprintf(bs, sizeof(bs), "%llu", (unsigned long long)b);
-    put_bl_ss(bl, a, bs);
-  }
-  /**
-   * Put a whole set of values efficiently and safely.
-   *
-   * @param a - prefix/directory
-   * @param vals - map of int name -> values
-   * @return 0 for success or negative error code
-   */
-  void put_bl_sn_map(const char *a,
-		    map<version_t,bufferlist>::iterator start,
-		    map<version_t,bufferlist>::iterator end);
-
-  void erase_ss(const char *a, const char *b);
-  void erase_sn(const char *a, version_t b) {
-    char bs[20];
-    snprintf(bs, sizeof(bs), "%llu", (unsigned long long)b);
-    erase_ss(a, bs);
-  }
-
-  /*
-  version_t get_incarnation() { return get_int("incarnation"); }
-  void set_incarnation(version_t i) { set_int(i, "incarnation"); }
-  
-  version_t get_last_proposal() { return get_int("last_proposal"); }
-  void set_last_proposal(version_t i) { set_int(i, "last_proposal"); }
-  */
-};
-
-
-#endif
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index c6b21cb..f60d399 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -126,17 +126,17 @@ void MonmapMonitor::on_active()
     mon->clog->info() << "monmap " << *mon->monmap << "\n";
 }
 
-bool MonmapMonitor::preprocess_query(PaxosServiceMessage *m)
+bool MonmapMonitor::preprocess_query(MonOpRequestRef op)
 {
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   switch (m->get_type()) {
     // READs
   case MSG_MON_COMMAND:
-    return preprocess_command(static_cast<MMonCommand*>(m));
+    return preprocess_command(op);
   case MSG_MON_JOIN:
-    return preprocess_join(static_cast<MMonJoin*>(m));
+    return preprocess_join(op);
   default:
     assert(0);
-    m->put();
     return true;
   }
 }
@@ -154,8 +154,9 @@ void MonmapMonitor::dump_info(Formatter *f)
   f->close_section();
 }
 
-bool MonmapMonitor::preprocess_command(MMonCommand *m)
+bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
 {
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = -1;
   bufferlist rdata;
   stringstream ss;
@@ -163,7 +164,7 @@ bool MonmapMonitor::preprocess_command(MMonCommand *m)
   map<string, cmd_vartype> cmdmap;
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, rdata, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
     return true;
   }
 
@@ -172,7 +173,7 @@ bool MonmapMonitor::preprocess_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
     return true;
   }
 
@@ -249,32 +250,33 @@ reply:
     string rs;
     getline(ss, rs);
 
-    mon->reply_command(m, r, rs, rdata, get_last_committed());
+    mon->reply_command(op, r, rs, rdata, get_last_committed());
     return true;
   } else
     return false;
 }
 
 
-bool MonmapMonitor::prepare_update(PaxosServiceMessage *m)
+bool MonmapMonitor::prepare_update(MonOpRequestRef op)
 {
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
   
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return prepare_command(static_cast<MMonCommand*>(m));
+    return prepare_command(op);
   case MSG_MON_JOIN:
-    return prepare_join(static_cast<MMonJoin*>(m));
+    return prepare_join(op);
   default:
     assert(0);
-    m->put();
   }
 
   return false;
 }
 
-bool MonmapMonitor::prepare_command(MMonCommand *m)
+bool MonmapMonitor::prepare_command(MonOpRequestRef op)
 {
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   stringstream ss;
   string rs;
   int err = -EINVAL;
@@ -282,7 +284,7 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
   map<string, cmd_vartype> cmdmap;
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, get_last_committed());
     return true;
   }
 
@@ -291,7 +293,7 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
     return true;
   }
 
@@ -351,7 +353,7 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
     pending_map.add(name, addr);
     pending_map.last_changed = ceph_clock_now(g_ceph_context);
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                      get_last_committed() + 1));
     return true;
 
@@ -375,7 +377,7 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
     ss << "removed mon." << name << " at " << addr << ", there are now " << pending_map.size() << " monitors" ;
     getline(ss, rs);
     // send reply immediately in case we get removed
-    mon->reply_command(m, 0, rs, get_last_committed());
+    mon->reply_command(op, 0, rs, get_last_committed());
     return true;
   }
   else
@@ -383,36 +385,35 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
 
 out:
   getline(ss, rs);
-  mon->reply_command(m, err, rs, get_last_committed());
+  mon->reply_command(op, err, rs, get_last_committed());
   return false;
 }
 
-bool MonmapMonitor::preprocess_join(MMonJoin *join)
+bool MonmapMonitor::preprocess_join(MonOpRequestRef op)
 {
+  MMonJoin *join = static_cast<MMonJoin*>(op->get_req());
   dout(10) << "preprocess_join " << join->name << " at " << join->addr << dendl;
 
   MonSession *session = join->get_session();
   if (!session ||
       !session->is_capable("mon", MON_CAP_W | MON_CAP_X)) {
     dout(10) << " insufficient caps" << dendl;
-    join->put();
     return true;
   }
 
   if (pending_map.contains(join->name) && !pending_map.get_addr(join->name).is_blank_ip()) {
     dout(10) << " already have " << join->name << dendl;
-    join->put();
     return true;
   }
   if (pending_map.contains(join->addr) && pending_map.get_name(join->addr) == join->name) {
     dout(10) << " already have " << join->addr << dendl;
-    join->put();
     return true;
   }
   return false;
 }
-bool MonmapMonitor::prepare_join(MMonJoin *join)
+bool MonmapMonitor::prepare_join(MonOpRequestRef op)
 {
+  MMonJoin *join = static_cast<MMonJoin*>(op->get_req());
   dout(0) << "adding/updating " << join->name << " at " << join->addr << " to monitor cluster" << dendl;
   if (pending_map.contains(join->name))
     pending_map.remove(join->name);
@@ -420,7 +421,6 @@ bool MonmapMonitor::prepare_join(MMonJoin *join)
     pending_map.remove(pending_map.get_name(join->addr));
   pending_map.add(join->name, join->addr);
   pending_map.last_changed = ceph_clock_now(g_ceph_context);
-  join->put();
   return true;
 }
 
diff --git a/src/mon/MonmapMonitor.h b/src/mon/MonmapMonitor.h
index 22b51ad..f554092 100644
--- a/src/mon/MonmapMonitor.h
+++ b/src/mon/MonmapMonitor.h
@@ -58,14 +58,14 @@ class MonmapMonitor : public PaxosService {
 
   void dump_info(Formatter *f);
 
-  bool preprocess_query(PaxosServiceMessage *m);
-  bool prepare_update(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op);
+  bool prepare_update(MonOpRequestRef op);
 
-  bool preprocess_join(MMonJoin *m);
-  bool prepare_join(MMonJoin *m);
+  bool preprocess_join(MonOpRequestRef op);
+  bool prepare_join(MonOpRequestRef op);
 
-  bool preprocess_command(MMonCommand *m);
-  bool prepare_command(MMonCommand *m);
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
 
   void get_health(list<pair<health_status_t,string> >& summary,
 		  list<pair<health_status_t,string> > *detail) const;
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index cdbb6c7..9ebb349 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1,4 +1,4 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 /*
  * Ceph - scalable distributed file system
@@ -11,12 +11,14 @@
  *
  * This is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
+ * License version 2.1, as published by the Free Software
  * Foundation.  See file COPYING.
- * 
+ *
  */
 
+#include <algorithm>
 #include <sstream>
+#include <boost/assign.hpp>
 
 #include "OSDMonitor.h"
 #include "Monitor.h"
@@ -70,11 +72,13 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) {
 		<< ").osd e" << osdmap.get_epoch() << " ";
 }
 
-OSDMonitor::OSDMonitor(Monitor *mn, Paxos *p, string service_name)
-  : PaxosService(mn, p, service_name),
-    inc_osd_cache(g_conf->mon_osd_cache_size),
-    full_osd_cache(g_conf->mon_osd_cache_size),
-    thrash_map(0), thrash_last_up_osd(-1) { }
+OSDMonitor::OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name)
+ : PaxosService(mn, p, service_name),
+   inc_osd_cache(g_conf->mon_osd_cache_size),
+   full_osd_cache(g_conf->mon_osd_cache_size),
+   thrash_map(0), thrash_last_up_osd(-1),
+   op_tracker(cct, true, 1)
+{}
 
 bool OSDMonitor::_have_pending_crush()
 {
@@ -117,6 +121,9 @@ void OSDMonitor::create_initial()
   newmap.set_epoch(1);
   newmap.created = newmap.modified = ceph_clock_now(g_ceph_context);
 
+  // new clusters should sort bitwise by default.
+  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
+
   // encode into pending incremental
   newmap.encode(pending_inc.fullmap, mon->quorum_features | CEPH_FEATURE_RESERVED);
   pending_inc.full_crc = newmap.get_crc();
@@ -274,9 +281,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
 
   for (int o = 0; o < osdmap.get_max_osd(); o++) {
     if (osdmap.is_down(o)) {
-      // invalidate osd_epoch cache
-      osd_epoch.erase(o);
-
       // populate down -> out map
       if (osdmap.is_in(o) &&
 	  down_pending_out.count(o) == 0) {
@@ -285,11 +289,7 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
       }
     }
   }
-  // blow away any osd_epoch items beyond max_osd
-  map<int,epoch_t>::iterator p = osd_epoch.upper_bound(osdmap.get_max_osd());
-  while (p != osd_epoch.end()) {
-    osd_epoch.erase(p++);
-  }
+  // XXX: need to trim MonSession connected with a osd whose id > max_osd?
 
   /** we don't have any of the feature bit infrastructure in place for
    * supporting primary_temp mappings without breaking old clients/OSDs.*/
@@ -338,7 +338,7 @@ bool OSDMonitor::thrash()
   thrash_map--;
   int o;
 
-  // mark a random osd up_thru.. 
+  // mark a random osd up_thru..
   if (rand() % 4 == 0 || thrash_last_up_osd < 0)
     o = rand() % osdmap.get_num_osds();
   else
@@ -433,13 +433,15 @@ void OSDMonitor::on_active()
   }
 
   if (mon->is_leader())
-    mon->clog->info() << "osdmap " << osdmap << "\n"; 
+    mon->clog->info() << "osdmap " << osdmap << "\n";
 
   if (!mon->is_leader()) {
-    list<MOSDFailure*> ls;
+    list<MonOpRequestRef> ls;
     take_all_failures(ls);
     while (!ls.empty()) {
-      dispatch(ls.front());
+      MonOpRequestRef op = ls.front();
+      op->mark_osdmon_event(__func__);
+      dispatch(op);
       ls.pop_front();
     }
   }
@@ -450,10 +452,9 @@ void OSDMonitor::on_shutdown()
   dout(10) << __func__ << dendl;
 
   // discard failure info, waiters
-  list<MOSDFailure*> ls;
+  list<MonOpRequestRef> ls;
   take_all_failures(ls);
   while (!ls.empty()) {
-    ls.front()->put();
     ls.pop_front();
   }
 }
@@ -461,7 +462,7 @@ void OSDMonitor::on_shutdown()
 void OSDMonitor::update_logger()
 {
   dout(10) << "update_logger" << dendl;
-  
+
   mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
   mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
   mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
@@ -625,13 +626,11 @@ public:
     osdmap(osdmap_),
     pgm(pgm_),
     tree(tree_),
-    average_util(0),
+    average_util(average_utilization()),
     min_var(-1),
     max_var(-1),
     stddev(0),
     sum(0) {
-    if (pgm->osd_sum.kb)
-      average_util = 100.0 * (double)pgm->osd_sum.kb_used / (double)pgm->osd_sum.kb;
   }
 
 protected:
@@ -649,17 +648,21 @@ protected:
     float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
     int64_t kb = 0, kb_used = 0, kb_avail = 0;
     double util = 0;
-    if (get_bucket_utilization(qi.id, kb, kb_used, kb_avail) && kb > 0)
+    if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_avail))
       util = 100.0 * (double)kb_used / (double)kb;
     double var = 1.0;
     if (average_util)
       var = util / average_util;
 
-    dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, f);
+    size_t num_pgs = pgm->get_num_pg_by_osd(qi.id);
+
+    dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, num_pgs, f);
 
-    if (!qi.is_bucket()) {
-      if (min_var < 0 || var < min_var) min_var = var;
-      if (max_var < 0 || var > max_var) max_var = var;
+    if (!qi.is_bucket() && reweight > 0) {
+      if (min_var < 0 || var < min_var)
+	min_var = var;
+      if (max_var < 0 || var > max_var)
+	max_var = var;
 
       double dev = util - average_util;
       dev *= dev;
@@ -668,44 +671,65 @@ protected:
     }
   }
 
-  virtual void dump_item(const CrushTreeDumper::Item &qi, float &reweight,
-			 int64_t kb, int64_t kb_used, int64_t kb_avail,
-			 double& util, double& var, F *f) = 0;
+  virtual void dump_item(const CrushTreeDumper::Item &qi,
+			 float &reweight,
+			 int64_t kb,
+			 int64_t kb_used,
+			 int64_t kb_avail,
+			 double& util,
+			 double& var,
+			 const size_t num_pgs,
+			 F *f) = 0;
 
   double dev() {
     return sum > 0 ? sqrt(stddev / sum) : 0;
   }
 
-  bool get_bucket_utilization(int id, int64_t& kb, int64_t& kb_used,
-			      int64_t& kb_avail) const {
-    if (id >= 0) {
-      typedef ceph::unordered_map<int32_t,osd_stat_t> OsdStat;
-
-      OsdStat::const_iterator p = pgm->osd_stat.find(id);
+  double average_utilization() {
+    int64_t kb = 0, kb_used = 0;
+    for (int i = 0; i <= osdmap->get_max_osd(); i++) {
+      if (!osdmap->exists(i) || osdmap->get_weight(i) == 0)
+	continue;
+      int64_t kb_i, kb_used_i, kb_avail_i;
+      if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_avail_i)) {
+	kb += kb_i;
+	kb_used += kb_used_i;
+      }
+    }
+    return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
+  }
 
-      if (p == pgm->osd_stat.end())
-	return false;
+  bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
+			   int64_t* kb_avail) const {
+    typedef ceph::unordered_map<int32_t,osd_stat_t> OsdStat;
+    OsdStat::const_iterator p = pgm->osd_stat.find(id);
+    if (p == pgm->osd_stat.end())
+      return false;
+    *kb = p->second.kb;
+    *kb_used = p->second.kb_used;
+    *kb_avail = p->second.kb_avail;
+    return *kb > 0;
+  }
 
-      kb = p->second.kb;
-      kb_used = p->second.kb_used;
-      kb_avail = p->second.kb_avail;
-      return kb > 0;
-    }
+  bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
+			      int64_t* kb_avail) const {
+    if (id >= 0)
+      return get_osd_utilization(id, kb, kb_used, kb_avail);
 
-    kb = 0;
-    kb_used = 0;
-    kb_avail = 0;
+    *kb = 0;
+    *kb_used = 0;
+    *kb_avail = 0;
 
     for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
       int item = osdmap->crush->get_bucket_item(id, k);
       int64_t kb_i = 0, kb_used_i = 0, kb_avail_i;
-      if (!get_bucket_utilization(item, kb_i, kb_used_i, kb_avail_i))
+      if (!get_bucket_utilization(item, &kb_i, &kb_used_i, &kb_avail_i))
 	return false;
-      kb += kb_i;
-      kb_used += kb_used_i;
-      kb_avail += kb_avail_i;
+      *kb += kb_i;
+      *kb_used += kb_used_i;
+      *kb_avail += kb_avail_i;
     }
-    return kb > 0;
+    return *kb > 0;
   }
 
 protected:
@@ -736,6 +760,7 @@ public:
     tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
     tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
     tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
+    tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
     if (tree)
       tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
 
@@ -759,9 +784,16 @@ protected:
   };
   friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
 
-  virtual void dump_item(const CrushTreeDumper::Item &qi, float &reweight,
-			 int64_t kb, int64_t kb_used, int64_t kb_avail,
-			 double& util, double& var, TextTable *tbl) {
+  using OSDUtilizationDumper<TextTable>::dump_item;
+  virtual void dump_item(const CrushTreeDumper::Item &qi,
+			 float &reweight,
+			 int64_t kb,
+			 int64_t kb_used,
+			 int64_t kb_avail,
+			 double& util,
+			 double& var,
+			 const size_t num_pgs,
+			 TextTable *tbl) {
     *tbl << qi.id
 	 << weightf_t(qi.weight)
 	 << weightf_t(reweight)
@@ -769,7 +801,8 @@ protected:
 	 << si_t(kb_used << 10)
 	 << si_t(kb_avail << 10)
 	 << lowprecision_t(util)
-	 << lowprecision_t(var);
+	 << lowprecision_t(var)
+	 << num_pgs;
 
     if (tree) {
       ostringstream name;
@@ -830,9 +863,16 @@ public:
   }
 
 protected:
-  virtual void dump_item(const CrushTreeDumper::Item &qi, float &reweight,
-			 int64_t kb, int64_t kb_used, int64_t kb_avail,
-			 double& util, double& var, Formatter *f) {
+  using OSDUtilizationDumper<Formatter>::dump_item;
+  virtual void dump_item(const CrushTreeDumper::Item &qi,
+			 float &reweight,
+			 int64_t kb,
+			 int64_t kb_used,
+			 int64_t kb_avail,
+			 double& util,
+			 double& var,
+			 const size_t num_pgs,
+			 Formatter *f) {
     f->open_object_section("item");
     CrushTreeDumper::dump_item_fields(crush, qi, f);
     f->dump_float("reweight", reweight);
@@ -841,6 +881,7 @@ protected:
     f->dump_int("kb_avail", kb_avail);
     f->dump_float("utilization", util);
     f->dump_float("var", var);
+    f->dump_unsigned("pgs", num_pgs);
     CrushTreeDumper::dump_bucket_children(crush, qi, f);
     f->close_section();
   }
@@ -884,7 +925,7 @@ void OSDMonitor::create_pending()
 {
   pending_inc = OSDMap::Incremental(osdmap.epoch+1);
   pending_inc.fsid = mon->monmap->fsid;
-  
+
   dout(10) << "create_pending e " << pending_inc.epoch << dendl;
 
   // drop any redundant pg_temp entries
@@ -894,6 +935,139 @@ void OSDMonitor::create_pending()
   OSDMap::remove_down_temps(g_ceph_context, osdmap, &pending_inc);
 }
 
+void OSDMonitor::maybe_prime_pg_temp()
+{
+  bool all = false;
+  if (pending_inc.crush.length()) {
+    dout(10) << __func__ << " new crush map, all" << dendl;
+    all = true;
+  }
+
+  if (!pending_inc.new_up_client.empty()) {
+    dout(10) << __func__ << " new up osds, all" << dendl;
+    all = true;
+  }
+
+  // check for interesting OSDs
+  set<int> osds;
+  for (map<int32_t,uint8_t>::iterator p = pending_inc.new_state.begin();
+       !all && p != pending_inc.new_state.end();
+       ++p) {
+    if ((p->second & CEPH_OSD_UP) &&
+	osdmap.is_up(p->first)) {
+      osds.insert(p->first);
+    }
+  }
+  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
+       !all && p != pending_inc.new_weight.end();
+       ++p) {
+    if (p->second < osdmap.get_weight(p->first)) {
+      // weight reduction
+      osds.insert(p->first);
+    } else {
+      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
+	       << dendl;
+      all = true;
+    }
+  }
+
+  if (!all && osds.empty())
+    return;
+
+  OSDMap next;
+  next.deepish_copy_from(osdmap);
+  next.apply_incremental(pending_inc);
+
+  PGMap *pg_map = &mon->pgmon()->pg_map;
+
+  utime_t stop = ceph_clock_now(NULL);
+  stop += g_conf->mon_osd_prime_pg_temp_max_time;
+  int chunk = 1000;
+  int n = chunk;
+
+  if (all) {
+    for (ceph::unordered_map<pg_t, pg_stat_t>::iterator pp =
+	   pg_map->pg_stat.begin();
+	 pp != pg_map->pg_stat.end();
+	 ++pp) {
+      prime_pg_temp(next, pp);
+      if (--n <= 0) {
+	n = chunk;
+	if (ceph_clock_now(NULL) > stop) {
+	  dout(10) << __func__ << " consumed more than "
+		   << g_conf->mon_osd_prime_pg_temp_max_time
+		   << " seconds, stopping"
+		   << dendl;
+	  break;
+	}
+      }
+    }
+  } else {
+    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
+    for (set<int>::iterator p = osds.begin(); p != osds.end(); ++p) {
+      n -= prime_pg_temp(next, pg_map, *p);
+      if (--n <= 0) {
+	n = chunk;
+	if (ceph_clock_now(NULL) > stop) {
+	  dout(10) << __func__ << " consumed more than "
+		   << g_conf->mon_osd_prime_pg_temp_max_time
+		   << " seconds, stopping"
+		   << dendl;
+	  break;
+	}
+      }
+    }
+  }
+}
+
+void OSDMonitor::prime_pg_temp(OSDMap& next,
+			       ceph::unordered_map<pg_t, pg_stat_t>::iterator pp)
+{
+  // do not touch a mapping if a change is pending
+  if (pending_inc.new_pg_temp.count(pp->first))
+    return;
+  vector<int> up, acting;
+  int up_primary, acting_primary;
+  next.pg_to_up_acting_osds(pp->first, &up, &up_primary, &acting, &acting_primary);
+  if (acting == pp->second.acting)
+    return;  // no change since last pg update, skip
+  vector<int> cur_up, cur_acting;
+  osdmap.pg_to_up_acting_osds(pp->first, &cur_up, &up_primary,
+			      &cur_acting, &acting_primary);
+  if (cur_acting == acting)
+    return;  // no change this epoch; must be stale pg_stat
+  if (cur_acting.empty())
+    return;  // if previously empty now we can be no worse off
+  const pg_pool_t *pool = next.get_pg_pool(pp->first.pool());
+  if (pool && cur_acting.size() < pool->min_size)
+    return;  // can be no worse off than before
+
+  dout(20) << __func__ << " " << pp->first << " " << cur_up << "/" << cur_acting
+	   << " -> " << up << "/" << acting
+	   << ", priming " << cur_acting
+	   << dendl;
+  pending_inc.new_pg_temp[pp->first] = cur_acting;
+}
+
+int OSDMonitor::prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd)
+{
+  dout(10) << __func__ << " osd." << osd << dendl;
+  int num = 0;
+  ceph::unordered_map<int, set<pg_t> >::iterator po = pg_map->pg_by_osd.find(osd);
+  if (po != pg_map->pg_by_osd.end()) {
+    for (set<pg_t>::iterator p = po->second.begin();
+	 p != po->second.end();
+	 ++p, ++num) {
+      ceph::unordered_map<pg_t, pg_stat_t>::iterator pp = pg_map->pg_stat.find(*p);
+      if (pp == pg_map->pg_stat.end())
+	continue;
+      prime_pg_temp(next, pp);
+    }
+  }
+  return num;
+}
+
+
 /**
  * @note receiving a transaction in this function gives a fair amount of
  * freedom to the service implementation if it does need it. It shouldn't.
@@ -902,13 +1076,16 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 {
   dout(10) << "encode_pending e " << pending_inc.epoch
 	   << dendl;
-  
+
   // finalize up pending_inc
   pending_inc.modified = ceph_clock_now(g_ceph_context);
 
   int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
   assert(r == 0);
 
+  if (g_conf->mon_osd_prime_pg_temp)
+    maybe_prime_pg_temp();
+
   bufferlist bl;
 
   // tell me about it
@@ -923,7 +1100,7 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   }
   for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
        i != pending_inc.new_up_client.end();
-       ++i) { 
+       ++i) {
     //FIXME: insert cluster addresses too
     dout(2) << " osd." << i->first << " UP " << i->second << dendl;
   }
@@ -978,13 +1155,12 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   pending_metadata_rm.clear();
 }
 
-int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
+int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
 {
   bufferlist bl;
   int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
   if (r < 0)
     return r;
-  map<string,string> m;
   try {
     bufferlist::iterator p = bl.begin();
     ::decode(m, p);
@@ -994,11 +1170,39 @@ int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
       *err << "osd." << osd << " metadata is corrupt";
     return -EIO;
   }
+  return 0;
+}
+
+int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
+{
+  map<string,string> m;
+  if (int r = load_metadata(osd, m, err))
+    return r;
   for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
     f->dump_string(p->first.c_str(), p->second);
   return 0;
 }
 
+void OSDMonitor::print_nodes(Formatter *f)
+{
+  // group OSDs by their hosts
+  map<string, list<int> > osds; // hostname => osd
+  for (int osd = 0; osd <= osdmap.get_max_osd(); osd++) {
+    map<string, string> m;
+    if (load_metadata(osd, m, NULL)) {
+      continue;
+    }
+    map<string, string>::iterator hostname = m.find("hostname");
+    if (hostname == m.end()) {
+      // not likely though
+      continue;
+    }
+    osds[hostname->second].push_back(osd);
+  }
+
+  dump_services(f, osds, "osd");
+}
+
 void OSDMonitor::share_map_with_random_osd()
 {
   if (osdmap.get_num_up_osds() == 0) {
@@ -1055,71 +1259,73 @@ void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
 
 // -------------
 
-bool OSDMonitor::preprocess_query(PaxosServiceMessage *m)
+bool OSDMonitor::preprocess_query(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
 
   switch (m->get_type()) {
     // READs
   case MSG_MON_COMMAND:
-    return preprocess_command(static_cast<MMonCommand*>(m));
+    return preprocess_command(op);
   case CEPH_MSG_MON_GET_OSDMAP:
-    return preprocess_get_osdmap(static_cast<MMonGetOSDMap*>(m));
+    return preprocess_get_osdmap(op);
 
     // damp updates
   case MSG_OSD_MARK_ME_DOWN:
-    return preprocess_mark_me_down(static_cast<MOSDMarkMeDown*>(m));
+    return preprocess_mark_me_down(op);
   case MSG_OSD_FAILURE:
-    return preprocess_failure(static_cast<MOSDFailure*>(m));
+    return preprocess_failure(op);
   case MSG_OSD_BOOT:
-    return preprocess_boot(static_cast<MOSDBoot*>(m));
+    return preprocess_boot(op);
   case MSG_OSD_ALIVE:
-    return preprocess_alive(static_cast<MOSDAlive*>(m));
+    return preprocess_alive(op);
   case MSG_OSD_PGTEMP:
-    return preprocess_pgtemp(static_cast<MOSDPGTemp*>(m));
+    return preprocess_pgtemp(op);
 
   case CEPH_MSG_POOLOP:
-    return preprocess_pool_op(static_cast<MPoolOp*>(m));
+    return preprocess_pool_op(op);
 
   case MSG_REMOVE_SNAPS:
-    return preprocess_remove_snaps(static_cast<MRemoveSnaps*>(m));
-    
+    return preprocess_remove_snaps(op);
+
   default:
     assert(0);
-    m->put();
     return true;
   }
 }
 
-bool OSDMonitor::prepare_update(PaxosServiceMessage *m)
+bool OSDMonitor::prepare_update(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
-  
+
   switch (m->get_type()) {
     // damp updates
   case MSG_OSD_MARK_ME_DOWN:
-    return prepare_mark_me_down(static_cast<MOSDMarkMeDown*>(m));
+    return prepare_mark_me_down(op);
   case MSG_OSD_FAILURE:
-    return prepare_failure(static_cast<MOSDFailure*>(m));
+    return prepare_failure(op);
   case MSG_OSD_BOOT:
-    return prepare_boot(static_cast<MOSDBoot*>(m));
+    return prepare_boot(op);
   case MSG_OSD_ALIVE:
-    return prepare_alive(static_cast<MOSDAlive*>(m));
+    return prepare_alive(op);
   case MSG_OSD_PGTEMP:
-    return prepare_pgtemp(static_cast<MOSDPGTemp*>(m));
+    return prepare_pgtemp(op);
 
   case MSG_MON_COMMAND:
-    return prepare_command(static_cast<MMonCommand*>(m));
-    
+    return prepare_command(op);
+
   case CEPH_MSG_POOLOP:
-    return prepare_pool_op(static_cast<MPoolOp*>(m));
+    return prepare_pool_op(op);
 
   case MSG_REMOVE_SNAPS:
-    return prepare_remove_snaps(static_cast<MRemoveSnaps*>(m));
+    return prepare_remove_snaps(op);
 
   default:
     assert(0);
-    m->put();
   }
 
   return false;
@@ -1151,8 +1357,10 @@ bool OSDMonitor::should_propose(double& delay)
 // ---------------------------
 // READs
 
-bool OSDMonitor::preprocess_get_osdmap(MMonGetOSDMap *m)
+bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
   dout(10) << __func__ << " " << *m << dendl;
   MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
   epoch_t first = get_first_committed();
@@ -1172,8 +1380,7 @@ bool OSDMonitor::preprocess_get_osdmap(MMonGetOSDMap *m)
   }
   reply->oldest_map = get_first_committed();
   reply->newest_map = osdmap.get_epoch();
-  mon->send_reply(m, reply);
-  m->put();
+  mon->send_reply(op, reply);
   return true;
 }
 
@@ -1202,8 +1409,10 @@ bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
 }
 
 
-bool OSDMonitor::preprocess_failure(MOSDFailure *m)
+bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
   // who is target_osd
   int badboy = m->get_target().name.num();
 
@@ -1218,24 +1427,24 @@ bool OSDMonitor::preprocess_failure(MOSDFailure *m)
 	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
 	osdmap.is_down(from)) {
       dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
-      send_incremental(m, m->get_epoch()+1);
+      send_incremental(op, m->get_epoch()+1);
       goto didit;
     }
   }
-  
+
 
   // weird?
   if (!osdmap.have_inst(badboy)) {
     dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
     if (m->get_epoch() < osdmap.get_epoch())
-      send_incremental(m, m->get_epoch()+1);
+      send_incremental(op, m->get_epoch()+1);
     goto didit;
   }
   if (osdmap.get_inst(badboy) != m->get_target()) {
     dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
 	    << ", from " << m->get_orig_source_inst() << dendl;
     if (m->get_epoch() < osdmap.get_epoch())
-      send_incremental(m, m->get_epoch()+1);
+      send_incremental(op, m->get_epoch()+1);
     goto didit;
   }
 
@@ -1244,7 +1453,7 @@ bool OSDMonitor::preprocess_failure(MOSDFailure *m)
       osdmap.get_up_from(badboy) > m->get_epoch()) {
     dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
     if (m->get_epoch() < osdmap.get_epoch())
-      send_incremental(m, m->get_epoch()+1);
+      send_incremental(op, m->get_epoch()+1);
     goto didit;
   }
 
@@ -1257,22 +1466,21 @@ bool OSDMonitor::preprocess_failure(MOSDFailure *m)
   return false;
 
  didit:
-  m->put();
   return true;
 }
 
-class C_AckMarkedDown : public Context {
+class C_AckMarkedDown : public C_MonOp {
   OSDMonitor *osdmon;
-  MOSDMarkMeDown *m;
 public:
   C_AckMarkedDown(
     OSDMonitor *osdmon,
-    MOSDMarkMeDown *m)
-    : osdmon(osdmon), m(m) {}
+    MonOpRequestRef op)
+    : C_MonOp(op), osdmon(osdmon) {}
 
-  void finish(int) {
+  void _finish(int) {
+    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
     osdmon->mon->send_reply(
-      m,
+      op,
       new MOSDMarkMeDown(
 	m->fsid,
 	m->get_target(),
@@ -1280,12 +1488,13 @@ public:
 	false));   // ACK itself does not request an ack
   }
   ~C_AckMarkedDown() {
-    m->put();
   }
 };
 
-bool OSDMonitor::preprocess_mark_me_down(MOSDMarkMeDown *m)
+bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
   int requesting_down = m->get_target().name.num();
   int from = m->get_orig_source().num();
 
@@ -1302,7 +1511,7 @@ bool OSDMonitor::preprocess_mark_me_down(MOSDMarkMeDown *m)
       osdmap.get_addr(from) != m->get_target().addr) {
     dout(5) << "preprocess_mark_me_down from dead osd."
 	    << from << ", ignoring" << dendl;
-    send_incremental(m, m->get_epoch()+1);
+    send_incremental(op, m->get_epoch()+1);
     goto reply;
   }
 
@@ -1315,14 +1524,16 @@ bool OSDMonitor::preprocess_mark_me_down(MOSDMarkMeDown *m)
 
  reply:
   if (m->request_ack) {
-    Context *c(new C_AckMarkedDown(this, m));
+    Context *c(new C_AckMarkedDown(this, op));
     c->complete(0);
   }
   return true;
 }
 
-bool OSDMonitor::prepare_mark_me_down(MOSDMarkMeDown *m)
+bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
   int target_osd = m->get_target().name.num();
 
   assert(osdmap.is_up(target_osd));
@@ -1331,7 +1542,7 @@ bool OSDMonitor::prepare_mark_me_down(MOSDMarkMeDown *m)
   mon->clog->info() << "osd." << target_osd << " marked itself down\n";
   pending_inc.new_state[target_osd] = CEPH_OSD_UP;
   if (m->request_ack)
-    wait_for_finished_proposal(new C_AckMarkedDown(this, m));
+    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
   return true;
 }
 
@@ -1412,7 +1623,9 @@ void OSDMonitor::check_failures(utime_t now)
   for (map<int,failure_info_t>::iterator p = failure_info.begin();
        p != failure_info.end();
        ++p) {
-    check_failure(now, p->first, p->second);
+    if (can_mark_down(p->first)) {
+      check_failure(now, p->first, p->second);
+    }
   }
 }
 
@@ -1482,8 +1695,10 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
   return false;
 }
 
-bool OSDMonitor::prepare_failure(MOSDFailure *m)
+bool OSDMonitor::prepare_failure(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
   dout(1) << "prepare_failure " << m->get_target() << " from " << m->get_orig_source_inst()
           << " is reporting failure:" << m->if_osd_failed() << dendl;
 
@@ -1495,16 +1710,15 @@ bool OSDMonitor::prepare_failure(MOSDFailure *m)
   // calculate failure time
   utime_t now = ceph_clock_now(g_ceph_context);
   utime_t failed_since = m->get_recv_stamp() - utime_t(m->failed_for ? m->failed_for : g_conf->osd_heartbeat_grace, 0);
-  
+
   if (m->if_osd_failed()) {
     // add a report
     mon->clog->debug() << m->get_target() << " reported failed by "
 		      << m->get_orig_source_inst() << "\n";
     failure_info_t& fi = failure_info[target_osd];
-    MOSDFailure *old = fi.add_report(reporter, failed_since, m);
-    if (old) {
-      mon->no_reply(old);
-      old->put();
+    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
+    if (old_op) {
+      mon->no_reply(old_op);
     }
 
     return check_failure(now, target_osd, fi);
@@ -1514,12 +1728,12 @@ bool OSDMonitor::prepare_failure(MOSDFailure *m)
 		      << m->get_orig_source_inst() << "\n";
     if (failure_info.count(target_osd)) {
       failure_info_t& fi = failure_info[target_osd];
-      list<MOSDFailure*> ls;
+      list<MonOpRequestRef> ls;
       fi.take_report_messages(ls);
       fi.cancel_report(reporter);
       while (!ls.empty()) {
-	mon->no_reply(ls.front());
-	ls.front()->put();
+        if (ls.front())
+          mon->no_reply(ls.front());
 	ls.pop_front();
       }
       if (fi.reporters.empty()) {
@@ -1533,10 +1747,9 @@ bool OSDMonitor::prepare_failure(MOSDFailure *m)
     } else {
       dout(10) << " no failure_info for osd." << target_osd << dendl;
     }
-    mon->no_reply(m);
-    m->put();
+    mon->no_reply(op);
   }
-  
+
   return false;
 }
 
@@ -1548,19 +1761,24 @@ void OSDMonitor::process_failures()
       ++p;
     } else {
       dout(10) << "process_failures osd." << p->first << dendl;
-      list<MOSDFailure*> ls;
+      list<MonOpRequestRef> ls;
       p->second.take_report_messages(ls);
       failure_info.erase(p++);
 
       while (!ls.empty()) {
-	send_latest(ls.front(), ls.front()->get_epoch());
+        MonOpRequestRef o = ls.front();
+        if (o) {
+          o->mark_event(__func__);
+          MOSDFailure *m = o->get_req<MOSDFailure>();
+          send_latest(o, m->get_epoch());
+        }
 	ls.pop_front();
       }
     }
   }
 }
 
-void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
+void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
 {
   dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
 
@@ -1575,8 +1793,10 @@ void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
 
 // boot --
 
-bool OSDMonitor::preprocess_boot(MOSDBoot *m)
+bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
   int from = m->get_orig_source_inst().name.num();
 
   // check permissions, ignore if failed (no response expected)
@@ -1621,20 +1841,51 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
     goto ignore;
   }
 
+  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
+       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
+      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
+    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
+            << m->get_orig_source_inst()
+            << " doesn't announce support -- ignore" << dendl;
+    goto ignore;
+  }
+
+  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
+      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
+    mon->clog->info() << "disallowing boot of OSD "
+		      << m->get_orig_source_inst()
+		      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature\n";
+    goto ignore;
+  }
+
+  if (any_of(osdmap.get_pools().begin(),
+	     osdmap.get_pools().end(),
+	     [](const std::pair<int64_t,pg_pool_t>& pool)
+	     { return pool.second.use_gmt_hitset; })) {
+    assert(osdmap.get_num_up_osds() == 0 ||
+	   osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+	      << m->get_orig_source_inst()
+	      << " doesn't announce support -- ignore" << dendl;
+      goto ignore;
+    }
+  }
+
   // make sure upgrades stop at hammer
-  //  * OSD_PROXY_FEATURES is the last pre-hammer feature
+  //  * HAMMER_0_94_4 is the required hammer feature
   //  * MON_METADATA is the first post-hammer feature
   if (osdmap.get_num_up_osds() > 0) {
     if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
-	!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_PROXY_FEATURES)) {
+	!(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
       mon->clog->info() << "disallowing boot of post-hammer OSD "
 			<< m->get_orig_source_inst()
-			<< " because one or more up OSDs is pre-hammer\n";
+			<< " because one or more up OSDs is pre-hammer v0.94.4\n";
       goto ignore;
     }
-    if (!(m->osd_features & CEPH_FEATURE_OSD_PROXY_FEATURES) &&
+    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
 	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
-      mon->clog->info() << "disallowing boot of pre-hammer OSD "
+      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
 			<< m->get_orig_source_inst()
 			<< " because all up OSDs are post-hammer\n";
       goto ignore;
@@ -1647,7 +1898,7 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
     // yup.
     dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
 	    << " == " << osdmap.get_inst(from) << dendl;
-    _booted(m, false);
+    _booted(op, false);
     return true;
   }
 
@@ -1664,14 +1915,14 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
   if (osdmap.exists(from) &&
       osdmap.get_info(from).up_from > m->version) {
     dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
-    send_latest(m, m->sb.current_epoch+1);
+    send_latest(op, m->sb.current_epoch+1);
     return true;
   }
 
   // noup?
   if (!can_mark_up(from)) {
     dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
-    send_latest(m, m->sb.current_epoch+1);
+    send_latest(op, m->sb.current_epoch+1);
     return true;
   }
 
@@ -1679,12 +1930,13 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
   return false;
 
  ignore:
-  m->put();
   return true;
 }
 
-bool OSDMonitor::prepare_boot(MOSDBoot *m)
+bool OSDMonitor::prepare_boot(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
   dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb
 	  << " cluster_addr " << m->cluster_addr
 	  << " hb_back_addr " << m->hb_back_addr
@@ -1697,7 +1949,6 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
   // does this osd exist?
   if (from >= osdmap.get_max_osd()) {
     dout(1) << "boot from osd." << from << " >= max_osd " << osdmap.get_max_osd() << dendl;
-    m->put();
     return false;
   }
 
@@ -1711,17 +1962,17 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
     // preprocess should have caught these;  if not, assert.
     assert(osdmap.get_inst(from) != m->get_orig_source_inst());
     assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
-    
+
     if (pending_inc.new_state.count(from) == 0 ||
 	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
       // mark previous guy down
       pending_inc.new_state[from] = CEPH_OSD_UP;
     }
-    wait_for_finished_proposal(new C_RetryMessage(this, m));
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
   } else if (pending_inc.new_up_client.count(from)) { //FIXME: should this be using new_up_client?
     // already prepared, just wait
     dout(7) << "prepare_boot already prepared, waiting on " << m->get_orig_source_addr() << dendl;
-    wait_for_finished_proposal(new C_RetryMessage(this, m));
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
   } else {
     // mark new guy up.
     pending_inc.new_up_client[from] = m->get_orig_source_addr();
@@ -1815,13 +2066,15 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
     pending_inc.new_xinfo[from] = xi;
 
     // wait
-    wait_for_finished_proposal(new C_Booted(this, m));
+    wait_for_finished_proposal(op, new C_Booted(this, op));
   }
   return true;
 }
 
-void OSDMonitor::_booted(MOSDBoot *m, bool logit)
+void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
   dout(7) << "_booted " << m->get_orig_source_inst() 
 	  << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
 
@@ -1829,15 +2082,17 @@ void OSDMonitor::_booted(MOSDBoot *m, bool logit)
     mon->clog->info() << m->get_orig_source_inst() << " boot\n";
   }
 
-  send_latest(m, m->sb.current_epoch+1);
+  send_latest(op, m->sb.current_epoch+1);
 }
 
 
 // -------------
 // alive
 
-bool OSDMonitor::preprocess_alive(MOSDAlive *m)
+bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
   int from = m->get_orig_source().num();
 
   // check permissions, ignore if failed
@@ -1859,7 +2114,7 @@ bool OSDMonitor::preprocess_alive(MOSDAlive *m)
   if (osdmap.get_up_thru(from) >= m->want) {
     // yup.
     dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
-    _reply_map(m, m->version);
+    _reply_map(op, m->version);
     return true;
   }
 
@@ -1868,12 +2123,13 @@ bool OSDMonitor::preprocess_alive(MOSDAlive *m)
   return false;
 
  ignore:
-  m->put();
   return true;
 }
 
-bool OSDMonitor::prepare_alive(MOSDAlive *m)
+bool OSDMonitor::prepare_alive(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
   int from = m->get_orig_source().num();
 
   if (0) {  // we probably don't care much about these
@@ -1883,23 +2139,25 @@ bool OSDMonitor::prepare_alive(MOSDAlive *m)
   dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
 	  << " from " << m->get_orig_source_inst() << dendl;
   pending_inc.new_up_thru[from] = m->version;  // set to the latest map the OSD has
-  wait_for_finished_proposal(new C_ReplyMap(this, m, m->version));
+  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
   return true;
 }
 
-void OSDMonitor::_reply_map(PaxosServiceMessage *m, epoch_t e)
+void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
 {
+  op->mark_osdmon_event(__func__);
   dout(7) << "_reply_map " << e
-	  << " from " << m->get_orig_source_inst()
+	  << " from " << op->get_req()->get_orig_source_inst()
 	  << dendl;
-  send_latest(m, e);
+  send_latest(op, e);
 }
 
 // -------------
 // pg_temp changes
 
-bool OSDMonitor::preprocess_pgtemp(MOSDPGTemp *m)
+bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
 {
+  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
   dout(10) << "preprocess_pgtemp " << *m << dendl;
   vector<int> empty;
   int from = m->get_orig_source().num();
@@ -1961,16 +2219,17 @@ bool OSDMonitor::preprocess_pgtemp(MOSDPGTemp *m)
     goto ignore;
 
   dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
-  _reply_map(m, m->map_epoch);
+  _reply_map(op, m->map_epoch);
   return true;
 
  ignore:
-  m->put();
   return true;
 }
 
-bool OSDMonitor::prepare_pgtemp(MOSDPGTemp *m)
+bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
   int from = m->get_orig_source().num();
   dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
   for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
@@ -1995,15 +2254,17 @@ bool OSDMonitor::prepare_pgtemp(MOSDPGTemp *m)
       pending_inc.new_primary_temp[p->first] = -1;
   }
   pending_inc.new_up_thru[from] = m->map_epoch;   // set up_thru too, so the osd doesn't have to ask again
-  wait_for_finished_proposal(new C_ReplyMap(this, m, m->map_epoch));
+  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
   return true;
 }
 
 
 // ---
 
-bool OSDMonitor::preprocess_remove_snaps(MRemoveSnaps *m)
+bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
   dout(7) << "preprocess_remove_snaps " << *m << dendl;
 
   // check privilege, ignore if failed
@@ -2024,7 +2285,7 @@ bool OSDMonitor::preprocess_remove_snaps(MRemoveSnaps *m)
       continue;
     }
     const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
-    for (vector<snapid_t>::iterator p = q->second.begin(); 
+    for (vector<snapid_t>::iterator p = q->second.begin();
 	 p != q->second.end();
 	 ++p) {
       if (*p > pi->get_snap_seq() ||
@@ -2034,15 +2295,16 @@ bool OSDMonitor::preprocess_remove_snaps(MRemoveSnaps *m)
   }
 
  ignore:
-  m->put();
   return true;
 }
 
-bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m)
+bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
   dout(7) << "prepare_remove_snaps " << *m << dendl;
 
-  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin(); 
+  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
        p != m->snaps.end();
        ++p) {
     pg_pool_t& pi = osdmap.pools[p->first];
@@ -2064,8 +2326,6 @@ bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m)
       }
     }
   }
-
-  m->put();
   return true;
 }
 
@@ -2073,15 +2333,15 @@ bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m)
 // ---------------
 // map helpers
 
-void OSDMonitor::send_latest(PaxosServiceMessage *m, epoch_t start)
+void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
 {
-  dout(5) << "send_latest to " << m->get_orig_source_inst()
+  op->mark_osdmon_event(__func__);
+  dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
 	  << " start " << start << dendl;
   if (start == 0)
-    send_full(m);
+    send_full(op);
   else
-    send_incremental(m, start);
-  m->put();
+    send_incremental(op, start);
 }
 
 
@@ -2127,99 +2387,36 @@ MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
   return m;
 }
 
-void OSDMonitor::send_full(PaxosServiceMessage *m)
+void OSDMonitor::send_full(MonOpRequestRef op)
 {
-  dout(5) << "send_full to " << m->get_orig_source_inst() << dendl;
-  mon->send_reply(m, build_latest_full());
+  op->mark_osdmon_event(__func__);
+  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
+  mon->send_reply(op, build_latest_full());
 }
 
-/* TBH, I'm fairly certain these two functions could somehow be using a single
- * helper function to do the heavy lifting. As this is not our main focus right
- * now, I'm leaving it to the next near-future iteration over the services'
- * code. We should not forget it though.
- *
- * TODO: create a helper function and get rid of the duplicated code.
- */
-void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
+void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
 {
-  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
-	  << " to " << req->get_orig_source_inst()
-	  << dendl;
-
-  int osd = -1;
-  if (req->get_source().is_osd()) {
-    osd = req->get_source().num();
-    map<int,epoch_t>::iterator p = osd_epoch.find(osd);
-    if (p != osd_epoch.end()) {
-      if (first <= p->second) {
-	dout(10) << __func__ << " osd." << osd << " should already have epoch "
-		 << p->second << dendl;
-	first = p->second + 1;
-	if (first > osdmap.get_epoch())
-	  return;
-      }
-    }
-  }
-
-  if (first < get_first_committed()) {
-    first = get_first_committed();
-    bufferlist bl;
-    int err = get_version_full(first, bl);
-    assert(err == 0);
-    assert(bl.length());
-
-    dout(20) << "send_incremental starting with base full "
-	     << first << " " << bl.length() << " bytes" << dendl;
-
-    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
-    m->oldest_map = first;
-    m->newest_map = osdmap.get_epoch();
-    m->maps[first] = bl;
-    mon->send_reply(req, m);
+  op->mark_osdmon_event(__func__);
 
-    if (osd >= 0)
-      note_osd_has_epoch(osd, osdmap.get_epoch());
-    return;
-  }
-
-  // send some maps.  it may not be all of them, but it will get them
-  // started.
-  epoch_t last = MIN(first + g_conf->osd_map_message_max, osdmap.get_epoch());
-  MOSDMap *m = build_incremental(first, last);
-  m->oldest_map = get_first_committed();
-  m->newest_map = osdmap.get_epoch();
-  mon->send_reply(req, m);
-
-  if (osd >= 0)
-    note_osd_has_epoch(osd, last);
-}
-
-// FIXME: we assume the OSD actually receives this.  if the mon
-// session drops and they reconnect we may not share the same maps
-// with them again, which could cause a strange hang (perhaps stuck
-// 'waiting for osdmap' requests?).  this information should go in the
-// MonSession, but I think these functions need to be refactored in
-// terms of MonSession first for that to work.
-void OSDMonitor::note_osd_has_epoch(int osd, epoch_t epoch)
-{
-  dout(20) << __func__ << " osd." << osd << " epoch " << epoch << dendl;
-  map<int,epoch_t>::iterator p = osd_epoch.find(osd);
-  if (p != osd_epoch.end()) {
-    dout(20) << __func__ << " osd." << osd << " epoch " << epoch
-	     << " (was " << p->second << ")" << dendl;
-    p->second = epoch;
-  } else {
-    dout(20) << __func__ << " osd." << osd << " epoch " << epoch << dendl;
-    osd_epoch[osd] = epoch;
-  }
+  MonSession *s = op->get_session();
+  assert(s);
+  send_incremental(first, s, false, op);
 }
 
-void OSDMonitor::send_incremental(epoch_t first, MonSession *session,
-				  bool onetime)
+void OSDMonitor::send_incremental(epoch_t first,
+				  MonSession *session,
+				  bool onetime,
+				  MonOpRequestRef req)
 {
   dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
 	  << " to " << session->inst << dendl;
 
+  if (first <= session->osd_epoch) {
+    dout(10) << __func__ << session->inst << " should already have epoch "
+	     << session->osd_epoch << dendl;
+    first = session->osd_epoch + 1;
+  }
+
   if (first < get_first_committed()) {
     first = get_first_committed();
     bufferlist bl;
@@ -2234,20 +2431,34 @@ void OSDMonitor::send_incremental(epoch_t first, MonSession *session,
     m->oldest_map = first;
     m->newest_map = osdmap.get_epoch();
     m->maps[first] = bl;
-    session->con->send_message(m);
+
+    if (req) {
+      mon->send_reply(req, m);
+      session->osd_epoch = first;
+      return;
+    } else {
+      session->con->send_message(m);
+      session->osd_epoch = first;
+    }
     first++;
   }
 
   while (first <= osdmap.get_epoch()) {
     epoch_t last = MIN(first + g_conf->osd_map_message_max, osdmap.get_epoch());
     MOSDMap *m = build_incremental(first, last);
-    session->con->send_message(m);
-    first = last + 1;
-
-    if (session->inst.name.is_osd())
-      note_osd_has_epoch(session->inst.name.num(), last);
 
-    if (onetime)
+    if (req) {
+      // send some maps.  it may not be all of them, but it will get them
+      // started.
+      m->oldest_map = get_first_committed();
+      m->newest_map = osdmap.get_epoch();
+      mon->send_reply(req, m);
+    } else {
+      session->con->send_message(m);
+      first = last + 1;
+    }
+    session->osd_epoch = last;
+    if (onetime || req)
       break;
   }
 }
@@ -2276,8 +2487,6 @@ int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
     return ret;
 }
 
-
-
 epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
 {
   dout(10) << "blacklist " << a << " until " << until << dendl;
@@ -2400,7 +2609,7 @@ void OSDMonitor::tick()
 	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];
 
 	  do_propose = true;
-	
+
 	  mon->clog->info() << "osd." << o << " out (down for " << down << ")\n";
 	} else
 	  continue;
@@ -2450,7 +2659,7 @@ void OSDMonitor::tick()
       for (ps_t ps = 0; ps < numps; ++ps) {
 	pg_t pgid = pg_t(pg_t::TYPE_REPLICATED, ps, pool, -1);
 	vector<int> osds;
-	osdmap.pg_to_osds(pgid, osds); 
+	osdmap.pg_to_osds(pgid, osds);
 	if (osds[0] == 0) {
 	  pending_inc.new_pg_swap_primary[pgid] = osds[1];
 	  dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to "
@@ -2664,8 +2873,37 @@ void OSDMonitor::dump_info(Formatter *f)
   f->close_section();
 }
 
-bool OSDMonitor::preprocess_command(MMonCommand *m)
+namespace {
+  enum osd_pool_get_choices {
+    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
+    PG_NUM, PGP_NUM, CRUSH_RULESET, HASHPSPOOL,
+    NODELETE, NOPGCHANGE, NOSIZECHANGE,
+    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
+    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
+    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+    CACHE_TARGET_FULL_RATIO,
+    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
+    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ};
+
+  std::set<osd_pool_get_choices>
+    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
+				const std::set<osd_pool_get_choices>& second)
+    {
+      std::set<osd_pool_get_choices> result;
+      std::set_difference(first.begin(), first.end(),
+			  second.begin(), second.end(),
+			  std::inserter(result, result.end()));
+      return result;
+    }
+}
+
+
+bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = 0;
   bufferlist rdata;
   stringstream ss, ds;
@@ -2673,13 +2911,13 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
   map<string, cmd_vartype> cmdmap;
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, get_last_committed());
     return true;
   }
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", rdata, get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
     return true;
   }
 
@@ -2762,7 +3000,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
 	f->flush(ds);
       } else {
 	p->print(ds);
-      } 
+      }
       rdata.append(ds);
       if (!f)
 	ds << " ";
@@ -2786,17 +3024,17 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
 	    ds << i;
 	  }
 	}
-      } 
+      }
       rdata.append(ds);
     } else if (prefix == "osd tree") {
       if (f) {
 	f->open_object_section("tree");
-	p->print_tree(NULL, f.get());
+	p->print_tree(f.get(), NULL);
 	f->close_section();
 	f->flush(ds);
       } else {
-	p->print_tree(&ds, NULL);
-      } 
+	p->print_tree(NULL, &ds);
+      }
       rdata.append(ds);
     } else if (prefix == "osd getmap") {
       rdata.append(osdmap_bl);
@@ -2850,14 +3088,15 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
     f->close_section();
     f->flush(rdata);
   } else if (prefix == "osd metadata") {
-    int64_t osd;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
+    int64_t osd = -1;
+    if (cmd_vartype_stringify(cmdmap["id"]).size() &&
+        !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
       ss << "unable to parse osd id value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       r = -EINVAL;
       goto reply;
     }
-    if (!osdmap.exists(osd)) {
+    if (osd >= 0 && !osdmap.exists(osd)) {
       ss << "osd." << osd << " does not exist";
       r = -ENOENT;
       goto reply;
@@ -2865,11 +3104,27 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
     string format;
     cmd_getval(g_ceph_context, cmdmap, "format", format);
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
-    f->open_object_section("osd_metadata");
-    r = dump_osd_metadata(osd, f.get(), &ss);
-    if (r < 0)
-      goto reply;
-    f->close_section();
+    if (osd >= 0) {
+      f->open_object_section("osd_metadata");
+      f->dump_unsigned("id", osd);
+      r = dump_osd_metadata(osd, f.get(), &ss);
+      if (r < 0)
+        goto reply;
+      f->close_section();
+    } else {
+      f->open_array_section("osd_metadata");
+      for (int i=0; i<osdmap.get_max_osd(); ++i) {
+        if (osdmap.exists(i)) {
+          f->open_object_section("osd");
+          f->dump_unsigned("id", i);
+          r = dump_osd_metadata(i, f.get(), NULL);
+          if (r < 0)
+            goto reply;
+          f->close_section();
+        }
+      }
+      f->close_section();
+    }
     f->flush(rdata);
   } else if (prefix == "osd map") {
     string poolstr, objstr, namespacestr;
@@ -3081,145 +3336,319 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
     string var;
     cmd_getval(g_ceph_context, cmdmap, "var", var);
 
-    if (!p->is_tier() &&
-        (var == "hit_set_type" || var == "hit_set_period" ||
-         var == "hit_set_count" || var == "hit_set_fpp" ||
-         var == "target_max_objects" || var == "target_max_bytes" ||
-         var == "cache_target_full_ratio" ||
-         var == "cache_target_dirty_ratio" ||
-         var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
-      ss << "pool '" << poolstr
-         << "' is not a tier pool: variable not applicable";
-      r = -EACCES;
-      goto reply;
-    }
+    typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
+    const choices_map_t ALL_CHOICES = boost::assign::map_list_of
+      ("size", SIZE)
+      ("min_size", MIN_SIZE)
+      ("crash_replay_interval", CRASH_REPLAY_INTERVAL)
+      ("pg_num", PG_NUM)("pgp_num", PGP_NUM)("crush_ruleset", CRUSH_RULESET)
+      ("hashpspool", HASHPSPOOL)("nodelete", NODELETE)
+      ("nopgchange", NOPGCHANGE)("nosizechange", NOSIZECHANGE)
+      ("noscrub", NOSCRUB)("nodeep-scrub", NODEEP_SCRUB)
+      ("write_fadvise_dontneed", WRITE_FADVISE_DONTNEED)
+      ("hit_set_type", HIT_SET_TYPE)("hit_set_period", HIT_SET_PERIOD)
+      ("hit_set_count", HIT_SET_COUNT)("hit_set_fpp", HIT_SET_FPP)
+      ("use_gmt_hitset", USE_GMT_HITSET)
+      ("auid", AUID)("target_max_objects", TARGET_MAX_OBJECTS)
+      ("target_max_bytes", TARGET_MAX_BYTES)
+      ("cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO)
+      ("cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO)
+      ("cache_target_full_ratio", CACHE_TARGET_FULL_RATIO)
+      ("cache_min_flush_age", CACHE_MIN_FLUSH_AGE)
+      ("cache_min_evict_age", CACHE_MIN_EVICT_AGE)
+      ("erasure_code_profile", ERASURE_CODE_PROFILE)
+      ("min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE)
+      ("min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE)
+      ("fast_read", FAST_READ);
+
+    typedef std::set<osd_pool_get_choices> choices_set_t;
+
+    const choices_set_t ONLY_TIER_CHOICES = boost::assign::list_of
+      (HIT_SET_TYPE)(HIT_SET_PERIOD)(HIT_SET_COUNT)(HIT_SET_FPP)
+      (TARGET_MAX_OBJECTS)(TARGET_MAX_BYTES)(CACHE_TARGET_FULL_RATIO)
+      (CACHE_TARGET_DIRTY_RATIO)(CACHE_TARGET_DIRTY_HIGH_RATIO)(CACHE_MIN_FLUSH_AGE)
+      (CACHE_MIN_EVICT_AGE)(MIN_READ_RECENCY_FOR_PROMOTE);
+
+    const choices_set_t ONLY_ERASURE_CHOICES = boost::assign::list_of
+      (ERASURE_CODE_PROFILE);
+
+    choices_set_t selected_choices;
+    if (var == "all") {
+      for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
+	  it != ALL_CHOICES.end(); ++it) {
+	selected_choices.insert(it->second);
+      }
 
-    if (!p->is_erasure() && var == "erasure_code_profile") {
-      ss << "pool '" << poolstr
-         << "' is not a erasure pool: variable not applicable";
-      r = -EACCES;
-      goto reply;
+      if(!p->is_tier()) {
+	selected_choices = subtract_second_from_first(selected_choices,
+						      ONLY_TIER_CHOICES);
+      }
+
+      if(!p->is_erasure()) {
+	selected_choices = subtract_second_from_first(selected_choices,
+						      ONLY_ERASURE_CHOICES);
+      }
+    } else /* var != "all" */  {
+      choices_map_t::const_iterator found = ALL_CHOICES.find(var);
+      osd_pool_get_choices selected = found->second;
+
+      if (!p->is_tier() &&
+	  ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
+	ss << "pool '" << poolstr
+	   << "' is not a tier pool: variable not applicable";
+	r = -EACCES;
+	goto reply;
+      }
+
+      if (!p->is_erasure() &&
+	  ONLY_ERASURE_CHOICES.find(selected)
+	  != ONLY_ERASURE_CHOICES.end()) {
+	ss << "pool '" << poolstr
+	   << "' is not a erasure pool: variable not applicable";
+	r = -EACCES;
+	goto reply;
+      }
+
+      selected_choices.insert(selected);
     }
 
     if (f) {
-      f->open_object_section("pool");
-      f->dump_string("pool", poolstr);
-      f->dump_int("pool_id", pool);
-
-      if (var == "pg_num") {
-        f->dump_int("pg_num", p->get_pg_num());
-      } else if (var == "pgp_num") {
-        f->dump_int("pgp_num", p->get_pgp_num());
-      } else if (var == "auid") {
-        f->dump_int("auid", p->get_auid());
-      } else if (var == "size") {
-        f->dump_int("size", p->get_size());
-      } else if (var == "min_size") {
-        f->dump_int("min_size", p->get_min_size());
-      } else if (var == "crash_replay_interval") {
-        f->dump_int("crash_replay_interval", p->get_crash_replay_interval());
-      } else if (var == "crush_ruleset") {
-        f->dump_int("crush_ruleset", p->get_crush_ruleset());
-      } else if (var == "hit_set_period") {
-	f->dump_int("hit_set_period", p->hit_set_period);
-      } else if (var == "hit_set_count") {
-	f->dump_int("hit_set_count", p->hit_set_count);
-      } else if (var == "hit_set_type") {
-	f->dump_string("hit_set_type", HitSet::get_type_name(p->hit_set_params.get_type()));
-      } else if (var == "hit_set_fpp") {
-	if (p->hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
-	  f->close_section();
-	  ss << "hit set is no of type Bloom; invalid to get a false positive rate!";
-	  r = -EINVAL;
-	  goto reply;
-	} else {
-	  BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
-	  f->dump_float("hit_set_fpp", bloomp->get_fpp());
+      for(choices_set_t::const_iterator it = selected_choices.begin();
+	  it != selected_choices.end(); ++it) {
+	choices_map_t::const_iterator i;
+	f->open_object_section("pool");
+	f->dump_string("pool", poolstr);
+	f->dump_int("pool_id", pool);
+	switch(*it) {
+	  case PG_NUM:
+	    f->dump_int("pg_num", p->get_pg_num());
+	    break;
+	  case PGP_NUM:
+	    f->dump_int("pgp_num", p->get_pgp_num());
+	    break;
+	  case AUID:
+	    f->dump_int("auid", p->get_auid());
+	    break;
+	  case SIZE:
+	    f->dump_int("size", p->get_size());
+	    break;
+	  case MIN_SIZE:
+	    f->dump_int("min_size", p->get_min_size());
+	    break;
+	  case CRASH_REPLAY_INTERVAL:
+	    f->dump_int("crash_replay_interval",
+			p->get_crash_replay_interval());
+	    break;
+	  case CRUSH_RULESET:
+	    f->dump_int("crush_ruleset", p->get_crush_ruleset());
+	    break;
+	  case HASHPSPOOL:
+	  case NODELETE:
+	  case NOPGCHANGE:
+	  case NOSIZECHANGE:
+	  case WRITE_FADVISE_DONTNEED:
+	  case NOSCRUB:
+	  case NODEEP_SCRUB:
+	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+	      if (i->second == *it)
+		break;
+	    }
+	    assert(i != ALL_CHOICES.end());
+	    f->dump_string(i->first.c_str(),
+			   p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
+			   "true" : "false");
+	    break;
+	  case HIT_SET_PERIOD:
+	    f->dump_int("hit_set_period", p->hit_set_period);
+	    break;
+	  case HIT_SET_COUNT:
+	    f->dump_int("hit_set_count", p->hit_set_count);
+	    break;
+	  case HIT_SET_TYPE:
+	    f->dump_string("hit_set_type",
+			   HitSet::get_type_name(p->hit_set_params.get_type()));
+	    break;
+	  case HIT_SET_FPP:
+	    {
+	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+		BloomHitSet::Params *bloomp =
+		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+		f->dump_float("hit_set_fpp", bloomp->get_fpp());
+	      } else if(var != "all") {
+		f->close_section();
+		ss << "hit set is not of type Bloom; " <<
+		  "invalid to get a false positive rate!";
+		r = -EINVAL;
+		goto reply;
+	      }
+	    }
+	    break;
+	  case USE_GMT_HITSET:
+	    f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
+	    break;
+	  case TARGET_MAX_OBJECTS:
+	    f->dump_unsigned("target_max_objects", p->target_max_objects);
+	    break;
+	  case TARGET_MAX_BYTES:
+	    f->dump_unsigned("target_max_bytes", p->target_max_bytes);
+	    break;
+	  case CACHE_TARGET_DIRTY_RATIO:
+	    f->dump_unsigned("cache_target_dirty_ratio_micro",
+			     p->cache_target_dirty_ratio_micro);
+	    f->dump_float("cache_target_dirty_ratio",
+			  ((float)p->cache_target_dirty_ratio_micro/1000000));
+	    break;
+	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
+	    f->dump_unsigned("cache_target_dirty_high_ratio_micro",
+			     p->cache_target_dirty_high_ratio_micro);
+	    f->dump_float("cache_target_dirty_high_ratio",
+			  ((float)p->cache_target_dirty_high_ratio_micro/1000000));
+	    break;
+	  case CACHE_TARGET_FULL_RATIO:
+	    f->dump_unsigned("cache_target_full_ratio_micro",
+			     p->cache_target_full_ratio_micro);
+	    f->dump_float("cache_target_full_ratio",
+			  ((float)p->cache_target_full_ratio_micro/1000000));
+	    break;
+	  case CACHE_MIN_FLUSH_AGE:
+	    f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
+	    break;
+	  case CACHE_MIN_EVICT_AGE:
+	    f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
+	    break;
+	  case ERASURE_CODE_PROFILE:
+	    f->dump_string("erasure_code_profile", p->erasure_code_profile);
+	    break;
+	  case MIN_READ_RECENCY_FOR_PROMOTE:
+	    f->dump_int("min_read_recency_for_promote",
+			p->min_read_recency_for_promote);
+	    break;
+	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
+	    f->dump_int("min_write_recency_for_promote",
+			p->min_write_recency_for_promote);
+	    break;
+          case FAST_READ:
+            f->dump_int("fast_read", p->fast_read);
+            break;
 	}
-      } else if (var == "target_max_objects") {
-        f->dump_unsigned("target_max_objects", p->target_max_objects);
-      } else if (var == "target_max_bytes") {
-        f->dump_unsigned("target_max_bytes", p->target_max_bytes);
-      } else if (var == "cache_target_dirty_ratio") {
-        f->dump_unsigned("cache_target_dirty_ratio_micro",
-                         p->cache_target_dirty_ratio_micro);
-        f->dump_float("cache_target_dirty_ratio",
-                      ((float)p->cache_target_dirty_ratio_micro/1000000));
-      } else if (var == "cache_target_full_ratio") {
-        f->dump_unsigned("cache_target_full_ratio_micro",
-                         p->cache_target_full_ratio_micro);
-        f->dump_float("cache_target_full_ratio",
-                      ((float)p->cache_target_full_ratio_micro/1000000));
-      } else if (var == "cache_min_flush_age") {
-        f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
-      } else if (var == "cache_min_evict_age") {
-        f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
-      } else if (var == "erasure_code_profile") {
-       f->dump_string("erasure_code_profile", p->erasure_code_profile);
-      } else if (var == "min_read_recency_for_promote") {
-	f->dump_int("min_read_recency_for_promote", p->min_read_recency_for_promote);
-      } else if (var == "write_fadvise_dontneed") {
-	f->dump_string("write_fadvise_dontneed", p->has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED) ? "true" : "false");
+	f->close_section();
+	f->flush(rdata);
       }
 
-      f->close_section();
-      f->flush(rdata);
-    } else {
-      if (var == "pg_num") {
-        ss << "pg_num: " << p->get_pg_num();
-      } else if (var == "pgp_num") {
-        ss << "pgp_num: " << p->get_pgp_num();
-      } else if (var == "auid") {
-        ss << "auid: " << p->get_auid();
-      } else if (var == "size") {
-        ss << "size: " << p->get_size();
-      } else if (var == "min_size") {
-        ss << "min_size: " << p->get_min_size();
-      } else if (var == "crash_replay_interval") {
-        ss << "crash_replay_interval: " << p->get_crash_replay_interval();
-      } else if (var == "crush_ruleset") {
-        ss << "crush_ruleset: " << p->get_crush_ruleset();
-      } else if (var == "hit_set_period") {
-	ss << "hit_set_period: " << p->hit_set_period;
-      } else if (var == "hit_set_count") {
-	ss << "hit_set_count: " << p->hit_set_count;
-      } else if (var == "hit_set_type") {
-	ss << "hit_set_type: " <<  HitSet::get_type_name(p->hit_set_params.get_type());
-      } else if (var == "hit_set_fpp") {
-	if (p->hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
-	  ss << "hit set is no of type Bloom; invalid to get a false positive rate!";
-	  r = -EINVAL;
-	  goto reply;
+    } else /* !f */ {
+      for(choices_set_t::const_iterator it = selected_choices.begin();
+	  it != selected_choices.end(); ++it) {
+	choices_map_t::const_iterator i;
+	switch(*it) {
+	  case PG_NUM:
+	    ss << "pg_num: " << p->get_pg_num() << "\n";
+	    break;
+	  case PGP_NUM:
+	    ss << "pgp_num: " << p->get_pgp_num() << "\n";
+	    break;
+	  case AUID:
+	    ss << "auid: " << p->get_auid() << "\n";
+	    break;
+	  case SIZE:
+	    ss << "size: " << p->get_size() << "\n";
+	    break;
+	  case MIN_SIZE:
+	    ss << "min_size: " << p->get_min_size() << "\n";
+	    break;
+	  case CRASH_REPLAY_INTERVAL:
+	    ss << "crash_replay_interval: " <<
+	      p->get_crash_replay_interval() << "\n";
+	    break;
+	  case CRUSH_RULESET:
+	    ss << "crush_ruleset: " << p->get_crush_ruleset() << "\n";
+	    break;
+	  case HIT_SET_PERIOD:
+	    ss << "hit_set_period: " << p->hit_set_period << "\n";
+	    break;
+	  case HIT_SET_COUNT:
+	    ss << "hit_set_count: " << p->hit_set_count << "\n";
+	    break;
+	  case HIT_SET_TYPE:
+	    ss << "hit_set_type: " <<
+	      HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
+	    break;
+	  case HIT_SET_FPP:
+	    {
+	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+		BloomHitSet::Params *bloomp =
+		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+		ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
+	      } else if(var != "all") {
+		ss << "hit set is not of type Bloom; " <<
+		  "invalid to get a false positive rate!";
+		r = -EINVAL;
+		goto reply;
+	      }
+	    }
+	    break;
+	  case USE_GMT_HITSET:
+	    ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
+	    break;
+	  case TARGET_MAX_OBJECTS:
+	    ss << "target_max_objects: " << p->target_max_objects << "\n";
+	    break;
+	  case TARGET_MAX_BYTES:
+	    ss << "target_max_bytes: " << p->target_max_bytes << "\n";
+	    break;
+	  case CACHE_TARGET_DIRTY_RATIO:
+	    ss << "cache_target_dirty_ratio: "
+	       << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
+	    break;
+	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
+	    ss << "cache_target_dirty_high_ratio: "
+	       << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
+	    break;
+	  case CACHE_TARGET_FULL_RATIO:
+	    ss << "cache_target_full_ratio: "
+	       << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
+	    break;
+	  case CACHE_MIN_FLUSH_AGE:
+	    ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
+	    break;
+	  case CACHE_MIN_EVICT_AGE:
+	    ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
+	    break;
+	  case ERASURE_CODE_PROFILE:
+	    ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
+	    break;
+	  case MIN_READ_RECENCY_FOR_PROMOTE:
+	    ss << "min_read_recency_for_promote: " <<
+	      p->min_read_recency_for_promote << "\n";
+	    break;
+	  case HASHPSPOOL:
+	  case NODELETE:
+	  case NOPGCHANGE:
+	  case NOSIZECHANGE:
+	  case WRITE_FADVISE_DONTNEED:
+	  case NOSCRUB:
+	  case NODEEP_SCRUB:
+	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+	      if (i->second == *it)
+		break;
+	    }
+	    assert(i != ALL_CHOICES.end());
+	    ss << i->first << ": " <<
+	      (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
+	       "true" : "false") << "\n";
+	    break;
+	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
+	    ss << "min_write_recency_for_promote: " <<
+	      p->min_write_recency_for_promote << "\n";
+	    break;
+          case FAST_READ:
+            ss << "fast_read: " << p->fast_read << "\n";
+            break;
 	}
-	BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
-	ss << "hit_set_fpp: " << bloomp->get_fpp();
-      } else if (var == "target_max_objects") {
-        ss << "target_max_objects: " << p->target_max_objects;
-      } else if (var == "target_max_bytes") {
-        ss << "target_max_bytes: " << p->target_max_bytes;
-      } else if (var == "cache_target_dirty_ratio") {
-        ss << "cache_target_dirty_ratio: "
-          << ((float)p->cache_target_dirty_ratio_micro/1000000);
-      } else if (var == "cache_target_full_ratio") {
-        ss << "cache_target_full_ratio: "
-          << ((float)p->cache_target_full_ratio_micro/1000000);
-      } else if (var == "cache_min_flush_age") {
-        ss << "cache_min_flush_age: " << p->cache_min_flush_age;
-      } else if (var == "cache_min_evict_age") {
-        ss << "cache_min_evict_age: " << p->cache_min_evict_age;
-      } else if (var == "erasure_code_profile") {
-       ss << "erasure_code_profile: " << p->erasure_code_profile;
-      } else if (var == "min_read_recency_for_promote") {
-	ss << "min_read_recency_for_promote: " << p->min_read_recency_for_promote;
-      } else if (var == "write_fadvise_dontneed") {
-	ss << "write_fadvise_dontneed: " <<  (p->has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED) ? "true" : "false");
+	rdata.append(ss.str());
+	ss.str("");
       }
-
-      rdata.append(ss);
-      ss.str("");
     }
     r = 0;
-
   } else if (prefix == "osd pool stats") {
     string pool_name;
     cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
@@ -3295,6 +3724,22 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
       if (!f && !rss.str().empty())
         tss << "  client io " << rss.str() << "\n";
 
+      // dump cache tier IO rate for cache pool
+      const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
+      if (pool->is_tier()) {
+        if (f) {
+          f->close_section();
+          f->open_object_section("cache_io_rate");
+        }
+
+        rss.clear();
+        rss.str("");
+
+        pg_map.pool_cache_io_rate_summary(f.get(), &rss, poolid);
+        if (!f && !rss.str().empty())
+          tss << "  cache tier io " << rss.str() << "\n";
+      }
+
       if (f) {
         f->close_section();
         f->close_section();
@@ -3476,7 +3921,7 @@ stats_out:
  reply:
   string rs;
   getline(ss, rs);
-  mon->reply_command(m, r, rs, rdata, get_last_committed());
+  mon->reply_command(op, r, rs, rdata, get_last_committed());
   return true;
 }
 
@@ -3566,8 +4011,8 @@ void OSDMonitor::get_pools_health(
 	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
     }
 
-    float warn_threshold = g_conf->mon_pool_quota_warn_threshold/100;
-    float crit_threshold = g_conf->mon_pool_quota_crit_threshold/100;
+    float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
+    float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
 
     if (pool.quota_max_objects > 0) {
       stringstream ss;
@@ -3626,8 +4071,10 @@ void OSDMonitor::get_pools_health(
 }
 
 
-int OSDMonitor::prepare_new_pool(MPoolOp *m)
+int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
   dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
   MonSession *session = m->get_session();
   if (!session)
@@ -3639,12 +4086,12 @@ int OSDMonitor::prepare_new_pool(MPoolOp *m)
     return prepare_new_pool(m->name, m->auid, m->crush_rule, ruleset_name,
 			    0, 0,
                             erasure_code_profile,
-			    pg_pool_t::TYPE_REPLICATED, 0, ss);
+			    pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
   else
     return prepare_new_pool(m->name, session->auid, m->crush_rule, ruleset_name,
 			    0, 0,
                             erasure_code_profile,
-			    pg_pool_t::TYPE_REPLICATED, 0, ss);
+			    pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
 }
 
 int OSDMonitor::crush_rename_bucket(const string& srcname,
@@ -3672,17 +4119,30 @@ int OSDMonitor::crush_rename_bucket(const string& srcname,
 			       ss);
   if (ret)
     return ret;
-  
+
   pending_inc.crush.clear();
   newcrush.encode(pending_inc.crush);
   *ss << "renamed bucket " << srcname << " into " << dstname;
   return 0;
 }
 
+int OSDMonitor::normalize_profile(ErasureCodeProfile &profile, ostream *ss)
+{
+  ErasureCodeInterfaceRef erasure_code;
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
+  int err = instance.factory(plugin->second,
+			     g_conf->erasure_code_dir,
+			     profile, &erasure_code, ss);
+  if (err)
+    return err;
+  return erasure_code->init(profile, ss);
+}
+
 int OSDMonitor::crush_ruleset_create_erasure(const string &name,
 					     const string &profile,
 					     int *ruleset,
-					     stringstream &ss)
+					     ostream *ss)
 {
   int ruleid = osdmap.crush->get_rule_id(name);
   if (ruleid != -ENOENT) {
@@ -3701,11 +4161,11 @@ int OSDMonitor::crush_ruleset_create_erasure(const string &name,
     ErasureCodeInterfaceRef erasure_code;
     int err = get_erasure_code(profile, &erasure_code, ss);
     if (err) {
-      ss << "failed to load plugin using profile " << profile << std::endl;
+      *ss << "failed to load plugin using profile " << profile << std::endl;
       return err;
     }
 
-    err = erasure_code->create_ruleset(name, newcrush, &ss);
+    err = erasure_code->create_ruleset(name, newcrush, ss);
     erasure_code.reset();
     if (err < 0)
       return err;
@@ -3718,22 +4178,24 @@ int OSDMonitor::crush_ruleset_create_erasure(const string &name,
 
 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
 				 ErasureCodeInterfaceRef *erasure_code,
-				 stringstream &ss) const
+				 ostream *ss) const
 {
   if (pending_inc.has_erasure_code_profile(erasure_code_profile))
     return -EAGAIN;
-  const map<string,string> &profile =
+  ErasureCodeProfile profile =
     osdmap.get_erasure_code_profile(erasure_code_profile);
-  map<string,string>::const_iterator plugin =
+  ErasureCodeProfile::const_iterator plugin =
     profile.find("plugin");
   if (plugin == profile.end()) {
-    ss << "cannot determine the erasure code plugin"
-       << " because there is no 'plugin' entry in the erasure_code_profile "
-       << profile << std::endl;
+    *ss << "cannot determine the erasure code plugin"
+	<< " because there is no 'plugin' entry in the erasure_code_profile "
+	<< profile << std::endl;
     return -EINVAL;
   }
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-  return instance.factory(plugin->second, profile, erasure_code, ss);
+  return instance.factory(plugin->second,
+			  g_conf->erasure_code_dir,
+			  profile, erasure_code, ss);
 }
 
 int OSDMonitor::check_cluster_features(uint64_t features,
@@ -3804,29 +4266,29 @@ bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
 
 bool OSDMonitor::erasure_code_profile_in_use(const map<int64_t, pg_pool_t> &pools,
 					     const string &profile,
-					     ostream &ss)
+					     ostream *ss)
 {
   bool found = false;
   for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
        p != pools.end();
        ++p) {
     if (p->second.erasure_code_profile == profile) {
-      ss << osdmap.pool_name[p->first] << " ";
+      *ss << osdmap.pool_name[p->first] << " ";
       found = true;
     }
   }
   if (found) {
-    ss << "pool(s) are using the erasure code profile '" << profile << "'";
+    *ss << "pool(s) are using the erasure code profile '" << profile << "'";
   }
   return found;
 }
 
 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
 					   map<string,string> *erasure_code_profile_map,
-					   stringstream &ss)
+					   ostream *ss)
 {
   int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
-		           ss,
+		           *ss,
 		           erasure_code_profile_map);
   if (r)
     return r;
@@ -3852,17 +4314,13 @@ int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_pr
   if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
     (*erasure_code_profile_map) = user_map;
 
-  if ((*erasure_code_profile_map).count("directory") == 0)
-    (*erasure_code_profile_map)["directory"] =
-      g_conf->osd_pool_default_erasure_code_directory;
-
   return 0;
 }
 
 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
 				  const string &erasure_code_profile,
 				  unsigned *size, unsigned *min_size,
-				  stringstream &ss)
+				  ostream *ss)
 {
   int err = 0;
   switch (pool_type) {
@@ -3881,7 +4339,7 @@ int OSDMonitor::prepare_pool_size(const unsigned pool_type,
     }
     break;
   default:
-    ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
+    *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
     err = -EINVAL;
     break;
   }
@@ -3891,7 +4349,7 @@ int OSDMonitor::prepare_pool_size(const unsigned pool_type,
 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
 					  const string &erasure_code_profile,
 					  uint32_t *stripe_width,
-					  stringstream &ss)
+					  ostream *ss)
 {
   int err = 0;
   switch (pool_type) {
@@ -3909,7 +4367,7 @@ int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
     }
     break;
   default:
-    ss << "prepare_pool_stripe_width: "
+    *ss << "prepare_pool_stripe_width: "
        << pool_type << " is not a known pool type";
     err = -EINVAL;
     break;
@@ -3921,7 +4379,7 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
 					   const string &erasure_code_profile,
 					   const string &ruleset_name,
 					   int *crush_ruleset,
-					   stringstream &ss)
+					   ostream *ss)
 {
 
   if (*crush_ruleset < 0) {
@@ -3933,8 +4391,9 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
 	  *crush_ruleset = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
 	  if (*crush_ruleset < 0) {
 	    // Errors may happen e.g. if no valid ruleset is available
-	    ss << "No suitable CRUSH ruleset exists";
-	    return *crush_ruleset;
+	    *ss << "No suitable CRUSH ruleset exists, check "
+                << "'osd pool default crush *' config options";
+	    return -ENOENT;
 	  }
 	} else {
 	  return get_crush_ruleset(ruleset_name, crush_ruleset, ss);
@@ -3950,6 +4409,7 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
 	case -EALREADY:
 	  dout(20) << "prepare_pool_crush_ruleset: ruleset "
 		   << ruleset_name << " try again" << dendl;
+	  // fall through
 	case 0:
 	  // need to wait for the crush rule to be proposed before proceeding
 	  err = -EAGAIN;
@@ -3962,14 +4422,14 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
       }
       break;
     default:
-      ss << "prepare_pool_crush_ruleset: " << pool_type
+      *ss << "prepare_pool_crush_ruleset: " << pool_type
 	 << " is not a known pool type";
       return -EINVAL;
       break;
     }
   } else {
     if (!osdmap.crush->ruleset_exists(*crush_ruleset)) {
-      ss << "CRUSH ruleset " << *crush_ruleset << " not found";
+      *ss << "CRUSH ruleset " << *crush_ruleset << " not found";
       return -ENOENT;
     }
   }
@@ -3979,7 +4439,7 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
 
 int OSDMonitor::get_crush_ruleset(const string &ruleset_name,
 				  int *crush_ruleset,
-				  stringstream &ss)
+				  ostream *ss)
 {
   int ret;
   ret = osdmap.crush->get_rule_id(ruleset_name);
@@ -3998,7 +4458,7 @@ int OSDMonitor::get_crush_ruleset(const string &ruleset_name,
       return -EAGAIN;
     } else {
       //Cannot find it , return error
-      ss << "specified ruleset " << ruleset_name << " doesn't exist";
+      *ss << "specified ruleset " << ruleset_name << " doesn't exist";
       return ret;
     }
   }
@@ -4013,8 +4473,9 @@ int OSDMonitor::get_crush_ruleset(const string &ruleset_name,
  * @param pg_num The pg_num to use. If set to 0, will use the system default
  * @param pgp_num The pgp_num to use. If set to 0, will use the system default
  * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
- * @param pool_type TYPE_ERASURE, TYPE_REP or TYPE_RAID4
+ * @param pool_type TYPE_ERASURE, or TYPE_REP
  * @param expected_num_objects expected number of objects on the pool
+ * @param fast_read fast read type. 
  * @param ss human readable error message, if any.
  *
  * @return 0 on success, negative errno on failure.
@@ -4026,32 +4487,63 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
 				 const string &erasure_code_profile,
                                  const unsigned pool_type,
                                  const uint64_t expected_num_objects,
-				 stringstream &ss)
+                                 FastReadType fast_read,
+				 ostream *ss)
 {
   if (name.length() == 0)
     return -EINVAL;
+  if (pg_num == 0)
+    pg_num = g_conf->osd_pool_default_pg_num;
+  if (pgp_num == 0)
+    pgp_num = g_conf->osd_pool_default_pgp_num;
+  if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
+    *ss << "'pg_num' must be greater than 0 and less than or equal to "
+        << g_conf->mon_max_pool_pg_num
+        << " (you may adjust 'mon max pool pg num' for higher values)";
+    return -ERANGE;
+  }
+  if (pgp_num > pg_num) {
+    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
+        << ", which in this case is " << pg_num;
+    return -ERANGE;
+  }
+  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
+    *ss << "'fast_read' can only apply to erasure coding pool";
+    return -EINVAL;
+  }
   int r;
   r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile,
 				 crush_ruleset_name, &crush_ruleset, ss);
-  if (r)
+  if (r) {
+    dout(10) << " prepare_pool_crush_ruleset returns " << r << dendl;
     return r;
+  }
   CrushWrapper newcrush;
   _get_pending_crush(newcrush);
-  CrushTester tester(newcrush, ss);
+  ostringstream err;
+  CrushTester tester(newcrush, err);
   r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
 				 osdmap.get_max_osd(),
 				 g_conf->mon_lease,
 				 crush_ruleset);
-  if (r)
+  if (r) {
+    dout(10) << " tester.test_with_crushtool returns " << r
+	     << ": " << err.str() << dendl;
+    *ss << "crushtool check failed with " << r << ": " << err.str();
     return r;
+  }
   unsigned size, min_size;
   r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
-  if (r)
+  if (r) {
+    dout(10) << " prepare_pool_size returns " << r << dendl;
     return r;
+  }
   uint32_t stripe_width = 0;
   r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
-  if (r)
+  if (r) {
+    dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
     return r;
+  }
 
   for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
        p != pending_inc.new_pool_names.end();
@@ -4075,20 +4567,44 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
     pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
   if (g_conf->osd_pool_default_flag_nosizechange)
     pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+  if (g_conf->osd_pool_use_gmt_hitset &&
+      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+    pi->use_gmt_hitset = true;
+  else
+    pi->use_gmt_hitset = false;
+
+  if (pool_type == pg_pool_t::TYPE_ERASURE) {
+    switch (fast_read) {
+      case FAST_READ_OFF:
+        pi->fast_read = false;
+        break;
+      case FAST_READ_ON:
+        pi->fast_read = true;
+        break;
+      case FAST_READ_DEFAULT:
+        pi->fast_read = g_conf->mon_osd_pool_ec_fast_read;
+        break;
+      default:
+        *ss << "invalid fast_read setting: " << fast_read;
+        return -EINVAL;
+    }
+  }
 
   pi->size = size;
   pi->min_size = min_size;
   pi->crush_ruleset = crush_ruleset;
   pi->expected_num_objects = expected_num_objects;
   pi->object_hash = CEPH_STR_HASH_RJENKINS;
-  pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
-  pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
+  pi->set_pg_num(pg_num);
+  pi->set_pgp_num(pgp_num);
   pi->last_change = pending_inc.epoch;
   pi->auid = auid;
   pi->erasure_code_profile = erasure_code_profile;
   pi->stripe_width = stripe_width;
   pi->cache_target_dirty_ratio_micro =
     g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
+  pi->cache_target_dirty_high_ratio_micro =
+    g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
   pi->cache_target_full_ratio_micro =
     g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
   pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
@@ -4097,26 +4613,28 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
   return 0;
 }
 
-bool OSDMonitor::prepare_set_flag(MMonCommand *m, int flag)
+bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
 {
+  op->mark_osdmon_event(__func__);
   ostringstream ss;
   if (pending_inc.new_flags < 0)
     pending_inc.new_flags = osdmap.get_flags();
   pending_inc.new_flags |= flag;
   ss << "set " << OSDMap::get_flag_string(flag);
-  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 						    get_last_committed() + 1));
   return true;
 }
 
-bool OSDMonitor::prepare_unset_flag(MMonCommand *m, int flag)
+bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
 {
+  op->mark_osdmon_event(__func__);
   ostringstream ss;
   if (pending_inc.new_flags < 0)
     pending_inc.new_flags = osdmap.get_flags();
   pending_inc.new_flags &= ~flag;
   ss << "unset " << OSDMap::get_flag_string(flag);
-  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 						    get_last_committed() + 1));
   return true;
 }
@@ -4215,6 +4733,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
        var == "hit_set_count" || var == "hit_set_fpp" ||
        var == "target_max_objects" || var == "target_max_bytes" ||
        var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
+       var == "cache_target_dirty_high_ratio" ||
        var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
     ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
     return -EACCES;
@@ -4259,11 +4778,11 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
        ErasureCodeInterfaceRef erasure_code;
        int k;
        stringstream tmp;
-       int err = get_erasure_code(p.erasure_code_profile, &erasure_code, tmp);
+       int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
        if (err == 0) {
 	 k = erasure_code->get_data_chunk_count();
        } else {
-	 ss << __func__ << " get_erasure_code failed: " << tmp;
+	 ss << __func__ << " get_erasure_code failed: " << tmp.rdbuf();
 	 return err;;
        }
 
@@ -4363,7 +4882,8 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
     }
     p.crush_ruleset = n;
   } else if (var == "hashpspool" || var == "nodelete" || var == "nopgchange" ||
-	     var == "nosizechange") {
+	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
+	     var == "noscrub" || var == "nodeep-scrub") {
     uint64_t flag = pg_pool_t::get_flag_by_name(var);
     // make sure we only compare against 'n' if we didn't receive a string
     if (val == "true" || (interr.empty() && n == 1)) {
@@ -4418,6 +4938,17 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
     }
     BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
     bloomp->set_fpp(f);
+  } else if (var == "use_gmt_hitset") {
+    if (val == "true" || (interr.empty() && n == 1)) {
+      if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
+	ss << "not all OSDs support GMT hit set.";
+	return -EINVAL;
+      }
+      p.use_gmt_hitset = true;
+    } else {
+      ss << "expecting value 'true' or '1'";
+      return -EINVAL;
+    }
   } else if (var == "debug_fake_ec_pool") {
     if (val == "true" || (interr.empty() && n == 1)) {
       p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL;
@@ -4444,6 +4975,16 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       return -ERANGE;
     }
     p.cache_target_dirty_ratio_micro = uf;
+  } else if (var == "cache_target_dirty_high_ratio") {
+    if (floaterr.length()) {
+      ss << "error parsing float '" << val << "': " << floaterr;
+      return -EINVAL;
+    }
+    if (f < 0 || f > 1.0) {
+      ss << "value must be in the range 0..1";
+      return -ERANGE;
+    }
+    p.cache_target_dirty_high_ratio_micro = uf;
   } else if (var == "cache_target_full_ratio") {
     if (floaterr.length()) {
       ss << "error parsing float '" << val << "': " << floaterr;
@@ -4472,14 +5013,21 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       return -EINVAL;
     }
     p.min_read_recency_for_promote = n;
-  } else if (var == "write_fadvise_dontneed") {
+  } else if (var == "min_write_recency_for_promote") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.min_write_recency_for_promote = n;
+  } else if (var == "fast_read") {
     if (val == "true" || (interr.empty() && n == 1)) {
-      p.flags |= pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED;
+      if (p.is_replicated()) {
+        ss << "fast read is not supported in replication pool";
+        return -EINVAL;
+      }
+      p.fast_read = true;
     } else if (val == "false" || (interr.empty() && n == 0)) {
-      p.flags &= ~pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED;
-    } else {
-      ss << "expecting value 'true', 'false', '0', or '1'";
-      return -EINVAL;
+      p.fast_read = false;
     }
   } else {
     ss << "unrecognized variable '" << var << "'";
@@ -4491,28 +5039,32 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
   return 0;
 }
 
-bool OSDMonitor::prepare_command(MMonCommand *m)
+bool OSDMonitor::prepare_command(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   stringstream ss;
   map<string, cmd_vartype> cmdmap;
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, get_last_committed());
     return true;
   }
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
     return true;
   }
 
-  return prepare_command_impl(m, cmdmap);
+  return prepare_command_impl(op, cmdmap);
 }
 
-bool OSDMonitor::prepare_command_impl(MMonCommand *m,
+bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 				      map<string,cmd_vartype> &cmdmap)
 {
+  op->mark_osdmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   bool ret = false;
   stringstream ss;
   string rs;
@@ -4588,23 +5140,19 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     // sanity check: test some inputs to make sure this map isn't totally broken
     dout(10) << " testing map" << dendl;
     stringstream ess;
+    CrushTester tester(crush, ess);
     // XXX: Use mon_lease as a timeout value for crushtool.
     // If the crushtool consistently takes longer than 'mon_lease' seconds,
     // then we would consistently trigger an election before the command
     // finishes, having a flapping monitor unable to hold quorum.
-    CrushTester tester(crush, ess);
-    int r = tester.test_with_crushtool(g_conf->crushtool,
+    int r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
 				       osdmap.get_max_osd(),
 				       g_conf->mon_lease);
     if (r < 0) {
-      if (r == -EINTR) {
-	ss << "(note: crushtool tests not run because they took too long) ";
-      } else {
-	derr << "error on crush map: " << ess.str() << dendl;
-	ss << "Failed crushmap test: " << ess.str();
-	err = r;
-	goto reply;
-      }
+      derr << "error on crush map: " << ess.str() << dendl;
+      ss << "Failed crushmap test: " << ess.str();
+      err = r;
+      goto reply;
     }
 
     dout(10) << " result " << ess.str() << dendl;
@@ -4740,7 +5288,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     ss << action << " item id " << osdid << " name '" << name << "' weight "
       << weight << " at location " << loc << " to crush map";
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						      get_last_committed() + 1));
     return true;
 
@@ -4785,7 +5333,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	ss << "create-or-move updating item name '" << name << "' weight " << weight
 	   << " at location " << loc << " to crush map";
 	getline(ss, rs);
-	wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						  get_last_committed() + 1));
 	return true;
       }
@@ -4820,7 +5368,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	  pending_inc.crush.clear();
 	  newcrush.encode(pending_inc.crush);
 	  getline(ss, rs);
-	  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+	  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						   get_last_committed() + 1));
 	  return true;
 	}
@@ -4882,7 +5430,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	err = 0;
       }
     }
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, err, ss.str(),
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd crush rm" ||
@@ -4905,7 +5453,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	err = 0;
 	ss << "device '" << name << "' does not appear in the crush map";
 	getline(ss, rs);
-	wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						  get_last_committed() + 1));
 	return true;
       }
@@ -4935,7 +5483,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	newcrush.encode(pending_inc.crush);
 	ss << "removed item id " << id << " name '" << name << "' from crush map";
 	getline(ss, rs);
-	wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						  get_last_committed() + 1));
 	return true;
       }
@@ -4951,7 +5499,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     newcrush.encode(pending_inc.crush);
     ss << "reweighted crush hierarchy";
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						  get_last_committed() + 1));
     return true;
   } else if (prefix == "osd crush reweight") {
@@ -4989,7 +5537,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     ss << "reweighted item id " << id << " name '" << name << "' to " << w
        << " in crush map";
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						  get_last_committed() + 1));
     return true;
   } else if (prefix == "osd crush reweight-subtree") {
@@ -5027,7 +5575,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
        << " in crush map";
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd crush tunables") {
@@ -5064,7 +5612,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     newcrush.encode(pending_inc.crush);
     ss << "adjusted tunables profile to " << profile;
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd crush set-tunable") {
@@ -5104,7 +5652,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     newcrush.encode(pending_inc.crush);
     ss << "adjusted tunable " << tunable << " to " << value;
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
 
@@ -5145,7 +5693,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       newcrush.encode(pending_inc.crush);
     }
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
 
@@ -5153,10 +5701,10 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     string name;
     cmd_getval(g_ceph_context, cmdmap, "name", name);
 
-    if (erasure_code_profile_in_use(pending_inc.new_pools, name, ss))
+    if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
       goto wait;
 
-    if (erasure_code_profile_in_use(osdmap.pools, name, ss)) {
+    if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
       err = -EBUSY;
       goto reply;
     }
@@ -5171,7 +5719,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       }
 
       getline(ss, rs);
-      wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 							get_last_committed() + 1));
       return true;
     } else {
@@ -5193,7 +5741,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       force = false;
     }
     map<string,string> profile_map;
-    err = parse_erasure_code_profile(profile, &profile_map, ss);
+    err = parse_erasure_code_profile(profile, &profile_map, &ss);
     if (err)
       goto reply;
     if (profile_map.find("plugin") == profile_map.end()) {
@@ -5204,22 +5752,6 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     }
     string plugin = profile_map["plugin"];
 
-    if (osdmap.has_erasure_code_profile(name)) {
-      if (osdmap.get_erasure_code_profile(name) == profile_map) {
-	err = 0;
-	goto reply;
-      }
-      if (!force) {
-	err = -EPERM;
-	ss << "will not override erasure code profile " << name
-	   << " because the existing profile "
-	   << osdmap.get_erasure_code_profile(name)
-	   << " is different from the proposed profile "
-	   << profile_map;
-	goto reply;
-      }
-    }
-
     if (pending_inc.has_erasure_code_profile(name)) {
       dout(20) << "erasure code profile " << name << " try again" << dendl;
       goto wait;
@@ -5231,17 +5763,45 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	if (err)
 	  goto reply;
       } else if (plugin == "shec") {
-	if (!g_ceph_context->check_experimental_feature_enabled("shec", &ss)) {
-	  err = -EINVAL;
+	err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
+	if (err == -EAGAIN)
+	  goto wait;
+	if (err)
+	  goto reply;
+      }
+      err = normalize_profile(profile_map, &ss);
+      if (err)
+	goto reply;
+
+      if (osdmap.has_erasure_code_profile(name)) {
+	ErasureCodeProfile existing_profile_map =
+	  osdmap.get_erasure_code_profile(name);
+	err = normalize_profile(existing_profile_map, &ss);
+	if (err)
+	  goto reply;
+
+	if (existing_profile_map == profile_map) {
+	  err = 0;
+	  goto reply;
+	}
+	if (!force) {
+	  err = -EPERM;
+	  ss << "will not override erasure code profile " << name
+	     << " because the existing profile "
+	     << existing_profile_map
+	     << " is different from the proposed profile "
+	     << profile_map;
 	  goto reply;
 	}
       }
-      dout(20) << "erasure code profile " << name << " set" << dendl;
+
+      dout(20) << "erasure code profile set " << name << "="
+	       << profile_map << dendl;
       pending_inc.set_erasure_code_profile(name, profile_map);
     }
 
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                       get_last_committed() + 1));
     return true;
 
@@ -5270,14 +5830,18 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 						      &ss);
 	if (err)
 	  goto reply;
-	dout(20) << "erasure code profile " << profile << " set" << dendl;
+	err = normalize_profile(profile_map, &ss);
+	if (err)
+	  goto reply;
+	dout(20) << "erasure code profile set " << profile << "="
+		 << profile_map << dendl;
 	pending_inc.set_erasure_code_profile(profile, profile_map);
 	goto wait;
       }
     }
 
     int ruleset;
-    err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
+    err = crush_ruleset_create_erasure(name, profile, &ruleset, &ss);
     if (err < 0) {
       switch(err) {
       case -EEXIST: // return immediately
@@ -5298,7 +5862,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     }
 
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                       get_last_committed() + 1));
     return true;
 
@@ -5341,7 +5905,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       newcrush.encode(pending_inc.crush);
     }
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
 
@@ -5381,44 +5945,51 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     pending_inc.new_max_osd = newmax;
     ss << "set new max_osd = " << pending_inc.new_max_osd;
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
 
   } else if (prefix == "osd pause") {
-    return prepare_set_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
 
   } else if (prefix == "osd unpause") {
-    return prepare_unset_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
 
   } else if (prefix == "osd set") {
     string key;
     cmd_getval(g_ceph_context, cmdmap, "key", key);
     if (key == "full")
-      return prepare_set_flag(m, CEPH_OSDMAP_FULL);
+      return prepare_set_flag(op, CEPH_OSDMAP_FULL);
     else if (key == "pause")
-      return prepare_set_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
     else if (key == "noup")
-      return prepare_set_flag(m, CEPH_OSDMAP_NOUP);
+      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
     else if (key == "nodown")
-      return prepare_set_flag(m, CEPH_OSDMAP_NODOWN);
+      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
     else if (key == "noout")
-      return prepare_set_flag(m, CEPH_OSDMAP_NOOUT);
+      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
     else if (key == "noin")
-      return prepare_set_flag(m, CEPH_OSDMAP_NOIN);
+      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
     else if (key == "nobackfill")
-      return prepare_set_flag(m, CEPH_OSDMAP_NOBACKFILL);
+      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
     else if (key == "norebalance")
-      return prepare_set_flag(m, CEPH_OSDMAP_NOREBALANCE);
+      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
     else if (key == "norecover")
-      return prepare_set_flag(m, CEPH_OSDMAP_NORECOVER);
+      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
     else if (key == "noscrub")
-      return prepare_set_flag(m, CEPH_OSDMAP_NOSCRUB);
+      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
     else if (key == "nodeep-scrub")
-      return prepare_set_flag(m, CEPH_OSDMAP_NODEEP_SCRUB);
+      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
     else if (key == "notieragent")
-      return prepare_set_flag(m, CEPH_OSDMAP_NOTIERAGENT);
-    else {
+      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+    else if (key == "sortbitwise") {
+      if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
+	return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
+      } else {
+	ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
+	err = -EPERM;
+      }
+    } else {
       ss << "unrecognized flag '" << key << "'";
       err = -EINVAL;
     }
@@ -5427,29 +5998,31 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     string key;
     cmd_getval(g_ceph_context, cmdmap, "key", key);
     if (key == "full")
-      return prepare_unset_flag(m, CEPH_OSDMAP_FULL);
+      return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
     else if (key == "pause")
-      return prepare_unset_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
     else if (key == "noup")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NOUP);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
     else if (key == "nodown")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NODOWN);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
     else if (key == "noout")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NOOUT);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
     else if (key == "noin")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NOIN);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
     else if (key == "nobackfill")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NOBACKFILL);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
     else if (key == "norebalance")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NOREBALANCE);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
     else if (key == "norecover")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NORECOVER);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
     else if (key == "noscrub")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NOSCRUB);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
     else if (key == "nodeep-scrub")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NODEEP_SCRUB);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
     else if (key == "notieragent")
-      return prepare_unset_flag(m, CEPH_OSDMAP_NOTIERAGENT);
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+    else if (key == "sortbitwise")
+      return prepare_unset_flag(op, CEPH_OSDMAP_SORTBITWISE);
     else {
       ss << "unrecognized flag '" << key << "'";
       err = -EINVAL;
@@ -5524,7 +6097,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     }
     if (any) {
       getline(ss, rs);
-      wait_for_finished_proposal(new Monitor::C_Command(mon, m, err, rs,
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
 						get_last_committed() + 1));
       return true;
     }
@@ -5663,7 +6236,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       pending_inc.new_primary_affinity[id] = ww;
       ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
       getline(ss, rs);
-      wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                 get_last_committed() + 1));
       return true;
     }
@@ -5692,7 +6265,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       pending_inc.new_weight[id] = ww;
       ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
       getline(ss, rs);
-      wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						get_last_committed() + 1));
       return true;
     }
@@ -5718,7 +6291,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       pending_inc.new_lost[id] = e;
       ss << "marked osd lost in epoch " << e;
       getline(ss, rs);
-      wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						get_last_committed() + 1));
       return true;
     }
@@ -5726,11 +6299,23 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
   } else if (prefix == "osd create") {
     int i = -1;
 
+    // optional id provided?
+    int64_t id = -1;
+    if (cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+      if (id < 0) {
+	ss << "invalid osd id value '" << id << "'";
+	err = -EINVAL;
+	goto reply;
+      }
+      dout(10) << " osd create got id " << id << dendl;
+    }
+
     // optional uuid provided?
     uuid_d uuid;
     string uuidstr;
     if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
       if (!uuid.parse(uuidstr.c_str())) {
+	ss << "invalid uuid value '" << uuidstr << "'";
 	err = -EINVAL;
 	goto reply;
       }
@@ -5738,6 +6323,11 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
       i = osdmap.identify_osd(uuid);
       if (i >= 0) {
 	// osd already exists
+	if (id >= 0 && i != id) {
+	  ss << "uuid " << uuidstr << " already in use for different id " << i;
+	  err = -EINVAL;
+	  goto reply;
+	}
 	err = 0;
 	if (f) {
 	  f->open_object_section("created_osd");
@@ -5750,12 +6340,32 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
 	}
 	goto reply;
       }
-      i = pending_inc.identify_osd(uuid);
-      if (i >= 0) {
+      // i < 0
+      if (id >= 0) {
+	if (osdmap.exists(id)) {
+	  ss << "id " << id << " already in use and does not match uuid "
+	     << uuid;
+	  err = -EINVAL;
+	  goto reply;
+	}
+	if (pending_inc.new_state.count(id)) {
+	  // osd is about to exist
+	  wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+	  return true;
+	}
+	i = id;
+      }
+      if (pending_inc.identify_osd(uuid) >= 0) {
 	// osd is about to exist
-	wait_for_finished_proposal(new C_RetryMessage(this, m));
+	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
 	return true;
       }
+      if (i >= 0) {
+	// raise max_osd
+	if (osdmap.get_max_osd() <= i && pending_inc.new_max_osd <= i)
+	  pending_inc.new_max_osd = i + 1;
+	goto done;
+      }
     }
 
     // allocate a new id
@@ -5788,7 +6398,7 @@ done:
       ss << i;
       rdata.append(ss);
     }
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, rdata,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, rdata,
 					      get_last_committed() + 1));
     return true;
 
@@ -5811,7 +6421,7 @@ done:
 	pending_inc.new_blacklist[addr] = expires;
 	ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
 	getline(ss, rs);
-	wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						  get_last_committed() + 1));
 	return true;
       } else if (blacklistop == "rm") {
@@ -5823,7 +6433,7 @@ done:
 	    pending_inc.new_blacklist.erase(addr);
 	  ss << "un-blacklisting " << addr;
 	  getline(ss, rs);
-	  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+	  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						    get_last_committed() + 1));
 	  return true;
 	}
@@ -5868,7 +6478,7 @@ done:
       ss << "created pool " << poolstr << " snap " << snapname;
     }
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd pool rmsnap") {
@@ -5908,28 +6518,14 @@ done:
       ss << "already removed pool " << poolstr << " snap " << snapname;
     }
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd pool create") {
     int64_t  pg_num;
     int64_t pgp_num;
     cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
-    if ((pg_num == 0) || (pg_num > g_conf->mon_max_pool_pg_num)) {
-      ss << "'pg_num' must be greater than 0 and less than or equal to "
-	 << g_conf->mon_max_pool_pg_num
-	 << " (you may adjust 'mon max pool pg num' for higher values)";
-      err = -ERANGE;
-      goto reply;
-    }
-
     cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
-    if ((pgp_num == 0) || (pgp_num > pg_num)) {
-      ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
-	 << ", which in this case is " << pg_num;
-      err = -ERANGE;
-      goto reply;
-    }
 
     string pool_type_str;
     cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
@@ -6014,9 +6610,9 @@ done:
 
     if (!implicit_ruleset_creation && ruleset_name != "") {
       int ruleset;
-      err = get_crush_ruleset(ruleset_name, &ruleset, ss);
+      err = get_crush_ruleset(ruleset_name, &ruleset, &ss);
       if (err == -EAGAIN) {
-	wait_for_finished_proposal(new C_RetryMessage(this, m));
+	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
 	return true;
       }
       if (err)
@@ -6030,21 +6626,33 @@ done:
       err = -EINVAL;
       goto reply;
     }
+
+    int64_t fast_read_param;
+    cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
+    FastReadType fast_read = FAST_READ_DEFAULT;
+    if (fast_read_param == 0)
+      fast_read = FAST_READ_OFF;
+    else if (fast_read_param > 0)
+      fast_read = FAST_READ_ON;
+    
     err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
 			   -1, // default crush rule
 			   ruleset_name,
 			   pg_num, pgp_num,
 			   erasure_code_profile, pool_type,
                            (uint64_t)expected_num_objects,
-			   ss);
+                           fast_read,
+			   &ss);
     if (err < 0) {
       switch(err) {
       case -EEXIST:
 	ss << "pool '" << poolstr << "' already exists";
 	break;
       case -EAGAIN:
-	wait_for_finished_proposal(new C_RetryMessage(this, m));
+	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
 	return true;
+      case -ERANGE:
+        goto reply;
       default:
 	goto reply;
 	break;
@@ -6053,7 +6661,7 @@ done:
       ss << "pool '" << poolstr << "' created";
     }
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
 
@@ -6079,7 +6687,7 @@ done:
     }
     err = _prepare_remove_pool(pool, &ss);
     if (err == -EAGAIN) {
-      wait_for_finished_proposal(new C_RetryMessage(this, m));
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
       return true;
     }
     if (err < 0)
@@ -6123,7 +6731,7 @@ done:
         << cpp_strerror(ret);
     }
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, ret, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
 					      get_last_committed() + 1));
     return true;
 
@@ -6135,7 +6743,7 @@ done:
       goto reply;
 
     getline(ss, rs);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						   get_last_committed() + 1));
     return true;
   } else if (prefix == "osd tier add") {
@@ -6197,14 +6805,14 @@ done:
     pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
     pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
     if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
-      wait_for_finished_proposal(new C_RetryMessage(this, m));
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
       return true;
     }
     np->tiers.insert(tierpool_id);
     np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
     ntp->tier_of = pool_id;
     ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd tier remove") {
@@ -6229,7 +6837,7 @@ done:
     const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
     assert(tp);
 
-    if (!_check_remove_tier(pool_id, p, &err, &ss)) {
+    if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
       goto reply;
     }
 
@@ -6257,13 +6865,13 @@ done:
     if (np->tiers.count(tierpool_id) == 0 ||
 	ntp->tier_of != pool_id ||
 	np->read_tier == tierpool_id) {
-      wait_for_finished_proposal(new C_RetryMessage(this, m));
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
       return true;
     }
     np->tiers.erase(tierpool_id);
     ntp->clear_tier();
     ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd tier set-overlay") {
@@ -6290,6 +6898,8 @@ done:
     }
     const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
     assert(p);
+    const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
+    assert(overlay_p);
     if (p->tiers.count(overlaypool_id) == 0) {
       ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
       err = -EINVAL;
@@ -6314,7 +6924,9 @@ done:
     np->write_tier = overlaypool_id;
     np->last_force_op_resend = pending_inc.epoch;
     ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+    if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
+      ss <<" (WARNING: overlay pool cache_mode is still NONE)";
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd tier remove-overlay") {
@@ -6334,7 +6946,7 @@ done:
       goto reply;
     }
 
-    if (!_check_remove_tier(pool_id, p, &err, &ss)) {
+    if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
       goto reply;
     }
 
@@ -6344,7 +6956,7 @@ done:
     np->clear_write_tier();
     np->last_force_op_resend = pending_inc.epoch;
     ss << "there is now (or already was) no overlay for '" << poolstr << "'";
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd tier cache-mode") {
@@ -6460,7 +7072,14 @@ done:
     np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
     ss << "set cache-mode for pool '" << poolstr
 	<< "' to " << pg_pool_t::get_cache_mode_name(mode);
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+    if (mode == pg_pool_t::CACHEMODE_NONE) {
+      const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
+      assert(base_pool);
+      if (base_pool->read_tier == pool_id ||
+	  base_pool->write_tier == pool_id)
+	ss <<" (WARNING: pool is still configured as read or write tier)";
+    }
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd tier add-cache") {
@@ -6536,7 +7155,7 @@ done:
     pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
     pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
     if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
-      wait_for_finished_proposal(new C_RetryMessage(this, m));
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
       return true;
     }
     np->tiers.insert(tierpool_id);
@@ -6547,10 +7166,11 @@ done:
     ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
     ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
     ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
+    ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
     ntp->hit_set_params = hsp;
     ntp->target_max_bytes = size;
     ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
 					      get_last_committed() + 1));
     return true;
   } else if (prefix == "osd pool set-quota") {
@@ -6592,7 +7212,7 @@ done:
     }
     ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
     rs = ss.str();
-    wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					      get_last_committed() + 1));
     return true;
 
@@ -6608,7 +7228,7 @@ done:
     } else {
       ss << "SUCCESSFUL reweight-by-utilization: " << out_str;
       getline(ss, rs);
-      wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						get_last_committed() + 1));
       return true;
     }
@@ -6637,7 +7257,7 @@ done:
     } else {
       ss << "SUCCESSFUL reweight-by-pg: " << out_str;
       getline(ss, rs);
-      wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 						get_last_committed() + 1));
       return true;
     }
@@ -6657,28 +7277,30 @@ done:
   getline(ss, rs);
   if (err < 0 && rs.length() == 0)
     rs = cpp_strerror(err);
-  mon->reply_command(m, err, rs, rdata, get_last_committed());
+  mon->reply_command(op, err, rs, rdata, get_last_committed());
   return ret;
 
  update:
   getline(ss, rs);
-  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
 					    get_last_committed() + 1));
   return true;
 
  wait:
-  wait_for_finished_proposal(new C_RetryMessage(this, m));
+  wait_for_finished_proposal(op, new C_RetryMessage(this, op));
   return true;
 }
 
-bool OSDMonitor::preprocess_pool_op(MPoolOp *m) 
+bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op) 
 {
+  op->mark_osdmon_event(__func__);
+  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
   if (m->op == POOL_OP_CREATE)
-    return preprocess_pool_op_create(m);
+    return preprocess_pool_op_create(op);
 
   if (!osdmap.get_pg_pool(m->pool)) {
     dout(10) << "attempt to delete non-existent pool id " << m->pool << dendl;
-    _pool_op_reply(m, 0, osdmap.get_epoch());
+    _pool_op_reply(op, 0, osdmap.get_epoch());
     return true;
   }
 
@@ -6687,47 +7309,47 @@ bool OSDMonitor::preprocess_pool_op(MPoolOp *m)
   const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
   if (p->snap_exists(m->name.c_str()))
     snap_exists = true;
-  
+
   switch (m->op) {
   case POOL_OP_CREATE_SNAP:
     if (p->is_unmanaged_snaps_mode()) {
-      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
+      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
       return true;
     }
     if (snap_exists) {
-      _pool_op_reply(m, 0, osdmap.get_epoch());
+      _pool_op_reply(op, 0, osdmap.get_epoch());
       return true;
     }
     return false;
   case POOL_OP_CREATE_UNMANAGED_SNAP:
     if (p->is_pool_snaps_mode()) {
-      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
+      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
       return true;
     }
     return false;
   case POOL_OP_DELETE_SNAP:
     if (p->is_unmanaged_snaps_mode()) {
-      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
+      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
       return true;
     }
     if (!snap_exists) {
-      _pool_op_reply(m, 0, osdmap.get_epoch());
+      _pool_op_reply(op, 0, osdmap.get_epoch());
       return true;
     }
     return false;
   case POOL_OP_DELETE_UNMANAGED_SNAP:
     if (p->is_pool_snaps_mode()) {
-      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
+      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
       return true;
     }
     if (p->is_removed_snap(m->snapid)) {
-      _pool_op_reply(m, 0, osdmap.get_epoch());
+      _pool_op_reply(op, 0, osdmap.get_epoch());
       return true;
     }
     return false;
   case POOL_OP_DELETE:
     if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
-      _pool_op_reply(m, 0, osdmap.get_epoch());
+      _pool_op_reply(op, 0, osdmap.get_epoch());
       return true;
     }
     return false;
@@ -6741,44 +7363,48 @@ bool OSDMonitor::preprocess_pool_op(MPoolOp *m)
   return false;
 }
 
-bool OSDMonitor::preprocess_pool_op_create(MPoolOp *m)
+bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
   MonSession *session = m->get_session();
   if (!session) {
-    _pool_op_reply(m, -EPERM, osdmap.get_epoch());
+    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
     return true;
   }
   if (!session->is_capable("osd", MON_CAP_W)) {
     dout(5) << "attempt to create new pool without sufficient auid privileges!"
 	    << "message: " << *m  << std::endl
 	    << "caps: " << session->caps << dendl;
-    _pool_op_reply(m, -EPERM, osdmap.get_epoch());
+    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
     return true;
   }
 
   int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
   if (pool >= 0) {
-    _pool_op_reply(m, 0, osdmap.get_epoch());
+    _pool_op_reply(op, 0, osdmap.get_epoch());
     return true;
   }
 
   return false;
 }
 
-bool OSDMonitor::prepare_pool_op(MPoolOp *m)
+bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
   dout(10) << "prepare_pool_op " << *m << dendl;
   if (m->op == POOL_OP_CREATE) {
-    return prepare_pool_op_create(m);
+    return prepare_pool_op_create(op);
   } else if (m->op == POOL_OP_DELETE) {
-    return prepare_pool_op_delete(m);
+    return prepare_pool_op_delete(op);
   }
 
   int ret = 0;
   bool changed = false;
 
   if (!osdmap.have_pg_pool(m->pool)) {
-    _pool_op_reply(m, -ENOENT, osdmap.get_epoch());
+    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
     return false;
   }
 
@@ -6798,14 +7424,14 @@ bool OSDMonitor::prepare_pool_op(MPoolOp *m)
       } else {
         ret = -EINVAL;
       }
-      _pool_op_reply(m, ret, osdmap.get_epoch());
+      _pool_op_reply(op, ret, osdmap.get_epoch());
       return false;
 
     case POOL_OP_DELETE_UNMANAGED_SNAP:
       // we won't allow removal of an unmanaged snapshot from a pool
       // not in unmanaged snaps mode.
       if (!pool->is_unmanaged_snaps_mode()) {
-        _pool_op_reply(m, -ENOTSUP, osdmap.get_epoch());
+        _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
         return false;
       }
       /* fall-thru */
@@ -6813,7 +7439,7 @@ bool OSDMonitor::prepare_pool_op(MPoolOp *m)
       // but we will allow creating an unmanaged snapshot on any pool
       // as long as it is not in 'pool' snaps mode.
       if (pool->is_pool_snaps_mode()) {
-        _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
+        _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
         return false;
       }
   }
@@ -6864,7 +7490,7 @@ bool OSDMonitor::prepare_pool_op(MPoolOp *m)
     }
     break;
 
-  case POOL_OP_CREATE_UNMANAGED_SNAP: 
+  case POOL_OP_CREATE_UNMANAGED_SNAP:
     {
       uint64_t snapid;
       pp.add_unmanaged_snap(snapid);
@@ -6898,14 +7524,15 @@ bool OSDMonitor::prepare_pool_op(MPoolOp *m)
   }
 
  out:
-  wait_for_finished_proposal(new OSDMonitor::C_PoolOp(this, m, ret, pending_inc.epoch, &reply_data));
+  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
   return true;
 }
 
-bool OSDMonitor::prepare_pool_op_create(MPoolOp *m)
+bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
 {
-  int err = prepare_new_pool(m);
-  wait_for_finished_proposal(new OSDMonitor::C_PoolOp(this, m, err, pending_inc.epoch));
+  op->mark_osdmon_event(__func__);
+  int err = prepare_new_pool(op);
+  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
   return true;
 }
 
@@ -6979,6 +7606,14 @@ bool OSDMonitor::_check_become_tier(
     return false;
   }
 
+  if (base_pool->is_tier()) {
+    *ss << "pool '" << base_pool_name << "' is already a tier of '"
+      << osdmap.get_pool_name(base_pool->tier_of) << "', "
+      << "multiple tiers are not yet supported.";
+    *err = -EINVAL;
+    return false;
+  }
+
   if (tier_pool->is_tier()) {
     *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
        << osdmap.get_pool_name(tier_pool->tier_of) << "'";
@@ -7000,17 +7635,29 @@ bool OSDMonitor::_check_become_tier(
  */
 bool OSDMonitor::_check_remove_tier(
     const int64_t base_pool_id, const pg_pool_t *base_pool,
+    const pg_pool_t *tier_pool,
     int *err, ostream *ss) const
 {
   const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
 
-  // If the pool is in use by CephFS, then refuse to remove its
-  // tier
+  // Apply CephFS-specific checks
   const MDSMap &pending_mdsmap = mon->mdsmon()->pending_mdsmap;
   if (pending_mdsmap.pool_in_use(base_pool_id)) {
-    *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
-    *err = -EBUSY;
-    return false;
+    if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
+      // If the underlying pool is erasure coded, we can't permit the
+      // removal of the replicated tier that CephFS relies on to access it
+      *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
+      *err = -EBUSY;
+      return false;
+    }
+
+    if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
+      *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
+             "tier is still in use as a writeback cache.  Change the cache "
+             "mode and flush the cache before removing it";
+      *err = -EBUSY;
+      return false;
+    }
   }
 
   *err = 0;
@@ -7069,7 +7716,7 @@ int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
 {
   dout(10) << "_prepare_rename_pool " << pool << dendl;
   if (pending_inc.old_pools.count(pool)) {
-    dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;    
+    dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
     return -ENOENT;
   }
   for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
@@ -7084,26 +7731,30 @@ int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
   return 0;
 }
 
-bool OSDMonitor::prepare_pool_op_delete(MPoolOp *m)
+bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
 {
+  op->mark_osdmon_event(__func__);
+  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
   ostringstream ss;
   int ret = _prepare_remove_pool(m->pool, &ss);
   if (ret == -EAGAIN) {
-    wait_for_finished_proposal(new C_RetryMessage(this, m));
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
     return true;
   }
   if (ret < 0)
     dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
-  wait_for_finished_proposal(new OSDMonitor::C_PoolOp(this, m, ret,
+  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
 						      pending_inc.epoch));
   return true;
 }
 
-void OSDMonitor::_pool_op_reply(MPoolOp *m, int ret, epoch_t epoch, bufferlist *blp)
+void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
+                                int ret, epoch_t epoch, bufferlist *blp)
 {
+  op->mark_osdmon_event(__func__);
+  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
   dout(20) << "_pool_op_reply " << ret << dendl;
   MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
 					 ret, epoch, get_last_committed(), blp);
-  mon->send_reply(m, reply);
-  m->put();
+  mon->send_reply(op, reply);
 }
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 414bf08..78e00f9 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -35,28 +35,29 @@ using namespace std;
 #include "Session.h"
 
 class Monitor;
+class PGMap;
+
 #include "messages/MOSDBoot.h"
 #include "messages/MMonCommand.h"
 #include "messages/MOSDMap.h"
-#include "messages/MOSDFailure.h"
 #include "messages/MPoolOp.h"
 
 #include "erasure-code/ErasureCodeInterface.h"
 
+#include "common/TrackedOp.h"
+#include "mon/MonOpRequest.h"
+
 #define OSD_METADATA_PREFIX "osd_metadata"
 
 /// information about a particular peer's failure reports for one osd
 struct failure_reporter_t {
   int num_reports;          ///< reports from this reporter
   utime_t failed_since;     ///< when they think it failed
-  MOSDFailure *msg;         ///< most recent failure message
+  MonOpRequestRef op;       ///< most recent failure op request
 
-  failure_reporter_t() : num_reports(0), msg(NULL) {}
-  failure_reporter_t(utime_t s) : num_reports(1), failed_since(s), msg(NULL) {}
-  ~failure_reporter_t() {
-    // caller should have taken this message before removing the entry.
-    assert(!msg);
-  }
+  failure_reporter_t() : num_reports(0) {}
+  failure_reporter_t(utime_t s) : num_reports(1), failed_since(s) {}
+  ~failure_reporter_t() { }
 };
 
 /// information about all failure reports for one osd
@@ -79,9 +80,10 @@ struct failure_info_t {
     return max_failed_since;
   }
 
-  // set the message for the latest report.  return any old message we had,
+  // set the message for the latest report.  return any old op request we had,
   // if any, so we can discard it.
-  MOSDFailure *add_report(int who, utime_t failed_since, MOSDFailure *msg) {
+  MonOpRequestRef add_report(int who, utime_t failed_since,
+                              MonOpRequestRef op) {
     map<int, failure_reporter_t>::iterator p = reporters.find(who);
     if (p == reporters.end()) {
       if (max_failed_since == utime_t())
@@ -94,18 +96,18 @@ struct failure_info_t {
     }
     num_reports++;
 
-    MOSDFailure *ret = p->second.msg;
-    p->second.msg = msg;
+    MonOpRequestRef ret = p->second.op;
+    p->second.op = op;
     return ret;
   }
 
-  void take_report_messages(list<MOSDFailure*>& ls) {
+  void take_report_messages(list<MonOpRequestRef>& ls) {
     for (map<int, failure_reporter_t>::iterator p = reporters.begin();
 	 p != reporters.end();
 	 ++p) {
-      if (p->second.msg) {
-	ls.push_back(p->second.msg);
-	p->second.msg = NULL;
+      if (p->second.op) {
+	ls.push_back(p->second.op);
+        p->second.op.reset();
       }
     }
   }
@@ -135,16 +137,9 @@ private:
 
   map<int,double> osd_weight;
 
-  /*
-   * cache what epochs we think osds have.  this is purely
-   * optimization to try to avoid sending the same inc maps twice.
-   */
-  map<int,epoch_t> osd_epoch;
   SimpleLRU<version_t, bufferlist> inc_osd_cache;
   SimpleLRU<version_t, bufferlist> full_osd_cache;
 
-  void note_osd_has_epoch(int osd, epoch_t epoch);
-
   void check_failures(utime_t now);
   bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
 
@@ -157,6 +152,12 @@ private:
   CrushWrapper &_get_stable_crush();
   void _get_pending_crush(CrushWrapper& newcrush);
 
+  enum FastReadType {
+    FAST_READ_OFF,
+    FAST_READ_ON,
+    FAST_READ_DEFAULT
+  };
+
   // svc
 public:  
   void create_initial();
@@ -166,7 +167,6 @@ private:
   void encode_pending(MonitorDBStore::TransactionRef t);
   void on_active();
   void on_shutdown();
-
   /**
    * we haven't delegated full version stashing to paxosservice for some time
    * now, making this function useless in current context.
@@ -203,11 +203,16 @@ private:
 
   void share_map_with_random_osd();
 
+  void maybe_prime_pg_temp();
+  void prime_pg_temp(OSDMap& next,
+		     ceph::unordered_map<pg_t, pg_stat_t>::iterator pp);
+  int prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd);
+
   void update_logger();
 
   void handle_query(PaxosServiceMessage *m);
-  bool preprocess_query(PaxosServiceMessage *m);  // true if processed.
-  bool prepare_update(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op);  // true if processed.
+  bool prepare_update(MonOpRequestRef op);
   bool should_propose(double &delay);
 
   version_t get_trim_to();
@@ -220,9 +225,12 @@ private:
   // ...
   MOSDMap *build_latest_full();
   MOSDMap *build_incremental(epoch_t first, epoch_t last);
-  void send_full(PaxosServiceMessage *m);
-  void send_incremental(PaxosServiceMessage *m, epoch_t first);
-  void send_incremental(epoch_t first, MonSession *session, bool onetime);
+  void send_full(MonOpRequestRef op);
+  void send_incremental(MonOpRequestRef op, epoch_t first);
+  // @param req an optional op request, if the osdmaps are replies to it. so
+  //            @c Monitor::send_reply() can mark_event with it.
+  void send_incremental(epoch_t first, MonSession *session, bool onetime,
+			MonOpRequestRef req = MonOpRequestRef());
 
   int reweight_by_utilization(int oload, std::string& out_str, bool by_pg,
 			      const set<int64_t> *pools);
@@ -231,27 +239,27 @@ private:
 
   bool check_source(PaxosServiceMessage *m, uuid_d fsid);
  
-  bool preprocess_get_osdmap(class MMonGetOSDMap *m);
+  bool preprocess_get_osdmap(MonOpRequestRef op);
 
-  bool preprocess_mark_me_down(class MOSDMarkMeDown *m);
+  bool preprocess_mark_me_down(MonOpRequestRef op);
 
   friend class C_AckMarkedDown;
-  bool preprocess_failure(class MOSDFailure *m);
-  bool prepare_failure(class MOSDFailure *m);
-  bool prepare_mark_me_down(class MOSDMarkMeDown *m);
+  bool preprocess_failure(MonOpRequestRef op);
+  bool prepare_failure(MonOpRequestRef op);
+  bool prepare_mark_me_down(MonOpRequestRef op);
   void process_failures();
-  void take_all_failures(list<MOSDFailure*>& ls);
+  void take_all_failures(list<MonOpRequestRef>& ls);
 
-  bool preprocess_boot(class MOSDBoot *m);
-  bool prepare_boot(class MOSDBoot *m);
-  void _booted(MOSDBoot *m, bool logit);
+  bool preprocess_boot(MonOpRequestRef op);
+  bool prepare_boot(MonOpRequestRef op);
+  void _booted(MonOpRequestRef op, bool logit);
 
-  bool preprocess_alive(class MOSDAlive *m);
-  bool prepare_alive(class MOSDAlive *m);
-  void _reply_map(PaxosServiceMessage *m, epoch_t e);
+  bool preprocess_alive(MonOpRequestRef op);
+  bool prepare_alive(MonOpRequestRef op);
+  void _reply_map(MonOpRequestRef op, epoch_t e);
 
-  bool preprocess_pgtemp(class MOSDPGTemp *m);
-  bool prepare_pgtemp(class MOSDPGTemp *m);
+  bool preprocess_pgtemp(MonOpRequestRef op);
+  bool prepare_pgtemp(MonOpRequestRef op);
 
   int _check_remove_pool(int64_t pool, const pg_pool_t *pi, ostream *ss);
   bool _check_become_tier(
@@ -259,49 +267,50 @@ private:
       int64_t base_pool_id, const pg_pool_t *base_pool,
       int *err, ostream *ss) const;
   bool _check_remove_tier(
-      int64_t base_pool_id, const pg_pool_t *base_pool,
+      int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
       int *err, ostream *ss) const;
 
   int _prepare_remove_pool(int64_t pool, ostream *ss);
   int _prepare_rename_pool(int64_t pool, string newname);
 
-  bool preprocess_pool_op ( class MPoolOp *m);
-  bool preprocess_pool_op_create ( class MPoolOp *m);
-  bool prepare_pool_op (MPoolOp *m);
-  bool prepare_pool_op_create (MPoolOp *m);
-  bool prepare_pool_op_delete(MPoolOp *m);
+  bool preprocess_pool_op (MonOpRequestRef op);
+  bool preprocess_pool_op_create (MonOpRequestRef op);
+  bool prepare_pool_op (MonOpRequestRef op);
+  bool prepare_pool_op_create (MonOpRequestRef op);
+  bool prepare_pool_op_delete(MonOpRequestRef op);
   int crush_rename_bucket(const string& srcname,
 			  const string& dstname,
 			  ostream *ss);
+  int normalize_profile(ErasureCodeProfile &profile, ostream *ss);
   int crush_ruleset_create_erasure(const string &name,
 				   const string &profile,
 				   int *ruleset,
-				   stringstream &ss);
+				   ostream *ss);
   int get_crush_ruleset(const string &ruleset_name,
 			int *crush_ruleset,
-			stringstream &ss);
+			ostream *ss);
   int get_erasure_code(const string &erasure_code_profile,
 		       ErasureCodeInterfaceRef *erasure_code,
-		       stringstream &ss) const;
+		       ostream *ss) const;
   int prepare_pool_crush_ruleset(const unsigned pool_type,
 				 const string &erasure_code_profile,
 				 const string &ruleset_name,
 				 int *crush_ruleset,
-				 stringstream &ss);
+				 ostream *ss);
   bool erasure_code_profile_in_use(const map<int64_t, pg_pool_t> &pools,
 				   const string &profile,
-				   ostream &ss);
+				   ostream *ss);
   int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
 				 map<string,string> *erasure_code_profile_map,
-				 stringstream &ss);
+				 ostream *ss);
   int prepare_pool_size(const unsigned pool_type,
 			const string &erasure_code_profile,
 			unsigned *size, unsigned *min_size,
-			stringstream &ss);
+			ostream *ss);
   int prepare_pool_stripe_width(const unsigned pool_type,
 				const string &erasure_code_profile,
 				unsigned *stripe_width,
-				stringstream &ss);
+				ostream *ss);
   int prepare_new_pool(string& name, uint64_t auid,
 		       int crush_ruleset,
 		       const string &crush_ruleset_name,
@@ -309,81 +318,86 @@ private:
 		       const string &erasure_code_profile,
                        const unsigned pool_type,
                        const uint64_t expected_num_objects,
-		       stringstream &ss);
-  int prepare_new_pool(MPoolOp *m);
+                       FastReadType fast_read,
+		       ostream *ss);
+  int prepare_new_pool(MonOpRequestRef op);
 
   void update_pool_flags(int64_t pool_id, uint64_t flags);
   bool update_pools_status();
   void get_pools_health(list<pair<health_status_t,string> >& summary,
                         list<pair<health_status_t,string> > *detail) const;
 
-  bool prepare_set_flag(MMonCommand *m, int flag);
-  bool prepare_unset_flag(MMonCommand *m, int flag);
+  bool prepare_set_flag(MonOpRequestRef op, int flag);
+  bool prepare_unset_flag(MonOpRequestRef op, int flag);
 
-  void _pool_op_reply(MPoolOp *m, int ret, epoch_t epoch, bufferlist *blp=NULL);
+  void _pool_op_reply(MonOpRequestRef op,
+                      int ret, epoch_t epoch, bufferlist *blp=NULL);
 
-  struct C_Booted : public Context {
+  struct C_Booted : public C_MonOp {
     OSDMonitor *cmon;
-    MOSDBoot *m;
     bool logit;
-    C_Booted(OSDMonitor *cm, MOSDBoot *m_, bool l=true) : 
-      cmon(cm), m(m_), logit(l) {}
-    void finish(int r) {
+    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
+      C_MonOp(op_), cmon(cm), logit(l) {}
+    void _finish(int r) {
       if (r >= 0)
-	cmon->_booted(m, logit);
+	cmon->_booted(op, logit);
       else if (r == -ECANCELED)
-	m->put();
+        return;
       else if (r == -EAGAIN)
-	cmon->dispatch((PaxosServiceMessage*)m);
+        cmon->dispatch(op);
       else
 	assert(0 == "bad C_Booted return value");
     }
   };
 
-  struct C_ReplyMap : public Context {
+  struct C_ReplyMap : public C_MonOp {
     OSDMonitor *osdmon;
-    PaxosServiceMessage *m;
     epoch_t e;
-    C_ReplyMap(OSDMonitor *o, PaxosServiceMessage *mm, epoch_t ee) : osdmon(o), m(mm), e(ee) {}
-    void finish(int r) {
+    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
+      : C_MonOp(op_), osdmon(o), e(ee) {}
+    void _finish(int r) {
       if (r >= 0)
-	osdmon->_reply_map(m, e);
+	osdmon->_reply_map(op, e);
       else if (r == -ECANCELED)
-	m->put();
+        return;
       else if (r == -EAGAIN)
-	osdmon->dispatch(m);
+	osdmon->dispatch(op);
       else
 	assert(0 == "bad C_ReplyMap return value");
     }    
   };
-  struct C_PoolOp : public Context {
+  struct C_PoolOp : public C_MonOp {
     OSDMonitor *osdmon;
-    MPoolOp *m;
     int replyCode;
     int epoch;
     bufferlist reply_data;
-    C_PoolOp(OSDMonitor * osd, MPoolOp *m_, int rc, int e, bufferlist *rd=NULL) :
-      osdmon(osd), m(m_), replyCode(rc), epoch(e) {
+    C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL) :
+      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
       if (rd)
 	reply_data = *rd;
     }
-    void finish(int r) {
+    void _finish(int r) {
       if (r >= 0)
-	osdmon->_pool_op_reply(m, replyCode, epoch, &reply_data);
+	osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
       else if (r == -ECANCELED)
-	m->put();
+        return;
       else if (r == -EAGAIN)
-	osdmon->dispatch(m);
+	osdmon->dispatch(op);
       else
 	assert(0 == "bad C_PoolOp return value");
     }
   };
 
-  bool preprocess_remove_snaps(struct MRemoveSnaps *m);
-  bool prepare_remove_snaps(struct MRemoveSnaps *m);
+  bool preprocess_remove_snaps(MonOpRequestRef op);
+  bool prepare_remove_snaps(MonOpRequestRef op);
+
+  CephContext *cct;
+  OpTracker op_tracker;
+
+  int load_metadata(int osd, map<string, string>& m, ostream *err);
 
  public:
-  OSDMonitor(Monitor *mn, Paxos *p, string service_name);
+  OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name);
 
   void tick();  // check state, take actions
 
@@ -391,9 +405,9 @@ private:
 
   void get_health(list<pair<health_status_t,string> >& summary,
 		  list<pair<health_status_t,string> > *detail) const;
-  bool preprocess_command(MMonCommand *m);
-  bool prepare_command(MMonCommand *m);
-  bool prepare_command_impl(MMonCommand *m, map<string,cmd_vartype> &cmdmap);
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+  bool prepare_command_impl(MonOpRequestRef op, map<string,cmd_vartype>& cmdmap);
 
   int set_crash_replay_interval(const int64_t pool_id, const uint32_t cri);
   int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
@@ -403,18 +417,20 @@ private:
 			   std::map<int,utime_t> &last_osd_report);
   void mark_all_down();
 
-  void send_latest(PaxosServiceMessage *m, epoch_t start=0);
-  void send_latest_now_nodelete(PaxosServiceMessage *m, epoch_t start=0) {
-    send_incremental(m, start);
+  void send_latest(MonOpRequestRef op, epoch_t start=0);
+  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
+    op->mark_osdmon_event(__func__);
+    send_incremental(op, start);
   }
 
-  int get_version(version_t ver, bufferlist& bl);
-  int get_version_full(version_t ver, bufferlist& bl);
+  int get_version(version_t ver, bufferlist& bl) override;
+  int get_version_full(version_t ver, bufferlist& bl) override;
 
   epoch_t blacklist(const entity_addr_t& a, utime_t until);
 
   void dump_info(Formatter *f);
   int dump_osd_metadata(int osd, Formatter *f, ostream *err);
+  void print_nodes(Formatter *f);
 
   void check_subs();
   void check_sub(Subscription *sub);
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index e60ed6a..1eee600 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -360,6 +360,7 @@ void PGMap::calc_stats()
   pg_pool_sum.clear();
   pg_sum = pool_stat_t();
   osd_sum = osd_stat_t();
+  pg_by_osd.clear();
 
   for (ceph::unordered_map<pg_t,pg_stat_t>::iterator p = pg_stat.begin();
        p != pg_stat.end();
@@ -373,23 +374,25 @@ void PGMap::calc_stats()
 
   redo_full_sets();
 
-  calc_min_last_epoch_clean();
+  min_last_epoch_clean = calc_min_last_epoch_clean();
 }
 
 void PGMap::update_pg(pg_t pgid, bufferlist& bl)
 {
   bufferlist::iterator p = bl.begin();
   ceph::unordered_map<pg_t,pg_stat_t>::iterator s = pg_stat.find(pgid);
-  epoch_t old_lec = 0;
+  epoch_t old_lec = 0, lec;
   if (s != pg_stat.end()) {
     old_lec = s->second.get_effective_last_epoch_clean();
-    stat_pg_sub(pgid, s->second);
+    stat_pg_update(pgid, s->second, p);
+    lec = s->second.get_effective_last_epoch_clean();
+  } else {
+    pg_stat_t& r = pg_stat[pgid];
+    ::decode(r, p);
+    stat_pg_add(pgid, r);
+    lec = r.get_effective_last_epoch_clean();
   }
-  pg_stat_t& r = pg_stat[pgid];
-  ::decode(r, p);
-  stat_pg_add(pgid, r);
 
-  epoch_t lec = r.get_effective_last_epoch_clean();
   if (min_last_epoch_clean &&
       (lec < min_last_epoch_clean ||  // we did
        (lec > min_last_epoch_clean && // we might
@@ -456,30 +459,40 @@ void PGMap::remove_osd(int osd)
   }
 }
 
-void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s, bool sumonly)
+void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s, bool nocreating,
+			bool sameosds)
 {
   pg_pool_sum[pgid.pool()].add(s);
   pg_sum.add(s);
 
-  if (sumonly)
-    return;
-
   num_pg++;
   num_pg_by_state[s.state]++;
 
-  if (s.state & PG_STATE_CREATING) {
-    creating_pgs.insert(pgid);
-    if (s.acting_primary >= 0)
-      creating_pgs_by_osd[s.acting_primary].insert(pgid);
+  if (!nocreating) {
+    if (s.state & PG_STATE_CREATING) {
+      creating_pgs.insert(pgid);
+      if (s.acting_primary >= 0)
+	creating_pgs_by_osd[s.acting_primary].insert(pgid);
+    }
   }
+
+  if (sameosds)
+    return;
+
   for (vector<int>::const_iterator p = s.blocked_by.begin();
        p != s.blocked_by.end();
        ++p) {
     ++blocked_by_sum[*p];
   }
+
+  for (vector<int>::const_iterator p = s.acting.begin(); p != s.acting.end(); ++p)
+    pg_by_osd[*p].insert(pgid);
+  for (vector<int>::const_iterator p = s.up.begin(); p != s.up.end(); ++p)
+    pg_by_osd[*p].insert(pgid);
 }
 
-void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, bool sumonly)
+void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, bool nocreating,
+			bool sameosds)
 {
   pool_stat_t& ps = pg_pool_sum[pgid.pool()];
   ps.sub(s);
@@ -487,22 +500,24 @@ void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, bool sumonly)
     pg_pool_sum.erase(pgid.pool());
   pg_sum.sub(s);
 
-  if (sumonly)
-    return;
-
   num_pg--;
   if (--num_pg_by_state[s.state] == 0)
     num_pg_by_state.erase(s.state);
 
-  if (s.state & PG_STATE_CREATING) {
-    creating_pgs.erase(pgid);
-    if (s.acting_primary >= 0) {
-      creating_pgs_by_osd[s.acting_primary].erase(pgid);
-      if (creating_pgs_by_osd[s.acting_primary].size() == 0)
-	creating_pgs_by_osd.erase(s.acting_primary);
+  if (!nocreating) {
+    if (s.state & PG_STATE_CREATING) {
+      creating_pgs.erase(pgid);
+      if (s.acting_primary >= 0) {
+	creating_pgs_by_osd[s.acting_primary].erase(pgid);
+	if (creating_pgs_by_osd[s.acting_primary].size() == 0)
+	  creating_pgs_by_osd.erase(s.acting_primary);
+      }
     }
   }
 
+  if (sameosds)
+    return;
+
   for (vector<int>::const_iterator p = s.blocked_by.begin();
        p != s.blocked_by.end();
        ++p) {
@@ -512,6 +527,35 @@ void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, bool sumonly)
     if (q->second == 0)
       blocked_by_sum.erase(q);
   }
+
+  for (vector<int>::const_iterator p = s.acting.begin(); p != s.acting.end(); ++p) {
+    set<pg_t>& oset = pg_by_osd[*p];
+    oset.erase(pgid);
+    if (oset.empty())
+      pg_by_osd.erase(*p);
+  }
+  for (vector<int>::const_iterator p = s.up.begin(); p != s.up.end(); ++p) {
+    set<pg_t>& oset = pg_by_osd[*p];
+    oset.erase(pgid);
+    if (oset.empty())
+      pg_by_osd.erase(*p);
+  }
+}
+
+void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
+			   bufferlist::iterator& blp)
+{
+  pg_stat_t n;
+  ::decode(n, blp);
+
+  bool sameosds =
+    s.acting == n.acting &&
+    s.up == n.up &&
+    s.blocked_by == n.blocked_by;
+
+  stat_pg_sub(pgid, s, false, sameosds);
+  s = n;
+  stat_pg_add(pgid, n, false, sameosds);
 }
 
 void PGMap::stat_osd_add(const osd_stat_t &s)
@@ -650,6 +694,7 @@ void PGMap::dump_basic(Formatter *f) const
   f->dump_stream("stamp") << stamp;
   f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
   f->dump_unsigned("last_pg_scan", last_pg_scan);
+  f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
   f->dump_float("full_ratio", full_ratio);
   f->dump_float("near_full_ratio", nearfull_ratio);
   
@@ -661,6 +706,16 @@ void PGMap::dump_basic(Formatter *f) const
   osd_sum.dump(f);
   f->close_section();
 
+  f->open_object_section("osd_epochs");
+  for (ceph::unordered_map<int32_t,epoch_t>::const_iterator p =
+	 osd_epochs.begin(); p != osd_epochs.end(); ++p) {
+    f->open_object_section("osd");
+    f->dump_unsigned("osd", p->first);
+    f->dump_unsigned("epoch", p->second);
+    f->close_section();
+  }
+  f->close_section();
+
   dump_delta(f);
 }
 
@@ -982,7 +1037,7 @@ void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
 void PGMap::recovery_summary(Formatter *f, list<string> *psl,
                              const pool_stat_t& delta_sum) const
 {
-  if (delta_sum.stats.sum.num_objects_degraded) {
+  if (delta_sum.stats.sum.num_objects_degraded && delta_sum.stats.sum.num_object_copies > 0) {
     double pc = (double)delta_sum.stats.sum.num_objects_degraded /
       (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
     char b[20];
@@ -998,7 +1053,7 @@ void PGMap::recovery_summary(Formatter *f, list<string> *psl,
       psl->push_back(ss.str());
     }
   }
-  if (delta_sum.stats.sum.num_objects_misplaced) {
+  if (delta_sum.stats.sum.num_objects_misplaced && delta_sum.stats.sum.num_object_copies > 0) {
     double pc = (double)delta_sum.stats.sum.num_objects_misplaced /
       (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
     char b[20];
@@ -1014,7 +1069,7 @@ void PGMap::recovery_summary(Formatter *f, list<string> *psl,
       psl->push_back(ss.str());
     }
   }
-  if (delta_sum.stats.sum.num_objects_unfound) {
+  if (delta_sum.stats.sum.num_objects_unfound && delta_sum.stats.sum.num_objects) {
     double pc = (double)delta_sum.stats.sum.num_objects_unfound /
       (double)delta_sum.stats.sum.num_objects * (double)100.0;
     char b[20];
@@ -1144,6 +1199,105 @@ void PGMap::pool_client_io_rate_summary(Formatter *f, ostream *out,
   client_io_rate_summary(f, out, p->second.first, ts->second);
 }
 
+void PGMap::cache_io_rate_summary(Formatter *f, ostream *out,
+                                  const pool_stat_t& delta_sum,
+                                  utime_t delta_stamp) const
+{
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+  bool have_output = false;
+
+  if (pos_delta.stats.sum.num_flush) {
+    int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
+    if (f) {
+      f->dump_int("flush_bytes_sec", flush);
+    } else {
+      *out << pretty_si_t(flush) << "B/s flush";
+      have_output = true;
+    }
+  }
+  if (pos_delta.stats.sum.num_evict) {
+    int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
+    if (f) {
+      f->dump_int("evict_bytes_sec", evict);
+    } else {
+      if (have_output)
+	*out << ", ";
+      *out << pretty_si_t(evict) << "B/s evict";
+      have_output = true;
+    }
+  }
+  if (pos_delta.stats.sum.num_promote) {
+    int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
+    if (f) {
+      f->dump_int("promote_op_per_sec", promote);
+    } else {
+      if (have_output)
+	*out << ", ";
+      *out << pretty_si_t(promote) << "op/s promote";
+      have_output = true;
+    }
+  }
+  if (pos_delta.stats.sum.num_flush_mode_low) {
+    if (f) {
+      f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
+    } else {
+      if (have_output)
+	*out << ", ";
+      *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
+      have_output = true;
+    }
+  }
+  if (pos_delta.stats.sum.num_flush_mode_high) {
+    if (f) {
+      f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
+    } else {
+      if (have_output)
+	*out << ", ";
+      *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
+      have_output = true;
+    }
+  }
+  if (pos_delta.stats.sum.num_evict_mode_some) {
+    if (f) {
+      f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
+    } else {
+      if (have_output)
+	*out << ", ";
+      *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
+      have_output = true;
+    }
+  }
+  if (pos_delta.stats.sum.num_evict_mode_full) {
+    if (f) {
+      f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
+    } else {
+      if (have_output)
+	*out << ", ";
+      *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
+      have_output = true;
+    }
+  }
+}
+
+void PGMap::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
+{
+  cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMap::pool_cache_io_rate_summary(Formatter *f, ostream *out,
+                                       uint64_t poolid) const
+{
+  ceph::unordered_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
+    per_pool_sum_delta.find(poolid);
+  if (p == per_pool_sum_delta.end())
+    return;
+  ceph::unordered_map<uint64_t,utime_t>::const_iterator ts =
+    per_pool_sum_deltas_stamps.find(p->first);
+  assert(ts != per_pool_sum_deltas_stamps.end());
+  cache_io_rate_summary(f, out, p->second.first, ts->second);
+}
+
 /**
  * update aggregated delta
  *
@@ -1152,6 +1306,7 @@ void PGMap::pool_client_io_rate_summary(Formatter *f, ostream *out,
  * @param old_pool_sum      Previous stats sum
  * @param last_ts           Last timestamp for pool
  * @param result_pool_sum   Resulting stats
+ * @param result_pool_delta Resulting pool delta
  * @param result_ts_delta   Resulting timestamp delta
  * @param delta_avg_list    List of last N computed deltas, used to average
  */
@@ -1329,6 +1484,13 @@ void PGMap::print_summary(Formatter *f, ostream *out) const
   overall_client_io_rate_summary(f, &ssr);
   if (!f && ssr.str().length())
     *out << "  client io " << ssr.str() << "\n";
+
+  ssr.clear();
+  ssr.str("");
+
+  overall_cache_io_rate_summary(f, &ssr);
+  if (!f && ssr.str().length())
+    *out << "  cache io " << ssr.str() << "\n";
 }
 
 void PGMap::print_oneline_summary(Formatter *f, ostream *out) const
@@ -1415,11 +1577,14 @@ void PGMap::print_oneline_summary(Formatter *f, ostream *out) const
 void PGMap::generate_test_instances(list<PGMap*>& o)
 {
   o.push_back(new PGMap);
-  o.push_back(new PGMap);
   list<Incremental*> inc;
   Incremental::generate_test_instances(inc);
+  delete inc.front();
   inc.pop_front();
   while (!inc.empty()) {
+    PGMap *pmp = new PGMap();
+    *pmp = *o.back();
+    o.push_back(pmp);
     o.back()->apply_incremental(NULL, *inc.front());
     delete inc.front();
     inc.pop_front();
@@ -1452,7 +1617,7 @@ void PGMap::get_filtered_pg_stats(const string& state, int64_t poolid, int64_t o
 void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs)
 {
   f->open_array_section("pg_stats");
-  for (set<pg_t>::iterator i = pgs.begin(); i != pgs.end(); i++) {
+  for (set<pg_t>::iterator i = pgs.begin(); i != pgs.end(); ++i) {
     pg_stat_t& st = pg_stat[*i];
     f->open_object_section("pg_stat");
     f->dump_stream("pgid") << *i;
@@ -1466,7 +1631,7 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs)
   ss << "pg_stat\tobjects\tmip\tdegr\tmisp\tunf\tbytes\tlog\tdisklog\tstate\t"
     "state_stamp\tv\treported\tup\tup_primary\tacting\tacting_primary\t"
     "last_scrub\tscrub_stamp\tlast_deep_scrub\tdeep_scrub_stamp" << std::endl;
-  for (set<pg_t>::iterator i = pgs.begin(); i != pgs.end(); i++) {
+  for (set<pg_t>::iterator i = pgs.begin(); i != pgs.end(); ++i) {
     pg_stat_t& st = pg_stat[*i];
     ss << *i
        << "\t" << st.stats.sum.num_objects
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 1cdff4a..e1cdf2d 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -121,6 +121,7 @@ public:
   osd_stat_t osd_sum;
   mutable epoch_t min_last_epoch_clean;
   ceph::unordered_map<int,int> blocked_by_sum;
+  ceph::unordered_map<int,set<pg_t> > pg_by_osd;
 
   utime_t stamp;
 
@@ -232,6 +233,14 @@ public:
     stamp = s;
   }
 
+  size_t get_num_pg_by_osd(int osd) const {
+    ceph::unordered_map<int,set<pg_t> >::const_iterator p = pg_by_osd.find(osd);
+    if (p == pg_by_osd.end())
+      return 0;
+    else
+      return p->second.size();
+  }
+
   pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
     ceph::unordered_map<int,pool_stat_t>::const_iterator p =
       pg_pool_sum.find(pool);
@@ -249,8 +258,11 @@ public:
   void redo_full_sets();
   void register_nearfull_status(int osd, const osd_stat_t& s);
   void calc_stats();
-  void stat_pg_add(const pg_t &pgid, const pg_stat_t &s, bool sumonly=false);
-  void stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, bool sumonly=false);
+  void stat_pg_add(const pg_t &pgid, const pg_stat_t &s, bool nocreating=false,
+		   bool sameosds=false);
+  void stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, bool nocreating=false,
+		   bool sameosds=false);
+  void stat_pg_update(const pg_t pgid, pg_stat_t &prev, bufferlist::iterator& blp);
   void stat_osd_add(const osd_stat_t &s);
   void stat_osd_sub(const osd_stat_t &s);
   
@@ -322,6 +334,25 @@ public:
    */
   void pool_client_io_rate_summary(Formatter *f, ostream *out,
                                    uint64_t poolid) const;
+  /**
+   * Obtain a formatted/plain output for cache tier IO, source from stats for a
+   * given @p delta_sum pool over a given @p delta_stamp period of time.
+   */
+  void cache_io_rate_summary(Formatter *f, ostream *out,
+                             const pool_stat_t& delta_sum,
+                             utime_t delta_stamp) const;
+  /**
+   * Obtain a formatted/plain output for the overall cache tier IO, which is
+   * calculated resorting to @p pg_sum_delta and @p stamp_delta.
+   */
+  void overall_cache_io_rate_summary(Formatter *f, ostream *out) const;
+  /**
+   * Obtain a formatted/plain output for cache tier IO over a given pool
+   * with id @p pool_id.  We will then obtain pool-specific data
+   * from @p per_pool_sum_delta.
+   */
+  void pool_cache_io_rate_summary(Formatter *f, ostream *out,
+                                  uint64_t poolid) const;
 
   void print_summary(Formatter *f, ostream *out) const;
   void print_oneline_summary(Formatter *f, ostream *out) const;
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 07e6305..77a5af1 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -575,57 +575,61 @@ version_t PGMonitor::get_trim_to()
   return 0;
 }
 
-bool PGMonitor::preprocess_query(PaxosServiceMessage *m)
+bool PGMonitor::preprocess_query(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case CEPH_MSG_STATFS:
-    handle_statfs(static_cast<MStatfs*>(m));
+    handle_statfs(op);
     return true;
   case MSG_GETPOOLSTATS:
-    return preprocess_getpoolstats(static_cast<MGetPoolStats*>(m));
+    return preprocess_getpoolstats(op);
     
   case MSG_PGSTATS:
-    return preprocess_pg_stats(static_cast<MPGStats*>(m));
+    return preprocess_pg_stats(op);
 
   case MSG_MON_COMMAND:
-    return preprocess_command(static_cast<MMonCommand*>(m));
+    return preprocess_command(op);
 
 
   default:
     assert(0);
-    m->put();
     return true;
   }
 }
 
-bool PGMonitor::prepare_update(PaxosServiceMessage *m)
+bool PGMonitor::prepare_update(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_PGSTATS:
-    return prepare_pg_stats((MPGStats*)m);
+    return prepare_pg_stats(op);
 
   case MSG_MON_COMMAND:
-    return prepare_command(static_cast<MMonCommand*>(m));
+    return prepare_command(op);
 
   default:
     assert(0);
-    m->put();
     return false;
   }
 }
 
-void PGMonitor::handle_statfs(MStatfs *statfs)
+void PGMonitor::handle_statfs(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  MStatfs *statfs = static_cast<MStatfs*>(op->get_req());
   // check caps
   MonSession *session = statfs->get_session();
   if (!session)
-    goto out;
+    return;
   if (!session->is_capable("pg", MON_CAP_R)) {
     dout(0) << "MStatfs received from entity with insufficient privileges "
 	    << session->caps << dendl;
-    goto out;
+    return;
   }
   MStatfsReply *reply;
 
@@ -633,7 +637,7 @@ void PGMonitor::handle_statfs(MStatfs *statfs)
 
   if (statfs->fsid != mon->monmap->fsid) {
     dout(0) << "handle_statfs on fsid " << statfs->fsid << " != " << mon->monmap->fsid << dendl;
-    goto out;
+    return;
   }
 
   // fill out stfs
@@ -646,13 +650,13 @@ void PGMonitor::handle_statfs(MStatfs *statfs)
   reply->h.st.num_objects = pg_map.pg_sum.stats.sum.num_objects;
 
   // reply
-  mon->send_reply(statfs, reply);
- out:
-  statfs->put();
+  mon->send_reply(op, reply);
 }
 
-bool PGMonitor::preprocess_getpoolstats(MGetPoolStats *m)
+bool PGMonitor::preprocess_getpoolstats(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  MGetPoolStats *m = static_cast<MGetPoolStats*>(op->get_req());
   MGetPoolStatsReply *reply;
 
   MonSession *session = m->get_session();
@@ -682,27 +686,26 @@ bool PGMonitor::preprocess_getpoolstats(MGetPoolStats *m)
     reply->pool_stats[*p] = pg_map.pg_pool_sum[poolid];
   }
 
-  mon->send_reply(m, reply);
+  mon->send_reply(op, reply);
 
  out:
-  m->put();
   return true;
 }
 
 
-bool PGMonitor::preprocess_pg_stats(MPGStats *stats)
+bool PGMonitor::preprocess_pg_stats(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  MPGStats *stats = static_cast<MPGStats*>(op->get_req());
   // check caps
   MonSession *session = stats->get_session();
   if (!session) {
     dout(10) << "PGMonitor::preprocess_pg_stats: no monitor session!" << dendl;
-    stats->put();
     return true;
   }
   if (!session->is_capable("pg", MON_CAP_R)) {
     derr << "PGMonitor::preprocess_pg_stats: MPGStats received from entity "
          << "with insufficient privileges " << session->caps << dendl;
-    stats->put();
     return true;
   }
 
@@ -711,7 +714,7 @@ bool PGMonitor::preprocess_pg_stats(MPGStats *stats)
   if (stats->had_map_for > 30.0 && 
       mon->osdmon()->is_readable() &&
       stats->epoch < mon->osdmon()->osdmap.get_epoch())
-    mon->osdmon()->send_latest_now_nodelete(stats, stats->epoch+1);
+    mon->osdmon()->send_latest_now_nodelete(op, stats->epoch+1);
 
   // Always forward the PGStats to the leader, even if they are the same as
   // the old PGStats. The leader will mark as down osds that haven't sent
@@ -742,14 +745,15 @@ bool PGMonitor::pg_stats_have_changed(int from, const MPGStats *stats) const
   return false;
 }
 
-bool PGMonitor::prepare_pg_stats(MPGStats *stats) 
+bool PGMonitor::prepare_pg_stats(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  MPGStats *stats = static_cast<MPGStats*>(op->get_req());
   dout(10) << "prepare_pg_stats " << *stats << " from " << stats->get_orig_source() << dendl;
   int from = stats->get_orig_source().num();
 
   if (stats->fsid != mon->monmap->fsid) {
     dout(0) << "prepare_pg_stats on fsid " << stats->fsid << " != " << mon->monmap->fsid << dendl;
-    stats->put();
     return false;
   }
 
@@ -759,7 +763,6 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
       !mon->osdmon()->osdmap.is_up(from) ||
       stats->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
     dout(1) << " ignoring stats from non-active osd." << dendl;
-    stats->put();
     return false;
   }
       
@@ -772,8 +775,7 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
 	 ++p) {
       ack->pg_stat[p->first] = make_pair(p->second.reported_seq, p->second.reported_epoch);
     }
-    mon->send_reply(stats, ack);
-    stats->put();
+    mon->send_reply(op, ack);
     return false;
   }
 
@@ -791,6 +793,7 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
 
   // pg stats
   MPGStatsAck *ack = new MPGStatsAck;
+  MonOpRequestRef ack_op = mon->op_tracker.create_request<MonOpRequest>(ack);
   ack->set_tid(stats->get_tid());
   for (map<pg_t,pg_stat_t>::iterator p = stats->pg_stat.begin();
        p != stats->pg_stat.end();
@@ -835,15 +838,19 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
     */
   }
   
-  wait_for_finished_proposal(new C_Stats(this, stats, ack));
+  wait_for_finished_proposal(op, new C_Stats(this, op, ack_op));
   return true;
 }
 
-void PGMonitor::_updated_stats(MPGStats *req, MPGStatsAck *ack)
+void PGMonitor::_updated_stats(MonOpRequestRef op, MonOpRequestRef ack_op)
 {
-  dout(7) << "_updated_stats for " << req->get_orig_source_inst() << dendl;
-  mon->send_reply(req, ack);
-  req->put();
+  op->mark_pgmon_event(__func__);
+  ack_op->mark_pgmon_event(__func__);
+  MPGStats *ack = static_cast<MPGStats*>(ack_op->get_req());
+  ack->get();  // MonOpRequestRef owns one ref; give the other to send_reply.
+  dout(7) << "_updated_stats for "
+          << op->get_req()->get_orig_source_inst() << dendl;
+  mon->send_reply(op, ack);
 }
 
 
@@ -873,13 +880,13 @@ void PGMonitor::check_osd_map(epoch_t epoch)
 
   if (!mon->osdmon()->is_readable()) {
     dout(10) << "check_osd_map -- osdmap not readable, waiting" << dendl;
-    mon->osdmon()->wait_for_readable(new RetryCheckOSDMap(this, epoch));
+    mon->osdmon()->wait_for_readable_ctx(new RetryCheckOSDMap(this, epoch));
     return;
   }
 
   if (!is_writeable()) {
     dout(10) << "check_osd_map -- pgmap not writeable, waiting" << dendl;
-    wait_for_writeable(new RetryCheckOSDMap(this, epoch));
+    wait_for_writeable_ctx(new RetryCheckOSDMap(this, epoch));
     return;
   }
 
@@ -1080,13 +1087,12 @@ bool PGMonitor::register_new_pgs()
     }
   }
 
+  // we don't want to redo this work if we can avoid it.
+  pending_inc.pg_scan = epoch;
+
   dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
 	   << removed << " uncreated pgs" << dendl;
-  if (created || removed) {
-    pending_inc.pg_scan = epoch;
-    return true;
-  }
-  return false;
+  return (created || removed);
 }
 
 void PGMonitor::map_pg_creates()
@@ -1193,7 +1199,7 @@ void PGMonitor::send_pg_creates(int osd, Connection *con)
     m->mkpg[*q] = pg_create_t(pg_map.pg_stat[*q].created,
 			      pg_map.pg_stat[*q].parent,
 			      pg_map.pg_stat[*q].parent_split_bits);
-    // Need the create time from the monitor using his clock to set last_scrub_stamp
+    // Need the create time from the monitor using its clock to set last_scrub_stamp
     // upon pg creation.
     m->ctimes[*q] = pg_map.pg_stat[*q].last_scrub_stamp;
   }
@@ -1252,7 +1258,7 @@ inline string percentify(const float& a) {
 //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
 void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
 				     object_stat_sum_t &sum, uint64_t avail,
-				     bool verbose)
+				     bool verbose) const
 {
   if (f) {
     f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
@@ -1283,7 +1289,7 @@ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
   }
 }
 
-int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno)
+int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno) const
 {
   map<int,float> wm;
   int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
@@ -1406,7 +1412,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
   }
 }
 
-void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose)
+void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose) const
 {
   if (f) {
     f->open_object_section("stats");
@@ -1446,7 +1452,7 @@ void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose)
 }
 
 
-void PGMonitor::dump_info(Formatter *f)
+void PGMonitor::dump_info(Formatter *f) const
 {
   f->open_object_section("pgmap");
   pg_map.dump(f);
@@ -1456,8 +1462,10 @@ void PGMonitor::dump_info(Formatter *f)
   f->dump_unsigned("pgmap_last_committed", get_last_committed());
 }
 
-bool PGMonitor::preprocess_command(MMonCommand *m)
+bool PGMonitor::preprocess_command(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   int r = -1;
   bufferlist rdata;
   stringstream ss, ds;
@@ -1467,7 +1475,7 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     // ss has reason for failure
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, rdata, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
     return true;
   }
 
@@ -1476,7 +1484,7 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", rdata, get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
     return true;
   }
 
@@ -1508,7 +1516,7 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
       r = -ENOENT;
       ss << "pool " << poolstr << " does not exist";
       string rs = ss.str();
-      mon->reply_command(m, r, rs, get_last_committed());
+      mon->reply_command(op, r, rs, get_last_committed());
       return true;
     }
     cmd_putval(g_ceph_context, cmdmap, "pool", pool);
@@ -1778,12 +1786,14 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
   string rs;
   getline(ss, rs);
   rdata.append(ds);
-  mon->reply_command(m, r, rs, rdata, get_last_committed());
+  mon->reply_command(op, r, rs, rdata, get_last_committed());
   return true;
 }
 
-bool PGMonitor::prepare_command(MMonCommand *m)
+bool PGMonitor::prepare_command(MonOpRequestRef op)
 {
+  op->mark_pgmon_event(__func__);
+  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
   stringstream ss;
   pg_t pgid;
   epoch_t epoch = mon->osdmon()->osdmap.get_epoch();
@@ -1794,7 +1804,7 @@ bool PGMonitor::prepare_command(MMonCommand *m)
   if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     // ss has reason for failure
     string rs = ss.str();
-    mon->reply_command(m, -EINVAL, rs, get_last_committed());
+    mon->reply_command(op, -EINVAL, rs, get_last_committed());
     return true;
   }
 
@@ -1803,7 +1813,7 @@ bool PGMonitor::prepare_command(MMonCommand *m)
 
   MonSession *session = m->get_session();
   if (!session) {
-    mon->reply_command(m, -EACCES, "access denied", get_last_committed());
+    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
     return true;
   }
 
@@ -1857,13 +1867,13 @@ bool PGMonitor::prepare_command(MMonCommand *m)
   getline(ss, rs);
   if (r < 0 && rs.length() == 0)
     rs = cpp_strerror(r);
-  mon->reply_command(m, r, rs, get_last_committed());
+  mon->reply_command(op, r, rs, get_last_committed());
   return false;
 
  update:
   getline(ss, rs);
-  wait_for_finished_proposal(new Monitor::C_Command(mon, m, r, rs,
-						    get_last_committed() + 1));
+  wait_for_finished_proposal(op, new Monitor::C_Command(
+        mon, op, r, rs, get_last_committed() + 1));
   return true;
 }
 
@@ -2061,7 +2071,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
   // slow requests
   if (g_conf->mon_osd_max_op_age > 0 &&
       pg_map.osd_sum.op_queue_age_hist.upper_bound() > g_conf->mon_osd_max_op_age) {
-    unsigned sum = _warn_slow_request_histogram(pg_map.osd_sum.op_queue_age_hist, "", summary, detail);
+    unsigned sum = _warn_slow_request_histogram(pg_map.osd_sum.op_queue_age_hist, "", summary, NULL);
     if (sum > 0) {
       ostringstream ss;
       ss << sum << " requests are blocked > " << g_conf->mon_osd_max_op_age << " sec";
@@ -2112,7 +2122,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
     uint64_t ratio = p->second.cache_target_full_ratio_micro +
       ((1000000 - p->second.cache_target_full_ratio_micro) *
        g_conf->mon_cache_target_full_warn_ratio);
-    if (p->second.target_max_objects && (uint64_t)st.stats.sum.num_objects >
+    if (p->second.target_max_objects && (uint64_t)(st.stats.sum.num_objects - st.stats.sum.num_objects_hit_set_archive) >
 	p->second.target_max_objects * (ratio / 1000000.0)) {
       nearfull = true;
       if (detail) {
@@ -2124,12 +2134,12 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
 	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
       }
     }
-    if (p->second.target_max_bytes && (uint64_t)st.stats.sum.num_bytes >
+    if (p->second.target_max_bytes && (uint64_t)(st.stats.sum.num_bytes - st.stats.sum.num_bytes_hit_set_archive) >
 	p->second.target_max_bytes * (ratio / 1000000.0)) {
       nearfull = true;
       if (detail) {
 	ostringstream ss;
-	ss << "cache pool '" << mon->osdmon()->osdmap.get_pool_name(p->first)
+	ss << "cache pool '" << name
 	   << "' with " << si_t(st.stats.sum.num_bytes)
 	   << "B at/near target max "
 	   << si_t(p->second.target_max_bytes) << "B";
@@ -2158,7 +2168,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
   int sum_pg_up = MAX(pg_map.pg_sum.up, static_cast<int32_t>(pg_map.pg_stat.size()));
   if (num_in && g_conf->mon_pg_warn_min_per_osd > 0) {
     int per = sum_pg_up / num_in;
-    if (per < g_conf->mon_pg_warn_min_per_osd) {
+    if (per < g_conf->mon_pg_warn_min_per_osd && per) {
       ostringstream ss;
       ss << "too few PGs per OSD (" << per << " < min " << g_conf->mon_pg_warn_min_per_osd << ")";
       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
@@ -2183,9 +2193,10 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       const pg_pool_t *pi = mon->osdmon()->osdmap.get_pg_pool(p->first);
       if (!pi)
 	continue;   // in case osdmap changes haven't propagated to PGMap yet
+      const string& name = mon->osdmon()->osdmap.get_pool_name(p->first);
       if (pi->get_pg_num() > pi->get_pgp_num()) {
 	ostringstream ss;
-	ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " pg_num "
+	ss << "pool " << name << " pg_num "
 	   << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
 	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
 	if (detail)
@@ -2200,11 +2211,11 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
 	if (g_conf->mon_pg_warn_max_object_skew > 0 &&
 	    ratio > g_conf->mon_pg_warn_max_object_skew) {
 	  ostringstream ss;
-	  ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " has too few pgs";
+	  ss << "pool " << name << " has too few pgs";
 	  summary.push_back(make_pair(HEALTH_WARN, ss.str()));
 	  if (detail) {
 	    ostringstream ss;
-	    ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " objects per pg ("
+	    ss << "pool " << name << " objects per pg ("
 	       << objects_per_pg << ") is more than " << ratio << " times cluster average ("
 	       << average_objects_per_pg << ")";
 	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 97a9ac1..cb725a6 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -74,40 +74,40 @@ private:
   void read_pgmap_full();
   void apply_pgmap_delta(bufferlist& bl);
 
-  bool preprocess_query(PaxosServiceMessage *m);  // true if processed.
-  bool prepare_update(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op);  // true if processed.
+  bool prepare_update(MonOpRequestRef op);
 
-  bool preprocess_pg_stats(MPGStats *stats);
+  bool preprocess_pg_stats(MonOpRequestRef op);
   bool pg_stats_have_changed(int from, const MPGStats *stats) const;
-  bool prepare_pg_stats(MPGStats *stats);
-  void _updated_stats(MPGStats *req, MPGStatsAck *ack);
+  bool prepare_pg_stats(MonOpRequestRef op);
+  void _updated_stats(MonOpRequestRef op, MonOpRequestRef ack_op);
 
-  struct C_Stats : public Context {
+  struct C_Stats : public C_MonOp {
     PGMonitor *pgmon;
-    MPGStats *req;
-    MPGStatsAck *ack;
+    MonOpRequestRef stats_op_ack;
     entity_inst_t who;
-    C_Stats(PGMonitor *p, MPGStats *r, MPGStatsAck *a) : pgmon(p), req(r), ack(a) {}
-    void finish(int r) {
+    C_Stats(PGMonitor *p,
+            MonOpRequestRef op,
+            MonOpRequestRef op_ack)
+      : C_MonOp(op), pgmon(p), stats_op_ack(op_ack) {}
+    void _finish(int r) {
       if (r >= 0) {
-	pgmon->_updated_stats(req, ack);
+	pgmon->_updated_stats(op, stats_op_ack);
       } else if (r == -ECANCELED) {
-	req->put();
-	ack->put();
+        return;
       } else if (r == -EAGAIN) {
-	pgmon->dispatch(req);
-	ack->put();
+	pgmon->dispatch(op);
       } else {
 	assert(0 == "bad C_Stats return value");
       }
     }    
   };
 
-  void handle_statfs(MStatfs *statfs);
-  bool preprocess_getpoolstats(MGetPoolStats *m);
+  void handle_statfs(MonOpRequestRef op);
+  bool preprocess_getpoolstats(MonOpRequestRef op);
 
-  bool preprocess_command(MMonCommand *m);
-  bool prepare_command(MMonCommand *m);
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
 
   map<int,utime_t> last_sent_pg_create;  // per osd throttle
 
@@ -148,9 +148,9 @@ private:
   void dump_object_stat_sum(TextTable &tbl, Formatter *f,
                             object_stat_sum_t &sum,
 			    uint64_t avail,
-			    bool verbose);
+			    bool verbose) const;
 
-  int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
+  int64_t get_rule_avail(OSDMap& osdmap, int ruleno) const;
 
 public:
   PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
@@ -190,9 +190,9 @@ public:
   void check_osd_map(epoch_t epoch);
 
   void dump_pool_stats(stringstream &ss, Formatter *f, bool verbose);
-  void dump_fs_stats(stringstream &ss, Formatter *f, bool verbose);
+  void dump_fs_stats(stringstream &ss, Formatter *f, bool verbose) const;
 
-  void dump_info(Formatter *f);
+  void dump_info(Formatter *f) const;
 
   int _warn_slow_request_histogram(const pow2_hist_t& h, string suffix,
 				   list<pair<health_status_t,string> >& summary,
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 297ea17..e0c8039 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -77,37 +77,39 @@ void Paxos::init()
 void Paxos::init_logger()
 {
   PerfCountersBuilder pcb(g_ceph_context, "paxos", l_paxos_first, l_paxos_last);
-  pcb.add_u64_counter(l_paxos_start_leader, "start_leader");
-  pcb.add_u64_counter(l_paxos_start_peon, "start_peon");
-  pcb.add_u64_counter(l_paxos_restart, "restart");
-  pcb.add_u64_counter(l_paxos_refresh, "refresh");
-  pcb.add_time_avg(l_paxos_refresh_latency, "refresh_latency");
-  pcb.add_u64_counter(l_paxos_begin, "begin");
-  pcb.add_u64_avg(l_paxos_begin_keys, "begin_keys");
-  pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes");
-  pcb.add_time_avg(l_paxos_begin_latency, "begin_latency");
-  pcb.add_u64_counter(l_paxos_commit, "commit");
-  pcb.add_u64_avg(l_paxos_commit_keys, "commit_keys");
-  pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes");
-  pcb.add_time_avg(l_paxos_commit_latency, "commit_latency");
-  pcb.add_u64_counter(l_paxos_collect, "collect");
-  pcb.add_u64_avg(l_paxos_collect_keys, "collect_keys");
-  pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes");
-  pcb.add_time_avg(l_paxos_collect_latency, "collect_latency");
-  pcb.add_u64_counter(l_paxos_collect_uncommitted, "collect_uncommitted");
-  pcb.add_u64_counter(l_paxos_collect_timeout, "collect_timeout");
-  pcb.add_u64_counter(l_paxos_accept_timeout, "accept_timeout");
-  pcb.add_u64_counter(l_paxos_lease_ack_timeout, "lease_ack_timeout");
-  pcb.add_u64_counter(l_paxos_lease_timeout, "lease_timeout");
-  pcb.add_u64_counter(l_paxos_store_state, "store_state");
-  pcb.add_u64_avg(l_paxos_store_state_keys, "store_state_keys");
-  pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes");
-  pcb.add_time_avg(l_paxos_store_state_latency, "store_state_latency");
-  pcb.add_u64_counter(l_paxos_share_state, "share_state");
-  pcb.add_u64_avg(l_paxos_share_state_keys, "share_state_keys");
-  pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes");
-  pcb.add_u64_counter(l_paxos_new_pn, "new_pn");
-  pcb.add_time_avg(l_paxos_new_pn_latency, "new_pn_latency");
+  pcb.add_u64_counter(l_paxos_start_leader, "start_leader", "Starts in leader role");
+  pcb.add_u64_counter(l_paxos_start_peon, "start_peon", "Starts in peon role");
+  pcb.add_u64_counter(l_paxos_restart, "restart", "Restarts");
+  pcb.add_u64_counter(l_paxos_refresh, "refresh", "Refreshes");
+  pcb.add_time_avg(l_paxos_refresh_latency, "refresh_latency", "Refresh latency");
+  pcb.add_u64_counter(l_paxos_begin, "begin", "Started and handled begins");
+  pcb.add_u64_avg(l_paxos_begin_keys, "begin_keys", "Keys in transaction on begin");
+  pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes", "Data in transaction on begin");
+  pcb.add_time_avg(l_paxos_begin_latency, "begin_latency", "Latency of begin operation");
+  pcb.add_u64_counter(l_paxos_commit, "commit",
+      "Commits", "cmt");
+  pcb.add_u64_avg(l_paxos_commit_keys, "commit_keys", "Keys in transaction on commit");
+  pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes", "Data in transaction on commit");
+  pcb.add_time_avg(l_paxos_commit_latency, "commit_latency",
+      "Commit latency", "clat");
+  pcb.add_u64_counter(l_paxos_collect, "collect", "Peon collects");
+  pcb.add_u64_avg(l_paxos_collect_keys, "collect_keys", "Keys in transaction on peon collect");
+  pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes", "Data in transaction on peon collect");
+  pcb.add_time_avg(l_paxos_collect_latency, "collect_latency", "Peon collect latency");
+  pcb.add_u64_counter(l_paxos_collect_uncommitted, "collect_uncommitted", "Uncommitted values in started and handled collects");
+  pcb.add_u64_counter(l_paxos_collect_timeout, "collect_timeout", "Collect timeouts");
+  pcb.add_u64_counter(l_paxos_accept_timeout, "accept_timeout", "Accept timeouts");
+  pcb.add_u64_counter(l_paxos_lease_ack_timeout, "lease_ack_timeout", "Lease acknowledgement timeouts");
+  pcb.add_u64_counter(l_paxos_lease_timeout, "lease_timeout", "Lease timeouts");
+  pcb.add_u64_counter(l_paxos_store_state, "store_state", "Store a shared state on disk");
+  pcb.add_u64_avg(l_paxos_store_state_keys, "store_state_keys", "Keys in transaction in stored state");
+  pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes", "Data in transaction in stored state");
+  pcb.add_time_avg(l_paxos_store_state_latency, "store_state_latency", "Storing state latency");
+  pcb.add_u64_counter(l_paxos_share_state, "share_state", "Sharings of state");
+  pcb.add_u64_avg(l_paxos_share_state_keys, "share_state_keys", "Keys in shared state");
+  pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes", "Data in shared state");
+  pcb.add_u64_counter(l_paxos_new_pn, "new_pn", "New proposal number queries");
+  pcb.add_time_avg(l_paxos_new_pn_latency, "new_pn_latency", "New proposal number getting latency");
   logger = pcb.create_perf_counters();
   g_ceph_context->get_perfcounters_collection()->add(logger);
 }
@@ -185,13 +187,18 @@ void Paxos::collect(version_t oldpn)
 
   // set timeout event
   collect_timeout_event = new C_CollectTimeout(this);
-  mon->timer.add_event_after(g_conf->mon_accept_timeout, collect_timeout_event);
+  mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
+			     g_conf->mon_lease,
+			     collect_timeout_event);
 }
 
 
 // peon
-void Paxos::handle_collect(MMonPaxos *collect)
+void Paxos::handle_collect(MonOpRequestRef op)
 {
+  op->mark_paxos_event("handle_collect");
+
+  MMonPaxos *collect = static_cast<MMonPaxos*>(op->get_req());
   dout(10) << "handle_collect " << *collect << dendl;
 
   assert(mon->is_peon()); // mon epoch filter should catch strays
@@ -204,7 +211,7 @@ void Paxos::handle_collect(MMonPaxos *collect)
             << " leader's lowest version is too high for our last committed"
             << " (theirs: " << collect->first_committed
             << "; ours: " << last_committed << ") -- bootstrap!" << dendl;
-    collect->put();
+    op->mark_paxos_event("need to bootstrap");
     mon->bootstrap();
     return;
   }
@@ -284,13 +291,12 @@ void Paxos::handle_collect(MMonPaxos *collect)
 
   // send reply
   collect->get_connection()->send_message(last);
-  collect->put();
 }
 
 /**
  * @note This is Okay. We share our versions between peer_last_committed and
  *	 our last_committed (inclusive), and add their bufferlists to the
- *	 message. It will be the peer's job to apply them to his store, as
+ *	 message. It will be the peer's job to apply them to its store, as
  *	 these bufferlists will contain raw transactions.
  *	 This function is called by both the Peon and the Leader. The Peon will
  *	 share the state with the Leader during handle_collect(), sharing any
@@ -447,8 +453,10 @@ void Paxos::_sanity_check_store()
 
 
 // leader
-void Paxos::handle_last(MMonPaxos *last)
+void Paxos::handle_last(MonOpRequestRef op)
 {
+  op->mark_paxos_event("handle_last");
+  MMonPaxos *last = static_cast<MMonPaxos*>(op->get_req());
   bool need_refresh = false;
   int from = last->get_source().num();
 
@@ -456,7 +464,6 @@ void Paxos::handle_last(MMonPaxos *last)
 
   if (!mon->is_leader()) {
     dout(10) << "not leader, dropping" << dendl;
-    last->put();
     return;
   }
 
@@ -471,7 +478,7 @@ void Paxos::handle_last(MMonPaxos *last)
 	    << " lowest version is too high for our last committed"
             << " (theirs: " << last->first_committed
             << "; ours: " << last_committed << ") -- bootstrap!" << dendl;
-    last->put();
+    op->mark_paxos_event("need to bootstrap");
     mon->bootstrap();
     return;
   }
@@ -493,7 +500,7 @@ void Paxos::handle_last(MMonPaxos *last)
 	      << " last_committed (" << p->second
 	      << ") is too low for our first_committed (" << first_committed
 	      << ") -- bootstrap!" << dendl;
-      last->put();
+      op->mark_paxos_event("need to bootstrap");
       mon->bootstrap();
       return;
     }
@@ -578,8 +585,6 @@ void Paxos::handle_last(MMonPaxos *last)
 
   if (need_refresh)
     (void)do_refresh();
-
-  last->put();
 }
 
 void Paxos::collect_timeout()
@@ -683,18 +688,22 @@ void Paxos::begin(bufferlist& v)
 
   // set timeout event
   accept_timeout_event = new C_AcceptTimeout(this);
-  mon->timer.add_event_after(g_conf->mon_accept_timeout, accept_timeout_event);
+  mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
+			     g_conf->mon_lease,
+			     accept_timeout_event);
 }
 
 // peon
-void Paxos::handle_begin(MMonPaxos *begin)
+void Paxos::handle_begin(MonOpRequestRef op)
 {
+  op->mark_paxos_event("handle_begin");
+  MMonPaxos *begin = static_cast<MMonPaxos*>(op->get_req());
   dout(10) << "handle_begin " << *begin << dendl;
 
   // can we accept this?
   if (begin->pn < accepted_pn) {
     dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
-    begin->put();
+    op->mark_paxos_event("have higher pn, ignore");
     return;
   }
   assert(begin->pn == accepted_pn);
@@ -742,25 +751,27 @@ void Paxos::handle_begin(MMonPaxos *begin)
   accept->pn = accepted_pn;
   accept->last_committed = last_committed;
   begin->get_connection()->send_message(accept);
-  
-  begin->put();
 }
 
 // leader
-void Paxos::handle_accept(MMonPaxos *accept)
+void Paxos::handle_accept(MonOpRequestRef op)
 {
+  op->mark_paxos_event("handle_accept");
+  MMonPaxos *accept = static_cast<MMonPaxos*>(op->get_req());
   dout(10) << "handle_accept " << *accept << dendl;
   int from = accept->get_source().num();
 
   if (accept->pn != accepted_pn) {
     // we accepted a higher pn, from some other leader
     dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
-    goto out;
+    op->mark_paxos_event("have higher pn, ignore");
+    return;
   }
   if (last_committed > 0 &&
       accept->last_committed < last_committed-1) {
     dout(10) << " this is from an old round, ignoring" << dendl;
-    goto out;
+    op->mark_paxos_event("old round, ignore");
+    return;
   }
   assert(accept->last_committed == last_committed ||   // not committed
 	 accept->last_committed == last_committed-1);  // committed
@@ -780,11 +791,9 @@ void Paxos::handle_accept(MMonPaxos *accept)
   if (accepted == mon->get_quorum()) {
     // yay, commit!
     dout(10) << " got majority, committing, done with update" << dendl;
+    op->mark_paxos_event("commit_start");
     commit_start();
   }
-
- out:
-  accept->put();
 }
 
 void Paxos::accept_timeout()
@@ -915,8 +924,10 @@ void Paxos::commit_finish()
 }
 
 
-void Paxos::handle_commit(MMonPaxos *commit)
+void Paxos::handle_commit(MonOpRequestRef op)
 {
+  op->mark_paxos_event("handle_commit");
+  MMonPaxos *commit = static_cast<MMonPaxos*>(op->get_req());
   dout(10) << "handle_commit on " << commit->last_committed << dendl;
 
   logger->inc(l_paxos_commit);
@@ -924,17 +935,15 @@ void Paxos::handle_commit(MMonPaxos *commit)
   if (!mon->is_peon()) {
     dout(10) << "not a peon, dropping" << dendl;
     assert(0);
-    commit->put();
     return;
   }
 
+  op->mark_paxos_event("store_state");
   store_state(commit);
 
   if (do_refresh()) {
     finish_contexts(g_ceph_context, waiting_for_commit);
   }
-
-  commit->put();
 }
 
 void Paxos::extend_lease()
@@ -967,7 +976,8 @@ void Paxos::extend_lease()
   //  if old timeout is still in place, leave it.
   if (!lease_ack_timeout_event) {
     lease_ack_timeout_event = new C_LeaseAckTimeout(this);
-    mon->timer.add_event_after(g_conf->mon_lease_ack_timeout, 
+    mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
+			       g_conf->mon_lease,
 			       lease_ack_timeout_event);
   }
 
@@ -975,7 +985,7 @@ void Paxos::extend_lease()
   lease_renew_event = new C_LeaseRenew(this);
   utime_t at = lease_expire;
   at -= g_conf->mon_lease;
-  at += g_conf->mon_lease_renew_interval;
+  at += g_conf->mon_lease_renew_interval_factor * g_conf->mon_lease;
   mon->timer.add_event_at(at, lease_renew_event);
 }
 
@@ -1059,14 +1069,16 @@ void Paxos::finish_round()
 
 
 // peon
-void Paxos::handle_lease(MMonPaxos *lease)
+void Paxos::handle_lease(MonOpRequestRef op)
 {
+  op->mark_paxos_event("handle_lease");
+  MMonPaxos *lease = static_cast<MMonPaxos*>(op->get_req());
   // sanity
   if (!mon->is_peon() ||
       last_committed != lease->last_committed) {
     dout(10) << "handle_lease i'm not a peon, or they're not the leader,"
 	     << " or the last_committed doesn't match, dropping" << dendl;
-    lease->put();
+    op->mark_paxos_event("invalid lease, ignore");
     return;
   }
 
@@ -1103,12 +1115,12 @@ void Paxos::handle_lease(MMonPaxos *lease)
   finish_contexts(g_ceph_context, waiting_for_active);
   if (is_readable())
     finish_contexts(g_ceph_context, waiting_for_readable);
-
-  lease->put();
 }
 
-void Paxos::handle_lease_ack(MMonPaxos *ack)
+void Paxos::handle_lease_ack(MonOpRequestRef op)
 {
+  op->mark_paxos_event("handle_lease_ack");
+  MMonPaxos *ack = static_cast<MMonPaxos*>(op->get_req());
   int from = ack->get_source().num();
 
   if (!lease_ack_timeout_event) {
@@ -1138,8 +1150,6 @@ void Paxos::handle_lease_ack(MMonPaxos *ack)
   }
 
   warn_on_future_time(ack->sent_timestamp, ack->get_source());
-
-  ack->put();
 }
 
 void Paxos::lease_ack_timeout()
@@ -1158,7 +1168,9 @@ void Paxos::reset_lease_timeout()
   if (lease_timeout_event)
     mon->timer.cancel_event(lease_timeout_event);
   lease_timeout_event = new C_LeaseTimeout(this);
-  mon->timer.add_event_after(g_conf->mon_lease_ack_timeout, lease_timeout_event);
+  mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
+			     g_conf->mon_lease,
+			     lease_timeout_event);
 }
 
 void Paxos::lease_timeout()
@@ -1361,12 +1373,14 @@ void Paxos::restart()
 }
 
 
-void Paxos::dispatch(PaxosServiceMessage *m)
+void Paxos::dispatch(MonOpRequestRef op)
 {
+  assert(op->is_type_paxos());
+  op->mark_paxos_event("dispatch");
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   // election in progress?
   if (!mon->is_leader() && !mon->is_peon()) {
     dout(5) << "election in progress, dropping " << *m << dendl;
-    m->put();
     return;    
   }
 
@@ -1384,25 +1398,25 @@ void Paxos::dispatch(PaxosServiceMessage *m)
       switch (pm->op) {
 	// learner
       case MMonPaxos::OP_COLLECT:
-	handle_collect(pm);
+	handle_collect(op);
 	break;
       case MMonPaxos::OP_LAST:
-	handle_last(pm);
+	handle_last(op);
 	break;
       case MMonPaxos::OP_BEGIN:
-	handle_begin(pm);
+	handle_begin(op);
 	break;
       case MMonPaxos::OP_ACCEPT:
-	handle_accept(pm);
+	handle_accept(op);
 	break;		
       case MMonPaxos::OP_COMMIT:
-	handle_commit(pm);
+	handle_commit(op);
 	break;
       case MMonPaxos::OP_LEASE:
-	handle_lease(pm);
+	handle_lease(op);
 	break;
       case MMonPaxos::OP_LEASE_ACK:
-	handle_lease_ack(pm);
+	handle_lease_ack(op);
 	break;
       default:
 	assert(0);
@@ -1481,7 +1495,6 @@ void Paxos::propose_pending()
 
   bufferlist bl;
   pending_proposal->encode(bl);
-  pending_proposal.reset();
 
   dout(10) << __func__ << " " << (last_committed + 1)
 	   << " " << bl.length() << " bytes" << dendl;
@@ -1491,6 +1504,8 @@ void Paxos::propose_pending()
   f.flush(*_dout);
   *_dout << dendl;
 
+  pending_proposal.reset();
+
   committing_finishers.swap(pending_finishers);
   state = STATE_UPDATING;
   begin(bl);
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index 457c8af..9b9a732 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -122,6 +122,7 @@ e 12v
 #include <errno.h>
 
 #include "MonitorDBStore.h"
+#include "mon/MonOpRequest.h"
 
 class Monitor;
 class MMonPaxos;
@@ -453,7 +454,7 @@ private:
    *
    * We use this variable to assess if the Leader should take into consideration
    * an uncommitted value sent by a Peon. Given that the Peon will send back to
-   * the Leader the last Proposal Number he accepted, the Leader will be able
+   * the Leader the last Proposal Number it accepted, the Leader will be able
    * to infer if this value is more recent than the one the Leader has, thus
    * more relevant.
    */
@@ -463,7 +464,7 @@ private:
    *
    * If the system fails in-between the accept replies from the Peons and the
    * instruction to commit from the Leader, then we may end up with accepted
-   * but yet-uncommitted values. During the Leader's recovery, he will attempt
+   * but yet-uncommitted values. During the Leader's recovery, it will attempt
    * to bring the whole system to the latest state, and that means committing
    * past accepted but uncommitted values.
    *
@@ -764,18 +765,18 @@ private:
    *
    * Once a Peon receives a collect message from the Leader it will reply
    * with its first and last committed versions, as well as information so
-   * the Leader may know if his Proposal Number was, or was not, accepted by
+   * the Leader may know if its Proposal Number was, or was not, accepted by
    * the Peon. The Peon will accept the Leader's Proposal Number iif it is
    * higher than the Peon's currently accepted Proposal Number. The Peon may
    * also inform the Leader of accepted but uncommitted values.
    *
    * @invariant The message is an operation of type OP_COLLECT.
    * @pre We are a Peon.
-   * @post Replied to the Leader, accepting or not accepting his PN.
+   * @post Replied to the Leader, accepting or not accepting its PN.
    *
    * @param collect The collect message sent by the Leader to the Peon.
    */
-  void handle_collect(MMonPaxos *collect);
+  void handle_collect(MonOpRequestRef op);
   /**
    * Handle a response from a Peon to the Leader's collect phase.
    *
@@ -806,7 +807,7 @@ private:
    *
    * @param last The message sent by the Peon to the Leader.
    */
-  void handle_last(MMonPaxos *last);
+  void handle_last(MonOpRequestRef op);
   /**
    * The Recovery Phase timed out, meaning that a significant part of the
    * quorum does not believe we are the Leader, and we thus should trigger new
@@ -859,7 +860,7 @@ private:
    * @pre We are a Peon
    * @pre We are on STATE_ACTIVE
    * @post We are on STATE_UPDATING iif we accept the Leader's proposal
-   * @post We send a reply message to the Leader iif we accept his proposal
+   * @post We send a reply message to the Leader iif we accept its proposal
    *
    * @invariant The received message is an operation of type OP_BEGIN
    *
@@ -867,7 +868,7 @@ private:
    *		  Paxos::begin function
    *
    */
-  void handle_begin(MMonPaxos *begin);
+  void handle_begin(MonOpRequestRef op);
   /**
    * Handle an Accept message sent by a Peon.
    *
@@ -892,7 +893,7 @@ private:
    * @param accept The message sent by the Peons to the Leader during the
    *		   Paxos::handle_begin function
    */
-  void handle_accept(MMonPaxos *accept);
+  void handle_accept(MonOpRequestRef op);
   /**
    * Trigger a fresh election.
    *
@@ -946,7 +947,7 @@ private:
    * @param commit The message sent by the Leader to the Peon during
    *		   Paxos::commit
    */
-  void handle_commit(MMonPaxos *commit);
+  void handle_commit(MonOpRequestRef op);
   /**
    * Extend the system's lease.
    *
@@ -987,10 +988,10 @@ private:
    *
    * @invariant The received message is an operation of type OP_LEASE
    *
-   * @param The message sent by the Leader to the Peon during the
+   * @param lease The message sent by the Leader to the Peon during the
    *	    Paxos::extend_lease function
    */
-  void handle_lease(MMonPaxos *lease);
+  void handle_lease(MonOpRequestRef op);
   /**
    * Account for all the Lease Acks the Leader receives from the Peons.
    *
@@ -1007,7 +1008,7 @@ private:
    * @param ack The message sent by a Peon to the Leader during the
    *		Paxos::handle_lease function
    */
-  void handle_lease_ack(MMonPaxos *ack);
+  void handle_lease_ack(MonOpRequestRef op);
   /**
    * Call fresh elections because at least one Peon didn't acked our lease.
    *
@@ -1084,7 +1085,8 @@ private:
 public:
   /**
    * @param m A monitor
-   * @param mid A machine id
+   * @param name A name for the paxos service. It serves as the naming space
+   * of the underlying persistent storage for this service.
    */
   Paxos(Monitor *m, const string &name) 
 		 : mon(m),
@@ -1110,7 +1112,7 @@ public:
     return paxos_name;
   }
 
-  void dispatch(PaxosServiceMessage *m);
+  void dispatch(MonOpRequestRef op);
 
   void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx,
 				     version_t from, version_t last);
@@ -1151,7 +1153,7 @@ public:
    * quorum, thus automatically assume we are on STATE_RECOVERING, which means
    * we will soon be enrolled into the Leader's collect phase.
    *
-   * @pre There is a Leader, and he's about to start the collect phase.
+   * @pre There is a Leader, and it's about to start the collect phase. 
    * @post We are on STATE_RECOVERING and will soon receive collect phase's 
    *	   messages.
    */
@@ -1224,9 +1226,15 @@ public:
    *
    * @param c A callback
    */
-  void wait_for_active(Context *c) {
+  void wait_for_active(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event("paxos:wait_for_active");
     waiting_for_active.push_back(c);
   }
+  void wait_for_active(Context *c) {
+    MonOpRequestRef o;
+    wait_for_active(o, c);
+  }
 
   /**
    * Trim the Paxos state as much as we can.
@@ -1272,7 +1280,7 @@ public:
    * Check if a given version is readable.
    *
    * A version may not be readable for a myriad of reasons:
-   *  @li the version @v is higher that the last committed version
+   *  @li the version @e v is higher that the last committed version
    *  @li we are not the Leader nor a Peon (election may be on-going)
    *  @li we do not have a committed value yet
    *  @li we do not have a valid lease
@@ -1282,7 +1290,7 @@ public:
    */
   bool is_readable(version_t seen=0);
   /**
-   * Read version @v and store its value in @bl
+   * Read version @e v and store its value in @e bl
    *
    * @param[in] v The version we want to read
    * @param[out] bl The version's value
@@ -1302,10 +1310,16 @@ public:
    *
    * @param onreadable A callback
    */
-  void wait_for_readable(Context *onreadable) {
+  void wait_for_readable(MonOpRequestRef op, Context *onreadable) {
     assert(!is_readable());
+    if (op)
+      op->mark_event("paxos:wait_for_readable");
     waiting_for_readable.push_back(onreadable);
   }
+  void wait_for_readable(Context *onreadable) {
+    MonOpRequestRef o;
+    wait_for_readable(o, onreadable);
+  }
   /**
    * @}
    */
@@ -1338,10 +1352,16 @@ public:
    *
    * @param c A callback
    */
-  void wait_for_writeable(Context *c) {
+  void wait_for_writeable(MonOpRequestRef op, Context *c) {
     assert(!is_writeable());
+    if (op)
+      op->mark_event("paxos:wait_for_writeable");
     waiting_for_writeable.push_back(c);
   }
+  void wait_for_writeable(Context *c) {
+    MonOpRequestRef o;
+    wait_for_writeable(o, c);
+  }
 
   /**
    * Get a transaction to submit operations to propose against
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 4bdffc2..141a28b 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -22,6 +22,8 @@
 #include "include/assert.h"
 #include "common/Formatter.h"
 
+#include "mon/MonOpRequest.h"
+
 #define dout_subsys ceph_subsys_paxos
 #undef dout_prefix
 #define dout_prefix _prefix(_dout, mon, paxos, service_name, get_first_committed(), get_last_committed())
@@ -32,12 +34,17 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, Paxos *paxos, string
 		<< ").paxosservice(" << service_name << " " << fc << ".." << lc << ") ";
 }
 
-bool PaxosService::dispatch(PaxosServiceMessage *m)
+bool PaxosService::dispatch(MonOpRequestRef op)
 {
-  dout(10) << "dispatch " << *m << " from " << m->get_orig_source_inst() << dendl;
+  assert(op->is_type_service() || op->is_type_command());
+  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
+  op->mark_event("psvc:dispatch");
+
+  dout(10) << "dispatch " << m << " " << *m
+	   << " from " << m->get_orig_source_inst()
+	   << " con " << m->get_connection() << dendl;
 
   if (mon->is_shutdown()) {
-    m->put();
     return true;
   }
 
@@ -46,7 +53,6 @@ bool PaxosService::dispatch(PaxosServiceMessage *m)
       m->rx_election_epoch < mon->get_epoch()) {
     dout(10) << " discarding forwarded message from previous election epoch "
 	     << m->rx_election_epoch << " < " << mon->get_epoch() << dendl;
-    m->put();
     return true;
   }
 
@@ -59,36 +65,35 @@ bool PaxosService::dispatch(PaxosServiceMessage *m)
       m->get_connection()->get_messenger() != NULL) {
     dout(10) << " discarding message from disconnected client "
 	     << m->get_source_inst() << " " << *m << dendl;
-    m->put();
     return true;
   }
 
   // make sure our map is readable and up to date
   if (!is_readable(m->version)) {
     dout(10) << " waiting for paxos -> readable (v" << m->version << ")" << dendl;
-    wait_for_readable(new C_RetryMessage(this, m), m->version);
+    wait_for_readable(op, new C_RetryMessage(this, op), m->version);
     return true;
   }
 
   // preprocess
-  if (preprocess_query(m)) 
+  if (preprocess_query(op)) 
     return true;  // easy!
 
   // leader?
   if (!mon->is_leader()) {
-    mon->forward_request_leader(m);
+    mon->forward_request_leader(op);
     return true;
   }
   
   // writeable?
   if (!is_writeable()) {
     dout(10) << " waiting for paxos -> writeable" << dendl;
-    wait_for_writeable(new C_RetryMessage(this, m));
+    wait_for_writeable(op, new C_RetryMessage(this, op));
     return true;
   }
 
   // update
-  if (prepare_update(m)) {
+  if (prepare_update(op)) {
     double delay = 0.0;
     if (should_propose(delay)) {
       if (delay == 0.0) {
@@ -268,7 +273,7 @@ void PaxosService::_active()
   }
   if (!is_active()) {
     dout(10) << "_active - not active" << dendl;
-    wait_for_active(new C_Active(this));
+    wait_for_active_ctx(new C_Active(this));
     return;
   }
   dout(10) << "_active" << dendl;
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index d2f6285..87bf04b 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -102,16 +102,16 @@ protected:
    * instance of this class onto the Paxos::wait_for_readable function, and
    * we will retry the whole dispatch again once the callback is fired.
    */
-  class C_RetryMessage : public Context {
+  class C_RetryMessage : public C_MonOp {
     PaxosService *svc;
-    PaxosServiceMessage *m;
   public:
-    C_RetryMessage(PaxosService *s, PaxosServiceMessage *m_) : svc(s), m(m_) {}
-    void finish(int r) {
+    C_RetryMessage(PaxosService *s, MonOpRequestRef op_) :
+      C_MonOp(op_), svc(s) { }
+    void _finish(int r) {
       if (r == -EAGAIN || r >= 0)
-	svc->dispatch(m);
+	svc->dispatch(op);
       else if (r == -ECANCELED)
-	m->put();
+        return;
       else
 	assert(0 == "bad C_RetryMessage return value");
     }
@@ -188,7 +188,7 @@ public:
   /**
    * @param mn A Monitor instance
    * @param p A Paxos instance
-   * @parem name Our service's name.
+   * @param name Our service's name.
    */
   PaxosService(Monitor *mn, Paxos *p, string name) 
     : mon(mn), paxos(p), service_name(name),
@@ -319,7 +319,7 @@ public:
    * @param m A message
    * @returns 'true' on successful dispatch; 'false' otherwise.
    */
-  bool dispatch(PaxosServiceMessage *m);
+  bool dispatch(MonOpRequestRef op);
 
   void refresh(bool *need_bootstrap);
   void post_refresh();
@@ -403,7 +403,7 @@ public:
    *	      answered, was a state change that has no effect); 'false' 
    *	      otherwise.
    */
-  virtual bool preprocess_query(PaxosServiceMessage *m) = 0;
+  virtual bool preprocess_query(MonOpRequestRef op) = 0;
 
   /**
    * Apply the message to the pending state.
@@ -414,7 +414,7 @@ public:
    * @returns 'true' if the update message was handled (e.g., a command that
    *	      went through); 'false' otherwise.
    */
-  virtual bool prepare_update(PaxosServiceMessage *m) = 0;
+  virtual bool prepare_update(MonOpRequestRef op) = 0;
   /**
    * @}
    */
@@ -609,21 +609,34 @@ public:
    *
    * @param c The callback to be awaken once the proposal is finished.
    */
-  void wait_for_finished_proposal(Context *c) {
+  void wait_for_finished_proposal(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_finished_proposal");
     waiting_for_finished_proposal.push_back(c);
   }
+  void wait_for_finished_proposal_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_finished_proposal(o, c);
+  }
 
   /**
    * Wait for us to become active
    *
    * @param c The callback to be awaken once we become active.
    */
-  void wait_for_active(Context *c) {
+  void wait_for_active(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_active");
+
     if (!is_proposing()) {
-      paxos->wait_for_active(c);
+      paxos->wait_for_active(op, c);
       return;
     }
-    wait_for_finished_proposal(c);
+    wait_for_finished_proposal(op, c);
+  }
+  void wait_for_active_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_active(o, c);
   }
 
   /**
@@ -632,19 +645,31 @@ public:
    * @param c The callback to be awaken once we become active.
    * @param ver The version we want to wait on.
    */
-  void wait_for_readable(Context *c, version_t ver = 0) {
+  void wait_for_readable(MonOpRequestRef op, Context *c, version_t ver = 0) {
     /* This is somewhat of a hack. We only do check if a version is readable on
      * PaxosService::dispatch(), but, nonetheless, we must make sure that if that
      * is why we are not readable, then we must wait on PaxosService and not on
      * Paxos; otherwise, we may assert on Paxos::wait_for_readable() if it
      * happens to be readable at that specific point in time.
      */
+    if (op)
+      op->mark_event(service_name + ":wait_for_readable");
+
     if (is_proposing() ||
 	ver > get_last_committed() ||
 	get_last_committed() == 0)
-      wait_for_finished_proposal(c);
-    else
-      paxos->wait_for_readable(c);
+      wait_for_finished_proposal(op, c);
+    else {
+      if (op)
+        op->mark_event(service_name + ":wait_for_readable/paxos");
+
+      paxos->wait_for_readable(op, c);
+    }
+  }
+
+  void wait_for_readable_ctx(Context *c, version_t ver = 0) {
+    MonOpRequestRef o; // will initialize the shared_ptr to NULL
+    wait_for_readable(o, c, ver);
   }
 
   /**
@@ -652,17 +677,25 @@ public:
    *
    * @param c The callback to be awaken once we become writeable.
    */
-  void wait_for_writeable(Context *c) {
+  void wait_for_writeable(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_writeable");
+
     if (is_proposing())
-      wait_for_finished_proposal(c);
+      wait_for_finished_proposal(op, c);
     else if (!is_write_ready())
-      wait_for_active(c);
+      wait_for_active(op, c);
     else
-      paxos->wait_for_writeable(c);
+      paxos->wait_for_writeable(op, c);
+  }
+  void wait_for_writeable_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_writeable(o, c);
   }
 
+  
   /**
-   * @defgroup PaxosService_h_Trim
+   * @defgroup PaxosService_h_Trim Functions for trimming states
    * @{
    */
   /**
@@ -673,8 +706,8 @@ public:
   void maybe_trim();
 
   /**
-   * Auxiliary function to trim our state from version @from to version @to,
-   * not including; i.e., the interval [from, to[
+   * Auxiliary function to trim our state from version @p from to version
+   * @p to, not including; i.e., the interval [from, to[
    *
    * @param t The transaction to which we will add the trim operations.
    * @param from the lower limit of the interval to be trimmed
@@ -836,7 +869,7 @@ public:
    *
    * @returns Our first committed version (that is available)
    */
-  version_t get_first_committed() {
+  version_t get_first_committed() const{
     return cached_first_committed;
   }
   /**
@@ -844,7 +877,7 @@ public:
    *
    * @returns Our last committed version
    */
-  version_t get_last_committed() {
+  version_t get_last_committed() const{
     return cached_last_committed;
   }
 
diff --git a/src/mon/QuorumService.h b/src/mon/QuorumService.h
index ef9dcdc..69d5390 100644
--- a/src/mon/QuorumService.h
+++ b/src/mon/QuorumService.h
@@ -82,7 +82,7 @@ protected:
     return (mon->is_leader() || mon->is_peon());
   }
 
-  virtual bool service_dispatch(Message *m) = 0;
+  virtual bool service_dispatch(MonOpRequestRef op) = 0;
   virtual void service_tick() = 0;
   virtual void service_shutdown() = 0;
 
@@ -107,8 +107,8 @@ public:
     return epoch;
   }
 
-  bool dispatch(Message *m) {
-    return service_dispatch(m);
+  bool dispatch(MonOpRequestRef op) {
+    return service_dispatch(op);
   }
 
   void tick() {
diff --git a/src/mon/Session.h b/src/mon/Session.h
index 4a19d84..ff80730 100644
--- a/src/mon/Session.h
+++ b/src/mon/Session.h
@@ -50,6 +50,7 @@ struct MonSession : public RefCountedObject {
   uint64_t global_id;
 
   map<string, Subscription*> sub_map;
+  epoch_t osd_epoch;		// the osdmap epoch sent to the mon client
 
   AuthServiceHandler *auth_handler;
   EntityName entity_name;
@@ -58,9 +59,12 @@ struct MonSession : public RefCountedObject {
   uint64_t proxy_tid;
 
   MonSession(const entity_inst_t& i, Connection *c) :
+    RefCountedObject(g_ceph_context),
     con(c), inst(i), closed(false), item(this),
     auid(0),
-    global_id(0), auth_handler(NULL),
+    global_id(0),
+    osd_epoch(0),
+    auth_handler(NULL),
     proxy_con(NULL), proxy_tid(0) {
     time_established = ceph_clock_now(g_ceph_context);
   }
@@ -123,6 +127,7 @@ struct MonSessionMap {
 
   MonSession *new_session(const entity_inst_t& i, Connection *c) {
     MonSession *s = new MonSession(i, c);
+    assert(s);
     sessions.push_back(&s->item);
     if (i.name.is_osd())
       by_osd.insert(pair<int,MonSession*>(i.name.num(), s));
@@ -199,11 +204,11 @@ struct MonSessionMap {
   }
 };
 
-inline ostream& operator<<(ostream& out, const MonSession *s)
+inline ostream& operator<<(ostream& out, const MonSession& s)
 {
-  out << "MonSession: " << s->inst << " is "
-      << (s->closed ? "closed" : "open");
-  out << s->caps;
+  out << "MonSession(" << s.inst << " is "
+      << (s.closed ? "closed" : "open");
+  out << s.caps << ")";
   return out;
 }
 
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
index d7346cf..a720075 100644
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -18,6 +18,8 @@
 #include "include/utime.h"
 #include "include/util.h"
 #include "common/Formatter.h"
+#include "include/Context.h"
+#include "mon/MonOpRequest.h"
 
 #define PAXOS_PGMAP      0  // before osd, for pg kick to behave
 #define PAXOS_MDSMAP     1
@@ -203,4 +205,33 @@ static inline ostream& operator<<(ostream& out, const ScrubResult& r) {
   return out << "ScrubResult(keys " << r.prefix_keys << " crc " << r.prefix_crc << ")";
 }
 
+/// for information like os, kernel, hostname, memory info, cpu model.
+typedef map<string, string> Metadata;
+
+struct C_MonOp : public Context
+{
+  MonOpRequestRef op;
+
+  C_MonOp(MonOpRequestRef o) :
+    op(o) { }
+
+  void finish(int r) {
+    if (op && r == -ECANCELED) {
+      op->mark_event("callback canceled");
+    } else if (op && r == -EAGAIN) {
+      op->mark_event("callback retry");
+    } else if (op && r == 0) {
+      op->mark_event("callback finished");
+    }
+    _finish(r);
+  }
+
+  void mark_op_event(const string &event) {
+    if (op)
+      op->mark_event(event);
+  }
+
+  virtual void _finish(int r) = 0;
+};
+
 #endif
diff --git a/src/msg/Connection.h b/src/msg/Connection.h
index 8961d64..1539b39 100644
--- a/src/msg/Connection.h
+++ b/src/msg/Connection.h
@@ -113,7 +113,6 @@ public:
    *
    * @param m The Message to send. The Messenger consumes a single reference
    * when you pass it in.
-   * @param con The Connection to send the Message out on.
    *
    * @return 0 on success, or -errno on failure.
    */
diff --git a/src/msg/Dispatcher.h b/src/msg/Dispatcher.h
index 523c016..a0a3b37 100644
--- a/src/msg/Dispatcher.h
+++ b/src/msg/Dispatcher.h
@@ -192,7 +192,7 @@ public:
 protected:
   CephContext *cct;
 private:
-  Dispatcher(const Dispatcher &rhs);
+  explicit Dispatcher(const Dispatcher &rhs);
   Dispatcher& operator=(const Dispatcher &rhs);
 };
 
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
index 23f5179..c29ecef 100644
--- a/src/msg/Message.cc
+++ b/src/msg/Message.cc
@@ -90,6 +90,7 @@ using namespace std;
 #include "messages/MMonGetVersion.h"
 #include "messages/MMonGetVersionReply.h"
 #include "messages/MMonHealth.h"
+#include "messages/MMonMetadata.h"
 #include "messages/MDataPing.h"
 #include "messages/MAuth.h"
 #include "messages/MAuthReply.h"
@@ -396,6 +397,9 @@ Message *decode_message(CephContext *cct, int crcflags,
   case CEPH_MSG_MON_GET_VERSION_REPLY:
     m = new MMonGetVersionReply();
     break;
+  case CEPH_MSG_MON_METADATA:
+    m = new MMonMetadata();
+    break;
 
   case MSG_OSD_BOOT:
     m = new MOSDBoot();
diff --git a/src/msg/Message.h b/src/msg/Message.h
index c2cd123..ddba0e5 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -313,15 +313,17 @@ public:
   Throttle *get_message_throttler() { return msg_throttler; }
 
   void set_dispatch_throttle_size(uint64_t s) { dispatch_throttle_size = s; }
-  uint64_t get_dispatch_throttle_size() { return dispatch_throttle_size; }
+  uint64_t get_dispatch_throttle_size() const { return dispatch_throttle_size; }
 
+  const ceph_msg_header &get_header() const { return header; }
   ceph_msg_header &get_header() { return header; }
   void set_header(const ceph_msg_header &e) { header = e; }
   void set_footer(const ceph_msg_footer &e) { footer = e; }
+  const ceph_msg_footer &get_footer() const { return footer; }
   ceph_msg_footer &get_footer() { return footer; }
   void set_src(const entity_name_t& src) { header.src = src; }
 
-  uint32_t get_magic() { return magic; }
+  uint32_t get_magic() const { return magic; }
   void set_magic(int _magic) { magic = _magic; }
 
   /*
@@ -346,7 +348,7 @@ public:
     clear_buffers(); // let subclass drop buffers as well
   }
 
-  bool empty_payload() { return payload.length() == 0; }
+  bool empty_payload() const { return payload.length() == 0; }
   bufferlist& get_payload() { return payload; }
   void set_payload(bufferlist& bl) {
     if (byte_throttler)
diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc
index b63950e..1cf6761 100644
--- a/src/msg/Messenger.cc
+++ b/src/msg/Messenger.cc
@@ -10,22 +10,31 @@
 #include "msg/xio/XioMessenger.h"
 #endif
 
+Messenger *Messenger::create_client_messenger(CephContext *cct, string lname)
+{
+  uint64_t nonce = 0;
+  get_random_bytes((char*)&nonce, sizeof(nonce));
+  return Messenger::create(cct, cct->_conf->ms_type, entity_name_t::CLIENT(),
+			   lname, nonce, 0);
+}
+
 Messenger *Messenger::create(CephContext *cct, const string &type,
 			     entity_name_t name, string lname,
-			     uint64_t nonce)
+			     uint64_t nonce, uint64_t features)
 {
   int r = -1;
+  srand(time(NULL));
   if (type == "random")
     r = rand() % 2; // random does not include xio
   if (r == 0 || type == "simple")
-    return new SimpleMessenger(cct, name, lname, nonce);
+    return new SimpleMessenger(cct, name, lname, nonce, features);
   else if ((r == 1 || type == "async") &&
 	   cct->check_experimental_feature_enabled("ms-type-async"))
-    return new AsyncMessenger(cct, name, lname, nonce);
+    return new AsyncMessenger(cct, name, lname, nonce, features);
 #ifdef HAVE_XIO
   else if ((type == "xio") &&
 	   cct->check_experimental_feature_enabled("ms-type-xio"))
-    return new XioMessenger(cct, name, lname, nonce);
+    return new XioMessenger(cct, name, lname, nonce, features);
 #endif
   lderr(cct) << "unrecognized ms_type '" << type << "'" << dendl;
   return NULL;
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
index 429e500..9e70eb7 100644
--- a/src/msg/Messenger.h
+++ b/src/msg/Messenger.h
@@ -34,7 +34,6 @@ using namespace std;
 
 #define SOCKET_PRIORITY_MIN_DELAY 6
 
-class MDS;
 class Timer;
 
 
@@ -151,12 +150,29 @@ public:
    * @param name entity name to register
    * @param lname logical name of the messenger in this process (e.g., "client")
    * @param nonce nonce value to uniquely identify this instance on the current host
+   * @param features bits for the local connection
    */
   static Messenger *create(CephContext *cct,
                            const string &type,
                            entity_name_t name,
 			   string lname,
-                           uint64_t nonce);
+                           uint64_t nonce,
+			   uint64_t features = 0);
+
+  /**
+   * create a new messenger
+   *
+   * Create a new messenger instance.
+   * Same as the above, but a slightly simpler interface for clients:
+   * - Generate a random nonce
+   * - use the default feature bits
+   * - get the messenger type from cct
+   * - use the client entity_type
+   *
+   * @param cct context
+   * @param lname logical name of the messenger in this process (e.g., "client")
+   */
+  static Messenger *create_client_messenger(CephContext *cct, string lname);
 
   /**
    * @defgroup Accessors
@@ -287,16 +303,16 @@ public:
    */
   virtual Policy get_default_policy() = 0;
   /**
-   * Set a Throttler which is applied to all Messages from the given
-   * type of peer.
+   * Set Throttlers applied to all Messages from the given type of peer
    *
    * This is an init-time function and cannot be called after calling
    * start() or bind().
    *
-   * @param type The peer type this Throttler will apply to.
-   * @param t The Throttler to apply. The Messenger does not take
-   * ownership of this pointer, but you must not destroy it before
-   * you destroy the Messenger.
+   * @param type The peer type the Throttlers will apply to.
+   * @param bytes The Throttle for the number of bytes carried by the message
+   * @param msgs The Throttle for the number of messages for this @p type
+   * @note The Messenger does not take ownership of the Throttle pointers, but
+   * you must not destroy them before you destroy the Messenger.
    */
   virtual void set_policy_throttlers(int type, Throttle *bytes, Throttle *msgs=NULL) = 0;
   /**
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
index a9d7f51..42c55ad 100644
--- a/src/msg/async/AsyncConnection.cc
+++ b/src/msg/async/AsyncConnection.cc
@@ -23,6 +23,8 @@
 #include "AsyncMessenger.h"
 #include "AsyncConnection.h"
 
+#include "include/sock_compat.h"
+
 // Constant to limit starting sequence number to 2^31.  Nothing special about it, just a big number.  PLR
 #define SEQ_MASK  0x7fffffff 
 
@@ -39,6 +41,9 @@ ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) {
                 << ").";
 }
 
+// Notes:
+// 1. Don't dispatch any event when closed! It may cause AsyncConnection alive even if AsyncMessenger dead
+
 const int AsyncConnection::TCP_PREFETCH_MIN_SIZE = 512;
 
 class C_time_wakeup : public EventCallback {
@@ -100,12 +105,7 @@ class C_handle_dispatch : public EventCallback {
  public:
   C_handle_dispatch(AsyncMessenger *msgr, Message *m): msgr(msgr), m(m) {}
   void do_request(int id) {
-    //msgr->ms_fast_preprocess(m);
-    //if (msgr->ms_can_fast_dispatch(m)) {
-    //  msgr->ms_fast_dispatch(m);
-    //} else {
-      msgr->ms_deliver_dispatch(m);
-    //}
+    msgr->ms_deliver_dispatch(m);
   }
 };
 
@@ -174,26 +174,27 @@ static void alloc_aligned_buffer(bufferlist& data, unsigned len, unsigned off)
   }
 }
 
-AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c)
-  : Connection(cct, m), async_msgr(m), global_seq(0), connect_seq(0), peer_global_seq(0),
-    out_seq(0), in_seq(0), in_seq_acked(0), state(STATE_NONE), state_after_send(0), sd(-1),
-    port(-1), lock("AsyncConnection::lock"), open_write(false), keepalive(false), recv_buf(NULL),
+AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c, PerfCounters *p)
+  : Connection(cct, m), async_msgr(m), logger(p), global_seq(0), connect_seq(0), peer_global_seq(0),
+    out_seq(0), ack_left(0), in_seq(0), state(STATE_NONE), state_after_send(0), sd(-1), port(-1),
+    write_lock("AsyncConnection::write_lock"), can_write(NOWRITE),
+    open_write(false), keepalive(false), lock("AsyncConnection::lock"), recv_buf(NULL),
     recv_max_prefetch(MIN(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)),
-    recv_start(0), recv_end(0), stop_lock("AsyncConnection::stop_lock"),
-    got_bad_auth(false), authorizer(NULL), replacing(false), once_session_reset(false),
-    is_reset_from_peer(false), state_buffer(NULL), state_offset(0), net(cct), center(c)
+    recv_start(0), recv_end(0), got_bad_auth(false), authorizer(NULL), replacing(false),
+    is_reset_from_peer(false), once_ready(false), state_buffer(NULL), state_offset(0), net(cct), center(c)
 {
   read_handler.reset(new C_handle_read(this));
   write_handler.reset(new C_handle_write(this));
   reset_handler.reset(new C_handle_reset(async_msgr, this));
   remote_reset_handler.reset(new C_handle_remote_reset(async_msgr, this));
   connect_handler.reset(new C_deliver_connect(async_msgr, this));
-  accept_handler.reset(new C_deliver_accept(async_msgr, this));
   local_deliver_handler.reset(new C_local_deliver(this));
+  wakeup_handler.reset(new C_time_wakeup(this));
   memset(msgvec, 0, sizeof(msgvec));
   // double recv_max_prefetch see "read_until"
   recv_buf = new char[2*recv_max_prefetch];
   state_buffer = new char[4096];
+  logger->inc(l_msgr_created_connections);
 }
 
 AsyncConnection::~AsyncConnection()
@@ -271,8 +272,9 @@ int AsyncConnection::do_sendmsg(struct msghdr &msg, int len, bool more)
 
 // return the remaining bytes, it may larger than the length of ptr
 // else return < 0 means error
-int AsyncConnection::_try_send(bufferlist send_bl, bool send)
+int AsyncConnection::_try_send(bufferlist &send_bl, bool send)
 {
+  ldout(async_msgr->cct, 20) << __func__ << " send bl length is " << send_bl.length() << dendl;
   if (send_bl.length()) {
     if (outcoming_bl.length())
       outcoming_bl.claim_append(send_bl);
@@ -283,24 +285,6 @@ int AsyncConnection::_try_send(bufferlist send_bl, bool send)
   if (!send)
     return 0;
 
-  // standby?
-  if (is_queued() && state == STATE_STANDBY && !policy.server) {
-    assert(!outcoming_bl.length());
-    connect_seq++;
-    state = STATE_CONNECTING;
-    center->dispatch_event_external(read_handler);
-    return 0;
-  }
-
-  if (state == STATE_STANDBY) {
-    ldout(async_msgr->cct, 1) << __func__ << " connection is standby" << dendl;
-    return 0;
-  }
-  if (state == STATE_CLOSED) {
-    ldout(async_msgr->cct, 1) << __func__ << " connection is closed" << dendl;
-    return -EINTR;
-  }
-
   if (async_msgr->cct->_conf->ms_inject_socket_failures && sd >= 0) {
     if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
       ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
@@ -308,7 +292,7 @@ int AsyncConnection::_try_send(bufferlist send_bl, bool send)
     }
   }
 
-  uint64_t sent = 0;
+  uint64_t sent_bytes = 0;
   list<bufferptr>::const_iterator pb = outcoming_bl.buffers().begin();
   uint64_t left_pbrs = outcoming_bl.buffers().size();
   while (left_pbrs) {
@@ -333,7 +317,7 @@ int AsyncConnection::_try_send(bufferlist send_bl, bool send)
       return r;
 
     // "r" is the remaining length
-    sent += msglen - r;
+    sent_bytes += msglen - r;
     if (r > 0) {
       ldout(async_msgr->cct, 5) << __func__ << " remaining " << r
                           << " needed to be sent, creating event for writing"
@@ -344,14 +328,14 @@ int AsyncConnection::_try_send(bufferlist send_bl, bool send)
   }
 
   // trim already sent for outcoming_bl
-  if (sent) {
+  if (sent_bytes) {
     bufferlist bl;
-    if (sent < outcoming_bl.length())
-      outcoming_bl.splice(sent, outcoming_bl.length()-sent, &bl);
+    if (sent_bytes < outcoming_bl.length())
+      outcoming_bl.splice(sent_bytes, outcoming_bl.length()-sent_bytes, &bl);
     bl.swap(outcoming_bl);
   }
 
-  ldout(async_msgr->cct, 20) << __func__ << " sent bytes " << sent
+  ldout(async_msgr->cct, 20) << __func__ << " sent bytes " << sent_bytes
                              << " remaining bytes " << outcoming_bl.length() << dendl;
 
   if (!open_write && is_queued()) {
@@ -378,8 +362,7 @@ int AsyncConnection::_try_send(bufferlist send_bl, bool send)
 // else return < 0 means error
 int AsyncConnection::read_until(uint64_t len, char *p)
 {
-  assert(len);
-  ldout(async_msgr->cct, 20) << __func__ << " len is " << len << " state_offset is "
+  ldout(async_msgr->cct, 25) << __func__ << " len is " << len << " state_offset is "
                              << state_offset << dendl;
 
   if (async_msgr->cct->_conf->ms_inject_socket_failures && sd >= 0) {
@@ -392,12 +375,11 @@ int AsyncConnection::read_until(uint64_t len, char *p)
   int r = 0;
   uint64_t left = len - state_offset;
   if (recv_end > recv_start) {
-    assert(state_offset == 0);
     uint64_t to_read = MIN(recv_end - recv_start, left);
     memcpy(p, recv_buf+recv_start, to_read);
     recv_start += to_read;
     left -= to_read;
-    ldout(async_msgr->cct, 20) << __func__ << " got " << to_read << " in buffer "
+    ldout(async_msgr->cct, 25) << __func__ << " got " << to_read << " in buffer "
                                << " left is " << left << " buffer still has "
                                << recv_end - recv_start << dendl;
     if (left == 0) {
@@ -406,14 +388,13 @@ int AsyncConnection::read_until(uint64_t len, char *p)
     state_offset += to_read;
   }
 
-  assert(recv_end == recv_start);
   recv_end = recv_start = 0;
   /* nothing left in the prefetch buffer */
   if (len > recv_max_prefetch) {
     /* this was a large read, we don't prefetch for these */
     do {
       r = read_bulk(sd, p+state_offset, left);
-      ldout(async_msgr->cct, 20) << __func__ << " read_bulk left is " << left << " got " << r << dendl;
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk left is " << left << " got " << r << dendl;
       if (r < 0) {
         ldout(async_msgr->cct, 1) << __func__ << " read failed, state is " << get_state_name(state) << dendl;
         return -1;
@@ -427,7 +408,7 @@ int AsyncConnection::read_until(uint64_t len, char *p)
   } else {
     do {
       r = read_bulk(sd, recv_buf+recv_end, recv_max_prefetch);
-      ldout(async_msgr->cct, 20) << __func__ << " read_bulk recv_end is " << recv_end
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk recv_end is " << recv_end
                                  << " left is " << left << " got " << r << dendl;
       if (r < 0) {
         ldout(async_msgr->cct, 1) << __func__ << " read failed, state is " << get_state_name(state) << dendl;
@@ -446,7 +427,7 @@ int AsyncConnection::read_until(uint64_t len, char *p)
     state_offset += (recv_end - recv_start);
     recv_end = recv_start = 0;
   }
-  ldout(async_msgr->cct, 20) << __func__ << " need len " << len << " remaining "
+  ldout(async_msgr->cct, 25) << __func__ << " need len " << len << " remaining "
                              << len - state_offset << " bytes, state is "
                              << get_state_name(state) << dendl;
   return len - state_offset;
@@ -508,7 +489,9 @@ void AsyncConnection::process()
           ldout(async_msgr->cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
           t = (ceph_timespec*)state_buffer;
           utime_t kp_t = utime_t(*t);
+          write_lock.Lock();
           _send_keepalive_or_ack(true, &kp_t);
+          write_lock.Unlock();
           ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
           state = STATE_OPEN;
           break;
@@ -555,7 +538,7 @@ void AsyncConnection::process()
           ldout(async_msgr->cct, 20) << __func__ << " begin MSG" << dendl;
           ceph_msg_header header;
           ceph_msg_header_old oldheader;
-          __u32 header_crc;
+          __u32 header_crc = 0;
           int len;
           if (has_feature(CEPH_FEATURE_NOSRCADDR))
             len = sizeof(header);
@@ -616,11 +599,19 @@ void AsyncConnection::process()
       case STATE_OPEN_MESSAGE_THROTTLE_MESSAGE:
         {
           if (policy.throttler_messages) {
-            ldout(async_msgr->cct,10) << __func__ << " wants " << 1 << " message from policy throttler "
-                                << policy.throttler_messages->get_current() << "/"
-                                << policy.throttler_messages->get_max() << dendl;
-            // FIXME: may block
-            policy.throttler_messages->get();
+            ldout(async_msgr->cct, 10) << __func__ << " wants " << 1 << " message from policy throttler "
+                                       << policy.throttler_messages->get_current() << "/"
+                                       << policy.throttler_messages->get_max() << dendl;
+            if (!policy.throttler_messages->get_or_fail()) {
+              ldout(async_msgr->cct, 1) << __func__ << " wants 1 message from policy throttle "
+                                        << policy.throttler_messages->get_current() << "/"
+                                        << policy.throttler_messages->get_max() << " failed, just wait." << dendl;
+              // following thread pool deal with th full message queue isn't a
+              // short time, so we can wait a ms.
+              if (register_time_events.empty())
+                register_time_events.insert(center->create_time_event(1000, wakeup_handler));
+              break;
+            }
           }
 
           state = STATE_OPEN_MESSAGE_THROTTLE_BYTES;
@@ -632,11 +623,19 @@ void AsyncConnection::process()
           uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
           if (message_size) {
             if (policy.throttler_bytes) {
-              ldout(async_msgr->cct,10) << __func__ << " wants " << message_size << " bytes from policy throttler "
-                  << policy.throttler_bytes->get_current() << "/"
-                  << policy.throttler_bytes->get_max() << dendl;
-              // FIXME: may block
-              policy.throttler_bytes->get(message_size);
+              ldout(async_msgr->cct, 10) << __func__ << " wants " << message_size << " bytes from policy throttler "
+                                         << policy.throttler_bytes->get_current() << "/"
+                                         << policy.throttler_bytes->get_max() << dendl;
+              if (!policy.throttler_bytes->get_or_fail(message_size)) {
+                ldout(async_msgr->cct, 10) << __func__ << " wants " << message_size << " bytes from policy throttler "
+                                           << policy.throttler_bytes->get_current() << "/"
+                                           << policy.throttler_bytes->get_max() << " failed, just wait." << dendl;
+                // following thread pool deal with th full message queue isn't a
+                // short time, so we can wait a ms.
+                if (register_time_events.empty())
+                  register_time_events.insert(center->create_time_event(1000, wakeup_handler));
+                break;
+              }
             }
           }
 
@@ -797,7 +796,7 @@ void AsyncConnection::process()
             ldout(async_msgr->cct, 10) << __func__ << " no session security set" << dendl;
           } else {
             if (session_security->check_message_signature(message)) {
-              ldout(async_msgr->cct, 0) << __func__ << "Signature check failed" << dendl;
+              ldout(async_msgr->cct, 0) << __func__ << " Signature check failed" << dendl;
               message->put();
               goto fail;
             }
@@ -819,18 +818,19 @@ void AsyncConnection::process()
           // side queueing because messages can't be renumbered, but the (kernel) client will
           // occasionally pull a message out of the sent queue to send elsewhere.  in that case
           // it doesn't matter if we "got" it or not.
-          if (message->get_seq() <= in_seq) {
+          uint64_t cur_seq = in_seq.read();
+          if (message->get_seq() <= cur_seq) {
             ldout(async_msgr->cct,0) << __func__ << " got old message "
-                    << message->get_seq() << " <= " << in_seq << " " << message << " " << *message
+                    << message->get_seq() << " <= " << cur_seq << " " << message << " " << *message
                     << ", discarding" << dendl;
             message->put();
             if (has_feature(CEPH_FEATURE_RECONNECT_SEQ) && async_msgr->cct->_conf->ms_die_on_old_message)
               assert(0 == "old msgs despite reconnect_seq feature");
             break;
           }
-          if (message->get_seq() > in_seq + 1) {
+          if (message->get_seq() > cur_seq + 1) {
             ldout(async_msgr->cct, 0) << __func__ << " missed message?  skipped from seq "
-                                      << in_seq << " to " << message->get_seq() << dendl;
+                                      << cur_seq << " to " << message->get_seq() << dendl;
             if (async_msgr->cct->_conf->ms_die_on_skipped_message)
               assert(0 == "skipped incoming seq");
           }
@@ -838,13 +838,13 @@ void AsyncConnection::process()
           message->set_connection(this);
 
           // note last received message.
-          in_seq = message->get_seq();
-          ldout(async_msgr->cct, 10) << __func__ << " got message " << message->get_seq()
-                               << " " << message << " " << *message << dendl;
+          in_seq.set(message->get_seq());
+	  ldout(async_msgr->cct, 1) << " == rx == " << message->get_source() << " seq "
+                                    << message->get_seq() << " " << message << " " << *message << dendl;
 
           // if send_message always successfully send, it may have no
           // opportunity to send seq ack. 10 is a experience value.
-          if (in_seq > in_seq_acked + 10) {
+          if (ack_left.inc() > 10) {
             center->dispatch_event_external(write_handler);
           }
 
@@ -858,6 +858,8 @@ void AsyncConnection::process()
           } else {
             center->dispatch_event_external(EventCallbackRef(new C_handle_dispatch(async_msgr, message)));
           }
+          logger->inc(l_msgr_recv_messages);
+          logger->inc(l_msgr_recv_bytes, message_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer));
 
           break;
         }
@@ -878,7 +880,7 @@ void AsyncConnection::process()
 
       case STATE_CLOSED:
         {
-          if (sd > 0)
+          if (sd >= 0)
             center->delete_file_event(sd, EVENT_READABLE);
           ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl;
           break;
@@ -897,40 +899,40 @@ void AsyncConnection::process()
           break;
         }
     }
+  } while (prev_state != state);
 
-    continue;
+  return;
 
-fail:
-    // clean up state internal variables and states
-    if (state >= STATE_CONNECTING_SEND_CONNECT_MSG &&
-        state <= STATE_CONNECTING_READY) {
-      delete authorizer;
-      authorizer = NULL;
-      got_bad_auth = false;
-    }
-
-    if (state > STATE_OPEN_MESSAGE_THROTTLE_MESSAGE &&
-        state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH
-        && policy.throttler_messages) {
-      ldout(async_msgr->cct,10) << __func__ << " releasing " << 1
-                          << " message to policy throttler "
-                          << policy.throttler_messages->get_current() << "/"
-                          << policy.throttler_messages->get_max() << dendl;
-      policy.throttler_messages->put();
-    }
-    if (state > STATE_OPEN_MESSAGE_THROTTLE_BYTES &&
-        state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH) {
-      uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
-      if (policy.throttler_bytes) {
-        ldout(async_msgr->cct,10) << __func__ << " releasing " << message_size
-                            << " bytes to policy throttler "
-                            << policy.throttler_bytes->get_current() << "/"
-                            << policy.throttler_bytes->get_max() << dendl;
-        policy.throttler_bytes->put(message_size);
-      }
+ fail:
+  // clean up state internal variables and states
+  if (state >= STATE_CONNECTING_SEND_CONNECT_MSG &&
+      state <= STATE_CONNECTING_READY) {
+    delete authorizer;
+    authorizer = NULL;
+    got_bad_auth = false;
+  }
+
+  if (state > STATE_OPEN_MESSAGE_THROTTLE_MESSAGE &&
+      state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH
+      && policy.throttler_messages) {
+    ldout(async_msgr->cct,10) << __func__ << " releasing " << 1
+                        << " message to policy throttler "
+                        << policy.throttler_messages->get_current() << "/"
+                        << policy.throttler_messages->get_max() << dendl;
+    policy.throttler_messages->put();
+  }
+  if (state > STATE_OPEN_MESSAGE_THROTTLE_BYTES &&
+      state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH) {
+    uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
+    if (policy.throttler_bytes) {
+      ldout(async_msgr->cct,10) << __func__ << " releasing " << message_size
+                          << " bytes to policy throttler "
+                          << policy.throttler_bytes->get_current() << "/"
+                          << policy.throttler_bytes->get_max() << dendl;
+      policy.throttler_bytes->put(message_size);
     }
-    fault();
-  } while (prev_state != state);
+  }
+  fault();
 }
 
 int AsyncConnection::_process_connection()
@@ -940,6 +942,7 @@ int AsyncConnection::_process_connection()
   switch(state) {
     case STATE_WAIT_SEND:
       {
+        Mutex::Locker l(write_lock);
         if (!outcoming_bl.length()) {
           assert(state_after_send);
           state = state_after_send;
@@ -974,7 +977,6 @@ int AsyncConnection::_process_connection()
         if (r < 0) {
           goto fail;
         }
-        net.set_socket_options(sd);
 
         center->create_file_event(sd, EVENT_READABLE, read_handler);
         state = STATE_CONNECTING_WAIT_BANNER;
@@ -1001,7 +1003,7 @@ int AsyncConnection::_process_connection()
 
         bufferlist bl;
         bl.append(state_buffer, strlen(CEPH_BANNER));
-        r = _try_send(bl);
+        r = try_send(bl);
         if (r == 0) {
           state = STATE_CONNECTING_WAIT_IDENTIFY_PEER;
           ldout(async_msgr->cct, 10) << __func__ << " connect write banner done: "
@@ -1056,14 +1058,27 @@ int AsyncConnection::_process_connection()
         }
 
         ldout(async_msgr->cct, 20) << __func__ << " connect peer addr for me is " << peer_addr_for_me << dendl;
-        // TODO: it's tricky that exit loop if exist AsyncMessenger waiting for
-        // mark_down. Otherwise, it will be deadlock while
-        // AsyncMessenger::mark_down_all already hold lock.
-        if (stopping.read())
-          break;
+        lock.Unlock();
         async_msgr->learned_addr(peer_addr_for_me);
+        if (async_msgr->cct->_conf->ms_inject_internal_delays) {
+          if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+            ldout(msgr->cct, 10) << __func__ << " sleep for "
+                                 << async_msgr->cct->_conf->ms_inject_internal_delays << dendl;
+            utime_t t;
+            t.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays);
+            t.sleep();
+          }
+        }
+
+        lock.Lock();
+        if (state != STATE_CONNECTING_WAIT_IDENTIFY_PEER) {
+          ldout(async_msgr->cct, 1) << __func__ << " state changed while learned_addr, mark_down or "
+                                    << " replacing must be happened just now" << dendl;
+          return 0;
+        }
+
         ::encode(async_msgr->get_myaddr(), myaddrbl);
-        r = _try_send(myaddrbl);
+        r = try_send(myaddrbl);
         if (r == 0) {
           state = STATE_CONNECTING_SEND_CONNECT_MSG;
           ldout(async_msgr->cct, 10) << __func__ << " connect sent my addr "
@@ -1098,9 +1113,9 @@ int AsyncConnection::_process_connection()
         connect_msg.authorizer_protocol = authorizer ? authorizer->protocol : 0;
         connect_msg.authorizer_len = authorizer ? authorizer->bl.length() : 0;
         if (authorizer)
-          ldout(async_msgr->cct, 10) << __func__ <<  "connect_msg.authorizer_len="
-              << connect_msg.authorizer_len << " protocol="
-              << connect_msg.authorizer_protocol << dendl;
+          ldout(async_msgr->cct, 10) << __func__ <<  " connect_msg.authorizer_len="
+                                     << connect_msg.authorizer_len << " protocol="
+                                     << connect_msg.authorizer_protocol << dendl;
         connect_msg.flags = 0;
         if (policy.lossy)
           connect_msg.flags |= CEPH_MSG_CONNECT_LOSSY;  // this is fyi, actually, server decides!
@@ -1111,7 +1126,7 @@ int AsyncConnection::_process_connection()
         ldout(async_msgr->cct, 10) << __func__ << " connect sending gseq=" << global_seq << " cseq="
             << connect_seq << " proto=" << connect_msg.protocol_version << dendl;
 
-        r = _try_send(bl);
+        r = try_send(bl);
         if (r == 0) {
           state = STATE_CONNECTING_WAIT_CONNECT_REPLY;
           ldout(async_msgr->cct,20) << __func__ << " connect wrote (self +) cseq, waiting for reply" << dendl;
@@ -1184,7 +1199,6 @@ int AsyncConnection::_process_connection()
     case STATE_CONNECTING_WAIT_ACK_SEQ:
       {
         uint64_t newly_acked_seq = 0;
-        bufferlist bl;
 
         r = read_until(sizeof(newly_acked_seq), state_buffer);
         if (r < 0) {
@@ -1196,19 +1210,22 @@ int AsyncConnection::_process_connection()
 
         newly_acked_seq = *((uint64_t*)state_buffer);
         ldout(async_msgr->cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq
-                            << " vs out_seq " << out_seq << dendl;
-        while (newly_acked_seq > out_seq) {
-          Message *m = _get_next_outgoing();
-          assert(m);
-          ldout(async_msgr->cct, 2) << __func__ << " discarding previously sent " << m->get_seq()
-                              << " " << *m << dendl;
-          assert(m->get_seq() <= newly_acked_seq);
-          m->put();
-          ++out_seq;
-        }
+                            << " vs out_seq " << out_seq.read() << dendl;
+        discard_requeued_up_to(newly_acked_seq);
+        //while (newly_acked_seq > out_seq.read()) {
+        //  Message *m = _get_next_outgoing(NULL);
+        //  assert(m);
+        //  ldout(async_msgr->cct, 2) << __func__ << " discarding previously sent " << m->get_seq()
+        //                      << " " << *m << dendl;
+        //  assert(m->get_seq() <= newly_acked_seq);
+        //  m->put();
+        //  out_seq.inc();
+        //}
 
-        bl.append((char*)&in_seq, sizeof(in_seq));
-        r = _try_send(bl);
+        bufferlist bl;
+        uint64_t s = in_seq.read();
+        bl.append((char*)&s, sizeof(s));
+        r = try_send(bl);
         if (r == 0) {
           state = STATE_CONNECTING_READY;
           ldout(async_msgr->cct, 10) << __func__ << " send in_seq done " << dendl;
@@ -1228,6 +1245,7 @@ int AsyncConnection::_process_connection()
         peer_global_seq = connect_reply.global_seq;
         policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY;
         state = STATE_OPEN;
+        once_ready = true;
         connect_seq += 1;
         assert(connect_seq == connect_reply.connect_seq);
         backoff = utime_t();
@@ -1254,8 +1272,11 @@ int AsyncConnection::_process_connection()
 
         // message may in queue between last _try_send and connection ready
         // write event may already notify and we need to force scheduler again
+        write_lock.Lock();
+        can_write = CANWRITE;
         if (is_queued())
           center->dispatch_event_external(write_handler);
+        write_lock.Unlock();
 
         break;
       }
@@ -1284,7 +1305,7 @@ int AsyncConnection::_process_connection()
         ::encode(socket_addr, bl);
         ldout(async_msgr->cct, 1) << __func__ << " sd=" << sd << " " << socket_addr << dendl;
 
-        r = _try_send(bl);
+        r = try_send(bl);
         if (r == 0) {
           state = STATE_ACCEPTING_WAIT_BANNER_ADDR;
           ldout(async_msgr->cct, 10) << __func__ << " write banner and addr done: "
@@ -1413,6 +1434,9 @@ int AsyncConnection::_process_connection()
         ldout(async_msgr->cct, 20) << __func__ << " accept done" << dendl;
         state = STATE_OPEN;
         memset(&connect_msg, 0, sizeof(connect_msg));
+        write_lock.Lock();
+        can_write = CANWRITE;
+        write_lock.Unlock();
         break;
       }
 
@@ -1458,7 +1482,7 @@ int AsyncConnection::handle_connect_reply(ceph_msg_connect &connect, ceph_msg_co
     state = STATE_CONNECTING_SEND_CONNECT_MSG;
   }
   if (reply.tag == CEPH_MSGR_TAG_RESETSESSION) {
-    ldout(async_msgr->cct, 0) << __func__ << "connect got RESETSESSION" << dendl;
+    ldout(async_msgr->cct, 0) << __func__ << " connect got RESETSESSION" << dendl;
     was_session_reset();
     state = STATE_CONNECTING_SEND_CONNECT_MSG;
   }
@@ -1471,14 +1495,18 @@ int AsyncConnection::handle_connect_reply(ceph_msg_connect &connect, ceph_msg_co
   }
   if (reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) {
     assert(reply.connect_seq > connect_seq);
-    connect_seq = reply.connect_seq;
     ldout(async_msgr->cct, 10) << __func__ << " connect got RETRY_SESSION "
-                         << connect_seq << " -> "
-                         << reply.connect_seq << dendl;
+                               << connect_seq << " -> "
+                               << reply.connect_seq << dendl;
+    connect_seq = reply.connect_seq;
     state = STATE_CONNECTING_SEND_CONNECT_MSG;
   }
   if (reply.tag == CEPH_MSGR_TAG_WAIT) {
     ldout(async_msgr->cct, 3) << __func__ << " connect got WAIT (connection race)" << dendl;
+    if (!once_ready) {
+      ldout(async_msgr->cct, 1) << __func__ << " got WAIT while connection isn't registered, just closed." << dendl;
+      goto fail;
+    }
     state = STATE_WAIT;
   }
 
@@ -1578,11 +1606,27 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
   if (existing == this)
     existing = NULL;
   if (existing) {
+    // There is no possible that existing connection will acquire this
+    // connection's lock
+    existing->lock.Lock(true);  // skip lockdep check (we are locking a second AsyncConnection here)
+
+    if (existing->replacing || existing->state == STATE_CLOSED) {
+      ldout(async_msgr->cct, 1) << __func__ << " existing racing replace or mark_down happened while replacing."
+                                << " state=" << get_state_name(existing->state) << dendl;
+      reply.global_seq = existing->peer_global_seq;
+      r = _reply_accept(CEPH_MSGR_TAG_RETRY_GLOBAL, connect, reply, authorizer_reply);
+      existing->lock.Unlock();
+      if (r < 0)
+        goto fail;
+      return 0;
+    }
+
     if (connect.global_seq < existing->peer_global_seq) {
       ldout(async_msgr->cct, 10) << __func__ << " accept existing " << existing
                            << ".gseq " << existing->peer_global_seq << " > "
                            << connect.global_seq << ", RETRY_GLOBAL" << dendl;
       reply.global_seq = existing->peer_global_seq;  // so we can send it below..
+      existing->lock.Unlock();
       return _reply_accept(CEPH_MSGR_TAG_RETRY_GLOBAL, connect, reply, authorizer_reply);
     } else {
       ldout(async_msgr->cct, 10) << __func__ << " accept existing " << existing
@@ -1616,6 +1660,7 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
                            << existing->connect_seq << " > " << connect.connect_seq
                            << ", RETRY_SESSION" << dendl;
       reply.connect_seq = existing->connect_seq + 1;
+      existing->lock.Unlock();
       return _reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply);
     }
 
@@ -1630,6 +1675,7 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
                              << ".cseq " << existing->connect_seq << " == "
                              << connect.connect_seq << ", OPEN|STANDBY, RETRY_SESSION" << dendl;
         reply.connect_seq = existing->connect_seq + 1;
+        existing->lock.Unlock();
         return _reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply);
       }
 
@@ -1642,12 +1688,11 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
         goto replace;
       } else {
         // our existing outgoing wins
-        ldout(async_msgr->cct,10) << __func__ << "accept connection race, existing "
+        ldout(async_msgr->cct,10) << __func__ << " accept connection race, existing "
                             << existing << ".cseq " << existing->connect_seq
                             << " == " << connect.connect_seq << ", sending WAIT" << dendl;
         assert(peer_addr > async_msgr->get_myaddr());
-        // make sure our outgoing connection will follow through
-        existing->send_keepalive();
+        existing->lock.Unlock();
         return _reply_accept(CEPH_MSGR_TAG_WAIT, connect, reply, authorizer_reply);
       }
     }
@@ -1655,11 +1700,11 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
     assert(connect.connect_seq > existing->connect_seq);
     assert(connect.global_seq >= existing->peer_global_seq);
     if (policy.resetcheck &&   // RESETSESSION only used by servers; peers do not reset each other
-        existing->connect_seq == 0 && once_session_reset) {
+        existing->connect_seq == 0) {
       ldout(async_msgr->cct, 0) << __func__ << " accept we reset (peer sent cseq "
                           << connect.connect_seq << ", " << existing << ".cseq = "
                           << existing->connect_seq << "), sending RESETSESSION" << dendl;
-      once_session_reset = false;
+      existing->lock.Unlock();
       return _reply_accept(CEPH_MSGR_TAG_RESETSESSION, connect, reply, authorizer_reply);
     }
 
@@ -1692,25 +1737,14 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
     t.sleep();
   }
 
-  // There is no possible that existing connection will acquire this lock
-  existing->lock.Lock();
-
-  if (existing->replacing || existing->state == STATE_CLOSED) {
-    ldout(async_msgr->cct, 1) << __func__ << " existing racing replace or mark_down happened while replacing."
-                              << " state=" << get_state_name(existing->state) << dendl;
-    reply.connect_seq = connect.connect_seq + 1;
-    r = _reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply);
-    existing->lock.Unlock();
-    if (r < 0)
-      goto fail;
-    return 0;
-  }
-
   if (existing->policy.lossy) {
     // disconnect from the Connection
     existing->center->dispatch_event_external(existing->reset_handler);
+    ldout(async_msgr->cct, 1) << __func__ << " replacing on lossy channel, failing existing" << dendl;
     existing->_stop();
   } else {
+    assert(can_write == NOWRITE);
+    existing->write_lock.Lock(true);
     // queue a reset on the new connection, which we're dumping for the old
     center->dispatch_event_external(reset_handler);
 
@@ -1723,30 +1757,36 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
     // Now existing connection will be alive and the current connection will
     // exchange socket with existing connection because we want to maintain
     // original "connection_state"
-    if (existing->sd > 0)
+    if (existing->sd >= 0)
       existing->center->delete_file_event(existing->sd, EVENT_READABLE|EVENT_WRITABLE);
     center->delete_file_event(sd, EVENT_READABLE|EVENT_WRITABLE);
     existing->center->create_file_event(sd, EVENT_READABLE, existing->read_handler);
 
-    reply.connect_seq = connect.connect_seq + 1;
+    reply.global_seq = existing->peer_global_seq;
 
     // Clean up output buffer
     existing->outcoming_bl.clear();
     existing->requeue_sent();
 
     swap(existing->sd, sd);
+    existing->can_write = NOWRITE;
     existing->open_write = false;
     existing->replacing = true;
     existing->state_offset = 0;
     existing->state = STATE_ACCEPTING_WAIT_CONNECT_MSG;
-    // there should exist any buffer
+    // Discard existing prefetch buffer in `recv_buf`
+    existing->recv_start = existing->recv_end = 0;
+    // there shouldn't exist any buffer
     assert(recv_start == recv_end);
 
-    if (existing->_reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply) < 0) {
+    existing->write_lock.Unlock();
+    if (existing->_reply_accept(CEPH_MSGR_TAG_RETRY_GLOBAL, connect, reply, authorizer_reply) < 0) {
       // handle error
-      existing->center->dispatch_event_external(existing->write_handler);
+      ldout(async_msgr->cct, 0) << __func__ << " reply fault for existing connection." << dendl;
+      existing->fault();
     }
 
+    ldout(async_msgr->cct, 1) << __func__ << " stop myself to swap existing" << dendl;
     _stop();
     existing->lock.Unlock();
     return 0;
@@ -1757,7 +1797,7 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
   connect_seq = connect.connect_seq + 1;
   peer_global_seq = connect.global_seq;
   ldout(async_msgr->cct, 10) << __func__ << " accept success, connect_seq = "
-                             << connect_seq << " in_seq=" << in_seq << ", sending READY" << dendl;
+                             << connect_seq << " in_seq=" << in_seq.read() << ", sending READY" << dendl;
 
   int next_state;
 
@@ -1770,7 +1810,7 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
     next_state = STATE_ACCEPTING_READY;
     discard_requeued_up_to(0);
     is_reset_from_peer = false;
-    in_seq = 0;
+    in_seq.set(0);
   }
 
   // send READY reply
@@ -1794,8 +1834,10 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
   if (reply.authorizer_len)
     reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
 
-  if (reply.tag == CEPH_MSGR_TAG_SEQ)
-    reply_bl.append((char*)&in_seq, sizeof(in_seq));
+  if (reply.tag == CEPH_MSGR_TAG_SEQ) {
+    uint64_t s = in_seq.read();
+    reply_bl.append((char*)&s, sizeof(s));
+  }
 
   lock.Unlock();
   // Because "replacing" will prevent other connections preempt this addr,
@@ -1824,15 +1866,15 @@ int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &a
     goto fail_registered;
   }
 
-  // notify
-  center->dispatch_event_external(accept_handler);
-  async_msgr->ms_deliver_handle_fast_accept(this);
-
-
-  r = _try_send(reply_bl);
+  r = try_send(reply_bl);
   if (r < 0)
     goto fail_registered;
 
+  // notify
+  center->dispatch_event_external(EventCallbackRef(new C_deliver_accept(async_msgr, this)));
+  async_msgr->ms_deliver_handle_fast_accept(this);
+  once_ready = true;
+
   if (r == 0) {
     state = next_state;
     ldout(async_msgr->cct, 2) << __func__ << " accept write reply msg done" << dendl;
@@ -1865,7 +1907,6 @@ void AsyncConnection::_connect()
   ldout(async_msgr->cct, 10) << __func__ << " csq=" << connect_seq << dendl;
 
   state = STATE_CONNECTING;
-  stopping.set(0);
   // rescheduler connection in order to avoid lock dep
   // may called by external thread(send_message)
   center->dispatch_event_external(read_handler);
@@ -1880,76 +1921,101 @@ void AsyncConnection::accept(int incoming)
   state = STATE_ACCEPTING;
   center->create_file_event(sd, EVENT_READABLE, read_handler);
   // rescheduler connection in order to avoid lock dep
-  process();
+  center->dispatch_event_external(read_handler);
 }
 
 int AsyncConnection::send_message(Message *m)
 {
-  ldout(async_msgr->cct, 10) << __func__ << dendl;
-  m->get_header().src = async_msgr->get_myname();
+  ldout(async_msgr->cct, 1) << " == tx == " << m << " " << *m << dendl;
+
+  // optimistic think it's ok to encode(actually may broken now)
   if (!m->get_priority())
     m->set_priority(async_msgr->get_default_send_priority());
 
-  Mutex::Locker l(lock);
-  if (!is_queued() && state >= STATE_OPEN && state <= STATE_OPEN_TAG_CLOSE) {
-    ldout(async_msgr->cct, 10) << __func__ << " try send msg " << m << dendl;
-    int r = _send(m);
-    if (r < 0) {
+  m->get_header().src = async_msgr->get_myname();
+  m->set_connection(this);
+
+  if (async_msgr->get_myaddr() == get_peer_addr()) { //loopback connection
+   ldout(async_msgr->cct, 20) << __func__ << " " << *m << " local" << dendl;
+   Mutex::Locker l(write_lock);
+   local_messages.push_back(m);
+   center->dispatch_event_external(local_deliver_handler);
+   return 0;
+  }
+
+  // we don't want to consider local message here, it's too lightweight which
+  // may disturb users
+  logger->inc(l_msgr_send_messages);
+
+  bufferlist bl;
+  uint64_t f = get_features();
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  bool can_fast_prepare = async_msgr->ms_can_fast_dispatch(m);
+  if (can_fast_prepare)
+    prepare_send_message(f, m, bl);
+
+  Mutex::Locker l(write_lock);
+  // "features" changes will change the payload encoding
+  if (can_fast_prepare && (can_write == NOWRITE || get_features() != f)) {
+    // ensure the correctness of message encoding
+    bl.clear();
+    m->get_payload().clear();
+    ldout(async_msgr->cct, 5) << __func__ << " clear encoded buffer, can_write=" << can_write << " previous "
+                              << f << " != " << get_features() << dendl;
+  }
+  if (!is_queued() && can_write == CANWRITE) {
+    if (!can_fast_prepare)
+      prepare_send_message(get_features(), m, bl);
+    if (write_message(m, bl) < 0) {
       ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl;
       // we want to handle fault within internal thread
       center->dispatch_event_external(write_handler);
     }
-  } else if (state == STATE_CLOSED) {
-      ldout(async_msgr->cct, 10) << __func__ << " connection closed."
-                                 << " Drop message " << m << dendl;
-  } else if (async_msgr->get_myaddr() == get_peer_addr()) { //loopback connection
-      ldout(async_msgr->cct, 20) << __func__ << " " << *m << " local" << dendl;
-      local_messages.push_back(m);
-      center->dispatch_event_external(local_deliver_handler);
+  } else if (can_write == CLOSED) {
+    ldout(async_msgr->cct, 10) << __func__ << " connection closed."
+                               << " Drop message " << m << dendl;
+    m->put();
   } else {
-    out_q[m->get_priority()].push_back(m);
-    if (state == STATE_STANDBY && !policy.server) {
-      ldout(async_msgr->cct, 10) << __func__ << " state is " << get_state_name(state)
-                                 << " policy.server is false" << dendl;
-      _connect();
-    } else if (sd > 0 && !open_write) {
-      center->dispatch_event_external(write_handler);
-    }
+    out_q[m->get_priority()].push_back(make_pair(bl, m));
+    ldout(async_msgr->cct, 15) << __func__ << " inline write is denied, reschedule m=" << m << dendl;
+    center->dispatch_event_external(write_handler);
   }
   return 0;
 }
 
 void AsyncConnection::requeue_sent()
 {
+  assert(write_lock.is_locked());
   if (sent.empty())
     return;
 
-  list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+  list<pair<bufferlist, Message*> >& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
   while (!sent.empty()) {
-    Message *m = sent.back();
+    Message* m = sent.back();
     sent.pop_back();
-    ldout(async_msgr->cct, 10) << __func__ << " " << *m << " for resend seq " << out_seq
-                         << " (" << m->get_seq() << ")" << dendl;
-    rq.push_front(m);
-    out_seq--;
+    ldout(async_msgr->cct, 10) << __func__ << " " << *m << " for resend "
+                               << " (" << m->get_seq() << ")" << dendl;
+    rq.push_front(make_pair(bufferlist(), m));
   }
 }
 
 void AsyncConnection::discard_requeued_up_to(uint64_t seq)
 {
   ldout(async_msgr->cct, 10) << __func__ << " " << seq << dendl;
+  Mutex::Locker l(write_lock);
   if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0)
     return;
-  list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+  list<pair<bufferlist, Message*> >& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
   while (!rq.empty()) {
-    Message *m = rq.front();
-    if (m->get_seq() == 0 || m->get_seq() > seq)
+    pair<bufferlist, Message*> p = rq.front();
+    if (p.second->get_seq() == 0 || p.second->get_seq() > seq)
       break;
-    ldout(async_msgr->cct, 10) << __func__ << " " << *m << " for resend seq " << out_seq
+    ldout(async_msgr->cct, 10) << __func__ << " " << *(p.second) << " for resend seq " << p.second->get_seq()
                          << " <= " << seq << ", discarding" << dendl;
-    m->put();
+    p.second->put();
     rq.pop_front();
-    out_seq++;
   }
   if (rq.empty())
     out_q.erase(CEPH_MSG_PRIO_HIGHEST);
@@ -1962,16 +2028,17 @@ void AsyncConnection::discard_requeued_up_to(uint64_t seq)
 void AsyncConnection::discard_out_queue()
 {
   ldout(async_msgr->cct, 10) << __func__ << " started" << dendl;
+  assert(write_lock.is_locked());
 
   for (list<Message*>::iterator p = sent.begin(); p != sent.end(); ++p) {
     ldout(async_msgr->cct, 20) << __func__ << " discard " << *p << dendl;
     (*p)->put();
   }
   sent.clear();
-  for (map<int,list<Message*> >::iterator p = out_q.begin(); p != out_q.end(); ++p)
-    for (list<Message*>::iterator r = p->second.begin(); r != p->second.end(); ++r) {
-      ldout(async_msgr->cct, 20) << __func__ << " discard " << *r << dendl;
-      (*r)->put();
+  for (map<int, list<pair<bufferlist, Message*> > >::iterator p = out_q.begin(); p != out_q.end(); ++p)
+    for (list<pair<bufferlist, Message*> >::iterator r = p->second.begin(); r != p->second.end(); ++r) {
+      ldout(async_msgr->cct, 20) << __func__ << " discard " << r->second << dendl;
+      r->second->put();
     }
   out_q.clear();
   outcoming_bl.clear();
@@ -1982,13 +2049,15 @@ int AsyncConnection::randomize_out_seq()
   if (get_features() & CEPH_FEATURE_MSG_AUTH) {
     // Set out_seq to a random value, so CRC won't be predictable.   Don't bother checking seq_error
     // here.  We'll check it on the call.  PLR
-    int seq_error = get_random_bytes((char *)&out_seq, sizeof(out_seq));
-    out_seq &= SEQ_MASK;
-    lsubdout(async_msgr->cct, ms, 10) << __func__ << " randomize_out_seq " << out_seq << dendl;
+    uint64_t rand_seq;
+    int seq_error = get_random_bytes((char *)&rand_seq, sizeof(rand_seq));
+    rand_seq &= SEQ_MASK;
+    lsubdout(async_msgr->cct, ms, 10) << __func__ << " randomize_out_seq " << rand_seq << dendl;
+    out_seq.set(rand_seq);
     return seq_error;
   } else {
     // previously, seq #'s always started at 0.
-    out_seq = 0;
+    out_seq.set(0);
     return 0;
   }
 }
@@ -1997,21 +2066,24 @@ void AsyncConnection::fault()
 {
   if (state == STATE_CLOSED) {
     ldout(async_msgr->cct, 10) << __func__ << " state is already " << get_state_name(state) << dendl;
-    center->dispatch_event_external(reset_handler);
     return ;
   }
 
   if (policy.lossy && !(state >= STATE_CONNECTING && state < STATE_CONNECTING_READY)) {
-    ldout(async_msgr->cct, 10) << __func__ << " on lossy channel, failing" << dendl;
+    ldout(async_msgr->cct, 1) << __func__ << " on lossy channel, failing" << dendl;
     center->dispatch_event_external(reset_handler);
     _stop();
     return ;
   }
 
+  write_lock.Lock();
   if (sd >= 0) {
     shutdown_socket();
     center->delete_file_event(sd, EVENT_READABLE|EVENT_WRITABLE);
+    ::close(sd);
+    sd = -1;
   }
+  can_write = NOWRITE;
   open_write = false;
 
   // requeue sent items
@@ -2019,16 +2091,30 @@ void AsyncConnection::fault()
   recv_start = recv_end = 0;
   state_offset = 0;
   replacing = false;
+  is_reset_from_peer = false;
   outcoming_bl.clear();
+  if (!once_ready && !is_queued() &&
+      state >=STATE_ACCEPTING && state <= STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(async_msgr->cct, 0) << __func__ << " with nothing to send and in the half "
+                              << " accept state just closed, state="
+                              << get_state_name(state) << dendl;
+    center->dispatch_event_external(reset_handler);
+
+    write_lock.Unlock();
+    _stop();
+    return ;
+  }
   if (policy.standby && !is_queued()) {
     ldout(async_msgr->cct,0) << __func__ << " with nothing to send, going to standby" << dendl;
     state = STATE_STANDBY;
+    write_lock.Unlock();
     return;
   }
 
+  write_lock.Unlock();
   if (!(state >= STATE_CONNECTING && state < STATE_CONNECTING_READY)) {
     // policy maybe empty when state is in accept
-    if (policy.server || (state >= STATE_ACCEPTING && state < STATE_ACCEPTING_WAIT_SEQ)) {
+    if (policy.server) {
       ldout(async_msgr->cct, 0) << __func__ << " server, going to standby" << dendl;
       state = STATE_STANDBY;
     } else {
@@ -2051,51 +2137,52 @@ void AsyncConnection::fault()
 
   // woke up again;
   register_time_events.insert(center->create_time_event(
-          backoff.to_nsec()/1000, EventCallbackRef(new C_time_wakeup(this))));
+          backoff.to_nsec()/1000, wakeup_handler));
 }
 
 void AsyncConnection::was_session_reset()
 {
   ldout(async_msgr->cct,10) << __func__ << " started" << dendl;
+  assert(lock.is_locked());
+  Mutex::Locker l(write_lock);
   discard_out_queue();
 
   center->dispatch_event_external(remote_reset_handler);
 
   if (randomize_out_seq()) {
-    lsubdout(async_msgr->cct,ms,15) << __func__ << " could not get random bytes to set seq number for session reset; set seq number to " << out_seq << dendl;
+    ldout(async_msgr->cct, 15) << __func__ << " could not get random bytes to set seq number for session reset; set seq number to " << out_seq.read() << dendl;
   }
 
-  in_seq = 0;
+  in_seq.set(0);
   connect_seq = 0;
-  in_seq_acked = 0;
-  once_session_reset = true;
+  // it's safe to directly set 0, double locked
+  ack_left.set(0);
+  once_ready = false;
+  can_write = NOWRITE;
 }
 
 void AsyncConnection::_stop()
 {
   assert(lock.is_locked());
-  ldout(async_msgr->cct, 10) << __func__ << dendl;
-  if (sd > 0)
+  if (state == STATE_CLOSED)
+    return ;
+
+  ldout(async_msgr->cct, 1) << __func__ << dendl;
+  Mutex::Locker l(write_lock);
+  if (sd >= 0)
     center->delete_file_event(sd, EVENT_READABLE|EVENT_WRITABLE);
 
   discard_out_queue();
   async_msgr->unregister_conn(this);
 
-  if (async_msgr->cct->_conf->ms_inject_internal_delays) {
-    ldout(msgr->cct, 10) << __func__ << " sleep for "
-                         << async_msgr->cct->_conf->ms_inject_internal_delays
-                         << dendl;
-    utime_t t;
-    t.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays);
-    t.sleep();
-  }
-
   state = STATE_CLOSED;
-  shutdown_socket();
   open_write = false;
+  can_write = CLOSED;
   state_offset = 0;
-  if (sd > 0)
+  if (sd >= 0) {
+    shutdown_socket();
     ::close(sd);
+  }
   sd = -1;
   for (set<uint64_t>::iterator it = register_time_events.begin();
        it != register_time_events.end(); ++it)
@@ -2104,33 +2191,43 @@ void AsyncConnection::_stop()
   center->dispatch_event_external(EventCallbackRef(new C_clean_handler(this)));
 }
 
-int AsyncConnection::_send(Message *m)
+void AsyncConnection::prepare_send_message(uint64_t features, Message *m, bufferlist &bl)
 {
-  m->set_seq(++out_seq);
-  if (!policy.lossy) {
-    // put on sent list
-    sent.push_back(m); 
-    m->get();
-  }
+  ldout(async_msgr->cct, 20) << __func__ << " m" << " " << *m << dendl;
 
   // associate message with Connection (for benefit of encode_payload)
-  m->set_connection(this);
-
-  uint64_t features = get_features();
   if (m->empty_payload())
-    ldout(async_msgr->cct, 20) << __func__ << " encoding " << m->get_seq() << " features " << features
-                         << " " << m << " " << *m << dendl;
+    ldout(async_msgr->cct, 20) << __func__ << " encoding features "
+                               << features << " " << m << " " << *m << dendl;
   else
-    ldout(async_msgr->cct, 20) << __func__ << " half-reencoding " << m->get_seq() << " features "
-                         << features << " " << m << " " << *m << dendl;
+    ldout(async_msgr->cct, 20) << __func__ << " half-reencoding features "
+                               << features << " " << m << " " << *m << dendl;
 
   // encode and copy out of *m
-  m->encode(features, async_msgr->crcflags);
+  m->encode(features, msgr->crcflags);
+
+  bl.append(m->get_payload());
+  bl.append(m->get_middle());
+  bl.append(m->get_data());
+}
+
+int AsyncConnection::write_message(Message *m, bufferlist& bl)
+{
+  assert(can_write == CANWRITE);
+  m->set_seq(out_seq.inc());
+
+  if (!policy.lossy) {
+    // put on sent list
+    sent.push_back(m);
+    m->get();
+  }
+
+  m->calc_header_crc();
 
-  // prepare everything
   ceph_msg_header& header = m->get_header();
   ceph_msg_footer& footer = m->get_footer();
 
+  // TODO: let sign_message could be reentry?
   // Now that we have all the crcs calculated, handle the
   // digital signature for the message, if the AsyncConnection has session
   // security set up.  Some session security options do not
@@ -2140,70 +2237,45 @@ int AsyncConnection::_send(Message *m)
     ldout(async_msgr->cct, 20) << __func__ << " no session security" << dendl;
   } else {
     if (session_security->sign_message(m)) {
-      ldout(async_msgr->cct, 20) << __func__ << " failed to sign seq # "
-                           << header.seq << "): sig = " << footer.sig << dendl;
+      ldout(async_msgr->cct, 20) << __func__ << " failed to sign m="
+                                 << m << "): sig = " << footer.sig << dendl;
     } else {
-      ldout(async_msgr->cct, 20) << __func__ << " signed seq # " << header.seq
-                           << "): sig = " << footer.sig << dendl;
+      ldout(async_msgr->cct, 20) << __func__ << " signed m=" << m
+                                 << "): sig = " << footer.sig << dendl;
     }
   }
 
-  bufferlist blist = m->get_payload();
-  blist.append(m->get_middle());
-  blist.append(m->get_data());
-
-  ldout(async_msgr->cct, 20) << __func__ << " sending " << m->get_seq()
-                       << " " << m << dendl;
-  int rc = write_message(header, footer, blist);
-
-  if (rc < 0) {
-    ldout(async_msgr->cct, 1) << __func__ << " error sending " << m << ", "
-                        << cpp_strerror(errno) << dendl;
-  } else if (rc == 0) {
-    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " done." << dendl;
-  } else {
-    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " continuely." << dendl;
-  }
-  m->put();
-
-  return rc;
-}
-
-int AsyncConnection::write_message(ceph_msg_header& header, ceph_msg_footer& footer,
-                                  bufferlist& blist)
-{
-  bufferlist bl;
-  int ret;
-
+  bufferlist complete_bl;
   // send tag
   char tag = CEPH_MSGR_TAG_MSG;
-  bl.append(&tag, sizeof(tag));
+  complete_bl.append(&tag, sizeof(tag));
 
-  // send envelope
-  ceph_msg_header_old oldheader;
   if (has_feature(CEPH_FEATURE_NOSRCADDR)) {
-    bl.append((char*)&header, sizeof(header));
+    complete_bl.append((char*)&header, sizeof(header));
   } else {
+    ceph_msg_header_old oldheader;
     memcpy(&oldheader, &header, sizeof(header));
     oldheader.src.name = header.src;
     oldheader.src.addr = get_peer_addr();
     oldheader.orig_src = oldheader.src;
     oldheader.reserved = header.reserved;
-    if (msgr->crcflags & MSG_CRC_HEADER) {
-       oldheader.crc = ceph_crc32c(0, (unsigned char*)&oldheader,
-                                   sizeof(oldheader) - sizeof(oldheader.crc));
-    } else {
-       oldheader.crc = 0;
-    }
-    bl.append((char*)&oldheader, sizeof(oldheader));
+    oldheader.crc = ceph_crc32c(0, (unsigned char*)&oldheader,
+                                sizeof(oldheader) - sizeof(oldheader.crc));
+    complete_bl.append((char*)&oldheader, sizeof(oldheader));
   }
 
-  bl.claim_append(blist);
+  ldout(async_msgr->cct, 20) << __func__ << " sending message type=" << header.type
+                             << " src " << entity_name_t(header.src)
+                             << " front=" << header.front_len
+                             << " data=" << header.data_len
+                             << " off " << header.data_off << dendl;
+
+  complete_bl.claim_append(bl);
 
   // send footer; if receiver doesn't support signatures, use the old footer format
   ceph_msg_footer_old old_footer;
   if (has_feature(CEPH_FEATURE_MSG_AUTH)) {
-    bl.append((char*)&footer, sizeof(footer));
+    complete_bl.append((char*)&footer, sizeof(footer));
   } else {
     if (msgr->crcflags & MSG_CRC_HEADER) {
       old_footer.front_crc = footer.front_crc;
@@ -2214,27 +2286,37 @@ int AsyncConnection::write_message(ceph_msg_header& header, ceph_msg_footer& foo
     }
     old_footer.data_crc = msgr->crcflags & MSG_CRC_DATA ? footer.data_crc : 0;
     old_footer.flags = footer.flags;
-    bl.append((char*)&old_footer, sizeof(old_footer));
+    complete_bl.append((char*)&old_footer, sizeof(old_footer));
   }
 
-  // send
-  ret = _try_send(bl);
-  if (ret < 0)
-    return ret;
+  logger->inc(l_msgr_send_bytes, complete_bl.length());
+  ldout(async_msgr->cct, 20) << __func__ << " sending " << m->get_seq()
+                             << " " << m << dendl;
+  int rc = _try_send(complete_bl);
+  if (rc < 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " error sending " << m << ", "
+                              << cpp_strerror(errno) << dendl;
+  } else if (rc == 0) {
+    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " done." << dendl;
+  } else {
+    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " continuely." << dendl;
+  }
+  m->put();
 
-  return ret;
+  return rc;
 }
 
 void AsyncConnection::handle_ack(uint64_t seq)
 {
-  lsubdout(async_msgr->cct, ms, 15) << __func__ << " got ack seq " << seq << dendl;
+  ldout(async_msgr->cct, 15) << __func__ << " got ack seq " << seq << dendl;
   // trim sent list
+  Mutex::Locker l(write_lock);
   while (!sent.empty() && sent.front()->get_seq() <= seq) {
-    Message *m = sent.front();
+    Message* m = sent.front();
     sent.pop_front();
-    lsubdout(async_msgr->cct, ms, 10) << __func__ << "reader got ack seq "
-                                << seq << " >= " << m->get_seq() << " on "
-                                << m << " " << *m << dendl;
+    ldout(async_msgr->cct, 10) << __func__ << " got ack seq "
+                               << seq << " >= " << m->get_seq() << " on "
+                               << m << " " << *m << dendl;
     m->put();
   }
 }
@@ -2242,8 +2324,8 @@ void AsyncConnection::handle_ack(uint64_t seq)
 void AsyncConnection::send_keepalive()
 {
   ldout(async_msgr->cct, 10) << __func__ << " started." << dendl;
-  Mutex::Locker l(lock);
-  if (state != STATE_CLOSED) {
+  Mutex::Locker l(write_lock);
+  if (can_write != CLOSED) {
     keepalive = true;
     center->dispatch_event_external(write_handler);
   }
@@ -2251,15 +2333,14 @@ void AsyncConnection::send_keepalive()
 
 void AsyncConnection::mark_down()
 {
-  ldout(async_msgr->cct, 10) << __func__ << " started." << dendl;
-  stopping.set(1);
+  ldout(async_msgr->cct, 1) << __func__ << " started." << dendl;
   Mutex::Locker l(lock);
   _stop();
 }
 
 void AsyncConnection::_send_keepalive_or_ack(bool ack, utime_t *tp)
 {
-  assert(lock.is_locked());
+  assert(write_lock.is_locked());
   bufferlist bl;
 
   utime_t t = ceph_clock_now(async_msgr->cct);
@@ -2286,57 +2367,82 @@ void AsyncConnection::_send_keepalive_or_ack(bool ack, utime_t *tp)
 void AsyncConnection::handle_write()
 {
   ldout(async_msgr->cct, 10) << __func__ << " started." << dendl;
-  Mutex::Locker l(lock);
   bufferlist bl;
   int r = 0;
-  if (state >= STATE_OPEN && state <= STATE_OPEN_TAG_CLOSE) {
+
+  write_lock.Lock();
+  if (can_write == CANWRITE) {
     if (keepalive) {
       _send_keepalive_or_ack();
       keepalive = false;
     }
 
     while (1) {
-      Message *m = _get_next_outgoing();
+      bufferlist data;
+      Message *m = _get_next_outgoing(&data);
       if (!m)
         break;
 
-      ldout(async_msgr->cct, 10) << __func__ << " try send msg " << m << dendl;
-      r = _send(m);
+      // send_message or requeue messages may not encode message
+      if (!data.length())
+        prepare_send_message(get_features(), m, data);
+
+      r = write_message(m, data);
       if (r < 0) {
         ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl;
+        write_lock.Unlock();
         goto fail;
       } else if (r > 0) {
         break;
       }
     }
 
-    if (in_seq > in_seq_acked) {
+    uint64_t left = ack_left.read();
+    if (left) {
       ceph_le64 s;
-      s = in_seq;
+      s = in_seq.read();
       bl.append(CEPH_MSGR_TAG_ACK);
       bl.append((char*)&s, sizeof(s));
-      ldout(async_msgr->cct, 10) << __func__ << " try send msg ack" << dendl;
-      in_seq_acked = s;
+      ldout(async_msgr->cct, 10) << __func__ << " try send msg ack, acked " << left << " messages" << dendl;
+      ack_left.sub(left);
       r = _try_send(bl);
     } else if (is_queued()) {
       r = _try_send(bl);
     }
 
+    write_lock.Unlock();
     if (r < 0) {
       ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl;
       goto fail;
     }
-  } else if (state != STATE_CONNECTING) {
-    r = _try_send(bl);
-    if (r < 0) {
-      ldout(async_msgr->cct, 1) << __func__ << " send outcoming bl failed" << dendl;
-      goto fail;
+  } else {
+    write_lock.Unlock();
+    lock.Lock();
+    write_lock.Lock();
+    if (state == STATE_STANDBY && !policy.server && is_queued()) {
+      ldout(async_msgr->cct, 10) << __func__ << " state is " << get_state_name(state)
+                                 << " policy.server is false" << dendl;
+      _connect();
+    } else if (sd >= 0 && state != STATE_CONNECTING && state != STATE_CLOSED) {
+      r = _try_send(bl);
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+        write_lock.Unlock();
+        fault();
+        lock.Unlock();
+        return ;
+      }
     }
+    write_lock.Unlock();
+    lock.Unlock();
   }
 
   return ;
+
  fail:
+  lock.Lock();
   fault();
+  lock.Unlock();
 }
 
 void AsyncConnection::wakeup_from(uint64_t id)
@@ -2350,20 +2456,20 @@ void AsyncConnection::wakeup_from(uint64_t id)
 void AsyncConnection::local_deliver()
 {
   ldout(async_msgr->cct, 10) << __func__ << dendl;
-  Mutex::Locker l(lock);
+  Mutex::Locker l(write_lock);
   while (!local_messages.empty()) {
-    Message *m = local_messages.back();
-    local_messages.pop_back();
+    Message *m = local_messages.front();
+    local_messages.pop_front();
     m->set_connection(this);
     m->set_recv_stamp(ceph_clock_now(async_msgr->cct));
     ldout(async_msgr->cct, 10) << __func__ << " " << *m << " local deliver " << dendl;
     async_msgr->ms_fast_preprocess(m);
-    lock.Unlock();
+    write_lock.Unlock();
     if (async_msgr->ms_can_fast_dispatch(m)) {
       async_msgr->ms_fast_dispatch(m);
     } else {
       msgr->ms_deliver_dispatch(m);
     }
-    lock.Lock();
+    write_lock.Lock();
   }
 }
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
index e4e7aff..64c2921 100644
--- a/src/msg/async/AsyncConnection.h
+++ b/src/msg/async/AsyncConnection.h
@@ -25,6 +25,7 @@ using namespace std;
 
 #include "auth/AuthSessionHandler.h"
 #include "common/Mutex.h"
+#include "common/perf_counters.h"
 #include "include/buffer.h"
 #include "msg/Connection.h"
 #include "msg/Messenger.h"
@@ -45,10 +46,15 @@ class AsyncConnection : public Connection {
 
   int read_bulk(int fd, char *buf, int len);
   int do_sendmsg(struct msghdr &msg, int len, bool more);
+  int try_send(bufferlist &bl, bool send=true) {
+    Mutex::Locker l(write_lock);
+    return _try_send(bl, send);
+  }
   // if "send" is false, it will only append bl to send buffer
   // the main usage is avoid error happen outside messenger threads
-  int _try_send(bufferlist bl, bool send=true);
+  int _try_send(bufferlist &bl, bool send=true);
   int _send(Message *m);
+  void prepare_send_message(uint64_t features, Message *m, bufferlist &bl);
   int read_until(uint64_t needed, char *p);
   int _process_connection();
   void _connect();
@@ -63,7 +69,7 @@ class AsyncConnection : public Connection {
   int randomize_out_seq();
   void handle_ack(uint64_t seq);
   void _send_keepalive_or_ack(bool ack=false, utime_t *t=NULL);
-  int write_message(ceph_msg_header& header, ceph_msg_footer& footer, bufferlist& blist);
+  int write_message(Message *m, bufferlist& bl);
   int _reply_accept(char tag, ceph_msg_connect &connect, ceph_msg_connect_reply &reply,
                     bufferlist authorizer_reply) {
     bufferlist reply_bl;
@@ -74,7 +80,7 @@ class AsyncConnection : public Connection {
     if (reply.authorizer_len) {
       reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
     }
-    int r = _try_send(reply_bl);
+    int r = try_send(reply_bl);
     if (r < 0)
       return -1;
 
@@ -82,27 +88,33 @@ class AsyncConnection : public Connection {
     return 0;
   }
   bool is_queued() {
+    assert(write_lock.is_locked());
     return !out_q.empty() || outcoming_bl.length();
   }
   void shutdown_socket() {
     if (sd >= 0)
       ::shutdown(sd, SHUT_RDWR);
   }
-  Message *_get_next_outgoing() {
+  Message *_get_next_outgoing(bufferlist *bl) {
+    assert(write_lock.is_locked());
     Message *m = 0;
     while (!m && !out_q.empty()) {
-      map<int, list<Message*> >::reverse_iterator p = out_q.rbegin();
-      if (!p->second.empty()) {
-        m = p->second.front();
-        p->second.pop_front();
+      map<int, list<pair<bufferlist, Message*> > >::reverse_iterator it = out_q.rbegin();
+      if (!it->second.empty()) {
+        list<pair<bufferlist, Message*> >::iterator p = it->second.begin();
+        m = p->second;
+        if (bl)
+          bl->swap(p->first);
+        it->second.erase(p);
       }
-      if (p->second.empty())
-        out_q.erase(p->first);
+      if (it->second.empty())
+        out_q.erase(it->first);
     }
     return m;
   }
+
  public:
-  AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c);
+  AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c, PerfCounters *p);
   ~AsyncConnection();
 
   ostream& _conn_prefix(std::ostream *_dout);
@@ -203,42 +215,45 @@ class AsyncConnection : public Connection {
       return statenames[state];
   }
 
-  CephContext *cc;
   AsyncMessenger *async_msgr;
+  PerfCounters *logger;
   int global_seq;
   __u32 connect_seq, peer_global_seq;
-  uint64_t out_seq;
-  uint64_t in_seq, in_seq_acked;
+  atomic_t out_seq;
+  atomic_t ack_left, in_seq;
   int state;
   int state_after_send;
   int sd;
   int port;
   Messenger::Policy policy;
-  map<int, list<Message*> > out_q;  // priority queue for outbound msgs
-  list<Message*> sent;
+
+  Mutex write_lock;
+  enum {
+    NOWRITE,
+    CANWRITE,
+    CLOSED
+  } can_write;
+  bool open_write;
+  map<int, list<pair<bufferlist, Message*> > > out_q;  // priority queue for outbound msgs
+  list<Message*> sent; // the first bufferlist need to inject seq
   list<Message*> local_messages;    // local deliver
+  bufferlist outcoming_bl;
+  bool keepalive;
+
   Mutex lock;
   utime_t backoff;         // backoff time
-  bool open_write;
   EventCallbackRef read_handler;
   EventCallbackRef write_handler;
   EventCallbackRef reset_handler;
   EventCallbackRef remote_reset_handler;
   EventCallbackRef connect_handler;
-  EventCallbackRef fast_connect_handler;
-  EventCallbackRef accept_handler;
-  EventCallbackRef fast_accept_handler;
-  EventCallbackRef stop_handler;
-  EventCallbackRef signal_handler;
   EventCallbackRef local_deliver_handler;
-  bool keepalive;
+  EventCallbackRef wakeup_handler;
   struct iovec msgvec[IOV_MAX];
   char *recv_buf;
   uint32_t recv_max_prefetch;
   uint32_t recv_start;
   uint32_t recv_end;
-  Mutex stop_lock; // used to protect `mark_down_cond`
-  Cond stop_cond;
   set<uint64_t> register_time_events; // need to delete it if stop
 
   // Tis section are temp variables used by state transition
@@ -265,15 +280,13 @@ class AsyncConnection : public Connection {
                      // there won't exists conflicting connection so we use
                      // "replacing" to skip RESETSESSION to avoid detect wrong
                      // presentation
-  bool once_session_reset;
   bool is_reset_from_peer;
-  atomic_t stopping;
+  bool once_ready;
 
   // used only for local state, it will be overwrite when state transition
   char *state_buffer;
   // used only by "read_until"
   uint64_t state_offset;
-  bufferlist outcoming_bl;
   NetHandler net;
   EventCenter *center;
   ceph::shared_ptr<AuthSessionHandler> session_security;
@@ -285,7 +298,10 @@ class AsyncConnection : public Connection {
   void wakeup_from(uint64_t id);
   void local_deliver();
   void stop() {
-    center->dispatch_event_external(reset_handler);
+    lock.Lock();
+    if (state != STATE_CLOSED)
+      center->dispatch_event_external(reset_handler);
+    lock.Unlock();
     mark_down();
   }
   void cleanup_handler() {
@@ -294,8 +310,11 @@ class AsyncConnection : public Connection {
     reset_handler.reset();
     remote_reset_handler.reset();
     connect_handler.reset();
-    accept_handler.reset();
     local_deliver_handler.reset();
+    wakeup_handler.reset();
+  }
+  PerfCounters *get_perf_counter() {
+    return logger;
   }
 }; /* AsyncConnection */
 
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
index 44b8ceb..e5e393a 100644
--- a/src/msg/async/AsyncMessenger.cc
+++ b/src/msg/async/AsyncMessenger.cc
@@ -19,9 +19,6 @@
 #include <errno.h>
 #include <iostream>
 #include <fstream>
-#ifdef HAVE_SCHED
-#include <sched.h>
-#endif
 
 #include "AsyncMessenger.h"
 
@@ -53,18 +50,6 @@ static ostream& _prefix(std::ostream *_dout, WorkerPool *p) {
 }
 
 
-class C_conn_accept : public EventCallback {
-  AsyncConnectionRef conn;
-  int fd;
-
- public:
-  C_conn_accept(AsyncConnectionRef c, int s): conn(c), fd(s) {}
-  void do_request(int id) {
-    conn->accept(fd);
-  }
-};
-
-
 class C_processor_accept : public EventCallback {
   Processor *pro;
 
@@ -112,6 +97,9 @@ int Processor::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
     listen_sd = -1;
     return -errno;
   }
+
+  net.set_socket_options(listen_sd);
+
   // use whatever user specified (if anything)
   entity_addr_t listen_addr = bind_addr;
   listen_addr.set_family(family);
@@ -242,7 +230,7 @@ int Processor::start(Worker *w)
   ldout(msgr->cct, 1) << __func__ << " " << dendl;
 
   // start thread
-  if (listen_sd > 0) {
+  if (listen_sd >= 0) {
     worker = w;
     w->center.create_file_event(listen_sd, EVENT_READABLE,
                                 EventCallbackRef(new C_processor_accept(this)));
@@ -302,27 +290,11 @@ void *Worker::entry()
 {
   ldout(cct, 10) << __func__ << " starting" << dendl;
   if (cct->_conf->ms_async_set_affinity) {
-#ifdef HAVE_SCHED
-    int cpuid;
-    cpu_set_t cpuset;
-    CPU_ZERO(&cpuset);
-
-    cpuid = pool->get_cpuid(id);
-    if (cpuid < 0) {
-      cpuid = sched_getcpu();
+    int cid = pool->get_cpuid(id);
+    if (cid >= 0 && set_affinity(cid)) {
+      ldout(cct, 0) << __func__ << " sched_setaffinity failed: "
+                    << cpp_strerror(errno) << dendl;
     }
-
-    if (cpuid < CPU_SETSIZE) {
-      CPU_SET(cpuid, &cpuset);
-
-      if (sched_setaffinity(0, sizeof(cpuset), &cpuset) < 0) {
-        ldout(cct, 0) << __func__ << " sched_setaffinity failed: "
-            << cpp_strerror(errno) << dendl;
-      }
-      /* guaranteed to take effect immediately */
-      sched_yield();
-    }
-#endif
   }
 
   center.set_owner(pthread_self());
@@ -365,13 +337,16 @@ WorkerPool::WorkerPool(CephContext *c): cct(c), seq(0), started(false),
     else
       lderr(cct) << __func__ << " failed to parse " << *it << " in " << cct->_conf->ms_async_affinity_cores << dendl;
   }
+
 }
 
 WorkerPool::~WorkerPool()
 {
   for (uint64_t i = 0; i < workers.size(); ++i) {
-    workers[i]->stop();
-    workers[i]->join();
+    if (workers[i]->is_started()) {
+      workers[i]->stop();
+      workers[i]->join();
+    }
     delete workers[i];
   }
 }
@@ -392,8 +367,8 @@ void WorkerPool::barrier()
   pthread_t cur = pthread_self();
   for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it) {
     assert(cur != (*it)->center.get_owner());
-    (*it)->center.dispatch_event_external(EventCallbackRef(new C_barrier(this)));
     barrier_count.inc();
+    (*it)->center.dispatch_event_external(EventCallbackRef(new C_barrier(this)));
   }
   ldout(cct, 10) << __func__ << " wait for " << barrier_count.read() << " barrier" << dendl;
   Mutex::Locker l(barrier_lock);
@@ -409,17 +384,19 @@ void WorkerPool::barrier()
  */
 
 AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
-                               string mname, uint64_t _nonce)
+                               string mname, uint64_t _nonce, uint64_t features)
   : SimplePolicyMessenger(cct, name,mname, _nonce),
     processor(this, cct, _nonce),
     lock("AsyncMessenger::lock"),
-    nonce(_nonce), need_addr(true), did_bind(false),
+    nonce(_nonce), need_addr(true), listen_sd(-1), did_bind(false),
     global_seq(0), deleted_lock("AsyncMessenger::deleted_lock"),
     cluster_protocol(0), stopped(true)
 {
   ceph_spin_init(&global_seq_lock);
   cct->lookup_or_create_singleton_object<WorkerPool>(pool, WorkerPool::name);
-  local_connection = new AsyncConnection(cct, this, &pool->get_worker()->center);
+  Worker *w = pool->get_worker();
+  local_connection = new AsyncConnection(cct, this, &w->center, w->get_perf_counter());
+  local_features = features;
   init_local_connection();
 }
 
@@ -545,8 +522,8 @@ AsyncConnectionRef AsyncMessenger::add_accept(int sd)
 {
   lock.Lock();
   Worker *w = pool->get_worker();
-  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center);
-  w->center.dispatch_event_external(EventCallbackRef(new C_conn_accept(conn, sd)));
+  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center, w->get_perf_counter());
+  conn->accept(sd);
   accepting_conns.insert(conn);
   lock.Unlock();
   return conn;
@@ -562,10 +539,11 @@ AsyncConnectionRef AsyncMessenger::create_connect(const entity_addr_t& addr, int
 
   // create connection
   Worker *w = pool->get_worker();
-  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center);
+  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center, w->get_perf_counter());
   conn->connect(addr, type);
   assert(!conns.count(addr));
   conns[addr] = conn;
+  w->get_perf_counter()->inc(l_msgr_active_connections);
 
   return conn;
 }
@@ -692,6 +670,7 @@ void AsyncMessenger::mark_down_all()
     AsyncConnectionRef p = it->second;
     ldout(cct, 5) << __func__ << " mark down " << it->first << " " << p << dendl;
     conns.erase(it);
+    p->get_perf_counter()->dec(l_msgr_active_connections);
     p->stop();
   }
 
diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h
index 685799f..ed8a0ff 100644
--- a/src/msg/async/AsyncMessenger.h
+++ b/src/msg/async/AsyncMessenger.h
@@ -41,6 +41,18 @@ using namespace std;
 class AsyncMessenger;
 class WorkerPool;
 
+enum {
+  l_msgr_first = 94000,
+  l_msgr_recv_messages,
+  l_msgr_send_messages,
+  l_msgr_recv_bytes,
+  l_msgr_send_bytes,
+  l_msgr_created_connections,
+  l_msgr_active_connections,
+  l_msgr_last,
+};
+
+
 class Worker : public Thread {
   static const uint64_t InitEventNumber = 5000;
   static const uint64_t EventMaxWaitUs = 30000000;
@@ -48,15 +60,37 @@ class Worker : public Thread {
   WorkerPool *pool;
   bool done;
   int id;
+  PerfCounters *perf_logger;
 
  public:
   EventCenter center;
   Worker(CephContext *c, WorkerPool *p, int i)
-    : cct(c), pool(p), done(false), id(i), center(c) {
+    : cct(c), pool(p), done(false), id(i), perf_logger(NULL), center(c) {
     center.init(InitEventNumber);
+    char name[128];
+    sprintf(name, "AsyncMessenger::Worker-%d", id);
+    // initialize perf_logger
+    PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last);
+
+    plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages");
+    plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages");
+    plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes");
+    plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network received bytes");
+    plb.add_u64_counter(l_msgr_created_connections, "msgr_active_connections", "Active connection number");
+    plb.add_u64_counter(l_msgr_active_connections, "msgr_created_connections", "Created connection number");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+  ~Worker() {
+    if (perf_logger) {
+      cct->get_perfcounters_collection()->remove(perf_logger);
+      delete perf_logger;
+    }
   }
   void *entry();
   void stop();
+  PerfCounters *get_perf_counter() { return perf_logger; }
 };
 
 /**
@@ -80,7 +114,7 @@ class Processor {
   void accept();
 };
 
-class WorkerPool: CephContext::AssociatedSingletonObject {
+class WorkerPool {
   WorkerPool(const WorkerPool &);
   WorkerPool& operator=(const WorkerPool &);
   CephContext *cct;
@@ -140,7 +174,7 @@ public:
    * be a value that will be repeated if the daemon restarts.
    */
   AsyncMessenger(CephContext *cct, entity_name_t name,
-                 string mname, uint64_t _nonce);
+                 string mname, uint64_t _nonce, uint64_t features);
 
   /**
    * Destroy the AsyncMessenger. Pretty simple since all the work is done
@@ -217,7 +251,7 @@ public:
   Connection *create_anon_connection() {
     Mutex::Locker l(lock);
     Worker *w = pool->get_worker();
-    return new AsyncConnection(cct, this, &w->center);
+    return new AsyncConnection(cct, this, &w->center, w->get_perf_counter());
   }
 
   /**
@@ -249,9 +283,6 @@ private:
    *
    * @param addr The address of the entity to connect to.
    * @param type The peer type of the entity at the address.
-   * @param con An existing Connection to associate with the new connection. If
-   * NULL, it creates a new Connection.
-   * @param msg an initial message to queue on the new connection
    *
    * @return a pointer to the newly-created connection. Caller does not own a
    * reference; take one if you need it.
@@ -266,7 +297,7 @@ private:
    *
    * @param m The Message to queue up. This function eats a reference.
    * @param con The existing Connection to use, or NULL if you don't know of one.
-   * @param addr The address to send the Message to.
+   * @param dest_addr The address to send the Message to.
    * @param dest_type The peer type of the address we're sending to
    * just drop silently under failure.
    */
@@ -353,6 +384,7 @@ private:
     Mutex::Locker l(deleted_lock);
     if (deleted_conns.count(p->second)) {
       deleted_conns.erase(p->second);
+      p->second->get_perf_counter()->dec(l_msgr_active_connections);
       conns.erase(p);
       return NULL;
     }
@@ -364,6 +396,7 @@ private:
     assert(lock.is_locked());
     local_connection->peer_addr = my_inst.addr;
     local_connection->peer_type = my_inst.name.type();
+    local_connection->set_features(local_features);
     ms_deliver_handle_fast_connect(local_connection.get());
   }
 
@@ -371,6 +404,7 @@ public:
 
   /// con used for sending messages to ourselves
   ConnectionRef local_connection;
+  uint64_t local_features;
 
   /**
    * @defgroup AsyncMessenger internals
@@ -399,6 +433,7 @@ public:
       }
     }
     conns[conn->peer_addr] = conn;
+    conn->get_perf_counter()->inc(l_msgr_active_connections);
     accepting_conns.erase(conn);
     return 0;
   }
diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc
index bbb2457..03119de 100644
--- a/src/msg/async/Event.cc
+++ b/src/msg/async/Event.cc
@@ -32,23 +32,36 @@
 #define dout_subsys ceph_subsys_ms
 
 #undef dout_prefix
+#define dout_prefix *_dout << "EventCallback "
+class C_handle_notify : public EventCallback {
+  EventCenter *center;
+  CephContext *cct;
+
+ public:
+  C_handle_notify(EventCenter *c, CephContext *cc): center(c), cct(cc) {}
+  void do_request(int fd_or_id) {
+    char c[256];
+    int r;
+    do {
+      center->already_wakeup.set(0);
+      r = read(fd_or_id, c, sizeof(c));
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " read notify pipe failed: " << cpp_strerror(errno) << dendl;
+        break;
+      }
+    } while (center->already_wakeup.read());
+  }
+};
+
+#undef dout_prefix
 #define dout_prefix _event_prefix(_dout)
+
 ostream& EventCenter::_event_prefix(std::ostream *_dout)
 {
   return *_dout << "Event(" << this << " owner=" << get_owner() << " nevent=" << nevent
                 << " time_id=" << time_event_next_id << ").";
 }
 
-class C_handle_notify : public EventCallback {
- public:
-  C_handle_notify() {}
-  void do_request(int fd_or_id) {
-    char c[100];
-    int r = read(fd_or_id, c, 100);
-    assert(r > 0);
-  }
-};
-
 int EventCenter::init(int n)
 {
   // can't init multi times
@@ -86,25 +99,31 @@ int EventCenter::init(int n)
   if (r < 0) {
     return -1;
   }
+  r = net.set_nonblock(notify_send_fd);
+  if (r < 0) {
+    return -1;
+  }
 
   file_events = static_cast<FileEvent *>(malloc(sizeof(FileEvent)*n));
   memset(file_events, 0, sizeof(FileEvent)*n);
 
   nevent = n;
-  create_file_event(notify_receive_fd, EVENT_READABLE, EventCallbackRef(new C_handle_notify()));
+  create_file_event(notify_receive_fd, EVENT_READABLE, EventCallbackRef(new C_handle_notify(this, cct)));
   return 0;
 }
 
 EventCenter::~EventCenter()
 {
+  if (notify_receive_fd >= 0) {
+    delete_file_event(notify_receive_fd, EVENT_READABLE);
+    ::close(notify_receive_fd);
+  }
+  if (notify_send_fd >= 0)
+    ::close(notify_send_fd);
+    
   delete driver;
-
   if (file_events)
     free(file_events);
-  if (notify_receive_fd > 0)
-    ::close(notify_receive_fd);
-  if (notify_send_fd > 0)
-    ::close(notify_send_fd);
 }
 
 int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
@@ -138,8 +157,13 @@ int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
     return 0;
 
   r = driver->add_event(fd, event->mask, mask);
-  if (r < 0)
+  if (r < 0) {
+    // Actually we don't allow any failed error code, caller doesn't prepare to
+    // handle error status. So now we need to assert failure here. In practice,
+    // add_event shouldn't report error, otherwise it must be a innermost bug!
+    assert(0 == "BUG!");
     return r;
+  }
 
   event->mask |= mask;
   if (mask & EVENT_READABLE) {
@@ -168,7 +192,11 @@ void EventCenter::delete_file_event(int fd, int mask)
   if (!event->mask)
     return ;
 
-  driver->del_event(fd, event->mask, mask);
+  int r = driver->del_event(fd, event->mask, mask);
+  if (r < 0) {
+    // see create_file_event
+    assert(0 == "BUG!");
+  }
 
   if (mask & EVENT_READABLE && event->read_cb) {
     event->read_cb.reset();
@@ -237,13 +265,15 @@ void EventCenter::delete_time_event(uint64_t id)
 
 void EventCenter::wakeup()
 {
-  ldout(cct, 1) << __func__ << dendl;
-  char buf[1];
-  buf[0] = 'c';
-  // wake up "event_wait"
-  int n = write(notify_send_fd, buf, 1);
-  // FIXME ?
-  assert(n == 1);
+  if (already_wakeup.compare_and_swap(0, 1)) {
+    ldout(cct, 1) << __func__ << dendl;
+    char buf[1];
+    buf[0] = 'c';
+    // wake up "event_wait"
+    int n = write(notify_send_fd, buf, 1);
+    // FIXME ?
+    assert(n == 1);
+  }
 }
 
 int EventCenter::process_time_events()
@@ -337,44 +367,55 @@ int EventCenter::process_events(int timeout_microseconds)
   vector<FiredFileEvent> fired_events;
   next_time = shortest;
   numevents = driver->event_wait(fired_events, &tv);
+  file_lock.Lock();
   for (int j = 0; j < numevents; j++) {
     int rfired = 0;
     FileEvent *event;
-    {
-      Mutex::Locker l(file_lock);
-      event = _get_file_event(fired_events[j].fd);
-    }
+    EventCallbackRef cb;
+    event = _get_file_event(fired_events[j].fd);
 
+    // FIXME: Actually we need to pick up some ways to reduce potential
+    // file_lock contention here.
     /* note the event->mask & mask & ... code: maybe an already processed
     * event removed an element that fired and we still didn't
     * processed, so we check if the event is still valid. */
     if (event->mask & fired_events[j].mask & EVENT_READABLE) {
       rfired = 1;
-      event->read_cb->do_request(fired_events[j].fd);
+      cb = event->read_cb;
+      file_lock.Unlock();
+      cb->do_request(fired_events[j].fd);
+      file_lock.Lock();
     }
 
     if (event->mask & fired_events[j].mask & EVENT_WRITABLE) {
-      if (!rfired || event->read_cb != event->write_cb)
-        event->write_cb->do_request(fired_events[j].fd);
+      if (!rfired || event->read_cb != event->write_cb) {
+        cb = event->write_cb;
+        file_lock.Unlock();
+        cb->do_request(fired_events[j].fd);
+        file_lock.Lock();
+      }
     }
 
     ldout(cct, 20) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl;
   }
+  file_lock.Unlock();
 
   if (trigger_time)
     numevents += process_time_events();
 
-  {
-    external_lock.Lock();
-    while (!external_events.empty()) {
-      EventCallbackRef e = external_events.front();
-      external_events.pop_front();
-      external_lock.Unlock();
+  external_lock.Lock();
+  if (external_events.empty()) {
+    external_lock.Unlock();
+  } else {
+    deque<EventCallbackRef> cur_process;
+    cur_process.swap(external_events);
+    external_lock.Unlock();
+    while (!cur_process.empty()) {
+      EventCallbackRef e = cur_process.front();
       if (e)
         e->do_request(0);
-      external_lock.Lock();
+      cur_process.pop_front();
     }
-    external_lock.Unlock();
   }
   return numevents;
 }
diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h
index 729500c..dc85238 100644
--- a/src/msg/async/Event.h
+++ b/src/msg/async/Event.h
@@ -39,6 +39,7 @@
 
 #include <pthread.h>
 
+#include "include/atomic.h"
 #include "include/Context.h"
 #include "include/unordered_map.h"
 #include "common/WorkQueue.h"
@@ -74,7 +75,7 @@ class EventDriver {
   virtual ~EventDriver() {}       // we want a virtual destructor!!!
   virtual int init(int nevent) = 0;
   virtual int add_event(int fd, int cur_mask, int mask) = 0;
-  virtual void del_event(int fd, int cur_mask, int del_mask) = 0;
+  virtual int del_event(int fd, int cur_mask, int del_mask) = 0;
   virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
   virtual int resize_events(int newsize) = 0;
 };
@@ -124,6 +125,8 @@ class EventCenter {
   }
 
  public:
+  atomic_t already_wakeup;
+
   EventCenter(CephContext *c):
     cct(c), nevent(0),
     external_lock("AsyncMessenger::external_lock"),
@@ -131,7 +134,7 @@ class EventCenter {
     time_lock("AsyncMessenger::time_lock"),
     file_events(NULL),
     driver(NULL), time_event_next_id(0),
-    notify_receive_fd(-1), notify_send_fd(-1), net(c), owner(0) {
+    notify_receive_fd(-1), notify_send_fd(-1), net(c), owner(0), already_wakeup(0) {
     last_time = time(NULL);
   }
   ~EventCenter();
diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc
index c33dd3d..fe6e54d 100644
--- a/src/msg/async/EventEpoll.cc
+++ b/src/msg/async/EventEpoll.cc
@@ -71,12 +71,13 @@ int EpollDriver::add_event(int fd, int cur_mask, int add_mask)
   return 0;
 }
 
-void EpollDriver::del_event(int fd, int cur_mask, int delmask)
+int EpollDriver::del_event(int fd, int cur_mask, int delmask)
 {
   ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
                  << " delmask=" << delmask << " to " << epfd << dendl;
   struct epoll_event ee;
   int mask = cur_mask & (~delmask);
+  int r = 0;
 
   ee.events = 0;
   if (mask & EVENT_READABLE) ee.events |= EPOLLIN;
@@ -84,18 +85,21 @@ void EpollDriver::del_event(int fd, int cur_mask, int delmask)
   ee.data.u64 = 0; /* avoid valgrind warning */
   ee.data.fd = fd;
   if (mask != EVENT_NONE) {
-    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee) < 0) {
+    if ((r = epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee)) < 0) {
       lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask
                  << " failed." << cpp_strerror(errno) << dendl;
+      return r;
     }
   } else {
     /* Note, Kernel < 2.6.9 requires a non null event pointer even for
      * EPOLL_CTL_DEL. */
-    if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee) < 0) {
+    if ((r = epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee)) < 0) {
       lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd
                  << " failed." << cpp_strerror(errno) << dendl;
+      return r;
     }
   }
+  return 0;
 }
 
 int EpollDriver::resize_events(int newsize)
diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h
index 6ad32e4..7f01488 100644
--- a/src/msg/async/EventEpoll.h
+++ b/src/msg/async/EventEpoll.h
@@ -29,7 +29,7 @@ class EpollDriver : public EventDriver {
   int size;
 
  public:
-  EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c) {}
+  EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c), size(0) {}
   virtual ~EpollDriver() {
     if (epfd != -1)
       close(epfd);
@@ -40,7 +40,7 @@ class EpollDriver : public EventDriver {
 
   int init(int nevent);
   int add_event(int fd, int cur_mask, int add_mask);
-  void del_event(int fd, int cur_mask, int del_mask);
+  int del_event(int fd, int cur_mask, int del_mask);
   int resize_events(int newsize);
   int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp);
 };
diff --git a/src/msg/async/EventKqueue.cc b/src/msg/async/EventKqueue.cc
index d0056f1..c357a92 100644
--- a/src/msg/async/EventKqueue.cc
+++ b/src/msg/async/EventKqueue.cc
@@ -65,23 +65,26 @@ int KqueueDriver::add_event(int fd, int cur_mask, int add_mask)
   return 0;
 }
 
-void KqueueDriver::del_event(int fd, int cur_mask, int delmask)
+int KqueueDriver::del_event(int fd, int cur_mask, int delmask)
 {
   ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask
                  << " delmask=" << delmask << dendl;
   struct kevent ee;
   struct kevent ke;
   int filter = 0;
+  int r = 0;
   filter |= (delmask & EVENT_READABLE) ? EVFILT_READ : 0;
   filter |= (delmask & EVENT_WRITABLE) ? EVFILT_WRITE : 0;
 
   if (filter) {
     EV_SET(&ke, fd, filter, EV_DELETE, 0, 0, NULL);
-    if (kevent(kqfd, &ke, 1, NULL, 0, NULL) < 0) {
+    if ((r = kevent(kqfd, &ke, 1, NULL, 0, NULL)) < 0) {
       lderr(cct) << __func__ << " kevent: delete fd=" << fd << " mask=" << filter
                  << " failed." << cpp_strerror(errno) << dendl;
+      return r;
     }
   }
+  return 0;
 }
 
 int KqueueDriver::resize_events(int newsize)
diff --git a/src/msg/async/EventKqueue.h b/src/msg/async/EventKqueue.h
index 04169ad..c7f99a7 100644
--- a/src/msg/async/EventKqueue.h
+++ b/src/msg/async/EventKqueue.h
@@ -17,6 +17,7 @@
 #ifndef CEPH_MSG_EVENTKQUEUE_H
 #define CEPH_MSG_EVENTKQUEUE_H
 
+#include <sys/types.h>
 #include <sys/event.h>
 #include <unistd.h>
 
@@ -40,7 +41,7 @@ class KqueueDriver : public EventDriver {
 
   int init(int nevent);
   int add_event(int fd, int cur_mask, int add_mask);
-  void del_event(int fd, int cur_mask, int del_mask);
+  int del_event(int fd, int cur_mask, int del_mask);
   int resize_events(int newsize);
   int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp);
 };
diff --git a/src/msg/async/EventSelect.cc b/src/msg/async/EventSelect.cc
index 34bb3a8..7ee22e8 100644
--- a/src/msg/async/EventSelect.cc
+++ b/src/msg/async/EventSelect.cc
@@ -48,7 +48,7 @@ int SelectDriver::add_event(int fd, int cur_mask, int add_mask)
   return 0;
 }
 
-void SelectDriver::del_event(int fd, int cur_mask, int delmask)
+int SelectDriver::del_event(int fd, int cur_mask, int delmask)
 {
   ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask
                  << dendl;
@@ -57,6 +57,7 @@ void SelectDriver::del_event(int fd, int cur_mask, int delmask)
     FD_CLR(fd, &rfds);
   if (delmask & EVENT_WRITABLE)
     FD_CLR(fd, &wfds);
+  return 0;
 }
 
 int SelectDriver::resize_events(int newsize)
diff --git a/src/msg/async/EventSelect.h b/src/msg/async/EventSelect.h
index 39160b2..96ec322 100644
--- a/src/msg/async/EventSelect.h
+++ b/src/msg/async/EventSelect.h
@@ -31,12 +31,12 @@ class SelectDriver : public EventDriver {
   CephContext *cct;
 
  public:
-  SelectDriver(CephContext *c): cct(c) {}
+  SelectDriver(CephContext *c): max_fd(0), cct(c) {}
   virtual ~SelectDriver() {}
 
   int init(int nevent);
   int add_event(int fd, int cur_mask, int add_mask);
-  void del_event(int fd, int cur_mask, int del_mask);
+  int del_event(int fd, int cur_mask, int del_mask);
   int resize_events(int newsize);
   int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp);
 };
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
index 8e6468c..2639fdc 100644
--- a/src/msg/async/net_handler.cc
+++ b/src/msg/async/net_handler.cc
@@ -116,6 +116,9 @@ int NetHandler::generic_connect(const entity_addr_t& addr, bool nonblock)
       return ret;
     }
   }
+
+  set_socket_options(s);
+
   ret = ::connect(s, (sockaddr*)&addr.addr, addr.addr_size());
   if (ret < 0) {
     if (errno == EINPROGRESS && nonblock)
@@ -126,8 +129,6 @@ int NetHandler::generic_connect(const entity_addr_t& addr, bool nonblock)
     return -errno;
   }
 
-  set_socket_options(s);
-
   return s;
 }
 
diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h
index 9c0f266..bf668e0 100644
--- a/src/msg/msg_types.h
+++ b/src/msg/msg_types.h
@@ -20,7 +20,6 @@
 #include "include/types.h"
 #include "include/blobhash.h"
 #include "include/encoding.h"
-#include "include/hash_namespace.h"
 
 namespace ceph {
   class Formatter;
@@ -39,7 +38,7 @@ public:
   static const int TYPE_OSD = CEPH_ENTITY_TYPE_OSD;
   static const int TYPE_CLIENT = CEPH_ENTITY_TYPE_CLIENT;
 
-  static const int NEW = -1;
+  static const int64_t NEW = -1;
 
   // cons
   entity_name_t() : _type(0), _num(0) { }
@@ -48,10 +47,10 @@ public:
     _type(n.type), _num(n.num) { }
 
   // static cons
-  static entity_name_t MON(int i=NEW) { return entity_name_t(TYPE_MON, i); }
-  static entity_name_t MDS(int i=NEW) { return entity_name_t(TYPE_MDS, i); }
-  static entity_name_t OSD(int i=NEW) { return entity_name_t(TYPE_OSD, i); }
-  static entity_name_t CLIENT(int i=NEW) { return entity_name_t(TYPE_CLIENT, i); }
+  static entity_name_t MON(int64_t i=NEW) { return entity_name_t(TYPE_MON, i); }
+  static entity_name_t MDS(int64_t i=NEW) { return entity_name_t(TYPE_MDS, i); }
+  static entity_name_t OSD(int64_t i=NEW) { return entity_name_t(TYPE_OSD, i); }
+  static entity_name_t CLIENT(int64_t i=NEW) { return entity_name_t(TYPE_CLIENT, i); }
   
   int64_t num() const { return _num; }
   int type() const { return _type; }
@@ -133,7 +132,7 @@ inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr)
   return out << *(const entity_name_t*)&addr;
 }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash< entity_name_t >
   {
     size_t operator()( const entity_name_t &m ) const
@@ -141,7 +140,7 @@ CEPH_HASH_NAMESPACE_START
       return rjhash32(m.type() ^ m.num());
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 
@@ -157,14 +156,21 @@ CEPH_HASH_NAMESPACE_END
  */
 static inline void encode(const sockaddr_storage& a, bufferlist& bl) {
   struct sockaddr_storage ss = a;
-#if !defined(__FreeBSD__)
+#if defined(DARWIN) || defined(__FreeBSD__)
+  unsigned short *ss_family = reinterpret_cast<unsigned short*>(&ss);
+  *ss_family = htons(a.ss_family);
+#else
   ss.ss_family = htons(ss.ss_family);
 #endif
   ::encode_raw(ss, bl);
 }
 static inline void decode(sockaddr_storage& a, bufferlist::iterator& bl) {
   ::decode_raw(a, bl);
-#if !defined(__FreeBSD__)
+#if defined(DARWIN) || defined(__FreeBSD__)
+  unsigned short *ss_family = reinterpret_cast<unsigned short *>(&a);
+  a.ss_family = ntohs(*ss_family);
+  a.ss_len = 0;
+#else
   a.ss_family = ntohs(a.ss_family);
 #endif
 }
@@ -353,7 +359,7 @@ inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return
 inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; }
 inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash< entity_addr_t >
   {
     size_t operator()( const entity_addr_t& x ) const
@@ -362,7 +368,7 @@ CEPH_HASH_NAMESPACE_START
       return H((const char*)&x, sizeof(x));
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 /*
@@ -407,7 +413,7 @@ inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) {
 inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return b < a; }
 inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return b <= a; }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash< entity_inst_t >
   {
     size_t operator()( const entity_inst_t& x ) const
@@ -417,7 +423,7 @@ CEPH_HASH_NAMESPACE_START
       return H(x.name) ^ I(x.addr);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 inline ostream& operator<<(ostream& out, const entity_inst_t &i)
diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc
index 7d989a9..a8aa495 100644
--- a/src/msg/simple/Accepter.cc
+++ b/src/msg/simple/Accepter.cc
@@ -116,6 +116,7 @@ int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
                              << ": " << cpp_strerror(errno)
                              << dendl;
             r = -errno;
+            listen_addr.set_port(0); //Clear port before retry, otherwise we shall fail again.
             continue;
         }
         ldout(msgr->cct,10) << "accepter.bind bound on random port " << listen_addr << dendl;
@@ -140,6 +141,16 @@ int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
     return rc;
   }
   
+  if (msgr->cct->_conf->ms_tcp_rcvbuf) {
+    int size = msgr->cct->_conf->ms_tcp_rcvbuf;
+    rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));
+    if (rc < 0)  {
+      rc = -errno;
+      lderr(msgr->cct) << "accepter.bind failed to set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
+      return rc;
+    }
+  }
+
   ldout(msgr->cct,10) << "accepter.bind bound to " << listen_addr << dendl;
 
   // listen!
diff --git a/src/msg/simple/DispatchQueue.cc b/src/msg/simple/DispatchQueue.cc
index c47ee72..500239f 100644
--- a/src/msg/simple/DispatchQueue.cc
+++ b/src/msg/simple/DispatchQueue.cc
@@ -227,6 +227,17 @@ void DispatchQueue::wait()
   dispatch_thread.join();
 }
 
+void DispatchQueue::discard_local()
+{
+  for (list<pair<Message *, int> >::iterator p = local_messages.begin();
+       p != local_messages.end();
+       ++p) {
+    ldout(cct,20) << __func__ << " " << p->first << dendl;
+    p->first->put();
+  }
+  local_messages.clear();
+}
+
 void DispatchQueue::shutdown()
 {
   // stop my local delivery thread
diff --git a/src/msg/simple/DispatchQueue.h b/src/msg/simple/DispatchQueue.h
index 606b850..d379f55 100644
--- a/src/msg/simple/DispatchQueue.h
+++ b/src/msg/simple/DispatchQueue.h
@@ -180,6 +180,7 @@ class DispatchQueue {
   void fast_preprocess(Message *m);
   void enqueue(Message *m, int priority, uint64_t id);
   void discard_queue(uint64_t id);
+  void discard_local();
   uint64_t get_id() {
     Mutex::Locker l(lock);
     return next_pipe_id++;
diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc
index ab277e0..d148378 100644
--- a/src/msg/simple/Pipe.cc
+++ b/src/msg/simple/Pipe.cc
@@ -12,7 +12,9 @@
  * 
  */
 
+#include <sys/types.h>
 #include <sys/socket.h>
+#include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/tcp.h>
 #include <sys/uio.h>
@@ -32,42 +34,28 @@
 #include "auth/cephx/CephxProtocol.h"
 #include "auth/AuthSessionHandler.h"
 
+#include "include/sock_compat.h"
+
 // Constant to limit starting sequence number to 2^31.  Nothing special about it, just a big number.  PLR
 #define SEQ_MASK  0x7fffffff 
 #define dout_subsys ceph_subsys_ms
 
 #undef dout_prefix
-#define dout_prefix _pipe_prefix(_dout)
-ostream& Pipe::_pipe_prefix(std::ostream *_dout) {
-  return *_dout << "-- " << msgr->get_myinst().addr << " >> " << peer_addr << " pipe(" << this
-		<< " sd=" << sd << " :" << port
-		<< " s=" << state
-		<< " pgs=" << peer_global_seq
-		<< " cs=" << connect_seq
-		<< " l=" << policy.lossy
-		<< " c=" << connection_state
-		<< ").";
+#define dout_prefix *_dout << *this
+ostream& Pipe::_pipe_prefix(std::ostream &out) const {
+  return out << "-- " << msgr->get_myinst().addr << " >> " << peer_addr << " pipe(" << this
+	     << " sd=" << sd << " :" << port
+             << " s=" << state
+             << " pgs=" << peer_global_seq
+             << " cs=" << connect_seq
+             << " l=" << policy.lossy
+             << " c=" << connection_state
+             << ").";
 }
 
-/*
- * This optimization may not be available on all platforms (e.g. OSX).
- * Apparently a similar approach based on TCP_CORK can be used.
- */
-#ifndef MSG_MORE
-# define MSG_MORE 0
-#endif
-
-/*
- * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
- */
-#ifndef MSG_NOSIGNAL
-# define MSG_NOSIGNAL 0
-# ifdef SO_NOSIGPIPE
-#  define CEPH_USE_SO_NOSIGPIPE
-# else
-#  error "Cannot block SIGPIPE!"
-# endif
-#endif
+ostream& operator<<(ostream &out, const Pipe &pipe) {
+  return pipe._pipe_prefix(out);
+}
 
 /**************************************
  * Pipe
@@ -181,7 +169,7 @@ void Pipe::join_reader()
 
 void Pipe::DelayedDelivery::discard()
 {
-  lgeneric_subdout(pipe->msgr->cct, ms, 20) << pipe->_pipe_prefix(_dout) << "DelayedDelivery::discard" << dendl;
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::discard" << dendl;
   Mutex::Locker l(delay_lock);
   while (!delay_queue.empty()) {
     Message *m = delay_queue.front().second;
@@ -193,7 +181,7 @@ void Pipe::DelayedDelivery::discard()
 
 void Pipe::DelayedDelivery::flush()
 {
-  lgeneric_subdout(pipe->msgr->cct, ms, 20) << pipe->_pipe_prefix(_dout) << "DelayedDelivery::flush" << dendl;
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::flush" << dendl;
   Mutex::Locker l(delay_lock);
   flush_count = delay_queue.size();
   delay_cond.Signal();
@@ -202,11 +190,11 @@ void Pipe::DelayedDelivery::flush()
 void *Pipe::DelayedDelivery::entry()
 {
   Mutex::Locker locker(delay_lock);
-  lgeneric_subdout(pipe->msgr->cct, ms, 20) << pipe->_pipe_prefix(_dout) << "DelayedDelivery::entry start" << dendl;
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::entry start" << dendl;
 
   while (!stop_delayed_delivery) {
     if (delay_queue.empty()) {
-      lgeneric_subdout(pipe->msgr->cct, ms, 30) << pipe->_pipe_prefix(_dout) << "DelayedDelivery::entry sleeping on delay_cond because delay queue is empty" << dendl;
+      lgeneric_subdout(pipe->msgr->cct, ms, 30) << *pipe << "DelayedDelivery::entry sleeping on delay_cond because delay queue is empty" << dendl;
       delay_cond.Wait(delay_lock);
       continue;
     }
@@ -216,11 +204,11 @@ void *Pipe::DelayedDelivery::entry()
     if (!flush_count &&
         (release > ceph_clock_now(pipe->msgr->cct) &&
          (delay_msg_type.empty() || m->get_type_name() == delay_msg_type))) {
-      lgeneric_subdout(pipe->msgr->cct, ms, 10) << pipe->_pipe_prefix(_dout) << "DelayedDelivery::entry sleeping on delay_cond until " << release << dendl;
+      lgeneric_subdout(pipe->msgr->cct, ms, 10) << *pipe << "DelayedDelivery::entry sleeping on delay_cond until " << release << dendl;
       delay_cond.WaitUntil(delay_lock, release);
       continue;
     }
-    lgeneric_subdout(pipe->msgr->cct, ms, 10) << pipe->_pipe_prefix(_dout) << "DelayedDelivery::entry dequeuing message " << m << " for delivery, past " << release << dendl;
+    lgeneric_subdout(pipe->msgr->cct, ms, 10) << *pipe << "DelayedDelivery::entry dequeuing message " << m << " for delivery, past " << release << dendl;
     delay_queue.pop_front();
     if (flush_count > 0) {
       --flush_count;
@@ -245,7 +233,7 @@ void *Pipe::DelayedDelivery::entry()
     }
     active_flush = false;
   }
-  lgeneric_subdout(pipe->msgr->cct, ms, 20) << pipe->_pipe_prefix(_dout) << "DelayedDelivery::entry stop" << dendl;
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::entry stop" << dendl;
   return NULL;
 }
 
@@ -850,7 +838,7 @@ void Pipe::set_socket_options()
 
   int prio = msgr->get_socket_priority();
   if (prio >= 0) {
-    int r;
+    int r = -1;
 #ifdef IPTOS_CLASS_CS6
     int iptos = IPTOS_CLASS_CS6;
     r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
@@ -862,7 +850,9 @@ void Pipe::set_socket_options()
     // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
     // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
     // We need to call setsockopt(SO_PRIORITY) after it.
+#if defined(__linux__)
     r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
+#endif
     if (r < 0) {
       ldout(msgr->cct,0) << "couldn't set SO_PRIORITY to " << prio
                          << ": " << cpp_strerror(errno) << dendl;
@@ -909,6 +899,9 @@ int Pipe::connect()
   }
 
   recv_reset();
+
+  set_socket_options();
+
   // connect!
   ldout(msgr->cct,10) << "connecting to " << peer_addr << dendl;
   rc = ::connect(sd, (sockaddr*)&peer_addr.addr, peer_addr.addr_size());
@@ -918,8 +911,6 @@ int Pipe::connect()
     goto fail;
   }
 
-  set_socket_options();
-
   // verify banner
   // FIXME: this should be non-blocking, or in some other way verify the banner as we get it.
   if (tcp_read((char*)&banner, strlen(CEPH_BANNER)) < 0) {
@@ -1796,8 +1787,8 @@ void Pipe::writer()
 	m->encode(features, msgr->crcflags);
 
 	// prepare everything
-	ceph_msg_header& header = m->get_header();
-	ceph_msg_footer& footer = m->get_footer();
+	const ceph_msg_header& header = m->get_header();
+	const ceph_msg_footer& footer = m->get_footer();
 
 	// Now that we have all the crcs calculated, handle the
 	// digital signature for the message, if the pipe has session
@@ -2235,7 +2226,7 @@ int Pipe::write_keepalive2(char tag, const utime_t& t)
 }
 
 
-int Pipe::write_message(ceph_msg_header& header, ceph_msg_footer& footer, bufferlist& blist)
+int Pipe::write_message(const ceph_msg_header& header, const ceph_msg_footer& footer, bufferlist& blist)
 {
   int ret;
 
diff --git a/src/msg/simple/Pipe.h b/src/msg/simple/Pipe.h
index 9b464a5..0c1671a 100644
--- a/src/msg/simple/Pipe.h
+++ b/src/msg/simple/Pipe.h
@@ -134,7 +134,7 @@ class DispatchQueue;
 
     SimpleMessenger *msgr;
     uint64_t conn_id;
-    ostream& _pipe_prefix(std::ostream *_dout);
+    ostream& _pipe_prefix(std::ostream &out) const;
 
     Pipe* get() {
       return static_cast<Pipe*>(RefCountedObject::get());
@@ -230,7 +230,7 @@ class DispatchQueue;
 
     int read_message(Message **pm,
 		     AuthSessionHandler *session_security_copy);
-    int write_message(ceph_msg_header& h, ceph_msg_footer& f, bufferlist& body);
+    int write_message(const ceph_msg_header& h, const ceph_msg_footer& f, bufferlist& body);
     /**
      * Write the given data (of length len) to the Pipe's socket. This function
      * will loop until all passed data has been written out.
diff --git a/src/msg/simple/PipeConnection.h b/src/msg/simple/PipeConnection.h
index 00f6d0e..9e27ec4 100644
--- a/src/msg/simple/PipeConnection.h
+++ b/src/msg/simple/PipeConnection.h
@@ -19,7 +19,7 @@
 
 class Pipe;
 
-struct PipeConnection : public Connection {
+class PipeConnection : public Connection {
   Pipe* pipe;
 
   friend class boost::intrusive_ptr<PipeConnection>;
diff --git a/src/msg/simple/SimpleMessenger.cc b/src/msg/simple/SimpleMessenger.cc
index 38b5d84..fdb7278 100644
--- a/src/msg/simple/SimpleMessenger.cc
+++ b/src/msg/simple/SimpleMessenger.cc
@@ -38,7 +38,7 @@ static ostream& _prefix(std::ostream *_dout, SimpleMessenger *msgr) {
  */
 
 SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name,
-				 string mname, uint64_t _nonce)
+				 string mname, uint64_t _nonce, uint64_t features)
   : SimplePolicyMessenger(cct, name,mname, _nonce),
     accepter(this, _nonce),
     dispatch_queue(cct, this),
@@ -54,6 +54,7 @@ SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name,
     local_connection(new PipeConnection(cct, this))
 {
   ceph_spin_init(&global_seq_lock);
+  local_features = features;
   init_local_connection();
 }
 
@@ -95,6 +96,7 @@ int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest)
 {
   // set envelope
   m->get_header().src = get_myname();
+  m->set_cct(cct);
 
   if (!m->get_priority()) m->set_priority(get_default_send_priority());
  
@@ -532,9 +534,10 @@ void SimpleMessenger::wait()
   }
   lock.Unlock();
 
-  if(dispatch_queue.is_started()) {
+  if (dispatch_queue.is_started()) {
     ldout(cct,10) << "wait: waiting for dispatch queue" << dendl;
     dispatch_queue.wait();
+    dispatch_queue.discard_local();
     ldout(cct,10) << "wait: dispatch queue is stopped" << dendl;
   }
 
@@ -710,5 +713,6 @@ void SimpleMessenger::init_local_connection()
 {
   local_connection->peer_addr = my_inst.addr;
   local_connection->peer_type = my_inst.name.type();
+  local_connection->set_features(local_features);
   ms_deliver_handle_fast_connect(local_connection.get());
 }
diff --git a/src/msg/simple/SimpleMessenger.h b/src/msg/simple/SimpleMessenger.h
index a5adaec..c05ccc6 100644
--- a/src/msg/simple/SimpleMessenger.h
+++ b/src/msg/simple/SimpleMessenger.h
@@ -79,9 +79,10 @@ public:
    * @param name The name to assign ourselves
    * _nonce A unique ID to use for this SimpleMessenger. It should not
    * be a value that will be repeated if the daemon restarts.
+   * features The local features bits for the local_connection
    */
   SimpleMessenger(CephContext *cct, entity_name_t name,
-		  string mname, uint64_t _nonce);
+		  string mname, uint64_t _nonce, uint64_t features);
 
   /**
    * Destroy the SimpleMessenger. Pretty simple since all the work is done
@@ -215,7 +216,7 @@ private:
    * @param type The peer type of the entity at the address.
    * @param con An existing Connection to associate with the new Pipe. If
    * NULL, it creates a new Connection.
-   * @param msg an initial message to queue on the new pipe
+   * @param first an initial message to queue on the new pipe
    *
    * @return a pointer to the newly-created Pipe. Caller does not own a
    * reference; take one if you need it.
@@ -331,6 +332,7 @@ public:
 
   /// con used for sending messages to ourselves
   ConnectionRef local_connection;
+  uint64_t local_features;
 
   /**
    * @defgroup SimpleMessenger internals
@@ -369,7 +371,7 @@ public:
   int get_proto_version(int peer_type, bool connect);
 
   /**
-   * Fill in the address and peer type for the local connection, which
+   * Fill in the features, address and peer type for the local connection, which
    * is used for delivering messages back to ourself.
    */
   void init_local_connection();
diff --git a/src/msg/xio/XioConnection.cc b/src/msg/xio/XioConnection.cc
index a4d44d8..2f59091 100644
--- a/src/msg/xio/XioConnection.cc
+++ b/src/msg/xio/XioConnection.cc
@@ -106,7 +106,7 @@ XioConnection::XioConnection(XioMessenger *m, XioConnection::type _type,
 
   if (policy.throttler_messages) {
     max_msgs = policy.throttler_messages->get_max();
-    ldout(m->cct,0) << "XioMessenger throttle_msgs: " << max_msgs << dendl;
+    ldout(m->cct,4) << "XioMessenger throttle_msgs: " << max_msgs << dendl;
   }
 
   xopt = m->cct->_conf->xio_queue_depth;
@@ -125,7 +125,7 @@ XioConnection::XioConnection(XioMessenger *m, XioConnection::type _type,
 
   if (policy.throttler_bytes) {
     max_bytes = policy.throttler_bytes->get_max();
-    ldout(m->cct,0) << "XioMessenger throttle_bytes: " << max_bytes << dendl;
+    ldout(m->cct,4) << "XioMessenger throttle_bytes: " << max_bytes << dendl;
   }
 
   bytes_opt = (2 << 28); /* default: 512 MB */
@@ -138,7 +138,7 @@ XioConnection::XioConnection(XioMessenger *m, XioConnection::type _type,
   xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_RCV_QUEUE_DEPTH_BYTES,
              &bytes_opt, sizeof(bytes_opt));
 
-  ldout(m->cct,0) << "Peer type: " << peer.name.type_str() <<
+  ldout(m->cct,4) << "Peer type: " << peer.name.type_str() <<
         " throttle_msgs: " << xopt << " throttle_bytes: " << bytes_opt << dendl;
 
   /* XXXX fake features, aieee! */
@@ -178,18 +178,17 @@ int XioConnection::passive_setup()
 
   /* notify hook */
   msgr->ms_deliver_handle_accept(this);
+  msgr->ms_deliver_handle_fast_accept(this);
 
   /* try to insert in conns_entity_map */
   msgr->try_insert(this);
   return (0);
 }
 
-#define uint_to_timeval(tv, s) ((tv).tv_sec = (s), (tv).tv_usec = 0)
-
 static inline XioDispatchHook* pool_alloc_xio_dispatch_hook(
   XioConnection *xcon, Message *m, XioInSeq& msg_seq)
 {
-  struct xio_mempool_obj mp_mem;
+  struct xio_reg_mem mp_mem;
   int e = xpool_alloc(xio_msgr_noreg_mpool,
 		      sizeof(XioDispatchHook), &mp_mem);
   if (!!e)
@@ -211,7 +210,7 @@ int XioConnection::on_msg_req(struct xio_session *session,
 
   if (! in_seq.p()) {
     if (!treq->in.header.iov_len) {
-	derr << __func__ << " empty header: packet out of sequence?" << dendl;
+	ldout(msgr->cct,0) << __func__ << " empty header: packet out of sequence?" << dendl;
 	xio_release_msg(req);
 	return 0;
     }
@@ -247,7 +246,7 @@ int XioConnection::on_msg_req(struct xio_session *session,
   ceph_msg_footer footer;
   buffer::list payload, middle, data;
 
-  struct timeval t1, t2;
+  const utime_t recv_stamp = ceph_clock_now(msgr->cct);
 
   ldout(msgr->cct,4) << __func__ << " " << "msg_seq.size()="  << msg_seq.size() <<
     dendl;
@@ -258,8 +257,6 @@ int XioConnection::on_msg_req(struct xio_session *session,
 		buffer::create_static(treq->in.header.iov_len,
 				      (char*) treq->in.header.iov_base));
 
-  uint_to_timeval(t1, treq->timestamp);
-
   if (magic & (MSG_MAGIC_TRACE_XCON)) {
     if (hdr.hdr->type == 43) {
       print_xio_msg_hdr(msgr->cct, "on_msg_req", hdr, NULL);
@@ -370,8 +367,6 @@ int XioConnection::on_msg_req(struct xio_session *session,
     }
   }
 
-  uint_to_timeval(t2, treq->timestamp);
-
   /* update connection timestamp */
   recv.set(treq->timestamp);
 
@@ -391,8 +386,8 @@ int XioConnection::on_msg_req(struct xio_session *session,
     m->set_magic(magic);
 
     /* update timestamps */
-    m->set_recv_stamp(t1);
-    m->set_recv_complete_stamp(t2);
+    m->set_recv_stamp(recv_stamp);
+    m->set_recv_complete_stamp(ceph_clock_now(msgr->cct));
     m->set_seq(header.seq);
 
     /* MP-SAFE */
@@ -526,8 +521,7 @@ int XioConnection::discard_input_queue(uint32_t flags)
     pthread_spin_unlock(&sp);
 
   // mqueue
-  int ix, q_size =  disc_q.size();
-  for (ix = 0; ix < q_size; ++ix) {
+  while (!disc_q.empty()) {
     Message::Queue::iterator q_iter = disc_q.begin();
     Message* m = &(*q_iter);
     disc_q.erase(q_iter);
@@ -535,16 +529,25 @@ int XioConnection::discard_input_queue(uint32_t flags)
   }
 
   // requeue
-  q_size =  deferred_q.size();
-  for (ix = 0; ix < q_size; ++ix) {
+  while (!deferred_q.empty()) {
     XioSubmit::Queue::iterator q_iter = deferred_q.begin();
     XioSubmit* xs = &(*q_iter);
-    assert(xs->type == XioSubmit::OUTGOING_MSG);
-    XioMsg* xmsg = static_cast<XioMsg*>(xs);
-    deferred_q.erase(q_iter);
-    // release once for each chained xio_msg
-    for (ix = 0; ix < int(xmsg->hdr.msg_cnt); ++ix)
-      xmsg->put();
+    XioMsg* xmsg;
+    switch (xs->type) {
+      case XioSubmit::OUTGOING_MSG:
+	xmsg = static_cast<XioMsg*>(xs);
+	deferred_q.erase(q_iter);
+	// release once for each chained xio_msg
+	xmsg->put(xmsg->hdr.msg_cnt);
+	break;
+      case XioSubmit::INCOMING_MSG_RELEASE:
+	deferred_q.erase(q_iter);
+	portal->release_xio_rsp(static_cast<XioRsp*>(xs));
+	break;
+      default:
+	ldout(msgr->cct,0) << __func__ << ": Unknown Msg type " << xs->type << dendl;
+	break;
+    }
   }
 
   return 0;
@@ -657,9 +660,8 @@ int XioConnection::CState::state_discon()
   return 0;
 }
 
-int XioConnection::CState::state_flow_controlled(uint32_t flags) {
-  dout(11) << __func__ << " ENTER " << dendl;
-
+int XioConnection::CState::state_flow_controlled(uint32_t flags)
+{
   if (! (flags & OP_FLAG_LOCKED))
     pthread_spin_lock(&xcon->sp);
 
diff --git a/src/msg/xio/XioConnection.h b/src/msg/xio/XioConnection.h
index 6ab116d..9a5f615 100644
--- a/src/msg/xio/XioConnection.h
+++ b/src/msg/xio/XioConnection.h
@@ -228,8 +228,6 @@ private:
     connected.set(false);
     pthread_spin_lock(&sp);
     discard_input_queue(CState::OP_FLAG_LOCKED);
-    if (!conn)
-      this->put();
     pthread_spin_unlock(&sp);
     return 0;
   }
diff --git a/src/msg/xio/XioMessenger.cc b/src/msg/xio/XioMessenger.cc
index de292c2..d651800 100644
--- a/src/msg/xio/XioMessenger.cc
+++ b/src/msg/xio/XioMessenger.cc
@@ -138,8 +138,8 @@ static int on_msg(struct xio_session *session,
 
   ldout(cct,25) << "on_msg session " << session << " xcon " << xcon << dendl;
 
-  static uint32_t nreqs;
   if (unlikely(XioPool::trace_mempool)) {
+    static uint32_t nreqs;
     if (unlikely((++nreqs % 65536) == 0)) {
       xp_stats.dump(__func__, nreqs);
     }
@@ -212,10 +212,12 @@ static int on_cancel_request(struct xio_session *session,
 }
 
 /* free functions */
-static string xio_uri_from_entity(const entity_addr_t& addr, bool want_port)
+static string xio_uri_from_entity(const string &type,
+				  const entity_addr_t& addr, bool want_port)
 {
   const char *host = NULL;
   char addr_buf[129];
+  string xio_uri;
 
   switch(addr.addr.ss_family) {
   case AF_INET:
@@ -231,8 +233,12 @@ static string xio_uri_from_entity(const entity_addr_t& addr, bool want_port)
     break;
   };
 
+  if (type == "rdma" || type == "tcp")
+      xio_uri = type + "://";
+  else
+      xio_uri = "rdma://";
+
   /* The following can only succeed if the host is rdma-capable */
-  string xio_uri = "rdma://";
   xio_uri += host;
   if (want_port) {
     xio_uri += ":";
@@ -244,9 +250,9 @@ static string xio_uri_from_entity(const entity_addr_t& addr, bool want_port)
 
 /* XioMessenger */
 XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
-			   string mname, uint64_t nonce,
+			   string mname, uint64_t _nonce, uint64_t features,
 			   DispatchStrategy *ds)
-  : SimplePolicyMessenger(cct, name, mname, nonce),
+  : SimplePolicyMessenger(cct, name, mname, _nonce),
     nsessions(0),
     shutdown_called(false),
     portals(this, cct->_conf->xio_portal_threads),
@@ -254,7 +260,10 @@ XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
     loop_con(new XioLoopbackConnection(this)),
     special_handling(0),
     sh_mtx("XioMessenger session mutex"),
-    sh_cond()
+    sh_cond(),
+    need_addr(true),
+    did_bind(false),
+    nonce(_nonce)
 {
 
   if (cct->_conf->xio_trace_xcon)
@@ -303,22 +312,22 @@ XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
                  &xopt, sizeof(xopt));
 
       /* and set threshold for buffer callouts */
-      xopt = 16384;
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_DATA,
+      xopt = max(cct->_conf->xio_max_send_inline, 512);
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_DATA,
                  &xopt, sizeof(xopt));
       xopt = 216;
-      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_HEADER,
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_HEADER,
                  &xopt, sizeof(xopt));
 
       struct xio_mempool_config mempool_config = {
         6,
         {
-          {1024,  0,  4096,  262144},
-          {4096,  0,  4096,  262144},
-          {16384, 0,  4096,  262144},
-          {65536, 0,  1024,  65536},
-          {262144, 0,  512,  16384},
-          {1048576, 0, 128,  8192}
+          {1024,  0,  cct->_conf->xio_queue_depth,  262144},
+          {4096,  0,  cct->_conf->xio_queue_depth,  262144},
+          {16384, 0,  cct->_conf->xio_queue_depth,  262144},
+          {65536, 0,  128,  65536},
+          {262144, 0,  32,  16384},
+          {1048576, 0, 8,  8192}
         }
       };
       xio_set_opt(NULL,
@@ -332,22 +341,22 @@ XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
 	xio_mempool_create(-1 /* nodeid */,
 			   XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC);
 
-      (void) xio_mempool_add_allocator(xio_msgr_noreg_mpool, 64,
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 64,
 				       cct->_conf->xio_mp_min,
 				       cct->_conf->xio_mp_max_64,
-				       XMSG_MEMPOOL_QUANTUM);
-      (void) xio_mempool_add_allocator(xio_msgr_noreg_mpool, 256,
+				       XMSG_MEMPOOL_QUANTUM, 0);
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 256,
 				       cct->_conf->xio_mp_min,
 				       cct->_conf->xio_mp_max_256,
-				       XMSG_MEMPOOL_QUANTUM);
-      (void) xio_mempool_add_allocator(xio_msgr_noreg_mpool, 1024,
+				       XMSG_MEMPOOL_QUANTUM, 0);
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 1024,
 				       cct->_conf->xio_mp_min,
 				       cct->_conf->xio_mp_max_1k,
-				       XMSG_MEMPOOL_QUANTUM);
-      (void) xio_mempool_add_allocator(xio_msgr_noreg_mpool, getpagesize(),
+				       XMSG_MEMPOOL_QUANTUM, 0);
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, getpagesize(),
 				       cct->_conf->xio_mp_min,
 				       cct->_conf->xio_mp_max_page,
-				       XMSG_MEMPOOL_QUANTUM);
+				       XMSG_MEMPOOL_QUANTUM, 0);
 
       /* initialize ops singleton */
       xio_msgr_ops.on_session_event = on_session_event;
@@ -370,6 +379,9 @@ XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
   /* update class instance count */
   nInstances.inc();
 
+  local_features = features;
+  loop_con->set_features(features);
+
 } /* ctor */
 
 int XioMessenger::pool_hint(uint32_t dsize) {
@@ -377,9 +389,33 @@ int XioMessenger::pool_hint(uint32_t dsize) {
     return 0;
 
   /* if dsize is already present, returns -EEXIST */
-  return xio_mempool_add_allocator(xio_msgr_noreg_mpool, dsize, 0,
+  return xio_mempool_add_slab(xio_msgr_noreg_mpool, dsize, 0,
 				   cct->_conf->xio_mp_max_hint,
-				   XMSG_MEMPOOL_QUANTUM);
+				   XMSG_MEMPOOL_QUANTUM, 0);
+}
+
+void XioMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // be careful here: multiple threads may block here, and readers of
+  // my_inst.addr do NOT hold any lock.
+
+  // this always goes from true -> false under the protection of the
+  // mutex.  if it is already false, we need not retake the mutex at
+  // all.
+  if (!need_addr)
+    return;
+
+  sh_mtx.Lock();
+  if (need_addr) {
+    entity_addr_t t = peer_addr_for_me;
+    t.set_port(my_inst.addr.get_port());
+    my_inst.addr.addr = t.addr;
+    ldout(cct,2) << "learned my addr " << my_inst.addr << dendl;
+    need_addr = false;
+    // init_local_connection();
+  }
+  sh_mtx.Unlock();
+
 }
 
 int XioMessenger::new_session(struct xio_session *session,
@@ -404,27 +440,46 @@ int XioMessenger::session_event(struct xio_session *session,
 
   switch (event_data->event) {
   case XIO_SESSION_CONNECTION_ESTABLISHED_EVENT:
+  {
+    struct xio_connection *conn = event_data->conn;
+    struct xio_connection_attr xcona;
+    entity_addr_t peer_addr_for_me, paddr;
+
     xcon = static_cast<XioConnection*>(event_data->conn_user_context);
 
     ldout(cct,2) << "connection established " << event_data->conn
       << " session " << session << " xcon " << xcon << dendl;
 
+    (void) xio_query_connection(conn, &xcona,
+				XIO_CONNECTION_ATTR_LOCAL_ADDR|
+				XIO_CONNECTION_ATTR_PEER_ADDR);
+    (void) entity_addr_from_sockaddr(&peer_addr_for_me, (struct sockaddr *) &xcona.local_addr);
+    (void) entity_addr_from_sockaddr(&paddr, (struct sockaddr *) &xcona.peer_addr);
+    //set_myaddr(peer_addr_for_me);
+    learned_addr(peer_addr_for_me);
+    ldout(cct,2) << "client: connected from " << peer_addr_for_me << " to " << paddr << dendl;
+
     /* notify hook */
     this->ms_deliver_handle_connect(xcon);
-    break;
+    this->ms_deliver_handle_fast_connect(xcon);
+  }
+  break;
 
   case XIO_SESSION_NEW_CONNECTION_EVENT:
   {
     struct xio_connection *conn = event_data->conn;
     struct xio_connection_attr xcona;
     entity_inst_t s_inst;
+    entity_addr_t peer_addr_for_me;
 
     (void) xio_query_connection(conn, &xcona,
 				XIO_CONNECTION_ATTR_CTX|
-				XIO_CONNECTION_ATTR_PEER_ADDR);
+				XIO_CONNECTION_ATTR_PEER_ADDR|
+				XIO_CONNECTION_ATTR_LOCAL_ADDR);
     /* XXX assumes RDMA */
     (void) entity_addr_from_sockaddr(&s_inst.addr,
 				     (struct sockaddr *) &xcona.peer_addr);
+    (void) entity_addr_from_sockaddr(&peer_addr_for_me, (struct sockaddr *) &xcona.local_addr);
 
     xcon = new XioConnection(this, XioConnection::PASSIVE, s_inst);
     xcon->session = session;
@@ -454,13 +509,10 @@ int XioMessenger::session_event(struct xio_session *session,
 
     ldout(cct,2) << "new connection session " << session
 		 << " xcon " << xcon << dendl;
+    ldout(cct,2) << "server: connected from " << s_inst.addr << " to " << peer_addr_for_me << dendl;
   }
   break;
   case XIO_SESSION_CONNECTION_ERROR_EVENT:
-    ldout(cct,2) << xio_session_event_types[event_data->event]
-      << " user_context " << event_data->conn_user_context << dendl;
-    /* informational (Eyal)*/
-    break;
   case XIO_SESSION_CONNECTION_CLOSED_EVENT: /* orderly discon */
   case XIO_SESSION_CONNECTION_DISCONNECTED_EVENT: /* unexpected discon */
   case XIO_SESSION_CONNECTION_REFUSED_EVENT:
@@ -477,11 +529,13 @@ int XioMessenger::session_event(struct xio_session *session,
 	  conns_entity_map.erase(conn_iter);
 	}
       }
-      /* now find xcon on conns_list, erase, and release sentinel ref */
-      XioConnection::ConnList::iterator citer =
-	XioConnection::ConnList::s_iterator_to(*xcon);
-      /* XXX check if citer on conn_list? */
-      conns_list.erase(citer);
+      /* check if citer on conn_list */
+      if (xcon->conns_hook.is_linked()) {
+        /* now find xcon on conns_list and erase */
+        XioConnection::ConnList::iterator citer =
+            XioConnection::ConnList::s_iterator_to(*xcon);
+        conns_list.erase(citer);
+      }
       xcon->on_disconnect_event();
     }
     break;
@@ -525,7 +579,7 @@ xio_count_buffers(buffer::list& bl, int& req_size, int& msg_off, int& req_off)
 
   const std::list<buffer::ptr>& buffers = bl.buffers();
   list<bufferptr>::const_iterator pb;
-  size_t size, off, count;
+  size_t size, off;
   int result;
   int first = 1;
 
@@ -541,7 +595,7 @@ xio_count_buffers(buffer::list& bl, int& req_size, int& msg_off, int& req_off)
       size = pb->length();
       first = 0;
     }
-    count = size - off;
+    size_t count = size - off;
     if (!count) continue;
     if (req_size + count > MAX_XIO_BUF_SIZE) {
 	count = MAX_XIO_BUF_SIZE - req_size;
@@ -573,7 +627,7 @@ xio_place_buffers(buffer::list& bl, XioMsg *xmsg, struct xio_msg*& req,
   const std::list<buffer::ptr>& buffers = bl.buffers();
   list<bufferptr>::const_iterator pb;
   struct xio_iovec_ex* iov;
-  size_t size, off, count;
+  size_t size, off;
   const char *data = NULL;
   int first = 1;
 
@@ -589,7 +643,7 @@ xio_place_buffers(buffer::list& bl, XioMsg *xmsg, struct xio_msg*& req,
       data = pb->c_str();	 // is c_str() efficient?
       first = 0;
     }
-    count = size - off;
+    size_t count = size - off;
     if (!count) continue;
     if (req_size + count > MAX_XIO_BUF_SIZE) {
 	count = MAX_XIO_BUF_SIZE - req_size;
@@ -605,7 +659,7 @@ xio_place_buffers(buffer::list& bl, XioMsg *xmsg, struct xio_msg*& req,
       //break;
     default:
     {
-      struct xio_mempool_obj *mp = get_xio_mp(*pb);
+      struct xio_reg_mem *mp = get_xio_mp(*pb);
       iov->mr = (mp) ? mp->mr : NULL;
     }
       break;
@@ -644,32 +698,34 @@ int XioMessenger::bind(const entity_addr_t& addr)
   if (a->is_blank_ip()) {
     a = &_addr;
     std::vector <std::string> my_sections;
-    g_conf->get_my_sections(my_sections);
+    cct->_conf->get_my_sections(my_sections);
     std::string rdma_local_str;
-    if (g_conf->get_val_from_conf_file(my_sections, "rdma local",
+    if (cct->_conf->get_val_from_conf_file(my_sections, "rdma local",
 				      rdma_local_str, true) == 0) {
       struct entity_addr_t local_rdma_addr;
       local_rdma_addr = *a;
       const char *ep;
       if (!local_rdma_addr.parse(rdma_local_str.c_str(), &ep)) {
-	derr << "ERROR:  Cannot parse rdma local: " << rdma_local_str << dendl;
+	ldout(cct,0) << "ERROR:  Cannot parse rdma local: " << rdma_local_str << dendl;
 	return -EINVAL;
       }
       if (*ep) {
-	derr << "WARNING: 'rdma local trailing garbage ignored: '" << ep << dendl;
+	ldout(cct,0) << "WARNING: 'rdma local trailing garbage ignored: '" << ep << dendl;
       }
+      ldout(cct, 2) << "Found rdma_local address " << rdma_local_str.c_str() << dendl;
       int p = _addr.get_port();
       _addr.set_sockaddr(reinterpret_cast<struct sockaddr *>(
 			  &local_rdma_addr.ss_addr()));
       _addr.set_port(p);
     } else {
-      derr << "WARNING: need 'rdma local' config for remote use!" <<dendl;
+      ldout(cct,0) << "WARNING: need 'rdma local' config for remote use!" <<dendl;
     }
   }
 
   entity_addr_t shift_addr = *a;
 
-  string base_uri = xio_uri_from_entity(shift_addr, false /* want_port */);
+  string base_uri = xio_uri_from_entity(cct->_conf->xio_transport_type,
+					shift_addr, false /* want_port */);
   ldout(cct,4) << "XioMessenger " << this << " bind: xio_uri "
     << base_uri << ':' << shift_addr.get_port() << dendl;
 
@@ -677,7 +733,9 @@ int XioMessenger::bind(const entity_addr_t& addr)
   int r = portals.bind(&xio_msgr_ops, base_uri, shift_addr.get_port(), &port0);
   if (r == 0) {
     shift_addr.set_port(port0);
+    shift_addr.nonce = nonce;
     set_myaddr(shift_addr);
+    did_bind = true;
   }
   return r;
 } /* bind */
@@ -692,6 +750,9 @@ int XioMessenger::start()
 {
   portals.start();
   dispatch_strategy->start();
+  if (!did_bind) {
+	  my_inst.addr.nonce = nonce;
+  }
   started = true;
   return 0;
 }
@@ -714,7 +775,7 @@ int XioMessenger::_send_message(Message *m, const entity_inst_t& dest)
 static inline XioMsg* pool_alloc_xio_msg(Message *m, XioConnection *xcon,
   int ex_cnt)
 {
-  struct xio_mempool_obj mp_mem;
+  struct xio_reg_mem mp_mem;
   int e = xpool_alloc(xio_msgr_noreg_mpool, sizeof(XioMsg), &mp_mem);
   if (!!e)
     return NULL;
@@ -754,9 +815,9 @@ int XioMessenger::_send_message_impl(Message* m, XioConnection* xcon)
 {
   int code = 0;
 
-  static uint32_t nreqs;
   Mutex::Locker l(xcon->lock);
   if (unlikely(XioPool::trace_mempool)) {
+    static uint32_t nreqs;
     if (unlikely((++nreqs % 65536) == 0)) {
       xp_stats.dump(__func__, nreqs);
     }
@@ -890,6 +951,7 @@ int XioMessenger::shutdown()
   }
   portals.shutdown();
   dispatch_strategy->shutdown();
+  did_bind = false;
   started = false;
   return 0;
 } /* shutdown */
@@ -915,7 +977,8 @@ ConnectionRef XioMessenger::get_connection(const entity_inst_t& dest)
   }
   else {
     conns_sp.unlock();
-    string xio_uri = xio_uri_from_entity(dest.addr, true /* want_port */);
+    string xio_uri = xio_uri_from_entity(cct->_conf->xio_transport_type,
+					 dest.addr, true /* want_port */);
 
     ldout(cct,4) << "XioMessenger " << this << " get_connection: xio_uri "
       << xio_uri << dendl;
@@ -1008,7 +1071,7 @@ void XioMessenger::mark_down_all()
 static inline XioMarkDownHook* pool_alloc_markdown_hook(
   XioConnection *xcon, Message *m)
 {
-  struct xio_mempool_obj mp_mem;
+  struct xio_reg_mem mp_mem;
   int e = xio_mempool_alloc(xio_msgr_noreg_mpool,
 			    sizeof(XioMarkDownHook), &mp_mem);
   if (!!e)
diff --git a/src/msg/xio/XioMessenger.h b/src/msg/xio/XioMessenger.h
index fe947fa..d6bb5b5 100644
--- a/src/msg/xio/XioMessenger.h
+++ b/src/msg/xio/XioMessenger.h
@@ -43,12 +43,17 @@ private:
   uint32_t special_handling;
   Mutex sh_mtx;
   Cond sh_cond;
+  bool need_addr;
+  bool did_bind;
+
+  /// approximately unique ID set by the Constructor for use in entity_addr_t
+  uint64_t nonce;
 
   friend class XioConnection;
 
 public:
   XioMessenger(CephContext *cct, entity_name_t name,
-	       string mname, uint64_t nonce,
+	       string mname, uint64_t nonce, uint64_t features,
 	       DispatchStrategy* ds = new QueueStrategy(1));
 
   virtual ~XioMessenger();
@@ -132,11 +137,21 @@ public:
   void ds_dispatch(Message *m)
     { dispatch_strategy->ds_dispatch(m); }
 
+  /**
+   * Tell the XioMessenger its full IP address.
+   *
+   * This is used by clients when connecting to other endpoints, and
+   * probably shouldn't be called by anybody else.
+   */
+  void learned_addr(const entity_addr_t& peer_addr_for_me);
+
+
 protected:
   virtual void ready()
     { }
 
 public:
+  uint64_t local_features;
 };
 
 #endif /* XIO_MESSENGER_H */
diff --git a/src/msg/xio/XioMsg.h b/src/msg/xio/XioMsg.h
index 21bb62c..68d8ebe 100644
--- a/src/msg/xio/XioMsg.h
+++ b/src/msg/xio/XioMsg.h
@@ -183,11 +183,11 @@ public:
   XioMsgHdr hdr;
   xio_msg_ex req_0;
   xio_msg_ex* req_arr;
-  struct xio_mempool_obj mp_this;
+  struct xio_reg_mem mp_this;
   atomic_t nrefs;
 
 public:
-  XioMsg(Message *_m, XioConnection *_xcon, struct xio_mempool_obj& _mp,
+  XioMsg(Message *_m, XioConnection *_xcon, struct xio_reg_mem& _mp,
 	 int _ex_cnt) :
     XioSubmit(XioSubmit::OUTGOING_MSG, _xcon),
     m(_m), hdr(m->get_header(), m->get_footer()),
@@ -215,7 +215,7 @@ public:
   void put(int n) {
     int refs = nrefs.sub(n);
     if (refs == 0) {
-      struct xio_mempool_obj *mp = &this->mp_this;
+      struct xio_reg_mem *mp = &this->mp_this;
       this->~XioMsg();
       xpool_free(sizeof(XioMsg), mp);
     }
@@ -281,10 +281,10 @@ private:
   friend class XioConnection;
   friend class XioMessenger;
 public:
-  struct xio_mempool_obj mp_this;
+  struct xio_reg_mem mp_this;
 
   XioDispatchHook(XioConnection *_xcon, Message *_m, XioInSeq& _msg_seq,
-		    struct xio_mempool_obj& _mp) :
+		    struct xio_reg_mem& _mp) :
     CompletionHook(_m),
     xcon(_xcon->get()),
     msg_seq(_msg_seq),
@@ -319,7 +319,7 @@ public:
        */
       if (!cl_flag && release_msgs())
 	return;
-      struct xio_mempool_obj *mp = &this->mp_this;
+      struct xio_reg_mem *mp = &this->mp_this;
       this->~XioDispatchHook();
       xpool_free(sizeof(XioDispatchHook), mp);
     }
@@ -351,10 +351,10 @@ private:
   XioConnection* xcon;
 
 public:
-  struct xio_mempool_obj mp_this;
+  struct xio_reg_mem mp_this;
 
   XioMarkDownHook(
-    XioConnection* _xcon, Message *_m, struct xio_mempool_obj& _mp) :
+    XioConnection* _xcon, Message *_m, struct xio_reg_mem& _mp) :
     CompletionHook(_m), xcon(_xcon->get()), mp_this(_mp)
     { }
 
@@ -362,7 +362,7 @@ public:
 
   virtual void finish(int r) {
     xcon->put();
-    struct xio_mempool_obj *mp = &this->mp_this;
+    struct xio_reg_mem *mp = &this->mp_this;
     this->~XioMarkDownHook();
     xio_mempool_free(mp);
   }
diff --git a/src/msg/xio/XioPool.cc b/src/msg/xio/XioPool.cc
index 8b13c5b..5f0d77a 100644
--- a/src/msg/xio/XioPool.cc
+++ b/src/msg/xio/XioPool.cc
@@ -12,6 +12,7 @@
  *
  */
 
+#include <iostream>
 #include "XioPool.h"
 
 XioPoolStats xp_stats;
@@ -19,4 +20,22 @@ XioPoolStats xp_stats;
 bool XioPool::trace_mempool = 0;
 bool XioPool::trace_msgcnt = 0;
 
-
+void XioPoolStats::dump(const char* tag, uint64_t serial)
+{
+  std::cout
+    << tag << " #" << serial << ": "
+    << "pool objs: "
+    << "64: " << ctr_set[SLAB_64].read() << " "
+    << "256: " << ctr_set[SLAB_256].read() << " "
+    << "1024: " << ctr_set[SLAB_1024].read() << " "
+    << "page: " << ctr_set[SLAB_PAGE].read() << " "
+    << "max: " << ctr_set[SLAB_MAX].read() << " "
+    << "overflow: " << ctr_set[SLAB_OVERFLOW].read() << " "
+    << std::endl;
+  std::cout
+    << tag << " #" << serial << ": "
+    << " msg objs: "
+    << "in: " << hook_cnt.read() << " "
+    << "out: " << msg_cnt.read() << " "
+    << std::endl;
+}
diff --git a/src/msg/xio/XioPool.h b/src/msg/xio/XioPool.h
index 94fdc57..c8e7b87 100644
--- a/src/msg/xio/XioPool.h
+++ b/src/msg/xio/XioPool.h
@@ -20,15 +20,14 @@ extern "C" {
 #include <stdint.h>
 #include "libxio.h"
 }
-#include <iostream>
 #include <vector>
 #include "include/atomic.h"
 #include "common/likely.h"
 
 
 static inline int xpool_alloc(struct xio_mempool *pool, uint64_t size,
-			      struct xio_mempool_obj* mp);
-static inline void xpool_free(uint64_t size, struct xio_mempool_obj* mp);
+			      struct xio_reg_mem* mp);
+static inline void xpool_free(uint64_t size, struct xio_reg_mem* mp);
 
 using ceph::atomic_t;
 
@@ -43,7 +42,7 @@ public:
   static const int MB = 8;
 
   struct xio_piece {
-    struct xio_mempool_obj mp[1];
+    struct xio_reg_mem mp[1];
     struct xio_piece *next;
     int s;
     char payload[MB];
@@ -67,7 +66,7 @@ public:
   void *alloc(size_t _s)
     {
 	void *r;
-	struct xio_mempool_obj mp[1];
+	struct xio_reg_mem mp[1];
 	struct xio_piece *x;
 	int e = xpool_alloc(handle, (sizeof(struct xio_piece)-MB) + _s, mp);
 	if (e) {
@@ -91,38 +90,24 @@ private:
     SLAB_256,
     SLAB_1024,
     SLAB_PAGE,
-    SLAB_MAX
+    SLAB_MAX,
+    SLAB_OVERFLOW,
+    NUM_SLABS,
   };
 
-  atomic_t ctr_set[5];
+  atomic_t ctr_set[NUM_SLABS];
 
   atomic_t msg_cnt;  // send msgs
   atomic_t hook_cnt; // recv msgs
 
 public:
   XioPoolStats() : msg_cnt(0), hook_cnt(0) {
-    for (int ix = 0; ix < 5; ++ix) {
+    for (int ix = 0; ix < NUM_SLABS; ++ix) {
       ctr_set[ix].set(0);
     }
   }
 
-  void dump(const char* tag, uint64_t serial) {
-    std::cout
-      << tag << " #" << serial << ": "
-      << "pool objs: "
-      << "64: " << ctr_set[SLAB_64].read() << " "
-      << "256: " << ctr_set[SLAB_256].read() << " "
-      << "1024: " << ctr_set[SLAB_1024].read() << " "
-      << "page: " << ctr_set[SLAB_PAGE].read() << " "
-      << "max: " << ctr_set[SLAB_MAX].read() << " "
-      << std::endl;
-    std::cout
-      << tag << " #" << serial << ": "
-      << " msg objs: "
-      << "in: " << hook_cnt.read() << " "
-      << "out: " << msg_cnt.read() << " "
-      << std::endl;
-  }
+  void dump(const char* tag, uint64_t serial);
 
   void inc(uint64_t size) {
     if (size <= 64) {
@@ -164,6 +149,9 @@ public:
     (ctr_set[SLAB_MAX]).dec();
   }
 
+  void inc_overflow() { ctr_set[SLAB_OVERFLOW].inc(); }
+  void dec_overflow() { ctr_set[SLAB_OVERFLOW].dec(); }
+
   void inc_msgcnt() {
     if (unlikely(XioPool::trace_msgcnt)) {
       msg_cnt.inc();
@@ -192,18 +180,35 @@ public:
 extern XioPoolStats xp_stats;
 
 static inline int xpool_alloc(struct xio_mempool *pool, uint64_t size,
-			      struct xio_mempool_obj* mp)
+			      struct xio_reg_mem* mp)
 {
+  // try to allocate from the xio pool
+  int r = xio_mempool_alloc(pool, size, mp);
+  if (r == 0) {
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.inc(size);
+    return 0;
+  }
+  // fall back to malloc on errors
+  mp->addr = malloc(size);
+  assert(mp->addr);
+  mp->length = 0;
   if (unlikely(XioPool::trace_mempool))
-    xp_stats.inc(size);
-  return xio_mempool_alloc(pool, size, mp);
+    xp_stats.inc_overflow();
+  return 0;
 }
 
-static inline void xpool_free(uint64_t size, struct xio_mempool_obj* mp)
+static inline void xpool_free(uint64_t size, struct xio_reg_mem* mp)
 {
- if (unlikely(XioPool::trace_mempool))
-    xp_stats.dec(size);
-  xio_mempool_free(mp);
+  if (mp->length) {
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.dec(size);
+    xio_mempool_free(mp);
+  } else { // from malloc
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.dec_overflow();
+    free(mp->addr);
+  }
 }
 
 #define xpool_inc_msgcnt() \
diff --git a/src/msg/xio/XioPortal.h b/src/msg/xio/XioPortal.h
index aaa26f9..cb711c6 100644
--- a/src/msg/xio/XioPortal.h
+++ b/src/msg/xio/XioPortal.h
@@ -67,7 +67,7 @@ private:
 
     inline Lane* get_lane(XioConnection *xcon)
       {
-	return &qlane[((uint64_t) xcon) % nlanes];
+	return &qlane[(((uint64_t) xcon) / 16) % nlanes];
       }
 
     void enq(XioConnection *xcon, XioSubmit* xs)
@@ -159,17 +159,16 @@ public:
     struct xio_msg *msg = xrsp->dequeue();
     struct xio_msg *next_msg = NULL;
     int code;
-    while (msg) {
+    if (unlikely(!xrsp->xcon->conn)) {
+      // NOTE: msg is not safe to dereference if the connection was torn down
+      xrsp->xcon->msg_release_fail(msg, ENOTCONN);
+    }
+    else while (msg) {
       next_msg = static_cast<struct xio_msg *>(msg->user_context);
-      if (unlikely(!xrsp->xcon->conn || !xrsp->xcon->is_connected()))
-        code = ENOTCONN;
-      else
-        code = xio_release_msg(msg);
-      if (unlikely(code)) {
-	/* very unlikely, so log it */
+      code = xio_release_msg(msg);
+      if (unlikely(code)) /* very unlikely, so log it */
 	xrsp->xcon->msg_release_fail(msg, code);
-      }
-      msg =  next_msg;
+      msg = next_msg;
     }
     xrsp->finalize(); /* unconditional finalize */
   }
@@ -208,15 +207,15 @@ public:
     // and push them in FIFO order to front of the input queue,
     // and mark the connection as flow-controlled
     XioSubmit::Queue requeue_q;
-    XioSubmit *xs;
     XioMsg *xmsg;
 
     while (q_iter != send_q.end()) {
-      xs = &(*q_iter);
+      XioSubmit *xs = &(*q_iter);
       // skip retires and anything for other connections
-      if ((xs->type != XioSubmit::OUTGOING_MSG) ||
-	  (xs->xcon != xcon))
+      if (xs->xcon != xcon) {
+	q_iter++;
 	continue;
+      }
       xmsg = static_cast<XioMsg*>(xs);
       q_iter = send_q.erase(q_iter);
       requeue_q.push_back(*xmsg);
@@ -284,8 +283,19 @@ public:
 		  print_ceph_msg(msgr->cct, "xio_send_msg", xmsg->m);
 		}
 		/* get the right Accelio's errno code */
-		if (unlikely(code))
-		  code = xio_errno();
+		if (unlikely(code)) {
+		  if ((code == -1) && (xio_errno() == -1)) {
+		    /* In case XIO does not have any credits to send,
+		     * it would still queue up the message(s) for transmission,
+		     * but would return -1 and errno would also be set to -1.
+		     * This needs to be treated as a success.
+		     */
+		    code = 0;
+		  }
+		  else {
+		    code = xio_errno();
+		  }
+		}
 	      } /* !ENOTCONN */
 	      if (unlikely(code)) {
 		switch (code) {
@@ -424,20 +434,18 @@ public:
 
   void shutdown()
   {
-    XioPortal *portal;
     int nportals = portals.size();
     for (int p_ix = 0; p_ix < nportals; ++p_ix) {
-      portal = portals[p_ix];
+      XioPortal *portal = portals[p_ix];
       portal->shutdown();
     }
   }
 
   void join()
   {
-    XioPortal *portal;
     int nportals = portals.size();
     for (int p_ix = 0; p_ix < nportals; ++p_ix) {
-      portal = portals[p_ix];
+      XioPortal *portal = portals[p_ix];
       portal->join();
     }
   }
diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc
index 46a8ff5..a3c065c 100644
--- a/src/objclass/class_api.cc
+++ b/src/objclass/class_api.cc
@@ -82,6 +82,25 @@ int cls_unregister_method(cls_method_handle_t handle)
   return 1;
 }
 
+int cls_register_cxx_filter(cls_handle_t hclass,
+                            const std::string &filter_name,
+                            cls_cxx_filter_factory_t fn,
+                            cls_filter_handle_t *handle)
+{
+  ClassHandler::ClassData *cls = (ClassHandler::ClassData *)hclass;
+  cls_filter_handle_t hfilter = (cls_filter_handle_t)cls->register_cxx_filter(filter_name, fn);
+  if (handle) {
+    *handle = hfilter;
+  }
+  return (hfilter != NULL);
+}
+
+void cls_unregister_filter(cls_filter_handle_t handle)
+{
+  ClassHandler::ClassFilter *filter = (ClassHandler::ClassFilter *)handle;
+  filter->unregister();
+}
+
 int cls_call(cls_method_context_t hctx, const char *cls, const char *method,
                                  char *indata, int datalen,
                                  char **outdata, int *outdatalen)
@@ -573,7 +592,7 @@ int cls_gen_rand_base64(char *dest, int size) /* size should be the required str
   }
   tmp_dest[ret] = '\0';
   memcpy(dest, tmp_dest, size);
-  dest[size] = '\0';
+  dest[size-1] = '\0';
 
   return 0;
 }
diff --git a/src/objclass/objclass.h b/src/objclass/objclass.h
index 6f0de28..0b4e538 100644
--- a/src/objclass/objclass.h
+++ b/src/objclass/objclass.h
@@ -8,6 +8,7 @@
 
 #include "../include/types.h"
 #include "msg/msg_types.h"
+#include "common/hobject.h"
 
 extern "C" {
 #endif
@@ -21,9 +22,10 @@ int __cls_ver_min = min;
 int __cls_name__## name = 0; \
 const char *__cls_name = #name;
 
-#define CLS_METHOD_RD		0x1
-#define CLS_METHOD_WR		0x2
-#define CLS_METHOD_PUBLIC	0x4
+#define CLS_METHOD_RD       0x1 /// method executes read operations
+#define CLS_METHOD_WR       0x2 /// method executes write operations
+#define CLS_METHOD_PUBLIC   0x4 /// unused
+#define CLS_METHOD_PROMOTE  0x8 /// method cannot be proxied to base tier
 
 
 #define CLS_LOG(level, fmt, ...)					\
@@ -34,6 +36,7 @@ void __cls_init();
 
 typedef void *cls_handle_t;
 typedef void *cls_method_handle_t;
+typedef void *cls_filter_handle_t;
 typedef void *cls_method_context_t;
 typedef int (*cls_method_call_t)(cls_method_context_t ctx,
 				 char *indata, int datalen,
@@ -70,6 +73,7 @@ extern int cls_unregister(cls_handle_t);
 extern int cls_register_method(cls_handle_t hclass, const char *method, int flags,
                         cls_method_call_t class_call, cls_method_handle_t *handle);
 extern int cls_unregister_method(cls_method_handle_t handle);
+extern void cls_unregister_filter(cls_filter_handle_t handle);
 
 
 
@@ -94,9 +98,47 @@ extern void class_fini(void);
 typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
 				     class buffer::list *inbl, class buffer::list *outbl);
 
+class PGLSFilter {
+protected:
+  string xattr;
+public:
+  PGLSFilter();
+  virtual ~PGLSFilter();
+  virtual bool filter(const hobject_t &obj, bufferlist& xattr_data,
+                      bufferlist& outdata) = 0;
+
+  /**
+   * Arguments passed from the RADOS client.  Implementations must
+   * handle any encoding errors, and return an appropriate error code,
+   * or 0 on valid input.
+   */
+  virtual int init(bufferlist::iterator &params) = 0;
+
+  /**
+   * xattr key, or empty string.  If non-empty, this xattr will be fetched
+   * and the value passed into ::filter
+   */
+   virtual string& get_xattr() { return xattr; }
+
+  /**
+   * If true, objects without the named xattr (if xattr name is not empty)
+   * will be rejected without calling ::filter
+   */
+  virtual bool reject_empty_xattr() { return true; }
+};
+
+// Classes expose a filter constructor that returns a subclass of PGLSFilter
+typedef PGLSFilter* (*cls_cxx_filter_factory_t)();
+
+
 extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
 				   cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
 
+extern int cls_register_cxx_filter(cls_handle_t hclass,
+                                   const std::string &filter_name,
+				   cls_cxx_filter_factory_t fn,
+                                   cls_filter_handle_t *handle=NULL);
+
 extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
 extern int cls_cxx_remove(cls_method_context_t hctx);
 extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
diff --git a/src/ocf/Makefile.in b/src/ocf/Makefile.in
index 19267ac..518330c 100644
--- a/src/ocf/Makefile.in
+++ b/src/ocf/Makefile.in
@@ -159,6 +159,7 @@ AMTAR = @AMTAR@
 AM_CXXFLAGS = @AM_CXXFLAGS@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
 ARM_FLAGS = @ARM_FLAGS@
 ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
 AUTOCONF = @AUTOCONF@
@@ -166,6 +167,7 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
@@ -223,7 +225,8 @@ LD = @LD@
 LDFLAGS = @LDFLAGS@
 LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
 LIBEDIT_LIBS = @LIBEDIT_LIBS@
-LIBFUSE = @LIBFUSE@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
 LIBJEMALLOC = @LIBJEMALLOC@
 LIBOBJS = @LIBOBJS@
 LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
@@ -274,6 +277,7 @@ RPM_RELEASE = @RPM_RELEASE@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
 STRIP = @STRIP@
 VERSION = @VERSION@
 WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
@@ -307,6 +311,7 @@ datarootdir = @datarootdir@
 docdir = @docdir@
 dvidir = @dvidir@
 exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
 host = @host@
 host_alias = @host_alias@
 host_cpu = @host_cpu@
@@ -336,6 +341,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 subdirs = @subdirs@
 sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
 target = @target@
 target_alias = @target_alias@
 target_cpu = @target_cpu@
@@ -344,6 +351,7 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
 EXTRA_DIST = ceph.in Makefile.in
 
 # The root of the OCF resource agent hierarchy
diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h
index cf808c3..6af6d77 100644
--- a/src/os/CollectionIndex.h
+++ b/src/os/CollectionIndex.h
@@ -150,7 +150,7 @@ protected:
     ) = 0;
 
   /**
-   * Moves objects matching <match> in the lsb <bits>
+   * Moves objects matching @e match in the lsb @e bits
    *
    * dest and this must be the same subclass
    *
@@ -166,18 +166,13 @@ protected:
   /// List contents of collection by hash
   virtual int collection_list_partial(
     const ghobject_t &start, ///< [in] object at which to start
-    int min_count,          ///< [in] get at least min_count objects
+    const ghobject_t &end,    ///< [in] list only objects < end
+    bool sort_bitwise,      ///< [in] use bitwise sort
     int max_count,          ///< [in] return at most max_count objects
-    snapid_t seq,           ///< [in] list only objects with snap >= seq
     vector<ghobject_t> *ls,  ///< [out] Listed objects
     ghobject_t *next         ///< [out] Next object to list
     ) = 0;
 
-  /// List contents of collection.
-  virtual int collection_list(
-    vector<ghobject_t> *ls ///< [out] Listed Objects
-    ) = 0;
-
   /// Call prior to removing directory
   virtual int prep_delete() { return 0; }
 
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
index b856849..8ff7ef7 100644
--- a/src/os/DBObjectMap.cc
+++ b/src/os/DBObjectMap.cc
@@ -771,8 +771,8 @@ int DBObjectMap::get_keys(const ghobject_t &oid,
   Header header = lookup_map_header(hl, oid);
   if (!header)
     return -ENOENT;
-  ObjectMapIterator iter = get_iterator(oid);
-  for (; iter->valid(); iter->next()) {
+  ObjectMapIterator iter = _get_iterator(header);
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
     if (iter->status())
       return iter->status();
     keys->insert(iter->key());
@@ -887,10 +887,10 @@ int DBObjectMap::clone(const ghobject_t &oid,
   if (oid == target)
     return 0;
 
-  MapHeaderLock _l1(this, MIN(oid, target));
-  MapHeaderLock _l2(this, MAX(oid, target));
+  MapHeaderLock _l1(this, MIN_GHOBJ(oid, target, true));
+  MapHeaderLock _l2(this, MAX_GHOBJ(oid, target, true));
   MapHeaderLock *lsource, *ltarget;
-  if (oid > target) {
+  if (cmp_bitwise(oid, target) > 0) {
     lsource = &_l2;
     ltarget= &_l1;
   } else {
@@ -1043,6 +1043,13 @@ int DBObjectMap::sync(const ghobject_t *oid,
       header->spos = *spos;
       set_map_header(hl, *oid, *header, t);
     }
+    /* It may appear that this and the identical portion of the else
+     * block can combined below, but in this block, the transaction
+     * must be submitted under *both* the MapHeaderLock and the full
+     * header_lock.
+     *
+     * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+     */
     Mutex::Locker l(header_lock);
     write_state(t);
     return db->submit_transaction_sync(t);
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
index de80d6f..ee252c1 100644
--- a/src/os/DBObjectMap.h
+++ b/src/os/DBObjectMap.h
@@ -68,7 +68,7 @@ public:
    * Set of headers currently in use
    */
   set<uint64_t> in_use;
-  set<ghobject_t> map_header_in_use;
+  set<ghobject_t, ghobject_t::BitwiseComparator> map_header_in_use;
 
   /**
    * Takes the map_header_in_use entry in constructor, releases in
@@ -327,7 +327,7 @@ private:
   /// Implicit lock on Header->seq
   typedef ceph::shared_ptr<_Header> Header;
   Mutex cache_lock;
-  SimpleLRU<ghobject_t, _Header> caches;
+  SimpleLRU<ghobject_t, _Header, ghobject_t::BitwiseComparator> caches;
 
   string map_header_key(const ghobject_t &oid);
   string header_key(uint64_t seq);
diff --git a/src/os/FDCache.h b/src/os/FDCache.h
index 8597265..635043b 100644
--- a/src/os/FDCache.h
+++ b/src/os/FDCache.h
@@ -52,14 +52,14 @@ public:
 private:
   CephContext *cct;
   const int registry_shards;
-  SharedLRU<ghobject_t, FD> *registry;
+  SharedLRU<ghobject_t, FD, ghobject_t::BitwiseComparator> *registry;
 
 public:
   FDCache(CephContext *cct) : cct(cct),
   registry_shards(cct->_conf->filestore_fd_cache_shards) {
     assert(cct);
     cct->_conf->add_observer(this);
-    registry = new SharedLRU<ghobject_t, FD>[registry_shards];
+    registry = new SharedLRU<ghobject_t, FD, ghobject_t::BitwiseComparator>[registry_shards];
     for (int i = 0; i < registry_shards; ++i) {
       registry[i].set_cct(cct);
       registry[i].set_size(
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index c6bb6f2..fb05152 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -35,27 +35,21 @@
 #include "common/blkdev.h"
 #include "common/linux_version.h"
 
+#if defined(__FreeBSD__)
+#define O_DSYNC O_SYNC
+#endif
+
 #define dout_subsys ceph_subsys_journal
 #undef dout_prefix
 #define dout_prefix *_dout << "journal "
 
 const static int64_t ONE_MEG(1 << 20);
+const static int CEPH_MINIMUM_BLOCK_SIZE(4096);
 
 int FileJournal::_open(bool forwrite, bool create)
 {
   int flags, ret;
 
-  if (aio && !directio) {
-    derr << "FileJournal::_open: aio not supported without directio; disabling aio" << dendl;
-    aio = false;
-  }
-#ifndef HAVE_LIBAIO
-  if (aio) {
-    derr << "FileJournal::_open: libaio not compiled in; disabling aio" << dendl;
-    aio = false;
-  }
-#endif
-
   if (forwrite) {
     flags = O_RDWR;
     if (directio)
@@ -154,8 +148,7 @@ int FileJournal::_open_block_device()
 	   << dendl;
   max_size = bdev_sz;
 
-  /* block devices have to write in blocks of CEPH_PAGE_SIZE */
-  block_size = CEPH_PAGE_SIZE;
+  block_size = CEPH_MINIMUM_BLOCK_SIZE;
 
   if (g_conf->journal_discard) {
     discard = block_device_support_discard(fn.c_str());
@@ -295,7 +288,7 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
   else {
     max_size = oldsize;
   }
-  block_size = MAX(blksize, (blksize_t)CEPH_PAGE_SIZE);
+  block_size = MAX(blksize, (blksize_t)CEPH_MINIMUM_BLOCK_SIZE);
 
   if (create && g_conf->journal_zero_on_create) {
     derr << "FileJournal::_open_file : zeroing journal" << dendl;
@@ -307,14 +300,14 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
     }
     memset(static_cast<void*>(buf), 0, write_size);
     uint64_t i = 0;
-    for (; (i + write_size) <= (unsigned)max_size; i += write_size) {
+    for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
       ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
       if (ret < 0) {
 	free(buf);
 	return -errno;
       }
     }
-    if (i < (unsigned)max_size) {
+    if (i < (uint64_t)max_size) {
       ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
       if (ret < 0) {
 	free(buf);
@@ -331,15 +324,17 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
   return 0;
 }
 
+// This can not be used on an active journal
 int FileJournal::check()
 {
   int ret;
 
+  assert(fd == -1);
   ret = _open(false, false);
   if (ret)
-    goto done;
+    return ret;
 
-  ret = read_header();
+  ret = read_header(&header);
   if (ret < 0)
     goto done;
 
@@ -354,8 +349,7 @@ int FileJournal::check()
   ret = 0;
 
  done:
-  VOID_TEMP_FAILURE_RETRY(::close(fd));
-  fd = -1;
+  close();
   return ret;
 }
 
@@ -386,7 +380,7 @@ int FileJournal::create()
   header.start = get_top();
   header.start_seq = 0;
 
-  print_header();
+  print_header(header);
 
   // static zeroed buffer for alignment padding
   delete [] zero_buf;
@@ -443,16 +437,20 @@ done:
   return ret;
 }
 
+// This can not be used on an active journal
 int FileJournal::peek_fsid(uuid_d& fsid)
 {
+  assert(fd == -1);
   int r = _open(false, false);
   if (r)
     return r;
-  r = read_header();
+  r = read_header(&header);
   if (r < 0)
-    return r;
+    goto out;
   fsid = header.fsid;
-  return 0;
+out:
+  close();
+  return r;
 }
 
 int FileJournal::open(uint64_t fs_op_seq)
@@ -470,7 +468,7 @@ int FileJournal::open(uint64_t fs_op_seq)
   write_pos = get_top();
 
   // read header?
-  err = read_header();
+  err = read_header(&header);
   if (err < 0)
     return err;
 
@@ -505,9 +503,9 @@ int FileJournal::open(uint64_t fs_op_seq)
 	    << block_size << " (required for direct_io journal mode)" << dendl;
     return -EINVAL;
   }
-  if ((header.alignment % CEPH_PAGE_SIZE) && directio) {
-    dout(0) << "open journal alignment " << header.alignment << " is not multiple of page size " << CEPH_PAGE_SIZE
-	    << " (required for direct_io journal mode)" << dendl;
+  if ((header.alignment % CEPH_MINIMUM_BLOCK_SIZE) && directio) {
+    dout(0) << "open journal alignment " << header.alignment << " is not multiple of minimum block size "
+           << CEPH_MINIMUM_BLOCK_SIZE << " (required for direct_io journal mode)" << dendl;
     return -EINVAL;
   }
 
@@ -556,6 +554,11 @@ int FileJournal::open(uint64_t fs_op_seq)
   return 0;
 }
 
+void FileJournal::_close(int fd) const
+{
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
 void FileJournal::close()
 {
   dout(1) << "close " << fn << dendl;
@@ -567,61 +570,120 @@ void FileJournal::close()
   assert(writeq_empty());
   assert(!must_write_header);
   assert(fd >= 0);
-  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  _close(fd);
   fd = -1;
 }
 
 
 int FileJournal::dump(ostream& out)
 {
-  int err = 0;
+  return _dump(out, false);
+}
+
+int FileJournal::simple_dump(ostream& out)
+{
+  return _dump(out, true);
+}
+
+int FileJournal::_dump(ostream& out, bool simple)
+{
+  JSONFormatter f(true);
+  int ret = _fdump(f, simple);
+  f.flush(out);
+  return ret;
+}
 
-  dout(10) << "dump" << dendl;
-  err = _open(false, false);
+int FileJournal::_fdump(Formatter &f, bool simple)
+{
+  dout(10) << "_fdump" << dendl;
+
+  assert(fd == -1);
+  int err = _open(false, false);
   if (err)
     return err;
 
-  err = read_header();
-  if (err < 0)
+  err = read_header(&header);
+  if (err < 0) {
+    close();
     return err;
+  }
 
-  read_pos = header.start;
+  off64_t next_pos = header.start;
 
-  JSONFormatter f(true);
+  f.open_object_section("journal");
 
-  f.open_array_section("journal");
-  uint64_t seq = 0;
+  f.open_object_section("header");
+  f.dump_unsigned("flags", header.flags);
+  ostringstream os;
+  os << header.fsid;
+  f.dump_string("fsid", os.str());
+  f.dump_unsigned("block_size", header.block_size);
+  f.dump_unsigned("alignment", header.alignment);
+  f.dump_int("max_size", header.max_size);
+  f.dump_int("start", header.start);
+  f.dump_unsigned("committed_up_to", header.committed_up_to);
+  f.dump_unsigned("start_seq", header.start_seq);
+  f.close_section();
+
+  f.open_array_section("entries");
+  uint64_t seq = header.start_seq;
   while (1) {
     bufferlist bl;
-    uint64_t pos = read_pos;
-    if (!read_entry(bl, seq)) {
-      dout(3) << "journal_replay: end of journal, done." << dendl;
+    off64_t pos = next_pos;
+
+    if (!pos) {
+      dout(2) << "_dump -- not readable" << dendl;
+      return false;
+    }
+    stringstream ss;
+    read_entry_result result = do_read_entry(
+      pos,
+      &next_pos,
+      &bl,
+      &seq,
+      &ss);
+    if (result != SUCCESS) {
+      if (seq < header.committed_up_to) {
+        dout(2) << "Unable to read past sequence " << seq
+	    << " but header indicates the journal has committed up through "
+	    << header.committed_up_to << ", journal is corrupt" << dendl;
+        err = EINVAL;
+      }
+      dout(25) << ss.str() << dendl;
+      dout(25) << "No further valid entries found, journal is most likely valid"
+	  << dendl;
       break;
     }
 
     f.open_object_section("entry");
     f.dump_unsigned("offset", pos);
     f.dump_unsigned("seq", seq);
-    f.open_array_section("transactions");
-    bufferlist::iterator p = bl.begin();
-    int trans_num = 0;
-    while (!p.end()) {
-      ObjectStore::Transaction *t = new ObjectStore::Transaction(p);
-      f.open_object_section("transaction");
-      f.dump_unsigned("trans_num", trans_num);
-      t->dump(&f);
+    if (simple) {
+      f.dump_unsigned("bl.length", bl.length());
+    } else {
+      f.open_array_section("transactions");
+      bufferlist::iterator p = bl.begin();
+      int trans_num = 0;
+      while (!p.end()) {
+        ObjectStore::Transaction *t = new ObjectStore::Transaction(p);
+        f.open_object_section("transaction");
+        f.dump_unsigned("trans_num", trans_num);
+        t->dump(&f);
+        f.close_section();
+        delete t;
+        trans_num++;
+      }
       f.close_section();
-      delete t;
-      trans_num++;
     }
     f.close_section();
-    f.close_section();
-    f.flush(cout);
   }
 
   f.close_section();
+  f.close_section();
   dout(10) << "dump finish" << dendl;
-  return 0;
+
+  close();
+  return err;
 }
 
 
@@ -638,21 +700,28 @@ void FileJournal::start_writer()
 
 void FileJournal::stop_writer()
 {
+  // Do nothing if writer already stopped or never started
+  if (!write_stop)
   {
-    Mutex::Locker l(write_lock);
-    Mutex::Locker p(writeq_lock);
-    write_stop = true;
-    writeq_cond.Signal();
-    // Doesn't hurt to signal commit_cond in case thread is waiting there
-    // and caller didn't use committed_thru() first.
-    commit_cond.Signal();
+    {
+      Mutex::Locker l(write_lock);
+      Mutex::Locker p(writeq_lock);
+      write_stop = true;
+      writeq_cond.Signal();
+      // Doesn't hurt to signal commit_cond in case thread is waiting there
+      // and caller didn't use committed_thru() first.
+      commit_cond.Signal();
+    }
+    write_thread.join();
+
+    // write journal header now so that we have less to replay on remount
+    write_header_sync();
   }
-  write_thread.join();
 
 #ifdef HAVE_LIBAIO
   // stop aio completeion thread *after* writer thread has stopped
   // and has submitted all of its io
-  if (aio) {
+  if (aio && !aio_stop) {
     aio_lock.Lock();
     aio_stop = true;
     aio_cond.Signal();
@@ -665,7 +734,7 @@ void FileJournal::stop_writer()
 
 
 
-void FileJournal::print_header()
+void FileJournal::print_header(const header_t &header) const
 {
   dout(10) << "header: block_size " << header.block_size
 	   << " alignment " << header.alignment
@@ -675,14 +744,14 @@ void FileJournal::print_header()
   dout(10) << " write_pos " << write_pos << dendl;
 }
 
-int FileJournal::read_header()
+int FileJournal::read_header(header_t *hdr) const
 {
   dout(10) << "read_header" << dendl;
   bufferlist bl;
 
   buffer::ptr bp = buffer::create_page_aligned(block_size);
-  bp.zero();
-  int r = ::pread(fd, bp.c_str(), bp.length(), 0);
+  char* bpdata = bp.c_str();
+  int r = ::pread(fd, bpdata, bp.length(), 0);
 
   if (r < 0) {
     int err = errno;
@@ -690,11 +759,19 @@ int FileJournal::read_header()
     return -err;
   }
 
+  // don't use bp.zero() here, because it also invalidates
+  // crc cache (which is not yet populated anyway)
+  if (bp.length() != (size_t)r) {
+      // r will be always less or equal than bp.length
+      bpdata += r;
+      memset(bpdata, 0, bp.length() - r);
+  }
+
   bl.push_back(bp);
 
   try {
     bufferlist::iterator p = bl.begin();
-    ::decode(header, p);
+    ::decode(*hdr, p);
   }
   catch (buffer::error& e) {
     derr << "read_header error decoding journal header" << dendl;
@@ -709,12 +786,12 @@ int FileJournal::read_header()
    * remove this or else this (eventually old) code will clobber newer
    * code's flags.
    */
-  if (header.flags > 3) {
+  if (hdr->flags > 3) {
     derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
-    header.flags = 0;
+    hdr->flags = 0;
   }
 
-  print_header();
+  print_header(*hdr);
 
   return 0;
 }
@@ -728,12 +805,23 @@ bufferptr FileJournal::prepare_header()
   }
   ::encode(header, bl);
   bufferptr bp = buffer::create_page_aligned(get_top());
-  bp.zero();
-  memcpy(bp.c_str(), bl.c_str(), bl.length());
+  // don't use bp.zero() here, because it also invalidates
+  // crc cache (which is not yet populated anyway)
+  char* data = bp.c_str();
+  memcpy(data, bl.c_str(), bl.length());
+  data += bl.length();
+  memset(data, 0, bp.length()-bl.length());
   return bp;
 }
 
-
+void FileJournal::write_header_sync()
+{
+  Mutex::Locker locker(write_lock);
+  must_write_header = true;
+  bufferlist bl;
+  do_write(bl);
+  dout(20) << __func__ << " finish" << dendl;
+}
 
 int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
 {
@@ -751,8 +839,8 @@ int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
 	   << " top " << get_top() << dendl;
 
   if (do_sync_cond) {
-    if (room < (header.max_size >> 1) &&
-	room + size > (header.max_size >> 1)) {
+    if (room >= (header.max_size >> 1) &&
+        room - size < (header.max_size >> 1)) {
       dout(10) << " passing half full mark, triggering commit" << dendl;
       do_sync_cond->SloppySignal();  // initiate a real commit so we can trim
     }
@@ -809,7 +897,7 @@ int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_
 	  put_throttle(1, peek_write().bl.length());
 	  pop_write();
 	}  
-	print_header();
+	print_header(header);
       }
 
       return -ENOSPC;  // hrm, full on first op
@@ -952,15 +1040,15 @@ int FileJournal::prepare_single_write(bufferlist& bl, off64_t& queue_pos, uint64
 void FileJournal::align_bl(off64_t pos, bufferlist& bl)
 {
   // make sure list segments are page aligned
-  if (directio && (!bl.is_page_aligned() ||
-		   !bl.is_n_page_sized())) {
-    bl.rebuild_page_aligned();
+  if (directio && (!bl.is_aligned(block_size) ||
+		   !bl.is_n_align_sized(CEPH_MINIMUM_BLOCK_SIZE))) {
+    bl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE);
     dout(10) << __func__ << " total memcopy: " << bl.get_memcopy_count() << dendl;
-    if ((bl.length() & ~CEPH_PAGE_MASK) != 0 ||
-	(pos & ~CEPH_PAGE_MASK) != 0)
+    if ((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0 ||
+	(pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0)
       dout(0) << "rebuild_page_aligned failed, " << bl << dendl;
-    assert((bl.length() & ~CEPH_PAGE_MASK) == 0);
-    assert((pos & ~CEPH_PAGE_MASK) == 0);
+    assert((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0);
+    assert((pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) == 0);
   }
 }
 
@@ -1216,7 +1304,7 @@ void FileJournal::write_thread_entry()
 	  put_throttle(1, peek_write().bl.length());
 	  pop_write();
 	}  
-	print_header();
+	print_header(header);
 	r = 0;
       } else {
 	dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
@@ -1379,8 +1467,10 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
 	  continue;
 	}
 	assert(0 == "io_submit got unexpected error");
+      } else {
+	break;
       }
-    } while (false);
+    } while (true);
     pos += aio.len;
   }
   write_finish_cond.Signal();
@@ -1641,7 +1731,7 @@ void FileJournal::committed_thru(uint64_t seq)
   }
 
   must_write_header = true;
-  print_header();
+  print_header(header);
 
   // committed but unjournaled items
   while (!writeq_empty() && peek_write().seq <= seq) {
@@ -1700,7 +1790,7 @@ void FileJournal::wrap_read_bl(
   int64_t olen,
   bufferlist* bl,
   off64_t *out_pos
-  )
+  ) const
 {
   while (olen > 0) {
     while (pos >= header.max_size)
@@ -1756,6 +1846,7 @@ bool FileJournal::read_entry(
     &seq,
     &ss);
   if (result == SUCCESS) {
+    journalq.push_back( pair<uint64_t,off64_t>(seq, pos));
     if (next_seq > seq) {
       return false;
     } else {
@@ -1767,8 +1858,7 @@ bool FileJournal::read_entry(
     }
   }
 
-  stringstream errss;
-  if (seq < header.committed_up_to) {
+  if (seq && seq < header.committed_up_to) {
     derr << "Unable to read past sequence " << seq
 	 << " but header indicates the journal has committed up through "
 	 << header.committed_up_to << ", journal is corrupt" << dendl;
@@ -1781,7 +1871,7 @@ bool FileJournal::read_entry(
     }
   }
 
-  dout(25) << errss.str() << dendl;
+  dout(25) << ss.str() << dendl;
   dout(2) << "No further valid entries found, journal is most likely valid"
 	  << dendl;
   return false;
@@ -1793,7 +1883,7 @@ FileJournal::read_entry_result FileJournal::do_read_entry(
   bufferlist *bl,
   uint64_t *seq,
   ostream *ss,
-  entry_header_t *_h)
+  entry_header_t *_h) const
 {
   off64_t cur_pos = init_pos;
   bufferlist _bl;
@@ -1863,11 +1953,6 @@ FileJournal::read_entry_result FileJournal::do_read_entry(
   if (seq)
     *seq = h->seq;
 
-  // works around an apparent GCC 4.8(?) compiler bug about unaligned
-  // bind by reference to (packed) h->seq
-  journalq.push_back(
-    pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
-			   static_cast<off64_t>(init_pos)));
 
   if (next_pos)
     *next_pos = cur_pos;
@@ -1896,6 +1981,7 @@ void FileJournal::get_header(
   off64_t next_pos = pos;
   bufferlist bl;
   uint64_t seq = 0;
+  dout(2) << __func__ << dendl;
   while (1) {
     bl.clear();
     pos = next_pos;
@@ -1921,6 +2007,7 @@ void FileJournal::corrupt(
   int wfd,
   off64_t corrupt_at)
 {
+  dout(2) << __func__ << dendl;
   if (corrupt_at >= header.max_size)
     corrupt_at = corrupt_at + get_top() - header.max_size;
 
@@ -1943,6 +2030,7 @@ void FileJournal::corrupt_payload(
   int wfd,
   uint64_t seq)
 {
+  dout(2) << __func__ << dendl;
   off64_t pos = 0;
   entry_header_t h;
   get_header(seq, &pos, &h);
@@ -1956,6 +2044,7 @@ void FileJournal::corrupt_footer_magic(
   int wfd,
   uint64_t seq)
 {
+  dout(2) << __func__ << dendl;
   off64_t pos = 0;
   entry_header_t h;
   get_header(seq, &pos, &h);
@@ -1971,6 +2060,7 @@ void FileJournal::corrupt_header_magic(
   int wfd,
   uint64_t seq)
 {
+  dout(2) << __func__ << dendl;
   off64_t pos = 0;
   entry_header_t h;
   get_header(seq, &pos, &h);
diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h
index 574c902..fbe616d 100644
--- a/src/os/FileJournal.h
+++ b/src/os/FileJournal.h
@@ -134,8 +134,8 @@ public:
       start = block_size;
     }
 
-    uint64_t get_fsid64() {
-      return *(uint64_t*)&fsid.uuid[0];
+    uint64_t get_fsid64() const {
+      return *(uint64_t*)fsid.bytes();
     }
 
     void encode(bufferlist& bl) const {
@@ -163,8 +163,8 @@ public:
 	flags = 0;
 	uint64_t tfsid;
 	::decode(tfsid, bl);
-	*(uint64_t*)&fsid.uuid[0] = tfsid;
-	*(uint64_t*)&fsid.uuid[8] = tfsid;
+	*(uint64_t*)&fsid.bytes()[0] = tfsid;
+	*(uint64_t*)&fsid.bytes()[8] = tfsid;
 	::decode(block_size, bl);
 	::decode(alignment, bl);
 	::decode(max_size, bl);
@@ -214,6 +214,8 @@ public:
     }
   } __attribute__((__packed__, aligned(4)));
 
+  bool journalq_empty() { return journalq.empty(); }
+
 private:
   string fn;
 
@@ -294,10 +296,12 @@ private:
 
   int _open(bool wr, bool create=false);
   int _open_block_device();
+  void _close(int fd) const;
   void _check_disk_write_cache() const;
   int _open_file(int64_t oldsize, blksize_t blksize, bool create);
-  void print_header();
-  int read_header();
+  int _dump(ostream& out, bool simple);
+  void print_header(const header_t &hdr) const;
+  int read_header(header_t *hdr) const;
   bufferptr prepare_header();
   void start_writer();
   void stop_writer();
@@ -325,7 +329,7 @@ private:
     int64_t len,      ///< [in] length to read
     bufferlist* bl,   ///< [out] result
     off64_t *out_pos  ///< [out] next position to read, will be wrapped
-    );
+    ) const;
 
   void do_discard(int64_t offset, int64_t end);
 
@@ -349,7 +353,7 @@ private:
     }
   } write_finish_thread;
 
-  off64_t get_top() {
+  off64_t get_top() const {
     return ROUND_UP_TO(sizeof(header), block_size);
   }
 
@@ -379,14 +383,27 @@ private:
     full_state(FULL_NOTFULL),
     fd(-1),
     writing_seq(0),
-    throttle_ops(g_ceph_context, "filestore_ops", g_conf->journal_queue_max_ops),
-    throttle_bytes(g_ceph_context, "filestore_bytes", g_conf->journal_queue_max_bytes),
+    throttle_ops(g_ceph_context, "journal_ops", g_conf->journal_queue_max_ops),
+    throttle_bytes(g_ceph_context, "journal_bytes", g_conf->journal_queue_max_bytes),
     write_lock("FileJournal::write_lock", false, true, false, g_ceph_context),
-    write_stop(false),
-    aio_stop(false),
+    write_stop(true),
+    aio_stop(true),
     write_thread(this),
-    write_finish_thread(this) { }
+    write_finish_thread(this) {
+
+      if (aio && !directio) {
+        derr << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
+        aio = false;
+      }
+#ifndef HAVE_LIBAIO
+      if (aio) {
+        derr << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
+        aio = false;
+      }
+#endif
+  }
   ~FileJournal() {
+    assert(fd == -1);
     delete[] zero_buf;
   }
 
@@ -397,6 +414,8 @@ private:
   int peek_fsid(uuid_d& fsid);
 
   int dump(ostream& out);
+  int simple_dump(ostream& out);
+  int _fdump(Formatter &f, bool simple);
 
   void flush();
 
@@ -414,6 +433,8 @@ private:
     return full_state != FULL_NOTFULL && !write_stop;
   }
 
+  void write_header_sync();
+
   void set_wait_on_full(bool b) { wait_on_full = b; }
 
   // reads
@@ -446,7 +467,7 @@ private:
     uint64_t *seq,        ///< [out] seq of successful read
     ostream *ss,          ///< [out] error output
     entry_header_t *h = 0 ///< [out] header
-    ); ///< @return result code
+    ) const; ///< @return result code
 
   bool read_entry(
     bufferlist &bl,
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index f6c3bb8..3e8bb29 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -4,6 +4,7 @@
  * Ceph - scalable distributed file system
  *
  * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
  *
  * This is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -76,9 +77,14 @@ using ceph::crypto::SHA1;
 #include "include/assert.h"
 
 #include "common/config.h"
+#include "common/blkdev.h"
 
 #ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
 #include "tracing/objectstore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
 #else
 #define tracepoint(...)
 #endif
@@ -87,7 +93,7 @@ using ceph::crypto::SHA1;
 #undef dout_prefix
 #define dout_prefix *_dout << "filestore(" << basedir << ") "
 
-#define COMMIT_SNAP_ITEM "snap_%lld"
+#define COMMIT_SNAP_ITEM "snap_%llu"
 #define CLUSTER_SNAP_ITEM "clustersnap_%s"
 
 #define REPLAY_GUARD_XATTR "user.cephos.seq"
@@ -270,11 +276,7 @@ int FileStore::lfn_open(coll_t cid,
 
   IndexedPath path2;
   IndexedPath *path = &path2;
-  if (r < 0) {
-    derr << "error getting collection index for " << cid
-      << ": " << cpp_strerror(-r) << dendl;
-    goto fail;
-  }
+
   r = (*index)->lookup(oid, path, &exist);
   if (r < 0) {
     derr << "could not find " << oid << " in index: "
@@ -428,6 +430,9 @@ int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghob
     if (r < 0)
       return -errno;
 
+    // make sure old fd for unlinked/overwritten file is gone
+    fdcache.clear(newoid);
+
     r = index_new->created(newoid, path_new->path());
     if (r < 0) {
       assert(!m_filestore_fail_eio || r != -EIO);
@@ -525,9 +530,8 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit
   stop(false), sync_thread(this),
   fdcache(g_ceph_context),
   wbthrottle(g_ceph_context),
-  default_osr("default"),
-  op_queue_len(0), op_queue_bytes(0),
-  op_throttle_lock("FileStore::op_throttle_lock"),
+  throttle_ops(g_ceph_context, "filestore_ops",g_conf->filestore_queue_max_ops),
+  throttle_bytes(g_ceph_context, "filestore_bytes",g_conf->filestore_queue_max_bytes),
   op_finisher(g_ceph_context),
   op_tp(g_ceph_context, "FileStore::op_tp", g_conf->filestore_op_threads, "filestore_op_threads"),
   op_wq(this, g_conf->filestore_op_thread_timeout,
@@ -579,29 +583,29 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit
   // initialize logger
   PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_first, l_os_last);
 
-  plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops");
-  plb.add_u64(l_os_jq_ops, "journal_queue_ops");
-  plb.add_u64_counter(l_os_j_ops, "journal_ops");
-  plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes");
-  plb.add_u64(l_os_jq_bytes, "journal_queue_bytes");
-  plb.add_u64_counter(l_os_j_bytes, "journal_bytes");
-  plb.add_time_avg(l_os_j_lat, "journal_latency");
-  plb.add_u64_counter(l_os_j_wr, "journal_wr");
-  plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes");
-  plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops");
-  plb.add_u64(l_os_oq_ops, "op_queue_ops");
-  plb.add_u64_counter(l_os_ops, "ops");
-  plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes");
-  plb.add_u64(l_os_oq_bytes, "op_queue_bytes");
-  plb.add_u64_counter(l_os_bytes, "bytes");
-  plb.add_time_avg(l_os_apply_lat, "apply_latency");
-  plb.add_u64(l_os_committing, "committing");
-
-  plb.add_u64_counter(l_os_commit, "commitcycle");
-  plb.add_time_avg(l_os_commit_len, "commitcycle_interval");
-  plb.add_time_avg(l_os_commit_lat, "commitcycle_latency");
-  plb.add_u64_counter(l_os_j_full, "journal_full");
-  plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg");
+  plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops", "Max operations in journal queue");
+  plb.add_u64(l_os_jq_ops, "journal_queue_ops", "Operations in journal queue");
+  plb.add_u64_counter(l_os_j_ops, "journal_ops", "Total journal entries written");
+  plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes", "Max data in journal queue");
+  plb.add_u64(l_os_jq_bytes, "journal_queue_bytes", "Size of journal queue");
+  plb.add_u64_counter(l_os_j_bytes, "journal_bytes", "Total operations size in journal");
+  plb.add_time_avg(l_os_j_lat, "journal_latency", "Average journal queue completing latency");
+  plb.add_u64_counter(l_os_j_wr, "journal_wr", "Journal write IOs");
+  plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes", "Journal data written");
+  plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
+  plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations in writing to FS queue");
+  plb.add_u64_counter(l_os_ops, "ops", "Operations written to store");
+  plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue");
+  plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of writing to FS queue");
+  plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store");
+  plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency");
+  plb.add_u64(l_os_committing, "committing", "Is currently committing");
+
+  plb.add_u64_counter(l_os_commit, "commitcycle", "Commit cycles");
+  plb.add_time_avg(l_os_commit_len, "commitcycle_interval", "Average interval between commits");
+  plb.add_time_avg(l_os_commit_lat, "commitcycle_latency", "Average latency of commit");
+  plb.add_u64_counter(l_os_j_full, "journal_full", "Journal writes while full");
+  plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency");
 
   logger = plb.create_perf_counters();
 
@@ -641,10 +645,32 @@ bool parse_attrname(char **name)
 
 void FileStore::collect_metadata(map<string,string> *pm)
 {
+  char partition_path[PATH_MAX];
+  char dev_node[PATH_MAX];
+  int rc = 0;
+  
   (*pm)["filestore_backend"] = backend->get_name();
   ostringstream ss;
   ss << "0x" << std::hex << m_fs_type << std::dec;
   (*pm)["filestore_f_type"] = ss.str();
+
+  rc = get_device_by_uuid(get_fsid(), "PARTUUID", partition_path,
+        dev_node);
+
+  switch (rc) {
+    case -EOPNOTSUPP:
+    case -EINVAL:
+      (*pm)["backend_filestore_partition_path"] = "unknown";
+      (*pm)["backend_filestore_dev_node"] = "unknown";
+      break;
+    case -ENODEV:
+      (*pm)["backend_filestore_partition_path"] = string(partition_path);
+      (*pm)["backend_filestore_dev_node"] = "unknown";
+      break;
+    default:
+      (*pm)["backend_filestore_partition_path"] = string(partition_path);
+      (*pm)["backend_filestore_dev_node"] = string(dev_node);
+  }
 }
 
 int FileStore::statfs(struct statfs *buf)
@@ -658,7 +684,7 @@ int FileStore::statfs(struct statfs *buf)
 }
 
 
-int FileStore::open_journal()
+void FileStore::new_journal()
 {
   if (journalpath.length()) {
     dout(10) << "open_journal at " << journalpath << dendl;
@@ -667,7 +693,7 @@ int FileStore::open_journal()
     if (journal)
       journal->logger = logger;
   }
-  return 0;
+  return;
 }
 
 int FileStore::dump_journal(ostream& out)
@@ -915,7 +941,7 @@ int FileStore::mkjournal()
 
   ret = 0;
 
-  open_journal();
+  new_journal();
   if (journal) {
     ret = journal->check();
     if (ret < 0) {
@@ -940,8 +966,8 @@ int FileStore::read_fsid(int fd, uuid_d *uuid)
     return ret;
   if (ret == 8) {
     // old 64-bit fsid... mirror it.
-    *(uint64_t*)&uuid->uuid[0] = *(uint64_t*)fsid_str;
-    *(uint64_t*)&uuid->uuid[8] = *(uint64_t*)fsid_str;
+    *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str;
+    *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str;
     return 0;
   }
 
@@ -1158,6 +1184,8 @@ int FileStore::version_stamp_is_valid(uint32_t *version)
   bl.push_back(bp);
   bufferlist::iterator i = bl.begin();
   ::decode(*version, i);
+  dout(10) << __func__ << " was " << *version << " vs target "
+	   << target_version << dendl;
   if (*version == target_version)
     return 1;
   else
@@ -1166,6 +1194,7 @@ int FileStore::version_stamp_is_valid(uint32_t *version)
 
 int FileStore::write_version_stamp()
 {
+  dout(1) << __func__ << " " << target_version << dendl;
   bufferlist bl;
   ::encode(target_version, bl);
 
@@ -1175,6 +1204,7 @@ int FileStore::write_version_stamp()
 
 int FileStore::upgrade()
 {
+  dout(1) << "upgrade" << dendl;
   uint32_t version;
   int r = version_stamp_is_valid(&version);
   if (r < 0)
@@ -1467,7 +1497,10 @@ int FileStore::mount()
       goto close_current_fd;
     }
 
-    omap_store->init();
+    if (superblock.omap_backend == "rocksdb")
+      omap_store->init(g_conf->filestore_rocksdb_options);
+    else
+      omap_store->init();
 
     stringstream err;
     if (omap_store->create_and_open(err)) {
@@ -1497,7 +1530,7 @@ int FileStore::mount()
   }
 
   // journal
-  open_journal();
+  new_journal();
 
   // select journal mode?
   if (journal) {
@@ -1535,7 +1568,7 @@ int FileStore::mount()
   // Cleanup possibly invalid collections
   {
     vector<coll_t> collections;
-    ret = list_collections(collections);
+    ret = list_collections(collections, true);
     if (ret < 0) {
       derr << "Error " << ret << " while listing collections" << dendl;
       goto close_current_fd;
@@ -1591,6 +1624,8 @@ int FileStore::mount()
     }
   }
 
+  init_temp_collections();
+
   journal_start();
 
   op_tp.start();
@@ -1626,10 +1661,51 @@ done:
   return ret;
 }
 
+void FileStore::init_temp_collections()
+{
+  dout(10) << __func__ << dendl;
+  vector<coll_t> ls;
+  int r = list_collections(ls, true);
+  assert(r >= 0);
+
+  dout(20) << " ls " << ls << dendl;
+
+  SequencerPosition spos;
+
+  set<coll_t> temps;
+  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p)
+    if (p->is_temp())
+      temps.insert(*p);
+  dout(20) << " temps " << temps << dendl;
+
+  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+    if (p->is_temp())
+      continue;
+    if (p->is_meta())
+      continue;
+    coll_t temp = p->get_temp();
+    if (temps.count(temp)) {
+      temps.erase(temp);
+    } else {
+      dout(10) << __func__ << " creating " << temp << dendl;
+      r = _create_collection(temp, spos);
+      assert(r == 0);
+    }
+  }
+
+  for (set<coll_t>::iterator p = temps.begin(); p != temps.end(); ++p) {
+    dout(10) << __func__ << " removing stray " << *p << dendl;
+    r = _collection_remove_recursive(*p, spos);
+    assert(r == 0);
+  }
+}
+
 int FileStore::umount() 
 {
   dout(5) << "umount " << basedir << dendl;
   
+  flush();
+  sync();
   do_force_sync();
 
   lock.Lock();
@@ -1725,7 +1801,7 @@ void FileStore::queue_op(OpSequencer *osr, Op *o)
   dout(5) << "queue_op " << o << " seq " << o->op
 	  << " " << *osr
 	  << " " << o->bytes << " bytes"
-	  << "   (queue has " << op_queue_len << " ops and " << op_queue_bytes << " bytes)"
+	  << "   (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)"
 	  << dendl;
   op_wq.queue(osr);
 }
@@ -1745,41 +1821,32 @@ void FileStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle)
   logger->set(l_os_oq_max_bytes, max_bytes);
 
   utime_t start = ceph_clock_now(g_ceph_context);
-  {
-    Mutex::Locker l(op_throttle_lock);
-    while ((max_ops && (op_queue_len + 1) > max_ops) ||
-           (max_bytes && op_queue_bytes      // let single large ops through!
-	      && (op_queue_bytes + o->bytes) > max_bytes)) {
-      dout(2) << "waiting " << op_queue_len + 1 << " > " << max_ops << " ops || "
-	      << op_queue_bytes + o->bytes << " > " << max_bytes << dendl;
-      if (handle)
-	handle->suspend_tp_timeout();
-      op_throttle_cond.Wait(op_throttle_lock);
-      if (handle)
-	handle->reset_tp_timeout();
-    }
+  if (handle)
+    handle->suspend_tp_timeout();
+  if (throttle_ops.should_wait(1) || 
+    (throttle_bytes.get_current()      // let single large ops through!
+    && throttle_bytes.should_wait(o->bytes))) {
+    dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || "
+      << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl;
+  }
+  throttle_ops.get();
+  throttle_bytes.get(o->bytes);
+  if (handle)
+    handle->reset_tp_timeout();
 
-    op_queue_len++;
-    op_queue_bytes += o->bytes;
-  }
   utime_t end = ceph_clock_now(g_ceph_context);
   logger->tinc(l_os_queue_lat, end - start);
 
-  logger->set(l_os_oq_ops, op_queue_len);
-  logger->set(l_os_oq_bytes, op_queue_bytes);
+  logger->set(l_os_oq_ops, throttle_ops.get_current());
+  logger->set(l_os_oq_bytes, throttle_bytes.get_current());
 }
 
 void FileStore::op_queue_release_throttle(Op *o)
 {
-  {
-    Mutex::Locker l(op_throttle_lock);
-    op_queue_len--;
-    op_queue_bytes -= o->bytes;
-    op_throttle_cond.Signal();
-  }
-
-  logger->set(l_os_oq_ops, op_queue_len);
-  logger->set(l_os_oq_bytes, op_queue_bytes);
+  throttle_ops.put();
+  throttle_bytes.put(o->bytes);
+  logger->set(l_os_oq_ops, throttle_ops.get_current());
+  logger->set(l_os_oq_bytes, throttle_bytes.get_current());
 }
 
 void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
@@ -1810,14 +1877,15 @@ void FileStore::_finish_op(OpSequencer *osr)
   list<Context*> to_queue;
   Op *o = osr->dequeue(&to_queue);
   
-  dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
+  utime_t lat = ceph_clock_now(g_ceph_context);
+  lat -= o->start;
+
+  dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl;
   osr->apply_lock.Unlock();  // locked in _do_op
 
   // called with tp lock held
   op_queue_release_throttle(o);
 
-  utime_t lat = ceph_clock_now(g_ceph_context);
-  lat -= o->start;
   logger->tinc(l_os_apply_lat, lat);
 
   if (o->onreadable_sync) {
@@ -1865,16 +1933,16 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
 
   // set up the sequencer
   OpSequencer *osr;
-  if (!posr)
-    posr = &default_osr;
+  assert(posr);
   if (posr->p) {
-    osr = static_cast<OpSequencer *>(posr->p);
-    dout(5) << "queue_transactions existing " << *osr << "/" << osr->parent << dendl; //<< " w/ q " << osr->q << dendl;
+    osr = static_cast<OpSequencer *>(posr->p.get());
+    dout(5) << "queue_transactions existing " << osr << " " << *osr << dendl;
   } else {
     osr = new OpSequencer;
+    osr->set_cct(g_ceph_context);
     osr->parent = posr;
     posr->p = osr;
-    dout(5) << "queue_transactions new " << *osr << "/" << osr->parent << dendl;
+    dout(5) << "queue_transactions new " << osr << " " << *osr << dendl;
   }
 
   // used to include osr information in tracepoints during transaction apply
@@ -1886,6 +1954,9 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
     op_queue_reserve_throttle(o, handle);
     journal->throttle();
+    //prepare and encode transactions data out of lock
+    bufferlist tbl;
+    int data_align = _op_journal_transactions_prepare(o->tls, tbl);
     uint64_t op_num = submit_manager.op_submit_start();
     o->op = op_num;
 
@@ -1895,7 +1966,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     if (m_filestore_journal_parallel) {
       dout(5) << "queue_transactions (parallel) " << o->op << " " << o->tls << dendl;
       
-      _op_journal_transactions(o->tls, o->op, ondisk, osd_op);
+      _op_journal_transactions(tbl, data_align, o->op, ondisk, osd_op);
       
       // queue inside submit_manager op submission lock
       queue_op(osr, o);
@@ -1904,7 +1975,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
       
       osr->queue_journal(o->op);
 
-      _op_journal_transactions(o->tls, o->op,
+      _op_journal_transactions(tbl, data_align, o->op,
 			       new C_JournaledAhead(this, osr, o, ondisk),
 			       osd_op);
     } else {
@@ -1934,6 +2005,10 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     return 0;
   }
 
+
+  //prepare and encode transactions data out of lock
+  bufferlist tbl;
+  int data_align = _op_journal_transactions_prepare(tls, tbl);
   uint64_t op = submit_manager.op_submit_start();
   dout(5) << "queue_transactions (trailing journal) " << op << " " << tls << dendl;
 
@@ -1944,7 +2019,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
   int r = do_transactions(tls, op);
     
   if (r >= 0) {
-    _op_journal_transactions(tls, op, ondisk, osd_op);
+    _op_journal_transactions(tbl, data_align, op, ondisk, osd_op);
   } else {
     delete ondisk;
   }
@@ -2011,7 +2086,12 @@ void FileStore::_set_global_replay_guard(coll_t cid,
     return;
 
   // sync all previous operations on this sequencer
-  int ret = sync_filesystem(basedir_fd);
+  int ret = object_map->sync();
+  if (ret < 0) {
+    derr << __func__ << " : omap sync error " << cpp_strerror(ret) << dendl;
+    assert(0 == "_set_global_replay_guard failed");
+  }
+  ret = sync_filesystem(basedir_fd);
   if (ret < 0) {
     derr << __func__ << " :sync_filesytem error " << cpp_strerror(ret) << dendl;
     assert(0 == "_set_global_replay_guard failed");
@@ -2094,7 +2174,7 @@ void FileStore::_set_replay_guard(coll_t cid,
     assert(0 == "_set_replay_guard failed");
   }
   _set_replay_guard(fd, spos, 0, in_progress);
-  ::close(fd);
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
 } 
 
 
@@ -2150,7 +2230,7 @@ void FileStore::_close_replay_guard(coll_t cid,
     assert(0 == "_close_replay_guard failed");
   }
   _close_replay_guard(fd, spos);
-  ::close(fd);
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
 } 
 
 void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
@@ -2289,6 +2369,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         tracepoint(objectstore, touch_enter, osr_name);
         if (_check_replay_guard(cid, oid, spos) > 0)
           r = _touch(cid, oid);
@@ -2300,6 +2381,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         uint64_t off = op->off;
         uint64_t len = op->len;
         uint32_t fadvise_flags = i.get_fadvise_flags();
@@ -2316,6 +2398,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         uint64_t off = op->off;
         uint64_t len = op->len;
         tracepoint(objectstore, zero_enter, osr_name, off, len);
@@ -2335,6 +2418,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         uint64_t off = op->off;
         tracepoint(objectstore, truncate_enter, osr_name, off);
         if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2347,6 +2431,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         tracepoint(objectstore, remove_enter, osr_name);
         if (_check_replay_guard(cid, oid, spos) > 0)
           r = _remove(cid, oid, spos);
@@ -2358,6 +2443,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         string name = i.decode_string();
         bufferlist bl;
         i.decode_bl(bl);
@@ -2378,6 +2464,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         map<string, bufferptr> aset;
         i.decode_attrset(aset);
         tracepoint(objectstore, setattrs_enter, osr_name);
@@ -2393,6 +2480,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         string name = i.decode_string();
         tracepoint(objectstore, rmattr_enter, osr_name);
         if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2405,6 +2493,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         tracepoint(objectstore, rmattrs_enter, osr_name);
         if (_check_replay_guard(cid, oid, spos) > 0)
           r = _rmattrs(cid, oid, spos);
@@ -2416,6 +2505,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         ghobject_t noid = i.get_oid(op->dest_oid);
         tracepoint(objectstore, clone_enter, osr_name);
         r = _clone(cid, oid, noid, spos);
@@ -2427,7 +2517,9 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         ghobject_t noid = i.get_oid(op->dest_oid);
+	_kludge_temp_object_collection(cid, noid);
         uint64_t off = op->off;
         uint64_t len = op->len;
         tracepoint(objectstore, clone_range_enter, osr_name, len);
@@ -2440,7 +2532,9 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         ghobject_t noid = i.get_oid(op->dest_oid);
+	_kludge_temp_object_collection(cid, noid);
         uint64_t srcoff = op->off;
         uint64_t len = op->len;
         uint64_t dstoff = op->dest_off;
@@ -2498,6 +2592,8 @@ unsigned FileStore::_do_transaction(
         coll_t ncid = i.get_cid(op->dest_cid);
         ghobject_t oid = i.get_oid(op->oid);
 
+	assert(oid.hobj.pool >= -1);
+
         // always followed by OP_COLL_REMOVE
         Transaction::Op *op2 = i.decode_op();
         coll_t ocid2 = i.get_cid(op2->cid);
@@ -2540,6 +2636,8 @@ unsigned FileStore::_do_transaction(
         ghobject_t oldoid = i.get_oid(op->oid);
         coll_t newcid = i.get_cid(op->dest_cid);
         ghobject_t newoid = i.get_oid(op->dest_oid);
+	_kludge_temp_object_collection(oldcid, oldoid);
+	_kludge_temp_object_collection(newcid, newoid);
         tracepoint(objectstore, coll_move_rename_enter);
         r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
         tracepoint(objectstore, coll_move_rename_exit, r);
@@ -2586,6 +2684,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         tracepoint(objectstore, omap_clear_enter, osr_name);
         r = _omap_clear(cid, oid, spos);
         tracepoint(objectstore, omap_clear_exit, r);
@@ -2595,6 +2694,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         map<string, bufferlist> aset;
         i.decode_attrset(aset);
         tracepoint(objectstore, omap_setkeys_enter, osr_name);
@@ -2606,6 +2706,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         set<string> keys;
         i.decode_keyset(keys);
         tracepoint(objectstore, omap_rmkeys_enter, osr_name);
@@ -2617,6 +2718,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         string first, last;
         first = i.decode_string();
         last = i.decode_string();
@@ -2629,6 +2731,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         bufferlist bl;
         i.decode_bl(bl);
         tracepoint(objectstore, omap_setheader_enter, osr_name);
@@ -2638,13 +2741,7 @@ unsigned FileStore::_do_transaction(
       break;
     case Transaction::OP_SPLIT_COLLECTION:
       {
-        coll_t cid = i.get_cid(op->cid);
-        uint32_t bits = op->split_bits;
-        uint32_t rem = op->split_rem;
-        coll_t dest = i.get_cid(op->dest_cid);
-        tracepoint(objectstore, split_coll_enter, osr_name);
-        r = _split_collection_create(cid, bits, rem, dest, spos);
-        tracepoint(objectstore, split_coll_exit, r);
+	assert(0 == "not legacy journal; upgrade to firefly first");
       }
       break;
     case Transaction::OP_SPLIT_COLLECTION2:
@@ -2663,6 +2760,7 @@ unsigned FileStore::_do_transaction(
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
+	_kludge_temp_object_collection(cid, oid);
         uint64_t expected_object_size = op->expected_object_size;
         uint64_t expected_write_size = op->expected_write_size;
         tracepoint(objectstore, setallochint_enter, osr_name);
@@ -2776,6 +2874,7 @@ unsigned FileStore::_do_transaction(
 bool FileStore::exists(coll_t cid, const ghobject_t& oid)
 {
   tracepoint(objectstore, exists_enter, cid.c_str());
+  _kludge_temp_object_collection(cid, oid);
   struct stat st;
   bool retval = stat(cid, oid, &st) == 0;
   tracepoint(objectstore, exists_exit, retval);
@@ -2786,6 +2885,7 @@ int FileStore::stat(
   coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio)
 {
   tracepoint(objectstore, stat_enter, cid.c_str());
+  _kludge_temp_object_collection(cid, oid);
   int r = lfn_stat(cid, oid, st);
   assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
   if (r < 0) {
@@ -2816,6 +2916,7 @@ int FileStore::read(
 {
   int got;
   tracepoint(objectstore, read_enter, cid.c_str(), offset, len);
+  _kludge_temp_object_collection(cid, oid);
 
   dout(15) << "read " << cid << "/" << oid << " " << offset << "~" << len << dendl;
 
@@ -2883,74 +2984,142 @@ int FileStore::read(
   }
 }
 
-int FileStore::fiemap(coll_t cid, const ghobject_t& oid,
-                    uint64_t offset, size_t len,
-                    bufferlist& bl)
+int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
+                          map<uint64_t, uint64_t> *m)
 {
-  tracepoint(objectstore, fiemap_enter, cid.c_str(), offset, len);
+  struct fiemap *fiemap = NULL;
+  uint64_t i;
+  struct fiemap_extent *extent = NULL;
+  int r = 0;
 
-  if (!backend->has_fiemap() || len <= (size_t)m_filestore_fiemap_threshold) {
-    map<uint64_t, uint64_t> m;
-    m[offset] = len;
-    ::encode(m, bl);
-    return 0;
+  r = backend->do_fiemap(fd, offset, len, &fiemap);
+  if (r < 0)
+    return r;
+
+  if (fiemap->fm_mapped_extents == 0) {
+    free(fiemap);
+    return r;
   }
 
+  extent = &fiemap->fm_extents[0];
 
-  struct fiemap *fiemap = NULL;
-  map<uint64_t, uint64_t> exomap;
+  /* start where we were asked to start */
+  if (extent->fe_logical < offset) {
+    extent->fe_length -= offset - extent->fe_logical;
+    extent->fe_logical = offset;
+  }
 
-  dout(15) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+  i = 0;
 
-  FDRef fd;
-  int r = lfn_open(cid, oid, false, &fd);
-  if (r < 0) {
-    dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
-  } else {
-    uint64_t i;
+  while (i < fiemap->fm_mapped_extents) {
+    struct fiemap_extent *next = extent + 1;
 
-    r = backend->do_fiemap(**fd, offset, len, &fiemap);
-    if (r < 0)
-      goto done;
+    dout(10) << "FileStore::fiemap() fm_mapped_extents=" << fiemap->fm_mapped_extents
+             << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
 
-    if (fiemap->fm_mapped_extents == 0) {
-      free(fiemap);
-      goto done;
+    /* try to merge extents */
+    while ((i < fiemap->fm_mapped_extents - 1) &&
+           (extent->fe_logical + extent->fe_length == next->fe_logical)) {
+        next->fe_length += extent->fe_length;
+        next->fe_logical = extent->fe_logical;
+        extent = next;
+        next = extent + 1;
+        i++;
     }
 
-    struct fiemap_extent *extent = &fiemap->fm_extents[0];
-
-    /* start where we were asked to start */
-    if (extent->fe_logical < offset) {
-      extent->fe_length -= offset - extent->fe_logical;
-      extent->fe_logical = offset;
-    }
+    if (extent->fe_logical + extent->fe_length > offset + len)
+      extent->fe_length = offset + len - extent->fe_logical;
+    (*m)[extent->fe_logical] = extent->fe_length;
+    i++;
+    extent++;
+  }
+  free(fiemap);
 
-    i = 0;
+  return r;
+}
 
-    while (i < fiemap->fm_mapped_extents) {
-      struct fiemap_extent *next = extent + 1;
+int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
+                                  map<uint64_t, uint64_t> *m)
+{
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+  off_t hole_pos, data_pos;
+  int r = 0;
 
-      dout(10) << "FileStore::fiemap() fm_mapped_extents=" << fiemap->fm_mapped_extents
-	       << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
+  // If lseek fails with errno setting to be ENXIO, this means the current
+  // file offset is beyond the end of the file.
+  off_t start = offset;
+  while(start < (off_t)(offset + len)) {
+    data_pos = lseek(fd, start, SEEK_DATA);
+    if (data_pos < 0) {
+      if (errno == ENXIO)
+        break;
+      else {
+        r = -errno;
+        dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+	return r;
+      }
+    } else if (data_pos > (off_t)(offset + len)) {
+      break;
+    }
 
-      /* try to merge extents */
-      while ((i < fiemap->fm_mapped_extents - 1) &&
-             (extent->fe_logical + extent->fe_length == next->fe_logical)) {
-          next->fe_length += extent->fe_length;
-          next->fe_logical = extent->fe_logical;
-          extent = next;
-          next = extent + 1;
-          i++;
+    hole_pos = lseek(fd, data_pos, SEEK_HOLE);
+    if (hole_pos < 0) {
+      if (errno == ENXIO) {
+        break;
+      } else {
+        r = -errno;
+        dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+	return r;
       }
+    }
 
-      if (extent->fe_logical + extent->fe_length > offset + len)
-        extent->fe_length = offset + len - extent->fe_logical;
-      exomap[extent->fe_logical] = extent->fe_length;
-      i++;
-      extent++;
+    if (hole_pos >= (off_t)(offset + len)) {
+      (*m)[data_pos] = offset + len - data_pos;
+      break;
     }
-    free(fiemap);
+    (*m)[data_pos] = hole_pos - data_pos;
+    start = hole_pos;
+  }
+
+  return r;
+#else
+  (*m)[offset] = len;
+  return 0;
+#endif
+}
+
+int FileStore::fiemap(coll_t cid, const ghobject_t& oid,
+                    uint64_t offset, size_t len,
+                    bufferlist& bl)
+{
+  tracepoint(objectstore, fiemap_enter, cid.c_str(), offset, len);
+  _kludge_temp_object_collection(cid, oid);
+
+  if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) ||
+      len <= (size_t)m_filestore_fiemap_threshold) {
+    map<uint64_t, uint64_t> m;
+    m[offset] = len;
+    ::encode(m, bl);
+    return 0;
+  }
+
+  dout(15) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+  map<uint64_t, uint64_t> exomap;
+  FDRef fd;
+
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
+    goto done;
+  }
+  
+  if (backend->has_seek_data_hole()) {
+    dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+    r = _do_seek_hole_data(**fd, offset, len, &exomap);
+  } else if (backend->has_fiemap()) {
+    dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl;
+    r = _do_fiemap(**fd, offset, len, &exomap);
   }
 
 done:
@@ -3189,112 +3358,30 @@ int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t
 {
   dout(20) << __func__ << " " << srcoff << "~" << len << " to " << dstoff << dendl;
   int r = 0;
-  struct fiemap *fiemap = NULL;
-
+  map<uint64_t, uint64_t> exomap;
   // fiemap doesn't allow zero length
   if (len == 0)
     return 0;
 
-  r = backend->do_fiemap(from, srcoff, len, &fiemap);
-  if (r < 0) {
-    derr << "do_fiemap failed:" << srcoff << "~" << len << " = " << r << dendl;
-    return r;
-  }
-
-  // No need to copy
-  if (fiemap->fm_mapped_extents == 0)
-    return r;
-
-  int buflen = 4096*32;
-  char buf[buflen];
-  struct fiemap_extent *extent = &fiemap->fm_extents[0];
-
-  /* start where we were asked to start */
-  if (extent->fe_logical < srcoff) {
-    extent->fe_length -= srcoff - extent->fe_logical;
-    extent->fe_logical = srcoff;
+  if (backend->has_seek_data_hole()) {
+    dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
+    r = _do_seek_hole_data(from, srcoff, len, &exomap);
+  } else if (backend->has_fiemap()) {
+    dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl;
+    r = _do_fiemap(from, srcoff, len, &exomap);
   }
 
   int64_t written = 0;
-  uint64_t i = 0;
-
-  while (i < fiemap->fm_mapped_extents) {
-    struct fiemap_extent *next = extent + 1;
-
-    dout(10) << __func__ << " fm_mapped_extents=" << fiemap->fm_mapped_extents
-             << " fe_logical=" << extent->fe_logical << " fe_length="
-             << extent->fe_length << dendl;
-
-    /* try to merge extents */
-    while ((i < fiemap->fm_mapped_extents - 1) &&
-           (extent->fe_logical + extent->fe_length == next->fe_logical)) {
-        next->fe_length += extent->fe_length;
-        next->fe_logical = extent->fe_logical;
-        extent = next;
-        next = extent + 1;
-        i++;
-    }
-
-    if (extent->fe_logical + extent->fe_length > srcoff + len)
-      extent->fe_length = srcoff + len - extent->fe_logical;
-
-    int64_t actual;
-
-    actual = ::lseek64(from, extent->fe_logical, SEEK_SET);
-    if (actual != (int64_t)extent->fe_logical) {
-      r = errno;
-      derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
-      return r;
-    }
-    actual = ::lseek64(to, extent->fe_logical - srcoff + dstoff, SEEK_SET);
-    if (actual != (int64_t)(extent->fe_logical - srcoff + dstoff)) {
-      r = errno;
-      derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    loff_t pos = 0;
-    loff_t end = extent->fe_length;
-    while (pos < end) {
-      int l = MIN(end-pos, buflen);
-      r = ::read(from, buf, l);
-      dout(25) << "  read from " << pos << "~" << l << " got " << r << dendl;
-      if (r < 0) {
-        if (errno == EINTR) {
-          continue;
-        } else {
-          r = -errno;
-          derr << __func__ << ": read error at " << pos << "~" << len
-              << ", " << cpp_strerror(r) << dendl;
-          break;
-        }
-      }
-      if (r == 0) {
-        r = -ERANGE;
-        derr << __func__ << " got short read result at " << pos
-             << " of fd " << from << " len " << len << dendl;
-        break;
-      }
-      int op = 0;
-      while (op < r) {
-        int r2 = safe_write(to, buf+op, r-op);
-        dout(25) << " write to " << to << " len " << (r-op)
-                 << " got " << r2 << dendl;
-        if (r2 < 0) {
-          r = r2;
-          derr << __func__ << ": write error at " << pos << "~"
-               << r-op << ", " << cpp_strerror(r) << dendl;
-          break;
-        }
-        op += (r-op);
-      }
-      if (r < 0)
-        goto out;
-      pos += r;
+  for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) {
+    uint64_t it_off = miter->first - srcoff + dstoff;
+    r = _do_copy_range(from, to, miter->first, miter->second, it_off, true);
+    if (r < 0) {
+      r = -errno;
+      derr << "FileStore::_do_copy_range: copy error at " << miter->first << "~" << miter->second
+             << " to " << it_off << ", " << cpp_strerror(r) << dendl;
+      break;
     }
-    written += end;
-    i++;
-    extent++;
+    written += miter->second;
   }
 
   if (r >= 0) {
@@ -3325,69 +3412,114 @@ int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t
   return r;
 }
 
-int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc)
 {
   dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << dendl;
   int r = 0;
-  int64_t actual;
-
-  actual = ::lseek64(from, srcoff, SEEK_SET);
-  if (actual != (int64_t)srcoff) {
-    r = errno;
-    derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
-    return r;
-  }
-  actual = ::lseek64(to, dstoff, SEEK_SET);
-  if (actual != (int64_t)dstoff) {
-    r = errno;
-    derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
-    return r;
-  }
-
   loff_t pos = srcoff;
   loff_t end = srcoff + len;
-  int buflen = 4096*32;
-  char buf[buflen];
-  while (pos < end) {
-    int l = MIN(end-pos, buflen);
-    r = ::read(from, buf, l);
-    dout(25) << "  read from " << pos << "~" << l << " got " << r << dendl;
-    if (r < 0) {
-      if (errno == EINTR) {
-	continue;
-      } else {
-	r = -errno;
-	derr << "FileStore::_do_copy_range: read error at " << pos << "~" << len
-	     << ", " << cpp_strerror(r) << dendl;
+  int buflen = 4096 * 16; //limit by pipe max size.see fcntl
+
+#ifdef CEPH_HAVE_SPLICE
+  if (backend->has_splice()) {
+    int pipefd[2];
+    if (pipe(pipefd) < 0) {
+      r = errno;
+      derr << " pipe " << " got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    loff_t dstpos = dstoff;
+    while (pos < end) {
+      int l = MIN(end-pos, buflen);
+      r = safe_splice(from, &pos, pipefd[1], NULL, l, SPLICE_F_NONBLOCK);
+      dout(10) << "  safe_splice read from " << pos << "~" << l << " got " << r << dendl;
+      if (r < 0) {
+	derr << "FileStore::_do_copy_range: safe_splice read error at " << pos << "~" << len
+	  << ", " << cpp_strerror(r) << dendl;
+	break;
+      }
+      if (r == 0) {
+	// hrm, bad source range, wtf.
+	r = -ERANGE;
+	derr << "FileStore::_do_copy_range got short read result at " << pos
+	  << " of fd " << from << " len " << len << dendl;
+	break;
+      }
+
+      r = safe_splice(pipefd[0], NULL, to, &dstpos, r, 0);
+      dout(10) << " safe_splice write to " << to << " len " << r
+	<< " got " << r << dendl;
+      if (r < 0) {
+	derr << "FileStore::_do_copy_range: write error at " << pos << "~"
+	  << r << ", " << cpp_strerror(r) << dendl;
 	break;
       }
     }
-    if (r == 0) {
-      // hrm, bad source range, wtf.
-      r = -ERANGE;
-      derr << "FileStore::_do_copy_range got short read result at " << pos
-	      << " of fd " << from << " len " << len << dendl;
-      break;
+    close(pipefd[0]);
+    close(pipefd[1]);
+  } else
+#endif
+  {
+    int64_t actual;
+
+    actual = ::lseek64(from, srcoff, SEEK_SET);
+    if (actual != (int64_t)srcoff) {
+      r = errno;
+      derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    actual = ::lseek64(to, dstoff, SEEK_SET);
+    if (actual != (int64_t)dstoff) {
+      r = errno;
+      derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
+      return r;
     }
-    int op = 0;
-    while (op < r) {
-      int r2 = safe_write(to, buf+op, r-op);
-      dout(25) << " write to " << to << " len " << (r-op)
-	       << " got " << r2 << dendl;
-      if (r2 < 0) {
-	r = r2;
-	derr << "FileStore::_do_copy_range: write error at " << pos << "~"
-	     << r-op << ", " << cpp_strerror(r) << dendl;
 
+    char buf[buflen];
+    while (pos < end) {
+      int l = MIN(end-pos, buflen);
+      r = ::read(from, buf, l);
+      dout(25) << "  read from " << pos << "~" << l << " got " << r << dendl;
+      if (r < 0) {
+	if (errno == EINTR) {
+	  continue;
+	} else {
+	  r = -errno;
+	  derr << "FileStore::_do_copy_range: read error at " << pos << "~" << len
+	    << ", " << cpp_strerror(r) << dendl;
+	  break;
+	}
+      }
+      if (r == 0) {
+	// hrm, bad source range, wtf.
+	r = -ERANGE;
+	derr << "FileStore::_do_copy_range got short read result at " << pos
+	  << " of fd " << from << " len " << len << dendl;
 	break;
       }
-      op += (r-op);
+      int op = 0;
+      while (op < r) {
+	int r2 = safe_write(to, buf+op, r-op);
+	dout(25) << " write to " << to << " len " << (r-op)
+	  << " got " << r2 << dendl;
+	if (r2 < 0) {
+	  r = r2;
+	  derr << "FileStore::_do_copy_range: write error at " << pos << "~"
+	    << r-op << ", " << cpp_strerror(r) << dendl;
+
+	  break;
+	}
+	op += (r-op);
+      }
+      if (r < 0)
+	break;
+      pos += r;
     }
-    if (r < 0)
-      break;
-    pos += r;
   }
-  if (r >= 0 && m_filestore_sloppy_crc) {
+
+  assert(pos == end);
+  if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
     int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
     assert(rc >= 0);
   }
@@ -3547,6 +3679,7 @@ void FileStore::sync_entry()
 	apply_manager.commit_started();
 	op_tp.unpause();
 
+	object_map->sync();
 	int err = backend->syncfs();
 	if (err < 0) {
 	  derr << "syncfs got " << cpp_strerror(err) << dendl;
@@ -3642,6 +3775,7 @@ void FileStore::start_sync(Context *onsafe)
   Mutex::Locker l(lock);
   sync_waiters.push_back(onsafe);
   sync_cond.Signal();
+  force_sync = true;
   dout(10) << "start_sync" << dendl;
 }
 
@@ -3718,6 +3852,14 @@ void FileStore::sync_and_flush()
   dout(10) << "sync_and_flush done" << dendl;
 }
 
+int FileStore::flush_journal()
+{
+  dout(10) << __func__ << dendl;
+  sync_and_flush();
+  sync();
+  return 0;
+}
+
 int FileStore::snapshot(const string& name)
 {
   dout(10) << "snapshot " << name << dendl;
@@ -3745,7 +3887,7 @@ int FileStore::snapshot(const string& name)
 
 int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp)
 {
-  char val[100];
+  char val[CHAIN_XATTR_MAX_BLOCK_LEN];
   int l = chain_fgetxattr(fd, name, val, sizeof(val));
   if (l >= 0) {
     bp = buffer::create(l);
@@ -3873,6 +4015,7 @@ bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
 int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp)
 {
   tracepoint(objectstore, getattr_enter, cid.c_str());
+  _kludge_temp_object_collection(cid, oid);
   dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl;
   FDRef fd;
   int r = lfn_open(cid, oid, false, &fd);
@@ -3914,13 +4057,14 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff
     return -EIO;
   } else {
     tracepoint(objectstore, getattr_exit, r);
-    return r;
+    return r < 0 ? r : 0;
   }
 }
 
 int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset)
 {
   tracepoint(objectstore, getattrs_enter, cid.c_str());
+  _kludge_temp_object_collection(cid, oid);
   set<string> omap_attrs;
   map<string, bufferlist> omap_aset;
   Index index;
@@ -4208,7 +4352,7 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
 // collections
 
 int FileStore::collection_getattr(coll_t c, const char *name,
-				  void *value, size_t size) 
+				  void *value, size_t size)
 {
   char fn[PATH_MAX];
   get_cdir(c, fn, sizeof(fn));
@@ -4252,7 +4396,7 @@ int FileStore::collection_getattr(coll_t c, const char *name, bufferlist& bl)
   return r;
 }
 
-int FileStore::collection_getattrs(coll_t cid, map<string,bufferptr>& aset) 
+int FileStore::collection_getattrs(coll_t cid, map<string,bufferptr>& aset)
 {
   char fn[PATH_MAX];
   get_cdir(cid, fn, sizeof(fn));
@@ -4273,7 +4417,7 @@ int FileStore::collection_getattrs(coll_t cid, map<string,bufferptr>& aset)
 
 
 int FileStore::_collection_setattr(coll_t c, const char *name,
-				  const void *value, size_t size) 
+				  const void *value, size_t size)
 {
   char fn[PATH_MAX];
   get_cdir(c, fn, sizeof(fn));
@@ -4293,7 +4437,7 @@ int FileStore::_collection_setattr(coll_t c, const char *name,
   return r;
 }
 
-int FileStore::_collection_rmattr(coll_t c, const char *name) 
+int FileStore::_collection_rmattr(coll_t c, const char *name)
 {
   char fn[PATH_MAX];
   get_cdir(c, fn, sizeof(fn));
@@ -4314,7 +4458,7 @@ int FileStore::_collection_rmattr(coll_t c, const char *name)
 }
 
 
-int FileStore::_collection_setattrs(coll_t cid, map<string,bufferptr>& aset) 
+int FileStore::_collection_setattrs(coll_t cid, map<string,bufferptr>& aset)
 {
   char fn[PATH_MAX];
   get_cdir(cid, fn, sizeof(fn));
@@ -4354,7 +4498,8 @@ int FileStore::_collection_remove_recursive(const coll_t &cid,
   vector<ghobject_t> objects;
   ghobject_t max;
   while (!max.is_max()) {
-    r = collection_list_partial(cid, max, 200, 300, 0, &objects, &max);
+    r = collection_list(cid, max, ghobject_t::get_max(), true,
+			300, &objects, &max);
     if (r < 0)
       return r;
     for (vector<ghobject_t>::iterator i = objects.begin();
@@ -4389,7 +4534,12 @@ int FileStore::collection_version_current(coll_t c, uint32_t *version)
     return 0;
 }
 
-int FileStore::list_collections(vector<coll_t>& ls) 
+int FileStore::list_collections(vector<coll_t>& ls)
+{
+  return list_collections(ls, false);
+}
+
+int FileStore::list_collections(vector<coll_t>& ls, bool include_temp)
 {
   tracepoint(objectstore, list_collections_enter);
   dout(10) << "list_collections" << dendl;
@@ -4438,7 +4588,13 @@ int FileStore::list_collections(vector<coll_t>& ls)
 	 (de->d_name[1] == '.' &&
 	  de->d_name[2] == '\0')))
       continue;
-    ls.push_back(coll_t(de->d_name));
+    coll_t cid;
+    if (!cid.parse(de->d_name)) {
+      derr << "ignoging invalid collection '" << de->d_name << "'" << dendl;
+      continue;
+    }
+    if (!cid.is_temp() || include_temp)
+      ls.push_back(cid);
   }
 
   if (r > 0) {
@@ -4490,7 +4646,8 @@ bool FileStore::collection_empty(coll_t c)
 
   vector<ghobject_t> ls;
   collection_list_handle_t handle;
-  r = index->collection_list_partial(ghobject_t(), 1, 1, 0, &ls, NULL);
+  r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), true,
+				     1, &ls, NULL);
   if (r < 0) {
     assert(!m_filestore_fail_eio || r != -EIO);
     return false;
@@ -4499,49 +4656,60 @@ bool FileStore::collection_empty(coll_t c)
   tracepoint(objectstore, collection_empty_exit, ret);
   return ret;
 }
-
-int FileStore::collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
-                                     snapid_t seq, vector<ghobject_t> *ls)
+int FileStore::collection_list(coll_t c, ghobject_t start, ghobject_t end,
+			       bool sort_bitwise, int max,
+			       vector<ghobject_t> *ls, ghobject_t *next)
 {
-  tracepoint(objectstore, collection_list_range_enter, c.c_str());
-  bool done = false;
-  ghobject_t next = start;
-
-  while (!done) {
-    vector<ghobject_t> next_objects;
-    int r = collection_list_partial(c, next,
-                                get_ideal_list_min(), get_ideal_list_max(),
-                                seq, &next_objects, &next);
-    if (r < 0)
-      return r;
-
-    ls->insert(ls->end(), next_objects.begin(), next_objects.end());
-
-    // special case for empty collection
-    if (ls->empty()) {
-      break;
-    }
+  if (start.is_max())
+    return 0;
 
-    while (!ls->empty() && ls->back() >= end) {
-      ls->pop_back();
-      done = true;
+  ghobject_t temp_next;
+  if (!next)
+    next = &temp_next;
+  // figure out the pool id.  we need this in order to generate a
+  // meaningful 'next' value.
+  int64_t pool = -1;
+  shard_id_t shard;
+  {
+    spg_t pgid;
+    if (c.is_temp(&pgid)) {
+      pool = -2 - pgid.pool();
+      shard = pgid.shard;
+    } else if (c.is_pg(&pgid)) {
+      pool = pgid.pool();
+      shard = pgid.shard;
+    } else if (c.is_meta()) {
+      pool = -1;
+      shard = shard_id_t::NO_SHARD;
+    } else {
+      // hrm, the caller is test code!  we should get kill it off.  for now,
+      // tolerate it.
+      pool = 0;
+      shard = shard_id_t::NO_SHARD;
     }
-
-    if (next >= end) {
-      done = true;
+    dout(20) << __func__ << " pool is " << pool << " shard is " << shard
+	     << " pgid " << pgid << dendl;
+  }
+  ghobject_t sep;
+  sep.hobj.pool = -1;
+  sep.set_shard(shard);
+  if (!c.is_temp() && !c.is_meta()) {
+    if (cmp_bitwise(start, sep) < 0) { // bitwise vs nibble doesn't matter here
+      dout(10) << __func__ << " first checking temp pool" << dendl;
+      coll_t temp = c.get_temp();
+      int r = collection_list(temp, start, end, sort_bitwise, max, ls, next);
+      if (r < 0)
+	return r;
+      if (*next != ghobject_t::get_max())
+	return r;
+      start = sep;
+      dout(10) << __func__ << " fall through to non-temp collection, start "
+	       << start << dendl;
+    } else {
+      dout(10) << __func__ << " start " << start << " >= sep " << sep << dendl;
     }
   }
 
-  tracepoint(objectstore, collection_list_range_exit, 0);
-  return 0;
-}
-
-int FileStore::collection_list_partial(coll_t c, ghobject_t start,
-				       int min, int max, snapid_t seq,
-				       vector<ghobject_t> *ls, ghobject_t *next)
-{
-  tracepoint(objectstore, collection_list_partial_enter, c.c_str());
-  dout(10) << "collection_list_partial: " << c << dendl;
   Index index;
   int r = get_index(c, &index);
   if (r < 0)
@@ -4550,34 +4718,22 @@ int FileStore::collection_list_partial(coll_t c, ghobject_t start,
   assert(NULL != index.index);
   RWLock::RLocker l((index.index)->access_lock);
 
-  r = index->collection_list_partial(start,
-				     min, max, seq,
-				     ls, next);
+  r = index->collection_list_partial(start, end, sort_bitwise, max, ls, next);
+
   if (r < 0) {
     assert(!m_filestore_fail_eio || r != -EIO);
     return r;
   }
-  if (ls)
-    dout(20) << "objects: " << *ls << dendl;
-  tracepoint(objectstore, collection_list_partial_exit, 0);
-  return 0;
-}
+  dout(20) << "objects: " << ls << dendl;
 
-int FileStore::collection_list(coll_t c, vector<ghobject_t>& ls)
-{  
-  tracepoint(objectstore, collection_list_enter, c.c_str());
-  Index index;
-  int r = get_index(c, &index);
-  if (r < 0)
-    return r;
-
-  assert(NULL != index.index);
-  RWLock::RLocker l((index.index)->access_lock);
+  // HashIndex doesn't know the pool when constructing a 'next' value
+  if (next && !next->is_max()) {
+    next->hobj.pool = pool;
+    next->set_shard(shard);
+    dout(20) << "  next " << *next << dendl;
+  }
 
-  r = index->collection_list(&ls);
-  assert(!m_filestore_fail_eio || r != -EIO);
-  tracepoint(objectstore, collection_list_exit, r);
-  return r;
+  return 0;
 }
 
 int FileStore::omap_get(coll_t c, const ghobject_t &hoid,
@@ -4585,6 +4741,7 @@ int FileStore::omap_get(coll_t c, const ghobject_t &hoid,
 			map<string, bufferlist> *out)
 {
   tracepoint(objectstore, omap_get_enter, c.c_str());
+  _kludge_temp_object_collection(c, hoid);
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
   Index index;
   int r = get_index(c, &index);
@@ -4613,6 +4770,7 @@ int FileStore::omap_get_header(
   bool allow_eio)
 {
   tracepoint(objectstore, omap_get_header_enter, c.c_str());
+  _kludge_temp_object_collection(c, hoid);
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
   Index index;
   int r = get_index(c, &index);
@@ -4637,6 +4795,7 @@ int FileStore::omap_get_header(
 int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
 {
   tracepoint(objectstore, omap_get_keys_enter, c.c_str());
+  _kludge_temp_object_collection(c, hoid);
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
   Index index;
   int r = get_index(c, &index);
@@ -4663,25 +4822,35 @@ int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid,
 			       map<string, bufferlist> *out)
 {
   tracepoint(objectstore, omap_get_values_enter, c.c_str());
+  _kludge_temp_object_collection(c, hoid);
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
   Index index;
+  const char *where = 0;
   int r = get_index(c, &index);
-  if (r < 0)
-    return r;
+  if (r < 0) {
+    where = " (get_index)";
+    goto out;
+  }
   {
     assert(NULL != index.index);
     RWLock::RLocker l((index.index)->access_lock);
     r = lfn_find(hoid, index);
-    if (r < 0)
-      return r;
+    if (r < 0) {
+      where = " (lfn_find)";
+      goto out;
+    }
   }
   r = object_map->get_values(hoid, keys, out);
   if (r < 0 && r != -ENOENT) {
     assert(!m_filestore_fail_eio || r != -EIO);
-    return r;
+    goto out;
   }
-  tracepoint(objectstore, omap_get_values_exit, 0);
-  return 0;
+  r = 0;
+ out:
+  tracepoint(objectstore, omap_get_values_exit, r);
+  dout(15) << __func__ << " " << c << "/" << hoid << " = " << r
+	   << where << dendl;
+  return r;
 }
 
 int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid,
@@ -4689,6 +4858,7 @@ int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid,
 			       set<string> *out)
 {
   tracepoint(objectstore, omap_check_keys_enter, c.c_str());
+  _kludge_temp_object_collection(c, hoid);
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
 
   Index index;
@@ -4715,17 +4885,24 @@ ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c,
 							  const ghobject_t &hoid)
 {
   tracepoint(objectstore, get_omap_iterator, c.c_str());
+  _kludge_temp_object_collection(c, hoid);
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
   Index index;
   int r = get_index(c, &index);
-  if (r < 0)
+  if (r < 0) {
+    dout(10) << __func__ << " " << c << "/" << hoid << " = 0 "
+	     << "(get_index failed with " << cpp_strerror(r) << ")" << dendl;
     return ObjectMap::ObjectMapIterator(); 
+  }
   {
     assert(NULL != index.index);
     RWLock::RLocker l((index.index)->access_lock);
     r = lfn_find(hoid, index);
-    if (r < 0)
+    if (r < 0) {
+      dout(10) << __func__ << " " << c << "/" << hoid << " = 0 "
+	       << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl;
       return ObjectMap::ObjectMapIterator();
+    }
   }
   return object_map->get_iterator(hoid);
 }
@@ -4777,46 +4954,55 @@ int FileStore::_create_collection(
   r = init_index(c);
   if (r < 0)
     return r;
+
+  // create parallel temp collection, too
+  if (!c.is_meta() && !c.is_temp()) {
+    coll_t temp = c.get_temp();
+    r = _create_collection(temp, spos);
+    if (r < 0)
+      return r;
+  }
+
   _set_replay_guard(c, spos);
   return 0;
 }
 
-// DEPRECATED -- remove with _split_collection_create
-int FileStore::_create_collection(coll_t c) 
+int FileStore::_destroy_collection(coll_t c) 
 {
+  int r = 0;
   char fn[PATH_MAX];
   get_cdir(c, fn, sizeof(fn));
-  dout(15) << "create_collection " << fn << dendl;
-  int r = ::mkdir(fn, 0755);
-  if (r < 0)
-    r = -errno;
-  dout(10) << "create_collection " << fn << " = " << r << dendl;
-
-  if (r < 0)
-    return r;
-  return init_index(c);
-}
-
-int FileStore::_destroy_collection(coll_t c) 
-{
+  dout(15) << "_destroy_collection " << fn << dendl;
   {
     Index from;
     int r = get_index(c, &from);
     if (r < 0)
-      return r;
+      goto out;
     assert(NULL != from.index);
     RWLock::WLocker l((from.index)->access_lock);
 
     r = from->prep_delete();
     if (r < 0)
-      return r;
+      goto out;
   }
-  char fn[PATH_MAX];
-  get_cdir(c, fn, sizeof(fn));
-  dout(15) << "_destroy_collection " << fn << dendl;
-  int r = ::rmdir(fn);
-  if (r < 0)
+  r = ::rmdir(fn);
+  if (r < 0) {
     r = -errno;
+    goto out;
+  }
+
+ out:
+  // destroy parallel temp collection, too
+  if (!c.is_meta() && !c.is_temp()) {
+    coll_t temp = c.get_temp();
+    int r2 = _destroy_collection(temp);
+    if (r2 < 0) {
+      r = r2;
+      goto out_final;
+    }
+  }
+
+ out_final:
   dout(10) << "_destroy_collection " << fn << " = " << r << dendl;
   return r;
 }
@@ -5130,10 +5316,11 @@ int FileStore::_split_collection(coll_t cid,
     vector<ghobject_t> objects;
     ghobject_t next;
     while (1) {
-      collection_list_partial(
+      collection_list(
 	cid,
-	next,
-	get_ideal_list_min(), get_ideal_list_max(), 0,
+	next, ghobject_t::get_max(),
+	true,
+	get_ideal_list_max(),
 	&objects,
 	&next);
       if (objects.empty())
@@ -5149,10 +5336,11 @@ int FileStore::_split_collection(coll_t cid,
     }
     next = ghobject_t();
     while (1) {
-      collection_list_partial(
+      collection_list(
 	dest,
-	next,
-	get_ideal_list_min(), get_ideal_list_max(), 0,
+	next, ghobject_t::get_max(),
+	true,
+	get_ideal_list_max(),
 	&objects,
 	&next);
       if (objects.empty())
@@ -5170,52 +5358,6 @@ int FileStore::_split_collection(coll_t cid,
   return r;
 }
 
-// DEPRECATED: remove once we are sure there won't be any such transactions
-// replayed
-int FileStore::_split_collection_create(coll_t cid,
-					uint32_t bits,
-					uint32_t rem,
-					coll_t dest,
-					const SequencerPosition &spos)
-{
-  dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
-  int r = _create_collection(dest);
-  if (r < 0 && !(r == -EEXIST && replaying))
-    return r;
-
-  int dstcmp = _check_replay_guard(cid, spos);
-  if (dstcmp < 0)
-    return 0;
-
-  int srccmp = _check_replay_guard(dest, spos);
-  if (srccmp < 0)
-    return 0;
-
-  _set_replay_guard(cid, spos, true);
-  _set_replay_guard(dest, spos, true);
-
-  Index from;
-  r = get_index(cid, &from);
-
-  Index to;
-  if (!r) 
-    r = get_index(dest, &to);
-
-  if (!r) {
-    assert(NULL != from.index);
-    RWLock::WLocker l1((from.index)->access_lock);
-
-    assert(NULL != to.index);
-    RWLock::WLocker l2((to.index)->access_lock);
- 
-    r = from->split(rem, bits, to.index);
-  }
-
-  _close_replay_guard(cid, spos);
-  _close_replay_guard(dest, spos);
-  return r;
-}
-
 int FileStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid,
                                uint64_t expected_object_size,
                                uint64_t expected_write_size)
@@ -5305,6 +5447,8 @@ void FileStore::handle_conf_change(const struct md_config_t *conf,
     m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
     m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
     m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size;
+    throttle_ops.reset_max(conf->filestore_queue_max_ops);
+    throttle_bytes.reset_max(conf->filestore_queue_max_bytes);
   }
   if (changed.count("filestore_commit_timeout")) {
     Mutex::Locker l(sync_entry_timeo_lock);
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index af1fb8d..6580dd4 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -96,7 +96,6 @@ public:
     return target_version;
   }
 
-  bool need_journal() { return true; }
   int peek_journal_fsid(uuid_d *fsid);
 
   struct FSPerfTracker {
@@ -141,6 +140,15 @@ private:
   int get_index(coll_t c, Index *index);
   int init_index(coll_t c);
 
+  void _kludge_temp_object_collection(coll_t& cid, const ghobject_t& oid) {
+    // - normal temp case: cid is pg, object is temp (pool < -1)
+    // - hammer temp case: cid is pg (or already temp), object pool is -1
+    if (cid.is_pg() && (oid.hobj.pool < -1 ||
+			oid.hobj.pool == -1))
+      cid = cid.get_temp();
+  }
+  void init_temp_collections();
+
   // ObjectMap
   boost::scoped_ptr<ObjectMap> object_map;
   
@@ -258,6 +266,7 @@ private:
       q.push_back(o);
     }
     Op *peek_queue() {
+      Mutex::Locker l(qlock);
       assert(apply_lock.is_locked());
       return q.front();
     }
@@ -299,7 +308,6 @@ private:
       Mutex::Locker l(qlock);
       uint64_t seq = 0;
       if (_get_max_uncompleted(&seq)) {
-	delete c;
 	return true;
       } else {
 	flush_commit_waiters.push_back(make_pair(seq, c));
@@ -325,11 +333,8 @@ private:
   FDCache fdcache;
   WBThrottle wbthrottle;
 
-  Sequencer default_osr;
   deque<OpSequencer*> op_queue;
-  uint64_t op_queue_len, op_queue_bytes;
-  Cond op_throttle_cond;
-  Mutex op_throttle_lock;
+  Throttle throttle_ops, throttle_bytes;
   Finisher op_finisher;
 
   ThreadPool op_tp;
@@ -358,6 +363,7 @@ private:
     void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) {
       store->_do_op(osr, handle);
     }
+    using ThreadPool::WorkQueue<OpSequencer>::_process;
     void _process_finish(OpSequencer *osr) {
       store->_finish_op(osr);
     }
@@ -377,7 +383,7 @@ private:
   void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
   friend struct C_JournaledAhead;
 
-  int open_journal();
+  void new_journal();
 
   PerfCounters *logger;
 
@@ -423,6 +429,15 @@ public:
   }
   int mkfs();
   int mkjournal();
+  bool wants_journal() {
+    return true;
+  }
+  bool allows_journal() {
+    return true;
+  }
+  bool needs_journal() {
+    return false;
+  }
 
   int write_version_stamp();
   int version_stamp_is_valid(uint32_t *version);
@@ -445,6 +460,10 @@ public:
    */
   bool get_allow_sharded_objects();
 
+  bool can_sort_nibblewise() {
+    return true;    // i support legacy sort order
+  }
+
   void collect_metadata(map<string,string> *pm);
 
   int statfs(struct statfs *buf);
@@ -526,6 +545,10 @@ public:
     bufferlist& bl,
     uint32_t op_flags = 0,
     bool allow_eio = false);
+  int _do_fiemap(int fd, uint64_t offset, size_t len,
+                 map<uint64_t, uint64_t> *m);
+  int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
+                         map<uint64_t, uint64_t> *m);
   int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
 
   int _touch(coll_t cid, const ghobject_t& oid);
@@ -540,7 +563,7 @@ public:
 		   const SequencerPosition& spos);
   int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
   int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
-  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
   int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos);
 
   int _fgetattr(int fd, const char *name, bufferptr& bp);
@@ -556,6 +579,7 @@ public:
   void flush();
   void sync_and_flush();
 
+  int flush_journal();
   int dump_journal(ostream& out);
 
   void set_fsid(uuid_d u) {
@@ -565,8 +589,8 @@ public:
 
   // DEBUG read error injection, an object is removed from both on delete()
   Mutex read_error_lock;
-  set<ghobject_t> data_error_set; // read() will return -EIO
-  set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+  set<ghobject_t, ghobject_t::BitwiseComparator> data_error_set; // read() will return -EIO
+  set<ghobject_t, ghobject_t::BitwiseComparator> mdata_error_set; // getattr(),stat() will return -EIO
   void inject_data_error(const ghobject_t &oid);
   void inject_mdata_error(const ghobject_t &oid);
   void debug_obj_on_delete(const ghobject_t &oid);
@@ -597,17 +621,15 @@ public:
 				   const SequencerPosition &spos);
 
   // collections
+  int collection_list(coll_t c, ghobject_t start, ghobject_t end,
+		      bool sort_bitwise, int max,
+		      vector<ghobject_t> *ls, ghobject_t *next);
   int list_collections(vector<coll_t>& ls);
+  int list_collections(vector<coll_t>& ls, bool include_temp);
   int collection_version_current(coll_t c, uint32_t *version);
   int collection_stat(coll_t c, struct stat *st);
   bool collection_exists(coll_t c);
   bool collection_empty(coll_t c);
-  int collection_list(coll_t c, vector<ghobject_t>& oid);
-  int collection_list_partial(coll_t c, ghobject_t start,
-			      int min, int max, snapid_t snap,
-			      vector<ghobject_t> *ls, ghobject_t *next);
-  int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
-                            snapid_t seq, vector<ghobject_t> *ls);
 
   // omap (see ObjectStore.h for documentation)
   int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header,
@@ -624,7 +646,6 @@ public:
 		      set<string> *out);
   ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid);
 
-  int _create_collection(coll_t c);
   int _create_collection(coll_t c, const SequencerPosition &spos);
   int _destroy_collection(coll_t c);
   /**
@@ -635,7 +656,7 @@ public:
    * @param expected_num_objs - expected number of objects in this collection
    * @param spos              - sequence position
    *
-   * @Return 0 on success, an error code otherwise
+   * @return 0 on success, an error code otherwise
    */
   int _collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
       uint64_t expected_num_objs,
@@ -761,7 +782,7 @@ protected:
     return filestore->current_fn;
   }
   int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
-    if (has_fiemap()) {
+    if (has_fiemap() || has_seek_data_hole()) {
       return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
     } else {
       return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
@@ -788,9 +809,11 @@ public:
   virtual int destroy_checkpoint(const string& name) = 0;
   virtual int syncfs() = 0;
   virtual bool has_fiemap() = 0;
+  virtual bool has_seek_data_hole() = 0;
   virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
   virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
   virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
+  virtual bool has_splice() const = 0;
 
   // hooks for (sloppy) crc tracking
   virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
diff --git a/src/os/FlatIndex.cc b/src/os/FlatIndex.cc
deleted file mode 100644
index 6dcb52e..0000000
--- a/src/os/FlatIndex.cc
+++ /dev/null
@@ -1,426 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
- * Foundation.  See file COPYING.
- * 
- */
-
-#if defined(__FreeBSD__)
-#include <sys/cdefs.h>
-#include <sys/param.h>
-#endif
-
-#include "FlatIndex.h"
-#include "common/ceph_crypto.h"
-#include "osd/osd_types.h"
-#include <errno.h>
-
-#include "chain_xattr.h"
-
-using ceph::crypto::SHA1;
-
-/*
- * long file names will have the following format:
- *
- * prefix_hash_index_cookie
- *
- * The prefix will just be the first X bytes of the original file name.
- * The cookie is a constant string that shows whether this file name
- * is hashed
- */
-
-#define FILENAME_LFN_DIGEST_SIZE CEPH_CRYPTO_SHA1_DIGESTSIZE
-
-#define FILENAME_MAX_LEN        4096    // the long file name size
-#define FILENAME_SHORT_LEN      255     // the short file name size
-#define FILENAME_COOKIE         "long"  // ceph long file name
-#define FILENAME_HASH_LEN       FILENAME_LFN_DIGEST_SIZE
-#define FILENAME_EXTRA	        4       // underscores and digit
-
-#define LFN_ATTR "user.cephos.lfn"
-
-#define FILENAME_PREFIX_LEN (FILENAME_SHORT_LEN - FILENAME_HASH_LEN - (sizeof(FILENAME_COOKIE) - 1) - FILENAME_EXTRA)
-
-
-int FlatIndex::cleanup() {
-  return 0;
-}
-
-static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
-{
-  int i;
-  str[0] = '\0';
-  for (i = 0; i < len; i++) {
-    sprintf(&str[i*2], "%02x", (int)buf[i]);
-  }
-}
-
-static int hash_filename(const char *filename, char *hash, int buf_len)
-{
-  if (buf_len < FILENAME_HASH_LEN + 1)
-    return -EINVAL;
-
-  char buf[FILENAME_LFN_DIGEST_SIZE];
-  char hex[FILENAME_LFN_DIGEST_SIZE * 2];
-
-  SHA1 h;
-  h.Update((const byte *)filename, strlen(filename));
-  h.Final((byte *)buf);
-
-  buf_to_hex((byte *)buf, (FILENAME_HASH_LEN + 1) / 2, hex);
-  strncpy(hash, hex, FILENAME_HASH_LEN);
-  hash[FILENAME_HASH_LEN] = '\0';
-  return 0;
-}
-
-static void build_filename(char *filename, int len, const char *old_filename, int i)
-{
-  char hash[FILENAME_HASH_LEN + 1];
-
-  assert(len >= FILENAME_SHORT_LEN + 4);
-
-  strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
-  filename[FILENAME_PREFIX_LEN] = '\0';
-  if (strlen(filename) < FILENAME_PREFIX_LEN)
-    return;
-  if (old_filename[FILENAME_PREFIX_LEN] == '\0')
-    return;
-
-  hash_filename(old_filename, hash, sizeof(hash));
-  int ofs = FILENAME_PREFIX_LEN;
-  while (1) {
-    int suffix_len = sprintf(filename + ofs, "_%s_%d_" FILENAME_COOKIE, hash, i);
-    if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
-      break;
-    ofs--;
-  }
-}
-
-/* is this a candidate? */
-static int lfn_is_hashed_filename(const char *filename)
-{
-  int len = strlen(filename);
-  if (len < FILENAME_SHORT_LEN)
-    return 0;
-  return (strcmp(filename + len - (sizeof(FILENAME_COOKIE) - 1), FILENAME_COOKIE) == 0);
-}
-
-static void lfn_translate(const char *path, const char *name, char *new_name, int len)
-{
-  if (!lfn_is_hashed_filename(name)) {
-    strncpy(new_name, name, len);
-    return;
-  }
-
-  char buf[PATH_MAX];
-
-  snprintf(buf, sizeof(buf), "%s/%s", path, name);
-  int r = chain_getxattr(buf, LFN_ATTR, new_name, len - 1);
-  if (r < 0)
-    strncpy(new_name, name, len);
-  else
-    new_name[r] = '\0';
-  return;
-}
-
-static int append_oname(const ghobject_t &oid, char *s, int len)
-{
-  //assert(sizeof(oid) == 28);
-  char *end = s + len;
-  char *t = s + strlen(s);
-
-  const char *i = oid.hobj.oid.name.c_str();
-  while (*i && t < end) {
-    if (*i == '\\') {
-      *t++ = '\\';
-      *t++ = '\\';      
-    } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) {  // only escape leading .
-      *t++ = '\\';
-      *t++ = '.';
-    } else if (*i == '/') {
-      *t++ = '\\';
-      *t++ = 's';
-    } else
-      *t++ = *i;
-    i++;
-  }
-
-  int size = t - s;
-
-  if (oid.hobj.snap == CEPH_NOSNAP)
-    size += snprintf(t, end - t, "_head");
-  else if (oid.hobj.snap == CEPH_SNAPDIR)
-    size += snprintf(t, end - t, "_snapdir");
-  else
-    size += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
-
-  return size;
-}
-
-static bool parse_object(char *s, ghobject_t& oid)
-{
-  sobject_t o;
-  char *bar = s + strlen(s) - 1;
-  while (*bar != '_' &&
-	 bar > s)
-    bar--;
-  if (*bar == '_') {
-    char buf[bar-s + 1];
-    char *t = buf;
-    char *i = s;
-    while (i < bar) {
-      if (*i == '\\') {
-	i++;
-	switch (*i) {
-	case '\\': *t++ = '\\'; break;
-	case '.': *t++ = '.'; break;
-	case 's': *t++ = '/'; break;
-	default: assert(0);
-	}
-      } else {
-	*t++ = *i;
-      }
-      i++;
-    }
-    *t = 0;
-    o.oid.name = string(buf, t-buf);
-    if (strcmp(bar+1, "head") == 0)
-      o.snap = CEPH_NOSNAP;
-    else if (strcmp(bar+1, "snapdir") == 0)
-      o.snap = CEPH_SNAPDIR;
-    else
-      o.snap = strtoull(bar+1, &s, 16);
-    oid = ghobject_t(hobject_t(o));
-    return true;
-  }
-  return false;
-}
-
-static int lfn_get(const char *coll_path, const ghobject_t& oid, char *pathname, int len, char *lfn, int lfn_len, int *exist, int *is_lfn)
-{
-  int i = 0;
-  strncpy(pathname, coll_path, len);
-  size_t path_len = strlen(coll_path);
-  pathname[path_len] = '/';
-  path_len++;
-  pathname[path_len] = '\0';
-  char *filename = pathname + path_len;
-
-  *lfn = '\0';
-  int actual_len = append_oname(oid, lfn, lfn_len);
-
-  if (actual_len < (int)FILENAME_PREFIX_LEN) {
-    /* not a long file name, just build it as it is */
-    strncpy(filename, lfn, len - path_len);
-    *is_lfn = 0;
-    struct stat buf;
-    int r = ::stat(pathname, &buf);
-    if (r < 0) {
-      if (errno == ENOENT) {
-	*exist = 0;
-      } else {
-	return -errno;
-      }
-    } else {
-      *exist = 1;
-    }
-
-    return 0;
-  }
-
-  *is_lfn = 1;
-  *exist = 0;
-
-  while (1) {
-    char buf[PATH_MAX];
-    int r;
-
-    build_filename(filename, len - path_len, lfn, i);
-    r = chain_getxattr(pathname, LFN_ATTR, buf, sizeof(buf));
-    if (r < 0)
-      r = -errno;
-    if (r > 0) {
-      buf[MIN((int)sizeof(buf)-1, r)] = '\0';
-      if (strcmp(buf, lfn) == 0) { // a match?
-        *exist = 1;
-        return i;
-      }
-    }
-    switch (r) {
-    case -ENOENT:
-      return i;
-    case -ERANGE:
-      assert(0); // shouldn't happen
-    default:
-      break;
-    }
-    if (r < 0)
-      break;
-    i++;
-  }
-
-  return 0; // unreachable anyway
-}
-
-int FlatIndex::init() {
-  return 0;
-}
-
-int FlatIndex::created(const ghobject_t &hoid, const char *path) {
-  char long_name[PATH_MAX];
-  long_name[0] = '\0';
-  int actual_len = append_oname(hoid, long_name, sizeof(long_name));
-  if (actual_len < (int)FILENAME_PREFIX_LEN) {
-    return 0;
-  }
-  assert(long_name[actual_len] == '\0');
-  assert(long_name[actual_len - 1] != '\0');
-  int r = chain_setxattr(path, LFN_ATTR, long_name, actual_len);
-  if (r < 0)
-    return r;
-  return 0;
-}
-
-int FlatIndex::unlink(const ghobject_t &o) {
-  char long_fn[PATH_MAX];
-  char short_fn[PATH_MAX];
-  char short_fn2[PATH_MAX];
-  int r, i, exist, err;
-  int path_len;
-  int is_lfn;
-
-  r = lfn_get(base_path.c_str(), o, short_fn, sizeof(short_fn), 
-	      long_fn, sizeof(long_fn), &exist, &is_lfn);
-  if (r < 0)
-    return r;
-  if (!is_lfn) {
-    r = ::unlink(short_fn);
-    if (r < 0) {
-      return -errno;
-    }
-    return 0;
-  }
-  if (!exist)
-    return -ENOENT;
-
-  const char *next = strncpy(short_fn2, base_path.c_str(), sizeof(short_fn2));
-  path_len = next - short_fn2;
-  short_fn2[path_len] = '/';
-  path_len++;
-  short_fn2[path_len] = '\0';
-
-  for (i = r + 1; ; i++) {
-    struct stat buf;
-    int ret;
-
-    build_filename(&short_fn2[path_len], sizeof(short_fn2) - path_len, long_fn, i);
-    ret = ::stat(short_fn2, &buf);
-    if (ret < 0) {
-      if (i == r + 1) {
-        err = ::unlink(short_fn);
-        if (err < 0)
-          return err;
-        return 0;
-      }
-      break;
-    }
-  }
-
-  build_filename(&short_fn2[path_len], sizeof(short_fn2) - path_len, long_fn, i - 1);
-
-  if (rename(short_fn2, short_fn) < 0) {
-    assert(0);
-  }
-
-  return 0;
-}
-
-int FlatIndex::lookup(const ghobject_t &hoid, IndexedPath *path, int *exist) {
-  char long_fn[PATH_MAX];
-  char short_fn[PATH_MAX];
-  int r;
-  int is_lfn;
-  r = lfn_get(base_path.c_str(), hoid, 
-	      short_fn, sizeof(short_fn), long_fn, 
-	      sizeof(long_fn), exist, &is_lfn);
-  if (r < 0)
-    return r;
-  *path = IndexedPath(new Path(string(short_fn), this));
-  return 0;
-}
-
-static int get_hobject_from_oinfo(const char *dir, const char *file, 
-				  ghobject_t *o) {
-  char path[PATH_MAX];
-  bufferptr bp(PATH_MAX);
-  snprintf(path, sizeof(path), "%s/%s", dir, file);
-  // Hack, user.ceph._ is the attribute used to store the object info
-  int r = chain_getxattr(path, "user.ceph._", bp.c_str(), bp.length());
-  if (r < 0)
-    return r;
-  bufferlist bl;
-  bl.push_back(bp);
-  object_info_t oi(bl);
-  *o = oi.soid;
-  return 0;
-}
-
-int FlatIndex::collection_list_partial(const ghobject_t &start,
-				       int min_count,
-				       int max_count,
-				       snapid_t seq,
-				       vector<ghobject_t> *ls,
-				       ghobject_t *next) {
-  assert(0); // Should not be called
-  return 0;
-}
-
-int FlatIndex::collection_list(vector<ghobject_t> *ls) {
-  char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
-  char dir_name[PATH_MAX], new_name[PATH_MAX];
-  strncpy(dir_name, base_path.c_str(), sizeof(dir_name));
-  dir_name[sizeof(dir_name)-1]='\0';
-
-  DIR *dir = ::opendir(dir_name);
-  if (!dir)
-    return -errno;
-  
-  // first, build (ino, object) list
-  vector< pair<ino_t,ghobject_t> > inolist;
-
-  struct dirent *de;
-  while (::readdir_r(dir, (struct dirent *)buf, &de) == 0) {
-    if (!de)
-      break;
-    // parse
-    if (de->d_name[0] == '.')
-      continue;
-    //cout << "  got object " << de->d_name << std::endl;
-    ghobject_t o;
-    lfn_translate(dir_name, de->d_name, new_name, sizeof(new_name));
-    if (parse_object(new_name, o)) {
-      get_hobject_from_oinfo(dir_name, de->d_name, &o);
-      inolist.push_back(pair<ino_t,ghobject_t>(de->d_ino, o));
-      ls->push_back(o);
-    }
-  }
-
-  // sort
-  sort(inolist.begin(), inolist.end());
-
-  // build final list
-  ls->resize(inolist.size());
-  int i = 0;
-  for (vector< pair<ino_t,ghobject_t> >::iterator p = inolist.begin(); p != inolist.end(); ++p)
-    (*ls)[i++].swap(p->second);
-  
-  ::closedir(dir);
-  return 0;
-}
diff --git a/src/os/FlatIndex.h b/src/os/FlatIndex.h
deleted file mode 100644
index 0509df4..0000000
--- a/src/os/FlatIndex.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
- * Foundation.  See file COPYING.
- * 
- */
-
-#ifndef CEPH_FLATINDEX_H
-#define CEPH_FLATINDEX_H
-
-#include <string>
-#include <map>
-#include <set>
-#include <vector>
-#include "include/memory.h"
-
-#include "CollectionIndex.h"
-
-/**
- * FlatIndex implements the collection layout prior to CollectionIndex
- *
- * This class should only be used for converting old filestores.
- */
-class FlatIndex : public CollectionIndex {
-  string base_path;
-  coll_t collection;
-public:
-  FlatIndex(coll_t collection, string base_path) : 
-              CollectionIndex(collection),
-              base_path(base_path),
-	      collection(collection) {}
-
-  /// @see CollectionIndex
-  uint32_t collection_version() { return FLAT_INDEX_TAG; }
-
-  coll_t coll() const { return collection; }
-
-  /// @see CollectionIndex
-  int cleanup();
-
-  /// @see CollectionIndex
-  int init();
-
-  /// @see CollectionIndex
-  int created(
-    const ghobject_t &oid,
-    const char *path
-    );
-
-  /// @see CollectionIndex
-  int unlink(
-    const ghobject_t &oid
-    );
-
-  /// @see CollectionIndex
-  int lookup(
-    const ghobject_t &oid,
-    IndexedPath *path,
-    int *exist
-    );
-
-  /// @see CollectionIndex
-  int collection_list(
-    vector<ghobject_t> *ls
-    );
-
-  /// @see CollectionIndex
-  int collection_list_partial(
-    const ghobject_t &start,
-    int min_count,
-    int max_count,
-    snapid_t seq,
-    vector<ghobject_t> *ls,
-    ghobject_t *next
-    );
-};
-
-#endif
diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc
index 09a0228..4bba413 100644
--- a/src/os/GenericFileStoreBackend.cc
+++ b/src/os/GenericFileStoreBackend.cc
@@ -57,8 +57,11 @@
 GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
   FileStoreBackend(fs),
   ioctl_fiemap(false),
+  seek_data_hole(false),
   m_filestore_fiemap(g_conf->filestore_fiemap),
-  m_filestore_fsync_flushes_journal_data(g_conf->filestore_fsync_flushes_journal_data) {}
+  m_filestore_seek_data_hole(g_conf->filestore_seek_data_hole),
+  m_filestore_fsync_flushes_journal_data(g_conf->filestore_fsync_flushes_journal_data),
+  m_filestore_splice(false) {}
 
 int GenericFileStoreBackend::detect_features()
 {
@@ -110,26 +113,76 @@ int GenericFileStoreBackend::detect_features()
   }
 
   // fiemap an extent inside that
-  struct fiemap *fiemap;
-  int r = do_fiemap(fd, 2430421, 59284, &fiemap);
-  if (r < 0) {
-    dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
+  if (!m_filestore_fiemap) {
+    dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
     ioctl_fiemap = false;
   } else {
-    if (fiemap->fm_mapped_extents == 0) {
-      dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
+    struct fiemap *fiemap;
+    int r = do_fiemap(fd, 2430421, 59284, &fiemap);
+    if (r < 0) {
+      dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
       ioctl_fiemap = false;
     } else {
-      dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
-      ioctl_fiemap = true;
+      if (fiemap->fm_mapped_extents == 0) {
+        dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
+        ioctl_fiemap = false;
+      } else {
+        dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
+        ioctl_fiemap = true;
+      }
+      free(fiemap);
     }
-    free(fiemap);
   }
-  if (!m_filestore_fiemap) {
-    dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
-    ioctl_fiemap = false;
+
+  // SEEK_DATA/SEEK_HOLE detection
+  if (!m_filestore_seek_data_hole) {
+    dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
+    seek_data_hole = false;
+  } else {
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+    // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
+    // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
+    // Fall back to use fiemap.
+    off_t hole_pos;
+
+    hole_pos = lseek(fd, 0, SEEK_HOLE);
+    if (hole_pos < 0) {
+      if (errno == EINVAL) {
+        dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
+        seek_data_hole = false;
+      } else {
+        derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
+        VOID_TEMP_FAILURE_RETRY(::close(fd));
+        return -errno;
+      }
+    } else {
+      dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
+      seek_data_hole = true;
+    }
+#endif
   }
 
+  //splice detection
+#ifdef CEPH_HAVE_SPLICE
+  if (!m_filestore_splice) {
+    int pipefd[2];
+    loff_t off_in = 0;
+    int r;
+    if ((r = pipe(pipefd)) < 0)
+      dout(0) << "detect_features: splice  pipe met error " << cpp_strerror(errno) << dendl;
+    else {
+      lseek(fd, 0, SEEK_SET);
+      r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
+      if (!(r < 0 && errno == EINVAL)) {
+	m_filestore_splice = true;
+	dout(0) << "detect_features: splice is supported" << dendl;
+      } else
+	dout(0) << "detect_features: splice is NOT supported" << dendl;
+      close(pipefd[0]);
+      close(pipefd[1]);
+    }
+  }
+#endif
   ::unlink(fn);
   VOID_TEMP_FAILURE_RETRY(::close(fd));
 
@@ -228,11 +281,15 @@ int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct f
   fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
   fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
 
+#if defined(DARWIN) || defined(__FreeBSD__)
+  ret = -ENOTSUP;
+  goto done_err;
+#else
   if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
     ret = -errno;
     goto done_err;
   }
-
+#endif
   size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
 
   _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
@@ -248,12 +305,16 @@ int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct f
   fiemap->fm_extent_count = fiemap->fm_mapped_extents;
   fiemap->fm_mapped_extents = 0;
 
+#if defined(DARWIN) || defined(__FreeBSD__)
+  ret = -ENOTSUP;
+  goto done_err;
+#else
   if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
     ret = -errno;
     goto done_err;
   }
   *pfiemap = fiemap;
-
+#endif
   return 0;
 
 done_err:
diff --git a/src/os/GenericFileStoreBackend.h b/src/os/GenericFileStoreBackend.h
index fec56ce..f31e202 100644
--- a/src/os/GenericFileStoreBackend.h
+++ b/src/os/GenericFileStoreBackend.h
@@ -22,8 +22,11 @@ class SloppyCRCMap;
 class GenericFileStoreBackend : public FileStoreBackend {
 private:
   bool ioctl_fiemap;
+  bool seek_data_hole;
   bool m_filestore_fiemap;
+  bool m_filestore_seek_data_hole;
   bool m_filestore_fsync_flushes_journal_data;
+  bool m_filestore_splice;
 public:
   GenericFileStoreBackend(FileStore *fs);
   virtual ~GenericFileStoreBackend() {}
@@ -41,12 +44,13 @@ public:
   virtual int destroy_checkpoint(const string& name) { return -EOPNOTSUPP; }
   virtual int syncfs();
   virtual bool has_fiemap() { return ioctl_fiemap; }
+  virtual bool has_seek_data_hole() { return seek_data_hole; }
   virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap);
   virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
     return _copy_range(from, to, srcoff, len, dstoff);
   }
   virtual int set_alloc_hint(int fd, uint64_t hint) { return -EOPNOTSUPP; }
-
+  virtual bool has_splice() const { return m_filestore_splice; }
 private:
   int _crc_load_or_init(int fd, SloppyCRCMap *cm);
   int _crc_save(int fd, SloppyCRCMap *cm);
diff --git a/src/os/GenericObjectMap.cc b/src/os/GenericObjectMap.cc
index 67a15d2..62f052f 100644
--- a/src/os/GenericObjectMap.cc
+++ b/src/os/GenericObjectMap.cc
@@ -110,23 +110,36 @@ string GenericObjectMap::header_key(const coll_t &cid, const ghobject_t &oid)
   full_name.append(GHOBJECT_KEY_SEP_S);
 
   char buf[PATH_MAX];
-  char *t = buf;
-  char *end = t + sizeof(buf);
+  char *t;
+  char *end;
 
-  // make field ordering match with hobject_t compare operations
-  snprintf(t, end - t, "%.*X", (int)(sizeof(oid.hobj.get_hash())*2),
-           (uint32_t)oid.get_filestore_key_u32());
-  full_name += string(buf);
+  // make field ordering match with ghobject_t compare operations
+  t = buf;
+  end = t + sizeof(buf);
+  if (oid.shard_id == shard_id_t::NO_SHARD) {
+    // otherwise ff will sort *after* 0, not before.
+    full_name += "--";
+  } else {
+    t += snprintf(t, end - t, "%02x", (int)oid.shard_id);
+    full_name += string(buf);
+  }
   full_name.append(GHOBJECT_KEY_SEP_S);
 
-  append_escaped(oid.hobj.nspace, &full_name);
+  t = buf;
+  t += snprintf(t, end - t, "%016llx",
+		(long long)(oid.hobj.pool + 0x8000000000000000));
+  full_name += string(buf);
   full_name.append(GHOBJECT_KEY_SEP_S);
 
   t = buf;
-  t += snprintf(t, end - t, "%lld", (long long)oid.hobj.pool);
+  snprintf(t, end - t, "%.*X", (int)(sizeof(oid.hobj.get_hash())*2),
+           (uint32_t)oid.hobj.get_bitwise_key_u32());
   full_name += string(buf);
   full_name.append(GHOBJECT_KEY_SEP_S);
 
+  append_escaped(oid.hobj.nspace, &full_name);
+  full_name.append(GHOBJECT_KEY_SEP_S);
+
   append_escaped(oid.hobj.get_key(), &full_name);
   full_name.append(GHOBJECT_KEY_SEP_S);
 
@@ -144,20 +157,12 @@ string GenericObjectMap::header_key(const coll_t &cid, const ghobject_t &oid)
   full_name += string(buf);
 
   if (oid.generation != ghobject_t::NO_GEN) {
-    assert(oid.shard_id != shard_id_t::NO_SHARD);
     full_name.append(GHOBJECT_KEY_SEP_S);
 
     t = buf;
     end = t + sizeof(buf);
     t += snprintf(t, end - t, "%016llx", (long long unsigned)oid.generation);
     full_name += string(buf);
-
-    full_name.append(GHOBJECT_KEY_SEP_S);
-
-    t = buf;
-    end = t + sizeof(buf);
-    t += snprintf(t, end - t, "%x", (int)oid.shard_id);
-    full_name += string(buf);
   }
 
   full_name.append(1, GHOBJECT_KEY_ENDING);
@@ -174,7 +179,7 @@ bool GenericObjectMap::parse_header_key(const string &long_name,
   string ns;
   uint32_t hash;
   snapid_t snap;
-  uint64_t pool;
+  int64_t pool;
   gen_t generation = ghobject_t::NO_GEN;
   shard_id_t shard_id = shard_id_t::NO_SHARD;
 
@@ -189,25 +194,33 @@ bool GenericObjectMap::parse_header_key(const string &long_name,
   for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ;
   if (end == long_name.end())
     return false;
-  string hash_str(current, end);
-  sscanf(hash_str.c_str(), "%X", &hash);
+  string shardstring = string(current, end);
+  if (shardstring == "--")
+    shard_id = shard_id_t::NO_SHARD;
+  else
+    shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
 
   current = ++end;
   for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ;
   if (end == long_name.end())
     return false;
-  if (!append_unescaped(current, end, &ns))
+  string pstring(current, end);
+  pool = strtoull(pstring.c_str(), NULL, 16);
+  pool -= 0x8000000000000000;
+
+  current = ++end;
+  for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ;
+  if (end == long_name.end())
     return false;
+  string hash_str(current, end);
+  sscanf(hash_str.c_str(), "%X", &hash);
 
   current = ++end;
   for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ;
   if (end == long_name.end())
     return false;
-  string pstring(current, end);
-  if (pstring == "none")
-    pool = (uint64_t)-1;
-  else
-    pool = strtoull(pstring.c_str(), NULL, 16);
+  if (!append_unescaped(current, end, &ns))
+    return false;
 
   current = ++end;
   for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ;
@@ -224,10 +237,10 @@ bool GenericObjectMap::parse_header_key(const string &long_name,
     return false;
 
   current = ++end;
-  for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C && *end != GHOBJECT_KEY_ENDING; ++end) ;
+  for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C &&
+	  *end != GHOBJECT_KEY_ENDING; ++end) ;
   if (end == long_name.end())
     return false;
-
   string snap_str(current, end);
   if (snap_str == "head")
     snap = CEPH_NOSNAP;
@@ -237,34 +250,27 @@ bool GenericObjectMap::parse_header_key(const string &long_name,
     snap = strtoull(snap_str.c_str(), NULL, 16);
 
   // Optional generation/shard_id
-  string genstring, shardstring;
-  if (*end != GHOBJECT_KEY_ENDING) {
+  string genstring;
+  if (*end == GHOBJECT_KEY_SEP_C) {
     current = ++end;
-    for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ;
-    if (*end != GHOBJECT_KEY_SEP_C)
+    for ( ; end != long_name.end() && *end != GHOBJECT_KEY_ENDING; ++end) ;
+    if (end != long_name.end())
       return false;
     genstring = string(current, end);
-
     generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
-
-    current = ++end;
-    for ( ; end != long_name.end() && *end != GHOBJECT_KEY_ENDING; ++end) ;
-    if (end == long_name.end())
-      return false;
-    shardstring = string(current, end);
-
-    shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
   }
 
   if (out) {
-    (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns),
+    (*out) = ghobject_t(hobject_t(name, key, snap,
+				  hobject_t::_reverse_bits(hash),
+				  (int64_t)pool, ns),
                         generation, shard_id);
-    // restore reversed hash. see calculate_key
-    out->hobj.set_hash(out->get_filestore_key());
   }
 
-  if (out_coll)
-    *out_coll = coll_t(coll);
+  if (out_coll) {
+    bool valid = out_coll->parse(coll);
+    assert(valid);
+  }
 
   return true;
 }
@@ -527,6 +533,7 @@ int GenericObjectMap::clear(const Header header,
 
 int GenericObjectMap::rm_keys(const Header header,
                               const string &prefix,
+                              const set<string> &buffered_keys,
                               const set<string> &to_clear,
                               KeyValueDB::Transaction t)
 {
@@ -556,7 +563,7 @@ int GenericObjectMap::rm_keys(const Header header,
         begin = new_complete.rbegin()->first;
       }
       while (iter->valid() && copied < 20) {
-        if (!to_clear.count(iter->key()))
+        if (!to_clear.count(iter->key()) && !buffered_keys.count(iter->key()))
           to_write[iter->key()].append(iter->value());
         if (i != to_clear.end() && *i <= iter->key()) {
           ++i;
@@ -621,7 +628,7 @@ int GenericObjectMap::get_keys(const coll_t &cid, const ghobject_t &oid,
     return -ENOENT;
 
   ObjectMap::ObjectMapIterator iter = _get_iterator(header, prefix);
-  for (; iter->valid(); iter->next()) {
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
     if (iter->status())
       return iter->status();
     keys->insert(iter->key());
@@ -685,6 +692,7 @@ void GenericObjectMap::clone(const Header parent, const coll_t &cid,
   // to find parent header. So it will let lookup_parent fail when "clone" and
   // "rm_keys" in one transaction. Here have to sync transaction to make
   // visiable for lookup_parent
+  // FIXME: Clear transaction operations here
   int r = submit_transaction_sync(t);
   assert(r == 0);
 }
@@ -734,15 +742,6 @@ int GenericObjectMap::init(bool do_upgrade)
   return 0;
 }
 
-int GenericObjectMap::sync(const Header header, KeyValueDB::Transaction t)
-{
-  write_state(t);
-  if (header) {
-    set_header(header->cid, header->oid, *header, t);
-  }
-  return 0;
-}
-
 bool GenericObjectMap::check(std::ostream &out)
 {
   bool retval = true;
@@ -1049,16 +1048,15 @@ void GenericObjectMap::set_header(const coll_t &cid, const ghobject_t &oid,
   t->set(GHOBJECT_TO_SEQ_PREFIX, to_set);
 }
 
-int GenericObjectMap::list_objects(const coll_t &cid, ghobject_t start, int max,
+int GenericObjectMap::list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max,
                                    vector<ghobject_t> *out, ghobject_t *next)
 {
   // FIXME
   Mutex::Locker l(header_lock);
-
   if (start.is_max())
       return 0;
 
-  if (start.hobj.is_min()) {
+  if (start.is_min()) {
     vector<ghobject_t> oids;
 
     KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX);
@@ -1102,7 +1100,14 @@ int GenericObjectMap::list_objects(const coll_t &cid, ghobject_t start, int max,
       break;
     }
 
-    assert(start <= header.oid);
+    if (cmp_bitwise(header.oid, end) >= 0) {
+      if (next)
+	*next = ghobject_t::get_max();
+      break;
+    }
+
+    assert(cmp_bitwise(start, header.oid) <= 0);
+    assert(cmp_bitwise(header.oid, end) < 0);
 
 
     size++;
diff --git a/src/os/GenericObjectMap.h b/src/os/GenericObjectMap.h
index 864a06b..ecf2822 100644
--- a/src/os/GenericObjectMap.h
+++ b/src/os/GenericObjectMap.h
@@ -117,7 +117,7 @@ class GenericObjectMap {
   bool check(std::ostream &out);
 
   /// Util, list all objects, there must be no other concurrent access
-  int list_objects(const coll_t &cid, ghobject_t start, int max,
+  int list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max,
                    vector<ghobject_t> *objs, ///< [out] objects
                    ghobject_t *next);
 
@@ -240,6 +240,7 @@ class GenericObjectMap {
   int rm_keys(
     const Header header,
     const string &prefix,
+    const set<string> &buffered_keys,
     const set<string> &to_clear,
     KeyValueDB::Transaction t
     );
@@ -260,9 +261,6 @@ class GenericObjectMap {
     KeyValueDB::Transaction t
     );
 
-  /// Ensure that all previous operations are durable
-  int sync(const Header header, KeyValueDB::Transaction t);
-
   static const string GLOBAL_STATE_KEY;
   static const string PARENT_KEY;
 
diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc
index ead95b4..2c69d81 100644
--- a/src/os/HashIndex.cc
+++ b/src/os/HashIndex.cc
@@ -25,6 +25,69 @@
 const string HashIndex::SUBDIR_ATTR = "contents";
 const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
 
+/// hex digit to integer value
+int hex_to_int(char c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+  assert(0);
+}
+
+/// int value to hex digit
+char int_to_hex(int v)
+{
+  assert(v < 16);
+  if (v < 10)
+    return '0' + v;
+  return 'A' + v - 10;
+}
+
+/// reverse bits in a nibble (0..15)
+int reverse_nibble_bits(int in)
+{
+  assert(in < 16);
+  return
+    ((in & 8) >> 3) |
+    ((in & 4) >> 1) |
+    ((in & 2) << 1) |
+    ((in & 1) << 3);
+}
+
+/// reverse nibble bits in a hex digit
+char reverse_hexdigit_bits(char c)
+{
+  return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
+}
+
+/// reverse nibble bits in a hex string
+string reverse_hexdigit_bits_string(string s)
+{
+  for (unsigned i=0; i<s.size(); ++i)
+    s[i] = reverse_hexdigit_bits(s[i]);
+  return s;
+}
+
+/// compare hex digit (as length 1 string) bitwise
+bool cmp_hexdigit_bitwise(const string& l, const string& r)
+{
+  assert(l.length() == 1 && r.length() == 1);
+  int lv = hex_to_int(l[0]);
+  int rv = hex_to_int(r[0]);
+  assert(lv < 16);
+  assert(rv < 16);
+  return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
+}
+
+/// compare hex digit string bitwise
+bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
+{
+  string ll = reverse_hexdigit_bits_string(l);
+  string rr = reverse_hexdigit_bits_string(r);
+  return ll < rr;
+}
+
 int HashIndex::cleanup() {
   bufferlist bl;
   int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
@@ -71,7 +134,7 @@ int HashIndex::reset_attr(
   if (!exists)
     return 0;
   map<string, ghobject_t> objects;
-  set<string> subdirs;
+  vector<string> subdirs;
   r = list_objects(path, 0, 0, &objects);
   if (r < 0)
     return r;
@@ -98,7 +161,7 @@ int HashIndex::col_split_level(
    * bits of the hash represented by the subdir path with inbits, match passed
    * in.
    */
-  set<string> subdirs;
+  vector<string> subdirs;
   int r = from.list_subdirs(path, &subdirs);
   if (r < 0)
     return r;
@@ -108,7 +171,7 @@ int HashIndex::col_split_level(
     return r;
 
   set<string> to_move;
-  for (set<string>::iterator i = subdirs.begin();
+  for (vector<string>::iterator i = subdirs.begin();
        i != subdirs.end();
        ++i) {
     uint32_t bits = 0;
@@ -320,15 +383,10 @@ int HashIndex::_lookup(const ghobject_t &oid,
   return get_mangled_name(*path, oid, mangled_name, exists_out);
 }
 
-int HashIndex::_collection_list(vector<ghobject_t> *ls) {
-  vector<string> path;
-  return list_by_hash(path, 0, 0, 0, 0, ls);
-}
-
 int HashIndex::_collection_list_partial(const ghobject_t &start,
-					int min_count,
+					const ghobject_t &end,
+					bool sort_bitwise,
 					int max_count,
-					snapid_t seq,
 					vector<ghobject_t> *ls,
 					ghobject_t *next) {
   vector<string> path;
@@ -336,8 +394,8 @@ int HashIndex::_collection_list_partial(const ghobject_t &start,
   if (!next)
     next = &_next;
   *next = start;
-  dout(20) << "_collection_list_partial " << start << " " << min_count << "-" << max_count << " ls.size " << ls->size() << dendl;
-  return list_by_hash(path, min_count, max_count, seq, next, ls);
+  dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
+  return list_by_hash(path, end, sort_bitwise, max_count, next, ls);
 }
 
 int HashIndex::prep_delete() {
@@ -382,7 +440,7 @@ int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
     return 0;
 
   spg_t spgid;
-  if (!c.is_pg_prefix(spgid))
+  if (!c.is_pg_prefix(&spgid))
     return -EINVAL;
   const ps_t ps = spgid.pgid.ps();
 
@@ -418,6 +476,7 @@ int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
   // the below logic is inspired by rados.h#ceph_stable_mod,
   // it basically determines how many sub-folders should we
   // create for splitting
+  assert(pg_num_bits > 0); // otherwise BAD_SHIFT
   if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
     ++split_bits;
   }
@@ -430,6 +489,7 @@ int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
     leavies = leavies >> 4;
   }
   for (uint32_t i = 0; i < subs; ++i) {
+    assert(split_bits <= 4); // otherwise BAD_SHIFT
     int v = tmp_id | (i << ((4 - split_bits) % 4));
     paths.push_back(to_hex(v));
     ret = create_path(paths);
@@ -446,7 +506,7 @@ int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
 int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
 {
   // Get the number of sub directories for the current path
-  set<string> subdirs;
+  vector<string> subdirs;
   int ret = list_subdirs(path, &subdirs);
   if (ret < 0)
     return ret;
@@ -461,7 +521,7 @@ int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
     return ret;
 
   // Do the same for subdirs
-  set<string>::const_iterator iter;
+  vector<string>::const_iterator iter;
   for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
     path.push_back(*iter);
     ret = init_split_folder(path, hash_level + 1);
@@ -490,7 +550,7 @@ int HashIndex::recursive_create_path(vector<string>& path, int level)
 }
 
 int HashIndex::recursive_remove(const vector<string> &path) {
-  set<string> subdirs;
+  vector<string> subdirs;
   int r = list_subdirs(path, &subdirs);
   if (r < 0)
     return r;
@@ -501,7 +561,7 @@ int HashIndex::recursive_remove(const vector<string> &path) {
   if (!objects.empty())
     return -ENOTEMPTY;
   vector<string> subdir(path);
-  for (set<string>::iterator i = subdirs.begin();
+  for (vector<string>::iterator i = subdirs.begin();
        i != subdirs.end();
        ++i) {
     subdir.push_back(*i);
@@ -632,10 +692,12 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
   r = list_objects(path, 0, 0, &objects);
   if (r < 0)
     return r;
-  set<string> subdirs;
-  r = list_subdirs(path, &subdirs);
+  vector<string> subdirs_vec;
+  r = list_subdirs(path, &subdirs_vec);
   if (r < 0)
     return r;
+  set<string> subdirs;
+  subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
   map<string, map<string, ghobject_t> > mapped;
   map<string, ghobject_t> moved;
   int num_moved = 0;
@@ -727,7 +789,7 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
 void HashIndex::get_path_components(const ghobject_t &oid,
 				    vector<string> *path) {
   char buf[MAX_HASH_LEVEL + 1];
-  snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_filestore_key());
+  snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
 
   // Path components are the hex characters of oid.hobj.hash, least
   // significant first
@@ -764,96 +826,232 @@ uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
   return hash;
 }
 
-int HashIndex::get_path_contents_by_hash(const vector<string> &path,
-					 const string *lower_bound,
-					 const ghobject_t *next_object,
-					 const snapid_t *seq,
-					 set<string> *hash_prefixes,
-					 set<pair<string, ghobject_t> > *objects) {
-  set<string> subdirs;
+int HashIndex::get_path_contents_by_hash_bitwise(
+  const vector<string> &path,
+  const ghobject_t *next_object,
+  set<string, CmpHexdigitStringBitwise> *hash_prefixes,
+  set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
+{
   map<string, ghobject_t> rev_objects;
   int r;
+  r = list_objects(path, 0, 0, &rev_objects);
+  if (r < 0)
+    return r;
+  // bitwise sort
+  for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+       i != rev_objects.end();
+       ++i) {
+    if (next_object && cmp_bitwise(i->second, *next_object) < 0)
+      continue;
+    string hash_prefix = get_path_str(i->second);
+    hash_prefixes->insert(hash_prefix);
+    objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+  }
+  vector<string> subdirs;
+  r = list_subdirs(path, &subdirs);
+  if (r < 0)
+    return r;
+
+  // sort subdirs bitwise (by reversing hex digit nibbles)
+  std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
+
+  // Local to this function, we will convert the prefix strings
+  // (previously simply the reversed hex digits) to also have each
+  // digit's nibbles reversed.  This will make the strings sort
+  // bitwise.
   string cur_prefix;
   for (vector<string>::const_iterator i = path.begin();
        i != path.end();
        ++i) {
-    cur_prefix.append(*i);
+    cur_prefix.append(reverse_hexdigit_bits_string(*i));
+  }
+  string next_object_string;
+  if (next_object)
+    next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
+  for (vector<string>::iterator i = subdirs.begin();
+       i != subdirs.end();
+       ++i) {
+    string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
+    if (next_object) {
+      if (next_object->is_max())
+	continue;
+      if (candidate < next_object_string.substr(0, candidate.size()))
+	continue;
+    }
+    // re-reverse the hex digit nibbles for the caller
+    hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
   }
+  return 0;
+}
+
+int HashIndex::get_path_contents_by_hash_nibblewise(
+  const vector<string> &path,
+  const ghobject_t *next_object,
+  set<string> *hash_prefixes,
+  set<pair<string, ghobject_t>, CmpPairNibblewise > *objects)
+{
+  map<string, ghobject_t> rev_objects;
+  int r;
   r = list_objects(path, 0, 0, &rev_objects);
   if (r < 0)
     return r;
+
   for (map<string, ghobject_t>::iterator i = rev_objects.begin();
        i != rev_objects.end();
        ++i) {
     string hash_prefix = get_path_str(i->second);
-    if (lower_bound && hash_prefix < *lower_bound)
-      continue;
-    if (next_object && i->second < *next_object)
-      continue;
-    if (seq && i->second.hobj.snap < *seq)
+    if (next_object && cmp_nibblewise(i->second, *next_object) < 0)
       continue;
     hash_prefixes->insert(hash_prefix);
     objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
   }
+
+  vector<string> subdirs;
   r = list_subdirs(path, &subdirs);
   if (r < 0)
     return r;
-  for (set<string>::iterator i = subdirs.begin();
+
+  // sort nibblewise (string sort of (reversed) hex digits)
+  std::sort(subdirs.begin(), subdirs.end());
+
+  string cur_prefix;
+  for (vector<string>::const_iterator i = path.begin();
+       i != path.end();
+       ++i) {
+    cur_prefix.append(*i);
+  }
+  string next_object_string;
+  if (next_object)
+    next_object_string = get_path_str(*next_object);
+
+  for (vector<string>::iterator i = subdirs.begin();
        i != subdirs.end();
        ++i) {
     string candidate = cur_prefix + *i;
-    if (lower_bound && candidate < lower_bound->substr(0, candidate.size()))
-      continue;
-    if (next_object &&
-        (next_object->is_max() ||
-	 candidate < get_path_str(*next_object).substr(0, candidate.size())))
-      continue;
+    if (next_object) {
+      if (next_object->is_max())
+	continue;
+      if (candidate < next_object_string.substr(0, candidate.size()))
+	continue;
+    }
     hash_prefixes->insert(cur_prefix + *i);
   }
   return 0;
 }
 
 int HashIndex::list_by_hash(const vector<string> &path,
-			    int min_count,
+			    const ghobject_t &end,
+			    bool sort_bitwise,
 			    int max_count,
-			    snapid_t seq,
 			    ghobject_t *next,
-			    vector<ghobject_t> *out) {
+			    vector<ghobject_t> *out)
+{
   assert(out);
+  if (sort_bitwise)
+    return list_by_hash_bitwise(path, end, max_count, next, out);
+  else
+    return list_by_hash_nibblewise(path, end, max_count, next, out);
+}
+
+int HashIndex::list_by_hash_bitwise(
+  const vector<string> &path,
+  const ghobject_t& end,
+  int max_count,
+  ghobject_t *next,
+  vector<ghobject_t> *out)
+{
   vector<string> next_path = path;
   next_path.push_back("");
-  set<string> hash_prefixes;
-  set<pair<string, ghobject_t> > objects;
-  int r = get_path_contents_by_hash(path,
-				    NULL,
-				    next,
-				    &seq,
-				    &hash_prefixes,
-				    &objects);
+  set<string, CmpHexdigitStringBitwise> hash_prefixes;
+  set<pair<string, ghobject_t>, CmpPairBitwise> objects;
+  int r = get_path_contents_by_hash_bitwise(path,
+					    next,
+					    &hash_prefixes,
+					    &objects);
   if (r < 0)
     return r;
-  dout(20) << " prefixes " << hash_prefixes << dendl;
-  for (set<string>::iterator i = hash_prefixes.begin();
+  for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
        i != hash_prefixes.end();
        ++i) {
-    set<pair<string, ghobject_t> >::iterator j = objects.lower_bound(
+    dout(20) << __func__ << " prefix " << *i << dendl;
+    set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
       make_pair(*i, ghobject_t()));
     if (j == objects.end() || j->first != *i) {
-      if (min_count > 0 && out->size() > (unsigned)min_count) {
+      *(next_path.rbegin()) = *(i->rbegin());
+      ghobject_t next_recurse;
+      if (next)
+	next_recurse = *next;
+      r = list_by_hash_bitwise(next_path,
+			       end,
+			       max_count,
+			       &next_recurse,
+			       out);
+
+      if (r < 0)
+	return r;
+      if (!next_recurse.is_max()) {
 	if (next)
-	  *next = ghobject_t(hobject_t("", "", CEPH_NOSNAP, hash_prefix_to_hash(*i), -1, ""));
+	  *next = next_recurse;
 	return 0;
       }
+    } else {
+      while (j != objects.end() && j->first == *i) {
+	if (max_count > 0 && out->size() == (unsigned)max_count) {
+	  if (next)
+	    *next = j->second;
+	  return 0;
+	}
+	if (cmp_bitwise(j->second, end) >= 0) {
+	  if (next)
+	    *next = ghobject_t::get_max();
+	  return 0;
+	}
+	if (!next || cmp_bitwise(j->second, *next) >= 0) {
+	  dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
+	  out->push_back(j->second);
+	}
+	++j;
+      }
+    }
+  }
+  if (next)
+    *next = ghobject_t::get_max();
+  return 0;
+}
+
+int HashIndex::list_by_hash_nibblewise(
+  const vector<string> &path,
+  const ghobject_t& end,
+  int max_count,
+  ghobject_t *next,
+  vector<ghobject_t> *out)
+{
+  vector<string> next_path = path;
+  next_path.push_back("");
+  set<string> hash_prefixes;
+  set<pair<string, ghobject_t>, CmpPairNibblewise> objects;
+  int r = get_path_contents_by_hash_nibblewise(path,
+					       next,
+					       &hash_prefixes,
+					       &objects);
+  if (r < 0)
+    return r;
+  for (set<string>::iterator i = hash_prefixes.begin();
+       i != hash_prefixes.end();
+       ++i) {
+    dout(20) << __func__ << " prefix " << *i << dendl;
+    set<pair<string, ghobject_t>, CmpPairNibblewise >::iterator j =
+      objects.lower_bound(make_pair(*i, ghobject_t()));
+    if (j == objects.end() || j->first != *i) {
       *(next_path.rbegin()) = *(i->rbegin());
       ghobject_t next_recurse;
       if (next)
 	next_recurse = *next;
-      r = list_by_hash(next_path,
-		       min_count,
-		       max_count,
-		       seq,
-		       &next_recurse,
-		       out);
+      r = list_by_hash_nibblewise(next_path,
+				  end,
+				  max_count,
+				  &next_recurse,
+				  out);
 
       if (r < 0)
 	return r;
@@ -869,7 +1067,12 @@ int HashIndex::list_by_hash(const vector<string> &path,
 	    *next = j->second;
 	  return 0;
 	}
-	if (!next || j->second >= *next) {
+	if (cmp_nibblewise(j->second, end) >= 0) {
+	  if (next)
+	    *next = ghobject_t::get_max();
+	  return 0;
+	}
+	if (!next || cmp_nibblewise(j->second, *next) >= 0) {
 	  out->push_back(j->second);
 	}
 	++j;
@@ -877,6 +1080,6 @@ int HashIndex::list_by_hash(const vector<string> &path,
     }
   }
   if (next)
-    *next = ghobject_t(hobject_t::get_max());
+    *next = ghobject_t::get_max();
   return 0;
 }
diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h
index dad8ce3..cacdbc8 100644
--- a/src/os/HashIndex.h
+++ b/src/os/HashIndex.h
@@ -19,6 +19,7 @@
 #include "include/encoding.h"
 #include "LFNIndex.h"
 
+extern string reverse_hexdigit_bits_string(string l);
 
 /**
  * Implements collection prehashing.
@@ -178,9 +179,6 @@ protected:
     string *mangled_name,
     int *exists
     );
-  int _collection_list(
-    vector<ghobject_t> *ls
-    );
 
   /**
    * Pre-hash the collection to create folders according to the expected number
@@ -193,9 +191,9 @@ protected:
 
   int _collection_list_partial(
     const ghobject_t &start,
-    int min_count,
+    const ghobject_t &end,
+    bool sort_bitwise,
     int max_count,
-    snapid_t seq,
     vector<ghobject_t> *ls,
     ghobject_t *next
     );
@@ -353,22 +351,75 @@ private:
     return str;
   }
 
+  struct CmpPairNibblewise {
+    bool operator()(const pair<string, ghobject_t>& l,
+		    const pair<string, ghobject_t>& r)
+    {
+      if (l.first < r.first)
+	return true;
+      if (l.first > r.first)
+	return false;
+      if (cmp_nibblewise(l.second, r.second) < 0)
+	return true;
+      return false;
+    }
+  };
+
+  struct CmpPairBitwise {
+    bool operator()(const pair<string, ghobject_t>& l,
+		    const pair<string, ghobject_t>& r)
+    {
+      if (l.first < r.first)
+	return true;
+      if (l.first > r.first)
+	return false;
+      if (cmp_bitwise(l.second, r.second) < 0)
+	return true;
+      return false;
+    }
+  };
+
+  struct CmpHexdigitStringBitwise {
+    bool operator()(const string& l, const string& r) {
+      return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
+    }
+  };
+
   /// Get path contents by hash
-  int get_path_contents_by_hash(
-    const vector<string> &path,            /// [in] Path to list
-    const string *lower_bound,             /// [in] list > *lower_bound
+  int get_path_contents_by_hash_bitwise(
+    const vector<string> &path,             /// [in] Path to list
     const ghobject_t *next_object,          /// [in] list > *next_object
-    const snapid_t *seq,                   /// [in] list >= *seq
-    set<string> *hash_prefixes,            /// [out] prefixes in dir
-    set<pair<string, ghobject_t> > *objects /// [out] objects
+    set<string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir
+    set<pair<string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects
+    );
+  int get_path_contents_by_hash_nibblewise(
+    const vector<string> &path,             /// [in] Path to list
+    const ghobject_t *next_object,          /// [in] list > *next_object
+    set<string> *hash_prefixes,             /// [out] prefixes in dir
+    set<pair<string, ghobject_t>, CmpPairNibblewise> *objects /// [out] objects
     );
 
   /// List objects in collection in ghobject_t order
   int list_by_hash(
     const vector<string> &path, /// [in] Path to list
-    int min_count,              /// [in] List at least min_count
+    const ghobject_t &end,      /// [in] List only objects < end
+    bool sort_bitwise,          /// [in] sort bitwise
+    int max_count,              /// [in] List at most max_count
+    ghobject_t *next,            /// [in,out] List objects >= *next
+    vector<ghobject_t> *out      /// [out] Listed objects
+    ); ///< @return Error Code, 0 on success
+  /// List objects in collection in ghobject_t order
+  int list_by_hash_bitwise(
+    const vector<string> &path, /// [in] Path to list
+    const ghobject_t &end,      /// [in] List only objects < end
+    int max_count,              /// [in] List at most max_count
+    ghobject_t *next,            /// [in,out] List objects >= *next
+    vector<ghobject_t> *out      /// [out] Listed objects
+    ); ///< @return Error Code, 0 on success
+  int list_by_hash_nibblewise(
+    const vector<string> &path, /// [in] Path to list
+    const ghobject_t &end,      /// [in] List only objects < end
     int max_count,              /// [in] List at most max_count
-    snapid_t seq,               /// [in] list only objects where snap >= seq
     ghobject_t *next,            /// [in,out] List objects >= *next
     vector<ghobject_t> *out      /// [out] Listed objects
     ); ///< @return Error Code, 0 on success
diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc
index 7f999a1..6a9f040 100644
--- a/src/os/IndexManager.cc
+++ b/src/os/IndexManager.cc
@@ -28,7 +28,6 @@
 #include "include/buffer.h"
 
 #include "IndexManager.h"
-#include "FlatIndex.h"
 #include "HashIndex.h"
 #include "CollectionIndex.h"
 
@@ -95,10 +94,7 @@ int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **inde
       return r;
 
     switch (version) {
-    case CollectionIndex::FLAT_INDEX_TAG: {
-      *index = new FlatIndex(c, path);
-      return 0;
-    }
+    case CollectionIndex::FLAT_INDEX_TAG:
     case CollectionIndex::HASH_INDEX_TAG: // fall through
     case CollectionIndex::HASH_INDEX_TAG_2: // fall through
     case CollectionIndex::HOBJECT_WITH_POOL: {
diff --git a/src/os/IndexManager.h b/src/os/IndexManager.h
index cf9ba89..b167e7d 100644
--- a/src/os/IndexManager.h
+++ b/src/os/IndexManager.h
@@ -24,7 +24,6 @@
 
 #include "CollectionIndex.h"
 #include "HashIndex.h"
-#include "FlatIndex.h"
 
 
 /// Public type for Index
@@ -77,7 +76,7 @@ public:
    * Reserve and return index for c
    *
    * @param [in] c Collection for which to get index
-   * @param [in] path Path to collection
+   * @param [in] baseDir base directory of collections
    * @param [out] index Index for c
    * @return error code
    */
diff --git a/src/os/JournalingObjectStore.cc b/src/os/JournalingObjectStore.cc
index 518d54e..35cf74a 100644
--- a/src/os/JournalingObjectStore.cc
+++ b/src/os/JournalingObjectStore.cc
@@ -251,27 +251,37 @@ void JournalingObjectStore::ApplyManager::commit_finish()
 }
 
 void JournalingObjectStore::_op_journal_transactions(
-  list<ObjectStore::Transaction*>& tls, uint64_t op,
+  bufferlist& tbl, int data_align,  uint64_t op,
   Context *onjournal, TrackedOpRef osd_op)
 {
-  dout(10) << "op_journal_transactions " << op << " " << tls << dendl;
+  if (osd_op.get())
+    dout(10) << "op_journal_transactions " << op << " reqid_t "
+             << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl;
+  else
+    dout(10) << "op_journal_transactions " << op  << dendl;
 
   if (journal && journal->is_writeable()) {
-    bufferlist tbl;
-    unsigned data_len = 0;
-    int data_align = -1; // -1 indicates that we don't care about the alignment
-    for (list<ObjectStore::Transaction*>::iterator p = tls.begin();
-	 p != tls.end(); ++p) {
-      ObjectStore::Transaction *t = *p;
-      if (t->get_data_length() > data_len &&
-	(int)t->get_data_length() >= g_conf->journal_align_min_size) {
-	data_len = t->get_data_length();
-	data_align = (t->get_data_alignment() - tbl.length()) & ~CEPH_PAGE_MASK;
-      }
-      ::encode(*t, tbl);
-    }
     journal->submit_entry(op, tbl, data_align, onjournal, osd_op);
   } else if (onjournal) {
     apply_manager.add_waiter(op, onjournal);
   }
 }
+
+int JournalingObjectStore::_op_journal_transactions_prepare(
+  list<ObjectStore::Transaction*>& tls, bufferlist& tbl)
+{
+  dout(10) << "_op_journal_transactions_prepare " << tls << dendl;
+  unsigned data_len = 0;
+  int data_align = -1; // -1 indicates that we don't care about the alignment
+  for (list<ObjectStore::Transaction*>::iterator p = tls.begin();
+      p != tls.end(); ++p) {
+    ObjectStore::Transaction *t = *p;
+    if (t->get_data_length() > data_len &&
+     (int)t->get_data_length() >= g_conf->journal_align_min_size) {
+     data_len = t->get_data_length();
+     data_align = (t->get_data_alignment() - tbl.length()) & ~CEPH_PAGE_MASK;
+    }
+    ::encode(*t, tbl);
+  }
+  return data_align;
+}
diff --git a/src/os/JournalingObjectStore.h b/src/os/JournalingObjectStore.h
index 6a09c61..fbfa20c 100644
--- a/src/os/JournalingObjectStore.h
+++ b/src/os/JournalingObjectStore.h
@@ -114,7 +114,9 @@ protected:
   void journal_write_close();
   int journal_replay(uint64_t fs_op_seq);
 
-  void _op_journal_transactions(list<ObjectStore::Transaction*>& tls, uint64_t op,
+  int _op_journal_transactions_prepare(
+    list<ObjectStore::Transaction*>& tls, bufferlist& tbl);
+  void _op_journal_transactions(bufferlist& tls, int data_align, uint64_t op,
 				Context *onjournal, TrackedOpRef osd_op);
 
   virtual int do_transactions(list<ObjectStore::Transaction*>& tls, uint64_t op_seq) = 0;
diff --git a/src/os/KeyValueDB.cc b/src/os/KeyValueDB.cc
index f7f1e30..65ce487 100644
--- a/src/os/KeyValueDB.cc
+++ b/src/os/KeyValueDB.cc
@@ -17,12 +17,14 @@ KeyValueDB *KeyValueDB::create(CephContext *cct, const string& type,
     return new LevelDBStore(cct, dir);
   }
 #ifdef HAVE_KINETIC
-  if (type == "kinetic") {
+  if (type == "kinetic" &&
+      cct->check_experimental_feature_enabled("kinetic")) {
     return new KineticStore(cct);
   }
 #endif
 #ifdef HAVE_LIBROCKSDB
-  if (type == "rocksdb") {
+  if (type == "rocksdb" &&
+      cct->check_experimental_feature_enabled("rocksdb")) {
     return new RocksDBStore(cct, dir);
   }
 #endif
@@ -31,7 +33,7 @@ KeyValueDB *KeyValueDB::create(CephContext *cct, const string& type,
 
 int KeyValueDB::test_init(const string& type, const string& dir)
 {
-  if (type == "leveldb"){
+  if (type == "leveldb") {
     return LevelDBStore::_test_init(dir);
   }
 #ifdef HAVE_KINETIC
@@ -40,7 +42,7 @@ int KeyValueDB::test_init(const string& type, const string& dir)
   }
 #endif
 #ifdef HAVE_LIBROCKSDB
-  if (type == "rocksdb"){
+  if (type == "rocksdb") {
     return RocksDBStore::_test_init(dir);
   }
 #endif
diff --git a/src/os/KeyValueDB.h b/src/os/KeyValueDB.h
index 0ac512f..e82151d 100644
--- a/src/os/KeyValueDB.h
+++ b/src/os/KeyValueDB.h
@@ -70,7 +70,7 @@ public:
 
   /// test whether we can successfully initialize; may have side effects (e.g., create)
   static int test_init(const string& type, const string& dir);
-  virtual int init() = 0;
+  virtual int init(string option_str="") = 0;
   virtual int open(ostream &out) = 0;
   virtual int create_and_open(ostream &out) = 0;
 
@@ -86,6 +86,21 @@ public:
     const std::set<string> &key,      ///< [in] Key to retrieve
     std::map<string, bufferlist> *out ///< [out] Key value retrieved
     ) = 0;
+  virtual int get(const string &prefix, ///< [in] prefix
+		  const string &key,    ///< [in] key
+		  bufferlist *value) {  ///< [out] value
+    set<string> ks;
+    ks.insert(key);
+    map<string,bufferlist> om;
+    int r = get(prefix, ks, &om);
+    if (om.find(key) != om.end()) {
+      *value = om[key];
+    } else {
+      *value = bufferlist();
+      r = -ENOENT;
+    }
+    return r;
+  }
 
   class WholeSpaceIteratorImpl {
   public:
@@ -130,7 +145,7 @@ public:
       if (!generic_iter->valid())
 	return false;
       pair<string,string> raw_key = generic_iter->raw_key();
-      return (raw_key.first == prefix);
+      return (raw_key.first.compare(0, prefix.length(), prefix) == 0);
     }
     int next() {
       if (valid())
@@ -145,6 +160,9 @@ public:
     string key() {
       return generic_iter->key();
     }
+    pair<string, string> raw_key() {
+      return generic_iter->raw_key();
+    }
     bufferlist value() {
       return generic_iter->value();
     }
@@ -176,6 +194,9 @@ public:
   }
 
   virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) = 0;
+  virtual int get_statfs(struct statfs *buf) {
+    return -EOPNOTSUPP;
+  }
 
   virtual ~KeyValueDB() {}
 
diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc
index 1881f2d..1a633c6 100644
--- a/src/os/KeyValueStore.cc
+++ b/src/os/KeyValueStore.cc
@@ -98,6 +98,7 @@ int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header,
     ::encode(*strip_header, strip_header->header->data);
 
     set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t);
+    strip_header->updated = false;
   }
   return 0;
 }
@@ -268,9 +269,8 @@ int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header,
                                          set<string> *keys)
 {
   ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
-  for (; iter->valid(); iter->next()) {
-    if (iter->status())
-      return iter->status();
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
+    assert(!iter->status());
     keys->insert(iter->key());
   }
   return 0;
@@ -281,8 +281,7 @@ int StripObjectMap::get_with_header(const StripObjectHeaderRef header,
 {
   ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
   for (iter->seek_to_first(); iter->valid(); iter->next()) {
-    if (iter->status())
-      return iter->status();
+    assert(!iter->status());
     out->insert(make_pair(iter->key(), iter->value()));
   }
 
@@ -375,9 +374,10 @@ void KeyValueStore::BufferTransaction::set_buffer_keys(
   store->backend->set_keys(strip_header->header, prefix, values, t);
 
   uniq_id uid = make_pair(strip_header->cid, strip_header->oid);
+  map<pair<string, string>, bufferlist> &uid_buffers = buffers[uid];
   for (map<string, bufferlist>::iterator iter = values.begin();
        iter != values.end(); ++iter) {
-    buffers[uid][make_pair(prefix, iter->first)].swap(iter->second);
+    uid_buffers[make_pair(prefix, iter->first)].swap(iter->second);
   }
 }
 
@@ -387,13 +387,22 @@ int KeyValueStore::BufferTransaction::remove_buffer_keys(
 {
   uniq_id uid = make_pair(strip_header->cid, strip_header->oid);
   map< uniq_id, map<pair<string, string>, bufferlist> >::iterator obj_it = buffers.find(uid);
+  set<string> buffered_keys;
   if ( obj_it != buffers.end() ) {
+    // TODO: Avoid use empty bufferlist to indicate the key is removed
     for (set<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
       obj_it->second[make_pair(prefix, *iter)] = bufferlist();
     }
+    // TODO: Avoid collect all buffered keys when remove keys
+    if (strip_header->header->parent) {
+      for (map<pair<string, string>, bufferlist>::iterator iter = obj_it->second.begin();
+           iter != obj_it->second.end(); ++iter) {
+        buffered_keys.insert(iter->first.second);
+      }
+    }
   }
 
-  return store->backend->rm_keys(strip_header->header, prefix, keys, t);
+  return store->backend->rm_keys(strip_header->header, prefix, buffered_keys, keys, t);
 }
 
 void KeyValueStore::BufferTransaction::clear_buffer_keys(
@@ -462,11 +471,13 @@ int KeyValueStore::BufferTransaction::submit_transaction()
     if (header->deleted)
       continue;
 
-    r = store->backend->save_strip_header(header, t);
+    if (header->updated) {
+      r = store->backend->save_strip_header(header, t);
 
-    if (r < 0) {
-      dout(10) << __func__ << " save strip header failed " << dendl;
-      goto out;
+      if (r < 0) {
+        dout(10) << __func__ << " save strip header failed " << dendl;
+        goto out;
+      }
     }
   }
 
@@ -521,10 +532,10 @@ KeyValueStore::KeyValueStore(const std::string &base,
   fsid_fd(-1), current_fd(-1),
   backend(NULL),
   ondisk_finisher(g_ceph_context),
+  collections_lock("KeyValueStore::collections_lock"),
   lock("KeyValueStore::lock"),
-  default_osr("default"),
-  op_queue_len(0), op_queue_bytes(0),
-  op_throttle_lock("KeyValueStore::op_throttle_lock"),
+  throttle_ops(g_ceph_context, "keyvaluestore_ops", g_conf->keyvaluestore_queue_max_ops),
+  throttle_bytes(g_ceph_context, "keyvaluestore_bytes", g_conf->keyvaluestore_queue_max_bytes),
   op_finisher(g_ceph_context),
   op_tp(g_ceph_context, "KeyValueStore::op_tp",
         g_conf->keyvaluestore_op_threads, "keyvaluestore_op_threads"),
@@ -535,7 +546,9 @@ KeyValueStore::KeyValueStore(const std::string &base,
   m_keyvaluestore_queue_max_bytes(g_conf->keyvaluestore_queue_max_bytes),
   m_keyvaluestore_strip_size(g_conf->keyvaluestore_default_strip_size),
   m_keyvaluestore_max_expected_write_size(g_conf->keyvaluestore_max_expected_write_size),
-  do_update(do_update)
+  do_update(do_update),
+  m_keyvaluestore_do_dump(false),
+  m_keyvaluestore_dump_fmt(true)
 {
   ostringstream oss;
   oss << basedir << "/current";
@@ -544,15 +557,15 @@ KeyValueStore::KeyValueStore(const std::string &base,
   // initialize perf_logger
   PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_commit_len, l_os_last);
 
-  plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops");
-  plb.add_u64(l_os_oq_ops, "op_queue_ops");
-  plb.add_u64_counter(l_os_ops, "ops");
-  plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes");
-  plb.add_u64(l_os_oq_bytes, "op_queue_bytes");
-  plb.add_u64_counter(l_os_bytes, "bytes");
-  plb.add_time_avg(l_os_commit_lat, "commit_latency");
-  plb.add_time_avg(l_os_apply_lat, "apply_latency");
-  plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg");
+  plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations count in queue");
+  plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations count in queue");
+  plb.add_u64_counter(l_os_ops, "ops", "Operations");
+  plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max size of queue");
+  plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of queue");
+  plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store");
+  plb.add_time_avg(l_os_commit_lat, "commit_latency", "Commit latency");
+  plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency");
+  plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency");
 
   perf_logger = plb.create_perf_counters();
 
@@ -568,17 +581,29 @@ KeyValueStore::~KeyValueStore()
   g_ceph_context->get_perfcounters_collection()->remove(perf_logger);
 
   delete perf_logger;
+  
+  if (m_keyvaluestore_do_dump) {
+    dump_stop();
+  }
 }
 
 int KeyValueStore::statfs(struct statfs *buf)
 {
-  if (::statfs(basedir.c_str(), buf) < 0) {
-    int r = -errno;
-    return r;
+  int r = backend->db->get_statfs(buf);
+  if (r < 0) {
+    if (::statfs(basedir.c_str(), buf) < 0) {
+      int r = -errno;
+      return r;
+    }
   }
   return 0;
 }
 
+void KeyValueStore::collect_metadata(map<string,string> *pm)
+{
+  (*pm)["keyvaluestore_backend"] = superblock.backend;
+}
+
 int KeyValueStore::mkfs()
 {
   int ret = 0;
@@ -664,16 +689,33 @@ int KeyValueStore::mkfs()
   }
 
   {
-    ret = KeyValueDB::test_init(superblock.backend, current_fn.c_str());
-    if(ret < 0)
-    {
-      derr << __func__  << " failed to create backend type "
+    KeyValueDB *store = KeyValueDB::create(g_ceph_context,
+					   superblock.backend,
+					   current_fn.c_str());
+    if (!store) {
+      derr << __func__ << " failed to create backend type "
 	   << g_conf->keyvaluestore_backend << "." << dendl;
       ret = -1;
       goto close_fsid_fd;
+    }
 
+    ostringstream err;
+    if (store->create_and_open(err)) {
+      derr << __func__  << " failed to create/open backend type "
+	   << g_conf->keyvaluestore_backend << "." << dendl;
+      ret = -1;
+      delete store;
+      goto close_fsid_fd;
     }
+
+    bufferlist bl;
+    ::encode(collections, bl);
+    KeyValueDB::Transaction t = store->get_transaction();
+    t->set("meta", "collections", bl);
+    store->submit_transaction_sync(t);
+
     dout(1) << g_conf->keyvaluestore_backend << " backend exists/created" << dendl;
+    delete store;
   }
 
   dout(1) << "mkfs done in " << basedir << dendl;
@@ -693,8 +735,8 @@ int KeyValueStore::read_fsid(int fd, uuid_d *uuid)
     return ret;
   if (ret == 8) {
     // old 64-bit fsid... mirror it.
-    *(uint64_t*)&uuid->uuid[0] = *(uint64_t*)fsid_str;
-    *(uint64_t*)&uuid->uuid[8] = *(uint64_t*)fsid_str;
+    *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str;
+    *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str;
     return 0;
   }
 
@@ -907,7 +949,10 @@ int KeyValueStore::mount()
 
     }
 
-    store->init();
+    if (superblock.backend == "rocksdb")
+      store->init(g_conf->keyvaluestore_rocksdb_options);
+    else
+      store->init();
     stringstream err;
     if (store->open(err)) {
       derr << "KeyValueStore::mount Error initializing keyvaluestore backend "
@@ -917,6 +962,20 @@ int KeyValueStore::mount()
       goto close_current_fd;
     }
 
+    // get collection list
+    set<string> keys;
+    keys.insert("collections");
+    map<string,bufferlist> values;
+    store->get("meta", keys, &values);
+    if (values.empty()) {
+      ret = -EIO;
+      derr << "Error no collection list; old store?" << dendl;
+      goto close_current_fd;
+    }
+    bufferlist::iterator p = values["collections"].begin();
+    ::decode(collections, p);
+    dout(20) << "collections: " << collections << dendl;
+
     StripObjectMap *dbomap = new StripObjectMap(store);
     ret = dbomap->init(do_update);
     if (ret < 0) {
@@ -989,21 +1048,22 @@ int KeyValueStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
 
   // set up the sequencer
   OpSequencer *osr;
-  if (!posr)
-    posr = &default_osr;
+  assert(posr);
   if (posr->p) {
-    osr = static_cast<OpSequencer *>(posr->p);
-    dout(5) << "queue_transactions existing " << *osr << "/" << osr->parent
+    osr = static_cast<OpSequencer *>(posr->p.get());
+    dout(5) << "queue_transactions existing " << osr << " " << *osr << "/" << osr->parent
             << dendl; //<< " w/ q " << osr->q << dendl;
   } else {
     osr = new OpSequencer;
     osr->parent = posr;
     posr->p = osr;
-    dout(5) << "queue_transactions new " << *osr << "/" << osr->parent << dendl;
+    dout(5) << "queue_transactions new " << osr << " " << *osr << "/" << osr->parent << dendl;
   }
 
   Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op);
   op_queue_reserve_throttle(o, handle);
+  if (m_keyvaluestore_do_dump)
+    dump_transactions(o->tls, o->op, osr);
   dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
   queue_op(osr, o);
 
@@ -1049,8 +1109,8 @@ void KeyValueStore::queue_op(OpSequencer *osr, Op *o)
   perf_logger->inc(l_os_bytes, o->bytes);
 
   dout(5) << "queue_op " << o << " seq " << o->op << " " << *osr << " "
-          << o->bytes << " bytes" << "   (queue has " << op_queue_len
-          << " ops and " << op_queue_bytes << " bytes)" << dendl;
+          << o->bytes << " bytes" << "   (queue has " << throttle_ops.get_current()
+          << " ops and " << throttle_bytes.get_current() << " bytes)" << dendl;
   op_wq.queue(osr);
 }
 
@@ -1063,42 +1123,32 @@ void KeyValueStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handl
   perf_logger->set(l_os_oq_max_bytes, max_bytes);
 
   utime_t start = ceph_clock_now(g_ceph_context);
-  {
-    Mutex::Locker l(op_throttle_lock);
-    while ((max_ops && (op_queue_len + 1) > max_ops) ||
-           (max_bytes && op_queue_bytes      // let single large ops through!
-           && (op_queue_bytes + o->bytes) > max_bytes)) {
-      dout(2) << "waiting " << op_queue_len + 1 << " > " << max_ops
-              << " ops || " << op_queue_bytes + o->bytes << " > " << max_bytes
-              << dendl;
-      if (handle)
-        handle->suspend_tp_timeout();
-      op_throttle_cond.Wait(op_throttle_lock);
-      if (handle)
-        handle->reset_tp_timeout();
-    }
+  if (handle)
+    handle->suspend_tp_timeout();
+  if (throttle_ops.should_wait(1) ||
+    (throttle_bytes.get_current()      // let single large ops through!
+    && throttle_bytes.should_wait(o->bytes))) {
+    dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || " 
+      << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl;
+  }
+  throttle_ops.get();
+  throttle_bytes.get(o->bytes);
+  if (handle)
+    handle->reset_tp_timeout();
 
-    op_queue_len++;
-    op_queue_bytes += o->bytes;
-  }
   utime_t end = ceph_clock_now(g_ceph_context);
   perf_logger->tinc(l_os_queue_lat, end - start);
 
-  perf_logger->set(l_os_oq_ops, op_queue_len);
-  perf_logger->set(l_os_oq_bytes, op_queue_bytes);
+  perf_logger->set(l_os_oq_ops, throttle_ops.get_current());
+  perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current());
 }
 
 void KeyValueStore::op_queue_release_throttle(Op *o)
 {
-  {
-    Mutex::Locker l(op_throttle_lock);
-    op_queue_len--;
-    op_queue_bytes -= o->bytes;
-    op_throttle_cond.Signal();
-  }
-
-  perf_logger->set(l_os_oq_ops, op_queue_len);
-  perf_logger->set(l_os_oq_bytes, op_queue_bytes);
+  throttle_ops.put();
+  throttle_bytes.put(o->bytes);
+  perf_logger->set(l_os_oq_ops, throttle_ops.get_current());
+  perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current());
 }
 
 void KeyValueStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
@@ -1127,12 +1177,13 @@ void KeyValueStore::_finish_op(OpSequencer *osr)
   list<Context*> to_queue;
   Op *o = osr->dequeue(&to_queue);
 
-  dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
+  utime_t lat = ceph_clock_now(g_ceph_context);
+  lat -= o->start;
+
+  dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl;
   osr->apply_lock.Unlock();  // locked in _do_op
   op_queue_release_throttle(o);
 
-  utime_t lat = ceph_clock_now(g_ceph_context);
-  lat -= o->start;
   perf_logger->tinc(l_os_commit_lat, lat);
   perf_logger->tinc(l_os_apply_lat, lat);
 
@@ -1403,21 +1454,8 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
       break;
 
     case Transaction::OP_COLL_SETATTR:
-      {
-        coll_t cid = i.get_cid(op->cid);
-        string name = i.decode_string();
-        bufferlist bl;
-        i.decode_bl(bl);
-        r = _collection_setattr(cid, name.c_str(), bl.c_str(), bl.length(), t);
-      }
-      break;
-
     case Transaction::OP_COLL_RMATTR:
-      {
-        coll_t cid = i.get_cid(op->cid);
-        string name = i.decode_string();
-        r = _collection_rmattr(cid, name.c_str(), t);
-      }
+      assert(0 == "coll attrs no longer supported");
       break;
 
     case Transaction::OP_STARTSYNC:
@@ -1428,9 +1466,7 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
 
     case Transaction::OP_COLL_RENAME:
       {
-        coll_t cid = i.get_cid(op->cid);
-        ghobject_t oid = i.get_oid(op->oid);
-        r = -EOPNOTSUPP;
+        assert(0 == "not implemented");
       }
       break;
 
@@ -1651,16 +1687,9 @@ int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header,
 
 
   int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out);
-  if (r < 0) {
-    dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
-             << offset << "~" << len << " = " << r << dendl;
+  r = check_get_rc(header->cid, header->oid, r, out.size() == keys.size());
+  if (r < 0)
     return r;
-  } else if (out.size() != keys.size()) {
-    dout(0) << __func__ << " broken header or missing data in backend "
-            << header->cid << "/" << header->oid << " " << offset << "~"
-            << len << " = " << r << dendl;
-    return -EBADF;
-  }
 
   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
        iter != extents.end(); ++iter) {
@@ -1727,8 +1756,10 @@ int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid,
   map<uint64_t, uint64_t> m;
   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
        iter != extents.end(); ++iter) {
-    uint64_t off = iter->no * header->strip_size + iter->offset;
-    m[off] = iter->len;
+    if (header->bits[iter->no]) {
+      uint64_t off = iter->no * header->strip_size + iter->offset;
+      m[off] = iter->len;
+    }
   }
   ::encode(m, bl);
   return 0;
@@ -1793,16 +1824,9 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
       lookup_keys.insert(key);
       r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
                             lookup_keys, &values);
-      if (r < 0) {
-        dout(10) << __func__ << " " << cid << "/" << oid << " "
-                 << size << " = " << r << dendl;
+      r = check_get_rc(cid, oid, r, lookup_keys.size() == values.size());
+      if (r < 0)
         return r;
-      } else if (values.size() != lookup_keys.size()) {
-        dout(0) << __func__ << " broken header or missing data in backend "
-                << header->cid << "/" << header->oid << " size " << size
-                <<  " r = " << r << dendl;
-        return -EBADF;
-      }
 
       values[key].copy(0, iter->offset, value);
       value.append_zero(header->strip_size-iter->offset);
@@ -1868,6 +1892,7 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
   if (len + offset > header->max_size) {
     header->max_size = len + offset;
     header->bits.resize(header->max_size/header->strip_size+1);
+    header->updated = true;
   }
 
   vector<StripObjectMap::StripExtent> extents;
@@ -1884,18 +1909,9 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
   }
 
   int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out);
-  if (r < 0) {
-    dout(10) << __func__ << " failed to get value " << header->cid << "/"
-              << header->oid << " " << offset << "~" << len << " = " << r
-              << dendl;
+  r = check_get_rc(header->cid, header->oid, r, keys.size() == out.size());
+  if (r < 0) 
     return r;
-  } else if (keys.size() != out.size()) {
-    // Error on header.bits or the corresponding key/value pair is missing
-    dout(0) << __func__ << " broken header or missing data in backend "
-            << header->cid << "/" << header->oid << " " << offset << "~"
-            << len << " = " << r << dendl;
-    return -EBADF;
-  }
 
   uint64_t bl_offset = 0;
   map<string, bufferlist> values;
@@ -1928,13 +1944,13 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
         value.append_zero(header->strip_size-value.length());
 
       header->bits[iter->no] = 1;
+      header->updated = true;
     }
     assert(value.length() == header->strip_size);
     values[key].swap(value);
   }
   assert(bl_offset == len);
 
-  header->updated = true;
   t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
   dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
            << offset << "~" << len << " = " << r << dendl;
@@ -1966,13 +1982,55 @@ int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset,
                          size_t len, BufferTransaction &t)
 {
   dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+  int r;
+  StripObjectMap::StripObjectHeaderRef header;
 
-  bufferptr bp(len);
-  bp.zero();
-  bufferlist bl;
-  bl.push_back(bp);
-  int r = _write(cid, oid, offset, len, bl, t);
+  r = t.lookup_cached_header(cid, oid, &header, true);
+  if (r < 0) {
+    dout(10) << __func__ << " " << cid << "/" << oid << " " << offset
+             << "~" << len << " failed to get header: r = " << r << dendl;
+    return r;
+  }
+
+  if (len + offset > header->max_size) {
+    header->max_size = len + offset;
+    header->bits.resize(header->max_size/header->strip_size+1);
+    header->updated = true;
+  }
 
+  vector<StripObjectMap::StripExtent> extents;
+  StripObjectMap::file_to_extents(offset, len, header->strip_size,
+                                  extents);
+  set<string> rm_keys;
+  set<string> lookup_keys;
+  map<string, bufferlist> values;
+  map<string, pair<uint64_t, uint64_t> > off_len;
+  for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+       iter != extents.end(); ++iter) {
+    string key = strip_object_key(iter->no);
+    if (header->bits[iter->no]) {
+      if (iter->offset == 0 && iter->len == header->strip_size) {
+        rm_keys.insert(key);
+        header->bits[iter->no] = 0;
+        header->updated = true;
+      } else {
+        lookup_keys.insert(key);
+        off_len[key] = make_pair(iter->offset, iter->len);
+      }
+    }    
+  }
+  r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
+                        lookup_keys, &values);
+  r = check_get_rc(header->cid, header->oid, r, lookup_keys.size() == values.size());
+  if (r < 0)
+    return r;
+  for(set<string>::iterator it = lookup_keys.begin(); it != lookup_keys.end(); ++it)
+  {
+    pair<uint64_t, uint64_t> p = off_len[*it];
+    values[*it].zero(p.first, p.second);
+  }
+  t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
+  t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, rm_keys);
   dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
            << len << " = " << r << dendl;
   return r;
@@ -2090,18 +2148,22 @@ int KeyValueStore::getattr(coll_t cid, const ghobject_t& oid, const char *name,
 int KeyValueStore::getattrs(coll_t cid, const ghobject_t& oid,
                            map<string,bufferptr>& aset)
 {
-  int r;
   map<string, bufferlist> attr_aset;
+  int r;
+  StripObjectMap::StripObjectHeaderRef header;
 
-  r = backend->get(cid, oid, OBJECT_XATTR, &attr_aset);
-  if (r < 0 && r != -ENOENT) {
+  r = backend->lookup_strip_header(cid, oid, &header);
+  if (r < 0) {
+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+    return r;
+  }
+
+  r = backend->get_with_header(header, OBJECT_XATTR, &attr_aset);
+  if (r < 0) {
     dout(10) << __func__ << " could not get attrs r = " << r << dendl;
     goto out;
   }
 
-  if (r == -ENOENT)
-    r = 0;
-
   for (map<string, bufferlist>::iterator i = attr_aset.begin();
        i != attr_aset.end(); ++i) {
     string key;
@@ -2187,7 +2249,7 @@ int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid,
   }
 
   r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs);
-  if (r < 0 && r != -ENOENT) {
+  if (r < 0) {
     dout(10) << __func__ << " could not get attrs r = " << r << dendl;
     return r;
   }
@@ -2199,200 +2261,25 @@ int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid,
   return r;
 }
 
-// collection attrs
-
-int KeyValueStore::collection_getattr(coll_t c, const char *name,
-                                      void *value, size_t size)
-{
-  dout(15) << __func__ << " " << c.to_str() << " '" << name << "' len "
-           << size << dendl;
-
-  bufferlist bl;
-  int r;
-
-  r = collection_getattr(c, name, bl);
-  if (r < 0)
-      goto out;
-
-  if (bl.length() < size) {
-    r = bl.length();
-    bl.copy(0, bl.length(), static_cast<char*>(value));
-  } else {
-    r = size;
-    bl.copy(0, size, static_cast<char*>(value));
-  }
-
-out:
-  dout(10) << __func__ << " " << c.to_str() << " '" << name << "' len "
-           << size << " = " << r << dendl;
-  return r;
-}
-
-int KeyValueStore::collection_getattr(coll_t c, const char *name,
-                                      bufferlist& bl)
-{
-  dout(15) << __func__ << " " << c.to_str() << " '" << name
-           << "'" << dendl;
-
-  set<string> keys;
-  map<string, bufferlist> out;
-  StripObjectMap::StripObjectHeaderRef header;
-
-  keys.insert(string(name));
-
-  int r = backend->lookup_strip_header(get_coll_for_coll(),
-                                       make_ghobject_for_coll(c), &header);
-  if (r < 0) {
-    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-    return r;
-  }
-
-  r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
-  if (r < 0) {
-    dout(10) << __func__ << " could not get key" << string(name) << dendl;
-    r = -EINVAL;
-  }
-
-  if (!out.empty()) {
-    bl.swap(out.begin()->second);
-    r = bl.length();
-  } else {
-    r = -ENODATA;
-  }
-
-  dout(10) << __func__ << " " << c.to_str() << " '" << name << "' len "
-           << bl.length() << " = " << r << dendl;
-  return r;
-}
-
-int KeyValueStore::collection_getattrs(coll_t cid,
-                                       map<string, bufferptr> &aset)
-{
-  dout(10) << __func__ << " " << cid.to_str() << dendl;
-
-  map<string, bufferlist> out;
-
-  int r = backend->get(get_coll_for_coll(), make_ghobject_for_coll(cid),
-                       COLLECTION_ATTR, &out);
-  if (r < 0) {
-    dout(10) << __func__ << " could not get keys" << dendl;
-    goto out;
-  }
-
-  for (map<string, bufferlist>::iterator it = out.begin(); it != out.end();
-       ++it) {
-    bufferptr ptr(it->second.c_str(), it->second.length());
-    aset.insert(make_pair(it->first, ptr));
-  }
-
- out:
-  dout(10) << __func__ << " " << cid.to_str() << " = " << r << dendl;
-  return r;
-}
-
-int KeyValueStore::_collection_setattr(coll_t c, const char *name,
-                                       const void *value, size_t size,
-                                       BufferTransaction &t)
-{
-  dout(10) << __func__ << " " << c << " '" << name << "' len "
-           << size << dendl;
-
-  int r;
-  bufferlist bl;
-  map<string, bufferlist> out;
-  StripObjectMap::StripObjectHeaderRef header;
-
-  r = t.lookup_cached_header(get_coll_for_coll(),
-                             make_ghobject_for_coll(c),
-                             &header, false);
-  if (r < 0) {
-    dout(10) << __func__ << " could not find header r = " << r << dendl;
-    return r;
-  }
-
-  bl.append(reinterpret_cast<const char*>(value), size);
-  out.insert(make_pair(string(name), bl));
-
-  t.set_buffer_keys(header, COLLECTION_ATTR, out);
-
-  dout(10) << __func__ << " " << c << " '"
-           << name << "' len " << size << " = " << r << dendl;
-  return r;
-}
-
-int KeyValueStore::_collection_rmattr(coll_t c, const char *name,
-                                      BufferTransaction &t)
-{
-  dout(15) << __func__ << " " << c << dendl;
-
-  bufferlist bl;
-  set<string> out;
-  StripObjectMap::StripObjectHeaderRef header;
-
-  int r = t.lookup_cached_header(get_coll_for_coll(),
-                                 make_ghobject_for_coll(c), &header, false);
-  if (r < 0) {
-    dout(10) << __func__ << " could not find header r = " << r << dendl;
-    return r;
-  }
-
-  out.insert(string(name));
-  r = t.remove_buffer_keys(header, COLLECTION_ATTR, out);
-
-  dout(10) << __func__ << " " << c << " = " << r << dendl;
-  return r;
-}
-
-int KeyValueStore::_collection_setattrs(coll_t cid,
-                                        map<string,bufferptr>& aset,
-                                        BufferTransaction &t)
-{
-  dout(15) << __func__ << " " << cid << dendl;
-
-  map<string, bufferlist> attrs;
-  StripObjectMap::StripObjectHeaderRef header;
-  int r = t.lookup_cached_header(get_coll_for_coll(),
-                                 make_ghobject_for_coll(cid),
-                                 &header, false);
-  if (r < 0) {
-    dout(10) << __func__ << " could not find header r = " << r << dendl;
-    return r;
-  }
-
-  for (map<string, bufferptr>::iterator it = aset.begin(); it != aset.end();
-       ++it) {
-    attrs[it->first].push_back(it->second);
-  }
-
-  t.set_buffer_keys(header, COLLECTION_ATTR, attrs);
-
-  dout(10) << __func__ << " " << cid << " = " << r << dendl;
-  return r;
-}
-
 
 // collections
 
 int KeyValueStore::_create_collection(coll_t c, BufferTransaction &t)
 {
   dout(15) << __func__ << " " << c << dendl;
-
-  int r;
-  StripObjectMap::StripObjectHeaderRef header;
+  int r = 0;
   bufferlist bl;
 
-  r = t.lookup_cached_header(get_coll_for_coll(),
-                             make_ghobject_for_coll(c), &header,
-                             false);
-  if (r == 0) {
+  RWLock::WLocker l(collections_lock);
+  if (collections.count(c)) {
     r = -EEXIST;
-    return r;
+    goto out;
   }
 
-  r = t.lookup_cached_header(get_coll_for_coll(),
-                             make_ghobject_for_coll(c), &header,
-                             true);
+  collections.insert(c);
+  t.set_collections(collections);
 
+ out:
   dout(10) << __func__ << " cid " << c << " r = " << r << dendl;
   return r;
 }
@@ -2403,13 +2290,15 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
 
   int r;
   uint64_t modified_object = 0;
-  StripObjectMap::StripObjectHeaderRef header;
   vector<ghobject_t> oids;
+  bufferlist bl;
 
-  r = t.lookup_cached_header(get_coll_for_coll(), make_ghobject_for_coll(c),
-                             &header, false);
-  if (r < 0) {
-    goto out;
+  {
+    RWLock::RLocker l(collections_lock);
+    if (!collections.count(c)) {
+      r = -ENOENT;
+      goto out;
+    }
   }
 
   // All modified objects are marked deleted
@@ -2426,7 +2315,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
     }
   }
 
-  r = backend->list_objects(c, ghobject_t(), modified_object+1, &oids,
+  r = backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), modified_object+1, &oids,
                             0);
   // No other object
   if (oids.size() != modified_object && oids.size() != 0) {
@@ -2434,7 +2323,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
     goto out;
   }
 
-  for(vector<ghobject_t>::iterator iter = oids.begin();
+  for (vector<ghobject_t>::iterator iter = oids.begin();
       iter != oids.end(); ++iter) {
     if (!t.strip_headers.count(make_pair(c, *iter))) {
       r = -ENOTEMPTY;
@@ -2442,7 +2331,12 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
     }
   }
 
-  r = t.clear_buffer(header);
+  {
+    RWLock::WLocker l(collections_lock);
+    collections.erase(c);
+    t.set_collections(collections);
+  }
+  r = 0;
 
 out:
   dout(10) << __func__ << " " << c << " = " << r << dendl;
@@ -2525,66 +2419,56 @@ int KeyValueStore::_collection_remove_recursive(const coll_t &cid,
                                                 BufferTransaction &t)
 {
   dout(15) << __func__ << " " << cid << dendl;
+  int r = 0;
 
-  StripObjectMap::StripObjectHeaderRef header;
-
-  int r = t.lookup_cached_header(get_coll_for_coll(),
-                                 make_ghobject_for_coll(cid),
-                                 &header, false);
-  if (r < 0) {
-    return 0;
+  {
+    RWLock::RLocker l(collections_lock);
+    if (collections.count(cid) == 0)
+      return -ENOENT;
   }
 
   vector<ghobject_t> objects;
   ghobject_t max;
   while (!max.is_max()) {
-    r = collection_list_partial(cid, max, 200, 300, 0, &objects, &max);
+    r = collection_list(cid, max, ghobject_t::get_max(), true, 300, &objects, &max);
     if (r < 0)
-      return r;
+      goto out;
 
     for (vector<ghobject_t>::iterator i = objects.begin();
          i != objects.end(); ++i) {
       r = _remove(cid, *i, t);
-
       if (r < 0)
-        return r;
+	goto out;
     }
   }
 
-  r = t.clear_buffer(header);
+  {
+    RWLock::WLocker l(collections_lock);
+    collections.erase(cid);
+    t.set_collections(collections);
+  }
 
+ out:
   dout(10) << __func__ << " " << cid  << " r = " << r << dendl;
-  return 0;
+  return r;
 }
 
 int KeyValueStore::list_collections(vector<coll_t>& ls)
 {
   dout(10) << __func__ << " " << dendl;
-
-  vector<ghobject_t> oids;
-  ghobject_t next;
-  backend->list_objects(get_coll_for_coll(), ghobject_t(), 0, &oids, &next);
-  assert(next == ghobject_t::get_max());
-
-  for (vector<ghobject_t>::const_iterator iter = oids.begin();
-       iter != oids.end(); ++iter) {
-    ls.push_back(coll_t(iter->hobj.oid.name));
+  RWLock::RLocker l(collections_lock);
+  for (set<coll_t>::iterator p = collections.begin(); p != collections.end();
+       ++p) {
+    ls.push_back(*p);
   }
-
   return 0;
 }
 
 bool KeyValueStore::collection_exists(coll_t c)
 {
   dout(10) << __func__ << " " << dendl;
-
-  StripObjectMap::StripObjectHeaderRef header;
-  int r = backend->lookup_strip_header(get_coll_for_coll(),
-                                       make_ghobject_for_coll(c), &header);
-  if (r < 0) {
-    return false;
-  }
-  return true;
+  RWLock::RLocker l(collections_lock);
+  return collections.count(c);
 }
 
 bool KeyValueStore::collection_empty(coll_t c)
@@ -2592,66 +2476,26 @@ bool KeyValueStore::collection_empty(coll_t c)
   dout(10) << __func__ << " " << dendl;
 
   vector<ghobject_t> oids;
-  backend->list_objects(c, ghobject_t(), 1, &oids, 0);
+  backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), 1, &oids, 0);
 
   return oids.empty();
 }
 
-int KeyValueStore::collection_list_range(coll_t c, ghobject_t start,
-                                         ghobject_t end, snapid_t seq,
-                                         vector<ghobject_t> *ls)
-{
-  bool done = false;
-  ghobject_t next = start;
-
-  while (!done) {
-    vector<ghobject_t> next_objects;
-    int r = collection_list_partial(c, next, get_ideal_list_min(),
-                                    get_ideal_list_max(), seq,
-                                    &next_objects, &next);
-    if (r < 0)
-      return r;
-
-    ls->insert(ls->end(), next_objects.begin(), next_objects.end());
-
-    // special case for empty collection
-    if (ls->empty()) {
-      break;
-    }
-
-    while (!ls->empty() && ls->back() >= end) {
-      ls->pop_back();
-      done = true;
-    }
-
-    if (next >= end) {
-      done = true;
-    }
-  }
-
-  return 0;
-}
-
-int KeyValueStore::collection_list_partial(coll_t c, ghobject_t start,
-                                           int min, int max, snapid_t seq,
-                                           vector<ghobject_t> *ls,
-                                           ghobject_t *next)
+int KeyValueStore::collection_list(coll_t c, ghobject_t start,
+				   ghobject_t end, bool sort_bitwise, int max,
+				   vector<ghobject_t> *ls, ghobject_t *next)
 {
-  dout(10) << __func__ << " " << c << " start:" << start << " is_max:"
-           << start.is_max() << dendl;
+  if (!sort_bitwise)
+    return -EOPNOTSUPP;
 
-  if (min < 0 || max < 0)
-      return -EINVAL;
+  if (max < 0)
+    return -EINVAL;
 
   if (start.is_max())
-      return 0;
-
-  return backend->list_objects(c, start, max, ls, next);
-}
+    return 0;
 
-int KeyValueStore::collection_list(coll_t c, vector<ghobject_t>& ls)
-{
-  return collection_list_partial(c, ghobject_t(), 0, 0, 0, &ls, 0);
+  int r = backend->list_objects(c, start, end, max, ls, next);
+  return r;
 }
 
 int KeyValueStore::collection_version_current(coll_t c, uint32_t *version)
@@ -2679,7 +2523,7 @@ int KeyValueStore::omap_get(coll_t c, const ghobject_t &hoid,
   }
 
   r = backend->get_with_header(header, OBJECT_OMAP, out);
-  if (r < 0 && r != -ENOENT) {
+  if (r < 0) {
     dout(10) << __func__ << " err r =" << r << dendl;
     return r;
   }
@@ -2744,7 +2588,7 @@ int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *
   }
 
   r = backend->get_keys_with_header(header, OBJECT_OMAP, keys);
-  if (r < 0 && r != -ENOENT) {
+  if (r < 0) {
     return r;
   }
   return 0;
@@ -2805,7 +2649,7 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
 
   set<string> keys;
   r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys);
-  if (r < 0 && r != -ENOENT) {
+  if (r < 0) {
     dout(10) << __func__ << " could not get omap_keys r = " << r << dendl;
     return r;
   }
@@ -2921,28 +2765,20 @@ int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
 
     StripObjectMap::StripObjectHeaderRef header;
 
-    int r = t.lookup_cached_header(get_coll_for_coll(),
-                                   make_ghobject_for_coll(cid),
-                                   &header, false);
-    if (r < 0) {
-      dout(2) << __func__ << ": " << cid << " DNE" << dendl;
-      return 0;
-    }
-
-    r = t.lookup_cached_header(get_coll_for_coll(),
-                               make_ghobject_for_coll(dest),
-                               &header, false);
-    if (r < 0) {
-      dout(2) << __func__ << ": " << dest << " DNE" << dendl;
-      return 0;
+    {
+      RWLock::RLocker l(collections_lock);
+      if (collections.count(cid) == 0)
+	return -ENOENT;
+      if (collections.count(dest) == 0)
+	return -ENOENT;
     }
 
     vector<ghobject_t> objects;
     ghobject_t next, current;
     int move_size = 0;
     while (1) {
-      collection_list_partial(cid, current, get_ideal_list_min(),
-                              get_ideal_list_max(), 0, &objects, &next);
+      collection_list(cid, current, ghobject_t::get_max(), true,
+		      get_ideal_list_max(), &objects, &next);
 
       dout(20) << __func__ << cid << "objects size: " << objects.size()
               << dendl;
@@ -2972,8 +2808,8 @@ int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
     vector<ghobject_t> objects;
     ghobject_t next;
     while (1) {
-      collection_list_partial(cid, next, get_ideal_list_min(),
-                              get_ideal_list_max(), 0, &objects, &next);
+      collection_list(cid, next, ghobject_t::get_max(), true,
+		      get_ideal_list_max(), &objects, &next);
       if (objects.empty())
         break;
 
@@ -2988,8 +2824,8 @@ int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
 
     next = ghobject_t();
     while (1) {
-      collection_list_partial(dest, next, get_ideal_list_min(),
-                              get_ideal_list_max(), 0, &objects, &next);
+      collection_list(dest, next, ghobject_t::get_max(), true,
+		      get_ideal_list_max(), &objects, &next);
       if (objects.empty())
         break;
 
@@ -3053,6 +2889,7 @@ const char** KeyValueStore::get_tracked_conf_keys() const
     "keyvaluestore_queue_max_ops",
     "keyvaluestore_queue_max_bytes",
     "keyvaluestore_strip_size",
+    "keyvaluestore_dump_file",
     NULL
   };
   return KEYS;
@@ -3067,15 +2904,74 @@ void KeyValueStore::handle_conf_change(const struct md_config_t *conf,
     m_keyvaluestore_queue_max_ops = conf->keyvaluestore_queue_max_ops;
     m_keyvaluestore_queue_max_bytes = conf->keyvaluestore_queue_max_bytes;
     m_keyvaluestore_max_expected_write_size = conf->keyvaluestore_max_expected_write_size;
+    throttle_ops.reset_max(conf->keyvaluestore_queue_max_ops);
+    throttle_bytes.reset_max(conf->keyvaluestore_queue_max_bytes);
   }
   if (changed.count("keyvaluestore_default_strip_size")) {
     m_keyvaluestore_strip_size = conf->keyvaluestore_default_strip_size;
     default_strip_size = m_keyvaluestore_strip_size;
   }
+  if (changed.count("keyvaluestore_dump_file")) {
+    if (conf->keyvaluestore_dump_file.length() &&
+	conf->keyvaluestore_dump_file != "-") {
+      dump_start(conf->keyvaluestore_dump_file);
+    } else {
+      dump_stop();
+    }
+  }
+}
+
+int KeyValueStore::check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size)
+{
+  if (r < 0) {
+    dout(10) << __func__ << " " << cid << "/" << oid << " "
+             << " get rc = " <<  r << dendl;
+  } else if (!is_equal_size) {
+    dout(0) << __func__ << " broken header or missing data in backend "
+            << cid << "/" << oid << " get rc = " << r << dendl;
+    r = -EBADF;
+  }
+  return r;
+}
+
+void KeyValueStore::dump_start(const std::string &file)
+{
+  dout(10) << "dump_start " << file << dendl;
+  if (m_keyvaluestore_do_dump) {
+    dump_stop();
+  }
+  m_keyvaluestore_dump_fmt.reset();
+  m_keyvaluestore_dump_fmt.open_array_section("dump");
+  m_keyvaluestore_dump.open(file.c_str());
+  m_keyvaluestore_do_dump = true;
 }
 
+void KeyValueStore::dump_stop()
+{
+  dout(10) << "dump_stop" << dendl;
+  m_keyvaluestore_do_dump = false;
+  if (m_keyvaluestore_dump.is_open()) {
+    m_keyvaluestore_dump_fmt.close_section();
+    m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump);
+    m_keyvaluestore_dump.flush();
+    m_keyvaluestore_dump.close();
+  }
+}
 void KeyValueStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t seq, OpSequencer *osr)
 {
+  m_keyvaluestore_dump_fmt.open_array_section("transactions");
+  unsigned trans_num = 0;
+  for (list<ObjectStore::Transaction*>::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) {
+    m_keyvaluestore_dump_fmt.open_object_section("transaction");
+    m_keyvaluestore_dump_fmt.dump_string("osr", osr->get_name());
+    m_keyvaluestore_dump_fmt.dump_unsigned("seq", seq);
+    m_keyvaluestore_dump_fmt.dump_unsigned("trans_num", trans_num);
+    (*i)->dump(&m_keyvaluestore_dump_fmt);
+    m_keyvaluestore_dump_fmt.close_section();
+  }
+  m_keyvaluestore_dump_fmt.close_section();
+  m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump);
+  m_keyvaluestore_dump.flush();
 }
 
 
diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h
index ef3085f..90e41ee 100644
--- a/src/os/KeyValueStore.h
+++ b/src/os/KeyValueStore.h
@@ -205,6 +205,9 @@ class KeyValueStore : public ObjectStore,
 
   Finisher ondisk_finisher;
 
+  RWLock collections_lock;
+  set<coll_t> collections;
+
   Mutex lock;
 
   int _create_current();
@@ -217,22 +220,10 @@ class KeyValueStore : public ObjectStore,
 
   string strip_object_key(uint64_t no) {
     char n[100];
-    snprintf(n, 100, "%lld", (long long)no);
+    snprintf(n, 100, "%08lld", (long long)no);
     return string(n);
   }
 
-  // A special coll used by store collection info, each obj in this coll
-  // represent a coll_t
-  static bool is_coll_obj(coll_t c) {
-    return c == coll_t("COLLECTIONS");
-  }
-  static coll_t get_coll_for_coll() {
-    return coll_t("COLLECTIONS");
-  }
-  static ghobject_t make_ghobject_for_coll(const coll_t &col) {
-    return ghobject_t(hobject_t(sobject_t(col.to_str(), CEPH_NOSNAP)));
-  }
-
   // Each transaction has side effect which may influent the following
   // operations, we need to make it visible for the following within
   // transaction by caching middle result.
@@ -243,11 +234,27 @@ class KeyValueStore : public ObjectStore,
   // 4. Clone or rename
   struct BufferTransaction {
     typedef pair<coll_t, ghobject_t> uniq_id;
-    typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef> StripHeaderMap;
+
+    struct CollGhobjectPairBitwiseComparator {
+      bool operator()(const uniq_id& l,
+		      const uniq_id& r) const {
+	if (l.first < r.first)
+	  return true;
+	if (l.first != r.first)
+	  return false;
+	if (cmp_bitwise(l.second, r.second) < 0)
+	  return true;
+	return false;
+      }
+    };
+
+    typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef,
+		CollGhobjectPairBitwiseComparator> StripHeaderMap;
 
     //Dirty records
     StripHeaderMap strip_headers;
-    map< uniq_id, map<pair<string, string>, bufferlist> > buffers;  // pair(prefix, key),to buffer updated data in one transaction
+    map< uniq_id, map<pair<string, string>, bufferlist>,
+	 CollGhobjectPairBitwiseComparator> buffers;  // pair(prefix, key),to buffer updated data in one transaction
 
     list<Context*> finishes;
 
@@ -255,6 +262,12 @@ class KeyValueStore : public ObjectStore,
 
     KeyValueDB::Transaction t;
 
+    void set_collections(const set<coll_t>& collections) {
+      bufferlist collections_bl;
+      ::encode(collections, collections_bl);
+      t->set("meta", "collections", collections_bl);
+    }
+
     int lookup_cached_header(const coll_t &cid, const ghobject_t &oid,
                              StripObjectMap::StripObjectHeaderRef *strip_header,
                              bool create_if_missing);
@@ -394,7 +407,6 @@ class KeyValueStore : public ObjectStore,
       Mutex::Locker l(qlock);
       uint64_t seq = 0;
       if (_get_max_uncompleted(&seq)) {
-	delete c;
 	return true;
       } else {
 	flush_commit_waiters.push_back(make_pair(seq, c));
@@ -417,11 +429,8 @@ class KeyValueStore : public ObjectStore,
 
   friend ostream& operator<<(ostream& out, const OpSequencer& s);
 
-  Sequencer default_osr;
   deque<OpSequencer*> op_queue;
-  uint64_t op_queue_len, op_queue_bytes;
-  Cond op_throttle_cond;
-  Mutex op_throttle_lock;
+  Throttle throttle_ops, throttle_bytes;
   Finisher op_finisher;
 
   ThreadPool op_tp;
@@ -450,6 +459,7 @@ class KeyValueStore : public ObjectStore,
       store->op_queue.pop_front();
       return osr;
     }
+    using ThreadPool::WorkQueue<OpSequencer>::_process;
     void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) {
       store->_do_op(osr, handle);
     }
@@ -474,7 +484,7 @@ class KeyValueStore : public ObjectStore,
  public:
 
   KeyValueStore(const std::string &base,
-                const char *internal_name = "keyvaluestore-dev",
+                const char *internal_name = "keyvaluestore",
                 bool update_to=false);
   ~KeyValueStore();
 
@@ -484,7 +494,6 @@ class KeyValueStore : public ObjectStore,
   uint32_t get_target_version() {
     return target_version;
   }
-  bool need_journal() { return false; };
   int peek_journal_fsid(uuid_d *id) {
     *id = fsid;
     return 0;
@@ -501,6 +510,15 @@ class KeyValueStore : public ObjectStore,
   }
   int mkfs();
   int mkjournal() {return 0;}
+  bool wants_journal() {
+    return false;
+  }
+  bool allows_journal() {
+    return false;
+  }
+  bool needs_journal() {
+    return false;
+  }
 
   /**
    ** set_allow_sharded_objects()
@@ -516,6 +534,8 @@ class KeyValueStore : public ObjectStore,
    **/
   bool get_allow_sharded_objects() {return false;}
 
+  void collect_metadata(map<string,string> *pm);
+
   int statfs(struct statfs *buf);
 
   int _do_transactions(
@@ -570,9 +590,6 @@ class KeyValueStore : public ObjectStore,
                       BufferTransaction &t);
 
   void start_sync() {}
-  void sync() {}
-  void flush() {}
-  void sync_and_flush() {}
 
   void set_fsid(uuid_d u) { fsid = u; }
   uuid_d get_fsid() { return fsid; }
@@ -588,16 +605,6 @@ class KeyValueStore : public ObjectStore,
               BufferTransaction &t);
   int _rmattrs(coll_t cid, const ghobject_t& oid, BufferTransaction &t);
 
-  int collection_getattr(coll_t c, const char *name, void *value, size_t size);
-  int collection_getattr(coll_t c, const char *name, bufferlist& bl);
-  int collection_getattrs(coll_t cid, map<string,bufferptr> &aset);
-
-  int _collection_setattr(coll_t c, const char *name, const void *value,
-                          size_t size, BufferTransaction &t);
-  int _collection_rmattr(coll_t c, const char *name, BufferTransaction &t);
-  int _collection_setattrs(coll_t cid, map<string,bufferptr> &aset,
-                           BufferTransaction &t);
-
   // collections
   int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num,
       uint64_t num_objs) const { return 0; }
@@ -613,12 +620,9 @@ class KeyValueStore : public ObjectStore,
   int list_collections(vector<coll_t>& ls);
   bool collection_exists(coll_t c);
   bool collection_empty(coll_t c);
-  int collection_list(coll_t c, vector<ghobject_t>& oid);
-  int collection_list_partial(coll_t c, ghobject_t start,
-                              int min, int max, snapid_t snap,
-                              vector<ghobject_t> *ls, ghobject_t *next);
-  int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
-                            snapid_t seq, vector<ghobject_t> *ls);
+  int collection_list(coll_t c, ghobject_t start, ghobject_t end,
+		      bool sort_bitwise, int max,
+		      vector<ghobject_t> *ls, ghobject_t *next);
   int collection_version_current(coll_t c, uint32_t *version);
 
   // omap (see ObjectStore.h for documentation)
@@ -637,6 +641,9 @@ class KeyValueStore : public ObjectStore,
   ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c,
                                                  const ghobject_t &oid);
 
+  int check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size);
+  void dump_start(const std::string &file);
+  void dump_stop();
   void dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t seq,
                          OpSequencer *osr);
 
@@ -673,6 +680,9 @@ class KeyValueStore : public ObjectStore,
   int m_keyvaluestore_strip_size;
   uint64_t m_keyvaluestore_max_expected_write_size;
   int do_update;
+  bool m_keyvaluestore_do_dump;
+  std::ofstream m_keyvaluestore_dump;
+  JSONFormatter m_keyvaluestore_dump_fmt;
 
   static const string OBJECT_STRIP_PREFIX;
   static const string OBJECT_XATTR;
diff --git a/src/os/KineticStore.cc b/src/os/KineticStore.cc
index 6aa05c9..fb6e2bf 100644
--- a/src/os/KineticStore.cc
+++ b/src/os/KineticStore.cc
@@ -63,8 +63,8 @@ int KineticStore::do_open(ostream &out, bool create_if_missing)
   }
 
   PerfCountersBuilder plb(g_ceph_context, "kinetic", l_kinetic_first, l_kinetic_last);
-  plb.add_u64_counter(l_kinetic_gets, "kinetic_get");
-  plb.add_u64_counter(l_kinetic_txns, "kinetic_transaction");
+  plb.add_u64_counter(l_kinetic_gets, "kinetic_get", "Gets");
+  plb.add_u64_counter(l_kinetic_txns, "kinetic_transaction", "Transactions");
   logger = plb.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
   return 0;
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index 5d6bd7b..48d8db3 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -143,11 +143,6 @@ int LFNIndex::lookup(const ghobject_t &oid,
   );
 }
 
-int LFNIndex::collection_list(vector<ghobject_t> *ls)
-{
-  return _collection_list(ls);
-}
-
 int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
 {
   return _pre_hash_collection(pg_num, expected_num_objs);
@@ -155,13 +150,13 @@ int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
 
 
 int LFNIndex::collection_list_partial(const ghobject_t &start,
-				      int min_count,
+				      const ghobject_t &end,
+				      bool sort_bitwise,
 				      int max_count,
-				      snapid_t seq,
 				      vector<ghobject_t> *ls,
 				      ghobject_t *next)
 {
-  return _collection_list_partial(start, min_count, max_count, seq, ls, next);
+  return _collection_list_partial(start, end, sort_bitwise, max_count, ls, next);
 }
 
 /* Derived class utility methods */
@@ -390,7 +385,7 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
   bufferlist bl;
   bl.push_back(bp);
   object_info_t oi(bl);
-  *o = oi.soid;
+  *o = ghobject_t(oi.soid);
   return 0;
 }
 
@@ -457,7 +452,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
 }
 
 int LFNIndex::list_subdirs(const vector<string> &to_list,
-				  set<string> *out)
+			   vector<string> *out)
 {
   string to_list_path = get_full_path_subdir(to_list);
   DIR *dir = ::opendir(to_list_path.c_str());
@@ -474,7 +469,7 @@ int LFNIndex::list_subdirs(const vector<string> &to_list,
     string demangled_name;
     ghobject_t obj;
     if (lfn_is_subdir(short_name, &demangled_name)) {
-      out->insert(demangled_name);
+      out->push_back(demangled_name);
     }
   }
 
@@ -1025,7 +1020,7 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t
   bool r = parse_object(long_name.c_str(), *out);
   int64_t pool = -1;
   spg_t pg;
-  if (coll().is_pg_prefix(pg))
+  if (coll().is_pg_prefix(&pg))
     pool = (int64_t)pg.pgid.pool();
   out->hobj.pool = pool;
   if (!r) return r;
@@ -1118,7 +1113,7 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
 
   int64_t pool = -1;
   spg_t pg;
-  if (coll().is_pg_prefix(pg))
+  if (coll().is_pg_prefix(&pg))
     pool = (int64_t)pg.pgid.pool();
   (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
   return true;
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index 5cd3523..41f6c93 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -178,11 +178,6 @@ public:
     int *exist
     );
 
-  /// @see CollectionIndex
-  int collection_list(
-    vector<ghobject_t> *ls
-    );
-
   /// @see CollectionIndex;
   int pre_hash_collection(
       uint32_t pg_num,
@@ -192,9 +187,9 @@ public:
   /// @see CollectionIndex
   int collection_list_partial(
     const ghobject_t &start,
-    int min_count,
+    const ghobject_t &end,
+    bool sort_bitwise,
     int max_count,
-    snapid_t seq,
     vector<ghobject_t> *ls,
     ghobject_t *next
     );
@@ -243,21 +238,6 @@ protected:
     int *exists		  ///< [out] True if the object exists.
     ) = 0;
 
-  /**
-   * List contents of the collection, must be implemented by derived class.
-   *
-   * @param [out] seq Snapid to list.
-   * @param [in] max_count Max number to list (0 for no limit).
-   * @param [out] ls Container for listed objects.
-   * @param [in,out] last List handle.  0 for beginning.  Passing the same
-   * cookie location will cause the next max_count to be listed.
-   * @return Error code.  0 on success.
-   */
-  /// List contents of collection.
-  virtual int _collection_list(
-    vector<ghobject_t> *ls ///< [out] Listed objects.
-    ) = 0;
-
   /// Pre-hash the collection with the given pg number and
   /// expected number of objects in the collection.
   virtual int _pre_hash_collection(
@@ -268,9 +248,9 @@ protected:
   /// @see CollectionIndex
   virtual int _collection_list_partial(
     const ghobject_t &start,
-    int min_count,
+    const ghobject_t &end,
+    bool sort_bitwise,
     int max_count,
-    snapid_t seq,
     vector<ghobject_t> *ls,
     ghobject_t *next
     ) = 0;
@@ -385,7 +365,7 @@ protected:
   /// Lists subdirectories.
   int list_subdirs(
     const vector<string> &to_list, ///< [in] Directory to list.
-    set<string> *out		   ///< [out] Subdirectories listed. 
+    vector<string> *out		   ///< [out] Subdirectories listed.
     );
 
   /// Create subdirectory.
diff --git a/src/os/LevelDBStore.cc b/src/os/LevelDBStore.cc
index 454fafb..1aaa168 100644
--- a/src/os/LevelDBStore.cc
+++ b/src/os/LevelDBStore.cc
@@ -10,7 +10,7 @@
 using std::string;
 #include "common/perf_counters.h"
 
-int LevelDBStore::init()
+int LevelDBStore::init(string option_str)
 {
   // init defaults.  caller can override these if they want
   // prior to calling open.
@@ -74,21 +74,24 @@ int LevelDBStore::do_open(ostream &out, bool create_if_missing)
     return -EINVAL;
   }
 
+  PerfCountersBuilder plb(g_ceph_context, "leveldb", l_leveldb_first, l_leveldb_last);
+  plb.add_u64_counter(l_leveldb_gets, "leveldb_get", "Gets");
+  plb.add_u64_counter(l_leveldb_txns, "leveldb_transaction", "Transactions");
+  plb.add_time_avg(l_leveldb_get_latency, "leveldb_get_latency", "Get Latency");
+  plb.add_time_avg(l_leveldb_submit_latency, "leveldb_submit_latency", "Submit Latency");
+  plb.add_time_avg(l_leveldb_submit_sync_latency, "leveldb_submit_sync_latency", "Submit Sync Latency");
+  plb.add_u64_counter(l_leveldb_compact, "leveldb_compact", "Compactions");
+  plb.add_u64_counter(l_leveldb_compact_range, "leveldb_compact_range", "Compactions by range");
+  plb.add_u64_counter(l_leveldb_compact_queue_merge, "leveldb_compact_queue_merge", "Mergings of ranges in compaction queue");
+  plb.add_u64(l_leveldb_compact_queue_len, "leveldb_compact_queue_len", "Length of compaction queue");
+  logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+
   if (g_conf->leveldb_compact_on_mount) {
     derr << "Compacting leveldb store..." << dendl;
     compact();
     derr << "Finished compacting leveldb store" << dendl;
   }
-
-  PerfCountersBuilder plb(g_ceph_context, "leveldb", l_leveldb_first, l_leveldb_last);
-  plb.add_u64_counter(l_leveldb_gets, "leveldb_get");
-  plb.add_u64_counter(l_leveldb_txns, "leveldb_transaction");
-  plb.add_u64_counter(l_leveldb_compact, "leveldb_compact");
-  plb.add_u64_counter(l_leveldb_compact_range, "leveldb_compact_range");
-  plb.add_u64_counter(l_leveldb_compact_queue_merge, "leveldb_compact_queue_merge");
-  plb.add_u64(l_leveldb_compact_queue_len, "leveldb_compact_queue_len");
-  logger = plb.create_perf_counters();
-  cct->get_perfcounters_collection()->add(logger);
   return 0;
 }
 
@@ -130,21 +133,27 @@ void LevelDBStore::close()
 
 int LevelDBStore::submit_transaction(KeyValueDB::Transaction t)
 {
+  utime_t start = ceph_clock_now(g_ceph_context);
   LevelDBTransactionImpl * _t =
     static_cast<LevelDBTransactionImpl *>(t.get());
   leveldb::Status s = db->Write(leveldb::WriteOptions(), &(_t->bat));
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
   logger->inc(l_leveldb_txns);
+  logger->tinc(l_leveldb_submit_latency, lat);
   return s.ok() ? 0 : -1;
 }
 
 int LevelDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
 {
+  utime_t start = ceph_clock_now(g_ceph_context);
   LevelDBTransactionImpl * _t =
     static_cast<LevelDBTransactionImpl *>(t.get());
   leveldb::WriteOptions options;
   options.sync = true;
   leveldb::Status s = db->Write(options, &(_t->bat));
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
   logger->inc(l_leveldb_txns);
+  logger->tinc(l_leveldb_submit_sync_latency, lat);
   return s.ok() ? 0 : -1;
 }
 
@@ -153,21 +162,19 @@ void LevelDBStore::LevelDBTransactionImpl::set(
   const string &k,
   const bufferlist &to_set_bl)
 {
-  buffers.push_back(to_set_bl);
-  bufferlist &bl = *(buffers.rbegin());
   string key = combine_strings(prefix, k);
-  keys.push_back(key);
-  bat.Delete(leveldb::Slice(*(keys.rbegin())));
-  bat.Put(leveldb::Slice(*(keys.rbegin())),
-	  leveldb::Slice(bl.c_str(), bl.length()));
+  //bufferlist::c_str() is non-constant, so we need to make a copy
+  bufferlist val = to_set_bl;
+  bat.Delete(leveldb::Slice(key));
+  bat.Put(leveldb::Slice(key),
+	  leveldb::Slice(val.c_str(), val.length()));
 }
 
 void LevelDBStore::LevelDBTransactionImpl::rmkey(const string &prefix,
 					         const string &k)
 {
   string key = combine_strings(prefix, k);
-  keys.push_back(key);
-  bat.Delete(leveldb::Slice(*(keys.rbegin())));
+  bat.Delete(leveldb::Slice(key));
 }
 
 void LevelDBStore::LevelDBTransactionImpl::rmkeys_by_prefix(const string &prefix)
@@ -177,8 +184,7 @@ void LevelDBStore::LevelDBTransactionImpl::rmkeys_by_prefix(const string &prefix
        it->valid();
        it->next()) {
     string key = combine_strings(prefix, it->key());
-    keys.push_back(key);
-    bat.Delete(*(keys.rbegin()));
+    bat.Delete(key);
   }
 }
 
@@ -187,6 +193,7 @@ int LevelDBStore::get(
     const std::set<string> &keys,
     std::map<string, bufferlist> *out)
 {
+  utime_t start = ceph_clock_now(g_ceph_context);
   KeyValueDB::Iterator it = get_iterator(prefix);
   for (std::set<string>::const_iterator i = keys.begin();
        i != keys.end();
@@ -197,7 +204,9 @@ int LevelDBStore::get(
     } else if (!it->valid())
       break;
   }
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
   logger->inc(l_leveldb_gets);
+  logger->tinc(l_leveldb_get_latency, lat);
   return 0;
 }
 
diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h
index 4617c5c..06ea071 100644
--- a/src/os/LevelDBStore.h
+++ b/src/os/LevelDBStore.h
@@ -34,6 +34,9 @@ enum {
   l_leveldb_first = 34300,
   l_leveldb_gets,
   l_leveldb_txns,
+  l_leveldb_get_latency,
+  l_leveldb_submit_latency,
+  l_leveldb_submit_sync_latency,
   l_leveldb_compact,
   l_leveldb_compact_range,
   l_leveldb_compact_queue_merge,
@@ -158,7 +161,7 @@ public:
   ~LevelDBStore();
 
   static int _test_init(const string& dir);
-  int init();
+  int init(string option_str="");
 
   /// Opens underlying db
   int open(ostream &out) {
@@ -174,10 +177,7 @@ public:
   class LevelDBTransactionImpl : public KeyValueDB::TransactionImpl {
   public:
     leveldb::WriteBatch bat;
-    list<bufferlist> buffers;
-    list<string> keys;
     LevelDBStore *db;
-
     LevelDBTransactionImpl(LevelDBStore *db) : db(db) {}
     void set(
       const string &prefix,
@@ -302,10 +302,6 @@ public:
   static string combine_strings(const string &prefix, const string &value);
   static int split_key(leveldb::Slice in, string *prefix, string *key);
   static bufferlist to_bufferlist(leveldb::Slice in);
-  static bool in_prefix(const string &prefix, leveldb::Slice key) {
-    return (key.compare(leveldb::Slice(past_prefix(prefix))) < 0) &&
-      (key.compare(leveldb::Slice(prefix)) > 0);
-  }
   static string past_prefix(const string &prefix) {
     string limit = prefix;
     limit.push_back(1);
diff --git a/src/os/Makefile.am b/src/os/Makefile.am
index 2638810..fdb6c99 100644
--- a/src/os/Makefile.am
+++ b/src/os/Makefile.am
@@ -7,11 +7,11 @@ if ENABLE_SERVER
 
 libos_la_SOURCES = \
 	os/chain_xattr.cc \
+	os/fs/FS.cc \
 	os/DBObjectMap.cc \
 	os/GenericObjectMap.cc \
 	os/FileJournal.cc \
 	os/FileStore.cc \
-	os/FlatIndex.cc \
 	os/GenericFileStoreBackend.cc \
 	os/HashIndex.cc \
 	os/IndexManager.cc \
@@ -23,15 +23,21 @@ libos_la_SOURCES = \
 	os/KeyValueStore.cc \
 	os/ObjectStore.cc \
 	os/WBThrottle.cc \
-        os/KeyValueDB.cc \
 	common/TrackedOp.cc
 
 if LINUX
 libos_la_SOURCES += os/BtrfsFileStoreBackend.cc
 endif
 
+if WITH_LIBAIO
+libos_types_la_SOURCES += os/newstore/newstore_types.cc
+libos_la_SOURCES += os/newstore/NewStore.cc
+endif
+
 if WITH_LIBXFS
-libos_la_SOURCES += os/XfsFileStoreBackend.cc
+libos_la_SOURCES += \
+    os/fs/XFS.cc \
+    os/XfsFileStoreBackend.cc
 endif
 
 if WITH_LIBZFS
@@ -40,23 +46,23 @@ endif
 
 libos_la_CXXFLAGS = ${AM_CXXFLAGS}
 libos_la_LIBADD = $(LIBOS_TYPES)
-if WITH_LTTNG
-libos_la_LIBADD += $(LIBOS_TP)
-endif
 
 noinst_LTLIBRARIES += libos.la
 
 noinst_HEADERS += \
 	os/btrfs_ioctl.h \
 	os/chain_xattr.h \
+	os/newstore/newstore_types.h \
+	os/newstore/NewStore.h \
 	os/BtrfsFileStoreBackend.h \
 	os/CollectionIndex.h \
 	os/DBObjectMap.h \
 	os/GenericObjectMap.h \
 	os/FileJournal.h \
 	os/FileStore.h \
-	os/FlatIndex.h \
 	os/FDCache.h \
+	os/fs/FS.h \
+	os/fs/XFS.h \
 	os/GenericFileStoreBackend.h \
 	os/HashIndex.h \
 	os/IndexManager.h \
@@ -69,6 +75,7 @@ noinst_HEADERS += \
 	os/KeyValueStore.h \
 	os/ObjectMap.h \
 	os/ObjectStore.h \
+	os/PageSet.h \
 	os/SequencerPosition.h \
 	os/WBThrottle.h \
 	os/XfsFileStoreBackend.h \
diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc
index a1e1b27..b0aa206 100644
--- a/src/os/MemStore.cc
+++ b/src/os/MemStore.cc
@@ -27,6 +27,7 @@
 #include "include/memory.h"
 #include "common/errno.h"
 #include "MemStore.h"
+#include "include/compat.h"
 
 #define dout_subsys ceph_subsys_filestore
 #undef dout_prefix
@@ -132,7 +133,7 @@ void MemStore::dump(Formatter *f)
     f->close_section();
 
     f->open_array_section("objects");
-    for (map<ghobject_t,ObjectRef>::iterator q = p->second->object_map.begin();
+    for (map<ghobject_t,ObjectRef,ghobject_t::BitwiseComparator>::iterator q = p->second->object_map.begin();
 	 q != p->second->object_map.end();
 	 ++q) {
       f->open_object_section("object");
@@ -170,7 +171,7 @@ int MemStore::_load()
     int r = cbl.read_file(fn.c_str(), &err);
     if (r < 0)
       return r;
-    CollectionRef c(new Collection);
+    CollectionRef c(new Collection(cct));
     bufferlist::iterator p = cbl.begin();
     c->decode(p);
     coll_map[*q] = c;
@@ -271,7 +272,6 @@ bool MemStore::exists(coll_t cid, const ghobject_t& oid)
   CollectionRef c = get_collection(cid);
   if (!c)
     return false;
-  RWLock::RLocker l(c->lock);
 
   // Perform equivalent of c->get_object_(oid) != NULL. In C++11 the
   // shared_ptr needs to be compared to nullptr.
@@ -288,12 +288,11 @@ int MemStore::stat(
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
-  st->st_size = o->data.length();
+  st->st_size = o->get_size();
   st->st_blksize = 4096;
   st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
   st->st_nlink = 1;
@@ -314,21 +313,19 @@ int MemStore::read(
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker lc(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
-  if (offset >= o->data.length())
+  if (offset >= o->get_size())
     return 0;
   size_t l = len;
   if (l == 0)  // note: len == 0 means read the entire object
-    l = o->data.length();
-  else if (offset + l > o->data.length())
-    l = o->data.length() - offset;
+    l = o->get_size();
+  else if (offset + l > o->get_size())
+    l = o->get_size() - offset;
   bl.clear();
-  bl.substr_of(o->data, offset, l);
-  return bl.length();
+  return o->read(offset, l, bl);
 }
 
 int MemStore::fiemap(coll_t cid, const ghobject_t& oid,
@@ -339,16 +336,15 @@ int MemStore::fiemap(coll_t cid, const ghobject_t& oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker lc(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
-  if (offset >= o->data.length())
+  if (offset >= o->get_size())
     return 0;
   size_t l = len;
-  if (offset + l > o->data.length())
-    l = o->data.length() - offset;
+  if (offset + l > o->get_size())
+    l = o->get_size() - offset;
   map<uint64_t, uint64_t> m;
   m[offset] = l;
   ::encode(m, bl);
@@ -362,12 +358,12 @@ int MemStore::getattr(coll_t cid, const ghobject_t& oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
   string k(name);
+  std::lock_guard<std::mutex> lock(o->xattr_mutex);
   if (!o->xattr.count(k)) {
     return -ENODATA;
   }
@@ -382,11 +378,11 @@ int MemStore::getattrs(coll_t cid, const ghobject_t& oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->xattr_mutex);
   aset = o->xattr;
   return 0;
 }
@@ -421,61 +417,29 @@ bool MemStore::collection_empty(coll_t cid)
   return c->object_map.empty();
 }
 
-int MemStore::collection_list(coll_t cid, vector<ghobject_t>& o)
+int MemStore::collection_list(coll_t cid, ghobject_t start, ghobject_t end,
+			      bool sort_bitwise, int max,
+			      vector<ghobject_t> *ls, ghobject_t *next)
 {
-  dout(10) << __func__ << " " << cid << dendl;
+  if (!sort_bitwise)
+    return -EOPNOTSUPP;
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
   RWLock::RLocker l(c->lock);
 
-  for (map<ghobject_t,ObjectRef>::iterator p = c->object_map.begin();
-       p != c->object_map.end();
-       ++p)
-    o.push_back(p->first);
-  return 0;
-}
-
-int MemStore::collection_list_partial(coll_t cid, ghobject_t start,
-				      int min, int max, snapid_t snap, 
-				      vector<ghobject_t> *ls, ghobject_t *next)
-{
-  dout(10) << __func__ << " " << cid << " " << start << " " << min << "-"
-	   << max << " " << snap << dendl;
-  CollectionRef c = get_collection(cid);
-  if (!c)
-    return -ENOENT;
-  RWLock::RLocker l(c->lock);
-
-  map<ghobject_t,ObjectRef>::iterator p = c->object_map.lower_bound(start);
+  map<ghobject_t,ObjectRef,ghobject_t::BitwiseComparator>::iterator p = c->object_map.lower_bound(start);
   while (p != c->object_map.end() &&
-	 ls->size() < (unsigned)max) {
+	 ls->size() < (unsigned)max &&
+	 cmp_bitwise(p->first, end) < 0) {
     ls->push_back(p->first);
     ++p;
   }
-  if (p == c->object_map.end())
-    *next = ghobject_t::get_max();
-  else
-    *next = p->first;
-  return 0;
-}
-
-int MemStore::collection_list_range(coll_t cid,
-				    ghobject_t start, ghobject_t end,
-				    snapid_t seq, vector<ghobject_t> *ls)
-{
-  dout(10) << __func__ << " " << cid << " " << start << " " << end
-	   << " " << seq << dendl;
-  CollectionRef c = get_collection(cid);
-  if (!c)
-    return -ENOENT;
-  RWLock::RLocker l(c->lock);
-
-  map<ghobject_t,ObjectRef>::iterator p = c->object_map.lower_bound(start);
-  while (p != c->object_map.end() &&
-	 p->first < end) {
-    ls->push_back(p->first);
-    ++p;
+  if (next != NULL) {
+    if (p == c->object_map.end())
+      *next = ghobject_t::get_max();
+    else
+      *next = p->first;
   }
   return 0;
 }
@@ -491,11 +455,11 @@ int MemStore::omap_get(
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   *header = o->omap_header;
   *out = o->omap;
   return 0;
@@ -512,11 +476,11 @@ int MemStore::omap_get_header(
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   *header = o->omap_header;
   return 0;
 }
@@ -531,11 +495,11 @@ int MemStore::omap_get_keys(
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   for (map<string,bufferlist>::iterator p = o->omap.begin();
        p != o->omap.end();
        ++p)
@@ -554,11 +518,11 @@ int MemStore::omap_get_values(
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   for (set<string>::const_iterator p = keys.begin();
        p != keys.end();
        ++p) {
@@ -580,11 +544,11 @@ int MemStore::omap_check_keys(
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   for (set<string>::const_iterator p = keys.begin();
        p != keys.end();
        ++p) {
@@ -602,7 +566,6 @@ ObjectMap::ObjectMapIterator MemStore::get_omap_iterator(coll_t cid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return ObjectMap::ObjectMapIterator();
-  RWLock::RLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
@@ -619,8 +582,22 @@ int MemStore::queue_transactions(Sequencer *osr,
 				 TrackedOpRef op,
 				 ThreadPool::TPHandle *handle)
 {
-  // fixme: ignore the Sequencer and serialize everything.
-  Mutex::Locker l(apply_lock);
+  // because memstore operations are synchronous, we can implement the
+  // Sequencer with a mutex. this guarantees ordering on a given sequencer,
+  // while allowing operations on different sequencers to happen in parallel
+  struct OpSequencer : public Sequencer_impl {
+    std::mutex mutex;
+    void flush() override {}
+    bool flush_commit(Context*) override { return true; }
+  };
+
+  std::unique_lock<std::mutex> lock;
+  if (osr) {
+    auto seq = reinterpret_cast<OpSequencer**>(&osr->p);
+    if (*seq == nullptr)
+      *seq = new OpSequencer;
+    lock = std::unique_lock<std::mutex>((*seq)->mutex);
+  }
 
   for (list<Transaction*>::iterator p = tls.begin(); p != tls.end(); ++p) {
     // poke the TPHandle heartbeat just to exercise that code path
@@ -847,27 +824,19 @@ void MemStore::_do_transaction(Transaction& t)
 
     case Transaction::OP_COLL_SETATTR:
       {
-        coll_t cid = i.get_cid(op->cid);
-        string name = i.decode_string();
-        bufferlist bl;
-        i.decode_bl(bl);
 	assert(0 == "not implemented");
       }
       break;
 
     case Transaction::OP_COLL_RMATTR:
       {
-        coll_t cid = i.get_cid(op->cid);
-        string name = i.decode_string();
 	assert(0 == "not implemented");
       }
       break;
 
     case Transaction::OP_COLL_RENAME:
       {
-        coll_t cid = i.get_cid(op->cid);
-        ghobject_t oid = i.get_oid(op->oid);
-	r = -EOPNOTSUPP;
+	assert(0 == "not implemented");
       }
       break;
 
@@ -930,8 +899,7 @@ void MemStore::_do_transaction(Transaction& t)
 
     case Transaction::OP_SETALLOCHINT:
       {
-        coll_t cid = i.get_cid(op->cid);
-        ghobject_t oid = i.get_oid(op->oid);
+        r = 0;
       }
       break;
 
@@ -994,14 +962,8 @@ int MemStore::_touch(coll_t cid, const ghobject_t& oid)
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
-  ObjectRef o = c->get_object(oid);
-  if (!o) {
-    o.reset(new Object);
-    c->object_map[oid] = o;
-    c->object_hash[oid] = o;
-  }
+  c->get_or_create_object(oid);
   return 0;
 }
 
@@ -1016,51 +978,15 @@ int MemStore::_write(coll_t cid, const ghobject_t& oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
-  ObjectRef o = c->get_object(oid);
-  if (!o) {
-    // write implicitly creates a missing object
-    o.reset(new Object);
-    c->object_map[oid] = o;
-    c->object_hash[oid] = o;
-  }
-
-  int old_size = o->data.length();
-  _write_into_bl(bl, offset, &o->data);
-  used_bytes += (o->data.length() - old_size);
+  ObjectRef o = c->get_or_create_object(oid);
+  const ssize_t old_size = o->get_size();
+  o->write(offset, bl);
+  used_bytes += (o->get_size() - old_size);
 
   return 0;
 }
 
-void MemStore::_write_into_bl(const bufferlist& src, unsigned offset,
-			      bufferlist *dst)
-{
-  unsigned len = src.length();
-
-  // before
-  bufferlist newdata;
-  if (dst->length() >= offset) {
-    newdata.substr_of(*dst, 0, offset);
-  } else {
-    newdata.substr_of(*dst, 0, dst->length());
-    bufferptr bp(offset - dst->length());
-    bp.zero();
-    newdata.append(bp);
-  }
-
-  newdata.append(src);
-
-  // after
-  if (dst->length() > offset + len) {
-    bufferlist tail;
-    tail.substr_of(*dst, offset + len, dst->length() - (offset + len));
-    newdata.append(tail);
-  }
-
-  dst->claim(newdata);
-}
-
 int MemStore::_zero(coll_t cid, const ghobject_t& oid,
 		    uint64_t offset, size_t len)
 {
@@ -1079,25 +1005,14 @@ int MemStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size)
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
-  if (o->data.length() > size) {
-    bufferlist bl;
-    bl.substr_of(o->data, 0, size);
-    used_bytes -= o->data.length() - size;
-    o->data.claim(bl);
-  } else if (o->data.length() == size) {
-    // do nothing
-  } else {
-    bufferptr bp(size - o->data.length());
-    bp.zero();
-    used_bytes += bp.length();
-    o->data.append(bp);
-  }
-  return 0;
+  const ssize_t old_size = o->get_size();
+  int r = o->truncate(size);
+  used_bytes += (o->get_size() - old_size);
+  return r;
 }
 
 int MemStore::_remove(coll_t cid, const ghobject_t& oid)
@@ -1108,13 +1023,12 @@ int MemStore::_remove(coll_t cid, const ghobject_t& oid)
     return -ENOENT;
   RWLock::WLocker l(c->lock);
 
-  ObjectRef o = c->get_object(oid);
-  if (!o)
+  auto i = c->object_hash.find(oid);
+  if (i == c->object_hash.end())
     return -ENOENT;
+  c->object_hash.erase(i);
   c->object_map.erase(oid);
-  c->object_hash.erase(oid);
-
-  used_bytes -= o->data.length();
+  used_bytes -= i->second->get_size();
 
   return 0;
 }
@@ -1126,11 +1040,11 @@ int MemStore::_setattrs(coll_t cid, const ghobject_t& oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->xattr_mutex);
   for (map<string,bufferptr>::const_iterator p = aset.begin(); p != aset.end(); ++p)
     o->xattr[p->first] = p->second;
   return 0;
@@ -1142,14 +1056,15 @@ int MemStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name)
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
-  if (!o->xattr.count(name))
+  std::lock_guard<std::mutex> lock(o->xattr_mutex);
+  auto i = o->xattr.find(name);
+  if (i == o->xattr.end())
     return -ENODATA;
-  o->xattr.erase(name);
+  o->xattr.erase(i);
   return 0;
 }
 
@@ -1159,11 +1074,11 @@ int MemStore::_rmattrs(coll_t cid, const ghobject_t& oid)
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->xattr_mutex);
   o->xattr.clear();
   return 0;
 }
@@ -1176,19 +1091,22 @@ int MemStore::_clone(coll_t cid, const ghobject_t& oldoid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef oo = c->get_object(oldoid);
   if (!oo)
     return -ENOENT;
-  ObjectRef no = c->get_object(newoid);
-  if (!no) {
-    no.reset(new Object);
-    c->object_map[newoid] = no;
-    c->object_hash[newoid] = no;
-  }
-  used_bytes += oo->data.length() - no->data.length();
-  no->data = oo->data;
+  ObjectRef no = c->get_or_create_object(newoid);
+  used_bytes += oo->get_size() - no->get_size();
+  no->clone(oo.get(), 0, oo->get_size(), 0);
+
+  // take xattr and omap locks with std::lock()
+  std::unique_lock<std::mutex>
+      ox_lock(oo->xattr_mutex, std::defer_lock),
+      nx_lock(no->xattr_mutex, std::defer_lock),
+      oo_lock(oo->omap_mutex, std::defer_lock),
+      no_lock(no->omap_mutex, std::defer_lock);
+  std::lock(ox_lock, nx_lock, oo_lock, no_lock);
+
   no->omap_header = oo->omap_header;
   no->omap = oo->omap;
   no->xattr = oo->xattr;
@@ -1206,27 +1124,19 @@ int MemStore::_clone_range(coll_t cid, const ghobject_t& oldoid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef oo = c->get_object(oldoid);
   if (!oo)
     return -ENOENT;
-  ObjectRef no = c->get_object(newoid);
-  if (!no) {
-    no.reset(new Object);
-    c->object_map[newoid] = no;
-    c->object_hash[newoid] = no;
-  }
-  if (srcoff >= oo->data.length())
+  ObjectRef no = c->get_or_create_object(newoid);
+  if (srcoff >= oo->get_size())
     return 0;
-  if (srcoff + len >= oo->data.length())
-    len = oo->data.length() - srcoff;
-  bufferlist bl;
-  bl.substr_of(oo->data, srcoff, len);
+  if (srcoff + len >= oo->get_size())
+    len = oo->get_size() - srcoff;
 
-  int old_size = no->data.length();
-  _write_into_bl(bl, dstoff, &no->data);
-  used_bytes += (no->data.length() - old_size);
+  const ssize_t old_size = no->get_size();
+  no->clone(oo.get(), srcoff, len, dstoff);
+  used_bytes += (no->get_size() - old_size);
 
   return len;
 }
@@ -1237,11 +1147,11 @@ int MemStore::_omap_clear(coll_t cid, const ghobject_t &oid)
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   o->omap.clear();
   o->omap_header.clear();
   return 0;
@@ -1254,11 +1164,11 @@ int MemStore::_omap_setkeys(coll_t cid, const ghobject_t &oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   for (map<string,bufferlist>::const_iterator p = aset.begin(); p != aset.end(); ++p)
     o->omap[p->first] = p->second;
   return 0;
@@ -1271,11 +1181,11 @@ int MemStore::_omap_rmkeys(coll_t cid, const ghobject_t &oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p)
     o->omap.erase(*p);
   return 0;
@@ -1289,15 +1199,14 @@ int MemStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
-  map<string,bufferlist>::iterator p = o->omap.upper_bound(first);
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
+  map<string,bufferlist>::iterator p = o->omap.lower_bound(first);
   map<string,bufferlist>::iterator e = o->omap.lower_bound(last);
-  while (p != e)
-    o->omap.erase(p++);
+  o->omap.erase(p, e);
   return 0;
 }
 
@@ -1308,11 +1217,11 @@ int MemStore::_omap_setheader(coll_t cid, const ghobject_t &oid,
   CollectionRef c = get_collection(cid);
   if (!c)
     return -ENOENT;
-  RWLock::WLocker l(c->lock);
 
   ObjectRef o = c->get_object(oid);
   if (!o)
     return -ENOENT;
+  std::lock_guard<std::mutex> lock(o->omap_mutex);
   o->omap_header = bl;
   return 0;
 }
@@ -1321,10 +1230,10 @@ int MemStore::_create_collection(coll_t cid)
 {
   dout(10) << __func__ << " " << cid << dendl;
   RWLock::WLocker l(coll_lock);
-  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
-  if (cp != coll_map.end())
+  auto result = coll_map.insert(std::make_pair(cid, CollectionRef()));
+  if (!result.second)
     return -EEXIST;
-  coll_map[cid].reset(new Collection);
+  result.first->second.reset(new Collection(cct));
   return 0;
 }
 
@@ -1425,7 +1334,7 @@ int MemStore::_split_collection(coll_t cid, uint32_t bits, uint32_t match,
   RWLock::WLocker l1(MIN(&(*sc), &(*dc))->lock);
   RWLock::WLocker l2(MAX(&(*sc), &(*dc))->lock);
 
-  map<ghobject_t,ObjectRef>::iterator p = sc->object_map.begin();
+  map<ghobject_t,ObjectRef,ghobject_t::BitwiseComparator>::iterator p = sc->object_map.begin();
   while (p != sc->object_map.end()) {
     if (p->first.match(bits, match)) {
       dout(20) << " moving " << p->first << dendl;
@@ -1440,3 +1349,235 @@ int MemStore::_split_collection(coll_t cid, uint32_t bits, uint32_t match,
 
   return 0;
 }
+
+// BufferlistObject
+int MemStore::BufferlistObject::read(uint64_t offset, uint64_t len,
+                                     bufferlist &bl)
+{
+  std::lock_guard<Spinlock> lock(mutex);
+  bl.substr_of(data, offset, len);
+  return bl.length();
+}
+
+int MemStore::BufferlistObject::write(uint64_t offset, const bufferlist &src)
+{
+  unsigned len = src.length();
+
+  std::lock_guard<Spinlock> lock(mutex);
+
+  // before
+  bufferlist newdata;
+  if (get_size() >= offset) {
+    newdata.substr_of(data, 0, offset);
+  } else {
+    newdata.substr_of(data, 0, get_size());
+    bufferptr bp(offset - get_size());
+    bp.zero();
+    newdata.append(bp);
+  }
+
+  newdata.append(src);
+
+  // after
+  if (get_size() > offset + len) {
+    bufferlist tail;
+    tail.substr_of(data, offset + len, get_size() - (offset + len));
+    newdata.append(tail);
+  }
+
+  data.claim(newdata);
+  return 0;
+}
+
+int MemStore::BufferlistObject::clone(Object *src, uint64_t srcoff,
+                                      uint64_t len, uint64_t dstoff)
+{
+  auto srcbl = dynamic_cast<BufferlistObject*>(src);
+  if (srcbl == nullptr)
+    return -ENOTSUP;
+
+  bufferlist bl;
+  {
+    std::lock_guard<Spinlock> lock(srcbl->mutex);
+    if (srcoff == dstoff && len == src->get_size()) {
+      data = srcbl->data;
+      return 0;
+    }
+    bl.substr_of(srcbl->data, srcoff, len);
+  }
+  return write(dstoff, bl);
+}
+
+int MemStore::BufferlistObject::truncate(uint64_t size)
+{
+  std::lock_guard<Spinlock> lock(mutex);
+  if (get_size() > size) {
+    bufferlist bl;
+    bl.substr_of(data, 0, size);
+    data.claim(bl);
+  } else if (get_size() == size) {
+    // do nothing
+  } else {
+    bufferptr bp(size - get_size());
+    bp.zero();
+    data.append(bp);
+  }
+  return 0;
+}
+
+// PageSetObject
+
+#if defined(__GLIBCXX__)
+// use a thread-local vector for the pages returned by PageSet, so we
+// can avoid allocations in read/write()
+thread_local PageSet::page_vector MemStore::PageSetObject::tls_pages;
+#define DEFINE_PAGE_VECTOR(name)
+#else
+#define DEFINE_PAGE_VECTOR(name) PageSet::page_vector name;
+#endif
+
+int MemStore::PageSetObject::read(uint64_t offset, uint64_t len, bufferlist& bl)
+{
+  const auto start = offset;
+  const auto end = offset + len;
+  auto remaining = len;
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  data.get_range(offset, len, tls_pages);
+
+  // allocate a buffer for the data
+  buffer::ptr buf(len);
+
+  auto p = tls_pages.begin();
+  while (remaining) {
+    // no more pages in range
+    if (p == tls_pages.end() || (*p)->offset >= end) {
+      buf.zero(offset - start, remaining);
+      break;
+    }
+    auto page = *p;
+
+    // fill any holes between pages with zeroes
+    if (page->offset > offset) {
+      const auto count = std::min(remaining, page->offset - offset);
+      buf.zero(offset - start, count);
+      remaining -= count;
+      offset = page->offset;
+      if (!remaining)
+        break;
+    }
+
+    // read from page
+    const auto page_offset = offset - page->offset;
+    const auto count = min(remaining, data.get_page_size() - page_offset);
+
+    buf.copy_in(offset - start, count, page->data + page_offset);
+
+    remaining -= count;
+    offset += count;
+
+    ++p;
+  }
+
+  tls_pages.clear(); // drop page refs
+
+  bl.append(buf);
+  return len;
+}
+
+int MemStore::PageSetObject::write(uint64_t offset, const bufferlist &src)
+{
+  unsigned len = src.length();
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  // make sure the page range is allocated
+  data.alloc_range(offset, src.length(), tls_pages);
+
+  auto page = tls_pages.begin();
+
+  // XXX: cast away the const because bufferlist doesn't have a const_iterator
+  auto p = const_cast<bufferlist&>(src).begin();
+  while (len > 0) {
+    unsigned page_offset = offset - (*page)->offset;
+    unsigned pageoff = data.get_page_size() - page_offset;
+    unsigned count = min(len, pageoff);
+    p.copy(count, (*page)->data + page_offset);
+    offset += count;
+    len -= count;
+    if (count == pageoff)
+      ++page;
+  }
+  if (data_len < offset)
+    data_len = offset;
+  tls_pages.clear(); // drop page refs
+  return 0;
+}
+
+int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff,
+                                   uint64_t len, uint64_t dstoff)
+{
+  const int64_t delta = dstoff - srcoff;
+
+  auto &src_data = static_cast<PageSetObject*>(src)->data;
+  const uint64_t src_page_size = src_data.get_page_size();
+
+  auto &dst_data = data;
+  const auto dst_page_size = dst_data.get_page_size();
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  PageSet::page_vector dst_pages;
+
+  while (len) {
+    const auto count = std::min(len, (uint64_t)src_page_size * 16);
+    src_data.get_range(srcoff, count, tls_pages);
+
+    for (auto &src_page : tls_pages) {
+      auto sbegin = std::max(srcoff, src_page->offset);
+      auto send = std::min(srcoff + count, src_page->offset + src_page_size);
+      dst_data.alloc_range(sbegin + delta, send - sbegin, dst_pages);
+
+      // copy data from src page to dst pages
+      for (auto &dst_page : dst_pages) {
+        auto dbegin = std::max(sbegin + delta, dst_page->offset);
+        auto dend = std::min(send + delta, dst_page->offset + dst_page_size);
+
+        std::copy(src_page->data + (dbegin - delta) - src_page->offset,
+                  src_page->data + (dend - delta) - src_page->offset,
+                  dst_page->data + dbegin - dst_page->offset);
+      }
+      dst_pages.clear(); // drop page refs
+      srcoff += count;
+      dstoff += count;
+      len -= count;
+    }
+    tls_pages.clear(); // drop page refs
+  }
+
+  // update object size
+  if (data_len < dstoff + len)
+    data_len = dstoff + len;
+  return 0;
+}
+
+int MemStore::PageSetObject::truncate(uint64_t size)
+{
+  data.free_pages_after(size);
+  data_len = size;
+
+  const auto page_size = data.get_page_size();
+  const auto page_offset = size & ~(page_size-1);
+  if (page_offset == size)
+    return 0;
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  // write zeroes to the rest of the last page
+  data.get_range(page_offset, page_size, tls_pages);
+  if (tls_pages.empty())
+    return 0;
+
+  auto page = tls_pages.begin();
+  auto data = (*page)->data;
+  std::fill(data + (size - page_offset), data + page_size, 0);
+  tls_pages.clear(); // drop page ref
+  return 0;
+}
diff --git a/src/os/MemStore.h b/src/os/MemStore.h
index 4459a6a..734195f 100644
--- a/src/os/MemStore.h
+++ b/src/os/MemStore.h
@@ -16,39 +16,58 @@
 #ifndef CEPH_MEMSTORE_H
 #define CEPH_MEMSTORE_H
 
-#include "include/assert.h"
+#include <mutex>
+#include <boost/intrusive_ptr.hpp>
+
 #include "include/unordered_map.h"
 #include "include/memory.h"
+#include "include/Spinlock.h"
 #include "common/Finisher.h"
+#include "common/RefCountedObj.h"
 #include "common/RWLock.h"
 #include "ObjectStore.h"
+#include "PageSet.h"
+#include "include/assert.h"
 
 class MemStore : public ObjectStore {
+private:
+  CephContext *const cct;
+
 public:
-  struct Object {
-    bufferlist data;
+  struct Object : public RefCountedObject {
+    std::mutex xattr_mutex;
+    std::mutex omap_mutex;
     map<string,bufferptr> xattr;
     bufferlist omap_header;
     map<string,bufferlist> omap;
 
-    void encode(bufferlist& bl) const {
-      ENCODE_START(1, 1, bl);
-      ::encode(data, bl);
+    typedef boost::intrusive_ptr<Object> Ref;
+    friend void intrusive_ptr_add_ref(Object *o) { o->get(); }
+    friend void intrusive_ptr_release(Object *o) { o->put(); }
+
+    // interface for object data
+    virtual size_t get_size() const = 0;
+    virtual int read(uint64_t offset, uint64_t len, bufferlist &bl) = 0;
+    virtual int write(uint64_t offset, const bufferlist &bl) = 0;
+    virtual int clone(Object *src, uint64_t srcoff, uint64_t len,
+                      uint64_t dstoff) = 0;
+    virtual int truncate(uint64_t offset) = 0;
+    virtual void encode(bufferlist& bl) const = 0;
+    virtual void decode(bufferlist::iterator& p) = 0;
+
+    void encode_base(bufferlist& bl) const {
       ::encode(xattr, bl);
       ::encode(omap_header, bl);
       ::encode(omap, bl);
-      ENCODE_FINISH(bl);
     }
-    void decode(bufferlist::iterator& p) {
-      DECODE_START(1, p);
-      ::decode(data, p);
+    void decode_base(bufferlist::iterator& p) {
       ::decode(xattr, p);
       ::decode(omap_header, p);
       ::decode(omap, p);
-      DECODE_FINISH(p);
     }
+
     void dump(Formatter *f) const {
-      f->dump_int("data_len", data.length());
+      f->dump_int("data_len", get_size());
       f->dump_int("omap_header_len", omap_header.length());
 
       f->open_array_section("xattrs");
@@ -74,32 +93,115 @@ public:
       f->close_section();
     }
   };
-  typedef ceph::shared_ptr<Object> ObjectRef;
+  typedef Object::Ref ObjectRef;
+
+  struct BufferlistObject : public Object {
+    Spinlock mutex;
+    bufferlist data;
+
+    size_t get_size() const override { return data.length(); }
+
+    int read(uint64_t offset, uint64_t len, bufferlist &bl) override;
+    int write(uint64_t offset, const bufferlist &bl) override;
+    int clone(Object *src, uint64_t srcoff, uint64_t len,
+              uint64_t dstoff) override;
+    int truncate(uint64_t offset) override;
+
+    void encode(bufferlist& bl) const override {
+      ENCODE_START(1, 1, bl);
+      ::encode(data, bl);
+      encode_base(bl);
+      ENCODE_FINISH(bl);
+    }
+    void decode(bufferlist::iterator& p) override {
+      DECODE_START(1, p);
+      ::decode(data, p);
+      decode_base(p);
+      DECODE_FINISH(p);
+    }
+  };
+
+  struct PageSetObject : public Object {
+    PageSet data;
+    uint64_t data_len;
+#if defined(__GLIBCXX__)
+    // use a thread-local vector for the pages returned by PageSet, so we
+    // can avoid allocations in read/write()
+    static thread_local PageSet::page_vector tls_pages;
+#endif
+
+    PageSetObject(size_t page_size) : data(page_size), data_len(0) {}
+
+    size_t get_size() const override { return data_len; }
+
+    int read(uint64_t offset, uint64_t len, bufferlist &bl) override;
+    int write(uint64_t offset, const bufferlist &bl) override;
+    int clone(Object *src, uint64_t srcoff, uint64_t len,
+              uint64_t dstoff) override;
+    int truncate(uint64_t offset) override;
 
-  struct Collection {
+    void encode(bufferlist& bl) const override {
+      ENCODE_START(1, 1, bl);
+      ::encode(data_len, bl);
+      data.encode(bl);
+      encode_base(bl);
+      ENCODE_FINISH(bl);
+    }
+    void decode(bufferlist::iterator& p) override {
+      DECODE_START(1, p);
+      ::decode(data_len, p);
+      data.decode(p);
+      decode_base(p);
+      DECODE_FINISH(p);
+    }
+  };
+
+  struct Collection : public RefCountedObject {
+    CephContext *cct;
+    bool use_page_set;
     ceph::unordered_map<ghobject_t, ObjectRef> object_hash;  ///< for lookup
-    map<ghobject_t, ObjectRef> object_map;        ///< for iteration
+    map<ghobject_t, ObjectRef,ghobject_t::BitwiseComparator> object_map;        ///< for iteration
     map<string,bufferptr> xattr;
     RWLock lock;   ///< for object_{map,hash}
 
+    typedef boost::intrusive_ptr<Collection> Ref;
+    friend void intrusive_ptr_add_ref(Collection *c) { c->get(); }
+    friend void intrusive_ptr_release(Collection *c) { c->put(); }
+
+    ObjectRef create_object() const {
+      if (use_page_set)
+        return new PageSetObject(cct->_conf->memstore_page_size);
+      return new BufferlistObject();
+    }
+
     // NOTE: The lock only needs to protect the object_map/hash, not the
     // contents of individual objects.  The osd is already sequencing
     // reads and writes, so we will never see them concurrently at this
     // level.
 
     ObjectRef get_object(ghobject_t oid) {
-      ceph::unordered_map<ghobject_t,ObjectRef>::iterator o = object_hash.find(oid);
+      RWLock::RLocker l(lock);
+      auto o = object_hash.find(oid);
       if (o == object_hash.end())
 	return ObjectRef();
       return o->second;
     }
 
+    ObjectRef get_or_create_object(ghobject_t oid) {
+      RWLock::WLocker l(lock);
+      auto result = object_hash.emplace(oid, ObjectRef());
+      if (result.second)
+        object_map[oid] = result.first->second = create_object();
+      return result.first->second;
+    }
+
     void encode(bufferlist& bl) const {
       ENCODE_START(1, 1, bl);
       ::encode(xattr, bl);
+      ::encode(use_page_set, bl);
       uint32_t s = object_map.size();
       ::encode(s, bl);
-      for (map<ghobject_t, ObjectRef>::const_iterator p = object_map.begin();
+      for (map<ghobject_t, ObjectRef,ghobject_t::BitwiseComparator>::const_iterator p = object_map.begin();
 	   p != object_map.end();
 	   ++p) {
 	::encode(p->first, bl);
@@ -110,12 +212,13 @@ public:
     void decode(bufferlist::iterator& p) {
       DECODE_START(1, p);
       ::decode(xattr, p);
+      ::decode(use_page_set, p);
       uint32_t s;
       ::decode(s, p);
       while (s--) {
 	ghobject_t k;
 	::decode(k, p);
-	ObjectRef o(new Object);
+	auto o = create_object();
 	o->decode(p);
 	object_map.insert(make_pair(k, o));
 	object_hash.insert(make_pair(k, o));
@@ -125,18 +228,20 @@ public:
 
     uint64_t used_bytes() const {
       uint64_t result = 0;
-      for (map<ghobject_t, ObjectRef>::const_iterator p = object_map.begin();
+      for (map<ghobject_t, ObjectRef,ghobject_t::BitwiseComparator>::const_iterator p = object_map.begin();
 	   p != object_map.end();
 	   ++p) {
-        result += p->second->data.length();
+        result += p->second->get_size();
       }
 
       return result;
     }
 
-    Collection() : lock("MemStore::Collection::lock") {}
+    Collection(CephContext *cct)
+      : cct(cct), use_page_set(cct->_conf->memstore_page_set),
+        lock("MemStore::Collection::lock") {}
   };
-  typedef ceph::shared_ptr<Collection> CollectionRef;
+  typedef Collection::Ref CollectionRef;
 
 private:
   class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
@@ -148,35 +253,35 @@ private:
       : c(c), o(o), it(o->omap.begin()) {}
 
     int seek_to_first() {
-      RWLock::RLocker l(c->lock);
+      std::lock_guard<std::mutex>(o->omap_mutex);
       it = o->omap.begin();
       return 0;
     }
     int upper_bound(const string &after) {
-      RWLock::RLocker l(c->lock);
+      std::lock_guard<std::mutex>(o->omap_mutex);
       it = o->omap.upper_bound(after);
       return 0;
     }
     int lower_bound(const string &to) {
-      RWLock::RLocker l(c->lock);
+      std::lock_guard<std::mutex>(o->omap_mutex);
       it = o->omap.lower_bound(to);
       return 0;
     }
     bool valid() {
-      RWLock::RLocker l(c->lock);
+      std::lock_guard<std::mutex>(o->omap_mutex);
       return it != o->omap.end();      
     }
     int next() {
-      RWLock::RLocker l(c->lock);
+      std::lock_guard<std::mutex>(o->omap_mutex);
       ++it;
       return 0;
     }
     string key() {
-      RWLock::RLocker l(c->lock);
+      std::lock_guard<std::mutex>(o->omap_mutex);
       return it->first;
     }
     bufferlist value() {
-      RWLock::RLocker l(c->lock);
+      std::lock_guard<std::mutex>(o->omap_mutex);
       return it->second;
     }
     int status() {
@@ -197,8 +302,6 @@ private:
 
   void _do_transaction(Transaction& t);
 
-  void _write_into_bl(const bufferlist& src, unsigned offset, bufferlist *dst);
-
   int _touch(coll_t cid, const ghobject_t& oid);
   int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len,
 	      const bufferlist& bl, uint32_t fadvsie_flags = 0);
@@ -238,6 +341,7 @@ private:
 public:
   MemStore(CephContext *cct, const string& path)
     : ObjectStore(path),
+      cct(cct),
       coll_lock("MemStore::coll_lock"),
       apply_lock("MemStore::apply_lock"),
       finisher(cct),
@@ -245,7 +349,6 @@ public:
       sharded(false) {}
   ~MemStore() { }
 
-  bool need_journal() { return false; };
   int peek_journal_fsid(uuid_d *fsid);
 
   bool test_mount_in_use() {
@@ -266,6 +369,15 @@ public:
   int mkjournal() {
     return 0;
   }
+  bool wants_journal() {
+    return false;
+  }
+  bool allows_journal() {
+    return false;
+  }
+  bool needs_journal() {
+    return false;
+  }
 
   bool sharded;
   void set_allow_sharded_objects() {
@@ -298,12 +410,9 @@ public:
   int list_collections(vector<coll_t>& ls);
   bool collection_exists(coll_t c);
   bool collection_empty(coll_t c);
-  int collection_list(coll_t cid, vector<ghobject_t>& o);
-  int collection_list_partial(coll_t cid, ghobject_t start,
-			      int min, int max, snapid_t snap, 
-			      vector<ghobject_t> *ls, ghobject_t *next);
-  int collection_list_range(coll_t cid, ghobject_t start, ghobject_t end,
-			    snapid_t seq, vector<ghobject_t> *ls);
+  int collection_list(coll_t cid, ghobject_t start, ghobject_t end,
+		      bool sort_bitwise, int max,
+		      vector<ghobject_t> *ls, ghobject_t *next);
 
   int omap_get(
     coll_t cid,                ///< [in] Collection containing oid
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index 5869be5..8e8886b 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -21,6 +21,10 @@
 #include "KeyValueStore.h"
 #include "common/safe_io.h"
 
+#if defined(HAVE_LIBAIO)
+#include "newstore/NewStore.h"
+#endif
+
 ObjectStore *ObjectStore::create(CephContext *cct,
 				 const string& type,
 				 const string& data,
@@ -37,6 +41,12 @@ ObjectStore *ObjectStore::create(CephContext *cct,
       cct->check_experimental_feature_enabled("keyvaluestore")) {
     return new KeyValueStore(data);
   }
+#if defined(HAVE_LIBAIO)
+  if (type == "newstore" &&
+      cct->check_experimental_feature_enabled("newstore")) {
+    return new NewStore(cct, data);
+  }
+#endif
   return NULL;
 }
 
@@ -113,49 +123,3 @@ int ObjectStore::queue_transactions(
   return queue_transactions(osr, tls, _onreadable, _oncommit,
 			    onreadable_sync, op);
 }
-
-int ObjectStore::collection_list(coll_t c, vector<hobject_t>& o)
-{
-  vector<ghobject_t> go;
-  int ret = collection_list(c, go);
-  if (ret == 0) {
-    o.reserve(go.size());
-    for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; ++i)
-      o.push_back(i->hobj);
-  }
-  return ret;
-}
-
-int ObjectStore::collection_list_partial(coll_t c, hobject_t start,
-			      int min, int max, snapid_t snap,
-				      vector<hobject_t> *ls, hobject_t *next)
-{
-  vector<ghobject_t> go;
-  ghobject_t gnext, gstart(start);
-  int ret = collection_list_partial(c, gstart, min, max, snap, &go, &gnext);
-  if (ret == 0) {
-    *next = gnext.hobj;
-    ls->reserve(go.size());
-    for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; ++i)
-      ls->push_back(i->hobj);
-  }
-  return ret;
-}
-
-int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
-			    snapid_t seq, vector<hobject_t> *ls)
-{
-  vector<ghobject_t> go;
-  // Starts with the smallest shard id and generation to
-  // make sure the result list has the marker object
-  ghobject_t gstart(start, 0, shard_id_t(0));
-  // Exclusive end, choose the smallest end ghobject
-  ghobject_t gend(end, 0, shard_id_t(0));
-  int ret = collection_list_range(c, gstart, gend, seq, &go);
-  if (ret == 0) {
-    ls->reserve(go.size());
-    for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; ++i)
-      ls->push_back(i->hobj);
-  }
-  return ret;
-}
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index e0c12af..65818ff 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -108,7 +108,7 @@ public:
 			     const string& type,
 			     const string& data,
 			     const string& journal,
-			     osflagbits_t flag = 0);
+			     osflagbits_t flags = 0);
 
   Logger *logger;
 
@@ -137,15 +137,15 @@ public:
    * ABC for Sequencer implementation, private to the ObjectStore derived class.
    * created in ...::queue_transaction(s)
    */
-  struct Sequencer_impl {
+  struct Sequencer_impl : public RefCountedObject {
     virtual void flush() = 0;
 
     /**
      * Async flush_commit
      *
      * There are two cases:
-     * 1) sequencer is currently idle: the method returns true and
-     *    c is deleted
+     * 1) sequencer is currently idle: the method returns true.  c is
+     *    not touched.
      * 2) sequencer is not idle: the method returns false and c is
      *    called asyncronously with a value of 0 once all transactions
      *    queued on this sequencer prior to the call have been applied
@@ -155,20 +155,21 @@ public:
       Context *c ///< [in] context to call upon flush/commit
       ) = 0; ///< @return true if idle, false otherwise
 
+    Sequencer_impl() : RefCountedObject(NULL, 0) {}
     virtual ~Sequencer_impl() {}
   };
+  typedef boost::intrusive_ptr<Sequencer_impl> Sequencer_implRef;
 
   /**
    * External (opaque) sequencer implementation
    */
   struct Sequencer {
     string name;
-    Sequencer_impl *p;
+    Sequencer_implRef p;
 
     Sequencer(string n)
       : name(n), p(NULL) {}
     ~Sequencer() {
-      delete p;
     }
 
     /// return a unique string identifier for this sequencer
@@ -184,7 +185,6 @@ public:
     /// @see Sequencer_impl::flush_commit()
     bool flush_commit(Context *c) {
       if (!p) {
-	delete c;
 	return true;
       } else {
 	return p->flush_commit(c);
@@ -434,7 +434,7 @@ public:
     bufferlist tbl;
 
     map<coll_t, __le32> coll_index;
-    map<ghobject_t, __le32> object_index;
+    map<ghobject_t, __le32, ghobject_t::BitwiseComparator> object_index;
 
     __le32 coll_id;
     __le32 object_id;
@@ -660,7 +660,7 @@ public:
       }
 
       vector<__le32> om(other.object_index.size());
-      map<ghobject_t, __le32>::iterator object_index_p;
+      map<ghobject_t, __le32, ghobject_t::BitwiseComparator>::iterator object_index_p;
       for (object_index_p = other.object_index.begin();
            object_index_p != other.object_index.end();
            ++object_index_p) {
@@ -777,9 +777,11 @@ public:
 
       bufferlist::iterator data_bl_p;
 
+    public:
       vector<coll_t> colls;
       vector<ghobject_t> objects;
 
+    private:
       iterator(Transaction *t)
         : t(t),
 	  data_bl_p(t->data_bl.begin()),
@@ -796,7 +798,7 @@ public:
           colls[coll_index_p->second] = coll_index_p->first;
         }
 
-        map<ghobject_t, __le32>::iterator object_index_p;
+        map<ghobject_t, __le32, ghobject_t::BitwiseComparator>::iterator object_index_p;
         for (object_index_p = t->object_index.begin();
              object_index_p != t->object_index.end();
              ++object_index_p) {
@@ -838,11 +840,11 @@ public:
         ::decode(keys, data_bl_p);
       }
 
-      ghobject_t get_oid(__le32 oid_id) {
+      const ghobject_t &get_oid(__le32 oid_id) {
         assert(oid_id < objects.size());
         return objects[oid_id];
       }
-      coll_t get_cid(__le32 cid_id) {
+      const coll_t &get_cid(__le32 cid_id) {
         assert(cid_id < colls.size());
         return colls[cid_id];
       }
@@ -872,7 +874,6 @@ private:
     Op* _get_next_op() {
       if (op_ptr.length() == 0 || op_ptr.offset() >= op_ptr.length()) {
         op_ptr = bufferptr(sizeof(Op) * OPS_PER_PTR);
-	op_ptr.zero();
       }
       bufferptr ptr(op_ptr, 0, sizeof(Op));
       op_bl.append(ptr);
@@ -880,6 +881,7 @@ private:
       op_ptr.set_offset(op_ptr.offset() + sizeof(Op));
 
       char* p = ptr.c_str();
+      memset(p, 0, sizeof(Op));
       return reinterpret_cast<Op*>(p);
     }
     __le32 _get_coll_id(const coll_t& coll) {
@@ -892,7 +894,7 @@ private:
       return index_id;
     }
     __le32 _get_object_id(const ghobject_t& oid) {
-      map<ghobject_t, __le32>::iterator o = object_index.find(oid);
+      map<ghobject_t, __le32, ghobject_t::BitwiseComparator>::iterator o = object_index.find(oid);
       if (o != object_index.end())
         return o->second;
 
@@ -1190,7 +1192,7 @@ public:
       data.ops++;
     }
     /// Create the collection
-    void create_collection(coll_t cid) {
+    void create_collection(coll_t cid, int bits) {
       if (use_tbl) {
         __u32 op = OP_MKCOLL;
         ::encode(op, tbl);
@@ -1199,6 +1201,7 @@ public:
         Op* _op = _get_next_op();
         _op->op = OP_MKCOLL;
         _op->cid = _get_coll_id(cid);
+	_op->split_bits = bits;
       }
       data.ops++;
     }
@@ -1241,7 +1244,8 @@ public:
       }
       data.ops++;
     }
-    void collection_move(coll_t cid, coll_t oldcid, const ghobject_t& oid) {
+    void collection_move(coll_t cid, coll_t oldcid, const ghobject_t& oid)
+      __attribute__ ((deprecated)) {
       // NOTE: we encode this as a fixed combo of ADD + REMOVE.  they
       // always appear together, so this is effectively a single MOVE.
       if (use_tbl) {
@@ -1666,19 +1670,11 @@ public:
   };
 
   // synchronous wrappers
-  unsigned apply_transaction(Transaction& t, Context *ondisk=0) {
-    list<Transaction*> tls;
-    tls.push_back(&t);
-    return apply_transactions(NULL, tls, ondisk);
-  }
   unsigned apply_transaction(Sequencer *osr, Transaction& t, Context *ondisk=0) {
     list<Transaction*> tls;
     tls.push_back(&t);
     return apply_transactions(osr, tls, ondisk);
   }
-  unsigned apply_transactions(list<Transaction*>& tls, Context *ondisk=0) {
-    return apply_transactions(NULL, tls, ondisk);
-  }
   unsigned apply_transactions(Sequencer *osr, list<Transaction*>& tls, Context *ondisk=0);
 
   int queue_transaction_and_cleanup(Sequencer *osr, Transaction* t,
@@ -1761,22 +1757,21 @@ public:
   virtual unsigned get_max_attr_name_length() = 0;
   virtual int mkfs() = 0;  // wipe
   virtual int mkjournal() = 0; // journal only
+  virtual bool needs_journal() = 0;  //< requires a journal
+  virtual bool wants_journal() = 0;  //< prefers a journal
+  virtual bool allows_journal() = 0; //< allows a journal
   virtual void set_allow_sharded_objects() = 0;
   virtual bool get_allow_sharded_objects() = 0;
 
+  virtual bool can_sort_nibblewise() {
+    return false;   // assume a backend cannot, unless it says otherwise
+  }
+
   virtual int statfs(struct statfs *buf) = 0;
 
   virtual void collect_metadata(map<string,string> *pm) { }
 
   /**
-   * check whether need journal device
-   *
-   * It's not constant for backend store. FileStore could have journaless mode
-   * and KeyValueStore could have journal device for special backend.
-   */
-  virtual bool need_journal() = 0;
-
-  /**
    * check the journal uuid/fsid, without opening
    */
   virtual int peek_journal_fsid(uuid_d *fsid) = 0;
@@ -1813,14 +1808,7 @@ public:
 			std::string *value);
 
   /**
-   * get ideal min value for collection_list_partial()
-   *
-   * default to some arbitrary values; the implementation will override.
-   */
-  virtual int get_ideal_list_min() { return 32; }
-
-  /**
-   * get ideal max value for collection_list_partial()
+   * get ideal max value for collection_list()
    *
    * default to some arbitrary values; the implementation will override.
    */
@@ -2016,7 +2004,7 @@ public:
    * collection_getattrs - get all xattrs of a collection
    *
    * @param cid collection name
-   * @param asert map of keys and buffers that contain the values
+   * @param aset map of keys and buffers that contain the values
    * @returns 0 on success, negative error code on failure
    */
   virtual int collection_getattrs(coll_t cid, map<string,bufferptr> &aset)
@@ -2033,52 +2021,21 @@ public:
   virtual bool collection_empty(coll_t c) = 0;
 
   /**
-   * collection_list - get all objects of a collection in sorted order
-   *
-   * @param c collection name
-   * @param o [out] list of objects
-   * @returns 0 on success, negative error code on failure
-   */
-  virtual int collection_list(coll_t c, vector<ghobject_t>& o) = 0;
-
-  /**
-   * list partial contents of collection relative to a hash offset/position
-   *
-   * @param c collection
-   * @param start list objects that sort >= this value
-   * @param min return at least this many results, unless we reach the end
-   * @param max return no more than this many results
-   * @param snapid return no objects with snap < snapid
-   * @param ls [out] result
-   * @param next [out] next item sorts >= this value
-   * @return zero on success, or negative error
-   */
-  virtual int collection_list_partial(coll_t c, ghobject_t start,
-				      int min, int max, snapid_t snap,
-				      vector<ghobject_t> *ls, ghobject_t *next) = 0;
-
-  /**
-   * list contents of a collection that fall in the range [start, end)
+   * list contents of a collection that fall in the range [start, end) and no more than a specified many result
    *
    * @param c collection
    * @param start list object that sort >= this value
    * @param end list objects that sort < this value
-   * @param snapid return no objects with snap < snapid
+   * @param sort_bitwise sort bitwise (instead of legacy nibblewise)
+   * @param max return no more than this many results
+   * @param seq return no objects with snap < seq
    * @param ls [out] result
+   * @param next [out] next item sorts >= this value
    * @return zero on success, or negative error
    */
-  virtual int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
-	                            snapid_t seq, vector<ghobject_t> *ls) = 0;
-
-  //TODO: Remove
-  int collection_list(coll_t c, vector<hobject_t>& o);
-
-  int collection_list_partial(coll_t c, hobject_t start,
-				      int min, int max, snapid_t snap,
-				      vector<hobject_t> *ls, hobject_t *next);
-
-  int collection_list_range(coll_t c, hobject_t start, hobject_t end,
-	                            snapid_t seq, vector<hobject_t> *ls);
+  virtual int collection_list(coll_t c, ghobject_t start, ghobject_t end,
+			      bool sort_bitwise, int max,
+			      vector<ghobject_t> *ls, ghobject_t *next) = 0;
 
   /// OMAP
   /// Get omap contents
@@ -2134,10 +2091,8 @@ public:
     const ghobject_t &oid  ///< [in] object
     ) = 0;
 
-  virtual void sync(Context *onsync) {}
-  virtual void sync() {}
-  virtual void flush() {}
-  virtual void sync_and_flush() {}
+
+  virtual int flush_journal() { return -EOPNOTSUPP; }
 
   virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; }
 
@@ -2156,6 +2111,13 @@ public:
 WRITE_CLASS_ENCODER(ObjectStore::Transaction)
 WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData)
 
+static inline void intrusive_ptr_add_ref(ObjectStore::Sequencer_impl *s) {
+  s->get();
+}
+static inline void intrusive_ptr_release(ObjectStore::Sequencer_impl *s) {
+  s->put();
+}
+
 ostream& operator<<(ostream& out, const ObjectStore::Sequencer& s);
 
 #endif
diff --git a/src/os/PageSet.h b/src/os/PageSet.h
new file mode 100644
index 0000000..b7ef12b
--- /dev/null
+++ b/src/os/PageSet.h
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013- Sage Weil <sage at inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAGESET_H
+#define CEPH_PAGESET_H
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <mutex>
+#include <vector>
+#include <boost/intrusive/avl_set.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/encoding.h"
+#include "include/Spinlock.h"
+
+
+struct Page {
+  char *const data;
+  boost::intrusive::avl_set_member_hook<> hook;
+  uint64_t offset;
+
+  // avoid RefCountedObject because it has a virtual destructor
+  std::atomic<uint16_t> nrefs;
+  void get() { ++nrefs; }
+  void put() { if (--nrefs == 0) delete this; }
+
+  typedef boost::intrusive_ptr<Page> Ref;
+  friend void intrusive_ptr_add_ref(Page *p) { p->get(); }
+  friend void intrusive_ptr_release(Page *p) { p->put(); }
+
+  // key-value comparison functor for avl
+  struct Less {
+    bool operator()(uint64_t offset, const Page &page) const {
+      return offset < page.offset;
+    }
+    bool operator()(const Page &page, uint64_t offset) const {
+      return page.offset < offset;
+    }
+    bool operator()(const Page &lhs, const Page &rhs) const {
+      return lhs.offset < rhs.offset;
+    }
+  };
+  void encode(bufferlist &bl, size_t page_size) const {
+    bl.append(buffer::copy(data, page_size));
+    ::encode(offset, bl);
+  }
+  void decode(bufferlist::iterator &p, size_t page_size) {
+    ::decode_array_nohead(data, page_size, p);
+    ::decode(offset, p);
+  }
+
+  static Ref create(size_t page_size, uint64_t offset = 0) {
+    // allocate the Page and its data in a single buffer
+    auto buffer = new char[page_size + sizeof(Page)];
+    // place the Page structure at the end of the buffer
+    return new (buffer + page_size) Page(buffer, offset);
+  }
+
+  // copy disabled
+  Page(const Page&) = delete;
+  const Page& operator=(const Page&) = delete;
+
+ private: // private constructor, use create() instead
+  Page(char *data, uint64_t offset) : data(data), offset(offset), nrefs(1) {}
+
+  static void operator delete(void *p) {
+    delete[] reinterpret_cast<Page*>(p)->data;
+  }
+};
+
+class PageSet {
+ public:
+  // alloc_range() and get_range() return page refs in a vector
+  typedef std::vector<Page::Ref> page_vector;
+
+ private:
+  // store pages in a boost intrusive avl_set
+  typedef Page::Less page_cmp;
+  typedef boost::intrusive::member_hook<Page,
+          boost::intrusive::avl_set_member_hook<>,
+          &Page::hook> member_option;
+  typedef boost::intrusive::avl_set<Page,
+          boost::intrusive::compare<page_cmp>, member_option> page_set;
+
+  typedef typename page_set::iterator iterator;
+
+  page_set pages;
+  uint64_t page_size;
+
+  typedef Spinlock lock_type;
+  lock_type mutex;
+
+  void free_pages(iterator cur, iterator end) {
+    while (cur != end) {
+      Page *page = &*cur;
+      cur = pages.erase(cur);
+      page->put();
+    }
+  }
+
+  int count_pages(uint64_t offset, uint64_t len) const {
+    // count the overlapping pages
+    int count = 0;
+    if (offset % page_size) {
+      count++;
+      size_t rem = page_size - offset % page_size;
+      len = len <= rem ? 0 : len - rem;
+    }
+    count += len / page_size;
+    if (len % page_size)
+      count++;
+    return count;
+  }
+
+ public:
+  PageSet(size_t page_size) : page_size(page_size) {}
+  PageSet(PageSet &&rhs)
+    : pages(std::move(rhs.pages)), page_size(rhs.page_size) {}
+  ~PageSet() {
+    free_pages(pages.begin(), pages.end());
+  }
+
+  // disable copy
+  PageSet(const PageSet&) = delete;
+  const PageSet& operator=(const PageSet&) = delete;
+
+  bool empty() const { return pages.empty(); }
+  size_t size() const { return pages.size(); }
+  size_t get_page_size() const { return page_size; }
+
+  // allocate all pages that intersect the range [offset,length)
+  void alloc_range(uint64_t offset, uint64_t length, page_vector &range) {
+    // loop in reverse so we can provide hints to avl_set::insert_check()
+    //	and get O(1) insertions after the first
+    uint64_t position = offset + length - 1;
+
+    range.resize(count_pages(offset, length));
+    auto out = range.rbegin();
+
+    std::lock_guard<lock_type> lock(mutex);
+    iterator cur = pages.end();
+    while (length) {
+      const uint64_t page_offset = position & ~(page_size-1);
+
+      typename page_set::insert_commit_data commit;
+      auto insert = pages.insert_check(cur, page_offset, page_cmp(), commit);
+      if (insert.second) {
+        auto page = Page::create(page_size, page_offset);
+        cur = pages.insert_commit(*page, commit);
+
+        // assume that the caller will write to the range [offset,length),
+        //  so we only need to zero memory outside of this range
+
+        // zero end of page past offset + length
+        if (offset + length < page->offset + page_size)
+          std::fill(page->data + offset + length - page->offset,
+                    page->data + page_size, 0);
+        // zero front of page between page_offset and offset
+        if (offset > page->offset)
+          std::fill(page->data, page->data + offset - page->offset, 0);
+      } else { // exists
+        cur = insert.first;
+      }
+      // add a reference to output vector
+      out->reset(&*cur);
+      ++out;
+
+      auto c = std::min(length, (position & (page_size-1)) + 1);
+      position -= c;
+      length -= c;
+    }
+    // make sure we sized the vector correctly
+    assert(out == range.rend());
+  }
+
+  // return all allocated pages that intersect the range [offset,length)
+  void get_range(uint64_t offset, uint64_t length, page_vector &range) {
+    auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+    while (cur != pages.end() && cur->offset < offset + length)
+      range.push_back(&*cur++);
+  }
+
+  void free_pages_after(uint64_t offset) {
+    std::lock_guard<lock_type> lock(mutex);
+    auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+    if (cur == pages.end())
+      return;
+    if (cur->offset < offset)
+      cur++;
+    free_pages(cur, pages.end());
+  }
+
+  void encode(bufferlist &bl) const {
+    ::encode(page_size, bl);
+    unsigned count = pages.size();
+    ::encode(count, bl);
+    for (auto p = pages.rbegin(); p != pages.rend(); ++p)
+      p->encode(bl, page_size);
+  }
+  void decode(bufferlist::iterator &p) {
+    assert(empty());
+    ::decode(page_size, p);
+    unsigned count;
+    ::decode(count, p);
+    auto cur = pages.end();
+    for (unsigned i = 0; i < count; i++) {
+      auto page = Page::create(page_size);
+      page->decode(p, page_size);
+      cur = pages.insert_before(cur, *page);
+    }
+  }
+};
+
+#endif // CEPH_PAGESET_H
diff --git a/src/os/RocksDBStore.cc b/src/os/RocksDBStore.cc
index 4a45a15..cb3ac91 100644
--- a/src/os/RocksDBStore.cc
+++ b/src/os/RocksDBStore.cc
@@ -4,135 +4,143 @@
 #include <set>
 #include <map>
 #include <string>
-#include <tr1/memory>
+#include <memory>
 #include <errno.h>
 
 #include "rocksdb/db.h"
+#include "rocksdb/table.h"
 #include "rocksdb/env.h"
 #include "rocksdb/write_batch.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/filter_policy.h"
-
+#include "rocksdb/utilities/convenience.h"
 using std::string;
 #include "common/perf_counters.h"
+#include "include/str_map.h"
 #include "KeyValueDB.h"
 #include "RocksDBStore.h"
 
+int string2bool(string val, bool &b_val)
+{
+  if (strcasecmp(val.c_str(), "false") == 0) {
+    b_val = false;
+    return 0;
+  } else if (strcasecmp(val.c_str(), "true") == 0) {
+    b_val = true;
+    return 0;
+  } else {
+    std::string err;
+    int b = strict_strtol(val.c_str(), 10, &err);
+    if (!err.empty())
+      return -EINVAL;
+    b_val = !!b;
+    return 0;
+  }
+}
+  
+int RocksDBStore::tryInterpret(const string key, const string val, rocksdb::Options &opt)
+{
+  if (key == "compaction_threads") {
+    std::string err;
+    int f = strict_sistrtoll(val.c_str(), &err);
+    if (!err.empty())
+      return -EINVAL;
+    //Low priority threadpool is used for compaction
+    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::LOW);
+  } else if (key == "flusher_threads") {
+    std::string err;
+    int f = strict_sistrtoll(val.c_str(), &err);
+    if (!err.empty())
+      return -EINVAL;
+    //High priority threadpool is used for flusher
+    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::HIGH);
+  } else if (key == "compact_on_mount") {
+    int ret = string2bool(val, compact_on_mount);
+    if (ret != 0)
+      return ret;
+  } else if (key == "disableWAL") {
+    int ret = string2bool(val, disableWAL);
+    if (ret != 0)
+      return ret;
+  } else {
+    //unrecognize config options.
+    return -EINVAL;
+  }
+  return 0;
+}
 
-int RocksDBStore::init()
-{
-  options.write_buffer_size = g_conf->rocksdb_write_buffer_size;
-  options.cache_size = g_conf->rocksdb_cache_size;
-  options.block_size = g_conf->rocksdb_block_size;
-  options.bloom_size = g_conf->rocksdb_bloom_size;
-  options.compression_type = g_conf->rocksdb_compression;
-  options.paranoid_checks = g_conf->rocksdb_paranoid;
-  options.max_open_files = g_conf->rocksdb_max_open_files;
-  options.log_file = g_conf->rocksdb_log;
-  options.write_buffer_num = g_conf->rocksdb_write_buffer_num;
-  options.max_background_compactions = g_conf->rocksdb_background_compactions;
-  options.max_background_flushes = g_conf->rocksdb_background_flushes;
-  options.target_file_size_base = g_conf->rocksdb_target_file_size_base;
-  options.level0_file_num_compaction_trigger = g_conf->rocksdb_level0_file_num_compaction_trigger;
-  options.level0_slowdown_writes_trigger = g_conf->rocksdb_level0_slowdown_writes_trigger;
-  options.level0_stop_writes_trigger = g_conf->rocksdb_level0_stop_writes_trigger;
-  options.disableDataSync = g_conf->rocksdb_disableDataSync;
-  options.num_levels = g_conf->rocksdb_num_levels;
-  options.disableWAL = g_conf->rocksdb_disableWAL;
-  options.wal_dir = g_conf->rocksdb_wal_dir;
-  options.info_log_level = g_conf->rocksdb_info_log_level;
+int RocksDBStore::ParseOptionsFromString(const string opt_str, rocksdb::Options &opt)
+{
+  map<string, string> str_map;
+  int r = get_str_map(opt_str, ",\n;", &str_map);
+  if (r < 0)
+    return r;
+  map<string, string>::iterator it;
+  for(it = str_map.begin(); it != str_map.end(); ++it) {
+    string this_opt = it->first + "=" + it->second;
+    rocksdb::Status status = rocksdb::GetOptionsFromString(opt, this_opt , &opt); 
+    if (!status.ok()) {
+      //unrecognized by rocksdb, try to interpret by ourselves.
+      r = tryInterpret(it->first, it->second, opt);
+      if (r < 0) {
+	derr << status.ToString() << dendl;
+	return -EINVAL;
+      }
+    }
+    lgeneric_dout(cct, 0) << " set rocksdb option " << it->first
+			  << " = " << it->second << dendl;
+  }
   return 0;
 }
 
-int RocksDBStore::do_open(ostream &out, bool create_if_missing)
+int RocksDBStore::init(string _options_str)
 {
-  rocksdb::Options ldoptions;
-
-  if (options.write_buffer_size)
-    ldoptions.write_buffer_size = options.write_buffer_size;
-  if (options.write_buffer_num)
-    ldoptions.max_write_buffer_number = options.write_buffer_num;
-  if (options.max_background_compactions)
-    ldoptions.max_background_compactions = options.max_background_compactions;
-  if (options.max_background_flushes)
-    ldoptions.max_background_flushes = options.max_background_flushes;
-  if (options.target_file_size_base)
-    ldoptions.target_file_size_base = options.target_file_size_base;
-  if (options.max_open_files)
-    ldoptions.max_open_files = options.max_open_files;
-  if (options.cache_size) {
-    ldoptions.block_cache = rocksdb::NewLRUCache(options.cache_size);
-  }
-  if (options.block_size)
-    ldoptions.block_size = options.block_size;
-  if (options.bloom_size) {
-    const rocksdb::FilterPolicy *_filterpolicy =
-	rocksdb::NewBloomFilterPolicy(options.bloom_size);
-    ldoptions.filter_policy = _filterpolicy;
-    filterpolicy = _filterpolicy;
+  options_str = _options_str;
+  rocksdb::Options opt;
+  //try parse options
+  int r = ParseOptionsFromString(options_str, opt); 
+  if (r != 0) {
+    return -EINVAL;
   }
-  if (options.compression_type.length() == 0)
-    ldoptions.compression = rocksdb::kNoCompression;
-  else if(options.compression_type == "snappy")
-    ldoptions.compression = rocksdb::kSnappyCompression;
-  else if(options.compression_type == "zlib")
-    ldoptions.compression = rocksdb::kZlibCompression;
-  else if(options.compression_type == "bzip2")
-    ldoptions.compression = rocksdb::kBZip2Compression;
-  else
-    ldoptions.compression = rocksdb::kNoCompression;
-  if (options.block_restart_interval)
-    ldoptions.block_restart_interval = options.block_restart_interval;
-
-  ldoptions.error_if_exists = options.error_if_exists;
-  ldoptions.paranoid_checks = options.paranoid_checks;
-  ldoptions.create_if_missing = create_if_missing;
-  if (options.log_file.length()) {
-    rocksdb::Env *env = rocksdb::Env::Default();
-    env->NewLogger(options.log_file, &ldoptions.info_log);
-    ldoptions.info_log->SetInfoLogLevel((rocksdb::InfoLogLevel)get_info_log_level(options.info_log_level));
-  } else {
-    ldoptions.info_log_level = (rocksdb::InfoLogLevel)get_info_log_level(options.info_log_level);
+  return 0;
+}
+
+int RocksDBStore::do_open(ostream &out, bool create_if_missing)
+{
+  rocksdb::Options opt;
+  rocksdb::Status status;
+
+  int r = ParseOptionsFromString(options_str, opt); 
+  if (r != 0) {
+    return -EINVAL;
   }
-  if(options.disableDataSync)
-    ldoptions.disableDataSync = options.disableDataSync;
-  if(options.num_levels)
-    ldoptions.num_levels = options.num_levels;
-  if(options.level0_file_num_compaction_trigger)
-    ldoptions.level0_file_num_compaction_trigger = options.level0_file_num_compaction_trigger;
-  if(options.level0_slowdown_writes_trigger)
-    ldoptions.level0_slowdown_writes_trigger = options.level0_slowdown_writes_trigger;
-  if(options.level0_stop_writes_trigger)
-    ldoptions.level0_stop_writes_trigger = options.level0_stop_writes_trigger;
-  if(options.wal_dir.length())
-    ldoptions.wal_dir = options.wal_dir;
-
-
-  //rocksdb::DB *_db;
-  rocksdb::Status status = rocksdb::DB::Open(ldoptions, path, &db);
+  opt.create_if_missing = create_if_missing;
+
+  status = rocksdb::DB::Open(opt, path, &db);
   if (!status.ok()) {
-    out << status.ToString() << std::endl;
+    derr << status.ToString() << dendl;
     return -EINVAL;
   }
-  //db.reset(_db);
 
-  if (g_conf->rocksdb_compact_on_mount) {
+  PerfCountersBuilder plb(g_ceph_context, "rocksdb", l_rocksdb_first, l_rocksdb_last);
+  plb.add_u64_counter(l_rocksdb_gets, "rocksdb_get", "Gets");
+  plb.add_u64_counter(l_rocksdb_txns, "rocksdb_transaction", "Transactions");
+  plb.add_time_avg(l_rocksdb_get_latency, "rocksdb_get_latency", "Get latency");
+  plb.add_time_avg(l_rocksdb_submit_latency, "rocksdb_submit_latency", "Submit Latency");
+  plb.add_time_avg(l_rocksdb_submit_sync_latency, "rocksdb_submit_sync_latency", "Submit Sync Latency");
+  plb.add_u64_counter(l_rocksdb_compact, "rocksdb_compact", "Compactions");
+  plb.add_u64_counter(l_rocksdb_compact_range, "rocksdb_compact_range", "Compactions by range");
+  plb.add_u64_counter(l_rocksdb_compact_queue_merge, "rocksdb_compact_queue_merge", "Mergings of ranges in compaction queue");
+  plb.add_u64(l_rocksdb_compact_queue_len, "rocksdb_compact_queue_len", "Length of compaction queue");
+  logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+
+  if (compact_on_mount) {
     derr << "Compacting rocksdb store..." << dendl;
     compact();
     derr << "Finished compacting rocksdb store" << dendl;
   }
-
-
-  PerfCountersBuilder plb(g_ceph_context, "rocksdb", l_rocksdb_first, l_rocksdb_last);
-  plb.add_u64_counter(l_rocksdb_gets, "rocksdb_get");
-  plb.add_u64_counter(l_rocksdb_txns, "rocksdb_transaction");
-  plb.add_u64_counter(l_rocksdb_compact, "rocksdb_compact");
-  plb.add_u64_counter(l_rocksdb_compact_range, "rocksdb_compact_range");
-  plb.add_u64_counter(l_rocksdb_compact_queue_merge, "rocksdb_compact_queue_merge");
-  plb.add_u64(l_rocksdb_compact_queue_len, "rocksdb_compact_queue_len");
-  logger = plb.create_perf_counters();
-  cct->get_perfcounters_collection()->add(logger);
   return 0;
 }
 
@@ -153,7 +161,6 @@ RocksDBStore::~RocksDBStore()
 
   // Ensure db is destroyed before dependent db_cache and filterpolicy
   delete db;
-  delete filterpolicy;
 }
 
 void RocksDBStore::close()
@@ -175,24 +182,30 @@ void RocksDBStore::close()
 
 int RocksDBStore::submit_transaction(KeyValueDB::Transaction t)
 {
+  utime_t start = ceph_clock_now(g_ceph_context);
   RocksDBTransactionImpl * _t =
     static_cast<RocksDBTransactionImpl *>(t.get());
   rocksdb::WriteOptions woptions;
-  woptions.disableWAL = options.disableWAL;
+  woptions.disableWAL = disableWAL;
   rocksdb::Status s = db->Write(woptions, _t->bat);
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
   logger->inc(l_rocksdb_txns);
+  logger->tinc(l_rocksdb_submit_latency, lat);
   return s.ok() ? 0 : -1;
 }
 
 int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
 {
+  utime_t start = ceph_clock_now(g_ceph_context);
   RocksDBTransactionImpl * _t =
     static_cast<RocksDBTransactionImpl *>(t.get());
   rocksdb::WriteOptions woptions;
   woptions.sync = true;
-  woptions.disableWAL = options.disableWAL;
+  woptions.disableWAL = disableWAL;
   rocksdb::Status s = db->Write(woptions, _t->bat);
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
   logger->inc(l_rocksdb_txns);
+  logger->tinc(l_rocksdb_submit_sync_latency, lat);
   return s.ok() ? 0 : -1;
 }
 int RocksDBStore::get_info_log_level(string info_log_level)
@@ -226,21 +239,18 @@ void RocksDBStore::RocksDBTransactionImpl::set(
   const string &k,
   const bufferlist &to_set_bl)
 {
-  buffers.push_back(to_set_bl);
-  bufferlist &bl = *(buffers.rbegin());
   string key = combine_strings(prefix, k);
-  keys.push_back(key);
-  bat->Delete(rocksdb::Slice(*(keys.rbegin())));
-  bat->Put(rocksdb::Slice(*(keys.rbegin())),
-	  rocksdb::Slice(bl.c_str(), bl.length()));
+  //bufferlist::c_str() is non-constant, so we need to make a copy
+  bufferlist val = to_set_bl;
+  bat->Delete(rocksdb::Slice(key));
+  bat->Put(rocksdb::Slice(key),
+	  rocksdb::Slice(val.c_str(), val.length()));
 }
 
 void RocksDBStore::RocksDBTransactionImpl::rmkey(const string &prefix,
 					         const string &k)
 {
-  string key = combine_strings(prefix, k);
-  keys.push_back(key);
-  bat->Delete(rocksdb::Slice(*(keys.rbegin())));
+  bat->Delete(combine_strings(prefix, k));
 }
 
 void RocksDBStore::RocksDBTransactionImpl::rmkeys_by_prefix(const string &prefix)
@@ -249,9 +259,7 @@ void RocksDBStore::RocksDBTransactionImpl::rmkeys_by_prefix(const string &prefix
   for (it->seek_to_first();
        it->valid();
        it->next()) {
-    string key = combine_strings(prefix, it->key());
-    keys.push_back(key);
-    bat->Delete(*(keys.rbegin()));
+    bat->Delete(combine_strings(prefix, it->key()));
   }
 }
 
@@ -260,6 +268,7 @@ int RocksDBStore::get(
     const std::set<string> &keys,
     std::map<string, bufferlist> *out)
 {
+  utime_t start = ceph_clock_now(g_ceph_context);
   KeyValueDB::Iterator it = get_iterator(prefix);
   for (std::set<string>::const_iterator i = keys.begin();
        i != keys.end();
@@ -270,7 +279,9 @@ int RocksDBStore::get(
     } else if (!it->valid())
       break;
   }
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
   logger->inc(l_rocksdb_gets);
+  logger->tinc(l_rocksdb_get_latency, lat);
   return 0;
 }
 
@@ -470,11 +481,6 @@ int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status()
   return dbiter->status().ok() ? 0 : -1;
 }
 
-bool RocksDBStore::in_prefix(const string &prefix, rocksdb::Slice key)
-{
-  return (key.compare(rocksdb::Slice(past_prefix(prefix))) < 0) &&
-    (key.compare(rocksdb::Slice(prefix)) > 0);
-}
 string RocksDBStore::past_prefix(const string &prefix)
 {
   string limit = prefix;
@@ -485,7 +491,7 @@ string RocksDBStore::past_prefix(const string &prefix)
 
 RocksDBStore::WholeSpaceIterator RocksDBStore::_get_iterator()
 {
-  return std::tr1::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
+  return std::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
     new RocksDBWholeSpaceIteratorImpl(
       db->NewIterator(rocksdb::ReadOptions())
     )
@@ -500,7 +506,7 @@ RocksDBStore::WholeSpaceIterator RocksDBStore::_get_snapshot_iterator()
   snapshot = db->GetSnapshot();
   options.snapshot = snapshot;
 
-  return std::tr1::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
+  return std::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
     new RocksDBSnapshotIteratorImpl(db, snapshot,
       db->NewIterator(options))
   );
diff --git a/src/os/RocksDBStore.h b/src/os/RocksDBStore.h
index 5c3160f..bf58f66 100644
--- a/src/os/RocksDBStore.h
+++ b/src/os/RocksDBStore.h
@@ -9,7 +9,7 @@
 #include <set>
 #include <map>
 #include <string>
-#include <tr1/memory>
+#include <memory>
 #include <boost/scoped_ptr.hpp>
 
 #include <errno.h>
@@ -19,13 +19,15 @@
 #include "common/Formatter.h"
 
 #include "common/ceph_context.h"
-
 class PerfCounters;
 
 enum {
   l_rocksdb_first = 34300,
   l_rocksdb_gets,
   l_rocksdb_txns,
+  l_rocksdb_get_latency,
+  l_rocksdb_submit_latency,
+  l_rocksdb_submit_sync_latency,
   l_rocksdb_compact,
   l_rocksdb_compact_range,
   l_rocksdb_compact_queue_merge,
@@ -41,8 +43,8 @@ namespace rocksdb{
   class Slice;
   class WriteBatch;
   class Iterator;
+  struct Options;
 }
-
 /**
  * Uses RocksDB to implement the KeyValueDB interface
  */
@@ -50,9 +52,8 @@ class RocksDBStore : public KeyValueDB {
   CephContext *cct;
   PerfCounters *logger;
   string path;
-  const rocksdb::FilterPolicy *filterpolicy;
   rocksdb::DB *db;
-
+  string options_str;
   int do_open(ostream &out, bool create_if_missing);
 
   // manage async compactions
@@ -78,10 +79,14 @@ class RocksDBStore : public KeyValueDB {
 
 public:
   /// compact the underlying rocksdb store
+  bool compact_on_mount;
+  bool disableWAL;
   void compact();
 
+  int tryInterpret(const string key, const string val, rocksdb::Options &opt);
+  int ParseOptionsFromString(const string opt_str, rocksdb::Options &opt);
   static int _test_init(const string& dir);
-  int init();
+  int init(string options_str);
   /// compact rocksdb for all keys with a given prefix
   void compact_prefix(const string& prefix) {
     compact_range(prefix, past_prefix(prefix));
@@ -98,71 +103,16 @@ public:
   }
   int get_info_log_level(string info_log_level);
 
-  /**
-   * options_t: Holds options which are minimally interpreted
-   * on initialization and then passed through to RocksDB.
-   * We transform a couple of these into actual RocksDB
-   * structures, but the rest are simply passed through unchanged. See
-   * rocksdb/options.h for more precise details on each.
-   *
-   * Set them after constructing the RocksDBStore, but before calling
-   * open() or create_and_open().
-   */
-  struct options_t {
-    uint64_t write_buffer_size; /// in-memory write buffer size
-    uint64_t write_buffer_num; /// in-memory write buffer number
-    uint64_t target_file_size_base; /// Target file size for compaction
-    int max_background_compactions; /// Maximum number of concurrent background compaction jobs
-    int max_background_flushes; /// Maximum number of concurrent background memtable flushea jobs
-    int max_open_files; /// maximum number of files RocksDB can open at once
-    uint64_t cache_size; /// size of extra decompressed cache to use
-    uint64_t block_size; /// user data per block
-    int bloom_size; /// number of bits per entry to put in a bloom filter
-    string compression_type; /// whether to use libsnappy compression or not
-
-    // don't change these ones. No, seriously
-    int block_restart_interval;
-    bool error_if_exists;
-    bool paranoid_checks;
-    uint64_t level0_file_num_compaction_trigger;
-    uint64_t level0_slowdown_writes_trigger;
-    uint64_t level0_stop_writes_trigger;
-    bool disableDataSync;
-    bool disableWAL;
-    int num_levels;
-
-    string log_file;
-    string wal_dir;
-    string info_log_level;
-
-    options_t() :
-      write_buffer_size(0), //< 0 means default
-      max_open_files(0), //< 0 means default
-      cache_size(0), //< 0 means no cache (default)
-      block_size(0), //< 0 means default
-      bloom_size(0), //< 0 means no bloom filter (default)
-      compression_type("none"), //< set to false for no compression
-      block_restart_interval(0), //< 0 means default
-      error_if_exists(false), //< set to true if you want to check nonexistence
-      paranoid_checks(false), //< set to true if you want paranoid checks
-      level0_file_num_compaction_trigger(0),
-      level0_slowdown_writes_trigger(0),
-      level0_stop_writes_trigger(0),
-      disableDataSync(false),
-      disableWAL(false),
-      num_levels(0),
-      info_log_level("info")
-    {}
-  } options;
-
   RocksDBStore(CephContext *c, const string &path) :
     cct(c),
     logger(NULL),
     path(path),
+    db(NULL),
     compact_queue_lock("RocksDBStore::compact_thread_lock"),
     compact_queue_stop(false),
     compact_thread(this),
-    options()
+    compact_on_mount(false),
+    disableWAL(false)
   {}
 
   ~RocksDBStore();
@@ -182,8 +132,6 @@ public:
   class RocksDBTransactionImpl : public KeyValueDB::TransactionImpl {
   public:
     rocksdb::WriteBatch *bat;
-    list<bufferlist> buffers;
-    list<string> keys;
     RocksDBStore *db;
 
     RocksDBTransactionImpl(RocksDBStore *_db);
@@ -201,7 +149,7 @@ public:
   };
 
   KeyValueDB::Transaction get_transaction() {
-    return std::tr1::shared_ptr< RocksDBTransactionImpl >(
+    return std::shared_ptr< RocksDBTransactionImpl >(
       new RocksDBTransactionImpl(this));
   }
 
@@ -253,7 +201,6 @@ public:
   static string combine_strings(const string &prefix, const string &value);
   static int split_key(rocksdb::Slice in, string *prefix, string *key);
   static bufferlist to_bufferlist(rocksdb::Slice in);
-  static bool in_prefix(const string &prefix, rocksdb::Slice key);
   static string past_prefix(const string &prefix);
 
   virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc
index fdbfa3b..dc8b1ab 100644
--- a/src/os/Transaction.cc
+++ b/src/os/Transaction.cc
@@ -238,7 +238,7 @@ void ObjectStore::Transaction::_build_actions_from_tbl()
 
 	::decode(cid, p);
 
-	create_collection(cid);
+	create_collection(cid, 0);
       }
       break;
 
@@ -288,7 +288,7 @@ void ObjectStore::Transaction::_build_actions_from_tbl()
 	assert(ocid2 == ocid);
 	assert(oid2 == oid);
 
-	collection_move(ncid, ocid, oid);
+	collection_move_rename(ocid, oid, ncid, oid);
       }
       break;
 
@@ -947,8 +947,8 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
   o.push_back(t);
   
   t = new Transaction;
-  coll_t c("foocoll");
-  coll_t c2("foocoll2");
+  coll_t c(spg_t(pg_t(1,2), shard_id_t::NO_SHARD));
+  coll_t c2(spg_t(pg_t(4,5), shard_id_t::NO_SHARD));
   ghobject_t o1(hobject_t("obj", "", 123, 456, -1, ""));
   ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, ""));
   ghobject_t o3(hobject_t("obj3", "", 123, 456, -1, ""));
@@ -974,8 +974,8 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
   t->clone(c, o1, o3);
   t->clone_range(c, o1, o2, 1, 12, 99);
 
-  t->create_collection(c);
-  t->collection_move(c, c2, o3);
+  t->create_collection(c, 12);
+  t->collection_move_rename(c, o2, c2, o3);
   t->remove_collection(c);
   t->collection_setattr(c, string("this"), bl);
   t->collection_rmattr(c, string("foo"));
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index f472a23..af3a888 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -22,12 +22,12 @@ WBThrottle::WBThrottle(CephContext *cct) :
   PerfCountersBuilder b(
     cct, string("WBThrottle"),
     l_wbthrottle_first, l_wbthrottle_last);
-  b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied");
-  b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb");
-  b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied");
-  b.add_u64(l_wbthrottle_ios_wb, "ios_wb");
-  b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied");
-  b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb");
+  b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data");
+  b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data");
+  b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
+  b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
+  b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
+  b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries");
   logger = b.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
   for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i)
@@ -135,10 +135,7 @@ bool WBThrottle::get_next_should_flush(
 {
   assert(lock.is_locked());
   assert(next);
-  while (!stopping &&
-         cur_ios < io_limits.first &&
-         pending_wbs.size() < fd_limits.first &&
-         cur_size < size_limits.first)
+  while (!stopping && !beyond_limit())
          cond.Wait(lock);
   if (stopping)
     return false;
@@ -159,6 +156,14 @@ void *WBThrottle::entry()
   boost::tuple<ghobject_t, FDRef, PendingWB> wb;
   while (get_next_should_flush(&wb)) {
     clearing = wb.get<0>();
+    cur_ios -= wb.get<2>().ios;
+    logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
+    logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
+    cur_size -= wb.get<2>().size;
+    logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
+    logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
+    logger->dec(l_wbthrottle_inodes_dirtied);
+    logger->inc(l_wbthrottle_inodes_wb);
     lock.Unlock();
 #ifdef HAVE_FDATASYNC
     ::fdatasync(**wb.get<1>());
@@ -173,14 +178,6 @@ void *WBThrottle::entry()
 #endif
     lock.Lock();
     clearing = ghobject_t();
-    cur_ios -= wb.get<2>().ios;
-    logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
-    logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
-    cur_size -= wb.get<2>().size;
-    logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
-    logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
-    logger->dec(l_wbthrottle_inodes_dirtied);
-    logger->inc(l_wbthrottle_inodes_wb);
     cond.Signal();
     wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
   }
@@ -212,7 +209,8 @@ void WBThrottle::queue_wb(
 
   wbiter->second.first.add(nocache, len, 1);
   insert_object(hoid);
-  cond.Signal();
+  if (beyond_limit())
+    cond.Signal();
 }
 
 void WBThrottle::clear()
@@ -229,12 +227,11 @@ void WBThrottle::clear()
     }
 #endif
 
-    cur_ios -= i->second.first.ios;
-    logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios);
-    cur_size -= i->second.first.size;
-    logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size);
-    logger->dec(l_wbthrottle_inodes_dirtied);
   }
+  cur_ios = cur_size = 0;
+  logger->set(l_wbthrottle_ios_dirtied, 0);
+  logger->set(l_wbthrottle_bytes_dirtied, 0);
+  logger->set(l_wbthrottle_inodes_dirtied, 0);
   pending_wbs.clear();
   lru.clear();
   rev_lru.clear();
@@ -265,10 +262,7 @@ void WBThrottle::clear_object(const ghobject_t &hoid)
 void WBThrottle::throttle()
 {
   Mutex::Locker l(lock);
-  while (!stopping && !(
-	   cur_ios < io_limits.second &&
-	   pending_wbs.size() < fd_limits.second &&
-	   cur_size < size_limits.second)) {
+  while (!stopping && beyond_limit()) {
     cond.Wait(lock);
   }
 }
diff --git a/src/os/WBThrottle.h b/src/os/WBThrottle.h
index b3fd9e0..d951943 100644
--- a/src/os/WBThrottle.h
+++ b/src/os/WBThrottle.h
@@ -129,6 +129,15 @@ private:
   FS fs;
 
   void set_from_conf();
+  bool beyond_limit() const {
+    if (cur_ios < io_limits.first &&
+	pending_wbs.size() < fd_limits.first &&
+	cur_size < size_limits.first)
+      return false;
+    else
+      return true;
+  }
+
 public:
   WBThrottle(CephContext *cct);
   ~WBThrottle();
diff --git a/src/os/XfsFileStoreBackend.cc b/src/os/XfsFileStoreBackend.cc
index abff018..cf8bfe1 100644
--- a/src/os/XfsFileStoreBackend.cc
+++ b/src/os/XfsFileStoreBackend.cc
@@ -24,6 +24,7 @@
 #include <xfs/xfs.h>
 
 #include "common/errno.h"
+#include "common/linux_version.h"
 #include "include/assert.h"
 #include "include/compat.h"
 
@@ -116,24 +117,15 @@ int XfsFileStoreBackend::detect_features()
     //   aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d
     // for this set_extsize bug
     //   http://oss.sgi.com/bugzilla/show_bug.cgi?id=874
-    struct utsname u;
-    int r = uname(&u);
-    assert(r == 0);
-    int major = 0, minor = 0, patch = 0;
-    r = sscanf(u.release, "%d.%d.%d", &major, &minor, &patch);
-    if (r != 3) {
-      ret = 0;
-      dout(0) << __func__ << ": failed to parse kernel version "
-	      << u.release << " to verify extsize not buggy, disabling extsize"
-	      << dendl;
+    int ver = get_linux_version();
+    if (ver == 0) {
+      dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl;
       m_has_extsize = false;
-    } else if (major < 3 || (major == 3 && minor < 5)) {
-      dout(0) << __func__ << ": disabling extsize, kernel " << u.release
-	      << " is older than 3.5 and has buggy extsize ioctl" << dendl;
+    } else if (ver < KERNEL_VERSION(3, 5, 0)) {
+      dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl;
       m_has_extsize = false;
     } else {
-      dout(0) << "detect_feature: extsize is supported and kernel "
-	      << u.release << " >= 3.5" << dendl;
+      dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl;
       m_has_extsize = true;
     }
   } else {
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index c08acdb..1bb652a 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -23,6 +23,7 @@
 #endif
 
 #include "common/xattr.h"
+#include "include/compat.h"
 
 /*
  * chaining xattrs
@@ -62,7 +63,7 @@ static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_
   if (!i) {
     *raw_name = '\0';
   } else {
-    int r = snprintf(raw_name, raw_len, "@%d", i);
+    int r = snprintf(raw_name, raw_len - pos, "@%d", i);
     assert(r < raw_len - pos);
   }
 }
@@ -116,7 +117,8 @@ static int getxattr_len(const char *fn, const char *name)
       break;
     total += r;
     i++;
-  } while (r == CHAIN_XATTR_MAX_BLOCK_LEN);
+  } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+	   r == CHAIN_XATTR_SHORT_BLOCK_LEN);
 
   return total;
 }
@@ -135,7 +137,6 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
   do {
     chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
     get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-    size -= chunk_size;
 
     r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
     if (i && r == -ENODATA) {
@@ -147,17 +148,21 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
       break;
     }
 
-    if (r > 0)
+    if (r > 0) {
       pos += r;
+      size -= r;
+    }
 
     i++;
-  } while (size && r == CHAIN_XATTR_MAX_BLOCK_LEN);
+  } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+		    r == CHAIN_XATTR_SHORT_BLOCK_LEN));
 
   if (r >= 0) {
     ret = pos;
     /* is there another chunk? that can happen if the last read size span over
        exactly one block */
-    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN) {
+    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+	chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
       get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
       r = sys_getxattr(fn, raw_name, 0, 0);
       if (r > 0) { // there's another chunk.. the original buffer was too small
@@ -183,7 +188,8 @@ static int chain_fgetxattr_len(int fd, const char *name)
       break;
     total += r;
     i++;
-  } while (r == CHAIN_XATTR_MAX_BLOCK_LEN);
+  } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+	   r == CHAIN_XATTR_SHORT_BLOCK_LEN);
 
   return total;
 }
@@ -202,7 +208,6 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
   do {
     chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
     get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
-    size -= chunk_size;
 
     r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
     if (i && r == -ENODATA) {
@@ -214,17 +219,21 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
       break;
     }
 
-    if (r > 0)
+    if (r > 0) {
       pos += r;
+      size -= r;
+    }
 
     i++;
-  } while (size && r == CHAIN_XATTR_MAX_BLOCK_LEN);
+  } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+		    r == CHAIN_XATTR_SHORT_BLOCK_LEN));
 
   if (r >= 0) {
     ret = pos;
     /* is there another chunk? that can happen if the last read size span over
        exactly one block */
-    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN) {
+    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+	chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
       get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
       r = sys_fgetxattr(fd, raw_name, 0, 0);
       if (r > 0) { // there's another chunk.. the original buffer was too small
@@ -238,14 +247,24 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
 
 // setxattr
 
+static int get_xattr_block_size(size_t size)
+{
+  if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD)
+    // this may fit in the inode; stripe over short attrs so that XFS
+    // won't kick it out.
+    return CHAIN_XATTR_SHORT_BLOCK_LEN;
+  return CHAIN_XATTR_MAX_BLOCK_LEN;
+}
+
 int chain_setxattr(const char *fn, const char *name, const void *val, size_t size)
 {
   int i = 0, pos = 0;
   char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
   int ret = 0;
+  size_t max_chunk_size = get_xattr_block_size(size);
 
   do {
-    size_t chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
     get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
     size -= chunk_size;
 
@@ -278,9 +297,10 @@ int chain_fsetxattr(int fd, const char *name, const void *val, size_t size)
   int i = 0, pos = 0;
   char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
   int ret = 0;
+  size_t max_chunk_size = get_xattr_block_size(size);
 
   do {
-    size_t chunk_size = (size < CHAIN_XATTR_MAX_BLOCK_LEN ? size : CHAIN_XATTR_MAX_BLOCK_LEN);
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
     get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
     size -= chunk_size;
 
diff --git a/src/os/chain_xattr.h b/src/os/chain_xattr.h
index 7e8312f..b994d52 100644
--- a/src/os/chain_xattr.h
+++ b/src/os/chain_xattr.h
@@ -8,9 +8,24 @@
 
 #include <errno.h>
 
+#if defined(__linux__)
+#include <limits.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
+#elif defined(__APPLE__)
+#include <sys/xattr.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2)
+#else
 #define CHAIN_XATTR_MAX_NAME_LEN  128
+#endif
+
 #define CHAIN_XATTR_MAX_BLOCK_LEN 2048
 
+/*
+ * XFS will only inline xattrs < 255 bytes, so for xattrs that are
+ * likely to fit in the inode, stripe over short xattrs.
+ */
+#define CHAIN_XATTR_SHORT_BLOCK_LEN 250
+#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000
 
 // wrappers to hide annoying errno handling.
 
diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc
new file mode 100644
index 0000000..cb0bdd5
--- /dev/null
+++ b/src/os/fs/FS.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// from include/linux/falloc.h:
+#ifndef FALLOC_FL_PUNCH_HOLE
+# define FALLOC_FL_PUNCH_HOLE 0x2
+#endif
+
+#include "FS.h"
+
+#include "acconfig.h"
+
+#ifdef HAVE_LIBXFS
+#include "XFS.h"
+#endif
+
+#if defined(DARWIN) || defined(__FreeBSD__)
+#include <sys/mount.h>
+#else
+#include <sys/vfs.h>
+#endif
+#include "include/compat.h"
+
+// ---------------
+
+FS *FS::create(uint64_t f_type)
+{
+  switch (f_type) {
+#ifdef HAVE_LIBXFS
+  case XFS_SUPER_MAGIC:
+    return new XFS;
+#endif
+  default:
+    return new FS;
+  }
+}
+
+FS *FS::create_by_fd(int fd)
+{
+  struct statfs st;
+  ::fstatfs(fd, &st);
+  return create(st.f_type);
+}
+
+// ---------------
+
+int FS::set_alloc_hint(int fd, uint64_t hint)
+{
+  return 0;  // no-op
+}
+
+#ifdef HAVE_NAME_TO_HANDLE_AT
+int FS::get_handle(int fd, std::string *h)
+{
+  char buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+  struct file_handle *fh = (struct file_handle *)buf;
+  int mount_id;
+
+  fh->handle_bytes = MAX_HANDLE_SZ;
+  int r = name_to_handle_at(fd, "", fh, &mount_id, AT_EMPTY_PATH);
+  if (r < 0) {
+    return -errno;
+  }
+  *h = std::string(buf, fh->handle_bytes + sizeof(struct file_handle));
+  return 0;
+}
+
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+  if (h.length() < sizeof(struct file_handle)) {
+    return -EINVAL;
+  }
+  struct file_handle *fh = (struct file_handle *)h.data();
+  if (fh->handle_bytes > h.length()) {
+    return -ERANGE;
+  }
+  int fd = open_by_handle_at(mount_fd, fh, flags);
+  if (fd < 0)
+    return -errno;
+  return fd;
+}
+
+#else // HAVE_NAME_TO_HANDLE_AT
+
+int FS::get_handle(int fd, std::string *h)
+{
+  return -EOPNOTSUPP;
+}
+
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+  return -EOPNOTSUPP;
+}
+
+#endif // HAVE_NAME_TO_HANDLE_AT
+
+int FS::copy_file_range(int to_fd, uint64_t to_offset,
+			int from_fd,
+			uint64_t from_offset, uint64_t from_len)
+{
+  assert(0 == "write me");
+}
+
+int FS::zero(int fd, uint64_t offset, uint64_t length)
+{
+  int r;
+
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(DARWIN) && !defined(__FreeBSD__)
+  // first try fallocate
+  r = fallocate(fd, FALLOC_FL_PUNCH_HOLE, offset, length);
+  if (r < 0) {
+    r = -errno;
+  }
+  if (r != -EOPNOTSUPP) {
+    goto out;  // a real error
+  }
+# endif
+#endif
+
+  {
+    // fall back to writing zeros
+    bufferlist bl;
+    bufferptr bp(length);
+    bp.zero();
+    bl.append(bp);
+    int r = ::lseek64(fd, offset, SEEK_SET);
+    if (r < 0) {
+      r = -errno;
+      goto out;
+    }
+    r = bl.write_fd(fd);
+  }
+
+ out:
+  return r;
+}
diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h
new file mode 100644
index 0000000..941fd14
--- /dev/null
+++ b/src/os/fs/FS.h
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_FS_H
+#define CEPH_OS_FS_H
+
+#include <errno.h>
+#include <time.h>
+
+#include "acconfig.h"
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+#endif
+
+#include <string>
+
+#include "include/types.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+
+class FS {
+public:
+  virtual ~FS() { }
+
+  static FS *create(uint64_t f_type);
+  static FS *create_by_fd(int fd);
+
+  virtual const char *get_name() {
+    return "generic";
+  }
+
+  virtual int set_alloc_hint(int fd, uint64_t hint);
+
+  virtual int get_handle(int fd, std::string *h);
+  virtual int open_handle(int mount_fd, const std::string& h, int flags);
+
+  virtual int copy_file_range(int to_fd, uint64_t to_offset,
+			      int from_fd,
+			      uint64_t from_offset, uint64_t from_len);
+  virtual int zero(int fd, uint64_t offset, uint64_t length);
+
+  // -- aio --
+#if defined(HAVE_LIBAIO)
+  struct aio_t {
+    struct iocb iocb;  // must be first element; see shenanigans in aio_queue_t
+    void *priv;
+    int fd;
+    vector<iovec> iov;
+
+    aio_t(void *p, int f) : priv(p), fd(f) {
+      memset(&iocb, 0, sizeof(iocb));
+    }
+
+    void pwritev(uint64_t offset) {
+      io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
+    }
+  };
+
+  struct aio_queue_t {
+    int max_iodepth;
+    io_context_t ctx;
+
+    aio_queue_t(unsigned max_iodepth)
+      : max_iodepth(max_iodepth),
+	ctx(0) {
+    }
+    ~aio_queue_t() {
+      assert(ctx == 0);
+    }
+
+    int init() {
+      assert(ctx == 0);
+      return io_setup(max_iodepth, &ctx);
+    }
+    void shutdown() {
+      if (ctx) {
+	int r = io_destroy(ctx);
+	assert(r == 0);
+	ctx = 0;
+      }
+    }
+
+    int submit(aio_t &aio, int *retries) {
+      int attempts = 10;
+      iocb *piocb = &aio.iocb;
+      while (true) {
+	int r = io_submit(ctx, 1, &piocb);
+	if (r < 0) {
+	  if (r == -EAGAIN && attempts-- > 0) {
+	    usleep(500);
+	    (*retries)++;
+	    continue;
+	  }
+	  return r;
+	}
+	assert(r == 1);
+	break;
+      }
+      return 0;
+    }
+
+    int get_next_completed(int timeout_ms, aio_t **paio, int max) {
+      io_event event[max];
+      struct timespec t = {
+	timeout_ms / 1000,
+	(timeout_ms % 1000) * 1000 * 1000
+      };
+      int r = io_getevents(ctx, 1, 1, event, &t);
+      if (r <= 0) {
+	return r;
+      }
+      for (int i=0; i<r; ++i) {
+	paio[i] = (aio_t *)event[i].obj;
+      }
+      return r;
+    }
+  };
+#endif
+};
+
+#endif
diff --git a/src/os/fs/XFS.cc b/src/os/fs/XFS.cc
new file mode 100644
index 0000000..c72ee1a
--- /dev/null
+++ b/src/os/fs/XFS.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "XFS.h"
+
+#include <xfs/xfs.h>
+
+int XFS::set_alloc_hint(int fd, uint64_t val)
+{
+  struct fsxattr fsx;
+  struct stat sb;
+  int ret;
+
+  if (fstat(fd, &sb) < 0) {
+    ret = -errno;
+    return ret;
+  }
+  if (!S_ISREG(sb.st_mode)) {
+    return -EINVAL;
+  }
+
+  if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+    ret = -errno;
+    return ret;
+  }
+
+  // already set?
+  if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val)
+    return 0;
+
+  // xfs won't change extent size if any extents are allocated
+  if (fsx.fsx_nextents != 0)
+    return 0;
+
+  fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+  fsx.fsx_extsize = val;
+
+  if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+    ret = -errno;
+    return ret;
+  }
+
+  return 0;
+}
diff --git a/src/os/fs/XFS.h b/src/os/fs/XFS.h
new file mode 100644
index 0000000..1c3c3c4
--- /dev/null
+++ b/src/os/fs/XFS.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_XFS_H
+#define CEPH_OS_XFS_H
+
+#include "FS.h"
+
+# ifndef XFS_SUPER_MAGIC
+static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
+# endif
+
+class XFS : public FS {
+  const char *get_name() {
+    return "xfs";
+  }
+  int set_alloc_hint(int fd, uint64_t hint);
+};
+
+#endif
diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc
new file mode 100644
index 0000000..bdc5d38
--- /dev/null
+++ b/src/os/newstore/NewStore.cc
@@ -0,0 +1,4442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "NewStore.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+
+#define dout_subsys ceph_subsys_newstore
+
+/*
+
+  TODO:
+
+  * collection_list must flush pending db work
+  * multiple fragments per object (with configurable size.. maybe 1 or 2 mb default?)
+    * read path should be totally generic (handle any fragment pattern)
+    * write path should ideally tolerate any fragment pattern, but only generate a fixed layout (since the tunable may be changed over time).
+  * rocksdb: use db_paths (db/ and db.bulk/ ?)
+  * rocksdb: auto-detect use_fsync option when not xfs or btrfs
+  * avoid mtime updates when doing open-by-handle
+  * fid xattr backpointer
+  * inline first fsync_item in TransContext to avoid allocation?
+  * refcounted fragments (for efficient clone)
+
+ */
+
// Key prefixes: each logical table gets its own single-letter keyspace
// inside the shared key/value database.
const string PREFIX_SUPER = "S"; // field -> value
const string PREFIX_COLL = "C"; // collection name -> (nothing)
const string PREFIX_OBJ = "O";  // object name -> onode
const string PREFIX_OVERLAY = "V"; // u64 + offset -> value
const string PREFIX_OMAP = "M"; // u64 + keyname -> value
const string PREFIX_WAL = "L";  // write ahead log
+
+
+/*
+ * key
+ *
+ * The key string needs to lexicographically sort the same way that
+ * ghobject_t does.  We do this by escaping anything <= to '%' with %
+ * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
+ * hex digits.
+ *
+ * We use ! as a separator for strings; this works because it is < %
+ * and will get escaped if it is present in the string.
+ *
+ * For the fixed length numeric fields, we just use hex and '.' as a
+ * convenient visual separator.  Two oddities here:
+ *
+ *   1. for the -1 shard value we use --; it's the only negative value
+ *      and it sorts < 0 that way.
+ *
+ *   2. for the pool value, we add 2^63 so that it sorts correctly
+ *
+ * We could do something much more compact here, but it would be less
+ * readable by humans.  :/
+ */
+
const string KEY_SEP_S = "!";

// Escape 'in' into *out so the result sorts like the raw bytes and
// never contains the '!' field separator: bytes <= '#' become
// '#' + 2 hex digits, bytes >= '~' become '~' + 2 hex digits.
//
// Fix over the original: the byte is normalized through unsigned char
// first.  With plain (signed) char, bytes >= 0x80 compared as negative
// — they fell into the '#' branch and "%02x" printed them as
// "ffffffXX", overflowing the 8-byte buffer so the escape was
// truncated and could not be round-tripped by decode_escaped().
static void append_escaped(const string &in, string *out)
{
  char hexbyte[8];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    unsigned char c = (unsigned char)*i;
    if (c <= '#') {
      snprintf(hexbyte, sizeof(hexbyte), "#%02x", (unsigned)c);
      out->append(hexbyte);
    } else if (c >= '~') {
      snprintf(hexbyte, sizeof(hexbyte), "~%02x", (unsigned)c);
      out->append(hexbyte);
    } else {
      out->push_back(*i);
    }
  }
}
+
// Decode a string produced by append_escaped(), stopping at the '!'
// separator or at end of input.  Returns the number of input bytes
// consumed (the separator is not counted or consumed), or -EINVAL on
// a malformed escape.
static int decode_escaped(const char *p, string *out)
{
  const char *start = p;
  while (*p && *p != '!') {
    char c = *p;
    if (c == '#' || c == '~') {
      unsigned hexval;
      if (sscanf(p + 1, "%2x", &hexval) != 1)
        return -EINVAL;
      out->push_back((char)hexval);
      p += 3;                       // escape char plus two hex digits
    } else {
      out->push_back(c);
      ++p;
    }
  }
  return p - start;
}
+
+// here is a sample (large) key
+// --.7fffffffffffffff.B9FA767A.!0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
+
// Compute the bounds of a collection's object keys in the PREFIX_OBJ
// keyspace: [*start, *end) for normal objects and [*temp_start,
// *temp_end) for the collection's temp objects.  The encoding must
// sort exactly like ghobject_t comparison (see get_object_key).
static void get_coll_key_range(const coll_t& cid, int bits,
			       string *temp_start, string *temp_end,
			       string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    char buf[PATH_MAX];

    // make field ordering match with ghobject_t compare operations
    if (pgid.shard == shard_id_t::NO_SHARD) {
      // otherwise ff will sort *after* 0, not before.
      *start = "--";
    } else {
      snprintf(buf, sizeof(buf), "%02x", (int)pgid.shard);
      start->append(buf);
    }
    *end = *start;
    *temp_start = *start;
    *temp_end = *start;

    // normal objects: pool biased by 2^63 so negative pools sort
    // first; the placement hash is bit-reversed, as in ghobject_t.
    snprintf(buf, sizeof(buf), ".%016llx.%08x.",
	     (unsigned long long)(pgid.pool() + 0x8000000000000000ull),
	     (unsigned)hobject_t::_reverse_bits(pgid.ps()));
    start->append(buf);
    // temp objects live in pool (-2 - pool); same bias applies.
    snprintf(buf, sizeof(buf), ".%016llx.%08x.",
	     (unsigned long long)((-2ll - pgid.pool()) + 0x8000000000000000ull),
	     (unsigned)hobject_t::_reverse_bits(pgid.ps()));
    temp_start->append(buf);

    // the end bound is the reversed hash of the "next" PG; bits is the
    // number of significant hash bits for this PG.
    uint64_t end_hash = hobject_t::_reverse_bits(pgid.ps());
    end_hash += (1ull << (32-bits));
    if (end_hash > 0xffffffff) {
      // wrapped past the 32-bit hash space: "gggggggg" sorts after
      // every 8-digit hex string, so it bounds the whole pool.
      snprintf(buf, sizeof(buf), ".%016llx.gggggggg.",
	       (unsigned long long)(pgid.pool() + 0x8000000000000000ull));
      end->append(buf);
      snprintf(buf, sizeof(buf), ".%016llx.gggggggg.",
	       (unsigned long long)((-2ll - pgid.pool()) + 0x8000000000000000ull));
      temp_end->append(buf);
    } else {
      snprintf(buf, sizeof(buf), ".%016llx.%08x.",
	       (unsigned long long)(pgid.pool() + 0x8000000000000000ull),
	       (unsigned)end_hash);
      end->append(buf);
      snprintf(buf, sizeof(buf), ".%016llx.%08x.",
	       (unsigned long long)((-2ll - pgid.pool()) + 0x8000000000000000ull),
	       (unsigned)end_hash);
      temp_end->append(buf);
    }
  } else if (cid.is_meta()) {
    // meta collection: NO_SHARD ("--"), pool -1 (7fff... after bias).
    *start = "--.7fffffffffffffff.00000000.";
    *end =   "--.7fffffffffffffff.gggggggg.";
    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  } else {
    assert(0);  // only pg and meta collections exist in newstore
  }
}
+
+static int get_key_object(const string& key, ghobject_t *oid);
+
// Encode oid as a flat string key that sorts exactly like ghobject_t
// comparison: shard, pool (biased by 2^63), bit-reversed hash,
// then the escaped namespace / key / name fields separated by
// KEY_SEP_S, then fixed-width snap and generation.
static void get_object_key(const ghobject_t& oid, string *key)
{
  char buf[PATH_MAX];
  char *t = buf;
  char *end = t + sizeof(buf);

  key->clear();

  // make field ordering match with ghobject_t compare operations
  if (oid.shard_id == shard_id_t::NO_SHARD) {
    // otherwise ff will sort *after* 0, not before.
    *key = "--";
  } else {
    snprintf(buf, sizeof(buf), "%02x", (int)oid.shard_id);
    key->append(buf);
  }

  // pool (2^63 bias) and bit-reversed hash, fixed-width hex; the %.*x
  // width is 8 (sizeof(hash) * 2 hex digits).
  t += snprintf(t, end - t, ".%016llx.%.*x.",
		(unsigned long long)(oid.hobj.pool + 0x8000000000000000ull),
		(int)(sizeof(oid.hobj.get_hash())*2),
		(uint32_t)oid.hobj.get_bitwise_key_u32());
  key->append(buf);

  append_escaped(oid.hobj.nspace, key);
  key->append(KEY_SEP_S);

  append_escaped(oid.hobj.get_effective_key(), key);
  key->append(KEY_SEP_S);

  append_escaped(oid.hobj.oid.name, key);
  key->append(KEY_SEP_S);

  // fixed-width snap and generation terminate the key
  t = buf;
  t += snprintf(t, end - t, "%016llx.%016llx",
		(long long unsigned)oid.hobj.snap,
		(long long unsigned)oid.generation);
  key->append(buf);

  // sanity check: every key must decode back to the identical oid
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << *key << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      assert(t == oid);
    }
  }
}
+
// Decode a key produced by get_object_key() back into a ghobject_t.
// Returns 0 on success, or a distinct negative code identifying which
// field failed to parse (used only for debugging).
static int get_key_object(const string& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key[0] == '-') {
    oid->shard_id = shard_id_t::NO_SHARD;  // "--" prefix
  } else {
    unsigned shard;
    r = sscanf(p, "%x", &shard);
    if (r < 1)
      return -1;
    oid->shard_id = shard_id_t(shard);
  }
  // fixed '.' separators around the pool and hash fields
  if (p[2] != '.' || p[19] != '.' || p[28] != '.')
    return -2;

  unsigned hash;
  uint64_t pool;
  r = sscanf(p + 3, "%llx.%x", (unsigned long long*)&pool, &hash);
  if (r < 2)
    return -3;
  oid->hobj.pool = pool - 0x8000000000000000;  // undo the 2^63 bias
  oid->hobj.set_bitwise_key_u32(hash);
  // skip shard(2) + '.' + pool(16) + '.' + hash(8) + '.' = 29 bytes
  p += 3 + 2 + 16 + 8;

  // the three escaped, '!'-separated string fields
  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -4;
  p += r + 1;
  string okey;
  r = decode_escaped(p, &okey);
  if (r < 0)
    return -5;
  p += r + 1;
  r = decode_escaped(p, &oid->hobj.oid.name);
  if (r < 0)
    return -6;
  p += r + 1;

  oid->hobj.set_key(okey);

  r = sscanf(p, "%llx.%llx", (unsigned long long*)&oid->hobj.snap,
	     (unsigned long long*)&oid->generation);
  if (r < 2)
    return -7;
  return 0;
}
+
+
// Build the PREFIX_OVERLAY key for (nid, offset).  Keys only need to
// group by nid (not sort across nids), so the nid is left unpadded;
// the offset is zero-padded so offsets sort numerically within a nid.
void get_overlay_key(uint64_t nid, uint64_t offset, string *out)
{
  char keybuf[64];
  int n = snprintf(keybuf, sizeof(keybuf), "%llx %016llx",
                   (unsigned long long)nid,
                   (unsigned long long)offset);
  out->assign(keybuf, n);
}
+
// Build the omap header key for an onode id.  The trailing '-' sorts
// before the '.' used by entry keys and the '~' used by the tail key
// ('-' < '.' < '~'), so the header precedes all entries for this id.
void get_omap_header(uint64_t id, string *out)
{
  char keybuf[32];
  snprintf(keybuf, sizeof(keybuf), "%016llx-", (unsigned long long)id);
  out->assign(keybuf);
}
+
// Build the key for one user omap entry.  The user key is appended
// verbatim: the fixed-width "<id>." prefix keeps the keyspace clean,
// so no escaping of the user key is required.
void get_omap_key(uint64_t id, const string& key, string *out)
{
  char prefix[32];
  int n = snprintf(prefix, sizeof(prefix), "%016llx.",
                   (unsigned long long)id);
  out->assign(prefix, n);
  *out += key;
}
+
// Re-home an existing omap key under a new onode id, keeping
// everything after the 16-char id prefix (the '.'/'-'/'~' separator
// and any user key) unchanged.
void rewrite_omap_key(uint64_t id, string old, string *out)
{
  char idbuf[32];
  snprintf(idbuf, sizeof(idbuf), "%016llx", (unsigned long long)id);
  *out = idbuf;
  *out += old.substr(16);
}
+
// Strip the 17-char "<id>." prefix from an omap entry key, leaving
// the user's key.
void decode_omap_key(const string& key, string *user_key)
{
  user_key->assign(key, 17, string::npos);
}
+
// Build the omap tail key for an onode id; the '~' suffix sorts after
// every entry key's '.', bounding the id's omap range from above.
void get_omap_tail(uint64_t id, string *out)
{
  char keybuf[32];
  snprintf(keybuf, sizeof(keybuf), "%016llx~", (unsigned long long)id);
  out->assign(keybuf);
}
+
// Build the PREFIX_WAL key for a sequence number; zero-padded so WAL
// entries iterate in sequence order.
void get_wal_key(uint64_t seq, string *out)
{
  char keybuf[32];
  snprintf(keybuf, sizeof(keybuf), "%016llx", (unsigned long long)seq);
  out->assign(keybuf);
}
+
+// Onode
+
// In-memory state for a single object.  A fresh Onode starts with
// zero refs (the first OnodeRef takes the initial ref), is clean, and
// is presumed to exist until the caller decides otherwise.
NewStore::Onode::Onode(const ghobject_t& o, const string& k)
  : nref(0),
    oid(o),
    key(k),  // precomputed PREFIX_OBJ key for this oid
    dirty(false),
    exists(true),
    flush_lock("NewStore::Onode::flush_lock") {
}
+
+// OnodeHashLRU
+
+#undef dout_prefix
+#define dout_prefix *_dout << "newstore.lru(" << this << ") "
+
// Move o to the front (most-recently-used end) of the LRU list.
// Caller must hold 'lock'.
void NewStore::OnodeHashLRU::_touch(OnodeRef o)
{
  lru_list_t::iterator p = lru.iterator_to(*o);
  lru.erase(p);
  lru.push_front(*o);
}
+
// Insert a newly created Onode; oid must not already be present.
// New entries go to the back (cold end) of the LRU until touched.
void NewStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o)
{
  Mutex::Locker l(lock);
  dout(30) << __func__ << " " << oid << " " << o << dendl;
  assert(onode_map.count(oid) == 0);
  onode_map[oid] = o;
  lru.push_back(*o);
}
+
// Return the cached Onode for oid (touching it in the LRU), or a null
// ref on a miss.
NewStore::OnodeRef NewStore::OnodeHashLRU::lookup(const ghobject_t& oid)
{
  Mutex::Locker l(lock);
  dout(30) << __func__ << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
  if (p == onode_map.end()) {
    dout(30) << __func__ << " " << oid << " miss" << dendl;
    return OnodeRef();
  }
  dout(30) << __func__ << " " << oid << " hit " << p->second << dendl;
  _touch(p->second);
  return p->second;
}
+
// Drop every cached Onode (both the LRU list and the hash map).
void NewStore::OnodeHashLRU::clear()
{
  Mutex::Locker l(lock);
  dout(10) << __func__ << dendl;
  lru.clear();
  onode_map.clear();
}
+
// Remove oid from the cache if present; a miss is a silent no-op.
void NewStore::OnodeHashLRU::remove(const ghobject_t& oid)
{
  Mutex::Locker l(lock);
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
  if (p == onode_map.end()) {
    dout(30) << __func__ << " " << oid << " miss" << dendl;
    return;
  }
  dout(30) << __func__ << " " << oid << " hit " << p->second << dendl;
  lru_list_t::iterator pi = lru.iterator_to(*p->second);
  lru.erase(pi);
  onode_map.erase(p);
}
+
// Re-key the cached Onode from old_oid to new_oid.  old_oid must be
// cached; any existing entry at new_oid is evicted first.  The moved
// entry is touched to the hot end of the LRU.
void NewStore::OnodeHashLRU::rename(const ghobject_t& old_oid,
				    const ghobject_t& new_oid)
{
  Mutex::Locker l(lock);
  dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);

  assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    // evict whatever was cached under the destination oid
    lru_list_t::iterator p = lru.iterator_to(*pn->second);
    lru.erase(p);
    onode_map.erase(pn);
  }
  onode_map.insert(make_pair(new_oid, po->second));
  _touch(po->second);
  onode_map.erase(po);
}
+
// Iterate over cached onodes: with a null 'after', return some first
// element; otherwise return the element following 'after'.  Returns
// false when there is nothing further.
//
// NOTE(review): the initial element comes from unordered hash-map
// order while subsequent steps follow LRU-list order, so the overall
// visit order is mixed; callers (e.g. _reap_collections) appear to
// only need full coverage, not a stable order — confirm.
bool NewStore::OnodeHashLRU::get_next(
  const ghobject_t& after,
  pair<ghobject_t,OnodeRef> *next)
{
  Mutex::Locker l(lock);
  dout(20) << __func__ << " after " << after << dendl;

  if (after == ghobject_t()) {
    if (lru.empty()) {
      return false;
    }
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.begin();
    assert(p != onode_map.end());
    next->first = p->first;
    next->second = p->second;
    return true;
  }

  ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(after);
  assert(p != onode_map.end()); // for now
  lru_list_t::iterator pi = lru.iterator_to(*p->second);
  ++pi;
  if (pi == lru.end()) {
    return false;
  }
  next->first = pi->oid;
  next->second = onode_map[pi->oid];
  return true;
}
+
// Evict cold onodes until at most 'max' remain, walking from the cold
// (back) end of the LRU.  Stops early at the first entry still
// referenced outside the cache (nref > 1).  Returns the number evicted.
int NewStore::OnodeHashLRU::trim(int max)
{
  Mutex::Locker l(lock);
  dout(20) << __func__ << " max " << max
	   << " size " << onode_map.size() << dendl;
  int trimmed = 0;
  int num = onode_map.size() - max;
  lru_list_t::iterator p = lru.end();
  if (num)
    --p;  // step from end() onto the last (coldest) element
  while (num > 0) {
    Onode *o = &*p;
    int refs = o->nref.read();
    if (refs > 1) {
      // the cache itself holds one ref; more means the onode is in use
      dout(20) << __func__ << "  " << o->oid << " has " << refs
	       << " refs; stopping with " << num << " left to trim" << dendl;
      break;
    }
    dout(30) << __func__ << "  trim " << o->oid << dendl;
    if (p != lru.begin()) {
      lru.erase(p--);  // erase and step to the next-colder element
    } else {
      lru.erase(p);
      assert(num == 1);
    }
    o->get();  // paranoia: keep o alive across the map erase
    onode_map.erase(o->oid);
    o->put();
    --num;
    ++trimmed;
  }
  return trimmed;
}
+
+// =======================================================
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "newstore(" << store->path << ").collection(" << cid << ") "
+
// A Collection pairs a coll_t with its per-collection onode cache and
// rwlock; cnode (bits etc.) is filled in separately by the caller.
NewStore::Collection::Collection(NewStore *ns, coll_t c)
  : store(ns),
    cid(c),
    lock("NewStore::Collection::lock"),
    onode_map()
{
}
+
// Look up (and optionally create) the Onode for oid: in-memory cache
// first, then the PREFIX_OBJ keyspace.  Returns a null ref when the
// object does not exist and create is false.  Caller must hold the
// collection lock (write-locked when creating).
NewStore::OnodeRef NewStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create)
{
  assert(create ? lock.is_wlocked() : lock.is_locked());

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    // the object must hash into this PG given the collection's bits
    if (!oid.match(cnode.bits, pgid.ps())) {
      derr << __func__ << " oid " << oid << " not part of " << pgid
	   << " bits " << cnode.bits << dendl;
      assert(0);
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(oid, &key);

  dout(20) << __func__ << " oid " << oid << " key '" << key << "'" << dendl;

  bufferlist v;
  int r = store->db->get(PREFIX_OBJ, key, &v);
  dout(20) << " r " << r << " v.len " << v.length() << dendl;
  Onode *on;
  if (v.length() == 0) {
    assert(r == -ENOENT);
    if (!create)
      return OnodeRef();

    // new object: marked dirty so the onode is persisted on commit
    on = new Onode(oid, key);
    on->dirty = true;
  } else {
    // loaded: decode the persisted onode from the kv store
    assert(r >=0);
    on = new Onode(oid, key);
    bufferlist::iterator p = v.begin();
    ::decode(on->onode, p);
  }
  o.reset(on);
  onode_map.add(oid, o);
  return o;
}
+
+
+
+// =======================================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "newstore(" << path << ") "
+
+
// Construct an unmounted NewStore rooted at 'path'.  All thread
// pools, queues and throttles are built here but nothing is started
// until mount().
NewStore::NewStore(CephContext *cct, const string& path)
  : ObjectStore(path),
    cct(cct),
    db(NULL),
    fs(NULL),
    db_path(cct->_conf->newstore_db_path),
    path_fd(-1),
    fsid_fd(-1),
    frag_fd(-1),
    fset_fd(-1),
    mounted(false),
    coll_lock("NewStore::coll_lock"),
    fid_lock("NewStore::fid_lock"),
    nid_lock("NewStore::nid_lock"),
    nid_max(0),
    throttle_ops(cct, "newstore_max_ops", cct->_conf->newstore_max_ops),
    throttle_bytes(cct, "newstore_max_bytes", cct->_conf->newstore_max_bytes),
    // the wal throttles admit the base budget plus the wal-specific
    // budget, so wal replay cannot be starved by foreground ops
    throttle_wal_ops(cct, "newstore_wal_max_ops",
		     cct->_conf->newstore_max_ops +
		     cct->_conf->newstore_wal_max_ops),
    throttle_wal_bytes(cct, "newstore_wal_max_bytes",
		       cct->_conf->newstore_max_bytes +
		       cct->_conf->newstore_wal_max_bytes),
    wal_lock("NewStore::wal_lock"),
    wal_seq(0),
    wal_tp(cct,
	   "NewStore::wal_tp",
	   cct->_conf->newstore_wal_threads,
	   "newstore_wal_threads"),
    wal_wq(this,
	     cct->_conf->newstore_wal_thread_timeout,
	     cct->_conf->newstore_wal_thread_suicide_timeout,
	     &wal_tp),
    finisher(cct),
    fsync_tp(cct,
	     "NewStore::fsync_tp",
	     cct->_conf->newstore_fsync_threads,
	     "newstore_fsync_threads"),
    fsync_wq(this,
	     cct->_conf->newstore_fsync_thread_timeout,
	     cct->_conf->newstore_fsync_thread_suicide_timeout,
	     &fsync_tp),
    aio_thread(this),
    aio_stop(false),
    aio_queue(cct->_conf->newstore_aio_max_queue_depth),
    kv_sync_thread(this),
    kv_lock("NewStore::kv_lock"),
    kv_stop(false),
    logger(NULL),
    reap_lock("NewStore::reap_lock")
{
  _init_logger();
}
+
// The store must already be unmounted and all descriptors closed;
// destruction only tears down the (unstarted) logger.
NewStore::~NewStore()
{
  _shutdown_logger();
  assert(!mounted);
  assert(db == NULL);
  assert(fsid_fd < 0);
  assert(frag_fd < 0);
}
+
// Perf-counter setup/teardown placeholders; not wired up yet (XXX).
void NewStore::_init_logger()
{
  // XXX
}

void NewStore::_shutdown_logger()
{
  // XXX
}
+
// NewStore has no external journal.  NOTE(review): *fsid is left
// untouched here — check whether callers expect it to be zeroed or
// filled in.
int NewStore::peek_journal_fsid(uuid_d *fsid)
{
  return 0;
}
+
// Open the store's root directory and instantiate the filesystem
// driver (FS::create probes via the fd).  Returns negative errno.
int NewStore::_open_path()
{
  assert(path_fd < 0);
  path_fd = ::open(path.c_str(), O_DIRECTORY);
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
	 << dendl;
    return r;
  }
  assert(fs == NULL);
  fs = FS::create(path_fd);
  dout(1) << __func__ << " using fs driver '" << fs->get_name() << "'" << dendl;
  return 0;
}
+
// Close the root directory fd and destroy the FS driver.
void NewStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
  delete fs;
  fs = NULL;
}
+
// Open the existing <path>/fragments directory (mount path).
int NewStore::_open_frag()
{
  assert(frag_fd < 0);
  frag_fd = ::openat(path_fd, "fragments", O_DIRECTORY);
  if (frag_fd < 0) {
    int r = -errno;
    derr << __func__ << " cannot open " << path << "/fragments: "
	 << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
+
// Open <path>/fragments, creating it first if it does not exist yet
// (mkfs path).  Any error other than ENOENT on the first open falls
// through to the final open attempt and is reported from there.
int NewStore::_create_frag()
{
  assert(frag_fd < 0);
  frag_fd = ::openat(path_fd, "fragments", O_DIRECTORY);
  if (frag_fd < 0 && errno == ENOENT) {
    int r = ::mkdirat(path_fd, "fragments", 0755);
    if (r < 0) {
      r = -errno;
      derr << __func__ << " cannot create " << path << "/fragments: "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    frag_fd = ::openat(path_fd, "fragments", O_DIRECTORY);
  }
  if (frag_fd < 0) {
    int r = -errno;
    derr << __func__ << " cannot open created " << path << "/fragments: "
	 << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
+
// Close the current fragment-set dir (if open) and the fragments dir.
void NewStore::_close_frag()
{
  if (fset_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(fset_fd));
    fset_fd = -1;
  }
  VOID_TEMP_FAILURE_RETRY(::close(frag_fd));
  frag_fd = -1;
}
+
// Open (and with create=true, create) the <path>/fsid file.
int NewStore::_open_fsid(bool create)
{
  assert(fsid_fd < 0);
  int flags = O_RDWR;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}
+
+int NewStore::_read_fsid(uuid_d *uuid)
+{
+  char fsid_str[40];
+  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
+  if (ret < 0)
+    return ret;
+  if (ret > 36)
+    fsid_str[36] = 0;
+  if (!uuid->parse(fsid_str))
+    return -EINVAL;
+  return 0;
+}
+
// Truncate, rewrite and fsync the fsid file with the current fsid.
int NewStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
+
// Close the fsid file descriptor (also releases the fcntl lock).
void NewStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}
+
// Take a non-blocking whole-file write lock (fcntl F_SETLK) on the
// fsid file to guard against two daemons using the same store.
int NewStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  l.l_start = 0;
  l.l_len = 0;  // 0 length = lock to EOF
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
	 << " (is another ceph-osd still running?)"
	 << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
+
// Probe whether another process currently holds this store.
bool NewStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}
+
+int NewStore::_open_db()
+{
+  assert(!db);
+  char fn[PATH_MAX];
+  snprintf(fn, sizeof(fn), "%s/db", path.c_str());
+  db = KeyValueDB::create(g_ceph_context,
+			  g_conf->newstore_backend,
+			  fn);
+  if (!db) {
+    derr << __func__ << " error creating db" << dendl;
+    delete db;
+    db = NULL;
+    return -EIO;
+  }
+  db->init(g_conf->newstore_backend_options);
+  stringstream err;
+  if (db->create_and_open(err)) {
+    derr << __func__ << " erroring opening db: " << err.str() << dendl;
+    delete db;
+    db = NULL;
+    return -EIO;
+  }
+  dout(1) << __func__ << " opened " << g_conf->newstore_backend
+	  << " path " << path
+	  << " options " << g_conf->newstore_backend_options << dendl;
+  return 0;
+}
+
// Destroy the KeyValueDB instance (its destructor closes the store).
void NewStore::_close_db()
{
  assert(db);
  delete db;
  db = NULL;
}
+
// Start (or stop) the aio completion thread; both are no-ops when
// newstore_aio is disabled in the config.
int NewStore::_aio_start()
{
  if (g_conf->newstore_aio) {
    dout(10) << __func__ << dendl;
    int r = aio_queue.init();
    if (r < 0)
      return r;
    aio_thread.create();
  }
  return 0;
}

void NewStore::_aio_stop()
{
  if (g_conf->newstore_aio) {
    dout(10) << __func__ << dendl;
    aio_stop = true;     // signal the thread, then wait for it
    aio_thread.join();
    aio_stop = false;
    aio_queue.shutdown();
  }
}
+
// Populate coll_map by scanning every key in the PREFIX_COLL keyspace
// and decoding each collection's cnode.  Unparseable keys are logged
// and skipped.
int NewStore::_open_collections()
{
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      CollectionRef c(new Collection(this, cid));
      bufferlist bl;
      db->get(PREFIX_COLL, it->key(), &bl);
      bufferlist::iterator p = bl.begin();
      ::decode(c->cnode, p);
      dout(20) << __func__ << " opened " << cid << dendl;
      coll_map[cid] = c;
    } else {
      dout(20) << __func__ << " unrecognized collection " << it->key() << dendl;
    }
  }
  return 0;
}
+
// Initialize a new store at 'path': create/lock the fsid file,
// create the fragments dir, optionally symlink an external db path,
// and create the kv database.  Idempotent if an fsid already exists
// and matches.  Cleanup on error unwinds via the goto ladder below.
int NewStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 && old_fsid.is_zero()) {
    // fresh store: use the provided fsid or generate one
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    r = _write_fsid();
    if (r < 0)
      goto out_close_fsid;
  } else {
    // existing store: a provided fsid must match what is on disk
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
	   << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
    dout(1) << __func__ << " fsid is already set to " << fsid << dendl;
  }

  r = _create_frag();
  if (r < 0)
    goto out_close_fsid;

  // optional: host the kv database on a separate device/path
  if (db_path != "") {
    r = symlinkat(db_path.c_str(), path_fd, "db");
    if (r < 0)
      goto out_close_frag;
  }
  r = _open_db();
  if (r < 0)
    goto out_close_frag;

  // FIXME: superblock

  dout(10) << __func__ << " success" << dendl;
  r = 0;
  _close_db();

 out_close_frag:
  _close_frag();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();
  return r;
}
+
// Bring the store online: open and lock the on-disk state, recover
// fid/nid allocators, load collections, start aio, replay the WAL,
// and finally start the worker threads.  Errors unwind in reverse
// order via the goto ladder.
int NewStore::mount()
{
  dout(1) << __func__ << " path " << path << dendl;

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_frag();
  if (r < 0)
    goto out_fsid;

  // FIXME: superblock, features

  r = _open_db();
  if (r < 0)
    goto out_frag;

  r = _recover_next_fid();
  if (r < 0)
    goto out_db;

  r = _recover_next_nid();
  if (r < 0)
    goto out_db;

  r = _open_collections();
  if (r < 0)
    goto out_db;

  r = _aio_start();
  if (r < 0)
    goto out_db;

  // replay must happen before threads start accepting new work
  r = _wal_replay();
  if (r < 0)
    goto out_aio;

  finisher.start();
  fsync_tp.start();
  wal_tp.start();
  kv_sync_thread.create();

  mounted = true;
  return 0;

 out_aio:
  _aio_stop();
 out_db:
  _close_db();
 out_frag:
  _close_frag();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
+
+int NewStore::umount()
+{
+  assert(mounted);
+  dout(1) << __func__ << dendl;
+
+  _sync();
+  _reap_collections();
+
+  dout(20) << __func__ << " stopping fsync_wq" << dendl;
+  fsync_tp.stop();
+  dout(20) << __func__ << " stopping aio" << dendl;
+  _aio_stop();
+  dout(20) << __func__ << " stopping kv thread" << dendl;
+  _kv_stop();
+  dout(20) << __func__ << " draining wal_wq" << dendl;
+  wal_wq.drain();
+  dout(20) << __func__ << " stopping wal_tp" << dendl;
+  wal_tp.stop();
+  dout(20) << __func__ << " draining finisher" << dendl;
+  finisher.wait_for_empty();
+  dout(20) << __func__ << " stopping finisher" << dendl;
+  finisher.stop();
+  dout(20) << __func__ << " closing" << dendl;
+
+  mounted = false;
+  if (fset_fd >= 0)
+    VOID_TEMP_FAILURE_RETRY(::close(fset_fd));
+  _close_db();
+  _close_frag();
+  _close_fsid();
+  _close_path();
+  return 0;
+}
+
// Block until all queued fsync work has completed and the kv sync
// thread has committed everything queued so far.
void NewStore::_sync()
{
  dout(10) << __func__ << dendl;

  dout(20) << " flushing fsync wq" << dendl;
  fsync_wq.flush();

  kv_lock.Lock();
  while (!kv_committing.empty() ||
	 !kv_queue.empty()) {
    dout(20) << " waiting for kv to commit" << dendl;
    kv_sync_cond.Wait(kv_lock);
  }
  kv_lock.Unlock();

  dout(10) << __func__ << " done" << dendl;
}
+
// Report filesystem statistics for the store's mount point; returns
// negative errno on failure (asserting unless EIO failures are
// permitted by newstore_fail_eio).
int NewStore::statfs(struct statfs *buf)
{
  if (::statfs(path.c_str(), buf) < 0) {
    int r = -errno;
    assert(!g_conf->newstore_fail_eio || r != -EIO);
    return r;
  }
  return 0;
}
+
+// ---------------
+// cache
+
// Return the cached Collection for cid, or a null ref if unknown.
NewStore::CollectionRef NewStore::_get_collection(coll_t cid)
{
  RWLock::RLocker l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}
+
// Defer final teardown of a removed collection to _reap_collections().
void NewStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c->cid << dendl;
  Mutex::Locker l(reap_lock);
  removed_collections.push_back(c);
}
+
// Tear down collections queued by _queue_reap_collection() once all
// of their (removed) onodes have finished flushing.
void NewStore::_reap_collections()
{
  reap_lock.Lock();

  list<CollectionRef> removed_colls;
  removed_colls.swap(removed_collections);
  reap_lock.Unlock();

  for (list<CollectionRef>::iterator p = removed_colls.begin();
       p != removed_colls.end();
       ++p) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c->cid << dendl;
    {
      pair<ghobject_t,OnodeRef> next;
      while (c->onode_map.get_next(next.first, &next)) {
	assert(!next.second->exists);
	if (!next.second->flush_txns.empty()) {
	  // still flushing; give up for now.
	  // NOTE(review): returning here drops the remaining entries of
	  // removed_colls (already swapped out of removed_collections)
	  // and skips the reap_cond signal — confirm a later call
	  // always re-drives this path.
	  dout(10) << __func__ << " " << c->cid << " " << next.second->oid
		   << " flush_txns " << next.second->flush_txns << dendl;
	  return;
	}
      }
    }
    c->onode_map.clear();
    dout(10) << __func__ << " " << c->cid << " done" << dendl;
  }

  dout(10) << __func__ << " all reaped" << dendl;
  reap_cond.Signal();
}
+
+// ---------------
+// read operations
+
+bool NewStore::exists(coll_t cid, const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return false;
+  RWLock::RLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists)
+    return false;
+  return true;
+}
+
// Fill *st with size/blocks/nlink for oid.  Block size is reported as
// a fixed 4 KB; allow_eio is accepted for interface compatibility but
// unused here.
int NewStore::stat(
    coll_t cid,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio)
{
  dout(10) << __func__ << " " << cid << " " << oid << dendl;
  CollectionRef c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists)
    return -ENOENT;
  st->st_size = o->onode.size;
  st->st_blksize = 4096;
  st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
  st->st_nlink = 1;
  return 0;
}
+
// Read 'length' bytes at 'offset' from oid into bl.  A request of
// offset == 0 and length == 0 means "read the whole object".  Returns
// bytes read or negative errno.
int NewStore::read(
  coll_t cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  bool allow_eio)
{
  dout(15) << __func__ << " " << cid << " " << oid
	   << " " << offset << "~" << length
	   << dendl;
  bl.clear();
  CollectionRef c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  RWLock::RLocker l(c->lock);

  int r;

  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }

  // 0~0 reads the entire object
  if (offset == length && offset == 0)
    length = o->onode.size;

  r = _do_read(o, offset, length, bl, op_flags);

 out:
  dout(10) << __func__ << " " << cid << " " << oid
	   << " " << offset << "~" << length
	   << " = " << r << dendl;
  return r;
}
+
// Assemble a read of [offset, offset+length) from the object's
// overlay entries (small writes stored in the kv store, keyspace
// PREFIX_OVERLAY) and data fragments (files under fragments/).
// Overlays take precedence over fragments; anything covered by
// neither is zero-filled.  The request is clamped to the object size.
// Returns the number of bytes placed in bl, or negative errno.
int NewStore::_do_read(
    OnodeRef o,
    uint64_t offset,
    size_t length,
    bufferlist& bl,
    uint32_t op_flags)
{
  map<uint64_t,fragment_t>::iterator fp, fend;
  map<uint64_t,overlay_t>::iterator op, oend;
  int r;
  int fd = -1;           // currently open fragment file, if any
  fid_t cur_fid;

  dout(20) << __func__ << " " << offset << "~" << length << " size "
	   << o->onode.size << dendl;

  if (offset > o->onode.size) {
    r = 0;
    goto out;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  // wait for any in-flight writes to this onode to land first
  o->flush();

  r = 0;

  // loop over overlays and data fragments.  overlays take precedence.
  // start each cursor at the entry at or before 'offset'.
  fend = o->onode.data_map.end();
  fp = o->onode.data_map.lower_bound(offset);
  if (fp != o->onode.data_map.begin()) {
    --fp;
  }
  oend = o->onode.overlay_map.end();
  op = o->onode.overlay_map.lower_bound(offset);
  if (op != o->onode.overlay_map.begin()) {
    --op;
  }
  while (length > 0) {
    // skip entries that end before the current offset.
    // NOTE(review): this overlay test uses '<' where the fragment test
    // below uses '<='; an overlay ending exactly at 'offset' is
    // handled (harmlessly, as a zero-length read) by the branch below.
    if (op != oend && op->first + op->second.length < offset) {
      dout(20) << __func__ << " skip overlay " << op->first << " " << op->second
	       << dendl;
      ++op;
      continue;
    }
    if (fp != fend && fp->first + fp->second.length <= offset) {
      dout(30) << __func__ << " skip frag " << fp->first << "~" << fp->second
	       << dendl;
      ++fp;
      continue;
    }

    // overlay?
    if (op != oend && op->first <= offset) {
      uint64_t x_off = offset - op->first + op->second.value_offset;
      uint64_t x_len = MIN(op->first + op->second.length - offset, length);
      dout(20) << __func__ << "  overlay " << op->first << " " << op->second
	       << " use " << x_off << "~" << x_len << dendl;
      bufferlist v;
      string key;
      get_overlay_key(o->onode.nid, op->second.key, &key);
      db->get(PREFIX_OVERLAY, key, &v);
      bufferlist frag;
      frag.substr_of(v, x_off, x_len);
      bl.claim_append(frag);
      ++op;
      length -= x_len;
      offset += x_len;
      continue;
    }

    // read at most up to the next overlay, which takes precedence
    unsigned x_len = length;
    if (op != oend &&
	op->first > offset &&
	op->first - offset < x_len) {
      x_len = op->first - offset;
    }

    // frag?
    if (fp != fend && fp->first <= offset) {
      // open the fragment's file only when it changes
      if (fp->second.fid != cur_fid) {
	cur_fid = fp->second.fid;
	if (fd >= 0) {
	  VOID_TEMP_FAILURE_RETRY(::close(fd));
	}
	fd = _open_fid(cur_fid, O_RDONLY);
	if (fd < 0) {
	  r = fd;
	  goto out;
	}
      }
      // NOTE(review): x_off subtracts fp->second.offset but the seek
      // below does not add it back, while the debug line prints
      // x_off + fp->second.offset as the fid offset — only consistent
      // if fragment offsets are always 0; confirm.
      uint64_t x_off = offset - fp->first - fp->second.offset;
      x_len = MIN(x_len, fp->second.length - x_off);
      dout(30) << __func__ << " data " << fp->first << " " << fp->second
	       << " use " << x_off << "~" << x_len
	       << " fid " << cur_fid << " offset " << x_off + fp->second.offset
	       << dendl;
      r = ::lseek64(fd, x_off, SEEK_SET);
      if (r < 0) {
	r = -errno;
	goto out;
      }
      bufferlist t;
      r = t.read_fd(fd, x_len);
      if (r < 0) {
	goto out;
      }
      bl.claim_append(t);
      if ((unsigned)r < x_len) {
	// short read from a sparse/short fragment file: pad with zeros
	dout(10) << __func__ << "   short read " << r << " < " << x_len
		 << " from " << cur_fid << dendl;
	bufferptr z(x_len - r);
	z.zero();
	bl.append(z);
      }
      offset += x_len;
      length -= x_len;
      if (x_off + x_len == fp->second.length) {
	++fp;
      }
      continue;
    }

    // zero.
    dout(30) << __func__ << " zero " << offset << "~" << x_len << dendl;
    bufferptr bp(x_len);
    bp.zero();
    bl.push_back(bp);
    offset += x_len;
    length -= x_len;
    continue;
  }
  r = bl.length();

 out:
  if (fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(fd));
  }
  return r;
}
+
+// Build a fiemap-style extent map for [offset, offset+len): 'm' maps
+// extent start -> extent length for ranges backed by data (overlay or
+// fragment); holes are excluded.  offset == len == 0 means "whole
+// object", matching read().  The encoded map is written to 'bl'.
+// Returns 0, or -ENOENT if the collection/object does not exist.
+int NewStore::fiemap(
+  coll_t cid,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t len,
+  bufferlist& bl)
+{
+  map<uint64_t, uint64_t> m;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    return -ENOENT;
+  }
+
+  // 0~0 is the conventional "whole object" request
+  if (offset == len && offset == 0)
+    len = o->onode.size;
+
+  if (offset > o->onode.size)
+    return 0;
+
+  if (offset + len > o->onode.size) {
+    len = o->onode.size - offset;
+  }
+
+  dout(20) << __func__ << " " << offset << "~" << len << " size "
+	   << o->onode.size << dendl;
+
+  map<uint64_t,fragment_t>::iterator fp, fend;
+  map<uint64_t,overlay_t>::iterator op, oend;
+
+  // loop over overlays and data fragments.  overlays take precedence.
+  // Cursors are backed up one entry so a straddling entry is included.
+  fend = o->onode.data_map.end();
+  fp = o->onode.data_map.lower_bound(offset);
+  if (fp != o->onode.data_map.begin()) {
+    --fp;
+  }
+  oend = o->onode.overlay_map.end();
+  op = o->onode.overlay_map.lower_bound(offset);
+  if (op != o->onode.overlay_map.begin()) {
+    --op;
+  }
+  // 'start' marks the beginning of the current run of contiguous data;
+  // a run is flushed into 'm' when a hole is reached (and at the tail).
+  uint64_t start = offset;
+  while (len > 0) {
+    if (op != oend && op->first + op->second.length < offset) {
+      ++op;
+      continue;
+    }
+    if (fp != fend && fp->first + fp->second.length <= offset) {
+      ++fp;
+      continue;
+    }
+
+    // overlay?
+    if (op != oend && op->first <= offset) {
+      uint64_t x_len = MIN(op->first + op->second.length - offset, len);
+      //m[offset] = x_len;
+      dout(30) << __func__ << " get overlay, off =  " << offset << " len=" << x_len << dendl;
+      len -= x_len;
+      offset += x_len;
+      ++op;
+      continue;
+    }
+
+    unsigned x_len = len;
+    if (op != oend &&
+	op->first > offset &&
+	op->first - offset < x_len) {
+      x_len = op->first - offset;
+    }
+
+    // frag?
+    if (fp != fend && fp->first <= offset) {
+      uint64_t x_off = offset - fp->first - fp->second.offset;
+      x_len = MIN(x_len, fp->second.length - x_off);
+      //m[offset] = x_len;
+      dout(30) << __func__ << " get frag, off =  " << offset << " len=" << x_len << dendl;
+      len -= x_len;
+      offset += x_len;
+      if (x_off + x_len == fp->second.length)
+	++fp;
+      continue;
+    }
+    // we are seeing a hole, time to add an entry to fiemap.
+    // NOTE(review): if the hole starts exactly at 'start' (offset ==
+    // start) this records a zero-length extent — confirm callers
+    // tolerate it.
+    m[start] = offset - start;
+    dout(20) << __func__ << " get fiemap entry, off =  " << start << " len=" << m[start] << dendl;
+    offset += x_len;
+    start = offset;
+    len -= x_len;
+    continue;
+  }
+  //add tailing
+  if (offset - start != 0) {
+    m[start] = offset - start;
+    dout(20) << __func__ << " get fiemap entry, off =  " << start << " len=" << m[start] << dendl;
+  }
+
+  ::encode(m, bl);
+  // 'len' here is the residual request length (normally 0)
+  dout(20) << __func__ << " " << offset << "~" << len << " size = 0 (" << m << ")" << dendl;
+  return 0;
+}
+
+// Fetch a single xattr 'name' of 'oid' into 'value'.
+// Returns 0 on success, -ENOENT if the collection/object is missing,
+// -ENODATA if the attr is not set.
+int NewStore::getattr(
+  coll_t cid,
+  const ghobject_t& oid,
+  const char *name,
+  bufferptr& value)
+{
+  dout(15) << __func__ << " " << cid << " " << oid << " " << name << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+  int r;
+  string k(name);
+
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+
+  {
+    // single find() instead of count() + operator[]: avoids a second
+    // map lookup and cannot accidentally default-insert an entry
+    map<string,bufferptr>::iterator p = o->onode.attrs.find(k);
+    if (p == o->onode.attrs.end()) {
+      r = -ENODATA;
+      goto out;
+    }
+    value = p->second;
+  }
+  r = 0;
+ out:
+  dout(10) << __func__ << " " << cid << " " << oid << " " << name
+	   << " = " << r << dendl;
+  return r;
+}
+
+// Copy the complete xattr set of 'oid' into 'aset'.
+// Returns 0 on success or -ENOENT if the collection/object is missing.
+int NewStore::getattrs(
+  coll_t cid,
+  const ghobject_t& oid,
+  map<string,bufferptr>& aset)
+{
+  dout(15) << __func__ << " " << cid << " " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+
+  int r = -ENOENT;
+  OnodeRef o = c->get_onode(oid, false);
+  if (o && o->exists) {
+    aset = o->onode.attrs;
+    r = 0;
+  }
+  dout(10) << __func__ << " " << cid << " " << oid
+	   << " = " << r << dendl;
+  return r;
+}
+
+// Append the id of every known collection to 'ls'.
+int NewStore::list_collections(vector<coll_t>& ls)
+{
+  RWLock::RLocker l(coll_lock);
+  ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
+  while (p != coll_map.end()) {
+    ls.push_back(p->first);
+    ++p;
+  }
+  return 0;
+}
+
+// True if collection 'c' is present in the in-memory collection map.
+bool NewStore::collection_exists(coll_t c)
+{
+  RWLock::RLocker l(coll_lock);
+  return coll_map.count(c) > 0;
+}
+
+// Probe for up to 5 objects; the collection is empty iff none come back.
+bool NewStore::collection_empty(coll_t cid)
+{
+  dout(15) << __func__ << " " << cid << dendl;
+  vector<ghobject_t> objects;
+  ghobject_t next;
+  int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), true, 5,
+			  &objects, &next);
+  if (r < 0)
+    return false;  // fixme?
+  const bool empty = objects.empty();
+  dout(10) << __func__ << " " << cid << " = " << (int)empty << dendl;
+  return empty;
+}
+
+// List up to 'max' objects of 'cid' that sort (bitwise) strictly after
+// 'start' and no later than 'end'.  Object keys live in two adjacent
+// key ranges (temp, then non-temp); the scan may switch from the temp
+// range to the main range mid-iteration.  *pnext receives the object
+// to continue from, or ghobject_t::get_max() when listing is complete.
+// Only bitwise sort order is supported.
+int NewStore::collection_list(
+  coll_t cid, ghobject_t start, ghobject_t end,
+  bool sort_bitwise, int max,
+  vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+  dout(15) << __func__ << " " << cid
+	   << " start " << start << " end " << end << " max " << max << dendl;
+  if (!sort_bitwise)
+    return -EOPNOTSUPP;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+  int r = 0;
+  KeyValueDB::Iterator it;
+  string temp_start_key, temp_end_key;
+  string start_key, end_key;
+  bool set_next = false;
+  // 'pend' borrows c_str() storage from temp_end_key/end_key; those
+  // strings must not be mutated while pend is live.
+  const char *pend;
+  bool temp;
+
+  // allow pnext == NULL by pointing it at a local
+  ghobject_t static_next;
+  if (!pnext)
+    pnext = &static_next;
+
+  if (start == ghobject_t::get_max())
+    goto out;
+  get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key,
+		     &start_key, &end_key);
+  dout(20) << __func__ << " range " << temp_start_key << " to "
+	   << temp_end_key << " and " << start_key << " to " << end_key
+	   << " start " << start << dendl;
+  it = db->get_iterator(PREFIX_OBJ);
+  if (start == ghobject_t()) {
+    // beginning of collection: start in the temp namespace
+    it->upper_bound(temp_start_key);
+    temp = true;
+  } else {
+    // resume strictly after 'start'
+    string k;
+    get_object_key(start, &k);
+    if (start.hobj.is_temp()) {
+      temp = true;
+      assert(k >= temp_start_key && k < temp_end_key);
+    } else {
+      temp = false;
+      assert(k >= start_key && k < end_key);
+    }
+    it->upper_bound(k);
+  }
+  if (end.hobj.is_max()) {
+    pend = temp ? temp_end_key.c_str() : end_key.c_str();
+  } else {
+    // overwrite end_key (the collection range end) with the key of 'end'
+    get_object_key(end, &end_key);
+    if (end.hobj.is_temp()) {
+      if (temp)
+	pend = end_key.c_str();
+      else
+	goto out;  // already past the temp namespace; nothing to list
+    } else {
+      pend = temp ? temp_end_key.c_str() : end_key.c_str();
+    }
+  }
+  dout(30) << __func__ << " pend " << pend << dendl;
+  while (true) {
+    if (!it->valid() || strcmp(it->key().c_str(), pend) > 0) {
+      if (!it->valid())
+	dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+      else
+	dout(20) << __func__ << " key " << it->key() << " > " << end << dendl;
+      if (temp) {
+	if (end.hobj.is_temp()) {
+	  break;
+	}
+	dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+	temp = false;
+	it->upper_bound(start_key);
+	pend = end_key.c_str();
+	dout(30) << __func__ << " pend " << pend << dendl;
+	continue;
+      }
+      break;
+    }
+    dout(20) << __func__ << " key " << it->key() << dendl;
+    ghobject_t oid;
+    // NOTE(review): this inner 'r' shadows the function-level return
+    // value 'r'; only the assert observes it.
+    int r = get_key_object(it->key(), &oid);
+    assert(r == 0);
+    ls->push_back(oid);
+    if (ls->size() >= (unsigned)max) {
+      *pnext = oid;
+      set_next = true;
+      break;
+    }
+    it->next();
+  }
+  if (!set_next) {
+    *pnext = ghobject_t::get_max();
+  }
+ out:
+  dout(10) << __func__ << " " << cid
+	   << " start " << start << " end " << end << " max " << max
+	   << " = " << r << ", ls.size() = " << ls->size()
+	   << ", next = " << *pnext << dendl;
+  return r;
+}
+
+// omap reads
+
+// Snapshot the object's omap key range (head/tail sentinel keys derived
+// from onode.omap_head) and position the db iterator at the head.  An
+// object without an omap_head leaves 'it' untouched; such an iterator
+// reports !valid().
+NewStore::OmapIteratorImpl::OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
+  : c(c), o(o), it(it)
+{
+  RWLock::RLocker l(c->lock);
+  if (o->onode.omap_head) {
+    get_omap_header(o->onode.omap_head, &head);
+    get_omap_tail(o->onode.omap_head, &tail);
+    it->lower_bound(head);
+  }
+}
+
+// Position at the first key of this object's omap, or invalidate the
+// iterator if the object has no omap.
+int NewStore::OmapIteratorImpl::seek_to_first()
+{
+  RWLock::RLocker l(c->lock);
+  if (!o->onode.omap_head) {
+    it = KeyValueDB::Iterator();
+    return 0;
+  }
+  it->lower_bound(head);
+  return 0;
+}
+
+// Seek strictly past user key 'after' within this object's omap range;
+// invalidate the iterator if the object has no omap.
+int NewStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+  RWLock::RLocker l(c->lock);
+  if (!o->onode.omap_head) {
+    it = KeyValueDB::Iterator();
+    return 0;
+  }
+  string key;
+  get_omap_key(o->onode.omap_head, after, &key);
+  it->upper_bound(key);
+  return 0;
+}
+
+// Seek to the first key at or after user key 'to' within this object's
+// omap range; invalidate the iterator if the object has no omap.
+int NewStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+  RWLock::RLocker l(c->lock);
+  if (!o->onode.omap_head) {
+    it = KeyValueDB::Iterator();
+    return 0;
+  }
+  string key;
+  get_omap_key(o->onode.omap_head, to, &key);
+  it->lower_bound(key);
+  return 0;
+}
+
+// Valid while the object has an omap, the db iterator is live, and we
+// have not yet passed this object's tail sentinel key.
+bool NewStore::OmapIteratorImpl::valid()
+{
+  RWLock::RLocker l(c->lock);
+  return o->onode.omap_head && it->valid() && it->raw_key().second <= tail;
+}
+
+// Advance the underlying db iterator; -1 if the object has no omap.
+int NewStore::OmapIteratorImpl::next()
+{
+  RWLock::RLocker l(c->lock);
+  if (!o->onode.omap_head)
+    return -1;
+  it->next();
+  return 0;
+}
+
+// Strip the omap_head prefix from the raw db key to recover the
+// user-visible key at the current position (iterator must be valid()).
+string NewStore::OmapIteratorImpl::key()
+{
+  RWLock::RLocker l(c->lock);
+  assert(it->valid());
+  string raw = it->raw_key().second;
+  string user;
+  decode_omap_key(raw, &user);
+  return user;
+}
+
+// Return the value at the current position (iterator must be valid()).
+bufferlist NewStore::OmapIteratorImpl::value()
+{
+  RWLock::RLocker l(c->lock);
+  assert(it->valid());
+  return it->value();
+}
+
+// Fetch the omap header and the full key->value map of 'oid' by
+// scanning PREFIX_OMAP from the object's head sentinel to its tail
+// sentinel.  An object with no omap is not an error (returns 0 with
+// empty output).  Returns 0 or -ENOENT.
+int NewStore::omap_get(
+  coll_t cid,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  bufferlist *header,      ///< [out] omap header
+  map<string, bufferlist> *out ///< [out] Key to value map
+  )
+{
+  dout(15) << __func__ << " " << cid << " oid " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+  int r = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head)
+    goto out;
+  // wait for any in-flight txns on this onode before reading
+  o->flush();
+  {
+    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+    string head, tail;
+    get_omap_header(o->onode.omap_head, &head);
+    get_omap_tail(o->onode.omap_head, &tail);
+    it->lower_bound(head);
+    while (it->valid()) {
+      if (it->key() == head) {
+	dout(30) << __func__ << "  got header" << dendl;
+	*header = it->value();
+      } else if (it->key() >= tail) {
+	dout(30) << __func__ << "  reached tail" << dendl;
+	break;
+      } else {
+	string user_key;
+	decode_omap_key(it->key(), &user_key);
+	dout(30) << __func__ << "  got " << it->key() << " -> " << user_key << dendl;
+	assert(it->key() < tail);
+	(*out)[user_key] = it->value();
+      }
+      it->next();
+    }
+  }
+ out:
+  dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+// Fetch only the omap header blob via a point lookup of the head
+// sentinel key.  A missing header is not an error.  Returns 0 or
+// -ENOENT.
+int NewStore::omap_get_header(
+  coll_t cid,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  bufferlist *header,      ///< [out] omap header
+  bool allow_eio ///< [in] don't assert on eio
+  )
+{
+  dout(15) << __func__ << " " << cid << " oid " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+  int r = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head)
+    goto out;
+  // wait for any in-flight txns on this onode before reading
+  o->flush();
+  {
+    string head;
+    get_omap_header(o->onode.omap_head, &head);
+    if (db->get(PREFIX_OMAP, head, header) >= 0) {
+      dout(30) << __func__ << "  got header" << dendl;
+    } else {
+      dout(30) << __func__ << "  no header" << dendl;
+    }
+  }
+ out:
+  dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+// Collect the set of omap key names of 'oid' by scanning from the head
+// sentinel (which is skipped) to the tail sentinel.  Returns 0 or
+// -ENOENT.
+int NewStore::omap_get_keys(
+  coll_t cid,              ///< [in] Collection containing oid
+  const ghobject_t &oid, ///< [in] Object containing omap
+  set<string> *keys      ///< [out] Keys defined on oid
+  )
+{
+  dout(15) << __func__ << " " << cid << " oid " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+  int r = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head)
+    goto out;
+  // wait for any in-flight txns on this onode before reading
+  o->flush();
+  {
+    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+    string head, tail;
+    get_omap_header(o->onode.omap_head, &head);
+    get_omap_tail(o->onode.omap_head, &tail);
+    it->lower_bound(head);
+    while (it->valid()) {
+      if (it->key() == head) {
+	dout(30) << __func__ << "  skipping head" << dendl;
+	it->next();
+	continue;
+      }
+      if (it->key() >= tail) {
+	dout(30) << __func__ << "  reached tail" << dendl;
+	break;
+      }
+      string user_key;
+      decode_omap_key(it->key(), &user_key);
+      dout(30) << __func__ << "  got " << it->key() << " -> " << user_key << dendl;
+      assert(it->key() < tail);
+      keys->insert(user_key);
+      it->next();
+    }
+  }
+ out:
+  dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+// Point-lookup each requested key under the object's omap prefix;
+// absent keys are simply omitted from 'out'.  Returns 0 or -ENOENT.
+int NewStore::omap_get_values(
+  coll_t cid,                    ///< [in] Collection containing oid
+  const ghobject_t &oid,       ///< [in] Object containing omap
+  const set<string> &keys,     ///< [in] Keys to get
+  map<string, bufferlist> *out ///< [out] Returned keys and values
+  )
+{
+  dout(15) << __func__ << " " << cid << " oid " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+  int r = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head)
+    goto out;
+  // wait for any in-flight txns on this onode before reading
+  o->flush();
+  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+    string key;
+    get_omap_key(o->onode.omap_head, *p, &key);
+    bufferlist val;
+    if (db->get(PREFIX_OMAP, key, &val) >= 0) {
+      dout(30) << __func__ << "  got " << key << " -> " << *p << dendl;
+      out->insert(make_pair(*p, val));
+    }
+  }
+ out:
+  dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+// For each requested key, point-lookup its omap entry and record the
+// key in 'out' if present.  Returns 0 or -ENOENT.
+int NewStore::omap_check_keys(
+  coll_t cid,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  const set<string> &keys, ///< [in] Keys to check
+  set<string> *out         ///< [out] Subset of keys defined on oid
+  )
+{
+  dout(15) << __func__ << " " << cid << " oid " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c)
+    return -ENOENT;
+  RWLock::RLocker l(c->lock);
+  int r = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head)
+    goto out;
+  // wait for any in-flight txns on this onode before reading
+  o->flush();
+  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+    string key;
+    get_omap_key(o->onode.omap_head, *p, &key);
+    bufferlist val;
+    if (db->get(PREFIX_OMAP, key, &val) >= 0) {
+      dout(30) << __func__ << "  have " << key << " -> " << *p << dendl;
+      out->insert(*p);
+    } else {
+      dout(30) << __func__ << "  miss " << key << " -> " << *p << dendl;
+    }
+  }
+ out:
+  dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+// Return an omap iterator for 'oid', or an empty ObjectMapIterator if
+// the collection or object does not exist.
+ObjectMap::ObjectMapIterator NewStore::get_omap_iterator(
+  coll_t cid,              ///< [in] collection
+  const ghobject_t &oid  ///< [in] object
+  )
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+  CollectionRef c = _get_collection(cid);
+  if (!c) {
+    dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
+    return ObjectMap::ObjectMapIterator();
+  }
+  RWLock::RLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  // also check o->exists, for consistency with the other omap accessors
+  // (omap_get et al. treat a non-existent onode as ENOENT)
+  if (!o || !o->exists) {
+    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
+    return ObjectMap::ObjectMapIterator();
+  }
+  // wait for any in-flight txns on this onode before reading
+  o->flush();
+  dout(10) << __func__ << " header = " << o->onode.omap_head << dendl;
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
+}
+
+
+// -----------------
+// write helpers
+
+// Load the persisted onode-id high-water mark ("nid_max") at mount
+// time.  A missing or undecodable key leaves nid_max at 0 (fresh
+// store), which is why the decode exception is deliberately swallowed.
+int NewStore::_recover_next_nid()
+{
+  nid_max = 0;
+  bufferlist bl;
+  db->get(PREFIX_SUPER, "nid_max", &bl);
+  try {
+    ::decode(nid_max, bl);
+  } catch (buffer::error& e) {
+  }
+  dout(1) << __func__ << " old nid_max " << nid_max << dendl;
+  nid_last = nid_max;
+  return 0;
+}
+
+// Assign a fresh nid to 'o' if it does not already have one.  nid_max
+// is raised in batches of newstore_nid_prealloc so the SUPER key is
+// rewritten only occasionally rather than on every assignment.
+void NewStore::_assign_nid(TransContext *txc, OnodeRef o)
+{
+  if (o->onode.nid)
+    return;
+  Mutex::Locker l(nid_lock);
+  o->onode.nid = ++nid_last;
+  dout(20) << __func__ << " " << o->onode.nid << dendl;
+  if (nid_last > nid_max) {
+    nid_max += g_conf->newstore_nid_prealloc;
+    bufferlist bl;
+    ::encode(nid_max, bl);
+    txc->t->set(PREFIX_SUPER, "nid_max", bl);
+    dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+  }
+}
+
+// Load the persisted fragment-id high-water mark ("fid_max") at mount
+// time and reopen the current fragment subdirectory (fset).  As with
+// _recover_next_nid, a missing/undecodable key is treated as a fresh
+// store and the decode exception is swallowed.
+int NewStore::_recover_next_fid()
+{
+  bufferlist bl;
+  db->get(PREFIX_SUPER, "fid_max", &bl);
+  try {
+    ::decode(fid_max, bl);
+  } catch (buffer::error& e) {
+  }
+  dout(1) << __func__ << " old fid_max " << fid_max << dendl;
+  fid_last = fid_max;
+
+  if (fid_last.fset > 0) {
+    char s[32];
+    snprintf(s, sizeof(s), "%u", fid_last.fset);
+    assert(fset_fd < 0);
+    fset_fd = ::openat(frag_fd, s, O_DIRECTORY, 0644);
+    if (fset_fd < 0) {
+      int r = -errno;
+      // NOTE(review): "cannot open created" is copy-pasted from
+      // _create_fid; nothing was created here — message is misleading.
+      derr << __func__ << " cannot open created " << path << "/fragments/"
+	 << s << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+// Open an existing fragment file.  Prefers open-by-handle when a
+// handle was recorded and newstore_open_by_handle is enabled, falling
+// back to the fragments/<fset>/<fno> path on failure.  Returns an fd
+// on success or -errno.
+int NewStore::_open_fid(fid_t fid, unsigned flags)
+{
+  if (fid.handle.length() && g_conf->newstore_open_by_handle) {
+    int fd = fs->open_handle(path_fd, fid.handle, flags);
+    if (fd >= 0) {
+      dout(30) << __func__ << " " << fid << " = " << fd
+	       << " (open by handle)" << dendl;
+      return fd;
+    }
+    // handle path failed; fall through to opening by name
+    int err = -errno;
+    dout(30) << __func__ << " " << fid << " = " << cpp_strerror(err)
+	     << " (with open by handle, falling back to file name)" << dendl;
+  }
+
+  char fn[32];
+  snprintf(fn, sizeof(fn), "%u/%u", fid.fset, fid.fno);
+  int fd = ::openat(frag_fd, fn, flags);
+  if (fd < 0) {
+    int r = -errno;
+    derr << __func__ << " on " << fid << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  dout(30) << __func__ << " " << fid << " = " << fd << dendl;
+  return fd;
+}
+
+// Allocate a new fragment file id and create/open the file, returning
+// its fd or -errno.  fids are allocated sequentially within an fset
+// directory (capped at newstore_max_dir_size entries per directory);
+// fid_max is raised in newstore_fid_prealloc batches so the SUPER key
+// is rewritten only occasionally.
+int NewStore::_create_fid(TransContext *txc, fid_t *fid, unsigned flags)
+{
+  {
+    Mutex::Locker l(fid_lock);
+    if (fid_last.fset > 0 &&
+	fid_last.fno > 0 &&
+	fid_last.fset == fid_max.fset &&
+	fid_last.fno < g_conf->newstore_max_dir_size) {
+      ++fid_last.fno;
+      if (fid_last.fno >= fid_max.fno) {
+	// raise fid_max, same fset, capping to max_dir_size
+	fid_max.fno = min(fid_max.fno + g_conf->newstore_fid_prealloc, g_conf->newstore_max_dir_size);
+	assert(fid_max.fno >= fid_last.fno);
+	bufferlist bl;
+	::encode(fid_max, bl);
+	txc->t->set(PREFIX_SUPER, "fid_max", bl);
+	dout(10) << __func__ << " fid_max now " << fid_max << dendl;
+      }
+    } else {
+      // new fset
+      ++fid_last.fset;
+      fid_last.fno = 1;
+      dout(10) << __func__ << " creating " << fid_last.fset << dendl;
+      char s[32];
+      snprintf(s, sizeof(s), "%u", fid_last.fset);
+      int r = ::mkdirat(frag_fd, s, 0755);
+      if (r < 0) {
+	r = -errno;
+	derr << __func__ << " cannot create " << path << "/fragments/"
+	     << s << ": " << cpp_strerror(r) << dendl;
+	return r;
+      }
+      if (fset_fd >= 0)
+	VOID_TEMP_FAILURE_RETRY(::close(fset_fd));
+      fset_fd = ::openat(frag_fd, s, O_DIRECTORY, 0644);
+      if (fset_fd < 0) {
+	r = -errno;
+	derr << __func__ << " cannot open created " << path << "/fragments/"
+	     << s << ": " << cpp_strerror(r) << dendl;
+	// bugfix: propagate the error instead of falling through with an
+	// invalid fset_fd (the openat(fset_fd, ...) below would then
+	// fail); matches the handling in _recover_next_fid.
+	return r;
+      }
+
+      fid_max = fid_last;
+      fid_max.fno = g_conf->newstore_fid_prealloc;
+      bufferlist bl;
+      ::encode(fid_max, bl);
+      txc->t->set(PREFIX_SUPER, "fid_max", bl);
+      dout(10) << __func__ << " fid_max now " << fid_max << dendl;
+    }
+    *fid = fid_last;
+  }
+
+  dout(10) << __func__ << " " << fid_last << dendl;
+  char s[32];
+  snprintf(s, sizeof(s), "%u", fid->fno);
+  int fd = ::openat(fset_fd, s, flags | O_CREAT, 0644);
+  if (fd < 0) {
+    int r = -errno;
+    derr << __func__ << " cannot create " << path << "/fragments/"
+	 << *fid << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // optionally record a filesystem handle for faster reopen later
+  if (g_conf->newstore_open_by_handle) {
+    int r = fs->get_handle(fd, &fid->handle);
+    if (r < 0) {
+      dout(30) << __func__ << " get_handle got " << cpp_strerror(r) << dendl;
+    } else {
+      dout(30) << __func__ << " got handle: ";
+      bufferlist bl;
+      bl.append(fid->handle);
+      bl.hexdump(*_dout);
+      *_dout << dendl;
+    }
+  }
+
+  dout(30) << __func__ << " " << *fid << " = " << fd << dendl;
+  return fd;
+}
+
+// Unlink fragments/<fset>/<fno>; returns 0 or -errno on failure.
+int NewStore::_remove_fid(fid_t fid)
+{
+  char fn[32];
+  snprintf(fn, sizeof(fn), "%u/%u", fid.fset, fid.fno);
+  if (::unlinkat(frag_fd, fn, 0) < 0)
+    return -errno;
+  return 0;
+}
+
+// Allocate a TransContext bound to 'osr' with a fresh kv transaction
+// and enqueue it on the sequencer.
+NewStore::TransContext *NewStore::_txc_create(OpSequencer *osr)
+{
+  TransContext *txc = new TransContext(osr);
+  txc->t = db->get_transaction();
+  osr->queue_new(txc);
+  dout(20) << __func__ << " osr " << osr << " = " << txc << dendl;
+  return txc;
+}
+
+// State machine driver for a transaction context.  Loops, advancing
+// 'txc' through prepare -> (aio wait) -> (fsync wait) -> kv submit ->
+// (wal apply) -> finish, and returns whenever the next step completes
+// asynchronously (the completion path re-enters this function).
+void NewStore::_txc_state_proc(TransContext *txc)
+{
+  while (true) {
+    dout(10) << __func__ << " txc " << txc
+	     << " " << txc->get_state_name() << dendl;
+    switch (txc->state) {
+    case TransContext::STATE_PREPARE:
+      if (!txc->pending_aios.empty()) {
+	txc->state = TransContext::STATE_AIO_WAIT;
+	_txc_aio_submit(txc);
+	return;
+      }
+      // ** fall-thru **
+
+    case TransContext::STATE_AIO_WAIT:
+      if (!txc->sync_items.empty()) {
+	txc->state = TransContext::STATE_FSYNC_WAIT;
+	if (!g_conf->newstore_sync_io) {
+	  _txc_queue_fsync(txc);
+	  return;
+	}
+	_txc_do_sync_fsync(txc);
+      }
+      _txc_finish_io(txc);  // may trigger blocked txc's too
+      return;
+
+    case TransContext::STATE_IO_DONE:
+      assert(txc->osr->qlock.is_locked());  // see _txc_finish_io
+      txc->state = TransContext::STATE_KV_QUEUED;
+      if (!g_conf->newstore_sync_transaction) {
+	Mutex::Locker l(kv_lock);
+	if (g_conf->newstore_sync_submit_transaction) {
+	  db->submit_transaction(txc->t);
+	}
+	kv_queue.push_back(txc);
+	kv_cond.SignalOne();
+	return;
+      }
+      db->submit_transaction_sync(txc->t);
+      break;
+
+    case TransContext::STATE_KV_QUEUED:
+      txc->state = TransContext::STATE_KV_DONE;
+      _txc_finish_kv(txc);
+      // ** fall-thru **
+
+    case TransContext::STATE_KV_DONE:
+      if (txc->wal_txn) {
+	txc->state = TransContext::STATE_WAL_QUEUED;
+	if (g_conf->newstore_sync_wal_apply) {
+	  _wal_apply(txc);
+	} else {
+	  wal_wq.queue(txc);
+	}
+	return;
+      }
+      txc->state = TransContext::STATE_FINISHING;
+      break;
+
+    case TransContext::STATE_WAL_APPLYING:
+      if (!txc->pending_aios.empty()) {
+	txc->state = TransContext::STATE_WAL_AIO_WAIT;
+	_txc_aio_submit(txc);
+	return;
+      }
+      // ** fall-thru **
+
+    case TransContext::STATE_WAL_AIO_WAIT:
+      _wal_finish(txc);
+      return;
+
+    case TransContext::STATE_WAL_CLEANUP:
+      txc->state = TransContext::STATE_FINISHING;
+      // ** fall-thru **
+
+    // fixed redundant TransContext::TransContext:: qualification (the
+    // injected class name made it compile, but it was a typo)
+    case TransContext::STATE_FINISHING:
+      _txc_finish(txc);
+      return;
+
+    default:
+      derr << __func__ << " unexpected txc " << txc
+	   << " state " << txc->get_state_name() << dendl;
+      assert(0 == "unexpected txc state");
+      return;
+    }
+  }
+}
+
+// Worker for the fsync workqueue: fdatasync + close one queued fd and,
+// when this was the txc's last outstanding fsync item, advance the txc
+// through _txc_finish_io.  An fdatasync failure is fatal.
+void NewStore::_txc_process_fsync(fsync_item *i)
+{
+  dout(20) << __func__ << " txc " << i->txc << dendl;
+  int r = ::fdatasync(i->fd);
+  if (r < 0) {
+    r = -errno;
+    derr << __func__ << " error from fdatasync on " << i->fd
+	 << " txc " << i->txc
+	 << ": " << cpp_strerror(r) << dendl;
+    assert(0 == "error from fdatasync");
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(i->fd));
+  if (i->txc->finish_fsync()) {
+    _txc_finish_io(i->txc);
+  }
+  dout(20) << __func__ << " txc " << i->txc << " done" << dendl;
+}
+
+// Mark 'txc' IO_DONE and, holding the sequencer lock, advance every
+// txc at the front of the queue that is now unblocked — preserving
+// commit order even though fsyncs/aios complete out of order.  The
+// STATE_IO_DONE branch of _txc_state_proc relies on qlock being held
+// here (see its assert).
+void NewStore::_txc_finish_io(TransContext *txc)
+{
+  dout(20) << __func__ << " " << txc << dendl;
+
+  /*
+   * we need to preserve the order of kv transactions,
+   * even though fsyncs will complete in any order.
+   */
+
+  OpSequencer *osr = txc->osr.get();
+  Mutex::Locker l(osr->qlock);
+  txc->state = TransContext::STATE_IO_DONE;
+
+  // scan backwards: if any earlier txc is still doing IO we must wait;
+  // otherwise find the first txc that is ready to advance.
+  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
+  while (p != osr->q.begin()) {
+    --p;
+    if (p->state < TransContext::STATE_IO_DONE) {
+      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
+	       << p->get_state_name() << dendl;
+      return;
+    }
+    if (p->state > TransContext::STATE_IO_DONE) {
+      ++p;
+      break;
+    }
+  }
+  // advance this txc and any consecutive followers already at IO_DONE
+  do {
+    _txc_state_proc(&*p++);
+  } while (p != osr->q.end() &&
+	   p->state == TransContext::STATE_IO_DONE);
+}
+
+// Serialize every onode touched by this transaction into its kv
+// transaction and journal any wal items.  Each onode is marked as
+// having this txn in flight (flush_txns) so Onode::flush() can wait
+// for it to commit.  Always returns 0.
+int NewStore::_txc_finalize(OpSequencer *osr, TransContext *txc)
+{
+  dout(20) << __func__ << " osr " << osr << " txc " << txc
+	   << " onodes " << txc->onodes << dendl;
+
+  // finalize onodes
+  for (set<OnodeRef>::iterator p = txc->onodes.begin();
+       p != txc->onodes.end();
+       ++p) {
+    bufferlist bl;
+    ::encode((*p)->onode, bl);
+    txc->t->set(PREFIX_OBJ, (*p)->key, bl);
+
+    Mutex::Locker l((*p)->flush_lock);
+    (*p)->flush_txns.insert(txc);
+  }
+
+  // journal wal items
+  if (txc->wal_txn) {
+    txc->wal_txn->seq = wal_seq.inc();
+    bufferlist bl;
+    ::encode(*txc->wal_txn, bl);
+    string key;
+    get_wal_key(txc->wal_txn->seq, &key);
+    txc->t->set(PREFIX_WAL, key, bl);
+  }
+
+  return 0;
+}
+
+// Push all of this txc's pending fsync items onto the shared fsync
+// workqueue under a single lock hold, waking a worker per item.
+void NewStore::_txc_queue_fsync(TransContext *txc)
+{
+  dout(20) << __func__ << " txc " << txc << dendl;
+  fsync_wq.lock();
+  for (list<fsync_item>::iterator p = txc->sync_items.begin();
+       p != txc->sync_items.end();
+       ++p) {
+    fsync_wq._enqueue(&*p);
+    fsync_wq._wake();
+  }
+  fsync_wq.unlock();
+}
+
+// Synchronous variant of the fsync path: fdatasync + close each of the
+// txc's fds inline.  An fsync failure is fatal, matching
+// _txc_process_fsync.
+void NewStore::_txc_do_sync_fsync(TransContext *txc)
+{
+  dout(20) << __func__ << " txc " << txc << dendl;
+  for (list<fsync_item>::iterator p = txc->sync_items.begin();
+       p != txc->sync_items.end(); ++p) {
+    dout(30) << __func__ << " fsync " << p->fd << dendl;
+    int r = ::fdatasync(p->fd);
+    if (r < 0) {
+      r = -errno;
+      derr << __func__ << " fsync: " << cpp_strerror(r) << dendl;
+      assert(0 == "fsync error");
+    }
+    VOID_TEMP_FAILURE_RETRY(::close(p->fd));
+  }
+}
+
+// Deliver readable/commit notifications once the kv transaction is
+// durable, then release the ops/bytes throttle taken at queue time.
+void NewStore::_txc_finish_kv(TransContext *txc)
+{
+  dout(20) << __func__ << " txc " << txc << dendl;
+
+  // warning: we're calling onreadable_sync inside the sequencer lock
+  if (txc->onreadable_sync) {
+    txc->onreadable_sync->complete(0);
+    txc->onreadable_sync = NULL;
+  }
+  // the async completions go through the finisher thread
+  if (txc->onreadable) {
+    finisher.queue(txc->onreadable);
+    txc->onreadable = NULL;
+  }
+  if (txc->oncommit) {
+    finisher.queue(txc->oncommit);
+    txc->oncommit = NULL;
+  }
+  while (!txc->oncommits.empty()) {
+    finisher.queue(txc->oncommits.front());
+    txc->oncommits.pop_front();
+  }
+
+  throttle_ops.put(txc->ops);
+  throttle_bytes.put(txc->bytes);
+}
+
+// Final stage of a transaction: detach it from its onodes (waking any
+// Onode::flush() waiters), reap collections it removed, release the
+// wal throttle, mark the txc DONE, and prune the sequencer queue.
+void NewStore::_txc_finish(TransContext *txc)
+{
+  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+  assert(txc->state == TransContext::STATE_FINISHING);
+
+  for (set<OnodeRef>::iterator p = txc->onodes.begin();
+       p != txc->onodes.end();
+       ++p) {
+    Mutex::Locker l((*p)->flush_lock);
+    dout(20) << __func__ << " onode " << *p << " had " << (*p)->flush_txns
+	     << dendl;
+    assert((*p)->flush_txns.count(txc));
+    (*p)->flush_txns.erase(txc);
+    if ((*p)->flush_txns.empty())
+      (*p)->flush_cond.Signal();
+  }
+
+  // clear out refs
+  txc->onodes.clear();
+
+  while (!txc->removed_collections.empty()) {
+    _queue_reap_collection(txc->removed_collections.front());
+    txc->removed_collections.pop_front();
+  }
+
+  throttle_wal_ops.put(txc->ops);
+  throttle_wal_bytes.put(txc->bytes);
+
+  OpSequencerRef osr = txc->osr;
+  osr->qlock.Lock();
+  txc->state = TransContext::STATE_DONE;
+  osr->qlock.Unlock();
+
+  _osr_reap_done(osr.get());
+}
+
+// Pop and delete completed txcs from the front of the sequencer queue
+// in order, stopping at the first txc that is not yet DONE.  Signals
+// qcond for anyone waiting on queue drain.
+void NewStore::_osr_reap_done(OpSequencer *osr)
+{
+  Mutex::Locker l(osr->qlock);
+  dout(20) << __func__ << " osr " << osr << dendl;
+  while (!osr->q.empty()) {
+    TransContext *txc = &osr->q.front();
+    dout(20) << __func__ << "  txc " << txc << " " << txc->get_state_name()
+	     << dendl;
+    if (txc->state != TransContext::STATE_DONE) {
+      break;
+    }
+
+    // opportunistically trim the onode cache of the first collection
+    // this txn touched
+    if (txc->first_collection) {
+      txc->first_collection->onode_map.trim(g_conf->newstore_onode_map_size);
+    }
+
+    osr->q.pop_front();
+    delete txc;
+    osr->qcond.Signal();
+  }
+}
+
+// Poll thread: reap completed aios in batches of up to 16, close their
+// fds, and advance a txc's state machine once its last outstanding aio
+// completes.  Runs until aio_stop is set.
+void NewStore::_aio_thread()
+{
+  dout(10) << __func__ << " start" << dendl;
+  while (!aio_stop) {
+    dout(40) << __func__ << " polling" << dendl;
+    int max = 16;
+    // NOTE(review): variable-length array (GCC extension) since 'max'
+    // is not a compile-time constant.
+    FS::aio_t *aio[max];
+    int r = aio_queue.get_next_completed(g_conf->newstore_aio_poll_ms,
+					 aio, max);
+    if (r < 0) {
+      derr << __func__ << " got " << cpp_strerror(r) << dendl;
+    }
+    if (r > 0) {
+      dout(30) << __func__ << " got " << r << " completed aios" << dendl;
+      for (int i = 0; i < r; ++i) {
+	TransContext *txc = static_cast<TransContext*>(aio[i]->priv);
+	int left = txc->num_aio.dec();
+	dout(10) << __func__ << " finished aio " << aio[i] << " txc " << txc
+		 << " state " << txc->get_state_name() << ", "
+		 << left << " aios left" << dendl;
+	VOID_TEMP_FAILURE_RETRY(::close(aio[i]->fd));
+	if (left == 0) {
+	  _txc_state_proc(txc);
+	}
+      }
+    }
+  }
+  dout(10) << __func__ << " end" << dendl;
+}
+
+// Background thread: batches queued kv transactions, forces them durable
+// with a single synchronous kv commit, and in the same commit deletes the
+// WAL and overlay keys of wal txns that have finished applying.  Runs
+// until kv_stop is set and both queues are drained.
+void NewStore::_kv_sync_thread()
+{
+  dout(10) << __func__ << " start" << dendl;
+  kv_lock.Lock();
+  while (true) {
+    assert(kv_committing.empty());
+    assert(wal_cleaning.empty());
+    if (kv_queue.empty() && wal_cleanup_queue.empty()) {
+      if (kv_stop)
+	break;
+      dout(20) << __func__ << " sleep" << dendl;
+      kv_sync_cond.Signal();  // tell waiters the kv queue is fully flushed
+      kv_cond.Wait(kv_lock);
+      dout(20) << __func__ << " wake" << dendl;
+    } else {
+      dout(20) << __func__ << " committing " << kv_queue.size()
+	       << " cleaning " << wal_cleanup_queue.size() << dendl;
+      // grab both batches, then drop the lock for the slow commit
+      kv_committing.swap(kv_queue);
+      wal_cleaning.swap(wal_cleanup_queue);
+      utime_t start = ceph_clock_now(NULL);
+      kv_lock.Unlock();
+
+      if (!g_conf->newstore_sync_submit_transaction) {
+	// txns were not submitted at queue time; submit (async) now --
+	// the submit_transaction_sync below makes everything durable
+	for (std::deque<TransContext *>::iterator it = kv_committing.begin();
+	     it != kv_committing.end();
+	     ++it) {
+	  db->submit_transaction((*it)->t);
+	}
+      }
+
+      // one transaction to force a sync.  clean up wal keys while we
+      // are at it.
+      KeyValueDB::Transaction txc_cleanup_sync = db->get_transaction();
+      for (std::deque<TransContext *>::iterator it = wal_cleaning.begin();
+	    it != wal_cleaning.end();
+	    ++it) {
+	wal_transaction_t& wt =*(*it)->wal_txn;
+	// cleanup the data in overlays
+	for (list<wal_op_t>::iterator p = wt.ops.begin(); p != wt.ops.end(); ++p) {
+	  for (vector<overlay_t>::iterator q = p->overlays.begin();
+               q != p->overlays.end(); ++q) {
+            string key;
+            get_overlay_key(p->nid, q->key, &key);
+	    txc_cleanup_sync->rmkey(PREFIX_OVERLAY, key);
+	  }
+	}
+	// cleanup the shared overlays. this may double delete something we
+	// did above, but that's less work than doing careful ref counting
+	// of the overlay key/value pairs.
+	for (vector<string>::iterator p = wt.shared_overlay_keys.begin();
+             p != wt.shared_overlay_keys.end(); ++p) {
+	  txc_cleanup_sync->rmkey(PREFIX_OVERLAY, *p);
+	}
+	// cleanup the wal
+	string key;
+	get_wal_key(wt.seq, &key);
+	txc_cleanup_sync->rmkey(PREFIX_WAL, key);
+      }
+      db->submit_transaction_sync(txc_cleanup_sync);  // the durability point
+      utime_t finish = ceph_clock_now(NULL);
+      utime_t dur = finish - start;
+      dout(20) << __func__ << " committed " << kv_committing.size()
+	       << " cleaned " << wal_cleaning.size()
+	       << " in " << dur << dendl;
+      // advance each txc we just made durable to its next state
+      while (!kv_committing.empty()) {
+	TransContext *txc = kv_committing.front();
+	_txc_state_proc(txc);
+	kv_committing.pop_front();
+      }
+      while (!wal_cleaning.empty()) {
+	TransContext *txc = wal_cleaning.front();
+	_txc_state_proc(txc);
+	wal_cleaning.pop_front();
+      }
+
+      // this is as good a place as any ...
+      _reap_collections();
+
+      kv_lock.Lock();
+    }
+  }
+  kv_lock.Unlock();
+  dout(10) << __func__ << " finish" << dendl;
+}
+
+// Append a fresh (empty) op record to the txc's WAL transaction,
+// lazily allocating the wal_transaction_t on first use, and return
+// a pointer to it for the caller to fill in.
+wal_op_t *NewStore::_get_wal_op(TransContext *txc)
+{
+  if (txc->wal_txn == NULL) {
+    txc->wal_txn = new wal_transaction_t;
+  }
+  wal_transaction_t *wt = txc->wal_txn;
+  wt->ops.push_back(wal_op_t());
+  return &wt->ops.back();
+}
+
+// Apply a txc's write-ahead-log transaction to the data files, then
+// drive the txc state machine forward.  Synchronous unless
+// _do_wal_transaction queued aios into txc->pending_aios.
+int NewStore::_wal_apply(TransContext *txc)
+{
+  wal_transaction_t& wt = *txc->wal_txn;
+  dout(20) << __func__ << " txc " << txc << " seq " << wt.seq << dendl;
+  txc->state = TransContext::STATE_WAL_APPLYING;
+
+  assert(txc->pending_aios.empty());
+  int r = _do_wal_transaction(wt, txc);
+  assert(r == 0);
+
+  _txc_state_proc(txc);
+  return 0;
+}
+
+// WAL application for this txc is complete: queue it for WAL-record
+// cleanup by the kv sync thread and wake that thread up.
+int NewStore::_wal_finish(TransContext *txc)
+{
+  wal_transaction_t& wt = *txc->wal_txn;
+  // fix: the txc pointer belongs after the " txc " label (it was
+  // previously streamed after seq, leaving the label empty)
+  dout(20) << __func__ << " txc " << txc << " seq " << wt.seq << dendl;
+
+  Mutex::Locker l(kv_lock);
+  txc->state = TransContext::STATE_WAL_CLEANUP;
+  wal_cleanup_queue.push_back(txc);
+  kv_cond.SignalOne();
+  return 0;
+}
+
+// Apply the ops recorded in a wal_transaction_t to the data files.
+// If txc is non-NULL and aio is enabled, page-aligned writes are queued
+// as aios on txc->pending_aios instead of being performed synchronously;
+// all synchronously-written fds are fdatasync'd before returning.
+// Returns 0 on success or a negative errno.
+int NewStore::_do_wal_transaction(wal_transaction_t& wt,
+				  TransContext *txc)
+{
+  vector<int> sync_fds;
+  sync_fds.reserve(wt.ops.size());
+
+  // read all the overlay data first for apply
+  _do_read_all_overlays(wt);
+
+  for (list<wal_op_t>::iterator p = wt.ops.begin(); p != wt.ops.end(); ++p) {
+    switch (p->op) {
+    case wal_op_t::OP_WRITE:
+      {
+	dout(20) << __func__ << " write " << p->fid << " "
+		 << p->offset << "~" << p->length << dendl;
+	unsigned flags = O_RDWR;
+	if (g_conf->newstore_o_direct &&
+	    (p->offset & ~CEPH_PAGE_MASK) == 0 &&
+	    (p->length & ~CEPH_PAGE_MASK) == 0) {
+	  dout(20) << __func__ << " page-aligned io, using O_DIRECT, "
+		   << p->data.buffers().size() << " buffers" << dendl;
+	  flags |= O_DIRECT | O_DSYNC;
+	  if (!p->data.is_page_aligned()) {
+	    dout(20) << __func__ << " rebuilding buffer to be page-aligned"
+		     << dendl;
+	    p->data.rebuild();
+	  }
+	}
+	int fd = _open_fid(p->fid, flags);
+	if (fd < 0)
+	  return fd;
+#ifdef HAVE_LIBAIO
+	if (g_conf->newstore_aio && txc && (flags & O_DIRECT)) {
+	  // queue as aio; fd is closed later by _aio_thread on completion
+	  txc->pending_aios.push_back(FS::aio_t(txc, fd));
+	  FS::aio_t& aio = txc->pending_aios.back();
+	  p->data.prepare_iov(&aio.iov);
+	  aio.pwritev(p->offset);
+	  dout(2) << __func__ << " prepared aio " << &aio << dendl;
+	} else
+#endif
+	{
+	  // NOTE(review): the early returns below leak fd (and the fds
+	  // already accumulated in sync_fds) -- worth fixing.
+	  int r = ::lseek64(fd, p->offset, SEEK_SET);
+	  if (r < 0) {
+	    r = -errno;
+	    derr << __func__ << " lseek64 on " << fd << " got: "
+		 << cpp_strerror(r) << dendl;
+	    return r;
+	  }
+	  r = p->data.write_fd(fd);
+	  if (r < 0) {
+	    derr << __func__ << " write_fd on " << fd << " got: "
+		 << cpp_strerror(r) << dendl;
+	    return r;
+	  }
+	  if (!(flags & O_DIRECT))
+	    sync_fds.push_back(fd);  // defer fdatasync to the batch below
+	  else
+	    VOID_TEMP_FAILURE_RETRY(::close(fd));  // O_DSYNC already synced
+	}
+      }
+      break;
+    case wal_op_t::OP_ZERO:
+      {
+	dout(20) << __func__ << " zero " << p->fid << " "
+		 << p->offset << "~" << p->length << dendl;
+	int fd = _open_fid(p->fid, O_RDWR);
+	if (fd < 0)
+	  return fd;
+	int r = fs->zero(fd, p->offset, p->length);
+	if (r < 0) {
+	  derr << __func__ << " zero on " << fd << " got: "
+	       << cpp_strerror(r) << dendl;
+	  return r;
+	}
+	// FIXME: do aio fdatasync?
+	sync_fds.push_back(fd);
+      }
+      break;
+    case wal_op_t::OP_TRUNCATE:
+      {
+	dout(20) << __func__ << " truncate " << p->fid << " "
+		 << p->offset << dendl;
+	int fd = _open_fid(p->fid, O_RDWR);
+	if (fd < 0)
+	  return fd;
+	int r = ::ftruncate(fd, p->offset);
+	if (r < 0) {
+	  r = -errno;
+	  derr << __func__ << " truncate on " << fd << " got: "
+	       << cpp_strerror(r) << dendl;
+	  return r;
+	}
+	// note: we are not syncing this truncate.  instead, we are
+	// careful about only reading as much of the fragment as we
+	// know is valid, and truncating to expected size before
+	// extending the file.
+      }
+      break;
+
+    case wal_op_t::OP_REMOVE:
+      dout(20) << __func__ << " remove " << p->fid << dendl;
+      _remove_fid(p->fid);
+      // note: we do not fsync the directory.  instead, we tolerate
+      // leaked fragments in a crash.  in practice, this will be
+      // exceedingly rare.
+      break;
+
+    default:
+      assert(0 == "unrecognized wal op");
+    }
+  }
+
+  // flush and close every fd written without O_DIRECT|O_DSYNC
+  for (vector<int>::iterator p = sync_fds.begin();
+       p != sync_fds.end();
+       ++p) {
+    int r = ::fdatasync(*p);
+    assert(r == 0);
+    VOID_TEMP_FAILURE_RETRY(::close(*p));
+  }
+
+  return 0;
+}
+
+// Replay any write-ahead-log records left over from a previous run,
+// then delete them from the kv store in one synchronous transaction.
+// Returns 0 on success, -EIO on a corrupt record, or a negative errno
+// from the underlying apply.
+int NewStore::_wal_replay()
+{
+  dout(10) << __func__ << " start" << dendl;
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_WAL);
+  it->lower_bound(string());
+  KeyValueDB::Transaction cleanup = db->get_transaction();
+  int count = 0;
+  while (it->valid()) {
+    bufferlist bl = it->value();
+    bufferlist::iterator p = bl.begin();
+    wal_transaction_t wt;
+    try {
+      ::decode(wt, p);
+    } catch (buffer::error& e) {
+      derr << __func__ << " failed to decode wal txn " << it->key() << dendl;
+      return -EIO;
+    }
+
+    dout(20) << __func__ << " replay " << it->key() << dendl;
+    // note: _do_wal_transaction reads the overlay data into each op's
+    // data buffer itself.  Calling _do_read_all_overlays here as well
+    // (as this code previously did) appended the overlay payload twice,
+    // doubling the data written during replay.
+    int r = _do_wal_transaction(wt, NULL);  // don't bother with aio here
+    if (r < 0)
+      return r;
+    cleanup->rmkey(PREFIX_WAL, it->key());
+    ++count;
+    it->next();
+  }
+  if (count) {
+    dout(10) << __func__ << " cleanup" << dendl;
+    db->submit_transaction_sync(cleanup);
+  }
+  dout(10) << __func__ << " completed " << count << " events" << dendl;
+  return 0;
+}
+
+// ---------------------------
+// transactions
+
+// ObjectStore entry point: attach the transaction list to an OpSequencer,
+// build a TransContext from it, acquire throttle budget, and kick off the
+// txc state machine.  Always returns 0; internal errors assert.
+int NewStore::queue_transactions(
+    Sequencer *posr,
+    list<Transaction*>& tls,
+    TrackedOpRef op,
+    ThreadPool::TPHandle *handle)
+{
+  Context *onreadable;
+  Context *ondisk;
+  Context *onreadable_sync;
+  ObjectStore::Transaction::collect_contexts(
+    tls, &onreadable, &ondisk, &onreadable_sync);
+  int r;
+
+  // set up the sequencer
+  OpSequencer *osr;
+  assert(posr);
+  if (posr->p) {
+    // reuse the OpSequencer hanging off the caller's Sequencer
+    osr = static_cast<OpSequencer *>(posr->p.get());
+    dout(5) << __func__ << " existing " << osr << " " << *osr << dendl;
+  } else {
+    osr = new OpSequencer;
+    osr->parent = posr;
+    posr->p = osr;
+    dout(5) << __func__ << " new " << osr << " " << *osr << dendl;
+  }
+
+  // prepare
+  TransContext *txc = _txc_create(osr);
+  txc->onreadable = onreadable;
+  txc->onreadable_sync = onreadable_sync;
+  txc->oncommit = ondisk;
+
+  for (list<Transaction*>::iterator p = tls.begin(); p != tls.end(); ++p) {
+    (*p)->set_osr(osr);
+    txc->ops += (*p)->get_num_ops();
+    txc->bytes += (*p)->get_num_bytes();
+    _txc_add_transaction(txc, *p);
+  }
+
+  r = _txc_finalize(osr, txc);
+  assert(r == 0);
+
+  // acquire throttle budget; the wal throttles are released when the
+  // txc reaches STATE_DONE (see _txc_finish), the others presumably
+  // earlier in the pipeline -- NOTE(review): release site not visible
+  // here, confirm the non-wal puts are balanced.
+  throttle_ops.get(txc->ops);
+  throttle_bytes.get(txc->bytes);
+  throttle_wal_ops.get(txc->ops);
+  throttle_wal_bytes.get(txc->bytes);
+
+  // execute (start)
+  _txc_state_proc(txc);
+  return 0;
+}
+
+// Submit all of a txc's pending aios to the kernel.  Races with
+// completion: the moment the final aio is submitted, _aio_thread may
+// finish it and advance/free the txc, so we snapshot iterators first
+// and never touch txc after the last submit.
+void NewStore::_txc_aio_submit(TransContext *txc)
+{
+  int num = txc->pending_aios.size();
+  dout(10) << __func__ << " txc " << txc << " submitting " << num << dendl;
+  assert(num > 0);
+  txc->num_aio.set(num);
+
+  // move these aside, and get our end iterator position now, as the
+  // aios might complete as soon as they are submitted and queue more
+  // wal aio's.
+  list<FS::aio_t>::iterator e = txc->submitted_aios.begin();
+  txc->submitted_aios.splice(e, txc->pending_aios);
+  list<FS::aio_t>::iterator p = txc->submitted_aios.begin();
+  assert(p != e);
+  bool done = false;
+  while (!done) {
+    FS::aio_t& aio = *p;
+    dout(20) << __func__ << " aio " << &aio << " fd " << aio.fd << dendl;
+    for (vector<iovec>::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q)
+      dout(30) << __func__ << "  iov " << (void*)q->iov_base
+	       << " len " << q->iov_len << dendl;
+    dout(30) << " fd " << aio.fd << " offset " << lseek64(aio.fd, 0, SEEK_CUR)
+	     << dendl;
+
+    // be careful: as soon as we submit aio we race with completion.
+    // since we are holding a ref take care not to dereference txc at
+    // all after that point.
+    list<FS::aio_t>::iterator cur = p;
+    ++p;
+    done = (p == e);
+
+    // do not dereference txc (or it's contents) after we submit (if
+    // done == true and we don't loop)
+    int retries = 0;
+    int r = aio_queue.submit(*cur, &retries);
+    if (retries)
+      derr << __func__ << " retries " << retries << dendl;
+    if (r) {
+      derr << " aio submit got " << cpp_strerror(r) << dendl;
+      assert(r == 0);
+    }
+  }
+}
+
+// Decode and apply every op in a client Transaction against the txc.
+// Collections are resolved once up front; the first one referenced is
+// remembered on the txc for onode-cache trimming at reap time.  Most
+// per-op errors are fatal (assert) except the historically-tolerated
+// ENOENT/ENODATA cases.  Always returns 0.
+int NewStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+  Transaction::iterator i = t->begin();
+  int pos = 0;
+
+  vector<CollectionRef> cvec(i.colls.size());
+  unsigned j = 0;
+  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+       ++p, ++j) {
+    cvec[j] = _get_collection(*p);
+
+    // note first collection we reference
+    if (!j && !txc->first_collection)
+      txc->first_collection = cvec[j];
+  }
+
+  while (i.have_op()) {
+    Transaction::Op *op = i.decode_op();
+    int r = 0;
+    CollectionRef &c = cvec[op->cid];
+
+    switch (op->op) {
+    case Transaction::OP_NOP:
+      break;
+    case Transaction::OP_TOUCH:
+      {
+        const ghobject_t &oid = i.get_oid(op->oid);
+	r = _touch(txc, c, oid);
+      }
+      break;
+
+    case Transaction::OP_WRITE:
+      {
+        const ghobject_t &oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	uint32_t fadvise_flags = i.get_fadvise_flags();
+        bufferlist bl;
+        i.decode_bl(bl);
+	r = _write(txc, c, oid, off, len, bl, fadvise_flags);
+      }
+      break;
+
+    case Transaction::OP_ZERO:
+      {
+        const ghobject_t &oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	r = _zero(txc, c, oid, off, len);
+      }
+      break;
+
+    case Transaction::OP_TRIMCACHE:
+      {
+        // deprecated, no-op
+      }
+      break;
+
+    case Transaction::OP_TRUNCATE:
+      {
+        const ghobject_t& oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+	r = _truncate(txc, c, oid, off);
+      }
+      break;
+
+    case Transaction::OP_REMOVE:
+      {
+        const ghobject_t& oid = i.get_oid(op->oid);
+	r = _remove(txc, c, oid);
+      }
+      break;
+
+    case Transaction::OP_SETATTR:
+      {
+        const ghobject_t &oid = i.get_oid(op->oid);
+        string name = i.decode_string();
+        bufferlist bl;
+        i.decode_bl(bl);
+	map<string, bufferptr> to_set;
+	to_set[name] = bufferptr(bl.c_str(), bl.length());
+	r = _setattrs(txc, c, oid, to_set);
+      }
+      break;
+
+    case Transaction::OP_SETATTRS:
+      {
+        const ghobject_t& oid = i.get_oid(op->oid);
+        map<string, bufferptr> aset;
+        i.decode_attrset(aset);
+	r = _setattrs(txc, c, oid, aset);
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+        const ghobject_t &oid = i.get_oid(op->oid);
+	string name = i.decode_string();
+	r = _rmattr(txc, c, oid, name);
+      }
+      break;
+
+    case Transaction::OP_RMATTRS:
+      {
+        const ghobject_t &oid = i.get_oid(op->oid);
+	r = _rmattrs(txc, c, oid);
+      }
+      break;
+
+    case Transaction::OP_CLONE:
+      {
+        const ghobject_t& oid = i.get_oid(op->oid);
+        const ghobject_t& noid = i.get_oid(op->dest_oid);
+	r = _clone(txc, c, oid, noid);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE:
+      assert(0 == "deprecated");
+      break;
+
+    case Transaction::OP_CLONERANGE2:
+      {
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const ghobject_t &noid = i.get_oid(op->dest_oid);
+        uint64_t srcoff = op->off;
+        uint64_t len = op->len;
+        uint64_t dstoff = op->dest_off;
+	r = _clone_range(txc, c, oid, noid, srcoff, len, dstoff);
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+	assert(!c);
+        coll_t cid = i.get_cid(op->cid);
+	r = _create_collection(txc, cid, op->split_bits, &c);
+      }
+      break;
+
+    case Transaction::OP_COLL_HINT:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t type = op->hint_type;
+        bufferlist hint;
+        i.decode_bl(hint);
+        bufferlist::iterator hiter = hint.begin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          ::decode(pg_num, hiter);
+          ::decode(num_objs, hiter);
+          dout(10) << __func__ << " collection hint objects is a no-op, "
+		   << " pg_num " << pg_num << " num_objects " << num_objs
+		   << dendl;
+        } else {
+          // Ignore the hint
+          dout(10) << __func__ << " unknown collection hint " << type << dendl;
+        }
+      }
+      break;
+
+    case Transaction::OP_RMCOLL:
+      {
+        coll_t cid = i.get_cid(op->cid);
+	r = _remove_collection(txc, cid, &c);
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      assert(0 == "not implemented");
+      break;
+
+    case Transaction::OP_COLL_REMOVE:
+      assert(0 == "not implemented");
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+      assert(0 == "deprecated");
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+      {
+	assert(op->cid == op->dest_cid);
+        ghobject_t oldoid = i.get_oid(op->oid);
+        ghobject_t newoid = i.get_oid(op->dest_oid);
+	r = _rename(txc, c, oldoid, newoid);
+      }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+      r = -EOPNOTSUPP;
+      break;
+
+    case Transaction::OP_COLL_RMATTR:
+      r = -EOPNOTSUPP;
+      break;
+
+    case Transaction::OP_COLL_RENAME:
+      assert(0 == "not implemented");
+      break;
+
+    case Transaction::OP_OMAP_CLEAR:
+      {
+        ghobject_t oid = i.get_oid(op->oid);
+	r = _omap_clear(txc, c, oid);
+      }
+      break;
+    case Transaction::OP_OMAP_SETKEYS:
+      {
+        ghobject_t oid = i.get_oid(op->oid);
+        map<string, bufferlist> aset;
+        i.decode_attrset(aset);
+	r = _omap_setkeys(txc, c, oid, aset);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYS:
+      {
+        ghobject_t oid = i.get_oid(op->oid);
+        set<string> keys;
+        i.decode_keyset(keys);
+	r = _omap_rmkeys(txc, c, oid, keys);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYRANGE:
+      {
+        ghobject_t oid = i.get_oid(op->oid);
+        string first, last;
+        first = i.decode_string();
+        last = i.decode_string();
+	r = _omap_rmkey_range(txc, c, oid, first, last);
+      }
+      break;
+    case Transaction::OP_OMAP_SETHEADER:
+      {
+        ghobject_t oid = i.get_oid(op->oid);
+        bufferlist bl;
+        i.decode_bl(bl);
+	r = _omap_setheader(txc, c, oid, bl);
+      }
+      break;
+    case Transaction::OP_SPLIT_COLLECTION:
+      assert(0 == "deprecated");
+      break;
+    case Transaction::OP_SPLIT_COLLECTION2:
+      {
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+	r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+      }
+      break;
+
+    case Transaction::OP_SETALLOCHINT:
+      {
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t expected_object_size = op->expected_object_size;
+        uint64_t expected_write_size = op->expected_write_size;
+	r = _setallochint(txc, c, oid,
+			  expected_object_size,
+			  expected_write_size);
+      }
+      break;
+
+    default:
+      derr << "bad op " << op->op << dendl;
+      assert(0);
+    }
+
+    if (r < 0) {
+      bool ok = false;
+
+      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+			    op->op == Transaction::OP_CLONE ||
+			    op->op == Transaction::OP_CLONERANGE2 ||
+			    op->op == Transaction::OP_COLL_ADD))
+	// -ENOENT is usually okay
+	ok = true;
+      if (r == -ENODATA)
+	ok = true;
+
+      if (!ok) {
+	const char *msg = "unexpected error code";
+
+	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+			     op->op == Transaction::OP_CLONE ||
+			     op->op == Transaction::OP_CLONERANGE2))
+	  msg = "ENOENT on clone suggests osd bug";
+
+	if (r == -ENOSPC)
+	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+	  // by partially applying transactions.
+	  msg = "ENOSPC handling not implemented";
+
+	if (r == -ENOTEMPTY) {
+	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
+	}
+
+	dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op->op
+		<< " (op " << pos << ", counting from 0)" << dendl;
+	dout(0) << msg << dendl;
+	dout(0) << " transaction dump:\n";
+	JSONFormatter f(true);
+	f.open_object_section("transaction");
+	t->dump(&f);
+	f.close_section();
+	f.flush(*_dout);
+	*_dout << dendl;
+	assert(0 == "unexpected error");
+      }
+    }
+
+    ++pos;
+  }
+
+  return 0;
+}
+
+
+
+// -----------------
+// write operations
+
+// Ensure the object exists: create/load its onode under the collection
+// write lock, mark it existing, and stage the onode for write-out.
+int NewStore::_touch(TransContext *txc,
+		     CollectionRef& c,
+		     const ghobject_t& oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  RWLock::WLocker wlock(c->lock);
+  int r = 0;
+  OnodeRef onode = c->get_onode(oid, true);
+  assert(onode);
+  onode->exists = true;
+  _assign_nid(txc, onode);
+  txc->write_onode(onode);
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+// Remove all overlay extents for an onode, deleting their backing
+// key/value pairs from the kv store in the same transaction.
+int NewStore::_do_overlay_clear(TransContext *txc,
+				OnodeRef o)
+{
+  dout(10) << __func__ << " " << o->oid << dendl;
+
+  map<uint64_t,overlay_t>::iterator p = o->onode.overlay_map.begin();
+  while (p != o->onode.overlay_map.end()) {
+    dout(20) << __func__ << " rm " << p->first << " " << p->second << dendl;
+    string key;
+    // fix: the kv pair was written under the overlay's key id
+    // (onode.last_overlay_key at write time -- see _do_overlay_write and
+    // _do_read_all_overlays), not under its logical offset p->first.
+    get_overlay_key(o->onode.nid, p->second.key, &key);
+    txc->t->rmkey(PREFIX_OVERLAY, key);
+    o->onode.overlay_map.erase(p++);
+  }
+  o->onode.shared_overlays.clear();
+  return 0;
+}
+
+// Remove or clip any overlay extents intersecting [offset, offset+length).
+// Fully-covered extents are dropped (and their kv data removed unless the
+// key is shared); extents straddling an edge are clipped; an extent
+// spanning the entire range is split in two, which makes its backing key
+// shared.  Returns the number of map entries changed.
+int NewStore::_do_overlay_trim(TransContext *txc,
+			       OnodeRef o,
+			       uint64_t offset,
+			       uint64_t length)
+{
+  dout(10) << __func__ << " " << o->oid << " "
+	   << offset << "~" << length << dendl;
+  int changed = 0;
+
+  map<uint64_t,overlay_t>::iterator p =
+    o->onode.overlay_map.lower_bound(offset);
+  if (p != o->onode.overlay_map.begin()) {
+    --p;  // the preceding extent may still overlap the range
+  }
+  while (p != o->onode.overlay_map.end()) {
+    if (p->first >= offset + length) {
+      dout(20) << __func__ << " stop at " << p->first << " " << p->second
+	       << dendl;
+      break;
+    }
+    if (p->first + p->second.length <= offset) {
+      dout(20) << __func__ << " skip " << p->first << " " << p->second
+	       << dendl;
+      ++p;
+      continue;
+    }
+    if (p->first >= offset &&
+	p->first + p->second.length <= offset + length) {
+      // fully covered: drop extent, and its data if not shared
+      dout(20) << __func__ << " rm " << p->first << " " << p->second
+	       << dendl;
+      if (o->onode.shared_overlays.count(p->second.key) == 0) {
+	string key;
+	// fix: address the kv pair by the overlay's key id (as the
+	// shared_overlays check above does), not by its offset p->first
+	get_overlay_key(o->onode.nid, p->second.key, &key);
+	txc->t->rmkey(PREFIX_OVERLAY, key);
+      }
+      o->onode.overlay_map.erase(p++);
+      ++changed;
+      continue;
+    }
+    if (p->first >= offset) {
+      // overlaps the front of the extent: re-key it past the range and
+      // advance its value_offset accordingly
+      dout(20) << __func__ << " trim_front " << p->first << " " << p->second
+	       << dendl;
+      overlay_t& ov = o->onode.overlay_map[offset + length] = p->second;
+      uint64_t by = offset + length - p->first;
+      ov.value_offset += by;
+      ov.length -= by;
+      o->onode.overlay_map.erase(p++);
+      ++changed;
+      continue;
+    }
+    if (p->first < offset &&
+	p->first + p->second.length <= offset + length) {
+      // overlaps the tail of the extent: shorten it in place
+      dout(20) << __func__ << " trim_tail " << p->first << " " << p->second
+	       << dendl;
+      p->second.length = offset - p->first;
+      ++p;
+      ++changed;
+      continue;
+    }
+    // extent spans the whole range: split into head + tail pieces that
+    // both reference the same kv pair, so mark the key shared
+    dout(20) << __func__ << " split " << p->first << " " << p->second
+	     << dendl;
+    assert(p->first < offset);
+    assert(p->first + p->second.length > offset + length);
+    overlay_t& nov = o->onode.overlay_map[offset + length] = p->second;
+    p->second.length = offset - p->first;
+    uint64_t by = offset + length - p->first;
+    nov.value_offset += by;
+    nov.length -= by;
+    o->onode.shared_overlays.insert(p->second.key);
+    ++p;
+    ++changed;
+  }
+  return changed;
+}
+
+// Record a small write as an overlay: trim any extents it supersedes,
+// insert a new extent under a freshly allocated key id, and stash the
+// data in the kv store under that key.
+int NewStore::_do_overlay_write(TransContext *txc,
+				OnodeRef o,
+				uint64_t offset,
+				uint64_t length,
+				const bufferlist& bl)
+{
+  _do_overlay_trim(txc, o, offset, length);
+
+  dout(10) << __func__ << " " << o->oid << " "
+	   << offset << "~" << length << dendl;
+  uint64_t key_id = ++o->onode.last_overlay_key;
+  overlay_t& ov = o->onode.overlay_map[offset] = overlay_t(key_id, 0, length);
+  dout(20) << __func__ << " added " << offset << " " << ov << dendl;
+  string key;
+  get_overlay_key(o->onode.nid, key_id, &key);
+  txc->t->set(PREFIX_OVERLAY, key, bl);
+  return 0;
+}
+
+// Flush every overlay extent of an onode into its (single) data fragment
+// via WAL write ops, coalescing contiguous extents into one op.  Clears
+// the overlay map afterwards; the overlay kv pairs themselves are deleted
+// later, when the WAL is cleaned up (see _kv_sync_thread).
+int NewStore::_do_write_all_overlays(TransContext *txc,
+				     OnodeRef o)
+{
+  if (o->onode.overlay_map.empty())
+    return 0;
+
+  // overwrite to new fid
+  if (o->onode.data_map.empty()) {
+    // create
+    fragment_t &f = o->onode.data_map[0];
+    f.offset = 0;
+    f.length = o->onode.size;
+    int fd = _create_fid(txc, &f.fid, O_RDWR);
+    if (fd < 0) {
+      return fd;
+    }
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    dout(20) << __func__ << " create " << f.fid << dendl;
+  }
+
+  // this path assumes the single-fragment layout used everywhere else
+  assert(o->onode.data_map.size() == 1);
+  fragment_t& f = o->onode.data_map.begin()->second;
+  assert(f.offset == 0);
+  assert(f.length == o->onode.size);
+
+  for (map<uint64_t,overlay_t>::iterator p = o->onode.overlay_map.begin();
+       p != o->onode.overlay_map.end(); ) {
+    dout(10) << __func__ << " overlay " << p->first
+	     << "~" << p->second << dendl;
+
+    wal_op_t *op = _get_wal_op(txc);
+    op->op = wal_op_t::OP_WRITE;
+    op->offset = p->first;
+    op->length = p->second.length;
+    op->fid = f.fid;
+    // The overlays will be removed from the db after applying the WAL
+    op->nid = o->onode.nid;
+    op->overlays.push_back(p->second);
+
+    // Combine with later overlays if contiguous
+    map<uint64_t,overlay_t>::iterator prev = p, next = p;
+    ++next;
+    while (next != o->onode.overlay_map.end()) {
+      if (prev->first + prev->second.length == next->first) {
+        dout(10) << __func__ << " combining overlay " << next->first
+                 << "~" << next->second << dendl;
+        op->length += next->second.length;
+        op->overlays.push_back(next->second);
+
+        ++prev;
+        ++next;
+      } else {
+	break;
+      }
+    }
+    p = next;  // resume after the last coalesced extent
+  }
+
+  // put the shared overlay keys into the WAL transaction, so that we
+  // can cleanup them later after applying the WAL
+  for (set<uint64_t>::iterator p = o->onode.shared_overlays.begin();
+       p != o->onode.shared_overlays.end();
+       ++p) {
+    dout(10) << __func__ << " shared overlay " << *p << dendl;
+    string key;
+    get_overlay_key(o->onode.nid, *p, &key);
+    txc->wal_txn->shared_overlay_keys.push_back(key);
+  }
+
+  o->onode.overlay_map.clear();
+  o->onode.shared_overlays.clear();
+  txc->write_onode(o);
+  return 0;
+}
+
+// Load the overlay payload referenced by every op in a WAL transaction
+// from the kv store and append it to each op's data buffer.
+void NewStore::_do_read_all_overlays(wal_transaction_t& wt)
+{
+  list<wal_op_t>::iterator op;
+  for (op = wt.ops.begin(); op != wt.ops.end(); ++op) {
+    vector<overlay_t>::iterator ov;
+    for (ov = op->overlays.begin(); ov != op->overlays.end(); ++ov) {
+      string key;
+      get_overlay_key(op->nid, ov->key, &key);
+      bufferlist raw;
+      db->get(PREFIX_OVERLAY, key, &raw);
+      bufferlist chunk;
+      chunk.substr_of(raw, ov->value_offset, ov->length);
+      op->data.claim_append(chunk);
+    }
+  }
+}
+
+// Core write path.  Picks one of four strategies:
+//  1. small writes -> overlay (kv store), extending the fragment tail
+//     if the write grows the object;
+//  2. append past EOF / first write -> write directly into the (new or
+//     existing) fragment file, optionally via aio for O_DIRECT;
+//  3. full overwrite -> write a brand new fid and WAL-remove the old one;
+//  4. anything else (an overwrite of live data) -> stage the write in
+//     the WAL so it is applied only after the kv commit.
+// Returns 0 on success or a negative errno.
+int NewStore::_do_write(TransContext *txc,
+			OnodeRef o,
+			uint64_t offset, uint64_t length,
+			bufferlist& bl,
+			uint32_t fadvise_flags)
+{
+  int fd = -1;
+  int r = 0;
+  unsigned flags;
+
+  dout(20) << __func__ << " have " << o->onode.size
+	   << " bytes in " << o->onode.data_map.size()
+	   << " fragments" << dendl;
+
+  o->exists = true;
+
+  if (length == 0) {
+    dout(20) << __func__ << " zero-length write" << dendl;
+    goto out;
+  }
+
+  // case 1: small write -> overlay
+  if ((int)o->onode.overlay_map.size() < g_conf->newstore_overlay_max &&
+      (int)length <= g_conf->newstore_overlay_max_length) {
+    // write an overlay
+    r = _do_overlay_write(txc, o, offset, length, bl);
+    if (r < 0)
+      goto out;
+    if (offset + length > o->onode.size) {
+      // make sure the data fragment matches
+      if (!o->onode.data_map.empty()) {
+	assert(o->onode.data_map.size() == 1);
+	fragment_t& f = o->onode.data_map.begin()->second;
+	assert(f.offset == 0);
+	assert(f.length == o->onode.size);
+	r = _clean_fid_tail(txc, f);
+	if (r < 0)
+	  goto out;
+	f.length = offset + length;
+      }
+      dout(20) << __func__ << " extending size to " << offset + length << dendl;
+      o->onode.size = offset + length;
+    }
+    txc->write_onode(o);
+    r = 0;
+    goto out;
+  }
+
+  flags = O_RDWR;
+  if (g_conf->newstore_o_direct &&
+      (offset & ~CEPH_PAGE_MASK) == 0 &&
+      (length & ~CEPH_PAGE_MASK) == 0) {
+    dout(20) << __func__ << " page-aligned, can use O_DIRECT, "
+	     << bl.buffers().size() << " buffers" << dendl;
+    flags |= O_DIRECT | O_DSYNC;
+    if (!bl.is_page_aligned()) {
+      dout(20) << __func__ << " rebuilding buffer to be page-aligned" << dendl;
+      bl.rebuild();
+    }
+  }
+
+  // case 2: create or append -- no live data is overwritten, so we can
+  // write the file directly without WAL protection
+  if (o->onode.size <= offset ||
+      o->onode.size == 0 ||
+      o->onode.data_map.empty()) {
+    uint64_t x_offset;
+    if (o->onode.data_map.empty()) {
+      // create
+      fragment_t &f = o->onode.data_map[0];
+      f.offset = 0;
+      f.length = MAX(offset + length, o->onode.size);
+      fd = _create_fid(txc, &f.fid, flags);
+      if (fd < 0) {
+	r = fd;
+	goto out;
+      }
+      x_offset = offset;
+      dout(20) << __func__ << " create " << f.fid << " writing "
+	       << offset << "~" << length << dendl;
+    } else {
+      // append (possibly with gap)
+      assert(o->onode.data_map.size() == 1);
+      fragment_t &f = o->onode.data_map.rbegin()->second;
+      fd = _open_fid(f.fid, flags);
+      if (fd < 0) {
+	r = fd;
+	goto out;
+      }
+      r = _clean_fid_tail_fd(f, fd); // in case there is trailing crap
+      if (r < 0) {
+	goto out;
+      }
+      f.length = (offset + length) - f.offset;
+      x_offset = offset - f.offset;
+      dout(20) << __func__ << " append " << f.fid << " writing "
+	       << (offset - f.offset) << "~" << length << dendl;
+    }
+    if (offset + length > o->onode.size) {
+      o->onode.size = offset + length;
+    }
+#ifdef HAVE_LIBAIO
+    if (g_conf->newstore_aio && (flags & O_DIRECT)) {
+      txc->pending_aios.push_back(FS::aio_t(txc, fd));
+      FS::aio_t& aio = txc->pending_aios.back();
+      bl.prepare_iov(&aio.iov);
+      // keep the buffer alive until the aio completes
+      txc->aio_bl.append(bl);
+      aio.pwritev(x_offset);
+      dout(2) << __func__ << " prepared aio " << &aio << dendl;
+    } else
+#endif
+    {
+      ::lseek64(fd, x_offset, SEEK_SET);
+      r = bl.write_fd(fd);
+      if (r < 0) {
+	derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl;
+	goto out;
+      }
+      txc->sync_fd(fd);
+    }
+    r = 0;
+    goto out;
+  }
+
+  // case 3: full-object overwrite -- write a new fid and WAL-remove the
+  // old one, so a crash leaves either the old or the new data intact
+  if (offset == 0 &&
+      length >= o->onode.size) {
+    // overwrite to new fid
+    assert(o->onode.data_map.size() == 1);
+    fragment_t& f = o->onode.data_map.begin()->second;
+    assert(f.offset == 0);
+    assert(f.length == o->onode.size);
+
+    _do_overlay_clear(txc, o);
+
+    wal_op_t *op = _get_wal_op(txc);
+    op->op = wal_op_t::OP_REMOVE;
+    op->fid = f.fid;
+
+    f.length = length;
+    o->onode.size = length;
+    fd = _create_fid(txc, &f.fid, O_RDWR);
+    if (fd < 0) {
+      r = fd;
+      goto out;
+    }
+    dout(20) << __func__ << " replace old fid " << op->fid
+	     << " with new fid " << f.fid
+	     << ", writing " << offset << "~" << length << dendl;
+
+#ifdef HAVE_LIBAIO
+    if (g_conf->newstore_aio && (flags & O_DIRECT)) {
+      txc->pending_aios.push_back(FS::aio_t(txc, fd));
+      FS::aio_t& aio = txc->pending_aios.back();
+      bl.prepare_iov(&aio.iov);
+      txc->aio_bl.append(bl);
+      aio.pwritev(0);
+      dout(2) << __func__ << " prepared aio " << &aio << dendl;
+    } else
+#endif
+    {
+      r = bl.write_fd(fd);
+      if (r < 0) {
+	derr << __func__ << " bl.write_fd error: " << cpp_strerror(r) << dendl;
+	goto out;
+      }
+      txc->sync_fd(fd);
+    }
+    r = 0;
+    goto out;
+  }
+
+  // case 4: partial overwrite of existing data -> must go through WAL
+  if (true) {
+    // WAL
+    assert(o->onode.data_map.size() == 1);
+    fragment_t& f = o->onode.data_map.begin()->second;
+    assert(f.offset == 0);
+    assert(f.length == o->onode.size);
+    r = _do_write_all_overlays(txc, o);
+    if (r < 0)
+      goto out;
+    r = _clean_fid_tail(txc, f);
+    if (r < 0)
+      goto out;
+    wal_op_t *op = _get_wal_op(txc);
+    op->op = wal_op_t::OP_WRITE;
+    op->offset = offset - f.offset;
+    op->length = length;
+    op->fid = f.fid;
+    op->data = bl;
+    if (offset + length > o->onode.size) {
+      o->onode.size = offset + length;
+    }
+    if (offset + length - f.offset > f.length) {
+      f.length = offset + length - f.offset;
+    }
+    dout(20) << __func__ << " wal " << f.fid << " write "
+	     << (offset - f.offset) << "~" << length << dendl;
+  }
+  r = 0;
+
+ out:
+  return r;
+}
+
+int NewStore::_clean_fid_tail_fd(const fragment_t& f, int fd)
+{
+  struct stat st;
+  int r = ::fstat(fd, &st);
+  if (r < 0) {
+    r = -errno;
+    derr << __func__ << " failed to fstat " << f.fid << ": "
+	 << cpp_strerror(r) << dendl;
+    return r;
+  }
+  if (st.st_size > f.length) {
+    dout(20) << __func__ << " frag " << f.fid << " is long, truncating"
+	     << dendl;
+    r = ::ftruncate(fd, f.length);
+    if (r < 0) {
+      derr << __func__ << " failed to ftruncate " << f.fid << ": "
+	   << cpp_strerror(r) << dendl;
+      return r;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+int NewStore::_clean_fid_tail(TransContext *txc, const fragment_t& f)
+{
+  int fd = _open_fid(f.fid, O_RDWR);
+  if (fd < 0) {
+    return fd;
+  }
+  int r = _clean_fid_tail_fd(f, fd);
+  if (r < 0) {
+    return r;
+  }
+  if (r > 0) {
+    txc->sync_fd(fd);
+  } else {
+    // all good!
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+  return 0;
+}
+
+
+int NewStore::_write(TransContext *txc,
+		     CollectionRef& c,
+		     const ghobject_t& oid,
+		     uint64_t offset, size_t length,
+		     bufferlist& bl,
+		     uint32_t fadvise_flags)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid
+	   << " " << offset << "~" << length
+	   << dendl;
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, true);
+  _assign_nid(txc, o);
+  int r = _do_write(txc, o, offset, length, bl, fadvise_flags);
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << oid
+	   << " " << offset << "~" << length
+	   << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_zero(TransContext *txc,
+		    CollectionRef& c,
+		    const ghobject_t& oid,
+		    uint64_t offset, size_t length)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid
+	   << " " << offset << "~" << length
+	   << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, true);
+  _assign_nid(txc, o);
+
+  // overlay
+  if (_do_overlay_trim(txc, o, offset, length) > 0)
+    txc->write_onode(o);
+
+  if (o->onode.data_map.empty()) {
+    // we're already a big hole
+    if (offset + length > o->onode.size) {
+      o->onode.size = offset + length;
+      txc->write_onode(o);
+    }
+  } else {
+    assert(o->onode.data_map.size() == 1);
+    fragment_t& f = o->onode.data_map.begin()->second;
+    assert(f.offset == 0);
+    assert(f.length == o->onode.size);
+
+    r = _clean_fid_tail(txc, f);
+    if (r < 0)
+      goto out;
+
+    if (offset >= o->onode.size) {
+      // after tail
+      int fd = _open_fid(f.fid, O_RDWR);
+      if (fd < 0) {
+	r = fd;
+	goto out;
+      }
+      f.length = (offset + length) - f.offset;
+      r = ::ftruncate(fd, f.length);
+      assert(r == 0);   // this shouldn't fail
+      dout(20) << __func__ << " tail " << f.fid << " truncating up to "
+	       << f.length << dendl;
+      o->onode.size = offset + length;
+      txc->write_onode(o);
+    } else {
+      // WAL
+      wal_op_t *op = _get_wal_op(txc);
+      op->op = wal_op_t::OP_ZERO;
+      op->offset = offset - f.offset;
+      op->length = length;
+      op->fid = f.fid;
+      if (offset + length > o->onode.size) {
+	f.length = offset + length - f.offset;
+	o->onode.size = offset + length;
+	txc->write_onode(o);
+      }
+    }
+  }
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid
+	   << " " << offset << "~" << length
+	   << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset)
+{
+  // trim down fragments
+  map<uint64_t,fragment_t>::iterator fp = o->onode.data_map.end();
+  if (fp != o->onode.data_map.begin())
+    --fp;
+  while (fp != o->onode.data_map.end()) {
+    if (fp->first + fp->second.length <= offset) {
+      break;
+    }
+    if (fp->first >= offset) {
+      dout(20) << __func__ << " wal rm fragment " << fp->first << " "
+	       << fp->second << dendl;
+      wal_op_t *op = _get_wal_op(txc);
+      op->op = wal_op_t::OP_REMOVE;
+      op->fid = fp->second.fid;
+      if (fp != o->onode.data_map.begin()) {
+	o->onode.data_map.erase(fp--);
+	continue;
+      } else {
+	o->onode.data_map.erase(fp);
+	break;
+      }
+    } else {
+      assert(fp->first + fp->second.length > offset);
+      assert(fp->first < offset);
+      uint64_t newlen = offset - fp->first;
+      dout(20) << __func__ << " wal truncate fragment " << fp->first << " "
+	       << fp->second << " to " << newlen << dendl;
+      fragment_t& f = fp->second;
+      f.length = newlen;
+      wal_op_t *op = _get_wal_op(txc);
+      op->op = wal_op_t::OP_TRUNCATE;
+      op->offset = offset;
+      op->fid = f.fid;
+      break;
+    }
+  }
+
+  // truncate up trailing fragment?
+  if (!o->onode.data_map.empty() && offset > o->onode.size) {
+    // resize file up.  make sure we don't have trailing bytes
+    assert(o->onode.data_map.size() == 1);
+    fragment_t& f = o->onode.data_map.begin()->second;
+    assert(f.offset == 0);
+    assert(f.length == o->onode.size);
+    dout(20) << __func__ << " truncate up " << f << " to " << offset << dendl;
+    int r = _clean_fid_tail(txc, f);
+    if (r < 0)
+      return r;
+    f.length = offset;
+  }
+
+  // trim down overlays
+  map<uint64_t,overlay_t>::iterator op = o->onode.overlay_map.end();
+  if (op != o->onode.overlay_map.begin())
+    --op;
+  while (op != o->onode.overlay_map.end()) {
+    if (op->first + op->second.length <= offset) {
+      break;
+    }
+    if (op->first >= offset) {
+      if (!o->onode.shared_overlays.count(op->second.key)) {
+	dout(20) << __func__ << " rm overlay " << op->first << " "
+		 << op->second << dendl;
+	string key;
+	get_overlay_key(o->onode.nid, op->second.key, &key);
+	txc->t->rmkey(PREFIX_OVERLAY, key);
+      } else {
+	dout(20) << __func__ << " rm overlay " << op->first << " "
+		 << op->second << " (shared)" << dendl;
+      }
+      if (op != o->onode.overlay_map.begin()) {
+	o->onode.overlay_map.erase(op--);
+	continue;
+      } else {
+	o->onode.overlay_map.erase(op);
+	break;
+      }
+    } else {
+      assert(op->first + op->second.length > offset);
+      assert(op->first < offset);
+      uint64_t newlen = offset - op->first;
+      dout(20) << __func__ << " truncate overlay " << op->first << " "
+	       << op->second << " to " << newlen << dendl;
+      overlay_t& ov = op->second;
+      ov.length = newlen;
+      break;
+    }
+  }
+
+  o->onode.size = offset;
+  txc->write_onode(o);
+  return 0;
+}
+
+int NewStore::_truncate(TransContext *txc,
+			CollectionRef& c,
+			const ghobject_t& oid,
+			uint64_t offset)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid
+	   << " " << offset
+	   << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  r = _do_truncate(txc, o, offset);
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid
+	   << " " << offset
+	   << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_do_remove(TransContext *txc,
+			 OnodeRef o)
+{
+  string key;
+  o->exists = false;
+  if (!o->onode.data_map.empty()) {
+    for (map<uint64_t,fragment_t>::iterator p = o->onode.data_map.begin();
+	 p != o->onode.data_map.end();
+	 ++p) {
+      dout(20) << __func__ << " will wal remove " << p->second.fid << dendl;
+      wal_op_t *op = _get_wal_op(txc);
+      op->op = wal_op_t::OP_REMOVE;
+      op->fid = p->second.fid;
+    }
+  }
+  o->onode.data_map.clear();
+  o->onode.size = 0;
+  if (o->onode.omap_head) {
+    _do_omap_clear(txc, o->onode.omap_head);
+  }
+
+  get_object_key(o->oid, &key);
+  txc->t->rmkey(PREFIX_OBJ, key);
+  return 0;
+}
+
+int NewStore::_remove(TransContext *txc,
+		      CollectionRef& c,
+		      const ghobject_t& oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  int r;
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  r = _do_remove(txc, o);
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_setattr(TransContext *txc,
+		       CollectionRef& c,
+		       const ghobject_t& oid,
+		       const string& name,
+		       bufferptr& val)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid
+	   << " " << name << " (" << val.length() << " bytes)"
+	   << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  o->onode.attrs[name] = val;
+  txc->write_onode(o);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid
+	   << " " << name << " (" << val.length() << " bytes)"
+	   << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_setattrs(TransContext *txc,
+			CollectionRef& c,
+			const ghobject_t& oid,
+			const map<string,bufferptr>& aset)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid
+	   << " " << aset.size() << " keys"
+	   << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  for (map<string,bufferptr>::const_iterator p = aset.begin();
+       p != aset.end(); ++p)
+    o->onode.attrs[p->first] = p->second;
+  txc->write_onode(o);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid
+	   << " " << aset.size() << " keys"
+	   << " = " << r << dendl;
+  return r;
+}
+
+
+int NewStore::_rmattr(TransContext *txc,
+		      CollectionRef& c,
+		      const ghobject_t& oid,
+		      const string& name)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid
+	   << " " << name << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  o->onode.attrs.erase(name);
+  txc->write_onode(o);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid
+	   << " " << name << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_rmattrs(TransContext *txc,
+		       CollectionRef& c,
+		       const ghobject_t& oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  o->onode.attrs.clear();
+  txc->write_onode(o);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+void NewStore::_do_omap_clear(TransContext *txc, uint64_t id)
+{
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+  string prefix, tail;
+  get_omap_header(id, &prefix);
+  get_omap_tail(id, &tail);
+  it->lower_bound(prefix);
+  while (it->valid()) {
+    if (it->key() >= tail) {
+      dout(30) << __func__ << "  stop at " << tail << dendl;
+      break;
+    }
+    txc->t->rmkey(PREFIX_OMAP, it->key());
+    dout(30) << __func__ << "  rm " << it->key() << dendl;
+    it->next();
+  }
+}
+
+int NewStore::_omap_clear(TransContext *txc,
+			  CollectionRef& c,
+			  const ghobject_t& oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (o->onode.omap_head != 0) {
+    _do_omap_clear(txc, o->onode.omap_head);
+  }
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_omap_setkeys(TransContext *txc,
+			    CollectionRef& c,
+			    const ghobject_t& oid,
+			    const map<string,bufferlist>& m)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head) {
+    o->onode.omap_head = o->onode.nid;
+    txc->write_onode(o);
+  }
+  for (map<string,bufferlist>::const_iterator p = m.begin(); p != m.end(); ++p) {
+    string key;
+    get_omap_key(o->onode.omap_head, p->first, &key);
+    dout(30) << __func__ << "  " << key << " <- " << p->first << dendl;
+    txc->t->set(PREFIX_OMAP, key, p->second);
+  }
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_omap_setheader(TransContext *txc,
+			      CollectionRef& c,
+			      const ghobject_t& oid,
+			      bufferlist& bl)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  string key;
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head) {
+    o->onode.omap_head = o->onode.nid;
+    txc->write_onode(o);
+  }
+  get_omap_header(o->onode.omap_head, &key);
+  txc->t->set(PREFIX_OMAP, key, bl);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_omap_rmkeys(TransContext *txc,
+			   CollectionRef& c,
+			   const ghobject_t& oid,
+			   const set<string>& m)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head) {
+    r = 0;
+    goto out;
+  }
+  if (!o->onode.omap_head) {
+    o->onode.omap_head = o->onode.nid;
+    txc->write_onode(o);
+  }
+  for (set<string>::const_iterator p = m.begin(); p != m.end(); ++p) {
+    string key;
+    get_omap_key(o->onode.omap_head, *p, &key);
+    dout(30) << __func__ << "  rm " << key << " <- " << *p << dendl;
+    txc->t->rmkey(PREFIX_OMAP, key);
+  }
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_omap_rmkey_range(TransContext *txc,
+				CollectionRef& c,
+				const ghobject_t& oid,
+				const string& first, const string& last)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  int r = 0;
+  KeyValueDB::Iterator it;
+  string key_first, key_last;
+
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.omap_head) {
+    r = 0;
+    goto out;
+  }
+  it = db->get_iterator(PREFIX_OMAP);
+  get_omap_key(o->onode.omap_head, first, &key_first);
+  get_omap_key(o->onode.omap_head, last, &key_last);
+  it->lower_bound(key_first);
+  while (it->valid()) {
+    if (it->key() >= key_last) {
+      dout(30) << __func__ << "  stop at " << key_last << dendl;
+      break;
+    }
+    txc->t->rmkey(PREFIX_OMAP, it->key());
+    dout(30) << __func__ << "  rm " << it->key() << dendl;
+    it->next();
+  }
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_setallochint(TransContext *txc,
+			    CollectionRef& c,
+			    const ghobject_t& oid,
+			    uint64_t expected_object_size,
+			    uint64_t expected_write_size)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oid
+	   << " object_size " << expected_object_size
+	   << " write_size " << expected_write_size
+	   << dendl;
+  int r = 0;
+  RWLock::WLocker l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+
+  o->onode.expected_object_size = expected_object_size;
+  o->onode.expected_write_size = expected_write_size;
+  txc->write_onode(o);
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oid
+	   << " object_size " << expected_object_size
+	   << " write_size " << expected_write_size
+	   << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_clone(TransContext *txc,
+		     CollectionRef& c,
+		     const ghobject_t& old_oid,
+		     const ghobject_t& new_oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  bufferlist bl;
+  OnodeRef newo;
+  OnodeRef oldo = c->get_onode(old_oid, false);
+  if (!oldo || !oldo->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  newo = c->get_onode(new_oid, true);
+  assert(newo);
+  newo->exists = true;
+  _assign_nid(txc, newo);
+
+  r = _do_read(oldo, 0, oldo->onode.size, bl, 0);
+  if (r < 0)
+    goto out;
+
+  // truncate any old data
+  while (!newo->onode.data_map.empty()) {
+    wal_op_t *op = _get_wal_op(txc);
+    op->op = wal_op_t::OP_REMOVE;
+    op->fid = newo->onode.data_map.rbegin()->second.fid;
+    newo->onode.data_map.erase(newo->onode.data_map.rbegin()->first);
+  }
+
+  r = _do_write(txc, newo, 0, oldo->onode.size, bl, 0);
+
+  newo->onode.attrs = oldo->onode.attrs;
+
+  // clone omap
+  if (newo->onode.omap_head) {
+    dout(20) << __func__ << " clearing old omap data" << dendl;
+    _do_omap_clear(txc, newo->onode.omap_head);
+  }
+  if (oldo->onode.omap_head) {
+    dout(20) << __func__ << " copying omap data" << dendl;
+    if (!newo->onode.omap_head) {
+      newo->onode.omap_head = newo->onode.nid;
+    }
+    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+    string head, tail;
+    get_omap_header(oldo->onode.omap_head, &head);
+    get_omap_tail(oldo->onode.omap_head, &tail);
+    it->lower_bound(head);
+    while (it->valid()) {
+      string key;
+      if (it->key() >= tail) {
+	dout(30) << __func__ << "  reached tail" << dendl;
+	break;
+      } else {
+	dout(30) << __func__ << "  got header/data " << it->key() << dendl;
+	assert(it->key() < tail);
+	rewrite_omap_key(newo->onode.omap_head, it->key(), &key);
+	txc->t->set(PREFIX_OMAP, key, it->value());
+      }
+      it->next();
+    }
+  }
+
+  txc->write_onode(newo);
+
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_clone_range(TransContext *txc,
+			   CollectionRef& c,
+			   const ghobject_t& old_oid,
+			   const ghobject_t& new_oid,
+			   uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+  dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << " from " << srcoff << "~" << length
+	   << " to offset " << dstoff << dendl;
+  int r = 0;
+
+  RWLock::WLocker l(c->lock);
+  bufferlist bl;
+  OnodeRef newo;
+  OnodeRef oldo = c->get_onode(old_oid, false);
+  if (!oldo || !oldo->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  newo = c->get_onode(new_oid, true);
+  assert(newo);
+  newo->exists = true;
+
+  r = _do_read(oldo, srcoff, length, bl, 0);
+  if (r < 0)
+    goto out;
+
+  r = _do_write(txc, newo, dstoff, bl.length(), bl, 0);
+
+  txc->write_onode(newo);
+
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << " from " << srcoff << "~" << length
+	   << " to offset " << dstoff
+	   << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_rename(TransContext *txc,
+		      CollectionRef& c,
+		      const ghobject_t& old_oid,
+		      const ghobject_t& new_oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << dendl;
+  int r;
+
+  RWLock::WLocker l(c->lock);
+  bufferlist bl;
+  string old_key, new_key;
+  OnodeRef newo;
+  OnodeRef oldo = c->get_onode(old_oid, false);
+  if (!oldo || !oldo->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  newo = c->get_onode(new_oid, true);
+  assert(newo);
+
+  if (newo->exists) {
+    r = _do_remove(txc, newo);
+    if (r < 0)
+      return r;
+  }
+
+  get_object_key(old_oid, &old_key);
+  get_object_key(new_oid, &new_key);
+
+  c->onode_map.rename(old_oid, new_oid);
+  oldo->oid = new_oid;
+  oldo->key = new_key;
+
+  txc->t->rmkey(PREFIX_OBJ, old_key);
+  txc->write_onode(oldo);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << " = " << r << dendl;
+  return r;
+}
+
+// collections
+
+int NewStore::_create_collection(
+  TransContext *txc,
+  coll_t cid,
+  unsigned bits,
+  CollectionRef *c)
+{
+  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+  int r;
+  bufferlist bl;
+
+  {
+    RWLock::WLocker l(coll_lock);
+    if (*c) {
+      r = -EEXIST;
+      goto out;
+    }
+    c->reset(new Collection(this, cid));
+    (*c)->cnode.bits = bits;
+    coll_map[cid] = *c;
+  }
+  ::encode((*c)->cnode, bl);
+  txc->t->set(PREFIX_COLL, stringify(cid), bl);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_remove_collection(TransContext *txc, coll_t cid,
+				 CollectionRef *c)
+{
+  dout(15) << __func__ << " " << cid << dendl;
+  int r;
+  bufferlist empty;
+
+  {
+    RWLock::WLocker l(coll_lock);
+    if (!*c) {
+      r = -ENOENT;
+      goto out;
+    }
+    pair<ghobject_t,OnodeRef> next;
+    while ((*c)->onode_map.get_next(next.first, &next)) {
+      if (next.second->exists) {
+	r = -ENOTEMPTY;
+	goto out;
+      }
+    }
+    coll_map.erase(cid);
+    txc->removed_collections.push_back(*c);
+    c->reset();
+  }
+  txc->t->rmkey(PREFIX_COLL, stringify(cid));
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << cid << " = " << r << dendl;
+  return r;
+}
+
+int NewStore::_split_collection(TransContext *txc,
+				CollectionRef& c,
+				CollectionRef& d,
+				unsigned bits, int rem)
+{
+  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+	   << " bits " << bits << dendl;
+  int r;
+  RWLock::WLocker l(c->lock);
+  RWLock::WLocker l2(d->lock);
+  c->onode_map.clear();
+  d->onode_map.clear();
+  c->cnode.bits = bits;
+  assert(d->cnode.bits == bits);
+  r = 0;
+
+  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+	   << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+// ===========================================
diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h
new file mode 100644
index 0000000..97c5d6a
--- /dev/null
+++ b/src/os/newstore/NewStore.h
@@ -0,0 +1,850 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_NEWSTORE_H
+#define CEPH_OSD_NEWSTORE_H
+
+#include "acconfig.h"
+
+#include <unistd.h>
+
+#include "include/assert.h"
+#include "include/unordered_map.h"
+#include "include/memory.h"
+#include "common/Finisher.h"
+#include "common/RWLock.h"
+#include "common/WorkQueue.h"
+#include "os/ObjectStore.h"
+#include "os/fs/FS.h"
+#include "os/KeyValueDB.h"
+
+#include "newstore_types.h"
+
+#include "boost/intrusive/list.hpp"
+
+class NewStore : public ObjectStore {
+  // -----------------------------------------------------
+  // types
+public:
+
+  class TransContext;
+
+  /// an in-memory object
+  struct Onode {
+    atomic_t nref;  ///< reference count
+
+    ghobject_t oid;
+    string key;     ///< key under PREFIX_OBJ where we are stored
+    boost::intrusive::list_member_hook<> lru_item;
+
+    onode_t onode;  ///< metadata stored as value in kv store
+    bool dirty;     // ???
+    bool exists;
+
+    Mutex flush_lock;  ///< protect unappliex_txns, num_fsyncs
+    Cond flush_cond;   ///< wait here for unapplied txns, fsyncs
+    set<TransContext*> flush_txns;   ///< fsyncing or committing or wal txns
+
+    Onode(const ghobject_t& o, const string& k);
+
+    void flush() {
+      Mutex::Locker l(flush_lock);
+      while (!flush_txns.empty())
+	flush_cond.Wait(flush_lock);
+    }
+    void get() {
+      nref.inc();
+    }
+    void put() {
+      if (nref.dec() == 0)
+	delete this;
+    }
+  };
+  typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+  struct OnodeHashLRU {
+    typedef boost::intrusive::list<
+      Onode,
+      boost::intrusive::member_hook<
+        Onode,
+	boost::intrusive::list_member_hook<>,
+	&Onode::lru_item> > lru_list_t;
+
+    Mutex lock;
+    ceph::unordered_map<ghobject_t,OnodeRef> onode_map;  ///< forward lookups
+    lru_list_t lru;                                      ///< lru
+
+    OnodeHashLRU() : lock("NewStore::OnodeHashLRU::lock") {}
+
+    void add(const ghobject_t& oid, OnodeRef o);
+    void _touch(OnodeRef o);
+    OnodeRef lookup(const ghobject_t& o);
+    void remove(const ghobject_t& o);
+    void rename(const ghobject_t& old_oid, const ghobject_t& new_oid);
+    void clear();
+    bool get_next(const ghobject_t& after, pair<ghobject_t,OnodeRef> *next);
+    int trim(int max=-1);
+  };
+
+  struct Collection {
+    NewStore *store;
+    coll_t cid;
+    cnode_t cnode;
+    RWLock lock;
+
+    // cache onodes on a per-collection basis to avoid lock
+    // contention.
+    OnodeHashLRU onode_map;
+
+    OnodeRef get_onode(const ghobject_t& oid, bool create);
+
+    Collection(NewStore *ns, coll_t c);
+  };
+  typedef ceph::shared_ptr<Collection> CollectionRef;
+
+  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+    CollectionRef c;
+    OnodeRef o;
+    KeyValueDB::Iterator it;
+    string head, tail;
+  public:
+    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
+    int seek_to_first();
+    int upper_bound(const string &after);
+    int lower_bound(const string &to);
+    bool valid();
+    int next();
+    string key();
+    bufferlist value();
+    int status() {
+      return 0;
+    }
+  };
+
+  class OpSequencer;
+  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+  struct fsync_item {
+    boost::intrusive::list_member_hook<> queue_item;
+    int fd;
+    TransContext *txc;
+    fsync_item(int f, TransContext *t) : fd(f), txc(t) {}
+  };
+
+  struct TransContext {
+    typedef enum {
+      STATE_PREPARE,
+      STATE_FSYNC_WAIT,
+      STATE_AIO_WAIT,
+      STATE_IO_DONE,
+      STATE_KV_QUEUED,
+      STATE_KV_COMMITTING,
+      STATE_KV_DONE,
+      STATE_WAL_QUEUED,
+      STATE_WAL_APPLYING,
+      STATE_WAL_AIO_WAIT,
+      STATE_WAL_CLEANUP,   // remove wal kv record
+      STATE_WAL_DONE,
+      STATE_FINISHING,
+      STATE_DONE,
+    } state_t;
+
+    state_t state;
+
+    const char *get_state_name() {
+      switch (state) {
+      case STATE_PREPARE: return "prepare";
+      case STATE_FSYNC_WAIT: return "fsync_wait";
+      case STATE_AIO_WAIT: return "aio_wait";
+      case STATE_IO_DONE: return "io_done";
+      case STATE_KV_QUEUED: return "kv_queued";
+      case STATE_KV_COMMITTING: return "kv_committing";
+      case STATE_KV_DONE: return "kv_done";
+      case STATE_WAL_QUEUED: return "wal_queued";
+      case STATE_WAL_APPLYING: return "wal_applying";
+      case STATE_WAL_AIO_WAIT: return "wal_aio_wait";
+      case STATE_WAL_CLEANUP: return "wal_cleanup";
+      case STATE_WAL_DONE: return "wal_done";
+      case STATE_FINISHING: return "finishing";
+      case STATE_DONE: return "done";
+      }
+      return "???";
+    }
+
+    OpSequencerRef osr;
+    boost::intrusive::list_member_hook<> sequencer_item;
+
+    uint64_t ops, bytes;
+
+    list<fsync_item> sync_items; ///< these fds need to be synced
+    set<OnodeRef> onodes;     ///< these onodes need to be updated/written
+    KeyValueDB::Transaction t; ///< then we will commit this
+    Context *oncommit;         ///< signal on commit
+    Context *onreadable;         ///< signal on readable
+    Context *onreadable_sync;         ///< signal on readable
+    list<Context*> oncommits;  ///< more commit completions
+    list<CollectionRef> removed_collections; ///< colls we removed
+
+    boost::intrusive::list_member_hook<> wal_queue_item;
+    wal_transaction_t *wal_txn; ///< wal transaction (if any)
+    unsigned num_fsyncs_completed;
+
+    list<FS::aio_t> pending_aios;    ///< not yet submitted
+    list<FS::aio_t> submitted_aios;  ///< submitting or submitted
+    bufferlist aio_bl;  // just a pile of refs
+    atomic_t num_aio;
+
+    Mutex lock;
+    Cond cond;
+
+    CollectionRef first_collection;  ///< first referenced collection
+
+    TransContext(OpSequencer *o)
+      : state(STATE_PREPARE),
+	osr(o),
+	ops(0),
+	bytes(0),
+	oncommit(NULL),
+	onreadable(NULL),
+	onreadable_sync(NULL),
+	wal_txn(NULL),
+	num_fsyncs_completed(0),
+	num_aio(0),
+	lock("NewStore::TransContext::lock") {
+      //cout << "txc new " << this << std::endl;
+    }
+    ~TransContext() {
+      delete wal_txn;
+      //cout << "txc del " << this << std::endl;
+    }
+
+    void sync_fd(int f) {
+      sync_items.push_back(fsync_item(f, this));
+    }
+    void write_onode(OnodeRef &o) {
+      onodes.insert(o);
+    }
+
+    bool finish_fsync() {
+      Mutex::Locker l(lock);
+      ++num_fsyncs_completed;
+      if (num_fsyncs_completed == sync_items.size()) {
+	cond.Signal();
+	return true;
+      }
+      return false;
+    }
+    void wait_fsync() {
+      Mutex::Locker l(lock);
+      while (num_fsyncs_completed < sync_items.size())
+	cond.Wait(lock);
+    }
+  };
+
+  class OpSequencer : public Sequencer_impl {
+  public:
+    Mutex qlock;
+    Cond qcond;
+    typedef boost::intrusive::list<
+      TransContext,
+      boost::intrusive::member_hook<
+        TransContext,
+	boost::intrusive::list_member_hook<>,
+	&TransContext::sequencer_item> > q_list_t;
+    q_list_t q;  ///< transactions
+
+    typedef boost::intrusive::list<
+      TransContext,
+      boost::intrusive::member_hook<
+	TransContext,
+	boost::intrusive::list_member_hook<>,
+	&TransContext::wal_queue_item> > wal_queue_t;
+    wal_queue_t wal_q; ///< transactions
+
+    boost::intrusive::list_member_hook<> wal_osr_queue_item;
+
+    Sequencer *parent;
+
+    Mutex wal_apply_lock;
+
+    OpSequencer()
+	//set the qlock to to PTHREAD_MUTEX_RECURSIVE mode
+      : qlock("NewStore::OpSequencer::qlock", true, false),
+	parent(NULL),
+	wal_apply_lock("NewStore::OpSequencer::wal_apply_lock") {
+    }
+    ~OpSequencer() {
+      assert(q.empty());
+    }
+
+    void queue_new(TransContext *txc) {
+      Mutex::Locker l(qlock);
+      q.push_back(*txc);
+    }
+
+    void flush() {
+      Mutex::Locker l(qlock);
+      while (!q.empty())
+	qcond.Wait(qlock);
+    }
+
+    bool flush_commit(Context *c) {
+      Mutex::Locker l(qlock);
+      if (q.empty()) {
+	return true;
+      }
+      TransContext *txc = &q.back();
+      if (txc->state >= TransContext::STATE_KV_DONE) {
+	return true;
+      }
+      assert(txc->state < TransContext::STATE_KV_DONE);
+      txc->oncommits.push_back(c);
+      return false;
+    }
+  };
+
+  class FsyncWQ : public ThreadPool::WorkQueue<fsync_item> {
+  public:
+    typedef boost::intrusive::list<
+      fsync_item,
+      boost::intrusive::member_hook<
+        fsync_item,
+	boost::intrusive::list_member_hook<>,
+	&fsync_item::queue_item> > fsync_queue_t;
+  private:
+    NewStore *store;
+    fsync_queue_t fd_queue;
+
+  public:
+    FsyncWQ(NewStore *s, time_t ti, time_t sti, ThreadPool *tp)
+      : ThreadPool::WorkQueue<fsync_item>("NewStore::FsyncWQ", ti, sti, tp),
+	store(s) {
+    }
+    bool _empty() {
+      return fd_queue.empty();
+    }
+    bool _enqueue(fsync_item *i) {
+      fd_queue.push_back(*i);
+      return true;
+    }
+    void _dequeue(fsync_item *p) {
+      assert(0 == "not needed, not implemented");
+    }
+    fsync_item *_dequeue() {
+      if (fd_queue.empty())
+	return NULL;
+      fsync_item *i = &fd_queue.front();
+      fd_queue.pop_front();
+      return i;
+    }
+    void _process(fsync_item *i, ThreadPool::TPHandle &handle) {
+      store->_txc_process_fsync(i);
+    }
+    void _clear() {
+      fd_queue.clear();
+    }
+
+    void flush() {
+      lock();
+      while (!fd_queue.empty())
+	_wait();
+      unlock();
+      drain();
+    }
+  };
+
+  class WALWQ : public ThreadPool::WorkQueue<TransContext> {
+    // We need to order WAL items within each Sequencer.  To do that,
+    // queue each txc under osr, and queue the osr's here.  When we
+    // dequeue a txc, requeue the osr if there are more pending, and
+    // do it at the end of the list so that the next thread does not
+    // get a conflicted txc.  Hold an osr mutex while doing the wal to
+    // preserve the ordering.
+  public:
+    typedef boost::intrusive::list<
+      OpSequencer,
+      boost::intrusive::member_hook<
+	OpSequencer,
+	boost::intrusive::list_member_hook<>,
+	&OpSequencer::wal_osr_queue_item> > wal_osr_queue_t;
+
+  private:
+    NewStore *store;
+    wal_osr_queue_t wal_queue;
+
+  public:
+    WALWQ(NewStore *s, time_t ti, time_t sti, ThreadPool *tp)
+      : ThreadPool::WorkQueue<TransContext>("NewStore::WALWQ", ti, sti, tp),
+	store(s) {
+    }
+    bool _empty() {
+      return wal_queue.empty();
+    }
+    bool _enqueue(TransContext *i) {
+      if (i->osr->wal_q.empty()) {
+	wal_queue.push_back(*i->osr);
+      }
+      i->osr->wal_q.push_back(*i);
+      return true;
+    }
+    void _dequeue(TransContext *p) {
+      assert(0 == "not needed, not implemented");
+    }
+    TransContext *_dequeue() {
+      if (wal_queue.empty())
+	return NULL;
+      OpSequencer *osr = &wal_queue.front();
+      TransContext *i = &osr->wal_q.front();
+      osr->wal_q.pop_front();
+      wal_queue.pop_front();
+      if (!osr->wal_q.empty()) {
+	// requeue at the end to minimize contention
+	wal_queue.push_back(*i->osr);
+      }
+
+      // preserve wal ordering for this sequencer by taking the lock
+      // while still holding the queue lock
+      i->osr->wal_apply_lock.Lock();
+      return i;
+    }
+    void _process(TransContext *i, ThreadPool::TPHandle &handle) {
+      store->_wal_apply(i);
+      i->osr->wal_apply_lock.Unlock();
+    }
+    void _clear() {
+      assert(wal_queue.empty());
+    }
+
+    void flush() {
+      lock();
+      while (!wal_queue.empty()) {
+	_wait();
+      }
+      unlock();
+      drain();
+    }
+  };
+
+  struct KVSyncThread : public Thread {
+    NewStore *store;
+    KVSyncThread(NewStore *s) : store(s) {}
+    void *entry() {
+      store->_kv_sync_thread();
+      return NULL;
+    }
+  };
+
+  struct AioCompletionThread : public Thread {
+    NewStore *store;
+    AioCompletionThread(NewStore *s) : store(s) {}
+    void *entry() {
+      store->_aio_thread();
+      return NULL;
+    }
+  };
+
+  // --------------------------------------------------------
+  // members
+private:
+  CephContext *cct;
+  KeyValueDB *db;
+  FS *fs;
+  uuid_d fsid;
+  string db_path;
+  int path_fd;  ///< open handle to $path
+  int fsid_fd;  ///< open handle (locked) to $path/fsid
+  int frag_fd;  ///< open handle to $path/fragments
+  int fset_fd;  ///< open handle to $path/fragments/$cur_fid.fset
+  bool mounted;
+
+  RWLock coll_lock;    ///< rwlock to protect coll_map
+  ceph::unordered_map<coll_t, CollectionRef> coll_map;
+
+  Mutex fid_lock;
+  fid_t fid_last;  ///< last allocated fid
+  fid_t fid_max;   ///< max fid we can allocate before reserving more
+
+  Mutex nid_lock;
+  uint64_t nid_last;
+  uint64_t nid_max;
+
+  Throttle throttle_ops, throttle_bytes;          ///< submit to commit
+  Throttle throttle_wal_ops, throttle_wal_bytes;  ///< submit to wal complete
+
+  Mutex wal_lock;
+  atomic64_t wal_seq;
+  ThreadPool wal_tp;
+  WALWQ wal_wq;
+
+  Finisher finisher;
+  ThreadPool fsync_tp;
+  FsyncWQ fsync_wq;
+
+  AioCompletionThread aio_thread;
+  bool aio_stop;
+  FS::aio_queue_t aio_queue;
+
+  KVSyncThread kv_sync_thread;
+  Mutex kv_lock;
+  Cond kv_cond, kv_sync_cond;
+  bool kv_stop;
+  deque<TransContext*> kv_queue, kv_committing;
+  deque<TransContext*> wal_cleanup_queue, wal_cleaning;
+
+  Logger *logger;
+
+  Mutex reap_lock;
+  Cond reap_cond;
+  list<CollectionRef> removed_collections;
+
+
+  // --------------------------------------------------------
+  // private methods
+
+  void _init_logger();
+  void _shutdown_logger();
+
+  int _open_path();
+  void _close_path();
+  int _open_fsid(bool create);
+  int _lock_fsid();
+  int _read_fsid(uuid_d *f);
+  int _write_fsid();
+  void _close_fsid();
+  int _open_frag();
+  int _create_frag();
+  void _close_frag();
+  int _open_db();
+  void _close_db();
+  int _open_collections();
+  void _close_collections();
+
+  CollectionRef _get_collection(coll_t cid);
+  void _queue_reap_collection(CollectionRef& c);
+  void _reap_collections();
+
+  int _recover_next_fid();
+  int _create_fid(TransContext *txc, fid_t *fid, unsigned flags);
+  int _open_fid(fid_t fid, unsigned flags);
+  int _remove_fid(fid_t fid);
+
+  int _recover_next_nid();
+  void _assign_nid(TransContext *txc, OnodeRef o);
+
+  int _clean_fid_tail_fd(const fragment_t& f, int fd);
+  int _clean_fid_tail(TransContext *txc, const fragment_t& f);
+
+  TransContext *_txc_create(OpSequencer *osr);
+  int _txc_add_transaction(TransContext *txc, Transaction *t);
+  int _txc_finalize(OpSequencer *osr, TransContext *txc);
+  void _txc_state_proc(TransContext *txc);
+  void _txc_aio_submit(TransContext *txc);
+  void _txc_do_sync_fsync(TransContext *txc);
+  void _txc_queue_fsync(TransContext *txc);
+  void _txc_process_fsync(fsync_item *i);
+  void _txc_finish_io(TransContext *txc);
+  void _txc_finish_kv(TransContext *txc);
+  void _txc_finish(TransContext *txc);
+
+  void _osr_reap_done(OpSequencer *osr);
+
+  void _aio_thread();
+  int _aio_start();
+  void _aio_stop();
+
+  void _kv_sync_thread();
+  void _kv_stop() {
+    {
+      Mutex::Locker l(kv_lock);
+      kv_stop = true;
+      kv_cond.Signal();
+    }
+    kv_sync_thread.join();
+    kv_stop = false;
+  }
+
+  wal_op_t *_get_wal_op(TransContext *txc);
+  int _wal_apply(TransContext *txc);
+  int _wal_finish(TransContext *txc);
+  int _do_wal_transaction(wal_transaction_t& wt, TransContext *txc);
+  int _wal_replay();
+
+public:
+  NewStore(CephContext *cct, const string& path);
+  ~NewStore();
+
+  bool needs_journal() { return false; };
+  bool wants_journal() { return false; };
+  bool allows_journal() { return false; };
+
+  int peek_journal_fsid(uuid_d *fsid);
+
+  bool test_mount_in_use();
+
+  int mount();
+  int umount();
+  void _sync();
+
+  unsigned get_max_object_name_length() {
+    return 4096;
+  }
+  unsigned get_max_attr_name_length() {
+    return 256;  // arbitrary; there is no real limit internally
+  }
+
+  int mkfs();
+  int mkjournal() {
+    return 0;
+  }
+
+private:
+  bool sharded;
+public:
+  void set_allow_sharded_objects() {
+    sharded = true;
+  }
+  bool get_allow_sharded_objects() {
+    return sharded;
+  }
+
+  int statfs(struct statfs *buf);
+
+  bool exists(coll_t cid, const ghobject_t& oid);
+  int stat(
+    coll_t cid,
+    const ghobject_t& oid,
+    struct stat *st,
+    bool allow_eio = false); // struct stat?
+  int read(
+    coll_t cid,
+    const ghobject_t& oid,
+    uint64_t offset,
+    size_t len,
+    bufferlist& bl,
+    uint32_t op_flags = 0,
+    bool allow_eio = false);
+  int _do_read(
+    OnodeRef o,
+    uint64_t offset,
+    size_t len,
+    bufferlist& bl,
+    uint32_t op_flags = 0);
+
+  int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
+  int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr& value);
+  int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset);
+
+  int list_collections(vector<coll_t>& ls);
+  bool collection_exists(coll_t c);
+  bool collection_empty(coll_t c);
+
+  int collection_list(coll_t cid, ghobject_t start, ghobject_t end,
+		      bool sort_bitwise, int max,
+		      vector<ghobject_t> *ls, ghobject_t *next);
+
+  int omap_get(
+    coll_t cid,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    bufferlist *header,      ///< [out] omap header
+    map<string, bufferlist> *out ///< [out] Key to value map
+    );
+
+  /// Get omap header
+  int omap_get_header(
+    coll_t cid,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    bufferlist *header,      ///< [out] omap header
+    bool allow_eio = false ///< [in] don't assert on eio
+    );
+
+  /// Get keys defined on oid
+  int omap_get_keys(
+    coll_t cid,              ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    set<string> *keys      ///< [out] Keys defined on oid
+    );
+
+  /// Get key values
+  int omap_get_values(
+    coll_t cid,                    ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const set<string> &keys,     ///< [in] Keys to get
+    map<string, bufferlist> *out ///< [out] Returned keys and values
+    );
+
+  /// Filters keys into out which are defined on oid
+  int omap_check_keys(
+    coll_t cid,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    const set<string> &keys, ///< [in] Keys to check
+    set<string> *out         ///< [out] Subset of keys defined on oid
+    );
+
+  ObjectMap::ObjectMapIterator get_omap_iterator(
+    coll_t cid,              ///< [in] collection
+    const ghobject_t &oid  ///< [in] object
+    );
+
+  void set_fsid(uuid_d u) {
+    fsid = u;
+  }
+  uuid_d get_fsid() {
+    return fsid;
+  }
+
+  objectstore_perf_stat_t get_cur_stats() {
+    return objectstore_perf_stat_t();
+  }
+
+  int queue_transactions(
+    Sequencer *osr,
+    list<Transaction*>& tls,
+    TrackedOpRef op = TrackedOpRef(),
+    ThreadPool::TPHandle *handle = NULL);
+
+private:
+  // --------------------------------------------------------
+  // write ops
+
+  int _do_transaction(Transaction *t,
+		      TransContext *txc,
+		      ThreadPool::TPHandle *handle);
+
+  int _write(TransContext *txc,
+	     CollectionRef& c,
+	     const ghobject_t& oid,
+	     uint64_t offset, size_t len,
+	     bufferlist& bl,
+	     uint32_t fadvise_flags);
+  int _do_overlay_clear(TransContext *txc,
+			OnodeRef o);
+  int _do_overlay_trim(TransContext *txc,
+		       OnodeRef o,
+		       uint64_t offset,
+		       uint64_t length);
+  int _do_overlay_write(TransContext *txc,
+			OnodeRef o,
+			uint64_t offset,
+			uint64_t length,
+			const bufferlist& bl);
+  int _do_write_all_overlays(TransContext *txc,
+			     OnodeRef o);
+  void _do_read_all_overlays(wal_transaction_t& wt);
+  int _do_write(TransContext *txc,
+		OnodeRef o,
+		uint64_t offset, uint64_t length,
+		bufferlist& bl,
+		uint32_t fadvise_flags);
+  int _touch(TransContext *txc,
+	     CollectionRef& c,
+	     const ghobject_t& oid);
+  int _zero(TransContext *txc,
+	    CollectionRef& c,
+	    const ghobject_t& oid,
+	    uint64_t offset, size_t len);
+  int _do_truncate(TransContext *txc,
+		   OnodeRef o,
+		   uint64_t offset);
+  int _truncate(TransContext *txc,
+		CollectionRef& c,
+		const ghobject_t& oid,
+		uint64_t offset);
+  int _remove(TransContext *txc,
+	      CollectionRef& c,
+	      const ghobject_t& oid);
+  int _do_remove(TransContext *txc,
+		 OnodeRef o);
+  int _setattr(TransContext *txc,
+	       CollectionRef& c,
+	       const ghobject_t& oid,
+	       const string& name,
+	       bufferptr& val);
+  int _setattrs(TransContext *txc,
+		CollectionRef& c,
+		const ghobject_t& oid,
+		const map<string,bufferptr>& aset);
+  int _rmattr(TransContext *txc,
+	      CollectionRef& c,
+	      const ghobject_t& oid,
+	      const string& name);
+  int _rmattrs(TransContext *txc,
+	       CollectionRef& c,
+	       const ghobject_t& oid);
+  void _do_omap_clear(TransContext *txc, uint64_t id);
+  int _omap_clear(TransContext *txc,
+		  CollectionRef& c,
+		  const ghobject_t& oid);
+  int _omap_setkeys(TransContext *txc,
+		    CollectionRef& c,
+		    const ghobject_t& oid,
+		    const map<string,bufferlist>& m);
+  int _omap_setheader(TransContext *txc,
+		      CollectionRef& c,
+		      const ghobject_t& oid,
+		      bufferlist& header);
+  int _omap_rmkeys(TransContext *txc,
+		   CollectionRef& c,
+		   const ghobject_t& oid,
+		   const set<string>& m);
+  int _omap_rmkey_range(TransContext *txc,
+			CollectionRef& c,
+			const ghobject_t& oid,
+			const string& first, const string& last);
+  int _setallochint(TransContext *txc,
+		    CollectionRef& c,
+		    const ghobject_t& oid,
+		    uint64_t expected_object_size,
+		    uint64_t expected_write_size);
+  int _clone(TransContext *txc,
+	     CollectionRef& c,
+	     const ghobject_t& old_oid,
+	     const ghobject_t& new_oid);
+  int _clone_range(TransContext *txc,
+		   CollectionRef& c,
+		   const ghobject_t& old_oid,
+		   const ghobject_t& new_oid,
+		   uint64_t srcoff, uint64_t length, uint64_t dstoff);
+  int _rename(TransContext *txc,
+	      CollectionRef& c,
+	      const ghobject_t& old_oid,
+	      const ghobject_t& new_oid);
+  int _create_collection(TransContext *txc, coll_t cid, unsigned bits,
+			 CollectionRef *c);
+  int _remove_collection(TransContext *txc, coll_t cid, CollectionRef *c);
+  int _split_collection(TransContext *txc,
+			CollectionRef& c,
+			CollectionRef& d,
+			unsigned bits, int rem);
+
+};
+
+inline ostream& operator<<(ostream& out, const NewStore::OpSequencer& s) {
+  return out << *s.parent;
+}
+
+static inline void intrusive_ptr_add_ref(NewStore::Onode *o) {
+  o->get();
+}
+static inline void intrusive_ptr_release(NewStore::Onode *o) {
+  o->put();
+}
+
+static inline void intrusive_ptr_add_ref(NewStore::OpSequencer *o) {
+  o->get();
+}
+static inline void intrusive_ptr_release(NewStore::OpSequencer *o) {
+  o->put();
+}
+
+#endif
diff --git a/src/os/newstore/newstore_types.cc b/src/os/newstore/newstore_types.cc
new file mode 100644
index 0000000..5489faf
--- /dev/null
+++ b/src/os/newstore/newstore_types.cc
@@ -0,0 +1,304 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "newstore_types.h"
+#include "common/Formatter.h"
+
+// cnode_t
+
+void cnode_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ::encode(bits, bl);
+  ENCODE_FINISH(bl);
+}
+
+void cnode_t::decode(bufferlist::iterator& p)
+{
+  DECODE_START(1, p);
+  ::decode(bits, p);
+  DECODE_FINISH(p);
+}
+
+void cnode_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("bits", bits);
+}
+
+void cnode_t::generate_test_instances(list<cnode_t*>& o)
+{
+  o.push_back(new cnode_t());
+  o.push_back(new cnode_t(0));
+  o.push_back(new cnode_t(123));
+}
+
+// fid_t
+
+void fid_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("fset", fset);
+  f->dump_unsigned("fno", fno);
+}
+
+void fid_t::generate_test_instances(list<fid_t*>& o)
+{
+  o.push_back(new fid_t());
+  o.push_back(new fid_t(0, 1));
+  o.push_back(new fid_t(123, 3278));
+}
+
+// fragment_t
+
+void fragment_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ::encode(offset, bl);
+  ::encode(length, bl);
+  ::encode(fid, bl);
+  ENCODE_FINISH(bl);
+}
+
+void fragment_t::decode(bufferlist::iterator& p)
+{
+  DECODE_START(1, p);
+  ::decode(offset, p);
+  ::decode(length, p);
+  ::decode(fid, p);
+  DECODE_FINISH(p);
+}
+
+void fragment_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("length", length);
+  f->dump_object("fid", fid);
+}
+
+void fragment_t::generate_test_instances(list<fragment_t*>& o)
+{
+  o.push_back(new fragment_t());
+  o.push_back(new fragment_t(123, 456));
+  o.push_back(new fragment_t(789, 1024, fid_t(3, 400)));
+}
+
+ostream& operator<<(ostream& out, const fragment_t& f)
+{
+  out << "fragment(" << f.offset << "~" << f.length << " " << f.fid << ")";
+  return out;
+}
+
+// overlay_t
+
+void overlay_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ::encode(key, bl);
+  ::encode(value_offset, bl);
+  ::encode(length, bl);
+  ENCODE_FINISH(bl);
+}
+
+void overlay_t::decode(bufferlist::iterator& p)
+{
+  DECODE_START(1, p);
+  ::decode(key, p);
+  ::decode(value_offset, p);
+  ::decode(length, p);
+  DECODE_FINISH(p);
+}
+
+void overlay_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("key", key);
+  f->dump_unsigned("value_offset", value_offset);
+  f->dump_unsigned("length", length);
+}
+
+void overlay_t::generate_test_instances(list<overlay_t*>& o)
+{
+  o.push_back(new overlay_t());
+  o.push_back(new overlay_t(789, 1024, 1232232));
+}
+
+ostream& operator<<(ostream& out, const overlay_t& o)
+{
+  out << "overlay(" << o.value_offset << "~" << o.length
+      << " key " << o.key << ")";
+  return out;
+}
+
+// onode_t
+
+void onode_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ::encode(nid, bl);
+  ::encode(size, bl);
+  ::encode(attrs, bl);
+  ::encode(data_map, bl);
+  ::encode(overlay_map, bl);
+  ::encode(shared_overlays, bl);
+  ::encode(last_overlay_key, bl);
+  ::encode(omap_head, bl);
+  ::encode(expected_object_size, bl);
+  ::encode(expected_write_size, bl);
+  ENCODE_FINISH(bl);
+}
+
+void onode_t::decode(bufferlist::iterator& p)
+{
+  DECODE_START(1, p);
+  ::decode(nid, p);
+  ::decode(size, p);
+  ::decode(attrs, p);
+  ::decode(data_map, p);
+  ::decode(overlay_map, p);
+  ::decode(shared_overlays, p);
+  ::decode(last_overlay_key, p);
+  ::decode(omap_head, p);
+  ::decode(expected_object_size, p);
+  ::decode(expected_write_size, p);
+  DECODE_FINISH(p);
+}
+
+void onode_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("nid", nid);
+  f->dump_unsigned("size", size);
+  f->open_object_section("attrs");
+  for (map<string,bufferptr>::const_iterator p = attrs.begin();
+       p != attrs.end(); ++p) {
+    f->open_object_section("attr");
+    f->dump_string("name", p->first);
+    f->dump_unsigned("len", p->second.length());
+    f->close_section();
+  }
+  f->open_object_section("data_map");
+  for (map<uint64_t, fragment_t>::const_iterator p = data_map.begin();
+       p != data_map.end(); ++p) {
+    f->open_object_section("fragment");
+    f->dump_unsigned("fragment_offset", p->first);
+    p->second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+  f->open_object_section("overlays");
+  for (map<uint64_t, overlay_t>::const_iterator p = overlay_map.begin();
+       p != overlay_map.end(); ++p) {
+    f->open_object_section("overlay");
+    f->dump_unsigned("offset", p->first);
+    p->second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+  f->open_array_section("shared_overlays");
+  for (set<uint64_t>::const_iterator p = shared_overlays.begin();
+       p != shared_overlays.end(); ++p) {
+    f->dump_unsigned("offset", *p);
+  }
+  f->close_section();
+  f->dump_unsigned("last_overlay_key", last_overlay_key);
+  f->dump_unsigned("omap_head", omap_head);
+  f->dump_unsigned("expected_object_size", expected_object_size);
+  f->dump_unsigned("expected_write_size", expected_write_size);
+}
+
+void onode_t::generate_test_instances(list<onode_t*>& o)
+{
+  o.push_back(new onode_t());
+  // FIXME
+}
+
+// wal_op_t
+
+void wal_op_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ::encode(op, bl);
+  ::encode(fid, bl);
+  ::encode(offset, bl);
+  ::encode(length, bl);
+  ::encode(nid, bl);
+  ::encode(overlays, bl);
+  if (!overlays.size()) {
+    ::encode(data, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+void wal_op_t::decode(bufferlist::iterator& p)
+{
+  DECODE_START(1, p);
+  ::decode(op, p);
+  ::decode(fid, p);
+  ::decode(offset, p);
+  ::decode(length, p);
+  ::decode(nid, p);
+  ::decode(overlays, p);
+  if (!overlays.size()) {
+    ::decode(data, p);
+  }
+  DECODE_FINISH(p);
+}
+
+void wal_op_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("op", (int)op);
+  f->dump_object("fid", fid);
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("length", length);
+  if (overlays.size()) {
+    f->dump_unsigned("nid", nid);
+    f->open_array_section("overlays");
+    for (vector<overlay_t>::const_iterator p = overlays.begin();
+         p != overlays.end(); ++p) {
+      f->dump_object("overlay", *p);
+    }
+    f->close_section();
+  }
+}
+
+void wal_transaction_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ::encode(seq, bl);
+  ::encode(ops, bl);
+  ::encode(shared_overlay_keys, bl);
+  ENCODE_FINISH(bl);
+}
+
+void wal_transaction_t::decode(bufferlist::iterator& p)
+{
+  DECODE_START(1, p);
+  ::decode(seq, p);
+  ::decode(ops, p);
+  ::decode(shared_overlay_keys, p);
+  DECODE_FINISH(p);
+}
+
+void wal_transaction_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("seq", seq);
+  f->open_array_section("ops");
+  for (list<wal_op_t>::const_iterator p = ops.begin(); p != ops.end(); ++p) {
+    f->dump_object("op", *p);
+  }
+  f->close_section();
+  f->open_array_section("shared_overlay_keys");
+  for (vector<string>::const_iterator p = shared_overlay_keys.begin();
+       p != shared_overlay_keys.end(); ++p) {
+    f->dump_string("shared_overlay_key", *p);
+  }
+  f->close_section();
+}
diff --git a/src/os/newstore/newstore_types.h b/src/os/newstore/newstore_types.h
new file mode 100644
index 0000000..ca616ad
--- /dev/null
+++ b/src/os/newstore/newstore_types.h
@@ -0,0 +1,192 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_NEWSTORE_TYPES_H
+#define CEPH_OSD_NEWSTORE_TYPES_H
+
+#include <ostream>
+#include "include/types.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+/// collection metadata
+struct cnode_t {
+  uint32_t bits;   ///< how many bits of coll pgid are significant
+
+  cnode_t(int b=0) : bits(b) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<cnode_t*>& o);
+};
+WRITE_CLASS_ENCODER(cnode_t)
+
+/// unique id for a local file
+struct fid_t {
+  uint32_t fset, fno;
+  string handle;
+  fid_t() : fset(0), fno(0) { }
+  fid_t(uint32_t s, uint32_t n) : fset(s), fno(n) { }
+
+  void encode(bufferlist& bl) const {
+    ::encode(fset, bl);
+    ::encode(fno, bl);
+    ::encode(handle, bl);
+  }
+  void decode(bufferlist::iterator& p) {
+    ::decode(fset, p);
+    ::decode(fno, p);
+    ::decode(handle, p);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<fid_t*>& o);
+};
+WRITE_CLASS_ENCODER(fid_t)
+
+static inline ostream& operator<<(ostream& out, const fid_t& fid) {
+  out << fid.fset << "/" << fid.fno;
+  if (fid.handle.length())
+    out << "~";
+  return out;
+}
+
+static inline bool operator==(const fid_t& a, const fid_t& b) {
+  return a.fset == b.fset && a.fno == b.fno && a.handle == b.handle;
+}
+static inline bool operator!=(const fid_t& a, const fid_t& b) {
+  return !(a == b);
+}
+
+/// fragment: a byte extent backed by a file
+struct fragment_t {
+  uint32_t offset;   ///< offset in file to first byte of this fragment
+  uint32_t length;   ///< length of fragment/extent
+  fid_t fid;         ///< file backing this fragment
+
+  fragment_t() : offset(0), length(0) {}
+  fragment_t(uint32_t o, uint32_t l) : offset(o), length(l) {}
+  fragment_t(uint32_t o, uint32_t l, fid_t f) : offset(o), length(l), fid(f) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<fragment_t*>& o);
+};
+WRITE_CLASS_ENCODER(fragment_t)
+
+ostream& operator<<(ostream& out, const fragment_t& o);
+
+struct overlay_t {
+  uint64_t key;          ///< key (offset of start of original k/v pair)
+  uint32_t value_offset; ///< offset in associated value for this extent
+  uint32_t length;
+
+  overlay_t() : key(0), value_offset(0), length(0) {}
+  overlay_t(uint64_t k, uint32_t vo, uint32_t l)
+    : key(k), value_offset(vo), length(l) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<overlay_t*>& o);
+
+};
+WRITE_CLASS_ENCODER(overlay_t)
+
+ostream& operator<<(ostream& out, const overlay_t& o);
+
+/// onode: per-object metadata
+struct onode_t {
+  uint64_t nid;                        ///< numeric id (locally unique)
+  uint64_t size;                       ///< object size
+  map<string, bufferptr> attrs;        ///< attrs
+  map<uint64_t, fragment_t> data_map;  ///< data (offset to fragment mapping)
+  map<uint64_t,overlay_t> overlay_map; ///< overlay data (stored in db)
+  set<uint64_t> shared_overlays;       ///< overlay keys that are shared
+  uint32_t last_overlay_key;           ///< key for next overlay
+  uint64_t omap_head;                  ///< id for omap root node
+
+  uint32_t expected_object_size;
+  uint32_t expected_write_size;
+
+  onode_t()
+    : nid(0),
+      size(0),
+      last_overlay_key(0),
+      omap_head(0),
+      expected_object_size(0),
+      expected_write_size(0) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<onode_t*>& o);
+};
+WRITE_CLASS_ENCODER(onode_t)
+
+
+/// writeahead-logged op
+struct wal_op_t {
+  typedef enum {
+    OP_WRITE = 1,
+    OP_TRUNCATE = 3,
+    OP_ZERO = 4,
+    OP_REMOVE = 5,
+  } type_t;
+  __u8 op;
+  fid_t fid;
+  uint64_t offset, length;
+  bufferlist data;
+  uint64_t nid;
+  vector<overlay_t> overlays;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<wal_op_t*>& o);
+};
+WRITE_CLASS_ENCODER(wal_op_t)
+
+
+/// writeahead-logged transaction
+struct wal_transaction_t {
+  uint64_t seq;
+  list<wal_op_t> ops;
+  vector<string> shared_overlay_keys;
+
+  int64_t _bytes;  ///< cached byte count
+
+  wal_transaction_t() : _bytes(-1) {}
+
+  uint64_t get_bytes() {
+    if (_bytes < 0) {
+      _bytes = 0;
+      for (list<wal_op_t>::iterator p = ops.begin(); p != ops.end(); ++p) {
+	_bytes += p->length;
+      }
+    }
+    return _bytes;
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<wal_transaction_t*>& o);
+};
+WRITE_CLASS_ENCODER(wal_transaction_t)
+
+#endif
diff --git a/src/osd/Ager.cc b/src/osd/Ager.cc
deleted file mode 100644
index 3e1100e..0000000
--- a/src/osd/Ager.cc
+++ /dev/null
@@ -1,270 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-
-#include "include/types.h"
-
-#include "Ager.h"
-
-#include "common/Clock.h"
-#include "common/debug.h"
-
-// ick
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-#if defined(DARWIN) || defined(__FreeBSD__)
-#include <sys/param.h>
-#include <sys/mount.h>
-#endif // DARWIN || __FreeBSD__
-
-
-int myrand() 
-{
-  if (0) 
-    return rand();
-  else {
-    static int n = 0;
-    srand(n++);
-    return rand();
-  }
-}
-
-
-file_object_t Ager::age_get_oid() {
-  if (!age_free_oids.empty()) {
-    file_object_t o = age_free_oids.front();
-    age_free_oids.pop_front();
-    return o;
-  }
-  file_object_t last = age_cur_oid;
-  ++age_cur_oid.bno;
-  return last;
-}
-
-ssize_t Ager::age_pick_size() {
-  ssize_t max = file_size_distn.sample() * 1024;
-  return max/2 + (myrand() % 100) * max/200 + 1;
-}
-
-bool start_debug = false;
-
-uint64_t Ager::age_fill(float pc, utime_t until) {
-  int max = 1024*1024;
-  bufferptr bp(max);
-  bp.zero();
-  bufferlist bl;
-  bl.push_back(bp);
-  uint64_t wrote = 0;
-  while (1) {
-    if (ceph_clock_now(cct) > until) break;
-    
-    struct statfs st;
-    store->statfs(&st);
-    float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks);
-    float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks);  // to write to
-    //float a = (float)(st.f_bfree) / (float)st.f_blocks;
-    //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << dendl;
-    if (free >= pc) {
-      generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << dendl;
-      break;
-    }
-
-    // make sure we can write to it..
-    if (avail > .98 ||
-        avail - free > .02) 
-      store->sync();
-
-    file_object_t poid = age_get_oid();
-    
-    int b = myrand() % 10;
-    age_objects[b].push_back(poid);
-    
-    ssize_t s = age_pick_size();
-    wrote += (s + 4095) / 4096;
-
-
-
-
-    generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << poid << dec << " sz " << s << dendl;
-    
-
-    if (false && start_debug && wrote > 1000000ULL) { 
-      /*
-
-
-      1005700
-?
-1005000
-1005700
-      1005710
-      1005725ULL
-      1005750ULL
-      1005800
-      1006000
-
-//  99  1000500 ? 1000750 1006000
-*/
-    }
-
-    off_t off = 0;
-    while (s) {
-      ssize_t t = MIN(s, max);
-      bufferlist sbl;
-      sbl.substr_of(bl, 0, t);
-      ObjectStore::Transaction tr;
-      hobject_t oid(sobject_t(poid, 0));
-      tr.write(coll_t(), oid, off, t, sbl);
-      store->apply_transaction(tr);
-      off += t;
-      s -= t;
-    }
-    poid.bno++;
-  }
-
-  return wrote*4; // KB
-}
-
-void Ager::age_empty(float pc) {
-  int nper = 20;
-  int n = nper;
-
-  while (1) {
-    struct statfs st;
-    store->statfs(&st);
-    float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks);
-    float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks);  // to write to
-    generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << dendl;//" stopping" << dendl;
-    if (free <= pc) {
-      generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << dendl;
-      break;
-    }
-    
-    int b = myrand() % 10;
-    n--;
-    if (n == 0 || age_objects[b].empty()) {
-      generic_dout(2) << "age_empty sync" << dendl;
-      //sync();
-      //sync();
-      n = nper;
-      continue;
-    }
-    file_object_t poid = age_objects[b].front();
-    age_objects[b].pop_front();
-    
-    generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << poid << dec << dendl;
-    
-    ObjectStore::Transaction t;
-    hobject_t oid(sobject_t(poid, 0));
-    t.remove(coll_t(), oid);
-    store->apply_transaction(t);
-    age_free_oids.push_back(poid);
-  }
-}
-
-
-
-
-void Ager::age(int time,
-               float high_water,    // fill to this %
-               float low_water,     // then empty to this %
-               int count,         // this many times
-               float final_water,   // and end here ( <= low_water)
-               int fake_size_mb) { 
-
-  srand(0);
-
-  utime_t start = ceph_clock_now(cct);
-  utime_t until = start;
-  until.sec_ref() += time;
-  
-  //int elapsed = 0;
-  int freelist_inc = 60;
-  utime_t nextfl = start;
-  nextfl.sec_ref() += freelist_inc;
-
-  while (age_objects.size() < 10) age_objects.push_back( list<file_object_t>() );
-  
-  if (fake_size_mb) {
-    int fake_bl = fake_size_mb * 256;
-    struct statfs st;
-    store->statfs(&st);
-    float f = (float)fake_bl / (float)st.f_blocks;
-    high_water = (float)high_water * f;
-    low_water = (float)low_water * f;
-    final_water = (float)final_water * f;
-    generic_dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl;
-  }
-  
-  // init size distn (once)
-  if (!did_distn) {
-    did_distn = true;
-    age_cur_oid = file_object_t(888, 0);
-    file_size_distn.add(1, 19.0758125+0.65434375);
-    file_size_distn.add(512, 35.6566);
-    file_size_distn.add(1024, 27.7271875);
-    file_size_distn.add(2*1024, 16.63503125);
-    //file_size_distn.add(4*1024, 106.82384375);
-    //file_size_distn.add(8*1024, 81.493375);
-    //file_size_distn.add(16*1024, 14.13553125);
-    //file_size_distn.add(32*1024, 2.176);
-    //file_size_distn.add(256*1024, 0.655938);
-    //file_size_distn.add(512*1024, 0.1480625);
-    //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
-    file_size_distn.normalize();
-  }
-  
-  // clear
-  for (int i=0; i<10; i++)
-    age_objects[i].clear();
-  
-  uint64_t wrote = 0;
-
-  for (int c=1; c<=count; c++) {
-    if (ceph_clock_now(cct) > until) break;
-    
-    //if (c == 7) start_debug = true;
-    
-    generic_dout(1) << "#age " << c << "/" << count << " filling to " << high_water << dendl;
-    uint64_t w = age_fill(high_water, until);
-    //dout(1) << "age wrote " << w << dendl;
-    wrote += w;
-    //store->sync();
-    //store->_get_frag_stat(st);
-    //pfrag(st);
-
-
-    if (c == count) {
-      generic_dout(1) << "#age final empty to " << final_water << dendl;
-      age_empty(final_water);    
-    } else {
-      generic_dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << dendl;
-      age_empty(low_water);
-    }
-    //store->sync();
-    //store->sync();
-
-    // show frag state
-    /*store->_get_frag_stat(st);
-    pfrag(wrote / (1024ULL*1024ULL) ,  // GB
-    st);*/
-
-    // dump freelist?
-    /*
-    if (ceph_clock_now(cct) > nextfl) {
-      elapsed += freelist_inc;
-      save_freelist(elapsed);
-      nextfl.sec_ref() += freelist_inc;
-    }
-    */
-  }
-
-  // dump the freelist
-  //save_freelist(0);
-  exit(0);   // hack
-
-  // ok!
-  store->sync();
-  store->sync();
-  generic_dout(1) << "age finished" << dendl;
-}  
diff --git a/src/osd/Ager.h b/src/osd/Ager.h
deleted file mode 100644
index face0a6..0000000
--- a/src/osd/Ager.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_AGER_H
-#define CEPH_AGER_H
-
-#include "include/types.h"
-#include "include/Distribution.h"
-#include "os/ObjectStore.h"
-#include "common/Clock.h"
-#include "common/ceph_context.h"
-
-#include <list>
-#include <vector>
-using namespace std;
-
-class Ager {
-  CephContext *cct;
-  ObjectStore *store;
-
- private:
-  list<file_object_t>           age_free_oids;
-  file_object_t                 age_cur_oid;
-  vector< list<file_object_t> > age_objects;
-  Distribution file_size_distn; //kb
-  bool         did_distn;
-
-  void age_empty(float pc);
-  uint64_t age_fill(float pc, utime_t until);
-  ssize_t age_pick_size();
-  file_object_t age_get_oid();
-
- public:
-  Ager(CephContext *cct_, ObjectStore *s) : cct(cct_), store(s), did_distn(false) {}
-
-  void age(int time,
-           float high_water,    // fill to this %
-          float low_water,     // then empty to this %
-          int count,         // this many times
-          float final_water,   // and end here ( <= low_water)
-          int fake_size_mb=0);
-};
-
-#endif
diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc
index 6c1f20d..87d5a75 100644
--- a/src/osd/ClassHandler.cc
+++ b/src/osd/ClassHandler.cc
@@ -56,7 +56,7 @@ int ClassHandler::open_all_classes()
 	strncmp(pde->d_name, CLS_PREFIX, sizeof(CLS_PREFIX) - 1) == 0 &&
 	strcmp(pde->d_name + strlen(pde->d_name) - (sizeof(CLS_SUFFIX) - 1), CLS_SUFFIX) == 0) {
       char cname[PATH_MAX + 1];
-      strcpy(cname, pde->d_name + sizeof(CLS_PREFIX) - 1);
+      strncpy(cname, pde->d_name + sizeof(CLS_PREFIX) - 1, sizeof(cname) -1);
       cname[strlen(cname) - (sizeof(CLS_SUFFIX) - 1)] = '\0';
       dout(10) << __func__ << " found " << cname << dendl;
       ClassData *cls;
@@ -221,6 +221,17 @@ ClassHandler::ClassMethod *ClassHandler::ClassData::register_cxx_method(const ch
   return &method;
 }
 
+ClassHandler::ClassFilter *ClassHandler::ClassData::register_cxx_filter(
+    const std::string &filter_name,
+    cls_cxx_filter_factory_t fn)
+{
+  ClassFilter &filter = filters_map[filter_name];
+  filter.fn = fn;
+  filter.name = filter_name;
+  filter.cls = this;
+  return &filter;
+}
+
 ClassHandler::ClassMethod *ClassHandler::ClassData::_get_method(const char *mname)
 {
   map<string, ClassHandler::ClassMethod>::iterator iter = methods_map.find(mname);
@@ -252,6 +263,20 @@ void ClassHandler::ClassMethod::unregister()
   cls->unregister_method(this);
 }
 
+void ClassHandler::ClassData::unregister_filter(ClassHandler::ClassFilter *filter)
+{
+  /* no need for locking, called under the class_init mutex */
+   map<string, ClassFilter>::iterator iter = filters_map.find(filter->name);
+   if (iter == filters_map.end())
+     return;
+   filters_map.erase(iter);
+}
+
+void ClassHandler::ClassFilter::unregister()
+{
+  cls->unregister_filter(this);
+}
+
 int ClassHandler::ClassMethod::exec(cls_method_context_t ctx, bufferlist& indata, bufferlist& outdata)
 {
   int ret;
diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h
index 93cf3c0..e4bb999 100644
--- a/src/osd/ClassHandler.h
+++ b/src/osd/ClassHandler.h
@@ -35,6 +35,17 @@ public:
     ClassMethod() : cls(0), flags(0), func(0), cxx_func(0) {}
   };
 
+  struct ClassFilter {
+    struct ClassHandler::ClassData *cls;
+    std::string name;
+    cls_cxx_filter_factory_t fn;
+
+    void unregister();
+
+    ClassFilter() : fn(0)
+    {}
+  };
+
   struct ClassData {
     enum Status { 
       CLASS_UNKNOWN,
@@ -49,6 +60,7 @@ public:
     void *handle;
 
     map<string, ClassMethod> methods_map;
+    map<string, ClassFilter> filters_map;
 
     set<ClassData *> dependencies;         /* our dependencies */
     set<ClassData *> missing_dependencies; /* only missing dependencies */
@@ -64,11 +76,27 @@ public:
     ClassMethod *register_cxx_method(const char *mname, int flags, cls_method_cxx_call_t func);
     void unregister_method(ClassMethod *method);
 
+    ClassFilter *register_cxx_filter(
+        const std::string &filter_name,
+        cls_cxx_filter_factory_t fn);
+    void unregister_filter(ClassFilter *method);
+
     ClassMethod *get_method(const char *mname) {
       Mutex::Locker l(handler->mutex);
       return _get_method(mname);
     }
     int get_method_flags(const char *mname);
+
+    ClassFilter *get_filter(const std::string &filter_name)
+    {
+      Mutex::Locker l(handler->mutex);
+      std::map<std::string, ClassFilter>::iterator i = filters_map.find(name);
+      if (i == filters_map.end()) {
+        return NULL;
+      } else {
+        return &(i->second);
+      }
+    }
   };
 
 private:
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index 845ea83..d6b95a5 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -170,12 +170,11 @@ void ECBackend::RecoveryOp::dump(Formatter *f) const
 ECBackend::ECBackend(
   PGBackend::Listener *pg,
   coll_t coll,
-  coll_t temp_coll,
   ObjectStore *store,
   CephContext *cct,
   ErasureCodeInterfaceRef ec_impl,
   uint64_t stripe_width)
-  : PGBackend(pg, store, coll, temp_coll),
+  : PGBackend(pg, store, coll),
     cct(cct),
     ec_impl(ec_impl),
     sinfo(ec_impl->get_data_chunk_count(), stripe_width) {
@@ -197,6 +196,7 @@ struct OnRecoveryReadComplete :
     : pg(pg), hoid(hoid) {}
   void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) {
     ECBackend::read_result_t &res = in.second;
+    // FIXME???
     assert(res.r == 0);
     assert(res.errors.empty());
     assert(res.returned.size() == 1);
@@ -210,7 +210,7 @@ struct OnRecoveryReadComplete :
 
 struct RecoveryMessages {
   map<hobject_t,
-      ECBackend::read_request_t> reads;
+      ECBackend::read_request_t, hobject_t::BitwiseComparator> reads;
   void read(
     ECBackend *ec,
     const hobject_t &hoid, uint64_t off, uint64_t len,
@@ -235,7 +235,7 @@ struct RecoveryMessages {
   map<pg_shard_t, vector<PushOp> > pushes;
   map<pg_shard_t, vector<PushReplyOp> > push_replies;
   ObjectStore::Transaction *t;
-  RecoveryMessages() : t(new ObjectStore::Transaction) {}
+  RecoveryMessages() : t(NULL) {}
   ~RecoveryMessages() { assert(!t); }
 };
 
@@ -243,20 +243,28 @@ void ECBackend::handle_recovery_push(
   PushOp &op,
   RecoveryMessages *m)
 {
+  assert(m->t);
+
   bool oneshot = op.before_progress.first && op.after_progress.data_complete;
-  coll_t tcoll = oneshot ? coll : get_temp_coll(m->t);
+  ghobject_t tobj;
+  if (oneshot) {
+    tobj = ghobject_t(op.soid, ghobject_t::NO_GEN,
+		      get_parent()->whoami_shard().shard);
+  } else {
+    tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.version,
+							     op.soid.snap),
+		      ghobject_t::NO_GEN,
+		      get_parent()->whoami_shard().shard);
+    if (op.before_progress.first) {
+      dout(10) << __func__ << ": Adding oid "
+	       << tobj.hobj << " in the temp collection" << dendl;
+      add_temp_obj(tobj.hobj);
+    }
+  }
+
   if (op.before_progress.first) {
-    get_parent()->on_local_recover_start(
-      op.soid,
-      m->t);
-    m->t->remove(
-      get_temp_coll(m->t),
-      ghobject_t(
-	op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
-    m->t->touch(
-      tcoll,
-      ghobject_t(
-	op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+    m->t->remove(coll, tobj);
+    m->t->touch(coll, tobj);
   }
 
   if (!op.data_included.empty()) {
@@ -265,9 +273,8 @@ void ECBackend::handle_recovery_push(
     assert(op.data.length() == (end - start));
 
     m->t->write(
-      tcoll,
-      ghobject_t(
-	op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      coll,
+      tobj,
       start,
       op.data.length(),
       op.data);
@@ -276,22 +283,22 @@ void ECBackend::handle_recovery_push(
   }
 
   if (op.before_progress.first) {
-    if (!oneshot)
-      add_temp_obj(op.soid);
     assert(op.attrset.count(string("_")));
     m->t->setattrs(
-      tcoll,
-      ghobject_t(
-	op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      coll,
+      tobj,
       op.attrset);
   }
 
   if (op.after_progress.data_complete && !oneshot) {
-    clear_temp_obj(op.soid);
-    m->t->collection_move(
-      coll,
-      tcoll,
-      ghobject_t(
+    dout(10) << __func__ << ": Removing oid "
+	     << tobj.hobj << " from the temp collection" << dendl;
+    clear_temp_obj(tobj.hobj);
+    m->t->remove(coll, ghobject_t(
+	op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+    m->t->collection_move_rename(
+      coll, tobj,
+      coll, ghobject_t(
 	op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
   }
   if (op.after_progress.data_complete) {
@@ -361,12 +368,27 @@ void ECBackend::handle_recovery_read_complete(
     from[i->first.shard].claim(i->second);
   }
   dout(10) << __func__ << ": " << from << dendl;
-  ECUtil::decode(sinfo, ec_impl, from, target);
+  int r = ECUtil::decode(sinfo, ec_impl, from, target);
+  assert(r == 0);
   if (attrs) {
     op.xattrs.swap(*attrs);
 
     if (!op.obc) {
-      op.obc = get_parent()->get_obc(hoid, op.xattrs);
+      // attrs only reference the origin bufferlist (decode from ECSubReadReply message)
+      // whose size is much greater than attrs in recovery. If obc cache it (get_obc maybe
+      // cache the attr), this causes the whole origin bufferlist would not be free until
+      // obc is evicted from obc cache. So rebuild the bufferlist before cache it.
+      for (map<string, bufferlist>::iterator it = op.xattrs.begin();
+           it != op.xattrs.end();
+           ++it) {
+        it->second.rebuild();
+      }
+      // Need to remove ECUtil::get_hinfo_key() since it should not leak out
+      // of the backend (see bug #12983)
+      map<string, bufferlist> sanitized_attrs(op.xattrs);
+      sanitized_attrs.erase(ECUtil::get_hinfo_key());
+      op.obc = get_parent()->get_obc(hoid, sanitized_attrs);
+      assert(op.obc);
       op.recovery_info.size = op.obc->obs.oi.size;
       op.recovery_info.oi = op.obc->obs.oi;
     }
@@ -442,22 +464,29 @@ void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
     msg->compute_cost(cct);
     replies.insert(make_pair(i->first.osd, msg));
   }
-  m.t->register_on_complete(
-    get_parent()->bless_context(
-      new SendPushReplies(
-	get_parent(),
-	get_parent()->get_epoch(),
-	replies)));
-  m.t->register_on_applied(
-    new ObjectStore::C_DeleteTransaction(m.t));
-  get_parent()->queue_transaction(m.t);
-  m.t = NULL;
+
+  if (!replies.empty()) {
+    m.t->register_on_complete(
+	get_parent()->bless_context(
+	  new SendPushReplies(
+	    get_parent(),
+	    get_parent()->get_epoch(),
+	    replies)));
+    m.t->register_on_applied(
+	new ObjectStore::C_DeleteTransaction(m.t));
+    get_parent()->queue_transaction(m.t);
+    m.t = NULL;
+  } else {
+    assert(!m.t);
+  }
+
   if (m.reads.empty())
     return;
   start_read_op(
     priority,
     m.reads,
-    OpRequestRef());
+    OpRequestRef(),
+    false, true);
 }
 
 void ECBackend::continue_recovery_op(
@@ -475,7 +504,7 @@ void ECBackend::continue_recovery_op(
       set<pg_shard_t> to_read;
       uint64_t recovery_max_chunk = get_recovery_chunk_size();
       int r = get_min_avail_to_read_shards(
-	op.hoid, want, true, &to_read);
+	op.hoid, want, true, false, &to_read);
       if (r != 0) {
 	// we must have lost a recovery source
 	assert(!op.recovery_progress.first);
@@ -582,11 +611,11 @@ void ECBackend::continue_recovery_op(
       }
       return;
     }
-    case RecoveryOp::COMPLETE: {
-      assert(0); // should never be called once complete
-    };
-    default:
+    // should never be called once complete
+    case RecoveryOp::COMPLETE:
+    default: {
       assert(0);
+    };
     }
   }
 }
@@ -687,6 +716,8 @@ bool ECBackend::handle_message(
   case MSG_OSD_PG_PUSH: {
     MOSDPGPush *op = static_cast<MOSDPGPush *>(_op->get_req());
     RecoveryMessages rm;
+    rm.t = new ObjectStore::Transaction;
+    assert(rm.t);
     for (vector<PushOp>::iterator i = op->pushes.begin();
 	 i != op->pushes.end();
 	 ++i) {
@@ -811,17 +842,16 @@ void ECBackend::handle_sub_write(
   ObjectStore::Transaction *localt = new ObjectStore::Transaction;
   localt->set_use_tbl(op.t.get_use_tbl());
   if (!op.temp_added.empty()) {
-    get_temp_coll(localt);
     add_temp_objs(op.temp_added);
   }
   if (op.t.empty()) {
-    for (set<hobject_t>::iterator i = op.temp_removed.begin();
+    for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = op.temp_removed.begin();
 	 i != op.temp_removed.end();
 	 ++i) {
       dout(10) << __func__ << ": removing object " << *i
 	       << " since we won't get the transaction" << dendl;
       localt->remove(
-	temp_coll,
+	coll,
 	ghobject_t(
 	  *i,
 	  ghobject_t::NO_GEN,
@@ -838,10 +868,9 @@ void ECBackend::handle_sub_write(
     localt);
 
   if (!(dynamic_cast<ReplicatedPG *>(get_parent())->is_undersized()) &&
-      get_parent()->whoami_shard().shard >= ec_impl->get_data_chunk_count())
+      (unsigned)get_parent()->whoami_shard().shard >= ec_impl->get_data_chunk_count())
     op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
 
-  localt->append(op.t);
   if (on_local_applied_sync) {
     dout(10) << "Queueing onreadable_sync: " << on_local_applied_sync << dendl;
     localt->register_on_applied_sync(on_local_applied_sync);
@@ -857,7 +886,13 @@ void ECBackend::handle_sub_write(
       new SubWriteApplied(this, msg, op.tid, op.at_version)));
   localt->register_on_applied(
     new ObjectStore::C_DeleteTransaction(localt));
-  get_parent()->queue_transaction(localt, msg);
+  list<ObjectStore::Transaction*> tls;
+  tls.push_back(localt);
+  tls.push_back(new ObjectStore::Transaction);
+  tls.back()->swap(op.t);
+  tls.back()->register_on_complete(
+    new ObjectStore::C_DeleteTransaction(tls.back()));
+  get_parent()->queue_transactions(tls, msg);
 }
 
 void ECBackend::handle_sub_read(
@@ -865,37 +900,73 @@ void ECBackend::handle_sub_read(
   ECSubRead &op,
   ECSubReadReply *reply)
 {
-  for(map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> > >::iterator i =
+  shard_id_t shard = get_parent()->whoami_shard().shard;
+  for(map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> >, hobject_t::BitwiseComparator>::iterator i =
         op.to_read.begin();
       i != op.to_read.end();
       ++i) {
-    for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::iterator j = i->second.begin();
-	 j != i->second.end();
-	 ++j) {
+    int r = 0;
+    ECUtil::HashInfoRef hinfo = get_hash_info(i->first);
+    if (!hinfo) {
+      r = -EIO;
+      get_parent()->clog_error() << __func__ << ": No hinfo for " << i->first << "\n";
+      dout(5) << __func__ << ": No hinfo for " << i->first << dendl;
+      goto error;
+    }
+    for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::iterator j =
+	   i->second.begin(); j != i->second.end(); ++j) {
       bufferlist bl;
-      int r = store->read(
-	i->first.is_temp() ? temp_coll : coll,
-	ghobject_t(
-	  i->first, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      r = store->read(
+	coll,
+	ghobject_t(i->first, ghobject_t::NO_GEN, shard),
 	j->get<0>(),
 	j->get<1>(),
 	bl, j->get<2>(),
-	false);
+	true); // Allow EIO return
       if (r < 0) {
-	assert(0);
-	reply->buffers_read.erase(i->first);
-	reply->errors[i->first] = r;
-	break;
+	get_parent()->clog_error() << __func__
+				   << ": Error " << r
+				   << " reading "
+				   << i->first;
+	dout(5) << __func__ << ": Error " << r
+		<< " reading " << i->first << dendl;
+	goto error;
       } else {
+        dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl;
 	reply->buffers_read[i->first].push_back(
 	  make_pair(
 	    j->get<0>(),
 	    bl)
 	  );
       }
+
+      // This shows that we still need deep scrub because large enough files
+      // are read in sections, so the digest check here won't be done here.
+      // Do NOT check osd_read_eio_on_bad_digest here.  We need to report
+      // the state of our chunk in case other chunks could substitute.
+      if ((bl.length() == hinfo->get_total_chunk_size()) &&
+	  (j->get<0>() == 0)) {
+	dout(20) << __func__ << ": Checking hash of " << i->first << dendl;
+	bufferhash h(-1);
+	h << bl;
+	if (h.digest() != hinfo->get_chunk_hash(shard)) {
+	  get_parent()->clog_error() << __func__ << ": Bad hash for " << i->first << " digest 0x"
+	          << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << "\n";
+	  dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x"
+	          << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl;
+	  r = -EIO;
+	  goto error;
+	}
+      }
     }
-  }
-  for (set<hobject_t>::iterator i = op.attrs_to_read.begin();
+    continue;
+error:
+    // Do NOT check osd_read_eio_on_bad_digest here.  We need to report
+    // the state of our chunk in case other chunks could substitute.
+    reply->buffers_read.erase(i->first);
+    reply->errors[i->first] = r;
+  }
+  for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = op.attrs_to_read.begin();
        i != op.attrs_to_read.end();
        ++i) {
     dout(10) << __func__ << ": fulfilling attr request on "
@@ -903,12 +974,11 @@ void ECBackend::handle_sub_read(
     if (reply->errors.count(*i))
       continue;
     int r = store->getattrs(
-      i->is_temp() ? temp_coll : coll,
+      coll,
       ghobject_t(
 	*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
       reply->attrs_read[*i]);
     if (r < 0) {
-      assert(0);
       reply->buffers_read.erase(*i);
       reply->errors[*i] = r;
     }
@@ -946,16 +1016,18 @@ void ECBackend::handle_sub_read_reply(
   map<ceph_tid_t, ReadOp>::iterator iter = tid_to_read_map.find(op.tid);
   if (iter == tid_to_read_map.end()) {
     //canceled
+    dout(20) << __func__ << ": dropped " << op << dendl;
     return;
   }
   ReadOp &rop = iter->second;
-  for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::iterator i =
+  for (map<hobject_t, list<pair<uint64_t, bufferlist> >, hobject_t::BitwiseComparator>::iterator i =
 	 op.buffers_read.begin();
        i != op.buffers_read.end();
        ++i) {
-    assert(!op.errors.count(i->first));
+    assert(!op.errors.count(i->first));	// If attribute error we better not have sent a buffer
     if (!rop.to_read.count(i->first)) {
       // We canceled this read! @see filter_read_op
+      dout(20) << __func__ << " to_read skipping" << dendl;
       continue;
     }
     list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator req_iter =
@@ -976,49 +1048,100 @@ void ECBackend::handle_sub_read_reply(
       riter->get<2>()[from].claim(j->second);
     }
   }
-  for (map<hobject_t, map<string, bufferlist> >::iterator i = op.attrs_read.begin();
+  for (map<hobject_t, map<string, bufferlist>, hobject_t::BitwiseComparator>::iterator i = op.attrs_read.begin();
        i != op.attrs_read.end();
        ++i) {
-    assert(!op.errors.count(i->first));
+    assert(!op.errors.count(i->first));	// if read error better not have sent an attribute
     if (!rop.to_read.count(i->first)) {
       // We canceled this read! @see filter_read_op
+      dout(20) << __func__ << " to_read skipping" << dendl;
       continue;
     }
     rop.complete[i->first].attrs = map<string, bufferlist>();
     (*(rop.complete[i->first].attrs)).swap(i->second);
   }
-  for (map<hobject_t, int>::iterator i = op.errors.begin();
+  for (map<hobject_t, int, hobject_t::BitwiseComparator>::iterator i = op.errors.begin();
        i != op.errors.end();
        ++i) {
     rop.complete[i->first].errors.insert(
       make_pair(
 	from,
 	i->second));
-    if (rop.complete[i->first].r == 0)
-      rop.complete[i->first].r = i->second;
+    dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl;
   }
 
   map<pg_shard_t, set<ceph_tid_t> >::iterator siter =
-shard_to_read_map.find(from);
+					shard_to_read_map.find(from);
   assert(siter != shard_to_read_map.end());
   assert(siter->second.count(op.tid));
   siter->second.erase(op.tid);
 
   assert(rop.in_progress.count(from));
   rop.in_progress.erase(from);
-  if (!rop.in_progress.empty()) {
-    dout(10) << __func__ << " readop not complete: " << rop << dendl;
-  } else {
-    dout(10) << __func__ << " readop complete: " << rop << dendl;
+  unsigned is_complete = 0;
+  // For redundant reads check for completion as each shard comes in,
+  // or in a non-recovery read check for completion once all the shards read.
+  if (rop.do_redundant_reads || (!rop.for_recovery && rop.in_progress.empty())) {
+    for (map<hobject_t, read_result_t>::const_iterator iter =
+        rop.complete.begin();
+      iter != rop.complete.end();
+      ++iter) {
+      set<int> have;
+      for (map<pg_shard_t, bufferlist>::const_iterator j =
+          iter->second.returned.front().get<2>().begin();
+        j != iter->second.returned.front().get<2>().end();
+        ++j) {
+        have.insert(j->first.shard);
+        dout(20) << __func__ << " have shard=" << j->first.shard << dendl;
+      }
+      set<int> want_to_read, dummy_minimum;
+      get_want_to_read_shards(&want_to_read);
+      int err;
+      if ((err = ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum)) < 0) {
+	dout(20) << __func__ << " minimum_to_decode failed" << dendl;
+        if (rop.in_progress.empty()) {
+	  // If we don't have enough copies and we haven't sent reads for all shards
+	  // we can send the rest of the reads, if any.
+	  if (!rop.do_redundant_reads) {
+	    int r = objects_remaining_read_async(iter->first, rop);
+	    if (r == 0) {
+	      // We added to in_progress and not incrementing is_complete
+	      continue;
+	    }
+	    // Couldn't read any additional shards so handle as completed with errors
+	  }
+	  if (rop.complete[iter->first].errors.empty()) {
+	    dout(20) << __func__ << " simply not enough copies err=" << err << dendl;
+	  } else {
+	    // Grab the first error
+	    err = rop.complete[iter->first].errors.begin()->second;
+	    dout(20) << __func__ << ": Use one of the shard errors err=" << err << dendl;
+	  }
+	  rop.complete[iter->first].r = err;
+	  ++is_complete;
+	}
+      } else {
+	if (!rop.complete[iter->first].errors.empty())
+	  dout(10) << __func__ << " Enough copies for " << iter->first << " (ignore errors)" << dendl;
+	++is_complete;
+	rop.complete[iter->first].errors.clear();
+        assert(rop.complete[iter->first].r == 0);
+      }
+    }
+  }
+  if (rop.in_progress.empty() || is_complete == rop.complete.size()) {
+    dout(20) << __func__ << " Complete: " << rop << dendl;
     complete_read_op(rop, m);
+  } else {
+    dout(10) << __func__ << " readop not complete: " << rop << dendl;
   }
 }
 
 void ECBackend::complete_read_op(ReadOp &rop, RecoveryMessages *m)
 {
-  map<hobject_t, read_request_t>::iterator reqiter =
+  map<hobject_t, read_request_t, hobject_t::BitwiseComparator>::iterator reqiter =
     rop.to_read.begin();
-  map<hobject_t, read_result_t>::iterator resiter =
+  map<hobject_t, read_result_t, hobject_t::BitwiseComparator>::iterator resiter =
     rop.complete.begin();
   assert(rop.to_read.size() == rop.complete.size());
   for (; reqiter != rop.to_read.end(); ++reqiter, ++resiter) {
@@ -1049,8 +1172,8 @@ void ECBackend::filter_read_op(
   const OSDMapRef osdmap,
   ReadOp &op)
 {
-  set<hobject_t> to_cancel;
-  for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin();
+  set<hobject_t, hobject_t::BitwiseComparator> to_cancel;
+  for (map<pg_shard_t, set<hobject_t, hobject_t::BitwiseComparator> >::iterator i = op.source_to_obj.begin();
        i != op.source_to_obj.end();
        ++i) {
     if (osdmap->is_down(i->first.osd)) {
@@ -1063,10 +1186,10 @@ void ECBackend::filter_read_op(
   if (to_cancel.empty())
     return;
 
-  for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin();
+  for (map<pg_shard_t, set<hobject_t, hobject_t::BitwiseComparator> >::iterator i = op.source_to_obj.begin();
        i != op.source_to_obj.end();
        ) {
-    for (set<hobject_t>::iterator j = i->second.begin();
+    for (set<hobject_t, hobject_t::BitwiseComparator>::iterator j = i->second.begin();
 	 j != i->second.end();
 	 ) {
       if (to_cancel.count(*j))
@@ -1082,7 +1205,7 @@ void ECBackend::filter_read_op(
     }
   }
 
-  for (set<hobject_t>::iterator i = to_cancel.begin();
+  for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = to_cancel.begin();
        i != to_cancel.end();
        ++i) {
     get_parent()->cancel_pull(*i);
@@ -1139,7 +1262,7 @@ void ECBackend::on_change()
        i != tid_to_read_map.end();
        ++i) {
     dout(10) << __func__ << ": cancelling " << i->second << dendl;
-    for (map<hobject_t, read_request_t>::iterator j =
+    for (map<hobject_t, read_request_t, hobject_t::BitwiseComparator>::iterator j =
 	   i->second.to_read.begin();
 	 j != i->second.to_read.end();
 	 ++j) {
@@ -1171,7 +1294,7 @@ void ECBackend::on_flushed()
 void ECBackend::dump_recovery_info(Formatter *f) const
 {
   f->open_array_section("recovery_ops");
-  for (map<hobject_t, RecoveryOp>::const_iterator i = recovery_ops.begin();
+  for (map<hobject_t, RecoveryOp, hobject_t::BitwiseComparator>::const_iterator i = recovery_ops.begin();
        i != recovery_ops.end();
        ++i) {
     f->open_object_section("op");
@@ -1249,12 +1372,12 @@ void ECBackend::submit_transaction(
   
   op->t = static_cast<ECTransaction*>(_t);
 
-  set<hobject_t> need_hinfos;
+  set<hobject_t, hobject_t::BitwiseComparator> need_hinfos;
   op->t->get_append_objects(&need_hinfos);
-  for (set<hobject_t>::iterator i = need_hinfos.begin();
+  for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = need_hinfos.begin();
        i != need_hinfos.end();
        ++i) {
-    ECUtil::HashInfoRef ref = get_hash_info(*i);
+    ECUtil::HashInfoRef ref = get_hash_info(*i, false);
     if (!ref) {
       derr << __func__ << ": get_hash_info(" << *i << ")"
 	   << " returned a null pointer and there is no "
@@ -1299,9 +1422,13 @@ int ECBackend::get_min_avail_to_read_shards(
   const hobject_t &hoid,
   const set<int> &want,
   bool for_recovery,
+  bool do_redundant_reads,
   set<pg_shard_t> *to_read)
 {
-  map<hobject_t, set<pg_shard_t> >::const_iterator miter =
+  // Make sure we don't do redundant reads for recovery
+  assert(!for_recovery || !do_redundant_reads);
+
+  map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::const_iterator miter =
     get_parent()->get_missing_loc_shards().find(hoid);
 
   set<int> have;
@@ -1334,7 +1461,8 @@ int ECBackend::get_min_avail_to_read_shards(
       assert(!shards.count(i->shard));
       const pg_info_t &info = get_parent()->get_shard_info(*i);
       const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
-      if (hoid < info.last_backfill && !missing.is_missing(hoid)) {
+      if (cmp(hoid, info.last_backfill, get_parent()->sort_bitwise()) < 0 &&
+	  !missing.is_missing(hoid)) {
 	have.insert(i->shard);
 	shards.insert(make_pair(i->shard, *i));
       }
@@ -1361,6 +1489,10 @@ int ECBackend::get_min_avail_to_read_shards(
   if (r < 0)
     return r;
 
+  if (do_redundant_reads) {
+      need.swap(have);
+  } 
+
   if (!to_read)
     return 0;
 
@@ -1373,10 +1505,50 @@ int ECBackend::get_min_avail_to_read_shards(
   return 0;
 }
 
+int ECBackend::get_remaining_shards(
+  const hobject_t &hoid,
+  const set<int> &avail,
+  set<pg_shard_t> *to_read)
+{
+  map<hobject_t, set<pg_shard_t> >::const_iterator miter =
+    get_parent()->get_missing_loc_shards().find(hoid);
+
+  set<int> need;
+  map<shard_id_t, pg_shard_t> shards;
+
+  for (set<pg_shard_t>::const_iterator i =
+	 get_parent()->get_acting_shards().begin();
+       i != get_parent()->get_acting_shards().end();
+       ++i) {
+    dout(10) << __func__ << ": checking acting " << *i << dendl;
+    const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+    if (!missing.is_missing(hoid)) {
+      assert(!need.count(i->shard));
+      need.insert(i->shard);
+      assert(!shards.count(i->shard));
+      shards.insert(make_pair(i->shard, *i));
+    }
+  }
+
+  if (!to_read)
+    return 0;
+
+  for (set<int>::iterator i = need.begin();
+       i != need.end();
+       ++i) {
+    assert(shards.count(shard_id_t(*i)));
+    if (avail.find(*i) == avail.end())
+      to_read->insert(shards[shard_id_t(*i)]);
+  }
+  return 0;
+}
+
 void ECBackend::start_read_op(
   int priority,
-  map<hobject_t, read_request_t> &to_read,
-  OpRequestRef _op)
+  map<hobject_t, read_request_t, hobject_t::BitwiseComparator> &to_read,
+  OpRequestRef _op,
+  bool do_redundant_reads,
+  bool for_recovery)
 {
   ceph_tid_t tid = get_parent()->get_tid();
   assert(!tid_to_read_map.count(tid));
@@ -1385,10 +1557,12 @@ void ECBackend::start_read_op(
   op.tid = tid;
   op.to_read.swap(to_read);
   op.op = _op;
+  op.do_redundant_reads = do_redundant_reads;
+  op.for_recovery = for_recovery;
   dout(10) << __func__ << ": starting " << op << dendl;
 
   map<pg_shard_t, ECSubRead> messages;
-  for (map<hobject_t, read_request_t>::iterator i = op.to_read.begin();
+  for (map<hobject_t, read_request_t, hobject_t::BitwiseComparator>::iterator i = op.to_read.begin();
        i != op.to_read.end();
        ++i) {
     list<boost::tuple<
@@ -1450,8 +1624,75 @@ void ECBackend::start_read_op(
   dout(10) << __func__ << ": started " << op << dendl;
 }
 
+// This is based on start_read_op(), maybe this should be refactored
+void ECBackend::start_remaining_read_op(
+  ReadOp &op,
+  map<hobject_t, read_request_t, hobject_t::BitwiseComparator> &to_read)
+{
+  int priority = op.priority;
+  ceph_tid_t tid = op.tid;
+  op.to_read.swap(to_read);
+
+  dout(10) << __func__ << ": starting additional " << op << dendl;
+
+  map<pg_shard_t, ECSubRead> messages;
+  for (map<hobject_t, read_request_t,
+           hobject_t::BitwiseComparator>::iterator i = op.to_read.begin();
+       i != op.to_read.end();
+       ++i) {
+    bool need_attrs = i->second.want_attrs;
+    for (set<pg_shard_t>::const_iterator j = i->second.need.begin();
+	 j != i->second.need.end();
+	 ++j) {
+      if (need_attrs) {
+	messages[*j].attrs_to_read.insert(i->first);
+	need_attrs = false;
+      }
+      op.obj_to_source[i->first].insert(*j);
+      op.source_to_obj[*j].insert(i->first);
+    }
+    for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator j =
+	   i->second.to_read.begin();
+	 j != i->second.to_read.end();
+	 ++j) {
+      pair<uint64_t, uint64_t> chunk_off_len =
+	sinfo.aligned_offset_len_to_chunk(make_pair(j->get<0>(), j->get<1>()));
+      for (set<pg_shard_t>::const_iterator k = i->second.need.begin();
+	   k != i->second.need.end();
+	   ++k) {
+	messages[*k].to_read[i->first].push_back(boost::make_tuple(chunk_off_len.first,
+								    chunk_off_len.second,
+								    j->get<2>()));
+      }
+      assert(!need_attrs);
+    }
+  }
+
+  for (map<pg_shard_t, ECSubRead>::iterator i = messages.begin();
+       i != messages.end();
+       ++i) {
+    op.in_progress.insert(i->first);
+    shard_to_read_map[i->first].insert(op.tid);
+    i->second.tid = tid;
+    MOSDECSubOpRead *msg = new MOSDECSubOpRead;
+    msg->set_priority(priority);
+    msg->pgid = spg_t(
+      get_parent()->whoami_spg_t().pgid,
+      i->first.shard);
+    msg->map_epoch = get_parent()->get_epoch();
+    msg->op = i->second;
+    msg->op.from = get_parent()->whoami_shard();
+    msg->op.tid = tid;
+    get_parent()->send_message_osd_cluster(
+      i->first.osd,
+      msg,
+      get_parent()->get_epoch());
+  }
+  dout(10) << __func__ << ": started additional " << op << dendl;
+}
+
 ECUtil::HashInfoRef ECBackend::get_hash_info(
-  const hobject_t &hoid)
+  const hobject_t &hoid, bool checks)
 {
   dout(10) << __func__ << ": Getting attr on " << hoid << dendl;
   ECUtil::HashInfoRef ref = unstable_hashinfo_registry.lookup(hoid);
@@ -1459,23 +1700,28 @@ ECUtil::HashInfoRef ECBackend::get_hash_info(
     dout(10) << __func__ << ": not in cache " << hoid << dendl;
     struct stat st;
     int r = store->stat(
-      hoid.is_temp() ? temp_coll : coll,
+      coll,
       ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
       &st);
     ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
-    if (r >= 0 && st.st_size > 0) {
+    // XXX: What does it mean if there is no object on disk?
+    if (r >= 0) {
       dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl;
       bufferlist bl;
       r = store->getattr(
-	hoid.is_temp() ? temp_coll : coll,
+	coll,
 	ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
 	ECUtil::get_hinfo_key(),
 	bl);
       if (r >= 0) {
 	bufferlist::iterator bp = bl.begin();
 	::decode(hinfo, bp);
-	assert(hinfo.get_total_chunk_size() == (uint64_t)st.st_size);
-      } else {
+	if (checks && hinfo.get_total_chunk_size() != (uint64_t)st.st_size) {
+	  dout(0) << __func__ << ": Mismatch of total_chunk_size "
+			       << hinfo.get_total_chunk_size() << dendl;
+	  return ECUtil::HashInfoRef();
+	}
+      } else if (st.st_size > 0) { // If empty object and no hinfo, create it
 	return ECUtil::HashInfoRef();
       }
     }
@@ -1604,6 +1850,8 @@ struct CallClientContexts :
     : ec(ec), status(status), to_read(to_read) {}
   void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) {
     ECBackend::read_result_t &res = in.second;
+    if (res.r != 0)
+      goto out;
     assert(res.returned.size() == to_read.size());
     assert(res.r == 0);
     assert(res.errors.empty());
@@ -1623,11 +1871,15 @@ struct CallClientContexts :
 	   ++j) {
 	to_decode[j->first.shard].claim(j->second);
       }
-      ECUtil::decode(
+      int r = ECUtil::decode(
 	ec->sinfo,
 	ec->ec_impl,
 	to_decode,
 	&bl);
+      if (r < 0) {
+        res.r = r;
+        goto out;
+      }
       assert(i->second.second);
       assert(i->second.first);
       i->second.first->substr_of(
@@ -1639,12 +1891,13 @@ struct CallClientContexts :
       }
       res.returned.pop_front();
     }
+out:
     status->complete = true;
     list<ECBackend::ClientAsyncReadStatus> &ip =
       ec->in_progress_client_reads;
     while (ip.size() && ip.front().complete) {
       if (ip.front().on_complete) {
-	ip.front().on_complete->complete(0);
+	ip.front().on_complete->complete(res.r);
 	ip.front().on_complete = NULL;
       }
       ip.pop_front();
@@ -1664,7 +1917,8 @@ void ECBackend::objects_read_async(
   const hobject_t &hoid,
   const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
 		  pair<bufferlist*, Context*> > > &to_read,
-  Context *on_complete)
+  Context *on_complete,
+  bool fast_read)
 {
   in_progress_client_reads.push_back(ClientAsyncReadStatus(on_complete));
   CallClientContexts *c = new CallClientContexts(
@@ -1681,21 +1935,19 @@ void ECBackend::objects_read_async(
     offsets.push_back(boost::make_tuple(tmp.first, tmp.second, i->first.get<2>()));
   }
 
-  const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
   set<int> want_to_read;
-  for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
-    int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
-    want_to_read.insert(chunk);
-  }
+  get_want_to_read_shards(&want_to_read);
+    
   set<pg_shard_t> shards;
   int r = get_min_avail_to_read_shards(
     hoid,
     want_to_read,
     false,
+    fast_read,
     &shards);
   assert(r == 0);
 
-  map<hobject_t, read_request_t> for_read_op;
+  map<hobject_t, read_request_t, hobject_t::BitwiseComparator> for_read_op;
   for_read_op.insert(
     make_pair(
       hoid,
@@ -1709,11 +1961,48 @@ void ECBackend::objects_read_async(
   start_read_op(
     cct->_conf->osd_client_op_priority,
     for_read_op,
-    OpRequestRef());
+    OpRequestRef(),
+    fast_read, false);
   return;
 }
 
 
+int ECBackend::objects_remaining_read_async(
+  const hobject_t &hoid,
+  ReadOp &rop)
+{
+  set<int> already_read;
+  const set<pg_shard_t>& ots = rop.obj_to_source[hoid];
+  for (set<pg_shard_t>::iterator i = ots.begin(); i != ots.end(); ++i)
+    already_read.insert(i->shard);
+  dout(10) << __func__ << " have/error shards=" << already_read << dendl;
+  set<pg_shard_t> shards;
+  int r = get_remaining_shards(hoid, already_read, &shards);
+  if (r)
+    return r;
+  if (shards.empty())
+    return -EIO;
+
+  dout(10) << __func__ << " Read remaining shards " << shards << dendl;
+
+  list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets = rop.to_read.find(hoid)->second.to_read;
+  GenContext<pair<RecoveryMessages *, read_result_t& > &> *c = rop.to_read.find(hoid)->second.cb;
+
+  map<hobject_t, read_request_t, hobject_t::BitwiseComparator> for_read_op;
+  for_read_op.insert(
+    make_pair(
+      hoid,
+      read_request_t(
+	hoid,
+	offsets,
+	shards,
+	false,
+	c)));
+
+  start_remaining_read_op(rop, for_read_op);
+  return 0;
+}
+
 int ECBackend::objects_get_attrs(
   const hobject_t &hoid,
   map<string, bufferlist> *out)
@@ -1760,6 +2049,9 @@ void ECBackend::be_deep_scrub(
   if (stride % sinfo.get_chunk_size())
     stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size());
   uint64_t pos = 0;
+
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
   while (true) {
     bufferlist bl;
     handle.reset_tp_timeout();
@@ -1769,7 +2061,7 @@ void ECBackend::be_deep_scrub(
 	poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
       pos,
       stride, bl,
-      true);
+      fadvise_flags, true);
     if (r < 0)
       break;
     if (bl.length() % sinfo.get_chunk_size()) {
@@ -1788,7 +2080,7 @@ void ECBackend::be_deep_scrub(
     o.read_error = true;
   }
 
-  ECUtil::HashInfoRef hinfo = get_hash_info(poid);
+  ECUtil::HashInfoRef hinfo = get_hash_info(poid, false);
   if (!hinfo) {
     dout(0) << "_scan_list  " << poid << " could not retrieve hash info" << dendl;
     o.read_error = true;
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index d6e710d..a039b70 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -145,7 +145,8 @@ public:
     const hobject_t &hoid,
     const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
 		    pair<bufferlist*, Context*> > > &to_read,
-    Context *on_complete);
+    Context *on_complete,
+    bool fast_read = false);
 
 private:
   friend struct ECRecoveryHandle;
@@ -154,6 +155,14 @@ private:
 			sinfo.get_stripe_width());
   }
 
+  void get_want_to_read_shards(set<int> *want_to_read) const {
+    const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
+    for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
+      int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
+      want_to_read->insert(chunk);
+    }
+  }
+
   /**
    * Recovery
    *
@@ -231,7 +240,7 @@ private:
     RecoveryOp() : pending_read(false), state(IDLE) {}
   };
   friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs);
-  map<hobject_t, RecoveryOp> recovery_ops;
+  map<hobject_t, RecoveryOp, hobject_t::BitwiseComparator> recovery_ops;
 
 public:
   /**
@@ -284,12 +293,19 @@ public:
     int priority;
     ceph_tid_t tid;
     OpRequestRef op; // may be null if not on behalf of a client
+    // True if redundant reads are issued, false otherwise,
+    // this is useful to tradeoff some resources (redundant ops) for
+    // low latency read, especially on relatively idle cluster
+    bool do_redundant_reads;
+    // True if reading for recovery which could possibly reading only a subset
+    // of the available shards.
+    bool for_recovery;
 
-    map<hobject_t, read_request_t> to_read;
-    map<hobject_t, read_result_t> complete;
+    map<hobject_t, read_request_t, hobject_t::BitwiseComparator> to_read;
+    map<hobject_t, read_result_t, hobject_t::BitwiseComparator> complete;
 
-    map<hobject_t, set<pg_shard_t> > obj_to_source;
-    map<pg_shard_t, set<hobject_t> > source_to_obj;
+    map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> obj_to_source;
+    map<pg_shard_t, set<hobject_t, hobject_t::BitwiseComparator> > source_to_obj;
 
     void dump(Formatter *f) const;
 
@@ -305,8 +321,15 @@ public:
   map<pg_shard_t, set<ceph_tid_t> > shard_to_read_map;
   void start_read_op(
     int priority,
-    map<hobject_t, read_request_t> &to_read,
-    OpRequestRef op);
+    map<hobject_t, read_request_t, hobject_t::BitwiseComparator> &to_read,
+    OpRequestRef op,
+    bool do_redundant_reads, bool for_recovery);
+
+  void start_remaining_read_op(ReadOp &rop,
+    map<hobject_t, read_request_t, hobject_t::BitwiseComparator> &to_read);
+  int objects_remaining_read_async(
+    const hobject_t &hoid,
+    ReadOp &rop);
 
 
   /**
@@ -338,13 +361,13 @@ public:
 
     ECTransaction *t;
 
-    set<hobject_t> temp_added;
-    set<hobject_t> temp_cleared;
+    set<hobject_t, hobject_t::BitwiseComparator> temp_added;
+    set<hobject_t, hobject_t::BitwiseComparator> temp_cleared;
 
     set<pg_shard_t> pending_commit;
     set<pg_shard_t> pending_apply;
 
-    map<hobject_t, ECUtil::HashInfoRef> unstable_hash_infos;
+    map<hobject_t, ECUtil::HashInfoRef, hobject_t::BitwiseComparator> unstable_hash_infos;
     ~Op() {
       delete t;
       delete on_local_applied_sync;
@@ -432,8 +455,8 @@ public:
 
   const ECUtil::stripe_info_t sinfo;
   /// If modified, ensure that the ref is held until the update is applied
-  SharedPtrRegistry<hobject_t, ECUtil::HashInfo> unstable_hashinfo_registry;
-  ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid);
+  SharedPtrRegistry<hobject_t, ECUtil::HashInfo, hobject_t::BitwiseComparator> unstable_hashinfo_registry;
+  ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool checks = true);
 
   friend struct ReadCB;
   void check_op(Op *op);
@@ -442,7 +465,6 @@ public:
   ECBackend(
     PGBackend::Listener *pg,
     coll_t coll,
-    coll_t temp_coll,
     ObjectStore *store,
     CephContext *cct,
     ErasureCodeInterfaceRef ec_impl,
@@ -453,9 +475,15 @@ public:
     const hobject_t &hoid,     ///< [in] object
     const set<int> &want,      ///< [in] desired shards
     bool for_recovery,         ///< [in] true if we may use non-acting replicas
+    bool do_redundant_reads,   ///< [in] true if we want to issue redundant reads to reduce latency
     set<pg_shard_t> *to_read   ///< [out] shards to read
     ); ///< @return error code, 0 on success
 
+  int get_remaining_shards(
+    const hobject_t &hoid,
+    const set<int> &avail,
+    set<pg_shard_t> *to_read);
+
   int objects_get_attrs(
     const hobject_t &hoid,
     map<string, bufferlist> *out);
diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc
index cf63611..0dc65b0 100644
--- a/src/osd/ECMsgTypes.cc
+++ b/src/osd/ECMsgTypes.cc
@@ -159,8 +159,8 @@ void ECSubRead::encode(bufferlist &bl, uint64_t features) const
     ENCODE_START(1, 1, bl);
     ::encode(from, bl);
     ::encode(tid, bl);
-    map<hobject_t, list<pair<uint64_t, uint64_t> > > tmp;
-    for (map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> > >::const_iterator m = to_read.begin();
+    map<hobject_t, list<pair<uint64_t, uint64_t> >, hobject_t::BitwiseComparator> tmp;
+    for (map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> >, hobject_t::BitwiseComparator>::const_iterator m = to_read.begin();
 	  m != to_read.end(); ++m) {
       list<pair<uint64_t, uint64_t> > tlist;
       for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator l = m->second.begin();
@@ -189,9 +189,9 @@ void ECSubRead::decode(bufferlist::iterator &bl)
   ::decode(from, bl);
   ::decode(tid, bl);
   if (struct_v == 1) {
-    map<hobject_t, list<pair<uint64_t, uint64_t> > >tmp;
+    map<hobject_t, list<pair<uint64_t, uint64_t> >, hobject_t::BitwiseComparator>tmp;
     ::decode(tmp, bl);
-    for (map<hobject_t, list<pair<uint64_t, uint64_t> > >::const_iterator m = tmp.begin();
+    for (map<hobject_t, list<pair<uint64_t, uint64_t> >, hobject_t::BitwiseComparator>::const_iterator m = tmp.begin();
 	  m != tmp.end(); ++m) {
       list<boost::tuple<uint64_t, uint64_t, uint32_t> > tlist;
       for (list<pair<uint64_t, uint64_t> > ::const_iterator l = m->second.begin();
@@ -221,7 +221,7 @@ void ECSubRead::dump(Formatter *f) const
   f->dump_stream("from") << from;
   f->dump_unsigned("tid", tid);
   f->open_array_section("objects");
-  for (map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> > >::const_iterator i =
+  for (map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> >, hobject_t::BitwiseComparator>::const_iterator i =
 	 to_read.begin();
        i != to_read.end();
        ++i) {
@@ -244,7 +244,7 @@ void ECSubRead::dump(Formatter *f) const
   f->close_section();
 
   f->open_array_section("object_attrs_requested");
-  for (set<hobject_t>::const_iterator i = attrs_to_read.begin();
+  for (set<hobject_t,hobject_t::BitwiseComparator>::const_iterator i = attrs_to_read.begin();
        i != attrs_to_read.end();
        ++i) {
     f->open_object_section("object");
@@ -310,7 +310,7 @@ void ECSubReadReply::dump(Formatter *f) const
   f->dump_stream("from") << from;
   f->dump_unsigned("tid", tid);
   f->open_array_section("buffers_read");
-  for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::const_iterator i =
+  for (map<hobject_t, list<pair<uint64_t, bufferlist> >, hobject_t::BitwiseComparator>::const_iterator i =
 	 buffers_read.begin();
        i != buffers_read.end();
        ++i) {
@@ -332,7 +332,7 @@ void ECSubReadReply::dump(Formatter *f) const
   f->close_section();
 
   f->open_array_section("attrs_returned");
-  for (map<hobject_t, map<string, bufferlist> >::const_iterator i =
+  for (map<hobject_t, map<string, bufferlist>, hobject_t::BitwiseComparator>::const_iterator i =
 	 attrs_read.begin();
        i != attrs_read.end();
        ++i) {
@@ -353,7 +353,7 @@ void ECSubReadReply::dump(Formatter *f) const
   f->close_section();
 
   f->open_array_section("errors");
-  for (map<hobject_t, int>::const_iterator i = errors.begin();
+  for (map<hobject_t, int, hobject_t::BitwiseComparator>::const_iterator i = errors.begin();
        i != errors.end();
        ++i) {
     f->open_object_section("error_pair");
diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h
index 7819383..cc41c5a 100644
--- a/src/osd/ECMsgTypes.h
+++ b/src/osd/ECMsgTypes.h
@@ -31,10 +31,10 @@ struct ECSubWrite {
   eversion_t trim_to;
   eversion_t trim_rollback_to;
   vector<pg_log_entry_t> log_entries;
-  set<hobject_t> temp_added;
-  set<hobject_t> temp_removed;
+  set<hobject_t, hobject_t::BitwiseComparator> temp_added;
+  set<hobject_t, hobject_t::BitwiseComparator> temp_removed;
   boost::optional<pg_hit_set_history_t> updated_hit_set_history;
-  ECSubWrite() {}
+  ECSubWrite() : tid(0) {}
   ECSubWrite(
     pg_shard_t from,
     ceph_tid_t tid,
@@ -47,8 +47,8 @@ struct ECSubWrite {
     eversion_t trim_rollback_to,
     vector<pg_log_entry_t> log_entries,
     boost::optional<pg_hit_set_history_t> updated_hit_set_history,
-    const set<hobject_t> &temp_added,
-    const set<hobject_t> &temp_removed)
+    const set<hobject_t, hobject_t::BitwiseComparator> &temp_added,
+    const set<hobject_t, hobject_t::BitwiseComparator> &temp_removed)
     : from(from), tid(tid), reqid(reqid),
       soid(soid), stats(stats), t(t),
       at_version(at_version),
@@ -57,10 +57,29 @@ struct ECSubWrite {
       temp_added(temp_added),
       temp_removed(temp_removed),
       updated_hit_set_history(updated_hit_set_history) {}
+  void claim(ECSubWrite &other) {
+    from = other.from;
+    tid = other.tid;
+    reqid = other.reqid;
+    soid = other.soid;
+    stats = other.stats;
+    t.swap(other.t);
+    at_version = other.at_version;
+    trim_to = other.trim_to;
+    trim_rollback_to = other.trim_rollback_to;
+    log_entries.swap(other.log_entries);
+    temp_added.swap(other.temp_added);
+    temp_removed.swap(other.temp_removed);
+    updated_hit_set_history = other.updated_hit_set_history;
+  }
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<ECSubWrite*>& o);
+private:
+  // no outside copying -- slow
+  ECSubWrite(ECSubWrite& other);
+  const ECSubWrite& operator=(const ECSubWrite& other);
 };
 WRITE_CLASS_ENCODER(ECSubWrite)
 
@@ -70,7 +89,7 @@ struct ECSubWriteReply {
   eversion_t last_complete;
   bool committed;
   bool applied;
-  ECSubWriteReply() : committed(false), applied(false) {}
+  ECSubWriteReply() : tid(0), committed(false), applied(false) {}
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
@@ -81,8 +100,8 @@ WRITE_CLASS_ENCODER(ECSubWriteReply)
 struct ECSubRead {
   pg_shard_t from;
   ceph_tid_t tid;
-  map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> > > to_read;
-  set<hobject_t> attrs_to_read;
+  map<hobject_t, list<boost::tuple<uint64_t, uint64_t, uint32_t> >, hobject_t::BitwiseComparator> to_read;
+  set<hobject_t, hobject_t::BitwiseComparator> attrs_to_read;
   void encode(bufferlist &bl, uint64_t features) const;
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
@@ -93,9 +112,9 @@ WRITE_CLASS_ENCODER_FEATURES(ECSubRead)
 struct ECSubReadReply {
   pg_shard_t from;
   ceph_tid_t tid;
-  map<hobject_t, list<pair<uint64_t, bufferlist> > > buffers_read;
-  map<hobject_t, map<string, bufferlist> > attrs_read;
-  map<hobject_t, int> errors;
+  map<hobject_t, list<pair<uint64_t, bufferlist> >, hobject_t::BitwiseComparator> buffers_read;
+  map<hobject_t, map<string, bufferlist>, hobject_t::BitwiseComparator> attrs_read;
+  map<hobject_t, int, hobject_t::BitwiseComparator> errors;
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc
index e1cf386..c5d39eb 100644
--- a/src/osd/ECTransaction.cc
+++ b/src/osd/ECTransaction.cc
@@ -23,8 +23,8 @@
 #include "os/ObjectStore.h"
 
 struct AppendObjectsGenerator: public boost::static_visitor<void> {
-  set<hobject_t> *out;
-  AppendObjectsGenerator(set<hobject_t> *out) : out(out) {}
+  set<hobject_t, hobject_t::BitwiseComparator> *out;
+  AppendObjectsGenerator(set<hobject_t, hobject_t::BitwiseComparator> *out) : out(out) {}
   void operator()(const ECTransaction::AppendOp &op) {
     out->insert(op.oid);
   }
@@ -51,31 +51,31 @@ struct AppendObjectsGenerator: public boost::static_visitor<void> {
   void operator()(const ECTransaction::NoOp &op) {}
 };
 void ECTransaction::get_append_objects(
-  set<hobject_t> *out) const
+  set<hobject_t, hobject_t::BitwiseComparator> *out) const
 {
   AppendObjectsGenerator gen(out);
   reverse_visit(gen);
 }
 
 struct TransGenerator : public boost::static_visitor<void> {
-  map<hobject_t, ECUtil::HashInfoRef> &hash_infos;
+  map<hobject_t, ECUtil::HashInfoRef, hobject_t::BitwiseComparator> &hash_infos;
 
   ErasureCodeInterfaceRef &ecimpl;
   const pg_t pgid;
   const ECUtil::stripe_info_t sinfo;
   map<shard_id_t, ObjectStore::Transaction> *trans;
   set<int> want;
-  set<hobject_t> *temp_added;
-  set<hobject_t> *temp_removed;
+  set<hobject_t, hobject_t::BitwiseComparator> *temp_added;
+  set<hobject_t, hobject_t::BitwiseComparator> *temp_removed;
   stringstream *out;
   TransGenerator(
-    map<hobject_t, ECUtil::HashInfoRef> &hash_infos,
+    map<hobject_t, ECUtil::HashInfoRef, hobject_t::BitwiseComparator> &hash_infos,
     ErasureCodeInterfaceRef &ecimpl,
     pg_t pgid,
     const ECUtil::stripe_info_t &sinfo,
     map<shard_id_t, ObjectStore::Transaction> *trans,
-    set<hobject_t> *temp_added,
-    set<hobject_t> *temp_removed,
+    set<hobject_t, hobject_t::BitwiseComparator> *temp_added,
+    set<hobject_t, hobject_t::BitwiseComparator> *temp_removed,
     stringstream *out)
     : hash_infos(hash_infos),
       ecimpl(ecimpl), pgid(pgid),
@@ -93,20 +93,17 @@ struct TransGenerator : public boost::static_visitor<void> {
       temp_removed->erase(hoid);
       temp_added->insert(hoid);
     }
-    return get_coll(shard, hoid);
+    return get_coll(shard);
   }
   coll_t get_coll_rm(shard_id_t shard, const hobject_t &hoid) {
     if (hoid.is_temp()) {
       temp_added->erase(hoid);
       temp_removed->insert(hoid);
     }
-    return get_coll(shard, hoid);
+    return get_coll(shard);
   }
-  coll_t get_coll(shard_id_t shard, const hobject_t &hoid) {
-    if (hoid.is_temp())
-      return coll_t::make_temp_coll(spg_t(pgid, shard));
-    else
-      return coll_t(spg_t(pgid, shard));
+  coll_t get_coll(shard_id_t shard) {
+    return coll_t(spg_t(pgid, shard));
   }
 
   void operator()(const ECTransaction::TouchOp &op) {
@@ -276,13 +273,13 @@ struct TransGenerator : public boost::static_visitor<void> {
 
 
 void ECTransaction::generate_transactions(
-  map<hobject_t, ECUtil::HashInfoRef> &hash_infos,
+  map<hobject_t, ECUtil::HashInfoRef, hobject_t::BitwiseComparator> &hash_infos,
   ErasureCodeInterfaceRef &ecimpl,
   pg_t pgid,
   const ECUtil::stripe_info_t &sinfo,
   map<shard_id_t, ObjectStore::Transaction> *transactions,
-  set<hobject_t> *temp_added,
-  set<hobject_t> *temp_removed,
+  set<hobject_t, hobject_t::BitwiseComparator> *temp_added,
+  set<hobject_t, hobject_t::BitwiseComparator> *temp_removed,
   stringstream *out) const
 {
   TransGenerator gen(
diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h
index 1404b93..2615226 100644
--- a/src/osd/ECTransaction.h
+++ b/src/osd/ECTransaction.h
@@ -194,15 +194,15 @@ public:
     }
   }
   void get_append_objects(
-    set<hobject_t> *out) const;
+     set<hobject_t, hobject_t::BitwiseComparator> *out) const;
   void generate_transactions(
-    map<hobject_t, ECUtil::HashInfoRef> &hash_infos,
+    map<hobject_t, ECUtil::HashInfoRef, hobject_t::BitwiseComparator> &hash_infos,
     ErasureCodeInterfaceRef &ecimpl,
     pg_t pgid,
     const ECUtil::stripe_info_t &sinfo,
     map<shard_id_t, ObjectStore::Transaction> *transactions,
-    set<hobject_t> *temp_added,
-    set<hobject_t> *temp_removed,
+    set<hobject_t, hobject_t::BitwiseComparator> *temp_added,
+    set<hobject_t, hobject_t::BitwiseComparator> *temp_removed,
     stringstream *out = 0) const;
 };
 
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc
index 1f3b458..46a16c3 100644
--- a/src/osd/ECUtil.cc
+++ b/src/osd/ECUtil.cc
@@ -9,24 +9,24 @@ int ECUtil::decode(
   ErasureCodeInterfaceRef &ec_impl,
   map<int, bufferlist> &to_decode,
   bufferlist *out) {
+  assert(to_decode.size());
 
-  uint64_t total_chunk_size = to_decode.begin()->second.length();
+  uint64_t total_data_size = to_decode.begin()->second.length();
+  assert(total_data_size % sinfo.get_chunk_size() == 0);
 
-  assert(to_decode.size());
-  assert(total_chunk_size % sinfo.get_chunk_size() == 0);
   assert(out);
   assert(out->length() == 0);
 
   for (map<int, bufferlist>::iterator i = to_decode.begin();
        i != to_decode.end();
        ++i) {
-    assert(i->second.length() == total_chunk_size);
+    assert(i->second.length() == total_data_size);
   }
 
-  if (total_chunk_size == 0)
+  if (total_data_size == 0)
     return 0;
 
-  for (uint64_t i = 0; i < total_chunk_size; i += sinfo.get_chunk_size()) {
+  for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) {
     map<int, bufferlist> chunks;
     for (map<int, bufferlist>::iterator j = to_decode.begin();
 	 j != to_decode.end();
@@ -47,19 +47,18 @@ int ECUtil::decode(
   ErasureCodeInterfaceRef &ec_impl,
   map<int, bufferlist> &to_decode,
   map<int, bufferlist*> &out) {
-
-  uint64_t total_chunk_size = to_decode.begin()->second.length();
-
   assert(to_decode.size());
-  assert(total_chunk_size % sinfo.get_chunk_size() == 0);
+
+  uint64_t total_data_size = to_decode.begin()->second.length();
+  assert(total_data_size % sinfo.get_chunk_size() == 0);
 
   for (map<int, bufferlist>::iterator i = to_decode.begin();
        i != to_decode.end();
        ++i) {
-    assert(i->second.length() == total_chunk_size);
+    assert(i->second.length() == total_data_size);
   }
 
-  if (total_chunk_size == 0)
+  if (total_data_size == 0)
     return 0;
 
   set<int> need;
@@ -71,7 +70,7 @@ int ECUtil::decode(
     need.insert(i->first);
   }
 
-  for (uint64_t i = 0; i < total_chunk_size; i += sinfo.get_chunk_size()) {
+  for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) {
     map<int, bufferlist> chunks;
     for (map<int, bufferlist>::iterator j = to_decode.begin();
 	 j != to_decode.end();
@@ -92,7 +91,7 @@ int ECUtil::decode(
   for (map<int, bufferlist*>::iterator i = out.begin();
        i != out.end();
        ++i) {
-    assert(i->second->length() == total_chunk_size);
+    assert(i->second->length() == total_data_size);
   }
   return 0;
 }
diff --git a/src/osd/HitSet.cc b/src/osd/HitSet.cc
index 700da5d..597b1f7 100644
--- a/src/osd/HitSet.cc
+++ b/src/osd/HitSet.cc
@@ -36,9 +36,6 @@ HitSet::HitSet(const HitSet::Params& params)
     impl.reset(new ExplicitObjectHitSet(static_cast<ExplicitObjectHitSet::Params*>(params.impl.get())));
     break;
 
-  case TYPE_NONE:
-    break;
-
   default:
     assert (0 == "unknown HitSet type");
   }
diff --git a/src/osd/Makefile.am b/src/osd/Makefile.am
index 7998d4d..da805a2 100644
--- a/src/osd/Makefile.am
+++ b/src/osd/Makefile.am
@@ -16,7 +16,6 @@ libosd_la_SOURCES = \
 	osd/ECMsgTypes.cc \
 	osd/ECTransaction.cc \
 	osd/PGBackend.cc \
-	osd/Ager.cc \
 	osd/HitSet.cc \
 	osd/OSD.cc \
 	osd/OSDCap.cc \
@@ -32,13 +31,9 @@ if WITH_KINETIC
 libosd_la_CXXFLAGS += -std=gnu++11
 endif
 libosd_la_LIBADD = $(LIBOSDC) $(LIBOS) $(LIBOSD_TYPES) $(LIBOS_TYPES)
-if WITH_LTTNG
-libosd_la_LIBADD += $(LIBOSD_TP)
-endif
 noinst_LTLIBRARIES += libosd.la
 
 noinst_HEADERS += \
-	osd/Ager.h \
 	osd/ClassHandler.h \
 	osd/HitSet.h \
 	osd/OSD.h \
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 0c01ba6..413ad59 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -17,7 +17,6 @@
 #include <iostream>
 #include <errno.h>
 #include <sys/stat.h>
-#include <sys/utsname.h>
 #include <signal.h>
 #include <ctype.h>
 #include <boost/scoped_ptr.hpp>
@@ -48,8 +47,6 @@
 
 #include "ReplicatedPG.h"
 
-#include "Ager.h"
-
 
 #include "msg/Messenger.h"
 #include "msg/Message.h"
@@ -132,26 +129,43 @@
 
 #include "common/cmdparse.h"
 #include "include/str_list.h"
+#include "include/util.h"
 
 #include "include/assert.h"
 #include "common/config.h"
 
 #ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
 #include "tracing/osd.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
 #else
 #define tracepoint(...)
 #endif
 
-static coll_t META_COLL("meta");
-
 #define dout_subsys ceph_subsys_osd
 #undef dout_prefix
 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
 
+const double OSD::OSD_TICK_INTERVAL = 1.0;
+
 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
   return *_dout << "osd." << whoami << " " << epoch << " ";
 }
 
+void PGQueueable::RunVis::operator()(OpRequestRef &op) {
+  return osd->dequeue_op(pg, op, handle);
+}
+
+void PGQueueable::RunVis::operator()(PGSnapTrim &op) {
+  return pg->snap_trimmer(op.epoch_queued);
+}
+
+void PGQueueable::RunVis::operator()(PGScrub &op) {
+  return pg->scrub(op.epoch_queued, handle);
+}
+
 //Initial features in new superblock.
 //Features here are also automatically upgraded
 CompatSet OSD::get_osd_initial_compat_set() {
@@ -185,6 +199,7 @@ CompatSet OSD::get_osd_compat_set() {
 OSDService::OSDService(OSD *osd) :
   osd(osd),
   cct(osd->cct),
+  meta_osr(new ObjectStore::Sequencer("meta")),
   whoami(osd->whoami), store(osd->store),
   log_client(osd->log_client), clog(osd->clog),
   pg_recovery_stats(osd->pg_recovery_stats),
@@ -196,9 +211,6 @@ OSDService::OSDService(OSD *osd) :
   op_wq(osd->op_shardedwq),
   peering_wq(osd->peering_wq),
   recovery_wq(osd->recovery_wq),
-  snap_trim_wq(osd->snap_trim_wq),
-  scrub_wq(osd->scrub_wq),
-  rep_scrub_wq(osd->rep_scrub_wq),
   recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
 		  &osd->recovery_tp),
   op_gen_wq("op_gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->osd_tp),
@@ -212,6 +224,7 @@ OSDService::OSDService(OSD *osd) :
   agent_lock("OSD::agent_lock"),
   agent_valid_iterator(false),
   agent_ops(0),
+  flush_mode_high_count(0),
   agent_active(true),
   agent_thread(this),
   agent_stop_flag(false),
@@ -467,13 +480,17 @@ void OSDService::init()
   reserver_finisher.start();
   objecter_finisher.start();
   objecter->set_client_incarnation(0);
-  objecter->start();
   watch_timer.init();
   agent_timer.init();
 
   agent_thread.create();
 }
 
+void OSDService::final_init()
+{
+  objecter->start();
+}
+
 void OSDService::activate_map()
 {
   // wake/unwake the tiering agent
@@ -528,9 +545,13 @@ void OSDService::agent_entry()
     }
     PGRef pg = *agent_queue_pos;
     int max = g_conf->osd_agent_max_ops - agent_ops;
+    int agent_flush_quota = max;
+    if (!flush_mode_high_count)
+      agent_flush_quota = g_conf->osd_agent_max_low_ops - agent_ops;
+    dout(10) << "high_count " << flush_mode_high_count << " agent_ops " << agent_ops << " flush_quota " << agent_flush_quota << dendl;
     agent_lock.Unlock();
-    if (!pg->agent_work(max)) {
-      dout(10) << __func__ << " " << *pg
+    if (!pg->agent_work(max, agent_flush_quota)) {
+      dout(10) << __func__ << " " << pg->get_pgid()
 	<< " no agent_work, delay for " << g_conf->osd_agent_delay_time
 	<< " seconds" << dendl;
 
@@ -896,6 +917,21 @@ void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
   }
 }
 
+bool OSDService::can_inc_scrubs_pending()
+{
+  bool can_inc = false;
+  Mutex::Locker l(sched_scrub_lock);
+
+  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
+    dout(20) << __func__ << scrubs_pending << " -> " << (scrubs_pending+1)
+	     << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
+    can_inc = true;
+  } else {
+    dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
+  }
+
+  return can_inc;
+}
 
 bool OSDService::inc_scrubs_pending()
 {
@@ -1092,8 +1128,8 @@ bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
   bool found = map_bl_cache.lookup(e, &bl);
   if (found)
     return true;
-  found = store->read(
-    META_COLL, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
+  found = store->read(coll_t::meta(),
+		      OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
   if (found)
     _add_map_bl(e, bl);
   return found;
@@ -1105,8 +1141,8 @@ bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
   bool found = map_bl_inc_cache.lookup(e, &bl);
   if (found)
     return true;
-  found = store->read(
-    META_COLL, OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl) >= 0;
+  found = store->read(coll_t::meta(),
+		      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl) >= 0;
   if (found)
     _add_map_inc_bl(e, bl);
   return found;
@@ -1273,7 +1309,10 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
 
 void OSDService::dequeue_pg(PG *pg, list<OpRequestRef> *dequeued)
 {
-  osd->op_shardedwq.dequeue(pg, dequeued);
+  if (dequeued)
+    osd->op_shardedwq.dequeue_and_get_ops(pg, dequeued);
+  else
+    osd->op_shardedwq.dequeue(pg);
 }
 
 void OSDService::queue_for_peering(PG *pg)
@@ -1293,6 +1332,9 @@ int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
 {
   int ret;
 
+  ceph::shared_ptr<ObjectStore::Sequencer> osr(
+    new ObjectStore::Sequencer("mkfs"));
+
   try {
     // if we are fed a uuid for this osd, use it.
     store->set_fsid(cct->_conf->osd_uuid);
@@ -1309,22 +1351,9 @@ int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
       goto free_store;
     }
 
-    // age?
-    if (cct->_conf->osd_age_time != 0) {
-      if (cct->_conf->osd_age_time >= 0) {
-        dout(0) << "aging..." << dendl;
-        Ager ager(cct, store);
-        ager.age(cct->_conf->osd_age_time,
-          cct->_conf->osd_age,
-          cct->_conf->osd_age - .05,
-          50000,
-          cct->_conf->osd_age - .05);
-      }
-    }
-
     OSDSuperblock sb;
     bufferlist sbbl;
-    ret = store->read(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, sbbl);
+    ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, 0, sbbl);
     if (ret >= 0) {
       dout(0) << " have superblock" << dendl;
       if (whoami != sb.whoami) {
@@ -1350,44 +1379,13 @@ int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
       sb.whoami = whoami;
       sb.compat_features = get_osd_initial_compat_set();
 
-      // benchmark?
-      if (cct->_conf->osd_auto_weight) {
-	bufferlist bl;
-	bufferptr bp(1048576);
-	bp.zero();
-	bl.push_back(bp);
-	dout(0) << "testing disk bandwidth..." << dendl;
-	utime_t start = ceph_clock_now(cct);
-	object_t oid("disk_bw_test");
-	for (int i=0; i<1000; i++) {
-	  ObjectStore::Transaction *t = new ObjectStore::Transaction;
-	  t->write(META_COLL, hobject_t(sobject_t(oid, 0)), i*bl.length(), bl.length(), bl);
-	  store->queue_transaction_and_cleanup(NULL, t);
-	}
-	store->sync();
-	utime_t end = ceph_clock_now(cct);
-	end -= start;
-	dout(0) << "measured " << (1000.0 / (double)end) << " mb/sec" << dendl;
-	ObjectStore::Transaction tr;
-	tr.remove(META_COLL, hobject_t(sobject_t(oid, 0)));
-	ret = store->apply_transaction(tr);
-	if (ret) {
-	  derr << "OSD::mkfs: error while benchmarking: apply_transaction returned "
-	       << ret << dendl;
-	  goto umount_store;
-	}
-	
-	// set osd weight
-	sb.weight = (1000.0 / (double)end);
-      }
-
       bufferlist bl;
       ::encode(sb, bl);
 
       ObjectStore::Transaction t;
-      t.create_collection(META_COLL);
-      t.write(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
-      ret = store->apply_transaction(t);
+      t.create_collection(coll_t::meta(), 0);
+      t.write(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
+      ret = store->apply_transaction(osr.get(), t);
       if (ret) {
 	derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_POBJECT: "
 	     << "apply_transaction returned " << ret << dendl;
@@ -1395,7 +1393,10 @@ int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
       }
     }
 
-    store->sync_and_flush();
+    C_SaferCond waiter;
+    if (!osr->flush_commit(&waiter)) {
+      waiter.wait();
+    }
 
     ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
     if (ret) {
@@ -1500,6 +1501,8 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
   Dispatcher(cct_),
   osd_lock("OSD::osd_lock"),
   tick_timer(cct, osd_lock),
+  tick_timer_lock("OSD::tick_timer_lock"),
+  tick_timer_without_osd_lock(cct, tick_timer_lock),
   authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
 								      cct->_conf->auth_supported.empty() ?
 								      cct->_conf->auth_cluster_required :
@@ -1562,6 +1565,8 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
   outstanding_pg_stats(false),
   timeout_mon_on_pg_stats(true),
   up_thru_wanted(0), up_thru_pending(0),
+  requested_full_first(0),
+  requested_full_last(0),
   pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
   osd_stat_updated(false),
   pg_stat_tid(0), pg_stat_tid_flushed(0),
@@ -1577,21 +1582,6 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
     cct->_conf->osd_recovery_thread_suicide_timeout,
     &recovery_tp),
   replay_queue_lock("OSD::replay_queue_lock"),
-  snap_trim_wq(
-    this,
-    cct->_conf->osd_snap_trim_thread_timeout,
-    cct->_conf->osd_snap_trim_thread_suicide_timeout,
-    &disk_tp),
-  scrub_wq(
-    this,
-    cct->_conf->osd_scrub_thread_timeout,
-    cct->_conf->osd_scrub_thread_suicide_timeout,
-    &disk_tp),
-  rep_scrub_wq(
-    this,
-    cct->_conf->osd_scrub_thread_timeout,
-    cct->_conf->osd_scrub_thread_suicide_timeout,
-    &disk_tp),
   remove_wq(
     store,
     cct->_conf->osd_remove_thread_timeout,
@@ -1624,7 +1614,6 @@ void OSD::handle_signal(int signum)
 {
   assert(signum == SIGINT || signum == SIGTERM);
   derr << "*** Got signal " << sys_siglist[signum] << " ***" << dendl;
-  //suicide(128 + signum);
   shutdown();
 }
 
@@ -1677,12 +1666,20 @@ bool OSD::asok_command(string command, cmdmap_t& cmdmap, string format,
     }
     f->close_section();
   } else if (command == "flush_journal") {
-    store->sync_and_flush();
+    store->flush_journal();
   } else if (command == "dump_ops_in_flight" ||
 	     command == "ops") {
-    op_tracker.dump_ops_in_flight(f);
+    if (!op_tracker.tracking_enabled) {
+      ss << "op_tracker tracking is not enabled";
+    } else {
+      op_tracker.dump_ops_in_flight(f);
+    }
   } else if (command == "dump_historic_ops") {
-    op_tracker.dump_historic_ops(f);
+    if (!op_tracker.tracking_enabled) {
+      ss << "op_tracker tracking is not enabled";
+    } else {
+      op_tracker.dump_historic_ops(f);
+    }
   } else if (command == "dump_op_pq_state") {
     f->open_object_section("pq");
     op_shardedwq.dump(f);
@@ -1790,6 +1787,7 @@ int OSD::init()
     return 0;
 
   tick_timer.init();
+  tick_timer_without_osd_lock.init();
   service.backfill_request_timer.init();
 
   // mount.
@@ -1848,17 +1846,17 @@ int OSD::init()
     dout(5) << "Upgrading superblock adding: " << diff << dendl;
     ObjectStore::Transaction t;
     write_superblock(t);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(service.meta_osr.get(), t);
     if (r < 0)
       goto out;
   }
 
   // make sure snap mapper object exists
-  if (!store->exists(META_COLL, OSD::make_snapmapper_oid())) {
+  if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
     dout(10) << "init creating/touching snapmapper object" << dendl;
     ObjectStore::Transaction t;
-    t.touch(META_COLL, OSD::make_snapmapper_oid());
-    r = store->apply_transaction(t);
+    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
+    r = store->apply_transaction(service.meta_osr.get(), t);
     if (r < 0)
       goto out;
   }
@@ -1889,6 +1887,8 @@ int OSD::init()
     service.set_epochs(NULL, NULL, &bind_epoch);
   }
 
+  clear_temp_objects();
+
   // load up pgs (as they previously existed)
   load_pgs();
 
@@ -1928,6 +1928,10 @@ int OSD::init()
 
   // tick
   tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
+  {
+    Mutex::Locker l(tick_timer_lock);
+    tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
+  }
 
   service.init();
   service.publish_map(osdmap);
@@ -1951,6 +1955,10 @@ int OSD::init()
   if (is_stopping())
     return 0;
 
+  // start objecter *after* we have authenticated, so that we don't ignore
+  // the OSDMaps it requests.
+  service.final_init();
+
   check_config();
 
   dout(10) << "ensuring pgs have consumed prior maps" << dendl;
@@ -1959,6 +1967,9 @@ int OSD::init()
 
   dout(0) << "done with init, starting boot process" << dendl;
   set_state(STATE_BOOTING);
+
+  // we don't need to ask for an osdmap here; objecter will
+
   start_boot();
 
   return 0;
@@ -2073,18 +2084,27 @@ void OSD::final_init()
     "injectdataerr",
     "injectdataerr " \
     "name=pool,type=CephString " \
-    "name=objname,type=CephObjectname",
+    "name=objname,type=CephObjectname " \
+    "name=shardid,type=CephInt,req=false,range=0|255",
     test_ops_hook,
-    "inject data error into omap");
+    "inject data error to an object");
   assert(r == 0);
 
   r = admin_socket->register_command(
     "injectmdataerr",
     "injectmdataerr " \
     "name=pool,type=CephString " \
-    "name=objname,type=CephObjectname",
+    "name=objname,type=CephObjectname " \
+    "name=shardid,type=CephInt,req=false,range=0|255",
+    test_ops_hook,
+    "inject metadata error to an object");
+  assert(r == 0);
+  r = admin_socket->register_command(
+    "set_recovery_delay",
+    "set_recovery_delay " \
+    "name=utime,type=CephInt,req=false",
     test_ops_hook,
-    "inject metadata error");
+     "Delay osd recovery by specified seconds");
   assert(r == 0);
 }
 
@@ -2094,92 +2114,118 @@ void OSD::create_logger()
 
   PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
 
-  osd_plb.add_u64(l_osd_op_wip, "op_wip");   // rep ops currently being processed (primary)
-
-  osd_plb.add_u64_counter(l_osd_op,       "op");           // client ops
-  osd_plb.add_u64_counter(l_osd_op_inb,   "op_in_bytes");       // client op in bytes (writes)
-  osd_plb.add_u64_counter(l_osd_op_outb,  "op_out_bytes");      // client op out bytes (reads)
-  osd_plb.add_time_avg(l_osd_op_lat,   "op_latency");       // client op latency
-  osd_plb.add_time_avg(l_osd_op_process_lat, "op_process_latency");   // client op process latency
-
-  osd_plb.add_u64_counter(l_osd_op_r,      "op_r");        // client reads
-  osd_plb.add_u64_counter(l_osd_op_r_outb, "op_r_out_bytes");   // client read out bytes
-  osd_plb.add_time_avg(l_osd_op_r_lat,  "op_r_latency");    // client read latency
-  osd_plb.add_time_avg(l_osd_op_r_process_lat, "op_r_process_latency");   // client read process latency
-  osd_plb.add_u64_counter(l_osd_op_w,      "op_w");        // client writes
-  osd_plb.add_u64_counter(l_osd_op_w_inb,  "op_w_in_bytes");    // client write in bytes
-  osd_plb.add_time_avg(l_osd_op_w_rlat, "op_w_rlat");   // client write readable/applied latency
-  osd_plb.add_time_avg(l_osd_op_w_lat,  "op_w_latency");    // client write latency
-  osd_plb.add_time_avg(l_osd_op_w_process_lat, "op_w_process_latency");   // client write process latency
-  osd_plb.add_u64_counter(l_osd_op_rw,     "op_rw");       // client rmw
-  osd_plb.add_u64_counter(l_osd_op_rw_inb, "op_rw_in_bytes");   // client rmw in bytes
-  osd_plb.add_u64_counter(l_osd_op_rw_outb,"op_rw_out_bytes");  // client rmw out bytes
-  osd_plb.add_time_avg(l_osd_op_rw_rlat,"op_rw_rlat");  // client rmw readable/applied latency
-  osd_plb.add_time_avg(l_osd_op_rw_lat, "op_rw_latency");   // client rmw latency
-  osd_plb.add_time_avg(l_osd_op_rw_process_lat, "op_rw_process_latency");   // client rmw process latency
-
-  osd_plb.add_u64_counter(l_osd_sop,       "subop");         // subops
-  osd_plb.add_u64_counter(l_osd_sop_inb,   "subop_in_bytes");     // subop in bytes
-  osd_plb.add_time_avg(l_osd_sop_lat,   "subop_latency");     // subop latency
-
-  osd_plb.add_u64_counter(l_osd_sop_w,     "subop_w");          // replicated (client) writes
-  osd_plb.add_u64_counter(l_osd_sop_w_inb, "subop_w_in_bytes");      // replicated write in bytes
-  osd_plb.add_time_avg(l_osd_sop_w_lat, "subop_w_latency");      // replicated write latency
-  osd_plb.add_u64_counter(l_osd_sop_pull,     "subop_pull");       // pull request
-  osd_plb.add_time_avg(l_osd_sop_pull_lat, "subop_pull_latency");
-  osd_plb.add_u64_counter(l_osd_sop_push,     "subop_push");       // push (write)
-  osd_plb.add_u64_counter(l_osd_sop_push_inb, "subop_push_in_bytes");
-  osd_plb.add_time_avg(l_osd_sop_push_lat, "subop_push_latency");
-
-  osd_plb.add_u64_counter(l_osd_pull,      "pull");       // pull requests sent
-  osd_plb.add_u64_counter(l_osd_push,      "push");       // push messages
-  osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes");  // pushed bytes
-
-  osd_plb.add_u64_counter(l_osd_push_in,    "push_in");        // inbound push messages
-  osd_plb.add_u64_counter(l_osd_push_inb,   "push_in_bytes");  // inbound pushed bytes
-
-  osd_plb.add_u64_counter(l_osd_rop, "recovery_ops");       // recovery ops (started)
-
-  osd_plb.add_u64(l_osd_loadavg, "loadavg");
-  osd_plb.add_u64(l_osd_buf, "buffer_bytes");       // total ceph::buffer bytes
-
-  osd_plb.add_u64(l_osd_pg, "numpg");   // num pgs
-  osd_plb.add_u64(l_osd_pg_primary, "numpg_primary"); // num primary pgs
-  osd_plb.add_u64(l_osd_pg_replica, "numpg_replica"); // num replica pgs
-  osd_plb.add_u64(l_osd_pg_stray, "numpg_stray");   // num stray pgs
-  osd_plb.add_u64(l_osd_hb_to, "heartbeat_to_peers");     // heartbeat peers we send to
-  osd_plb.add_u64(l_osd_hb_from, "heartbeat_from_peers"); // heartbeat peers we recv from
-  osd_plb.add_u64_counter(l_osd_map, "map_messages");           // osdmap messages
-  osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs");         // osdmap epochs
-  osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups"); // dup osdmap epochs
-  osd_plb.add_u64_counter(l_osd_waiting_for_map,
-			  "messages_delayed_for_map"); // dup osdmap epochs
-
-  osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes");
-  osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used");
-  osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail");
-
-  osd_plb.add_u64_counter(l_osd_copyfrom, "copyfrom");
-
-  osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote");
-  osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush");
-  osd_plb.add_u64_counter(l_osd_tier_flush_fail, "tier_flush_fail");
-  osd_plb.add_u64_counter(l_osd_tier_try_flush, "tier_try_flush");
-  osd_plb.add_u64_counter(l_osd_tier_try_flush_fail, "tier_try_flush_fail");
-  osd_plb.add_u64_counter(l_osd_tier_evict, "tier_evict");
-  osd_plb.add_u64_counter(l_osd_tier_whiteout, "tier_whiteout");
-  osd_plb.add_u64_counter(l_osd_tier_dirty, "tier_dirty");
-  osd_plb.add_u64_counter(l_osd_tier_clean, "tier_clean");
-  osd_plb.add_u64_counter(l_osd_tier_delay, "tier_delay");
-  osd_plb.add_u64_counter(l_osd_tier_proxy_read, "tier_proxy_read");
-
-  osd_plb.add_u64_counter(l_osd_agent_wake, "agent_wake");
-  osd_plb.add_u64_counter(l_osd_agent_skip, "agent_skip");
-  osd_plb.add_u64_counter(l_osd_agent_flush, "agent_flush");
-  osd_plb.add_u64_counter(l_osd_agent_evict, "agent_evict");
-
-  osd_plb.add_u64_counter(l_osd_object_ctx_cache_hit, "object_ctx_cache_hit");
-  osd_plb.add_u64_counter(l_osd_object_ctx_cache_total, "object_ctx_cache_total");
+  osd_plb.add_u64(l_osd_op_wip, "op_wip",
+      "Replication operations currently being processed (primary)");   // rep ops currently being processed (primary)
+  osd_plb.add_u64_counter(l_osd_op,       "op",
+      "Client operations", "ops");           // client ops
+  osd_plb.add_u64_counter(l_osd_op_inb,   "op_in_bytes",
+      "Client operations total write size", "wr");       // client op in bytes (writes)
+  osd_plb.add_u64_counter(l_osd_op_outb,  "op_out_bytes",
+      "Client operations total read size", "rd");      // client op out bytes (reads)
+  osd_plb.add_time_avg(l_osd_op_lat,   "op_latency", 
+      "Latency of client operations (including queue time)", "lat");       // client op latency
+  osd_plb.add_time_avg(l_osd_op_process_lat, "op_process_latency", 
+      "Latency of client operations (excluding queue time)");   // client op process latency
+
+  osd_plb.add_u64_counter(l_osd_op_r,      "op_r", 
+      "Client read operations");        // client reads
+  osd_plb.add_u64_counter(l_osd_op_r_outb, "op_r_out_bytes", 
+      "Client data read");   // client read out bytes
+  osd_plb.add_time_avg(l_osd_op_r_lat,  "op_r_latency", 
+      "Latency of read operation (including queue time)");    // client read latency
+  osd_plb.add_time_avg(l_osd_op_r_process_lat, "op_r_process_latency", 
+      "Latency of read operation (excluding queue time)");   // client read process latency
+  osd_plb.add_u64_counter(l_osd_op_w,      "op_w", 
+      "Client write operations");        // client writes
+  osd_plb.add_u64_counter(l_osd_op_w_inb,  "op_w_in_bytes", 
+      "Client data written");    // client write in bytes
+  osd_plb.add_time_avg(l_osd_op_w_rlat, "op_w_rlat", 
+      "Client write operation readable/applied latency");   // client write readable/applied latency
+  osd_plb.add_time_avg(l_osd_op_w_lat,  "op_w_latency", 
+      "Latency of write operation (including queue time)");    // client write latency
+  osd_plb.add_time_avg(l_osd_op_w_process_lat, "op_w_process_latency", 
+      "Latency of write operation (excluding queue time)");   // client write process latency
+  osd_plb.add_u64_counter(l_osd_op_rw,     "op_rw", 
+      "Client read-modify-write operations");       // client rmw
+  osd_plb.add_u64_counter(l_osd_op_rw_inb, "op_rw_in_bytes", 
+      "Client read-modify-write operations write in");   // client rmw in bytes
+  osd_plb.add_u64_counter(l_osd_op_rw_outb,"op_rw_out_bytes", 
+      "Client read-modify-write operations read out ");  // client rmw out bytes
+  osd_plb.add_time_avg(l_osd_op_rw_rlat,"op_rw_rlat", 
+      "Client read-modify-write operation readable/applied latency");  // client rmw readable/applied latency
+  osd_plb.add_time_avg(l_osd_op_rw_lat, "op_rw_latency", 
+      "Latency of read-modify-write operation (including queue time)");   // client rmw latency
+  osd_plb.add_time_avg(l_osd_op_rw_process_lat, "op_rw_process_latency", 
+      "Latency of read-modify-write operation (excluding queue time)");   // client rmw process latency
+
+  osd_plb.add_u64_counter(l_osd_sop,       "subop", "Suboperations");         // subops
+  osd_plb.add_u64_counter(l_osd_sop_inb,   "subop_in_bytes", "Suboperations total size");     // subop in bytes
+  osd_plb.add_time_avg(l_osd_sop_lat,   "subop_latency", "Suboperations latency");     // subop latency
+
+  osd_plb.add_u64_counter(l_osd_sop_w,     "subop_w", "Replicated writes");          // replicated (client) writes
+  osd_plb.add_u64_counter(l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");      // replicated write in bytes
+  osd_plb.add_time_avg(l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");      // replicated write latency
+  osd_plb.add_u64_counter(l_osd_sop_pull,     "subop_pull", "Suboperations pull requests");       // pull request
+  osd_plb.add_time_avg(l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
+  osd_plb.add_u64_counter(l_osd_sop_push,     "subop_push", "Suboperations push messages");       // push (write)
+  osd_plb.add_u64_counter(l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
+  osd_plb.add_time_avg(l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
+
+  osd_plb.add_u64_counter(l_osd_pull,      "pull", "Pull requests sent");       // pull requests sent
+  osd_plb.add_u64_counter(l_osd_push,      "push", "Push messages sent");       // push messages
+  osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");  // pushed bytes
+
+  osd_plb.add_u64_counter(l_osd_push_in,    "push_in", "Inbound push messages");        // inbound push messages
+  osd_plb.add_u64_counter(l_osd_push_inb,   "push_in_bytes", "Inbound pushed size");  // inbound pushed bytes
+
+  osd_plb.add_u64_counter(l_osd_rop, "recovery_ops",
+      "Started recovery operations", "recop");       // recovery ops (started)
+
+  osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
+  osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");       // total ceph::buffer bytes
+
+  osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups");   // num pgs
+  osd_plb.add_u64(l_osd_pg_primary, "numpg_primary", "Placement groups for which this osd is primary"); // num primary pgs
+  osd_plb.add_u64(l_osd_pg_replica, "numpg_replica", "Placement groups for which this osd is replica"); // num replica pgs
+  osd_plb.add_u64(l_osd_pg_stray, "numpg_stray", "Placement groups ready to be deleted from this osd");   // num stray pgs
+  osd_plb.add_u64(l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");     // heartbeat peers we send to
+  osd_plb.add_u64(l_osd_hb_from, "heartbeat_from_peers", "Heartbeat (ping) peers we recv from"); // heartbeat peers we recv from
+  osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");           // osdmap messages
+  osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");         // osdmap epochs
+  osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); // dup osdmap epochs
+  osd_plb.add_u64_counter(l_osd_waiting_for_map, "messages_delayed_for_map", "Operations waiting for OSD map"); // dup osdmap epochs
+
+  osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
+  osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
+  osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
+
+  osd_plb.add_u64_counter(l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
+
+  osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
+  osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
+  osd_plb.add_u64_counter(l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
+  osd_plb.add_u64_counter(l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
+  osd_plb.add_u64_counter(l_osd_tier_try_flush_fail, "tier_try_flush_fail", "Failed tier flush attempts");
+  osd_plb.add_u64_counter(l_osd_tier_evict, "tier_evict", "Tier evictions");
+  osd_plb.add_u64_counter(l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
+  osd_plb.add_u64_counter(l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
+  osd_plb.add_u64_counter(l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
+  osd_plb.add_u64_counter(l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
+  osd_plb.add_u64_counter(l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
+  osd_plb.add_u64_counter(l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
+
+  osd_plb.add_u64_counter(l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
+  osd_plb.add_u64_counter(l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
+  osd_plb.add_u64_counter(l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
+  osd_plb.add_u64_counter(l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
+
+  osd_plb.add_u64_counter(l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
+  osd_plb.add_u64_counter(l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
+
+  osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
+  osd_plb.add_time_avg(l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
+  osd_plb.add_time_avg(l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
+  osd_plb.add_time_avg(l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
 
   logger = osd_plb.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
@@ -2191,67 +2237,40 @@ void OSD::create_recoverystate_perf()
 
   PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
 
-  rs_perf.add_time_avg(rs_initial_latency, "initial_latency");
-  rs_perf.add_time_avg(rs_started_latency, "started_latency");
-  rs_perf.add_time_avg(rs_reset_latency, "reset_latency");
-  rs_perf.add_time_avg(rs_start_latency, "start_latency");
-  rs_perf.add_time_avg(rs_primary_latency, "primary_latency");
-  rs_perf.add_time_avg(rs_peering_latency, "peering_latency");
-  rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency");
-  rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency");
-  rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency");
-  rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency");
-  rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency");
-  rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency");
-  rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency");
-  rs_perf.add_time_avg(rs_RepRecovering_latency, "RepRecovering_latency");
-  rs_perf.add_time_avg(rs_activating_latency, "activating_latency");
-  rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency");
-  rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency");
-  rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency");
-  rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency");
-  rs_perf.add_time_avg(rs_clean_latency, "clean_latency");
-  rs_perf.add_time_avg(rs_active_latency, "active_latency");
-  rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency");
-  rs_perf.add_time_avg(rs_stray_latency, "stray_latency");
-  rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency");
-  rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency");
-  rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency");
-  rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency");
-  rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency");
-  rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency");
+  rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
+  rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
+  rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
+  rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
+  rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
+  rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
+  rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
+  rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
+  rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
+  rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
+  rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
+  rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
+  rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
+  rs_perf.add_time_avg(rs_RepRecovering_latency, "RepRecovering_latency", "RepRecovering recovery state latency");
+  rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
+  rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
+  rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
+  rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
+  rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
+  rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
+  rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
+  rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
+  rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
+  rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
+  rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
+  rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
+  rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
+  rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
+  rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
 
   recoverystate_perf = rs_perf.create_perf_counters();
   cct->get_perfcounters_collection()->add(recoverystate_perf);
 }
 
-void OSD::suicide(int exitcode)
-{
-  if (cct->_conf->filestore_blackhole) {
-    derr << " filestore_blackhole=true, doing abbreviated shutdown" << dendl;
-    _exit(exitcode);
-  }
-
-  // turn off lockdep; the surviving threads tend to fight with exit() below
-  g_lockdep = 0;
-
-  derr << " pausing thread pools" << dendl;
-  osd_tp.pause();
-  osd_op_tp.pause();
-  disk_tp.pause();
-  recovery_tp.pause();
-  command_tp.pause();
-
-  derr << " flushing io" << dendl;
-  store->sync_and_flush();
-
-  derr << " removing pid file" << dendl;
-  pidfile_remove();
-
-  derr << " exit" << dendl;
-  exit(exitcode);
-}
-
 int OSD::shutdown()
 {
   if (!service.prepare_to_stop())
@@ -2318,6 +2337,7 @@ int OSD::shutdown()
   cct->get_admin_socket()->unregister_command("truncobj");
   cct->get_admin_socket()->unregister_command("injectdataerr");
   cct->get_admin_socket()->unregister_command("injectmdataerr");
+  cct->get_admin_socket()->unregister_command("set_recovery_delay");
   delete test_ops_hook;
   test_ops_hook = NULL;
 
@@ -2359,21 +2379,24 @@ int OSD::shutdown()
 
   tick_timer.shutdown();
 
+  {
+    Mutex::Locker l(tick_timer_lock);
+    tick_timer_without_osd_lock.shutdown();
+  }
+
   // note unmount epoch
   dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
   superblock.mounted = service.get_boot_epoch();
   superblock.clean_thru = osdmap->get_epoch();
   ObjectStore::Transaction t;
   write_superblock(t);
-  int r = store->apply_transaction(t);
+  int r = store->apply_transaction(service.meta_osr.get(), t);
   if (r) {
     derr << "OSD::shutdown: error writing superblock: "
 	 << cpp_strerror(r) << dendl;
   }
 
   dout(10) << "syncing store" << dendl;
-  store->flush();
-  store->sync();
   store->umount();
   delete store;
   store = 0;
@@ -2398,6 +2421,9 @@ int OSD::shutdown()
       if (p->second->ref.read() != 1) {
         derr << "pgid " << p->first << " has ref count of "
             << p->second->ref.read() << dendl;
+#ifdef PG_DEBUG_REFS
+	p->second->dump_live_ids();
+#endif
         assert(0);
       }
       p->second->unlock();
@@ -2435,19 +2461,18 @@ void OSD::write_superblock(ObjectStore::Transaction& t)
   dout(10) << "write_superblock " << superblock << dendl;
 
   //hack: at minimum it's using the baseline feature set
-  if (!superblock.compat_features.incompat.mask |
-      CEPH_OSD_FEATURE_INCOMPAT_BASE.id)
+  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
     superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
 
   bufferlist bl;
   ::encode(superblock, bl);
-  t.write(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
+  t.write(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
 }
 
 int OSD::read_superblock()
 {
   bufferlist bl;
-  int r = store->read(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
+  int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
   if (r < 0)
     return r;
 
@@ -2459,23 +2484,67 @@ int OSD::read_superblock()
   return 0;
 }
 
+void OSD::clear_temp_objects()
+{
+  dout(10) << __func__ << dendl;
+  vector<coll_t> ls;
+  store->list_collections(ls);
+  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+    spg_t pgid;
+    if (!p->is_pg(&pgid))
+      continue;
 
+    // list temp objects
+    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
+
+    vector<ghobject_t> temps;
+    ghobject_t next;
+    while (1) {
+      vector<ghobject_t> objects;
+      store->collection_list(*p, next, ghobject_t::get_max(), true,
+			     store->get_ideal_list_max(),
+			     &objects, &next);
+      if (objects.empty())
+	break;
+      vector<ghobject_t>::iterator q;
+      for (q = objects.begin(); q != objects.end(); ++q) {
+	if (q->hobj.is_temp()) {
+	  temps.push_back(*q);
+	} else {
+	  break;
+	}
+      }
+      // If we saw a non-temp object and hit the break above we can
+      // break out of the while loop too.
+      if (q != objects.end())
+	break;
+    }
+    if (!temps.empty()) {
+      ObjectStore::Transaction t;
+      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
+	dout(20) << "  removing " << *p << " object " << *q << dendl;
+	t.remove(*p, *q);
+      }
+      store->apply_transaction(service.meta_osr.get(), t);
+    }
+  }
+}
 
-void OSD::recursive_remove_collection(ObjectStore *store, coll_t tmp)
+void OSD::recursive_remove_collection(ObjectStore *store, spg_t pgid, coll_t tmp)
 {
   OSDriver driver(
     store,
     coll_t(),
     make_snapmapper_oid());
 
-  spg_t pg;
-  tmp.is_pg_prefix(pg);
-
+  ceph::shared_ptr<ObjectStore::Sequencer> osr(
+    new ObjectStore::Sequencer("rm"));
   ObjectStore::Transaction t;
-  SnapMapper mapper(&driver, 0, 0, 0, pg.shard);
+  SnapMapper mapper(&driver, 0, 0, 0, pgid.shard);
 
   vector<ghobject_t> objects;
-  store->collection_list(tmp, objects);
+  store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(), true,
+			 INT_MAX, &objects, 0);
 
   // delete them.
   unsigned removed = 0;
@@ -2488,16 +2557,20 @@ void OSD::recursive_remove_collection(ObjectStore *store, coll_t tmp)
       assert(0);
     t.remove(tmp, *p);
     if (removed > 300) {
-      int r = store->apply_transaction(t);
+      int r = store->apply_transaction(osr.get(), t);
       assert(r == 0);
       t = ObjectStore::Transaction();
       removed = 0;
     }
   }
   t.remove_collection(tmp);
-  int r = store->apply_transaction(t);
+  int r = store->apply_transaction(osr.get(), t);
   assert(r == 0);
-  store->sync_and_flush();
+
+  C_SaferCond waiter;
+  if (!osr->flush_commit(&waiter)) {
+    waiter.wait();
+  }
 }
 
 
@@ -2715,14 +2788,12 @@ PG *OSD::get_pg_or_queue_for_pg(const spg_t& pgid, OpRequestRef& op)
 
 bool OSD::_have_pg(spg_t pgid)
 {
-  assert(osd_lock.is_locked());
   RWLock::RLocker l(pg_map_lock);
   return pg_map.count(pgid);
 }
 
 PG *OSD::_lookup_lock_pg(spg_t pgid)
 {
-  assert(osd_lock.is_locked());
   RWLock::RLocker l(pg_map_lock);
   if (!pg_map.count(pgid))
     return NULL;
@@ -2734,7 +2805,6 @@ PG *OSD::_lookup_lock_pg(spg_t pgid)
 
 PG *OSD::_lookup_pg(spg_t pgid)
 {
-  assert(osd_lock.is_locked());
   RWLock::RLocker l(pg_map_lock);
   if (!pg_map.count(pgid))
     return NULL;
@@ -2744,7 +2814,6 @@ PG *OSD::_lookup_pg(spg_t pgid)
 
 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
 {
-  assert(osd_lock.is_locked());
   assert(pg_map.count(pgid));
   PG *pg = pg_map[pgid];
   pg->lock();
@@ -2766,33 +2835,21 @@ void OSD::load_pgs()
     derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
   }
 
-  set<spg_t> head_pgs;
-  map<spg_t, interval_set<snapid_t> > pgs;
+  set<spg_t> pgs;
   for (vector<coll_t>::iterator it = ls.begin();
        it != ls.end();
        ++it) {
     spg_t pgid;
-    snapid_t snap;
-    uint64_t seq;
-
-    if (it->is_temp(pgid) ||
-	it->is_removal(&seq, &pgid) ||
-	(it->is_pg(pgid, snap) &&
-	 PG::_has_removal_flag(store, pgid))) {
+    if (it->is_temp(&pgid) ||
+	it->is_removal(&pgid) ||
+	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
       dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
-      recursive_remove_collection(store, *it);
+      recursive_remove_collection(store, pgid, *it);
       continue;
     }
 
-    if (it->is_pg(pgid, snap)) {
-      if (snap != CEPH_NOSNAP) {
-	dout(10) << "load_pgs skipping snapped dir " << *it
-		 << " (pg " << pgid << " snap " << snap << ")" << dendl;
-	pgs[pgid].insert(snap);
-      } else {
-	pgs[pgid];
-	head_pgs.insert(pgid);
-      }
+    if (it->is_pg(&pgid)) {
+      pgs.insert(pgid);
       continue;
     }
 
@@ -2800,16 +2857,8 @@ void OSD::load_pgs()
   }
 
   bool has_upgraded = false;
-  for (map<spg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
-       i != pgs.end();
-       ++i) {
-    spg_t pgid(i->first);
-
-    if (!head_pgs.count(pgid)) {
-      dout(10) << __func__ << ": " << pgid << " has orphan snap collections " << i->second
-	       << " with no head" << dendl;
-      continue;
-    }
+  for (set<spg_t>::iterator i = pgs.begin(); i != pgs.end(); ++i) {
+    spg_t pgid(*i);
 
     if (pgid.preferred() >= 0) {
       dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
@@ -2866,21 +2915,7 @@ void OSD::load_pgs()
       }
       dout(10) << "PG " << pg->info.pgid
 	       << " must upgrade..." << dendl;
-      pg->upgrade(store, i->second);
-    } else if (!i->second.empty()) {
-      // handle upgrade bug
-      for (interval_set<snapid_t>::iterator j = i->second.begin();
-	   j != i->second.end();
-	   ++j) {
-	for (snapid_t k = j.get_start();
-	     k != j.get_start() + j.get_len();
-	     ++k) {
-	  assert(store->collection_empty(coll_t(pgid, k)));
-	  ObjectStore::Transaction t;
-	  t.remove_collection(coll_t(pgid, k));
-	  store->apply_transaction(t);
-	}
-      }
+      pg->upgrade(store);
     }
 
     service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
@@ -2912,11 +2947,11 @@ void OSD::load_pgs()
   }
 
   // clean up old infos object?
-  if (has_upgraded && store->exists(META_COLL, OSD::make_infos_oid())) {
+  if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
     dout(1) << __func__ << " removing legacy infos object" << dendl;
     ObjectStore::Transaction t;
-    t.remove(META_COLL, OSD::make_infos_oid());
-    int r = store->apply_transaction(t);
+    t.remove(coll_t::meta(), OSD::make_infos_oid());
+    int r = store->apply_transaction(service.meta_osr.get(), t);
     if (r != 0) {
       derr << __func__ << ": apply_transaction returned "
 	   << cpp_strerror(r) << dendl;
@@ -2960,8 +2995,11 @@ void OSD::build_past_intervals_parallel()
       PG *pg = i->second;
 
       epoch_t start, end;
-      if (!pg->_calc_past_interval_range(&start, &end, superblock.oldest_map))
+      if (!pg->_calc_past_interval_range(&start, &end, superblock.oldest_map)) {
+        if (pg->info.history.same_interval_since == 0)
+          pg->info.history.same_interval_since = end;
         continue;
+      }
 
       dout(10) << pg->info.pgid << " needs " << start << "-" << end << dendl;
       pistate& p = pis[pg];
@@ -3045,6 +3083,24 @@ void OSD::build_past_intervals_parallel()
     }
   }
 
+  // Now that past_intervals have been recomputed let's fix the same_interval_since
+  // if it was cleared by import.
+  for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
+    PG *pg = i->first;
+    pistate& p = i->second;
+
+    // Verify same_interval_since is correct
+    if (pg->info.history.same_interval_since) {
+      assert(pg->info.history.same_interval_since == p.same_interval_since);
+    } else {
+      assert(p.same_interval_since);
+      dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
+      dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
+      // Fix it
+      pg->info.history.same_interval_since = p.same_interval_since;
+    }
+  }
+
   // write info only at the end.  this is necessary because we check
   // whether the past_intervals go far enough back or forward in time,
   // but we don't check for holes.  we could avoid it by discarding
@@ -3062,13 +3118,13 @@ void OSD::build_past_intervals_parallel()
 
     // don't let the transaction get too big
     if (++num >= cct->_conf->osd_target_transaction_size) {
-      store->apply_transaction(t);
+      store->apply_transaction(service.meta_osr.get(), t);
       t = ObjectStore::Transaction();
       num = 0;
     }
   }
   if (!t.empty())
-    store->apply_transaction(t);
+    store->apply_transaction(service.meta_osr.get(), t);
 }
 
 /*
@@ -3148,7 +3204,7 @@ void OSD::handle_pg_peering_evt(
     switch (result) {
     case RES_NONE: {
       const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
-      PG::_create(*rctx.transaction, pgid);
+      PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
       PG::_init(*rctx.transaction, pgid, pp);
 
       PG *pg = _create_lock_pg(
@@ -3961,10 +4017,6 @@ void OSD::tick()
     // periodically kick recovery work queue
     recovery_tp.wake();
 
-    if (!scrub_random_backoff()) {
-      sched_scrub();
-    }
-
     check_replay_queue();
   }
 
@@ -3980,7 +4032,18 @@ void OSD::tick()
 
   check_ops_in_flight();
 
-  tick_timer.add_event_after(1.0, new C_Tick(this));
+  tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
+}
+
+void OSD::tick_without_osd_lock()
+{
+  assert(tick_timer_lock.is_locked());
+  dout(5) << "tick_without_osd_lock" << dendl;
+
+  if (!scrub_random_backoff()) {
+    sched_scrub();
+  }
+  tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
 }
 
 void OSD::check_ops_in_flight()
@@ -4002,8 +4065,10 @@ void OSD::check_ops_in_flight()
 //   setomapheader <pool-id> [namespace/]<obj-name> <header>
 //   getomap <pool> [namespace/]<obj-name>
 //   truncobj <pool-id> [namespace/]<obj-name> <newlen>
-//   injectmdataerr [namespace/]<obj-name>
-//   injectdataerr [namespace/]<obj-name>
+//   injectmdataerr [namespace/]<obj-name> [shardid]
+//   injectdataerr [namespace/]<obj-name> [shardid]
+//
+//   set_recovery_delay [utime]
 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
      std::string command, cmdmap_t& cmdmap, ostream &ss)
 {
@@ -4046,13 +4111,19 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
       ss << "Invalid namespace/objname";
       return;
     }
+
+    int64_t shardid;
+    cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
+    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
+    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
+    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
     if (curmap->pg_is_ec(rawpg)) {
-      ss << "Must not call on ec pool";
-      return;
+        if ((command != "injectdataerr") && (command != "injectmdataerr")) {
+            ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
+            return;
+        }
     }
-    spg_t pgid = spg_t(curmap->raw_pg_to_pg(rawpg), shard_id_t::NO_SHARD);
 
-    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
     ObjectStore::Transaction t;
 
     if (command == "setomapval") {
@@ -4064,8 +4135,8 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
 
       val.append(valstr);
       newattrs[key] = val;
-      t.omap_setkeys(coll_t(pgid), obj, newattrs);
-      r = store->apply_transaction(t);
+      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
+      r = store->apply_transaction(service->meta_osr.get(), t);
       if (r < 0)
         ss << "error=" << r;
       else
@@ -4076,8 +4147,8 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
       cmd_getval(service->cct, cmdmap, "key", key);
 
       keys.insert(key);
-      t.omap_rmkeys(coll_t(pgid), obj, keys);
-      r = store->apply_transaction(t);
+      t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
+      r = store->apply_transaction(service->meta_osr.get(), t);
       if (r < 0)
         ss << "error=" << r;
       else
@@ -4088,8 +4159,8 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
 
       cmd_getval(service->cct, cmdmap, "header", headerstr);
       newheader.append(headerstr);
-      t.omap_setheader(coll_t(pgid), obj, newheader);
-      r = store->apply_transaction(t);
+      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
+      r = store->apply_transaction(service->meta_osr.get(), t);
       if (r < 0)
         ss << "error=" << r;
       else
@@ -4098,7 +4169,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
       //Debug: Output entire omap
       bufferlist hdrbl;
       map<string, bufferlist> keyvals;
-      r = store->omap_get(coll_t(pgid), obj, &hdrbl, &keyvals);
+      r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
       if (r >= 0) {
           ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
           for (map<string, bufferlist>::iterator it = keyvals.begin();
@@ -4111,21 +4182,39 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
     } else if (command == "truncobj") {
       int64_t trunclen;
       cmd_getval(service->cct, cmdmap, "len", trunclen);
-      t.truncate(coll_t(pgid), obj, trunclen);
-      r = store->apply_transaction(t);
+      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
+      r = store->apply_transaction(service->meta_osr.get(), t);
       if (r < 0)
 	ss << "error=" << r;
       else
 	ss << "ok";
     } else if (command == "injectdataerr") {
-      store->inject_data_error(obj);
+      store->inject_data_error(gobj);
       ss << "ok";
     } else if (command == "injectmdataerr") {
-      store->inject_mdata_error(obj);
+      store->inject_mdata_error(gobj);
       ss << "ok";
     }
     return;
   }
+  if (command == "set_recovery_delay") {
+    int64_t delay;
+    cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
+    ostringstream oss;
+    oss << delay;
+    int r = service->cct->_conf->set_val("osd_recovery_delay_start",
+					 oss.str().c_str());
+    if (r != 0) {
+      ss << "set_recovery_delay: error setting "
+	 << "osd_recovery_delay_start to '" << delay << "': error "
+	 << r;
+      return;
+    }
+    service->cct->_conf->apply_changes(NULL);
+    ss << "set_recovery_delay: set osd_recovery_delay_start "
+       << "to " << service->cct->_conf->osd_recovery_delay_start;
+    return;
+  }
   ss << "Internal error - command=" << command;
   return;
 }
@@ -4137,50 +4226,48 @@ bool remove_dir(
   OSDriver *osdriver,
   ObjectStore::Sequencer *osr,
   coll_t coll, DeletingStateRef dstate,
+  bool *finished,
   ThreadPool::TPHandle &handle)
 {
   vector<ghobject_t> olist;
   int64_t num = 0;
   ObjectStore::Transaction *t = new ObjectStore::Transaction;
   ghobject_t next;
-  while (!next.is_max()) {
-    handle.reset_tp_timeout();
-    store->collection_list_partial(
-      coll,
-      next,
-      store->get_ideal_list_min(),
-      store->get_ideal_list_max(),
-      0,
-      &olist,
-      &next);
-    for (vector<ghobject_t>::iterator i = olist.begin();
-	 i != olist.end();
-	 ++i, ++num) {
-      if (i->is_pgmeta())
-	continue;
-      OSDriver::OSTransaction _t(osdriver->get_transaction(t));
-      int r = mapper->remove_oid(i->hobj, &_t);
-      if (r != 0 && r != -ENOENT) {
-	assert(0);
-      }
-      t->remove(coll, *i);
-      if (num >= cct->_conf->osd_target_transaction_size) {
-	C_SaferCond waiter;
-	store->queue_transaction(osr, t, &waiter);
-	bool cont = dstate->pause_clearing();
-	handle.suspend_tp_timeout();
-	waiter.wait();
-	handle.reset_tp_timeout();
-	if (cont)
-	  cont = dstate->resume_clearing();
-	delete t;
-	if (!cont)
-	  return false;
-	t = new ObjectStore::Transaction;
-	num = 0;
-      }
+  handle.reset_tp_timeout();
+  store->collection_list(
+    coll,
+    next,
+    ghobject_t::get_max(),
+    true,
+    store->get_ideal_list_max(),
+    &olist,
+    &next);
+  for (vector<ghobject_t>::iterator i = olist.begin();
+       i != olist.end();
+       ++i, ++num) {
+    if (i->is_pgmeta())
+      continue;
+    OSDriver::OSTransaction _t(osdriver->get_transaction(t));
+    int r = mapper->remove_oid(i->hobj, &_t);
+    if (r != 0 && r != -ENOENT) {
+      assert(0);
+    }
+    t->remove(coll, *i);
+    if (num >= cct->_conf->osd_target_transaction_size) {
+      C_SaferCond waiter;
+      store->queue_transaction(osr, t, &waiter);
+      bool cont = dstate->pause_clearing();
+      handle.suspend_tp_timeout();
+      waiter.wait();
+      handle.reset_tp_timeout();
+      if (cont)
+        cont = dstate->resume_clearing();
+      delete t;
+      if (!cont)
+	return false;
+      t = new ObjectStore::Transaction;
+      num = 0;
     }
-    olist.clear();
   }
 
   C_SaferCond waiter;
@@ -4192,6 +4279,8 @@ bool remove_dir(
   if (cont)
     cont = dstate->resume_clearing();
   delete t;
+  // whether there are more objects to remove in the collection
+  *finished = next.is_max();
   return cont;
 }
 
@@ -4204,20 +4293,20 @@ void OSD::RemoveWQ::_process(
   OSDriver &driver = pg->osdriver;
   coll_t coll = coll_t(pg->info.pgid);
   pg->osr->flush();
+  bool finished = false;
 
-  if (!item.second->start_clearing())
+  if (!item.second->start_or_resume_clearing())
     return;
 
-  list<coll_t> colls_to_remove;
-  pg->get_colls(&colls_to_remove);
-  for (list<coll_t>::iterator i = colls_to_remove.begin();
-       i != colls_to_remove.end();
-       ++i) {
-    bool cont = remove_dir(
-      pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second,
-      handle);
-    if (!cont)
-      return;
+  bool cont = remove_dir(
+    pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
+    &finished, handle);
+  if (!cont)
+    return;
+  if (!finished) {
+    if (item.second->pause_clearing())
+      queue_front(item);
+    return;
   }
 
   if (!item.second->start_deleting())
@@ -4226,15 +4315,11 @@ void OSD::RemoveWQ::_process(
   ObjectStore::Transaction *t = new ObjectStore::Transaction;
   PGLog::clear_info_log(pg->info.pgid, t);
 
-  for (list<coll_t>::iterator i = colls_to_remove.begin();
-       i != colls_to_remove.end();
-       ++i) {
-    if (g_conf->osd_inject_failure_on_pg_removal) {
-      generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
-      exit(1);
-    }
-    t->remove_collection(*i);
+  if (g_conf->osd_inject_failure_on_pg_removal) {
+    generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
+    exit(1);
   }
+  t->remove_collection(coll);
 
   // We need the sequencer to stick around until the op is complete
   store->queue_transaction(
@@ -4275,6 +4360,10 @@ void OSD::ms_handle_connect(Connection *con)
     if (is_booting()) {
       start_boot();
     } else {
+      utime_t now = ceph_clock_now(NULL);
+      last_mon_report = now;
+
+      // resend everything, it's a new session
       send_alive();
       service.send_pg_temp();
       send_failures();
@@ -4284,6 +4373,15 @@ void OSD::ms_handle_connect(Connection *con)
       monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
       monc->renew_subs();
     }
+
+    // full map requests may happen while active or pre-boot
+    if (requested_full_first) {
+      epoch_t first = requested_full_first;
+      epoch_t last = requested_full_last;
+      requested_full_first = 0;
+      requested_full_last = 0;
+      request_full_map(first, last);
+    }
   }
 }
 
@@ -4369,6 +4467,13 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest)
   // if our map within recent history, try to add ourselves to the osdmap.
   if (osdmap->test_flag(CEPH_OSDMAP_NOUP)) {
     dout(5) << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
+  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) &&
+	     !store->can_sort_nibblewise()) {
+    dout(1) << "osdmap SORTBITWISE flag is NOT set but our backend does not support nibblewise sort" << dendl;
+  } else if (osdmap->get_num_up_osds() &&
+	     (osdmap->get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4) == 0) {
+    dout(1) << "osdmap indicates one or more pre-v0.94.4 hammer OSDs is running"
+	    << dendl;
   } else if (is_waiting_for_healthy() || !_is_healthy()) {
     // if we are not healthy, do not mark ourselves up (yet)
     dout(1) << "not healthy; waiting to boot" << dendl;
@@ -4487,53 +4592,6 @@ void OSD::_send_boot()
   monc->send_mon_message(mboot);
 }
 
-bool OSD::_lsb_release_set (char *buf, const char *str, map<string,string> *pm, const char *key)
-{
-  if (strncmp (buf, str, strlen (str)) == 0) {
-    char *value;
-
-    if (buf[strlen(buf)-1] == '\n')
-      buf[strlen(buf)-1] = '\0';
-
-    value = buf + strlen (str) + 1;
-    (*pm)[key] = value;
-
-    return true;
-  }
-  return false;
-}
-
-void OSD::_lsb_release_parse (map<string,string> *pm)
-{
-  FILE *fp = NULL;
-  char buf[512];
-
-  fp = popen("lsb_release -idrc", "r");
-  if (!fp) {
-    int ret = -errno;
-    derr << "lsb_release_parse - failed to call lsb_release binary with error: " << cpp_strerror(ret) << dendl;
-    return;
-  }
-
-  while (fgets(buf, sizeof(buf) - 1, fp) != NULL) {
-    if (_lsb_release_set(buf, "Distributor ID:", pm, "distro")) 
-      continue;
-    if (_lsb_release_set(buf, "Description:", pm, "distro_description"))
-      continue;
-    if (_lsb_release_set(buf, "Release:", pm, "distro_version"))
-      continue;
-    if (_lsb_release_set(buf, "Codename:", pm, "distro_codename"))
-      continue;
-    
-    derr << "unhandled output: " << buf << dendl;
-  }
-
-  if (pclose(fp)) {
-    int ret = -errno;
-    derr << "lsb_release_parse - pclose failed: " << cpp_strerror(ret) << dendl;
-  }
-}
-
 void OSD::_collect_metadata(map<string,string> *pm)
 {
   (*pm)["ceph_version"] = pretty_version_to_str();
@@ -4550,64 +4608,7 @@ void OSD::_collect_metadata(map<string,string> *pm)
   (*pm)["osd_objectstore"] = g_conf->osd_objectstore;
   store->collect_metadata(pm);
 
-  // kernel info
-  struct utsname u;
-  int r = uname(&u);
-  if (r >= 0) {
-    (*pm)["os"] = u.sysname;
-    (*pm)["kernel_version"] = u.release;
-    (*pm)["kernel_description"] = u.version;
-    (*pm)["hostname"] = u.nodename;
-    (*pm)["arch"] = u.machine;
-  }
-
-  // memory
-  FILE *f = fopen("/proc/meminfo", "r");
-  if (f) {
-    char buf[100];
-    while (!feof(f)) {
-      char *line = fgets(buf, sizeof(buf), f);
-      if (!line)
-	break;
-      char key[40];
-      long long value;
-      int r = sscanf(line, "%s %lld", key, &value);
-      if (r == 2) {
-	if (strcmp(key, "MemTotal:") == 0)
-	  (*pm)["mem_total_kb"] = stringify(value);
-	else if (strcmp(key, "SwapTotal:") == 0)
-	  (*pm)["mem_swap_kb"] = stringify(value);
-      }
-    }
-    fclose(f);
-  }
-
-  // processor
-  f = fopen("/proc/cpuinfo", "r");
-  if (f) {
-    char buf[100];
-    while (!feof(f)) {
-      char *line = fgets(buf, sizeof(buf), f);
-      if (!line)
-	break;
-      if (strncmp(line, "model name", 10) == 0) {
-	char *c = strchr(buf, ':');
-	c++;
-	while (*c == ' ')
-	  ++c;
-	char *nl = c;
-	while (*nl != '\n')
-	  ++nl;
-	*nl = '\0';
-	(*pm)["cpu"] = c;
-	break;
-      }
-    }
-    fclose(f);
-  }
-
-  // distro info
-  _lsb_release_parse(pm); 
+  collect_sys_info(pm, g_ceph_context);
 
   dout(10) << __func__ << " " << *pm << dendl;
 }
@@ -4646,6 +4647,65 @@ void OSD::send_alive()
   }
 }
 
+void OSD::request_full_map(epoch_t first, epoch_t last)
+{
+  dout(10) << __func__ << " " << first << ".." << last
+	   << ", previously requested "
+	   << requested_full_first << ".." << requested_full_last << dendl;
+  assert(osd_lock.is_locked());
+  assert(first > 0 && last > 0);
+  assert(first <= last);
+  assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
+  if (requested_full_first == 0) {
+    // first request
+    requested_full_first = first;
+    requested_full_last = last;
+  } else if (last <= requested_full_last) {
+    // dup
+    return;
+  } else {
+    // additional request
+    first = requested_full_last + 1;
+    requested_full_last = last;
+  }
+  MMonGetOSDMap *req = new MMonGetOSDMap;
+  req->request_full(first, last);
+  monc->send_mon_message(req);
+}
+
+void OSD::got_full_map(epoch_t e)
+{
+  assert(requested_full_first <= requested_full_last);
+  assert(osd_lock.is_locked());
+  if (requested_full_first == 0) {
+    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
+    return;
+  }
+  if (e < requested_full_first) {
+    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+	     << ".." << requested_full_last
+	     << ", ignoring" << dendl;
+    return;
+  }
+  if (e > requested_full_first) {
+    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+	     << ".." << requested_full_last << ", resetting" << dendl;
+    requested_full_first = requested_full_last = 0;
+    return;
+  }
+  if (requested_full_first == requested_full_last) {
+    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+	     << ".." << requested_full_last
+	     << ", now done" << dendl;
+    requested_full_first = requested_full_last = 0;
+  } else {
+    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+	     << ".." << requested_full_last
+	     << ", still need more" << dendl;
+    ++requested_full_first;
+  }
+}
+
 void OSD::send_failures()
 {
   assert(osd_lock.is_locked());
@@ -4896,7 +4956,9 @@ COMMAND("cluster_log " \
 	"osd", "rw", "cli,rest")
 COMMAND("bench " \
 	"name=count,type=CephInt,req=false " \
-	"name=size,type=CephInt,req=false ", \
+	"name=size,type=CephInt,req=false " \
+	"name=object_size,type=CephInt,req=false " \
+	"name=object_num,type=CephInt,req=false ", \
 	"OSD benchmark: write <count> <size>-byte objects, " \
 	"(default 1G size 4MB). Results in log.",
 	"osd", "rw", "cli,rest")
@@ -4905,7 +4967,7 @@ COMMAND("heap " \
 	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
 	"show heap usage info (available only if compiled with tcmalloc)", \
 	"osd", "rw", "cli,rest")
-COMMAND("debug_dump_missing " \
+COMMAND("debug dump_missing " \
 	"name=filename,type=CephFilepath",
 	"dump missing objects to a named file", "osd", "r", "cli,rest")
 COMMAND("debug kick_recovery_wq " \
@@ -5070,9 +5132,15 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
   else if (prefix == "bench") {
     int64_t count;
     int64_t bsize;
+    int64_t osize, onum;
     // default count 1G, size 4MB
     cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
     cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
+    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
+    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
+
+    ceph::shared_ptr<ObjectStore::Sequencer> osr(
+      new ObjectStore::Sequencer("bench"));
 
     uint32_t duration = g_conf->osd_bench_duration;
 
@@ -5131,30 +5199,74 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
     dout(1) << " bench count " << count
             << " bsize " << prettybyte_t(bsize) << dendl;
 
+    ObjectStore::Transaction *cleanupt = new ObjectStore::Transaction;
+
+    if (osize && onum) {
+      bufferlist bl;
+      bufferptr bp(osize);
+      bp.zero();
+      bl.push_back(bp);
+      bl.rebuild_page_aligned();
+      for (int i=0; i<onum; ++i) {
+	char nm[30];
+	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
+	object_t oid(nm);
+	hobject_t soid(sobject_t(oid, 0));
+	ObjectStore::Transaction *t = new ObjectStore::Transaction;
+	t->write(coll_t(), ghobject_t(soid), 0, osize, bl);
+	store->queue_transaction_and_cleanup(osr.get(), t);
+	cleanupt->remove(coll_t(), ghobject_t(soid));
+      }
+    }
+
     bufferlist bl;
     bufferptr bp(bsize);
     bp.zero();
     bl.push_back(bp);
+    bl.rebuild_page_aligned();
 
-    ObjectStore::Transaction *cleanupt = new ObjectStore::Transaction;
+    {
+      C_SaferCond waiter;
+      if (!osr->flush_commit(&waiter)) {
+	waiter.wait();
+      }
+    }
 
-    store->sync_and_flush();
     utime_t start = ceph_clock_now(cct);
     for (int64_t pos = 0; pos < count; pos += bsize) {
       char nm[30];
-      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
+      unsigned offset = 0;
+      if (onum && osize) {
+	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
+	offset = rand() % (osize / bsize) * bsize;
+      } else {
+	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
+      }
       object_t oid(nm);
       hobject_t soid(sobject_t(oid, 0));
       ObjectStore::Transaction *t = new ObjectStore::Transaction;
-      t->write(META_COLL, soid, 0, bsize, bl);
-      store->queue_transaction_and_cleanup(NULL, t);
-      cleanupt->remove(META_COLL, soid);
+      t->write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
+      store->queue_transaction_and_cleanup(osr.get(), t);
+      if (!onum || !osize)
+	cleanupt->remove(coll_t::meta(), ghobject_t(soid));
+    }
+
+    {
+      C_SaferCond waiter;
+      if (!osr->flush_commit(&waiter)) {
+	waiter.wait();
+      }
     }
-    store->sync_and_flush();
     utime_t end = ceph_clock_now(cct);
 
     // clean up
-    store->queue_transaction_and_cleanup(NULL, cleanupt);
+    store->queue_transaction_and_cleanup(osr.get(), cleanupt);
+    {
+      C_SaferCond waiter;
+      if (!osr->flush_commit(&waiter)) {
+	waiter.wait();
+      }
+    }
 
     uint64_t rate = (double)count / (end - start);
     if (f) {
@@ -5215,9 +5327,9 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
       pg->lock();
 
       fout << *pg << std::endl;
-      std::map<hobject_t, pg_missing_t::item>::const_iterator mend =
+      std::map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::const_iterator mend =
 	pg->pg_log.get_missing().missing.end();
-      std::map<hobject_t, pg_missing_t::item>::const_iterator mi =
+      std::map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::const_iterator mi =
 	pg->pg_log.get_missing().missing.begin();
       for (; mi != mend; ++mi) {
 	fout << mi->first << " -> " << mi->second << std::endl;
@@ -5652,6 +5764,8 @@ epoch_t op_required_epoch(OpRequestRef op)
     return replica_op_required_epoch<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
   case MSG_OSD_EC_READ_REPLY:
     return replica_op_required_epoch<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
+  case MSG_OSD_REP_SCRUB:
+    return replica_op_required_epoch<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
   default:
     assert(0);
     return 0;
@@ -5665,7 +5779,6 @@ void OSD::dispatch_op(OpRequestRef op)
   case MSG_OSD_PG_CREATE:
     handle_pg_create(op);
     break;
-
   case MSG_OSD_PG_NOTIFY:
     handle_pg_notify(op);
     break;
@@ -5766,6 +5879,9 @@ bool OSD::dispatch_op_fast(OpRequestRef& op, OSDMapRef& osdmap)
   case MSG_OSD_EC_READ_REPLY:
     handle_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op, osdmap);
     break;
+  case MSG_OSD_REP_SCRUB:
+    handle_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op, osdmap);
+    break;
   default:
     assert(0);
   }
@@ -5810,20 +5926,17 @@ void OSD::_dispatch(Message *m)
     handle_scrub(static_cast<MOSDScrub*>(m));
     break;
 
-  case MSG_OSD_REP_SCRUB:
-    handle_rep_scrub(static_cast<MOSDRepScrub*>(m));
-    break;
-
     // -- need OSDMap --
 
   default:
     {
       OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
-      op->mark_event("waiting_for_osdmap");
       // no map?  starting up?
       if (!osdmap) {
         dout(7) << "no OSDMap, not booted" << dendl;
+	logger->inc(l_osd_waiting_for_map);
         waiting_for_osdmap.push_back(op);
+	op->mark_delayed("no osdmap");
         break;
       }
       
@@ -5836,26 +5949,6 @@ void OSD::_dispatch(Message *m)
 
 }
 
-void OSD::handle_rep_scrub(MOSDRepScrub *m)
-{
-  dout(10) << __func__ << " " << *m << dendl;
-  if (!require_self_aliveness(m, m->map_epoch)) {
-    m->put();
-    return;
-  }
-  if (!require_osd_peer(m)) {
-    m->put();
-    return;
-  }
-  if (osdmap->get_epoch() >= m->map_epoch &&
-      !require_same_peer_instance(m, osdmap, true)) {
-    m->put();
-    return;
-  }
-
-  rep_scrub_wq.queue(m);
-}
-
 void OSD::handle_scrub(MOSDScrub *m)
 {
   dout(10) << "handle_scrub " << *m << dendl;
@@ -5919,6 +6012,30 @@ bool OSD::scrub_random_backoff()
   return false;
 }
 
+OSDService::ScrubJob::ScrubJob(const spg_t& pg, const utime_t& timestamp, bool must)
+  : pgid(pg),
+    sched_time(timestamp),
+    deadline(timestamp)
+{
+  // if not explicitly requested, postpone the scrub with a random delay
+  if (!must) {
+    sched_time += g_conf->osd_scrub_min_interval;
+    if (g_conf->osd_scrub_interval_randomize_ratio > 0) {
+      sched_time += rand() % (int)(g_conf->osd_scrub_min_interval *
+				   g_conf->osd_scrub_interval_randomize_ratio);
+    }
+    deadline += g_conf->osd_scrub_max_interval;
+  }
+}
+
+bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
+  if (sched_time < rhs.sched_time)
+    return true;
+  if (sched_time > rhs.sched_time)
+    return false;
+  return pgid < rhs.pgid;
+}
+
 bool OSD::scrub_time_permit(utime_t now)
 {
   struct tm bdt; 
@@ -5935,91 +6052,78 @@ bool OSD::scrub_time_permit(utime_t now)
     }    
   }
   if (!time_permit) {
-    dout(20) << "scrub_should_schedule should run between " << cct->_conf->osd_scrub_begin_hour
+    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
             << " - " << cct->_conf->osd_scrub_end_hour
             << " now " << bdt.tm_hour << " = no" << dendl;
   } else {
-    dout(20) << "scrub_should_schedule should run between " << cct->_conf->osd_scrub_begin_hour
+    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
             << " - " << cct->_conf->osd_scrub_end_hour
             << " now " << bdt.tm_hour << " = yes" << dendl;
   }
   return time_permit;
 }
 
-bool OSD::scrub_should_schedule()
+bool OSD::scrub_load_below_threshold()
 {
-  if (!scrub_time_permit(ceph_clock_now(cct))) {
-    return false;
-  }
   double loadavgs[1];
   if (getloadavg(loadavgs, 1) != 1) {
-    dout(10) << "scrub_should_schedule couldn't read loadavgs\n" << dendl;
+    dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
     return false;
   }
 
   if (loadavgs[0] >= cct->_conf->osd_scrub_load_threshold) {
-    dout(20) << "scrub_should_schedule loadavg " << loadavgs[0]
+    dout(20) << __func__ << " loadavg " << loadavgs[0]
 	     << " >= max " << cct->_conf->osd_scrub_load_threshold
 	     << " = no, load too high" << dendl;
     return false;
+  } else {
+    dout(20) << __func__ << " loadavg " << loadavgs[0]
+	     << " < max " << cct->_conf->osd_scrub_load_threshold
+	     << " = yes" << dendl;
+    return true;
   }
-
-  dout(20) << "scrub_should_schedule loadavg " << loadavgs[0]
-	   << " < max " << cct->_conf->osd_scrub_load_threshold
-	   << " = yes" << dendl;
-  return loadavgs[0] < cct->_conf->osd_scrub_load_threshold;
 }
 
 void OSD::sched_scrub()
 {
-  assert(osd_lock.is_locked());
-
-  bool load_is_low = scrub_should_schedule();
-
-  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
+  // if not permitted, fail fast
+  if (!service.can_inc_scrubs_pending()) {
+    return;
+  }
 
   utime_t now = ceph_clock_now(cct);
-  
-  //dout(20) << " " << last_scrub_pg << dendl;
+  bool time_permit = scrub_time_permit(now);
+  bool load_is_low = scrub_load_below_threshold();
+  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
 
-  pair<utime_t, spg_t> pos;
-  if (service.first_scrub_stamp(&pos)) {
+  OSDService::ScrubJob scrub;
+  if (service.first_scrub_stamp(&scrub)) {
     do {
-      utime_t t = pos.first;
-      spg_t pgid = pos.second;
-      dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl;
-
-      utime_t diff = now - t;
-      if ((double)diff < cct->_conf->osd_scrub_min_interval) {
-	dout(10) << "sched_scrub " << pgid << " at " << t
-		 << ": " << (double)diff << " < min (" << cct->_conf->osd_scrub_min_interval << " seconds)" << dendl;
-	break;
-      }
-      if ((double)diff < cct->_conf->osd_scrub_max_interval && !load_is_low) {
+      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
+
+      if (scrub.sched_time > now) {
 	// save ourselves some effort
-	dout(10) << "sched_scrub " << pgid << " high load at " << t
-		 << ": " << (double)diff << " < max (" << cct->_conf->osd_scrub_max_interval << " seconds)" << dendl;
+	dout(10) << "sched_scrub " << scrub.pgid << " schedued at " << scrub.sched_time
+		 << " > " << now << dendl;
 	break;
       }
 
-      PG *pg = _lookup_lock_pg(pgid);
-      if (pg) {
-	if (pg->get_pgbackend()->scrub_supported() && pg->is_active() &&
-	    (load_is_low ||
-	     (double)diff >= cct->_conf->osd_scrub_max_interval ||
-	     pg->scrubber.must_scrub)) {
-	  dout(10) << "sched_scrub scrubbing " << pgid << " at " << t
-		   << (pg->scrubber.must_scrub ? ", explicitly requested" :
-		   ( (double)diff >= cct->_conf->osd_scrub_max_interval ? ", diff >= max" : ""))
-		   << dendl;
-	  if (pg->sched_scrub()) {
-	    pg->unlock();
-	    break;
-	  }
+      PG *pg = _lookup_lock_pg(scrub.pgid);
+      if (!pg)
+	continue;
+      if (pg->get_pgbackend()->scrub_supported() && pg->is_active() &&
+	  (scrub.deadline < now || (time_permit && load_is_low))) {
+	dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
+		 << (pg->scrubber.must_scrub ? ", explicitly requested" :
+		     (load_is_low ? ", load_is_low" : " deadline < now"))
+		 << dendl;
+	if (pg->sched_scrub()) {
+	  pg->unlock();
+	  break;
 	}
-	pg->unlock();
       }
-    } while  (service.next_scrub_stamp(pos, &pos));
+      pg->unlock();
+    } while (service.next_scrub_stamp(scrub, &scrub));
   }    
   dout(20) << "sched_scrub done" << dendl;
 }
@@ -6172,7 +6276,6 @@ void OSD::handle_osd_map(MOSDMap *m)
   ObjectStore::Transaction &t = *_t;
 
   // store new maps: queue for disk and put in the osdmap cache
-  epoch_t last_marked_full = 0;
   epoch_t start = MAX(osdmap->get_epoch() + 1, first);
   for (epoch_t e = start; e <= last; e++) {
     map<epoch_t,bufferlist>::iterator p;
@@ -6183,13 +6286,13 @@ void OSD::handle_osd_map(MOSDMap *m)
       bufferlist& bl = p->second;
       
       o->decode(bl);
-      if (o->test_flag(CEPH_OSDMAP_FULL))
-	last_marked_full = e;
 
-      hobject_t fulloid = get_osdmap_pobject_name(e);
-      t.write(META_COLL, fulloid, 0, bl.length(), bl);
+      ghobject_t fulloid = get_osdmap_pobject_name(e);
+      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
       pin_map_bl(e, bl);
       pinned_maps.push_back(add_map(o));
+
+      got_full_map(e);
       continue;
     }
 
@@ -6197,8 +6300,8 @@ void OSD::handle_osd_map(MOSDMap *m)
     if (p != m->incremental_maps.end()) {
       dout(10) << "handle_osd_map  got inc map for epoch " << e << dendl;
       bufferlist& bl = p->second;
-      hobject_t oid = get_inc_osdmap_pobject_name(e);
-      t.write(META_COLL, oid, 0, bl.length(), bl);
+      ghobject_t oid = get_inc_osdmap_pobject_name(e);
+      t.write(coll_t::meta(), oid, 0, bl.length(), bl);
       pin_map_inc_bl(e, bl);
 
       OSDMap *o = new OSDMap;
@@ -6216,9 +6319,6 @@ void OSD::handle_osd_map(MOSDMap *m)
 	assert(0 == "bad fsid");
       }
 
-      if (o->test_flag(CEPH_OSDMAP_FULL))
-	last_marked_full = e;
-
       bufferlist fbl;
       o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
 
@@ -6234,17 +6334,18 @@ void OSD::handle_osd_map(MOSDMap *m)
 		<< " but failed to encode full with correct crc; requesting"
 		<< dendl;
 	clog->warn() << "failed to encode map e" << e << " with expected crc\n";
+	dout(20) << "my encoded map was:\n";
+	fbl.hexdump(*_dout);
+	*_dout << dendl;
 	delete o;
-	MMonGetOSDMap *req = new MMonGetOSDMap;
-	req->request_full(e, last);
-	monc->send_mon_message(req);
+	request_full_map(e, last);
 	last = e - 1;
 	break;
       }
+      got_full_map(e);
 
-
-      hobject_t fulloid = get_osdmap_pobject_name(e);
-      t.write(META_COLL, fulloid, 0, fbl.length(), fbl);
+      ghobject_t fulloid = get_osdmap_pobject_name(e);
+      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
       pin_map_bl(e, fbl);
       pinned_maps.push_back(add_map(o));
       continue;
@@ -6270,8 +6371,8 @@ void OSD::handle_osd_map(MOSDMap *m)
 	  service.map_cache.cached_key_lower_bound()));
     for (epoch_t e = superblock.oldest_map; e < min; ++e) {
       dout(20) << " removing old osdmap epoch " << e << dendl;
-      t.remove(META_COLL, get_osdmap_pobject_name(e));
-      t.remove(META_COLL, get_inc_osdmap_pobject_name(e));
+      t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
+      t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
       superblock.oldest_map = e+1;
       num++;
       if (num >= cct->_conf->osd_target_transaction_size &&
@@ -6284,13 +6385,8 @@ void OSD::handle_osd_map(MOSDMap *m)
     superblock.oldest_map = first;
   superblock.newest_map = last;
 
-  if (last_marked_full > superblock.last_map_marked_full)
-    superblock.last_map_marked_full = last_marked_full;
- 
   map_lock.get_write();
 
-  C_Contexts *fin = new C_Contexts(cct);
-
   // advance through the new maps
   for (epoch_t cur = start; cur <= superblock.newest_map; cur++) {
     dout(10) << " advance to epoch " << cur << " (<= newest " << superblock.newest_map << ")" << dendl;
@@ -6320,7 +6416,7 @@ void OSD::handle_osd_map(MOSDMap *m)
     osdmap = newmap;
 
     superblock.current_epoch = cur;
-    advance_map(t, fin);
+    advance_map();
     had_map_since = ceph_clock_now(cct);
   }
 
@@ -6421,10 +6517,10 @@ void OSD::handle_osd_map(MOSDMap *m)
   // superblock and commit
   write_superblock(t);
   store->queue_transaction(
-    0,
+    service.meta_osr.get(),
     _t,
     new C_OnMapApply(&service, _t, pinned_maps, osdmap->get_epoch()),
-    0, fin);
+    0, 0);
   service.publish_superblock(superblock);
 
   map_lock.put_write();
@@ -6454,8 +6550,10 @@ void OSD::handle_osd_map(MOSDMap *m)
   else if (do_restart)
     start_boot();
 
+  osd_lock.Unlock();
   if (do_shutdown)
     shutdown();
+  osd_lock.Lock();
 
   m->put();
 }
@@ -6512,7 +6610,7 @@ void OSD::check_osdmap_features(ObjectStore *fs)
       superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
       ObjectStore::Transaction *t = new ObjectStore::Transaction;
       write_superblock(*t);
-      int err = store->queue_transaction_and_cleanup(NULL, t);
+      int err = store->queue_transaction_and_cleanup(service.meta_osr.get(), t);
       assert(err == 0);
       fs->set_allow_sharded_objects();
     }
@@ -6591,10 +6689,9 @@ bool OSD::advance_pg(
 }
 
 /** 
- * scan placement groups, initiate any replication
- * activities.
+ * update service map; check pg creations
  */
-void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin)
+void OSD::advance_map()
 {
   assert(osd_lock.is_locked());
 
@@ -7089,7 +7186,7 @@ void OSD::handle_pg_create(OpRequestRef op)
     PG *pg = NULL;
     if (can_create_pg(pgid)) {
       const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
-      PG::_create(*rctx.transaction, pgid);
+      PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
       PG::_init(*rctx.transaction, pgid, pp);
 
       pg_interval_map_t pi;
@@ -7150,37 +7247,6 @@ void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
   }
 }
 
-bool OSD::compat_must_dispatch_immediately(PG *pg)
-{
-  assert(pg->is_locked());
-  set<pg_shard_t> tmpacting;
-  if (!pg->actingbackfill.empty()) {
-    tmpacting = pg->actingbackfill;
-  } else {
-    for (unsigned i = 0; i < pg->acting.size(); ++i) {
-      if (pg->acting[i] == CRUSH_ITEM_NONE)
-	continue;
-      tmpacting.insert(
-	pg_shard_t(
-	  pg->acting[i],
-	  pg->pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
-    }
-  }
-
-  for (set<pg_shard_t>::iterator i = tmpacting.begin();
-       i != tmpacting.end();
-       ++i) {
-    if (i->osd == whoami || i->osd == CRUSH_ITEM_NONE)
-      continue;
-    ConnectionRef conn =
-      service.get_con_osd_cluster(i->osd, pg->get_osdmap()->get_epoch());
-    if (conn && !conn->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
-      return true;
-    }
-  }
-  return false;
-}
-
 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
                            ThreadPool::TPHandle *handle)
 {
@@ -7235,26 +7301,11 @@ void OSD::do_notifies(
       continue;
     }
     service.share_map_peer(it->first, con.get(), curmap);
-    if (con->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
-      dout(7) << __func__ << " osd " << it->first
-	      << " on " << it->second.size() << " PGs" << dendl;
-      MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
-					 it->second);
-      con->send_message(m);
-    } else {
-      dout(7) << __func__ << " osd " << it->first
-	      << " sending separate messages" << dendl;
-      for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator i =
-	     it->second.begin();
-	   i != it->second.end();
-	   ++i) {
-	vector<pair<pg_notify_t, pg_interval_map_t> > list(1);
-	list[0] = *i;
-	MOSDPGNotify *m = new MOSDPGNotify(i->first.epoch_sent,
-					   list);
-	con->send_message(m);
-      }
-    }
+    dout(7) << __func__ << " osd " << it->first
+	    << " on " << it->second.size() << " PGs" << dendl;
+    MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
+				       it->second);
+    con->send_message(m);
   }
 }
 
@@ -7280,24 +7331,10 @@ void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
       continue;
     }
     service.share_map_peer(who, con.get(), curmap);
-    if (con->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
-      dout(7) << __func__ << " querying osd." << who
-	      << " on " << pit->second.size() << " PGs" << dendl;
-      MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
-      con->send_message(m);
-    } else {
-      dout(7) << __func__ << " querying osd." << who
-	      << " sending seperate messages on " << pit->second.size()
-	      << " PGs" << dendl;
-      for (map<spg_t, pg_query_t>::iterator i = pit->second.begin();
-	   i != pit->second.end();
-	   ++i) {
-	map<spg_t, pg_query_t> to_send;
-	to_send.insert(*i);
-	MOSDPGQuery *m = new MOSDPGQuery(i->second.epoch_sent, to_send);
-	con->send_message(m);
-      }
-    }
+    dout(7) << __func__ << " querying osd." << who
+	    << " on " << pit->second.size() << " PGs" << dendl;
+    MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
+    con->send_message(m);
   }
 }
 
@@ -7329,22 +7366,9 @@ void OSD::do_infos(map<int,
       continue;
     }
     service.share_map_peer(p->first, con.get(), curmap);
-    if (con->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
-      MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
-      m->pg_list = p->second;
-      con->send_message(m);
-    } else {
-      for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator i =
-	     p->second.begin();
-	   i != p->second.end();
-	   ++i) {
-	vector<pair<pg_notify_t, pg_interval_map_t> > to_send(1);
-	to_send[0] = *i;
-	MOSDPGInfo *m = new MOSDPGInfo(i->first.epoch_sent);
-	m->pg_list = to_send;
-	con->send_message(m);
-      }
-    }
+    MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
+    m->pg_list = p->second;
+    con->send_message(m);
   }
   info_map.clear();
 }
@@ -7703,7 +7727,7 @@ void OSD::handle_pg_query(OpRequestRef op)
      * before the pg is recreated, we'll just start it off backfilling
      * instead of just empty */
     if (service.deleting_pgs.lookup(pgid))
-      empty.last_backfill = hobject_t();
+      empty.set_last_backfill(hobject_t(), true);
     if (it->second.type == pg_query_t::LOG ||
 	it->second.type == pg_query_t::FULLLOG) {
       ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
@@ -7848,11 +7872,11 @@ void OSD::check_replay_queue()
       PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
       pg_map_lock.unlock();
       dout(10) << "check_replay_queue " << *pg << dendl;
-      if (pg->is_active() &&
-          pg->is_replay() &&
+      if ((pg->is_active() || pg->is_activating()) &&
+	  pg->is_replay() &&
           pg->is_primary() &&
           pg->replay_until == p->second) {
-        pg->replay_queued_ops();
+	pg->replay_queued_ops();
       }
       pg->unlock();
     } else {
@@ -7879,6 +7903,13 @@ bool OSD::_recover_now()
 
 void OSD::do_recovery(PG *pg, ThreadPool::TPHandle &handle)
 {
+  if (g_conf->osd_recovery_sleep > 0) {
+    utime_t t;
+    t.set_from_double(g_conf->osd_recovery_sleep);
+    t.sleep();
+    dout(20) << __func__ << " slept for " << t << dendl;
+  }
+
   // see how many we should try to start.  note that this is a bit racy.
   recovery_wq.lock();
   int max = MIN(cct->_conf->osd_recovery_max_active - recovery_ops_active,
@@ -7909,12 +7940,17 @@ void OSD::do_recovery(PG *pg, ThreadPool::TPHandle &handle)
     dout(20) << "  active was " << recovery_oids[pg->info.pgid] << dendl;
 #endif
     
-    PG::RecoveryCtx rctx = create_context();
-    rctx.handle = &handle;
-
     int started;
-    bool more = pg->start_recovery_ops(max, &rctx, handle, &started);
+    bool more = pg->start_recovery_ops(max, handle, &started);
     dout(10) << "do_recovery started " << started << "/" << max << " on " << *pg << dendl;
+    // If no recovery op is started, don't bother to manipulate the RecoveryCtx
+    if (!started && (more || !pg->have_unfound())) {
+      pg->unlock();
+      goto out;
+    }
+
+    PG::RecoveryCtx rctx = create_context();
+    rctx.handle = &handle;
 
     /*
      * if we couldn't start any recovery ops and things are still
@@ -8109,17 +8145,15 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
     }
   }
 
+  // calc actual pgid
+  pg_t _pgid = m->get_pg();
+  int64_t pool = _pgid.pool();
   if (op->may_write()) {
-    // full?
-    if ((service.check_failsafe_full() ||
-	 osdmap->test_flag(CEPH_OSDMAP_FULL) ||
-	 m->get_map_epoch() < superblock.last_map_marked_full) &&
-	!m->get_source().is_mds()) {  // FIXME: we'll exclude mds writes for now.
-      // Drop the request, since the client will retry when the full
-      // flag is unset.
+    const pg_pool_t *pi = osdmap->get_pg_pool(pool);
+    if (!pi) {
       return;
     }
-
+    
     // invalid?
     if (m->get_snapid() != CEPH_NOSNAP) {
       service.reply_op_error(op, -EINVAL);
@@ -8128,7 +8162,7 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
 
     // too big?
     if (cct->_conf->osd_max_write_size &&
-	m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
+	m->get_data_len() > ((int64_t)g_conf->osd_max_write_size) << 20) {
       // journal can't hold commit!
       derr << "handle_op msg data len " << m->get_data_len()
 	   << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
@@ -8138,9 +8172,6 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
     }
   }
 
-  // calc actual pgid
-  pg_t _pgid = m->get_pg();
-  int64_t pool = _pgid.pool();
   if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0 &&
       osdmap->have_pg_pool(pool))
     _pgid = osdmap->raw_pg_to_pg(_pgid);
@@ -8288,7 +8319,7 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb )
       return;
     }
   }
-  pair<PGRef, OpRequestRef> item = sdata->pqueue.dequeue();
+  pair<PGRef, PGQueueable> item = sdata->pqueue.dequeue();
   sdata->pg_for_processing[&*(item.first)].push_back(item.second);
   sdata->sdata_op_ordering_lock.Unlock();
   ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval, 
@@ -8296,7 +8327,7 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb )
 
   (item.first)->lock_suspend_timeout(tp_handle);
 
-  OpRequestRef op;
+  boost::optional<PGQueueable> op;
   {
     Mutex::Locker l(sdata->sdata_op_ordering_lock);
     if (!sdata->pg_for_processing.count(&*(item.first))) {
@@ -8314,7 +8345,10 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb )
   // and will begin to be handled by a worker thread.
   {
 #ifdef WITH_LTTNG
-    osd_reqid_t reqid = op->get_reqid();
+    osd_reqid_t reqid;
+    if (boost::optional<OpRequestRef> _op = op->maybe_get_op()) {
+      reqid = (*_op)->get_reqid();
+    }
 #endif
     tracepoint(osd, opwq_process_start, reqid.name._type,
         reqid.name._num, reqid.tid, reqid.inc);
@@ -8329,11 +8363,14 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb )
   delete f;
   *_dout << dendl;
 
-  osd->dequeue_op(item.first, op, tp_handle);
+  op->run(osd, item.first, tp_handle);
 
   {
 #ifdef WITH_LTTNG
-    osd_reqid_t reqid = op->get_reqid();
+    osd_reqid_t reqid;
+    if (boost::optional<OpRequestRef> _op = op->maybe_get_op()) {
+      reqid = (*_op)->get_reqid();
+    }
 #endif
     tracepoint(osd, opwq_process_finish, reqid.name._type,
         reqid.name._num, reqid.tid, reqid.inc);
@@ -8342,21 +8379,22 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb )
   (item.first)->unlock();
 }
 
-void OSD::ShardedOpWQ::_enqueue(pair<PGRef, OpRequestRef> item) {
+void OSD::ShardedOpWQ::_enqueue(pair<PGRef, PGQueueable> item) {
 
   uint32_t shard_index = (((item.first)->get_pgid().ps())% shard_list.size());
 
   ShardData* sdata = shard_list[shard_index];
   assert (NULL != sdata);
-  unsigned priority = item.second->get_req()->get_priority();
-  unsigned cost = item.second->get_req()->get_cost();
+  unsigned priority = item.second.get_priority();
+  unsigned cost = item.second.get_cost();
   sdata->sdata_op_ordering_lock.Lock();
  
   if (priority >= CEPH_MSG_PRIO_LOW)
     sdata->pqueue.enqueue_strict(
-      item.second->get_req()->get_source_inst(), priority, item);
+      item.second.get_owner(), priority, item);
   else
-    sdata->pqueue.enqueue(item.second->get_req()->get_source_inst(),
+    sdata->pqueue.enqueue(
+      item.second.get_owner(),
       priority, cost, item);
   sdata->sdata_op_ordering_lock.Unlock();
 
@@ -8366,7 +8404,7 @@ void OSD::ShardedOpWQ::_enqueue(pair<PGRef, OpRequestRef> item) {
 
 }
 
-void OSD::ShardedOpWQ::_enqueue_front(pair<PGRef, OpRequestRef> item) {
+void OSD::ShardedOpWQ::_enqueue_front(pair<PGRef, PGQueueable> item) {
 
   uint32_t shard_index = (((item.first)->get_pgid().ps())% shard_list.size());
 
@@ -8378,13 +8416,15 @@ void OSD::ShardedOpWQ::_enqueue_front(pair<PGRef, OpRequestRef> item) {
     item.second = sdata->pg_for_processing[&*(item.first)].back();
     sdata->pg_for_processing[&*(item.first)].pop_back();
   }
-  unsigned priority = item.second->get_req()->get_priority();
-  unsigned cost = item.second->get_req()->get_cost();
+  unsigned priority = item.second.get_priority();
+  unsigned cost = item.second.get_cost();
   if (priority >= CEPH_MSG_PRIO_LOW)
     sdata->pqueue.enqueue_strict_front(
-      item.second->get_req()->get_source_inst(),priority, item);
+      item.second.get_owner(),
+      priority, item);
   else
-    sdata->pqueue.enqueue_front(item.second->get_req()->get_source_inst(),
+    sdata->pqueue.enqueue_front(
+      item.second.get_owner(),
       priority, cost, item);
 
   sdata->sdata_op_ordering_lock.Unlock();
@@ -8514,13 +8554,7 @@ void OSD::process_peering_events(
       rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
       split_pgs.clear();
     }
-    if (compat_must_dispatch_immediately(pg)) {
-      dispatch_context(rctx, pg, curmap, &handle);
-      rctx = create_context();
-      rctx.handle = &handle;
-    } else {
-      dispatch_context_transaction(rctx, pg, &handle);
-    }
+    dispatch_context_transaction(rctx, pg, &handle);
     pg->unlock();
     handle.reset_tp_timeout();
   }
@@ -8683,6 +8717,40 @@ int OSD::init_op_flags(OpRequestRef& op)
     if (ceph_osd_op_mode_cache(iter->op.op))
       op->set_cache();
 
+    // check for ec base pool
+    int64_t poolid = m->get_pg().pool();
+    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
+    if (pool && pool->is_tier()) {
+      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
+      if (base_pool && base_pool->require_rollback()) {
+        if ((iter->op.op != CEPH_OSD_OP_READ) &&
+            (iter->op.op != CEPH_OSD_OP_STAT) &&
+            (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
+            (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
+            (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
+            (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
+            (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
+            (iter->op.op != CEPH_OSD_OP_SRC_CMPXATTR) &&
+            (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
+            (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
+            (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
+            (iter->op.op != CEPH_OSD_OP_ASSERT_SRC_VERSION) &&
+            (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
+            (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
+            (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
+            (iter->op.op != CEPH_OSD_OP_CREATE) &&
+            (iter->op.op != CEPH_OSD_OP_DELETE) &&
+            (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
+            (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
+            (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
+            (iter->op.op != CEPH_OSD_OP_COPY_GET_CLASSIC) &&
+            (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
+            (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
+          op->set_promote();
+        }
+      }
+    }
+
     switch (iter->op.op) {
     case CEPH_OSD_OP_CALL:
       {
@@ -8712,13 +8780,19 @@ int OSD::init_op_flags(OpRequestRef& op)
 	}
 	is_read = flags & CLS_METHOD_RD;
 	is_write = flags & CLS_METHOD_WR;
+        bool is_promote = flags & CLS_METHOD_PROMOTE;
 
-	dout(10) << "class " << cname << " method " << mname
-		<< " flags=" << (is_read ? "r" : "") << (is_write ? "w" : "") << dendl;
+	dout(10) << "class " << cname << " method " << mname << " "
+		 << "flags=" << (is_read ? "r" : "")
+                             << (is_write ? "w" : "")
+                             << (is_promote ? "p" : "")
+                 << dendl;
 	if (is_read)
 	  op->set_class_read();
 	if (is_write)
 	  op->set_class_write();
+        if (is_promote)
+          op->set_promote();
 	break;
       }
 
@@ -8735,6 +8809,38 @@ int OSD::init_op_flags(OpRequestRef& op)
         break;
       }
 
+    case CEPH_OSD_OP_DELETE:
+      // if we get a delete with FAILOK we can skip handle cache. without
+      // FAILOK we still need to promote (or do something smarter) to
+      // determine whether to return ENOENT or 0.
+      if (iter == m->ops.begin() &&
+	  iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
+	op->set_skip_handle_cache();
+      }
+      // skip promotion when proxying a delete op
+      if (m->ops.size() == 1) {
+	op->set_skip_promote();
+      }
+      break;
+
+    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
+    case CEPH_OSD_OP_CACHE_FLUSH:
+    case CEPH_OSD_OP_CACHE_EVICT:
+      // If try_flush/flush/evict is the only op, can skip handle cache.
+      if (m->ops.size() == 1) {
+	op->set_skip_handle_cache();
+      }
+      break;
+
+    case CEPH_OSD_OP_READ:
+    case CEPH_OSD_OP_SYNC_READ:
+    case CEPH_OSD_OP_SPARSE_READ:
+      if (m->ops.size() == 1 &&
+          (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
+           iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
+        op->set_skip_promote();
+      }
+      break;
     default:
       break;
     }
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index a3b636e..44a492c 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -135,6 +135,7 @@ enum {
   l_osd_tier_clean,
   l_osd_tier_delay,
   l_osd_tier_proxy_read,
+  l_osd_tier_proxy_write,
 
   l_osd_agent_wake,
   l_osd_agent_skip,
@@ -144,6 +145,11 @@ enum {
   l_osd_object_ctx_cache_hit,
   l_osd_object_ctx_cache_total,
 
+  l_osd_op_cache_hit,
+  l_osd_tier_flush_lat,
+  l_osd_tier_promote_lat,
+  l_osd_tier_r_lat,
+
   l_osd_last,
 };
 
@@ -223,35 +229,37 @@ public:
   const PGRef old_pg_state;
   DeletingState(const pair<spg_t, PGRef> &in) :
     lock("DeletingState::lock"), status(QUEUED), stop_deleting(false),
-    pgid(in.first), old_pg_state(in.second) {}
+    pgid(in.first), old_pg_state(in.second) {
+    }
 
-  /// transition status to clearing
-  bool start_clearing() {
+  /// transition status to CLEARING_WAITING
+  bool pause_clearing() {
     Mutex::Locker l(lock);
-    assert(
-      status == QUEUED ||
-      status == DELETED_DIR);
+    assert(status == CLEARING_DIR);
     if (stop_deleting) {
       status = CANCELED;
       cond.Signal();
       return false;
     }
-    status = CLEARING_DIR;
+    status = CLEARING_WAITING;
     return true;
   } ///< @return false if we should cancel deletion
 
-  /// transition status to CLEARING_WAITING
-  bool pause_clearing() {
+  /// start or resume the clearing - transition the status to CLEARING_DIR
+  bool start_or_resume_clearing() {
     Mutex::Locker l(lock);
-    assert(status == CLEARING_DIR);
+    assert(
+      status == QUEUED ||
+      status == DELETED_DIR ||
+      status == CLEARING_WAITING);
     if (stop_deleting) {
       status = CANCELED;
       cond.Signal();
       return false;
     }
-    status = CLEARING_WAITING;
+    status = CLEARING_DIR;
     return true;
-  } ///< @return false if we should cancel deletion
+  } ///< @return false if we should cancel the deletion
 
   /// transition status to CLEARING_DIR
   bool resume_clearing() {
@@ -307,11 +315,81 @@ public:
 typedef ceph::shared_ptr<DeletingState> DeletingStateRef;
 
 class OSD;
+
+struct PGScrub {
+  epoch_t epoch_queued;
+  PGScrub(epoch_t e) : epoch_queued(e) {}
+  ostream &operator<<(ostream &rhs) {
+    return rhs << "PGScrub";
+  }
+};
+
+struct PGSnapTrim {
+  epoch_t epoch_queued;
+  PGSnapTrim(epoch_t e) : epoch_queued(e) {}
+  ostream &operator<<(ostream &rhs) {
+    return rhs << "PGSnapTrim";
+  }
+};
+
+class PGQueueable {
+  typedef boost::variant<
+    OpRequestRef,
+    PGSnapTrim,
+    PGScrub
+    > QVariant;
+  QVariant qvariant;
+  int cost; 
+  unsigned priority;
+  utime_t start_time;
+  entity_inst_t owner;
+  struct RunVis : public boost::static_visitor<> {
+    OSD *osd;
+    PGRef &pg;
+    ThreadPool::TPHandle &handle;
+    RunVis(OSD *osd, PGRef &pg, ThreadPool::TPHandle &handle)
+      : osd(osd), pg(pg), handle(handle) {}
+    void operator()(OpRequestRef &op);
+    void operator()(PGSnapTrim &op);
+    void operator()(PGScrub &op);
+  };
+public:
+  PGQueueable(OpRequestRef op)
+    : qvariant(op), cost(op->get_req()->get_cost()),
+      priority(op->get_req()->get_priority()),
+      start_time(op->get_req()->get_recv_stamp()),
+      owner(op->get_req()->get_source_inst())
+    {}
+  PGQueueable(
+    const PGSnapTrim &op, int cost, unsigned priority, utime_t start_time,
+    const entity_inst_t &owner)
+    : qvariant(op), cost(cost), priority(priority), start_time(start_time),
+      owner(owner) {}
+  PGQueueable(
+    const PGScrub &op, int cost, unsigned priority, utime_t start_time,
+    const entity_inst_t &owner)
+    : qvariant(op), cost(cost), priority(priority), start_time(start_time),
+      owner(owner) {}
+  boost::optional<OpRequestRef> maybe_get_op() {
+    OpRequestRef *op = boost::get<OpRequestRef>(&qvariant);
+    return op ? *op : boost::optional<OpRequestRef>();
+  }
+  void run(OSD *osd, PGRef &pg, ThreadPool::TPHandle &handle) {
+    RunVis v(osd, pg, handle);
+    boost::apply_visitor(v, qvariant);
+  }
+  unsigned get_priority() const { return priority; }
+  int get_cost() const { return cost; }
+  utime_t get_start_time() const { return start_time; }
+  entity_inst_t get_owner() const { return owner; }
+};
+
 class OSDService {
 public:
   OSD *osd;
   CephContext *cct;
   SharedPtrRegistry<spg_t, ObjectStore::Sequencer> osr_registry;
+  ceph::shared_ptr<ObjectStore::Sequencer> meta_osr;
   SharedPtrRegistry<spg_t, DeletingState> deleting_pgs;
   const int whoami;
   ObjectStore *&store;
@@ -325,12 +403,9 @@ public:
   PerfCounters *&logger;
   PerfCounters *&recoverystate_perf;
   MonClient   *&monc;
-  ShardedThreadPool::ShardedWQ < pair <PGRef, OpRequestRef> > &op_wq;
+  ShardedThreadPool::ShardedWQ < pair <PGRef, PGQueueable> > &op_wq;
   ThreadPool::BatchWorkQueue<PG> &peering_wq;
   ThreadPool::WorkQueue<PG> &recovery_wq;
-  ThreadPool::WorkQueue<PG> &snap_trim_wq;
-  ThreadPool::WorkQueue<PG> &scrub_wq;
-  ThreadPool::WorkQueue<MOSDRepScrub> &rep_scrub_wq;
   GenContextWQ recovery_gen_wq;
   GenContextWQ op_gen_wq;
   ClassHandler  *&class_handler;
@@ -505,42 +580,57 @@ public:
   Mutex sched_scrub_lock;
   int scrubs_pending;
   int scrubs_active;
-  set< pair<utime_t,spg_t> > last_scrub_pg;
+  struct ScrubJob {
+    /// pg to be scrubbed
+    spg_t pgid;
+    /// a time scheduled for scrub. but the scrub could be delayed if system
+    /// load is too high or it fails to fall in the scrub hours
+    utime_t sched_time;
+    /// the hard upper bound of scrub time
+    utime_t deadline;
+    ScrubJob() {}
+    explicit ScrubJob(const spg_t& pg, const utime_t& timestamp, bool must = true);
+    /// order the jobs by sched_time
+    bool operator<(const ScrubJob& rhs) const;
+  };
+  set<ScrubJob> sched_scrub_pg;
 
-  void reg_last_pg_scrub(spg_t pgid, utime_t t) {
+  /// @returns the scrub_reg_stamp used for unregister the scrub job
+  utime_t reg_pg_scrub(spg_t pgid, utime_t t, bool must) {
+    ScrubJob scrub(pgid, t, must);
     Mutex::Locker l(sched_scrub_lock);
-    last_scrub_pg.insert(pair<utime_t,spg_t>(t, pgid));
+    sched_scrub_pg.insert(scrub);
+    return scrub.sched_time;
   }
-  void unreg_last_pg_scrub(spg_t pgid, utime_t t) {
+  void unreg_pg_scrub(spg_t pgid, utime_t t) {
     Mutex::Locker l(sched_scrub_lock);
-    pair<utime_t,spg_t> p(t, pgid);
-    set<pair<utime_t,spg_t> >::iterator it = last_scrub_pg.find(p);
-    assert(it != last_scrub_pg.end());
-    last_scrub_pg.erase(it);
+    size_t removed = sched_scrub_pg.erase(ScrubJob(pgid, t));
+    assert(removed);
   }
-  bool first_scrub_stamp(pair<utime_t, spg_t> *out) {
+  bool first_scrub_stamp(ScrubJob *out) {
     Mutex::Locker l(sched_scrub_lock);
-    if (last_scrub_pg.empty())
+    if (sched_scrub_pg.empty())
       return false;
-    set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.begin();
+    set<ScrubJob>::iterator iter = sched_scrub_pg.begin();
     *out = *iter;
     return true;
   }
-  bool next_scrub_stamp(pair<utime_t, spg_t> next,
-			pair<utime_t, spg_t> *out) {
+  bool next_scrub_stamp(const ScrubJob& next,
+			ScrubJob *out) {
     Mutex::Locker l(sched_scrub_lock);
-    if (last_scrub_pg.empty())
+    if (sched_scrub_pg.empty())
       return false;
-    set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.lower_bound(next);
-    if (iter == last_scrub_pg.end())
+    set<ScrubJob>::iterator iter = sched_scrub_pg.lower_bound(next);
+    if (iter == sched_scrub_pg.end())
       return false;
     ++iter;
-    if (iter == last_scrub_pg.end())
+    if (iter == sched_scrub_pg.end())
       return false;
     *out = *iter;
     return true;
   }
 
+  bool can_inc_scrubs_pending();
   bool inc_scrubs_pending();
   void inc_scrubs_active(bool reserved);
   void dec_scrubs_pending();
@@ -558,7 +648,8 @@ public:
   set<PGRef>::iterator agent_queue_pos;
   bool agent_valid_iterator;
   int agent_ops;
-  set<hobject_t> agent_oids;
+  int flush_mode_high_count; //once have one pg with FLUSH_MODE_HIGH then flush objects with high speed
+  set<hobject_t, hobject_t::BitwiseComparator> agent_oids;
   bool agent_active;
   struct AgentThread : public Thread {
     OSDService *osd;
@@ -619,6 +710,20 @@ public:
     _dequeue(pg, old_priority);
   }
 
+  /// note start of an async (evict) op
+  void agent_start_evict_op() {
+    Mutex::Locker l(agent_lock);
+    ++agent_ops;
+  }
+
+  /// note finish or cancellation of an async (evict) op
+  void agent_finish_evict_op() {
+    Mutex::Locker l(agent_lock);
+    assert(agent_ops > 0);
+    --agent_ops;
+    agent_cond.Signal();
+  }
+
   /// note start of an async (flush) op
   void agent_start_op(const hobject_t& oid) {
     Mutex::Locker l(agent_lock);
@@ -649,6 +754,16 @@ public:
     return agent_ops;
   }
 
+  void agent_inc_high_count() {
+    Mutex::Locker l(agent_lock);
+    flush_mode_high_count ++;
+  }
+
+  void agent_dec_high_count() {
+    Mutex::Locker l(agent_lock);
+    flush_mode_high_count --;
+  }
+
 
   // -- Objecter, for teiring reads/writes from/to other OSDs --
   Objecter *objecter;
@@ -697,11 +812,27 @@ public:
 
   void queue_for_peering(PG *pg);
   bool queue_for_recovery(PG *pg);
-  bool queue_for_snap_trim(PG *pg) {
-    return snap_trim_wq.queue(pg);
-  }
-  bool queue_for_scrub(PG *pg) {
-    return scrub_wq.queue(pg);
+  void queue_for_snap_trim(PG *pg) {
+    op_wq.queue(
+      make_pair(
+	pg,
+	PGQueueable(
+	  PGSnapTrim(pg->get_osdmap()->get_epoch()),
+	  cct->_conf->osd_snap_trim_cost,
+	  cct->_conf->osd_snap_trim_priority,
+	  ceph_clock_now(cct),
+	  entity_inst_t())));
+  }
+  void queue_for_scrub(PG *pg) {
+    op_wq.queue(
+      make_pair(
+	pg,
+	PGQueueable(
+	  PGScrub(pg->get_osdmap()->get_epoch()),
+	  cct->_conf->osd_scrub_cost,
+	  cct->_conf->osd_scrub_priority,
+	  ceph_clock_now(cct),
+	  entity_inst_t())));
   }
 
   // osd map cache (past osd maps)
@@ -750,6 +881,7 @@ public:
   void pg_stat_queue_dequeue(PG *pg);
 
   void init();
+  void final_init();  
   void start_shutdown();
   void shutdown();
 
@@ -916,6 +1048,12 @@ protected:
   Mutex osd_lock;			// global lock
   SafeTimer tick_timer;    // safe timer (osd_lock)
 
+  // Tick timer for those stuff that do not need osd_lock
+  Mutex tick_timer_lock;
+  SafeTimer tick_timer_without_osd_lock;
+
+  static const double OSD_TICK_INTERVAL; // tick interval for tick_timer and tick_timer_without_osd_lock
+
   AuthAuthorizeHandlerRegistry *authorize_handler_cluster_registry;
   AuthAuthorizeHandlerRegistry *authorize_handler_service_registry;
 
@@ -942,12 +1080,22 @@ protected:
     }
   };
 
+  class C_Tick_WithoutOSDLock : public Context {
+    OSD *osd;
+  public:
+    C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
+    void finish(int r) {
+      osd->tick_without_osd_lock();
+    }
+  };
+
   Cond dispatch_cond;
   int dispatch_running;
 
   void create_logger();
   void create_recoverystate_perf();
   void tick();
+  void tick_without_osd_lock();
   void _dispatch(Message *m);
   void dispatch_op(OpRequestRef op);
   bool dispatch_op_fast(OpRequestRef& op, OSDMapRef& osdmap);
@@ -963,44 +1111,46 @@ public:
   ClassHandler  *class_handler;
   int get_nodeid() { return whoami; }
   
-  static hobject_t get_osdmap_pobject_name(epoch_t epoch) { 
+  static ghobject_t get_osdmap_pobject_name(epoch_t epoch) {
     char foo[20];
     snprintf(foo, sizeof(foo), "osdmap.%d", epoch);
-    return hobject_t(sobject_t(object_t(foo), 0)); 
+    return ghobject_t(hobject_t(sobject_t(object_t(foo), 0)));
   }
-  static hobject_t get_inc_osdmap_pobject_name(epoch_t epoch) { 
+  static ghobject_t get_inc_osdmap_pobject_name(epoch_t epoch) {
     char foo[20];
     snprintf(foo, sizeof(foo), "inc_osdmap.%d", epoch);
-    return hobject_t(sobject_t(object_t(foo), 0)); 
+    return ghobject_t(hobject_t(sobject_t(object_t(foo), 0)));
   }
 
-  static hobject_t make_snapmapper_oid() {
-    return hobject_t(
+  static ghobject_t make_snapmapper_oid() {
+    return ghobject_t(hobject_t(
       sobject_t(
 	object_t("snapmapper"),
-	0));
+	0)));
   }
 
-  static hobject_t make_pg_log_oid(spg_t pg) {
+  static ghobject_t make_pg_log_oid(spg_t pg) {
     stringstream ss;
     ss << "pglog_" << pg;
     string s;
     getline(ss, s);
-    return hobject_t(sobject_t(object_t(s.c_str()), 0));
+    return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0)));
   }
   
-  static hobject_t make_pg_biginfo_oid(spg_t pg) {
+  static ghobject_t make_pg_biginfo_oid(spg_t pg) {
     stringstream ss;
     ss << "pginfo_" << pg;
     string s;
     getline(ss, s);
-    return hobject_t(sobject_t(object_t(s.c_str()), 0));
+    return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0)));
   }
-  static hobject_t make_infos_oid() {
+  static ghobject_t make_infos_oid() {
     hobject_t oid(sobject_t("infos", CEPH_NOSNAP));
-    return oid;
+    return ghobject_t(oid);
   }
-  static void recursive_remove_collection(ObjectStore *store, coll_t tmp);
+  static void recursive_remove_collection(ObjectStore *store,
+					  spg_t pgid,
+					  coll_t tmp);
 
   /**
    * get_osd_initial_compat_set()
@@ -1030,6 +1180,8 @@ private:
   void write_superblock(ObjectStore::Transaction& t);
   int read_superblock();
 
+  void clear_temp_objects();
+
   CompatSet osd_compat;
 
   // -- state --
@@ -1439,124 +1591,139 @@ private:
 
   // -- op queue --
 
- 
-  class ShardedOpWQ: public ShardedThreadPool::ShardedWQ < pair <PGRef, OpRequestRef> > {
+  friend class PGQueueable;
+  class ShardedOpWQ: public ShardedThreadPool::ShardedWQ < pair <PGRef, PGQueueable> > {
 
     struct ShardData {
       Mutex sdata_lock;
       Cond sdata_cond;
       Mutex sdata_op_ordering_lock;
-      map<PG*, list<OpRequestRef> > pg_for_processing;
-      PrioritizedQueue< pair<PGRef, OpRequestRef>, entity_inst_t> pqueue;
-      ShardData(string lock_name, string ordering_lock, uint64_t max_tok_per_prio, uint64_t min_cost):
-          sdata_lock(lock_name.c_str()),
-          sdata_op_ordering_lock(ordering_lock.c_str()),
-          pqueue(max_tok_per_prio, min_cost) {}
+      map<PG*, list<PGQueueable> > pg_for_processing;
+      PrioritizedQueue< pair<PGRef, PGQueueable>, entity_inst_t> pqueue;
+      ShardData(
+	string lock_name, string ordering_lock,
+	uint64_t max_tok_per_prio, uint64_t min_cost)
+	: sdata_lock(lock_name.c_str()),
+	  sdata_op_ordering_lock(ordering_lock.c_str()),
+	  pqueue(max_tok_per_prio, min_cost) {}
     };
-
+    
     vector<ShardData*> shard_list;
     OSD *osd;
     uint32_t num_shards;
 
-    public:
-      ShardedOpWQ(uint32_t pnum_shards, OSD *o, time_t ti, time_t si, ShardedThreadPool* tp):
-        ShardedThreadPool::ShardedWQ < pair <PGRef, OpRequestRef> >(ti, si, tp),
-        osd(o), num_shards(pnum_shards) {
-        for(uint32_t i = 0; i < num_shards; i++) {
-          char lock_name[32] = {0};
-          snprintf(lock_name, sizeof(lock_name), "%s.%d", "OSD:ShardedOpWQ:", i);
-          char order_lock[32] = {0};
-          snprintf(order_lock, sizeof(order_lock), "%s.%d", "OSD:ShardedOpWQ:order:", i);
-          ShardData* one_shard = new ShardData(lock_name, order_lock, 
-            osd->cct->_conf->osd_op_pq_max_tokens_per_priority, 
-            osd->cct->_conf->osd_op_pq_min_cost);
-          shard_list.push_back(one_shard);
-        }
+  public:
+    ShardedOpWQ(uint32_t pnum_shards, OSD *o, time_t ti, time_t si, ShardedThreadPool* tp):
+      ShardedThreadPool::ShardedWQ < pair <PGRef, PGQueueable> >(ti, si, tp),
+      osd(o), num_shards(pnum_shards) {
+      for(uint32_t i = 0; i < num_shards; i++) {
+	char lock_name[32] = {0};
+	snprintf(lock_name, sizeof(lock_name), "%s.%d", "OSD:ShardedOpWQ:", i);
+	char order_lock[32] = {0};
+	snprintf(
+	  order_lock, sizeof(order_lock), "%s.%d",
+	  "OSD:ShardedOpWQ:order:", i);
+	ShardData* one_shard = new ShardData(
+	  lock_name, order_lock,
+	  osd->cct->_conf->osd_op_pq_max_tokens_per_priority, 
+	  osd->cct->_conf->osd_op_pq_min_cost);
+	shard_list.push_back(one_shard);
       }
-
-      ~ShardedOpWQ() {
-
-        while(!shard_list.empty()) {
-          delete shard_list.back();
-          shard_list.pop_back();
-        }
+    }
+    
+    ~ShardedOpWQ() {
+      while(!shard_list.empty()) {
+	delete shard_list.back();
+	shard_list.pop_back();
       }
+    }
 
-      void _process(uint32_t thread_index, heartbeat_handle_d *hb);
-      void _enqueue(pair <PGRef, OpRequestRef> item);
-      void _enqueue_front(pair <PGRef, OpRequestRef> item);
-      
-      void return_waiting_threads() {
-        for(uint32_t i = 0; i < num_shards; i++) {
-          ShardData* sdata = shard_list[i];
-          assert (NULL != sdata); 
-          sdata->sdata_lock.Lock();
-          sdata->sdata_cond.Signal();
-          sdata->sdata_lock.Unlock();
-        }
+    void _process(uint32_t thread_index, heartbeat_handle_d *hb);
+    void _enqueue(pair <PGRef, PGQueueable> item);
+    void _enqueue_front(pair <PGRef, PGQueueable> item);
       
+    void return_waiting_threads() {
+      for(uint32_t i = 0; i < num_shards; i++) {
+	ShardData* sdata = shard_list[i];
+	assert (NULL != sdata); 
+	sdata->sdata_lock.Lock();
+	sdata->sdata_cond.Signal();
+	sdata->sdata_lock.Unlock();
       }
+    }
 
-      void dump(Formatter *f) {
-        for(uint32_t i = 0; i < num_shards; i++) {
-          ShardData* sdata = shard_list[i];
-	  char lock_name[32] = {0};
-          snprintf(lock_name, sizeof(lock_name), "%s%d", "OSD:ShardedOpWQ:", i);
-          assert (NULL != sdata);
-          sdata->sdata_op_ordering_lock.Lock();
-	  f->open_object_section(lock_name);
-	  sdata->pqueue.dump(f);
-	  f->close_section();
-          sdata->sdata_op_ordering_lock.Unlock();
-        }
+    void dump(Formatter *f) {
+      for(uint32_t i = 0; i < num_shards; i++) {
+	ShardData* sdata = shard_list[i];
+	char lock_name[32] = {0};
+	snprintf(lock_name, sizeof(lock_name), "%s%d", "OSD:ShardedOpWQ:", i);
+	assert (NULL != sdata);
+	sdata->sdata_op_ordering_lock.Lock();
+	f->open_object_section(lock_name);
+	sdata->pqueue.dump(f);
+	f->close_section();
+	sdata->sdata_op_ordering_lock.Unlock();
       }
+    }
 
-      struct Pred {
-        PG *pg;
-        Pred(PG *pg) : pg(pg) {}
-        bool operator()(const pair<PGRef, OpRequestRef> &op) {
-          return op.first == pg;
-        }
-      };
-
-      void dequeue(PG *pg, list<OpRequestRef> *dequeued = 0) {
-        ShardData* sdata = NULL;
-        assert(pg != NULL);
-        uint32_t shard_index = pg->get_pgid().ps()% shard_list.size();
-        sdata = shard_list[shard_index];
-        assert(sdata != NULL);
-        if (!dequeued) {
-          sdata->sdata_op_ordering_lock.Lock();
-          sdata->pqueue.remove_by_filter(Pred(pg));
-          sdata->pg_for_processing.erase(pg);
-          sdata->sdata_op_ordering_lock.Unlock();
-        } else {
-          list<pair<PGRef, OpRequestRef> > _dequeued;
-          sdata->sdata_op_ordering_lock.Lock();
-          sdata->pqueue.remove_by_filter(Pred(pg), &_dequeued);
-          for (list<pair<PGRef, OpRequestRef> >::iterator i = _dequeued.begin();
-            i != _dequeued.end(); ++i) {
-            dequeued->push_back(i->second);
-          }
-	  if (sdata->pg_for_processing.count(pg)) {
-	    dequeued->splice(
-	      dequeued->begin(),
-	      sdata->pg_for_processing[pg]);
-	    sdata->pg_for_processing.erase(pg);
-	  }
-          sdata->sdata_op_ordering_lock.Unlock();          
-        }
+    struct Pred {
+      PG *pg;
+      Pred(PG *pg) : pg(pg) {}
+      bool operator()(const pair<PGRef, PGQueueable> &op) {
+	return op.first == pg;
+      }
+    };
 
+    void dequeue(PG *pg) {
+      ShardData* sdata = NULL;
+      assert(pg != NULL);
+      uint32_t shard_index = pg->get_pgid().ps()% shard_list.size();
+      sdata = shard_list[shard_index];
+      assert(sdata != NULL);
+      sdata->sdata_op_ordering_lock.Lock();
+      sdata->pqueue.remove_by_filter(Pred(pg));
+      sdata->pg_for_processing.erase(pg);
+      sdata->sdata_op_ordering_lock.Unlock();
+    }
+
+    void dequeue_and_get_ops(PG *pg, list<OpRequestRef> *dequeued) {
+      ShardData* sdata = NULL;
+      assert(pg != NULL);
+      uint32_t shard_index = pg->get_pgid().ps()% shard_list.size();
+      sdata = shard_list[shard_index];
+      assert(sdata != NULL);
+      assert(dequeued);
+      list<pair<PGRef, PGQueueable> > _dequeued;
+      sdata->sdata_op_ordering_lock.Lock();
+      sdata->pqueue.remove_by_filter(Pred(pg), &_dequeued);
+      for (list<pair<PGRef, PGQueueable> >::iterator i = _dequeued.begin();
+	   i != _dequeued.end(); ++i) {
+	boost::optional<OpRequestRef> mop = i->second.maybe_get_op();
+	if (mop)
+	  dequeued->push_back(*mop);
       }
- 
-      bool is_shard_empty(uint32_t thread_index) {
-        uint32_t shard_index = thread_index % num_shards; 
-        ShardData* sdata = shard_list[shard_index];
-        assert(NULL != sdata);
-        Mutex::Locker l(sdata->sdata_op_ordering_lock);
-        return sdata->pqueue.empty();
+      map<PG *, list<PGQueueable> >::iterator iter =
+	sdata->pg_for_processing.find(pg);
+      if (iter != sdata->pg_for_processing.end()) {
+	for (list<PGQueueable>::reverse_iterator i = iter->second.rbegin();
+	     i != iter->second.rend();
+	     ++i) {
+	  boost::optional<OpRequestRef> mop = i->maybe_get_op();
+	  if (mop)
+	    dequeued->push_front(*mop);
+	}
+	sdata->pg_for_processing.erase(iter);
       }
-
+      sdata->sdata_op_ordering_lock.Unlock();
+    }
+ 
+    bool is_shard_empty(uint32_t thread_index) {
+      uint32_t shard_index = thread_index % num_shards; 
+      ShardData* sdata = shard_list[shard_index];
+      assert(NULL != sdata);
+      Mutex::Locker l(sdata->sdata_op_ordering_lock);
+      return sdata->pqueue.empty();
+    }
   } op_shardedwq;
 
 
@@ -1605,6 +1772,7 @@ private:
 	(*i)->put("PeeringWQ");
       }
     }
+    using ThreadPool::BatchWorkQueue<PG>::_process;
     void _process_finish(const list<PG *> &pgs) {
       for (list<PG*>::const_iterator i = pgs.begin();
 	   i != pgs.end();
@@ -1646,14 +1814,14 @@ private:
   void handle_osd_map(class MOSDMap *m);
   void note_down_osd(int osd);
   void note_up_osd(int osd);
-  
+
   bool advance_pg(
     epoch_t advance_to, PG *pg,
     ThreadPool::TPHandle &handle,
     PG::RecoveryCtx *rctx,
     set<boost::intrusive_ptr<PG> > *split_pgs
   );
-  void advance_map(ObjectStore::Transaction& t, C_Contexts *tfin);
+  void advance_map();
   void consume_map();
   void activate_map();
 
@@ -1842,6 +2010,12 @@ protected:
   void queue_want_up_thru(epoch_t want);
   void send_alive();
 
+  // -- full map requests --
+  epoch_t requested_full_first, requested_full_last;
+
+  void request_full_map(epoch_t first, epoch_t last);
+  void got_full_map(epoch_t e);
+
   // -- failures --
   map<int,utime_t> failure_queue;
   map<int,entity_inst_t> failure_pending;
@@ -1892,7 +2066,6 @@ protected:
 
   // -- generic pg peering --
   PG::RecoveryCtx create_context();
-  bool compat_must_dispatch_immediately(PG *pg);
   void dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
                         ThreadPool::TPHandle *handle = NULL);
   void dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
@@ -1982,6 +2155,9 @@ protected:
       osd->osd_lock.Unlock();
       delete c;
     }
+    void _process(Command *c, ThreadPool::TPHandle &tp) {
+      _process(c);
+    }
     void _clear() {
       while (!osd->command_queue.empty()) {
 	Command *c = osd->command_queue.front();
@@ -2000,7 +2176,7 @@ protected:
   utime_t defer_recovery_until;
   int recovery_ops_active;
 #ifdef DEBUG_RECOVERY_OIDS
-  map<spg_t, set<hobject_t> > recovery_oids;
+  map<spg_t, set<hobject_t, hobject_t::BitwiseComparator> > recovery_oids;
 #endif
 
   struct RecoveryWQ : public ThreadPool::WorkQueue<PG> {
@@ -2037,6 +2213,7 @@ protected:
       osd->do_recovery(pg, handle);
       pg->put("RecoveryWQ");
     }
+    using ThreadPool::WorkQueue<PG>::_process;
     void _clear() {
       while (!osd->recovery_queue.empty()) {
 	PG *pg = osd->recovery_queue.front();
@@ -2057,153 +2234,12 @@ protected:
   
   void check_replay_queue();
 
-
-  // -- snap trimming --
-  xlist<PG*> snap_trim_queue;
-  
-  struct SnapTrimWQ : public ThreadPool::WorkQueue<PG> {
-    OSD *osd;
-    SnapTrimWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
-      : ThreadPool::WorkQueue<PG>("OSD::SnapTrimWQ", ti, si, tp), osd(o) {}
-
-    bool _empty() {
-      return osd->snap_trim_queue.empty();
-    }
-    bool _enqueue(PG *pg) {
-      if (pg->snap_trim_item.is_on_list())
-	return false;
-      pg->get("SnapTrimWQ");
-      osd->snap_trim_queue.push_back(&pg->snap_trim_item);
-      return true;
-    }
-    void _dequeue(PG *pg) {
-      if (pg->snap_trim_item.remove_myself())
-	pg->put("SnapTrimWQ");
-    }
-    PG *_dequeue() {
-      if (osd->snap_trim_queue.empty())
-	return NULL;
-      PG *pg = osd->snap_trim_queue.front();
-      osd->snap_trim_queue.pop_front();
-      return pg;
-    }
-    void _process(PG *pg) {
-      pg->snap_trimmer();
-      pg->put("SnapTrimWQ");
-    }
-    void _clear() {
-      while (PG *pg = _dequeue()) {
-	pg->put("SnapTrimWQ");
-      }
-    }
-  } snap_trim_wq;
-
-
   // -- scrubbing --
   void sched_scrub();
   bool scrub_random_backoff();
-  bool scrub_should_schedule();
+  bool scrub_load_below_threshold();
   bool scrub_time_permit(utime_t now);
 
-  xlist<PG*> scrub_queue;
-
-  struct ScrubWQ : public ThreadPool::WorkQueue<PG> {
-    OSD *osd;
-    ScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
-      : ThreadPool::WorkQueue<PG>("OSD::ScrubWQ", ti, si, tp), osd(o) {}
-
-    bool _empty() {
-      return osd->scrub_queue.empty();
-    }
-    bool _enqueue(PG *pg) {
-      if (pg->scrub_item.is_on_list()) {
-	return false;
-      }
-      pg->get("ScrubWQ");
-      osd->scrub_queue.push_back(&pg->scrub_item);
-      return true;
-    }
-    void _dequeue(PG *pg) {
-      if (pg->scrub_item.remove_myself()) {
-	pg->put("ScrubWQ");
-      }
-    }
-    PG *_dequeue() {
-      if (osd->scrub_queue.empty())
-	return NULL;
-      PG *pg = osd->scrub_queue.front();
-      osd->scrub_queue.pop_front();
-      return pg;
-    }
-    void _process(
-      PG *pg,
-      ThreadPool::TPHandle &handle) {
-      pg->scrub(handle);
-      pg->put("ScrubWQ");
-    }
-    void _clear() {
-      while (!osd->scrub_queue.empty()) {
-	PG *pg = osd->scrub_queue.front();
-	osd->scrub_queue.pop_front();
-	pg->put("ScrubWQ");
-      }
-    }
-  } scrub_wq;
-
-  struct RepScrubWQ : public ThreadPool::WorkQueue<MOSDRepScrub> {
-  private: 
-    OSD *osd;
-    list<MOSDRepScrub*> rep_scrub_queue;
-
-  public:
-    RepScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
-      : ThreadPool::WorkQueue<MOSDRepScrub>("OSD::RepScrubWQ", ti, si, tp), osd(o) {}
-
-    bool _empty() {
-      return rep_scrub_queue.empty();
-    }
-    bool _enqueue(MOSDRepScrub *msg) {
-      rep_scrub_queue.push_back(msg);
-      return true;
-    }
-    void _dequeue(MOSDRepScrub *msg) {
-      assert(0); // Not applicable for this wq
-      return;
-    }
-    MOSDRepScrub *_dequeue() {
-      if (rep_scrub_queue.empty())
-	return NULL;
-      MOSDRepScrub *msg = rep_scrub_queue.front();
-      rep_scrub_queue.pop_front();
-      return msg;
-    }
-    void _process(
-      MOSDRepScrub *msg,
-      ThreadPool::TPHandle &handle) {
-      PG *pg = NULL;
-      {
-	Mutex::Locker lock(osd->osd_lock);
-	if (osd->is_stopping() ||
-	    !osd->_have_pg(msg->pgid)) {
-	  msg->put();
-	  return;
-	}
-	pg = osd->_lookup_lock_pg(msg->pgid);
-      }
-      assert(pg);
-      pg->replica_scrub(msg, handle);
-      msg->put();
-      pg->unlock();
-    }
-    void _clear() {
-      while (!rep_scrub_queue.empty()) {
-	MOSDRepScrub *msg = rep_scrub_queue.front();
-	rep_scrub_queue.pop_front();
-	msg->put();
-      }
-    }
-  } rep_scrub_wq;
-
   // -- removing --
   struct RemoveWQ :
     public ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> > {
@@ -2232,6 +2268,7 @@ protected:
       remove_queue.pop_front();
       return item;
     }
+    using ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> >::_process;
     void _process(pair<PGRef, DeletingStateRef>, ThreadPool::TPHandle &);
     void _clear() {
       remove_queue.clear();
@@ -2256,6 +2293,7 @@ protected:
     case MSG_OSD_EC_WRITE_REPLY:
     case MSG_OSD_EC_READ:
     case MSG_OSD_EC_READ_REPLY:
+    case MSG_OSD_REP_SCRUB:
       return true;
     default:
       return false;
@@ -2308,6 +2346,16 @@ protected:
 private:
   static int write_meta(ObjectStore *store,
 			uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);
+
+  void handle_scrub(struct MOSDScrub *m);
+  void handle_osd_ping(class MOSDPing *m);
+  void handle_op(OpRequestRef& op, OSDMapRef& osdmap);
+
+  template <typename T, int MSGTYPE>
+  void handle_replica_op(OpRequestRef& op, OSDMapRef& osdmap);
+
+  int init_op_flags(OpRequestRef& op);
+
 public:
   static int peek_meta(ObjectStore *store, string& magic,
 		       uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami);
@@ -2323,22 +2371,10 @@ public:
 
   void handle_signal(int signum);
 
-  void handle_rep_scrub(MOSDRepScrub *m);
-  void handle_scrub(struct MOSDScrub *m);
-  void handle_osd_ping(class MOSDPing *m);
-  void handle_op(OpRequestRef& op, OSDMapRef& osdmap);
-
-  template <typename T, int MSGTYPE>
-  void handle_replica_op(OpRequestRef& op, OSDMapRef& osdmap);
-
   /// check if we can throw out op from a disconnected client
-  static bool op_is_discardable(class MOSDOp *m);
-  /// check if op should be (re)queued for processing
-public:
-  void force_remount();
-
-  int init_op_flags(OpRequestRef& op);
+  static bool op_is_discardable(MOSDOp *m);
 
+public:
   OSDService service;
   friend class OSDService;
 };
diff --git a/src/osd/OSDCap.h b/src/osd/OSDCap.h
index 3fc7fb6..905fa55 100644
--- a/src/osd/OSDCap.h
+++ b/src/osd/OSDCap.h
@@ -91,7 +91,6 @@ struct OSDCapMatch {
   /**
    * check if given request parameters match our constraints
    *
-   * @param auid requesting user's auid
    * @param pool_name pool name
    * @param nspace_name namespace name
    * @param pool_auid pool's auid
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index a9154d4..d308186 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -590,6 +590,10 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl)
     bl.advance(-struct_v_size);
     decode_classic(bl);
     encode_features = 0;
+    if (struct_v >= 6)
+      encode_features = CEPH_FEATURE_PGID64;
+    else
+      encode_features = 0;
     return;
   }
   {
@@ -641,7 +645,7 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl)
     if (struct_v >= 2)
       ::decode(encode_features, bl);
     else
-      encode_features = 0;
+      encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
     DECODE_FINISH(bl); // osd-only data
   }
 
@@ -1066,9 +1070,12 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
 	 ++p) {
       const map<string,string> &profile = p->second;
       map<string,string>::const_iterator plugin = profile.find("plugin");
-      if (plugin != profile.end() && (plugin->second == "isa" ||
-				      plugin->second == "lrc"))
-	features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
+      if (plugin != profile.end()) {
+	if (plugin->second == "isa" || plugin->second == "lrc")
+	  features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
+	if (plugin->second == "shec")
+	  features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3;
+      }
     }
   }
   mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
@@ -2223,15 +2230,6 @@ void OSDMap::dump_erasure_code_profiles(const map<string,map<string,string> > &p
   f->close_section();
 }
 
-void OSDMap::dump_json(ostream& out) const
-{
-  JSONFormatter jsf(true);
-  jsf.open_object_section("osdmap");
-  dump(&jsf);
-  jsf.close_section();
-  jsf.flush(out);
-}
-
 void OSDMap::dump(Formatter *f) const
 {
   f->dump_int("epoch", get_epoch());
@@ -2318,7 +2316,7 @@ void OSDMap::dump(Formatter *f) const
   }
   f->close_section(); // primary_temp
 
-  f->open_array_section("blacklist");
+  f->open_object_section("blacklist");
   for (ceph::unordered_map<entity_addr_t,utime_t>::const_iterator p = blacklist.begin();
        p != blacklist.end();
        ++p) {
@@ -2377,6 +2375,8 @@ string OSDMap::get_flag_string(unsigned f)
     s += ",nodeep-scrub";
   if (f & CEPH_OSDMAP_NOTIERAGENT)
     s += ",notieragent";
+  if (f & CEPH_OSDMAP_SORTBITWISE)
+    s += ",sortbitwise";
   if (s.length())
     s.erase(0, 1);
   return s;
@@ -2561,15 +2561,16 @@ private:
   const OSDMap *osdmap;
 };
 
-void OSDMap::print_tree(ostream *out, Formatter *f) const
+void OSDMap::print_tree(Formatter *f, ostream *out) const
 {
-  if (out) {
+  if (f)
+    OSDTreeFormattingDumper(crush.get(), this).dump(f);
+  else {
+    assert(out);
     TextTable tbl;
     OSDTreePlainDumper(crush.get(), this).dump(&tbl);
     *out << tbl;
   }
-  if (f)
-    OSDTreeFormattingDumper(crush.get(), this).dump(f);
 }
 
 void OSDMap::print_summary(Formatter *f, ostream& out) const
@@ -2671,6 +2672,7 @@ int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
     r = build_simple_crush_map(cct, *crush, nosd, &ss);
   else
     r = build_simple_crush_map_from_conf(cct, *crush, &ss);
+  assert(r == 0);
 
   int poolbase = get_max_osd() ? get_max_osd() : 1;
 
@@ -2701,9 +2703,6 @@ int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
     name_pool[*p] = pool;
   }
 
-  if (r < 0)
-    lderr(cct) << ss.str() << dendl;
-  
   for (int i=0; i<get_max_osd(); i++) {
     set_state(i, 0);
     set_weight(i, CEPH_OSD_OUT);
@@ -2726,8 +2725,6 @@ int OSDMap::get_erasure_code_profile_default(CephContext *cct,
   int r = get_json_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
 		      *ss,
 		      &profile_map);
-  profile_map["directory"] =
-    cct->_conf->osd_pool_default_erasure_code_directory;
   return r;
 }
 
@@ -2857,12 +2854,18 @@ int OSDMap::build_simple_crush_rulesets(CephContext *cct,
 					const string& root,
 					ostream *ss)
 {
+  int crush_ruleset =
+      crush._get_osd_pool_default_crush_replicated_ruleset(cct, true);
   string failure_domain =
     crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
 
+  if (crush_ruleset == CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
+    crush_ruleset = -1; // create ruleset 0 by default
+
   int r;
-  r = crush.add_simple_ruleset("replicated_ruleset", root, failure_domain,
-			       "firstn", pg_pool_t::TYPE_REPLICATED, ss);
+  r = crush.add_simple_ruleset_at("replicated_ruleset", root, failure_domain,
+                                  "firstn", pg_pool_t::TYPE_REPLICATED,
+                                  crush_ruleset, ss);
   if (r < 0)
     return r;
   // do not add an erasure rule by default or else we will implicitly
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 3e17d30..39e0ef0 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -125,7 +125,7 @@ public:
     int32_t new_flags;
 
     // full (rare)
-    bufferlist fullmap;  // in leiu of below.
+    bufferlist fullmap;  // in lieu of below.
     bufferlist crush;
 
     // incremental
@@ -260,7 +260,6 @@ private:
 
   friend class OSDMonitor;
   friend class PGMonitor;
-  friend class MDS;
 
  public:
   OSDMap() : epoch(0), 
@@ -341,7 +340,7 @@ public:
   }
 
   int get_flags() const { return flags; }
-  int test_flag(int f) const { return flags & f; }
+  bool test_flag(int f) const { return flags & f; }
   void set_flag(int f) { flags |= f; }
   void clear_flag(int f) { flags &= ~f; }
 
@@ -430,6 +429,10 @@ public:
     return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
   }
 
+  bool has_been_up_since(int osd, epoch_t epoch) const {
+    return is_up(osd) && get_up_from(osd) <= epoch;
+  }
+
   bool is_down(int osd) const {
     return !is_up(osd);
   }
@@ -842,13 +845,12 @@ public:
   void print_pools(ostream& out) const;
   void print_summary(Formatter *f, ostream& out) const;
   void print_oneline_summary(ostream& out) const;
-  void print_tree(ostream *out, Formatter *f) const;
+  void print_tree(Formatter *f, ostream *out) const;
 
   string get_flag_string() const;
   static string get_flag_string(unsigned flags);
   static void dump_erasure_code_profiles(const map<string,map<string,string> > &profiles,
 					 Formatter *f);
-  void dump_json(ostream& out) const;
   void dump(Formatter *f) const;
   static void generate_test_instances(list<OSDMap*>& o);
   bool check_new_blacklist_entries() const { return new_blacklist_entries; }
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
index 1296334..65011a1 100644
--- a/src/osd/OpRequest.cc
+++ b/src/osd/OpRequest.cc
@@ -14,7 +14,11 @@
 #include "osd/osd_types.h"
 
 #ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
 #include "tracing/oprequest.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
 #else
 #define tracepoint(...)
 #endif
@@ -23,7 +27,8 @@ OpRequest::OpRequest(Message *req, OpTracker *tracker) :
   TrackedOp(tracker, req->get_recv_stamp()),
   rmw_flags(0), request(req),
   hit_flag_points(0), latest_flag_point(0),
-  send_map_update(false), sent_epoch(0) {
+  send_map_update(false), sent_epoch(0),
+  hitset_inserted(false) {
   if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) {
     // don't warn as quickly for low priority ops
     warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple;
@@ -98,7 +103,13 @@ bool OpRequest::need_class_write_cap() {
   return check_rmw(CEPH_OSD_RMW_FLAG_CLASS_WRITE);
 }
 bool OpRequest::need_promote() {
-  return check_rmw(CEPH_OSD_RMW_FLAG_PROMOTE);
+  return check_rmw(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE);
+}
+bool OpRequest::need_skip_handle_cache() {
+  return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE);
+}
+bool OpRequest::need_skip_promote() {
+  return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE);
 }
 
 void OpRequest::set_rmw_flags(int flags) {
@@ -117,7 +128,9 @@ void OpRequest::set_class_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_READ);
 void OpRequest::set_class_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_WRITE); }
 void OpRequest::set_pg_op() { set_rmw_flags(CEPH_OSD_RMW_FLAG_PGOP); }
 void OpRequest::set_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CACHE); }
-void OpRequest::set_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_PROMOTE); }
+void OpRequest::set_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE); }
+void OpRequest::set_skip_handle_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE); }
+void OpRequest::set_skip_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE); }
 
 void OpRequest::mark_flag_point(uint8_t flag, const string& s) {
 #ifdef WITH_LTTNG
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
index 88a2704..c8649b8 100644
--- a/src/osd/OpRequest.h
+++ b/src/osd/OpRequest.h
@@ -66,6 +66,8 @@ struct OpRequest : public TrackedOp {
   bool need_class_read_cap();
   bool need_class_write_cap();
   bool need_promote();
+  bool need_skip_handle_cache();
+  bool need_skip_promote();
   void set_read();
   void set_write();
   void set_cache();
@@ -73,6 +75,8 @@ struct OpRequest : public TrackedOp {
   void set_class_write();
   void set_pg_op();
   void set_promote();
+  void set_skip_handle_cache();
+  void set_skip_promote();
 
   void _dump(utime_t now, Formatter *f) const;
 
@@ -105,6 +109,7 @@ public:
   }
   bool send_map_update;
   epoch_t sent_epoch;
+  bool hitset_inserted;
   Message *get_req() const { return request; }
   bool been_queued_for_pg() { return hit_flag_points & flag_queued_for_pg; }
   bool been_reached_pg() { return hit_flag_points & flag_reached_pg; }
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 7b91bf8..c13321c 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -45,7 +45,11 @@
 #include "common/BackTrace.h"
 
 #ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
 #include "tracing/pg.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
 #else
 #define tracepoint(...)
 #endif
@@ -56,8 +60,6 @@
 #undef dout_prefix
 #define dout_prefix _prefix(_dout, this)
 
-static coll_t META_COLL("meta");
-
 // prefix pgmeta_oid keys with _ so that PGLog::read_log() can
 // easily skip them
 const string infover_key("_infover");
@@ -195,7 +197,9 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   coll(p), pg_log(cct),
   pgmeta_oid(p.make_pgmeta_oid()),
   missing_loc(this),
-  recovery_item(this), scrub_item(this), snap_trim_item(this), stat_queue_item(this),
+  recovery_item(this), stat_queue_item(this),
+  snap_trim_queued(false),
+  scrub_queued(false),
   recovery_ops_active(0),
   role(0),
   state(0),
@@ -215,7 +219,10 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   active_pushes(0),
   recovery_state(this),
   pg_id(p),
-  peer_features((uint64_t)-1)
+  peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
+  acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
+  upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
+  do_sort_bitwise(false)
 {
 #ifdef PG_DEBUG_REFS
   osd->add_pgid(p, this);
@@ -236,7 +243,7 @@ void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
   handle.reset_tp_timeout();
 }
 
-void PG::lock(bool no_lockdep)
+void PG::lock(bool no_lockdep) const
 {
   _lock.Lock(no_lockdep);
   // if we have unrecorded dirty state with the lock dropped, there is a bug
@@ -304,7 +311,7 @@ void PG::proc_replica_log(
   dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
   might_have_unfound.insert(from);
 
-  for (map<hobject_t, pg_missing_t::item>::iterator i = omissing.missing.begin();
+  for (map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator i = omissing.missing.begin();
        i != omissing.missing.end();
        ++i) {
     dout(20) << " after missing " << i->first << " need " << i->second.need
@@ -313,7 +320,8 @@ void PG::proc_replica_log(
   peer_missing[from].swap(omissing);
 }
 
-bool PG::proc_replica_info(pg_shard_t from, const pg_info_t &oinfo)
+bool PG::proc_replica_info(
+  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
 {
   map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
   if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
@@ -321,6 +329,12 @@ bool PG::proc_replica_info(pg_shard_t from, const pg_info_t &oinfo)
     return false;
   }
 
+  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
+    dout(10) << " got info " << oinfo << " from down osd." << from
+	     << " discarding" << dendl;
+    return false;
+  }
+
   dout(10) << " got osd." << from << " " << oinfo << dendl;
   assert(is_primary());
   peer_info[from] = oinfo;
@@ -420,7 +434,7 @@ bool PG::search_for_missing(
 {
   unsigned num_unfound_before = missing_loc.num_unfound();
   bool found_missing = missing_loc.add_source_info(
-    from, oinfo, omissing, ctx->handle);
+    from, oinfo, omissing, get_sort_bitwise(), ctx->handle);
   if (found_missing && num_unfound_before != missing_loc.num_unfound())
     publish_stats_to_osd();
   if (found_missing &&
@@ -457,15 +471,28 @@ bool PG::MissingLoc::readable_with_acting(
   return (*is_readable)(have_acting);
 }
 
+void PG::MissingLoc::add_batch_sources_info(
+  const set<pg_shard_t> &sources)
+{
+  dout(10) << __func__ << ": adding sources in batch " << sources.size() << dendl;
+  for (map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator i = needs_recovery_map.begin();
+      i != needs_recovery_map.end();
+      ++i) {
+    missing_loc[i->first].insert(sources.begin(), sources.end());
+    missing_loc_sources.insert(sources.begin(), sources.end());
+  }
+}
+
 bool PG::MissingLoc::add_source_info(
   pg_shard_t fromosd,
   const pg_info_t &oinfo,
   const pg_missing_t &omissing,
+  bool sort_bitwise,
   ThreadPool::TPHandle* handle)
 {
   bool found_missing = false;
   // found items?
-  for (map<hobject_t,pg_missing_t::item>::const_iterator p = needs_recovery_map.begin();
+  for (map<hobject_t,pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator p = needs_recovery_map.begin();
        p != needs_recovery_map.end();
        ++p) {
     const hobject_t &soid(p->first);
@@ -480,7 +507,16 @@ bool PG::MissingLoc::add_source_info(
 	       << dendl;
       continue;
     }
-    if (p->first >= oinfo.last_backfill) {
+    if (oinfo.last_backfill != hobject_t::get_max() &&
+	oinfo.last_backfill_bitwise != sort_bitwise) {
+      dout(10) << "search_for_missing " << soid << " " << need
+	       << " also missing on osd." << fromosd
+	       << " (last_backfill " << oinfo.last_backfill
+	       << " but with wrong sort order)"
+	       << dendl;
+      continue;
+    }
+    if (cmp(p->first, oinfo.last_backfill, sort_bitwise) >= 0) {
       // FIXME: this is _probably_ true, although it could conceivably
       // be in the undefined region!  Hmm!
       dout(10) << "search_for_missing " << soid << " " << need
@@ -572,21 +608,17 @@ bool PG::needs_recovery() const
 {
   assert(is_primary());
 
-  bool ret = false;
-
   const pg_missing_t &missing = pg_log.get_missing();
 
   if (missing.num_missing()) {
     dout(10) << __func__ << " primary has " << missing.num_missing()
       << " missing" << dendl;
-
-    ret = true;
+    return true;
   }
 
   assert(!actingbackfill.empty());
   set<pg_shard_t>::const_iterator end = actingbackfill.end();
   set<pg_shard_t>::const_iterator a = actingbackfill.begin();
-  assert(a != end);
   for (; a != end; ++a) {
     if (*a == get_primary()) continue;
     pg_shard_t peer = *a;
@@ -594,19 +626,17 @@ bool PG::needs_recovery() const
     if (pm == peer_missing.end()) {
       dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
         << dendl;
-      ret = true;
       continue;
     }
     if (pm->second.num_missing()) {
       dout(10) << __func__ << " osd." << peer << " has "
         << pm->second.num_missing() << " missing" << dendl;
-      ret = true;
+      return true;
     }
   }
 
-  if (!ret)
-    dout(10) << __func__ << " is recovered" << dendl;
-  return ret;
+  dout(10) << __func__ << " is recovered" << dendl;
+  return false;
 }
 
 bool PG::needs_backfill() const
@@ -635,7 +665,12 @@ bool PG::needs_backfill() const
 
 bool PG::_calc_past_interval_range(epoch_t *start, epoch_t *end, epoch_t oldest_map)
 {
-  *end = info.history.same_interval_since;
+  if (info.history.same_interval_since) {
+    *end = info.history.same_interval_since;
+  } else {
+    // PG must be imported, so let's calculate the whole range.
+    *end = osdmap_ref->get_epoch();
+  }
 
   // Do we already have the intervals we want?
   map<epoch_t,pg_interval_t>::const_iterator pif = past_intervals.begin();
@@ -666,6 +701,8 @@ void PG::generate_past_intervals()
   epoch_t cur_epoch, end_epoch;
   if (!_calc_past_interval_range(&cur_epoch, &end_epoch,
       osd->get_superblock().oldest_map)) {
+    if (info.history.same_interval_since == 0)
+      info.history.same_interval_since = end_epoch;
     return;
   }
 
@@ -720,6 +757,15 @@ void PG::generate_past_intervals()
     }
   }
 
+  // PG import needs recalculated same_interval_since
+  if (info.history.same_interval_since == 0) {
+    assert(same_interval_since);
+    dout(10) << __func__ << " fix same_interval_since " << same_interval_since << " pg " << *this << dendl;
+    dout(10) << __func__ << " past_intervals " << past_intervals << dendl;
+    // Fix it
+    info.history.same_interval_since = same_interval_since;
+  }
+
   // record our work.
   dirty_info = true;
   dirty_big_info = true;
@@ -795,6 +841,8 @@ bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
     if (iter != peer_info.end() &&
         (iter->second.is_empty() || iter->second.dne()))
       continue;
+    if (!osdmap->exists(peer->osd))
+      continue;
     const osd_info_t &osd_info(osdmap->get_info(peer->osd));
     if (osd_info.lost_at <= osd_info.up_from) {
       // If there is even one OSD in might_have_unfound that isn't lost, we
@@ -807,7 +855,7 @@ bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
   return true;
 }
 
-void PG::build_prior(std::auto_ptr<PriorSet> &prior_set)
+void PG::build_prior(std::unique_ptr<PriorSet> &prior_set)
 {
   if (1) {
     // sanity check
@@ -879,7 +927,6 @@ void PG::clear_primary_state()
   scrub_after_recovery = false;
 
   osd->recovery_wq.dequeue(this);
-  osd->snap_trim_wq.dequeue(this);
 
   agent_clear();
 }
@@ -1578,8 +1625,22 @@ void PG::activate(ObjectStore::Transaction& t,
 
       bool needs_past_intervals = pi.dne();
 
-      if (pi.last_update == info.last_update) {
+      /*
+       * cover case where peer sort order was different and
+       * last_backfill cannot be interpreted
+       */
+      bool force_restart_backfill =
+	!pi.last_backfill.is_max() &&
+	pi.last_backfill_bitwise != get_sort_bitwise();
+
+      if (pi.last_update == info.last_update && !force_restart_backfill) {
         // empty log
+	if (!pi.last_backfill.is_max())
+	  osd->clog->info() << info.pgid << " continuing backfill to osd."
+			    << peer
+			    << " from (" << pi.log_tail << "," << pi.last_update
+			    << "] " << pi.last_backfill
+			    << " to " << info.last_update;
 	if (!pi.is_empty() && activator_map) {
 	  dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
 	  (*activator_map)[peer.osd].push_back(
@@ -1599,8 +1660,9 @@ void PG::activate(ObjectStore::Transaction& t,
       } else if (
 	pg_log.get_tail() > pi.last_update ||
 	pi.last_backfill == hobject_t() ||
+	force_restart_backfill ||
 	(backfill_targets.count(*i) && pi.last_backfill.is_max())) {
-	/* This last case covers a situation where a replica is not contiguous
+	/* ^ This last case covers a situation where a replica is not contiguous
 	 * with the auth_log, but is contiguous with this replica.  Reshuffling
 	 * the active set to handle this would be tricky, so instead we just go
 	 * ahead and backfill it anyway.  This is probably preferrable in any
@@ -1608,13 +1670,14 @@ void PG::activate(ObjectStore::Transaction& t,
 	 * behind.
 	 */
 	// backfill
-	osd->clog->info() << info.pgid << " restarting backfill on osd." << peer
-			 << " from (" << pi.log_tail << "," << pi.last_update << "] " << pi.last_backfill
+	osd->clog->info() << info.pgid << " starting backfill to osd." << peer
+			 << " from (" << pi.log_tail << "," << pi.last_update
+			  << "] " << pi.last_backfill
 			 << " to " << info.last_update;
 
 	pi.last_update = info.last_update;
 	pi.last_complete = info.last_update;
-	pi.last_backfill = hobject_t();
+	pi.set_last_backfill(hobject_t(), get_sort_bitwise());
 	pi.last_epoch_started = info.last_epoch_started;
 	pi.history = info.history;
 	pi.hit_set = info.hit_set;
@@ -1654,7 +1717,7 @@ void PG::activate(ObjectStore::Transaction& t,
         for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
              p != m->log.log.end();
              ++p)
-	  if (p->soid <= pi.last_backfill)
+	  if (cmp(p->soid, pi.last_backfill, get_sort_bitwise()) <= 0)
 	    pm.add_next_event(*p);
       }
       
@@ -1677,14 +1740,19 @@ void PG::activate(ObjectStore::Transaction& t,
     }
 
     // Set up missing_loc
+    set<pg_shard_t> complete_shards;
     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
 	 i != actingbackfill.end();
 	 ++i) {
       if (*i == get_primary()) {
-	missing_loc.add_active_missing(pg_log.get_missing());
+	missing_loc.add_active_missing(missing);
+        if (!missing.have_missing())
+          complete_shards.insert(*i);
       } else {
 	assert(peer_missing.count(*i));
 	missing_loc.add_active_missing(peer_missing[*i]);
+        if (!peer_missing[*i].have_missing() && peer_info[*i].last_backfill == hobject_t::get_max())
+          complete_shards.insert(*i);
       }
     }
     // If necessary, create might_have_unfound to help us find our unfound objects.
@@ -1692,19 +1760,29 @@ void PG::activate(ObjectStore::Transaction& t,
     // past intervals.
     might_have_unfound.clear();
     if (needs_recovery()) {
-      missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(), ctx->handle);
-      for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-	   i != actingbackfill.end();
-	   ++i) {
-	if (*i == pg_whoami) continue;
-	dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
-	assert(peer_missing.count(*i));
-	assert(peer_info.count(*i));
-	missing_loc.add_source_info(
-	  *i,
-	  peer_info[*i],
-	  peer_missing[*i],
-          ctx->handle);
+      // If only one shard has missing, we do a trick to add all others as recovery
+      // source, this is considered safe since the PGLogs have been merged locally,
+      // and covers vast majority of the use cases, like one OSD/host is down for
+      // a while for hardware repairing
+      if (complete_shards.size() + 1 == actingbackfill.size()) {
+        missing_loc.add_batch_sources_info(complete_shards);
+      } else {
+        missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
+				    get_sort_bitwise(), ctx->handle);
+        for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+	     i != actingbackfill.end();
+	     ++i) {
+	  if (*i == pg_whoami) continue;
+	  dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
+	  assert(peer_missing.count(*i));
+	  assert(peer_info.count(*i));
+	  missing_loc.add_source_info(
+	    *i,
+	    peer_info[*i],
+	    peer_missing[*i],
+	    get_sort_bitwise(),
+            ctx->handle);
+        }
       }
       for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
 	   i != peer_missing.end();
@@ -1798,12 +1876,15 @@ void PG::queue_op(OpRequestRef& op)
   if (!waiting_for_map.empty()) {
     // preserve ordering
     waiting_for_map.push_back(op);
+    op->mark_delayed("waiting_for_map not empty");
     return;
   }
   if (op_must_wait_for_map(get_osdmap_with_maplock()->get_epoch(), op)) {
     waiting_for_map.push_back(op);
+    op->mark_delayed("op must wait for map");
     return;
   }
+  op->mark_queued_for_pg();
   osd->op_wq.queue(make_pair(PGRef(this), op));
   {
     // after queue() to include any locking costs
@@ -1818,6 +1899,7 @@ void PG::queue_op(OpRequestRef& op)
 void PG::replay_queued_ops()
 {
   assert(is_replay());
+  assert(is_active() || is_activating());
   eversion_t c = info.last_update;
   list<OpRequestRef> replay;
   dout(10) << "replay_queued_ops" << dendl;
@@ -1838,9 +1920,13 @@ void PG::replay_queued_ops()
     replay.push_back(p->second);
   }
   replay_queue.clear();
-  requeue_ops(replay);
-  requeue_ops(waiting_for_active);
-  assert(waiting_for_peered.empty());
+  if (is_active()) {
+    requeue_ops(replay);
+    requeue_ops(waiting_for_active);
+    assert(waiting_for_peered.empty());
+  } else {
+    waiting_for_active.splice(waiting_for_active.begin(), replay);
+  }
 
   publish_stats_to_osd();
 }
@@ -1918,10 +2004,27 @@ void PG::all_activated_and_committed()
 
 void PG::queue_snap_trim()
 {
-  if (osd->queue_for_snap_trim(this))
+  if (snap_trim_queued) {
+    dout(10) << "queue_snap_trim -- already queued" << dendl;
+  } else {
     dout(10) << "queue_snap_trim -- queuing" << dendl;
-  else
-    dout(10) << "queue_snap_trim -- already trimming" << dendl;
+    snap_trim_queued = true;
+    osd->queue_for_snap_trim(this);
+  }
+}
+
+bool PG::requeue_scrub()
+{
+  assert(is_locked());
+  if (scrub_queued) {
+    dout(10) << __func__ << ": already queued" << dendl;
+    return false;
+  } else {
+    dout(10) << __func__ << ": queueing" << dendl;
+    scrub_queued = true;
+    osd->queue_for_scrub(this);
+    return true;
+  }
 }
 
 bool PG::queue_scrub()
@@ -1940,7 +2043,7 @@ bool PG::queue_scrub()
     state_set(PG_STATE_REPAIR);
     scrubber.must_repair = false;
   }
-  osd->queue_for_scrub(this);
+  requeue_scrub();
   return true;
 }
 
@@ -2100,7 +2203,9 @@ void PG::split_ops(PG *child, unsigned split_bits) {
   assert(waiting_for_active.empty());
   split_replay_queue(&replay_queue, &(child->replay_queue), match, split_bits);
 
+  snap_trim_queued = false;
   osd->dequeue_pg(this, &waiting_for_peered);
+
   OSD::split_list(
     &waiting_for_peered, &(child->waiting_for_peered), match, split_bits);
   {
@@ -2137,7 +2242,7 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
   // Info
   child->info.history = info.history;
   child->info.purged_snaps = info.purged_snaps;
-  child->info.last_backfill = info.last_backfill;
+  child->info.set_last_backfill(info.last_backfill, info.last_backfill_bitwise);
 
   child->info.stats = info.stats;
   info.stats.stats_invalid = true;
@@ -2531,12 +2636,12 @@ void PG::init(
 
   if (backfill) {
     dout(10) << __func__ << ": Setting backfill" << dendl;
-    info.last_backfill = hobject_t();
+    info.set_last_backfill(hobject_t(), get_sort_bitwise());
     info.last_complete = info.last_update;
     pg_log.mark_log_for_rewrite();
   }
 
-  reg_next_scrub();
+  on_new_interval();
 
   dirty_info = true;
   dirty_big_info = true;
@@ -2547,21 +2652,19 @@ void PG::init(
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
-void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
+void PG::upgrade(ObjectStore *store)
 {
   assert(info_struct_v <= 8);
   ObjectStore::Transaction t;
 
-  if (info_struct_v < 7) {
-    _upgrade_v7(store, snapcolls);
-  }
+  assert(info_struct_v == 7);
 
   // 7 -> 8
   pg_log.mark_log_for_rewrite();
-  hobject_t log_oid(OSD::make_pg_log_oid(pg_id));
-  hobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
-  t.remove(META_COLL, log_oid);
-  t.remove(META_COLL, biginfo_oid);
+  ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
+  ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
+  t.remove(coll_t::meta(), log_oid);
+  t.remove(coll_t::meta(), biginfo_oid);
   t.collection_rmattr(coll, "info");
 
   t.touch(coll, pgmeta_oid);
@@ -2574,175 +2677,54 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
   dirty_big_info = true;
   write_if_dirty(t);
 
-  int r = store->apply_transaction(t);
+  ceph::shared_ptr<ObjectStore::Sequencer> osr(
+    new ObjectStore::Sequencer("upgrade"));
+  int r = store->apply_transaction(osr.get(), t);
   if (r != 0) {
     derr << __func__ << ": apply_transaction returned "
 	 << cpp_strerror(r) << dendl;
     assert(0);
   }
   assert(r == 0);
+
+  C_SaferCond waiter;
+  if (!osr->flush_commit(&waiter)) {
+    waiter.wait();
+  }
 }
 
 #pragma GCC diagnostic pop
 #pragma GCC diagnostic warning "-Wpragmas"
 
-void PG::_upgrade_v7(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
+int PG::_prepare_write_info(map<string,bufferlist> *km,
+			    epoch_t epoch,
+			    pg_info_t &info, coll_t coll,
+			    map<epoch_t,pg_interval_t> &past_intervals,
+			    ghobject_t &pgmeta_oid,
+			    bool dirty_big_info)
 {
-  unsigned removed = 0;
-  for (interval_set<snapid_t>::const_iterator i = snapcolls.begin();
-       i != snapcolls.end();
-       ++i) {
-    for (snapid_t next_dir = i.get_start();
-	 next_dir != i.get_start() + i.get_len();
-	 ++next_dir) {
-      ++removed;
-      coll_t cid(info.pgid, next_dir);
-      dout(1) << "Removing collection " << cid
-	      << " (" << removed << "/" << snapcolls.size()
-	      << ")" << dendl;
-
-      hobject_t cur;
-      vector<hobject_t> objects;
-      while (1) {
-	int r = get_pgbackend()->objects_list_partial(
-	  cur,
-	  store->get_ideal_list_min(),
-	  store->get_ideal_list_max(),
-	  0,
-	  &objects,
-	  &cur);
-	if (r != 0) {
-	  derr << __func__ << ": collection_list_partial returned "
-	       << cpp_strerror(r) << dendl;
-	  assert(0);
-	}
-	if (objects.empty()) {
-	  assert(cur.is_max());
-	  break;
-	}
-	ObjectStore::Transaction t;
-	for (vector<hobject_t>::iterator j = objects.begin();
-	     j != objects.end();
-	     ++j) {
-	  t.remove(cid, *j);
-	}
-	r = store->apply_transaction(t);
-	if (r != 0) {
-	  derr << __func__ << ": apply_transaction returned "
-	       << cpp_strerror(r) << dendl;
-	  assert(0);
-	}
-	objects.clear();
-      }
-      ObjectStore::Transaction t;
-      t.remove_collection(cid);
-      int r = store->apply_transaction(t);
-      if (r != 0) {
-	derr << __func__ << ": apply_transaction returned "
-	     << cpp_strerror(r) << dendl;
-	assert(0);
-      }
-    }
-  }
-
-  hobject_t cur;
-  coll_t cid(info.pgid);
-  unsigned done = 0;
-  vector<hobject_t> objects;
-  while (1) {
-    dout(1) << "Updating snap_mapper from main collection, "
-	    << done << " objects done" << dendl;
-    int r = get_pgbackend()->objects_list_partial(
-      cur,
-      store->get_ideal_list_min(),
-      store->get_ideal_list_max(),
-      0,
-      &objects,
-      &cur);
-    if (r != 0) {
-      derr << __func__ << ": collection_list_partial returned "
-	   << cpp_strerror(r) << dendl;
-      assert(0);
-    }
-    if (objects.empty()) {
-      assert(cur.is_max());
-      break;
-    }
-    done += objects.size();
-    ObjectStore::Transaction t;
-    for (vector<hobject_t>::iterator j = objects.begin();
-	 j != objects.end();
-	 ++j) {
-      if (j->snap < CEPH_MAXSNAP) {
-	OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
-	bufferlist bl;
-	r = get_pgbackend()->objects_get_attr(
-	  *j,
-	  OI_ATTR,
-	  &bl);
-	if (r < 0) {
-	  derr << __func__ << ": getattr returned "
-	       << cpp_strerror(r) << dendl;
-	  assert(0);
-	}
-	object_info_t oi(bl);
-	set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end());
-	set<snapid_t> cur_snaps;
-	r = snap_mapper.get_snaps(*j, &cur_snaps);
-	if (r == 0) {
-	  assert(cur_snaps == oi_snaps);
-	} else if (r == -ENOENT) {
-	  snap_mapper.add_oid(*j, oi_snaps, &_t);
-	} else {
-	  derr << __func__ << ": get_snaps returned "
-	       << cpp_strerror(r) << dendl;
-	  assert(0);
-	}
-      }
-    }
-    r = store->apply_transaction(t);
-    if (r != 0) {
-      derr << __func__ << ": apply_transaction returned "
-	   << cpp_strerror(r) << dendl;
-      assert(0);
-    }
-    objects.clear();
-  }
-}
-
-int PG::_write_info(ObjectStore::Transaction& t, epoch_t epoch,
-		    pg_info_t &info, coll_t coll,
-		    map<epoch_t,pg_interval_t> &past_intervals,
-		    ghobject_t &pgmeta_oid,
-		    bool dirty_big_info)
-{
-  // pg state
-  map<string,bufferlist> v;
-
   // info.  store purged_snaps separately.
   interval_set<snapid_t> purged_snaps;
-  ::encode(epoch, v[epoch_key]);
+  ::encode(epoch, (*km)[epoch_key]);
   purged_snaps.swap(info.purged_snaps);
-  ::encode(info, v[info_key]);
+  ::encode(info, (*km)[info_key]);
   purged_snaps.swap(info.purged_snaps);
 
   if (dirty_big_info) {
     // potentially big stuff
-    bufferlist& bigbl = v[biginfo_key];
+    bufferlist& bigbl = (*km)[biginfo_key];
     ::encode(past_intervals, bigbl);
     ::encode(info.purged_snaps, bigbl);
     //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
   }
 
-  t.omap_setkeys(coll, pgmeta_oid, v);
-
   return 0;
 }
 
-void PG::_create(ObjectStore::Transaction& t, spg_t pgid)
+void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
 {
   coll_t coll(pgid);
-  t.create_collection(coll);
+  t.create_collection(coll, bits);
 }
 
 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
@@ -2768,14 +2750,14 @@ void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
   t.omap_setkeys(coll, pgmeta_oid, values);
 }
 
-void PG::write_info(ObjectStore::Transaction& t)
+void PG::prepare_write_info(map<string,bufferlist> *km)
 {
   info.stats.stats.add(unstable_stats);
   unstable_stats.clear();
 
-  int ret = _write_info(t, get_osdmap()->get_epoch(), info, coll,
-			past_intervals, pgmeta_oid,
-			dirty_big_info);
+  int ret = _prepare_write_info(km, get_osdmap()->get_epoch(), info, coll,
+				past_intervals, pgmeta_oid,
+				dirty_big_info);
   assert(ret == 0);
   last_persisted_osdmap_ref = osdmap_ref;
 
@@ -2814,17 +2796,14 @@ int PG::peek_map_epoch(ObjectStore *store,
 		       bufferlist *bl)
 {
   coll_t coll(pgid);
-  hobject_t legacy_infos_oid(OSD::make_infos_oid());
+  ghobject_t legacy_infos_oid(OSD::make_infos_oid());
   ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
   epoch_t cur_epoch = 0;
 
   assert(bl);
   {
     // validate collection name
-    spg_t pgid_temp;
-    snapid_t snap;
-    bool ok = coll.is_pg(pgid_temp, snap);
-    assert(ok);
+    assert(coll.is_pg());
   }
 
   // try for v8
@@ -2848,16 +2827,18 @@ int PG::peek_map_epoch(ObjectStore *store,
   } else if (r == -ENOENT) {
     // legacy: try v7 or older
     r = store->collection_getattr(coll, "info", *bl);
-    assert(r > 0);
+    if (r <= 0) {
+      // probably bug 10617; see OSD::load_pgs()
+      return -1;
+    }
     bufferlist::iterator bp = bl->begin();
     __u8 struct_v = 0;
     ::decode(struct_v, bp);
-    if (struct_v < 5)
-      return 0;
+    assert(struct_v >= 5);
     if (struct_v < 6) {
       ::decode(cur_epoch, bp);
       *pepoch = cur_epoch;
-      return 0;
+      return cur_epoch;
     }
 
     // get epoch out of leveldb
@@ -2865,11 +2846,9 @@ int PG::peek_map_epoch(ObjectStore *store,
     keys.clear();
     values.clear();
     keys.insert(ek);
-    store->omap_get_values(META_COLL, legacy_infos_oid, keys, &values);
+    store->omap_get_values(coll_t::meta(), legacy_infos_oid, keys, &values);
     if (values.size() < 1) {
-      // see #13060: this suggests we failed to upgrade this pg
-      // because it was a zombie and then removed the legacy infos
-      // object.  skip it.
+      // probably bug 10617; see OSD::load_pgs()
       return -1;
     }
     bufferlist::iterator p = values[ek].begin();
@@ -2877,6 +2856,7 @@ int PG::peek_map_epoch(ObjectStore *store,
   } else {
     assert(0 == "unable to open pg metadata");
   }
+
   *pepoch = cur_epoch;
   return 0;
 }
@@ -2886,9 +2866,12 @@ int PG::peek_map_epoch(ObjectStore *store,
 
 void PG::write_if_dirty(ObjectStore::Transaction& t)
 {
+  map<string,bufferlist> km;
   if (dirty_big_info || dirty_info)
-    write_info(t);
-  pg_log.write_log(t, coll, pgmeta_oid);
+    prepare_write_info(&km);
+  pg_log.write_log(t, &km, coll, pgmeta_oid);
+  if (!km.empty())
+    t.omap_setkeys(coll, pgmeta_oid, km);
 }
 
 void PG::trim_peers()
@@ -2913,7 +2896,7 @@ void PG::trim_peers()
   }
 }
 
-void PG::add_log_entry(const pg_log_entry_t& e, bufferlist& log_bl)
+void PG::add_log_entry(const pg_log_entry_t& e)
 {
   // raise last_complete only if we were previously up to date
   if (info.last_complete == info.last_update)
@@ -2931,8 +2914,6 @@ void PG::add_log_entry(const pg_log_entry_t& e, bufferlist& log_bl)
   // log mutation
   pg_log.add(e);
   dout(10) << "add_log_entry " << e << dendl;
-
-  e.encode_with_checksum(log_bl);
 }
 
 
@@ -2956,11 +2937,10 @@ void PG::append_log(
   }
   dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
 
-  map<string,bufferlist> keys;
   for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
        p != logv.end();
        ++p) {
-    add_log_entry(*p, keys[p->get_key_name()]);
+    add_log_entry(*p);
   }
 
   PGLogEntryHandler handler;
@@ -2982,9 +2962,6 @@ void PG::append_log(
 	trim_rollback_to));
   }
 
-  dout(10) << "append_log  adding " << keys.size() << " keys" << dendl;
-  t.omap_setkeys(coll, pgmeta_oid, keys);
-
   pg_log.trim(&handler, trim_to, info);
 
   dout(10) << __func__ << ": trimming to " << trim_rollback_to
@@ -3050,7 +3027,7 @@ int PG::read_info(
   }
 
   // legacy (ver < 8)
-  hobject_t infos_oid(OSD::make_infos_oid());
+  ghobject_t infos_oid(OSD::make_infos_oid());
   bufferlist::iterator p = bl.begin();
   ::decode(struct_v, p);
   assert(struct_v == 7);
@@ -3062,7 +3039,7 @@ int PG::read_info(
   keys.insert(k);
   keys.insert(bk);
   values.clear();
-  store->omap_get_values(META_COLL, infos_oid, keys, &values);
+  store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
   assert(values.size() == 2);
 
   p = values[k].begin();
@@ -3085,11 +3062,11 @@ void PG::read_state(ObjectStore *store, bufferlist &bl)
   ostringstream oss;
   pg_log.read_log(store,
 		  coll,
-		  info_struct_v < 8 ? META_COLL : coll,
-		  info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid,
+		  info_struct_v < 8 ? coll_t::meta() : coll,
+		  ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
 		  info, oss);
-  if (oss.str().length())
-    osd->clog->error() << oss;
+  if (oss.tellp())
+    osd->clog->error() << oss.rdbuf();
 
   // log any weirdness
   log_weirdness();
@@ -3208,9 +3185,9 @@ void PG::filter_snapc(vector<snapid_t> &snaps)
   }
 }
 
-void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef> >& m)
+void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>& m)
 {
-  for (map<hobject_t, list<OpRequestRef> >::iterator it = m.begin();
+  for (map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator it = m.begin();
        it != m.end();
        ++it)
     requeue_ops(it->second);
@@ -3268,16 +3245,18 @@ bool PG::sched_scrub()
 
   bool time_for_deep = (ceph_clock_now(cct) >
     info.history.last_deep_scrub_stamp + cct->_conf->osd_deep_scrub_interval);
- 
+
   //NODEEP_SCRUB so ignore time initiated deep-scrub
-  if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB))
+  if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+      pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB))
     time_for_deep = false;
 
   if (!scrubber.must_scrub) {
     assert(!scrubber.must_deep_scrub);
 
     //NOSCRUB so skip regular scrubs
-    if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) && !time_for_deep)
+    if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+	 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep)
       return false;
   }
 
@@ -3318,20 +3297,27 @@ bool PG::sched_scrub()
 
 void PG::reg_next_scrub()
 {
+  if (!is_primary())
+    return;
+
+  utime_t reg_stamp;
   if (scrubber.must_scrub ||
       (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
-    scrubber.scrub_reg_stamp = utime_t();
+    reg_stamp = ceph_clock_now(cct);
   } else {
-    scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
+    reg_stamp = info.history.last_scrub_stamp;
   }
-  if (is_primary())
-    osd->reg_last_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
+  // note down the sched_time, so we can locate this scrub, and remove it
+  // later on.
+  scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
+					       reg_stamp,
+					       scrubber.must_scrub);
 }
 
 void PG::unreg_next_scrub()
 {
   if (is_primary())
-    osd->unreg_last_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
+    osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
 }
 
 void PG::sub_op_scrub_map(OpRequestRef op)
@@ -3346,6 +3332,11 @@ void PG::sub_op_scrub_map(OpRequestRef op)
     return;
   }
 
+  if (!scrubber.is_chunky_scrub_active()) {
+    dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
+    return;
+  }
+
   op->mark_started();
 
   dout(10) << " got " << m->from << " scrub map" << dendl;
@@ -3360,7 +3351,7 @@ void PG::sub_op_scrub_map(OpRequestRef op)
   scrubber.waiting_on_whom.erase(m->from);
 
   if (scrubber.waiting_on == 0) {
-    osd->scrub_wq.queue(this);
+    requeue_scrub();
   }
 }
 
@@ -3377,6 +3368,8 @@ void PG::_request_scrub_map(
     spg_t(info.pgid.pgid, replica.shard), version,
     get_osdmap()->get_epoch(),
     start, end, deep, seed);
+  // default priority, we want the rep scrub processed prior to any recovery
+  // or client io messages (we are holding a lock!)
   osd->send_message_osd_cluster(
     replica.osd, repscrubop, get_osdmap()->get_epoch());
 }
@@ -3484,7 +3477,6 @@ void PG::schedule_backfill_full_retry()
 
 void PG::clear_scrub_reserved()
 {
-  osd->scrub_wq.dequeue(this);
   scrubber.reserved_peers.clear();
   scrubber.reserve_failed = false;
 
@@ -3567,7 +3559,7 @@ void PG::_scan_rollback_obs(
 
 void PG::_scan_snaps(ScrubMap &smap) 
 {
-  for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
+  for (map<hobject_t, ScrubMap::object, hobject_t::BitwiseComparator>::iterator i = smap.objects.begin();
        i != smap.objects.end();
        ++i) {
     const hobject_t &hoid = i->first;
@@ -3624,7 +3616,7 @@ void PG::_scan_snaps(ScrubMap &smap)
 			    << "...repaired";
 	}
 	snap_mapper.add_oid(hoid, oi_snaps, &_t);
-	r = osd->store->apply_transaction(t);
+	r = osd->store->apply_transaction(osr.get(), t);
 	if (r != 0) {
 	  derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
 	       << dendl;
@@ -3691,15 +3683,20 @@ void PG::repair_object(
     assert(waiting_for_unreadable_object.empty());
 
     pg_log.missing_add(soid, oi.version, eversion_t());
+
+    pg_log.set_last_requested(0);
+    dout(10) << __func__ << ": primary = " << primary << dendl;
+  }
+
+  if (is_ec_pg() || bad_peer == primary) {
+    // we'd better collect all shard for EC pg, and prepare good peers as the
+    // source of pull in the case of replicated pg.
     missing_loc.add_missing(soid, oi.version, eversion_t());
     list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
     for (i = ok_peers->begin();
-	 i != ok_peers->end();
-	 ++i)
+	i != ok_peers->end();
+	++i)
       missing_loc.add_location(soid, i->second);
-
-    pg_log.set_last_requested(0);
-    dout(10) << __func__ << ": primary = " << primary << dendl;
   }
 }
 
@@ -3710,9 +3707,10 @@ void PG::repair_object(
  * scrubmap of objects that are in the range [msg->start, msg->end).
  */
 void PG::replica_scrub(
-  MOSDRepScrub *msg,
+  OpRequestRef op,
   ThreadPool::TPHandle &handle)
 {
+  MOSDRepScrub *msg = static_cast<MOSDRepScrub *>(op->get_req());
   assert(!scrubber.active_rep_scrub);
   dout(7) << "replica_scrub" << dendl;
 
@@ -3728,20 +3726,25 @@ void PG::replica_scrub(
   assert(msg->chunky);
   if (last_update_applied < msg->scrub_to) {
     dout(10) << "waiting for last_update_applied to catch up" << dendl;
-    scrubber.active_rep_scrub = msg;
+    scrubber.active_rep_scrub = op;
     msg->get();
     return;
   }
 
   if (active_pushes > 0) {
     dout(10) << "waiting for active pushes to finish" << dendl;
-    scrubber.active_rep_scrub = msg;
-    msg->get();
+    scrubber.active_rep_scrub = op;
     return;
   }
 
+  // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
+  hobject_t start = msg->start;
+  hobject_t end = msg->end;
+  start.pool = info.pgid.pool();
+  end.pool = info.pgid.pool();
+
   build_scrub_map_chunk(
-    map, msg->start, msg->end, msg->deep, msg->seed,
+    map, start, end, msg->deep, msg->seed,
     handle);
 
   vector<OSDOp> scrub(1);
@@ -3770,9 +3773,8 @@ void PG::replica_scrub(
  * scrub will be chunky if all OSDs in PG support chunky scrub
  * scrub will fail if OSDs are too old.
  */
-void PG::scrub(ThreadPool::TPHandle &handle)
+void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
 {
-  lock();
   if (g_conf->osd_scrub_sleep > 0 &&
       (scrubber.state == PG::Scrubber::NEW_CHUNK ||
        scrubber.state == PG::Scrubber::INACTIVE)) {
@@ -3784,10 +3786,11 @@ void PG::scrub(ThreadPool::TPHandle &handle)
     lock();
     dout(20) << __func__ << " slept for " << t << dendl;
   }
-  if (deleting) {
-    unlock();
+  if (pg_has_reset_since(queued)) {
     return;
   }
+  assert(scrub_queued);
+  scrub_queued = false;
 
   if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
     dout(10) << "scrub -- not primary or active or not clean" << dendl;
@@ -3795,30 +3798,11 @@ void PG::scrub(ThreadPool::TPHandle &handle)
     state_clear(PG_STATE_REPAIR);
     state_clear(PG_STATE_DEEP_SCRUB);
     publish_stats_to_osd();
-    unlock();
     return;
   }
 
-  // when we're starting a scrub, we need to determine which type of scrub to do
   if (!scrubber.active) {
-    OSDMapRef curmap = osd->get_osdmap();
     assert(backfill_targets.empty());
-    for (unsigned i=0; i<acting.size(); i++) {
-      if (acting[i] == pg_whoami.osd)
-	continue;
-      if (acting[i] == CRUSH_ITEM_NONE)
-	continue;
-      ConnectionRef con = osd->get_con_osd_cluster(acting[i], get_osdmap()->get_epoch());
-      if (!con)
-	continue;
-      if (!con->has_feature(CEPH_FEATURE_CHUNKY_SCRUB)) {
-        dout(20) << "OSD " << acting[i]
-                 << " does not support chunky scrubs, falling back to classic"
-                 << dendl;
-        assert(0 == "Running incompatible OSD");
-        break;
-      }
-    }
 
     scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
 
@@ -3826,8 +3810,6 @@ void PG::scrub(ThreadPool::TPHandle &handle)
   }
 
   chunky_scrub(handle);
-
-  unlock();
 }
 
 /*
@@ -3952,7 +3934,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 	  osd->clog->info(oss);
 	}
 
-	if (peer_features & CEPH_FEATURE_OSD_OBJECT_DIGEST)
+	if (get_min_acting_features() & CEPH_FEATURE_OSD_OBJECT_DIGEST)
 	  scrubber.seed = -1; // better, and enables oi digest checks
 	else
 	  scrubber.seed = 0;  // compat
@@ -3981,7 +3963,6 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 	      start,
 	      cct->_conf->osd_scrub_chunk_min,
 	      cct->_conf->osd_scrub_chunk_max,
-	      0,
 	      &objects,
 	      &candidate_end);
             assert(ret >= 0);
@@ -4000,7 +3981,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
               hobject_t end = objects.back().get_boundary();
               objects.pop_back();
 
-              if (objects.back().get_filestore_key() != end.get_filestore_key()) {
+              if (objects.back().get_hash() != end.get_hash()) {
                 candidate_end = end;
                 boundary_found = true;
               }
@@ -4023,7 +4004,8 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         for (list<pg_log_entry_t>::const_iterator p = pg_log.get_log().log.begin();
              p != pg_log.get_log().log.end();
              ++p) {
-          if (p->soid >= scrubber.start && p->soid < scrubber.end)
+          if (cmp(p->soid, scrubber.start, get_sort_bitwise()) >= 0 &&
+	      cmp(p->soid, scrubber.end, get_sort_bitwise()) < 0)
             scrubber.subset_last_update = p->version;
         }
 
@@ -4121,9 +4103,9 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 	  break;
 	}
 
-	if (scrubber.end < hobject_t::get_max()) {
+	if (cmp(scrubber.end, hobject_t::get_max(), get_sort_bitwise()) < 0) {
           scrubber.state = PG::Scrubber::NEW_CHUNK;
-          osd->scrub_wq.queue(this);
+	  requeue_scrub();
           done = true;
         } else {
           scrubber.state = PG::Scrubber::FINISH;
@@ -4175,7 +4157,7 @@ void PG::scrub_compare_maps()
 
   // construct authoritative scrub map for type specific scrubbing
   ScrubMap authmap(scrubber.primary_scrubmap);
-  map<hobject_t, pair<uint32_t, uint32_t> > missing_digest;
+  map<hobject_t, pair<uint32_t, uint32_t>, hobject_t::BitwiseComparator> missing_digest;
 
   if (acting.size() > 1) {
     dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
@@ -4183,7 +4165,7 @@ void PG::scrub_compare_maps()
     stringstream ss;
 
     // Map from object with errors to good peer
-    map<hobject_t, list<pg_shard_t> > authoritative;
+    map<hobject_t, list<pg_shard_t>, hobject_t::BitwiseComparator> authoritative;
     map<pg_shard_t, ScrubMap *> maps;
 
     dout(2) << __func__ << "   osd." << acting[0] << " has "
@@ -4201,7 +4183,7 @@ void PG::scrub_compare_maps()
     }
 
     // can we relate scrub digests to oi digests?
-    bool okseed = (get_min_peer_features() & CEPH_FEATURE_OSD_OBJECT_DIGEST);
+    bool okseed = (get_min_upacting_features() & CEPH_FEATURE_OSD_OBJECT_DIGEST);
     assert(okseed == (scrubber.seed == 0xffffffff));
 
     get_pgbackend()->be_compare_scrubmaps(
@@ -4222,7 +4204,7 @@ void PG::scrub_compare_maps()
       osd->clog->error(ss);
     }
 
-    for (map<hobject_t, list<pg_shard_t> >::iterator i = authoritative.begin();
+    for (map<hobject_t, list<pg_shard_t>, hobject_t::BitwiseComparator>::iterator i = authoritative.begin();
 	 i != authoritative.end();
 	 ++i) {
       list<pair<ScrubMap::object, pg_shard_t> > good_peers;
@@ -4237,7 +4219,7 @@ void PG::scrub_compare_maps()
 	  good_peers));
     }
 
-    for (map<hobject_t, list<pg_shard_t> >::iterator i = authoritative.begin();
+    for (map<hobject_t, list<pg_shard_t>, hobject_t::BitwiseComparator>::iterator i = authoritative.begin();
 	 i != authoritative.end();
 	 ++i) {
       authmap.objects.erase(i->first);
@@ -4255,8 +4237,9 @@ void PG::scrub_process_inconsistent()
   bool repair = state_test(PG_STATE_REPAIR);
   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
-
-  if (!scrubber.authoritative.empty() || !scrubber.inconsistent.empty()) {
+  
+  // authoriative only store objects which missing or inconsistent.
+  if (!scrubber.authoritative.empty()) {
     stringstream ss;
     ss << info.pgid << " " << mode << " "
        << scrubber.missing.size() << " missing, "
@@ -4265,7 +4248,7 @@ void PG::scrub_process_inconsistent()
     osd->clog->error(ss);
     if (repair) {
       state_clear(PG_STATE_CLEAN);
-      for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> > >::iterator i =
+      for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >, hobject_t::BitwiseComparator>::iterator i =
 	     scrubber.authoritative.begin();
 	   i != scrubber.authoritative.end();
 	   ++i) {
@@ -4597,6 +4580,29 @@ bool PG::may_need_replay(const OSDMapRef osdmap) const
   return crashed;
 }
 
+void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
+{
+  bool changed = false;
+  if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
+      !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
+    dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
+    changed = true;
+  }
+  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
+  assert(pi);
+  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
+    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
+    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
+      dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
+      changed = true;
+    }
+  }
+  if (changed) {
+    info.history.last_epoch_marked_full = osdmap->get_epoch();
+    dirty_info = true;
+  }
+}
+
 bool PG::should_restart_peering(
   int newupprimary,
   int newactingprimary,
@@ -4677,13 +4683,14 @@ void PG::reset_interval_flush()
   dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
   recovery_state.clear_blocked_outgoing();
   
-  if (!osr->flush_commit(
-      new QueuePeeringEvt<IntervalFlush>(
-	this, get_osdmap()->get_epoch(), IntervalFlush()))) {
+  Context *c = new QueuePeeringEvt<IntervalFlush>(
+    this, get_osdmap()->get_epoch(), IntervalFlush());
+  if (!osr->flush_commit(c)) {
     dout(10) << "Beginning to block outgoing recovery messages" << dendl;
     recovery_state.begin_block_outgoing();
   } else {
     dout(10) << "Not blocking outgoing recovery messages" << dendl;
+    delete c;
   }
 }
 
@@ -4743,8 +4750,6 @@ void PG::start_peering_interval(
   else
     set_role(-1);
 
-  reg_next_scrub();
-
   // did acting, up, primary|acker change?
   if (!lastmap) {
     dout(10) << " no lastmap" << dendl;
@@ -4753,6 +4758,7 @@ void PG::start_peering_interval(
     info.history.same_interval_since = osdmap->get_epoch();
   } else {
     std::stringstream debug;
+    assert(info.history.same_interval_since != 0);
     boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
       get_is_recoverable_predicate());
     bool new_interval = pg_interval_t::check_new_interval(
@@ -4789,11 +4795,16 @@ void PG::start_peering_interval(
     info.history.same_primary_since = osdmap->get_epoch();
   }
 
+  on_new_interval();
+
   dout(10) << " up " << oldup << " -> " << up 
 	   << ", acting " << oldacting << " -> " << acting 
 	   << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
 	   << ", up_primary " << old_up_primary << " -> " << new_up_primary
-	   << ", role " << oldrole << " -> " << role << dendl; 
+	   << ", role " << oldrole << " -> " << role
+	   << ", features acting " << acting_features
+	   << " upacting " << upacting_features
+	   << dendl;
 
   // deactivate.
   state_clear(PG_STATE_ACTIVE);
@@ -4805,6 +4816,8 @@ void PG::start_peering_interval(
   peer_missing.clear();
   peer_purged.clear();
   actingbackfill.clear();
+  snap_trim_queued = false;
+  scrub_queued = false;
 
   // reset primary state?
   if (was_old_primary || is_primary()) {
@@ -4870,6 +4883,43 @@ void PG::start_peering_interval(
   }
 }
 
+void PG::on_new_interval()
+{
+  const OSDMapRef osdmap = get_osdmap();
+
+  reg_next_scrub();
+
+  // initialize features
+  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+  for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
+    if (*p == CRUSH_ITEM_NONE)
+      continue;
+    uint64_t f = osdmap->get_xinfo(*p).features;
+    acting_features &= f;
+    upacting_features &= f;
+  }
+  for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
+    if (*p == CRUSH_ITEM_NONE)
+      continue;
+    upacting_features &= osdmap->get_xinfo(*p).features;
+  }
+
+  do_sort_bitwise = get_osdmap()->test_flag(CEPH_OSDMAP_SORTBITWISE);
+  if (do_sort_bitwise) {
+    assert(get_min_upacting_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT);
+    if (g_conf->osd_debug_randomize_hobject_sort_order) {
+      // randomly use a nibblewise sort (when we otherwise might have
+      // done bitwise) based on some *deterministic* function such that
+      // all peers/osds will agree.
+      do_sort_bitwise =
+	(info.history.same_interval_since + info.pgid.ps()) & 1;
+    }
+  }
+
+  _on_new_interval();
+}
+
 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
 {
   assert(!is_primary());
@@ -4974,6 +5024,9 @@ ostream& operator<<(ostream& out, const PG& pg)
   if (pg.scrubber.must_scrub)
     out << " MUST_SCRUB";
 
+  if (!pg.get_sort_bitwise())
+    out << " NIBBLEWISE";
+
   //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
   if (pg.pg_log.get_missing().num_missing()) {
     out << " m=" << pg.pg_log.get_missing().num_missing();
@@ -5121,6 +5174,8 @@ bool PG::can_discard_request(OpRequestRef& op)
     return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
   case MSG_OSD_EC_READ_REPLY:
     return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
+  case MSG_OSD_REP_SCRUB:
+    return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
 
   case MSG_OSD_PG_SCAN:
     return can_discard_scan(op);
@@ -5202,6 +5257,11 @@ bool PG::op_must_wait_for_map(epoch_t cur_epoch, OpRequestRef& op)
     return !have_same_or_newer_map(
       cur_epoch,
       static_cast<MOSDECSubOpReadReply*>(op->get_req())->map_epoch);
+
+  case MSG_OSD_REP_SCRUB:
+    return !have_same_or_newer_map(
+      cur_epoch,
+      static_cast<MOSDRepScrub*>(op->get_req())->map_epoch);
   }
   assert(0);
   return false;
@@ -5280,12 +5340,12 @@ void PG::handle_advance_map(
 	   << dendl;
   update_osdmap_ref(osdmap);
   pool.update(osdmap);
-  if (pool.info.last_change == osdmap_ref->get_epoch())
-    on_pool_change();
   AdvMap evt(
     osdmap, lastmap, newup, up_primary,
     newacting, acting_primary);
   recovery_state.handle_event(evt, rctx);
+  if (pool.info.last_change == osdmap_ref->get_epoch())
+    on_pool_change();
 }
 
 void PG::handle_activate_map(RecoveryCtx *rctx)
@@ -5377,7 +5437,8 @@ boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  pg->proc_replica_info(notify.from, notify.notify.info);
+  pg->proc_replica_info(
+    notify.from, notify.notify.info, notify.notify.epoch_sent);
   pg->update_heartbeat_peers();
   pg->set_last_peering_reset();
   return transit< Primary >();
@@ -5437,6 +5498,7 @@ boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap
 {
   dout(10) << "Started advmap" << dendl;
   PG *pg = context< RecoveryMachine >().pg;
+  pg->check_full_transition(advmap.lastmap, advmap.osdmap);
   if (pg->should_restart_peering(
 	advmap.up_primary,
 	advmap.acting_primary,
@@ -5506,6 +5568,8 @@ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
   // _before_ we are active.
   pg->generate_past_intervals();
 
+  pg->check_full_transition(advmap.lastmap, advmap.osdmap);
+
   if (pg->should_restart_peering(
 	advmap.up_primary,
 	advmap.acting_primary,
@@ -5610,7 +5674,8 @@ boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& no
     dout(10) << *pg << " got dup osd." << notevt.from << " info " << notevt.notify.info
 	     << ", identical to ours" << dendl;
   } else {
-    pg->proc_replica_info(notevt.from, notevt.notify.info);
+    pg->proc_replica_info(
+      notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
   }
   return discard_event();
 }
@@ -5756,14 +5821,12 @@ PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
     ConnectionRef con = pg->osd->get_con_osd_cluster(
       it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
-      if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
-        pg->osd->send_message_osd_cluster(
-          new MBackfillReserve(
-	    MBackfillReserve::REJECT,
-	    spg_t(pg->info.pgid.pgid, it->shard),
-	    pg->get_osdmap()->get_epoch()),
-	  con.get());
-      }
+      pg->osd->send_message_osd_cluster(
+        new MBackfillReserve(
+	  MBackfillReserve::REJECT,
+	  spg_t(pg->info.pgid.pgid, it->shard),
+	  pg->get_osdmap()->get_epoch()),
+	con.get());
     }
   }
 
@@ -5811,17 +5874,13 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
     ConnectionRef con = pg->osd->get_con_osd_cluster(
       backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
-      if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
-        pg->osd->send_message_osd_cluster(
-          new MBackfillReserve(
-	  MBackfillReserve::REQUEST,
-	  spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
-	  pg->get_osdmap()->get_epoch(),
-	  pg->get_backfill_priority()),
-	con.get());
-      } else {
-        post_event(RemoteBackfillReserved());
-      }
+      pg->osd->send_message_osd_cluster(
+        new MBackfillReserve(
+	MBackfillReserve::REQUEST,
+	spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
+	pg->get_osdmap()->get_epoch(),
+	pg->get_backfill_priority()),
+      con.get());
     }
     ++backfill_osd_it;
   } else {
@@ -5855,14 +5914,12 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationReje
     ConnectionRef con = pg->osd->get_con_osd_cluster(
       it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
-      if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
-        pg->osd->send_message_osd_cluster(
-          new MBackfillReserve(
-	  MBackfillReserve::REJECT,
-	  spg_t(pg->info.pgid.pgid, it->shard),
-	  pg->get_osdmap()->get_epoch()),
-	con.get());
-      }
+      pg->osd->send_message_osd_cluster(
+        new MBackfillReserve(
+	MBackfillReserve::REJECT,
+	spg_t(pg->info.pgid.pgid, it->shard),
+	pg->get_osdmap()->get_epoch()),
+      con.get());
     }
   }
 
@@ -6148,16 +6205,12 @@ PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserve
     ConnectionRef con = pg->osd->get_con_osd_cluster(
       remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
-      if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
-	pg->osd->send_message_osd_cluster(
-          new MRecoveryReserve(
-	    MRecoveryReserve::REQUEST,
-	    spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
-	    pg->get_osdmap()->get_epoch()),
-	  con.get());
-      } else {
-	post_event(RemoteRecoveryReserved());
-      }
+      pg->osd->send_message_osd_cluster(
+        new MRecoveryReserve(
+	  MRecoveryReserve::REQUEST,
+	  spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
+	  pg->get_osdmap()->get_epoch()),
+	con.get());
     }
     ++remote_recovery_reservation_it;
   } else {
@@ -6201,14 +6254,12 @@ void PG::RecoveryState::Recovering::release_reservations()
     ConnectionRef con = pg->osd->get_con_osd_cluster(
       i->osd, pg->get_osdmap()->get_epoch());
     if (con) {
-      if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
-	pg->osd->send_message_osd_cluster(
-          new MRecoveryReserve(
-	    MRecoveryReserve::RELEASE,
-	    spg_t(pg->info.pgid.pgid, i->shard),
-	    pg->get_osdmap()->get_epoch()),
-	  con.get());
-      }
+      pg->osd->send_message_osd_cluster(
+        new MRecoveryReserve(
+	  MRecoveryReserve::RELEASE,
+	  spg_t(pg->info.pgid.pgid, i->shard),
+	  pg->get_osdmap()->get_epoch()),
+	con.get());
     }
   }
 }
@@ -6467,7 +6518,8 @@ boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& not
     dout(10) << "Active: got notify from " << notevt.from 
 	     << ", calling proc_replica_info and discover_all_missing"
 	     << dendl;
-    pg->proc_replica_info(notevt.from, notevt.notify.info);
+    pg->proc_replica_info(
+      notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
     if (pg->have_unfound()) {
       pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
     }
@@ -6842,14 +6894,14 @@ PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
 
   PG *pg = context< RecoveryMachine >().pg;
   pg->generate_past_intervals();
-  auto_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+  unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
 
   assert(pg->blocked_by.empty());
 
   if (!prior_set.get())
     pg->build_prior(prior_set);
 
-  pg->reset_peer_features();
+  pg->reset_min_peer_features();
   get_infos();
   if (peer_info_requested.empty() && !prior_set->pg_down) {
     post_event(GotInfo());
@@ -6859,7 +6911,7 @@ PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
 void PG::RecoveryState::GetInfo::get_infos()
 {
   PG *pg = context< RecoveryMachine >().pg;
-  auto_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+  unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
 
   pg->blocked_by.clear();
   for (set<pg_shard_t>::const_iterator it = prior_set->probe.begin();
@@ -6904,9 +6956,10 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
   }
 
   epoch_t old_start = pg->info.history.last_epoch_started;
-  if (pg->proc_replica_info(infoevt.from, infoevt.notify.info)) {
+  if (pg->proc_replica_info(
+	infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
     // we got something new ...
-    auto_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+    unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
     if (old_start < pg->info.history.last_epoch_started) {
       dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
       pg->build_prior(prior_set);
@@ -6925,7 +6978,7 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
       }
       get_infos();
     }
-    dout(20) << "Adding osd: " << infoevt.from.osd << " features: "
+    dout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
       << hex << infoevt.features << dec << dendl;
     pg->apply_peer_features(infoevt.features);
 
@@ -6986,7 +7039,9 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
 	  break;
 	}
       }
-      dout(20) << "Common features: " << hex << pg->get_min_peer_features() << dec << dendl;
+      dout(20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
+      dout(20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
+      dout(20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
       post_event(GotInfo());
     }
   }
@@ -7230,7 +7285,7 @@ PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
   pg->state_clear(PG_STATE_PEERING);
   pg->state_set(PG_STATE_INCOMPLETE);
 
-  auto_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+  unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
   assert(pg->blocked_by.empty());
   pg->blocked_by.insert(prior_set->down.begin(), prior_set->down.end());
   pg->publish_stats_to_osd();
@@ -7259,7 +7314,8 @@ boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec&
 	     << ", identical to ours" << dendl;
     return discard_event();
   } else {
-    pg->proc_replica_info(notevt.from, notevt.notify.info);
+    pg->proc_replica_info(
+      notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
     // try again!
     return transit< GetLog >();
   }
@@ -7729,6 +7785,16 @@ void PG::RecoveryState::end_handle() {
   orig_ctx = NULL;
 }
 
+ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
+{
+  out << "BackfillInfo(" << bi.begin << "-" << bi.end
+      << " " << bi.objects.size() << " objects";
+  if (!bi.objects.empty())
+    out << " " << bi.objects;
+  out << ")";
+  return out;
+}
+
 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
 
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 41de9d6..0ae3879 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -224,11 +224,13 @@ protected:
     return osdmap_ref;
   }
 
+public:
   OSDMapRef get_osdmap() const {
     assert(is_locked());
     assert(osdmap_ref);
     return osdmap_ref;
   }
+protected:
 
   /** locking and reference counting.
    * I destroy myself when the reference count hits zero.
@@ -237,7 +239,7 @@ protected:
    * put() should be called on destruction of some previously copied pointer.
    * put_unlock() when done with the current pointer (_most common_).
    */  
-  Mutex _lock;
+  mutable Mutex _lock;
   atomic_t ref;
 
 #ifdef PG_DEBUG_REFS
@@ -252,8 +254,8 @@ public:
 
 
   void lock_suspend_timeout(ThreadPool::TPHandle &handle);
-  void lock(bool no_lockdep = false);
-  void unlock() {
+  void lock(bool no_lockdep = false) const;
+  void unlock() const {
     //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
     assert(!dirty_info);
     assert(!dirty_big_info);
@@ -294,10 +296,7 @@ public:
   bool can_upgrade() {
     return info_struct_v >= compat_struct_v;
   }
-  void upgrade(
-    ObjectStore *store,
-    const interval_set<snapid_t> &snapcolls);
-  void _upgrade_v7(ObjectStore *store, const interval_set<snapid_t> &snapcolls);
+  void upgrade(ObjectStore *store);
 
   const coll_t coll;
   PGLog  pg_log;
@@ -313,8 +312,8 @@ public:
   ghobject_t    pgmeta_oid;
 
   class MissingLoc {
-    map<hobject_t, pg_missing_t::item> needs_recovery_map;
-    map<hobject_t, set<pg_shard_t> > missing_loc;
+    map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator> needs_recovery_map;
+    map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator > missing_loc;
     set<pg_shard_t> missing_loc_sources;
     PG *pg;
     set<pg_shard_t> empty_set;
@@ -333,7 +332,7 @@ public:
     bool needs_recovery(
       const hobject_t &hoid,
       eversion_t *v = 0) const {
-      map<hobject_t, pg_missing_t::item>::const_iterator i =
+      map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::const_iterator i =
 	needs_recovery_map.find(hoid);
       if (i == needs_recovery_map.end())
 	return false;
@@ -351,7 +350,7 @@ public:
       const set<pg_shard_t> &acting) const;
     uint64_t num_unfound() const {
       uint64_t ret = 0;
-      for (map<hobject_t, pg_missing_t::item>::const_iterator i =
+      for (map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::const_iterator i =
 	     needs_recovery_map.begin();
 	   i != needs_recovery_map.end();
 	   ++i) {
@@ -361,10 +360,6 @@ public:
       return ret;
     }
 
-    const map<hobject_t, pg_missing_t::item> &get_all_missing() const {
-      return needs_recovery_map;
-    }
-
     void clear() {
       needs_recovery_map.clear();
       missing_loc.clear();
@@ -378,11 +373,11 @@ public:
       missing_loc[hoid].erase(location);
     }
     void add_active_missing(const pg_missing_t &missing) {
-      for (map<hobject_t, pg_missing_t::item>::const_iterator i =
+      for (map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::const_iterator i =
 	     missing.missing.begin();
 	   i != missing.missing.end();
 	   ++i) {
-	map<hobject_t, pg_missing_t::item>::const_iterator j =
+	map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::const_iterator j =
 	  needs_recovery_map.find(i->first);
 	if (j == needs_recovery_map.end()) {
 	  needs_recovery_map.insert(*i);
@@ -405,9 +400,15 @@ public:
       pg_shard_t source,           ///< [in] source
       const pg_info_t &oinfo,      ///< [in] info
       const pg_missing_t &omissing, ///< [in] (optional) missing
+      bool sort_bitwise,            ///< [in] local sort bitwise (vs nibblewise)
       ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
       ); ///< @return whether a new object location was discovered
 
+    /// Adds recovery sources in batch
+    void add_batch_sources_info(
+      const set<pg_shard_t> &sources  ///< [in] a set of resources which can be used for all objects
+      );
+
     /// Uses osdmap to update structures for now down sources
     void check_recovery_sources(const OSDMapRef osdmap);
 
@@ -421,10 +422,10 @@ public:
       return missing_loc.count(hoid) ?
 	missing_loc.find(hoid)->second : empty_set;
     }
-    const map<hobject_t, set<pg_shard_t> > &get_missing_locs() const {
+    const map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &get_missing_locs() const {
       return missing_loc;
     }
-    const map<hobject_t, pg_missing_t::item> &get_needs_recovery() const {
+    const map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator> &get_needs_recovery() const {
       return needs_recovery_map;
     }
   } missing_loc;
@@ -435,11 +436,14 @@ public:
 
   /* You should not use these items without taking their respective queue locks
    * (if they have one) */
-  xlist<PG*>::item recovery_item, scrub_item, snap_trim_item, stat_queue_item;
+  xlist<PG*>::item recovery_item, stat_queue_item;
+  bool snap_trim_queued;
+  bool scrub_queued;
+
   int recovery_ops_active;
   set<pg_shard_t> waiting_on_backfill;
 #ifdef DEBUG_RECOVERY_OIDS
-  set<hobject_t> recovering_oids;
+  set<hobject_t, hobject_t::BitwiseComparator> recovering_oids;
 #endif
 
   utime_t replay_until;
@@ -636,7 +640,6 @@ public:
   set<int> heartbeat_peers;
   set<int> probe_targets;
 
-protected:
   /**
    * BackfillInterval
    *
@@ -649,17 +652,31 @@ protected:
   struct BackfillInterval {
     // info about a backfill interval on a peer
     eversion_t version; /// version at which the scan occurred
-    map<hobject_t,eversion_t> objects;
+    map<hobject_t,eversion_t,hobject_t::Comparator> objects;
+    bool sort_bitwise;
     hobject_t begin;
     hobject_t end;
+
+    BackfillInterval(bool bitwise=true)
+      : objects(hobject_t::Comparator(bitwise)),
+	sort_bitwise(bitwise)
+    {}
     
     /// clear content
-    void clear() {
-      *this = BackfillInterval();
+    void clear(bool bitwise=true) {
+      *this = BackfillInterval(bitwise);
     }
 
-    void reset(hobject_t start) {
-      clear();
+    /// clear objects list only
+    void clear_objects() {
+      // make sure we preserve the allocator and ordering!
+      objects = map<hobject_t,eversion_t,hobject_t::Comparator>(
+        hobject_t::Comparator(sort_bitwise));
+    }
+
+    /// reinstantiate with a new start+end position and sort order
+    void reset(hobject_t start, bool bitwise) {
+      clear(bitwise);
       begin = end = start;
     }
 
@@ -676,7 +693,8 @@ protected:
     /// removes items <= soid and adjusts begin to the first object
     void trim_to(const hobject_t &soid) {
       trim();
-      while (!objects.empty() && objects.begin()->first <= soid) {
+      while (!objects.empty() &&
+	     cmp(objects.begin()->first, soid, sort_bitwise) <= 0) {
 	pop_front();
       }
     }
@@ -701,7 +719,8 @@ protected:
       f->dump_stream("begin") << begin;
       f->dump_stream("end") << end;
       f->open_array_section("objects");
-      for (map<hobject_t, eversion_t>::const_iterator i = objects.begin();
+      for (map<hobject_t, eversion_t, hobject_t::Comparator>::const_iterator i =
+	     objects.begin();
 	   i != objects.end();
 	   ++i) {
 	f->open_object_section("object");
@@ -712,7 +731,8 @@ protected:
       f->close_section();
     }
   };
-  
+
+protected:
   BackfillInterval backfill_info;
   map<pg_shard_t, BackfillInterval> peer_backfill_info;
   bool backfill_reserved;
@@ -741,11 +761,24 @@ protected:
 
   list<OpRequestRef>            waiting_for_cache_not_full;
   list<OpRequestRef>            waiting_for_all_missing;
-  map<hobject_t, list<OpRequestRef> > waiting_for_unreadable_object,
+  map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator> waiting_for_unreadable_object,
 			     waiting_for_degraded_object,
 			     waiting_for_blocked_object;
+
+  set<
+    hobject_t,
+    hobject_t::BitwiseComparator> objects_blocked_on_cache_full;
+  map<
+    hobject_t,
+    snapid_t,
+    hobject_t::BitwiseComparator> objects_blocked_on_degraded_snap;
+  map<
+    hobject_t,
+    ObjectContextRef,
+    hobject_t::BitwiseComparator> objects_blocked_on_snap_promotion;
+
   // Callbacks should assume pg (and nothing else) is locked
-  map<hobject_t, list<Context*> > callbacks_for_degraded_object;
+  map<hobject_t, list<Context*>, hobject_t::BitwiseComparator> callbacks_for_degraded_object;
 
   map<eversion_t,
       list<pair<OpRequestRef, version_t> > > waiting_for_ack, waiting_for_ondisk;
@@ -753,7 +786,7 @@ protected:
   map<eversion_t,OpRequestRef>   replay_queue;
   void split_ops(PG *child, unsigned split_bits);
 
-  void requeue_object_waiters(map<hobject_t, list<OpRequestRef> >& m);
+  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>& m);
   void requeue_op(OpRequestRef op);
   void requeue_ops(list<OpRequestRef> &l);
 
@@ -782,14 +815,14 @@ public:
   }
   bool is_acting(pg_shard_t osd) const {
     if (pool.info.ec_pool()) {
-      return acting.size() > osd.shard && acting[osd.shard] == osd.osd;
+      return acting.size() > (unsigned)osd.shard && acting[osd.shard] == osd.osd;
     } else {
       return std::find(acting.begin(), acting.end(), osd.osd) != acting.end();
     }
   }
   bool is_up(pg_shard_t osd) const {
     if (pool.info.ec_pool()) {
-      return up.size() > osd.shard && up[osd.shard] == osd.osd;
+      return up.size() > (unsigned)osd.shard && up[osd.shard] == osd.osd;
     } else {
       return std::find(up.begin(), up.end(), osd.osd) != up.end();
     }
@@ -808,7 +841,7 @@ public:
   bool _calc_past_interval_range(epoch_t *start, epoch_t *end, epoch_t oldest_map);
   void generate_past_intervals();
   void trim_past_intervals();
-  void build_prior(std::auto_ptr<PriorSet> &prior_set);
+  void build_prior(std::unique_ptr<PriorSet> &prior_set);
 
   void remove_down_peer_info(const OSDMapRef osdmap);
 
@@ -843,7 +876,8 @@ public:
 			pg_missing_t& omissing, pg_shard_t from);
   void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
 		       pg_missing_t& omissing, pg_shard_t from);
-  bool proc_replica_info(pg_shard_t from, const pg_info_t &info);
+  bool proc_replica_info(
+    pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
 
 
   struct LogEntryTrimmer : public ObjectModDesc::Visitor {
@@ -878,7 +912,7 @@ public:
 
   struct PGLogEntryHandler : public PGLog::LogEntryHandler {
     list<pg_log_entry_t> to_rollback;
-    set<hobject_t> to_remove;
+    set<hobject_t, hobject_t::BitwiseComparator> to_remove;
     list<pg_log_entry_t> to_trim;
     
     // LogEntryHandler
@@ -901,7 +935,7 @@ public:
 	SnapRollBacker rollbacker(j->soid, pg, t);
 	j->mod_desc.visit(&rollbacker);
       }
-      for (set<hobject_t>::iterator i = to_remove.begin();
+      for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = to_remove.begin();
 	   i != to_remove.end();
 	   ++i) {
 	pg->get_pgbackend()->rollback_create(*i, t);
@@ -1002,8 +1036,7 @@ public:
    * @returns true if any useful work was accomplished; false otherwise
    */
   virtual bool start_recovery_ops(
-    int max, RecoveryCtx *prctx,
-    ThreadPool::TPHandle &handle,
+    int max, ThreadPool::TPHandle &handle,
     int *ops_begun) = 0;
 
   void purge_strays();
@@ -1038,7 +1071,6 @@ public:
       epoch_start(0),
       active(false), queue_snap_trim(false),
       waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
-      active_rep_scrub(0),
       must_scrub(false), must_deep_scrub(false), must_repair(false),
       num_digest_updates_pending(0),
       state(INACTIVE),
@@ -1062,18 +1094,18 @@ public:
     int fixed;
     ScrubMap primary_scrubmap;
     map<pg_shard_t, ScrubMap> received_maps;
-    MOSDRepScrub *active_rep_scrub;
+    OpRequestRef active_rep_scrub;
     utime_t scrub_reg_stamp;  // stamp we registered for
 
     // flags to indicate explicitly requested scrubs (by admin)
     bool must_scrub, must_deep_scrub, must_repair;
 
     // Maps from objects with errors to missing/inconsistent peers
-    map<hobject_t, set<pg_shard_t> > missing;
-    map<hobject_t, set<pg_shard_t> > inconsistent;
+    map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> missing;
+    map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> inconsistent;
 
     // Map from object with errors to good peers
-    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> > > authoritative;
+    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >, hobject_t::BitwiseComparator> authoritative;
 
     // digest updates which we are waiting on
     int num_digest_updates_pending;
@@ -1134,8 +1166,9 @@ public:
 
     // classic (non chunk) scrubs block all writes
     // chunky scrubs only block writes to a range
-    bool write_blocked_by_scrub(const hobject_t &soid) {
-      if (soid >= start && soid < end)
+    bool write_blocked_by_scrub(const hobject_t &soid, bool sort_bitwise) {
+      if (cmp(soid, start, sort_bitwise) >= 0 &&
+	  cmp(soid, end, sort_bitwise) < 0)
 	return true;
 
       return false;
@@ -1148,8 +1181,7 @@ public:
       waiting_on = 0;
       waiting_on_whom.clear();
       if (active_rep_scrub) {
-        active_rep_scrub->put();
-        active_rep_scrub = NULL;
+        active_rep_scrub = OpRequestRef();
       }
       received_maps.clear();
 
@@ -1183,7 +1215,7 @@ public:
     const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
     pg_shard_t bad_peer);
 
-  void scrub(ThreadPool::TPHandle &handle);
+  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
   void chunky_scrub(ThreadPool::TPHandle &handle);
   void scrub_compare_maps();
   void scrub_process_inconsistent();
@@ -1209,10 +1241,9 @@ public:
     const hobject_t &begin, const hobject_t &end) = 0;
   virtual void _scrub(
     ScrubMap &map,
-    const std::map<hobject_t, pair<uint32_t, uint32_t> > &missing_digest) { }
+    const std::map<hobject_t, pair<uint32_t, uint32_t>, hobject_t::BitwiseComparator> &missing_digest) { }
   virtual void _scrub_clear_state() { }
   virtual void _scrub_finish() { }
-  virtual void get_colls(list<coll_t> *out) = 0;
   virtual void split_colls(
     spg_t child,
     int split_bits,
@@ -1233,7 +1264,7 @@ public:
   void unreg_next_scrub();
 
   void replica_scrub(
-    struct MOSDRepScrub *op,
+    OpRequestRef op,
     ThreadPool::TPHandle &handle);
   void sub_op_scrub_map(OpRequestRef op);
   void sub_op_scrub_reserve(OpRequestRef op);
@@ -1640,7 +1671,7 @@ public:
     struct Active;
 
     struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
-      std::auto_ptr< PriorSet > prior_set;
+      std::unique_ptr< PriorSet > prior_set;
 
       Peering(my_context ctx);
       void exit();
@@ -2008,19 +2039,33 @@ public:
 
  private:
   // Prevent copying
-  PG(const PG& rhs);
+  explicit PG(const PG& rhs);
   PG& operator=(const PG& rhs);
   const spg_t pg_id;
   uint64_t peer_features;
+  uint64_t acting_features;
+  uint64_t upacting_features;
+
+  bool do_sort_bitwise;
 
  public:
   const spg_t&      get_pgid() const { return pg_id; }
   int        get_nrep() const { return acting.size(); }
 
-  void reset_peer_features() { peer_features = (uint64_t)-1; }
+  void reset_min_peer_features() {
+    peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+  }
   uint64_t get_min_peer_features() const { return peer_features; }
   void apply_peer_features(uint64_t f) { peer_features &= f; }
 
+  uint64_t get_min_acting_features() const { return acting_features; }
+  uint64_t get_min_upacting_features() const { return upacting_features; }
+
+  /// true if we will sort hobjects bitwise for this pg interval
+  bool get_sort_bitwise() const {
+    return do_sort_bitwise;
+  }
+
   void init_primary_up_acting(
     const vector<int> &newup,
     const vector<int> &newacting,
@@ -2078,6 +2123,7 @@ public:
 
   int get_state() const { return state; }
   bool       is_active() const { return state_test(PG_STATE_ACTIVE); }
+  bool       is_activating() const { return state_test(PG_STATE_ACTIVATING); }
   bool       is_peering() const { return state_test(PG_STATE_PEERING); }
   bool       is_down() const { return state_test(PG_STATE_DOWN); }
   bool       is_replay() const { return state_test(PG_STATE_REPLAY); }
@@ -2106,15 +2152,16 @@ public:
   // pg on-disk state
   void do_pending_flush();
 
-  static void _create(ObjectStore::Transaction& t, spg_t pgid);
+  static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
   static void _init(ObjectStore::Transaction& t,
 		    spg_t pgid, const pg_pool_t *pool);
 
 private:
-  void write_info(ObjectStore::Transaction& t);
+  void prepare_write_info(map<string,bufferlist> *km);
 
 public:
-  static int _write_info(ObjectStore::Transaction& t, epoch_t epoch,
+  static int _prepare_write_info(map<string,bufferlist> *km,
+    epoch_t epoch,
     pg_info_t &info, coll_t coll,
     map<epoch_t,pg_interval_t> &past_intervals,
     ghobject_t &pgmeta_oid,
@@ -2129,7 +2176,7 @@ public:
     return at_version;
   }
 
-  void add_log_entry(const pg_log_entry_t& e, bufferlist& log_bl);
+  void add_log_entry(const pg_log_entry_t& e);
   void append_log(
     const vector<pg_log_entry_t>& logv,
     eversion_t trim_to,
@@ -2157,6 +2204,7 @@ public:
   void log_weirdness();
 
   void queue_snap_trim();
+  bool requeue_scrub();
   bool queue_scrub();
 
   /// share pg info after a pg is active
@@ -2170,6 +2218,8 @@ public:
     const vector<int>& newup, int up_primary,
     const vector<int>& newacting, int acting_primary,
     ObjectStore::Transaction *t);
+  void on_new_interval();
+  virtual void _on_new_interval() = 0;
   void start_flush(ObjectStore::Transaction *t,
 		   list<Context *> *on_applied,
 		   list<Context *> *on_safe);
@@ -2184,6 +2234,8 @@ public:
 		    pair<pg_shard_t, pg_info_t> &notify_info);
   void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
 
+  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
+
   bool should_restart_peering(
     int newupprimary,
     int newactingprimary,
@@ -2252,7 +2304,7 @@ public:
     ThreadPool::TPHandle &handle
   ) = 0;
   virtual void do_backfill(OpRequestRef op) = 0;
-  virtual void snap_trimmer() = 0;
+  virtual void snap_trimmer(epoch_t epoch_queued) = 0;
 
   virtual int do_command(cmdmap_t cmdmap, ostream& ss,
 			 bufferlist& idata, bufferlist& odata) = 0;
@@ -2267,6 +2319,7 @@ public:
   virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
 
   virtual bool agent_work(int max) = 0;
+  virtual bool agent_work(int max, int agent_flush_quota) = 0;
   virtual void agent_stop() = 0;
   virtual void agent_delay() = 0;
   virtual void agent_clear() = 0;
@@ -2275,4 +2328,6 @@ public:
 
 ostream& operator<<(ostream& out, const PG& pg);
 
+ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
+
 #endif
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index 7fc56d1..f42e6be 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -84,52 +84,42 @@ void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
 {
   dout(10) << __func__ << dendl;
   // clear temp
-  for (set<hobject_t>::iterator i = temp_contents.begin();
+  for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = temp_contents.begin();
        i != temp_contents.end();
        ++i) {
     dout(10) << __func__ << ": Removing oid "
 	     << *i << " from the temp collection" << dendl;
     t->remove(
-      get_temp_coll(t),
+      coll,
       ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
   }
   temp_contents.clear();
 }
 
-coll_t PGBackend::get_temp_coll(ObjectStore::Transaction *t)
-{
-  if (temp_created)
-    return temp_coll;
-  if (!store->collection_exists(temp_coll))
-      t->create_collection(temp_coll);
-  temp_created = true;
-  return temp_coll;
-}
-
 int PGBackend::objects_list_partial(
   const hobject_t &begin,
   int min,
   int max,
-  snapid_t seq,
   vector<hobject_t> *ls,
   hobject_t *next)
 {
   assert(ls);
-  // Starts with the smallest shard id and generation to
-  // make sure the result list has the marker object (
-  // it might have multiple generations though, which would
-  // be filtered).
-  ghobject_t _next(begin, 0, shard_id_t(0));
+  // Starts with the smallest generation to make sure the result list
+  // has the marker object (it might have multiple generations
+  // though, which would be filtered).
+  ghobject_t _next;
+  if (!begin.is_min())
+    _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
   ls->reserve(max);
   int r = 0;
   while (!_next.is_max() && ls->size() < (unsigned)min) {
     vector<ghobject_t> objects;
-    int r = store->collection_list_partial(
+    int r = store->collection_list(
       coll,
       _next,
-      min - ls->size(),
+      ghobject_t::get_max(),
+      parent->sort_bitwise(),
       max - ls->size(),
-      seq,
       &objects,
       &_next);
     if (r != 0)
@@ -137,7 +127,7 @@ int PGBackend::objects_list_partial(
     for (vector<ghobject_t>::iterator i = objects.begin();
 	 i != objects.end();
 	 ++i) {
-      if (i->is_pgmeta()) {
+      if (i->is_pgmeta() || i->hobj.is_temp()) {
 	continue;
       }
       if (i->is_no_gen()) {
@@ -159,17 +149,19 @@ int PGBackend::objects_list_range(
 {
   assert(ls);
   vector<ghobject_t> objects;
-  int r = store->collection_list_range(
+  int r = store->collection_list(
     coll,
-    start,
-    end,
-    seq,
-    &objects);
+    ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+    ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+    parent->sort_bitwise(),
+    INT_MAX,
+    &objects,
+    NULL);
   ls->reserve(objects.size());
   for (vector<ghobject_t>::iterator i = objects.begin();
        i != objects.end();
        ++i) {
-    if (i->is_pgmeta()) {
+    if (i->is_pgmeta() || i->hobj.is_temp()) {
       continue;
     }
     if (i->is_no_gen()) {
@@ -188,7 +180,7 @@ int PGBackend::objects_get_attr(
 {
   bufferptr bp;
   int r = store->getattr(
-    hoid.is_temp() ? temp_coll : coll,
+    coll,
     ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
     attr.c_str(),
     bp);
@@ -204,7 +196,7 @@ int PGBackend::objects_get_attrs(
   map<string, bufferlist> *out)
 {
   return store->getattrs(
-    hoid.is_temp() ? temp_coll : coll,
+    coll,
     ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
     *out);
 }
@@ -282,29 +274,28 @@ PGBackend *PGBackend::build_pg_backend(
   const OSDMapRef curmap,
   Listener *l,
   coll_t coll,
-  coll_t temp_coll,
   ObjectStore *store,
   CephContext *cct)
 {
   switch (pool.type) {
   case pg_pool_t::TYPE_REPLICATED: {
-    return new ReplicatedBackend(l, coll, temp_coll, store, cct);
+    return new ReplicatedBackend(l, coll, store, cct);
   }
   case pg_pool_t::TYPE_ERASURE: {
     ErasureCodeInterfaceRef ec_impl;
-    const map<string,string> &profile = curmap->get_erasure_code_profile(pool.erasure_code_profile);
+    ErasureCodeProfile profile = curmap->get_erasure_code_profile(pool.erasure_code_profile);
     assert(profile.count("plugin"));
     stringstream ss;
     ceph::ErasureCodePluginRegistry::instance().factory(
       profile.find("plugin")->second,
+      g_conf->erasure_code_dir,
       profile,
       &ec_impl,
-      ss);
+      &ss);
     assert(ec_impl);
     return new ECBackend(
       l,
       coll,
-      temp_coll,
       store,
       cct,
       ec_impl,
@@ -461,7 +452,7 @@ map<pg_shard_t, ScrubMap *>::const_iterator
   for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
        j != maps.end();
        ++j) {
-    map<hobject_t, ScrubMap::object>::iterator i =
+    map<hobject_t, ScrubMap::object, hobject_t::BitwiseComparator>::iterator i =
       j->second->objects.find(obj);
     if (i == j->second->objects.end()) {
       continue;
@@ -547,18 +538,18 @@ void PGBackend::be_compare_scrubmaps(
   const map<pg_shard_t,ScrubMap*> &maps,
   bool okseed,
   bool repair,
-  map<hobject_t, set<pg_shard_t> > &missing,
-  map<hobject_t, set<pg_shard_t> > &inconsistent,
-  map<hobject_t, list<pg_shard_t> > &authoritative,
-  map<hobject_t, pair<uint32_t,uint32_t> > &missing_digest,
+  map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &missing,
+  map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &inconsistent,
+  map<hobject_t, list<pg_shard_t>, hobject_t::BitwiseComparator> &authoritative,
+  map<hobject_t, pair<uint32_t,uint32_t>, hobject_t::BitwiseComparator> &missing_digest,
   int &shallow_errors, int &deep_errors,
   const spg_t& pgid,
   const vector<int> &acting,
   ostream &errorstream)
 {
-  map<hobject_t,ScrubMap::object>::const_iterator i;
-  map<pg_shard_t, ScrubMap *>::const_iterator j;
-  set<hobject_t> master_set;
+  map<hobject_t,ScrubMap::object, hobject_t::BitwiseComparator>::const_iterator i;
+  map<pg_shard_t, ScrubMap *, hobject_t::BitwiseComparator>::const_iterator j;
+  set<hobject_t, hobject_t::BitwiseComparator> master_set;
   utime_t now = ceph_clock_now(NULL);
 
   // Construct master set
@@ -569,7 +560,7 @@ void PGBackend::be_compare_scrubmaps(
   }
 
   // Check maps against master set and each other
-  for (set<hobject_t>::const_iterator k = master_set.begin();
+  for (set<hobject_t, hobject_t::BitwiseComparator>::const_iterator k = master_set.begin();
        k != master_set.end();
        ++k) {
     object_info_t auth_oi;
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 1e93641..5259994 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -43,7 +43,6 @@
  protected:
    ObjectStore *store;
    const coll_t coll;
-   const coll_t temp_coll;
  public:	
    /**
     * Provides interfaces for PGBackend callbacks
@@ -56,9 +55,6 @@
    public:
      /// Recovery
 
-     virtual void on_local_recover_start(
-       const hobject_t &oid,
-       ObjectStore::Transaction *t) = 0;
      /**
       * Called with the transaction recovering oid
       */
@@ -109,6 +105,10 @@
        ObjectStore::Transaction *t,
        OpRequestRef op = OpRequestRef()
        ) = 0;
+     virtual void queue_transactions(
+       list<ObjectStore::Transaction*>& tls,
+       OpRequestRef op = OpRequestRef()
+       ) = 0;
      virtual epoch_t get_epoch() const = 0;
 
      virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
@@ -117,7 +117,7 @@
 
      virtual std::string gen_dbg_prefix() const = 0;
 
-     virtual const map<hobject_t, set<pg_shard_t> > &get_missing_loc_shards()
+     virtual const map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &get_missing_loc_shards()
        const = 0;
 
      virtual const pg_missing_t &get_local_missing() const = 0;
@@ -205,8 +205,11 @@
      virtual pg_shard_t primary_shard() const = 0;
 
      virtual uint64_t min_peer_features() const = 0;
+     virtual bool sort_bitwise() const = 0;
 
      virtual bool transaction_use_tbl() = 0;
+     virtual hobject_t get_temp_recovery_object(eversion_t version,
+						snapid_t snap) = 0;
 
      virtual void send_message_osd_cluster(
        int peer, Message *m, epoch_t from_epoch) = 0;
@@ -227,11 +230,10 @@
    };
    Listener *parent;
    Listener *get_parent() const { return parent; }
-   PGBackend(Listener *l, ObjectStore *store, coll_t coll, coll_t temp_coll) :
+   PGBackend(Listener *l, ObjectStore *store, coll_t coll) :
      store(store),
      coll(coll),
-     temp_coll(temp_coll),
-     parent(l), temp_created(false) {}
+     parent(l) {}
    bool is_primary() const { return get_parent()->pgb_is_primary(); }
    OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
    const pg_info_t &get_info() { return get_parent()->get_info(); }
@@ -249,6 +251,9 @@
     * the pending recovery operations.
     */
    struct RecoveryHandle {
+     bool cache_dont_need;
+
+     RecoveryHandle(): cache_dont_need(false) {}
      virtual ~RecoveryHandle() {}
    };
 
@@ -321,50 +326,23 @@
    virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0;
    virtual IsPGReadablePredicate *get_is_readable_predicate() = 0;
 
-   void temp_colls(list<coll_t> *out) {
-     if (temp_created)
-       out->push_back(temp_coll);
-   }
-   void split_colls(
-     spg_t child,
-     int split_bits,
-     int seed,
-     ObjectStore::Transaction *t) {
-     coll_t target = coll_t::make_temp_coll(child);
-     if (!temp_created)
-       return;
-     t->create_collection(target);
-     t->split_collection(
-       temp_coll,
-       split_bits,
-       seed,
-       target);
-   }
-
    virtual void dump_recovery_info(Formatter *f) const = 0;
 
  private:
-   bool temp_created;
-   set<hobject_t> temp_contents;
+   set<hobject_t, hobject_t::BitwiseComparator> temp_contents;
  public:
-   coll_t get_temp_coll(ObjectStore::Transaction *t);
-   coll_t get_temp_coll() const {
-    return temp_coll;
-   }
-   bool have_temp_coll() const { return temp_created; }
-
    // Track contents of temp collection, clear on reset
    void add_temp_obj(const hobject_t &oid) {
      temp_contents.insert(oid);
    }
-   void add_temp_objs(const set<hobject_t> &oids) {
+   void add_temp_objs(const set<hobject_t, hobject_t::BitwiseComparator> &oids) {
      temp_contents.insert(oids.begin(), oids.end());
    }
    void clear_temp_obj(const hobject_t &oid) {
      temp_contents.erase(oid);
    }
-   void clear_temp_objs(const set<hobject_t> &oids) {
-     for (set<hobject_t>::const_iterator i = oids.begin();
+   void clear_temp_objs(const set<hobject_t, hobject_t::BitwiseComparator> &oids) {
+     for (set<hobject_t, hobject_t::BitwiseComparator>::const_iterator i = oids.begin();
 	  i != oids.end();
 	  ++i) {
        temp_contents.erase(*i);
@@ -537,7 +515,6 @@
      const hobject_t &begin,
      int min,
      int max,
-     snapid_t seq,
      vector<hobject_t> *ls,
      hobject_t *next);
 
@@ -568,7 +545,7 @@
      const hobject_t &hoid,
      const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
 		pair<bufferlist*, Context*> > > &to_read,
-     Context *on_complete) = 0;
+     Context *on_complete, bool fast_read = false) = 0;
 
    virtual bool scrub_supported() { return false; }
    void be_scan_list(
@@ -590,10 +567,10 @@
      const map<pg_shard_t,ScrubMap*> &maps,
      bool okseed,   ///< true if scrub digests have same seed our oi digests
      bool repair,
-     map<hobject_t, set<pg_shard_t> > &missing,
-     map<hobject_t, set<pg_shard_t> > &inconsistent,
-     map<hobject_t, list<pg_shard_t> > &authoritative,
-     map<hobject_t, pair<uint32_t,uint32_t> > &missing_digest,
+     map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &missing,
+     map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &inconsistent,
+     map<hobject_t, list<pg_shard_t>, hobject_t::BitwiseComparator> &authoritative,
+     map<hobject_t, pair<uint32_t,uint32_t>, hobject_t::BitwiseComparator> &missing_digest,
      int &shallow_errors, int &deep_errors,
      const spg_t& pgid,
      const vector<int> &acting,
@@ -611,7 +588,6 @@
      const OSDMapRef curmap,
      Listener *l,
      coll_t coll,
-     coll_t temp_coll,
      ObjectStore *store,
      CephContext *cct);
  };
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index b619bcd..86dbd3a 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -22,8 +22,6 @@
 
 #define dout_subsys ceph_subsys_osd
 
-static coll_t META_COLL("meta");
-
 //////////////////// PGLog::IndexedLog ////////////////////
 
 void PGLog::IndexedLog::advance_rollback_info_trimmed_to(
@@ -212,7 +210,7 @@ void PGLog::proc_replica_log(
     we will send the peer enough log to arrive at the same state.
   */
 
-  for (map<hobject_t, pg_missing_t::item>::iterator i = omissing.missing.begin();
+  for (map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::iterator i = omissing.missing.begin();
        i != omissing.missing.end();
        ++i) {
     dout(20) << " before missing " << i->first << " need " << i->second.need
@@ -338,7 +336,7 @@ void PGLog::_merge_object_divergent_entries(
   dout(10) << __func__ << ": merging hoid " << hoid
 	   << " entries: " << entries << dendl;
 
-  if (hoid > info.last_backfill) {
+  if (cmp(hoid, info.last_backfill, info.last_backfill_bitwise) > 0) {
     dout(10) << __func__ << ": hoid " << hoid << " after last_backfill"
 	     << dendl;
     return;
@@ -567,7 +565,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
   // The logs must overlap.
   assert(log.head >= olog.tail && olog.head >= log.tail);
 
-  for (map<hobject_t, pg_missing_t::item>::iterator i = missing.missing.begin();
+  for (map<hobject_t, pg_missing_t::item, hobject_t::BitwiseComparator>::iterator i = missing.missing.begin();
        i != missing.missing.end();
        ++i) {
     dout(20) << "pg_missing_t sobject: " << i->first << dendl;
@@ -645,7 +643,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
       pg_log_entry_t &ne = *p;
       dout(20) << "merge_log " << ne << dendl;
       log.index(ne);
-      if (ne.soid <= info.last_backfill) {
+      if (cmp(ne.soid, info.last_backfill, info.last_backfill_bitwise) <= 0) {
 	missing.add_next_event(ne);
 	if (ne.is_delete())
 	  rollbacker->remove(ne.soid);
@@ -739,18 +737,22 @@ void PGLog::check() {
 }
 
 void PGLog::write_log(
-  ObjectStore::Transaction& t, const coll_t& coll, const ghobject_t &log_oid)
+  ObjectStore::Transaction& t,
+  map<string,bufferlist> *km,
+  const coll_t& coll, const ghobject_t &log_oid)
 {
   if (is_dirty()) {
-    dout(10) << "write_log with: "
+    dout(5) << "write_log with: "
 	     << "dirty_to: " << dirty_to
 	     << ", dirty_from: " << dirty_from
-	     << ", dirty_divergent_priors: " << dirty_divergent_priors
+	     << ", dirty_divergent_priors: "
+	     << (dirty_divergent_priors ? "true" : "false")
+	     << ", divergent_priors: " << divergent_priors.size()
 	     << ", writeout_from: " << writeout_from
 	     << ", trimmed: " << trimmed
 	     << dendl;
     _write_log(
-      t, log, coll, log_oid, divergent_priors,
+      t, km, log, coll, log_oid, divergent_priors,
       dirty_to,
       dirty_from,
       writeout_from,
@@ -764,19 +766,24 @@ void PGLog::write_log(
   }
 }
 
-void PGLog::write_log(ObjectStore::Transaction& t, pg_log_t &log,
+void PGLog::write_log(
+    ObjectStore::Transaction& t,
+    map<string,bufferlist> *km,
+    pg_log_t &log,
     const coll_t& coll, const ghobject_t &log_oid,
     map<eversion_t, hobject_t> &divergent_priors)
 {
   _write_log(
-    t, log, coll, log_oid,
+    t, km, log, coll, log_oid,
     divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
     set<eversion_t>(),
     true, true, 0);
 }
 
 void PGLog::_write_log(
-  ObjectStore::Transaction& t, pg_log_t &log,
+  ObjectStore::Transaction& t,
+  map<string,bufferlist> *km,
+  pg_log_t &log,
   const coll_t& coll, const ghobject_t &log_oid,
   map<eversion_t, hobject_t> &divergent_priors,
   eversion_t dirty_to,
@@ -816,13 +823,12 @@ void PGLog::_write_log(
     clear_after(log_keys_debug, dirty_from.get_key_name());
   }
 
-  map<string,bufferlist> keys;
   for (list<pg_log_entry_t>::iterator p = log.log.begin();
        p != log.log.end() && p->version <= dirty_to;
        ++p) {
     bufferlist bl(sizeof(*p) * 2);
     p->encode_with_checksum(bl);
-    keys[p->get_key_name()].claim(bl);
+    (*km)[p->get_key_name()].claim(bl);
   }
 
   for (list<pg_log_entry_t>::reverse_iterator p = log.log.rbegin();
@@ -832,13 +838,15 @@ void PGLog::_write_log(
        ++p) {
     bufferlist bl(sizeof(*p) * 2);
     p->encode_with_checksum(bl);
-    keys[p->get_key_name()].claim(bl);
+    (*km)[p->get_key_name()].claim(bl);
   }
 
   if (log_keys_debug) {
-    for (map<string, bufferlist>::iterator i = keys.begin();
-	 i != keys.end();
+    for (map<string, bufferlist>::iterator i = (*km).begin();
+	 i != (*km).end();
 	 ++i) {
+      if (i->first[0] == '_')
+	continue;
       assert(!log_keys_debug->count(i->first));
       log_keys_debug->insert(i->first);
     }
@@ -846,14 +854,13 @@ void PGLog::_write_log(
 
   if (dirty_divergent_priors) {
     //dout(10) << "write_log: writing divergent_priors" << dendl;
-    ::encode(divergent_priors, keys["divergent_priors"]);
+    ::encode(divergent_priors, (*km)["divergent_priors"]);
   }
-  ::encode(log.can_rollback_to, keys["can_rollback_to"]);
-  ::encode(log.rollback_info_trimmed_to, keys["rollback_info_trimmed_to"]);
+  ::encode(log.can_rollback_to, (*km)["can_rollback_to"]);
+  ::encode(log.rollback_info_trimmed_to, (*km)["rollback_info_trimmed_to"]);
 
   if (!to_remove.empty())
     t.omap_rmkeys(coll, log_oid, to_remove);
-  t.omap_setkeys(coll, log_oid, keys);
 }
 
 void PGLog::read_log(ObjectStore *store, coll_t pg_coll,
@@ -921,12 +928,13 @@ void PGLog::read_log(ObjectStore *store, coll_t pg_coll,
     dout(10) << "read_log checking for missing items over interval (" << info.last_complete
 	     << "," << info.last_update << "]" << dendl;
 
-    set<hobject_t> did;
+    set<hobject_t, hobject_t::BitwiseComparator> did;
     for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin();
 	 i != log.log.rend();
 	 ++i) {
       if (i->version <= info.last_complete) break;
-      if (i->soid > info.last_backfill) continue;
+      if (cmp(i->soid, info.last_backfill, info.last_backfill_bitwise) > 0)
+	continue;
       if (did.count(i->soid)) continue;
       did.insert(i->soid);
       
@@ -954,7 +962,8 @@ void PGLog::read_log(ObjectStore *store, coll_t pg_coll,
 	 i != divergent_priors.rend();
 	 ++i) {
       if (i->first <= info.last_complete) break;
-      if (i->second > info.last_backfill) continue;
+      if (cmp(i->second, info.last_backfill, info.last_backfill_bitwise) > 0)
+	continue;
       if (did.count(i->second)) continue;
       did.insert(i->second);
       bufferlist bv;
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index 7029e90..744f318 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -388,9 +388,12 @@ public:
   //////////////////// get or set missing ////////////////////
 
   const pg_missing_t& get_missing() const { return missing; }
+  void resort_missing(bool sort_bitwise) {
+    missing.resort(sort_bitwise);
+  }
 
-  void missing_got(map<hobject_t, pg_missing_t::item>::const_iterator m) {
-    map<hobject_t, pg_missing_t::item>::iterator p = missing.missing.find(m->first);
+  void missing_got(map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator m) {
+    map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator p = missing.missing.find(m->first);
     missing.got(p);
   }
 
@@ -406,8 +409,8 @@ public:
     missing.add(oid, need, have);
   }
 
-  void missing_rm(map<hobject_t, pg_missing_t::item>::const_iterator m) {
-    map<hobject_t, pg_missing_t::item>::iterator p = missing.missing.find(m->first);
+  void missing_rm(map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator m) {
+    map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator p = missing.missing.find(m->first);
     missing.rm(p);
   }
 
@@ -550,7 +553,7 @@ public:
 protected:
   static void split_by_object(
     list<pg_log_entry_t> &entries,
-    map<hobject_t, list<pg_log_entry_t> > *out_entries) {
+    map<hobject_t, list<pg_log_entry_t>, hobject_t::BitwiseComparator> *out_entries) {
     while (!entries.empty()) {
       list<pg_log_entry_t> &out_list = (*out_entries)[entries.front().soid];
       out_list.splice(out_list.end(), entries, entries.begin());
@@ -583,9 +586,9 @@ protected:
     map<eversion_t, hobject_t> *priors,  ///< [out] target for new priors
     LogEntryHandler *rollbacker          ///< [in] optional rollbacker object
     ) {
-    map<hobject_t, list<pg_log_entry_t> > split;
+    map<hobject_t, list<pg_log_entry_t>, hobject_t::BitwiseComparator > split;
     split_by_object(entries, &split);
-    for (map<hobject_t, list<pg_log_entry_t> >::iterator i = split.begin();
+    for (map<hobject_t, list<pg_log_entry_t>, hobject_t::BitwiseComparator>::iterator i = split.begin();
 	 i != split.end();
 	 ++i) {
       boost::optional<pair<eversion_t, hobject_t> > new_divergent_prior;
@@ -640,15 +643,22 @@ public:
 		 pg_info_t &info, LogEntryHandler *rollbacker,
 		 bool &dirty_info, bool &dirty_big_info);
 
-  void write_log(ObjectStore::Transaction& t, const coll_t& coll,
+  void write_log(ObjectStore::Transaction& t,
+		 map<string,bufferlist> *km,
+		 const coll_t& coll,
 		 const ghobject_t &log_oid);
 
-  static void write_log(ObjectStore::Transaction& t, pg_log_t &log,
+  static void write_log(
+    ObjectStore::Transaction& t,
+    map<string,bufferlist>* km,
+    pg_log_t &log,
     const coll_t& coll,
     const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors);
 
   static void _write_log(
-    ObjectStore::Transaction& t, pg_log_t &log,
+    ObjectStore::Transaction& t,
+    map<string,bufferlist>* km,
+    pg_log_t &log,
     const coll_t& coll, const ghobject_t &log_oid,
     map<eversion_t, hobject_t> &divergent_priors,
     eversion_t dirty_to,
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index b86d4d1..824ce46 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -62,11 +62,9 @@ static void log_subop_stats(
 ReplicatedBackend::ReplicatedBackend(
   PGBackend::Listener *pg,
   coll_t coll,
-  coll_t temp_coll,
   ObjectStore *store,
   CephContext *cct) :
-  PGBackend(pg, store,
-	    coll, temp_coll),
+  PGBackend(pg, store, coll),
   cct(cct) {}
 
 void ReplicatedBackend::run_recovery_op(
@@ -110,13 +108,13 @@ void ReplicatedBackend::recover_object(
 
 void ReplicatedBackend::check_recovery_sources(const OSDMapRef osdmap)
 {
-  for(map<pg_shard_t, set<hobject_t> >::iterator i = pull_from_peer.begin();
+  for(map<pg_shard_t, set<hobject_t, hobject_t::BitwiseComparator> >::iterator i = pull_from_peer.begin();
       i != pull_from_peer.end();
       ) {
     if (osdmap->is_down(i->first.osd)) {
       dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
 	       << ", osdmap has it marked down" << dendl;
-      for (set<hobject_t>::iterator j = i->second.begin();
+      for (set<hobject_t, hobject_t::BitwiseComparator>::iterator j = i->second.begin();
 	   j != i->second.end();
 	   ++j) {
 	assert(pulling.count(*j) == 1);
@@ -252,15 +250,6 @@ void ReplicatedBackend::on_change()
 
 void ReplicatedBackend::on_flushed()
 {
-  if (have_temp_coll() &&
-      !store->collection_empty(get_temp_coll())) {
-    vector<hobject_t> objects;
-    store->collection_list(get_temp_coll(), objects);
-    derr << __func__ << ": found objects in the temp collection: "
-	 << objects << ", crashing now"
-	 << dendl;
-    assert(0 == "found garbage in the temp collection");
-  }
 }
 
 int ReplicatedBackend::objects_read_sync(
@@ -270,7 +259,7 @@ int ReplicatedBackend::objects_read_sync(
   uint32_t op_flags,
   bufferlist *bl)
 {
-  return store->read(coll, hoid, off, len, *bl, op_flags);
+  return store->read(coll, ghobject_t(hoid), off, len, *bl, op_flags);
 }
 
 struct AsyncReadCallback : public GenContext<ThreadPool::TPHandle&> {
@@ -289,15 +278,19 @@ void ReplicatedBackend::objects_read_async(
   const hobject_t &hoid,
   const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
 		  pair<bufferlist*, Context*> > > &to_read,
-  Context *on_complete)
+  Context *on_complete,
+  bool fast_read)
 {
+  // There is no fast read implementation for replication backend yet
+  assert(!fast_read);
+
   int r = 0;
   for (list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
 		 pair<bufferlist*, Context*> > >::const_iterator i =
 	   to_read.begin();
        i != to_read.end() && r >= 0;
        ++i) {
-    int _r = store->read(coll, hoid, i->first.get<0>(),
+    int _r = store->read(coll, ghobject_t(hoid), i->first.get<0>(),
 			 i->first.get<1>(), *(i->second.first),
 			 i->first.get<2>());
     if (i->second.second) {
@@ -316,9 +309,8 @@ void ReplicatedBackend::objects_read_async(
 
 class RPGTransaction : public PGBackend::PGTransaction {
   coll_t coll;
-  coll_t temp_coll;
-  set<hobject_t> temp_added;
-  set<hobject_t> temp_cleared;
+  set<hobject_t, hobject_t::BitwiseComparator> temp_added;
+  set<hobject_t, hobject_t::BitwiseComparator> temp_cleared;
   ObjectStore::Transaction *t;
   uint64_t written;
   const coll_t &get_coll_ct(const hobject_t &hoid) {
@@ -336,17 +328,13 @@ class RPGTransaction : public PGBackend::PGTransaction {
     return get_coll(hoid);
   }
   const coll_t &get_coll(const hobject_t &hoid) {
-    if (hoid.is_temp())
-      return temp_coll;
-    else
-      return coll;
+    return coll;
   }
 public:
-  RPGTransaction(coll_t coll, coll_t temp_coll, bool use_tbl)
-    : coll(coll), temp_coll(temp_coll), t(new ObjectStore::Transaction), written(0)
-    {
-      t->set_use_tbl(use_tbl);
-    }
+  RPGTransaction(coll_t coll, bool use_tbl)
+    : coll(coll), t(new ObjectStore::Transaction), written(0) {
+    t->set_use_tbl(use_tbl);
+  }
 
   /// Yields ownership of contained transaction
   ObjectStore::Transaction *get_transaction() {
@@ -354,10 +342,10 @@ public:
     t = 0;
     return _t;
   }
-  const set<hobject_t> &get_temp_added() {
+  const set<hobject_t, hobject_t::BitwiseComparator> &get_temp_added() {
     return temp_added;
   }
-  const set<hobject_t> &get_temp_cleared() {
+  const set<hobject_t, hobject_t::BitwiseComparator> &get_temp_cleared() {
     return temp_cleared;
   }
 
@@ -369,38 +357,38 @@ public:
     uint32_t fadvise_flags
     ) {
     written += len;
-    t->write(get_coll_ct(hoid), hoid, off, len, bl, fadvise_flags);
+    t->write(get_coll_ct(hoid), ghobject_t(hoid), off, len, bl, fadvise_flags);
   }
   void remove(
     const hobject_t &hoid
     ) {
-    t->remove(get_coll_rm(hoid), hoid);
+    t->remove(get_coll_rm(hoid), ghobject_t(hoid));
   }
   void stash(
     const hobject_t &hoid,
     version_t former_version) {
     t->collection_move_rename(
-      coll, hoid, coll,
+      coll, ghobject_t(hoid), coll,
       ghobject_t(hoid, former_version, shard_id_t::NO_SHARD));
   }
   void setattrs(
     const hobject_t &hoid,
     map<string, bufferlist> &attrs
     ) {
-    t->setattrs(get_coll(hoid), hoid, attrs);
+    t->setattrs(get_coll(hoid), ghobject_t(hoid), attrs);
   }
   void setattr(
     const hobject_t &hoid,
     const string &attrname,
     bufferlist &bl
     ) {
-    t->setattr(get_coll(hoid), hoid, attrname, bl);
+    t->setattr(get_coll(hoid), ghobject_t(hoid), attrname, bl);
   }
   void rmattr(
     const hobject_t &hoid,
     const string &attrname
     ) {
-    t->rmattr(get_coll(hoid), hoid, attrname);
+    t->rmattr(get_coll(hoid), ghobject_t(hoid), attrname);
   }
   void omap_setkeys(
     const hobject_t &hoid,
@@ -408,25 +396,25 @@ public:
     ) {
     for (map<string, bufferlist>::iterator p = keys.begin(); p != keys.end(); ++p)
       written += p->first.length() + p->second.length();
-    return t->omap_setkeys(get_coll(hoid), hoid, keys);
+    return t->omap_setkeys(get_coll(hoid), ghobject_t(hoid), keys);
   }
   void omap_rmkeys(
     const hobject_t &hoid,
     set<string> &keys
     ) {
-    t->omap_rmkeys(get_coll(hoid), hoid, keys);
+    t->omap_rmkeys(get_coll(hoid), ghobject_t(hoid), keys);
   }
   void omap_clear(
     const hobject_t &hoid
     ) {
-    t->omap_clear(get_coll(hoid), hoid);
+    t->omap_clear(get_coll(hoid), ghobject_t(hoid));
   }
   void omap_setheader(
     const hobject_t &hoid,
     bufferlist &header
     ) {
     written += header.length();
-    t->omap_setheader(get_coll(hoid), hoid, header);
+    t->omap_setheader(get_coll(hoid), ghobject_t(hoid), header);
   }
   void clone_range(
     const hobject_t &from,
@@ -436,14 +424,14 @@ public:
     uint64_t tooff
     ) {
     assert(get_coll(from) == get_coll_ct(to)  && get_coll(from) == coll);
-    t->clone_range(coll, from, to, fromoff, len, tooff);
+    t->clone_range(coll, ghobject_t(from), ghobject_t(to), fromoff, len, tooff);
   }
   void clone(
     const hobject_t &from,
     const hobject_t &to
     ) {
     assert(get_coll(from) == get_coll_ct(to)  && get_coll(from) == coll);
-    t->clone(coll, from, to);
+    t->clone(coll, ghobject_t(from), ghobject_t(to));
   }
   void rename(
     const hobject_t &from,
@@ -451,29 +439,29 @@ public:
     ) {
     t->collection_move_rename(
       get_coll_rm(from),
-      from,
+      ghobject_t(from),
       get_coll_ct(to),
-      to);
+      ghobject_t(to));
   }
 
   void touch(
     const hobject_t &hoid
     ) {
-    t->touch(get_coll_ct(hoid), hoid);
+    t->touch(get_coll_ct(hoid), ghobject_t(hoid));
   }
 
   void truncate(
     const hobject_t &hoid,
     uint64_t off
     ) {
-    t->truncate(get_coll(hoid), hoid, off);
+    t->truncate(get_coll(hoid), ghobject_t(hoid), off);
   }
   void zero(
     const hobject_t &hoid,
     uint64_t off,
     uint64_t len
     ) {
-    t->zero(get_coll(hoid), hoid, off, len);
+    t->zero(get_coll(hoid), ghobject_t(hoid), off, len);
   }
 
   void set_alloc_hint(
@@ -481,10 +469,11 @@ public:
     uint64_t expected_object_size,
     uint64_t expected_write_size
     ) {
-    t->set_alloc_hint(get_coll(hoid), hoid, expected_object_size,
+    t->set_alloc_hint(get_coll(hoid), ghobject_t(hoid), expected_object_size,
                       expected_write_size);
   }
 
+  using PGBackend::PGTransaction::append;
   void append(
     PGTransaction *_to_append
     ) {
@@ -493,13 +482,13 @@ public:
     written += to_append->written;
     to_append->written = 0;
     t->append(*(to_append->t));
-    for (set<hobject_t>::iterator i = to_append->temp_added.begin();
+    for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = to_append->temp_added.begin();
 	 i != to_append->temp_added.end();
 	 ++i) {
       temp_cleared.erase(*i);
       temp_added.insert(*i);
     }
-    for (set<hobject_t>::iterator i = to_append->temp_cleared.begin();
+    for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = to_append->temp_cleared.begin();
 	 i != to_append->temp_cleared.end();
 	 ++i) {
       temp_added.erase(*i);
@@ -520,7 +509,7 @@ public:
 
 PGBackend::PGTransaction *ReplicatedBackend::get_transaction()
 {
-  return new RPGTransaction(coll, get_temp_coll(), parent->transaction_use_tbl());
+  return new RPGTransaction(coll, parent->transaction_use_tbl());
 }
 
 class C_OSD_OnOpCommit : public Context {
@@ -600,10 +589,9 @@ void ReplicatedBackend::submit_transaction(
     &op,
     op_t);
 
-  ObjectStore::Transaction local_t;
-  local_t.set_use_tbl(op_t->get_use_tbl());
+  ObjectStore::Transaction *local_t = new ObjectStore::Transaction;
+  local_t->set_use_tbl(op_t->get_use_tbl());
   if (!(t->get_temp_added().empty())) {
-    get_temp_coll(&local_t);
     add_temp_objs(t->get_temp_added());
   }
   clear_temp_objs(t->get_temp_cleared());
@@ -614,10 +602,7 @@ void ReplicatedBackend::submit_transaction(
     trim_to,
     trim_rollback_to,
     true,
-    &local_t);
-
-  local_t.append(*op_t);
-  local_t.swap(*op_t);
+    local_t);
   
   op_t->register_on_applied_sync(on_local_applied_sync);
   op_t->register_on_applied(
@@ -625,11 +610,16 @@ void ReplicatedBackend::submit_transaction(
       new C_OSD_OnOpApplied(this, &op)));
   op_t->register_on_applied(
     new ObjectStore::C_DeleteTransaction(op_t));
+  op_t->register_on_applied(
+    new ObjectStore::C_DeleteTransaction(local_t));
   op_t->register_on_commit(
     parent->bless_context(
       new C_OSD_OnOpCommit(this, &op)));
-      
-  parent->queue_transaction(op_t, op.op);
+
+  list<ObjectStore::Transaction*> tls;
+  tls.push_back(local_t);
+  tls.push_back(op_t);
+  parent->queue_transactions(tls, op.op);
   delete t;
 }
 
@@ -709,12 +699,18 @@ void ReplicatedBackend::sub_op_modify_reply(OpRequestRef op)
     if (r->ack_type & CEPH_OSD_FLAG_ONDISK) {
       assert(ip_op.waiting_for_commit.count(from));
       ip_op.waiting_for_commit.erase(from);
-      if (ip_op.op)
-	ip_op.op->mark_event("sub_op_commit_rec");
+      if (ip_op.op) {
+        ostringstream ss;
+        ss << "sub_op_commit_rec from " << from;
+	ip_op.op->mark_event(ss.str());
+      }
     } else {
       assert(ip_op.waiting_for_applied.count(from));
-      if (ip_op.op)
-	ip_op.op->mark_event("sub_op_applied_rec");
+      if (ip_op.op) {
+        ostringstream ss;
+        ss << "sub_op_applied_rec from " << from;
+	ip_op.op->mark_event(ss.str());
+      }
     }
     ip_op.waiting_for_applied.erase(from);
 
@@ -750,13 +746,16 @@ void ReplicatedBackend::be_deep_scrub(
   bufferlist bl, hdrbl;
   int r;
   __u64 pos = 0;
+
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
   while ( (r = store->read(
-	     coll,
-	     ghobject_t(
-	       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-	     pos,
-	     cct->_conf->osd_deep_scrub_stride, bl,
-	     true)) > 0) {
+             coll,
+             ghobject_t(
+               poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+             pos,
+             cct->_conf->osd_deep_scrub_stride, bl,
+             fadvise_flags, true)) > 0) {
     handle.reset_tp_timeout();
     h << bl;
     pos += bl.length();
@@ -872,7 +871,7 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
 	   to_continue.begin();
 	 i != to_continue.end();
 	 ++i) {
-      map<hobject_t, ReplicatedBackend::PullInfo>::iterator j =
+      map<hobject_t, ReplicatedBackend::PullInfo, hobject_t::BitwiseComparator>::iterator j =
 	bc->pulling.find(*i);
       assert(j != bc->pulling.end());
       if (!bc->start_pushes(*i, j->second.obc, h)) {
@@ -1158,14 +1157,13 @@ void ReplicatedBackend::sub_op_modify_impl(OpRequestRef op)
   if (m->new_temp_oid != hobject_t()) {
     dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl;
     add_temp_obj(m->new_temp_oid);
-    get_temp_coll(&rm->localt);
   }
   if (m->discard_temp_oid != hobject_t()) {
     dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl;
     if (rm->opt.empty()) {
       dout(10) << __func__ << ": removing object " << m->discard_temp_oid
 	       << " since we won't get the transaction" << dendl;
-      rm->localt.remove(temp_coll, m->discard_temp_oid);
+      rm->localt.remove(coll, ghobject_t(m->discard_temp_oid));
     }
     clear_temp_obj(m->discard_temp_oid);
   }
@@ -1195,14 +1193,16 @@ void ReplicatedBackend::sub_op_modify_impl(OpRequestRef op)
 
   op->mark_started();
 
-  rm->localt.append(rm->opt);
-  rm->localt.register_on_commit(
+  rm->opt.register_on_commit(
     parent->bless_context(
       new C_OSD_RepModifyCommit(this, rm)));
   rm->localt.register_on_applied(
     parent->bless_context(
       new C_OSD_RepModifyApply(this, rm)));
-  parent->queue_transaction(&(rm->localt), op);
+  list<ObjectStore::Transaction*> tls;
+  tls.push_back(&(rm->localt));
+  tls.push_back(&(rm->opt));
+  parent->queue_transactions(tls, op);
   // op is cleaned up by oncommit/onapply when both are executed
 }
 
@@ -1261,7 +1261,7 @@ void ReplicatedBackend::sub_op_modify_commit(RepModifyRef rm)
   get_parent()->update_last_complete_ondisk(rm->last_complete);
 
   Message *m = rm->op->get_req();
-  Message *commit;
+  Message *commit = NULL;
   if (m->get_type() == MSG_OSD_SUBOP) {
     // doesn't have CLIENT SUBOP feature ,use Subop
     MOSDSubOpReply  *reply = new MOSDSubOpReply(
@@ -1297,7 +1297,7 @@ void ReplicatedBackend::calc_head_subsets(
   const pg_missing_t& missing,
   const hobject_t &last_backfill,
   interval_set<uint64_t>& data_subset,
-  map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+  map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>& clone_subsets)
 {
   dout(10) << "calc_head_subsets " << head
 	   << " clone_overlap " << snapset.clone_overlap << dendl;
@@ -1326,7 +1326,8 @@ void ReplicatedBackend::calc_head_subsets(
     hobject_t c = head;
     c.snap = snapset.clones[j];
     prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
-    if (!missing.is_missing(c) && c < last_backfill) {
+    if (!missing.is_missing(c) &&
+	cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0) {
       dout(10) << "calc_head_subsets " << head << " has prev " << c
 	       << " overlap " << prev << dendl;
       clone_subsets[c] = prev;
@@ -1357,7 +1358,7 @@ void ReplicatedBackend::calc_clone_subsets(
   const pg_missing_t& missing,
   const hobject_t &last_backfill,
   interval_set<uint64_t>& data_subset,
-  map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+  map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>& clone_subsets)
 {
   dout(10) << "calc_clone_subsets " << soid
 	   << " clone_overlap " << snapset.clone_overlap << dendl;
@@ -1390,7 +1391,8 @@ void ReplicatedBackend::calc_clone_subsets(
     hobject_t c = soid;
     c.snap = snapset.clones[j];
     prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
-    if (!missing.is_missing(c) && c < last_backfill) {
+    if (!missing.is_missing(c) &&
+	cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0) {
       dout(10) << "calc_clone_subsets " << soid << " has prev " << c
 	       << " overlap " << prev << dendl;
       clone_subsets[c] = prev;
@@ -1409,7 +1411,8 @@ void ReplicatedBackend::calc_clone_subsets(
     hobject_t c = soid;
     c.snap = snapset.clones[j];
     next.intersection_of(snapset.clone_overlap[snapset.clones[j-1]]);
-    if (!missing.is_missing(c) && c < last_backfill) {
+    if (!missing.is_missing(c) &&
+	cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0) {
       dout(10) << "calc_clone_subsets " << soid << " has next " << c
 	       << " overlap " << next << dendl;
       clone_subsets[c] = next;
@@ -1445,11 +1448,11 @@ void ReplicatedBackend::prepare_pull(
   eversion_t _v = get_parent()->get_local_missing().missing.find(
     soid)->second.need;
   assert(_v == v);
-  const map<hobject_t, set<pg_shard_t> > &missing_loc(
+  const map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &missing_loc(
     get_parent()->get_missing_loc_shards());
   const map<pg_shard_t, pg_missing_t > &peer_missing(
     get_parent()->get_shard_missing());
-  map<hobject_t, set<pg_shard_t> >::const_iterator q = missing_loc.find(soid);
+  map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::const_iterator q = missing_loc.find(soid);
   assert(q != missing_loc.end());
   assert(!q->second.empty());
 
@@ -1500,6 +1503,9 @@ void ReplicatedBackend::prepare_pull(
 		       recovery_info.clone_subset);
     // FIXME: this may overestimate if we are pulling multiple clones in parallel...
     dout(10) << " pulling " << recovery_info << dendl;
+
+    assert(ssc->snapset.clone_size.count(soid.snap));
+    recovery_info.size = ssc->snapset.clone_size[soid.snap];
   } else {
     // pulling head or unversioned object.
     // always pull the whole thing.
@@ -1525,6 +1531,7 @@ void ReplicatedBackend::prepare_pull(
   pi.head_ctx = headctx;
   pi.recovery_info = op.recovery_info;
   pi.recovery_progress = op.recovery_progress;
+  pi.cache_dont_need = h->cache_dont_need;
 }
 
 /*
@@ -1533,7 +1540,7 @@ void ReplicatedBackend::prepare_pull(
  */
 void ReplicatedBackend::prep_push_to_replica(
   ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
-  PushOp *pop)
+  PushOp *pop, bool cache_dont_need)
 {
   const object_info_t& oi = obc->obs.oi;
   uint64_t size = obc->obs.oi.size;
@@ -1541,7 +1548,7 @@ void ReplicatedBackend::prep_push_to_replica(
   dout(10) << __func__ << ": " << soid << " v" << oi.version
 	   << " size " << size << " to osd." << peer << dendl;
 
-  map<hobject_t, interval_set<uint64_t> > clone_subsets;
+  map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator> clone_subsets;
   interval_set<uint64_t> data_subset;
 
   // are we doing a clone on the replica?
@@ -1589,7 +1596,7 @@ void ReplicatedBackend::prep_push_to_replica(
       data_subset, clone_subsets);
   }
 
-  prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
+  prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop, cache_dont_need);
 }
 
 void ReplicatedBackend::prep_push(ObjectContextRef obc,
@@ -1599,7 +1606,7 @@ void ReplicatedBackend::prep_push(ObjectContextRef obc,
   interval_set<uint64_t> data_subset;
   if (obc->obs.oi.size)
     data_subset.insert(0, obc->obs.oi.size);
-  map<hobject_t, interval_set<uint64_t> > clone_subsets;
+  map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator> clone_subsets;
 
   prep_push(obc, soid, peer,
 	    obc->obs.oi.version, data_subset, clone_subsets,
@@ -1611,8 +1618,9 @@ void ReplicatedBackend::prep_push(
   const hobject_t& soid, pg_shard_t peer,
   eversion_t version,
   interval_set<uint64_t> &data_subset,
-  map<hobject_t, interval_set<uint64_t> >& clone_subsets,
-  PushOp *pop)
+  map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>& clone_subsets,
+  PushOp *pop,
+  bool cache_dont_need)
 {
   get_parent()->begin_peer_recover(peer, soid);
   // take note.
@@ -1634,7 +1642,7 @@ void ReplicatedBackend::prep_push(
 			pi.recovery_progress,
 			&new_progress,
 			pop,
-			&(pi.stat));
+			&(pi.stat), cache_dont_need);
   assert(r == 0);
   pi.recovery_progress = new_progress;
 }
@@ -1678,6 +1686,7 @@ void ReplicatedBackend::submit_push_data(
   ObjectRecoveryInfo &recovery_info,
   bool first,
   bool complete,
+  bool cache_dont_need,
   const interval_set<uint64_t> &intervals_included,
   bufferlist data_included,
   bufferlist omap_header,
@@ -1685,45 +1694,52 @@ void ReplicatedBackend::submit_push_data(
   map<string, bufferlist> &omap_entries,
   ObjectStore::Transaction *t)
 {
-  coll_t target_coll;
+  hobject_t target_oid;
   if (first && complete) {
-    target_coll = coll;
+    target_oid = recovery_info.soid;
   } else {
-    dout(10) << __func__ << ": Creating oid "
-	     << recovery_info.soid << " in the temp collection" << dendl;
-    add_temp_obj(recovery_info.soid);
-    target_coll = get_temp_coll(t);
+    target_oid = get_parent()->get_temp_recovery_object(recovery_info.version,
+							recovery_info.soid.snap);
+    if (first) {
+      dout(10) << __func__ << ": Adding oid "
+	       << target_oid << " in the temp collection" << dendl;
+      add_temp_obj(target_oid);
+    }
   }
 
   if (first) {
-    get_parent()->on_local_recover_start(recovery_info.soid, t);
-    t->remove(get_temp_coll(t), recovery_info.soid);
-    t->touch(target_coll, recovery_info.soid);
-    t->truncate(target_coll, recovery_info.soid, recovery_info.size);
-    t->omap_setheader(target_coll, recovery_info.soid, omap_header);
+    t->remove(coll, ghobject_t(target_oid));
+    t->touch(coll, ghobject_t(target_oid));
+    t->truncate(coll, ghobject_t(target_oid), recovery_info.size);
+    t->omap_setheader(coll, ghobject_t(target_oid), omap_header);
   }
   uint64_t off = 0;
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
+  if (cache_dont_need)
+    fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
   for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
        p != intervals_included.end();
        ++p) {
     bufferlist bit;
     bit.substr_of(data_included, off, p.get_len());
-    t->write(target_coll, recovery_info.soid,
-	     p.get_start(), p.get_len(), bit);
+    t->write(coll, ghobject_t(target_oid),
+	     p.get_start(), p.get_len(), bit, fadvise_flags);
     off += p.get_len();
   }
 
-  t->omap_setkeys(target_coll, recovery_info.soid,
-		  omap_entries);
-  t->setattrs(target_coll, recovery_info.soid,
-	      attrs);
+  if (!omap_entries.empty())
+    t->omap_setkeys(coll, ghobject_t(target_oid), omap_entries);
+  if (!attrs.empty())
+    t->setattrs(coll, ghobject_t(target_oid), attrs);
 
   if (complete) {
     if (!first) {
       dout(10) << __func__ << ": Removing oid "
-	       << recovery_info.soid << " from the temp collection" << dendl;
-      clear_temp_obj(recovery_info.soid);
-      t->collection_move(coll, target_coll, recovery_info.soid);
+	       << target_oid << " from the temp collection" << dendl;
+      clear_temp_obj(target_oid);
+      t->remove(coll, ghobject_t(recovery_info.soid));
+      t->collection_move_rename(coll, ghobject_t(target_oid),
+				coll, ghobject_t(recovery_info.soid));
     }
 
     submit_push_complete(recovery_info, t);
@@ -1733,7 +1749,7 @@ void ReplicatedBackend::submit_push_data(
 void ReplicatedBackend::submit_push_complete(ObjectRecoveryInfo &recovery_info,
 					     ObjectStore::Transaction *t)
 {
-  for (map<hobject_t, interval_set<uint64_t> >::const_iterator p =
+  for (map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>::const_iterator p =
 	 recovery_info.clone_subset.begin();
        p != recovery_info.clone_subset.end();
        ++p) {
@@ -1742,7 +1758,7 @@ void ReplicatedBackend::submit_push_complete(ObjectRecoveryInfo &recovery_info,
 	 ++q) {
       dout(15) << " clone_range " << p->first << " "
 	       << q.get_start() << "~" << q.get_len() << dendl;
-      t->clone_range(coll, p->first, recovery_info.soid,
+      t->clone_range(coll, ghobject_t(p->first), ghobject_t(recovery_info.soid),
 		     q.get_start(), q.get_len(), q.get_start());
     }
   }
@@ -1802,6 +1818,15 @@ bool ReplicatedBackend::handle_pull_response(
 
   bool first = pi.recovery_progress.first;
   if (first) {
+    // attrs only reference the origin bufferlist (decode from MOSDPGPush message)
+    // whose size is much greater than attrs in recovery. If obc cache it (get_obc maybe
+    // cache the attr), this causes the whole origin bufferlist would not be free until
+    // obc is evicted from obc cache. So rebuild the bufferlist before cache it.
+    for (map<string, bufferlist>::iterator it = pop.attrset.begin();
+         it != pop.attrset.end();
+         ++it) {
+      it->second.rebuild();
+    }
     pi.obc = get_parent()->get_obc(pi.recovery_info.soid, pop.attrset);
     pi.recovery_info.oi = pi.obc->obs.oi;
     pi.recovery_info = recalc_subsets(pi.recovery_info, pi.obc->ssc);
@@ -1830,7 +1855,7 @@ bool ReplicatedBackend::handle_pull_response(
   bool complete = pi.is_complete();
 
   submit_push_data(pi.recovery_info, first,
-		   complete,
+		   complete, pi.cache_dont_need,
 		   data_included, data,
 		   pop.omap_header,
 		   pop.attrset,
@@ -1874,6 +1899,7 @@ void ReplicatedBackend::handle_push(
   submit_push_data(pop.recovery_info,
 		   first,
 		   complete,
+		   true, // must be replicate
 		   pop.data_included,
 		   data,
 		   pop.omap_header,
@@ -1900,38 +1926,28 @@ void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &
       get_osdmap()->get_epoch());
     if (!con)
       continue;
-    if (!(con->get_features() & CEPH_FEATURE_OSD_PACKED_RECOVERY)) {
-      for (vector<PushOp>::iterator j = i->second.begin();
-	   j != i->second.end();
+    vector<PushOp>::iterator j = i->second.begin();
+    while (j != i->second.end()) {
+      uint64_t cost = 0;
+      uint64_t pushes = 0;
+      MOSDPGPush *msg = new MOSDPGPush();
+      msg->from = get_parent()->whoami_shard();
+      msg->pgid = get_parent()->primary_spg_t();
+      msg->map_epoch = get_osdmap()->get_epoch();
+      msg->set_priority(prio);
+      for (;
+           (j != i->second.end() &&
+	    cost < cct->_conf->osd_max_push_cost &&
+	    pushes < cct->_conf->osd_max_push_objects) ;
 	   ++j) {
-	dout(20) << __func__ << ": sending push (legacy) " << *j
+	dout(20) << __func__ << ": sending push " << *j
 		 << " to osd." << i->first << dendl;
-	send_push_op_legacy(prio, i->first, *j);
-      }
-    } else {
-      vector<PushOp>::iterator j = i->second.begin();
-      while (j != i->second.end()) {
-	uint64_t cost = 0;
-	uint64_t pushes = 0;
-	MOSDPGPush *msg = new MOSDPGPush();
-	msg->from = get_parent()->whoami_shard();
-	msg->pgid = get_parent()->primary_spg_t();
-	msg->map_epoch = get_osdmap()->get_epoch();
-	msg->set_priority(prio);
-	for (;
-	     (j != i->second.end() &&
-	      cost < cct->_conf->osd_max_push_cost &&
-	      pushes < cct->_conf->osd_max_push_objects) ;
-	     ++j) {
-	  dout(20) << __func__ << ": sending push " << *j
-		   << " to osd." << i->first << dendl;
-	  cost += j->cost(cct);
-	  pushes += 1;
-	  msg->pushes.push_back(*j);
-	}
-	msg->compute_cost(cct);
-	get_parent()->send_message_osd_cluster(msg, con);
+	cost += j->cost(cct);
+	pushes += 1;
+	msg->pushes.push_back(*j);
       }
+      msg->compute_cost(cct);
+      get_parent()->send_message_osd_cluster(msg, con);
     }
   }
 }
@@ -1946,30 +1962,16 @@ void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &p
       get_osdmap()->get_epoch());
     if (!con)
       continue;
-    if (!(con->get_features() & CEPH_FEATURE_OSD_PACKED_RECOVERY)) {
-      for (vector<PullOp>::iterator j = i->second.begin();
-	   j != i->second.end();
-	   ++j) {
-	dout(20) << __func__ << ": sending pull (legacy) " << *j
-		 << " to osd." << i->first << dendl;
-	send_pull_legacy(
-	  prio,
-	  i->first,
-	  j->recovery_info,
-	  j->recovery_progress);
-      }
-    } else {
-      dout(20) << __func__ << ": sending pulls " << i->second
-	       << " to osd." << i->first << dendl;
-      MOSDPGPull *msg = new MOSDPGPull();
-      msg->from = parent->whoami_shard();
-      msg->set_priority(prio);
-      msg->pgid = get_parent()->primary_spg_t();
-      msg->map_epoch = get_osdmap()->get_epoch();
-      msg->pulls.swap(i->second);
-      msg->compute_cost(cct);
-      get_parent()->send_message_osd_cluster(msg, con);
-    }
+    dout(20) << __func__ << ": sending pulls " << i->second
+	     << " to osd." << i->first << dendl;
+    MOSDPGPull *msg = new MOSDPGPull();
+    msg->from = parent->whoami_shard();
+    msg->set_priority(prio);
+    msg->pgid = get_parent()->primary_spg_t();
+    msg->map_epoch = get_osdmap()->get_epoch();
+    msg->pulls.swap(i->second);
+    msg->compute_cost(cct);
+    get_parent()->send_message_osd_cluster(msg, con);
   }
 }
 
@@ -1977,7 +1979,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
 				     const ObjectRecoveryProgress &progress,
 				     ObjectRecoveryProgress *out_progress,
 				     PushOp *out_op,
-				     object_stat_sum_t *stat)
+				     object_stat_sum_t *stat,
+                                     bool cache_dont_need)
 {
   ObjectRecoveryProgress _new_progress;
   if (!out_progress)
@@ -1992,8 +1995,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
           << dendl;
 
   if (progress.first) {
-    store->omap_get_header(coll, recovery_info.soid, &out_op->omap_header);
-    store->getattrs(coll, recovery_info.soid, out_op->attrset);
+    store->omap_get_header(coll, ghobject_t(recovery_info.soid), &out_op->omap_header);
+    store->getattrs(coll, ghobject_t(recovery_info.soid), out_op->attrset);
 
     // Debug
     bufferlist bv = out_op->attrset[OI_ATTR];
@@ -2015,7 +2018,7 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
   if (!progress.omap_complete) {
     ObjectMap::ObjectMapIterator iter =
       store->get_omap_iterator(coll,
-			       recovery_info.soid);
+			       ghobject_t(recovery_info.soid));
     for (iter->lower_bound(progress.omap_recovered_to);
 	 iter->valid();
 	 iter->next()) {
@@ -2039,7 +2042,7 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
     if (!recovery_info.copy_subset.empty()) {
       interval_set<uint64_t> copy_subset = recovery_info.copy_subset;
       bufferlist bl;
-      int r = store->fiemap(coll, recovery_info.soid, 0,
+      int r = store->fiemap(coll, ghobject_t(recovery_info.soid), 0,
                             copy_subset.range_end(), bl);
       if (r >= 0)  {
         interval_set<uint64_t> fiemap_included;
@@ -2068,8 +2071,9 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
        p != out_op->data_included.end();
        ++p) {
     bufferlist bit;
-    store->read(coll, recovery_info.soid,
-		     p.get_start(), p.get_len(), bit);
+    store->read(coll, ghobject_t(recovery_info.soid),
+		p.get_start(), p.get_len(), bit,
+                cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
     if (p.get_len() != bit.length()) {
       dout(10) << " extent " << p.get_start() << "~" << p.get_len()
 	       << " is actually " << p.get_start() << "~" << bit.length()
@@ -2253,7 +2257,7 @@ void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply)
 {
   const hobject_t &soid = op.soid;
   struct stat st;
-  int r = store->stat(coll, soid, &st);
+  int r = store->stat(coll, ghobject_t(soid), &st);
   if (r != 0) {
     get_parent()->clog_error() << get_info().pgid << " "
 			       << peer << " tried to pull " << soid
@@ -2414,8 +2418,7 @@ int ReplicatedBackend::start_pushes(
       ++pushes;
       h->pushes[peer].push_back(PushOp());
       prep_push_to_replica(obc, soid, peer,
-			   &(h->pushes[peer].back())
-	);
+			   &(h->pushes[peer].back()), h->cache_dont_need);
     }
   }
   return pushes;
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index 5090657..a36007d 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -33,7 +33,6 @@ public:
   ReplicatedBackend(
     PGBackend::Listener *pg,
     coll_t coll,
-    coll_t temp_coll,
     ObjectStore *store,
     CephContext *cct);
 
@@ -98,14 +97,14 @@ public:
   virtual void dump_recovery_info(Formatter *f) const {
     {
       f->open_array_section("pull_from_peer");
-      for (map<pg_shard_t, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+      for (map<pg_shard_t, set<hobject_t, hobject_t::BitwiseComparator> >::const_iterator i = pull_from_peer.begin();
 	   i != pull_from_peer.end();
 	   ++i) {
 	f->open_object_section("pulling_from");
 	f->dump_stream("pull_from") << i->first;
 	{
 	  f->open_array_section("pulls");
-	  for (set<hobject_t>::const_iterator j = i->second.begin();
+	  for (set<hobject_t, hobject_t::BitwiseComparator>::const_iterator j = i->second.begin();
 	       j != i->second.end();
 	       ++j) {
 	    f->open_object_section("pull_info");
@@ -121,7 +120,7 @@ public:
     }
     {
       f->open_array_section("pushing");
-      for (map<hobject_t, map<pg_shard_t, PushInfo> >::const_iterator i =
+      for (map<hobject_t, map<pg_shard_t, PushInfo>, hobject_t::BitwiseComparator>::const_iterator i =
 	     pushing.begin();
 	   i != pushing.end();
 	   ++i) {
@@ -160,7 +159,8 @@ public:
     const hobject_t &hoid,
     const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
 	       pair<bufferlist*, Context*> > > &to_read,
-    Context *on_complete);
+               Context *on_complete,
+               bool fast_read = false);
 
 private:
   // push
@@ -183,7 +183,7 @@ private:
       }
     }
   };
-  map<hobject_t, map<pg_shard_t, PushInfo> > pushing;
+  map<hobject_t, map<pg_shard_t, PushInfo>, hobject_t::BitwiseComparator> pushing;
 
   // pull
   struct PullInfo {
@@ -192,6 +192,7 @@ private:
     ObjectContextRef head_ctx;
     ObjectContextRef obc;
     object_stat_sum_t stat;
+    bool cache_dont_need;
 
     void dump(Formatter *f) const {
       {
@@ -211,10 +212,10 @@ private:
     }
   };
 
-  map<hobject_t, PullInfo> pulling;
+  map<hobject_t, PullInfo, hobject_t::BitwiseComparator> pulling;
 
   // Reverse mapping from osd peer to objects beging pulled from that peer
-  map<pg_shard_t, set<hobject_t> > pull_from_peer;
+  map<pg_shard_t, set<hobject_t, hobject_t::BitwiseComparator> > pull_from_peer;
 
   void sub_op_push(OpRequestRef op);
   void sub_op_push_reply(OpRequestRef op);
@@ -263,10 +264,12 @@ private:
 		    const ObjectRecoveryProgress &progress,
 		    ObjectRecoveryProgress *out_progress,
 		    PushOp *out_op,
-		    object_stat_sum_t *stat = 0);
+		    object_stat_sum_t *stat = 0,
+                    bool cache_dont_need = true);
   void submit_push_data(ObjectRecoveryInfo &recovery_info,
 			bool first,
 			bool complete,
+			bool cache_dont_need,
 			const interval_set<uint64_t> &intervals_included,
 			bufferlist data_included,
 			bufferlist omap_header,
@@ -280,7 +283,7 @@ private:
     SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
     const hobject_t &last_backfill,
     interval_set<uint64_t>& data_subset,
-    map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+    map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>& clone_subsets);
   void prepare_pull(
     eversion_t v,
     const hobject_t& soid,
@@ -292,7 +295,7 @@ private:
     RPGHandle *h);
   void prep_push_to_replica(
     ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
-    PushOp *pop);
+    PushOp *pop, bool cache_dont_need = true);
   void prep_push(ObjectContextRef obc,
 		 const hobject_t& oid, pg_shard_t dest,
 		 PushOp *op);
@@ -300,13 +303,14 @@ private:
 		 const hobject_t& soid, pg_shard_t peer,
 		 eversion_t version,
 		 interval_set<uint64_t> &data_subset,
-		 map<hobject_t, interval_set<uint64_t> >& clone_subsets,
-		 PushOp *op);
+		 map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>& clone_subsets,
+		 PushOp *op,
+                 bool cache = false);
   void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
 			 const pg_missing_t& missing,
 			 const hobject_t &last_backfill,
 			 interval_set<uint64_t>& data_subset,
-			 map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+			 map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>& clone_subsets);
   ObjectRecoveryInfo recalc_subsets(
     const ObjectRecoveryInfo& recovery_info,
     SnapSetContext *ssc
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 59d8efa..7d5946b 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -20,6 +20,7 @@
 #include "ReplicatedPG.h"
 #include "OSD.h"
 #include "OpRequest.h"
+#include "objclass/objclass.h"
 
 #include "common/errno.h"
 #include "common/perf_counters.h"
@@ -111,7 +112,7 @@ void ReplicatedPG::OpContext::start_async_reads(ReplicatedPG *pg)
   pg->pgbackend->objects_read_async(
     obc->obs.oi.soid,
     pending_async_reads,
-    new OnReadComplete(pg, this));
+    new OnReadComplete(pg, this), pg->get_pool().fast_read);
   pending_async_reads.clear();
 }
 void ReplicatedPG::OpContext::finish_read(ReplicatedPG *pg)
@@ -175,15 +176,6 @@ public:
 // ======================
 // PGBackend::Listener
 
-
-void ReplicatedPG::on_local_recover_start(
-  const hobject_t &oid,
-  ObjectStore::Transaction *t)
-{
-  pg_log.revise_have(oid, eversion_t());
-  remove_snap_mapped_object(*t, oid);
-}
-
 void ReplicatedPG::on_local_recover(
   const hobject_t &hoid,
   const object_stat_sum_t &stat_diff,
@@ -193,7 +185,9 @@ void ReplicatedPG::on_local_recover(
   )
 {
   dout(10) << __func__ << ": " << hoid << dendl;
+
   ObjectRecoveryInfo recovery_info(_recovery_info);
+  clear_object_snap_mapping(t, hoid);
   if (recovery_info.soid.snap < CEPH_NOSNAP) {
     assert(recovery_info.oi.snaps.size());
     OSDriver::OSTransaction _t(osdriver.get_transaction(t));
@@ -220,7 +214,8 @@ void ReplicatedPG::on_local_recover(
       recovery_info.oi.version = latest->version;
       bufferlist bl;
       ::encode(recovery_info.oi, bl);
-      t->setattr(coll, recovery_info.soid, OI_ATTR, bl);
+      assert(!pool.info.require_rollback());
+      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
       if (obc)
 	obc->attr_cache[OI_ATTR] = bl;
     }
@@ -286,7 +281,7 @@ void ReplicatedPG::on_global_recover(
   missing_loc.recovered(soid);
   publish_stats_to_osd();
   dout(10) << "pushed " << soid << " to all replicas" << dendl;
-  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
+  map<hobject_t, ObjectContextRef, hobject_t::BitwiseComparator>::iterator i = recovering.find(soid);
   assert(i != recovering.end());
 
   // recover missing won't have had an obc, but it gets filled in
@@ -377,23 +372,22 @@ bool ReplicatedPG::is_missing_object(const hobject_t& soid) const
   return pg_log.get_missing().missing.count(soid);
 }
 
-void ReplicatedPG::wait_for_unreadable_object(
-  const hobject_t& soid, OpRequestRef op)
+void ReplicatedPG::maybe_kick_recovery(
+  const hobject_t &soid)
 {
-  assert(is_unreadable_object(soid));
-
   eversion_t v;
-  bool needs_recovery = missing_loc.needs_recovery(soid, &v);
-  assert(needs_recovery);
+  if (!missing_loc.needs_recovery(soid, &v))
+    return;
 
-  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
+  map<hobject_t, ObjectContextRef, hobject_t::BitwiseComparator>::const_iterator p = recovering.find(soid);
   if (p != recovering.end()) {
-    dout(7) << "missing " << soid << " v " << v << ", already recovering." << dendl;
+    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
   } else if (missing_loc.is_unfound(soid)) {
-    dout(7) << "missing " << soid << " v " << v << ", is unfound." << dendl;
+    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
   } else {
-    dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl;
+    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+    h->cache_dont_need = false;
     if (is_missing_object(soid)) {
       recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
     } else {
@@ -401,6 +395,14 @@ void ReplicatedPG::wait_for_unreadable_object(
     }
     pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
   }
+}
+
+void ReplicatedPG::wait_for_unreadable_object(
+  const hobject_t& soid, OpRequestRef op)
+{
+  assert(is_unreadable_object(soid));
+
+  maybe_kick_recovery(soid);
   waiting_for_unreadable_object[soid].push_back(op);
   op->mark_delayed("waiting for missing object");
 }
@@ -413,6 +415,12 @@ void ReplicatedPG::wait_for_all_missing(OpRequestRef op)
 
 bool ReplicatedPG::is_degraded_or_backfilling_object(const hobject_t& soid)
 {
+  /* The conditions below may clear (on_local_recover, before we queue
+   * the tranasction) before we actually requeue the degraded waiters
+   * in on_global_recover after the transaction completes.
+   */
+  if (waiting_for_degraded_object.count(soid))
+    return true;
   if (pg_log.get_missing().missing.count(soid))
     return true;
   assert(!actingbackfill.empty());
@@ -428,8 +436,8 @@ bool ReplicatedPG::is_degraded_or_backfilling_object(const hobject_t& soid)
     // Object is degraded if after last_backfill AND
     // we are backfilling it
     if (is_backfill_targets(peer) &&
-	peer_info[peer].last_backfill <= soid &&
-	last_backfill_started >= soid &&
+	cmp(peer_info[peer].last_backfill, soid, get_sort_bitwise()) <= 0 &&
+	cmp(last_backfill_started, soid, get_sort_bitwise()) >= 0 &&
 	backfills_in_flight.count(soid))
       return true;
   }
@@ -440,43 +448,44 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
 {
   assert(is_degraded_or_backfilling_object(soid));
 
-  // we don't have it (yet).
-  if (recovering.count(soid)) {
-    dout(7) << "degraded "
-	    << soid 
-	    << ", already recovering"
-	    << dendl;
-  } else if (missing_loc.is_unfound(soid)) {
-    dout(7) << "degraded "
-	    << soid
-	    << ", still unfound, waiting"
-	    << dendl;
-  } else {
-    dout(7) << "degraded " 
-	    << soid 
-	    << ", recovering"
-	    << dendl;
-    eversion_t v;
-    assert(!actingbackfill.empty());
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-	 i != actingbackfill.end();
-	 ++i) {
-      if (*i == get_primary()) continue;
-      pg_shard_t peer = *i;
-      if (peer_missing.count(peer) &&
-	  peer_missing[peer].missing.count(soid)) {
-	v = peer_missing[peer].missing[soid].need;
-	break;
-      }
-    }
-    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
-    prep_object_replica_pushes(soid, v, h);
-    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
-  }
+  maybe_kick_recovery(soid);
   waiting_for_degraded_object[soid].push_back(op);
   op->mark_delayed("waiting for degraded object");
 }
 
+void ReplicatedPG::block_write_on_full_cache(
+  const hobject_t& _oid, OpRequestRef op)
+{
+  const hobject_t oid = _oid.get_head();
+  dout(20) << __func__ << ": blocking object " << oid
+	   << " on full cache" << dendl;
+  objects_blocked_on_cache_full.insert(oid);
+  waiting_for_cache_not_full.push_back(op);
+}
+
+void ReplicatedPG::block_write_on_snap_rollback(
+  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
+{
+  dout(20) << __func__ << ": blocking object " << oid.get_head()
+	   << " on snap promotion " << obc->obs.oi.soid << dendl;
+  // otherwise, we'd have blocked in do_op
+  assert(oid.is_head());
+  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
+  objects_blocked_on_snap_promotion[oid] = obc;
+  wait_for_blocked_object(obc->obs.oi.soid, op);
+}
+
+void ReplicatedPG::block_write_on_degraded_snap(
+  const hobject_t& snap, OpRequestRef op)
+{
+  dout(20) << __func__ << ": blocking object " << snap.get_head()
+	   << " on degraded snap " << snap << dendl;
+  // otherwise, we'd have blocked in do_op
+  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
+  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
+  wait_for_degraded_object(snap, op);
+}
+
 bool ReplicatedPG::maybe_await_blocked_snapset(
   const hobject_t &hoid,
   OpRequestRef op)
@@ -510,7 +519,49 @@ void ReplicatedPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef o
   op->mark_delayed("waiting for blocked object");
 }
 
-bool PGLSParentFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
+class PGLSPlainFilter : public PGLSFilter {
+  string val;
+public:
+  virtual int init(bufferlist::iterator &params)
+  {
+    try {
+      ::decode(xattr, params);
+      ::decode(val, params);
+    } catch (buffer::error &e) {
+      return -EINVAL;
+    }
+
+    return 0;
+  }
+  virtual ~PGLSPlainFilter() {}
+  virtual bool filter(const hobject_t &obj, bufferlist& xattr_data,
+                      bufferlist& outdata);
+};
+
+class PGLSParentFilter : public PGLSFilter {
+  inodeno_t parent_ino;
+public:
+  PGLSParentFilter() {
+    xattr = "_parent";
+  }
+  virtual int init(bufferlist::iterator &params)
+  {
+    try {
+      ::decode(parent_ino, params);
+    } catch (buffer::error &e) {
+      return -EINVAL;
+    }
+    generic_dout(0) << "parent_ino=" << parent_ino << dendl;
+
+    return 0;
+  }
+  virtual ~PGLSParentFilter() {}
+  virtual bool filter(const hobject_t &obj, bufferlist& xattr_data,
+                      bufferlist& outdata);
+};
+
+bool PGLSParentFilter::filter(const hobject_t &obj,
+                              bufferlist& xattr_data, bufferlist& outdata)
 {
   bufferlist::iterator iter = xattr_data.begin();
   inode_backtrace_t bt;
@@ -531,7 +582,8 @@ bool PGLSParentFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
   return false;
 }
 
-bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
+bool PGLSPlainFilter::filter(const hobject_t &obj,
+                             bufferlist& xattr_data, bufferlist& outdata)
 {
   if (val.size() != xattr_data.length())
     return false;
@@ -545,15 +597,22 @@ bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
 bool ReplicatedPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
 {
   bufferlist bl;
-  int ret = pgbackend->objects_get_attr(
-    sobj,
-    filter->get_xattr(),
-    &bl);
-  dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
-  if (ret < 0)
-    return false;
 
-  return filter->filter(bl, outdata);
+  // If filter has expressed an interest in an xattr, load it.
+  if (!filter->get_xattr().empty()) {
+    int ret = pgbackend->objects_get_attr(
+      sobj,
+      filter->get_xattr(),
+      &bl);
+    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
+    if (ret < 0) {
+      if (ret != -ENODATA || filter->reject_empty_xattr()) {
+        return false;
+      }
+    }
+  }
+
+  return filter->filter(sobj, bl, outdata);
 }
 
 int ReplicatedPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
@@ -569,16 +628,55 @@ int ReplicatedPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilt
   }
 
   if (type.compare("parent") == 0) {
-    filter = new PGLSParentFilter(iter);
+    filter = new PGLSParentFilter();
   } else if (type.compare("plain") == 0) {
-    filter = new PGLSPlainFilter(iter);
+    filter = new PGLSPlainFilter();
   } else {
-    return -EINVAL;
-  }
+    std::size_t dot = type.find(".");
+    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
+      return -EINVAL;
+    }
 
-  *pfilter = filter;
+    const std::string class_name = type.substr(0, dot);
+    const std::string filter_name = type.substr(dot + 1);
+    ClassHandler::ClassData *cls = NULL;
+    int r = osd->class_handler->open_class(class_name, &cls);
+    if (r != 0) {
+      derr << "Error opening class '" << class_name << "': "
+           << cpp_strerror(r) << dendl;
+      return -EINVAL;
+    } else {
+      assert(cls);
+    }
+
+    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
+    if (class_filter == NULL) {
+      derr << "Error finding filter '" << filter_name << "' in class "
+           << class_name << dendl;
+      return -EINVAL;
+    }
+    filter = class_filter->fn();
+    if (!filter) {
+      // Object classes are obliged to return us something, but let's
+      // give an error rather than asserting out.
+      derr << "Buggy class " << class_name << " failed to construct "
+              "filter " << filter_name << dendl;
+      return -EINVAL;
+    }
+  }
 
-  return  0;
+  assert(filter);
+  int r = filter->init(iter);
+  if (r < 0) {
+    derr << "Error initializing filter " << type << ": "
+         << cpp_strerror(r) << dendl;
+    delete filter;
+    return -EINVAL;
+  } else {
+    // Successfully constructed and initialized, return it.
+    *pfilter = filter;
+    return  0;
+  }
 }
 
 
@@ -717,7 +815,7 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
     }
     f->dump_int("num_missing", missing.num_missing());
     f->dump_int("num_unfound", get_num_unfound());
-    map<hobject_t,pg_missing_t::item>::const_iterator p = missing.missing.upper_bound(offset);
+    map<hobject_t,pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator p = missing.missing.upper_bound(offset);
     {
       f->open_array_section("objects");
       int32_t num = 0;
@@ -805,6 +903,10 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	result = -EINVAL;
 	break;
       }
+      if (filter) {
+	delete filter;
+	filter = NULL;
+      }
       result = get_pgls_filter(bp, &filter);
       if (result < 0)
         break;
@@ -840,7 +942,6 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	  current,
 	  list_size,
 	  list_size,
-	  snapid,
 	  &sentries,
 	  &next);
 	if (r != 0) {
@@ -849,7 +950,11 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	}
 
 	assert(snapid == CEPH_NOSNAP || pg_log.get_missing().missing.empty());
-	map<hobject_t, pg_missing_t::item>::const_iterator missing_iter =
+
+	// ensure sort order is correct
+	pg_log.resort_missing(get_sort_bitwise());
+
+	map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator missing_iter =
 	  pg_log.get_missing().missing.lower_bound(current);
 	vector<hobject_t>::iterator ls_iter = sentries.begin();
 	hobject_t _max = hobject_t::get_max();
@@ -870,7 +975,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	      ++ls_iter;
 	      ++missing_iter;
 	    }
-	  } else if (mcand < lcand) {
+	  } else if (cmp(mcand, lcand, get_sort_bitwise()) < 0) {
 	    candidate = mcand;
 	    assert(!mcand.is_max());
 	    ++missing_iter;
@@ -880,7 +985,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	    ++ls_iter;
 	  }
 
-	  if (candidate >= next) {
+	  if (cmp(candidate, next, get_sort_bitwise()) >= 0) {
 	    break;
 	  }
 
@@ -961,6 +1066,10 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	result = -EINVAL;
 	break;
       }
+      if (filter) {
+	delete filter;
+	filter = NULL;
+      }
       result = get_pgls_filter(bp, &filter);
       if (result < 0)
         break;
@@ -996,7 +1105,6 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	  current,
 	  list_size,
 	  list_size,
-	  snapid,
 	  &sentries,
 	  &next);
 	if (r != 0) {
@@ -1005,7 +1113,11 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	}
 
 	assert(snapid == CEPH_NOSNAP || pg_log.get_missing().missing.empty());
-	map<hobject_t, pg_missing_t::item>::const_iterator missing_iter =
+
+	// ensure sort order is correct
+	pg_log.resort_missing(get_sort_bitwise());
+
+	map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator missing_iter =
 	  pg_log.get_missing().missing.lower_bound(current);
 	vector<hobject_t>::iterator ls_iter = sentries.begin();
 	hobject_t _max = hobject_t::get_max();
@@ -1026,7 +1138,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	      ++ls_iter;
 	      ++missing_iter;
 	    }
-	  } else if (mcand < lcand) {
+	  } else if (cmp(mcand, lcand, get_sort_bitwise()) < 0) {
 	    candidate = mcand;
 	    assert(!mcand.is_max());
 	    ++missing_iter;
@@ -1036,7 +1148,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	    ++ls_iter;
 	  }
 
-	  if (candidate >= next) {
+	  if (cmp(candidate, next, get_sort_bitwise()) >= 0) {
 	    break;
 	  }
 	    
@@ -1106,9 +1218,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	     p != info.hit_set.history.end();
 	     ++p)
 	  ls.push_back(make_pair(p->begin, p->end));
-	if (info.hit_set.current_info.begin)
-	  ls.push_back(make_pair(info.hit_set.current_info.begin, utime_t()));
-	else if (hit_set)
+	if (hit_set)
 	  ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
 	::encode(ls, osd_op.outdata);
       }
@@ -1117,9 +1227,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
     case CEPH_OSD_OP_PG_HITSET_GET:
       {
 	utime_t stamp(osd_op.op.hit_set_get.stamp);
-	if ((info.hit_set.current_info.begin &&
-	     stamp >= info.hit_set.current_info.begin) ||
-	    stamp >= hit_set_start_stamp) {
+	if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
 	  // read the current in-memory HitSet, not the version we've
 	  // checkpointed.
 	  if (!hit_set) {
@@ -1135,7 +1243,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	       p != info.hit_set.history.end();
 	       ++p) {
 	    if (stamp >= p->begin && stamp <= p->end) {
-	      oid = get_hit_set_archive_object(p->begin, p->end);
+	      oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 	      break;
 	    }
 	  }
@@ -1150,10 +1258,9 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
 	  }
 	  if (is_unreadable_object(oid)) {
 	    wait_for_unreadable_object(oid, op);
-	    delete filter;
 	    return;
 	  }
-	  result = osd->store->read(coll, oid, 0, 0, osd_op.outdata);
+	  result = osd->store->read(coll, ghobject_t(oid), 0, 0, osd_op.outdata);
 	}
       }
       break;
@@ -1218,9 +1325,11 @@ ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap,
   PG(o, curmap, _pool, p),
   pgbackend(
     PGBackend::build_pg_backend(
-      _pool.info, curmap, this, coll_t(p), coll_t::make_temp_coll(p), o->store, cct)),
+      _pool.info, curmap, this, coll_t(p), o->store, cct)),
   object_contexts(o->cct, g_conf->osd_pg_object_context_cache_count),
   snapset_contexts_lock("ReplicatedPG::snapset_contexts"),
+  backfills_in_flight(hobject_t::Comparator(true)),
+  pending_backfill_updates(hobject_t::Comparator(true)),
   new_backfill(false),
   temp_seq(0),
   snap_trimmer_machine(this)
@@ -1255,6 +1364,7 @@ void ReplicatedPG::do_request(
 	     << " flushes_in_progress pending "
 	     << "waiting for active on " << op << dendl;
     waiting_for_peered.push_back(op);
+    op->mark_delayed("waiting for peered");
     return;
   }
 
@@ -1266,6 +1376,7 @@ void ReplicatedPG::do_request(
       return;
     } else {
       waiting_for_peered.push_back(op);
+      op->mark_delayed("waiting for peered");
       return;
     }
   }
@@ -1279,6 +1390,7 @@ void ReplicatedPG::do_request(
     if (!is_active()) {
       dout(20) << " peered, not active, waiting for active on " << op << dendl;
       waiting_for_active.push_back(op);
+      op->mark_delayed("waiting for active");
       return;
     }
     if (is_replay()) {
@@ -1312,6 +1424,10 @@ void ReplicatedPG::do_request(
     do_backfill(op);
     break;
 
+  case MSG_OSD_REP_SCRUB:
+    replica_scrub(op, handle);
+    break;
+
   default:
     assert(0 == "bad message type in do_request");
   }
@@ -1326,7 +1442,7 @@ hobject_t ReplicatedPG::earliest_backfill() const
     pg_shard_t bt = *i;
     map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
     assert(iter != peer_info.end());
-    if (iter->second.last_backfill < e)
+    if (cmp(iter->second.last_backfill, e, get_sort_bitwise()) < 0)
       e = iter->second.last_backfill;
   }
   return e;
@@ -1350,8 +1466,15 @@ bool ReplicatedPG::check_src_targ(const hobject_t& soid, const hobject_t& toid)
     map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
     assert(iter != peer_info.end());
 
-    if (toid <= MAX(last_backfill_started, iter->second.last_backfill) &&
-	soid > MAX(last_backfill_started, iter->second.last_backfill))
+    hobject_t max;
+    if (cmp(last_backfill_started, iter->second.last_backfill,
+	    get_sort_bitwise()) > 0)
+      max = last_backfill_started;
+    else
+      max = iter->second.last_backfill;
+
+    if (cmp(toid, max, get_sort_bitwise()) <= 0 &&
+	cmp(soid, max, get_sort_bitwise()) > 0)
       return true;
   }
   return false;
@@ -1379,11 +1502,30 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     return;
   }
 
+  // discard due to cluster full transition?  (we discard any op that
+  // originates before the cluster or pool is marked full; the client
+  // will resend after the full flag is removed or if they expect the
+  // op to succeed despite being full).  The except is FULL_FORCE ops,
+  // which there is no reason to discard because they bypass all full
+  // checks anyway.
+  // FIXME: we exclude mds writes for now.
+  if (!(m->get_source().is_mds() || m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
+      info.history.last_epoch_marked_full > m->get_map_epoch()) {
+    dout(10) << __func__ << " discarding op sent before full " << m << " "
+	     << *m << dendl;
+    return;
+  }
+  if (!m->get_source().is_mds() && osd->check_failsafe_full()) {
+    dout(10) << __func__ << " fail-safe full check failed, dropping request"
+	     << dendl;
+    return;
+  }
+
   // order this op as a write?
   bool write_ordered =
     op->may_write() ||
     op->may_cache() ||
-    (m->get_flags() & CEPH_OSD_FLAG_RWORDERED);
+    m->has_flag(CEPH_OSD_FLAG_RWORDERED);
 
   dout(10) << "do_op " << *m
 	   << (op->may_write() ? " may_write" : "")
@@ -1398,7 +1540,8 @@ void ReplicatedPG::do_op(OpRequestRef& op)
 		 info.pgid.pool(), m->get_object_locator().nspace);
 
 
-  if (write_ordered && scrubber.write_blocked_by_scrub(head)) {
+  if (write_ordered &&
+      scrubber.write_blocked_by_scrub(head, get_sort_bitwise())) {
     dout(20) << __func__ << ": waiting for scrub" << dendl;
     waiting_for_active.push_back(op);
     op->mark_delayed("waiting for scrub");
@@ -1417,10 +1560,32 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     return;
   }
 
+  // blocked on snap?
+  map<hobject_t, snapid_t>::iterator blocked_iter =
+    objects_blocked_on_degraded_snap.find(head);
+  if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
+    hobject_t to_wait_on(head);
+    to_wait_on.snap = blocked_iter->second;
+    wait_for_degraded_object(to_wait_on, op);
+    return;
+  }
+  map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
+    objects_blocked_on_snap_promotion.find(head);
+  if (write_ordered && 
+      blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
+    wait_for_blocked_object(
+      blocked_snap_promote_iter->second->obs.oi.soid,
+      op);
+    return;
+  }
+  if (write_ordered && objects_blocked_on_cache_full.count(head)) {
+    block_write_on_full_cache(head, op);
+    return;
+  }
+
   // missing snapdir?
-  hobject_t snapdir(m->get_oid(), m->get_object_locator().key,
-		    CEPH_SNAPDIR, m->get_pg().ps(), info.pgid.pool(),
-		    m->get_object_locator().nspace);
+  hobject_t snapdir = head.get_snapdir();
+
   if (is_unreadable_object(snapdir)) {
     wait_for_unreadable_object(snapdir, op);
     return;
@@ -1486,14 +1651,14 @@ void ReplicatedPG::do_op(OpRequestRef& op)
 		m->get_object_locator().nspace);
 
   // io blocked on obc?
-  if (((m->get_flags() & CEPH_OSD_FLAG_FLUSH) == 0) &&
+  if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
       maybe_await_blocked_snapset(oid, op)) {
     return;
   }
 
   int r = find_object_context(
     oid, &obc, can_create,
-    m->get_flags() & CEPH_OSD_FLAG_MAP_SNAP_CLONE,
+    m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
     &missing_oid);
 
   if (r == -EAGAIN) {
@@ -1501,8 +1666,8 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise,
     // we have to wait for the object.
     if (is_primary() ||
-	(!(m->get_flags() & CEPH_OSD_FLAG_BALANCE_READS) &&
-	 !(m->get_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) {
+	(!(m->has_flag(CEPH_OSD_FLAG_BALANCE_READS) &&
+	 !(m->has_flag(CEPH_OSD_FLAG_LOCALIZE_READS))))) {
       // missing the specific snap we need; requeue and wait.
       assert(!op->may_write()); // only happens on a read/cache
       wait_for_unreadable_object(missing_oid, op);
@@ -1529,12 +1694,20 @@ void ReplicatedPG::do_op(OpRequestRef& op)
 
   bool in_hit_set = false;
   if (hit_set) {
-    if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
-      in_hit_set = true;
-    hit_set->insert(oid);
-    if (hit_set->is_full() ||
-	hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
-      hit_set_persist();
+    if (obc.get()) {
+      if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
+	in_hit_set = true;
+    } else {
+      if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
+        in_hit_set = true;
+    }
+    if (!op->hitset_inserted) {
+      hit_set->insert(oid);
+      op->hitset_inserted = true;
+      if (hit_set->is_full() ||
+          hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
+        hit_set_persist();
+      }
     }
   }
 
@@ -1543,11 +1716,27 @@ void ReplicatedPG::do_op(OpRequestRef& op)
       return;
   }
 
-  if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
-      maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false, in_hit_set))
+  if (maybe_handle_cache(op,
+			 write_ordered,
+			 obc,
+			 r,
+			 missing_oid,
+			 false,
+			 in_hit_set))
     return;
 
   if (r && (r != -ENOENT || !obc)) {
+    // copy the reqids for copy get on ENOENT
+    if (r == -ENOENT &&
+	(m->ops[0].op.op == CEPH_OSD_OP_COPY_GET_CLASSIC ||
+	 m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
+      bool classic = false;
+      if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET_CLASSIC) {
+	classic = true;
+      }
+      fill_in_copy_get_noent(op, oid, m->ops[0], classic);
+      return;
+    }
     dout(20) << __func__ << "find_object_context got error " << r << dendl;
     osd->reply_op_error(op, r);
     return;
@@ -1565,7 +1754,7 @@ void ReplicatedPG::do_op(OpRequestRef& op)
 
   // io blocked on obc?
   if (obc->is_blocked() &&
-      (m->get_flags() & CEPH_OSD_FLAG_FLUSH) == 0) {
+      !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
     wait_for_blocked_object(obc->obs.oi.soid, op);
     return;
   }
@@ -1581,7 +1770,7 @@ void ReplicatedPG::do_op(OpRequestRef& op)
   }
 
   // src_oids
-  map<hobject_t,ObjectContextRef> src_obc;
+  map<hobject_t,ObjectContextRef, hobject_t::BitwiseComparator> src_obc;
   for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
     OSDOp& osd_op = *p;
 
@@ -1686,7 +1875,6 @@ void ReplicatedPG::do_op(OpRequestRef& op)
   }
 
   OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops, obc, this);
-  ctx->op_t = pgbackend->get_transaction();
 
   if (!obc->obs.exists)
     ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
@@ -1699,14 +1887,14 @@ void ReplicatedPG::do_op(OpRequestRef& op)
   if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
     ctx->snapset_obc = ObjectContextRef();
 
-  if (m->get_flags() & CEPH_OSD_FLAG_SKIPRWLOCKS) {
+  if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
     dout(20) << __func__ << ": skipping rw locks" << dendl;
   } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
     dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
 
     // verify there is in fact a flush in progress
     // FIXME: we could make this a stronger test.
-    map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
+    map<hobject_t,FlushOpRef, hobject_t::BitwiseComparator>::iterator p = flush_ops.find(obc->obs.oi.soid);
     if (p == flush_ops.end()) {
       dout(10) << __func__ << " no flush in progress, aborting" << dendl;
       reply_ctx(ctx, -EINVAL);
@@ -1726,7 +1914,7 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     return;
   }
 
-  if (m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) {
+  if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
     ctx->ignore_cache = true;
   }
 
@@ -1742,26 +1930,49 @@ void ReplicatedPG::do_op(OpRequestRef& op)
       (!obc->obs.exists ||
        ((m->get_snapid() != CEPH_SNAPDIR) &&
 	obc->obs.oi.is_whiteout()))) {
+    // copy the reqids for copy get on ENOENT
+    if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET_CLASSIC ||
+	m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
+      bool classic = false;
+      if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET_CLASSIC) {
+	classic = true;
+      }
+      fill_in_copy_get_noent(op, oid, m->ops[0], classic);
+      close_op_ctx(ctx, -ENOENT);
+      return;
+    }
     reply_ctx(ctx, -ENOENT);
     return;
   }
 
   op->mark_started();
-  ctx->src_obc = src_obc;
+  ctx->src_obc.swap(src_obc);
 
   execute_ctx(ctx);
 }
 
-bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
-				      bool write_ordered,
-				      ObjectContextRef obc,
-                                      int r, const hobject_t& missing_oid,
-				      bool must_promote,
-				      bool in_hit_set)
-{
+ReplicatedPG::cache_result_t ReplicatedPG::maybe_handle_cache_detail(
+  OpRequestRef op,
+  bool write_ordered,
+  ObjectContextRef obc,
+  int r, hobject_t missing_oid,
+  bool must_promote,
+  bool in_hit_set,
+  ObjectContextRef *promote_obc)
+{
+  if (op &&
+      op->get_req() &&
+      op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
+      (static_cast<MOSDOp *>(op->get_req())->get_flags() &
+       CEPH_OSD_FLAG_IGNORE_CACHE)) {
+    dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
+    return cache_result_t::NOOP;
+  }
   // return quickly if caching is not enabled
   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
-    return false;
+    return cache_result_t::NOOP;
+
+  must_promote = must_promote || op->need_promote();
 
   if (obc)
     dout(25) << __func__ << " " << obc->obs.oi << " "
@@ -1781,120 +1992,136 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
   if (obc.get() && obc->is_blocked() && write_ordered) {
     // we're already doing something with this object
     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
-    return false;
+    return cache_result_t::NOOP;
   }
 
   if (r == -ENOENT && missing_oid == hobject_t()) {
     // we know this object is logically absent (e.g., an undefined clone)
-    return false;
+    return cache_result_t::NOOP;
   }
 
   if (obc.get() && obc->obs.exists) {
-    return false;
+    osd->logger->inc(l_osd_op_cache_hit);
+    return cache_result_t::NOOP;
+  }
+
+  if (missing_oid == hobject_t() && obc.get()) {
+    missing_oid = obc->obs.oi.soid;
   }
 
   MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
   const object_locator_t& oloc = m->get_object_locator();
 
-  if (must_promote || op->need_promote()) {
-    promote_object(obc, missing_oid, oloc, op);
-    return true;
+  if (op->need_skip_handle_cache()) {
+    return cache_result_t::NOOP;
   }
 
   // older versions do not proxy the feature bits.
   bool can_proxy_read = get_osdmap()->get_up_osd_features() &
     CEPH_FEATURE_OSD_PROXY_FEATURES;
+  bool can_proxy_write = get_osdmap()->get_up_osd_features() &
+    CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
   OpRequestRef promote_op;
 
   switch (pool.info.cache_mode) {
   case pg_pool_t::CACHEMODE_WRITEBACK:
     if (agent_state &&
 	agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
-      if (!op->may_write() && !op->may_cache() && !write_ordered) {
+      if (!op->may_write() && !op->may_cache() &&
+	  !write_ordered && !must_promote) {
 	if (can_proxy_read) {
 	  dout(20) << __func__ << " cache pool full, proxying read" << dendl;
 	  do_proxy_read(op);
+	  return cache_result_t::HANDLED_PROXY;
 	} else {
 	  dout(20) << __func__ << " cache pool full, redirect read" << dendl;
 	  do_cache_redirect(op);
+	  return cache_result_t::HANDLED_REDIRECT;
 	}
-	return true;
+	assert(0 == "unreachable");
       }
       dout(20) << __func__ << " cache pool full, waiting" << dendl;
-      waiting_for_cache_not_full.push_back(op);
-      return true;
-    }
-    if (can_skip_promote(op)) {
-      return false;
-    }
-    if (op->may_write() || write_ordered || !hit_set) {
-      promote_object(obc, missing_oid, oloc, op);
-      return true;
+      block_write_on_full_cache(missing_oid, op);
+      return cache_result_t::BLOCKED_FULL;
     }
 
-    if (can_proxy_read)
-      do_proxy_read(op);
-    else
-      promote_op = op;   // for non-proxy case promote_object needs this
-
-    // Avoid duplicate promotion
-    if (obc.get() && obc->is_blocked()) {
-      return true;
-    }
+    if (!hit_set) {
+      promote_object(obc, missing_oid, oloc, op, promote_obc);
+      return cache_result_t::BLOCKED_PROMOTE;
+    } else if (op->may_write() || op->may_cache()) {
+      if (can_proxy_write && !must_promote) {
+        do_proxy_write(op, missing_oid);
+      } else {
+	// promote if can't proxy the write
+	promote_object(obc, missing_oid, oloc, op, promote_obc);
+	return cache_result_t::BLOCKED_PROMOTE;
+      }
 
-    // Promote too?
-    switch (pool.info.min_read_recency_for_promote) {
-    case 0:
-      promote_object(obc, missing_oid, oloc, promote_op);
-      break;
-    case 1:
-      // Check if in the current hit set
-      if (in_hit_set) {
-	promote_object(obc, missing_oid, oloc, promote_op);
-      } else if (!can_proxy_read) {
-	do_cache_redirect(op);
+      // Promote too?
+      if (!op->need_skip_promote()) {
+        maybe_promote(obc, missing_oid, oloc, in_hit_set,
+	              pool.info.min_write_recency_for_promote,
+		      OpRequestRef(),
+		      promote_obc);
       }
-      break;
-    default:
-      if (in_hit_set) {
-	promote_object(obc, missing_oid, oloc, promote_op);
+      return cache_result_t::HANDLED_PROXY;
+    } else {
+      bool did_proxy_read = false;
+      if (can_proxy_read && !must_promote) {
+        do_proxy_read(op);
+	did_proxy_read = true;
       } else {
-	// Check if in other hit sets
-	map<time_t,HitSetRef>::iterator itor;
-	bool in_other_hit_sets = false;
-	for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end(); ++itor) {
-	  if (itor->second->contains(missing_oid)) {
-	    in_other_hit_sets = true;
-	    break;
-	  }
-	}
-	if (in_other_hit_sets) {
-	  promote_object(obc, missing_oid, oloc, promote_op);
-	} else if (!can_proxy_read) {
-	  do_cache_redirect(op);
+        promote_op = op;   // for non-proxy case promote_object needs this
+      }
+
+      // Avoid duplicate promotion
+      if (obc.get() && obc->is_blocked()) {
+	if (!did_proxy_read) {
+	  wait_for_blocked_object(obc->obs.oi.soid, op);
 	}
+	if (promote_obc)
+	  *promote_obc = obc;
+        return cache_result_t::BLOCKED_PROMOTE;
+      }
+
+      // Promote too?
+      bool promoted = false;
+      if (!op->need_skip_promote()) {
+        promoted = maybe_promote(obc, missing_oid, oloc, in_hit_set,
+                                 pool.info.min_read_recency_for_promote,
+                                 promote_op, promote_obc);
+      }
+      if (!promoted && !did_proxy_read) {
+	// redirect the op if it's not proxied and not promoting
+	do_cache_redirect(op);
+	return cache_result_t::HANDLED_REDIRECT;
+      } else if (did_proxy_read) {
+	return cache_result_t::HANDLED_PROXY;
+      } else {
+	assert(promoted);
+	return cache_result_t::BLOCKED_PROMOTE;
       }
-      break;
     }
-    return true;
+    assert(0 == "unreachable");
+    return cache_result_t::NOOP;
 
   case pg_pool_t::CACHEMODE_FORWARD:
     do_cache_redirect(op);
-    return true;
+    return cache_result_t::HANDLED_REDIRECT;
 
   case pg_pool_t::CACHEMODE_READONLY:
     // TODO: clean this case up
     if (!obc.get() && r == -ENOENT) {
       // we don't have the object and op's a read
-      promote_object(obc, missing_oid, oloc, op);
-      return true;
+      promote_object(obc, missing_oid, oloc, op, promote_obc);
+      return cache_result_t::BLOCKED_PROMOTE;
     }
     if (!r) { // it must be a write
       do_cache_redirect(op);
-      return true;
+      return cache_result_t::HANDLED_REDIRECT;
     }
     // crap, there was a failure of some kind
-    return false;
+    return cache_result_t::NOOP;
 
   case pg_pool_t::CACHEMODE_READFORWARD:
     // Do writeback to the cache tier for writes
@@ -1902,19 +2129,16 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
       if (agent_state &&
 	  agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
 	dout(20) << __func__ << " cache pool full, waiting" << dendl;
-	waiting_for_cache_not_full.push_back(op);
-	return true;
+	block_write_on_full_cache(missing_oid, op);
+	return cache_result_t::BLOCKED_FULL;
       }
-      if (can_skip_promote(op)) {
-	return false;
-      }
-      promote_object(obc, missing_oid, oloc, op);
-      return true;
+      promote_object(obc, missing_oid, oloc, op, promote_obc);
+      return cache_result_t::BLOCKED_PROMOTE;
     }
 
     // If it is a read, we can read, we need to forward it
     do_cache_redirect(op);
-    return true;
+    return cache_result_t::HANDLED_REDIRECT;
 
   case pg_pool_t::CACHEMODE_READPROXY:
     // Do writeback to the cache tier for writes
@@ -1922,38 +2146,81 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
       if (agent_state &&
 	  agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
 	dout(20) << __func__ << " cache pool full, waiting" << dendl;
-	waiting_for_cache_not_full.push_back(op);
-	return true;
+	block_write_on_full_cache(missing_oid, op);
+	return cache_result_t::BLOCKED_FULL;
       }
-      if (can_skip_promote(op)) {
-	return false;
-      }
-      promote_object(obc, missing_oid, oloc, op);
-      return true;
+      promote_object(obc, missing_oid, oloc, op, promote_obc);
+      return cache_result_t::BLOCKED_PROMOTE;
     }
 
     // If it is a read, we can read, we need to proxy it
     do_proxy_read(op);
-    return true;
+    return cache_result_t::HANDLED_PROXY;
 
   default:
     assert(0 == "unrecognized cache_mode");
   }
-  return false;
+  return cache_result_t::NOOP;
 }
 
-bool ReplicatedPG::can_skip_promote(OpRequestRef op)
+bool ReplicatedPG::maybe_promote(ObjectContextRef obc,
+				 const hobject_t& missing_oid,
+				 const object_locator_t& oloc,
+				 bool in_hit_set,
+				 uint32_t recency,
+				 OpRequestRef promote_op,
+				 ObjectContextRef *promote_obc)
 {
-  MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
-  if (m->ops.empty())
-    return false;
-  // if we get a delete with FAILOK we can skip promote.  without
-  // FAILOK we still need to promote (or do something smarter) to
-  // determine whether to return ENOENT or 0.
-  if (m->ops[0].op.op == CEPH_OSD_OP_DELETE &&
-      (m->ops[0].op.flags & CEPH_OSD_OP_FLAG_FAILOK))
-    return true;
-  return false;
+  dout(20) << __func__ << " missing_oid " << missing_oid
+	   << "  in_hit_set " << in_hit_set << dendl;
+
+  switch (recency) {
+  case 0:
+    promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
+    break;
+  case 1:
+    // Check if in the current hit set
+    if (in_hit_set) {
+      promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
+    } else {
+      // not promoting
+      return false;
+    }
+    break;
+  default:
+    if (in_hit_set) {
+      promote_object(obc, missing_oid, oloc, promote_op);
+    } else {
+      // Check if in other hit sets
+      map<time_t,HitSetRef>::iterator itor;
+      bool in_other_hit_sets = false;
+      unsigned max_in_memory_read = pool.info.min_read_recency_for_promote > 0 ? pool.info.min_read_recency_for_promote - 1 : 0;
+      unsigned max_in_memory_write = pool.info.min_write_recency_for_promote > 0 ? pool.info.min_write_recency_for_promote - 1 : 0;
+      unsigned max_in_memory = MAX(max_in_memory_read, max_in_memory_write);
+      for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end() && max_in_memory--; ++itor) {
+        if (obc.get()) {
+          if (obc->obs.oi.soid != hobject_t() && itor->second->contains(obc->obs.oi.soid)) {
+            in_other_hit_sets = true;
+            break;
+          }
+        } else {
+          if (missing_oid != hobject_t() && itor->second->contains(missing_oid)) {
+            in_other_hit_sets = true;
+            break;
+          }
+        }
+      }
+      if (in_other_hit_sets) {
+        promote_object(obc, missing_oid, oloc, promote_op);
+      } else {
+	// not promoting
+        return false;
+      }
+    }
+    break;
+  }
+
+  return true;
 }
 
 void ReplicatedPG::do_cache_redirect(OpRequestRef op)
@@ -1976,10 +2243,11 @@ struct C_ProxyRead : public Context {
   epoch_t last_peering_reset;
   ceph_tid_t tid;
   ReplicatedPG::ProxyReadOpRef prdop;
+  utime_t start;
   C_ProxyRead(ReplicatedPG *p, hobject_t o, epoch_t lpr,
 	     const ReplicatedPG::ProxyReadOpRef& prd)
     : pg(p), oid(o), last_peering_reset(lpr),
-      tid(0), prdop(prd)
+      tid(0), prdop(prd), start(ceph_clock_now(NULL))
   {}
   void finish(int r) {
     if (prdop->canceled)
@@ -1991,6 +2259,7 @@ struct C_ProxyRead : public Context {
     }
     if (last_peering_reset == pg->get_last_peering_reset()) {
       pg->finish_proxy_read(oid, tid, r);
+      pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now(NULL) - start);
     }
     pg->unlock();
   }
@@ -2009,6 +2278,16 @@ void ReplicatedPG::do_proxy_read(OpRequestRef op)
 		 m->get_object_locator().get_pool(),
 		 m->get_object_locator().nspace);
   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+
+  // pass through some original flags that make sense.
+  //  - leave out redirection and balancing flags since we are
+  //    already proxying through the primary
+  //  - leave off read/write/exec flags that are derived from the op
+  flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
+			     CEPH_OSD_FLAG_ORDERSNAP |
+			     CEPH_OSD_FLAG_ENFORCE_SNAPC |
+			     CEPH_OSD_FLAG_MAP_SNAP_CLONE);
+
   dout(10) << __func__ << " Start proxy read for " << *m << dendl;
 
   ProxyReadOpRef prdop(new ProxyReadOp(op, soid, m->ops));
@@ -2016,6 +2295,20 @@ void ReplicatedPG::do_proxy_read(OpRequestRef op)
   ObjectOperation obj_op;
   obj_op.dup(prdop->ops);
 
+  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+      (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
+    for (unsigned i = 0; i < obj_op.ops.size(); i++) {
+      ceph_osd_op op = obj_op.ops[i].op;
+      switch (op.op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_SYNC_READ:
+	case CEPH_OSD_OP_SPARSE_READ:
+	  op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
+		       ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+      }
+    }
+  }
+
   C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
 				     prdop);
   ceph_tid_t tid = osd->objecter->read(
@@ -2028,7 +2321,7 @@ void ReplicatedPG::do_proxy_read(OpRequestRef op)
   fin->tid = tid;
   prdop->objecter_tid = tid;
   proxyread_ops[tid] = prdop;
-  in_progress_proxy_reads[soid].push_back(op);
+  in_progress_proxy_ops[soid].push_back(op);
 }
 
 void ReplicatedPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
@@ -2054,9 +2347,9 @@ void ReplicatedPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
   }
   proxyread_ops.erase(tid);
 
-  map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_reads.find(oid);
-  if (q == in_progress_proxy_reads.end()) {
-    dout(10) << __func__ << " no in_progress_proxy_reads found" << dendl;
+  map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator q = in_progress_proxy_ops.find(oid);
+  if (q == in_progress_proxy_ops.end()) {
+    dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
     return;
   }
   assert(q->second.size());
@@ -2067,7 +2360,7 @@ void ReplicatedPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
   OpRequestRef op = *it;
   q->second.erase(it);
   if (q->second.size() == 0) {
-    in_progress_proxy_reads.erase(oid);
+    in_progress_proxy_ops.erase(oid);
   }
 
   osd->logger->inc(l_osd_tier_proxy_read);
@@ -2081,16 +2374,16 @@ void ReplicatedPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
   complete_read_ctx(r, ctx);
 }
 
-void ReplicatedPG::kick_proxy_read_blocked(hobject_t& soid)
+void ReplicatedPG::kick_proxy_ops_blocked(hobject_t& soid)
 {
-  map<hobject_t, list<OpRequestRef> >::iterator p = in_progress_proxy_reads.find(soid);
-  if (p == in_progress_proxy_reads.end())
+  map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator p = in_progress_proxy_ops.find(soid);
+  if (p == in_progress_proxy_ops.end())
     return;
 
   list<OpRequestRef>& ls = p->second;
   dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
   requeue_ops(ls);
-  in_progress_proxy_reads.erase(p);
+  in_progress_proxy_ops.erase(p);
 }
 
 void ReplicatedPG::cancel_proxy_read(ProxyReadOpRef prdop)
@@ -2101,57 +2394,222 @@ void ReplicatedPG::cancel_proxy_read(ProxyReadOpRef prdop)
   // cancel objecter op, if we can
   if (prdop->objecter_tid) {
     osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
+    for (uint32_t i = 0; i < prdop->ops.size(); i++) {
+      prdop->ops[i].outdata.clear();
+    }
     proxyread_ops.erase(prdop->objecter_tid);
     prdop->objecter_tid = 0;
   }
 }
 
-void ReplicatedPG::cancel_proxy_read_ops(bool requeue)
+void ReplicatedPG::cancel_proxy_ops(bool requeue)
 {
   dout(10) << __func__ << dendl;
+
+  // cancel proxy reads
   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
   while (p != proxyread_ops.end()) {
     cancel_proxy_read((p++)->second);
   }
 
+  // cancel proxy writes
+  map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
+  while (q != proxywrite_ops.end()) {
+    cancel_proxy_write((q++)->second);
+  }
+
   if (requeue) {
-    map<hobject_t, list<OpRequestRef> >::iterator p =
-      in_progress_proxy_reads.begin();
-    while (p != in_progress_proxy_reads.end()) {
+    map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator p =
+      in_progress_proxy_ops.begin();
+    while (p != in_progress_proxy_ops.end()) {
       list<OpRequestRef>& ls = p->second;
       dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
 	       << " requests" << dendl;
       requeue_ops(ls);
-      in_progress_proxy_reads.erase(p++);
+      in_progress_proxy_ops.erase(p++);
     }
   } else {
-    in_progress_proxy_reads.clear();
+    in_progress_proxy_ops.clear();
+  }
+}
+
+struct C_ProxyWrite_Commit : public Context {
+  ReplicatedPGRef pg;
+  hobject_t oid;
+  epoch_t last_peering_reset;
+  ceph_tid_t tid;
+  ReplicatedPG::ProxyWriteOpRef pwop;
+  C_ProxyWrite_Commit(ReplicatedPG *p, hobject_t o, epoch_t lpr,
+	              const ReplicatedPG::ProxyWriteOpRef& pw)
+    : pg(p), oid(o), last_peering_reset(lpr),
+      tid(0), pwop(pw)
+  {}
+  void finish(int r) {
+    if (pwop->canceled)
+      return;
+    pg->lock();
+    if (pwop->canceled) {
+      pg->unlock();
+      return;
+    }
+    if (last_peering_reset == pg->get_last_peering_reset()) {
+      pg->finish_proxy_write(oid, tid, r);
+    }
+    pg->unlock();
+  }
+};
+
+void ReplicatedPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid)
+{
+  MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
+  object_locator_t oloc(m->get_object_locator());
+  oloc.pool = pool.info.tier_of;
+  SnapContext snapc(m->get_snap_seq(), m->get_snaps());
+
+  hobject_t soid(m->get_oid(),
+		 m->get_object_locator().key,
+		 missing_oid.snap,
+		 m->get_pg().ps(),
+		 m->get_object_locator().get_pool(),
+		 m->get_object_locator().nspace);
+  unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+  dout(10) << __func__ << " Start proxy write for " << *m << dendl;
+
+  ProxyWriteOpRef pwop(new ProxyWriteOp(op, soid, m->ops, m->get_reqid()));
+  pwop->ctx = new OpContext(op, m->get_reqid(), pwop->ops, this);
+  pwop->mtime = m->get_mtime();
+
+  ObjectOperation obj_op;
+  obj_op.dup(pwop->ops);
+
+  C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
+      this, soid, get_last_peering_reset(), pwop);
+  ceph_tid_t tid = osd->objecter->mutate(soid.oid, oloc, obj_op,
+				         snapc, pwop->mtime,
+					 flags, NULL,
+					 new C_OnFinisher(fin, &osd->objecter_finisher),
+				         &pwop->user_version,
+					 pwop->reqid);
+  fin->tid = tid;
+  pwop->objecter_tid = tid;
+  proxywrite_ops[tid] = pwop;
+  in_progress_proxy_ops[soid].push_back(op);
+}
+
+void ReplicatedPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
+{
+  dout(10) << __func__ << " " << oid << " tid " << tid
+	   << " " << cpp_strerror(r) << dendl;
+
+  map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
+  if (p == proxywrite_ops.end()) {
+    dout(10) << __func__ << " no proxywrite_op found" << dendl;
+    return;
+  }
+  ProxyWriteOpRef pwop = p->second;
+  assert(tid == pwop->objecter_tid);
+  assert(oid == pwop->soid);
+
+  proxywrite_ops.erase(tid);
+
+  map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
+  if (q == in_progress_proxy_ops.end()) {
+    dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
+    delete pwop->ctx;
+    pwop->ctx = NULL;
+    return;
+  }
+  list<OpRequestRef>& in_progress_op = q->second;
+  assert(in_progress_op.size());
+  list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
+                                              in_progress_op.end(),
+					      pwop->op);
+  assert(it != in_progress_op.end());
+  in_progress_op.erase(it);
+  if (in_progress_op.size() == 0) {
+    in_progress_proxy_ops.erase(oid);
+  }
+
+  osd->logger->inc(l_osd_tier_proxy_write);
+
+  MOSDOp *m = static_cast<MOSDOp*>(pwop->op->get_req());
+  assert(m != NULL);
+
+  if (m->wants_ondisk() && !pwop->sent_disk) {
+    // send commit.
+    MOSDOpReply *reply = pwop->ctx->reply;
+    if (reply)
+      pwop->ctx->reply = NULL;
+    else {
+      reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
+      reply->set_reply_versions(eversion_t(), pwop->user_version);
+    }
+    reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+    dout(10) << " sending commit on " << pwop << " " << reply << dendl;
+    osd->send_message_osd_client(reply, m->get_connection());
+    pwop->sent_disk = true;
+    pwop->ctx->op->mark_commit_sent();
+  } else if (m->wants_ack() && !pwop->sent_ack && !pwop->sent_disk) {
+    // send ack
+    MOSDOpReply *reply = pwop->ctx->reply;
+    if (reply)
+      pwop->ctx->reply = NULL;
+    else {
+      reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
+      reply->set_reply_versions(eversion_t(), pwop->user_version);
+    }
+    reply->add_flags(CEPH_OSD_FLAG_ACK);
+    dout(10) << " sending ack on " << pwop << " " << reply << dendl;
+    osd->send_message_osd_client(reply, m->get_connection());
+    pwop->sent_ack = true;
+  }
+
+  delete pwop->ctx;
+  pwop->ctx = NULL;
+}
+
+void ReplicatedPG::cancel_proxy_write(ProxyWriteOpRef pwop)
+{
+  dout(10) << __func__ << " " << pwop->soid << dendl;
+  pwop->canceled = true;
+
+  // cancel objecter op, if we can
+  if (pwop->objecter_tid) {
+    osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
+    delete pwop->ctx;
+    pwop->ctx = NULL;
+    proxywrite_ops.erase(pwop->objecter_tid);
+    pwop->objecter_tid = 0;
   }
 }
 
 class PromoteCallback: public ReplicatedPG::CopyCallback {
   ObjectContextRef obc;
   ReplicatedPG *pg;
+  utime_t start;
 public:
   PromoteCallback(ObjectContextRef obc_, ReplicatedPG *pg_)
     : obc(obc_),
-      pg(pg_) {}
+      pg(pg_),
+      start(ceph_clock_now(NULL)) {}
 
   virtual void finish(ReplicatedPG::CopyCallbackResults results) {
     ReplicatedPG::CopyResults *results_data = results.get<1>();
     int r = results.get<0>();
     pg->finish_promote(r, results_data, obc);
+    pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now(NULL) - start);
   }
 };
 
 void ReplicatedPG::promote_object(ObjectContextRef obc,
 				  const hobject_t& missing_oid,
 				  const object_locator_t& oloc,
-				  OpRequestRef op)
+				  OpRequestRef op,
+				  ObjectContextRef *promote_obc)
 {
   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
   assert(hoid != hobject_t());
-  if (scrubber.write_blocked_by_scrub(hoid)) {
+  if (scrubber.write_blocked_by_scrub(hoid, get_sort_bitwise())) {
     dout(10) << __func__ << " " << hoid
 	     << " blocked by scrub" << dendl;
     if (op) {
@@ -2168,20 +2626,36 @@ void ReplicatedPG::promote_object(ObjectContextRef obc,
     assert(missing_oid != hobject_t());
     obc = get_object_context(missing_oid, true);
   }
+  if (promote_obc)
+    *promote_obc = obc;
+
+  /*
+   * Before promote complete, if there are  proxy-reads for the object,
+   * for this case we don't use DONTNEED.
+   */
+  unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+  map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
+  if (q == in_progress_proxy_ops.end()) {
+    src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+  }
 
   PromoteCallback *cb = new PromoteCallback(obc, this);
   object_locator_t my_oloc = oloc;
   my_oloc.pool = pool.info.tier_of;
-  start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0,
-	     CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
-	     CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
-	     CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
-	     obc->obs.oi.soid.snap == CEPH_NOSNAP);
+
+  unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
+                   CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+                   CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
+                   CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
+  start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
+	     obc->obs.oi.soid.snap == CEPH_NOSNAP,
+	     src_fadvise_flags, 0);
 
   assert(obc->is_blocked());
 
   if (op)
     wait_for_blocked_object(obc->obs.oi.soid, op);
+  info.stats.stats.sum.num_promote++;
 }
 
 void ReplicatedPG::execute_ctx(OpContext *ctx)
@@ -2192,7 +2666,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
   MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
   ObjectContextRef obc = ctx->obc;
   const hobject_t& soid = obc->obs.oi.soid;
-  map<hobject_t,ObjectContextRef>& src_obc = ctx->src_obc;
+  map<hobject_t,ObjectContextRef, hobject_t::BitwiseComparator>& src_obc = ctx->src_obc;
 
   // this method must be idempotent since we may call it several times
   // before we finally apply the resulting transaction.
@@ -2201,7 +2675,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
 
   if (op->may_write() || op->may_cache()) {
     // snap
-    if (!(m->get_flags() & CEPH_OSD_FLAG_ENFORCE_SNAPC) &&
+    if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
 	pool.info.is_pool_snaps_mode()) {
       // use pool's snapc
       ctx->snapc = pool.snapc;
@@ -2210,7 +2684,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
       ctx->snapc.seq = m->get_snap_seq();
       ctx->snapc.snaps = m->get_snaps();
     }
-    if ((m->get_flags() & CEPH_OSD_FLAG_ORDERSNAP) &&
+    if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
 	ctx->snapc.seq < obc->ssc->snapset.seq) {
       dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
 	       << " < snapset seq " << obc->ssc->snapset.seq
@@ -2242,7 +2716,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
     dout(10) << " taking ondisk_read_lock" << dendl;
     obc->ondisk_read_lock();
   }
-  for (map<hobject_t,ObjectContextRef>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
+  for (map<hobject_t,ObjectContextRef, hobject_t::BitwiseComparator>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
     dout(10) << " taking ondisk_read_lock for src " << p->first << dendl;
     p->second->ondisk_read_lock();
   }
@@ -2269,7 +2743,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
     dout(10) << " dropping ondisk_read_lock" << dendl;
     obc->ondisk_read_unlock();
   }
-  for (map<hobject_t,ObjectContextRef>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
+  for (map<hobject_t,ObjectContextRef, hobject_t::BitwiseComparator>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
     dout(10) << " dropping ondisk_read_lock for src " << p->first << dendl;
     p->second->ondisk_read_unlock();
   }
@@ -2285,13 +2759,6 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
     return;
   }
 
-  // check for full
-  if (ctx->delta_stats.num_bytes > 0 &&
-      pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
-    reply_ctx(ctx, -ENOSPC);
-    return;
-  }
-
   bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
   // prepare the reply
   ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
@@ -2543,26 +3010,14 @@ void ReplicatedPG::do_scan(
       // Check that from is in backfill_targets vector
       assert(is_backfill_targets(from));
 
-      BackfillInterval bi;
+      BackfillInterval& bi = peer_backfill_info[from];
       bi.begin = m->begin;
       bi.end = m->end;
       bufferlist::iterator p = m->get_data().begin();
-      ::decode(bi.objects, p);
-
-      // handle hobject_t encoding change
-      if (bi.objects.size() && bi.objects.begin()->first.pool == -1) {
-	map<hobject_t, eversion_t> tmp;
-	tmp.swap(bi.objects);
-	for (map<hobject_t, eversion_t>::iterator i = tmp.begin();
-	     i != tmp.end();
-	     ++i) {
-	  hobject_t first(i->first);
-	  if (!first.is_max() && first.pool == -1)
-	    first.pool = info.pgid.pool();
-	  bi.objects[first] = i->second;
-	}
-      }
-      peer_backfill_info[from] = bi;
+
+      // take care to preserve ordering!
+      bi.clear_objects();
+      ::decode_noclear(bi.objects, p);
 
       if (waiting_on_backfill.erase(from)) {
 	if (waiting_on_backfill.empty()) {
@@ -2611,7 +3066,7 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
     {
       assert(cct->_conf->osd_kill_backfill_at != 2);
 
-      info.last_backfill = m->last_backfill;
+      info.set_last_backfill(m->last_backfill, get_sort_bitwise());
       if (m->compat_stat_sum) {
 	info.stats.stats = m->stats.stats; // Previously, we only sent sum
       } else {
@@ -2707,8 +3162,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
       if (*p == last)
 	break;
     assert(p != snapset.clones.end());
-    object_stat_sum_t delta;
-    delta.num_bytes -= snapset.get_clone_bytes(last);
+    ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
 
     if (p != snapset.clones.begin()) {
       // not the oldest... merge overlap into next older clone
@@ -2718,25 +3172,24 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
       bool adjust_prev_bytes = is_present_clone(prev_coid);
 
       if (adjust_prev_bytes)
-	delta.num_bytes -= snapset.get_clone_bytes(*n);
+	ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
 
       snapset.clone_overlap[*n].intersection_of(
 	snapset.clone_overlap[*p]);
 
       if (adjust_prev_bytes)
-	delta.num_bytes += snapset.get_clone_bytes(*n);
+	ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
     }
-    delta.num_objects--;
+    ctx->delta_stats.num_objects--;
     if (coi.is_dirty())
-      delta.num_objects_dirty--;
+      ctx->delta_stats.num_objects_dirty--;
     if (coi.is_omap())
-      delta.num_objects_omap--;
+      ctx->delta_stats.num_objects_omap--;
     if (coi.is_whiteout()) {
       dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
-      delta.num_whiteouts--;
+      ctx->delta_stats.num_whiteouts--;
     }
-    delta.num_object_clones--;
-    info.stats.stats.add(delta);
+    ctx->delta_stats.num_object_clones--;
     obc->obs.exists = false;
 
     snapset.clones.erase(p);
@@ -2854,13 +3307,15 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
       ctx->snapset_obc->obs.oi.version;
     ctx->snapset_obc->obs.oi.version = ctx->at_version;
 
+    map <string, bufferlist> attrs;
     bl.clear();
     ::encode(snapset, bl);
-    setattr_maybe_cache(ctx->snapset_obc, ctx, t, SS_ATTR, bl);
+    attrs[SS_ATTR].claim(bl);
 
     bl.clear();
     ::encode(ctx->snapset_obc->obs.oi, bl);
-    setattr_maybe_cache(ctx->snapset_obc, ctx, t, OI_ATTR, bl);
+    attrs[OI_ATTR].claim(bl);
+    setattrs_maybe_cache(ctx->snapset_obc, ctx, t, attrs);
 
     if (pool.info.require_rollback()) {
       set<string> changing;
@@ -2875,28 +3330,26 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
   return repop;
 }
 
-void ReplicatedPG::snap_trimmer()
+void ReplicatedPG::snap_trimmer(epoch_t queued)
 {
   if (g_conf->osd_snap_trim_sleep > 0) {
+    unlock();
     utime_t t;
     t.set_from_double(g_conf->osd_snap_trim_sleep);
     t.sleep();
     lock();
     dout(20) << __func__ << " slept for " << t << dendl;
-  } else {
-    lock();
   }
-  if (deleting) {
-    unlock();
+  if (deleting || pg_has_reset_since(queued)) {
     return;
   }
+  snap_trim_queued = false;
   dout(10) << "snap_trimmer entry" << dendl;
   if (is_primary()) {
     entity_inst_t nobody;
     if (scrubber.active) {
       dout(10) << " scrubbing, will requeue snap_trimmer after" << dendl;
       scrubber.queue_snap_trim = true;
-      unlock();
       return;
     }
 
@@ -2913,7 +3366,6 @@ void ReplicatedPG::snap_trimmer()
     // replica collection trimming
     snap_trimmer_machine.process_event(SnapTrim());
   }
-  unlock();
   return;
 }
 
@@ -3267,16 +3719,57 @@ static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t ma
   return 0;
 }
 
-struct FillInExtent : public Context {
+struct FillInVerifyExtent : public Context {
   ceph_le64 *r;
-  FillInExtent(ceph_le64 *r) : r(r) {}
-  void finish(int _r) {
-    if (_r >= 0) {
-      *r = _r;
+  int32_t *rval;
+  bufferlist *outdatap;
+  boost::optional<uint32_t> maybe_crc;
+  uint64_t size;
+  OSDService *osd;
+  hobject_t soid;
+  __le32 flags;
+  FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
+		     boost::optional<uint32_t> mc, uint64_t size,
+		     OSDService *osd, hobject_t soid, __le32 flags) :
+    r(r), rval(rv), outdatap(blp), maybe_crc(mc),
+    size(size), osd(osd), soid(soid), flags(flags) {}
+  void finish(int len) {
+    *rval = len;
+    *r = len;
+    if (len < 0)
+      return;
+    // whole object?  can we verify the checksum?
+    if (maybe_crc && *r == size) {
+      uint32_t crc = outdatap->crc32c(-1);
+      if (maybe_crc != crc) {
+        osd->clog->error() << std::hex << " full-object read crc 0x" << crc
+			   << " != expected 0x" << *maybe_crc
+			   << std::dec << " on " << soid << "\n";
+        if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+	  *rval = -EIO;
+	  *r = 0;
+	}
+      }
     }
   }
 };
 
+struct ToSparseReadResult : public Context {
+  bufferlist& data_bl;
+  ceph_le64& len;
+  ToSparseReadResult(bufferlist& bl, ceph_le64& len):
+    data_bl(bl), len(len) {}
+  void finish(int r) {
+    if (r < 0) return;
+    len = r;
+    bufferlist outdata;
+    map<uint64_t, uint64_t> extents = {{0, r}};
+    ::encode(extents, outdata);
+    ::encode_destructively(data_bl, outdata);
+    data_bl.swap(outdata);
+  }
+};
+
 template<typename V>
 static string list_keys(const map<string, V>& m) {
   string s;
@@ -3313,6 +3806,7 @@ bool ReplicatedPG::maybe_create_new_object(OpContext *ctx)
     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
     ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
     --ctx->delta_stats.num_whiteouts;
+    return true;
   }
   return false;
 }
@@ -3418,6 +3912,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	     (op.extent.offset + op.extent.length > op.extent.truncate_size) )
 	  size = op.extent.truncate_size;
 
+	if (op.extent.length == 0) //length is zero mean read the whole object
+	  op.extent.length = size;
+
 	if (op.extent.offset >= size) {
 	  op.extent.length = 0;
 	  trimmed_read = true;
@@ -3428,15 +3925,27 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
 	// read into a buffer
 	bufferlist bl;
+	bool async = false;
 	if (trimmed_read && op.extent.length == 0) {
 	  // read size was trimmed to zero and it is expected to do nothing
 	  // a read operation of 0 bytes does *not* do nothing, this is why
 	  // the trimmed_read boolean is needed
 	} else if (pool.info.require_rollback()) {
+	  async = true;
+	  boost::optional<uint32_t> maybe_crc;
+	  // If there is a data digest and it is possible we are reading
+	  // entire object, pass the digest.  FillInVerifyExtent will
+	  // will check the oi.size again.
+	  if (oi.is_data_digest() && op.extent.offset == 0 &&
+	      op.extent.length >= oi.size)
+	    maybe_crc = oi.data_digest;
 	  ctx->pending_async_reads.push_back(
 	    make_pair(
 	      boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
-	      make_pair(&osd_op.outdata, new FillInExtent(&op.extent.length))));
+	      make_pair(&osd_op.outdata,
+			new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
+				&osd_op.outdata, maybe_crc, oi.size, osd,
+				soid, op.flags))));
 	  dout(10) << " async_read noted for " << soid << dendl;
 	} else {
 	  int r = pgbackend->objects_read_sync(
@@ -3451,9 +3960,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 		   << " bytes from obj " << soid << dendl;
 
 	  // whole object?  can we verify the checksum?
-	  if (result >= 0 &&
-	      op.extent.offset == 0 && op.extent.length == oi.size &&
-	      oi.is_data_digest()) {
+	  if (op.extent.length == oi.size && oi.is_data_digest()) {
 	    uint32_t crc = osd_op.outdata.crc32c(-1);
 	    if (oi.data_digest != crc) {
 	      osd->clog->error() << info.pgid << std::hex
@@ -3461,8 +3968,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 				 << " != expected 0x" << oi.data_digest
 				 << std::dec << " on " << soid;
 	      // FIXME fall back to replica or something?
-	      if (g_conf->osd_read_eio_on_bad_digest)
-		result = -EIO;
+	      result = -EIO;
 	    }
 	  }
 	}
@@ -3470,9 +3976,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  first_read = false;
 	  ctx->data_off = op.extent.offset;
 	}
+	// XXX the op.extent.length is the requested length for async read
+	// On error this length is changed to 0 after the error comes back.
 	ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
 	ctx->delta_stats.num_rd++;
 
+	// Skip checking the result and just proceed to the next operation
+	if (async)
+	  continue;
+
       }
       break;
 
@@ -3487,7 +3999,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       {
 	// read into a buffer
 	bufferlist bl;
-	int r = osd->store->fiemap(coll, soid, op.extent.offset, op.extent.length, bl);
+	int r = osd->store->fiemap(coll, ghobject_t(soid, ghobject_t::NO_GEN,
+						    info.pgid.shard),
+				   op.extent.offset, op.extent.length, bl);
 	osd_op.outdata.claim(bl);
 	if (r < 0)
 	  result = r;
@@ -3501,21 +4015,27 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
     /* map extents */
     case CEPH_OSD_OP_SPARSE_READ:
       tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
-      if (pool.info.require_rollback()) {
-	result = -EOPNOTSUPP;
+      if (op.extent.truncate_seq) {
+	dout(0) << "sparse_read does not support truncation sequence " << dendl;
+	result = -EINVAL;
 	break;
       }
       ++ctx->num_read;
-      {
-        if (op.extent.truncate_seq) {
-          dout(0) << "sparse_read does not support truncation sequence " << dendl;
-          result = -EINVAL;
-          break;
-        }
+      if (pool.info.ec_pool()) {
+	// translate sparse read to a normal one if not supported
+	ctx->pending_async_reads.push_back(
+	  make_pair(
+	    boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
+	    make_pair(&osd_op.outdata, new ToSparseReadResult(osd_op.outdata,
+							      op.extent.length))));
+	dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
+      } else {
 	// read into a buffer
 	bufferlist bl;
-        int total_read = 0;
-	int r = osd->store->fiemap(coll, soid, op.extent.offset, op.extent.length, bl);
+        uint32_t total_read = 0;
+	int r = osd->store->fiemap(coll, ghobject_t(soid, ghobject_t::NO_GEN,
+						    info.pgid.shard),
+				   op.extent.offset, op.extent.length, bl);
 	if (r < 0)  {
 	  result = r;
           break;
@@ -3571,16 +4091,31 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
           break;
         }
 
-        op.extent.length = total_read;
+	// Why SPARSE_READ need checksum? In fact, librbd always use sparse-read. 
+	// Maybe at first, there is no much whole objects. With continued use, more and more whole object exist.
+	// So from this point, for spare-read add checksum make sense.
+	if (total_read == oi.size && oi.is_data_digest()) {
+	  uint32_t crc = data_bl.crc32c(-1);
+	  if (oi.data_digest != crc) {
+	    osd->clog->error() << info.pgid << std::hex
+	      << " full-object read crc 0x" << crc
+	      << " != expected 0x" << oi.data_digest
+	      << std::dec << " on " << soid;
+	    // FIXME fall back to replica or something?
+	    result = -EIO;
+	    break;
+	  }
+	}
 
-        ::encode(m, osd_op.outdata);
-        ::encode(data_bl, osd_op.outdata);
+        op.extent.length = total_read;
 
-	ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
-	ctx->delta_stats.num_rd++;
+        ::encode(m, osd_op.outdata); // re-encode since it might be modified
+        ::encode_destructively(data_bl, osd_op.outdata);
 
 	dout(10) << " sparse_read got " << total_read << " bytes from object " << soid << dendl;
       }
+      ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
+      ctx->delta_stats.num_rd++;
       break;
 
     case CEPH_OSD_OP_CALL:
@@ -3756,6 +4291,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  result = -EINVAL;
 	  break;
 	}
+	if (!obs.exists) {
+	  result = 0;
+	  break;
+	}
 	if (oi.is_dirty()) {
 	  result = -EBUSY;
 	  break;
@@ -3896,7 +4435,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
     case CEPH_OSD_OP_ASSERT_VER:
       ++ctx->num_read;
       {
-	uint64_t ver = op.watch.ver;
+	uint64_t ver = op.assert_ver.ver;
 	tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
 	if (!ver)
 	  result = -EINVAL;
@@ -3993,6 +4532,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
           resp.clones.push_back(ci);
         }
+	if (result < 0) {
+	  break;
+	}	  
         if (ssc->snapset.head_exists &&
 	    !ctx->obc->obs.oi.is_whiteout()) {
           assert(obs.exists);
@@ -4047,9 +4589,13 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
 	notify_info_t n;
 	n.timeout = timeout;
+	n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
 	n.cookie = op.watch.cookie;
         n.bl = bl;
 	ctx->notifies.push_back(n);
+
+	// return our unique notify id to the client
+	::encode(n.notify_id, osd_op.outdata);
       }
       break;
 
@@ -4083,7 +4629,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       ++ctx->num_write;
       {
 	tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
-        if (!(get_min_peer_features() & CEPH_FEATURE_OSD_SET_ALLOC_HINT)) { 
+        if (!(get_min_upacting_features() & CEPH_FEATURE_OSD_SET_ALLOC_HINT)) { 
           result = -EOPNOTSUPP;
           break;
         }
@@ -4175,27 +4721,31 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  t->append(soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
 	} else {
 	  t->write(soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
-	}
-	write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
-				    op.extent.offset, op.extent.length, true);
+	}
+
 	maybe_create_new_object(ctx);
-	if (op.extent.offset == 0 && op.extent.length == oi.size)
+	if (op.extent.offset == 0 && op.extent.length >= oi.size)
 	  obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+	else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
+	  obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
 	else
 	  obs.oi.clear_data_digest();
+	write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+				    op.extent.offset, op.extent.length, true);
+
       }
       break;
       
     case CEPH_OSD_OP_WRITEFULL:
       ++ctx->num_write;
       { // write full object
-	tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, op.extent.offset, op.extent.length);
+	tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
 
 	if (op.extent.length != osd_op.indata.length()) {
 	  result = -EINVAL;
 	  break;
 	}
-	result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
+	result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
 	if (result < 0)
 	  break;
 
@@ -4211,7 +4761,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	    }
 	  }
 	  ctx->mod_desc.create();
-	  t->append(soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
+	  t->append(soid, 0, op.extent.length, osd_op.indata, op.flags);
 	  if (obs.exists) {
 	    map<string, bufferlist> to_set = ctx->obc->attr_cache;
 	    map<string, boost::optional<bufferlist> > &overlay =
@@ -4230,25 +4780,16 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  }
 	} else {
 	  ctx->mod_desc.mark_unrollbackable();
-	  if (obs.exists) {
-	    t->truncate(soid, 0);
+	  t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
+	  if (obs.exists && op.extent.length < oi.size) {
+	    t->truncate(soid, op.extent.length);
 	  }
-	  t->write(soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
 	}
 	maybe_create_new_object(ctx);
 	obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
 
-	interval_set<uint64_t> ch;
-	if (oi.size > 0)
-	  ch.insert(0, oi.size);
-	ctx->modified_ranges.union_of(ch);
-	if (op.extent.length + op.extent.offset != oi.size) {
-	  ctx->delta_stats.num_bytes -= oi.size;
-	  oi.size = op.extent.length + op.extent.offset;
-	  ctx->delta_stats.num_bytes += oi.size;
-	}
-	ctx->delta_stats.num_wr++;
-	ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(op.extent.length, 10);
+	write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+	    0, op.extent.length, true, op.extent.length != oi.size ? true : false);
       }
       break;
 
@@ -4307,8 +4848,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
           if (result >= 0) {
 	    if (maybe_create_new_object(ctx)) {
               ctx->mod_desc.create();
+	      t->touch(soid);
 	    }
-            t->touch(soid);
           }
 	}
       }
@@ -4531,6 +5072,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	string aname;
 	bp.copy(op.xattr.name_len, aname);
 	tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+	if (!obs.exists || oi.is_whiteout()) {
+	  result = -ENOENT;
+	  break;
+	}
 	string name = "_" + aname;
 	if (pool.info.require_rollback()) {
 	  map<string, boost::optional<bufferlist> > to_set;
@@ -4582,7 +5127,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	result = -EOPNOTSUPP;
 	break;
       }
-      ++ctx->num_read;
       {
 	vector<OSDOp> nops(1);
 	OSDOp& newop = nops[0];
@@ -4600,7 +5144,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	result = -EOPNOTSUPP;
 	break;
       }
-      ++ctx->num_write;
       {
 	//_dout_lock.Lock();
 	//osd_op.data.hexdump(*_dout);
@@ -4687,9 +5230,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
 	set<string> out_set;
 
-	if (!pool.info.require_rollback()) {
+	if (pool.info.supports_omap()) {
 	  ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
-	    coll, soid
+	    coll, ghobject_t(soid)
 	    );
 	  assert(iter);
 	  iter->upper_bound(start_after);
@@ -4724,9 +5267,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
 	map<string, bufferlist> out_set;
 
-	if (!pool.info.require_rollback()) {
+	if (pool.info.supports_omap()) {
 	  ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
-	    coll, soid
+	    coll, ghobject_t(soid)
 	    );
           if (!iter) {
             result = -ENOENT;
@@ -4750,13 +5293,13 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
     case CEPH_OSD_OP_OMAPGETHEADER:
       tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
-      if (pool.info.require_rollback()) {
+      if (!pool.info.supports_omap()) {
 	// return empty header
 	break;
       }
       ++ctx->num_read;
       {
-	osd->store->omap_get_header(coll, soid, &osd_op.outdata);
+	osd->store->omap_get_header(coll, ghobject_t(soid), &osd_op.outdata);
 	ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
 	ctx->delta_stats.num_rd++;
       }
@@ -4776,8 +5319,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	}
 	tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
 	map<string, bufferlist> out;
-	if (!pool.info.require_rollback()) {
-	  osd->store->omap_get_values(coll, soid, keys_to_get, &out);
+	if (pool.info.supports_omap()) {
+	  osd->store->omap_get_values(coll, ghobject_t(soid), keys_to_get, &out);
 	} // else return empty omap entries
 	::encode(out, osd_op.outdata);
 	ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
@@ -4806,13 +5349,14 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	
 	map<string, bufferlist> out;
 
-	if (!pool.info.require_rollback()) {
+	if (pool.info.supports_omap()) {
 	  set<string> to_get;
 	  for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
 	       i != assertions.end();
 	       ++i)
 	    to_get.insert(i->first);
-	  int r = osd->store->omap_get_values(coll, soid, to_get, &out);
+	  int r = osd->store->omap_get_values(coll, ghobject_t(soid),
+					      to_get, &out);
 	  if (r < 0) {
 	    result = r;
 	    break;
@@ -4860,7 +5404,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
       // OMAP Write ops
     case CEPH_OSD_OP_OMAPSETVALS:
-      if (pool.info.require_rollback()) {
+      if (!pool.info.supports_omap()) {
 	result = -EOPNOTSUPP;
 	tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val, "???");
 	break;
@@ -4896,7 +5440,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
     case CEPH_OSD_OP_OMAPSETHEADER:
       tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
-      if (pool.info.require_rollback()) {
+      if (!pool.info.supports_omap()) {
 	result = -EOPNOTSUPP;
 	break;
       }
@@ -4905,8 +5449,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       {
 	if (maybe_create_new_object(ctx)) {
 	  t->touch(soid);
-	} else {
-	  obs.oi.clear_omap_digest();
 	}
 	t->omap_setheader(soid, osd_op.indata);
 	ctx->delta_stats.num_wr++;
@@ -4917,7 +5459,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 
     case CEPH_OSD_OP_OMAPCLEAR:
       tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
-      if (pool.info.require_rollback()) {
+      if (!pool.info.supports_omap()) {
 	result = -EOPNOTSUPP;
 	break;
       }
@@ -4937,7 +5479,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       break;
 
     case CEPH_OSD_OP_OMAPRMKEYS:
-      if (pool.info.require_rollback()) {
+      if (!pool.info.supports_omap()) {
 	result = -EOPNOTSUPP;
 	tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val, "???");
 	break;
@@ -5037,7 +5579,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  ctx->copy_cb = cb;
 	  start_copy(cb, ctx->obc, src, src_oloc, src_version,
 		     op.copy_from.flags,
-		     false);
+		     false,
+		     op.copy_from.src_fadvise_flags,
+		     op.flags);
 	  result = -EINPROGRESS;
 	} else {
 	  // finish
@@ -5216,13 +5760,36 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
     assert(is_missing_object(missing_oid));
     dout(20) << "_rollback_to attempted to roll back to a missing object "
 	     << missing_oid << " (requested snapid: ) " << snapid << dendl;
-    wait_for_unreadable_object(missing_oid, ctx->op);
+    block_write_on_degraded_snap(missing_oid, ctx->op);
     return ret;
   }
-  if (maybe_handle_cache(ctx->op, true, rollback_to, ret, missing_oid, true)) {
-    // promoting the rollback src, presumably
-    return -EAGAIN;
+  {
+    ObjectContextRef promote_obc;
+    switch (
+      maybe_handle_cache_detail(
+	ctx->op,
+	true,
+	rollback_to,
+	ret,
+	missing_oid,
+	true,
+	false,
+	&promote_obc)) {
+    case cache_result_t::NOOP:
+      break;
+    case cache_result_t::BLOCKED_PROMOTE:
+      assert(promote_obc);
+      block_write_on_snap_rollback(soid, promote_obc, ctx->op);
+      return -EAGAIN;
+    case cache_result_t::BLOCKED_FULL:
+      block_write_on_full_cache(soid, ctx->op);
+      return -EAGAIN;
+    default:
+      assert(0 == "must promote was set, other values are not valid");
+      return -EAGAIN;
+    }
   }
+
   if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
     // there's no snapshot here, or there's no object.
     // if there's no snapshot, we delete the object; otherwise, do nothing.
@@ -5243,7 +5810,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
     if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
       dout(20) << "_rollback_to attempted to roll back to a degraded object "
 	       << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
-      wait_for_degraded_object(rollback_to_sobject, ctx->op);
+      block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
       ret = -EAGAIN;
     } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
       // rolling back to the head; we just need to clone it.
@@ -5330,7 +5897,6 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
 {
   const hobject_t& soid = ctx->obs->oi.soid;
   SnapContext& snapc = ctx->snapc;
-  PGBackend::PGTransaction *t = pgbackend->get_transaction();
 
   // clone?
   assert(soid.snap == CEPH_NOSNAP);
@@ -5421,16 +5987,18 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
     snap_oi->prior_version = ctx->obs->oi.version;
     snap_oi->copy_user_bits(ctx->obs->oi);
     snap_oi->snaps = snaps;
-    if (was_dirty)
-      snap_oi->set_flag(object_info_t::FLAG_DIRTY);
+
+    // prepend transaction to op_t
+    PGBackend::PGTransaction *t = pgbackend->get_transaction();
     _make_clone(ctx, t, ctx->clone_obc, soid, coid, snap_oi);
+    t->append(ctx->op_t);
+    delete ctx->op_t;
+    ctx->op_t = t;
     
     ctx->delta_stats.num_objects++;
-    if (snap_oi->is_dirty())
+    if (snap_oi->is_dirty()) {
       ctx->delta_stats.num_objects_dirty++;
-    if (snap_oi->is_whiteout()) {
-      dout(20) << __func__ << " cloning whiteout on " << soid << " to " << coid << dendl;
-      ctx->delta_stats.num_whiteouts++;
+      osd->logger->inc(l_osd_tier_dirty);
     }
     if (snap_oi->is_omap())
       ctx->delta_stats.num_objects_omap++;
@@ -5473,11 +6041,6 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
     }
   }
   
-  // prepend transaction to op_t
-  t->append(ctx->op_t);
-  delete ctx->op_t;
-  ctx->op_t = t;
-
   // update snapset with latest snap context
   ctx->new_snapset.seq = snapc.seq;
   ctx->new_snapset.snaps = snapc.snaps;
@@ -5488,15 +6051,16 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
 
 void ReplicatedPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
 					       interval_set<uint64_t>& modified, uint64_t offset,
-					       uint64_t length, bool count_bytes)
+					       uint64_t length, bool count_bytes, bool force_changesize)
 {
   interval_set<uint64_t> ch;
   if (length)
     ch.insert(offset, length);
   modified.union_of(ch);
-  if (length && (offset + length > oi.size)) {
+  if (force_changesize || offset + length > oi.size) {
     uint64_t new_size = offset + length;
-    delta_stats.num_bytes += new_size - oi.size;
+    delta_stats.num_bytes -= oi.size;
+    delta_stats.num_bytes += new_size;
     oi.size = new_size;
   }
   delta_stats.num_wr++;
@@ -5577,7 +6141,7 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
 	p->bl,
 	p->timeout,
 	p->cookie,
-	osd->get_next_id(get_osdmap()->get_epoch()),
+	p->notify_id,
 	ctx->obc->obs.oi.user_version,
 	osd));
     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
@@ -5610,16 +6174,24 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
   }
 }
 
-coll_t ReplicatedPG::get_temp_coll(ObjectStore::Transaction *t)
+hobject_t ReplicatedPG::generate_temp_object()
 {
-  return pgbackend->get_temp_coll(t);
+  ostringstream ss;
+  ss << "temp_" << info.pgid << "_" << get_role() << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
+  hobject_t hoid = info.pgid.make_temp_object(ss.str());
+  dout(20) << __func__ << " " << hoid << dendl;
+  return hoid;
 }
 
-hobject_t ReplicatedPG::generate_temp_object()
+hobject_t ReplicatedPG::get_temp_recovery_object(eversion_t version, snapid_t snap)
 {
   ostringstream ss;
-  ss << "temp_" << info.pgid << "_" << get_role() << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
-  hobject_t hoid = hobject_t::make_temp(ss.str());
+  ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
+     << "_" << version
+     << "_" << info.history.same_interval_since
+     << "_" << snap;
+  // pgid + version + interval + snapid is unique, and short
+  hobject_t hoid = info.pgid.make_temp_object(ss.str());
   dout(20) << __func__ << " " << hoid << dendl;
   return hoid;
 }
@@ -5647,6 +6219,27 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
     return result;
   }
 
+  // check for full
+  if ((ctx->delta_stats.num_bytes > 0 ||
+       ctx->delta_stats.num_objects > 0) &&  // FIXME: keys?
+      (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
+       get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
+    MOSDOp *m = static_cast<MOSDOp*>(ctx->op->get_req());
+    if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
+	m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
+      dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
+	       << dendl;
+    } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+      // they tried, they failed.
+      dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
+      return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
+    } else {
+      // drop request
+      dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
+      return -EAGAIN;
+    }
+  }
+
   // clone, if necessary
   if (soid.snap == CEPH_NOSNAP)
     make_writeable(ctx);
@@ -5735,11 +6328,13 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
       ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
       ctx->snapset_obc->obs.oi.local_mtime = now;
 
+      map<string, bufferlist> attrs;
       bufferlist bv(sizeof(ctx->new_obs.oi));
       ::encode(ctx->snapset_obc->obs.oi, bv);
       ctx->op_t->touch(snapoid);
-      setattr_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t, OI_ATTR, bv);
-      setattr_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t, SS_ATTR, bss);
+      attrs[OI_ATTR].claim(bv);
+      attrs[SS_ATTR].claim(bss);
+      setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t, attrs);
       if (pool.info.require_rollback()) {
 	map<string, boost::optional<bufferlist> > to_set;
 	to_set[SS_ATTR];
@@ -5779,17 +6374,19 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
       dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
     }
 
+    map <string, bufferlist> attrs;
     bufferlist bv(sizeof(ctx->new_obs.oi));
     ::encode(ctx->new_obs.oi, bv);
-    setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, OI_ATTR, bv);
+    attrs[OI_ATTR].claim(bv);
 
     if (soid.snap == CEPH_NOSNAP) {
       dout(10) << " final snapset " << ctx->new_snapset
 	       << " in " << soid << dendl;
-      setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, SS_ATTR, bss);
+      attrs[SS_ATTR].claim(bss);
     } else {
       dout(10) << " no snapset (this is a clone)" << dendl;
     }
+    setattrs_maybe_cache(ctx->obc, ctx, ctx->op_t, attrs);
 
     if (pool.info.require_rollback()) {
       set<string> changing;
@@ -5811,15 +6408,9 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
 				    ctx->user_at_version, ctx->reqid,
 				    ctx->mtime));
   if (soid.snap < CEPH_NOSNAP) {
-    set<snapid_t> _snaps(ctx->new_obs.oi.snaps.begin(),
-			 ctx->new_obs.oi.snaps.end());
     switch (log_op_type) {
     case pg_log_entry_t::MODIFY:
     case pg_log_entry_t::PROMOTE:
-      dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.snaps
-	       << dendl;
-      ::encode(ctx->new_obs.oi.snaps, ctx->log.back().snaps);
-      break;
     case pg_log_entry_t::CLEAN:
       dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.snaps
 	       << dendl;
@@ -5839,7 +6430,8 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
   // apply new object state.
   ctx->obc->obs = ctx->new_obs;
 
-  if (!maintain_ssc && soid.is_head()) {
+  if (soid.is_head() && !ctx->obc->obs.exists &&
+      (!maintain_ssc || ctx->cache_evict)) {
     ctx->obc->ssc->exists = false;
     ctx->obc->ssc->snapset = SnapSet();
   } else {
@@ -5847,22 +6439,29 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
     ctx->obc->ssc->snapset = ctx->new_snapset;
   }
 
+  apply_ctx_stats(ctx, scrub_ok);
+}
+
+void ReplicatedPG::apply_ctx_stats(OpContext *ctx, bool scrub_ok)
+{
   info.stats.stats.add(ctx->delta_stats);
 
+  const hobject_t& soid = ctx->obs->oi.soid;
   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
        i != backfill_targets.end();
        ++i) {
     pg_shard_t bt = *i;
     pg_info_t& pinfo = peer_info[bt];
-    if (soid <= pinfo.last_backfill)
+    if (cmp(soid, pinfo.last_backfill, get_sort_bitwise()) <= 0)
       pinfo.stats.stats.add(ctx->delta_stats);
-    else if (soid <= last_backfill_started)
+    else if (cmp(soid, last_backfill_started, get_sort_bitwise()) <= 0)
       pending_backfill_updates[soid].stats.add(ctx->delta_stats);
   }
 
   if (!scrub_ok && scrubber.active) {
-    assert(soid < scrubber.start || soid >= scrubber.end);
-    if (soid < scrubber.start)
+    assert(cmp(soid, scrubber.start, get_sort_bitwise()) < 0 ||
+	   cmp(soid, scrubber.end, get_sort_bitwise()) >= 0);
+    if (cmp(soid, scrubber.start, get_sort_bitwise()) < 0)
       scrub_cstat.add(ctx->delta_stats);
   }
 }
@@ -5873,6 +6472,10 @@ void ReplicatedPG::complete_read_ctx(int result, OpContext *ctx)
   assert(ctx->async_reads_complete());
 
   for (vector<OSDOp>::iterator p = ctx->ops.begin(); p != ctx->ops.end(); ++p) {
+    if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+      result = p->rval;
+      break;
+    }
     ctx->bytes_read += p->outdata.length();
   }
   ctx->reply->claim_op_out_data(ctx->ops);
@@ -5972,6 +6575,10 @@ int ReplicatedPG::fill_in_copy_get(
     return result;
   }
 
+  if ((osd_op.op.copy_get.flags & CEPH_OSD_COPY_GET_FLAG_NOTSUPP_OMAP) &&
+      oi.is_omap())
+      return -EOPNOTSUPP;
+
   MOSDOp *op = reinterpret_cast<MOSDOp*>(ctx->op->get_req());
   uint64_t features = op->get_features();
 
@@ -6025,13 +6632,14 @@ int ReplicatedPG::fill_in_copy_get(
   bufferlist& bl = reply_obj.data;
   if (left > 0 && !cursor.data_complete) {
     if (cursor.data_offset < oi.size) {
+      left = MIN(oi.size - cursor.data_offset, (uint64_t)left);
       if (cb) {
 	async_read_started = true;
 	ctx->pending_async_reads.push_back(
 	  make_pair(
-	    boost::make_tuple(cursor.data_offset, left, 0),
+	    boost::make_tuple(cursor.data_offset, left, osd_op.op.flags),
 	    make_pair(&bl, cb)));
-	result = MIN(oi.size - cursor.data_offset, (uint64_t)left);
+        result = left;
 	cb->len = result;
       } else {
 	result = pgbackend->objects_read_sync(
@@ -6052,17 +6660,18 @@ int ReplicatedPG::fill_in_copy_get(
 
   // omap
   uint32_t omap_keys = 0;
-  if (pool.info.require_rollback()) {
+  if (!pool.info.supports_omap()) {
     cursor.omap_complete = true;
   } else {
     if (left > 0 && !cursor.omap_complete) {
       assert(cursor.data_complete);
       if (cursor.omap_offset.empty()) {
-	osd->store->omap_get_header(coll, oi.soid, &reply_obj.omap_header);
+	osd->store->omap_get_header(coll, ghobject_t(oi.soid),
+				    &reply_obj.omap_header);
       }
       bufferlist omap_data;
       ObjectMap::ObjectMapIterator iter =
-	osd->store->get_omap_iterator(coll, oi.soid);
+	osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
       assert(iter);
       iter->upper_bound(cursor.omap_offset);
       for (; iter->valid(); iter->next()) {
@@ -6116,10 +6725,34 @@ int ReplicatedPG::fill_in_copy_get(
   return result;
 }
 
+void ReplicatedPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
+                                          OSDOp& osd_op, bool classic)
+{
+  MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
+  uint64_t features = m->get_features();
+  object_copy_data_t reply_obj;
+
+  pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
+  dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
+  if (classic) {
+    reply_obj.encode_classic(osd_op.outdata);
+  } else {
+    ::encode(reply_obj, osd_op.outdata, features);
+  }
+  osd_op.rval = -ENOENT;
+  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
+  reply->claim_op_out_data(m->ops);
+  reply->set_result(-ENOENT);
+  reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+  osd->send_message_osd_client(reply, m->get_connection());
+}
+
 void ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
 			      hobject_t src, object_locator_t oloc,
 			      version_t version, unsigned flags,
-			      bool mirror_snapset)
+			      bool mirror_snapset,
+			      unsigned src_obj_fadvise_flags,
+			      unsigned dest_obj_fadvise_flags)
 {
   const hobject_t& dest = obc->obs.oi.soid;
   dout(10) << __func__ << " " << dest
@@ -6140,7 +6773,8 @@ void ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
   }
 
   CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, flags,
-			   mirror_snapset));
+			   mirror_snapset, src_obj_fadvise_flags,
+			   dest_obj_fadvise_flags));
   copy_ops[dest] = cop;
   obc->start_block();
 
@@ -6160,6 +6794,8 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
     flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
     flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
+  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
+    flags |= CEPH_OSD_FLAG_RWORDERED;
 
   C_GatherBuilder gather(g_ceph_context);
 
@@ -6182,7 +6818,12 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
     // it already!
     assert(cop->cursor.is_initial());
   }
-  op.copy_get(&cop->cursor, get_copy_chunk_size(),
+
+  uint32_t copyget_flags = 0;
+  if (!pool.info.supports_omap())
+   copyget_flags |= CEPH_OSD_COPY_GET_FLAG_NOTSUPP_OMAP;
+
+  op.copy_get(&cop->cursor, get_copy_chunk_size(), copyget_flags,
 	      &cop->results.object_size, &cop->results.mtime,
 	      &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
 	      &cop->results.snaps, &cop->results.snap_seq,
@@ -6193,6 +6834,7 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
 	      &cop->results.truncate_seq,
 	      &cop->results.truncate_size,
 	      &cop->rval);
+  op.set_last_op_flags(cop->src_obj_fadvise_flags);
 
   C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
 				   get_last_peering_reset(), cop);
@@ -6214,7 +6856,7 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
 {
   dout(10) << __func__ << " " << oid << " tid " << tid
 	   << " " << cpp_strerror(r) << dendl;
-  map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
+  map<hobject_t,CopyOpRef, hobject_t::BitwiseComparator>::iterator p = copy_ops.find(oid);
   if (p == copy_ops.end()) {
     dout(10) << __func__ << " no copy_op found" << dendl;
     return;
@@ -6226,10 +6868,11 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
     return;
   }
 
-  if (cop->omap_data.length())
+  if (cop->omap_data.length() || cop->omap_header.length())
     cop->results.has_omap = true;
 
-  if (r >= 0 && pool.info.require_rollback() && cop->omap_data.length()) {
+  if (r >= 0 && !pool.info.supports_omap() &&
+      (cop->omap_data.length() || cop->omap_header.length())) {
     r = -EOPNOTSUPP;
   }
   cop->objecter_tid = 0;
@@ -6290,14 +6933,16 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
   _build_finish_copy_transaction(cop, cop->results.final_tx);
 
   // verify digests?
-  dout(20) << __func__ << std::hex
-	   << " got digest: rx data 0x" << cop->results.data_digest
-	   << " omap 0x" << cop->results.omap_digest
-	   << ", source: data 0x" << cop->results.source_data_digest
-	   << " omap 0x" <<  cop->results.source_omap_digest
-	   << std::dec
-	   << " flags " << cop->results.flags
-	   << dendl;
+  if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
+    dout(20) << __func__ << std::hex
+      << " got digest: rx data 0x" << cop->results.data_digest
+      << " omap 0x" << cop->results.omap_digest
+      << ", source: data 0x" << cop->results.source_data_digest
+      << " omap 0x" <<  cop->results.source_omap_digest
+      << std::dec
+      << " flags " << cop->results.flags
+      << dendl;
+  }
   if (cop->results.is_data_digest() &&
       cop->results.data_digest != cop->results.source_data_digest) {
     derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
@@ -6341,13 +6986,21 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
   copy_ops.erase(cobc->obs.oi.soid);
   cobc->stop_block();
 
-  // cancel and requeue proxy reads on this object
-  kick_proxy_read_blocked(cobc->obs.oi.soid);
-  for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
-      it != proxyread_ops.end(); ++it) {
-    if (it->second->soid == cobc->obs.oi.soid) {
-      cancel_proxy_read(it->second);
+  // cancel and requeue proxy ops on this object
+  if (!r) {
+    for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
+	it != proxyread_ops.end(); ++it) {
+      if (it->second->soid == cobc->obs.oi.soid) {
+	cancel_proxy_read(it->second);
+      }
     }
+    for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
+	 it != proxywrite_ops.end(); ++it) {
+      if (it->second->soid == cobc->obs.oi.soid) {
+	cancel_proxy_write(it->second);
+      }
+    }
+    kick_proxy_ops_blocked(cobc->obs.oi.soid);
   }
 
   kick_object_context_blocked(cobc);
@@ -6401,10 +7054,10 @@ void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, PGBackend::PGTransaction *t)
       cop->temp_cursor.data_offset,
       cop->data.length(),
       cop->data,
-      0);
+      cop->dest_obj_fadvise_flags);
     cop->data.clear();
   }
-  if (!pool.info.require_rollback()) {
+  if (pool.info.supports_omap()) {
     if (!cop->temp_cursor.omap_complete) {
       if (cop->omap_header.length()) {
 	cop->results.omap_digest =
@@ -6565,9 +7218,10 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
     }
   }
 
-  if (r == -ENOENT && results->started_temp_obj) {
+  if (r < 0 && results->started_temp_obj) {
     dout(10) << __func__ << " abort; will clean up partial work" << dendl;
-    ObjectContextRef tempobc = get_object_context(results->temp_oid, true);
+    ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
+    assert(tempobc);
     RepGather *repop = simple_repop_create(tempobc);
     repop->ctx->op_t->remove(results->temp_oid);
     simple_repop_submit(repop);
@@ -6586,7 +7240,7 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
     OpContext *tctx = repop->ctx;
     tctx->at_version = get_next_version();
     filter_snapc(tctx->new_snapset.snaps);
-    vector<snapid_t> new_clones(tctx->new_snapset.clones.size());
+    vector<snapid_t> new_clones;
     for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
 	 i != tctx->new_snapset.clones.end();
 	 ++i) {
@@ -6622,7 +7276,7 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
     // pass error to everyone blocked on this object
     // FIXME: this is pretty sloppy, but at this point we got
     // something unexpected and don't have many other options.
-    map<hobject_t,list<OpRequestRef> >::iterator blocked_iter =
+    map<hobject_t,list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator blocked_iter =
       waiting_for_blocked_object.find(soid);
     if (blocked_iter != waiting_for_blocked_object.end()) {
       while (!blocked_iter->second.empty()) {
@@ -6747,7 +7401,7 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue)
 void ReplicatedPG::cancel_copy_ops(bool requeue)
 {
   dout(10) << __func__ << dendl;
-  map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
+  map<hobject_t,CopyOpRef, hobject_t::BitwiseComparator>::iterator p = copy_ops.begin();
   while (p != copy_ops.end()) {
     // requeue this op? can I queue up all of them?
     cancel_copy((p++)->second, requeue);
@@ -6789,9 +7443,10 @@ struct C_Flush : public Context {
   hobject_t oid;
   epoch_t last_peering_reset;
   ceph_tid_t tid;
+  utime_t start;
   C_Flush(ReplicatedPG *p, hobject_t o, epoch_t lpr)
     : pg(p), oid(o), last_peering_reset(lpr),
-      tid(0)
+      tid(0), start(ceph_clock_now(NULL))
   {}
   void finish(int r) {
     if (r == -ECANCELED)
@@ -6799,6 +7454,7 @@ struct C_Flush : public Context {
     pg->lock();
     if (last_peering_reset == pg->get_last_peering_reset()) {
       pg->finish_flush(oid, tid, r);
+      pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now(NULL) - start);
     }
     pg->unlock();
   }
@@ -6857,7 +7513,7 @@ int ReplicatedPG::start_flush(
   if (blocking)
     obc->start_block();
 
-  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
+  map<hobject_t,FlushOpRef, hobject_t::BitwiseComparator>::iterator p = flush_ops.find(soid);
   if (p != flush_ops.end()) {
     FlushOpRef fop = p->second;
     if (fop->op == op) {
@@ -6982,7 +7638,12 @@ int ReplicatedPG::start_flush(
 		CEPH_OSD_COPY_FROM_FLAG_FLUSH |
 		CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
 		CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
-		CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE);
+		CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
+		LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+
+    //mean the base tier don't cache data after this
+    if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
+      o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
   }
   C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
 
@@ -6997,6 +7658,8 @@ int ReplicatedPG::start_flush(
   fop->objecter_tid = tid;
 
   flush_ops[soid] = fop;
+  info.stats.stats.sum.num_flush++;
+  info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
   return -EINPROGRESS;
 }
 
@@ -7004,7 +7667,7 @@ void ReplicatedPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
 {
   dout(10) << __func__ << " " << oid << " tid " << tid
 	   << " " << cpp_strerror(r) << dendl;
-  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
+  map<hobject_t,FlushOpRef, hobject_t::BitwiseComparator>::iterator p = flush_ops.find(oid);
   if (p == flush_ops.end()) {
     dout(10) << __func__ << " no flush_op found" << dendl;
     return;
@@ -7076,7 +7739,8 @@ int ReplicatedPG::try_flush_mark_clean(FlushOpRef fop)
     return -EBUSY;
   }
 
-  if (!fop->blocking && scrubber.write_blocked_by_scrub(oid)) {
+  if (!fop->blocking &&
+      scrubber.write_blocked_by_scrub(oid, get_sort_bitwise())) {
     if (fop->op) {
       dout(10) << __func__ << " blocked by scrub" << dendl;
       requeue_op(fop->op);
@@ -7173,7 +7837,7 @@ void ReplicatedPG::cancel_flush(FlushOpRef fop, bool requeue)
 void ReplicatedPG::cancel_flush_ops(bool requeue)
 {
   dout(10) << __func__ << dendl;
-  map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
+  map<hobject_t,FlushOpRef, hobject_t::BitwiseComparator>::iterator p = flush_ops.begin();
   while (p != flush_ops.end()) {
     cancel_flush((p++)->second, requeue);
   }
@@ -7255,16 +7919,20 @@ void ReplicatedPG::op_applied(const eversion_t &applied_version)
   if (is_primary()) {
     if (scrubber.active) {
       if (last_update_applied == scrubber.subset_last_update) {
-        osd->scrub_wq.queue(this);
+        requeue_scrub();
       }
     } else {
       assert(scrubber.start == scrubber.end);
     }
   } else {
     if (scrubber.active_rep_scrub) {
-      if (last_update_applied == scrubber.active_rep_scrub->scrub_to) {
-	osd->rep_scrub_wq.queue(scrubber.active_rep_scrub);
-	scrubber.active_rep_scrub = 0;
+      if (last_update_applied == static_cast<MOSDRepScrub*>(
+	    scrubber.active_rep_scrub->get_req())->scrub_to) {
+	osd->op_wq.queue(
+	  make_pair(
+	    this,
+	    scrubber.active_rep_scrub));
+	scrubber.active_rep_scrub = OpRequestRef();
       }
     }
   }
@@ -7344,7 +8012,7 @@ void ReplicatedPG::eval_repop(RepGather *repop)
 	     waiting_for_ack[repop->v].begin();
 	   i != waiting_for_ack[repop->v].end();
 	   ++i) {
-	MOSDOp *m = (MOSDOp*)i->first->get_req();
+	MOSDOp *m = static_cast<MOSDOp*>(i->first->get_req());
 	MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
 	reply->set_reply_versions(repop->ctx->at_version,
 				  i->second);
@@ -7366,7 +8034,6 @@ void ReplicatedPG::eval_repop(RepGather *repop)
       }
       reply->add_flags(CEPH_OSD_FLAG_ACK);
       dout(10) << " sending ack on " << *repop << " " << reply << dendl;
-      assert(entity_name_t::TYPE_OSD != m->get_connection()->peer_type);
       osd->send_message_osd_client(reply, m->get_connection());
       repop->sent_ack = true;
     }
@@ -7414,7 +8081,7 @@ void ReplicatedPG::issue_repop(RepGather *repop)
   const hobject_t& soid = ctx->obs->oi.soid;
   if (ctx->op &&
     ((static_cast<MOSDOp *>(
-	ctx->op->get_req()))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) {
+	ctx->op->get_req()))->has_flag(CEPH_OSD_FLAG_PARALLELEXEC))) {
     // replicate original op for parallel execution on replica
     assert(0 == "broken implementation, do not use");
   }
@@ -7651,7 +8318,7 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
     return;
   }
 
-  if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+  if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid, get_sort_bitwise())) {
     dout(10) << "handle_watch_timeout waiting for scrub on obj "
 	     << obc->obs.oi.soid
 	     << dendl;
@@ -7695,6 +8362,8 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
     ctx->log.back().mod_desc.mark_unrollbackable();
   }
 
+  // no ctx->delta_stats
+
   // obc ref swallowed by repop!
   simple_repop_submit(repop);
 
@@ -7758,8 +8427,7 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
 	// new object.
 	object_info_t oi(soid);
 	SnapSetContext *ssc = get_snapset_context(
-	  soid, true,
-	  soid.has_snapset() ? attrs : 0);
+	  soid, true, 0);
 	obc = create_object_context(oi, ssc);
 	dout(10) << __func__ << ": " << obc << " " << soid
 		 << " " << obc->rwstate
@@ -7842,21 +8510,42 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
 				      bool map_snapid_to_clone,
 				      hobject_t *pmissing)
 {
-  hobject_t head(oid.oid, oid.get_key(), CEPH_NOSNAP, oid.get_hash(),
-		 info.pgid.pool(), oid.get_namespace());
-  hobject_t snapdir(oid.oid, oid.get_key(), CEPH_SNAPDIR, oid.get_hash(),
-		    info.pgid.pool(), oid.get_namespace());
+  assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
+  // want the head?
+  if (oid.snap == CEPH_NOSNAP) {
+    ObjectContextRef obc = get_object_context(oid, can_create);
+    if (!obc) {
+      if (pmissing)
+        *pmissing = oid;
+      return -ENOENT;
+    }
+    dout(10) << "find_object_context " << oid
+       << " @" << oid.snap
+       << " oi=" << obc->obs.oi
+       << dendl;
+    *pobc = obc;
+
+    if (can_create && !obc->ssc)
+      obc->ssc = get_snapset_context(oid, true);
+
+    return 0;
+  }
+
+  hobject_t head = oid.get_head();
 
   // want the snapdir?
   if (oid.snap == CEPH_SNAPDIR) {
     // return head or snapdir, whichever exists.
-    ObjectContextRef obc = get_object_context(head, can_create);
+    ObjectContextRef headobc = get_object_context(head, can_create);
+    ObjectContextRef obc = headobc;
     if (!obc || !obc->obs.exists)
-      obc = get_object_context(snapdir, can_create);
+      obc = get_object_context(oid, can_create);
     if (!obc || !obc->obs.exists) {
       // if we have neither, we would want to promote the head.
       if (pmissing)
 	*pmissing = head;
+      if (pobc)
+	*pobc = headobc; // may be null
       return -ENOENT;
     }
     dout(10) << "find_object_context " << oid
@@ -7872,26 +8561,6 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
     return 0;
   }
 
-  // want the head?
-  if (oid.snap == CEPH_NOSNAP) {
-    ObjectContextRef obc = get_object_context(head, can_create);
-    if (!obc) {
-      if (pmissing)
-	*pmissing = head;
-      return -ENOENT;
-    }
-    dout(10) << "find_object_context " << oid
-	     << " @" << oid.snap
-	     << " oi=" << obc->obs.oi
-	     << dendl;
-    *pobc = obc;
-
-    if (can_create && !obc->ssc)
-      obc->ssc = get_snapset_context(oid, true);
-
-    return 0;
-  }
-
   // we want a snap
   if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
     dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
@@ -7899,7 +8568,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
   }
 
   SnapSetContext *ssc = get_snapset_context(oid, can_create);
-  if (!ssc || !(ssc->exists)) {
+  if (!ssc || !(ssc->exists || can_create)) {
     dout(20) << __func__ << " " << oid << " no snapset" << dendl;
     if (pmissing)
       *pmissing = head;  // start by getting the head
@@ -8102,22 +8771,28 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t
 void ReplicatedPG::kick_object_context_blocked(ObjectContextRef obc)
 {
   const hobject_t& soid = obc->obs.oi.soid;
-  map<hobject_t, list<OpRequestRef> >::iterator p = waiting_for_blocked_object.find(soid);
-  if (p == waiting_for_blocked_object.end())
-    return;
-
   if (obc->is_blocked()) {
     dout(10) << __func__ << " " << soid << " still blocked" << dendl;
     return;
   }
 
-  list<OpRequestRef>& ls = p->second;
-  dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
-  requeue_ops(ls);
-  waiting_for_blocked_object.erase(p);
+  map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator p = waiting_for_blocked_object.find(soid);
+  if (p != waiting_for_blocked_object.end()) {
+    list<OpRequestRef>& ls = p->second;
+    dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
+    requeue_ops(ls);
+    waiting_for_blocked_object.erase(p);
+  }
+
+  map<hobject_t, ObjectContextRef>::iterator i =
+    objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
+  if (i != objects_blocked_on_snap_promotion.end()) {
+    assert(i->second == obc);
+    objects_blocked_on_snap_promotion.erase(i);
+  }
 
   if (obc->requeue_scrub_on_unblock)
-    osd->queue_for_scrub(this);
+    requeue_scrub();
 }
 
 SnapSetContext *ReplicatedPG::create_snapset_context(const hobject_t& oid)
@@ -8136,12 +8811,11 @@ SnapSetContext *ReplicatedPG::get_snapset_context(
 {
   Mutex::Locker l(snapset_contexts_lock);
   SnapSetContext *ssc;
-  map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
+  map<hobject_t, SnapSetContext*, hobject_t::BitwiseComparator>::iterator p = snapset_contexts.find(
     oid.get_snapdir());
   if (p != snapset_contexts.end()) {
     if (can_create || p->second->exists) {
       ssc = p->second;
-      ssc->exists = true;
     } else {
       return NULL;
     }
@@ -8164,6 +8838,9 @@ SnapSetContext *ReplicatedPG::get_snapset_context(
     if (bv.length()) {
       bufferlist::iterator bvp = bv.begin();
       ssc->snapset.decode(bvp);
+      ssc->exists = true;
+    } else {
+      ssc->exists = false;
     }
   }
   assert(ssc);
@@ -8282,6 +8959,7 @@ void ReplicatedPG::send_remove_op(
   osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
 }
 
+
 void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
 {
   dout(10) << "finish_degraded_object " << oid << dendl;
@@ -8304,6 +8982,11 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
       (*i)->complete(0);
     }
   }
+  map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
+    oid.get_head());
+  if (i != objects_blocked_on_degraded_snap.end() &&
+      i->second == oid.snap)
+    objects_blocked_on_degraded_snap.erase(i);
 }
 
 void ReplicatedPG::_committed_pushed_object(
@@ -8350,7 +9033,7 @@ void ReplicatedPG::_applied_recovered_object(ObjectContextRef obc)
   // requeue an active chunky scrub waiting on recovery ops
   if (!deleting && active_pushes == 0
       && scrubber.is_chunky_scrub_active()) {
-    osd->scrub_wq.queue(this);
+    requeue_scrub();
   }
 
   unlock();
@@ -8366,9 +9049,13 @@ void ReplicatedPG::_applied_recovered_object_replica()
 
   // requeue an active chunky scrub waiting on recovery ops
   if (!deleting && active_pushes == 0 &&
-      scrubber.active_rep_scrub && scrubber.active_rep_scrub->chunky) {
-    osd->rep_scrub_wq.queue(scrubber.active_rep_scrub);
-    scrubber.active_rep_scrub = 0;
+      scrubber.active_rep_scrub && static_cast<MOSDRepScrub*>(
+	scrubber.active_rep_scrub->get_req())->chunky) {
+    osd->op_wq.queue(
+      make_pair(
+	this,
+	scrubber.active_rep_scrub));
+    scrubber.active_rep_scrub = OpRequestRef();
   }
 
   unlock();
@@ -8454,7 +9141,7 @@ ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
 {
   // Wake anyone waiting for this object. Now that it's been marked as lost,
   // we will just return an error code.
-  map<hobject_t, list<OpRequestRef> >::iterator wmo =
+  map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator wmo =
     waiting_for_unreadable_object.find(oid);
   if (wmo != waiting_for_unreadable_object.end()) {
     requeue_ops(wmo->second);
@@ -8475,7 +9162,8 @@ ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
 
   bufferlist b2;
   obc->obs.oi.encode(b2);
-  t->setattr(coll, oid, OI_ATTR, b2);
+  assert(!pool.info.require_rollback());
+  t->setattr(coll, ghobject_t(oid), OI_ATTR, b2);
 
   return obc;
 }
@@ -8505,10 +9193,10 @@ void ReplicatedPG::mark_all_unfound_lost(int what)
   utime_t mtime = ceph_clock_now(cct);
   info.last_update.epoch = get_osdmap()->get_epoch();
   const pg_missing_t &missing = pg_log.get_missing();
-  map<hobject_t, pg_missing_t::item>::const_iterator m =
-    missing_loc.get_all_missing().begin();
-  map<hobject_t, pg_missing_t::item>::const_iterator mend =
-    missing_loc.get_all_missing().end();
+  map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator m =
+    missing_loc.get_needs_recovery().begin();
+  map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator mend =
+    missing_loc.get_needs_recovery().end();
   while (m != mend) {
     const hobject_t &oid(m->first);
     if (!missing_loc.is_unfound(oid)) {
@@ -8710,7 +9398,7 @@ void ReplicatedPG::on_removal(ObjectStore::Transaction *t)
   dout(10) << "on_removal" << dendl;
 
   // adjust info to backfill
-  info.last_backfill = hobject_t();
+  info.set_last_backfill(hobject_t(), true);
   dirty_info = true;
 
 
@@ -8730,8 +9418,6 @@ void ReplicatedPG::on_shutdown()
 
   // remove from queues
   osd->recovery_wq.dequeue(this);
-  osd->scrub_wq.dequeue(this);
-  osd->snap_trim_wq.dequeue(this);
   osd->pg_stat_queue_dequeue(this);
   osd->dequeue_pg(this, 0);
   osd->peering_wq.dequeue(this);
@@ -8739,10 +9425,11 @@ void ReplicatedPG::on_shutdown()
   // handles queue races
   deleting = true;
 
+  clear_scrub_reserved();
   unreg_next_scrub();
   cancel_copy_ops(false);
   cancel_flush_ops(false);
-  cancel_proxy_read_ops(false);
+  cancel_proxy_ops(false);
   apply_and_flush_repops(false);
 
   pgbackend->on_change();
@@ -8807,6 +9494,16 @@ void ReplicatedPG::on_activate()
   agent_setup();
 }
 
+void ReplicatedPG::_on_new_interval()
+{
+  // re-sort obc map?
+  if (object_contexts.get_comparator().bitwise != get_sort_bitwise()) {
+    dout(20) << __func__ << " resorting object_contexts" << dendl;
+    object_contexts.reset_comparator(
+      hobject_t::ComparatorWithDefault(get_sort_bitwise()));
+  }
+}
+
 void ReplicatedPG::on_change(ObjectStore::Transaction *t)
 {
   dout(10) << "on_change" << dendl;
@@ -8829,7 +9526,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
 
   cancel_copy_ops(is_primary());
   cancel_flush_ops(is_primary());
-  cancel_proxy_read_ops(is_primary());
+  cancel_proxy_ops(is_primary());
 
   // requeue object waiters
   if (is_primary()) {
@@ -8837,7 +9534,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
   } else {
     waiting_for_unreadable_object.clear();
   }
-  for (map<hobject_t,list<OpRequestRef> >::iterator p = waiting_for_degraded_object.begin();
+  for (map<hobject_t,list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator p = waiting_for_degraded_object.begin();
        p != waiting_for_degraded_object.end();
        waiting_for_degraded_object.erase(p++)) {
     if (is_primary())
@@ -8846,7 +9543,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
       p->second.clear();
     finish_degraded_object(p->first);
   }
-  for (map<hobject_t,list<OpRequestRef> >::iterator p = waiting_for_blocked_object.begin();
+  for (map<hobject_t,list<OpRequestRef>, hobject_t::BitwiseComparator>::iterator p = waiting_for_blocked_object.begin();
        p != waiting_for_blocked_object.end();
        waiting_for_blocked_object.erase(p++)) {
     if (is_primary())
@@ -8854,7 +9551,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
     else
       p->second.clear();
   }
-  for (map<hobject_t, list<Context*> >::iterator i =
+  for (map<hobject_t, list<Context*>, hobject_t::BitwiseComparator>::iterator i =
 	 callbacks_for_degraded_object.begin();
        i != callbacks_for_degraded_object.end();
     ) {
@@ -8869,6 +9566,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
     waiting_for_cache_not_full.clear();
     waiting_for_all_missing.clear();
   }
+  objects_blocked_on_cache_full.clear();
 
 
   for (list<pair<OpRequestRef, OpContext*> >::iterator i =
@@ -8897,6 +9595,9 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
   // NOTE: we actually assert that all currently live references are dead
   // by the time the flush for the next interval completes.
   object_contexts.clear();
+
+  // should have been cleared above by finishing all of the degraded objects
+  assert(objects_blocked_on_degraded_snap.empty());
 }
 
 void ReplicatedPG::on_role_change()
@@ -8921,6 +9622,7 @@ void ReplicatedPG::on_pool_change()
     dout(10) << __func__ << " requeuing full waiters (not in writeback) "
 	     << dendl;
     requeue_ops(waiting_for_cache_not_full);
+    objects_blocked_on_cache_full.clear();
   }
   hit_set_setup();
   agent_setup();
@@ -8934,14 +9636,14 @@ void ReplicatedPG::_clear_recovery_state()
   recovering_oids.clear();
 #endif
   last_backfill_started = hobject_t();
-  set<hobject_t>::iterator i = backfills_in_flight.begin();
+  set<hobject_t, hobject_t::Comparator>::iterator i = backfills_in_flight.begin();
   while (i != backfills_in_flight.end()) {
     assert(recovering.count(*i));
     backfills_in_flight.erase(i++);
   }
 
   list<OpRequestRef> blocked_ops;
-  for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
+  for (map<hobject_t, ObjectContextRef, hobject_t::BitwiseComparator>::iterator i = recovering.begin();
        i != recovering.end();
        recovering.erase(i++)) {
     if (i->second) {
@@ -8979,6 +9681,7 @@ void ReplicatedPG::cancel_pull(const hobject_t &soid)
   }
   if (is_missing_object(soid))
     pg_log.set_last_requested(0); // get recover_primary to start over
+  finish_degraded_object(soid);
 }
 
 void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
@@ -9035,7 +9738,7 @@ void PG::MissingLoc::check_recovery_sources(const OSDMapRef osdmap)
 	     << missing_loc_sources << dendl;
     
     // filter missing_loc
-    map<hobject_t, set<pg_shard_t> >::iterator p = missing_loc.begin();
+    map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::iterator p = missing_loc.begin();
     while (p != missing_loc.end()) {
       set<pg_shard_t>::iterator q = p->second.begin();
       while (q != p->second.end())
@@ -9054,8 +9757,7 @@ void PG::MissingLoc::check_recovery_sources(const OSDMapRef osdmap)
   
 
 bool ReplicatedPG::start_recovery_ops(
-  int max, RecoveryCtx *prctx,
-  ThreadPool::TPHandle &handle,
+  int max, ThreadPool::TPHandle &handle,
   int *ops_started)
 {
   int& started = *ops_started;
@@ -9241,11 +9943,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
 
     eversion_t need = item.need;
 
-    bool unfound = missing_loc.is_unfound(soid);
-
     dout(10) << "recover_primary "
              << soid << " " << item.need
-	     << (unfound ? " (unfound)":"")
 	     << (missing.is_missing(soid) ? " (missing)":"")
 	     << (missing.is_missing(head) ? " (missing head)":"")
              << (recovering.count(soid) ? " (recovering)":"")
@@ -9279,7 +9978,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
 	      t->register_on_applied(new ObjectStore::C_DeleteTransaction(t));
 	      bufferlist b2;
 	      obc->obs.oi.encode(b2);
-	      t->setattr(coll, soid, OI_ATTR, b2);
+	      assert(!pool.info.require_rollback());
+	      t->setattr(coll, ghobject_t(soid), OI_ATTR, b2);
 
 	      recover_got(soid, latest->version);
 	      missing_loc.add_location(soid, pg_whoami);
@@ -9320,7 +10020,6 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
 	    dout(10) << " will pull " << alternate_need << " or " << need
 		     << " from one of " << missing_loc.get_locations(soid)
 		     << dendl;
-	    unfound = false;
 	  }
 	}
 	break;
@@ -9330,8 +10029,6 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
     if (!recovering.count(soid)) {
       if (recovering.count(head)) {
 	++skipped;
-      } else if (unfound) {
-	++skipped;
       } else {
 	int r = recover_missing(
 	  soid, need, cct->_conf->osd_recovery_op_priority, h);
@@ -9456,7 +10153,7 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
       handle.reset_tp_timeout();
       const hobject_t soid(p->second);
 
-      if (soid > pi->second.last_backfill) {
+      if (cmp(soid, pi->second.last_backfill, get_sort_bitwise()) > 0) {
 	if (!recovering.count(soid)) {
 	  derr << __func__ << ": object added to missing set for backfill, but "
 	       << "is not in recovering, error!" << dendl;
@@ -9493,7 +10190,7 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
       }
 
       dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
-      map<hobject_t,pg_missing_t::item>::const_iterator r = m.missing.find(soid);
+      map<hobject_t,pg_missing_t::item, hobject_t::ComparatorWithDefault>::const_iterator r = m.missing.find(soid);
       started += prep_object_replica_pushes(soid, r->second.need,
 					    h);
     }
@@ -9513,7 +10210,7 @@ hobject_t ReplicatedPG::earliest_peer_backfill() const
     map<pg_shard_t, BackfillInterval>::const_iterator iter =
       peer_backfill_info.find(peer);
     assert(iter != peer_backfill_info.end());
-    if (iter->second.begin < e)
+    if (cmp(iter->second.begin, e, get_sort_bitwise()) < 0)
       e = iter->second.begin;
   }
   return e;
@@ -9573,7 +10270,10 @@ int ReplicatedPG::recover_backfill(
 {
   dout(10) << "recover_backfill (" << max << ")"
            << " bft=" << backfill_targets
-	   << " last_backfill_started " << last_backfill_started << dendl;
+	   << " last_backfill_started " << last_backfill_started
+	   << " sort " << (get_sort_bitwise() ? "bitwise" : "nibblewise")
+	   << (new_backfill ? " new_backfill":"")
+	   << dendl;
   assert(!backfill_targets.empty());
 
   // Initialize from prior backfill state
@@ -9581,13 +10281,35 @@ int ReplicatedPG::recover_backfill(
     // on_activate() was called prior to getting here
     assert(last_backfill_started == earliest_backfill());
     new_backfill = false;
+
+    // initialize BackfillIntervals (with proper sort order)
     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
 	 i != backfill_targets.end();
 	 ++i) {
-      peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
+      peer_backfill_info[*i].reset(peer_info[*i].last_backfill,
+				   get_sort_bitwise());
     }
-    backfill_info.reset(last_backfill_started);
+    backfill_info.reset(last_backfill_started,
+			get_sort_bitwise());
+
+    // initialize comparators
+    backfills_in_flight = set<hobject_t, hobject_t::Comparator>(
+      hobject_t::Comparator(get_sort_bitwise()));
+    pending_backfill_updates = map<hobject_t, pg_stat_t, hobject_t::Comparator>(
+      hobject_t::Comparator(get_sort_bitwise()));
+  }
+
+  // sanity check sort orders
+  assert(backfill_info.sort_bitwise == get_sort_bitwise());
+  for (map<pg_shard_t, BackfillInterval>::iterator i =
+	 peer_backfill_info.begin();
+       i != peer_backfill_info.end();
+       ++i) {
+    assert(i->second.sort_bitwise == get_sort_bitwise());
+    assert(i->second.objects.key_comp().bitwise == get_sort_bitwise());
   }
+  assert(backfills_in_flight.key_comp().bitwise == get_sort_bitwise());
+  assert(pending_backfill_updates.key_comp().bitwise == get_sort_bitwise());
 
   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
        i != backfill_targets.end();
@@ -9608,33 +10330,34 @@ int ReplicatedPG::recover_backfill(
   vector<boost::tuple<hobject_t, eversion_t,
                       ObjectContextRef, vector<pg_shard_t> > > to_push;
   vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
-  set<hobject_t> add_to_stat;
+  set<hobject_t, hobject_t::BitwiseComparator> add_to_stat;
 
   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
        i != backfill_targets.end();
        ++i) {
     peer_backfill_info[*i].trim_to(
-      MAX(peer_info[*i].last_backfill, last_backfill_started));
+      MAX_HOBJ(peer_info[*i].last_backfill, last_backfill_started,
+	       get_sort_bitwise()));
   }
   backfill_info.trim_to(last_backfill_started);
 
-  hobject_t backfill_pos = MIN(backfill_info.begin, earliest_peer_backfill());
+  hobject_t backfill_pos = MIN_HOBJ(backfill_info.begin,
+				    earliest_peer_backfill(),
+				    get_sort_bitwise());
   while (ops < max) {
-    if (backfill_info.begin <= earliest_peer_backfill() &&
+    if (cmp(backfill_info.begin, earliest_peer_backfill(),
+	    get_sort_bitwise()) <= 0 &&
 	!backfill_info.extends_to_end() && backfill_info.empty()) {
       hobject_t next = backfill_info.end;
-      backfill_info.clear();
-      backfill_info.begin = next;
+      backfill_info.reset(next, get_sort_bitwise());
       backfill_info.end = hobject_t::get_max();
       update_range(&backfill_info, handle);
       backfill_info.trim();
     }
-    backfill_pos = MIN(backfill_info.begin, earliest_peer_backfill());
+    backfill_pos = MIN_HOBJ(backfill_info.begin, earliest_peer_backfill(),
+			    get_sort_bitwise());
 
-    dout(20) << "   my backfill interval " << backfill_info.begin << "-" << backfill_info.end
-	     << " " << backfill_info.objects.size() << " objects"
-	     << " " << backfill_info.objects
-	     << dendl;
+    dout(20) << "   my backfill interval " << backfill_info << dendl;
 
     bool sent_scan = false;
     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
@@ -9643,9 +10366,8 @@ int ReplicatedPG::recover_backfill(
       pg_shard_t bt = *i;
       BackfillInterval& pbi = peer_backfill_info[bt];
 
-      dout(20) << " peer shard " << bt << " backfill " << pbi.begin << "-"
-	       << pbi.end << " " << pbi.objects << dendl;
-      if (pbi.begin <= backfill_info.begin &&
+      dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
+      if (cmp(pbi.begin, backfill_info.begin, get_sort_bitwise()) <= 0 &&
 	  !pbi.extends_to_end() && pbi.empty()) {
 	dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
 	epoch_t e = get_osdmap()->get_epoch();
@@ -9662,9 +10384,9 @@ int ReplicatedPG::recover_backfill(
 
     // Count simultaneous scans as a single op and let those complete
     if (sent_scan) {
-        ops++;
-	start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
-        break;
+      ops++;
+      start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
+      break;
     }
 
     if (backfill_info.empty() && all_peer_done()) {
@@ -9676,7 +10398,7 @@ int ReplicatedPG::recover_backfill(
     // the set of targets for which that object applies.
     hobject_t check = earliest_peer_backfill();
 
-    if (check < backfill_info.begin) {
+    if (cmp(check, backfill_info.begin, get_sort_bitwise()) < 0) {
 
       set<pg_shard_t> check_targets;
       for (set<pg_shard_t>::iterator i = backfill_targets.begin();
@@ -9728,7 +10450,8 @@ int ReplicatedPG::recover_backfill(
           // Only include peers that we've caught up to their backfill line
 	  // otherwise, they only appear to be missing this object
 	  // because their pbi.begin > backfill_info.begin.
-          if (backfill_info.begin > pinfo.last_backfill)
+          if (cmp(backfill_info.begin, pinfo.last_backfill,
+		  get_sort_bitwise()) > 0)
 	    missing_targs.push_back(bt);
 	  else
 	    skip_targs.push_back(bt);
@@ -9791,12 +10514,14 @@ int ReplicatedPG::recover_backfill(
       }
     }
   }
-  backfill_pos = MIN(backfill_info.begin, earliest_peer_backfill());
+  backfill_pos = MIN_HOBJ(backfill_info.begin, earliest_peer_backfill(),
+			  get_sort_bitwise());
 
-  for (set<hobject_t>::iterator i = add_to_stat.begin();
+  for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = add_to_stat.begin();
        i != add_to_stat.end();
        ++i) {
     ObjectContextRef obc = get_object_context(*i, false);
+    assert(obc);
     pg_stat_t stat;
     add_object_context_to_pg_stat(obc, &stat);
     pending_backfill_updates[*i] = stat;
@@ -9819,7 +10544,7 @@ int ReplicatedPG::recover_backfill(
   pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
 
   dout(5) << "backfill_pos is " << backfill_pos << dendl;
-  for (set<hobject_t>::iterator i = backfills_in_flight.begin();
+  for (set<hobject_t, hobject_t::Comparator>::iterator i = backfills_in_flight.begin();
        i != backfills_in_flight.end();
        ++i) {
     dout(20) << *i << " is still in flight" << dendl;
@@ -9829,18 +10554,20 @@ int ReplicatedPG::recover_backfill(
     backfill_pos : *(backfills_in_flight.begin());
   hobject_t new_last_backfill = earliest_backfill();
   dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
-  for (map<hobject_t, pg_stat_t>::iterator i = pending_backfill_updates.begin();
+  for (map<hobject_t, pg_stat_t, hobject_t::Comparator>::iterator i =
+	 pending_backfill_updates.begin();
        i != pending_backfill_updates.end() &&
-	 i->first < next_backfill_to_complete;
+	 cmp(i->first, next_backfill_to_complete, get_sort_bitwise()) < 0;
        pending_backfill_updates.erase(i++)) {
-    assert(i->first > new_last_backfill);
+    dout(20) << " pending_backfill_update " << i->first << dendl;
+    assert(cmp(i->first, new_last_backfill, get_sort_bitwise()) > 0);
     for (set<pg_shard_t>::iterator j = backfill_targets.begin();
 	 j != backfill_targets.end();
 	 ++j) {
       pg_shard_t bt = *j;
       pg_info_t& pinfo = peer_info[bt];
       //Add stats to all peers that were missing object
-      if (i->first > pinfo.last_backfill)
+      if (cmp(i->first, pinfo.last_backfill, get_sort_bitwise()) > 0)
         pinfo.stats.add(i->second);
     }
     new_last_backfill = i->first;
@@ -9866,8 +10593,8 @@ int ReplicatedPG::recover_backfill(
     pg_shard_t bt = *i;
     pg_info_t& pinfo = peer_info[bt];
 
-    if (new_last_backfill > pinfo.last_backfill) {
-      pinfo.last_backfill = new_last_backfill;
+    if (cmp(new_last_backfill, pinfo.last_backfill, get_sort_bitwise()) > 0) {
+      pinfo.set_last_backfill(new_last_backfill, get_sort_bitwise());
       epoch_t e = get_osdmap()->get_epoch();
       MOSDPGBackfill *m = NULL;
       if (pinfo.last_backfill.is_max()) {
@@ -9987,7 +10714,8 @@ void ReplicatedPG::update_range(
 	     << dendl;
     for (; i != pg_log.get_log().log.end(); ++i) {
       const hobject_t &soid = i->soid;
-      if (soid >= bi->begin && soid < bi->end) {
+      if (cmp(soid, bi->begin, get_sort_bitwise()) >= 0 &&
+	  cmp(soid, bi->end, get_sort_bitwise()) < 0) {
 	if (i->is_update()) {
 	  dout(10) << __func__ << ": " << i->soid << " updated to version "
 		   << i->version << dendl;
@@ -10014,11 +10742,11 @@ void ReplicatedPG::scan_range(
 {
   assert(is_locked());
   dout(10) << "scan_range from " << bi->begin << dendl;
-  bi->objects.clear();  // for good measure
+  bi->clear_objects();
 
   vector<hobject_t> ls;
   ls.reserve(max);
-  int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, &bi->end);
+  int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
   assert(r >= 0);
   dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
   dout(20) << ls << dendl;
@@ -10065,7 +10793,7 @@ void ReplicatedPG::check_local()
     return;
 
   // just scan the log.
-  set<hobject_t> did;
+  set<hobject_t, hobject_t::BitwiseComparator> did;
   for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
        p != pg_log.get_log().log.rend();
        ++p) {
@@ -10108,10 +10836,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
   return hoid;
 }
 
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+						   utime_t end,
+						   bool using_gmt)
 {
   ostringstream ss;
-  ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+  ss << "hit_set_" << info.pgid.pgid << "_archive_";
+  if (using_gmt) {
+    start.gmtime(ss) << "_";
+    end.gmtime(ss);
+  } else {
+    start.localtime(ss) << "_";
+    end.localtime(ss);
+  }
   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
 		 info.pgid.ps(), info.pgid.pool(),
 		 cct->_conf->osd_hit_set_namespace);
@@ -10130,12 +10867,19 @@ void ReplicatedPG::hit_set_clear()
 void ReplicatedPG::hit_set_setup()
 {
   if (!is_active() ||
-      !is_primary() ||
-      !pool.info.hit_set_count ||
-      !pool.info.hit_set_period ||
-      pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
+      !is_primary()) {
+    hit_set_clear();
+    return;
+  }
+
+  if (is_active() && is_primary() &&
+      (!pool.info.hit_set_count ||
+       !pool.info.hit_set_period ||
+       pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
     hit_set_clear();
-    //hit_set_remove_all();  // FIXME: implement me soon
+
+    // only primary is allowed to remove all the hit set objects
+    hit_set_remove_all();
     return;
   }
 
@@ -10147,6 +10891,46 @@ void ReplicatedPG::hit_set_setup()
   hit_set_apply_log();
 }
 
+void ReplicatedPG::hit_set_remove_all()
+{
+  // If any archives are degraded we skip this
+  for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
+       p != info.hit_set.history.end();
+       ++p) {
+    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+
+    // Once we hit a degraded object just skip
+    if (is_degraded_or_backfilling_object(aoid))
+      return;
+    if (scrubber.write_blocked_by_scrub(aoid, get_sort_bitwise()))
+      return;
+  }
+
+  if (!info.hit_set.history.empty()) {
+    list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
+    assert(p != info.hit_set.history.rend());
+    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+    assert(!is_degraded_or_backfilling_object(oid));
+    ObjectContextRef obc = get_object_context(oid, false);
+    assert(obc);
+
+    RepGather *repop = simple_repop_create(obc);
+    OpContext *ctx = repop->ctx;
+    ctx->at_version = get_next_version();
+    ctx->updated_hset_history = info.hit_set;
+    utime_t now = ceph_clock_now(cct);
+    ctx->mtime = now;
+    hit_set_trim(repop, 0);
+    apply_ctx_stats(ctx);
+    simple_repop_submit(repop);
+  }
+
+  info.hit_set = pg_hit_set_history_t();
+  if (agent_state) {
+    agent_state->discard_hit_sets();
+  }
+}
+
 void ReplicatedPG::hit_set_create()
 {
   utime_t now = ceph_clock_now(NULL);
@@ -10238,32 +11022,20 @@ void ReplicatedPG::hit_set_persist()
   hobject_t oid;
   time_t flush_time = 0;
 
-  // See what start is going to be used later
-  utime_t start = info.hit_set.current_info.begin;
-  if (!start)
-     start = hit_set_start_stamp;
-
   // If any archives are degraded we skip this persist request
   // account for the additional entry being added below
   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
        p != info.hit_set.history.end();
        ++p) {
-    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (scrubber.write_blocked_by_scrub(aoid))
+    if (scrubber.write_blocked_by_scrub(aoid, get_sort_bitwise()))
       return;
   }
 
-  oid = get_hit_set_archive_object(start, now);
-  // If the current object is degraded we skip this persist request
-  if (is_degraded_or_backfilling_object(oid))
-    return;
-  if (scrubber.write_blocked_by_scrub(oid))
-    return;
-
   // If backfill is in progress and we could possibly overlap with the
   // hit_set_* objects, back off.  Since these all have
   // hobject_t::hash set to pgid.ps(), and those sort first, we can
@@ -10284,22 +11056,35 @@ void ReplicatedPG::hit_set_persist()
     }
   }
 
-  if (!info.hit_set.current_info.begin)
-    info.hit_set.current_info.begin = hit_set_start_stamp;
+
+  pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
+  new_hset.begin = hit_set_start_stamp;
+  new_hset.end = now;
+  oid = get_hit_set_archive_object(
+    new_hset.begin,
+    new_hset.end,
+    new_hset.using_gmt);
+
+  // If the current object is degraded we skip this persist request
+  if (scrubber.write_blocked_by_scrub(oid, get_sort_bitwise()))
+    return;
 
   hit_set->seal();
   ::encode(*hit_set, bl);
-  info.hit_set.current_info.end = now;
   dout(20) << __func__ << " archive " << oid << dendl;
 
   if (agent_state) {
-    agent_state->add_hit_set(info.hit_set.current_info.begin, hit_set);
-    hit_set_in_memory_trim();
+    agent_state->add_hit_set(new_hset.begin, hit_set);
+    uint32_t size = agent_state->hit_set_map.size();
+    if (size >= pool.info.hit_set_count) {
+      size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
+    }
+    hit_set_in_memory_trim(size);
   }
 
   // hold a ref until it is flushed to disk
-  hit_set_flushing[info.hit_set.current_info.begin] = hit_set;
-  flush_time = info.hit_set.current_info.begin;
+  hit_set_flushing[new_hset.begin] = hit_set;
+  flush_time = new_hset.begin;
 
   ObjectContextRef obc = get_object_context(oid, true);
   repop = simple_repop_create(obc);
@@ -10310,49 +11095,11 @@ void ReplicatedPG::hit_set_persist()
   ctx->updated_hset_history = info.hit_set;
   pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
 
-  if (updated_hit_set_hist.current_last_stamp != utime_t()) {
-    // FIXME: we cheat slightly here by bundling in a remove on a object
-    // other the RepGather object.  we aren't carrying an ObjectContext for
-    // the deleted object over this period.
-    hobject_t old_obj =
-      get_hit_set_current_object(updated_hit_set_hist.current_last_stamp);
-    ctx->log.push_back(
-      pg_log_entry_t(pg_log_entry_t::DELETE,
-		     old_obj,
-		     ctx->at_version,
-		     updated_hit_set_hist.current_last_update,
-		     0,
-		     osd_reqid_t(),
-		     ctx->mtime));
-    if (pool.info.require_rollback()) {
-      if (ctx->log.back().mod_desc.rmobject(ctx->at_version.version)) {
-	ctx->op_t->stash(old_obj, ctx->at_version.version);
-      } else {
-	ctx->op_t->remove(old_obj);
-      }
-    } else {
-      ctx->op_t->remove(old_obj);
-      ctx->log.back().mod_desc.mark_unrollbackable();
-    }
-    ++ctx->at_version.version;
-
-    struct stat st;
-    int r = osd->store->stat(
-      coll,
-      ghobject_t(old_obj, ghobject_t::NO_GEN, pg_whoami.shard),
-      &st);
-    assert(r == 0);
-    --ctx->delta_stats.num_objects;
-    ctx->delta_stats.num_bytes -= st.st_size;
-  }
-
-  updated_hit_set_hist.current_last_update = info.last_update; // *after* above remove!
-  updated_hit_set_hist.current_info.version = ctx->at_version;
+  updated_hit_set_hist.current_last_update = info.last_update;
+  new_hset.version = ctx->at_version;
 
-  updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
+  updated_hit_set_hist.history.push_back(new_hset);
   hit_set_create();
-  updated_hit_set_hist.current_info = pg_hit_set_info_t();
-  updated_hit_set_hist.current_last_stamp = utime_t();
 
   // fabricate an object_info_t and SnapSet
   obc->obs.oi.version = ctx->at_version;
@@ -10377,8 +11124,10 @@ void ReplicatedPG::hit_set_persist()
   ::encode(ctx->new_obs.oi, boi);
 
   ctx->op_t->append(oid, 0, bl.length(), bl, 0);
-  setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, OI_ATTR, boi);
-  setattr_maybe_cache(ctx->obc, ctx, ctx->op_t, SS_ATTR, bss);
+  map <string, bufferlist> attrs;
+  attrs[OI_ATTR].claim(boi);
+  attrs[SS_ATTR].claim(bss);
+  setattrs_maybe_cache(ctx->obc, ctx, ctx->op_t, attrs);
   ctx->log.push_back(
     pg_log_entry_t(
       pg_log_entry_t::MODIFY,
@@ -10397,12 +11146,7 @@ void ReplicatedPG::hit_set_persist()
 
   hit_set_trim(repop, max);
 
-  info.stats.stats.add(ctx->delta_stats);
-  if (scrubber.active) {
-    if (oid < scrubber.start)
-      scrub_cstat.add(ctx->delta_stats);
-  }
-
+  apply_ctx_stats(ctx);
   simple_repop_submit(repop);
 }
 
@@ -10414,7 +11158,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
     assert(p != updated_hit_set_hist.history.end());
-    hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     assert(!is_degraded_or_backfilling_object(oid));
 
@@ -10450,14 +11194,8 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
   }
 }
 
-void ReplicatedPG::hit_set_in_memory_trim()
+void ReplicatedPG::hit_set_in_memory_trim(uint32_t max_in_memory)
 {
-  unsigned max = pool.info.hit_set_count;
-  unsigned max_in_memory = pool.info.min_read_recency_for_promote > 0 ? pool.info.min_read_recency_for_promote - 1 : 0;
-
-  if (max_in_memory > max) {
-    max_in_memory = max;
-  }
   while (agent_state->hit_set_map.size() > max_in_memory) {
     agent_state->remove_oldest_hit_set();
   }
@@ -10509,7 +11247,7 @@ void ReplicatedPG::agent_clear()
 }
 
 // Return false if no objects operated on since start of object hash space
-bool ReplicatedPG::agent_work(int start_max)
+bool ReplicatedPG::agent_work(int start_max, int agent_flush_quota)
 {
   lock();
   if (!agent_state) {
@@ -10553,7 +11291,6 @@ bool ReplicatedPG::agent_work(int start_max)
   vector<hobject_t> ls;
   hobject_t next;
   int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
-					  0 /* no filtering by snapid */,
 					  &ls, &next);
   assert(r >= 0);
   dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
@@ -10588,7 +11325,7 @@ bool ReplicatedPG::agent_work(int start_max)
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
-    if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+    if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid, get_sort_bitwise())) {
       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
       osd->logger->inc(l_osd_agent_skip);
       continue;
@@ -10605,19 +11342,21 @@ bool ReplicatedPG::agent_work(int start_max)
     }
 
     // be careful flushing omap to an EC pool.
-    if (base_pool->is_erasure() &&
-	obc->obs.oi.test_flag(object_info_t::FLAG_OMAP)) {
+    if (!base_pool->supports_omap() &&
+	obc->obs.oi.is_omap()) {
       dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
 
-    if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
-	agent_maybe_flush(obc))
-      ++started;
     if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
 	agent_maybe_evict(obc))
       ++started;
+    else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
+             agent_flush_quota > 0 && agent_maybe_flush(obc)) {
+      ++started;
+      --agent_flush_quota;
+    }
     if (started >= start_max) {
       // If finishing early, set "next" to the next object
       if (++p != ls.end())
@@ -10644,7 +11383,8 @@ bool ReplicatedPG::agent_work(int start_max)
   // See if we've made a full pass over the object hash space
   // This might check at most ls_max objects a second time to notice that
   // we've checked every objects at least once.
-  if (agent_state->position < agent_state->start && next >= agent_state->start) {
+  if (cmp(agent_state->position, agent_state->start, get_sort_bitwise()) < 0 &&
+      cmp(next, agent_state->start, get_sort_bitwise()) >= 0) {
     dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
     if (total_started == 0)
       need_delay = true;
@@ -10661,7 +11401,7 @@ bool ReplicatedPG::agent_work(int start_max)
     agent_state->position = next;
 
   // Discard old in memory HitSets
-  hit_set_in_memory_trim();
+  hit_set_in_memory_trim(pool.info.hit_set_count);
 
   if (need_delay) {
     assert(agent_state->delaying == false);
@@ -10699,7 +11439,7 @@ void ReplicatedPG::agent_load_hit_sets()
 	  continue;
 	}
 
-	hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+	hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 	if (is_unreadable_object(oid)) {
 	  dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
 	  break;
@@ -10714,7 +11454,7 @@ void ReplicatedPG::agent_load_hit_sets()
 	bufferlist bl;
 	{
 	  obc->ondisk_read_lock();
-	  int r = osd->store->read(coll, oid, 0, 0, bl);
+	  int r = osd->store->read(coll, ghobject_t(oid), 0, 0, bl);
 	  assert(r >= 0);
 	  obc->ondisk_read_unlock();
 	}
@@ -10790,6 +11530,16 @@ bool ReplicatedPG::agent_maybe_flush(ObjectContextRef& obc)
   return true;
 }
 
+struct C_AgentEvictStartStop : public Context {
+  ReplicatedPGRef pg;
+  C_AgentEvictStartStop(ReplicatedPG *p) : pg(p) {
+    pg->osd->agent_start_evict_op();
+  }
+  void finish(int r) {
+    pg->osd->agent_finish_evict_op();
+  }
+};
+
 bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
 {
   const hobject_t& soid = obc->obs.oi.soid;
@@ -10878,12 +11628,16 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
   dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
   RepGather *repop = simple_repop_create(obc);
   OpContext *ctx = repop->ctx;
+  Context *on_evict = new C_AgentEvictStartStop(this);
+  ctx->on_finish = on_evict;
   ctx->lock_to_release = OpContext::W_LOCK;
   ctx->at_version = get_next_version();
   assert(ctx->new_obs.exists);
   int r = _delete_oid(ctx, true);
   if (obc->obs.oi.is_omap())
     ctx->delta_stats.num_objects_omap--;
+  ctx->delta_stats.num_evict++;
+  ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
   assert(r == 0);
   finish_ctx(ctx, pg_log_entry_t::DELETE, false);
   simple_repop_submit(repop);
@@ -10932,6 +11686,17 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
     return requeued;
   }
 
+  TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
+  TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
+  unsigned evict_effort = 0;
+
+  if (info.stats.stats_invalid) {
+    // idle; stats can't be trusted until we scrub.
+    dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
+    goto skip_calc;
+  }
+
+  {
   uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
   assert(divisor > 0);
 
@@ -10943,7 +11708,7 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
   // also exclude omap objects if ec backing pool
   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
   assert(base_pool);
-  if (base_pool->is_erasure())
+  if (!base_pool->supports_omap())
     unflushable += info.stats.stats.sum.num_objects_omap;
 
   uint64_t num_user_objects = info.stats.stats.sum.num_objects;
@@ -10958,7 +11723,7 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
 
   // also reduce the num_dirty by num_objects_omap
   int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
-  if (base_pool->is_erasure()) {
+  if (!base_pool->supports_omap()) {
     if (num_dirty > info.stats.stats.sum.num_objects_omap)
       num_dirty -= info.stats.stats.sum.num_objects_omap;
     else
@@ -11010,24 +11775,24 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
 	   << dendl;
 
   // flush mode
-  TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
   uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
+  uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
   uint64_t flush_slop = (float)flush_target * g_conf->osd_agent_slop;
-  if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE)
+  if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
     flush_target += flush_slop;
-  else
+    flush_high_target += flush_slop;
+  } else {
     flush_target -= MIN(flush_target, flush_slop);
+    flush_high_target -= MIN(flush_high_target, flush_slop);
+  }
 
-  if (info.stats.stats_invalid) {
-    // idle; stats can't be trusted until we scrub.
-    dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
+  if (dirty_micro > flush_high_target) {
+    flush_mode = TierAgentState::FLUSH_MODE_HIGH;
   } else if (dirty_micro > flush_target) {
-    flush_mode = TierAgentState::FLUSH_MODE_ACTIVE;
+    flush_mode = TierAgentState::FLUSH_MODE_LOW;
   }
 
   // evict mode
-  TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
-  unsigned evict_effort = 0;
   uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
   uint64_t evict_slop = (float)evict_target * g_conf->osd_agent_slop;
   if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
@@ -11035,9 +11800,7 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
   else
     evict_target -= MIN(evict_target, evict_slop);
 
-  if (info.stats.stats_invalid) {
-    // idle; stats can't be trusted until we scrub.
-  } else if (full_micro > 1000000) {
+  if (full_micro > 1000000) {
     // evict anything clean
     evict_mode = TierAgentState::EVICT_MODE_FULL;
     evict_effort = 1000000;
@@ -11059,7 +11822,9 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
     assert(evict_effort >= inc && evict_effort <= 1000000);
     dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
   }
+  }
 
+  skip_calc:
   bool old_idle = agent_state->is_idle();
   if (flush_mode != agent_state->flush_mode) {
     dout(5) << __func__ << " flush_mode "
@@ -11067,6 +11832,18 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
 	    << " -> "
 	    << TierAgentState::get_flush_mode_name(flush_mode)
 	    << dendl;
+    if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
+      osd->agent_inc_high_count();
+      info.stats.stats.sum.num_flush_mode_high = 1;
+    } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
+      info.stats.stats.sum.num_flush_mode_low = 1;
+    }
+    if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
+      osd->agent_dec_high_count();
+      info.stats.stats.sum.num_flush_mode_high = 0;
+    } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
+      info.stats.stats.sum.num_flush_mode_low = 0;
+    }
     agent_state->flush_mode = flush_mode;
   }
   if (evict_mode != agent_state->evict_mode) {
@@ -11081,8 +11858,19 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
 	requeue_op(op);
       requeue_ops(waiting_for_active);
       requeue_ops(waiting_for_cache_not_full);
+      objects_blocked_on_cache_full.clear();
       requeued = true;
     }
+    if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
+      info.stats.stats.sum.num_evict_mode_some = 1;
+    } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
+      info.stats.stats.sum.num_evict_mode_full = 1;
+    }
+    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
+      info.stats.stats.sum.num_evict_mode_some = 0;
+    } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+      info.stats.stats.sum.num_evict_mode_full = 0;
+    }
     agent_state->evict_mode = evict_mode;
   }
   uint64_t old_effort = agent_state->evict_effort;
@@ -11154,7 +11942,7 @@ bool ReplicatedPG::_range_available_for_scrub(
   next.second = object_contexts.lookup(begin);
   next.first = begin;
   bool more = true;
-  while (more && next.first < end) {
+  while (more && cmp(next.first, end, get_sort_bitwise()) < 0) {
     if (next.second && next.second->is_blocked()) {
       next.second->requeue_scrub_on_unblock = true;
       dout(10) << __func__ << ": scrub delayed, "
@@ -11179,13 +11967,13 @@ void ReplicatedPG::_scrub_digest_updated()
 {
   dout(20) << __func__ << dendl;
   if (--scrubber.num_digest_updates_pending == 0) {
-    osd->scrub_wq.queue(this);
+    requeue_scrub();
   }
 }
 
 void ReplicatedPG::_scrub(
   ScrubMap &scrubmap,
-  const map<hobject_t, pair<uint32_t, uint32_t> > &missing_digest)
+  const map<hobject_t, pair<uint32_t, uint32_t>, hobject_t::BitwiseComparator> &missing_digest)
 {
   dout(10) << "_scrub" << dendl;
 
@@ -11202,7 +11990,7 @@ void ReplicatedPG::_scrub(
 
   bufferlist last_data;
 
-  for (map<hobject_t,ScrubMap::object>::reverse_iterator p = scrubmap.objects.rbegin(); 
+  for (map<hobject_t,ScrubMap::object, hobject_t::BitwiseComparator>::reverse_iterator p = scrubmap.objects.rbegin();
        p != scrubmap.objects.rend(); 
        ++p) {
     const hobject_t& soid = p->first;
@@ -11380,7 +12168,7 @@ void ReplicatedPG::_scrub(
     ++scrubber.shallow_errors;
   }
 
-  for (map<hobject_t,pair<uint32_t,uint32_t> >::const_iterator p =
+  for (map<hobject_t,pair<uint32_t,uint32_t>, hobject_t::BitwiseComparator>::const_iterator p =
 	 missing_digest.begin();
        p != missing_digest.end();
        ++p) {
@@ -11429,7 +12217,7 @@ void ReplicatedPG::_scrub_finish()
 	   << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
 	   << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
 	   << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
-	   << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes,"
+	   << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
 	   << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
 	   << dendl;
 
@@ -11453,7 +12241,7 @@ void ReplicatedPG::_scrub_finish()
 		      << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
 		      << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
 		      << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
-		      << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes,"
+		      << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
 		      << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.\n";
     ++scrubber.shallow_errors;
 
@@ -11602,11 +12390,14 @@ boost::statechart::result ReplicatedPG::TrimmingObjects::react(const SnapTrim&)
     assert(repop);
     repop->queue_snap_trimmer = true;
 
+    pg->apply_ctx_stats(repop->ctx);
+
     repops.insert(repop->get());
     pg->simple_repop_submit(repop);
   }
   return discard_event();
 }
+
 /* WaitingOnReplicasObjects */
 ReplicatedPG::WaitingOnReplicas::WaitingOnReplicas(my_context ctx)
   : my_base(ctx),
@@ -11697,6 +12488,21 @@ void ReplicatedPG::setattr_maybe_cache(
   t->setattr(obc->obs.oi.soid, key, val);
 }
 
+void ReplicatedPG::setattrs_maybe_cache(
+  ObjectContextRef obc,
+  OpContext *op,
+  PGBackend::PGTransaction *t,
+  map<string, bufferlist> &attrs)
+{
+  if (pool.info.require_rollback()) {
+    for (map<string, bufferlist>::iterator it = attrs.begin();
+      it != attrs.end(); ++it) {
+      op->pending_attrs[obc][it->first] = it->second;
+    }
+  }
+  t->setattrs(obc->obs.oi.soid, attrs);
+}
+
 void ReplicatedPG::rmattr_maybe_cache(
   ObjectContextRef obc,
   OpContext *op,
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 48e0def..04a6a45 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -46,6 +46,7 @@ class CopyFromCallback;
 class PromoteCallback;
 
 class ReplicatedPG;
+class PGLSFilter;
 void intrusive_ptr_add_ref(ReplicatedPG *pg);
 void intrusive_ptr_release(ReplicatedPG *pg);
 uint64_t get_with_id(ReplicatedPG *pg);
@@ -57,39 +58,6 @@ void put_with_id(ReplicatedPG *pg, uint64_t id);
   typedef boost::intrusive_ptr<ReplicatedPG> ReplicatedPGRef;
 #endif
 
-class PGLSFilter {
-protected:
-  string xattr;
-public:
-  PGLSFilter();
-  virtual ~PGLSFilter();
-  virtual bool filter(bufferlist& xattr_data, bufferlist& outdata) = 0;
-  virtual string& get_xattr() { return xattr; }
-};
-
-class PGLSPlainFilter : public PGLSFilter {
-  string val;
-public:
-  PGLSPlainFilter(bufferlist::iterator& params) {
-    ::decode(xattr, params);
-    ::decode(val, params);
-  }
-  virtual ~PGLSPlainFilter() {}
-  virtual bool filter(bufferlist& xattr_data, bufferlist& outdata);
-};
-
-class PGLSParentFilter : public PGLSFilter {
-  inodeno_t parent_ino;
-public:
-  PGLSParentFilter(bufferlist::iterator& params) {
-    xattr = "_parent";
-    ::decode(parent_ino, params);
-    generic_dout(0) << "parent_ino=" << parent_ino << dendl;
-  }
-  virtual ~PGLSParentFilter() {}
-  virtual bool filter(bufferlist& xattr_data, bufferlist& outdata);
-};
-
 class ReplicatedPG : public PG, public PGBackend::Listener {
   friend class OSD;
   friend class Watch;
@@ -169,16 +137,30 @@ public:
 
     object_copy_cursor_t temp_cursor;
 
+    /*
+     * For CopyOp the process is:
+     * step1: read the data(attr/omap/data) from the source object
+     * step2: handle those data(w/ those data create a new object)
+     * src_obj_fadvise_flags used in step1;
+     * dest_obj_fadvise_flags used in step2
+     */
+    unsigned src_obj_fadvise_flags;
+    unsigned dest_obj_fadvise_flags;
+
     CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s,
 	   object_locator_t l,
            version_t v,
 	   unsigned f,
-	   bool ms)
+	   bool ms,
+	   unsigned src_obj_fadvise_flags,
+	   unsigned dest_obj_fadvise_flags)
       : cb(cb_), obc(_obc), src(s), oloc(l), flags(f),
 	mirror_snapset(ms),
 	objecter_tid(0),
 	objecter_tid2(0),
-	rval(-1)
+	rval(-1),
+	src_obj_fadvise_flags(src_obj_fadvise_flags),
+	dest_obj_fadvise_flags(dest_obj_fadvise_flags)
     {
       results.user_version = v;
       results.mirror_snapset = mirror_snapset;
@@ -231,6 +213,28 @@ public:
   };
   typedef boost::shared_ptr<ProxyReadOp> ProxyReadOpRef;
 
+  struct ProxyWriteOp {
+    OpContext *ctx;
+    OpRequestRef op;
+    hobject_t soid;
+    ceph_tid_t objecter_tid;
+    vector<OSDOp> &ops;
+    version_t user_version;
+    bool sent_disk;
+    bool sent_ack;
+    utime_t mtime;
+    bool canceled;
+    osd_reqid_t reqid;
+
+    ProxyWriteOp(OpRequestRef _op, hobject_t oid, vector<OSDOp>& _ops, osd_reqid_t _reqid)
+      : ctx(NULL), op(_op), soid(oid),
+        objecter_tid(0), ops(_ops),
+	user_version(0), sent_disk(false),
+	sent_ack(false), canceled(false),
+        reqid(_reqid) { }
+  };
+  typedef boost::shared_ptr<ProxyWriteOp> ProxyWriteOpRef;
+
   struct FlushOp {
     ObjectContextRef obc;       ///< obc we are flushing
     OpRequestRef op;            ///< initiating op
@@ -243,7 +247,7 @@ public:
     Context *on_flush;          ///< callback, may be null
 
     FlushOp()
-      : objecter_tid(0), rval(0),
+      : flushed_version(0), objecter_tid(0), rval(0),
 	blocking(false), removal(false),
 	on_flush(NULL) {}
     ~FlushOp() { assert(!on_flush); }
@@ -256,9 +260,6 @@ public:
   }
 
   /// Listener methods
-  void on_local_recover_start(
-    const hobject_t &oid,
-    ObjectStore::Transaction *t);
   void on_local_recover(
     const hobject_t &oid,
     const object_stat_sum_t &stat_diff,
@@ -328,6 +329,9 @@ public:
   void queue_transaction(ObjectStore::Transaction *t, OpRequestRef op) {
     osd->store->queue_transaction(osr.get(), t, 0, 0, 0, op);
   }
+  void queue_transactions(list<ObjectStore::Transaction*>& tls, OpRequestRef op) {
+    osd->store->queue_transactions(osr.get(), tls, 0, 0, 0, op);
+  }
   epoch_t get_epoch() const {
     return get_osdmap()->get_epoch();
   }
@@ -343,15 +347,17 @@ public:
 
   std::string gen_dbg_prefix() const { return gen_prefix(); }
   
-  const map<hobject_t, set<pg_shard_t> > &get_missing_loc_shards() const {
+  const map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> &get_missing_loc_shards() const {
     return missing_loc.get_missing_locs();
   }
   const map<pg_shard_t, pg_missing_t> &get_shard_missing() const {
     return peer_missing;
   }
+  using PGBackend::Listener::get_shard_missing;
   const map<pg_shard_t, pg_info_t> &get_shard_info() const {
     return peer_info;
   }
+  using PGBackend::Listener::get_shard_info;  
   const pg_missing_t &get_local_missing() const {
     return pg_log.get_missing();
   }
@@ -398,8 +404,10 @@ public:
     if (peer == get_primary())
       return true;
     assert(peer_info.count(peer));
-    bool should_send = hoid.pool != (int64_t)info.pgid.pool() ||
-      hoid <= MAX(last_backfill_started, peer_info[peer].last_backfill);
+    bool should_send =
+      hoid.pool != (int64_t)info.pgid.pool() ||
+      cmp(hoid, last_backfill_started, get_sort_bitwise()) <= 0 ||
+      cmp(hoid, peer_info[peer].last_backfill, get_sort_bitwise()) <= 0;
     if (!should_send)
       assert(is_backfill_targets(peer));
     return should_send;
@@ -436,9 +444,12 @@ public:
   uint64_t min_peer_features() const {
     return get_min_peer_features();
   }
+  bool sort_bitwise() const {
+    return get_sort_bitwise();
+  }
 
   bool transaction_use_tbl() {
-    uint64_t min_features = get_min_peer_features();
+    uint64_t min_features = get_min_upacting_features();
     return !(min_features & CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT);
   }
 
@@ -520,7 +531,7 @@ public:
 
     interval_set<uint64_t> modified_ranges;
     ObjectContextRef obc;
-    map<hobject_t,ObjectContextRef> src_obc;
+    map<hobject_t,ObjectContextRef, hobject_t::BitwiseComparator> src_obc;
     ObjectContextRef clone_obc;    // if we created a clone
     ObjectContextRef snapset_obc;  // if we created/deleted a snapdir
 
@@ -675,7 +686,7 @@ public:
 
     OpContext *ctx;
     ObjectContextRef obc;
-    map<hobject_t,ObjectContextRef> src_obc;
+    map<hobject_t,ObjectContextRef, hobject_t::BitwiseComparator> src_obc;
 
     ceph_tid_t rep_tid;
 
@@ -859,7 +870,8 @@ protected:
     if (!to_req.empty()) {
       assert(ctx->obc);
       // requeue at front of scrub blocking queue if we are blocked by scrub
-      if (scrubber.write_blocked_by_scrub(ctx->obc->obs.oi.soid.get_head())) {
+      if (scrubber.write_blocked_by_scrub(ctx->obc->obs.oi.soid.get_head(),
+					  get_sort_bitwise())) {
 	waiting_for_active.splice(
 	  waiting_for_active.begin(),
 	  to_req,
@@ -900,19 +912,27 @@ protected:
   void hit_set_persist();   ///< persist hit info
   bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
   void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets
-  void hit_set_in_memory_trim();                     ///< discard old in memory HitSets
+  void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets
+  void hit_set_remove_all();
 
   hobject_t get_hit_set_current_object(utime_t stamp);
-  hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+  hobject_t get_hit_set_archive_object(utime_t start,
+				       utime_t end,
+				       bool using_gmt);
 
   // agent
   boost::scoped_ptr<TierAgentState> agent_state;
 
   friend struct C_AgentFlushStartStop;
+  friend struct C_AgentEvictStartStop;
   friend struct C_HitSetFlushing;
 
   void agent_setup();       ///< initialize agent state
-  bool agent_work(int max); ///< entry point to do some agent work
+  bool agent_work(int max) ///< entry point to do some agent work
+  {
+    return agent_work(max, max);
+  }
+  bool agent_work(int max, int agent_flush_quota);
   bool agent_maybe_flush(ObjectContextRef& obc);  ///< maybe flush
   bool agent_maybe_evict(ObjectContextRef& obc);  ///< maybe evict
 
@@ -969,13 +989,13 @@ protected:
   }
 
   // projected object info
-  SharedLRU<hobject_t, ObjectContext> object_contexts;
+  SharedLRU<hobject_t, ObjectContext, hobject_t::ComparatorWithDefault> object_contexts;
   // map from oid.snapdir() to SnapSetContext *
-  map<hobject_t, SnapSetContext*> snapset_contexts;
+  map<hobject_t, SnapSetContext*, hobject_t::BitwiseComparator> snapset_contexts;
   Mutex snapset_contexts_lock;
 
   // debug order that client ops are applied
-  map<hobject_t, map<client_t, ceph_tid_t> > debug_op_order;
+  map<hobject_t, map<client_t, ceph_tid_t>, hobject_t::BitwiseComparator> debug_op_order;
 
   void populate_obc_watchers(ObjectContextRef obc);
   void check_blacklisted_obc_watchers(ObjectContextRef obc);
@@ -1035,7 +1055,7 @@ protected:
   }
   void put_snapset_context(SnapSetContext *ssc);
 
-  map<hobject_t, ObjectContextRef> recovering;
+  map<hobject_t, ObjectContextRef, hobject_t::BitwiseComparator> recovering;
 
   /*
    * Backfill
@@ -1051,8 +1071,8 @@ protected:
    *   - are not included in pg stats (yet)
    *   - have their stats in pending_backfill_updates on the primary
    */
-  set<hobject_t> backfills_in_flight;
-  map<hobject_t, pg_stat_t> pending_backfill_updates;
+  set<hobject_t, hobject_t::Comparator> backfills_in_flight;
+  map<hobject_t, pg_stat_t, hobject_t::Comparator> pending_backfill_updates;
 
   void dump_recovery_info(Formatter *f) const {
     f->open_array_section("backfill_targets");
@@ -1085,7 +1105,7 @@ protected:
     }
     {
       f->open_array_section("backfills_in_flight");
-      for (set<hobject_t>::const_iterator i = backfills_in_flight.begin();
+      for (set<hobject_t, hobject_t::BitwiseComparator>::const_iterator i = backfills_in_flight.begin();
 	   i != backfills_in_flight.end();
 	   ++i) {
 	f->dump_stream("object") << *i;
@@ -1094,7 +1114,7 @@ protected:
     }
     {
       f->open_array_section("recovering");
-      for (map<hobject_t, ObjectContextRef>::const_iterator i = recovering.begin();
+      for (map<hobject_t, ObjectContextRef, hobject_t::BitwiseComparator>::const_iterator i = recovering.begin();
 	   i != recovering.end();
 	   ++i) {
 	f->dump_stream("object") << i->first;
@@ -1141,22 +1161,61 @@ protected:
   void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv);
   void make_writeable(OpContext *ctx);
   void log_op_stats(OpContext *ctx);
+  void apply_ctx_stats(OpContext *ctx,
+		       bool scrub_ok=false); ///< true if we should skip scrub stat update
 
   void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
 				   interval_set<uint64_t>& modified, uint64_t offset,
-				   uint64_t length, bool count_bytes);
+				   uint64_t length, bool count_bytes,
+				   bool force_changesize=false);
   void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
 
+
+  enum class cache_result_t {
+    NOOP,
+    BLOCKED_FULL,
+    BLOCKED_PROMOTE,
+    HANDLED_PROXY,
+    HANDLED_REDIRECT,
+  };
+  cache_result_t maybe_handle_cache_detail(OpRequestRef op,
+					   bool write_ordered,
+					   ObjectContextRef obc, int r,
+					   hobject_t missing_oid,
+					   bool must_promote,
+					   bool in_hit_set,
+					   ObjectContextRef *promote_obc);
   /**
    * This helper function is called from do_op if the ObjectContext lookup fails.
    * @returns true if the caching code is handling the Op, false otherwise.
    */
-  inline bool maybe_handle_cache(OpRequestRef op,
-				 bool write_ordered,
-				 ObjectContextRef obc, int r,
-				 const hobject_t& missing_oid,
-				 bool must_promote,
-				 bool in_hit_set = false);
+  bool maybe_handle_cache(OpRequestRef op,
+			  bool write_ordered,
+			  ObjectContextRef obc, int r,
+			  const hobject_t& missing_oid,
+			  bool must_promote,
+			  bool in_hit_set = false) {
+    return cache_result_t::NOOP != maybe_handle_cache_detail(
+      op,
+      write_ordered,
+      obc,
+      r,
+      missing_oid,
+      must_promote,
+      in_hit_set,
+      nullptr);
+  }
+
+  /**
+   * This helper function checks if a promotion is needed.
+   */
+  bool maybe_promote(ObjectContextRef obc,
+		     const hobject_t& missing_oid,
+		     const object_locator_t& oloc,
+		     bool in_hit_set,
+		     uint32_t recency,
+		     OpRequestRef promote_op,
+		     ObjectContextRef *promote_obc = nullptr);
   /**
    * This helper function tells the client to redirect their request elsewhere.
    */
@@ -1167,15 +1226,13 @@ protected:
    * this is a noop.  If a future user wants to be able to distinguish
    * these cases, a return value should be added.
    */
-  void promote_object(ObjectContextRef obc,            ///< [optional] obc
-		      const hobject_t& missing_object, ///< oid (if !obc)
-		      const object_locator_t& oloc,    ///< locator for obc|oid
-		      OpRequestRef op);                ///< [optional] client op
-
-  /**
-   * Check if the op is such that we can skip promote (e.g., DELETE)
-   */
-  bool can_skip_promote(OpRequestRef op);
+  void promote_object(
+    ObjectContextRef obc,            ///< [optional] obc
+    const hobject_t& missing_object, ///< oid (if !obc)
+    const object_locator_t& oloc,    ///< locator for obc|oid
+    OpRequestRef op,                 ///< [optional] client op
+    ObjectContextRef *promote_obc = nullptr ///< [optional] new obc for object
+    );
 
   int prepare_transaction(OpContext *ctx);
   list<pair<OpRequestRef, OpContext*> > in_progress_async_reads;
@@ -1188,8 +1245,7 @@ protected:
 
   void queue_for_recovery();
   bool start_recovery_ops(
-    int max, RecoveryCtx *prctx,
-    ThreadPool::TPHandle &handle, int *started);
+    int max, ThreadPool::TPHandle &handle, int *started);
 
   int recover_primary(int max, ThreadPool::TPHandle &handle);
   int recover_replicas(int max, ThreadPool::TPHandle &handle);
@@ -1288,7 +1344,7 @@ protected:
   void recover_got(hobject_t oid, eversion_t v);
 
   // -- copyfrom --
-  map<hobject_t, CopyOpRef> copy_ops;
+  map<hobject_t, CopyOpRef, hobject_t::BitwiseComparator> copy_ops;
 
   int fill_in_copy_get(
     OpContext *ctx,
@@ -1296,6 +1352,8 @@ protected:
     OSDOp& op,
     ObjectContextRef& obc,
     bool classic);
+  void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
+                              OSDOp& osd_op, bool classic);
 
   /**
    * To copy an object, call start_copy.
@@ -1305,11 +1363,11 @@ protected:
    * @param src: The source object
    * @param oloc: the source object locator
    * @param version: the version of the source object to copy (0 for any)
-   * @param temp_dest_oid: the temporary object to use for large objects
    */
   void start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
 		  object_locator_t oloc, version_t version, unsigned flags,
-		  bool mirror_snapset);
+		  bool mirror_snapset, unsigned src_obj_fadvise_flags,
+		  unsigned dest_obj_fadvise_flags);
   void process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r);
   void _write_copy_chunk(CopyOpRef cop, PGBackend::PGTransaction *t);
   uint64_t get_copy_chunk_size() const {
@@ -1333,7 +1391,7 @@ protected:
   friend struct C_Copyfrom;
 
   // -- flush --
-  map<hobject_t, FlushOpRef> flush_ops;
+  map<hobject_t, FlushOpRef, hobject_t::BitwiseComparator> flush_ops;
 
   /// start_flush takes ownership of on_flush iff ret == -EINPROGRESS
   int start_flush(
@@ -1355,7 +1413,7 @@ protected:
     const hobject_t &begin, const hobject_t &end);
   virtual void _scrub(
     ScrubMap &map,
-    const std::map<hobject_t, pair<uint32_t, uint32_t> > &missing_digest);
+    const std::map<hobject_t, pair<uint32_t, uint32_t>, hobject_t::BitwiseComparator> &missing_digest);
   void _scrub_digest_updated();
   virtual void _scrub_clear_state();
   virtual void _scrub_finish();
@@ -1372,18 +1430,29 @@ protected:
   bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata);
   int get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter);
 
+  map<hobject_t, list<OpRequestRef>, hobject_t::BitwiseComparator> in_progress_proxy_ops;
+  void kick_proxy_ops_blocked(hobject_t& soid);
+  void cancel_proxy_ops(bool requeue);
+
   // -- proxyread --
   map<ceph_tid_t, ProxyReadOpRef> proxyread_ops;
-  map<hobject_t, list<OpRequestRef> > in_progress_proxy_reads;
 
   void do_proxy_read(OpRequestRef op);
   void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r);
-  void kick_proxy_read_blocked(hobject_t& soid);
   void cancel_proxy_read(ProxyReadOpRef prdop);
-  void cancel_proxy_read_ops(bool requeue);
 
   friend struct C_ProxyRead;
 
+  // -- proxywrite --
+  map<ceph_tid_t, ProxyWriteOpRef> proxywrite_ops;
+
+  void do_proxy_write(OpRequestRef op, const hobject_t& missing_oid);
+  void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r);
+  void cancel_proxy_write(ProxyWriteOpRef pwop);
+
+  friend struct C_ProxyWrite_Apply;
+  friend struct C_ProxyWrite_Commit;
+
 public:
   ReplicatedPG(OSDService *o, OSDMapRef curmap,
 	       const PGPool &_pool, spg_t p);
@@ -1406,7 +1475,7 @@ public:
   void do_backfill(OpRequestRef op);
 
   RepGather *trim_object(const hobject_t &coid);
-  void snap_trimmer();
+  void snap_trimmer(epoch_t e);
   int do_osd_ops(OpContext *ctx, vector<OSDOp>& ops);
 
   int _get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals);
@@ -1418,13 +1487,14 @@ public:
 private:
   hobject_t earliest_backfill() const;
   bool check_src_targ(const hobject_t& soid, const hobject_t& toid) const;
+
   uint64_t temp_seq; ///< last id for naming temp objects
-  coll_t get_temp_coll(ObjectStore::Transaction *t);
   hobject_t generate_temp_object();  ///< generate a new temp object name
+  /// generate a new temp object name (for recovery)
+  hobject_t get_temp_recovery_object(eversion_t version, snapid_t snap);
 public:
-  void get_colls(list<coll_t> *out) {
-    out->push_back(coll);
-    return pgbackend->temp_colls(out);
+  coll_t get_coll() {
+    return coll;
   }
   void split_colls(
     spg_t child,
@@ -1433,14 +1503,13 @@ public:
     const pg_pool_t *pool,
     ObjectStore::Transaction *t) {
     coll_t target = coll_t(child);
-    PG::_create(*t, child);
+    PG::_create(*t, child, split_bits);
     t->split_collection(
       coll,
       split_bits,
       seed,
       target);
     PG::_init(*t, child, pool);
-    pgbackend->split_colls(child, split_bits, seed, t);
   }
 private:
   struct NotTrimming;
@@ -1506,12 +1575,19 @@ public:
     return is_missing_object(oid) ||
       !missing_loc.readable_with_acting(oid, actingset);
   }
+  void maybe_kick_recovery(const hobject_t &soid);
   void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
   void wait_for_all_missing(OpRequestRef op);
 
   bool is_degraded_or_backfilling_object(const hobject_t& oid);
   void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
 
+  void block_write_on_full_cache(
+    const hobject_t& oid, OpRequestRef op);
+  void block_write_on_snap_rollback(
+    const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
+  void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
+
   bool maybe_await_blocked_snapset(const hobject_t &soid, OpRequestRef op);
   void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
   void kick_object_context_blocked(ObjectContextRef obc);
@@ -1525,6 +1601,7 @@ public:
 
   void on_role_change();
   void on_pool_change();
+  void _on_new_interval();
   void on_change(ObjectStore::Transaction *t);
   void on_activate();
   void on_flushed();
@@ -1542,6 +1619,11 @@ public:
     PGBackend::PGTransaction *t,
     const string &key,
     bufferlist &val);
+  void setattrs_maybe_cache(
+    ObjectContextRef obc,
+    OpContext *op,
+    PGBackend::PGTransaction *t,
+    map<string, bufferlist> &attrs);
   void rmattr_maybe_cache(
     ObjectContextRef obc,
     OpContext *op,
@@ -1572,6 +1654,17 @@ inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& repop)
   return out;
 }
 
+inline ostream& operator<<(ostream& out, ReplicatedPG::ProxyWriteOpRef pwop)
+{
+  out << "proxywrite(" << &pwop
+      << " " << pwop->user_version
+      << " pwop_tid=" << pwop->objecter_tid;
+  if (pwop->ctx->op)
+    out << " op=" << *(pwop->ctx->op->get_req());
+  out << ")";
+  return out;
+}
+
 void intrusive_ptr_add_ref(ReplicatedPG::RepGather *repop);
 void intrusive_ptr_release(ReplicatedPG::RepGather *repop);
 
diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h
index deb6868..7b95abe 100644
--- a/src/osd/SnapMapper.h
+++ b/src/osd/SnapMapper.h
@@ -30,17 +30,17 @@
 class OSDriver : public MapCacher::StoreDriver<std::string, bufferlist> {
   ObjectStore *os;
   coll_t cid;
-  hobject_t hoid;
+  ghobject_t hoid;
 
 public:
   class OSTransaction : public MapCacher::Transaction<std::string, bufferlist> {
     friend class OSDriver;
     coll_t cid;
-    hobject_t hoid;
+    ghobject_t hoid;
     ObjectStore::Transaction *t;
     OSTransaction(
       coll_t cid,
-      const hobject_t &hoid,
+      const ghobject_t &hoid,
       ObjectStore::Transaction *t)
       : cid(cid), hoid(hoid), t(t) {}
   public:
@@ -63,7 +63,7 @@ public:
     return OSTransaction(cid, hoid, t);
   }
 
-  OSDriver(ObjectStore *os, coll_t cid, const hobject_t &hoid) :
+  OSDriver(ObjectStore *os, coll_t cid, const ghobject_t &hoid) :
     os(os), cid(cid), hoid(hoid) {}
   int get_keys(
     const std::set<std::string> &keys,
diff --git a/src/osd/TierAgentState.h b/src/osd/TierAgentState.h
index e9c22b2..57f2c72 100644
--- a/src/osd/TierAgentState.h
+++ b/src/osd/TierAgentState.h
@@ -35,12 +35,14 @@ struct TierAgentState {
 
   enum flush_mode_t {
     FLUSH_MODE_IDLE,   // nothing to flush
-    FLUSH_MODE_ACTIVE, // flush what we can to bring down dirty count
+    FLUSH_MODE_LOW, // flush dirty objects with a low speed
+    FLUSH_MODE_HIGH, //flush dirty objects with a high speed
   } flush_mode;     ///< current flush behavior
   static const char *get_flush_mode_name(flush_mode_t m) {
     switch (m) {
     case FLUSH_MODE_IDLE: return "idle";
-    case FLUSH_MODE_ACTIVE: return "active";
+    case FLUSH_MODE_LOW: return "low";
+    case FLUSH_MODE_HIGH: return "high";
     default: assert(0 == "bad flush mode");
     }
   }
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index b2bea5b..10458ec 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -50,6 +50,8 @@ const char *ceph_osd_flag_name(unsigned flag)
   case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
   case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
   case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
+  case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
+  case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
   default: return "???";
   }
 }
@@ -548,75 +550,89 @@ ostream& operator<<(ostream& out, const pg_t &pg)
 
 // -- coll_t --
 
-bool coll_t::is_temp(spg_t& pgid) const
+void coll_t::calc_str()
 {
-  const char *cstr(str.c_str());
-  if (!pgid.parse(cstr))
-    return false;
-  const char *tmp_start = strchr(cstr, '_');
-  if (!tmp_start)
-    return false;
-  if (strncmp(tmp_start, "_TEMP", 5) == 0)
-    return true;
-  return false;
-}
-
-bool coll_t::is_pg(spg_t& pgid, snapid_t& snap) const
-{
-  const char *cstr(str.c_str());
-
-  if (!pgid.parse(cstr))
-    return false;
-  const char *snap_start = strchr(cstr, '_');
-  if (!snap_start)
-    return false;
-  if (strncmp(snap_start, "_head", 5) == 0) {
-    snap = CEPH_NOSNAP;
-  } else {
-    errno = 0;
-    snap = strtoull(snap_start+1, 0, 16);
-    if (errno)
-      return false;
+  switch (type) {
+  case TYPE_META:
+    _str = "meta";
+    break;
+  case TYPE_PG:
+    _str = stringify(pgid) + "_head";
+    break;
+  case TYPE_PG_TEMP:
+    _str = stringify(pgid) + "_TEMP";
+    break;
+  case TYPE_PG_REMOVAL:
+    _str = string("FORREMOVAL_") +
+      stringify(removal_seq) + "_" +
+      stringify(pgid);
+    break;
+  default:
+    assert(0 == "unknown collection type");
   }
-  return true;
-}
-
-bool coll_t::is_pg_prefix(spg_t& pgid) const
-{
-  const char *cstr(str.c_str());
-
-  if (!pgid.parse(cstr))
-    return false;
-  const char *snap_start = strchr(cstr, '_');
-  if (!snap_start)
-    return false;
-  return true;
 }
 
-bool coll_t::is_removal(uint64_t *seq, spg_t *pgid) const
+bool coll_t::parse(const std::string& s)
 {
-  if (str.substr(0, 11) != string("FORREMOVAL_"))
-    return false;
-
-  stringstream ss(str.substr(11));
-  ss >> *seq;
-  char sep;
-  ss >> sep;
-  assert(sep == '_');
-  string pgid_str;
-  ss >> pgid_str;
-  if (!pgid->parse(pgid_str.c_str())) {
-    assert(0);
-    return false;
+  if (s == "meta") {
+    type = TYPE_META;
+    pgid = spg_t();
+    removal_seq = 0;
+    calc_str();
+    assert(s == _str);
+    return true;
   }
-  return true;
+  if (s.find("_head") == s.length() - 5 &&
+      pgid.parse(s.substr(0, s.length() - 5))) {
+    type = TYPE_PG;
+    removal_seq = 0;
+    calc_str();
+    assert(s == _str);
+    return true;
+  }
+  if (s.find("_TEMP") == s.length() - 5 &&
+      pgid.parse(s.substr(0, s.length() - 5))) {
+    type = TYPE_PG_TEMP;
+    removal_seq = 0;
+    calc_str();
+    assert(s == _str);
+    return true;
+  }
+  if (s.find("FORREMOVAL_") == 0) {
+    type = TYPE_PG_REMOVAL;
+    stringstream ss(s.substr(11));
+    ss >> removal_seq;
+    char sep;
+    ss >> sep;
+    assert(sep == '_');
+    string pgid_str;
+    ss >> pgid_str;
+    if (!pgid.parse(pgid_str.c_str())) {
+      assert(0);
+      return false;
+    }
+    calc_str();
+    assert(s == _str);
+    return true;
+  }
+  return false;
 }
 
 void coll_t::encode(bufferlist& bl) const
 {
-  __u8 struct_v = 3;
-  ::encode(struct_v, bl);
-  ::encode(str, bl);
+  if (is_removal() || is_temp()) {
+    // can't express this as v2...
+    __u8 struct_v = 3;
+    ::encode(struct_v, bl);
+    ::encode(to_str(), bl);
+  } else {
+    __u8 struct_v = 2;
+    ::encode(struct_v, bl);
+    ::encode((__u8)type, bl);
+    ::encode(pgid, bl);
+    snapid_t snap = CEPH_NOSNAP;
+    ::encode(snap, bl);
+  }
 }
 
 void coll_t::decode(bufferlist::iterator& bl)
@@ -624,72 +640,72 @@ void coll_t::decode(bufferlist::iterator& bl)
   __u8 struct_v;
   ::decode(struct_v, bl);
   switch (struct_v) {
-  case 1: {
-    spg_t pgid;
-    snapid_t snap;
-
-    ::decode(pgid, bl);
-    ::decode(snap, bl);
-    // infer the type
-    if (pgid == spg_t() && snap == 0)
-      str = "meta";
-    else
-      str = pg_and_snap_to_str(pgid, snap);
-    break;
-  }
+  case 1:
+    {
+      snapid_t snap;
+      ::decode(pgid, bl);
+      ::decode(snap, bl);
 
-  case 2: {
-    __u8 type;
-    spg_t pgid;
-    snapid_t snap;
-    
-    ::decode(type, bl);
-    ::decode(pgid, bl);
-    ::decode(snap, bl);
-    switch (type) {
-    case 0:
-      str = "meta";
-      break;
-    case 1:
-      str = "temp";
-      break;
-    case 2:
-      str = pg_and_snap_to_str(pgid, snap);
-      break;
-    default: {
-      ostringstream oss;
-      oss << "coll_t::decode(): can't understand type " << type;
-      throw std::domain_error(oss.str());
+      // infer the type
+      if (pgid == spg_t() && snap == 0) {
+	type = TYPE_META;
+      } else {
+	type = TYPE_PG;
+      }
+      removal_seq = 0;
     }
+    break;
+
+  case 2:
+    {
+      __u8 _type;
+      snapid_t snap;
+      ::decode(_type, bl);
+      ::decode(pgid, bl);
+      ::decode(snap, bl);
+      type = (type_t)_type;
+      removal_seq = 0;
     }
     break;
-  }
 
   case 3:
-    ::decode(str, bl);
+    {
+      string str;
+      ::decode(str, bl);
+      bool ok = parse(str);
+      if (!ok)
+	throw std::domain_error(std::string("unable to parse pg ") + str);
+    }
     break;
-    
-  default: {
-    ostringstream oss;
-    oss << "coll_t::decode(): don't know how to decode version "
-	<< struct_v;
-    throw std::domain_error(oss.str());
-  }
+
+  default:
+    {
+      ostringstream oss;
+      oss << "coll_t::decode(): don't know how to decode version "
+	  << struct_v;
+      throw std::domain_error(oss.str());
+    }
   }
 }
 
 void coll_t::dump(Formatter *f) const
 {
-  f->dump_string("name", str);
+  f->dump_unsigned("type_id", (unsigned)type);
+  if (type != TYPE_META)
+    f->dump_stream("pgid") << pgid;
+  f->dump_string("name", to_str());
 }
 
 void coll_t::generate_test_instances(list<coll_t*>& o)
 {
-  o.push_back(new coll_t);
-  o.push_back(new coll_t("meta"));
-  o.push_back(new coll_t("temp"));
-  o.push_back(new coll_t("foo"));
-  o.push_back(new coll_t("bar"));
+  o.push_back(new coll_t());
+  o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
+  o.push_back(new coll_t(o.back()->get_temp()));
+  o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
+  o.push_back(new coll_t(o.back()->get_temp()));
+  o.push_back(new coll_t());
+  o.back()->parse("FORREMOVAL_0_0.1");
+  o.back()->parse("FORREMOVAL_123_2.2a3f");
 }
 
 // ---
@@ -916,6 +932,8 @@ void pg_pool_t::dump(Formatter *f) const
   f->dump_unsigned("target_max_objects", target_max_objects);
   f->dump_unsigned("cache_target_dirty_ratio_micro",
 		   cache_target_dirty_ratio_micro);
+  f->dump_unsigned("cache_target_dirty_high_ratio_micro",
+		   cache_target_dirty_high_ratio_micro);
   f->dump_unsigned("cache_target_full_ratio_micro",
 		   cache_target_full_ratio_micro);
   f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
@@ -926,9 +944,12 @@ void pg_pool_t::dump(Formatter *f) const
   f->close_section(); // hit_set_params
   f->dump_unsigned("hit_set_period", hit_set_period);
   f->dump_unsigned("hit_set_count", hit_set_count);
+  f->dump_bool("use_gmt_hitset", use_gmt_hitset);
   f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
+  f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
   f->dump_unsigned("stripe_width", get_stripe_width());
   f->dump_unsigned("expected_num_objects", expected_num_objects);
+  f->dump_bool("fast_read", fast_read);
 }
 
 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
@@ -1068,17 +1089,17 @@ SnapContext pg_pool_t::get_snap_context() const
   return SnapContext(get_snap_seq(), s);
 }
 
-static string make_hash_str(const string &inkey, const string &nspace)
-{
-  if (nspace.empty())
-    return inkey;
-  return nspace + '\037' + inkey;
-}
-
 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
 {
-  string n = make_hash_str(key, ns);
-  return ceph_str_hash(object_hash, n.c_str(), n.length());
+ if (ns.empty()) 
+    return ceph_str_hash(object_hash, key.data(), key.length());
+  int nsl = ns.length();
+  int len = key.length() + nsl + 1;
+  char buf[len];
+  memcpy(&buf[0], ns.data(), nsl);
+  buf[nsl] = '\037';
+  memcpy(&buf[nsl+1], key.data(), key.length());
+  return ceph_str_hash(object_hash, &buf[0], len);
 }
 
 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
@@ -1238,7 +1259,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
     return;
   }
 
-  ENCODE_START(17, 5, bl);
+  ENCODE_START(22, 5, bl);
   ::encode(type, bl);
   ::encode(size, bl);
   ::encode(crush_ruleset, bl);
@@ -1280,12 +1301,16 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(last_force_op_resend, bl);
   ::encode(min_read_recency_for_promote, bl);
   ::encode(expected_num_objects, bl);
+  ::encode(cache_target_dirty_high_ratio_micro, bl);
+  ::encode(min_write_recency_for_promote, bl);
+  ::encode(use_gmt_hitset, bl);
+  ::encode(fast_read, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_pool_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(17, 5, 5, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(22, 5, 5, bl);
   ::decode(type, bl);
   ::decode(size, bl);
   ::decode(crush_ruleset, bl);
@@ -1397,6 +1422,26 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
   } else {
     expected_num_objects = 0;
   }
+  if (struct_v >= 19) {
+    ::decode(cache_target_dirty_high_ratio_micro, bl);
+  } else {
+    cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
+  }
+  if (struct_v >= 20) {
+    ::decode(min_write_recency_for_promote, bl);
+  } else {
+    min_write_recency_for_promote = 1;
+  }
+  if (struct_v >= 21) {
+    ::decode(use_gmt_hitset, bl);
+  } else {
+    use_gmt_hitset = false;
+  }
+  if (struct_v >= 22) {
+    ::decode(fast_read, bl);
+  } else {
+    fast_read = false;
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
 }
@@ -1443,15 +1488,18 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
   a.hit_set_period = 3600;
   a.hit_set_count = 8;
   a.min_read_recency_for_promote = 1;
+  a.min_write_recency_for_promote = 1;
   a.set_stripe_width(12345);
   a.target_max_bytes = 1238132132;
   a.target_max_objects = 1232132;
   a.cache_target_dirty_ratio_micro = 187232;
+  a.cache_target_dirty_high_ratio_micro = 309856;
   a.cache_target_full_ratio_micro = 987222;
   a.cache_min_flush_age = 231;
   a.cache_min_evict_age = 2321;
   a.erasure_code_profile = "profile in osdmap";
   a.expected_num_objects = 123456;
+  a.fast_read = false;
   o.push_back(new pg_pool_t(a));
 }
 
@@ -1498,9 +1546,13 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
   }
   if (p.min_read_recency_for_promote)
     out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
+  if (p.min_write_recency_for_promote)
+    out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
   out << " stripe_width " << p.get_stripe_width();
   if (p.expected_num_objects)
     out << " expected_num_objects " << p.expected_num_objects;
+  if (p.fast_read)
+    out << " fast_read " << p.fast_read;
   return out;
 }
 
@@ -1532,11 +1584,20 @@ void object_stat_sum_t::dump(Formatter *f) const
   f->dump_int("num_objects_omap", num_objects_omap);
   f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
   f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
+  f->dump_int("num_flush", num_flush);
+  f->dump_int("num_flush_kb", num_flush_kb);
+  f->dump_int("num_evict", num_evict);
+  f->dump_int("num_evict_kb", num_evict_kb);
+  f->dump_int("num_promote", num_promote);
+  f->dump_int("num_flush_mode_high", num_flush_mode_high);
+  f->dump_int("num_flush_mode_low", num_flush_mode_low);
+  f->dump_int("num_evict_mode_some", num_evict_mode_some);
+  f->dump_int("num_evict_mode_full", num_evict_mode_full);
 }
 
 void object_stat_sum_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(11, 3, bl);
+  ENCODE_START(13, 3, bl);
   ::encode(num_bytes, bl);
   ::encode(num_objects, bl);
   ::encode(num_object_clones, bl);
@@ -1560,12 +1621,21 @@ void object_stat_sum_t::encode(bufferlist& bl) const
   ::encode(num_objects_hit_set_archive, bl);
   ::encode(num_objects_misplaced, bl);
   ::encode(num_bytes_hit_set_archive, bl);
+  ::encode(num_flush, bl);
+  ::encode(num_flush_kb, bl);
+  ::encode(num_evict, bl);
+  ::encode(num_evict_kb, bl);
+  ::encode(num_promote, bl);
+  ::encode(num_flush_mode_high, bl);
+  ::encode(num_flush_mode_low, bl);
+  ::encode(num_evict_mode_some, bl);
+  ::encode(num_evict_mode_full, bl);
   ENCODE_FINISH(bl);
 }
 
 void object_stat_sum_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(11, 3, 3, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(13, 3, 3, bl);
   ::decode(num_bytes, bl);
   if (struct_v < 3) {
     uint64_t num_kb;
@@ -1629,6 +1699,30 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl)
   } else {
     num_bytes_hit_set_archive = 0;
   }
+  if (struct_v >= 12) {
+    ::decode(num_flush, bl);
+    ::decode(num_flush_kb, bl);
+    ::decode(num_evict, bl);
+    ::decode(num_evict_kb, bl);
+    ::decode(num_promote, bl);
+  } else {
+    num_flush = 0;
+    num_flush_kb = 0;
+    num_evict = 0;
+    num_evict_kb = 0;
+    num_promote = 0;
+  }
+  if (struct_v >= 13) {
+    ::decode(num_flush_mode_high, bl);
+    ::decode(num_flush_mode_low, bl);
+    ::decode(num_evict_mode_some, bl);
+    ::decode(num_evict_mode_full, bl);
+  } else {
+    num_flush_mode_high = 0;
+    num_flush_mode_low = 0;
+    num_evict_mode_some = 0;
+    num_evict_mode_full = 0;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -1656,6 +1750,15 @@ void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
   a.num_objects_misplaced = 1232;
   a.num_objects_hit_set_archive = 2;
   a.num_bytes_hit_set_archive = 27;
+  a.num_flush = 5;
+  a.num_flush_kb = 6;
+  a.num_evict = 7;
+  a.num_evict_kb = 8;
+  a.num_promote = 9;
+  a.num_flush_mode_high = 0;
+  a.num_flush_mode_low = 1;
+  a.num_evict_mode_some = 1;
+  a.num_evict_mode_full = 0;
   o.push_back(new object_stat_sum_t(a));
 }
 
@@ -1684,6 +1787,15 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
   num_objects_omap += o.num_objects_omap;
   num_objects_hit_set_archive += o.num_objects_hit_set_archive;
   num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
+  num_flush += o.num_flush;
+  num_flush_kb += o.num_flush_kb;
+  num_evict += o.num_evict;
+  num_evict_kb += o.num_evict_kb;
+  num_promote += o.num_promote;
+  num_flush_mode_high += o.num_flush_mode_high;
+  num_flush_mode_low += o.num_flush_mode_low;
+  num_evict_mode_some += o.num_evict_mode_some;
+  num_evict_mode_full += o.num_evict_mode_full;
 }
 
 void object_stat_sum_t::sub(const object_stat_sum_t& o)
@@ -1711,6 +1823,15 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o)
   num_objects_omap -= o.num_objects_omap;
   num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
   num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
+  num_flush -= o.num_flush;
+  num_flush_kb -= o.num_flush_kb;
+  num_evict -= o.num_evict;
+  num_evict_kb -= o.num_evict_kb;
+  num_promote -= o.num_promote;
+  num_flush_mode_high -= o.num_flush_mode_high;
+  num_flush_mode_low -= o.num_flush_mode_low;
+  num_evict_mode_some -= o.num_evict_mode_some;
+  num_evict_mode_full -= o.num_evict_mode_full;
 }
 
 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
@@ -1738,7 +1859,16 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
     l.num_whiteouts == r.num_whiteouts &&
     l.num_objects_omap == r.num_objects_omap &&
     l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
-    l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive;
+    l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
+    l.num_flush == r.num_flush &&
+    l.num_flush_kb == r.num_flush_kb &&
+    l.num_evict == r.num_evict &&
+    l.num_evict_kb == r.num_evict_kb &&
+    l.num_promote == r.num_promote &&
+    l.num_flush_mode_high == r.num_flush_mode_high &&
+    l.num_flush_mode_low == r.num_flush_mode_low &&
+    l.num_evict_mode_some == r.num_evict_mode_some &&
+    l.num_evict_mode_full == r.num_evict_mode_full;
 }
 
 // -- object_stat_collection_t --
@@ -1790,7 +1920,7 @@ bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
     return true;
   } else if (!primary) {
     for(vector<int32_t>::const_iterator it = acting.begin();
-        it != acting.end(); it++)
+        it != acting.end(); ++it)
     {
       if (*it == osd)
         return true;
@@ -2226,7 +2356,7 @@ void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
 
 void pg_history_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(6, 4, bl);
+  ENCODE_START(7, 4, bl);
   ::encode(epoch_created, bl);
   ::encode(last_epoch_started, bl);
   ::encode(last_epoch_clean, bl);
@@ -2239,12 +2369,13 @@ void pg_history_t::encode(bufferlist &bl) const
   ::encode(last_deep_scrub, bl);
   ::encode(last_deep_scrub_stamp, bl);
   ::encode(last_clean_scrub_stamp, bl);
+  ::encode(last_epoch_marked_full, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_history_t::decode(bufferlist::iterator &bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(7, 4, 4, bl);
   ::decode(epoch_created, bl);
   ::decode(last_epoch_started, bl);
   if (struct_v >= 3)
@@ -2266,6 +2397,9 @@ void pg_history_t::decode(bufferlist::iterator &bl)
   if (struct_v >= 6) {
     ::decode(last_clean_scrub_stamp, bl);
   }
+  if (struct_v >= 7) {
+    ::decode(last_epoch_marked_full, bl);
+  }
   DECODE_FINISH(bl);
 }
 
@@ -2275,6 +2409,7 @@ void pg_history_t::dump(Formatter *f) const
   f->dump_int("last_epoch_started", last_epoch_started);
   f->dump_int("last_epoch_clean", last_epoch_clean);
   f->dump_int("last_epoch_split", last_epoch_split);
+  f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
   f->dump_int("same_up_since", same_up_since);
   f->dump_int("same_interval_since", same_interval_since);
   f->dump_int("same_primary_since", same_primary_since);
@@ -2301,6 +2436,7 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
   o.back()->last_deep_scrub = eversion_t(12, 13);
   o.back()->last_deep_scrub_stamp = utime_t(14, 15);
   o.back()->last_clean_scrub_stamp = utime_t(16, 17);
+  o.back()->last_epoch_marked_full = 18;
 }
 
 
@@ -2308,12 +2444,16 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
 
 void pg_info_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(30, 26, bl);
+  ENCODE_START(31, 26, bl);
   ::encode(pgid.pgid, bl);
   ::encode(last_update, bl);
   ::encode(last_complete, bl);
   ::encode(log_tail, bl);
-  ::encode(last_backfill, bl);
+  if (last_backfill_bitwise && last_backfill != last_backfill.get_max()) {
+    ::encode(hobject_t(), bl);
+  } else {
+    ::encode(last_backfill, bl);
+  }
   ::encode(stats, bl);
   history.encode(bl);
   ::encode(purged_snaps, bl);
@@ -2321,12 +2461,14 @@ void pg_info_t::encode(bufferlist &bl) const
   ::encode(last_user_version, bl);
   ::encode(hit_set, bl);
   ::encode(pgid.shard, bl);
+  ::encode(last_backfill, bl);
+  ::encode(last_backfill_bitwise, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_info_t::decode(bufferlist::iterator &bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(29, 26, 26, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(31, 26, 26, bl);
   if (struct_v < 23) {
     old_pg_t opgid;
     ::decode(opgid, bl);
@@ -2341,8 +2483,10 @@ void pg_info_t::decode(bufferlist::iterator &bl)
     bool log_backlog;
     ::decode(log_backlog, bl);
   }
-  if (struct_v >= 24)
-    ::decode(last_backfill, bl);
+  hobject_t old_last_backfill;
+  if (struct_v >= 24) {
+    ::decode(old_last_backfill, bl);
+  }
   ::decode(stats, bl);
   history.decode(bl);
   if (struct_v >= 22)
@@ -2366,6 +2510,13 @@ void pg_info_t::decode(bufferlist::iterator &bl)
     ::decode(pgid.shard, bl);
   else
     pgid.shard = shard_id_t::NO_SHARD;
+  if (struct_v >= 31) {
+    ::decode(last_backfill, bl);
+    ::decode(last_backfill_bitwise, bl);
+  } else {
+    last_backfill = old_last_backfill;
+    last_backfill_bitwise = false;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -2379,6 +2530,7 @@ void pg_info_t::dump(Formatter *f) const
   f->dump_stream("log_tail") << log_tail;
   f->dump_int("last_user_version", last_user_version);
   f->dump_stream("last_backfill") << last_backfill;
+  f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise);
   f->dump_stream("purged_snaps") << purged_snaps;
   f->open_object_section("history");
   history.dump(f);
@@ -2410,6 +2562,7 @@ void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
   o.back()->last_user_version = 2;
   o.back()->log_tail = eversion_t(7, 8);
   o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
+  o.back()->last_backfill_bitwise = true;
   {
     list<pg_stat_t*> s;
     pg_stat_t::generate_test_instances(s);
@@ -2563,6 +2716,8 @@ bool pg_interval_t::is_new_interval(
   int new_min_size,
   unsigned old_pg_num,
   unsigned new_pg_num,
+  bool old_sort_bitwise,
+  bool new_sort_bitwise,
   pg_t pgid) {
   return old_acting_primary != new_acting_primary ||
     new_acting != old_acting ||
@@ -2570,7 +2725,8 @@ bool pg_interval_t::is_new_interval(
     new_up != old_up ||
     old_min_size != new_min_size ||
     old_size != new_size ||
-    pgid.is_split(old_pg_num, new_pg_num, 0);
+    pgid.is_split(old_pg_num, new_pg_num, 0) ||
+    old_sort_bitwise != new_sort_bitwise;
 }
 
 bool pg_interval_t::is_new_interval(
@@ -2600,6 +2756,8 @@ bool pg_interval_t::is_new_interval(
 		    osdmap->get_pools().find(pgid.pool())->second.min_size,
 		    lastmap->get_pg_num(pgid.pool()),
 		    osdmap->get_pg_num(pgid.pool()),
+		    lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
+		    osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
 		    pgid);
 }
 
@@ -2640,6 +2798,7 @@ bool pg_interval_t::check_new_interval(
     pg_interval_t& i = (*past_intervals)[same_interval_since];
     i.first = same_interval_since;
     i.last = osdmap->get_epoch() - 1;
+    assert(i.first <= i.last);
     i.acting = old_acting;
     i.up = old_up;
     i.primary = old_acting_primary;
@@ -3127,6 +3286,12 @@ void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
   for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
        i != in.log.end(); ++i) {
 
+    // Reject pg log entries for temporary objects
+    if (i->soid.is_temp()) {
+      reject.log.push_back(*i);
+      continue;
+    }
+
     if (i->soid.nspace != hit_set_namespace) {
       object_t oid = i->soid.oid;
       object_locator_t loc(i->soid);
@@ -3280,6 +3445,17 @@ ostream& pg_log_t::print(ostream& out) const
 
 // -- pg_missing_t --
 
+void pg_missing_t::resort(bool sort_bitwise)
+{
+  if (missing.key_comp().bitwise != sort_bitwise) {
+    map<hobject_t, item, hobject_t::ComparatorWithDefault> tmp;
+    tmp.swap(missing);
+    missing = map<hobject_t, item, hobject_t::ComparatorWithDefault>(
+      hobject_t::ComparatorWithDefault(sort_bitwise));
+    missing.insert(tmp.begin(), tmp.end());
+  }
+}
+
 void pg_missing_t::encode(bufferlist &bl) const
 {
   ENCODE_START(3, 2, bl);
@@ -3295,8 +3471,8 @@ void pg_missing_t::decode(bufferlist::iterator &bl, int64_t pool)
 
   if (struct_v < 3) {
     // Handle hobject_t upgrade
-    map<hobject_t, item> tmp;
-    for (map<hobject_t, item>::iterator i = missing.begin();
+    map<hobject_t, item, hobject_t::ComparatorWithDefault> tmp;
+    for (map<hobject_t, item, hobject_t::ComparatorWithDefault>::iterator i = missing.begin();
 	 i != missing.end();
       ) {
       if (!i->first.is_max() && i->first.pool == -1) {
@@ -3311,7 +3487,7 @@ void pg_missing_t::decode(bufferlist::iterator &bl, int64_t pool)
     missing.insert(tmp.begin(), tmp.end());
   }
 
-  for (map<hobject_t,item>::iterator it = missing.begin();
+  for (map<hobject_t,item, hobject_t::ComparatorWithDefault>::iterator it = missing.begin();
        it != missing.end();
        ++it)
     rmissing[it->second.need.version] = it->first;
@@ -3320,7 +3496,7 @@ void pg_missing_t::decode(bufferlist::iterator &bl, int64_t pool)
 void pg_missing_t::dump(Formatter *f) const
 {
   f->open_array_section("missing");
-  for (map<hobject_t,item>::const_iterator p = missing.begin(); p != missing.end(); ++p) {
+  for (map<hobject_t,item, hobject_t::ComparatorWithDefault>::const_iterator p = missing.begin(); p != missing.end(); ++p) {
     f->open_object_section("item");
     f->dump_stream("object") << p->first;
     p->second.dump(f);
@@ -3376,7 +3552,7 @@ bool pg_missing_t::is_missing(const hobject_t& oid) const
 
 bool pg_missing_t::is_missing(const hobject_t& oid, eversion_t v) const
 {
-  map<hobject_t, item>::const_iterator m = missing.find(oid);
+  map<hobject_t, item, hobject_t::ComparatorWithDefault>::const_iterator m = missing.find(oid);
   if (m == missing.end())
     return false;
   const pg_missing_t::item &item(m->second);
@@ -3387,7 +3563,7 @@ bool pg_missing_t::is_missing(const hobject_t& oid, eversion_t v) const
 
 eversion_t pg_missing_t::have_old(const hobject_t& oid) const
 {
-  map<hobject_t, item>::const_iterator m = missing.find(oid);
+  map<hobject_t, item, hobject_t::ComparatorWithDefault>::const_iterator m = missing.find(oid);
   if (m == missing.end())
     return eversion_t();
   const pg_missing_t::item &item(m->second);
@@ -3401,22 +3577,26 @@ eversion_t pg_missing_t::have_old(const hobject_t& oid) const
 void pg_missing_t::add_next_event(const pg_log_entry_t& e)
 {
   if (e.is_update()) {
+    map<hobject_t, item, hobject_t::ComparatorWithDefault>::iterator missing_it;
+    missing_it = missing.find(e.soid);
+    bool is_missing_divergent_item = missing_it != missing.end();
     if (e.prior_version == eversion_t() || e.is_clone()) {
       // new object.
-      //assert(missing.count(e.soid) == 0);  // might already be missing divergent item.
-      if (missing.count(e.soid))  // already missing divergent item
-	rmissing.erase(missing[e.soid].need.version);
-      missing[e.soid] = item(e.version, eversion_t());  // .have = nil
-    } else if (missing.count(e.soid)) {
+      if (is_missing_divergent_item) {  // use iterator
+        rmissing.erase((missing_it->second).need.version);
+        missing_it->second = item(e.version, eversion_t());  // .have = nil
+      } else  // create new element in missing map
+        missing[e.soid] = item(e.version, eversion_t());     // .have = nil
+    } else if (is_missing_divergent_item) {
       // already missing (prior).
-      //assert(missing[e.soid].need == e.prior_version);
-      rmissing.erase(missing[e.soid].need.version);
-      missing[e.soid].need = e.version;  // leave .have unchanged.
+      rmissing.erase((missing_it->second).need.version);
+      (missing_it->second).need = e.version;  // leave .have unchanged.
     } else if (e.is_backlog()) {
       // May not have prior version
       assert(0 == "these don't exist anymore");
     } else {
       // not missing, we must have prior_version (if any)
+      assert(!is_missing_divergent_item);
       missing[e.soid] = item(e.version, e.prior_version);
     }
     rmissing[e.version.version] = e.soid;
@@ -3450,12 +3630,12 @@ void pg_missing_t::add(const hobject_t& oid, eversion_t need, eversion_t have)
 
 void pg_missing_t::rm(const hobject_t& oid, eversion_t v)
 {
-  std::map<hobject_t, pg_missing_t::item>::iterator p = missing.find(oid);
+  std::map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator p = missing.find(oid);
   if (p != missing.end() && p->second.need <= v)
     rm(p);
 }
 
-void pg_missing_t::rm(const std::map<hobject_t, pg_missing_t::item>::iterator &m)
+void pg_missing_t::rm(const std::map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator &m)
 {
   rmissing.erase(m->second.need.version);
   missing.erase(m);
@@ -3463,13 +3643,13 @@ void pg_missing_t::rm(const std::map<hobject_t, pg_missing_t::item>::iterator &m
 
 void pg_missing_t::got(const hobject_t& oid, eversion_t v)
 {
-  std::map<hobject_t, pg_missing_t::item>::iterator p = missing.find(oid);
+  std::map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator p = missing.find(oid);
   assert(p != missing.end());
   assert(p->second.need <= v);
   got(p);
 }
 
-void pg_missing_t::got(const std::map<hobject_t, pg_missing_t::item>::iterator &m)
+void pg_missing_t::got(const std::map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator &m)
 {
   rmissing.erase(m->second.need.version);
   missing.erase(m);
@@ -3481,7 +3661,7 @@ void pg_missing_t::split_into(
   pg_missing_t *omissing)
 {
   unsigned mask = ~((~0)<<split_bits);
-  for (map<hobject_t, item>::iterator i = missing.begin();
+  for (map<hobject_t, item, hobject_t::ComparatorWithDefault>::iterator i = missing.begin();
        i != missing.end();
        ) {
     if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
@@ -3661,15 +3841,9 @@ void object_copy_data_t::decode(bufferlist::iterator& bl)
     ::decode(data, bl);
     ::decode(omap_data, bl);
     ::decode(cursor, bl);
-    if (struct_v >= 2)
-      ::decode(omap_header, bl);
-    if (struct_v >= 3) {
-      ::decode(snaps, bl);
-      ::decode(snap_seq, bl);
-    } else {
-      snaps.clear();
-      snap_seq = 0;
-    }
+    ::decode(omap_header, bl);
+    ::decode(snaps, bl);
+    ::decode(snap_seq, bl);
     if (struct_v >= 4) {
       ::decode(flags, bl);
       ::decode(data_digest, bl);
@@ -3726,7 +3900,7 @@ void object_copy_data_t::dump(Formatter *f) const
   f->dump_int("size", size);
   f->dump_stream("mtime") << mtime;
   /* we should really print out the attrs here, but bufferlist
-     const-correctness prents that */
+     const-correctness prevents that */
   f->dump_int("attrs_size", attrs.size());
   f->dump_int("flags", flags);
   f->dump_unsigned("data_digest", data_digest);
@@ -3789,19 +3963,25 @@ void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
 
 void pg_hit_set_info_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   ::encode(begin, bl);
   ::encode(end, bl);
   ::encode(version, bl);
+  ::encode(using_gmt, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
 {
-  DECODE_START(1, p);
+  DECODE_START(2, p);
   ::decode(begin, p);
   ::decode(end, p);
   ::decode(version, p);
+  if (struct_v >= 2) {
+    ::decode(using_gmt, p);
+  } else {
+    using_gmt = false;
+  }
   DECODE_FINISH(p);
 }
 
@@ -3810,6 +3990,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const
   f->dump_stream("begin") << begin;
   f->dump_stream("end") << end;
   f->dump_stream("version") << version;
+  f->dump_stream("using_gmt") << using_gmt;
 }
 
 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
@@ -3827,8 +4008,14 @@ void pg_hit_set_history_t::encode(bufferlist& bl) const
 {
   ENCODE_START(1, 1, bl);
   ::encode(current_last_update, bl);
-  ::encode(current_last_stamp, bl);
-  ::encode(current_info, bl);
+  {
+    utime_t dummy_stamp;
+    ::encode(dummy_stamp, bl);
+  }
+  {
+    pg_hit_set_info_t dummy_info;
+    ::encode(dummy_info, bl);
+  }
   ::encode(history, bl);
   ENCODE_FINISH(bl);
 }
@@ -3837,8 +4024,14 @@ void pg_hit_set_history_t::decode(bufferlist::iterator& p)
 {
   DECODE_START(1, p);
   ::decode(current_last_update, p);
-  ::decode(current_last_stamp, p);
-  ::decode(current_info, p);
+  {
+    utime_t dummy_stamp;
+    ::decode(dummy_stamp, p);
+  }
+  {
+    pg_hit_set_info_t dummy_info;
+    ::decode(dummy_info, p);
+  }
   ::decode(history, p);
   DECODE_FINISH(p);
 }
@@ -3846,10 +4039,6 @@ void pg_hit_set_history_t::decode(bufferlist::iterator& p)
 void pg_hit_set_history_t::dump(Formatter *f) const
 {
   f->dump_stream("current_last_update") << current_last_update;
-  f->dump_stream("current_last_stamp") << current_last_stamp;
-  f->open_object_section("current_info");
-  current_info.dump(f);
-  f->close_section();
   f->open_array_section("history");
   for (list<pg_hit_set_info_t>::const_iterator p = history.begin();
        p != history.end(); ++p) {
@@ -3865,10 +4054,6 @@ void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>&
   ls.push_back(new pg_hit_set_history_t);
   ls.push_back(new pg_hit_set_history_t);
   ls.back()->current_last_update = eversion_t(1, 2);
-  ls.back()->current_last_stamp = utime_t(100, 123);
-  ls.back()->current_info.begin = utime_t(2, 4);
-  ls.back()->current_info.end = utime_t(62, 24);
-  ls.back()->history.push_back(ls.back()->current_info);
   ls.back()->history.push_back(pg_hit_set_info_t());
 }
 
@@ -3910,7 +4095,7 @@ ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
 
 void OSDSuperblock::encode(bufferlist &bl) const
 {
-  ENCODE_START(6, 5, bl);
+  ENCODE_START(8, 5, bl);
   ::encode(cluster_fsid, bl);
   ::encode(whoami, bl);
   ::encode(current_epoch, bl);
@@ -3921,13 +4106,14 @@ void OSDSuperblock::encode(bufferlist &bl) const
   ::encode(clean_thru, bl);
   ::encode(mounted, bl);
   ::encode(osd_fsid, bl);
-  ::encode(last_map_marked_full, bl);
+  ::encode((epoch_t)0, bl);  // epoch_t last_epoch_marked_full
+  ::encode((uint32_t)0, bl);  // map<int64_t,epoch_t> pool_last_epoch_marked_full
   ENCODE_FINISH(bl);
 }
 
 void OSDSuperblock::decode(bufferlist::iterator &bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
   if (struct_v < 3) {
     string magic;
     ::decode(magic, bl);
@@ -3947,8 +4133,14 @@ void OSDSuperblock::decode(bufferlist::iterator &bl)
   ::decode(mounted, bl);
   if (struct_v >= 4)
     ::decode(osd_fsid, bl);
-  if (struct_v >= 6)
+  if (struct_v >= 6) {
+    epoch_t last_map_marked_full;
     ::decode(last_map_marked_full, bl);
+  }
+  if (struct_v >= 7) {
+    map<int64_t,epoch_t> pool_last_map_marked_full;
+    ::decode(pool_last_map_marked_full, bl);
+  }
   DECODE_FINISH(bl);
 }
 
@@ -3966,7 +4158,6 @@ void OSDSuperblock::dump(Formatter *f) const
   f->close_section();
   f->dump_int("clean_thru", clean_thru);
   f->dump_int("last_epoch_mounted", mounted);
-  f->dump_int("last_map_marked_full", last_map_marked_full);
 }
 
 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
@@ -3982,7 +4173,6 @@ void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
   z.mounted = 8;
   z.clean_thru = 7;
   o.push_back(new OSDSuperblock(z));
-  z.last_map_marked_full = 7;
   o.push_back(new OSDSuperblock(z));
 }
 
@@ -4235,7 +4425,7 @@ void object_info_t::encode(bufferlist& bl) const
   ::encode(size, bl);
   ::encode(mtime, bl);
   if (soid.snap == CEPH_NOSNAP)
-    ::encode(wrlock_by, bl);
+    ::encode(osd_reqid_t(), bl);  // used to be wrlock_by
   else
     ::encode(snaps, bl);
   ::encode(truncate_seq, bl);
@@ -4259,7 +4449,7 @@ void object_info_t::encode(bufferlist& bl) const
 void object_info_t::decode(bufferlist::iterator& bl)
 {
   object_locator_t myoloc;
-  DECODE_START_LEGACY_COMPAT_LEN(14, 8, 8, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(15, 8, 8, bl);
   map<entity_name_t, watch_info_t> old_watchers;
   ::decode(soid, bl);
   ::decode(myoloc, bl);
@@ -4272,10 +4462,12 @@ void object_info_t::decode(bufferlist::iterator& bl)
   ::decode(last_reqid, bl);
   ::decode(size, bl);
   ::decode(mtime, bl);
-  if (soid.snap == CEPH_NOSNAP)
+  if (soid.snap == CEPH_NOSNAP) {
+    osd_reqid_t wrlock_by;
     ::decode(wrlock_by, bl);
-  else
+  } else {
     ::decode(snaps, bl);
+  }
   ::decode(truncate_seq, bl);
   ::decode(truncate_size, bl);
 
@@ -4347,7 +4539,6 @@ void object_info_t::dump(Formatter *f) const
   f->dump_stream("local_mtime") << local_mtime;
   f->dump_unsigned("lost", (int)is_lost());
   f->dump_unsigned("flags", (int)flags);
-  f->dump_stream("wrlock_by") << wrlock_by;
   f->open_array_section("snaps");
   for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p)
     f->dump_unsigned("snap", *p);
@@ -4380,9 +4571,7 @@ ostream& operator<<(ostream& out, const object_info_t& oi)
 {
   out << oi.soid << "(" << oi.version
       << " " << oi.last_reqid;
-  if (oi.soid.snap == CEPH_NOSNAP)
-    out << " wrlock_by=" << oi.wrlock_by;
-  else
+  if (oi.soid.snap != CEPH_NOSNAP)
     out << " " << oi.snaps;
   if (oi.flags)
     out << " " << oi.get_flag_string();
@@ -4490,9 +4679,9 @@ void ObjectRecoveryInfo::decode(bufferlist::iterator &bl,
   if (struct_v < 2) {
     if (!soid.is_max() && soid.pool == -1)
       soid.pool = pool;
-    map<hobject_t, interval_set<uint64_t> > tmp;
+    map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator> tmp;
     tmp.swap(clone_subset);
-    for (map<hobject_t, interval_set<uint64_t> >::iterator i = tmp.begin();
+    for (map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator>::iterator i = tmp.begin();
 	 i != tmp.end();
 	 ++i) {
       hobject_t first(i->first);
@@ -4541,6 +4730,7 @@ ostream &ObjectRecoveryInfo::print(ostream &out) const
 {
   return out << "ObjectRecoveryInfo("
 	     << soid << "@" << version
+	     << ", size: " << size
 	     << ", copy_subset: " << copy_subset
 	     << ", clone_subset: " << clone_subset
 	     << ")";
@@ -4770,11 +4960,11 @@ void ScrubMap::merge_incr(const ScrubMap &l)
   assert(valid_through == l.incr_since);
   valid_through = l.valid_through;
 
-  for (map<hobject_t,object>::const_iterator p = l.objects.begin();
+  for (map<hobject_t,object, hobject_t::BitwiseComparator>::const_iterator p = l.objects.begin();
        p != l.objects.end();
        ++p){
     if (p->second.negative) {
-      map<hobject_t,object>::iterator q = objects.find(p->first);
+      map<hobject_t,object, hobject_t::BitwiseComparator>::iterator q = objects.find(p->first);
       if (q != objects.end()) {
 	objects.erase(q);
       }
@@ -4812,9 +5002,9 @@ void ScrubMap::decode(bufferlist::iterator& bl, int64_t pool)
 
   // handle hobject_t upgrade
   if (struct_v < 3) {
-    map<hobject_t, object> tmp;
+    map<hobject_t, object, hobject_t::BitwiseComparator> tmp;
     tmp.swap(objects);
-    for (map<hobject_t, object>::iterator i = tmp.begin();
+    for (map<hobject_t, object, hobject_t::BitwiseComparator>::iterator i = tmp.begin();
 	 i != tmp.end();
 	 ++i) {
       hobject_t first(i->first);
@@ -4830,7 +5020,7 @@ void ScrubMap::dump(Formatter *f) const
   f->dump_stream("valid_through") << valid_through;
   f->dump_stream("incremental_since") << incr_since;
   f->open_array_section("objects");
-  for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
+  for (map<hobject_t,object, hobject_t::BitwiseComparator>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
     f->open_object_section("object");
     f->dump_string("name", p->first.oid.name);
     f->dump_unsigned("hash", p->first.get_hash());
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index b9b3b81..d51c894 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -134,14 +134,14 @@ inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
 inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
 inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<osd_reqid_t> {
     size_t operator()(const osd_reqid_t &r) const { 
       static hash<uint64_t> H;
       return H(r.name.num() ^ r.tid ^ r.inc);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 
 // -----
@@ -259,19 +259,15 @@ enum {
   CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
   CEPH_OSD_RMW_FLAG_PGOP        = (1 << 5),
   CEPH_OSD_RMW_FLAG_CACHE       = (1 << 6),
-  CEPH_OSD_RMW_FLAG_PROMOTE     = (1 << 7),
+  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE   = (1 << 7),
+  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
+  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE      = (1 << 9),
 };
 
 
 // pg stuff
 
-// object namespaces
-#define CEPH_METADATA_NS       1
-#define CEPH_DATA_NS           2
-#define CEPH_CAS_NS            3
-#define CEPH_OSDMETADATA_NS 0xff
-
-#define OSD_SUPERBLOCK_POBJECT hobject_t(sobject_t(object_t("osd_superblock"), 0))
+#define OSD_SUPERBLOCK_POBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
 
 // placement seed (a hash value)
 typedef uint32_t ps_t;
@@ -347,6 +343,10 @@ struct pg_t {
    */
   unsigned get_split_bits(unsigned pg_num) const;
 
+  bool contains(int bits, const ghobject_t& oid) {
+    return oid.match(bits, ps());
+  }
+
   void encode(bufferlist& bl) const {
     __u8 v = 1;
     ::encode(v, bl);
@@ -404,7 +404,7 @@ inline bool operator>=(const pg_t& l, const pg_t& r) {
 
 ostream& operator<<(ostream& out, const pg_t &pg);
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash< pg_t >
   {
     size_t operator()( const pg_t& x ) const
@@ -413,7 +413,7 @@ CEPH_HASH_NAMESPACE_START
       return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ x.preferred());
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 struct spg_t {
   pg_t pgid;
@@ -437,6 +437,9 @@ struct spg_t {
     return pgid.preferred();
   }
   bool parse(const char *s);
+  bool parse(const std::string& s) {
+    return parse(s.c_str());
+  }
   bool is_split(unsigned old_pg_num, unsigned new_pg_num,
 		set<spg_t> *pchildren) const {
     set<pg_t> _children;
@@ -471,12 +474,18 @@ struct spg_t {
     ::decode(shard, bl);
     DECODE_FINISH(bl);
   }
+
+  hobject_t make_temp_object(const string& name) {
+    return hobject_t(object_t(name), "", CEPH_NOSNAP,
+		     pgid.ps(),
+		     hobject_t::POOL_TEMP_START - pgid.pool(), "");
+  }
 };
 WRITE_CLASS_ENCODER(spg_t)
 WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
 WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash< spg_t >
   {
     size_t operator()( const spg_t& x ) const
@@ -485,71 +494,136 @@ CEPH_HASH_NAMESPACE_START
       return H(hash<pg_t>()(x.pgid) ^ x.shard);
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 ostream& operator<<(ostream& out, const spg_t &pg);
 
 // ----------------------
 
 class coll_t {
+  enum type_t {
+    TYPE_META = 0,
+    TYPE_LEGACY_TEMP = 1,  /* no longer used */
+    TYPE_PG = 2,
+    TYPE_PG_TEMP = 3,
+    TYPE_PG_REMOVAL = 4,   /* note: deprecated, not encoded */
+  };
+  type_t type;
+  spg_t pgid;
+  uint64_t removal_seq;  // note: deprecated, not encoded
+
+  string _str;  // cached string
+
+  void calc_str();
+
+  coll_t(type_t t, spg_t p, uint64_t r)
+    : type(t), pgid(p), removal_seq(r) {
+    calc_str();
+  }
+
 public:
-  coll_t()
-    : str("meta")
-  { }
+  coll_t() : type(TYPE_META), removal_seq(0)
+  {
+    calc_str();
+  }
 
-  explicit coll_t(const std::string &str_)
-    : str(str_)
-  { }
+  coll_t(const coll_t& other)
+    : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
+    calc_str();
+  }
 
-  explicit coll_t(spg_t pgid, snapid_t snap = CEPH_NOSNAP)
-    : str(pg_and_snap_to_str(pgid, snap))
-  { }
+  explicit coll_t(spg_t pgid)
+    : type(TYPE_PG), pgid(pgid), removal_seq(0)
+  {
+    calc_str();
+  }
 
-  static coll_t make_temp_coll(spg_t pgid) {
-    return coll_t(pg_to_tmp_str(pgid));
+  // named constructors
+  static coll_t meta() {
+    return coll_t();
+  }
+  static coll_t pg(spg_t p) {
+    return coll_t(p);
   }
 
   const std::string& to_str() const {
-    return str;
+    return _str;
   }
-
-  const char* c_str() const {
-    return str.c_str();
+  const char *c_str() const {
+    return _str.c_str();
   }
+  bool parse(const std::string& s);
 
   int operator<(const coll_t &rhs) const {
-    return str < rhs.str;
+    return type < rhs.type ||
+		  (type == rhs.type && pgid < rhs.pgid);
+  }
+
+  bool is_meta() const {
+    return type == TYPE_META;
+  }
+  bool is_pg_prefix(spg_t *pgid_) const {
+    if (type == TYPE_PG || type == TYPE_PG_TEMP || type == TYPE_PG_REMOVAL) {
+      *pgid_ = pgid;
+      return true;
+    }
+    return false;
+  }
+  bool is_pg() const {
+    return type == TYPE_PG;
+  }
+  bool is_pg(spg_t *pgid_) const {
+    if (type == TYPE_PG) {
+      *pgid_ = pgid;
+      return true;
+    }
+    return false;
+  }
+  bool is_temp() const {
+    return type == TYPE_PG_TEMP;
+  }
+  bool is_temp(spg_t *pgid_) const {
+    if (type == TYPE_PG_TEMP) {
+      *pgid_ = pgid;
+      return true;
+    }
+    return false;
+  }
+  bool is_removal() const {
+    return type == TYPE_PG_REMOVAL;
+  }
+  bool is_removal(spg_t *pgid_) const {
+    if (type == TYPE_PG_REMOVAL) {
+      *pgid_ = pgid;
+      return true;
+    }
+    return false;
   }
 
-  bool is_pg_prefix(spg_t& pgid) const;
-  bool is_pg(spg_t& pgid, snapid_t& snap) const;
-  bool is_temp(spg_t& pgid) const;
-  bool is_removal(uint64_t *seq, spg_t *pgid) const;
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl);
+
   inline bool operator==(const coll_t& rhs) const {
-    return str == rhs.str;
+    // for TYPE_META there is no pgid to compare, so matching types suffice
+    if (type != rhs.type)
+      return false;
+    if (type == TYPE_META)
+      return true;
+    return type == rhs.type && pgid == rhs.pgid;
   }
   inline bool operator!=(const coll_t& rhs) const {
-    return str != rhs.str;
+    return !(*this == rhs);
   }
 
-  void dump(Formatter *f) const;
-  static void generate_test_instances(list<coll_t*>& o);
-
-private:
-  static std::string pg_and_snap_to_str(spg_t p, snapid_t s) {
-    std::ostringstream oss;
-    oss << p << "_" << s;
-    return oss.str();
-  }
-  static std::string pg_to_tmp_str(spg_t p) {
-    std::ostringstream oss;
-    oss << p << "_TEMP";
-    return oss.str();
+  // get a TEMP collection that corresponds to the current collection,
+  // which we presume is a pg collection.
+  coll_t get_temp() const {
+    assert(type == TYPE_PG);
+    return coll_t(TYPE_PG_TEMP, pgid, 0);
   }
 
-  std::string str;
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<coll_t*>& o);
 };
 
 WRITE_CLASS_ENCODER(coll_t)
@@ -559,7 +633,7 @@ inline ostream& operator<<(ostream& out, const coll_t& c) {
   return out;
 }
 
-CEPH_HASH_NAMESPACE_START
+namespace std {
   template<> struct hash<coll_t> {
     size_t operator()(const coll_t &c) const { 
       size_t h = 0;
@@ -576,7 +650,7 @@ CEPH_HASH_NAMESPACE_START
       return h;
     }
   };
-CEPH_HASH_NAMESPACE_END
+} // namespace std
 
 inline ostream& operator<<(ostream& out, const ceph_object_layout &ol)
 {
@@ -848,6 +922,8 @@ struct pg_pool_t {
     FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
     FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
     FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
+    FLAG_NOSCRUB = 1<<8, // block periodic scrub
+    FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
   };
 
   static const char *get_flag_name(int f) {
@@ -860,6 +936,8 @@ struct pg_pool_t {
     case FLAG_NOPGCHANGE: return "nopgchange";
     case FLAG_NOSIZECHANGE: return "nosizechange";
     case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
+    case FLAG_NOSCRUB: return "noscrub";
+    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
     default: return "???";
     }
   }
@@ -894,6 +972,10 @@ struct pg_pool_t {
       return FLAG_NOSIZECHANGE;
     if (name == "write_fadvise_dontneed")
       return FLAG_WRITE_FADVISE_DONTNEED;
+    if (name == "noscrub")
+      return FLAG_NOSCRUB;
+    if (name == "nodeep-scrub")
+      return FLAG_NODEEP_SCRUB;
     return 0;
   }
 
@@ -981,7 +1063,7 @@ public:
    */
   map<snapid_t, pool_snap_info_t> snaps;
   /*
-   * Alternatively, if we are definining non-pool snaps (e.g. via the
+   * Alternatively, if we are defining non-pool snaps (e.g. via the
    * Ceph MDS), we must track @removed_snaps (since @snaps is not
    * used).  Snaps and removed_snaps are to be used exclusive of each
    * other!
@@ -1017,6 +1099,7 @@ public:
     target_max_bytes = 0;
     target_max_objects = 0;
     cache_target_dirty_ratio_micro = 0;
+    cache_target_dirty_high_ratio_micro = 0;
     cache_target_full_ratio_micro = 0;
     hit_set_params = HitSet::Params();
     hit_set_period = 0;
@@ -1027,6 +1110,7 @@ public:
   uint64_t target_max_objects; ///< tiering: target max pool size
 
   uint32_t cache_target_dirty_ratio_micro; ///< cache: fraction of target to leave dirty
+  uint32_t cache_target_dirty_high_ratio_micro; ///< cache: fraction of target to flush with high speed
   uint32_t cache_target_full_ratio_micro;  ///< cache: fraction of target to fill before we evict in earnest
 
   uint32_t cache_min_flush_age;  ///< minimum age (seconds) before we can flush
@@ -1035,12 +1119,15 @@ public:
   HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
   uint32_t hit_set_period;      ///< periodicity of HitSet segments (seconds)
   uint32_t hit_set_count;       ///< number of periods to retain
-  uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote
+  bool use_gmt_hitset;	        ///< use gmt to name the hitset archive object
+  uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote on read
+  uint32_t min_write_recency_for_promote;  ///< minimum number of HitSet to check before promote on write
 
   uint32_t stripe_width;        ///< erasure coded stripe size in bytes
 
   uint64_t expected_num_objects; ///< expected number of objects on this pool, a value of 0 indicates
                                  ///< user does not specify any expected value
+  bool fast_read;            ///< whether turn on fast read on the pool or not
 
   pg_pool_t()
     : flags(0), type(0), size(0), min_size(0),
@@ -1057,15 +1144,19 @@ public:
       cache_mode(CACHEMODE_NONE),
       target_max_bytes(0), target_max_objects(0),
       cache_target_dirty_ratio_micro(0),
+      cache_target_dirty_high_ratio_micro(0),
       cache_target_full_ratio_micro(0),
       cache_min_flush_age(0),
       cache_min_evict_age(0),
       hit_set_params(),
       hit_set_period(0),
       hit_set_count(0),
+      use_gmt_hitset(true),
       min_read_recency_for_promote(0),
+      min_write_recency_for_promote(0),
       stripe_width(0),
-      expected_num_objects(0)
+      expected_num_objects(0),
+      fast_read(false)
   { }
 
   void dump(Formatter *f) const;
@@ -1112,6 +1203,10 @@ public:
   bool is_replicated()   const { return get_type() == TYPE_REPLICATED; }
   bool is_erasure() const { return get_type() == TYPE_ERASURE; }
 
+  bool supports_omap() const {
+    return !(get_type() == TYPE_ERASURE || has_flag(FLAG_DEBUG_FAKE_EC_POOL));
+  }
+
   bool requires_aligned_append() const { return is_erasure(); }
   uint64_t required_alignment() const { return stripe_width; }
 
@@ -1254,6 +1349,15 @@ struct object_stat_sum_t {
   int64_t num_objects_omap;
   int64_t num_objects_hit_set_archive;
   int64_t num_bytes_hit_set_archive;
+  int64_t num_flush;
+  int64_t num_flush_kb;
+  int64_t num_evict;
+  int64_t num_evict_kb;
+  int64_t num_promote;
+  int32_t num_flush_mode_high;  // 1 when in high flush mode, otherwise 0
+  int32_t num_flush_mode_low;   // 1 when in low flush mode, otherwise 0
+  int32_t num_evict_mode_some;  // 1 when in evict some mode, otherwise 0
+  int32_t num_evict_mode_full;  // 1 when in evict full mode, otherwise 0
 
   object_stat_sum_t()
     : num_bytes(0),
@@ -1271,7 +1375,14 @@ struct object_stat_sum_t {
       num_whiteouts(0),
       num_objects_omap(0),
       num_objects_hit_set_archive(0),
-      num_bytes_hit_set_archive(0)
+      num_bytes_hit_set_archive(0),
+      num_flush(0),
+      num_flush_kb(0),
+      num_evict(0),
+      num_evict_kb(0),
+      num_promote(0),
+      num_flush_mode_high(0), num_flush_mode_low(0),
+      num_evict_mode_some(0), num_evict_mode_full(0)
   {}
 
   void floor(int64_t f) {
@@ -1299,6 +1410,15 @@ struct object_stat_sum_t {
     FLOOR(num_objects_omap);
     FLOOR(num_objects_hit_set_archive);
     FLOOR(num_bytes_hit_set_archive);
+    FLOOR(num_flush);
+    FLOOR(num_flush_kb);
+    FLOOR(num_evict);
+    FLOOR(num_evict_kb);
+    FLOOR(num_promote);
+    FLOOR(num_flush_mode_high);
+    FLOOR(num_flush_mode_low);
+    FLOOR(num_evict_mode_some);
+    FLOOR(num_evict_mode_full);
 #undef FLOOR
   }
 
@@ -1334,6 +1454,15 @@ struct object_stat_sum_t {
     SPLIT(num_objects_omap);
     SPLIT(num_objects_hit_set_archive);
     SPLIT(num_bytes_hit_set_archive);
+    SPLIT(num_flush);
+    SPLIT(num_flush_kb);
+    SPLIT(num_evict);
+    SPLIT(num_evict_kb);
+    SPLIT(num_promote);
+    SPLIT(num_flush_mode_high);
+    SPLIT(num_flush_mode_low);
+    SPLIT(num_evict_mode_some);
+    SPLIT(num_evict_mode_full);
 #undef SPLIT
   }
 
@@ -1600,10 +1729,9 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
 struct pg_hit_set_info_t {
   utime_t begin, end;   ///< time interval
   eversion_t version;   ///< version this HitSet object was written
-
-  pg_hit_set_info_t() {}
-  pg_hit_set_info_t(utime_t b)
-    : begin(b) {}
+  bool using_gmt;	///< use gmt for creating the hit_set archive object name
+  pg_hit_set_info_t(bool using_gmt = true)
+    : using_gmt(using_gmt) {}
 
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);
@@ -1620,8 +1748,6 @@ WRITE_CLASS_ENCODER(pg_hit_set_info_t)
  */
 struct pg_hit_set_history_t {
   eversion_t current_last_update;  ///< last version inserted into current set
-  utime_t current_last_stamp;      ///< timestamp of last insert
-  pg_hit_set_info_t current_info;  ///< metadata about the current set
   list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
 
   void encode(bufferlist &bl) const;
@@ -1645,6 +1771,7 @@ struct pg_history_t {
   epoch_t last_epoch_started;  // lower bound on last epoch started (anywhere, not necessarily locally)
   epoch_t last_epoch_clean;    // lower bound on last epoch the PG was completely clean.
   epoch_t last_epoch_split;    // as parent
+  epoch_t last_epoch_marked_full;  // pool or cluster
   
   /**
    * In the event of a map discontinuity, same_*_since may reflect the first
@@ -1666,6 +1793,7 @@ struct pg_history_t {
   pg_history_t()
     : epoch_created(0),
       last_epoch_started(0), last_epoch_clean(0), last_epoch_split(0),
+      last_epoch_marked_full(0),
       same_up_since(0), same_interval_since(0), same_primary_since(0) {}
   
   bool merge(const pg_history_t &other) {
@@ -1687,6 +1815,10 @@ struct pg_history_t {
       last_epoch_split = other.last_epoch_split; 
       modified = true;
     }
+    if (last_epoch_marked_full < other.last_epoch_marked_full) {
+      last_epoch_marked_full = other.last_epoch_marked_full;
+      modified = true;
+    }
     if (other.last_scrub > last_scrub) {
       last_scrub = other.last_scrub;
       modified = true;
@@ -1719,7 +1851,8 @@ WRITE_CLASS_ENCODER(pg_history_t)
 
 inline ostream& operator<<(ostream& out, const pg_history_t& h) {
   return out << "ec=" << h.epoch_created
-	     << " les/c " << h.last_epoch_started << "/" << h.last_epoch_clean
+	     << " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
+	     << "/" << h.last_epoch_marked_full
 	     << " " << h.same_up_since << "/" << h.same_interval_since << "/" << h.same_primary_since;
 }
 
@@ -1735,15 +1868,16 @@ inline ostream& operator<<(ostream& out, const pg_history_t& h) {
  */
 struct pg_info_t {
   spg_t pgid;
-  eversion_t last_update;    // last object version applied to store.
-  eversion_t last_complete;  // last version pg was complete through.
-  epoch_t last_epoch_started;// last epoch at which this pg started on this osd
+  eversion_t last_update;      ///< last object version applied to store.
+  eversion_t last_complete;    ///< last version pg was complete through.
+  epoch_t last_epoch_started;  ///< last epoch at which this pg started on this osd
   
-  version_t last_user_version; // last user object version applied to store
+  version_t last_user_version; ///< last user object version applied to store
 
-  eversion_t log_tail;     // oldest log entry.
+  eversion_t log_tail;         ///< oldest log entry.
 
-  hobject_t last_backfill;   // objects >= this and < last_complete may be missing
+  hobject_t last_backfill;     ///< objects >= this and < last_complete may be missing
+  bool last_backfill_bitwise;  ///< true if last_backfill reflects a bitwise (vs nibblewise) sort
 
   interval_set<snapid_t> purged_snaps;
 
@@ -1754,14 +1888,21 @@ struct pg_info_t {
 
   pg_info_t()
     : last_epoch_started(0), last_user_version(0),
-      last_backfill(hobject_t::get_max())
+      last_backfill(hobject_t::get_max()),
+      last_backfill_bitwise(false)
   { }
   pg_info_t(spg_t p)
     : pgid(p),
       last_epoch_started(0), last_user_version(0),
-      last_backfill(hobject_t::get_max())
+      last_backfill(hobject_t::get_max()),
+      last_backfill_bitwise(false)
   { }
   
+  void set_last_backfill(hobject_t pos, bool sort) {
+    last_backfill = pos;
+    last_backfill_bitwise = sort;
+  }
+
   bool is_empty() const { return last_update.version == 0; }
   bool dne() const { return history.epoch_created == 0; }
 
@@ -1793,7 +1934,8 @@ inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
     out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
   }
   if (pgi.is_incomplete())
-    out << " lb " << pgi.last_backfill;
+    out << " lb " << pgi.last_backfill
+	<< (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
   //out << " c " << pgi.epoch_created;
   out << " local-les=" << pgi.last_epoch_started;
   out << " n=" << pgi.stats.stats.sum.num_objects;
@@ -1872,6 +2014,8 @@ struct pg_interval_t {
     int new_min_size,
     unsigned old_pg_num,
     unsigned new_pg_num,
+    bool old_sort_bitwise,
+    bool new_sort_bitwise,
     pg_t pgid
     );
 
@@ -2328,14 +2472,14 @@ struct pg_log_t {
    *
    * @param other pg_log_t to copy from
    * @param from copy entries after this version
-   * @parem to up to and including this version
+   * @param to up to and including this version
    */
   void copy_range(const pg_log_t &other, eversion_t from, eversion_t to);
 
   /**
    * copy up to N entries
    *
-   * @param o source log
+   * @param other source log
    * @param max max number of entreis to copy
    */
   void copy_up_to(const pg_log_t &other, int max);
@@ -2391,7 +2535,7 @@ struct pg_missing_t {
   }; 
   WRITE_CLASS_ENCODER(item)
 
-  map<hobject_t, item> missing;         // oid -> (need v, have v)
+  map<hobject_t, item, hobject_t::ComparatorWithDefault> missing;  // oid -> (need v, have v)
   map<version_t, hobject_t> rmissing;  // v -> oid
 
   unsigned int num_missing() const;
@@ -2405,9 +2549,9 @@ struct pg_missing_t {
   void revise_have(hobject_t oid, eversion_t have);
   void add(const hobject_t& oid, eversion_t need, eversion_t have);
   void rm(const hobject_t& oid, eversion_t v);
-  void rm(const std::map<hobject_t, pg_missing_t::item>::iterator &m);
+  void rm(const std::map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator &m);
   void got(const hobject_t& oid, eversion_t v);
-  void got(const std::map<hobject_t, pg_missing_t::item>::iterator &m);
+  void got(const std::map<hobject_t, pg_missing_t::item, hobject_t::ComparatorWithDefault>::iterator &m);
   void split_into(pg_t child_pgid, unsigned split_bits, pg_missing_t *omissing);
 
   void clear() {
@@ -2415,6 +2559,8 @@ struct pg_missing_t {
     rmissing.clear();
   }
 
+  void resort(bool sort_bitwise);
+
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl, int64_t pool = -1);
   void dump(Formatter *f) const;
@@ -2710,12 +2856,11 @@ public:
   // last interval over which i mounted and was then active
   epoch_t mounted;     // last epoch i mounted
   epoch_t clean_thru;  // epoch i was active and clean thru
-  epoch_t last_map_marked_full; // last epoch osdmap was marked full
 
   OSDSuperblock() : 
     whoami(-1), 
     current_epoch(0), oldest_map(0), newest_map(0), weight(0),
-    mounted(0), clean_thru(0), last_map_marked_full(0) {
+    mounted(0), clean_thru(0) {
   }
 
   void encode(bufferlist &bl) const;
@@ -2836,12 +2981,15 @@ static inline ostream& operator<<(ostream& out, const watch_info_t& w) {
 
 struct notify_info_t {
   uint64_t cookie;
+  uint64_t notify_id;
   uint32_t timeout;
   bufferlist bl;
 };
 
 static inline ostream& operator<<(ostream& out, const notify_info_t& n) {
-  return out << "notify(cookie " << n.cookie << " " << n.timeout << "s)";
+  return out << "notify(cookie " << n.cookie
+	     << " notify" << n.notify_id
+	     << " " << n.timeout << "s)";
 }
 
 
@@ -2894,7 +3042,6 @@ struct object_info_t {
     return get_flag_string(flags);
   }
 
-  osd_reqid_t wrlock_by;   // [head]
   vector<snapid_t> snaps;  // [clone]
 
   uint64_t truncate_seq, truncate_size;
@@ -3306,7 +3453,7 @@ public:
     return blocked;
   }
 
-  // do simple synchronous mutual exclusion, for now.  now waitqueues or anything fancy.
+  // do simple synchronous mutual exclusion, for now.  no waitqueues or anything fancy.
   void ondisk_write_lock() {
     lock.Lock();
     writers_waiting++;
@@ -3395,7 +3542,7 @@ struct ObjectRecoveryInfo {
   object_info_t oi;
   SnapSet ss;
   interval_set<uint64_t> copy_subset;
-  map<hobject_t, interval_set<uint64_t> > clone_subset;
+  map<hobject_t, interval_set<uint64_t>, hobject_t::BitwiseComparator> clone_subset;
 
   ObjectRecoveryInfo() : size(0) { }
 
@@ -3521,7 +3668,7 @@ struct ScrubMap {
   };
   WRITE_CLASS_ENCODER(object)
 
-  map<hobject_t,object> objects;
+  map<hobject_t,object, hobject_t::BitwiseComparator> objects;
   eversion_t valid_through;
   eversion_t incr_since;
 
@@ -3562,7 +3709,7 @@ struct OSDOp {
    * buffer, including the sobject_t soid.
    *
    * @param ops [in] vector of OSDOps
-   * @param in  [out] combined data buffer
+   * @param out [out] combined data buffer
    */
   static void merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out);
 
@@ -3578,7 +3725,7 @@ struct OSDOp {
    * merge outdata members of a vector of OSDOps into a single bufferlist
    *
    * @param ops [in] vector of OSDOps
-   * @param in  [out] combined data buffer
+   * @param out [out] combined data buffer
    */
   static void merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out);
 };
diff --git a/src/osdc/Filer.cc b/src/osdc/Filer.cc
index d89c878..5c8a13d 100644
--- a/src/osdc/Filer.cc
+++ b/src/osdc/Filer.cc
@@ -324,7 +324,7 @@ void Filer::_do_purge_range(PurgeRange *pr, int fin)
 
   std::vector<object_t> remove_oids;
 
-  int max = 10 - pr->uncommitted;
+  int max = cct->_conf->filer_max_purge_ops - pr->uncommitted;
   while (pr->num > 0 && max > 0) {
     remove_oids.push_back(file_object_t(pr->ino, pr->first));
     pr->uncommitted++;
diff --git a/src/osdc/Filer.h b/src/osdc/Filer.h
index cbf3fa2..26a9204 100644
--- a/src/osdc/Filer.h
+++ b/src/osdc/Filer.h
@@ -106,11 +106,12 @@ class Filer {
            uint64_t len, 
            bufferlist *bl,   // ptr to data
 	   int flags,
-           Context *onfinish) {
+           Context *onfinish,
+	   int op_flags = 0) {
     assert(snap);  // (until there is a non-NOSNAP write)
     vector<ObjectExtent> extents;
     Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
-    objecter->sg_read(extents, snap, bl, flags, onfinish);
+    objecter->sg_read(extents, snap, bl, flags, onfinish, op_flags);
     return 0;
   }
 
@@ -123,12 +124,13 @@ class Filer {
 	   int flags,
 	   uint64_t truncate_size,
 	   __u32 truncate_seq,
-           Context *onfinish) {
+           Context *onfinish,
+	   int op_flags = 0) {
     assert(snap);  // (until there is a non-NOSNAP write)
     vector<ObjectExtent> extents;
     Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size, extents);
     objecter->sg_read_trunc(extents, snap, bl, flags,
-			    truncate_size, truncate_seq, onfinish);
+			    truncate_size, truncate_seq, onfinish, op_flags);
     return 0;
   }
 
@@ -141,10 +143,11 @@ class Filer {
 	    utime_t mtime,
             int flags, 
             Context *onack,
-            Context *oncommit) {
+            Context *oncommit,
+	    int op_flags = 0) {
     vector<ObjectExtent> extents;
     Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
-    objecter->sg_write(extents, snapc, bl, mtime, flags, onack, oncommit);
+    objecter->sg_write(extents, snapc, bl, mtime, flags, onack, oncommit, op_flags);
     return 0;
   }
 
@@ -159,11 +162,12 @@ class Filer {
 	   uint64_t truncate_size,
 	   __u32 truncate_seq,
             Context *onack,
-            Context *oncommit) {
+            Context *oncommit,
+	    int op_flags = 0) {
     vector<ObjectExtent> extents;
     Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size, extents);
     objecter->sg_write_trunc(extents, snapc, bl, mtime, flags,
-		       truncate_size, truncate_seq, onack, oncommit);
+		       truncate_size, truncate_seq, onack, oncommit, op_flags);
     return 0;
   }
 
diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc
index a8712e6..5b99517 100644
--- a/src/osdc/Journaler.cc
+++ b/src/osdc/Journaler.cc
@@ -55,7 +55,7 @@ void Journaler::create(ceph_file_layout *l, stream_format_t const sf)
 
   prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos =
     read_pos = requested_pos = received_pos =
-    expire_pos = trimming_pos = trimmed_pos = layout.fl_stripe_count * layout.fl_object_size;
+    expire_pos = trimming_pos = trimmed_pos = (uint64_t)layout.fl_stripe_count * layout.fl_object_size;
 
   ldout(cct, 1) << "created blank journal at inode 0x" << std::hex << ino << std::dec
     << ", format=" << stream_format << dendl;
@@ -141,6 +141,10 @@ public:
 void Journaler::recover(Context *onread) 
 {
   Mutex::Locker l(lock);
+  if (stopping) {
+    onread->complete(-EAGAIN);
+    return;
+  }
 
   ldout(cct, 1) << "recover start" << dendl;
   assert(state != STATE_ACTIVE);
@@ -150,7 +154,7 @@ void Journaler::recover(Context *onread)
     waitfor_recover.push_back(onread);
   
   if (state != STATE_UNDEF) {
-    ldout(cct, 1) << "recover - already recoverying" << dendl;
+    ldout(cct, 1) << "recover - already recovering" << dendl;
     return;
   }
 
@@ -202,14 +206,22 @@ void Journaler::_finish_reread_head(int r, bufferlist& bl, Context *finish)
   assert(bl.length() || r < 0 );
 
   // unpack header
-  Header h;
-  bufferlist::iterator p = bl.begin();
-  ::decode(h, p);
-  prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = h.write_pos;
-  expire_pos = h.expire_pos;
-  trimmed_pos = trimming_pos = h.trimmed_pos;
-  init_headers(h);
-  state = STATE_ACTIVE;
+  if (r == 0) {
+    Header h;
+    bufferlist::iterator p = bl.begin();
+    try {
+      ::decode(h, p);
+    } catch (const buffer::error &e) {
+      finish->complete(-EINVAL);
+      return;
+    }
+    prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = h.write_pos;
+    expire_pos = h.expire_pos;
+    trimmed_pos = trimming_pos = h.trimmed_pos;
+    init_headers(h);
+    state = STATE_ACTIVE;
+  }
+
   finish->complete(r);
 }
 
@@ -238,17 +250,21 @@ void Journaler::_finish_read_head(int r, bufferlist& bl)
   } 
 
   // unpack header
+  bool corrupt = false;
   Header h;
   bufferlist::iterator p = bl.begin();
-  ::decode(h, p);
-
-  bool corrupt = false;
-  if (h.magic != magic) {
-    ldout(cct, 0) << "on disk magic '" << h.magic << "' != my magic '"
-	    << magic << "'" << dendl;
-    corrupt = true;
-  } else if (h.write_pos < h.expire_pos || h.expire_pos < h.trimmed_pos) {
-    ldout(cct, 0) << "Corrupt header (bad offsets): " << h << dendl;
+  try {
+    ::decode(h, p);
+
+    if (h.magic != magic) {
+      ldout(cct, 0) << "on disk magic '" << h.magic << "' != my magic '"
+              << magic << "'" << dendl;
+      corrupt = true;
+    } else if (h.write_pos < h.expire_pos || h.expire_pos < h.trimmed_pos) {
+      ldout(cct, 0) << "Corrupt header (bad offsets): " << h << dendl;
+      corrupt = true;
+    }
+  } catch (const buffer::error &e) {
     corrupt = true;
   }
 
@@ -415,7 +431,8 @@ void Journaler::_write_head(Context *oncommit)
   object_locator_t oloc(pg_pool);
   objecter->write_full(oid, oloc, snapc, bl, ceph_clock_now(cct), 0, 
 		       NULL, 
-		       wrap_finisher(new C_WriteHead(this, last_written, wrap_finisher(oncommit))));
+		       wrap_finisher(new C_WriteHead(this, last_written, wrap_finisher(oncommit))),
+		       0, 0, write_iohint);
 }
 
 void Journaler::_finish_write_head(int r, Header &wrote, C_OnFinisher *oncommit)
@@ -607,7 +624,7 @@ void Journaler::_do_flush(unsigned amount)
   filer.write(ino, &layout, snapc,
 	      flush_pos, len, write_bl, ceph_clock_now(cct),
 	      0,
-	      NULL, wrap_finisher(onsafe));
+	      NULL, wrap_finisher(onsafe), write_iohint);
 
   flush_pos += len;
   assert(write_buf.length() == write_pos - flush_pos);
@@ -622,6 +639,10 @@ void Journaler::_do_flush(unsigned amount)
 void Journaler::wait_for_flush(Context *onsafe)
 {
   Mutex::Locker l(lock);
+  if (stopping) {
+    onsafe->complete(-EAGAIN);
+    return;
+  }
   _wait_for_flush(onsafe);
 }
 
@@ -782,11 +803,12 @@ void Journaler::_finish_prezero(int r, uint64_t start, uint64_t len)
 class Journaler::C_Read : public Context {
   Journaler *ls;
   uint64_t offset;
+  uint64_t length;
 public:
   bufferlist bl;
-  C_Read(Journaler *l, uint64_t o) : ls(l), offset(o) {}
+  C_Read(Journaler *j, uint64_t o, uint64_t l) : ls(j), offset(o), length(l) {}
   void finish(int r) {
-    ls->_finish_read(r, offset, bl);
+    ls->_finish_read(r, offset, length, bl);
   }
 };
 
@@ -801,32 +823,50 @@ public:
   }  
 };
 
-void Journaler::_finish_read(int r, uint64_t offset, bufferlist& bl)
+void Journaler::_finish_read(int r, uint64_t offset, uint64_t length, bufferlist& bl)
 {
   Mutex::Locker l(lock);
 
   if (r < 0) {
     ldout(cct, 0) << "_finish_read got error " << r << dendl;
     error = r;
+  } else {
+    ldout(cct, 10) << "_finish_read got " << offset << "~" << bl.length() << dendl;
+    if (bl.length() < length) {
+      ldout(cct, 0) << "_finish_read got less than expected (" << length << ")" << dendl;
+      error = -EINVAL;
+    }
+  }
+
+  if (error) {
     if (on_readable) {
       C_OnFinisher *f = on_readable;
       on_readable = 0;
-      f->complete(r);
+      f->complete(error);
     }
     return;
   }
-  assert(r>=0);
 
-  ldout(cct, 10) << "_finish_read got " << offset << "~" << bl.length() << dendl;
   prefetch_buf[offset].swap(bl);
 
-  _assimilate_prefetch();
+  try {
+    _assimilate_prefetch();
+  } catch (const buffer::error &err) {
+    lderr(cct) << "_decode error from assimilate_prefetch" << dendl;
+    error = -EINVAL;
+    if (on_readable) {
+      C_OnFinisher *f = on_readable;
+      on_readable = 0;
+      f->complete(error);
+    }
+    return;
+  }
   _prefetch();
 }
 
 void Journaler::_assimilate_prefetch()
 {
-  bool was_readable = _is_readable();
+  bool was_readable = readable;
 
   bool got_any = false;
   while (!prefetch_buf.empty()) {
@@ -846,13 +886,17 @@ void Journaler::_assimilate_prefetch()
     got_any = true;
   }
 
-  if (got_any)
+  if (got_any) {
     ldout(cct, 10) << "_assimilate_prefetch read_buf now " << read_pos << "~" << read_buf.length() 
 	     << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos
 	     << dendl;
 
-  if ((got_any && !was_readable && _is_readable()) ||
-      read_pos == write_pos) {
+    // Update readability (this will also hit any decode errors resulting
+    // from bad data)
+    readable = _is_readable();
+  }
+
+  if ((got_any && !was_readable && readable) || read_pos == write_pos) {
     // readable!
     ldout(cct, 10) << "_finish_read now readable (or at journal end)" << dendl;
     if (on_readable) {
@@ -904,8 +948,8 @@ void Journaler::_issue_read(uint64_t len)
     uint64_t l = e - requested_pos;
     if (l > len)
       l = len;
-    C_Read *c = new C_Read(this, requested_pos);
-    filer.read(ino, &layout, CEPH_NOSNAP, requested_pos, l, &c->bl, 0, wrap_finisher(c));
+    C_Read *c = new C_Read(this, requested_pos, l);
+    filer.read(ino, &layout, CEPH_NOSNAP, requested_pos, l, &c->bl, 0, wrap_finisher(c), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
     requested_pos += l;
     len -= l;
   }
@@ -996,7 +1040,11 @@ bool Journaler::is_readable()
 {
   Mutex::Locker l(lock);
 
-  bool r = _is_readable();
+  if (error != 0) {
+    return false;
+  }
+
+  bool r = readable;
   _prefetch();
   return r;
 }
@@ -1046,27 +1094,42 @@ void Journaler::_finish_erase(int data_result, C_OnFinisher *completion)
 
 /* try_read_entry(bl)
  *  read entry into bl if it's ready.
- *  otherwise, do nothing.  (well, we'll start fetching it for good measure.)
+ *  otherwise, do nothing.
  */
 bool Journaler::try_read_entry(bufferlist& bl)
 {
   Mutex::Locker l(lock);
 
-  if (!_is_readable()) {  // this may start a read.
+  if (!readable) {
     ldout(cct, 10) << "try_read_entry at " << read_pos << " not readable" << dendl;
     return false;
   }
 
   uint64_t start_ptr;
-  size_t consumed = journal_stream.read(read_buf, &bl, &start_ptr);
-  if (stream_format >= JOURNAL_FORMAT_RESILIENT) {
-    assert(start_ptr == read_pos);
+  size_t consumed;
+  try {
+    consumed = journal_stream.read(read_buf, &bl, &start_ptr);
+    if (stream_format >= JOURNAL_FORMAT_RESILIENT) {
+      assert(start_ptr == read_pos);
+    }
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << ": decode error from journal_stream" << dendl;
+    error = -EINVAL;
+    return false;
   }
 
   ldout(cct, 10) << "try_read_entry at " << read_pos << " read " 
 	   << read_pos << "~" << consumed << " (have " << read_buf.length() << ")" << dendl;
 
   read_pos += consumed;
+  try {
+    // We were readable, we might not be any more
+    readable = _is_readable();
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << ": decode error from _is_readable" << dendl;
+    error = -EINVAL;
+    return false;
+  }
 
   // prefetch?
   _prefetch();
@@ -1076,9 +1139,13 @@ bool Journaler::try_read_entry(bufferlist& bl)
 void Journaler::wait_for_readable(Context *onreadable)
 {
   Mutex::Locker l(lock);
+  if (stopping) {
+    onreadable->complete(-EAGAIN);
+    return;
+  }
 
   assert(on_readable == 0);
-  if (!_is_readable()) {
+  if (!readable) {
     ldout(cct, 10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << dendl;
     on_readable = wrap_finisher(onreadable);
   } else {
@@ -1271,7 +1338,6 @@ size_t JournalStream::read(bufferlist &from, bufferlist *entry, uint64_t *start_
     assert(entry_sentinel == sentinel);
   }
   ::decode(entry_size, from_ptr);
-  assert(entry_size != 0);
 
   // Read out the payload
   from_ptr.copy(entry_size, *entry);
@@ -1350,3 +1416,30 @@ C_OnFinisher *Journaler::wrap_finisher(Context *c)
     return NULL;
   }
 }
+
+void Journaler::shutdown()
+{
+  Mutex::Locker l(lock);
+
+  ldout(cct, 1) << __func__ << dendl;
+
+  readable = false;
+  stopping = true;
+
+  // Kick out anyone reading from journal
+  error = -EAGAIN;
+  if (on_readable) {
+    C_OnFinisher *f = on_readable;
+    on_readable = 0;
+    f->complete(-EAGAIN);
+  }
+
+  finish_contexts(cct, waitfor_recover, 0);
+
+  std::map<uint64_t, std::list<Context*> >::iterator i;
+  for (i = waitfor_safe.begin(); i != waitfor_safe.end(); ++i) {
+    finish_contexts(cct, i->second, -EAGAIN);
+  }
+  waitfor_safe.clear();
+}
+
diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h
index c4e9b2f..9db19c1 100644
--- a/src/osdc/Journaler.h
+++ b/src/osdc/Journaler.h
@@ -164,11 +164,11 @@ public:
 	f->open_object_section("layout");
 	{
 	  f->dump_unsigned("stripe_unit", layout.fl_stripe_unit);
-	  f->dump_unsigned("stripe_count", layout.fl_stripe_unit);
-	  f->dump_unsigned("object_size", layout.fl_stripe_unit);
-	  f->dump_unsigned("cas_hash", layout.fl_stripe_unit);
-	  f->dump_unsigned("object_stripe_unit", layout.fl_stripe_unit);
-	  f->dump_unsigned("pg_pool", layout.fl_stripe_unit);
+	  f->dump_unsigned("stripe_count", layout.fl_stripe_count);
+	  f->dump_unsigned("object_size", layout.fl_object_size);
+	  f->dump_unsigned("cas_hash", layout.fl_cas_hash);
+	  f->dump_unsigned("object_stripe_unit", layout.fl_object_stripe_unit);
+	  f->dump_unsigned("pg_pool", layout.fl_pg_pool);
 	}
 	f->close_section(); // layout
       }
@@ -321,7 +321,7 @@ private:
   C_OnFinisher    *on_write_error;
   bool             called_write_error;
 
-  void _finish_read(int r, uint64_t offset, bufferlist &bl); // read completion callback
+  void _finish_read(int r, uint64_t offset, uint64_t length, bufferlist &bl); // read completion callback
   void _finish_retry_read(int r);
   void _assimilate_prefetch();
   void _issue_read(uint64_t len);  // read some more
@@ -336,6 +336,8 @@ private:
   uint64_t trimming_pos;      // what we've requested to trim through
   uint64_t trimmed_pos;   // what has been trimmed
 
+  bool readable;
+
   void _finish_trim(int r, uint64_t to);
   class C_Trim;
   friend class C_Trim;
@@ -369,6 +371,8 @@ private:
 
   C_OnFinisher *wrap_finisher(Context *c);
 
+  uint32_t write_iohint; //the fadvise flags for write op, see CEPH_OSD_OP_FADIVSE_*
+
 public:
   Journaler(inodeno_t ino_, int64_t pool, const char *mag, Objecter *obj, PerfCounters *l, int lkey, SafeTimer *tim, Finisher *f) : 
     last_committed(mag),
@@ -385,7 +389,8 @@ public:
     read_pos(0), requested_pos(0), received_pos(0),
     fetch_len(0), temp_fetch_len(0),
     on_readable(0), on_write_error(NULL), called_write_error(false),
-    expire_pos(0), trimming_pos(0), trimmed_pos(0)
+    expire_pos(0), trimming_pos(0), trimmed_pos(0), readable(false),
+    write_iohint(0), stopping(false)
   {
     memset(&layout, 0, sizeof(layout));
   }
@@ -466,6 +471,18 @@ public:
 
   void set_write_error_handler(Context *c);
 
+  void set_write_iohint(uint32_t iohint_flags) {
+    write_iohint = iohint_flags;
+  }
+  /**
+   * Cause any ongoing waits to error out with -EAGAIN, set error
+   * to -EAGAIN.
+   */
+  void shutdown();
+protected:
+  bool stopping;
+public:
+
   // Synchronous getters
   // ===================
   // TODO: need some locks on reads for true safety
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index d21292e..d1f8020 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -34,6 +34,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, loff_t o
 
   //inherit and if later access, this auto clean.
   right->set_dontneed(left->get_dontneed());
+  right->set_nocache(left->get_nocache());
 
   right->last_write_tid = left->last_write_tid;
   right->last_read_tid = left->last_read_tid;
@@ -103,6 +104,7 @@ void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right)
   left->last_write = MAX( left->last_write, right->last_write );
 
   left->set_dontneed(right->get_dontneed() ? left->get_dontneed() : false);
+  left->set_nocache(right->get_nocache() ? left->get_nocache() : false);
 
   // waiters
   for (map<loff_t, list<Context*> >::iterator p = right->waitfor_read.begin();
@@ -560,18 +562,18 @@ void ObjectCacher::perf_start()
   string n = "objectcacher-" + name;
   PerfCountersBuilder plb(cct, n, l_objectcacher_first, l_objectcacher_last);
 
-  plb.add_u64_counter(l_objectcacher_cache_ops_hit, "cache_ops_hit");
-  plb.add_u64_counter(l_objectcacher_cache_ops_miss, "cache_ops_miss");
-  plb.add_u64_counter(l_objectcacher_cache_bytes_hit, "cache_bytes_hit");
-  plb.add_u64_counter(l_objectcacher_cache_bytes_miss, "cache_bytes_miss");
-  plb.add_u64_counter(l_objectcacher_data_read, "data_read");
-  plb.add_u64_counter(l_objectcacher_data_written, "data_written");
-  plb.add_u64_counter(l_objectcacher_data_flushed, "data_flushed");
+  plb.add_u64_counter(l_objectcacher_cache_ops_hit, "cache_ops_hit", "Hit operations");
+  plb.add_u64_counter(l_objectcacher_cache_ops_miss, "cache_ops_miss", "Miss operations");
+  plb.add_u64_counter(l_objectcacher_cache_bytes_hit, "cache_bytes_hit", "Hit data");
+  plb.add_u64_counter(l_objectcacher_cache_bytes_miss, "cache_bytes_miss", "Miss data");
+  plb.add_u64_counter(l_objectcacher_data_read, "data_read", "Read data");
+  plb.add_u64_counter(l_objectcacher_data_written, "data_written", "Data written to cache");
+  plb.add_u64_counter(l_objectcacher_data_flushed, "data_flushed", "Data flushed");
   plb.add_u64_counter(l_objectcacher_overwritten_in_flush,
-                      "data_overwritten_while_flushing");
-  plb.add_u64_counter(l_objectcacher_write_ops_blocked, "write_ops_blocked");
-  plb.add_u64_counter(l_objectcacher_write_bytes_blocked, "write_bytes_blocked");
-  plb.add_time(l_objectcacher_write_time_blocked, "write_time_blocked");
+                      "data_overwritten_while_flushing", "Data overwritten while flushing");
+  plb.add_u64_counter(l_objectcacher_write_ops_blocked, "write_ops_blocked", "Write operations, delayed due to dirty limits");
+  plb.add_u64_counter(l_objectcacher_write_bytes_blocked, "write_bytes_blocked", "Write data blocked on dirty limit");
+  plb.add_time(l_objectcacher_write_time_blocked, "write_time_blocked", "Time spent blocking a write due to dirty limits");
 
   perfcounter = plb.create_perf_counters();
   cct->get_perfcounters_collection()->add(perfcounter);
@@ -918,6 +920,8 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
       if (r >= 0) {
 	// ok!  mark bh clean and error-free
 	mark_clean(bh);
+	if (bh->get_nocache())
+	  bh_lru_rest.lru_bottouch(bh);
 	hit.push_back(bh);
 	ldout(cct, 10) << "bh_write_commit clean " << *bh << dendl;
       } else {
@@ -1068,6 +1072,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
   uint64_t total_bytes_read = 0;
   map<uint64_t, bufferlist> stripe_map;  // final buffer offset -> substring
   bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+  bool nocache = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
 
   /*
    * WARNING: we can only meaningfully return ENOENT if the read request
@@ -1158,7 +1163,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 	uint64_t rx_bytes = static_cast<uint64_t>(
 	  stat_rx + bh_it->second->length());
         bytes_not_in_cache += bh_it->second->length();
-	if (!waitfor_read.empty() || rx_bytes > max_size) {
+	if (!waitfor_read.empty() || (stat_rx > 0 && rx_bytes > max_size)) {
 	  // cache is full with concurrent reads -- wait for rx's to complete
 	  // to constrain memory growth (especially during copy-ups)
 	  if (success) {
@@ -1172,6 +1177,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 	  bh_remove(o, bh_it->second);
 	  delete bh_it->second;
 	} else {
+	  bh_it->second->set_nocache(nocache);
 	  bh_read(bh_it->second, rd->fadvise_flags);
 	  if ((success && onfinish) || last != missing.end())
 	    last = bh_it;
@@ -1218,7 +1224,10 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 	  error = bh->error;
         bytes_in_cache += bh->length();
 
-	touch_bh(bh);
+	if (bh->get_nocache() && bh->is_clean())
+	  bh_lru_rest.lru_bottouch(bh);
+	else
+	  touch_bh(bh);
 	//must be after touch_bh because touch_bh set dontneed false
 	if (dontneed &&
 	    ((loff_t)ex_it->offset <= bh->start() && (bh->end() <= (loff_t)(ex_it->offset + ex_it->length)))) {
@@ -1361,6 +1370,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
   uint64_t bytes_written = 0;
   uint64_t bytes_written_in_flush = 0;
   bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+  bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
   
   for (vector<ObjectExtent>::iterator ex_it = wr->extents.begin();
        ex_it != wr->extents.end();
@@ -1372,6 +1382,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
 
     // map it all into a single bufferhead.
     BufferHead *bh = o->map_write(wr);
+    bool missing = bh->is_missing();
     bh->snapc = wr->snapc;
     
     bytes_written += bh->length();
@@ -1411,6 +1422,8 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
     mark_dirty(bh);
     if (dontneed)
       bh->set_dontneed(true);
+    else if (nocache && missing)
+      bh->set_nocache(true);
     else
       touch_bh(bh);
 
@@ -1825,7 +1838,7 @@ loff_t ObjectCacher::release(Object *ob)
        p != ob->data.end();
        ++p) {
     BufferHead *bh = p->second;
-    if (bh->is_clean() || bh->is_zero())
+    if (bh->is_clean() || bh->is_zero() || bh->is_error())
       clean.push_back(bh);
     else 
       o_unclean += bh->length();
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index 0bef597..eeed83a 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -102,6 +102,7 @@ class ObjectCacher {
       loff_t start, length;   // bh extent in object
     } ex;
     bool dontneed; //indicate bh don't need by anyone
+    bool nocache; //indicate bh don't need by this caller
 
   public:
     Object *ob;
@@ -119,6 +120,7 @@ class ObjectCacher {
       state(STATE_MISSING),
       ref(0),
       dontneed(false),
+      nocache(false),
       ob(o),
       last_write_tid(0),
       last_read_tid(0),
@@ -169,6 +171,13 @@ class ObjectCacher {
     bool get_dontneed() {
       return dontneed;
     }
+
+    void set_nocache(bool v) {
+      nocache = v;
+    }
+    bool get_nocache() {
+      return nocache;
+    }
   };
 
   // ******* Object *********
@@ -422,6 +431,7 @@ class ObjectCacher {
       bh_lru_rest.lru_touch(bh);
 
     bh->set_dontneed(false);
+    bh->set_nocache(false);
     touch_ob(bh->ob);
   }
   void touch_ob(Object *ob) {
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 856425a..60efe3a 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -129,6 +129,11 @@ enum {
   l_osdc_osd_session_open,
   l_osdc_osd_session_close,
   l_osdc_osd_laggy,
+
+  l_osdc_osdop_omap_wr,
+  l_osdc_osdop_omap_rd,
+  l_osdc_osdop_omap_del,
+
   l_osdc_last,
 };
 
@@ -182,77 +187,84 @@ void Objecter::init()
   if (!logger) {
     PerfCountersBuilder pcb(cct, "objecter", l_osdc_first, l_osdc_last);
 
-    pcb.add_u64(l_osdc_op_active, "op_active");
-    pcb.add_u64(l_osdc_op_laggy, "op_laggy");
-    pcb.add_u64_counter(l_osdc_op_send, "op_send");
-    pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes");
-    pcb.add_u64_counter(l_osdc_op_resend, "op_resend");
-    pcb.add_u64_counter(l_osdc_op_ack, "op_ack");
-    pcb.add_u64_counter(l_osdc_op_commit, "op_commit");
-
-    pcb.add_u64_counter(l_osdc_op, "op");
-    pcb.add_u64_counter(l_osdc_op_r, "op_r");
-    pcb.add_u64_counter(l_osdc_op_w, "op_w");
-    pcb.add_u64_counter(l_osdc_op_rmw, "op_rmw");
-    pcb.add_u64_counter(l_osdc_op_pg, "op_pg");
-
-    pcb.add_u64_counter(l_osdc_osdop_stat, "osdop_stat");
-    pcb.add_u64_counter(l_osdc_osdop_create, "osdop_create");
-    pcb.add_u64_counter(l_osdc_osdop_read, "osdop_read");
-    pcb.add_u64_counter(l_osdc_osdop_write, "osdop_write");
-    pcb.add_u64_counter(l_osdc_osdop_writefull, "osdop_writefull");
-    pcb.add_u64_counter(l_osdc_osdop_append, "osdop_append");
-    pcb.add_u64_counter(l_osdc_osdop_zero, "osdop_zero");
-    pcb.add_u64_counter(l_osdc_osdop_truncate, "osdop_truncate");
-    pcb.add_u64_counter(l_osdc_osdop_delete, "osdop_delete");
-    pcb.add_u64_counter(l_osdc_osdop_mapext, "osdop_mapext");
-    pcb.add_u64_counter(l_osdc_osdop_sparse_read, "osdop_sparse_read");
-    pcb.add_u64_counter(l_osdc_osdop_clonerange, "osdop_clonerange");
-    pcb.add_u64_counter(l_osdc_osdop_getxattr, "osdop_getxattr");
-    pcb.add_u64_counter(l_osdc_osdop_setxattr, "osdop_setxattr");
-    pcb.add_u64_counter(l_osdc_osdop_cmpxattr, "osdop_cmpxattr");
-    pcb.add_u64_counter(l_osdc_osdop_rmxattr, "osdop_rmxattr");
-    pcb.add_u64_counter(l_osdc_osdop_resetxattrs, "osdop_resetxattrs");
-    pcb.add_u64_counter(l_osdc_osdop_tmap_up, "osdop_tmap_up");
-    pcb.add_u64_counter(l_osdc_osdop_tmap_put, "osdop_tmap_put");
-    pcb.add_u64_counter(l_osdc_osdop_tmap_get, "osdop_tmap_get");
-    pcb.add_u64_counter(l_osdc_osdop_call, "osdop_call");
-    pcb.add_u64_counter(l_osdc_osdop_watch, "osdop_watch");
-    pcb.add_u64_counter(l_osdc_osdop_notify, "osdop_notify");
-    pcb.add_u64_counter(l_osdc_osdop_src_cmpxattr, "osdop_src_cmpxattr");
+    pcb.add_u64(l_osdc_op_active, "op_active",
+        "Operations active", "actv");
+    pcb.add_u64(l_osdc_op_laggy, "op_laggy", "Laggy operations");
+    pcb.add_u64_counter(l_osdc_op_send, "op_send", "Sent operations");
+    pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes", "Sent data");
+    pcb.add_u64_counter(l_osdc_op_resend, "op_resend", "Resent operations");
+    pcb.add_u64_counter(l_osdc_op_ack, "op_ack", "Commit callbacks");
+    pcb.add_u64_counter(l_osdc_op_commit, "op_commit", "Operation commits");
+
+    pcb.add_u64_counter(l_osdc_op, "op", "Operations");
+    pcb.add_u64_counter(l_osdc_op_r, "op_r",
+        "Read operations", "read");
+    pcb.add_u64_counter(l_osdc_op_w, "op_w",
+        "Write operations", "writ");
+    pcb.add_u64_counter(l_osdc_op_rmw, "op_rmw", "Read-modify-write operations");
+    pcb.add_u64_counter(l_osdc_op_pg, "op_pg", "PG operation");
+
+    pcb.add_u64_counter(l_osdc_osdop_stat, "osdop_stat", "Stat operations");
+    pcb.add_u64_counter(l_osdc_osdop_create, "osdop_create", "Create object operations");
+    pcb.add_u64_counter(l_osdc_osdop_read, "osdop_read", "Read operations");
+    pcb.add_u64_counter(l_osdc_osdop_write, "osdop_write", "Write operations");
+    pcb.add_u64_counter(l_osdc_osdop_writefull, "osdop_writefull", "Write full object operations");
+    pcb.add_u64_counter(l_osdc_osdop_append, "osdop_append", "Append operation");
+    pcb.add_u64_counter(l_osdc_osdop_zero, "osdop_zero", "Set object to zero operations");
+    pcb.add_u64_counter(l_osdc_osdop_truncate, "osdop_truncate", "Truncate object operations");
+    pcb.add_u64_counter(l_osdc_osdop_delete, "osdop_delete", "Delete object operations");
+    pcb.add_u64_counter(l_osdc_osdop_mapext, "osdop_mapext", "Map extent operations");
+    pcb.add_u64_counter(l_osdc_osdop_sparse_read, "osdop_sparse_read", "Sparse read operations");
+    pcb.add_u64_counter(l_osdc_osdop_clonerange, "osdop_clonerange", "Clone range operations");
+    pcb.add_u64_counter(l_osdc_osdop_getxattr, "osdop_getxattr", "Get xattr operations");
+    pcb.add_u64_counter(l_osdc_osdop_setxattr, "osdop_setxattr", "Set xattr operations");
+    pcb.add_u64_counter(l_osdc_osdop_cmpxattr, "osdop_cmpxattr", "Xattr comparison operations");
+    pcb.add_u64_counter(l_osdc_osdop_rmxattr, "osdop_rmxattr", "Remove xattr operations");
+    pcb.add_u64_counter(l_osdc_osdop_resetxattrs, "osdop_resetxattrs", "Reset xattr operations");
+    pcb.add_u64_counter(l_osdc_osdop_tmap_up, "osdop_tmap_up", "TMAP update operations");
+    pcb.add_u64_counter(l_osdc_osdop_tmap_put, "osdop_tmap_put", "TMAP put operations");
+    pcb.add_u64_counter(l_osdc_osdop_tmap_get, "osdop_tmap_get", "TMAP get operations");
+    pcb.add_u64_counter(l_osdc_osdop_call, "osdop_call", "Call (execute) operations");
+    pcb.add_u64_counter(l_osdc_osdop_watch, "osdop_watch", "Watch by object operations");
+    pcb.add_u64_counter(l_osdc_osdop_notify, "osdop_notify", "Notify about object operations");
+    pcb.add_u64_counter(l_osdc_osdop_src_cmpxattr, "osdop_src_cmpxattr", "Extended attribute comparison in multi operations");
     pcb.add_u64_counter(l_osdc_osdop_pgls, "osdop_pgls");
     pcb.add_u64_counter(l_osdc_osdop_pgls_filter, "osdop_pgls_filter");
-    pcb.add_u64_counter(l_osdc_osdop_other, "osdop_other");
+    pcb.add_u64_counter(l_osdc_osdop_other, "osdop_other", "Other operations");
+
+    pcb.add_u64(l_osdc_linger_active, "linger_active", "Active lingering operations");
+    pcb.add_u64_counter(l_osdc_linger_send, "linger_send", "Sent lingering operations");
+    pcb.add_u64_counter(l_osdc_linger_resend, "linger_resend", "Resent lingering operations");
+    pcb.add_u64_counter(l_osdc_linger_ping, "linger_ping", "Sent pings to lingering operations");
 
-    pcb.add_u64(l_osdc_linger_active, "linger_active");
-    pcb.add_u64_counter(l_osdc_linger_send, "linger_send");
-    pcb.add_u64_counter(l_osdc_linger_resend, "linger_resend");
-    pcb.add_u64_counter(l_osdc_linger_ping, "linger_ping");
+    pcb.add_u64(l_osdc_poolop_active, "poolop_active", "Active pool operations");
+    pcb.add_u64_counter(l_osdc_poolop_send, "poolop_send", "Sent pool operations");
+    pcb.add_u64_counter(l_osdc_poolop_resend, "poolop_resend", "Resent pool operations");
 
-    pcb.add_u64(l_osdc_poolop_active, "poolop_active");
-    pcb.add_u64_counter(l_osdc_poolop_send, "poolop_send");
-    pcb.add_u64_counter(l_osdc_poolop_resend, "poolop_resend");
+    pcb.add_u64(l_osdc_poolstat_active, "poolstat_active", "Active get pool stat operations");
+    pcb.add_u64_counter(l_osdc_poolstat_send, "poolstat_send", "Pool stat operations sent");
+    pcb.add_u64_counter(l_osdc_poolstat_resend, "poolstat_resend", "Resent pool stats");
 
-    pcb.add_u64(l_osdc_poolstat_active, "poolstat_active");
-    pcb.add_u64_counter(l_osdc_poolstat_send, "poolstat_send");
-    pcb.add_u64_counter(l_osdc_poolstat_resend, "poolstat_resend");
+    pcb.add_u64(l_osdc_statfs_active, "statfs_active", "Statfs operations");
+    pcb.add_u64_counter(l_osdc_statfs_send, "statfs_send", "Sent FS stats");
+    pcb.add_u64_counter(l_osdc_statfs_resend, "statfs_resend", "Resent FS stats");
 
-    pcb.add_u64(l_osdc_statfs_active, "statfs_active");
-    pcb.add_u64_counter(l_osdc_statfs_send, "statfs_send");
-    pcb.add_u64_counter(l_osdc_statfs_resend, "statfs_resend");
+    pcb.add_u64(l_osdc_command_active, "command_active", "Active commands");
+    pcb.add_u64_counter(l_osdc_command_send, "command_send", "Sent commands");
+    pcb.add_u64_counter(l_osdc_command_resend, "command_resend", "Resent commands");
 
-    pcb.add_u64(l_osdc_command_active, "command_active");
-    pcb.add_u64_counter(l_osdc_command_send, "command_send");
-    pcb.add_u64_counter(l_osdc_command_resend, "command_resend");
+    pcb.add_u64(l_osdc_map_epoch, "map_epoch", "OSD map epoch");
+    pcb.add_u64_counter(l_osdc_map_full, "map_full", "Full OSD maps received");
+    pcb.add_u64_counter(l_osdc_map_inc, "map_inc", "Incremental OSD maps received");
 
-    pcb.add_u64(l_osdc_map_epoch, "map_epoch");
-    pcb.add_u64_counter(l_osdc_map_full, "map_full");
-    pcb.add_u64_counter(l_osdc_map_inc, "map_inc");
+    pcb.add_u64(l_osdc_osd_sessions, "osd_sessions", "Open sessions");  // open sessions
+    pcb.add_u64_counter(l_osdc_osd_session_open, "osd_session_open", "Sessions opened");
+    pcb.add_u64_counter(l_osdc_osd_session_close, "osd_session_close", "Sessions closed");
+    pcb.add_u64(l_osdc_osd_laggy, "osd_laggy", "Laggy OSD sessions");
 
-    pcb.add_u64(l_osdc_osd_sessions, "osd_sessions");  // open sessions
-    pcb.add_u64_counter(l_osdc_osd_session_open, "osd_session_open");
-    pcb.add_u64_counter(l_osdc_osd_session_close, "osd_session_close");
-    pcb.add_u64(l_osdc_osd_laggy, "osd_laggy");
+    pcb.add_u64_counter(l_osdc_osdop_omap_wr, "omap_wr", "OSD OMAP write operations");
+    pcb.add_u64_counter(l_osdc_osdop_omap_rd, "omap_rd", "OSD OMAP read operations");
+    pcb.add_u64_counter(l_osdc_osdop_omap_del, "omap_del", "OSD OMAP delete operations");
 
     logger = pcb.create_perf_counters();
     cct->get_perfcounters_collection()->add(logger);
@@ -276,6 +288,8 @@ void Objecter::init()
   timer.init();
   timer_lock.Unlock();
 
+  cct->_conf->add_observer(this);
+
   initialized.set(1);
 }
 
@@ -300,6 +314,8 @@ void Objecter::shutdown()
 
   initialized.set(0);
 
+  cct->_conf->remove_observer(this);
+
   map<int,OSDSession*>::iterator p;
   while (!osd_sessions.empty()) {
     p = osd_sessions.begin();
@@ -419,6 +435,7 @@ void Objecter::_send_linger(LingerOp *info)
   vector<OSDOp> opv;
   Context *oncommit = NULL;
   info->watch_lock.get_read(); // just to read registered status
+  bufferlist *poutbl = NULL;
   if (info->registered && info->is_watch) {
     ldout(cct, 15) << "send_linger " << info->linger_id << " reconnect" << dendl;
     opv.push_back(OSDOp());
@@ -430,7 +447,12 @@ void Objecter::_send_linger(LingerOp *info)
   } else {
     ldout(cct, 15) << "send_linger " << info->linger_id << " register" << dendl;
     opv = info->ops;
-    oncommit = new C_Linger_Commit(this, info);
+    C_Linger_Commit *c = new C_Linger_Commit(this, info);
+    if (!info->is_watch) {
+      info->notify_id = 0;
+      poutbl = &c->outbl;
+    }
+    oncommit = c;
   }
   info->watch_lock.put_read();
   Op *o = new Op(info->target.base_oid, info->target.base_oloc,
@@ -438,6 +460,7 @@ void Objecter::_send_linger(LingerOp *info)
 		 NULL, NULL,
 		 info->pobjver);
   o->oncommit_sync = oncommit;
+  o->outbl = poutbl;
   o->snapid = info->snap;
   o->snapc = info->snapc;
   o->mtime = info->mtime;
@@ -467,7 +490,7 @@ void Objecter::_send_linger(LingerOp *info)
   logger->inc(l_osdc_linger_send);
 }
 
-void Objecter::_linger_commit(LingerOp *info, int r) 
+void Objecter::_linger_commit(LingerOp *info, int r, bufferlist& outbl)
 {
   RWLock::WLocker wl(info->watch_lock);
   ldout(cct, 10) << "_linger_commit " << info->linger_id << dendl;
@@ -479,6 +502,17 @@ void Objecter::_linger_commit(LingerOp *info, int r)
   // only tell the user the first time we do this
   info->registered = true;
   info->pobjver = NULL;
+
+  if (!info->is_watch) {
+    // make note of the notify_id
+    bufferlist::iterator p = outbl.begin();
+    try {
+      ::decode(info->notify_id, p);
+      ldout(cct, 10) << "_linger_commit  notify_id=" << info->notify_id << dendl;
+    }
+    catch (buffer::error& e) {
+    }
+  }
 }
 
 struct C_DoWatchError : public Context {
@@ -777,11 +811,17 @@ void Objecter::handle_watch_notify(MWatchNotify *m)
       }
     }
   } else if (!info->is_watch) {
-    // notify completion; we can do this inline since we know the only user
-    // (librados) is safe to call in fast-dispatch context
-    assert(info->on_notify_finish);
-    info->notify_result_bl->claim(m->get_data());
-    info->on_notify_finish->complete(m->return_code);
+    // we have CEPH_WATCH_EVENT_NOTIFY_COMPLETE; we can do this inline since
+    // we know the only user (librados) is safe to call in fast-dispatch context
+    if (info->notify_id &&
+	info->notify_id != m->notify_id) {
+      ldout(cct, 10) << __func__ << " reply notify " << m->notify_id
+		     << " != " << info->notify_id << ", ignoring" << dendl;
+    } else {
+      assert(info->on_notify_finish);
+      info->notify_result_bl->claim(m->get_data());
+      info->on_notify_finish->complete(m->return_code);
+    }
   } else {
     finisher->queue(new C_DoWatchNotify(this, info, m));
     _linger_callback_queue();
@@ -870,7 +910,8 @@ bool Objecter::ms_dispatch(Message *m)
 
 void Objecter::_scan_requests(OSDSession *s,
                              bool force_resend,
-			     bool force_resend_writes,
+			     bool cluster_full,
+                             map<int64_t, bool> *pool_full_map,
 			     map<ceph_tid_t, Op*>& need_resend,
 			     list<LingerOp*>& need_resend_linger,
 			     map<ceph_tid_t, CommandOp*>& need_resend_command)
@@ -890,8 +931,10 @@ void Objecter::_scan_requests(OSDSession *s,
     assert(op->session == s);
     ++lp;   // check_linger_pool_dne() may touch linger_ops; prevent iterator invalidation
     ldout(cct, 10) << " checking linger op " << op->linger_id << dendl;
-    bool unregister;
+    bool unregister, force_resend_writes = cluster_full;
     int r = _recalc_linger_op_target(op, lc);
+    if (pool_full_map)
+      force_resend_writes = force_resend_writes || (*pool_full_map)[op->target.base_oloc.pool];
     switch (r) {
     case RECALC_OP_TARGET_NO_ACTION:
       if (!force_resend && !force_resend_writes)
@@ -919,6 +962,9 @@ void Objecter::_scan_requests(OSDSession *s,
     Op *op = p->second;
     ++p;   // check_op_pool_dne() may touch ops; prevent iterator invalidation
     ldout(cct, 10) << " checking op " << op->tid << dendl;
+    bool force_resend_writes = cluster_full;
+    if (pool_full_map)
+      force_resend_writes = force_resend_writes || (*pool_full_map)[op->target.base_oloc.pool];
     int r = _calc_target(&op->target, &op->last_force_resend);
     switch (r) {
     case RECALC_OP_TARGET_NO_ACTION:
@@ -945,6 +991,9 @@ void Objecter::_scan_requests(OSDSession *s,
     CommandOp *c = cp->second;
     ++cp;
     ldout(cct, 10) << " checking command " << c->tid << dendl;
+    bool force_resend_writes = cluster_full;
+    if (pool_full_map)
+      force_resend_writes = force_resend_writes || (*pool_full_map)[c->target_pg.pool()];
     int r = _calc_command_target(c);
     switch (r) {
     case RECALC_OP_TARGET_NO_ACTION:
@@ -992,9 +1041,14 @@ void Objecter::handle_osd_map(MOSDMap *m)
   }
 
   bool was_pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
-  bool was_full = _osdmap_full_flag();
-  bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || was_full;
+  bool cluster_full = _osdmap_full_flag();
+  bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || cluster_full || _osdmap_has_pool_full();
+  map<int64_t, bool> pool_full_map;
+  for (map<int64_t, pg_pool_t>::const_iterator it = osdmap->get_pools().begin();
+       it != osdmap->get_pools().end(); ++it)
+    pool_full_map[it->first] = _osdmap_pool_full(it->second);
 
+  
   list<LingerOp*> need_resend_linger;
   map<ceph_tid_t, Op*> need_resend;
   map<ceph_tid_t, CommandOp*> need_resend_command;
@@ -1045,18 +1099,19 @@ void Objecter::handle_osd_map(MOSDMap *m)
 	}
 	logger->set(l_osdc_map_epoch, osdmap->get_epoch());
 
-	was_full = was_full || _osdmap_full_flag();
-	_scan_requests(homeless_session, skipped_map, was_full,
-		       need_resend, need_resend_linger,
-		       need_resend_command);
+	cluster_full = cluster_full || _osdmap_full_flag();
+        update_pool_full_map(pool_full_map);
+	_scan_requests(homeless_session, skipped_map, cluster_full,
+                       &pool_full_map, need_resend,
+                       need_resend_linger, need_resend_command);
 
 	// osd addr changes?
 	for (map<int,OSDSession*>::iterator p = osd_sessions.begin();
 	     p != osd_sessions.end(); ) {
 	  OSDSession *s = p->second;
-	  _scan_requests(s, skipped_map, was_full,
-			 need_resend, need_resend_linger,
-			 need_resend_command);
+	  _scan_requests(s, skipped_map, cluster_full,
+			 &pool_full_map, need_resend,
+                         need_resend_linger, need_resend_command);
 	  ++p;
 	  if (!osdmap->is_up(s->osd) ||
 	      (s->con &&
@@ -1074,14 +1129,14 @@ void Objecter::handle_osd_map(MOSDMap *m)
         for (map<int,OSDSession*>::iterator p = osd_sessions.begin();
 	     p != osd_sessions.end(); ++p) {
 	  OSDSession *s = p->second;
-	  _scan_requests(s, false, false, need_resend, need_resend_linger,
-			 need_resend_command);
+	  _scan_requests(s, false, false, NULL, need_resend,
+                         need_resend_linger, need_resend_command);
         }
 	ldout(cct, 3) << "handle_osd_map decoding full epoch "
 		      << m->get_last() << dendl;
 	osdmap->decode(m->maps[m->get_last()]);
 
-	_scan_requests(homeless_session, false, false,
+	_scan_requests(homeless_session, false, false, NULL,
 		       need_resend, need_resend_linger,
 		       need_resend_command);
       } else {
@@ -1094,7 +1149,7 @@ void Objecter::handle_osd_map(MOSDMap *m)
   }
 
   bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
-  bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag();
+  bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag() || _osdmap_has_pool_full();
 
   // was/is paused?
   if (was_pauserd || was_pausewr || pauserd || pausewr || osdmap->get_epoch() < epoch_barrier) {
@@ -1126,6 +1181,7 @@ void Objecter::handle_osd_map(MOSDMap *m)
 	_send_op(op);
       }
     } else {
+      _op_cancel_map_check(op);
       _cancel_linger_op(op);
     }
     s->lock.unlock();
@@ -1303,7 +1359,7 @@ void Objecter::_check_op_pool_dne(Op *op, bool session_locked)
       if (!session_locked) {
         s->lock.get_write();
       }
-      _finish_op(op);
+      _finish_op(op, 0);
       if (!session_locked) {
         s->lock.unlock();
       }
@@ -1780,6 +1836,7 @@ void Objecter::_kick_requests(OSDSession *session, map<uint64_t, LingerOp *>& lr
       if (!op->target.paused)
 	resend[op->tid] = op;
     } else {
+      _op_cancel_map_check(op);
       _cancel_linger_op(op);
     }
   }
@@ -1850,54 +1907,53 @@ void Objecter::tick()
 
   set<OSDSession*> toping;
 
-  int r = 0;
 
   // look for laggy requests
   utime_t cutoff = ceph_clock_now(cct);
   cutoff -= cct->_conf->objecter_timeout;  // timeout
 
-  unsigned laggy_ops;
-
-  do {
-    laggy_ops = 0;
-    for (map<int,OSDSession*>::iterator siter = osd_sessions.begin(); siter != osd_sessions.end(); ++siter) {
-      OSDSession *s = siter->second;
-      RWLock::RLocker l(s->lock);
-      for (map<ceph_tid_t,Op*>::iterator p = s->ops.begin();
-           p != s->ops.end();
-           ++p) {
-        Op *op = p->second;
-        assert(op->session);
-        if (op->stamp < cutoff) {
-          ldout(cct, 2) << " tid " << p->first << " on osd." << op->session->osd << " is laggy" << dendl;
-          toping.insert(op->session);
-          ++laggy_ops;
-        }
-      }
-      for (map<uint64_t,LingerOp*>::iterator p = s->linger_ops.begin();
-           p != s->linger_ops.end();
-           ++p) {
-        LingerOp *op = p->second;
-        RWLock::WLocker wl(op->watch_lock);
-        assert(op->session);
-        ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first << " (osd." << op->session->osd << ")" << dendl;
-        toping.insert(op->session);
-	if (op->is_watch && op->registered && !op->last_error)
-	  _send_linger_ping(op);
-      }
-      for (map<uint64_t,CommandOp*>::iterator p = s->command_ops.begin();
-           p != s->command_ops.end();
-           ++p) {
-        CommandOp *op = p->second;
-        assert(op->session);
-        ldout(cct, 10) << " pinging osd that serves command tid " << p->first << " (osd." << op->session->osd << ")" << dendl;
-        toping.insert(op->session);
+  unsigned laggy_ops = 0;
+
+  for (map<int,OSDSession*>::iterator siter = osd_sessions.begin(); siter != osd_sessions.end(); ++siter) {
+    OSDSession *s = siter->second;
+    RWLock::RLocker l(s->lock);
+    bool found = false;
+    for (map<ceph_tid_t,Op*>::iterator p = s->ops.begin();
+	p != s->ops.end();
+	++p) {
+      Op *op = p->second;
+      assert(op->session);
+      if (op->stamp < cutoff) {
+	ldout(cct, 2) << " tid " << p->first << " on osd." << op->session->osd << " is laggy" << dendl;
+	found = true;
+	++laggy_ops;
       }
     }
-    if (num_homeless_ops.read() || !toping.empty()) {
-      _maybe_request_map();
+    for (map<uint64_t,LingerOp*>::iterator p = s->linger_ops.begin();
+	p != s->linger_ops.end();
+	++p) {
+      LingerOp *op = p->second;
+      RWLock::WLocker wl(op->watch_lock);
+      assert(op->session);
+      ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first << " (osd." << op->session->osd << ")" << dendl;
+      found = true;
+      if (op->is_watch && op->registered && !op->last_error)
+	_send_linger_ping(op);
     }
-  } while (r == -EAGAIN);
+    for (map<uint64_t,CommandOp*>::iterator p = s->command_ops.begin();
+	p != s->command_ops.end();
+	++p) {
+      CommandOp *op = p->second;
+      assert(op->session);
+      ldout(cct, 10) << " pinging osd that serves command tid " << p->first << " (osd." << op->session->osd << ")" << dendl;
+      found = true;
+    }
+    if (found)
+      toping.insert(s);
+  }
+  if (num_homeless_ops.read() || !toping.empty()) {
+    _maybe_request_map();
+  }
 
   logger->set(l_osdc_op_laggy, laggy_ops);
   logger->set(l_osdc_osd_laggy, toping.size());
@@ -1968,13 +2024,10 @@ class C_CancelOp : public Context
   ceph_tid_t tid;
   Objecter *objecter;
 public:
-  C_CancelOp(Objecter *objecter) : objecter(objecter) {}
+  C_CancelOp(ceph_tid_t tid, Objecter *objecter) : tid(tid), objecter(objecter) {}
   void finish(int r) {
     objecter->op_cancel(tid, -ETIMEDOUT);
   }
-  void set_tid(ceph_tid_t _tid) {
-    tid = _tid;
-  }
 };
 
 ceph_tid_t Objecter::op_submit(Op *op, int *ctx_budget)
@@ -1993,7 +2046,7 @@ ceph_tid_t Objecter::_op_submit_with_budget(Op *op, RWLock::Context& lc, int *ct
   assert(op->ops.size() == op->out_handler.size());
 
   // throttle.  before we look at any state, because
-  // take_op_budget() may drop our lock while it blocks.
+  // _take_op_budget() may drop our lock while it blocks.
   if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) {
     int op_budget = _take_op_budget(op);
     // take and pass out the budget for the first OP
@@ -2003,21 +2056,15 @@ ceph_tid_t Objecter::_op_submit_with_budget(Op *op, RWLock::Context& lc, int *ct
     }
   }
 
-  C_CancelOp *cb = NULL;
   if (osd_timeout > 0) {
-    cb = new C_CancelOp(this);
-    op->ontimeout = cb;
-  }
-
-  ceph_tid_t tid = _op_submit(op, lc);
-
-  if (cb) {
-    cb->set_tid(tid);
+    if (op->tid == 0)
+      op->tid = last_tid.inc();
+    op->ontimeout = new C_CancelOp(op->tid, this);
     Mutex::Locker l(timer_lock);
     timer.add_event_after(osd_timeout, op->ontimeout);
   }
 
-  return tid;
+  return _op_submit(op, lc);
 }
 
 void Objecter::_send_op_account(Op *op)
@@ -2072,6 +2119,22 @@ void Objecter::_send_op_account(Op *op)
     case CEPH_OSD_OP_TMAPUP: code = l_osdc_osdop_tmap_up; break;
     case CEPH_OSD_OP_TMAPPUT: code = l_osdc_osdop_tmap_put; break;
     case CEPH_OSD_OP_TMAPGET: code = l_osdc_osdop_tmap_get; break;
+
+    // OMAP read operations
+    case CEPH_OSD_OP_OMAPGETVALS:
+    case CEPH_OSD_OP_OMAPGETKEYS:
+    case CEPH_OSD_OP_OMAPGETHEADER:
+    case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+    case CEPH_OSD_OP_OMAP_CMP: code = l_osdc_osdop_omap_rd; break;
+
+    // OMAP write operations
+    case CEPH_OSD_OP_OMAPSETVALS:
+    case CEPH_OSD_OP_OMAPSETHEADER: code = l_osdc_osdop_omap_wr; break;
+
+    // OMAP del operations
+    case CEPH_OSD_OP_OMAPCLEAR:
+    case CEPH_OSD_OP_OMAPRMKEYS: code = l_osdc_osdop_omap_del; break;
+
     case CEPH_OSD_OP_CALL: code = l_osdc_osdop_call; break;
     case CEPH_OSD_OP_WATCH: code = l_osdc_osdop_watch; break;
     case CEPH_OSD_OP_NOTIFY: code = l_osdc_osdop_notify; break;
@@ -2124,16 +2187,23 @@ ceph_tid_t Objecter::_op_submit(Op *op, RWLock::Context& lc)
 
   if ((op->target.flags & CEPH_OSD_FLAG_WRITE) &&
       osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) {
-    ldout(cct, 10) << " paused modify " << op << " tid " << last_tid.read() << dendl;
+    ldout(cct, 10) << " paused modify " << op << " tid " << last_tid.read()
+		   << dendl;
     op->target.paused = true;
     _maybe_request_map();
   } else if ((op->target.flags & CEPH_OSD_FLAG_READ) &&
 	     osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) {
-    ldout(cct, 10) << " paused read " << op << " tid " << last_tid.read() << dendl;
+    ldout(cct, 10) << " paused read " << op << " tid " << last_tid.read()
+		   << dendl;
     op->target.paused = true;
     _maybe_request_map();
-  } else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) && _osdmap_full_flag()) {
-    ldout(cct, 0) << " FULL, paused modify " << op << " tid " << last_tid.read() << dendl;
+  } else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) &&
+	     !(op->target.flags & (CEPH_OSD_FLAG_FULL_TRY |
+				   CEPH_OSD_FLAG_FULL_FORCE)) &&
+             (_osdmap_full_flag() ||
+	      _osdmap_pool_full(op->target.base_oloc.pool))) {
+    ldout(cct, 0) << " FULL, paused modify " << op << " tid " << last_tid.read()
+		  << dendl;
     op->target.paused = true;
     _maybe_request_map();
   } else if (!s->is_homeless()) {
@@ -2195,7 +2265,10 @@ int Objecter::op_cancel(OSDSession *s, ceph_tid_t tid, int r)
   if (op->onack) {
     op->onack->complete(r);
     op->onack = NULL;
+    num_unacked.dec();
   }
+  if (op->oncommit || op->oncommit_sync)
+    num_uncommitted.dec();
   if (op->oncommit) {
     op->oncommit->complete(r);
     op->oncommit = NULL;
@@ -2205,7 +2278,7 @@ int Objecter::op_cancel(OSDSession *s, ceph_tid_t tid, int r)
     op->oncommit_sync = NULL;
   }
   _op_cancel_map_check(op);
-  _finish_op(op);
+  _finish_op(op, r);
   s->lock.unlock();
 
   return 0;
@@ -2267,41 +2340,44 @@ start:
   return ret;
 }
 
-/**
- * Any write op which is in progress at the start of this call shall no longer
- * be in progress when this call ends.  Operations started after the start
- * of this call may still be in progress when this call ends.
- *
- * @return the latest possible epoch in which a cancelled op could have existed
- */
-epoch_t Objecter::op_cancel_writes(int r)
+
+epoch_t Objecter::op_cancel_writes(int r, int64_t pool)
 {
   rwlock.get_write();
 
   std::vector<ceph_tid_t> to_cancel;
+  bool found = false;
 
   for (map<int, OSDSession *>::iterator siter = osd_sessions.begin(); siter != osd_sessions.end(); ++siter) {
     OSDSession *s = siter->second;
     s->lock.get_read();
     for (map<ceph_tid_t, Op*>::iterator op_i = s->ops.begin(); op_i != s->ops.end(); ++op_i) {
-      if (op_i->second->target.flags & CEPH_OSD_FLAG_WRITE) {
+      if (op_i->second->target.flags & CEPH_OSD_FLAG_WRITE
+        && (pool == -1 || op_i->second->target.target_oloc.pool == pool)) {
         to_cancel.push_back(op_i->first);
       }
     }
     s->lock.unlock();
-  }
 
-  for (std::vector<ceph_tid_t>::iterator titer = to_cancel.begin(); titer != to_cancel.end(); ++titer) {
-    int cancel_result = _op_cancel(*titer, r);
-    // We hold rwlock across search and cancellation, so cancels should always succeed
-    assert(cancel_result == 0);
+    for (std::vector<ceph_tid_t>::iterator titer = to_cancel.begin(); titer != to_cancel.end(); ++titer) {
+      int cancel_result = op_cancel(s, *titer, r);
+      // We hold rwlock across search and cancellation, so cancels should always succeed
+      assert(cancel_result == 0);
+    }
+    if (!found && to_cancel.size())
+      found = true;
+    to_cancel.clear();
   }
 
   const epoch_t epoch = osdmap->get_epoch();
 
   rwlock.unlock();
 
-  return epoch;
+  if (found) {
+    return epoch;
+  } else {
+    return -1;
+  }
 }
 
 bool Objecter::is_pg_changed(
@@ -2324,8 +2400,11 @@ bool Objecter::is_pg_changed(
 
 bool Objecter::target_should_be_paused(op_target_t *t)
 {
+  const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
   bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
-  bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag();
+  bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) ||
+                 _osdmap_full_flag() ||
+                 _osdmap_pool_full(*pi);
 
   return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
          (t->flags & CEPH_OSD_FLAG_WRITE && pausewr) ||
@@ -2342,6 +2421,43 @@ bool Objecter::osdmap_full_flag() const
   return _osdmap_full_flag();
 }
 
+bool Objecter::osdmap_pool_full(const int64_t pool_id) const
+{
+  RWLock::RLocker rl(rwlock);
+
+  if (_osdmap_full_flag()) {
+    return true;
+  }
+
+  return _osdmap_pool_full(pool_id);
+}
+
+bool Objecter::_osdmap_pool_full(const int64_t pool_id) const
+{
+  const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
+  if (pool == NULL) {
+    ldout(cct, 4) << __func__ << ": DNE pool " << pool_id << dendl;
+    return false;
+  }
+
+  return _osdmap_pool_full(*pool);
+}
+
+bool Objecter::_osdmap_has_pool_full() const
+{
+  for (map<int64_t, pg_pool_t>::const_iterator it = osdmap->get_pools().begin();
+       it != osdmap->get_pools().end(); ++it) {
+    if (_osdmap_pool_full(it->second))
+      return true;
+  }
+  return false;
+}
+
+bool Objecter::_osdmap_pool_full(const pg_pool_t &p) const
+{
+  return p.has_flag(pg_pool_t::FLAG_FULL) && honor_osdmap_full;
+}
+
 /**
  * Wrapper around osdmap->test_flag for special handling of the FULL flag.
  */
@@ -2351,6 +2467,17 @@ bool Objecter::_osdmap_full_flag() const
   return osdmap->test_flag(CEPH_OSDMAP_FULL) && honor_osdmap_full;
 }
 
+void Objecter::update_pool_full_map(map<int64_t, bool>& pool_full_map)
+{
+  for (map<int64_t, pg_pool_t>::const_iterator it = osdmap->get_pools().begin();
+       it != osdmap->get_pools().end(); ++it) {
+    if (pool_full_map.find(it->first) == pool_full_map.end()) {
+      pool_full_map[it->first] = _osdmap_pool_full(it->second);
+    } else {
+      pool_full_map[it->first] = _osdmap_pool_full(it->second) || pool_full_map[it->first];
+    }
+  }
+}
 
 int64_t Objecter::get_object_hash_position(int64_t pool, const string& key,
 					   const string& ns)
@@ -2435,6 +2562,7 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,  bool any
   vector<int> up, acting;
   osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
 			       &acting, &acting_primary);
+  bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
   unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
   if (any_change && pg_interval_t::is_new_interval(
           t->acting_primary,
@@ -2451,6 +2579,8 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,  bool any
 	  min_size,
 	  t->pg_num,
 	  pg_num,
+	  t->sort_bitwise,
+	  sort_bitwise,
 	  pg_t(prev_seed, pgid.pool(), pgid.preferred()))) {
     force_resend = true;
   }
@@ -2477,6 +2607,7 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,  bool any
     t->min_size = min_size;
     t->pg_num = pg_num;
     t->pg_num_mask = pi->get_pg_num_mask();
+    t->sort_bitwise = sort_bitwise;
     ldout(cct, 10) << __func__ << " "
 		   << " pgid " << pgid << " acting " << acting << dendl;
     t->used_replica = false;
@@ -2671,14 +2802,20 @@ void Objecter::_cancel_linger_op(Op *op)
   ldout(cct, 15) << "cancel_op " << op->tid << dendl;
 
   assert(!op->should_resend);
-  delete op->onack;
-  delete op->oncommit;
-  delete op->oncommit_sync;
+  if (op->onack) {
+    delete op->onack;
+    num_unacked.dec();
+  }
+  if (op->oncommit || op->oncommit_sync) {
+    delete op->oncommit;
+    delete op->oncommit_sync;
+    num_uncommitted.dec();
+  }
 
-  _finish_op(op);
+  _finish_op(op, 0);
 }
 
-void Objecter::_finish_op(Op *op)
+void Objecter::_finish_op(Op *op, int r)
 {
   ldout(cct, 15) << "finish_op " << op->tid << dendl;
 
@@ -2687,7 +2824,7 @@ void Objecter::_finish_op(Op *op)
   if (!op->ctx_budgeted && op->budgeted)
     put_op_budget(op);
 
-  if (op->ontimeout) {
+  if (op->ontimeout && r != -ETIMEDOUT) {
     Mutex::Locker l(timer_lock);
     timer.cancel_event(op->ontimeout);
   }
@@ -2716,7 +2853,7 @@ void Objecter::finish_op(OSDSession *session, ceph_tid_t tid)
 
   Op *op = iter->second;
 
-  _finish_op(op);
+  _finish_op(op, 0);
 }
 
 MOSDOp *Objecter::_prepare_osd_op(Op *op)
@@ -2730,6 +2867,9 @@ MOSDOp *Objecter::_prepare_osd_op(Op *op)
   if (op->onack)
     flags |= CEPH_OSD_FLAG_ACK;
 
+  if (!honor_osdmap_full)
+    flags |= CEPH_OSD_FLAG_FULL_FORCE;
+
   op->target.paused = false;
   op->stamp = ceph_clock_now(cct);
 
@@ -2755,6 +2895,10 @@ MOSDOp *Objecter::_prepare_osd_op(Op *op)
   else
     m->set_priority(cct->_conf->osd_client_op_priority);
 
+  if (op->reqid != osd_reqid_t()) {
+    m->set_reqid(op->reqid);
+  }
+
   logger->inc(l_osdc_op_send);
   logger->inc(l_osdc_op_send_bytes, m->get_data().length());
 
@@ -3038,7 +3182,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
   // done with this tid?
   if (!op->onack && !op->oncommit && !op->oncommit_sync) {
     ldout(cct, 15) << "handle_osd_op_reply completed tid " << tid << dendl;
-    _finish_op(op);
+    _finish_op(op, 0);
   }
 
   ldout(cct, 5) << num_unacked.read() << " unacked, " << num_uncommitted.read() << " uncommitted" << dendl;
@@ -3119,8 +3263,14 @@ void Objecter::list_nobjects(NListContext *list_context, Context *onfinish)
 
   if (list_context->starting_pg_num == 0) {     // there can't be zero pgs!
     list_context->starting_pg_num = pg_num;
+    list_context->sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
     ldout(cct, 20) << pg_num << " placement groups" << dendl;
   }
+  if (list_context->sort_bitwise != osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+    ldout(cct, 10) << " hobject sort order changed, restarting this pg" << dendl;
+    list_context->cookie = collection_list_handle_t();
+    list_context->sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
+  }
   if (list_context->starting_pg_num != pg_num) {
     // start reading from the beginning; the pgs have changed
     ldout(cct, 10) << " pg_num changed; restarting with " << pg_num << dendl;
@@ -3258,8 +3408,14 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish)
 
   if (list_context->starting_pg_num == 0) {     // there can't be zero pgs!
     list_context->starting_pg_num = pg_num;
+    list_context->sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
     ldout(cct, 20) << pg_num << " placement groups" << dendl;
   }
+  if (list_context->sort_bitwise != osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+    ldout(cct, 10) << " hobject sort order changed, restarting this pg" << dendl;
+    list_context->cookie = collection_list_handle_t();
+    list_context->sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
+  }
   if (list_context->starting_pg_num != pg_num) {
     // start reading from the beginning; the pgs have changed
     ldout(cct, 10) << " pg_num changed; restarting with " << pg_num << dendl;
@@ -3623,7 +3779,7 @@ void Objecter::handle_pool_op_reply(MPoolOpReply *m)
     }
     iter = pool_ops.find(tid);
     if (iter != pool_ops.end()) {
-      _finish_pool_op(op);
+      _finish_pool_op(op, 0);
     }
   } else {
     ldout(cct, 10) << "unknown request " << tid << dendl;
@@ -3652,17 +3808,17 @@ int Objecter::pool_op_cancel(ceph_tid_t tid, int r)
   if (op->onfinish)
     op->onfinish->complete(r);
 
-  _finish_pool_op(op);
+  _finish_pool_op(op, r);
   return 0;
 }
 
-void Objecter::_finish_pool_op(PoolOp *op)
+void Objecter::_finish_pool_op(PoolOp *op, int r)
 {
   assert(rwlock.is_wlocked());
   pool_ops.erase(op->tid);
   logger->set(l_osdc_poolop_active, pool_ops.size());
 
-  if (op->ontimeout) {
+  if (op->ontimeout && r != -ETIMEDOUT) {
     Mutex::Locker l(timer_lock);
     timer.cancel_event(op->ontimeout);
   }
@@ -3740,7 +3896,7 @@ void Objecter::handle_get_pool_stats_reply(MGetPoolStatsReply *m)
       last_seen_pgmap_version = m->version;
     }
     op->onfinish->complete(0);
-    _finish_pool_stat_op(op);
+    _finish_pool_stat_op(op, 0);
   } else {
     ldout(cct, 10) << "unknown request " << tid << dendl;
   } 
@@ -3765,18 +3921,18 @@ int Objecter::pool_stat_op_cancel(ceph_tid_t tid, int r)
   PoolStatOp *op = it->second;
   if (op->onfinish)
     op->onfinish->complete(r);
-  _finish_pool_stat_op(op);
+  _finish_pool_stat_op(op, r);
   return 0;
 }
 
-void Objecter::_finish_pool_stat_op(PoolStatOp *op)
+void Objecter::_finish_pool_stat_op(PoolStatOp *op, int r)
 {
   assert(rwlock.is_wlocked());
 
   poolstat_ops.erase(op->tid);
   logger->set(l_osdc_poolstat_active, poolstat_ops.size());
 
-  if (op->ontimeout) {
+  if (op->ontimeout && r != -ETIMEDOUT) {
     Mutex::Locker l(timer_lock);
     timer.cancel_event(op->ontimeout);
   }
@@ -3847,7 +4003,7 @@ void Objecter::handle_fs_stats_reply(MStatfsReply *m)
     if (m->h.version > last_seen_pgmap_version)
       last_seen_pgmap_version = m->h.version;
     op->onfinish->complete(0);
-    _finish_statfs_op(op);
+    _finish_statfs_op(op, 0);
   } else {
     ldout(cct, 10) << "unknown request " << tid << dendl;
   }
@@ -3872,18 +4028,18 @@ int Objecter::statfs_op_cancel(ceph_tid_t tid, int r)
   StatfsOp *op = it->second;
   if (op->onfinish)
     op->onfinish->complete(r);
-  _finish_statfs_op(op);
+  _finish_statfs_op(op, r);
   return 0;
 }
 
-void Objecter::_finish_statfs_op(StatfsOp *op)
+void Objecter::_finish_statfs_op(StatfsOp *op, int r)
 {
   assert(rwlock.is_wlocked());
 
   statfs_ops.erase(op->tid);
   logger->set(l_osdc_statfs_active, statfs_ops.size());
 
-  if (op->ontimeout) {
+  if (op->ontimeout && r != -ETIMEDOUT) {
     Mutex::Locker l(timer_lock);
     timer.cancel_event(op->ontimeout);
   }
@@ -4449,7 +4605,7 @@ int Objecter::command_op_cancel(OSDSession *s, ceph_tid_t tid, int r)
 
   CommandOp *op = it->second;
   _command_cancel_map_check(op);
-  _finish_command(op, -ETIMEDOUT, "");
+  _finish_command(op, r, "");
   return 0;
 }
 
@@ -4463,7 +4619,7 @@ void Objecter::_finish_command(CommandOp *c, int r, string rs)
   if (c->onfinish)
     c->onfinish->complete(r);
 
-  if (c->ontimeout) {
+  if (c->ontimeout && r != -ETIMEDOUT) {
     Mutex::Locker l(timer_lock);
     timer.cancel_event(c->ontimeout);
   }
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index b9fd0cd..379c0ae 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -194,8 +194,8 @@ struct ObjectOperation {
     string mname = "filter";
     ::encode(cname, osd_op.indata);
     ::encode(mname, osd_op.indata);
-    ::encode(cookie, osd_op.indata);
     osd_op.indata.append(filter);
+    ::encode(cookie, osd_op.indata);
   }
   void add_alloc_hint(int op, uint64_t expected_object_size,
                       uint64_t expected_write_size) {
@@ -661,12 +661,18 @@ struct ObjectOperation {
         out_truncate_size(otsize),
         prval(r) {}
     void finish(int r) {
-      if (r < 0)
+      // reqids are copied on ENOENT
+      if (r < 0 && r != -ENOENT)
 	return;
       try {
 	bufferlist::iterator p = bl.begin();
 	object_copy_data_t copy_reply;
 	::decode(copy_reply, p);
+	if (r == -ENOENT) {
+	  if (out_reqids)
+	    *out_reqids = copy_reply.reqids;
+	  return;
+	}
 	if (out_size)
 	  *out_size = copy_reply.size;
 	if (out_mtime)
@@ -705,6 +711,7 @@ struct ObjectOperation {
 
   void copy_get(object_copy_cursor_t *cursor,
 		uint64_t max,
+		uint32_t copyget_flags,
 		uint64_t *out_size,
 		utime_t *out_mtime,
 		std::map<std::string,bufferlist> *out_attrs,
@@ -722,6 +729,7 @@ struct ObjectOperation {
 		int *prval) {
     OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET);
     osd_op.op.copy_get.max = max;
+    osd_op.op.copy_get.flags = copyget_flags;
     ::encode(*cursor, osd_op.indata);
     ::encode(max, osd_op.indata);
     unsigned p = ops.size() - 1;
@@ -970,11 +978,13 @@ struct ObjectOperation {
   }
 
   void copy_from(object_t src, snapid_t snapid, object_locator_t src_oloc,
-		 version_t src_version, unsigned flags) {
+		 version_t src_version, unsigned flags,
+		 unsigned src_fadvise_flags) {
     OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_FROM);
     osd_op.op.copy_from.snapid = snapid;
     osd_op.op.copy_from.src_version = src_version;
     osd_op.op.copy_from.flags = flags;
+    osd_op.op.copy_from.src_fadvise_flags = src_fadvise_flags;
     ::encode(src, osd_op.indata);
     ::encode(src_oloc, osd_op.indata);
   }
@@ -1139,6 +1149,7 @@ public:
     int acting_primary;   ///< primary for last pg we mapped to based on the acting set
     int size;             ///< the size of the pool when were were last mapped
     int min_size;         ///< the min size of the pool when were were last mapped
+    bool sort_bitwise;    ///< whether the hobject_t sort order is bitwise
 
     bool used_replica;
     bool paused;
@@ -1156,6 +1167,7 @@ public:
 	acting_primary(-1),
 	size(-1),
 	min_size(-1),
+	sort_bitwise(false),
 	used_replica(false),
 	paused(false),
 	osd(-1)
@@ -1213,6 +1225,8 @@ public:
 
     epoch_t last_force_resend;
 
+    osd_reqid_t reqid; // explicitly setting reqid
+
     Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op,
        int f, Context *ac, Context *co, version_t *ov, int *offset = NULL) :
       session(NULL), incarnation(0),
@@ -1327,6 +1341,7 @@ public:
     int starting_pg_num;
     bool at_end_of_pool;
     bool at_end_of_pg;
+    bool sort_bitwise;
 
     int64_t pool_id;
     int pool_snap_seq;
@@ -1349,6 +1364,7 @@ public:
     NListContext() : current_pg(0), current_pg_epoch(0), starting_pg_num(0),
 		    at_end_of_pool(false),
 		    at_end_of_pg(false),
+		    sort_bitwise(false),
 		    pool_id(0),
 		    pool_snap_seq(0),
                     max_entries(0),
@@ -1392,6 +1408,7 @@ public:
     int starting_pg_num;
     bool at_end_of_pool;
     bool at_end_of_pg;
+    bool sort_bitwise;
 
     int64_t pool_id;
     int pool_snap_seq;
@@ -1414,6 +1431,7 @@ public:
     ListContext() : current_pg(0), current_pg_epoch(0), starting_pg_num(0),
 		    at_end_of_pool(false),
 		    at_end_of_pg(false),
+		    sort_bitwise(false),
 		    pool_id(0),
 		    pool_snap_seq(0),
                     max_entries(0),
@@ -1561,6 +1579,7 @@ public:
     // we trigger these from an async finisher
     Context *on_notify_finish;
     bufferlist *notify_result_bl;
+    uint64_t notify_id;
 
     WatchContext *watch_context;
 
@@ -1595,6 +1614,7 @@ public:
 		 on_reg_commit(NULL),
 		 on_notify_finish(NULL),
 		 notify_result_bl(NULL),
+		 notify_id(0),
 		 watch_context(NULL),
 		 session(NULL),
 		 register_tid(0),
@@ -1619,6 +1639,7 @@ public:
   struct C_Linger_Commit : public Context {
     Objecter *objecter;
     LingerOp *info;
+    bufferlist outbl;  // used for notify only
     C_Linger_Commit(Objecter *o, LingerOp *l) : objecter(o), info(l) {
       info->get();
     }
@@ -1626,7 +1647,7 @@ public:
       info->put();
     }
     void finish(int r) {
-      objecter->_linger_commit(info, r);
+      objecter->_linger_commit(info, r, outbl);
     }
   };
 
@@ -1707,8 +1728,20 @@ public:
   map<int,OSDSession*> osd_sessions;
 
   bool osdmap_full_flag() const;
+  bool osdmap_pool_full(const int64_t pool_id) const;
 
  private:
+
+  /**
+   * Test pg_pool_t::FLAG_FULL on a pool
+   *
+   * @return true if the pool exists and has the flag set, or
+   *         the global full flag is set, else false
+   */
+  bool _osdmap_pool_full(const int64_t pool_id) const;
+  bool _osdmap_pool_full(const pg_pool_t &p) const;
+  void update_pool_full_map(map<int64_t, bool>& pool_full_map);
+
   map<uint64_t, LingerOp*>  linger_ops;
   // we use this just to confirm a cookie is valid before dereferencing the ptr
   set<LingerOp*>            linger_ops_set;
@@ -1738,7 +1771,7 @@ public:
   void _send_op_account(Op *op);
   void _cancel_linger_op(Op *op);
   void finish_op(OSDSession *session, ceph_tid_t tid);
-  void _finish_op(Op *op);
+  void _finish_op(Op *op, int r);
   static bool is_pg_changed(
     int oldprimary,
     const vector<int>& oldacting,
@@ -1753,6 +1786,7 @@ public:
     RECALC_OP_TARGET_OSD_DOWN,
   };
   bool _osdmap_full_flag() const;
+  bool _osdmap_has_pool_full() const;
 
   bool target_should_be_paused(op_target_t *op);
   int _calc_target(op_target_t *t, epoch_t *last_force_resend=0, bool any_change=false);
@@ -1771,7 +1805,7 @@ public:
 
   void _linger_submit(LingerOp *info);
   void _send_linger(LingerOp *info);
-  void _linger_commit(LingerOp *info, int r);
+  void _linger_commit(LingerOp *info, int r, bufferlist& outbl);
   void _linger_reconnect(LingerOp *info, int r);
   void _send_linger_ping(LingerOp *info);
   void _linger_ping(LingerOp *info, int r, utime_t sent, uint32_t register_gen);
@@ -1916,7 +1950,8 @@ private:
 
   void _scan_requests(OSDSession *s,
                      bool force_resend,
-		     bool force_resend_writes,
+		     bool cluster_full,
+                     map<int64_t, bool> *pool_full_map,
 		     map<ceph_tid_t, Op*>& need_resend,
 		     list<LingerOp*>& need_resend_linger,
 		     map<ceph_tid_t, CommandOp*>& need_resend_command);
@@ -1962,6 +1997,7 @@ private:
 public:
   ceph_tid_t op_submit(Op *op, int *ctx_budget = NULL);
   bool is_active() {
+    RWLock::RLocker l(rwlock);
     return !((!inflight_ops.read()) && linger_ops.empty() && poolstat_ops.empty() && statfs_ops.empty());
   }
 
@@ -2007,7 +2043,16 @@ private:
   friend class C_CancelOp;
 public:
   int op_cancel(ceph_tid_t tid, int r);
-  epoch_t op_cancel_writes(int r);
+
+  /**
+   * Any write op which is in progress at the start of this call shall no
+   * longer be in progress when this call ends.  Operations started after the
+   * start of this call may still be in progress when this call ends.
+   *
+   * @return the latest possible epoch in which a cancelled op could have
+   *         existed, or -1 if nothing was cancelled.
+   */
+  epoch_t op_cancel_writes(int r, int64_t pool=-1);
 
   // commands
   int osd_command(int osd, vector<string>& cmd,
@@ -2040,19 +2085,22 @@ public:
   Op *prepare_mutate_op(const object_t& oid, const object_locator_t& oloc,
 	       ObjectOperation& op,
 	       const SnapContext& snapc, utime_t mtime, int flags,
-	       Context *onack, Context *oncommit, version_t *objver = NULL) {
+	       Context *onack, Context *oncommit, version_t *objver = NULL,
+	       osd_reqid_t reqid = osd_reqid_t()) {
     Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->priority = op.priority;
     o->mtime = mtime;
     o->snapc = snapc;
     o->out_rval.swap(op.out_rval);
+    o->reqid = reqid;
     return o;
   }
   ceph_tid_t mutate(const object_t& oid, const object_locator_t& oloc,
 	       ObjectOperation& op,
 	       const SnapContext& snapc, utime_t mtime, int flags,
-	       Context *onack, Context *oncommit, version_t *objver = NULL) {
-    Op *o = prepare_mutate_op(oid, oloc, op, snapc, mtime, flags, onack, oncommit, objver);
+	       Context *onack, Context *oncommit, version_t *objver = NULL,
+	       osd_reqid_t reqid = osd_reqid_t()) {
+    Op *o = prepare_mutate_op(oid, oloc, op, snapc, mtime, flags, onack, oncommit, objver, reqid);
     return op_submit(o);
   }
   Op *prepare_read_op(const object_t& oid, const object_locator_t& oloc,
@@ -2171,7 +2219,8 @@ public:
   ceph_tid_t read(const object_t& oid, const object_locator_t& oloc,
 	     uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, int flags,
 	     Context *onfinish,
-	     version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+	     version_t *objver = NULL, ObjectOperation *extra_ops = NULL,
+	     int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_READ;
@@ -2179,6 +2228,7 @@ public:
     ops[i].op.extent.length = len;
     ops[i].op.extent.truncate_size = 0;
     ops[i].op.extent.truncate_seq = 0;
+    ops[i].op.flags = op_flags;
     Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_READ, onfinish, 0, objver);
     o->snapid = snap;
     o->outbl = pbl;
@@ -2188,8 +2238,9 @@ public:
   ceph_tid_t read_trunc(const object_t& oid, const object_locator_t& oloc,
 	     uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, int flags,
 	     uint64_t trunc_size, __u32 trunc_seq,
-	     Context *onfinish,
-	     version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+	     Context *onfinish, 
+	     version_t *objver = NULL, ObjectOperation *extra_ops = NULL,
+	     int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_READ;
@@ -2197,6 +2248,7 @@ public:
     ops[i].op.extent.length = len;
     ops[i].op.extent.truncate_size = trunc_size;
     ops[i].op.extent.truncate_seq = trunc_seq;
+    ops[i].op.flags = op_flags;
     Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_READ, onfinish, 0, objver);
     o->snapid = snap;
     o->outbl = pbl;
@@ -2253,7 +2305,8 @@ public:
 		  snapid_t snap, bufferlist *pbl, int flags,
 		  Context *onfinish,
 	          version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
-    return read(oid, oloc, 0, 0, snap, pbl, flags | global_op_flags.read() | CEPH_OSD_FLAG_READ, onfinish, objver);
+    return read(oid, oloc, 0, 0, snap, pbl, flags | global_op_flags.read() | CEPH_OSD_FLAG_READ,
+		onfinish, objver, extra_ops);
   }
 
 
@@ -2272,7 +2325,8 @@ public:
 	      uint64_t off, uint64_t len, const SnapContext& snapc, const bufferlist &bl,
 	      utime_t mtime, int flags,
 	      Context *onack, Context *oncommit,
-	      version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+	      version_t *objver = NULL, ObjectOperation *extra_ops = NULL,
+	      int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_WRITE;
@@ -2281,6 +2335,7 @@ public:
     ops[i].op.extent.truncate_size = 0;
     ops[i].op.extent.truncate_seq = 0;
     ops[i].indata = bl;
+    ops[i].op.flags = op_flags;
     Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
@@ -2309,7 +2364,8 @@ public:
 	      utime_t mtime, int flags,
 	      uint64_t trunc_size, __u32 trunc_seq,
 	      Context *onack, Context *oncommit,
-	      version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+	      version_t *objver = NULL, ObjectOperation *extra_ops = NULL,
+	      int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_WRITE;
@@ -2318,6 +2374,7 @@ public:
     ops[i].op.extent.truncate_size = trunc_size;
     ops[i].op.extent.truncate_seq = trunc_seq;
     ops[i].indata = bl;
+    ops[i].op.flags = op_flags;
     Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
@@ -2326,13 +2383,15 @@ public:
   ceph_tid_t write_full(const object_t& oid, const object_locator_t& oloc,
 		   const SnapContext& snapc, const bufferlist &bl, utime_t mtime, int flags,
 		   Context *onack, Context *oncommit,
-		   version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+		   version_t *objver = NULL, ObjectOperation *extra_ops = NULL,
+		   int op_flags = 0) {
     vector<OSDOp> ops;
     int i = init_ops(ops, 1, extra_ops);
     ops[i].op.op = CEPH_OSD_OP_WRITEFULL;
     ops[i].op.extent.offset = 0;
     ops[i].op.extent.length = bl.length();
     ops[i].indata = bl;
+    ops[i].op.flags = op_flags;
     Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
     o->mtime = mtime;
     o->snapc = snapc;
@@ -2409,16 +2468,6 @@ public:
     return op_submit(o);
   }
 
-  ceph_tid_t lock(const object_t& oid, const object_locator_t& oloc, int op, int flags,
-	     Context *onack, Context *oncommit, version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
-    SnapContext snapc;  // no snapc for lock ops
-    vector<OSDOp> ops;
-    int i = init_ops(ops, 1, extra_ops);
-    ops[i].op.op = op;
-    Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_WRITE, onack, oncommit, objver);
-    o->snapc = snapc;
-    return op_submit(o);
-  }
   ceph_tid_t setxattr(const object_t& oid, const object_locator_t& oloc,
 	      const char *name, const SnapContext& snapc, const bufferlist &bl,
 	      utime_t mtime, int flags,
@@ -2465,7 +2514,7 @@ public:
 private:
   void pool_op_submit(PoolOp *op);
   void _pool_op_submit(PoolOp *op);
-  void _finish_pool_op(PoolOp *op);
+  void _finish_pool_op(PoolOp *op, int r);
   void _do_delete_pool(int64_t pool, Context *onfinish);
 public:
   int create_pool_snap(int64_t pool, string& snapName, Context *onfinish);
@@ -2491,7 +2540,7 @@ public:
   void get_pool_stats(list<string>& pools, map<string,pool_stat_t> *result,
 		      Context *onfinish);
   int pool_stat_op_cancel(ceph_tid_t tid, int r);
-  void _finish_pool_stat_op(PoolStatOp *op);
+  void _finish_pool_stat_op(PoolStatOp *op, int r);
 
   // ---------------------------
   // df stats
@@ -2501,7 +2550,7 @@ public:
   void handle_fs_stats_reply(MStatfsReply *m);
   void get_fs_stats(struct ceph_statfs& result, Context *onfinish);
   int statfs_op_cancel(ceph_tid_t tid, int r);
-  void _finish_statfs_op(StatfsOp *op);
+  void _finish_statfs_op(StatfsOp *op, int r);
 
   // ---------------------------
   // some scatter/gather hackery
@@ -2527,33 +2576,36 @@ public:
   };
 
   void sg_read_trunc(vector<ObjectExtent>& extents, snapid_t snap, bufferlist *bl, int flags,
-		uint64_t trunc_size, __u32 trunc_seq, Context *onfinish) {
+		uint64_t trunc_size, __u32 trunc_seq, Context *onfinish, int op_flags = 0) {
     if (extents.size() == 1) {
       read_trunc(extents[0].oid, extents[0].oloc, extents[0].offset, extents[0].length,
-	   snap, bl, flags, extents[0].truncate_size, trunc_seq, onfinish);
+	   snap, bl, flags, extents[0].truncate_size, trunc_seq, onfinish, 
+	   0, 0, op_flags);
     } else {
       C_GatherBuilder gather(cct);
       vector<bufferlist> resultbl(extents.size());
       int i=0;
       for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
 	read_trunc(p->oid, p->oloc, p->offset, p->length,
-	     snap, &resultbl[i++], flags, p->truncate_size, trunc_seq, gather.new_sub());
+	     snap, &resultbl[i++], flags, p->truncate_size, trunc_seq, gather.new_sub(),
+	     0, 0, op_flags);
       }
       gather.set_finisher(new C_SGRead(this, extents, resultbl, bl, onfinish));
       gather.activate();
     }
   }
 
-  void sg_read(vector<ObjectExtent>& extents, snapid_t snap, bufferlist *bl, int flags, Context *onfinish) {
-    sg_read_trunc(extents, snap, bl, flags, 0, 0, onfinish);
+  void sg_read(vector<ObjectExtent>& extents, snapid_t snap, bufferlist *bl, int flags, Context *onfinish, int op_flags = 0) {
+    sg_read_trunc(extents, snap, bl, flags, 0, 0, onfinish, op_flags);
   }
 
   void sg_write_trunc(vector<ObjectExtent>& extents, const SnapContext& snapc, const bufferlist& bl, utime_t mtime,
 		int flags, uint64_t trunc_size, __u32 trunc_seq,
-		Context *onack, Context *oncommit) {
+		Context *onack, Context *oncommit, int op_flags = 0) {
     if (extents.size() == 1) {
       write_trunc(extents[0].oid, extents[0].oloc, extents[0].offset, extents[0].length,
-	    snapc, bl, mtime, flags, extents[0].truncate_size, trunc_seq, onack, oncommit);
+	    snapc, bl, mtime, flags, extents[0].truncate_size, trunc_seq, onack, oncommit,
+	    0, 0, op_flags);
     } else {
       C_GatherBuilder gack(cct, onack);
       C_GatherBuilder gcom(cct, oncommit);
@@ -2567,7 +2619,8 @@ public:
 	write_trunc(p->oid, p->oloc, p->offset, p->length, 
 	      snapc, cur, mtime, flags, p->truncate_size, trunc_seq,
 	      onack ? gack.new_sub():0,
-	      oncommit ? gcom.new_sub():0);
+	      oncommit ? gcom.new_sub():0,
+	      0, 0, op_flags);
       }
       gack.activate();
       gcom.activate();
@@ -2575,8 +2628,8 @@ public:
   }
 
   void sg_write(vector<ObjectExtent>& extents, const SnapContext& snapc, const bufferlist& bl, utime_t mtime,
-		int flags, Context *onack, Context *oncommit) {
-    sg_write_trunc(extents, snapc, bl, mtime, flags, 0, 0, onack, oncommit);
+		int flags, Context *onack, Context *oncommit, int op_flags = 0) {
+    sg_write_trunc(extents, snapc, bl, mtime, flags, 0, 0, onack, oncommit, op_flags);
   }
 
   void ms_handle_connect(Connection *con);
diff --git a/src/osdc/Striper.cc b/src/osdc/Striper.cc
index 4a855dd..a8682de 100644
--- a/src/osdc/Striper.cc
+++ b/src/osdc/Striper.cc
@@ -59,6 +59,10 @@ void Striper::file_to_extents(CephContext *cct, const char *object_format,
   __u32 su = layout->fl_stripe_unit;
   __u32 stripe_count = layout->fl_stripe_count;
   assert(object_size >= su);
+  if (stripe_count == 1) {
+    ldout(cct, 20) << " sc is one, reset su to os" << dendl;
+    su = object_size;
+  }
   uint64_t stripes_per_object = object_size / su;
   ldout(cct, 20) << " su " << su << " sc " << stripe_count << " os " << object_size
 		 << " stripes_per_object " << stripes_per_object << dendl;
@@ -218,11 +222,11 @@ uint64_t Striper::get_num_objects(const ceph_file_layout& layout, uint64_t size)
   __u32 object_size = layout.fl_object_size;
   __u32 stripe_unit = layout.fl_stripe_unit;
   __u32 stripe_count = layout.fl_stripe_count;
-  uint64_t period = stripe_count * object_size;
+  uint64_t period = (uint64_t)stripe_count * object_size;
   uint64_t num_periods = (size + period - 1) / period;
   uint64_t remainder_bytes = size % period;
   uint64_t remainder_objs = 0;
-  if ((remainder_bytes > 0) && (remainder_bytes < stripe_count * stripe_unit))
+  if ((remainder_bytes > 0) && (remainder_bytes < (uint64_t)stripe_count * stripe_unit))
     remainder_objs = stripe_count - ((remainder_bytes + stripe_unit - 1) / stripe_unit);
   return num_periods * stripe_count - remainder_objs;
 }
diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h
index fe7d977..cbcf20d 100644
--- a/src/osdc/WritebackHandler.h
+++ b/src/osdc/WritebackHandler.h
@@ -33,10 +33,6 @@ class WritebackHandler {
 			   const bufferlist &bl, utime_t mtime,
 			   uint64_t trunc_size, __u32 trunc_seq,
 			   Context *oncommit) = 0;
-  virtual ceph_tid_t lock(const object_t& oid, const object_locator_t& oloc,
-			  int op, int flags, Context *onack, Context *oncommit) {
-    assert(0 == "this WritebackHandler does not support the lock operation");
-  }
 
   virtual void get_client_lock() {}
   virtual void put_client_lock() {}
diff --git a/src/pybind/ceph_argparse.py b/src/pybind/ceph_argparse.py
index 12f0b70..9a83057 100644
--- a/src/pybind/ceph_argparse.py
+++ b/src/pybind/ceph_argparse.py
@@ -20,48 +20,56 @@ import sys
 import types
 import uuid
 
+
 class ArgumentError(Exception):
     """
     Something wrong with arguments
     """
     pass
 
+
 class ArgumentNumber(ArgumentError):
     """
     Wrong number of a repeated argument
     """
     pass
 
+
 class ArgumentFormat(ArgumentError):
     """
     Argument value has wrong format
     """
     pass
 
+
 class ArgumentValid(ArgumentError):
     """
     Argument value is otherwise invalid (doesn't match choices, for instance)
     """
     pass
 
+
 class ArgumentTooFew(ArgumentError):
     """
     Fewer arguments than descriptors in signature; may mean to continue
     the search, so gets a special exception type
     """
 
+
 class ArgumentPrefix(ArgumentError):
     """
     Special for mismatched prefix; less severe, don't report by default
     """
     pass
 
+
 class JsonFormat(Exception):
     """
     some syntactic or semantic issue with the JSON
     """
     pass
 
+
 class CephArgtype(object):
     """
     Base class for all Ceph argument types
@@ -110,6 +118,7 @@ class CephArgtype(object):
         """
         return '<{0}>'.format(self.__class__.__name__)
 
+
 class CephInt(CephArgtype):
     """
     range-limited integers, [+|-][0-9]+ or 0x[0-9a-f]+
@@ -178,6 +187,7 @@ class CephFloat(CephArgtype):
             r = '[{0}-{1}]'.format(self.range[0], self.range[1])
         return '<float{0}>'.format(r)
 
+
 class CephString(CephArgtype):
     """
     String; pretty generic.  goodchars is a RE char class of valid chars
@@ -187,8 +197,8 @@ class CephString(CephArgtype):
         try:
             re.compile(goodchars)
         except:
-            raise ValueError('CephString(): "{0}" is not a valid RE'.\
-                format(goodchars))
+            raise ValueError('CephString(): "{0}" is not a valid RE'.
+                             format(goodchars))
         self.goodchars = goodchars
         self.goodset = frozenset(
             [c for c in printable if re.match(goodchars, c)]
@@ -197,8 +207,8 @@ class CephString(CephArgtype):
     def valid(self, s, partial=False):
         sset = set(s)
         if self.goodset and not sset <= self.goodset:
-            raise ArgumentFormat("invalid chars {0} in {1}".\
-                format(''.join(sset - self.goodset), s))
+            raise ArgumentFormat("invalid chars {0} in {1}".
+                                 format(''.join(sset - self.goodset), s))
         self.val = s
 
     def __str__(self):
@@ -207,6 +217,7 @@ class CephString(CephArgtype):
             b += '(goodchars {0})'.format(self.goodchars)
         return '<string{0}>'.format(b)
 
+
 class CephSocketpath(CephArgtype):
     """
     Admin socket path; check that it's readable and S_ISSOCK
@@ -220,6 +231,7 @@ class CephSocketpath(CephArgtype):
     def __str__(self):
         return '<admin-socket-path>'
 
+
 class CephIPAddr(CephArgtype):
     """
     IP address (v4 or v6) with optional port
@@ -235,7 +247,7 @@ class CephIPAddr(CephArgtype):
             port = s.find(':')
             if (port != -1):
                 a = s[:port]
-                p = s[port+1:]
+                p = s[port + 1:]
                 if int(p) > 65535:
                     raise ArgumentValid('{0}: invalid IPv4 port'.format(p))
             else:
@@ -251,9 +263,9 @@ class CephIPAddr(CephArgtype):
                 end = s.find(']')
                 if end == -1:
                     raise ArgumentFormat('{0} missing terminating ]'.format(s))
-                if s[end+1] == ':':
+                if s[end + 1] == ':':
                     try:
-                        p = int(s[end+2])
+                        p = int(s[end + 2])
                     except:
                         raise ArgumentValid('{0}: bad port number'.format(s))
                 a = s[1:end]
@@ -273,6 +285,7 @@ class CephIPAddr(CephArgtype):
     def __str__(self):
         return '<IPaddr[:port]>'
 
+
 class CephEntityAddr(CephIPAddr):
     """
     EntityAddress, that is, IP address[/nonce]
@@ -292,7 +305,7 @@ class CephEntityAddr(CephIPAddr):
                 pass
             if nonce_long is None or nonce_long < 0:
                 raise ArgumentValid(
-                    '{0}: invalid entity, nonce {1} not integer > 0'.\
+                    '{0}: invalid entity, nonce {1} not integer > 0'.
                     format(s, nonce)
                 )
         self.val = s
@@ -300,6 +313,7 @@ class CephEntityAddr(CephIPAddr):
     def __str__(self):
         return '<EntityAddr>'
 
+
 class CephPoolname(CephArgtype):
     """
     Pool name; very little utility
@@ -307,6 +321,7 @@ class CephPoolname(CephArgtype):
     def __str__(self):
         return '<poolname>'
 
+
 class CephObjectname(CephArgtype):
     """
     Object name.  Maybe should be combined with Pool name as they're always
@@ -315,6 +330,7 @@ class CephObjectname(CephArgtype):
     def __str__(self):
         return '<objectname>'
 
+
 class CephPgid(CephArgtype):
     """
     pgid, in form N.xxx (N = pool number, xxx = hex pgnum)
@@ -322,18 +338,19 @@ class CephPgid(CephArgtype):
     def valid(self, s, partial=False):
         if s.find('.') == -1:
             raise ArgumentFormat('pgid has no .')
-        poolid, pgnum = s.split('.')
+        poolid, pgnum = s.split('.', 1)
         if poolid < 0:
             raise ArgumentFormat('pool {0} < 0'.format(poolid))
         try:
             pgnum = int(pgnum, 16)
-        except:
+        except ValueError:
             raise ArgumentFormat('pgnum {0} not hex integer'.format(pgnum))
         self.val = s
 
     def __str__(self):
         return '<pgid>'
 
+
 class CephName(CephArgtype):
     """
     Name (type.id) where:
@@ -353,8 +370,8 @@ class CephName(CephArgtype):
         if s.find('.') == -1:
             raise ArgumentFormat('CephName: no . in {0}'.format(s))
         else:
-            t, i = s.split('.')
-            if not t in ('osd', 'mon', 'client', 'mds'):
+            t, i = s.split('.', 1)
+            if t not in ('osd', 'mon', 'client', 'mds'):
                 raise ArgumentValid('unknown type ' + t)
             if t == 'osd':
                 if i != '*':
@@ -369,6 +386,7 @@ class CephName(CephArgtype):
     def __str__(self):
         return '<name (type.id)>'
 
+
 class CephOsdName(CephArgtype):
     """
     Like CephName, but specific to osds: allow <id> alone
@@ -384,7 +402,7 @@ class CephOsdName(CephArgtype):
             self.val = s
             return
         if s.find('.') != -1:
-            t, i = s.split('.')
+            t, i = s.split('.', 1)
             if t != 'osd':
                 raise ArgumentValid('unknown type ' + t)
         else:
@@ -401,6 +419,7 @@ class CephOsdName(CephArgtype):
     def __str__(self):
         return '<osdname (id|osd.id)>'
 
+
 class CephChoices(CephArgtype):
     """
     Set of string literals; init with valid choices
@@ -410,7 +429,7 @@ class CephChoices(CephArgtype):
 
     def valid(self, s, partial=False):
         if not partial:
-            if not s in self.strings:
+            if s not in self.strings:
                 # show as __str__ does: {s1|s2..}
                 raise ArgumentValid("{0} not in {1}".format(s, self))
             self.val = s
@@ -429,6 +448,7 @@ class CephChoices(CephArgtype):
         else:
             return '{0}'.format('|'.join(self.strings))
 
+
 class CephFilepath(CephArgtype):
     """
     Openable file
@@ -444,6 +464,7 @@ class CephFilepath(CephArgtype):
     def __str__(self):
         return '<outfilename>'
 
+
 class CephFragment(CephArgtype):
     """
     'Fragment' ??? XXX
@@ -492,6 +513,13 @@ class CephPrefix(CephArgtype):
         self.prefix = prefix
 
     def valid(self, s, partial=False):
+        try:
+            # `prefix` can always be converted into unicode when being compared,
+            # but `s` could be anything passed by user.
+            s = unicode(s)
+        except UnicodeDecodeError:
+            raise ArgumentPrefix("no match for {0}".format(s))
+
         if partial:
             if self.prefix.startswith(s):
                 self.val = s
@@ -530,7 +558,7 @@ class argdesc(object):
     def __init__(self, t, name=None, n=1, req=True, **kwargs):
         if isinstance(t, types.StringTypes):
             self.t = CephPrefix
-            self.typeargs = {'prefix':t}
+            self.typeargs = {'prefix': t}
             self.req = True
         else:
             self.t = t
@@ -562,7 +590,7 @@ class argdesc(object):
 
     def __str__(self):
         if ((self.t == CephChoices and len(self.instance.strings) == 1)
-            or (self.t == CephPrefix)):
+           or (self.t == CephPrefix)):
             s = str(self.instance)
         else:
             s = '{0}({1})'.format(self.name, str(self.instance))
@@ -588,12 +616,14 @@ class argdesc(object):
             s = '{' + s + '}'
         return s
 
+
 def concise_sig(sig):
     """
     Return string representation of sig useful for syntax reference in help
     """
     return ' '.join([d.helpstr() for d in sig])
 
+
 def descsort(sh1, sh2):
     """
     sort descriptors by prefixes, defined as the concatenation of all simple
@@ -601,6 +631,7 @@ def descsort(sh1, sh2):
     """
     return cmp(concise_sig(sh1['sig']), concise_sig(sh2['sig']))
 
+
 def parse_funcsig(sig):
     """
     parse a single descriptor (array of strings or dicts) into a
@@ -612,10 +643,10 @@ def parse_funcsig(sig):
         argnum += 1
         if isinstance(desc, types.StringTypes):
             t = CephPrefix
-            desc = {'type':t, 'name':'prefix', 'prefix':desc}
+            desc = {'type': t, 'name': 'prefix', 'prefix': desc}
         else:
             # not a simple string, must be dict
-            if not 'type' in desc:
+            if 'type' not in desc:
                 s = 'JSON descriptor {0} has no type'.format(sig)
                 raise JsonFormat(s)
             # look up type string in our globals() dict; if it's an
@@ -623,7 +654,7 @@ def parse_funcsig(sig):
             # locally-defined class. otherwise, we haven't a clue.
             if desc['type'] in globals():
                 t = globals()[desc['type']]
-                if type(t) != types.TypeType:
+                if not isinstance(t, types.TypeType):
                     s = 'unknown type {0}'.format(desc['type'])
                     raise JsonFormat(s)
             else:
@@ -680,12 +711,12 @@ def parse_json_funcsigs(s, consumer):
         raise e
     sigdict = {}
     for cmdtag, cmd in overall.iteritems():
-        if not 'sig' in cmd:
+        if 'sig' not in cmd:
             s = "JSON descriptor {0} has no 'sig'".format(cmdtag)
             raise JsonFormat(s)
         # check 'avail' and possibly ignore this command
         if 'avail' in cmd:
-            if not consumer in cmd['avail']:
+            if consumer not in cmd['avail']:
                 continue
         # rewrite the 'sig' item with the argdesc-ized version, and...
         cmd['sig'] = parse_funcsig(cmd['sig'])
@@ -693,6 +724,7 @@ def parse_json_funcsigs(s, consumer):
         sigdict[cmdtag] = cmd
     return sigdict
 
+
 def validate_one(word, desc, partial=False):
     """
     validate_one(word, desc, partial=False)
@@ -707,6 +739,7 @@ def validate_one(word, desc, partial=False):
     if desc.N:
         desc.n = desc.numseen + 1
 
+
 def matchnum(args, signature, partial=False):
     """
     matchnum(s, signature, partial=False)
@@ -748,6 +781,7 @@ def matchnum(args, signature, partial=False):
             matchcnt += 1
     return matchcnt
 
+
 def get_next_arg(desc, args):
     '''
     Get either the value matching key 'desc.name' or the next arg in
@@ -773,6 +807,7 @@ def get_next_arg(desc, args):
             arg = arg[0]
     return arg
 
+
 def store_arg(desc, d):
     '''
     Store argument described by, and held in, thanks to valid(),
@@ -797,6 +832,7 @@ def store_arg(desc, d):
         # if first CephPrefix or any other type, just set it
         d[desc.name] = desc.instance.val
 
+
 def validate(args, signature, partial=False):
     """
     validate(args, signature, partial=False)
@@ -839,8 +875,8 @@ def validate(args, signature, partial=False):
                     if partial:
                         return d
                     raise ArgumentNumber(
-                        'saw {0} of {1}, expected at least 1'.\
-                         format(desc.numseen, desc)
+                        'saw {0} of {1}, expected at least 1'.
+                        format(desc.numseen, desc)
                     )
                 elif not desc.N and desc.numseen < desc.n:
                     # wanted n, got too few
@@ -852,7 +888,7 @@ def validate(args, signature, partial=False):
                             'missing required parameter {0}'.format(desc)
                         )
                     raise ArgumentNumber(
-                        'saw {0} of {1}, expected {2}'.\
+                        'saw {0} of {1}, expected {2}'.
                         format(desc.numseen, desc, desc.n)
                     )
                 break
@@ -895,11 +931,13 @@ def validate(args, signature, partial=False):
     # Finally, success
     return d
 
+
 def cmdsiglen(sig):
     sigdict = sig.values()
     assert len(sigdict) == 1
     return len(sig.values()[0]['sig'])
 
+
 def validate_command(sigdict, args, verbose=False):
     """
     turn args into a valid dictionary ready to be sent off as JSON,
@@ -921,21 +959,23 @@ def validate_command(sigdict, args, verbose=False):
             if (matched > best_match_cnt):
                 if verbose:
                     print >> sys.stderr, \
-                        "better match: {0} > {1}: {2}:{3} ".format(matched,
-                                      best_match_cnt, cmdtag, concise_sig(sig))
+                        "better match: {0} > {1}: {2}:{3} ".\
+                        format(matched, best_match_cnt, cmdtag,
+                               concise_sig(sig))
                 best_match_cnt = matched
-                bestcmds = [{cmdtag:cmd}]
+                bestcmds = [{cmdtag: cmd}]
             elif matched == best_match_cnt:
                 if verbose:
                     print >> sys.stderr, \
-                        "equal match: {0} > {1}: {2}:{3} ".format(matched,
-                                      best_match_cnt, cmdtag, concise_sig(sig))
-                bestcmds.append({cmdtag:cmd})
+                        "equal match: {0} > {1}: {2}:{3} ".\
+                        format(matched, best_match_cnt, cmdtag,
+                               concise_sig(sig))
+                bestcmds.append({cmdtag: cmd})
 
         # Sort bestcmds by number of args so we can try shortest first
         # (relies on a cmdsig being key,val where val is a list of len 1)
         bestcmds_sorted = sorted(bestcmds,
-                                 cmp=lambda x,y:cmp(cmdsiglen(x), cmdsiglen(y)))
+                                 cmp=lambda x, y: cmp(cmdsiglen(x), cmdsiglen(y)))
 
         if verbose:
             print >> sys.stderr, "bestcmds_sorted: "
@@ -960,7 +1000,7 @@ def validate_command(sigdict, args, verbose=False):
                     # the next one matches completely.  Whine, but pass.
                     if verbose:
                         print >> sys.stderr, 'Not enough args supplied for ', \
-                                              concise_sig(sig)
+                            concise_sig(sig)
                 except ArgumentError as e:
                     # Solid mismatch on an arg (type, range, etc.)
                     # Stop now, because we have the right command but
@@ -980,6 +1020,7 @@ def validate_command(sigdict, args, verbose=False):
 
         return valid_dict
 
+
 def find_cmd_target(childargs):
     """
     Using a minimal validation, figure out whether the command
@@ -988,7 +1029,7 @@ def find_cmd_target(childargs):
     right daemon.
     Returns ('osd', osdid), ('pg', pgid), or ('mon', '')
     """
-    sig = parse_funcsig(['tell', {'name':'target', 'type':'CephName'}])
+    sig = parse_funcsig(['tell', {'name': 'target', 'type': 'CephName'}])
     try:
         valid_dict = validate(childargs, sig, partial=True)
     except ArgumentError:
@@ -1002,7 +1043,7 @@ def find_cmd_target(childargs):
             name.valid(valid_dict['target'])
             return name.nametype, name.nameid
 
-    sig = parse_funcsig(['tell', {'name':'pgid', 'type':'CephPgid'}])
+    sig = parse_funcsig(['tell', {'name': 'pgid', 'type': 'CephPgid'}])
     try:
         valid_dict = validate(childargs, sig, partial=True)
     except ArgumentError:
@@ -1036,7 +1077,7 @@ def find_cmd_target(childargs):
         name.valid(childargs[1])
         return name.nametype, name.nameid
 
-    sig = parse_funcsig(['pg', {'name':'pgid', 'type':'CephPgid'}])
+    sig = parse_funcsig(['pg', {'name': 'pgid', 'type': 'CephPgid'}])
     try:
         valid_dict = validate(childargs, sig, partial=True)
     except ArgumentError:
@@ -1047,6 +1088,7 @@ def find_cmd_target(childargs):
 
     return 'mon', ''
 
+
 def send_command(cluster, target=('mon', ''), cmd=None, inbuf='', timeout=0,
                  verbose=False):
     """
@@ -1125,6 +1167,7 @@ def send_command(cluster, target=('mon', ''), cmd=None, inbuf='', timeout=0,
 
     return ret, outbuf, outs
 
+
 def json_command(cluster, target=('mon', ''), prefix=None, argdict=None,
                  inbuf='', timeout=0, verbose=False):
     """
@@ -1136,7 +1179,7 @@ def json_command(cluster, target=('mon', ''), prefix=None, argdict=None,
     """
     cmddict = {}
     if prefix:
-        cmddict.update({'prefix':prefix})
+        cmddict.update({'prefix': prefix})
     if argdict:
         cmddict.update(argdict)
 
@@ -1168,4 +1211,3 @@ def json_command(cluster, target=('mon', ''), prefix=None, argdict=None,
 
     return ret, outbuf, outs
 
-
diff --git a/src/pybind/ceph_daemon.py b/src/pybind/ceph_daemon.py
new file mode 100755
index 0000000..638ef89
--- /dev/null
+++ b/src/pybind/ceph_daemon.py
@@ -0,0 +1,278 @@
+# -*- mode:python -*-
+# vim: ts=4 sw=4 smarttab expandtab
+
+"""
+Copyright (C) 2015 Red Hat
+
+This is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License version 2, as published by the Free Software
+Foundation.  See file COPYING.
+"""
+
+import sys
+import json
+import socket
+import struct
+import time
+from collections import defaultdict
+
+from ceph_argparse import parse_json_funcsigs, validate_command
+
+COUNTER = 0x8
+LONG_RUNNING_AVG = 0x4
+
+
+def admin_socket(asok_path, cmd, format=''):
+    """
+    Send a daemon (--admin-daemon) command 'cmd'.  asok_path is the
+    path to the admin socket; cmd is a list of strings; format may be
+    set to one of the formatted forms to get output in that form
+    (daemon commands don't support 'plain' output).
+    """
+
+    def do_sockio(path, cmd_bytes):
+        """ helper: do all the actual low-level stream I/O """
+        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        sock.connect(path)
+        try:
+            sock.sendall(cmd_bytes + '\0')
+            len_str = sock.recv(4)
+            if len(len_str) < 4:
+                raise RuntimeError("no data returned from admin socket")
+            l, = struct.unpack(">I", len_str)
+            sock_ret = ''
+
+            got = 0
+            while got < l:
+                bit = sock.recv(l - got)
+                sock_ret += bit
+                got += len(bit)
+
+        except Exception as sock_e:
+            raise RuntimeError('exception: ' + str(sock_e))
+        return sock_ret
+
+    try:
+        cmd_json = do_sockio(asok_path,
+                             json.dumps({"prefix": "get_command_descriptions"}))
+    except Exception as e:
+        raise RuntimeError('exception getting command descriptions: ' + str(e))
+
+    if cmd == 'get_command_descriptions':
+        return cmd_json
+
+    sigdict = parse_json_funcsigs(cmd_json, 'cli')
+    valid_dict = validate_command(sigdict, cmd)
+    if not valid_dict:
+        raise RuntimeError('invalid command')
+
+    if format:
+        valid_dict['format'] = format
+
+    try:
+        ret = do_sockio(asok_path, json.dumps(valid_dict))
+    except Exception as e:
+        raise RuntimeError('exception: ' + str(e))
+
+    return ret
+
+
+class DaemonWatcher(object):
+    """
+    Given a Ceph daemon's admin socket path, poll its performance counters
+    and output a series of output lines showing the momentary values of
+    counters of interest (those with the 'nick' property in Ceph's schema)
+    """
+    (
+        BLACK,
+        RED,
+        GREEN,
+        YELLOW,
+        BLUE,
+        MAGENTA,
+        CYAN,
+        GRAY
+    ) = range(8)
+
+    RESET_SEQ = "\033[0m"
+    COLOR_SEQ = "\033[1;%dm"
+    COLOR_DARK_SEQ = "\033[0;%dm"
+    BOLD_SEQ = "\033[1m"
+    UNDERLINE_SEQ = "\033[4m"
+
+    def __init__(self, asok):
+        self.asok_path = asok
+        self._colored = False
+
+        self._stats = None
+        self._schema = None
+
+    def supports_color(self, ostr):
+        """
+        Returns True if the running system's terminal supports color, and False
+        otherwise.
+        """
+        unsupported_platform = (sys.platform in ('win32', 'Pocket PC'))
+        # isatty is not always implemented, #6223.
+        is_a_tty = hasattr(ostr, 'isatty') and ostr.isatty()
+        if unsupported_platform or not is_a_tty:
+            return False
+        return True
+
+    def colorize(self, msg, color, dark=False):
+        """
+        Decorate `msg` with escape sequences to give the requested color
+        """
+        return (self.COLOR_DARK_SEQ if dark else self.COLOR_SEQ) % (30 + color) \
+               + msg + self.RESET_SEQ
+
+    def bold(self, msg):
+        """
+        Decorate `msg` with escape sequences to make it appear bold
+        """
+        return self.BOLD_SEQ + msg + self.RESET_SEQ
+
+    def format_dimless(self, n, width):
+        """
+        Format a number without units, so as to fit into `width` characters, substituting
+        an appropriate unit suffix.
+        """
+        units = [' ', 'k', 'M', 'G', 'T', 'P']
+        unit = 0
+        while len("%s" % (int(n) / (1000**unit))) > width - 1:
+            unit += 1
+
+        if unit > 0:
+            truncated_float = ("%f" % (n / (1000.0 ** unit)))[0:width - 1]
+            if truncated_float[-1] == '.':
+                truncated_float = " " + truncated_float[0:-1]
+        else:
+            truncated_float = "%{wid}d".format(wid=width-1) % n
+        formatted = "%s%s" % (truncated_float, units[unit])
+
+        if self._colored:
+            if n == 0:
+                color = self.BLACK, False
+            else:
+                color = self.YELLOW, False
+            return self.bold(self.colorize(formatted[0:-1], color[0], color[1])) \
+                + self.bold(self.colorize(formatted[-1], self.BLACK, False))
+        else:
+            return formatted
+
+    def col_width(self, nick):
+        """
+        Given the short name `nick` for a column, how many characters
+        of width should the column be allocated?  Does not include spacing
+        between columns.
+        """
+        return max(len(nick), 4)
+
+    def _print_headers(self, ostr):
+        """
+        Print a header row to `ostr`
+        """
+        header = ""
+        for section_name, names in self._stats.items():
+            section_width = sum([self.col_width(x)+1 for x in names.values()]) - 1
+            pad = max(section_width - len(section_name), 0)
+            pad_prefix = pad / 2
+            header += (pad_prefix * '-')
+            header += (section_name[0:section_width])
+            header += ((pad - pad_prefix) * '-')
+            header += ' '
+        header += "\n"
+        ostr.write(self.colorize(header, self.BLUE, True))
+
+        sub_header = ""
+        for section_name, names in self._stats.items():
+            for stat_name, stat_nick in names.items():
+                sub_header += self.UNDERLINE_SEQ \
+                              + self.colorize(
+                                    stat_nick.ljust(self.col_width(stat_nick)),
+                                    self.BLUE) \
+                              + ' '
+            sub_header = sub_header[0:-1] + self.colorize('|', self.BLUE)
+        sub_header += "\n"
+        ostr.write(sub_header)
+
+    def _print_vals(self, ostr, dump, last_dump):
+        """
+        Print a single row of values to `ostr`, based on deltas between `dump` and
+        `last_dump`.
+        """
+        val_row = ""
+        for section_name, names in self._stats.items():
+            for stat_name, stat_nick in names.items():
+                stat_type = self._schema[section_name][stat_name]['type']
+                if bool(stat_type & COUNTER):
+                    n = max(dump[section_name][stat_name] -
+                            last_dump[section_name][stat_name], 0)
+                elif bool(stat_type & LONG_RUNNING_AVG):
+                    entries = dump[section_name][stat_name]['avgcount'] - \
+                            last_dump[section_name][stat_name]['avgcount']
+                    if entries:
+                        n = (dump[section_name][stat_name]['sum'] -
+                             last_dump[section_name][stat_name]['sum']) \
+                            / float(entries)
+                        n *= 1000.0  # Present in milliseconds
+                    else:
+                        n = 0
+                else:
+                    n = dump[section_name][stat_name]
+
+                val_row += self.format_dimless(n, self.col_width(stat_nick))
+                val_row += " "
+            val_row = val_row[0:-1]
+            val_row += self.colorize("|", self.BLUE)
+        val_row = val_row[0:-len(self.colorize("|", self.BLUE))]
+        ostr.write("{0}\n".format(val_row))
+
+    def _load_schema(self):
+        """
+        Populate our instance-local copy of the daemon's performance counter
+        schema, and work out which stats we will display.
+        """
+        self._schema = json.loads(admin_socket(self.asok_path, ["perf", "schema"]))
+
+        # Build list of which stats we will display, based on which
+        # stats have a nickname
+        self._stats = defaultdict(dict)
+        for section_name, section_stats in self._schema.items():
+            for name, schema_data in section_stats.items():
+                if schema_data.get('nick'):
+                    self._stats[section_name][name] = schema_data['nick']
+
+    def run(self, interval, count=None, ostr=sys.stdout):
+        """
+        Print output at regular intervals until interrupted.
+
+        :param ostr: Stream to which to send output
+        """
+
+        self._load_schema()
+        self._colored = self.supports_color(ostr)
+
+        self._print_headers(ostr)
+
+        last_dump = json.loads(admin_socket(self.asok_path, ["perf", "dump"]))
+        rows_since_header = 0
+        term_height = 25
+
+        try:
+            while True:
+                dump = json.loads(admin_socket(self.asok_path, ["perf", "dump"]))
+                if rows_since_header > term_height - 2:
+                    self._print_headers(ostr)
+                    rows_since_header = 0
+                self._print_vals(ostr, dump, last_dump)
+                if count is not None:
+                    count -= 1
+                    if count <= 0:
+                        break
+                rows_since_header += 1
+                last_dump = dump
+                time.sleep(interval)
+        except KeyboardInterrupt:
+            return
diff --git a/src/pybind/ceph_rest_api.py b/src/pybind/ceph_rest_api.py
index 77adbe7..7792013 100755
--- a/src/pybind/ceph_rest_api.py
+++ b/src/pybind/ceph_rest_api.py
@@ -34,13 +34,14 @@ APPNAME = '__main__'
 app = flask.Flask(APPNAME)
 
 LOGLEVELS = {
-    'critical':logging.CRITICAL,
-    'error':logging.ERROR,
-    'warning':logging.WARNING,
-    'info':logging.INFO,
-    'debug':logging.DEBUG,
+    'critical': logging.CRITICAL,
+    'error': logging.ERROR,
+    'warning': logging.WARNING,
+    'info': logging.INFO,
+    'debug': logging.DEBUG,
 }
 
+
 def find_up_osd(app):
     '''
     Find an up OSD.  Return the last one that's up.
@@ -60,7 +61,8 @@ def find_up_osd(app):
     return int(osds[-1])
 
 
-METHOD_DICT = {'r':['GET'], 'w':['PUT', 'DELETE']}
+METHOD_DICT = {'r': ['GET'], 'w': ['PUT', 'DELETE']}
+
 
 def api_setup(app, conf, cluster, clientname, clientid, args):
     '''
@@ -72,7 +74,7 @@ def api_setup(app, conf, cluster, clientname, clientid, args):
     signatures, module, perms, and help; stuff them away in the app.ceph_urls
     dict.  Also save app.ceph_sigdict for help() handling.
     '''
-    def get_command_descriptions(cluster, target=('mon','')):
+    def get_command_descriptions(cluster, target=('mon', '')):
         ret, outbuf, outs = json_command(cluster, target,
                                          prefix='get_command_descriptions',
                                          timeout=30)
@@ -104,7 +106,7 @@ def api_setup(app, conf, cluster, clientname, clientid, args):
     app.ceph_cluster.connect()
 
     app.ceph_baseurl = app.ceph_cluster.conf_get('restapi_base_url') \
-         or DEFAULT_BASEURL
+        or DEFAULT_BASEURL
     if app.ceph_baseurl.endswith('/'):
         app.ceph_baseurl = app.ceph_baseurl[:-1]
     addr = app.ceph_cluster.conf_get('public_addr') or DEFAULT_ADDR
@@ -171,13 +173,12 @@ def api_setup(app, conf, cluster, clientname, clientid, args):
         for k in METHOD_DICT.iterkeys():
             if k in perm:
                 methods = METHOD_DICT[k]
-        urldict = {'paramsig':params,
-                   'help':cmddict['help'],
-                   'module':cmddict['module'],
-                   'perm':perm,
-                   'flavor':flavor,
-                   'methods':methods,
-                  }
+        urldict = {'paramsig': params,
+                   'help': cmddict['help'],
+                   'module': cmddict['module'],
+                   'perm': perm,
+                   'flavor': flavor,
+                   'methods': methods, }
 
         # app.ceph_urls contains a list of urldicts (usually only one long)
         if url not in app.ceph_urls:
@@ -216,7 +217,7 @@ def generate_url_and_params(app, sig, flavor):
     # tack it onto the front of sig
     if flavor == 'tell':
         tellsig = parse_funcsig(['tell',
-                                {'name':'target', 'type':'CephOsdName'}])
+                                {'name': 'target', 'type': 'CephOsdName'}])
         sig = tellsig + sig
 
     for desc in sig:
@@ -226,17 +227,17 @@ def generate_url_and_params(app, sig, flavor):
         # CephChoices with 1 required string (not --) do too, unless
         # we've already started collecting params, in which case they
         # too are params
-        elif desc.t == CephChoices and \
-             len(desc.instance.strings) == 1 and \
-             desc.req and \
-             not str(desc.instance).startswith('--') and \
-             not params:
+        elif (desc.t == CephChoices and
+              len(desc.instance.strings) == 1 and
+              desc.req and
+              not str(desc.instance).startswith('--') and
+              not params):
             url += '/' + str(desc.instance)
         else:
             # tell/<target> is a weird case; the URL includes what
             # would everywhere else be a parameter
-            if flavor == 'tell' and  \
-              (desc.t, desc.name) == (CephOsdName, 'target'):
+            if flavor == 'tell' and ((desc.t, desc.name) ==
+               (CephOsdName, 'target')):
                 url += '/<target>'
             else:
                 params.append(desc)
@@ -247,7 +248,6 @@ def generate_url_and_params(app, sig, flavor):
 #
 # end setup (import-time) functions, begin request-time functions
 #
-
 def concise_sig_for_uri(sig, flavor):
     '''
     Return a generic description of how one would send a REST request for sig
@@ -267,6 +267,7 @@ def concise_sig_for_uri(sig, flavor):
         ret += '?' + '&'.join(args)
     return ret
 
+
 def show_human_help(prefix):
     '''
     Dump table showing commands matching prefix
@@ -274,7 +275,7 @@ def show_human_help(prefix):
     # XXX There ought to be a better discovery mechanism than an HTML table
     s = '<html><body><table border=1><th>Possible commands:</th><th>Method</th><th>Description</th>'
 
-    permmap = {'r':'GET', 'rw':'PUT', 'rx':'GET', 'rwx':'PUT'}
+    permmap = {'r': 'GET', 'rw': 'PUT', 'rx': 'GET', 'rwx': 'PUT'}
     line = ''
     for cmdsig in sorted(app.ceph_sigdict.itervalues(), cmp=descsort):
         concise = concise_sig(cmdsig['sig'])
@@ -301,6 +302,7 @@ def show_human_help(prefix):
     else:
         return ''
 
+
 @app.before_request
 def log_request():
     '''
@@ -309,10 +311,12 @@ def log_request():
     app.logger.info(flask.request.url + " from " + flask.request.remote_addr + " " + flask.request.user_agent.string)
     app.logger.debug("Accept: %s", flask.request.accept_mimetypes.values())
 
+
 @app.route('/')
 def root_redir():
     return flask.redirect(app.ceph_baseurl)
 
+
 def make_response(fmt, output, statusmsg, errorcode):
     '''
     If formatted output, cobble up a response object that contains the
@@ -324,8 +328,8 @@ def make_response(fmt, output, statusmsg, errorcode):
         if 'json' in fmt:
             try:
                 native_output = json.loads(output or '[]')
-                response = json.dumps({"output":native_output,
-                                       "status":statusmsg})
+                response = json.dumps({"output": native_output,
+                                       "status": statusmsg})
             except:
                 return flask.make_response("Error decoding JSON from " +
                                            output, 500)
@@ -334,13 +338,13 @@ def make_response(fmt, output, statusmsg, errorcode):
             # one is tempted to do this with xml.etree, but figuring out how
             # to 'un-XML' the XML-dumped output so it can be reassembled into
             # a piece of the tree here is beyond me right now.
-            #ET = xml.etree.ElementTree
-            #resp_elem = ET.Element('response')
-            #o = ET.SubElement(resp_elem, 'output')
-            #o.text = output
-            #s = ET.SubElement(resp_elem, 'status')
-            #s.text = statusmsg
-            #response = ET.tostring(resp_elem)
+            # ET = xml.etree.ElementTree
+            # resp_elem = ET.Element('response')
+            # o = ET.SubElement(resp_elem, 'output')
+            # o.text = output
+            # s = ET.SubElement(resp_elem, 'status')
+            # s.text = statusmsg
+            # response = ET.tostring(resp_elem)
             response = '''
 <response>
   <output>
@@ -356,6 +360,7 @@ def make_response(fmt, output, statusmsg, errorcode):
 
     return flask.make_response(response, errorcode)
 
+
 def handler(catchall_path=None, fmt=None, target=None):
     '''
     Main endpoint handler; generic for every endpoint, including catchall.
@@ -374,7 +379,7 @@ def handler(catchall_path=None, fmt=None, target=None):
     if not ep.startswith(app.ceph_baseurl):
         return make_response(fmt, '', 'Page not found', 404)
 
-    rel_ep = ep[len(app.ceph_baseurl)+1:]
+    rel_ep = ep[len(app.ceph_baseurl) + 1:]
 
     # Extensions override Accept: headers override defaults
     if not fmt:
@@ -414,7 +419,7 @@ def handler(catchall_path=None, fmt=None, target=None):
         prefix = ' '.join(rel_ep.split('/')).strip()
 
     # show "match as much as you gave me" help for unknown endpoints
-    if not ep in app.ceph_urls:
+    if ep not in app.ceph_urls:
         helptext = show_human_help(prefix)
         if helptext:
             resp = flask.make_response(helptext, 400)
@@ -432,8 +437,10 @@ def handler(catchall_path=None, fmt=None, target=None):
 
         # allow '?help' for any specifically-known endpoint
         if 'help' in flask.request.args:
-            response = flask.make_response('{0}: {1}'.\
-                format(prefix + concise_sig(paramsig), urldict['help']))
+            response = flask.make_response('{0}: {1}'.
+                                           format(prefix +
+                                                  concise_sig(paramsig),
+                                                  urldict['help']))
             response.headers['Content-Type'] = 'text/plain'
             return response
 
@@ -482,12 +489,13 @@ def handler(catchall_path=None, fmt=None, target=None):
 
     response = make_response(fmt, outbuf, outs or 'OK', 200)
     if fmt:
-        contenttype = 'application/' + fmt.replace('-pretty','')
+        contenttype = 'application/' + fmt.replace('-pretty', '')
     else:
         contenttype = 'text/plain'
     response.headers['Content-Type'] = contenttype
     return response
 
+
 #
 # Main entry point from wrapper/WSGI server: call with cmdline args,
 # get back the WSGI app entry point
diff --git a/src/pybind/cephfs.py b/src/pybind/cephfs.py
index 574846f..df33da7 100644
--- a/src/pybind/cephfs.py
+++ b/src/pybind/cephfs.py
@@ -2,37 +2,58 @@
 This module is a thin wrapper around libcephfs.
 """
 from ctypes import CDLL, c_char_p, c_size_t, c_void_p, c_int, c_long, c_uint, c_ulong, \
-    create_string_buffer, byref, Structure, pointer, c_char
+    c_ushort, create_string_buffer, byref, Structure, pointer, c_char, POINTER, \
+    c_uint8, c_int64
 from ctypes.util import find_library
+from collections import namedtuple
 import errno
+import os
+
 
 class Error(Exception):
     pass
 
+
 class PermissionError(Error):
     pass
 
+
 class ObjectNotFound(Error):
     pass
 
+
 class NoData(Error):
     pass
 
+
 class ObjectExists(Error):
     pass
 
+
 class IOError(Error):
     pass
 
+
 class NoSpace(Error):
     pass
 
+
+class InvalidValue(Error):
+    pass
+
+
+class OperationNotSupported(Error):
+    pass
+
+
 class IncompleteWriteError(Error):
     pass
 
+
 class LibCephFSStateError(Error):
     pass
 
+
 def make_ex(ret, msg):
     """
     Translate a libcephfs return code into an exception.
@@ -50,7 +71,9 @@ def make_ex(ret, msg):
         errno.EIO       : IOError,
         errno.ENOSPC    : NoSpace,
         errno.EEXIST    : ObjectExists,
-        errno.ENODATA   : NoData
+        errno.ENODATA   : NoData,
+        errno.EINVAL    : InvalidValue,
+        errno.EOPNOTSUPP: OperationNotSupported,
         }
     ret = abs(ret)
     if ret in errors:
@@ -58,18 +81,28 @@ def make_ex(ret, msg):
     else:
         return Error(msg + (": error code %d" % ret))
 
+
 class cephfs_statvfs(Structure):
-    _fields_ = [("f_bsize", c_uint),
-                ("f_frsize", c_uint),
-                ("f_blocks", c_uint),
-                ("f_bfree", c_uint),
-                ("f_bavail", c_uint),
-                ("f_files", c_uint),
-                ("f_ffree", c_uint),
-                ("f_favail", c_uint),
-                ("f_fsid", c_uint),
-                ("f_flag", c_uint),
-                ("f_namemax", c_uint)]
+    _fields_ = [("f_bsize", c_ulong),
+                ("f_frsize", c_ulong),
+                ("f_blocks", c_ulong),
+                ("f_bfree", c_ulong),
+                ("f_bavail", c_ulong),
+                ("f_files", c_ulong),
+                ("f_ffree", c_ulong),
+                ("f_favail", c_ulong),
+                ("f_fsid", c_ulong),
+                ("f_flag", c_ulong),
+                ("f_namemax", c_ulong),
+                ("f_padding", c_ulong*32)]
+
+
+class cephfs_dirent(Structure):
+    _fields_ = [("d_ino", c_long),
+                ("d_off", c_ulong),
+                ("d_reclen", c_ushort),
+                ("d_type", c_uint8),
+                ("d_name", c_char*256)]
 
 # struct timespec {
 #   long int tv_sec;
@@ -79,6 +112,7 @@ class cephfs_timespec(Structure):
     _fields_ = [('tv_sec', c_long),
                 ('tv_nsec', c_long)]
 
+
 # struct stat {
 #   unsigned long st_dev;
 #   unsigned long st_ino;
@@ -97,23 +131,43 @@ class cephfs_timespec(Structure):
 #   long int __unused[3];
 # };
 class cephfs_stat(Structure):
-    _fields_ = [('st_dev', c_ulong), # ID of device containing file
-                ('st_ino', c_ulong), # inode number
-                ('st_nlink', c_ulong), # number of hard links
-                ('st_mode', c_uint), # protection
-                ('st_uid', c_uint), # user ID of owner
-                ('st_gid', c_uint), # group ID of owner
+    _fields_ = [('st_dev', c_ulong),            # ID of device containing file
+                ('st_ino', c_ulong),            # inode number
+                ('st_nlink', c_ulong),          # number of hard links
+                ('st_mode', c_uint),            # protection
+                ('st_uid', c_uint),             # user ID of owner
+                ('st_gid', c_uint),             # group ID of owner
                 ('__pad0', c_int),
-                ('st_rdev', c_ulong), # device ID (if special file)
-                ('st_size', c_long), # total size, in bytes
-                ('st_blksize', c_long), # blocksize for file system I/O
-                ('st_blocks', c_long), # number of 512B blocks allocated
-                ('st_atime', cephfs_timespec), # time of last access
-                ('st_mtime', cephfs_timespec), # time of last modification
-                ('st_ctime', cephfs_timespec), # time of last status change
+                ('st_rdev', c_ulong),           # device ID (if special file)
+                ('st_size', c_long),            # total size, in bytes
+                ('st_blksize', c_long),         # blocksize for file system I/O
+                ('st_blocks', c_long),          # num of 512B blocks allocated
+                ('st_atime', cephfs_timespec),  # time of last access
+                ('st_mtime', cephfs_timespec),  # time of last modification
+                ('st_ctime', cephfs_timespec),  # time of last status change
                 ('__unused1', c_long),
                 ('__unused2', c_long),
-                ('__unused3', c_long) ]
+                ('__unused3', c_long)]
+
+
+class DirEntry(namedtuple('DirEntry',
+               ['d_ino', 'd_off', 'd_reclen', 'd_type', 'd_name'])):
+    DT_DIR = 0x4
+    DT_REG = 0xA
+    DT_LNK = 0xC
+    def is_dir(self):
+        return self.d_type == self.DT_DIR
+
+    def is_symbol_file(self):
+        return self.d_type == self.DT_LNK
+
+    def is_file(self):
+        return self.d_type == self.DT_REG
+
+StatResult = namedtuple('StatResult',
+                        ["st_dev", "st_ino", "st_mode", "st_nlink", "st_uid",
+                         "st_gid", "st_rdev", "st_size", "st_blksize",
+                         "st_blocks", "st_atime", "st_mtime", "st_ctime"])
 
 def load_libcephfs():
     """
@@ -130,6 +184,7 @@ def load_libcephfs():
     except OSError as e:
         raise EnvironmentError("Unable to load libcephfs: %s" % e)
 
+
 class LibCephFS(object):
     """libcephfs python wrapper"""
     def require_state(self, *args):
@@ -143,11 +198,11 @@ class LibCephFS(object):
         self.libcephfs = load_libcephfs()
         self.cluster = c_void_p()
 
-        if conffile is not None and not isinstance(conffile, str):
+        if conffile is not None and not isinstance(conffile, basestring):
             raise TypeError('conffile must be a string or None')
         ret = self.libcephfs.ceph_create(byref(self.cluster), c_char_p(0))
         if ret != 0:
-            raise Error("libcephfs_initialize failed with error code: %d" %ret)
+            raise Error("libcephfs_initialize failed with error code: %d" % ret)
         self.state = "configuring"
         if conffile is not None:
             # read the default conf file when '' is given
@@ -159,7 +214,7 @@ class LibCephFS(object):
                 self.conf_set(key, value)
 
     def conf_read_file(self, conffile=None):
-        if conffile is not None and not isinstance(conffile, str):
+        if conffile is not None and not isinstance(conffile, basestring):
             raise TypeError('conffile param must be a string')
         ret = self.libcephfs.ceph_conf_read_file(self.cluster, c_char_p(conffile))
         if ret != 0:
@@ -168,8 +223,8 @@ class LibCephFS(object):
     def conf_parse_argv(self, argv):
         self.require_state("configuring")
         c_argv = (c_char_p * len(argv))(*argv)
-        ret = self.libcephfs.ceph_conf_parse_argv(
-                self.cluster, len(argv), c_argv)
+        ret = self.libcephfs.ceph_conf_parse_argv(self.cluster, len(argv),
+                                                  c_argv)
         if ret != 0:
             raise make_ex(ret, "error calling conf_parse_argv")
 
@@ -207,13 +262,13 @@ class LibCephFS(object):
 
     def conf_get(self, option):
         self.require_state("configuring", "initialized", "mounted")
-        if not isinstance(option, str):
+        if not isinstance(option, basestring):
             raise TypeError('option must be a string')
         length = 20
         while True:
             ret_buf = create_string_buffer(length)
             ret = self.libcephfs.ceph_conf_get(self.cluster, option,
-                                                ret_buf, c_size_t(length))
+                                               ret_buf, c_size_t(length))
             if ret == 0:
                 return ret_buf.value
             elif ret == -errno.ENAMETOOLONG:
@@ -225,12 +280,12 @@ class LibCephFS(object):
 
     def conf_set(self, option, val):
         self.require_state("configuring", "initialized", "mounted")
-        if not isinstance(option, str):
+        if not isinstance(option, basestring):
             raise TypeError('option must be a string')
-        if not isinstance(val, str):
+        if not isinstance(val, basestring):
             raise TypeError('val must be a string')
         ret = self.libcephfs.ceph_conf_set(self.cluster, c_char_p(option),
-                                            c_char_p(val))
+                                           c_char_p(val))
         if ret != 0:
             raise make_ex(ret, "error calling conf_set")
 
@@ -251,6 +306,8 @@ class LibCephFS(object):
         self.state = "mounted"
 
     def statfs(self, path):
+        if not isinstance(path, basestring):
+            raise TypeError('path must be a string')
         self.require_state("mounted")
         statbuf = cephfs_statvfs()
         ret = self.libcephfs.ceph_statfs(self.cluster, c_char_p(path), byref(statbuf))
@@ -266,7 +323,7 @@ class LibCephFS(object):
                 'f_favail': statbuf.f_favail,
                 'f_fsid': statbuf.f_fsid,
                 'f_flag': statbuf.f_flag,
-                'f_namemax': statbuf.f_namemax }
+                'f_namemax': statbuf.f_namemax}
 
     def sync_fs(self):
         self.require_state("mounted")
@@ -276,6 +333,7 @@ class LibCephFS(object):
 
     def getcwd(self):
         self.require_state("mounted")
+        self.libcephfs.ceph_getcwd.restype = c_char_p
         return self.libcephfs.ceph_getcwd(self.cluster)
 
     def chdir(self, path):
@@ -284,9 +342,40 @@ class LibCephFS(object):
         if ret < 0:
             raise make_ex(ret, "chdir failed")
 
+    def opendir(self, path):
+        self.require_state("mounted")
+        if not isinstance(path, basestring):
+            raise TypeError('path must be a string')
+        dir_handler = c_void_p()
+        ret = self.libcephfs.ceph_opendir(self.cluster, c_char_p(path),
+                                          pointer(dir_handler));
+        if ret < 0:
+            raise make_ex(ret, "opendir failed")
+        return dir_handler
+
+    def readdir(self, dir_handler):
+        self.require_state("mounted")
+        self.libcephfs.ceph_readdir.restype = POINTER(cephfs_dirent)
+        while True:
+            dirent = self.libcephfs.ceph_readdir(self.cluster, dir_handler)
+            if not dirent:
+                return None
+
+            return DirEntry(d_ino=dirent.contents.d_ino,
+                            d_off=dirent.contents.d_off,
+                            d_reclen=dirent.contents.d_reclen,
+                            d_type=dirent.contents.d_type,
+                            d_name=dirent.contents.d_name)
+
+    def closedir(self, dir_handler):
+        self.require_state("mounted")
+        ret = self.libcephfs.ceph_closedir(self.cluster, dir_handler)
+        if ret < 0:
+            raise make_ex(ret, "closedir failed")
+
     def mkdir(self, path, mode):
         self.require_state("mounted")
-        if not isinstance(path, str):
+        if not isinstance(path, basestring):
             raise TypeError('path must be a string')
         ret = self.libcephfs.ceph_mkdir(self.cluster, c_char_p(path), c_int(mode))
         if ret < 0:
@@ -294,23 +383,47 @@ class LibCephFS(object):
 
     def mkdirs(self, path, mode):
         self.require_state("mounted")
-        if not isinstance(path, str):
+        if not isinstance(path, basestring):
             raise TypeError('path must be a string')
-        if not isinstance(mode, int):
+        if not isinstance(mode, basestring):
             raise TypeError('mode must be an int')
         ret = self.libcephfs.ceph_mkdir(self.cluster, c_char_p(path), c_int(mode))
         if ret < 0:
             raise make_ex(ret, "error in mkdirs '%s'" % path)
 
-    def open(self, path, flags, mode):
+    def rmdir(self, path):
         self.require_state("mounted")
-        if not isinstance(path, str):
+        if not isinstance(path, basestring):
             raise TypeError('path must be a string')
+        ret = self.libcephfs.ceph_rmdir(self.cluster, c_char_p(path))
+        if ret < 0:
+            raise make_ex(ret, "error in rmdir '%s'" % path)
+
+    def open(self, path, flags, mode=0):
+        self.require_state("mounted")
+        if not isinstance(path, basestring):
+            raise TypeError('path must be a string')
+        if not isinstance(flags, basestring):
+            raise TypeError('flags must be a string')
         if not isinstance(mode, int):
             raise TypeError('mode must be an int')
-        if not isinstance(flags, int):
-            raise TypeError('flags must be an int')
-        ret = self.libcephfs.ceph_open(self.cluster, c_char_p(path), c_int(flags), c_int(mode))
+        cephfs_flags = 0
+        if flags == '':
+            cephfs_flags = os.O_RDONLY
+        else:
+            for c in flags:
+                if c == 'r':
+                    cephfs_flags |= os.O_RDONLY
+                elif c == 'w':
+                    cephfs_flags |= os.O_WRONLY | os.O_TRUNC | os.O_CREAT
+                elif c == '+':
+                    cephfs_flags |= os.O_RDWR
+                else:
+                    raise OperationNotSupported(
+                        "open flags doesn't support %s" % c)
+
+        ret = self.libcephfs.ceph_open(self.cluster, c_char_p(path),
+                                       c_int(cephfs_flags), c_int(mode))
         if ret < 0:
             raise make_ex(ret, "error in open '%s'" % path)
         return ret
@@ -321,48 +434,80 @@ class LibCephFS(object):
         if ret < 0:
             raise make_ex(ret, "error in close")
 
+    def read(self, fd, offset, l):
+        self.require_state("mounted")
+        if not isinstance(offset, int):
+            raise TypeError('path must be an int')
+        if not isinstance(l, int):
+            raise TypeError('path must be an int')
+
+        buf = create_string_buffer(l)
+        ret = self.libcephfs.ceph_read(self.cluster, c_int(fd),
+                                       buf, c_int64(l), c_int64(offset))
+        if ret < 0:
+            raise make_ex(ret, "error in close")
+        return buf.value
+
+    def write(self, fd, buf, offset):
+        self.require_state("mounted")
+        if not isinstance(buf, basestring):
+            raise TypeError('buf must be a string')
+        if not isinstance(offset, int):
+            raise TypeError('offset must be an int')
+
+        ret = self.libcephfs.ceph_write(self.cluster, c_int(fd),
+                                        c_char_p(buf), c_int64(len(buf)),
+                                        c_int64(offset))
+        if ret < 0:
+            raise make_ex(ret, "error in close")
+        return ret
+
+    def getxattr(self, path, name):
+        if not isinstance(path, basestring):
+            raise TypeError('path must be a string')
+        if not isinstance(name, basestring):
+            raise TypeError('name must be a string')
+
+        self.require_state("mounted")
+        l = 255
+        buf = create_string_buffer(l)
+        actual_l = self.libcephfs.ceph_getxattr(self.cluster, path, name, buf, c_int(l))
+        if actual_l > l:
+            buf = create_string_buffer(actual_)
+            self.libcephfs.ceph_getxattr(path, name, new_buf, actual_l)
+        return buf.value
+
     def setxattr(self, path, name, value, flags):
-        if not isinstance(path, str):
+        if not isinstance(path, basestring):
             raise TypeError('path must be a string')
-        if not isinstance(name, str):
+        if not isinstance(name, basestring):
             raise TypeError('name must be a string')
-        if not isinstance(value, str):
+        if not isinstance(value, basestring):
             raise TypeError('value must be a string')
         self.require_state("mounted")
-        ret = self.libcephfs.ceph_setxattr(
-                    self.cluster,
-                    c_char_p(path),
-                    c_char_p(name),
-                    c_char_p(value),
-                    c_size_t(len(value)),
-                    c_int(flags))
+        ret = self.libcephfs.ceph_setxattr(self.cluster, c_char_p(path),
+                                           c_char_p(name), c_char_p(value),
+                                           c_size_t(len(value)), c_int(flags))
         if ret < 0:
             raise make_ex(ret, "error in setxattr")
 
     def stat(self, path):
         self.require_state("mounted")
-        if not isinstance(path, str):
+        if not isinstance(path, basestring):
             raise TypeError('path must be a string')
         statbuf = cephfs_stat()
-        ret = self.libcephfs.ceph_stat(
-                self.cluster,
-                c_char_p(path),
-                byref(statbuf))
+        ret = self.libcephfs.ceph_stat(self.cluster, c_char_p(path),
+                                       byref(statbuf))
         if ret < 0:
             raise make_ex(ret, "error in stat: %s" % path)
-        return {'st_dev': statbuf.st_dev,
-                'st_ino': statbuf.st_ino,
-                'st_mode': statbuf.st_mode,
-                'st_nlink': statbuf.st_nlink,
-                'st_uid': statbuf.st_uid,
-                'st_gid': statbuf.st_gid,
-                'st_rdev': statbuf.st_rdev,
-                'st_size': statbuf.st_size,
-                'st_blksize': statbuf.st_blksize,
-                'st_blocks': statbuf.st_blocks,
-                'st_atime': statbuf.st_atime,
-                'st_mtime': statbuf.st_mtime,
-                'st_ctime': statbuf.st_ctime }
+        return StatResult(st_dev=statbuf.st_dev, st_ino=statbuf.st_ino,
+                          st_mode=statbuf.st_mode, st_nlink=statbuf.st_nlink,
+                          st_uid=statbuf.st_uid, st_gid=statbuf.st_gid,
+                          st_rdev=statbuf.st_rdev, st_size=statbuf.st_size,
+                          st_blksize=statbuf.st_blksize,
+                          st_blocks=statbuf.st_blocks,
+                          st_atime=statbuf.st_atime, st_mtime=statbuf.st_mtime,
+                          st_ctime=statbuf.st_ctime)
 
     def unlink(self, path):
         self.require_state("mounted")
@@ -372,6 +517,14 @@ class LibCephFS(object):
         if ret < 0:
             raise make_ex(ret, "error in unlink: %s" % path)
 
+    def rename(self, src, dst):
+        self.require_state("mounted")
+        if not isinstance(src, basestring) or not isinstance(dst, basestring):
+            raise TypeError('source and destination must be a string')
+        ret = self.libcephfs.ceph_rename(self.cluster, c_char_p(src), c_char_p(dst))
+        if ret < 0:
+            raise make_ex(ret, "error in rename '%s' to '%s'" % (src, dst))
+
     def mds_command(self, mds_spec, args, input_data):
         """
         :return 3-tuple of output status int, output status string, output data
@@ -384,15 +537,12 @@ class LibCephFS(object):
         outsp = pointer(pointer(c_char()))
         outslen = c_long()
 
-        ret = self.libcephfs.ceph_mds_command(
-                self.cluster,
-                c_char_p(mds_spec),
-                cmdarr,
-                len(args),
-                c_char_p(input_data), len(input_data),
-                outbufp, byref(outbuflen),
-                outsp, byref(outslen)
-        )
+        ret = self.libcephfs.ceph_mds_command(self.cluster, c_char_p(mds_spec),
+                                              cmdarr, len(args),
+                                              c_char_p(input_data),
+                                              len(input_data), outbufp,
+                                              byref(outbuflen), outsp,
+                                              byref(outslen))
 
         my_outbuf = outbufp.contents[:(outbuflen.value)]
         my_outs = outsp.contents[:(outslen.value)]
diff --git a/src/pybind/rados.py b/src/pybind/rados.py
index 4caa269..804a169 100644
--- a/src/pybind/rados.py
+++ b/src/pybind/rados.py
@@ -11,78 +11,97 @@ import ctypes
 import errno
 import threading
 import time
+
 from datetime import datetime
+from functools import wraps
+from itertools import chain
 
 ANONYMOUS_AUID = 0xffffffffffffffff
 ADMIN_AUID = 0
 LIBRADOS_ALL_NSPACES = '\001'
 
-LIBRADOS_OP_FLAG_FADVISE_RANDOM	    = 0x4
+LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4
 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8
-LIBRADOS_OP_FLAG_FADVISE_WILLNEED   = 0x10
-LIBRADOS_OP_FLAG_FADVISE_DONTNEED   = 0x20
-LIBRADOS_OP_FLAG_FADVISE_NOCACHE    = 0x40
+LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10
+LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20
+LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40
+
 
 class Error(Exception):
     """ `Error` class, derived from `Exception` """
     pass
 
+
 class InterruptedOrTimeoutError(Error):
     """ `InterruptedOrTimeoutError` class, derived from `Error` """
     pass
 
+
 class PermissionError(Error):
     """ `PermissionError` class, derived from `Error` """
     pass
 
+
 class ObjectNotFound(Error):
     """ `ObjectNotFound` class, derived from `Error` """
     pass
 
+
 class NoData(Error):
     """ `NoData` class, derived from `Error` """
     pass
 
+
 class ObjectExists(Error):
     """ `ObjectExists` class, derived from `Error` """
     pass
 
+
 class ObjectBusy(Error):
     """ `ObjectBusy` class, derived from `Error` """
     pass
 
+
 class IOError(Error):
     """ `IOError` class, derived from `Error` """
     pass
 
+
 class NoSpace(Error):
     """ `NoSpace` class, derived from `Error` """
     pass
 
+
 class IncompleteWriteError(Error):
     """ `IncompleteWriteError` class, derived from `Error` """
     pass
 
+
 class RadosStateError(Error):
     """ `RadosStateError` class, derived from `Error` """
     pass
 
+
 class IoctxStateError(Error):
     """ `IoctxStateError` class, derived from `Error` """
     pass
 
+
 class ObjectStateError(Error):
     """ `ObjectStateError` class, derived from `Error` """
     pass
 
+
 class LogicError(Error):
     """ `` class, derived from `Error` """
     pass
 
+
 class TimedOut(Error):
     """ `TimedOut` class, derived from `Error` """
     pass
 
+
 def make_ex(ret, msg):
     """
     Translate a librados return code into an exception.
@@ -111,6 +130,7 @@ def make_ex(ret, msg):
     else:
         return Error(msg + (": errno %s" % errno.errorcode[ret]))
 
+
 class rados_pool_stat_t(Structure):
     """ Usage information for a pool """
     _fields_ = [("num_bytes", c_uint64),
@@ -126,6 +146,7 @@ class rados_pool_stat_t(Structure):
                 ("num_wr", c_uint64),
                 ("num_wr_kb", c_uint64)]
 
+
 class rados_cluster_stat_t(Structure):
     """ Cluster-wide usage information """
     _fields_ = [("kb", c_uint64),
@@ -133,6 +154,7 @@ class rados_cluster_stat_t(Structure):
                 ("kb_avail", c_uint64),
                 ("num_objects", c_uint64)]
 
+
 class timeval(Structure):
     _fields_ = [("tv_sec", c_long), ("tv_usec", c_long)]
 
@@ -147,6 +169,7 @@ class Version(object):
     def __str__(self):
         return "%d.%d.%d" % (self.major, self.minor, self.extra)
 
+
 class RadosThread(threading.Thread):
     def __init__(self, target, args=None):
         self.args = args
@@ -159,6 +182,7 @@ class RadosThread(threading.Thread):
 # time in seconds between each call to t.join() for child thread
 POLL_TIME_INCR = 0.5
 
+
 def run_in_thread(target, args, timeout=0):
     interrupt = False
 
@@ -194,6 +218,49 @@ def run_in_thread(target, args, timeout=0):
         t.retval = -errno.EINTR
     return t.retval
 
+
+# helper to specify an optional argument, where in addition to `cls`, `None`
+# is also acceptable
+def opt(cls):
+    return (cls, None)
+
+
+# validate argument types of an instance method
+# kwargs is an un-ordered dict, so use args instead
+def requires(*types):
+    def is_type_of(v, t):
+        if t is None:
+            return v is None
+        else:
+            return isinstance(v, t)
+
+    def check_type(val, arg_name, arg_type):
+        if isinstance(arg_type, tuple):
+            if any(is_type_of(val, t) for t in arg_type):
+                return
+            type_names = ' or '.join('None' if t is None else t.__name__
+                                     for t in arg_type)
+            raise TypeError('%s must be %s' % (arg_name, type_names))
+        else:
+            if is_type_of(val, arg_type):
+                return
+            assert(arg_type is not None)
+            raise TypeError('%s must be %s' % (arg_name, arg_type.__name__))
+
+    def wrapper(f):
+        @wraps(f)
+        def validate_func(*args, **kwargs):
+            # ignore the `self` arg
+            pos_args = zip(args[1:], types)
+            named_args = ((kwargs[name], (name, spec)) for name, spec in types
+                          if name in kwargs)
+            for arg_val, (arg_name, arg_type) in chain(pos_args, named_args):
+                check_type(arg_val, arg_name, arg_type)
+            return f(*args, **kwargs)
+        return validate_func
+    return wrapper
+
+
 class Rados(object):
     """librados python wrapper"""
     def require_state(self, *args):
@@ -203,13 +270,15 @@ class Rados(object):
         :raises: RadosStateError
         """
         if self.state in args:
-           return
+            return
         raise RadosStateError("You cannot perform that operation on a \
 Rados object in state %s." % self.state)
 
+    @requires(('rados_id', opt(str)), ('name', opt(str)), ('clustername', opt(str)),
+              ('conffile', opt(str)))
     def __init__(self, rados_id=None, name=None, clustername=None,
                  conf_defaults=None, conffile=None, conf=None, flags=0):
-        library_path  = find_library('rados')
+        library_path = find_library('rados')
         # maybe find_library can not find it correctly on all platforms,
         # so fall back to librados.so.2 in such case.
         self.librados = CDLL(library_path if library_path is not None else 'librados.so.2')
@@ -219,25 +288,17 @@ Rados object in state %s." % self.state)
         self.conffile = conffile
         self.cluster = c_void_p()
         self.rados_id = rados_id
-        if rados_id is not None and not isinstance(rados_id, str):
-            raise TypeError('rados_id must be a string or None')
-        if conffile is not None and not isinstance(conffile, str):
-            raise TypeError('conffile must be a string or None')
-        if name is not None and not isinstance(name, str):
-            raise TypeError('name must be a string or None')
-        if clustername is not None and not isinstance(clustername, str):
-            raise TypeError('clustername must be a string or None')
         if rados_id and name:
             raise Error("Rados(): can't supply both rados_id and name")
         elif rados_id:
-            name = 'client.' +  rados_id
+            name = 'client.' + rados_id
         elif name is None:
             name = 'client.admin'
         if clustername is None:
             clustername = 'ceph'
         ret = run_in_thread(self.librados.rados_create2,
                             (byref(self.cluster), c_char_p(clustername),
-                            c_char_p(name), c_uint64(flags)))
+                             c_char_p(name), c_uint64(flags)))
 
         if ret != 0:
             raise Error("rados_initialize failed with error code: %d" % ret)
@@ -286,6 +347,8 @@ Rados object in state %s." % self.state)
                       (byref(major), byref(minor), byref(extra)))
         return Version(major.value, minor.value, extra.value)
 
+
+    @requires(('path', opt(str)))
     def conf_read_file(self, path=None):
         """
         Configure the cluster handle using a Ceph config file.
@@ -294,8 +357,6 @@ Rados object in state %s." % self.state)
         :type path: str
         """
         self.require_state("configuring", "connected")
-        if path is not None and not isinstance(path, str):
-            raise TypeError('path must be a string')
         ret = run_in_thread(self.librados.rados_conf_read_file,
                             (self.cluster, c_char_p(path)))
         if (ret != 0):
@@ -338,6 +399,7 @@ Rados object in state %s." % self.state)
         if (ret != 0):
             raise make_ex(ret, "error calling conf_parse_env")
 
+    @requires(('option', str))
     def conf_get(self, option):
         """
         Get the value of a configuration option
@@ -349,14 +411,12 @@ Rados object in state %s." % self.state)
         :raises: :class:`TypeError`
         """
         self.require_state("configuring", "connected")
-        if not isinstance(option, str):
-            raise TypeError('option must be a string')
         length = 20
         while True:
             ret_buf = create_string_buffer(length)
             ret = run_in_thread(self.librados.rados_conf_get,
                                 (self.cluster, c_char_p(option), ret_buf,
-                                c_size_t(length)))
+                                 c_size_t(length)))
             if (ret == 0):
                 return ret_buf.value
             elif (ret == -errno.ENAMETOOLONG):
@@ -366,6 +426,7 @@ Rados object in state %s." % self.state)
             else:
                 raise make_ex(ret, "error calling conf_get")
 
+    @requires(('option', str), ('val', str))
     def conf_set(self, option, val):
         """
         Set the value of a configuration option
@@ -378,45 +439,40 @@ Rados object in state %s." % self.state)
         :raises: :class:`TypeError`, :class:`ObjectNotFound`
         """
         self.require_state("configuring", "connected")
-        if not isinstance(option, str):
-            raise TypeError('option must be a string')
-        if not isinstance(val, str):
-            raise TypeError('val must be a string')
         ret = run_in_thread(self.librados.rados_conf_set,
                             (self.cluster, c_char_p(option), c_char_p(val)))
         if (ret != 0):
             raise make_ex(ret, "error calling conf_set")
 
-
     def ping_monitor(self, mon_id):
-      """
-      Ping a monitor to assess liveness
+        """
+        Ping a monitor to assess liveness
 
-      May be used as a simply way to assess liveness, or to obtain
-      information about the monitor in a simple way even in the
-      absence of quorum.
+        May be used as a simply way to assess liveness, or to obtain
+        information about the monitor in a simple way even in the
+        absence of quorum.
 
-      :param mon_id: the ID portion of the monitor's name (i.e., mon.<ID>)
-      :type mon_id: str
-      :returns: the string reply from the monitor
-      """
+        :param mon_id: the ID portion of the monitor's name (i.e., mon.<ID>)
+        :type mon_id: str
+        :returns: the string reply from the monitor
+        """
 
-      self.require_state("configuring", "connected")
+        self.require_state("configuring", "connected")
 
-      outstrp = pointer(pointer(c_char()))
-      outstrlen = c_long()
+        outstrp = pointer(pointer(c_char()))
+        outstrlen = c_long()
 
-      ret = run_in_thread(self.librados.rados_ping_monitor,
-                          (self.cluster, c_char_p(mon_id),
-                           outstrp, byref(outstrlen)))
+        ret = run_in_thread(self.librados.rados_ping_monitor,
+                            (self.cluster, c_char_p(mon_id),
+                             outstrp, byref(outstrlen)))
 
-      my_outstr = outstrp.contents[:(outstrlen.value)]
-      if outstrlen.value:
-        run_in_thread(self.librados.rados_buffer_free, (outstrp.contents,))
+        my_outstr = outstrp.contents[:(outstrlen.value)]
+        if outstrlen.value:
+            run_in_thread(self.librados.rados_buffer_free, (outstrp.contents,))
 
-      if ret != 0:
-        raise make_ex(ret, "error calling ping_monitor")
-      return my_outstr
+        if ret != 0:
+            raise make_ex(ret, "error calling ping_monitor")
+        return my_outstr
 
     def connect(self, timeout=0):
         """
@@ -459,6 +515,7 @@ Rados object in state %s." % self.state)
                 'kb_avail': stats.kb_avail,
                 'num_objects': stats.num_objects}
 
+    @requires(('pool_name', str))
     def pool_exists(self, pool_name):
         """
         Checks if a given pool exists.
@@ -470,8 +527,6 @@ Rados object in state %s." % self.state)
         :returns: bool - whether the pool exists, false otherwise.
         """
         self.require_state("connected")
-        if not isinstance(pool_name, str):
-            raise TypeError('pool_name must be a string')
         ret = run_in_thread(self.librados.rados_pool_lookup,
                             (self.cluster, c_char_p(pool_name)))
         if (ret >= 0):
@@ -481,6 +536,7 @@ Rados object in state %s." % self.state)
         else:
             raise make_ex(ret, "error looking up pool '%s'" % pool_name)
 
+    @requires(('pool_name', str))
     def pool_lookup(self, pool_name):
         """
         Returns a pool's ID based on its name.
@@ -492,8 +548,6 @@ Rados object in state %s." % self.state)
         :returns: int - pool ID, or None if it doesn't exist
         """
         self.require_state("connected")
-        if not isinstance(pool_name, str):
-            raise TypeError('pool_name must be a string')
         ret = run_in_thread(self.librados.rados_pool_lookup,
                             (self.cluster, c_char_p(pool_name)))
         if (ret >= 0):
@@ -503,6 +557,7 @@ Rados object in state %s." % self.state)
         else:
             raise make_ex(ret, "error looking up pool '%s'" % pool_name)
 
+    @requires(('pool_id', int))
     def pool_reverse_lookup(self, pool_id):
         """
         Returns a pool's name based on its ID.
@@ -514,8 +569,6 @@ Rados object in state %s." % self.state)
         :returns: string - pool name, or None if it doesn't exist
         """
         self.require_state("connected")
-        if not isinstance(pool_id, int):
-            raise TypeError('pool_id must be an integer')
         size = c_size_t(512)
         while True:
             c_name = create_string_buffer(size.value)
@@ -531,6 +584,7 @@ Rados object in state %s." % self.state)
                 return c_name.value
                 break
 
+    @requires(('pool_name', str), ('auid', opt(int)), ('crush_rule', opt(int)))
     def create_pool(self, pool_name, auid=None, crush_rule=None):
         """
         Create a pool:
@@ -544,36 +598,33 @@ Rados object in state %s." % self.state)
         :param auid: the id of the owner of the new pool
         :type auid: int
         :param crush_rule: rule to use for placement in the new pool
-        :type crush_rule: str
+        :type crush_rule: int
 
         :raises: :class:`TypeError`, :class:`Error`
         """
         self.require_state("connected")
-        if not isinstance(pool_name, str):
-            raise TypeError('pool_name must be a string')
-        if crush_rule is not None and not isinstance(crush_rule, str):
-            raise TypeError('cruse_rule must be a string')
-        if (auid == None):
-            if (crush_rule == None):
+        if auid is None:
+            if crush_rule is None:
                 ret = run_in_thread(self.librados.rados_pool_create,
                                     (self.cluster, c_char_p(pool_name)))
             else:
-                ret = run_in_thread(self.librados.\
+                ret = run_in_thread(self.librados.
                                     rados_pool_create_with_crush_rule,
                                     (self.cluster, c_char_p(pool_name),
-                                    c_ubyte(crush_rule)))
+                                     c_ubyte(crush_rule)))
 
-        elif (crush_rule == None):
+        elif crush_rule is None:
             ret = run_in_thread(self.librados.rados_pool_create_with_auid,
                                 (self.cluster, c_char_p(pool_name),
-                                c_uint64(auid)))
+                                 c_uint64(auid)))
         else:
             ret = run_in_thread(self.librados.rados_pool_create_with_all,
                                 (self.cluster, c_char_p(pool_name),
-                                c_uint64(auid), c_ubyte(crush_rule)))
+                                 c_uint64(auid), c_ubyte(crush_rule)))
         if ret < 0:
             raise make_ex(ret, "error creating pool '%s'" % pool_name)
 
+    @requires(('pool_id', int))
     def get_pool_base_tier(self, pool_id):
         """
         Get base pool
@@ -581,8 +632,6 @@ Rados object in state %s." % self.state)
         :returns: base pool, or pool_id if tiering is not configured for the pool
         """
         self.require_state("connected")
-        if not isinstance(pool_id, int):
-            raise TypeError('pool_id must be an int')
         base_tier = c_int64(0)
         ret = run_in_thread(self.librados.rados_pool_get_base_tier,
                             (self.cluster, c_int64(pool_id), byref(base_tier)))
@@ -590,6 +639,7 @@ Rados object in state %s." % self.state)
             raise make_ex(ret, "get_pool_base_tier(%d)" % pool_id)
         return base_tier.value
 
+    @requires(('pool_name', str))
     def delete_pool(self, pool_name):
         """
         Delete a pool and all data inside it.
@@ -603,8 +653,6 @@ Rados object in state %s." % self.state)
         :raises: :class:`TypeError`, :class:`Error`
         """
         self.require_state("connected")
-        if not isinstance(pool_name, str):
-            raise TypeError('pool_name must be a string')
         ret = run_in_thread(self.librados.rados_pool_delete,
                             (self.cluster, c_char_p(pool_name)))
         if ret < 0:
@@ -644,6 +692,7 @@ Rados object in state %s." % self.state)
             raise make_ex(ret, "error getting cluster fsid")
         return fsid.value
 
+    @requires(('ioctx_name', str))
     def open_ioctx(self, ioctx_name):
         """
         Create an io context
@@ -658,8 +707,6 @@ Rados object in state %s." % self.state)
         :returns: Ioctx - Rados Ioctx object
         """
         self.require_state("connected")
-        if not isinstance(ioctx_name, str):
-            raise TypeError('the name of the pool must be a string')
         ioctx = c_void_p()
         ret = run_in_thread(self.librados.rados_ioctx_create,
                             (self.cluster, c_char_p(ioctx_name), byref(ioctx)))
@@ -717,8 +764,8 @@ Rados object in state %s." % self.state)
         cmdarr = (c_char_p * len(cmd))(*cmd)
         ret = run_in_thread(self.librados.rados_osd_command,
                             (self.cluster, osdid, cmdarr, len(cmd),
-                            c_char_p(inbuf), len(inbuf),
-                            outbufp, byref(outbuflen), outsp, byref(outslen)),
+                             c_char_p(inbuf), len(inbuf),
+                             outbufp, byref(outbuflen), outsp, byref(outslen)),
                             timeout)
 
         # copy returned memory (ctypes makes a copy, not a reference)
@@ -746,8 +793,8 @@ Rados object in state %s." % self.state)
         cmdarr = (c_char_p * len(cmd))(*cmd)
         ret = run_in_thread(self.librados.rados_pg_command,
                             (self.cluster, c_char_p(pgid), cmdarr, len(cmd),
-                            c_char_p(inbuf), len(inbuf),
-                            outbufp, byref(outbuflen), outsp, byref(outslen)),
+                             c_char_p(inbuf), len(inbuf),
+                             outbufp, byref(outbuflen), outsp, byref(outslen)),
                             timeout)
 
         # copy returned memory (ctypes makes a copy, not a reference)
@@ -766,7 +813,7 @@ Rados object in state %s." % self.state)
         self.require_state("connected")
         return run_in_thread(self.librados.rados_wait_for_latest_osdmap, (self.cluster,))
 
-    def blacklist_add(self, client_address, expire_seconds = 0):
+    def blacklist_add(self, client_address, expire_seconds=0):
         """
         Blacklist a client from the OSDs
 
@@ -784,6 +831,40 @@ Rados object in state %s." % self.state)
         if ret < 0:
             raise make_ex(ret, "error blacklisting client '%s'" % client_address)
 
+
+class OmapIterator(object):
+    """Omap iterator"""
+    def __init__(self, ioctx, ctx):
+        self.ioctx = ioctx
+        self.ctx = ctx
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """
+        Get the next key-value pair in the object
+        :returns: next rados.OmapItem
+        """
+        key_ = c_char_p(0)
+        val_ = c_char_p(0)
+        len_ = c_int(0)
+        ret = run_in_thread(self.ioctx.librados.rados_omap_get_next,
+                      (self.ctx, byref(key_), byref(val_), byref(len_)))
+        if (ret != 0):
+            raise make_ex(ret, "error iterating over the omap")
+        if key_.value is None:
+            raise StopIteration()
+        key = ctypes.string_at(key_)
+        val = None
+        if val_.value is not None:
+            val = ctypes.string_at(val_, len_)
+        return (key, val)
+
+    def __del__(self):
+        run_in_thread(self.ioctx.librados.rados_omap_get_end, (self.ctx,))
+
+
 class ObjectIterator(object):
     """rados.Ioctx Object iterator"""
     def __init__(self, ioctx):
@@ -792,8 +873,8 @@ class ObjectIterator(object):
         ret = run_in_thread(self.ioctx.librados.rados_nobjects_list_open,
                             (self.ioctx.io, byref(self.ctx)))
         if ret < 0:
-            raise make_ex(ret, "error iterating over the objects in ioctx '%s'" \
-                % self.ioctx.name)
+            raise make_ex(ret, "error iterating over the objects in ioctx '%s'"
+                          % self.ioctx.name)
 
     def __iter__(self):
         return self
@@ -817,6 +898,7 @@ class ObjectIterator(object):
     def __del__(self):
         run_in_thread(self.ioctx.librados.rados_nobjects_list_close, (self.ctx,))
 
+
 class XattrIterator(object):
     """Extended attribute iterator"""
     def __init__(self, ioctx, it, oid):
@@ -842,7 +924,7 @@ class XattrIterator(object):
         if (ret != 0):
             raise make_ex(ret, "error iterating over the extended attributes \
 in '%s'" % self.oid)
-        if name_.value == None:
+        if name_.value is None:
             raise StopIteration()
         name = ctypes.string_at(name_)
         val = ctypes.string_at(val_, len_)
@@ -851,6 +933,7 @@ in '%s'" % self.oid)
     def __del__(self):
         run_in_thread(self.ioctx.librados.rados_getxattrs_end, (self.it,))
 
+
 class SnapIterator(object):
     """Snapshot iterator"""
     def __init__(self, ioctx):
@@ -900,6 +983,7 @@ ioctx '%s'" % self.ioctx.name)
         self.cur_snap = self.cur_snap + 1
         return snap
 
+
 class Snap(object):
     """Snapshot object"""
     def __init__(self, ioctx, name, snap_id):
@@ -925,6 +1009,7 @@ class Snap(object):
             raise make_ex(ret, "rados_ioctx_snap_get_stamp error")
         return datetime.fromtimestamp(snap_time.value)
 
+
 class Completion(object):
     """completion object"""
     def __init__(self, ioctx, rados_comp, oncomplete, onsafe,
@@ -1018,8 +1103,42 @@ class Completion(object):
         run_in_thread(self.ioctx.librados.rados_aio_release,
                       (self.rados_comp,))
 
+
+class WriteOpCtx(object):
+    """write operation context manager"""
+    def __init__(self, ioctx):
+        self.ioctx = ioctx
+
+    def __enter__(self):
+        self.ioctx.librados.rados_create_write_op.restype = c_void_p
+        ret = run_in_thread(self.ioctx.librados.rados_create_write_op, (None,))
+        self.write_op = ret
+        return ret
+
+    def __exit__(self, type, msg, traceback):
+        self.ioctx.librados.rados_release_write_op.argtypes = [c_void_p]
+        run_in_thread(self.ioctx.librados.rados_release_write_op, (c_void_p(self.write_op),))
+
+
+class ReadOpCtx(object):
+    """read operation context manager"""
+    def __init__(self, ioctx):
+        self.ioctx = ioctx
+
+    def __enter__(self):
+        self.ioctx.librados.rados_create_read_op.restype = c_void_p
+        ret = run_in_thread(self.ioctx.librados.rados_create_read_op, (None,))
+        self.read_op = ret
+        return ret
+
+    def __exit__(self, type, msg, traceback):
+        self.ioctx.librados.rados_release_read_op.argtypes = [c_void_p]
+        run_in_thread(self.ioctx.librados.rados_release_read_op, (c_void_p(self.read_op),))
+
+
 RADOS_CB = CFUNCTYPE(c_int, c_void_p, c_void_p)
 
+
 class Ioctx(object):
     """rados.Ioctx object"""
     def __init__(self, name, librados, io):
@@ -1088,7 +1207,7 @@ class Ioctx(object):
             safe_cb = RADOS_CB(self.__aio_safe_cb)
         ret = run_in_thread(self.librados.rados_aio_create_completion,
                             (c_void_p(0), complete_cb, safe_cb,
-                            byref(completion)))
+                             byref(completion)))
         if ret < 0:
             raise make_ex(ret, "error getting a completion")
         with self.lock:
@@ -1126,8 +1245,8 @@ class Ioctx(object):
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_write,
                             (self.io, c_char_p(object_name),
-                            completion.rados_comp, c_char_p(to_write),
-                            c_size_t(len(to_write)), c_uint64(offset)))
+                             completion.rados_comp, c_char_p(to_write),
+                             c_size_t(len(to_write)), c_uint64(offset)))
         if ret < 0:
             raise make_ex(ret, "error writing object %s" % object_name)
         return completion
@@ -1158,8 +1277,8 @@ class Ioctx(object):
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_write_full,
                             (self.io, c_char_p(object_name),
-                            completion.rados_comp, c_char_p(to_write),
-                            c_size_t(len(to_write))))
+                             completion.rados_comp, c_char_p(to_write),
+                             c_size_t(len(to_write))))
         if ret < 0:
             raise make_ex(ret, "error writing object %s" % object_name)
         return completion
@@ -1189,8 +1308,8 @@ class Ioctx(object):
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_append,
                             (self.io, c_char_p(object_name),
-                            completion.rados_comp, c_char_p(to_append),
-                            c_size_t(len(to_append))))
+                             completion.rados_comp, c_char_p(to_append),
+                             c_size_t(len(to_append))))
         if ret < 0:
             raise make_ex(ret, "error appending to object %s" % object_name)
         return completion
@@ -1227,6 +1346,7 @@ class Ioctx(object):
         :returns: completion object
         """
         buf = create_string_buffer(length)
+
         def oncomplete_(completion_v):
             return_value = completion_v.get_return_value()
             return oncomplete(completion_v,
@@ -1235,8 +1355,8 @@ class Ioctx(object):
         completion = self.__get_completion(oncomplete_, None)
         ret = run_in_thread(self.librados.rados_aio_read,
                             (self.io, c_char_p(object_name),
-                            completion.rados_comp, buf, c_size_t(length),
-                            c_uint64(offset)))
+                             completion.rados_comp, buf, c_size_t(length),
+                             c_uint64(offset)))
         if ret < 0:
             raise make_ex(ret, "error reading %s" % object_name)
         return completion
@@ -1260,7 +1380,7 @@ class Ioctx(object):
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_remove,
                             (self.io, c_char_p(object_name),
-                            completion.rados_comp))
+                             completion.rados_comp))
         if ret < 0:
             raise make_ex(ret, "error removing %s" % object_name)
         return completion
@@ -1287,9 +1407,10 @@ class Ioctx(object):
         ret = run_in_thread(self.librados.rados_ioctx_pool_set_auid,
                             (self.io, ctypes.c_uint64(auid)))
         if ret < 0:
-            raise make_ex(ret, "error changing auid of '%s' to %d" %\
-                (self.name, auid))
+            raise make_ex(ret, "error changing auid of '%s' to %d"
+                          % (self.name, auid))
 
+    @requires(('loc_key', str))
     def set_locator_key(self, loc_key):
         """
         Set the key for mapping objects to pgs within an io context.
@@ -1306,10 +1427,8 @@ class Ioctx(object):
         :raises: :class:`TypeError`
         """
         self.require_ioctx_open()
-        if not isinstance(loc_key, str):
-            raise TypeError('loc_key must be a string')
         run_in_thread(self.librados.rados_ioctx_locator_set_key,
-                     (self.io, c_char_p(loc_key)))
+                      (self.io, c_char_p(loc_key)))
         self.locator_key = loc_key
 
     def get_locator_key(self):
@@ -1320,6 +1439,8 @@ class Ioctx(object):
         """
         return self.locator_key
 
+
+    @requires(('nspace', str))
     def set_namespace(self, nspace):
         """
         Set the namespace for objects within an io context.
@@ -1337,10 +1458,8 @@ class Ioctx(object):
         self.require_ioctx_open()
         if nspace is None:
             nspace = ""
-        if not isinstance(nspace, str):
-            raise TypeError('namespace must be a string')
         run_in_thread(self.librados.rados_ioctx_set_namespace,
-                     (self.io, c_char_p(nspace)))
+                      (self.io, c_char_p(nspace)))
         self.nspace = nspace
 
     def get_namespace(self):
@@ -1365,6 +1484,8 @@ class Ioctx(object):
             run_in_thread(self.librados.rados_ioctx_destroy, (self.io,))
             self.state = "closed"
 
+
+    @requires(('key', str), ('data', str))
     def write(self, key, data, offset=0):
         """
         Write data to an object synchronously
@@ -1381,23 +1502,20 @@ class Ioctx(object):
         :returns: int - 0 on success
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(data, str):
-            raise TypeError('data must be a string')
         length = len(data)
         ret = run_in_thread(self.librados.rados_write,
                             (self.io, c_char_p(key), c_char_p(data),
-                            c_size_t(length), c_uint64(offset)))
+                             c_size_t(length), c_uint64(offset)))
         if ret == 0:
             return ret
         elif ret < 0:
-            raise make_ex(ret, "Ioctx.write(%s): failed to write %s" % \
-                (self.name, key))
+            raise make_ex(ret, "Ioctx.write(%s): failed to write %s"
+                          % (self.name, key))
         else:
             raise LogicError("Ioctx.write(%s): rados_write \
 returned %d, but should return zero on success." % (self.name, ret))
 
+    @requires(('key', str), ('data', str))
     def write_full(self, key, data):
         """
         Write an entire object synchronously.
@@ -1415,23 +1533,20 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: int - 0 on success
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(data, str):
-            raise TypeError('data must be a string')
         length = len(data)
         ret = run_in_thread(self.librados.rados_write_full,
                             (self.io, c_char_p(key), c_char_p(data),
-                            c_size_t(length)))
+                             c_size_t(length)))
         if ret == 0:
             return ret
         elif ret < 0:
-            raise make_ex(ret, "Ioctx.write_full(%s): failed to write %s" % \
-                (self.name, key))
+            raise make_ex(ret, "Ioctx.write_full(%s): failed to write %s"
+                          % (self.name, key))
         else:
             raise LogicError("Ioctx.write_full(%s): rados_write_full \
 returned %d, but should return zero on success." % (self.name, ret))
 
+    @requires(('key', str), ('data', str))
     def append(self, key, data):
         """
         Append data to an object synchronously
@@ -1446,23 +1561,20 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: int - 0 on success
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(data, str):
-            raise TypeError('data must be a string')
         length = len(data)
         ret = run_in_thread(self.librados.rados_append,
                             (self.io, c_char_p(key), c_char_p(data),
-                            c_size_t(length)))
+                             c_size_t(length)))
         if ret == 0:
             return ret
         elif ret < 0:
-            raise make_ex(ret, "Ioctx.append(%s): failed to append %s" % \
-                (self.name, key))
+            raise make_ex(ret, "Ioctx.append(%s): failed to append %s"
+                          % (self.name, key))
         else:
             raise LogicError("Ioctx.append(%s): rados_append \
 returned %d, but should return zero on success." % (self.name, ret))
 
+    @requires(('key', str))
     def read(self, key, length=8192, offset=0):
         """
         Read data from an object synchronously
@@ -1479,12 +1591,10 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: str - data read from object
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
         ret_buf = create_string_buffer(length)
         ret = run_in_thread(self.librados.rados_read,
                             (self.io, c_char_p(key), ret_buf, c_size_t(length),
-                            c_uint64(offset)))
+                             c_uint64(offset)))
         if ret < 0:
             raise make_ex(ret, "Ioctx.read(%s): failed to read %s" % (self.name, key))
         return ctypes.string_at(ret_buf, ret)
@@ -1537,8 +1647,9 @@ returned %d, but should return zero on success." % (self.name, ret))
                 "num_rd": stats.num_rd,
                 "num_rd_kb": stats.num_rd_kb,
                 "num_wr": stats.num_wr,
-                "num_wr_kb": stats.num_wr_kb }
+                "num_wr_kb": stats.num_wr_kb}
 
+    @requires(('key', str))
     def remove_object(self, key):
         """
         Delete an object
@@ -1553,14 +1664,13 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: bool - True on success
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
         ret = run_in_thread(self.librados.rados_remove,
                             (self.io, c_char_p(key)))
         if ret < 0:
             raise make_ex(ret, "Failed to remove '%s'" % key)
         return True
 
+    @requires(('key', str))
     def trunc(self, key, size):
         """
         Resize an object
@@ -1579,14 +1689,13 @@ returned %d, but should return zero on success." % (self.name, ret))
         """
 
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
         ret = run_in_thread(self.librados.rados_trunc,
                             (self.io, c_char_p(key), c_uint64(size)))
         if ret < 0:
             raise make_ex(ret, "Ioctx.trunc(%s): failed to truncate %s" % (self.name, key))
         return ret
 
+    @requires(('key', str))
     def stat(self, key):
         """
         Get object stats (size/mtime)
@@ -1599,18 +1708,17 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: (size,timestamp)
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
         psize = c_uint64()
         pmtime = c_uint64()
 
         ret = run_in_thread(self.librados.rados_stat,
                             (self.io, c_char_p(key), pointer(psize),
-                            pointer(pmtime)))
+                             pointer(pmtime)))
         if ret < 0:
             raise make_ex(ret, "Failed to stat %r" % key)
         return psize.value, time.localtime(pmtime.value)
 
+    @requires(('key', str), ('xattr_name', str))
     def get_xattr(self, key, xattr_name):
         """
         Get the value of an extended attribute on an object.
@@ -1625,8 +1733,6 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: str - value of the xattr
         """
         self.require_ioctx_open()
-        if not isinstance(xattr_name, str):
-            raise TypeError('xattr_name must be a string')
         ret_length = 4096
         while ret_length < 4096 * 1024 * 1024:
             ret_buf = create_string_buffer(ret_length)
@@ -1641,6 +1747,7 @@ returned %d, but should return zero on success." % (self.name, ret))
                 break
         return ctypes.string_at(ret_buf, ret)
 
+    @requires(('oid', str))
     def get_xattrs(self, oid):
         """
         Start iterating over xattrs on an object.
@@ -1653,8 +1760,6 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: XattrIterator
         """
         self.require_ioctx_open()
-        if not isinstance(oid, str):
-            raise TypeError('oid must be a string')
         it = c_void_p(0)
         ret = run_in_thread(self.librados.rados_getxattrs,
                             (self.io, oid, byref(it)))
@@ -1662,6 +1767,7 @@ returned %d, but should return zero on success." % (self.name, ret))
             raise make_ex(ret, "Failed to get rados xattrs for object %r" % oid)
         return XattrIterator(self, it, oid)
 
+    @requires(('key', str), ('xattr_name', str), ('xattr_value', str))
     def set_xattr(self, key, xattr_name, xattr_value):
         """
         Set an extended attribute on an object.
@@ -1678,19 +1784,14 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: bool - True on success, otherwise raise an error
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(xattr_name, str):
-            raise TypeError('xattr_name must be a string')
-        if not isinstance(xattr_value, str):
-            raise TypeError('xattr_value must be a string')
         ret = run_in_thread(self.librados.rados_setxattr,
                             (self.io, c_char_p(key), c_char_p(xattr_name),
-                            c_char_p(xattr_value), c_size_t(len(xattr_value))))
+                             c_char_p(xattr_value), c_size_t(len(xattr_value))))
         if ret < 0:
             raise make_ex(ret, "Failed to set xattr %r" % xattr_name)
         return True
 
+    @requires(('key', str), ('xattr_name', str))
     def rm_xattr(self, key, xattr_name):
         """
         Removes an extended attribute on from an object.
@@ -1705,15 +1806,11 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: bool - True on success, otherwise raise an error
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(xattr_name, str):
-            raise TypeError('xattr_name must be a string')
         ret = run_in_thread(self.librados.rados_rmxattr,
                             (self.io, c_char_p(key), c_char_p(xattr_name)))
         if ret < 0:
             raise make_ex(ret, "Failed to delete key %r xattr %r" %
-                (key, xattr_name))
+                          (key, xattr_name))
         return True
 
     def list_objects(self):
@@ -1734,6 +1831,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         return SnapIterator(self)
 
+    @requires(('snap_name', str))
     def create_snap(self, snap_name):
         """
         Create a pool-wide snapshot
@@ -1745,13 +1843,12 @@ returned %d, but should return zero on success." % (self.name, ret))
         :raises: :class:`Error`
         """
         self.require_ioctx_open()
-        if not isinstance(snap_name, str):
-            raise TypeError('snap_name must be a string')
         ret = run_in_thread(self.librados.rados_ioctx_snap_create,
                             (self.io, c_char_p(snap_name)))
         if (ret != 0):
             raise make_ex(ret, "Failed to create snap %s" % snap_name)
 
+    @requires(('snap_name', str))
     def remove_snap(self, snap_name):
         """
         Removes a pool-wide snapshot
@@ -1763,13 +1860,12 @@ returned %d, but should return zero on success." % (self.name, ret))
         :raises: :class:`Error`
         """
         self.require_ioctx_open()
-        if not isinstance(snap_name, str):
-            raise TypeError('snap_name must be a string')
         ret = run_in_thread(self.librados.rados_ioctx_snap_remove,
                             (self.io, c_char_p(snap_name)))
         if (ret != 0):
             raise make_ex(ret, "Failed to remove snap %s" % snap_name)
 
+    @requires(('snap_name', str))
     def lookup_snap(self, snap_name):
         """
         Get the id of a pool snapshot
@@ -1782,11 +1878,9 @@ returned %d, but should return zero on success." % (self.name, ret))
         :returns: Snap - on success
         """
         self.require_ioctx_open()
-        if not isinstance(snap_name, str):
-            raise TypeError('snap_name must be a string')
         snap_id = c_uint64()
         ret = run_in_thread(self.librados.rados_ioctx_snap_lookup,
-                           (self.io, c_char_p(snap_name), byref(snap_id)))
+                            (self.io, c_char_p(snap_name), byref(snap_id)))
         if (ret != 0):
             raise make_ex(ret, "Failed to lookup snap %s" % snap_name)
         return Snap(self, snap_name, snap_id)
@@ -1803,6 +1897,190 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         return run_in_thread(self.librados.rados_get_last_version, (self.io,))
 
+    def create_write_op(self):
+        """
+        create write operation object.
+        need call release_write_op after use
+        """
+        self.librados.rados_create_write_op.restype = c_void_p
+        return run_in_thread(self.librados.rados_create_write_op, (None,))
+
+    def create_read_op(self):
+        """
+        create read operation object.
+        need call release_read_op after use
+        """
+        self.librados.rados_create_read_op.restype = c_void_p
+        return run_in_thread(self.librados.rados_create_read_op, (None,))
+
+    def release_write_op(self, write_op):
+        """
+        release memory alloc by create_write_op
+        """
+        self.librados.rados_release_write_op.argtypes = [c_void_p]
+        run_in_thread(self.librados.rados_release_write_op, (c_void_p(write_op),))
+
+    def release_read_op(self, read_op):
+        """
+        release memory alloc by create_read_op
+        :para read_op: read_op object
+        :type: int
+        """
+        self.librados.rados_release_read_op.argtypes = [c_void_p]
+        run_in_thread(self.librados.rados_release_read_op, (c_void_p(read_op),))
+
+    @requires(('write_op', int), ('keys', tuple), ('values', tuple))
+    def set_omap(self, write_op, keys, values):
+        """
+        set keys values to write_op
+        :para write_op: write_operation object
+        :type write_op: int
+        :para keys: a tuple of keys
+        :type keys: tuple
+        :para values: a tuple of values
+        :type values: tuple
+        """
+        if len(keys) != len(values):
+            raise Error("Rados(): keys and values must have the same number of items")
+        key_num = len(keys)
+        key_array_type = c_char_p*key_num
+        key_array = key_array_type()
+        key_array[:] = keys
+
+        value_array_type = c_char_p*key_num
+        value_array = value_array_type()
+        value_array[:] = values
+
+        lens_array_type = c_size_t*key_num
+        lens_array = lens_array_type()
+        for index, value in enumerate(values):
+            lens_array[index] = c_size_t(len(value))
+
+        run_in_thread(self.librados.rados_write_op_omap_set,
+                      (c_void_p(write_op), byref(key_array), byref(value_array),
+                       byref(lens_array), c_int(key_num),))
+
+    @requires(('write_op', int), ('oid', str), ('mtime', opt(int)), ('flags', opt(int)))
+    def operate_write_op(self, write_op, oid, mtime=0, flags=0):
+        """
+        excute the real write operation
+        :para write_op: write operation object
+        :type write_op: int
+        :para oid: object name
+        :type oid: str
+        :para mtime: the time to set the mtime to, 0 for the current time
+        :type mtime: int
+        :para flags: flags to apply to the entire operation
+        :type flags: int
+        """
+        run_in_thread(self.librados.rados_write_op_operate,
+                      (c_void_p(write_op), self.io, c_char_p(oid),
+                       c_long(mtime), c_int(flags),))
+
+    @requires(('read_op', int), ('oid', str), ('flag', opt(int)))
+    def operate_read_op(self, read_op, oid, flag=0):
+        """
+        excute the real read operation
+        :para read_op: read operation object
+        :type read_op: int
+        :para oid: object name
+        :type oid: str
+        :para flag: flags to apply to the entire operation
+        :type flag: int
+        """
+        run_in_thread(self.librados.rados_read_op_operate,
+                      (c_void_p(read_op), self.io, c_char_p(oid), c_int(flag),))
+
+    @requires(('read_op', int), ('start_after', str), ('filter_prefix', str), ('max_return', int))
+    def get_omap_vals(self, read_op, start_after, filter_prefix, max_return):
+        """
+        get the omap values
+        :para read_op: read operation object
+        :type read_op: int
+        :para start_after: list keys starting after start_after
+        :type start_after: str
+        :para filter_prefix: list only keys beginning with filter_prefix
+        :type filter_prefix: str
+        :para max_return: list no more than max_return key/value pairs
+        :type max_return: int
+        :returns: an iterator over the the requested omap values, return value from this action
+        """
+        prval = c_int()
+        iter_addr = c_void_p()
+        run_in_thread(self.librados.rados_read_op_omap_get_vals,
+                      (c_void_p(read_op), c_char_p(start_after),
+                       c_char_p(filter_prefix), c_int(max_return),
+                       byref(iter_addr), pointer(prval)))
+        return OmapIterator(self, iter_addr), prval.value
+
+    @requires(('read_op', int), ('start_after', str), ('max_return', int))
+    def get_omap_keys(self, read_op, start_after, max_return):
+        """
+        get the omap keys
+        :para read_op: read operation object
+        :type read_op: int
+        :para start_after: list keys starting after start_after
+        :type start_after: str
+        :para max_return: list no more than max_return key/value pairs
+        :type max_return: int
+        :returns: an iterator over the the requested omap values, return value from this action
+        """
+        prval = c_int()
+        iter_addr = c_void_p()
+        run_in_thread(self.librados.rados_read_op_omap_get_keys,
+                      (c_void_p(read_op), c_char_p(start_after),
+                       c_int(max_return), byref(iter_addr), pointer(prval)))
+        return OmapIterator(self, iter_addr), prval.value
+
+    @requires(('read_op', int), ('keys', tuple))
+    def get_omap_vals_by_keys(self, read_op, keys):
+        """
+        get the omap values by keys
+        :para read_op: read operation object
+        :type read_op: int
+        :para keys: input key tuple
+        :type keys: tuple
+        :returns: an iterator over the the requested omap values, return value from this action
+        """
+        prval = c_int()
+        iter_addr = c_void_p()
+        key_num = len(keys)
+        key_array_type = c_char_p*key_num
+        key_array = key_array_type()
+        key_array[:] = keys
+        run_in_thread(self.librados.rados_read_op_omap_get_vals_by_keys,
+                      (c_void_p(read_op), byref(key_array), c_int(key_num),
+                       byref(iter_addr), pointer(prval)))
+        return OmapIterator(self, iter_addr), prval.value
+
+    @requires(('write_op', int), ('keys', tuple))
+    def remove_omap_keys(self, write_op, keys):
+        """
+        remove omap keys specifiled
+        :para write_op: write operation object
+        :type write_op: int
+        :para keys: input key tuple
+        :type keys: tuple
+        """
+        key_num = len(keys)
+        key_array_type = c_char_p*key_num
+        key_array = key_array_type()
+        key_array[:] = keys
+        run_in_thread(self.librados.rados_write_op_omap_rm_keys,
+                      (c_void_p(write_op), byref(key_array), c_int(key_num)))
+
+    @requires(('write_op', int))
+    def clear_omap(self, write_op):
+        """
+        Remove all key/value pairs from an object
+        :para write_op: write operation object
+        :type write_op: int
+        """
+        run_in_thread(self.librados.rados_write_op_omap_clear,
+                      (c_void_p(write_op),))
+
+    @requires(('key', str), ('name', str), ('cookie', str), ('desc', str),
+              ('duration', opt(int)), ('flags', int))
     def lock_exclusive(self, key, name, cookie, desc="", duration=None, flags=0):
 
         """
@@ -1825,18 +2103,6 @@ returned %d, but should return zero on success." % (self.name, ret))
         :raises: :class:`Error`
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(name, str):
-            raise TypeError('name must be a string')
-        if not isinstance(cookie, str):
-            raise TypeError('cookie must be a string')
-        if not isinstance(desc, str):
-            raise TypeError('desc must be a string')
-        if duration is not None and not isinstance(duration, int):
-            raise TypeError('duration must be a integer')
-        if not isinstance(flags, int):
-            raise TypeError('flags must be a integer')
 
         ret = run_in_thread(self.librados.rados_lock_exclusive,
                             (self.io, c_char_p(key), c_char_p(name), c_char_p(cookie),
@@ -1846,6 +2112,8 @@ returned %d, but should return zero on success." % (self.name, ret))
         if ret < 0:
             raise make_ex(ret, "Ioctx.rados_lock_exclusive(%s): failed to set lock %s on %s" % (self.name, name, key))
 
+    @requires(('key', str), ('name', str), ('cookie', str), ('tag', str),
+              ('desc', str), ('duration', opt(int)), ('flags', int))
     def lock_shared(self, key, name, cookie, tag, desc="", duration=None, flags=0):
 
         """
@@ -1870,20 +2138,6 @@ returned %d, but should return zero on success." % (self.name, ret))
         :raises: :class:`Error`
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(name, str):
-            raise TypeError('name must be a string')
-        if not isinstance(cookie, str):
-            raise TypeError('cookie must be a string')
-        if not isinstance(tag, str):
-            raise TypeError('tag must be a string')
-        if not isinstance(desc, str):
-            raise TypeError('desc must be a string')
-        if duration is not None and not isinstance(duration, int):
-            raise TypeError('duration must be a integer')
-        if not isinstance(flags, int):
-            raise TypeError('flags must be a integer')
 
         ret = run_in_thread(self.librados.rados_lock_shared,
                             (self.io, c_char_p(key), c_char_p(name), c_char_p(cookie),
@@ -1893,6 +2147,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         if ret < 0:
             raise make_ex(ret, "Ioctx.rados_lock_exclusive(%s): failed to set lock %s on %s" % (self.name, name, key))
 
+    @requires(('key', str), ('name', str), ('cookie', str))
     def unlock(self, key, name, cookie):
 
         """
@@ -1909,12 +2164,6 @@ returned %d, but should return zero on success." % (self.name, ret))
         :raises: :class:`Error`
         """
         self.require_ioctx_open()
-        if not isinstance(key, str):
-            raise TypeError('key must be a string')
-        if not isinstance(name, str):
-            raise TypeError('name must be a string')
-        if not isinstance(cookie, str):
-            raise TypeError('cookie must be a string')
 
         ret = run_in_thread(self.librados.rados_unlock,
                             (self.io, c_char_p(key), c_char_p(name), c_char_p(cookie)))
@@ -1922,7 +2171,6 @@ returned %d, but should return zero on success." % (self.name, ret))
             raise make_ex(ret, "Ioctx.rados_lock_exclusive(%s): failed to set lock %s on %s" % (self.name, name, key))
 
 
-
 def set_object_locator(func):
     def retfunc(self, *args, **kwargs):
         if self.locator_key is not None:
@@ -1935,6 +2183,7 @@ def set_object_locator(func):
             return func(self, *args, **kwargs)
     return retfunc
 
+
 def set_object_namespace(func):
     def retfunc(self, *args, **kwargs):
         if self.nspace is None:
@@ -1946,6 +2195,7 @@ def set_object_namespace(func):
         return retval
     return retfunc
 
+
 class Object(object):
     """Rados object wrapper, makes the object look like a file"""
     def __init__(self, ioctx, key, locator_key=None, nspace=None):
@@ -1958,7 +2208,8 @@ class Object(object):
 
     def __str__(self):
         return "rados.Object(ioctx=%s,key=%s,nspace=%s,locator=%s)" % \
-        (str(self.ioctx), self.key, "--default--" if self.nspace is "" else self.nspace, self.locator_key)
+            (str(self.ioctx), self.key, "--default--"
+             if self.nspace is "" else self.nspace, self.locator_key)
 
     def require_object_exists(self):
         if self.state != "exists":
@@ -1966,7 +2217,7 @@ class Object(object):
 
     @set_object_locator
     @set_object_namespace
-    def read(self, length = 1024*1024):
+    def read(self, length=1024 * 1024):
         self.require_object_exists()
         ret = self.ioctx.read(self.key, length, self.offset)
         self.offset += len(ret)
@@ -1977,7 +2228,8 @@ class Object(object):
     def write(self, string_to_write):
         self.require_object_exists()
         ret = self.ioctx.write(self.key, string_to_write, self.offset)
-        self.offset += ret
+        if ret == 0:
+            self.offset += len(string_to_write)
         return ret
 
     @set_object_locator
@@ -2022,12 +2274,12 @@ class Object(object):
         return self.ioctx.rm_xattr(self.key, xattr_name)
 
 MONITOR_LEVELS = [
-   "debug",
-   "info",
-   "warn", "warning",
-   "err", "error",
-   "sec",
-   ]
+    "debug",
+    "info",
+    "warn", "warning",
+    "err", "error",
+    "sec",
+    ]
 
 
 class MonitorLog(object):
@@ -2064,15 +2316,15 @@ class MonitorLog(object):
         self.level = level
         self.callback = callback
         self.arg = arg
-        callback_factory = CFUNCTYPE(c_int,    # return type (really void)
-                                     c_void_p, # arg
-                                     c_char_p, # line
-                                     c_char_p, # who
-                                     c_uint64, # timestamp_sec
-                                     c_uint64, # timestamp_nsec
-                                     c_ulong,  # seq
-                                     c_char_p, # level
-                                     c_char_p) # msg
+        callback_factory = CFUNCTYPE(c_int,     # return type (really void)
+                                     c_void_p,  # arg
+                                     c_char_p,  # line
+                                     c_char_p,  # who
+                                     c_uint64,  # timestamp_sec
+                                     c_uint64,  # timestamp_nsec
+                                     c_ulong,   # seq
+                                     c_char_p,  # level
+                                     c_char_p)  # msg
         self.internal_callback = callback_factory(self.monitor_log_callback)
 
         r = run_in_thread(cluster.librados.rados_monitor_log,
diff --git a/src/pybind/rbd.py b/src/pybind/rbd.py
index 35bf5c7..b570a00 100644
--- a/src/pybind/rbd.py
+++ b/src/pybind/rbd.py
@@ -28,57 +28,108 @@ ADMIN_AUID = 0
 RBD_FEATURE_LAYERING = 1
 RBD_FEATURE_STRIPINGV2 = 2
 RBD_FEATURE_EXCLUSIVE_LOCK = 4
+RBD_FEATURE_OBJECT_MAP = 8
+RBD_FEATURE_FAST_DIFF = 16
+RBD_FEATURE_DEEP_FLATTEN = 32
+
+RBD_FEATURES_ALL = (RBD_FEATURE_LAYERING       |
+                    RBD_FEATURE_STRIPINGV2     |
+                    RBD_FEATURE_EXCLUSIVE_LOCK |
+                    RBD_FEATURE_OBJECT_MAP     |
+                    RBD_FEATURE_FAST_DIFF      |
+                    RBD_FEATURE_DEEP_FLATTEN)
+
+# features that make an image inaccessible for read or write by
+# clients that don't understand them
+RBD_FEATURES_INCOMPATIBLE = (RBD_FEATURE_LAYERING |
+                             RBD_FEATURE_STRIPINGV2)
+
+# features that make an image unwritable by clients that don't
+# understand them
+RBD_FEATURES_RW_INCOMPATIBLE = (RBD_FEATURES_INCOMPATIBLE  |
+                                RBD_FEATURE_EXCLUSIVE_LOCK |
+                                RBD_FEATURE_OBJECT_MAP     |
+                                RBD_FEATURE_FAST_DIFF      |
+                                RBD_FEATURE_DEEP_FLATTEN)
+
+# features that may be dynamically enabled or disabled
+RBD_FEATURES_MUTABLE = (RBD_FEATURE_EXCLUSIVE_LOCK |
+                        RBD_FEATURE_OBJECT_MAP     |
+                        RBD_FEATURE_FAST_DIFF)
+
+# features that only work when used with a single client
+# using the image for writes
+RBD_FEATURES_SINGLE_CLIENT = (RBD_FEATURE_EXCLUSIVE_LOCK |
+                              RBD_FEATURE_OBJECT_MAP     |
+                              RBD_FEATURE_FAST_DIFF)
 
 RBD_FLAG_OBJECT_MAP_INVALID = 1
 
+
 class Error(Exception):
     pass
 
+
 class PermissionError(Error):
     pass
 
+
 class ImageNotFound(Error):
     pass
 
+
 class ImageExists(Error):
     pass
 
+
 class IOError(Error):
     pass
 
+
 class NoSpace(Error):
     pass
 
+
 class IncompleteWriteError(Error):
     pass
 
+
 class InvalidArgument(Error):
     pass
 
+
 class LogicError(Error):
     pass
 
+
 class ReadOnlyImage(Error):
     pass
 
+
 class ImageBusy(Error):
     pass
 
+
 class ImageHasSnapshots(Error):
     pass
 
+
 class FunctionNotSupported(Error):
     pass
 
+
 class ArgumentOutOfRange(Error):
     pass
 
+
 class ConnectionShutdown(Error):
     pass
 
+
 class Timeout(Error):
     pass
 
+
 def make_ex(ret, msg):
     """
     Translate a librbd return code into an exception.
@@ -110,6 +161,7 @@ def make_ex(ret, msg):
     else:
         return Error(msg + (": error code %d" % ret))
 
+
 class rbd_image_info_t(Structure):
     _fields_ = [("size", c_uint64),
                 ("obj_size", c_uint64),
@@ -119,11 +171,13 @@ class rbd_image_info_t(Structure):
                 ("parent_pool", c_int64),
                 ("parent_name", c_char * 96)]
 
+
 class rbd_snap_info_t(Structure):
     _fields_ = [("id", c_uint64),
                 ("size", c_uint64),
                 ("name", c_char_p)]
 
+
 def load_librbd():
     """
     Load the librbd shared library.
@@ -139,6 +193,7 @@ def load_librbd():
     except OSError as e:
         raise EnvironmentError("Unable to load librbd: %s" % e)
 
+
 class RBD(object):
     """
     This class wraps librbd CRUD functions.
@@ -256,8 +311,8 @@ class RBD(object):
         ret = self.librbd.rbd_clone(p_ioctx.io, c_char_p(p_name),
                                     c_char_p(p_snapname),
                                     c_ioctx.io, c_char_p(c_name),
-                                          c_uint64(features),
-                                          byref(c_int(order)))
+                                    c_uint64(features),
+                                    byref(c_int(order)))
         if ret < 0:
             raise make_ex(ret, 'error creating clone')
 
@@ -320,6 +375,7 @@ class RBD(object):
         if ret != 0:
             raise make_ex(ret, 'error renaming image')
 
+
 class Image(object):
     """
     This class represents an RBD image. It is used to perform I/O on
@@ -392,7 +448,10 @@ class Image(object):
         """
         if not self.closed:
             self.closed = True
-            self.librbd.rbd_close(self.image)
+            ret = self.librbd.rbd_close(self.image)
+            if ret < 0:
+                raise make_ex(ret, 'error while closing image %s' % (
+                              self.name,))
 
     def __del__(self):
         self.close()
@@ -475,7 +534,7 @@ class Image(object):
             if ret == -errno.ERANGE:
                 size *= 2
 
-        if (ret != 0):
+        if ret != 0:
             raise make_ex(ret, 'error getting parent info for image %s' % (self.name,))
         return (pool.value, name.value, snapname.value)
 
@@ -487,7 +546,7 @@ class Image(object):
         """
         old = c_uint8()
         ret = self.librbd.rbd_get_old_format(self.image, byref(old))
-        if (ret != 0):
+        if ret != 0:
             raise make_ex(ret, 'error getting old_format for image' % (self.name))
         return old.value != 0
 
@@ -500,7 +559,7 @@ class Image(object):
         """
         image_size = c_uint64()
         ret = self.librbd.rbd_get_size(self.image, byref(image_size))
-        if (ret != 0):
+        if ret != 0:
             raise make_ex(ret, 'error getting size for image' % (self.name))
         return image_size.value
 
@@ -512,10 +571,28 @@ class Image(object):
         """
         features = c_uint64()
         ret = self.librbd.rbd_get_features(self.image, byref(features))
-        if (ret != 0):
+        if ret != 0:
             raise make_ex(ret, 'error getting features for image' % (self.name))
         return features.value
 
+    def update_features(self, features, enabled):
+        """
+        Updates the features bitmask of the image by enabling/disabling
+        a single feature.  The feature must support the ability to be
+        dynamically enabled/disabled.
+
+        :param features: feature bitmask to enable/disable
+        :type features: int
+        :param enabled: whether to enable/disable the feature
+        :type enabled: bool
+        :raises: :class:`InvalidArgument`
+        """
+        ret = self.librbd.rbd_update_features(self.image, c_uint64(features),
+                                              c_uint8(enabled));
+        if ret != 0:
+            raise make_ex(ret, 'error updating features for image %s' %
+                               (self.name))
+
     def overlap(self):
         """
         Gets the number of overlapping bytes between the image and its parent
@@ -527,7 +604,7 @@ class Image(object):
         """
         overlap = c_uint64()
         ret = self.librbd.rbd_get_overlap(self.image, byref(overlap))
-        if (ret != 0):
+        if ret != 0:
             raise make_ex(ret, 'error getting overlap for image' % (self.name))
         return overlap.value
 
@@ -539,7 +616,7 @@ class Image(object):
         """
         flags = c_uint64()
         ret = self.librbd.rbd_get_flags(self.image, byref(flags))
-        if (ret != 0):
+        if ret != 0:
             raise make_ex(ret, 'error getting flags for image' % (self.name))
         return flags.value
 
@@ -551,7 +628,7 @@ class Image(object):
         """
         owner = c_int()
         ret = self.librbd.rbd_is_exclusive_lock_owner(self.image, byref(owner))
-        if (ret != 0):
+        if ret != 0:
             raise make_ex(ret, 'error getting lock status for image' % (self.name))
         return owner.value == 1
 
@@ -695,23 +772,26 @@ class Image(object):
         :type offset: int
         :param length: how many bytes to read
         :type length: int
-	:param fadvise_flags: fadvise flags for this read
-	:type fadvise_flags: int
+        :param fadvise_flags: fadvise flags for this read
+        :type fadvise_flags: int
         :returns: str - the data read
         :raises: :class:`InvalidArgument`, :class:`IOError`
         """
         ret_buf = create_string_buffer(length)
-	if fadvise_flags == 0:
-	  ret = self.librbd.rbd_read(self.image, c_uint64(offset),
-				      c_size_t(length), byref(ret_buf))
-	else:
-	  ret = self.librbd.rbd_read2(self.image, c_uint64(offset),
-					c_size_t(length), byref(ret_buf), c_int(fadvise_flags))
+        if fadvise_flags == 0:
+            ret = self.librbd.rbd_read(self.image, c_uint64(offset),
+                                       c_size_t(length), byref(ret_buf))
+        else:
+            ret = self.librbd.rbd_read2(self.image, c_uint64(offset),
+                                        c_size_t(length), byref(ret_buf),
+                                        c_int(fadvise_flags))
         if ret < 0:
             raise make_ex(ret, 'error reading %s %ld~%ld' % (self.image, offset, length))
+
         return ctypes.string_at(ret_buf, ret)
 
-    def diff_iterate(self, offset, length, from_snapshot, iterate_cb):
+    def diff_iterate(self, offset, length, from_snapshot, iterate_cb,
+                     include_parent = True, whole_object = False):
         """
         Iterate over the changed extents of an image.
 
@@ -744,6 +824,10 @@ class Image(object):
         :param iterate_cb: function to call for each extent
         :type iterate_cb: function acception arguments for offset,
                            length, and exists
+        :param include_parent: True if full history diff should include parent
+        :type include_parent: bool
+        :param whole_object: True if diff extents should cover whole object
+        :type whole_object: bool
         :raises: :class:`InvalidArgument`, :class:`IOError`,
                  :class:`ImageNotFound`
         """
@@ -753,12 +837,14 @@ class Image(object):
         RBD_DIFF_CB = CFUNCTYPE(c_int, c_uint64, c_size_t, c_int, c_void_p)
         cb_holder = DiffIterateCB(iterate_cb)
         cb = RBD_DIFF_CB(cb_holder.callback)
-        ret = self.librbd.rbd_diff_iterate(self.image,
-                                           c_char_p(from_snapshot),
-                                           c_uint64(offset),
-                                           c_uint64(length),
-                                           cb,
-                                           c_void_p(None))
+        ret = self.librbd.rbd_diff_iterate2(self.image,
+                                            c_char_p(from_snapshot),
+                                            c_uint64(offset),
+                                            c_uint64(length),
+                                            c_uint8(include_parent),
+                                            c_uint8(whole_object),
+                                            cb,
+                                            c_void_p(None))
         if ret < 0:
             msg = 'error generating diff from snapshot %s' % from_snapshot
             raise make_ex(ret, msg)
@@ -772,8 +858,8 @@ class Image(object):
         :type data: str
         :param offset: where to start writing data
         :type offset: int
-	:param fadvise_flags: fadvise flags for this write
-	:type fadvise_flags: int
+        :param fadvise_flags: fadvise flags for this write
+        :type fadvise_flags: int
         :returns: int - the number of bytes written
         :raises: :class:`IncompleteWriteError`, :class:`LogicError`,
                  :class:`InvalidArgument`, :class:`IOError`
@@ -781,12 +867,14 @@ class Image(object):
         if not isinstance(data, str):
             raise TypeError('data must be a string')
         length = len(data)
-	if fadvise_flags == 0:
-	  ret = self.librbd.rbd_write(self.image, c_uint64(offset),
-	                              c_size_t(length), c_char_p(data))
-	else:
-	  ret = self.librbd.rbd_write2(self.image, c_uint64(offset),
-	                              c_size_t(length), c_char_p(data), c_int(fadvise_flags))
+
+        if fadvise_flags == 0:
+            ret = self.librbd.rbd_write(self.image, c_uint64(offset),
+                                        c_size_t(length), c_char_p(data))
+        else:
+            ret = self.librbd.rbd_write2(self.image, c_uint64(offset),
+                                         c_size_t(length), c_char_p(data),
+                                         c_int(fadvise_flags))
 
         if ret == length:
             return ret
@@ -852,7 +940,7 @@ written." % (self.name, ret, length))
         Flatten clone image (copy all blocks from parent to child)
         """
         ret = self.librbd.rbd_flatten(self.image)
-        if (ret < 0):
+        if ret < 0:
             raise make_ex(ret, "error flattening %s" % self.name)
 
     def list_children(self):
@@ -984,6 +1072,7 @@ written." % (self.name, ret, length))
         if ret < 0:
             raise make_ex(ret, 'error unlocking image')
 
+
 class DiffIterateCB(object):
     def __init__(self, cb):
         self.cb = cb
@@ -992,6 +1081,7 @@ class DiffIterateCB(object):
         self.cb(offset, length, exists == 1)
         return 0
 
+
 class SnapIterator(object):
     """
     Iterator over snapshot info for an image.
diff --git a/src/rbd.cc b/src/rbd.cc
old mode 100644
new mode 100755
index 6f5457d..ff9cf40
--- a/src/rbd.cc
+++ b/src/rbd.cc
@@ -33,6 +33,7 @@
 #include <boost/accumulators/statistics/stats.hpp>
 #include <boost/accumulators/statistics/rolling_sum.hpp>
 #include <boost/assign/list_of.hpp>
+#include <boost/bind.hpp>
 #include <boost/scope_exit.hpp>
 #include <boost/scoped_ptr.hpp>
 #include <errno.h>
@@ -76,101 +77,163 @@ map<string, string> map_options; // -o / --options map
 
 #define dout_subsys ceph_subsys_rbd
 
+namespace {
+
+void aio_context_callback(librbd::completion_t completion, void *arg)
+{
+  librbd::RBD::AioCompletion *aio_completion =
+    reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
+  Context *context = reinterpret_cast<Context *>(arg);
+  context->complete(aio_completion->get_return_value());
+  aio_completion->release();
+}
+
+} // anonymous namespace
+
+static std::map<uint64_t, std::string> feature_mapping =
+  boost::assign::map_list_of(
+    RBD_FEATURE_LAYERING, "layering")(
+    RBD_FEATURE_STRIPINGV2, "striping")(
+    RBD_FEATURE_EXCLUSIVE_LOCK, "exclusive-lock")(
+    RBD_FEATURE_OBJECT_MAP, "object-map")(
+    RBD_FEATURE_FAST_DIFF, "fast-diff")(
+    RBD_FEATURE_DEEP_FLATTEN, "deep-flatten");
+
 void usage()
 {
   cout <<
 "usage: rbd [-n <auth user>] [OPTIONS] <cmd> ...\n"
 "where 'pool' is a rados pool name (default is 'rbd') and 'cmd' is one of:\n"
-"  (ls | list) [-l | --long ] [pool-name] list rbd images\n"
+"  (ls | list) [-l | --long ] [pool-name]      list rbd images\n"
 "                                              (-l includes snapshots/clones)\n"
-"  info <image-name>                           show information about image size,\n"
+"  (du | disk-usage) [<image-spec> | <snap-spec>]\n"
+"                                              show disk usage stats for pool,\n"
+"                                              image or snapshot\n"
+"  info <image-spec> | <snap-spec>             show information about image size,\n"
 "                                              striping, etc.\n"
 "  create [--order <bits>] [--image-features <features>] [--image-shared]\n"
-"         --size <MB> <name>                   create an empty image\n"
+"         --size <M/G/T> <image-spec>          create an empty image\n"
 "  clone [--order <bits>] [--image-features <features>] [--image-shared]\n"
-"        <parentsnap> <clonename>              clone a snapshot into a COW\n"
+"         <parent-snap-spec> <child-image-spec>\n"
+"                                              clone a snapshot into a COW\n"
 "                                              child image\n"
-"  children <snap-name>                        display children of snapshot\n"
-"  flatten <image-name>                        fill clone with parent data\n"
+"  children <snap-spec>                        display children of snapshot\n"
+"  flatten <image-spec>                        fill clone with parent data\n"
 "                                              (make it independent)\n"
-"  resize --size <MB> <image-name>             resize (expand or contract) image\n"
-"  rm <image-name>                             delete an image\n"
-"  export <image-name> <path>                  export image to file\n"
+"  resize --size <M/G/T> <image-spec>          resize (expand or contract) image\n"
+"  rm <image-spec>                             delete an image\n"
+"  export (<image-spec> | <snap-spec>) [<path>]\n"
+"                                              export image to file\n"
 "                                              \"-\" for stdout\n"
 "  import [--image-features <features>] [--image-shared]\n"
-"         <path> <image-name>                  import image from file (dest\n"
-"                                              defaults as the filename part\n"
-"                                              of file). \"-\" for stdin\n"
-"  diff <image-name> [--from-snap <snap-name>] print extents that differ since\n"
+"         <path> [<image-spec>]                import image from file\n"
+"                                              \"-\" for stdin\n"
+"                                              \"rbd/$(basename <path>)\" is\n"
+"                                              assumed for <image-spec> if\n"
+"                                              omitted\n"
+"  diff [--from-snap <snap-name>] [--whole-object]\n"
+"         <image-spec> | <snap-spec>           print extents that differ since\n"
 "                                              a previous snap, or image creation\n"
-"  export-diff <image-name> [--from-snap <snap-name>] <path>\n"
-"                                              export an incremental diff to\n"
+"  export-diff [--from-snap <snap-name>] [--whole-object]\n"
+"         (<image-spec> | <snap-spec>) <path>  export an incremental diff to\n"
 "                                              path, or \"-\" for stdout\n"
 "  merge-diff <diff1> <diff2> <path>           merge <diff1> and <diff2> into\n"
 "                                              <path>, <diff1> could be \"-\"\n"
 "                                              for stdin, and <path> could be \"-\"\n"
 "                                              for stdout\n"
-"  import-diff <path> <image-name>             import an incremental diff from\n"
+"  import-diff <path> <image-spec>             import an incremental diff from\n"
 "                                              path or \"-\" for stdin\n"
-"  (cp | copy) <src> <dest>                    copy src image to dest\n"
-"  (mv | rename) <src> <dest>                  rename src image to dest\n"
-"  snap ls <image-name>                        dump list of image snapshots\n"
-"  snap create <snap-name>                     create a snapshot\n"
-"  snap rollback <snap-name>                   rollback image to snapshot\n"
-"  snap rm <snap-name>                         deletes a snapshot\n"
-"  snap purge <image-name>                     deletes all snapshots\n"
-"  snap protect <snap-name>                    prevent a snapshot from being deleted\n"
-"  snap unprotect <snap-name>                  allow a snapshot to be deleted\n"
-"  watch <image-name>                          watch events on image\n"
-"  status <image-name>                         show the status of this image\n"
-"  map <image-name>                            map image to a block device\n"
+"  (cp | copy) (<src-image-spec> | <src-snap-spec>) <dest-image-spec>\n"
+"                                              copy src image to dest\n"
+"  (mv | rename) <src-image-spec> <dest-image-spec>\n"
+"                                              rename src image to dest\n"
+"  image-meta list <image-spec>                image metadata list keys with values\n"
+"  image-meta get <image-spec> <key>           image metadata get the value associated with the key\n"
+"  image-meta set <image-spec> <key> <value>   image metadata set key with value\n"
+"  image-meta remove <image-spec> <key>        image metadata remove the key and value associated\n"
+"  object-map rebuild <image-spec> | <snap-spec>\n"
+"                                              rebuild an invalid object map\n"
+"  snap ls <image-spec>                        dump list of image snapshots\n"
+"  snap create <snap-spec>                     create a snapshot\n"
+"  snap rollback <snap-spec>                   rollback image to snapshot\n"
+"  snap rm <snap-spec>                         deletes a snapshot\n"
+"  snap purge <image-spec>                     deletes all snapshots\n"
+"  snap protect <snap-spec>                    prevent a snapshot from being deleted\n"
+"  snap unprotect <snap-spec>                  allow a snapshot to be deleted\n"
+"  watch <image-spec>                          watch events on image\n"
+"  status <image-spec>                         show the status of this image\n"
+"  map <image-spec> | <snap-spec>              map image to a block device\n"
 "                                              using the kernel\n"
-"  unmap <device>                              unmap a rbd device that was\n"
+"  unmap <image-spec> | <snap-spec> | <device> unmap a rbd device that was\n"
 "                                              mapped by the kernel\n"
 "  showmapped                                  show the rbd images mapped\n"
 "                                              by the kernel\n"
-"  lock list <image-name>                      show locks held on an image\n"
-"  lock add <image-name> <id> [--shared <tag>] take a lock called id on an image\n"
-"  lock remove <image-name> <id> <locker>      release a lock on an image\n"
-"  bench-write <image-name>                    simple write benchmark\n"
-"                 --io-size <bytes>              write size\n"
-"                 --io-threads <num>             ios in flight\n"
-"                 --io-total <bytes>             total bytes to write\n"
-"                 --io-pattern <seq|rand>        write pattern\n"
+"  feature disable <image-spec> <feature>      disable the specified image feature\n"
+"  feature enable <image-spec> <feature>       enable the specified image feature\n"
+"  lock list <image-spec>                      show locks held on an image\n"
+"  lock add <image-spec> <id> [--shared <tag>] take a lock called id on an image\n"
+"  lock remove <image-spec> <id> <locker>      release a lock on an image\n"
+"  bench-write <image-spec>                    simple write benchmark\n"
+"               --io-size <size in B/K/M/G/T>    write size\n"
+"               --io-threads <num>               ios in flight\n"
+"               --io-total <size in B/K/M/G/T>   total size to write\n"
+"               --io-pattern <seq|rand>          write pattern\n"
 "\n"
-"<image-name>, <snap-name> are [pool/]name[@snap], or you may specify\n"
-"individual pieces of names with -p/--pool, --image, and/or --snap.\n"
+"<image-spec> is [<pool-name>]/<image-name>,\n"
+"<snap-spec> is [<pool-name>]/<image-name>@<snap-name>,\n"
+"or you may specify individual pieces of names with -p/--pool <pool-name>,\n"
+"--image <image-name> and/or --snap <snap-name>.\n"
 "\n"
 "Other input options:\n"
-"  -p, --pool <pool>                  source pool name\n"
+"  -p, --pool <pool-name>             source pool name\n"
+"  --dest-pool <pool-name>            destination pool name\n"
 "  --image <image-name>               image name\n"
-"  --dest <image-name>                destination [pool and] image name\n"
+"  --dest <image-name>                destination image name\n"
 "  --snap <snap-name>                 snapshot name\n"
-"  --dest-pool <name>                 destination pool name\n"
 "  --path <path-name>                 path name for import/export\n"
-"  --size <size in MB>                size of image for create and resize\n"
+"  -s, --size <size in M/G/T>         size of image for create and resize\n"
 "  --order <bits>                     the object size in bits; object size will be\n"
 "                                     (1 << order) bytes. Default is 22 (4 MB).\n"
 "  --image-format <format-number>     format to use when creating an image\n"
-"                                     format 1 is the original format (default)\n"
-"                                     format 2 supports cloning\n"
-"  --image-features <features>        optional format 2 features to enable\n"
-"                                     +1 layering support, +2 striping v2,\n"
-"                                     +4 exclusive lock, +8 object map\n"
+"                                     format 1 is the original format\n"
+"                                     format 2 supports cloning (default)\n"
+"  --image-feature <feature>          optional format 2 feature to enable.\n"
+"                                     use multiple times to enable multiple features\n"
 "  --image-shared                     image will be used concurrently (disables\n"
 "                                     RBD exclusive lock and dependent features)\n"
-"  --stripe-unit <size-in-bytes>      size (in bytes) of a block of data\n"
+"  --stripe-unit <size in B/K/M>      size of a block of data\n"
 "  --stripe-count <num>               number of consecutive objects in a stripe\n"
 "  --id <username>                    rados user (without 'client.'prefix) to\n"
 "                                     authenticate as\n"
 "  --keyfile <path>                   file containing secret key for use with cephx\n"
+"  --keyring <path>                   file containing keyring for use with cephx\n"
 "  --shared <tag>                     take a shared (rather than exclusive) lock\n"
 "  --format <output-format>           output format (default: plain, json, xml)\n"
 "  --pretty-format                    make json or xml output more readable\n"
 "  --no-progress                      do not show progress for long-running commands\n"
 "  -o, --options <map-options>        options to use when mapping an image\n"
 "  --read-only                        set device readonly when mapping image\n"
-"  --allow-shrink                     allow shrinking of an image when resizing\n";
+"  --allow-shrink                     allow shrinking of an image when resizing\n"
+"\n"
+"Supported image features:\n"
+"  ";
+
+for (std::map<uint64_t, std::string>::const_iterator it = feature_mapping.begin();
+     it != feature_mapping.end(); ++it) {
+  if (it != feature_mapping.begin()) {
+    cout << ", ";
+  }
+  cout << it->second;
+  if ((it->first & RBD_FEATURES_MUTABLE) != 0) {
+    cout << " (*)";
+  }
+  if ((it->first & g_conf->rbd_default_features) != 0) {
+    cout << " (+)";
+  }
+}
+cout << "\n\n"
+     << "  (*) supports enabling/disabling on existing images\n"
+     << "  (+) enabled by default for new images if features are not specified\n";
 }
 
 static void format_bitmask(Formatter *f, const std::string &name,
@@ -208,21 +271,28 @@ static void format_bitmask(Formatter *f, const std::string &name,
 
 static void format_features(Formatter *f, uint64_t features)
 {
-  std::map<uint64_t, std::string> mapping = boost::assign::map_list_of(
-    RBD_FEATURE_LAYERING, "layering")(
-    RBD_FEATURE_STRIPINGV2, "striping")(
-    RBD_FEATURE_EXCLUSIVE_LOCK, "exclusive")(
-    RBD_FEATURE_OBJECT_MAP, "object map");
-  format_bitmask(f, "feature", mapping, features);
+  format_bitmask(f, "feature", feature_mapping, features);
 }
 
 static void format_flags(Formatter *f, uint64_t flags)
 {
   std::map<uint64_t, std::string> mapping = boost::assign::map_list_of(
-    RBD_FLAG_OBJECT_MAP_INVALID, "object map invalid");
+    RBD_FLAG_OBJECT_MAP_INVALID, "object map invalid")(
+    RBD_FLAG_FAST_DIFF_INVALID, "fast diff invalid");
   format_bitmask(f, "flag", mapping, flags);
 }
 
+static bool decode_feature(const char* feature_name, uint64_t *feature) {
+  for (std::map<uint64_t, std::string>::const_iterator it = feature_mapping.begin();
+       it != feature_mapping.end(); ++it) {
+    if (strcmp(feature_name, it->second.c_str()) == 0) {
+      *feature = it->first;
+      return true;
+    }
+  }
+  return false;
+}
+
 struct MyProgressContext : public librbd::ProgressContext {
   const char *operation;
   int last_pc;
@@ -715,23 +785,37 @@ static int do_purge_snaps(librbd::Image& image)
 {
   MyProgressContext pc("Removing all snapshots");
   std::vector<librbd::snap_info_t> snaps;
+  bool is_protected = false;
   int r = image.snap_list(snaps);
   if (r < 0) {
     pc.fail();
     return r;
-  }
-
-  for (size_t i = 0; i < snaps.size(); ++i) {
-    r = image.snap_remove(snaps[i].name.c_str());
-    if (r < 0) {
-      pc.fail();
-      return r;
+  } else if (0 == snaps.size()) {
+    return 0;
+  } else {  
+    for (size_t i = 0; i < snaps.size(); ++i) {
+      r = image.snap_is_protected(snaps[i].name.c_str(), &is_protected);      
+      if (r < 0) {
+        pc.fail();
+        return r;
+      } else if (is_protected == true) {
+        pc.fail();
+        cerr << "\r" << "rbd: snapshot '" <<snaps[i].name.c_str()<< "' is protected from removal." << std::endl;
+        return -EBUSY;
+      }
+    }
+    for (size_t i = 0; i < snaps.size(); ++i) {
+      r = image.snap_remove(snaps[i].name.c_str());
+      if (r < 0) {
+        pc.fail();
+        return r;
+      }
+      pc.update_progress(i + 1, snaps.size());
     }
-    pc.update_progress(i + 1, snaps.size());
-  }
 
-  pc.finish();
-  return 0;
+    pc.finish();
+    return 0;
+  }
 }
 
 static int do_protect_snap(librbd::Image& image, const char *snapname)
@@ -978,15 +1062,7 @@ static int do_bench_write(librbd::Image& image, uint64_t io_size,
   for (off = 0; off < io_bytes; ) {
     b.wait_for(io_threads - 1);
     i = 0;
-    while (i < io_threads && off < io_bytes &&
-	   b.start_write(io_threads, thread_offset[i], io_size, bl, op_flags)) {
-      ++i;
-      ++ios;
-      off += io_size;
-
-      ++cur_ios;
-      cur_off += io_size;
-
+    while (i < io_threads && off < io_bytes) {
       if (pattern == "rand") {
         thread_offset[i] = (rand() % (size / io_size)) * io_size;
       } else {
@@ -994,6 +1070,16 @@ static int do_bench_write(librbd::Image& image, uint64_t io_size,
         if (thread_offset[i] + io_size > size)
           thread_offset[i] = 0;
       }
+
+      if (!b.start_write(io_threads, thread_offset[i], io_size, bl, op_flags))
+	break;
+
+      ++i;
+      ++ios;
+      off += io_size;
+
+      ++cur_ios;
+      cur_off += io_size;
     }
 
     utime_t now = ceph_clock_now(NULL);
@@ -1031,48 +1117,33 @@ static int do_bench_write(librbd::Image& image, uint64_t io_size,
   return 0;
 }
 
-struct ExportContext {
-  librbd::Image *image;
-  int fd;
-  uint64_t totalsize;
-  MyProgressContext pc;
-
-  ExportContext(librbd::Image *i, int f, uint64_t t) :
-    image(i),
-    fd(f),
-    totalsize(t),
-    pc("Exporting image")
-  {}
-};
-
-class AioExportContext : public Context
+class C_Export : public Context
 {
 public:
-  AioExportContext(SimpleThrottle &simple_throttle, librbd::Image &image,
+  C_Export(SimpleThrottle &simple_throttle, librbd::Image &image,
                    uint64_t offset, uint64_t length, int fd)
     : m_aio_completion(
-        new librbd::RBD::AioCompletion(this, &AioExportContext::aio_callback)),
-      m_throttle(simple_throttle),
-      m_offset(offset),
-      m_fd(fd)
+        new librbd::RBD::AioCompletion(this, &aio_context_callback)),
+      m_throttle(simple_throttle), m_image(image), m_offset(offset),
+      m_length(length), m_fd(fd)
+  {
+  }
+
+  void send()
   {
     m_throttle.start_op();
 
     int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
 		   LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
-    int r = image.aio_read2(offset, length, m_bufferlist, m_aio_completion,
-			    op_flags);
+    int r = m_image.aio_read2(m_offset, m_length, m_bufferlist,
+                              m_aio_completion, op_flags);
     if (r < 0) {
       cerr << "rbd: error requesting read from source image" << std::endl;
+      m_aio_completion->release();
       m_throttle.end_op(r);
     }
   }
 
-  virtual ~AioExportContext()
-  {
-    m_aio_completion->release();
-  }
-
   virtual void finish(int r)
   {
     BOOST_SCOPE_EXIT((&m_throttle) (&r))
@@ -1108,19 +1179,13 @@ public:
     }
   }
 
-  static void aio_callback(librbd::completion_t completion, void *arg)
-  {
-    librbd::RBD::AioCompletion *aio_completion =
-      reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
-    AioExportContext *export_context = reinterpret_cast<AioExportContext*>(arg);
-    export_context->complete(aio_completion->get_return_value());
-  }
-
 private:
   librbd::RBD::AioCompletion *m_aio_completion;
   SimpleThrottle &m_throttle;
+  librbd::Image &m_image;
   bufferlist m_bufferlist;
   uint64_t m_offset;
+  uint64_t m_length;
   int m_fd;
 };
 
@@ -1151,8 +1216,14 @@ static int do_export(librbd::Image& image, const char *path)
   SimpleThrottle throttle(max_concurrent_ops, false);
   uint64_t period = image.get_stripe_count() * (1ull << info.order);
   for (uint64_t offset = 0; offset < info.size; offset += period) {
+    if (throttle.pending_error()) {
+      break;
+    }
+
     uint64_t length = min(period, info.size - offset);
-    new AioExportContext(throttle, image, offset, length, fd);
+    C_Export *ctx = new C_Export(throttle, image, offset, length, fd);
+    ctx->send();
+
     pc.update_progress(offset, info.size);
   }
 
@@ -1172,40 +1243,96 @@ static int do_export(librbd::Image& image, const char *path)
   return r;
 }
 
-static int export_diff_cb(uint64_t ofs, size_t _len, int exists, void *arg)
-{
-  ExportContext *ec = static_cast<ExportContext *>(arg);
-  int r;
+struct ExportDiffContext {
+  librbd::Image *image;
+  int fd;
+  uint64_t totalsize;
+  MyProgressContext pc;
+  OrderedThrottle throttle;
 
-  // extent
-  bufferlist bl;
-  __u8 tag = exists ? 'w' : 'z';
-  ::encode(tag, bl);
-  ::encode(ofs, bl);
-  uint64_t len = _len;
-  ::encode(len, bl);
-  r = bl.write_fd(ec->fd);
-  if (r < 0)
-    return r;
+  ExportDiffContext(librbd::Image *i, int f, uint64_t t, int max_ops) :
+    image(i), fd(f), totalsize(t), pc("Exporting image"),
+    throttle(max_ops, true) {
+  }
+};
 
-  if (exists) {
-    // read block
-    bl.clear();
-    r = ec->image->read2(ofs, len, bl, LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
-    if (r < 0)
-      return r;
-    r = bl.write_fd(ec->fd);
-    if (r < 0)
-      return r;
+class C_ExportDiff : public Context {
+public:
+  C_ExportDiff(ExportDiffContext *edc, uint64_t offset, uint64_t length,
+               bool exists)
+    : m_export_diff_context(edc), m_offset(offset), m_length(length),
+      m_exists(exists) {
   }
 
-  ec->pc.update_progress(ofs, ec->totalsize);
+  int send() {
+    if (m_export_diff_context->throttle.pending_error()) {
+      return m_export_diff_context->throttle.wait_for_ret();
+    }
 
-  return 0;
-}
+    C_OrderedThrottle *ctx = m_export_diff_context->throttle.start_op(this);
+    if (m_exists) {
+      librbd::RBD::AioCompletion *aio_completion =
+        new librbd::RBD::AioCompletion(ctx, &aio_context_callback);
+
+      int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+      int r = m_export_diff_context->image->aio_read2(
+        m_offset, m_length, m_read_data, aio_completion, op_flags);
+      if (r < 0) {
+        aio_completion->release();
+        ctx->complete(r);
+      }
+    } else {
+      ctx->complete(0);
+    }
+    return 0;
+  }
+
+  static int export_diff_cb(uint64_t offset, size_t length, int exists,
+                            void *arg) {
+    ExportDiffContext *edc = reinterpret_cast<ExportDiffContext *>(arg);
+
+    C_ExportDiff *context = new C_ExportDiff(edc, offset, length, exists);
+    return context->send();
+  }
+
+protected:
+  virtual void finish(int r) {
+    if (r >= 0) {
+      if (m_exists) {
+        m_exists = !m_read_data.is_zero();
+      }
+      r = write_extent(m_export_diff_context, m_offset, m_length, m_exists);
+      if (r == 0 && m_exists) {
+        r = m_read_data.write_fd(m_export_diff_context->fd);
+      }
+    }
+    m_export_diff_context->throttle.end_op(r);
+  }
+
+private:
+  ExportDiffContext *m_export_diff_context;
+  uint64_t m_offset;
+  uint64_t m_length;
+  bool m_exists;
+  bufferlist m_read_data;
+
+  static int write_extent(ExportDiffContext *edc, uint64_t offset,
+                          uint64_t length, bool exists) {
+    // extent
+    bufferlist bl;
+    __u8 tag = exists ? 'w' : 'z';
+    ::encode(tag, bl);
+    ::encode(offset, bl);
+    ::encode(length, bl);
+    int r = bl.write_fd(edc->fd);
+
+    edc->pc.update_progress(offset, edc->totalsize);
+    return r;
+  }
+};
 
 static int do_export_diff(librbd::Image& image, const char *fromsnapname,
-			  const char *endsnapname,
+			  const char *endsnapname, bool whole_object,
 			  const char *path)
 {
   int r;
@@ -1223,6 +1350,13 @@ static int do_export_diff(librbd::Image& image, const char *fromsnapname,
   if (fd < 0)
     return -errno;
 
+  BOOST_SCOPE_EXIT((&r) (&fd) (&path)) {
+    close(fd);
+    if (r < 0 && fd != 1) {
+      remove(path);
+    }
+  } BOOST_SCOPE_EXIT_END
+
   {
     // header
     bufferlist bl;
@@ -1250,15 +1384,22 @@ static int do_export_diff(librbd::Image& image, const char *fromsnapname,
 
     r = bl.write_fd(fd);
     if (r < 0) {
-      close(fd);
       return r;
     }
   }
 
-  ExportContext ec(&image, fd, info.size);
-  r = image.diff_iterate(fromsnapname, 0, info.size, export_diff_cb, (void *)&ec);
-  if (r < 0)
+  ExportDiffContext edc(&image, fd, info.size,
+                        g_conf->rbd_concurrent_management_ops);
+  r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+                          &C_ExportDiff::export_diff_cb, (void *)&edc);
+  if (r < 0) {
+    goto out;
+  }
+
+  r = edc.throttle.wait_for_ret();
+  if (r < 0) {
     goto out;
+  }
 
   {
     __u8 tag = 'e';
@@ -1268,11 +1409,10 @@ static int do_export_diff(librbd::Image& image, const char *fromsnapname,
   }
 
  out:
-  close(fd);
   if (r < 0)
-    ec.pc.fail();
+    edc.pc.fail();
   else
-    ec.pc.finish();
+    edc.pc.finish();
   return r;
 }
 
@@ -1301,7 +1441,7 @@ static int diff_cb(uint64_t ofs, size_t len, int exists, void *arg)
 }
 
 static int do_diff(librbd::Image& image, const char *fromsnapname,
-		   Formatter *f)
+                   bool whole_object, Formatter *f)
 {
   int r;
   librbd::image_info_t info;
@@ -1321,7 +1461,8 @@ static int do_diff(librbd::Image& image, const char *fromsnapname,
     om.t->define_column("Type", TextTable::LEFT, TextTable::LEFT);
   }
 
-  r = image.diff_iterate(fromsnapname, 0, info.size, diff_cb, &om);
+  r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+                          diff_cb, &om);
   if (f) {
     f->close_section();
     f->flush(cout);
@@ -1389,33 +1530,33 @@ done_img:
   update_snap_name(*new_img, snap);
 }
 
-class AioImportContext : public Context
+class C_Import : public Context
 {
 public:
-  AioImportContext(SimpleThrottle &simple_throttle, librbd::Image &image,
-                   bufferlist &bl, uint64_t offset)
-    : m_throttle(simple_throttle),
+  C_Import(SimpleThrottle &simple_throttle, librbd::Image &image,
+           bufferlist &bl, uint64_t offset)
+    : m_throttle(simple_throttle), m_image(image),
       m_aio_completion(
-        new librbd::RBD::AioCompletion(this, &AioImportContext::aio_callback)),
+        new librbd::RBD::AioCompletion(this, &aio_context_callback)),
       m_bufferlist(bl), m_offset(offset)
   {
+  }
+
+  void send()
+  {
     m_throttle.start_op();
 
     int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
 		   LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
-    int r = image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist,
-			     m_aio_completion, op_flags);
+    int r = m_image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist,
+			       m_aio_completion, op_flags);
     if (r < 0) {
       cerr << "rbd: error requesting write to destination image" << std::endl;
+      m_aio_completion->release();
       m_throttle.end_op(r);
     }
   }
 
-  virtual ~AioImportContext()
-  {
-    m_aio_completion->release();
-  }
-
   virtual void finish(int r)
   {
     if (r < 0) {
@@ -1425,16 +1566,9 @@ public:
     m_throttle.end_op(r);
   }
 
-  static void aio_callback(librbd::completion_t completion, void *arg)
-  {
-    librbd::RBD::AioCompletion *aio_completion =
-      reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
-    AioImportContext *import_context = reinterpret_cast<AioImportContext*>(arg);
-    import_context->complete(aio_completion->get_return_value());
-  }
-
 private:
   SimpleThrottle &m_throttle;
+  librbd::Image &m_image;
   librbd::RBD::AioCompletion *m_aio_completion;
   bufferlist m_bufferlist;
   uint64_t m_offset;
@@ -1519,6 +1653,10 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
 
   // loop body handles 0 return, as we may have a block to flush
   while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) {
+    if (throttle->pending_error()) {
+      break;
+    }
+
     blklen += readlen;
     // if read was short, try again to fill the block before writing
     if (readlen && ((size_t)readlen < reqlen)) {
@@ -1543,7 +1681,8 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
     // write as much as we got; perhaps less than imgblklen
     // but skip writing zeros to create sparse images
     if (!bl.is_zero()) {
-      new AioImportContext(*throttle, image, bl, image_pos);
+      C_Import *ctx = new C_Import(*throttle, image, bl, image_pos);
+      ctx->send();
     }
 
     // done with whole block, whether written or not
@@ -1567,7 +1706,7 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
     }
   }
 
-  r = 0;
+  r = image.close();
 
  done:
   if (!from_stdin) {
@@ -1927,12 +2066,16 @@ static int do_merge_diff(const char *first, const char *second, const char *path
   // and the (offset,length) in wztag must be ascending order.
 
   r = parse_diff_header(fd, &f_tag, &f_from, &f_to, &f_size);
-  if (r < 0)
+  if (r < 0) {
+    cerr << "rbd: failed to parse first diff header" << std::endl;
     goto done;
+  }
 
   r = parse_diff_header(sd, &s_tag, &s_from, &s_to, &s_size);
-  if (r < 0)
+  if (r < 0) {
+    cerr << "rbd: failed to parse second diff header" << std::endl;
     goto done;
+  }
 
   if (f_to != s_from) {
     r = -EINVAL;
@@ -1963,8 +2106,10 @@ static int do_merge_diff(const char *first, const char *second, const char *path
     ::encode(s_size, bl);
 
     r = bl.write_fd(pd);
-    if (r < 0)
+    if (r < 0) {
+      cerr << "rbd: failed to write merged diff header" << std::endl;
       goto done;
+    }
   }
 
   if (f_size > s_size)
@@ -1981,8 +2126,13 @@ static int do_merge_diff(const char *first, const char *second, const char *path
       uint64_t last_off = f_off;
 
       r = parse_diff_body(fd, &f_tag, &f_off, &f_len);
-      if (r < 0)
+      dout(2) << "first diff data chunk: tag=" << f_tag << ", "
+              << "off=" << f_off << ", "
+              << "len=" << f_len << dendl;
+      if (r < 0) {
+        cerr << "rbd: failed to read first diff data chunk header" << std::endl;
         goto done;
+      }
 
       if (f_tag == 'e') {
         f_end = true;
@@ -1996,6 +2146,8 @@ static int do_merge_diff(const char *first, const char *second, const char *path
 
       if (last_off > f_off) {
         r = -ENOTSUP;
+        cerr << "rbd: out-of-order offset from first diff ("
+             << last_off << " > " << f_off << ")" << std::endl;
         goto done;
       }
     }
@@ -2004,8 +2156,14 @@ static int do_merge_diff(const char *first, const char *second, const char *path
       uint64_t last_off = s_off;
 
       r = parse_diff_body(sd, &s_tag, &s_off, &s_len);
-      if (r < 0)
+      dout(2) << "second diff data chunk: tag=" << f_tag << ", "
+              << "off=" << f_off << ", "
+              << "len=" << f_len << dendl;
+      if (r < 0) {
+        cerr << "rbd: failed to read second diff data chunk header"
+             << std::endl;
         goto done;
+      }
 
       if (s_tag == 'e') {
         s_end = true;
@@ -2018,6 +2176,8 @@ static int do_merge_diff(const char *first, const char *second, const char *path
 
       if (last_off > s_off) {
         r = -ENOTSUP;
+        cerr << "rbd: out-of-order offset from second diff ("
+             << last_off << " > " << s_off << ")" << std::endl;
         goto done;
       }
     }
@@ -2045,12 +2205,12 @@ static int do_merge_diff(const char *first, const char *second, const char *path
         if (first_stdin) {
           bufferptr bp = buffer::create(delta);
           r = safe_read_exact(fd, bp.c_str(), delta);
-          if (r < 0)
-            goto done;
         } else {
           r = lseek(fd, delta, SEEK_CUR);
-          if(r < 0)
-            goto done;
+        }
+        if (r < 0) {
+          cerr << "rbd: failed to skip first diff data" << std::endl;
+          goto done;
         }
       }
       f_off += delta;
@@ -2098,6 +2258,84 @@ done:
   return r;
 }
 
+static int do_metadata_list(librbd::Image& image, Formatter *f)
+{
+  map<string, bufferlist> pairs;
+  int r;
+  TextTable tbl;
+
+  r = image.metadata_list("", 0, &pairs);
+  if (r < 0) {
+    cerr << "failed to list metadata of image : " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  if (f) {
+    f->open_object_section("metadatas");
+  } else {
+    tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  if (!pairs.empty()) {
+    bool one = (pairs.size() == 1);
+
+    if (!f) {
+      cout << "There " << (one ? "is " : "are ") << pairs.size()
+           << " metadata" << (one ? "" : "s") << " on this image.\n";
+    }
+
+    for (map<string, bufferlist>::iterator it = pairs.begin();
+         it != pairs.end(); ++it) {
+      string val(it->second.c_str(), it->second.length());
+      if (f) {
+        f->dump_string(it->first.c_str(), val.c_str());
+      } else {
+        tbl << it->first << val.c_str() << TextTable::endrow;
+      }
+    }
+    if (!f)
+      cout << tbl;
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(cout);
+  }
+  return 0;
+}
+
+static int do_metadata_set(librbd::Image& image, const char *key,
+                          const char *value)
+{
+  int r = image.metadata_set(key, value);
+  if (r < 0) {
+    cerr << "failed to set metadata " << key << " of image : " << cpp_strerror(r) << std::endl;
+  }
+  return r;
+}
+
+static int do_metadata_remove(librbd::Image& image, const char *key)
+{
+  int r = image.metadata_remove(key);
+  if (r < 0) {
+    cerr << "failed to remove metadata " << key << " of image : " << cpp_strerror(r) << std::endl;
+  }
+  return r;
+}
+
+static int do_metadata_get(librbd::Image& image, const char *key)
+{
+  string s;
+  int r = image.metadata_get(key, &s);
+  if (r < 0) {
+    cerr << "failed to get metadata " << key << " of image : " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  cout << s << std::endl;
+  return r;
+}
+
 static int do_copy(librbd::Image &src, librados::IoCtx& dest_pp,
 		   const char *destname)
 {
@@ -2255,6 +2493,18 @@ static int do_show_status(librados::IoCtx &io_ctx, librbd::Image &image,
   return 0;
 }
 
+static int do_object_map_rebuild(librbd::Image &image)
+{
+  MyProgressContext pc("Object Map Rebuild");
+  int r = image.rebuild_object_map(pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
 static int do_kernel_map(const char *poolname, const char *imgname,
 			 const char *snapname)
 {
@@ -2267,18 +2517,19 @@ static int do_kernel_map(const char *poolname, const char *imgname,
   if (r < 0)
     return r;
 
-  for (map<string, string>::const_iterator it = map_options.begin();
-       it != map_options.end();
-       ++it) {
+  for (map<string, string>::iterator it = map_options.begin();
+       it != map_options.end(); ) {
     // for compatibility with < 3.7 kernels, assume that rw is on by
     // default and omit it even if it was specified by the user
     // (see ceph.git commit fb0f1986449b)
-    if (it->first == "rw" && it->second == "rw")
-      continue;
-
-    if (it != map_options.begin())
-      oss << ",";
-    oss << it->second;
+    if (it->first == "rw" && it->second == "rw") {
+      map_options.erase(it++);
+    } else {
+      if (it != map_options.begin())
+        oss << ",";
+      oss << it->second;
+      ++it;
+    }
   }
 
   r = krbd_map(krbd, poolname, imgname, snapname, oss.str().c_str(), &devnode);
@@ -2308,7 +2559,8 @@ static int do_kernel_showmapped(Formatter *f)
   return r;
 }
 
-static int do_kernel_unmap(const char *dev)
+static int do_kernel_unmap(const char *dev, const char *poolname,
+                           const char *imgname, const char *snapname)
 {
   struct krbd_ctx *krbd;
   int r;
@@ -2317,7 +2569,10 @@ static int do_kernel_unmap(const char *dev)
   if (r < 0)
     return r;
 
-  r = krbd_unmap(krbd, dev);
+  if (dev)
+    r = krbd_unmap(krbd, dev);
+  else
+    r = krbd_unmap_by_spec(krbd, poolname, imgname, snapname);
 
   krbd_destroy(krbd);
   return r;
@@ -2356,11 +2611,6 @@ static string map_option_int_cb(const char *value_char)
 
 static void put_map_option(const string key, string val)
 {
-  map<string, string>::const_iterator it = map_options.find(key);
-  if (it != map_options.end()) {
-    cerr << "rbd: warning: redefining map option " << key << ": '"
-         << it->second << "' -> '" << val << "'" << std::endl;
-  }
   map_options[key] = val;
 }
 
@@ -2403,6 +2653,12 @@ static int parse_map_options(char *options)
       put_map_option("share", this_char);
     } else if (!strcmp(this_char, "crc") || !strcmp(this_char, "nocrc")) {
       put_map_option("crc", this_char);
+    } else if (!strcmp(this_char, "cephx_require_signatures") ||
+               !strcmp(this_char, "nocephx_require_signatures")) {
+      put_map_option("cephx_require_signatures", this_char);
+    } else if (!strcmp(this_char, "tcp_nodelay") ||
+               !strcmp(this_char, "notcp_nodelay")) {
+      put_map_option("tcp_nodelay", this_char);
     } else if (!strcmp(this_char, "mount_timeout")) {
       if (put_map_option_value("mount_timeout", value_char, map_option_int_cb))
         return 1;
@@ -2414,6 +2670,9 @@ static int parse_map_options(char *options)
         return 1;
     } else if (!strcmp(this_char, "rw") || !strcmp(this_char, "ro")) {
       put_map_option("rw", this_char);
+    } else if (!strcmp(this_char, "queue_depth")) {
+      if (put_map_option_value("queue_depth", value_char, map_option_int_cb))
+        return 1;
     } else {
       cerr << "rbd: unknown map option '" << this_char << "'" << std::endl;
       return 1;
@@ -2423,6 +2682,207 @@ static int parse_map_options(char *options)
   return 0;
 }
 
+static int disk_usage_callback(uint64_t offset, size_t len, int exists,
+                               void *arg) {
+  uint64_t *used_size = reinterpret_cast<uint64_t *>(arg);
+  if (exists) {
+    (*used_size) += len;
+  }
+  return 0;
+}
+
+static int compute_image_disk_usage(const std::string& name,
+                                    const std::string& snap_name,
+                                    const std::string& from_snap_name,
+                                    librbd::Image &image, uint64_t size,
+                                    TextTable& tbl, Formatter *f,
+                                    uint64_t *used_size) {
+  const char* from = NULL;
+  if (!from_snap_name.empty()) {
+    from = from_snap_name.c_str();
+  }
+
+  uint64_t flags;
+  int r = image.get_flags(&flags);
+  if (r < 0) {
+    cerr << "rbd: failed to retrieve image flags: " << cpp_strerror(r)
+         << std::endl;
+    return r;
+  }
+  if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
+    cerr << "warning: fast-diff map is invalid for " << name
+         << (snap_name.empty() ? "" : "@" + snap_name) << ". "
+         << "operation may be slow." << std::endl;
+  }
+
+  *used_size = 0;
+  r = image.diff_iterate2(from, 0, size, false, true,
+                          &disk_usage_callback, used_size);
+  if (r < 0) {
+    cerr << "rbd: failed to iterate diffs: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  if (f) {
+    f->open_object_section("image");
+    f->dump_string("name", name);
+    if (!snap_name.empty()) {
+      f->dump_string("snapshot", snap_name);
+    }
+    f->dump_unsigned("provisioned_size", size);
+    f->dump_unsigned("used_size" , *used_size);
+    f->close_section();
+  } else {
+    std::string full_name = name;
+    if (!snap_name.empty()) {
+      full_name += "@" + snap_name;
+    }
+    tbl << full_name
+        << stringify(si_t(size))
+        << stringify(si_t(*used_size))
+        << TextTable::endrow;
+  }
+  return 0;
+}
+
+static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
+                        const char *imgname, const char *snapname,
+                        Formatter *f) {
+  std::vector<string> names;
+  int r = rbd.list(io_ctx, names);
+  if (r == -ENOENT) {
+    r = 0;
+  } else if (r < 0) {
+    return r;
+  }
+
+  TextTable tbl;
+  if (f) {
+    f->open_object_section("stats");
+    f->open_array_section("images");
+  } else {
+    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("PROVISIONED", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
+  }
+
+  uint64_t used_size = 0;
+  uint64_t total_prov = 0;
+  uint64_t total_used = 0;
+  std::sort(names.begin(), names.end());
+  for (std::vector<string>::const_iterator name = names.begin();
+       name != names.end(); ++name) {
+    if (imgname != NULL && *name != imgname) {
+      continue;
+    }
+
+    librbd::Image image;
+    r = rbd.open_read_only(io_ctx, image, name->c_str(), NULL);
+    if (r < 0) {
+      if (r != -ENOENT) {
+        cerr << "rbd: error opening " << *name << ": " << cpp_strerror(r)
+             << std::endl;
+      }
+      continue;
+    }
+
+    uint64_t features;
+    int r = image.features(&features);
+    if (r < 0) {
+      cerr << "rbd: failed to retrieve image features: " << cpp_strerror(r)
+           << std::endl;
+      return r;
+    }
+    if ((features & RBD_FEATURE_FAST_DIFF) == 0) {
+      cerr << "warning: fast-diff map is not enabled for " << *name << ". "
+           << "operation may be slow." << std::endl;
+    }
+
+    librbd::image_info_t info;
+    if (image.stat(info, sizeof(info)) < 0) {
+      return -EINVAL;
+    }
+
+    std::vector<librbd::snap_info_t> snap_list;
+    r = image.snap_list(snap_list);
+    if (r < 0) {
+      cerr << "rbd: error opening " << *name << " snapshots: "
+           << cpp_strerror(r) << std::endl;
+      continue;
+    }
+
+    std::string last_snap_name;
+    std::sort(snap_list.begin(), snap_list.end(),
+              boost::bind(&librbd::snap_info_t::id, _1) <
+                boost::bind(&librbd::snap_info_t::id, _2));
+    for (std::vector<librbd::snap_info_t>::const_iterator snap =
+         snap_list.begin(); snap != snap_list.end(); ++snap) {
+      librbd::Image snap_image;
+      r = rbd.open_read_only(io_ctx, snap_image, name->c_str(),
+                             snap->name.c_str());
+      if (r < 0) {
+        cerr << "rbd: error opening snapshot " << *name << "@"
+             << snap->name << ": " << cpp_strerror(r) << std::endl;
+        return r;
+      }
+
+      if (imgname == NULL || (snapname != NULL && snap->name == snapname)) {
+        r = compute_image_disk_usage(*name, snap->name, last_snap_name,
+                                     snap_image, snap->size, tbl, f,
+                                     &used_size);
+        if (r < 0) {
+          return r;
+        }
+
+        if (snapname != NULL) {
+          total_prov += snap->size;
+        }
+        total_used += used_size;
+      }
+      last_snap_name = snap->name;
+    }
+
+    if (snapname == NULL) {
+      r = compute_image_disk_usage(*name, "", last_snap_name, image, info.size,
+                                   tbl, f, &used_size);
+      if (r < 0) {
+        return r;
+      }
+      total_prov += info.size;
+      total_used += used_size;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    if (imgname == NULL) {
+      f->dump_unsigned("total_provisioned_size", total_prov);
+      f->dump_unsigned("total_used_size", total_used);
+    }
+    f->close_section();
+    f->flush(cout);
+  } else {
+    if (imgname == NULL) {
+      tbl << "<TOTAL>"
+          << stringify(si_t(total_prov))
+          << stringify(si_t(total_used))
+          << TextTable::endrow;
+    }
+    cout << tbl;
+  }
+
+  return 0;
+}
+
+enum CommandType{
+  COMMAND_TYPE_NONE,
+  COMMAND_TYPE_SNAP,
+  COMMAND_TYPE_LOCK,
+  COMMAND_TYPE_METADATA,
+  COMMAND_TYPE_FEATURE,
+  COMMAND_TYPE_OBJECT_MAP
+};
+
 enum {
   OPT_NO_CMD = 0,
   OPT_LIST,
@@ -2452,19 +2912,32 @@ enum {
   OPT_MAP,
   OPT_UNMAP,
   OPT_SHOWMAPPED,
+  OPT_FEATURE_DISABLE,
+  OPT_FEATURE_ENABLE,
   OPT_LOCK_LIST,
   OPT_LOCK_ADD,
   OPT_LOCK_REMOVE,
   OPT_BENCH_WRITE,
   OPT_MERGE_DIFF,
+  OPT_METADATA_LIST,
+  OPT_METADATA_SET,
+  OPT_METADATA_GET,
+  OPT_METADATA_REMOVE,
+  OPT_OBJECT_MAP_REBUILD,
+  OPT_DISK_USAGE
 };
 
-static int get_cmd(const char *cmd, bool snapcmd, bool lockcmd)
+static int get_cmd(const char *cmd, CommandType command_type)
 {
-  if (!snapcmd && !lockcmd) {
+  switch (command_type)
+  {
+  case COMMAND_TYPE_NONE:
     if (strcmp(cmd, "ls") == 0 ||
         strcmp(cmd, "list") == 0)
       return OPT_LIST;
+    if (strcmp(cmd, "du") == 0 ||
+        strcmp(cmd, "disk-usage") == 0)
+      return OPT_DISK_USAGE;
     if (strcmp(cmd, "info") == 0)
       return OPT_INFO;
     if (strcmp(cmd, "create") == 0)
@@ -2509,7 +2982,8 @@ static int get_cmd(const char *cmd, bool snapcmd, bool lockcmd)
       return OPT_UNMAP;
     if (strcmp(cmd, "bench-write") == 0)
       return OPT_BENCH_WRITE;
-  } else if (snapcmd) {
+    break;
+  case COMMAND_TYPE_SNAP:
     if (strcmp(cmd, "create") == 0 ||
         strcmp(cmd, "add") == 0)
       return OPT_SNAP_CREATE;
@@ -2528,7 +3002,18 @@ static int get_cmd(const char *cmd, bool snapcmd, bool lockcmd)
       return OPT_SNAP_PROTECT;
     if (strcmp(cmd, "unprotect") == 0)
       return OPT_SNAP_UNPROTECT;
-  } else {
+    break;
+  case COMMAND_TYPE_METADATA:
+    if (strcmp(cmd, "list") == 0)
+      return OPT_METADATA_LIST;
+    if (strcmp(cmd, "set") == 0)
+      return OPT_METADATA_SET;
+    if (strcmp(cmd, "get") == 0)
+      return OPT_METADATA_GET;
+    if (strcmp(cmd, "remove") == 0)
+      return OPT_METADATA_REMOVE;
+    break;
+  case COMMAND_TYPE_LOCK:
     if (strcmp(cmd, "ls") == 0 ||
         strcmp(cmd, "list") == 0)
       return OPT_LOCK_LIST;
@@ -2537,6 +3022,18 @@ static int get_cmd(const char *cmd, bool snapcmd, bool lockcmd)
     if (strcmp(cmd, "remove") == 0 ||
 	strcmp(cmd, "rm") == 0)
       return OPT_LOCK_REMOVE;
+    break;
+  case COMMAND_TYPE_FEATURE:
+    if (strcmp(cmd, "disable") == 0) {
+      return OPT_FEATURE_DISABLE;
+    } else if (strcmp(cmd, "enable") == 0) {
+      return OPT_FEATURE_ENABLE;
+    }
+    break;
+  case COMMAND_TYPE_OBJECT_MAP:
+    if (strcmp(cmd, "rebuild") == 0)
+      return OPT_OBJECT_MAP_REBUILD;
+    break;
   }
 
   return OPT_NO_CMD;
@@ -2584,9 +3081,9 @@ int main(int argc, const char **argv)
   int order = 0;
   bool format_specified = false,
     output_format_specified = false;
-  int format = 1;
+  int format = 2;
 
-  uint64_t features = g_conf->rbd_default_features;
+  uint64_t features = 0;
   bool shared = false;
 
   const char *imgname = NULL, *snapname = NULL, *destname = NULL,
@@ -2594,16 +3091,20 @@ int main(int argc, const char **argv)
     *devpath = NULL, *lock_cookie = NULL, *lock_client = NULL,
     *lock_tag = NULL, *output_format = "plain",
     *fromsnapname = NULL,
-    *first_diff = NULL, *second_diff = NULL;
+    *first_diff = NULL, *second_diff = NULL, *key = NULL, *value = NULL;
+  char *cli_map_options = NULL;
+  std::vector<const char*> feature_names;
   bool lflag = false;
   int pretty_format = 0;
   long long stripe_unit = 0, stripe_count = 0;
   long long bench_io_size = 4096, bench_io_threads = 16, bench_bytes = 1 << 30;
   string bench_pattern = "seq";
+  bool diff_whole_object = false;
+  bool input_feature = false;
 
   std::string val, parse_err;
   std::ostringstream err;
-  long long sizell = 0;
+  uint64_t sizell = 0;
   std::vector<const char*>::iterator i;
   for (i = args.begin(); i != args.end(); ) {
     if (ceph_argparse_double_dash(args, i)) {
@@ -2615,6 +3116,7 @@ int main(int argc, const char **argv)
       usage();
       return 0;
     } else if (ceph_argparse_flag(args, i, "--new-format", (char*)NULL)) {
+      cerr << "rbd: --new-format is deprecated" << std::endl;
       format = 2;
       format_specified = true;
     } else if (ceph_argparse_witharg(args, i, &val, "--image-format",
@@ -2625,7 +3127,10 @@ int main(int argc, const char **argv)
 	return EXIT_FAILURE;
       }
       format_specified = true;
-      g_conf->set_val_or_die("rbd_default_format", val.c_str());
+      if (0 != g_conf->set_val("rbd_default_format", val.c_str())) {
+        cerr << "rbd: image format must be 1 or 2" << std::endl;
+        return EXIT_FAILURE;
+      }
     } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
       poolname = strdup(val.c_str());
     } else if (ceph_argparse_witharg(args, i, &val, "--dest-pool", (char*)NULL)) {
@@ -2636,38 +3141,70 @@ int main(int argc, const char **argv)
       fromsnapname = strdup(val.c_str());
     } else if (ceph_argparse_witharg(args, i, &val, "-i", "--image", (char*)NULL)) {
       imgname = strdup(val.c_str());
-    } else if (ceph_argparse_withlonglong(args, i, &sizell, &err, "-s", "--size", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &val, err, "-s", "--size", (char*)NULL)) {
       if (!err.str().empty()) {
-	cerr << "rbd: " << err.str() << std::endl;
-	return EXIT_FAILURE;
+        cerr << "rbd: " << err.str() << std::endl;
+        return EXIT_FAILURE;
       }
-      if (sizell < 0) {
-	cerr << "rbd: size must be >= 0" << std::endl;
-	return EXIT_FAILURE;
+      const char *sizeval = val.c_str();
+      size = strict_sistrtoll(sizeval, &parse_err);
+      if (!parse_err.empty()) {
+        cerr << "rbd: error parsing --size " << parse_err << std::endl;
+        return EXIT_FAILURE;
       }
-      size = sizell << 20;   // bytes to MB
+      //NOTE: We can remove below given three lines of code once all applications,
+      //which use this CLI will adopt B/K/M/G/T/P/E with size value 
+      sizell = atoll(sizeval);
+      if (size == sizell) 
+        size = size << 20;   // Default MB to Bytes
       size_set = true;
     } else if (ceph_argparse_flag(args, i, "-l", "--long", (char*)NULL)) {
       lflag = true;
-    } else if (ceph_argparse_withlonglong(args, i, &stripe_unit, &err, "--stripe-unit", (char*)NULL)) {
-    } else if (ceph_argparse_withlonglong(args, i, &stripe_count, &err, "--stripe-count", (char*)NULL)) {
-    } else if (ceph_argparse_withint(args, i, &order, &err, "--order", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &val, err, "--stripe-unit", (char*)NULL)) {
+      if (!err.str().empty()) {
+        cerr << "rbd: " << err.str() << std::endl;
+        return EXIT_FAILURE;
+      }
+      const char *stripeval = val.c_str();
+      stripe_unit = strict_sistrtoll(stripeval, &parse_err);
+      if (!parse_err.empty()) {
+        cerr << "rbd: error parsing --stripe-unit " << parse_err << std::endl;
+        return EXIT_FAILURE;
+      }
+    } else if (ceph_argparse_witharg(args, i, &stripe_count, err, "--stripe-count", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &order, err, "--order", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << "rbd: " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withlonglong(args, i, &bench_io_size, &err, "--io-size", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &val, err, "--io-size", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << "rbd: " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
+      const char *iosval = val.c_str();
+      bench_io_size = strict_sistrtoll(iosval, &parse_err);
+      if (!parse_err.empty()) {
+        cerr << "rbd: error parsing --io-size " << parse_err << std::endl;
+        return EXIT_FAILURE;
+      }
       if (bench_io_size == 0) {
 	cerr << "rbd: io-size must be > 0" << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withlonglong(args, i, &bench_io_threads, &err, "--io-threads", (char*)NULL)) {
-    } else if (ceph_argparse_withlonglong(args, i, &bench_bytes, &err, "--io-total", (char*)NULL)) {
-    } else if (ceph_argparse_witharg(args, i, &bench_pattern, &err, "--io-pattern", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &bench_io_threads, err, "--io-threads", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &val, err, "--io-total", (char*)NULL)) {
+      if (!err.str().empty()) {
+       cerr << "rbd: " << err.str() << std::endl;
+       return EXIT_FAILURE;
+      }
+      const char *iotval = val.c_str();
+      bench_bytes = strict_sistrtoll(iotval, &parse_err);
+      if (!parse_err.empty()) {
+        cerr << "rbd: error parsing --io-total " << parse_err << std::endl;
+        return EXIT_FAILURE;
+      }
+    } else if (ceph_argparse_witharg(args, i, &bench_pattern, "--io-pattern", (char*)NULL)) {
     } else if (ceph_argparse_witharg(args, i, &val, "--path", (char*)NULL)) {
       path = strdup(val.c_str());
     } else if (ceph_argparse_witharg(args, i, &val, "--dest", (char*)NULL)) {
@@ -2679,11 +3216,7 @@ int main(int argc, const char **argv)
     } else if (ceph_argparse_flag(args, i, "--no-settle", (char *)NULL)) {
       cerr << "rbd: --no-settle is deprecated" << std::endl;
     } else if (ceph_argparse_witharg(args, i, &val, "-o", "--options", (char*)NULL)) {
-      char *map_options = strdup(val.c_str());
-      if (parse_map_options(map_options)) {
-        cerr << "rbd: couldn't parse map options" << std::endl;
-        return EXIT_FAILURE;
-      }
+      cli_map_options = strdup(val.c_str());
     } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
       // --read-only is equivalent to -o ro
       put_map_option("rw", "ro");
@@ -2691,8 +3224,19 @@ int main(int argc, const char **argv)
       progress = false;
     } else if (ceph_argparse_flag(args, i , "--allow-shrink", (char *)NULL)) {
       resize_allow_shrink = true;
+    } else if (ceph_argparse_witharg(args, i, &val, "--image-feature", (char *)NULL)) {
+      uint64_t feature;
+      input_feature = true;
+      if (!decode_feature(val.c_str(), &feature)) {
+        cerr << "rbd: invalid image feature: " << val << std::endl;
+        return EXIT_FAILURE;
+      }
+      features |= feature;
     } else if (ceph_argparse_witharg(args, i, &val, "--image-features", (char *)NULL)) {
+      cerr << "rbd: using --image-features for specifying the rbd image format is"
+	   << " deprecated, use --image-feature instead" << std::endl;
       features = strict_strtol(val.c_str(), 10, &parse_err);
+      input_feature = true;
       if (!parse_err.empty()) {
 	cerr << "rbd: error parsing --image-features: " << parse_err
              << std::endl;
@@ -2713,12 +3257,20 @@ int main(int argc, const char **argv)
 	output_format = strdup(val.c_str());
 	output_format_specified = true;
       }
+    } else if (ceph_argparse_flag(args, i, "--whole-object", (char *)NULL)) {
+      diff_whole_object = true;
     } else if (ceph_argparse_binary_flag(args, i, &pretty_format, NULL, "--pretty-format", (char*)NULL)) {
     } else {
       ++i;
     }
   }
 
+  if (features != 0 && !format_specified) {
+    format = 2;
+    format_specified = true;
+  } else if (features == 0) {
+    features = g_conf->rbd_default_features;
+  }
   if (shared) {
     features &= ~(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_OBJECT_MAP);
   }
@@ -2731,29 +3283,31 @@ int main(int argc, const char **argv)
 
   common_init_finish(g_ceph_context);
 
+  std::map<std::string, CommandType> command_map = boost::assign::map_list_of
+    ("snap", COMMAND_TYPE_SNAP)
+    ("lock", COMMAND_TYPE_LOCK)
+    ("image-meta", COMMAND_TYPE_METADATA)
+    ("feature", COMMAND_TYPE_FEATURE)
+    ("object-map", COMMAND_TYPE_OBJECT_MAP);
+
   i = args.begin();
   if (i == args.end()) {
     cerr << "rbd: you must specify a command." << std::endl;
     return EXIT_FAILURE;
-  } else if (strcmp(*i, "snap") == 0) {
+  } else if (command_map.count(*i) > 0) {
+    std::string command(*i);
     i = args.erase(i);
     if (i == args.end()) {
-      cerr << "rbd: which snap command do you want?" << std::endl;
+      cerr << "rbd: which " << command << " command do you want?" << std::endl;
       return EXIT_FAILURE;
     }
-    opt_cmd = get_cmd(*i, true, false);
-  } else if (strcmp(*i, "lock") == 0) {
-    i = args.erase(i);
-    if (i == args.end()) {
-      cerr << "rbd: which lock command do you want?" << std::endl;
-      return EXIT_FAILURE;
-    }
-    opt_cmd = get_cmd(*i, false, true);
+    opt_cmd = get_cmd(*i, command_map[command]);
   } else {
-    opt_cmd = get_cmd(*i, false, false);
+    opt_cmd = get_cmd(*i, COMMAND_TYPE_NONE);
   }
   if (opt_cmd == OPT_NO_CMD) {
-    cerr << "rbd: error parsing command '" << *i << "'; -h or --help for usage" << std::endl;
+    cerr << "rbd: error parsing command '" << *i << "'; -h or --help for usage"
+         << std::endl;
     return EXIT_FAILURE;
   }
 
@@ -2789,14 +3343,15 @@ if (!set_conf_param(v, p1, p2, p3)) { \
       case OPT_WATCH:
       case OPT_STATUS:
       case OPT_MAP:
+      case OPT_UNMAP:
       case OPT_BENCH_WRITE:
       case OPT_LOCK_LIST:
+      case OPT_METADATA_LIST:
       case OPT_DIFF:
+      case OPT_OBJECT_MAP_REBUILD:
+      case OPT_DISK_USAGE:
 	SET_CONF_PARAM(v, &imgname, NULL, NULL);
 	break;
-      case OPT_UNMAP:
-	SET_CONF_PARAM(v, &devpath, NULL, NULL);
-	break;
       case OPT_EXPORT:
       case OPT_EXPORT_DIFF:
 	SET_CONF_PARAM(v, &imgname, &path, NULL);
@@ -2823,8 +3378,23 @@ if (!set_conf_param(v, p1, p2, p3)) { \
 	SET_CONF_PARAM(v, &imgname, &lock_cookie, NULL);
 	break;
       case OPT_LOCK_REMOVE:
-	SET_CONF_PARAM(v, &imgname, &lock_client, &lock_cookie);
+	SET_CONF_PARAM(v, &imgname, &lock_cookie, &lock_client);
+	break;
+      case OPT_METADATA_SET:
+	SET_CONF_PARAM(v, &imgname, &key, &value);
 	break;
+      case OPT_METADATA_GET:
+      case OPT_METADATA_REMOVE:
+	SET_CONF_PARAM(v, &imgname, &key, NULL);
+	break;
+      case OPT_FEATURE_DISABLE:
+      case OPT_FEATURE_ENABLE:
+        if (imgname == NULL) {
+          imgname = v;
+        } else {
+          feature_names.push_back(v);
+        }
+        break;
     default:
 	assert(0);
 	break;
@@ -2850,6 +3420,12 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     return EXIT_FAILURE;
   }
 
+  if (opt_cmd != OPT_LOCK_ADD && lock_tag) {
+    cerr << "rbd: only the lock add command uses the --shared option"
+	 << std::endl;
+    return EXIT_FAILURE;
+  }
+
   if (pretty_format && !strcmp(output_format, "plain")) {
     cerr << "rbd: --pretty-format only works when --format is json or xml"
 	 << std::endl;
@@ -2861,7 +3437,8 @@ if (!set_conf_param(v, p1, p2, p3)) { \
       opt_cmd != OPT_INFO && opt_cmd != OPT_LIST &&
       opt_cmd != OPT_SNAP_LIST && opt_cmd != OPT_LOCK_LIST &&
       opt_cmd != OPT_CHILDREN && opt_cmd != OPT_DIFF &&
-      opt_cmd != OPT_STATUS) {
+      opt_cmd != OPT_METADATA_LIST && opt_cmd != OPT_STATUS &&
+      opt_cmd != OPT_DISK_USAGE) {
     cerr << "rbd: command doesn't use output formatting"
 	 << std::endl;
     return EXIT_FAILURE;
@@ -2876,11 +3453,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     }
   }
 
-  if (opt_cmd == OPT_EXPORT && !imgname) {
-    cerr << "rbd: image name was not specified" << std::endl;
-    return EXIT_FAILURE;
-  }
-
   if ((opt_cmd == OPT_IMPORT || opt_cmd == OPT_IMPORT_DIFF) && !path) {
     cerr << "rbd: path was not specified" << std::endl;
     return EXIT_FAILURE;
@@ -2893,31 +3465,39 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     imgname = NULL;
   }
 
-  if (opt_cmd != OPT_LOCK_ADD && lock_tag) {
-    cerr << "rbd: only the lock add command uses the --shared option"
-	 << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if ((opt_cmd == OPT_LOCK_ADD || opt_cmd == OPT_LOCK_REMOVE) &&
-      !lock_cookie) {
-    cerr << "rbd: lock id was not specified" << std::endl;
-    return EXIT_FAILURE;
-  }
-
   if (opt_cmd != OPT_LIST &&
       opt_cmd != OPT_IMPORT &&
-      opt_cmd != OPT_IMPORT_DIFF &&
-      opt_cmd != OPT_UNMAP &&
+      opt_cmd != OPT_UNMAP && /* needs imgname but handled below */
       opt_cmd != OPT_SHOWMAPPED &&
-      opt_cmd != OPT_MERGE_DIFF && !imgname) {
+      opt_cmd != OPT_MERGE_DIFF &&
+      opt_cmd != OPT_DISK_USAGE && !imgname) {
     cerr << "rbd: image name was not specified" << std::endl;
     return EXIT_FAILURE;
   }
 
-  if (opt_cmd == OPT_UNMAP && !devpath) {
-    cerr << "rbd: device path was not specified" << std::endl;
-    return EXIT_FAILURE;
+  if (opt_cmd == OPT_MAP) {
+    char *default_map_options = strdup(g_conf->rbd_default_map_options.c_str());
+
+    // parse default options first so they can be overwritten by cli options
+    if (parse_map_options(default_map_options)) {
+      cerr << "rbd: couldn't parse default map options" << std::endl;
+      return EXIT_FAILURE;
+    }
+    if (cli_map_options && parse_map_options(cli_map_options)) {
+      cerr << "rbd: couldn't parse map options" << std::endl;
+      return EXIT_FAILURE;
+    }
+  }
+  if (opt_cmd == OPT_UNMAP) {
+    if (!imgname) {
+      cerr << "rbd: unmap requires either image name or device path" << std::endl;
+      return EXIT_FAILURE;
+    }
+
+    if (strncmp(imgname, "/dev/", 5) == 0) {
+      devpath = imgname;
+      imgname = NULL;
+    }
   }
 
   // do this unconditionally so we can parse pool/image at snapshot into
@@ -2926,10 +3506,12 @@ if (!set_conf_param(v, p1, p2, p3)) { \
 		      (char **)&imgname, (char **)&snapname);
   if (snapname && opt_cmd != OPT_SNAP_CREATE && opt_cmd != OPT_SNAP_ROLLBACK &&
       opt_cmd != OPT_SNAP_REMOVE && opt_cmd != OPT_INFO &&
-      opt_cmd != OPT_EXPORT && opt_cmd != OPT_EXPORT_DIFF && opt_cmd != OPT_DIFF && opt_cmd != OPT_COPY &&
-      opt_cmd != OPT_MAP && opt_cmd != OPT_CLONE &&
+      opt_cmd != OPT_EXPORT && opt_cmd != OPT_EXPORT_DIFF &&
+      opt_cmd != OPT_DIFF && opt_cmd != OPT_COPY &&
+      opt_cmd != OPT_MAP && opt_cmd != OPT_UNMAP && opt_cmd != OPT_CLONE &&
       opt_cmd != OPT_SNAP_PROTECT && opt_cmd != OPT_SNAP_UNPROTECT &&
-      opt_cmd != OPT_CHILDREN) {
+      opt_cmd != OPT_CHILDREN && opt_cmd != OPT_OBJECT_MAP_REBUILD &&
+      opt_cmd != OPT_DISK_USAGE) {
     cerr << "rbd: snapname specified for a command that doesn't use it"
 	 << std::endl;
     return EXIT_FAILURE;
@@ -2944,6 +3526,12 @@ if (!set_conf_param(v, p1, p2, p3)) { \
 
   set_pool_image_name(destname, (char **)&dest_poolname,
 		      (char **)&destname, (char **)&dest_snapname);
+  if (dest_snapname) {
+    // no command uses dest_snapname
+    cerr << "rbd: destination snapname specified for a command that doesn't use it"
+         << std::endl;
+    return EXIT_FAILURE;
+  }
 
   if (opt_cmd == OPT_IMPORT) {
     if (poolname && dest_poolname) {
@@ -2973,25 +3561,23 @@ if (!set_conf_param(v, p1, p2, p3)) { \
       cerr << "rbd: second diff was not specified" << std::endl;
       return EXIT_FAILURE;
     }
-    if (!path) {
+  }
+  if ((opt_cmd == OPT_EXPORT || opt_cmd == OPT_EXPORT_DIFF ||
+      opt_cmd == OPT_MERGE_DIFF) && !path) {
+    if (opt_cmd == OPT_EXPORT) {
+      path = imgname;
+    } else {
       cerr << "rbd: path was not specified" << std::endl;
       return EXIT_FAILURE;
     }
   }
-  if (opt_cmd == OPT_EXPORT && !path)
-    path = imgname;
 
   if ((opt_cmd == OPT_COPY || opt_cmd == OPT_CLONE || opt_cmd == OPT_RENAME) &&
-      !destname ) {
+      ((!destname) || (destname[0] == '\0')) ) {
     cerr << "rbd: destination image name was not specified" << std::endl;
     return EXIT_FAILURE;
   }
 
-  if ((opt_cmd == OPT_CLONE) && dest_snapname) {
-    cerr << "rbd: cannot clone to a snapshot" << std::endl;
-    return EXIT_FAILURE;
-  }
-
   if ((opt_cmd == OPT_CLONE) && size) {
     cerr << "rbd: clone must begin at size of parent" << std::endl;
     return EXIT_FAILURE;
@@ -3004,6 +3590,47 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     return EXIT_FAILURE;
   }
 
+  if (opt_cmd == OPT_LOCK_ADD || opt_cmd == OPT_LOCK_REMOVE) {
+    if (!lock_cookie) {
+      cerr << "rbd: lock id was not specified" << std::endl;
+      return EXIT_FAILURE;
+    }
+    if (opt_cmd == OPT_LOCK_REMOVE && !lock_client) {
+      cerr << "rbd: locker was not specified" << std::endl;
+      return EXIT_FAILURE;
+    }
+  }
+
+  if (opt_cmd == OPT_FEATURE_DISABLE || opt_cmd == OPT_FEATURE_ENABLE) {
+    if (feature_names.empty()) {
+      cerr << "rbd: at least one feature name must be specified" << std::endl;
+      return EXIT_FAILURE;
+    }
+
+    features = 0;
+    for (size_t i = 0; i < feature_names.size(); ++i) {
+      uint64_t feature;
+      if (!decode_feature(feature_names[i], &feature)) {
+        cerr << "rbd: invalid feature name specified: " << feature_names[i]
+             << std::endl;
+        return EXIT_FAILURE;
+      }
+      features |= feature;
+    }
+  }
+
+  if (opt_cmd == OPT_METADATA_GET || opt_cmd == OPT_METADATA_REMOVE ||
+      opt_cmd == OPT_METADATA_SET) {
+    if (!key) {
+      cerr << "rbd: metadata key was not specified" << std::endl;
+      return EXIT_FAILURE;
+    }
+    if (opt_cmd == OPT_METADATA_SET && !value) {
+      cerr << "rbd: metadata value was not specified" << std::endl;
+      return EXIT_FAILURE;
+    }
+  }
+
   bool talk_to_cluster = (opt_cmd != OPT_MAP &&
 			  opt_cmd != OPT_UNMAP &&
 			  opt_cmd != OPT_SHOWMAPPED &&
@@ -3038,14 +3665,18 @@ if (!set_conf_param(v, p1, p2, p3)) { \
        opt_cmd == OPT_INFO || opt_cmd == OPT_SNAP_LIST ||
        opt_cmd == OPT_IMPORT_DIFF ||
        opt_cmd == OPT_EXPORT || opt_cmd == OPT_EXPORT_DIFF || opt_cmd == OPT_COPY ||
-       opt_cmd == OPT_DIFF ||
+       opt_cmd == OPT_DIFF || opt_cmd == OPT_STATUS ||
        opt_cmd == OPT_CHILDREN || opt_cmd == OPT_LOCK_LIST ||
-       opt_cmd == OPT_STATUS)) {
+       opt_cmd == OPT_METADATA_SET || opt_cmd == OPT_METADATA_LIST ||
+       opt_cmd == OPT_METADATA_REMOVE || opt_cmd == OPT_METADATA_GET ||
+       opt_cmd == OPT_FEATURE_DISABLE || opt_cmd == OPT_FEATURE_ENABLE ||
+       opt_cmd == OPT_OBJECT_MAP_REBUILD || opt_cmd == OPT_DISK_USAGE)) {
 
     if (opt_cmd == OPT_INFO || opt_cmd == OPT_SNAP_LIST ||
 	opt_cmd == OPT_EXPORT || opt_cmd == OPT_EXPORT || opt_cmd == OPT_COPY ||
 	opt_cmd == OPT_CHILDREN || opt_cmd == OPT_LOCK_LIST ||
-        opt_cmd == OPT_STATUS || opt_cmd == OPT_WATCH) {
+        opt_cmd == OPT_METADATA_LIST || opt_cmd == OPT_STATUS ||
+        opt_cmd == OPT_WATCH || opt_cmd == OPT_DISK_USAGE) {
       r = rbd.open_read_only(io_ctx, image, imgname, NULL);
     } else {
       r = rbd.open(io_ctx, image, imgname);
@@ -3063,7 +3694,9 @@ if (!set_conf_param(v, p1, p2, p3)) { \
        opt_cmd == OPT_EXPORT_DIFF ||
        opt_cmd == OPT_DIFF ||
        opt_cmd == OPT_COPY ||
-       opt_cmd == OPT_CHILDREN)) {
+       opt_cmd == OPT_CHILDREN ||
+       opt_cmd == OPT_OBJECT_MAP_REBUILD ||
+       opt_cmd == OPT_DISK_USAGE)) {
     r = image.snap_set(snapname);
     if (r < 0) {
       cerr << "rbd: error setting snapshot context: " << cpp_strerror(-r)
@@ -3083,17 +3716,12 @@ if (!set_conf_param(v, p1, p2, p3)) { \
 
   if (opt_cmd == OPT_CREATE || opt_cmd == OPT_RESIZE) {
     if (!size_set) {
-      cerr << "rbd: must specify --size <MB>" << std::endl;
+      cerr << "rbd: must specify --size <M/G/T>" << std::endl;
       return EINVAL;
     }
   }
 
   if (opt_cmd == OPT_CREATE || opt_cmd == OPT_CLONE || opt_cmd == OPT_IMPORT) {
-    if (order && (order < 12 || order > 25)) {
-      cerr << "rbd: order must be between 12 (4 KB) and 25 (32 MB)"
-	   << std::endl;
-      return EINVAL;
-    }
     if ((stripe_unit && !stripe_count) || (!stripe_unit && stripe_count)) {
       cerr << "must specify both (or neither) of stripe-unit and stripe-count"
 	   << std::endl;
@@ -3119,6 +3747,10 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_CREATE:
+    if (input_feature && (format == 1)){
+      cerr << "feature not allowed with format 1; use --image-format 2" << std::endl;
+      return EINVAL;
+    }
     r = do_create(rbd, io_ctx, imgname, size, &order, format, features,
 		  stripe_unit, stripe_count);
     if (r < 0) {
@@ -3202,10 +3834,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_SNAP_LIST:
-    if (!imgname) {
-      cerr << "rbd: snap list requires an image parameter" << std::endl;
-      return EXIT_FAILURE;
-    }
     r = do_list_snaps(image, formatter.get());
     if (r < 0) {
       cerr << "rbd: failed to list snapshots: " << cpp_strerror(-r)
@@ -3215,10 +3843,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_SNAP_CREATE:
-    if (!imgname || !snapname) {
-      cerr << "rbd: snap create requires image and snapname" << std::endl;
-      return EINVAL;
-    }
     r = do_add_snap(image, snapname);
     if (r < 0) {
       cerr << "rbd: failed to create snapshot: " << cpp_strerror(-r)
@@ -3228,10 +3852,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_SNAP_ROLLBACK:
-    if (!imgname) {
-      cerr << "rbd: snap rollback requires image name" << std::endl;
-      return EINVAL;
-    }
     r = do_rollback_snap(image, snapname);
     if (r < 0) {
       cerr << "rbd: rollback failed: " << cpp_strerror(-r) << std::endl;
@@ -3240,10 +3860,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_SNAP_REMOVE:
-    if (!imgname) {
-      cerr << "rbd: snap remove requires image name" << std::endl;
-      return EINVAL;
-    }
     r = do_remove_snap(image, snapname);
     if (r < 0) {
       if (r == -EBUSY) {
@@ -3258,22 +3874,16 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_SNAP_PURGE:
-    if (!imgname) {
-      cerr << "rbd: snap purge requires image name" << std::endl;
-      return EINVAL;
-    }
     r = do_purge_snaps(image);
     if (r < 0) {
-      cerr << "rbd: removing snaps failed: " << cpp_strerror(-r) << std::endl;
+      if (r != -EBUSY) {
+        cerr << "rbd: removing snaps failed: " << cpp_strerror(-r) << std::endl;
+      }
       return -r;
     }
     break;
 
   case OPT_SNAP_PROTECT:
-    if (!imgname) {
-      cerr << "rbd: snap protect requires image name" << std::endl;
-      return EINVAL;
-    }
     r = do_protect_snap(image, snapname);
     if (r < 0) {
       cerr << "rbd: protecting snap failed: " << cpp_strerror(-r) << std::endl;
@@ -3282,10 +3892,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_SNAP_UNPROTECT:
-    if (!imgname) {
-      cerr << "rbd: snap unprotect requires image name" << std::endl;
-      return EINVAL;
-    }
     r = do_unprotect_snap(image, snapname);
     if (r < 0) {
       cerr << "rbd: unprotecting snap failed: " << cpp_strerror(-r)
@@ -3303,10 +3909,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_EXPORT:
-    if (!path) {
-      cerr << "rbd: export requires pathname" << std::endl;
-      return EINVAL;
-    }
     r = do_export(image, path);
     if (r < 0) {
       cerr << "rbd: export error: " << cpp_strerror(-r) << std::endl;
@@ -3315,7 +3917,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_DIFF:
-    r = do_diff(image, fromsnapname, formatter.get());
+    r = do_diff(image, fromsnapname, diff_whole_object, formatter.get());
     if (r < 0) {
       cerr << "rbd: diff error: " << cpp_strerror(-r) << std::endl;
       return -r;
@@ -3323,11 +3925,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_EXPORT_DIFF:
-    if (!path) {
-      cerr << "rbd: export-diff requires pathname" << std::endl;
-      return EINVAL;
-    }
-    r = do_export_diff(image, fromsnapname, snapname, path);
+    r = do_export_diff(image, fromsnapname, snapname, diff_whole_object, path);
     if (r < 0) {
       cerr << "rbd: export-diff error: " << cpp_strerror(-r) << std::endl;
       return -r;
@@ -3343,10 +3941,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_IMPORT:
-    if (!path) {
-      cerr << "rbd: import requires pathname" << std::endl;
-      return EINVAL;
-    }
     r = do_import(rbd, dest_io_ctx, destname, &order, path,
 		  format, features, size, stripe_unit, stripe_count);
     if (r < 0) {
@@ -3356,7 +3950,6 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_IMPORT_DIFF:
-    assert(path);
     r = do_import_diff(image, path);
     if (r < 0) {
       cerr << "rbd: import-diff failed: " << cpp_strerror(-r) << std::endl;
@@ -3397,7 +3990,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_UNMAP:
-    r = do_kernel_unmap(devpath);
+    r = do_kernel_unmap(devpath, poolname, imgname, snapname);
     if (r < 0) {
       cerr << "rbd: unmap failed: " << cpp_strerror(-r) << std::endl;
       return -r;
@@ -3438,7 +4031,7 @@ if (!set_conf_param(v, p1, p2, p3)) { \
     break;
 
   case OPT_LOCK_REMOVE:
-    r = do_lock_remove(image, lock_cookie, lock_client);
+    r = do_lock_remove(image, lock_client, lock_cookie);
     if (r < 0) {
       cerr << "rbd: releasing lock failed: " << cpp_strerror(r) << std::endl;
       return -r;
@@ -3452,7 +4045,71 @@ if (!set_conf_param(v, p1, p2, p3)) { \
       return -r;
     }
     break;
+
+  case OPT_METADATA_LIST:
+    r = do_metadata_list(image, formatter.get());
+    if (r < 0) {
+      cerr << "rbd: listing metadata failed: " << cpp_strerror(r) << std::endl;
+      return -r;
+    }
+    break;
+
+  case OPT_METADATA_SET:
+    r = do_metadata_set(image, key, value);
+    if (r < 0) {
+      cerr << "rbd: setting metadata failed: " << cpp_strerror(r) << std::endl;
+      return -r;
+    }
+    break;
+
+  case OPT_METADATA_REMOVE:
+    r = do_metadata_remove(image, key);
+    if (r < 0) {
+      cerr << "rbd: removing metadata failed: " << cpp_strerror(r) << std::endl;
+      return -r;
+    }
+    break;
+
+  case OPT_METADATA_GET:
+    r = do_metadata_get(image, key);
+    if (r < 0) {
+      cerr << "rbd: getting metadata failed: " << cpp_strerror(r) << std::endl;
+      return -r;
+    }
+    break;
+
+  case OPT_FEATURE_DISABLE:
+  case OPT_FEATURE_ENABLE:
+    r = image.update_features(features, opt_cmd == OPT_FEATURE_ENABLE);
+    if (r < 0) {
+      cerr << "rbd: failed to update image features: " << cpp_strerror(r)
+           << std::endl;
+      return -r;
+    }
+    break;
+
+  case OPT_OBJECT_MAP_REBUILD:
+    r = do_object_map_rebuild(image);
+    if (r < 0) {
+      cerr << "rbd: rebuilding object map failed: " << cpp_strerror(r)
+           << std::endl;
+      return -r;
+    }
+    break;
+
+  case OPT_DISK_USAGE:
+    r = do_disk_usage(rbd, io_ctx, imgname, snapname, formatter.get());
+    if (r < 0) {
+      cerr << "du failed: " << cpp_strerror(-r) << std::endl;
+      return -r;
+    }
+    break;
   }
 
+  r = image.close();
+  if (r < 0) {
+    cerr << "rbd: error while closing image: " << cpp_strerror(-r) << std::endl;
+    return -r;
+  }
   return 0;
 }
diff --git a/src/rbd_fuse/rbd-fuse.cc b/src/rbd_fuse/rbd-fuse.cc
index c53e5b6..8abfe45 100644
--- a/src/rbd_fuse/rbd-fuse.cc
+++ b/src/rbd_fuse/rbd-fuse.cc
@@ -19,6 +19,11 @@
 #include <getopt.h>
 #include <assert.h>
 
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "include/compat.h"
 #include "include/rbd/librbd.h"
 
 static int gotrados = 0;
@@ -349,9 +354,9 @@ static int rbdfs_write(const char *path, const char *buf, size_t size,
 	while (size > 0) {
 		ssize_t ret;
 
-		if (offset + size > rbdsize(fi->fh)) {
+		if ((size_t)(offset + size) > rbdsize(fi->fh)) {
 			int r;
-			fprintf(stderr, "rbdfs_write resizing %s to 0x%"PRIxMAX"\n",
+			fprintf(stderr, "rbdfs_write resizing %s to 0x%" PRIxMAX "\n",
 				path, offset+size);
 			r = rbd_resize(rbd->image, offset+size);
 			if (r < 0)
@@ -562,7 +567,8 @@ rbdfs_truncate(const char *path, off_t size)
 		return -ENOENT;
 
 	rbd = &opentbl[fd];
-	fprintf(stderr, "truncate %s to %"PRIdMAX" (0x%"PRIxMAX")\n", path, size, size);
+	fprintf(stderr, "truncate %s to %" PRIdMAX " (0x%" PRIxMAX ")\n",
+          path, size, size);
 	r = rbd_resize(rbd->image, size);
 	if (r < 0)
 		return r;
@@ -596,7 +602,12 @@ struct rbdfuse_attr {
 
 int
 rbdfs_setxattr(const char *path, const char *name, const char *value,
-		 size_t size, int flags)
+	       size_t size,
+	       int flags
+#if defined(DARWIN)
+	       ,uint32_t pos
+#endif
+    )
 {
 	struct rbdfuse_attr *ap;
 	if (strcmp(path, "/") != 0)
@@ -605,7 +616,7 @@ rbdfs_setxattr(const char *path, const char *name, const char *value,
 	for (ap = attrs; ap->attrname != NULL; ap++) {
 		if (strcmp(name, ap->attrname) == 0) {
 			*ap->attrvalp = strtoull(value, NULL, 0);
-			fprintf(stderr, "rbd-fuse: %s set to 0x%"PRIx64"\n",
+			fprintf(stderr, "rbd-fuse: %s set to 0x%" PRIx64 "\n",
 				ap->attrname, *ap->attrvalp);
 			return 0;
 		}
@@ -615,7 +626,11 @@ rbdfs_setxattr(const char *path, const char *name, const char *value,
 
 int
 rbdfs_getxattr(const char *path, const char *name, char *value,
-		 size_t size)
+		 size_t size
+#if defined(DARWIN)
+	       ,uint32_t position
+#endif
+  )
 {
 	struct rbdfuse_attr *ap;
 	char buf[128];
@@ -624,7 +639,7 @@ rbdfs_getxattr(const char *path, const char *name, char *value,
 
 	for (ap = attrs; ap->attrname != NULL; ap++) {
 		if (strcmp(name, ap->attrname) == 0) {
-			sprintf(buf, "%"PRIu64, *ap->attrvalp);
+			sprintf(buf, "%" PRIu64, *ap->attrvalp);
 			if (value != NULL && size >= strlen(buf))
 				strcpy(value, buf);
 			fprintf(stderr, "rbd-fuse: get %s\n", ap->attrname);
diff --git a/src/rbd_replay/ActionTypes.cc b/src/rbd_replay/ActionTypes.cc
new file mode 100644
index 0000000..36ed3ca
--- /dev/null
+++ b/src/rbd_replay/ActionTypes.cc
@@ -0,0 +1,354 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rbd_replay/ActionTypes.h"
+#include "include/assert.h"
+#include "include/byteorder.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/variant.hpp>
+
+namespace rbd_replay {
+namespace action {
+
+namespace {
+
+bool byte_swap_required(__u8 version) {
+#if defined(CEPH_LITTLE_ENDIAN)
+  return (version == 0);
+#else
+  return false;
+#endif
+}
+
+void decode_big_endian_string(std::string &str, bufferlist::iterator &it) {
+#if defined(CEPH_LITTLE_ENDIAN)
+  uint32_t length;
+  ::decode(length, it);
+  length = swab32(length);
+  str.clear();
+  it.copy(length, str);
+#else
+  assert(false);
+#endif
+}
+
+class EncodeVisitor : public boost::static_visitor<void> {
+public:
+  EncodeVisitor(bufferlist &bl) : m_bl(bl) {
+  }
+
+  template <typename Action>
+  inline void operator()(const Action &action) const {
+    ::encode(static_cast<uint8_t>(Action::ACTION_TYPE), m_bl);
+    action.encode(m_bl);
+  }
+private:
+  bufferlist &m_bl;
+};
+
+class DecodeVisitor : public boost::static_visitor<void> {
+public:
+  DecodeVisitor(__u8 version, bufferlist::iterator &iter)
+    : m_version(version), m_iter(iter) {
+  }
+
+  template <typename Action>
+  inline void operator()(Action &action) const {
+    action.decode(m_version, m_iter);
+  }
+private:
+  __u8 m_version;
+  bufferlist::iterator &m_iter;
+};
+
+class DumpVisitor : public boost::static_visitor<void> {
+public:
+  DumpVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+  template <typename Action>
+  inline void operator()(const Action &action) const {
+    ActionType action_type = Action::ACTION_TYPE;
+    m_formatter->dump_string("action_type", stringify(action_type));
+    action.dump(m_formatter);
+  }
+private:
+  ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void Dependency::encode(bufferlist &bl) const {
+  ::encode(id, bl);
+  ::encode(time_delta, bl);
+}
+
+void Dependency::decode(bufferlist::iterator &it) {
+  decode(1, it);
+}
+
+void Dependency::decode(__u8 version, bufferlist::iterator &it) {
+  ::decode(id, it);
+  ::decode(time_delta, it);
+  if (byte_swap_required(version)) {
+    id = swab32(id);
+    time_delta = swab64(time_delta);
+  }
+}
+
+void Dependency::dump(Formatter *f) const {
+  f->dump_unsigned("id", id);
+  f->dump_unsigned("time_delta", time_delta);
+}
+
+void Dependency::generate_test_instances(std::list<Dependency *> &o) {
+  o.push_back(new Dependency());
+  o.push_back(new Dependency(1, 123456789));
+}
+
+void ActionBase::encode(bufferlist &bl) const {
+  ::encode(id, bl);
+  ::encode(thread_id, bl);
+  ::encode(dependencies, bl);
+}
+
+void ActionBase::decode(__u8 version, bufferlist::iterator &it) {
+  ::decode(id, it);
+  ::decode(thread_id, it);
+  if (version == 0) {
+    uint32_t num_successors;
+    ::decode(num_successors, it);
+
+    uint32_t num_completion_successors;
+    ::decode(num_completion_successors, it);
+  }
+
+  if (byte_swap_required(version)) {
+    id = swab32(id);
+    thread_id = swab64(thread_id);
+
+    uint32_t dep_count;
+    ::decode(dep_count, it);
+    dep_count = swab32(dep_count);
+    dependencies.resize(dep_count);
+    for (uint32_t i = 0; i < dep_count; ++i) {
+      dependencies[i].decode(0, it);
+    }
+  } else {
+    ::decode(dependencies, it);
+  }
+}
+
+void ActionBase::dump(Formatter *f) const {
+  f->dump_unsigned("id", id);
+  f->dump_unsigned("thread_id", thread_id);
+  f->open_array_section("dependencies");
+  for (size_t i = 0; i < dependencies.size(); ++i) {
+    f->open_object_section("dependency");
+    dependencies[i].dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void ImageActionBase::encode(bufferlist &bl) const {
+  ActionBase::encode(bl);
+  ::encode(imagectx_id, bl);
+}
+
+void ImageActionBase::decode(__u8 version, bufferlist::iterator &it) {
+  ActionBase::decode(version, it);
+  ::decode(imagectx_id, it);
+  if (byte_swap_required(version)) {
+    imagectx_id = swab64(imagectx_id);
+  }
+}
+
+void ImageActionBase::dump(Formatter *f) const {
+  ActionBase::dump(f);
+  f->dump_unsigned("imagectx_id", imagectx_id);
+}
+
+void IoActionBase::encode(bufferlist &bl) const {
+  ImageActionBase::encode(bl);
+  ::encode(offset, bl);
+  ::encode(length, bl);
+}
+
+void IoActionBase::decode(__u8 version, bufferlist::iterator &it) {
+  ImageActionBase::decode(version, it);
+  ::decode(offset, it);
+  ::decode(length, it);
+  if (byte_swap_required(version)) {
+    offset = swab64(offset);
+    length = swab64(length);
+  }
+}
+
+void IoActionBase::dump(Formatter *f) const {
+  ImageActionBase::dump(f);
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("length", length);
+}
+
+void OpenImageAction::encode(bufferlist &bl) const {
+  ImageActionBase::encode(bl);
+  ::encode(name, bl);
+  ::encode(snap_name, bl);
+  ::encode(read_only, bl);
+}
+
+void OpenImageAction::decode(__u8 version, bufferlist::iterator &it) {
+  ImageActionBase::decode(version, it);
+  if (byte_swap_required(version)) {
+    decode_big_endian_string(name, it);
+    decode_big_endian_string(snap_name, it);
+  } else {
+    ::decode(name, it);
+    ::decode(snap_name, it);
+  }
+  ::decode(read_only, it);
+}
+
+void OpenImageAction::dump(Formatter *f) const {
+  ImageActionBase::dump(f);
+  f->dump_string("name", name);
+  f->dump_string("snap_name", snap_name);
+  f->dump_bool("read_only", read_only);
+}
+
+void UnknownAction::encode(bufferlist &bl) const {
+  assert(false);
+}
+
+void UnknownAction::decode(__u8 version, bufferlist::iterator &it) {
+}
+
+void UnknownAction::dump(Formatter *f) const {
+}
+
+void ActionEntry::encode(bufferlist &bl) const {
+  ENCODE_START(1, 1, bl);
+  boost::apply_visitor(EncodeVisitor(bl), action);
+  ENCODE_FINISH(bl);
+}
+
+void ActionEntry::decode(bufferlist::iterator &it) {
+  DECODE_START(1, it);
+  decode(struct_v, it);
+  DECODE_FINISH(it);
+}
+
+void ActionEntry::decode_unversioned(bufferlist::iterator &it) {
+  decode(0, it);
+}
+
+void ActionEntry::decode(__u8 version, bufferlist::iterator &it) {
+  uint8_t action_type;
+  ::decode(action_type, it);
+
+  // select the correct action variant based upon the action_type
+  switch (action_type) {
+  case ACTION_TYPE_START_THREAD:
+    action = StartThreadAction();
+    break;
+  case ACTION_TYPE_STOP_THREAD:
+    action = StopThreadAction();
+    break;
+  case ACTION_TYPE_READ:
+    action = ReadAction();
+    break;
+  case ACTION_TYPE_WRITE:
+    action = WriteAction();
+    break;
+  case ACTION_TYPE_AIO_READ:
+    action = AioReadAction();
+    break;
+  case ACTION_TYPE_AIO_WRITE:
+    action = AioWriteAction();
+    break;
+  case ACTION_TYPE_OPEN_IMAGE:
+    action = OpenImageAction();
+    break;
+  case ACTION_TYPE_CLOSE_IMAGE:
+    action = CloseImageAction();
+    break;
+  }
+
+  boost::apply_visitor(DecodeVisitor(version, it), action);
+}
+
+void ActionEntry::dump(Formatter *f) const {
+  boost::apply_visitor(DumpVisitor(f), action);
+}
+
+void ActionEntry::generate_test_instances(std::list<ActionEntry *> &o) {
+  Dependencies dependencies;
+  dependencies.push_back(Dependency(3, 123456789));
+  dependencies.push_back(Dependency(4, 234567890));
+
+  o.push_back(new ActionEntry(StartThreadAction()));
+  o.push_back(new ActionEntry(StartThreadAction(1, 123456789, dependencies)));
+  o.push_back(new ActionEntry(StopThreadAction()));
+  o.push_back(new ActionEntry(StopThreadAction(1, 123456789, dependencies)));
+
+  o.push_back(new ActionEntry(ReadAction()));
+  o.push_back(new ActionEntry(ReadAction(1, 123456789, dependencies, 3, 4, 5)));
+  o.push_back(new ActionEntry(WriteAction()));
+  o.push_back(new ActionEntry(WriteAction(1, 123456789, dependencies, 3, 4,
+                                          5)));
+  o.push_back(new ActionEntry(AioReadAction()));
+  o.push_back(new ActionEntry(AioReadAction(1, 123456789, dependencies, 3, 4,
+                                            5)));
+  o.push_back(new ActionEntry(AioWriteAction()));
+  o.push_back(new ActionEntry(AioWriteAction(1, 123456789, dependencies, 3, 4,
+                                             5)));
+
+  o.push_back(new ActionEntry(OpenImageAction()));
+  o.push_back(new ActionEntry(OpenImageAction(1, 123456789, dependencies, 3,
+                                              "image_name", "snap_name",
+                                              true)));
+  o.push_back(new ActionEntry(CloseImageAction()));
+  o.push_back(new ActionEntry(CloseImageAction(1, 123456789, dependencies, 3)));
+}
+
+} // namespace action
+} // namespace rbd_replay
+
+std::ostream &operator<<(std::ostream &out,
+                         const rbd_replay::action::ActionType &type) {
+  using namespace rbd_replay::action;
+
+  switch (type) {
+  case ACTION_TYPE_START_THREAD:
+    out << "StartThread";
+    break;
+  case ACTION_TYPE_STOP_THREAD:
+    out << "StopThread";
+    break;
+  case ACTION_TYPE_READ:
+    out << "Read";
+    break;
+  case ACTION_TYPE_WRITE:
+    out << "Write";
+    break;
+  case ACTION_TYPE_AIO_READ:
+    out << "AioRead";
+    break;
+  case ACTION_TYPE_AIO_WRITE:
+    out << "AioWrite";
+    break;
+  case ACTION_TYPE_OPEN_IMAGE:
+    out << "OpenImage";
+    break;
+  case ACTION_TYPE_CLOSE_IMAGE:
+    out << "CloseImage";
+    break;
+  default:
+    out << "Unknown (" << static_cast<uint32_t>(type) << ")";
+    break;
+  }
+  return out;
+}
+
diff --git a/src/rbd_replay/ActionTypes.h b/src/rbd_replay/ActionTypes.h
new file mode 100644
index 0000000..63ef34e
--- /dev/null
+++ b/src/rbd_replay/ActionTypes.h
@@ -0,0 +1,277 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_REPLAY_ACTION_TYPES_H
+#define CEPH_RBD_REPLAY_ACTION_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <vector>
+#include <boost/variant/variant.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace rbd_replay {
+namespace action {
+
+typedef uint64_t imagectx_id_t;
+typedef uint64_t thread_id_t;
+
+/// Even IDs are normal actions, odd IDs are completions.
+typedef uint32_t action_id_t;
+
+static const std::string BANNER("rbd-replay-trace");
+
+/**
+ * Dependencies link actions to earlier actions or completions.
+ * If an action has a dependency \c d then it waits until \c d.time_delta
+ * nanoseconds after the action or completion with ID \c d.id has fired.
+ */
+struct Dependency {
+  /// ID of the action or completion to wait for.
+  action_id_t id;
+
+  /// Nanoseconds of delay to wait until after the action or completion fires.
+  uint64_t time_delta;
+
+  /**
+   * @param id ID of the action or completion to wait for.
+   * @param time_delta Nanoseconds of delay to wait after the action or
+   *                   completion fires.
+   */
+  Dependency() : id(0), time_delta(0) {
+  }
+  Dependency(action_id_t id, uint64_t time_delta)
+    : id(id), time_delta(time_delta) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::iterator &it);
+  void decode(__u8 version, bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<Dependency *> &o);
+};
+
+WRITE_CLASS_ENCODER(Dependency);
+
+typedef std::vector<Dependency> Dependencies;
+
+enum ActionType {
+  ACTION_TYPE_START_THREAD = 0,
+  ACTION_TYPE_STOP_THREAD  = 1,
+  ACTION_TYPE_READ         = 2,
+  ACTION_TYPE_WRITE        = 3,
+  ACTION_TYPE_AIO_READ     = 4,
+  ACTION_TYPE_AIO_WRITE    = 5,
+  ACTION_TYPE_OPEN_IMAGE   = 6,
+  ACTION_TYPE_CLOSE_IMAGE  = 7
+};
+
+struct ActionBase {
+  action_id_t id;
+  thread_id_t thread_id;
+  Dependencies dependencies;
+
+  ActionBase() : id(0), thread_id(0) {
+  }
+  ActionBase(action_id_t id, thread_id_t thread_id,
+             const Dependencies &dependencies)
+    : id(id), thread_id(thread_id), dependencies(dependencies) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+};
+
+struct StartThreadAction : public ActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_START_THREAD;
+
+  StartThreadAction() {
+  }
+  StartThreadAction(action_id_t id, thread_id_t thread_id,
+                    const Dependencies &dependencies)
+    : ActionBase(id, thread_id, dependencies) {
+  }
+};
+
+struct StopThreadAction : public ActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_STOP_THREAD;
+
+  StopThreadAction() {
+  }
+  StopThreadAction(action_id_t id, thread_id_t thread_id,
+                   const Dependencies &dependencies)
+    : ActionBase(id, thread_id, dependencies) {
+  }
+};
+
+struct ImageActionBase : public ActionBase {
+  imagectx_id_t imagectx_id;
+
+  ImageActionBase() : imagectx_id(0) {
+  }
+  ImageActionBase(action_id_t id, thread_id_t thread_id,
+                  const Dependencies &dependencies, imagectx_id_t imagectx_id)
+    : ActionBase(id, thread_id, dependencies), imagectx_id(imagectx_id) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+};
+
+struct IoActionBase : public ImageActionBase {
+  uint64_t offset;
+  uint64_t length;
+
+  IoActionBase() : offset(0), length(0) {
+  }
+  IoActionBase(action_id_t id, thread_id_t thread_id,
+               const Dependencies &dependencies, imagectx_id_t imagectx_id,
+               uint64_t offset, uint64_t length)
+    : ImageActionBase(id, thread_id, dependencies, imagectx_id),
+      offset(offset), length(length) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+};
+
+struct ReadAction : public IoActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_READ;
+
+  ReadAction() {
+  }
+  ReadAction(action_id_t id, thread_id_t thread_id,
+             const Dependencies &dependencies, imagectx_id_t imagectx_id,
+             uint64_t offset, uint64_t length)
+    : IoActionBase(id, thread_id, dependencies, imagectx_id, offset, length) {
+  }
+};
+
+struct WriteAction : public IoActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_WRITE;
+
+  WriteAction() {
+  }
+  WriteAction(action_id_t id, thread_id_t thread_id,
+              const Dependencies &dependencies, imagectx_id_t imagectx_id,
+              uint64_t offset, uint64_t length)
+    : IoActionBase(id, thread_id, dependencies, imagectx_id, offset, length) {
+  }
+};
+
+struct AioReadAction : public IoActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_AIO_READ;
+
+  AioReadAction() {
+  }
+  AioReadAction(action_id_t id, thread_id_t thread_id,
+                const Dependencies &dependencies, imagectx_id_t imagectx_id,
+                uint64_t offset, uint64_t length)
+    : IoActionBase(id, thread_id, dependencies, imagectx_id, offset, length) {
+  }
+};
+
+struct AioWriteAction : public IoActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_AIO_WRITE;
+
+  AioWriteAction() {
+  }
+  AioWriteAction(action_id_t id, thread_id_t thread_id,
+                 const Dependencies &dependencies, imagectx_id_t imagectx_id,
+                 uint64_t offset, uint64_t length)
+    : IoActionBase(id, thread_id, dependencies, imagectx_id, offset, length) {
+  }
+};
+
+struct OpenImageAction : public ImageActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_OPEN_IMAGE;
+
+  std::string name;
+  std::string snap_name;
+  bool read_only;
+
+  OpenImageAction() : read_only(false) {
+  }
+  OpenImageAction(action_id_t id, thread_id_t thread_id,
+                  const Dependencies &dependencies, imagectx_id_t imagectx_id,
+                  const std::string &name, const std::string &snap_name,
+                  bool read_only)
+    : ImageActionBase(id, thread_id, dependencies, imagectx_id),
+      name(name), snap_name(snap_name), read_only(read_only) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+};
+
+struct CloseImageAction : public ImageActionBase {
+  static const ActionType ACTION_TYPE = ACTION_TYPE_CLOSE_IMAGE;
+
+  CloseImageAction() {
+  }
+  CloseImageAction(action_id_t id, thread_id_t thread_id,
+                   const Dependencies &dependencies, imagectx_id_t imagectx_id)
+    : ImageActionBase(id, thread_id, dependencies, imagectx_id) {
+  }
+};
+
+struct UnknownAction {
+  static const ActionType ACTION_TYPE = static_cast<ActionType>(-1);
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+};
+
+typedef boost::variant<StartThreadAction,
+                       StopThreadAction,
+                       ReadAction,
+                       WriteAction,
+                       AioReadAction,
+                       AioWriteAction,
+                       OpenImageAction,
+                       CloseImageAction,
+                       UnknownAction> Action;
+
+class ActionEntry {
+public:
+  Action action;
+
+  ActionEntry() : action(UnknownAction()) {
+  }
+  ActionEntry(const Action &action) : action(action) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::iterator &it);
+  void decode_unversioned(bufferlist::iterator &it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<ActionEntry *> &o);
+
+private:
+  void decode(__u8 version, bufferlist::iterator &it);
+};
+
+WRITE_CLASS_ENCODER(ActionEntry);
+
+} // namespace action
+} // namespace rbd_replay
+
+std::ostream &operator<<(std::ostream &out,
+                         const rbd_replay::action::ActionType &type);
+
+using rbd_replay::action::decode;
+using rbd_replay::action::encode;
+
+#endif // CEPH_RBD_REPLAY_ACTION_TYPES_H
diff --git a/src/rbd_replay/BufferReader.cc b/src/rbd_replay/BufferReader.cc
new file mode 100644
index 0000000..f1327b7
--- /dev/null
+++ b/src/rbd_replay/BufferReader.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rbd_replay/BufferReader.h"
+#include "include/assert.h"
+#include "include/intarith.h"
+
+namespace rbd_replay {
+
+BufferReader::BufferReader(int fd, size_t min_bytes, size_t max_bytes)
+  : m_fd(fd), m_min_bytes(min_bytes), m_max_bytes(max_bytes),
+    m_bl_it(m_bl.begin()) {
+  assert(m_min_bytes <= m_max_bytes);
+}
+
+int BufferReader::fetch(bufferlist::iterator **it) {
+  if (m_bl_it.get_remaining() < m_min_bytes) {
+    ssize_t bytes_to_read = ROUND_UP_TO(m_max_bytes - m_bl_it.get_remaining(),
+                                        CEPH_BUFFER_APPEND_SIZE);
+    while (bytes_to_read > 0) {
+      int r = m_bl.read_fd(m_fd, CEPH_BUFFER_APPEND_SIZE);
+      if (r < 0) {
+        return r;
+      }
+      assert(r <= bytes_to_read);
+      bytes_to_read -= r;
+    }
+  }
+
+  *it = &m_bl_it;
+  return 0;
+}
+
+} // namespace rbd_replay
diff --git a/src/rbd_replay/BufferReader.h b/src/rbd_replay/BufferReader.h
new file mode 100644
index 0000000..95b1533
--- /dev/null
+++ b/src/rbd_replay/BufferReader.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_REPLAY_BUFFER_READER_H
+#define CEPH_RBD_REPLAY_BUFFER_READER_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+namespace rbd_replay {
+
+class BufferReader {
+public:
+  static const size_t DEFAULT_MIN_BYTES = 1<<20;
+  static const size_t DEFAULT_MAX_BYTES = 1<<22;
+
+  BufferReader(int fd, size_t min_bytes = DEFAULT_MIN_BYTES,
+               size_t max_bytes = DEFAULT_MAX_BYTES);
+
+  int fetch(bufferlist::iterator **it);
+
+private:
+  int m_fd;
+  size_t m_min_bytes;
+  size_t m_max_bytes;
+  bufferlist m_bl;
+  bufferlist::iterator m_bl_it;
+
+};
+
+} // namespace rbd_replay
+
+#endif // CEPH_RBD_REPLAY_BUFFER_READER_H
diff --git a/src/rbd_replay/Deser.cc b/src/rbd_replay/Deser.cc
deleted file mode 100644
index 986a18c..0000000
--- a/src/rbd_replay/Deser.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 Adam Crume <adamcrume at gmail.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#include "Deser.hpp"
-#include <arpa/inet.h>
-#include <cstdlib>
-#include <endian.h>
-
-
-rbd_replay::Deser::Deser(std::istream &in)
-  : m_in(in) {
-}
-
-uint8_t rbd_replay::Deser::read_uint8_t() {
-  uint8_t data;
-  m_in.read(reinterpret_cast<char*>(&data), sizeof(data));
-  return data;
-}
-
-uint16_t rbd_replay::Deser::read_uint16_t() {
-  uint16_t data;
-  m_in.read(reinterpret_cast<char*>(&data), sizeof(data));
-  return ntohs(data);
-}
-
-uint32_t rbd_replay::Deser::read_uint32_t() {
-  uint32_t data;
-  m_in.read(reinterpret_cast<char*>(&data), sizeof(data));
-  return ntohl(data);
-}
-
-uint64_t rbd_replay::Deser::read_uint64_t() {
-  uint64_t data;
-  m_in.read(reinterpret_cast<char*>(&data), sizeof(data));
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  data = (static_cast<uint64_t>(ntohl(data)) << 32 | ntohl(data >> 32));
-#endif
-  return data;
-}
-
-std::string rbd_replay::Deser::read_string() {
-  uint32_t length = read_uint32_t();
-  char* data = reinterpret_cast<char*>(malloc(length));
-  m_in.read(data, length);
-  std::string s(data, length);
-  free(data);
-  return s;
-}
-
-bool rbd_replay::Deser::read_bool() {
-  return read_uint8_t() != 0;
-}
-
-bool rbd_replay::Deser::eof() {
-  return m_in.eof();
-}
diff --git a/src/rbd_replay/Deser.hpp b/src/rbd_replay/Deser.hpp
deleted file mode 100644
index b466ace..0000000
--- a/src/rbd_replay/Deser.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 Adam Crume <adamcrume at gmail.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#ifndef _INCLUDED_RBD_REPLAY_DESER_HPP
-#define _INCLUDED_RBD_REPLAY_DESER_HPP
-
-#include <iostream>
-#include <stdint.h>
-
-namespace rbd_replay {
-
-/**
-   Helper for deserializing data in an architecture-indepdendent way.
-   Everything is read big-endian.
-   @see Ser
-*/
-class Deser {
-public:
-  Deser(std::istream &in);
-
-  uint8_t read_uint8_t();
-
-  uint16_t read_uint16_t();
-
-  uint32_t read_uint32_t();
-
-  uint64_t read_uint64_t();
-
-  std::string read_string();
-
-  bool read_bool();
-
-  bool eof();
-
-private:
-  std::istream &m_in;
-};
-
-}
-
-#endif
diff --git a/src/rbd_replay/Makefile.am b/src/rbd_replay/Makefile.am
index fa101b7..23a8e91 100644
--- a/src/rbd_replay/Makefile.am
+++ b/src/rbd_replay/Makefile.am
@@ -2,35 +2,46 @@ if ENABLE_CLIENT
 if WITH_RADOS
 if WITH_RBD
 
+librbd_replay_types_la_SOURCES = \
+	rbd_replay/ActionTypes.cc
+noinst_HEADERS += \
+        rbd_replay/ActionTypes.h
+noinst_LTLIBRARIES += librbd_replay_types.la
+DENCODER_DEPS += librbd_replay_types.la
+
 # librbd_replay_la exists only to help with unit tests
-librbd_replay_la_SOURCES = rbd_replay/actions.cc \
-	rbd_replay/Deser.cc \
+librbd_replay_la_SOURCES = \
+	rbd_replay/actions.cc \
+	rbd_replay/BufferReader.cc \
 	rbd_replay/ImageNameMap.cc \
 	rbd_replay/PendingIO.cc \
 	rbd_replay/rbd_loc.cc \
-	rbd_replay/Replayer.cc \
-	rbd_replay/Ser.cc
-librbd_replay_la_LIBADD = $(LIBRBD) \
+	rbd_replay/Replayer.cc
+librbd_replay_la_LIBADD = \
+	$(LIBRBD) \
 	$(LIBRADOS) \
 	$(CEPH_GLOBAL)
 noinst_LTLIBRARIES += librbd_replay.la
-noinst_HEADERS += rbd_replay/BoundedBuffer.hpp \
+noinst_HEADERS += \
 	rbd_replay/actions.hpp \
-	rbd_replay/Deser.hpp \
+	rbd_replay/BoundedBuffer.hpp \
+	rbd_replay/BufferReader.h \
 	rbd_replay/ImageNameMap.hpp \
 	rbd_replay/ios.hpp \
 	rbd_replay/PendingIO.hpp \
 	rbd_replay/rbd_loc.hpp \
 	rbd_replay/rbd_replay_debug.hpp \
-	rbd_replay/Replayer.hpp \
-	rbd_replay/Ser.hpp
-
+	rbd_replay/Replayer.hpp
 
-rbd_replay_SOURCES = rbd_replay/rbd-replay.cc
-rbd_replay_LDADD = $(LIBRBD) \
+rbd_replay_SOURCES = \
+	rbd_replay/rbd-replay.cc
+rbd_replay_LDADD = \
+	librbd_replay.la \
+	librbd_replay_types.la \
+	$(LIBRBD) \
 	$(LIBRADOS) \
 	$(CEPH_GLOBAL) \
-	librbd_replay.la
+	$(LIBCOMMON)
 
 if LINUX
 bin_PROGRAMS += rbd-replay
@@ -43,12 +54,16 @@ librbd_replay_ios_la_LIBADD = $(LIBRBD) \
 	librbd_replay.la
 noinst_LTLIBRARIES += librbd_replay_ios.la
 
-rbd_replay_prep_SOURCES = rbd_replay/rbd-replay-prep.cc
-rbd_replay_prep_LDADD = $(LIBRBD) \
-	$(LIBRADOS) \
-	$(CEPH_GLOBAL) \
+rbd_replay_prep_SOURCES = \
+	rbd_replay/rbd-replay-prep.cc
+rbd_replay_prep_LDADD = \
 	librbd_replay.la \
 	librbd_replay_ios.la \
+	librbd_replay_types.la \
+	$(LIBRBD) \
+	$(LIBRADOS) \
+	$(CEPH_GLOBAL) \
+	$(LIBCOMMON) \
 	-lbabeltrace \
 	-lbabeltrace-ctf \
 	-lboost_date_time
diff --git a/src/rbd_replay/Replayer.cc b/src/rbd_replay/Replayer.cc
index 19403e5..b37f226 100644
--- a/src/rbd_replay/Replayer.cc
+++ b/src/rbd_replay/Replayer.cc
@@ -13,8 +13,12 @@
  */
 
 #include "Replayer.hpp"
+#include "common/errno.h"
+#include "rbd_replay/ActionTypes.h"
+#include "rbd_replay/BufferReader.h"
 #include <boost/foreach.hpp>
 #include <boost/thread/thread.hpp>
+#include <boost/scope_exit.hpp>
 #include <fstream>
 #include "global/global_context.h"
 #include "rbd_replay_debug.hpp"
@@ -23,6 +27,29 @@
 using namespace std;
 using namespace rbd_replay;
 
+namespace {
+
+bool is_versioned_replay(BufferReader &buffer_reader) {
+  bufferlist::iterator *it;
+  int r = buffer_reader.fetch(&it);
+  if (r < 0) {
+    return false;
+  }
+
+  if (it->get_remaining() < action::BANNER.size()) {
+    return false;
+  }
+
+  std::string banner;
+  it->copy(action::BANNER.size(), banner);
+  bool versioned = (banner == action::BANNER);
+  if (!versioned) {
+    it->seek(0);
+  }
+  return versioned;
+}
+
+} // anonymous namespace
 
 Worker::Worker(Replayer &replayer)
   : m_replayer(replayer),
@@ -134,7 +161,9 @@ rbd_loc Worker::map_image_name(string image_name, string snap_name) const {
 
 
 Replayer::Replayer(int num_action_trackers)
-  : m_pool_name("rbd"),
+  : m_rbd(NULL), m_ioctx(0),  
+    m_pool_name("rbd"), m_latency_multiplier(1.0), 
+    m_readonly(false), m_dump_perf_counters(false),
     m_num_action_trackers(num_action_trackers),
     m_action_trackers(new action_tracker_d[m_num_action_trackers]) {
   assertf(num_action_trackers > 0, "num_action_trackers = %d", num_action_trackers);
@@ -154,36 +183,64 @@ void Replayer::run(const std::string& replay_file) {
     rados.init(NULL);
     int r = rados.init_with_context(g_ceph_context);
     if (r) {
-      cerr << "Unable to read conf file: " << r << std::endl;
+      cerr << "Failed to initialize RADOS: " << cpp_strerror(r) << std::endl;
       goto out;
     }
     r = rados.connect();
     if (r) {
-      cerr << "Unable to connect to Rados: " << r << std::endl;
+      cerr << "Failed to connect to cluster: " << cpp_strerror(r) << std::endl;
       goto out;
     }
     m_ioctx = new librados::IoCtx();
     {
       r = rados.ioctx_create(m_pool_name.c_str(), *m_ioctx);
       if (r) {
-	cerr << "Unable to create IoCtx: " << r << std::endl;
+        cerr << "Failed to open pool " << m_pool_name << ": "
+             << cpp_strerror(r) << std::endl;
 	goto out2;
       }
       m_rbd = new librbd::RBD();
       map<thread_id_t, Worker*> workers;
 
-      ifstream input(replay_file.c_str(), ios::in | ios::binary);
-      if (!input.is_open()) {
-	cerr << "Unable to open " << replay_file << std::endl;
-	exit(1);
+      int fd = open(replay_file.c_str(), O_RDONLY);
+      if (fd < 0) {
+        std::cerr << "Failed to open " << replay_file << ": "
+                  << cpp_strerror(errno) << std::endl;
+        exit(1);
       }
+      BOOST_SCOPE_EXIT( (fd) ) {
+        close(fd);
+      } BOOST_SCOPE_EXIT_END;
 
-      Deser deser(input);
+      BufferReader buffer_reader(fd);
+      bool versioned = is_versioned_replay(buffer_reader);
       while (true) {
-	Action::ptr action = Action::read_from(deser);
+        action::ActionEntry action_entry;
+        try {
+          bufferlist::iterator *it;
+          int r = buffer_reader.fetch(&it);
+          if (r < 0) {
+            std::cerr << "Failed to read from trace file: " << cpp_strerror(r)
+                      << std::endl;
+            exit(-r);
+          }
+
+          if (versioned) {
+            action_entry.decode(*it);
+          } else {
+            action_entry.decode_unversioned(*it);
+          }
+        } catch (const buffer::error &err) {
+          std::cerr << "Failed to decode trace action" << std::endl;
+          exit(1);
+        }
+
+	Action::ptr action = Action::construct(action_entry);
 	if (!action) {
-	  break;
+          // unknown / unsupported action
+	  continue;
 	}
+
 	if (action->is_start_thread()) {
 	  Worker *worker = new Worker(*this);
 	  workers[action->thread_id()] = worker;
@@ -257,9 +314,9 @@ bool Replayer::is_action_complete(action_id_t id) {
   return tracker.actions.count(id) > 0;
 }
 
-void Replayer::wait_for_actions(const vector<dependency_d> &deps) {
+void Replayer::wait_for_actions(const action::Dependencies &deps) {
   boost::posix_time::ptime release_time(boost::posix_time::neg_infin);
-  BOOST_FOREACH(const dependency_d &dep, deps) {
+  BOOST_FOREACH(const action::Dependency &dep, deps) {
     dout(DEPGRAPH_LEVEL) << "Waiting for " << dep.id << dendl;
     boost::system_time start_time(boost::get_system_time());
     action_tracker_d &tracker = tracker_for(dep.id);
diff --git a/src/rbd_replay/Replayer.hpp b/src/rbd_replay/Replayer.hpp
index 538e7fd..acad725 100644
--- a/src/rbd_replay/Replayer.hpp
+++ b/src/rbd_replay/Replayer.hpp
@@ -17,6 +17,7 @@
 
 #include <boost/thread/mutex.hpp>
 #include <boost/thread/shared_mutex.hpp>
+#include "rbd_replay/ActionTypes.h"
 #include "BoundedBuffer.hpp"
 #include "ImageNameMap.hpp"
 #include "PendingIO.hpp"
@@ -100,7 +101,7 @@ public:
 
   bool is_action_complete(action_id_t id);
 
-  void wait_for_actions(const std::vector<dependency_d> &deps);
+  void wait_for_actions(const action::Dependencies &deps);
 
   std::string pool_name() const;
 
diff --git a/src/rbd_replay/Ser.cc b/src/rbd_replay/Ser.cc
deleted file mode 100644
index 97a63cd..0000000
--- a/src/rbd_replay/Ser.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 Adam Crume <adamcrume at gmail.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#include "Ser.hpp"
-#include <arpa/inet.h>
-#include <cstdlib>
-#include <endian.h>
-
-
-rbd_replay::Ser::Ser(std::ostream &out)
-  : m_out(out) {
-}
-
-void rbd_replay::Ser::write_uint8_t(uint8_t data) {
-  m_out.write(reinterpret_cast<char*>(&data), sizeof(data));
-}
-
-void rbd_replay::Ser::write_uint16_t(uint16_t data) {
-  data = htons(data);
-  m_out.write(reinterpret_cast<char*>(&data), sizeof(data));
-}
-
-void rbd_replay::Ser::write_uint32_t(uint32_t data) {
-  data = htonl(data);
-  m_out.write(reinterpret_cast<char*>(&data), sizeof(data));
-}
-
-void rbd_replay::Ser::write_uint64_t(uint64_t data) {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  data = (static_cast<uint64_t>(htonl(data)) << 32 | htonl(data >> 32));
-#endif
-  m_out.write(reinterpret_cast<char*>(&data), sizeof(data));
-}
-
-void rbd_replay::Ser::write_string(const std::string& data) {
-  write_uint32_t(data.length());
-  m_out.write(data.data(), data.length());
-}
-
-void rbd_replay::Ser::write_bool(bool data) {
-  write_uint8_t(data ? 1 : 0);
-}
diff --git a/src/rbd_replay/Ser.hpp b/src/rbd_replay/Ser.hpp
deleted file mode 100644
index 130465d..0000000
--- a/src/rbd_replay/Ser.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2014 Adam Crume <adamcrume at gmail.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#ifndef _INCLUDED_RBD_REPLAY_SER_HPP
-#define _INCLUDED_RBD_REPLAY_SER_HPP
-
-#include <iostream>
-#include <stdint.h>
-
-namespace rbd_replay {
-
-/**
-   Helper for serializing data in an architecture-indepdendent way.
-   Everything is written big-endian.
-   @see Deser
-*/
-class Ser {
-public:
-  Ser(std::ostream &out);
-
-  void write_uint8_t(uint8_t);
-
-  void write_uint16_t(uint16_t);
-
-  void write_uint32_t(uint32_t);
-
-  void write_uint64_t(uint64_t);
-
-  void write_string(const std::string&);
-
-  void write_bool(bool b);
-
-private:
-  std::ostream &m_out;
-};
-
-}
-
-#endif
diff --git a/src/rbd_replay/actions.cc b/src/rbd_replay/actions.cc
index 2bcdfb1..7726d08 100644
--- a/src/rbd_replay/actions.cc
+++ b/src/rbd_replay/actions.cc
@@ -22,113 +22,67 @@
 using namespace rbd_replay;
 using namespace std;
 
+namespace {
 
-Action::Action(action_id_t id,
-               thread_id_t thread_id,
-               int num_successors,
-               int num_completion_successors,
-               std::vector<dependency_d> &predecessors)
-  : m_id(id),
-    m_thread_id(thread_id),
-    m_num_successors(num_successors),
-    m_num_completion_successors(num_completion_successors),
-    m_predecessors(predecessors) {
-    }
-
-Action::~Action() {
+std::string create_fake_data() {
+  char data[1 << 20]; // 1 MB
+  for (unsigned int i = 0; i < sizeof(data); i++) {
+    data[i] = (char) i;
+  }
+  return std::string(data, sizeof(data));
 }
 
-Action::ptr Action::read_from(Deser &d) {
-  uint8_t type = d.read_uint8_t();
-  if (d.eof()) {
-    return Action::ptr();
-  }
-  uint32_t ionum = d.read_uint32_t();
-  uint64_t thread_id = d.read_uint64_t();
-  uint32_t num_successors = d.read_uint32_t();
-  uint32_t num_completion_successors = d.read_uint32_t();
-  uint32_t num_dependencies = d.read_uint32_t();
-  vector<dependency_d> deps;
-  for (unsigned int i = 0; i < num_dependencies; i++) {
-    uint32_t dep_id = d.read_uint32_t();
-    uint64_t time_delta = d.read_uint64_t();
-    deps.push_back(dependency_d(dep_id, time_delta));
+struct ConstructVisitor : public boost::static_visitor<Action::ptr> {
+  inline Action::ptr operator()(const action::StartThreadAction &action) const {
+    return Action::ptr(new StartThreadAction(action));
   }
-  DummyAction dummy(ionum, thread_id, num_successors, num_completion_successors, deps);
-  switch (type) {
-  case IO_START_THREAD:
-    return StartThreadAction::read_from(dummy, d);
-  case IO_STOP_THREAD:
-    return StopThreadAction::read_from(dummy, d);
-  case IO_READ:
-    return ReadAction::read_from(dummy, d);
-  case IO_WRITE:
-    return WriteAction::read_from(dummy, d);
-  case IO_ASYNC_READ:
-    return AioReadAction::read_from(dummy, d);
-  case IO_ASYNC_WRITE:
-    return AioWriteAction::read_from(dummy, d);
-  case IO_OPEN_IMAGE:
-    return OpenImageAction::read_from(dummy, d);
-  case IO_CLOSE_IMAGE:
-    return CloseImageAction::read_from(dummy, d);
-  default:
-    cerr << "Invalid action type: " << type << std::endl;
-    exit(1);
+
+  inline Action::ptr operator()(const action::StopThreadAction &action) const {
+    return Action::ptr(new StopThreadAction(action));
   }
-}
 
-std::ostream& Action::dump_action_fields(std::ostream& o) const {
-  o << "id=" << m_id << ", thread_id=" << m_thread_id << ", predecessors=[";
-  bool first = true;
-  BOOST_FOREACH(const dependency_d &d, m_predecessors) {
-    if (!first) {
-      o << ",";
-    }
-    o << d.id;
-    first = false;
+  inline Action::ptr operator()(const action::ReadAction &action) const {
+    return Action::ptr(new ReadAction(action));
   }
-  return o << "]";
-}
 
-std::ostream& rbd_replay::operator<<(std::ostream& o, const Action& a) {
-  return a.dump(o);
-}
+  inline Action::ptr operator()(const action::AioReadAction &action) const {
+    return Action::ptr(new AioReadAction(action));
+  }
 
+  inline Action::ptr operator()(const action::WriteAction &action) const {
+    return Action::ptr(new WriteAction(action));
+  }
 
-std::ostream& DummyAction::dump(std::ostream& o) const {
-  o << "DummyAction[";
-  dump_action_fields(o);
-  return o << "]";
-}
+  inline Action::ptr operator()(const action::AioWriteAction &action) const {
+    return Action::ptr(new AioWriteAction(action));
+  }
 
+  inline Action::ptr operator()(const action::OpenImageAction &action) const {
+    return Action::ptr(new OpenImageAction(action));
+  }
 
-StartThreadAction::StartThreadAction(Action &src)
-  : Action(src) {
-}
+  inline Action::ptr operator()(const action::CloseImageAction &action) const {
+    return Action::ptr(new CloseImageAction(action));
+  }
 
-void StartThreadAction::perform(ActionCtx &ctx) {
-  cerr << "StartThreadAction should never actually be performed" << std::endl;
-  exit(1);
-}
+  inline Action::ptr operator()(const action::UnknownAction &action) const {
+    return Action::ptr();
+  }
+};
 
-bool StartThreadAction::is_start_thread() {
-  return true;
-}
+} // anonymous namespace
 
-Action::ptr StartThreadAction::read_from(Action &src, Deser &d) {
-  return Action::ptr(new StartThreadAction(src));
+std::ostream& rbd_replay::operator<<(std::ostream& o, const Action& a) {
+  return a.dump(o);
 }
 
-std::ostream& StartThreadAction::dump(std::ostream& o) const {
-  o << "StartThreadAction[";
-  dump_action_fields(o);
-  return o << "]";
+Action::ptr Action::construct(const action::ActionEntry &action_entry) {
+  return boost::apply_visitor(ConstructVisitor(), action_entry.action);
 }
 
-
-StopThreadAction::StopThreadAction(Action &src)
-  : Action(src) {
+void StartThreadAction::perform(ActionCtx &ctx) {
+  cerr << "StartThreadAction should never actually be performed" << std::endl;
+  exit(1);
 }
 
 void StopThreadAction::perform(ActionCtx &ctx) {
@@ -136,116 +90,33 @@ void StopThreadAction::perform(ActionCtx &ctx) {
   ctx.stop();
 }
 
-Action::ptr StopThreadAction::read_from(Action &src, Deser &d) {
-  return Action::ptr(new StopThreadAction(src));
-}
-
-std::ostream& StopThreadAction::dump(std::ostream& o) const {
-  o << "StopThreadAction[";
-  dump_action_fields(o);
-  return o << "]";
-}
-
-
-AioReadAction::AioReadAction(const Action &src,
-                             imagectx_id_t imagectx_id,
-                             uint64_t offset,
-                             uint64_t length)
-  : Action(src),
-    m_imagectx_id(imagectx_id),
-    m_offset(offset),
-    m_length(length) {
-    }
-
-Action::ptr AioReadAction::read_from(Action &src, Deser &d) {
-  imagectx_id_t imagectx_id = d.read_uint64_t();
-  uint64_t offset = d.read_uint64_t();
-  uint64_t length = d.read_uint64_t();
-  return Action::ptr(new AioReadAction(src, imagectx_id, offset, length));
-}
-
 void AioReadAction::perform(ActionCtx &worker) {
   dout(ACTION_LEVEL) << "Performing " << *this << dendl;
-  librbd::Image *image = worker.get_image(m_imagectx_id);
+  librbd::Image *image = worker.get_image(m_action.imagectx_id);
   assert(image);
   PendingIO::ptr io(new PendingIO(pending_io_id(), worker));
   worker.add_pending(io);
-  int r = image->aio_read(m_offset, m_length, io->bufferlist(), &io->completion());
+  int r = image->aio_read(m_action.offset, m_action.length, io->bufferlist(), &io->completion());
   assertf(r >= 0, "id = %d, r = %d", id(), r);
 }
 
-std::ostream& AioReadAction::dump(std::ostream& o) const {
-  o << "AioReadAction[";
-  dump_action_fields(o);
-  return o << ", imagectx_id=" << m_imagectx_id << ", offset=" << m_offset << ", length=" << m_length << "]";
-}
-
-
-ReadAction::ReadAction(const Action &src,
-                       imagectx_id_t imagectx_id,
-                       uint64_t offset,
-                       uint64_t length)
-  : Action(src),
-    m_imagectx_id(imagectx_id),
-    m_offset(offset),
-    m_length(length) {
-    }
-
-Action::ptr ReadAction::read_from(Action &src, Deser &d) {
-  imagectx_id_t imagectx_id = d.read_uint64_t();
-  uint64_t offset = d.read_uint64_t();
-  uint64_t length = d.read_uint64_t();
-  return Action::ptr(new ReadAction(src, imagectx_id, offset, length));
-}
-
 void ReadAction::perform(ActionCtx &worker) {
   dout(ACTION_LEVEL) << "Performing " << *this << dendl;
-  librbd::Image *image = worker.get_image(m_imagectx_id);
+  librbd::Image *image = worker.get_image(m_action.imagectx_id);
   PendingIO::ptr io(new PendingIO(pending_io_id(), worker));
   worker.add_pending(io);
-  ssize_t r = image->read(m_offset, m_length, io->bufferlist());
+  ssize_t r = image->read(m_action.offset, m_action.length, io->bufferlist());
   assertf(r >= 0, "id = %d, r = %d", id(), r);
   worker.remove_pending(io);
 }
 
-std::ostream& ReadAction::dump(std::ostream& o) const {
-  o << "ReadAction[";
-  dump_action_fields(o);
-  return o << ", imagectx_id=" << m_imagectx_id << ", offset=" << m_offset << ", length=" << m_length << "]";
-}
-
-
-AioWriteAction::AioWriteAction(const Action &src,
-                               imagectx_id_t imagectx_id,
-                               uint64_t offset,
-                               uint64_t length)
-  : Action(src),
-    m_imagectx_id(imagectx_id),
-    m_offset(offset),
-    m_length(length) {
-    }
-
-Action::ptr AioWriteAction::read_from(Action &src, Deser &d) {
-  imagectx_id_t imagectx_id = d.read_uint64_t();
-  uint64_t offset = d.read_uint64_t();
-  uint64_t length = d.read_uint64_t();
-  return Action::ptr(new AioWriteAction(src, imagectx_id, offset, length));
-}
-
-static std::string create_fake_data() {
-  char data[1 << 20]; // 1 MB
-  for (unsigned int i = 0; i < sizeof(data); i++) {
-    data[i] = (char) i;
-  }
-  return std::string(data, sizeof(data));
-}
 
 void AioWriteAction::perform(ActionCtx &worker) {
   static const std::string fake_data(create_fake_data());
   dout(ACTION_LEVEL) << "Performing " << *this << dendl;
-  librbd::Image *image = worker.get_image(m_imagectx_id);
+  librbd::Image *image = worker.get_image(m_action.imagectx_id);
   PendingIO::ptr io(new PendingIO(pending_io_id(), worker));
-  uint64_t remaining = m_length;
+  uint64_t remaining = m_action.length;
   while (remaining > 0) {
     uint64_t n = std::min(remaining, (uint64_t)fake_data.length());
     io->bufferlist().append(fake_data.data(), n);
@@ -255,126 +126,52 @@ void AioWriteAction::perform(ActionCtx &worker) {
   if (worker.readonly()) {
     worker.remove_pending(io);
   } else {
-    int r = image->aio_write(m_offset, m_length, io->bufferlist(), &io->completion());
+    int r = image->aio_write(m_action.offset, m_action.length, io->bufferlist(), &io->completion());
     assertf(r >= 0, "id = %d, r = %d", id(), r);
   }
 }
 
-std::ostream& AioWriteAction::dump(std::ostream& o) const {
-  o << "AioWriteAction[";
-  dump_action_fields(o);
-  return o << ", imagectx_id=" << m_imagectx_id << ", offset=" << m_offset << ", length=" << m_length << "]";
-}
-
-
-WriteAction::WriteAction(const Action &src,
-                         imagectx_id_t imagectx_id,
-                         uint64_t offset,
-                         uint64_t length)
-  : Action(src),
-    m_imagectx_id(imagectx_id),
-    m_offset(offset),
-    m_length(length) {
-    }
-
-Action::ptr WriteAction::read_from(Action &src, Deser &d) {
-  imagectx_id_t imagectx_id = d.read_uint64_t();
-  uint64_t offset = d.read_uint64_t();
-  uint64_t length = d.read_uint64_t();
-  return Action::ptr(new WriteAction(src, imagectx_id, offset, length));
-}
-
 void WriteAction::perform(ActionCtx &worker) {
   dout(ACTION_LEVEL) << "Performing " << *this << dendl;
-  librbd::Image *image = worker.get_image(m_imagectx_id);
+  librbd::Image *image = worker.get_image(m_action.imagectx_id);
   PendingIO::ptr io(new PendingIO(pending_io_id(), worker));
   worker.add_pending(io);
-  io->bufferlist().append_zero(m_length);
+  io->bufferlist().append_zero(m_action.length);
   if (!worker.readonly()) {
-    ssize_t r = image->write(m_offset, m_length, io->bufferlist());
+    ssize_t r = image->write(m_action.offset, m_action.length, io->bufferlist());
     assertf(r >= 0, "id = %d, r = %d", id(), r);
   }
   worker.remove_pending(io);
 }
 
-std::ostream& WriteAction::dump(std::ostream& o) const {
-  o << "WriteAction[";
-  dump_action_fields(o);
-  return o << ", imagectx_id=" << m_imagectx_id << ", offset=" << m_offset << ", length=" << m_length << "]";
-}
-
-
-OpenImageAction::OpenImageAction(Action &src,
-                                 imagectx_id_t imagectx_id,
-                                 string name,
-                                 string snap_name,
-                                 bool readonly)
-  : Action(src),
-    m_imagectx_id(imagectx_id),
-    m_name(name),
-    m_snap_name(snap_name),
-    m_readonly(readonly) {
-    }
-
-Action::ptr OpenImageAction::read_from(Action &src, Deser &d) {
-  imagectx_id_t imagectx_id = d.read_uint64_t();
-  string name = d.read_string();
-  string snap_name = d.read_string();
-  bool readonly = d.read_bool();
-  return Action::ptr(new OpenImageAction(src, imagectx_id, name, snap_name, readonly));
-}
-
 void OpenImageAction::perform(ActionCtx &worker) {
   dout(ACTION_LEVEL) << "Performing " << *this << dendl;
   PendingIO::ptr io(new PendingIO(pending_io_id(), worker));
   worker.add_pending(io);
   librbd::Image *image = new librbd::Image();
   librbd::RBD *rbd = worker.rbd();
-  rbd_loc name(worker.map_image_name(m_name, m_snap_name));
+  rbd_loc name(worker.map_image_name(m_action.name, m_action.snap_name));
   int r;
-  if (m_readonly || worker.readonly()) {
+  if (m_action.read_only || worker.readonly()) {
     r = rbd->open_read_only(*worker.ioctx(), *image, name.image.c_str(), name.snap.c_str());
   } else {
     r = rbd->open(*worker.ioctx(), *image, name.image.c_str(), name.snap.c_str());
   }
   if (r) {
-    cerr << "Unable to open image '" << m_name
-	 << "' with snap '" << m_snap_name
+    cerr << "Unable to open image '" << m_action.name
+	 << "' with snap '" << m_action.snap_name
 	 << "' (mapped to '" << name.str()
-	 << "') and readonly " << m_readonly
+	 << "') and readonly " << m_action.read_only
 	 << ": (" << -r << ") " << strerror(-r) << std::endl;
     exit(1);
   }
-  worker.put_image(m_imagectx_id, image);
+  worker.put_image(m_action.imagectx_id, image);
   worker.remove_pending(io);
 }
 
-std::ostream& OpenImageAction::dump(std::ostream& o) const {
-  o << "OpenImageAction[";
-  dump_action_fields(o);
-  return o << ", imagectx_id=" << m_imagectx_id << ", name='" << m_name << "', snap_name='" << m_snap_name << "', readonly=" << m_readonly << "]";
-}
-
-
-CloseImageAction::CloseImageAction(Action &src,
-                                   imagectx_id_t imagectx_id)
-  : Action(src),
-    m_imagectx_id(imagectx_id) {
-    }
-
-Action::ptr CloseImageAction::read_from(Action &src, Deser &d) {
-  imagectx_id_t imagectx_id = d.read_uint64_t();
-  return Action::ptr(new CloseImageAction(src, imagectx_id));
-}
-
 void CloseImageAction::perform(ActionCtx &worker) {
   dout(ACTION_LEVEL) << "Performing " << *this << dendl;
-  worker.erase_image(m_imagectx_id);
+  worker.erase_image(m_action.imagectx_id);
   worker.set_action_complete(pending_io_id());
 }
 
-std::ostream& CloseImageAction::dump(std::ostream& o) const {
-  o << "CloseImageAction[";
-  dump_action_fields(o);
-  return o << ", imagectx_id=" << m_imagectx_id << "]";
-}
diff --git a/src/rbd_replay/actions.hpp b/src/rbd_replay/actions.hpp
index 068e4dc..ea46a88 100644
--- a/src/rbd_replay/actions.hpp
+++ b/src/rbd_replay/actions.hpp
@@ -17,8 +17,10 @@
 
 #include <boost/shared_ptr.hpp>
 #include "include/rbd/librbd.hpp"
-#include "Deser.hpp"
+#include "common/Formatter.h"
+#include "rbd_replay/ActionTypes.h"
 #include "rbd_loc.hpp"
+#include <iostream>
 
 // Stupid Doxygen requires this or else the typedef docs don't appear anywhere.
 /// @file rbd_replay/actions.hpp
@@ -31,44 +33,8 @@ typedef uint64_t thread_id_t;
 /// Even IDs are normal actions, odd IDs are completions.
 typedef uint32_t action_id_t;
 
-/**
-   Dependencies link actions to earlier actions or completions.
-   If an action has a dependency \c d then it waits until \c d.time_delta nanoseconds after the action or completion with ID \c d.id has fired.
-*/
-struct dependency_d {
-  /// ID of the action or completion to wait for.
-  action_id_t id;
-
-  /// Nanoseconds of delay to wait until after the action or completion fires.
-  uint64_t time_delta;
-
-  /**
-     @param id ID of the action or completion to wait for.
-     @param time_delta Nanoseconds of delay to wait after the action or completion fires.
-   */
-  dependency_d(action_id_t id,
-	       uint64_t time_delta)
-    : id(id),
-      time_delta(time_delta) {
-  }
-};
-
-// These are written to files, so don't change existing assignments.
-enum io_type {
-  IO_START_THREAD,
-  IO_STOP_THREAD,
-  IO_READ,
-  IO_WRITE,
-  IO_ASYNC_READ,
-  IO_ASYNC_WRITE,
-  IO_OPEN_IMAGE,
-  IO_CLOSE_IMAGE,
-};
-
-
 class PendingIO;
 
-
 /**
    %Context through which an Action interacts with its environment.
  */
@@ -131,19 +97,14 @@ class Action {
 public:
   typedef boost::shared_ptr<Action> ptr;
 
-  Action(action_id_t id,
-	 thread_id_t thread_id,
-	 int num_successors,
-	 int num_completion_successors,
-	 std::vector<dependency_d> &predecessors);
-
-  virtual ~Action();
+  virtual ~Action() {
+  }
 
   virtual void perform(ActionCtx &ctx) = 0;
 
   /// Returns the ID of the completion corresponding to this action.
   action_id_t pending_io_id() {
-    return m_id + 1;
+    return id() + 1;
   }
 
   // There's probably a better way to do this, but oh well.
@@ -151,206 +112,172 @@ public:
     return false;
   }
 
-  action_id_t id() const {
-    return m_id;
-  }
+  virtual action_id_t id() const = 0;
+  virtual thread_id_t thread_id() const = 0;
+  virtual const action::Dependencies& predecessors() const = 0;
+
+  virtual std::ostream& dump(std::ostream& o) const = 0;
+
+  static ptr construct(const action::ActionEntry &action_entry);
+};
 
-  thread_id_t thread_id() const {
-    return m_thread_id;
+template <typename ActionType>
+class TypedAction : public Action {
+public:
+  TypedAction(const ActionType &action) : m_action(action) {
   }
 
-  const std::vector<dependency_d>& predecessors() const {
-    return m_predecessors;
+  virtual action_id_t id() const {
+    return m_action.id;
   }
 
-  /// Reads and constructs an action from the replay file.
-  static ptr read_from(Deser &d);
+  virtual thread_id_t thread_id() const {
+    return m_action.thread_id;
+  }
 
-protected:
-  std::ostream& dump_action_fields(std::ostream& o) const;
+  virtual const action::Dependencies& predecessors() const {
+    return m_action.dependencies;
+  }
 
-private:
-  friend std::ostream& operator<<(std::ostream&, const Action&);
+  virtual std::ostream& dump(std::ostream& o) const {
+    o << get_action_name() << ": ";
+    ceph::JSONFormatter formatter(false);
+    formatter.open_object_section("");
+    m_action.dump(&formatter);
+    formatter.close_section();
+    formatter.flush(o);
+    return o;
+  }
 
-  virtual std::ostream& dump(std::ostream& o) const = 0;
+protected:
+  const ActionType m_action;
 
-  const action_id_t m_id;
-  const thread_id_t m_thread_id;
-  const int m_num_successors;
-  const int m_num_completion_successors;
-  const std::vector<dependency_d> m_predecessors;
+  virtual const char *get_action_name() const = 0;
 };
 
 /// Writes human-readable debug information about the action to the stream.
 /// @related Action
 std::ostream& operator<<(std::ostream& o, const Action& a);
 
-
-/**
-   Placeholder for partially-constructed actions.
-   Does nothing, and does not appear in the replay file.
- */
-class DummyAction : public Action {
+class StartThreadAction : public TypedAction<action::StartThreadAction> {
 public:
-  DummyAction(action_id_t id,
-	      thread_id_t thread_id,
-	      int num_successors,
-	      int num_completion_successors,
-	      std::vector<dependency_d> &predecessors)
-    : Action(id, thread_id, num_successors, num_completion_successors, predecessors) {
+  explicit StartThreadAction(const action::StartThreadAction &action)
+    : TypedAction<action::StartThreadAction>(action) {
   }
 
-  void perform(ActionCtx &ctx) {
+  virtual bool is_start_thread() {
+    return true;
   }
+  virtual void perform(ActionCtx &ctx);
 
-private:
-  std::ostream& dump(std::ostream& o) const;
-};
-
-
-class StopThreadAction : public Action {
-public:
-  explicit StopThreadAction(Action &src);
-
-  void perform(ActionCtx &ctx);
-
-  static Action::ptr read_from(Action &src, Deser &d);
-
-private:
-  std::ostream& dump(std::ostream& o) const;
+protected:
+  virtual const char *get_action_name() const {
+    return "StartThreadAction";
+  }
 };
 
-
-class AioReadAction : public Action {
+class StopThreadAction : public TypedAction<action::StopThreadAction> {
 public:
-  AioReadAction(const Action &src,
-		imagectx_id_t imagectx_id,
-		uint64_t offset,
-		uint64_t length);
-
-  void perform(ActionCtx &ctx);
-
-  static Action::ptr read_from(Action &src, Deser &d);
+  explicit StopThreadAction(const action::StopThreadAction &action)
+    : TypedAction<action::StopThreadAction>(action) {
+  }
 
-private:
-  std::ostream& dump(std::ostream& o) const;
+  virtual void perform(ActionCtx &ctx);
 
-  imagectx_id_t m_imagectx_id;
-  uint64_t m_offset;
-  uint64_t m_length;
+protected:
+  virtual const char *get_action_name() const {
+    return "StartThreadAction";
+  }
 };
 
 
-class ReadAction : public Action {
+class AioReadAction : public TypedAction<action::AioReadAction> {
 public:
-  ReadAction(const Action &src,
-	     imagectx_id_t imagectx_id,
-	     uint64_t offset,
-	     uint64_t length);
-
-  void perform(ActionCtx &ctx);
-
-  static Action::ptr read_from(Action &src, Deser &d);
+  AioReadAction(const action::AioReadAction &action)
+    : TypedAction<action::AioReadAction>(action) {
+  }
 
-private:
-  std::ostream& dump(std::ostream& o) const;
+  virtual void perform(ActionCtx &ctx);
 
-  imagectx_id_t m_imagectx_id;
-  uint64_t m_offset;
-  uint64_t m_length;
+protected:
+  virtual const char *get_action_name() const {
+    return "AioReadAction";
+  }
 };
 
 
-class AioWriteAction : public Action {
+class ReadAction : public TypedAction<action::ReadAction> {
 public:
-  AioWriteAction(const Action &src,
-		 imagectx_id_t imagectx_id,
-		 uint64_t offset,
-		 uint64_t length);
-
-  void perform(ActionCtx &ctx);
-
-  static Action::ptr read_from(Action &src, Deser &d);
+  ReadAction(const action::ReadAction &action)
+    : TypedAction<action::ReadAction>(action) {
+  }
 
-private:
-  std::ostream& dump(std::ostream& o) const;
+  virtual void perform(ActionCtx &ctx);
 
-  imagectx_id_t m_imagectx_id;
-  uint64_t m_offset;
-  uint64_t m_length;
+protected:
+  virtual const char *get_action_name() const {
+    return "ReadAction";
+  }
 };
 
 
-class WriteAction : public Action {
+class AioWriteAction : public TypedAction<action::AioWriteAction> {
 public:
-  WriteAction(const Action &src,
-	      imagectx_id_t imagectx_id,
-	      uint64_t offset,
-	      uint64_t length);
-
-  void perform(ActionCtx &ctx);
-
-  static Action::ptr read_from(Action &src, Deser &d);
+  AioWriteAction(const action::AioWriteAction &action)
+    : TypedAction<action::AioWriteAction>(action) {
+  }
 
-private:
-  std::ostream& dump(std::ostream& o) const;
+  virtual void perform(ActionCtx &ctx);
 
-  imagectx_id_t m_imagectx_id;
-  uint64_t m_offset;
-  uint64_t m_length;
+protected:
+  virtual const char *get_action_name() const {
+    return "AioWriteAction";
+  }
 };
 
 
-class OpenImageAction : public Action {
+class WriteAction : public TypedAction<action::WriteAction> {
 public:
-  OpenImageAction(Action &src,
-		  imagectx_id_t imagectx_id,
-		  std::string name,
-		  std::string snap_name,
-		  bool readonly);
-
-  void perform(ActionCtx &ctx);
-
-  static Action::ptr read_from(Action &src, Deser &d);
+  WriteAction(const action::WriteAction &action)
+    : TypedAction<action::WriteAction>(action) {
+  }
 
-private:
-  std::ostream& dump(std::ostream& o) const;
+  virtual void perform(ActionCtx &ctx);
 
-  imagectx_id_t m_imagectx_id;
-  std::string m_name;
-  std::string m_snap_name;
-  bool m_readonly;
+protected:
+  virtual const char *get_action_name() const {
+    return "WriteAction";
+  }
 };
 
 
-class CloseImageAction : public Action {
+class OpenImageAction : public TypedAction<action::OpenImageAction> {
 public:
-  CloseImageAction(Action &src,
-		   imagectx_id_t imagectx_id);
-
-  void perform(ActionCtx &ctx);
-
-  static Action::ptr read_from(Action &src, Deser &d);
+  OpenImageAction(const action::OpenImageAction &action)
+    : TypedAction<action::OpenImageAction>(action) {
+  }
 
-private:
-  std::ostream& dump(std::ostream& o) const;
+  virtual void perform(ActionCtx &ctx);
 
-  imagectx_id_t m_imagectx_id;
+protected:
+  virtual const char *get_action_name() const {
+    return "OpenImageAction";
+  }
 };
 
 
-class StartThreadAction : public Action {
+class CloseImageAction : public TypedAction<action::CloseImageAction> {
 public:
-  explicit StartThreadAction(Action &src);
-
-  void perform(ActionCtx &ctx);
-
-  bool is_start_thread();
+  CloseImageAction(const action::CloseImageAction &action)
+    : TypedAction<action::CloseImageAction>(action) {
+  }
 
-  static Action::ptr read_from(Action &src, Deser &d);
+  virtual void perform(ActionCtx &ctx);
 
-private:
-  std::ostream& dump(std::ostream& o) const;
+protected:
+  virtual const char *get_action_name() const {
+    return "CloseImageAction";
+  }
 };
 
 }
diff --git a/src/rbd_replay/ios.cc b/src/rbd_replay/ios.cc
index ccc560f..7437bed 100644
--- a/src/rbd_replay/ios.cc
+++ b/src/rbd_replay/ios.cc
@@ -16,41 +16,37 @@
 // In other words, (a.id < b.id) == (a.timestamp < b.timestamp) for all IOs a and b.
 
 #include "ios.hpp"
+#include "rbd_replay/ActionTypes.h"
 
 using namespace std;
 using namespace rbd_replay;
 
-bool rbd_replay::compare_io_ptrs_by_start_time(IO::ptr p1, IO::ptr p2) {
-  return p1->start_time() < p2->start_time();
-}
+namespace {
 
-static uint64_t min_time(const map<action_id_t, IO::ptr>& s) {
-  if (s.empty()) {
-    return 0;
-  }
-  return s.begin()->second->start_time();
-}
-
-static uint64_t max_time(const map<action_id_t, IO::ptr>& s) {
-  if (s.empty()) {
-    return 0;
-  }
-  map<action_id_t, IO::ptr>::const_iterator itr(s.end());
-  --itr;
-  return itr->second->start_time();
+bool compare_dependencies_by_start_time(const action::Dependency &lhs,
+                                        const action::Dependency &rhs) {
+  return lhs.time_delta < rhs.time_delta;
 }
 
-void IO::add_dependencies(const io_set_t& deps) {
-  io_set_t base(m_dependencies);
-  for (io_set_t::const_iterator itr = deps.begin(); itr != deps.end(); ++itr) {
-    ptr dep(*itr);
-    for (io_set_t::const_iterator itr2 = dep->m_dependencies.begin(); itr2 != dep->m_dependencies.end(); ++itr2) {
-      base.insert(*itr2);
+action::Dependencies convert_dependencies(uint64_t start_time,
+                                          const io_set_t &deps) {
+  action::Dependencies action_deps;
+  action_deps.reserve(deps.size());
+  for (io_set_t::const_iterator it = deps.begin(); it != deps.end(); ++it) {
+    boost::shared_ptr<IO> io = *it;
+    uint64_t time_delta = 0;
+    if (start_time >= io->start_time()) {
+      time_delta = start_time - io->start_time();
     }
+    action_deps.push_back(action::Dependency(io->ionum(), time_delta));
   }
-  batch_unreachable_from(deps, base, &m_dependencies);
+  std::sort(action_deps.begin(), action_deps.end(),
+            compare_dependencies_by_start_time);
+  return action_deps;
 }
 
+} // anonymous namespace
+
 void IO::write_debug_base(ostream& out, string type) const {
   out << m_ionum << ": " << m_start_time / 1000000.0 << ": " << type << ", thread = " << m_thread_id << ", deps = {";
   bool first = true;
@@ -62,150 +58,40 @@ void IO::write_debug_base(ostream& out, string type) const {
     }
     out << (*itr)->m_ionum << ": " << m_start_time - (*itr)->m_start_time;
   }
-  out << "}, num_successors = " << m_num_successors << ", numCompletionSuccessors = " << num_completion_successors();
+  out << "}";
 }
 
 
-void IO::write_to(Ser& out, io_type iotype) const {
-  out.write_uint8_t(iotype);
-  out.write_uint32_t(m_ionum);
-  out.write_uint64_t(m_thread_id);
-  out.write_uint32_t(m_num_successors);
-  out.write_uint32_t(num_completion_successors());
-  out.write_uint32_t(m_dependencies.size());
-  vector<IO::ptr> deps;
-  for (io_set_t::const_iterator itr = m_dependencies.begin(), end = m_dependencies.end(); itr != end; ++itr) {
-    deps.push_back(*itr);
-  }
-  sort(deps.begin(), deps.end(), compare_io_ptrs_by_start_time);
-  for (vector<IO::ptr>::const_iterator itr = deps.begin(), end = deps.end(); itr != end; ++itr) {
-    out.write_uint32_t((*itr)->m_ionum);
-    out.write_uint64_t(m_start_time - (*itr)->m_start_time);
-  }
-}
-
-IO::ptr IO::create_completion(uint64_t start_time, thread_id_t thread_id) {
-  assert(!m_completion.lock());
-  IO::ptr completion(new CompletionIO(m_ionum + 1, start_time, thread_id));
-  m_completion = completion;
-  completion->m_dependencies.insert(shared_from_this());
-  return completion;
-}
-
-
-// TODO: Add unit tests
-// Anything in 'deps' which is not reachable from 'base' is added to 'unreachable'
-void rbd_replay::batch_unreachable_from(const io_set_t& deps, const io_set_t& base, io_set_t* unreachable) {
-  if (deps.empty()) {
-    return;
-  }
-
-  map<action_id_t, IO::ptr> searching_for;
-  for (io_set_t::const_iterator itr = deps.begin(); itr != deps.end(); ++itr) {
-    searching_for[(*itr)->ionum()] = *itr;
-  }
-
-  map<action_id_t, IO::ptr> boundary;
-  for (io_set_t::const_iterator itr = base.begin(); itr != base.end(); ++itr) {
-    boundary[(*itr)->ionum()] = *itr;
-  }
-
-  // The boundary horizon is the maximum timestamp of IOs in the boundary.
-  // This monotonically decreases, because dependencies (which are added to the set)
-  // have earlier timestamp than the dependent IOs (which were just removed from the set).
-  uint64_t boundary_horizon = max_time(boundary);
-
-  for (io_map_t::iterator itr = searching_for.begin(); itr != searching_for.end(); ) {
-    if (boundary_horizon >= itr->second->start_time()) {
-      break;
-    }
-    unreachable->insert(itr->second);
-    searching_for.erase(itr++);
-  }
-  if (searching_for.empty()) {
-    return;
-  }
-
-  // The searching horizon is the minimum timestamp of IOs in the searching set.
-  // This monotonically increases, because elements are only removed from the set.
-  uint64_t searching_horizon = min_time(searching_for);
-
-  while (!boundary.empty()) {
-    // Take an IO from the end, which has the highest timestamp.
-    // This reduces the boundary horizon as early as possible,
-    // which means we can short cut as soon as possible.
-    map<action_id_t, boost::shared_ptr<IO> >::iterator b_itr(boundary.end());
-    --b_itr;
-    boost::shared_ptr<IO> io(b_itr->second);
-    boundary.erase(b_itr);
-
-    for (io_set_t::const_iterator itr = io->dependencies().begin(), end = io->dependencies().end(); itr != end; ++itr) {
-      IO::ptr dep(*itr);
-      assertf(dep->ionum() < io->ionum(), "IO: %d, dependency: %d", io->ionum(), dep->ionum());
-      io_map_t::iterator p = searching_for.find(dep->ionum());
-      if (p != searching_for.end()) {
-	searching_for.erase(p);
-	if (dep->start_time() == searching_horizon) {
-	  searching_horizon = min_time(searching_for);
-	  if (searching_horizon == 0) {
-	    return;
-	  }
-	}
-      }
-      boundary[dep->ionum()] = dep;
-    }
-
-    boundary_horizon = max_time(boundary);
-    if (boundary_horizon != 0) {
-      // Anything we're searching for that has a timestamp greater than the
-      // boundary horizon will never be found, since the boundary horizon
-      // falls monotonically.
-      for (io_map_t::iterator itr = searching_for.begin(); itr != searching_for.end(); ) {
-	if (boundary_horizon >= itr->second->start_time()) {
-	  break;
-	}
-	unreachable->insert(itr->second);
-	searching_for.erase(itr++);
-      }
-      searching_horizon = min_time(searching_for);
-      if (searching_horizon == 0) {
-	return;
-      }
-    }
-  }
-
-  // Anything we're still searching for has not been found.
-  for (io_map_t::iterator itr = searching_for.begin(), end = searching_for.end(); itr != end; ++itr) {
-    unreachable->insert(itr->second);
-  }
-}
-
 ostream& operator<<(ostream& out, IO::ptr io) {
   io->write_debug(out);
   return out;
 }
 
-void StartThreadIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_START_THREAD);
+void StartThreadIO::encode(bufferlist &bl) const {
+  action::Action action((action::StartThreadAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()))));
+  ::encode(action, bl);
 }
 
 void StartThreadIO::write_debug(std::ostream& out) const {
   write_debug_base(out, "start thread");
 }
 
-void StopThreadIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_STOP_THREAD);
+void StopThreadIO::encode(bufferlist &bl) const {
+  action::Action action((action::StopThreadAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()))));
+  ::encode(action, bl);
 }
 
 void StopThreadIO::write_debug(std::ostream& out) const {
   write_debug_base(out, "stop thread");
 }
 
-void ReadIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_READ);
-  out.write_uint64_t(m_imagectx);
-  out.write_uint64_t(m_offset);
-  out.write_uint64_t(m_length);
+void ReadIO::encode(bufferlist &bl) const {
+  action::Action action((action::ReadAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()),
+    m_imagectx, m_offset, m_length)));
+  ::encode(action, bl);
 }
 
 void ReadIO::write_debug(std::ostream& out) const {
@@ -213,11 +99,11 @@ void ReadIO::write_debug(std::ostream& out) const {
   out << ", imagectx=" << m_imagectx << ", offset=" << m_offset << ", length=" << m_length << "]";
 }
 
-void WriteIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_WRITE);
-  out.write_uint64_t(m_imagectx);
-  out.write_uint64_t(m_offset);
-  out.write_uint64_t(m_length);
+void WriteIO::encode(bufferlist &bl) const {
+  action::Action action((action::WriteAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()),
+    m_imagectx, m_offset, m_length)));
+  ::encode(action, bl);
 }
 
 void WriteIO::write_debug(std::ostream& out) const {
@@ -225,11 +111,11 @@ void WriteIO::write_debug(std::ostream& out) const {
   out << ", imagectx=" << m_imagectx << ", offset=" << m_offset << ", length=" << m_length << "]";
 }
 
-void AioReadIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_ASYNC_READ);
-  out.write_uint64_t(m_imagectx);
-  out.write_uint64_t(m_offset);
-  out.write_uint64_t(m_length);
+void AioReadIO::encode(bufferlist &bl) const {
+  action::Action action((action::AioReadAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()),
+    m_imagectx, m_offset, m_length)));
+  ::encode(action, bl);
 }
 
 void AioReadIO::write_debug(std::ostream& out) const {
@@ -237,11 +123,11 @@ void AioReadIO::write_debug(std::ostream& out) const {
   out << ", imagectx=" << m_imagectx << ", offset=" << m_offset << ", length=" << m_length << "]";
 }
 
-void AioWriteIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_ASYNC_WRITE);
-  out.write_uint64_t(m_imagectx);
-  out.write_uint64_t(m_offset);
-  out.write_uint64_t(m_length);
+void AioWriteIO::encode(bufferlist &bl) const {
+  action::Action action((action::AioWriteAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()),
+    m_imagectx, m_offset, m_length)));
+  ::encode(action, bl);
 }
 
 void AioWriteIO::write_debug(std::ostream& out) const {
@@ -249,12 +135,11 @@ void AioWriteIO::write_debug(std::ostream& out) const {
   out << ", imagectx=" << m_imagectx << ", offset=" << m_offset << ", length=" << m_length << "]";
 }
 
-void OpenImageIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_OPEN_IMAGE);
-  out.write_uint64_t(m_imagectx);
-  out.write_string(m_name);
-  out.write_string(m_snap_name);
-  out.write_bool(m_readonly);
+void OpenImageIO::encode(bufferlist &bl) const {
+  action::Action action((action::OpenImageAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()),
+    m_imagectx, m_name, m_snap_name, m_readonly)));
+  ::encode(action, bl);
 }
 
 void OpenImageIO::write_debug(std::ostream& out) const {
@@ -262,9 +147,11 @@ void OpenImageIO::write_debug(std::ostream& out) const {
   out << ", imagectx=" << m_imagectx << ", name='" << m_name << "', snap_name='" << m_snap_name << "', readonly=" << m_readonly;
 }
 
-void CloseImageIO::write_to(Ser& out) const {
-  IO::write_to(out, IO_CLOSE_IMAGE);
-  out.write_uint64_t(m_imagectx);
+void CloseImageIO::encode(bufferlist &bl) const {
+  action::Action action((action::CloseImageAction(
+    ionum(), thread_id(), convert_dependencies(start_time(), dependencies()),
+    m_imagectx)));
+  ::encode(action, bl);
 }
 
 void CloseImageIO::write_debug(std::ostream& out) const {
diff --git a/src/rbd_replay/ios.hpp b/src/rbd_replay/ios.hpp
index 5bebcd7..1755933 100644
--- a/src/rbd_replay/ios.hpp
+++ b/src/rbd_replay/ios.hpp
@@ -18,13 +18,14 @@
 // This code assumes that IO IDs and timestamps are related monotonically.
 // In other words, (a.id < b.id) == (a.timestamp < b.timestamp) for all IOs a and b.
 
+#include "include/buffer.h"
 #include <boost/enable_shared_from_this.hpp>
 #include <boost/shared_ptr.hpp>
 #include <iostream>
 #include <map>
 #include <set>
+#include <vector>
 #include "actions.hpp"
-#include "Ser.hpp"
 
 
 namespace rbd_replay {
@@ -36,23 +37,6 @@ typedef std::set<boost::shared_ptr<IO> > io_set_t;
 typedef std::map<action_id_t, boost::shared_ptr<IO> > io_map_t;
 
 /**
-   Calculates reachability of IOs in the dependency graph.
-   All IOs in \c deps which are not transitive dependencies of anything in \c base
-   is added to \c unreachable.
-   In other words, for every IO \c x in \c deps: if nothing in \c base depends on \c x,
-   and nothing in \c base has dependencies that depend on \c x, etc.,
-   then \c x is added to \c unreachable.
-   Note that \c unreachable is \em not cleared, so the same set can be used across multiple
-   calls to collect dependencies.
-   @param[in] deps IOs to search for
-   @param[in] base root set of IOs to search from
-   @param[out] unreachable collects unreachable IOs
-   @related IO
-*/
-void batch_unreachable_from(const io_set_t& deps, const io_set_t& base, io_set_t* unreachable);
-
-
-/**
    Used by rbd-replay-prep for processing the raw trace.
    Corresponds to the Action class, except that Actions are executed by rbd-replay,
    and IOs are used by rbd-replay-prep for processing the raw trace.
@@ -60,26 +44,22 @@ void batch_unreachable_from(const io_set_t& deps, const io_set_t& base, io_set_t
 class IO : public boost::enable_shared_from_this<IO> {
 public:
   typedef boost::shared_ptr<IO> ptr;
-
-  typedef boost::weak_ptr<IO> weak_ptr;
+  typedef std::vector<ptr> ptrs;
 
   /**
      @param ionum ID of this %IO
      @param start_time time the %IO started, in nanoseconds
      @param thread_id ID of the thread that issued the %IO
-     @param prev previously issued %IO on the same thread.  NULL for the first %IO on a thread.
    */
   IO(action_id_t ionum,
      uint64_t start_time,
      thread_id_t thread_id,
-     ptr prev)
+     const io_set_t& deps)
     : m_ionum(ionum),
       m_start_time(start_time),
-      m_dependencies(io_set_t()),
-      m_completion(weak_ptr()),
-      m_num_successors(0),
+      m_dependencies(deps),
       m_thread_id(thread_id),
-      m_prev(prev) {
+      m_completed(false) {
   }
 
   virtual ~IO() {
@@ -97,21 +77,7 @@ public:
     return m_dependencies;
   }
 
-  void add_dependencies(const io_set_t& deps);
-
-  /**
-     Returns the completion's number of successors, or 0 if the %IO does not have a completion.
-   */
-  uint64_t num_completion_successors() const {
-    ptr c(m_completion.lock());
-    return c ? c->m_num_successors : 0;
-  }
-
-  virtual void write_to(Ser& out) const = 0;
-
-  virtual bool is_completion() const {
-    return false;
-  }
+  virtual void encode(bufferlist &bl) const = 0;
 
   void set_ionum(action_id_t ionum) {
     m_ionum = ionum;
@@ -121,40 +87,21 @@ public:
     return m_ionum;
   }
 
-  ptr prev() const {
-    return m_prev;
-  }
-
-  void set_num_successors(uint32_t n) {
-    m_num_successors = n;
-  }
-
-  uint32_t num_successors() const {
-    return m_num_successors;
+  thread_id_t thread_id() const {
+    return m_thread_id;
   }
 
   virtual void write_debug(std::ostream& out) const = 0;
 
-  /**
-     Creates the completion for this IO.
-     This may only be called once per IO, and may not be called on completion IOs.
-     The completion must be stored, or else m_completion will expire.
-   */
-  ptr create_completion(uint64_t start_time, thread_id_t thread_id);
-
 protected:
-  void write_to(Ser& out, io_type iotype) const;
-
   void write_debug_base(std::ostream& out, std::string iotype) const;
 
 private:
   action_id_t m_ionum;
   uint64_t m_start_time;
   io_set_t m_dependencies;
-  boost::weak_ptr<IO> m_completion;
-  uint32_t m_num_successors;
   thread_id_t m_thread_id;
-  ptr m_prev;
+  bool m_completed;
 };
 
 /// Used for dumping debug info.
@@ -167,10 +114,10 @@ public:
   StartThreadIO(action_id_t ionum,
 		uint64_t start_time,
 		thread_id_t thread_id)
-    : IO(ionum, start_time, thread_id, IO::ptr()) {
+    : IO(ionum, start_time, thread_id, io_set_t()) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   void write_debug(std::ostream& out) const;
 };
@@ -179,11 +126,12 @@ class StopThreadIO : public IO {
 public:
   StopThreadIO(action_id_t ionum,
 	       uint64_t start_time,
-	       thread_id_t thread_id)
-    : IO(ionum, start_time, thread_id, IO::ptr()) {
+	       thread_id_t thread_id,
+               const io_set_t& deps)
+    : IO(ionum, start_time, thread_id, deps) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   void write_debug(std::ostream& out) const;
 };
@@ -193,17 +141,17 @@ public:
   ReadIO(action_id_t ionum,
 	 uint64_t start_time,
 	 thread_id_t thread_id,
-	 IO::ptr prev,
+         const io_set_t& deps,
 	 imagectx_id_t imagectx,
 	 uint64_t offset,
 	 uint64_t length)
-    : IO(ionum, start_time, thread_id, prev),
+    : IO(ionum, start_time, thread_id, deps),
       m_imagectx(imagectx),
       m_offset(offset),
       m_length(length) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   void write_debug(std::ostream& out) const;
 
@@ -218,17 +166,17 @@ public:
   WriteIO(action_id_t ionum,
 	  uint64_t start_time,
 	  thread_id_t thread_id,
-	  IO::ptr prev,
+          const io_set_t& deps,
 	  imagectx_id_t imagectx,
 	  uint64_t offset,
 	  uint64_t length)
-    : IO(ionum, start_time, thread_id, prev),
+    : IO(ionum, start_time, thread_id, deps),
       m_imagectx(imagectx),
       m_offset(offset),
       m_length(length) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   void write_debug(std::ostream& out) const;
 
@@ -243,17 +191,17 @@ public:
   AioReadIO(action_id_t ionum,
 	    uint64_t start_time,
 	    thread_id_t thread_id,
-	    IO::ptr prev,
+            const io_set_t& deps,
 	    imagectx_id_t imagectx,
 	    uint64_t offset,
 	    uint64_t length)
-    : IO(ionum, start_time, thread_id, prev),
+    : IO(ionum, start_time, thread_id, deps),
       m_imagectx(imagectx),
       m_offset(offset),
       m_length(length) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   void write_debug(std::ostream& out) const;
 
@@ -268,17 +216,17 @@ public:
   AioWriteIO(action_id_t ionum,
 	     uint64_t start_time,
 	     thread_id_t thread_id,
-	     IO::ptr prev,
+             const io_set_t& deps,
 	     imagectx_id_t imagectx,
 	     uint64_t offset,
 	     uint64_t length)
-    : IO(ionum, start_time, thread_id, prev),
+    : IO(ionum, start_time, thread_id, deps),
       m_imagectx(imagectx),
       m_offset(offset),
       m_length(length) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   void write_debug(std::ostream& out) const;
 
@@ -293,19 +241,19 @@ public:
   OpenImageIO(action_id_t ionum,
 	      uint64_t start_time,
 	      thread_id_t thread_id,
-	      IO::ptr prev,
+              const io_set_t& deps,
 	      imagectx_id_t imagectx,
 	      const std::string& name,
 	      const std::string& snap_name,
 	      bool readonly)
-    : IO(ionum, start_time, thread_id, prev),
+    : IO(ionum, start_time, thread_id, deps),
       m_imagectx(imagectx),
       m_name(name),
       m_snap_name(snap_name),
       m_readonly(readonly) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   imagectx_id_t imagectx() const {
     return m_imagectx;
@@ -325,13 +273,13 @@ public:
   CloseImageIO(action_id_t ionum,
 	       uint64_t start_time,
 	       thread_id_t thread_id,
-	       IO::ptr prev,
+               const io_set_t& deps,
 	       imagectx_id_t imagectx)
-    : IO(ionum, start_time, thread_id, prev),
+    : IO(ionum, start_time, thread_id, deps),
       m_imagectx(imagectx) {
   }
 
-  void write_to(Ser& out) const;
+  virtual void encode(bufferlist &bl) const;
 
   imagectx_id_t imagectx() const {
     return m_imagectx;
@@ -343,29 +291,6 @@ private:
   imagectx_id_t m_imagectx;
 };
 
-class CompletionIO : public IO {
-public:
-  CompletionIO(action_id_t ionum,
-	       uint64_t start_time,
-	       thread_id_t thread_id)
-    : IO(ionum, start_time, thread_id, IO::ptr()) {
-  }
-
-  void write_to(Ser& out) const {
-  }
-
-  bool is_completion() const {
-    return true;
-  }
-
-  void write_debug(std::ostream& out) const {
-    write_debug_base(out, "completion");
-  }
-};
-
-/// @related IO
-bool compare_io_ptrs_by_start_time(IO::ptr p1, IO::ptr p2);
-
 }
 
 #endif
diff --git a/src/rbd_replay/rbd-replay-prep.cc b/src/rbd_replay/rbd-replay-prep.cc
index 8a602e0..61cff59 100644
--- a/src/rbd_replay/rbd-replay-prep.cc
+++ b/src/rbd_replay/rbd-replay-prep.cc
@@ -15,19 +15,30 @@
 // This code assumes that IO IDs and timestamps are related monotonically.
 // In other words, (a.id < b.id) == (a.timestamp < b.timestamp) for all IOs a and b.
 
+#include "common/errno.h"
+#include "rbd_replay/ActionTypes.h"
 #include <babeltrace/babeltrace.h>
 #include <babeltrace/ctf/events.h>
 #include <babeltrace/ctf/iterator.h>
+#include <sys/types.h>
+#include <fcntl.h>
 #include <cstdlib>
 #include <string>
 #include <assert.h>
 #include <fstream>
+#include <set>
 #include <boost/thread/thread.hpp>
+#include <boost/scope_exit.hpp>
 #include "ios.hpp"
 
 using namespace std;
 using namespace rbd_replay;
 
+#define ASSERT_EXIT(check, str)    \
+  if (!(check)) {                  \
+    std::cerr << str << std::endl; \
+    exit(1);                       \
+  }
 
 class Thread {
 public:
@@ -37,7 +48,6 @@ public:
 	 uint64_t window)
     : m_id(id),
       m_window(window),
-      m_pending_io(IO::ptr()),
       m_latest_io(IO::ptr()),
       m_max_ts(0) {
   }
@@ -52,35 +62,26 @@ public:
     return m_max_ts;
   }
 
-  void issued_io(IO::ptr io, const map<thread_id_t, ptr>& threads) {
+  void issued_io(IO::ptr io, std::set<IO::ptr> *latest_ios) {
     assert(io);
-    io_set_t latest_ios;
-    for (map<thread_id_t, ptr>::const_iterator itr = threads.begin(), end = threads.end(); itr != end; ++itr) {
-      assertf(itr->second, "id = %ld", itr->first);
-      ptr thread(itr->second);
-      if (thread->m_latest_io) {
-	if (thread->m_latest_io->start_time() + m_window > io->start_time()) {
-	  latest_ios.insert(thread->m_latest_io);
-	}
-      }
+    if (m_latest_io.get() != NULL) {
+      latest_ios->erase(m_latest_io);
     }
-    io->add_dependencies(latest_ios);
     m_latest_io = io;
-    m_pending_io = io;
+    latest_ios->insert(io);
   }
 
   thread_id_t id() const {
     return m_id;
   }
 
-  IO::ptr pending_io() {
-    return m_pending_io;
+  IO::ptr latest_io() {
+    return m_latest_io;
   }
 
 private:
   thread_id_t m_id;
   uint64_t m_window;
-  IO::ptr m_pending_io;
   IO::ptr m_latest_io;
   uint64_t m_max_ts;
 };
@@ -119,7 +120,10 @@ private:
 };
 
 static void usage(string prog) {
-  cout << "Usage: " << prog << " [ --window <seconds> ] [ --anonymize ] <trace-input> <replay-output>" << endl;
+  std::stringstream str;
+  str << "Usage: " << prog << " ";
+  std::cout << str.str() << "[ --window <seconds> ] [ --anonymize ] [ --verbose ]" << std::endl
+            << std::string(str.str().size(), ' ') << "<trace-input> <replay-output>" << endl;
 }
 
 __attribute__((noreturn)) static void usage_exit(string prog, string msg) {
@@ -132,14 +136,9 @@ class Processor {
 public:
   Processor()
     : m_window(1000000000ULL), // 1 billion nanoseconds, i.e., one second
-      m_threads(),
       m_io_count(0),
-      m_recent_completions(io_set_t()),
-      m_open_images(set<imagectx_id_t>()),
-      m_ios(vector<IO::ptr>()),
-      m_pending_ios(map<uint64_t, IO::ptr>()),
       m_anonymize(false),
-      m_anonymized_images(map<string, AnonymizedImage>()) {
+      m_verbose(false) {
   }
 
   void run(vector<string> args) {
@@ -155,11 +154,11 @@ public:
 	}
 	m_window = (uint64_t)(1e9 * atof(args[++i].c_str()));
       } else if (arg.find("--window=") == 0) {
-	// TODO: test
-	printf("Arg: '%s'\n", arg.c_str() + sizeof("--window="));
 	m_window = (uint64_t)(1e9 * atof(arg.c_str() + sizeof("--window=")));
       } else if (arg == "--anonymize") {
 	m_anonymize = true;
+      } else if (arg == "--verbose") {
+        m_verbose = true;
       } else if (arg == "-h" || arg == "--help") {
 	usage(args[0]);
 	exit(0);
@@ -186,10 +185,11 @@ public:
 					    NULL, // packet_seek
 					    NULL, // stream_list
 					    NULL); // metadata
-    assertf(trace_handle >= 0, "trace_handle = %d", trace_handle);
+    ASSERT_EXIT(trace_handle >= 0, "Error loading trace file");
 
     uint64_t start_time_ns = bt_trace_handle_get_timestamp_begin(ctx, trace_handle, BT_CLOCK_REAL);
-    assert(start_time_ns != -1ULL);
+    ASSERT_EXIT(start_time_ns != -1ULL,
+                "Error extracting creation time from trace");
 
     struct bt_ctf_iter *itr = bt_ctf_iter_create(ctx,
 						 NULL, // begin_pos
@@ -198,6 +198,15 @@ public:
 
     struct bt_iter *bt_itr = bt_ctf_get_iter(itr);
 
+    int fd = open(output_file_name.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0644);
+    ASSERT_EXIT(fd >= 0, "Error opening output file " << output_file_name <<
+                         ": " << cpp_strerror(errno));
+    BOOST_SCOPE_EXIT( (fd) ) {
+      close(fd);
+    } BOOST_SCOPE_EXIT_END;
+
+    write_banner(fd);
+
     uint64_t trace_start = 0;
     bool first = true;
     while(true) {
@@ -206,7 +215,7 @@ public:
 	break;
       }
       uint64_t ts = bt_ctf_get_timestamp(evt);
-      assert(ts != -1ULL);
+      ASSERT_EXIT(ts != -1ULL, "Error extracting event timestamp");
 
       if (first) {
 	trace_start = ts;
@@ -215,100 +224,76 @@ public:
       ts -= trace_start;
       ts += 4; // This is so we have room to insert two events (thread start and open image) at unique timestamps before whatever the first event is.
 
-      process_event(ts, evt);
+      IO::ptrs ptrs;
+      process_event(ts, evt, &ptrs);
+      serialize_events(fd, ptrs);
 
       int r = bt_iter_next(bt_itr);
-      assert(!r);
+      ASSERT_EXIT(r == 0, "Error advancing event iterator");
     }
 
     bt_ctf_iter_destroy(itr);
 
-    insert_thread_stops();
+    insert_thread_stops(fd);
+  }
 
-    for (vector<IO::ptr>::const_iterator itr = m_ios.begin(); itr != m_ios.end(); ++itr) {
-      IO::ptr io(*itr);
-      IO::ptr prev(io->prev());
-      if (prev) {
-	// TODO: explain when prev is and isn't a dep
-	io_set_t::iterator depitr = io->dependencies().find(prev);
-	if (depitr != io->dependencies().end()) {
-	  io->dependencies().erase(depitr);
-	}
-      }
-      if (io->is_completion()) {
-	io->dependencies().clear();
-      }
-      for (io_set_t::const_iterator depitr = io->dependencies().begin(); depitr != io->dependencies().end(); ++depitr) {
-	IO::ptr dep(*depitr);
-	dep->set_num_successors(dep->num_successors() + 1);
-      }
-    }
+private:
+  void write_banner(int fd) {
+    bufferlist bl;
+    bl.append(rbd_replay::action::BANNER);
+    int r = bl.write_fd(fd);
+    ASSERT_EXIT(r >= 0, "Error writing to output file: " << cpp_strerror(r));
+  }
+
+  void serialize_events(int fd, const IO::ptrs &ptrs) {
+    for (IO::ptrs::const_iterator it = ptrs.begin(); it != ptrs.end(); ++it) {
+      IO::ptr io(*it);
+
+      bufferlist bl;
+      io->encode(bl);
 
-    ofstream myfile;
-    myfile.open(output_file_name.c_str(), ios::out | ios::binary);
-    Ser ser(myfile);
-    for (vector<IO::ptr>::iterator itr = m_ios.begin(); itr != m_ios.end(); ++itr) {
-      (*itr)->write_to(ser);
+      int r = bl.write_fd(fd);
+      ASSERT_EXIT(r >= 0, "Error writing to output file: " << cpp_strerror(r));
+
+      if (m_verbose) {
+        io->write_debug(std::cout);
+        std::cout << std::endl;
+      }
     }
-    myfile.close();
   }
 
-private:
-  void insert_thread_stops() {
-    sort(m_ios.begin(), m_ios.end(), compare_io_ptrs_by_start_time);
-    for (map<thread_id_t, Thread::ptr>::const_iterator itr = m_threads.begin(), end = m_threads.end(); itr != end; ++itr) {
+  void insert_thread_stops(int fd) {
+    IO::ptrs ios;
+    for (map<thread_id_t, Thread::ptr>::const_iterator itr = m_threads.begin(),
+         end = m_threads.end(); itr != end; ++itr) {
       Thread::ptr thread(itr->second);
-      const action_id_t none = -1;
-      action_id_t ionum = none;
-      action_id_t maxIONum = 0; // only valid if ionum is none
-      for (vector<IO::ptr>::const_iterator itr2 = m_ios.begin(); itr2 != m_ios.end(); ++itr2) {
-	IO::ptr io(*itr2);
-	if (io->ionum() > maxIONum) {
-	  maxIONum = io->ionum();
-	}
-	if (io->start_time() > thread->max_ts()) {
-	  ionum = io->ionum();
-	  if (ionum & 1) {
-	    ionum++;
-	  }
-	  break;
-	}
-      }
-      if (ionum == none) {
-	if (maxIONum & 1) {
-	  maxIONum--;
-	}
-	ionum = maxIONum + 2;
-      }
-      for (vector<IO::ptr>::const_iterator itr2 = m_ios.begin(); itr2 != m_ios.end(); ++itr2) {
-	IO::ptr io(*itr2);
-	if (io->ionum() >= ionum) {
-	  io->set_ionum(io->ionum() + 2);
-	}
-      }
-      IO::ptr stop_thread_io(new StopThreadIO(ionum, thread->max_ts(), thread->id()));
-      vector<IO::ptr>::iterator insertion_point = lower_bound(m_ios.begin(), m_ios.end(), stop_thread_io, compare_io_ptrs_by_start_time);
-      m_ios.insert(insertion_point, stop_thread_io);
+      ios.push_back(IO::ptr(new StopThreadIO(next_id(), thread->max_ts(),
+                                             thread->id(),
+                                             m_recent_completions)));
     }
+    serialize_events(fd, ios);
   }
 
-  void process_event(uint64_t ts, struct bt_ctf_event *evt) {
+  void process_event(uint64_t ts, struct bt_ctf_event *evt,
+                     IO::ptrs *ios) {
     const char *event_name = bt_ctf_event_name(evt);
     const struct bt_definition *scope_context = bt_ctf_get_top_level_scope(evt,
 									   BT_STREAM_EVENT_CONTEXT);
-    assert(scope_context);
+    ASSERT_EXIT(scope_context != NULL, "Error retrieving event context");
+
     const struct bt_definition *scope_fields = bt_ctf_get_top_level_scope(evt,
 									  BT_EVENT_FIELDS);
-    assert(scope_fields);
+    ASSERT_EXIT(scope_fields != NULL, "Error retrieving event fields");
 
     const struct bt_definition *pthread_id_field = bt_ctf_get_field(evt, scope_context, "pthread_id");
-    assert(pthread_id_field);
+    ASSERT_EXIT(pthread_id_field != NULL, "Error retrieving thread id");
+
     thread_id_t threadID = bt_ctf_get_uint64(pthread_id_field);
     Thread::ptr &thread(m_threads[threadID]);
     if (!thread) {
       thread.reset(new Thread(threadID, m_window));
       IO::ptr io(new StartThreadIO(next_id(), ts - 4, threadID));
-      m_ios.push_back(io);
+      ios->push_back(io);
     }
     thread->insert_ts(ts);
 
@@ -322,28 +307,34 @@ private:
 
       const char* string(const char* name) {
 	const struct bt_definition *field = bt_ctf_get_field(m_evt, m_scope, name);
-	assertf(field, "field name = '%s'", name);
+        ASSERT_EXIT(field != NULL, "Error retrieving field '" << name << "'");
+
 	const char* c = bt_ctf_get_string(field);
 	int err = bt_ctf_field_get_error();
-	assertf(c && err == 0, "field name = '%s', err = %d", name, err);
+        ASSERT_EXIT(c && err == 0, "Error retrieving field value '" << name <<
+                                   "': error=" << err);
 	return c;
       }
 
       int64_t int64(const char* name) {
 	const struct bt_definition *field = bt_ctf_get_field(m_evt, m_scope, name);
-	assertf(field, "field name = '%s'", name);
+        ASSERT_EXIT(field != NULL, "Error retrieving field '" << name << "'");
+
 	int64_t val = bt_ctf_get_int64(field);
 	int err = bt_ctf_field_get_error();
-	assertf(err == 0, "field name = '%s', err = %d", name, err);
+        ASSERT_EXIT(err == 0, "Error retrieving field value '" << name <<
+                              "': error=" << err);
 	return val;
       }
 
       uint64_t uint64(const char* name) {
 	const struct bt_definition *field = bt_ctf_get_field(m_evt, m_scope, name);
-	assertf(field, "field name = '%s'", name);
+        ASSERT_EXIT(field != NULL, "Error retrieving field '" << name << "'");
+
 	uint64_t val = bt_ctf_get_uint64(field);
 	int err = bt_ctf_field_get_error();
-	assertf(err == 0, "field name = '%s', err = %d", name, err);
+        ASSERT_EXIT(err == 0, "Error retrieving field value '" << name <<
+                              "': error=" << err);
 	return val;
       }
 
@@ -352,72 +343,69 @@ private:
       const struct bt_definition *m_scope;
     } fields(evt, scope_fields);
 
-    if (strcmp(event_name, "librbd:read_enter") == 0) {
+    if (strcmp(event_name, "librbd:open_image_enter") == 0) {
       string name(fields.string("name"));
       string snap_name(fields.string("snap_name"));
-      bool readonly = fields.int64("read_only");
-      imagectx_id_t imagectx = fields.uint64("imagectx");
-      uint64_t offset = fields.uint64("offset");
-      uint64_t length = fields.uint64("length");
-      require_image(ts, thread, imagectx, name, snap_name, readonly);
-      action_id_t ionum = next_id();
-      IO::ptr io(new ReadIO(ionum, ts, threadID, thread->pending_io(), imagectx, offset, length));
-      io->add_dependencies(m_recent_completions);
-      thread->issued_io(io, m_threads);
-      m_ios.push_back(io);
-    } else if (strcmp(event_name, "librbd:open_image_enter") == 0) {
-      string name(fields.string("name"));
-      string snap_name(fields.string("snap_name"));
-      bool readonly = fields.int64("read_only");
+      bool readonly = fields.uint64("read_only");
       imagectx_id_t imagectx = fields.uint64("imagectx");
       action_id_t ionum = next_id();
       pair<string, string> aname(map_image_snap(name, snap_name));
-      IO::ptr io(new OpenImageIO(ionum, ts, threadID, thread->pending_io(), imagectx, aname.first, aname.second, readonly));
-      io->add_dependencies(m_recent_completions);
-      thread->issued_io(io, m_threads);
-      m_ios.push_back(io);
+      IO::ptr io(new OpenImageIO(ionum, ts, threadID, m_recent_completions,
+                                 imagectx, aname.first, aname.second,
+                                 readonly));
+      thread->issued_io(io, &m_latest_ios);
+      ios->push_back(io);
     } else if (strcmp(event_name, "librbd:open_image_exit") == 0) {
-      IO::ptr completionIO(thread->pending_io()->create_completion(ts, threadID));
-      m_ios.push_back(completionIO);
-      boost::shared_ptr<OpenImageIO> io(boost::dynamic_pointer_cast<OpenImageIO>(thread->pending_io()));
+      completed(thread->latest_io());
+      boost::shared_ptr<OpenImageIO> io(boost::dynamic_pointer_cast<OpenImageIO>(thread->latest_io()));
       assert(io);
       m_open_images.insert(io->imagectx());
     } else if (strcmp(event_name, "librbd:close_image_enter") == 0) {
       imagectx_id_t imagectx = fields.uint64("imagectx");
       action_id_t ionum = next_id();
-      IO::ptr io(new CloseImageIO(ionum, ts, threadID, thread->pending_io(), imagectx));
-      io->add_dependencies(m_recent_completions);
-      thread->issued_io(io, m_threads);
-      m_ios.push_back(thread->pending_io());
+      IO::ptr io(new CloseImageIO(ionum, ts, threadID, m_recent_completions,
+                                  imagectx));
+      thread->issued_io(io, &m_latest_ios);
+      ios->push_back(thread->latest_io());
     } else if (strcmp(event_name, "librbd:close_image_exit") == 0) {
-      IO::ptr completionIO(thread->pending_io()->create_completion(ts, threadID));
-      m_ios.push_back(completionIO);
-      completed(completionIO);
-      boost::shared_ptr<CloseImageIO> io(boost::dynamic_pointer_cast<CloseImageIO>(thread->pending_io()));
+      completed(thread->latest_io());
+      boost::shared_ptr<CloseImageIO> io(boost::dynamic_pointer_cast<CloseImageIO>(thread->latest_io()));
       assert(io);
       m_open_images.erase(io->imagectx());
+    } else if (strcmp(event_name, "librbd:read_enter") == 0 ||
+               strcmp(event_name, "librbd:read2_enter") == 0) {
+      string name(fields.string("name"));
+      string snap_name(fields.string("snap_name"));
+      bool readonly = fields.int64("read_only");
+      imagectx_id_t imagectx = fields.uint64("imagectx");
+      uint64_t offset = fields.uint64("offset");
+      uint64_t length = fields.uint64("length");
+      require_image(ts, thread, imagectx, name, snap_name, readonly, ios);
+      action_id_t ionum = next_id();
+      IO::ptr io(new ReadIO(ionum, ts, threadID, m_recent_completions, imagectx,
+                            offset, length));
+      thread->issued_io(io, &m_latest_ios);
+      ios->push_back(io);
     } else if (strcmp(event_name, "librbd:read_exit") == 0) {
-      IO::ptr completionIO(thread->pending_io()->create_completion(ts, threadID));
-      m_ios.push_back(completionIO);
-      completed(completionIO);
-    } else if (strcmp(event_name, "librbd:write_enter") == 0) {
+      completed(thread->latest_io());
+    } else if (strcmp(event_name, "librbd:write_enter") == 0 ||
+               strcmp(event_name, "librbd:write2_enter") == 0) {
       string name(fields.string("name"));
       string snap_name(fields.string("snap_name"));
       bool readonly = fields.int64("read_only");
       uint64_t offset = fields.uint64("off");
       uint64_t length = fields.uint64("buf_len");
       imagectx_id_t imagectx = fields.uint64("imagectx");
-      require_image(ts, thread, imagectx, name, snap_name, readonly);
+      require_image(ts, thread, imagectx, name, snap_name, readonly, ios);
       action_id_t ionum = next_id();
-      IO::ptr io(new WriteIO(ionum, ts, threadID, thread->pending_io(), imagectx, offset, length));
-      io->add_dependencies(m_recent_completions);
-      thread->issued_io(io, m_threads);
-      m_ios.push_back(io);
+      IO::ptr io(new WriteIO(ionum, ts, threadID, m_recent_completions,
+                             imagectx, offset, length));
+      thread->issued_io(io, &m_latest_ios);
+      ios->push_back(io);
     } else if (strcmp(event_name, "librbd:write_exit") == 0) {
-      IO::ptr completionIO(thread->pending_io()->create_completion(ts, threadID));
-      m_ios.push_back(completionIO);
-      completed(completionIO);
-    } else if (strcmp(event_name, "librbd:aio_read_enter") == 0) {
+      completed(thread->latest_io());
+    } else if (strcmp(event_name, "librbd:aio_read_enter") == 0 ||
+               strcmp(event_name, "librbd:aio_read2_enter") == 0) {
       string name(fields.string("name"));
       string snap_name(fields.string("snap_name"));
       bool readonly = fields.int64("read_only");
@@ -425,14 +413,15 @@ private:
       imagectx_id_t imagectx = fields.uint64("imagectx");
       uint64_t offset = fields.uint64("offset");
       uint64_t length = fields.uint64("length");
-      require_image(ts, thread, imagectx, name, snap_name, readonly);
+      require_image(ts, thread, imagectx, name, snap_name, readonly, ios);
       action_id_t ionum = next_id();
-      IO::ptr io(new AioReadIO(ionum, ts, threadID, thread->pending_io(), imagectx, offset, length));
-      io->add_dependencies(m_recent_completions);
-      m_ios.push_back(io);
-      thread->issued_io(io, m_threads);
+      IO::ptr io(new AioReadIO(ionum, ts, threadID, m_recent_completions,
+                               imagectx, offset, length));
+      ios->push_back(io);
+      thread->issued_io(io, &m_latest_ios);
       m_pending_ios[completion] = io;
-    } else if (strcmp(event_name, "librbd:aio_write_enter") == 0) {
+    } else if (strcmp(event_name, "librbd:aio_write_enter") == 0 ||
+               strcmp(event_name, "librbd:aio_write2_enter") == 0) {
       string name(fields.string("name"));
       string snap_name(fields.string("snap_name"));
       bool readonly = fields.int64("read_only");
@@ -440,12 +429,12 @@ private:
       uint64_t length = fields.uint64("len");
       uint64_t completion = fields.uint64("completion");
       imagectx_id_t imagectx = fields.uint64("imagectx");
-      require_image(ts, thread, imagectx, name, snap_name, readonly);
+      require_image(ts, thread, imagectx, name, snap_name, readonly, ios);
       action_id_t ionum = next_id();
-      IO::ptr io(new AioWriteIO(ionum, ts, threadID, thread->pending_io(), imagectx, offset, length));
-      io->add_dependencies(m_recent_completions);
-      thread->issued_io(io, m_threads);
-      m_ios.push_back(io);
+      IO::ptr io(new AioWriteIO(ionum, ts, threadID, m_recent_completions,
+                                imagectx, offset, length));
+      thread->issued_io(io, &m_latest_ios);
+      ios->push_back(io);
       m_pending_ios[completion] = io;
     } else if (strcmp(event_name, "librbd:aio_complete_enter") == 0) {
       uint64_t completion = fields.uint64("completion");
@@ -453,13 +442,9 @@ private:
       if (itr != m_pending_ios.end()) {
 	IO::ptr completedIO(itr->second);
 	m_pending_ios.erase(itr);
-	IO::ptr completionIO(completedIO->create_completion(ts, threadID));
-	m_ios.push_back(completionIO);
-	completed(completionIO);
+        completed(completedIO);
       }
     }
-
-    //        cout << ts << "\t" << event_name << "\tthreadID = " << threadID << endl;
   }
 
   action_id_t next_id() {
@@ -469,9 +454,14 @@ private:
   }
 
   void completed(IO::ptr io) {
-    uint64_t limit = io->start_time() < m_window ? 0 : io->start_time() - m_window;
-    for (io_set_t::iterator itr = m_recent_completions.begin(); itr != m_recent_completions.end(); ) {
-      if ((*itr)->start_time() < limit) {
+    uint64_t limit = (io->start_time() < m_window ?
+      0 : io->start_time() - m_window);
+    for (io_set_t::iterator itr = m_recent_completions.begin();
+         itr != m_recent_completions.end(); ) {
+      IO::ptr recent_comp(*itr);
+      if ((recent_comp->start_time() < limit ||
+           io->dependencies().count(recent_comp) != 0) &&
+          m_latest_ios.count(recent_comp) == 0) {
 	m_recent_completions.erase(itr++);
       } else {
 	++itr;
@@ -496,20 +486,20 @@ private:
 		     imagectx_id_t imagectx,
 		     const string& name,
 		     const string& snap_name,
-		     bool readonly) {
+		     bool readonly,
+                     IO::ptrs *ios) {
     assert(thread);
     if (m_open_images.count(imagectx) > 0) {
       return;
     }
     action_id_t ionum = next_id();
     pair<string, string> aname(map_image_snap(name, snap_name));
-    IO::ptr io(new OpenImageIO(ionum, ts - 2, thread->id(), thread->pending_io(), imagectx, aname.first, aname.second, readonly));
-    io->add_dependencies(m_recent_completions);
-    thread->issued_io(io, m_threads);
-    m_ios.push_back(io);
-    IO::ptr completionIO(io->create_completion(ts - 1, thread->id()));
-    m_ios.push_back(completionIO);
-    completed(completionIO);
+    IO::ptr io(new OpenImageIO(ionum, ts - 2, thread->id(),
+                               m_recent_completions, imagectx, aname.first,
+                               aname.second, readonly));
+    thread->issued_io(io, &m_latest_ios);
+    ios->push_back(io);
+    completed(io);
     m_open_images.insert(imagectx);
   }
 
@@ -518,13 +508,15 @@ private:
   uint32_t m_io_count;
   io_set_t m_recent_completions;
   set<imagectx_id_t> m_open_images;
-  vector<IO::ptr> m_ios;
 
   // keyed by completion
   map<uint64_t, IO::ptr> m_pending_ios;
+  std::set<IO::ptr> m_latest_ios;
 
   bool m_anonymize;
   map<string, AnonymizedImage> m_anonymized_images;
+
+  bool m_verbose;
 };
 
 int main(int argc, char** argv) {
diff --git a/src/rbd_replay/rbd-replay.cc b/src/rbd_replay/rbd-replay.cc
index 695053e..b00c131 100644
--- a/src/rbd_replay/rbd-replay.cc
+++ b/src/rbd_replay/rbd-replay.cc
@@ -75,7 +75,7 @@ int main(int argc, const char **argv) {
       break;
     } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
       pool_name = val;
-    } else if (ceph_argparse_withfloat(args, i, &latency_multiplier, &err, "--latency-multiplier",
+    } else if (ceph_argparse_witharg(args, i, &latency_multiplier, err, "--latency-multiplier",
 				     (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am
index 7620d73..3a30156 100644
--- a/src/rgw/Makefile.am
+++ b/src/rgw/Makefile.am
@@ -45,7 +45,8 @@ librgw_la_SOURCES =  \
 	rgw/rgw_replica_log.cc \
 	rgw/rgw_keystone.cc \
 	rgw/rgw_quota.cc \
-	rgw/rgw_dencoder.cc
+	rgw/rgw_dencoder.cc \
+	rgw/rgw_object_expirer_core.cc
 librgw_la_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS}
 noinst_LTLIBRARIES += librgw.la
 
@@ -54,6 +55,7 @@ LIBRGW_DEPS += \
 	libcls_rgw_client.la \
 	libcls_log_client.a \
 	libcls_statelog_client.a \
+	libcls_timeindex_client.a \
 	libcls_user_client.a \
 	libcls_replica_log_client.a \
 	libcls_lock_client.la \
@@ -65,7 +67,7 @@ LIBRGW_DEPS += \
 	-lfcgi \
 	-ldl
 
-CIVETWEB_INCLUDE = --include civetweb/include/civetweb_conf.h
+CIVETWEB_INCLUDE = --include $(srcdir)/civetweb/include/civetweb_conf.h
 
 libcivetweb_la_SOURCES =  \
 	rgw/rgw_civetweb.cc \
@@ -73,7 +75,7 @@ libcivetweb_la_SOURCES =  \
 	civetweb/src/civetweb.c
 
 libcivetweb_la_CXXFLAGS = ${CIVETWEB_INCLUDE} -Woverloaded-virtual ${AM_CXXFLAGS}
-libcivetweb_la_CFLAGS = -Icivetweb/include ${CIVETWEB_INCLUDE}
+libcivetweb_la_CFLAGS = -I$(srcdir)/civetweb/include ${CIVETWEB_INCLUDE}
 
 noinst_LTLIBRARIES += libcivetweb.la
 
@@ -104,6 +106,10 @@ radosgw_admin_SOURCES = rgw/rgw_admin.cc rgw/rgw_orphan.cc
 radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 bin_PROGRAMS += radosgw-admin
 
+radosgw_object_expirer_SOURCES = rgw/rgw_object_expirer.cc
+radosgw_object_expirer_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
+bin_PROGRAMS += radosgw-object-expirer
+
 ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc
 ceph_rgw_multiparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
 bin_DEBUGPROGRAMS += ceph_rgw_multiparser
@@ -118,7 +124,6 @@ bin_DEBUGPROGRAMS += ceph_rgw_jsonparser
 
 
 noinst_HEADERS += \
-	rgw/logrotate.conf \
 	rgw/rgw_acl.h \
 	rgw/rgw_acl_s3.h \
 	rgw/rgw_acl_swift.h \
@@ -140,6 +145,7 @@ noinst_HEADERS += \
 	rgw/rgw_gc.h \
 	rgw/rgw_metadata.h \
 	rgw/rgw_multi_del.h \
+	rgw/rgw_object_expirer_core.h \
 	rgw/rgw_op.h \
 	rgw/rgw_orphan.h \
 	rgw/rgw_http_client.h \
diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf
deleted file mode 100644
index 7e527e8..0000000
--- a/src/rgw/logrotate.conf
+++ /dev/null
@@ -1,26 +0,0 @@
-/var/log/radosgw/*.log {
-    rotate 7
-    daily
-    compress
-    sharedscripts
-    postrotate
-        if which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then
-            invoke-rc.d radosgw reload >/dev/null
-        elif which service > /dev/null 2>&1 && [ -x `which service` ]; then
-            service ceph-radosgw reload >/dev/null
-        fi
-        # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op
-        if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then
-          find -L /var/lib/ceph/radosgw/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -printf '%P\n' \
-          | while read f; do
-            if [ -e "/var/lib/ceph/radosgw/$f/done" ]; then
-                cluster="${f%%-*}"
-                id="${f#*-}"
-                initctl reload radosgw cluster="$cluster" id="$id" 2>/dev/null || :
-            fi
-          done
-        fi
-    endscript
-    missingok
-    notifempty
-}
diff --git a/src/rgw/rgw_acl.h b/src/rgw/rgw_acl.h
index 2de62e7..d4a4643 100644
--- a/src/rgw/rgw_acl.h
+++ b/src/rgw/rgw_acl.h
@@ -6,14 +6,13 @@
 
 #include <map>
 #include <string>
-#include <iostream>
 #include <include/types.h>
 
 #include "common/debug.h"
 
 using namespace std;
 
-
+#define RGW_PERM_NONE            0x00
 #define RGW_PERM_READ            0x01
 #define RGW_PERM_WRITE           0x02
 #define RGW_PERM_READ_ACP        0x04
@@ -23,6 +22,7 @@ using namespace std;
 #define RGW_PERM_FULL_CONTROL    ( RGW_PERM_READ | RGW_PERM_WRITE | \
                                   RGW_PERM_READ_ACP | RGW_PERM_WRITE_ACP )
 #define RGW_PERM_ALL_S3          RGW_PERM_FULL_CONTROL
+#define RGW_PERM_INVALID         0xFF00
 
 enum ACLGranteeTypeEnum {
 /* numbers are encoded, should not change */
diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc
index 785324c..f0ed081 100644
--- a/src/rgw/rgw_acl_s3.cc
+++ b/src/rgw/rgw_acl_s3.cc
@@ -140,6 +140,15 @@ bool ACLOwner_S3::xml_end(const char *el) {
   return true;
 }
 
+void  ACLOwner_S3::to_xml(ostream& out) {
+  if (id.empty())
+    return;
+  out << "<Owner>" << "<ID>" << id << "</ID>";
+  if (!display_name.empty())
+    out << "<DisplayName>" << display_name << "</DisplayName>";
+  out << "</Owner>";
+}
+
 bool ACLGrant_S3::xml_end(const char *el) {
   ACLGrantee_S3 *acl_grantee;
   ACLID_S3 *acl_id;
@@ -257,6 +266,16 @@ bool RGWAccessControlList_S3::xml_end(const char *el) {
   return true;
 }
 
+void  RGWAccessControlList_S3::to_xml(ostream& out) {
+  multimap<string, ACLGrant>::iterator iter;
+  out << "<AccessControlList>";
+  for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) {
+    ACLGrant_S3& grant = static_cast<ACLGrant_S3 &>(iter->second);
+    grant.to_xml(cct, out);
+  }
+  out << "</AccessControlList>";
+}
+
 struct s3_acl_header {
   int rgw_perm;
   const char *http_header;
@@ -412,6 +431,15 @@ bool RGWAccessControlPolicy_S3::xml_end(const char *el) {
   return true;
 }
 
+void  RGWAccessControlPolicy_S3::to_xml(ostream& out) {
+  out << "<AccessControlPolicy xmlns=\"http://s3.amazonaws.com/doc/2006-03-01/\">";
+  ACLOwner_S3& _owner = static_cast<ACLOwner_S3 &>(owner);
+  RGWAccessControlList_S3& _acl = static_cast<RGWAccessControlList_S3 &>(acl);
+  _owner.to_xml(out);
+  _acl.to_xml(out);
+  out << "</AccessControlPolicy>";
+}
+
 static const s3_acl_header acl_header_perms[] = {
   {RGW_PERM_READ, "HTTP_X_AMZ_GRANT_READ"},
   {RGW_PERM_WRITE, "HTTP_X_AMZ_GRANT_WRITE"},
diff --git a/src/rgw/rgw_acl_s3.h b/src/rgw/rgw_acl_s3.h
index 13a11c1..694cc1d 100644
--- a/src/rgw/rgw_acl_s3.h
+++ b/src/rgw/rgw_acl_s3.h
@@ -6,7 +6,7 @@
 
 #include <map>
 #include <string>
-#include <iostream>
+#include <iosfwd>
 #include <include/types.h>
 
 #include <expat.h>
@@ -61,15 +61,7 @@ public:
   ~RGWAccessControlList_S3() {}
 
   bool xml_end(const char *el);
-  void to_xml(ostream& out) {
-    multimap<string, ACLGrant>::iterator iter;
-    out << "<AccessControlList>";
-    for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) {
-      ACLGrant_S3& grant = static_cast<ACLGrant_S3 &>(iter->second);
-      grant.to_xml(cct, out);
-    }
-    out << "</AccessControlList>";
-  }
+  void to_xml(ostream& out);
 
   int create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl);
   int create_from_grants(std::list<ACLGrant>& grants);
@@ -82,14 +74,7 @@ public:
   ~ACLOwner_S3() {}
 
   bool xml_end(const char *el);
-  void to_xml(ostream& out) {
-    if (id.empty())
-      return;
-    out << "<Owner>" << "<ID>" << id << "</ID>";
-    if (!display_name.empty())
-      out << "<DisplayName>" << display_name << "</DisplayName>";
-    out << "</Owner>";
-  }
+  void to_xml(ostream& out);
 };
 
 class RGWEnv;
@@ -102,14 +87,7 @@ public:
 
   bool xml_end(const char *el);
 
-  void to_xml(ostream& out) {
-    out << "<AccessControlPolicy xmlns=\"http://s3.amazonaws.com/doc/2006-03-01/\">";
-    ACLOwner_S3& _owner = static_cast<ACLOwner_S3 &>(owner);
-    RGWAccessControlList_S3& _acl = static_cast<RGWAccessControlList_S3 &>(acl);
-    _owner.to_xml(out);
-    _acl.to_xml(out);
-    out << "</AccessControlPolicy>";
-  }
+  void to_xml(ostream& out);
   int rebuild(RGWRados *store, ACLOwner *owner, RGWAccessControlPolicy& dest);
   bool compare_group_name(string& id, ACLGroupTypeEnum group);
 
diff --git a/src/rgw/rgw_acl_swift.h b/src/rgw/rgw_acl_swift.h
index b26a39e..9a5fbf7 100644
--- a/src/rgw/rgw_acl_swift.h
+++ b/src/rgw/rgw_acl_swift.h
@@ -6,7 +6,6 @@
 
 #include <map>
 #include <string>
-#include <iostream>
 #include <vector>
 #include <include/types.h>
 
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 45cb2e1..af82ecb 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -42,139 +42,140 @@ static RGWRados *store = NULL;
 
 void _usage() 
 {
-  cerr << "usage: radosgw-admin <cmd> [options...]" << std::endl;
-  cerr << "commands:\n";
-  cerr << "  user create                create a new user\n" ;
-  cerr << "  user modify                modify user\n";
-  cerr << "  user info                  get user info\n";
-  cerr << "  user rm                    remove user\n";
-  cerr << "  user suspend               suspend a user\n";
-  cerr << "  user enable                re-enable user after suspension\n";
-  cerr << "  user check                 check user info\n";
-  cerr << "  user stats                 show user stats as accounted by quota subsystem\n";
-  cerr << "  caps add                   add user capabilities\n";
-  cerr << "  caps rm                    remove user capabilities\n";
-  cerr << "  subuser create             create a new subuser\n" ;
-  cerr << "  subuser modify             modify subuser\n";
-  cerr << "  subuser rm                 remove subuser\n";
-  cerr << "  key create                 create access key\n";
-  cerr << "  key rm                     remove access key\n";
-  cerr << "  bucket list                list buckets\n";
-  cerr << "  bucket link                link bucket to specified user\n";
-  cerr << "  bucket unlink              unlink bucket from specified user\n";
-  cerr << "  bucket stats               returns bucket statistics\n";
-  cerr << "  bucket rm                  remove bucket\n";
-  cerr << "  bucket check               check bucket index\n";
-  cerr << "  object rm                  remove object\n";
-  cerr << "  object unlink              unlink object from bucket index\n";
-  cerr << "  quota set                  set quota params\n";
-  cerr << "  quota enable               enable quota\n";
-  cerr << "  quota disable              disable quota\n";
-  cerr << "  region get                 show region info\n";
-  cerr << "  regions list               list all regions set on this cluster\n";
-  cerr << "  region set                 set region info (requires infile)\n";
-  cerr << "  region default             set default region\n";
-  cerr << "  region-map get             show region-map\n";
-  cerr << "  region-map set             set region-map (requires infile)\n";
-  cerr << "  zone get                   show zone cluster params\n";
-  cerr << "  zone set                   set zone cluster params (requires infile)\n";
-  cerr << "  zone list                  list all zones set on this cluster\n";
-  cerr << "  pool add                   add an existing pool for data placement\n";
-  cerr << "  pool rm                    remove an existing pool from data placement set\n";
-  cerr << "  pools list                 list placement active set\n";
-  cerr << "  policy                     read bucket/object policy\n";
-  cerr << "  log list                   list log objects\n";
-  cerr << "  log show                   dump a log from specific object or (bucket + date\n";
-  cerr << "                             + bucket-id)\n";
-  cerr << "  log rm                     remove log object\n";
-  cerr << "  usage show                 show usage (by user, date range)\n";
-  cerr << "  usage trim                 trim usage (by user, date range)\n";
-  cerr << "  temp remove                remove temporary objects that were created up to\n";
-  cerr << "                             specified date (and optional time)\n";
-  cerr << "  gc list                    dump expired garbage collection objects (specify\n";
-  cerr << "                             --include-all to list all entries, including unexpired)\n";
-  cerr << "  gc process                 manually process garbage\n";
-  cerr << "  metadata get               get metadata info\n";
-  cerr << "  metadata put               put metadata info\n";
-  cerr << "  metadata rm                remove metadata info\n";
-  cerr << "  metadata list              list metadata info\n";
-  cerr << "  mdlog list                 list metadata log\n";
-  cerr << "  mdlog trim                 trim metadata log\n";
-  cerr << "  bilog list                 list bucket index log\n";
-  cerr << "  bilog trim                 trim bucket index log (use start-marker, end-marker)\n";
-  cerr << "  datalog list               list data log\n";
-  cerr << "  datalog trim               trim data log\n";
-  cerr << "  opstate list               list stateful operations entries (use client_id,\n";
-  cerr << "                             op_id, object)\n";
-  cerr << "  opstate set                set state on an entry (use client_id, op_id, object, state)\n";
-  cerr << "  opstate renew              renew state on an entry (use client_id, op_id, object)\n";
-  cerr << "  opstate rm                 remove entry (use client_id, op_id, object)\n";
-  cerr << "  replicalog get             get replica metadata log entry\n";
-  cerr << "  replicalog update          update replica metadata log entry\n";
-  cerr << "  replicalog delete          delete replica metadata log entry\n";
-  cerr << "options:\n";
-  cerr << "   --uid=<id>                user id\n";
-  cerr << "   --subuser=<name>          subuser name\n";
-  cerr << "   --access-key=<key>        S3 access key\n";
-  cerr << "   --email=<email>\n";
-  cerr << "   --secret=<key>            specify secret key\n";
-  cerr << "   --gen-access-key          generate random access key (for S3)\n";
-  cerr << "   --gen-secret              generate random secret key\n";
-  cerr << "   --key-type=<type>         key type, options are: swift, s3\n";
-  cerr << "   --temp-url-key[-2]=<key>  temp url key\n";
-  cerr << "   --access=<access>         Set access permissions for sub-user, should be one\n";
-  cerr << "                             of read, write, readwrite, full\n";
-  cerr << "   --display-name=<name>\n";
-  cerr << "   --max_buckets             max number of buckets for a user\n";
-  cerr << "   --system                  set the system flag on the user\n";
-  cerr << "   --bucket=<bucket>\n";
-  cerr << "   --pool=<pool>\n";
-  cerr << "   --object=<object>\n";
-  cerr << "   --date=<date>\n";
-  cerr << "   --start-date=<date>\n";
-  cerr << "   --end-date=<date>\n";
-  cerr << "   --bucket-id=<bucket-id>\n";
-  cerr << "   --shard-id=<shard-id>     optional for mdlog list\n";
-  cerr << "                             required for: \n";
-  cerr << "                               mdlog trim\n";
-  cerr << "                               replica mdlog get/delete\n";
-  cerr << "                               replica datalog get/delete\n";
-  cerr << "   --metadata-key=<key>      key to retrieve metadata from with metadata get\n";
-  cerr << "   --rgw-region=<region>     region in which radosgw is running\n";
-  cerr << "   --rgw-zone=<zone>         zone in which radosgw is running\n";
-  cerr << "   --fix                     besides checking bucket index, will also fix it\n";
-  cerr << "   --check-objects           bucket check: rebuilds bucket index according to\n";
-  cerr << "                             actual objects state\n";
-  cerr << "   --format=<format>         specify output format for certain operations: xml,\n";
-  cerr << "                             json\n";
-  cerr << "   --purge-data              when specified, user removal will also purge all the\n";
-  cerr << "                             user data\n";
-  cerr << "   --purge-keys              when specified, subuser removal will also purge all the\n";
-  cerr << "                             subuser keys\n";
-  cerr << "   --purge-objects           remove a bucket's objects before deleting it\n";
-  cerr << "                             (NOTE: required to delete a non-empty bucket)\n";
-  cerr << "   --sync-stats              option to 'user stats', update user stats with current\n";
-  cerr << "                             stats reported by user's buckets indexes\n";
-  cerr << "   --show-log-entries=<flag> enable/disable dump of log entries on log show\n";
-  cerr << "   --show-log-sum=<flag>     enable/disable dump of log summation on log show\n";
-  cerr << "   --skip-zero-entries       log show only dumps entries that don't have zero value\n";
-  cerr << "                             in one of the numeric field\n";
-  cerr << "   --infile                  specify a file to read in when setting data\n";
-  cerr << "   --state=<state string>    specify a state for the opstate set command\n";
-  cerr << "   --replica-log-type        replica log type (metadata, data, bucket), required for\n";
-  cerr << "                             replica log operations\n";
-  cerr << "   --categories=<list>       comma separated list of categories, used in usage show\n";
-  cerr << "   --caps=<caps>             list of caps (e.g., \"usage=read, write; user=read\"\n";
-  cerr << "   --yes-i-really-mean-it    required for certain operations\n";
-  cerr << "   --reset-regions           reset regionmap when regionmap update";
-  cerr << "\n";
-  cerr << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n";
-  cerr << "\nQuota options:\n";
-  cerr << "   --bucket                  specified bucket for quota command\n";
-  cerr << "   --max-objects             specify max objects (negative value to disable)\n";
-  cerr << "   --max-size                specify max size (in bytes, negative value to disable)\n";
-  cerr << "   --quota-scope             scope of quota (bucket, user)\n";
-  cerr << "\n";
+  cout << "usage: radosgw-admin <cmd> [options...]" << std::endl;
+  cout << "commands:\n";
+  cout << "  user create                create a new user\n" ;
+  cout << "  user modify                modify user\n";
+  cout << "  user info                  get user info\n";
+  cout << "  user rm                    remove user\n";
+  cout << "  user suspend               suspend a user\n";
+  cout << "  user enable                re-enable user after suspension\n";
+  cout << "  user check                 check user info\n";
+  cout << "  user stats                 show user stats as accounted by quota subsystem\n";
+  cout << "  caps add                   add user capabilities\n";
+  cout << "  caps rm                    remove user capabilities\n";
+  cout << "  subuser create             create a new subuser\n" ;
+  cout << "  subuser modify             modify subuser\n";
+  cout << "  subuser rm                 remove subuser\n";
+  cout << "  key create                 create access key\n";
+  cout << "  key rm                     remove access key\n";
+  cout << "  bucket list                list buckets\n";
+  cout << "  bucket link                link bucket to specified user\n";
+  cout << "  bucket unlink              unlink bucket from specified user\n";
+  cout << "  bucket stats               returns bucket statistics\n";
+  cout << "  bucket rm                  remove bucket\n";
+  cout << "  bucket check               check bucket index\n";
+  cout << "  object rm                  remove object\n";
+  cout << "  object unlink              unlink object from bucket index\n";
+  cout << "  objects expire             run expired objects cleanup\n";
+  cout << "  quota set                  set quota params\n";
+  cout << "  quota enable               enable quota\n";
+  cout << "  quota disable              disable quota\n";
+  cout << "  region get                 show region info\n";
+  cout << "  regions list               list all regions set on this cluster\n";
+  cout << "  region set                 set region info (requires infile)\n";
+  cout << "  region default             set default region\n";
+  cout << "  region-map get             show region-map\n";
+  cout << "  region-map set             set region-map (requires infile)\n";
+  cout << "  zone get                   show zone cluster params\n";
+  cout << "  zone set                   set zone cluster params (requires infile)\n";
+  cout << "  zone list                  list all zones set on this cluster\n";
+  cout << "  pool add                   add an existing pool for data placement\n";
+  cout << "  pool rm                    remove an existing pool from data placement set\n";
+  cout << "  pools list                 list placement active set\n";
+  cout << "  policy                     read bucket/object policy\n";
+  cout << "  log list                   list log objects\n";
+  cout << "  log show                   dump a log from specific object or (bucket + date\n";
+  cout << "                             + bucket-id)\n";
+  cout << "  log rm                     remove log object\n";
+  cout << "  usage show                 show usage (by user, date range)\n";
+  cout << "  usage trim                 trim usage (by user, date range)\n";
+  cout << "  temp remove                remove temporary objects that were created up to\n";
+  cout << "                             specified date (and optional time)\n";
+  cout << "  gc list                    dump expired garbage collection objects (specify\n";
+  cout << "                             --include-all to list all entries, including unexpired)\n";
+  cout << "  gc process                 manually process garbage\n";
+  cout << "  metadata get               get metadata info\n";
+  cout << "  metadata put               put metadata info\n";
+  cout << "  metadata rm                remove metadata info\n";
+  cout << "  metadata list              list metadata info\n";
+  cout << "  mdlog list                 list metadata log\n";
+  cout << "  mdlog trim                 trim metadata log\n";
+  cout << "  bilog list                 list bucket index log\n";
+  cout << "  bilog trim                 trim bucket index log (use start-marker, end-marker)\n";
+  cout << "  datalog list               list data log\n";
+  cout << "  datalog trim               trim data log\n";
+  cout << "  opstate list               list stateful operations entries (use client_id,\n";
+  cout << "                             op_id, object)\n";
+  cout << "  opstate set                set state on an entry (use client_id, op_id, object, state)\n";
+  cout << "  opstate renew              renew state on an entry (use client_id, op_id, object)\n";
+  cout << "  opstate rm                 remove entry (use client_id, op_id, object)\n";
+  cout << "  replicalog get             get replica metadata log entry\n";
+  cout << "  replicalog update          update replica metadata log entry\n";
+  cout << "  replicalog delete          delete replica metadata log entry\n";
+  cout << "options:\n";
+  cout << "   --uid=<id>                user id\n";
+  cout << "   --subuser=<name>          subuser name\n";
+  cout << "   --access-key=<key>        S3 access key\n";
+  cout << "   --email=<email>\n";
+  cout << "   --secret=<key>            specify secret key\n";
+  cout << "   --gen-access-key          generate random access key (for S3)\n";
+  cout << "   --gen-secret              generate random secret key\n";
+  cout << "   --key-type=<type>         key type, options are: swift, s3\n";
+  cout << "   --temp-url-key[-2]=<key>  temp url key\n";
+  cout << "   --access=<access>         Set access permissions for sub-user, should be one\n";
+  cout << "                             of read, write, readwrite, full\n";
+  cout << "   --display-name=<name>\n";
+  cout << "   --max_buckets             max number of buckets for a user\n";
+  cout << "   --system                  set the system flag on the user\n";
+  cout << "   --bucket=<bucket>\n";
+  cout << "   --pool=<pool>\n";
+  cout << "   --object=<object>\n";
+  cout << "   --date=<date>\n";
+  cout << "   --start-date=<date>\n";
+  cout << "   --end-date=<date>\n";
+  cout << "   --bucket-id=<bucket-id>\n";
+  cout << "   --shard-id=<shard-id>     optional for mdlog list\n";
+  cout << "                             required for: \n";
+  cout << "                               mdlog trim\n";
+  cout << "                               replica mdlog get/delete\n";
+  cout << "                               replica datalog get/delete\n";
+  cout << "   --metadata-key=<key>      key to retrieve metadata from with metadata get\n";
+  cout << "   --rgw-region=<region>     region in which radosgw is running\n";
+  cout << "   --rgw-zone=<zone>         zone in which radosgw is running\n";
+  cout << "   --fix                     besides checking bucket index, will also fix it\n";
+  cout << "   --check-objects           bucket check: rebuilds bucket index according to\n";
+  cout << "                             actual objects state\n";
+  cout << "   --format=<format>         specify output format for certain operations: xml,\n";
+  cout << "                             json\n";
+  cout << "   --purge-data              when specified, user removal will also purge all the\n";
+  cout << "                             user data\n";
+  cout << "   --purge-keys              when specified, subuser removal will also purge all the\n";
+  cout << "                             subuser keys\n";
+  cout << "   --purge-objects           remove a bucket's objects before deleting it\n";
+  cout << "                             (NOTE: required to delete a non-empty bucket)\n";
+  cout << "   --sync-stats              option to 'user stats', update user stats with current\n";
+  cout << "                             stats reported by user's buckets indexes\n";
+  cout << "   --show-log-entries=<flag> enable/disable dump of log entries on log show\n";
+  cout << "   --show-log-sum=<flag>     enable/disable dump of log summation on log show\n";
+  cout << "   --skip-zero-entries       log show only dumps entries that don't have zero value\n";
+  cout << "                             in one of the numeric field\n";
+  cout << "   --infile                  specify a file to read in when setting data\n";
+  cout << "   --state=<state string>    specify a state for the opstate set command\n";
+  cout << "   --replica-log-type        replica log type (metadata, data, bucket), required for\n";
+  cout << "                             replica log operations\n";
+  cout << "   --categories=<list>       comma separated list of categories, used in usage show\n";
+  cout << "   --caps=<caps>             list of caps (e.g., \"usage=read, write; user=read\"\n";
+  cout << "   --yes-i-really-mean-it    required for certain operations\n";
+  cout << "   --reset-regions           reset regionmap when regionmap update";
+  cout << "\n";
+  cout << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n";
+  cout << "\nQuota options:\n";
+  cout << "   --bucket                  specified bucket for quota command\n";
+  cout << "   --max-objects             specify max objects (negative value to disable)\n";
+  cout << "   --max-size                specify max size (in bytes, negative value to disable)\n";
+  cout << "   --quota-scope             scope of quota (bucket, user)\n";
+  cout << "\n";
   generic_client_usage();
 }
 
@@ -225,6 +226,7 @@ enum {
   OPT_OBJECT_UNLINK,
   OPT_OBJECT_STAT,
   OPT_OBJECT_REWRITE,
+  OPT_OBJECTS_EXPIRE,
   OPT_BI_GET,
   OPT_BI_PUT,
   OPT_BI_LIST,
@@ -284,6 +286,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
       strcmp(cmd, "mdlog") == 0 ||
       strcmp(cmd, "metadata") == 0 ||
       strcmp(cmd, "object") == 0 ||
+      strcmp(cmd, "objects") == 0 ||
       strcmp(cmd, "olh") == 0 ||
       strcmp(cmd, "opstate") == 0 ||
       strcmp(cmd, "orphans") == 0 || 
@@ -393,6 +396,9 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
       return OPT_OBJECT_STAT;
     if (strcmp(cmd, "rewrite") == 0)
       return OPT_OBJECT_REWRITE;
+  } else if (strcmp(prev_cmd, "objects") == 0) {
+    if (strcmp(cmd, "expire") == 0)
+      return OPT_OBJECTS_EXPIRE;
   } else if (strcmp(prev_cmd, "olh") == 0) {
     if (strcmp(cmd, "get") == 0)
       return OPT_OLH_GET;
@@ -1053,7 +1059,6 @@ int do_check_object_locator(const string& bucket_name, bool fix, bool remove_bad
 
       if (key.name[0] == '_') {
         ret = check_obj_locator_underscore(bucket_info, obj, key, fix, remove_bad, f);
-        /* ignore return code, move to the next one */
 	
 	if (ret >= 0) {
           ret = check_obj_tail_locator_underscore(bucket_info, obj, key, fix, f);
@@ -1221,7 +1226,7 @@ int main(int argc, char **argv)
       // do nothing
     } else if (ceph_argparse_binary_flag(args, i, &system, NULL, "--system", (char*)NULL)) {
       system_specified = true;
-    } else if (ceph_argparse_withlonglong(args, i, &tmp, &errs, "-a", "--auth-uid", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &tmp, errs, "-a", "--auth-uid", (char*)NULL)) {
       if (!errs.str().empty()) {
 	cerr << errs.str() << std::endl;
 	exit(EXIT_FAILURE);
@@ -1391,6 +1396,16 @@ int main(int argc, char **argv)
           break;
       }
     }
+
+    /* check key parameter conflict */
+    if ((!access_key.empty()) && gen_access_key) {
+        cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key" << std::endl;
+        return -EINVAL;
+    }
+    if ((!secret_key.empty()) && gen_secret_key) {
+        cerr << "ERROR: key parameter conflict, --secret & --gen-secret" << std::endl;
+        return -EINVAL;
+    }
   }
 
   // default to pretty json
@@ -1749,7 +1764,9 @@ int main(int argc, char **argv)
     break;
   case OPT_USER_RM:
     ret = user.remove(user_op, &err_msg);
-    if (ret < 0) {
+    if (ret == -ENOENT) {
+      cerr << err_msg << std::endl;
+    } else if (ret < 0) {
       cerr << "could not remove user: " << err_msg << std::endl;
       return -ret;
     }
@@ -2354,6 +2371,14 @@ next:
     }
   }
 
+  if (opt_cmd == OPT_OBJECTS_EXPIRE) {
+    int ret = store->process_expire_objects();
+    if (ret < 0) {
+      cerr << "ERROR: process_expire_objects() processing returned error: " << cpp_strerror(-ret) << std::endl;
+      return 1;
+    }
+  }
+
   if (opt_cmd == OPT_BUCKET_REWRITE) {
     if (bucket_name.empty()) {
       cerr << "ERROR: bucket not specified" << std::endl;
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index 5d2af38..cfa5e20 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -42,8 +42,13 @@ void rgw_get_buckets_obj(const string& user_id, string& buckets_obj_id)
  * Get all the buckets owned by a user and fill up an RGWUserBuckets with them.
  * Returns: 0 on success, -ERR# on failure.
  */
-int rgw_read_user_buckets(RGWRados *store, string user_id, RGWUserBuckets& buckets,
-                          const string& marker, uint64_t max, bool need_stats)
+int rgw_read_user_buckets(RGWRados * store,
+                          string user_id,
+                          RGWUserBuckets& buckets,
+                          const string& marker,
+                          uint64_t max,
+                          bool need_stats,
+                          uint64_t default_amount)
 {
   int ret;
   buckets.clear();
@@ -59,6 +64,10 @@ int rgw_read_user_buckets(RGWRados *store, string user_id, RGWUserBuckets& bucke
 
   uint64_t total = 0;
 
+  if (!max) {
+    max = default_amount;
+  }
+
   do {
     ret = store->cls_user_list_buckets(obj, m, max - total, entries, &m, &truncated);
     if (ret == -ENOENT)
@@ -1514,19 +1523,6 @@ public:
 
 class RGWBucketMetadataHandler : public RGWMetadataHandler {
 
-  int init_bucket(RGWRados *store, string& bucket_name, rgw_bucket& bucket, RGWObjVersionTracker *objv_tracker) {
-    RGWBucketInfo bucket_info;
-    RGWObjectCtx obj_ctx(store);
-    int r = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL);
-    if (r < 0) {
-      cerr << "could not get bucket info for bucket=" << bucket_name << std::endl;
-      return r;
-    }
-    bucket = bucket_info.bucket;
-
-    return 0;
-  }
-
 public:
   string get_type() { return "bucket"; }
 
@@ -1676,19 +1672,6 @@ public:
 
 class RGWBucketInstanceMetadataHandler : public RGWMetadataHandler {
 
-  int init_bucket(RGWRados *store, string& bucket_name, rgw_bucket& bucket, RGWObjVersionTracker *objv_tracker) {
-    RGWBucketInfo bucket_info;
-    RGWObjectCtx obj_ctx(store);
-    int r = store->get_bucket_info(obj_ctx, bucket_name, bucket_info, NULL);
-    if (r < 0) {
-      cerr << "could not get bucket info for bucket=" << bucket_name << std::endl;
-      return r;
-    }
-    bucket = bucket_info.bucket;
-
-    return 0;
-  }
-
 public:
   string get_type() { return "bucket.instance"; }
 
diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h
index 830e096..222e152 100644
--- a/src/rgw/rgw_bucket.h
+++ b/src/rgw/rgw_bucket.h
@@ -104,8 +104,13 @@ extern void rgw_bucket_init(RGWMetadataManager *mm);
  * Get all the buckets owned by a user and fill up an RGWUserBuckets with them.
  * Returns: 0 on success, -ERR# on failure.
  */
-extern int rgw_read_user_buckets(RGWRados *store, string user_id, RGWUserBuckets& buckets,
-                                 const string& marker, uint64_t max, bool need_stats);
+extern int rgw_read_user_buckets(RGWRados *store,
+                                 string user_id,
+                                 RGWUserBuckets& buckets,
+                                 const string& marker,
+                                 uint64_t max,
+                                 bool need_stats,
+                                 uint64_t default_amount = 1000);
 
 extern int rgw_link_bucket(RGWRados *store, string user_id, rgw_bucket& bucket, time_t creation_time, bool update_entrypoint = true);
 extern int rgw_unlink_bucket(RGWRados *store, string user_id, const string& bucket_name, bool update_entrypoint = true);
diff --git a/src/rgw/rgw_cache.cc b/src/rgw/rgw_cache.cc
index c3f3b06..03c3b05 100644
--- a/src/rgw/rgw_cache.cc
+++ b/src/rgw/rgw_cache.cc
@@ -105,7 +105,7 @@ bool ObjectCache::chain_cache_entry(list<rgw_cache_entry_info *>& cache_info_ent
   for (liter = cache_entry_list.begin(); liter != cache_entry_list.end(); ++liter) {
     ObjectCacheEntry *entry = *liter;
 
-    entry->chained_entries.push_back(make_pair<RGWChainedCache *, string>(chained_entry->cache, chained_entry->key));
+    entry->chained_entries.push_back(make_pair(chained_entry->cache, chained_entry->key));
   }
 
   return true;
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index 8d9ebf0..d1e2971 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -32,24 +32,24 @@ int rgw_perf_start(CephContext *cct)
 {
   PerfCountersBuilder plb(cct, cct->_conf->name.to_str(), l_rgw_first, l_rgw_last);
 
-  plb.add_u64_counter(l_rgw_req, "req");
-  plb.add_u64_counter(l_rgw_failed_req, "failed_req");
+  plb.add_u64_counter(l_rgw_req, "req", "Requests");
+  plb.add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests");
 
-  plb.add_u64_counter(l_rgw_get, "get");
-  plb.add_u64_counter(l_rgw_get_b, "get_b");
-  plb.add_time_avg(l_rgw_get_lat, "get_initial_lat");
-  plb.add_u64_counter(l_rgw_put, "put");
-  plb.add_u64_counter(l_rgw_put_b, "put_b");
-  plb.add_time_avg(l_rgw_put_lat, "put_initial_lat");
+  plb.add_u64_counter(l_rgw_get, "get", "Gets");
+  plb.add_u64_counter(l_rgw_get_b, "get_b", "Size of gets");
+  plb.add_time_avg(l_rgw_get_lat, "get_initial_lat", "Get latency");
+  plb.add_u64_counter(l_rgw_put, "put", "Puts");
+  plb.add_u64_counter(l_rgw_put_b, "put_b", "Size of puts");
+  plb.add_time_avg(l_rgw_put_lat, "put_initial_lat", "Put latency");
 
-  plb.add_u64(l_rgw_qlen, "qlen");
-  plb.add_u64(l_rgw_qactive, "qactive");
+  plb.add_u64(l_rgw_qlen, "qlen", "Queue length");
+  plb.add_u64(l_rgw_qactive, "qactive", "Active requests queue");
 
-  plb.add_u64_counter(l_rgw_cache_hit, "cache_hit");
-  plb.add_u64_counter(l_rgw_cache_miss, "cache_miss");
+  plb.add_u64_counter(l_rgw_cache_hit, "cache_hit", "Cache hits");
+  plb.add_u64_counter(l_rgw_cache_miss, "cache_miss", "Cache miss");
 
-  plb.add_u64_counter(l_rgw_keystone_token_cache_hit, "keystone_token_cache_hit");
-  plb.add_u64_counter(l_rgw_keystone_token_cache_miss, "keystone_token_cache_miss");
+  plb.add_u64_counter(l_rgw_keystone_token_cache_hit, "keystone_token_cache_hit", "Keystone token cache hits");
+  plb.add_u64_counter(l_rgw_keystone_token_cache_miss, "keystone_token_cache_miss", "Keystone token cache miss");
 
   perfcounter = plb.create_perf_counters();
   cct->get_perfcounters_collection()->add(perfcounter);
@@ -197,6 +197,7 @@ struct str_len meta_prefixes[] = { STR_LEN_ENTRY("HTTP_X_AMZ"),
                                    STR_LEN_ENTRY("HTTP_X_RGW"),
                                    STR_LEN_ENTRY("HTTP_X_OBJECT"),
                                    STR_LEN_ENTRY("HTTP_X_CONTAINER"),
+                                   STR_LEN_ENTRY("HTTP_X_ACCOUNT"),
                                    {NULL, 0} };
 
 
@@ -417,9 +418,6 @@ void calc_hmac_sha1(const char *key, int key_len,
   HMACSHA1 hmac((const unsigned char *)key, key_len);
   hmac.Update((const unsigned char *)msg, msg_len);
   hmac.Final((unsigned char *)dest);
-  
-  char hex_str[(CEPH_CRYPTO_HMACSHA1_DIGESTSIZE * 2) + 1];
-  buf_to_hex((unsigned char *)dest, CEPH_CRYPTO_HMACSHA1_DIGESTSIZE, hex_str);
 }
 
 int gen_rand_base64(CephContext *cct, char *dest, int size) /* size should be the required string size + 1 */
@@ -442,7 +440,7 @@ int gen_rand_base64(CephContext *cct, char *dest, int size) /* size should be th
   }
   tmp_dest[ret] = '\0';
   memcpy(dest, tmp_dest, size);
-  dest[size] = '\0';
+  dest[size-1] = '\0';
 
   return 0;
 }
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 5b4e39b..8426057 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -65,11 +65,15 @@ using ceph::crypto::MD5;
 #define RGW_ATTR_CONTENT_ENC	RGW_ATTR_PREFIX "content_encoding"
 #define RGW_ATTR_CONTENT_LANG	RGW_ATTR_PREFIX "content_language"
 #define RGW_ATTR_EXPIRES	RGW_ATTR_PREFIX "expires"
+#define RGW_ATTR_DELETE_AT 	RGW_ATTR_PREFIX "delete_at"
 #define RGW_ATTR_ID_TAG    	RGW_ATTR_PREFIX "idtag"
 #define RGW_ATTR_SHADOW_OBJ    	RGW_ATTR_PREFIX "shadow_name"
 #define RGW_ATTR_MANIFEST    	RGW_ATTR_PREFIX "manifest"
 #define RGW_ATTR_USER_MANIFEST  RGW_ATTR_PREFIX "user_manifest"
 
+#define RGW_ATTR_TEMPURL_KEY1   RGW_ATTR_META_PREFIX "temp-url-key"
+#define RGW_ATTR_TEMPURL_KEY2   RGW_ATTR_META_PREFIX "temp-url-key-2"
+
 #define RGW_ATTR_OLH_PREFIX     RGW_ATTR_PREFIX "olh."
 
 #define RGW_ATTR_OLH_INFO       RGW_ATTR_OLH_PREFIX "info"
@@ -80,7 +84,6 @@ using ceph::crypto::MD5;
 #define RGW_BUCKETS_OBJ_SUFFIX ".buckets"
 
 #define RGW_MAX_PENDING_CHUNKS  16
-#define RGW_MAX_PUT_SIZE        (5ULL*1024*1024*1024)
 #define RGW_MIN_MULTIPART_SIZE (5ULL*1024*1024)
 
 #define RGW_FORMAT_PLAIN        0
@@ -145,11 +148,14 @@ using ceph::crypto::MD5;
 #define ERR_QUOTA_EXCEEDED       2026
 #define ERR_SIGNATURE_NO_MATCH   2027
 #define ERR_INVALID_ACCESS_KEY   2028
+#define ERR_MALFORMED_XML        2029
+#define ERR_USER_EXIST           2030
 #define ERR_USER_SUSPENDED       2100
 #define ERR_INTERNAL_ERROR       2200
+#define ERR_NOT_IMPLEMENTED      2201
 
 #ifndef UINT32_MAX
-#define UINT32_MAX (4294967295)
+#define UINT32_MAX (0xffffffffu)
 #endif
 
 typedef void *RGWAccessHandle;
@@ -934,6 +940,10 @@ struct rgw_obj_key {
     set(n, i);
   }
 
+  rgw_obj_key(const cls_rgw_obj_key& k) {
+    set(k);
+  }
+
   void set(const cls_rgw_obj_key& k) {
     name = k.name;
     instance = k.instance;
@@ -968,6 +978,9 @@ struct rgw_obj_key {
     }
     return (r < 0);
   }
+  bool operator<=(const rgw_obj_key& k) const {
+    return !(k < *this);
+  }
   void encode(bufferlist& bl) const {
     ENCODE_START(1, 1, bl);
     ::encode(name, bl);
@@ -1182,8 +1195,7 @@ public:
     init(b, o);
   }
   rgw_obj(rgw_bucket& b, const rgw_obj_key& k) : in_extra_data(false) {
-    init(b, k.name);
-    set_instance(k.instance);
+    from_index_key(b, k);
   }
   void init(rgw_bucket& b, const std::string& o) {
     bucket = b;
@@ -1292,6 +1304,29 @@ public:
     return string(buf) + orig_obj;
   };
 
+  void from_index_key(rgw_bucket& b, const rgw_obj_key& key) {
+    if (key.name[0] != '_') {
+      init(b, key.name);
+      set_instance(key.instance);
+      return;
+    }
+    if (key.name[1] == '_') {
+      init(b, key.name.substr(1));
+      set_instance(key.instance);
+      return;
+    }
+    ssize_t pos = key.name.find('_', 1);
+    if (pos < 0) {
+      /* shouldn't happen, just use key */
+      init(b, key.name);
+      set_instance(key.instance);
+      return;
+    }
+
+    init_ns(b, key.name.substr(pos + 1), key.name.substr(1, pos -1));
+    set_instance(key.instance);
+  }
+
   void get_index_key(rgw_obj_key *key) const {
     key->name = get_index_key_name();
     key->instance = instance;
@@ -1382,6 +1417,11 @@ public:
       return false;
     }
 
+    if (obj[1] == '_') {
+      obj = obj.substr(1);
+      return true;
+    }
+
     size_t period_pos = obj.find('.');
     if (period_pos < pos) {
       return false;
@@ -1426,7 +1466,11 @@ public:
     if (struct_v >= 4)
       ::decode(instance, bl);
     if (ns.empty() && instance.empty()) {
-      orig_obj = object;
+      if (object[0] != '_') {
+        orig_obj = object;
+      } else {
+	orig_obj = object.substr(1);
+      }
     } else {
       if (struct_v >= 5) {
         ::decode(orig_obj, bl);
diff --git a/src/rgw/rgw_cors.h b/src/rgw/rgw_cors.h
index 124ebf9..239cfd7 100644
--- a/src/rgw/rgw_cors.h
+++ b/src/rgw/rgw_cors.h
@@ -17,7 +17,6 @@
 
 #include <map>
 #include <string>
-#include <iostream>
 #include <include/types.h>
 
 #define RGW_CORS_GET    0x1
diff --git a/src/rgw/rgw_cors_s3.h b/src/rgw/rgw_cors_s3.h
index 0db03c3..3a96160 100644
--- a/src/rgw/rgw_cors_s3.h
+++ b/src/rgw/rgw_cors_s3.h
@@ -17,7 +17,7 @@
 
 #include <map>
 #include <string>
-#include <iostream>
+#include <iosfwd>
 #include <expat.h>
 
 #include <include/types.h>
diff --git a/src/rgw/rgw_cors_swift.h b/src/rgw/rgw_cors_swift.h
index 8037b4f..6aef5e1 100644
--- a/src/rgw/rgw_cors_swift.h
+++ b/src/rgw/rgw_cors_swift.h
@@ -17,7 +17,6 @@
 
 #include <map>
 #include <string>
-#include <iostream>
 #include <vector>
 #include <include/types.h>
 #include <include/str_list.h>
diff --git a/src/rgw/rgw_dencoder.cc b/src/rgw/rgw_dencoder.cc
index 10e9961..82e3295 100644
--- a/src/rgw/rgw_dencoder.cc
+++ b/src/rgw/rgw_dencoder.cc
@@ -419,6 +419,34 @@ void RGWBucketInfo::generate_test_instances(list<RGWBucketInfo*>& o)
   o.push_back(new RGWBucketInfo);
 }
 
+void RGWRegion::generate_test_instances(list<RGWRegion*>& o)
+{
+  RGWRegion *r = new RGWRegion;
+  o.push_back(r);
+  o.push_back(new RGWRegion);
+}
+
+void RGWZone::generate_test_instances(list<RGWZone*> &o)
+{
+  RGWZone *z = new RGWZone;
+  o.push_back(z);
+  o.push_back(new RGWZone);
+}
+
+void RGWZoneParams::generate_test_instances(list<RGWZoneParams*> &o)
+{
+  o.push_back(new RGWZoneParams);
+  o.push_back(new RGWZoneParams); 
+}
+
+void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
+{
+  RGWOLHInfo *olh = new RGWOLHInfo;
+  olh->removed = false;
+  o.push_back(olh);
+  o.push_back(new RGWOLHInfo);
+}
+
 void RGWBucketEnt::generate_test_instances(list<RGWBucketEnt*>& o)
 {
   RGWBucketEnt *e = new RGWBucketEnt;
diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h
index 7850807..aaa5312 100644
--- a/src/rgw/rgw_http_errors.h
+++ b/src/rgw/rgw_http_errors.h
@@ -34,6 +34,7 @@ const static struct rgw_http_errors RGW_HTTP_ERRORS[] = {
     { ERR_TOO_LARGE, 400, "EntityTooLarge" },
     { ERR_TOO_SMALL, 400, "EntityTooSmall" },
     { ERR_TOO_MANY_BUCKETS, 400, "TooManyBuckets" },
+    { ERR_MALFORMED_XML, 400, "MalformedXML" },
     { ERR_LENGTH_REQUIRED, 411, "MissingContentLength" },
     { EACCES, 403, "AccessDenied" },
     { EPERM, 403, "AccessDenied" },
@@ -49,12 +50,14 @@ const static struct rgw_http_errors RGW_HTTP_ERRORS[] = {
     { ERR_METHOD_NOT_ALLOWED, 405, "MethodNotAllowed" },
     { ETIMEDOUT, 408, "RequestTimeout" },
     { EEXIST, 409, "BucketAlreadyExists" },
+    { ERR_USER_EXIST, 409, "UserAlreadyExists" },
     { ENOTEMPTY, 409, "BucketNotEmpty" },
     { ERR_PRECONDITION_FAILED, 412, "PreconditionFailed" },
     { ERANGE, 416, "InvalidRange" },
     { ERR_UNPROCESSABLE_ENTITY, 422, "UnprocessableEntity" },
     { ERR_LOCKED, 423, "Locked" },
     { ERR_INTERNAL_ERROR, 500, "InternalError" },
+    { ERR_NOT_IMPLEMENTED, 501, "NotImplemented" },
 };
 
 const static struct rgw_http_errors RGW_HTTP_SWIFT_ERRORS[] = {
@@ -108,6 +111,7 @@ const static struct rgw_http_status_code http_codes[] = {
   { 417, "Expectation Failed" },
   { 422, "Unprocessable Entity" },
   { 500, "Internal Server Error" },
+  { 501, "Not Implemented" },
   { 0, NULL },
 };
 
diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc
index d09fa65..061aba9 100644
--- a/src/rgw/rgw_json_enc.cc
+++ b/src/rgw/rgw_json_enc.cc
@@ -605,6 +605,7 @@ void rgw_obj::dump(Formatter *f) const
   encode_json("ns", ns, f);
   encode_json("object", object, f);
   encode_json("instance", instance, f);
+  encode_json("orig_obj", orig_obj, f);
 }
 
 void RGWZoneParams::dump(Formatter *f) const
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 0ddd9de..c6ba1c2 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -214,6 +214,7 @@ protected:
       perfcounter->inc(l_rgw_qlen, -1);
       return req;
     }
+    using ThreadPool::WorkQueue<RGWRequest>::_process;
     void _process(RGWRequest *req) {
       perfcounter->inc(l_rgw_qactive);
       process->handle_request(req);
@@ -717,9 +718,6 @@ static int civetweb_callback(struct mg_connection *conn) {
   RGWRequest *req = new RGWRequest(store->get_new_req_id());
   RGWMongoose client_io(conn, pe->port);
 
-  client_io.init(g_ceph_context);
-
-
   int ret = process_request(store, rest, req, &client_io, olog);
   if (ret < 0) {
     /* we don't really care about return code */
@@ -1030,7 +1028,6 @@ int main(int argc, const char **argv)
   vector<const char *> def_args;
   def_args.push_back("--debug-rgw=1/5");
   def_args.push_back("--keyring=$rgw_data/keyring");
-  def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name.log");
 
   vector<const char*> args;
   argv_to_vec(argc, argv, args);
@@ -1057,6 +1054,9 @@ int main(int argc, const char **argv)
   init_timer.add_event_after(g_conf->rgw_init_timeout, new C_InitTimeout);
   mutex.Unlock();
 
+  // Enable the perf counter before starting the service thread
+  g_ceph_context->enable_perf_counter();
+
   common_init_finish(g_ceph_context);
 
   rgw_tools_init(g_ceph_context);
diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc
new file mode 100644
index 0000000..63f4e96
--- /dev/null
+++ b/src/rgw/rgw_object_expirer.cc
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+using namespace std;
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_rados.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_replica_log.h"
+#include "rgw_object_expirer_core.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+static RGWRados *store = NULL;
+
+class StoreDestructor {
+  RGWRados *store;
+
+public:
+  StoreDestructor(RGWRados *_s) : store(_s) {}
+  ~StoreDestructor() {
+    if (store) {
+      RGWStoreManager::close_storage(store);
+    }
+  }
+};
+
+static void usage()
+{
+  generic_server_usage();
+}
+
+int main(const int argc, const char **argv)
+{
+  vector<const char *> args;
+  argv_to_vec(argc, argv, args);
+  env_to_vec(args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_DAEMON,
+	      CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+  for (std::vector<const char *>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+      usage();
+      return 0;
+    }
+  }
+
+  if (g_conf->daemonize) {
+    global_init_daemonize(g_ceph_context, 0);
+  }
+
+  common_init_finish(g_ceph_context);
+
+  store = RGWStoreManager::get_storage(g_ceph_context, false, false);
+  if (!store) {
+    std::cerr << "couldn't init storage provider" << std::endl;
+    return EIO;
+  }
+
+  rgw_user_init(store);
+  rgw_bucket_init(store->meta_mgr);
+
+  /* Guard to not forget about closing the rados store. */
+  StoreDestructor store_dtor(store);
+
+  RGWObjectExpirer objexp(store);
+  objexp.start_processor();
+
+  const utime_t interval(g_ceph_context->_conf->rgw_objexp_gc_interval, 0);
+  while (true) {
+    interval.sleep();
+  }
+
+  /* unreachable */
+
+  return EXIT_SUCCESS;
+}
diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc
new file mode 100644
index 0000000..5903511
--- /dev/null
+++ b/src/rgw/rgw_object_expirer_core.cc
@@ -0,0 +1,263 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+using namespace std;
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_rados.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_replica_log.h"
+#include "rgw_object_expirer_core.h"
+
+#include "cls/lock/cls_lock_client.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+static string objexp_lock_name = "gc_process";
+
+int RGWObjectExpirer::init_bucket_info(const string& bucket_name,
+                                    const string& bucket_id,
+                                    RGWBucketInfo& bucket_info)
+{
+  RGWObjectCtx obj_ctx(store);
+  const string bucket_instance_id = bucket_name + ":" + bucket_id;
+
+  int ret = store->get_bucket_instance_info(obj_ctx, bucket_instance_id,
+          bucket_info, NULL, NULL);
+
+  return ret;
+}
+
+int RGWObjectExpirer::garbage_single_object(objexp_hint_entry& hint)
+{
+  RGWBucketInfo bucket_info;
+
+  int ret = init_bucket_info(hint.bucket_name, hint.bucket_id, bucket_info);
+  if (-ENOENT == ret) {
+    ldout(store->ctx(), 15) << "NOTICE: cannot find bucket = " \
+        << hint.bucket_name << ". The object must be already removed" << dendl;
+    return -ERR_PRECONDITION_FAILED;
+  } else if (ret < 0) {
+    ldout(store->ctx(),  1) << "ERROR: could not init bucket = " \
+        << hint.bucket_name << "due to ret = " << ret << dendl;
+    return ret;
+  }
+
+  RGWObjectCtx rctx(store);
+
+  rgw_obj_key key = hint.obj_key;
+  if (key.instance.empty()) {
+    key.instance = "null";
+  }
+
+  rgw_obj obj(bucket_info.bucket, key);
+  store->set_atomic(&rctx, obj);
+  ret = store->delete_obj(rctx, bucket_info, obj,
+          bucket_info.versioning_status(), 0, hint.exp_time);
+
+  return ret;
+}
+
+void RGWObjectExpirer::garbage_chunk(list<cls_timeindex_entry>& entries,      /* in  */
+                                  bool& need_trim)                         /* out */
+{
+  need_trim = false;
+
+  for (list<cls_timeindex_entry>::iterator iter = entries.begin();
+       iter != entries.end();
+       ++iter)
+  {
+    objexp_hint_entry hint;
+    ldout(store->ctx(), 15) << "got removal hint for: " << iter->key_ts.sec() \
+        << " - " << iter->key_ext << dendl;
+
+    int ret = store->objexp_hint_parse(*iter, hint);
+    if (ret < 0) {
+      ldout(store->ctx(), 1) << "cannot parse removal hint for " << hint.obj_key << dendl;
+      continue;
+    }
+
+    /* PRECOND_FAILED simply means that our hint is not valid.
+     * We can silently ignore that and move forward. */
+    ret = garbage_single_object(hint);
+    if (ret == -ERR_PRECONDITION_FAILED) {
+      ldout(store->ctx(), 15) << "not actual hint for object: " << hint.obj_key << dendl;
+    } else if (ret < 0) {
+      ldout(store->ctx(), 1) << "cannot remove expired object: " << hint.obj_key << dendl;
+    }
+
+    need_trim = true;
+  }
+
+  return;
+}
+
+void RGWObjectExpirer::trim_chunk(const string& shard,
+                               const utime_t& from,
+                               const utime_t& to)
+{
+  ldout(store->ctx(), 20) << "trying to trim removal hints to  " << to << dendl;
+
+  int ret = store->objexp_hint_trim(shard, from, to);
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "ERROR during trim: " << ret << dendl;
+  }
+
+  return;
+}
+
+void RGWObjectExpirer::process_single_shard(const string& shard,
+                                         const utime_t& last_run,
+                                         const utime_t& round_start)
+{
+  string marker;
+  string out_marker;
+  bool truncated = false;
+
+  CephContext *cct = store->ctx();
+  int num_entries = cct->_conf->rgw_objexp_chunk_size;
+
+  int max_secs = cct->_conf->rgw_objexp_gc_interval;
+  utime_t end = ceph_clock_now(cct);
+  end += max_secs;
+
+  rados::cls::lock::Lock l(objexp_lock_name);
+
+  utime_t time(max_secs, 0);
+  l.set_duration(time);
+
+  int ret = l.lock_exclusive(&store->objexp_pool_ctx, shard);
+  if (ret == -EBUSY) { /* already locked by another processor */
+    dout(5) << __func__ << "(): failed to acquire lock on " << shard << dendl;
+    return;
+  }
+  do {
+    list<cls_timeindex_entry> entries;
+    ret = store->objexp_hint_list(shard, last_run, round_start,
+                                      num_entries, marker, entries,
+                                      &out_marker, &truncated);
+    if (ret < 0) {
+      ldout(cct, 10) << "cannot get removal hints from shard: " << shard << dendl;
+      continue;
+    }
+
+    bool need_trim;
+    garbage_chunk(entries, need_trim);
+
+    if (need_trim) {
+      trim_chunk(shard, last_run, round_start);
+    }
+
+    utime_t now = ceph_clock_now(g_ceph_context);
+    if (now >= end) {
+      break;
+    }
+
+    marker = out_marker;
+  } while (truncated);
+
+  l.unlock(&store->objexp_pool_ctx, shard);
+  return;
+}
+
+void RGWObjectExpirer::inspect_all_shards(const utime_t& last_run, const utime_t& round_start)
+{
+  utime_t shard_marker;
+
+  CephContext *cct = store->ctx();
+  int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
+
+  for (int i = 0; i < num_shards; i++) {
+    string shard;
+    store->objexp_get_shard(i, shard);
+
+    ldout(store->ctx(), 20) << "proceeding shard = " << shard << dendl;
+
+    process_single_shard(shard, last_run, round_start);
+  }
+
+  return;
+}
+
+bool RGWObjectExpirer::going_down()
+{
+  return (down_flag.read() != 0);
+}
+
+void RGWObjectExpirer::start_processor()
+{
+  worker = new OEWorker(store->ctx(), this);
+  worker->create();
+}
+
+void RGWObjectExpirer::stop_processor()
+{
+  down_flag.set(1);
+  if (worker) {
+    worker->stop();
+    worker->join();
+  }
+  delete worker;
+  worker = NULL;
+}
+
+void *RGWObjectExpirer::OEWorker::entry() {
+  utime_t last_run;
+  do {
+    utime_t start = ceph_clock_now(cct);
+    ldout(cct, 2) << "object expiration: start" << dendl;
+    oe->inspect_all_shards(last_run, start);
+    ldout(cct, 2) << "object expiration: stop" << dendl;
+
+    last_run = start;
+
+    if (oe->going_down())
+      break;
+
+    utime_t end = ceph_clock_now(cct);
+    end -= start;
+    int secs = cct->_conf->rgw_objexp_gc_interval;
+
+    if (secs <= end.sec())
+      continue; // next round
+
+    secs -= end.sec();
+
+    lock.Lock();
+    cond.WaitInterval(cct, lock, utime_t(secs, 0));
+    lock.Unlock();
+  } while (!oe->going_down());
+
+  return NULL;
+}
+
+void RGWObjectExpirer::OEWorker::stop()
+{
+  Mutex::Locker l(lock);
+  cond.Signal();
+}
+
diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h
new file mode 100644
index 0000000..bd137fa
--- /dev/null
+++ b/src/rgw/rgw_object_expirer_core.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OBJEXP_H
+#define CEPH_OBJEXP_H
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "global/global_init.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_rados.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_replica_log.h"
+
+class RGWObjectExpirer {
+protected:
+  RGWRados *store;
+
+  int init_bucket_info(const string& bucket_name,
+                       const string& bucket_id,
+                       RGWBucketInfo& bucket_info);
+
+  class OEWorker : public Thread {
+    CephContext *cct;
+    RGWObjectExpirer *oe;
+    Mutex lock;
+    Cond cond;
+
+  public:
+    OEWorker(CephContext *_cct, RGWObjectExpirer *_oe) : cct(_cct), oe(_oe), lock("OEWorker") {}
+    void *entry();
+    void stop();
+  };
+
+  OEWorker *worker;
+  atomic_t down_flag;
+
+public:
+  RGWObjectExpirer(RGWRados *_store)
+    : store(_store)
+  {}
+
+  int garbage_single_object(objexp_hint_entry& hint);
+
+  void garbage_chunk(list<cls_timeindex_entry>& entries,      /* in  */
+                     bool& need_trim);                        /* out */
+
+  void trim_chunk(const string& shard,
+                  const utime_t& from,
+                  const utime_t& to);
+
+  void process_single_shard(const string& shard,
+                            const utime_t& last_run,
+                            const utime_t& round_start);
+
+  void inspect_all_shards(const utime_t& last_run,
+                          const utime_t& round_start);
+
+  bool going_down();
+  void start_processor();
+  void stop_processor();
+};
+#endif /* CEPH_OBJEXP_H */
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 4301bdd..fb4b6bb 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -260,13 +260,13 @@ static int get_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx,
     return get_bucket_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs,
                                        policy, instance_obj);
   }
-  return get_obj_policy_from_attr(cct, store, *(RGWObjectCtx *)ctx, bucket_info, bucket_attrs,
+  return get_obj_policy_from_attr(cct, store, *static_cast<RGWObjectCtx *>(ctx), bucket_info, bucket_attrs,
                                   policy, obj);
 }
 
 static int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map<string, bufferlist>& attrs)
 {
-  RGWRados::Object op_target(store, s->bucket_info, *(RGWObjectCtx *)s->obj_ctx, obj);
+  RGWRados::Object op_target(store, s->bucket_info, *static_cast<RGWObjectCtx *>(s->obj_ctx), obj);
   RGWRados::Object::Read read_op(&op_target);
 
   read_op.params.attrs = &attrs;
@@ -278,7 +278,7 @@ static int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map
 static int get_system_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map<string, bufferlist>& attrs,
                          uint64_t *obj_size, RGWObjVersionTracker *objv_tracker)
 {
-  RGWRados::SystemObject src(store, *(RGWObjectCtx *)s->obj_ctx, obj);
+  RGWRados::SystemObject src(store, *static_cast<RGWObjectCtx *>(s->obj_ctx), obj);
   RGWRados::SystemObject::Read rop(&src);
 
   rop.stat_params.attrs = &attrs;
@@ -345,7 +345,7 @@ static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bu
   int ret = 0;
   rgw_obj_key obj;
   RGWUserInfo bucket_owner_info;
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   string bi = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance");
   if (!bi.empty()) {
@@ -808,8 +808,13 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
   if (pos < 0)
     return -EINVAL;
 
-  string bucket_name = prefix_str.substr(0, pos);
-  string obj_prefix = prefix_str.substr(pos + 1);
+  string bucket_name_raw, bucket_name;
+  bucket_name_raw = prefix_str.substr(0, pos);
+  url_decode(bucket_name_raw, bucket_name);
+
+  string obj_prefix_raw, obj_prefix;
+  obj_prefix_raw = prefix_str.substr(pos + 1);
+  url_decode(obj_prefix_raw, obj_prefix);
 
   rgw_bucket bucket;
 
@@ -885,11 +890,57 @@ int RGWGetObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len)
   return send_response_data(bl, bl_ofs, bl_len);
 }
 
+bool RGWGetObj::prefetch_data()
+{
+  /* HEAD request, stop prefetch */
+  if (!get_data) {
+    return false;
+  }
+
+  bool prefetch_first_chunk = true;
+  range_str = s->info.env->get("HTTP_RANGE");
+
+  if(range_str) {
+    int r = parse_range(range_str, ofs, end, &partial_content);
+    /* error on parsing the range, stop prefetch and will fail in execute() */
+    if (r < 0) {
+      range_parsed = false;
+      return false;
+    } else {
+      range_parsed = true;
+    }
+    /* range get goes to shadow objects, stop prefetch */
+    if (ofs >= s->cct->_conf->rgw_max_chunk_size) {
+      prefetch_first_chunk = false;
+    }
+  }
+
+  return get_data && prefetch_first_chunk;
+}
 void RGWGetObj::pre_exec()
 {
   rgw_bucket_object_pre_exec(s);
 }
 
+static bool object_is_expired(map<string, bufferlist>& attrs) {
+  map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_DELETE_AT);
+  if (iter != attrs.end()) {
+    utime_t delete_at;
+    try {
+      ::decode(delete_at, iter->second);
+    } catch (buffer::error& err) {
+      dout(0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl;
+      return false;
+    }
+
+    if (delete_at <= ceph_clock_now(g_ceph_context)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 void RGWGetObj::execute()
 {
   utime_t start_time = s->time;
@@ -904,7 +955,7 @@ void RGWGetObj::execute()
   perfcounter->inc(l_rgw_get);
   int64_t new_ofs, new_end;
 
-  RGWRados::Object op_target(store, s->bucket_info, *(RGWObjectCtx *)s->obj_ctx, obj);
+  RGWRados::Object op_target(store, s->bucket_info, *static_cast<RGWObjectCtx *>(s->obj_ctx), obj);
   RGWRados::Object::Read read_op(&op_target);
 
   ret = get_params();
@@ -941,6 +992,13 @@ void RGWGetObj::execute()
     return;
   }
 
+  /* Check whether the object has expired. Swift API documentation
+   * states that we should return 404 Not Found in such a case. */
+  if (need_object_expiration() && object_is_expired(attrs)) {
+    ret = -ENOENT;
+    goto done_err;
+  }
+
   ofs = new_ofs;
   end = new_end;
 
@@ -966,9 +1024,12 @@ done_err:
 int RGWGetObj::init_common()
 {
   if (range_str) {
-    int r = parse_range(range_str, ofs, end, &partial_content);
-    if (r < 0)
-      return r;
+    /* skip parsing if the range was already parsed during prefetch */
+    if (!range_parsed) {
+      int r = parse_range(range_str, ofs, end, &partial_content);
+      if (r < 0)
+        return r;
+    }
   }
   if (if_mod) {
     if (parse_time(if_mod, &mod_time) < 0)
@@ -999,24 +1060,28 @@ void RGWListBuckets::execute()
   uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk;
 
   ret = get_params();
-  if (ret < 0)
+  if (ret < 0) {
     goto send_end;
+  }
+
+  if (supports_account_metadata()) {
+    ret = rgw_get_user_attrs_by_uid(store, s->user.user_id, attrs);
+    if (ret < 0) {
+      goto send_end;
+    }
+  }
 
   do {
     RGWUserBuckets buckets;
     uint64_t read_count;
-    if (limit > 0)
+    if (limit >= 0) {
       read_count = min(limit - total_count, (uint64_t)max_buckets);
-    else
+    } else {
       read_count = max_buckets;
+    }
 
     ret = rgw_read_user_buckets(store, s->user.user_id, buckets,
-                                marker, read_count, should_get_stats());
-
-    if (!started) {
-      send_response_begin(buckets.count() > 0);
-      started = true;
-    }
+                                marker, read_count, should_get_stats(), 0);
 
     if (ret < 0) {
       /* hmm.. something wrong here.. the user was authenticated, so it
@@ -1025,10 +1090,24 @@ void RGWListBuckets::execute()
       break;
     }
     map<string, RGWBucketEnt>& m = buckets.get_buckets();
-
+    map<string, RGWBucketEnt>::iterator iter;
+    for (iter = m.begin(); iter != m.end(); ++iter) {
+      RGWBucketEnt& bucket = iter->second;
+      buckets_size += bucket.size;
+      buckets_size_rounded += bucket.size_rounded;
+      buckets_objcount += bucket.count;
+
+      marker = iter->first;
+    }
+    buckets_count += m.size();
     total_count += m.size();
 
-    done = (m.size() < read_count || (limit > 0 && total_count == limit));
+    done = (m.size() < read_count || (limit >= 0 && total_count >= (uint64_t)limit));
+
+    if (!started) {
+      send_response_begin(buckets.count() > 0);
+      started = true;
+    }
 
     if (!m.empty()) {
       send_response_data(buckets);
@@ -1214,7 +1293,7 @@ void RGWListBucket::execute()
     ret = store->update_containers_stats(m);
     if (ret > 0) {
       bucket = m.begin()->second;
-    } 
+    }
   }
 
   RGWRados::Bucket target(store, s->bucket);
@@ -1223,6 +1302,7 @@ void RGWListBucket::execute()
   list_op.params.prefix = prefix;
   list_op.params.delim = delimiter;
   list_op.params.marker = marker;
+  list_op.params.end_marker = end_marker;
   list_op.params.list_versions = list_versions;
 
   ret = list_op.list_objects(max, &objs, &common_prefixes, &is_truncated);
@@ -1318,7 +1398,7 @@ void RGWCreateBucket::execute()
   }
 
   /* we need to make sure we read bucket info, it's not read before for this specific request */
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
   ret = store->get_bucket_info(obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs);
   if (ret < 0 && ret != -ENOENT)
     return;
@@ -1512,7 +1592,7 @@ class RGWPutObjProcessor_Multipart : public RGWPutObjProcessor_Atomic
 protected:
   int prepare(RGWRados *store, string *oid_rand);
   int do_complete(string& etag, time_t *mtime, time_t set_mtime,
-                  map<string, bufferlist>& attrs,
+                  map<string, bufferlist>& attrs, time_t delete_at,
                   const char *if_match = NULL, const char *if_nomatch = NULL);
 
 public:
@@ -1586,7 +1666,7 @@ static bool is_v2_upload_id(const string& upload_id)
 }
 
 int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_t set_mtime,
-                                              map<string, bufferlist>& attrs,
+                                              map<string, bufferlist>& attrs, time_t delete_at,
                                               const char *if_match, const char *if_nomatch)
 {
   complete_writing_data();
@@ -1597,6 +1677,7 @@ int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_
   head_obj_op.meta.set_mtime = set_mtime;
   head_obj_op.meta.mtime = mtime;
   head_obj_op.meta.owner = s->owner.get_id();
+  head_obj_op.meta.delete_at = delete_at;
 
   int r = head_obj_op.write_meta(s->obj_size, attrs);
   if (r < 0)
@@ -1649,8 +1730,8 @@ RGWPutObjProcessor *RGWPutObj::select_processor(RGWObjectCtx& obj_ctx, bool *is_
 
   if (!multipart) {
     processor = new RGWPutObjProcessor_Atomic(obj_ctx, s->bucket_info, s->bucket, s->object.name, part_size, s->req_id, s->bucket_info.versioning_enabled());
-    ((RGWPutObjProcessor_Atomic *)processor)->set_olh_epoch(olh_epoch);
-    ((RGWPutObjProcessor_Atomic *)processor)->set_version_id(version_id);
+    (static_cast<RGWPutObjProcessor_Atomic *>(processor))->set_olh_epoch(olh_epoch);
+    (static_cast<RGWPutObjProcessor_Atomic *>(processor))->set_version_id(version_id);
   } else {
     processor = new RGWPutObjProcessor_Multipart(obj_ctx, s->bucket_info, part_size, s);
   }
@@ -1719,6 +1800,17 @@ static int get_system_versioning_params(req_state *s, uint64_t *olh_epoch, strin
   return 0;
 }
 
+static void encode_delete_at_attr(time_t delete_at, map<string, bufferlist>& attrs)
+{
+  if (delete_at == 0) {
+    return;
+  }
+
+  bufferlist delatbl;
+  ::encode(utime_t(delete_at, 0), delatbl);
+  attrs[RGW_ATTR_DELETE_AT] = delatbl;
+}
+
 void RGWPutObj::execute()
 {
   RGWPutObjProcessor *processor = NULL;
@@ -1743,11 +1835,15 @@ void RGWPutObj::execute()
   }
 
   ret = get_params();
-  if (ret < 0)
+  if (ret < 0) {
+    ldout(s->cct, 20) << "get_params() returned ret=" << ret << dendl;
     goto done;
+  }
 
   ret = get_system_versioning_params(s, &olh_epoch, &version_id);
   if (ret < 0) {
+    ldout(s->cct, 20) << "get_system_versioning_params() returned ret=" \
+        << ret << dendl;
     goto done;
   }
 
@@ -1772,6 +1868,7 @@ void RGWPutObj::execute()
     ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
                              user_quota, bucket_quota, s->content_length);
     if (ret < 0) {
+      ldout(s->cct, 20) << "check_quota() returned ret=" << ret << dendl;
       goto done;
     }
   }
@@ -1781,11 +1878,13 @@ void RGWPutObj::execute()
     supplied_md5[sizeof(supplied_md5) - 1] = '\0';
   }
 
-  processor = select_processor(*(RGWObjectCtx *)s->obj_ctx, &multipart);
+  processor = select_processor(*static_cast<RGWObjectCtx *>(s->obj_ctx), &multipart);
 
   ret = processor->prepare(store, NULL);
-  if (ret < 0)
+  if (ret < 0) {
+    ldout(s->cct, 20) << "processor->prepare() returned ret=" << ret << dendl;
     goto done;
+  }
 
   do {
     bufferlist data;
@@ -1823,7 +1922,7 @@ void RGWPutObj::execute()
       /* restart processing with different oid suffix */
 
       dispose_processor(processor);
-      processor = select_processor(*(RGWObjectCtx *)s->obj_ctx, &multipart);
+      processor = select_processor(*static_cast<RGWObjectCtx *>(s->obj_ctx), &multipart);
 
       string oid_rand;
       char buf[33];
@@ -1855,6 +1954,7 @@ void RGWPutObj::execute()
   ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
                            user_quota, bucket_quota, s->obj_size);
   if (ret < 0) {
+    ldout(s->cct, 20) << "second check_quota() returned ret=" << ret << dendl;
     goto done;
   }
 
@@ -1916,8 +2016,10 @@ void RGWPutObj::execute()
   }
 
   rgw_get_request_metadata(s->cct, s->info, attrs);
+  encode_delete_at_attr(delete_at, attrs);
+
+  ret = processor->complete(etag, &mtime, 0, attrs, delete_at, if_match, if_nomatch);
 
-  ret = processor->complete(etag, &mtime, 0, attrs, if_match, if_nomatch);
 done:
   dispose_processor(processor);
   perfcounter->tinc(l_rgw_put_lat,
@@ -1979,7 +2081,7 @@ void RGWPostObj::execute()
     goto done;
   }
 
-  processor = select_processor(*(RGWObjectCtx *)s->obj_ctx);
+  processor = select_processor(*static_cast<RGWObjectCtx *>(s->obj_ctx));
 
   ret = processor->prepare(store, NULL);
   if (ret < 0)
@@ -2020,6 +2122,7 @@ void RGWPostObj::execute()
     goto done;
   }
 
+  processor->complete_hash(&hash);
   hash.Final(m);
   buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
 
@@ -2036,149 +2139,277 @@ void RGWPostObj::execute()
     attrs[RGW_ATTR_CONTENT_TYPE] = ct_bl;
   }
 
-  ret = processor->complete(etag, NULL, 0, attrs);
+  ret = processor->complete(etag, NULL, 0, attrs, delete_at);
 
 done:
   dispose_processor(processor);
 }
 
 
-int RGWPutMetadata::verify_permission()
+static void populate_with_generic_attrs(const req_state * const s,
+                                        map<string, bufferlist>& out_attrs)
 {
-  if (!s->object.empty()) {
-    if (!verify_object_permission(s, RGW_PERM_WRITE))
-      return -EACCES;
-  } else {
-    if (!verify_bucket_permission(s, RGW_PERM_WRITE))
-      return -EACCES;
+  map<string, string>::const_iterator giter;
+
+  for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) {
+    bufferlist& attrbl = out_attrs[giter->first];
+    const string& val = giter->second;
+    attrbl.clear();
+    attrbl.append(val.c_str(), val.size() + 1);
   }
+}
+
+static void prepare_add_del_attrs(const map<string, bufferlist>& orig_attrs,
+                                  map<string, bufferlist>& out_attrs,
+                                  map<string, bufferlist>& out_rmattrs)
+{
+  map<string, bufferlist>::const_iterator iter;
+
+  for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) {
+    const string& name = iter->first;
+    /* check if the attr is user-defined metadata item */
+    if (name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, RGW_ATTR_META_PREFIX) == 0) {
+      /* for the objects all existing meta attrs have to be removed */
+      out_rmattrs[name] = iter->second;
+    } else if (out_attrs.find(name) == out_attrs.end()) {
+      out_attrs[name] = iter->second;
+    }
+  }
+}
+
+static void prepare_add_del_attrs(const map<string, bufferlist>& orig_attrs,
+                                  const set<string>& rmattr_names,
+                                  map<string, bufferlist>& out_attrs,
+                                  map<string, bufferlist>& out_rmattrs)
+{
+  map<string, bufferlist>::const_iterator iter;
 
+  for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) {
+    const string& name = iter->first;
+    /* check if the attr is user-defined metadata item */
+    if (name.compare(0, strlen(RGW_ATTR_META_PREFIX), RGW_ATTR_META_PREFIX) == 0) {
+      /* for the buckets all existing meta attrs are preserved,
+         except those that are listed in rmattr_names. */
+      if (rmattr_names.find(name) != rmattr_names.end()) {
+        map<string, bufferlist>::iterator aiter = out_attrs.find(name);
+        if (aiter != out_attrs.end()) {
+          out_attrs.erase(aiter);
+        }
+        out_rmattrs[name] = iter->second;
+      }
+    } else if (out_attrs.find(name) == out_attrs.end()) {
+      out_attrs[name] = iter->second;
+    }
+  }
+}
+
+int RGWPutMetadataAccount::handle_temp_url_update(const map<int, string>& temp_url_keys) {
+  RGWUserAdminOpState user_op;
+  user_op.set_user_id(s->user.user_id);
+
+  map<int, string>::const_iterator iter;
+  for (iter = temp_url_keys.begin(); iter != temp_url_keys.end(); ++iter) {
+    user_op.set_temp_url_key(iter->second, iter->first);
+  }
+
+  RGWUser user;
+  ret = user.init(store, user_op);
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "ERROR: could not init user ret=" << ret << dendl;
+    return ret;
+  }
+
+  string err_msg;
+  ret = user.modify(user_op, &err_msg);
+  if (ret < 0) {
+    ldout(store->ctx(), 10) << "user.modify() returned " << ret << ": " << err_msg << dendl;
+    return ret;
+  }
   return 0;
 }
 
-void RGWPutMetadata::pre_exec()
+int RGWPutMetadataAccount::verify_permission()
 {
-  rgw_bucket_object_pre_exec(s);
+  if (!rgw_user_is_authenticated(s->user)) {
+    return -EACCES;
+  }
+  return 0;
 }
 
-void RGWPutMetadata::execute()
+void RGWPutMetadataAccount::filter_out_temp_url(map<string, bufferlist>& add_attrs,
+                                                const set<string>& rmattr_names,
+                                                map<int, string>& temp_url_keys)
 {
-  const char *meta_prefix = RGW_ATTR_META_PREFIX;
-  int meta_prefix_len = sizeof(RGW_ATTR_META_PREFIX) - 1;
-  map<string, bufferlist> attrs, orig_attrs, rmattrs;
   map<string, bufferlist>::iterator iter;
-  bufferlist bl, cors_bl;
 
-  rgw_obj obj(s->bucket, s->object);
+  iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY1);
+  if (iter != add_attrs.end()) {
+    temp_url_keys[0] = iter->second.c_str();
+    add_attrs.erase(iter);
+  }
 
-  store->set_atomic(s->obj_ctx, obj);
+  iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY2);
+  if (iter != add_attrs.end()) {
+    temp_url_keys[1] = iter->second.c_str();
+    add_attrs.erase(iter);
+  }
+
+  set<string>::const_iterator riter;
+  for(riter = rmattr_names.begin(); riter != rmattr_names.end(); ++riter) {
+    const string& name = *riter;
+
+    if (name.compare(RGW_ATTR_TEMPURL_KEY1) == 0) {
+      temp_url_keys[0] = string();
+    }
+    if (name.compare(RGW_ATTR_TEMPURL_KEY2) == 0) {
+      temp_url_keys[1] = string();
+    }
+  }
+}
+
+void RGWPutMetadataAccount::execute()
+{
+  rgw_obj obj;
+  map<string, bufferlist> attrs, orig_attrs, rmattrs;
+  RGWObjVersionTracker acct_op_tracker;
+
+  /* Get the name of raw object which stores the metadata in its xattrs. */
+  string buckets_obj_id;
+  rgw_get_buckets_obj(s->user.user_id, buckets_obj_id);
+  obj = rgw_obj(store->zone.user_uid_pool, buckets_obj_id);
 
   ret = get_params();
-  if (ret < 0)
+  if (ret < 0) {
     return;
+  }
 
-  RGWObjVersionTracker *ptracker = NULL;
-  bool is_object_op = (!s->object.empty());
-
-  rgw_get_request_metadata(s->cct, s->info, attrs, is_object_op);
+  rgw_get_request_metadata(s->cct, s->info, attrs, false);
+  rgw_get_user_attrs_by_uid(store, s->user.user_id, orig_attrs, &acct_op_tracker);
+  prepare_add_del_attrs(orig_attrs, rmattr_names, attrs, rmattrs);
+  populate_with_generic_attrs(s, attrs);
 
-  if (is_object_op) {
-    /* check if obj exists, read orig attrs */
-    ret = get_obj_attrs(store, s, obj, orig_attrs);
-    if (ret < 0)
+  /* Handle the TempURL-related stuff. */
+  map<int, string> temp_url_keys;
+  filter_out_temp_url(attrs, rmattr_names, temp_url_keys);
+  if (!temp_url_keys.empty()) {
+    if (s->perm_mask != RGW_PERM_FULL_CONTROL) {
+      ret = -EPERM;
       return;
-  } else {
-    ptracker = &s->bucket_info.objv_tracker;
-    orig_attrs = s->bucket_attrs;
+    }
+  }
 
-    if (!placement_rule.empty() &&
-        placement_rule != s->bucket_info.placement_rule) {
-      ret = -EEXIST;
+  ret = rgw_store_user_attrs(store, s->user.user_id, attrs, &rmattrs, &acct_op_tracker);
+  if (ret < 0) {
+    return;
+  }
+
+  if (!temp_url_keys.empty()) {
+    ret = handle_temp_url_update(temp_url_keys);
+    if (ret < 0) {
       return;
     }
   }
+}
 
-  for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) {
-    const string& name = iter->first;
-    /* check if the attr is user-defined metadata item */
-    if (name.compare(0, meta_prefix_len, meta_prefix) == 0) {
-      if (is_object_op) {
-        /* for the objects all existing meta attrs have to be removed */
-        rmattrs[name] = iter->second;
-      } else {
-        /* for the buckets all existing meta attrs are preserved,
-           except those that are listed in rmattr_names. */
-        if (rmattr_names.find(name) != rmattr_names.end()) {
-          map<string, bufferlist>::iterator aiter = attrs.find(name);
-          if (aiter != attrs.end()) {
-            attrs.erase(aiter);
-          }
-          rmattrs[name] = iter->second;
-        }
-      }
-    } else if (attrs.find(name) == attrs.end()) {
-      attrs[name] = iter->second;
-    }
+int RGWPutMetadataBucket::verify_permission()
+{
+  if (!verify_bucket_permission(s, RGW_PERM_WRITE)) {
+    return -EACCES;
   }
 
-  map<string, string>::iterator giter;
-  for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) {
-    bufferlist& attrbl = attrs[giter->first];
-    const string& val = giter->second;
-    attrbl.clear();
-    attrbl.append(val.c_str(), val.size() + 1);
+  return 0;
+}
+
+void RGWPutMetadataBucket::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWPutMetadataBucket::execute()
+{
+  rgw_obj obj(s->bucket, s->object);
+  map<string, bufferlist> attrs, orig_attrs, rmattrs;
+
+  ret = get_params();
+  if (ret < 0) {
+    return;
+  }
+
+  rgw_get_request_metadata(s->cct, s->info, attrs, false);
+
+  if (!placement_rule.empty() &&
+      placement_rule != s->bucket_info.placement_rule) {
+    ret = -EEXIST;
+    return;
   }
 
+  orig_attrs = s->bucket_attrs;
+  prepare_add_del_attrs(orig_attrs, rmattr_names, attrs, rmattrs);
+  populate_with_generic_attrs(s, attrs);
+
   if (has_policy) {
+    bufferlist bl;
     policy.encode(bl);
     attrs[RGW_ATTR_ACL] = bl;
   }
+
   if (has_cors) {
-    cors_config.encode(cors_bl);
-    attrs[RGW_ATTR_CORS] = cors_bl;
-  }
-  if (is_object_op) {
-    ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, ptracker);
-  } else {
-    ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &rmattrs, ptracker);
+    bufferlist bl;
+    cors_config.encode(bl);
+    attrs[RGW_ATTR_CORS] = bl;
   }
+
+  ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &rmattrs,
+          &s->bucket_info.objv_tracker);
 }
 
-int RGWSetTempUrl::verify_permission()
+int RGWPutMetadataObject::verify_permission()
 {
-  if (s->perm_mask != RGW_PERM_FULL_CONTROL)
+  if (!verify_object_permission(s, RGW_PERM_WRITE)) {
     return -EACCES;
+  }
 
   return 0;
 }
 
-void RGWSetTempUrl::execute()
+void RGWPutMetadataObject::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWPutMetadataObject::execute()
 {
+  rgw_obj obj(s->bucket, s->object);
+  map<string, bufferlist> attrs, orig_attrs, rmattrs;
+
+  store->set_atomic(s->obj_ctx, obj);
+
   ret = get_params();
-  if (ret < 0)
+  if (ret < 0) {
     return;
-
-  RGWUserAdminOpState user_op;
-  user_op.set_user_id(s->user.user_id);
-  map<int, string>::iterator iter;
-  for (iter = temp_url_keys.begin(); iter != temp_url_keys.end(); ++iter) {
-    user_op.set_temp_url_key(iter->second, iter->first);
   }
 
-  RGWUser user;
-  ret = user.init(store, user_op);
+  rgw_get_request_metadata(s->cct, s->info, attrs);
+  /* check if obj exists, read orig attrs */
+  ret = get_obj_attrs(store, s, obj, orig_attrs);
   if (ret < 0) {
-    ldout(store->ctx(), 0) << "ERROR: could not init user ret=" << ret << dendl;
     return;
   }
-  string err_msg;
-  ret = user.modify(user_op, &err_msg);
-  if (ret < 0) {
-    ldout(store->ctx(), 10) << "user.modify() returned " << ret << ": " << err_msg << dendl;
+
+  /* Check whether the object has expired. Swift API documentation
+   * states that we should return 404 Not Found in such a case. */
+  if (need_object_expiration() && object_is_expired(orig_attrs)) {
+    ret = -ENOENT;
     return;
   }
-}
 
+  /* Filter currently existing attributes. */
+  prepare_add_del_attrs(orig_attrs, attrs, rmattrs);
+  populate_with_generic_attrs(s, attrs);
+  encode_delete_at_attr(delete_at, attrs);
+
+  ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, NULL);
+}
 
 int RGWDeleteObj::verify_permission()
 {
@@ -2198,7 +2429,7 @@ void RGWDeleteObj::execute()
   ret = -EINVAL;
   rgw_obj obj(s->bucket, s->object);
   if (!s->object.empty()) {
-    RGWObjectCtx *obj_ctx = (RGWObjectCtx *)s->obj_ctx;
+    RGWObjectCtx *obj_ctx = static_cast<RGWObjectCtx *>(s->obj_ctx);
 
     obj_ctx->set_atomic(obj);
 
@@ -2280,7 +2511,7 @@ int RGWCopyObj::verify_permission()
   }
   map<string, bufferlist> src_attrs;
 
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   ret = store->get_bucket_info(obj_ctx, src_bucket_name, src_bucket_info, NULL, &src_attrs);
   if (ret < 0)
@@ -2406,10 +2637,12 @@ void RGWCopyObj::execute()
   rgw_obj src_obj(src_bucket, src_object);
   rgw_obj dst_obj(dest_bucket, dest_object);
 
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
   obj_ctx.set_atomic(src_obj);
   obj_ctx.set_atomic(dst_obj);
 
+  encode_delete_at_attr(delete_at, attrs);
+
   ret = store->copy_obj(obj_ctx,
                         s->user.user_id,
                         client_id,
@@ -2429,6 +2662,7 @@ void RGWCopyObj::execute()
                         attrs_mod,
                         attrs, RGW_OBJ_CATEGORY_MAIN,
                         olh_epoch,
+			delete_at,
                         (version_id.empty() ? NULL : &version_id),
                         &s->req_id, /* use req_id as tag */
                         &etag,
@@ -2787,7 +3021,7 @@ void RGWInitMultipart::execute()
     obj.set_in_extra_data(true);
     obj.index_hash_source = s->object.name;
 
-    RGWRados::Object op_target(store, s->bucket_info, *(RGWObjectCtx *)s->obj_ctx, obj);
+    RGWRados::Object op_target(store, s->bucket_info, *static_cast<RGWObjectCtx *>(s->obj_ctx), obj);
     op_target.set_versioning_disabled(true); /* no versioning for multipart meta */
 
     RGWRados::Object::Write obj_op(&op_target);
@@ -2811,8 +3045,12 @@ static int get_multipart_info(RGWRados *store, struct req_state *s, string& meta
   obj.set_in_extra_data(true);
 
   int ret = get_obj_attrs(store, s, obj, attrs);
-  if (ret < 0)
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      return -ERR_NO_SUCH_UPLOAD;
+    }
     return ret;
+  }
 
   if (policy) {
     for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
@@ -2854,7 +3092,7 @@ static int list_multipart_parts(RGWRados *store, struct req_state *s,
   int ret;
 
   parts.clear();
-  
+
   if (sorted_omap) {
     string p;
     p = "part.";
@@ -2973,24 +3211,29 @@ void RGWCompleteMultipart::execute()
     return;
   }
 
-  if (!data) {
-    ret = -EINVAL;
+  if (!data || !len) {
+    ret = -ERR_MALFORMED_XML;
     return;
   }
 
   if (!parser.init()) {
-    ret = -EINVAL;
+    ret = -EIO;
     return;
   }
 
   if (!parser.parse(data, len, 1)) {
-    ret = -EINVAL;
+    ret = -ERR_MALFORMED_XML;
     return;
   }
 
   parts = static_cast<RGWMultiCompleteUpload *>(parser.find_first("CompleteMultipartUpload"));
-  if (!parts) {
-    ret = -EINVAL;
+  if (!parts || parts->parts.empty()) {
+    ret = -ERR_MALFORMED_XML;
+    return;
+  }
+
+  if ((int)parts->parts.size() > s->cct->_conf->rgw_multipart_part_upload_limit) {
+    ret = -ERANGE;
     return;
   }
 
@@ -3099,11 +3342,11 @@ void RGWCompleteMultipart::execute()
     store->gen_rand_obj_instance_name(&target_obj);
   }
 
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   obj_ctx.set_atomic(target_obj);
 
-  RGWRados::Object op_target(store, s->bucket_info, *(RGWObjectCtx *)s->obj_ctx, target_obj);
+  RGWRados::Object op_target(store, s->bucket_info, *static_cast<RGWObjectCtx *>(s->obj_ctx), target_obj);
   RGWRados::Object::Write obj_op(&op_target);
 
   obj_op.meta.manifest = &manifest;
@@ -3118,7 +3361,7 @@ void RGWCompleteMultipart::execute()
     return;
 
   // remove the upload obj
-  int r = store->delete_obj(*(RGWObjectCtx *)s->obj_ctx, s->bucket_info, meta_obj, 0);
+  int r = store->delete_obj(*static_cast<RGWObjectCtx *>(s->obj_ctx), s->bucket_info, meta_obj, 0);
   if (r < 0) {
     ldout(store->ctx(), 0) << "WARNING: failed to remove object " << meta_obj << dendl;
   }
@@ -3164,7 +3407,7 @@ void RGWAbortMultipart::execute()
   int max_parts = 1000;
 
 
-  RGWObjectCtx *obj_ctx = (RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx *obj_ctx = static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   meta_obj.init_ns(s->bucket, meta_oid, mp_ns);
   meta_obj.set_in_extra_data(true);
@@ -3337,7 +3580,7 @@ void RGWDeleteMultiObj::execute()
   vector<rgw_obj_key>::iterator iter;
   RGWMultiDelXMLParser parser;
   int num_processed = 0;
-  RGWObjectCtx *obj_ctx = (RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx *obj_ctx = static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   ret = get_params();
   if (ret < 0) {
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index 0726fc3..7a196a3 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -44,7 +44,9 @@ enum RGWOpType {
   RGW_OP_DELETE_BUCKET,
   RGW_OP_PUT_OBJ,
   RGW_OP_POST_OBJ,
-  RGW_OP_PUT_METADATA,
+  RGW_OP_PUT_METADATA_ACCOUNT,
+  RGW_OP_PUT_METADATA_BUCKET,
+  RGW_OP_PUT_METADATA_OBJECT,
   RGW_OP_SET_TEMPURL,
   RGW_OP_DELETE_OBJ,
   RGW_OP_COPY_OBJ,
@@ -132,6 +134,7 @@ protected:
   int ret;
   bool get_data;
   bool partial_content;
+  bool range_parsed;
   rgw_obj obj;
   utime_t gc_invalidate_time;
 
@@ -154,10 +157,11 @@ public:
     unmod_ptr = NULL;
     get_data = false;
     partial_content = false;
+    range_parsed = false;
     ret = 0;
  }
 
-  virtual bool prefetch_data() { return get_data; }
+  bool prefetch_data();
 
   void set_get_data(bool get_data) {
     this->get_data = get_data;
@@ -176,6 +180,7 @@ public:
   virtual const string name() { return "get_obj"; }
   virtual RGWOpType get_type() { return RGW_OP_GET_OBJ; }
   virtual uint32_t op_mask() { return RGW_OP_TYPE_READ; }
+  virtual bool need_object_expiration() { return false; }
 };
 
 #define RGW_LIST_BUCKETS_LIMIT_MAX 10000
@@ -185,12 +190,21 @@ protected:
   int ret;
   bool sent_data;
   string marker;
-  uint64_t limit;
+  int64_t limit;
   uint64_t limit_max;
+  uint32_t buckets_count;
+  uint64_t buckets_objcount;
+  uint64_t buckets_size;
+  uint64_t buckets_size_rounded;
+  map<string, bufferlist> attrs;
 
 public:
   RGWListBuckets() : ret(0), sent_data(false) {
     limit = limit_max = RGW_LIST_BUCKETS_LIMIT_MAX;
+    buckets_count = 0;
+    buckets_objcount = 0;
+    buckets_size = 0;
+    buckets_size_rounded = 0;
   }
 
   int verify_permission();
@@ -203,6 +217,7 @@ public:
   virtual void send_response() {}
 
   virtual bool should_get_stats() { return false; }
+  virtual bool supports_account_metadata() { return false; }
 
   virtual const string name() { return "list_buckets"; }
   virtual RGWOpType get_type() { return RGW_OP_LIST_BUCKETS; }
@@ -227,7 +242,7 @@ public:
   }
 
   int verify_permission();
-  void execute();
+  virtual void execute();
 
   virtual void send_response() = 0;
   virtual const string name() { return "stat_account"; }
@@ -241,8 +256,10 @@ protected:
   string prefix;
   rgw_obj_key marker; 
   rgw_obj_key next_marker; 
+  rgw_obj_key end_marker;
   string max_keys;
   string delimiter;
+  string encoding_type;
   bool list_versions;
   int max;
   int ret;
@@ -419,6 +436,8 @@ protected:
   uint64_t olh_epoch;
   string version_id;
 
+  time_t delete_at;
+
 public:
   RGWPutObj() {
     ret = 0;
@@ -432,6 +451,7 @@ public:
     mtime = 0;
     user_manifest_parts_hash = NULL;
     olh_epoch = 0;
+    delete_at = 0;
   }
 
   virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
@@ -472,11 +492,12 @@ protected:
   string content_type;
   RGWAccessControlPolicy policy;
   map<string, bufferlist> attrs;
+  time_t delete_at;
 
 public:
   RGWPostObj() : min_len(0), max_len(LLONG_MAX), ret(0), len(0), ofs(0),
 		 supplied_md5_b64(NULL), supplied_etag(NULL),
-		 data_pending(false) {}
+		 data_pending(false), delete_at(0) {}
 
   virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
     RGWOp::init(store, s, h);
@@ -498,7 +519,37 @@ public:
   virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; }
 };
 
-class RGWPutMetadata : public RGWOp {
+class RGWPutMetadataAccount : public RGWOp {
+protected:
+  int ret;
+  set<string> rmattr_names;
+  RGWAccessControlPolicy policy;
+
+public:
+  RGWPutMetadataAccount()
+    : ret(0)
+  {}
+
+  virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
+    RGWOp::init(store, s, h);
+    policy.set_ctx(s->cct);
+  }
+  int verify_permission();
+  void pre_exec() { return; }
+  void execute();
+
+  virtual int get_params() = 0;
+  virtual void send_response() = 0;
+  virtual void filter_out_temp_url(map<string, bufferlist>& add_attrs,
+                                   const set<string>& rmattr_names,
+                                   map<int, string>& temp_url_keys);
+  virtual int handle_temp_url_update(const map<int, string>& temp_url_keys);
+  virtual const string name() { return "put_account_metadata"; }
+  virtual RGWOpType get_type() { return RGW_OP_PUT_METADATA_ACCOUNT; }
+  virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; }
+};
+
+class RGWPutMetadataBucket : public RGWOp {
 protected:
   int ret;
   set<string> rmattr_names;
@@ -508,11 +559,9 @@ protected:
   string placement_rule;
 
 public:
-  RGWPutMetadata() {
-    has_cors = false;
-    has_policy = false;
-    ret = 0;
-  }
+  RGWPutMetadataBucket()
+    : ret(0), has_policy(false), has_cors(false)
+  {}
 
   virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
     RGWOp::init(store, s, h);
@@ -524,25 +573,37 @@ public:
 
   virtual int get_params() = 0;
   virtual void send_response() = 0;
-  virtual const string name() { return "put_obj_metadata"; }
-  virtual RGWOpType get_type() { return RGW_OP_PUT_METADATA; }
+  virtual const string name() { return "put_bucket_metadata"; }
+  virtual RGWOpType get_type() { return RGW_OP_PUT_METADATA_BUCKET; }
   virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; }
 };
 
-class RGWSetTempUrl : public RGWOp {
+class RGWPutMetadataObject : public RGWOp {
 protected:
   int ret;
-  map<int, string> temp_url_keys;
+  RGWAccessControlPolicy policy;
+  string placement_rule;
+  time_t delete_at;
+
 public:
-  RGWSetTempUrl() : ret(0) {}
+  RGWPutMetadataObject()
+    : ret(0), delete_at(0)
+  {}
 
+  virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
+    RGWOp::init(store, s, h);
+    policy.set_ctx(s->cct);
+  }
   int verify_permission();
+  void pre_exec();
   void execute();
 
   virtual int get_params() = 0;
   virtual void send_response() = 0;
-  virtual const string name() { return "set_temp_url"; }
-  virtual RGWOpType get_type() { return RGW_OP_SET_TEMPURL; }
+  virtual const string name() { return "put_obj_metadata"; }
+  virtual RGWOpType get_type() { return RGW_OP_PUT_METADATA_OBJECT; }
+  virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; }
+  virtual bool need_object_expiration() { return false; }
 };
 
 class RGWDeleteObj : public RGWOp {
@@ -601,6 +662,7 @@ protected:
   string version_id;
   uint64_t olh_epoch;
 
+  time_t delete_at;
 
   int init_common();
 
@@ -623,6 +685,7 @@ public:
     attrs_mod = RGWRados::ATTRSMOD_NONE;
     last_ofs = 0;
     olh_epoch = 0;
+    delete_at = 0;
   }
 
   static bool parse_copy_location(const string& src, string& bucket_name, rgw_obj_key& object);
diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/rgw_orphan.cc
index 2818d79..0bf34d8 100644
--- a/src/rgw/rgw_orphan.cc
+++ b/src/rgw/rgw_orphan.cc
@@ -277,9 +277,28 @@ int RGWOrphanSearch::build_all_oids_index()
     string oid = i->get_oid();
     string locator = i->get_locator();
 
-    string name = oid;
-    if (locator.size())
-      name += " (@" + locator + ")";  
+    ssize_t pos = oid.find('_');
+    if (pos < 0) {
+      cout << "unidentified oid: " << oid << ", skipping" << std::endl;
+      /* what is this object, oids should be in the format of <bucket marker>_<obj>,
+       * skip this entry
+       */
+      continue;
+    }
+    string stripped_oid = oid.substr(pos + 1);
+    string name, instance, ns;
+    if (!rgw_obj::parse_raw_oid(stripped_oid, &name, &instance, &ns)) {
+      cout << "cannot parse oid: " << oid << ", skipping" << std::endl;
+      continue;
+    }
+
+    if (ns.empty()) {
+      /* skipping head objects, we don't want to remove these as they are mutable and
+       * cleaning them up is racy (can race with object removal and a later recreation)
+       */
+      cout << "skipping head object: oid=" << oid << std::endl;
+      continue;
+    }
 
     string oid_fp = obj_fingerprint(oid);
 
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
index 910da2f..85536bf 100644
--- a/src/rgw/rgw_quota.cc
+++ b/src/rgw/rgw_quota.cc
@@ -47,7 +47,7 @@ protected:
     uint64_t added_bytes;
     uint64_t removed_bytes;
   public:
-    StatsAsyncTestSet() {}
+    StatsAsyncTestSet() : objs_delta(0), added_bytes(0), removed_bytes(0) {}
     bool update(RGWQuotaCacheStats *entry) {
       if (entry->async_refresh_time.sec() == 0)
         return false;
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 05c41ef..45fde7a 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -27,6 +27,7 @@
 #include "cls/version/cls_version_client.h"
 #include "cls/log/cls_log_client.h"
 #include "cls/statelog/cls_statelog_client.h"
+#include "cls/timeindex/cls_timeindex_client.h"
 #include "cls/lock/cls_lock_client.h"
 #include "cls/user/cls_user_client.h"
 
@@ -47,6 +48,7 @@ using namespace librados;
 #include "rgw_log.h"
 
 #include "rgw_gc.h"
+#include "rgw_object_expirer_core.h"
 
 #define dout_subsys ceph_subsys_rgw
 
@@ -520,7 +522,6 @@ int RGWRegionMap::update(RGWRegion& region)
   return 0;
 }
 
-
 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
 {
   obj_version *check_objv = version_for_check();
@@ -891,10 +892,10 @@ void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
 }
 
 int RGWPutObjProcessor::complete(string& etag, time_t *mtime, time_t set_mtime,
-                                 map<string, bufferlist>& attrs,
+                                 map<string, bufferlist>& attrs, time_t delete_at,
                                  const char *if_match, const char * if_nomatch)
 {
-  int r = do_complete(etag, mtime, set_mtime, attrs, if_match, if_nomatch);
+  int r = do_complete(etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch);
   if (r < 0)
     return r;
 
@@ -1009,6 +1010,8 @@ int RGWPutObjProcessor_Aio::drain_pending()
 
 int RGWPutObjProcessor_Aio::throttle_data(void *handle, bool need_to_wait)
 {
+  bool _wait = need_to_wait;
+
   if (handle) {
     struct put_obj_aio_info info;
     info.handle = handle;
@@ -1022,7 +1025,7 @@ int RGWPutObjProcessor_Aio::throttle_data(void *handle, bool need_to_wait)
     if (r < 0)
       return r;
 
-    need_to_wait = false;
+    _wait = false;
   }
 
   /* resize window in case messages are draining too fast */
@@ -1031,13 +1034,10 @@ int RGWPutObjProcessor_Aio::throttle_data(void *handle, bool need_to_wait)
   }
 
   /* now throttle. Note that need_to_wait should only affect the first IO operation */
-  if (pending.size() > max_chunks ||
-      need_to_wait) {
+  if (pending.size() > max_chunks || _wait) {
     int r = wait_pending_front();
     if (r < 0)
       return r;
-
-    need_to_wait = false;
   }
   return 0;
 }
@@ -1209,7 +1209,7 @@ int RGWPutObjProcessor_Atomic::complete_writing_data()
 }
 
 int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t set_mtime,
-                                           map<string, bufferlist>& attrs,
+                                           map<string, bufferlist>& attrs, time_t delete_at,
                                            const char *if_match,
                                            const char *if_nomatch) {
   int r = complete_writing_data();
@@ -1235,6 +1235,7 @@ int RGWPutObjProcessor_Atomic::do_complete(string& etag, time_t *mtime, time_t s
   obj_op.meta.owner = bucket_info.owner;
   obj_op.meta.flags = PUT_OBJ_CREATE;
   obj_op.meta.olh_epoch = olh_epoch;
+  obj_op.meta.delete_at = delete_at;
 
   r = obj_op.write_meta(obj_len, attrs);
   if (r < 0) {
@@ -1448,9 +1449,14 @@ void RGWRados::finalize()
   delete data_log;
   if (use_gc_thread) {
     gc->stop_processor();
-    delete gc;
-    gc = NULL;
+    obj_expirer->stop_processor();
   }
+  delete gc;
+  gc = NULL;
+
+  delete obj_expirer;
+  obj_expirer = NULL;
+
   delete rest_master_conn;
 
   map<string, RGWRESTConn *>::iterator iter;
@@ -1619,13 +1625,21 @@ int RGWRados::init_complete()
   if (ret < 0)
     return ret;
 
+  ret = open_objexp_pool_ctx();
+  if (ret < 0)
+    return ret;
+
   pools_initialized = true;
 
   gc = new RGWGC();
   gc->initialize(cct, this);
 
-  if (use_gc_thread)
+  obj_expirer = new RGWObjectExpirer(this);
+
+  if (use_gc_thread) {
     gc->start_processor();
+    obj_expirer->start_processor();
+  }
 
   quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
 
@@ -1760,6 +1774,25 @@ int RGWRados::open_gc_pool_ctx()
   return r;
 }
 
+int RGWRados::open_objexp_pool_ctx()
+{
+  const char * const pool_name = zone.log_pool.name.c_str();
+  librados::Rados * const rad = get_rados_handle();
+  int r = rad->ioctx_create(pool_name, objexp_pool_ctx);
+  if (r == -ENOENT) {
+    r = rad->pool_create(pool_name);
+    if (r == -EEXIST) {
+      r = 0;
+    } else if (r < 0) {
+      return r;
+    }
+
+    r = rad->ioctx_create(pool_name, objexp_pool_ctx);
+  }
+
+  return r;
+}
+
 int RGWRados::init_watch()
 {
   const char *control_pool = zone.control_pool.name.c_str();
@@ -2318,6 +2351,115 @@ int RGWRados::time_log_trim(const string& oid, const utime_t& start_time, const
   return cls_log_trim(io_ctx, oid, start_time, end_time, from_marker, to_marker);
 }
 
+string RGWRados::objexp_hint_get_shardname(int shard_num)
+{
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%010u", shard_num);
+
+  string objname("obj_delete_at_hint.");
+  return objname + buf;
+}
+
+#define MAX_PBJEXP_SHARDS_PRIME 7877
+
+int RGWRados::objexp_key_shard(const rgw_obj_key& key)
+{
+  string obj_key = key.name + key.instance;
+  int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
+  uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
+  uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
+  sid = sid2 % MAX_BUCKET_INDEX_SHARDS_PRIME % num_shards;
+  return sid % num_shards;
+}
+
+static string objexp_hint_get_keyext(const string& bucket_name,
+                                     const string& bucket_id,
+                                     const rgw_obj_key& obj_key)
+{
+  return bucket_name + ":" + bucket_id + ":" + obj_key.name + ":" + obj_key.instance;
+}
+
+int RGWRados::objexp_hint_add(const utime_t& delete_at,
+                              const string& bucket_name,
+                              const string& bucket_id,
+                              const rgw_obj_key& obj_key)
+{
+  const string keyext = objexp_hint_get_keyext(bucket_name,
+          bucket_id, obj_key);
+  objexp_hint_entry he = {
+      .bucket_name = bucket_name,
+      .bucket_id = bucket_id,
+      .obj_key = obj_key,
+      .exp_time = delete_at };
+  bufferlist hebl;
+  ::encode(he, hebl);
+  ObjectWriteOperation op;
+  cls_timeindex_add(op, delete_at, keyext, hebl);
+
+  string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
+  return objexp_pool_ctx.operate(shard_name, &op);
+}
+
+void  RGWRados::objexp_get_shard(int shard_num,
+                                 string& shard)                       /* out */
+{
+  shard = objexp_hint_get_shardname(shard_num);
+}
+
+int RGWRados::objexp_hint_list(const string& oid,
+                               const utime_t& start_time,
+                               const utime_t& end_time,
+                               const int max_entries,
+                               const string& marker,
+                               list<cls_timeindex_entry>& entries, /* out */
+                               string *out_marker,                 /* out */
+                               bool *truncated)                    /* out */
+{
+  librados::ObjectReadOperation op;
+  cls_timeindex_list(op, start_time, end_time, marker, max_entries, entries,
+	       out_marker, truncated);
+
+  bufferlist obl;
+  int ret = objexp_pool_ctx.operate(oid, &op, &obl);
+
+  if ((ret < 0 ) && (ret != -ENOENT)) {
+    return ret;
+  }
+
+  if ((ret == -ENOENT) && truncated) {
+    *truncated = false;
+  }
+
+  return 0;
+}
+
+int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry,  /* in */
+                                objexp_hint_entry& hint_entry)  /* out */
+{
+  try {
+    bufferlist::iterator iter = ti_entry.value.begin();
+    ::decode(hint_entry, iter);
+  } catch (buffer::error& err) {
+    ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
+  }
+
+  return 0;
+}
+
+int RGWRados::objexp_hint_trim(const string& oid,
+                               const utime_t& start_time,
+                               const utime_t& end_time,
+                               const string& from_marker,
+                               const string& to_marker)
+{
+  int ret = cls_timeindex_trim(objexp_pool_ctx, oid, start_time, end_time,
+          from_marker, to_marker);
+  if ((ret < 0 ) && (ret != -ENOENT)) {
+    return ret;
+  }
+
+  return 0;
+}
 
 int RGWRados::lock_exclusive(rgw_bucket& pool, const string& oid, utime_t& duration, 
                              string& zone_id, string& owner_id) {
@@ -2402,6 +2544,7 @@ int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset,
  *     Any skipped results will have the matching portion of their name
  *     inserted in common_prefixes with a "true" mark.
  * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
  * result: the objects are put in here.
  * common_prefixes: if delim is filled in, any matching prefixes are placed
  *     here.
@@ -2422,13 +2565,22 @@ int RGWRados::Bucket::List::list_objects(int max, vector<RGWObjEnt> *result,
   }
   result->clear();
 
-  rgw_obj marker_obj, prefix_obj;
+  rgw_obj marker_obj, end_marker_obj, prefix_obj;
   marker_obj.set_instance(params.marker.instance);
   marker_obj.set_ns(params.ns);
   marker_obj.set_obj(params.marker.name);
   rgw_obj_key cur_marker;
   marker_obj.get_index_key(&cur_marker);
 
+  end_marker_obj.set_instance(params.end_marker.instance);
+  end_marker_obj.set_ns(params.ns);
+  end_marker_obj.set_obj(params.end_marker.name);
+  rgw_obj_key cur_end_marker;
+  if (params.ns.empty()) { /* no support for end marker for namespaced objects */
+    end_marker_obj.get_index_key(&cur_end_marker);
+  }
+  const bool cur_end_marker_valid = !cur_end_marker.empty();
+
   prefix_obj.set_ns(params.ns);
   prefix_obj.set_obj(params.prefix);
   string cur_prefix = prefix_obj.get_index_key_name();
@@ -2500,6 +2652,11 @@ int RGWRados::Bucket::List::list_objects(int max, vector<RGWObjEnt> *result,
         continue;
       }
 
+      if (cur_end_marker_valid && cur_end_marker <= obj) {
+        truncated = false;
+        goto done;
+      }
+
       if (count < max) {
         params.marker = obj;
         next_marker = obj;
@@ -3454,6 +3611,17 @@ int RGWRados::Object::Write::write_meta(uint64_t size,
     }
   }
 
+  if (meta.delete_at > 0) {
+    rgw_obj_key obj_key;
+    obj.get_index_key(&obj_key);
+
+    r = store->objexp_hint_add(utime_t(meta.delete_at, 0), bucket.name, bucket.bucket_id, obj_key);
+    if (r < 0) {
+      ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
+      /* ignoring error, nothing we can do at this point */
+    }
+  }
+
   /* update quota cache */
   store->quota_handler->update_stats(meta.owner, bucket, (orig_exists ? 0 : 1), size, orig_size);
 
@@ -3538,8 +3706,6 @@ int RGWRados::put_system_obj_impl(rgw_obj& obj, uint64_t size, time_t *mtime,
   op.mtime(&set_mtime);
   op.write_full(data);
 
-  string etag;
-  string content_type;
   bufferlist acl_bl;
 
   for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
@@ -3693,8 +3859,8 @@ public:
     processor->set_extra_data_len(len);
   }
 
-  int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs) {
-    return processor->complete(etag, mtime, set_mtime, attrs);
+  int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs, time_t delete_at) {
+    return processor->complete(etag, mtime, set_mtime, attrs, delete_at);
   }
 };
 
@@ -3757,7 +3923,7 @@ int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
     return ret;
   }
 
-  return copy_obj_data(rctx, dest_bucket_info, read_op, end, obj, obj, max_chunk_size, NULL, mtime, attrset, RGW_OBJ_CATEGORY_MAIN, 0, NULL, NULL, NULL, NULL);
+  return copy_obj_data(rctx, dest_bucket_info, read_op, end, obj, obj, max_chunk_size, NULL, mtime, attrset, RGW_OBJ_CATEGORY_MAIN, 0, 0, NULL, NULL, NULL, NULL);
 }
 
 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
@@ -3780,6 +3946,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
                uint64_t olh_epoch,
+	       time_t delete_at,
                string *version_id,
                string *ptag,
                string *petag,
@@ -3860,6 +4027,20 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
       JSONDecoder::decode_json("attrs", src_attrs, &jp);
 
       src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
+      if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
+        src_attrs.erase(RGW_ATTR_DELETE_AT);
+      } else {
+	map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_DELETE_AT);
+	if (iter != src_attrs.end()) {
+	  try {
+	    utime_t da;
+	    ::decode(da, iter->second);
+	    delete_at = (time_t)da.sec();
+	  } catch (buffer::error& err) {
+	    ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
+	  }
+	}
+      }
     }
   }
 
@@ -3868,8 +4049,8 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
   }
 
   if (petag) {
-    map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
-    if (iter != attrs.end()) {
+    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
+    if (iter != src_attrs.end()) {
       bufferlist& etagbl = iter->second;
       *petag = string(etagbl.c_str(), etagbl.length());
     }
@@ -3881,7 +4062,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
     attrs = src_attrs;
   }
 
-  ret = cb.complete(etag, mtime, set_mtime, attrs);
+  ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at);
   if (ret < 0) {
     goto set_err_state;
   }
@@ -3962,6 +4143,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
                uint64_t olh_epoch,
+	       time_t delete_at,
                string *version_id,
                string *ptag,
                string *petag,
@@ -3994,7 +4176,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
     return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, info, source_zone,
                dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
                unmod_ptr, if_match, if_nomatch, attrs_mod, attrs, category,
-               olh_epoch, version_id, ptag, petag, err, progress_cb, progress_data);
+               olh_epoch, delete_at, version_id, ptag, petag, err, progress_cb, progress_data);
   }
 
   map<string, bufferlist> src_attrs;
@@ -4019,6 +4201,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
   }
 
   src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+  src_attrs.erase(RGW_ATTR_DELETE_AT);
 
   set_copy_attrs(src_attrs, attrs, attrs_mod);
   attrs.erase(RGW_ATTR_ID_TAG);
@@ -4063,8 +4246,8 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
   }
 
   if (petag) {
-    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
-    if (iter != src_attrs.end()) {
+    map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
+    if (iter != attrs.end()) {
       bufferlist& etagbl = iter->second;
       *petag = string(etagbl.c_str(), etagbl.length());
     }
@@ -4072,7 +4255,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
 
   if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
     return copy_obj_data(obj_ctx, dest_bucket_info, read_op, end, dest_obj, src_obj,
-                         max_chunk_size, mtime, 0, attrs, category, olh_epoch,
+                         max_chunk_size, mtime, 0, attrs, category, olh_epoch, delete_at,
                          version_id, ptag, petag, err);
   }
 
@@ -4164,6 +4347,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
   write_op.meta.flags = PUT_OBJ_CREATE;
   write_op.meta.category = category;
   write_op.meta.olh_epoch = olh_epoch;
+  write_op.meta.delete_at = delete_at;
 
   ret = write_op.write_meta(end + 1, attrs);
   if (ret < 0) {
@@ -4207,6 +4391,7 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
                uint64_t olh_epoch,
+	       time_t delete_at,
                string *version_id,
                string *ptag,
                string *petag,
@@ -4263,7 +4448,7 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
     }
   }
 
-  ret = processor.complete(etag, mtime, set_mtime, attrs);
+  ret = processor.complete(etag, mtime, set_mtime, attrs, delete_at);
 
   return ret;
 }
@@ -4733,6 +4918,25 @@ int RGWRados::Object::Delete::delete_obj()
 
   uint64_t obj_size = state->size;
 
+  if (!params.expiration_time.is_zero()) {
+    bufferlist bl;
+    utime_t delete_at;
+
+    if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
+      try {
+        bufferlist::iterator iter = bl.begin();
+        ::decode(delete_at, iter);
+      } catch (buffer::error& err) {
+        ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
+	return -EIO;
+      }
+
+      if (params.expiration_time != delete_at) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    }
+  }
+
   ObjectWriteOperation op;
 
   r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true);
@@ -4746,7 +4950,6 @@ int RGWRados::Object::Delete::delete_obj()
 
   index_op.set_bilog_flags(params.bilog_flags);
 
-  string tag;
   r = index_op.prepare(CLS_RGW_OP_DEL);
   if (r < 0)
     return r;
@@ -4794,8 +4997,12 @@ int RGWRados::Object::Delete::delete_obj()
   return 0;
 }
 
-int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw_obj& obj,
-                         int versioning_status, uint16_t bilog_flags)
+int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
+                         RGWBucketInfo& bucket_info,
+                         rgw_obj& obj,
+                         int versioning_status,
+                         uint16_t bilog_flags,
+                         const utime_t& expiration_time)
 {
   RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
   RGWRados::Object::Delete del_op(&del_target);
@@ -4803,6 +5010,7 @@ int RGWRados::delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw_
   del_op.params.bucket_owner = bucket_info.owner;
   del_op.params.versioning_status = versioning_status;
   del_op.params.bilog_flags = bilog_flags;
+  del_op.params.expiration_time = expiration_time;
 
   return del_op.delete_obj();
 }
@@ -4836,7 +5044,6 @@ int RGWRados::delete_obj_index(rgw_obj& obj)
   std::string oid, key;
   get_obj_bucket_and_oid_loc(obj, bucket, oid, key);
 
-  string tag;
   RGWRados::Bucket bop(this, bucket);
   RGWRados::Bucket::UpdateIndex index_op(&bop, obj, NULL);
 
@@ -5312,6 +5519,20 @@ int RGWRados::set_attrs(void *ctx, rgw_obj& obj,
       continue;
 
     op.setxattr(name.c_str(), bl);
+
+    if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
+      utime_t ts;
+      try {
+        ::decode(ts, bl);
+
+        rgw_obj_key obj_key;
+        obj.get_index_key(&obj_key);
+
+        objexp_hint_add(ts, bucket.name, bucket.bucket_id, obj_key);
+      } catch (buffer::error& err) {
+	ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
+      }
+    }
   }
 
   if (!op.size())
@@ -5321,7 +5542,6 @@ int RGWRados::set_attrs(void *ctx, rgw_obj& obj,
   RGWRados::Bucket bop(this, bucket);
   RGWRados::Bucket::UpdateIndex index_op(&bop, obj, state);
 
-  string tag;
   if (state) {
     string tag;
     append_rand_alpha(cct, tag, tag, 32);
@@ -5404,7 +5624,6 @@ int RGWRados::Object::Read::prepare(int64_t *pofs, int64_t *pend)
   CephContext *cct = store->ctx();
 
   bufferlist etag;
-  time_t ctime;
 
   off_t ofs = 0;
   off_t end = -1;
@@ -5434,13 +5653,11 @@ int RGWRados::Object::Read::prepare(int64_t *pofs, int64_t *pend)
         ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
       }
     }
-    if (r < 0)
-      return r;
   }
 
   /* Convert all times go GMT to make them compatible */
   if (conds.mod_ptr || conds.unmod_ptr) {
-    ctime = astate->mtime;
+    time_t ctime = astate->mtime;
 
     if (conds.mod_ptr) {
       ldout(cct, 10) << "If-Modified-Since: " << *conds.mod_ptr << " Last-Modified: " << ctime << dendl;
@@ -6614,6 +6831,7 @@ int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, RGWBucket
     vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
     for (; viter != iter->second.end(); ++viter) {
       rgw_bucket_olh_log_entry& entry = *viter;
+
       ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
                      << " key=" << entry.key.name << "[" << entry.key.instance << "] "
                      << (entry.delete_marker ? "(delete)" : "") << dendl;
@@ -6649,8 +6867,7 @@ int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, RGWBucket
   }
 
   if (need_to_link) {
-    rgw_obj target(bucket, key.name);
-    target.set_instance(key.instance);
+    rgw_obj target(bucket, key);
     RGWOLHInfo info;
     info.target = target;
     info.removed = delete_marker;
@@ -6663,8 +6880,7 @@ int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, RGWBucket
   for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
        liter != remove_instances.end(); ++liter) {
     cls_rgw_obj_key& key = *liter;
-    rgw_obj obj_instance(bucket, key.name);
-    obj_instance.set_instance(key.instance);
+    rgw_obj obj_instance(bucket, key);
     int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP);
     if (ret < 0 && ret != -ENOENT) {
       ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
@@ -7096,9 +7312,9 @@ int RGWRados::get_bucket_stats(rgw_bucket& bucket, string *bucket_ver, string *m
   char buf[64];
   for(; iter != headers.end(); ++iter, ++viter) {
     accumulate_raw_stats(iter->second, stats);
-    snprintf(buf, sizeof(buf), "%lu", iter->second.ver);
+    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
     ver_mgr.add(viter->first, string(buf));
-    snprintf(buf, sizeof(buf), "%lu", iter->second.master_ver);
+    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
     master_ver_mgr.add(viter->first, string(buf));
     marker_mgr.add(viter->first, iter->second.max_marker);
   }
@@ -7934,6 +8150,12 @@ int RGWRados::process_gc()
   return gc->process();
 }
 
+int RGWRados::process_expire_objects()
+{
+  obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now(cct));
+  return 0;
+}
+
 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
 {
   bufferlist in;
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 37c7e8a..27787ec 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -13,6 +13,7 @@
 #include "cls/version/cls_version_types.h"
 #include "cls/log/cls_log_types.h"
 #include "cls/statelog/cls_statelog_types.h"
+#include "cls/timeindex/cls_timeindex_types.h"
 #include "rgw_log.h"
 #include "rgw_metadata.h"
 #include "rgw_rest_conn.h"
@@ -21,6 +22,7 @@ class RGWWatcher;
 class SafeTimer;
 class ACLOwner;
 class RGWGC;
+class RGWObjectExpirer;
 
 /* flags for put_obj_meta() */
 #define PUT_OBJ_CREATE      0x01
@@ -76,7 +78,7 @@ struct RGWOLHInfo {
      ::decode(removed, bl);
      DECODE_FINISH(bl);
   }
-
+  static void generate_test_instances(list<RGWOLHInfo*>& o);
   void dump(Formatter *f) const;
 };
 WRITE_CLASS_ENCODER(RGWOLHInfo)
@@ -794,6 +796,7 @@ struct RGWZoneParams {
   }
   void dump(Formatter *f) const;
   void decode_json(JSONObj *obj);
+  static void generate_test_instances(list<RGWZoneParams*>& o);
 };
 WRITE_CLASS_ENCODER(RGWZoneParams)
 
@@ -839,6 +842,7 @@ struct RGWZone {
   }
   void dump(Formatter *f) const;
   void decode_json(JSONObj *obj);
+  static void generate_test_instances(list<RGWZone*>& o);
 };
 WRITE_CLASS_ENCODER(RGWZone)
 
@@ -858,6 +862,7 @@ struct RGWDefaultRegionInfo {
   }
   void dump(Formatter *f) const;
   void decode_json(JSONObj *obj);
+  //todo: implement ceph-dencoder
 };
 WRITE_CLASS_ENCODER(RGWDefaultRegionInfo)
 
@@ -960,6 +965,7 @@ struct RGWRegion {
 
   void dump(Formatter *f) const;
   void decode_json(JSONObj *obj);
+  static void generate_test_instances(list<RGWRegion*>& o);
 };
 WRITE_CLASS_ENCODER(RGWRegion)
 
@@ -989,6 +995,32 @@ struct RGWRegionMap {
 };
 WRITE_CLASS_ENCODER(RGWRegionMap)
 
+struct objexp_hint_entry {
+  string bucket_name;
+  string bucket_id;
+  rgw_obj_key obj_key;
+  utime_t exp_time;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(bucket_name, bl);
+    ::encode(bucket_id, bl);
+    ::encode(obj_key, bl);
+    ::encode(exp_time, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(bucket_name, bl);
+    ::decode(bucket_id, bl);
+    ::decode(obj_key, bl);
+    ::decode(exp_time, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(objexp_hint_entry)
+
 class RGWDataChangesLog;
 class RGWReplicaLogger;
   
@@ -1167,12 +1199,14 @@ class Finisher;
 class RGWRados
 {
   friend class RGWGC;
+  friend class RGWObjectExpirer;
   friend class RGWStateLog;
   friend class RGWReplicaLogger;
 
   /** Open the pool used as root for this gateway */
   int open_root_pool_ctx();
   int open_gc_pool_ctx();
+  int open_objexp_pool_ctx();
 
   int open_bucket_pool_ctx(const string& bucket_name, const string& pool, librados::IoCtx&  io_ctx);
   int open_bucket_index_ctx(rgw_bucket& bucket, librados::IoCtx&  index_ctx);
@@ -1209,6 +1243,7 @@ class RGWRados
   };
 
   RGWGC *gc;
+  RGWObjectExpirer *obj_expirer;
   bool use_gc_thread;
   bool quota_threads;
 
@@ -1251,6 +1286,7 @@ protected:
   std::map<pthread_t, int> rados_map;
 
   librados::IoCtx gc_pool_ctx;        // .rgw.gc
+  librados::IoCtx objexp_pool_ctx;
 
   bool pools_initialized;
 
@@ -1264,7 +1300,7 @@ protected:
 
 public:
   RGWRados() : max_req_id(0), lock("rados_timer_lock"), watchers_lock("watchers_lock"), timer(NULL),
-               gc(NULL), use_gc_thread(false), quota_threads(false),
+               gc(NULL), obj_expirer(NULL), use_gc_thread(false), quota_threads(false),
                num_watchers(0), watchers(NULL),
                watch_initialized(false),
                bucket_id_lock("rados_bucket_id"),
@@ -1438,7 +1474,7 @@ public:
         map<string, bufferlist> *attrs;
         struct rgw_err *perr;
 
-        StatParams() : lastmod(NULL), obj_size(NULL), attrs(NULL) {}
+        StatParams() : lastmod(NULL), obj_size(NULL), attrs(NULL), perr(NULL) {}
       } stat_params;
 
       struct ReadParams {
@@ -1542,7 +1578,7 @@ public:
         map<string, bufferlist> *attrs;
         struct rgw_err *perr;
 
-        Params() : lastmod(NULL), read_size(NULL), obj_size(NULL), attrs(NULL) {}
+        Params() : lastmod(NULL), read_size(NULL), obj_size(NULL), attrs(NULL), perr(NULL) {}
       } params;
 
       Read(RGWRados::Object *_source) : source(_source) {}
@@ -1570,10 +1606,11 @@ public:
         const char *if_match;
         const char *if_nomatch;
         uint64_t olh_epoch;
+	time_t delete_at;
 
         MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
                  remove_objs(NULL), set_mtime(0), category(RGW_OBJ_CATEGORY_MAIN), flags(0),
-                 if_match(NULL), if_nomatch(NULL), olh_epoch(0) {}
+                 if_match(NULL), if_nomatch(NULL), olh_epoch(0), delete_at(0) {}
       } meta;
 
       Write(RGWRados::Object *_target) : target(_target) {}
@@ -1593,6 +1630,7 @@ public:
         string marker_version_id;
         uint32_t bilog_flags;
         list<rgw_obj_key> *remove_objs;
+        utime_t expiration_time;
 
         DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL) {}
       } params;
@@ -1699,6 +1737,7 @@ public:
         string prefix;
         string delim;
         rgw_obj_key marker;
+        rgw_obj_key end_marker;
         string ns;
         bool enforce_ns;
         RGWAccessListFilter *filter;
@@ -1770,6 +1809,7 @@ public:
                        map<string, bufferlist>& attrs,
                        RGWObjCategory category,
                        uint64_t olh_epoch,
+		       time_t delete_at,
                        string *version_id,
                        string *ptag,
                        string *petag,
@@ -1817,6 +1857,7 @@ public:
                map<std::string, bufferlist>& attrs,
                RGWObjCategory category,
                uint64_t olh_epoch,
+	       time_t delete_at,
                string *version_id,
                string *ptag,
                string *petag,
@@ -1835,6 +1876,7 @@ public:
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
                uint64_t olh_epoch,
+	       time_t delete_at,
                string *version_id,
                string *ptag,
                string *petag,
@@ -1857,8 +1899,12 @@ public:
   int bucket_suspended(rgw_bucket& bucket, bool *suspended);
 
   /** Delete an object.*/
-  virtual int delete_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_owner, rgw_obj& src_obj,
-                         int versioning_status, uint16_t bilog_flags = 0);
+  virtual int delete_obj(RGWObjectCtx& obj_ctx,
+                         RGWBucketInfo& bucket_owner,
+                         rgw_obj& src_obj,
+                         int versioning_status,
+                         uint16_t bilog_flags = 0,
+                         const utime_t& expiration_time = utime_t());
 
   /* Delete a system object */
   virtual int delete_system_obj(rgw_obj& src_obj, RGWObjVersionTracker *objv_tracker = NULL);
@@ -2066,6 +2112,31 @@ public:
   int time_log_info(const string& oid, cls_log_header *header);
   int time_log_trim(const string& oid, const utime_t& start_time, const utime_t& end_time,
                     const string& from_marker, const string& to_marker);
+
+  string objexp_hint_get_shardname(int shard_num);
+  int objexp_key_shard(const rgw_obj_key& key);
+  void objexp_get_shard(int shard_num,
+                        string& shard);                       /* out */
+  int objexp_hint_add(const utime_t& delete_at,
+                      const string& bucket_name,
+                      const string& bucket_id,
+                      const rgw_obj_key& obj_key);
+  int objexp_hint_list(const string& oid,
+                       const utime_t& start_time,
+                       const utime_t& end_time,
+                       const int max_entries,
+                       const string& marker,
+                       list<cls_timeindex_entry>& entries, /* out */
+                       string *out_marker,                 /* out */
+                       bool *truncated);                   /* out */
+  int objexp_hint_parse(cls_timeindex_entry &ti_entry,
+                        objexp_hint_entry& hint_entry);    /* out */
+  int objexp_hint_trim(const string& oid,
+                       const utime_t& start_time,
+                       const utime_t& end_time,
+                       const string& from_marker = std::string(),
+                       const string& to_marker   = std::string());
+
   int lock_exclusive(rgw_bucket& pool, const string& oid, utime_t& duration, string& zone_id, string& owner_id);
   int unlock(rgw_bucket& pool, const string& oid, string& zone_id, string& owner_id);
 
@@ -2077,6 +2148,7 @@ public:
 
   int list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated);
   int process_gc();
+  int process_expire_objects();
   int defer_gc(void *ctx, rgw_obj& obj);
 
   int bucket_check_index(rgw_bucket& bucket,
@@ -2307,7 +2379,7 @@ protected:
   RGWBucketInfo bucket_info;
 
   virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime,
-                          map<string, bufferlist>& attrs,
+                          map<string, bufferlist>& attrs, time_t delete_at,
                           const char *if_match = NULL, const char *if_nomatch = NULL) = 0;
 
 public:
@@ -2323,7 +2395,7 @@ public:
     assert(0);
   }
   virtual int complete(string& etag, time_t *mtime, time_t set_mtime,
-                       map<string, bufferlist>& attrs,
+                       map<string, bufferlist>& attrs, time_t delete_at,
                        const char *if_match = NULL, const char *if_nomatch = NULL);
 
   CephContext *ctx();
@@ -2394,7 +2466,7 @@ protected:
 
   int write_data(bufferlist& bl, off_t ofs, void **phandle, bool exclusive);
   virtual int do_complete(string& etag, time_t *mtime, time_t set_mtime,
-                          map<string, bufferlist>& attrs,
+                          map<string, bufferlist>& attrs, time_t delete_at,
                           const char *if_match = NULL, const char *if_nomatch = NULL);
 
   int prepare_next_part(off_t ofs);
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 45eba58..e2b1568 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -161,7 +161,8 @@ string camelcase_dash_http_attr(const string& orig)
   return string(buf);
 }
 
-static list<string> hostnames_list;
+/* avoid duplicate hostnames in hostnames list */
+static set<string> hostnames_set;
 
 void rgw_rest_init(CephContext *cct, RGWRegion& region)
 {
@@ -193,18 +194,19 @@ void rgw_rest_init(CephContext *cct, RGWRegion& region)
     http_status_names[h->code] = h->name;
   }
 
-  /* avoid duplicate hostnames in hostnames list */
-  map<string, bool> hostnames_map;
   if (!cct->_conf->rgw_dns_name.empty()) {
-    hostnames_map[cct->_conf->rgw_dns_name] = true;
-  }
-  for (list<string>::iterator iter = region.hostnames.begin(); iter != region.hostnames.end(); ++iter) {
-    hostnames_map[*iter] = true;
-  }
-
-  for (map<string, bool>::iterator iter = hostnames_map.begin(); iter != hostnames_map.end(); ++iter) {
-    hostnames_list.push_back(iter->first);
-  }
+    hostnames_set.insert(cct->_conf->rgw_dns_name);
+  }
+  hostnames_set.insert(region.hostnames.begin(),  region.hostnames.end());
+  /* TODO: We should have a sanity check that no hostname matches the end of
+   * any other hostname, otherwise we will get ambiguous results from
+   * rgw_find_host_in_domains.
+   * Eg:
+   * Hostnames: [A, B.A]
+   * Inputs: [Z.A, X.B.A]
+   * Z.A clearly splits to subdomain=Z, domain=A
+   * X.B.A ambiguously splits to both {X, B.A} and {X.B, A}
+   */
 }
 
 static bool str_ends_with(const string& s, const string& suffix, size_t *pos)
@@ -224,8 +226,8 @@ static bool str_ends_with(const string& s, const string& suffix, size_t *pos)
 
 static bool rgw_find_host_in_domains(const string& host, string *domain, string *subdomain)
 {
-  list<string>::iterator iter;
-  for (iter = hostnames_list.begin(); iter != hostnames_list.end(); ++iter) {
+  set<string>::iterator iter;
+  for (iter = hostnames_set.begin(); iter != hostnames_set.end(); ++iter) {
     size_t pos;
     if (!str_ends_with(host, *iter, &pos))
       continue;
@@ -337,13 +339,19 @@ void dump_content_length(struct req_state *s, uint64_t len)
   }
 }
 
-void dump_etag(struct req_state *s, const char *etag)
+void dump_etag(struct req_state * const s, const char * const etag)
 {
+  if ('\0' == *etag) {
+    return;
+  }
+
   int r;
-  if (s->prot_flags & RGW_REST_SWIFT)
+  if (s->prot_flags & RGW_REST_SWIFT) {
     r = s->cio->print("etag: %s\r\n", etag);
-  else
+  } else {
     r = s->cio->print("ETag: \"%s\"\r\n", etag);
+  }
+
   if (r < 0) {
     ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
   }
@@ -545,6 +553,8 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type, const
       s->formatter->dump_string("Code", s->err.s3_code);
     if (!s->err.message.empty())
       s->formatter->dump_string("Message", s->err.message);
+    if (!s->trans_id.empty())
+      s->formatter->dump_string("RequestId", s->trans_id);
     s->formatter->close_section();
     dump_content_length(s, s->formatter->get_len());
   } else {
@@ -555,7 +565,7 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type, const
 
   int r;
   if (content_type) {
-      r = s->cio->print("Content-type: %s\r\n", content_type);
+      r = s->cio->print("Content-Type: %s\r\n", content_type);
       if (r < 0) {
 	ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
       }
@@ -817,7 +827,7 @@ int RGWPutObj_ObjStore::verify_params()
 {
   if (s->length) {
     off_t len = atoll(s->length);
-    if (len > (off_t)RGW_MAX_PUT_SIZE) {
+    if (len > (off_t)(s->cct->_conf->rgw_max_put_size)) {
       return -ERR_TOO_LARGE;
     }
   }
@@ -856,7 +866,7 @@ int RGWPutObj_ObjStore::get_data(bufferlist& bl)
     bl.append(bp, 0, len);
   }
 
-  if ((uint64_t)ofs + len > RGW_MAX_PUT_SIZE) {
+  if ((uint64_t)ofs + len > s->cct->_conf->rgw_max_put_size) {
     return -ERR_TOO_LARGE;
   }
 
@@ -875,7 +885,7 @@ int RGWPostObj_ObjStore::verify_params()
     return -ERR_LENGTH_REQUIRED;
   }
   off_t len = atoll(s->length);
-  if (len > (off_t)RGW_MAX_PUT_SIZE) {
+  if (len > (off_t)(s->cct->_conf->rgw_max_put_size)) {
     return -ERR_TOO_LARGE;
   }
 
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index 02ae790..c3dc847 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -163,17 +163,25 @@ public:
   virtual int verify_params();
 };
 
-class RGWPutMetadata_ObjStore : public RGWPutMetadata
+class RGWPutMetadataAccount_ObjStore : public RGWPutMetadataAccount
 {
 public:
-  RGWPutMetadata_ObjStore() {}
-  ~RGWPutMetadata_ObjStore() {}
+  RGWPutMetadataAccount_ObjStore() {}
+  ~RGWPutMetadataAccount_ObjStore() {}
 };
 
-class RGWSetTempUrl_ObjStore : public RGWSetTempUrl {
+class RGWPutMetadataBucket_ObjStore : public RGWPutMetadataBucket
+{
+public:
+  RGWPutMetadataBucket_ObjStore() {}
+  ~RGWPutMetadataBucket_ObjStore() {}
+};
+
+class RGWPutMetadataObject_ObjStore : public RGWPutMetadataObject
+{
 public:
-  RGWSetTempUrl_ObjStore() {}
-  ~RGWSetTempUrl_ObjStore() {}
+  RGWPutMetadataObject_ObjStore() {}
+  ~RGWPutMetadataObject_ObjStore() {}
 };
 
 class RGWDeleteObj_ObjStore : public RGWDeleteObj {
diff --git a/src/rgw/rgw_rest_log.cc b/src/rgw/rgw_rest_log.cc
index b10fbef..7e3707f 100644
--- a/src/rgw/rgw_rest_log.cc
+++ b/src/rgw/rgw_rest_log.cc
@@ -276,7 +276,7 @@ void RGWOp_BILog_List::execute() {
   RGWBucketInfo bucket_info;
   unsigned max_entries;
 
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   if (bucket_name.empty() && bucket_instance.empty()) {
     dout(5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
@@ -368,7 +368,7 @@ void RGWOp_BILog_Info::execute() {
          bucket_instance = s->info.args.get("bucket-instance");
   RGWBucketInfo bucket_info;
 
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   if (bucket_name.empty() && bucket_instance.empty()) {
     dout(5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
@@ -422,7 +422,7 @@ void RGWOp_BILog_Delete::execute() {
 
   RGWBucketInfo bucket_info;
 
-  RGWObjectCtx& obj_ctx = *(RGWObjectCtx *)s->obj_ctx;
+  RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
 
   http_ret = 0;
   if ((bucket_name.empty() && bucket_instance.empty()) ||
diff --git a/src/rgw/rgw_rest_log.h b/src/rgw/rgw_rest_log.h
index 22221d4..d6873bd 100644
--- a/src/rgw/rgw_rest_log.h
+++ b/src/rgw/rgw_rest_log.h
@@ -76,9 +76,8 @@ class RGWOp_MDLog_List : public RGWRESTOp {
   list<cls_log_entry> entries;
   string last_marker;
   bool truncated;
-  int http_ret;
 public:
-  RGWOp_MDLog_List() : truncated(false), http_ret(0) {}
+  RGWOp_MDLog_List() : truncated(false) {}
   ~RGWOp_MDLog_List() {}
 
   int check_caps(RGWUserCaps& caps) {
@@ -96,9 +95,8 @@ public:
 
 class RGWOp_MDLog_Info : public RGWRESTOp {
   unsigned num_objects;
-  int http_ret;
 public:
-  RGWOp_MDLog_Info() : num_objects(0), http_ret(0) {}
+  RGWOp_MDLog_Info() : num_objects(0) {}
   ~RGWOp_MDLog_Info() {}
 
   int check_caps(RGWUserCaps& caps) {
@@ -179,9 +177,8 @@ class RGWOp_DATALog_List : public RGWRESTOp {
   list<rgw_data_change> entries;
   string last_marker;
   bool truncated;
-  int http_ret;
 public:
-  RGWOp_DATALog_List() : truncated(false), http_ret(0) {}
+  RGWOp_DATALog_List() : truncated(false) {}
   ~RGWOp_DATALog_List() {}
 
   int check_caps(RGWUserCaps& caps) {
@@ -199,9 +196,8 @@ public:
 
 class RGWOp_DATALog_Info : public RGWRESTOp {
   unsigned num_objects;
-  int http_ret;
 public:
-  RGWOp_DATALog_Info() : num_objects(0), http_ret(0) {}
+  RGWOp_DATALog_Info() : num_objects(0) {}
   ~RGWOp_DATALog_Info() {}
 
   int check_caps(RGWUserCaps& caps) {
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 768766f..487f1b1 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -243,6 +243,7 @@ int RGWListBucket_ObjStore_S3::get_params()
     return ret;
   }
   delimiter = s->info.args.get("delimiter");
+  encoding_type = s->info.args.get("encoding-type");
   return 0;
 }
 
@@ -261,13 +262,23 @@ void RGWListBucket_ObjStore_S3::send_versioned_response()
 
   s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" : "false"));
 
+  bool encode_key = false;
+  if (strcasecmp(encoding_type.c_str(), "url") == 0)
+    encode_key = true;
+
   if (ret >= 0) {
     vector<RGWObjEnt>::iterator iter;
     for (iter = objs.begin(); iter != objs.end(); ++iter) {
       time_t mtime = iter->mtime.sec();
       const char *section_name = (iter->is_delete_marker() ? "DeleteMarker" : "Version");
       s->formatter->open_array_section(section_name);
-      s->formatter->dump_string("Key", iter->key.name);
+      if (encode_key) {
+        string key_name;
+        url_encode(iter->key.name, key_name);
+        s->formatter->dump_string("Key", key_name);
+      } else {
+        s->formatter->dump_string("Key", iter->key.name);
+      }
       string version_id = iter->key.instance;
       if (version_id.empty()) {
         version_id = "null";
@@ -328,11 +339,21 @@ void RGWListBucket_ObjStore_S3::send_response()
 
   s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" : "false"));
 
+  bool encode_key = false;
+  if (strcasecmp(encoding_type.c_str(), "url") == 0)
+    encode_key = true;
+
   if (ret >= 0) {
     vector<RGWObjEnt>::iterator iter;
     for (iter = objs.begin(); iter != objs.end(); ++iter) {
       s->formatter->open_array_section("Contents");
-      s->formatter->dump_string("Key", iter->key.name);
+      if (encode_key) {
+        string key_name;
+        url_encode(iter->key.name, key_name);
+        s->formatter->dump_string("Key", key_name);
+      } else {
+        s->formatter->dump_string("Key", iter->key.name);
+      }
       time_t mtime = iter->mtime.sec();
       dump_time(s, "LastModified", &mtime);
       s->formatter->dump_format("ETag", "\"%s\"", iter->etag.c_str());
@@ -372,13 +393,21 @@ void RGWGetBucketLocation_ObjStore_S3::send_response()
   end_header(s, this);
   dump_start(s);
 
-  string location_constraint(s->bucket_info.region);
-  if (s->bucket_info.region == "default")
-    location_constraint.clear();
+  string region = s->bucket_info.region;
+  string api_name;
+
+  map<string, RGWRegion>::iterator iter = store->region_map.regions.find(region);
+  if (iter != store->region_map.regions.end()) {
+    api_name = iter->second.api_name;
+  } else  {
+    if (region != "default") {
+      api_name = region;
+    }
+  }
 
   s->formatter->dump_format_ns("LocationConstraint",
 			       "http://doc.s3.amazonaws.com/doc/2006-03-01/",
-			       "%s",location_constraint.c_str());
+			       "%s",api_name.c_str());
   rgw_flush_formatter_and_reset(s, s->formatter);
 }
 
@@ -1422,6 +1451,10 @@ int RGWCopyObj_ObjStore_S3::init_dest_policy()
 
 int RGWCopyObj_ObjStore_S3::get_params()
 {
+  if (s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_RANGE")) {
+    return -ERR_NOT_IMPLEMENTED;
+  }
+
   if_mod = s->info.env->get("HTTP_X_AMZ_COPY_IF_MODIFIED_SINCE");
   if_unmod = s->info.env->get("HTTP_X_AMZ_COPY_IF_UNMODIFIED_SINCE");
   if_match = s->info.env->get("HTTP_X_AMZ_COPY_IF_MATCH");
diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h
index d32ba5d..5db03da 100644
--- a/src/rgw/rgw_rest_s3.h
+++ b/src/rgw/rgw_rest_s3.h
@@ -30,7 +30,7 @@ public:
   ~RGWListBuckets_ObjStore_S3() {}
 
   int get_params() {
-    limit = 0; /* no limit */
+    limit = -1; /* no limit */
     return 0;
   }
   virtual void send_response_begin(bool has_buckets);
@@ -372,7 +372,8 @@ public:
   virtual ~RGWHandler_ObjStore_S3() {}
 
   int validate_bucket_name(const string& bucket, bool relaxed_names);
-
+  using RGWHandler_ObjStore::validate_bucket_name;
+  
   virtual int init(RGWRados *store, struct req_state *state, RGWClientIO *cio);
   virtual int authorize() {
     return RGW_Auth_S3::authorize(store, s);
@@ -441,9 +442,6 @@ public:
   RGWRESTMgr_S3() {}
   virtual ~RGWRESTMgr_S3() {}
 
-  virtual RGWRESTMgr *get_resource_mgr(struct req_state *s, const string& uri) {
-    return this;
-  }
   virtual RGWHandler *get_handler(struct req_state *s);
 };
 
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index f549364..cfb447a 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -17,25 +17,29 @@
 int RGWListBuckets_ObjStore_SWIFT::get_params()
 {
   marker = s->info.args.get("marker");
-  string limit_str;
-  limit_str = s->info.args.get("limit");
-  long l = strtol(limit_str.c_str(), NULL, 10);
-  if (l > (long)limit_max || l < 0)
-    return -ERR_PRECONDITION_FAILED;
 
-  limit = (uint64_t)l;
+  string limit_str = s->info.args.get("limit");
+  if (!limit_str.empty()) {
+    string err;
+    long l = strict_strtol(limit_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      return -EINVAL;
+    }
 
-  if (limit == 0)
-    limit = limit_max;
+    if (l > (long)limit_max || l < 0) {
+      return -ERR_PRECONDITION_FAILED;
+    }
 
-  need_stats = (s->format != RGW_FORMAT_PLAIN);
+    limit = (uint64_t)l;
+  }
 
   if (need_stats) {
     bool stats, exists;
     int r = s->info.args.get_bool("stats", &stats, &exists);
 
-    if (r < 0)
+    if (r < 0) {
       return r;
+    }
 
     if (exists) {
       need_stats = stats;
@@ -45,6 +49,54 @@ int RGWListBuckets_ObjStore_SWIFT::get_params()
   return 0;
 }
 
+static void dump_account_metadata(struct req_state * const s,
+                                  const uint32_t buckets_count,
+                                  const uint64_t buckets_object_count,
+                                  const uint64_t buckets_size,
+                                  const uint64_t buckets_size_rounded,
+                                  map<string, bufferlist>& attrs)
+{
+  char buf[32];
+  utime_t now = ceph_clock_now(g_ceph_context);
+  snprintf(buf, sizeof(buf), "%0.5f", (double)now);
+  /* Adding X-Timestamp to keep aligned with the Swift API */
+  s->cio->print("X-Timestamp: %s\r\n", buf);
+  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_count);
+  s->cio->print("X-Account-Container-Count: %s\r\n", buf);
+  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_object_count);
+  s->cio->print("X-Account-Object-Count: %s\r\n", buf);
+  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_size);
+  s->cio->print("X-Account-Bytes-Used: %s\r\n", buf);
+  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_size_rounded);
+  s->cio->print("X-Account-Bytes-Used-Actual: %s\r\n", buf);
+
+  /* Dump TempURL-related stuff */
+  if (s->perm_mask == RGW_PERM_FULL_CONTROL) {
+    map<int, string>::iterator iter;
+    iter = s->user.temp_url_keys.find(0);
+    if (iter != s->user.temp_url_keys.end() && !iter->second.empty()) {
+      s->cio->print("X-Account-Meta-Temp-Url-Key: %s\r\n", iter->second.c_str());
+    }
+
+    iter = s->user.temp_url_keys.find(1);
+    if (iter != s->user.temp_url_keys.end() && !iter->second.empty()) {
+      s->cio->print("X-Account-Meta-Temp-Url-Key-2: %s\r\n", iter->second.c_str());
+    }
+  }
+
+  /* Dump user-defined metadata items */
+  const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1;
+  map<string, bufferlist>::iterator iter;
+  for (iter = attrs.lower_bound(RGW_ATTR_META_PREFIX); iter != attrs.end(); ++iter) {
+    const char *name = iter->first.c_str();
+    if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) {
+      s->cio->print("X-Account-Meta-%s: %s\r\n", name + PREFIX_LEN, iter->second.c_str());
+    } else {
+      break;
+    }
+  }
+}
+
 void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
 {
   if (ret) {
@@ -55,6 +107,13 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
   }
 
   if (!g_conf->rgw_swift_enforce_content_length) {
+    /* Adding account stats in the header to keep aligned with the Swift API */
+    dump_account_metadata(s,
+            buckets_count,
+            buckets_objcount,
+            buckets_size,
+            buckets_size_rounded,
+            attrs);
     dump_errno(s);
     end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true);
   }
@@ -98,6 +157,13 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_end()
   }
 
   if (g_conf->rgw_swift_enforce_content_length) {
+    /* Adding account stats in the header to keep aligned with the Swift API */
+    dump_account_metadata(s,
+            buckets_count,
+            buckets_objcount,
+            buckets_size,
+            buckets_size_rounded,
+            attrs);
     dump_errno(s);
     end_header(s, NULL, NULL, s->formatter->get_len(), true);
   }
@@ -111,6 +177,7 @@ int RGWListBucket_ObjStore_SWIFT::get_params()
 {
   prefix = s->info.args.get("prefix");
   marker = s->info.args.get("marker");
+  end_marker = s->info.args.get("end_marker");
   max_keys = s->info.args.get("limit");
   ret = parse_max_keys();
   if (ret < 0) {
@@ -285,25 +352,18 @@ static void dump_container_metadata(struct req_state *s, RGWBucketEnt& bucket)
   }
 }
 
-static void dump_account_metadata(struct req_state *s, uint32_t buckets_count,
-                                  uint64_t buckets_object_count, uint64_t buckets_size, uint64_t buckets_size_rounded)
+void RGWStatAccount_ObjStore_SWIFT::execute()
 {
-  char buf[32];
-  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_count);
-  s->cio->print("X-Account-Container-Count: %s\r\n", buf);
-  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_object_count);
-  s->cio->print("X-Account-Object-Count: %s\r\n", buf);
-  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_size);
-  s->cio->print("X-Account-Bytes-Used: %s\r\n", buf);
-  snprintf(buf, sizeof(buf), "%lld", (long long)buckets_size_rounded);
-  s->cio->print("X-Account-Bytes-Used-Actual: %s\r\n", buf);
+  RGWStatAccount_ObjStore::execute();
+
+  ret = rgw_get_user_attrs_by_uid(store, s->user.user_id, attrs);
 }
 
 void RGWStatAccount_ObjStore_SWIFT::send_response()
 {
   if (ret >= 0) {
     ret = STATUS_NO_CONTENT;
-    dump_account_metadata(s, buckets_count, buckets_objcount, buckets_size, buckets_size_rounded);
+    dump_account_metadata(s, buckets_count, buckets_objcount, buckets_size, buckets_size_rounded, attrs);
   }
 
   set_req_state_err(s, ret);
@@ -422,15 +482,52 @@ void RGWDeleteBucket_ObjStore_SWIFT::send_response()
   rgw_flush_formatter_and_reset(s, s->formatter);
 }
 
+static int get_delete_at_param(req_state *s, time_t *delete_at)
+{
+  /* Handle Swift object expiration. */
+  utime_t delat_proposal;
+  string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", "");
+
+  if (x_delete.empty()) {
+    x_delete = s->info.env->get("HTTP_X_DELETE_AT", "");
+  } else {
+    /* The X-Delete-After HTTP header is present. It means we need to add
+     * its value to the current time. */
+    delat_proposal = ceph_clock_now(g_ceph_context);
+  }
+
+  if (x_delete.empty()) {
+    return 0;
+  }
+  string err;
+  long ts = strict_strtoll(x_delete.c_str(), 10, &err);
+
+  if (!err.empty()) {
+    return -EINVAL;
+  }
+
+  delat_proposal += utime_t(ts, 0);
+  if (delat_proposal < ceph_clock_now(g_ceph_context)) {
+    return -EINVAL;
+  }
+
+  *delete_at = delat_proposal.sec();
+
+  return 0;
+}
+
 int RGWPutObj_ObjStore_SWIFT::get_params()
 {
-  if (s->has_bad_meta)
+  if (s->has_bad_meta) {
     return -EINVAL;
+  }
 
   if (!s->length) {
     const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
-    if (!encoding || strcmp(encoding, "chunked") != 0)
+    if (!encoding || strcmp(encoding, "chunked") != 0) {
+      ldout(s->cct, 20) << "neither length nor chunked encoding" << dendl;
       return -ERR_LENGTH_REQUIRED;
+    }
 
     chunked_upload = true;
   }
@@ -438,7 +535,7 @@ int RGWPutObj_ObjStore_SWIFT::get_params()
   supplied_etag = s->info.env->get("HTTP_ETAG");
 
   if (!s->generic_attrs.count(RGW_ATTR_CONTENT_TYPE)) {
-    dout(5) << "content type wasn't provided, trying to guess" << dendl;
+    ldout(s->cct, 5) << "content type wasn't provided, trying to guess" << dendl;
     const char *suffix = strrchr(s->object.name.c_str(), '.');
     if (suffix) {
       suffix++;
@@ -456,6 +553,12 @@ int RGWPutObj_ObjStore_SWIFT::get_params()
 
   obj_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST");
 
+  int r = get_delete_at_param(s, &delete_at);
+  if (r < 0) {
+    ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl;
+    return r;
+  }
+
   return RGWPutObj_ObjStore::get_params();
 }
 
@@ -471,87 +574,116 @@ void RGWPutObj_ObjStore_SWIFT::send_response()
   rgw_flush_formatter_and_reset(s, s->formatter);
 }
 
-#define REMOVE_ATTR_PREFIX     "HTTP_X_REMOVE_CONTAINER_META_"
-#define PUT_ATTR_PREFIX        "HTTP_X_CONTAINER_META_"
-#define REMOVE_ATTR_PREFIX_LEN sizeof(REMOVE_ATTR_PREFIX) - 1
-#define PUT_ATTR_PREFIX_LEN    sizeof(PUT_ATTR_PREFIX) - 1
+#define ACCT_REMOVE_ATTR_PREFIX     "HTTP_X_REMOVE_ACCOUNT_META_"
+#define ACCT_PUT_ATTR_PREFIX        "HTTP_X_ACCOUNT_META_"
+#define CONT_REMOVE_ATTR_PREFIX     "HTTP_X_REMOVE_CONTAINER_META_"
+#define CONT_PUT_ATTR_PREFIX        "HTTP_X_CONTAINER_META_"
 
-int RGWPutMetadata_ObjStore_SWIFT::get_params()
+static void get_rmattrs_from_headers(const req_state * const s,
+                                     const char * const put_prefix,
+                                     const char * const del_prefix,
+                                     set<string>& rmattr_names)
 {
-  if (s->has_bad_meta)
-    return -EINVAL;
+  map<string, string, ltstr_nocase>& m = s->info.env->get_map();
+  map<string, string, ltstr_nocase>::const_iterator iter;
+  const size_t put_prefix_len = strlen(put_prefix);
+  const size_t del_prefix_len = strlen(del_prefix);
 
-  if (s->object.empty()) {
-    int r = get_swift_container_settings(s, store, &policy, &has_policy, &cors_config, &has_cors);
-    if (r < 0) {
-      return r;
+  for (iter = m.begin(); iter != m.end(); ++iter) {
+    size_t prefix_len = 0;
+    const char * const p = iter->first.c_str();
+
+    if (strncasecmp(p, del_prefix, del_prefix_len) == 0) {
+      /* Explicitly requested removal. */
+      prefix_len = del_prefix_len;
+    } else if ((strncasecmp(p, put_prefix, put_prefix_len) == 0)
+        && iter->second.empty()) {
+      /* Removal requested by putting an empty value. */
+      prefix_len = put_prefix_len;
     }
-    map<string, string, ltstr_nocase>& m = s->info.env->get_map();
-    map<string, string, ltstr_nocase>::iterator iter;
-    for (iter = m.begin(); iter != m.end(); ++iter) {
-      size_t prefix_len = 0;
-      const char *p = iter->first.c_str();
-      if (strncasecmp(p, REMOVE_ATTR_PREFIX, REMOVE_ATTR_PREFIX_LEN) == 0) {
-        // Explicitly requested removal
-        prefix_len = REMOVE_ATTR_PREFIX_LEN;
-      } else if ((strncasecmp(p, PUT_ATTR_PREFIX, PUT_ATTR_PREFIX_LEN) == 0) && iter->second.empty()) {
-        // Removal requested by putting an empty value
-        prefix_len = PUT_ATTR_PREFIX_LEN;
-      }
-      if (prefix_len > 0) {
-        string name(RGW_ATTR_META_PREFIX);
-        name.append(lowercase_dash_http_attr(p + prefix_len));
-        rmattr_names.insert(name);
-      }
+
+    if (prefix_len > 0) {
+      string name(RGW_ATTR_META_PREFIX);
+      name.append(lowercase_dash_http_attr(p + prefix_len));
+      rmattr_names.insert(name);
     }
   }
-  placement_rule = s->info.env->get("HTTP_X_STORAGE_POLICY", "");
+}
+
+int RGWPutMetadataAccount_ObjStore_SWIFT::get_params()
+{
+  if (s->has_bad_meta) {
+    return -EINVAL;
+  }
 
+  get_rmattrs_from_headers(s, ACCT_PUT_ATTR_PREFIX, ACCT_REMOVE_ATTR_PREFIX, rmattr_names);
   return 0;
 }
 
-void RGWPutMetadata_ObjStore_SWIFT::send_response()
+void RGWPutMetadataAccount_ObjStore_SWIFT::send_response()
 {
   if (!ret) {
-    /* Return 204 when post metadata on a container */
-    if (s->object.empty())
-      ret = STATUS_NO_CONTENT;
-    else
-      ret = STATUS_ACCEPTED;
+    ret = STATUS_NO_CONTENT;
   }
   set_req_state_err(s, ret);
-  if (!s->err.is_err())
-    dump_content_length(s, 0);
   dump_errno(s);
   end_header(s, this);
   rgw_flush_formatter_and_reset(s, s->formatter);
 }
 
-int RGWSetTempUrl_ObjStore_SWIFT::get_params()
+int RGWPutMetadataBucket_ObjStore_SWIFT::get_params()
 {
-  const char *temp_url = s->info.env->get("HTTP_X_ACCOUNT_META_TEMP_URL_KEY");
-  if (temp_url) {
-    temp_url_keys[0] = temp_url;
+  if (s->has_bad_meta) {
+    return -EINVAL;
   }
 
-  temp_url = s->info.env->get("HTTP_X_ACCOUNT_META_TEMP_URL_KEY_2");
-  if (temp_url) {
-    temp_url_keys[1] = temp_url;
+  int r = get_swift_container_settings(s, store, &policy, &has_policy, &cors_config, &has_cors);
+  if (r < 0) {
+    return r;
   }
 
-  if (temp_url_keys.empty())
+  get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, CONT_REMOVE_ATTR_PREFIX, rmattr_names);
+  placement_rule = s->info.env->get("HTTP_X_STORAGE_POLICY", "");
+  return 0;
+}
+
+void RGWPutMetadataBucket_ObjStore_SWIFT::send_response()
+{
+  if (!ret) {
+    ret = STATUS_NO_CONTENT;
+  }
+  set_req_state_err(s, ret);
+  dump_errno(s);
+  end_header(s, this);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+int RGWPutMetadataObject_ObjStore_SWIFT::get_params()
+{
+  if (s->has_bad_meta) {
     return -EINVAL;
+  }
+
+  /* Handle Swift object expiration. */
+  int r = get_delete_at_param(s, &delete_at);
+  if (r < 0) {
+    ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl;
+    return r;
+  }
 
+  placement_rule = s->info.env->get("HTTP_X_STORAGE_POLICY", "");
   return 0;
 }
 
-void RGWSetTempUrl_ObjStore_SWIFT::send_response()
+void RGWPutMetadataObject_ObjStore_SWIFT::send_response()
 {
-  int r = ret;
-  if (!r)
-    r = STATUS_NO_CONTENT;
-
-  set_req_state_err(s, r);
+  if (!ret) {
+    ret = STATUS_ACCEPTED;
+  }
+  set_req_state_err(s, ret);
+  if (!s->err.is_err()) {
+    dump_content_length(s, 0);
+  }
   dump_errno(s);
   end_header(s, this);
   rgw_flush_formatter_and_reset(s, s->formatter);
@@ -602,6 +734,17 @@ static void dump_object_metadata(struct req_state * const s,
   for (riter = response_attrs.begin(); riter != response_attrs.end(); ++riter) {
     s->cio->print("%s: %s\r\n", riter->first.c_str(), riter->second.c_str());
   }
+
+  iter = attrs.find(RGW_ATTR_DELETE_AT);
+  if (iter != attrs.end()) {
+    utime_t delete_at;
+    try {
+      ::decode(delete_at, iter->second);
+      s->cio->print("X-Delete-At: %lu\r\n", delete_at.sec());
+    } catch (buffer::error& err) {
+      dout(0) << "ERROR: cannot decode object's " RGW_ATTR_DELETE_AT " attr, ignoring" << dendl;
+    }
+  }
 }
 
 int RGWCopyObj_ObjStore_SWIFT::init_dest_policy()
@@ -630,6 +773,12 @@ int RGWCopyObj_ObjStore_SWIFT::get_params()
     attrs_mod = RGWRados::ATTRSMOD_MERGE;
   }
 
+  int r = get_delete_at_param(s, &delete_at);
+  if (r < 0) {
+    ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl;
+    return r;
+  }
+
   return 0;
 }
 
@@ -696,11 +845,20 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
 {
   string content_type;
 
-  if (sent_header)
+  if (sent_header) {
     goto send_data;
+  }
 
-  if (range_str)
+  set_req_state_err(s, (partial_content && !ret) ? STATUS_PARTIAL_CONTENT : ret);
+  dump_errno(s);
+  if (s->err.is_err()) {
+    end_header(s, NULL);
+    return 0;
+  }
+
+  if (range_str) {
     dump_range(s, ofs, end, s->obj_size);
+  }
 
   dump_content_length(s, total_len);
   dump_last_modified(s, lastmod);
@@ -720,8 +878,6 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
     dump_object_metadata(s, attrs);
   }
 
-  set_req_state_err(s, (partial_content && !ret) ? STATUS_PARTIAL_CONTENT : ret);
-  dump_errno(s);
   end_header(s, this, !content_type.empty() ? content_type.c_str() : "binary/octet-stream");
 
   sent_header = true;
@@ -770,15 +926,7 @@ RGWOp *RGWHandler_ObjStore_Service_SWIFT::op_head()
 
 RGWOp *RGWHandler_ObjStore_Service_SWIFT::op_post()
 {
-  const char *temp_url = s->info.env->get("HTTP_X_ACCOUNT_META_TEMP_URL_KEY");
-  if (temp_url) {
-    return new RGWSetTempUrl_ObjStore_SWIFT;
-  }
-  temp_url = s->info.env->get("HTTP_X_ACCOUNT_META_TEMP_URL_KEY_2");
-  if (temp_url) {
-    return new RGWSetTempUrl_ObjStore_SWIFT;
-  }
-  return NULL;
+  return new RGWPutMetadataAccount_ObjStore_SWIFT;
 }
 
 RGWOp *RGWHandler_ObjStore_Bucket_SWIFT::get_obj_op(bool get_data)
@@ -824,7 +972,7 @@ RGWOp *RGWHandler_ObjStore_Bucket_SWIFT::op_delete()
 
 RGWOp *RGWHandler_ObjStore_Bucket_SWIFT::op_post()
 {
-  return new RGWPutMetadata_ObjStore_SWIFT;
+  return new RGWPutMetadataBucket_ObjStore_SWIFT;
 }
 
 RGWOp *RGWHandler_ObjStore_Bucket_SWIFT::op_options()
@@ -877,7 +1025,7 @@ RGWOp *RGWHandler_ObjStore_Obj_SWIFT::op_delete()
 
 RGWOp *RGWHandler_ObjStore_Obj_SWIFT::op_post()
 {
-  return new RGWPutMetadata_ObjStore_SWIFT;
+  return new RGWPutMetadataObject_ObjStore_SWIFT;
 }
 
 RGWOp *RGWHandler_ObjStore_Obj_SWIFT::op_copy()
diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h
index e722a2f..55b41bb 100644
--- a/src/rgw/rgw_rest_swift.h
+++ b/src/rgw/rgw_rest_swift.h
@@ -14,6 +14,7 @@ public:
   ~RGWGetObj_ObjStore_SWIFT() {}
 
   int send_response_data(bufferlist& bl, off_t ofs, off_t len);
+  bool need_object_expiration() { return true; }
 };
 
 class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore {
@@ -28,6 +29,7 @@ public:
   void send_response_end();
 
   bool should_get_stats() { return need_stats; }
+  bool supports_account_metadata() { return true; }
 };
 
 class RGWListBucket_ObjStore_SWIFT : public RGWListBucket_ObjStore {
@@ -44,11 +46,13 @@ public:
 };
 
 class RGWStatAccount_ObjStore_SWIFT : public RGWStatAccount_ObjStore {
+  map<string, bufferlist> attrs;
 public:
   RGWStatAccount_ObjStore_SWIFT() {
   }
   ~RGWStatAccount_ObjStore_SWIFT() {}
 
+  void execute();
   void send_response();
 };
 
@@ -86,24 +90,34 @@ public:
   void send_response();
 };
 
-class RGWPutMetadata_ObjStore_SWIFT : public RGWPutMetadata_ObjStore {
+class RGWPutMetadataAccount_ObjStore_SWIFT : public RGWPutMetadataAccount_ObjStore {
 public:
-  RGWPutMetadata_ObjStore_SWIFT() {}
-  ~RGWPutMetadata_ObjStore_SWIFT() {}
+  RGWPutMetadataAccount_ObjStore_SWIFT() {}
+  ~RGWPutMetadataAccount_ObjStore_SWIFT() {}
 
   int get_params();
   void send_response();
 };
 
-class RGWSetTempUrl_ObjStore_SWIFT : public RGWSetTempUrl_ObjStore {
+class RGWPutMetadataBucket_ObjStore_SWIFT : public RGWPutMetadataBucket_ObjStore {
 public:
-  RGWSetTempUrl_ObjStore_SWIFT() {}
-  ~RGWSetTempUrl_ObjStore_SWIFT() {}
+  RGWPutMetadataBucket_ObjStore_SWIFT() {}
+  ~RGWPutMetadataBucket_ObjStore_SWIFT() {}
 
   int get_params();
   void send_response();
 };
 
+class RGWPutMetadataObject_ObjStore_SWIFT : public RGWPutMetadataObject_ObjStore {
+public:
+  RGWPutMetadataObject_ObjStore_SWIFT() {}
+  ~RGWPutMetadataObject_ObjStore_SWIFT() {}
+
+  int get_params();
+  void send_response();
+  bool need_object_expiration() { return true; }
+};
+
 class RGWDeleteObj_ObjStore_SWIFT : public RGWDeleteObj_ObjStore {
 public:
   RGWDeleteObj_ObjStore_SWIFT() {}
@@ -223,9 +237,6 @@ public:
   RGWRESTMgr_SWIFT() {}
   virtual ~RGWRESTMgr_SWIFT() {}
 
-  virtual RGWRESTMgr *get_resource_mgr(struct req_state *s, const string& uri) {
-    return this;
-  }
   virtual RGWHandler *get_handler(struct req_state *s);
 };
 
diff --git a/src/rgw/rgw_rest_user.cc b/src/rgw/rgw_rest_user.cc
index 5e618c4..6cd2591 100644
--- a/src/rgw/rgw_rest_user.cc
+++ b/src/rgw/rgw_rest_user.cc
@@ -307,6 +307,7 @@ void RGWOp_Subuser_Create::execute()
   RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
 
   perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
 
   // FIXME: no double checking
   if (!uid.empty())
@@ -318,9 +319,6 @@ void RGWOp_Subuser_Create::execute()
   if (!secret_key.empty())
     op_state.set_secret_key(secret_key);
 
-  if (perm_mask != 0)
-    op_state.set_perm(perm_mask);
-
   op_state.set_generate_subuser(gen_subuser);
 
   if (gen_secret)
@@ -374,6 +372,7 @@ void RGWOp_Subuser_Modify::execute()
   RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
 
   perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
 
   // FIXME: no double checking
   if (!uid.empty())
@@ -388,9 +387,6 @@ void RGWOp_Subuser_Modify::execute()
   if (gen_secret)
     op_state.set_gen_secret();
 
-  if (perm_mask != 0)
-    op_state.set_perm(perm_mask);
-
   if (!key_type_str.empty()) {
     if (key_type_str.compare("swift") == 0)
       key_type = KEY_TYPE_SWIFT;
@@ -628,10 +624,9 @@ struct UserQuotas {
 
   UserQuotas() {}
 
-  UserQuotas(RGWUserInfo& info) {
-    bucket_quota = info.bucket_quota;
-    user_quota = info.user_quota;
-  }
+  UserQuotas(RGWUserInfo& info) : bucket_quota(info.bucket_quota), 
+				  user_quota(info.user_quota) {}
+
   void dump(Formatter *f) const {
     encode_json("bucket_quota", bucket_quota, f);
     encode_json("user_quota", user_quota, f);
diff --git a/src/rgw/rgw_swift.cc b/src/rgw/rgw_swift.cc
index 09fb9b7..0a8d373 100644
--- a/src/rgw/rgw_swift.cc
+++ b/src/rgw/rgw_swift.cc
@@ -559,7 +559,7 @@ int authenticate_temp_url(RGWRados *store, req_state *s)
   /* need to get user info of bucket owner */
   RGWBucketInfo bucket_info;
 
-  int ret = store->get_bucket_info(*(RGWObjectCtx *)s->obj_ctx, s->bucket_name_str, bucket_info, NULL);
+  int ret = store->get_bucket_info(*static_cast<RGWObjectCtx *>(s->obj_ctx), s->bucket_name_str, bucket_info, NULL);
   if (ret < 0)
     return -EPERM;
 
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 1e122df..8d691e8 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -89,8 +89,13 @@ int rgw_user_sync_all_stats(RGWRados *store, const string& user_id)
  * Save the given user information to storage.
  * Returns: 0 on success, -ERR# on failure.
  */
-int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_info,
-                        RGWObjVersionTracker *objv_tracker, time_t mtime, bool exclusive)
+int rgw_store_user_info(RGWRados *store,
+                        RGWUserInfo& info,
+                        RGWUserInfo *old_info,
+                        RGWObjVersionTracker *objv_tracker,
+                        time_t mtime,
+                        bool exclusive,
+                        map<string, bufferlist> *pattrs)
 {
   bufferlist bl;
   info.encode(bl);
@@ -151,7 +156,7 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_inf
   ::encode(ui, data_bl);
   ::encode(info, data_bl);
 
-  ret = store->meta_mgr->put_entry(user_meta_handler, info.user_id, data_bl, exclusive, &ot, mtime);
+  ret = store->meta_mgr->put_entry(user_meta_handler, info.user_id, data_bl, exclusive, &ot, mtime, pattrs);
   if (ret < 0)
     return ret;
 
@@ -196,6 +201,18 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_inf
   return ret;
 }
 
+int rgw_store_user_attrs(RGWRados *const store,
+                         string& user_id,
+                         map<string, bufferlist>& attrs,
+                         map<string, bufferlist>* const rmattrs,
+                         RGWObjVersionTracker * const objv_tracker)
+{
+  rgw_obj obj(store->zone.user_uid_pool, user_id);
+
+  return store->meta_mgr->set_attrs(user_meta_handler, user_id, obj,
+                                    attrs, rmattrs, objv_tracker);
+}
+
 struct user_info_entry {
   RGWUserInfo info;
   RGWObjVersionTracker objv_tracker;
@@ -257,17 +274,22 @@ int rgw_get_user_info_from_index(RGWRados *store, string& key, rgw_bucket& bucke
  * Given a uid, finds the user info associated with it.
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
-int rgw_get_user_info_by_uid(RGWRados *store, string& uid, RGWUserInfo& info,
-                             RGWObjVersionTracker *objv_tracker, time_t *pmtime,
-                             rgw_cache_entry_info *cache_info)
+int rgw_get_user_info_by_uid(RGWRados *store,
+                             string& uid,
+                             RGWUserInfo& info,
+                             RGWObjVersionTracker *objv_tracker,
+                             time_t *pmtime,
+                             rgw_cache_entry_info *cache_info,
+                             map<string, bufferlist> *pattrs)
 {
   bufferlist bl;
   RGWUID user_id;
 
   RGWObjectCtx obj_ctx(store);
-  int ret = rgw_get_system_obj(store, obj_ctx, store->zone.user_uid_pool, uid, bl, objv_tracker, pmtime, NULL, cache_info);
-  if (ret < 0)
+  int ret = rgw_get_system_obj(store, obj_ctx, store->zone.user_uid_pool, uid, bl, objv_tracker, pmtime, pattrs, cache_info);
+  if (ret < 0) {
     return ret;
+  }
 
   bufferlist::iterator iter = bl.begin();
   try {
@@ -317,6 +339,20 @@ extern int rgw_get_user_info_by_access_key(RGWRados *store, string& access_key,
   return rgw_get_user_info_from_index(store, access_key, store->zone.user_keys_pool, info, objv_tracker, pmtime);
 }
 
+int rgw_get_user_attrs_by_uid(RGWRados *store,
+                              const string& user_id,
+                              map<string, bufferlist>& attrs,
+                              RGWObjVersionTracker *objv_tracker)
+{
+  RGWObjectCtx obj_ctx(store);
+  rgw_obj obj(store->zone.user_uid_pool, user_id);
+  RGWRados::SystemObject src(store, obj_ctx, obj);
+  RGWRados::SystemObject::Read rop(&src);
+
+  rop.stat_params.attrs = &attrs;
+  return rop.stat(objv_tracker);
+}
+
 int rgw_remove_key_index(RGWRados *store, RGWAccessKey& access_key)
 {
   rgw_obj obj(store->zone.user_keys_pool, access_key.id);
@@ -430,7 +466,7 @@ int rgw_delete_user(RGWRados *store, RGWUserInfo& info, RGWObjVersionTracker& ob
   rgw_obj uid_obj(store->zone.user_uid_pool, info.user_id);
   ldout(store->ctx(), 10) << "removing user index: " << info.user_id << dendl;
   ret = store->meta_mgr->remove_entry(user_meta_handler, info.user_id, &objv_tracker);
-  if (ret < 0 && ret != -ENOENT) {
+  if (ret < 0 && ret != -ENOENT && ret  != -ECANCELED) {
     ldout(store->ctx(), 0) << "ERROR: could not remove " << info.user_id << ":" << uid_obj << ", should be fixed (err=" << ret << ")" << dendl;
     return ret;
   }
@@ -498,7 +534,9 @@ void rgw_perm_to_str(uint32_t mask, char *buf, int len)
 
 uint32_t rgw_str_to_perm(const char *str)
 {
-  if (strcasecmp(str, "read") == 0)
+  if (strcasecmp(str, "") == 0)
+    return RGW_PERM_NONE;
+  else if (strcasecmp(str, "read") == 0)
     return RGW_PERM_READ;
   else if (strcasecmp(str, "write") == 0)
     return RGW_PERM_WRITE;
@@ -507,7 +545,7 @@ uint32_t rgw_str_to_perm(const char *str)
   else if (strcasecmp(str, "full") == 0)
     return RGW_PERM_FULL_CONTROL;
 
-  return 0; // better to return no permission
+  return RGW_PERM_INVALID;
 }
 
 static bool validate_access_key(string& key)
@@ -775,9 +813,14 @@ int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state,
 
   int32_t key_type = op_state.get_key_type();
 
-  // if a key type wasn't specified set it to s3
-  if (key_type < 0)
-    key_type = KEY_TYPE_S3;
+  // if a key type wasn't specified
+  if (key_type < 0) {
+      if (op_state.has_subuser()) {
+        key_type = KEY_TYPE_SWIFT;
+      } else {
+        key_type = KEY_TYPE_S3;
+      }
+  }
 
   op_state.set_key_type(key_type);
 
@@ -840,12 +883,23 @@ int RGWAccessKeyPool::generate_key(RGWUserAdminOpState& op_state, std::string *e
     }
   }
 
-  if (op_state.has_subuser())
-    new_key.subuser = op_state.get_subuser();
+  //key's subuser
+  if (op_state.has_subuser()) {
+    //create user and subuser at the same time, user's s3 key should not be set this
+    if (!op_state.key_type_setbycontext || (key_type == KEY_TYPE_SWIFT)) {
+      new_key.subuser = op_state.get_subuser();
+    }
+  }
 
+  //Secret key
   if (!gen_secret) {
+    if (op_state.get_secret_key().empty()) {
+      set_err_msg(err_msg, "empty secret key");
+      return -EINVAL; 
+    }
+  
     key = op_state.get_secret_key();
-  } else if (gen_secret) {
+  } else {
     char secret_key_buf[SECRET_KEY_LEN + 1];
 
     ret = gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
@@ -1195,6 +1249,17 @@ int RGWSubUserPool::check_op(RGWUserAdminOpState& op_state,
     return -EINVAL;
   }
 
+  if (op_state.get_subuser_perm() == RGW_PERM_INVALID) {
+    set_err_msg(err_msg, "invaild subuser access");
+    return -EINVAL;
+  }
+
+  //set key type when it not set or set by context
+  if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
+    op_state.set_key_type(KEY_TYPE_SWIFT);
+    op_state.key_type_setbycontext = true;
+  }
+
   // check if the subuser exists
   if (!subuser.empty())
     existing = exists(subuser);
@@ -1262,7 +1327,7 @@ int RGWSubUserPool::add(RGWUserAdminOpState& op_state, std::string *err_msg, boo
   }
 
   if (op_state.get_secret_key().empty()) {
-    op_state.set_gen_access();
+    op_state.set_gen_secret();
   }
 
   ret = execute_add(op_state, &subprocess_msg, defer_user_update);
@@ -1713,6 +1778,12 @@ int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg)
     return -EINVAL;
   }
 
+  //set key type when it not set or set by context
+  if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
+    op_state.set_key_type(KEY_TYPE_S3);
+    op_state.key_type_setbycontext = true;
+  }
+
   return 0;
 }
 
@@ -1860,7 +1931,7 @@ int RGWUser::execute_remove(RGWUserAdminOpState& op_state, std::string *err_msg)
 
   if (!op_state.has_existing_user()) {
     set_err_msg(err_msg, "user does not exist");
-    return -EINVAL;
+    return -ENOENT;
   }
 
   bool done;
@@ -2157,8 +2228,11 @@ int RGWUserAdminOp_User::create(RGWRados *store, RGWUserAdminOpState& op_state,
   Formatter *formatter = flusher.get_formatter();
 
   ret = user.add(op_state, NULL);
-  if (ret < 0)
+  if (ret < 0) {
+    if (ret == -EEXIST)
+      ret = -ERR_USER_EXIST;
     return ret;
+  }
 
   ret = user.info(info, NULL);
   if (ret < 0)
@@ -2390,16 +2464,37 @@ int RGWUserAdminOp_Caps::remove(RGWRados *store, RGWUserAdminOpState& op_state,
   return 0;
 }
 
-class RGWUserMetadataObject : public RGWMetadataObject {
+struct RGWUserCompleteInfo {
   RGWUserInfo info;
+  map<string, bufferlist> attrs;
+  bool has_attrs;
+
+  RGWUserCompleteInfo()
+    : has_attrs(false)
+  {}
+
+  void dump(Formatter * const f) const {
+    info.dump(f);
+    encode_json("attrs", attrs, f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    decode_json_obj(info, obj);
+    has_attrs = JSONDecoder::decode_json("attrs", attrs, obj);
+  }
+};
+
+class RGWUserMetadataObject : public RGWMetadataObject {
+  RGWUserCompleteInfo uci;
 public:
-  RGWUserMetadataObject(RGWUserInfo& i, obj_version& v, time_t m) : info(i) {
+  RGWUserMetadataObject(const RGWUserCompleteInfo& _uci, obj_version& v, time_t m)
+      : uci(_uci) {
     objv = v;
     mtime = m;
   }
 
   void dump(Formatter *f) const {
-    info.dump(f);
+    uci.dump(f);
   }
 };
 
@@ -2408,17 +2503,17 @@ public:
   string get_type() { return "user"; }
 
   int get(RGWRados *store, string& entry, RGWMetadataObject **obj) {
-    RGWUserInfo info;
-
+    RGWUserCompleteInfo uci;
     RGWObjVersionTracker objv_tracker;
     time_t mtime;
 
-    int ret = rgw_get_user_info_by_uid(store, entry, info, &objv_tracker, &mtime);
-    if (ret < 0)
+    int ret = rgw_get_user_info_by_uid(store, entry, uci.info, &objv_tracker,
+                                       &mtime, NULL, &uci.attrs);
+    if (ret < 0) {
       return ret;
+    }
 
-    RGWUserMetadataObject *mdo = new RGWUserMetadataObject(info, objv_tracker.read_version, mtime);
-
+    RGWUserMetadataObject *mdo = new RGWUserMetadataObject(uci, objv_tracker.read_version, mtime);
     *obj = mdo;
 
     return 0;
@@ -2426,9 +2521,14 @@ public:
 
   int put(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker,
           time_t mtime, JSONObj *obj, sync_type_t sync_mode) {
-    RGWUserInfo info;
+    RGWUserCompleteInfo uci;
 
-    decode_json_obj(info, obj);
+    decode_json_obj(uci, obj);
+
+    map<string, bufferlist> *pattrs = NULL;
+    if (uci.has_attrs) {
+      pattrs = &uci.attrs;
+    }
 
     RGWUserInfo old_info;
     time_t orig_mtime;
@@ -2443,9 +2543,10 @@ public:
       return STATUS_NO_APPLY;
     }
 
-    ret = rgw_store_user_info(store, info, &old_info, &objv_tracker, mtime, false);
-    if (ret < 0)
+    ret = rgw_store_user_info(store, uci.info, &old_info, &objv_tracker, mtime, false, pattrs);
+    if (ret < 0) {
       return ret;
+    }
 
     return STATUS_APPLIED;
   }
diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h
index 6204b09..0f26cff 100644
--- a/src/rgw/rgw_user.h
+++ b/src/rgw/rgw_user.h
@@ -55,17 +55,37 @@ extern bool rgw_user_is_authenticated(RGWUserInfo& info);
  * Save the given user information to storage.
  * Returns: 0 on success, -ERR# on failure.
  */
-extern int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_info,
-                               RGWObjVersionTracker *objv_tracker, time_t mtime, bool exclusive);
+extern int rgw_store_user_info(RGWRados *store,
+                               RGWUserInfo& info,
+                               RGWUserInfo *old_info,
+                               RGWObjVersionTracker *objv_tracker,
+                               time_t mtime,
+                               bool exclusive,
+                               map<string, bufferlist> *pattrs = NULL);
 /**
- * Given an email, finds the user info associated with it.
+ * Save the custom user metadata given in @attrs and delete those in @rmattrs
+ * for user specified in @user_id.
+ * Returns: 0 on success, -ERR# on failure.
+ */
+extern int rgw_store_user_attrs(RGWRados *store,
+                                string& user_id,
+                                map<string, bufferlist>& attrs,
+                                map<string, bufferlist>* rmattrs,
+                                RGWObjVersionTracker *objv_tracker);
+
+/**
+ * Given an user_id, finds the user info associated with it.
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
-extern int rgw_get_user_info_by_uid(RGWRados *store, string& user_id, RGWUserInfo& info,
-                                    RGWObjVersionTracker *objv_tracker = NULL, time_t *pmtime = NULL,
-                                    rgw_cache_entry_info *cache_info = NULL);
+extern int rgw_get_user_info_by_uid(RGWRados *store,
+                                    string& user_id,
+                                    RGWUserInfo& info,
+                                    RGWObjVersionTracker *objv_tracker = NULL,
+                                    time_t *pmtime                     = NULL,
+                                    rgw_cache_entry_info *cache_info   = NULL,
+                                    map<string, bufferlist> *pattrs    = NULL);
 /**
- * Given an swift username, finds the user info associated with it.
+ * Given an email, finds the user info associated with it.
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
 extern int rgw_get_user_info_by_email(RGWRados *store, string& email, RGWUserInfo& info,
@@ -83,6 +103,15 @@ extern int rgw_get_user_info_by_swift(RGWRados *store, string& swift_name, RGWUs
 extern int rgw_get_user_info_by_access_key(RGWRados *store, string& access_key, RGWUserInfo& info,
                                            RGWObjVersionTracker *objv_tracker = NULL, time_t *pmtime = NULL);
 /**
+ * Get all the custom metadata stored for user specified in @user_id
+ * and put it into @attrs.
+ * Returns: 0 on success, -ERR# on failure.
+ */
+extern int rgw_get_user_attrs_by_uid(RGWRados *store,
+                                     const string& user_id,
+                                     map<string, bufferlist>& attrs,
+                                     RGWObjVersionTracker *objv_tracker = NULL);
+/**
  * Given an RGWUserInfo, deletes the user and its bucket ACLs.
  */
 extern int rgw_delete_user(RGWRados *store, RGWUserInfo& user, RGWObjVersionTracker& objv_tracker);
@@ -161,6 +190,7 @@ struct RGWUserAdminOpState {
   bool id_specified;
   bool key_specified;
   bool type_specified;
+  bool key_type_setbycontext;   // key type set by user or subuser context
   bool purge_data;
   bool purge_keys;
   bool display_name_specified;
@@ -411,7 +441,7 @@ struct RGWUserAdminOpState {
   {
     max_buckets = RGW_DEFAULT_MAX_BUCKETS;
     key_type = -1;
-    perm_mask = 0;
+    perm_mask = RGW_PERM_NONE;
     suspended = 0;
     system = 0;
     exclusive = 0;
@@ -431,6 +461,7 @@ struct RGWUserAdminOpState {
     id_specified = false;
     key_specified = false;
     type_specified = false;
+    key_type_setbycontext = false;
     purge_data = false;
     display_name_specified = false;
     user_email_specified = false;
diff --git a/src/rgw/rgw_xml.h b/src/rgw/rgw_xml.h
index 164e97a..c4722ab 100644
--- a/src/rgw/rgw_xml.h
+++ b/src/rgw/rgw_xml.h
@@ -6,7 +6,7 @@
 
 #include <map>
 #include <string>
-#include <iostream>
+#include <iosfwd>
 #include <include/types.h>
 
 #include <expat.h>
diff --git a/src/rocksdb/.arcconfig b/src/rocksdb/.arcconfig
deleted file mode 100644
index 85ca38f..0000000
--- a/src/rocksdb/.arcconfig
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "project_id" : "rocksdb",
-  "conduit_uri" : "https://reviews.facebook.net/",
-  "copyright_holder" : "Facebook",
-  "load" : [
-    "linters"
-  ],
-  "lint.engine" : "FacebookFbcodeLintEngine",
-  "lint.engine.single.linter" : "FbcodeCppLinter"
-}
diff --git a/src/rocksdb/.clang-format b/src/rocksdb/.clang-format
deleted file mode 100644
index 7c27981..0000000
--- a/src/rocksdb/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
-# Complete list of style options can be found at: 
-# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
----
-BasedOnStyle: Google
-...
diff --git a/src/rocksdb/.gitignore b/src/rocksdb/.gitignore
index 5b31e68..c537990 100644
--- a/src/rocksdb/.gitignore
+++ b/src/rocksdb/.gitignore
@@ -1,5 +1,5 @@
 TARGETS
-build_config.mk
+make_config.mk
 
 *.a
 *.arc
@@ -20,6 +20,7 @@ build_config.mk
 *.d-e
 *.o-*
 *.swp
+*~
 *.lo
 *~
 
@@ -30,11 +31,27 @@ util/build_version.cc
 build_tools/VALGRIND_LOGS/
 coverage/COVERAGE_REPORT
 .gdbhistory
+package/
 .phutil_module_cache
+unity
 tags
+
+java/out
+java/target
+java/test-libs
 java/*.log
 java/include/org_rocksdb_*.h
 
+.idea/
+*.iml
+
+unity.cc
+java/crossbuild/.vagrant
+.vagrant/
+java/**.asc
+java/javadoc
+
+scan_build_report/
 .dirstamp
 .deps/
 .libs/
@@ -57,4 +74,7 @@ ltmain.sh
 missing
 stamp-h1
 Makefile
+t
+LOG
 
+/m4/
diff --git a/src/rocksdb/AUTHORS b/src/rocksdb/AUTHORS
new file mode 100644
index 0000000..e644f55
--- /dev/null
+++ b/src/rocksdb/AUTHORS
@@ -0,0 +1,11 @@
+Facebook Inc.
+Facebook Engineering Team
+
+Google Inc.
+# Initial version authors:
+Jeffrey Dean <jeff at google.com>
+Sanjay Ghemawat <sanjay at google.com>
+
+# Partial list of contributors:
+Kevin Regan <kevin.d.regan at gmail.com>
+Johan Bilien <jobi at litl.com>
diff --git a/src/rocksdb/CONTRIBUTING.md b/src/rocksdb/CONTRIBUTING.md
index 7270d0c..d6467fe 100644
--- a/src/rocksdb/CONTRIBUTING.md
+++ b/src/rocksdb/CONTRIBUTING.md
@@ -10,9 +10,8 @@ the CLA and we can cross-check with your GitHub username.
 
 Complete your CLA here: <https://code.facebook.com/cla>
 
-If you don't have a Facebook account, we can send you a PDF that you can
-sign offline. Send us an e-mail or create a new github issue to
-request the CLA in PDF format.
+If you prefer to sign a paper copy, we can send you a PDF.  Send us an 
+e-mail or create a new github issue to request the CLA in PDF format.
 
 ## License
 
diff --git a/src/rocksdb/HISTORY.md b/src/rocksdb/HISTORY.md
index f64d532..2a52367 100644
--- a/src/rocksdb/HISTORY.md
+++ b/src/rocksdb/HISTORY.md
@@ -1,10 +1,189 @@
 # Rocksdb Change Log
 
+## 3.11.2 (6/11/2015)
+
+### Fixes
+* Adjust the way we compensate for tombstones when chosing compactions. Previous heuristics led to pathological behavior in some cases.
+* Don't let two L0->L1 compactions run in parallel (only affected through experimental feature SuggestCompactRange)
+
+## 3.11.1 (6/1/2015)
+
+### Changes
+* Just a single change to fix the Java linking (github issue #606)
+
+## 3.11.0 (5/19/2015)
+
+### New Features
+* Added a new API Cache::SetCapacity(size_t capacity) to dynamically change the maximum configured capacity of the cache. If the new capacity is less than the existing cache usage, the implementation will try to lower the usage by evicting the necessary number of elements following a strict LRU policy.
+
+### New Features
+* Added an experimental API for handling flashcache devices (blacklists background threads from caching their reads) -- NewFlashcacheAwareEnv
+* If universal compaction is used and options.num_levels > 1, compact files are tried to be stored in none-L0 with smaller files based on options.target_file_size_base. The limitation of DB size when using universal compaction is greatly mitigated by using more levels. You can set num_levels = 1 to make universal compaction behave as before. If you set num_levels > 1 and want to roll back to a previous version, you need to compact all files to a big file in level 0 (by setting target_fil [...]
+* More information about rocksdb background threads are available in Env::GetThreadList(), including the number of bytes read / written by a compaction job, mem-table size and current number of bytes written by a flush job and many more.  Check include/rocksdb/thread_status.h for more detail.
+
+### Public API changes
+* TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users.
+* DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to use this option to sync WAL in the background, please use wal_bytes_per_sync
+
+## 3.10.0 (3/24/2015)
+### New Features
+* GetThreadStatus() is now able to report detailed thread status, including:
+ - Thread Operation including flush and compaction.
+ - The stage of the current thread operation.
+ - The elapsed time in micros since the current thread operation started.
+ More information can be found in include/rocksdb/thread_status.h.  In addition, when running db_bench with --thread_status_per_interval, db_bench will also report thread status periodically.
+* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted. This change made parameter removeScanCountLimit obsolete. Because of that NewLRUCache doesn't take three arguments anymore. table_cache_remove_scan_limit option is also removed
+* By default we now optimize the compilation for the compilation platform (using -march=native). If you want to build portable binary, use 'PORTABLE=1' before the make command.
+* We now allow level-compaction to place files in different paths by
+  specifying them in db_paths along with the target_size.
+  Lower numbered levels will be placed earlier in the db_paths and higher
+  numbered levels will be placed later in the db_paths vector.
+* Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000)
+* Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guidline, newer versions have more features, but might not be readable by older versions of RocksDB.
+* Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions.
+* MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv().
+* Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash.
+* Block based table now makes use of prefix bloom filter if it is a full fulter.
+* Block based table remembers whether a whole key or prefix based bloom filter is supported in SST files. Do a sanity check when reading the file with users' configuration.
+* Fixed a bug in ReadOnlyBackupEngine that deleted corrupted backups in some cases, even though the engine was ReadOnly
+* options.level_compaction_dynamic_level_bytes, a feature to allow RocksDB to pick dynamic base of bytes for levels. With this feature turned on, we will automatically adjust max bytes for each level. The goal of this feature is to have lower bound on size amplification. For more details, see comments in options.h.
+* Added an abstract base class WriteBatchBase for write batches
+* Fixed a bug where we start deleting files of a dropped column families even if there are still live references to it
+
+### Public API changes
+* Deprecated skip_log_error_on_recovery and table_cache_remove_scan_count_limit options.
+* Logger method logv with log level parameter is now virtual
+
+### RocksJava
+* Added compression per level API.
+* MemEnv is now available in RocksJava via RocksMemEnv class.
+* lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`.
+* Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly.
+
+## 3.9.0 (12/8/2014)
+
+### New Features
+* Add rocksdb::GetThreadList(), which in the future will return the current status of all
+  rocksdb-related threads.  We will have more code instruments in the following RocksDB
+  releases.
+* Change convert function in rocksdb/utilities/convenience.h to return Status instead of boolean.
+  Also add support for nested options in convert function
+
+### Public API changes
+* New API to create a checkpoint added. Given a directory name, creates a new
+  database which is an image of the existing database.
+* New API LinkFile added to Env. If you implement your own Env class, an
+  implementation of the API LinkFile will have to be provided.
+* MemTableRep takes MemTableAllocator instead of Arena
+
+### Improvements
+* RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag.
+
+## 3.8.0 (11/14/2014)
+
+### Public API changes
+* BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on.
+* BackupableDB/RestoreBackupableDB have new GarbageCollect() methods, which will clean up files from corrupt and obsolete backups.
+* BackupableDB/RestoreBackupableDB have new GetCorruptedBackups() methods which list corrupt backups.
+
+### Cleanup
+* Bunch of code cleanup, some extra warnings turned on (-Wshadow, -Wshorten-64-to-32, -Wnon-virtual-dtor)
+
+### New features
+* CompactFiles and EventListener, although they are still in experimental state
+* Full ColumnFamily support in RocksJava.
+
+## 3.7.0 (11/6/2014)
+### Public API changes
+* Introduce SetOptions() API to allow adjusting a subset of options dynamically online
+* Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString()
+* Remove WriteBatchWithIndex.Delete() overloads using SliceParts
+* When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it.
+
+## 3.6.0 (10/7/2014)
+### Disk format changes
+* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy
+
+### Behavior changes
+* We have refactored our system of stalling writes.  Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6.
+* When disableDataSync=true, we no longer sync the MANIFEST file.
+* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly.
+
+### Public API changes
+* Change target_file_size_base type to uint64_t from int.
+* Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on.
+
+## 3.5.0 (9/3/2014)
+### New Features
+* Add include/utilities/write_batch_with_index.h, providing a utilitiy class to query data out of WriteBatch when building it.
+* Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include:
+  no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to shared_ptr from a raw pointer.
+* Remove deprecated options: disable_seek_compaction and db_stats_log_interval
+* OptimizeForPointLookup() takes one parameter for block cache size. It now builds hash index, bloom filter, and block cache.
+
+### Public API changes
+* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
+
+## 3.4.0 (8/18/2014)
+### New Features
+* Support Multiple DB paths in universal style compactions
+* Add feature of storing plain table index and bloom filter in SST file.
+* CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0.
+* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve perfomance in case there are a large number of delete markers or overwritten entries.
+
+### Public API changes
+* DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size
+* NewPlainTableFactory instead of bunch of parameters now accepts PlainTableOptions, which is defined in include/rocksdb/table.h
+* Moved include/utilities/*.h to include/rocksdb/utilities/*.h
+* Statistics APIs now take uint32_t as type instead of Tickers. Also make two access functions getTickerCount and histogramData const
+* Add DB property rocksdb.estimate-num-keys, estimated number of live keys in DB.
+* Add DB::GetIntProperty(), which returns DB properties that are integer as uint64_t.
+* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
+
+## 3.3.0 (7/10/2014)
+### New Features
+* Added JSON API prototype.
+* HashLinklist reduces performance outlier caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory().
+* RocksDB is now able to reclaim storage space more effectively during the compaction process.  This is done by compensating the size of each deletion entry by the 2X average value size, which makes compaction to be triggerred by deletion entries more easily.
+* Add TimeOut API to write.  Now WriteOptions have a variable called timeout_hint_us.  With timeout_hint_us set to non-zero, any write associated with this timeout_hint_us may be aborted when it runs longer than the specified timeout_hint_us, and it is guaranteed that any write completes earlier than the specified time-out will not be aborted due to the time-out condition.
+* Add a rate_limiter option, which controls total throughput of flush and compaction. The throughput is specified in bytes/sec. Flush always has precedence over compaction when available bandwidth is constrained.
+
+### Public API changes
+* Removed NewTotalOrderPlainTableFactory because it is not used and implemented semantically incorrect.
+
+## 3.2.0 (06/20/2014)
+
+### Public API changes
+* We removed seek compaction as a concept from RocksDB because:
+1) It makes more sense for spinning disk workloads, while RocksDB is primarily designed for flash and memory,
+2) It added some complexity to the important code-paths,
+3) None of our internal customers were really using it.
+Because of that, Options::disable_seek_compaction is now obsolete. It is still a parameter in Options, so it does not break the build, but it does not have any effect. We plan to completely remove it at some point, so we ask users to please remove this option from your code base.
+* Add two paramters to NewHashLinkListRepFactory() for logging on too many entries in a hash bucket when flushing.
+* Added new option BlockBasedTableOptions::hash_index_allow_collision. When enabled, prefix hash index for block-based table will not store prefix and allow hash collision, reducing memory consumption.
+
+### New Features
+* PlainTable now supports a new key encoding: for keys of the same prefix, the prefix is only written once. It can be enabled through encoding_type paramter of NewPlainTableFactory()
+* Add AdaptiveTableFactory, which is used to convert from a DB of PlainTable to BlockBasedTabe, or vise versa. It can be created using NewAdaptiveTableFactory()
+
+### Performance Improvements
+* Tailing Iterator re-implemeted with ForwardIterator + Cascading Search Hint , see ~20% throughput improvement.
+
+## 3.1.0 (05/21/2014)
+
+### Public API changes
+* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
+
+### New Features
+* Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open.
+* FIFO compaction style
+
 ## 3.0.0 (05/05/2014)
 
 ### Public API changes
 * Added _LEVEL to all InfoLogLevel enums
 * Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
+* MemTableRepFactory::CreateMemTableRep() takes info logger as an extra parameter.
 
 ### New Features
 * Column family support
diff --git a/src/rocksdb/INSTALL.md b/src/rocksdb/INSTALL.md
index 2a91be6..330f8bc 100644
--- a/src/rocksdb/INSTALL.md
+++ b/src/rocksdb/INSTALL.md
@@ -1,19 +1,36 @@
-## Dependencies
+## Compilation
+
+RocksDB's library should be able to compile without any dependency installed,
+although we recommend installing some compression libraries (see below).
+We do depend on newer gcc/clang with C++11 support.
+
+There are few options when compiling RocksDB:
+
+* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library.
+
+* `make shared_lib` will compile librocksdb.so, RocksDB shared library.
+
+* `make check` will compile and run all the unit tests
+
+* `make all` will compile our static library, and all our tools and unit tests. Our tools
+depend on gflags. You will need to have gflags installed to run `make all`.
 
-RocksDB is developed on Linux (CentOS release 5.2), with gcc 4.8.1.
-It depends on gcc with C++11 support.
+* By default the binary we produce is optimized for the platform you're compiling on
+(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before
+your make commands, like this: `PORTABLE=1 make static_lib`
 
-* RocksDB depends on the following libraries:
+## Dependencies
+
+* You can link RocksDB with following compression libraries:
   - [zlib](http://www.zlib.net/) - a library for data compression.
   - [bzip2](http://www.bzip.org/) - a library for data compression.
   - [snappy](https://code.google.com/p/snappy/) - a library for fast
       data compression.
-  - [gflags](https://code.google.com/p/gflags/) - a library that handles
-      command line flags processing.
 
-RocksDB will successfully compile without the compression libraries included,
-but some things may fail. We do not support releases without the compression
-libraries. You are on your own.
+* All our tools depend on:
+  - [gflags](https://code.google.com/p/gflags/) - a library that handles
+      command line flags processing. You can compile rocksdb library even
+      if you don't have gflags installed.
 
 ## Supported platforms
 
@@ -59,22 +76,7 @@ libraries. You are on your own.
         * Install via [homebrew](http://brew.sh/).
             * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
             * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
-    * Install zlib, bzip2 and snappy libraries for compression.
-    * Install gflags. We have included a script
-    `build_tools/mac-install-gflags.sh`, which should automatically install it.
-    If you installed gflags by other means (for example, `brew install gflags`),
-    please set `LIBRARY_PATH` and `CPATH` accordingly.
-    * Please note that some of the optimizations/features are disabled in OSX.
-    We did not run any production workloads on it.
+    * run `brew install rocksdb`
 
 * **iOS**:
-  * Run: `TARGET_OS=IOS make static_lib`
-
-## Compilation
-`make clean; make` will compile librocksdb.a (RocksDB static library) and all
-the unit tests. You can run all unit tests with `make check`.
-
-For shared library builds, exec `make shared_lib` instead.
-
-If you followed the above steps and your compile or unit tests fail,
-please submit an issue: (https://github.com/facebook/rocksdb/issues)
+  * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
diff --git a/src/rocksdb/Makefile.am b/src/rocksdb/Makefile.am
index d45927d..55c41ae 100644
--- a/src/rocksdb/Makefile.am
+++ b/src/rocksdb/Makefile.am
@@ -2,14 +2,14 @@ AUTOMAKE_OPTIONS = subdir-objects
 OPT ?= -DNDEBUG
 
 ACLOCAL_AMFLAGS = -I m4
-WARNING_FLAGS = -Wall -Werror 
+WARNING_FLAGS = -Wall -Werror
 ROCKSDB_CFLAGS = -g -O2 -fPIC
 ROCKSDB_CXXFLAGS = -std=c++11 -g -O2 -fPIC
 GFLAG = gflags
 noinst_LTLIBRARIES = librocksdb.la
 
 librocksdb_la_CFLAGS = $(WARNING_FLAGS) $(ROCKSDB_CFLAGS) -I$(srcdir) -I$(srcdir)/include -DOS_LINUX -DROCKSDB_PLATFORM_POSIX -fno-builtin-memcmp $(OPT) -DHAVE_JEMALLOC -Woverloaded-virtual
-librocksdb_la_CXXFLAGS = $(WARNING_FLAGS) $(ROCKSDB_CXXFLAGS) -I$(srcdir) -I$(srcdir)/include -DOS_LINUX -DROCKSDB_PLATFORM_POSIX -fno-builtin-memcmp $(OPT) -DHAVE_JEMALLOC -Woverloaded-virtual -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT 
+librocksdb_la_CXXFLAGS = $(WARNING_FLAGS) $(ROCKSDB_CXXFLAGS) -I$(srcdir) -I$(srcdir)/include -I$(srcdir)/third-party/gtest-1.7.0/fused-src -DOS_LINUX -DROCKSDB_PLATFORM_POSIX -fno-builtin-memcmp $(OPT) -DHAVE_JEMALLOC -Woverloaded-virtual -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT
 librocksdb_la_LDFLAGS = -shared -Wl,-soname -Wl
 if WITH_ATOMIC
    librocksdb_la_CFLAGS += -DROCKSDB_ATOMIC_PRESENT
@@ -21,7 +21,7 @@ if WITH_FALLOCATE
    librocksdb_la_CXXFLAGS += -DROCKSDB_FALLOCATE_PRESENT
 endif
 #
-librocksdb_la_LDFLAGS += -lpthread 
+librocksdb_la_LDFLAGS += -lpthread
 if WITH_TCMALLOC
    librocksdb_la_LDFLAGS += -ltcmalloc
 if WITH_RT
@@ -56,217 +56,327 @@ if WITH_GFLAGS_FLAGS
    librocksdb_la_LDFLAGS += -lgflags
 endif
 
-$(shell (./build_tools/build_detect_version))
+# Record the version of the source that we are compiling.
+# We keep a record of the git revision in this file.  It is then built
+# as a regular source file as part of the compilation process.
+# One can run "strings executable_filename | grep _build_" to find
+# the version of the source that we used to build the executable file.
+date := $(shell date +%F)
+git_sha := $(shell git describe HEAD 2>/dev/null)
+gen_build_version =                                                     \
+  printf '%s\n'                                                         \
+    '\#include "build_version.h"'                                       \
+    'const char* rocksdb_build_git_sha =                                \
+      "rocksdb_build_git_sha:$(git_sha)";'                              \
+    'const char* rocksdb_build_git_date =                               \
+      "rocksdb_build_git_date:$(date)";'                                \
+    'const char* rocksdb_build_compile_date = __DATE__;'
+$(shell $(gen_build_version) > util/build_version.cc)
 
 SOURCE_FILES = db/builder.cc \
-	db/c.cc \
-	db/column_family.cc \
-	db/compaction.cc \
-	db/compaction_picker.cc \
-	db/db_filesnapshot.cc \
-	db/dbformat.cc \
-	db/db_impl.cc \
-	db/db_impl_debug.cc \
-	db/db_impl_readonly.cc \
-	db/db_iter.cc \
-	db/db_stats_logger.cc \
-	db/file_indexer.cc \
-	db/filename.cc \
-	db/internal_stats.cc \
-	db/log_reader.cc \
-	db/log_writer.cc \
-	db/memtable.cc \
-	db/memtable_list.cc \
-	db/merge_helper.cc \
-	db/merge_operator.cc \
-	db/repair.cc \
-	db/table_cache.cc \
-	db/table_properties_collector.cc \
-	db/tailing_iter.cc \
-	db/transaction_log_impl.cc \
-	db/version_edit.cc \
-	db/version_set.cc \
-	db/write_batch.cc \
-	table/block_based_table_builder.cc \
-	table/block_based_table_factory.cc \
-	table/block_based_table_reader.cc \
-	table/block_builder.cc \
-	table/block.cc \
-	table/block_hash_index.cc \
-	table/filter_block.cc \
-	table/flush_block_policy.cc \
-	table/format.cc \
-	table/iterator.cc \
-	table/merger.cc \
-	table/meta_blocks.cc \
-	table/plain_table_builder.cc \
-	table/plain_table_factory.cc \
-	table/plain_table_reader.cc \
-	table/table_properties.cc \
-	table/two_level_iterator.cc \
-	util/arena.cc \
-	util/auto_roll_logger.cc \
-	util/blob_store.cc \
-	util/bloom.cc \
-	util/build_version.cc \
-	util/cache.cc \
-	util/coding.cc \
-	util/comparator.cc \
-	util/crc32c.cc \
-	util/dynamic_bloom.cc \
-	util/env.cc \
-	util/env_hdfs.cc \
-	util/env_posix.cc \
-	util/filter_policy.cc \
-	util/hash.cc \
-	util/hash_cuckoo_rep.cc \
-	util/hash_linklist_rep.cc \
-	util/hash_skiplist_rep.cc \
-	util/histogram.cc \
-	utilities/backupable/backupable_db.cc \
-	utilities/geodb/geodb_impl.cc \
-	utilities/merge_operators/put.cc \
-	utilities/merge_operators/string_append/stringappend2.cc \
-	utilities/merge_operators/string_append/stringappend.cc \
-	utilities/merge_operators/uint64add.cc \
-	utilities/redis/redis_lists.cc \
-	utilities/ttl/db_ttl_impl.cc \
-	util/ldb_cmd.cc \
-	util/ldb_tool.cc \
-	util/log_buffer.cc \
-	util/logging.cc \
-	util/murmurhash.cc \
-	util/options.cc \
-	util/perf_context.cc \
-	util/skiplistrep.cc \
-	util/slice.cc \
-	util/statistics.cc \
-	util/status.cc \
-	util/string_util.cc \
-	util/sync_point.cc \
-	util/thread_local.cc \
-	util/vectorrep.cc \
-	util/xxhash.cc \
-	port/port_posix.cc \
-	port/stack_trace.cc 
+        db/c.cc \
+        db/column_family.cc \
+        db/compaction.cc \
+        db/compaction_job.cc \
+        db/compaction_picker.cc \
+        db/db_filesnapshot.cc \
+        db/dbformat.cc \
+        db/db_impl.cc \
+        db/db_impl_debug.cc \
+        db/db_impl_readonly.cc \
+        db/db_iter.cc \
+	db/event_logger_helpers.cc \
+        db/file_indexer.cc \
+        db/filename.cc \
+        db/flush_job.cc \
+        db/flush_scheduler.cc \
+        db/forward_iterator.cc \
+        db/internal_stats.cc \
+        db/log_reader.cc \
+        db/log_writer.cc \
+        db/managed_iterator.cc \
+        db/memtable_allocator.cc \
+        db/memtable.cc \
+        db/memtable_list.cc \
+        db/merge_helper.cc \
+        db/merge_operator.cc \
+        db/repair.cc \
+        db/slice.cc \
+        db/table_cache.cc \
+        db/table_properties_collector.cc \
+        db/transaction_log_impl.cc \
+        db/version_builder.cc \
+        db/version_edit.cc \
+        db/version_set.cc \
+        db/wal_manager.cc \
+        db/write_batch.cc \
+        db/write_batch_base.cc \
+        db/write_controller.cc \
+        db/write_thread.cc \
+        port/stack_trace.cc \
+        port/port_posix.cc \
+        table/adaptive_table_factory.cc \
+        table/block_based_filter_block.cc \
+        table/block_based_table_builder.cc \
+        table/block_based_table_factory.cc \
+        table/block_based_table_reader.cc \
+        table/block_builder.cc \
+        table/block.cc \
+        table/block_hash_index.cc \
+        table/block_prefix_index.cc \
+        table/bloom_block.cc \
+        table/cuckoo_table_builder.cc \
+        table/cuckoo_table_factory.cc \
+        table/cuckoo_table_reader.cc \
+        table/flush_block_policy.cc \
+        table/format.cc \
+        table/full_filter_block.cc \
+        table/get_context.cc \
+        table/iterator.cc \
+        table/merger.cc \
+        table/meta_blocks.cc \
+        table/plain_table_builder.cc \
+        table/plain_table_factory.cc \
+        table/plain_table_index.cc \
+        table/plain_table_key_coding.cc \
+        table/plain_table_reader.cc \
+        table/table_properties.cc \
+        table/two_level_iterator.cc \
+        util/arena.cc \
+        util/auto_roll_logger.cc \
+        util/bloom.cc \
+        util/build_version.cc \
+        util/cache.cc \
+        util/coding.cc \
+        util/comparator.cc \
+        util/crc32c.cc \
+        util/db_info_dumper.cc \
+        util/dynamic_bloom.cc \
+        util/env.cc \
+        util/env_hdfs.cc \
+        util/env_posix.cc \
+        util/file_util.cc \
+        util/filter_policy.cc \
+        util/hash.cc \
+        util/hash_cuckoo_rep.cc \
+        util/hash_linklist_rep.cc \
+        util/hash_skiplist_rep.cc \
+        util/histogram.cc \
+        util/instrumented_mutex.cc \
+        util/iostats_context.cc \
+        utilities/backupable/backupable_db.cc \
+        utilities/convenience/convenience.cc \
+        utilities/checkpoint/checkpoint.cc \
+        utilities/compacted_db/compacted_db_impl.cc \
+        utilities/document/document_db.cc \
+        utilities/document/json_document_builder.cc \
+        utilities/document/json_document.cc \
+        utilities/geodb/geodb_impl.cc \
+        utilities/leveldb_options/leveldb_options.cc \
+        utilities/merge_operators/put.cc \
+        utilities/merge_operators/string_append/stringappend2.cc \
+        utilities/merge_operators/string_append/stringappend.cc \
+        utilities/merge_operators/uint64add.cc \
+        utilities/redis/redis_lists.cc \
+        utilities/spatialdb/spatial_db.cc \
+        utilities/ttl/db_ttl_impl.cc \
+        utilities/write_batch_with_index/write_batch_with_index.cc \
+        util/event_logger.cc \
+        util/ldb_cmd.cc \
+        util/ldb_tool.cc \
+        util/log_buffer.cc \
+        util/logging.cc \
+        util/memenv.cc \
+        util/murmurhash.cc \
+        util/mutable_cf_options.cc \
+        util/options_builder.cc \
+        util/options.cc \
+        util/options_helper.cc \
+        util/perf_context.cc \
+        util/rate_limiter.cc \
+        util/skiplistrep.cc \
+        util/slice.cc \
+        util/sst_dump_tool.cc \
+        util/statistics.cc \
+        util/status.cc \
+        util/string_util.cc \
+        util/sync_point.cc \
+        util/thread_local.cc \
+        util/thread_status_impl.cc \
+        util/thread_status_updater.cc \
+        util/thread_status_updater_debug.cc \
+        util/thread_status_util.cc \
+        util/thread_status_util_debug.cc \
+        util/vectorrep.cc \
+        util/xfunc.cc \
+        util/xxhash.cc
 
-
-SOURCE_H = util/ldb_cmd_execute_result.h \
-	util/xxhash.h \
-	util/mutexlock.h \
-	util/logging.h \
-	util/coding.h \
-	util/posix_logger.h \
-	util/crc32c.h \
-	util/ldb_cmd.h \
-	util/hash_linklist_rep.h \
-	util/log_buffer.h \
-	util/sync_point.h \
-	util/murmurhash.h \
-	util/autovector.h \
-	util/perf_context_imp.h \
-	util/testharness.h \
-	util/hash.h \
-	util/histogram.h \
-	util/stats_logger.h \
-	util/arena.h \
-	util/string_util.h \
-	util/stl_wrappers.h \
-	util/stop_watch.h \
-	util/statistics.h \
-	util/build_version.h \
-	util/benchharness.h \
-	util/auto_roll_logger.h \
-	util/dynamic_bloom.h \
-	util/random.h \
-	util/hash_skiplist_rep.h \
-	util/testutil.h \
-	util/thread_local.h \
-	util/blob_store.h \
-	util/hash_cuckoo_rep.h \
-	db/compaction_picker.h \
-	db/internal_stats.h \
-	db/builder.h \
-	db/memtable.h \
-	db/write_batch_internal.h \
-	db/file_indexer.h \
-	db/log_format.h \
-	db/merge_context.h \
-	db/log_writer.h \
-	db/snapshot.h \
-	db/table_properties_collector.h \
-	db/log_reader.h \
-	db/version_set.h \
-	db/merge_helper.h \
-	db/table_cache.h \
-	db/version_edit.h \
-	db/memtable_list.h \
-	db/db_impl_readonly.h \
-	db/tailing_iter.h \
-	db/filename.h \
-	db/db_iter.h \
-	db/compaction.h \
-	db/column_family.h \
-	db/skiplist.h \
-	db/transaction_log_impl.h \
-	db/dbformat.h \
-	db/db_impl.h \
-	table/block.h \
-	table/iter_heap.h \
-	table/block_based_table_reader.h \
-	table/iterator_wrapper.h \
-	table/block_builder.h \
-	table/table_builder.h \
-	table/plain_table_factory.h \
-	table/block_hash_index.h \
-	table/plain_table_reader.h \
-	table/table_reader.h \
-	table/meta_blocks.h \
-	table/format.h \
-	table/block_based_table_builder.h \
-	table/merger.h \
-	table/plain_table_builder.h \
-	table/two_level_iterator.h \
-	table/block_based_table_factory.h \
-	table/filter_block.h \
-	utilities/geodb/geodb_impl.h \
-	utilities/merge_operators.h \
-	utilities/merge_operators/string_append/stringappend2.h \
-	utilities/merge_operators/string_append/stringappend.h \
-	utilities/ttl/db_ttl_impl.h \
-	utilities/redis/redis_list_iterator.h \
-	utilities/redis/redis_lists.h \
-	utilities/redis/redis_list_exception.h \
+SOURCE_H = util/allocator.h \
+        util/arena.h \
+        util/auto_roll_logger.h \
+        util/autovector.h \
+        util/build_version.h \
+        util/coding.h \
+        util/compression.h \
+        util/crc32c.h \
+        util/db_info_dumper.h \
+        util/dynamic_bloom.h \
+        util/event_logger.h \
+        util/file_util.h \
+        util/hash.h \
+        util/hash_cuckoo_rep.h \
+        util/hash_linklist_rep.h \
+        util/hash_skiplist_rep.h \
+        util/histogram.h \
+        util/instrumented_mutex.h \
+        util/iostats_context_imp.h \
+        util/ldb_cmd.h \
+        util/ldb_cmd_execute_result.h \
+        util/log_buffer.h \
+        util/logging.h \
+        util/mock_env.h \
+        util/murmurhash.h \
+        util/mutable_cf_options.h \
+        util/mutexlock.h \
+        util/options_helper.h \
+        util/perf_context_imp.h \
+        util/posix_logger.h \
+        util/random.h \
+        util/rate_limiter.h \
+        util/scoped_arena_iterator.h \
+        util/sst_dump_tool_imp.h \
+        util/statistics.h \
+        util/stl_wrappers.h \
+        util/stop_watch.h \
+        util/string_util.h \
+        util/sync_point.h \
+        util/testharness.h \
+        util/testutil.h \
+        util/thread_local.h \
+        util/thread_operation.h \
+        util/thread_status_updater.h \
+        util/thread_status_util.h \
+        util/xfunc.h \
+        util/xxhash.h \
+        db/builder.h \
+        db/column_family.h \
+        db/compaction.h \
+        db/compaction_job.h \
+        db/compaction_picker.h \
+        db/db_impl.h \
+        db/db_impl_readonly.h \
+        db/db_iter.h \
+        db/dbformat.h \
+        db/file_indexer.h \
+        db/filename.h \
+        db/flush_job.h \
+        db/flush_scheduler.h \
+        db/forward_iterator.h \
+        db/internal_stats.h \
+        db/job_context.h \
+        db/log_format.h \
+        db/log_reader.h \
+        db/log_writer.h \
+        db/managed_iterator.h \
+        db/memtable.h \
+        db/memtable_allocator.h \
+        db/memtable_list.h \
+        db/merge_context.h \
+        db/merge_helper.h \
+        db/skiplist.h \
+        db/snapshot.h \
+        db/table_cache.h \
+        db/table_properties_collector.h \
+        db/transaction_log_impl.h \
+        db/version_builder.h \
+        db/version_edit.h \
+        db/version_set.h \
+        db/wal_manager.h \
+        db/write_batch_internal.h \
+        db/write_controller.h \
+        db/write_thread.h \
+        db/writebuffer.h \
+        table/adaptive_table_factory.h \
+        table/block.h \
+        table/block_based_filter_block.h \
+        table/block_based_table_builder.h \
+        table/block_based_table_factory.h \
+        table/block_based_table_reader.h \
+        table/block_builder.h \
+        table/block_hash_index.h \
+        table/block_prefix_index.h \
+        table/bloom_block.h \
+        table/cuckoo_table_builder.h \
+        table/cuckoo_table_factory.h \
+        table/cuckoo_table_reader.h \
+        table/filter_block.h \
+        table/format.h \
+        table/full_filter_block.h \
+        table/get_context.h \
+        table/iter_heap.h \
+        table/iterator_wrapper.h \
+        table/merger.h \
+        table/meta_blocks.h \
+        table/mock_table.h \
+        table/plain_table_builder.h \
+        table/plain_table_factory.h \
+        table/plain_table_index.h \
+        table/plain_table_key_coding.h \
+        table/plain_table_reader.h \
+        table/table_builder.h \
+        table/table_properties_internal.h \
+        table/table_reader.h \
+        table/two_level_iterator.h \
 	include/utilities/geo_db.h \
 	include/utilities/stackable_db.h \
 	include/utilities/db_ttl.h \
 	include/utilities/utility_db.h \
 	include/utilities/backupable_db.h \
-	include/rocksdb/universal_compaction.h \
-	include/rocksdb/options.h \
-	include/rocksdb/comparator.h \
-	include/rocksdb/flush_block_policy.h \
-	include/rocksdb/ldb_tool.h \
-	include/rocksdb/perf_context.h \
-	include/rocksdb/slice_transform.h \
-	include/rocksdb/filter_policy.h \
-	include/rocksdb/types.h \
-	include/rocksdb/write_batch.h \
-	include/rocksdb/statistics.h \
-	include/rocksdb/slice.h \
-	include/rocksdb/merge_operator.h \
-	include/rocksdb/iterator.h \
-	include/rocksdb/env.h \
-	include/rocksdb/compaction_filter.h \
-	include/rocksdb/table_properties.h \
-	include/rocksdb/db.h \
-	include/rocksdb/table.h \
-	include/rocksdb/status.h \
-	include/rocksdb/memtablerep.h \
-	include/rocksdb/version.h \
-	include/rocksdb/c.h \
-	include/rocksdb/transaction_log.h \
-	include/rocksdb/cache.h 
+        include/rocksdb/c.h \
+        include/rocksdb/cache.h \
+        include/rocksdb/compaction_filter.h \
+        include/rocksdb/comparator.h \
+        include/rocksdb/db.h \
+        include/rocksdb/env.h \
+        include/rocksdb/filter_policy.h \
+        include/rocksdb/flush_block_policy.h \
+        include/rocksdb/immutable_options.h \
+        include/rocksdb/iostats_context.h \
+        include/rocksdb/iterator.h \
+        include/rocksdb/ldb_tool.h \
+        include/rocksdb/listener.h \
+        include/rocksdb/memtablerep.h \
+        include/rocksdb/merge_operator.h \
+        include/rocksdb/metadata.h \
+        include/rocksdb/options.h \
+        include/rocksdb/perf_context.h \
+        include/rocksdb/rate_limiter.h \
+        include/rocksdb/slice.h \
+        include/rocksdb/slice_transform.h \
+        include/rocksdb/sst_dump_tool.h \
+        include/rocksdb/statistics.h \
+        include/rocksdb/status.h \
+        include/rocksdb/table.h \
+        include/rocksdb/table_properties.h \
+        include/rocksdb/thread_status.h \
+        include/rocksdb/transaction_log.h \
+        include/rocksdb/types.h \
+        include/rocksdb/universal_compaction.h \
+        include/rocksdb/version.h \
+        include/rocksdb/write_batch.h \
+        include/rocksdb/write_batch_base.h \
+        include/rocksdb/utilities/backupable_db.h \
+        include/rocksdb/utilities/checkpoint.h \
+        include/rocksdb/utilities/convenience.h \
+        include/rocksdb/utilities/db_ttl.h \
+        include/rocksdb/utilities/document_db.h \
+        include/rocksdb/utilities/geo_db.h \
+        include/rocksdb/utilities/json_document.h \
+        include/rocksdb/utilities/leveldb_options.h \
+        include/rocksdb/utilities/spatial_db.h \
+        include/rocksdb/utilities/stackable_db.h \
+        include/rocksdb/utilities/utility_db.h \
+        include/rocksdb/utilities/write_batch_with_index.h
 
 noinst_HEADERS = $(SOURCE_H)
 
diff --git a/src/rocksdb/PATENTS b/src/rocksdb/PATENTS
index 8a6fca4..65332e3 100644
--- a/src/rocksdb/PATENTS
+++ b/src/rocksdb/PATENTS
@@ -1,23 +1,33 @@
-Additional Grant of Patent Rights
+Additional Grant of Patent Rights Version 2
 
-“Software” means the rocksdb software distributed by Facebook, Inc.
+"Software" means the RocksDB software distributed by Facebook, Inc.
 
-Facebook hereby grants you a perpetual, worldwide, royalty-free,
-non-exclusive, irrevocable (subject to the termination provision below)
-license under any rights in any patent claims owned by Facebook, to make,
-have made, use, sell, offer to sell, import, and otherwise transfer the
-Software. For avoidance of doubt, no license is granted under Facebook’s
-rights in any patent claims that are infringed by (i) modifications to the
-Software made by you or a third party, or (ii) the Software in combination
-with any software or other technology provided by you or a third party.
+Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software
+("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable
+(subject to the termination provision below) license under any Necessary
+Claims, to make, have made, use, sell, offer to sell, import, and otherwise
+transfer the Software. For avoidance of doubt, no license is granted under
+Facebook’s rights in any patent claims that are infringed by (i) modifications
+to the Software made by you or any third party or (ii) the Software in
+combination with any software or other technology.
 
-The license granted hereunder will terminate, automatically and without
-notice, for anyone that makes any claim (including by filing any lawsuit,
-assertion or other action) alleging (a) direct, indirect, or contributory
-infringement or inducement to infringe any patent: (i) by Facebook or any
-of its subsidiaries or affiliates, whether or not such claim is related
-to the Software, (ii) by any party if such claim arises in whole or in
-part from any software, product or service of Facebook or any of its
-subsidiaries or affiliates, whether or not such claim is related to the
-Software, or (iii) by any party relating to the Software; or (b) that
-any right in any patent claim of Facebook is invalid or unenforceable.
+The license granted hereunder will terminate, automatically and without notice,
+if you (or any of your subsidiaries, corporate affiliates or agents) initiate
+directly or indirectly, or take a direct financial interest in, any Patent
+Assertion: (i) against Facebook or any of its subsidiaries or corporate
+affiliates, (ii) against any party if such Patent Assertion arises in whole or
+in part from any software, technology, product or service of Facebook or any of
+its subsidiaries or corporate affiliates, or (iii) against any party relating
+to the Software. Notwithstanding the foregoing, if Facebook or any of its
+subsidiaries or corporate affiliates files a lawsuit alleging patent
+infringement against you in the first instance, and you respond by filing a
+patent infringement counterclaim in that lawsuit against that party that is
+unrelated to the Software, the license granted hereunder will not terminate
+under section (i) of this paragraph due to such counterclaim.
+
+A "Necessary Claim" is a claim of a patent owned by Facebook that is
+necessarily infringed by the Software standing alone.
+
+A "Patent Assertion" is any lawsuit or other action alleging direct, indirect,
+or contributory infringement or inducement to infringe any patent, including a
+cross-claim or counterclaim.
diff --git a/src/rocksdb/README b/src/rocksdb/README
deleted file mode 100644
index 473e414..0000000
--- a/src/rocksdb/README
+++ /dev/null
@@ -1,82 +0,0 @@
-rocksdb: A persistent key-value store for flash storage
-Authors: * The Facebook Database Engineering Team
-         * Build on earlier work on leveldb by Sanjay Ghemawat
-           (sanjay at google.com) and Jeff Dean (jeff at google.com)
-
-This code is a library that forms the core building block for a fast
-key value server, especially suited for storing data on flash drives.
-It has an Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
-between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF)
-and Space-Amplification-Factor(SAF). It has multi-threaded compactions,
-making it specially suitable for storing multiple terabytes of data in a
-single database.
-
-The core of this code has been derived from open-source leveldb.
-
-The code under this directory implements a system for maintaining a
-persistent key/value store.
-
-See doc/index.html and github wiki (https://github.com/facebook/rocksdb/wiki)
-for more explanation.
-
-The public interface is in include/*.  Callers should not include or
-rely on the details of any other header files in this package.  Those
-internal APIs may be changed without warning.
-
-Guide to header files:
-
-include/rocksdb/db.h
-    Main interface to the DB: Start here
-
-include/rocksdb/options.h
-    Control over the behavior of an entire database, and also
-    control over the behavior of individual reads and writes.
-
-include/rocksdb/comparator.h
-    Abstraction for user-specified comparison function.  If you want
-    just bytewise comparison of keys, you can use the default comparator,
-    but clients can write their own comparator implementations if they
-    want custom ordering (e.g. to handle different character
-    encodings, etc.)
-
-include/rocksdb/iterator.h
-    Interface for iterating over data. You can get an iterator
-    from a DB object.
-
-include/rocksdb/write_batch.h
-    Interface for atomically applying multiple updates to a database.
-
-include/rocksdb/slice.h
-    A simple module for maintaining a pointer and a length into some
-    other byte array.
-
-include/rocksdb/status.h
-    Status is returned from many of the public interfaces and is used
-    to report success and various kinds of errors.
-
-include/rocksdb/env.h
-    Abstraction of the OS environment.  A posix implementation of
-    this interface is in util/env_posix.cc
-
-include/rocksdb/table_builder.h
-    Lower-level modules that most clients probably won't use directly
-
-include/rocksdb/cache.h
-    An API for the block cache.
-
-include/rocksdb/compaction_filter.h
-    An API for a application filter invoked on every compaction.
-
-include/rocksdb/filter_policy.h
-    An API for configuring a bloom filter.
-
-include/rocksdb/memtablerep.h
-    An API for implementing a memtable.
-
-include/rocksdb/statistics.h
-    An API to retrieve various database statistics.
-
-include/rocksdb/transaction_log.h
-    An API to retrieve transaction logs from a database.
-
-Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/
diff --git a/src/rocksdb/README.md b/src/rocksdb/README.md
new file mode 100644
index 0000000..916bdec
--- /dev/null
+++ b/src/rocksdb/README.md
@@ -0,0 +1,25 @@
+## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
+
+[![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb)
+
+RocksDB is developed and maintained by Facebook Database Engineering Team.
+It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay at google.com)
+and Jeff Dean (jeff at google.com)
+
+This code is a library that forms the core building block for a fast
+key value server, especially suited for storing data on flash drives.
+It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
+between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
+and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
+making it specially suitable for storing multiple terabytes of data in a
+single database.
+
+Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples
+
+See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
+
+The public interface is in `include/`.  Callers should not include or
+rely on the details of any other header files in this package.  Those
+internal APIs may be changed without warning.
+
+Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/
diff --git a/src/rocksdb/USERS.md b/src/rocksdb/USERS.md
new file mode 100644
index 0000000..394aa30
--- /dev/null
+++ b/src/rocksdb/USERS.md
@@ -0,0 +1,36 @@
+This document lists users of RocksDB and their use cases. If you are using RocksDB, please open a pull request and add yourself to the list.
+
+## Facebook
+At Facebook, we use RocksDB as a backend for many different stateful services. We're also experimenting with running RocksDB as a storage engine for two databases:
+
+1. MyRocks -- https://github.com/MySQLOnRocksDB/mysql-5.6
+2. MongoRocks -- https://github.com/mongodb-partners/mongo-rocks
+
+## LinkedIn
+Two different use cases at Linkedin are using RocksDB as a storage engine:
+
+1. LinkedIn's follow feed for storing user's activities
+2. Apache Samza, open source framework for stream processing
+
+Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasundaram: http://www.youtube.com/watch?v=plqVp_OnSzg
+
+## Yahoo
+Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa.
+
+## CockroachDB
+CockroachDB is an open-source geo-replicated transactional database (still in development). They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach
+
+## DNANexus
+DNANexus is using RocksDB to speed up processing of genomics data.
+You can learn more from this great blog post by Mike Lin: http://devblog.dnanexus.com/faster-bam-sorting-with-samtools-and-rocksdb/
+
+## Iron.io
+Iron.io is using RocksDB as a storage engine for their distributed queueing system.
+Learn more from Tech Talk by Reed Allman: http://www.youtube.com/watch?v=HTjt6oj-RL4
+
+## Tango Me
+Tango is using RocksDB as a graph storage to store all users' connection data and other social activity data.
+
+## Turn
+Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters.
+Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf
diff --git a/src/rocksdb/build_tools/build_detect_platform b/src/rocksdb/build_tools/build_detect_platform
deleted file mode 100755
index 99a212b..0000000
--- a/src/rocksdb/build_tools/build_detect_platform
+++ /dev/null
@@ -1,313 +0,0 @@
-#!/bin/sh
-#
-# Detects OS we're compiling on and outputs a file specified by the first
-# argument, which in turn gets read while processing Makefile.
-#
-# The output will set the following variables:
-#   CC                          C Compiler path
-#   CXX                         C++ Compiler path
-#   PLATFORM_LDFLAGS            Linker flags
-#   PLATFORM_SHARED_EXT         Extension for shared libraries
-#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
-#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
-#   PLATFORM_CCFLAGS            C compiler flags
-#   PLATFORM_CXXFLAGS           C++ compiler flags.  Will contain:
-#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
-#                               shared libraries, empty otherwise.
-#
-# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
-#
-#       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
-#       -DLEVELDB_PLATFORM_NOATOMIC if it is not
-#       -DSNAPPY                    if the Snappy library is present
-#       -DLZ4                       if the LZ4 library is present
-#
-# Using gflags in rocksdb:
-# Our project depends on gflags, which requires users to take some extra steps
-# before they can compile the whole repository:
-#   1. Install gflags. You may download it from here:
-#      https://code.google.com/p/gflags/
-#   2. Once install, add the include path/lib path for gflags to CPATH and
-#      LIBRARY_PATH respectively. If installed with default mode, the
-#      lib and include path will be /usr/local/lib and /usr/local/include
-# Mac user can do this by running build_tools/mac-install-gflags.sh
-
-OUTPUT=$1
-if test -z "$OUTPUT"; then
-  echo "usage: $0 <output-filename>" >&2
-  exit 1
-fi
-
-# we depend on C++11
-PLATFORM_CXXFLAGS="-std=c++11"
-# we currently depend on POSIX platform
-COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
-
-# Default to fbcode gcc on internal fb machines
-if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
-    FBCODE_BUILD="true"
-    if [ -z "$USE_CLANG" ]; then
-        CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
-          $(rpm -q --whatprovides redhat-release)`
-        if [ "$CENTOS_VERSION" = "6" ]; then
-          source $PWD/build_tools/fbcode.gcc481.sh
-        else
-          source $PWD/build_tools/fbcode.gcc471.sh
-        fi
-    else
-        source $PWD/build_tools/fbcode.clang31.sh
-    fi
-fi
-
-# Delete existing output, if it exists
-rm -f $OUTPUT
-touch $OUTPUT
-
-if test -z "$CC"; then
-   CC=cc
-fi
-
-if test -z "$CXX"; then
-    CXX=g++
-fi
-
-# Detect OS
-if test -z "$TARGET_OS"; then
-    TARGET_OS=`uname -s`
-fi
-
-COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
-CROSS_COMPILE=
-PLATFORM_CCFLAGS=
-PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
-PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
-PLATFORM_SHARED_EXT="so"
-PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
-PLATFORM_SHARED_CFLAGS="-fPIC"
-PLATFORM_SHARED_VERSIONED=false
-
-# generic port files (working on all platform by #ifdef) go directly in /port
-GENERIC_PORT_FILES=`cd $ROCKSDB_ROOT; find port -name '*.cc' | tr "\n" " "`
-
-# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
-case "$TARGET_OS" in
-    Darwin)
-        PLATFORM=OS_MACOSX
-        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
-        PLATFORM_SHARED_EXT=dylib
-        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
-        # PORT_FILES=port/darwin/darwin_specific.cc
-        ;;
-    IOS)
-        PLATFORM=IOS
-        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
-        PLATFORM_SHARED_EXT=dylib
-        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
-        CROSS_COMPILE=true
-        ;;
-    Linux)
-        PLATFORM=OS_LINUX
-        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
-        if [ -z "$USE_CLANG" ]; then
-            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
-        fi
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
-        # PORT_FILES=port/linux/linux_specific.cc
-        ;;
-    SunOS)
-        PLATFORM=OS_SOLARIS
-        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
-        # PORT_FILES=port/sunos/sunos_specific.cc
-        ;;
-    FreeBSD)
-        PLATFORM=OS_FREEBSD
-        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
-        # PORT_FILES=port/freebsd/freebsd_specific.cc
-        ;;
-    NetBSD)
-        PLATFORM=OS_NETBSD
-        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
-        # PORT_FILES=port/netbsd/netbsd_specific.cc
-        ;;
-    OpenBSD)
-        PLATFORM=OS_OPENBSD
-        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
-        # PORT_FILES=port/openbsd/openbsd_specific.cc
-        ;;
-    DragonFly)
-        PLATFORM=OS_DRAGONFLYBSD
-        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
-        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
-        ;;
-    OS_ANDROID_CROSSCOMPILE)
-        PLATFORM=OS_ANDROID
-	COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
-	PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
-        # PORT_FILES=port/android/android.cc
-        CROSS_COMPILE=true
-        ;;
-    *)
-        echo "Unknown platform!" >&2
-        exit 1
-esac
-
-if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then
-  $PWD/build_tools/build_detect_version
-fi
-
-# We want to make a list of all cc files within util, db, table, and helpers
-# except for the test and benchmark files. By default, find will output a list
-# of all files matching either rule, so we need to append -print to make the
-# prune take effect.
-DIRS="util db table utilities"
-
-set -f # temporarily disable globbing so that our patterns arent expanded
-PRUNE_TEST="-name *test*.cc -prune"
-PRUNE_BENCH="-name *bench*.cc -prune"
-PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
-PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
-set +f # re-enable globbing
-
-# The sources consist of the portable files, plus the platform-specific port
-# file.
-echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT
-echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT
-echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
-
-if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
-    # Cross-compiling; do not try any compilation tests.
-    # Also don't need any compilation tests if compiling on fbcode
-    true
-else
-    # do fPIC on 64 bit in non-fbcode environment
-    case "$TARGET_OS" in
-        x86_64)
-            PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -fPIC"
-    esac
-
-    # If -std=c++0x works, use <atomic>.  Otherwise use port_posix.h.
-    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <atomic>
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
-    fi
-
-    # Test whether fallocate is available
-    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <fcntl.h>
-      int main() {
-	int fd = open("/dev/null", 0);
-	fallocate(fd, 0, 0, 1024);
-      }
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
-    fi
-
-    # Test whether Snappy library is installed
-    # http://code.google.com/p/snappy/
-    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <snappy.h>
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
-    fi
-
-
-    # Test whether gflags library is installed
-    # http://code.google.com/p/gflags/
-    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <gflags/gflags.h>
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
-    fi
-
-    # Test whether zlib library is installed
-    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <zlib.h>
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
-    fi
-
-    # Test whether bzip library is installed
-    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <bzlib.h>
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
-    fi
-
-    # Test whether lz4 library is installed
-    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <lz4.h>
-      #include <lz4hc.h>
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
-    fi
-
-    # Test whether tcmalloc is available
-    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null  <<EOF
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
-    fi
-fi
-
-# shall we use HDFS?
-
-if test "$USE_HDFS"; then
-  if test -z "$JAVA_HOME"; then
-    echo "JAVA_HOME has to be set for HDFS usage."
-    exit 1
-  fi
-  HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
-  HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64"
-  HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
-  HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
-  COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
-  PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
-fi
-
-# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
-COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
-
-PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
-PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
-
-VALGRIND_VER="$VALGRIND_VER"
-
-echo "CC=$CC" >> $OUTPUT
-echo "CXX=$CXX" >> $OUTPUT
-echo "PLATFORM=$PLATFORM" >> $OUTPUT
-echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
-echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT
-echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
-echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
-echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
-echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
-echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
-echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
-echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT
-echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> $OUTPUT
-echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> $OUTPUT
diff --git a/src/rocksdb/build_tools/build_detect_version b/src/rocksdb/build_tools/build_detect_version
deleted file mode 100755
index f7d711f..0000000
--- a/src/rocksdb/build_tools/build_detect_version
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-#
-# Record the version of the source that we are compiling.
-# We keep a record of the git revision in util/version.cc. This source file
-# is then built as a regular source file as part of the compilation process.
-# One can run "strings executable_filename | grep _build_" to find the version of
-# the source that we used to build the executable file.
-
-OUTFILE="$PWD/util/build_version.cc"
-
-GIT_SHA=""
-if command -v git >/dev/null 2>&1; then
-    GIT_SHA=$(git rev-parse HEAD 2>/dev/null)
-fi
-
-cat > "${OUTFILE}" <<EOF
-#include "build_version.h"
-const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:${GIT_SHA}";
-const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:$(date)";
-const char* rocksdb_build_compile_date = __DATE__;
-const char* rocksdb_build_compile_time = __TIME__;
-EOF
diff --git a/src/rocksdb/build_tools/fbcode.clang31.sh b/src/rocksdb/build_tools/fbcode.clang31.sh
deleted file mode 100644
index 25a2ca7..0000000
--- a/src/rocksdb/build_tools/fbcode.clang31.sh
+++ /dev/null
@@ -1,74 +0,0 @@
-#!/bin/sh
-#
-# Set environment variables so that we can compile leveldb using
-# fbcode settings.  It uses the latest g++ compiler and also
-# uses jemalloc
-
-TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
-TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
-TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
-TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
-GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
-
-# location of libgcc
-LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
-LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
-
-# location of glibc
-GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
-GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
-
-# location of snappy headers and libraries
-SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
-SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
-
-# location of zlib headers and libraries
-ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
-ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
-
-# location of bzip headers and libraries
-BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
-BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
-
-# use Intel SSE support for checksum calculations
-export USE_SSE=" -msse -msse4.2 "
-
-CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
-CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
-AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
-RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
-
-CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
-CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
-CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
-CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
-CXXFLAGS="$CFLAGS -nostdinc++"
-
-CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
-
-EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
-EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
-EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
-EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
-EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
-
-PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
-
-EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
-
-export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED 
diff --git a/src/rocksdb/build_tools/fbcode.gcc471.sh b/src/rocksdb/build_tools/fbcode.gcc471.sh
deleted file mode 100644
index 9294057..0000000
--- a/src/rocksdb/build_tools/fbcode.gcc471.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/sh
-#
-# Set environment variables so that we can compile leveldb using
-# fbcode settings.  It uses the latest g++ compiler and also
-# uses jemalloc
-
-TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
-TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
-TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
-TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
-
-# location of libhdfs libraries
-if test "$USE_HDFS"; then
-  JAVA_HOME="/usr/local/jdk-6u22-64"
-  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
-  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
-  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
-  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
-  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
-fi
-
-# location of libgcc
-LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
-LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
-
-# location of glibc
-GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
-GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
-
-# location of snappy headers and libraries
-SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
-SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
-
-# location of zlib headers and libraries
-ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
-ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
-
-# location of bzip headers and libraries
-BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
-BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
-
-# use Intel SSE support for checksum calculations
-export USE_SSE=" -msse -msse4.2 "
-
-CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
-CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
-AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
-RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
-
-CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
-CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
-CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
-CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2"
-
-EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
-EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
-EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
-
-PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
-
-EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
-
-VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
-
-export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER
diff --git a/src/rocksdb/build_tools/fbcode.gcc481.sh b/src/rocksdb/build_tools/fbcode.gcc481.sh
deleted file mode 100644
index d02596f..0000000
--- a/src/rocksdb/build_tools/fbcode.gcc481.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/bin/sh
-#
-# Set environment variables so that we can compile rocksdb using
-# fbcode settings.  It uses the latest g++ compiler and also
-# uses jemalloc
-
-TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
-CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
-if [ "$CENTOS_VERSION" = "6" ]; then
-  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
-else
-  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
-fi
-TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
-
-# location of libhdfs libraries
-if test "$USE_HDFS"; then
-  JAVA_HOME="/usr/local/jdk-6u22-64"
-  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
-  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
-  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
-  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
-  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
-fi
-
-# location of libgcc
-LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
-LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
-
-# location of glibc
-GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
-GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
-
-# location of snappy headers and libraries
-SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
-SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
-
-# location of zlib headers and libraries
-ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
-ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
-
-# location of bzip headers and libraries
-BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
-BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
-
-LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
-LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
-LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
-
-# location of jemalloc
-JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
-JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
-
-# use Intel SSE support for checksum calculations
-export USE_SSE=" -msse -msse4.2 "
-
-CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
-CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
-AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
-RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
-
-CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
-CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
-CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2 -DLZ4"
-
-EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
-EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
-EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
-
-PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
-
-EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
-
-VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
-
-export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
diff --git a/src/rocksdb/build_tools/format-diff.sh b/src/rocksdb/build_tools/format-diff.sh
deleted file mode 100755
index 2d60620..0000000
--- a/src/rocksdb/build_tools/format-diff.sh
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/bin/bash
-# If clang_format_diff.py command is not specfied, we assume we are able to
-# access directly without any path.
-if [ -z $CLANG_FORMAT_DIFF ]
-then
-CLANG_FORMAT_DIFF="clang-format-diff.py"
-fi
-
-# Check clang-format-diff.py
-if ! which $CLANG_FORMAT_DIFF &> /dev/null
-then
-  echo "You didn't have clang-format-diff.py available in your computer!"
-  echo "You can download it by running: "
-  echo "    curl http://goo.gl/iUW1u2"
-  exit 128
-fi
-
-# Check argparse, a library that clang-format-diff.py requires.
-python 2>/dev/null << EOF
-import argparse
-EOF
-
-if [ "$?" != 0 ]
-then
-  echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
-  echo "installed. You can try either of the follow ways to install it:"
-  echo "  1. Manually download argparse: https://pypi.python.org/pypi/argparse"
-  echo "  2. easy_install argparse (if you have easy_install)"
-  echo "  3. pip install argparse (if you have pip)"
-  exit 129
-fi
-
-# TODO(kailiu) following work is not complete since we still need to figure
-# out how to add the modified files done pre-commit hook to git's commit index.
-#
-# Check if this script has already been added to pre-commit hook.
-# Will suggest user to add this script to pre-commit hook if their pre-commit
-# is empty.
-# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
-# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
-# then
-#   echo "Would you like to add this script to pre-commit hook, which will do "
-#   echo -n "the format check for all the affected lines before you check in (y/n):"
-#   read add_to_hook
-#   if [ "$add_to_hook" == "y" ]
-#   then
-#     ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
-#   fi
-# fi
-set -e
-
-uncommitted_code=`git diff HEAD`
-
-# If there's no uncommitted changes, we assume user are doing post-commit
-# format check, in which case we'll check the modified lines from latest commit.
-# Otherwise, we'll check format of the uncommitted code only.
-if [ -z "$uncommitted_code" ]
-then
-  # Check the format of last commit
-  diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1)
-else
-  # Check the format of uncommitted lines,
-  diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
-fi
-
-if [ -z "$diffs" ]
-then
-  echo "Nothing needs to be reformatted!"
-  exit 0
-fi
-
-# Highlight the insertion/deletion from the clang-format-diff.py's output
-COLOR_END="\033[0m"
-COLOR_RED="\033[0;31m" 
-COLOR_GREEN="\033[0;32m" 
-
-echo -e "Detect lines that doesn't follow the format rules:\r"
-# Add the color to the diff. lines added will be green; lines removed will be red.
-echo "$diffs" | 
-  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
-  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
-echo -e "Would you like to fix the format automatically (y/n): \c"
-
-# Make sure under any mode, we can read user input.
-exec < /dev/tty
-read to_fix
-
-if [ "$to_fix" != "y" ]
-then
-  exit 1
-fi
-
-# Do in-place format adjustment.
-git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
-echo "Files reformatted!"
-
-# Amend to last commit if user do the post-commit format check
-if [ -z "$uncommitted_code" ]; then
-  echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
-  read to_amend
-
-  if [ "$to_amend" == "y" ]
-  then
-    git commit -a --amend --reuse-message HEAD
-    echo "Amended to last commit"
-  fi
-fi
diff --git a/src/rocksdb/build_tools/mac-install-gflags.sh b/src/rocksdb/build_tools/mac-install-gflags.sh
deleted file mode 100755
index ef0339c..0000000
--- a/src/rocksdb/build_tools/mac-install-gflags.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/sh
-# Install gflags for mac developers.
-
-set -e
-
-DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
-
-cd $DIR
-wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
-tar xvfz gflags-2.0.tar.gz
-cd gflags-2.0
-
-./configure
-make
-make install
-
-# Add include/lib path for g++
-echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
-echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
-
-echo ""
-echo "-----------------------------------------------------------------------------"
-echo "|                         Installation Completed                            |"
-echo "-----------------------------------------------------------------------------"
-echo "Please run `. ~/bash_profile` to be able to compile with gflags"
diff --git a/src/rocksdb/build_tools/make_new_version.sh b/src/rocksdb/build_tools/make_new_version.sh
deleted file mode 100755
index a8d524f..0000000
--- a/src/rocksdb/build_tools/make_new_version.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-#  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-#  This source code is licensed under the BSD-style license found in the
-#  LICENSE file in the root directory of this source tree. An additional grant
-#  of patent rights can be found in the PATENTS file in the same directory.
-
-set -e
-if [ -z "$GIT" ]
-then
-  GIT="git"
-fi
-
-# Print out the colored progress info so that it can be brainlessly 
-# distinguished by users.
-function title() {
-  echo -e "\033[1;32m$*\033[0m"
-}
-
-usage="Create new RocksDB version and prepare it for the release process\n"
-usage+="USAGE: ./make_new_version.sh <version>"
-
-# -- Pre-check
-if [[ $# < 1 ]]; then
-  echo -e $usage
-  exit 1
-fi
-
-ROCKSDB_VERSION=$1
-
-GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
-echo $GIT_BRANCH
-
-if [ $GIT_BRANCH != "master" ]; then
-  echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch."
-  exit 1
-fi
-
-title "Adding new tag for this release ..."
-BRANCH="$ROCKSDB_VERSION.fb"
-$GIT co -b $BRANCH
-
-# Setting up the proxy for remote repo access
-title "Pushing new branch to remote repo ..."
-git push origin --set-upstream $BRANCH
-
-title "Branch $BRANCH is pushed to github;"
diff --git a/src/rocksdb/build_tools/regression_build_test.sh b/src/rocksdb/build_tools/regression_build_test.sh
deleted file mode 100755
index 58766f5..0000000
--- a/src/rocksdb/build_tools/regression_build_test.sh
+++ /dev/null
@@ -1,330 +0,0 @@
-#!/bin/bash
-
-set -e
-
-NUM=10000000
-
-if [ $# -eq 1 ];then
-  DATA_DIR=$1
-elif [ $# -eq 2 ];then
-  DATA_DIR=$1
-  STAT_FILE=$2
-fi
-
-# On the production build servers, set data and stat
-# files/directories not in /tmp or else the tempdir cleaning
-# scripts will make you very unhappy.
-DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
-STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
-
-function cleanup {
-  rm -rf $DATA_DIR
-  rm -f $STAT_FILE.fillseq
-  rm -f $STAT_FILE.readrandom
-  rm -f $STAT_FILE.overwrite
-  rm -f $STAT_FILE.memtablefillreadrandom
-}
-
-trap cleanup EXIT
-
-if [ -z $GIT_BRANCH ]; then
-  git_br=`git rev-parse --abbrev-ref HEAD`
-else
-  git_br=$(basename $GIT_BRANCH)
-fi
-
-if [ $git_br == "master" ]; then
-  git_br=""
-else
-  git_br="."$git_br
-fi
-
-make release
-
-# measure fillseq + fill up the DB for overwrite benchmark
-./db_bench \
-    --benchmarks=fillseq \
-    --db=$DATA_DIR \
-    --use_existing_db=0 \
-    --bloom_bits=10 \
-    --num=$NUM \
-    --writes=$NUM \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0  > ${STAT_FILE}.fillseq
-
-# measure overwrite performance
-./db_bench \
-    --benchmarks=overwrite \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$NUM \
-    --writes=$((NUM / 10)) \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6  \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=8 > ${STAT_FILE}.overwrite
-
-# fill up the db for readrandom benchmark (1GB total size)
-./db_bench \
-    --benchmarks=fillseq \
-    --db=$DATA_DIR \
-    --use_existing_db=0 \
-    --bloom_bits=10 \
-    --num=$NUM \
-    --writes=$NUM \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=1 > /dev/null
-
-# measure readrandom with 6GB block cache
-./db_bench \
-    --benchmarks=readrandom \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$NUM \
-    --reads=$((NUM / 5)) \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --disable_seek_compaction=1 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=16 > ${STAT_FILE}.readrandom
-
-# measure readrandom with 6GB block cache and tailing iterator
-./db_bench \
-    --benchmarks=readrandom \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$NUM \
-    --reads=$((NUM / 5)) \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --disable_seek_compaction=1 \
-    --use_tailing_iterator=1 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=16 > ${STAT_FILE}.readrandomtailing
-
-# measure readrandom with 100MB block cache
-./db_bench \
-    --benchmarks=readrandom \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$NUM \
-    --reads=$((NUM / 5)) \
-    --cache_size=104857600 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --disable_seek_compaction=1 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=16 > ${STAT_FILE}.readrandomsmallblockcache
-
-# measure readrandom with 8k data in memtable
-./db_bench \
-    --benchmarks=overwrite,readrandom \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$NUM \
-    --reads=$((NUM / 5)) \
-    --writes=512 \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --write_buffer_size=1000000000 \
-    --open_files=55000 \
-    --disable_seek_compaction=1 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=16 > ${STAT_FILE}.readrandom_mem_sst
-
-
-# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
-./db_bench \
-    --benchmarks=filluniquerandom \
-    --db=$DATA_DIR \
-    --use_existing_db=0 \
-    --bloom_bits=10 \
-    --num=$((NUM / 4)) \
-    --writes=$((NUM / 4)) \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=1 > /dev/null
-
-# dummy test just to compact the data
-./db_bench \
-    --benchmarks=readrandom \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$((NUM / 1000)) \
-    --reads=$((NUM / 1000)) \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=16 > /dev/null
-
-# measure readrandom after load with filluniquerandom with 6GB block cache
-./db_bench \
-    --benchmarks=readrandom \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$((NUM / 4)) \
-    --reads=$((NUM / 4)) \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --disable_seek_compaction=1 \
-    --disable_auto_compactions=1 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
-
-# measure readwhilewriting after load with filluniquerandom with 6GB block cache
-./db_bench \
-    --benchmarks=readwhilewriting \
-    --db=$DATA_DIR \
-    --use_existing_db=1 \
-    --bloom_bits=10 \
-    --num=$((NUM / 4)) \
-    --reads=$((NUM / 4)) \
-    --writes_per_second=1000 \
-    --write_buffer_size=100000000 \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --open_files=55000 \
-    --disable_seek_compaction=1 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --threads=16 > ${STAT_FILE}.readwhilewriting
-
-# measure memtable performance -- none of the data gets flushed to disk
-./db_bench \
-    --benchmarks=fillrandom,readrandom, \
-    --db=$DATA_DIR \
-    --use_existing_db=0 \
-    --num=$((NUM / 10)) \
-    --reads=$NUM \
-    --cache_size=6442450944 \
-    --cache_numshardbits=6 \
-    --table_cache_numshardbits=4 \
-    --write_buffer_size=1000000000 \
-    --open_files=55000 \
-    --disable_seek_compaction=1 \
-    --statistics=1 \
-    --histogram=1 \
-    --disable_data_sync=1 \
-    --disable_wal=1 \
-    --sync=0 \
-    --value_size=10 \
-    --threads=16 > ${STAT_FILE}.memtablefillreadrandom
-
-# send data to ods
-function send_to_ods {
-  key="$1"
-  value="$2"
-
-  if [ -z $JENKINS_HOME ]; then
-    # running on devbox, just print out the values
-    echo $1 $2
-    return
-  fi
-
-  if [ -z "$value" ];then
-    echo >&2 "ERROR: Key $key doesn't have a value."
-    return
-  fi
-  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
-    --connect-timeout 60
-}
-
-function send_benchmark_to_ods {
-  bench="$1"
-  bench_key="$2"
-  file="$3"
-
-  QPS=$(grep $bench $file | awk '{print $5}')
-  P50_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $3}' )
-  P75_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $5}' )
-  P99_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $7}' )
-
-  send_to_ods rocksdb.build.$bench_key.qps $QPS
-  send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
-  send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
-  send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
-}
-
-send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
-send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
-send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
-send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
-send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
-send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
-send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
-send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
-send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
-send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
diff --git a/src/rocksdb/build_tools/valgrind_test.sh b/src/rocksdb/build_tools/valgrind_test.sh
deleted file mode 100755
index 8c7e521..0000000
--- a/src/rocksdb/build_tools/valgrind_test.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-#A shell script for Jenknis to run valgrind on rocksdb tests
-#Returns 0 on success when there are no failed tests 
-
-VALGRIND_DIR=build_tools/VALGRIND_LOGS
-make clean
-make -j$(nproc) valgrind_check
-NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
-if [ $NUM_FAILED_TESTS -lt 1 ]; then
-  echo No tests have valgrind errors
-  exit 0
-else
-  cat $VALGRIND_DIR/valgrind_failed_tests
-  exit 1
-fi
diff --git a/src/rocksdb/configure.ac b/src/rocksdb/configure.ac
index 855c6a8..b312784 100644
--- a/src/rocksdb/configure.ac
+++ b/src/rocksdb/configure.ac
@@ -19,7 +19,14 @@ AC_CHECK_LIB([snappy], [snappy_compress], [HAVE_LIBSNAPPY=yes], [AC_MSG_FAILURE(
 AC_CHECK_LIB([z], [gzread], [HAVE_LIBZ=yes], [AC_MSG_FAILURE([libz not found])])
 AC_CHECK_LIB([bz2], [BZ2_bzCompressInit], [HAVE_LIBBZ2=yes], [AC_MSG_FAILURE([libbz2 not found])])
 AC_CHECK_LIB([rt], [clock_gettime], [HAVE_LIBRT=yes], [AC_MSG_FAILURE([librt not found])])
-AC_CHECK_LIB([tcmalloc], [malloc],  [HAVE_LIBTCMALLOC=yes],[AC_MSG_FAILURE([no tcmalloc found ])])
+
+AC_ARG_WITH([tcmalloc],
+	    [AS_HELP_STRING([--without-tcmalloc], [disable tcmalloc for memory allocations])],
+	    [],
+	    [with_tcmalloc=no])
+AS_IF([test "x$with_tcmalloc" != xno],
+	    [AC_CHECK_LIB([tcmalloc], [malloc],  [HAVE_LIBTCMALLOC=yes],[AC_MSG_FAILURE([no tcmalloc found ])])]
+	    [])
 
 OLD_CXXFLAGS="$CXXFLAGS"
 CXXFLAGS="$CXXFLAGS -std=c++11"
diff --git a/src/rocksdb/coverage/coverage_test.sh b/src/rocksdb/coverage/coverage_test.sh
deleted file mode 100755
index 08dbd05..0000000
--- a/src/rocksdb/coverage/coverage_test.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-
-# Exit on error.
-set -e
-
-if [ -n "$USE_CLANG" ]; then
-  echo "Error: Coverage test is supported only for gcc."
-  exit 1
-fi
-
-ROOT=".."
-# Fetch right version of gcov
-if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
-  source $ROOT/build_tools/fbcode.gcc471.sh
-  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
-else
-  GCOV=$(which gcov)
-fi
-
-COVERAGE_DIR="$PWD/COVERAGE_REPORT"
-mkdir -p $COVERAGE_DIR
-
-# Find all gcno files to generate the coverage report
-
-GCNO_FILES=`find $ROOT -name "*.gcno"`
-$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
-  # Parse the raw gcov report to more human readable form.
-  python $ROOT/coverage/parse_gcov_output.py |
-  # Write the output to both stdout and report file.
-  tee $COVERAGE_DIR/coverage_report_all.txt &&
-echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
-
-# TODO: we also need to get the files of the latest commits.
-# Get the most recently committed files.
-LATEST_FILES=`
-  git show --pretty="format:" --name-only HEAD |
-  grep -v "^$" |
-  paste -s -d,`
-RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
-
-echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
-$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
-  python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
-  tee -a $RECENT_REPORT &&
-echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
-
-# Unless otherwise specified, we'll not generate html report by default
-if [ -z "$HTML" ]; then
-  exit 0
-fi
-
-# Generate the html report. If we cannot find lcov in this machine, we'll simply
-# skip this step.
-echo "Generating the html coverage report..."
-
-LCOV=$(which lcov || true 2>/dev/null)
-if [ -z $LCOV ]
-then
-  echo "Skip: Cannot find lcov to generate the html report."
-  exit 0
-fi
-
-LCOV_VERSION=$(lcov -v | grep 1.1 || true)
-if [ $LCOV_VERSION ]
-then
-  echo "Not supported lcov version. Expect lcov 1.1."
-  exit 0
-fi
-
-(cd $ROOT; lcov --no-external \
-     --capture  \
-     --directory $PWD \
-     --gcov-tool $GCOV \
-     --output-file $COVERAGE_DIR/coverage.info)
-
-genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
-
-echo "HTML Coverage report is generated in $COVERAGE_DIR"
diff --git a/src/rocksdb/coverage/parse_gcov_output.py b/src/rocksdb/coverage/parse_gcov_output.py
deleted file mode 100644
index 72e8b07..0000000
--- a/src/rocksdb/coverage/parse_gcov_output.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import optparse
-import re
-import sys
-
-from optparse import OptionParser
-
-# the gcov report follows certain pattern. Each file will have two lines
-# of report, from which we can extract the file name, total lines and coverage
-# percentage.
-def parse_gcov_report(gcov_input):
-    per_file_coverage = {}
-    total_coverage = None
-
-    for line in sys.stdin:
-        line = line.strip()
-
-        # --First line of the coverage report (with file name in it)?
-        match_obj = re.match("^File '(.*)'$", line)
-        if match_obj:
-            # fetch the file name from the first line of the report.
-            current_file = match_obj.group(1)
-            continue
-
-        # -- Second line of the file report (with coverage percentage)
-        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
-
-        if match_obj:
-            coverage = float(match_obj.group(1))
-            lines = int(match_obj.group(2))
-
-            if current_file is not None:
-                per_file_coverage[current_file] = (coverage, lines)
-                current_file = None
-            else:
-                # If current_file is not set, we reach the last line of report,
-                # which contains the summarized coverage percentage.
-                total_coverage = (coverage, lines)
-            continue
-
-        # If the line's pattern doesn't fall into the above categories. We
-        # can simply ignore them since they're either empty line or doesn't
-        # find executable lines of the given file.
-        current_file = None
-
-    return per_file_coverage, total_coverage
-
-def get_option_parser():
-    usage = "Parse the gcov output and generate more human-readable code " +\
-            "coverage report."
-    parser = OptionParser(usage)
-
-    parser.add_option(
-        "--interested-files", "-i",
-        dest="filenames",
-        help="Comma separated files names. if specified, we will display " +
-             "the coverage report only for interested source files. " +
-             "Otherwise we will display the coverage report for all " +
-             "source files."
-    )
-    return parser
-
-def display_file_coverage(per_file_coverage, total_coverage):
-    # To print out auto-adjustable column, we need to know the longest
-    # length of file names.
-    max_file_name_length = max(
-        len(fname) for fname in per_file_coverage.keys()
-    )
-
-    # -- Print header
-    # size of separator is determined by 3 column sizes:
-    # file name, coverage percentage and lines.
-    header_template = \
-        "%" + str(max_file_name_length) + "s\t%s\t%s"
-    separator = "-" * (max_file_name_length + 10 + 20)
-    print header_template % ("Filename", "Coverage", "Lines")
-    print separator
-
-    # -- Print body
-    # template for printing coverage report for each file.
-    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
-
-    for fname, coverage_info in per_file_coverage.items():
-        coverage, lines = coverage_info
-        print record_template % (fname, coverage, lines)
-
-    # -- Print footer
-    if total_coverage:
-        print separator
-        print record_template % ("Total", total_coverage[0], total_coverage[1])
-
-def report_coverage():
-    parser = get_option_parser()
-    (options, args) = parser.parse_args()
-
-    interested_files = set()
-    if options.filenames is not None:
-        interested_files = set(f.strip() for f in options.filenames.split(','))
-
-    # To make things simple, right now we only read gcov report from the input
-    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
-
-    # Check if we need to display coverage info for interested files.
-    if len(interested_files):
-        per_file_coverage = dict(
-            (fname, per_file_coverage[fname]) for fname in interested_files
-            if fname in per_file_coverage
-        )
-        # If we only interested in several files, it makes no sense to report
-        # the total_coverage
-        total_coverage = None
-
-    if not len(per_file_coverage):
-        print >> sys.stderr, "Cannot find coverage info for the given files."
-        return
-    display_file_coverage(per_file_coverage, total_coverage)
-
-if __name__ == "__main__":
-    report_coverage()
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
index ce85ae5..2a33bb0 100644
--- a/src/rocksdb/db/builder.cc
+++ b/src/rocksdb/db/builder.cc
@@ -9,6 +9,7 @@
 
 #include "db/builder.h"
 
+#include <vector>
 #include "db/dbformat.h"
 #include "db/filename.h"
 #include "db/merge_helper.h"
@@ -20,60 +21,79 @@
 #include "rocksdb/options.h"
 #include "rocksdb/table.h"
 #include "table/block_based_table_builder.h"
+#include "util/iostats_context_imp.h"
+#include "util/thread_status_util.h"
 #include "util/stop_watch.h"
 
 namespace rocksdb {
 
 class TableFactory;
 
-TableBuilder* NewTableBuilder(const Options& options,
-                              const InternalKeyComparator& internal_comparator,
-                              WritableFile* file,
-                              CompressionType compression_type) {
-  return options.table_factory->NewTableBuilder(options, internal_comparator,
-                                                file, compression_type);
+TableBuilder* NewTableBuilder(
+    const ImmutableCFOptions& ioptions,
+    const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    WritableFile* file, const CompressionType compression_type,
+    const CompressionOptions& compression_opts, const bool skip_filters) {
+  return ioptions.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, internal_comparator,
+                          int_tbl_prop_collector_factories, compression_type,
+                          compression_opts, skip_filters),
+      file);
 }
 
-Status BuildTable(const std::string& dbname, Env* env, const Options& options,
-                  const EnvOptions& soptions, TableCache* table_cache,
-                  Iterator* iter, FileMetaData* meta,
-                  const InternalKeyComparator& internal_comparator,
-                  const SequenceNumber newest_snapshot,
-                  const SequenceNumber earliest_seqno_in_memtable,
-                  const CompressionType compression) {
+Status BuildTable(
+    const std::string& dbname, Env* env, const ImmutableCFOptions& ioptions,
+    const EnvOptions& env_options, TableCache* table_cache, Iterator* iter,
+    FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    const SequenceNumber newest_snapshot,
+    const SequenceNumber earliest_seqno_in_memtable,
+    const CompressionType compression,
+    const CompressionOptions& compression_opts, bool paranoid_file_checks,
+    const Env::IOPriority io_priority, TableProperties* table_properties) {
+  // Reports the IOStats for flush for every following bytes.
+  const size_t kReportFlushIOStatsEvery = 1048576;
   Status s;
-  meta->file_size = 0;
+  meta->fd.file_size = 0;
   meta->smallest_seqno = meta->largest_seqno = 0;
   iter->SeekToFirst();
 
   // If the sequence number of the smallest entry in the memtable is
   // smaller than the most recent snapshot, then we do not trigger
   // removal of duplicate/deleted keys as part of this builder.
-  bool purge = options.purge_redundant_kvs_while_flush;
+  bool purge = ioptions.purge_redundant_kvs_while_flush;
   if (earliest_seqno_in_memtable <= newest_snapshot) {
     purge = false;
   }
 
-  std::string fname = TableFileName(dbname, meta->number);
+  std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(),
+                                    meta->fd.GetPathId());
   if (iter->Valid()) {
     unique_ptr<WritableFile> file;
-    s = env->NewWritableFile(fname, &file, soptions);
+    s = env->NewWritableFile(fname, &file, env_options);
     if (!s.ok()) {
       return s;
     }
+    file->SetIOPriority(io_priority);
 
-    TableBuilder* builder =
-        NewTableBuilder(options, internal_comparator, file.get(), compression);
+    TableBuilder* builder = NewTableBuilder(
+        ioptions, internal_comparator, int_tbl_prop_collector_factories,
+        file.get(), compression, compression_opts);
 
-    // the first key is the smallest key
-    Slice key = iter->key();
-    meta->smallest.DecodeFrom(key);
-    meta->smallest_seqno = GetInternalKeySeqno(key);
-    meta->largest_seqno = meta->smallest_seqno;
+    {
+      // the first key is the smallest key
+      Slice key = iter->key();
+      meta->smallest.DecodeFrom(key);
+      meta->smallest_seqno = GetInternalKeySeqno(key);
+      meta->largest_seqno = meta->smallest_seqno;
+    }
 
     MergeHelper merge(internal_comparator.user_comparator(),
-                      options.merge_operator.get(), options.info_log.get(),
-                      options.min_partial_merge_operands,
+                      ioptions.merge_operator, ioptions.info_log,
+                      ioptions.min_partial_merge_operands,
                       true /* internal key corruption is not ok */);
 
     if (purge) {
@@ -110,6 +130,13 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
           is_first_key = false;
 
           if (this_ikey.type == kTypeMerge) {
+            // TODO(tbd): Add a check here to prevent RocksDB from crash when
+            // reopening a DB w/o properly specifying the merge operator.  But
+            // currently we observed a memory leak on failing in RocksDB
+            // recovery, so we decide to let it crash instead of causing
+            // memory leak for now before we have identified the real cause
+            // of the memory leak.
+
             // Handle merge-type keys using the MergeHelper
             // TODO: pass statistics to MergeUntil
             merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
@@ -153,6 +180,13 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
           }
         }
 
+        if (io_priority == Env::IO_HIGH &&
+            IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+          ThreadStatusUtil::IncreaseThreadOperationProperty(
+              ThreadStatus::FLUSH_BYTES_WRITTEN,
+              IOSTATS(bytes_written));
+          IOSTATS_RESET(bytes_written);
+        }
         if (!iterator_at_next) iter->Next();
       }
 
@@ -170,28 +204,38 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
         SequenceNumber seqno = GetInternalKeySeqno(key);
         meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
         meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+        if (io_priority == Env::IO_HIGH &&
+            IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+          ThreadStatusUtil::IncreaseThreadOperationProperty(
+              ThreadStatus::FLUSH_BYTES_WRITTEN,
+              IOSTATS(bytes_written));
+          IOSTATS_RESET(bytes_written);
+        }
       }
     }
 
     // Finish and check for builder errors
     if (s.ok()) {
       s = builder->Finish();
-      if (s.ok()) {
-        meta->file_size = builder->FileSize();
-        assert(meta->file_size > 0);
-      }
     } else {
       builder->Abandon();
     }
+    if (s.ok()) {
+      meta->fd.file_size = builder->FileSize();
+      assert(meta->fd.GetFileSize() > 0);
+      if (table_properties) {
+        *table_properties = builder->GetTableProperties();
+      }
+    }
     delete builder;
 
     // Finish and check for file errors
-    if (s.ok() && !options.disableDataSync) {
-      if (options.use_fsync) {
-        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+    if (s.ok() && !ioptions.disable_data_sync) {
+      if (ioptions.use_fsync) {
+        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
         s = file->Fsync();
       } else {
-        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
         s = file->Sync();
       }
     }
@@ -201,9 +245,14 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
 
     if (s.ok()) {
       // Verify that the table is usable
-      Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
-                                              internal_comparator, *meta);
+      Iterator* it = table_cache->NewIterator(ReadOptions(), env_options,
+                                              internal_comparator, meta->fd);
       s = it->status();
+      if (s.ok() && paranoid_file_checks) {
+        for (it->SeekToFirst(); it->Valid(); it->Next()) {}
+        s = it->status();
+      }
+
       delete it;
     }
   }
@@ -213,7 +262,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
     s = iter->status();
   }
 
-  if (s.ok() && meta->file_size > 0) {
+  if (s.ok() && meta->fd.GetFileSize() > 0) {
     // Keep it
   } else {
     env->DeleteFile(fname);
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
index 6301629..9d2888d 100644
--- a/src/rocksdb/db/builder.h
+++ b/src/rocksdb/db/builder.h
@@ -6,10 +6,17 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #pragma once
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/table_properties_collector.h"
 #include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
 #include "rocksdb/status.h"
 #include "rocksdb/types.h"
 #include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
+#include "rocksdb/table_properties.h"
 
 namespace rocksdb {
 
@@ -24,22 +31,31 @@ class VersionEdit;
 class TableBuilder;
 class WritableFile;
 
-extern TableBuilder* NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_comparator,
-    WritableFile* file, CompressionType compression_type);
+TableBuilder* NewTableBuilder(
+    const ImmutableCFOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    WritableFile* file, const CompressionType compression_type,
+    const CompressionOptions& compression_opts,
+    const bool skip_filters = false);
 
 // Build a Table file from the contents of *iter.  The generated file
-// will be named according to meta->number.  On success, the rest of
+// will be named according to number specified in meta. On success, the rest of
 // *meta will be filled with metadata about the generated table.
 // If no data is present in *iter, meta->file_size will be set to
 // zero, and no Table file will be produced.
-extern Status BuildTable(const std::string& dbname, Env* env,
-                         const Options& options, const EnvOptions& soptions,
-                         TableCache* table_cache, Iterator* iter,
-                         FileMetaData* meta,
-                         const InternalKeyComparator& internal_comparator,
-                         const SequenceNumber newest_snapshot,
-                         const SequenceNumber earliest_seqno_in_memtable,
-                         const CompressionType compression);
+extern Status BuildTable(
+    const std::string& dbname, Env* env, const ImmutableCFOptions& options,
+    const EnvOptions& env_options, TableCache* table_cache, Iterator* iter,
+    FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    const SequenceNumber newest_snapshot,
+    const SequenceNumber earliest_seqno_in_memtable,
+    const CompressionType compression,
+    const CompressionOptions& compression_opts, bool paranoid_file_checks,
+    const Env::IOPriority io_priority = Env::IO_HIGH,
+    TableProperties* table_properties = nullptr);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
index b50e59e..985c9fb 100644
--- a/src/rocksdb/db/c.cc
+++ b/src/rocksdb/db/c.cc
@@ -14,6 +14,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
@@ -28,11 +29,22 @@
 #include "rocksdb/statistics.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
+#include "rocksdb/utilities/backupable_db.h"
 
 using rocksdb::Cache;
+using rocksdb::ColumnFamilyDescriptor;
+using rocksdb::ColumnFamilyHandle;
+using rocksdb::ColumnFamilyOptions;
+using rocksdb::CompactionFilter;
+using rocksdb::CompactionFilterFactory;
+using rocksdb::CompactionFilterV2;
+using rocksdb::CompactionFilterFactoryV2;
+using rocksdb::CompactionFilterContext;
+using rocksdb::CompactionOptionsFIFO;
 using rocksdb::Comparator;
 using rocksdb::CompressionType;
 using rocksdb::DB;
+using rocksdb::DBOptions;
 using rocksdb::Env;
 using rocksdb::InfoLogLevel;
 using rocksdb::FileLock;
@@ -44,6 +56,8 @@ using rocksdb::MergeOperator;
 using rocksdb::NewBloomFilterPolicy;
 using rocksdb::NewLRUCache;
 using rocksdb::Options;
+using rocksdb::BlockBasedTableOptions;
+using rocksdb::CuckooTableOptions;
 using rocksdb::RandomAccessFile;
 using rocksdb::Range;
 using rocksdb::ReadOptions;
@@ -56,19 +70,32 @@ using rocksdb::WritableFile;
 using rocksdb::WriteBatch;
 using rocksdb::WriteOptions;
 using rocksdb::LiveFileMetaData;
+using rocksdb::BackupEngine;
+using rocksdb::BackupableDBOptions;
+using rocksdb::BackupInfo;
+using rocksdb::RestoreOptions;
 
 using std::shared_ptr;
 
 extern "C" {
 
 struct rocksdb_t                 { DB*               rep; };
+struct rocksdb_backup_engine_t   { BackupEngine*     rep; };
+struct rocksdb_backup_engine_info_t { std::vector<BackupInfo> rep; };
+struct rocksdb_restore_options_t { RestoreOptions rep; };
 struct rocksdb_iterator_t        { Iterator*         rep; };
 struct rocksdb_writebatch_t      { WriteBatch        rep; };
 struct rocksdb_snapshot_t        { const Snapshot*   rep; };
 struct rocksdb_flushoptions_t    { FlushOptions      rep; };
-struct rocksdb_readoptions_t     { ReadOptions       rep; };
+struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
+struct rocksdb_readoptions_t {
+   ReadOptions rep;
+   Slice upper_bound; // stack variable to set pointer to in ReadOptions
+};
 struct rocksdb_writeoptions_t    { WriteOptions      rep; };
 struct rocksdb_options_t         { Options           rep; };
+struct rocksdb_block_based_table_options_t  { BlockBasedTableOptions rep; };
+struct rocksdb_cuckoo_table_options_t  { CuckooTableOptions rep; };
 struct rocksdb_seqfile_t         { SequentialFile*   rep; };
 struct rocksdb_randomfile_t      { RandomAccessFile* rep; };
 struct rocksdb_writablefile_t    { WritableFile*     rep; };
@@ -76,6 +103,162 @@ struct rocksdb_filelock_t        { FileLock*         rep; };
 struct rocksdb_logger_t          { shared_ptr<Logger>  rep; };
 struct rocksdb_cache_t           { shared_ptr<Cache>   rep; };
 struct rocksdb_livefiles_t       { std::vector<LiveFileMetaData> rep; };
+struct rocksdb_column_family_handle_t  { ColumnFamilyHandle* rep; };
+
+struct rocksdb_compactionfiltercontext_t {
+  CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+  void* state_;
+  void (*destructor_)(void*);
+  unsigned char (*filter_)(
+      void*,
+      int level,
+      const char* key, size_t key_length,
+      const char* existing_value, size_t value_length,
+      char** new_value, size_t *new_value_length,
+      unsigned char* value_changed);
+  const char* (*name_)(void*);
+
+  virtual ~rocksdb_compactionfilter_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual bool Filter(int level, const Slice& key, const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const override {
+    char* c_new_value = nullptr;
+    size_t new_value_length = 0;
+    unsigned char c_value_changed = 0;
+    unsigned char result = (*filter_)(
+        state_,
+        level,
+        key.data(), key.size(),
+        existing_value.data(), existing_value.size(),
+        &c_new_value, &new_value_length, &c_value_changed);
+    if (c_value_changed) {
+      new_value->assign(c_new_value, new_value_length);
+      *value_changed = true;
+    }
+    return result;
+  }
+
+  virtual const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+  void* state_;
+  void (*destructor_)(void*);
+  rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+      void*, rocksdb_compactionfiltercontext_t* context);
+  const char* (*name_)(void*);
+
+  virtual ~rocksdb_compactionfilterfactory_t() { (*destructor_)(state_); }
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    rocksdb_compactionfiltercontext_t ccontext;
+    ccontext.rep = context;
+    CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+    return std::unique_ptr<CompactionFilter>(cf);
+  }
+
+  virtual const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_compactionfilterv2_t : public CompactionFilterV2 {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  void (*filter_)(void*, int level, size_t num_keys,
+                  const char* const* keys_list, const size_t* keys_list_sizes,
+                  const char* const* existing_values_list, const size_t* existing_values_list_sizes,
+                  char** new_values_list, size_t* new_values_list_sizes,
+                  unsigned char* to_delete_list);
+
+  virtual ~rocksdb_compactionfilterv2_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const override { return (*name_)(state_); }
+
+  virtual std::vector<bool> Filter(
+      int level, const SliceVector& keys, const SliceVector& existing_values,
+      std::vector<std::string>* new_values,
+      std::vector<bool>* values_changed) const override {
+    // Make a vector pointing to the underlying key data.
+    size_t num_keys = keys.size();
+    std::vector<const char*> keys_list(num_keys);
+    std::vector<size_t> keys_list_sizes(num_keys);
+    for (size_t i = 0; i < num_keys; ++i) {
+      keys_list[i] = keys[i].data();
+      keys_list_sizes[i] = keys[i].size();
+    }
+    // Make a vector pointing to the underlying value data.
+    std::vector<const char*> existing_values_list(num_keys);
+    std::vector<size_t> existing_values_list_sizes(num_keys);
+    for (size_t i = 0; i < num_keys; ++i) {
+      existing_values_list[i] = existing_values[i].data();
+      existing_values_list_sizes[i] = existing_values[i].size();
+    }
+    // Make a vector which will accept newly-allocated char* arrays
+    // which we will take ownership of and assign to strings in new_values.
+    new_values->clear();
+    std::vector<char*> new_values_list(num_keys);
+    std::vector<size_t> new_values_list_sizes(num_keys);
+    // Resize values_changed to hold all keys.
+    values_changed->resize(num_keys);
+    // Make a vector for bools indicating a value should be deleted
+    // on compaction (true) or maintained (false).
+    std::vector<unsigned char> to_delete_list(num_keys);
+
+    (*filter_)(
+        state_, level, num_keys, &keys_list[0], &keys_list_sizes[0],
+        &existing_values_list[0], &existing_values_list_sizes[0],
+        &new_values_list[0], &new_values_list_sizes[0], &to_delete_list[0]);
+
+    // Now, we transfer any changed values, setting values_changed and
+    // initializing new_values in the event a value changed.
+    std::vector<bool> to_delete(num_keys);
+    for (size_t i = 0; i < num_keys; ++i) {
+      to_delete[i] = to_delete_list[i];
+      (*values_changed)[i] = new_values_list[i] != nullptr;
+      if ((*values_changed)[i]) {
+        new_values->push_back(std::string(new_values_list[i], new_values_list_sizes[i]));
+        free(new_values_list[i]);
+      }
+    }
+    return to_delete;
+  }
+};
+
+struct rocksdb_compactionfilterfactoryv2_t : public CompactionFilterFactoryV2 {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2_)(
+      void* state, const rocksdb_compactionfiltercontext_t* context);
+
+  rocksdb_compactionfilterfactoryv2_t(const SliceTransform* prefix_extractor)
+      : CompactionFilterFactoryV2(prefix_extractor) {
+  }
+
+  virtual ~rocksdb_compactionfilterfactoryv2_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const override { return (*name_)(state_); }
+
+  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    struct rocksdb_compactionfiltercontext_t c_context;
+    c_context.rep.is_full_compaction = context.is_full_compaction;
+    c_context.rep.is_manual_compaction = context.is_manual_compaction;
+    return std::unique_ptr<CompactionFilterV2>(
+        (*create_compaction_filter_v2_)(state_, &c_context));
+  }
+};
 
 struct rocksdb_comparator_t : public Comparator {
   void* state_;
@@ -90,17 +273,16 @@ struct rocksdb_comparator_t : public Comparator {
     (*destructor_)(state_);
   }
 
-  virtual int Compare(const Slice& a, const Slice& b) const {
+  virtual int Compare(const Slice& a, const Slice& b) const override {
     return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
   }
 
-  virtual const char* Name() const {
-    return (*name_)(state_);
-  }
+  virtual const char* Name() const override { return (*name_)(state_); }
 
   // No-ops since the C binding does not support key shortening methods.
-  virtual void FindShortestSeparator(std::string*, const Slice&) const { }
-  virtual void FindShortSuccessor(std::string* key) const { }
+  virtual void FindShortestSeparator(std::string*,
+                                     const Slice&) const override {}
+  virtual void FindShortSuccessor(std::string* key) const override {}
 };
 
 struct rocksdb_filterpolicy_t : public FilterPolicy {
@@ -124,11 +306,10 @@ struct rocksdb_filterpolicy_t : public FilterPolicy {
     (*destructor_)(state_);
   }
 
-  virtual const char* Name() const {
-    return (*name_)(state_);
-  }
+  virtual const char* Name() const override { return (*name_)(state_); }
 
-  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+  virtual void CreateFilter(const Slice* keys, int n,
+                            std::string* dst) const override {
     std::vector<const char*> key_pointers(n);
     std::vector<size_t> key_sizes(n);
     for (int i = 0; i < n; i++) {
@@ -146,7 +327,8 @@ struct rocksdb_filterpolicy_t : public FilterPolicy {
     }
   }
 
-  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+  virtual bool KeyMayMatch(const Slice& key,
+                           const Slice& filter) const override {
     return (*key_match_)(state_, key.data(), key.size(),
                          filter.data(), filter.size());
   }
@@ -175,17 +357,12 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
     (*destructor_)(state_);
   }
 
-  virtual const char* Name() const {
-    return (*name_)(state_);
-  }
-
-  virtual bool FullMerge(
-      const Slice& key,
-      const Slice* existing_value,
-      const std::deque<std::string>& operand_list,
-      std::string* new_value,
-      Logger* logger) const {
+  virtual const char* Name() const override { return (*name_)(state_); }
 
+  virtual bool FullMerge(const Slice& key, const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override {
     size_t n = operand_list.size();
     std::vector<const char*> operand_pointers(n);
     std::vector<size_t> operand_sizes(n);
@@ -205,11 +382,9 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
     unsigned char success;
     size_t new_value_len;
     char* tmp_new_value = (*full_merge_)(
-        state_,
-        key.data(), key.size(),
-        existing_value_data, existing_value_len,
-        &operand_pointers[0], &operand_sizes[0], n,
-        &success, &new_value_len);
+        state_, key.data(), key.size(), existing_value_data, existing_value_len,
+        &operand_pointers[0], &operand_sizes[0], static_cast<int>(n), &success,
+        &new_value_len);
     new_value->assign(tmp_new_value, new_value_len);
 
     if (delete_value_ != nullptr) {
@@ -223,7 +398,8 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
 
   virtual bool PartialMergeMulti(const Slice& key,
                                  const std::deque<Slice>& operand_list,
-                                 std::string* new_value, Logger* logger) const {
+                                 std::string* new_value,
+                                 Logger* logger) const override {
     size_t operand_count = operand_list.size();
     std::vector<const char*> operand_pointers(operand_count);
     std::vector<size_t> operand_sizes(operand_count);
@@ -237,7 +413,7 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
     size_t new_value_len;
     char* tmp_new_value = (*partial_merge_)(
         state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
-        operand_count, &success, &new_value_len);
+        static_cast<int>(operand_count), &success, &new_value_len);
     new_value->assign(tmp_new_value, new_value_len);
 
     if (delete_value_ != nullptr) {
@@ -274,21 +450,19 @@ struct rocksdb_slicetransform_t : public SliceTransform {
     (*destructor_)(state_);
   }
 
-  virtual const char* Name() const {
-    return (*name_)(state_);
-  }
+  virtual const char* Name() const override { return (*name_)(state_); }
 
-  virtual Slice Transform(const Slice& src) const {
+  virtual Slice Transform(const Slice& src) const override {
     size_t len;
     char* dst = (*transform_)(state_, src.data(), src.size(), &len);
     return Slice(dst, len);
   }
 
-  virtual bool InDomain(const Slice& src) const {
+  virtual bool InDomain(const Slice& src) const override {
     return (*in_domain_)(state_, src.data(), src.size());
   }
 
-  virtual bool InRange(const Slice& src) const {
+  virtual bool InRange(const Slice& src) const override {
     return (*in_range_)(state_, src.data(), src.size());
   }
 };
@@ -344,11 +518,204 @@ rocksdb_t* rocksdb_open_for_read_only(
   return result;
 }
 
+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+    const rocksdb_options_t* options, const char* path, char** errptr) {
+  BackupEngine* be;
+  if (SaveError(errptr, BackupEngine::Open(options->rep.env,
+                                           BackupableDBOptions(path), &be))) {
+    return nullptr;
+  }
+  rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+  result->rep = be;
+  return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+                                             rocksdb_t* db, char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+  return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+                                                int v) {
+  opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, char** errptr) {
+  SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+                                                       std::string(wal_dir),
+                                                       restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+    rocksdb_backup_engine_t* be) {
+  rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+  be->rep->GetBackupInfo(&result->rep);
+  return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+  return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t* info) {
+  delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+  delete be->rep;
+  delete be;
+}
+
 void rocksdb_close(rocksdb_t* db) {
   delete db->rep;
   delete db;
 }
 
+rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* db_options,
+    const char* name,
+    int num_column_families,
+    const char** column_family_names,
+    const rocksdb_options_t** column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::Open(DBOptions(db_options->rep),
+          std::string(name), column_families, &handles, &db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* db_options,
+    const char* name,
+    int num_column_families,
+    const char** column_family_names,
+    const rocksdb_options_t** column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    unsigned char error_if_log_file_exist,
+    char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep),
+          std::string(name), column_families, &handles, &db, error_if_log_file_exist))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+char** rocksdb_list_column_families(
+    const rocksdb_options_t* options,
+    const char* name,
+    size_t* lencfs,
+    char** errptr) {
+  std::vector<std::string> fams;
+  SaveError(errptr,
+      DB::ListColumnFamilies(DBOptions(options->rep),
+        std::string(name), &fams));
+
+  *lencfs = fams.size();
+  char** column_families = static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+  for (size_t i = 0; i < fams.size(); i++) {
+    column_families[i] = strdup(fams[i].c_str());
+  }
+  return column_families;
+}
+
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+  for (size_t i = 0; i < len; ++i) {
+    free(list[i]);
+  }
+  free(list);
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+    rocksdb_t* db,
+    const rocksdb_options_t* column_family_options,
+    const char* column_family_name,
+    char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr,
+      db->rep->CreateColumnFamily(ColumnFamilyOptions(column_family_options->rep),
+        std::string(column_family_name), &(handle->rep)));
+  return handle;
+}
+
+void rocksdb_drop_column_family(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* handle,
+    char** errptr) {
+  SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t* handle) {
+  delete handle->rep;
+  delete handle;
+}
+
 void rocksdb_put(
     rocksdb_t* db,
     const rocksdb_writeoptions_t* options,
@@ -359,6 +726,18 @@ void rocksdb_put(
             db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
 }
 
+void rocksdb_put_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, column_family->rep,
+              Slice(key, keylen), Slice(val, vallen)));
+}
+
 void rocksdb_delete(
     rocksdb_t* db,
     const rocksdb_writeoptions_t* options,
@@ -367,6 +746,16 @@ void rocksdb_delete(
   SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
 }
 
+void rocksdb_delete_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+        Slice(key, keylen)));
+}
+
 void rocksdb_merge(
     rocksdb_t* db,
     const rocksdb_writeoptions_t* options,
@@ -377,6 +766,18 @@ void rocksdb_merge(
             db->rep->Merge(options->rep, Slice(key, keylen), Slice(val, vallen)));
 }
 
+void rocksdb_merge_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Merge(options->rep, column_family->rep,
+              Slice(key, keylen), Slice(val, vallen)));
+}
+
 void rocksdb_write(
     rocksdb_t* db,
     const rocksdb_writeoptions_t* options,
@@ -406,6 +807,29 @@ char* rocksdb_get(
   return result;
 }
 
+char* rocksdb_get_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, column_family->rep,
+      Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
 rocksdb_iterator_t* rocksdb_create_iterator(
     rocksdb_t* db,
     const rocksdb_readoptions_t* options) {
@@ -414,6 +838,15 @@ rocksdb_iterator_t* rocksdb_create_iterator(
   return result;
 }
 
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+  return result;
+}
+
 const rocksdb_snapshot_t* rocksdb_create_snapshot(
     rocksdb_t* db) {
   rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
@@ -440,6 +873,19 @@ char* rocksdb_property_value(
   }
 }
 
+char* rocksdb_property_value_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
 void rocksdb_approximate_sizes(
     rocksdb_t* db,
     int num_ranges,
@@ -455,6 +901,22 @@ void rocksdb_approximate_sizes(
   delete[] ranges;
 }
 
+void rocksdb_approximate_sizes_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
 void rocksdb_delete_file(
     rocksdb_t* db,
     const char* name) {
@@ -479,6 +941,19 @@ void rocksdb_compact_range(
       (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
 }
 
+void rocksdb_compact_range_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      column_family->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
 void rocksdb_flush(
     rocksdb_t* db,
     const rocksdb_flushoptions_t* options,
@@ -562,6 +1037,13 @@ rocksdb_writebatch_t* rocksdb_writebatch_create() {
   return new rocksdb_writebatch_t;
 }
 
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+                                                     size_t size) {
+  rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+  b->rep = WriteBatch(std::string(rep, size));
+  return b;
+}
+
 void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
   delete b;
 }
@@ -581,6 +1063,14 @@ void rocksdb_writebatch_put(
   b->rep.Put(Slice(key, klen), Slice(val, vlen));
 }
 
+void rocksdb_writebatch_put_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
 void rocksdb_writebatch_merge(
     rocksdb_writebatch_t* b,
     const char* key, size_t klen,
@@ -588,12 +1078,27 @@ void rocksdb_writebatch_merge(
   b->rep.Merge(Slice(key, klen), Slice(val, vlen));
 }
 
+void rocksdb_writebatch_merge_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
 void rocksdb_writebatch_delete(
     rocksdb_writebatch_t* b,
     const char* key, size_t klen) {
   b->rep.Delete(Slice(key, klen));
 }
 
+void rocksdb_writebatch_delete_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
 void rocksdb_writebatch_iterate(
     rocksdb_writebatch_t* b,
     void* state,
@@ -604,10 +1109,10 @@ void rocksdb_writebatch_iterate(
     void* state_;
     void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
     void (*deleted_)(void*, const char* k, size_t klen);
-    virtual void Put(const Slice& key, const Slice& value) {
+    virtual void Put(const Slice& key, const Slice& value) override {
       (*put_)(state_, key.data(), key.size(), value.data(), value.size());
     }
-    virtual void Delete(const Slice& key) {
+    virtual void Delete(const Slice& key) override {
       (*deleted_)(state_, key.data(), key.size());
     }
   };
@@ -623,6 +1128,119 @@ const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
   return b->rep.Data().c_str();
 }
 
+rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create() {
+  return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size) {
+  options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+  options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+  options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy) {
+  options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char no_block_cache) {
+  options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache) {
+  if (block_cache) {
+    options->rep.block_cache = block_cache->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache_compressed) {
+  if (block_cache_compressed) {
+    options->rep.block_cache_compressed = block_cache_compressed->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t *opt,
+    rocksdb_block_based_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        rocksdb::NewBlockBasedTableFactory(table_options->rep));
+  }
+}
+
+
+rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create() {
+  return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(
+    rocksdb_cuckoo_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+    rocksdb_cuckoo_table_options_t* options, double v) {
+  options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t *opt,
+    rocksdb_cuckoo_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        rocksdb::NewCuckooTableFactory(table_options->rep));
+  }
+}
+
+
 rocksdb_options_t* rocksdb_options_create() {
   return new rocksdb_options_t;
 }
@@ -631,6 +1249,38 @@ void rocksdb_options_destroy(rocksdb_options_t* options) {
   delete options;
 }
 
+void rocksdb_options_increase_parallelism(
+    rocksdb_options_t* opt, int total_threads) {
+  opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(
+    rocksdb_options_t* opt, uint64_t block_cache_size_mb) {
+  opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_compaction_filter(
+    rocksdb_options_t* opt,
+    rocksdb_compactionfilter_t* filter) {
+  opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+    rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+  opt->rep.compaction_filter_factory =
+      std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
 void rocksdb_options_set_comparator(
     rocksdb_options_t* opt,
     rocksdb_comparator_t* cmp) {
@@ -643,10 +1293,10 @@ void rocksdb_options_set_merge_operator(
   opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
 }
 
-void rocksdb_options_set_filter_policy(
+void rocksdb_options_set_compaction_filter_factory_v2(
     rocksdb_options_t* opt,
-    rocksdb_filterpolicy_t* policy) {
-  opt->rep.filter_policy = policy;
+    rocksdb_compactionfilterfactoryv2_t* compaction_filter_factory_v2) {
+  opt->rep.compaction_filter_factory_v2 = std::shared_ptr<CompactionFilterFactoryV2>(compaction_filter_factory_v2);
 }
 
 void rocksdb_options_set_create_if_missing(
@@ -654,6 +1304,11 @@ void rocksdb_options_set_create_if_missing(
   opt->rep.create_if_missing = v;
 }
 
+void rocksdb_options_set_create_missing_column_families(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.create_missing_column_families = v;
+}
+
 void rocksdb_options_set_error_if_exists(
     rocksdb_options_t* opt, unsigned char v) {
   opt->rep.error_if_exists = v;
@@ -679,6 +1334,11 @@ void rocksdb_options_set_info_log_level(
   opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
 }
 
+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+                                              size_t s) {
+  opt->rep.db_write_buffer_size = s;
+}
+
 void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
   opt->rep.write_buffer_size = s;
 }
@@ -687,24 +1347,8 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
   opt->rep.max_open_files = n;
 }
 
-void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
-  if (c) {
-    opt->rep.block_cache = c->rep;
-  }
-}
-
-void rocksdb_options_set_cache_compressed(rocksdb_options_t* opt, rocksdb_cache_t* c) {
-  if (c) {
-    opt->rep.block_cache_compressed = c->rep;
-  }
-}
-
-void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
-  opt->rep.block_size = s;
-}
-
-void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
-  opt->rep.block_restart_interval = n;
+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.max_total_wal_size = n;
 }
 
 void rocksdb_options_set_target_file_size_base(
@@ -799,11 +1443,6 @@ void rocksdb_options_set_prefix_extractor(
   opt->rep.prefix_extractor.reset(prefix_extractor);
 }
 
-void rocksdb_options_set_whole_key_filtering(
-    rocksdb_options_t* opt, unsigned char v) {
-  opt->rep.whole_key_filtering = v;
-}
-
 void rocksdb_options_set_disable_data_sync(
     rocksdb_options_t* opt, int disable_data_sync) {
   opt->rep.disableDataSync = disable_data_sync;
@@ -814,11 +1453,6 @@ void rocksdb_options_set_use_fsync(
   opt->rep.use_fsync = use_fsync;
 }
 
-void rocksdb_options_set_db_stats_log_interval(
-    rocksdb_options_t* opt, int db_stats_log_interval) {
-  opt->rep.db_stats_log_interval = db_stats_log_interval;
-}
-
 void rocksdb_options_set_db_log_dir(
     rocksdb_options_t* opt, const char* db_log_dir) {
   opt->rep.db_log_dir = db_log_dir;
@@ -848,8 +1482,8 @@ void rocksdb_options_set_purge_redundant_kvs_while_flush(
   opt->rep.purge_redundant_kvs_while_flush = v;
 }
 
-void rocksdb_options_set_allow_os_buffer(
-    rocksdb_options_t* opt, unsigned char v) {
+void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt,
+                                         unsigned char v) {
   opt->rep.allow_os_buffer = v;
 }
 
@@ -878,11 +1512,6 @@ void rocksdb_options_set_stats_dump_period_sec(
   opt->rep.stats_dump_period_sec = v;
 }
 
-void rocksdb_options_set_block_size_deviation(
-    rocksdb_options_t* opt, int v) {
-  opt->rep.block_size_deviation = v;
-}
-
 void rocksdb_options_set_advise_random_on_open(
     rocksdb_options_t* opt, unsigned char v) {
   opt->rep.advise_random_on_open = v;
@@ -977,11 +1606,6 @@ void rocksdb_options_set_max_manifest_file_size(
   opt->rep.max_manifest_file_size = v;
 }
 
-void rocksdb_options_set_no_block_cache(
-    rocksdb_options_t* opt, unsigned char v) {
-  opt->rep.no_block_cache = v;
-}
-
 void rocksdb_options_set_table_cache_numshardbits(
     rocksdb_options_t* opt, int v) {
   opt->rep.table_cache_numshardbits = v;
@@ -989,7 +1613,7 @@ void rocksdb_options_set_table_cache_numshardbits(
 
 void rocksdb_options_set_table_cache_remove_scan_count_limit(
     rocksdb_options_t* opt, int v) {
-  opt->rep.table_cache_remove_scan_count_limit = v;
+  // this option is deprecated
 }
 
 void rocksdb_options_set_arena_block_size(
@@ -1001,10 +1625,6 @@ void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int di
   opt->rep.disable_auto_compactions = disable;
 }
 
-void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
-  opt->rep.disable_seek_compaction = disable;
-}
-
 void rocksdb_options_set_delete_obsolete_files_period_micros(
     rocksdb_options_t* opt, uint64_t v) {
   opt->rep.delete_obsolete_files_period_micros = v;
@@ -1062,9 +1682,13 @@ void rocksdb_options_set_plain_table_factory(
     double hash_table_ratio, size_t index_sparseness) {
   static rocksdb::TableFactory* factory = 0;
   if (!factory) {
-    factory = rocksdb::NewPlainTableFactory(
-        user_key_len, bloom_bits_per_key,
-        hash_table_ratio, index_sparseness);
+    rocksdb::PlainTableOptions options;
+    options.user_key_len = user_key_len;
+    options.bloom_bits_per_key = bloom_bits_per_key;
+    options.hash_table_ratio = hash_table_ratio;
+    options.index_sparseness = index_sparseness;
+
+    factory = rocksdb::NewPlainTableFactory(options);
   }
   opt->rep.table_factory.reset(factory);
 }
@@ -1084,11 +1708,6 @@ void rocksdb_options_set_bloom_locality(
   opt->rep.bloom_locality = v;
 }
 
-void rocksdb_options_set_allow_thread_local(
-    rocksdb_options_t* opt, unsigned char v) {
-  opt->rep.allow_thread_local = v;
-}
-
 void rocksdb_options_set_inplace_update_support(
     rocksdb_options_t* opt, unsigned char v) {
   opt->rep.inplace_update_support = v;
@@ -1107,6 +1726,12 @@ void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, ro
   opt->rep.compaction_options_universal = *(uco->rep);
 }
 
+void rocksdb_options_set_fifo_compaction_options(
+    rocksdb_options_t* opt,
+    rocksdb_fifo_compaction_options_t* fifo) {
+  opt->rep.compaction_options_fifo = fifo->rep;
+}
+
 /*
 TODO:
 DB::OpenForReadOnly
@@ -1119,10 +1744,101 @@ DB::GetUpdatesSince
 DB::GetDbIdentity
 DB::RunManualCompaction
 custom cache
-compaction_filter
 table_properties_collectors
 */
 
+rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
+    void* state,
+    void (*destructor)(void*),
+    unsigned char (*filter)(
+        void*,
+        int level,
+        const char* key, size_t key_length,
+        const char* existing_value, size_t value_length,
+        char** new_value, size_t *new_value_length,
+        unsigned char* value_changed),
+    const char* (*name)(void*)) {
+  rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->filter_ = filter;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) {
+  delete filter;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+    rocksdb_compactionfiltercontext_t* context) {
+  return context->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+    rocksdb_compactionfiltercontext_t* context) {
+  return context->rep.is_manual_compaction;
+}
+
+rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compactionfilter_t* (*create_compaction_filter)(
+        void*, rocksdb_compactionfiltercontext_t* context),
+    const char* (*name)(void*)) {
+  rocksdb_compactionfilterfactory_t* result =
+      new rocksdb_compactionfilterfactory_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_compaction_filter_ = create_compaction_filter;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_compactionfilterfactory_destroy(
+    rocksdb_compactionfilterfactory_t* factory) {
+  delete factory;
+}
+
+rocksdb_compactionfilterv2_t* rocksdb_compactionfilterv2_create(
+    void* state,
+    void (*destructor)(void*),
+    void (*filter)(void*, int level, size_t num_keys,
+                   const char* const* keys_list, const size_t* keys_list_sizes,
+                   const char* const* existing_values_list, const size_t* existing_values_list_sizes,
+                   char** new_values_list, size_t* new_values_list_sizes,
+                   unsigned char* to_delete_list),
+    const char* (*name)(void*)) {
+  rocksdb_compactionfilterv2_t* result = new rocksdb_compactionfilterv2_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->filter_ = filter;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_compactionfilterv2_destroy(rocksdb_compactionfilterv2_t* filter) {
+  delete filter;
+}
+
+rocksdb_compactionfilterfactoryv2_t* rocksdb_compactionfilterfactoryv2_create(
+    void* state,
+    rocksdb_slicetransform_t* prefix_extractor,
+    void (*destructor)(void*),
+    rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2)(
+        void* state, const rocksdb_compactionfiltercontext_t* context),
+    const char* (*name)(void*)) {
+  rocksdb_compactionfilterfactoryv2_t* result = new rocksdb_compactionfilterfactoryv2_t(prefix_extractor);
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_compaction_filter_v2_ = create_compaction_filter_v2;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_compactionfilterfactoryv2_destroy(rocksdb_compactionfilterfactoryv2_t* factory) {
+  delete factory;
+}
+
 rocksdb_comparator_t* rocksdb_comparator_create(
     void* state,
     void (*destructor)(void*),
@@ -1180,11 +1896,12 @@ rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) {
   struct Wrapper : public rocksdb_filterpolicy_t {
     const FilterPolicy* rep_;
     ~Wrapper() { delete rep_; }
-    const char* Name() const { return rep_->Name(); }
-    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    const char* Name() const override { return rep_->Name(); }
+    void CreateFilter(const Slice* keys, int n,
+                      std::string* dst) const override {
       return rep_->CreateFilter(keys, n, dst);
     }
-    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
       return rep_->KeyMayMatch(key, filter);
     }
     static void DoNothing(void*) { }
@@ -1250,6 +1967,19 @@ void rocksdb_readoptions_set_snapshot(
   opt->rep.snapshot = (snap ? snap->rep : nullptr);
 }
 
+void rocksdb_readoptions_set_iterate_upper_bound(
+    rocksdb_readoptions_t* opt,
+    const char* key, size_t keylen) {
+  if (key == nullptr) {
+    opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+
+  } else {
+    opt->upper_bound = Slice(key, keylen);
+    opt->rep.iterate_upper_bound = &opt->upper_bound;
+  }
+}
+
 void rocksdb_readoptions_set_read_tier(
     rocksdb_readoptions_t* opt, int v) {
   opt->rep.read_tier = static_cast<rocksdb::ReadTier>(v);
@@ -1353,16 +2083,14 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t pref
   struct Wrapper : public rocksdb_slicetransform_t {
     const SliceTransform* rep_;
     ~Wrapper() { delete rep_; }
-    const char* Name() const { return rep_->Name(); }
-    Slice Transform(const Slice& src) const {
+    const char* Name() const override { return rep_->Name(); }
+    Slice Transform(const Slice& src) const override {
       return rep_->Transform(src);
     }
-    bool InDomain(const Slice& src) const {
+    bool InDomain(const Slice& src) const override {
       return rep_->InDomain(src);
     }
-    bool InRange(const Slice& src) const {
-      return rep_->InRange(src);
-    }
+    bool InRange(const Slice& src) const override { return rep_->InRange(src); }
     static void DoNothing(void*) { }
   };
   Wrapper* wrapper = new Wrapper;
@@ -1414,6 +2142,22 @@ void rocksdb_universal_compaction_options_destroy(
   delete uco;
 }
 
+rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
+  rocksdb_fifo_compaction_options_t* result = new rocksdb_fifo_compaction_options_t;
+  result->rep =  CompactionOptionsFIFO();
+  return result;
+}
+
+void rocksdb_fifo_compaction_options_set_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+  fifo_opts->rep.max_table_files_size = size;
+}
+
+void rocksdb_fifo_compaction_options_destroy(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  delete fifo_opts;
+}
+
 void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) {
   if (level >= 0) {
     assert(level <= opt->rep.num_levels);
@@ -1429,7 +2173,7 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level
 
 int rocksdb_livefiles_count(
   const rocksdb_livefiles_t* lf) {
-  return lf->rep.size();
+  return static_cast<int>(lf->rep.size());
 }
 
 const char* rocksdb_livefiles_name(
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
index 8ebce90..2a9dc20 100644
--- a/src/rocksdb/db/c_test.c
+++ b/src/rocksdb/db/c_test.c
@@ -10,9 +10,11 @@
 #include <string.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <inttypes.h>
 
 const char* phase = "";
 static char dbname[200];
+static char dbbackupname[200];
 
 static void StartPhase(const char* name) {
   fprintf(stderr, "=== Test %s\n", name);
@@ -75,6 +77,22 @@ static void CheckGet(
   Free(&val);
 }
 
+static void CheckGetCF(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* handle,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+
 static void CheckIter(rocksdb_iterator_t* iter,
                       const char* key, const char* val) {
   size_t len;
@@ -116,7 +134,7 @@ static void CmpDestroy(void* arg) { }
 
 static int CmpCompare(void* arg, const char* a, size_t alen,
                       const char* b, size_t blen) {
-  int n = (alen < blen) ? alen : blen;
+  size_t n = (alen < blen) ? alen : blen;
   int r = memcmp(a, b, n);
   if (r == 0) {
     if (alen < blen) r = -1;
@@ -154,6 +172,134 @@ static unsigned char FilterKeyMatch(
   return fake_filter_result;
 }
 
+// Custom compaction filter
+static void CFilterDestroy(void* arg) {}
+static const char* CFilterName(void* arg) { return "foo"; }
+static unsigned char CFilterFilter(void* arg, int level, const char* key,
+                                   size_t key_length,
+                                   const char* existing_value,
+                                   size_t value_length, char** new_value,
+                                   size_t* new_value_length,
+                                   unsigned char* value_changed) {
+  if (key_length == 3) {
+    if (memcmp(key, "bar", key_length) == 0) {
+      return 1;
+    } else if (memcmp(key, "baz", key_length) == 0) {
+      *value_changed = 1;
+      *new_value = "newbazvalue";
+      *new_value_length = 11;
+      return 0;
+    }
+  }
+  return 0;
+}
+
+static void CFilterFactoryDestroy(void* arg) {}
+static const char* CFilterFactoryName(void* arg) { return "foo"; }
+static rocksdb_compactionfilter_t* CFilterCreate(
+    void* arg, rocksdb_compactionfiltercontext_t* context) {
+  return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter,
+                                         CFilterName);
+}
+
+static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options,
+                                  rocksdb_readoptions_t* roptions,
+                                  rocksdb_writeoptions_t* woptions) {
+  char* err = NULL;
+  db = rocksdb_open(options, dbname, &err);
+  CheckNoError(err);
+  rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "foovalue");
+  rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "bar", "barvalue");
+  rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "baz", "bazvalue");
+
+  // Force compaction
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  // should have filtered bar, but not foo
+  CheckGet(db, roptions, "foo", "foovalue");
+  CheckGet(db, roptions, "bar", NULL);
+  CheckGet(db, roptions, "baz", "newbazvalue");
+  return db;
+}
+
+// Custom compaction filter V2.
+static void CompactionFilterV2Destroy(void* arg) { }
+static const char* CompactionFilterV2Name(void* arg) {
+  return "TestCompactionFilterV2";
+}
+static void CompactionFilterV2Filter(
+    void* arg, int level, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    const char* const* existing_values_list, const size_t* existing_values_list_sizes,
+    char** new_values_list, size_t* new_values_list_sizes,
+    unsigned char* to_delete_list) {
+  size_t i;
+  for (i = 0; i < num_keys; i++) {
+    // If any value is "gc", it's removed.
+    if (existing_values_list_sizes[i] == 2 && memcmp(existing_values_list[i], "gc", 2) == 0) {
+      to_delete_list[i] = 1;
+    } else if (existing_values_list_sizes[i] == 6 && memcmp(existing_values_list[i], "gc all", 6) == 0) {
+      // If any value is "gc all", all keys are removed.
+      size_t j;
+      for (j = 0; j < num_keys; j++) {
+        to_delete_list[j] = 1;
+      }
+      return;
+    } else if (existing_values_list_sizes[i] == 6 && memcmp(existing_values_list[i], "change", 6) == 0) {
+      // If value is "change", set changed value to "changed".
+      size_t len;
+      len = strlen("changed");
+      new_values_list[i] = malloc(len);
+      memcpy(new_values_list[i], "changed", len);
+      new_values_list_sizes[i] = len;
+    } else {
+      // Otherwise, no keys are removed.
+    }
+  }
+}
+
+// Custom prefix extractor for compaction filter V2 which extracts first 3 characters.
+static void CFV2PrefixExtractorDestroy(void* arg) { }
+static char* CFV2PrefixExtractorTransform(void* arg, const char* key, size_t length, size_t* dst_length) {
+  // Verify keys are maximum length 4; this verifies fix for a
+  // prior bug which was passing the RocksDB-encoded key with
+  // logical timestamp suffix instead of parsed user key.
+  if (length > 4) {
+    fprintf(stderr, "%s:%d: %s: key %s is not user key\n", __FILE__, __LINE__, phase, key);
+    abort();
+  }
+  *dst_length = length < 3 ? length : 3;
+  return (char*)key;
+}
+static unsigned char CFV2PrefixExtractorInDomain(void* state, const char* key, size_t length) {
+  return 1;
+}
+static unsigned char CFV2PrefixExtractorInRange(void* state, const char* key, size_t length) {
+  return 1;
+}
+static const char* CFV2PrefixExtractorName(void* state) {
+  return "TestCFV2PrefixExtractor";
+}
+
+// Custom compaction filter factory V2.
+static void CompactionFilterFactoryV2Destroy(void* arg) {
+  rocksdb_slicetransform_destroy((rocksdb_slicetransform_t*)arg);
+}
+static const char* CompactionFilterFactoryV2Name(void* arg) {
+  return "TestCompactionFilterFactoryV2";
+}
+static rocksdb_compactionfilterv2_t* CompactionFilterFactoryV2Create(
+    void* state, const rocksdb_compactionfiltercontext_t* context) {
+  return rocksdb_compactionfilterv2_create(state, CompactionFilterV2Destroy,
+                                           CompactionFilterV2Filter,
+                                           CompactionFilterV2Name);
+}
+
 // Custom merge operator
 static void MergeOperatorDestroy(void* arg) { }
 static const char* MergeOperatorName(void* arg) {
@@ -191,6 +337,7 @@ int main(int argc, char** argv) {
   rocksdb_cache_t* cache;
   rocksdb_env_t* env;
   rocksdb_options_t* options;
+  rocksdb_block_based_table_options_t* table_options;
   rocksdb_readoptions_t* roptions;
   rocksdb_writeoptions_t* woptions;
   char* err = NULL;
@@ -201,6 +348,11 @@ int main(int argc, char** argv) {
            GetTempDir(),
            ((int) geteuid()));
 
+  snprintf(dbbackupname, sizeof(dbbackupname),
+           "%s/rocksdb_c_test-%d-backup",
+           GetTempDir(),
+           ((int) geteuid()));
+
   StartPhase("create_objects");
   cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
   env = rocksdb_create_default_env();
@@ -209,14 +361,15 @@ int main(int argc, char** argv) {
   options = rocksdb_options_create();
   rocksdb_options_set_comparator(options, cmp);
   rocksdb_options_set_error_if_exists(options, 1);
-  rocksdb_options_set_cache(options, cache);
   rocksdb_options_set_env(options, env);
   rocksdb_options_set_info_log(options, NULL);
   rocksdb_options_set_write_buffer_size(options, 100000);
   rocksdb_options_set_paranoid_checks(options, 1);
   rocksdb_options_set_max_open_files(options, 10);
-  rocksdb_options_set_block_size(options, 1024);
-  rocksdb_options_set_block_restart_interval(options, 8);
+  table_options = rocksdb_block_based_options_create();
+  rocksdb_block_based_options_set_block_cache(table_options, cache);
+  rocksdb_options_set_block_based_table_factory(options, table_options);
+
   rocksdb_options_set_compression(options, rocksdb_no_compression);
   rocksdb_options_set_compression_options(options, -14, -1, 0);
   int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
@@ -235,7 +388,7 @@ int main(int argc, char** argv) {
   Free(&err);
 
   StartPhase("open_error");
-  db = rocksdb_open(options, dbname, &err);
+  rocksdb_open(options, dbname, &err);
   CheckCondition(err != NULL);
   Free(&err);
 
@@ -250,6 +403,41 @@ int main(int argc, char** argv) {
   CheckNoError(err);
   CheckGet(db, roptions, "foo", "hello");
 
+  StartPhase("backup_and_restore");
+  {
+    rocksdb_destroy_db(options, dbbackupname, &err);
+    CheckNoError(err);
+
+    rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err);
+    CheckNoError(err);
+
+    rocksdb_backup_engine_create_new_backup(be, db, &err);
+    CheckNoError(err);
+
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_close(db);
+
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create();
+    rocksdb_restore_options_set_keep_log_files(restore_options, 0);
+    rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err);
+    CheckNoError(err);
+    rocksdb_restore_options_destroy(restore_options);
+
+    rocksdb_options_set_error_if_exists(options, 0);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_options_set_error_if_exists(options, 1);
+
+    CheckGet(db, roptions, "foo", "hello");
+
+    rocksdb_backup_engine_close(be);
+  }
+
   StartPhase("compactall");
   rocksdb_compact_range(db, NULL, 0, NULL, 0);
   CheckGet(db, roptions, "foo", "hello");
@@ -277,6 +465,24 @@ int main(int argc, char** argv) {
     rocksdb_writebatch_destroy(wb);
   }
 
+  StartPhase("writebatch_rep");
+  {
+    rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb1, "baz", 3, "d", 1);
+    rocksdb_writebatch_put(wb1, "quux", 4, "e", 1);
+    rocksdb_writebatch_delete(wb1, "quux", 4);
+    size_t repsize1 = 0;
+    const char* rep = rocksdb_writebatch_data(wb1, &repsize1);
+    rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1);
+    CheckCondition(rocksdb_writebatch_count(wb1) ==
+                   rocksdb_writebatch_count(wb2));
+    size_t repsize2 = 0;
+    CheckCondition(
+        memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0);
+    rocksdb_writebatch_destroy(wb1);
+    rocksdb_writebatch_destroy(wb2);
+  }
+
   StartPhase("iter");
   {
     rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
@@ -378,10 +584,12 @@ int main(int argc, char** argv) {
       policy = rocksdb_filterpolicy_create_bloom(10);
     }
 
+    rocksdb_block_based_options_set_filter_policy(table_options, policy);
+
     // Create new database
     rocksdb_close(db);
     rocksdb_destroy_db(options, dbname, &err);
-    rocksdb_options_set_filter_policy(options, policy);
+    rocksdb_options_set_block_based_table_factory(options, table_options);
     db = rocksdb_open(options, dbname, &err);
     CheckNoError(err);
     rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
@@ -403,8 +611,90 @@ int main(int argc, char** argv) {
       CheckGet(db, roptions, "foo", "foovalue");
       CheckGet(db, roptions, "bar", "barvalue");
     }
-    rocksdb_options_set_filter_policy(options, NULL);
-    rocksdb_filterpolicy_destroy(policy);
+    // Reset the policy
+    rocksdb_block_based_options_set_filter_policy(table_options, NULL);
+    rocksdb_options_set_block_based_table_factory(options, table_options);
+  }
+
+  StartPhase("compaction_filter");
+  {
+    rocksdb_options_t* options_with_filter = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(options_with_filter, 1);
+    rocksdb_compactionfilter_t* cfilter;
+    cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
+                                              CFilterFilter, CFilterName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options_with_filter, dbname, &err);
+    rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+    db = CheckCompaction(db, options_with_filter, roptions, woptions);
+
+    rocksdb_options_set_compaction_filter(options_with_filter, NULL);
+    rocksdb_compactionfilter_destroy(cfilter);
+    rocksdb_options_destroy(options_with_filter);
+  }
+
+  StartPhase("compaction_filter_factory");
+  {
+    rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
+    rocksdb_compactionfilterfactory_t* factory;
+    factory = rocksdb_compactionfilterfactory_create(
+        NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
+    rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+                                                  factory);
+    db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
+
+    rocksdb_options_set_compaction_filter_factory(
+        options_with_filter_factory, NULL);
+    rocksdb_options_destroy(options_with_filter_factory);
+  }
+
+  StartPhase("compaction_filter_v2");
+  {
+    rocksdb_compactionfilterfactoryv2_t* factory;
+    rocksdb_slicetransform_t* prefix_extractor;
+    prefix_extractor = rocksdb_slicetransform_create(
+        NULL, CFV2PrefixExtractorDestroy, CFV2PrefixExtractorTransform,
+        CFV2PrefixExtractorInDomain, CFV2PrefixExtractorInRange,
+        CFV2PrefixExtractorName);
+    factory = rocksdb_compactionfilterfactoryv2_create(
+        prefix_extractor, prefix_extractor, CompactionFilterFactoryV2Destroy,
+        CompactionFilterFactoryV2Create, CompactionFilterFactoryV2Name);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_compaction_filter_factory_v2(options, factory);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    // Only foo2 is GC'd, foo3 is changed.
+    rocksdb_put(db, woptions, "foo1", 4, "no gc", 5, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo2", 4, "gc", 2, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo3", 4, "change", 6, &err);
+    CheckNoError(err);
+    // All bars are GC'd.
+    rocksdb_put(db, woptions, "bar1", 4, "no gc", 5, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar2", 4, "gc all", 6, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar3", 4, "no gc", 5, &err);
+    CheckNoError(err);
+    // Compact the DB to garbage collect.
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    // Verify foo entries.
+    CheckGet(db, roptions, "foo1", "no gc");
+    CheckGet(db, roptions, "foo2", NULL);
+    CheckGet(db, roptions, "foo3", "changed");
+    // Verify bar entries were all deleted.
+    CheckGet(db, roptions, "bar1", NULL);
+    CheckGet(db, roptions, "bar2", NULL);
+    CheckGet(db, roptions, "bar3", NULL);
   }
 
   StartPhase("merge_operator");
@@ -433,16 +723,92 @@ int main(int argc, char** argv) {
 
   }
 
-  StartPhase("prefix");
+  StartPhase("columnfamilies");
   {
-    // Create new database
     rocksdb_close(db);
     rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err)
+
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    db = rocksdb_open(db_options, dbname, &err);
+    CheckNoError(err)
+    rocksdb_column_family_handle_t* cfh;
+    cfh = rocksdb_create_column_family(db, db_options, "cf1", &err);
+    rocksdb_column_family_handle_destroy(cfh);
+    CheckNoError(err);
+    rocksdb_close(db);
+
+    size_t cflen;
+    char** column_fams = rocksdb_list_column_families(db_options, dbname, &cflen, &err);
+    CheckNoError(err);
+    CheckEqual("default", column_fams[0], 7);
+    CheckEqual("cf1", column_fams[1], 3);
+    CheckCondition(cflen == 2);
+    rocksdb_list_column_families_destroy(column_fams, cflen);
+
+    rocksdb_options_t* cf_options = rocksdb_options_create();
+
+    const char* cf_names[2] = {"default", "cf1"};
+    const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options};
+    rocksdb_column_family_handle_t* handles[2];
+    db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts, handles, &err);
+    CheckNoError(err);
+
+    rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err);
+    CheckNoError(err);
 
-    rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10);
-    rocksdb_options_set_filter_policy(options, policy);
+    CheckGetCF(db, roptions, handles[1], "foo", "hello");
+
+    rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err);
+    CheckNoError(err);
+
+    CheckGetCF(db, roptions, handles[1], "foo", NULL);
+
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1);
+    rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1);
+    rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGetCF(db, roptions, handles[1], "baz", NULL);
+    CheckGetCF(db, roptions, handles[1], "bar", NULL);
+    CheckGetCF(db, roptions, handles[1], "box", "c");
+    rocksdb_writebatch_destroy(wb);
+
+    rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    int i;
+    for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+      i++;
+    }
+    CheckCondition(i == 1);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+
+    rocksdb_drop_column_family(db, handles[1], &err);
+    CheckNoError(err);
+    for (i = 0; i < 2; i++) {
+      rocksdb_column_family_handle_destroy(handles[i]);
+    }
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_destroy(db_options);
+    rocksdb_options_destroy(cf_options);
+  }
+
+  StartPhase("prefix");
+  {
+    // Create new database
+    rocksdb_options_set_allow_mmap_reads(options, 1);
     rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
-    rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
+    rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
     rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
 
     db = rocksdb_open(options, dbname, &err);
@@ -477,12 +843,92 @@ int main(int argc, char** argv) {
     rocksdb_iter_get_error(iter, &err);
     CheckNoError(err);
     rocksdb_iter_destroy(iter);
-    rocksdb_filterpolicy_destroy(policy);
+
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+  }
+
+  StartPhase("cuckoo_options");
+  {
+    rocksdb_cuckoo_table_options_t* cuckoo_options;
+    cuckoo_options = rocksdb_cuckoo_options_create();
+    rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
+    rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
+    rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
+    rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
+    rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
+    rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_cuckoo_options_destroy(cuckoo_options);
+  }
+
+  StartPhase("iterate_upper_bound");
+  {
+    // Create new empty database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_set_prefix_extractor(options, NULL);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_put(db, woptions, "a",    1, "0",    1, &err); CheckNoError(err);
+    rocksdb_put(db, woptions, "foo",  3, "bar",  3, &err); CheckNoError(err);
+    rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err);
+    rocksdb_put(db, woptions, "g1",   2, "0",    1, &err); CheckNoError(err);
+
+    // testing basic case with no iterate_upper_bound and no prefix_extractor
+    {
+       rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+       rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+       rocksdb_iter_seek(iter, "foo", 3);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo", "bar");
+
+       rocksdb_iter_next(iter);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo1", "bar1");
+
+       rocksdb_iter_next(iter);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "g1", "0");
+
+       rocksdb_iter_destroy(iter);
+    }
+
+    // testing iterate_upper_bound and forward iterator
+    // to make sure it stops at bound
+    {
+       // iterate_upper_bound points beyond the last expected entry
+       rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
+
+       rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+       rocksdb_iter_seek(iter, "foo", 3);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo", "bar");
+
+       rocksdb_iter_next(iter);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo1", "bar1");
+
+       rocksdb_iter_next(iter);
+       // should stop here...
+       CheckCondition(!rocksdb_iter_valid(iter));
+
+       rocksdb_iter_destroy(iter);
+    }
   }
 
   StartPhase("cleanup");
   rocksdb_close(db);
   rocksdb_options_destroy(options);
+  rocksdb_block_based_options_destroy(table_options);
   rocksdb_readoptions_destroy(roptions);
   rocksdb_writeoptions_destroy(woptions);
   rocksdb_cache_destroy(cache);
diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc
index 2fd68e3..7df5c97 100644
--- a/src/rocksdb/db/column_family.cc
+++ b/src/rocksdb/db/column_family.cc
@@ -9,23 +9,65 @@
 
 #include "db/column_family.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <vector>
 #include <string>
 #include <algorithm>
+#include <limits>
 
+#include "db/compaction_picker.h"
 #include "db/db_impl.h"
+#include "db/job_context.h"
 #include "db/version_set.h"
+#include "db/writebuffer.h"
 #include "db/internal_stats.h"
-#include "db/compaction_picker.h"
+#include "db/job_context.h"
 #include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
 #include "util/autovector.h"
 #include "util/hash_skiplist_rep.h"
+#include "util/options_helper.h"
 
 namespace rocksdb {
 
-ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
-                                               DBImpl* db, port::Mutex* mutex)
-    : cfd_(cfd), db_(db), mutex_(mutex) {
+namespace {
+// This function computes the amount of time in microseconds by which a write
+// should be delayed based on the number of level-0 files according to the
+// following formula:
+// if n < bottom, return 0;
+// if n >= top, return 1000;
+// otherwise, let r = (n - bottom) /
+//                    (top - bottom)
+//  and return r^2 * 1000.
+// The goal of this formula is to gradually increase the rate at which writes
+// are slowed. We also tried linear delay (r * 1000), but it seemed to do
+// slightly worse. There is no other particular reason for choosing quadratic.
+uint64_t SlowdownAmount(int n, double bottom, double top) {
+  uint64_t delay;
+  if (n >= top) {
+    delay = 1000;
+  } else if (n < bottom) {
+    delay = 0;
+  } else {
+    // If we are here, we know that:
+    //   level0_start_slowdown <= n < level0_slowdown
+    // since the previous two conditions are false.
+    double how_much = static_cast<double>(n - bottom) / (top - bottom);
+    delay = std::max(how_much * how_much * 1000, 100.0);
+  }
+  assert(delay <= 1000);
+  return delay;
+}
+}  // namespace
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+    ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+    : cfd_(column_family_data), db_(db), mutex_(mutex) {
   if (cfd_ != nullptr) {
     cfd_->Ref();
   }
@@ -33,36 +75,52 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
 
 ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
   if (cfd_ != nullptr) {
-    DBImpl::DeletionState deletion_state;
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    JobContext job_context(0);
     mutex_->Lock();
     if (cfd_->Unref()) {
       delete cfd_;
     }
-    db_->FindObsoleteFiles(deletion_state, false, true);
+    db_->FindObsoleteFiles(&job_context, false, true);
     mutex_->Unlock();
-    if (deletion_state.HaveSomethingToDelete()) {
-      db_->PurgeObsoleteFiles(deletion_state);
+    if (job_context.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(job_context);
     }
   }
 }
 
 uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
 
-namespace {
-// Fix user-supplied options to be reasonable
-template <class T, class V>
-static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
-  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
-  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+const std::string& ColumnFamilyHandleImpl::GetName() const {
+  return cfd()->GetName();
 }
-}  // anonymous namespace
 
-ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
-                                    const InternalFilterPolicy* ipolicy,
+const Comparator* ColumnFamilyHandleImpl::user_comparator() const {
+  return cfd()->user_comparator();
+}
+
+void GetIntTblPropCollectorFactory(
+    const ColumnFamilyOptions& cf_options,
+    std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories) {
+  auto& collector_factories = cf_options.table_properties_collector_factories;
+  for (size_t i = 0; i < cf_options.table_properties_collector_factories.size();
+       ++i) {
+    assert(collector_factories[i]);
+    int_tbl_prop_collector_factories->emplace_back(
+        new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
+  }
+  // Add collector to collect internal key statistics
+  int_tbl_prop_collector_factories->emplace_back(
+      new InternalKeyPropertiesCollectorFactory);
+}
+
+ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
+                                    const InternalKeyComparator* icmp,
                                     const ColumnFamilyOptions& src) {
   ColumnFamilyOptions result = src;
   result.comparator = icmp;
-  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
 #ifdef OS_MACOSX
   // TODO(icanadi) make write_buffer_size uint64_t instead of size_t
   ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
@@ -78,19 +136,15 @@ ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
   result.min_write_buffer_number_to_merge =
       std::min(result.min_write_buffer_number_to_merge,
                result.max_write_buffer_number - 1);
-  if (result.block_cache == nullptr && !result.no_block_cache) {
-    result.block_cache = NewLRUCache(8 << 20);
-  }
-  result.compression_per_level = src.compression_per_level;
-  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
-    result.block_size_deviation = 0;
-  }
   if (result.max_mem_compaction_level >= result.num_levels) {
     result.max_mem_compaction_level = result.num_levels - 1;
   }
   if (result.soft_rate_limit > result.hard_rate_limit) {
     result.soft_rate_limit = result.hard_rate_limit;
   }
+  if (result.max_write_buffer_number < 2) {
+    result.max_write_buffer_number = 2;
+  }
   if (!result.prefix_extractor) {
     assert(result.memtable_factory);
     Slice name = result.memtable_factory->Name();
@@ -100,18 +154,73 @@ ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
     }
   }
 
-  // -- Sanitize the table properties collector
-  // All user defined properties collectors will be wrapped by
-  // UserKeyTablePropertiesCollector since for them they only have the
-  // knowledge of the user keys; internal keys are invisible to them.
-  auto& collectors = result.table_properties_collectors;
-  for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) {
-    assert(collectors[i]);
-    collectors[i] =
-        std::make_shared<UserKeyTablePropertiesCollector>(collectors[i]);
+  if (!src.compression_per_level.empty()) {
+    for (size_t level = 0; level < src.compression_per_level.size(); ++level) {
+      if (!CompressionTypeSupported(src.compression_per_level[level])) {
+        Log(InfoLogLevel::WARN_LEVEL, db_options.info_log,
+            "Compression type chosen for level %zu is not supported: %s. "
+            "RocksDB "
+            "will not compress data on level %zu.",
+            level, CompressionTypeToString(src.compression_per_level[level]),
+            level);
+      }
+    }
+  } else if (!CompressionTypeSupported(src.compression)) {
+    Log(InfoLogLevel::WARN_LEVEL, db_options.info_log,
+        "Compression type chosen is not supported: %s. RocksDB will not "
+        "compress data.",
+        CompressionTypeToString(src.compression));
+  }
+
+  if (result.compaction_style == kCompactionStyleFIFO) {
+    result.num_levels = 1;
+    // since we delete level0 files in FIFO compaction when there are too many
+    // of them, these options don't really mean anything
+    result.level0_file_num_compaction_trigger = std::numeric_limits<int>::max();
+    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+  }
+
+  if (result.level0_stop_writes_trigger <
+          result.level0_slowdown_writes_trigger ||
+      result.level0_slowdown_writes_trigger <
+          result.level0_file_num_compaction_trigger) {
+    Warn(db_options.info_log.get(),
+         "This condition must be satisfied: "
+         "level0_stop_writes_trigger(%d) >= "
+         "level0_slowdown_writes_trigger(%d) >= "
+         "level0_file_num_compaction_trigger(%d)",
+         result.level0_stop_writes_trigger,
+         result.level0_slowdown_writes_trigger,
+         result.level0_file_num_compaction_trigger);
+    if (result.level0_slowdown_writes_trigger <
+        result.level0_file_num_compaction_trigger) {
+      result.level0_slowdown_writes_trigger =
+          result.level0_file_num_compaction_trigger;
+    }
+    if (result.level0_stop_writes_trigger <
+        result.level0_slowdown_writes_trigger) {
+      result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
+    }
+    Warn(db_options.info_log.get(),
+         "Adjust the value to "
+         "level0_stop_writes_trigger(%d)"
+         "level0_slowdown_writes_trigger(%d)"
+         "level0_file_num_compaction_trigger(%d)",
+         result.level0_stop_writes_trigger,
+         result.level0_slowdown_writes_trigger,
+         result.level0_file_num_compaction_trigger);
+  }
+  if (result.level_compaction_dynamic_level_bytes) {
+    if (result.compaction_style != kCompactionStyleLevel ||
+        db_options.db_paths.size() > 1U) {
+      // 1. level_compaction_dynamic_level_bytes only makes sense for
+      //    level-based compaction.
+      // 2. we don't yet know how to make both of this feature and multiple
+      //    DB path work.
+      result.level_compaction_dynamic_level_bytes = false;
+    }
   }
-  // Add collector to collect internal key statistics
-  collectors.push_back(std::make_shared<InternalKeyPropertiesCollector>());
 
   return result;
 }
@@ -133,7 +242,7 @@ SuperVersion* SuperVersion::Ref() {
 
 bool SuperVersion::Unref() {
   // fetch_sub returns the previous value of ref
-  uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
+  uint32_t previous_refs = refs.fetch_sub(1);
   assert(previous_refs > 0);
   return previous_refs == 1;
 }
@@ -175,68 +284,97 @@ void SuperVersionUnrefHandle(void* ptr) {
 }
 }  // anonymous namespace
 
-ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
-                                   const std::string& name,
-                                   Version* dummy_versions, Cache* table_cache,
-                                   const ColumnFamilyOptions& options,
-                                   const DBOptions* db_options,
-                                   const EnvOptions& storage_options,
-                                   ColumnFamilySet* column_family_set)
+ColumnFamilyData::ColumnFamilyData(
+    uint32_t id, const std::string& name, Version* _dummy_versions,
+    Cache* _table_cache, WriteBuffer* write_buffer,
+    const ColumnFamilyOptions& cf_options, const DBOptions* db_options,
+    const EnvOptions& env_options, ColumnFamilySet* column_family_set)
     : id_(id),
       name_(name),
-      dummy_versions_(dummy_versions),
+      dummy_versions_(_dummy_versions),
       current_(nullptr),
       refs_(0),
       dropped_(false),
-      internal_comparator_(options.comparator),
-      internal_filter_policy_(options.filter_policy),
-      options_(*db_options, SanitizeOptions(&internal_comparator_,
-                                            &internal_filter_policy_, options)),
+      internal_comparator_(cf_options.comparator),
+      options_(*db_options,
+               SanitizeOptions(*db_options, &internal_comparator_, cf_options)),
+      ioptions_(options_),
+      mutable_cf_options_(options_, ioptions_),
+      write_buffer_(write_buffer),
       mem_(nullptr),
-      imm_(options.min_write_buffer_number_to_merge),
+      imm_(options_.min_write_buffer_number_to_merge),
       super_version_(nullptr),
       super_version_number_(0),
       local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
       next_(nullptr),
       prev_(nullptr),
       log_number_(0),
-      need_slowdown_for_num_level0_files_(false),
-      column_family_set_(column_family_set) {
+      column_family_set_(column_family_set),
+      pending_flush_(false),
+      pending_compaction_(false) {
   Ref();
 
-  // if dummy_versions is nullptr, then this is a dummy column family.
-  if (dummy_versions != nullptr) {
-    internal_stats_.reset(new InternalStats(options.num_levels, db_options->env,
-                                            db_options->statistics.get()));
-    table_cache_.reset(
-        new TableCache(dbname, &options_, storage_options, table_cache));
-    if (options_.compaction_style == kCompactionStyleUniversal) {
+  // Convert user defined table properties collector factories to internal ones.
+  GetIntTblPropCollectorFactory(options_, &int_tbl_prop_collector_factories_);
+
+  // if _dummy_versions is nullptr, then this is a dummy column family.
+  if (_dummy_versions != nullptr) {
+    internal_stats_.reset(
+        new InternalStats(ioptions_.num_levels, db_options->env, this));
+    table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache));
+    if (ioptions_.compaction_style == kCompactionStyleLevel) {
       compaction_picker_.reset(
-          new UniversalCompactionPicker(&options_, &internal_comparator_));
+          new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+    } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+      compaction_picker_.reset(
+          new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
+      compaction_picker_.reset(
+          new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+      compaction_picker_.reset(new NullCompactionPicker(
+          ioptions_, &internal_comparator_));
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "Column family %s does not use any background compaction. "
+          "Compactions can only be done via CompactFiles\n",
+          GetName().c_str());
+#endif  // !ROCKSDB_LITE
     } else {
+      Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log,
+          "Unable to recognize the specified compaction style %d. "
+          "Column family %s will use kCompactionStyleLevel.\n",
+          ioptions_.compaction_style, GetName().c_str());
       compaction_picker_.reset(
-          new LevelCompactionPicker(&options_, &internal_comparator_));
+          new LevelCompactionPicker(ioptions_, &internal_comparator_));
     }
 
-    Log(options_.info_log, "Options for column family \"%s\":\n",
-        name.c_str());
-    const ColumnFamilyOptions* cf_options = &options_;
-    cf_options->Dump(options_.info_log.get());
+    if (column_family_set_->NumberOfColumnFamilies() < 10) {
+      Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
+          "--------------- Options for column family [%s]:\n", name.c_str());
+      options_.Dump(ioptions_.info_log);
+    } else {
+      Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
+          "\t(skipping printing options)\n");
+    }
   }
+
+  RecalculateWriteStallConditions(mutable_cf_options_);
 }
 
 // DB mutex held
 ColumnFamilyData::~ColumnFamilyData() {
-  assert(refs_ == 0);
+  assert(refs_.load(std::memory_order_relaxed) == 0);
   // remove from linked list
   auto prev = prev_;
   auto next = next_;
   prev->next_ = next;
   next->prev_ = prev;
 
-  // it's nullptr for dummy CFD
-  if (column_family_set_ != nullptr) {
-    // remove from column_family_set
+  if (!dropped_ && column_family_set_ != nullptr) {
+    // If it's dropped, it's already removed from column family set
+    // If column_family_set_ == nullptr, this is dummy CFD and not in
+    // ColumnFamilySet
     column_family_set_->RemoveColumnFamily(this);
   }
 
@@ -244,6 +382,11 @@ ColumnFamilyData::~ColumnFamilyData() {
     current_->Unref();
   }
 
+  // It would be wrong if this ColumnFamilyData is in flush_queue_ or
+  // compaction_queue_ and we destroyed it
+  assert(!pending_flush_);
+  assert(!pending_compaction_);
+
   if (super_version_ != nullptr) {
     // Release SuperVersion reference kept in ThreadLocalPtr.
     // This must be done outside of mutex_ since unref handler can lock mutex.
@@ -261,8 +404,9 @@ ColumnFamilyData::~ColumnFamilyData() {
 
   if (dummy_versions_ != nullptr) {
     // List must be empty
-    assert(dummy_versions_->next_ == dummy_versions_);
-    delete dummy_versions_;
+    assert(dummy_versions_->TEST_Next() == dummy_versions_);
+    bool deleted __attribute__((unused)) = dummy_versions_->Unref();
+    assert(deleted);
   }
 
   if (mem_ != nullptr) {
@@ -275,57 +419,152 @@ ColumnFamilyData::~ColumnFamilyData() {
   }
 }
 
+void ColumnFamilyData::SetDropped() {
+  // can't drop default CF
+  assert(id_ != 0);
+  dropped_ = true;
+  write_controller_token_.reset();
+
+  // remove from column_family_set
+  column_family_set_->RemoveColumnFamily(this);
+}
+
+void ColumnFamilyData::RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options) {
+  if (current_ != nullptr) {
+    auto* vstorage = current_->storage_info();
+    const double score = vstorage->max_compaction_score();
+    const int max_level = vstorage->max_compaction_score_level();
+
+    auto write_controller = column_family_set_->write_controller_;
+
+    if (imm()->size() >= mutable_cf_options.max_write_buffer_number) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stopping writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d",
+          name_.c_str(), imm()->size(),
+          mutable_cf_options.max_write_buffer_number);
+    } else if (vstorage->l0_delay_trigger_count() >=
+               mutable_cf_options.level0_stop_writes_trigger) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stopping writes because we have %d level-0 files",
+          name_.c_str(), vstorage->l0_delay_trigger_count());
+    } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+               vstorage->l0_delay_trigger_count() >=
+                   mutable_cf_options.level0_slowdown_writes_trigger) {
+      uint64_t slowdown =
+          SlowdownAmount(vstorage->l0_delay_trigger_count(),
+                         mutable_cf_options.level0_slowdown_writes_trigger,
+                         mutable_cf_options.level0_stop_writes_trigger);
+      write_controller_token_ = write_controller->GetDelayToken(slowdown);
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stalling writes because we have %d level-0 files (%" PRIu64
+          "us)",
+          name_.c_str(), vstorage->l0_delay_trigger_count(), slowdown);
+    } else if (mutable_cf_options.hard_rate_limit > 1.0 &&
+               score > mutable_cf_options.hard_rate_limit) {
+      uint64_t kHardLimitSlowdown = 1000;
+      write_controller_token_ =
+          write_controller->GetDelayToken(kHardLimitSlowdown);
+      internal_stats_->RecordLevelNSlowdown(max_level, false);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stalling writes because we hit hard limit on level %d. "
+          "(%" PRIu64 "us)",
+          name_.c_str(), max_level, kHardLimitSlowdown);
+    } else if (mutable_cf_options.soft_rate_limit > 0.0 &&
+               score > mutable_cf_options.soft_rate_limit) {
+      uint64_t slowdown = SlowdownAmount(score,
+          mutable_cf_options.soft_rate_limit,
+          mutable_cf_options.hard_rate_limit);
+      write_controller_token_ = write_controller->GetDelayToken(slowdown);
+      internal_stats_->RecordLevelNSlowdown(max_level, true);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
+          "us)",
+          name_.c_str(), max_level, slowdown);
+    } else {
+      write_controller_token_.reset();
+    }
+  }
+}
+
 const EnvOptions* ColumnFamilyData::soptions() const {
-  return &(column_family_set_->storage_options_);
+  return &(column_family_set_->env_options_);
 }
 
-void ColumnFamilyData::SetCurrent(Version* current) {
-  current_ = current;
-  need_slowdown_for_num_level0_files_ =
-      (options_.level0_slowdown_writes_trigger >= 0 &&
-       current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+  current_ = current_version;
 }
 
-void ColumnFamilyData::CreateNewMemtable() {
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+  return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+    const MutableCFOptions& mutable_cf_options) {
   assert(current_ != nullptr);
+  return new MemTable(internal_comparator_, ioptions_,
+                      mutable_cf_options, write_buffer_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+    const MutableCFOptions& mutable_cf_options) {
   if (mem_ != nullptr) {
     delete mem_->Unref();
   }
-  mem_ = new MemTable(internal_comparator_, options_);
+  SetMemtable(ConstructNewMemtable(mutable_cf_options));
   mem_->Ref();
 }
 
-Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
-  return compaction_picker_->PickCompaction(current_, log_buffer);
+bool ColumnFamilyData::NeedsCompaction() const {
+  return compaction_picker_->NeedsCompaction(current_->storage_info());
 }
 
-Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
-                                           const InternalKey* begin,
-                                           const InternalKey* end,
-                                           InternalKey** compaction_end) {
-  return compaction_picker_->CompactRange(current_, input_level, output_level,
-                                          begin, end, compaction_end);
+Compaction* ColumnFamilyData::PickCompaction(
+    const MutableCFOptions& mutable_options, LogBuffer* log_buffer) {
+  auto* result = compaction_picker_->PickCompaction(
+      GetName(), mutable_options, current_->storage_info(), log_buffer);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+    const MutableCFOptions& mutable_cf_options,
+    int input_level, int output_level, uint32_t output_path_id,
+    const InternalKey* begin, const InternalKey* end,
+    InternalKey** compaction_end) {
+  auto* result = compaction_picker_->CompactRange(
+      GetName(), mutable_cf_options, current_->storage_info(), input_level,
+      output_level, output_path_id, begin, end, compaction_end);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
 }
 
 SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
-    port::Mutex* db_mutex) {
+    InstrumentedMutex* db_mutex) {
   SuperVersion* sv = nullptr;
-  if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
-    sv = GetThreadLocalSuperVersion(db_mutex);
-    sv->Ref();
-    if (!ReturnThreadLocalSuperVersion(sv)) {
-      sv->Unref();
-    }
-  } else {
-    db_mutex->Lock();
-    sv = super_version_->Ref();
-    db_mutex->Unlock();
+  sv = GetThreadLocalSuperVersion(db_mutex);
+  sv->Ref();
+  if (!ReturnThreadLocalSuperVersion(sv)) {
+    sv->Unref();
   }
   return sv;
 }
 
 SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
-    port::Mutex* db_mutex) {
+    InstrumentedMutex* db_mutex) {
   SuperVersion* sv = nullptr;
   // The SuperVersion is cached in thread local storage to avoid acquiring
   // mutex when SuperVersion does not change since the last use. When a new
@@ -348,11 +587,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
   sv = static_cast<SuperVersion*>(ptr);
   if (sv == SuperVersion::kSVObsolete ||
       sv->version_number != super_version_number_.load()) {
-    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
+    RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
     SuperVersion* sv_to_delete = nullptr;
 
     if (sv && sv->Unref()) {
-      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
+      RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
       db_mutex->Lock();
       // NOTE: underlying resources held by superversion (sst files) might
       // not be released until the next background job.
@@ -376,7 +615,7 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
   void* expected = SuperVersion::kSVInUse;
   if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
     // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
-    // storage has not been altered and no Scrape has happend. The
+    // storage has not been altered and no Scrape has happened. The
     // SuperVersion is still current.
     return true;
   } else {
@@ -388,18 +627,72 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
   return false;
 }
 
+void ColumnFamilyData::NotifyOnCompactionCompleted(
+    DB* db, Compaction* c, const Status& status) {
+#ifndef ROCKSDB_LITE
+  auto listeners = ioptions()->listeners;
+  assert(listeners.size() > 0U);
+  CompactionJobInfo info;
+  info.cf_name = c->column_family_data()->GetName();
+  info.status = status;
+  info.output_level = c->output_level();
+  for (size_t i = 0; i < c->num_input_levels(); ++i) {
+    for (const auto fmd : *c->inputs(i)) {
+      info.input_files.push_back(
+          TableFileName(options_.db_paths,
+                        fmd->fd.GetNumber(),
+                        fmd->fd.GetPathId()));
+    }
+  }
+  for (const auto newf : c->edit()->GetNewFiles()) {
+    info.output_files.push_back(
+        TableFileName(options_.db_paths,
+                      newf.second.fd.GetNumber(),
+                      newf.second.fd.GetPathId()));
+  }
+  for (auto listener : listeners) {
+    listener->OnCompactionCompleted(db, info);
+  }
+#endif  // ROCKSDB_LITE
+}
+
+void ColumnFamilyData::NotifyOnFlushCompleted(
+    DB* db, const std::string& file_path,
+    bool triggered_flush_slowdown,
+    bool triggered_flush_stop) {
+
+#ifndef ROCKSDB_LITE
+  auto listeners = ioptions()->listeners;
+  for (auto listener : listeners) {
+    listener->OnFlushCompleted(
+        db, GetName(), file_path,
+        // Use path 0 as fulled memtables are first flushed into path 0.
+        triggered_flush_slowdown, triggered_flush_stop);
+  }
+#endif  // ROCKSDB_LITE
+}
+
+SuperVersion* ColumnFamilyData::InstallSuperVersion(
+    SuperVersion* new_superversion, InstrumentedMutex* db_mutex) {
+  db_mutex->AssertHeld();
+  return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_);
+}
+
 SuperVersion* ColumnFamilyData::InstallSuperVersion(
-    SuperVersion* new_superversion, port::Mutex* db_mutex) {
+    SuperVersion* new_superversion, InstrumentedMutex* db_mutex,
+    const MutableCFOptions& mutable_cf_options) {
   new_superversion->db_mutex = db_mutex;
+  new_superversion->mutable_cf_options = mutable_cf_options;
   new_superversion->Init(mem_, imm_.current(), current_);
   SuperVersion* old_superversion = super_version_;
   super_version_ = new_superversion;
   ++super_version_number_;
   super_version_->version_number = super_version_number_;
   // Reset SuperVersions cached in thread local storage
-  if (column_family_set_->db_options_->allow_thread_local) {
-    ResetThreadLocalSuperVersions();
-  }
+  ResetThreadLocalSuperVersions();
+
+  RecalculateWriteStallConditions(mutable_cf_options);
+
   if (old_superversion != nullptr && old_superversion->Unref()) {
     old_superversion->Cleanup();
     return old_superversion;  // will let caller delete outside of mutex
@@ -423,20 +716,37 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() {
   }
 }
 
+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+      const std::unordered_map<std::string, std::string>& options_map) {
+  MutableCFOptions new_mutable_cf_options;
+  Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
+                                          &new_mutable_cf_options);
+  if (s.ok()) {
+    mutable_cf_options_ = new_mutable_cf_options;
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  }
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
 ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                  const DBOptions* db_options,
-                                 const EnvOptions& storage_options,
-                                 Cache* table_cache)
+                                 const EnvOptions& env_options,
+                                 Cache* table_cache,
+                                 WriteBuffer* write_buffer,
+                                 WriteController* write_controller)
     : max_column_family_(0),
-      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
+      dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr,
                                       ColumnFamilyOptions(), db_options,
-                                      storage_options_, nullptr)),
+                                      env_options, nullptr)),
       default_cfd_cache_(nullptr),
       db_name_(dbname),
       db_options_(db_options),
-      storage_options_(storage_options),
+      env_options_(env_options),
       table_cache_(table_cache),
-      spin_lock_(ATOMIC_FLAG_INIT) {
+      write_buffer_(write_buffer),
+      write_controller_(write_controller) {
   // initialize linked list
   dummy_cfd_->prev_ = dummy_cfd_;
   dummy_cfd_->next_ = dummy_cfd_;
@@ -489,18 +799,21 @@ void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
   max_column_family_ = std::max(new_max_column_family, max_column_family_);
 }
 
-// under a DB mutex
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+  return column_families_.size();
+}
+
+// under a DB mutex AND write thread
 ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
     const std::string& name, uint32_t id, Version* dummy_versions,
     const ColumnFamilyOptions& options) {
   assert(column_families_.find(name) == column_families_.end());
   ColumnFamilyData* new_cfd =
-      new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
-                           options, db_options_, storage_options_, this);
-  Lock();
+      new ColumnFamilyData(id, name, dummy_versions, table_cache_,
+                           write_buffer_, options, db_options_,
+                           env_options_, this);
   column_families_.insert({name, id});
   column_family_data_.insert({id, new_cfd});
-  Unlock();
   max_column_family_ = std::max(max_column_family_, id);
   // add to linked list
   new_cfd->next_ = dummy_cfd_;
@@ -514,19 +827,11 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
   return new_cfd;
 }
 
-void ColumnFamilySet::Lock() {
-  // spin lock
-  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
-  }
-}
-
-void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
-
 // REQUIRES: DB mutex held
 void ColumnFamilySet::FreeDeadColumnFamilies() {
   autovector<ColumnFamilyData*> to_delete;
   for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
-    if (cfd->refs_ == 0) {
+    if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
       to_delete.push_back(cfd);
     }
   }
@@ -536,25 +841,21 @@ void ColumnFamilySet::FreeDeadColumnFamilies() {
   }
 }
 
-// under a DB mutex
+// under a DB mutex AND from a write thread
 void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
   auto cfd_iter = column_family_data_.find(cfd->GetID());
   assert(cfd_iter != column_family_data_.end());
-  Lock();
   column_family_data_.erase(cfd_iter);
   column_families_.erase(cfd->GetName());
-  Unlock();
 }
 
+// under a DB mutex OR from a write thread
 bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
   if (column_family_id == 0) {
     // optimization for common case
     current_ = column_family_set_->GetDefault();
   } else {
-    // maybe outside of db mutex, should lock
-    column_family_set_->Lock();
     current_ = column_family_set_->GetColumnFamily(column_family_id);
-    column_family_set_->Unlock();
   }
   handle_.SetCFD(current_);
   return current_ != nullptr;
@@ -570,14 +871,34 @@ MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
   return current_->mem();
 }
 
-const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
-  assert(current_ != nullptr);
-  return current_->options();
-}
-
 ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
   assert(current_ != nullptr);
   return &handle_;
 }
 
+void ColumnFamilyMemTablesImpl::CheckMemtableFull() {
+  if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) {
+    flush_scheduler_->ScheduleFlush(current_);
+    current_->mem()->MarkFlushScheduled();
+  }
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+  uint32_t column_family_id = 0;
+  if (column_family != nullptr) {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    column_family_id = cfh->GetID();
+  }
+  return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family) {
+  if (column_family != nullptr) {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    return cfh->user_comparator();
+  }
+  return nullptr;
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
index d306f4e..77af5c7 100644
--- a/src/rocksdb/db/column_family.h
+++ b/src/rocksdb/db/column_family.h
@@ -19,7 +19,12 @@
 #include "rocksdb/env.h"
 #include "db/memtable_list.h"
 #include "db/write_batch_internal.h"
+#include "db/write_controller.h"
 #include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/flush_scheduler.h"
+#include "util/instrumented_mutex.h"
+#include "util/mutable_cf_options.h"
 #include "util/thread_local.h"
 
 namespace rocksdb {
@@ -35,6 +40,8 @@ class InternalStats;
 class ColumnFamilyData;
 class DBImpl;
 class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
 
 // ColumnFamilyHandleImpl is the class that clients use to access different
 // column families. It has non-trivial destructor, which gets called when client
@@ -42,17 +49,20 @@ class LogBuffer;
 class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
  public:
   // create while holding the mutex
-  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
+  ColumnFamilyHandleImpl(
+      ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
   // destroy without mutex
   virtual ~ColumnFamilyHandleImpl();
   virtual ColumnFamilyData* cfd() const { return cfd_; }
+  virtual const Comparator* user_comparator() const;
 
-  virtual uint32_t GetID() const;
+  virtual uint32_t GetID() const override;
+  virtual const std::string& GetName() const override;
 
  private:
   ColumnFamilyData* cfd_;
   DBImpl* db_;
-  port::Mutex* mutex_;
+  InstrumentedMutex* mutex_;
 };
 
 // Does not ref-count ColumnFamilyData
@@ -66,7 +76,7 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
   ColumnFamilyHandleInternal()
       : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
 
-  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
+  void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
   virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
 
  private:
@@ -75,23 +85,23 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
 
 // holds references to memtable, all immutable memtables and version
 struct SuperVersion {
+  // Accessing members of this class is not thread-safe and requires external
+  // synchronization (ie db mutex held or on write thread).
   MemTable* mem;
   MemTableListVersion* imm;
   Version* current;
-  std::atomic<uint32_t> refs;
-  // We need to_delete because during Cleanup(), imm->Unref() returns
-  // all memtables that we need to free through this vector. We then
-  // delete all those memtables outside of mutex, during destruction
-  autovector<MemTable*> to_delete;
+  MutableCFOptions mutable_cf_options;
   // Version number of the current SuperVersion
   uint64_t version_number;
-  port::Mutex* db_mutex;
+
+  InstrumentedMutex* db_mutex;
 
   // should be called outside the mutex
   SuperVersion() = default;
   ~SuperVersion();
   SuperVersion* Ref();
-
+  // If Unref() returns true, Cleanup() should be called with mutex held
+  // before deleting this SuperVersion.
   bool Unref();
 
   // call these two methods with db mutex held
@@ -110,16 +120,29 @@ struct SuperVersion {
   static int dummy;
   static void* const kSVInUse;
   static void* const kSVObsolete;
+
+ private:
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of mutex, during destruction
+  autovector<MemTable*> to_delete;
 };
 
-extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
-                                           const InternalFilterPolicy* ipolicy,
+extern ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
+                                           const InternalKeyComparator* icmp,
                                            const ColumnFamilyOptions& src);
+// Wrap user defined table proproties collector factories `from cf_options`
+// into internal ones in int_tbl_prop_collector_factories. Add a system internal
+// one too.
+extern void GetIntTblPropCollectorFactory(
+    const ColumnFamilyOptions& cf_options,
+    std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories);
 
 class ColumnFamilySet;
 
-// This class keeps all the data that a column family needs. It's mosly dumb and
-// used just to provide access to metadata.
+// This class keeps all the data that a column family needs.
 // Most methods require DB mutex held, unless otherwise noted
 class ColumnFamilyData {
  public:
@@ -130,45 +153,67 @@ class ColumnFamilyData {
   // thread-safe
   const std::string& GetName() const { return name_; }
 
-  void Ref() { ++refs_; }
+  // Ref() can only be called whily holding a DB mutex or during a
+  // single-threaded write.
+  void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); }
   // will just decrease reference count to 0, but will not delete it. returns
   // true if the ref count was decreased to zero. in that case, it can be
-  // deleted by the caller immediatelly, or later, by calling
+  // deleted by the caller immediately, or later, by calling
   // FreeDeadColumnFamilies()
+  // Unref() can only be called while holding a DB mutex
   bool Unref() {
-    assert(refs_ > 0);
-    return --refs_ == 0;
+    int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed);
+    assert(old_refs > 0);
+    return old_refs == 1;
   }
 
-  // This can only be called from single-threaded VersionSet::LogAndApply()
+  // SetDropped() can only be called under following conditions:
+  // 1) Holding a DB mutex,
+  // 2) from single-threaded write thread, AND
+  // 3) from single-threaded VersionSet::LogAndApply()
   // After dropping column family no other operation on that column family
   // will be executed. All the files and memory will be, however, kept around
   // until client drops the column family handle. That way, client can still
   // access data from dropped column family.
   // Column family can be dropped and still alive. In that state:
-  // *) Column family is not included in the iteration.
   // *) Compaction and flush is not executed on the dropped column family.
-  // *) Client can continue writing and reading from column family. However, all
-  // writes stay in the current memtable.
+  // *) Client can continue reading from column family. Writes will fail unless
+  // WriteOptions::ignore_missing_column_families is true
   // When the dropped column family is unreferenced, then we:
+  // *) Remove column family from the linked list maintained by ColumnFamilySet
   // *) delete all memory associated with that column family
   // *) delete all the files associated with that column family
-  void SetDropped() {
-    // can't drop default CF
-    assert(id_ != 0);
-    dropped_ = true;
-  }
+  void SetDropped();
   bool IsDropped() const { return dropped_; }
 
   // thread-safe
-  int NumberLevels() const { return options_.num_levels; }
+  int NumberLevels() const { return ioptions_.num_levels; }
 
   void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
   uint64_t GetLogNumber() const { return log_number_; }
 
-  // thread-safe
+  // !!! To be deprecated! Please don't not use this function anymore!
   const Options* options() const { return &options_; }
+
+  // thread-safe
   const EnvOptions* soptions() const;
+  const ImmutableCFOptions* ioptions() const { return &ioptions_; }
+  // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by current SuperVersion
+  // You shoul use this API to reference MutableCFOptions most of the time.
+  const MutableCFOptions* GetCurrentMutableCFOptions() const {
+    return &(super_version_->mutable_cf_options);
+  }
+  // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may be not in effect yet.
+  const MutableCFOptions* GetLatestMutableCFOptions() const {
+    return &mutable_cf_options_;
+  }
+#ifndef ROCKSDB_LITE
+  // REQUIRES: DB mutex held
+  Status SetOptions(
+      const std::unordered_map<std::string, std::string>& options_map);
+#endif  // ROCKSDB_LITE
 
   InternalStats* internal_stats() { return internal_stats_.get(); }
 
@@ -176,17 +221,32 @@ class ColumnFamilyData {
   MemTable* mem() { return mem_; }
   Version* current() { return current_; }
   Version* dummy_versions() { return dummy_versions_; }
-  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
   void SetCurrent(Version* current);
-  void CreateNewMemtable();
+  uint64_t GetNumLiveVersions() const;  // REQUIRE: DB mutex held
 
-  TableCache* table_cache() { return table_cache_.get(); }
+  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options);
+  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
+  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options);
+
+  TableCache* table_cache() const { return table_cache_.get(); }
 
   // See documentation in compaction_picker.h
-  Compaction* PickCompaction(LogBuffer* log_buffer);
-  Compaction* CompactRange(int input_level, int output_level,
-                           const InternalKey* begin, const InternalKey* end,
-                           InternalKey** compaction_end);
+  // REQUIRES: DB mutex held
+  bool NeedsCompaction() const;
+  // REQUIRES: DB mutex held
+  Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+                             LogBuffer* log_buffer);
+  // A flag to tell a manual compaction is to compact all levels together
+  // instad of for specific level.
+  static const int kCompactAllLevels;
+  // A flag to tell a manual compaction's output is base level.
+  static const int kCompactToBaseLevel;
+  // REQUIRES: DB mutex held
+  Compaction* CompactRange(
+      const MutableCFOptions& mutable_cf_options,
+      int input_level, int output_level, uint32_t output_path_id,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end);
 
   CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
   // thread-safe
@@ -198,14 +258,19 @@ class ColumnFamilyData {
     return internal_comparator_;
   }
 
+  const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+  int_tbl_prop_collector_factories() const {
+    return &int_tbl_prop_collector_factories_;
+  }
+
   SuperVersion* GetSuperVersion() { return super_version_; }
   // thread-safe
   // Return a already referenced SuperVersion to be used safely.
-  SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
+  SuperVersion* GetReferencedSuperVersion(InstrumentedMutex* db_mutex);
   // thread-safe
   // Get SuperVersion stored in thread local storage. If it does not exist,
   // get a reference from a current SuperVersion.
-  SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
+  SuperVersion* GetThreadLocalSuperVersion(InstrumentedMutex* db_mutex);
   // Try to return SuperVersion back to thread local storage. Retrun true on
   // success and false on failure. It fails when the thread local storage
   // contains anything other than SuperVersion::kSVInUse flag.
@@ -218,43 +283,67 @@ class ColumnFamilyData {
   // if its reference count is zero and needs deletion or nullptr if not
   // As argument takes a pointer to allocated SuperVersion to enable
   // the clients to allocate SuperVersion outside of mutex.
+  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                    InstrumentedMutex* db_mutex,
+                                    const MutableCFOptions& mutable_cf_options);
   SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
-                                    port::Mutex* db_mutex);
+                                    InstrumentedMutex* db_mutex);
 
   void ResetThreadLocalSuperVersions();
 
-  // A Flag indicating whether write needs to slowdown because of there are
-  // too many number of level0 files.
-  bool NeedSlowdownForNumLevel0Files() const {
-    return need_slowdown_for_num_level0_files_;
-  }
+  void NotifyOnCompactionCompleted(DB* db, Compaction* c, const Status& status);
+
+  void NotifyOnFlushCompleted(
+      DB* db, const std::string& file_path,
+      bool triggered_flush_slowdown,
+      bool triggered_flush_stop);
+
+  // Protected by DB mutex
+  void set_pending_flush(bool value) { pending_flush_ = value; }
+  void set_pending_compaction(bool value) { pending_compaction_ = value; }
+  bool pending_flush() { return pending_flush_; }
+  bool pending_compaction() { return pending_compaction_; }
 
  private:
   friend class ColumnFamilySet;
-  ColumnFamilyData(const std::string& dbname, uint32_t id,
-                   const std::string& name, Version* dummy_versions,
-                   Cache* table_cache, const ColumnFamilyOptions& options,
-                   const DBOptions* db_options,
-                   const EnvOptions& storage_options,
+  ColumnFamilyData(uint32_t id, const std::string& name,
+                   Version* dummy_versions, Cache* table_cache,
+                   WriteBuffer* write_buffer,
+                   const ColumnFamilyOptions& options,
+                   const DBOptions* db_options, const EnvOptions& env_options,
                    ColumnFamilySet* column_family_set);
 
+  // Recalculate some small conditions, which are changed only during
+  // compaction, adding new memtable and/or
+  // recalculation of compaction score. These values are used in
+  // DBImpl::MakeRoomForWrite function to decide, if it need to make
+  // a write stall
+  void RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options);
+
   uint32_t id_;
   const std::string name_;
   Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
   Version* current_;         // == dummy_versions->prev_
 
-  int refs_;                   // outstanding references to ColumnFamilyData
+  std::atomic<int> refs_;      // outstanding references to ColumnFamilyData
   bool dropped_;               // true if client dropped it
 
   const InternalKeyComparator internal_comparator_;
-  const InternalFilterPolicy internal_filter_policy_;
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories_;
 
-  Options const options_;
+  const Options options_;
+  const ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
 
   std::unique_ptr<TableCache> table_cache_;
 
   std::unique_ptr<InternalStats> internal_stats_;
 
+  WriteBuffer* write_buffer_;
+
   MemTable* mem_;
   MemTableList imm_;
   SuperVersion* super_version_;
@@ -268,8 +357,9 @@ class ColumnFamilyData {
   // This needs to be destructed before mutex_
   std::unique_ptr<ThreadLocalPtr> local_sv_;
 
-  // pointers for a circular linked list. we use it to support iterations
-  // that can be concurrent with writes
+  // pointers for a circular linked list. we use it to support iterations over
+  // all column families that are alive (note: dropped column families can also
+  // be alive as long as client holds a reference)
   ColumnFamilyData* next_;
   ColumnFamilyData* prev_;
 
@@ -278,32 +368,40 @@ class ColumnFamilyData {
   // recovered from
   uint64_t log_number_;
 
-  // A flag indicating whether we should delay writes because
-  // we have too many level 0 files
-  bool need_slowdown_for_num_level0_files_;
-
   // An object that keeps all the compaction stats
   // and picks the next compaction
   std::unique_ptr<CompactionPicker> compaction_picker_;
 
   ColumnFamilySet* column_family_set_;
+
+  std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+  // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+  bool pending_flush_;
+
+  // If true --> this ColumnFamily is currently present in
+  // DBImpl::compaction_queue_
+  bool pending_compaction_;
 };
 
 // ColumnFamilySet has interesting thread-safety requirements
-// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
-// mutex. Inside, column_family_data_ and column_families_ will be protected
-// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
-// VersionSet::LogAndApply() in the normal runtime. It is also called
-// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
-// from ColumnFamilyData destructor
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
 // * Iteration -- hold DB mutex, but you can release it in the body of
 // iteration. If you release DB mutex in body, reference the column
 // family before the mutex and unreference after you unlock, since the column
 // family might get dropped when the DB mutex is released
 // * GetDefault() -- thread safe
-// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
-// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily() --
-// inside of DB mutex
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
 class ColumnFamilySet {
  public:
   // ColumnFamilySet supports iteration
@@ -312,10 +410,13 @@ class ColumnFamilySet {
     explicit iterator(ColumnFamilyData* cfd)
         : current_(cfd) {}
     iterator& operator++() {
-      // dummy is never dead or dropped, so this will never be infinite
+      // dropped column families might still be included in this iteration
+      // (we're only removing them when client drops the last reference to the
+      // column family).
+      // dummy is never dead, so this will never be infinite
       do {
         current_ = current_->next_;
-      } while (current_->refs_ == 0 || current_->IsDropped());
+      } while (current_->refs_.load(std::memory_order_relaxed) == 0);
       return *this;
     }
     bool operator!=(const iterator& other) {
@@ -328,7 +429,8 @@ class ColumnFamilySet {
   };
 
   ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
-                  const EnvOptions& storage_options, Cache* table_cache);
+                  const EnvOptions& env_options, Cache* table_cache,
+                  WriteBuffer* write_buffer, WriteController* write_controller);
   ~ColumnFamilySet();
 
   ColumnFamilyData* GetDefault() const;
@@ -342,6 +444,7 @@ class ColumnFamilySet {
   uint32_t GetNextColumnFamilyID();
   uint32_t GetMaxColumnFamily();
   void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+  size_t NumberOfColumnFamilies() const;
 
   ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
                                        Version* dummy_version,
@@ -350,9 +453,6 @@ class ColumnFamilySet {
   iterator begin() { return iterator(dummy_cfd_->next_); }
   iterator end() { return iterator(dummy_cfd_); }
 
-  void Lock();
-  void Unlock();
-
   // REQUIRES: DB mutex held
   // Don't call while iterating over ColumnFamilySet
   void FreeDeadColumnFamilies();
@@ -364,9 +464,12 @@ class ColumnFamilySet {
   void RemoveColumnFamily(ColumnFamilyData* cfd);
 
   // column_families_ and column_family_data_ need to be protected:
-  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
-  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
-  //  (if both, respect the ordering to avoid deadlock!)
+  // * when mutating both conditions have to be satisfied:
+  // 1. DB mutex locked
+  // 2. thread currently in single-threaded write thread
+  // * when reading, at least one condition needs to be satisfied:
+  // 1. DB mutex locked
+  // 2. accessed from a single-threaded write thread
   std::unordered_map<std::string, uint32_t> column_families_;
   std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
 
@@ -380,39 +483,52 @@ class ColumnFamilySet {
 
   const std::string db_name_;
   const DBOptions* const db_options_;
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;
   Cache* table_cache_;
-  std::atomic_flag spin_lock_;
+  WriteBuffer* write_buffer_;
+  WriteController* write_controller_;
 };
 
 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
 // memtables of different column families (specified by ID in the write batch)
 class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
  public:
-  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
-      : column_family_set_(column_family_set), current_(nullptr) {}
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set,
+                                     FlushScheduler* flush_scheduler)
+      : column_family_set_(column_family_set),
+        current_(nullptr),
+        flush_scheduler_(flush_scheduler) {}
 
   // sets current_ to ColumnFamilyData with column_family_id
   // returns false if column family doesn't exist
+  // REQUIRES: under a DB mutex OR from a write thread
   bool Seek(uint32_t column_family_id) override;
 
   // Returns log number of the selected column family
+  // REQUIRES: under a DB mutex OR from a write thread
   uint64_t GetLogNumber() const override;
 
   // REQUIRES: Seek() called first
+  // REQUIRES: under a DB mutex OR from a write thread
   virtual MemTable* GetMemTable() const override;
 
-  // Returns options for selected column family
-  // REQUIRES: Seek() called first
-  virtual const Options* GetOptions() const override;
-
   // Returns column family handle for the selected column family
+  // REQUIRES: under a DB mutex OR from a write thread
   virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
 
+  // REQUIRES: under a DB mutex OR from a write thread
+  virtual void CheckMemtableFull() override;
+
  private:
   ColumnFamilySet* column_family_set_;
   ColumnFamilyData* current_;
+  FlushScheduler* flush_scheduler_;
   ColumnFamilyHandleInternal handle_;
 };
 
+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family);
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc
index 5f7ff48..8be8cd2 100644
--- a/src/rocksdb/db/column_family_test.cc
+++ b/src/rocksdb/db/column_family_test.cc
@@ -12,8 +12,10 @@
 #include <string>
 
 #include "db/db_impl.h"
-#include "rocksdb/env.h"
 #include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 #include "util/coding.h"
@@ -38,7 +40,7 @@ class EnvCounter : public EnvWrapper {
     return num_new_writable_file_;
   }
   Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
-                         const EnvOptions& soptions) {
+                         const EnvOptions& soptions) override {
     ++num_new_writable_file_;
     return EnvWrapper::NewWritableFile(f, r, soptions);
   }
@@ -47,7 +49,7 @@ class EnvCounter : public EnvWrapper {
   int num_new_writable_file_;
 };
 
-class ColumnFamilyTest {
+class ColumnFamilyTest : public testing::Test {
  public:
   ColumnFamilyTest() : rnd_(139) {
     env_ = new EnvCounter(Env::Default());
@@ -115,8 +117,12 @@ class ColumnFamilyTest {
 
   int GetProperty(int cf, std::string property) {
     std::string value;
-    ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+    EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+#ifndef CYGWIN
     return std::stoi(value);
+#else
+    return std::strtol(value.c_str(), 0);
+#endif
   }
 
   void Destroy() {
@@ -133,7 +139,7 @@ class ColumnFamilyTest {
   void CreateColumnFamilies(
       const std::vector<std::string>& cfs,
       const std::vector<ColumnFamilyOptions> options = {}) {
-    int cfi = handles_.size();
+    int cfi = static_cast<int>(handles_.size());
     handles_.resize(cfi + cfs.size());
     names_.resize(cfi + cfs.size());
     for (size_t i = 0; i < cfs.size(); ++i) {
@@ -218,7 +224,7 @@ class ColumnFamilyTest {
 
   int NumTableFilesAtLevel(int level, int cf) {
     return GetProperty(cf,
-                       "rocksdb.num-files-at-level" + std::to_string(level));
+                       "rocksdb.num-files-at-level" + ToString(level));
   }
 
   // Return spread of files per level
@@ -231,7 +237,7 @@ class ColumnFamilyTest {
       snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
       result += buf;
       if (f > 0) {
-        last_non_zero_offset = result.size();
+        last_non_zero_offset = static_cast<int>(result.size());
       }
     }
     result.resize(last_non_zero_offset);
@@ -262,7 +268,7 @@ class ColumnFamilyTest {
     VectorLogPtr wal_files;
     Status s;
     // GetSortedWalFiles is a flakey function -- it gets all the wal_dir
-    // children files and then later checks for their existance. if some of the
+    // children files and then later checks for their existence. if some of the
     // log files doesn't exist anymore, it reports an error. it does all of this
     // without DB mutex held, so if a background process deletes the log file
     // while the function is being executed, it returns an error. We retry the
@@ -274,7 +280,7 @@ class ColumnFamilyTest {
         break;
       }
     }
-    ASSERT_OK(s);
+    EXPECT_OK(s);
     for (const auto& wal : wal_files) {
       if (wal->Type() == kAliveLogFile) {
         ++ret;
@@ -287,8 +293,8 @@ class ColumnFamilyTest {
     assert(num_per_cf.size() == handles_.size());
 
     for (size_t i = 0; i < num_per_cf.size(); ++i) {
-      ASSERT_EQ(num_per_cf[i],
-                GetProperty(i, "rocksdb.num-immutable-mem-table"));
+      ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
+                                           "rocksdb.num-immutable-mem-table"));
     }
   }
 
@@ -326,7 +332,14 @@ class ColumnFamilyTest {
   Random rnd_;
 };
 
-TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
+class DumbLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {}
+  virtual size_t GetLogFileSize() const override { return 0; }
+};
+
+TEST_F(ColumnFamilyTest, DontReuseColumnFamilyID) {
   for (int iter = 0; iter < 3; ++iter) {
     Open();
     CreateColumnFamilies({"one", "two", "three"});
@@ -353,8 +366,7 @@ TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
   }
 }
 
-
-TEST(ColumnFamilyTest, AddDrop) {
+TEST_F(ColumnFamilyTest, AddDrop) {
   Open();
   CreateColumnFamilies({"one", "two", "three"});
   ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
@@ -380,14 +392,14 @@ TEST(ColumnFamilyTest, AddDrop) {
               std::vector<std::string>({"default", "four", "three"}));
 }
 
-TEST(ColumnFamilyTest, DropTest) {
+TEST_F(ColumnFamilyTest, DropTest) {
   // first iteration - dont reopen DB before dropping
   // second iteration - reopen DB before dropping
   for (int iter = 0; iter < 2; ++iter) {
     Open({"default"});
     CreateColumnFamiliesAndReopen({"pikachu"});
     for (int i = 0; i < 100; ++i) {
-      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+      ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i)));
     }
     ASSERT_OK(Flush(1));
 
@@ -404,19 +416,25 @@ TEST(ColumnFamilyTest, DropTest) {
   }
 }
 
-TEST(ColumnFamilyTest, WriteBatchFailure) {
+TEST_F(ColumnFamilyTest, WriteBatchFailure) {
   Open();
   CreateColumnFamiliesAndReopen({"one", "two"});
   WriteBatch batch;
+  batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
   batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
   ASSERT_OK(db_->Write(WriteOptions(), &batch));
   DropColumnFamilies({1});
+  WriteOptions woptions_ignore_missing_cf;
+  woptions_ignore_missing_cf.ignore_missing_column_families = true;
+  batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
+  ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+  ASSERT_EQ("column-family", Get(0, "still here"));
   Status s = db_->Write(WriteOptions(), &batch);
   ASSERT_TRUE(s.IsInvalidArgument());
   Close();
 }
 
-TEST(ColumnFamilyTest, ReadWrite) {
+TEST_F(ColumnFamilyTest, ReadWrite) {
   Open();
   CreateColumnFamiliesAndReopen({"one", "two"});
   ASSERT_OK(Put(0, "foo", "v1"));
@@ -440,7 +458,7 @@ TEST(ColumnFamilyTest, ReadWrite) {
   Close();
 }
 
-TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
+TEST_F(ColumnFamilyTest, IgnoreRecoveredLog) {
   std::string backup_logs = dbname_ + "/backup_logs";
 
   // delete old files in backup_logs directory
@@ -515,7 +533,7 @@ TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
   }
 }
 
-TEST(ColumnFamilyTest, FlushTest) {
+TEST_F(ColumnFamilyTest, FlushTest) {
   Open();
   CreateColumnFamiliesAndReopen({"one", "two"});
   ASSERT_OK(Put(0, "foo", "v1"));
@@ -523,8 +541,28 @@ TEST(ColumnFamilyTest, FlushTest) {
   ASSERT_OK(Put(1, "mirko", "v3"));
   ASSERT_OK(Put(0, "foo", "v2"));
   ASSERT_OK(Put(2, "fodor", "v5"));
-  for (int i = 0; i < 3; ++i) {
-    Flush(i);
+
+  for (int j = 0; j < 2; j++) {
+    ReadOptions ro;
+    std::vector<Iterator*> iterators;
+    // Hold super version.
+    if (j == 0) {
+      ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+    }
+
+    for (int i = 0; i < 3; ++i) {
+      uint64_t max_total_in_memory_state =
+          dbfull()->TEST_MaxTotalInMemoryState();
+      Flush(i);
+      ASSERT_EQ(dbfull()->TEST_MaxTotalInMemoryState(),
+                max_total_in_memory_state);
+    }
+    ASSERT_OK(Put(1, "foofoo", "bar"));
+    ASSERT_OK(Put(0, "foofoo", "bar"));
+
+    for (auto* it : iterators) {
+      delete it;
+    }
   }
   Reopen();
 
@@ -544,7 +582,7 @@ TEST(ColumnFamilyTest, FlushTest) {
 }
 
 // Makes sure that obsolete log files get deleted
-TEST(ColumnFamilyTest, LogDeletionTest) {
+TEST_F(ColumnFamilyTest, LogDeletionTest) {
   db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
   column_family_options_.write_buffer_size = 100000;  // 100KB
   Open();
@@ -611,7 +649,7 @@ TEST(ColumnFamilyTest, LogDeletionTest) {
 }
 
 // Makes sure that obsolete log files get deleted
-TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
+TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) {
   // disable flushing stale column families
   db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
   Open();
@@ -686,7 +724,9 @@ TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
   WaitForFlush(1);
   AssertNumberOfImmutableMemtables({0, 0, 0, 1});
   ASSERT_EQ(CountLiveLogFiles(), 5);
-  PutRandomData(3, 90*6, 1000);
+  PutRandomData(3, 240, 1000);
+  WaitForFlush(3);
+  PutRandomData(3, 300, 1000);
   WaitForFlush(3);
   AssertNumberOfImmutableMemtables({0, 0, 0, 0});
   ASSERT_EQ(CountLiveLogFiles(), 12);
@@ -705,7 +745,28 @@ TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
   Close();
 }
 
-TEST(ColumnFamilyTest, DifferentMergeOperators) {
+TEST_F(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+  Open();
+  auto* s1 = dbfull()->GetSnapshot();
+  ASSERT_TRUE(s1 != nullptr);
+  dbfull()->ReleaseSnapshot(s1);
+
+  // Add a column family that doesn't support snapshot
+  ColumnFamilyOptions first;
+  first.memtable_factory.reset(NewHashCuckooRepFactory(1024 * 1024));
+  CreateColumnFamilies({"first"}, {first});
+  auto* s2 = dbfull()->GetSnapshot();
+  ASSERT_TRUE(s2 == nullptr);
+
+  // Add a column family that supports snapshot. Snapshot stays not supported.
+  ColumnFamilyOptions second;
+  CreateColumnFamilies({"second"}, {second});
+  auto* s3 = dbfull()->GetSnapshot();
+  ASSERT_TRUE(s3 == nullptr);
+  Close();
+}
+
+TEST_F(ColumnFamilyTest, DifferentMergeOperators) {
   Open();
   CreateColumnFamilies({"first", "second"});
   ColumnFamilyOptions default_cf, first, second;
@@ -735,7 +796,7 @@ TEST(ColumnFamilyTest, DifferentMergeOperators) {
   Close();
 }
 
-TEST(ColumnFamilyTest, DifferentCompactionStyles) {
+TEST_F(ColumnFamilyTest, DifferentCompactionStyles) {
   Open();
   CreateColumnFamilies({"one", "two"});
   ColumnFamilyOptions default_cf, one, two;
@@ -746,12 +807,13 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
   default_cf.num_levels = 3;
   default_cf.write_buffer_size = 64 << 10;  // 64KB
   default_cf.target_file_size_base = 30 << 10;
-  default_cf.filter_policy = nullptr;
-  default_cf.no_block_cache = true;
   default_cf.source_compaction_factor = 100;
-  default_cf.disable_seek_compaction = false;
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   one.compaction_style = kCompactionStyleUniversal;
+  one.num_levels = 1;
   // trigger compaction if there are >= 4 files
   one.level0_file_num_compaction_trigger = 4;
   one.write_buffer_size = 100000;
@@ -764,51 +826,18 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
 
   Reopen({default_cf, one, two});
 
-  // SETUP column family "default" - test read compaction
-  ASSERT_EQ("", FilesPerLevel(0));
-  PutRandomData(0, 1, 4096);
-  ASSERT_OK(Flush(0));
-  ASSERT_EQ("0,0,1", FilesPerLevel(0));
-  // write 8MB
-  PutRandomData(0, 2000, 4096);
-  ASSERT_OK(Flush(0));
-  // clear levels 0 and 1
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
-  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
-  // write some new keys into level 0 and 1
-  PutRandomData(0, 1024, 512);
-  ASSERT_OK(Flush(0));
-  WaitForCompaction();
-  PutRandomData(0, 10, 512);
-  ASSERT_OK(Flush(0));
-  // remember number of files in each level
-  int l1 = NumTableFilesAtLevel(0, 0);
-  int l2 = NumTableFilesAtLevel(1, 0);
-  int l3 = NumTableFilesAtLevel(2, 0);
-  ASSERT_NE(l1, 0);
-  ASSERT_NE(l2, 0);
-  ASSERT_NE(l3, 0);
-
   // SETUP column family "one" -- universal style
   for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
     PutRandomData(1, 11, 10000);
     WaitForFlush(1);
-    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
+    ASSERT_EQ(ToString(i + 1), FilesPerLevel(1));
   }
 
   // SETUP column family "two" -- level style with 4 levels
   for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
     PutRandomData(2, 15, 10000);
     WaitForFlush(2);
-    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
-  }
-
-  // TRIGGER compaction "default"
-  // read a bunch of times, trigger read compaction
-  for (int i = 0; i < 200000; ++i) {
-    Get(0, std::to_string(i));
+    ASSERT_EQ(ToString(i + 1), FilesPerLevel(2));
   }
 
   // TRIGGER compaction "one"
@@ -820,13 +849,6 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
   // WAIT for compactions
   WaitForCompaction();
 
-  // VERIFY compaction "default"
-  // verify that the number of files have decreased
-  // in some level, indicating that there was a compaction
-  ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
-              NumTableFilesAtLevel(1, 0) < l2 ||
-              NumTableFilesAtLevel(2, 0) < l3);
-
   // VERIFY compaction "one"
   ASSERT_EQ("1", FilesPerLevel(1));
 
@@ -850,7 +872,7 @@ std::string IterStatus(Iterator* iter) {
 }
 }  // anonymous namespace
 
-TEST(ColumnFamilyTest, NewIteratorsTest) {
+TEST_F(ColumnFamilyTest, NewIteratorsTest) {
   // iter == 0 -- no tailing
   // iter == 2 -- tailing
   for (int iter = 0; iter < 2; ++iter) {
@@ -895,9 +917,10 @@ TEST(ColumnFamilyTest, NewIteratorsTest) {
   }
 }
 
-TEST(ColumnFamilyTest, ReadOnlyDBTest) {
+TEST_F(ColumnFamilyTest, ReadOnlyDBTest) {
   Open();
   CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+  ASSERT_OK(Put(0, "a", "b"));
   ASSERT_OK(Put(1, "foo", "bla"));
   ASSERT_OK(Put(2, "foo", "blabla"));
   ASSERT_OK(Put(3, "foo", "blablabla"));
@@ -911,6 +934,29 @@ TEST(ColumnFamilyTest, ReadOnlyDBTest) {
   ASSERT_EQ("bla", Get(1, "foo"));
   ASSERT_EQ("blablablabla", Get(2, "foo"));
 
+
+  // test newiterators
+  {
+    std::vector<Iterator*> iterators;
+    ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators));
+    for (auto it : iterators) {
+      it->SeekToFirst();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+    ASSERT_EQ(IterStatus(iterators[1]), "foo->bla");
+    ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla");
+    for (auto it : iterators) {
+      it->Next();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+    ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+    ASSERT_EQ(IterStatus(iterators[2]), "(invalid)");
+
+    for (auto it : iterators) {
+      delete it;
+    }
+  }
+
   Close();
   // can't open dropped column family
   Status s = OpenReadOnly({"default", "one", "two"});
@@ -921,16 +967,18 @@ TEST(ColumnFamilyTest, ReadOnlyDBTest) {
   ASSERT_TRUE(!s.ok());
 }
 
-TEST(ColumnFamilyTest, DontRollEmptyLogs) {
+TEST_F(ColumnFamilyTest, DontRollEmptyLogs) {
   Open();
   CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
 
   for (size_t i = 0; i < handles_.size(); ++i) {
-    PutRandomData(i, 10, 100);
+    PutRandomData(static_cast<int>(i), 10, 100);
   }
   int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
   // this will trigger the flushes
-  ASSERT_OK(db_->Write(WriteOptions(), nullptr));
+  for (int i = 0; i <= 4; ++i) {
+    ASSERT_OK(Flush(i));
+  }
 
   for (int i = 0; i < 4; ++i) {
     dbfull()->TEST_WaitForFlushMemTable(handles_[i]);
@@ -941,7 +989,7 @@ TEST(ColumnFamilyTest, DontRollEmptyLogs) {
   Close();
 }
 
-TEST(ColumnFamilyTest, FlushStaleColumnFamilies) {
+TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) {
   Open();
   CreateColumnFamilies({"one", "two"});
   ColumnFamilyOptions default_cf, one, two;
@@ -970,8 +1018,97 @@ TEST(ColumnFamilyTest, FlushStaleColumnFamilies) {
   Close();
 }
 
+TEST_F(ColumnFamilyTest, CreateMissingColumnFamilies) {
+  Status s = TryOpen({"one", "two"});
+  ASSERT_TRUE(!s.ok());
+  db_options_.create_missing_column_families = true;
+  s = TryOpen({"default", "one", "two"});
+  ASSERT_TRUE(s.ok());
+  Close();
+}
+
+TEST_F(ColumnFamilyTest, SanitizeOptions) {
+  DBOptions db_options;
+  for (int i = 1; i <= 3; i++) {
+    for (int j = 1; j <= 3; j++) {
+      for (int k = 1; k <= 3; k++) {
+        ColumnFamilyOptions original;
+        original.level0_stop_writes_trigger = i;
+        original.level0_slowdown_writes_trigger = j;
+        original.level0_file_num_compaction_trigger = k;
+        ColumnFamilyOptions result =
+            SanitizeOptions(db_options, nullptr, original);
+        ASSERT_TRUE(result.level0_stop_writes_trigger >=
+                    result.level0_slowdown_writes_trigger);
+        ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+                    result.level0_file_num_compaction_trigger);
+        ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+                    original.level0_file_num_compaction_trigger);
+      }
+    }
+  }
+}
+
+TEST_F(ColumnFamilyTest, ReadDroppedColumnFamily) {
+  // iter 0 -- drop CF, don't reopen
+  // iter 1 -- delete CF, reopen
+  for (int iter = 0; iter < 2; ++iter) {
+    db_options_.create_missing_column_families = true;
+    db_options_.max_open_files = 20;
+    // delete obsolete files always
+    db_options_.delete_obsolete_files_period_micros = 0;
+    Open({"default", "one", "two"});
+    ColumnFamilyOptions options;
+    options.level0_file_num_compaction_trigger = 100;
+    options.level0_slowdown_writes_trigger = 200;
+    options.level0_stop_writes_trigger = 200;
+    options.write_buffer_size = 100000;  // small write buffer size
+    Reopen({options, options, options});
+
+    // 1MB should create ~10 files for each CF
+    int kKeysNum = 10000;
+    PutRandomData(0, kKeysNum, 100);
+    PutRandomData(1, kKeysNum, 100);
+    PutRandomData(2, kKeysNum, 100);
+
+    if (iter == 0) {
+      // Drop CF two
+      ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+    } else {
+      // delete CF two
+      delete handles_[2];
+      handles_[2] = nullptr;
+    }
+
+    // Add bunch more data to other CFs
+    PutRandomData(0, kKeysNum, 100);
+    PutRandomData(1, kKeysNum, 100);
+
+    if (iter == 1) {
+      Reopen();
+    }
+
+    // Since we didn't delete CF handle, RocksDB's contract guarantees that
+    // we're still able to read dropped CF
+    for (int i = 0; i < 3; ++i) {
+      std::unique_ptr<Iterator> iterator(
+          db_->NewIterator(ReadOptions(), handles_[i]));
+      int count = 0;
+      for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+        ASSERT_OK(iterator->status());
+        ++count;
+      }
+      ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
+    }
+
+    Close();
+    Destroy();
+  }
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 0000000..b7255c2
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,105 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class CompactFilesTest : public testing::Test {
+ public:
+  CompactFilesTest() {
+    env_ = Env::Default();
+    db_name_ = test::TmpDir(env_) + "/compact_files_test";
+  }
+
+  std::string db_name_;
+  Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() {}
+
+  virtual void OnFlushCompleted(
+      DB* db, const std::string& column_family_name,
+      const std::string& file_path,
+      bool triggered_writes_slowdown,
+      bool triggered_writes_stop) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+TEST_F(CompactFilesTest, ObsoleteFiles) {
+  Options options;
+  // to trigger compaction more easily
+  const int kWriteBufferSize = 10000;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  // Small slowdown and stop trigger for experimental purpose.
+  options.level0_slowdown_writes_trigger = 20;
+  options.level0_stop_writes_trigger = 20;
+  options.write_buffer_size = kWriteBufferSize;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+
+  // Add listener
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  DestroyDB(db_name_, options);
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  // create couple files
+  for (int i = 1000; i < 2000; ++i) {
+    db->Put(WriteOptions(),
+        std::to_string(i),
+        std::string(kWriteBufferSize / 10, 'a' + (i % 26)));
+  }
+
+  auto l0_files = collector->GetFlushedFiles();
+  CompactionOptions compact_opt;
+  compact_opt.compression = kNoCompression;
+  compact_opt.output_file_size_limit = kWriteBufferSize * 5;
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
+
+  // verify all compaction input files are deleted
+  for (auto fname : l0_files) {
+    ASSERT_TRUE(!env_->FileExists(fname));
+  }
+  delete db;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction.cc b/src/rocksdb/db/compaction.cc
index bafb5b4..7ece0c4 100644
--- a/src/rocksdb/db/compaction.cc
+++ b/src/rocksdb/db/compaction.cc
@@ -8,53 +8,119 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "db/compaction.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <vector>
+
 #include "db/column_family.h"
+#include "util/logging.h"
+#include "util/sync_point.h"
 
 namespace rocksdb {
 
-static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
   uint64_t sum = 0;
   for (size_t i = 0; i < files.size() && files[i]; i++) {
-    sum += files[i]->file_size;
+    sum += files[i]->fd.GetFileSize();
   }
   return sum;
 }
 
-Compaction::Compaction(Version* input_version, int level, int out_level,
-                       uint64_t target_file_size,
-                       uint64_t max_grandparent_overlap_bytes,
-                       bool seek_compaction, bool enable_compression)
-    : level_(level),
-      out_level_(out_level),
-      max_output_file_size_(target_file_size),
-      max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
-      input_version_(input_version),
-      number_levels_(input_version_->NumberLevels()),
-      cfd_(input_version_->cfd_),
-      seek_compaction_(seek_compaction),
-      enable_compression_(enable_compression),
+void Compaction::SetInputVersion(Version* _input_version) {
+  input_version_ = _input_version;
+  cfd_ = input_version_->cfd();
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_.SetColumnFamily(cfd_->GetID());
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  if (inputs[0].level == 0 &&
+      inputs[0].files.back() != vstorage->LevelFiles(0).back()) {
+    return false;
+  }
+
+  // checks whether there are files living beyond the output_level.
+  for (int i = output_level + 1; i < vstorage->num_levels(); i++) {
+    if (vstorage->NumLevelFiles(i) > 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool Compaction::IsFullCompaction(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  int num_files_in_compaction = 0;
+  int total_num_files = 0;
+  for (int l = 0; l < vstorage->num_levels(); l++) {
+    total_num_files += vstorage->NumLevelFiles(l);
+  }
+  for (size_t i = 0; i < inputs.size(); i++) {
+    num_files_in_compaction += inputs[i].size();
+  }
+  return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(VersionStorageInfo* vstorage,
+                       const MutableCFOptions& _mutable_cf_options,
+                       std::vector<CompactionInputFiles> _inputs,
+                       int _output_level, uint64_t _target_file_size,
+                       uint64_t _max_grandparent_overlap_bytes,
+                       uint32_t _output_path_id, CompressionType _compression,
+                       std::vector<FileMetaData*> _grandparents,
+                       bool _manual_compaction, double _score,
+                       bool _deletion_compaction)
+    : start_level_(_inputs[0].level),
+      output_level_(_output_level),
+      max_output_file_size_(_target_file_size),
+      max_grandparent_overlap_bytes_(_max_grandparent_overlap_bytes),
+      mutable_cf_options_(_mutable_cf_options),
+      input_version_(nullptr),
+      number_levels_(vstorage->num_levels()),
+      cfd_(nullptr),
+      output_path_id_(_output_path_id),
+      output_compression_(_compression),
+      deletion_compaction_(_deletion_compaction),
+      inputs_(std::move(_inputs)),
+      grandparents_(std::move(_grandparents)),
       grandparent_index_(0),
       seen_key_(false),
       overlapped_bytes_(0),
-      base_index_(-1),
-      parent_index_(-1),
-      score_(0),
-      bottommost_level_(false),
-      is_full_compaction_(false),
-      is_manual_compaction_(false),
-      level_ptrs_(std::vector<size_t>(number_levels_)) {
+      score_(_score),
+      bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+      is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+      is_manual_compaction_(_manual_compaction),
+      level_ptrs_(std::vector<size_t>(number_levels_, 0)) {
+  MarkFilesBeingCompacted(true);
 
-  cfd_->Ref();
-  input_version_->Ref();
-  edit_ = new VersionEdit();
-  edit_->SetColumnFamily(cfd_->GetID());
-  for (int i = 0; i < number_levels_; i++) {
-    level_ptrs_[i] = 0;
+#ifndef NDEBUG
+  for (size_t i = 1; i < inputs_.size(); ++i) {
+    assert(inputs_[i].level > inputs_[i - 1].level);
+  }
+#endif
+
+  // setup input_levels_
+  {
+    input_levels_.resize(num_input_levels());
+    for (size_t which = 0; which < num_input_levels(); which++) {
+      DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+                                &arena_);
+    }
   }
 }
 
 Compaction::~Compaction() {
-  delete edit_;
   if (input_version_ != nullptr) {
     input_version_->Unref();
   }
@@ -65,40 +131,57 @@ Compaction::~Compaction() {
   }
 }
 
+bool Compaction::InputCompressionMatchesOutput() const {
+  int base_level = input_version_->storage_info()->base_level();
+  bool matches = (GetCompressionType(*cfd_->ioptions(), start_level_,
+                                     base_level) == output_compression_);
+  if (matches) {
+    TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+    return true;
+  }
+  TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+  return matches;
+}
+
 bool Compaction::IsTrivialMove() const {
   // Avoid a move if there is lots of overlapping grandparent data.
   // Otherwise, the move could create a parent file that will require
   // a very expensive merge later on.
-  // If level_== out_level_, the purpose is to force compaction filter to be
-  // applied to that level, and thus cannot be a trivia move.
-  return (level_ != out_level_ &&
+  // If start_level_== output_level_, the purpose is to force compaction
+  // filter to be applied to that level, and thus cannot be a trivia move.
+  return (start_level_ != output_level_ && num_input_levels() == 1 &&
           num_input_files(0) == 1 &&
-          num_input_files(1) == 0 &&
+          input(0, 0)->fd.GetPathId() == GetOutputPathId() &&
+          InputCompressionMatchesOutput() &&
           TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
 }
 
-void Compaction::AddInputDeletions(VersionEdit* edit) {
-  for (int which = 0; which < 2; which++) {
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+  for (size_t which = 0; which < num_input_levels(); which++) {
     for (size_t i = 0; i < inputs_[which].size(); i++) {
-      edit->DeleteFile(level_ + which, inputs_[which][i]->number);
+      out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
     }
   }
 }
 
-bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
-  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
+  assert(input_version_ != nullptr);
+  assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
     return bottommost_level_;
   }
   // Maybe use binary search to find right entry instead of linear search?
   const Comparator* user_cmp = cfd_->user_comparator();
-  for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
-    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+  for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+    const std::vector<FileMetaData*>& files =
+        input_version_->storage_info()->LevelFiles(lvl);
     for (; level_ptrs_[lvl] < files.size(); ) {
       FileMetaData* f = files[level_ptrs_[lvl]];
       if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
         // We've advanced far enough
         if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
-          // Key falls in this file's range, so definitely not base level
+          // Key falls in this file's range, so definitely
+          // exists beyond output level
           return false;
         }
         break;
@@ -116,7 +199,7 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {
       icmp->Compare(internal_key,
                     grandparents_[grandparent_index_]->largest.Encode()) > 0) {
     if (seen_key_) {
-      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
+      overlapped_bytes_ += grandparents_[grandparent_index_]->fd.GetFileSize();
     }
     assert(grandparent_index_ + 1 >= grandparents_.size() ||
            icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
@@ -136,126 +219,125 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {
 }
 
 // Mark (or clear) each file that is being compacted
-void Compaction::MarkFilesBeingCompacted(bool value) {
-  for (int i = 0; i < 2; i++) {
-    std::vector<FileMetaData*> v = inputs_[i];
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+  for (size_t i = 0; i < num_input_levels(); i++) {
     for (unsigned int j = 0; j < inputs_[i].size(); j++) {
-      assert(value ? !inputs_[i][j]->being_compacted :
-                      inputs_[i][j]->being_compacted);
-      inputs_[i][j]->being_compacted = value;
+      assert(mark_as_compacted ? !inputs_[i][j]->being_compacted :
+                                  inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = mark_as_compacted;
     }
   }
 }
 
-// Is this compaction producing files at the bottommost level?
-void Compaction::SetupBottomMostLevel(bool isManual) {
-  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
-    // If universal compaction style is used and manual
-    // compaction is occuring, then we are guaranteed that
-    // all files will be picked in a single compaction
-    // run. We can safely set bottommost_level_ = true.
-    // If it is not manual compaction, then bottommost_level_
-    // is already set when the Compaction was created.
-    if (isManual) {
-      bottommost_level_ = true;
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3 at 0 + 2 at 3 + 1 at 4 files to L5"
+const char* Compaction::InputLevelSummary(
+    InputLevelSummaryBuffer* scratch) const {
+  int len = 0;
+  bool is_first = true;
+  for (auto& input_level : inputs_) {
+    if (input_level.empty()) {
+      continue;
     }
-    return;
-  }
-  bottommost_level_ = true;
-  for (int i = output_level() + 1; i < number_levels_; i++) {
-    if (input_version_->NumLevelFiles(i) > 0) {
-      bottommost_level_ = false;
-      break;
+    if (!is_first) {
+      len +=
+          snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+    } else {
+      is_first = false;
     }
+    len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+                    "%zu@%d", input_level.size(), input_level.level);
   }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+           " files to L%d", output_level());
+
+  return scratch->buffer;
 }
 
-void Compaction::ReleaseInputs() {
-  if (input_version_ != nullptr) {
-    input_version_->Unref();
-    input_version_ = nullptr;
-  }
-  if (cfd_ != nullptr) {
-    if (cfd_->Unref()) {
-      delete cfd_;
+uint64_t Compaction::CalculateTotalInputSize() const {
+  uint64_t size = 0;
+  for (auto& input_level : inputs_) {
+    for (auto f : input_level.files) {
+      size += f->fd.GetFileSize();
     }
-    cfd_ = nullptr;
   }
+  return size;
 }
 
 void Compaction::ReleaseCompactionFiles(Status status) {
+  MarkFilesBeingCompacted(false);
   cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
 }
 
 void Compaction::ResetNextCompactionIndex() {
-  input_version_->ResetNextCompactionIndex(level_);
+  assert(input_version_ != nullptr);
+  input_version_->storage_info()->ResetNextCompactionIndex(start_level_);
 }
 
-/*
-for sizes >=10TB, print "XXTB"
-for sizes >=10GB, print "XXGB"
-etc.
-*/
-static void FileSizeSummary(unsigned long long sz, char* output, int len) {
-  const unsigned long long ull10 = 10;
-  if (sz >= ull10<<40) {
-    snprintf(output, len, "%lluTB", sz>>40);
-  } else if (sz >= ull10<<30) {
-    snprintf(output, len, "%lluGB", sz>>30);
-  } else if (sz >= ull10<<20) {
-    snprintf(output, len, "%lluMB", sz>>20);
-  } else if (sz >= ull10<<10) {
-    snprintf(output, len, "%lluKB", sz>>10);
-  } else {
-    snprintf(output, len, "%lluB", sz);
-  }
-}
-
-static int InputSummary(std::vector<FileMetaData*>& files, char* output,
-                         int len) {
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+                 int len) {
   *output = '\0';
   int write = 0;
   for (unsigned int i = 0; i < files.size(); i++) {
     int sz = len - write;
     int ret;
     char sztxt[16];
-    FileSizeSummary((unsigned long long)files.at(i)->file_size, sztxt, 16);
-    ret = snprintf(output + write, sz, "%lu(%s) ",
-                   (unsigned long)files.at(i)->number,
-                   sztxt);
-    if (ret < 0 || ret >= sz)
-      break;
+    AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+    ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+                   files.at(i)->fd.GetNumber(), sztxt);
+    if (ret < 0 || ret >= sz) break;
     write += ret;
   }
-  return write;
+  // if files.size() is non-zero, overwrite the last space
+  return write - !!files.size();
 }
+}  // namespace
 
 void Compaction::Summary(char* output, int len) {
-  int write = snprintf(output, len,
-      "Base version %lu Base level %d, seek compaction:%d, inputs: [",
-      (unsigned long)input_version_->GetVersionNumber(),
-      level_,
-      seek_compaction_);
+  int write =
+      snprintf(output, len, "Base version %" PRIu64
+                            " Base level %d, inputs: [",
+               input_version_->GetVersionNumber(),
+               start_level_);
   if (write < 0 || write >= len) {
     return;
   }
 
-  write += InputSummary(inputs_[0], output+write, len-write);
-  if (write < 0 || write >= len) {
-    return;
+  for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+    if (level_iter > 0) {
+      write += snprintf(output + write, len - write, "], [");
+      if (write < 0 || write >= len) {
+        return;
+      }
+    }
+    write +=
+        InputSummary(inputs_[level_iter].files, output + write, len - write);
+    if (write < 0 || write >= len) {
+      return;
+    }
   }
 
-  write += snprintf(output+write, len-write, "],[");
-  if (write < 0 || write >= len) {
-    return;
-  }
+  snprintf(output + write, len - write, "]");
+}
 
-  write += InputSummary(inputs_[1], output+write, len-write);
-  if (write < 0 || write >= len) {
-    return;
-  }
+uint64_t Compaction::OutputFilePreallocationSize() {
+  uint64_t preallocation_size = 0;
 
-  snprintf(output+write, len-write, "]");
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel ||
+      output_level() > 0) {
+    preallocation_size = max_output_file_size_;
+  } else {
+    // output_level() == 0
+    assert(num_input_levels() > 0);
+    for (const auto& f : inputs_[0].files) {
+      preallocation_size += f->fd.GetFileSize();
+    }
+  }
+  // Over-estimate slightly so we don't end up just barely crossing
+  // the threshold
+  return preallocation_size * 1.1;
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction.h b/src/rocksdb/db/compaction.h
index 8fd95f9..3bb87c2 100644
--- a/src/rocksdb/db/compaction.h
+++ b/src/rocksdb/db/compaction.h
@@ -8,72 +8,140 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include "util/arena.h"
+#include "util/autovector.h"
+#include "util/mutable_cf_options.h"
 #include "db/version_set.h"
 
 namespace rocksdb {
 
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+  int level;
+  std::vector<FileMetaData*> files;
+  inline bool empty() const { return files.empty(); }
+  inline size_t size() const { return files.size(); }
+  inline void clear() { files.clear(); }
+  inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
 class Version;
 class ColumnFamilyData;
+class VersionStorageInfo;
 
 // A Compaction encapsulates information about a compaction.
 class Compaction {
  public:
+  Compaction(VersionStorageInfo* input_version,
+             const MutableCFOptions& mutable_cf_options,
+             std::vector<CompactionInputFiles> inputs, int output_level,
+             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
+             uint32_t output_path_id, CompressionType compression,
+             std::vector<FileMetaData*> grandparents,
+             bool manual_compaction = false, double score = -1,
+             bool deletion_compaction = false);
+
+  // No copying allowed
+  Compaction(const Compaction&) = delete;
+  void operator=(const Compaction&) = delete;
+
   ~Compaction();
 
-  // Return the level that is being compacted.  Inputs from "level"
-  // will be merged.
-  int level() const { return level_; }
+  // Returns the level associated to the specified compaction input level.
+  // If compaction_input_level is not specified, then input_level is set to 0.
+  int level(size_t compaction_input_level = 0) const {
+    return inputs_[compaction_input_level].level;
+  }
+
+  int start_level() const { return start_level_; }
 
   // Outputs will go to this level
-  int output_level() const { return out_level_; }
+  int output_level() const { return output_level_; }
+
+  // Returns the number of input levels in this compaction.
+  size_t num_input_levels() const { return inputs_.size(); }
 
   // Return the object that holds the edits to the descriptor done
   // by this compaction.
-  VersionEdit* edit() { return edit_; }
-
-  // "which" must be either 0 or 1
-  int num_input_files(int which) const { return inputs_[which].size(); }
+  VersionEdit* edit() { return &edit_; }
+
+  // Returns the number of input files associated to the specified
+  // compaction input level.
+  // The function will return 0 if when "compaction_input_level" < 0
+  // or "compaction_input_level" >= "num_input_levels()".
+  size_t num_input_files(size_t compaction_input_level) const {
+    if (compaction_input_level < inputs_.size()) {
+      return inputs_[compaction_input_level].size();
+    }
+    return 0;
+  }
 
   // Returns input version of the compaction
   Version* input_version() const { return input_version_; }
 
+  // Returns the ColumnFamilyData associated with the compaction.
   ColumnFamilyData* column_family_data() const { return cfd_; }
 
-  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
-  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
-
-  std::vector<FileMetaData*>* inputs(int which) { return &inputs_[which]; }
+  // Returns the file meta data of the 'i'th input file at the
+  // specified compaction input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  FileMetaData* input(size_t compaction_input_level, size_t i) const {
+    assert(compaction_input_level < inputs_.size());
+    return inputs_[compaction_input_level][i];
+  }
+
+  // Returns the list of file meta data of the specified compaction
+  // input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  const std::vector<FileMetaData*>* inputs(size_t compaction_input_level) {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].files;
+  }
+
+  // Returns the LevelFilesBrief of the specified compaction input level.
+  LevelFilesBrief* input_levels(size_t compaction_input_level) {
+    return &input_levels_[compaction_input_level];
+  }
 
   // Maximum size of files to build during this compaction.
   uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
 
-  // Whether compression will be enabled for compaction outputs
-  bool enable_compression() const { return enable_compression_; }
+  // What compression for output
+  CompressionType OutputCompressionType() const { return output_compression_; }
+
+  // Whether need to write output file to second DB path.
+  uint32_t GetOutputPathId() const { return output_path_id_; }
 
   // Is this a trivial compaction that can be implemented by just
   // moving a single input file to the next level (no merging or splitting)
   bool IsTrivialMove() const;
 
+  // If true, then the compaction can be done by simply deleting input files.
+  bool IsDeletionCompaction() const {
+    return deletion_compaction_;
+  }
+
   // Add all inputs to this compaction as delete operations to *edit.
   void AddInputDeletions(VersionEdit* edit);
 
-  // Returns true if the information we have available guarantees that
-  // the compaction is producing data in "level+1" for which no data exists
-  // in levels greater than "level+1".
-  bool IsBaseLevelForKey(const Slice& user_key);
+  // Returns true if the available information we have guarantees that
+  // the input "user_key" does not exist in any level beyond "output_level()".
+  bool KeyNotExistsBeyondOutputLevel(const Slice& user_key);
 
   // Returns true iff we should stop building the current output
   // before processing "internal_key".
   bool ShouldStopBefore(const Slice& internal_key);
 
-  // Release the input version for the compaction, once the compaction
-  // is successful.
-  void ReleaseInputs();
-
   // Clear all files to indicate that they are not being compacted
   // Delete this compaction from the list of running compactions.
   void ReleaseCompactionFiles(Status status);
 
+  // Returns the summary of the compaction in "output" with maximum "len"
+  // in bytes.  The caller is responsible for the memory management of
+  // "output".
   void Summary(char* output, int len);
 
   // Return the score that was used to pick this compaction run.
@@ -88,64 +156,92 @@ class Compaction {
   // Was this compaction triggered manually by the client?
   bool IsManualCompaction() { return is_manual_compaction_; }
 
- private:
-  friend class CompactionPicker;
-  friend class UniversalCompactionPicker;
-  friend class LevelCompactionPicker;
+  // Return the MutableCFOptions that should be used throughout the compaction
+  // procedure
+  const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; }
 
-  Compaction(Version* input_version, int level, int out_level,
-             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
-             bool seek_compaction = false, bool enable_compression = true);
+  // Returns the size in bytes that the output file should be preallocated to.
+  // In level compaction, that is max_file_size_. In universal compaction, that
+  // is the sum of all input file sizes.
+  uint64_t OutputFilePreallocationSize();
+
+  void SetInputVersion(Version* input_version);
+
+  struct InputLevelSummaryBuffer {
+    char buffer[128];
+  };
 
-  int level_;
-  int out_level_; // levels to which output files are stored
+  const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+  uint64_t CalculateTotalInputSize() const;
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+
+ private:
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+  // helper function to determine if compaction with inputs and storage is
+  // bottommost
+  static bool IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+  static bool IsFullCompaction(VersionStorageInfo* vstorage,
+                               const std::vector<CompactionInputFiles>& inputs);
+
+  const int start_level_;    // the lowest level to be compacted
+  const int output_level_;  // levels to which output files are stored
   uint64_t max_output_file_size_;
   uint64_t max_grandparent_overlap_bytes_;
+  MutableCFOptions mutable_cf_options_;
   Version* input_version_;
-  VersionEdit* edit_;
-  int number_levels_;
+  VersionEdit edit_;
+  const int number_levels_;
   ColumnFamilyData* cfd_;
+  Arena arena_;          // Arena used to allocate space for file_levels_
 
-  bool seek_compaction_;
-  bool enable_compression_;
+  const uint32_t output_path_id_;
+  CompressionType output_compression_;
+  // If true, then the comaction can be done by simply deleting input files.
+  const bool deletion_compaction_;
 
-  // Each compaction reads inputs from "level_" and "level_+1"
-  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs
+  // Compaction input files organized by level. Constant after construction
+  const std::vector<CompactionInputFiles> inputs_;
+
+  // A copy of inputs_, organized more closely in memory
+  autovector<LevelFilesBrief, 2> input_levels_;
 
   // State used to check for number of of overlapping grandparent files
-  // (parent == level_ + 1, grandparent == level_ + 2)
+  // (grandparent == "output_level_ + 1")
   std::vector<FileMetaData*> grandparents_;
-  size_t grandparent_index_;  // Index in grandparent_starts_
-  bool seen_key_;             // Some output key has been seen
+  size_t grandparent_index_;   // Index in grandparent_starts_
+  bool seen_key_;              // Some output key has been seen
   uint64_t overlapped_bytes_;  // Bytes of overlap between current output
-                              // and grandparent files
-  int base_index_;   // index of the file in files_[level_]
-  int parent_index_; // index of some file with same range in files_[level_+1]
-  double score_;     // score that was used to pick this compaction.
+                               // and grandparent files
+  const double score_;         // score that was used to pick this compaction.
 
   // Is this compaction creating a file in the bottom most level?
-  bool bottommost_level_;
+  const bool bottommost_level_;
   // Does this compaction include all sst files?
-  bool is_full_compaction_;
+  const bool is_full_compaction_;
 
   // Is this compaction requested by the client?
-  bool is_manual_compaction_;
+  const bool is_manual_compaction_;
 
-  // level_ptrs_ holds indices into input_version_->levels_: our state
-  // is that we are positioned at one of the file ranges for each
-  // higher level than the ones involved in this compaction (i.e. for
-  // all L >= level_ + 2).
+  // "level_ptrs_" holds indices into "input_version_->levels_", where each
+  // index remembers which file of an associated level we are currently used
+  // to check KeyNotExistsBeyondOutputLevel() for deletion operation.
+  // As it is for checking KeyNotExistsBeyondOutputLevel(), it only
+  // records indices for all levels beyond "output_level_".
   std::vector<size_t> level_ptrs_;
 
-  // mark (or clear) all files that are being compacted
-  void MarkFilesBeingCompacted(bool);
-
-  // Initialize whether compaction producing files at the bottommost level
-  void SetupBottomMostLevel(bool isManual);
-
-  // In case of compaction error, reset the nextIndex that is used
-  // to pick up the next file to be compacted from files_by_size_
-  void ResetNextCompactionIndex();
+  // Does input compression match the output compression?
+  bool InputCompressionMatchesOutput() const;
 };
 
+// Utility function
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_job.cc b/src/rocksdb/db/compaction_job.cc
new file mode 100644
index 0000000..12f35cb
--- /dev/null
+++ b/src/rocksdb/db/compaction_job.cc
@@ -0,0 +1,1226 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction_job.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <vector>
+#include <memory>
+#include <list>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/event_logger_helpers.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_helper.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "port/likely.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/block_based_table_factory.h"
+#include "table/merger.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/log_buffer.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/iostats_context_imp.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/sync_point.h"
+#include "util/thread_status_util.h"
+
+namespace rocksdb {
+
+struct CompactionJob::CompactionState {
+  Compaction* const compaction;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint32_t path_id;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+    SequenceNumber smallest_seqno, largest_seqno;
+  };
+  std::vector<Output> outputs;
+
+  // State kept for output being generated
+  std::unique_ptr<WritableFile> outfile;
+  std::unique_ptr<TableBuilder> builder;
+
+  uint64_t total_bytes;
+
+  Output* current_output() { return &outputs[outputs.size() - 1]; }
+
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        total_bytes(0),
+        num_input_records(0),
+        num_output_records(0) {}
+
+  // Create a client visible context of this compaction
+  CompactionFilter::Context GetFilterContextV1() {
+    CompactionFilter::Context context;
+    context.is_full_compaction = compaction->IsFullCompaction();
+    context.is_manual_compaction = compaction->IsManualCompaction();
+    return context;
+  }
+
+  // Create a client visible context of this compaction
+  CompactionFilterContext GetFilterContext() {
+    CompactionFilterContext context;
+    context.is_full_compaction = compaction->IsFullCompaction();
+    context.is_manual_compaction = compaction->IsManualCompaction();
+    return context;
+  }
+
+  std::vector<std::string> key_str_buf_;
+  std::vector<std::string> existing_value_str_buf_;
+  // new_value_buf_ will only be appended if a value changes
+  std::vector<std::string> new_value_buf_;
+  // if values_changed_buf_[i] is true
+  // new_value_buf_ will add a new entry with the changed value
+  std::vector<bool> value_changed_buf_;
+  // to_delete_buf_[i] is true iff key_buf_[i] is deleted
+  std::vector<bool> to_delete_buf_;
+
+  std::vector<std::string> other_key_str_buf_;
+  std::vector<std::string> other_value_str_buf_;
+
+  std::vector<Slice> combined_key_buf_;
+  std::vector<Slice> combined_value_buf_;
+
+  std::string cur_prefix_;
+
+  uint64_t num_input_records;
+  uint64_t num_output_records;
+
+  // Buffers the kv-pair that will be run through compaction filter V2
+  // in the future.
+  void BufferKeyValueSlices(const Slice& key, const Slice& value) {
+    key_str_buf_.emplace_back(key.ToString());
+    existing_value_str_buf_.emplace_back(value.ToString());
+  }
+
+  // Buffers the kv-pair that will not be run through compaction filter V2
+  // in the future.
+  void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) {
+    other_key_str_buf_.emplace_back(key.ToString());
+    other_value_str_buf_.emplace_back(value.ToString());
+  }
+
+  // Add a kv-pair to the combined buffer
+  void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) {
+    // The real strings are stored in the batch buffers
+    combined_key_buf_.emplace_back(key);
+    combined_value_buf_.emplace_back(value);
+  }
+
+  // Merging the two buffers
+  void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) {
+    size_t i = 0;
+    size_t j = 0;
+    size_t total_size = key_str_buf_.size() + other_key_str_buf_.size();
+    combined_key_buf_.reserve(total_size);
+    combined_value_buf_.reserve(total_size);
+
+    while (i + j < total_size) {
+      int comp_res = 0;
+      if (i < key_str_buf_.size() && j < other_key_str_buf_.size()) {
+        comp_res = comparator->Compare(key_str_buf_[i], other_key_str_buf_[j]);
+      } else if (i >= key_str_buf_.size() && j < other_key_str_buf_.size()) {
+        comp_res = 1;
+      } else if (j >= other_key_str_buf_.size() && i < key_str_buf_.size()) {
+        comp_res = -1;
+      }
+      if (comp_res > 0) {
+        AddToCombinedKeyValueSlices(other_key_str_buf_[j],
+                                    other_value_str_buf_[j]);
+        j++;
+      } else if (comp_res < 0) {
+        AddToCombinedKeyValueSlices(key_str_buf_[i],
+                                    existing_value_str_buf_[i]);
+        i++;
+      }
+    }
+  }
+
+  void CleanupBatchBuffer() {
+    to_delete_buf_.clear();
+    key_str_buf_.clear();
+    existing_value_str_buf_.clear();
+    new_value_buf_.clear();
+    value_changed_buf_.clear();
+
+    to_delete_buf_.shrink_to_fit();
+    key_str_buf_.shrink_to_fit();
+    existing_value_str_buf_.shrink_to_fit();
+    new_value_buf_.shrink_to_fit();
+    value_changed_buf_.shrink_to_fit();
+
+    other_key_str_buf_.clear();
+    other_value_str_buf_.clear();
+    other_key_str_buf_.shrink_to_fit();
+    other_value_str_buf_.shrink_to_fit();
+  }
+
+  void CleanupMergedBuffer() {
+    combined_key_buf_.clear();
+    combined_value_buf_.clear();
+    combined_key_buf_.shrink_to_fit();
+    combined_value_buf_.shrink_to_fit();
+  }
+};
+
+CompactionJob::CompactionJob(
+    int job_id, Compaction* compaction, const DBOptions& db_options,
+    const EnvOptions& env_options, VersionSet* versions,
+    std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+    Directory* db_directory, Directory* output_directory, Statistics* stats,
+    std::vector<SequenceNumber> existing_snapshots,
+    std::shared_ptr<Cache> table_cache,
+    std::function<uint64_t()> yield_callback, EventLogger* event_logger,
+    bool paranoid_file_checks)
+    : job_id_(job_id),
+      compact_(new CompactionState(compaction)),
+      compaction_stats_(1),
+      db_options_(db_options),
+      env_options_(env_options),
+      env_(db_options.env),
+      versions_(versions),
+      shutting_down_(shutting_down),
+      log_buffer_(log_buffer),
+      db_directory_(db_directory),
+      output_directory_(output_directory),
+      stats_(stats),
+      existing_snapshots_(std::move(existing_snapshots)),
+      table_cache_(std::move(table_cache)),
+      yield_callback_(std::move(yield_callback)),
+      event_logger_(event_logger),
+      paranoid_file_checks_(paranoid_file_checks) {
+  ThreadStatusUtil::SetColumnFamily(compact_->compaction->column_family_data());
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+  ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+  assert(compact_ == nullptr);
+  ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(
+    Compaction* compaction) {
+  ThreadStatusUtil::SetColumnFamily(
+      compact_->compaction->column_family_data());
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_JOB_ID,
+      job_id_);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+      (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+          compact_->compaction->output_level());
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_PROP_FLAGS,
+      compaction->IsManualCompaction() +
+          (compaction->IsDeletionCompaction() << 1) +
+          (compaction->IsTrivialMove() << 2));
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+      compaction->CalculateTotalInputSize());
+
+  IOSTATS_RESET(bytes_written);
+  IOSTATS_RESET(bytes_read);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+  // Set the thread operation after operation properties
+  // to ensure GetThreadList() can always show them all together.
+  ThreadStatusUtil::SetThreadOperation(
+      ThreadStatus::OP_COMPACTION);
+}
+
+void CompactionJob::Prepare() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PREPARE);
+  compact_->CleanupBatchBuffer();
+  compact_->CleanupMergedBuffer();
+
+  // Generate file_levels_ for compaction berfore making Iterator
+  ColumnFamilyData* cfd __attribute__((unused)) =
+      compact_->compaction->column_family_data();
+  assert(cfd != nullptr);
+
+  assert(cfd->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+  assert(compact_->builder == nullptr);
+  assert(!compact_->outfile);
+
+  visible_at_tip_ = 0;
+  latest_snapshot_ = 0;
+  if (existing_snapshots_.size() == 0) {
+    // optimize for fast path if there are no snapshots
+    visible_at_tip_ = versions_->LastSequence();
+    earliest_snapshot_ = visible_at_tip_;
+  } else {
+    latest_snapshot_ = existing_snapshots_.back();
+    // Add the current seqno as the 'latest' virtual
+    // snapshot to the end of this list.
+    existing_snapshots_.push_back(versions_->LastSequence());
+    earliest_snapshot_ = existing_snapshots_[0];
+  }
+
+  // Is this compaction producing files at the bottommost level?
+  bottommost_level_ = compact_->compaction->BottomMostLevel();
+}
+
+Status CompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+  TEST_SYNC_POINT("CompactionJob::Run():Start");
+  log_buffer_->FlushBufferToLog();
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+
+  auto* compaction = compact_->compaction;
+  // Let's check if anything will get logged. Don't prepare all the info if
+  // we're not logging
+  if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[%s] [JOB %d] Compacting %s, score %.2f", cfd->GetName().c_str(),
+        job_id_, compaction->InputLevelSummary(&inputs_summary),
+        compaction->score());
+    char scratch[2345];
+    compact_->compaction->Summary(scratch, sizeof(scratch));
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[%s] Compaction start summary: %s\n", cfd->GetName().c_str(), scratch);
+    // build event logger report
+    auto stream = event_logger_->Log();
+    stream << "job" << job_id_ << "event"
+           << "compaction_started";
+    for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+      stream << ("files_L" + ToString(compaction->level(i)));
+      stream.StartArray();
+      for (auto f : *compaction->inputs(i)) {
+        stream << f->fd.GetNumber();
+      }
+      stream.EndArray();
+    }
+    stream << "score" << compaction->score() << "input_data_size"
+           << compaction->CalculateTotalInputSize();
+  }
+
+  const uint64_t start_micros = env_->NowMicros();
+  std::unique_ptr<Iterator> input(
+      versions_->MakeInputIterator(compact_->compaction));
+  input->SeekToFirst();
+
+  Status status;
+  ParsedInternalKey ikey;
+  std::unique_ptr<CompactionFilterV2> compaction_filter_from_factory_v2 =
+      nullptr;
+  auto context = compact_->GetFilterContext();
+  compaction_filter_from_factory_v2 =
+      cfd->ioptions()->compaction_filter_factory_v2->CreateCompactionFilterV2(
+          context);
+  auto compaction_filter_v2 = compaction_filter_from_factory_v2.get();
+
+  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+  if (!compaction_filter_v2) {
+    status = ProcessKeyValueCompaction(&imm_micros, input.get(), false);
+  } else {
+    // temp_backup_input always point to the start of the current buffer
+    // temp_backup_input = backup_input;
+    // iterate through input,
+    // 1) buffer ineligible keys and value keys into 2 separate buffers;
+    // 2) send value_buffer to compaction filter and alternate the values;
+    // 3) merge value_buffer with ineligible_value_buffer;
+    // 4) run the modified "compaction" using the old for loop.
+    bool prefix_initialized = false;
+    shared_ptr<Iterator> backup_input(
+        versions_->MakeInputIterator(compact_->compaction));
+    backup_input->SeekToFirst();
+    uint64_t total_filter_time = 0;
+    while (backup_input->Valid() &&
+           !shutting_down_->load(std::memory_order_acquire) &&
+           !cfd->IsDropped()) {
+      // FLUSH preempts compaction
+      // TODO(icanadi) this currently only checks if flush is necessary on
+      // compacting column family. we should also check if flush is necessary on
+      // other column families, too
+
+      imm_micros += yield_callback_();
+
+      Slice key = backup_input->key();
+      Slice value = backup_input->value();
+
+      if (!ParseInternalKey(key, &ikey)) {
+        // log error
+        Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+            "[%s] [JOB %d] Failed to parse key: %s", cfd->GetName().c_str(),
+            job_id_, key.ToString().c_str());
+        continue;
+      } else {
+        const SliceTransform* transformer =
+            cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor();
+        const auto key_prefix = transformer->Transform(ikey.user_key);
+        if (!prefix_initialized) {
+          compact_->cur_prefix_ = key_prefix.ToString();
+          prefix_initialized = true;
+        }
+        // If the prefix remains the same, keep buffering
+        if (key_prefix.compare(Slice(compact_->cur_prefix_)) == 0) {
+          // Apply the compaction filter V2 to all the kv pairs sharing
+          // the same prefix
+          if (ikey.type == kTypeValue &&
+              (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
+            // Buffer all keys sharing the same prefix for CompactionFilterV2
+            // Iterate through keys to check prefix
+            compact_->BufferKeyValueSlices(key, value);
+          } else {
+            // buffer ineligible keys
+            compact_->BufferOtherKeyValueSlices(key, value);
+          }
+          backup_input->Next();
+          continue;
+          // finish changing values for eligible keys
+        } else {
+          // Now prefix changes, this batch is done.
+          // Call compaction filter on the buffered values to change the value
+          if (compact_->key_str_buf_.size() > 0) {
+            uint64_t time = 0;
+            CallCompactionFilterV2(compaction_filter_v2, &time);
+            total_filter_time += time;
+          }
+          compact_->cur_prefix_ = key_prefix.ToString();
+        }
+      }
+
+      // Merge this batch of data (values + ineligible keys)
+      compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+
+      // Done buffering for the current prefix. Spit it out to disk
+      // Now just iterate through all the kv-pairs
+      status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
+
+      if (!status.ok()) {
+        break;
+      }
+
+      // After writing the kv-pairs, we can safely remove the reference
+      // to the string buffer and clean them up
+      compact_->CleanupBatchBuffer();
+      compact_->CleanupMergedBuffer();
+      // Buffer the key that triggers the mismatch in prefix
+      if (ikey.type == kTypeValue &&
+          (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
+        compact_->BufferKeyValueSlices(key, value);
+      } else {
+        compact_->BufferOtherKeyValueSlices(key, value);
+      }
+      backup_input->Next();
+      if (!backup_input->Valid()) {
+        // If this is the single last value, we need to merge it.
+        if (compact_->key_str_buf_.size() > 0) {
+          uint64_t time = 0;
+          CallCompactionFilterV2(compaction_filter_v2, &time);
+          total_filter_time += time;
+        }
+        compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+
+        status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
+        if (!status.ok()) {
+          break;
+        }
+
+        compact_->CleanupBatchBuffer();
+        compact_->CleanupMergedBuffer();
+      }
+    }  // done processing all prefix batches
+    // finish the last batch
+    if (status.ok()) {
+      if (compact_->key_str_buf_.size() > 0) {
+        uint64_t time = 0;
+        CallCompactionFilterV2(compaction_filter_v2, &time);
+        total_filter_time += time;
+      }
+      compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+      status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
+    }
+    RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, total_filter_time);
+  }  // checking for compaction filter v2
+
+  if (status.ok() &&
+      (shutting_down_->load(std::memory_order_acquire) || cfd->IsDropped())) {
+    status = Status::ShutdownInProgress(
+        "Database shutdown or Column family drop during compaction");
+  }
+  if (status.ok() && compact_->builder != nullptr) {
+    status = FinishCompactionOutputFile(input.get());
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  input.reset();
+
+  if (output_directory_ && !db_options_.disableDataSync) {
+    output_directory_->Fsync();
+  }
+
+  compaction_stats_.micros = env_->NowMicros() - start_micros - imm_micros;
+  compaction_stats_.files_in_leveln =
+      static_cast<int>(compact_->compaction->num_input_files(0));
+  compaction_stats_.files_in_levelnp1 =
+      static_cast<int>(compact_->compaction->num_input_files(1));
+  MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros);
+
+  size_t num_output_files = compact_->outputs.size();
+  if (compact_->builder != nullptr) {
+    // An error occurred so ignore the last output.
+    assert(num_output_files > 0);
+    --num_output_files;
+  }
+  compaction_stats_.files_out_levelnp1 = static_cast<int>(num_output_files);
+
+  for (size_t i = 0; i < compact_->compaction->num_input_files(0); i++) {
+    compaction_stats_.bytes_readn +=
+        compact_->compaction->input(0, i)->fd.GetFileSize();
+    compaction_stats_.num_input_records +=
+        static_cast<uint64_t>(compact_->compaction->input(0, i)->num_entries);
+  }
+
+  for (size_t i = 0; i < compact_->compaction->num_input_files(1); i++) {
+    compaction_stats_.bytes_readnp1 +=
+        compact_->compaction->input(1, i)->fd.GetFileSize();
+  }
+
+  for (size_t i = 0; i < num_output_files; i++) {
+    compaction_stats_.bytes_written += compact_->outputs[i].file_size;
+  }
+  if (compact_->num_input_records > compact_->num_output_records) {
+    compaction_stats_.num_dropped_records +=
+        compact_->num_input_records - compact_->num_output_records;
+  }
+
+  RecordCompactionIOStats();
+
+  LogFlush(db_options_.info_log);
+  TEST_SYNC_POINT("CompactionJob::Run():End");
+  return status;
+}
+
+void CompactionJob::Install(Status* status,
+                            const MutableCFOptions& mutable_cf_options,
+                            InstrumentedMutex* db_mutex) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_INSTALL);
+  db_mutex->AssertHeld();
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  cfd->internal_stats()->AddCompactionStats(
+      compact_->compaction->output_level(), compaction_stats_);
+
+  if (status->ok()) {
+    *status = InstallCompactionResults(db_mutex, mutable_cf_options);
+  }
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  auto vstorage = cfd->current()->storage_info();
+  const auto& stats = compaction_stats_;
+  LogToBuffer(log_buffer_,
+              "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+              "files in(%d, %d) out(%d) "
+              "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+              "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n",
+              cfd->GetName().c_str(), vstorage->LevelSummary(&tmp),
+              (stats.bytes_readn + stats.bytes_readnp1) /
+                  static_cast<double>(stats.micros),
+              stats.bytes_written / static_cast<double>(stats.micros),
+              compact_->compaction->output_level(), stats.files_in_leveln,
+              stats.files_in_levelnp1, stats.files_out_levelnp1,
+              stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
+              stats.bytes_written / 1048576.0,
+              (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
+                  static_cast<double>(stats.bytes_readn),
+              stats.bytes_written / static_cast<double>(stats.bytes_readn),
+              status->ToString().c_str(), stats.num_input_records,
+              stats.num_dropped_records);
+
+  auto stream = event_logger_->LogToBuffer(log_buffer_);
+  stream << "job" << job_id_ << "event"
+         << "compaction_finished"
+         << "output_level" << compact_->compaction->output_level()
+         << "num_output_files" << compact_->outputs.size()
+         << "total_output_size" << compact_->total_bytes << "num_input_records"
+         << compact_->num_input_records << "num_output_records"
+         << compact_->num_output_records;
+  stream << "lsm_state";
+  stream.StartArray();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  CleanupCompaction(*status);
+}
+
+Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros,
+                                                Iterator* input,
+                                                bool is_compaction_v2) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+  size_t combined_idx = 0;
+  Status status;
+  std::string compaction_filter_value;
+  ParsedInternalKey ikey;
+  IterKey current_user_key;
+  bool has_current_user_key = false;
+  IterKey delete_key;
+  SequenceNumber last_sequence_for_key __attribute__((unused)) =
+      kMaxSequenceNumber;
+  SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  MergeHelper merge(cfd->user_comparator(), cfd->ioptions()->merge_operator,
+                    db_options_.info_log.get(),
+                    cfd->ioptions()->min_partial_merge_operands,
+                    false /* internal key corruption is expected */);
+  auto compaction_filter = cfd->ioptions()->compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+  if (!compaction_filter) {
+    auto context = compact_->GetFilterContextV1();
+    compaction_filter_from_factory =
+        cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+            context);
+    compaction_filter = compaction_filter_from_factory.get();
+  }
+
+  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+
+  int64_t key_drop_user = 0;
+  int64_t key_drop_newer_entry = 0;
+  int64_t key_drop_obsolete = 0;
+  int64_t loop_cnt = 0;
+
+  StopWatchNano timer(env_, stats_ != nullptr);
+  uint64_t total_filter_time = 0;
+  while (input->Valid() && !shutting_down_->load(std::memory_order_acquire) &&
+         !cfd->IsDropped() && status.ok()) {
+    compact_->num_input_records++;
+    if (++loop_cnt > 1000) {
+      if (key_drop_user > 0) {
+        RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
+        key_drop_user = 0;
+      }
+      if (key_drop_newer_entry > 0) {
+        RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+                   key_drop_newer_entry);
+        key_drop_newer_entry = 0;
+      }
+      if (key_drop_obsolete > 0) {
+        RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
+        key_drop_obsolete = 0;
+      }
+      RecordCompactionIOStats();
+      loop_cnt = 0;
+    }
+    // FLUSH preempts compaction
+    // TODO(icanadi) this currently only checks if flush is necessary on
+    // compacting column family. we should also check if flush is necessary on
+    // other column families, too
+    (*imm_micros) += yield_callback_();
+
+    Slice key;
+    Slice value;
+    // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch.
+    // This prefix batch should contain results after calling
+    // compaction_filter_v2.
+    //
+    // If is_compaction_v2 is off, this function will go through all the
+    // kv-pairs in input.
+    if (!is_compaction_v2) {
+      key = input->key();
+      value = input->value();
+    } else {
+      if (combined_idx >= compact_->combined_key_buf_.size()) {
+        break;
+      }
+      assert(combined_idx < compact_->combined_key_buf_.size());
+      key = compact_->combined_key_buf_[combined_idx];
+      value = compact_->combined_value_buf_[combined_idx];
+
+      ++combined_idx;
+    }
+
+    if (compact_->compaction->ShouldStopBefore(key) &&
+        compact_->builder != nullptr) {
+      status = FinishCompactionOutputFile(input);
+      if (!status.ok()) {
+        break;
+      }
+    }
+
+    // Handle key/value, add to state, etc.
+    bool drop = false;
+    bool current_entry_is_merging = false;
+    if (!ParseInternalKey(key, &ikey)) {
+      // Do not hide error keys
+      // TODO: error key stays in db forever? Figure out the intention/rationale
+      // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
+      current_user_key.Clear();
+      has_current_user_key = false;
+      last_sequence_for_key = kMaxSequenceNumber;
+      visible_in_snapshot = kMaxSequenceNumber;
+    } else {
+      if (!has_current_user_key ||
+          cfd->user_comparator()->Compare(ikey.user_key,
+                                          current_user_key.GetKey()) != 0) {
+        // First occurrence of this user key
+        current_user_key.SetKey(ikey.user_key);
+        has_current_user_key = true;
+        last_sequence_for_key = kMaxSequenceNumber;
+        visible_in_snapshot = kMaxSequenceNumber;
+        // apply the compaction filter to the first occurrence of the user key
+        if (compaction_filter && !is_compaction_v2 && ikey.type == kTypeValue &&
+            (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
+          // If the user has specified a compaction filter and the sequence
+          // number is greater than any external snapshot, then invoke the
+          // filter.
+          // If the return value of the compaction filter is true, replace
+          // the entry with a delete marker.
+          bool value_changed = false;
+          compaction_filter_value.clear();
+          if (stats_ != nullptr) {
+            timer.Start();
+          }
+          bool to_delete = compaction_filter->Filter(
+              compact_->compaction->level(), ikey.user_key, value,
+              &compaction_filter_value, &value_changed);
+          total_filter_time += timer.ElapsedNanos();
+          if (to_delete) {
+            // make a copy of the original key and convert it to a delete
+            delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence,
+                                      kTypeDeletion);
+            // anchor the key again
+            key = delete_key.GetKey();
+            // needed because ikey is backed by key
+            ParseInternalKey(key, &ikey);
+            // no value associated with delete
+            value.clear();
+            ++key_drop_user;
+          } else if (value_changed) {
+            value = compaction_filter_value;
+          }
+        }
+      }
+
+      // If there are no snapshots, then this kv affect visibility at tip.
+      // Otherwise, search though all existing snapshots to find
+      // the earlist snapshot that is affected by this kv.
+      SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
+      SequenceNumber visible =
+          visible_at_tip_
+              ? visible_at_tip_
+              : findEarliestVisibleSnapshot(ikey.sequence, existing_snapshots_,
+                                            &prev_snapshot);
+
+      if (visible_in_snapshot == visible) {
+        // If the earliest snapshot is which this key is visible in
+        // is the same as the visibily of a previous instance of the
+        // same key, then this kv is not visible in any snapshot.
+        // Hidden by an newer entry for same user key
+        // TODO: why not > ?
+        assert(last_sequence_for_key >= ikey.sequence);
+        drop = true;  // (A)
+        ++key_drop_newer_entry;
+      } else if (ikey.type == kTypeDeletion &&
+                 ikey.sequence <= earliest_snapshot_ &&
+                 compact_->compaction->KeyNotExistsBeyondOutputLevel(
+                     ikey.user_key)) {
+        // For this user key:
+        // (1) there is no data in higher levels
+        // (2) data in lower levels will have larger sequence numbers
+        // (3) data in layers that are being compacted here and have
+        //     smaller sequence numbers will be dropped in the next
+        //     few iterations of this loop (by rule (A) above).
+        // Therefore this deletion marker is obsolete and can be dropped.
+        drop = true;
+        ++key_drop_obsolete;
+      } else if (ikey.type == kTypeMerge) {
+        if (!merge.HasOperator()) {
+          LogToBuffer(log_buffer_, "Options::merge_operator is null.");
+          status = Status::InvalidArgument(
+              "merge_operator is not properly initialized.");
+          break;
+        }
+        // We know the merge type entry is not hidden, otherwise we would
+        // have hit (A)
+        // We encapsulate the merge related state machine in a different
+        // object to minimize change to the existing flow. Turn out this
+        // logic could also be nicely re-used for memtable flush purge
+        // optimization in BuildTable.
+        int steps = 0;
+        merge.MergeUntil(input, prev_snapshot, bottommost_level_,
+                         db_options_.statistics.get(), &steps, env_);
+        // Skip the Merge ops
+        combined_idx = combined_idx - 1 + steps;
+
+        current_entry_is_merging = true;
+        if (merge.IsSuccess()) {
+          // Successfully found Put/Delete/(end-of-key-range) while merging
+          // Get the merge result
+          key = merge.key();
+          ParseInternalKey(key, &ikey);
+          value = merge.value();
+        } else {
+          // Did not find a Put/Delete/(end-of-key-range) while merging
+          // We now have some stack of merge operands to write out.
+          // NOTE: key,value, and ikey are now referring to old entries.
+          //       These will be correctly set below.
+          assert(!merge.keys().empty());
+          assert(merge.keys().size() == merge.values().size());
+
+          // Hack to make sure last_sequence_for_key is correct
+          ParseInternalKey(merge.keys().front(), &ikey);
+        }
+      }
+
+      last_sequence_for_key = ikey.sequence;
+      visible_in_snapshot = visible;
+    }
+
+    if (!drop) {
+      // We may write a single key (e.g.: for Put/Delete or successful merge).
+      // Or we may instead have to write a sequence/list of keys.
+      // We have to write a sequence iff we have an unsuccessful merge
+      bool has_merge_list = current_entry_is_merging && !merge.IsSuccess();
+      const std::deque<std::string>* keys = nullptr;
+      const std::deque<std::string>* values = nullptr;
+      std::deque<std::string>::const_reverse_iterator key_iter;
+      std::deque<std::string>::const_reverse_iterator value_iter;
+      if (has_merge_list) {
+        keys = &merge.keys();
+        values = &merge.values();
+        key_iter = keys->rbegin();  // The back (*rbegin()) is the first key
+        value_iter = values->rbegin();
+
+        key = Slice(*key_iter);
+        value = Slice(*value_iter);
+      }
+
+      // If we have a list of keys to write, traverse the list.
+      // If we have a single key to write, simply write that key.
+      while (true) {
+        // Invariant: key,value,ikey will always be the next entry to write
+        char* kptr = (char*)key.data();
+        std::string kstr;
+
+        // Zeroing out the sequence number leads to better compression.
+        // If this is the bottommost level (no files in lower levels)
+        // and the earliest snapshot is larger than this seqno
+        // then we can squash the seqno to zero.
+        if (bottommost_level_ && ikey.sequence < earliest_snapshot_ &&
+            ikey.type != kTypeMerge) {
+          assert(ikey.type != kTypeDeletion);
+          // make a copy because updating in place would cause problems
+          // with the priority queue that is managing the input key iterator
+          kstr.assign(key.data(), key.size());
+          kptr = (char*)kstr.c_str();
+          UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type);
+        }
+
+        Slice newkey(kptr, key.size());
+        assert((key.clear(), 1));  // we do not need 'key' anymore
+
+        // Open output file if necessary
+        if (compact_->builder == nullptr) {
+          status = OpenCompactionOutputFile();
+          if (!status.ok()) {
+            break;
+          }
+        }
+
+        SequenceNumber seqno = GetInternalKeySeqno(newkey);
+        if (compact_->builder->NumEntries() == 0) {
+          compact_->current_output()->smallest.DecodeFrom(newkey);
+          compact_->current_output()->smallest_seqno = seqno;
+        } else {
+          compact_->current_output()->smallest_seqno =
+              std::min(compact_->current_output()->smallest_seqno, seqno);
+        }
+        compact_->current_output()->largest.DecodeFrom(newkey);
+        compact_->builder->Add(newkey, value);
+        compact_->num_output_records++,
+            compact_->current_output()->largest_seqno =
+                std::max(compact_->current_output()->largest_seqno, seqno);
+
+        // Close output file if it is big enough
+        if (compact_->builder->FileSize() >=
+            compact_->compaction->MaxOutputFileSize()) {
+          status = FinishCompactionOutputFile(input);
+          if (!status.ok()) {
+            break;
+          }
+        }
+
+        // If we have a list of entries, move to next element
+        // If we only had one entry, then break the loop.
+        if (has_merge_list) {
+          ++key_iter;
+          ++value_iter;
+
+          // If at end of list
+          if (key_iter == keys->rend() || value_iter == values->rend()) {
+            // Sanity Check: if one ends, then both end
+            assert(key_iter == keys->rend() && value_iter == values->rend());
+            break;
+          }
+
+          // Otherwise not at end of list. Update key, value, and ikey.
+          key = Slice(*key_iter);
+          value = Slice(*value_iter);
+          ParseInternalKey(key, &ikey);
+
+        } else {
+          // Only had one item to begin with (Put/Delete)
+          break;
+        }
+      }  // while (true)
+    }    // if (!drop)
+
+    // MergeUntil has moved input to the next entry
+    if (!current_entry_is_merging) {
+      input->Next();
+    }
+  }
+  RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, total_filter_time);
+  if (key_drop_user > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
+  }
+  if (key_drop_newer_entry > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry);
+  }
+  if (key_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
+  }
+  RecordCompactionIOStats();
+
+  return status;
+}
+
+void CompactionJob::CallCompactionFilterV2(
+    CompactionFilterV2* compaction_filter_v2, uint64_t* time) {
+  if (compact_ == nullptr || compaction_filter_v2 == nullptr) {
+    return;
+  }
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_FILTER_V2);
+
+  // Assemble slice vectors for user keys and existing values.
+  // We also keep track of our parsed internal key structs because
+  // we may need to access the sequence number in the event that
+  // keys are garbage collected during the filter process.
+  std::vector<ParsedInternalKey> ikey_buf;
+  std::vector<Slice> user_key_buf;
+  std::vector<Slice> existing_value_buf;
+
+  for (const auto& key : compact_->key_str_buf_) {
+    ParsedInternalKey ikey;
+    ParseInternalKey(Slice(key), &ikey);
+    ikey_buf.emplace_back(ikey);
+    user_key_buf.emplace_back(ikey.user_key);
+  }
+  for (const auto& value : compact_->existing_value_str_buf_) {
+    existing_value_buf.emplace_back(Slice(value));
+  }
+
+  // If the user has specified a compaction filter and the sequence
+  // number is greater than any external snapshot, then invoke the
+  // filter.
+  // If the return value of the compaction filter is true, replace
+  // the entry with a delete marker.
+  StopWatchNano timer(env_, stats_ != nullptr);
+  compact_->to_delete_buf_ = compaction_filter_v2->Filter(
+      compact_->compaction->level(), user_key_buf, existing_value_buf,
+      &compact_->new_value_buf_, &compact_->value_changed_buf_);
+  *time = timer.ElapsedNanos();
+  // new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all
+  // kv-pairs in this compaction run needs to be deleted.
+  assert(compact_->to_delete_buf_.size() == compact_->key_str_buf_.size());
+  assert(compact_->to_delete_buf_.size() ==
+         compact_->existing_value_str_buf_.size());
+  assert(compact_->value_changed_buf_.empty() ||
+         compact_->to_delete_buf_.size() ==
+         compact_->value_changed_buf_.size());
+
+  int new_value_idx = 0;
+  for (unsigned int i = 0; i < compact_->to_delete_buf_.size(); ++i) {
+    if (compact_->to_delete_buf_[i]) {
+      // update the string buffer directly
+      // the Slice buffer points to the updated buffer
+      UpdateInternalKey(&compact_->key_str_buf_[i][0],
+                        compact_->key_str_buf_[i].size(), ikey_buf[i].sequence,
+                        kTypeDeletion);
+
+      // no value associated with delete
+      compact_->existing_value_str_buf_[i].clear();
+      RecordTick(stats_, COMPACTION_KEY_DROP_USER);
+    } else if (!compact_->value_changed_buf_.empty() &&
+        compact_->value_changed_buf_[i]) {
+      compact_->existing_value_str_buf_[i] =
+          compact_->new_value_buf_[new_value_idx++];
+    }
+  }  // for
+}
+
+Status CompactionJob::FinishCompactionOutputFile(Iterator* input) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+  assert(compact_ != nullptr);
+  assert(compact_->outfile);
+  assert(compact_->builder != nullptr);
+
+  const uint64_t output_number = compact_->current_output()->number;
+  const uint32_t output_path_id = compact_->current_output()->path_id;
+  assert(output_number != 0);
+
+  TableProperties table_properties;
+  // Check for iterator errors
+  Status s = input->status();
+  const uint64_t current_entries = compact_->builder->NumEntries();
+  if (s.ok()) {
+    s = compact_->builder->Finish();
+  } else {
+    compact_->builder->Abandon();
+  }
+  if (s.ok()) {
+    table_properties = compact_->builder->GetTableProperties();
+  }
+  const uint64_t current_bytes = compact_->builder->FileSize();
+  compact_->current_output()->file_size = current_bytes;
+  compact_->total_bytes += current_bytes;
+  compact_->builder.reset();
+
+  // Finish and check for file errors
+  if (s.ok() && !db_options_.disableDataSync) {
+    if (db_options_.use_fsync) {
+      StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
+      s = compact_->outfile->Fsync();
+    } else {
+      StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
+      s = compact_->outfile->Sync();
+    }
+  }
+  if (s.ok()) {
+    s = compact_->outfile->Close();
+  }
+  compact_->outfile.reset();
+
+  if (s.ok() && current_entries > 0) {
+    // Verify that the table is usable
+    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+    FileDescriptor fd(output_number, output_path_id, current_bytes);
+    Iterator* iter = cfd->table_cache()->NewIterator(
+        ReadOptions(), env_options_, cfd->internal_comparator(), fd);
+    s = iter->status();
+
+    if (s.ok() && paranoid_file_checks_) {
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {}
+      s = iter->status();
+    }
+
+    delete iter;
+    if (s.ok()) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+          " keys, %" PRIu64 " bytes",
+          cfd->GetName().c_str(), job_id_, output_number, current_entries,
+          current_bytes);
+      EventLoggerHelpers::LogTableFileCreation(event_logger_, job_id_,
+                                               output_number, current_bytes,
+                                               table_properties);
+    }
+  }
+  return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+    InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options) {
+  db_mutex->AssertHeld();
+
+  auto* compaction = compact_->compaction;
+  // paranoia: verify that the files that we started with
+  // still exist in the current version and in the same original level.
+  // This ensures that a concurrent compaction did not erroneously
+  // pick the same files to compact_.
+  if (!versions_->VerifyCompactionFileConsistency(compaction)) {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "[%s] [JOB %d] Compaction %s aborted",
+        compaction->column_family_data()->GetName().c_str(), job_id_,
+        compaction->InputLevelSummary(&inputs_summary));
+    return Status::Corruption("Compaction input files inconsistent");
+  }
+
+  {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+        compaction->column_family_data()->GetName().c_str(), job_id_,
+        compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes);
+  }
+
+  // Add compaction outputs
+  compaction->AddInputDeletions(compact_->compaction->edit());
+  for (size_t i = 0; i < compact_->outputs.size(); i++) {
+    const CompactionState::Output& out = compact_->outputs[i];
+    compaction->edit()->AddFile(
+        compaction->output_level(), out.number, out.path_id, out.file_size,
+        out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
+  }
+  return versions_->LogAndApply(compaction->column_family_data(),
+                                mutable_cf_options, compaction->edit(),
+                                db_mutex, db_directory_);
+}
+
+// Given a sequence number, return the sequence number of the
+// earliest snapshot that this sequence number is visible in.
+// The snapshots themselves are arranged in ascending order of
+// sequence numbers.
+// Employ a sequential search because the total number of
+// snapshots are typically small.
+inline SequenceNumber CompactionJob::findEarliestVisibleSnapshot(
+    SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
+    SequenceNumber* prev_snapshot) {
+  assert(snapshots.size());
+  SequenceNumber prev __attribute__((unused)) = 0;
+  for (const auto cur : snapshots) {
+    assert(prev <= cur);
+    if (cur >= in) {
+      *prev_snapshot = prev;
+      return cur;
+    }
+    prev = cur;  // assignment
+    assert(prev);
+  }
+  Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+      "CompactionJob is not able to find snapshot"
+      " with SeqId later than %" PRIu64
+      ": current MaxSeqId is %" PRIu64 "",
+      in, snapshots[snapshots.size() - 1]);
+  assert(0);
+  return 0;
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+  RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+  IOSTATS_RESET(bytes_read);
+  RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+  IOSTATS_RESET(bytes_written);
+}
+
+Status CompactionJob::OpenCompactionOutputFile() {
+  assert(compact_ != nullptr);
+  assert(compact_->builder == nullptr);
+  // no need to lock because VersionSet::next_file_number_ is atomic
+  uint64_t file_number = versions_->NewFileNumber();
+  // Make the output file
+  std::string fname = TableFileName(db_options_.db_paths, file_number,
+                                    compact_->compaction->GetOutputPathId());
+  Status s = env_->NewWritableFile(fname, &compact_->outfile, env_options_);
+
+  if (!s.ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+        " fails at NewWritableFile with status %s",
+        compact_->compaction->column_family_data()->GetName().c_str(), job_id_,
+        file_number, s.ToString().c_str());
+    LogFlush(db_options_.info_log);
+    return s;
+  }
+  CompactionState::Output out;
+  out.number = file_number;
+  out.path_id = compact_->compaction->GetOutputPathId();
+  out.smallest.Clear();
+  out.largest.Clear();
+  out.smallest_seqno = out.largest_seqno = 0;
+
+  compact_->outputs.push_back(out);
+  compact_->outfile->SetIOPriority(Env::IO_LOW);
+  compact_->outfile->SetPreallocationBlockSize(
+      static_cast<size_t>(compact_->compaction->OutputFilePreallocationSize()));
+
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  bool skip_filters = false;
+
+  // If the Column family flag is to only optimize filters for hits,
+  // we can skip creating filters if this is the bottommost_level where
+  // data is going to be found
+  //
+  if (cfd->ioptions()->optimize_filters_for_hits && bottommost_level_) {
+    skip_filters = true;
+  }
+
+  compact_->builder.reset(NewTableBuilder(
+      *cfd->ioptions(), cfd->internal_comparator(),
+      cfd->int_tbl_prop_collector_factories(), compact_->outfile.get(),
+      compact_->compaction->OutputCompressionType(),
+      cfd->ioptions()->compression_opts, skip_filters));
+  LogFlush(db_options_.info_log);
+  return s;
+}
+
+void CompactionJob::CleanupCompaction(const Status& status) {
+  if (compact_->builder != nullptr) {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact_->builder->Abandon();
+    compact_->builder.reset();
+  } else {
+    assert(!status.ok() || compact_->outfile == nullptr);
+  }
+  for (size_t i = 0; i < compact_->outputs.size(); i++) {
+    const CompactionState::Output& out = compact_->outputs[i];
+
+    // If this file was inserted into the table cache then remove
+    // them here because this compaction was not committed.
+    if (!status.ok()) {
+      TableCache::Evict(table_cache_.get(), out.number);
+    }
+  }
+  delete compact_;
+  compact_ = nullptr;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_job.h b/src/rocksdb/db/compaction_job.h
new file mode 100644
index 0000000..d34e4bd
--- /dev/null
+++ b/src/rocksdb/db/compaction_job.h
@@ -0,0 +1,138 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+#include <functional>
+
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "db/memtable_list.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/transaction_log.h"
+#include "util/autovector.h"
+#include "util/event_logger.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+#include "util/scoped_arena_iterator.h"
+#include "db/internal_stats.h"
+#include "db/write_controller.h"
+#include "db/flush_scheduler.h"
+#include "db/write_thread.h"
+#include "db/job_context.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class CompactionJob {
+ public:
+  CompactionJob(int job_id, Compaction* compaction, const DBOptions& db_options,
+                const EnvOptions& env_options, VersionSet* versions,
+                std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+                Directory* db_directory, Directory* output_directory,
+                Statistics* stats,
+                std::vector<SequenceNumber> existing_snapshots,
+                std::shared_ptr<Cache> table_cache,
+                std::function<uint64_t()> yield_callback,
+                EventLogger* event_logger, bool paranoid_file_checks);
+
+  ~CompactionJob();
+
+  // no copy/move
+  CompactionJob(CompactionJob&& job) = delete;
+  CompactionJob(const CompactionJob& job) = delete;
+  CompactionJob& operator=(const CompactionJob& job) = delete;
+
+  // REQUIRED: mutex held
+  void Prepare();
+  // REQUIRED mutex not held
+  Status Run();
+  // REQUIRED: mutex held
+  // status is the return of Run()
+  void Install(Status* status, const MutableCFOptions& mutable_cf_options,
+               InstrumentedMutex* db_mutex);
+
+ private:
+  // update the thread status for starting a compaction.
+  void ReportStartedCompaction(Compaction* compaction);
+  void AllocateCompactionOutputFileNumbers();
+  // Call compaction filter if is_compaction_v2 is not true. Then iterate
+  // through input and compact the kv-pairs
+  Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input,
+                                   bool is_compaction_v2);
+  // Call compaction_filter_v2->Filter() on kv-pairs in compact
+  void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2,
+                              uint64_t* time);
+  Status FinishCompactionOutputFile(Iterator* input);
+  Status InstallCompactionResults(InstrumentedMutex* db_mutex,
+                                  const MutableCFOptions& mutable_cf_options);
+  SequenceNumber findEarliestVisibleSnapshot(
+      SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
+      SequenceNumber* prev_snapshot);
+  void RecordCompactionIOStats();
+  Status OpenCompactionOutputFile();
+  void CleanupCompaction(const Status& status);
+
+  int job_id_;
+
+  // CompactionJob state
+  struct CompactionState;
+  CompactionState* compact_;
+
+  bool bottommost_level_;
+  SequenceNumber earliest_snapshot_;
+  SequenceNumber visible_at_tip_;
+  SequenceNumber latest_snapshot_;
+
+  InternalStats::CompactionStats compaction_stats_;
+
+  // DBImpl state
+  const DBOptions& db_options_;
+  const EnvOptions& env_options_;
+  Env* env_;
+  VersionSet* versions_;
+  std::atomic<bool>* shutting_down_;
+  LogBuffer* log_buffer_;
+  Directory* db_directory_;
+  Directory* output_directory_;
+  Statistics* stats_;
+  // If there were two snapshots with seq numbers s1 and
+  // s2 and s1 < s2, and if we find two instances of a key k1 then lies
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
+  std::vector<SequenceNumber> existing_snapshots_;
+  std::shared_ptr<Cache> table_cache_;
+
+  // yield callback
+  std::function<uint64_t()> yield_callback_;
+
+  EventLogger* event_logger_;
+
+  bool paranoid_file_checks_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_job_test.cc b/src/rocksdb/db/compaction_job_test.cc
new file mode 100644
index 0000000..e4c407a
--- /dev/null
+++ b/src/rocksdb/db/compaction_job_test.cc
@@ -0,0 +1,189 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <string>
+
+#include "db/compaction_job.h"
+#include "db/column_family.h"
+#include "db/version_set.h"
+#include "db/writebuffer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/options.h"
+#include "rocksdb/db.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "table/mock_table.h"
+
+namespace rocksdb {
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public testing::Test {
+ public:
+  CompactionJobTest()
+      : env_(Env::Default()),
+        dbname_(test::TmpDir() + "/compaction_job_test"),
+        mutable_cf_options_(Options(), ImmutableCFOptions(Options())),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_(db_options_.db_write_buffer_size),
+        versions_(new VersionSet(dbname_, &db_options_, env_options_,
+                                 table_cache_.get(), &write_buffer_,
+                                 &write_controller_)),
+        shutting_down_(false),
+        mock_table_factory_(new mock::MockTableFactory()) {
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    NewDB();
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+    EXPECT_OK(versions_->Recover(column_families, false));
+  }
+
+  std::string GenerateFileName(uint64_t file_number) {
+    FileMetaData meta;
+    std::vector<DbPath> db_paths;
+    db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+    meta.fd = FileDescriptor(file_number, 0, 0);
+    return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+  }
+
+  // returns expected result after compaction
+  mock::MockFileContents CreateTwoFiles() {
+    mock::MockFileContents expected_results;
+    const int kKeysPerFile = 10000;
+    SequenceNumber sequence_number = 0;
+    for (int i = 0; i < 2; ++i) {
+      mock::MockFileContents contents;
+      SequenceNumber smallest_seqno = 0, largest_seqno = 0;
+      InternalKey smallest, largest;
+      for (int k = 0; k < kKeysPerFile; ++k) {
+        auto key = ToString(i * (kKeysPerFile / 2) + k);
+        auto value = ToString(i * kKeysPerFile + k);
+        InternalKey internal_key(key, ++sequence_number, kTypeValue);
+        // This is how the key will look like once it's written in bottommost
+        // file
+        InternalKey bottommost_internal_key(key, 0, kTypeValue);
+        if (k == 0) {
+          smallest = internal_key;
+          smallest_seqno = sequence_number;
+        } else if (k == kKeysPerFile - 1) {
+          largest = internal_key;
+          largest_seqno = sequence_number;
+        }
+        std::pair<std::string, std::string> key_value(
+            {bottommost_internal_key.Encode().ToString(), value});
+        contents.insert(key_value);
+        if (i == 1 || k < kKeysPerFile / 2) {
+          expected_results.insert(key_value);
+        }
+      }
+
+      uint64_t file_number = versions_->NewFileNumber();
+      EXPECT_OK(mock_table_factory_->CreateMockTable(
+          env_, GenerateFileName(file_number), std::move(contents)));
+
+      VersionEdit edit;
+      edit.AddFile(0, file_number, 0, 10, smallest, largest, smallest_seqno,
+                   largest_seqno);
+
+      mutex_.Lock();
+      versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                             mutable_cf_options_, &edit, &mutex_);
+      mutex_.Unlock();
+    }
+    versions_->SetLastSequence(sequence_number);
+    return expected_results;
+  }
+
+  void NewDB() {
+    VersionEdit new_db;
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    unique_ptr<WritableFile> file;
+    Status s = env_->NewWritableFile(
+        manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+    ASSERT_OK(s);
+    {
+      log::Writer log(std::move(file));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1, nullptr);
+  }
+
+  Env* env_;
+  std::string dbname_;
+  EnvOptions env_options_;
+  MutableCFOptions mutable_cf_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  DBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  WriteBuffer write_buffer_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+TEST_F(CompactionJobTest, Simple) {
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+  auto expected_results = CreateTwoFiles();
+
+  auto files = cfd->current()->storage_info()->LevelFiles(0);
+  ASSERT_EQ(2U, files.size());
+
+  CompactionInputFiles compaction_input_files;
+  compaction_input_files.level = 0;
+  compaction_input_files.files.push_back(files[0]);
+  compaction_input_files.files.push_back(files[1]);
+  std::unique_ptr<Compaction> compaction(new Compaction(
+      cfd->current()->storage_info(), *cfd->GetLatestMutableCFOptions(),
+      {compaction_input_files}, 1, 1024 * 1024, 10, 0, kNoCompression, {}));
+  compaction->SetInputVersion(cfd->current());
+
+  int yield_callback_called = 0;
+  std::function<uint64_t()> yield_callback = [&]() {
+    yield_callback_called++;
+    return 0;
+  };
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+  mutex_.Lock();
+  EventLogger event_logger(db_options_.info_log.get());
+  CompactionJob compaction_job(0, compaction.get(), db_options_, env_options_,
+                               versions_.get(), &shutting_down_, &log_buffer,
+                               nullptr, nullptr, nullptr, {}, table_cache_,
+                               std::move(yield_callback), &event_logger, false);
+
+  compaction_job.Prepare();
+  mutex_.Unlock();
+  ASSERT_OK(compaction_job.Run());
+  mutex_.Lock();
+  Status s;
+  compaction_job.Install(&s, *cfd->GetLatestMutableCFOptions(), &mutex_);
+  ASSERT_OK(s);
+  mutex_.Unlock();
+
+  mock_table_factory_->AssertLatestFile(expected_results);
+  ASSERT_EQ(yield_callback_called, 20000);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction_picker.cc b/src/rocksdb/db/compaction_picker.cc
index a8700bb..70e4814 100644
--- a/src/rocksdb/db/compaction_picker.cc
+++ b/src/rocksdb/db/compaction_picker.cc
@@ -9,199 +9,176 @@
 
 #include "db/compaction_picker.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/filename.h"
 #include "util/log_buffer.h"
+#include "util/random.h"
 #include "util/statistics.h"
+#include "util/string_util.h"
+#include "util/sync_point.h"
 
 namespace rocksdb {
 
 namespace {
-
-uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files) {
   uint64_t sum = 0;
   for (size_t i = 0; i < files.size() && files[i]; i++) {
-    sum += files[i]->file_size;
+    sum += files[i]->compensated_file_size;
   }
   return sum;
 }
 
-// Multiple two operands. If they overflow, return op1.
-uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
-  if (op1 == 0) {
-    return 0;
-  }
-  if (op2 <= 0) {
-    return op1;
+}  // anonymous namespace
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+                                   int level, int base_level,
+                                   const bool enable_compression) {
+  if (!enable_compression) {
+    // disable compression
+    return kNoCompression;
   }
-  uint64_t casted_op2 = (uint64_t) op2;
-  if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) {
-    return op1;
+  // If the use has specified a different compression level for each level,
+  // then pick the compression for that level.
+  if (!ioptions.compression_per_level.empty()) {
+    assert(level == 0 || level >= base_level);
+    int idx = (level == 0) ? 0 : level - base_level + 1;
+
+    const int n = static_cast<int>(ioptions.compression_per_level.size()) - 1;
+    // It is possible for level_ to be -1; in that case, we use level
+    // 0's compression.  This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to.  Likewise, if level is beyond the end of the
+    // specified compression levels, use the last value.
+    return ioptions.compression_per_level[std::max(0, std::min(idx, n))];
+  } else {
+    return ioptions.compression;
   }
-  return op1 * casted_op2;
 }
 
-}  // anonymous namespace
-
-CompactionPicker::CompactionPicker(const Options* options,
+CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions,
                                    const InternalKeyComparator* icmp)
-    : compactions_in_progress_(options->num_levels),
-      options_(options),
-      num_levels_(options->num_levels),
-      icmp_(icmp) {
-
-  max_file_size_.reset(new uint64_t[NumberLevels()]);
-  level_max_bytes_.reset(new uint64_t[NumberLevels()]);
-  int target_file_size_multiplier = options_->target_file_size_multiplier;
-  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
-  for (int i = 0; i < NumberLevels(); i++) {
-    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
-      max_file_size_[i] = ULLONG_MAX;
-      level_max_bytes_[i] = options_->max_bytes_for_level_base;
-    } else if (i > 1) {
-      max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1],
-                                                target_file_size_multiplier);
-      level_max_bytes_[i] = MultiplyCheckOverflow(
-          MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier),
-          options_->max_bytes_for_level_multiplier_additional[i - 1]);
-    } else {
-      max_file_size_[i] = options_->target_file_size_base;
-      level_max_bytes_[i] = options_->max_bytes_for_level_base;
-    }
-  }
-}
+    : ioptions_(ioptions), icmp_(icmp) {}
 
 CompactionPicker::~CompactionPicker() {}
 
-void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
-  for (int level = 0; level < NumberLevels() - 1; level++) {
-    uint64_t total = 0;
-    for (auto c : compactions_in_progress_[level]) {
-      assert(c->level() == level);
-      for (int i = 0; i < c->num_input_files(0); i++) {
-        total += c->input(0,i)->file_size;
-      }
-    }
-    sizes[level] = total;
-  }
-}
-
-// Clear all files to indicate that they are not being compacted
 // Delete this compaction from the list of running compactions.
 void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
-  c->MarkFilesBeingCompacted(false);
-  compactions_in_progress_[c->level()].erase(c);
+  if (c->start_level() == 0) {
+    level0_compactions_in_progress_.erase(c);
+  }
   if (!status.ok()) {
     c->ResetNextCompactionIndex();
   }
 }
 
-uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
-  assert(level >= 0);
-  assert(level < NumberLevels());
-  return max_file_size_[level];
-}
-
-uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
-  uint64_t result = MaxFileSizeForLevel(level);
-  result *= options_->max_grandparent_overlap_factor;
-  return result;
-}
-
-double CompactionPicker::MaxBytesForLevel(int level) {
-  // Note: the result for level zero is not really used since we set
-  // the level-0 compaction threshold based on number of files.
-  assert(level >= 0);
-  assert(level < NumberLevels());
-  return level_max_bytes_[level];
-}
-
-void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
                                 InternalKey* smallest, InternalKey* largest) {
+  const int level = inputs.level;
   assert(!inputs.empty());
   smallest->Clear();
   largest->Clear();
-  for (size_t i = 0; i < inputs.size(); i++) {
-    FileMetaData* f = inputs[i];
-    if (i == 0) {
-      *smallest = f->smallest;
-      *largest = f->largest;
-    } else {
-      if (icmp_->Compare(f->smallest, *smallest) < 0) {
+
+  if (level == 0) {
+    for (size_t i = 0; i < inputs.size(); i++) {
+      FileMetaData* f = inputs[i];
+      if (i == 0) {
         *smallest = f->smallest;
-      }
-      if (icmp_->Compare(f->largest, *largest) > 0) {
         *largest = f->largest;
+      } else {
+        if (icmp_->Compare(f->smallest, *smallest) < 0) {
+          *smallest = f->smallest;
+        }
+        if (icmp_->Compare(f->largest, *largest) > 0) {
+          *largest = f->largest;
+        }
       }
     }
+  } else {
+    *smallest = inputs[0]->smallest;
+    *largest = inputs[inputs.size() - 1]->largest;
   }
 }
 
-void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
-                                const std::vector<FileMetaData*>& inputs2,
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+                                const CompactionInputFiles& inputs2,
                                 InternalKey* smallest, InternalKey* largest) {
-  std::vector<FileMetaData*> all = inputs1;
-  all.insert(all.end(), inputs2.begin(), inputs2.end());
-  GetRange(all, smallest, largest);
+  assert(!inputs1.empty() || !inputs2.empty());
+  if (inputs1.empty()) {
+    GetRange(inputs2, smallest, largest);
+  } else if (inputs2.empty()) {
+    GetRange(inputs1, smallest, largest);
+  } else {
+    InternalKey smallest1, smallest2, largest1, largest2;
+    GetRange(inputs1, &smallest1, &largest1);
+    GetRange(inputs2, &smallest2, &largest2);
+    *smallest = icmp_->Compare(smallest1, smallest2) < 0 ?
+                smallest1 : smallest2;
+    *largest = icmp_->Compare(largest1, largest2) < 0 ?
+               largest2 : largest1;
+  }
 }
 
-bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
-  // If inputs are empty then there is nothing to expand.
-  if (!c || c->inputs_[0].empty()) {
-    return true;
-  }
+bool CompactionPicker::ExpandWhileOverlapping(const std::string& cf_name,
+                                              VersionStorageInfo* vstorage,
+                                              CompactionInputFiles* inputs) {
+  // This isn't good compaction
+  assert(!inputs->empty());
 
+  const int level = inputs->level;
   // GetOverlappingInputs will always do the right thing for level-0.
   // So we don't need to do any expansion if level == 0.
-  if (c->level() == 0) {
+  if (level == 0) {
     return true;
   }
 
-  const int level = c->level();
   InternalKey smallest, largest;
 
-  // Keep expanding c->inputs_[0] until we are sure that there is a
-  // "clean cut" boundary between the files in input and the surrounding files.
+  // Keep expanding inputs until we are sure that there is a "clean cut"
+  // boundary between the files in input and the surrounding files.
   // This will ensure that no parts of a key are lost during compaction.
   int hint_index = -1;
   size_t old_size;
   do {
-    old_size = c->inputs_[0].size();
-    GetRange(c->inputs_[0], &smallest, &largest);
-    c->inputs_[0].clear();
-    c->input_version_->GetOverlappingInputs(
-        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
-  } while(c->inputs_[0].size() > old_size);
+    old_size = inputs->size();
+    GetRange(*inputs, &smallest, &largest);
+    inputs->clear();
+    vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+                                   hint_index, &hint_index);
+  } while (inputs->size() > old_size);
 
-  // Get the new range
-  GetRange(c->inputs_[0], &smallest, &largest);
+  // we started off with inputs non-empty and the previous loop only grew
+  // inputs. thus, inputs should be non-empty here
+  assert(!inputs->empty());
 
   // If, after the expansion, there are files that are already under
   // compaction, then we must drop/cancel this compaction.
-  int parent_index = -1;
-  if (c->inputs_[0].empty()) {
-    Log(options_->info_log,
-        "[%s] ExpandWhileOverlapping() failure because zero input files",
-        c->column_family_data()->GetName().c_str());
-  }
-  if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) ||
-      (c->level() != c->output_level() &&
-       ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
-                               &parent_index))) {
-    c->inputs_[0].clear();
-    c->inputs_[1].clear();
+  if (FilesInCompaction(inputs->files)) {
+    Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+        "[%s] ExpandWhileOverlapping() failure because some of the necessary"
+        " compaction input files are currently being compacted.",
+        cf_name.c_str());
     return false;
   }
   return true;
 }
 
-uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
-  uint64_t result = MaxFileSizeForLevel(level);
-  result *= options_->expanded_compaction_factor;
-  return result;
-}
-
 // Returns true if any one of specified files are being compacted
-bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
+bool CompactionPicker::FilesInCompaction(
+    const std::vector<FileMetaData*>& files) {
   for (unsigned int i = 0; i < files.size(); i++) {
     if (files[i]->being_compacted) {
       return true;
@@ -210,113 +187,250 @@ bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
   return false;
 }
 
+Compaction* CompactionPicker::FormCompaction(
+    const CompactionOptions& compact_options,
+    const std::vector<CompactionInputFiles>& input_files, int output_level,
+    VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+    uint32_t output_path_id) const {
+  uint64_t max_grandparent_overlap_bytes =
+      output_level + 1 < vstorage->num_levels() ?
+          mutable_cf_options.MaxGrandParentOverlapBytes(output_level + 1) :
+          std::numeric_limits<uint64_t>::max();
+  assert(input_files.size());
+  return new Compaction(vstorage, mutable_cf_options, input_files, output_level,
+                        compact_options.output_file_size_limit,
+                        max_grandparent_overlap_bytes, output_path_id,
+                        compact_options.compression, /* grandparents */ {});
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+    std::vector<CompactionInputFiles>* input_files,
+    std::unordered_set<uint64_t>* input_set,
+    const VersionStorageInfo* vstorage,
+    const CompactionOptions& compact_options) const {
+  if (input_set->size() == 0U) {
+    return Status::InvalidArgument(
+        "Compaction must include at least one file.");
+  }
+  assert(input_files);
+
+  std::vector<CompactionInputFiles> matched_input_files;
+  matched_input_files.resize(vstorage->num_levels());
+  int first_non_empty_level = -1;
+  int last_non_empty_level = -1;
+  // TODO(yhchiang): use a lazy-initialized mapping from
+  //                 file_number to FileMetaData in Version.
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    for (auto file : vstorage->LevelFiles(level)) {
+      auto iter = input_set->find(file->fd.GetNumber());
+      if (iter != input_set->end()) {
+        matched_input_files[level].files.push_back(file);
+        input_set->erase(iter);
+        last_non_empty_level = level;
+        if (first_non_empty_level == -1) {
+          first_non_empty_level = level;
+        }
+      }
+    }
+  }
+
+  if (!input_set->empty()) {
+    std::string message(
+        "Cannot find matched SST files for the following file numbers:");
+    for (auto fn : *input_set) {
+      message += " ";
+      message += ToString(fn);
+    }
+    return Status::InvalidArgument(message);
+  }
+
+  for (int level = first_non_empty_level;
+       level <= last_non_empty_level; ++level) {
+    matched_input_files[level].level = level;
+    input_files->emplace_back(std::move(matched_input_files[level]));
+  }
+
+  return Status::OK();
+}
+
+
+
 // Returns true if any one of the parent files are being compacted
-bool CompactionPicker::ParentRangeInCompaction(Version* version,
-                                               const InternalKey* smallest,
-                                               const InternalKey* largest,
-                                               int level, int* parent_index) {
+bool CompactionPicker::RangeInCompaction(VersionStorageInfo* vstorage,
+                                         const InternalKey* smallest,
+                                         const InternalKey* largest, int level,
+                                         int* level_index) {
   std::vector<FileMetaData*> inputs;
-  assert(level + 1 < NumberLevels());
+  assert(level < NumberLevels());
 
-  version->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
-                                *parent_index, parent_index);
+  vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+                                 *level_index, level_index);
   return FilesInCompaction(inputs);
 }
 
-// Populates the set of inputs from "level+1" that overlap with "level".
-// Will also attempt to expand "level" if that doesn't expand "level+1"
-// or cause "level" to include a file for compaction that has an overlapping
-// user-key with another file.
-void CompactionPicker::SetupOtherInputs(Compaction* c) {
-  // If inputs are empty, then there is nothing to expand.
-  // If both input and output levels are the same, no need to consider
-  // files at level "level+1"
-  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
-    return;
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// Now we assume all levels except start level and output level are empty.
+// Will also attempt to expand "start level" if that doesn't expand
+// "output level" or cause "level" to include a file for compaction that has an
+// overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on parent level are currently in compaction, which
+// means that we can't compact them
+bool CompactionPicker::SetupOtherInputs(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+    CompactionInputFiles* output_level_inputs, int* parent_index,
+    int base_index) {
+  assert(!inputs->empty());
+  assert(output_level_inputs->empty());
+  const int input_level = inputs->level;
+  const int output_level = output_level_inputs->level;
+  assert(input_level != output_level);
+
+  // For now, we only support merging two levels, start level and output level.
+  // We need to assert other levels are empty.
+  for (int l = input_level + 1; l < output_level; l++) {
+    assert(vstorage->NumLevelFiles(l) == 0);
   }
 
-  const int level = c->level();
   InternalKey smallest, largest;
 
   // Get the range one last time.
-  GetRange(c->inputs_[0], &smallest, &largest);
+  GetRange(*inputs, &smallest, &largest);
 
-  // Populate the set of next-level files (inputs_[1]) to include in compaction
-  c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
-                                          &c->inputs_[1], c->parent_index_,
-                                          &c->parent_index_);
+  // Populate the set of next-level files (inputs_GetOutputLevelInputs()) to
+  // include in compaction
+  vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+                                 &output_level_inputs->files, *parent_index,
+                                 parent_index);
 
-  // Get entire range covered by compaction
-  InternalKey all_start, all_limit;
-  GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+  if (FilesInCompaction(output_level_inputs->files)) {
+    return false;
+  }
 
   // See if we can further grow the number of inputs in "level" without
   // changing the number of "level+1" files we pick up. We also choose NOT
   // to expand if this would cause "level" to include some entries for some
   // user key, while excluding other entries for the same user key. This
   // can happen when one user key spans multiple files.
-  if (!c->inputs_[1].empty()) {
-    std::vector<FileMetaData*> expanded0;
-    c->input_version_->GetOverlappingInputs(
-        level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
-    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
-    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
-    const uint64_t expanded0_size = TotalFileSize(expanded0);
-    uint64_t limit = ExpandedCompactionByteSizeLimit(level);
-    if (expanded0.size() > c->inputs_[0].size() &&
+  if (!output_level_inputs->empty()) {
+    CompactionInputFiles expanded0;
+    expanded0.level = input_level;
+    // Get entire range covered by compaction
+    InternalKey all_start, all_limit;
+    GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+
+    vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+                                   &expanded0.files, base_index, nullptr);
+    const uint64_t inputs0_size = TotalCompensatedFileSize(inputs->files);
+    const uint64_t inputs1_size =
+        TotalCompensatedFileSize(output_level_inputs->files);
+    const uint64_t expanded0_size = TotalCompensatedFileSize(expanded0.files);
+    uint64_t limit =
+        mutable_cf_options.ExpandedCompactionByteSizeLimit(input_level);
+    if (expanded0.size() > inputs->size() &&
         inputs1_size + expanded0_size < limit &&
-        !FilesInCompaction(expanded0) &&
-        !c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
+        !FilesInCompaction(expanded0.files) &&
+        !vstorage->HasOverlappingUserKey(&expanded0.files, input_level)) {
       InternalKey new_start, new_limit;
       GetRange(expanded0, &new_start, &new_limit);
       std::vector<FileMetaData*> expanded1;
-      c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
-                                              &expanded1, c->parent_index_,
-                                              &c->parent_index_);
-      if (expanded1.size() == c->inputs_[1].size() &&
+      vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+                                     &expanded1, *parent_index, parent_index);
+      if (expanded1.size() == output_level_inputs->size() &&
           !FilesInCompaction(expanded1)) {
-        Log(options_->info_log,
-            "[%s] Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu "
-            "bytes)\n",
-            c->column_family_data()->GetName().c_str(), (unsigned long)level,
-            (unsigned long)(c->inputs_[0].size()),
-            (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
-            (unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
-            (unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
-            (unsigned long)inputs1_size);
+        Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
+            "[%s] Expanding@%d %zu+%zu (%" PRIu64 "+%" PRIu64
+            " bytes) to %zu+%zu (%" PRIu64 "+%" PRIu64 "bytes)\n",
+            cf_name.c_str(), input_level, inputs->size(),
+            output_level_inputs->size(), inputs0_size, inputs1_size,
+            expanded0.size(), expanded1.size(), expanded0_size, inputs1_size);
         smallest = new_start;
         largest = new_limit;
-        c->inputs_[0] = expanded0;
-        c->inputs_[1] = expanded1;
-        GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+        inputs->files = expanded0.files;
+        output_level_inputs->files = expanded1;
       }
     }
   }
 
+  return true;
+}
+
+void CompactionPicker::GetGrandparents(
+    VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+    const CompactionInputFiles& output_level_inputs,
+    std::vector<FileMetaData*>* grandparents) {
+  InternalKey start, limit;
+  GetRange(inputs, output_level_inputs, &start, &limit);
   // Compute the set of grandparent files that overlap this compaction
   // (parent == level+1; grandparent == level+2)
-  if (level + 2 < NumberLevels()) {
-    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
-                                            &c->grandparents_);
+  if (output_level_inputs.level + 1 < NumberLevels()) {
+    vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start,
+                                   &limit, grandparents);
   }
 }
 
+Compaction* CompactionPicker::CompactRange(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, int input_level, int output_level,
+    uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
+    InternalKey** compaction_end) {
+  // CompactionPickerFIFO has its own implementation of compact range
+  assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+  if (input_level == ColumnFamilyData::kCompactAllLevels) {
+    assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+    // Universal compaction with more than one level always compacts all the
+    // files together to the last level.
+    assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() set output level to be the last level
+    assert(output_level == vstorage->num_levels() - 1);
+    // DBImpl::RunManualCompaction will make full range for universal compaction
+    assert(begin == nullptr);
+    assert(end == nullptr);
+    *compaction_end = nullptr;
 
-Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
-                                           int output_level,
-                                           const InternalKey* begin,
-                                           const InternalKey* end,
-                                           InternalKey** compaction_end) {
-  std::vector<FileMetaData*> inputs;
+    int start_level = 0;
+    for (; start_level < vstorage->num_levels() &&
+           vstorage->NumLevelFiles(start_level) == 0;
+         start_level++) {
+    }
+    if (start_level == vstorage->num_levels()) {
+      return nullptr;
+    }
+
+    std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+                                             start_level);
+    for (int level = start_level; level < vstorage->num_levels(); level++) {
+      inputs[level - start_level].level = level;
+      auto& files = inputs[level - start_level].files;
+      for (FileMetaData* f : vstorage->LevelFiles(level)) {
+        files.push_back(f);
+      }
+    }
+    return new Compaction(
+        vstorage, mutable_cf_options, std::move(inputs), output_level,
+        mutable_cf_options.MaxFileSizeForLevel(output_level),
+        /* max_grandparent_overlap_bytes */ LLONG_MAX, output_path_id,
+        GetCompressionType(ioptions_, output_level, 1),
+        /* grandparents */ {}, /* is manual */ true);
+  }
+
+  CompactionInputFiles inputs;
+  inputs.level = input_level;
   bool covering_the_whole_range = true;
 
   // All files are 'overlapping' in universal style compaction.
   // We have to compact the entire range in one shot.
-  if (options_->compaction_style == kCompactionStyleUniversal) {
+  if (ioptions_.compaction_style == kCompactionStyleUniversal) {
     begin = nullptr;
     end = nullptr;
   }
-  version->GetOverlappingInputs(input_level, begin, end, &inputs);
+
+  vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
   if (inputs.empty()) {
     return nullptr;
   }
@@ -326,187 +440,539 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
   // and we must not pick one file and drop another older file if the
   // two files overlap.
   if (input_level > 0) {
-    const uint64_t limit =
-        MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
+    const uint64_t limit = mutable_cf_options.MaxFileSizeForLevel(input_level) *
+      mutable_cf_options.source_compaction_factor;
     uint64_t total = 0;
     for (size_t i = 0; i + 1 < inputs.size(); ++i) {
-      uint64_t s = inputs[i]->file_size;
+      uint64_t s = inputs[i]->compensated_file_size;
       total += s;
       if (total >= limit) {
         **compaction_end = inputs[i + 1]->smallest;
         covering_the_whole_range = false;
-        inputs.resize(i + 1);
+        inputs.files.resize(i + 1);
         break;
       }
     }
   }
-  Compaction* c = new Compaction(version, input_level, output_level,
-                                 MaxFileSizeForLevel(output_level),
-                                 MaxGrandParentOverlapBytes(input_level));
+  assert(output_path_id < static_cast<uint32_t>(ioptions_.db_paths.size()));
 
-  c->inputs_[0] = inputs;
-  if (ExpandWhileOverlapping(c) == false) {
-    delete c;
-    Log(options_->info_log,
-        "[%s] Could not compact due to expansion failure.\n",
-        version->cfd_->GetName().c_str());
+  if (ExpandWhileOverlapping(cf_name, vstorage, &inputs) == false) {
+    // manual compaction is currently single-threaded, so it should never
+    // happen that ExpandWhileOverlapping fails
+    assert(false);
     return nullptr;
   }
 
-  SetupOtherInputs(c);
-
   if (covering_the_whole_range) {
     *compaction_end = nullptr;
   }
 
-  // These files that are to be manaully compacted do not trample
-  // upon other files because manual compactions are processed when
-  // the system has a max of 1 background compaction thread.
-  c->MarkFilesBeingCompacted(true);
+  CompactionInputFiles output_level_inputs;
+  if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+    assert(input_level == 0);
+    output_level = vstorage->base_level();
+    assert(output_level > 0);
+  }
+  output_level_inputs.level = output_level;
+  if (input_level != output_level) {
+    int parent_index = -1;
+    if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+                          &output_level_inputs, &parent_index, -1)) {
+      // manual compaction is currently single-threaded, so it should never
+      // happen that SetupOtherInputs fails
+      assert(false);
+      return nullptr;
+    }
+  }
 
-  // Is this compaction creating a file at the bottommost level
-  c->SetupBottomMostLevel(true);
+  std::vector<CompactionInputFiles> compaction_inputs({inputs});
+  if (!output_level_inputs.empty()) {
+    compaction_inputs.push_back(output_level_inputs);
+  }
 
-  c->is_manual_compaction_ = true;
+  std::vector<FileMetaData*> grandparents;
+  GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+  Compaction* compaction = new Compaction(
+      vstorage, mutable_cf_options, std::move(compaction_inputs), output_level,
+      mutable_cf_options.MaxFileSizeForLevel(output_level),
+      mutable_cf_options.MaxGrandParentOverlapBytes(input_level),
+      output_path_id,
+      GetCompressionType(ioptions_, output_level, vstorage->base_level()),
+      std::move(grandparents), /* is manual compaction */ true);
+
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+  return compaction;
+}
 
-  return c;
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(
+    const Comparator* c,
+    const SstFileMetaData& a, const SstFileMetaData& b) {
+  if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+    if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
 }
+}  // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta,
+      const int output_level) const {
+  auto& levels = cf_meta.levels;
+  auto comparator = icmp_->user_comparator();
+
+  // TODO(yhchiang): If there is any input files of L1 or up and there
+  // is at least one L0 files. All L0 files older than the L0 file needs
+  // to be included. Otherwise, it is a false conditoin
+
+  // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+  // the smallest and largest key of the current compaction input
+  std::string smallestkey;
+  std::string largestkey;
+  // a flag for initializing smallest and largest key
+  bool is_first = false;
+  const int kNotFound = -1;
+
+  // For each level, it does the following things:
+  // 1. Find the first and the last compaction input files
+  //    in the current level.
+  // 2. Include all files between the first and the last
+  //    compaction input files.
+  // 3. Update the compaction key-range.
+  // 4. For all remaining levels, include files that have
+  //    overlapping key-range with the compaction key-range.
+  for (int l = 0; l <= output_level; ++l) {
+    auto& current_files = levels[l].files;
+    int first_included = static_cast<int>(current_files.size());
+    int last_included = kNotFound;
+
+    // identify the first and the last compaction input files
+    // in the current level.
+    for (size_t f = 0; f < current_files.size(); ++f) {
+      if (input_files->find(TableFileNameToNumber(current_files[f].name)) !=
+          input_files->end()) {
+        first_included = std::min(first_included, static_cast<int>(f));
+        last_included = std::max(last_included, static_cast<int>(f));
+        if (is_first == false) {
+          smallestkey = current_files[f].smallestkey;
+          largestkey = current_files[f].largestkey;
+          is_first = true;
+        }
+      }
+    }
+    if (last_included == kNotFound) {
+      continue;
+    }
 
-Compaction* LevelCompactionPicker::PickCompaction(Version* version,
-                                                  LogBuffer* log_buffer) {
-  Compaction* c = nullptr;
-  int level = -1;
+    if (l != 0) {
+      // expend the compaction input of the current level if it
+      // has overlapping key-range with other non-compaction input
+      // files in the same level.
+      while (first_included > 0) {
+        if (comparator->Compare(
+                current_files[first_included - 1].largestkey,
+                current_files[first_included].smallestkey) < 0) {
+          break;
+        }
+        first_included--;
+      }
 
-  // Compute the compactions needed. It is better to do it here
-  // and also in LogAndApply(), otherwise the values could be stale.
-  std::vector<uint64_t> size_being_compacted(NumberLevels() - 1);
-  SizeBeingCompacted(size_being_compacted);
-  version->ComputeCompactionScore(size_being_compacted);
+      while (last_included < static_cast<int>(current_files.size()) - 1) {
+        if (comparator->Compare(
+                current_files[last_included + 1].smallestkey,
+                current_files[last_included].largestkey) > 0) {
+          break;
+        }
+        last_included++;
+      }
+    }
 
-  // We prefer compactions triggered by too much data in a level over
-  // the compactions triggered by seeks.
-  //
-  // Find the compactions by size on all levels.
-  for (int i = 0; i < NumberLevels() - 1; i++) {
-    assert(i == 0 ||
-           version->compaction_score_[i] <= version->compaction_score_[i - 1]);
-    level = version->compaction_level_[i];
-    if ((version->compaction_score_[i] >= 1)) {
-      c = PickCompactionBySize(version, level, version->compaction_score_[i]);
-      if (ExpandWhileOverlapping(c) == false) {
-        delete c;
-        c = nullptr;
-      } else {
-        break;
+    // include all files between the first and the last compaction input files.
+    for (int f = first_included; f <= last_included; ++f) {
+      if (current_files[f].being_compacted) {
+        return Status::Aborted(
+            "Necessary compaction input file " + current_files[f].name +
+            " is currently being compacted.");
+      }
+      input_files->insert(
+          TableFileNameToNumber(current_files[f].name));
+    }
+
+    // update smallest and largest key
+    if (l == 0) {
+      for (int f = first_included; f <= last_included; ++f) {
+        if (comparator->Compare(
+            smallestkey, current_files[f].smallestkey) > 0) {
+          smallestkey = current_files[f].smallestkey;
+        }
+        if (comparator->Compare(
+            largestkey, current_files[f].largestkey) < 0) {
+          largestkey = current_files[f].largestkey;
+        }
+      }
+    } else {
+      if (comparator->Compare(
+          smallestkey, current_files[first_included].smallestkey) > 0) {
+        smallestkey = current_files[first_included].smallestkey;
+      }
+      if (comparator->Compare(
+          largestkey, current_files[last_included].largestkey) < 0) {
+        largestkey = current_files[last_included].largestkey;
+      }
+    }
+
+    SstFileMetaData aggregated_file_meta;
+    aggregated_file_meta.smallestkey = smallestkey;
+    aggregated_file_meta.largestkey = largestkey;
+
+    // For all lower levels, include all overlapping files.
+    for (int m = l + 1; m <= output_level; ++m) {
+      for (auto& next_lv_file : levels[m].files) {
+        if (HaveOverlappingKeyRanges(
+            comparator, aggregated_file_meta, next_lv_file)) {
+          if (next_lv_file.being_compacted) {
+            return Status::Aborted(
+                "File " + next_lv_file.name +
+                " that has overlapping key range with one of the compaction "
+                " input file is currently being compacted.");
+          }
+          input_files->insert(
+              TableFileNameToNumber(next_lv_file.name));
+        }
       }
     }
   }
+  return Status::OK();
+}
 
-  // Find compactions needed by seeks
-  FileMetaData* f = version->file_to_compact_;
-  if (c == nullptr && f != nullptr && !f->being_compacted) {
+Status CompactionPicker::SanitizeCompactionInputFiles(
+    std::unordered_set<uint64_t>* input_files,
+    const ColumnFamilyMetaData& cf_meta,
+    const int output_level) const {
+  assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+         cf_meta.levels[cf_meta.levels.size() - 1].level);
+  if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+    return Status::InvalidArgument(
+        "Output level for column family " + cf_meta.name +
+        " must between [0, " +
+        ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) +
+        "].");
+  }
 
-    level = version->file_to_compact_level_;
-    int parent_index = -1;
+  if (output_level > MaxOutputLevel()) {
+    return Status::InvalidArgument(
+        "Exceed the maximum output level defined by "
+        "the current compaction algorithm --- " +
+            ToString(MaxOutputLevel()));
+  }
 
-    // Only allow one level 0 compaction at a time.
-    // Do not pick this file if its parents at level+1 are being compacted.
-    if (level != 0 || compactions_in_progress_[0].empty()) {
-      if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
-                                   &parent_index)) {
-        c = new Compaction(version, level, level + 1,
-                           MaxFileSizeForLevel(level + 1),
-                           MaxGrandParentOverlapBytes(level), true);
-        c->inputs_[0].push_back(f);
-        c->parent_index_ = parent_index;
-        c->input_version_->file_to_compact_ = nullptr;
-        if (ExpandWhileOverlapping(c) == false) {
-          return nullptr;
+  if (output_level < 0) {
+    return Status::InvalidArgument(
+        "Output level cannot be negative.");
+  }
+
+  if (input_files->size() == 0) {
+    return Status::InvalidArgument(
+        "A compaction must contain at least one file.");
+  }
+
+  Status s = SanitizeCompactionInputFilesForAllLevels(
+      input_files, cf_meta, output_level);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // for all input files, check whether the file number matches
+  // any currently-existing files.
+  for (auto file_num : *input_files) {
+    bool found = false;
+    for (auto level_meta : cf_meta.levels) {
+      for (auto file_meta : level_meta.files) {
+        if (file_num == TableFileNameToNumber(file_meta.name)) {
+          if (file_meta.being_compacted) {
+            return Status::Aborted(
+                "Specified compaction input file " +
+                MakeTableFileName("", file_num) +
+                " is already being compacted.");
+          }
+          found = true;
+          break;
         }
       }
+      if (found) {
+        break;
+      }
+    }
+    if (!found) {
+      return Status::InvalidArgument(
+          "Specified compaction input file " +
+          MakeTableFileName("", file_num) +
+          " does not exist in column family " + cf_meta.name + ".");
     }
   }
 
-  if (c == nullptr) {
+  return Status::OK();
+}
+#endif  // !ROCKSDB_LITE
+
+bool LevelCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage)
+    const {
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+    if (vstorage->CompactionScore(i) >= 1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void LevelCompactionPicker::PickFilesMarkedForCompactionExperimental(
+    const std::string& cf_name, VersionStorageInfo* vstorage,
+    CompactionInputFiles* inputs, int* level, int* output_level) {
+  if (vstorage->FilesMarkedForCompaction().empty()) {
+    return;
+  }
+
+  auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted it has nothing to do here.
+    // If this assert() fails that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    *level = level_file.first;
+    *output_level = (*level == 0) ? vstorage->base_level() : *level + 1;
+
+    if (*level == 0 && !level0_compactions_in_progress_.empty()) {
+      return false;
+    }
+
+    inputs->files = {level_file.second};
+    inputs->level = *level;
+    return ExpandWhileOverlapping(cf_name, vstorage, inputs);
+  };
+
+  // take a chance on a random file first
+  Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+  size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+      static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+
+  if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+    // found the compaction!
+    return;
+  }
+
+  for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+    if (continuation(level_file)) {
+      // found the compaction!
+      return;
+    }
+  }
+  inputs->files.clear();
+}
+
+Compaction* LevelCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+  int level = -1;
+  int output_level = -1;
+  int parent_index = -1;
+  int base_index = -1;
+  CompactionInputFiles inputs;
+  double score = 0;
+
+  // Find the compactions by size on all levels.
+  for (int i = 0; i < NumberLevels() - 1; i++) {
+    score = vstorage->CompactionScore(i);
+    level = vstorage->CompactionScoreLevel(i);
+    assert(i == 0 || score <= vstorage->CompactionScore(i - 1));
+    if (score >= 1) {
+      output_level = (level == 0) ? vstorage->base_level() : level + 1;
+      if (PickCompactionBySize(vstorage, level, output_level, &inputs,
+                               &parent_index, &base_index) &&
+          ExpandWhileOverlapping(cf_name, vstorage, &inputs)) {
+        // found the compaction!
+        break;
+      } else {
+        // didn't find the compaction, clear the inputs
+        inputs.clear();
+      }
+    }
+  }
+
+  bool is_manual = false;
+  // if we didn't find a compaction, check if there are any files marked for
+  // compaction
+  if (inputs.empty()) {
+    is_manual = true;
+    parent_index = base_index = -1;
+    PickFilesMarkedForCompactionExperimental(cf_name, vstorage, &inputs, &level,
+                                             &output_level);
+  }
+  if (inputs.empty()) {
     return nullptr;
   }
+  assert(level >= 0 && output_level >= 0);
 
   // Two level 0 compaction won't run at the same time, so don't need to worry
   // about files on level 0 being compacted.
   if (level == 0) {
-    assert(compactions_in_progress_[0].empty());
+    assert(level0_compactions_in_progress_.empty());
     InternalKey smallest, largest;
-    GetRange(c->inputs_[0], &smallest, &largest);
+    GetRange(inputs, &smallest, &largest);
     // Note that the next call will discard the file we placed in
     // c->inputs_[0] earlier and replace it with an overlapping set
     // which will include the picked file.
-    c->inputs_[0].clear();
-    c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
-                                            &c->inputs_[0]);
+    inputs.files.clear();
+    vstorage->GetOverlappingInputs(0, &smallest, &largest, &inputs.files);
 
     // If we include more L0 files in the same compaction run it can
     // cause the 'smallest' and 'largest' key to get extended to a
     // larger range. So, re-invoke GetRange to get the new key range
-    GetRange(c->inputs_[0], &smallest, &largest);
-    if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
-                                &c->parent_index_)) {
-      delete c;
+    GetRange(inputs, &smallest, &largest);
+    if (RangeInCompaction(vstorage, &smallest, &largest, output_level,
+                          &parent_index)) {
       return nullptr;
     }
-    assert(!c->inputs_[0].empty());
+    assert(!inputs.files.empty());
   }
 
-  // Setup "level+1" files (inputs_[1])
-  SetupOtherInputs(c);
+  // Setup input files from output level
+  CompactionInputFiles output_level_inputs;
+  output_level_inputs.level = output_level;
+  if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+                   &output_level_inputs, &parent_index, base_index)) {
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> compaction_inputs({inputs});
+  if (!output_level_inputs.empty()) {
+    compaction_inputs.push_back(output_level_inputs);
+  }
 
-  // mark all the files that are being compacted
-  c->MarkFilesBeingCompacted(true);
+  std::vector<FileMetaData*> grandparents;
+  GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+  auto c = new Compaction(
+      vstorage, mutable_cf_options, std::move(compaction_inputs), output_level,
+      mutable_cf_options.MaxFileSizeForLevel(output_level),
+      mutable_cf_options.MaxGrandParentOverlapBytes(level),
+      GetPathId(ioptions_, mutable_cf_options, output_level),
+      GetCompressionType(ioptions_, output_level, vstorage->base_level()),
+      std::move(grandparents), is_manual, score);
+
+  // If it's level 0 compaction, make sure we don't execute any other level 0
+  // compactions in parallel
+  if (level == 0) {
+    level0_compactions_in_progress_.insert(c);
+  }
 
-  // Is this compaction creating a file at the bottommost level
-  c->SetupBottomMostLevel(false);
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here
+  {  // this piece of code recomputes compaction score
+    CompactionOptionsFIFO dummy_compaction_options_fifo;
+    vstorage->ComputeCompactionScore(mutable_cf_options,
+                                     dummy_compaction_options_fifo);
+  }
 
-  // remember this currently undergoing compaction
-  compactions_in_progress_[level].insert(c);
+  TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
 
   return c;
 }
 
-Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
-                                                        int level,
-                                                        double score) {
-  Compaction* c = nullptr;
+/*
+ * Find the optimal path to place a file
+ * Given a level, finds the path where levels up to it will fit in levels
+ * up to and including this path
+ */
+uint32_t LevelCompactionPicker::GetPathId(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, int level) {
+  uint32_t p = 0;
+  assert(!ioptions.db_paths.empty());
+
+  // size remaining in the most recent path
+  uint64_t current_path_size = ioptions.db_paths[0].target_size;
+
+  uint64_t level_size;
+  int cur_level = 0;
+
+  level_size = mutable_cf_options.max_bytes_for_level_base;
+
+  // Last path is the fallback
+  while (p < ioptions.db_paths.size() - 1) {
+    if (level_size <= current_path_size) {
+      if (cur_level == level) {
+        // Does desired level fit in this path?
+        return p;
+      } else {
+        current_path_size -= level_size;
+        level_size *= mutable_cf_options.max_bytes_for_level_multiplier;
+        cur_level++;
+        continue;
+      }
+    }
+    p++;
+    current_path_size = ioptions.db_paths[p].target_size;
+  }
+  return p;
+}
 
+bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage,
+                                                 int level, int output_level,
+                                                 CompactionInputFiles* inputs,
+                                                 int* parent_index,
+                                                 int* base_index) {
   // level 0 files are overlapping. So we cannot pick more
   // than one concurrent compactions at this level. This
   // could be made better by looking at key-ranges that are
   // being compacted at level 0.
-  if (level == 0 && compactions_in_progress_[level].size() == 1) {
-    return nullptr;
+  if (level == 0 && !level0_compactions_in_progress_.empty()) {
+    return false;
   }
 
+  inputs->clear();
+
   assert(level >= 0);
-  assert(level + 1 < NumberLevels());
-  c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1),
-                     MaxGrandParentOverlapBytes(level));
-  c->score_ = score;
 
   // Pick the largest file in this level that is not already
   // being compacted
-  std::vector<int>& file_size = c->input_version_->files_by_size_[level];
+  const std::vector<int>& file_size = vstorage->FilesBySize(level);
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(level);
 
   // record the first file that is not yet compacted
   int nextIndex = -1;
 
-  for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level];
+  for (unsigned int i = vstorage->NextCompactionIndex(level);
        i < file_size.size(); i++) {
     int index = file_size[i];
-    FileMetaData* f = c->input_version_->files_[level][index];
+    auto* f = level_files[index];
 
-    // check to verify files are arranged in descending size
     assert((i == file_size.size() - 1) ||
-           (i >= Version::number_of_files_to_sort_ - 1) ||
-           (f->file_size >=
-            c->input_version_->files_[level][file_size[i + 1]]->file_size));
+           (i >= VersionStorageInfo::kNumberFilesToSort - 1) ||
+           (f->compensated_file_size >=
+            level_files[file_size[i + 1]]->compensated_file_size));
 
     // do not pick a file to compact if it is being compacted
     // from n-1 level.
@@ -521,140 +987,279 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
 
     // Do not pick this file if its parents at level+1 are being compacted.
     // Maybe we can avoid redoing this work in SetupOtherInputs
-    int parent_index = -1;
-    if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest,
-                                level, &parent_index)) {
+    *parent_index = -1;
+    if (RangeInCompaction(vstorage, &f->smallest, &f->largest, output_level,
+                          parent_index)) {
       continue;
     }
-    c->inputs_[0].push_back(f);
-    c->base_index_ = index;
-    c->parent_index_ = parent_index;
+    inputs->files.push_back(f);
+    inputs->level = level;
+    *base_index = index;
     break;
   }
 
-  if (c->inputs_[0].empty()) {
-    delete c;
-    c = nullptr;
+  // store where to start the iteration in the next call to PickCompaction
+  vstorage->SetNextCompactionIndex(level, nextIndex);
+
+  return inputs->size() > 0;
+}
+
+#ifndef ROCKSDB_LITE
+bool UniversalCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+void UniversalCompactionPicker::SortedRun::Dump(char* out_buf,
+                                                size_t out_buf_size,
+                                                bool print_path) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    if (file->fd.GetPathId() == 0 || !print_path) {
+      snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+    } else {
+      snprintf(out_buf, out_buf_size, "file %" PRIu64
+                                      "(path "
+                                      "%" PRIu32 ")",
+               file->fd.GetNumber(), file->fd.GetPathId());
+    }
+  } else {
+    snprintf(out_buf, out_buf_size, "level %d", level);
+  }
+}
+
+void UniversalCompactionPicker::SortedRun::DumpSizeInfo(
+    char* out_buf, size_t out_buf_size, int sorted_run_count) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    snprintf(out_buf, out_buf_size,
+             "file %" PRIu64
+             "[%d] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+             file->compensated_file_size);
+  } else {
+    snprintf(out_buf, out_buf_size,
+             "level %d[%d] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             level, sorted_run_count, size, compensated_file_size);
   }
+}
 
-  // store where to start the iteration in the next call to PickCompaction
-  version->next_file_to_compact_by_size_[level] = nextIndex;
+std::vector<UniversalCompactionPicker::SortedRun>
+UniversalCompactionPicker::CalculateSortedRuns(
+    const VersionStorageInfo& vstorage) {
+  std::vector<UniversalCompactionPicker::SortedRun> ret;
+  for (FileMetaData* f : vstorage.LevelFiles(0)) {
+    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+                     f->being_compacted);
+  }
+  for (int level = 1; level < vstorage.num_levels(); level++) {
+    uint64_t total_compensated_size = 0U;
+    uint64_t total_size = 0U;
+    bool being_compacted = false;
+    bool is_first = true;
+    for (FileMetaData* f : vstorage.LevelFiles(level)) {
+      total_compensated_size += f->compensated_file_size;
+      total_size += f->fd.GetFileSize();
+      // Compaction always includes all files for a non-zero level, so for a
+      // non-zero level, all the files should share the same being_compacted
+      // value.
+      assert(is_first || f->being_compacted == being_compacted);
+      if (is_first) {
+        being_compacted = f->being_compacted;
+        is_first = false;
+      }
+    }
+    if (total_compensated_size > 0) {
+      ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+                       being_compacted);
+    }
+  }
+  return ret;
+}
 
-  return c;
+#ifndef NDEBUG
+namespace {
+// smallest_seqno and largest_seqno are set iff. `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+                             SequenceNumber* smallest_seqno,
+                             SequenceNumber* largest_seqno) {
+  bool is_first = true;
+  for (FileMetaData* f : files) {
+    assert(f->smallest_seqno <= f->largest_seqno);
+    if (is_first) {
+      is_first = false;
+      *smallest_seqno = f->smallest_seqno;
+      *largest_seqno = f->largest_seqno;
+    } else {
+      if (f->smallest_seqno < *smallest_seqno) {
+        *smallest_seqno = f->smallest_seqno;
+      }
+      if (f->largest_seqno > *largest_seqno) {
+        *largest_seqno = f->largest_seqno;
+      }
+    }
+  }
 }
+}  // namespace
+#endif
 
 // Universal style of compaction. Pick files that are contiguous in
 // time-range to compact.
 //
-Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
-                                                      LogBuffer* log_buffer) {
-  int level = 0;
-  double score = version->compaction_score_[0];
-
-  if ((version->files_[level].size() <
-       (unsigned int)options_->level0_file_num_compaction_trigger)) {
-    LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n",
-                version->cfd_->GetName().c_str());
+Compaction* UniversalCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+  const int kLevel0 = 0;
+  double score = vstorage->CompactionScore(kLevel0);
+  std::vector<SortedRun> sorted_runs = CalculateSortedRuns(*vstorage);
+
+  if (sorted_runs.size() <
+      (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger) {
+    LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", cf_name.c_str());
     return nullptr;
   }
-  Version::FileSummaryStorage tmp;
-  LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n",
-              version->cfd_->GetName().c_str(), version->files_[level].size(),
-              version->LevelFileSummary(&tmp, 0));
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  LogToBuffer(log_buffer, 3072, "[%s] Universal: sorted runs files(%zu): %s\n",
+              cf_name.c_str(), sorted_runs.size(),
+              vstorage->LevelSummary(&tmp));
 
   // Check for size amplification first.
   Compaction* c;
-  if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) !=
+  if ((c = PickCompactionUniversalSizeAmp(cf_name, mutable_cf_options, vstorage,
+                                          score, sorted_runs, log_buffer)) !=
       nullptr) {
     LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n",
-                version->cfd_->GetName().c_str());
+                cf_name.c_str());
   } else {
     // Size amplification is within limits. Try reducing read
     // amplification while maintaining file size ratios.
-    unsigned int ratio = options_->compaction_options_universal.size_ratio;
+    unsigned int ratio = ioptions_.compaction_options_universal.size_ratio;
 
-    if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX,
-                                            log_buffer)) != nullptr) {
+    if ((c = PickCompactionUniversalReadAmp(
+             cf_name, mutable_cf_options, vstorage, score, ratio, UINT_MAX,
+             sorted_runs, log_buffer)) != nullptr) {
       LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n",
-                  version->cfd_->GetName().c_str());
+                  cf_name.c_str());
     } else {
       // Size amplification and file size ratios are within configured limits.
       // If max read amplification is exceeding configured limits, then force
       // compaction without looking at filesize ratios and try to reduce
       // the number of files to fewer than level0_file_num_compaction_trigger.
-      unsigned int num_files = version->files_[level].size() -
-                               options_->level0_file_num_compaction_trigger;
+      // This is guaranteed by NeedsCompaction()
+      assert(sorted_runs.size() >=
+             static_cast<size_t>(
+                 mutable_cf_options.level0_file_num_compaction_trigger));
+      unsigned int num_files =
+          static_cast<unsigned int>(sorted_runs.size()) -
+          mutable_cf_options.level0_file_num_compaction_trigger;
       if ((c = PickCompactionUniversalReadAmp(
-               version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
-        LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n",
-                    version->cfd_->GetName().c_str());
+               cf_name, mutable_cf_options, vstorage, score, UINT_MAX,
+               num_files, sorted_runs, log_buffer)) != nullptr) {
+        LogToBuffer(log_buffer,
+                    "[%s] Universal: compacting for file num -- %u\n",
+                    cf_name.c_str(), num_files);
       }
     }
   }
   if (c == nullptr) {
     return nullptr;
   }
-  assert(c->inputs_[0].size() > 1);
 
-  // validate that all the chosen files are non overlapping in time
-  FileMetaData* newerfile __attribute__((unused)) = nullptr;
-  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
-    FileMetaData* f = c->inputs_[0][i];
-    assert (f->smallest_seqno <= f->largest_seqno);
-    assert(newerfile == nullptr ||
-           newerfile->smallest_seqno > f->largest_seqno);
-    newerfile = f;
+// validate that all the chosen files of L0 are non overlapping in time
+#ifndef NDEBUG
+  SequenceNumber prev_smallest_seqno = 0U;
+  bool is_first = true;
+
+  size_t level_index = 0U;
+  if (c->start_level() == 0) {
+    for (auto f : *c->inputs(0)) {
+      assert(f->smallest_seqno <= f->largest_seqno);
+      if (is_first) {
+        is_first = false;
+      } else {
+        assert(prev_smallest_seqno > f->largest_seqno);
+      }
+      prev_smallest_seqno = f->smallest_seqno;
+    }
+    level_index = 1U;
   }
-
-  // The files are sorted from newest first to oldest last.
-  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];
-
-  // Is the earliest file part of this compaction?
-  int last_index = file_by_time[file_by_time.size()-1];
-  FileMetaData* last_file = c->input_version_->files_[level][last_index];
-  if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
-    c->bottommost_level_ = true;
+  for (; level_index < c->num_input_levels(); level_index++) {
+    if (c->num_input_files(level_index) != 0) {
+      SequenceNumber smallest_seqno = 0U;
+      SequenceNumber largest_seqno = 0U;
+      GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+                              &largest_seqno);
+      if (is_first) {
+        is_first = false;
+      } else {
+        assert(prev_smallest_seqno > largest_seqno);
+      }
+      prev_smallest_seqno = smallest_seqno;
+    }
   }
-
+#endif
   // update statistics
-  MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
-              c->inputs_[0].size());
-
-  // mark all the files that are being compacted
-  c->MarkFilesBeingCompacted(true);
+  MeasureTime(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION,
+              c->inputs(0)->size());
 
-  // remember this currently undergoing compaction
-  compactions_in_progress_[level].insert(c);
-
-  // Record whether this compaction includes all sst files.
-  // For now, it is only relevant in universal compaction mode.
-  c->is_full_compaction_ =
-      (c->inputs_[0].size() == c->input_version_->files_[0].size());
+  level0_compactions_in_progress_.insert(c);
 
   return c;
 }
 
+uint32_t UniversalCompactionPicker::GetPathId(
+    const ImmutableCFOptions& ioptions, uint64_t file_size) {
+  // Two conditions need to be satisfied:
+  // (1) the target path needs to be able to hold the file's size
+  // (2) Total size left in this and previous paths need to be not
+  //     smaller than expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+  // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+  // we will make sure the target file, probably with size of 16, will be
+  // placed in a path so that eventually when new files are generated and
+  // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+  // before the path we chose.
+  //
+  // TODO(sdong): now the case of multiple column families is not
+  // considered in this algorithm. So the target size can be violated in
+  // that case. We need to improve it.
+  uint64_t accumulated_size = 0;
+  uint64_t future_size = file_size *
+    (100 - ioptions.compaction_options_universal.size_ratio) / 100;
+  uint32_t p = 0;
+  assert(!ioptions.db_paths.empty());
+  for (; p < ioptions.db_paths.size() - 1; p++) {
+    uint64_t target_size = ioptions.db_paths[p].target_size;
+    if (target_size > file_size &&
+        accumulated_size + (target_size - file_size) > future_size) {
+      return p;
+    }
+    accumulated_size += target_size;
+  }
+  return p;
+}
+
 //
 // Consider compaction files based on their size differences with
 // the next file in time order.
 //
 Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
-    Version* version, double score, unsigned int ratio,
-    unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) {
-  int level = 0;
-
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, double score, unsigned int ratio,
+    unsigned int max_number_of_files_to_compact,
+    const std::vector<SortedRun>& sorted_runs, LogBuffer* log_buffer) {
   unsigned int min_merge_width =
-    options_->compaction_options_universal.min_merge_width;
+    ioptions_.compaction_options_universal.min_merge_width;
   unsigned int max_merge_width =
-    options_->compaction_options_universal.max_merge_width;
+    ioptions_.compaction_options_universal.max_merge_width;
 
-  // The files are sorted from newest first to oldest last.
-  std::vector<int>& file_by_time = version->files_by_size_[level];
-  FileMetaData* f = nullptr;
+  const SortedRun* sr = nullptr;
   bool done = false;
   int start_index = 0;
   unsigned int candidate_count = 0;
-  assert(file_by_time.size() == version->files_[level].size());
 
   unsigned int max_files_to_compact = std::min(max_merge_width,
                                        max_number_of_files_to_compact);
@@ -662,41 +1267,43 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
 
   // Considers a candidate file only if it is smaller than the
   // total size accumulated so far.
-  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
-
+  for (unsigned int loop = 0; loop < sorted_runs.size(); loop++) {
     candidate_count = 0;
 
     // Skip files that are already being compacted
-    for (f = nullptr; loop < file_by_time.size(); loop++) {
-      int index = file_by_time[loop];
-      f = version->files_[level][index];
+    for (sr = nullptr; loop < sorted_runs.size(); loop++) {
+      sr = &sorted_runs[loop];
 
-      if (!f->being_compacted) {
+      if (!sr->being_compacted) {
         candidate_count = 1;
         break;
       }
-      LogToBuffer(
-          log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping",
-          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
-      f = nullptr;
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf));
+      LogToBuffer(log_buffer,
+                  "[%s] Universal: %s"
+                  "[%d] being compacted, skipping",
+                  cf_name.c_str(), file_num_buf, loop);
+
+      sr = nullptr;
     }
 
     // This file is not being compacted. Consider it as the
     // first candidate to be compacted.
-    uint64_t candidate_size =  f != nullptr? f->file_size : 0;
-    if (f != nullptr) {
-      LogToBuffer(
-          log_buffer, "[%s] Universal: Possible candidate file %lu[%d].",
-          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
+    uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+    if (sr != nullptr) {
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+      LogToBuffer(log_buffer, "[%s] Universal: Possible candidate %s[%d].",
+                  cf_name.c_str(), file_num_buf, loop);
     }
 
-    // Check if the suceeding files need compaction.
-    for (unsigned int i = loop+1;
-         candidate_count < max_files_to_compact && i < file_by_time.size();
+    // Check if the succeeding files need compaction.
+    for (unsigned int i = loop + 1;
+         candidate_count < max_files_to_compact && i < sorted_runs.size();
          i++) {
-      int index = file_by_time[i];
-      FileMetaData* f = version->files_[level][index];
-      if (f->being_compacted) {
+      const SortedRun* succeeding_sr = &sorted_runs[i];
+      if (succeeding_sr->being_compacted) {
         break;
       }
       // Pick files if the total/last candidate file size (increased by the
@@ -705,24 +1312,25 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
       // default kCompactionStopStyleTotalSize; with
       // kCompactionStopStyleSimilarSize, it's simply the size of the last
       // picked file.
-      uint64_t sz = (candidate_size * (100L + ratio)) /100;
-      if (sz < f->file_size) {
+      double sz = candidate_size * (100.0 + ratio) / 100.0;
+      if (sz < static_cast<double>(succeeding_sr->size)) {
         break;
       }
-      if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) {
+      if (ioptions_.compaction_options_universal.stop_style ==
+          kCompactionStopStyleSimilarSize) {
         // Similar-size stopping rule: also check the last picked file isn't
         // far larger than the next candidate file.
-        sz = (f->file_size * (100L + ratio)) / 100;
-        if (sz < candidate_size) {
+        sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+        if (sz < static_cast<double>(candidate_size)) {
           // If the small file we've encountered begins a run of similar-size
           // files, we'll pick them up on a future iteration of the outer
           // loop. If it's some lonely straggler, it'll eventually get picked
           // by the last-resort read amp strategy which disregards size ratios.
           break;
         }
-        candidate_size = f->file_size;
-      } else { // default kCompactionStopStyleTotalSize
-        candidate_size += f->file_size;
+        candidate_size = succeeding_sr->compensated_file_size;
+      } else {  // default kCompactionStopStyleTotalSize
+        candidate_size += succeeding_sr->compensated_file_size;
       }
       candidate_count++;
     }
@@ -734,13 +1342,12 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
       break;
     } else {
       for (unsigned int i = loop;
-           i < loop + candidate_count && i < file_by_time.size(); i++) {
-       int index = file_by_time[i];
-       FileMetaData* f = version->files_[level][index];
-       LogToBuffer(log_buffer,
-                   "[%s] Universal: Skipping file %lu[%d] with size %lu %d\n",
-                   version->cfd_->GetName().c_str(), (unsigned long)f->number,
-                   i, (unsigned long)f->file_size, f->being_compacted);
+           i < loop + candidate_count && i < sorted_runs.size(); i++) {
+        const SortedRun* skipping_sr = &sorted_runs[i];
+        char file_num_buf[256];
+        skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+        LogToBuffer(log_buffer, "[%s] Universal: Skipping %s", cf_name.c_str(),
+                    file_num_buf);
       }
     }
   }
@@ -752,34 +1359,64 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
   // size ratio of compression.
   bool enable_compression = true;
   int ratio_to_compress =
-      options_->compaction_options_universal.compression_size_percent;
+      ioptions_.compaction_options_universal.compression_size_percent;
   if (ratio_to_compress >= 0) {
-    uint64_t total_size = version->NumLevelBytes(level);
+    uint64_t total_size = 0;
+    for (auto& sorted_run : sorted_runs) {
+      total_size += sorted_run.compensated_file_size;
+    }
+
     uint64_t older_file_size = 0;
-    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
-        i--) {
-      older_file_size += version->files_[level][file_by_time[i]]->file_size;
+    for (size_t i = sorted_runs.size() - 1; i >= first_index_after; i--) {
+      older_file_size += sorted_runs[i].size;
       if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
         enable_compression = false;
         break;
       }
     }
   }
-  Compaction* c =
-      new Compaction(version, level, level, MaxFileSizeForLevel(level),
-                     LLONG_MAX, false, enable_compression);
-  c->score_ = score;
 
+  uint64_t estimated_total_size = 0;
+  for (unsigned int i = 0; i < first_index_after; i++) {
+    estimated_total_size += sorted_runs[i].size;
+  }
+  uint32_t path_id = GetPathId(ioptions_, estimated_total_size);
+  int start_level = sorted_runs[start_index].level;
+  int output_level;
+  if (first_index_after == sorted_runs.size()) {
+    output_level = vstorage->num_levels() - 1;
+  } else if (sorted_runs[first_index_after].level == 0) {
+    output_level = 0;
+  } else {
+    output_level = sorted_runs[first_index_after].level - 1;
+  }
+
+  std::vector<CompactionInputFiles> inputs(vstorage->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
   for (unsigned int i = start_index; i < first_index_after; i++) {
-    int index = file_by_time[i];
-    FileMetaData* f = c->input_version_->files_[level][index];
-    c->inputs_[0].push_back(f);
-    LogToBuffer(log_buffer,
-                "[%s] Universal: Picking file %lu[%d] with size %lu\n",
-                version->cfd_->GetName().c_str(), (unsigned long)f->number, i,
-                (unsigned long)f->file_size);
+    auto& picking_sr = sorted_runs[i];
+    if (picking_sr.level == 0) {
+      FileMetaData* picking_file = picking_sr.file;
+      inputs[0].files.push_back(picking_file);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+    LogToBuffer(log_buffer, "[%s] Universal: Picking %s", cf_name.c_str(),
+                file_num_buf);
   }
-  return c;
+
+  return new Compaction(
+      vstorage, mutable_cf_options, std::move(inputs), output_level,
+      mutable_cf_options.MaxFileSizeForLevel(output_level), LLONG_MAX, path_id,
+      GetCompressionType(ioptions_, start_level, 1, enable_compression),
+      /* grandparents */ {}, /* is manual */ false, score);
 }
 
 // Look at overall size amplification. If size amplification
@@ -789,57 +1426,57 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
 // min_merge_width and max_merge_width).
 //
 Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
-    Version* version, double score, LogBuffer* log_buffer) {
-  int level = 0;
-
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, double score,
+    const std::vector<SortedRun>& sorted_runs, LogBuffer* log_buffer) {
   // percentage flexibilty while reducing size amplification
-  uint64_t ratio = options_->compaction_options_universal.
+  uint64_t ratio = ioptions_.compaction_options_universal.
                      max_size_amplification_percent;
 
-  // The files are sorted from newest first to oldest last.
-  std::vector<int>& file_by_time = version->files_by_size_[level];
-  assert(file_by_time.size() == version->files_[level].size());
-
   unsigned int candidate_count = 0;
   uint64_t candidate_size = 0;
   unsigned int start_index = 0;
-  FileMetaData* f = nullptr;
+  const SortedRun* sr = nullptr;
 
   // Skip files that are already being compacted
-  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
-    int index = file_by_time[loop];
-    f = version->files_[level][index];
-    if (!f->being_compacted) {
+  for (unsigned int loop = 0; loop < sorted_runs.size() - 1; loop++) {
+    sr = &sorted_runs[loop];
+    if (!sr->being_compacted) {
       start_index = loop;         // Consider this as the first candidate.
       break;
     }
-    LogToBuffer(log_buffer,
-                "[%s] Universal: skipping file %lu[%d] compacted %s",
-                version->cfd_->GetName().c_str(), (unsigned long)f->number,
-                loop, " cannot be a candidate to reduce size amp.\n");
-    f = nullptr;
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    LogToBuffer(log_buffer, "[%s] Universal: skipping %s[%d] compacted %s",
+                cf_name.c_str(), file_num_buf, loop,
+                " cannot be a candidate to reduce size amp.\n");
+    sr = nullptr;
   }
-  if (f == nullptr) {
+
+  if (sr == nullptr) {
     return nullptr;             // no candidate files
   }
-
-  LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s",
-              version->cfd_->GetName().c_str(), (unsigned long)f->number,
-              start_index, " to reduce size amp.\n");
+  {
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    LogToBuffer(log_buffer, "[%s] Universal: First candidate %s[%d] %s",
+                cf_name.c_str(), file_num_buf, start_index,
+                " to reduce size amp.\n");
+  }
 
   // keep adding up all the remaining files
-  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
-       loop++) {
-    int index = file_by_time[loop];
-    f = version->files_[level][index];
-    if (f->being_compacted) {
+  for (unsigned int loop = start_index; loop < sorted_runs.size() - 1; loop++) {
+    sr = &sorted_runs[loop];
+    if (sr->being_compacted) {
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
       LogToBuffer(
-          log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.",
-          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop,
+          log_buffer, "[%s] Universal: Possible candidate %s[%d] %s",
+          cf_name.c_str(), file_num_buf, start_index,
           " is already being compacted. No size amp reduction possible.\n");
       return nullptr;
     }
-    candidate_size += f->file_size;
+    candidate_size += sr->compensated_file_size;
     candidate_count++;
   }
   if (candidate_count == 0) {
@@ -847,43 +1484,140 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
   }
 
   // size of earliest file
-  int index = file_by_time[file_by_time.size() - 1];
-  uint64_t earliest_file_size = version->files_[level][index]->file_size;
+  uint64_t earliest_file_size = sorted_runs.back().size;
 
   // size amplification = percentage of additional size
   if (candidate_size * 100 < ratio * earliest_file_size) {
     LogToBuffer(
         log_buffer,
-        "[%s] Universal: size amp not needed. newer-files-total-size %lu "
-        "earliest-file-size %lu",
-        version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
-        (unsigned long)earliest_file_size);
+        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+        "earliest-file-size %" PRIu64,
+        cf_name.c_str(), candidate_size, earliest_file_size);
     return nullptr;
   } else {
-    LogToBuffer(log_buffer,
-                "[%s] Universal: size amp needed. newer-files-total-size %lu "
-                "earliest-file-size %lu",
-                version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
-                (unsigned long)earliest_file_size);
+    LogToBuffer(
+        log_buffer,
+        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+        "earliest-file-size %" PRIu64,
+        cf_name.c_str(), candidate_size, earliest_file_size);
   }
-  assert(start_index >= 0 && start_index < file_by_time.size() - 1);
+  assert(start_index < sorted_runs.size() - 1);
 
-  // create a compaction request
+  // Estimate total file size
+  uint64_t estimated_total_size = 0;
+  for (unsigned int loop = start_index; loop < sorted_runs.size(); loop++) {
+    estimated_total_size += sorted_runs[loop].size;
+  }
+  uint32_t path_id = GetPathId(ioptions_, estimated_total_size);
+  int start_level = sorted_runs[start_index].level;
+
+  std::vector<CompactionInputFiles> inputs(vstorage->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
   // We always compact all the files, so always compress.
-  Compaction* c =
-      new Compaction(version, level, level, MaxFileSizeForLevel(level),
-                     LLONG_MAX, false, true);
-  c->score_ = score;
-  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
-    int index = file_by_time[loop];
-    f = c->input_version_->files_[level][index];
-    c->inputs_[0].push_back(f);
+  for (unsigned int loop = start_index; loop < sorted_runs.size(); loop++) {
+    auto& picking_sr = sorted_runs[loop];
+    if (picking_sr.level == 0) {
+      FileMetaData* f = picking_sr.file;
+      inputs[0].files.push_back(f);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    char file_num_buf[256];
+    sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+    LogToBuffer(log_buffer, "[%s] Universal: size amp picking %s",
+                cf_name.c_str(), file_num_buf);
+  }
+
+  return new Compaction(
+      vstorage, mutable_cf_options, std::move(inputs),
+      vstorage->num_levels() - 1,
+      mutable_cf_options.MaxFileSizeForLevel(vstorage->num_levels() - 1),
+      /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
+      GetCompressionType(ioptions_, vstorage->num_levels() - 1, 1),
+      /* grandparents */ {}, /* is manual */ false, score);
+}
+
+bool FIFOCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage)
+    const {
+  const int kLevel0 = 0;
+  return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+  assert(vstorage->num_levels() == 1);
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = 0;
+  for (const auto& file : level_files) {
+    total_size += file->fd.file_size;
+  }
+
+  if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size ||
+      level_files.size() == 0) {
+    // total size not exceeded
     LogToBuffer(log_buffer,
-                "[%s] Universal: size amp picking file %lu[%d] with size %lu",
-                version->cfd_->GetName().c_str(), (unsigned long)f->number,
-                index, (unsigned long)f->file_size);
+                "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+                ", max size %" PRIu64 "\n",
+                cf_name.c_str(), total_size,
+                ioptions_.compaction_options_fifo.max_table_files_size);
+    return nullptr;
+  }
+
+  if (!level0_compactions_in_progress_.empty()) {
+    LogToBuffer(log_buffer,
+                "[%s] FIFO compaction: Already executing compaction. No need "
+                "to run parallel compactions since compactions are very fast",
+                cf_name.c_str());
+    return nullptr;
   }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+  // delete old files (FIFO)
+  for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+    auto f = *ritr;
+    total_size -= f->compensated_file_size;
+    inputs[0].files.push_back(f);
+    char tmp_fsize[16];
+    AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+    LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64
+                            " with size %s for deletion",
+                cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+    if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) {
+      break;
+    }
+  }
+  Compaction* c = new Compaction(
+      vstorage, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+      kNoCompression, {}, /* is manual */ false, vstorage->CompactionScore(0),
+      /* is deletion compaction */ true);
+  level0_compactions_in_progress_.insert(c);
   return c;
 }
 
+Compaction* FIFOCompactionPicker::CompactRange(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, int input_level, int output_level,
+    uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
+    InternalKey** compaction_end) {
+  assert(input_level == 0);
+  assert(output_level == 0);
+  *compaction_end = nullptr;
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log);
+  Compaction* c =
+      PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer);
+  log_buffer.FlushBufferToLog();
+  return c;
+}
+
+#endif  // !ROCKSDB_LITE
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_picker.h b/src/rocksdb/db/compaction_picker.h
index 6527ef9..4034101 100644
--- a/src/rocksdb/db/compaction_picker.h
+++ b/src/rocksdb/db/compaction_picker.h
@@ -8,32 +8,43 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <vector>
+#include <memory>
+#include <set>
+#include <unordered_set>
+
 #include "db/version_set.h"
 #include "db/compaction.h"
 #include "rocksdb/status.h"
 #include "rocksdb/options.h"
 #include "rocksdb/env.h"
+#include "util/mutable_cf_options.h"
 
 #include <vector>
 #include <memory>
 #include <set>
+#include <string>
 
 namespace rocksdb {
 
 class LogBuffer;
 class Compaction;
-class Version;
+class VersionStorageInfo;
+struct CompactionInputFiles;
 
 class CompactionPicker {
  public:
-  CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
+  CompactionPicker(const ImmutableCFOptions& ioptions,
+                   const InternalKeyComparator* icmp);
   virtual ~CompactionPicker();
 
   // Pick level and inputs for a new compaction.
   // Returns nullptr if there is no compaction to be done.
   // Otherwise returns a pointer to a heap-allocated object that
   // describes the compaction.  Caller should delete the result.
-  virtual Compaction* PickCompaction(Version* version,
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* vstorage,
                                      LogBuffer* log_buffer) = 0;
 
   // Return a compaction object for compacting the range [begin,end] in
@@ -47,42 +58,67 @@ class CompactionPicker {
   // compaction_end will be set to nullptr.
   // Client is responsible for compaction_end storage -- when called,
   // *compaction_end should point to valid InternalKey!
-  Compaction* CompactRange(Version* version, int input_level, int output_level,
-                           const InternalKey* begin, const InternalKey* end,
-                           InternalKey** compaction_end);
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end);
+
+  // The maximum allowed output level.  Default value is NumberLevels() - 1.
+  virtual int MaxOutputLevel() const {
+    return NumberLevels() - 1;
+  }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+  // Sanitize the input set of compaction input files.
+  // When the input parameters do not describe a valid compaction, the
+  // function will try to fix the input_files by adding necessary
+  // files.  If it's not possible to conver an invalid input_files
+  // into a valid one by adding more files, the function will return a
+  // non-ok status with specific reason.
+#ifndef ROCKSDB_LITE
+  Status SanitizeCompactionInputFiles(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta,
+      const int output_level) const;
+#endif  // ROCKSDB_LITE
 
   // Free up the files that participated in a compaction
   void ReleaseCompactionFiles(Compaction* c, Status status);
 
-  // Return the total amount of data that is undergoing
-  // compactions per level
-  void SizeBeingCompacted(std::vector<uint64_t>& sizes);
-
-  // Returns maximum total overlap bytes with grandparent
-  // level (i.e., level+2) before we stop building a single
-  // file in level->level+1 compaction.
-  uint64_t MaxGrandParentOverlapBytes(int level);
-
-  // Returns maximum total bytes of data on a given level.
-  double MaxBytesForLevel(int level);
-
-  // Get the max file size in a given level.
-  uint64_t MaxFileSizeForLevel(int level) const;
+  // Returns true if any one of the specified files are being compacted
+  bool FilesInCompaction(const std::vector<FileMetaData*>& files);
+
+  // Takes a list of CompactionInputFiles and returns a Compaction object.
+  Compaction* FormCompaction(
+      const CompactionOptions& compact_options,
+      const std::vector<CompactionInputFiles>& input_files, int output_level,
+      VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+      uint32_t output_path_id) const;
+
+  // Converts a set of compaction input file numbers into
+  // a list of CompactionInputFiles.
+  Status GetCompactionInputsFromFileNumbers(
+      std::vector<CompactionInputFiles>* input_files,
+      std::unordered_set<uint64_t>* input_set,
+      const VersionStorageInfo* vstorage,
+      const CompactionOptions& compact_options) const;
 
  protected:
-  int NumberLevels() const { return num_levels_; }
+  int NumberLevels() const { return ioptions_.num_levels; }
 
   // Stores the minimal range that covers all entries in inputs in
   // *smallest, *largest.
   // REQUIRES: inputs is not empty
-  void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
-                InternalKey* largest);
+  void GetRange(const CompactionInputFiles& inputs,
+                InternalKey* smallest, InternalKey* largest);
 
   // Stores the minimal range that covers all entries in inputs1 and inputs2
   // in *smallest, *largest.
   // REQUIRES: inputs is not empty
-  void GetRange(const std::vector<FileMetaData*>& inputs1,
-                const std::vector<FileMetaData*>& inputs2,
+  void GetRange(const CompactionInputFiles& inputs1,
+                const CompactionInputFiles& inputs2,
                 InternalKey* smallest, InternalKey* largest);
 
   // Add more files to the inputs on "level" to make sure that
@@ -95,71 +131,212 @@ class CompactionPicker {
   // populated.
   //
   // Will return false if it is impossible to apply this compaction.
-  bool ExpandWhileOverlapping(Compaction* c);
-
-  uint64_t ExpandedCompactionByteSizeLimit(int level);
-
-  // Returns true if any one of the specified files are being compacted
-  bool FilesInCompaction(std::vector<FileMetaData*>& files);
+  bool ExpandWhileOverlapping(const std::string& cf_name,
+                              VersionStorageInfo* vstorage,
+                              CompactionInputFiles* inputs);
 
   // Returns true if any one of the parent files are being compacted
-  bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
-                               const InternalKey* largest, int level,
-                               int* index);
-
-  void SetupOtherInputs(Compaction* c);
+  bool RangeInCompaction(VersionStorageInfo* vstorage,
+                         const InternalKey* smallest,
+                         const InternalKey* largest, int level, int* index);
+
+  bool SetupOtherInputs(const std::string& cf_name,
+                        const MutableCFOptions& mutable_cf_options,
+                        VersionStorageInfo* vstorage,
+                        CompactionInputFiles* inputs,
+                        CompactionInputFiles* output_level_inputs,
+                        int* parent_index, int base_index);
+
+  void GetGrandparents(VersionStorageInfo* vstorage,
+                       const CompactionInputFiles& inputs,
+                       const CompactionInputFiles& output_level_inputs,
+                       std::vector<FileMetaData*>* grandparents);
+
+  const ImmutableCFOptions& ioptions_;
+
+  // A helper function to SanitizeCompactionInputFiles() that
+  // sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+  virtual Status SanitizeCompactionInputFilesForAllLevels(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta,
+      const int output_level) const;
+#endif  // ROCKSDB_LITE
+
+  // Keeps track of all compactions that are running on Level0.
+  // It is protected by DB mutex
+  std::set<Compaction*> level0_compactions_in_progress_;
 
-  // record all the ongoing compactions for all levels
-  std::vector<std::set<Compaction*>> compactions_in_progress_;
+  const InternalKeyComparator* const icmp_;
+};
 
-  // Per-level target file size.
-  std::unique_ptr<uint64_t[]> max_file_size_;
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const ImmutableCFOptions& ioptions,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* vstorage,
+                                     LogBuffer* log_buffer) override;
 
-  // Per-level max bytes
-  std::unique_ptr<uint64_t[]> level_max_bytes_;
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override;
 
-  const Options* const options_;
+  // Pick a path ID to place a newly generated file, with its level
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            int level);
 
  private:
-  int num_levels_;
-
-  const InternalKeyComparator* const icmp_;
+  // For the specfied level, pick a file that we want to compact.
+  // Returns false if there is no file to compact.
+  // If it returns true, inputs->files.size() will be exactly one.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return false.
+  bool PickCompactionBySize(VersionStorageInfo* vstorage, int level,
+                            int output_level, CompactionInputFiles* inputs,
+                            int* parent_index, int* base_index);
+
+  // If there is any file marked for compaction, put put it into inputs.
+  // This is still experimental. It will return meaningful results only if
+  // clients call experimental feature SuggestCompactRange()
+  void PickFilesMarkedForCompactionExperimental(const std::string& cf_name,
+                                                VersionStorageInfo* vstorage,
+                                                CompactionInputFiles* inputs,
+                                                int* level, int* output_level);
 };
 
+#ifndef ROCKSDB_LITE
 class UniversalCompactionPicker : public CompactionPicker {
  public:
-  UniversalCompactionPicker(const Options* options,
+  UniversalCompactionPicker(const ImmutableCFOptions& ioptions,
                             const InternalKeyComparator* icmp)
-      : CompactionPicker(options, icmp) {}
-  virtual Compaction* PickCompaction(Version* version,
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* vstorage,
                                      LogBuffer* log_buffer) override;
 
+  virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override;
+
  private:
+  struct SortedRun {
+    SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+              uint64_t _compensated_file_size, bool _being_compacted)
+        : level(_level),
+          file(_file),
+          size(_size),
+          compensated_file_size(_compensated_file_size),
+          being_compacted(_being_compacted) {
+      assert(compensated_file_size > 0);
+      assert(level != 0 || file != nullptr);
+    }
+
+    void Dump(char* out_buf, size_t out_buf_size,
+              bool print_path = false) const;
+
+    // sorted_run_count is added into the string to print
+    void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+                      int sorted_run_count) const;
+
+    int level;
+    // `file` Will be null for level > 0. For level = 0, the sorted run is
+    // for this file.
+    FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are sum of sizes all
+    // files in the level. `being_compacted` should be the same for all files
+    // in a non-zero level. Use the value here.
+    uint64_t size;
+    uint64_t compensated_file_size;
+    bool being_compacted;
+  };
+
   // Pick Universal compaction to limit read amplification
-  Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
-                                             unsigned int ratio,
-                                             unsigned int num_files,
-                                             LogBuffer* log_buffer);
+  Compaction* PickCompactionUniversalReadAmp(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, double score, unsigned int ratio,
+      unsigned int num_files, const std::vector<SortedRun>& sorted_runs,
+      LogBuffer* log_buffer);
 
   // Pick Universal compaction to limit space amplification.
-  Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
-                                             LogBuffer* log_buffer);
+  Compaction* PickCompactionUniversalSizeAmp(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, double score,
+      const std::vector<SortedRun>& sorted_runs, LogBuffer* log_buffer);
+
+  static std::vector<SortedRun> CalculateSortedRuns(
+      const VersionStorageInfo& vstorage);
+
+  // Pick a path ID to place a newly generated file, with its estimated file
+  // size.
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            uint64_t file_size);
 };
 
-class LevelCompactionPicker : public CompactionPicker {
+class FIFOCompactionPicker : public CompactionPicker {
  public:
-  LevelCompactionPicker(const Options* options,
-                        const InternalKeyComparator* icmp)
-      : CompactionPicker(options, icmp) {}
-  virtual Compaction* PickCompaction(Version* version,
+  FIFOCompactionPicker(const ImmutableCFOptions& ioptions,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* version,
                                      LogBuffer* log_buffer) override;
 
- private:
-  // For the specfied level, pick a compaction.
-  // Returns nullptr if there is no compaction to be done.
-  // If level is 0 and there is already a compaction on that level, this
-  // function will return nullptr.
-  Compaction* PickCompactionBySize(Version* version, int level, double score);
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end) override;
+
+  // The maximum allowed output level.  Always returns 0.
+  virtual int MaxOutputLevel() const override {
+    return 0;
+  }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override;
 };
 
+class NullCompactionPicker : public CompactionPicker {
+ public:
+  NullCompactionPicker(const ImmutableCFOptions& ioptions,
+                       const InternalKeyComparator* icmp) :
+      CompactionPicker(ioptions, icmp) {}
+  virtual ~NullCompactionPicker() {}
+
+  // Always return "nullptr"
+  Compaction* PickCompaction(const std::string& cf_name,
+                             const MutableCFOptions& mutable_cf_options,
+                             VersionStorageInfo* vstorage,
+                             LogBuffer* log_buffer) override {
+    return nullptr;
+  }
+
+  // Always return "nullptr"
+  Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end) override {
+    return nullptr;
+  }
+
+  // Always returns false.
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override {
+    return false;
+  }
+};
+#endif  // !ROCKSDB_LITE
+
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+                                   int level, int base_level,
+                                   const bool enable_compression = true);
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_picker_test.cc b/src/rocksdb/db/compaction_picker_test.cc
new file mode 100644
index 0000000..9efd951
--- /dev/null
+++ b/src/rocksdb/db/compaction_picker_test.cc
@@ -0,0 +1,426 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/compaction_picker.h"
+#include <limits>
+#include <string>
+#include "util/logging.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class CountingLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override { log_count++; }
+  size_t log_count;
+};
+
+class CompactionPickerTest : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  LevelCompactionPicker level_compaction_picker;
+  std::string cf_name_;
+  CountingLogger logger_;
+  LogBuffer log_buffer_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::unique_ptr<VersionStorageInfo> vstorage_;
+  std::vector<std::unique_ptr<FileMetaData>> files_;
+
+  CompactionPickerTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_, ioptions_),
+        level_compaction_picker(ioptions_, &icmp_),
+        cf_name_("dummy"),
+        log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+        file_num_(1),
+        vstorage_(nullptr) {
+    fifo_options_.max_table_files_size = 1;
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    ioptions_.db_paths.emplace_back("dummy",
+                                    std::numeric_limits<uint64_t>::max());
+  }
+
+  ~CompactionPickerTest() {
+  }
+
+  void NewVersionStorage(int num_levels, CompactionStyle style) {
+    DeleteVersionStorage();
+    options_.num_levels = num_levels;
+    vstorage_.reset(new VersionStorageInfo(
+        &icmp_, ucmp_, options_.num_levels, style, nullptr));
+    vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  }
+
+  void DeleteVersionStorage() {
+    vstorage_.reset();
+    files_.clear();
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    assert(level < vstorage_->num_levels());
+    FileMetaData* f = new FileMetaData;
+    f->fd = FileDescriptor(file_number, path_id, file_size);
+    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    f->compensated_file_size = file_size;
+    f->refs = 0;
+    vstorage_->AddFile(level, f);
+    files_.emplace_back(f);
+  }
+
+  void UpdateVersionStorageInfo() {
+    vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+    vstorage_->UpdateFilesBySize();
+    vstorage_->UpdateNumNonEmptyLevels();
+    vstorage_->GenerateFileIndexer();
+    vstorage_->GenerateLevelFilesBrief();
+    vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_);
+    vstorage_->SetFinalized();
+  }
+};
+
+TEST_F(CompactionPickerTest, Empty) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  UpdateVersionStorageInfo();
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "p", "q");
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000001U);
+  Add(1, 88U, "201", "300", 1000000000U);
+  Add(2, 6U, "150", "179", 1000000000U);
+  Add(2, 7U, "180", "220", 1000000000U);
+  Add(2, 8U, "221", "300", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  Add(0, 1U, "150", "200", 1000000000U);
+  // Level 1 score 1.2
+  Add(1, 66U, "150", "200", 6000000U);
+  Add(1, 88U, "201", "300", 6000000U);
+  // Level 2 score 1.8. File 7 is the largest. Should be picked
+  Add(2, 6U, "150", "179", 60000000U);
+  Add(2, 7U, "180", "220", 60000001U);
+  Add(2, 8U, "221", "300", 60000000U);
+  // Level 3 score slightly larger than 1
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+  const int kLevels = 6;
+  const int kFileCount = 20;
+
+  for (int level = 0; level < kLevels - 1; ++level) {
+    NewVersionStorage(kLevels, kCompactionStyleLevel);
+    uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
+    for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+      // start a brand new version in each test.
+      NewVersionStorage(kLevels, kCompactionStyleLevel);
+      for (int i = 0; i < file_count; ++i) {
+        Add(level, i, ToString((i + 100) * 1000).c_str(),
+            ToString((i + 100) * 1000 + 999).c_str(),
+            file_size, 0, i * 100, i * 100 + 99);
+      }
+      UpdateVersionStorageInfo();
+      ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+      ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+                vstorage_->CompactionScore(0) >= 1);
+      // release the version storage
+      DeleteVersionStorage();
+    }
+  }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+  Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+  Add(num_levels - 1, 4U, "300", "350", 3000U);
+  Add(num_levels - 3, 5U, "150", "180", 3U);
+  Add(num_levels - 3, 6U, "181", "300", 3U);
+  Add(num_levels - 3, 7U, "400", "450", 3U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(num_levels - 3, compaction->level(1));
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+  Add(num_levels - 1, 4U, "300", "350", 3000U);
+  Add(num_levels - 1, 4U, "400", "450", 3U);
+  Add(num_levels - 2, 5U, "150", "180", 300U);
+  Add(num_levels - 2, 6U, "181", "350", 500U);
+  Add(num_levels - 2, 7U, "400", "450", 200U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(
+      ioptions_, &icmp_);
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+  UpdateVersionStorageInfo();
+
+  // verify the trigger given different number of L0 files.
+  for (int i = 1;
+       i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+    NewVersionStorage(1, kCompactionStyleUniversal);
+    Add(0, i, ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+        i * 100 + 99);
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const int kFileCount =
+      mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  ioptions_.compaction_options_fifo = fifo_options_;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+  // verify whether compaction is needed based on the current
+  // size of L0 files.
+  uint64_t current_size = 0;
+  for (int i = 1; i <= kFileCount; ++i) {
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    Add(0, i, ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(),
+        kFileSize, 0, i * 100, i * 100 + 99);
+    current_size += kFileSize;
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+  int num_levels = ioptions_.num_levels;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");       // <- marked for compaction
+  Add(1, 3U, "400", "500", 600);  // <- this one needs compacting
+  Add(2, 4U, "150", "200");
+  Add(2, 5U, "201", "210");
+  Add(2, 6U, "300", "310");
+  Add(2, 7U, "400", "500");  // <- being compacted
+
+  vstorage_->LevelFiles(2)[3]->being_compacted = true;
+  vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
new file mode 100644
index 0000000..6013f75
--- /dev/null
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -0,0 +1,440 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#include <map>
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace rocksdb {
+namespace {
+
+// The comparator currently under test; installed by the fixture so that the
+// in-memory model (KVMap below) always orders keys the same way as the DB.
+static const Comparator* comparator;
+
+// A comparator for std::map, using comparator
+// (the file-global above), i.e. the map ordering tracks whichever
+// Comparator the test fixture has installed.
+struct MapComparator {
+  bool operator()(const std::string& a, const std::string& b) const {
+    return comparator->Compare(a, b) < 0;
+  }
+};
+
+// Reference model of the DB contents, ordered like the DB itself.
+typedef std::map<std::string, std::string, MapComparator> KVMap;
+
+class KVIter : public Iterator {
+ public:
+  explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {}
+  virtual bool Valid() const override { return iter_ != map_->end(); }
+  virtual void SeekToFirst() override { iter_ = map_->begin(); }
+  virtual void SeekToLast() override {
+    if (map_->empty()) {
+      iter_ = map_->end();
+    } else {
+      iter_ = map_->find(map_->rbegin()->first);
+    }
+  }
+  virtual void Seek(const Slice& k) override {
+    iter_ = map_->lower_bound(k.ToString());
+  }
+  virtual void Next() override { ++iter_; }
+  virtual void Prev() override {
+    if (iter_ == map_->begin()) {
+      iter_ = map_->end();
+      return;
+    }
+    --iter_;
+  }
+
+  virtual Slice key() const override { return iter_->first; }
+  virtual Slice value() const override { return iter_->second; }
+  virtual Status status() const override { return Status::OK(); }
+
+ private:
+  const KVMap* const map_;
+  KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+  ASSERT_EQ(iter1->Valid(), iter2->Valid());
+  if (iter1->Valid()) {
+    ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+    ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+  }
+}
+
+// Measuring operations on DB (expect to be empty).
+// source_strings are candidate keys
+//
+// Applies num_writes random Put/Delete operations (mirrored into an
+// in-memory KVMap model), then performs num_iter_ops random iterator/Get
+// operations, checking the DB iterator against a KVIter over the model
+// after every step.  Every num_trigger_flush writes the memtable is
+// flushed (<= 0 disables flushing).
+// NOTE(review): "Iterarator" is a typo for "Iterator"; renaming would touch
+// every call site, so it is only flagged here.
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+                            Random* rnd, int num_writes, int num_iter_ops,
+                            int num_trigger_flush) {
+  KVMap map;
+
+  // Write phase: random Put/Delete, applied to both the DB and the model.
+  for (int i = 0; i < num_writes; i++) {
+    if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+      db->Flush(FlushOptions());
+    }
+
+    int type = rnd->Uniform(2);
+    int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+    auto& key = source_strings[index];
+    switch (type) {
+      case 0:
+        // put
+        map[key] = key;
+        ASSERT_OK(db->Put(WriteOptions(), key, key));
+        break;
+      case 1:
+        // delete
+        if (map.find(key) != map.end()) {
+          map.erase(key);
+        }
+        ASSERT_OK(db->Delete(WriteOptions(), key));
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+  std::unique_ptr<Iterator> result_iter(new KVIter(&map));
+
+  // Validity after the previous op; Next/Prev are only legal on a valid
+  // iterator, so those ops are skipped (continue) when invalid.
+  bool is_valid = false;
+  for (int i = 0; i < num_iter_ops; i++) {
+    // Random walk and make sure iter and result_iter returns the
+    // same key and value
+    int type = rnd->Uniform(6);
+    ASSERT_OK(iter->status());
+    switch (type) {
+      case 0:
+        // Seek to First
+        iter->SeekToFirst();
+        result_iter->SeekToFirst();
+        break;
+      case 1:
+        // Seek to last
+        iter->SeekToLast();
+        result_iter->SeekToLast();
+        break;
+      case 2: {
+        // Seek to random key
+        auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+        auto key = source_strings[key_idx];
+        iter->Seek(key);
+        result_iter->Seek(key);
+        break;
+      }
+      case 3:
+        // Next
+        if (is_valid) {
+          iter->Next();
+          result_iter->Next();
+        } else {
+          continue;
+        }
+        break;
+      case 4:
+        // Prev
+        if (is_valid) {
+          iter->Prev();
+          result_iter->Prev();
+        } else {
+          continue;
+        }
+        break;
+      default: {
+        // Point lookup: Get() must agree with the model map.
+        assert(type == 5);
+        auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+        auto key = source_strings[key_idx];
+        std::string result;
+        auto status = db->Get(ReadOptions(), key, &result);
+        if (map.find(key) == map.end()) {
+          ASSERT_TRUE(status.IsNotFound());
+        } else {
+          ASSERT_EQ(map[key], result);
+        }
+        break;
+      }
+    }
+    AssertItersEqual(iter.get(), result_iter.get());
+    is_valid = iter->Valid();
+  }
+}
+
+class DoubleComparator : public Comparator {
+ public:
+  DoubleComparator() {}
+
+  virtual const char* Name() const override { return "DoubleComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    double da = std::stod(a.ToString());
+    double db = std::stod(b.ToString());
+    if (da == db) {
+      return a.compare(b);
+    } else if (da > db) {
+      return 1;
+    } else {
+      return -1;
+    }
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {}
+
+  virtual void FindShortSuccessor(std::string* key) const override {}
+};
+
+class HashComparator : public Comparator {
+ public:
+  HashComparator() {}
+
+  virtual const char* Name() const override { return "HashComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    uint32_t ha = Hash(a.data(), a.size(), 66);
+    uint32_t hb = Hash(b.data(), b.size(), 66);
+    if (ha == hb) {
+      return a.compare(b);
+    } else if (ha > hb) {
+      return 1;
+    } else {
+      return -1;
+    }
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {}
+
+  virtual void FindShortSuccessor(std::string* key) const override {}
+};
+
+class TwoStrComparator : public Comparator {
+ public:
+  TwoStrComparator() {}
+
+  virtual const char* Name() const override { return "TwoStrComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    assert(a.size() >= 2);
+    assert(b.size() >= 2);
+    size_t size_a1 = static_cast<size_t>(a[0]);
+    size_t size_b1 = static_cast<size_t>(b[0]);
+    size_t size_a2 = static_cast<size_t>(a[1]);
+    size_t size_b2 = static_cast<size_t>(b[1]);
+    assert(size_a1 + size_a2 + 2 == a.size());
+    assert(size_b1 + size_b2 + 2 == b.size());
+
+    Slice a1 = Slice(a.data() + 2, size_a1);
+    Slice b1 = Slice(b.data() + 2, size_b1);
+    Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
+    Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
+
+    if (a1 != b1) {
+      return a1.compare(b1);
+    }
+    return a2.compare(b2);
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {}
+
+  virtual void FindShortSuccessor(std::string* key) const override {}
+};
+}  // namespace
+
+// Fixture that opens a DB under a configurable comparator.  The active
+// comparator is also published through the file-global `comparator` so the
+// KVMap reference model orders keys identically to the DB.
+class ComparatorDBTest : public testing::Test {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+  Options last_options_;
+  // Owns comparators handed to SetOwnedComparator().
+  std::unique_ptr<const Comparator> comparator_guard;
+
+ public:
+  ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+    comparator = BytewiseComparator();
+    dbname_ = test::TmpDir() + "/comparator_db_test";
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  ~ComparatorDBTest() {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+    // Restore the global so later tests start from the default ordering.
+    comparator = BytewiseComparator();
+  }
+
+  DB* GetDB() { return db_; }
+
+  // Installs cmp for subsequent reopens and takes ownership of it.
+  // NOTE(review): the Uint64Comparator test passes test::Uint64Comparator();
+  // if that helper returns a shared/static instance, deleting it via the
+  // guard would be wrong -- confirm it is heap-allocated per call.
+  void SetOwnedComparator(const Comparator* cmp) {
+    comparator_guard.reset(cmp);
+    comparator = cmp;
+    last_options_.comparator = cmp;
+  }
+
+  // Return the current option configuration.
+  Options* GetOptions() { return &last_options_; }
+
+  void DestroyAndReopen() {
+    // Destroy using last options
+    Destroy();
+    ASSERT_OK(TryReopen());
+  }
+
+  void Destroy() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  Status TryReopen() {
+    delete db_;
+    db_ = nullptr;
+    last_options_.create_if_missing = true;
+
+    return DB::Open(last_options_, dbname_, &db_);
+  }
+};
+
+// Default bytewise comparator over single-letter keys; several seeds for
+// coverage of different random walks.
+TEST_F(ComparatorDBTest, Bytewise) {
+  for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
+    DestroyAndReopen();
+    Random rnd(rand_seed);
+    DoRandomIteraratorTest(GetDB(),
+                           {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
+                           8, 100, 3);
+  }
+}
+
+// Exercises a comparator that reverses suffix order, using keys that share
+// a small set of random prefixes so suffix ordering actually matters.
+TEST_F(ComparatorDBTest, SimpleSuffixReverseComparator) {
+  SetOwnedComparator(new test::SimpleSuffixReverseComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    std::vector<std::string> source_prefixes;
+    // Randomly generate 5 prefixes
+    for (int i = 0; i < 5; i++) {
+      source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8));
+    }
+    for (int j = 0; j < 20; j++) {
+      int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
+      std::string key = source_prefixes[prefix_index] +
+                        test::RandomHumanReadableString(&rnd, rnd.Uniform(8));
+      source_strings.push_back(key);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
+  }
+}
+
+// Exercises a comparator over fixed-width 8-byte integer keys.  Keys are
+// the raw (host-endian) bytes of random uint64 values.
+TEST_F(ComparatorDBTest, Uint64Comparator) {
+  SetOwnedComparator(test::Uint64Comparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+    Random64 rnd64(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      uint64_t r = rnd64.Next();
+      std::string str;
+      str.resize(8);
+      memcpy(&str[0], static_cast<void*>(&r), 8);
+      source_strings.push_back(str);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+// Exercises DoubleComparator with decimal-string keys of varying magnitude
+// (random value divided by a random power of ten).
+TEST_F(ComparatorDBTest, DoubleComparator) {
+  SetOwnedComparator(new DoubleComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      uint32_t r = rnd.Next();
+      uint32_t divide_order = rnd.Uniform(8);
+      double to_divide = 1.0;
+      for (uint32_t j = 0; j < divide_order; j++) {
+        to_divide *= 10.0;
+      }
+      source_strings.push_back(ToString(r / to_divide));
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+// Exercises HashComparator (hash order, not byte order) with random keys.
+TEST_F(ComparatorDBTest, HashComparator) {
+  SetOwnedComparator(new HashComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      source_strings.push_back(test::RandomKey(&rnd, 8));
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+// Exercises TwoStrComparator with keys built in its expected layout:
+// [len1][len2][str1][str2], lengths in [0, 8).
+TEST_F(ComparatorDBTest, TwoStrComparator) {
+  SetOwnedComparator(new TwoStrComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      std::string str;
+      uint32_t size1 = rnd.Uniform(8);
+      uint32_t size2 = rnd.Uniform(8);
+      str.append(1, static_cast<char>(size1));
+      str.append(1, static_cast<char>(size2));
+      str.append(test::RandomKey(&rnd, size1));
+      str.append(test::RandomKey(&rnd, size2));
+      source_strings.push_back(str);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+}  // namespace rocksdb
+
+// Standard gtest entry point.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
index 4726e92..b9a2461 100644
--- a/src/rocksdb/db/corruption_test.cc
+++ b/src/rocksdb/db/corruption_test.cc
@@ -29,7 +29,7 @@ namespace rocksdb {
 
 static const int kValueSize = 1000;
 
-class CorruptionTest {
+class CorruptionTest : public testing::Test {
  public:
   test::ErrorEnv env_;
   std::string dbname_;
@@ -45,7 +45,9 @@ class CorruptionTest {
 
     db_ = nullptr;
     options_.create_if_missing = true;
-    options_.block_size_deviation = 0; // make unit test pass for now
+    BlockBasedTableOptions table_options;
+    table_options.block_size_deviation = 0;  // make unit test pass for now
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
     Reopen();
     options_.create_if_missing = false;
   }
@@ -60,9 +62,11 @@ class CorruptionTest {
     db_ = nullptr;
     Options opt = (options ? *options : options_);
     opt.env = &env_;
-    opt.block_cache = tiny_cache_;
-    opt.block_size_deviation = 0;
     opt.arena_block_size = 4096;
+    BlockBasedTableOptions table_options;
+    table_options.block_cache = tiny_cache_;
+    table_options.block_size_deviation = 0;
+    opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
     return DB::Open(opt, dbname_, &db_);
   }
 
@@ -99,7 +103,7 @@ class CorruptionTest {
     // db itself will raise errors because data is corrupted.
     // Instead, we want the reads to be successful and this test
     // will detect whether the appropriate corruptions have
-    // occured.
+    // occurred.
     Iterator* iter = db_->NewIterator(ReadOptions(false, true));
     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
       uint64_t key;
@@ -111,8 +115,8 @@ class CorruptionTest {
         continue;
       }
       missed += (key - next_expected);
-      next_expected = key + 1;
-      if (iter->value() != Value(key, &value_space)) {
+      next_expected = static_cast<unsigned int>(key + 1);
+      if (iter->value() != Value(static_cast<int>(key), &value_space)) {
         bad_values++;
       } else {
         correct++;
@@ -127,7 +131,7 @@ class CorruptionTest {
     ASSERT_GE(max_expected, correct);
   }
 
-  void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) {
+  void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) {
     struct stat sbuf;
     if (stat(fname.c_str(), &sbuf) != 0) {
       const char* msg = strerror(errno);
@@ -139,14 +143,14 @@ class CorruptionTest {
       if (-offset > sbuf.st_size) {
         offset = 0;
       } else {
-        offset = sbuf.st_size + offset;
+        offset = static_cast<int>(sbuf.st_size + offset);
       }
     }
     if (offset > sbuf.st_size) {
-      offset = sbuf.st_size;
+      offset = static_cast<int>(sbuf.st_size);
     }
     if (offset + bytes_to_corrupt > sbuf.st_size) {
-      bytes_to_corrupt = sbuf.st_size - offset;
+      bytes_to_corrupt = static_cast<int>(sbuf.st_size - offset);
     }
 
     // Do it
@@ -173,7 +177,7 @@ class CorruptionTest {
           type == filetype &&
           static_cast<int>(number) > picked_number) {  // Pick latest file
         fname = dbname_ + "/" + filenames[i];
-        picked_number = number;
+        picked_number = static_cast<int>(number);
       }
     }
     ASSERT_TRUE(!fname.empty()) << filetype;
@@ -222,27 +226,30 @@ class CorruptionTest {
   }
 };
 
-TEST(CorruptionTest, Recovery) {
+TEST_F(CorruptionTest, Recovery) {
   Build(100);
   Check(100, 100);
   Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
   Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
-  Reopen();
+  ASSERT_TRUE(!TryReopen().ok());
+  options_.paranoid_checks = false;
+  Reopen(&options_);
 
   // The 64 records in the first two log blocks are completely lost.
   Check(36, 36);
 }
 
-TEST(CorruptionTest, RecoverWriteError) {
+TEST_F(CorruptionTest, RecoverWriteError) {
   env_.writable_file_error_ = true;
   Status s = TryReopen();
   ASSERT_TRUE(!s.ok());
 }
 
-TEST(CorruptionTest, NewFileErrorDuringWrite) {
+TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
   // Do enough writing to force minor compaction
   env_.writable_file_error_ = true;
-  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  const int num =
+      static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
   std::string value_storage;
   Status s;
   bool failed = false;
@@ -261,7 +268,7 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) {
   Reopen();
 }
 
-TEST(CorruptionTest, TableFile) {
+TEST_F(CorruptionTest, TableFile) {
   Build(100);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_FlushMemTable();
@@ -272,7 +279,7 @@ TEST(CorruptionTest, TableFile) {
   Check(99, 99);
 }
 
-TEST(CorruptionTest, TableFileIndexData) {
+TEST_F(CorruptionTest, TableFileIndexData) {
   Build(10000);  // Enough to build multiple Tables
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_FlushMemTable();
@@ -282,14 +289,14 @@ TEST(CorruptionTest, TableFileIndexData) {
   Check(5000, 9999);
 }
 
-TEST(CorruptionTest, MissingDescriptor) {
+TEST_F(CorruptionTest, MissingDescriptor) {
   Build(1000);
   RepairDB();
   Reopen();
   Check(1000, 1000);
 }
 
-TEST(CorruptionTest, SequenceNumberRecovery) {
+TEST_F(CorruptionTest, SequenceNumberRecovery) {
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
@@ -310,7 +317,7 @@ TEST(CorruptionTest, SequenceNumberRecovery) {
   ASSERT_EQ("v6", v);
 }
 
-TEST(CorruptionTest, CorruptedDescriptor) {
+TEST_F(CorruptionTest, CorruptedDescriptor) {
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_FlushMemTable();
@@ -327,7 +334,10 @@ TEST(CorruptionTest, CorruptedDescriptor) {
   ASSERT_EQ("hello", v);
 }
 
-TEST(CorruptionTest, CompactionInputError) {
+TEST_F(CorruptionTest, CompactionInputError) {
+  Options options;
+  options.max_background_flushes = 0;
+  Reopen(&options);
   Build(10);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_FlushMemTable();
@@ -342,11 +352,12 @@ TEST(CorruptionTest, CompactionInputError) {
   Check(10000, 10000);
 }
 
-TEST(CorruptionTest, CompactionInputErrorParanoid) {
+TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
   Options options;
   options.paranoid_checks = true;
   options.write_buffer_size = 131072;
   options.max_write_buffer_number = 2;
+  options.max_background_flushes = 0;
   Reopen(&options);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
 
@@ -384,7 +395,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) {
   ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
 }
 
-TEST(CorruptionTest, UnrelatedKeys) {
+TEST_F(CorruptionTest, UnrelatedKeys) {
   Build(10);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_FlushMemTable();
@@ -400,7 +411,7 @@ TEST(CorruptionTest, UnrelatedKeys) {
   ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
 }
 
-TEST(CorruptionTest, FileSystemStateCorrupted) {
+TEST_F(CorruptionTest, FileSystemStateCorrupted) {
   for (int iter = 0; iter < 2; ++iter) {
     Options options;
     options.paranoid_checks = true;
@@ -436,5 +447,6 @@ TEST(CorruptionTest, FileSystemStateCorrupted) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
new file mode 100644
index 0000000..8c2113b
--- /dev/null
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -0,0 +1,321 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "table/meta_blocks.h"
+#include "table/cuckoo_table_factory.h"
+#include "table/cuckoo_table_reader.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// Fixture for DB tests running on the cuckoo-table SST format.
+class CuckooTableDBTest : public testing::Test {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+ public:
+  CuckooTableDBTest() : env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/cuckoo_table_db_test";
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~CuckooTableDBTest() {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Cuckoo table factory + hash-linklist memtable.
+  // NOTE(review): allow_mmap_reads appears to be needed by the cuckoo table
+  // reader -- confirm against cuckoo_table_reader.
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(NewCuckooTableFactory());
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+    options.allow_mmap_reads = true;
+    options.create_if_missing = true;
+    options.max_mem_compaction_level = 0;
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  // The following util methods are copied from plain_table_db_test.
+  void Reopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    ASSERT_OK(DB::Open(opts, dbname_, &db_));
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  // Returns the value, "NOT_FOUND" for a missing key, or the error status
+  // string on any other failure.
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(level), &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  // e.g. "3,2" == 3 files in L0 and 2 in L1; trailing all-zero levels
+  // are trimmed off.
+  std::string FilesPerLevel() {
+    std::string result;
+    size_t last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+};
+
+// Flushes three times and verifies each flush produces one new L0 SST with
+// the expected entry count and readable contents.  The third flush holds
+// only deletions, which still materialize as 3 entries in their own file.
+TEST_F(CuckooTableDBTest, Flush) {
+  // Try with empty DB first.
+  ASSERT_TRUE(dbfull() != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("key2"));
+
+  // Add some values to db.
+  Options options = CurrentOptions();
+  Reopen(&options);
+
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key3", "v3"));
+  dbfull()->TEST_FlushMemTable();
+
+  TablePropertiesCollection ptc;
+  reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+  ASSERT_EQ(1U, ptc.size());
+  ASSERT_EQ(3U, ptc.begin()->second->num_entries);
+  ASSERT_EQ("1", FilesPerLevel());
+
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("NOT_FOUND", Get("key4"));
+
+  // Now add more keys and flush.
+  ASSERT_OK(Put("key4", "v4"));
+  ASSERT_OK(Put("key5", "v5"));
+  ASSERT_OK(Put("key6", "v6"));
+  dbfull()->TEST_FlushMemTable();
+
+  reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+  ASSERT_EQ(2U, ptc.size());
+  auto row = ptc.begin();
+  ASSERT_EQ(3U, row->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("v4", Get("key4"));
+  ASSERT_EQ("v5", Get("key5"));
+  ASSERT_EQ("v6", Get("key6"));
+
+  // Third flush: deletions only.
+  ASSERT_OK(Delete("key6"));
+  ASSERT_OK(Delete("key5"));
+  ASSERT_OK(Delete("key4"));
+  dbfull()->TEST_FlushMemTable();
+  reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+  ASSERT_EQ(3U, ptc.size());
+  row = ptc.begin();
+  ASSERT_EQ(3U, row->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ("3", FilesPerLevel());
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("NOT_FOUND", Get("key4"));
+  ASSERT_EQ("NOT_FOUND", Get("key5"));
+  ASSERT_EQ("NOT_FOUND", Get("key6"));
+}
+
+// A key written twice before a flush must be deduplicated within the SST
+// (2 entries, latest value wins on read).
+TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
+  Options options = CurrentOptions();
+  Reopen(&options);
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key1", "v3"));  // Duplicate
+  dbfull()->TEST_FlushMemTable();
+
+  TablePropertiesCollection ptc;
+  reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+  ASSERT_EQ(1U, ptc.size());
+  ASSERT_EQ(2U, ptc.begin()->second->num_entries);
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_EQ("v3", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+}
+
+namespace {
// Renders i as "key_______" followed by a zero-padded 6-digit decimal
// (the sign, if any, precedes the padding), matching printf's "%06d".
static std::string Key(int i) {
  std::string digits = std::to_string(i);
  std::string sign;
  if (!digits.empty() && digits[0] == '-') {
    sign = "-";
    digits.erase(0, 1);
  }
  while (sign.size() + digits.size() < 6) {
    digits.insert(digits.begin(), '0');
  }
  return "key_______" + sign + digits;
}
// Encodes i as its raw 8-byte (host-endian) in-memory representation.
static std::string Uint64Key(uint64_t i) {
  std::string encoded(sizeof(i), '\0');
  memcpy(&encoded[0], &i, sizeof(i));
  return encoded;
}
+}  // namespace.
+
+// Cuckoo table under a fixed-width uint64 comparator: basic put/get,
+// delete, and update across multiple flushes.
+TEST_F(CuckooTableDBTest, Uint64Comparator) {
+  Options options = CurrentOptions();
+  options.comparator = test::Uint64Comparator();
+  Reopen(&options);
+
+  ASSERT_OK(Put(Uint64Key(1), "v1"));
+  ASSERT_OK(Put(Uint64Key(2), "v2"));
+  ASSERT_OK(Put(Uint64Key(3), "v3"));
+  dbfull()->TEST_FlushMemTable();
+
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("v2", Get(Uint64Key(2)));
+  ASSERT_EQ("v3", Get(Uint64Key(3)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+  // Add more keys.
+  ASSERT_OK(Delete(Uint64Key(2)));  // Delete.
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_OK(Put(Uint64Key(3), "v0"));  // Update.
+  ASSERT_OK(Put(Uint64Key(4), "v4"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+  ASSERT_EQ("v0", Get(Uint64Key(3)));
+  ASSERT_EQ("v4", Get(Uint64Key(4)));
+}
+
+// A single large L0 file must split into multiple L1 files when compacted
+// with a target_file_size_base smaller than the total data size.
+TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+  // Create a big L0 file and check it compacts into multiple files in L1.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 270 << 10;
+  // Two SST files should be created, each containing 14 keys.
+  // Number of buckets will be 16. Total size ~156 KB.
+  options.target_file_size_base = 160 << 10;
+  Reopen(&options);
+
+  // Write 28 values, each 10016 B ~ 10KB
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ("1", FilesPerLevel());
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_EQ("0,2", FilesPerLevel());
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
+  }
+}
+
+TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
+  // Insert same key twice so that they go to different SST files. Then wait for
+  // compaction and check if the latest value is stored and old value removed.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(&options);
+
+  // Write 11 values, each 10016 B
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ("1", FilesPerLevel());
+
+  // Generate one more file in level-0, and should trigger level-0 compaction
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+  // Only one file remains and it holds the second-round values.
+  ASSERT_EQ("0,1", FilesPerLevel());
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
+  }
+}
+
+// Mixes cuckoo, plain, and block-based SSTs in one DB and verifies the
+// adaptive table factory can read across all three formats.
+TEST_F(CuckooTableDBTest, AdaptiveTable) {
+  Options options = CurrentOptions();
+
+  // Write some keys using cuckoo table.
+  options.table_factory.reset(NewCuckooTableFactory());
+  Reopen(&options);
+
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key3", "v3"));
+  dbfull()->TEST_FlushMemTable();
+
+  // Write some keys using plain table.
+  options.create_if_missing = false;
+  options.table_factory.reset(NewPlainTableFactory());
+  Reopen(&options);
+  ASSERT_OK(Put("key4", "v4"));
+  ASSERT_OK(Put("key1", "v5"));
+  dbfull()->TEST_FlushMemTable();
+
+  // Write some keys using block based table.
+  std::shared_ptr<TableFactory> block_based_factory(
+      NewBlockBasedTableFactory());
+  options.table_factory.reset(NewAdaptiveTableFactory(block_based_factory));
+  Reopen(&options);
+  ASSERT_OK(Put("key5", "v6"));
+  ASSERT_OK(Put("key2", "v7"));
+  dbfull()->TEST_FlushMemTable();
+
+  // Latest values win regardless of which format holds them.
+  ASSERT_EQ("v5", Get("key1"));
+  ASSERT_EQ("v7", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("v4", Get("key4"));
+  ASSERT_EQ("v6", Get("key5"));
+}
+}  // namespace rocksdb
+
+// Standard gtest entry point.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_bench.cc b/src/rocksdb/db/db_bench.cc
index 2e8da9e..e4fc1c4 100644
--- a/src/rocksdb/db/db_bench.cc
+++ b/src/rocksdb/db/db_bench.cc
@@ -7,7 +7,25 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#ifdef NUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <unistd.h>
+#include <fcntl.h>
 #include <inttypes.h>
 #include <cstddef>
 #include <sys/types.h>
@@ -16,7 +34,6 @@
 #include <gflags/gflags.h>
 #include "db/db_impl.h"
 #include "db/version_set.h"
-#include "rocksdb/statistics.h"
 #include "rocksdb/options.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/db.h"
@@ -24,12 +41,15 @@
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/write_batch.h"
 #include "rocksdb/slice.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/rate_limiter.h"
 #include "rocksdb/slice_transform.h"
-#include "rocksdb/statistics.h"
 #include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/flashcache.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "util/crc32c.h"
+#include "util/compression.h"
 #include "util/histogram.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
@@ -40,6 +60,9 @@
 #include "hdfs/env_hdfs.h"
 #include "utilities/merge_operators.h"
 
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::RegisterFlagValidator;
+using GFLAGS::SetUsageMessage;
 
 DEFINE_string(benchmarks,
               "fillseq,"
@@ -51,6 +74,7 @@ DEFINE_string(benchmarks,
               "newiteratorwhilewriting,"
               "seekrandom,"
               "seekrandomwhilewriting,"
+              "seekrandomwhilemerging,"
               "readseq,"
               "readreverse,"
               "compact,"
@@ -60,6 +84,7 @@ DEFINE_string(benchmarks,
               "readtocache,"
               "readreverse,"
               "readwhilewriting,"
+              "readwhilemerging,"
               "readrandomwriterandom,"
               "updaterandom,"
               "randomwithverify,"
@@ -68,7 +93,8 @@ DEFINE_string(benchmarks,
               "xxhash,"
               "compress,"
               "uncompress,"
-              "acquireload,",
+              "acquireload,"
+              "fillseekseq,",
 
               "Comma-separated list of operations to run in the specified order"
               "Actual benchmarks:\n"
@@ -93,6 +119,8 @@ DEFINE_string(benchmarks,
               "of DB\n"
               "\treadwhilewriting      -- 1 writer, N threads doing random "
               "reads\n"
+              "\treadwhilemerging      -- 1 merger, N threads doing random "
+              "reads\n"
               "\treadrandomwriterandom -- N threads doing random-read, "
               "random-write\n"
               "\tprefixscanrandom      -- prefix scan N times in random order\n"
@@ -106,11 +134,17 @@ DEFINE_string(benchmarks,
               "\treadrandommergerandom -- perform N random read-or-merge "
               "operations. Must be used with merge_operator\n"
               "\tnewiterator   -- repeated iterator creation\n"
-              "\tseekrandom    -- N random seeks\n"
-              "\tseekrandom    -- 1 writer, N threads doing random seeks\n"
+              "\tseekrandom    -- N random seeks, call Next seek_nexts times "
+              "per seek\n"
+              "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
+              "overwrite\n"
+              "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
+              "merge\n"
               "\tcrc32c        -- repeated crc32c of 4K of data\n"
               "\txxhash        -- repeated xxHash of 4K of data\n"
               "\tacquireload   -- load N*1000 times\n"
+              "\tfillseekseq   -- write N values in sequential key, then read "
+              "them by seeking to each key\n"
               "Meta operations:\n"
               "\tcompact     -- Compact the entire DB\n"
               "\tstats       -- Print DB stats\n"
@@ -130,6 +164,14 @@ DEFINE_int64(merge_keys, -1,
              "Number of distinct keys to use for MergeRandom and "
              "ReadRandomMergeRandom. "
              "If negative, there will be FLAGS_num keys.");
+DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
+
+DEFINE_int32(
+    num_hot_column_families, 0,
+    "Number of Hot Column Families. If more than 0, only write to this "
+    "number of column families. After finishing all the writes to them, "
+    "create new set of column families and insert to them. Only used "
+    "when num_column_families > 1.");
 
 DEFINE_int64(reads, -1, "Number of read operations to do.  "
              "If negative, do FLAGS_num reads.");
@@ -146,15 +188,20 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
 
 DEFINE_int32(value_size, 100, "Size of each value");
 
+DEFINE_int32(seek_nexts, 0,
+             "How many times to call Next() after Seek() in "
+             "fillseekseq, seekrandom, seekrandomwhilewriting and "
+             "seekrandomwhilemerging");
+
+DEFINE_bool(reverse_iterator, false,
+            "When true use Prev rather than Next for iterators that do "
+            "Seek and then Next");
+
+DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
+
+DEFINE_int64(batch_size, 1, "Batch size");
 
-// the maximum size of key in bytes
-static const int kMaxKeySize = 128;
 static bool ValidateKeySize(const char* flagname, int32_t value) {
-  if (value > kMaxKeySize) {
-    fprintf(stderr, "Invalid value for --%s: %d, must be < %d\n",
-            flagname, value, kMaxKeySize);
-    return false;
-  }
   return true;
 }
 
@@ -166,8 +213,25 @@ DEFINE_int32(num_multi_db, 0,
 DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
               " to this fraction of their original size after compression");
 
+DEFINE_double(read_random_exp_range, 0.0,
+              "Read random's key will be generated using distribution of "
+              "num * exp(r) where r is uniform number from 0 to this value. "
+              "The larger the number is, the more skewed the reads are. "
+              "Only used in readrandom and multireadrandom benchmarks.");
+
 DEFINE_bool(histogram, false, "Print histogram of operation timings");
 
+DEFINE_bool(enable_numa, false,
+            "Make operations aware of NUMA architecture and bind memory "
+            "and cpus corresponding to nodes together. In NUMA, memory "
+            "in same node as CPUs are closer when compared to memory in "
+            "other nodes. Reads can be faster when the process is bound to "
+            "CPU and memory of same node. Use \"$numactl --hardware\" command "
+            "to see NUMA memory architecture.");
+
+DEFINE_int64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size,
+             "Number of bytes to buffer in all memtables before compacting");
+
 DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
              "Number of bytes to buffer in memtable before compacting");
 
@@ -221,9 +285,15 @@ DEFINE_int32(universal_compression_size_percent, -1,
 DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed"
              "data. Negative means use default settings.");
 
-DEFINE_int32(block_size, rocksdb::Options().block_size,
+DEFINE_int32(block_size,
+             static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
              "Number of bytes in a block.");
 
+DEFINE_int32(block_restart_interval,
+             rocksdb::BlockBasedTableOptions().block_restart_interval,
+             "Number of keys between restart points "
+             "for delta encoding of keys.");
+
 DEFINE_int64(compressed_cache_size, -1,
              "Number of bytes to use as a cache of compressed data.");
 
@@ -254,8 +324,6 @@ DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache"
              " is 2 ** cache_numshardbits. Negative means use default settings."
              " This is applied only if FLAGS_cache_size is non-negative.");
 
-DEFINE_int32(cache_remove_scan_count_limit, 32, "");
-
 DEFINE_bool(verify_checksum, false, "Verify checksum for every block read"
             " from storage");
 
@@ -265,8 +333,9 @@ static class std::shared_ptr<rocksdb::Statistics> dbstats;
 DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
              " --num reads.");
 
-DEFINE_int32(writes_per_second, 0, "Per-thread rate limit on writes per second."
-             " No limit when <= 0. Only for the readwhilewriting test.");
+DEFINE_int32(writes_per_second, 0, "Per-thread rate limit on writes and merges "
+             "  per second. No limit when <= 0. Only for the readwhilewriting "
+             "  and readwhilemerging tests.");
 
 DEFINE_bool(sync, false, "Sync all writes to disk");
 
@@ -281,13 +350,16 @@ DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
 
 DEFINE_int32(num_levels, 7, "The total number of levels");
 
-DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
+DEFINE_int64(target_file_size_base, 2 * 1048576, "Target file size at level-1");
 
 DEFINE_int32(target_file_size_multiplier, 1,
              "A multiplier to compute target level-N file size (N >= 2)");
 
 DEFINE_uint64(max_bytes_for_level_base,  10 * 1048576, "Max bytes for level-1");
 
+DEFINE_bool(level_compaction_dynamic_level_bytes, false,
+            "Whether level size base is dynamic");
+
 DEFINE_int32(max_bytes_for_level_multiplier, 10,
              "A multiplier to compute max bytes for level-N (N >= 2)");
 
@@ -328,12 +400,8 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
              "deletepercent), so deletepercent must be smaller than (100 - "
              "FLAGS_readwritepercent)");
 
-DEFINE_int32(disable_seek_compaction, false, "Option to disable compaction"
-             " triggered by read.");
-
-DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete "
-              "obsolete files periodically. 0 means that obsolete files are"
-              " deleted after every compaction run.");
+DEFINE_uint64(delete_obsolete_files_period_micros, 0,
+              "Ignored. Left here for backward compatibility");
 
 namespace {
 enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
@@ -355,6 +423,16 @@ enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
   fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
   return rocksdb::kSnappyCompression; //default value
 }
+
+std::string ColumnFamilyName(size_t i) {
+  if (i == 0) {
+    return rocksdb::kDefaultColumnFamilyName;
+  } else {
+    char name[100];
+    snprintf(name, sizeof(name), "column_family_name_%06zu", i);
+    return std::string(name);
+  }
+}
 }  // namespace
 
 DEFINE_string(compression_type, "snappy",
@@ -376,8 +454,7 @@ static bool ValidateCompressionLevel(const char* flagname, int32_t value) {
 }
 
 static const bool FLAGS_compression_level_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_compression_level,
-                                  &ValidateCompressionLevel);
+    RegisterFlagValidator(&FLAGS_compression_level, &ValidateCompressionLevel);
 
 DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
              " from this level. Levels with number < min_level_to_compress are"
@@ -402,9 +479,16 @@ static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
 DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
              "this is greater than zero. When 0 the interval grows over time.");
 
+DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
+             "overrides stats_interval when both are > 0.");
+
 DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
              " this is greater than 0.");
 
+DEFINE_int32(thread_status_per_interval, 0,
+             "Takes and report a snapshot of the current status of each thread"
+             " when this is greater than 0.");
+
 DEFINE_int32(perf_level, 0, "Level of perf collection");
 
 static bool ValidateRateLimit(const char* flagname, double value) {
@@ -426,6 +510,8 @@ DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
              "When hard_rate_limit is set then this is the max time a put will"
              " be stalled.");
 
+DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
+
 DEFINE_int32(max_grandparent_overlap_factor, 10, "Control maximum bytes of "
              "overlaps in grandparent (i.e., level+2) before we stop building a"
              " single file in a level->level+1 compaction.");
@@ -441,6 +527,7 @@ DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for"
 DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
 DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
               " in MB.");
+DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
 
 DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer,
             "Allow buffered io using OS buffers");
@@ -459,6 +546,11 @@ DEFINE_string(compaction_fadvice, "NORMAL",
 static auto FLAGS_compaction_fadvice_e =
   rocksdb::Options().access_hint_on_compaction_start;
 
+DEFINE_bool(disable_flashcache_for_background_threads, false,
+            "Disable flashcache for background threads");
+
+DEFINE_string(flashcache_dev, "", "Path to flashcache device");
+
 DEFINE_bool(use_tailing_iterator, false,
             "Use tailing iterator to access a series of keys instead of get");
 
@@ -466,9 +558,15 @@ DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
             "Use adaptive mutex");
 
 DEFINE_uint64(bytes_per_sync,  rocksdb::Options().bytes_per_sync,
-              "Allows OS to incrementally sync files to disk while they are"
+              "Allows OS to incrementally sync SST files to disk while they are"
               " being written, in the background. Issue one request for every"
               " bytes_per_sync written. 0 turns it off.");
+
+DEFINE_uint64(wal_bytes_per_sync,  rocksdb::Options().wal_bytes_per_sync,
+              "Allows OS to incrementally sync WAL files to disk while they are"
+              " being written, in the background. Issue one request for every"
+              " wal_bytes_per_sync written. 0 turns it off.");
+
 DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop"
             " the delete if key not present");
 
@@ -488,6 +586,16 @@ DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
 DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
              "per prefix, 0 means no special handling of the prefix, "
              "i.e. use the prefix comes with the generated random number.");
+DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
+            "threads' IO priority");
+DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
+            "table becomes an identity function. This is only valid when key "
+            "is 8 bytes");
+
+enum PutOrMerge {
+  kPut,
+  kMerge
+};
 
 enum RepFactory {
   kSkipList,
@@ -522,43 +630,179 @@ DEFINE_string(memtablerep, "skip_list", "");
 DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
 DEFINE_bool(use_plain_table, false, "if use plain table "
             "instead of block-based table format");
-
+DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
+DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
+DEFINE_bool(use_hash_search, false, "if use kHashSearch "
+            "instead of kBinarySearch. "
+            "This is valid if only we use BlockTable");
+DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
+            "instead of kFullFilter for filter block. "
+            "This is valid if only we use BlockTable");
 DEFINE_string(merge_operator, "", "The merge operator to use with the database."
               "If a new merge operator is specified, be sure to use fresh"
               " database The possible merge operators are defined in"
               " utilities/merge_operators.h");
+DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
+             "linear search first for this many steps from the previous "
+             "position");
+DEFINE_bool(report_file_operations, false, "if report number of file "
+            "operations");
 
 static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_soft_rate_limit,
-                                &ValidateRateLimit);
+    RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
 
 static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
+    RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
 
 static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
 
 static const bool FLAGS_key_size_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
+    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
 
 static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_cache_numshardbits,
-                                &ValidateCacheNumshardbits);
+    RegisterFlagValidator(&FLAGS_cache_numshardbits,
+                          &ValidateCacheNumshardbits);
 
 static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_readwritepercent,
-                                &ValidateInt32Percent);
+    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
+
+DEFINE_int32(disable_seek_compaction, false,
+             "Not used, left here for backwards compatibility");
 
 static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_deletepercent,
-                                &ValidateInt32Percent);
-static const bool
-  FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
-                                &ValidateTableCacheNumshardbits);
+    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
+static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
+                          &ValidateTableCacheNumshardbits);
 
 namespace rocksdb {
 
+namespace {
+struct ReportFileOpCounters {
+  std::atomic<int> open_counter_;
+  std::atomic<int> read_counter_;
+  std::atomic<int> append_counter_;
+  std::atomic<uint64_t> bytes_read_;
+  std::atomic<uint64_t> bytes_written_;
+};
+
+// A special Env to records and report file operations in db_bench
+class ReportFileOpEnv : public EnvWrapper {
+ public:
+  explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); }
+
+  void reset() {
+    counters_.open_counter_ = 0;
+    counters_.read_counter_ = 0;
+    counters_.append_counter_ = 0;
+    counters_.bytes_read_ = 0;
+    counters_.bytes_written_ = 0;
+  }
+
+  Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
+                           const EnvOptions& soptions) override {
+    class CountingFile : public SequentialFile {
+     private:
+      unique_ptr<SequentialFile> target_;
+      ReportFileOpCounters* counters_;
+
+     public:
+      CountingFile(unique_ptr<SequentialFile>&& target,
+                   ReportFileOpCounters* counters)
+          : target_(std::move(target)), counters_(counters) {}
+
+      virtual Status Read(size_t n, Slice* result, char* scratch) override {
+        counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
+        Status rv = target_->Read(n, result, scratch);
+        counters_->bytes_read_.fetch_add(result->size(),
+                                         std::memory_order_relaxed);
+        return rv;
+      }
+
+      virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+    };
+
+    Status s = target()->NewSequentialFile(f, r, soptions);
+    if (s.ok()) {
+      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
+      r->reset(new CountingFile(std::move(*r), counters()));
+    }
+    return s;
+  }
+
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& soptions) override {
+    class CountingFile : public RandomAccessFile {
+     private:
+      unique_ptr<RandomAccessFile> target_;
+      ReportFileOpCounters* counters_;
+
+     public:
+      CountingFile(unique_ptr<RandomAccessFile>&& target,
+                   ReportFileOpCounters* counters)
+          : target_(std::move(target)), counters_(counters) {}
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const override {
+        counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
+        Status rv = target_->Read(offset, n, result, scratch);
+        counters_->bytes_read_.fetch_add(result->size(),
+                                         std::memory_order_relaxed);
+        return rv;
+      }
+    };
+
+    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    if (s.ok()) {
+      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
+      r->reset(new CountingFile(std::move(*r), counters()));
+    }
+    return s;
+  }
+
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) override {
+    class CountingFile : public WritableFile {
+     private:
+      unique_ptr<WritableFile> target_;
+      ReportFileOpCounters* counters_;
+
+     public:
+      CountingFile(unique_ptr<WritableFile>&& target,
+                   ReportFileOpCounters* counters)
+          : target_(std::move(target)), counters_(counters) {}
+
+      Status Append(const Slice& data) override {
+        counters_->append_counter_.fetch_add(1, std::memory_order_relaxed);
+        Status rv = target_->Append(data);
+        counters_->bytes_written_.fetch_add(data.size(),
+                                            std::memory_order_relaxed);
+        return rv;
+      }
+
+      Status Close() override { return target_->Close(); }
+      Status Flush() override { return target_->Flush(); }
+      Status Sync() override { return target_->Sync(); }
+    };
+
+    Status s = target()->NewWritableFile(f, r, soptions);
+    if (s.ok()) {
+      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
+      r->reset(new CountingFile(std::move(*r), counters()));
+    }
+    return s;
+  }
+
+  // getter
+  ReportFileOpCounters* counters() { return &counters_; }
+
+ private:
+  ReportFileOpCounters counters_;
+};
+
+}  // namespace
+
 // Helper for quickly generating random data.
 class RandomGenerator {
  private:
@@ -582,9 +826,9 @@ class RandomGenerator {
   }
 
   Slice Generate(unsigned int len) {
+    assert(len <= data_.size());
     if (pos_ + len > data_.size()) {
       pos_ = 0;
-      assert(len < data_.size());
     }
     pos_ += len;
     return Slice(data_.data() + pos_ - len, len);
@@ -599,6 +843,57 @@ static void AppendWithSpace(std::string* str, Slice msg) {
   str->append(msg.data(), msg.size());
 }
 
+struct DBWithColumnFamilies {
+  std::vector<ColumnFamilyHandle*> cfh;
+  DB* db;
+  std::atomic<size_t> num_created;  // Need to be updated after all the
+                                    // new entries in cfh are set.
+  size_t num_hot;  // Number of column families to be queried at each moment.
+                   // After each CreateNewCf(), another num_hot number of new
+                   // Column families will be created and used to be queried.
+  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
+
+  DBWithColumnFamilies() : db(nullptr) {
+    cfh.clear();
+    num_created = 0;
+    num_hot = 0;
+  }
+
+  DBWithColumnFamilies(const DBWithColumnFamilies& other)
+      : cfh(other.cfh),
+        db(other.db),
+        num_created(other.num_created.load()),
+        num_hot(other.num_hot) {}
+
+  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
+    assert(num_hot > 0);
+    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
+               rand_num % num_hot];
+  }
+
+  // stage: assume CF from 0 to stage * num_hot has be created. Need to create
+  //        stage * num_hot + 1 to stage * (num_hot + 1).
+  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
+    MutexLock l(&create_cf_mutex);
+    if ((stage + 1) * num_hot <= num_created) {
+      // Already created.
+      return;
+    }
+    auto new_num_created = num_created + num_hot;
+    assert(new_num_created <= cfh.size());
+    for (size_t i = num_created; i < new_num_created; i++) {
+      Status s =
+          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
+      if (!s.ok()) {
+        fprintf(stderr, "create column family error: %s\n",
+                s.ToString().c_str());
+        abort();
+      }
+    }
+    num_created.store(new_num_created, std::memory_order_release);
+  }
+};
+
 class Stats {
  private:
   int id_;
@@ -662,7 +957,37 @@ class Stats {
   void SetId(int id) { id_ = id; }
   void SetExcludeFromMerge() { exclude_from_merge_ = true; }
 
-  void FinishedSingleOp(DB* db) {
+  void PrintThreadStatus() {
+    std::vector<ThreadStatus> thread_list;
+    FLAGS_env->GetThreadList(&thread_list);
+
+    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
+        "ThreadID", "ThreadType", "cfName", "Operation",
+        "ElapsedTime", "Stage", "State", "OperationProperties");
+
+    int64_t current_time = 0;
+    Env::Default()->GetCurrentTime(&current_time);
+    for (auto ts : thread_list) {
+      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
+          ts.thread_id,
+          ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
+          ts.cf_name.c_str(),
+          ThreadStatus::GetOperationName(ts.operation_type).c_str(),
+          ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
+          ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
+          ThreadStatus::GetStateName(ts.state_type).c_str());
+
+      auto op_properties = ThreadStatus::InterpretOperationProperties(
+          ts.operation_type, ts.op_properties);
+      for (const auto& op_prop : op_properties) {
+        fprintf(stderr, " %s %" PRIu64" |",
+            op_prop.first.c_str(), op_prop.second);
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops) {
     if (FLAGS_histogram) {
       double now = FLAGS_env->NowMicros();
       double micros = now - last_op_finish_;
@@ -674,7 +999,7 @@ class Stats {
       last_op_finish_ = now;
     }
 
-    done_++;
+    done_ += num_ops;
     if (done_ >= next_report_) {
       if (!FLAGS_stats_interval) {
         if      (next_report_ < 1000)   next_report_ += 100;
@@ -685,32 +1010,56 @@ class Stats {
         else if (next_report_ < 500000) next_report_ += 50000;
         else                            next_report_ += 100000;
         fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
-        fflush(stderr);
       } else {
         double now = FLAGS_env->NowMicros();
-        fprintf(stderr,
-                "%s ... thread %d: (%" PRIu64 ",%" PRIu64 ") ops and "
-                "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
-                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
-                id_,
-                done_ - last_report_done_, done_,
-                (done_ - last_report_done_) /
-                ((now - last_report_finish_) / 1000000.0),
-                done_ / ((now - start_) / 1000000.0),
-                (now - last_report_finish_) / 1000000.0,
-                (now - start_) / 1000000.0);
-
-        if (FLAGS_stats_per_interval) {
-          std::string stats;
-          if (db && db->GetProperty("rocksdb.stats", &stats))
-            fprintf(stderr, "%s\n", stats.c_str());
-        }
+        int64_t usecs_since_last = now - last_report_finish_;
 
-        fflush(stderr);
-        next_report_ += FLAGS_stats_interval;
-        last_report_finish_ = now;
-        last_report_done_ = done_;
+        // Determine whether to print status where interval is either
+        // each N operations or each N seconds.
+
+        if (FLAGS_stats_interval_seconds &&
+            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
+          // Don't check again for this many operations
+          next_report_ += FLAGS_stats_interval;
+
+        } else {
+
+          fprintf(stderr,
+                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64 ") ops and "
+                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
+                  FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
+                  id_,
+                  done_ - last_report_done_, done_,
+                  (done_ - last_report_done_) /
+                  (usecs_since_last / 1000000.0),
+                  done_ / ((now - start_) / 1000000.0),
+                  (now - last_report_finish_) / 1000000.0,
+                  (now - start_) / 1000000.0);
+
+          if (FLAGS_stats_per_interval) {
+            std::string stats;
+
+            if (db_with_cfh && db_with_cfh->num_created.load()) {
+              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
+                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
+                                    &stats))
+                  fprintf(stderr, "%s\n", stats.c_str());
+              }
+
+            } else if (db && db->GetProperty("rocksdb.stats", &stats)) {
+              fprintf(stderr, "%s\n", stats.c_str());
+            }
+          }
+
+          next_report_ += FLAGS_stats_interval;
+          last_report_finish_ = now;
+          last_report_done_ = done_;
+        }
+      }
+      if (id_ == 0 && FLAGS_thread_status_per_interval) {
+        PrintThreadStatus();
       }
+      fflush(stderr);
     }
   }
 
@@ -720,7 +1069,7 @@ class Stats {
 
   void Report(const Slice& name) {
     // Pretend at least one op was done in case we are running a benchmark
-    // that does not call FinishedSingleOp().
+    // that does not call FinishedOps().
     if (done_ < 1) done_ = 1;
 
     std::string extra;
@@ -746,6 +1095,21 @@ class Stats {
     if (FLAGS_histogram) {
       fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
     }
+    if (FLAGS_report_file_operations) {
+      ReportFileOpEnv* env = static_cast<ReportFileOpEnv*>(FLAGS_env);
+      ReportFileOpCounters* counters = env->counters();
+      fprintf(stdout, "Num files opened: %d\n",
+              counters->open_counter_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num Read(): %d\n",
+              counters->read_counter_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num Append(): %d\n",
+              counters->append_counter_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num bytes read: %" PRIu64 "\n",
+              counters->bytes_read_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num bytes written: %" PRIu64 "\n",
+              counters->bytes_written_.load(std::memory_order_relaxed));
+      env->reset();
+    }
     fflush(stdout);
   }
 };
@@ -785,13 +1149,16 @@ struct ThreadState {
 
 class Duration {
  public:
-  Duration(int max_seconds, int64_t max_ops) {
+  Duration(int max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
     max_seconds_ = max_seconds;
     max_ops_= max_ops;
+    ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
     ops_ = 0;
     start_at_ = FLAGS_env->NowMicros();
   }
 
+  int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
+
   bool Done(int64_t increment) {
     if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
     ops_ += increment;
@@ -812,18 +1179,19 @@ class Duration {
  private:
   int max_seconds_;
   int64_t max_ops_;
+  int64_t ops_per_stage_;
   int64_t ops_;
   double start_at_;
 };
 
 class Benchmark {
  private:
-  shared_ptr<Cache> cache_;
-  shared_ptr<Cache> compressed_cache_;
-  const FilterPolicy* filter_policy_;
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> compressed_cache_;
+  std::shared_ptr<const FilterPolicy> filter_policy_;
   const SliceTransform* prefix_extractor_;
-  DB* db_;
-  std::vector<DB*> multi_dbs_;
+  DBWithColumnFamilies db_;
+  std::vector<DBWithColumnFamilies> multi_dbs_;
   int64_t num_;
   int value_size_;
   int key_size_;
@@ -831,10 +1199,23 @@ class Benchmark {
   int64_t keys_per_prefix_;
   int64_t entries_per_batch_;
   WriteOptions write_options_;
+  Options open_options_;  // keep options around to properly destroy db later
   int64_t reads_;
+  double read_random_exp_range_;
   int64_t writes_;
   int64_t readwrites_;
   int64_t merge_keys_;
+  bool report_file_operations_;
+  int cachedev_fd_;
+
+  bool SanityCheck() {
+    if (FLAGS_compression_ratio > 1) {
+      fprintf(stderr, "compression_ratio should be between 0 and 1\n");
+      return false;
+    }
+    return true;
+  }
+
   void PrintHeader() {
     PrintEnvironment();
     fprintf(stdout, "Keys:       %d bytes each\n", FLAGS_key_size);
@@ -852,6 +1233,18 @@ class Benchmark {
               * num_)
              / 1048576.0));
     fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
+    if (FLAGS_enable_numa) {
+      fprintf(stderr, "Running in NUMA enabled mode.\n");
+#ifndef NUMA
+      fprintf(stderr, "NUMA is not defined in the system.\n");
+      exit(1);
+#else
+      if (numa_available() == -1) {
+        fprintf(stderr, "NUMA is not supported by the system.\n");
+        exit(1);
+      }
+#endif
+    }
     switch (FLAGS_compression_type_e) {
       case rocksdb::kNoCompression:
         fprintf(stdout, "Compression: none\n");
@@ -918,28 +1311,28 @@ class Benchmark {
       text[len] = '\0';
       switch (FLAGS_compression_type_e) {
         case kSnappyCompression:
-          result = port::Snappy_Compress(Options().compression_opts, text,
-                                         strlen(text), &compressed);
+          result = Snappy_Compress(Options().compression_opts, text,
+                                   strlen(text), &compressed);
           name = "Snappy";
           break;
         case kZlibCompression:
-          result = port::Zlib_Compress(Options().compression_opts, text,
-                                       strlen(text), &compressed);
+          result = Zlib_Compress(Options().compression_opts, 2, text,
+                                 strlen(text), &compressed);
           name = "Zlib";
           break;
         case kBZip2Compression:
-          result = port::BZip2_Compress(Options().compression_opts, text,
-                                        strlen(text), &compressed);
+          result = BZip2_Compress(Options().compression_opts, 2, text,
+                                  strlen(text), &compressed);
           name = "BZip2";
           break;
         case kLZ4Compression:
-          result = port::LZ4_Compress(Options().compression_opts, text,
-                                      strlen(text), &compressed);
+          result = LZ4_Compress(Options().compression_opts, 2, text,
+                                strlen(text), &compressed);
           name = "LZ4";
           break;
         case kLZ4HCCompression:
-          result = port::LZ4HC_Compress(Options().compression_opts, text,
-                                        strlen(text), &compressed);
+          result = LZ4HC_Compress(Options().compression_opts, 2, text,
+                                  strlen(text), &compressed);
           name = "LZ4HC";
           break;
         case kNoCompression:
@@ -964,7 +1357,7 @@ class Benchmark {
     while (start < s.size() && isspace(s[start])) {
       start++;
     }
-    unsigned int limit = s.size();
+    unsigned int limit = static_cast<unsigned int>(s.size());
     while (limit > start && isspace(s[limit-1])) {
       limit--;
     }
@@ -1009,32 +1402,49 @@ class Benchmark {
 
  public:
   Benchmark()
-  : cache_(FLAGS_cache_size >= 0 ?
-           (FLAGS_cache_numshardbits >= 1 ?
-            NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits,
-                        FLAGS_cache_remove_scan_count_limit) :
-            NewLRUCache(FLAGS_cache_size)) : nullptr),
-    compressed_cache_(FLAGS_compressed_cache_size >= 0 ?
-           (FLAGS_cache_numshardbits >= 1 ?
-            NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) :
-            NewLRUCache(FLAGS_compressed_cache_size)) : nullptr),
-    filter_policy_(FLAGS_bloom_bits >= 0
-                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
-                   : nullptr),
-    prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
-    db_(nullptr),
-    num_(FLAGS_num),
-    value_size_(FLAGS_value_size),
-    key_size_(FLAGS_key_size),
-    prefix_size_(FLAGS_prefix_size),
-    keys_per_prefix_(FLAGS_keys_per_prefix),
-    entries_per_batch_(1),
-    reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
-    writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
-    readwrites_((FLAGS_writes < 0  && FLAGS_reads < 0)? FLAGS_num :
-                ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)
-               ),
-    merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) {
+      : cache_(
+            FLAGS_cache_size >= 0
+                ? (FLAGS_cache_numshardbits >= 1
+                       ? NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits)
+                       : NewLRUCache(FLAGS_cache_size))
+                : nullptr),
+        compressed_cache_(FLAGS_compressed_cache_size >= 0
+                              ? (FLAGS_cache_numshardbits >= 1
+                                     ? NewLRUCache(FLAGS_compressed_cache_size,
+                                                   FLAGS_cache_numshardbits)
+                                     : NewLRUCache(FLAGS_compressed_cache_size))
+                              : nullptr),
+        filter_policy_(FLAGS_bloom_bits >= 0
+                           ? NewBloomFilterPolicy(FLAGS_bloom_bits,
+                                                  FLAGS_use_block_based_filter)
+                           : nullptr),
+        prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
+        num_(FLAGS_num),
+        value_size_(FLAGS_value_size),
+        key_size_(FLAGS_key_size),
+        prefix_size_(FLAGS_prefix_size),
+        keys_per_prefix_(FLAGS_keys_per_prefix),
+        entries_per_batch_(1),
+        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
+        read_random_exp_range_(0.0),
+        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
+        readwrites_(
+            (FLAGS_writes < 0 && FLAGS_reads < 0)
+                ? FLAGS_num
+                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
+        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
+        report_file_operations_(FLAGS_report_file_operations),
+        cachedev_fd_(-1) {
+    if (report_file_operations_) {
+      if (!FLAGS_hdfs.empty()) {
+        fprintf(stderr,
+                "--hdfs and --report_file_operations cannot be enabled "
+                "at the same time");
+        exit(1);
+      }
+      FLAGS_env = new ReportFileOpEnv(rocksdb::Env::Default());
+    }
+
     if (FLAGS_prefix_size > FLAGS_key_size) {
       fprintf(stderr, "prefix size is larger than key size");
       exit(1);
@@ -1048,18 +1458,35 @@ class Benchmark {
       }
     }
     if (!FLAGS_use_existing_db) {
-      DestroyDB(FLAGS_db, Options());
+      Options options;
+      if (!FLAGS_wal_dir.empty()) {
+        options.wal_dir = FLAGS_wal_dir;
+      }
+      DestroyDB(FLAGS_db, options);
     }
   }
 
   ~Benchmark() {
-    delete db_;
-    delete filter_policy_;
+    std::for_each(db_.cfh.begin(), db_.cfh.end(),
+                  [](ColumnFamilyHandle* cfh) { delete cfh; });
+    delete db_.db;
     delete prefix_extractor_;
+    if (cache_.get() != nullptr) {
+      // this will leak, but we're shutting down so nobody cares
+      cache_->DisownData();
+    }
+    if (FLAGS_disable_flashcache_for_background_threads && cachedev_fd_ != -1) {
+      // Dtor for this env should run before cachedev_fd_ is closed
+      flashcache_aware_env_ = nullptr;
+      close(cachedev_fd_);
+    }
   }
 
-  Slice AllocateKey() {
-    return Slice(new char[key_size_], key_size_);
+  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
+    char* data = new char[key_size_];
+    const char* const_data = data;
+    key_guard->reset(const_data);
+    return Slice(key_guard->get(), key_size_);
   }
 
   // Generate key according to the given specification and random number.
@@ -1110,12 +1537,15 @@ class Benchmark {
   }
 
   std::string GetDbNameForMultiple(std::string base_name, size_t id) {
-    return base_name + std::to_string(id);
+    return base_name + ToString(id);
   }
 
   void Run() {
+    if (!SanityCheck()) {
+      exit(1);
+    }
     PrintHeader();
-    Open();
+    Open(&open_options_);
     const char* benchmarks = FLAGS_benchmarks.c_str();
     while (benchmarks != nullptr) {
       const char* sep = strchr(benchmarks, ',');
@@ -1134,8 +1564,9 @@ class Benchmark {
       writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
       value_size_ = FLAGS_value_size;
       key_size_ = FLAGS_key_size;
-      entries_per_batch_ = 1;
+      entries_per_batch_ = FLAGS_batch_size;
       write_options_ = WriteOptions();
+      read_random_exp_range_ = FLAGS_read_random_exp_range;
       if (FLAGS_sync) {
         write_options_.sync = true;
       }
@@ -1186,7 +1617,11 @@ class Benchmark {
         method = &Benchmark::ReadReverse;
       } else if (name == Slice("readrandom")) {
         method = &Benchmark::ReadRandom;
+      } else if (name == Slice("readrandomfast")) {
+        method = &Benchmark::ReadRandomFast;
       } else if (name == Slice("multireadrandom")) {
+        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
+                entries_per_batch_);
         method = &Benchmark::MultiReadRandom;
       } else if (name == Slice("readmissing")) {
         ++key_size_;
@@ -1201,6 +1636,9 @@ class Benchmark {
       } else if (name == Slice("seekrandomwhilewriting")) {
         num_threads++;  // Add extra thread for writing
         method = &Benchmark::SeekRandomWhileWriting;
+      } else if (name == Slice("seekrandomwhilemerging")) {
+        num_threads++;  // Add extra thread for merging
+        method = &Benchmark::SeekRandomWhileMerging;
       } else if (name == Slice("readrandomsmall")) {
         reads_ /= 1000;
         method = &Benchmark::ReadRandom;
@@ -1211,6 +1649,9 @@ class Benchmark {
       } else if (name == Slice("readwhilewriting")) {
         num_threads++;  // Add extra thread for writing
         method = &Benchmark::ReadWhileWriting;
+      } else if (name == Slice("readwhilemerging")) {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::ReadWhileMerging;
       } else if (name == Slice("readrandomwriterandom")) {
         method = &Benchmark::ReadRandomWriteRandom;
       } else if (name == Slice("readrandommergerandom")) {
@@ -1233,6 +1674,8 @@ class Benchmark {
         method = &Benchmark::MergeRandom;
       } else if (name == Slice("randomwithverify")) {
         method = &Benchmark::RandomWithVerify;
+      } else if (name == Slice("fillseekseq")) {
+        method = &Benchmark::WriteSeqSeekSeq;
       } else if (name == Slice("compact")) {
         method = &Benchmark::Compact;
       } else if (name == Slice("crc32c")) {
@@ -1264,18 +1707,21 @@ class Benchmark {
                   name.ToString().c_str());
           method = nullptr;
         } else {
-          if (db_ != nullptr) {
-            delete db_;
-            db_ = nullptr;
-            DestroyDB(FLAGS_db, Options());
+          if (db_.db != nullptr) {
+            std::for_each(db_.cfh.begin(), db_.cfh.end(),
+                          [](ColumnFamilyHandle* cfh) { delete cfh; });
+            delete db_.db;
+            db_.db = nullptr;
+            db_.cfh.clear();
+            DestroyDB(FLAGS_db, open_options_);
           }
           for (size_t i = 0; i < multi_dbs_.size(); i++) {
-            delete multi_dbs_[i];
-            DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options());
+            delete multi_dbs_[i].db;
+            DestroyDB(GetDbNameForMultiple(FLAGS_db, i), open_options_);
           }
           multi_dbs_.clear();
         }
-        Open();
+        Open(&open_options_);  // use open_options for the last accessed
       }
 
       if (method != nullptr) {
@@ -1289,6 +1735,8 @@ class Benchmark {
   }
 
  private:
+  std::unique_ptr<Env> flashcache_aware_env_;
+
   struct ThreadArg {
     Benchmark* bm;
     SharedState* shared;
@@ -1334,7 +1782,25 @@ class Benchmark {
     shared.start = false;
 
     ThreadArg* arg = new ThreadArg[n];
+
     for (int i = 0; i < n; i++) {
+#ifdef NUMA
+      if (FLAGS_enable_numa) {
+        // Performs a local allocation of memory to threads in numa node.
+        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
+        numa_exit_on_error = 1;
+        int numa_node = i % n_nodes;
+        bitmask* nodes = numa_allocate_nodemask();
+        numa_bitmask_clearall(nodes);
+        numa_bitmask_setbit(nodes, numa_node);
+        // numa_bind() call binds the process to the node and these
+        // properties are passed on to the thread that is created in
+        // StartThread method called later in the loop.
+        numa_bind(nodes);
+        numa_set_strict(1);
+        numa_free_nodemask(nodes);
+      }
+#endif
       arg[i].bm = this;
       arg[i].method = method;
       arg[i].shared = &shared;
@@ -1377,7 +1843,7 @@ class Benchmark {
     uint32_t crc = 0;
     while (bytes < 500 * 1048576) {
       crc = crc32c::Value(data.data(), size);
-      thread->stats.FinishedSingleOp(nullptr);
+      thread->stats.FinishedOps(nullptr, nullptr, 1);
       bytes += size;
     }
     // Print so result is not dead
@@ -1396,7 +1862,7 @@ class Benchmark {
     unsigned int xxh32 = 0;
     while (bytes < 500 * 1048576) {
       xxh32 = XXH32(data.data(), size, 0);
-      thread->stats.FinishedSingleOp(nullptr);
+      thread->stats.FinishedOps(nullptr, nullptr, 1);
       bytes += size;
     }
     // Print so result is not dead
@@ -1408,23 +1874,23 @@ class Benchmark {
 
   void AcquireLoad(ThreadState* thread) {
     int dummy;
-    port::AtomicPointer ap(&dummy);
+    std::atomic<void*> ap(&dummy);
     int count = 0;
     void *ptr = nullptr;
     thread->stats.AddMessage("(each op is 1000 loads)");
     while (count < 100000) {
       for (int i = 0; i < 1000; i++) {
-        ptr = ap.Acquire_Load();
+        ptr = ap.load(std::memory_order_acquire);
       }
       count++;
-      thread->stats.FinishedSingleOp(nullptr);
+      thread->stats.FinishedOps(nullptr, nullptr, 1);
     }
     if (ptr == nullptr) exit(1); // Disable unused variable warning.
   }
 
   void Compress(ThreadState *thread) {
     RandomGenerator gen;
-    Slice input = gen.Generate(Options().block_size);
+    Slice input = gen.Generate(FLAGS_block_size);
     int64_t bytes = 0;
     int64_t produced = 0;
     bool ok = true;
@@ -1434,31 +1900,31 @@ class Benchmark {
     while (ok && bytes < int64_t(1) << 30) {
       switch (FLAGS_compression_type_e) {
       case rocksdb::kSnappyCompression:
-        ok = port::Snappy_Compress(Options().compression_opts, input.data(),
-                                   input.size(), &compressed);
+        ok = Snappy_Compress(Options().compression_opts, input.data(),
+                             input.size(), &compressed);
         break;
       case rocksdb::kZlibCompression:
-        ok = port::Zlib_Compress(Options().compression_opts, input.data(),
-                                 input.size(), &compressed);
+        ok = Zlib_Compress(Options().compression_opts, 2, input.data(),
+                           input.size(), &compressed);
         break;
       case rocksdb::kBZip2Compression:
-        ok = port::BZip2_Compress(Options().compression_opts, input.data(),
-                                  input.size(), &compressed);
+        ok = BZip2_Compress(Options().compression_opts, 2, input.data(),
+                            input.size(), &compressed);
         break;
       case rocksdb::kLZ4Compression:
-        ok = port::LZ4_Compress(Options().compression_opts, input.data(),
-                                input.size(), &compressed);
+        ok = LZ4_Compress(Options().compression_opts, 2, input.data(),
+                          input.size(), &compressed);
         break;
       case rocksdb::kLZ4HCCompression:
-        ok = port::LZ4HC_Compress(Options().compression_opts, input.data(),
-                                  input.size(), &compressed);
+        ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(),
+                            input.size(), &compressed);
         break;
       default:
         ok = false;
       }
       produced += compressed.size();
       bytes += input.size();
-      thread->stats.FinishedSingleOp(nullptr);
+      thread->stats.FinishedOps(nullptr, nullptr, 1);
     }
 
     if (!ok) {
@@ -1474,30 +1940,30 @@ class Benchmark {
 
   void Uncompress(ThreadState *thread) {
     RandomGenerator gen;
-    Slice input = gen.Generate(Options().block_size);
+    Slice input = gen.Generate(FLAGS_block_size);
     std::string compressed;
 
     bool ok;
     switch (FLAGS_compression_type_e) {
     case rocksdb::kSnappyCompression:
-      ok = port::Snappy_Compress(Options().compression_opts, input.data(),
-                                 input.size(), &compressed);
+      ok = Snappy_Compress(Options().compression_opts, input.data(),
+                           input.size(), &compressed);
       break;
     case rocksdb::kZlibCompression:
-      ok = port::Zlib_Compress(Options().compression_opts, input.data(),
-                               input.size(), &compressed);
+      ok = Zlib_Compress(Options().compression_opts, 2, input.data(),
+                         input.size(), &compressed);
       break;
     case rocksdb::kBZip2Compression:
-      ok = port::BZip2_Compress(Options().compression_opts, input.data(),
-                                input.size(), &compressed);
+      ok = BZip2_Compress(Options().compression_opts, 2, input.data(),
+                          input.size(), &compressed);
       break;
     case rocksdb::kLZ4Compression:
-      ok = port::LZ4_Compress(Options().compression_opts, input.data(),
-                              input.size(), &compressed);
+      ok = LZ4_Compress(Options().compression_opts, 2, input.data(),
+                        input.size(), &compressed);
       break;
     case rocksdb::kLZ4HCCompression:
-      ok = port::LZ4HC_Compress(Options().compression_opts, input.data(),
-                                input.size(), &compressed);
+      ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(),
+                          input.size(), &compressed);
       break;
     default:
       ok = false;
@@ -1511,27 +1977,27 @@ class Benchmark {
       case rocksdb::kSnappyCompression:
         // allocate here to make comparison fair
         uncompressed = new char[input.size()];
-        ok = port::Snappy_Uncompress(compressed.data(), compressed.size(),
-                                     uncompressed);
+        ok = Snappy_Uncompress(compressed.data(), compressed.size(),
+                               uncompressed);
         break;
       case rocksdb::kZlibCompression:
-        uncompressed = port::Zlib_Uncompress(
-            compressed.data(), compressed.size(), &decompress_size);
+        uncompressed = Zlib_Uncompress(compressed.data(), compressed.size(),
+                                       &decompress_size, 2);
         ok = uncompressed != nullptr;
         break;
       case rocksdb::kBZip2Compression:
-        uncompressed = port::BZip2_Uncompress(
-            compressed.data(), compressed.size(), &decompress_size);
+        uncompressed = BZip2_Uncompress(compressed.data(), compressed.size(),
+                                        &decompress_size, 2);
         ok = uncompressed != nullptr;
         break;
       case rocksdb::kLZ4Compression:
-        uncompressed = port::LZ4_Uncompress(
-            compressed.data(), compressed.size(), &decompress_size);
+        uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(),
+                                      &decompress_size, 2);
         ok = uncompressed != nullptr;
         break;
       case rocksdb::kLZ4HCCompression:
-        uncompressed = port::LZ4_Uncompress(
-            compressed.data(), compressed.size(), &decompress_size);
+        uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(),
+                                      &decompress_size, 2);
         ok = uncompressed != nullptr;
         break;
       default:
@@ -1539,7 +2005,7 @@ class Benchmark {
       }
       delete[] uncompressed;
       bytes += input.size();
-      thread->stats.FinishedSingleOp(nullptr);
+      thread->stats.FinishedOps(nullptr, nullptr, 1);
     }
 
     if (!ok) {
@@ -1549,15 +2015,14 @@ class Benchmark {
     }
   }
 
-  void Open() {
-    assert(db_ == nullptr);
-    Options options;
+  void Open(Options* opts) {
+    Options& options = *opts;
+
+    assert(db_.db == nullptr);
+
     options.create_if_missing = !FLAGS_use_existing_db;
-    options.block_cache = cache_;
-    options.block_cache_compressed = compressed_cache_;
-    if (cache_ == nullptr) {
-      options.no_block_cache = true;
-    }
+    options.create_missing_column_families = FLAGS_num_column_families > 1;
+    options.db_write_buffer_size = FLAGS_db_write_buffer_size;
     options.write_buffer_size = FLAGS_write_buffer_size;
     options.max_write_buffer_number = FLAGS_max_write_buffer_number;
     options.min_write_buffer_number_to_merge =
@@ -1565,17 +2030,44 @@ class Benchmark {
     options.max_background_compactions = FLAGS_max_background_compactions;
     options.max_background_flushes = FLAGS_max_background_flushes;
     options.compaction_style = FLAGS_compaction_style_e;
-    options.block_size = FLAGS_block_size;
-    options.filter_policy = filter_policy_;
-    if (FLAGS_use_plain_table) {
+    if (FLAGS_prefix_size != 0) {
       options.prefix_extractor.reset(
           NewFixedPrefixTransform(FLAGS_prefix_size));
     }
+    if (FLAGS_use_uint64_comparator) {
+      options.comparator = test::Uint64Comparator();
+      if (FLAGS_key_size != 8) {
+        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
+        exit(1);
+      }
+    }
     options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits;
     options.bloom_locality = FLAGS_bloom_locality;
     options.max_open_files = FLAGS_open_files;
     options.statistics = dbstats;
-    options.env = FLAGS_env;
+    if (FLAGS_enable_io_prio) {
+      FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
+      FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
+    }
+    if (FLAGS_disable_flashcache_for_background_threads &&
+        cachedev_fd_ == -1) {
+      // Avoid creating the env twice when an use_existing_db is true
+      cachedev_fd_ = open(FLAGS_flashcache_dev.c_str(), O_RDONLY);
+      if (cachedev_fd_ < 0) {
+        fprintf(stderr, "Open flash device failed\n");
+        exit(1);
+      }
+      flashcache_aware_env_ =
+          std::move(NewFlashcacheAwareEnv(FLAGS_env, cachedev_fd_));
+      if (flashcache_aware_env_.get() == nullptr) {
+        fprintf(stderr, "Failed to open flashcahce device at %s\n",
+                FLAGS_flashcache_dev.c_str());
+        std::abort();
+      }
+      options.env = flashcache_aware_env_.get();
+    } else {
+      options.env = FLAGS_env;
+    }
     options.disableDataSync = FLAGS_disable_data_sync;
     options.use_fsync = FLAGS_use_fsync;
     options.wal_dir = FLAGS_wal_dir;
@@ -1583,6 +2075,8 @@ class Benchmark {
     options.target_file_size_base = FLAGS_target_file_size_base;
     options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
     options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options.level_compaction_dynamic_level_bytes =
+        FLAGS_level_compaction_dynamic_level_bytes;
     options.max_bytes_for_level_multiplier =
         FLAGS_max_bytes_for_level_multiplier;
     options.filter_deletes = FLAGS_filter_deletes;
@@ -1593,12 +2087,14 @@ class Benchmark {
       exit(1);
     }
     switch (FLAGS_rep_factory) {
-      case kPrefixHash:
-        options.memtable_factory.reset(NewHashSkipListRepFactory(
-            FLAGS_hash_bucket_count));
-        break;
       case kSkipList:
-        // no need to do anything
+        options.memtable_factory.reset(new SkipListFactory(
+            FLAGS_skip_list_lookahead));
+        break;
+#ifndef ROCKSDB_LITE
+      case kPrefixHash:
+        options.memtable_factory.reset(
+            NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
         break;
       case kHashLinkedList:
         options.memtable_factory.reset(NewHashLinkListRepFactory(
@@ -1613,8 +2109,14 @@ class Benchmark {
         options.memtable_factory.reset(NewHashCuckooRepFactory(
             options.write_buffer_size, FLAGS_key_size + FLAGS_value_size));
         break;
+#else
+      default:
+        fprintf(stderr, "Only skip list is supported in lite mode\n");
+        exit(1);
+#endif  // ROCKSDB_LITE
     }
     if (FLAGS_use_plain_table) {
+#ifndef ROCKSDB_LITE
       if (FLAGS_rep_factory != kPrefixHash &&
           FLAGS_rep_factory != kHashLinkedList) {
         fprintf(stderr, "Waring: plain table is used with skipList\n");
@@ -1628,8 +2130,55 @@ class Benchmark {
       if (bloom_bits_per_key < 0) {
         bloom_bits_per_key = 0;
       }
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = FLAGS_key_size;
+      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
+      plain_table_options.hash_table_ratio = 0.75;
       options.table_factory = std::shared_ptr<TableFactory>(
-          NewPlainTableFactory(FLAGS_key_size, bloom_bits_per_key, 0.75));
+          NewPlainTableFactory(plain_table_options));
+#else
+      fprintf(stderr, "Plain table is not supported in lite mode\n");
+      exit(1);
+#endif  // ROCKSDB_LITE
+    } else if (FLAGS_use_cuckoo_table) {
+#ifndef ROCKSDB_LITE
+      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
+        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
+        exit(1);
+      }
+      rocksdb::CuckooTableOptions table_options;
+      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
+      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
+      options.table_factory = std::shared_ptr<TableFactory>(
+          NewCuckooTableFactory(table_options));
+#else
+      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
+      exit(1);
+#endif  // ROCKSDB_LITE
+    } else {
+      BlockBasedTableOptions block_based_options;
+      if (FLAGS_use_hash_search) {
+        if (FLAGS_prefix_size == 0) {
+          fprintf(stderr,
+              "prefix_size not assigned when enable use_hash_search \n");
+          exit(1);
+        }
+        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
+      } else {
+        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
+      }
+      if (cache_ == nullptr) {
+        block_based_options.no_block_cache = true;
+      }
+      block_based_options.block_cache = cache_;
+      block_based_options.block_cache_compressed = compressed_cache_;
+      block_based_options.block_size = FLAGS_block_size;
+      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
+      block_based_options.filter_policy = filter_policy_;
+      block_based_options.format_version = 2;
+      options.table_factory.reset(
+          NewBlockBasedTableFactory(block_based_options));
     }
     if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
       if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
@@ -1650,6 +2199,8 @@ class Benchmark {
     options.compression_opts.level = FLAGS_compression_level;
     options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
     options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+    options.max_total_wal_size = FLAGS_max_total_wal_size;
+
     if (FLAGS_min_level_to_compress >= 0) {
       assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
       options.compression_per_level.resize(FLAGS_num_levels);
@@ -1661,9 +2212,6 @@ class Benchmark {
         options.compression_per_level[i] = FLAGS_compression_type_e;
       }
     }
-    options.disable_seek_compaction = FLAGS_disable_seek_compaction;
-    options.delete_obsolete_files_period_micros =
-      FLAGS_delete_obsolete_files_period_micros;
     options.soft_rate_limit = FLAGS_soft_rate_limit;
     options.hard_rate_limit = FLAGS_hard_rate_limit;
     options.rate_limit_delay_max_milliseconds =
@@ -1682,6 +2230,7 @@ class Benchmark {
     options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
     options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
     options.bytes_per_sync = FLAGS_bytes_per_sync;
+    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
 
     // merge operator options
     options.merge_operator = MergeOperators::CreateFromStringId(
@@ -1714,15 +2263,21 @@ class Benchmark {
       options.compaction_options_universal.compression_size_percent =
         FLAGS_universal_compression_size_percent;
     }
+    if (FLAGS_thread_status_per_interval > 0) {
+      options.enable_thread_tracking = true;
+    }
+    if (FLAGS_rate_limiter_bytes_per_sec > 0) {
+      options.rate_limiter.reset(
+          NewGenericRateLimiter(FLAGS_rate_limiter_bytes_per_sec));
+    }
 
     if (FLAGS_num_multi_db <= 1) {
       OpenDb(options, FLAGS_db, &db_);
     } else {
       multi_dbs_.clear();
+      multi_dbs_.resize(FLAGS_num_multi_db);
       for (int i = 0; i < FLAGS_num_multi_db; i++) {
-        DB* db;
-        OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &db);
-        multi_dbs_.push_back(db);
+        OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &multi_dbs_[i]);
       }
     }
     if (FLAGS_min_level_to_compress >= 0) {
@@ -1730,12 +2285,37 @@ class Benchmark {
     }
   }
 
-  void OpenDb(Options options, std::string db_name, DB** db) {
+  void OpenDb(const Options& options, const std::string& db_name,
+      DBWithColumnFamilies* db) {
     Status s;
-    if(FLAGS_readonly) {
-      s = DB::OpenForReadOnly(options, db_name, db);
+    // Open with column families if necessary.
+    if (FLAGS_num_column_families > 1) {
+      size_t num_hot = FLAGS_num_column_families;
+      if (FLAGS_num_hot_column_families > 0 &&
+          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
+        num_hot = FLAGS_num_hot_column_families;
+      } else {
+        FLAGS_num_hot_column_families = FLAGS_num_column_families;
+      }
+      std::vector<ColumnFamilyDescriptor> column_families;
+      for (size_t i = 0; i < num_hot; i++) {
+        column_families.push_back(ColumnFamilyDescriptor(
+              ColumnFamilyName(i), ColumnFamilyOptions(options)));
+      }
+      if (FLAGS_readonly) {
+        s = DB::OpenForReadOnly(options, db_name, column_families,
+            &db->cfh, &db->db);
+      } else {
+        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
+      }
+      db->cfh.resize(FLAGS_num_column_families);
+      db->num_created = num_hot;
+      db->num_hot = num_hot;
+
+    } else if (FLAGS_readonly) {
+      s = DB::OpenForReadOnly(options, db_name, &db->db);
     } else {
-      s = DB::Open(options, db_name, db);
+      s = DB::Open(options, db_name, &db->db);
     }
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@@ -1776,8 +2356,9 @@ class Benchmark {
         for (uint64_t i = 0; i < num_; ++i) {
           values_[i] = i;
         }
-        std::shuffle(values_.begin(), values_.end(),
-            std::default_random_engine(FLAGS_seed));
+        std::shuffle(
+            values_.begin(), values_.end(),
+            std::default_random_engine(static_cast<unsigned int>(FLAGS_seed)));
       }
     }
 
@@ -1803,10 +2384,18 @@ class Benchmark {
   };
 
   DB* SelectDB(ThreadState* thread) {
-    if (db_ != nullptr) {
-      return db_;
-    } else {
-      return multi_dbs_[thread->rand.Next() % multi_dbs_.size()];
+    return SelectDBWithCfh(thread)->db;
+  }
+
+  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
+    return SelectDBWithCfh(thread->rand.Next());
+  }
+
+  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
+    if (db_.db != nullptr) {
+      return &db_;
+    } else  {
+      return &multi_dbs_[rand_int % multi_dbs_.size()];
     }
   }
 
@@ -1815,13 +2404,22 @@ class Benchmark {
     const int64_t num_ops = writes_ == 0 ? num_ : writes_;
 
     size_t num_key_gens = 1;
-    if (db_ == nullptr) {
+    if (db_.db == nullptr) {
       num_key_gens = multi_dbs_.size();
     }
     std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
-    Duration duration(test_duration, num_ops * num_key_gens);
+    int64_t max_ops = num_ops * num_key_gens;
+    int64_t ops_per_stage = max_ops;
+    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
+      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
+                                       FLAGS_num_hot_column_families) +
+                      1;
+    }
+
+    Duration duration(test_duration, max_ops, ops_per_stage);
     for (size_t i = 0; i < num_key_gens; i++) {
-      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_ops));
+      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_,
+                                         ops_per_stage));
     }
 
     if (num_ != FLAGS_num) {
@@ -1835,23 +2433,40 @@ class Benchmark {
     Status s;
     int64_t bytes = 0;
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    int64_t stage = 0;
     while (!duration.Done(entries_per_batch_)) {
-      size_t id = 0;
-      DB* db_to_write = db_;
-      if (db_to_write == nullptr) {
-        id = thread->rand.Next() % num_key_gens;
-        db_to_write = multi_dbs_[id];
+      if (duration.GetStage() != stage) {
+        stage = duration.GetStage();
+        if (db_.db != nullptr) {
+          db_.CreateNewCf(open_options_, stage);
+        } else {
+          for (auto& db : multi_dbs_) {
+            db.CreateNewCf(open_options_, stage);
+          }
+        }
       }
+      size_t id = thread->rand.Next() % num_key_gens;
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
       batch.Clear();
       for (int64_t j = 0; j < entries_per_batch_; j++) {
-        GenerateKeyFromInt(key_gens[id]->Next(), FLAGS_num, &key);
-        batch.Put(key, gen.Generate(value_size_));
+        int64_t rand_num = key_gens[id]->Next();
+        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
+        if (FLAGS_num_column_families <= 1) {
+          batch.Put(key, gen.Generate(value_size_));
+        } else {
+          // We use same rand_num as seed for key and column family so that we
+          // can deterministically find the cfh corresponding to a particular
+          // key while reading the key.
+          batch.Put(db_with_cfh->GetCfh(rand_num), key,
+                    gen.Generate(value_size_));
+        }
         bytes += value_size_ + key_size_;
-        thread->stats.FinishedSingleOp(db_to_write);
       }
-      s = db_to_write->Write(write_options_, &batch);
+      s = db_with_cfh->db->Write(write_options_, &batch);
+      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
+                                entries_per_batch_);
       if (!s.ok()) {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
         exit(1);
@@ -1861,22 +2476,25 @@ class Benchmark {
   }
 
   void ReadSequential(ThreadState* thread) {
-    if (db_ != nullptr) {
-      ReadSequential(thread, db_);
+    if (db_.db != nullptr) {
+      ReadSequential(thread, db_.db);
     } else {
-      for (DB* db : multi_dbs_) {
-        ReadSequential(thread, db);
+      for (const auto& db_with_cfh : multi_dbs_) {
+        ReadSequential(thread, db_with_cfh.db);
       }
     }
   }
 
   void ReadSequential(ThreadState* thread, DB* db) {
-    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
+    ReadOptions options(FLAGS_verify_checksum, true);
+    options.tailing = FLAGS_use_tailing_iterator;
+
+    Iterator* iter = db->NewIterator(options);
     int64_t i = 0;
     int64_t bytes = 0;
     for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
       bytes += iter->key().size() + iter->value().size();
-      thread->stats.FinishedSingleOp(db);
+      thread->stats.FinishedOps(nullptr, db, 1);
       ++i;
     }
     delete iter;
@@ -1884,11 +2502,11 @@ class Benchmark {
   }
 
   void ReadReverse(ThreadState* thread) {
-    if (db_ != nullptr) {
-      ReadReverse(thread, db_);
+    if (db_.db != nullptr) {
+      ReadReverse(thread, db_.db);
     } else {
-      for (DB* db : multi_dbs_) {
-        ReadReverse(thread, db);
+      for (const auto& db_with_cfh : multi_dbs_) {
+        ReadReverse(thread, db_with_cfh.db);
       }
     }
   }
@@ -1899,36 +2517,122 @@ class Benchmark {
     int64_t bytes = 0;
     for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
       bytes += iter->key().size() + iter->value().size();
-      thread->stats.FinishedSingleOp(db_);
+      thread->stats.FinishedOps(nullptr, db, 1);
       ++i;
     }
     delete iter;
     thread->stats.AddBytes(bytes);
   }
 
+  void ReadRandomFast(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    int64_t nonexist = 0;
+    ReadOptions options(FLAGS_verify_checksum, true);
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::string value;
+    DB* db = SelectDBWithCfh(thread)->db;
+
+    int64_t pot = 1;
+    while (pot < FLAGS_num) {
+      pot <<= 1;
+    }
+
+    Duration duration(FLAGS_duration, reads_);
+    do {
+      for (int i = 0; i < 100; ++i) {
+        int64_t key_rand = thread->rand.Next() & (pot - 1);
+        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
+        ++read;
+        auto status = db->Get(options, key, &value);
+        if (status.ok()) {
+          ++found;
+        } else if (!status.IsNotFound()) {
+          fprintf(stderr, "Get returned an error: %s\n",
+                  status.ToString().c_str());
+          abort();
+        }
+        if (key_rand >= FLAGS_num) {
+          ++nonexist;
+        }
+      }
+      thread->stats.FinishedOps(nullptr, db, 100);
+    } while (!duration.Done(100));
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, "
+             "issued %" PRIu64 " non-exist keys)\n",
+             found, read, nonexist);
+
+    thread->stats.AddMessage(msg);
+
+    if (FLAGS_perf_level > 0) {
+      thread->stats.AddMessage(perf_context.ToString());
+    }
+  }
+
+  int64_t GetRandomKey(Random64* rand) {
+    uint64_t rand_int = rand->Next();
+    int64_t key_rand;
+    if (read_random_exp_range_ == 0) {
+      key_rand = rand_int % FLAGS_num;
+    } else {
+      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
+      long double order = -static_cast<long double>(rand_int % kBigInt) /
+                          static_cast<long double>(kBigInt) *
+                          read_random_exp_range_;
+      long double exp_ran = std::exp(order);
+      uint64_t rand_num =
+          static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
+      // Map to a different number to avoid locality.
+      const uint64_t kBigPrime = 0x5bd1e995;
+      // Overflow is like %(2^64). Will have little impact of results.
+      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
+    }
+    return key_rand;
+  }
+
   void ReadRandom(ThreadState* thread) {
     int64_t read = 0;
     int64_t found = 0;
+    int64_t bytes = 0;
     ReadOptions options(FLAGS_verify_checksum, true);
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
     std::string value;
 
     Duration duration(FLAGS_duration, reads_);
     while (!duration.Done(1)) {
-      DB* db = SelectDB(thread);
-      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+      // We use same key_rand as seed for key and column family so that we can
+      // deterministically find the cfh corresponding to a particular key, as it
+      // is done in DoWrite method.
+      int64_t key_rand = GetRandomKey(&thread->rand);
+      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
       read++;
-      if (db->Get(options, key, &value).ok()) {
+      Status s;
+      if (FLAGS_num_column_families > 1) {
+        s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
+                                 &value);
+      } else {
+        s = db_with_cfh->db->Get(options, key, &value);
+      }
+      if (s.ok()) {
         found++;
+        bytes += key.size() + value.size();
+      } else if (!s.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
+        abort();
       }
-      thread->stats.FinishedSingleOp(db_);
+      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1);
     }
 
     char msg[100];
-    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
              found, read);
 
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
 
     if (FLAGS_perf_level > 0) {
@@ -1943,17 +2647,18 @@ class Benchmark {
     int64_t found = 0;
     ReadOptions options(FLAGS_verify_checksum, true);
     std::vector<Slice> keys;
+    std::vector<std::unique_ptr<const char[]> > key_guards;
     std::vector<std::string> values(entries_per_batch_);
     while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
-      keys.push_back(AllocateKey());
+      key_guards.push_back(std::move(std::unique_ptr<const char[]>()));
+      keys.push_back(AllocateKey(&key_guards.back()));
     }
 
     Duration duration(FLAGS_duration, reads_);
     while (!duration.Done(1)) {
       DB* db = SelectDB(thread);
       for (int64_t i = 0; i < entries_per_batch_; ++i) {
-        GenerateKeyFromInt(thread->rand.Next() % FLAGS_num,
-            FLAGS_num, &keys[i]);
+        GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
       }
       std::vector<Status> statuses = db->MultiGet(options, keys, &values);
       assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
@@ -1962,11 +2667,13 @@ class Benchmark {
       for (int64_t i = 0; i < entries_per_batch_; ++i) {
         if (statuses[i].ok()) {
           ++found;
+        } else if (!statuses[i].IsNotFound()) {
+          fprintf(stderr, "MultiGet returned an error: %s\n",
+                  statuses[i].ToString().c_str());
+          abort();
         }
       }
-    }
-    for (auto& k : keys) {
-      delete k.data();
+      thread->stats.FinishedOps(nullptr, db, entries_per_batch_);
     }
 
     char msg[100];
@@ -1982,7 +2689,7 @@ class Benchmark {
       DB* db = SelectDB(thread);
       Iterator* iter = db->NewIterator(options);
       delete iter;
-      thread->stats.FinishedSingleOp(db);
+      thread->stats.FinishedOps(nullptr, db, 1);
     }
   }
 
@@ -1990,31 +2697,47 @@ class Benchmark {
     if (thread->tid > 0) {
       IteratorCreation(thread);
     } else {
-      BGWriter(thread);
+      BGWriter(thread, kPut);
     }
   }
 
   void SeekRandom(ThreadState* thread) {
     int64_t read = 0;
     int64_t found = 0;
+    int64_t bytes = 0;
     ReadOptions options(FLAGS_verify_checksum, true);
     options.tailing = FLAGS_use_tailing_iterator;
 
     Iterator* single_iter = nullptr;
     std::vector<Iterator*> multi_iters;
-    if (db_ != nullptr) {
-      single_iter = db_->NewIterator(options);
+    if (db_.db != nullptr) {
+      single_iter = db_.db->NewIterator(options);
     } else {
-      for (DB* db : multi_dbs_) {
-        multi_iters.push_back(db->NewIterator(options));
+      for (const auto& db_with_cfh : multi_dbs_) {
+        multi_iters.push_back(db_with_cfh.db->NewIterator(options));
       }
     }
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
 
     Duration duration(FLAGS_duration, reads_);
+    char value_buffer[256];
     while (!duration.Done(1)) {
+      if (!FLAGS_use_tailing_iterator) {
+        if (db_.db != nullptr) {
+          delete single_iter;
+          single_iter = db_.db->NewIterator(options);
+        } else {
+          for (auto iter : multi_iters) {
+            delete iter;
+          }
+          multi_iters.clear();
+          for (const auto& db_with_cfh : multi_dbs_) {
+            multi_iters.push_back(db_with_cfh.db->NewIterator(options));
+          }
+        }
+      }
       // Pick a Iterator to use
       Iterator* iter_to_use = single_iter;
       if (single_iter == nullptr) {
@@ -2027,7 +2750,23 @@ class Benchmark {
       if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
         found++;
       }
-      thread->stats.FinishedSingleOp(db_);
+
+      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
+        // Copy out iterator's value to make sure we read them.
+        Slice value = iter_to_use->value();
+        memcpy(value_buffer, value.data(),
+               std::min(value.size(), sizeof(value_buffer)));
+        bytes += iter_to_use->key().size() + iter_to_use->value().size();
+
+        if (!FLAGS_reverse_iterator) {
+          iter_to_use->Next();
+        } else {
+          iter_to_use->Prev();
+        }
+        assert(iter_to_use->status().ok());
+      }
+
+      thread->stats.FinishedOps(&db_, db_.db, 1);
     }
     delete single_iter;
     for (auto iter : multi_iters) {
@@ -2035,16 +2774,28 @@ class Benchmark {
     }
 
     char msg[100];
-    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
              found, read);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
+    if (FLAGS_perf_level > 0) {
+      thread->stats.AddMessage(perf_context.ToString());
+    }
   }
 
   void SeekRandomWhileWriting(ThreadState* thread) {
     if (thread->tid > 0) {
       SeekRandom(thread);
     } else {
-      BGWriter(thread);
+      BGWriter(thread, kPut);
+    }
+  }
+
+  void SeekRandomWhileMerging(ThreadState* thread) {
+    if (thread->tid > 0) {
+      SeekRandom(thread);
+    } else {
+      BGWriter(thread, kMerge);
     }
   }
 
@@ -2052,8 +2803,8 @@ class Benchmark {
     WriteBatch batch;
     Duration duration(seq ? 0 : FLAGS_duration, num_);
     int64_t i = 0;
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
 
     while (!duration.Done(entries_per_batch_)) {
       DB* db = SelectDB(thread);
@@ -2062,9 +2813,9 @@ class Benchmark {
         const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
         GenerateKeyFromInt(k, FLAGS_num, &key);
         batch.Delete(key);
-        thread->stats.FinishedSingleOp(db);
       }
       auto s = db->Write(write_options_, &batch);
+      thread->stats.FinishedOps(nullptr, db, entries_per_batch_);
       if (!s.ok()) {
         fprintf(stderr, "del error: %s\n", s.ToString().c_str());
         exit(1);
@@ -2085,16 +2836,25 @@ class Benchmark {
     if (thread->tid > 0) {
       ReadRandom(thread);
     } else {
-      BGWriter(thread);
+      BGWriter(thread, kPut);
+    }
+  }
+
+  void ReadWhileMerging(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      BGWriter(thread, kMerge);
     }
   }
 
-  void BGWriter(ThreadState* thread) {
+  void BGWriter(ThreadState* thread, enum PutOrMerge write_merge) {
     // Special thread that keeps writing until other threads are done.
     RandomGenerator gen;
     double last = FLAGS_env->NowMicros();
     int writes_per_second_by_10 = 0;
     int num_writes = 0;
+    int64_t bytes = 0;
 
     // --writes_per_second rate limit is enforced per 100 milliseconds
     // intervals to avoid a burst of writes at the start of each second.
@@ -2105,8 +2865,8 @@ class Benchmark {
     // Don't merge stats from this thread with the readers.
     thread->stats.SetExcludeFromMerge();
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
 
     while (true) {
       DB* db = SelectDB(thread);
@@ -2119,12 +2879,20 @@ class Benchmark {
       }
 
       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
-      Status s = db->Put(write_options_, key, gen.Generate(value_size_));
+      Status s;
+
+      if (write_merge == kPut) {
+          s = db->Put(write_options_, key, gen.Generate(value_size_));
+      } else {
+          s = db->Merge(write_options_, key, gen.Generate(value_size_));
+      }
+
       if (!s.ok()) {
-        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
         exit(1);
       }
-      thread->stats.FinishedSingleOp(db_);
+      bytes += key.size() + value_size_;
+      thread->stats.FinishedOps(&db_, db_.db, 1);
 
       ++num_writes;
       if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) {
@@ -2140,6 +2908,7 @@ class Benchmark {
         }
       }
     }
+    thread->stats.AddBytes(bytes);
   }
 
   // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
@@ -2238,8 +3007,8 @@ class Benchmark {
     int64_t puts_done = 0;
     int64_t deletes_done = 0;
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
 
     // the number of iterations is the larger of read_ or write_
     for (int64_t i = 0; i < readwrites_; i++) {
@@ -2284,7 +3053,7 @@ class Benchmark {
         deletes_done++;
       }
 
-      thread->stats.FinishedSingleOp(db_);
+      thread->stats.FinishedOps(&db_, db_.db, 1);
     }
     char msg[100];
     snprintf(msg, sizeof(msg),
@@ -2307,8 +3076,8 @@ class Benchmark {
     int64_t writes_done = 0;
     Duration duration(FLAGS_duration, readwrites_);
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
 
     // the number of iterations is the larger of read_ or write_
     while (!duration.Done(1)) {
@@ -2342,7 +3111,7 @@ class Benchmark {
         put_weight--;
         writes_done++;
       }
-      thread->stats.FinishedSingleOp(db);
+      thread->stats.FinishedOps(nullptr, db, 1);
     }
     char msg[100];
     snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \
@@ -2358,17 +3127,24 @@ class Benchmark {
     RandomGenerator gen;
     std::string value;
     int64_t found = 0;
+    int64_t bytes = 0;
     Duration duration(FLAGS_duration, readwrites_);
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
     // the number of iterations is the larger of read_ or write_
     while (!duration.Done(1)) {
       DB* db = SelectDB(thread);
       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
 
-      if (db->Get(options, key, &value).ok()) {
-        found++;
+      auto status = db->Get(options, key, &value);
+      if (status.ok()) {
+        ++found;
+        bytes += key.size() + value.size();
+      } else if (!status.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n",
+                status.ToString().c_str());
+        abort();
       }
 
       Status s = db->Put(write_options_, key, gen.Generate(value_size_));
@@ -2376,11 +3152,13 @@ class Benchmark {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
         exit(1);
       }
-      thread->stats.FinishedSingleOp(db);
+      bytes += key.size() + value_size_;
+      thread->stats.FinishedOps(nullptr, db, 1);
     }
     char msg[100];
     snprintf(msg, sizeof(msg),
              "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
   }
 
@@ -2392,18 +3170,24 @@ class Benchmark {
     RandomGenerator gen;
     std::string value;
     int64_t found = 0;
+    int64_t bytes = 0;
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
     // The number of iterations is the larger of read_ or write_
     Duration duration(FLAGS_duration, readwrites_);
     while (!duration.Done(1)) {
       DB* db = SelectDB(thread);
       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
 
-      // Get the existing value
-      if (db->Get(options, key, &value).ok()) {
-        found++;
+      auto status = db->Get(options, key, &value);
+      if (status.ok()) {
+        ++found;
+        bytes += key.size() + value.size();
+      } else if (!status.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n",
+                status.ToString().c_str());
+        abort();
       } else {
         // If not existing, then just assume an empty string of data
         value.clear();
@@ -2423,12 +3207,14 @@ class Benchmark {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
         exit(1);
       }
-      thread->stats.FinishedSingleOp(db_);
+      bytes += key.size() + value.size();
+      thread->stats.FinishedOps(nullptr, db, 1);
     }
 
     char msg[100];
     snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
   }
 
@@ -2444,9 +3230,9 @@ class Benchmark {
   // FLAGS_merge_keys.
   void MergeRandom(ThreadState* thread) {
     RandomGenerator gen;
-
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    int64_t bytes = 0;
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
     // The number of iterations is the larger of read_ or write_
     Duration duration(FLAGS_duration, readwrites_);
     while (!duration.Done(1)) {
@@ -2459,12 +3245,14 @@ class Benchmark {
         fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
         exit(1);
       }
-      thread->stats.FinishedSingleOp(db_);
+      bytes += key.size() + value_size_;
+      thread->stats.FinishedOps(nullptr, db, 1);
     }
 
     // Print some statistics
     char msg[100];
     snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
   }
 
@@ -2484,8 +3272,8 @@ class Benchmark {
     int64_t num_merges = 0;
     size_t max_length = 0;
 
-    Slice key = AllocateKey();
-    std::unique_ptr<const char[]> key_guard(key.data());
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
     // the number of iterations is the larger of read_ or write_
     Duration duration(FLAGS_duration, readwrites_);
     while (!duration.Done(1)) {
@@ -2520,7 +3308,7 @@ class Benchmark {
 
       }
 
-      thread->stats.FinishedSingleOp(db_);
+      thread->stats.FinishedOps(nullptr, db, 1);
     }
 
     char msg[100];
@@ -2531,17 +3319,52 @@ class Benchmark {
     thread->stats.AddMessage(msg);
   }
 
+  void WriteSeqSeekSeq(ThreadState* thread) {
+    writes_ = FLAGS_num;
+    DoWrite(thread, SEQUENTIAL);
+    // exclude writes from the ops/sec calculation
+    thread->stats.Start(thread->tid);
+
+    DB* db = SelectDB(thread);
+    std::unique_ptr<Iterator> iter(
+      db->NewIterator(ReadOptions(FLAGS_verify_checksum, true)));
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    for (int64_t i = 0; i < FLAGS_num; ++i) {
+      GenerateKeyFromInt(i, FLAGS_num, &key);
+      iter->Seek(key);
+      assert(iter->Valid() && iter->key() == key);
+      thread->stats.FinishedOps(nullptr, db, 1);
+
+      for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
+        if (!FLAGS_reverse_iterator) {
+          iter->Next();
+        } else {
+          iter->Prev();
+        }
+        GenerateKeyFromInt(++i, FLAGS_num, &key);
+        assert(iter->Valid() && iter->key() == key);
+        thread->stats.FinishedOps(nullptr, db, 1);
+      }
+
+      iter->Seek(key);
+      assert(iter->Valid() && iter->key() == key);
+      thread->stats.FinishedOps(nullptr, db, 1);
+    }
+  }
+
   void Compact(ThreadState* thread) {
     DB* db = SelectDB(thread);
     db->CompactRange(nullptr, nullptr);
   }
 
   void PrintStats(const char* key) {
-    if (db_ != nullptr) {
-      PrintStats(db_, key, false);
+    if (db_.db != nullptr) {
+      PrintStats(db_.db, key, false);
     }
-    for (DB* db : multi_dbs_) {
-      PrintStats(db, key, true);
+    for (const auto& db_with_cfh : multi_dbs_) {
+      PrintStats(db_with_cfh.db, key, true);
     }
   }
 
@@ -2561,17 +3384,17 @@ class Benchmark {
 
 int main(int argc, char** argv) {
   rocksdb::port::InstallStackTraceHandler();
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
-                          " [OPTIONS]...");
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
 
   FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style;
   if (FLAGS_statistics) {
     dbstats = rocksdb::CreateDBStatistics();
   }
 
-  std::vector<std::string> fanout =
-    rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional, ',');
+  std::vector<std::string> fanout = rocksdb::StringSplit(
+      FLAGS_max_bytes_for_level_multiplier_additional, ',');
   for (unsigned int j= 0; j < fanout.size(); j++) {
     FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
       std::stoi(fanout[j]));
@@ -2602,6 +3425,9 @@ int main(int argc, char** argv) {
   // The number of background threads should be at least as much the
   // max number of concurrent compactions.
   FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
+  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_flushes,
+                                  rocksdb::Env::Priority::HIGH);
+
   // Choose a location for the test database if none given with --db=<path>
   if (FLAGS_db.empty()) {
     std::string default_db_path;
@@ -2610,7 +3436,15 @@ int main(int argc, char** argv) {
     FLAGS_db = default_db_path;
   }
 
+  if (FLAGS_stats_interval_seconds > 0) {
+    // When both are set then FLAGS_stats_interval determines the frequency
+    // at which the timer is checked for FLAGS_stats_interval_seconds
+    FLAGS_stats_interval = 1000;
+  }
+
   rocksdb::Benchmark benchmark;
   benchmark.Run();
   return 0;
 }
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
index 1e1ec97..c724303 100644
--- a/src/rocksdb/db/db_filesnapshot.cc
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -9,37 +9,48 @@
 
 #ifndef ROCKSDB_LITE
 
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <algorithm>
 #include <string>
 #include <stdint.h>
 #include "db/db_impl.h"
 #include "db/filename.h"
+#include "db/job_context.h"
 #include "db/version_set.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "port/port.h"
 #include "util/mutexlock.h"
 #include "util/sync_point.h"
+#include "util/file_util.h"
 
 namespace rocksdb {
 
 Status DBImpl::DisableFileDeletions() {
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
   ++disable_delete_obsolete_files_;
   if (disable_delete_obsolete_files_ == 1) {
-    // if not, it has already been disabled, so don't log anything
-    Log(options_.info_log, "File Deletions Disabled");
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "File Deletions Disabled");
+  } else {
+    Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+        "File Deletions Disabled, but already disabled. Counter: %d",
+        disable_delete_obsolete_files_);
   }
   return Status::OK();
 }
 
 Status DBImpl::EnableFileDeletions(bool force) {
-  DeletionState deletion_state;
+  // Job id == 0 means that this is not our background process, but rather
+  // user thread
+  JobContext job_context(0);
   bool should_purge_files = false;
   {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
     if (force) {
       // if force, we need to enable file deletions right away
       disable_delete_obsolete_files_ = 0;
@@ -47,18 +58,28 @@ Status DBImpl::EnableFileDeletions(bool force) {
       --disable_delete_obsolete_files_;
     }
     if (disable_delete_obsolete_files_ == 0)  {
-      Log(options_.info_log, "File Deletions Enabled");
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "File Deletions Enabled");
       should_purge_files = true;
-      FindObsoleteFiles(deletion_state, true);
+      FindObsoleteFiles(&job_context, true);
+    } else {
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+          "File Deletions Enable, but not really enabled. Counter: %d",
+          disable_delete_obsolete_files_);
     }
   }
   if (should_purge_files)  {
-    PurgeObsoleteFiles(deletion_state);
+    PurgeObsoleteFiles(job_context);
   }
-  LogFlush(options_.info_log);
+  job_context.Clean();
+  LogFlush(db_options_.info_log);
   return Status::OK();
 }
 
+int DBImpl::IsFileDeletionsEnabled() const {
+  return disable_delete_obsolete_files_;
+}
+
 Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
                             uint64_t* manifest_file_size,
                             bool flush_memtable) {
@@ -71,6 +92,9 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
     // flush all dirty data to disk.
     Status status;
     for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
       cfd->Ref();
       mutex_.Unlock();
       status = FlushMemTable(cfd, FlushOptions());
@@ -84,15 +108,18 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
 
     if (!status.ok()) {
       mutex_.Unlock();
-      Log(options_.info_log, "Cannot Flush data %s\n",
-          status.ToString().c_str());
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "Cannot Flush data %s\n", status.ToString().c_str());
       return status;
     }
   }
 
   // Make a set of all of the live *.sst files
-  std::set<uint64_t> live;
+  std::vector<FileDescriptor> live;
   for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
     cfd->current()->AddLiveFiles(&live);
   }
 
@@ -102,69 +129,21 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
   // create names of the live files. The names are not absolute
   // paths, instead they are relative to dbname_;
   for (auto live_file : live) {
-    ret.push_back(TableFileName("", live_file));
+    ret.push_back(MakeTableFileName("", live_file.GetNumber()));
   }
 
   ret.push_back(CurrentFileName(""));
-  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
+  ret.push_back(DescriptorFileName("", versions_->manifest_file_number()));
 
   // find length of manifest file while holding the mutex lock
-  *manifest_file_size = versions_->ManifestFileSize();
+  *manifest_file_size = versions_->manifest_file_size();
 
   mutex_.Unlock();
   return Status::OK();
 }
 
 Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
-  // First get sorted files in db dir, then get sorted files from archived
-  // dir, to avoid a race condition where a log file is moved to archived
-  // dir in between.
-  Status s;
-  // list wal files in main db dir.
-  VectorLogPtr logs;
-  s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
-  if (!s.ok()) {
-    return s;
-  }
-
-  // Reproduce the race condition where a log file is moved
-  // to archived dir, between these two sync points, used in
-  // (DBTest,TransactionLogIteratorRace)
-  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
-  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
-
-  files.clear();
-  // list wal files in archive dir.
-  std::string archivedir = ArchivalDirectory(options_.wal_dir);
-  if (env_->FileExists(archivedir)) {
-    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
-    if (!s.ok()) {
-      return s;
-    }
-  }
-
-  uint64_t latest_archived_log_number = 0;
-  if (!files.empty()) {
-    latest_archived_log_number = files.back()->LogNumber();
-    Log(options_.info_log, "Latest Archived log: %" PRIu64,
-        latest_archived_log_number);
-  }
-
-  files.reserve(files.size() + logs.size());
-  for (auto& log : logs) {
-    if (log->LogNumber() > latest_archived_log_number) {
-      files.push_back(std::move(log));
-    } else {
-      // When the race condition happens, we could see the
-      // same log in both db dir and archived dir. Simply
-      // ignore the one in db dir. Note that, if we read
-      // archived dir first, we would have missed the log file.
-      Log(options_.info_log, "%s already moved to archive",
-          log->PathName().c_str());
-    }
-  }
-
-  return s;
+  return wal_manager_.GetSortedWalFiles(files);
 }
 
 }
diff --git a/src/rocksdb/db/db_impl.cc b/src/rocksdb/db/db_impl.cc
index 25d8a07..757571d 100644
--- a/src/rocksdb/db/db_impl.cc
+++ b/src/rocksdb/db/db_impl.cc
@@ -9,7 +9,10 @@
 
 #include "db/db_impl.h"
 
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <algorithm>
 #include <climits>
@@ -24,20 +27,26 @@
 #include <vector>
 
 #include "db/builder.h"
+#include "db/flush_job.h"
+#include "db/compaction_job.h"
 #include "db/db_iter.h"
 #include "db/dbformat.h"
+#include "db/event_logger_helpers.h"
 #include "db/filename.h"
+#include "db/job_context.h"
 #include "db/log_reader.h"
 #include "db/log_writer.h"
+#include "db/managed_iterator.h"
 #include "db/memtable.h"
 #include "db/memtable_list.h"
 #include "db/merge_context.h"
 #include "db/merge_helper.h"
 #include "db/table_cache.h"
 #include "db/table_properties_collector.h"
-#include "db/tailing_iter.h"
+#include "db/forward_iterator.h"
 #include "db/transaction_log_impl.h"
 #include "db/version_set.h"
+#include "db/writebuffer.h"
 #include "db/write_batch_internal.h"
 #include "port/port.h"
 #include "rocksdb/cache.h"
@@ -46,6 +55,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
+#include "rocksdb/version.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
@@ -58,219 +68,51 @@
 #include "util/autovector.h"
 #include "util/build_version.h"
 #include "util/coding.h"
+#include "util/compression.h"
+#include "util/db_info_dumper.h"
+#include "util/file_util.h"
 #include "util/hash_skiplist_rep.h"
 #include "util/hash_linklist_rep.h"
 #include "util/logging.h"
 #include "util/log_buffer.h"
 #include "util/mutexlock.h"
 #include "util/perf_context_imp.h"
+#include "util/iostats_context_imp.h"
 #include "util/stop_watch.h"
 #include "util/sync_point.h"
+#include "util/string_util.h"
+#include "util/thread_status_updater.h"
+#include "util/thread_status_util.h"
+#include "util/xfunc.h"
 
 namespace rocksdb {
 
 const std::string kDefaultColumnFamilyName("default");
 
-void DumpLeveldbBuildVersion(Logger * log);
-
-// Information kept for every waiting writer
-struct DBImpl::Writer {
-  Status status;
-  WriteBatch* batch;
-  bool sync;
-  bool disableWAL;
-  bool done;
-  port::CondVar cv;
-
-  explicit Writer(port::Mutex* mu) : cv(mu) { }
-};
-
-struct DBImpl::CompactionState {
-  Compaction* const compaction;
+void DumpRocksDBBuildVersion(Logger * log);
 
-  // If there were two snapshots with seq numbers s1 and
-  // s2 and s1 < s2, and if we find two instances of a key k1 then lies
-  // entirely within s1 and s2, then the earlier version of k1 can be safely
-  // deleted because that version is not visible in any snapshot.
-  std::vector<SequenceNumber> existing_snapshots;
+struct DBImpl::WriteContext {
+  autovector<SuperVersion*> superversions_to_free_;
+  bool schedule_bg_work_ = false;
 
-  // Files produced by compaction
-  struct Output {
-    uint64_t number;
-    uint64_t file_size;
-    InternalKey smallest, largest;
-    SequenceNumber smallest_seqno, largest_seqno;
-  };
-  std::vector<Output> outputs;
-  std::list<uint64_t> allocated_file_numbers;
-
-  // State kept for output being generated
-  unique_ptr<WritableFile> outfile;
-  unique_ptr<TableBuilder> builder;
-
-  uint64_t total_bytes;
-
-  Output* current_output() { return &outputs[outputs.size()-1]; }
-
-  explicit CompactionState(Compaction* c)
-      : compaction(c),
-        total_bytes(0) {
-  }
-
-  // Create a client visible context of this compaction
-  CompactionFilter::Context GetFilterContextV1() {
-    CompactionFilter::Context context;
-    context.is_full_compaction = compaction->IsFullCompaction();
-    context.is_manual_compaction = compaction->IsManualCompaction();
-    return context;
-  }
-
-  // Create a client visible context of this compaction
-  CompactionFilterContext GetFilterContext() {
-    CompactionFilterContext context;
-    context.is_full_compaction = compaction->IsFullCompaction();
-    context.is_manual_compaction = compaction->IsManualCompaction();
-    return context;
-  }
-
-  std::vector<Slice> key_buf_;
-  std::vector<Slice> existing_value_buf_;
-  std::vector<std::string> key_str_buf_;
-  std::vector<std::string> existing_value_str_buf_;
-  // new_value_buf_ will only be appended if a value changes
-  std::vector<std::string> new_value_buf_;
-  // if values_changed_buf_[i] is true
-  // new_value_buf_ will add a new entry with the changed value
-  std::vector<bool> value_changed_buf_;
-  // to_delete_buf_[i] is true iff key_buf_[i] is deleted
-  std::vector<bool> to_delete_buf_;
-  // buffer for the parsed internal keys, the string buffer is backed
-  // by key_str_buf_
-  std::vector<ParsedInternalKey> ikey_buf_;
-
-  std::vector<Slice> other_key_buf_;
-  std::vector<Slice> other_value_buf_;
-  std::vector<std::string> other_key_str_buf_;
-  std::vector<std::string> other_value_str_buf_;
-
-  std::vector<Slice> combined_key_buf_;
-  std::vector<Slice> combined_value_buf_;
-
-  std::string cur_prefix_;
-
-  // Buffers the kv-pair that will be run through compaction filter V2
-  // in the future.
-  void BufferKeyValueSlices(const Slice& key, const Slice& value) {
-    key_str_buf_.emplace_back(key.ToString());
-    existing_value_str_buf_.emplace_back(value.ToString());
-    key_buf_.emplace_back(Slice(key_str_buf_.back()));
-    existing_value_buf_.emplace_back(Slice(existing_value_str_buf_.back()));
-
-    ParsedInternalKey ikey;
-    ParseInternalKey(key_buf_.back(), &ikey);
-    ikey_buf_.emplace_back(ikey);
-  }
-
-  // Buffers the kv-pair that will not be run through compaction filter V2
-  // in the future.
-  void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) {
-    other_key_str_buf_.emplace_back(key.ToString());
-    other_value_str_buf_.emplace_back(value.ToString());
-    other_key_buf_.emplace_back(Slice(other_key_str_buf_.back()));
-    other_value_buf_.emplace_back(Slice(other_value_str_buf_.back()));
-  }
-
-  // Add a kv-pair to the combined buffer
-  void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) {
-    // The real strings are stored in the batch buffers
-    combined_key_buf_.emplace_back(key);
-    combined_value_buf_.emplace_back(value);
-  }
-
-  // Merging the two buffers
-  void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) {
-    size_t i = 0;
-    size_t j = 0;
-    size_t total_size = key_buf_.size() + other_key_buf_.size();
-    combined_key_buf_.reserve(total_size);
-    combined_value_buf_.reserve(total_size);
-
-    while (i + j < total_size) {
-      int comp_res = 0;
-      if (i < key_buf_.size() && j < other_key_buf_.size()) {
-        comp_res = comparator->Compare(key_buf_[i], other_key_buf_[j]);
-      } else if (i >= key_buf_.size() && j < other_key_buf_.size()) {
-        comp_res = 1;
-      } else if (j >= other_key_buf_.size() && i < key_buf_.size()) {
-        comp_res = -1;
-      }
-      if (comp_res > 0) {
-        AddToCombinedKeyValueSlices(other_key_buf_[j], other_value_buf_[j]);
-        j++;
-      } else if (comp_res < 0) {
-        AddToCombinedKeyValueSlices(key_buf_[i], existing_value_buf_[i]);
-        i++;
-      }
+  ~WriteContext() {
+    for (auto& sv : superversions_to_free_) {
+      delete sv;
     }
   }
-
-  void CleanupBatchBuffer() {
-    to_delete_buf_.clear();
-    key_buf_.clear();
-    existing_value_buf_.clear();
-    key_str_buf_.clear();
-    existing_value_str_buf_.clear();
-    new_value_buf_.clear();
-    value_changed_buf_.clear();
-    ikey_buf_.clear();
-
-    to_delete_buf_.shrink_to_fit();
-    key_buf_.shrink_to_fit();
-    existing_value_buf_.shrink_to_fit();
-    key_str_buf_.shrink_to_fit();
-    existing_value_str_buf_.shrink_to_fit();
-    new_value_buf_.shrink_to_fit();
-    value_changed_buf_.shrink_to_fit();
-    ikey_buf_.shrink_to_fit();
-
-    other_key_buf_.clear();
-    other_value_buf_.clear();
-    other_key_str_buf_.clear();
-    other_value_str_buf_.clear();
-    other_key_buf_.shrink_to_fit();
-    other_value_buf_.shrink_to_fit();
-    other_key_str_buf_.shrink_to_fit();
-    other_value_str_buf_.shrink_to_fit();
-  }
-
-  void CleanupMergedBuffer() {
-    combined_key_buf_.clear();
-    combined_value_buf_.clear();
-    combined_key_buf_.shrink_to_fit();
-    combined_value_buf_.shrink_to_fit();
-  }
 };
 
-namespace {
-// Fix user-supplied options to be reasonable
-template <class T, class V>
-static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
-  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
-  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
-}
-}  // anonymous namespace
-
 Options SanitizeOptions(const std::string& dbname,
                         const InternalKeyComparator* icmp,
-                        const InternalFilterPolicy* ipolicy,
                         const Options& src) {
   auto db_options = SanitizeOptions(dbname, DBOptions(src));
-  auto cf_options = SanitizeOptions(icmp, ipolicy, ColumnFamilyOptions(src));
+  auto cf_options = SanitizeOptions(db_options, icmp, ColumnFamilyOptions(src));
   return Options(db_options, cf_options);
 }
 
 DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
   DBOptions result = src;
+
   // result.max_open_files means an "infinite" open files.
   if (result.max_open_files != -1) {
     ClipToRange(&result.max_open_files, 20, 1000000);
@@ -284,6 +126,16 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
       result.info_log = nullptr;
     }
   }
+  result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions,
+                                           Env::Priority::LOW);
+  result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes,
+                                           Env::Priority::HIGH);
+
+  if (result.rate_limiter.get() != nullptr) {
+    if (result.bytes_per_sync == 0) {
+      result.bytes_per_sync = 1024 * 1024;
+    }
+  }
 
   if (result.wal_dir.empty()) {
     // Use dbname as default
@@ -293,121 +145,148 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
     result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
   }
 
+  if (result.db_paths.size() == 0) {
+    result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+  }
+
   return result;
 }
 
-CompressionType GetCompressionType(const Options& options, int level,
-                                   const bool enable_compression) {
-  if (!enable_compression) {
-    // disable compression
-    return kNoCompression;
-  }
-  // If the use has specified a different compression level for each level,
-  // then pick the compresison for that level.
-  if (!options.compression_per_level.empty()) {
-    const int n = options.compression_per_level.size() - 1;
-    // It is possible for level_ to be -1; in that case, we use level
-    // 0's compression.  This occurs mostly in backwards compatibility
-    // situations when the builder doesn't know what level the file
-    // belongs to.  Likewise, if level_ is beyond the end of the
-    // specified compression levels, use the last value.
-    return options.compression_per_level[std::max(0, std::min(level, n))];
-  } else {
-    return options.compression;
+namespace {
+
+Status SanitizeOptionsByTable(
+    const DBOptions& db_opts,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto cf : column_families) {
+    s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
+    if (!s.ok()) {
+      return s;
+    }
   }
+  return Status::OK();
 }
 
-CompressionType GetCompressionFlush(const Options& options) {
+CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) {
   // Compressing memtable flushes might not help unless the sequential load
   // optimization is used for leveled compaction. Otherwise the CPU and
   // latency overhead is not offset by saving much space.
 
   bool can_compress;
 
-  if  (options.compaction_style == kCompactionStyleUniversal) {
+  if (ioptions.compaction_style == kCompactionStyleUniversal) {
     can_compress =
-        (options.compaction_options_universal.compression_size_percent < 0);
+        (ioptions.compaction_options_universal.compression_size_percent < 0);
   } else {
     // For leveled compress when min_level_to_compress == 0.
-    can_compress = (GetCompressionType(options, 0, true) != kNoCompression);
+    can_compress = ioptions.compression_per_level.empty() ||
+                   ioptions.compression_per_level[0] != kNoCompression;
   }
 
   if (can_compress) {
-    return options.compression;
+    return ioptions.compression;
   } else {
     return kNoCompression;
   }
 }
 
+void DumpCompressionInfo(Logger* logger) {
+  Log(InfoLogLevel::INFO_LEVEL, logger, "Compression algorithms supported:");
+  Log(InfoLogLevel::INFO_LEVEL, logger, "\tSnappy supported: %d",
+      Snappy_Supported());
+  Log(InfoLogLevel::INFO_LEVEL, logger, "\tZlib supported: %d",
+      Zlib_Supported());
+  Log(InfoLogLevel::INFO_LEVEL, logger, "\tBzip supported: %d",
+      BZip2_Supported());
+  Log(InfoLogLevel::INFO_LEVEL, logger, "\tLZ4 supported: %d", LZ4_Supported());
+}
+
+}  // namespace
+
 DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
     : env_(options.env),
       dbname_(dbname),
-      options_(SanitizeOptions(dbname, options)),
+      db_options_(SanitizeOptions(dbname, options)),
+      stats_(db_options_.statistics.get()),
       db_lock_(nullptr),
-      mutex_(options.use_adaptive_mutex),
-      shutting_down_(nullptr),
+      mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, options.use_adaptive_mutex),
+      shutting_down_(false),
       bg_cv_(&mutex_),
       logfile_number_(0),
+      log_dir_synced_(false),
       log_empty_(true),
       default_cf_handle_(nullptr),
       total_log_size_(0),
       max_total_in_memory_state_(0),
-      tmp_batch_(),
-      bg_schedule_needed_(false),
+      is_snapshot_supported_(true),
+      write_buffer_(options.db_write_buffer_size),
+      unscheduled_flushes_(0),
+      unscheduled_compactions_(0),
       bg_compaction_scheduled_(0),
       bg_manual_only_(0),
       bg_flush_scheduled_(0),
-      bg_logstats_scheduled_(false),
       manual_compaction_(nullptr),
-      logger_(nullptr),
       disable_delete_obsolete_files_(0),
-      delete_obsolete_files_last_run_(options.env->NowMicros()),
-      purge_wal_files_last_run_(0),
+      delete_obsolete_files_next_run_(
+          options.env->NowMicros() +
+          db_options_.delete_obsolete_files_period_micros),
       last_stats_dump_time_microsec_(0),
-      default_interval_to_delete_obsolete_WAL_(600),
+      next_job_id_(1),
       flush_on_destroy_(false),
-      delayed_writes_(0),
-      storage_options_(options),
+      env_options_(db_options_),
+#ifndef ROCKSDB_LITE
+      wal_manager_(db_options_, env_options_),
+#endif  // ROCKSDB_LITE
+      event_logger_(db_options_.info_log.get()),
       bg_work_gate_closed_(false),
       refitting_level_(false),
-      opened_successfully_(false) {
+      opened_successfully_(false),
+      notifying_events_(0) {
   env_->GetAbsolutePath(dbname, &db_absolute_path_);
 
   // Reserve ten files or so for other uses and give the rest to TableCache.
   // Give a large number for setting of "infinite" open files.
-  const int table_cache_size =
-      (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10;
+  const int table_cache_size = (db_options_.max_open_files == -1) ?
+        4194304 : db_options_.max_open_files - 10;
   // Reserve ten files or so for other uses and give the rest to TableCache.
   table_cache_ =
-      NewLRUCache(table_cache_size, options_.table_cache_numshardbits,
-                  options_.table_cache_remove_scan_count_limit);
+      NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits);
 
-  versions_.reset(
-      new VersionSet(dbname_, &options_, storage_options_, table_cache_.get()));
-  column_family_memtables_.reset(
-      new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+  versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+                                 table_cache_.get(), &write_buffer_,
+                                 &write_controller_));
+  column_family_memtables_.reset(new ColumnFamilyMemTablesImpl(
+      versions_->GetColumnFamilySet(), &flush_scheduler_));
 
-  DumpLeveldbBuildVersion(options_.info_log.get());
-  options_.Dump(options_.info_log.get());
+  DumpRocksDBBuildVersion(db_options_.info_log.get());
+  DumpDBFileSummary(db_options_, dbname_);
+  db_options_.Dump(db_options_.info_log.get());
+  DumpCompressionInfo(db_options_.info_log.get());
 
-  char name[100];
-  Status s = env_->GetHostName(name, 100L);
-  if (s.ok()) {
-    host_name_ = name;
-  } else {
-    Log(options_.info_log, "Can't get hostname, use localhost as host name.");
-    host_name_ = "localhost";
-  }
-  last_log_ts = 0;
+  LogFlush(db_options_.info_log);
+}
 
-  LogFlush(options_.info_log);
+// Will only lock the mutex_ and wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+  shutting_down_.store(true, std::memory_order_release);
+  if (!wait) {
+    return;
+  }
+  // Wait for background work to finish
+  mutex_.Lock();
+  while (bg_compaction_scheduled_ || bg_flush_scheduled_ || notifying_events_) {
+    bg_cv_.Wait();
+  }
+  mutex_.Unlock();
 }
 
 DBImpl::~DBImpl() {
+  EraseThreadStatusDbInfo();
   mutex_.Lock();
+
   if (flush_on_destroy_) {
     for (auto cfd : *versions_->GetColumnFamilySet()) {
-      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+      if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) {
         cfd->Ref();
         mutex_.Unlock();
         FlushMemTable(cfd, FlushOptions());
@@ -417,14 +296,37 @@ DBImpl::~DBImpl() {
     }
     versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
   }
+  // CancelAllBackgroundWork called with false means we just set the
+  // shutdown marker, while holding the mutex_ here. After which we
+  // do a variant of the waiting after we release the lock and unschedule work
+  // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+  CancelAllBackgroundWork(false);
+  mutex_.Unlock();
+  int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
+  int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
+  mutex_.Lock();
+  bg_compaction_scheduled_ -= compactions_unscheduled;
+  bg_flush_scheduled_ -= flushes_unscheduled;
 
   // Wait for background work to finish
-  shutting_down_.Release_Store(this);  // Any non-nullptr value is ok
-  while (bg_compaction_scheduled_ ||
-         bg_flush_scheduled_ ||
-         bg_logstats_scheduled_) {
+  while (bg_compaction_scheduled_ || bg_flush_scheduled_ || notifying_events_) {
     bg_cv_.Wait();
   }
+  listeners_.clear();
+  flush_scheduler_.Clear();
+
+  while (!flush_queue_.empty()) {
+    auto cfd = PopFirstFromFlushQueue();
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
+  while (!compaction_queue_.empty()) {
+    auto cfd = PopFirstFromCompactionQueue();
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
 
   if (default_cf_handle_ != nullptr) {
     // we need to delete handle outside of lock because it does its own locking
@@ -433,25 +335,28 @@ DBImpl::~DBImpl() {
     mutex_.Lock();
   }
 
-  if (options_.allow_thread_local) {
-    // Clean up obsolete files due to SuperVersion release.
-    // (1) Need to delete to obsolete files before closing because RepairDB()
-    // scans all existing files in the file system and builds manifest file.
-    // Keeping obsolete files confuses the repair process.
-    // (2) Need to check if we Open()/Recover() the DB successfully before
-    // deleting because if VersionSet recover fails (may be due to corrupted
-    // manifest file), it is not able to identify live files correctly. As a
-    // result, all "live" files can get deleted by accident. However, corrupted
-    // manifest is recoverable by RepairDB().
-    if (opened_successfully_) {
-      DeletionState deletion_state;
-      FindObsoleteFiles(deletion_state, true);
-      // manifest number starting from 2
-      deletion_state.manifest_file_number = 1;
-      if (deletion_state.HaveSomethingToDelete()) {
-        PurgeObsoleteFiles(deletion_state);
-      }
+  // Clean up obsolete files due to SuperVersion release.
+  // (1) Need to delete to obsolete files before closing because RepairDB()
+  // scans all existing files in the file system and builds manifest file.
+  // Keeping obsolete files confuses the repair process.
+  // (2) Need to check if we Open()/Recover() the DB successfully before
+  // deleting because if VersionSet recover fails (may be due to corrupted
+  // manifest file), it is not able to identify live files correctly. As a
+  // result, all "live" files can get deleted by accident. However, corrupted
+  // manifest is recoverable by RepairDB().
+  if (opened_successfully_) {
+    JobContext job_context(next_job_id_.fetch_add(1));
+    FindObsoleteFiles(&job_context, true);
+    // manifest number starting from 2
+    job_context.manifest_file_number = 1;
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
     }
+    job_context.Clean();
+  }
+
+  for (auto l : logs_to_free_) {
+    delete l;
   }
 
   // versions need to be destroyed before table_cache since it can hold
@@ -462,7 +367,7 @@ DBImpl::~DBImpl() {
     env_->UnlockFile(db_lock_);
   }
 
-  LogFlush(options_.info_log);
+  LogFlush(db_options_.info_log);
 }
 
 Status DBImpl::NewDB() {
@@ -471,23 +376,28 @@ Status DBImpl::NewDB() {
   new_db.SetNextFile(2);
   new_db.SetLastSequence(0);
 
+  Log(InfoLogLevel::INFO_LEVEL,
+      db_options_.info_log, "Creating manifest 1 \n");
   const std::string manifest = DescriptorFileName(dbname_, 1);
   unique_ptr<WritableFile> file;
   Status s = env_->NewWritableFile(
-      manifest, &file, env_->OptimizeForManifestWrite(storage_options_));
+      manifest, &file, env_->OptimizeForManifestWrite(env_options_));
   if (!s.ok()) {
     return s;
   }
-  file->SetPreallocationBlockSize(options_.manifest_preallocation_size);
+  file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size);
   {
     log::Writer log(std::move(file));
     std::string record;
     new_db.EncodeTo(&record);
     s = log.AddRecord(record);
+    if (s.ok()) {
+      s = SyncManifest(env_, &db_options_, log.file());
+    }
   }
   if (s.ok()) {
     // Make "CURRENT" file that points to the new manifest file.
-    s = SetCurrentFile(env_, dbname_, 1);
+    s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir());
   } else {
     env_->DeleteFile(manifest);
   }
@@ -495,59 +405,85 @@ Status DBImpl::NewDB() {
 }
 
 void DBImpl::MaybeIgnoreError(Status* s) const {
-  if (s->ok() || options_.paranoid_checks) {
+  if (s->ok() || db_options_.paranoid_checks) {
     // No change needed
   } else {
-    Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+    Log(InfoLogLevel::WARN_LEVEL,
+        db_options_.info_log, "Ignoring error %s", s->ToString().c_str());
     *s = Status::OK();
   }
 }
 
 const Status DBImpl::CreateArchivalDirectory() {
-  if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) {
-    std::string archivalPath = ArchivalDirectory(options_.wal_dir);
+  if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) {
+    std::string archivalPath = ArchivalDirectory(db_options_.wal_dir);
     return env_->CreateDirIfMissing(archivalPath);
   }
   return Status::OK();
 }
 
 void DBImpl::PrintStatistics() {
-  auto dbstats = options_.statistics.get();
+  auto dbstats = db_options_.statistics.get();
   if (dbstats) {
-    Log(options_.info_log,
-        "STATISTCS:\n %s",
+    Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+        "STATISTICS:\n %s",
         dbstats->ToString().c_str());
   }
 }
 
 void DBImpl::MaybeDumpStats() {
-  if (options_.stats_dump_period_sec == 0) return;
+  if (db_options_.stats_dump_period_sec == 0) return;
 
   const uint64_t now_micros = env_->NowMicros();
 
   if (last_stats_dump_time_microsec_ +
-      options_.stats_dump_period_sec * 1000000
+      db_options_.stats_dump_period_sec * 1000000
       <= now_micros) {
     // Multiple threads could race in here simultaneously.
     // However, the last one will update last_stats_dump_time_microsec_
     // atomically. We could see more than one dump during one dump
     // period in rare cases.
     last_stats_dump_time_microsec_ = now_micros;
+
+#ifndef ROCKSDB_LITE
+    bool tmp1 = false;
+    bool tmp2 = false;
+    DBPropertyType cf_property_type =
+        GetPropertyType(DB::Properties::kCFStats, &tmp1, &tmp2);
+    DBPropertyType db_property_type =
+        GetPropertyType(DB::Properties::kDBStats, &tmp1, &tmp2);
     std::string stats;
-    GetProperty("rocksdb.stats", &stats);
-    Log(options_.info_log, "%s", stats.c_str());
+    {
+      InstrumentedMutexLock l(&mutex_);
+      for (auto cfd : *versions_->GetColumnFamilySet()) {
+        cfd->internal_stats()->GetStringProperty(cf_property_type,
+                                                 DB::Properties::kCFStats,
+                                                 &stats);
+      }
+      default_cf_internal_stats_->GetStringProperty(db_property_type,
+                                                    DB::Properties::kDBStats,
+                                                    &stats);
+    }
+    Log(InfoLogLevel::WARN_LEVEL,
+        db_options_.info_log, "------- DUMPING STATS -------");
+    Log(InfoLogLevel::WARN_LEVEL,
+        db_options_.info_log, "%s", stats.c_str());
+#endif  // !ROCKSDB_LITE
+
     PrintStatistics();
   }
 }
 
-// Returns the list of live files in 'sst_live' and the list
-// of all files in the filesystem in 'candidate_files'.
+// * Returns the list of live files in 'sst_live'
+// If it's doing full scan:
+// * Returns the list of all files in the filesystem in
+// 'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
 // no_full_scan = true -- never do the full scan using GetChildren()
 // force = false -- don't force the full scan, except every
-//  options_.delete_obsolete_files_period_micros
+//  db_options_.delete_obsolete_files_period_micros
 // force = true -- force the full scan
-void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
-                               bool force,
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
                                bool no_full_scan) {
   mutex_.AssertHeld();
 
@@ -561,64 +497,91 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
   // logic for figurint out if we're doing the full scan
   if (no_full_scan) {
     doing_the_full_scan = false;
-  } else if (force || options_.delete_obsolete_files_period_micros == 0) {
+  } else if (force || db_options_.delete_obsolete_files_period_micros == 0) {
     doing_the_full_scan = true;
   } else {
     const uint64_t now_micros = env_->NowMicros();
-    if (delete_obsolete_files_last_run_ +
-        options_.delete_obsolete_files_period_micros < now_micros) {
+    if (delete_obsolete_files_next_run_ < now_micros) {
       doing_the_full_scan = true;
-      delete_obsolete_files_last_run_ = now_micros;
+      delete_obsolete_files_next_run_ =
+          now_micros + db_options_.delete_obsolete_files_period_micros;
     }
   }
 
+  // don't delete files that might be currently written to from compaction
+  // threads
+  if (!pending_outputs_.empty()) {
+    job_context->min_pending_output = *pending_outputs_.begin();
+  } else {
+    // delete all of them
+    job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
+  }
+
   // get obsolete files
-  versions_->GetObsoleteFiles(&deletion_state.sst_delete_files);
+  versions_->GetObsoleteFiles(&job_context->sst_delete_files,
+                              job_context->min_pending_output);
 
   // store the current filenum, lognum, etc
-  deletion_state.manifest_file_number = versions_->ManifestFileNumber();
-  deletion_state.pending_manifest_file_number =
-      versions_->PendingManifestFileNumber();
-  deletion_state.log_number = versions_->MinLogNumber();
-  deletion_state.prev_log_number = versions_->PrevLogNumber();
-
-  if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) {
-    // avoid filling up sst_live if we're sure that we
-    // are not going to do the full scan and that we don't have
-    // anything to delete at the moment
-    return;
-  }
-
-  // don't delete live files
-  deletion_state.sst_live.assign(pending_outputs_.begin(),
-                                 pending_outputs_.end());
-  versions_->AddLiveFiles(&deletion_state.sst_live);
+  job_context->manifest_file_number = versions_->manifest_file_number();
+  job_context->pending_manifest_file_number =
+      versions_->pending_manifest_file_number();
+  job_context->log_number = versions_->MinLogNumber();
+  job_context->prev_log_number = versions_->prev_log_number();
 
+  versions_->AddLiveFiles(&job_context->sst_live);
   if (doing_the_full_scan) {
-    // set of all files in the directory. We'll exclude files that are still
-    // alive in the subsequent processings.
-    env_->GetChildren(
-        dbname_, &deletion_state.candidate_files
-    ); // Ignore errors
+    for (uint32_t path_id = 0; path_id < db_options_.db_paths.size();
+         path_id++) {
+      // set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processings.
+      std::vector<std::string> files;
+      env_->GetChildren(db_options_.db_paths[path_id].path,
+                        &files);  // Ignore errors
+      for (std::string file : files) {
+        // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
+        job_context->full_scan_candidate_files.emplace_back("/" + file,
+                                                            path_id);
+      }
+    }
 
     //Add log files in wal_dir
-    if (options_.wal_dir != dbname_) {
+    if (db_options_.wal_dir != dbname_) {
       std::vector<std::string> log_files;
-      env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors
-      deletion_state.candidate_files.insert(
-        deletion_state.candidate_files.end(),
-        log_files.begin(),
-        log_files.end()
-      );
+      env_->GetChildren(db_options_.wal_dir, &log_files);  // Ignore errors
+      for (std::string log_file : log_files) {
+        job_context->full_scan_candidate_files.emplace_back(log_file, 0);
+      }
+    }
+    // Add info log files in db_log_dir
+    if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) {
+      std::vector<std::string> info_log_files;
+      // Ignore errors
+      env_->GetChildren(db_options_.db_log_dir, &info_log_files);
+      for (std::string log_file : info_log_files) {
+        job_context->full_scan_candidate_files.emplace_back(log_file, 0);
+      }
     }
   }
 }
 
+namespace {
+bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
+                          const JobContext::CandidateFileInfo& second) {
+  if (first.file_name > second.file_name) {
+    return true;
+  } else if (first.file_name < second.file_name) {
+    return false;
+  } else {
+    return (first.path_id > second.path_id);
+  }
+}
+};  // namespace
+
 // Diffs the files listed in filenames and those that do not
 // belong to live files are posibly removed. Also, removes all the
 // files in sst_delete_files and log_delete_files.
 // It is not necessary to hold the mutex when invoking this method.
-void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
+void DBImpl::PurgeObsoleteFiles(const JobContext& state) {
   // we'd better have sth to delete
   assert(state.HaveSomethingToDelete());
 
@@ -629,44 +592,48 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
     return;
   }
 
-  // Now, convert live list to an unordered set, WITHOUT mutex held;
+  // Now, convert live list to an unordered map, WITHOUT mutex held;
   // set is slow.
-  std::unordered_set<uint64_t> sst_live(state.sst_live.begin(),
-                                        state.sst_live.end());
-
-  auto& candidate_files = state.candidate_files;
-  candidate_files.reserve(
-      candidate_files.size() +
-      state.sst_delete_files.size() +
-      state.log_delete_files.size());
+  std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
+  for (const FileDescriptor& fd : state.sst_live) {
+    sst_live_map[fd.GetNumber()] = &fd;
+  }
+
+  auto candidate_files = state.full_scan_candidate_files;
+  candidate_files.reserve(candidate_files.size() +
+                          state.sst_delete_files.size() +
+                          state.log_delete_files.size());
   // We may ignore the dbname when generating the file names.
   const char* kDumbDbName = "";
   for (auto file : state.sst_delete_files) {
-    candidate_files.push_back(
-        TableFileName(kDumbDbName, file->number).substr(1)
-    );
+    candidate_files.emplace_back(
+        MakeTableFileName(kDumbDbName, file->fd.GetNumber()),
+        file->fd.GetPathId());
     delete file;
   }
 
   for (auto file_num : state.log_delete_files) {
     if (file_num > 0) {
-      candidate_files.push_back(LogFileName(kDumbDbName, file_num).substr(1));
+      candidate_files.emplace_back(LogFileName(kDumbDbName, file_num).substr(1),
+                                   0);
     }
   }
 
   // dedup state.candidate_files so we don't try to delete the same
   // file twice
-  sort(candidate_files.begin(), candidate_files.end());
+  sort(candidate_files.begin(), candidate_files.end(), CompareCandidateFile);
   candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()),
                         candidate_files.end());
 
   std::vector<std::string> old_info_log_files;
-
-  for (const auto& to_delete : candidate_files) {
+  InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_);
+  for (const auto& candidate_file : candidate_files) {
+    std::string to_delete = candidate_file.file_name;
+    uint32_t path_id = candidate_file.path_id;
     uint64_t number;
     FileType type;
     // Ignore file if we cannot recognize it.
-    if (!ParseFileName(to_delete, &number, &type)) {
+    if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
       continue;
     }
 
@@ -682,7 +649,10 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
         keep = (number >= state.manifest_file_number);
         break;
       case kTableFile:
-        keep = (sst_live.find(number) != sst_live.end());
+        // If the second condition is not there, this makes
+        // DontDeletePendingOutputs fail
+        keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+               number >= state.min_pending_output;
         break;
       case kTempFile:
         // Any temp files that are currently being written to must
@@ -690,7 +660,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
         // Also, SetCurrentFile creates a temp file when writing out new
         // manifest, which is equal to state.pending_manifest_file_number. We
         // should not delete that file
-        keep = (sst_live.find(number) != sst_live.end()) ||
+        keep = (sst_live_map.find(number) != sst_live_map.end()) ||
                (number == state.pending_manifest_file_number);
         break;
       case kInfoLogFile:
@@ -711,367 +681,132 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
       continue;
     }
 
+    std::string fname;
     if (type == kTableFile) {
       // evict from cache
       TableCache::Evict(table_cache_.get(), number);
+      fname = TableFileName(db_options_.db_paths, number, path_id);
+      event_logger_.Log() << "job" << state.job_id << "event"
+                          << "table_file_deletion"
+                          << "file_number" << number;
+    } else {
+      fname = ((type == kLogFile) ?
+          db_options_.wal_dir : dbname_) + "/" + to_delete;
     }
 
-    std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) +
-        "/" + to_delete;
-    if (type == kLogFile &&
-        (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) {
-      auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number);
-      // The sync point below is used in (DBTest,TransactionLogIteratorRace)
-      TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1");
-      Status s = env_->RenameFile(fname, archived_log_name);
-      // The sync point below is used in (DBTest,TransactionLogIteratorRace)
-      TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2");
-      Log(options_.info_log,
-          "Move log file %s to %s -- %s\n",
-          fname.c_str(), archived_log_name.c_str(), s.ToString().c_str());
+#ifdef ROCKSDB_LITE
+    Status s = env_->DeleteFile(fname);
+    Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+        "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id,
+        fname.c_str(), type, number, s.ToString().c_str());
+#else   // not ROCKSDB_LITE
+    if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 ||
+                             db_options_.WAL_size_limit_MB > 0)) {
+      wal_manager_.ArchiveWALFile(fname, number);
     } else {
       Status s = env_->DeleteFile(fname);
-      Log(options_.info_log, "Delete %s type=%d #%lu -- %s\n",
-          fname.c_str(), type, (unsigned long)number,
-          s.ToString().c_str());
+      Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+          "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id,
+          fname.c_str(), type, number, s.ToString().c_str());
     }
+#endif  // ROCKSDB_LITE
   }
 
   // Delete old info log files.
   size_t old_info_log_file_count = old_info_log_files.size();
-  // NOTE: Currently we only support log purge when options_.db_log_dir is
-  // located in `dbname` directory.
-  if (old_info_log_file_count >= options_.keep_log_file_num &&
-      options_.db_log_dir.empty()) {
+  if (old_info_log_file_count >= db_options_.keep_log_file_num) {
     std::sort(old_info_log_files.begin(), old_info_log_files.end());
-    size_t end = old_info_log_file_count - options_.keep_log_file_num;
+    size_t end = old_info_log_file_count - db_options_.keep_log_file_num;
     for (unsigned int i = 0; i <= end; i++) {
       std::string& to_delete = old_info_log_files.at(i);
-      Log(options_.info_log, "Delete info log file %s\n", to_delete.c_str());
-      Status s = env_->DeleteFile(dbname_ + "/" + to_delete);
+      std::string full_path_to_delete = (db_options_.db_log_dir.empty() ?
+           dbname_ : db_options_.db_log_dir) + "/" + to_delete;
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "[JOB %d] Delete info log file %s\n", state.job_id,
+          full_path_to_delete.c_str());
+      Status s = env_->DeleteFile(full_path_to_delete);
       if (!s.ok()) {
-        Log(options_.info_log, "Delete info log file %s FAILED -- %s\n",
+        Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+            "[JOB %d] Delete info log file %s FAILED -- %s\n", state.job_id,
             to_delete.c_str(), s.ToString().c_str());
       }
     }
   }
-  PurgeObsoleteWALFiles();
-  LogFlush(options_.info_log);
+#ifndef ROCKSDB_LITE
+  wal_manager_.PurgeObsoleteWALFiles();
+#endif  // ROCKSDB_LITE
+  LogFlush(db_options_.info_log);
 }
 
 void DBImpl::DeleteObsoleteFiles() {
   mutex_.AssertHeld();
-  DeletionState deletion_state;
-  FindObsoleteFiles(deletion_state, true);
-  if (deletion_state.HaveSomethingToDelete()) {
-    PurgeObsoleteFiles(deletion_state);
+  JobContext job_context(next_job_id_.fetch_add(1));
+  FindObsoleteFiles(&job_context, true);
+  if (job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(job_context);
   }
+  job_context.Clean();
 }
 
-#ifndef ROCKSDB_LITE
-// 1. Go through all archived files and
-//    a. if ttl is enabled, delete outdated files
-//    b. if archive size limit is enabled, delete empty files,
-//        compute file number and size.
-// 2. If size limit is enabled:
-//    a. compute how many files should be deleted
-//    b. get sorted non-empty archived logs
-//    c. delete what should be deleted
-void DBImpl::PurgeObsoleteWALFiles() {
-  bool const ttl_enabled = options_.WAL_ttl_seconds > 0;
-  bool const size_limit_enabled =  options_.WAL_size_limit_MB > 0;
-  if (!ttl_enabled && !size_limit_enabled) {
-    return;
-  }
-
-  int64_t current_time;
-  Status s = env_->GetCurrentTime(&current_time);
+Status DBImpl::Directories::CreateAndNewDirectory(
+    Env* env, const std::string& dirname,
+    std::unique_ptr<Directory>* directory) const {
+  // We call CreateDirIfMissing() as the directory may already exist (if we
+  // are reopening a DB), when this happens we don't want creating the
+  // directory to cause an error. However, we need to check if creating the
+  // directory fails or else we may get an obscure message about the lock
+  // file not existing. One real-world example of this occurring is if
+  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but when "dir" doesn't exist.
+  Status s = env->CreateDirIfMissing(dirname);
   if (!s.ok()) {
-    Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str());
-    assert(false);
-    return;
-  }
-  uint64_t const now_seconds = static_cast<uint64_t>(current_time);
-  uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ?
-    options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_;
-
-  if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
-    return;
+    return s;
   }
+  return env->NewDirectory(dirname, directory);
+}
 
-  purge_wal_files_last_run_ = now_seconds;
-
-  std::string archival_dir = ArchivalDirectory(options_.wal_dir);
-  std::vector<std::string> files;
-  s = env_->GetChildren(archival_dir, &files);
+Status DBImpl::Directories::SetDirectories(
+    Env* env, const std::string& dbname, const std::string& wal_dir,
+    const std::vector<DbPath>& data_paths) {
+  Status s = CreateAndNewDirectory(env, dbname, &db_dir_);
   if (!s.ok()) {
-    Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str());
-    assert(false);
-    return;
-  }
-
-  size_t log_files_num = 0;
-  uint64_t log_file_size = 0;
-
-  for (auto& f : files) {
-    uint64_t number;
-    FileType type;
-    if (ParseFileName(f, &number, &type) && type == kLogFile) {
-      std::string const file_path = archival_dir + "/" + f;
-      if (ttl_enabled) {
-        uint64_t file_m_time;
-        Status const s = env_->GetFileModificationTime(file_path,
-          &file_m_time);
-        if (!s.ok()) {
-          Log(options_.info_log, "Can't get file mod time: %s: %s",
-              file_path.c_str(), s.ToString().c_str());
-          continue;
-        }
-        if (now_seconds - file_m_time > options_.WAL_ttl_seconds) {
-          Status const s = env_->DeleteFile(file_path);
-          if (!s.ok()) {
-            Log(options_.info_log, "Can't delete file: %s: %s",
-                file_path.c_str(), s.ToString().c_str());
-            continue;
-          } else {
-            MutexLock l(&read_first_record_cache_mutex_);
-            read_first_record_cache_.erase(number);
-          }
-          continue;
-        }
-      }
-
-      if (size_limit_enabled) {
-        uint64_t file_size;
-        Status const s = env_->GetFileSize(file_path, &file_size);
-        if (!s.ok()) {
-          Log(options_.info_log, "Can't get file size: %s: %s",
-              file_path.c_str(), s.ToString().c_str());
-          return;
-        } else {
-          if (file_size > 0) {
-            log_file_size = std::max(log_file_size, file_size);
-            ++log_files_num;
-          } else {
-            Status s = env_->DeleteFile(file_path);
-            if (!s.ok()) {
-              Log(options_.info_log, "Can't delete file: %s: %s",
-                  file_path.c_str(), s.ToString().c_str());
-              continue;
-            } else {
-              MutexLock l(&read_first_record_cache_mutex_);
-              read_first_record_cache_.erase(number);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  if (0 == log_files_num || !size_limit_enabled) {
-    return;
-  }
-
-  size_t const files_keep_num = options_.WAL_size_limit_MB *
-    1024 * 1024 / log_file_size;
-  if (log_files_num <= files_keep_num) {
-    return;
-  }
-
-  size_t files_del_num = log_files_num - files_keep_num;
-  VectorLogPtr archived_logs;
-  GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
-
-  if (files_del_num > archived_logs.size()) {
-    Log(options_.info_log, "Trying to delete more archived log files than "
-        "exist. Deleting all");
-    files_del_num = archived_logs.size();
+    return s;
   }
-
-  for (size_t i = 0; i < files_del_num; ++i) {
-    std::string const file_path = archived_logs[i]->PathName();
-    Status const s = DeleteFile(file_path);
+  if (!wal_dir.empty() && dbname != wal_dir) {
+    s = CreateAndNewDirectory(env, wal_dir, &wal_dir_);
     if (!s.ok()) {
-      Log(options_.info_log, "Can't delete file: %s: %s",
-          file_path.c_str(), s.ToString().c_str());
-      continue;
-    } else {
-      MutexLock l(&read_first_record_cache_mutex_);
-      read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+      return s;
     }
   }
-}
-
-namespace {
-struct CompareLogByPointer {
-  bool operator()(const unique_ptr<LogFile>& a, const unique_ptr<LogFile>& b) {
-    LogFileImpl* a_impl = dynamic_cast<LogFileImpl*>(a.get());
-    LogFileImpl* b_impl = dynamic_cast<LogFileImpl*>(b.get());
-    return *a_impl < *b_impl;
-  }
-};
-}
-
-Status DBImpl::GetSortedWalsOfType(const std::string& path,
-                                   VectorLogPtr& log_files,
-                                   WalFileType log_type) {
-  std::vector<std::string> all_files;
-  const Status status = env_->GetChildren(path, &all_files);
-  if (!status.ok()) {
-    return status;
-  }
-  log_files.reserve(all_files.size());
-  for (const auto& f : all_files) {
-    uint64_t number;
-    FileType type;
-    if (ParseFileName(f, &number, &type) && type == kLogFile) {
-      SequenceNumber sequence;
-      Status s = ReadFirstRecord(log_type, number, &sequence);
-      if (!s.ok()) {
-        return s;
-      }
-      if (sequence == 0) {
-        // empty file
-        continue;
-      }
 
-      uint64_t size_bytes;
-      s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+  data_dirs_.clear();
+  for (auto& p : data_paths) {
+    const std::string db_path = p.path;
+    if (db_path == dbname) {
+      data_dirs_.emplace_back(nullptr);
+    } else {
+      std::unique_ptr<Directory> path_directory;
+      s = CreateAndNewDirectory(env, db_path, &path_directory);
       if (!s.ok()) {
         return s;
       }
-
-      log_files.push_back(std::move(unique_ptr<LogFile>(
-          new LogFileImpl(number, log_type, sequence, size_bytes))));
-    }
-  }
-  CompareLogByPointer compare_log_files;
-  std::sort(log_files.begin(), log_files.end(), compare_log_files);
-  return status;
-}
-
-Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs,
-                                      const SequenceNumber target) {
-  int64_t start = 0;  // signed to avoid overflow when target is < first file.
-  int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
-  // Binary Search. avoid opening all files.
-  while (end >= start) {
-    int64_t mid = start + (end - start) / 2;  // Avoid overflow.
-    SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence();
-    if (current_seq_num == target) {
-      end = mid;
-      break;
-    } else if (current_seq_num < target) {
-      start = mid + 1;
-    } else {
-      end = mid - 1;
+      data_dirs_.emplace_back(path_directory.release());
     }
   }
-  // end could be -ve.
-  size_t start_index = std::max(static_cast<int64_t>(0), end);
-  // The last wal file is always included
-  all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+  assert(data_dirs_.size() == data_paths.size());
   return Status::OK();
 }
 
-Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number,
-                               SequenceNumber* sequence) {
-  if (type != kAliveLogFile && type != kArchivedLogFile) {
-    return Status::NotSupported("File Type Not Known " + std::to_string(type));
-  }
-  {
-    MutexLock l(&read_first_record_cache_mutex_);
-    auto itr = read_first_record_cache_.find(number);
-    if (itr != read_first_record_cache_.end()) {
-      *sequence = itr->second;
-      return Status::OK();
-    }
-  }
-  Status s;
-  if (type == kAliveLogFile) {
-    std::string fname = LogFileName(options_.wal_dir, number);
-    s = ReadFirstLine(fname, sequence);
-    if (env_->FileExists(fname) && !s.ok()) {
-      // return any error that is not caused by non-existing file
-      return s;
-    }
-  }
-
-  if (type == kArchivedLogFile || !s.ok()) {
-    //  check if the file got moved to archive.
-    std::string archived_file = ArchivedLogFileName(options_.wal_dir, number);
-    s = ReadFirstLine(archived_file, sequence);
-  }
-
-  if (s.ok() && *sequence != 0) {
-    MutexLock l(&read_first_record_cache_mutex_);
-    read_first_record_cache_.insert({number, *sequence});
-  }
-  return s;
-}
-
-// the function returns status.ok() and sequence == 0 if the file exists, but is
-// empty
-Status DBImpl::ReadFirstLine(const std::string& fname,
-                             SequenceNumber* sequence) {
-  struct LogReporter : public log::Reader::Reporter {
-    Env* env;
-    Logger* info_log;
-    const char* fname;
-
-    Status* status;
-    bool ignore_error;  // true if options_.paranoid_checks==false
-    virtual void Corruption(size_t bytes, const Status& s) {
-      Log(info_log, "%s%s: dropping %d bytes; %s",
-          (this->ignore_error ? "(ignoring error) " : ""), fname,
-          static_cast<int>(bytes), s.ToString().c_str());
-      if (this->status->ok()) {
-        // only keep the first error
-        *this->status = s;
-      }
-    }
-  };
-
-  unique_ptr<SequentialFile> file;
-  Status status = env_->NewSequentialFile(fname, &file, storage_options_);
-
-  if (!status.ok()) {
-    return status;
-  }
-
-  LogReporter reporter;
-  reporter.env = env_;
-  reporter.info_log = options_.info_log.get();
-  reporter.fname = fname.c_str();
-  reporter.status = &status;
-  reporter.ignore_error = !options_.paranoid_checks;
-  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
-                     0 /*initial_offset*/);
-  std::string scratch;
-  Slice record;
-
-  if (reader.ReadRecord(&record, &scratch) &&
-      (status.ok() || !options_.paranoid_checks)) {
-    if (record.size() < 12) {
-      reporter.Corruption(record.size(),
-                          Status::Corruption("log record too small"));
-      // TODO read record's till the first no corrupt entry?
-    } else {
-      WriteBatch batch;
-      WriteBatchInternal::SetContents(&batch, record);
-      *sequence = WriteBatchInternal::Sequence(&batch);
-      return Status::OK();
-    }
+Directory* DBImpl::Directories::GetDataDir(size_t path_id) {
+  assert(path_id < data_dirs_.size());
+  Directory* ret_dir = data_dirs_[path_id].get();
+  if (ret_dir == nullptr) {
+    // Should use db_dir_
+    return db_dir_.get();
   }
-
-  // ReadRecord returns false on EOF, which means that the log file is empty. we
-  // return status.ok() in that case and set sequence number to 0
-  *sequence = 0;
-  return status;
+  return ret_dir;
 }
 
-#endif  // ROCKSDB_LITE
-
 Status DBImpl::Recover(
     const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
     bool error_if_log_file_exist) {
@@ -1080,19 +815,8 @@ Status DBImpl::Recover(
   bool is_new_db = false;
   assert(db_lock_ == nullptr);
   if (!read_only) {
-    // We call CreateDirIfMissing() as the directory may already exist (if we
-    // are reopening a DB), when this happens we don't want creating the
-    // directory to cause an error. However, we need to check if creating the
-    // directory fails or else we may get an obscure message about the lock
-    // file not existing. One real-world example of this occurring is if
-    // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
-    // when dbname_ is "dir/db" but when "dir" doesn't exist.
-    Status s = env_->CreateDirIfMissing(dbname_);
-    if (!s.ok()) {
-      return s;
-    }
-
-    s = env_->NewDirectory(dbname_, &db_directory_);
+    Status s = directories_.SetDirectories(env_, dbname_, db_options_.wal_dir,
+                                           db_options_.db_paths);
     if (!s.ok()) {
       return s;
     }
@@ -1103,8 +827,7 @@ Status DBImpl::Recover(
     }
 
     if (!env_->FileExists(CurrentFileName(dbname_))) {
-      if (options_.create_if_missing) {
-        // TODO: add merge_operator name check
+      if (db_options_.create_if_missing) {
         s = NewDB();
         is_new_db = true;
         if (!s.ok()) {
@@ -1115,7 +838,7 @@ Status DBImpl::Recover(
             dbname_, "does not exist (create_if_missing is false)");
       }
     } else {
-      if (options_.error_if_exists) {
+      if (db_options_.error_if_exists) {
         return Status::InvalidArgument(
             dbname_, "exists (error_if_exists is true)");
       }
@@ -1130,25 +853,28 @@ Status DBImpl::Recover(
   }
 
   Status s = versions_->Recover(column_families, read_only);
-  if (options_.paranoid_checks && s.ok()) {
+  if (db_options_.paranoid_checks && s.ok()) {
     s = CheckConsistency();
   }
   if (s.ok()) {
     SequenceNumber max_sequence(0);
     default_cf_handle_ = new ColumnFamilyHandleImpl(
         versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+    single_column_family_mode_ =
+        versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
 
     // Recover from all newer log files than the ones named in the
     // descriptor (new log files may have been added by the previous
     // incarnation without registering them in the descriptor).
     //
-    // Note that PrevLogNumber() is no longer used, but we pay
+    // Note that prev_log_number() is no longer used, but we pay
     // attention to it in case we are recovering a database
     // produced by an older version of rocksdb.
     const uint64_t min_log = versions_->MinLogNumber();
-    const uint64_t prev_log = versions_->PrevLogNumber();
+    const uint64_t prev_log = versions_->prev_log_number();
     std::vector<std::string> filenames;
-    s = env_->GetChildren(options_.wal_dir, &filenames);
+    s = env_->GetChildren(db_options_.wal_dir, &filenames);
     if (!s.ok()) {
       return s;
     }
@@ -1175,37 +901,42 @@ Status DBImpl::Recover(
           "flag but a log file already exists");
     }
 
-    // Recover in the order in which the logs were generated
-    std::sort(logs.begin(), logs.end());
-    for (const auto& log : logs) {
-      // The previous incarnation may not have written any MANIFEST
-      // records after allocating this log number.  So we manually
-      // update the file number allocation counter in VersionSet.
-      versions_->MarkFileNumberUsed(log);
-      s = RecoverLogFile(log, &max_sequence, read_only);
+    if (!logs.empty()) {
+      // Recover in the order in which the logs were generated
+      std::sort(logs.begin(), logs.end());
+      s = RecoverLogFiles(logs, &max_sequence, read_only);
+      if (!s.ok()) {
+        // Clear memtables if recovery failed
+        for (auto cfd : *versions_->GetColumnFamilySet()) {
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
+        }
+      }
     }
-    SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
-                   versions_->LastSequence());
+    SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence());
   }
 
+  // Initial value
+  max_total_in_memory_state_ = 0;
   for (auto cfd : *versions_->GetColumnFamilySet()) {
-    max_total_in_memory_state_ += cfd->options()->write_buffer_size *
-                                  cfd->options()->max_write_buffer_number;
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+                                  mutable_cf_options->max_write_buffer_number;
   }
 
   return s;
 }
 
-Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
-                              bool read_only) {
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                               SequenceNumber* max_sequence, bool read_only) {
   struct LogReporter : public log::Reader::Reporter {
     Env* env;
     Logger* info_log;
     const char* fname;
-    Status* status;  // nullptr if options_.paranoid_checks==false or
-                     //            options_.skip_log_error_on_recovery==true
-    virtual void Corruption(size_t bytes, const Status& s) {
-      Log(info_log, "%s%s: dropping %d bytes; %s",
+    Status* status;  // nullptr if db_options_.paranoid_checks==false
+    virtual void Corruption(size_t bytes, const Status& s) override {
+      Log(InfoLogLevel::WARN_LEVEL,
+          info_log, "%s%s: dropping %d bytes; %s",
           (this->status == nullptr ? "(ignoring error) " : ""),
           fname, static_cast<int>(bytes), s.ToString().c_str());
       if (this->status != nullptr && this->status->ok()) *this->status = s;
@@ -1213,7 +944,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
   };
 
   mutex_.AssertHeld();
-
+  Status status;
   std::unordered_map<int, VersionEdit> version_edits;
   // no need to refcount because iteration is under mutex
   for (auto cfd : *versions_->GetColumnFamilySet()) {
@@ -1221,97 +952,130 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
     edit.SetColumnFamily(cfd->GetID());
     version_edits.insert({cfd->GetID(), edit});
   }
+  int job_id = next_job_id_.fetch_add(1);
+  {
+    auto stream = event_logger_.Log();
+    stream << "job" << job_id << "event"
+           << "recovery_started";
+    stream << "log_files";
+    stream.StartArray();
+    for (auto log_number : log_numbers) {
+      stream << log_number;
+    }
+    stream.EndArray();
+  }
+
+  for (auto log_number : log_numbers) {
+    // The previous incarnation may not have written any MANIFEST
+    // records after allocating this log number.  So we manually
+    // update the file number allocation counter in VersionSet.
+    versions_->MarkFileNumberUsedDuringRecovery(log_number);
+    // Open the log file
+    std::string fname = LogFileName(db_options_.wal_dir, log_number);
+    unique_ptr<SequentialFile> file;
+    status = env_->NewSequentialFile(fname, &file, env_options_);
+    if (!status.ok()) {
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        return status;
+      } else {
+        // Fail with one log file, but that's ok.
+        // Try next one.
+        continue;
+      }
+    }
 
-  // Open the log file
-  std::string fname = LogFileName(options_.wal_dir, log_number);
-  unique_ptr<SequentialFile> file;
-  Status status = env_->NewSequentialFile(fname, &file, storage_options_);
-  if (!status.ok()) {
-    MaybeIgnoreError(&status);
-    return status;
-  }
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = db_options_.info_log.get();
+    reporter.fname = fname.c_str();
+    reporter.status = (db_options_.paranoid_checks) ? &status : nullptr;
+    // We intentially make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+                       0 /*initial_offset*/);
+    Log(InfoLogLevel::INFO_LEVEL,
+        db_options_.info_log, "Recovering log #%" PRIu64 "", log_number);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    while (reader.ReadRecord(&record, &scratch) && status.ok()) {
+      if (record.size() < 12) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
 
-  // Create the log reader.
-  LogReporter reporter;
-  reporter.env = env_;
-  reporter.info_log = options_.info_log.get();
-  reporter.fname = fname.c_str();
-  reporter.status = (options_.paranoid_checks &&
-                     !options_.skip_log_error_on_recovery ? &status : nullptr);
-  // We intentially make log::Reader do checksumming even if
-  // paranoid_checks==false so that corruptions cause entire commits
-  // to be skipped instead of propagating bad information (like overly
-  // large sequence numbers).
-  log::Reader reader(std::move(file), &reporter, true/*checksum*/,
-                     0/*initial_offset*/);
-  Log(options_.info_log, "Recovering log #%lu",
-      (unsigned long) log_number);
-
-  // Read all the records and add to a memtable
-  std::string scratch;
-  Slice record;
-  WriteBatch batch;
-  while (reader.ReadRecord(&record, &scratch)) {
-    if (record.size() < 12) {
-      reporter.Corruption(
-          record.size(), Status::Corruption("log record too small"));
-      continue;
-    }
-    WriteBatchInternal::SetContents(&batch, record);
+      // If column family was not found, it might mean that the WAL write
+      // batch references to the column family that was dropped after the
+      // insert. We don't want to fail the whole write batch in that case --
+      // we just ignore the update.
+      // That's why we set ignore missing column families to true
+      status = WriteBatchInternal::InsertInto(
+          &batch, column_family_memtables_.get(), true, log_number);
 
-    status = WriteBatchInternal::InsertInto(
-        &batch, column_family_memtables_.get(), true, log_number);
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        return status;
+      }
+      const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
+                                      WriteBatchInternal::Count(&batch) - 1;
+      if (last_seq > *max_sequence) {
+        *max_sequence = last_seq;
+      }
 
-    MaybeIgnoreError(&status);
-    if (!status.ok()) {
-      return status;
-    }
-    const SequenceNumber last_seq =
-        WriteBatchInternal::Sequence(&batch) +
-        WriteBatchInternal::Count(&batch) - 1;
-    if (last_seq > *max_sequence) {
-      *max_sequence = last_seq;
-    }
+      if (!read_only) {
+        // we can do this because this is called before client has access to the
+        // DB and there is only a single thread operating on DB
+        ColumnFamilyData* cfd;
 
-    if (!read_only) {
-      // no need to refcount since client still doesn't have access
-      // to the DB and can not drop column families while we iterate
-      for (auto cfd : *versions_->GetColumnFamilySet()) {
-        if (cfd->mem()->ShouldFlush()) {
+        while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) {
+          cfd->Unref();
           // If this asserts, it means that InsertInto failed in
           // filtering updates to already-flushed column families
           assert(cfd->GetLogNumber() <= log_number);
           auto iter = version_edits.find(cfd->GetID());
           assert(iter != version_edits.end());
           VersionEdit* edit = &iter->second;
-          status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
-          // we still want to clear the memtable, even if the recovery failed
-          cfd->CreateNewMemtable();
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
           if (!status.ok()) {
             // Reflect errors immediately so that conditions like full
             // file-systems cause the DB::Open() to fail.
             return status;
           }
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
         }
       }
     }
-  }
 
-  if (versions_->LastSequence() < *max_sequence) {
-    versions_->SetLastSequence(*max_sequence);
+    if (!status.ok()) {
+      return status;
+    }
+
+    flush_scheduler_.Clear();
+    if (versions_->LastSequence() < *max_sequence) {
+      versions_->SetLastSequence(*max_sequence);
+    }
   }
 
   if (!read_only) {
     // no need to refcount since client still doesn't have access
     // to the DB and can not drop column families while we iterate
+    auto max_log_number = log_numbers.back();
     for (auto cfd : *versions_->GetColumnFamilySet()) {
       auto iter = version_edits.find(cfd->GetID());
       assert(iter != version_edits.end());
       VersionEdit* edit = &iter->second;
 
-      if (cfd->GetLogNumber() > log_number) {
+      if (cfd->GetLogNumber() > max_log_number) {
         // Column family cfd has already flushed the data
-        // from log_number. Memtable has to be empty because
+        // from all logs. Memtable has to be empty because
         // we filter the updates based on log_number
         // (in WriteBatch::InsertInto)
         assert(cfd->mem()->GetFirstSequenceNumber() == 0);
@@ -1321,308 +1085,530 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
 
       // flush the final memtable (if non-empty)
       if (cfd->mem()->GetFirstSequenceNumber() != 0) {
-        status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
-      }
-      // we still want to clear the memtable, even if the recovery failed
-      cfd->CreateNewMemtable();
-      if (!status.ok()) {
-        return status;
+        status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+        if (!status.ok()) {
+          // Recovery failed
+          break;
+        }
+        cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
       }
 
       // write MANIFEST with update
-      // writing log number in the manifest means that any log file
+      // writing log_number in the manifest means that any log file
       // with number strongly less than (log_number + 1) is already
       // recovered and should be ignored on next reincarnation.
-      // Since we already recovered log_number, we want all logs
-      // with numbers `<= log_number` (includes this one) to be ignored
-      edit->SetLogNumber(log_number + 1);
+      // Since we already recovered max_log_number, we want all logs
+      // with numbers `<= max_log_number` (includes this one) to be ignored
+      edit->SetLogNumber(max_log_number + 1);
       // we must mark the next log number as used, even though it's
       // not actually used. that is because VersionSet assumes
       // VersionSet::next_file_number_ always to be strictly greater than any
       // log number
-      versions_->MarkFileNumberUsed(log_number + 1);
-      status = versions_->LogAndApply(cfd, edit, &mutex_);
+      versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1);
+      status = versions_->LogAndApply(
+          cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_);
       if (!status.ok()) {
-        return status;
+        // Recovery failed
+        break;
       }
     }
   }
 
-  return status;
-}
-
-Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
-                                           VersionEdit* edit) {
-  mutex_.AssertHeld();
-  const uint64_t start_micros = env_->NowMicros();
-  FileMetaData meta;
-  meta.number = versions_->NewFileNumber();
-  pending_outputs_.insert(meta.number);
-  Iterator* iter = mem->NewIterator(ReadOptions(), true);
-  const SequenceNumber newest_snapshot = snapshots_.GetNewest();
-  const SequenceNumber earliest_seqno_in_memtable =
-    mem->GetFirstSequenceNumber();
-  Log(options_.info_log, "[%s] Level-0 table #%lu: started",
-      cfd->GetName().c_str(), (unsigned long)meta.number);
-
-  Status s;
-  {
-    mutex_.Unlock();
-    s = BuildTable(dbname_, env_, *cfd->options(), storage_options_,
-                   cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
-                   newest_snapshot, earliest_seqno_in_memtable,
-                   GetCompressionFlush(*cfd->options()));
-    LogFlush(options_.info_log);
-    mutex_.Lock();
-  }
-
-  Log(options_.info_log, "[%s] Level-0 table #%lu: %lu bytes %s",
-      cfd->GetName().c_str(), (unsigned long)meta.number,
-      (unsigned long)meta.file_size, s.ToString().c_str());
-  delete iter;
-
-  pending_outputs_.erase(meta.number);
-
-  // Note that if file_size is zero, the file has been deleted and
-  // should not be added to the manifest.
-  int level = 0;
-  if (s.ok() && meta.file_size > 0) {
-    edit->AddFile(level, meta.number, meta.file_size,
-                  meta.smallest, meta.largest,
-                  meta.smallest_seqno, meta.largest_seqno);
-  }
+  event_logger_.Log() << "job" << job_id << "event"
+                      << "recovery_finished";
 
-  InternalStats::CompactionStats stats;
-  stats.micros = env_->NowMicros() - start_micros;
-  stats.bytes_written = meta.file_size;
-  stats.files_out_levelnp1 = 1;
-  cfd->internal_stats()->AddCompactionStats(level, stats);
-  RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
-  return s;
+  return status;
 }
 
-Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
-                                autovector<MemTable*>& mems, VersionEdit* edit,
-                                uint64_t* filenumber, LogBuffer* log_buffer) {
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                           MemTable* mem, VersionEdit* edit) {
   mutex_.AssertHeld();
   const uint64_t start_micros = env_->NowMicros();
   FileMetaData meta;
-  meta.number = versions_->NewFileNumber();
-  *filenumber = meta.number;
-  pending_outputs_.insert(meta.number);
-
-  const SequenceNumber newest_snapshot = snapshots_.GetNewest();
-  const SequenceNumber earliest_seqno_in_memtable =
-    mems[0]->GetFirstSequenceNumber();
-  Version* base = cfd->current();
-  base->Ref();          // it is likely that we do not need this reference
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+  auto pending_outputs_inserted_elem =
+      CaptureCurrentFileNumberInPendingOutputs();
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  Arena arena;
   Status s;
+  TableProperties table_properties;
   {
-    mutex_.Unlock();
-    log_buffer->FlushBufferToLog();
-    std::vector<Iterator*> memtables;
-    for (MemTable* m : mems) {
-      Log(options_.info_log, "[%s] Flushing memtable with next log file: %lu\n",
-          cfd->GetName().c_str(), (unsigned long)m->GetNextLogNumber());
-      memtables.push_back(m->NewIterator(ReadOptions(), true));
-    }
-    Iterator* iter = NewMergingIterator(&cfd->internal_comparator(),
-                                        &memtables[0], memtables.size());
-    Log(options_.info_log, "[%s] Level-0 flush table #%lu: started",
-        cfd->GetName().c_str(), (unsigned long)meta.number);
-
-    s = BuildTable(dbname_, env_, *cfd->options(), storage_options_,
-                   cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
-                   newest_snapshot, earliest_seqno_in_memtable,
-                   GetCompressionFlush(*cfd->options()));
-    LogFlush(options_.info_log);
-    delete iter;
-    Log(options_.info_log, "[%s] Level-0 flush table #%lu: %lu bytes %s",
-        cfd->GetName().c_str(), (unsigned long)meta.number,
-        (unsigned long)meta.file_size, s.ToString().c_str());
-
-    if (!options_.disableDataSync) {
-      db_directory_->Fsync();
+    ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+    const SequenceNumber newest_snapshot = snapshots_.GetNewest();
+    const SequenceNumber earliest_seqno_in_memtable =
+        mem->GetFirstSequenceNumber();
+    Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+        "[%s] [WriteLevel0TableForRecovery]"
+        " Level-0 table #%" PRIu64 ": started",
+        cfd->GetName().c_str(), meta.fd.GetNumber());
+
+    bool paranoid_file_checks =
+        cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+    {
+      mutex_.Unlock();
+      s = BuildTable(
+          dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(),
+          iter.get(), &meta, cfd->internal_comparator(),
+          cfd->int_tbl_prop_collector_factories(), newest_snapshot,
+          earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()),
+          cfd->ioptions()->compression_opts, paranoid_file_checks, Env::IO_HIGH,
+          &table_properties);
+      LogFlush(db_options_.info_log);
+      mutex_.Lock();
     }
-    mutex_.Lock();
   }
-  base->Unref();
+  Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+      "[%s] [WriteLevel0TableForRecovery]"
+      " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+      cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
+      s.ToString().c_str());
 
-  // re-acquire the most current version
-  base = cfd->current();
+  // output to event logger
+  if (s.ok()) {
+    EventLoggerHelpers::LogTableFileCreation(
+        &event_logger_, job_id, meta.fd.GetNumber(), meta.fd.GetFileSize(),
+        table_properties);
+  }
 
-  // There could be multiple threads writing to its own level-0 file.
-  // The pending_outputs cannot be cleared here, otherwise this newly
-  // created file might not be considered as a live-file by another
-  // compaction thread that is concurrently deleting obselete files.
-  // The pending_outputs can be cleared only after the new version is
-  // committed so that other threads can recognize this file as a
-  // valid one.
-  // pending_outputs_.erase(meta.number);
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
 
   // Note that if file_size is zero, the file has been deleted and
   // should not be added to the manifest.
   int level = 0;
-  if (s.ok() && meta.file_size > 0) {
-    const Slice min_user_key = meta.smallest.user_key();
-    const Slice max_user_key = meta.largest.user_key();
-    // if we have more than 1 background thread, then we cannot
-    // insert files directly into higher levels because some other
-    // threads could be concurrently producing compacted files for
-    // that key range.
-    if (base != nullptr && options_.max_background_compactions <= 1 &&
-        cfd->options()->compaction_style == kCompactionStyleLevel) {
-      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
-    }
-    edit->AddFile(level, meta.number, meta.file_size,
-                  meta.smallest, meta.largest,
+  if (s.ok() && meta.fd.GetFileSize() > 0) {
+    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
                   meta.smallest_seqno, meta.largest_seqno);
   }
 
-  InternalStats::CompactionStats stats;
+  InternalStats::CompactionStats stats(1);
   stats.micros = env_->NowMicros() - start_micros;
-  stats.bytes_written = meta.file_size;
+  stats.bytes_written = meta.fd.GetFileSize();
+  stats.files_out_levelnp1 = 1;
   cfd->internal_stats()->AddCompactionStats(level, stats);
-  RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
+  cfd->internal_stats()->AddCFStats(
+      InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize());
+  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
   return s;
 }
 
-Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd,
-                                         bool* madeProgress,
-                                         DeletionState& deletion_state,
-                                         LogBuffer* log_buffer) {
+Status DBImpl::FlushMemTableToOutputFile(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) {
   mutex_.AssertHeld();
   assert(cfd->imm()->size() != 0);
   assert(cfd->imm()->IsFlushPending());
 
-  // Save the contents of the earliest memtable as a new Table
-  uint64_t file_number;
-  autovector<MemTable*> mems;
-  cfd->imm()->PickMemtablesToFlush(&mems);
-  if (mems.empty()) {
-    LogToBuffer(log_buffer, "[%s] Nothing in memtable to flush",
-                cfd->GetName().c_str());
-    return Status::OK();
-  }
-
-  // record the logfile_number_ before we release the mutex
-  // entries mems are (implicitly) sorted in ascending order by their created
-  // time. We will use the first memtable's `edit` to keep the meta info for
-  // this flush.
-  MemTable* m = mems[0];
-  VersionEdit* edit = m->GetEdits();
-  edit->SetPrevLogNumber(0);
-  // SetLogNumber(log_num) indicates logs with number smaller than log_num
-  // will no longer be picked up for recovery.
-  edit->SetLogNumber(mems.back()->GetNextLogNumber());
-  edit->SetColumnFamily(cfd->GetID());
-
-  // This will release and re-acquire the mutex.
-  Status s = WriteLevel0Table(cfd, mems, edit, &file_number, log_buffer);
+  FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     snapshots_.GetNewest(), job_context, log_buffer,
+                     directories_.GetDbDir(), directories_.GetDataDir(0U),
+                     GetCompressionFlush(*cfd->ioptions()), stats_,
+                     &event_logger_);
 
-  if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) {
-    s = Status::ShutdownInProgress(
-        "Database shutdown or Column family drop during flush");
-  }
-
-  if (!s.ok()) {
-    cfd->imm()->RollbackMemtableFlush(mems, file_number, &pending_outputs_);
-  } else {
-    // Replace immutable memtable with the generated Table
-    s = cfd->imm()->InstallMemtableFlushResults(
-        cfd, mems, versions_.get(), &mutex_, options_.info_log.get(),
-        file_number, pending_outputs_, &deletion_state.memtables_to_free,
-        db_directory_.get(), log_buffer);
-  }
+  uint64_t file_number;
+  Status s = flush_job.Run(&file_number);
 
   if (s.ok()) {
-    InstallSuperVersion(cfd, deletion_state);
+    InstallSuperVersionBackground(cfd, job_context, mutable_cf_options);
     if (madeProgress) {
       *madeProgress = 1;
     }
-    Version::LevelSummaryStorage tmp;
+    VersionStorageInfo::LevelSummaryStorage tmp;
     LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(),
-                cfd->current()->LevelSummary(&tmp));
-
-    MaybeScheduleLogDBDeployStats();
+                cfd->current()->storage_info()->LevelSummary(&tmp));
 
     if (disable_delete_obsolete_files_ == 0) {
       // add to deletion state
       while (alive_log_files_.size() &&
              alive_log_files_.begin()->number < versions_->MinLogNumber()) {
         const auto& earliest = *alive_log_files_.begin();
-        deletion_state.log_delete_files.push_back(earliest.number);
+        job_context->log_delete_files.push_back(earliest.number);
         total_log_size_ -= earliest.size;
         alive_log_files_.pop_front();
       }
     }
   }
 
-  if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks &&
+  if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks &&
       bg_error_.ok()) {
     // if a bad error happened (not ShutdownInProgress) and paranoid_checks is
     // true, mark DB read-only
     bg_error_ = s;
   }
+  RecordFlushIOStats();
+#ifndef ROCKSDB_LITE
+  if (s.ok()) {
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushCompleted(cfd, file_number, mutable_cf_options);
+  }
+#endif  // ROCKSDB_LITE
   return s;
 }
 
+void DBImpl::NotifyOnFlushCompleted(
+    ColumnFamilyData* cfd, uint64_t file_number,
+    const MutableCFOptions& mutable_cf_options) {
+#ifndef ROCKSDB_LITE
+  if (cfd->ioptions()->listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  bool triggered_flush_slowdown =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_slowdown_writes_trigger);
+  bool triggered_flush_stop =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_stop_writes_trigger);
+  notifying_events_++;
+  // release lock while notifying events
+  mutex_.Unlock();
+  // TODO(yhchiang): make db_paths dynamic.
+  cfd->NotifyOnFlushCompleted(
+        this, MakeTableFileName(db_options_.db_paths[0].path, file_number),
+        triggered_flush_slowdown,
+        triggered_flush_stop);
+  mutex_.Lock();
+  notifying_events_--;
+  assert(notifying_events_ >= 0);
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+#endif  // ROCKSDB_LITE
+}
+
 Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
                             const Slice* begin, const Slice* end,
-                            bool reduce_level, int target_level) {
+                            bool reduce_level, int target_level,
+                            uint32_t target_path_id) {
+  if (target_path_id >= db_options_.db_paths.size()) {
+    return Status::InvalidArgument("Invalid target path ID");
+  }
+
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
 
   Status s = FlushMemTable(cfd, FlushOptions());
   if (!s.ok()) {
-    LogFlush(options_.info_log);
+    LogFlush(db_options_.info_log);
     return s;
   }
 
-  int max_level_with_files = 1;
+  int max_level_with_files = 0;
   {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
     Version* base = cfd->current();
-    for (int level = 1; level < cfd->NumberLevels(); level++) {
-      if (base->OverlapInLevel(level, begin, end)) {
+    for (int level = 1; level < base->storage_info()->num_non_empty_levels();
+         level++) {
+      if (base->storage_info()->OverlapInLevel(level, begin, end)) {
         max_level_with_files = level;
       }
     }
   }
-  for (int level = 0; level <= max_level_with_files; level++) {
-    // in case the compaction is unversal or if we're compacting the
-    // bottom-most level, the output level will be the same as input one
-    if (cfd->options()->compaction_style == kCompactionStyleUniversal ||
-        level == max_level_with_files) {
-      s = RunManualCompaction(cfd, level, level, begin, end);
-    } else {
-      s = RunManualCompaction(cfd, level, level + 1, begin, end);
-    }
-    if (!s.ok()) {
-      LogFlush(options_.info_log);
-      return s;
+
+  if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+      cfd->NumberLevels() > 1) {
+    // Always compact all files together.
+    s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+                            cfd->NumberLevels() - 1, target_path_id, begin,
+                            end);
+  } else {
+    for (int level = 0; level <= max_level_with_files; level++) {
+      // in case the compaction is unversal or if we're compacting the
+      // bottom-most level, the output level will be the same as input one.
+      // level 0 can never be the bottommost level (i.e. if all files are in
+      // level 0, we will compact to level 1)
+      if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+          cfd->ioptions()->compaction_style == kCompactionStyleFIFO ||
+          (level == max_level_with_files && level > 0)) {
+        s = RunManualCompaction(cfd, level, level, target_path_id, begin, end);
+      } else {
+        int output_level = level + 1;
+        if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+            cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+            level == 0) {
+          output_level = ColumnFamilyData::kCompactToBaseLevel;
+        }
+        s = RunManualCompaction(cfd, level, output_level, target_path_id, begin,
+                                end);
+      }
+      if (!s.ok()) {
+        break;
+      }
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
     }
   }
+  if (!s.ok()) {
+    LogFlush(db_options_.info_log);
+    return s;
+  }
 
   if (reduce_level) {
     s = ReFitLevel(cfd, max_level_with_files, target_level);
   }
-  LogFlush(options_.info_log);
+  LogFlush(db_options_.info_log);
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // an automatic compaction that has been scheduled might have been
+    // preempted by the manual compactions. Need to schedule it back.
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  return s;
+}
+
+Status DBImpl::CompactFiles(
+    const CompactionOptions& compact_options,
+    ColumnFamilyHandle* column_family,
+    const std::vector<std::string>& input_file_names,
+    const int output_level, const int output_path_id) {
+#ifdef ROCKSDB_LITE
+    // not supported in lite version
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  if (column_family == nullptr) {
+    return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+  }
+
+  auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  assert(cfd);
+
+  Status s;
+  JobContext job_context(0, true);
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       db_options_.info_log.get());
+
+  // Perform CompactFiles
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    s = CompactFilesImpl(compact_options, cfd, sv->current,
+                         input_file_names, output_level,
+                         output_path_id, &job_context, &log_buffer);
+  }
+  ReturnAndCleanupSuperVersion(cfd, sv);
+
+  // Find and delete obsolete files
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // If !s.ok(), this means that Compaction failed. In that case, we want
+    // to delete all obsolete files we might have created and we force
+    // FindObsoleteFiles(). This is because job_context does not
+    // catch all created files if compaction failed.
+    FindObsoleteFiles(&job_context, !s.ok());
+  }
+
+  // delete unnecessary files if any, this is done outside the mutex
+  if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the deconstructor of DB can kick in and destroy all the
+    // states of DB so info_log might not be available after that point.
+    // It also applies to access other states that DB owns.
+    log_buffer.FlushBufferToLog();
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+  }
 
   return s;
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+    const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+    Version* version, const std::vector<std::string>& input_file_names,
+    const int output_level, int output_path_id, JobContext* job_context,
+    LogBuffer* log_buffer) {
+  mutex_.AssertHeld();
+
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return Status::ShutdownInProgress();
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (auto file_name : input_file_names) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  ColumnFamilyMetaData cf_meta;
+  // TODO(yhchiang): can directly use version here if none of the
+  // following functions call is pluggable to external developers.
+  version->GetColumnFamilyMetaData(&cf_meta);
+
+  if (output_path_id < 0) {
+    if (db_options_.db_paths.size() == 1U) {
+      output_path_id = 0;
+    } else {
+      return Status::NotSupported(
+          "Automatic output path selection is not "
+          "yet supported in CompactFiles()");
+    }
+  }
+
+  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+      &input_set, cf_meta, output_level);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<CompactionInputFiles> input_files;
+  s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, version->storage_info(), compact_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (auto inputs : input_files) {
+    if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) {
+      return Status::Aborted(
+          "Some of the necessary compaction input "
+          "files are already being compacted");
+    }
+  }
+
+  // At this point, CompactFiles will be run.
+  bg_compaction_scheduled_++;
+
+  unique_ptr<Compaction> c;
+  assert(cfd->compaction_picker());
+  c.reset(cfd->compaction_picker()->FormCompaction(
+      compact_options, input_files, output_level, version->storage_info(),
+      *cfd->GetLatestMutableCFOptions(), output_path_id));
+  assert(c);
+  c->SetInputVersion(version);
+  // deletion compaction currently not allowed in CompactFiles.
+  assert(!c->IsDeletionCompaction());
+
+  auto yield_callback = [&]() {
+    return CallFlushDuringCompaction(
+        c->column_family_data(), *c->mutable_cf_options(),
+        job_context, log_buffer);
+  };
+  assert(is_snapshot_supported_ || snapshots_.empty());
+  CompactionJob compaction_job(
+      job_context->job_id, c.get(), db_options_, env_options_, versions_.get(),
+      &shutting_down_, log_buffer, directories_.GetDbDir(),
+      directories_.GetDataDir(c->GetOutputPathId()), stats_,
+      snapshots_.GetAll(), table_cache_, std::move(yield_callback),
+      &event_logger_, c->mutable_cf_options()->paranoid_file_checks);
+  compaction_job.Prepare();
+
+  mutex_.Unlock();
+  Status status = compaction_job.Run();
+  mutex_.Lock();
+  compaction_job.Install(&status, *c->mutable_cf_options(), &mutex_);
+  if (status.ok()) {
+    InstallSuperVersionBackground(c->column_family_data(), job_context,
+                                  *c->mutable_cf_options());
+  }
+  c->ReleaseCompactionFiles(s);
+  c.reset();
+
+  if (status.ok()) {
+    // Done
+  } else if (status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+        "[%s] [JOB %d] Compaction error: %s",
+        c->column_family_data()->GetName().c_str(), job_context->job_id,
+        status.ToString().c_str());
+    if (db_options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+
+  bg_compaction_scheduled_--;
+
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+void DBImpl::NotifyOnCompactionCompleted(
+    ColumnFamilyData* cfd, Compaction *c, const Status &st) {
+#ifndef ROCKSDB_LITE
+  if (cfd->ioptions()->listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  notifying_events_++;
+  // release lock while notifying events
+  mutex_.Unlock();
+  cfd->NotifyOnCompactionCompleted(this, c, st);
+  mutex_.Lock();
+  notifying_events_--;
+  assert(notifying_events_ >= 0);
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+#endif  // ROCKSDB_LITE
+}
+
+Status DBImpl::SetOptions(ColumnFamilyHandle* column_family,
+    const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  if (options_map.empty()) {
+    Log(InfoLogLevel::WARN_LEVEL,
+        db_options_.info_log, "SetOptions() on column family [%s], empty input",
+        cfd->GetName().c_str());
+    return Status::InvalidArgument("empty input");
+  }
+
+  MutableCFOptions new_options;
+  Status s;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    s = cfd->SetOptions(options_map);
+    if (s.ok()) {
+      new_options = *cfd->GetLatestMutableCFOptions();
+    }
+  }
+
+  Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+      "SetOptions() on column family [%s], inputs:",
+      cfd->GetName().c_str());
+  for (const auto& o : options_map) {
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "%s: %s\n", o.first.c_str(), o.second.c_str());
+  }
+  if (s.ok()) {
+    Log(InfoLogLevel::INFO_LEVEL,
+        db_options_.info_log, "[%s] SetOptions succeeded",
+        cfd->GetName().c_str());
+    new_options.Dump(db_options_.info_log.get());
+  } else {
+    Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+        "[%s] SetOptions failed", cfd->GetName().c_str());
+  }
+  return s;
+#endif  // ROCKSDB_LITE
 }
 
 // return the same level if it cannot be moved
-int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) {
+int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+    const MutableCFOptions& mutable_cf_options, int level) {
   mutex_.AssertHeld();
-  Version* current = cfd->current();
+  const auto* vstorage = cfd->current()->storage_info();
   int minimum_level = level;
   for (int i = level - 1; i > 0; --i) {
     // stop if level i is not empty
-    if (current->NumLevelFiles(i) > 0) break;
+    if (vstorage->NumLevelFiles(i) > 0) break;
     // stop if level i is too small (cannot fit the level files)
-    if (cfd->compaction_picker()->MaxBytesForLevel(i) <
-        current->NumLevelBytes(level)) {
+    if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
       break;
     }
 
@@ -1642,7 +1628,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
   // only allow one thread refitting
   if (refitting_level_) {
     mutex_.Unlock();
-    Log(options_.info_log, "ReFitLevel: another thread is refitting");
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[ReFitLevel] another thread is refitting");
     delete new_superversion;
     return Status::NotSupported("another thread is refitting");
   }
@@ -1651,44 +1638,53 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
   // wait for all background threads to stop
   bg_work_gate_closed_ = true;
   while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) {
-    Log(options_.info_log,
-        "RefitLevel: waiting for background threads to stop: %d %d",
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[RefitLevel] waiting for background threads to stop: %d %d",
         bg_compaction_scheduled_, bg_flush_scheduled_);
     bg_cv_.Wait();
   }
 
+  const MutableCFOptions mutable_cf_options =
+    *cfd->GetLatestMutableCFOptions();
   // move to a smaller level
   int to_level = target_level;
   if (target_level < 0) {
-    to_level = FindMinimumEmptyLevelFitting(cfd, level);
+    to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
   }
 
   assert(to_level <= level);
 
   Status status;
   if (to_level < level) {
-    Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
-        cfd->current()->DebugString().data());
+    Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+        "[%s] Before refitting:\n%s",
+        cfd->GetName().c_str(), cfd->current()->DebugString().data());
 
     VersionEdit edit;
     edit.SetColumnFamily(cfd->GetID());
-    for (const auto& f : cfd->current()->files_[level]) {
-      edit.DeleteFile(level, f->number);
-      edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest,
+    for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) {
+      edit.DeleteFile(level, f->fd.GetNumber());
+      edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
+                   f->fd.GetFileSize(), f->smallest, f->largest,
                    f->smallest_seqno, f->largest_seqno);
     }
-    Log(options_.info_log, "[%s] Apply version edit:\n%s",
+    Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+        "[%s] Apply version edit:\n%s",
         cfd->GetName().c_str(), edit.DebugString().data());
 
-    status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get());
-    superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_);
+    status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
+                                    directories_.GetDbDir());
+    superversion_to_free = InstallSuperVersion(
+        cfd, new_superversion, mutable_cf_options);
     new_superversion = nullptr;
 
-    Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
+    Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+        "[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
         status.ToString().data());
 
     if (status.ok()) {
-      Log(options_.info_log, "[%s] After refitting:\n%s",
+      Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+          "[%s] After refitting:\n%s",
           cfd->GetName().c_str(), cfd->current()->DebugString().data());
     }
   }
@@ -1709,18 +1705,22 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
 
 int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
-  return cfh->cfd()->options()->max_mem_compaction_level;
+  InstrumentedMutexLock l(&mutex_);
+  return cfh->cfd()->GetSuperVersion()->
+      mutable_cf_options.max_mem_compaction_level;
 }
 
 int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
-  return cfh->cfd()->options()->level0_stop_writes_trigger;
+  InstrumentedMutexLock l(&mutex_);
+  return cfh->cfd()->GetSuperVersion()->
+      mutable_cf_options.level0_stop_writes_trigger;
 }
 
-Status DBImpl::Flush(const FlushOptions& options,
+Status DBImpl::Flush(const FlushOptions& flush_options,
                      ColumnFamilyHandle* column_family) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
-  return FlushMemTable(cfh->cfd(), options);
+  return FlushMemTable(cfh->cfd(), flush_options);
 }
 
 SequenceNumber DBImpl::GetLatestSequenceNumber() const {
@@ -1728,9 +1728,10 @@ SequenceNumber DBImpl::GetLatestSequenceNumber() const {
 }
 
 Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
-                                   int output_level, const Slice* begin,
-                                   const Slice* end) {
-  assert(input_level >= 0);
+                                   int output_level, uint32_t output_path_id,
+                                   const Slice* begin, const Slice* end) {
+  assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+         input_level >= 0);
 
   InternalKey begin_storage, end_storage;
 
@@ -1738,26 +1739,29 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
   manual.cfd = cfd;
   manual.input_level = input_level;
   manual.output_level = output_level;
+  manual.output_path_id = output_path_id;
   manual.done = false;
   manual.in_progress = false;
   // For universal compaction, we enforce every manual compaction to compact
   // all files.
   if (begin == nullptr ||
-      cfd->options()->compaction_style == kCompactionStyleUniversal) {
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
     manual.begin = nullptr;
   } else {
-    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+    begin_storage.SetMaxPossibleForUserKey(*begin);
     manual.begin = &begin_storage;
   }
   if (end == nullptr ||
-      cfd->options()->compaction_style == kCompactionStyleUniversal) {
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
     manual.end = nullptr;
   } else {
-    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+    end_storage.SetMinPossibleForUserKey(*end);
     manual.end = &end_storage;
   }
 
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
 
   // When a manual compaction arrives, temporarily disable scheduling of
   // non-manual compactions and wait until the number of scheduled compaction
@@ -1775,24 +1779,29 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
 
   ++bg_manual_only_;
   while (bg_compaction_scheduled_ > 0) {
-    Log(options_.info_log,
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
         "[%s] Manual compaction waiting for all other scheduled background "
         "compactions to finish",
         cfd->GetName().c_str());
     bg_cv_.Wait();
   }
 
-  Log(options_.info_log, "[%s] Manual compaction starting",
+  Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+      "[%s] Manual compaction starting",
       cfd->GetName().c_str());
 
-  while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
+  // We don't check bg_error_ here, because if we get the error in compaction,
+  // the compaction will set manual.status to bg_error_ and set manual.done to
+  // true.
+  while (!manual.done) {
     assert(bg_manual_only_ > 0);
     if (manual_compaction_ != nullptr) {
       // Running either this or some other manual compaction
       bg_cv_.Wait();
     } else {
       manual_compaction_ = &manual;
-      MaybeScheduleFlushOrCompaction();
+      bg_compaction_scheduled_++;
+      env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW, this);
     }
   }
 
@@ -1803,10 +1812,34 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
 }
 
 Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
-                             const FlushOptions& options) {
-  // nullptr batch means just wait for earlier writes to be done
-  Status s = Write(WriteOptions(), nullptr);
-  if (s.ok() && options.wait) {
+                             const FlushOptions& flush_options) {
+  Status s;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) {
+      // Nothing to flush
+      return Status::OK();
+    }
+
+    WriteThread::Writer w(&mutex_);
+    s = write_thread_.EnterWriteThread(&w, 0);
+    assert(s.ok() && !w.done);  // No timeout and nobody should do our job
+
+    // SetNewMemtableAndNewLogFile() will release and reacquire mutex
+    // during execution
+    s = SetNewMemtableAndNewLogFile(cfd, &context);
+    write_thread_.ExitWriteThread(&w, &w, s);
+
+    cfd->imm()->FlushRequested();
+
+    // schedule flush
+    SchedulePendingFlush(cfd);
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  if (s.ok() && flush_options.wait) {
     // Wait until the compaction completes
     s = WaitForFlushMemTable(cfd);
   }
@@ -1816,7 +1849,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
 Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
   Status s;
   // Wait until the compaction completes
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
   while (cfd->imm()->size() > 0 && bg_error_.ok()) {
     bg_cv_.Wait();
   }
@@ -1828,131 +1861,205 @@ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
 
 void DBImpl::MaybeScheduleFlushOrCompaction() {
   mutex_.AssertHeld();
-  bg_schedule_needed_ = false;
   if (bg_work_gate_closed_) {
-    // gate closed for backgrond work
-  } else if (shutting_down_.Acquire_Load()) {
+    // gate closed for background work
+    return;
+  } else if (shutting_down_.load(std::memory_order_acquire)) {
     // DB is being deleted; no more background compactions
-  } else {
-    bool is_flush_pending = false;
-    // no need to refcount since we're under a mutex
-    for (auto cfd : *versions_->GetColumnFamilySet()) {
-      if (cfd->imm()->IsFlushPending()) {
-        is_flush_pending = true;
-      }
-    }
-    if (is_flush_pending) {
-      // memtable flush needed
-      if (bg_flush_scheduled_ < options_.max_background_flushes) {
-        bg_flush_scheduled_++;
-        env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
-      } else if (options_.max_background_flushes > 0) {
-        bg_schedule_needed_ = true;
-      }
-    }
-    bool is_compaction_needed = false;
-    // no need to refcount since we're under a mutex
-    for (auto cfd : *versions_->GetColumnFamilySet()) {
-      if (cfd->current()->NeedsCompaction()) {
-        is_compaction_needed = true;
-        break;
-      }
-    }
+    return;
+  }
 
-    // Schedule BGWorkCompaction if there's a compaction pending (or a memtable
-    // flush, but the HIGH pool is not enabled)
-    // Do it only if max_background_compactions hasn't been reached and, in case
-    // bg_manual_only_ > 0, if it's a manual compaction.
-    if ((manual_compaction_ || is_compaction_needed ||
-         (is_flush_pending && options_.max_background_flushes == 0)) &&
-        (!bg_manual_only_ || manual_compaction_)) {
-      if (bg_compaction_scheduled_ < options_.max_background_compactions) {
-        bg_compaction_scheduled_++;
-        env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
-      } else {
-        bg_schedule_needed_ = true;
-      }
+  while (unscheduled_flushes_ > 0 &&
+         bg_flush_scheduled_ < db_options_.max_background_flushes) {
+    unscheduled_flushes_--;
+    bg_flush_scheduled_++;
+    env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this);
+  }
+
+  if (bg_manual_only_) {
+    // only manual compactions are allowed to run. don't schedule automatic
+    // compactions
+    return;
+  }
+
+  if (db_options_.max_background_flushes == 0 &&
+      bg_compaction_scheduled_ < db_options_.max_background_compactions &&
+      unscheduled_flushes_ > 0) {
+    // special case where flush is executed by compaction thread
+    // (if max_background_flushes == 0).
+    // Compaction thread will execute all the flushes
+    unscheduled_flushes_ = 0;
+    if (unscheduled_compactions_ > 0) {
+      // bg compaction will execute one compaction
+      unscheduled_compactions_--;
     }
+    bg_compaction_scheduled_++;
+    env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW, this);
+  }
+
+  while (bg_compaction_scheduled_ < db_options_.max_background_compactions &&
+         unscheduled_compactions_ > 0) {
+    bg_compaction_scheduled_++;
+    unscheduled_compactions_--;
+    env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW, this);
   }
 }
 
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+  assert(!cfd->pending_compaction());
+  cfd->Ref();
+  compaction_queue_.push_back(cfd);
+  cfd->set_pending_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+  assert(!compaction_queue_.empty());
+  auto cfd = *compaction_queue_.begin();
+  compaction_queue_.pop_front();
+  assert(cfd->pending_compaction());
+  cfd->set_pending_compaction(false);
+  return cfd;
+}
+
+void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) {
+  assert(!cfd->pending_flush());
+  cfd->Ref();
+  flush_queue_.push_back(cfd);
+  cfd->set_pending_flush(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() {
+  assert(!flush_queue_.empty());
+  auto cfd = *flush_queue_.begin();
+  flush_queue_.pop_front();
+  assert(cfd->pending_flush());
+  cfd->set_pending_flush(false);
+  return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) {
+  if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) {
+    AddToFlushQueue(cfd);
+    ++unscheduled_flushes_;
+  }
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+  if (!cfd->pending_compaction() && cfd->NeedsCompaction()) {
+    AddToCompactionQueue(cfd);
+    ++unscheduled_compactions_;
+  }
+}
+
+void DBImpl::RecordFlushIOStats() {
+  RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
+  IOSTATS_RESET(bytes_written);
+}
+
 void DBImpl::BGWorkFlush(void* db) {
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
   reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
 }
 
 void DBImpl::BGWorkCompaction(void* db) {
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+  TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
   reinterpret_cast<DBImpl*>(db)->BackgroundCallCompaction();
 }
 
-Status DBImpl::BackgroundFlush(bool* madeProgress,
-                               DeletionState& deletion_state,
+Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context,
                                LogBuffer* log_buffer) {
   mutex_.AssertHeld();
-  // call_status is failure if at least one flush was a failure. even if
-  // flushing one column family reports a failure, we will continue flushing
-  // other column families. however, call_status will be a failure in that case.
-  Status call_status;
-  // refcounting in iteration
-  for (auto cfd : *versions_->GetColumnFamilySet()) {
-    cfd->Ref();
-    Status flush_status;
-    while (flush_status.ok() && cfd->imm()->IsFlushPending()) {
-      LogToBuffer(
-          log_buffer,
-          "BackgroundCallFlush doing FlushMemTableToOutputFile with column "
-          "family [%s], flush slots available %d",
-          cfd->GetName().c_str(),
-          options_.max_background_flushes - bg_flush_scheduled_);
-      flush_status = FlushMemTableToOutputFile(cfd, madeProgress,
-                                               deletion_state, log_buffer);
+
+  Status status = bg_error_;
+  if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
+    status = Status::ShutdownInProgress();
+  }
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  ColumnFamilyData* cfd = nullptr;
+  while (!flush_queue_.empty()) {
+    // This cfd is already referenced
+    auto first_cfd = PopFirstFromFlushQueue();
+
+    if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) {
+      // can't flush this CF, try next one
+      if (first_cfd->Unref()) {
+        delete first_cfd;
+      }
+      continue;
     }
-    if (call_status.ok() && !flush_status.ok()) {
-      call_status = flush_status;
+
+    // found a flush!
+    cfd = first_cfd;
+    break;
+  }
+
+  if (cfd != nullptr) {
+    const MutableCFOptions mutable_cf_options =
+        *cfd->GetLatestMutableCFOptions();
+    LogToBuffer(
+        log_buffer,
+        "Calling FlushMemTableToOutputFile with column "
+        "family [%s], flush slots available %d, compaction slots available %d",
+        cfd->GetName().c_str(),
+        db_options_.max_background_flushes - bg_flush_scheduled_,
+        db_options_.max_background_compactions - bg_compaction_scheduled_);
+    status = FlushMemTableToOutputFile(cfd, mutable_cf_options, madeProgress,
+                                       job_context, log_buffer);
+    if (cfd->Unref()) {
+      delete cfd;
     }
-    cfd->Unref();
   }
-  versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
-  return call_status;
+  return status;
 }
 
 void DBImpl::BackgroundCallFlush() {
   bool madeProgress = false;
-  DeletionState deletion_state(true);
+  JobContext job_context(next_job_id_.fetch_add(1), true);
   assert(bg_flush_scheduled_);
 
-  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get());
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
   {
-    MutexLock l(&mutex_);
-
-    Status s;
-    if (!shutting_down_.Acquire_Load()) {
-      s = BackgroundFlush(&madeProgress, deletion_state, &log_buffer);
-      if (!s.ok()) {
-        // Wait a little bit before retrying background compaction in
-        // case this is an environmental problem and we do not want to
-        // chew up resources for failed compactions for the duration of
-        // the problem.
-        uint64_t error_cnt = default_cf_handle_->cfd()
-                                 ->internal_stats()
-                                 ->BumpAndGetBackgroundErrorCount();
-        bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
-        mutex_.Unlock();
-        Log(options_.info_log,
-            "Waiting after background flush error: %s"
-            "Accumulated background error counts: %" PRIu64,
-            s.ToString().c_str(), error_cnt);
-        log_buffer.FlushBufferToLog();
-        LogFlush(options_.info_log);
-        env_->SleepForMicroseconds(1000000);
-        mutex_.Lock();
-      }
+    InstrumentedMutexLock l(&mutex_);
+
+    auto pending_outputs_inserted_elem =
+        CaptureCurrentFileNumberInPendingOutputs();
+
+    Status s = BackgroundFlush(&madeProgress, &job_context, &log_buffer);
+    if (!s.ok() && !s.IsShutdownInProgress()) {
+      // Wait a little bit before retrying background flush in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed flushes for the duration of
+      // the problem.
+      uint64_t error_cnt =
+        default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "Waiting after background flush error: %s"
+          "Accumulated background error counts: %" PRIu64,
+          s.ToString().c_str(), error_cnt);
+      log_buffer.FlushBufferToLog();
+      LogFlush(db_options_.info_log);
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
     }
 
-    // If !s.ok(), this means that Flush failed. In that case, we want
-    // to delete all obsolete files and we force FindObsoleteFiles()
-    FindObsoleteFiles(deletion_state, !s.ok());
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // We're just cleaning up for DB::Write()
+    job_context.logs_to_free = logs_to_free_;
+    logs_to_free_.clear();
+
+    // If flush failed, we want to delete all temporary files that we might have
+    // created. Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
     // delete unnecessary files if any, this is done outside the mutex
-    if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
       mutex_.Unlock();
       // Have to flush the info logs before bg_flush_scheduled_--
       // because if bg_flush_scheduled_ becomes 0 and the lock is
@@ -1960,20 +2067,17 @@ void DBImpl::BackgroundCallFlush() {
       // states of DB so info_log might not be available after that point.
       // It also applies to access other states that DB owns.
       log_buffer.FlushBufferToLog();
-      if (deletion_state.HaveSomethingToDelete()) {
-        PurgeObsoleteFiles(deletion_state);
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
       }
+      job_context.Clean();
       mutex_.Lock();
     }
 
     bg_flush_scheduled_--;
-    // Any time the mutex is released After finding the work to do, another
-    // thread might execute MaybeScheduleFlushOrCompaction(). It is possible
-    // that there is a pending job but it is not scheduled because of the
-    // max thread limit.
-    if (madeProgress || bg_schedule_needed_) {
-      MaybeScheduleFlushOrCompaction();
-    }
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    RecordFlushIOStats();
     bg_cv_.SignalAll();
     // IMPORTANT: there should be no code after calling SignalAll. This call may
     // signal the DB destructor that it's OK to proceed with destruction. In
@@ -1984,45 +2088,50 @@ void DBImpl::BackgroundCallFlush() {
 
 void DBImpl::BackgroundCallCompaction() {
   bool madeProgress = false;
-  DeletionState deletion_state(true);
+  JobContext job_context(next_job_id_.fetch_add(1), true);
 
   MaybeDumpStats();
-  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get());
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
   {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
+
+    auto pending_outputs_inserted_elem =
+        CaptureCurrentFileNumberInPendingOutputs();
+
     assert(bg_compaction_scheduled_);
-    Status s;
-    if (!shutting_down_.Acquire_Load()) {
-      s = BackgroundCompaction(&madeProgress, deletion_state, &log_buffer);
-      if (!s.ok()) {
-        // Wait a little bit before retrying background compaction in
-        // case this is an environmental problem and we do not want to
-        // chew up resources for failed compactions for the duration of
-        // the problem.
-        uint64_t error_cnt = default_cf_handle_->cfd()
-                                 ->internal_stats()
-                                 ->BumpAndGetBackgroundErrorCount();
-        bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
-        mutex_.Unlock();
-        log_buffer.FlushBufferToLog();
-        Log(options_.info_log,
-            "Waiting after background compaction error: %s, "
-            "Accumulated background error counts: %" PRIu64,
-            s.ToString().c_str(), error_cnt);
-        LogFlush(options_.info_log);
-        env_->SleepForMicroseconds(1000000);
-        mutex_.Lock();
-      }
+    Status s = BackgroundCompaction(&madeProgress, &job_context, &log_buffer);
+    if (!s.ok() && !s.IsShutdownInProgress()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      log_buffer.FlushBufferToLog();
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "Waiting after background compaction error: %s, "
+          "Accumulated background error counts: %" PRIu64,
+          s.ToString().c_str(), error_cnt);
+      LogFlush(db_options_.info_log);
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
     }
 
-    // If !s.ok(), this means that Compaction failed. In that case, we want
-    // to delete all obsolete files we might have created and we force
-    // FindObsoleteFiles(). This is because deletion_state does not catch
-    // all created files if compaction failed.
-    FindObsoleteFiles(deletion_state, !s.ok());
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // We're just cleaning up for DB::Write()
+    job_context.logs_to_free = logs_to_free_;
+    logs_to_free_.clear();
+
+    // If compaction failed, we want to delete all temporary files that we might
+    // have created (they might not be all recorded in job_context in case of a
+    // failure). Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
 
     // delete unnecessary files if any, this is done outside the mutex
-    if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
       mutex_.Unlock();
       // Have to flush the info logs before bg_compaction_scheduled_--
       // because if bg_flush_scheduled_ becomes 0 and the lock is
@@ -2030,30 +2139,28 @@ void DBImpl::BackgroundCallCompaction() {
       // states of DB so info_log might not be available after that point.
       // It also applies to access other states that DB owns.
       log_buffer.FlushBufferToLog();
-      if (deletion_state.HaveSomethingToDelete()) {
-        PurgeObsoleteFiles(deletion_state);
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
       }
+      job_context.Clean();
       mutex_.Lock();
     }
 
     bg_compaction_scheduled_--;
 
-    MaybeScheduleLogDBDeployStats();
-
     versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
 
-    // Previous compaction may have produced too many files in a level,
-    // So reschedule another compaction if we made progress in the
-    // last compaction.
-    //
-    // Also, any time the mutex is released After finding the work to do,
-    // another thread might execute MaybeScheduleFlushOrCompaction(). It is
-    // possible  that there is a pending job but it is not scheduled because of
-    // the max thread limit.
-    if (madeProgress || bg_schedule_needed_) {
-      MaybeScheduleFlushOrCompaction();
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) {
+      // signal if
+      // * madeProgress -- need to wakeup DelayWrite
+      // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+      // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction
+      // If none of this is true, there is no need to signal since nobody is
+      // waiting for it
+      bg_cv_.SignalAll();
     }
-    bg_cv_.SignalAll();
     // IMPORTANT: there should be no code after calling SignalAll. This call may
     // signal the DB destructor that it's OK to proceed with destruction. In
     // that case, all DB variables will be dealloacated and referencing them
@@ -2061,8 +2168,7 @@ void DBImpl::BackgroundCallCompaction() {
   }
 }
 
-Status DBImpl::BackgroundCompaction(bool* madeProgress,
-                                    DeletionState& deletion_state,
+Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context,
                                     LogBuffer* log_buffer) {
   *madeProgress = false;
   mutex_.AssertHeld();
@@ -2070,32 +2176,55 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
   bool is_manual = (manual_compaction_ != nullptr) &&
                    (manual_compaction_->in_progress == false);
 
+  Status status = bg_error_;
+  if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
+    status = Status::ShutdownInProgress();
+  }
+
+  if (!status.ok()) {
+    if (is_manual) {
+      manual_compaction_->status = status;
+      manual_compaction_->done = true;
+      manual_compaction_->in_progress = false;
+      manual_compaction_ = nullptr;
+    }
+    return status;
+  }
+
   if (is_manual) {
     // another thread cannot pick up the same work
     manual_compaction_->in_progress = true;
+  } else if (manual_compaction_ != nullptr) {
+    // there should be no automatic compactions running when manual compaction
+    // is running
+    return Status::OK();
   }
 
-  // FLUSH preempts compaction
-  Status flush_stat;
-  for (auto cfd : *versions_->GetColumnFamilySet()) {
-    while (cfd->imm()->IsFlushPending()) {
+  // If there are no flush threads, then compaction thread needs to execute the
+  // flushes
+  if (db_options_.max_background_flushes == 0) {
+    // BackgroundFlush() will only execute a single flush. We keep calling it as
+    // long as there's more flushes to be done
+    while (!flush_queue_.empty()) {
       LogToBuffer(
           log_buffer,
-          "BackgroundCompaction doing FlushMemTableToOutputFile, "
-          "compaction slots available %d",
-          options_.max_background_compactions - bg_compaction_scheduled_);
-      cfd->Ref();
-      flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state,
-                                             log_buffer);
-      cfd->Unref();
-      if (!flush_stat.ok()) {
+          "BackgroundCompaction calling BackgroundFlush. flush slots available "
+          "%d, compaction slots available %d",
+          db_options_.max_background_flushes - bg_flush_scheduled_,
+          db_options_.max_background_compactions - bg_compaction_scheduled_);
+      auto flush_status =
+          BackgroundFlush(madeProgress, job_context, log_buffer);
+      // the second condition will be false when a column family is dropped. we
+      // don't want to fail compaction because of that (because it might be a
+      // different column family)
+      if (!flush_status.ok() && !flush_status.IsShutdownInProgress()) {
         if (is_manual) {
-          manual_compaction_->status = flush_stat;
+          manual_compaction_->status = flush_status;
           manual_compaction_->done = true;
           manual_compaction_->in_progress = false;
           manual_compaction_ = nullptr;
         }
-        return flush_stat;
+        return flush_status;
       }
     }
   }
@@ -2106,79 +2235,185 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
   if (is_manual) {
     ManualCompaction* m = manual_compaction_;
     assert(m->in_progress);
-    c.reset(m->cfd->CompactRange(m->input_level, m->output_level, m->begin,
-                                 m->end, &manual_end));
+    c.reset(m->cfd->CompactRange(
+          *m->cfd->GetLatestMutableCFOptions(), m->input_level, m->output_level,
+          m->output_path_id, m->begin, m->end, &manual_end));
     if (!c) {
       m->done = true;
+      LogToBuffer(log_buffer,
+                  "[%s] Manual compaction from level-%d from %s .. "
+                  "%s; nothing to do\n",
+                  m->cfd->GetName().c_str(), m->input_level,
+                  (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+                  (m->end ? m->end->DebugString().c_str() : "(end)"));
+    } else {
+      LogToBuffer(log_buffer,
+                  "[%s] Manual compaction from level-%d to level-%d from %s .. "
+                  "%s; will stop at %s\n",
+                  m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+                  (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+                  (m->end ? m->end->DebugString().c_str() : "(end)"),
+                  ((m->done || manual_end == nullptr)
+                       ? "(end)"
+                       : manual_end->DebugString().c_str()));
     }
-    LogToBuffer(log_buffer,
-                "[%s] Manual compaction from level-%d to level-%d from %s .. "
-                "%s; will stop at %s\n",
-                m->cfd->GetName().c_str(), m->input_level, m->output_level,
-                (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
-                (m->end ? m->end->DebugString().c_str() : "(end)"),
-                ((m->done || manual_end == nullptr)
-                     ? "(end)"
-                     : manual_end->DebugString().c_str()));
-  } else {
-    // no need to refcount in iteration since it's always under a mutex
-    for (auto cfd : *versions_->GetColumnFamilySet()) {
-      if (!cfd->options()->disable_auto_compactions) {
-        c.reset(cfd->PickCompaction(log_buffer));
-        if (c != nullptr) {
-          // update statistics
-          MeasureTime(options_.statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
-                      c->inputs(0)->size());
-          break;
+  } else if (!compaction_queue_.empty()) {
+    // cfd is referenced here
+    auto cfd = PopFirstFromCompactionQueue();
+    // We unreference here because the following code will take a Ref() on
+    // this cfd if it is going to use it (Compaction class holds a
+    // reference).
+    // This will all happen under a mutex so we don't have to be afraid of
+    // somebody else deleting it.
+    if (cfd->Unref()) {
+      delete cfd;
+      // This was the last reference of the column family, so no need to
+      // compact.
+      return Status::OK();
+    }
+
+    // Pick up latest mutable CF Options and use it throughout the
+    // compaction job
+    // Compaction makes a copy of the latest MutableCFOptions. It should be used
+    // throughout the compaction procedure to make sure consistency. It will
+    // eventually be installed into SuperVersion
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+      // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+      // compaction is not necessary. Need to make sure mutex is held
+      // until we make a copy in the following code
+      c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer));
+      if (c != nullptr) {
+        // update statistics
+        MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION,
+                    c->inputs(0)->size());
+        // There are three things that can change compaction score:
+        // 1) When flush or compaction finish. This case is covered by
+        // InstallSuperVersion()
+        // 2) When MutableCFOptions changes. This case is also covered by
+        // InstallSuperVersion(), because this is when the new options take
+        // effect.
+        // 3) When we Pick a new compaction, we "remove" those files being
+        // compacted from the calculation, which then influences compaction
+        // score. Here we check if we need the new compaction even without the
+        // files that are currently being compacted. If we need another
+        // compaction, we might be able to execute it in parallel, so we add it
+        // to the queue and schedule a new thread.
+        if (cfd->NeedsCompaction()) {
+          // Yes, we need more compactions!
+          AddToCompactionQueue(cfd);
+          ++unscheduled_compactions_;
+          MaybeScheduleFlushOrCompaction();
         }
       }
     }
   }
 
-  Status status;
   if (!c) {
     // Nothing to do
     LogToBuffer(log_buffer, "Compaction nothing to do");
+  } else if (c->IsDeletionCompaction()) {
+    // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+    // file if there is alive snapshot pointing to it
+    assert(c->num_input_files(1) == 0);
+    assert(c->level() == 0);
+    assert(c->column_family_data()->ioptions()->compaction_style ==
+           kCompactionStyleFIFO);
+    for (const auto& f : *c->inputs(0)) {
+      c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+    }
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    InstallSuperVersionBackground(c->column_family_data(), job_context,
+                                  *c->mutable_cf_options());
+    LogToBuffer(log_buffer, "[%s] Deleted %d files\n",
+                c->column_family_data()->GetName().c_str(),
+                c->num_input_files(0));
+    *madeProgress = true;
   } else if (!is_manual && c->IsTrivialMove()) {
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+    // Instrument for event update
+    // TODO(yhchiang): add op details for showing trivial-move.
+    ThreadStatusUtil::SetColumnFamily(c->column_family_data());
+    ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
     // Move file to next level
     assert(c->num_input_files(0) == 1);
     FileMetaData* f = c->input(0, 0);
-    c->edit()->DeleteFile(c->level(), f->number);
-    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
-                       f->smallest, f->largest,
+    c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+    c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+                       f->fd.GetFileSize(), f->smallest, f->largest,
                        f->smallest_seqno, f->largest_seqno);
-    status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_,
-                                    db_directory_.get());
-    InstallSuperVersion(c->column_family_data(), deletion_state);
-
-    Version::LevelSummaryStorage tmp;
-    LogToBuffer(log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n",
-                c->column_family_data()->GetName().c_str(),
-                static_cast<unsigned long long>(f->number), c->level() + 1,
-                static_cast<unsigned long long>(f->file_size),
-                status.ToString().c_str(),
-                c->input_version()->LevelSummary(&tmp));
-    c->ReleaseCompactionFiles(status);
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    // Use latest MutableCFOptions
+    InstallSuperVersionBackground(c->column_family_data(), job_context,
+                                  *c->mutable_cf_options());
+
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    c->column_family_data()->internal_stats()->IncBytesMoved(
+        c->level() + 1, f->fd.GetFileSize());
+    {
+      event_logger_.LogToBuffer(log_buffer)
+          << "job" << job_context->job_id << "event"
+          << "trivial_move"
+          << "destination_level" << c->level() + 1 << "file_number"
+          << f->fd.GetNumber() << "file_size" << f->fd.GetFileSize();
+    }
+    LogToBuffer(
+        log_buffer,
+        "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n",
+        c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+        c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(),
+        c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
     *madeProgress = true;
+
+    // Clear Instrument
+    ThreadStatusUtil::ResetThreadStatus();
   } else {
-    MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel.
-    CompactionState* compact = new CompactionState(c.get());
-    status = DoCompactionWork(compact, deletion_state, log_buffer);
-    CleanupCompaction(compact, status);
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial");
+    auto yield_callback = [&]() {
+      return CallFlushDuringCompaction(c->column_family_data(),
+                                       *c->mutable_cf_options(), job_context,
+                                       log_buffer);
+    };
+    assert(is_snapshot_supported_ || snapshots_.empty());
+    CompactionJob compaction_job(
+        job_context->job_id, c.get(), db_options_, env_options_,
+        versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(),
+        directories_.GetDataDir(c->GetOutputPathId()), stats_,
+        snapshots_.GetAll(), table_cache_, std::move(yield_callback),
+        &event_logger_, c->mutable_cf_options()->paranoid_file_checks);
+    compaction_job.Prepare();
+    mutex_.Unlock();
+    status = compaction_job.Run();
+    mutex_.Lock();
+    compaction_job.Install(&status, *c->mutable_cf_options(), &mutex_);
+    if (status.ok()) {
+      InstallSuperVersionBackground(c->column_family_data(), job_context,
+                                    *c->mutable_cf_options());
+    }
+    *madeProgress = true;
+  }
+  // FIXME(orib): should I check if column family data is null?
+  if (c != nullptr) {
+    NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status);
     c->ReleaseCompactionFiles(status);
-    c->ReleaseInputs();
     *madeProgress = true;
   }
+  // this will unref its input_version and column_family_data
   c.reset();
 
   if (status.ok()) {
     // Done
-  } else if (shutting_down_.Acquire_Load()) {
+  } else if (status.IsShutdownInProgress()) {
     // Ignore compaction errors found during shutting down
   } else {
-    Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s",
+    Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s",
         status.ToString().c_str());
-    if (options_.paranoid_checks && bg_error_.ok()) {
+    if (db_options_.paranoid_checks && bg_error_.ok()) {
       bg_error_ = status;
     }
   }
@@ -2208,8 +2443,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
     if (!m->done) {
       // We only compacted part of the requested range.  Update *m
       // to the range that is left to be compacted.
-      // Universal compaction should always compact the whole range
-      assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal);
+      // Universal and FIFO compactions should always compact the whole range
+      assert(m->cfd->ioptions()->compaction_style !=
+                 kCompactionStyleUniversal ||
+             m->cfd->ioptions()->num_levels > 1);
+      assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
       m->tmp_storage = *manual_end;
       m->begin = &m->tmp_storage;
     }
@@ -2219,234 +2457,22 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
   return status;
 }
 
-void DBImpl::CleanupCompaction(CompactionState* compact, Status status) {
-  mutex_.AssertHeld();
-  if (compact->builder != nullptr) {
-    // May happen if we get a shutdown call in the middle of compaction
-    compact->builder->Abandon();
-    compact->builder.reset();
-  } else {
-    assert(compact->outfile == nullptr);
-  }
-  for (size_t i = 0; i < compact->outputs.size(); i++) {
-    const CompactionState::Output& out = compact->outputs[i];
-    pending_outputs_.erase(out.number);
-
-    // If this file was inserted into the table cache then remove
-    // them here because this compaction was not committed.
-    if (!status.ok()) {
-      TableCache::Evict(table_cache_.get(), out.number);
-    }
-  }
-  delete compact;
-}
-
-// Allocate the file numbers for the output file. We allocate as
-// many output file numbers as there are files in level+1 (at least one)
-// Insert them into pending_outputs so that they do not get deleted.
-void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) {
-  mutex_.AssertHeld();
-  assert(compact != nullptr);
-  assert(compact->builder == nullptr);
-  int filesNeeded = compact->compaction->num_input_files(1);
-  for (int i = 0; i < std::max(filesNeeded, 1); i++) {
-    uint64_t file_number = versions_->NewFileNumber();
-    pending_outputs_.insert(file_number);
-    compact->allocated_file_numbers.push_back(file_number);
-  }
-}
-
-// Frees up unused file number.
-void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) {
-  mutex_.AssertHeld();
-  for (const auto file_number : compact->allocated_file_numbers) {
-    pending_outputs_.erase(file_number);
-  }
-}
-
-Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
-  assert(compact != nullptr);
-  assert(compact->builder == nullptr);
-  uint64_t file_number;
-  // If we have not yet exhausted the pre-allocated file numbers,
-  // then use the one from the front. Otherwise, we have to acquire
-  // the heavyweight lock and allocate a new file number.
-  if (!compact->allocated_file_numbers.empty()) {
-    file_number = compact->allocated_file_numbers.front();
-    compact->allocated_file_numbers.pop_front();
-  } else {
-    mutex_.Lock();
-    file_number = versions_->NewFileNumber();
-    pending_outputs_.insert(file_number);
-    mutex_.Unlock();
-  }
-  CompactionState::Output out;
-  out.number = file_number;
-  out.smallest.Clear();
-  out.largest.Clear();
-  out.smallest_seqno = out.largest_seqno = 0;
-  compact->outputs.push_back(out);
-
-  // Make the output file
-  std::string fname = TableFileName(dbname_, file_number);
-  Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_);
-
-  if (s.ok()) {
-    // Over-estimate slightly so we don't end up just barely crossing
-    // the threshold.
-    ColumnFamilyData* cfd = compact->compaction->column_family_data();
-    compact->outfile->SetPreallocationBlockSize(
-        1.1 * cfd->compaction_picker()->MaxFileSizeForLevel(
-                  compact->compaction->output_level()));
-
-    CompressionType compression_type =
-        GetCompressionType(*cfd->options(), compact->compaction->output_level(),
-                           compact->compaction->enable_compression());
-
-    compact->builder.reset(
-        NewTableBuilder(*cfd->options(), cfd->internal_comparator(),
-                        compact->outfile.get(), compression_type));
-  }
-  LogFlush(options_.info_log);
-  return s;
-}
-
-Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
-                                          Iterator* input) {
-  assert(compact != nullptr);
-  assert(compact->outfile);
-  assert(compact->builder != nullptr);
-
-  const uint64_t output_number = compact->current_output()->number;
-  assert(output_number != 0);
-
-  // Check for iterator errors
-  Status s = input->status();
-  const uint64_t current_entries = compact->builder->NumEntries();
-  if (s.ok()) {
-    s = compact->builder->Finish();
-  } else {
-    compact->builder->Abandon();
-  }
-  const uint64_t current_bytes = compact->builder->FileSize();
-  compact->current_output()->file_size = current_bytes;
-  compact->total_bytes += current_bytes;
-  compact->builder.reset();
-
-  // Finish and check for file errors
-  if (s.ok() && !options_.disableDataSync) {
-    if (options_.use_fsync) {
-      StopWatch sw(env_, options_.statistics.get(),
-                   COMPACTION_OUTFILE_SYNC_MICROS, false);
-      s = compact->outfile->Fsync();
-    } else {
-      StopWatch sw(env_, options_.statistics.get(),
-                   COMPACTION_OUTFILE_SYNC_MICROS, false);
-      s = compact->outfile->Sync();
-    }
-  }
-  if (s.ok()) {
-    s = compact->outfile->Close();
-  }
-  compact->outfile.reset();
-
-  if (s.ok() && current_entries > 0) {
-    // Verify that the table is usable
-    ColumnFamilyData* cfd = compact->compaction->column_family_data();
-    FileMetaData meta(output_number, current_bytes);
-    Iterator* iter = cfd->table_cache()->NewIterator(
-        ReadOptions(), storage_options_, cfd->internal_comparator(), meta);
-    s = iter->status();
-    delete iter;
-    if (s.ok()) {
-      Log(options_.info_log, "[%s] Generated table #%lu: %lu keys, %lu bytes",
-          cfd->GetName().c_str(), (unsigned long)output_number,
-          (unsigned long)current_entries, (unsigned long)current_bytes);
-    }
-  }
-  return s;
-}
-
-
-Status DBImpl::InstallCompactionResults(CompactionState* compact,
-                                        LogBuffer* log_buffer) {
-  mutex_.AssertHeld();
-
-  // paranoia: verify that the files that we started with
-  // still exist in the current version and in the same original level.
-  // This ensures that a concurrent compaction did not erroneously
-  // pick the same files to compact.
-  if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) {
-    Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted",
-        compact->compaction->column_family_data()->GetName().c_str(),
-        compact->compaction->num_input_files(0), compact->compaction->level(),
-        compact->compaction->num_input_files(1),
-        compact->compaction->output_level());
-    return Status::Corruption("Compaction input files inconsistent");
-  }
-
-  LogToBuffer(log_buffer, "[%s] Compacted %d@%d + %d@%d files => %lld bytes",
-              compact->compaction->column_family_data()->GetName().c_str(),
-              compact->compaction->num_input_files(0),
-              compact->compaction->level(),
-              compact->compaction->num_input_files(1),
-              compact->compaction->output_level(),
-              static_cast<long long>(compact->total_bytes));
-
-  // Add compaction outputs
-  compact->compaction->AddInputDeletions(compact->compaction->edit());
-  for (size_t i = 0; i < compact->outputs.size(); i++) {
-    const CompactionState::Output& out = compact->outputs[i];
-    compact->compaction->edit()->AddFile(
-        compact->compaction->output_level(), out.number, out.file_size,
-        out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
-  }
-  return versions_->LogAndApply(compact->compaction->column_family_data(),
-                                compact->compaction->edit(), &mutex_,
-                                db_directory_.get());
-}
-
-// Given a sequence number, return the sequence number of the
-// earliest snapshot that this sequence number is visible in.
-// The snapshots themselves are arranged in ascending order of
-// sequence numbers.
-// Employ a sequential search because the total number of
-// snapshots are typically small.
-inline SequenceNumber DBImpl::findEarliestVisibleSnapshot(
-  SequenceNumber in, std::vector<SequenceNumber>& snapshots,
-  SequenceNumber* prev_snapshot) {
-  if (!IsSnapshotSupported()) {
+uint64_t DBImpl::CallFlushDuringCompaction(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    JobContext* job_context, LogBuffer* log_buffer) {
+  if (db_options_.max_background_flushes > 0) {
+    // flush thread will take care of this
     return 0;
   }
-  SequenceNumber prev __attribute__((unused)) = 0;
-  for (const auto cur : snapshots) {
-    assert(prev <= cur);
-    if (cur >= in) {
-      *prev_snapshot = prev;
-      return cur;
-    }
-    prev = cur; // assignment
-    assert(prev);
-  }
-  Log(options_.info_log,
-      "Looking for seqid %lu but maxseqid is %lu",
-      (unsigned long)in,
-      (unsigned long)snapshots[snapshots.size()-1]);
-  assert(0);
-  return 0;
-}
-
-uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd,
-                                           DeletionState& deletion_state,
-                                           LogBuffer* log_buffer) {
-  if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) {
+  if (cfd->imm()->imm_flush_needed.load(std::memory_order_relaxed)) {
     const uint64_t imm_start = env_->NowMicros();
     mutex_.Lock();
     if (cfd->imm()->IsFlushPending()) {
       cfd->Ref();
-      FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer);
+      FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, job_context,
+                                log_buffer);
       cfd->Unref();
-      bg_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
+      bg_cv_.SignalAll();  // Wakeup DelayWrite() if necessary
     }
     mutex_.Unlock();
     log_buffer->FlushBufferToLog();
@@ -2455,662 +2481,13 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd,
   return 0;
 }
 
-Status DBImpl::ProcessKeyValueCompaction(
-    SequenceNumber visible_at_tip,
-    SequenceNumber earliest_snapshot,
-    SequenceNumber latest_snapshot,
-    DeletionState& deletion_state,
-    bool bottommost_level,
-    int64_t& imm_micros,
-    Iterator* input,
-    CompactionState* compact,
-    bool is_compaction_v2,
-    LogBuffer* log_buffer) {
-  size_t combined_idx = 0;
-  Status status;
-  std::string compaction_filter_value;
-  ParsedInternalKey ikey;
-  IterKey current_user_key;
-  bool has_current_user_key = false;
-  IterKey delete_key;
-  SequenceNumber last_sequence_for_key __attribute__((unused)) =
-    kMaxSequenceNumber;
-  SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
-  ColumnFamilyData* cfd = compact->compaction->column_family_data();
-  MergeHelper merge(
-      cfd->user_comparator(), cfd->options()->merge_operator.get(),
-      options_.info_log.get(), cfd->options()->min_partial_merge_operands,
-      false /* internal key corruption is expected */);
-  auto compaction_filter = cfd->options()->compaction_filter;
-  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
-  if (!compaction_filter) {
-    auto context = compact->GetFilterContextV1();
-    compaction_filter_from_factory =
-        cfd->options()->compaction_filter_factory->CreateCompactionFilter(
-            context);
-    compaction_filter = compaction_filter_from_factory.get();
-  }
-
-  while (input->Valid() && !shutting_down_.Acquire_Load() &&
-         !cfd->IsDropped()) {
-    // FLUSH preempts compaction
-    // TODO(icanadi) this currently only checks if flush is necessary on
-    // compacting column family. we should also check if flush is necessary on
-    // other column families, too
-    imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer);
-
-    Slice key;
-    Slice value;
-    // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch.
-    // This prefix batch should contain results after calling
-    // compaction_filter_v2.
-    //
-    // If is_compaction_v2 is off, this function will go through all the
-    // kv-pairs in input.
-    if (!is_compaction_v2) {
-      key = input->key();
-      value = input->value();
-    } else {
-      if (combined_idx >= compact->combined_key_buf_.size()) {
-        break;
-      }
-      assert(combined_idx < compact->combined_key_buf_.size());
-      key = compact->combined_key_buf_[combined_idx];
-      value = compact->combined_value_buf_[combined_idx];
-
-      ++combined_idx;
-    }
-
-    if (compact->compaction->ShouldStopBefore(key) &&
-        compact->builder != nullptr) {
-      status = FinishCompactionOutputFile(compact, input);
-      if (!status.ok()) {
-        break;
-      }
-    }
-
-    // Handle key/value, add to state, etc.
-    bool drop = false;
-    bool current_entry_is_merging = false;
-    if (!ParseInternalKey(key, &ikey)) {
-      // Do not hide error keys
-      // TODO: error key stays in db forever? Figure out the intention/rationale
-      // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
-      current_user_key.Clear();
-      has_current_user_key = false;
-      last_sequence_for_key = kMaxSequenceNumber;
-      visible_in_snapshot = kMaxSequenceNumber;
-    } else {
-      if (!has_current_user_key ||
-          cfd->user_comparator()->Compare(ikey.user_key,
-                                          current_user_key.GetKey()) != 0) {
-        // First occurrence of this user key
-        current_user_key.SetUserKey(ikey.user_key);
-        has_current_user_key = true;
-        last_sequence_for_key = kMaxSequenceNumber;
-        visible_in_snapshot = kMaxSequenceNumber;
-        // apply the compaction filter to the first occurrence of the user key
-        if (compaction_filter && !is_compaction_v2 &&
-            ikey.type == kTypeValue &&
-            (visible_at_tip || ikey.sequence > latest_snapshot)) {
-          // If the user has specified a compaction filter and the sequence
-          // number is greater than any external snapshot, then invoke the
-          // filter.
-          // If the return value of the compaction filter is true, replace
-          // the entry with a delete marker.
-          bool value_changed = false;
-          compaction_filter_value.clear();
-          bool to_delete = compaction_filter->Filter(
-              compact->compaction->level(), ikey.user_key, value,
-              &compaction_filter_value, &value_changed);
-          if (to_delete) {
-            // make a copy of the original key and convert it to a delete
-            delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence,
-                                      kTypeDeletion);
-            // anchor the key again
-            key = delete_key.GetKey();
-            // needed because ikey is backed by key
-            ParseInternalKey(key, &ikey);
-            // no value associated with delete
-            value.clear();
-            RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER);
-          } else if (value_changed) {
-            value = compaction_filter_value;
-          }
-        }
-      }
-
-      // If there are no snapshots, then this kv affect visibility at tip.
-      // Otherwise, search though all existing snapshots to find
-      // the earlist snapshot that is affected by this kv.
-      SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
-      SequenceNumber visible = visible_at_tip ?
-        visible_at_tip :
-        findEarliestVisibleSnapshot(ikey.sequence,
-            compact->existing_snapshots,
-            &prev_snapshot);
-
-      if (visible_in_snapshot == visible) {
-        // If the earliest snapshot is which this key is visible in
-        // is the same as the visibily of a previous instance of the
-        // same key, then this kv is not visible in any snapshot.
-        // Hidden by an newer entry for same user key
-        // TODO: why not > ?
-        assert(last_sequence_for_key >= ikey.sequence);
-        drop = true;    // (A)
-        RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_NEWER_ENTRY);
-      } else if (ikey.type == kTypeDeletion &&
-          ikey.sequence <= earliest_snapshot &&
-          compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
-        // For this user key:
-        // (1) there is no data in higher levels
-        // (2) data in lower levels will have larger sequence numbers
-        // (3) data in layers that are being compacted here and have
-        //     smaller sequence numbers will be dropped in the next
-        //     few iterations of this loop (by rule (A) above).
-        // Therefore this deletion marker is obsolete and can be dropped.
-        drop = true;
-        RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_OBSOLETE);
-      } else if (ikey.type == kTypeMerge) {
-        // We know the merge type entry is not hidden, otherwise we would
-        // have hit (A)
-        // We encapsulate the merge related state machine in a different
-        // object to minimize change to the existing flow. Turn out this
-        // logic could also be nicely re-used for memtable flush purge
-        // optimization in BuildTable.
-        int steps = 0;
-        merge.MergeUntil(input, prev_snapshot, bottommost_level,
-            options_.statistics.get(), &steps);
-        // Skip the Merge ops
-        combined_idx = combined_idx - 1 + steps;
-
-        current_entry_is_merging = true;
-        if (merge.IsSuccess()) {
-          // Successfully found Put/Delete/(end-of-key-range) while merging
-          // Get the merge result
-          key = merge.key();
-          ParseInternalKey(key, &ikey);
-          value = merge.value();
-        } else {
-          // Did not find a Put/Delete/(end-of-key-range) while merging
-          // We now have some stack of merge operands to write out.
-          // NOTE: key,value, and ikey are now referring to old entries.
-          //       These will be correctly set below.
-          assert(!merge.keys().empty());
-          assert(merge.keys().size() == merge.values().size());
-
-          // Hack to make sure last_sequence_for_key is correct
-          ParseInternalKey(merge.keys().front(), &ikey);
-        }
-      }
-
-      last_sequence_for_key = ikey.sequence;
-      visible_in_snapshot = visible;
-    }
-
-    if (!drop) {
-      // We may write a single key (e.g.: for Put/Delete or successful merge).
-      // Or we may instead have to write a sequence/list of keys.
-      // We have to write a sequence iff we have an unsuccessful merge
-      bool has_merge_list = current_entry_is_merging && !merge.IsSuccess();
-      const std::deque<std::string>* keys = nullptr;
-      const std::deque<std::string>* values = nullptr;
-      std::deque<std::string>::const_reverse_iterator key_iter;
-      std::deque<std::string>::const_reverse_iterator value_iter;
-      if (has_merge_list) {
-        keys = &merge.keys();
-        values = &merge.values();
-        key_iter = keys->rbegin();    // The back (*rbegin()) is the first key
-        value_iter = values->rbegin();
-
-        key = Slice(*key_iter);
-        value = Slice(*value_iter);
-      }
-
-      // If we have a list of keys to write, traverse the list.
-      // If we have a single key to write, simply write that key.
-      while (true) {
-        // Invariant: key,value,ikey will always be the next entry to write
-        char* kptr = (char*)key.data();
-        std::string kstr;
-
-        // Zeroing out the sequence number leads to better compression.
-        // If this is the bottommost level (no files in lower levels)
-        // and the earliest snapshot is larger than this seqno
-        // then we can squash the seqno to zero.
-        if (bottommost_level && ikey.sequence < earliest_snapshot &&
-            ikey.type != kTypeMerge) {
-          assert(ikey.type != kTypeDeletion);
-          // make a copy because updating in place would cause problems
-          // with the priority queue that is managing the input key iterator
-          kstr.assign(key.data(), key.size());
-          kptr = (char *)kstr.c_str();
-          UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type);
-        }
-
-        Slice newkey(kptr, key.size());
-        assert((key.clear(), 1)); // we do not need 'key' anymore
-
-        // Open output file if necessary
-        if (compact->builder == nullptr) {
-          status = OpenCompactionOutputFile(compact);
-          if (!status.ok()) {
-            break;
-          }
-        }
-
-        SequenceNumber seqno = GetInternalKeySeqno(newkey);
-        if (compact->builder->NumEntries() == 0) {
-          compact->current_output()->smallest.DecodeFrom(newkey);
-          compact->current_output()->smallest_seqno = seqno;
-        } else {
-          compact->current_output()->smallest_seqno =
-            std::min(compact->current_output()->smallest_seqno, seqno);
-        }
-        compact->current_output()->largest.DecodeFrom(newkey);
-        compact->builder->Add(newkey, value);
-        compact->current_output()->largest_seqno =
-          std::max(compact->current_output()->largest_seqno, seqno);
-
-        // Close output file if it is big enough
-        if (compact->builder->FileSize() >=
-            compact->compaction->MaxOutputFileSize()) {
-          status = FinishCompactionOutputFile(compact, input);
-          if (!status.ok()) {
-            break;
-          }
-        }
-
-        // If we have a list of entries, move to next element
-        // If we only had one entry, then break the loop.
-        if (has_merge_list) {
-          ++key_iter;
-          ++value_iter;
-
-          // If at end of list
-          if (key_iter == keys->rend() || value_iter == values->rend()) {
-            // Sanity Check: if one ends, then both end
-            assert(key_iter == keys->rend() && value_iter == values->rend());
-            break;
-          }
-
-          // Otherwise not at end of list. Update key, value, and ikey.
-          key = Slice(*key_iter);
-          value = Slice(*value_iter);
-          ParseInternalKey(key, &ikey);
-
-        } else{
-          // Only had one item to begin with (Put/Delete)
-          break;
-        }
-      }
-    }
-
-    // MergeUntil has moved input to the next entry
-    if (!current_entry_is_merging) {
-      input->Next();
-    }
-  }
-
-  return status;
-}
-
-void DBImpl::CallCompactionFilterV2(CompactionState* compact,
-  CompactionFilterV2* compaction_filter_v2) {
-  if (compact == nullptr || compaction_filter_v2 == nullptr) {
-    return;
-  }
-
-  std::vector<Slice> user_key_buf;
-  for (const auto& key : compact->ikey_buf_) {
-    user_key_buf.emplace_back(key.user_key);
-  }
-
-  // If the user has specified a compaction filter and the sequence
-  // number is greater than any external snapshot, then invoke the
-  // filter.
-  // If the return value of the compaction filter is true, replace
-  // the entry with a delete marker.
-  compact->to_delete_buf_ = compaction_filter_v2->Filter(
-      compact->compaction->level(),
-      user_key_buf, compact->existing_value_buf_,
-      &compact->new_value_buf_,
-      &compact->value_changed_buf_);
-
-  // new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all
-  // kv-pairs in this compaction run needs to be deleted.
-  assert(compact->to_delete_buf_.size() ==
-      compact->key_buf_.size());
-  assert(compact->to_delete_buf_.size() ==
-      compact->existing_value_buf_.size());
-  assert(compact->to_delete_buf_.size() ==
-      compact->value_changed_buf_.size());
-
-  int new_value_idx = 0;
-  for (unsigned int i = 0; i < compact->to_delete_buf_.size(); ++i) {
-    if (compact->to_delete_buf_[i]) {
-      // update the string buffer directly
-      // the Slice buffer points to the updated buffer
-      UpdateInternalKey(&compact->key_str_buf_[i][0],
-          compact->key_str_buf_[i].size(),
-          compact->ikey_buf_[i].sequence,
-          kTypeDeletion);
-
-      // no value associated with delete
-      compact->existing_value_buf_[i].clear();
-      RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER);
-    } else if (compact->value_changed_buf_[i]) {
-      compact->existing_value_buf_[i] =
-        Slice(compact->new_value_buf_[new_value_idx++]);
-    }
-  }  // for
-}
-
-Status DBImpl::DoCompactionWork(CompactionState* compact,
-                                DeletionState& deletion_state,
-                                LogBuffer* log_buffer) {
-  assert(compact);
-  compact->CleanupBatchBuffer();
-  compact->CleanupMergedBuffer();
-  bool prefix_initialized = false;
-
-  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
-  ColumnFamilyData* cfd = compact->compaction->column_family_data();
-  LogToBuffer(
-      log_buffer,
-      "[%s] Compacting %d@%d + %d@%d files, score %.2f slots available %d",
-      cfd->GetName().c_str(), compact->compaction->num_input_files(0),
-      compact->compaction->level(), compact->compaction->num_input_files(1),
-      compact->compaction->output_level(), compact->compaction->score(),
-      options_.max_background_compactions - bg_compaction_scheduled_);
-  char scratch[2345];
-  compact->compaction->Summary(scratch, sizeof(scratch));
-  LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n",
-              cfd->GetName().c_str(), scratch);
-
-  assert(cfd->current()->NumLevelFiles(compact->compaction->level()) > 0);
-  assert(compact->builder == nullptr);
-  assert(!compact->outfile);
-
-  SequenceNumber visible_at_tip = 0;
-  SequenceNumber earliest_snapshot;
-  SequenceNumber latest_snapshot = 0;
-  snapshots_.getAll(compact->existing_snapshots);
-  if (compact->existing_snapshots.size() == 0) {
-    // optimize for fast path if there are no snapshots
-    visible_at_tip = versions_->LastSequence();
-    earliest_snapshot = visible_at_tip;
-  } else {
-    latest_snapshot = compact->existing_snapshots.back();
-    // Add the current seqno as the 'latest' virtual
-    // snapshot to the end of this list.
-    compact->existing_snapshots.push_back(versions_->LastSequence());
-    earliest_snapshot = compact->existing_snapshots[0];
-  }
-
-  // Is this compaction producing files at the bottommost level?
-  bool bottommost_level = compact->compaction->BottomMostLevel();
-
-  // Allocate the output file numbers before we release the lock
-  AllocateCompactionOutputFileNumbers(compact);
-
-  // Release mutex while we're actually doing the compaction work
-  mutex_.Unlock();
-  log_buffer->FlushBufferToLog();
-
-  const uint64_t start_micros = env_->NowMicros();
-  unique_ptr<Iterator> input(versions_->MakeInputIterator(compact->compaction));
-  input->SeekToFirst();
-  shared_ptr<Iterator> backup_input(
-      versions_->MakeInputIterator(compact->compaction));
-  backup_input->SeekToFirst();
-
-  Status status;
-  ParsedInternalKey ikey;
-  std::unique_ptr<CompactionFilterV2> compaction_filter_from_factory_v2
-    = nullptr;
-  auto context = compact->GetFilterContext();
-  compaction_filter_from_factory_v2 =
-      cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2(
-          context);
-  auto compaction_filter_v2 =
-    compaction_filter_from_factory_v2.get();
-
-  // temp_backup_input always point to the start of the current buffer
-  // temp_backup_input = backup_input;
-  // iterate through input,
-  // 1) buffer ineligible keys and value keys into 2 separate buffers;
-  // 2) send value_buffer to compaction filter and alternate the values;
-  // 3) merge value_buffer with ineligible_value_buffer;
-  // 4) run the modified "compaction" using the old for loop.
-  if (compaction_filter_v2) {
-    while (backup_input->Valid() && !shutting_down_.Acquire_Load() &&
-           !cfd->IsDropped()) {
-      // FLUSH preempts compaction
-      // TODO(icanadi) this currently only checks if flush is necessary on
-      // compacting column family. we should also check if flush is necessary on
-      // other column families, too
-      imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer);
-
-      Slice key = backup_input->key();
-      Slice value = backup_input->value();
-
-      const SliceTransform* transformer =
-          cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor();
-      const auto key_prefix = transformer->Transform(key);
-      if (!prefix_initialized) {
-        compact->cur_prefix_ = key_prefix.ToString();
-        prefix_initialized = true;
-      }
-      if (!ParseInternalKey(key, &ikey)) {
-        // log error
-        Log(options_.info_log, "[%s] Failed to parse key: %s",
-            cfd->GetName().c_str(), key.ToString().c_str());
-        continue;
-      } else {
-        // If the prefix remains the same, keep buffering
-        if (key_prefix.compare(Slice(compact->cur_prefix_)) == 0) {
-          // Apply the compaction filter V2 to all the kv pairs sharing
-          // the same prefix
-          if (ikey.type == kTypeValue &&
-              (visible_at_tip || ikey.sequence > latest_snapshot)) {
-            // Buffer all keys sharing the same prefix for CompactionFilterV2
-            // Iterate through keys to check prefix
-            compact->BufferKeyValueSlices(key, value);
-          } else {
-            // buffer ineligible keys
-            compact->BufferOtherKeyValueSlices(key, value);
-          }
-          backup_input->Next();
-          continue;
-          // finish changing values for eligible keys
-        } else {
-          // Now prefix changes, this batch is done.
-          // Call compaction filter on the buffered values to change the value
-          if (compact->key_buf_.size() > 0) {
-            CallCompactionFilterV2(compact, compaction_filter_v2);
-          }
-          compact->cur_prefix_ = key_prefix.ToString();
-        }
-      }
-
-      // Merge this batch of data (values + ineligible keys)
-      compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
-
-      // Done buffering for the current prefix. Spit it out to disk
-      // Now just iterate through all the kv-pairs
-      status = ProcessKeyValueCompaction(
-          visible_at_tip,
-          earliest_snapshot,
-          latest_snapshot,
-          deletion_state,
-          bottommost_level,
-          imm_micros,
-          input.get(),
-          compact,
-          true,
-          log_buffer);
-
-      if (!status.ok()) {
-        break;
-      }
-
-      // After writing the kv-pairs, we can safely remove the reference
-      // to the string buffer and clean them up
-      compact->CleanupBatchBuffer();
-      compact->CleanupMergedBuffer();
-      // Buffer the key that triggers the mismatch in prefix
-      if (ikey.type == kTypeValue &&
-        (visible_at_tip || ikey.sequence > latest_snapshot)) {
-        compact->BufferKeyValueSlices(key, value);
-      } else {
-        compact->BufferOtherKeyValueSlices(key, value);
-      }
-      backup_input->Next();
-      if (!backup_input->Valid()) {
-        // If this is the single last value, we need to merge it.
-        if (compact->key_buf_.size() > 0) {
-          CallCompactionFilterV2(compact, compaction_filter_v2);
-        }
-        compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
-
-        status = ProcessKeyValueCompaction(
-            visible_at_tip,
-            earliest_snapshot,
-            latest_snapshot,
-            deletion_state,
-            bottommost_level,
-            imm_micros,
-            input.get(),
-            compact,
-            true,
-            log_buffer);
-
-        compact->CleanupBatchBuffer();
-        compact->CleanupMergedBuffer();
-      }
-    }  // done processing all prefix batches
-    // finish the last batch
-    if (compact->key_buf_.size() > 0) {
-      CallCompactionFilterV2(compact, compaction_filter_v2);
-    }
-    compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
-    status = ProcessKeyValueCompaction(
-        visible_at_tip,
-        earliest_snapshot,
-        latest_snapshot,
-        deletion_state,
-        bottommost_level,
-        imm_micros,
-        input.get(),
-        compact,
-        true,
-        log_buffer);
-  }  // checking for compaction filter v2
-
-  if (!compaction_filter_v2) {
-    status = ProcessKeyValueCompaction(
-      visible_at_tip,
-      earliest_snapshot,
-      latest_snapshot,
-      deletion_state,
-      bottommost_level,
-      imm_micros,
-      input.get(),
-      compact,
-      false,
-      log_buffer);
-  }
-
-  if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) {
-    status = Status::ShutdownInProgress(
-        "Database shutdown or Column family drop during compaction");
-  }
-  if (status.ok() && compact->builder != nullptr) {
-    status = FinishCompactionOutputFile(compact, input.get());
-  }
-  if (status.ok()) {
-    status = input->status();
-  }
-  input.reset();
-
-  if (!options_.disableDataSync) {
-    db_directory_->Fsync();
-  }
-
-  InternalStats::CompactionStats stats;
-  stats.micros = env_->NowMicros() - start_micros - imm_micros;
-  MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros);
-  stats.files_in_leveln = compact->compaction->num_input_files(0);
-  stats.files_in_levelnp1 = compact->compaction->num_input_files(1);
-
-  int num_output_files = compact->outputs.size();
-  if (compact->builder != nullptr) {
-    // An error occurred so ignore the last output.
-    assert(num_output_files > 0);
-    --num_output_files;
-  }
-  stats.files_out_levelnp1 = num_output_files;
-
-  for (int i = 0; i < compact->compaction->num_input_files(0); i++) {
-    stats.bytes_readn += compact->compaction->input(0, i)->file_size;
-    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
-               compact->compaction->input(0, i)->file_size);
-  }
-
-  for (int i = 0; i < compact->compaction->num_input_files(1); i++) {
-    stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size;
-    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
-               compact->compaction->input(1, i)->file_size);
-  }
-
-  for (int i = 0; i < num_output_files; i++) {
-    stats.bytes_written += compact->outputs[i].file_size;
-    RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES,
-               compact->outputs[i].file_size);
-  }
-
-  LogFlush(options_.info_log);
-  mutex_.Lock();
-  cfd->internal_stats()->AddCompactionStats(compact->compaction->output_level(),
-                                            stats);
-
-  // if there were any unused file number (mostly in case of
-  // compaction error), free up the entry from pending_putputs
-  ReleaseCompactionUnusedFileNumbers(compact);
-
-  if (status.ok()) {
-    status = InstallCompactionResults(compact, log_buffer);
-    InstallSuperVersion(cfd, deletion_state);
-  }
-  Version::LevelSummaryStorage tmp;
-  LogToBuffer(
-      log_buffer,
-      "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) "
-      "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
-      "write-amplify(%.1f) %s\n",
-      cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp),
-      (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) /
-          (double)stats.micros,
-      compact->compaction->output_level(), stats.files_in_leveln,
-      stats.files_in_levelnp1, stats.files_out_levelnp1,
-      stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
-      stats.bytes_written / 1048576.0,
-      (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
-          (double)stats.bytes_readn,
-      stats.bytes_written / (double)stats.bytes_readn,
-      status.ToString().c_str());
-
-  return status;
-}
-
 namespace {
 struct IterState {
-  IterState(DBImpl* db, port::Mutex* mu, SuperVersion* super_version)
-      : db(db), mu(mu), super_version(super_version) {}
+  IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version)
+      : db(_db), mu(_mu), super_version(_super_version) {}
 
   DBImpl* db;
-  port::Mutex* mu;
+  InstrumentedMutex* mu;
   SuperVersion* super_version;
 };
 
@@ -3118,37 +2495,43 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
   IterState* state = reinterpret_cast<IterState*>(arg1);
 
   if (state->super_version->Unref()) {
-    DBImpl::DeletionState deletion_state;
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    JobContext job_context(0);
 
     state->mu->Lock();
     state->super_version->Cleanup();
-    state->db->FindObsoleteFiles(deletion_state, false, true);
+    state->db->FindObsoleteFiles(&job_context, false, true);
     state->mu->Unlock();
 
     delete state->super_version;
-    if (deletion_state.HaveSomethingToDelete()) {
-      state->db->PurgeObsoleteFiles(deletion_state);
+    if (job_context.HaveSomethingToDelete()) {
+      state->db->PurgeObsoleteFiles(job_context);
     }
+    job_context.Clean();
   }
 
   delete state;
 }
 }  // namespace
 
-Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
                                       ColumnFamilyData* cfd,
-                                      SuperVersion* super_version) {
-  std::vector<Iterator*> iterator_list;
+                                      SuperVersion* super_version,
+                                      Arena* arena) {
+  Iterator* internal_iter;
+  assert(arena != nullptr);
+  // Need to create internal iterator from the arena.
+  MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena);
   // Collect iterator for mutable mem
-  iterator_list.push_back(super_version->mem->NewIterator(options));
+  merge_iter_builder.AddIterator(
+      super_version->mem->NewIterator(read_options, arena));
   // Collect all needed child iterators for immutable memtables
-  super_version->imm->AddIterators(options, &iterator_list);
+  super_version->imm->AddIterators(read_options, &merge_iter_builder);
   // Collect iterators for files in L0 - Ln
-  super_version->current->AddIterators(options, storage_options_,
-                                       &iterator_list);
-  Iterator* internal_iter = NewMergingIterator(
-      &cfd->internal_comparator(), &iterator_list[0], iterator_list.size());
-
+  super_version->current->AddIterators(read_options, env_options_,
+                                       &merge_iter_builder);
+  internal_iter = merge_iter_builder.Finish();
   IterState* cleanup = new IterState(this, &mutex_, super_version);
   internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
 
@@ -3159,64 +2542,88 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
   return default_cf_handle_;
 }
 
-Status DBImpl::Get(const ReadOptions& options,
+Status DBImpl::Get(const ReadOptions& read_options,
                    ColumnFamilyHandle* column_family, const Slice& key,
                    std::string* value) {
-  return GetImpl(options, column_family, key, value);
+  return GetImpl(read_options, column_family, key, value);
 }
 
-// DeletionState gets created and destructed outside of the lock -- we
+// JobContext gets created and destructed outside of the lock --
+// we
 // use this convinently to:
 // * malloc one SuperVersion() outside of the lock -- new_superversion
 // * delete SuperVersion()s outside of the lock -- superversions_to_free
 //
-// However, if InstallSuperVersion() gets called twice with the same,
-// deletion_state, we can't reuse the SuperVersion() that got malloced because
+// However, if InstallSuperVersion() gets called twice with the same
+// job_context, we can't reuse the SuperVersion() that got
+// malloced
+// because
 // first call already used it. In that rare case, we take a hit and create a
 // new SuperVersion() inside of the mutex. We do similar thing
 // for superversion_to_free
-void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd,
-                                 DeletionState& deletion_state) {
+void DBImpl::InstallSuperVersionBackground(
+    ColumnFamilyData* cfd, JobContext* job_context,
+    const MutableCFOptions& mutable_cf_options) {
+  mutex_.AssertHeld();
+  SuperVersion* old_superversion = InstallSuperVersion(
+      cfd, job_context->new_superversion, mutable_cf_options);
+  job_context->new_superversion = nullptr;
+  job_context->superversions_to_free.push_back(old_superversion);
+}
+
+SuperVersion* DBImpl::InstallSuperVersion(
+    ColumnFamilyData* cfd, SuperVersion* new_sv,
+    const MutableCFOptions& mutable_cf_options, bool dont_schedule_bg_work) {
   mutex_.AssertHeld();
-  // if new_superversion == nullptr, it means somebody already used it
-  SuperVersion* new_superversion =
-    (deletion_state.new_superversion != nullptr) ?
-    deletion_state.new_superversion : new SuperVersion();
-  SuperVersion* old_superversion =
-      cfd->InstallSuperVersion(new_superversion, &mutex_);
-  deletion_state.new_superversion = nullptr;
-  deletion_state.superversions_to_free.push_back(old_superversion);
+
+  // Update max_total_in_memory_state_
+  size_t old_memtable_size = 0;
+  auto* old_sv = cfd->GetSuperVersion();
+  if (old_sv) {
+    old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+                        old_sv->mutable_cf_options.max_write_buffer_number;
+  }
+
+  auto* old = cfd->InstallSuperVersion(
+      new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);
+
+  // Whenever we install new SuperVersion, we might need to issue new flushes or
+  // compactions. dont_schedule_bg_work is true when scheduling from write
+  // thread and we don't want to add additional overhead. Callers promise to
+  // call SchedulePendingFlush() and MaybeScheduleFlushOrCompaction() eventually
+  if (!dont_schedule_bg_work) {
+    SchedulePendingFlush(cfd);
+    SchedulePendingCompaction(cfd);
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  // Update max_total_in_memory_state_
+  max_total_in_memory_state_ =
+      max_total_in_memory_state_ - old_memtable_size +
+      mutable_cf_options.write_buffer_size *
+      mutable_cf_options.max_write_buffer_number;
+  return old;
 }
 
-Status DBImpl::GetImpl(const ReadOptions& options,
+Status DBImpl::GetImpl(const ReadOptions& read_options,
                        ColumnFamilyHandle* column_family, const Slice& key,
                        std::string* value, bool* value_found) {
-  StopWatch sw(env_, options_.statistics.get(), DB_GET, false);
-  PERF_TIMER_AUTO(get_snapshot_time);
+  StopWatch sw(env_, stats_, DB_GET);
+  PERF_TIMER_GUARD(get_snapshot_time);
 
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
 
   SequenceNumber snapshot;
-  if (options.snapshot != nullptr) {
-    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  if (read_options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(
+        read_options.snapshot)->number_;
   } else {
     snapshot = versions_->LastSequence();
   }
 
   // Acquire SuperVersion
-  SuperVersion* sv = nullptr;
-  // TODO(ljin): consider using GetReferencedSuperVersion() directly
-  if (LIKELY(options_.allow_thread_local)) {
-    sv = cfd->GetThreadLocalSuperVersion(&mutex_);
-  } else {
-    mutex_.Lock();
-    sv = cfd->GetSuperVersion()->Ref();
-    mutex_.Unlock();
-  }
-
-  bool have_stat_update = false;
-  Version::GetStats stats;
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
 
   // Prepare to store a list of merge operations if merge occurs.
   MergeContext merge_context;
@@ -3227,70 +2634,44 @@ Status DBImpl::GetImpl(const ReadOptions& options,
   // merge_operands will contain the sequence of merges in the latter case.
   LookupKey lkey(key, snapshot);
   PERF_TIMER_STOP(get_snapshot_time);
-  if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) {
+
+  if (sv->mem->Get(lkey, value, &s, &merge_context)) {
     // Done
-    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
-  } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) {
+    RecordTick(stats_, MEMTABLE_HIT);
+  } else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
     // Done
-    RecordTick(options_.statistics.get(), MEMTABLE_HIT);
+    RecordTick(stats_, MEMTABLE_HIT);
   } else {
-    PERF_TIMER_START(get_from_output_files_time);
-
-    sv->current->Get(options, lkey, value, &s, &merge_context, &stats,
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    sv->current->Get(read_options, lkey, value, &s, &merge_context,
                      value_found);
-    have_stat_update = true;
-    PERF_TIMER_STOP(get_from_output_files_time);
-    RecordTick(options_.statistics.get(), MEMTABLE_MISS);
+    RecordTick(stats_, MEMTABLE_MISS);
   }
 
-  PERF_TIMER_START(get_post_process_time);
-
-  if (!cfd->options()->disable_seek_compaction && have_stat_update) {
-    mutex_.Lock();
-    if (sv->current->UpdateStats(stats)) {
-      MaybeScheduleFlushOrCompaction();
-    }
-    mutex_.Unlock();
-  }
+  {
+    PERF_TIMER_GUARD(get_post_process_time);
 
-  bool unref_sv = true;
-  if (LIKELY(options_.allow_thread_local)) {
-    unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
-  }
+    ReturnAndCleanupSuperVersion(cfd, sv);
 
-  if (unref_sv) {
-    // Release SuperVersion
-    if (sv->Unref()) {
-      mutex_.Lock();
-      sv->Cleanup();
-      mutex_.Unlock();
-      delete sv;
-      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
-    }
-    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_RELEASES);
+    RecordTick(stats_, NUMBER_KEYS_READ);
+    RecordTick(stats_, BYTES_READ, value->size());
   }
-
-  RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
-  RecordTick(options_.statistics.get(), BYTES_READ, value->size());
-  PERF_TIMER_STOP(get_post_process_time);
   return s;
 }
 
 std::vector<Status> DBImpl::MultiGet(
-    const ReadOptions& options,
+    const ReadOptions& read_options,
     const std::vector<ColumnFamilyHandle*>& column_family,
     const std::vector<Slice>& keys, std::vector<std::string>* values) {
 
-  StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false);
-  PERF_TIMER_AUTO(get_snapshot_time);
+  StopWatch sw(env_, stats_, DB_MULTIGET);
+  PERF_TIMER_GUARD(get_snapshot_time);
 
   SequenceNumber snapshot;
 
   struct MultiGetColumnFamilyData {
     ColumnFamilyData* cfd;
     SuperVersion* super_version;
-    Version::GetStats stats;
-    bool have_stat_update = false;
   };
   std::unordered_map<uint32_t, MultiGetColumnFamilyData*> multiget_cf_data;
   // fill up and allocate outside of mutex
@@ -3305,8 +2686,9 @@ std::vector<Status> DBImpl::MultiGet(
   }
 
   mutex_.Lock();
-  if (options.snapshot != nullptr) {
-    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  if (read_options.snapshot != nullptr) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(
+        read_options.snapshot)->number_;
   } else {
     snapshot = versions_->LastSequence();
   }
@@ -3343,17 +2725,14 @@ std::vector<Status> DBImpl::MultiGet(
     assert(mgd_iter != multiget_cf_data.end());
     auto mgd = mgd_iter->second;
     auto super_version = mgd->super_version;
-    auto cfd = mgd->cfd;
-    if (super_version->mem->Get(lkey, value, &s, merge_context,
-                                *cfd->options())) {
+    if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
       // Done
-    } else if (super_version->imm->Get(lkey, value, &s, merge_context,
-                                       *cfd->options())) {
+    } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
       // Done
     } else {
-      super_version->current->Get(options, lkey, value, &s, &merge_context,
-                                  &mgd->stats);
-      mgd->have_stat_update = true;
+      PERF_TIMER_GUARD(get_from_output_files_time);
+      super_version->current->Get(read_options, lkey, value, &s,
+                                  &merge_context);
     }
 
     if (s.ok()) {
@@ -3362,27 +2741,18 @@ std::vector<Status> DBImpl::MultiGet(
   }
 
   // Post processing (decrement reference counts and record statistics)
-  PERF_TIMER_START(get_post_process_time);
+  PERF_TIMER_GUARD(get_post_process_time);
   autovector<SuperVersion*> superversions_to_delete;
 
-  bool schedule_flush_or_compaction = false;
+  // TODO(icanadi) do we need lock here or just around Cleanup()?
   mutex_.Lock();
   for (auto mgd_iter : multiget_cf_data) {
     auto mgd = mgd_iter.second;
-    auto cfd = mgd->cfd;
-    if (!cfd->options()->disable_seek_compaction && mgd->have_stat_update) {
-      if (mgd->super_version->current->UpdateStats(mgd->stats)) {
-        schedule_flush_or_compaction = true;
-      }
-    }
     if (mgd->super_version->Unref()) {
       mgd->super_version->Cleanup();
       superversions_to_delete.push_back(mgd->super_version);
     }
   }
-  if (schedule_flush_or_compaction) {
-    MaybeScheduleFlushOrCompaction();
-  }
   mutex_.Unlock();
 
   for (auto td : superversions_to_delete) {
@@ -3392,48 +2762,74 @@ std::vector<Status> DBImpl::MultiGet(
     delete mgd.second;
   }
 
-  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
-  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, num_keys);
-  RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
   PERF_TIMER_STOP(get_post_process_time);
 
   return stat_list;
 }
 
-Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
                                   const std::string& column_family_name,
                                   ColumnFamilyHandle** handle) {
+  Status s;
   *handle = nullptr;
-  MutexLock l(&mutex_);
+  {
+    InstrumentedMutexLock l(&mutex_);
 
-  if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
-      nullptr) {
-    return Status::InvalidArgument("Column family already exists");
-  }
-  VersionEdit edit;
-  edit.AddColumnFamily(column_family_name);
-  uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
-  edit.SetColumnFamily(new_id);
-  edit.SetLogNumber(logfile_number_);
-  edit.SetComparatorName(options.comparator->Name());
-
-  // LogAndApply will both write the creation in MANIFEST and create
-  // ColumnFamilyData object
-  Status s = versions_->LogAndApply(nullptr, &edit, &mutex_,
-                                    db_directory_.get(), false, &options);
+    if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+        nullptr) {
+      return Status::InvalidArgument("Column family already exists");
+    }
+    VersionEdit edit;
+    edit.AddColumnFamily(column_family_name);
+    uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+    edit.SetColumnFamily(new_id);
+    edit.SetLogNumber(logfile_number_);
+    edit.SetComparatorName(cf_options.comparator->Name());
+
+    // LogAndApply will both write the creation in MANIFEST and create
+    // ColumnFamilyData object
+    Options opt(db_options_, cf_options);
+    {  // write thread
+      WriteThread::Writer w(&mutex_);
+      s = write_thread_.EnterWriteThread(&w, 0);
+      assert(s.ok() && !w.done);  // No timeout and nobody should do our job
+      // LogAndApply will both write the creation in MANIFEST and create
+      // ColumnFamilyData object
+      s = versions_->LogAndApply(
+          nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit,
+          &mutex_, directories_.GetDbDir(), false, &cf_options);
+      write_thread_.ExitWriteThread(&w, &w, s);
+    }
+    if (s.ok()) {
+      single_column_family_mode_ = false;
+      auto* cfd =
+          versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+      assert(cfd != nullptr);
+      delete InstallSuperVersion(
+          cfd, nullptr, *cfd->GetLatestMutableCFOptions());
+
+      if (!cfd->mem()->IsSnapshotSupported()) {
+        is_snapshot_supported_ = false;
+      }
+
+      *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "Created column family [%s] (ID %u)",
+          column_family_name.c_str(), (unsigned)cfd->GetID());
+    } else {
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "Creating column family [%s] FAILED -- %s",
+          column_family_name.c_str(), s.ToString().c_str());
+    }
+  }  // InstrumentedMutexLock l(&mutex_)
+
+  // this is outside the mutex
   if (s.ok()) {
-    auto cfd =
-        versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
-    assert(cfd != nullptr);
-    delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_);
-    *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
-    Log(options_.info_log, "Created column family [%s] (ID %u)",
-        column_family_name.c_str(), (unsigned)cfd->GetID());
-    max_total_in_memory_state_ += cfd->options()->write_buffer_size *
-                                  cfd->options()->max_write_buffer_number;
-  } else {
-    Log(options_.info_log, "Creating column family [%s] FAILED -- %s",
-        column_family_name.c_str(), s.ToString().c_str());
+    NewThreadStatusCfInfo(
+        reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
   }
   return s;
 }
@@ -3445,154 +2841,260 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
     return Status::InvalidArgument("Can't drop default column family");
   }
 
+  bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
   VersionEdit edit;
   edit.DropColumnFamily();
   edit.SetColumnFamily(cfd->GetID());
 
   Status s;
   {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
     if (cfd->IsDropped()) {
       s = Status::InvalidArgument("Column family already dropped!\n");
     }
     if (s.ok()) {
-      s = versions_->LogAndApply(cfd, &edit, &mutex_);
+      // we drop column family from a single write thread
+      WriteThread::Writer w(&mutex_);
+      s = write_thread_.EnterWriteThread(&w, 0);
+      assert(s.ok() && !w.done);  // No timeout and nobody should do our job
+      s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                 &edit, &mutex_);
+      write_thread_.ExitWriteThread(&w, &w, s);
+    }
+
+    if (!cf_support_snapshot) {
+      // Dropped Column Family doesn't support snapshot. Need to recalculate
+      // is_snapshot_supported_.
+      bool new_is_snapshot_supported = true;
+      for (auto c : *versions_->GetColumnFamilySet()) {
+        if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+          new_is_snapshot_supported = false;
+          break;
+        }
+      }
+      is_snapshot_supported_ = new_is_snapshot_supported;
     }
   }
 
   if (s.ok()) {
+    // Note that here we erase the associated cf_info of the to-be-dropped
+    // cfd before its ref-count goes to zero to avoid having to erase cf_info
+    // later inside db_mutex.
+    EraseThreadStatusCfInfo(cfd);
     assert(cfd->IsDropped());
-    max_total_in_memory_state_ -= cfd->options()->write_buffer_size *
-                                  cfd->options()->max_write_buffer_number;
-    Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID());
-    // Flush the memtables. This will make all WAL files referencing dropped
-    // column family to be obsolete. They will be deleted once user deletes
-    // column family handle
-    Write(WriteOptions(), nullptr);  // ignore error
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+                                  mutable_cf_options->max_write_buffer_number;
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "Dropped column family with id %u\n",
+        cfd->GetID());
   } else {
-    Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n",
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "Dropping column family with id %u FAILED -- %s\n",
         cfd->GetID(), s.ToString().c_str());
   }
 
   return s;
 }
 
-bool DBImpl::KeyMayExist(const ReadOptions& options,
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
                          ColumnFamilyHandle* column_family, const Slice& key,
                          std::string* value, bool* value_found) {
   if (value_found != nullptr) {
     // falsify later if key-may-exist but can't fetch value
     *value_found = true;
   }
-  ReadOptions roptions = options;
+  ReadOptions roptions = read_options;
   roptions.read_tier = kBlockCacheTier; // read from block cache only
   auto s = GetImpl(roptions, column_family, key, value, value_found);
 
-  // If options.block_cache != nullptr and the index block of the table didn't
+  // If block_cache is enabled and the index block of the table didn't
   // not present in block_cache, the return value will be Status::Incomplete.
   // In this case, key may still exist in the table.
   return s.ok() || s.IsIncomplete();
 }
 
-Iterator* DBImpl::NewIterator(const ReadOptions& options,
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
                               ColumnFamilyHandle* column_family) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
 
-  Iterator* iter;
-  if (options.tailing) {
+  XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
+             reinterpret_cast<DBImpl*>(this),
+             const_cast<ReadOptions*>(&read_options), is_snapshot_supported_);
+  if (read_options.managed) {
+#ifdef ROCKSDB_LITE
+    // not supported in lite version
+    return NewErrorIterator(Status::InvalidArgument(
+        "Managed Iterators not supported in RocksDBLite."));
+#else
+    if ((read_options.tailing) || (read_options.snapshot != nullptr) ||
+        (is_snapshot_supported_)) {
+      return new ManagedIterator(this, read_options, cfd);
+    }
+    // Managed iter not supported
+    return NewErrorIterator(Status::InvalidArgument(
+        "Managed Iterators not supported without snapshots."));
+#endif
+  } else if (read_options.tailing) {
 #ifdef ROCKSDB_LITE
     // not supported in lite version
     return nullptr;
 #else
-    iter = new TailingIterator(env_, this, options, cfd);
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
+    auto iter = new ForwardIterator(this, read_options, cfd, sv);
+    return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter,
+        kMaxSequenceNumber,
+        sv->mutable_cf_options.max_sequential_skip_in_iterations,
+        read_options.iterate_upper_bound);
 #endif
   } else {
     SequenceNumber latest_snapshot = versions_->LastSequence();
-    SuperVersion* sv = nullptr;
-    sv = cfd->GetReferencedSuperVersion(&mutex_);
-
-    iter = NewInternalIterator(options, cfd, sv);
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
 
     auto snapshot =
-        options.snapshot != nullptr
-            ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+        read_options.snapshot != nullptr
+            ? reinterpret_cast<const SnapshotImpl*>(
+                read_options.snapshot)->number_
             : latest_snapshot;
-    iter = NewDBIterator(env_, *cfd->options(),
-                         cfd->user_comparator(), iter, snapshot);
-  }
 
-  return iter;
+    // Try to generate a DB iterator tree in continuous memory area to be
+    // cache friendly. Here is an example of result:
+    // +-------------------------------+
+    // |                               |
+    // | ArenaWrappedDBIter            |
+    // |  +                            |
+    // |  +---> Inner Iterator   ------------+
+    // |  |                            |     |
+    // |  |    +-- -- -- -- -- -- -- --+     |
+    // |  +--- | Arena                 |     |
+    // |       |                       |     |
+    // |          Allocated Memory:    |     |
+    // |       |   +-------------------+     |
+    // |       |   | DBIter            | <---+
+    // |           |  +                |
+    // |       |   |  +-> iter_  ------------+
+    // |       |   |                   |     |
+    // |       |   +-------------------+     |
+    // |       |   | MergingIterator   | <---+
+    // |           |  +                |
+    // |       |   |  +->child iter1  ------------+
+    // |       |   |  |                |          |
+    // |           |  +->child iter2  ----------+ |
+    // |       |   |  |                |        | |
+    // |       |   |  +->child iter3  --------+ | |
+    // |           |                   |      | | |
+    // |       |   +-------------------+      | | |
+    // |       |   | Iterator1         | <--------+
+    // |       |   +-------------------+      | |
+    // |       |   | Iterator2         | <------+
+    // |       |   +-------------------+      |
+    // |       |   | Iterator3         | <----+
+    // |       |   +-------------------+
+    // |       |                       |
+    // +-------+-----------------------+
+    //
+    // ArenaWrappedDBIter inlines an arena area where all the iterartor in the
+    // the iterator tree is allocated in the order of being accessed when
+    // querying.
+    // Laying out the iterators in the order of being accessed makes it more
+    // likely that any iterator pointer is close to the iterator it points to so
+    // that they are likely to be in the same cache line and/or page.
+    ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+        env_, *cfd->ioptions(), cfd->user_comparator(),
+        snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations,
+        read_options.iterate_upper_bound);
+
+    Iterator* internal_iter =
+        NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
+    db_iter->SetIterUnderDBIter(internal_iter);
+
+    return db_iter;
+  }
+  // To stop compiler from complaining
+  return nullptr;
 }
 
 Status DBImpl::NewIterators(
-    const ReadOptions& options,
+    const ReadOptions& read_options,
     const std::vector<ColumnFamilyHandle*>& column_families,
     std::vector<Iterator*>* iterators) {
   iterators->clear();
   iterators->reserve(column_families.size());
-  SequenceNumber latest_snapshot = 0;
-  std::vector<SuperVersion*> super_versions;
-  super_versions.reserve(column_families.size());
-
-  if (!options.tailing) {
-    mutex_.Lock();
-    latest_snapshot = versions_->LastSequence();
+  XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
+             reinterpret_cast<DBImpl*>(this),
+             const_cast<ReadOptions*>(&read_options), is_snapshot_supported_);
+  if (read_options.managed) {
+#ifdef ROCKSDB_LITE
+    return Status::InvalidArgument(
+        "Managed interator not supported in RocksDB lite");
+#else
+    if ((!read_options.tailing) && (read_options.snapshot == nullptr) &&
+        (!is_snapshot_supported_)) {
+      return Status::InvalidArgument(
+          "Managed interator not supported without snapshots");
+    }
     for (auto cfh : column_families) {
       auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
-      super_versions.push_back(cfd->GetSuperVersion()->Ref());
+      auto iter = new ManagedIterator(this, read_options, cfd);
+      iterators->push_back(iter);
     }
-    mutex_.Unlock();
-  }
-
-  if (options.tailing) {
+#endif
+  } else if (read_options.tailing) {
 #ifdef ROCKSDB_LITE
     return Status::InvalidArgument(
         "Tailing interator not supported in RocksDB lite");
 #else
     for (auto cfh : column_families) {
       auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
-      iterators->push_back(new TailingIterator(env_, this, options, cfd));
+      SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
+      auto iter = new ForwardIterator(this, read_options, cfd, sv);
+      iterators->push_back(
+          NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter,
+              kMaxSequenceNumber,
+              sv->mutable_cf_options.max_sequential_skip_in_iterations));
     }
 #endif
   } else {
+    SequenceNumber latest_snapshot = versions_->LastSequence();
+
     for (size_t i = 0; i < column_families.size(); ++i) {
-      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_families[i]);
-      auto cfd = cfh->cfd();
+      auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+          column_families[i])->cfd();
+      SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
 
       auto snapshot =
-          options.snapshot != nullptr
-              ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+          read_options.snapshot != nullptr
+              ? reinterpret_cast<const SnapshotImpl*>(
+                  read_options.snapshot)->number_
               : latest_snapshot;
 
-      auto iter = NewInternalIterator(options, cfd, super_versions[i]);
-      iter = NewDBIterator(env_, *cfd->options(),
-                           cfd->user_comparator(), iter, snapshot);
-      iterators->push_back(iter);
+      ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+          env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
+          sv->mutable_cf_options.max_sequential_skip_in_iterations);
+      Iterator* internal_iter = NewInternalIterator(
+          read_options, cfd, sv, db_iter->GetArena());
+      db_iter->SetIterUnderDBIter(internal_iter);
+      iterators->push_back(db_iter);
     }
   }
 
   return Status::OK();
 }
 
-bool DBImpl::IsSnapshotSupported() const {
-  for (auto cfd : *versions_->GetColumnFamilySet()) {
-    if (!cfd->mem()->IsSnapshotSupported()) {
-      return false;
-    }
-  }
-  return true;
-}
-
 const Snapshot* DBImpl::GetSnapshot() {
+  int64_t unix_time = 0;
+  env_->GetCurrentTime(&unix_time);  // Ignore error
+
+  InstrumentedMutexLock l(&mutex_);
   // returns null if the underlying memtable does not support snapshot.
-  if (!IsSnapshotSupported()) return nullptr;
-  MutexLock l(&mutex_);
-  return snapshots_.New(versions_->LastSequence());
+  if (!is_snapshot_supported_) return nullptr;
+  return snapshots_.New(versions_->LastSequence(), unix_time);
 }
 
 void DBImpl::ReleaseSnapshot(const Snapshot* s) {
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
   snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
 }
 
@@ -3605,87 +3107,156 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
 Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
                      const Slice& key, const Slice& val) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
-  if (!cfh->cfd()->options()->merge_operator) {
+  if (!cfh->cfd()->ioptions()->merge_operator) {
     return Status::NotSupported("Provide a merge_operator when opening DB");
   } else {
     return DB::Merge(o, column_family, key, val);
   }
 }
 
-Status DBImpl::Delete(const WriteOptions& options,
+Status DBImpl::Delete(const WriteOptions& write_options,
                       ColumnFamilyHandle* column_family, const Slice& key) {
-  return DB::Delete(options, column_family, key);
+  return DB::Delete(write_options, column_family, key);
 }
 
-Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
-  PERF_TIMER_AUTO(write_pre_and_post_process_time);
-  Writer w(&mutex_);
+Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+  if (my_batch == nullptr) {
+    return Status::Corruption("Batch is nullptr!");
+  }
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  WriteThread::Writer w(&mutex_);
   w.batch = my_batch;
-  w.sync = options.sync;
-  w.disableWAL = options.disableWAL;
+  w.sync = write_options.sync;
+  w.disableWAL = write_options.disableWAL;
+  w.in_batch_group = false;
   w.done = false;
+  w.timeout_hint_us = write_options.timeout_hint_us;
 
-  StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false);
-  mutex_.Lock();
-  writers_.push_back(&w);
-  while (!w.done && &w != writers_.front()) {
-    w.cv.Wait();
+  uint64_t expiration_time = 0;
+  bool has_timeout = false;
+  if (w.timeout_hint_us == 0) {
+    w.timeout_hint_us = WriteThread::kNoTimeOut;
+  } else {
+    expiration_time = env_->NowMicros() + w.timeout_hint_us;
+    has_timeout = true;
+  }
+
+  if (!write_options.disableWAL) {
+    RecordTick(stats_, WRITE_WITH_WAL);
   }
 
-  if (!options.disableWAL) {
-    RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1);
+  WriteContext context;
+  mutex_.Lock();
+
+  if (!write_options.disableWAL) {
+    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1);
   }
 
-  if (w.done) {
+  Status status = write_thread_.EnterWriteThread(&w, expiration_time);
+  assert(status.ok() || status.IsTimedOut());
+  if (status.IsTimedOut()) {
+    mutex_.Unlock();
+    RecordTick(stats_, WRITE_TIMEDOUT);
+    return Status::TimedOut();
+  }
+  if (w.done) {  // write was done by someone else
+    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
+                                           1);
     mutex_.Unlock();
-    RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1);
+    RecordTick(stats_, WRITE_DONE_BY_OTHER);
     return w.status;
-  } else {
-    RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
   }
 
-  uint64_t flush_column_family_if_log_file = 0;
-  uint64_t max_total_wal_size = (options_.max_total_wal_size == 0)
-                                    ? 2 * max_total_in_memory_state_
-                                    : options_.max_total_wal_size;
-  if (alive_log_files_.begin()->getting_flushed == false &&
+  RecordTick(stats_, WRITE_DONE_BY_SELF);
+  default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
+
+  // Once reaches this point, the current writer "w" will try to do its write
+  // job.  It may also pick up some of the remaining writers in the "writers_"
+  // when it finds suitable, and finish them in the same write batch.
+  // This is how a write job could be done by the other writer.
+  assert(!single_column_family_mode_ ||
+         versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
+
+  uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0)
+                                    ? 4 * max_total_in_memory_state_
+                                    : db_options_.max_total_wal_size;
+  if (UNLIKELY(!single_column_family_mode_) &&
+      alive_log_files_.begin()->getting_flushed == false &&
       total_log_size_ > max_total_wal_size) {
-    flush_column_family_if_log_file = alive_log_files_.begin()->number;
+    uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number;
     alive_log_files_.begin()->getting_flushed = true;
-    Log(options_.info_log,
-        "Flushing all column families with data in WAL number %" PRIu64,
-        flush_column_family_if_log_file);
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "Flushing all column families with data in WAL number %" PRIu64
+        ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+        flush_column_family_if_log_file, total_log_size_, max_total_wal_size);
+    // no need to refcount because drop is happening in write thread, so can't
+    // happen while we're in the write thread
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->GetLogNumber() <= flush_column_family_if_log_file) {
+        status = SetNewMemtableAndNewLogFile(cfd, &context);
+        if (!status.ok()) {
+          break;
+        }
+        cfd->imm()->FlushRequested();
+        SchedulePendingFlush(cfd);
+        context.schedule_bg_work_ = true;
+      }
+    }
+  } else if (UNLIKELY(write_buffer_.ShouldFlush())) {
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "Flushing all column families. Write buffer is using %" PRIu64
+        " bytes out of a total of %" PRIu64 ".",
+        write_buffer_.memory_usage(), write_buffer_.buffer_size());
+    // no need to refcount because drop is happening in write thread, so can't
+    // happen while we're in the write thread
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (!cfd->mem()->IsEmpty()) {
+        status = SetNewMemtableAndNewLogFile(cfd, &context);
+        if (!status.ok()) {
+          break;
+        }
+        cfd->imm()->FlushRequested();
+        SchedulePendingFlush(cfd);
+        context.schedule_bg_work_ = true;
+      }
+    }
+    MaybeScheduleFlushOrCompaction();
   }
 
-  Status status;
-  // refcounting cfd in iteration
-  bool dead_cfd = false;
-  autovector<SuperVersion*> superversions_to_free;
-  autovector<log::Writer*> logs_to_free;
-  for (auto cfd : *versions_->GetColumnFamilySet()) {
-    cfd->Ref();
-    bool force_flush = my_batch == nullptr ||
-                       (flush_column_family_if_log_file != 0 &&
-                        cfd->GetLogNumber() <= flush_column_family_if_log_file);
-    // May temporarily unlock and wait.
-    status = MakeRoomForWrite(cfd, force_flush, &superversions_to_free,
-                              &logs_to_free);
-    if (cfd->Unref()) {
-      dead_cfd = true;
-    }
-    if (!status.ok()) {
-      break;
+  if (UNLIKELY(status.ok() && !bg_error_.ok())) {
+    status = bg_error_;
+  }
+
+  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+    status = ScheduleFlushes(&context);
+  }
+
+  if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+                               write_controller_.GetDelay() > 0))) {
+    // If writer is stopped, we need to get it going,
+    // so schedule flushes/compactions
+    if (context.schedule_bg_work_) {
+      MaybeScheduleFlushOrCompaction();
     }
+    status = DelayWrite(expiration_time);
   }
-  if (dead_cfd) {
-    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+  if (UNLIKELY(status.ok() && has_timeout &&
+               env_->NowMicros() > expiration_time)) {
+    status = Status::TimedOut();
   }
 
   uint64_t last_sequence = versions_->LastSequence();
-  Writer* last_writer = &w;
-  if (status.ok() && my_batch != nullptr) {  // nullptr batch is for compactions
+  WriteThread::Writer* last_writer = &w;
+  if (status.ok()) {
     autovector<WriteBatch*> write_batch_group;
-    BuildBatchGroup(&last_writer, &write_batch_group);
+    write_thread_.BuildBatchGroup(&last_writer, &write_batch_group);
 
     // Add to log and apply to memtable.  We can release the lock
     // during this phase since &w is currently responsible for logging
@@ -3707,380 +3278,234 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
       WriteBatchInternal::SetSequence(updates, current_sequence);
       int my_batch_count = WriteBatchInternal::Count(updates);
       last_sequence += my_batch_count;
+      const uint64_t batch_size = WriteBatchInternal::ByteSize(updates);
       // Record statistics
-      RecordTick(options_.statistics.get(),
-                 NUMBER_KEYS_WRITTEN, my_batch_count);
-      RecordTick(options_.statistics.get(),
-                 BYTES_WRITTEN,
-                 WriteBatchInternal::ByteSize(updates));
-      if (options.disableWAL) {
+      RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count);
+      RecordTick(stats_, BYTES_WRITTEN, batch_size);
+      if (write_options.disableWAL) {
         flush_on_destroy_ = true;
       }
       PERF_TIMER_STOP(write_pre_and_post_process_time);
 
-      if (!options.disableWAL) {
-        PERF_TIMER_START(write_wal_time);
+      uint64_t log_size = 0;
+      if (!write_options.disableWAL) {
+        PERF_TIMER_GUARD(write_wal_time);
         Slice log_entry = WriteBatchInternal::Contents(updates);
         status = log_->AddRecord(log_entry);
         total_log_size_ += log_entry.size();
         alive_log_files_.back().AddSize(log_entry.size());
         log_empty_ = false;
-        RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1);
-        RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size());
-        if (status.ok() && options.sync) {
-          if (options_.use_fsync) {
-            StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
+        log_size = log_entry.size();
+        RecordTick(stats_, WAL_FILE_BYTES, log_size);
+        if (status.ok() && write_options.sync) {
+          RecordTick(stats_, WAL_FILE_SYNCED);
+          StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
+          if (db_options_.use_fsync) {
             status = log_->file()->Fsync();
           } else {
-            StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
             status = log_->file()->Sync();
           }
+          if (status.ok() && !log_dir_synced_) {
+            // We only sync WAL directory the first time WAL syncing is
+            // requested, so that in case users never turn on WAL sync,
+            // we can avoid the disk I/O in the write code path.
+            status = directories_.GetWalDir()->Fsync();
+          }
+          log_dir_synced_ = true;
         }
-        PERF_TIMER_STOP(write_wal_time);
       }
       if (status.ok()) {
-        PERF_TIMER_START(write_memtable_time);
-        status = WriteBatchInternal::InsertInto(
-            updates, column_family_memtables_.get(), false, 0, this, false);
-        PERF_TIMER_STOP(write_memtable_time);
+        PERF_TIMER_GUARD(write_memtable_time);
 
-        if (!status.ok()) {
-          // Iteration failed (either in-memory writebatch corruption (very
-          // bad), or the client specified invalid column family). Return
-          // failure.
-          // Note that existing logic was not sound. Any partial failure writing
-          // into the memtable would result in a state that some write ops might
-          // have succeeded in memtable but Status reports error for all writes.
-          return status;
-        }
-        SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
-                       last_sequence);
+        status = WriteBatchInternal::InsertInto(
+            updates, column_family_memtables_.get(),
+            write_options.ignore_missing_column_families, 0, this, false);
+        // A non-OK status here indicates iteration failure (either in-memory
+        // writebatch corruption (very bad), or the client specified invalid
+        // column family).  This will later on trigger bg_error_.
+        //
+        // Note that existing logic was not sound. Any partial failure writing
+        // into the memtable would result in a state that some write ops might
+        // have succeeded in memtable but Status reports error for all writes.
+
+        SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
       }
       PERF_TIMER_START(write_pre_and_post_process_time);
-      if (updates == &tmp_batch_) tmp_batch_.Clear();
+      if (updates == &tmp_batch_) {
+        tmp_batch_.Clear();
+      }
       mutex_.Lock();
+      // internal stats
+      default_cf_internal_stats_->AddDBStats(
+          InternalStats::BYTES_WRITTEN, batch_size);
+      default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN,
+                                             my_batch_count);
+      if (!write_options.disableWAL) {
+        default_cf_internal_stats_->AddDBStats(
+            InternalStats::WAL_FILE_SYNCED, 1);
+        default_cf_internal_stats_->AddDBStats(
+            InternalStats::WAL_FILE_BYTES, log_size);
+      }
       if (status.ok()) {
         versions_->SetLastSequence(last_sequence);
       }
     }
   }
-  if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) {
+  if (db_options_.paranoid_checks && !status.ok() &&
+      !status.IsTimedOut() && bg_error_.ok()) {
     bg_error_ = status; // stop compaction & fail any further writes
   }
 
-  while (true) {
-    Writer* ready = writers_.front();
-    writers_.pop_front();
-    if (ready != &w) {
-      ready->status = status;
-      ready->done = true;
-      ready->cv.Signal();
-    }
-    if (ready == last_writer) break;
-  }
+  write_thread_.ExitWriteThread(&w, last_writer, status);
 
-  // Notify new head of write queue
-  if (!writers_.empty()) {
-    writers_.front()->cv.Signal();
+  if (context.schedule_bg_work_) {
+    MaybeScheduleFlushOrCompaction();
   }
   mutex_.Unlock();
 
-  for (auto& sv : superversions_to_free) {
-    delete sv;
-  }
-  for (auto& log : logs_to_free) {
-    delete log;
+  if (status.IsTimedOut()) {
+    RecordTick(stats_, WRITE_TIMEDOUT);
   }
 
-  PERF_TIMER_STOP(write_pre_and_post_process_time);
   return status;
 }
 
-// REQUIRES: Writer list must be non-empty
-// REQUIRES: First writer must have a non-nullptr batch
-void DBImpl::BuildBatchGroup(Writer** last_writer,
-                             autovector<WriteBatch*>* write_batch_group) {
-  assert(!writers_.empty());
-  Writer* first = writers_.front();
-  assert(first->batch != nullptr);
-
-  size_t size = WriteBatchInternal::ByteSize(first->batch);
-  write_batch_group->push_back(first->batch);
-
-  // Allow the group to grow up to a maximum size, but if the
-  // original write is small, limit the growth so we do not slow
-  // down the small write too much.
-  size_t max_size = 1 << 20;
-  if (size <= (128<<10)) {
-    max_size = size + (128<<10);
-  }
-
-  *last_writer = first;
-  std::deque<Writer*>::iterator iter = writers_.begin();
-  ++iter;  // Advance past "first"
-  for (; iter != writers_.end(); ++iter) {
-    Writer* w = *iter;
-    if (w->sync && !first->sync) {
-      // Do not include a sync write into a batch handled by a non-sync write.
-      break;
-    }
-
-    if (!w->disableWAL && first->disableWAL) {
-      // Do not include a write that needs WAL into a batch that has
-      // WAL disabled.
-      break;
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t expiration_time) {
+  uint64_t time_delayed = 0;
+  bool delayed = false;
+  bool timed_out = false;
+  {
+    StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed);
+    bool has_timeout = (expiration_time > 0);
+    auto delay = write_controller_.GetDelay();
+    if (write_controller_.IsStopped() == false && delay > 0) {
+      mutex_.Unlock();
+      delayed = true;
+      // hopefully we don't have to sleep more than 2 billion microseconds
+      TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+      env_->SleepForMicroseconds(static_cast<int>(delay));
+      mutex_.Lock();
     }
 
-    if (w->batch != nullptr) {
-      size += WriteBatchInternal::ByteSize(w->batch);
-      if (size > max_size) {
-        // Do not make batch too big
-        break;
+    while (bg_error_.ok() && write_controller_.IsStopped()) {
+      delayed = true;
+      if (has_timeout) {
+        TEST_SYNC_POINT("DBImpl::DelayWrite:TimedWait");
+        bg_cv_.TimedWait(expiration_time);
+        if (env_->NowMicros() > expiration_time) {
+          timed_out = true;
+          break;
+        }
+      } else {
+        bg_cv_.Wait();
       }
-
-      write_batch_group->push_back(w->batch);
     }
-    *last_writer = w;
   }
+  if (delayed) {
+    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS,
+                                           time_delayed);
+    RecordTick(stats_, STALL_MICROS, time_delayed);
+  }
+
+  if (timed_out) {
+    return Status::TimedOut();
+  }
+
+  return bg_error_;
 }
 
-// This function computes the amount of time in microseconds by which a write
-// should be delayed based on the number of level-0 files according to the
-// following formula:
-// if n < bottom, return 0;
-// if n >= top, return 1000;
-// otherwise, let r = (n - bottom) /
-//                    (top - bottom)
-//  and return r^2 * 1000.
-// The goal of this formula is to gradually increase the rate at which writes
-// are slowed. We also tried linear delay (r * 1000), but it seemed to do
-// slightly worse. There is no other particular reason for choosing quadratic.
-uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) {
-  uint64_t delay;
-  if (n >= top) {
-    delay = 1000;
-  }
-  else if (n < bottom) {
-    delay = 0;
-  }
-  else {
-    // If we are here, we know that:
-    //   level0_start_slowdown <= n < level0_slowdown
-    // since the previous two conditions are false.
-    double how_much =
-      (double) (n - bottom) /
-              (top - bottom);
-    delay = std::max(how_much * how_much * 1000, 100.0);
-  }
-  assert(delay <= 1000);
-  return delay;
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+  ColumnFamilyData* cfd;
+  while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) {
+    auto status = SetNewMemtableAndNewLogFile(cfd, context);
+    SchedulePendingFlush(cfd);
+    context->schedule_bg_work_ = true;
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+    if (!status.ok()) {
+      return status;
+    }
+  }
+  return Status::OK();
 }
 
 // REQUIRES: mutex_ is held
 // REQUIRES: this thread is currently at the front of the writer queue
-Status DBImpl::MakeRoomForWrite(
-    ColumnFamilyData* cfd, bool force,
-    autovector<SuperVersion*>* superversions_to_free,
-    autovector<log::Writer*>* logs_to_free) {
+Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
+                                           WriteContext* context) {
   mutex_.AssertHeld();
-  assert(!writers_.empty());
-  bool allow_delay = !force;
-  bool allow_hard_rate_limit_delay = !force;
-  bool allow_soft_rate_limit_delay = !force;
-  uint64_t rate_limit_delay_millis = 0;
+  unique_ptr<WritableFile> lfile;
+  log::Writer* new_log = nullptr;
+  MemTable* new_mem = nullptr;
+
+  // Attempt to switch to a new memtable and trigger flush of old.
+  // Do this without holding the dbmutex lock.
+  assert(versions_->prev_log_number() == 0);
+  bool creating_new_log = !log_empty_;
+  uint64_t new_log_number =
+      creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+  SuperVersion* new_superversion = nullptr;
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+  mutex_.Unlock();
   Status s;
-  double score;
-
-  while (true) {
-    if (!bg_error_.ok()) {
-      // Yield previous error
-      s = bg_error_;
-      break;
-    } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) {
-      // We are getting close to hitting a hard limit on the number of
-      // L0 files.  Rather than delaying a single write by several
-      // seconds when we hit the hard limit, start delaying each
-      // individual write by 0-1ms to reduce latency variance.  Also,
-      // this delay hands over some CPU to the compaction thread in
-      // case it is sharing the same core as the writer.
-      uint64_t slowdown =
-          SlowdownAmount(cfd->current()->NumLevelFiles(0),
-                         cfd->options()->level0_slowdown_writes_trigger,
-                         cfd->options()->level0_stop_writes_trigger);
-      mutex_.Unlock();
-      uint64_t delayed;
-      {
-        StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT);
-        env_->SleepForMicroseconds(slowdown);
-        delayed = sw.ElapsedMicros();
-      }
-      RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed);
-      cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_SLOWDOWN,
-                                              delayed);
-      allow_delay = false;  // Do not delay a single write more than once
-      mutex_.Lock();
-      delayed_writes_++;
-    } else if (!force && !cfd->mem()->ShouldFlush()) {
-      // There is room in current memtable
-      if (allow_delay) {
-        DelayLoggingAndReset();
-      }
-      break;
-    } else if (cfd->imm()->size() ==
-               cfd->options()->max_write_buffer_number - 1) {
-      // We have filled up the current memtable, but the previous
-      // ones are still being flushed, so we wait.
-      DelayLoggingAndReset();
-      Log(options_.info_log, "[%s] wait for memtable flush...\n",
-          cfd->GetName().c_str());
-      MaybeScheduleFlushOrCompaction();
-      uint64_t stall;
-      {
-        StopWatch sw(env_, options_.statistics.get(),
-                     STALL_MEMTABLE_COMPACTION_COUNT);
-        bg_cv_.Wait();
-        stall = sw.ElapsedMicros();
-      }
-      RecordTick(options_.statistics.get(),
-                 STALL_MEMTABLE_COMPACTION_MICROS, stall);
-      cfd->internal_stats()->RecordWriteStall(
-          InternalStats::MEMTABLE_COMPACTION, stall);
-    } else if (cfd->current()->NumLevelFiles(0) >=
-               cfd->options()->level0_stop_writes_trigger) {
-      // There are too many level-0 files.
-      DelayLoggingAndReset();
-      Log(options_.info_log, "[%s] wait for fewer level0 files...\n",
-          cfd->GetName().c_str());
-      uint64_t stall;
-      {
-        StopWatch sw(env_, options_.statistics.get(),
-                     STALL_L0_NUM_FILES_COUNT);
-        bg_cv_.Wait();
-        stall = sw.ElapsedMicros();
-      }
-      RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall);
-      cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_NUM_FILES,
-                                              stall);
-    } else if (allow_hard_rate_limit_delay &&
-               cfd->options()->hard_rate_limit > 1.0 &&
-               (score = cfd->current()->MaxCompactionScore()) >
-                   cfd->options()->hard_rate_limit) {
-      // Delay a write when the compaction score for any level is too large.
-      int max_level = cfd->current()->MaxCompactionScoreLevel();
-      mutex_.Unlock();
-      uint64_t delayed;
-      {
-        StopWatch sw(env_, options_.statistics.get(),
-                     HARD_RATE_LIMIT_DELAY_COUNT);
-        env_->SleepForMicroseconds(1000);
-        delayed = sw.ElapsedMicros();
-      }
-      cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed);
-      // Make sure the following value doesn't round to zero.
-      uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1);
-      rate_limit_delay_millis += rate_limit;
-      RecordTick(options_.statistics.get(),
-                 RATE_LIMIT_DELAY_MILLIS, rate_limit);
-      if (cfd->options()->rate_limit_delay_max_milliseconds > 0 &&
-          rate_limit_delay_millis >=
-              (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) {
-        allow_hard_rate_limit_delay = false;
-      }
-      mutex_.Lock();
-    } else if (allow_soft_rate_limit_delay &&
-               cfd->options()->soft_rate_limit > 0.0 &&
-               (score = cfd->current()->MaxCompactionScore()) >
-                   cfd->options()->soft_rate_limit) {
-      // Delay a write when the compaction score for any level is too large.
-      // TODO: add statistics
-      mutex_.Unlock();
-      {
-        StopWatch sw(env_, options_.statistics.get(),
-                     SOFT_RATE_LIMIT_DELAY_COUNT);
-        env_->SleepForMicroseconds(
-            SlowdownAmount(score, cfd->options()->soft_rate_limit,
-                           cfd->options()->hard_rate_limit));
-        rate_limit_delay_millis += sw.ElapsedMicros();
+  {
+    if (creating_new_log) {
+      s = env_->NewWritableFile(
+          LogFileName(db_options_.wal_dir, new_log_number), &lfile,
+          env_->OptimizeForLogWrite(env_options_, db_options_));
+      if (s.ok()) {
+        // Our final size should be less than write_buffer_size
+        // (compression, etc) but err on the side of caution.
+        lfile->SetPreallocationBlockSize(
+            1.1 * mutable_cf_options.write_buffer_size);
+        new_log = new log::Writer(std::move(lfile));
+        log_dir_synced_ = false;
       }
-      allow_soft_rate_limit_delay = false;
-      mutex_.Lock();
-
-    } else {
-      unique_ptr<WritableFile> lfile;
-      log::Writer* new_log = nullptr;
-      MemTable* new_mem = nullptr;
-
-      // Attempt to switch to a new memtable and trigger flush of old.
-      // Do this without holding the dbmutex lock.
-      assert(versions_->PrevLogNumber() == 0);
-      bool creating_new_log = !log_empty_;
-      uint64_t new_log_number =
-          creating_new_log ? versions_->NewFileNumber() : logfile_number_;
-      SuperVersion* new_superversion = nullptr;
-      mutex_.Unlock();
-      {
-        DelayLoggingAndReset();
-        if (creating_new_log) {
-          s = env_->NewWritableFile(
-              LogFileName(options_.wal_dir, new_log_number), &lfile,
-              env_->OptimizeForLogWrite(storage_options_));
-          if (s.ok()) {
-            // Our final size should be less than write_buffer_size
-            // (compression, etc) but err on the side of caution.
-            lfile->SetPreallocationBlockSize(1.1 *
-                                             cfd->options()->write_buffer_size);
-            new_log = new log::Writer(std::move(lfile));
-          }
-        }
+    }
 
-        if (s.ok()) {
-          new_mem = new MemTable(cfd->internal_comparator(), *cfd->options());
-          new_superversion = new SuperVersion();
-        }
-      }
-      mutex_.Lock();
-      if (!s.ok()) {
-        // how do we fail if we're not creating new log?
-        assert(creating_new_log);
-        // Avoid chewing through file number space in a tight loop.
-        versions_->ReuseFileNumber(new_log_number);
-        assert(!new_mem);
-        assert(!new_log);
-        break;
-      }
-      if (creating_new_log) {
-        logfile_number_ = new_log_number;
-        assert(new_log != nullptr);
-        logs_to_free->push_back(log_.release());
-        log_.reset(new_log);
-        log_empty_ = true;
-        alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
-        for (auto cfd : *versions_->GetColumnFamilySet()) {
-          // all this is just optimization to delete logs that
-          // are no longer needed -- if CF is empty, that means it
-          // doesn't need that particular log to stay alive, so we just
-          // advance the log number. no need to persist this in the manifest
-          if (cfd->mem()->GetFirstSequenceNumber() == 0 &&
-              cfd->imm()->size() == 0) {
-            cfd->SetLogNumber(logfile_number_);
-          }
-        }
-      }
-      cfd->mem()->SetNextLogNumber(logfile_number_);
-      cfd->imm()->Add(cfd->mem());
-      if (force) {
-        cfd->imm()->FlushRequested();
+    if (s.ok()) {
+      new_mem = cfd->ConstructNewMemtable(mutable_cf_options);
+      new_superversion = new SuperVersion();
+    }
+  }
+  Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+      "[%s] New memtable created with log file: #%" PRIu64 "\n",
+      cfd->GetName().c_str(), new_log_number);
+  mutex_.Lock();
+  if (!s.ok()) {
+    // how do we fail if we're not creating new log?
+    assert(creating_new_log);
+    assert(!new_mem);
+    assert(!new_log);
+    return s;
+  }
+  if (creating_new_log) {
+    logfile_number_ = new_log_number;
+    assert(new_log != nullptr);
+    logs_to_free_.push_back(log_.release());
+    log_.reset(new_log);
+    log_empty_ = true;
+    alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+    for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
+      // all this is just optimization to delete logs that
+      // are no longer needed -- if CF is empty, that means it
+      // doesn't need that particular log to stay alive, so we just
+      // advance the log number. no need to persist this in the manifest
+      if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
+          loop_cfd->imm()->size() == 0) {
+        loop_cfd->SetLogNumber(logfile_number_);
       }
-      new_mem->Ref();
-      cfd->SetMemtable(new_mem);
-      Log(options_.info_log, "[%s] New memtable created with log file: #%lu\n",
-          cfd->GetName().c_str(), (unsigned long)logfile_number_);
-      force = false;  // Do not force another compaction if have room
-      MaybeScheduleFlushOrCompaction();
-      superversions_to_free->push_back(
-          cfd->InstallSuperVersion(new_superversion, &mutex_));
     }
   }
+  cfd->mem()->SetNextLogNumber(logfile_number_);
+  cfd->imm()->Add(cfd->mem());
+  new_mem->Ref();
+  cfd->SetMemtable(new_mem);
+  context->superversions_to_free_.push_back(
+      InstallSuperVersion(cfd, new_superversion, mutable_cf_options, true));
   return s;
 }
 
@@ -4120,25 +3545,97 @@ const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
   return *cfh->cfd()->options();
 }
 
+const DBOptions& DBImpl::GetDBOptions() const { return db_options_; }
+
 bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
                          const Slice& property, std::string* value) {
+  bool is_int_property = false;
+  bool need_out_of_mutex = false;
+  DBPropertyType property_type =
+      GetPropertyType(property, &is_int_property, &need_out_of_mutex);
+
   value->clear();
+  if (is_int_property) {
+    uint64_t int_value;
+    bool ret_value = GetIntPropertyInternal(column_family, property_type,
+                                            need_out_of_mutex, &int_value);
+    if (ret_value) {
+      *value = ToString(int_value);
+    }
+    return ret_value;
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    auto cfd = cfh->cfd();
+    InstrumentedMutexLock l(&mutex_);
+    return cfd->internal_stats()->GetStringProperty(property_type, property,
+                                                    value);
+  }
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+                            const Slice& property, uint64_t* value) {
+  bool is_int_property = false;
+  bool need_out_of_mutex = false;
+  DBPropertyType property_type =
+      GetPropertyType(property, &is_int_property, &need_out_of_mutex);
+  if (!is_int_property) {
+    return false;
+  }
+  return GetIntPropertyInternal(column_family, property_type, need_out_of_mutex,
+                                value);
+}
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family,
+                                    DBPropertyType property_type,
+                                    bool need_out_of_mutex, uint64_t* value) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
-  DBPropertyType property_type = GetPropertyType(property);
-  MutexLock l(&mutex_);
-  return cfd->internal_stats()->GetProperty(property_type, property, value,
-                                            cfd);
+
+  if (!need_out_of_mutex) {
+    InstrumentedMutexLock l(&mutex_);
+    return cfd->internal_stats()->GetIntProperty(property_type, value, this);
+  } else {
+    SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+    bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+        property_type, sv->current, value);
+
+    ReturnAndCleanupSuperVersion(cfd, sv);
+
+    return ret;
+  }
+}
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+  // TODO(ljin): consider using GetReferencedSuperVersion() directly
+  return cfd->GetThreadLocalSuperVersion(&mutex_);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+                                          SuperVersion* sv) {
+  bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
+
+  if (unref_sv) {
+    // Release SuperVersion
+    if (sv->Unref()) {
+      {
+        InstrumentedMutexLock l(&mutex_);
+        sv->Cleanup();
+      }
+      delete sv;
+      RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+    }
+    RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+  }
 }
 
 void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
                                  const Range* range, int n, uint64_t* sizes) {
-  // TODO(opt): better implementation
   Version* v;
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
   {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
     v = cfd->current();
     v->Ref();
   }
@@ -4147,22 +3644,29 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
     // Convert user_key into a corresponding internal key.
     InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
     InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
-    uint64_t start = versions_->ApproximateOffsetOf(v, k1);
-    uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
-    sizes[i] = (limit >= start ? limit - start : 0);
+    sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
   }
 
   {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
     v->Unref();
   }
 }
 
-inline void DBImpl::DelayLoggingAndReset() {
-  if (delayed_writes_ > 0) {
-    Log(options_.info_log, "delayed %d write...\n", delayed_writes_ );
-    delayed_writes_ = 0;
-  }
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+  // We need to remember the iterator of our insert, because after the
+  // background job is done, we need to remove that element from
+  // pending_outputs_.
+  pending_outputs_.push_back(versions_->current_next_file_number());
+  auto pending_outputs_inserted_elem = pending_outputs_.end();
+  --pending_outputs_inserted_elem;
+  return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+    std::list<uint64_t>::iterator v) {
+  pending_outputs_.erase(v);
 }
 
 #ifndef ROCKSDB_LITE
@@ -4170,27 +3674,11 @@ Status DBImpl::GetUpdatesSince(
     SequenceNumber seq, unique_ptr<TransactionLogIterator>* iter,
     const TransactionLogIterator::ReadOptions& read_options) {
 
-  RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS);
+  RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
   if (seq > versions_->LastSequence()) {
     return Status::NotFound("Requested sequence not yet written in the db");
   }
-  //  Get all sorted Wal Files.
-  //  Do binary search and open files and find the seq number.
-
-  std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
-  Status s = GetSortedWalFiles(*wal_files);
-  if (!s.ok()) {
-    return s;
-  }
-
-  s = RetainProbableWalFiles(*wal_files, seq);
-  if (!s.ok()) {
-    return s;
-  }
-  iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_,
-                                             read_options, storage_options_,
-                                             seq, std::move(wal_files), this));
-  return (*iter)->status();
+  return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
 }
 
 Status DBImpl::DeleteFile(std::string name) {
@@ -4199,7 +3687,8 @@ Status DBImpl::DeleteFile(std::string name) {
   WalFileType log_type;
   if (!ParseFileName(name, &number, &type, &log_type) ||
       (type != kTableFile && type != kLogFile)) {
-    Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str());
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "DeleteFile %s failed.\n", name.c_str());
     return Status::InvalidArgument("Invalid file name");
   }
 
@@ -4207,13 +3696,15 @@ Status DBImpl::DeleteFile(std::string name) {
   if (type == kLogFile) {
     // Only allow deleting archived log files
     if (log_type != kArchivedLogFile) {
-      Log(options_.info_log, "DeleteFile %s failed - not archived log.\n",
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "DeleteFile %s failed - not archived log.\n",
           name.c_str());
       return Status::NotSupported("Delete only supported for archived logs");
     }
-    status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str());
+    status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str());
     if (!status.ok()) {
-      Log(options_.info_log, "DeleteFile %s failed -- %s.\n",
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "DeleteFile %s failed -- %s.\n",
           name.c_str(), status.ToString().c_str());
     }
     return status;
@@ -4223,59 +3714,81 @@ Status DBImpl::DeleteFile(std::string name) {
   FileMetaData* metadata;
   ColumnFamilyData* cfd;
   VersionEdit edit;
-  DeletionState deletion_state(true);
+  JobContext job_context(next_job_id_.fetch_add(1), true);
   {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
     status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
     if (!status.ok()) {
-      Log(options_.info_log, "DeleteFile %s failed. File not found\n",
-                             name.c_str());
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+          "DeleteFile %s failed. File not found\n", name.c_str());
+      job_context.Clean();
       return Status::InvalidArgument("File not found");
     }
-    assert((level > 0) && (level < cfd->NumberLevels()));
+    assert(level < cfd->NumberLevels());
 
     // If the file is being compacted no need to delete.
     if (metadata->being_compacted) {
-      Log(options_.info_log,
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
           "DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
+      job_context.Clean();
       return Status::OK();
     }
 
     // Only the files in the last level can be deleted externally.
     // This is to make sure that any deletion tombstones are not
     // lost. Check that the level passed is the last level.
+    auto* vstoreage = cfd->current()->storage_info();
     for (int i = level + 1; i < cfd->NumberLevels(); i++) {
-      if (cfd->current()->NumLevelFiles(i) != 0) {
-        Log(options_.info_log,
+      if (vstoreage->NumLevelFiles(i) != 0) {
+        Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
             "DeleteFile %s FAILED. File not in last level\n", name.c_str());
+        job_context.Clean();
         return Status::InvalidArgument("File not in last level");
       }
     }
+    // if level == 0, it has to be the oldest file
+    if (level == 0 &&
+        vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+          "DeleteFile %s failed ---"
+          " target file in level 0 must be the oldest.", name.c_str());
+      job_context.Clean();
+      return Status::InvalidArgument("File in level 0, but not oldest");
+    }
+    edit.SetColumnFamily(cfd->GetID());
     edit.DeleteFile(level, number);
-    status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get());
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
     if (status.ok()) {
-      InstallSuperVersion(cfd, deletion_state);
+      InstallSuperVersionBackground(cfd, &job_context,
+                                    *cfd->GetLatestMutableCFOptions());
     }
-    FindObsoleteFiles(deletion_state, false);
-  } // lock released here
-  LogFlush(options_.info_log);
+    FindObsoleteFiles(&job_context, false);
+  }  // lock released here
+  LogFlush(db_options_.info_log);
   // remove files outside the db-lock
-  if (deletion_state.HaveSomethingToDelete()) {
-    PurgeObsoleteFiles(deletion_state);
-  }
-  {
-    MutexLock l(&mutex_);
-    // schedule flush if file deletion means we freed the space for flushes to
-    // continue
-    MaybeScheduleFlushOrCompaction();
+  if (job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(job_context);
   }
+  job_context.Clean();
   return status;
 }
 
 void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
   versions_->GetLiveFilesMetaData(metadata);
 }
+
+void DBImpl::GetColumnFamilyMetaData(
+    ColumnFamilyHandle* column_family,
+    ColumnFamilyMetaData* cf_meta) {
+  assert(column_family);
+  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  auto* sv = GetAndRefSuperVersion(cfd);
+  sv->current->GetColumnFamilyMetaData(cf_meta);
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
 #endif  // ROCKSDB_LITE
 
 Status DBImpl::CheckConsistency() {
@@ -4285,17 +3798,19 @@ Status DBImpl::CheckConsistency() {
 
   std::string corruption_messages;
   for (const auto& md : metadata) {
-    std::string file_path = dbname_ + md.name;
+    // md.name has a leading "/".
+    std::string file_path = md.db_path + md.name;
+
     uint64_t fsize = 0;
     Status s = env_->GetFileSize(file_path, &fsize);
     if (!s.ok()) {
       corruption_messages +=
           "Can't access " + md.name + ": " + s.ToString() + "\n";
     } else if (fsize != md.size) {
-      corruption_messages += "Sst file size mismatch: " + md.name +
+      corruption_messages += "Sst file size mismatch: " + file_path +
                              ". Size recorded in manifest " +
-                             std::to_string(md.size) + ", actual size " +
-                             std::to_string(fsize) + "\n";
+                             ToString(md.size) + ", actual size " +
+                             ToString(fsize) + "\n";
     }
   }
   if (corruption_messages.size() == 0) {
@@ -4320,7 +3835,7 @@ Status DBImpl::GetDbIdentity(std::string& identity) {
   }
   char buffer[file_size];
   Slice id;
-  s = idfile->Read(file_size, &id, buffer);
+  s = idfile->Read(static_cast<size_t>(file_size), &id, buffer);
   if (!s.ok()) {
     return s;
   }
@@ -4359,7 +3874,7 @@ Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
 }
 
 // Default implementation -- returns not supported status
-Status DB::CreateColumnFamily(const ColumnFamilyOptions& options,
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
                               const std::string& column_family_name,
                               ColumnFamilyHandle** handle) {
   return Status::NotSupported("");
@@ -4390,6 +3905,27 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
 Status DB::Open(const DBOptions& db_options, const std::string& dbname,
                 const std::vector<ColumnFamilyDescriptor>& column_families,
                 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  Status s = SanitizeOptionsByTable(db_options, column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (db_options.db_paths.size() > 1) {
+    for (auto& cfd : column_families) {
+      if ((cfd.options.compaction_style != kCompactionStyleUniversal) &&
+          (cfd.options.compaction_style != kCompactionStyleLevel)) {
+        return Status::NotSupported(
+            "More than one DB paths are only supported in "
+            "universal and level compaction styles. ");
+      }
+    }
+
+    if (db_options.db_paths.size() > 4) {
+      return Status::NotSupported(
+          "More than four DB paths are not supported yet. ");
+    }
+  }
+
   *dbptr = nullptr;
   handles->clear();
 
@@ -4397,14 +3933,19 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
   for (auto cf : column_families) {
     max_write_buffer_size =
         std::max(max_write_buffer_size, cf.options.write_buffer_size);
-    if (cf.options.block_cache != nullptr && cf.options.no_block_cache) {
-      return Status::InvalidArgument(
-          "no_block_cache is true while block_cache is not nullptr");
-    }
   }
 
   DBImpl* impl = new DBImpl(db_options, dbname);
-  Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir);
+  s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir);
+  if (s.ok()) {
+    for (auto db_path : impl->db_options_.db_paths) {
+      s = impl->env_->CreateDirIfMissing(db_path.path);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+
   if (!s.ok()) {
     delete impl;
     return s;
@@ -4422,9 +3963,10 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
     uint64_t new_log_number = impl->versions_->NewFileNumber();
     unique_ptr<WritableFile> lfile;
     EnvOptions soptions(db_options);
-    s = impl->options_.env->NewWritableFile(
-        LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
-        impl->options_.env->OptimizeForLogWrite(soptions));
+    s = impl->db_options_.env->NewWritableFile(
+        LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile,
+        impl->db_options_.env->OptimizeForLogWrite(soptions,
+                                                   impl->db_options_));
     if (s.ok()) {
       lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size);
       impl->logfile_number_ = new_log_number;
@@ -4434,41 +3976,59 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
       for (auto cf : column_families) {
         auto cfd =
             impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
-        if (cfd == nullptr) {
-          s = Status::InvalidArgument("Column family not found: ", cf.name);
-          break;
+        if (cfd != nullptr) {
+          handles->push_back(
+              new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+          impl->NewThreadStatusCfInfo(cfd);
+        } else {
+          if (db_options.create_missing_column_families) {
+            // missing column family, create it
+            ColumnFamilyHandle* handle;
+            impl->mutex_.Unlock();
+            s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+            impl->mutex_.Lock();
+            if (s.ok()) {
+              handles->push_back(handle);
+            } else {
+              break;
+            }
+          } else {
+            s = Status::InvalidArgument("Column family not found: ", cf.name);
+            break;
+          }
         }
-        handles->push_back(
-            new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
       }
     }
     if (s.ok()) {
       for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
-        delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
+        delete impl->InstallSuperVersion(
+            cfd, nullptr, *cfd->GetLatestMutableCFOptions());
       }
       impl->alive_log_files_.push_back(
           DBImpl::LogFileNumberSize(impl->logfile_number_));
       impl->DeleteObsoleteFiles();
-      impl->MaybeScheduleFlushOrCompaction();
-      impl->MaybeScheduleLogDBDeployStats();
-      s = impl->db_directory_->Fsync();
+      s = impl->directories_.GetDbDir()->Fsync();
     }
   }
 
   if (s.ok()) {
     for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
-      if (cfd->options()->compaction_style == kCompactionStyleUniversal) {
-        Version* current = cfd->current();
-        for (int i = 1; i < current->NumberLevels(); ++i) {
-          int num_files = current->NumLevelFiles(i);
+      if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+        auto* vstorage = cfd->current()->storage_info();
+        for (int i = 1; i < vstorage->num_levels(); ++i) {
+          int num_files = vstorage->NumLevelFiles(i);
           if (num_files > 0) {
-            s = Status::InvalidArgument("Not all files are at level 0. Cannot "
-                "open with universal compaction style.");
+            s = Status::InvalidArgument(
+                "Not all files are at level 0. Cannot "
+                "open with FIFO compaction style.");
             break;
           }
         }
       }
-      if (cfd->options()->merge_operator != nullptr &&
+      if (!cfd->mem()->IsSnapshotSupported()) {
+        impl->is_snapshot_supported_ = false;
+      }
+      if (cfd->ioptions()->merge_operator != nullptr &&
           !cfd->mem()->IsMergeOperatorSupported()) {
         s = Status::InvalidArgument(
             "The memtable of column family %s does not support merge operator "
@@ -4484,9 +4044,11 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
 
   if (s.ok()) {
     impl->opened_successfully_ = true;
+    Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p",
+        impl);
     *dbptr = impl;
   } else {
-    for (auto h : *handles) {
+    for (auto* h : *handles) {
       delete h;
     }
     handles->clear();
@@ -4506,42 +4068,26 @@ Snapshot::~Snapshot() {
 
 Status DestroyDB(const std::string& dbname, const Options& options) {
   const InternalKeyComparator comparator(options.comparator);
-  const InternalFilterPolicy filter_policy(options.filter_policy);
-  const Options& soptions(SanitizeOptions(
-    dbname, &comparator, &filter_policy, options));
+  const Options& soptions(SanitizeOptions(dbname, &comparator, options));
   Env* env = soptions.env;
   std::vector<std::string> filenames;
-  std::vector<std::string> archiveFiles;
 
-  std::string archivedir = ArchivalDirectory(dbname);
   // Ignore error in case directory does not exist
   env->GetChildren(dbname, &filenames);
 
-  if (dbname != soptions.wal_dir) {
-    std::vector<std::string> logfilenames;
-    env->GetChildren(soptions.wal_dir, &logfilenames);
-    filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end());
-    archivedir = ArchivalDirectory(soptions.wal_dir);
-  }
-
-  if (filenames.empty()) {
-    return Status::OK();
-  }
-
   FileLock* lock;
   const std::string lockname = LockFileName(dbname);
   Status result = env->LockFile(lockname, &lock);
   if (result.ok()) {
     uint64_t number;
     FileType type;
+    InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), dbname);
     for (size_t i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &type) &&
+      if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) &&
           type != kDBLockFile) {  // Lock file will be deleted at end
         Status del;
         if (type == kMetaDatabase) {
           del = DestroyDB(dbname + "/" + filenames[i], options);
-        } else if (type == kLogFile) {
-          del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]);
         } else {
           del = env->DeleteFile(dbname + "/" + filenames[i]);
         }
@@ -4551,6 +4097,37 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
       }
     }
 
+    for (auto& db_path : options.db_paths) {
+      env->GetChildren(db_path.path, &filenames);
+      for (size_t i = 0; i < filenames.size(); i++) {
+        if (ParseFileName(filenames[i], &number, &type) &&
+            type == kTableFile) {  // Lock file will be deleted at end
+          Status del = env->DeleteFile(db_path.path + "/" + filenames[i]);
+          if (result.ok() && !del.ok()) {
+            result = del;
+          }
+        }
+      }
+    }
+
+    std::vector<std::string> walDirFiles;
+    std::string archivedir = ArchivalDirectory(dbname);
+    if (dbname != soptions.wal_dir) {
+      env->GetChildren(soptions.wal_dir, &walDirFiles);
+      archivedir = ArchivalDirectory(soptions.wal_dir);
+    }
+
+    // Delete log files in the WAL dir
+    for (const auto& file : walDirFiles) {
+      if (ParseFileName(file, &number, &type) && type == kLogFile) {
+        Status del = env->DeleteFile(soptions.wal_dir + "/" + file);
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+
+    std::vector<std::string> archiveFiles;
     env->GetChildren(archivedir, &archiveFiles);
     // Delete archival files.
     for (size_t i = 0; i < archiveFiles.size(); ++i) {
@@ -4573,14 +4150,53 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
   return result;
 }
 
+#if ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(
+    ColumnFamilyData* cfd) const {
+  if (db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::NewColumnFamilyInfo(this, cfd);
+  }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(
+    ColumnFamilyData* cfd) const {
+  if (db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+  }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+  if (db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::EraseDatabaseInfo(this);
+  }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(
+    ColumnFamilyData* cfd) const {
+}
+
+void DBImpl::EraseThreadStatusCfInfo(
+    ColumnFamilyData* cfd) const {
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+}
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
 //
 // A global method that can dump out the build version
-void DumpLeveldbBuildVersion(Logger * log) {
+void DumpRocksDBBuildVersion(Logger * log) {
 #if !defined(IOS_CROSS_COMPILE)
-  // if we compile with Xcode, we don't run build_detect_vesion, so we don't generate util/build_version.cc
-  Log(log, "Git sha %s", rocksdb_build_git_sha);
-  Log(log, "Compile time %s %s",
-      rocksdb_build_compile_time, rocksdb_build_compile_date);
+  // if we compile with Xcode, we don't run build_detect_vesion, so we don't
+  // generate util/build_version.cc
+  Log(InfoLogLevel::INFO_LEVEL, log,
+      "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR,
+      ROCKSDB_PATCH);
+  Log(InfoLogLevel::INFO_LEVEL, log, "Git sha %s", rocksdb_build_git_sha);
+  Log(InfoLogLevel::INFO_LEVEL, log, "Compile date %s",
+      rocksdb_build_compile_date);
 #endif
 }
 
diff --git a/src/rocksdb/db/db_impl.h b/src/rocksdb/db/db_impl.h
index cc59cfd..91a5963 100644
--- a/src/rocksdb/db/db_impl.h
+++ b/src/rocksdb/db/db_impl.h
@@ -10,8 +10,11 @@
 
 #include <atomic>
 #include <deque>
+#include <limits>
 #include <set>
+#include <list>
 #include <utility>
+#include <list>
 #include <vector>
 #include <string>
 
@@ -20,6 +23,8 @@
 #include "db/snapshot.h"
 #include "db/column_family.h"
 #include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/writebuffer.h"
 #include "memtable_list.h"
 #include "port/port.h"
 #include "rocksdb/db.h"
@@ -27,9 +32,17 @@
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/transaction_log.h"
 #include "util/autovector.h"
-#include "util/stats_logger.h"
+#include "util/event_logger.h"
+#include "util/hash.h"
+#include "util/stop_watch.h"
 #include "util/thread_local.h"
+#include "util/scoped_arena_iterator.h"
+#include "util/hash.h"
+#include "util/instrumented_mutex.h"
 #include "db/internal_stats.h"
+#include "db/write_controller.h"
+#include "db/flush_scheduler.h"
+#include "db/write_thread.h"
 
 namespace rocksdb {
 
@@ -39,6 +52,8 @@ class Version;
 class VersionEdit;
 class VersionSet;
 class CompactionFilterV2;
+class Arena;
+struct JobContext;
 
 class DBImpl : public DB {
  public:
@@ -49,30 +64,33 @@ class DBImpl : public DB {
   using DB::Put;
   virtual Status Put(const WriteOptions& options,
                      ColumnFamilyHandle* column_family, const Slice& key,
-                     const Slice& value);
+                     const Slice& value) override;
   using DB::Merge;
   virtual Status Merge(const WriteOptions& options,
                        ColumnFamilyHandle* column_family, const Slice& key,
-                       const Slice& value);
+                       const Slice& value) override;
   using DB::Delete;
   virtual Status Delete(const WriteOptions& options,
-                        ColumnFamilyHandle* column_family, const Slice& key);
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override;
   using DB::Write;
-  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* updates) override;
   using DB::Get;
   virtual Status Get(const ReadOptions& options,
                      ColumnFamilyHandle* column_family, const Slice& key,
-                     std::string* value);
+                     std::string* value) override;
   using DB::MultiGet;
   virtual std::vector<Status> MultiGet(
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_family,
-      const std::vector<Slice>& keys, std::vector<std::string>* values);
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
 
   virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
                                     const std::string& column_family,
-                                    ColumnFamilyHandle** handle);
-  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+                                    ColumnFamilyHandle** handle) override;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
 
   // Returns false if key doesn't exist in the database and true if it may.
   // If value_found is not passed in as null, then return the value if found in
@@ -81,70 +99,109 @@ class DBImpl : public DB {
   using DB::KeyMayExist;
   virtual bool KeyMayExist(const ReadOptions& options,
                            ColumnFamilyHandle* column_family, const Slice& key,
-                           std::string* value, bool* value_found = nullptr);
+                           std::string* value,
+                           bool* value_found = nullptr) override;
   using DB::NewIterator;
   virtual Iterator* NewIterator(const ReadOptions& options,
-                                ColumnFamilyHandle* column_family);
+                                ColumnFamilyHandle* column_family) override;
   virtual Status NewIterators(
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_families,
-      std::vector<Iterator*>* iterators);
-  virtual const Snapshot* GetSnapshot();
-  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+      std::vector<Iterator*>* iterators) override;
+  virtual const Snapshot* GetSnapshot() override;
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
   using DB::GetProperty;
   virtual bool GetProperty(ColumnFamilyHandle* column_family,
-                           const Slice& property, std::string* value);
+                           const Slice& property, std::string* value) override;
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) override;
   using DB::GetApproximateSizes;
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n, uint64_t* sizes);
+                                   const Range* range, int n,
+                                   uint64_t* sizes) override;
   using DB::CompactRange;
   virtual Status CompactRange(ColumnFamilyHandle* column_family,
                               const Slice* begin, const Slice* end,
-                              bool reduce_level = false, int target_level = -1);
+                              bool reduce_level = false, int target_level = -1,
+                              uint32_t target_path_id = 0) override;
+
+  using DB::CompactFiles;
+  virtual Status CompactFiles(const CompactionOptions& compact_options,
+                              ColumnFamilyHandle* column_family,
+                              const std::vector<std::string>& input_file_names,
+                              const int output_level,
+                              const int output_path_id = -1) override;
+
+  using DB::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& options_map) override;
 
   using DB::NumberLevels;
-  virtual int NumberLevels(ColumnFamilyHandle* column_family);
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
   using DB::MaxMemCompactionLevel;
-  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
   using DB::Level0StopWriteTrigger;
-  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
-  virtual const std::string& GetName() const;
-  virtual Env* GetEnv() const;
+  virtual int Level0StopWriteTrigger(
+      ColumnFamilyHandle* column_family) override;
+  virtual const std::string& GetName() const override;
+  virtual Env* GetEnv() const override;
   using DB::GetOptions;
-  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
+  virtual const Options& GetOptions(
+      ColumnFamilyHandle* column_family) const override;
+  using DB::GetDBOptions;
+  virtual const DBOptions& GetDBOptions() const override;
   using DB::Flush;
   virtual Status Flush(const FlushOptions& options,
-                       ColumnFamilyHandle* column_family);
+                       ColumnFamilyHandle* column_family) override;
 
-  virtual SequenceNumber GetLatestSequenceNumber() const;
+  virtual SequenceNumber GetLatestSequenceNumber() const override;
 
 #ifndef ROCKSDB_LITE
-  virtual Status DisableFileDeletions();
-  virtual Status EnableFileDeletions(bool force);
+  virtual Status DisableFileDeletions() override;
+  virtual Status EnableFileDeletions(bool force) override;
+  virtual int IsFileDeletionsEnabled() const;
   // All the returned filenames start with "/"
   virtual Status GetLiveFiles(std::vector<std::string>&,
                               uint64_t* manifest_file_size,
-                              bool flush_memtable = true);
-  virtual Status GetSortedWalFiles(VectorLogPtr& files);
+                              bool flush_memtable = true) override;
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
 
   virtual Status GetUpdatesSince(
       SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
       const TransactionLogIterator::ReadOptions&
-          read_options = TransactionLogIterator::ReadOptions());
-  virtual Status DeleteFile(std::string name);
+          read_options = TransactionLogIterator::ReadOptions()) override;
+  virtual Status DeleteFile(std::string name) override;
+
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* metadata) override;
+
+  // Obtains the meta data of the specified column family of the DB.
+  // Status::NotFound() will be returned if the current DB does not have
+  // any column family match the specified name.
+  // TODO(yhchiang): output parameter is placed in the end in this codebase.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) override;
+
+  // experimental API
+  Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+                             const Slice* begin, const Slice* end);
+
+  Status PromoteL0(ColumnFamilyHandle* column_family, int target_level);
 
-  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
 #endif  // ROCKSDB_LITE
 
   // checks if all live files exist on file system and that their file sizes
   // match to our in-memory records
   virtual Status CheckConsistency();
 
-  virtual Status GetDbIdentity(std::string& identity);
+  virtual Status GetDbIdentity(std::string& identity) override;
 
   Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
-                             int output_level, const Slice* begin,
-                             const Slice* end);
+                             int output_level, uint32_t output_path_id,
+                             const Slice* begin, const Slice* end);
 
 #ifndef ROCKSDB_LITE
   // Extra methods (for testing) that are not in the public DB interface
@@ -166,8 +223,8 @@ class DBImpl : public DB {
   // Return an internal iterator over the current state of the database.
   // The keys of this iterator are internal keys (see format.h).
   // The returned iterator should be deleted when no longer needed.
-  Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
-                                         nullptr);
+  Iterator* TEST_NewInternalIterator(
+      Arena* arena, ColumnFamilyHandle* column_family = nullptr);
 
   // Return the maximum overlapping data (in bytes) at next level for any
   // file at a level >= 1.
@@ -177,118 +234,84 @@ class DBImpl : public DB {
   // Return the current manifest file no.
   uint64_t TEST_Current_Manifest_FileNo();
 
-  // Trigger's a background call for testing.
-  void TEST_PurgeObsoleteteWAL();
-
   // get total level0 file size. Only for testing.
   uint64_t TEST_GetLevel0TotalSize();
 
-  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
-  {
-    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
-  }
-
   void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
                              std::vector<std::vector<FileMetaData>>* metadata);
 
-  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
-                              SequenceNumber* sequence);
+  void TEST_LockMutex();
 
-  Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
-#endif  // NDEBUG
+  void TEST_UnlockMutex();
 
-  // needed for CleanupIteratorState
-  struct DeletionState {
-    inline bool HaveSomethingToDelete() const {
-      return  candidate_files.size() ||
-        sst_delete_files.size() ||
-        log_delete_files.size();
-    }
-
-    // a list of all files that we'll consider deleting
-    // (every once in a while this is filled up with all files
-    // in the DB directory)
-    std::vector<std::string> candidate_files;
-
-    // the list of all live sst files that cannot be deleted
-    std::vector<uint64_t> sst_live;
-
-    // a list of sst files that we need to delete
-    std::vector<FileMetaData*> sst_delete_files;
+  // REQUIRES: mutex locked
+  void* TEST_BeginWrite();
 
-    // a list of log files that we need to delete
-    std::vector<uint64_t> log_delete_files;
+  // REQUIRES: mutex locked
+  // pass the pointer that you got from TEST_BeginWrite()
+  void TEST_EndWrite(void* w);
 
-    // a list of memtables to be free
-    autovector<MemTable*> memtables_to_free;
-
-    autovector<SuperVersion*> superversions_to_free;
-
-    SuperVersion* new_superversion;  // if nullptr no new superversion
-
-    // the current manifest_file_number, log_number and prev_log_number
-    // that corresponds to the set of files in 'live'.
-    uint64_t manifest_file_number, pending_manifest_file_number, log_number,
-        prev_log_number;
+  uint64_t TEST_MaxTotalInMemoryState() const {
+    return max_total_in_memory_state_;
+  }
 
-    explicit DeletionState(bool create_superversion = false) {
-      manifest_file_number = 0;
-      pending_manifest_file_number = 0;
-      log_number = 0;
-      prev_log_number = 0;
-      new_superversion = create_superversion ? new SuperVersion() : nullptr;
-    }
+  size_t TEST_LogsToFreeSize();
 
-    ~DeletionState() {
-      // free pending memtables
-      for (auto m : memtables_to_free) {
-        delete m;
-      }
-      // free superversions
-      for (auto s : superversions_to_free) {
-        delete s;
-      }
-      // if new_superversion was not used, it will be non-nullptr and needs
-      // to be freed here
-      delete new_superversion;
-    }
-  };
+#endif  // ROCKSDB_LITE
 
   // Returns the list of live files in 'live' and the list
   // of all files in the filesystem in 'candidate_files'.
   // If force == false and the last call was less than
-  // options_.delete_obsolete_files_period_micros microseconds ago,
-  // it will not fill up the deletion_state
-  void FindObsoleteFiles(DeletionState& deletion_state,
-                         bool force,
+  // db_options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the job_context
+  void FindObsoleteFiles(JobContext* job_context, bool force,
                          bool no_full_scan = false);
 
   // Diffs the files listed in filenames and those that do not
   // belong to live files are posibly removed. Also, removes all the
   // files in sst_delete_files and log_delete_files.
   // It is not necessary to hold the mutex when invoking this method.
-  void PurgeObsoleteFiles(DeletionState& deletion_state);
+  void PurgeObsoleteFiles(const JobContext& background_contet);
+
+  ColumnFamilyHandle* DefaultColumnFamily() const override;
 
-  ColumnFamilyHandle* DefaultColumnFamily() const;
+  const SnapshotList& snapshots() const { return snapshots_; }
+
+  void CancelAllBackgroundWork(bool wait);
 
  protected:
   Env* const env_;
   const std::string dbname_;
   unique_ptr<VersionSet> versions_;
-  const DBOptions options_;
+  const DBOptions db_options_;
+  Statistics* stats_;
 
   Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
-                                SuperVersion* super_version);
+                                SuperVersion* super_version, Arena* arena);
+
+  void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number,
+                              const MutableCFOptions& mutable_cf_options);
+
+  void NotifyOnCompactionCompleted(ColumnFamilyData* cfd,
+                                   Compaction *c, const Status &st);
+
+  void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusDbInfo() const;
 
  private:
   friend class DB;
   friend class InternalStats;
 #ifndef ROCKSDB_LITE
-  friend class TailingIterator;
+  friend class ForwardIterator;
 #endif
   friend struct SuperVersion;
+  friend class CompactedDBImpl;
   struct CompactionState;
-  struct Writer;
+
+  struct WriteContext;
 
   Status NewDB();
 
@@ -305,36 +328,48 @@ class DBImpl : public DB {
   // Delete any unneeded files and stale in-memory entries.
   void DeleteObsoleteFiles();
 
+  // Background process needs to call
+  //     auto x = CaptureCurrentFileNumberInPendingOutputs()
+  //     <do something>
+  //     ReleaseFileNumberFromPendingOutputs(x)
+  // This will protect any temporary files created while <do something> is
+  // executing from being deleted.
+  // -----------
+  // This function will capture current file number and append it to
+  // pending_outputs_. This will prevent any background process to delete any
+  // file created after this point.
+  std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+  // This function should be called with the result of
+  // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+  // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+  // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+  // and blocked by any other pending_outputs_ calls)
+  void ReleaseFileNumberFromPendingOutputs(std::list<uint64_t>::iterator v);
+
   // Flush the in-memory write buffer to storage.  Switches to a new
   // log-file/memtable and writes a new descriptor iff successful.
-  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
-                                   DeletionState& deletion_state,
+  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   bool* madeProgress, JobContext* job_context,
                                    LogBuffer* log_buffer);
 
-  Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
-                        bool read_only);
+  // REQUIRES: log_numbers are sorted in ascending order
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* max_sequence, bool read_only);
 
   // The following two methods are used to flush a memtable to
   // storage. The first one is used atdatabase RecoveryTime (when the
   // database is opened) and is heavyweight because it holds the mutex
   // for the entire period. The second method WriteLevel0Table supports
   // concurrent flush memtables to storage.
-  Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
-                                     VersionEdit* edit);
-  Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
-                          VersionEdit* edit, uint64_t* filenumber,
-                          LogBuffer* log_buffer);
+  Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                     MemTable* mem, VersionEdit* edit);
+  Status DelayWrite(uint64_t expiration_time);
 
-  uint64_t SlowdownAmount(int n, double bottom, double top);
+  Status ScheduleFlushes(WriteContext* context);
 
-  // TODO(icanadi) free superversion_to_free and old_log outside of mutex
-  Status MakeRoomForWrite(ColumnFamilyData* cfd,
-                          bool force /* flush even if there is room? */,
-                          autovector<SuperVersion*>* superversions_to_free,
-                          autovector<log::Writer*>* logs_to_free);
-
-  void BuildBatchGroup(Writer** last_writer,
-                       autovector<WriteBatch*>* write_batch_group);
+  Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
+                                     WriteContext* context);
 
   // Force current memtable contents to be flushed.
   Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
@@ -342,104 +377,59 @@ class DBImpl : public DB {
   // Wait for memtable flushed
   Status WaitForFlushMemTable(ColumnFamilyData* cfd);
 
-  void MaybeScheduleLogDBDeployStats();
+  void RecordFlushIOStats();
+  void RecordCompactionIOStats();
 
 #ifndef ROCKSDB_LITE
-  static void BGLogDBDeployStats(void* db);
-  void LogDBDeployStats();
+  Status CompactFilesImpl(
+      const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+      Version* version, const std::vector<std::string>& input_file_names,
+      const int output_level, int output_path_id, JobContext* job_context,
+      LogBuffer* log_buffer);
 #endif  // ROCKSDB_LITE
 
+  ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
   void MaybeScheduleFlushOrCompaction();
+  void SchedulePendingFlush(ColumnFamilyData* cfd);
+  void SchedulePendingCompaction(ColumnFamilyData* cfd);
   static void BGWorkCompaction(void* db);
   static void BGWorkFlush(void* db);
   void BackgroundCallCompaction();
   void BackgroundCallFlush();
-  Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
+  Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
                               LogBuffer* log_buffer);
-  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
+  Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
                          LogBuffer* log_buffer);
-  void CleanupCompaction(CompactionState* compact, Status status);
-  Status DoCompactionWork(CompactionState* compact,
-                          DeletionState& deletion_state,
-                          LogBuffer* log_buffer);
 
   // This function is called as part of compaction. It enables Flush process to
   // preempt compaction, since it's higher prioirty
-  // Returns: micros spent executing
   uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
-                                     DeletionState& deletion_state,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     JobContext* job_context,
                                      LogBuffer* log_buffer);
 
-  // Call compaction filter if is_compaction_v2 is not true. Then iterate
-  // through input and compact the kv-pairs
-  Status ProcessKeyValueCompaction(
-    SequenceNumber visible_at_tip,
-    SequenceNumber earliest_snapshot,
-    SequenceNumber latest_snapshot,
-    DeletionState& deletion_state,
-    bool bottommost_level,
-    int64_t& imm_micros,
-    Iterator* input,
-    CompactionState* compact,
-    bool is_compaction_v2,
-    LogBuffer* log_buffer);
-
-  // Call compaction_filter_v2->Filter() on kv-pairs in compact
-  void CallCompactionFilterV2(CompactionState* compact,
-    CompactionFilterV2* compaction_filter_v2);
-
-  Status OpenCompactionOutputFile(CompactionState* compact);
-  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
-  Status InstallCompactionResults(CompactionState* compact,
-                                  LogBuffer* log_buffer);
-  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
-  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
-
-#ifdef ROCKSDB_LITE
-  void PurgeObsoleteWALFiles() {
-    // this function is used for archiving WAL files. we don't need this in
-    // ROCKSDB_LITE
-  }
-#else
-  void PurgeObsoleteWALFiles();
-
-  Status GetSortedWalsOfType(const std::string& path,
-                             VectorLogPtr& log_files,
-                             WalFileType type);
-
-  // Requires: all_logs should be sorted with earliest log file first
-  // Retains all log files in all_logs which contain updates with seq no.
-  // Greater Than or Equal to the requested SequenceNumber.
-  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
-                                const SequenceNumber target);
-
-  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
-                         SequenceNumber* sequence);
-
-  Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
-#endif  // ROCKSDB_LITE
-
   void PrintStatistics();
 
   // dump rocksdb.stats to LOG
   void MaybeDumpStats();
 
-  // Return true if the current db supports snapshot.  If the current
-  // DB does not support snapshot, then calling GetSnapshot() will always
-  // return nullptr.
-  //
-  // @see GetSnapshot()
-  virtual bool IsSnapshotSupported() const;
-
   // Return the minimum empty level that could hold the total data in the
   // input level. Return the input level, if such level could not be found.
-  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
+  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+      const MutableCFOptions& mutable_cf_options, int level);
 
   // Move the files in the input level to the target level.
   // If target_level < 0, automatically calculate the minimum level that could
   // hold the data set.
   Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
 
+  // helper functions for adding and removing from flush & compaction queues
+  void AddToCompactionQueue(ColumnFamilyData* cfd);
+  ColumnFamilyData* PopFirstFromCompactionQueue();
+  void AddToFlushQueue(ColumnFamilyData* cfd);
+  ColumnFamilyData* PopFirstFromFlushQueue();
+
   // table_cache_ provides its own synchronization
   std::shared_ptr<Cache> table_cache_;
 
@@ -447,13 +437,23 @@ class DBImpl : public DB {
   FileLock* db_lock_;
 
   // State below is protected by mutex_
-  port::Mutex mutex_;
-  port::AtomicPointer shutting_down_;
-  port::CondVar bg_cv_;          // Signalled when background work finishes
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  // This condition variable is signaled on these conditions:
+  // * whenever bg_compaction_scheduled_ goes down to 0
+  // * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't
+  // made any progress
+  // * whenever a compaction made any progress
+  // * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is
+  // done, even if it didn't make any progress)
+  // * whenever there is an error in background flush or compaction
+  InstrumentedCondVar bg_cv_;
   uint64_t logfile_number_;
   unique_ptr<log::Writer> log_;
+  bool log_dir_synced_;
   bool log_empty_;
   ColumnFamilyHandleImpl* default_cf_handle_;
+  InternalStats* default_cf_internal_stats_;
   unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
   struct LogFileNumberSize {
     explicit LogFileNumberSize(uint64_t _number)
@@ -468,28 +468,92 @@ class DBImpl : public DB {
   // only used for dynamically adjusting max_total_wal_size. it is a sum of
   // [write_buffer_size * max_write_buffer_number] over all column families
   uint64_t max_total_in_memory_state_;
+  // If true, we have only one (default) column family. We use this to optimize
+  // some code-paths
+  bool single_column_family_mode_;
+  // If this is non-empty, we need to delete these log files in background
+  // threads. Protected by db mutex.
+  autovector<log::Writer*> logs_to_free_;
+
+  bool is_snapshot_supported_;
+
+  // Class to maintain directories for all database paths other than main one.
+  class Directories {
+   public:
+    Status SetDirectories(Env* env, const std::string& dbname,
+                          const std::string& wal_dir,
+                          const std::vector<DbPath>& data_paths);
+
+    Directory* GetDataDir(size_t path_id);
+
+    Directory* GetWalDir() {
+      if (wal_dir_) {
+        return wal_dir_.get();
+      }
+      return db_dir_.get();
+    }
 
-  std::string host_name_;
+    Directory* GetDbDir() { return db_dir_.get(); }
 
-  std::unique_ptr<Directory> db_directory_;
+   private:
+    std::unique_ptr<Directory> db_dir_;
+    std::vector<std::unique_ptr<Directory>> data_dirs_;
+    std::unique_ptr<Directory> wal_dir_;
 
-  // Queue of writers.
-  std::deque<Writer*> writers_;
-  WriteBatch tmp_batch_;
+    Status CreateAndNewDirectory(Env* env, const std::string& dirname,
+                                 std::unique_ptr<Directory>* directory) const;
+  };
 
-  SnapshotList snapshots_;
+  Directories directories_;
+
+  WriteBuffer write_buffer_;
 
-  // cache for ReadFirstRecord() calls
-  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
-  port::Mutex read_first_record_cache_mutex_;
+  WriteThread write_thread_;
 
-  // Set of table files to protect from deletion because they are
-  // part of ongoing compactions.
-  std::set<uint64_t> pending_outputs_;
+  WriteBatch tmp_batch_;
 
-  // At least one compaction or flush job is pending but not yet scheduled
-  // because of the max background thread limit.
-  bool bg_schedule_needed_;
+  WriteController write_controller_;
+  FlushScheduler flush_scheduler_;
+
+  SnapshotList snapshots_;
+
+  // For each background job, pending_outputs_ keeps the current file number at
+  // the time that background job started.
+  // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+  // number bigger than any of the file number in pending_outputs_. Since file
+  // numbers grow monotonically, this also means that pending_outputs_ is always
+  // sorted. After a background job is done executing, its file number is
+  // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+  // it up.
+  // State is protected with db mutex.
+  std::list<uint64_t> pending_outputs_;
+
+  // flush_queue_ and compaction_queue_ hold column families that we need to
+  // flush and compact, respectively.
+  // A column family is inserted into flush_queue_ when it satisfies condition
+  // cfd->imm()->IsFlushPending()
+  // A column family is inserted into compaction_queue_ when it satisfied
+  // condition cfd->NeedsCompaction()
+  // Column families in this list are all Ref()-erenced
+  // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+  // do RAII on ColumnFamilyData
+  // Column families are in this queue when they need to be flushed or
+  // compacted. Consumers of these queues are flush and compaction threads. When
+  // column family is put on this queue, we increase unscheduled_flushes_ and
+  // unscheduled_compactions_. When these variables are bigger than zero, that
+  // means we need to schedule background threads for compaction and thread.
+  // Once the background threads are scheduled, we decrease unscheduled_flushes_
+  // and unscheduled_compactions_. That way we keep track of number of
+  // compaction and flush threads we need to schedule. This scheduling is done
+  // in MaybeScheduleFlushOrCompaction()
+  // invariant(column family present in flush_queue_ <==>
+  // ColumnFamilyData::pending_flush_ == true)
+  std::deque<ColumnFamilyData*> flush_queue_;
+  // invariant(column family present in compaction_queue_ <==>
+  // ColumnFamilyData::pending_compaction_ == true)
+  std::deque<ColumnFamilyData*> compaction_queue_;
+  int unscheduled_flushes_;
+  int unscheduled_compactions_;
 
   // count how many background compactions are running or have been scheduled
   int bg_compaction_scheduled_;
@@ -502,14 +566,12 @@ class DBImpl : public DB {
   // number of background memtable flush jobs, submitted to the HIGH pool
   int bg_flush_scheduled_;
 
-  // Has a background stats log thread scheduled?
-  bool bg_logstats_scheduled_;
-
   // Information for a manual compaction
   struct ManualCompaction {
     ColumnFamilyData* cfd;
     int input_level;
     int output_level;
+    uint32_t output_path_id;
     bool done;
     Status status;
     bool in_progress;           // compaction request being processed?
@@ -522,10 +584,6 @@ class DBImpl : public DB {
   // Have we encountered a background error in paranoid mode?
   Status bg_error_;
 
-  std::unique_ptr<StatsLogger> logger_;
-
-  int64_t volatile last_log_ts;
-
   // shall we disable deletion of obsolete files
   // if 0 the deletion is enabled.
   // if non-zero, files will not be getting deleted
@@ -534,29 +592,30 @@ class DBImpl : public DB {
   // without any synchronization
   int disable_delete_obsolete_files_;
 
-  // last time when DeleteObsoleteFiles was invoked
-  uint64_t delete_obsolete_files_last_run_;
-
-  // last time when PurgeObsoleteWALFiles ran.
-  uint64_t purge_wal_files_last_run_;
+  // next time when we should run DeleteObsoleteFiles with full scan
+  uint64_t delete_obsolete_files_next_run_;
 
   // last time stats were dumped to LOG
   std::atomic<uint64_t> last_stats_dump_time_microsec_;
 
-  // obsolete files will be deleted every this seconds if ttl deletion is
-  // enabled and archive size_limit is disabled.
-  uint64_t default_interval_to_delete_obsolete_WAL_;
+  // Each flush or compaction gets its own job id. this counter makes sure
+  // they're unique
+  std::atomic<int> next_job_id_;
 
   bool flush_on_destroy_; // Used when disableWAL is true.
 
   static const int KEEP_LOG_FILE_NUM = 1000;
   std::string db_absolute_path_;
 
-  // count of the number of contiguous delaying writes
-  int delayed_writes_;
-
   // The options to access storage files
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;
+
+#ifndef ROCKSDB_LITE
+  WalManager wal_manager_;
+#endif  // ROCKSDB_LITE
+
+  // Unified interface for logging events
+  EventLogger event_logger_;
 
   // A value of true temporarily disables scheduling of background work
   bool bg_work_gate_closed_;
@@ -567,13 +626,16 @@ class DBImpl : public DB {
   // Indicate DB was opened successfully
   bool opened_successfully_;
 
+  // The list of registered event listeners.
+  std::list<EventListener*> listeners_;
+
+  // count how many events are currently being notified.
+  int notifying_events_;
+
   // No copying allowed
   DBImpl(const DBImpl&);
   void operator=(const DBImpl&);
 
-  // dump the delayed_writes_ to the log file and reset counter.
-  void DelayLoggingAndReset();
-
   // Return the earliest snapshot where seqno is visible.
   // Store the snapshot right before that, if any, in prev_snapshot
   inline SequenceNumber findEarliestVisibleSnapshot(
@@ -582,10 +644,34 @@ class DBImpl : public DB {
     SequenceNumber* prev_snapshot);
 
   // Background threads call this function, which is just a wrapper around
-  // the cfd->InstallSuperVersion() function. Background threads carry
-  // deletion_state which can have new_superversion already allocated.
-  void InstallSuperVersion(ColumnFamilyData* cfd,
-                           DeletionState& deletion_state);
+  // the InstallSuperVersion() function. Background threads carry
+  // job_context which can have new_superversion already
+  // allocated.
+  void InstallSuperVersionBackground(
+      ColumnFamilyData* cfd, JobContext* job_context,
+      const MutableCFOptions& mutable_cf_options);
+
+  // All ColumnFamily state changes go through this function. Here we analyze
+  // the new state and we schedule background work if we detect that the new
+  // state needs flush or compaction.
+  // If dont_schedule_bg_work == true, then caller asks us to not schedule flush
+  // or compaction here, but it also promises to schedule needed background
+  // work. We use this to  scheduling background compactions when we are in the
+  // write thread, which is very performance critical. Caller schedules
+  // background work as soon as it exits the write thread
+  SuperVersion* InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_sv,
+                                    const MutableCFOptions& mutable_cf_options,
+                                    bool dont_schedule_bg_work = false);
+
+  // Find Super version and reference it. Based on options, it might return
+  // the thread local cached one.
+  inline SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+  // Un-reference the super version and return it to thread local cache if
+  // needed. If it is the last reference of the super version. Clean it up
+  // after un-referencing it.
+  inline void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+                                           SuperVersion* sv);
 
 #ifndef ROCKSDB_LITE
   using DB::GetPropertiesOfAllTables;
@@ -599,25 +685,24 @@ class DBImpl : public DB {
   Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
                  const Slice& key, std::string* value,
                  bool* value_found = nullptr);
+
+  bool GetIntPropertyInternal(ColumnFamilyHandle* column_family,
+                              DBPropertyType property_type,
+                              bool need_out_of_mutex, uint64_t* value);
 };
 
 // Sanitize db options.  The caller should delete result.info_log if
 // it is not equal to src.info_log.
 extern Options SanitizeOptions(const std::string& db,
                                const InternalKeyComparator* icmp,
-                               const InternalFilterPolicy* ipolicy,
                                const Options& src);
 extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
 
-// Determine compression type, based on user options, level of the output
-// file and whether compression is disabled.
-// If enable_compression is false, then compression is always disabled no
-// matter what the values of the other two parameters are.
-// Otherwise, the compression type is determined based on options and level.
-CompressionType GetCompressionType(const Options& options, int level,
-                                   const bool enable_compression);
-
-// Determine compression type for L0 file written by memtable flush.
-CompressionType GetCompressionFlush(const Options& options);
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/db_impl_debug.cc b/src/rocksdb/db/db_impl_debug.cc
index d6551b4..5c7a353 100644
--- a/src/rocksdb/db/db_impl_debug.cc
+++ b/src/rocksdb/db/db_impl_debug.cc
@@ -10,17 +10,17 @@
 #ifndef ROCKSDB_LITE
 
 #include "db/db_impl.h"
+#include "util/thread_status_updater.h"
 
 namespace rocksdb {
 
-void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); }
-
 uint64_t DBImpl::TEST_GetLevel0TotalSize() {
-  MutexLock l(&mutex_);
-  return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
+  InstrumentedMutexLock l(&mutex_);
+  return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
 }
 
-Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
+Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena,
+                                           ColumnFamilyHandle* column_family) {
   ColumnFamilyData* cfd;
   if (column_family == nullptr) {
     cfd = default_cf_handle_->cfd();
@@ -33,7 +33,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
   SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
   mutex_.Unlock();
   ReadOptions roptions;
-  return NewInternalIterator(roptions, cfd, super_version);
+  return NewInternalIterator(roptions, cfd, super_version, arena);
 }
 
 int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
@@ -45,8 +45,8 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
     auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
     cfd = cfh->cfd();
   }
-  MutexLock l(&mutex_);
-  return cfd->current()->MaxNextLevelOverlappingBytes();
+  InstrumentedMutexLock l(&mutex_);
+  return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
 }
 
 void DBImpl::TEST_GetFilesMetaData(
@@ -54,10 +54,11 @@ void DBImpl::TEST_GetFilesMetaData(
     std::vector<std::vector<FileMetaData>>* metadata) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
   metadata->resize(NumberLevels());
   for (int level = 0; level < NumberLevels(); level++) {
-    const std::vector<FileMetaData*>& files = cfd->current()->files_[level];
+    const std::vector<FileMetaData*>& files =
+        cfd->current()->storage_info()->LevelFiles(level);
 
     (*metadata)[level].clear();
     for (const auto& f : files) {
@@ -67,7 +68,7 @@ void DBImpl::TEST_GetFilesMetaData(
 }
 
 uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
-  return versions_->ManifestFileNumber();
+  return versions_->manifest_file_number();
 }
 
 Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
@@ -81,10 +82,11 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
     cfd = cfh->cfd();
   }
   int output_level =
-      (cfd->options()->compaction_style == kCompactionStyleUniversal)
+      (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+       cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
           ? level
           : level + 1;
-  return RunManualCompaction(cfd, level, output_level, begin, end);
+  return RunManualCompaction(cfd, level, output_level, 0, begin, end);
 }
 
 Status DBImpl::TEST_FlushMemTable(bool wait) {
@@ -111,22 +113,38 @@ Status DBImpl::TEST_WaitForCompact() {
   // wait for compact. It actually waits for scheduled compaction
   // OR flush to finish.
 
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
   while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) {
     bg_cv_.Wait();
   }
   return bg_error_;
 }
 
-Status DBImpl::TEST_ReadFirstRecord(const WalFileType type,
-                                    const uint64_t number,
-                                    SequenceNumber* sequence) {
-  return ReadFirstRecord(type, number, sequence);
+void DBImpl::TEST_LockMutex() {
+  mutex_.Lock();
+}
+
+void DBImpl::TEST_UnlockMutex() {
+  mutex_.Unlock();
+}
+
+void* DBImpl::TEST_BeginWrite() {
+  auto w = new WriteThread::Writer(&mutex_);
+  Status s = write_thread_.EnterWriteThread(w, 0);
+  assert(s.ok() && !w->done);  // No timeout and nobody should do our job
+  return reinterpret_cast<void*>(w);
 }
 
-Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
-                                  SequenceNumber* sequence) {
-  return ReadFirstLine(fname, sequence);
+void DBImpl::TEST_EndWrite(void* w) {
+  auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+  write_thread_.ExitWriteThread(writer, writer, Status::OK());
+  delete writer;
 }
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+  InstrumentedMutexLock l(&mutex_);
+  return logs_to_free_.size();
+}
+
 }  // namespace rocksdb
 #endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl_experimental.cc b/src/rocksdb/db/db_impl_experimental.cc
new file mode 100644
index 0000000..d6c3dfc
--- /dev/null
+++ b/src/rocksdb/db/db_impl_experimental.cc
@@ -0,0 +1,150 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+                                   const Slice* begin, const Slice* end) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  InternalKey start_key, end_key;
+  if (begin != nullptr) {
+    start_key.SetMaxPossibleForUserKey(*begin);
+  }
+  if (end != nullptr) {
+    end_key.SetMinPossibleForUserKey(*end);
+  }
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto vstorage = cfd->current()->storage_info();
+    for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+      std::vector<FileMetaData*> inputs;
+      vstorage->GetOverlappingInputs(
+          level, begin == nullptr ? nullptr : &start_key,
+          end == nullptr ? nullptr : &end_key, &inputs);
+      for (auto f : inputs) {
+        f->marked_for_compaction = true;
+      }
+    }
+    // Since we have some more files to compact, we should also recompute
+    // compaction score
+    vstorage->ComputeCompactionScore(*cfd->GetLatestMutableCFOptions(),
+                                     CompactionOptionsFIFO());
+    SchedulePendingCompaction(cfd);
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+  assert(column_family);
+
+  if (target_level < 1) {
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+    return Status::InvalidArgument("Invalid target level");
+  }
+
+  Status status;
+  VersionEdit edit;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+    const auto* vstorage = cfd->current()->storage_info();
+
+    if (target_level >= vstorage->num_levels()) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "PromoteL0 FAILED. Target level %d does not exist\n", target_level);
+      job_context.Clean();
+      return Status::InvalidArgument("Target level does not exist");
+    }
+
+    // Sort L0 files by range.
+    const InternalKeyComparator* icmp = &cfd->internal_comparator();
+    auto l0_files = vstorage->LevelFiles(0);
+    std::sort(l0_files.begin(), l0_files.end(),
+              [icmp](FileMetaData* f1, FileMetaData* f2) {
+                return icmp->Compare(f1->largest, f2->largest) < 0;
+              });
+
+    // Check that no L0 file is being compacted and that they have
+    // non-overlapping ranges.
+    for (size_t i = 0; i < l0_files.size(); ++i) {
+      auto f = l0_files[i];
+      if (f->being_compacted) {
+        Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+            "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+            f->fd.GetNumber());
+        job_context.Clean();
+        return Status::InvalidArgument("PromoteL0 called during L0 compaction");
+      }
+
+      if (i == 0) continue;
+      auto prev_f = l0_files[i - 1];
+      if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+        Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+            "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+            " have overlapping ranges\n",
+            prev_f->fd.GetNumber(), f->fd.GetNumber());
+        job_context.Clean();
+        return Status::InvalidArgument("L0 has overlapping files");
+      }
+    }
+
+    // Check that all levels up to target_level are empty.
+    for (int level = 1; level <= target_level; ++level) {
+      if (vstorage->NumLevelFiles(level) > 0) {
+        Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+            "PromoteL0 FAILED. Level %d not empty\n", level);
+        job_context.Clean();
+        return Status::InvalidArgument(
+            "All levels up to target_level "
+            "must be empty");
+      }
+    }
+
+    edit.SetColumnFamily(cfd->GetID());
+    for (const auto& f : l0_files) {
+      edit.DeleteFile(0, f->fd.GetNumber());
+      edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+                   f->fd.GetFileSize(), f->smallest, f->largest,
+                   f->smallest_seqno, f->largest_seqno);
+    }
+
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+    if (status.ok()) {
+      InstallSuperVersionBackground(cfd, &job_context,
+                                    *cfd->GetLatestMutableCFOptions());
+    }
+  }  // lock released here
+  LogFlush(db_options_.info_log);
+  job_context.Clean();
+
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/db_impl_readonly.cc b/src/rocksdb/db/db_impl_readonly.cc
index 4308374..c1d61e3 100644
--- a/src/rocksdb/db/db_impl_readonly.cc
+++ b/src/rocksdb/db/db_impl_readonly.cc
@@ -2,57 +2,31 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2012 Facebook. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
+
 
 #include "db/db_impl_readonly.h"
+#include "utilities/compacted_db/compacted_db_impl.h"
 #include "db/db_impl.h"
-
-#include <algorithm>
-#include <set>
-#include <string>
-#include <stdint.h>
-#include <stdio.h>
-#include <vector>
-#include <algorithm>
-#include "db/db_iter.h"
-#include "db/dbformat.h"
-#include "db/filename.h"
-#include "db/log_reader.h"
-#include "db/log_writer.h"
-#include "db/memtable.h"
 #include "db/merge_context.h"
-#include "db/table_cache.h"
-#include "db/version_set.h"
-#include "db/write_batch_internal.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
-#include "rocksdb/status.h"
-#include "rocksdb/table.h"
-#include "rocksdb/merge_operator.h"
-#include "port/port.h"
-#include "table/block.h"
-#include "table/merger.h"
-#include "table/two_level_iterator.h"
-#include "util/coding.h"
-#include "util/logging.h"
-#include "util/build_version.h"
+#include "db/db_iter.h"
+#include "util/perf_context_imp.h"
 
 namespace rocksdb {
 
-DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
                                const std::string& dbname)
-    : DBImpl(options, dbname) {
-  Log(options_.info_log, "Opening the db in read only mode");
+    : DBImpl(db_options, dbname) {
+  Log(INFO_LEVEL, db_options_.info_log, "Opening the db in read only mode");
+  LogFlush(db_options_.info_log);
 }
 
 DBImplReadOnly::~DBImplReadOnly() {
 }
 
 // Implementations of the DB interface
-Status DBImplReadOnly::Get(const ReadOptions& options,
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
                            ColumnFamilyHandle* column_family, const Slice& key,
                            std::string* value) {
   Status s;
@@ -62,34 +36,74 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
   SuperVersion* super_version = cfd->GetSuperVersion();
   MergeContext merge_context;
   LookupKey lkey(key, snapshot);
-  if (super_version->mem->Get(lkey, value, &s, merge_context,
-                              *cfd->options())) {
+  if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
   } else {
-    Version::GetStats stats;
-    super_version->current->Get(options, lkey, value, &s, &merge_context,
-                                &stats);
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    super_version->current->Get(read_options, lkey, value, &s, &merge_context);
   }
   return s;
 }
 
-Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
                                       ColumnFamilyHandle* column_family) {
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
   SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
   SequenceNumber latest_snapshot = versions_->LastSequence();
-  Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
-  return NewDBIterator(
-      env_, *cfd->options(), cfd->user_comparator(), internal_iter,
-      (options.snapshot != nullptr
-           ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
-           : latest_snapshot));
+  auto db_iter = NewArenaWrappedDbIterator(
+      env_, *cfd->ioptions(), cfd->user_comparator(),
+      (read_options.snapshot != nullptr
+           ? reinterpret_cast<const SnapshotImpl*>(
+                read_options.snapshot)->number_
+           : latest_snapshot),
+      super_version->mutable_cf_options.max_sequential_skip_in_iterations);
+  auto internal_iter = NewInternalIterator(
+      read_options, cfd, super_version, db_iter->GetArena());
+  db_iter->SetIterUnderDBIter(internal_iter);
+  return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  if (iterators == nullptr) {
+    return Status::InvalidArgument("iterators not allowed to be nullptr");
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  SequenceNumber latest_snapshot = versions_->LastSequence();
+
+  for (auto cfh : column_families) {
+    auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+    auto* sv = cfd->GetSuperVersion()->Ref();
+    auto* db_iter = NewArenaWrappedDbIterator(
+        env_, *cfd->ioptions(), cfd->user_comparator(),
+        (read_options.snapshot != nullptr
+            ? reinterpret_cast<const SnapshotImpl*>(
+                  read_options.snapshot)->number_
+            : latest_snapshot),
+        sv->mutable_cf_options.max_sequential_skip_in_iterations);
+    auto* internal_iter = NewInternalIterator(
+        read_options, cfd, sv, db_iter->GetArena());
+    db_iter->SetIterUnderDBIter(internal_iter);
+    iterators->push_back(db_iter);
+  }
+
+  return Status::OK();
 }
 
 Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
                            DB** dbptr, bool error_if_log_file_exist) {
   *dbptr = nullptr;
 
+  // Try to first open DB as fully compacted DB
+  Status s;
+  s = CompactedDBImpl::Open(options, dbname, dbptr);
+  if (s.ok()) {
+    return s;
+  }
+
   DBOptions db_options(options);
   ColumnFamilyOptions cf_options(options);
   std::vector<ColumnFamilyDescriptor> column_families;
@@ -97,8 +111,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
       ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
   std::vector<ColumnFamilyHandle*> handles;
 
-  Status s =
-      DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
+  s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
   if (s.ok()) {
     assert(handles.size() == 1);
     // i can delete the handle since DBImpl is always holding a
@@ -140,6 +153,10 @@ Status DB::OpenForReadOnly(
   impl->mutex_.Unlock();
   if (s.ok()) {
     *dbptr = impl;
+    for (auto* h : *handles) {
+      impl->NewThreadStatusCfInfo(
+          reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+    }
   } else {
     for (auto h : *handles) {
       delete h;
@@ -150,5 +167,20 @@ Status DB::OpenForReadOnly(
   return s;
 }
 
+#else  // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                           DB** dbptr, bool error_if_log_file_exist) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_log_file_exist) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif  // !ROCKSDB_LITE
 
 }   // namespace rocksdb
diff --git a/src/rocksdb/db/db_impl_readonly.h b/src/rocksdb/db/db_impl_readonly.h
index c4703ba..25fcb43 100644
--- a/src/rocksdb/db/db_impl_readonly.h
+++ b/src/rocksdb/db/db_impl_readonly.h
@@ -2,25 +2,14 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2012 Facebook. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
 
 #pragma once
-#include "db/db_impl.h"
 
-#include <deque>
-#include <set>
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl.h"
 #include <vector>
 #include <string>
-#include "db/dbformat.h"
-#include "db/log_writer.h"
-#include "db/snapshot.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
-#include "port/port.h"
-#include "util/stats_logger.h"
 
 namespace rocksdb {
 
@@ -33,63 +22,76 @@ class DBImplReadOnly : public DBImpl {
   using DB::Get;
   virtual Status Get(const ReadOptions& options,
                      ColumnFamilyHandle* column_family, const Slice& key,
-                     std::string* value);
+                     std::string* value) override;
 
   // TODO: Implement ReadOnly MultiGet?
 
   using DBImpl::NewIterator;
   virtual Iterator* NewIterator(const ReadOptions&,
-                                ColumnFamilyHandle* column_family);
+                                ColumnFamilyHandle* column_family) override;
 
   virtual Status NewIterators(
       const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>& column_family,
-      std::vector<Iterator*>* iterators) {
-   // TODO
-    return Status::NotSupported("Not supported yet.");
-  }
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override;
 
   using DBImpl::Put;
   virtual Status Put(const WriteOptions& options,
                      ColumnFamilyHandle* column_family, const Slice& key,
-                     const Slice& value) {
+                     const Slice& value) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
   using DBImpl::Merge;
   virtual Status Merge(const WriteOptions& options,
                        ColumnFamilyHandle* column_family, const Slice& key,
-                       const Slice& value) {
+                       const Slice& value) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
   using DBImpl::Delete;
   virtual Status Delete(const WriteOptions& options,
-                        ColumnFamilyHandle* column_family, const Slice& key) {
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
-  virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* updates) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
   using DBImpl::CompactRange;
   virtual Status CompactRange(ColumnFamilyHandle* column_family,
                               const Slice* begin, const Slice* end,
-                              bool reduce_level = false,
-                              int target_level = -1) {
+                              bool reduce_level = false, int target_level = -1,
+                              uint32_t target_path_id = 0) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
-  virtual Status DisableFileDeletions() {
+
+  using DBImpl::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
-  virtual Status EnableFileDeletions(bool force) {
+
+#ifndef ROCKSDB_LITE
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
   virtual Status GetLiveFiles(std::vector<std::string>&,
                               uint64_t* manifest_file_size,
-                              bool flush_memtable = true) {
+                              bool flush_memtable = true) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
+#endif  // ROCKSDB_LITE
+
   using DBImpl::Flush;
   virtual Status Flush(const FlushOptions& options,
-                       ColumnFamilyHandle* column_family) {
+                       ColumnFamilyHandle* column_family) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
 
@@ -101,3 +103,5 @@ class DBImplReadOnly : public DBImpl {
   void operator=(const DBImplReadOnly&);
 };
 }
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc
index a6d765d..ce75f43 100644
--- a/src/rocksdb/db/db_iter.cc
+++ b/src/rocksdb/db/db_iter.cc
@@ -10,6 +10,8 @@
 #include "db/db_iter.h"
 #include <stdexcept>
 #include <deque>
+#include <string>
+#include <limits>
 
 #include "db/filename.h"
 #include "db/dbformat.h"
@@ -18,6 +20,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "port/port.h"
+#include "util/arena.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/perf_context_imp.h"
@@ -37,8 +40,6 @@ static void DumpInternalIter(Iterator* iter) {
 }
 #endif
 
-namespace {
-
 // Memtables and sstables that make the DB representation contain
 // (userkey,seq,type) => uservalue entries.  DBIter
 // combines multiple entries for the same userkey found in the DB
@@ -57,36 +58,49 @@ class DBIter: public Iterator {
     kReverse
   };
 
-  DBIter(Env* env, const Options& options,
-         const Comparator* cmp, Iterator* iter, SequenceNumber s)
-      : env_(env),
-        logger_(options.info_log.get()),
+  DBIter(Env* env, const ImmutableCFOptions& ioptions,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s,
+         bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+         const Slice* iterate_upper_bound = nullptr)
+      : arena_mode_(arena_mode),
+        env_(env),
+        logger_(ioptions.info_log),
         user_comparator_(cmp),
-        user_merge_operator_(options.merge_operator.get()),
+        user_merge_operator_(ioptions.merge_operator),
         iter_(iter),
         sequence_(s),
         direction_(kForward),
         valid_(false),
         current_entry_is_merged_(false),
-        statistics_(options.statistics.get()) {
-    RecordTick(statistics_, NO_ITERATORS, 1);
-    max_skip_ = options.max_sequential_skip_in_iterations;
+        statistics_(ioptions.statistics),
+        iterate_upper_bound_(iterate_upper_bound) {
+    RecordTick(statistics_, NO_ITERATORS);
+    prefix_extractor_ = ioptions.prefix_extractor;
+    max_skip_ = max_sequential_skip_in_iterations;
   }
   virtual ~DBIter() {
     RecordTick(statistics_, NO_ITERATORS, -1);
-    delete iter_;
+    if (!arena_mode_) {
+      delete iter_;
+    } else {
+      iter_->~Iterator();
+    }
   }
-  virtual bool Valid() const { return valid_; }
-  virtual Slice key() const {
+  virtual void SetIter(Iterator* iter) {
+    assert(iter_ == nullptr);
+    iter_ = iter;
+  }
+  virtual bool Valid() const override { return valid_; }
+  virtual Slice key() const override {
     assert(valid_);
     return saved_key_.GetKey();
   }
-  virtual Slice value() const {
+  virtual Slice value() const override {
     assert(valid_);
     return (direction_ == kForward && !current_entry_is_merged_) ?
       iter_->value() : saved_value_;
   }
-  virtual Status status() const {
+  virtual Status status() const override {
     if (status_.ok()) {
       return iter_->status();
     } else {
@@ -94,16 +108,21 @@ class DBIter: public Iterator {
     }
   }
 
-  virtual void Next();
-  virtual void Prev();
-  virtual void Seek(const Slice& target);
-  virtual void SeekToFirst();
-  virtual void SeekToLast();
+  virtual void Next() override;
+  virtual void Prev() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void SeekToFirst() override;
+  virtual void SeekToLast() override;
 
  private:
+  void PrevInternal();
+  void FindParseableKey(ParsedInternalKey* ikey, Direction direction);
+  bool FindValueForCurrentKey();
+  bool FindValueForCurrentKeyUsingSeek();
+  void FindPrevUserKey();
+  void FindNextUserKey();
   inline void FindNextUserEntry(bool skipping);
   void FindNextUserEntryInternal(bool skipping);
-  void FindPrevUserEntry();
   bool ParseKey(ParsedInternalKey* key);
   void MergeValuesNewToOld();
 
@@ -116,22 +135,24 @@ class DBIter: public Iterator {
     }
   }
 
+  const SliceTransform* prefix_extractor_;
+  bool arena_mode_;
   Env* const env_;
   Logger* logger_;
   const Comparator* const user_comparator_;
   const MergeOperator* const user_merge_operator_;
-  Iterator* const iter_;
+  Iterator* iter_;
   SequenceNumber const sequence_;
 
   Status status_;
-  IterKey saved_key_;   // == current key when direction_==kReverse
-  std::string saved_value_;   // == current raw value when direction_==kReverse
-  std::string skip_key_;
+  IterKey saved_key_;
+  std::string saved_value_;
   Direction direction_;
   bool valid_;
   bool current_entry_is_merged_;
   Statistics* statistics_;
   uint64_t max_skip_;
+  const Slice* iterate_upper_bound_;
 
   // No copying allowed
   DBIter(const DBIter&);
@@ -141,7 +162,8 @@ class DBIter: public Iterator {
 inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
   if (!ParseInternalKey(iter_->key(), ikey)) {
     status_ = Status::Corruption("corrupted internal key in DBIter");
-    Log(logger_, "corrupted internal key in DBIter: %s",
+    Log(InfoLogLevel::ERROR_LEVEL,
+        logger_, "corrupted internal key in DBIter: %s",
         iter_->key().ToString(true).c_str());
     return false;
   } else {
@@ -152,20 +174,11 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
 void DBIter::Next() {
   assert(valid_);
 
-  if (direction_ == kReverse) {  // Switch directions?
+  if (direction_ == kReverse) {
+    FindNextUserKey();
     direction_ = kForward;
-    // iter_ is pointing just before the entries for this->key(),
-    // so advance into the range of entries for this->key() and then
-    // use the normal skipping code below.
     if (!iter_->Valid()) {
       iter_->SeekToFirst();
-    } else {
-      iter_->Next();
-    }
-    if (!iter_->Valid()) {
-      valid_ = false;
-      saved_key_.Clear();
-      return;
     }
   }
 
@@ -177,7 +190,6 @@ void DBIter::Next() {
   FindNextUserEntry(true /* skipping the current user key */);
 }
 
-
 // PRE: saved_key_ has the current user key if skipping
 // POST: saved_key_ should have the next user key if valid_,
 //       if the current entry is a result of merge
@@ -187,9 +199,8 @@ void DBIter::Next() {
 // NOTE: In between, saved_key_ can point to a user key that has
 //       a delete marker
 inline void DBIter::FindNextUserEntry(bool skipping) {
-  PERF_TIMER_AUTO(find_next_user_entry_time);
+  PERF_TIMER_GUARD(find_next_user_entry_time);
   FindNextUserEntryInternal(skipping);
-  PERF_TIMER_STOP(find_next_user_entry_time);
 }
 
 // Actual implementation of DBIter::FindNextUserEntry()
@@ -201,42 +212,49 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
   uint64_t num_skipped = 0;
   do {
     ParsedInternalKey ikey;
-    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
-      if (skipping &&
-          user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
-        num_skipped++; // skip this entry
-        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
-      } else {
-        skipping = false;
-        switch (ikey.type) {
-          case kTypeDeletion:
-            // Arrange to skip all upcoming entries for this key since
-            // they are hidden by this deletion.
-            saved_key_.SetUserKey(ikey.user_key);
-            skipping = true;
-            num_skipped = 0;
-            PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
-            break;
-          case kTypeValue:
-            valid_ = true;
-            saved_key_.SetUserKey(ikey.user_key);
-            return;
-          case kTypeMerge:
-            // By now, we are sure the current ikey is going to yield a value
-            saved_key_.SetUserKey(ikey.user_key);
-            current_entry_is_merged_ = true;
-            valid_ = true;
-            MergeValuesNewToOld();  // Go to a different state machine
-            return;
-          default:
-            assert(false);
-            break;
+
+    if (ParseKey(&ikey)) {
+      if (iterate_upper_bound_ != nullptr &&
+          ikey.user_key.compare(*iterate_upper_bound_) >= 0) {
+        break;
+      }
+
+      if (ikey.sequence <= sequence_) {
+        if (skipping &&
+           user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
+          num_skipped++;  // skip this entry
+          PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+        } else {
+          switch (ikey.type) {
+            case kTypeDeletion:
+              // Arrange to skip all upcoming entries for this key since
+              // they are hidden by this deletion.
+              saved_key_.SetKey(ikey.user_key);
+              skipping = true;
+              num_skipped = 0;
+              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+              break;
+            case kTypeValue:
+              valid_ = true;
+              saved_key_.SetKey(ikey.user_key);
+              return;
+            case kTypeMerge:
+              // By now, we are sure the current ikey is going to yield a value
+              saved_key_.SetKey(ikey.user_key);
+              current_entry_is_merged_ = true;
+              valid_ = true;
+              MergeValuesNewToOld();  // Go to a different state machine
+              return;
+            default:
+              assert(false);
+              break;
+          }
         }
       }
     }
     // If we have sequentially iterated via numerous keys and still not
     // found the next user-key, then it is better to seek so that we can
-    // avoid too many key comparisons. We seek to the last occurence of
+    // avoid too many key comparisons. We seek to the last occurrence of
     // our current key by looking for sequence number 0.
     if (skipping && num_skipped > max_skip_) {
       num_skipped = 0;
@@ -260,16 +278,17 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
 //       iter_ points to the next entry (or invalid)
 void DBIter::MergeValuesNewToOld() {
   if (!user_merge_operator_) {
-    Log(logger_, "Options::merge_operator is null.");
-    throw std::logic_error("DBIter::MergeValuesNewToOld() with"
-                           " Options::merge_operator null");
+    Log(InfoLogLevel::ERROR_LEVEL,
+        logger_, "Options::merge_operator is null.");
+    status_ = Status::InvalidArgument("user_merge_operator_ must be set.");
+    valid_ = false;
+    return;
   }
 
   // Start the merge process by pushing the first operand
   std::deque<std::string> operands;
   operands.push_front(iter_->value().ToString());
 
-  std::string merge_result;   // Temporary string to hold merge result later
   ParsedInternalKey ikey;
   for (iter_->Next(); iter_->Valid(); iter_->Next()) {
     if (!ParseKey(&ikey)) {
@@ -293,9 +312,15 @@ void DBIter::MergeValuesNewToOld() {
       // hit a put, merge the put value with operands and store the
       // final result in saved_value_. We are done!
       // ignore corruption if there is any.
-      const Slice value = iter_->value();
-      user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
-                                      &saved_value_, logger_);
+      const Slice val = iter_->value();
+      {
+        StopWatchNano timer(env_, statistics_ != nullptr);
+        PERF_TIMER_GUARD(merge_operator_time_nanos);
+        user_merge_operator_->FullMerge(ikey.user_key, &val, operands,
+                                        &saved_value_, logger_);
+        RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
+                   timer.ElapsedNanos());
+      }
       // iter_ is positioned after put
       iter_->Next();
       return;
@@ -304,125 +329,298 @@ void DBIter::MergeValuesNewToOld() {
     if (kTypeMerge == ikey.type) {
       // hit a merge, add the value as an operand and run associative merge.
       // when complete, add result to operands and continue.
-      const Slice& value = iter_->value();
-      operands.push_front(value.ToString());
+      const Slice& val = iter_->value();
+      operands.push_front(val.ToString());
     }
   }
 
-  // we either exhausted all internal keys under this user key, or hit
-  // a deletion marker.
-  // feed null as the existing value to the merge operator, such that
-  // client can differentiate this scenario and do things accordingly.
-  user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
-                                  &saved_value_, logger_);
+  {
+    StopWatchNano timer(env_, statistics_ != nullptr);
+    PERF_TIMER_GUARD(merge_operator_time_nanos);
+    // we either exhausted all internal keys under this user key, or hit
+    // a deletion marker.
+    // feed null as the existing value to the merge operator, such that
+    // client can differentiate this scenario and do things accordingly.
+    user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
+                                    &saved_value_, logger_);
+    RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanos());
+  }
 }
 
 void DBIter::Prev() {
   assert(valid_);
+  if (direction_ == kForward) {
+    FindPrevUserKey();
+    direction_ = kReverse;
+  }
+  PrevInternal();
+}
 
-  // Throw an exception now if merge_operator is provided
-  // TODO: support backward iteration
-  if (user_merge_operator_) {
-    Log(logger_, "Prev not supported yet if merge_operator is provided");
-    throw std::logic_error("DBIter::Prev backward iteration not supported"
-                           " if merge_operator is provided");
+void DBIter::PrevInternal() {
+  if (!iter_->Valid()) {
+    valid_ = false;
+    return;
   }
 
-  if (direction_ == kForward) {  // Switch directions?
-    // iter_ is pointing at the current entry.  Scan backwards until
-    // the key changes so we can use the normal reverse scanning code.
-    assert(iter_->Valid());  // Otherwise valid_ would have been false
-    saved_key_.SetUserKey(ExtractUserKey(iter_->key()));
-    while (true) {
-      iter_->Prev();
+  ParsedInternalKey ikey;
+
+  while (iter_->Valid()) {
+    saved_key_.SetKey(ExtractUserKey(iter_->key()));
+    if (FindValueForCurrentKey()) {
+      valid_ = true;
       if (!iter_->Valid()) {
-        valid_ = false;
-        saved_key_.Clear();
-        ClearSavedValue();
         return;
       }
-      if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
-                                    saved_key_.GetKey()) < 0) {
-        break;
+      FindParseableKey(&ikey, kReverse);
+      if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) {
+        FindPrevUserKey();
       }
+      return;
     }
-    direction_ = kReverse;
-  }
+    if (!iter_->Valid()) {
+      break;
+    }
+    FindParseableKey(&ikey, kReverse);
+    if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) {
 
-  FindPrevUserEntry();
+      FindPrevUserKey();
+    }
+  }
+  // We haven't found any key - iterator is not valid
+  assert(!iter_->Valid());
+  valid_ = false;
 }
 
-void DBIter::FindPrevUserEntry() {
-  assert(direction_ == kReverse);
-  uint64_t num_skipped = 0;
+// This function checks, if the entry with biggest sequence_number <= sequence_
+// is non kTypeDeletion. If it's not, we save value in saved_value_
+bool DBIter::FindValueForCurrentKey() {
+  assert(iter_->Valid());
+  // Contains operands for merge operator.
+  std::deque<std::string> operands;
+  // last entry before merge (could be kTypeDeletion or kTypeValue)
+  ValueType last_not_merge_type = kTypeDeletion;
+  ValueType last_key_entry_type = kTypeDeletion;
 
-  ValueType value_type = kTypeDeletion;
-  bool saved_key_valid = true;
-  if (iter_->Valid()) {
-    do {
-      ParsedInternalKey ikey;
-      if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
-        if ((value_type != kTypeDeletion) &&
-            user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) < 0) {
-          // We encountered a non-deleted value in entries for previous keys,
-          break;
-        }
-        value_type = ikey.type;
-        if (value_type == kTypeDeletion) {
-          saved_key_.Clear();
-          ClearSavedValue();
-          saved_key_valid = false;
-        } else {
-          Slice raw_value = iter_->value();
-          if (saved_value_.capacity() > raw_value.size() + 1048576) {
-            std::string empty;
-            swap(empty, saved_value_);
-          }
-          saved_key_.SetUserKey(ExtractUserKey(iter_->key()));
-          saved_value_.assign(raw_value.data(), raw_value.size());
-        }
-      } else {
-        // In the case of ikey.sequence > sequence_, we might have already
-        // iterated to a different user key.
-        saved_key_valid = false;
-      }
-      num_skipped++;
-      // If we have sequentially iterated via numerous keys and still not
-      // found the prev user-key, then it is better to seek so that we can
-      // avoid too many key comparisons. We seek to the first occurence of
-      // our current key by looking for max sequence number.
-      if (saved_key_valid && num_skipped > max_skip_) {
-        num_skipped = 0;
-        std::string last_key;
-        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(),
-                                                       kMaxSequenceNumber,
-                                                       kValueTypeForSeek));
-        iter_->Seek(last_key);
-        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  ParsedInternalKey ikey;
+  FindParseableKey(&ikey, kReverse);
+
+  size_t num_skipped = 0;
+  while (iter_->Valid() && ikey.sequence <= sequence_ &&
+         (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0)) {
+    // We iterate too much: let's use Seek() to avoid too much key comparisons
+    if (num_skipped >= max_skip_) {
+      return FindValueForCurrentKeyUsingSeek();
+    }
+
+    last_key_entry_type = ikey.type;
+    switch (last_key_entry_type) {
+      case kTypeValue:
+        operands.clear();
+        saved_value_ = iter_->value().ToString();
+        last_not_merge_type = kTypeValue;
+        break;
+      case kTypeDeletion:
+        operands.clear();
+        last_not_merge_type = kTypeDeletion;
+        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+        break;
+      case kTypeMerge:
+        assert(user_merge_operator_ != nullptr);
+        operands.push_back(iter_->value().ToString());
+        break;
+      default:
+        assert(false);
+    }
+
+    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+    assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0);
+    iter_->Prev();
+    ++num_skipped;
+    FindParseableKey(&ikey, kReverse);
+  }
+
+  switch (last_key_entry_type) {
+    case kTypeDeletion:
+      valid_ = false;
+      return false;
+    case kTypeMerge:
+      if (last_not_merge_type == kTypeDeletion) {
+        StopWatchNano timer(env_, statistics_ != nullptr);
+        PERF_TIMER_GUARD(merge_operator_time_nanos);
+        user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
+                                        &saved_value_, logger_);
+        RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
+                   timer.ElapsedNanos());
       } else {
-        iter_->Prev();
+        assert(last_not_merge_type == kTypeValue);
+        std::string last_put_value = saved_value_;
+        Slice temp_slice(last_put_value);
+        {
+          StopWatchNano timer(env_, statistics_ != nullptr);
+          PERF_TIMER_GUARD(merge_operator_time_nanos);
+          user_merge_operator_->FullMerge(saved_key_.GetKey(), &temp_slice,
+                                          operands, &saved_value_, logger_);
+          RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
+                     timer.ElapsedNanos());
+        }
       }
-    } while (iter_->Valid());
+      break;
+    case kTypeValue:
+      // do nothing - we've already has value in saved_value_
+      break;
+    default:
+      assert(false);
+      break;
   }
+  valid_ = true;
+  return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use Seek() function instead of Prev() to find necessary value
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+  std::string last_key;
+  AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), sequence_,
+                                                 kValueTypeForSeek));
+  iter_->Seek(last_key);
+  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
 
-  if (value_type == kTypeDeletion) {
-    // End
+  // assume there is at least one parseable key for this user key
+  ParsedInternalKey ikey;
+  FindParseableKey(&ikey, kForward);
+
+  if (ikey.type == kTypeValue || ikey.type == kTypeDeletion) {
+    if (ikey.type == kTypeValue) {
+      saved_value_ = iter_->value().ToString();
+      valid_ = true;
+      return true;
+    }
     valid_ = false;
-    saved_key_.Clear();
-    ClearSavedValue();
-    direction_ = kForward;
-  } else {
+    return false;
+  }
+
+  // kTypeMerge. We need to collect all kTypeMerge values and save them
+  // in operands
+  std::deque<std::string> operands;
+  while (iter_->Valid() &&
+         (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) &&
+         ikey.type == kTypeMerge) {
+    operands.push_front(iter_->value().ToString());
+    iter_->Next();
+    FindParseableKey(&ikey, kForward);
+  }
+
+  if (!iter_->Valid() ||
+      (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) ||
+      ikey.type == kTypeDeletion) {
+    {
+      StopWatchNano timer(env_, statistics_ != nullptr);
+      PERF_TIMER_GUARD(merge_operator_time_nanos);
+      user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
+                                      &saved_value_, logger_);
+      RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanos());
+    }
+    // Make iter_ valid and point to saved_key_
+    if (!iter_->Valid() ||
+        (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0)) {
+      iter_->Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    }
     valid_ = true;
+    return true;
+  }
+
+  const Slice& val = iter_->value();
+  {
+    StopWatchNano timer(env_, statistics_ != nullptr);
+    PERF_TIMER_GUARD(merge_operator_time_nanos);
+    user_merge_operator_->FullMerge(saved_key_.GetKey(), &val, operands,
+                                    &saved_value_, logger_);
+    RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanos());
+  }
+  valid_ = true;
+  return true;
+}
+
+// Used in Next to change directions
+// Go to next user key
+// Don't use Seek(),
+// because next user key will be very close
+void DBIter::FindNextUserKey() {
+  if (!iter_->Valid()) {
+    return;
+  }
+  ParsedInternalKey ikey;
+  FindParseableKey(&ikey, kForward);
+  while (iter_->Valid() &&
+         user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) {
+    iter_->Next();
+    FindParseableKey(&ikey, kForward);
+  }
+}
+
+// Go to previous user_key
+void DBIter::FindPrevUserKey() {
+  if (!iter_->Valid()) {
+    return;
+  }
+  size_t num_skipped = 0;
+  ParsedInternalKey ikey;
+  FindParseableKey(&ikey, kReverse);
+  while (iter_->Valid() &&
+         user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) {
+    if (num_skipped >= max_skip_) {
+      num_skipped = 0;
+      IterKey last_key;
+      last_key.SetInternalKey(ParsedInternalKey(
+          saved_key_.GetKey(), kMaxSequenceNumber, kValueTypeForSeek));
+      iter_->Seek(last_key.GetKey());
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    }
+
+    iter_->Prev();
+    ++num_skipped;
+    FindParseableKey(&ikey, kReverse);
+  }
+}
+
+// Skip all unparseable keys
+void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
+  while (iter_->Valid() && !ParseKey(ikey)) {
+    if (direction == kReverse) {
+      iter_->Prev();
+    } else {
+      iter_->Next();
+    }
   }
 }
 
 void DBIter::Seek(const Slice& target) {
+  StopWatch sw(env_, statistics_, DB_SEEK);
+
+  // total ordering is not guaranteed if prefix_extractor is set
+  // hence prefix based seeks will not give correct results
+  if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) {
+    if (!prefix_extractor_->InDomain(*iterate_upper_bound_) ||
+        !prefix_extractor_->InDomain(target) ||
+        prefix_extractor_->Transform(*iterate_upper_bound_).compare(
+          prefix_extractor_->Transform(target)) != 0) {
+      status_ = Status::InvalidArgument("read_options.iterate_*_bound "
+                  " and seek target need to have the same prefix.");
+      valid_ = false;
+      return;
+    }
+  }
+
   saved_key_.Clear();
   // now savved_key is used to store internal key.
   saved_key_.SetInternalKey(target, sequence_);
-  PERF_TIMER_AUTO(seek_internal_seek_time);
-  iter_->Seek(saved_key_.GetKey());
-  PERF_TIMER_STOP(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_->Seek(saved_key_.GetKey());
+  }
+
   if (iter_->Valid()) {
     direction_ = kForward;
     ClearSavedValue();
@@ -433,11 +631,19 @@ void DBIter::Seek(const Slice& target) {
 }
 
 void DBIter::SeekToFirst() {
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek wiil be used.
+  if (prefix_extractor_ != nullptr) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
+  }
   direction_ = kForward;
   ClearSavedValue();
-  PERF_TIMER_AUTO(seek_internal_seek_time);
-  iter_->SeekToFirst();
-  PERF_TIMER_STOP(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_->SeekToFirst();
+  }
+
   if (iter_->Valid()) {
     FindNextUserEntry(false /* not skipping */);
   } else {
@@ -446,32 +652,73 @@ void DBIter::SeekToFirst() {
 }
 
 void DBIter::SeekToLast() {
-  // Throw an exception for now if merge_operator is provided
-  // TODO: support backward iteration
-  if (user_merge_operator_) {
-    Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
-    throw std::logic_error("DBIter::SeekToLast: backward iteration not"
-                           " supported if merge_operator is provided");
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek wiil be used.
+  if (prefix_extractor_ != nullptr) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
   }
-
   direction_ = kReverse;
   ClearSavedValue();
-  PERF_TIMER_AUTO(seek_internal_seek_time);
-  iter_->SeekToLast();
-  PERF_TIMER_STOP(seek_internal_seek_time);
-  FindPrevUserEntry();
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_->SeekToLast();
+  }
+
+  PrevInternal();
 }
 
-}  // anonymous namespace
+Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
+                        const Comparator* user_key_comparator,
+                        Iterator* internal_iter,
+                        const SequenceNumber& sequence,
+                        uint64_t max_sequential_skip_in_iterations,
+                        const Slice* iterate_upper_bound) {
+  return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
+                    false, max_sequential_skip_in_iterations,
+                    iterate_upper_bound);
+}
+
+ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
+
+void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; }
+
+void ArenaWrappedDBIter::SetIterUnderDBIter(Iterator* iter) {
+  static_cast<DBIter*>(db_iter_)->SetIter(iter);
+}
+
+inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); }
+inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); }
+inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); }
+inline void ArenaWrappedDBIter::Seek(const Slice& target) {
+  db_iter_->Seek(target);
+}
+inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); }
+inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
+inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
+inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
+inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
+void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
+                                         void* arg2) {
+  db_iter_->RegisterCleanup(function, arg1, arg2);
+}
 
-Iterator* NewDBIterator(
-    Env* env,
-    const Options& options,
-    const Comparator *user_key_comparator,
-    Iterator* internal_iter,
-    const SequenceNumber& sequence) {
-  return new DBIter(env, options, user_key_comparator,
-                    internal_iter, sequence);
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ImmutableCFOptions& ioptions,
+    const Comparator* user_key_comparator,
+    const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound) {
+  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+  Arena* arena = iter->GetArena();
+  auto mem = arena->AllocateAligned(sizeof(DBIter));
+  DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator,
+      nullptr, sequence, true, max_sequential_skip_in_iterations,
+      iterate_upper_bound);
+
+  iter->SetDBIter(db_iter);
+
+  return iter;
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h
index d8a3bad..c676d6c 100644
--- a/src/rocksdb/db/db_iter.h
+++ b/src/rocksdb/db/db_iter.h
@@ -11,17 +11,67 @@
 #include <stdint.h>
 #include "rocksdb/db.h"
 #include "db/dbformat.h"
+#include "util/arena.h"
+#include "util/autovector.h"
 
 namespace rocksdb {
 
+class Arena;
+class DBIter;
+
 // Return a new iterator that converts internal keys (yielded by
 // "*internal_iter") that were live at the specified "sequence" number
 // into appropriate user keys.
 extern Iterator* NewDBIterator(
     Env* env,
-    const Options& options,
+    const ImmutableCFOptions& options,
     const Comparator *user_key_comparator,
     Iterator* internal_iter,
-    const SequenceNumber& sequence);
+    const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound = nullptr);
+
+// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
+// iterator is supposed be allocated. This class is used as an entry point of
+// a iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+  virtual ~ArenaWrappedDBIter();
+
+  // Get the arena to be used to allocate memory for DBIter to be wrapped,
+  // as well as child iterators in it.
+  virtual Arena* GetArena() { return &arena_; }
+
+  // Set the DB Iterator to be wrapped
+
+  virtual void SetDBIter(DBIter* iter);
+
+  // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+  // a merging iterator.
+  virtual void SetIterUnderDBIter(Iterator* iter);
+  virtual bool Valid() const override;
+  virtual void SeekToFirst() override;
+  virtual void SeekToLast() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual void Prev() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  DBIter* db_iter_;
+  Arena arena_;
+};
+
+// Generate the arena wrapped iterator class.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ImmutableCFOptions& options,
+    const Comparator* user_key_comparator,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound = nullptr);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
new file mode 100644
index 0000000..18b38ac
--- /dev/null
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -0,0 +1,1409 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <utility>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "db/db_iter.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+static uint64_t TestGetTickerCount(const Options& options,
+                                   Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public Iterator {
+ public:
+  explicit TestIterator(const Comparator* comparator)
+      : initialized_(false),
+        valid_(false),
+        sequence_number_(0),
+        iter_(0),
+        cmp(comparator) {}
+
+  void AddMerge(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeMerge, argvalue);
+  }
+
+  void AddDeletion(std::string argkey) {
+    Add(argkey, kTypeDeletion, std::string());
+  }
+
+  void AddPut(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeValue, argvalue);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue) {
+    valid_ = true;
+    ParsedInternalKey internal_key(argkey, sequence_number_++, type);
+    data_.push_back(
+        std::pair<std::string, std::string>(std::string(), argvalue));
+    AppendInternalKey(&data_.back().first, internal_key);
+  }
+
+  // should be called before operations with iterator
+  void Finish() {
+    initialized_ = true;
+    std::sort(data_.begin(), data_.end(),
+              [this](std::pair<std::string, std::string> a,
+                     std::pair<std::string, std::string> b) {
+      return (cmp.Compare(a.first, b.first) < 0);
+    });
+  }
+
+  virtual bool Valid() const override {
+    assert(initialized_);
+    return valid_;
+  }
+
+  virtual void SeekToFirst() override {
+    assert(initialized_);
+    valid_ = (data_.size() > 0);
+    iter_ = 0;
+  }
+
+  virtual void SeekToLast() override {
+    assert(initialized_);
+    valid_ = (data_.size() > 0);
+    iter_ = data_.size() - 1;
+  }
+
+  virtual void Seek(const Slice& target) override {
+    assert(initialized_);
+    SeekToFirst();
+    if (!valid_) {
+      return;
+    }
+    while (iter_ < data_.size() &&
+           (cmp.Compare(data_[iter_].first, target) < 0)) {
+      ++iter_;
+    }
+
+    if (iter_ == data_.size()) {
+      valid_ = false;
+    }
+  }
+
+  virtual void Next() override {
+    assert(initialized_);
+    if (data_.empty() || (iter_ == data_.size() - 1)) {
+      valid_ = false;
+    } else {
+      ++iter_;
+    }
+  }
+
+  virtual void Prev() override {
+    assert(initialized_);
+    if (iter_ == 0) {
+      valid_ = false;
+    } else {
+      --iter_;
+    }
+  }
+
+  virtual Slice key() const override {
+    assert(initialized_);
+    return data_[iter_].first;
+  }
+
+  virtual Slice value() const override {
+    assert(initialized_);
+    return data_[iter_].second;
+  }
+
+  virtual Status status() const override {
+    assert(initialized_);
+    return Status::OK();
+  }
+
+ private:
+  bool initialized_;
+  bool valid_;
+  size_t sequence_number_;
+  size_t iter_;
+
+  InternalKeyComparator cmp;
+  std::vector<std::pair<std::string, std::string>> data_;
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+  Env* env_;
+
+  DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+  Options options;
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 2,
+                      options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+  Options options;
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 0,
+                      options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 0,
+                      options.max_sequential_skip_in_iterations));
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+  Options options;
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (size_t i = 0; i < 200; ++i) {
+    internal_iter->AddPut("a", "a");
+    internal_iter->AddPut("b", "b");
+    internal_iter->AddPut("c", "c");
+  }
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(
+      NewDBIterator(env_, ImmutableCFOptions(options),
+                    BytewiseComparator(), internal_iter, 2,
+                    options.max_sequential_skip_in_iterations));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "c");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "b");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "a");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(!db_iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", ToString(k));
+      }
+      internal_iter->Finish();
+
+      options.statistics = rocksdb::CreateDBStatistics();
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+
+    {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t i = 0; i < 200; ++i) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, 202,
+          options.max_sequential_skip_in_iterations));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "200");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+      std::unique_ptr<Iterator> db_iter(
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, i,
+                        options.max_sequential_skip_in_iterations));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(!db_iter->Valid());
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (size_t i = 0; i < 200; ++i) {
+      internal_iter->AddDeletion("c");
+    }
+    internal_iter->AddPut("c", "200");
+    internal_iter->Finish();
+    std::unique_ptr<Iterator> db_iter(
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 200,
+                      options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("d", ToString(k));
+      }
+
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", ToString(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "d");
+      ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "b");
+      internal_iter->AddMerge("a", "a");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddMerge("c", ToString(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      std::string merge_result = "0";
+      for (size_t j = 1; j <= i; ++j) {
+        merge_result += "," + ToString(j);
+      }
+      ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "b");
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "a");
+
+      db_iter->Prev();
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1,
+      options.max_sequential_skip_in_iterations));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 0,
+      options.max_sequential_skip_in_iterations));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2,
+      options.max_sequential_skip_in_iterations));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+TEST_F(DBIteratorTest, DBIterator4) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 4,
+      options.max_sequential_skip_in_iterations));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0,1");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator5) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        0, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        1, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        2, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        3, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        4, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        5, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        6, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator6) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        0, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        1, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        2, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        3, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        4, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        5, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        6, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator7) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        0, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        2, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        4, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        5, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+    db_iter->Prev();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        6, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        7, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        9, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        13, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(),
+              "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        14, options.max_sequential_skip_in_iterations));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(),
+              "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+TEST_F(DBIteratorTest, DBIterator8) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddDeletion("a");
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+      10, options.max_sequential_skip_in_iterations));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_stats_logger.cc b/src/rocksdb/db/db_stats_logger.cc
deleted file mode 100644
index 288e1bf..0000000
--- a/src/rocksdb/db/db_stats_logger.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "db/db_impl.h"
-#include <string>
-#include <stdint.h>
-#include <stdio.h>
-#include "db/version_set.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
-#include "port/port.h"
-#include "util/mutexlock.h"
-
-namespace rocksdb {
-
-void DBImpl::MaybeScheduleLogDBDeployStats() {
-// we did say maybe
-#ifndef ROCKSDB_LITE
-  // There is a lock in the actual logger.
-  if (!logger_ || options_.db_stats_log_interval < 0
-      || host_name_.empty()) {
-    return;
-  }
-
-  if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
-    // Already scheduled
-  } else {
-    int64_t current_ts = 0;
-    Status st = env_->GetCurrentTime(&current_ts);
-    if (!st.ok()) {
-      return;
-    }
-    if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
-      return;
-    }
-    last_log_ts = current_ts;
-    bg_logstats_scheduled_ = true;
-    env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
-  }
-}
-
-void DBImpl::BGLogDBDeployStats(void* db) {
-  DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
-  db_inst->LogDBDeployStats();
-}
-
-void DBImpl::LogDBDeployStats() {
-  mutex_.Lock();
-
-  if (shutting_down_.Acquire_Load()) {
-    bg_logstats_scheduled_ = false;
-    bg_cv_.SignalAll();
-    mutex_.Unlock();
-    return;
-  }
-
-  char tmp_ver[100];
-  sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
-  std::string version_info(tmp_ver);
-
-  uint64_t file_total_size = 0;
-  uint32_t file_total_num = 0;
-  Version* current = default_cf_handle_->cfd()->current();
-  for (int i = 0; i < current->NumberLevels(); i++) {
-    file_total_num += current->NumLevelFiles(i);
-    file_total_size += current->NumLevelBytes(i);
-  }
-
-  Version::LevelSummaryStorage scratch;
-  const char* file_num_summary = current->LevelSummary(&scratch);
-  std::string file_num_per_level(file_num_summary);
-  std::string data_size_per_level(file_num_summary);
-
-  mutex_.Unlock();
-
-  int64_t unix_ts;
-  env_->GetCurrentTime(&unix_ts);
-
-  logger_->Log_Deploy_Stats(version_info, host_name_,
-      db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
-      data_size_per_level, unix_ts);
-
-  mutex_.Lock();
-  bg_logstats_scheduled_ = false;
-  bg_cv_.SignalAll();
-  mutex_.Unlock();
-#endif
-}
-}
diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc
index 88637ef..eaef2a6 100644
--- a/src/rocksdb/db/db_test.cc
+++ b/src/rocksdb/db/db_test.cc
@@ -11,68 +11,57 @@
 #include <iostream>
 #include <set>
 #include <unistd.h>
+#include <thread>
 #include <unordered_set>
+#include <utility>
 
+#include "db/filename.h"
 #include "db/dbformat.h"
 #include "db/db_impl.h"
 #include "db/filename.h"
+#include "db/job_context.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/perf_context.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
+#include "rocksdb/options.h"
 #include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/convenience.h"
 #include "table/block_based_table_factory.h"
+#include "table/mock_table.h"
 #include "table/plain_table_factory.h"
 #include "util/hash.h"
 #include "util/hash_linklist_rep.h"
 #include "utilities/merge_operators.h"
 #include "util/logging.h"
+#include "util/compression.h"
 #include "util/mutexlock.h"
+#include "util/rate_limiter.h"
 #include "util/statistics.h"
 #include "util/testharness.h"
+#include "util/scoped_arena_iterator.h"
 #include "util/sync_point.h"
 #include "util/testutil.h"
+#include "util/mock_env.h"
+#include "util/string_util.h"
+#include "util/thread_status_util.h"
+#include "util/xfunc.h"
 
 namespace rocksdb {
 
-static bool SnappyCompressionSupported(const CompressionOptions& options) {
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::Snappy_Compress(options, in.data(), in.size(), &out);
-}
-
-static bool ZlibCompressionSupported(const CompressionOptions& options) {
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::Zlib_Compress(options, in.data(), in.size(), &out);
-}
-
-static bool BZip2CompressionSupported(const CompressionOptions& options) {
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::BZip2_Compress(options, in.data(), in.size(), &out);
-}
-
-static bool LZ4CompressionSupported(const CompressionOptions &options) {
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::LZ4_Compress(options, in.data(), in.size(), &out);
-}
-
-static bool LZ4HCCompressionSupported(const CompressionOptions &options) {
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::LZ4HC_Compress(options, in.data(), in.size(), &out);
-}
-
-static std::string RandomString(Random *rnd, int len) {
+static std::string RandomString(Random* rnd, int len) {
   std::string r;
   test::RandomString(rnd, len, &r);
   return r;
@@ -99,28 +88,50 @@ class AtomicCounter {
   }
 };
 
+struct OptionsOverride {
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+  // Used as a bit mask of individual enums in which to skip an XF test point
+  int skip_policy = 0;
+};
+
+}  // namespace anon
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key%06d", i);
+  return std::string(buf);
 }
 
 // Special Env used to delay background operations
 class SpecialEnv : public EnvWrapper {
  public:
+  Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to pretect rnd_
+
   // sstable Sync() calls are blocked while this pointer is non-nullptr.
-  port::AtomicPointer delay_sstable_sync_;
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this pointer is non-nullptr.
+  std::atomic<bool> drop_writes_;
 
   // Simulate no-space errors while this pointer is non-nullptr.
-  port::AtomicPointer no_space_;
+  std::atomic<bool> no_space_;
 
   // Simulate non-writable file system while this pointer is non-nullptr
-  port::AtomicPointer non_writable_;
+  std::atomic<bool> non_writable_;
 
   // Force sync of manifest files to fail while this pointer is non-nullptr
-  port::AtomicPointer manifest_sync_error_;
+  std::atomic<bool> manifest_sync_error_;
 
   // Force write to manifest files to fail while this pointer is non-nullptr
-  port::AtomicPointer manifest_write_error_;
+  std::atomic<bool> manifest_write_error_;
 
   // Force write to log files to fail while this pointer is non-nullptr
-  port::AtomicPointer log_write_error_;
+  std::atomic<bool> log_write_error_;
+
+  // Slow down every log write, in micro-seconds.
+  std::atomic<int> log_write_slowdown_;
 
   bool count_random_reads_;
   anon::AtomicCounter random_read_counter_;
@@ -130,19 +141,41 @@ class SpecialEnv : public EnvWrapper {
 
   anon::AtomicCounter sleep_counter_;
 
-  explicit SpecialEnv(Env* base) : EnvWrapper(base) {
-    delay_sstable_sync_.Release_Store(nullptr);
-    no_space_.Release_Store(nullptr);
-    non_writable_.Release_Store(nullptr);
+  std::atomic<int64_t> bytes_written_;
+
+  std::atomic<int> sync_counter_;
+
+  std::atomic<uint32_t> non_writeable_rate_;
+
+  std::atomic<uint32_t> new_writable_count_;
+
+  std::atomic<uint32_t> non_writable_count_;
+
+  std::function<void()>* table_write_callback_;
+
+  int64_t addon_time_;
+
+  explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301), addon_time_(0) {
+    delay_sstable_sync_.store(false, std::memory_order_release);
+    drop_writes_.store(false, std::memory_order_release);
+    no_space_.store(false, std::memory_order_release);
+    non_writable_.store(false, std::memory_order_release);
     count_random_reads_ = false;
     count_sequential_reads_ = false;
-    manifest_sync_error_.Release_Store(nullptr);
-    manifest_write_error_.Release_Store(nullptr);
-    log_write_error_.Release_Store(nullptr);
-   }
+    manifest_sync_error_.store(false, std::memory_order_release);
+    manifest_write_error_.store(false, std::memory_order_release);
+    log_write_error_.store(false, std::memory_order_release);
+    log_write_slowdown_ = 0;
+    bytes_written_ = 0;
+    sync_counter_ = 0;
+    non_writeable_rate_ = 0;
+    new_writable_count_ = 0;
+    non_writable_count_ = 0;
+    table_write_callback_ = nullptr;
+  }
 
   Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
-                         const EnvOptions& soptions) {
+                         const EnvOptions& soptions) override {
     class SSTableFile : public WritableFile {
      private:
       SpecialEnv* env_;
@@ -153,22 +186,39 @@ class SpecialEnv : public EnvWrapper {
           : env_(env),
             base_(std::move(base)) {
       }
-      Status Append(const Slice& data) {
-        if (env_->no_space_.Acquire_Load() != nullptr) {
+      Status Append(const Slice& data) override {
+        if (env_->table_write_callback_) {
+          (*env_->table_write_callback_)();
+        }
+        if (env_->drop_writes_.load(std::memory_order_acquire)) {
           // Drop writes on the floor
           return Status::OK();
+        } else if (env_->no_space_.load(std::memory_order_acquire)) {
+          return Status::IOError("No space left on device");
         } else {
+          env_->bytes_written_ += data.size();
           return base_->Append(data);
         }
       }
-      Status Close() { return base_->Close(); }
-      Status Flush() { return base_->Flush(); }
-      Status Sync() {
-        while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) {
+      Status Close() override {
+        // Check preallocation size
+        // preallocation size is never passed to base file.
+        size_t preallocation_size = preallocation_block_size();
+        TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+                                 &preallocation_size);
+        return base_->Close();
+      }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
           env_->SleepForMicroseconds(100000);
         }
         return base_->Sync();
       }
+      void SetIOPriority(Env::IOPriority pri) override {
+        base_->SetIOPriority(pri);
+      }
     };
     class ManifestFile : public WritableFile {
      private:
@@ -177,43 +227,67 @@ class SpecialEnv : public EnvWrapper {
      public:
       ManifestFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
           : env_(env), base_(std::move(b)) { }
-      Status Append(const Slice& data) {
-        if (env_->manifest_write_error_.Acquire_Load() != nullptr) {
+      Status Append(const Slice& data) override {
+        if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
           return Status::IOError("simulated writer error");
         } else {
           return base_->Append(data);
         }
       }
-      Status Close() { return base_->Close(); }
-      Status Flush() { return base_->Flush(); }
-      Status Sync() {
-        if (env_->manifest_sync_error_.Acquire_Load() != nullptr) {
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
           return Status::IOError("simulated sync error");
         } else {
           return base_->Sync();
         }
       }
+      uint64_t GetFileSize() override { return base_->GetFileSize(); }
     };
-    class LogFile : public WritableFile {
+    class WalFile : public WritableFile {
      private:
       SpecialEnv* env_;
       unique_ptr<WritableFile> base_;
      public:
-      LogFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
-          : env_(env), base_(std::move(b)) { }
-      Status Append(const Slice& data) {
-        if (env_->log_write_error_.Acquire_Load() != nullptr) {
+      WalFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {}
+      Status Append(const Slice& data) override {
+        if (env_->log_write_error_.load(std::memory_order_acquire)) {
           return Status::IOError("simulated writer error");
         } else {
+          int slowdown =
+              env_->log_write_slowdown_.load(std::memory_order_acquire);
+          if (slowdown > 0) {
+            env_->SleepForMicroseconds(slowdown);
+          }
           return base_->Append(data);
         }
       }
-      Status Close() { return base_->Close(); }
-      Status Flush() { return base_->Flush(); }
-      Status Sync() { return base_->Sync(); }
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        return base_->Sync();
+      }
     };
 
-    if (non_writable_.Acquire_Load() != nullptr) {
+    if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+      uint32_t random_number;
+      {
+        MutexLock l(&rnd_mutex_);
+        random_number = rnd_.Uniform(100);
+      }
+      if (random_number < non_writeable_rate_.load()) {
+        return Status::IOError("simulated random write error");
+      }
+    }
+
+    new_writable_count_++;
+
+    if (non_writable_count_.load() > 0) {
+      non_writable_count_--;
       return Status::IOError("simulated write error");
     }
 
@@ -224,7 +298,7 @@ class SpecialEnv : public EnvWrapper {
       } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
         r->reset(new ManifestFile(this, std::move(*r)));
       } else if (strstr(f.c_str(), "log") != nullptr) {
-        r->reset(new LogFile(this, std::move(*r)));
+        r->reset(new WalFile(this, std::move(*r)));
       }
     }
     return s;
@@ -232,7 +306,7 @@ class SpecialEnv : public EnvWrapper {
 
   Status NewRandomAccessFile(const std::string& f,
                              unique_ptr<RandomAccessFile>* r,
-                             const EnvOptions& soptions) {
+                             const EnvOptions& soptions) override {
     class CountingFile : public RandomAccessFile {
      private:
       unique_ptr<RandomAccessFile> target_;
@@ -243,7 +317,7 @@ class SpecialEnv : public EnvWrapper {
           : target_(std::move(target)), counter_(counter) {
       }
       virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                          char* scratch) const {
+                          char* scratch) const override {
         counter_->Increment();
         return target_->Read(offset, n, result, scratch);
       }
@@ -257,7 +331,7 @@ class SpecialEnv : public EnvWrapper {
   }
 
   Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
-                           const EnvOptions& soptions) {
+                           const EnvOptions& soptions) override {
     class CountingFile : public SequentialFile {
      private:
       unique_ptr<SequentialFile> target_;
@@ -267,11 +341,11 @@ class SpecialEnv : public EnvWrapper {
       CountingFile(unique_ptr<SequentialFile>&& target,
                    anon::AtomicCounter* counter)
           : target_(std::move(target)), counter_(counter) {}
-      virtual Status Read(size_t n, Slice* result, char* scratch) {
+      virtual Status Read(size_t n, Slice* result, char* scratch) override {
         counter_->Increment();
         return target_->Read(n, result, scratch);
       }
-      virtual Status Skip(uint64_t n) { return target_->Skip(n); }
+      virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
     };
 
     Status s = target()->NewSequentialFile(f, r, soptions);
@@ -281,48 +355,64 @@ class SpecialEnv : public EnvWrapper {
     return s;
   }
 
-  virtual void SleepForMicroseconds(int micros) {
+  virtual void SleepForMicroseconds(int micros) override {
     sleep_counter_.Increment();
     target()->SleepForMicroseconds(micros);
   }
-};
 
-class DBTest {
- private:
-  const FilterPolicy* filter_policy_;
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s = target()->GetCurrentTime(unix_time);
+    if (s.ok()) {
+      *unix_time += addon_time_;
+    }
+    return s;
+  }
 
+  virtual uint64_t NowNanos() override {
+    return target()->NowNanos() + addon_time_ * 1000;
+  }
+};
+
+class DBTest : public testing::Test {
  protected:
   // Sequence of option configurations to try
   enum OptionConfig {
-    kBlockBasedTableWithWholeKeyHashIndex,
-    kDefault,
-    kBlockBasedTableWithPrefixHashIndex,
-    kPlainTableFirstBytePrefix,
-    kPlainTableAllBytesPrefix,
-    kVectorRep,
-    kHashLinkList,
-    kHashCuckoo,
-    kMergePut,
-    kFilter,
-    kUncompressed,
-    kNumLevel_3,
-    kDBLogDir,
-    kWalDir,
-    kManifestFileSize,
-    kCompactOnFlush,
-    kPerfOptions,
-    kDeletesFilterFirst,
-    kHashSkipList,
-    kUniversalCompaction,
-    kCompressedBlockCache,
-    kInfiniteMaxOpenFiles,
-    kxxHashChecksum,
-    kEnd
+    kDefault = 0,
+    kBlockBasedTableWithPrefixHashIndex = 1,
+    kBlockBasedTableWithWholeKeyHashIndex = 2,
+    kPlainTableFirstBytePrefix = 3,
+    kPlainTableCappedPrefix = 4,
+    kPlainTableAllBytesPrefix = 5,
+    kVectorRep = 6,
+    kHashLinkList = 7,
+    kHashCuckoo = 8,
+    kMergePut = 9,
+    kFilter = 10,
+    kFullFilter = 11,
+    kUncompressed = 12,
+    kNumLevel_3 = 13,
+    kDBLogDir = 14,
+    kWalDirAndMmapReads = 15,
+    kManifestFileSize = 16,
+    kCompactOnFlush = 17,
+    kPerfOptions = 18,
+    kDeletesFilterFirst = 19,
+    kHashSkipList = 20,
+    kUniversalCompaction = 21,
+    kUniversalCompactionMultiLevel = 22,
+    kCompressedBlockCache = 23,
+    kInfiniteMaxOpenFiles = 24,
+    kxxHashChecksum = 25,
+    kFIFOCompaction = 26,
+    kOptimizeFiltersForHits = 27,
+    kEnd = 28
   };
   int option_config_;
 
  public:
   std::string dbname_;
+  std::string alternative_wal_dir_;
+  MockEnv* mem_env_;
   SpecialEnv* env_;
   DB* db_;
   std::vector<ColumnFamilyHandle*> handles_;
@@ -339,23 +429,42 @@ class DBTest {
     kSkipPlainTable = 8,
     kSkipHashIndex = 16,
     kSkipNoSeekToLast = 32,
-    kSkipHashCuckoo = 64
+    kSkipHashCuckoo = 64,
+    kSkipFIFOCompaction = 128,
+    kSkipMmapReads = 256,
   };
 
+
   DBTest() : option_config_(kDefault),
-             env_(new SpecialEnv(Env::Default())) {
-    filter_policy_ = NewBloomFilterPolicy(10);
-    dbname_ = test::TmpDir() + "/db_test";
-    ASSERT_OK(DestroyDB(dbname_, Options()));
+             mem_env_(!getenv("MEM_ENV") ? nullptr :
+                                           new MockEnv(Env::Default())),
+             env_(new SpecialEnv(mem_env_ ? mem_env_ : Env::Default())) {
+    env_->SetBackgroundThreads(1, Env::LOW);
+    env_->SetBackgroundThreads(1, Env::HIGH);
+    dbname_ = test::TmpDir(env_) + "/db_test";
+    alternative_wal_dir_ = dbname_ + "/wal";
+    auto options = CurrentOptions();
+    auto delete_options = options;
+    delete_options.wal_dir = alternative_wal_dir_;
+    EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it for not alternative WAL dir is used.
+    EXPECT_OK(DestroyDB(dbname_, options));
     db_ = nullptr;
-    Reopen();
+    Reopen(options);
   }
 
   ~DBTest() {
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+    rocksdb::SyncPoint::GetInstance()->LoadDependency({});
+    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
     Close();
-    ASSERT_OK(DestroyDB(dbname_, Options()));
+    Options options;
+    options.db_paths.emplace_back(dbname_, 0);
+    options.db_paths.emplace_back(dbname_ + "_2", 0);
+    options.db_paths.emplace_back(dbname_ + "_3", 0);
+    options.db_paths.emplace_back(dbname_ + "_4", 0);
+    EXPECT_OK(DestroyDB(dbname_, options));
     delete env_;
-    delete filter_policy_;
   }
 
   // Switch to a fresh database with the next option configuration to
@@ -367,7 +476,8 @@ class DBTest {
         continue;
       }
       if ((skip_mask & kSkipUniversalCompaction) &&
-          option_config_ == kUniversalCompaction) {
+          (option_config_ == kUniversalCompaction ||
+           option_config_ == kUniversalCompactionMultiLevel)) {
         continue;
       }
       if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) {
@@ -378,12 +488,13 @@ class DBTest {
            option_config_ == kHashSkipList)) {;
         continue;
       }
-      if ((skip_mask & kSkipPlainTable)
-          && (option_config_ == kPlainTableAllBytesPrefix
-              || option_config_ == kPlainTableFirstBytePrefix)) {
+      if ((skip_mask & kSkipPlainTable) &&
+          (option_config_ == kPlainTableAllBytesPrefix ||
+           option_config_ == kPlainTableFirstBytePrefix ||
+           option_config_ == kPlainTableCappedPrefix)) {
         continue;
       }
-      if ((skip_mask & kSkipPlainTable) &&
+      if ((skip_mask & kSkipHashIndex) &&
           (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
            option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
         continue;
@@ -391,64 +502,119 @@ class DBTest {
       if ((skip_mask & kSkipHashCuckoo) && (option_config_ == kHashCuckoo)) {
         continue;
       }
+      if ((skip_mask & kSkipFIFOCompaction) &&
+          option_config_ == kFIFOCompaction) {
+        continue;
+      }
+      if ((skip_mask & kSkipMmapReads) &&
+          option_config_ == kWalDirAndMmapReads) {
+        continue;
+      }
       break;
     }
 
     if (option_config_ >= kEnd) {
-      Destroy(&last_options_);
+      Destroy(last_options_);
       return false;
     } else {
-      DestroyAndReopen();
+      auto options = CurrentOptions();
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
       return true;
     }
   }
 
   // Switch between different compaction styles (we have only 2 now).
-  bool ChangeCompactOptions(Options* prev_options = nullptr) {
+  bool ChangeCompactOptions() {
     if (option_config_ == kDefault) {
       option_config_ = kUniversalCompaction;
-      if (prev_options == nullptr) {
-        prev_options = &last_options_;
-      }
-      Destroy(prev_options);
-      TryReopen();
+      Destroy(last_options_);
+      auto options = CurrentOptions();
+      options.create_if_missing = true;
+      TryReopen(options);
+      return true;
+    } else if (option_config_ == kUniversalCompaction) {
+      option_config_ = kUniversalCompactionMultiLevel;
+      Destroy(last_options_);
+      auto options = CurrentOptions();
+      options.create_if_missing = true;
+      TryReopen(options);
       return true;
     } else {
       return false;
     }
   }
 
+  // Switch between different filter policy
+  // Jump from kDefault to kFilter to kFullFilter
+  bool ChangeFilterOptions() {
+    if (option_config_ == kDefault) {
+      option_config_ = kFilter;
+    } else if (option_config_ == kFilter) {
+      option_config_ = kFullFilter;
+    } else {
+      return false;
+    }
+    Destroy(last_options_);
+
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  }
+
   // Return the current option configuration.
-  Options CurrentOptions() {
+  Options CurrentOptions(
+      const anon::OptionsOverride& options_override = anon::OptionsOverride()) {
     Options options;
-    return CurrentOptions(options);
+    return CurrentOptions(options, options_override);
   }
 
-  Options CurrentOptions(const Options& defaultOptions) {
+  Options CurrentOptions(
+      const Options& defaultOptions,
+      const anon::OptionsOverride& options_override = anon::OptionsOverride()) {
     // this redudant copy is to minimize code change w/o having lint error.
     Options options = defaultOptions;
+    XFUNC_TEST("", "dbtest_options", inplace_options1, GetXFTestOptions,
+               reinterpret_cast<Options*>(&options),
+               options_override.skip_policy);
+    BlockBasedTableOptions table_options;
+    bool set_block_based_table_factory = true;
     switch (option_config_) {
       case kHashSkipList:
         options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.memtable_factory.reset(NewHashSkipListRepFactory());
+        options.memtable_factory.reset(
+            NewHashSkipListRepFactory(16));
         break;
       case kPlainTableFirstBytePrefix:
         options.table_factory.reset(new PlainTableFactory());
         options.prefix_extractor.reset(NewFixedPrefixTransform(1));
         options.allow_mmap_reads = true;
         options.max_sequential_skip_in_iterations = 999999;
+        set_block_based_table_factory = false;
+        break;
+      case kPlainTableCappedPrefix:
+        options.table_factory.reset(new PlainTableFactory());
+        options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+        options.allow_mmap_reads = true;
+        options.max_sequential_skip_in_iterations = 999999;
+        set_block_based_table_factory = false;
         break;
       case kPlainTableAllBytesPrefix:
         options.table_factory.reset(new PlainTableFactory());
         options.prefix_extractor.reset(NewNoopTransform());
         options.allow_mmap_reads = true;
         options.max_sequential_skip_in_iterations = 999999;
+        set_block_based_table_factory = false;
         break;
       case kMergePut:
         options.merge_operator = MergeOperators::CreatePutOperator();
         break;
       case kFilter:
-        options.filter_policy = filter_policy_;
+        table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+        break;
+      case kFullFilter:
+        table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
         break;
       case kUncompressed:
         options.compression = kNoCompression;
@@ -457,10 +623,13 @@ class DBTest {
         options.num_levels = 3;
         break;
       case kDBLogDir:
-        options.db_log_dir = test::TmpDir();
+        options.db_log_dir = test::TmpDir(env_);
         break;
-      case kWalDir:
-        options.wal_dir = "/tmp/wal";
+      case kWalDirAndMmapReads:
+        options.wal_dir = alternative_wal_dir_;
+        // mmap reads should be orthogonal to WalDir setting, so we piggyback to
+        // this option config to test mmap reads as well
+        options.allow_mmap_reads = true;
         break;
       case kManifestFileSize:
         options.max_manifest_file_size = 50; // 50 bytes
@@ -481,7 +650,8 @@ class DBTest {
         break;
       case kHashLinkList:
         options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
+        options.memtable_factory.reset(
+            NewHashLinkListRepFactory(4, 0, 3, true, 4));
         break;
       case kHashCuckoo:
         options.memtable_factory.reset(
@@ -489,37 +659,55 @@ class DBTest {
         break;
       case kUniversalCompaction:
         options.compaction_style = kCompactionStyleUniversal;
+        options.num_levels = 1;
+        break;
+      case kUniversalCompactionMultiLevel:
+        options.compaction_style = kCompactionStyleUniversal;
+        options.num_levels = 8;
         break;
       case kCompressedBlockCache:
         options.allow_mmap_writes = true;
-        options.block_cache_compressed = NewLRUCache(8*1024*1024);
+        table_options.block_cache_compressed = NewLRUCache(8*1024*1024);
         break;
       case kInfiniteMaxOpenFiles:
         options.max_open_files = -1;
         break;
       case kxxHashChecksum: {
-        BlockBasedTableOptions table_options;
         table_options.checksum = kxxHash;
-        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      }
+      case kFIFOCompaction: {
+        options.compaction_style = kCompactionStyleFIFO;
         break;
       }
       case kBlockBasedTableWithPrefixHashIndex: {
-        BlockBasedTableOptions table_options;
         table_options.index_type = BlockBasedTableOptions::kHashSearch;
-        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
         options.prefix_extractor.reset(NewFixedPrefixTransform(1));
         break;
       }
       case kBlockBasedTableWithWholeKeyHashIndex: {
-        BlockBasedTableOptions table_options;
         table_options.index_type = BlockBasedTableOptions::kHashSearch;
-        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
         options.prefix_extractor.reset(NewNoopTransform());
         break;
       }
+      case kOptimizeFiltersForHits: {
+        options.optimize_filters_for_hits = true;
+        set_block_based_table_factory = true;
+        break;
+      }
+
       default:
         break;
     }
+
+    if (options_override.filter_policy) {
+      table_options.filter_policy = options_override.filter_policy;
+    }
+    if (set_block_based_table_factory) {
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    }
+    options.env = env_;
+    options.create_if_missing = true;
     return options;
   }
 
@@ -528,14 +716,9 @@ class DBTest {
   }
 
   void CreateColumnFamilies(const std::vector<std::string>& cfs,
-                            const ColumnFamilyOptions* options = nullptr) {
-    ColumnFamilyOptions cf_opts;
-    if (options != nullptr) {
-      cf_opts = ColumnFamilyOptions(*options);
-    } else {
-      cf_opts = ColumnFamilyOptions(CurrentOptions());
-    }
-    int cfi = handles_.size();
+                            const Options& options) {
+    ColumnFamilyOptions cf_opts(options);
+    size_t cfi = handles_.size();
     handles_.resize(cfi + cfs.size());
     for (auto cf : cfs) {
       ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
@@ -543,7 +726,7 @@ class DBTest {
   }
 
   void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
-                             const Options* options = nullptr) {
+                             const Options& options) {
     CreateColumnFamilies(cfs, options);
     std::vector<std::string> cfs_plus_default = cfs;
     cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
@@ -551,37 +734,36 @@ class DBTest {
   }
 
   void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
-                                const std::vector<const Options*>& options) {
+                                const std::vector<Options>& options) {
     ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
   }
 
   void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
-                                const Options* options = nullptr) {
+                                const Options& options) {
     ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
   }
 
   Status TryReopenWithColumnFamilies(
       const std::vector<std::string>& cfs,
-      const std::vector<const Options*>& options) {
+      const std::vector<Options>& options) {
     Close();
-    ASSERT_EQ(cfs.size(), options.size());
+    EXPECT_EQ(cfs.size(), options.size());
     std::vector<ColumnFamilyDescriptor> column_families;
     for (size_t i = 0; i < cfs.size(); ++i) {
-      column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i]));
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
     }
-    DBOptions db_opts = DBOptions(*options[0]);
+    DBOptions db_opts = DBOptions(options[0]);
     return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
   }
 
   Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
-                                     const Options* options = nullptr) {
+                                     const Options& options) {
     Close();
-    Options opts = (options == nullptr) ? CurrentOptions() : *options;
-    std::vector<const Options*> v_opts(cfs.size(), &opts);
+    std::vector<Options> v_opts(cfs.size(), options);
     return TryReopenWithColumnFamilies(cfs, v_opts);
   }
 
-  void Reopen(Options* options = nullptr) {
+  void Reopen(const Options& options) {
     ASSERT_OK(TryReopen(options));
   }
 
@@ -594,33 +776,25 @@ class DBTest {
     db_ = nullptr;
   }
 
-  void DestroyAndReopen(Options* options = nullptr) {
+  void DestroyAndReopen(const Options& options) {
     //Destroy using last options
-    Destroy(&last_options_);
+    Destroy(last_options_);
     ASSERT_OK(TryReopen(options));
   }
 
-  void Destroy(Options* options) {
+  void Destroy(const Options& options) {
     Close();
-    ASSERT_OK(DestroyDB(dbname_, *options));
+    ASSERT_OK(DestroyDB(dbname_, options));
   }
 
-  Status ReadOnlyReopen(Options* options) {
-    return DB::OpenForReadOnly(*options, dbname_, &db_);
+  Status ReadOnlyReopen(const Options& options) {
+    return DB::OpenForReadOnly(options, dbname_, &db_);
   }
 
-  Status TryReopen(Options* options = nullptr) {
+  Status TryReopen(const Options& options) {
     Close();
-    Options opts;
-    if (options != nullptr) {
-      opts = *options;
-    } else {
-      opts = CurrentOptions();
-      opts.create_if_missing = true;
-    }
-    last_options_ = opts;
-
-    return DB::Open(opts, dbname_, &db_);
+    last_options_ = options;
+    return DB::Open(options, dbname_, &db_);
   }
 
   Status Flush(int cf = 0) {
@@ -685,6 +859,19 @@ class DBTest {
     return result;
   }
 
+  uint64_t GetNumSnapshots() {
+    uint64_t int_num;
+    EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
+    return int_num;
+  }
+
+  uint64_t GetTimeOldestSnapshots() {
+    uint64_t int_num;
+    EXPECT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
+    return int_num;
+  }
+
   // Return a string that contains all key,value pairs in order,
   // formatted like "(k1->v1)(k2->v2)".
   std::string Contents(int cf = 0) {
@@ -703,22 +890,23 @@ class DBTest {
     // Check reverse iteration results are the reverse of forward results
     unsigned int matched = 0;
     for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
-      ASSERT_LT(matched, forward.size());
-      ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+      EXPECT_LT(matched, forward.size());
+      EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
       matched++;
     }
-    ASSERT_EQ(matched, forward.size());
+    EXPECT_EQ(matched, forward.size());
 
     delete iter;
     return result;
   }
 
   std::string AllEntriesFor(const Slice& user_key, int cf = 0) {
-    Iterator* iter;
+    Arena arena;
+    ScopedArenaIterator iter;
     if (cf == 0) {
-      iter = dbfull()->TEST_NewInternalIterator();
+      iter.set(dbfull()->TEST_NewInternalIterator(&arena));
     } else {
-      iter = dbfull()->TEST_NewInternalIterator(handles_[cf]);
+      iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf]));
     }
     InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
     iter->Seek(target.Encode());
@@ -763,24 +951,75 @@ class DBTest {
       }
       result += "]";
     }
-    delete iter;
     return result;
   }
 
+  int NumSortedRuns(int cf = 0) {
+    ColumnFamilyMetaData cf_meta;
+    if (cf == 0) {
+      db_->GetColumnFamilyMetaData(&cf_meta);
+    } else {
+      db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+    }
+    int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+    for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+      if (cf_meta.levels[i].files.size() > 0) {
+        num_sr++;
+      }
+    }
+    return num_sr;
+  }
+
+  uint64_t TotalSize(int cf = 0) {
+    ColumnFamilyMetaData cf_meta;
+    if (cf == 0) {
+      db_->GetColumnFamilyMetaData(&cf_meta);
+    } else {
+      db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+    }
+    return cf_meta.size;
+  }
+
   int NumTableFilesAtLevel(int level, int cf = 0) {
     std::string property;
     if (cf == 0) {
       // default cfd
-      ASSERT_TRUE(db_->GetProperty(
+      EXPECT_TRUE(db_->GetProperty(
           "rocksdb.num-files-at-level" + NumberToString(level), &property));
     } else {
-      ASSERT_TRUE(db_->GetProperty(
+      EXPECT_TRUE(db_->GetProperty(
           handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
           &property));
     }
     return atoi(property.c_str());
   }
 
+  uint64_t SizeAtLevel(int level) {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    uint64_t sum = 0;
+    for (const auto& m : metadata) {
+      if (m.level == level) {
+        sum += m.size;
+      }
+    }
+    return sum;
+  }
+
+  int TotalLiveFiles(int cf = 0) {
+    ColumnFamilyMetaData cf_meta;
+    if (cf == 0) {
+      db_->GetColumnFamilyMetaData(&cf_meta);
+    } else {
+      db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+    }
+    int num_files = 0;
+    for (auto& level : cf_meta.levels) {
+      num_files += level.files.size();
+    }
+    return num_files;
+  }
+
   int TotalTableFiles(int cf = 0, int levels = -1) {
     if (levels == -1) {
       levels = CurrentOptions().num_levels;
@@ -797,7 +1036,7 @@ class DBTest {
     int num_levels =
         (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
     std::string result;
-    int last_non_zero_offset = 0;
+    size_t last_non_zero_offset = 0;
     for (int level = 0; level < num_levels; level++) {
       int f = NumTableFilesAtLevel(level, cf);
       char buf[100];
@@ -811,7 +1050,7 @@ class DBTest {
     return result;
   }
 
-  int CountFiles() {
+  size_t CountFiles() {
     std::vector<std::string> files;
     env_->GetChildren(dbname_, &files);
 
@@ -820,10 +1059,10 @@ class DBTest {
       env_->GetChildren(last_options_.wal_dir, &logfiles);
     }
 
-    return static_cast<int>(files.size() + logfiles.size());
+    return files.size() + logfiles.size();
   }
 
-  int CountLiveFiles() {
+  size_t CountLiveFiles() {
     std::vector<LiveFileMetaData> metadata;
     db_->GetLiveFilesMetaData(&metadata);
     return metadata.size();
@@ -840,6 +1079,12 @@ class DBTest {
     return size;
   }
 
+  void Compact(int cf, const Slice& start, const Slice& limit,
+               uint32_t target_path_id) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit, false, -1,
+                                target_path_id));
+  }
+
   void Compact(int cf, const Slice& start, const Slice& limit) {
     ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
   }
@@ -885,6 +1130,44 @@ class DBTest {
     return property;
   }
 
+  int GetSstFileCount(std::string path) {
+    std::vector<std::string> files;
+    env_->GetChildren(path, &files);
+
+    int sst_count = 0;
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < files.size(); i++) {
+      if (ParseFileName(files[i], &number, &type) && type == kTableFile) {
+        sst_count++;
+      }
+    }
+    return sst_count;
+  }
+
+  // this will generate non-overlapping files since it keeps increasing key_idx
+  void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false) {
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 10) ? 1 : 10000)));
+      (*key_idx)++;
+    }
+    if (!nowait) {
+      dbfull()->TEST_WaitForFlushMemTable();
+      dbfull()->TEST_WaitForCompact();
+    }
+  }
+
+  void GenerateNewRandomFile(Random* rnd, bool nowait = false) {
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 1000)));
+    }
+    ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 1)));
+    if (!nowait) {
+      dbfull()->TEST_WaitForFlushMemTable();
+      dbfull()->TEST_WaitForCompact();
+    }
+  }
+
   std::string IterStatus(Iterator* iter) {
     std::string result;
     if (iter->Valid()) {
@@ -906,8 +1189,8 @@ class DBTest {
       const SequenceNumber seq) {
     unique_ptr<TransactionLogIterator> iter;
     Status status = dbfull()->GetUpdatesSince(seq, &iter);
-    ASSERT_OK(status);
-    ASSERT_TRUE(iter->Valid());
+    EXPECT_OK(status);
+    EXPECT_TRUE(iter->Valid());
     return std::move(iter);
   }
 
@@ -977,11 +1260,12 @@ class DBTest {
 
   // Utility method to test InplaceUpdate
   void validateNumberOfEntries(int numValues, int cf = 0) {
-    Iterator* iter;
+    ScopedArenaIterator iter;
+    Arena arena;
     if (cf != 0) {
-      iter = dbfull()->TEST_NewInternalIterator(handles_[cf]);
+      iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf]));
     } else {
-      iter = dbfull()->TEST_NewInternalIterator();
+      iter.set(dbfull()->TEST_NewInternalIterator(&arena));
     }
     iter->SeekToFirst();
     ASSERT_EQ(iter->status().ok(), true);
@@ -995,7 +1279,6 @@ class DBTest {
       ASSERT_EQ(ikey.sequence, (unsigned)seq--);
       iter->Next();
     }
-    delete iter;
     ASSERT_EQ(0, seq);
   }
 
@@ -1025,19 +1308,13 @@ class DBTest {
 
 };
 
-static std::string Key(int i) {
-  char buf[100];
-  snprintf(buf, sizeof(buf), "key%06d", i);
-  return std::string(buf);
-}
-
 static long TestGetTickerCount(const Options& options, Tickers ticker_type) {
   return options.statistics->getTickerCount(ticker_type);
 }
 
 // A helper function that ensures the table properties returned in
 // `GetPropertiesOfAllTablesTest` is correct.
-// This test assumes entries size is differnt for each of the tables.
+// This test assumes entries size is different for each of the tables.
 namespace {
 void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
   TablePropertiesCollection props;
@@ -1056,15 +1333,26 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
   ASSERT_EQ(props.size(), unique_entries.size());
   ASSERT_EQ(expected_entries_size, sum);
 }
+
+uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+                                            std::string column_family_name) {
+  std::vector<LiveFileMetaData> metadata;
+  db->GetLiveFilesMetaData(&metadata);
+  uint64_t result = 0;
+  for (auto& fileMetadata : metadata) {
+    result += (fileMetadata.column_family_name == column_family_name);
+  }
+  return result;
+}
 }  // namespace
 
-TEST(DBTest, Empty) {
+TEST_F(DBTest, Empty) {
   do {
     Options options;
     options.env = env_;
     options.write_buffer_size = 100000;  // Small write buffer
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     std::string num;
     ASSERT_TRUE(dbfull()->GetProperty(
@@ -1077,7 +1365,8 @@ TEST(DBTest, Empty) {
         handles_[1], "rocksdb.num-entries-active-mem-table", &num));
     ASSERT_EQ("1", num);
 
-    env_->delay_sstable_sync_.Release_Store(env_);  // Block sync calls
+    // Block sync calls
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
     Put(1, "k1", std::string(100000, 'x'));         // Fill memtable
     ASSERT_TRUE(dbfull()->GetProperty(
         handles_[1], "rocksdb.num-entries-active-mem-table", &num));
@@ -1089,18 +1378,66 @@ TEST(DBTest, Empty) {
     ASSERT_EQ("1", num);
 
     ASSERT_EQ("v1", Get(1, "foo"));
-    env_->delay_sstable_sync_.Release_Store(nullptr);   // Release sync calls
+    // Release sync calls
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("1", num);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("2", num);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("3", num);
+
+    ASSERT_OK(db_->EnableFileDeletions(false));
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("2", num);
+
+    ASSERT_OK(db_->EnableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
   } while (ChangeOptions());
 }
 
-TEST(DBTest, ReadOnlyDB) {
+TEST_F(DBTest, WriteEmptyBatch) {
+  Options options;
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  env_->sync_counter_.store(0);
+  WriteOptions wo;
+  wo.sync = true;
+  wo.disableWAL = false;
+  WriteBatch empty_batch;
+  ASSERT_OK(dbfull()->Write(wo, &empty_batch));
+  ASSERT_GE(env_->sync_counter_.load(), 1);
+
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+  ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBTest, ReadOnlyDB) {
   ASSERT_OK(Put("foo", "v1"));
   ASSERT_OK(Put("bar", "v2"));
   ASSERT_OK(Put("foo", "v3"));
   Close();
 
-  Options options;
-  ASSERT_OK(ReadOnlyReopen(&options));
+  auto options = CurrentOptions();
+  assert(options.env = env_);
+  ASSERT_OK(ReadOnlyReopen(options));
   ASSERT_EQ("v3", Get("foo"));
   ASSERT_EQ("v2", Get("bar"));
   Iterator* iter = db_->NewIterator(ReadOptions());
@@ -1111,20 +1448,122 @@ TEST(DBTest, ReadOnlyDB) {
   }
   ASSERT_EQ(count, 2);
   delete iter;
+  Close();
+
+  // Reopen and flush memtable.
+  Reopen(options);
+  Flush();
+  Close();
+  // Now check keys in read only mode.
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+}
+
+TEST_F(DBTest, CompactedDB) {
+  const uint64_t kFileSize = 1 << 20;
+  Options options;
+  options.disable_auto_compactions = true;
+  options.max_mem_compaction_level = 0;
+  options.write_buffer_size = kFileSize;
+  options.target_file_size_base = kFileSize;
+  options.max_bytes_for_level_base = 1 << 30;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  Reopen(options);
+  // 1 L0 file, use CompactedDB if max_open_files = -1
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+  Flush();
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  Reopen(options);
+  // Add more L0 files
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+  Flush();
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+  Flush();
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+  Flush();
+  Close();
+
+  ASSERT_OK(ReadOnlyReopen(options));
+  // Fallback to read-only DB
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+  Close();
+
+  // Full compaction
+  Reopen(options);
+  // Add more keys
+  ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+  ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+  ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+  ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+  db_->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+  Close();
+
+  // CompactedDB
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ("NOT_FOUND", Get("abc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+  ASSERT_EQ("NOT_FOUND", Get("ccc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+  ASSERT_EQ("NOT_FOUND", Get("ggg"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+  ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+  // MultiGet
+  std::vector<std::string> values;
+  std::vector<Status> status_list = dbfull()->MultiGet(ReadOptions(),
+      std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+                          Slice("ggg"), Slice("iii"), Slice("kkk")}),
+      &values);
+  ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+  ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+  ASSERT_OK(status_list[0]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+  ASSERT_TRUE(status_list[1].IsNotFound());
+  ASSERT_OK(status_list[2]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+  ASSERT_TRUE(status_list[3].IsNotFound());
+  ASSERT_OK(status_list[4]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+  ASSERT_TRUE(status_list[5].IsNotFound());
 }
 
 // Make sure that when options.block_cache is set, after a new table is
 // created its index/filter blocks are added to block cache.
-TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+TEST_F(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
   Options options = CurrentOptions();
-  std::unique_ptr<const FilterPolicy> filter_policy(NewBloomFilterPolicy(20));
-  options.filter_policy = filter_policy.get();
   options.create_if_missing = true;
   options.statistics = rocksdb::CreateDBStatistics();
   BlockBasedTableOptions table_options;
   table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
   options.table_factory.reset(new BlockBasedTableFactory(table_options));
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   ASSERT_OK(Put(1, "key", "val"));
   // Create a new table.
@@ -1136,6 +1575,10 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
   ASSERT_EQ(2, /* only index/filter were added */
             TestGetTickerCount(options, BLOCK_CACHE_ADD));
   ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+  uint64_t int_num;
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
 
   // Make sure filter block is in cache.
   std::string value;
@@ -1163,96 +1606,183 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
             TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
 }
 
-TEST(DBTest, GetPropertiesOfAllTablesTest) {
+TEST_F(DBTest, ParanoidFileChecks) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.level0_file_num_compaction_trigger = 2;
+  options.paranoid_file_checks = true;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "1_key", "val"));
+  ASSERT_OK(Put(1, "9_key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(1, /* read and cache data block */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  ASSERT_OK(Put(1, "1_key2", "val2"));
+  ASSERT_OK(Put(1, "9_key2", "val2"));
+  // Create a new SST file. This will further trigger a compaction
+  // and generate another file.
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(3, /* Totally 3 files created up to now */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // After disabling options.paranoid_file_checks. NO further block
+  // is added after generating a new file.
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+  ASSERT_OK(Put(1, "1_key3", "val3"));
+  ASSERT_OK(Put(1, "9_key3", "val3"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "1_key4", "val4"));
+  ASSERT_OK(Put(1, "9_key4", "val4"));
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(3, /* Totally 3 files created up to now */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBTest, GetPropertiesOfAllTablesTest) {
   Options options = CurrentOptions();
-  Reopen(&options);
+  options.max_background_flushes = 0;
+  Reopen(options);
   // Create 4 tables
   for (int table = 0; table < 4; ++table) {
     for (int i = 0; i < 10 + table; ++i) {
-      db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val");
+      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
     }
     db_->Flush(FlushOptions());
   }
 
   // 1. Read table properties directly from file
-  Reopen(&options);
+  Reopen(options);
   VerifyTableProperties(db_, 10 + 11 + 12 + 13);
 
   // 2. Put two tables to table cache and
-  Reopen(&options);
+  Reopen(options);
   // fetch key from 1st and 2nd table, which will internally place that table to
   // the table cache.
   for (int i = 0; i < 2; ++i) {
-    Get(std::to_string(i * 100 + 0));
+    Get(ToString(i * 100 + 0));
   }
 
   VerifyTableProperties(db_, 10 + 11 + 12 + 13);
 
   // 3. Put all tables to table cache
-  Reopen(&options);
+  Reopen(options);
   // fetch key from 1st and 2nd table, which will internally place that table to
   // the table cache.
   for (int i = 0; i < 4; ++i) {
-    Get(std::to_string(i * 100 + 0));
+    Get(ToString(i * 100 + 0));
   }
   VerifyTableProperties(db_, 10 + 11 + 12 + 13);
 }
 
-TEST(DBTest, LevelLimitReopen) {
-  Options options = CurrentOptions();
-  CreateAndReopenWithCF({"pikachu"}, &options);
+class CoutingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "CoutingUserTblPropCollector"; }
 
-  const std::string value(1024 * 1024, ' ');
-  int i = 0;
-  while (NumTableFilesAtLevel(2, 1) == 0) {
-    ASSERT_OK(Put(1, Key(i++), value));
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{
+        {"CoutingUserTblPropCollector", message_}, {"Count", encoded},
+    };
+    return Status::OK();
   }
 
-  options.num_levels = 1;
-  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
-  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options);
-  ASSERT_EQ(s.IsInvalidArgument(), true);
-  ASSERT_EQ(s.ToString(),
-            "Invalid argument: db has more levels than options.num_levels");
+  Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type,
+                    SequenceNumber seq, uint64_t file_size) override {
+    ++count_;
+    return Status::OK();
+  }
 
-  options.num_levels = 10;
-  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
-  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, &options));
-}
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+};
+
+class CoutingUserTblPropCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
+    return new CoutingUserTblPropCollector();
+  }
+  const char* Name() const override {
+    return "CoutingUserTblPropCollectorFactory";
+  }
+};
+
+TEST_F(DBTest, GetUserDefinedTablaProperties) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 0;
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] =
+      std::make_shared<CoutingUserTblPropCollectorFactory>();
+  Reopen(options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+    }
+    db_->Flush(FlushOptions());
+  }
 
-TEST(DBTest, Preallocation) {
-  const std::string src = dbname_ + "/alloc_test";
-  unique_ptr<WritableFile> srcfile;
-  const EnvOptions soptions;
-  ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
-  srcfile->SetPreallocationBlockSize(1024 * 1024);
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(4U, props.size());
+  uint32_t sum = 0;
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("CoutingUserTblPropCollector") !=
+                user_collected.end());
+    ASSERT_EQ(user_collected.at("CoutingUserTblPropCollector"), "Rocksdb");
+    ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+    Slice key(user_collected.at("Count"));
+    uint32_t count;
+    ASSERT_TRUE(GetVarint32(&key, &count));
+    sum += count;
+  }
+  ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+}
 
-  // No writes should mean no preallocation
-  size_t block_size, last_allocated_block;
-  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
-  ASSERT_EQ(last_allocated_block, 0UL);
+TEST_F(DBTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-  // Small write should preallocate one block
-  srcfile->Append("test");
-  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
-  ASSERT_EQ(last_allocated_block, 1UL);
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
+  }
 
-  // Write an entire preallocation block, make sure we increased by two.
-  std::string buf(block_size, ' ');
-  srcfile->Append(buf);
-  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
-  ASSERT_EQ(last_allocated_block, 2UL);
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
 
-  // Write five more blocks at once, ensure we're where we need to be.
-  buf = std::string(block_size * 5, ' ');
-  srcfile->Append(buf);
-  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
-  ASSERT_EQ(last_allocated_block, 7UL);
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
 }
 
-TEST(DBTest, PutDeleteGet) {
+TEST_F(DBTest, PutDeleteGet) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_EQ("v1", Get(1, "foo"));
     ASSERT_OK(Put(1, "foo", "v2"));
@@ -1262,30 +1792,31 @@ TEST(DBTest, PutDeleteGet) {
   } while (ChangeOptions());
 }
 
-
-TEST(DBTest, GetFromImmutableLayer) {
+TEST_F(DBTest, GetFromImmutableLayer) {
   do {
     Options options;
     options.env = env_;
     options.write_buffer_size = 100000;  // Small write buffer
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_EQ("v1", Get(1, "foo"));
 
-    env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
+    // Block sync calls
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
     Put(1, "k1", std::string(100000, 'x'));          // Fill memtable
     Put(1, "k2", std::string(100000, 'y'));          // Trigger flush
     ASSERT_EQ("v1", Get(1, "foo"));
     ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
-    env_->delay_sstable_sync_.Release_Store(nullptr);   // Release sync calls
+    // Release sync calls
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
   } while (ChangeOptions());
 }
 
-TEST(DBTest, GetFromVersions) {
+TEST_F(DBTest, GetFromVersions) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_OK(Flush(1));
     ASSERT_EQ("v1", Get(1, "foo"));
@@ -1293,14 +1824,21 @@ TEST(DBTest, GetFromVersions) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, GetSnapshot) {
+TEST_F(DBTest, GetSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
     // Try with both a short key and a long key
     for (int i = 0; i < 2; i++) {
       std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
       ASSERT_OK(Put(1, key, "v1"));
       const Snapshot* s1 = db_->GetSnapshot();
+      if (option_config_ == kHashCuckoo) {
+        // NOt supported case.
+        ASSERT_TRUE(s1 == nullptr);
+        break;
+      }
       ASSERT_OK(Put(1, key, "v2"));
       ASSERT_EQ("v2", Get(1, key));
       ASSERT_EQ("v1", Get(1, key, s1));
@@ -1309,13 +1847,69 @@ TEST(DBTest, GetSnapshot) {
       ASSERT_EQ("v1", Get(1, key, s1));
       db_->ReleaseSnapshot(s1);
     }
-    // skip as HashCuckooRep does not support snapshot
-  } while (ChangeOptions(kSkipHashCuckoo));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetSnapshotLink) {
+  do {
+    Options options;
+    const std::string snapshot_name = test::TmpDir(env_) + "/snapshot";
+    DB* snapshotDB;
+    ReadOptions roptions;
+    std::string result;
+    Checkpoint* checkpoint;
+
+    options = CurrentOptions(options);
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    ASSERT_OK(DestroyDB(snapshot_name, options));
+    env_->DeleteDir(snapshot_name);
+
+    // Create a database
+    Status s;
+    options.create_if_missing = true;
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    std::string key = std::string("foo");
+    ASSERT_OK(Put(key, "v1"));
+    // Take a snapshot
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name));
+    ASSERT_OK(Put(key, "v2"));
+    ASSERT_EQ("v2", Get(key));
+    ASSERT_OK(Flush());
+    ASSERT_EQ("v2", Get(key));
+    // Open snapshot and verify contents while DB is running
+    options.create_if_missing = false;
+    ASSERT_OK(DB::Open(options, snapshot_name, &snapshotDB));
+    ASSERT_OK(snapshotDB->Get(roptions, key, &result));
+    ASSERT_EQ("v1", result);
+    delete snapshotDB;
+    snapshotDB = nullptr;
+    delete db_;
+    db_ = nullptr;
+
+    // Destroy original DB
+    ASSERT_OK(DestroyDB(dbname_, options));
+
+    // Open snapshot and verify contents
+    options.create_if_missing = false;
+    dbname_ = snapshot_name;
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    ASSERT_EQ("v1", Get(key));
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    delete checkpoint;
+
+    // Restore DB name
+    dbname_ = test::TmpDir(env_) + "/db_test";
+  } while (ChangeOptions());
 }
 
-TEST(DBTest, GetLevel0Ordering) {
+TEST_F(DBTest, GetLevel0Ordering) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     // Check that we process level-0 files in correct order.  The code
     // below generates two level-0 files where the earlier one comes
     // before the later one in the level-0 file list since the earlier
@@ -1329,9 +1923,19 @@ TEST(DBTest, GetLevel0Ordering) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, GetOrderedByLevels) {
+TEST_F(DBTest, WrongLevel0Config) {
+  Options options = CurrentOptions();
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+  options.level0_stop_writes_trigger = 1;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_file_num_compaction_trigger = 3;
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+}
+
+TEST_F(DBTest, GetOrderedByLevels) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v1"));
     Compact(1, "a", "z");
     ASSERT_EQ("v1", Get(1, "foo"));
@@ -1342,9 +1946,9 @@ TEST(DBTest, GetOrderedByLevels) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, GetPicksCorrectFile) {
+TEST_F(DBTest, GetPicksCorrectFile) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     // Arrange to have multiple files in a non-level-0 level.
     ASSERT_OK(Put(1, "a", "va"));
     Compact(1, "a", "b");
@@ -1358,16 +1962,19 @@ TEST(DBTest, GetPicksCorrectFile) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, GetEncountersEmptyLevel) {
+TEST_F(DBTest, GetEncountersEmptyLevel) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    Options options = CurrentOptions();
+    options.max_background_flushes = 0;
+    options.disableDataSync = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
     // Arrange for the following to happen:
     //   * sstable A in level 0
     //   * nothing in level 1
     //   * sstable B in level 2
     // Then do enough Get() calls to arrange for an automatic compaction
     // of sstable A.  A bug would cause the compaction to be marked as
-    // occuring at level 1 (instead of the correct level 0).
+    // occurring at level 1 (instead of the correct level 0).
 
     // Step 1: First place sstables in levels 0 and 2
     int compaction_count = 0;
@@ -1394,20 +2001,21 @@ TEST(DBTest, GetEncountersEmptyLevel) {
     env_->SleepForMicroseconds(1000000);
 
     ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);  // XXX
-  } while (ChangeOptions(kSkipUniversalCompaction));
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
 }
 
 // KeyMayExist can lead to a few false positives, but not false negatives.
 // To make test deterministic, use a much larger number of bits per key-20 than
 // bits in the key, so that false positives are eliminated
-TEST(DBTest, KeyMayExist) {
+TEST_F(DBTest, KeyMayExist) {
   do {
     ReadOptions ropts;
     std::string value;
-    Options options = CurrentOptions();
-    options.filter_policy = NewBloomFilterPolicy(20);
+    anon::OptionsOverride options_override;
+    options_override.filter_policy.reset(NewBloomFilterPolicy(20));
+    Options options = CurrentOptions(options_override);
     options.statistics = rocksdb::CreateDBStatistics();
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
 
@@ -1456,20 +2064,19 @@ TEST(DBTest, KeyMayExist) {
     ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
     ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
-    delete options.filter_policy;
-
     // KeyMayExist function only checks data in block caches, which is not used
     // by plain table format.
-  } while (ChangeOptions(kSkipPlainTable | kSkipHashIndex));
+  } while (
+      ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
 }
 
-TEST(DBTest, NonBlockingIteration) {
+TEST_F(DBTest, NonBlockingIteration) {
   do {
     ReadOptions non_blocking_opts, regular_opts;
     Options options = CurrentOptions();
     options.statistics = rocksdb::CreateDBStatistics();
     non_blocking_opts.read_tier = kBlockCacheTier;
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     // write one kv to the database.
     ASSERT_OK(Put(1, "a", "b"));
 
@@ -1523,18 +2130,84 @@ TEST(DBTest, NonBlockingIteration) {
     // This test verifies block cache behaviors, which is not used by plain
     // table format.
     // Exclude kHashCuckoo as it does not support iteration currently
-  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast |
-                         kSkipHashCuckoo));
+  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo |
+                         kSkipMmapReads));
+}
+
+TEST_F(DBTest, ManagedNonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+    non_blocking_opts.managed = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
+
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // flush memtable to storage. Now, the key should not be in the
+    // memtable neither in the block cache.
+    ASSERT_OK(Flush(1));
+
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+    int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      count++;
+    }
+    ASSERT_EQ(count, 0);
+    ASSERT_TRUE(iter->status().IsIncomplete());
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // read in the specified block via a regular get
+    ASSERT_EQ(Get(1, "a"), "b");
+
+    // verify that we can find it via a non-blocking scan
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // This test verifies block cache behaviors, which is not used by plain
+    // table format.
+    // Exclude kHashCuckoo as it does not support iteration currently
+  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo |
+                         kSkipMmapReads));
 }
 
 // A delete is skipped for key if KeyMayExist(key) returns False
 // Tests Writebatch consistency and proper delete behaviour
-TEST(DBTest, FilterDeletes) {
+TEST_F(DBTest, FilterDeletes) {
   do {
-    Options options = CurrentOptions();
-    options.filter_policy = NewBloomFilterPolicy(20);
+    anon::OptionsOverride options_override;
+    options_override.filter_policy.reset(NewBloomFilterPolicy(20));
+    Options options = CurrentOptions(options_override);
     options.filter_deletes = true;
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     WriteBatch batch;
 
     batch.Delete(handles_[1], "a");
@@ -1562,13 +2235,195 @@ TEST(DBTest, FilterDeletes) {
     dbfull()->Write(WriteOptions(), &batch);
     ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]");  // Delete issued
     batch.Clear();
-
-    delete options.filter_policy;
   } while (ChangeCompactOptions());
 }
 
+TEST_F(DBTest, GetFilterByPrefixBloom) {
+  Options options = last_options_;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  FlushOptions fo;
+  fo.wait = true;
+  std::string value;
+
+  ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+  ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+  ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+  dbfull()->Flush(fo);
+
+  ASSERT_EQ("foo", Get("barbarbar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ("foo2", Get("barbarbar2"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+  ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+  ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+}
+
+TEST_F(DBTest, WholeKeyFilterProp) {
+  Options options = last_options_;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.statistics = rocksdb::CreateDBStatistics();
+
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
 
-TEST(DBTest, IterSeekBeforePrev) {
+  WriteOptions wo;
+  ReadOptions ro;
+  FlushOptions fo;
+  fo.wait = true;
+  std::string value;
+
+  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+  // Needs insert some keys to make sure files are not filtered out by key
+  // ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  dbfull()->Flush(fo);
+
+  Reopen(options);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+  // Reopen with whole key filtering enabled and prefix extractor
+  // NULL. Bloom filter should be off for both of whole key and
+  // prefix bloom.
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.prefix_extractor.reset();
+  Reopen(options);
+
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  // Write DB with only full key filtering.
+  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+  // Needs insert some keys to make sure files are not filtered out by key
+  // ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  db_->CompactRange(nullptr, nullptr);
+
+  // Reopen with both of whole key off and prefix extractor enabled.
+  // Still no bloom filter should be used.
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+  // Try to create a DB with mixed files:
+  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+  // Needs insert some keys to make sure files are not filtered out by key
+  // ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  db_->CompactRange(nullptr, nullptr);
+
+  options.prefix_extractor.reset();
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  // Try to create a DB with mixed files.
+  ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+  // In this case needs insert some keys to make sure files are
+  // not filtered out by key ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  Flush();
+
+  // Now we have two files:
+  // File 1: An older file with prefix bloom.
+  // File 2: A newer file with whole bloom filter.
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+  // Reopen with the same setting: only whole key is used
+  Reopen(options);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+  // Restart with both filters are allowed
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+  // File 1 will has it filtered out.
+  // File 2 will not, as prefix `foo` exists in the file.
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+  // Restart with only prefix bloom is allowed.
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+}
+
+TEST_F(DBTest, IterSeekBeforePrev) {
   ASSERT_OK(Put("a", "b"));
   ASSERT_OK(Put("c", "d"));
   dbfull()->Flush(FlushOptions());
@@ -1590,7 +2445,7 @@ std::string MakeLongKey(size_t length, char c) {
 }
 }  // namespace
 
-TEST(DBTest, IterLongKeys) {
+TEST_F(DBTest, IterLongKeys) {
   ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
   ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
   ASSERT_OK(Put("a", "b"));
@@ -1623,8 +2478,7 @@ TEST(DBTest, IterLongKeys) {
   delete iter;
 }
 
-
-TEST(DBTest, IterNextWithNewerSeq) {
+TEST_F(DBTest, IterNextWithNewerSeq) {
   ASSERT_OK(Put("0", "0"));
   dbfull()->Flush(FlushOptions());
   ASSERT_OK(Put("a", "b"));
@@ -1645,7 +2499,7 @@ TEST(DBTest, IterNextWithNewerSeq) {
   delete iter;
 }
 
-TEST(DBTest, IterPrevWithNewerSeq) {
+TEST_F(DBTest, IterPrevWithNewerSeq) {
   ASSERT_OK(Put("0", "0"));
   dbfull()->Flush(FlushOptions());
   ASSERT_OK(Put("a", "b"));
@@ -1670,7 +2524,7 @@ TEST(DBTest, IterPrevWithNewerSeq) {
   delete iter;
 }
 
-TEST(DBTest, IterPrevWithNewerSeq2) {
+TEST_F(DBTest, IterPrevWithNewerSeq2) {
   ASSERT_OK(Put("0", "0"));
   dbfull()->Flush(FlushOptions());
   ASSERT_OK(Put("a", "b"));
@@ -1693,9 +2547,9 @@ TEST(DBTest, IterPrevWithNewerSeq2) {
   delete iter;
 }
 
-TEST(DBTest, IterEmpty) {
+TEST_F(DBTest, IterEmpty) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
 
     iter->SeekToFirst();
@@ -1711,9 +2565,9 @@ TEST(DBTest, IterEmpty) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, IterSingle) {
+TEST_F(DBTest, IterSingle) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "a", "va"));
     Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
 
@@ -1752,9 +2606,9 @@ TEST(DBTest, IterSingle) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, IterMulti) {
+TEST_F(DBTest, IterMulti) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "a", "va"));
     ASSERT_OK(Put(1, "b", "vb"));
     ASSERT_OK(Put(1, "c", "vc"));
@@ -1841,13 +2695,15 @@ TEST(DBTest, IterMulti) {
 
 // Check that we can skip over a run of user keys
 // by using reseek rather than sequential scan
-TEST(DBTest, IterReseek) {
-  Options options = CurrentOptions();
+TEST_F(DBTest, IterReseek) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
   options.max_sequential_skip_in_iterations = 3;
   options.create_if_missing = true;
   options.statistics = rocksdb::CreateDBStatistics();
-  DestroyAndReopen(&options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   // insert two keys with same userkey and verify that
   // reseek is not invoked. For each of these test cases,
@@ -1924,9 +2780,9 @@ TEST(DBTest, IterReseek) {
   delete iter;
 }
 
-TEST(DBTest, IterSmallAndLargeMix) {
+TEST_F(DBTest, IterSmallAndLargeMix) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "a", "va"));
     ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
     ASSERT_OK(Put(1, "c", "vc"));
@@ -1965,9 +2821,9 @@ TEST(DBTest, IterSmallAndLargeMix) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, IterMultiWithDelete) {
+TEST_F(DBTest, IterMultiWithDelete) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "ka", "va"));
     ASSERT_OK(Put(1, "kb", "vb"));
     ASSERT_OK(Put(1, "kc", "vc"));
@@ -1990,9 +2846,9 @@ TEST(DBTest, IterMultiWithDelete) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, IterPrevMaxSkip) {
+TEST_F(DBTest, IterPrevMaxSkip) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     for (int i = 0; i < 2; i++) {
       ASSERT_OK(Put(1, "key1", "v1"));
       ASSERT_OK(Put(1, "key2", "v2"));
@@ -2020,9 +2876,11 @@ TEST(DBTest, IterPrevMaxSkip) {
   } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
 }
 
-TEST(DBTest, IterWithSnapshot) {
+TEST_F(DBTest, IterWithSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
     ASSERT_OK(Put(1, "key1", "val1"));
     ASSERT_OK(Put(1, "key2", "val2"));
     ASSERT_OK(Put(1, "key3", "val3"));
@@ -2064,13 +2922,13 @@ TEST(DBTest, IterWithSnapshot) {
   } while (ChangeOptions(kSkipHashCuckoo));
 }
 
-TEST(DBTest, Recover) {
+TEST_F(DBTest, Recover) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_OK(Put(1, "baz", "v5"));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("v1", Get(1, "foo"));
 
     ASSERT_EQ("v1", Get(1, "foo"));
@@ -2078,7 +2936,7 @@ TEST(DBTest, Recover) {
     ASSERT_OK(Put(1, "bar", "v2"));
     ASSERT_OK(Put(1, "foo", "v3"));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("v3", Get(1, "foo"));
     ASSERT_OK(Put(1, "foo", "v4"));
     ASSERT_EQ("v4", Get(1, "foo"));
@@ -2087,15 +2945,15 @@ TEST(DBTest, Recover) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, RecoverWithTableHandle) {
+TEST_F(DBTest, RecoverWithTableHandle) {
   do {
     Options options;
     options.create_if_missing = true;
     options.write_buffer_size = 100;
     options.disable_auto_compactions = true;
     options = CurrentOptions(options);
-    DestroyAndReopen(&options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_OK(Put(1, "bar", "v2"));
@@ -2104,7 +2962,7 @@ TEST(DBTest, RecoverWithTableHandle) {
     ASSERT_OK(Put(1, "bar", "v4"));
     ASSERT_OK(Flush(1));
     ASSERT_OK(Put(1, "big", std::string(100, 'a')));
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
 
     std::vector<std::vector<FileMetaData>> files;
     dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
@@ -2125,7 +2983,7 @@ TEST(DBTest, RecoverWithTableHandle) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, IgnoreRecoveredLog) {
+TEST_F(DBTest, IgnoreRecoveredLog) {
   std::string backup_logs = dbname_ + "/backup_logs";
 
   // delete old files in backup_logs directory
@@ -2143,7 +3001,7 @@ TEST(DBTest, IgnoreRecoveredLog) {
     options.create_if_missing = true;
     options.merge_operator = MergeOperators::CreateUInt64AddOperator();
     options.wal_dir = dbname_ + "/logs";
-    DestroyAndReopen(&options);
+    DestroyAndReopen(options);
 
     // fill up the DB
     std::string one, two;
@@ -2163,7 +3021,7 @@ TEST(DBTest, IgnoreRecoveredLog) {
     }
 
     // recover the DB
-    Reopen(&options);
+    Reopen(options);
     ASSERT_EQ(two, Get("foo"));
     ASSERT_EQ(one, Get("bar"));
     Close();
@@ -2177,12 +3035,12 @@ TEST(DBTest, IgnoreRecoveredLog) {
     // this should ignore the log files, recovery should not happen again
     // if the recovery happens, the same merge operator would be called twice,
     // leading to incorrect results
-    Reopen(&options);
+    Reopen(options);
     ASSERT_EQ(two, Get("foo"));
     ASSERT_EQ(one, Get("bar"));
     Close();
-    Destroy(&options);
-    Reopen(&options);
+    Destroy(options);
+    Reopen(options);
     Close();
 
     // copy the logs from backup back to wal dir
@@ -2194,12 +3052,12 @@ TEST(DBTest, IgnoreRecoveredLog) {
     }
     // assert that we successfully recovered only from logs, even though we
     // destroyed the DB
-    Reopen(&options);
+    Reopen(options);
     ASSERT_EQ(two, Get("foo"));
     ASSERT_EQ(one, Get("bar"));
 
     // Recovery will fail if DB directory doesn't exist.
-    Destroy(&options);
+    Destroy(options);
     // copy the logs from backup back to wal dir
     env_->CreateDirIfMissing(options.wal_dir);
     for (auto& log : logs) {
@@ -2209,37 +3067,37 @@ TEST(DBTest, IgnoreRecoveredLog) {
         env_->DeleteFile(backup_logs + "/" + log);
       }
     }
-    Status s = TryReopen(&options);
+    Status s = TryReopen(options);
     ASSERT_TRUE(!s.ok());
   } while (ChangeOptions(kSkipHashCuckoo));
 }
 
-TEST(DBTest, RollLog) {
+TEST_F(DBTest, RollLog) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_OK(Put(1, "baz", "v5"));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     for (int i = 0; i < 10; i++) {
-      ReopenWithColumnFamilies({"default", "pikachu"});
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     }
     ASSERT_OK(Put(1, "foo", "v4"));
     for (int i = 0; i < 10; i++) {
-      ReopenWithColumnFamilies({"default", "pikachu"});
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     }
   } while (ChangeOptions());
 }
 
-TEST(DBTest, WAL) {
+TEST_F(DBTest, WAL) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     WriteOptions writeOpt = WriteOptions();
     writeOpt.disableWAL = true;
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("v1", Get(1, "foo"));
     ASSERT_EQ("v1", Get(1, "bar"));
 
@@ -2248,7 +3106,7 @@ TEST(DBTest, WAL) {
     writeOpt.disableWAL = true;
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     // Both value's should be present.
     ASSERT_EQ("v2", Get(1, "bar"));
     ASSERT_EQ("v2", Get(1, "foo"));
@@ -2258,32 +3116,32 @@ TEST(DBTest, WAL) {
     writeOpt.disableWAL = false;
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     // again both values should be present.
     ASSERT_EQ("v3", Get(1, "foo"));
     ASSERT_EQ("v3", Get(1, "bar"));
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, CheckLock) {
+TEST_F(DBTest, CheckLock) {
   do {
     DB* localdb;
     Options options = CurrentOptions();
-    ASSERT_OK(TryReopen(&options));
+    ASSERT_OK(TryReopen(options));
 
     // second open should fail
     ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok());
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, FlushMultipleMemtable) {
+TEST_F(DBTest, FlushMultipleMemtable) {
   do {
     Options options = CurrentOptions();
     WriteOptions writeOpt = WriteOptions();
     writeOpt.disableWAL = true;
     options.max_write_buffer_number = 4;
     options.min_write_buffer_number_to_merge = 3;
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
     ASSERT_OK(Flush(1));
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
@@ -2294,7 +3152,7 @@ TEST(DBTest, FlushMultipleMemtable) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, NumImmutableMemTable) {
+TEST_F(DBTest, NumImmutableMemTable) {
   do {
     Options options = CurrentOptions();
     WriteOptions writeOpt = WriteOptions();
@@ -2302,11 +3160,12 @@ TEST(DBTest, NumImmutableMemTable) {
     options.max_write_buffer_number = 4;
     options.min_write_buffer_number_to_merge = 3;
     options.write_buffer_size = 1000000;
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     std::string big_value(1000000 * 2, 'x');
     std::string num;
     SetPerfLevel(kEnableTime);;
+    ASSERT_TRUE(GetPerfLevel() == kEnableTime);
 
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
     ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
@@ -2365,10 +3224,40 @@ TEST(DBTest, NumImmutableMemTable) {
     ASSERT_EQ(num, "0");
     ASSERT_TRUE(dbfull()->GetProperty(
         handles_[1], "rocksdb.cur-size-active-mem-table", &num));
-    // "208" is the size of the metadata of an empty skiplist, this would
+    // "200" is the size of the metadata of an empty skiplist, this would
     // break if we change the default skiplist implementation
-    ASSERT_EQ(num, "208");
+    ASSERT_EQ(num, "200");
+
+    uint64_t int_num;
+    uint64_t base_total_size;
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 2U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 3U);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 4U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &int_num));
+    ASSERT_EQ(int_num, base_total_size + 1);
+
     SetPerfLevel(kDisable);
+    ASSERT_TRUE(GetPerfLevel() == kDisable);
   } while (ChangeCompactOptions());
 }
 
@@ -2407,7 +3296,50 @@ class SleepingBackgroundTask {
   bool done_with_sleep_;
 };
 
-TEST(DBTest, GetProperty) {
+TEST_F(DBTest, FlushEmptyColumnFamily) {
+  // Block flush thread and disable compaction thread
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high,
+                 Env::Priority::HIGH);
+
+  Options options = CurrentOptions();
+  // disable compaction
+  options.disable_auto_compactions = true;
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.max_write_buffer_number = 2;
+  options.min_write_buffer_number_to_merge = 1;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Compaction can still go through even if no thread can flush the
+  // mem table.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+
+  // Insert can go through
+  ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
+  ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+  ASSERT_EQ("v1", Get(0, "foo"));
+  ASSERT_EQ("v1", Get(1, "bar"));
+
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+
+  // Flush can still go through.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, GetProperty) {
   // Set sizes to both background thread pool to be 1 and block them.
   env_->SetBackgroundThreads(1, Env::HIGH);
   env_->SetBackgroundThreads(1, Env::LOW);
@@ -2429,12 +3361,17 @@ TEST(DBTest, GetProperty) {
   options.max_write_buffer_number = 10;
   options.min_write_buffer_number_to_merge = 1;
   options.write_buffer_size = 1000000;
-  Reopen(&options);
+  Reopen(options);
 
   std::string big_value(1000000 * 2, 'x');
   std::string num;
+  uint64_t int_num;
   SetPerfLevel(kEnableTime);
 
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
   ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
   ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
   ASSERT_EQ(num, "0");
@@ -2442,11 +3379,14 @@ TEST(DBTest, GetProperty) {
   ASSERT_EQ(num, "0");
   ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
   ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "1");
   perf_context.Reset();
 
   ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
   ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
   ASSERT_EQ(num, "1");
+  ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
   ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
   ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
   ASSERT_EQ(num, "2");
@@ -2454,6 +3394,23 @@ TEST(DBTest, GetProperty) {
   ASSERT_EQ(num, "1");
   ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
   ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "2");
+  // Verify the same set of properties through GetIntProperty
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+  ASSERT_EQ(int_num, 2U);
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+  ASSERT_EQ(int_num, 1U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_EQ(int_num, 2U);
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
 
   sleeping_task_high.WakeUp();
   sleeping_task_high.WaitUntilDone();
@@ -2466,13 +3423,73 @@ TEST(DBTest, GetProperty) {
   ASSERT_EQ(num, "0");
   ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
   ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "4");
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
+
   sleeping_task_low.WakeUp();
   sleeping_task_low.WaitUntilDone();
+
+  dbfull()->TEST_WaitForFlushMemTable();
+  options.max_open_files = 10;
+  Reopen(options);
+  // After reopening, no table reader is loaded, so no memory for table readers
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  // After reading a key, at least one table reader is loaded.
+  Get("k5");
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  // Test rocksdb.num-live-versions
+  {
+    options.level0_file_num_compaction_trigger = 20;
+    Reopen(options);
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
+
+    // Use an iterator to hold current version
+    std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+    Flush();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    // Use an iterator to hold current version
+    std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+    Flush();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 3U);
+
+    iter2.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    iter1.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
+  }
 }
 
-TEST(DBTest, FLUSH) {
+TEST_F(DBTest, FLUSH) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     WriteOptions writeOpt = WriteOptions();
     writeOpt.disableWAL = true;
     SetPerfLevel(kEnableTime);;
@@ -2485,7 +3502,7 @@ TEST(DBTest, FLUSH) {
     Get(1, "foo");
     ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("v1", Get(1, "foo"));
     ASSERT_EQ("v1", Get(1, "bar"));
 
@@ -2494,7 +3511,7 @@ TEST(DBTest, FLUSH) {
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
     ASSERT_OK(Flush(1));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("v2", Get(1, "bar"));
     perf_context.Reset();
     ASSERT_EQ("v2", Get(1, "foo"));
@@ -2505,7 +3522,7 @@ TEST(DBTest, FLUSH) {
     ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
     ASSERT_OK(Flush(1));
 
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     // 'foo' should be there because its put
     // has WAL enabled.
     ASSERT_EQ("v3", Get(1, "foo"));
@@ -2515,28 +3532,28 @@ TEST(DBTest, FLUSH) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, RecoveryWithEmptyLog) {
+TEST_F(DBTest, RecoveryWithEmptyLog) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_OK(Put(1, "foo", "v2"));
-    ReopenWithColumnFamilies({"default", "pikachu"});
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v3"));
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("v3", Get(1, "foo"));
   } while (ChangeOptions());
 }
 
 // Check that writes done during a memtable compaction are recovered
 // if the database is shutdown during the memtable compaction.
-TEST(DBTest, RecoverDuringMemtableCompaction) {
+TEST_F(DBTest, RecoverDuringMemtableCompaction) {
   do {
     Options options;
     options.env = env_;
     options.write_buffer_size = 1000000;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Trigger a long memtable compaction and reopen the database during it
     ASSERT_OK(Put(1, "foo", "v1"));  // Goes to 1st log file
@@ -2544,7 +3561,7 @@ TEST(DBTest, RecoverDuringMemtableCompaction) {
     ASSERT_OK(Put(1, "big2", std::string(1000, 'y')));  // Triggers compaction
     ASSERT_OK(Put(1, "bar", "v2"));                     // Goes to new log file
 
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
     ASSERT_EQ("v1", Get(1, "foo"));
     ASSERT_EQ("v2", Get(1, "bar"));
     ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
@@ -2552,12 +3569,54 @@ TEST(DBTest, RecoverDuringMemtableCompaction) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, MinorCompactionsHappen) {
+// false positive TSAN report on shared_ptr --
+// https://groups.google.com/forum/#!topic/thread-sanitizer/vz_s-t226Vg
+#ifndef ROCKSDB_TSAN_RUN
+TEST_F(DBTest, FlushSchedule) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_number = 2;
+  options.write_buffer_size = 100 * 1000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<std::thread> threads;
+
+  std::atomic<int> thread_num(0);
+  // each column family will have 5 thread, each thread generating 2 memtables.
+  // each column family should end up with 10 table files
+  for (int i = 0; i < 10; ++i) {
+    threads.emplace_back([&]() {
+      int a = thread_num.fetch_add(1);
+      Random rnd(a);
+      WriteOptions wo;
+      // this should fill up 2 memtables
+      for (int k = 0; k < 5000; ++k) {
+        ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), ""));
+      }
+    });
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
+  auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+  ASSERT_LE(default_tables, static_cast<uint64_t>(10));
+  ASSERT_GT(default_tables, static_cast<uint64_t>(0));
+  ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
+  ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
+}
+#endif  // enabled only if not TSAN run
+
+TEST_F(DBTest, MinorCompactionsHappen) {
   do {
     Options options;
     options.write_buffer_size = 10000;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     const int N = 500;
 
@@ -2572,7 +3631,7 @@ TEST(DBTest, MinorCompactionsHappen) {
       ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
     }
 
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
 
     for (int i = 0; i < N; i++) {
       ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
@@ -2580,12 +3639,12 @@ TEST(DBTest, MinorCompactionsHappen) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, ManifestRollOver) {
+TEST_F(DBTest, ManifestRollOver) {
   do {
     Options options;
     options.max_manifest_file_size = 10 ;  // 10 bytes
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     {
       ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
       ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
@@ -2594,7 +3653,7 @@ TEST(DBTest, ManifestRollOver) {
       ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
       uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
       ASSERT_GT(manifest_after_flush, manifest_before_flush);
-      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
       ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
       // check if a new manifest file got inserted or not.
       ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
@@ -2604,13 +3663,13 @@ TEST(DBTest, ManifestRollOver) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, IdentityAcrossRestarts) {
+TEST_F(DBTest, IdentityAcrossRestarts) {
   do {
     std::string id1;
     ASSERT_OK(db_->GetDbIdentity(id1));
 
     Options options = CurrentOptions();
-    Reopen(&options);
+    Reopen(options);
     std::string id2;
     ASSERT_OK(db_->GetDbIdentity(id2));
     // id1 should match id2 because identity was not regenerated
@@ -2618,7 +3677,7 @@ TEST(DBTest, IdentityAcrossRestarts) {
 
     std::string idfilename = IdentityFileName(dbname_);
     ASSERT_OK(env_->DeleteFile(idfilename));
-    Reopen(&options);
+    Reopen(options);
     std::string id3;
     ASSERT_OK(db_->GetDbIdentity(id3));
     // id1 should NOT match id3 because identity was regenerated
@@ -2626,11 +3685,11 @@ TEST(DBTest, IdentityAcrossRestarts) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, RecoverWithLargeLog) {
+TEST_F(DBTest, RecoverWithLargeLog) {
   do {
     {
       Options options = CurrentOptions();
-      CreateAndReopenWithCF({"pikachu"}, &options);
+      CreateAndReopenWithCF({"pikachu"}, options);
       ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
       ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
       ASSERT_OK(Put(1, "small3", std::string(10, '3')));
@@ -2643,7 +3702,7 @@ TEST(DBTest, RecoverWithLargeLog) {
     Options options;
     options.write_buffer_size = 100000;
     options = CurrentOptions(options);
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
     ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
     ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
     ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
@@ -2653,11 +3712,11 @@ TEST(DBTest, RecoverWithLargeLog) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, CompactionsGenerateMultipleFiles) {
+TEST_F(DBTest, CompactionsGenerateMultipleFiles) {
   Options options;
   options.write_buffer_size = 100000000;        // Large write buffer
   options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   Random rnd(301);
 
@@ -2670,7 +3729,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
   }
 
   // Reopening moves updates to level-0
-  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
   dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
 
   ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
@@ -2680,14 +3739,14 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
   }
 }
 
-TEST(DBTest, CompactionTrigger) {
+TEST_F(DBTest, CompactionTrigger) {
   Options options;
   options.write_buffer_size = 100<<10; //100KB
   options.num_levels = 3;
   options.max_mem_compaction_level = 0;
   options.level0_file_num_compaction_trigger = 3;
   options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   Random rnd(301);
 
@@ -2715,38 +3774,184 @@ TEST(DBTest, CompactionTrigger) {
   ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
 }
 
-// This is a static filter used for filtering
-// kvs during the compaction process.
-static int cfilter_count;
-static std::string NEW_VALUE = "NewValue";
+namespace {
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
+Options DeletionTriggerOptions() {
+  Options options;
+  options.compression = kNoCompression;
+  options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+  options.min_write_buffer_number_to_merge = 1;
+  options.num_levels = kCDTNumLevels;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 1;
+  options.target_file_size_base = options.write_buffer_size * 2;
+  options.target_file_size_multiplier = 2;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * options.target_file_size_multiplier;
+  options.max_bytes_for_level_multiplier = 2;
+  options.disable_auto_compactions = false;
+  return options;
+}
+}  // anonymous namespace
 
-class KeepFilter : public CompactionFilter {
- public:
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value, bool* value_changed) const
-      override {
-    cfilter_count++;
-    return false;
-  }
+TEST_F(DBTest, CompactionDeletionTrigger) {
+  for (int tid = 0; tid < 2; ++tid) {
+    uint64_t db_size[2];
+    Options options = CurrentOptions(DeletionTriggerOptions());
 
-  virtual const char* Name() const override { return "KeepFilter"; }
-};
+    if (tid == 1) {
+      // second pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
 
-class DeleteFilter : public CompactionFilter {
- public:
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value, bool* value_changed) const
-      override {
-    cfilter_count++;
-    return true;
-  }
+    DestroyAndReopen(options);
+    Random rnd(301);
 
-  virtual const char* Name() const override { return "DeleteFilter"; }
-};
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
 
-class ChangeFilter : public CompactionFilter {
- public:
-  explicit ChangeFilter() {}
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+    // must have much smaller db size.
+    ASSERT_GT(db_size[0] / 3, db_size[1]);
+  }
+}
+
+TEST_F(DBTest, CompactionDeletionTriggerReopen) {
+  for (int tid = 0; tid < 2; ++tid) {
+    uint64_t db_size[3];
+    Options options = CurrentOptions(DeletionTriggerOptions());
+
+    if (tid == 1) {
+      // second pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
+
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+    // as auto_compaction is off, we shouldn't see too much reduce
+    // in db size.
+    ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still work.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    // insert relatively small amount of data to trigger auto compaction.
+    for (int k = 0; k < kTestSize / 10; ++k) {
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[2] = Size(Key(0), Key(kTestSize - 1));
+    // this time we're expecting significant drop in size.
+    ASSERT_GT(db_size[0] / 3, db_size[2]);
+  }
+}
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static int cfilter_count;
+static std::string NEW_VALUE = "NewValue";
+
+class KeepFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return false;
+  }
+
+  virtual const char* Name() const override { return "KeepFilter"; }
+};
+
+class DeleteFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return true;
+  }
+
+  virtual const char* Name() const override { return "DeleteFilter"; }
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+  explicit DelayFilter(DBTest* d) : db_test(d) {}
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value,
+                      bool* value_changed) const override {
+    db_test->env_->addon_time_ += 1000;
+    return true;
+  }
+
+  virtual const char* Name() const override { return "DelayFilter"; }
+
+ private:
+  DBTest* db_test;
+};
+
+class ConditionalFilter : public CompactionFilter {
+ public:
+  explicit ConditionalFilter(const std::string* filtered_value)
+      : filtered_value_(filtered_value) {}
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value,
+                      bool* value_changed) const override {
+    return value.ToString() == *filtered_value_;
+  }
+
+  virtual const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+  const std::string* filtered_value_;
+};
+
+class ChangeFilter : public CompactionFilter {
+ public:
+  explicit ChangeFilter() {}
 
   virtual bool Filter(int level, const Slice& key, const Slice& value,
                       std::string* new_value, bool* value_changed) const
@@ -2768,8 +3973,8 @@ class KeepFilterFactory : public CompactionFilterFactory {
   virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) override {
     if (check_context_) {
-      ASSERT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
-      ASSERT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
     }
     return std::unique_ptr<CompactionFilter>(new KeepFilter());
   }
@@ -2794,6 +3999,39 @@ class DeleteFilterFactory : public CompactionFilterFactory {
   virtual const char* Name() const override { return "DeleteFilterFactory"; }
 };
 
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit DelayFilterFactory(DBTest* d) : db_test(d) {}
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+  }
+
+  virtual const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+  DBTest* db_test;
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ConditionalFilterFactory(const Slice& filtered_value)
+      : filtered_value_(filtered_value.ToString()) {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(
+        new ConditionalFilter(&filtered_value_));
+  }
+
+  virtual const char* Name() const override {
+    return "ConditionalFilterFactory";
+  }
+
+ private:
+  std::string filtered_value_;
+};
+
 class ChangeFilterFactory : public CompactionFilterFactory {
  public:
   explicit ChangeFilterFactory() {}
@@ -2806,14 +4044,26 @@ class ChangeFilterFactory : public CompactionFilterFactory {
   virtual const char* Name() const override { return "ChangeFilterFactory"; }
 };
 
+class DBTestUniversalCompactionBase
+    : public DBTest,
+      public ::testing::WithParamInterface<int> {
+ public:
+  virtual void SetUp() override { num_levels_ = GetParam(); }
+  int num_levels_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {};
+
 // TODO(kailiu) The tests on UniversalCompaction has some issues:
 //  1. A lot of magic numbers ("11" or "12").
-//  2. Made assumption on the memtable flush conidtions, which may change from
+//  2. Made assumption on the memtable flush conditions, which may change from
 //     time to time.
-TEST(DBTest, UniversalCompactionTrigger) {
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
   Options options;
   options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
   // trigger compaction if there are >= 4 files
   options.level0_file_num_compaction_trigger = 4;
   KeepFilterFactory* filter = new KeepFilterFactory(true);
@@ -2821,7 +4071,18 @@ TEST(DBTest, UniversalCompactionTrigger) {
   options.compaction_filter_factory.reset(filter);
 
   options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        if (num_levels_ > 3) {
+          ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
   Random rnd(301);
   int key_idx = 0;
@@ -2838,7 +4099,7 @@ TEST(DBTest, UniversalCompactionTrigger) {
       key_idx++;
     }
     dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
   }
 
   // Generate one more file at level-0, which should trigger level-0
@@ -2851,10 +4112,7 @@ TEST(DBTest, UniversalCompactionTrigger) {
   // Suppose each file flushed from mem table has size 1. Now we compact
   // (level0_file_num_compaction_trigger+1)=4 files and should have a big
   // file of size 4.
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
-  for (int i = 1; i < options.num_levels ; i++) {
-    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
-  }
+  ASSERT_EQ(NumSortedRuns(1), 1);
 
   // Stage 2:
   //   Now we have one file at level 0, with size 4. We also have some data in
@@ -2873,7 +4131,7 @@ TEST(DBTest, UniversalCompactionTrigger) {
       key_idx++;
     }
     dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
   }
 
   // Generate one more file at level-0, which should trigger level-0
@@ -2884,11 +4142,8 @@ TEST(DBTest, UniversalCompactionTrigger) {
   }
   dbfull()->TEST_WaitForCompact();
   // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
-  // After comapction, we should have 2 files, with size 4, 2.4.
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2);
-  for (int i = 1; i < options.num_levels ; i++) {
-    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
-  }
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumSortedRuns(1), 2);
 
   // Stage 3:
   //   Now we have 2 files at level 0, with size 4 and 2.4. Continue
@@ -2901,7 +4156,7 @@ TEST(DBTest, UniversalCompactionTrigger) {
       key_idx++;
     }
     dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
   }
 
   // Generate one more file at level-0, which should trigger level-0
@@ -2912,11 +4167,8 @@ TEST(DBTest, UniversalCompactionTrigger) {
   }
   dbfull()->TEST_WaitForCompact();
   // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
-  // After comapction, we should have 3 files, with size 4, 2.4, 2.
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
-  for (int i = 1; i < options.num_levels ; i++) {
-    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
-  }
+  // After compaction, we should have 3 files, with size 4, 2.4, 2.
+  ASSERT_EQ(NumSortedRuns(1), 3);
 
   // Stage 4:
   //   Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
@@ -2927,10 +4179,7 @@ TEST(DBTest, UniversalCompactionTrigger) {
   }
   dbfull()->TEST_WaitForCompact();
   // Level-0 compaction is triggered, but no file will be picked up.
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 4);
-  for (int i = 1; i < options.num_levels ; i++) {
-    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
-  }
+  ASSERT_EQ(NumSortedRuns(1), 4);
 
   // Stage 5:
   //   Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
@@ -2942,23 +4191,26 @@ TEST(DBTest, UniversalCompactionTrigger) {
   }
   dbfull()->TEST_WaitForCompact();
   // All files at level 0 will be compacted into a single one.
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
-  for (int i = 1; i < options.num_levels ; i++) {
-    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
-  }
+  ASSERT_EQ(NumSortedRuns(1), 1);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST(DBTest, UniversalCompactionSizeAmplification) {
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
   Options options;
   options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
   options.level0_file_num_compaction_trigger = 3;
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   // Trigger compaction if size amplification exceeds 110%
   options.compaction_options_universal.max_size_amplification_percent = 110;
   options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
 
   Random rnd(301);
   int key_idx = 0;
@@ -2972,9 +4224,9 @@ TEST(DBTest, UniversalCompactionSizeAmplification) {
       key_idx++;
     }
     dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
   }
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2);
+  ASSERT_EQ(NumSortedRuns(1), 2);
 
   // Flush whatever is remaining in memtable. This is typically
   // small, which should not trigger size ratio based compaction
@@ -2984,18 +4236,124 @@ TEST(DBTest, UniversalCompactionSizeAmplification) {
   dbfull()->TEST_WaitForCompact();
 
   // Verify that size amplification did occur
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+  ASSERT_EQ(NumSortedRuns(1), 1);
 }
 
-TEST(DBTest, UniversalCompactionOptions) {
+class DBTestUniversalCompactionMultiLevels
+    : public DBTestUniversalCompactionBase {};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
   Options options;
   options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 8;
+  options.max_background_compactions = 3;
+  options.target_file_size_base = 32 * 1024;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 100000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+
+  dbfull()->TEST_WaitForCompact();
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionMultiLevels,
+                        DBTestUniversalCompactionMultiLevels,
+                        ::testing::Values(3, 20));
+
+class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase {
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 1 << 10;  // 1KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+  options.max_background_flushes = 3;
+  options.target_file_size_base = 1 * 1024;
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Delay every compaction so multiple compactions will happen.
+  std::atomic<int> num_compactions_running(0);
+  std::atomic<bool> has_parallel(false);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Start",
+                                                 [&](void* arg) {
+    if (num_compactions_running.fetch_add(1) > 0) {
+      has_parallel.store(true);
+      return;
+    }
+    for (int nwait = 0; nwait < 20000; nwait++) {
+      if (has_parallel.load() || num_compactions_running.load() > 1) {
+        has_parallel.store(true);
+        break;
+      }
+      env_->SleepForMicroseconds(1000);
+    }
+  });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():End",
+      [&](void* arg) { num_compactions_running.fetch_add(-1); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 30000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(num_compactions_running.load(), 0);
+  ASSERT_TRUE(has_parallel.load());
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+
+  // Reopen and check.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionParallel,
+                        DBTestUniversalCompactionParallel,
+                        ::testing::Values(1, 10));
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
   options.level0_file_num_compaction_trigger = 4;
-  options.num_levels = 1;
+  options.num_levels = num_levels_;
   options.compaction_options_universal.compression_size_percent = -1;
   options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   Random rnd(301);
   int key_idx = 0;
@@ -3009,27 +4367,26 @@ TEST(DBTest, UniversalCompactionOptions) {
     dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
 
     if (num < options.level0_file_num_compaction_trigger - 1) {
-      ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+      ASSERT_EQ(NumSortedRuns(1), num + 1);
     }
   }
 
   dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
-  for (int i = 1; i < options.num_levels ; i++) {
-    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
-  }
+  ASSERT_EQ(NumSortedRuns(1), 1);
 }
 
-TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100<<10; //100KB
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
   // trigger compaction if there are >= 4 files
   options.level0_file_num_compaction_trigger = 4;
   options.compaction_options_universal.size_ratio = 10;
-  options.compaction_options_universal.stop_style = kCompactionStopStyleSimilarSize;
-  options.num_levels=1;
-  Reopen(&options);
+  options.compaction_options_universal.stop_style =
+      kCompactionStopStyleSimilarSize;
+  options.num_levels = num_levels_;
+  DestroyAndReopen(options);
 
   Random rnd(301);
   int key_idx = 0;
@@ -3037,8 +4394,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
   // Stage 1:
   //   Generate a set of files at level 0, but don't trigger level-0
   //   compaction.
-  for (int num = 0;
-       num < options.level0_file_num_compaction_trigger-1;
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
        num++) {
     // Write 110KB (11 values, each 10K)
     for (int i = 0; i < 11; i++) {
@@ -3046,7 +4402,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
       key_idx++;
     }
     dbfull()->TEST_WaitForFlushMemTable();
-    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+    ASSERT_EQ(NumSortedRuns(), num + 1);
   }
 
   // Generate one more file at level-0, which should trigger level-0
@@ -3059,7 +4415,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
   // Suppose each file flushed from mem table has size 1. Now we compact
   // (level0_file_num_compaction_trigger+1)=4 files and should have a big
   // file of size 4.
-  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_EQ(NumSortedRuns(), 1);
 
   // Stage 2:
   //   Now we have one file at level 0, with size 4. We also have some data in
@@ -3069,8 +4425,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
   //   a level-0 file, with size around 0.4 (according to previously written
   //   data amount).
   dbfull()->Flush(FlushOptions());
-  for (int num = 0;
-       num < options.level0_file_num_compaction_trigger-3;
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
        num++) {
     // Write 110KB (11 values, each 10K)
     for (int i = 0; i < 11; i++) {
@@ -3078,7 +4433,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
       key_idx++;
     }
     dbfull()->TEST_WaitForFlushMemTable();
-    ASSERT_EQ(NumTableFilesAtLevel(0), num + 3);
+    ASSERT_EQ(NumSortedRuns(), num + 3);
   }
 
   // Generate one more file at level-0, which should trigger level-0
@@ -3090,7 +4445,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
   dbfull()->TEST_WaitForCompact();
   // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
   // After compaction, we should have 3 files, with size 4, 0.4, 2.
-  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  ASSERT_EQ(NumSortedRuns(), 3);
   // Stage 3:
   //   Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
   //   more file at level-0, which should trigger level-0 compaction.
@@ -3100,11 +4455,13 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
   }
   dbfull()->TEST_WaitForCompact();
   // Level-0 compaction is triggered, but no file will be picked up.
-  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+  ASSERT_EQ(NumSortedRuns(), 4);
 }
 
-#if defined(SNAPPY)
-TEST(DBTest, CompressedCache) {
+TEST_F(DBTest, CompressedCache) {
+  if (!Snappy_Supported()) {
+    return;
+  }
   int num_iter = 80;
 
   // Run this test three iterations.
@@ -3114,45 +4471,54 @@ TEST(DBTest, CompressedCache) {
   // Iteration 4: both block cache and compressed cache, but DB is not
   // compressed
   for (int iter = 0; iter < 4; iter++) {
-    Options options = CurrentOptions();
+    Options options;
     options.write_buffer_size = 64*1024;        // small write buffer
     options.statistics = rocksdb::CreateDBStatistics();
+    options = CurrentOptions(options);
 
+    BlockBasedTableOptions table_options;
     switch (iter) {
       case 0:
         // only uncompressed block cache
-        options.block_cache = NewLRUCache(8*1024);
-        options.block_cache_compressed = nullptr;
+        table_options.block_cache = NewLRUCache(8*1024);
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
         break;
       case 1:
         // no block cache, only compressed cache
-        options.no_block_cache = true;
-        options.block_cache = nullptr;
-        options.block_cache_compressed = NewLRUCache(8*1024);
+        table_options.no_block_cache = true;
+        table_options.block_cache = nullptr;
+        table_options.block_cache_compressed = NewLRUCache(8*1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
         break;
       case 2:
         // both compressed and uncompressed block cache
-        options.block_cache = NewLRUCache(1024);
-        options.block_cache_compressed = NewLRUCache(8*1024);
+        table_options.block_cache = NewLRUCache(1024);
+        table_options.block_cache_compressed = NewLRUCache(8*1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
         break;
       case 3:
         // both block cache and compressed cache, but DB is not compressed
         // also, make block cache sizes bigger, to trigger block cache hits
-        options.block_cache = NewLRUCache(1024 * 1024);
-        options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        table_options.block_cache = NewLRUCache(1024 * 1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
         options.compression = kNoCompression;
         break;
       default:
         ASSERT_TRUE(false);
     }
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     // default column family doesn't have block cache
     Options no_block_cache_opts;
-    no_block_cache_opts.no_block_cache = true;
     no_block_cache_opts.statistics = options.statistics;
-    options = CurrentOptions(options);
+    no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+    BlockBasedTableOptions table_options_no_bc;
+    table_options_no_bc.no_block_cache = true;
+    no_block_cache_opts.table_factory.reset(
+        NewBlockBasedTableFactory(table_options_no_bc));
     ReopenWithColumnFamilies({"default", "pikachu"},
-                             {&no_block_cache_opts, &options});
+        std::vector<Options>({no_block_cache_opts, options}));
 
     Random rnd(301);
 
@@ -3206,7 +4572,7 @@ TEST(DBTest, CompressedCache) {
     }
 
     options.create_if_missing = true;
-    DestroyAndReopen(&options);
+    DestroyAndReopen(options);
   }
 }
 
@@ -3216,15 +4582,20 @@ static std::string CompressibleString(Random* rnd, int len) {
   return r;
 }
 
-TEST(DBTest, UniversalCompactionCompressRatio1) {
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
   Options options;
   options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100<<10; //100KB
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
   options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 1;
+  options.num_levels = num_levels_;
   options.compaction_options_universal.compression_size_percent = 70;
   options = CurrentOptions(options);
-  Reopen(&options);
+  DestroyAndReopen(options);
 
   Random rnd(301);
   int key_idx = 0;
@@ -3239,7 +4610,7 @@ TEST(DBTest, UniversalCompactionCompressRatio1) {
     dbfull()->TEST_WaitForFlushMemTable();
     dbfull()->TEST_WaitForCompact();
   }
-  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 2 * 0.9);
+  ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
 
   // The second compaction (4) is compressed
   for (int num = 0; num < 2; num++) {
@@ -3251,7 +4622,7 @@ TEST(DBTest, UniversalCompactionCompressRatio1) {
     dbfull()->TEST_WaitForFlushMemTable();
     dbfull()->TEST_WaitForCompact();
   }
-  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 4 * 0.9);
+  ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
 
   // The third compaction (2 4) is compressed since this time it is
   // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
@@ -3264,7 +4635,7 @@ TEST(DBTest, UniversalCompactionCompressRatio1) {
     dbfull()->TEST_WaitForFlushMemTable();
     dbfull()->TEST_WaitForCompact();
   }
-  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 6 * 0.9);
+  ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
 
   // When we start for the compaction up to (2 4 8), the latest
   // compressed is not compressed.
@@ -3277,19 +4648,22 @@ TEST(DBTest, UniversalCompactionCompressRatio1) {
     dbfull()->TEST_WaitForFlushMemTable();
     dbfull()->TEST_WaitForCompact();
   }
-  ASSERT_GT((int)dbfull()->TEST_GetLevel0TotalSize(),
-            110000 * 11 * 0.8 + 110000 * 2);
+  ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
 }
 
-TEST(DBTest, UniversalCompactionCompressRatio2) {
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
+  if (!Snappy_Supported()) {
+    return;
+  }
   Options options;
   options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100<<10; //100KB
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
   options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 1;
+  options.num_levels = num_levels_;
   options.compaction_options_universal.compression_size_percent = 95;
   options = CurrentOptions(options);
-  Reopen(&options);
+  DestroyAndReopen(options);
 
   Random rnd(301);
   int key_idx = 0;
@@ -3305,81 +4679,555 @@ TEST(DBTest, UniversalCompactionCompressRatio2) {
     dbfull()->TEST_WaitForFlushMemTable();
     dbfull()->TEST_WaitForCompact();
   }
-  ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(),
-            120000 * 12 * 0.8 + 120000 * 2);
+  ASSERT_LT(TotalSize(), 120000U * 12 * 0.8 + 120000 * 2);
 }
-#endif
 
-TEST(DBTest, ConvertCompactionStyle) {
-  Random rnd(301);
-  int max_key_level_insert = 200;
-  int max_key_universal_insert = 600;
+INSTANTIATE_TEST_CASE_P(UniversalCompactionNumLevels, DBTestUniversalCompaction,
+                        ::testing::Values(1, 3, 5));
 
-  // Stage 1: generate a db with level compaction
+TEST_F(DBTest, FailMoreDbPaths) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 10000000);
+  options.db_paths.emplace_back(dbname_ + "_2", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_3", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_4", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_5", 1000000);
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, UniversalCompactionSecondPathRatio) {
+  if (!Snappy_Supported()) {
+    return;
+  }
   Options options;
-  options.write_buffer_size = 100<<10; //100KB
-  options.num_levels = 4;
-  options.level0_file_num_compaction_trigger = 3;
-  options.max_bytes_for_level_base = 500<<10; // 500KB
-  options.max_bytes_for_level_multiplier = 1;
-  options.target_file_size_base = 200<<10; // 200KB
-  options.target_file_size_multiplier = 1;
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
   options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
 
-  for (int i = 0; i <= max_key_level_insert; i++) {
-    // each value is 10K
-    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
   }
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
 
-  ASSERT_GT(TotalTableFiles(1, 4), 1);
-  int non_level0_num_files = 0;
-  for (int i = 1; i < options.num_levels; i++) {
-    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
   }
-  ASSERT_GT(non_level0_num_files, 0);
 
-  // Stage 2: reopen with universal compaction - should fail
-  options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options = CurrentOptions(options);
-  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options);
-  ASSERT_TRUE(s.IsInvalidArgument());
+  // Another 110KB triggers a compaction to 400K file to second path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
 
-  // Stage 3: compact into a single file and move the file to level 0
-  options = CurrentOptions();
-  options.disable_auto_compactions = true;
-  options.target_file_size_base = INT_MAX;
-  options.target_file_size_multiplier = 1;
-  options.max_bytes_for_level_base = INT_MAX;
-  options.max_bytes_for_level_multiplier = 1;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
 
-  dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */,
-                         0 /* reduce to level 0 */);
+  // (1,1,4) -> (2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
 
-  for (int i = 0; i < options.num_levels; i++) {
-    int num = NumTableFilesAtLevel(i, 1);
-    if (i == 0) {
-      ASSERT_EQ(num, 1);
-    } else {
-      ASSERT_EQ(num, 0);
-    }
-  }
+  // (1, 2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(2, GetSstFileCount(dbname_));
 
-  // Stage 4: re-open in universal compaction style and do some db operations
-  options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100<<10; //100KB
-  options.level0_file_num_compaction_trigger = 3;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  // (1, 1, 2, 4) -> (8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
 
-  for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
-    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  // (1, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 8) -> (2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(2, GetSstFileCount(dbname_));
+
+  // (1, 1, 2, 8) -> (4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Destroy(options);
+}
+
+TEST_F(DBTest, LevelCompactionThirdPath) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  //  options = CurrentOptions(options);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to fill up first path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+  ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+  ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+  ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+  ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Destroy(options);
+}
+
+TEST_F(DBTest, LevelCompactionPathUse) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  //  options = CurrentOptions(options);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Always gets compacted into 1 Level1 file,
+  // 0/1 Level 0 file
+  for (int num = 0; num < 3; num++) {
+    key_idx = 0;
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Destroy(options);
+}
+
+TEST_F(DBTest, UniversalCompactionFourPaths) {
+  Options options;
+  options.db_paths.emplace_back(dbname_, 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to second path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1,1,4) -> (2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 2, 4) -> (8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+  // (1, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 8) -> (2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 2, 8) -> (4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Destroy(options);
+}
+
+void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) {
+  uint64_t cf_size = 0;
+  uint64_t cf_csize = 0;
+  size_t file_count = 0;
+  for (auto level_meta : cf_meta.levels) {
+    uint64_t level_size = 0;
+    uint64_t level_csize = 0;
+    file_count += level_meta.files.size();
+    for (auto file_meta : level_meta.files) {
+      level_size += file_meta.size;
+    }
+    ASSERT_EQ(level_meta.size, level_size);
+    cf_size += level_size;
+    cf_csize += level_csize;
+  }
+  ASSERT_EQ(cf_meta.file_count, file_count);
+  ASSERT_EQ(cf_meta.size, cf_size);
+}
+
+TEST_F(DBTest, ColumnFamilyMetaDataTest) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+  ColumnFamilyMetaData cf_meta;
+  for (int i = 0; i < 100; ++i) {
+    GenerateNewFile(&rnd, &key_index);
+    db_->GetColumnFamilyMetaData(&cf_meta);
+    CheckColumnFamilyMeta(cf_meta);
+  }
+}
+
+TEST_F(DBTest, ConvertCompactionStyle) {
+  Random rnd(301);
+  int max_key_level_insert = 200;
+  int max_key_universal_insert = 600;
+
+  // Stage 1: generate a db with level compaction
+  Options options;
+  options.write_buffer_size = 100<<10; //100KB
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_bytes_for_level_base = 500<<10; // 500KB
+  options.max_bytes_for_level_multiplier = 1;
+  options.target_file_size_base = 200<<10; // 200KB
+  options.target_file_size_multiplier = 1;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key_level_insert; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(TotalTableFiles(1, 4), 1);
+  int non_level0_num_files = 0;
+  for (int i = 1; i < options.num_levels; i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  }
+  ASSERT_GT(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction - should fail
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Stage 3: compact into a single file and move the file to level 0
+  options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = INT_MAX;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = INT_MAX;
+  options.max_bytes_for_level_multiplier = 1;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */,
+                         0 /* reduce to level 0 */);
+
+  for (int i = 0; i < options.num_levels; i++) {
+    int num = NumTableFilesAtLevel(i, 1);
+    if (i == 0) {
+      ASSERT_EQ(num, 1);
+    } else {
+      ASSERT_EQ(num, 0);
+    }
+  }
+
+  // Stage 4: re-open in universal compaction style and do some db operations
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 4;
+  options.write_buffer_size = 100<<10; //100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  options.num_levels = 1;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
   }
   dbfull()->Flush(FlushOptions());
   ASSERT_OK(Flush(1));
@@ -3408,6 +5256,97 @@ TEST(DBTest, ConvertCompactionStyle) {
   ASSERT_EQ(keys_in_db, expected_keys);
 }
 
+TEST_F(DBTest, IncreaseUniversalCompactionNumLevels) {
+  std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+    std::string keys_in_db;
+    Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      keys_in_db.append(iter->key().ToString());
+      keys_in_db.push_back(',');
+    }
+    delete iter;
+
+    std::string expected_keys;
+    for (int i = 0; i <= num_keys_in_db; i++) {
+      expected_keys.append(Key(i));
+      expected_keys.push_back(',');
+    }
+
+    ASSERT_EQ(keys_in_db, expected_keys);
+  };
+
+  Random rnd(301);
+  int max_key1 = 200;
+  int max_key2 = 600;
+  int max_key3 = 800;
+
+  // Stage 1: open a DB with universal compaction, num_levels=1
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key1; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  int non_level0_num_files = 0;
+  for (int i = 1; i < options.num_levels; i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  }
+  ASSERT_EQ(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction, num_levels=4
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  verify_func(max_key1);
+
+  // Insert more keys
+  for (int i = max_key1 + 1; i <= max_key2; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  verify_func(max_key2);
+  // Compaction to non-L0 has happened.
+  ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
+  // Stage 3: Revert it back to one level and revert to num_levels=1.
+  options.num_levels = 4;
+  options.target_file_size_base = INT_MAX;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Compact all to level 0
+  dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */,
+                         0 /* reduce to level 0 */);
+  // Need to restart it once to remove higher level records in manifest.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Final reopen
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Insert more keys
+  for (int i = max_key2 + 1; i <= max_key3; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  verify_func(max_key3);
+}
+
 namespace {
 void MinLevelHelper(DBTest* self, Options& options) {
   Random rnd(301);
@@ -3448,25 +5387,18 @@ bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
   options.level0_file_num_compaction_trigger = 3;
   options.create_if_missing = true;
 
-  if (SnappyCompressionSupported(CompressionOptions(wbits, lev, strategy))) {
+  if (Snappy_Supported()) {
     type = kSnappyCompression;
     fprintf(stderr, "using snappy\n");
-  } else if (ZlibCompressionSupported(
-               CompressionOptions(wbits, lev, strategy))) {
+  } else if (Zlib_Supported()) {
     type = kZlibCompression;
     fprintf(stderr, "using zlib\n");
-  } else if (BZip2CompressionSupported(
-               CompressionOptions(wbits, lev, strategy))) {
+  } else if (BZip2_Supported()) {
     type = kBZip2Compression;
     fprintf(stderr, "using bzip2\n");
-  } else if (LZ4CompressionSupported(
-                 CompressionOptions(wbits, lev, strategy))) {
+  } else if (LZ4_Supported()) {
     type = kLZ4Compression;
     fprintf(stderr, "using lz4\n");
-  } else if (LZ4HCCompressionSupported(
-                 CompressionOptions(wbits, lev, strategy))) {
-    type = kLZ4HCCompression;
-    fprintf(stderr, "using lz4hc\n");
   } else {
     fprintf(stderr, "skipping test, compression disabled\n");
     return false;
@@ -3484,13 +5416,13 @@ bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
 }
 }  // namespace
 
-TEST(DBTest, MinLevelToCompress1) {
+TEST_F(DBTest, MinLevelToCompress1) {
   Options options = CurrentOptions();
-  CompressionType type;
+  CompressionType type = kSnappyCompression;
   if (!MinLevelToCompress(type, options, -14, -1, 0)) {
     return;
   }
-  Reopen(&options);
+  Reopen(options);
   MinLevelHelper(this, options);
 
   // do not compress L0 and L1
@@ -3500,17 +5432,17 @@ TEST(DBTest, MinLevelToCompress1) {
   for (int i = 2; i < options.num_levels; i++) {
     options.compression_per_level[i] = type;
   }
-  DestroyAndReopen(&options);
+  DestroyAndReopen(options);
   MinLevelHelper(this, options);
 }
 
-TEST(DBTest, MinLevelToCompress2) {
+TEST_F(DBTest, MinLevelToCompress2) {
   Options options = CurrentOptions();
-  CompressionType type;
+  CompressionType type = kSnappyCompression;
   if (!MinLevelToCompress(type, options, 15, -1, 0)) {
     return;
   }
-  Reopen(&options);
+  Reopen(options);
   MinLevelHelper(this, options);
 
   // do not compress L0 and L1
@@ -3520,17 +5452,17 @@ TEST(DBTest, MinLevelToCompress2) {
   for (int i = 2; i < options.num_levels; i++) {
     options.compression_per_level[i] = type;
   }
-  DestroyAndReopen(&options);
+  DestroyAndReopen(options);
   MinLevelHelper(this, options);
 }
 
-TEST(DBTest, RepeatedWritesToSameKey) {
+TEST_F(DBTest, RepeatedWritesToSameKey) {
   do {
     Options options;
     options.env = env_;
     options.write_buffer_size = 100000;  // Small write buffer
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // We must have at most one file per level except for level-0,
     // which may have up to kL0_StopWritesTrigger files.
@@ -3538,7 +5470,8 @@ TEST(DBTest, RepeatedWritesToSameKey) {
         options.num_levels + options.level0_stop_writes_trigger;
 
     Random rnd(301);
-    std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
+    std::string value =
+        RandomString(&rnd, static_cast<int>(2 * options.write_buffer_size));
     for (int i = 0; i < 5 * kMaxFiles; i++) {
       ASSERT_OK(Put(1, "key", value));
       ASSERT_LE(TotalTableFiles(1), kMaxFiles);
@@ -3546,7 +5479,7 @@ TEST(DBTest, RepeatedWritesToSameKey) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, InPlaceUpdate) {
+TEST_F(DBTest, InPlaceUpdate) {
   do {
     Options options;
     options.create_if_missing = true;
@@ -3554,7 +5487,7 @@ TEST(DBTest, InPlaceUpdate) {
     options.env = env_;
     options.write_buffer_size = 100000;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Update key with values of smaller size
     int numValues = 10;
@@ -3570,7 +5503,7 @@ TEST(DBTest, InPlaceUpdate) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, InPlaceUpdateLargeNewValue) {
+TEST_F(DBTest, InPlaceUpdateLargeNewValue) {
   do {
     Options options;
     options.create_if_missing = true;
@@ -3578,7 +5511,7 @@ TEST(DBTest, InPlaceUpdateLargeNewValue) {
     options.env = env_;
     options.write_buffer_size = 100000;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Update key with values of larger size
     int numValues = 10;
@@ -3594,8 +5527,7 @@ TEST(DBTest, InPlaceUpdateLargeNewValue) {
   } while (ChangeCompactOptions());
 }
 
-
-TEST(DBTest, InPlaceUpdateCallbackSmallerSize) {
+TEST_F(DBTest, InPlaceUpdateCallbackSmallerSize) {
   do {
     Options options;
     options.create_if_missing = true;
@@ -3606,7 +5538,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerSize) {
     options.inplace_callback =
       rocksdb::DBTest::updateInPlaceSmallerSize;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Update key with values of smaller size
     int numValues = 10;
@@ -3624,7 +5556,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerSize) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
+TEST_F(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
   do {
     Options options;
     options.create_if_missing = true;
@@ -3635,7 +5567,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
     options.inplace_callback =
       rocksdb::DBTest::updateInPlaceSmallerVarintSize;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Update key with values of smaller varint size
     int numValues = 265;
@@ -3653,7 +5585,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) {
+TEST_F(DBTest, InPlaceUpdateCallbackLargeNewValue) {
   do {
     Options options;
     options.create_if_missing = true;
@@ -3664,7 +5596,7 @@ TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) {
     options.inplace_callback =
       rocksdb::DBTest::updateInPlaceLargerSize;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Update key with values of larger size
     int numValues = 10;
@@ -3680,7 +5612,7 @@ TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, InPlaceUpdateCallbackNoAction) {
+TEST_F(DBTest, InPlaceUpdateCallbackNoAction) {
   do {
     Options options;
     options.create_if_missing = true;
@@ -3691,7 +5623,7 @@ TEST(DBTest, InPlaceUpdateCallbackNoAction) {
     options.inplace_callback =
       rocksdb::DBTest::updateInPlaceNoAction;
     options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Callback function requests no actions from db
     ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
@@ -3700,14 +5632,14 @@ TEST(DBTest, InPlaceUpdateCallbackNoAction) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, CompactionFilter) {
+TEST_F(DBTest, CompactionFilter) {
   Options options = CurrentOptions();
   options.max_open_files = -1;
   options.num_levels = 3;
   options.max_mem_compaction_level = 0;
   options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
   options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   // Write 100K keys, these are written to a few files in L0.
   const std::string value(10, 'x');
@@ -3741,22 +5673,25 @@ TEST(DBTest, CompactionFilter) {
   // TODO: figure out sequence number squashtoo
   int count = 0;
   int total = 0;
-  Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]);
-  iter->SeekToFirst();
-  ASSERT_OK(iter->status());
-  while (iter->Valid()) {
-    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-    ikey.sequence = -1;
-    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-    total++;
-    if (ikey.sequence != 0) {
-      count++;
+  Arena arena;
+  {
+    ScopedArenaIterator iter(
+        dbfull()->TEST_NewInternalIterator(&arena, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
     }
-    iter->Next();
   }
   ASSERT_EQ(total, 100000);
   ASSERT_EQ(count, 1);
-  delete iter;
 
   // overwrite all the 100K keys once again.
   for (int i = 0; i < 100000; i++) {
@@ -3783,8 +5718,8 @@ TEST(DBTest, CompactionFilter) {
   // filter in such a way that it deletes all keys
   options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
   options.create_if_missing = true;
-  DestroyAndReopen(&options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   // write all the keys once again.
   for (int i = 0; i < 100000; i++) {
@@ -3810,16 +5745,18 @@ TEST(DBTest, CompactionFilter) {
   ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
   ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
 
-  // Scan the entire database to ensure that nothing is left
-  iter = db_->NewIterator(ReadOptions(), handles_[1]);
-  iter->SeekToFirst();
-  count = 0;
-  while (iter->Valid()) {
-    count++;
-    iter->Next();
+  {
+    // Scan the entire database to ensure that nothing is left
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iter->SeekToFirst();
+    count = 0;
+    while (iter->Valid()) {
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 0);
   }
-  ASSERT_EQ(count, 0);
-  delete iter;
 
   // The sequence number of the remaining record
   // is not zeroed out even though it is at the
@@ -3827,31 +5764,66 @@ TEST(DBTest, CompactionFilter) {
   // TODO: remove the following or design a different
   // test
   count = 0;
-  iter = dbfull()->TEST_NewInternalIterator(handles_[1]);
-  iter->SeekToFirst();
-  ASSERT_OK(iter->status());
-  while (iter->Valid()) {
-    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-    ASSERT_NE(ikey.sequence, (unsigned)0);
-    count++;
-    iter->Next();
+  {
+    ScopedArenaIterator iter(
+        dbfull()->TEST_NewInternalIterator(&arena, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      ASSERT_NE(ikey.sequence, (unsigned)0);
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 0);
   }
-  ASSERT_EQ(count, 0);
-  delete iter;
 }
 
-TEST(DBTest, CompactionFilterWithValueChange) {
-  do {
-    Options options;
-    options.num_levels = 3;
-    options.max_mem_compaction_level = 0;
-    options.compaction_filter_factory =
-      std::make_shared<ChangeFilterFactory>();
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create bunch of 'DeleteFile'
+// entries in VersionEdit, but none of the 'AddFile's.
+TEST_F(DBTest, CompactionFilterDeletesAll) {
+  Options options;
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
 
-    // Write 100K+1 keys, these are written to a few files
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+  }
+
+  // this will produce empty file (delete compaction filter)
+  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  // empty db
+  ASSERT_TRUE(!itr->Valid());
+
+  delete itr;
+}
+
+TEST_F(DBTest, CompactionFilterWithValueChange) {
+  do {
+    Options options;
+    options.num_levels = 3;
+    options.max_mem_compaction_level = 0;
+    options.compaction_filter_factory =
+      std::make_shared<ChangeFilterFactory>();
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Write 100K+1 keys, these are written to a few files
     // in L0. We do this so that the current snapshot points
     // to the 100001 key.The compaction filter is  not invoked
     // on keys that are visible via a snapshot because we
@@ -3865,8 +5837,12 @@ TEST(DBTest, CompactionFilterWithValueChange) {
 
     // push all files to  lower levels
     ASSERT_OK(Flush(1));
-    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    if (option_config_ != kUniversalCompactionMultiLevel) {
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+      dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    } else {
+      dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    }
 
     // re-write all data again
     for (int i = 0; i < 100001; i++) {
@@ -3878,8 +5854,12 @@ TEST(DBTest, CompactionFilterWithValueChange) {
     // push all files to  lower levels. This should
     // invoke the compaction filter for all 100000 keys.
     ASSERT_OK(Flush(1));
-    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    if (option_config_ != kUniversalCompactionMultiLevel) {
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+      dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    } else {
+      dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    }
 
     // verify that all keys now have the new value that
     // was set by the compaction process.
@@ -3892,7 +5872,76 @@ TEST(DBTest, CompactionFilterWithValueChange) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, CompactionFilterContextManual) {
+TEST_F(DBTest, CompactionFilterWithMergeOperator) {
+  std::string one, two, three, four;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  PutFixed64(&four, 4);
+
+  Options options;
+  options = CurrentOptions(options);
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  // Filter out keys with value is 2.
+  options.compaction_filter_factory =
+      std::make_shared<ConditionalFilterFactory>(two);
+  DestroyAndReopen(options);
+
+  // In the same compaction, a value type needs to be deleted based on
+  // compaction filter, and there is a merge type for the key. compaction
+  // filter result is ignored.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
+  ASSERT_OK(Flush());
+  std::string newvalue = Get("foo");
+  ASSERT_EQ(newvalue, three);
+  dbfull()->CompactRange(nullptr, nullptr);
+  newvalue = Get("foo");
+  ASSERT_EQ(newvalue, three);
+
+  // value key can be deleted based on compaction filter, leaving only
+  // merge keys.
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
+  ASSERT_OK(Flush());
+  dbfull()->CompactRange(nullptr, nullptr);
+  newvalue = Get("bar");
+  ASSERT_EQ("NOT_FOUND", newvalue);
+  ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
+  ASSERT_OK(Flush());
+  dbfull()->CompactRange(nullptr, nullptr);
+  newvalue = Get("bar");
+  ASSERT_EQ(two, two);
+
+  // Compaction filter never applies to merge keys.
+  ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
+  ASSERT_OK(Flush());
+  newvalue = Get("foobar");
+  ASSERT_EQ(newvalue, three);
+  dbfull()->CompactRange(nullptr, nullptr);
+  newvalue = Get("foobar");
+  ASSERT_EQ(newvalue, three);
+
+  // In the same compaction, both of value type and merge type keys need to be
+  // deleted based on compaction filter, and there is a merge type for the key.
+  // For both keys, compaction filter results are ignored.
+  ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
+  ASSERT_OK(Flush());
+  newvalue = Get("barfoo");
+  ASSERT_EQ(newvalue, four);
+  dbfull()->CompactRange(nullptr, nullptr);
+  newvalue = Get("barfoo");
+  ASSERT_EQ(newvalue, four);
+}
+
+TEST_F(DBTest, CompactionFilterContextManual) {
   KeepFilterFactory* filter = new KeepFilterFactory();
 
   Options options = CurrentOptions();
@@ -3900,7 +5949,7 @@ TEST(DBTest, CompactionFilterContextManual) {
   options.compaction_filter_factory.reset(filter);
   options.compression = kNoCompression;
   options.level0_file_num_compaction_trigger = 8;
-  Reopen(&options);
+  Reopen(options);
   int num_keys_per_file = 400;
   for (int j = 0; j < 3; j++) {
     // Write several keys.
@@ -3923,27 +5972,29 @@ TEST(DBTest, CompactionFilterContextManual) {
                                                  // set this flag.
   dbfull()->CompactRange(nullptr, nullptr);
   ASSERT_EQ(cfilter_count, 700);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_EQ(NumSortedRuns(0), 1);
 
   // Verify total number of keys is correct after manual compaction.
-  int count = 0;
-  int total = 0;
-  Iterator* iter = dbfull()->TEST_NewInternalIterator();
-  iter->SeekToFirst();
-  ASSERT_OK(iter->status());
-  while (iter->Valid()) {
-    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-    ikey.sequence = -1;
-    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-    total++;
-    if (ikey.sequence != 0) {
-      count++;
+  {
+    int count = 0;
+    int total = 0;
+    Arena arena;
+    ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
     }
-    iter->Next();
+    ASSERT_EQ(total, 700);
+    ASSERT_EQ(count, 1);
   }
-  ASSERT_EQ(total, 700);
-  ASSERT_EQ(count, 1);
-  delete iter;
 }
 
 class KeepFilterV2 : public CompactionFilterV2 {
@@ -4066,7 +6117,7 @@ class ChangeFilterFactoryV2 : public CompactionFilterFactoryV2 {
   }
 };
 
-TEST(DBTest, CompactionFilterV2) {
+TEST_F(DBTest, CompactionFilterV2) {
   Options options = CurrentOptions();
   options.num_levels = 3;
   options.max_mem_compaction_level = 0;
@@ -4080,7 +6131,7 @@ TEST(DBTest, CompactionFilterV2) {
   // compaction filter buffer using universal compaction
   option_config_ = kUniversalCompaction;
   options.compaction_style = (rocksdb::CompactionStyle)1;
-  Reopen(&options);
+  Reopen(options);
 
   // Write 100K keys, these are written to a few files in L0.
   const std::string value(10, 'x');
@@ -4095,37 +6146,39 @@ TEST(DBTest, CompactionFilterV2) {
   dbfull()->TEST_CompactRange(0, nullptr, nullptr);
   dbfull()->TEST_CompactRange(1, nullptr, nullptr);
 
-  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_EQ(NumSortedRuns(0), 1);
 
   // All the files are in the lowest level.
   int count = 0;
   int total = 0;
-  Iterator* iter = dbfull()->TEST_NewInternalIterator();
-  iter->SeekToFirst();
-  ASSERT_OK(iter->status());
-  while (iter->Valid()) {
-    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-    ikey.sequence = -1;
-    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-    total++;
-    if (ikey.sequence != 0) {
-      count++;
+  {
+    Arena arena;
+    ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
     }
-    iter->Next();
   }
 
   ASSERT_EQ(total, 100000);
   // 1 snapshot only. Since we are using universal compacton,
   // the sequence no is cleared for better compression
   ASSERT_EQ(count, 1);
-  delete iter;
 
   // create a new database with the compaction
   // filter in such a way that it deletes all keys
   options.compaction_filter_factory_v2 =
     std::make_shared<DeleteFilterFactoryV2>(prefix_extractor.get());
   options.create_if_missing = true;
-  DestroyAndReopen(&options);
+  DestroyAndReopen(options);
 
   // write all the keys once again.
   for (int i = 0; i < 100000; i++) {
@@ -4142,7 +6195,7 @@ TEST(DBTest, CompactionFilterV2) {
   ASSERT_EQ(NumTableFilesAtLevel(1), 0);
 
   // Scan the entire database to ensure that nothing is left
-  iter = db_->NewIterator(ReadOptions());
+  Iterator* iter = db_->NewIterator(ReadOptions());
   iter->SeekToFirst();
   count = 0;
   while (iter->Valid()) {
@@ -4154,7 +6207,7 @@ TEST(DBTest, CompactionFilterV2) {
   delete iter;
 }
 
-TEST(DBTest, CompactionFilterV2WithValueChange) {
+TEST_F(DBTest, CompactionFilterV2WithValueChange) {
   Options options = CurrentOptions();
   options.num_levels = 3;
   options.max_mem_compaction_level = 0;
@@ -4167,7 +6220,7 @@ TEST(DBTest, CompactionFilterV2WithValueChange) {
   option_config_ = kUniversalCompaction;
   options.compaction_style = (rocksdb::CompactionStyle)1;
   options = CurrentOptions(options);
-  Reopen(&options);
+  Reopen(options);
 
   // Write 100K+1 keys, these are written to a few files
   // in L0. We do this so that the current snapshot points
@@ -4196,7 +6249,7 @@ TEST(DBTest, CompactionFilterV2WithValueChange) {
   }
 }
 
-TEST(DBTest, CompactionFilterV2NULLPrefix) {
+TEST_F(DBTest, CompactionFilterV2NULLPrefix) {
   Options options = CurrentOptions();
   options.num_levels = 3;
   options.max_mem_compaction_level = 0;
@@ -4208,7 +6261,7 @@ TEST(DBTest, CompactionFilterV2NULLPrefix) {
   // compaction filter buffer using universal compaction
   option_config_ = kUniversalCompaction;
   options.compaction_style = (rocksdb::CompactionStyle)1;
-  Reopen(&options);
+  Reopen(options);
 
   // Write 100K+1 keys, these are written to a few files
   // in L0. We do this so that the current snapshot points
@@ -4242,16 +6295,16 @@ TEST(DBTest, CompactionFilterV2NULLPrefix) {
   for (int i = 1; i < 100000; i++) {
     char key[100];
     snprintf(key, sizeof(key), "%08d%010d", i, i);
-    std::string newvalue = Get(key);
+    newvalue = Get(key);
     ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
   }
 }
 
-TEST(DBTest, SparseMerge) {
+TEST_F(DBTest, SparseMerge) {
   do {
     Options options = CurrentOptions();
     options.compression = kNoCompression;
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     FillLevels("A", "Z", 1);
 
@@ -4303,17 +6356,18 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) {
   return result;
 }
 
-TEST(DBTest, ApproximateSizes) {
+TEST_F(DBTest, ApproximateSizes) {
   do {
     Options options;
     options.write_buffer_size = 100000000;        // Large write buffer
     options.compression = kNoCompression;
+    options.create_if_missing = true;
     options = CurrentOptions(options);
-    DestroyAndReopen();
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
     ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
 
     // Write 8MB (80 values, each 100K)
@@ -4331,7 +6385,7 @@ TEST(DBTest, ApproximateSizes) {
 
     // Check sizes across recovery by reopening a few times
     for (int run = 0; run < 3; run++) {
-      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
 
       for (int compact_start = 0; compact_start < N; compact_start += 10) {
         for (int i = 0; i < N; i += 10) {
@@ -4355,14 +6409,15 @@ TEST(DBTest, ApproximateSizes) {
       ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
     }
     // ApproximateOffsetOf() is not yet implemented in plain table format.
-  } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable));
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable | kSkipHashIndex));
 }
 
-TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
   do {
     Options options = CurrentOptions();
     options.compression = kNoCompression;
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     Random rnd(301);
     std::string big1 = RandomString(&rnd, 100000);
@@ -4377,7 +6432,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
 
     // Check sizes across recovery by reopening a few times
     for (int run = 0; run < 3; run++) {
-      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
 
       ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0));
       ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000));
@@ -4397,9 +6452,9 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
   } while (ChangeOptions(kSkipPlainTable));
 }
 
-TEST(DBTest, IteratorPinsRef) {
+TEST_F(DBTest, IteratorPinsRef) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     Put(1, "foo", "hello");
 
     // Get iterator that will yield the current contents of the DB.
@@ -4423,18 +6478,32 @@ TEST(DBTest, IteratorPinsRef) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, Snapshot) {
+TEST_F(DBTest, Snapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
     Put(0, "foo", "0v1");
     Put(1, "foo", "1v1");
+
     const Snapshot* s1 = db_->GetSnapshot();
+    ASSERT_EQ(1U, GetNumSnapshots());
+    uint64_t time_snap1 = GetTimeOldestSnapshots();
+    ASSERT_GT(time_snap1, 0U);
     Put(0, "foo", "0v2");
     Put(1, "foo", "1v2");
+
+    env_->addon_time_++;
+
     const Snapshot* s2 = db_->GetSnapshot();
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
     Put(0, "foo", "0v3");
     Put(1, "foo", "1v3");
+
     const Snapshot* s3 = db_->GetSnapshot();
+    ASSERT_EQ(3U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
 
     Put(0, "foo", "0v4");
     Put(1, "foo", "1v4");
@@ -4448,6 +6517,8 @@ TEST(DBTest, Snapshot) {
     ASSERT_EQ("1v4", Get(1, "foo"));
 
     db_->ReleaseSnapshot(s3);
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
     ASSERT_EQ("0v1", Get(0, "foo", s1));
     ASSERT_EQ("1v1", Get(1, "foo", s1));
     ASSERT_EQ("0v2", Get(0, "foo", s2));
@@ -4460,16 +6531,23 @@ TEST(DBTest, Snapshot) {
     ASSERT_EQ("1v2", Get(1, "foo", s2));
     ASSERT_EQ("0v4", Get(0, "foo"));
     ASSERT_EQ("1v4", Get(1, "foo"));
+    ASSERT_EQ(1U, GetNumSnapshots());
+    ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
 
     db_->ReleaseSnapshot(s2);
+    ASSERT_EQ(0U, GetNumSnapshots());
     ASSERT_EQ("0v4", Get(0, "foo"));
     ASSERT_EQ("1v4", Get(1, "foo"));
   } while (ChangeOptions(kSkipHashCuckoo));
 }
 
-TEST(DBTest, HiddenValuesAreRemoved) {
+TEST_F(DBTest, HiddenValuesAreRemoved) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    Options options = CurrentOptions(options_override);
+    options.max_background_flushes = 0;
+    CreateAndReopenWithCF({"pikachu"}, options);
     Random rnd(301);
     FillLevels("a", "z", 1);
 
@@ -4499,15 +6577,17 @@ TEST(DBTest, HiddenValuesAreRemoved) {
     // ApproximateOffsetOf() is not yet implemented in plain table format,
     // which is used by Size().
     // skip HashCuckooRep as it does not support snapshot
-  } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable |
-                         kSkipHashCuckoo));
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable | kSkipHashCuckoo));
 }
 
-TEST(DBTest, CompactBetweenSnapshots) {
+TEST_F(DBTest, CompactBetweenSnapshots) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    Options options = CurrentOptions();
+    Options options = CurrentOptions(options_override);
     options.disable_auto_compactions = true;
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, options);
     Random rnd(301);
     FillLevels("a", "z", 1);
 
@@ -4556,11 +6636,13 @@ TEST(DBTest, CompactBetweenSnapshots) {
     ASSERT_EQ("sixth", Get(1, "foo"));
     ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
     // skip HashCuckooRep as it does not support snapshot
-  } while (ChangeOptions(kSkipHashCuckoo));
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction));
 }
 
-TEST(DBTest, DeletionMarkers1) {
-  CreateAndReopenWithCF({"pikachu"});
+TEST_F(DBTest, DeletionMarkers1) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 0;
+  CreateAndReopenWithCF({"pikachu"}, options);
   Put(1, "foo", "v1");
   ASSERT_OK(Flush(1));
   const int last = CurrentOptions().max_mem_compaction_level;
@@ -4594,8 +6676,10 @@ TEST(DBTest, DeletionMarkers1) {
   ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
 }
 
-TEST(DBTest, DeletionMarkers2) {
-  CreateAndReopenWithCF({"pikachu"});
+TEST_F(DBTest, DeletionMarkers2) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 0;
+  CreateAndReopenWithCF({"pikachu"}, options);
   Put(1, "foo", "v1");
   ASSERT_OK(Flush(1));
   const int last = CurrentOptions().max_mem_compaction_level;
@@ -4622,9 +6706,11 @@ TEST(DBTest, DeletionMarkers2) {
   ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
 }
 
-TEST(DBTest, OverlapInLevel0) {
+TEST_F(DBTest, OverlapInLevel0) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    Options options = CurrentOptions();
+    options.max_background_flushes = 0;
+    CreateAndReopenWithCF({"pikachu"}, options);
     int tmp = CurrentOptions().max_mem_compaction_level;
     ASSERT_EQ(tmp, 2) << "Fix test to match config";
 
@@ -4662,108 +6748,114 @@ TEST(DBTest, OverlapInLevel0) {
     Flush(1);
     ASSERT_EQ("3", FilesPerLevel(1));
     ASSERT_EQ("NOT_FOUND", Get(1, "600"));
-  } while (ChangeOptions(kSkipUniversalCompaction));
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
 }
 
-TEST(DBTest, L0_CompactionBug_Issue44_a) {
+TEST_F(DBTest, L0_CompactionBug_Issue44_a) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "b", "v"));
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_OK(Delete(1, "b"));
     ASSERT_OK(Delete(1, "a"));
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_OK(Delete(1, "a"));
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "a", "v"));
-    ReopenWithColumnFamilies({"default", "pikachu"});
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("(a->v)", Contents(1));
     env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
     ASSERT_EQ("(a->v)", Contents(1));
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, L0_CompactionBug_Issue44_b) {
+TEST_F(DBTest, L0_CompactionBug_Issue44_b) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     Delete(1, "e");
     Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     Put(1, "c", "cv");
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     Put(1, "", "");
     env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     Put(1, "d", "dv");
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     Delete(1, "d");
     Delete(1, "b");
-    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
     ASSERT_EQ("(->)(c->cv)", Contents(1));
     env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
     ASSERT_EQ("(->)(c->cv)", Contents(1));
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, ComparatorCheck) {
+TEST_F(DBTest, ComparatorCheck) {
   class NewComparator : public Comparator {
    public:
-    virtual const char* Name() const { return "rocksdb.NewComparator"; }
-    virtual int Compare(const Slice& a, const Slice& b) const {
+    virtual const char* Name() const override {
+      return "rocksdb.NewComparator";
+    }
+    virtual int Compare(const Slice& a, const Slice& b) const override {
       return BytewiseComparator()->Compare(a, b);
     }
-    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+    virtual void FindShortestSeparator(std::string* s,
+                                       const Slice& l) const override {
       BytewiseComparator()->FindShortestSeparator(s, l);
     }
-    virtual void FindShortSuccessor(std::string* key) const {
+    virtual void FindShortSuccessor(std::string* key) const override {
       BytewiseComparator()->FindShortSuccessor(key);
     }
   };
   Options new_options, options;
   NewComparator cmp;
   do {
-    CreateAndReopenWithCF({"pikachu"});
     options = CurrentOptions();
+    CreateAndReopenWithCF({"pikachu"}, options);
     new_options = CurrentOptions();
     new_options.comparator = &cmp;
     // only the non-default column family has non-matching comparator
     Status s = TryReopenWithColumnFamilies({"default", "pikachu"},
-                                           {&options, &new_options});
+        std::vector<Options>({options, new_options}));
     ASSERT_TRUE(!s.ok());
     ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
         << s.ToString();
-  } while (ChangeCompactOptions(&new_options));
+  } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, CustomComparator) {
+TEST_F(DBTest, CustomComparator) {
   class NumberComparator : public Comparator {
    public:
-    virtual const char* Name() const { return "test.NumberComparator"; }
-    virtual int Compare(const Slice& a, const Slice& b) const {
+    virtual const char* Name() const override {
+      return "test.NumberComparator";
+    }
+    virtual int Compare(const Slice& a, const Slice& b) const override {
       return ToNumber(a) - ToNumber(b);
     }
-    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+    virtual void FindShortestSeparator(std::string* s,
+                                       const Slice& l) const override {
       ToNumber(*s);     // Check format
       ToNumber(l);      // Check format
     }
-    virtual void FindShortSuccessor(std::string* key) const {
+    virtual void FindShortSuccessor(std::string* key) const override {
       ToNumber(*key);   // Check format
     }
    private:
     static int ToNumber(const Slice& x) {
       // Check that there are no extra characters.
-      ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']')
+      EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
           << EscapeString(x);
       int val;
       char ignored;
-      ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+      EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
           << EscapeString(x);
       return val;
     }
@@ -4774,11 +6866,10 @@ TEST(DBTest, CustomComparator) {
     new_options = CurrentOptions();
     new_options.create_if_missing = true;
     new_options.comparator = &cmp;
-    new_options.filter_policy = nullptr;     // Cannot use bloom filters
     new_options.write_buffer_size = 1000;  // Compact more often
     new_options = CurrentOptions(new_options);
-    DestroyAndReopen(&new_options);
-    CreateAndReopenWithCF({"pikachu"}, &new_options);
+    DestroyAndReopen(new_options);
+    CreateAndReopenWithCF({"pikachu"}, new_options);
     ASSERT_OK(Put(1, "[10]", "ten"));
     ASSERT_OK(Put(1, "[0x14]", "twenty"));
     for (int i = 0; i < 2; i++) {
@@ -4799,11 +6890,13 @@ TEST(DBTest, CustomComparator) {
       }
       Compact(1, "[0]", "[1000000]");
     }
-  } while (ChangeCompactOptions(&new_options));
+  } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, ManualCompaction) {
-  CreateAndReopenWithCF({"pikachu"});
+TEST_F(DBTest, ManualCompaction) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 0;
+  CreateAndReopenWithCF({"pikachu"}, options);
   ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
       << "Need to update this test to match kMaxMemCompactLevel";
 
@@ -4840,31 +6933,159 @@ TEST(DBTest, ManualCompaction) {
     ASSERT_EQ("0,0,1", FilesPerLevel(1));
 
     if (iter == 0) {
-      Options options = CurrentOptions();
+      options = CurrentOptions();
+      options.max_background_flushes = 0;
       options.num_levels = 3;
       options.create_if_missing = true;
-      DestroyAndReopen(&options);
-      CreateAndReopenWithCF({"pikachu"}, &options);
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
     }
   }
 
 }
 
-TEST(DBTest, DBOpen_Options) {
-  std::string dbname = test::TmpDir() + "/db_options_test";
-  ASSERT_OK(DestroyDB(dbname, Options()));
+class DBTestUniversalManualCompactionOutputPathId
+    : public DBTestUniversalCompactionBase {};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+       ManualCompactionOutputPathId) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_paths.emplace_back(dbname_, 1000000000);
+  options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.target_file_size_base = 1 << 30;  // Big size
+  options.level0_file_num_compaction_trigger = 10;
+  Destroy(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  MakeTables(3, "p", "q", 1);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(3, TotalLiveFiles(1));
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 0
+  db_->CompactRange(handles_[1], nullptr, nullptr, false, -1, 1);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  MakeTables(1, "p", "q", 1);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 0
+  db_->CompactRange(handles_[1], nullptr, nullptr, false, -1, 0);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Fail when compacting to an invalid path ID
+  ASSERT_TRUE(db_->CompactRange(handles_[1], nullptr, nullptr, false, -1, 2)
+                  .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId,
+                        DBTestUniversalManualCompactionOutputPathId,
+                        ::testing::Values(1, 8));
+
+TEST_F(DBTest, ManualLevelCompactionOutputPathId) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+  options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+  options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+  options.max_background_flushes = 1;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
+      << "Need to update this test to match kMaxMemCompactLevel";
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("3", FilesPerLevel(1));
+    ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("3", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("3", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p1", "p9", 1);
+    ASSERT_EQ("0,1", FilesPerLevel(1));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("3,1", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f", 1);
+    ASSERT_EQ("0,2", FilesPerLevel(1));
+    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("1,2", FilesPerLevel(1));
+    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+    db_->CompactRange(handles_[1], nullptr, nullptr, false, 1, 1);
+    ASSERT_EQ("0,1", FilesPerLevel(1));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    if (iter == 0) {
+      DestroyAndReopen(options);
+      options = CurrentOptions();
+      options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+      options.max_background_flushes = 1;
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+TEST_F(DBTest, DBOpen_Options) {
+  Options options = CurrentOptions();
+  std::string dbname = test::TmpDir(env_) + "/db_options_test";
+  ASSERT_OK(DestroyDB(dbname, options));
 
   // Does not exist, and create_if_missing == false: error
   DB* db = nullptr;
-  Options opts;
-  opts.create_if_missing = false;
-  Status s = DB::Open(opts, dbname, &db);
+  options.create_if_missing = false;
+  Status s = DB::Open(options, dbname, &db);
   ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
   ASSERT_TRUE(db == nullptr);
 
   // Does not exist, and create_if_missing == true: OK
-  opts.create_if_missing = true;
-  s = DB::Open(opts, dbname, &db);
+  options.create_if_missing = true;
+  s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
 
@@ -4872,16 +7093,16 @@ TEST(DBTest, DBOpen_Options) {
   db = nullptr;
 
   // Does exist, and error_if_exists == true: error
-  opts.create_if_missing = false;
-  opts.error_if_exists = true;
-  s = DB::Open(opts, dbname, &db);
+  options.create_if_missing = false;
+  options.error_if_exists = true;
+  s = DB::Open(options, dbname, &db);
   ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
   ASSERT_TRUE(db == nullptr);
 
   // Does exist, and error_if_exists == false: OK
-  opts.create_if_missing = true;
-  opts.error_if_exists = false;
-  s = DB::Open(opts, dbname, &db);
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
 
@@ -4889,76 +7110,87 @@ TEST(DBTest, DBOpen_Options) {
   db = nullptr;
 }
 
-TEST(DBTest, DBOpen_Change_NumLevels) {
-  Options opts;
-  opts.create_if_missing = true;
-  DestroyAndReopen(&opts);
+TEST_F(DBTest, DBOpen_Change_NumLevels) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.max_background_flushes = 0;
+  DestroyAndReopen(options);
   ASSERT_TRUE(db_ != nullptr);
-  CreateAndReopenWithCF({"pikachu"}, &opts);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   ASSERT_OK(Put(1, "a", "123"));
   ASSERT_OK(Put(1, "b", "234"));
   db_->CompactRange(handles_[1], nullptr, nullptr);
   Close();
 
-  opts.create_if_missing = false;
-  opts.num_levels = 2;
-  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &opts);
+  options.create_if_missing = false;
+  options.num_levels = 2;
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
   ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
   ASSERT_TRUE(db_ == nullptr);
 }
 
-TEST(DBTest, DestroyDBMetaDatabase) {
-  std::string dbname = test::TmpDir() + "/db_meta";
+TEST_F(DBTest, DestroyDBMetaDatabase) {
+  std::string dbname = test::TmpDir(env_) + "/db_meta";
+  ASSERT_OK(env_->CreateDirIfMissing(dbname));
   std::string metadbname = MetaDatabaseName(dbname, 0);
+  ASSERT_OK(env_->CreateDirIfMissing(metadbname));
   std::string metametadbname = MetaDatabaseName(metadbname, 0);
+  ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
 
   // Destroy previous versions if they exist. Using the long way.
-  ASSERT_OK(DestroyDB(metametadbname, Options()));
-  ASSERT_OK(DestroyDB(metadbname, Options()));
-  ASSERT_OK(DestroyDB(dbname, Options()));
+  Options options = CurrentOptions();
+  ASSERT_OK(DestroyDB(metametadbname, options));
+  ASSERT_OK(DestroyDB(metadbname, options));
+  ASSERT_OK(DestroyDB(dbname, options));
 
   // Setup databases
-  Options opts;
-  opts.create_if_missing = true;
   DB* db = nullptr;
-  ASSERT_OK(DB::Open(opts, dbname, &db));
+  ASSERT_OK(DB::Open(options, dbname, &db));
   delete db;
   db = nullptr;
-  ASSERT_OK(DB::Open(opts, metadbname, &db));
+  ASSERT_OK(DB::Open(options, metadbname, &db));
   delete db;
   db = nullptr;
-  ASSERT_OK(DB::Open(opts, metametadbname, &db));
+  ASSERT_OK(DB::Open(options, metametadbname, &db));
   delete db;
   db = nullptr;
 
   // Delete databases
-  ASSERT_OK(DestroyDB(dbname, Options()));
+  ASSERT_OK(DestroyDB(dbname, options));
 
   // Check if deletion worked.
-  opts.create_if_missing = false;
-  ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok());
+  options.create_if_missing = false;
+  ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
 }
 
-// Check that number of files does not grow when we are out of space
-TEST(DBTest, NoSpace) {
+// Check that number of files does not grow when writes are dropped
+TEST_F(DBTest, DropWrites) {
   do {
     Options options = CurrentOptions();
     options.env = env_;
     options.paranoid_checks = false;
-    Reopen(&options);
+    Reopen(options);
 
     ASSERT_OK(Put("foo", "v1"));
     ASSERT_EQ("v1", Get("foo"));
     Compact("a", "z");
-    const int num_files = CountFiles();
-    env_->no_space_.Release_Store(env_);   // Force out-of-space errors
+    const size_t num_files = CountFiles();
+    // Force out-of-space errors
+    env_->drop_writes_.store(true, std::memory_order_release);
     env_->sleep_counter_.Reset();
     for (int i = 0; i < 5; i++) {
-      for (int level = 0; level < dbfull()->NumberLevels()-1; level++) {
-        dbfull()->TEST_CompactRange(level, nullptr, nullptr);
+      if (option_config_ != kUniversalCompactionMultiLevel) {
+        for (int level = 0; level < dbfull()->NumberLevels(); level++) {
+          if (level > 0 && level == dbfull()->NumberLevels() - 1) {
+            break;
+          }
+          dbfull()->TEST_CompactRange(level, nullptr, nullptr);
+        }
+      } else {
+        dbfull()->CompactRange(nullptr, nullptr);
       }
     }
 
@@ -4966,7 +7198,7 @@ TEST(DBTest, NoSpace) {
     ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
     ASSERT_EQ("5", property_value);
 
-    env_->no_space_.Release_Store(nullptr);
+    env_->drop_writes_.store(false, std::memory_order_release);
     ASSERT_LT(CountFiles(), num_files + 3);
 
     // Check that compaction attempts slept after errors
@@ -4975,50 +7207,64 @@ TEST(DBTest, NoSpace) {
 }
 
 // Check background error counter bumped on flush failures.
-TEST(DBTest, NoSpaceFlush) {
+TEST_F(DBTest, DropWritesFlush) {
   do {
     Options options = CurrentOptions();
     options.env = env_;
     options.max_background_flushes = 1;
-    Reopen(&options);
+    Reopen(options);
 
     ASSERT_OK(Put("foo", "v1"));
-    env_->no_space_.Release_Store(env_);  // Force out-of-space errors
+    // Force out-of-space errors
+    env_->drop_writes_.store(true, std::memory_order_release);
 
     std::string property_value;
     // Background error count is 0 now.
     ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
     ASSERT_EQ("0", property_value);
 
-    dbfull()->TEST_FlushMemTable(false);
+    dbfull()->TEST_FlushMemTable(true);
+
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("1", property_value);
 
-    // Wait 300 milliseconds or background-errors turned 1 from 0.
-    int time_to_sleep_limit = 300000;
-    while (time_to_sleep_limit > 0) {
-      int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit;
-      time_to_sleep_limit -= to_sleep;
-      env_->SleepForMicroseconds(to_sleep);
+    env_->drop_writes_.store(false, std::memory_order_release);
+  } while (ChangeCompactOptions());
+}
 
-      ASSERT_TRUE(
-          db_->GetProperty("rocksdb.background-errors", &property_value));
-      if (property_value == "1") {
-        break;
-      }
+// Check that CompactRange() returns failure if there is not enough space left
+// on device
+TEST_F(DBTest, NoSpaceCompactRange) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    // generate 5 tables
+    for (int i = 0; i < 5; ++i) {
+      ASSERT_OK(Put(Key(i), Key(i) + "v"));
+      ASSERT_OK(Flush());
     }
-    ASSERT_EQ("1", property_value);
 
-    env_->no_space_.Release_Store(nullptr);
+    // Force out-of-space errors
+    env_->no_space_.store(true, std::memory_order_release);
+
+    Status s = db_->CompactRange(nullptr, nullptr);
+    ASSERT_TRUE(s.IsIOError());
+
+    env_->no_space_.store(false, std::memory_order_release);
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, NonWritableFileSystem) {
+TEST_F(DBTest, NonWritableFileSystem) {
   do {
     Options options = CurrentOptions();
     options.write_buffer_size = 1000;
     options.env = env_;
-    Reopen(&options);
+    Reopen(options);
     ASSERT_OK(Put("foo", "v1"));
-    env_->non_writable_.Release_Store(env_); // Force errors for new files
+    env_->non_writeable_rate_.store(100);
     std::string big(100000, 'x');
     int errors = 0;
     for (int i = 0; i < 20; i++) {
@@ -5028,11 +7274,11 @@ TEST(DBTest, NonWritableFileSystem) {
       }
     }
     ASSERT_GT(errors, 0);
-    env_->non_writable_.Release_Store(nullptr);
+    env_->non_writeable_rate_.store(0);
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, ManifestWriteError) {
+TEST_F(DBTest, ManifestWriteError) {
   // Test for the following problem:
   // (a) Compaction produces file F
   // (b) Log record containing F is written to MANIFEST file, but Sync() fails
@@ -5042,7 +7288,7 @@ TEST(DBTest, ManifestWriteError) {
   // We iterate twice.  In the second iteration, everything is the
   // same except the log record never makes it to the MANIFEST file.
   for (int iter = 0; iter < 2; iter++) {
-    port::AtomicPointer* error_type = (iter == 0)
+    std::atomic<bool>* error_type = (iter == 0)
         ? &env_->manifest_sync_error_
         : &env_->manifest_write_error_;
 
@@ -5051,7 +7297,8 @@ TEST(DBTest, ManifestWriteError) {
     options.env = env_;
     options.create_if_missing = true;
     options.error_if_exists = false;
-    DestroyAndReopen(&options);
+    options.max_background_flushes = 0;
+    DestroyAndReopen(options);
     ASSERT_OK(Put("foo", "bar"));
     ASSERT_EQ("bar", Get("foo"));
 
@@ -5062,18 +7309,18 @@ TEST(DBTest, ManifestWriteError) {
     ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level
 
     // Merging compaction (will fail)
-    error_type->Release_Store(env_);
+    error_type->store(true, std::memory_order_release);
     dbfull()->TEST_CompactRange(last, nullptr, nullptr);  // Should fail
     ASSERT_EQ("bar", Get("foo"));
 
     // Recovery: should not lose data
-    error_type->Release_Store(nullptr);
-    Reopen(&options);
+    error_type->store(false, std::memory_order_release);
+    Reopen(options);
     ASSERT_EQ("bar", Get("foo"));
   }
 }
 
-TEST(DBTest, PutFailsParanoid) {
+TEST_F(DBTest, PutFailsParanoid) {
   // Test the following:
   // (a) A random put fails in paranoid mode (simulate by sync fail)
   // (b) All other puts have to fail, even if writes would succeed
@@ -5084,17 +7331,17 @@ TEST(DBTest, PutFailsParanoid) {
   options.create_if_missing = true;
   options.error_if_exists = false;
   options.paranoid_checks = true;
-  DestroyAndReopen(&options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
   Status s;
 
   ASSERT_OK(Put(1, "foo", "bar"));
   ASSERT_OK(Put(1, "foo1", "bar1"));
   // simulate error
-  env_->log_write_error_.Release_Store(env_);
+  env_->log_write_error_.store(true, std::memory_order_release);
   s = Put(1, "foo2", "bar2");
   ASSERT_TRUE(!s.ok());
-  env_->log_write_error_.Release_Store(nullptr);
+  env_->log_write_error_.store(false, std::memory_order_release);
   s = Put(1, "foo3", "bar3");
   // the next put should fail, too
   ASSERT_TRUE(!s.ok());
@@ -5103,27 +7350,27 @@ TEST(DBTest, PutFailsParanoid) {
 
   // do the same thing with paranoid checks off
   options.paranoid_checks = false;
-  DestroyAndReopen(&options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   ASSERT_OK(Put(1, "foo", "bar"));
   ASSERT_OK(Put(1, "foo1", "bar1"));
   // simulate error
-  env_->log_write_error_.Release_Store(env_);
+  env_->log_write_error_.store(true, std::memory_order_release);
   s = Put(1, "foo2", "bar2");
   ASSERT_TRUE(!s.ok());
-  env_->log_write_error_.Release_Store(nullptr);
+  env_->log_write_error_.store(false, std::memory_order_release);
   s = Put(1, "foo3", "bar3");
   // the next put should NOT fail
   ASSERT_TRUE(s.ok());
 }
 
-TEST(DBTest, FilesDeletedAfterCompaction) {
+TEST_F(DBTest, FilesDeletedAfterCompaction) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "foo", "v2"));
     Compact(1, "a", "z");
-    const int num_files = CountLiveFiles();
+    const size_t num_files = CountLiveFiles();
     for (int i = 0; i < 10; i++) {
       ASSERT_OK(Put(1, "foo", "v2"));
       Compact(1, "a", "z");
@@ -5132,14 +7379,19 @@ TEST(DBTest, FilesDeletedAfterCompaction) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, BloomFilter) {
+TEST_F(DBTest, BloomFilter) {
   do {
-    env_->count_random_reads_ = true;
     Options options = CurrentOptions();
+    env_->count_random_reads_ = true;
     options.env = env_;
-    options.no_block_cache = true;
-    options.filter_policy = NewBloomFilterPolicy(10);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    // ChangeCompactOptions() only changes compaction style, which does not
+    // trigger reset of table_factory
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     // Populate multiple layers
     const int N = 10000;
@@ -5153,7 +7405,7 @@ TEST(DBTest, BloomFilter) {
     Flush(1);
 
     // Prevent auto compactions triggered by seeks
-    env_->delay_sstable_sync_.Release_Store(env_);
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
 
     // Lookup present keys.  Should rarely read from small sstable.
     env_->random_read_counter_.Reset();
@@ -5174,17 +7426,176 @@ TEST(DBTest, BloomFilter) {
     fprintf(stderr, "%d missing => %d reads\n", N, reads);
     ASSERT_LE(reads, 3*N/100);
 
-    env_->delay_sstable_sync_.Release_Store(nullptr);
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
     Close();
-    delete options.filter_policy;
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, SnapshotFiles) {
+TEST_F(DBTest, BloomFilterRate) {
+  while (ChangeFilterOptions()) {
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    // Add a large key to make the file contain wide range
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
+
+    // Check if they can be found
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+    // Check if filter is useful
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333)));
+    }
+    ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98);
+  }
+}
+
+TEST_F(DBTest, BloomFilterCompatibility) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  // Create with block based filter
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  Flush(1);
+
+  // Check db with full filter
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Check if they can be found
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
+  }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+}
+
+TEST_F(DBTest, BloomFilterReverseCompatibility) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  // Create with full filter
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  Flush(1);
+
+  // Check db with block_based filter
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Check if they can be found
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
+  }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+}
+
+namespace {
+// A wrapped bloom over default FilterPolicy
+class WrappedBloom : public FilterPolicy {
+ public:
+  explicit WrappedBloom(int bits_per_key) :
+        filter_(NewBloomFilterPolicy(bits_per_key)),
+        counter_(0) {}
+
+  ~WrappedBloom() { delete filter_; }
+
+  const char* Name() const override { return "WrappedRocksDbFilterPolicy"; }
+
+  void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst)
+      const override {
+    std::unique_ptr<rocksdb::Slice[]> user_keys(new rocksdb::Slice[n]);
+    for (int i = 0; i < n; ++i) {
+      user_keys[i] = convertKey(keys[i]);
+    }
+    return filter_->CreateFilter(user_keys.get(), n, dst);
+  }
+
+  bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter)
+      const override {
+    counter_++;
+    return filter_->KeyMayMatch(convertKey(key), filter);
+  }
+
+  uint32_t GetCounter() { return counter_; }
+
+ private:
+  const FilterPolicy* filter_;
+  mutable uint32_t counter_;
+
+  rocksdb::Slice convertKey(const rocksdb::Slice& key) const {
+    return key;
+  }
+};
+}  // namespace
+
+TEST_F(DBTest, BloomFilterWrapper) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  WrappedBloom* policy = new WrappedBloom(10);
+  table_options.filter_policy.reset(policy);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  // Add a large key to make the file contain wide range
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  ASSERT_EQ(0U, policy->GetCounter());
+  Flush(1);
+
+  // Check if they can be found
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
+  }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ(1U * maxKey, policy->GetCounter());
+
+  // Check if filter is useful
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333)));
+  }
+  ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98);
+  ASSERT_EQ(2U * maxKey, policy->GetCounter());
+}
+
+TEST_F(DBTest, SnapshotFiles) {
   do {
     Options options = CurrentOptions();
     options.write_buffer_size = 100000000;        // Large write buffer
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     Random rnd(301);
 
@@ -5214,8 +7625,7 @@ TEST(DBTest, SnapshotFiles) {
 
     // copy these files to a new snapshot directory
     std::string snapdir = dbname_ + ".snapdir/";
-    std::string mkdir = "mkdir -p " + snapdir;
-    ASSERT_EQ(system(mkdir.c_str()), 0);
+    ASSERT_OK(env_->CreateDirIfMissing(snapdir));
 
     for (unsigned int i = 0; i < files.size(); i++) {
       // our clients require that GetLiveFiles returns
@@ -5243,7 +7653,6 @@ TEST(DBTest, SnapshotFiles) {
 
     // release file snapshot
     dbfull()->DisableFileDeletions();
-
     // overwrite one key, this key should not appear in the snapshot
     std::vector<std::string> extras;
     for (unsigned int i = 0; i < 1; i++) {
@@ -5258,6 +7667,7 @@ TEST(DBTest, SnapshotFiles) {
     std::vector<ColumnFamilyHandle*> cf_handles;
     DB* snapdb;
     DBOptions opts;
+    opts.env = env_;
     opts.create_if_missing = false;
     Status stat =
         DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
@@ -5309,12 +7719,14 @@ TEST(DBTest, SnapshotFiles) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, CompactOnFlush) {
+TEST_F(DBTest, CompactOnFlush) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    Options options = CurrentOptions();
+    Options options = CurrentOptions(options_override);
     options.purge_redundant_kvs_while_flush = true;
     options.disable_auto_compactions = true;
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
     Put(1, "foo", "v1");
     ASSERT_OK(Flush(1));
@@ -5396,129 +7808,298 @@ TEST(DBTest, CompactOnFlush) {
 }
 
 namespace {
-std::vector<std::uint64_t> ListLogFiles(Env* env, const std::string& path) {
+std::vector<std::uint64_t> ListSpecificFiles(
+    Env* env, const std::string& path, const FileType expected_file_type) {
   std::vector<std::string> files;
-  std::vector<uint64_t> log_files;
+  std::vector<uint64_t> file_numbers;
   env->GetChildren(path, &files);
   uint64_t number;
   FileType type;
   for (size_t i = 0; i < files.size(); ++i) {
     if (ParseFileName(files[i], &number, &type)) {
-      if (type == kLogFile) {
-        log_files.push_back(number);
+      if (type == expected_file_type) {
+        file_numbers.push_back(number);
       }
     }
   }
-  return std::move(log_files);
+  return std::move(file_numbers);
 }
-}  // namespace
-
-TEST(DBTest, WALArchivalTtl) {
-  do {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    options.WAL_ttl_seconds = 1000;
-    DestroyAndReopen(&options);
-
-    //  TEST : Create DB with a ttl and no size limit.
-    //  Put some keys. Count the log files present in the DB just after insert.
-    //  Re-open db. Causes deletion/archival to take place.
-    //  Assert that the files moved under "/archive".
-    //  Reopen db with small ttl.
-    //  Assert that archive was removed.
 
-    std::string archiveDir = ArchivalDirectory(dbname_);
+std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path) {
+  return ListSpecificFiles(env, path, kTableFile);
+}
+}  // namespace
 
-    for (int i = 0; i < 10; ++i) {
-      for (int j = 0; j < 10; ++j) {
-        ASSERT_OK(Put(Key(10 * i + j), DummyString(1024)));
-      }
+TEST_F(DBTest, FlushOneColumnFamily) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
+
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "pikachu", "pikachu"));
+  ASSERT_OK(Put(2, "ilya", "ilya"));
+  ASSERT_OK(Put(3, "muromec", "muromec"));
+  ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+  ASSERT_OK(Put(5, "nikitich", "nikitich"));
+  ASSERT_OK(Put(6, "alyosha", "alyosha"));
+  ASSERT_OK(Put(7, "popovich", "popovich"));
+
+  for (int i = 0; i < 8; ++i) {
+    Flush(i);
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), i + 1U);
+  }
+}
+
+// In https://reviews.facebook.net/D20661 we change
+// recovery behavior: previously for each log file each column family
+// memtable was flushed, even it was empty. Now it's changed:
+// we try to create the smallest number of table files by merging
+// updates from multiple logs
+TEST_F(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 5000000;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Since we will reopen DB with smaller write_buffer_size,
+  // each key will go to new SST file
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Make 'dobrynia' to be flushed and new WAL file to be created
+  ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check sst files amount
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+  }
+  // New WAL file
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+  options.write_buffer_size = 10;
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    // No inserts => default is empty
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(0));
+    // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(5));
+    // 1 SST for big key + 1 SST for small one
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(2));
+    // 1 SST for all keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+// In https://reviews.facebook.net/D20661 we change
+// recovery behavior: previously for each log file each column family
+// memtable was flushed, even it wasn't empty. Now it's changed:
+// we try to create the smallest number of table files by merging
+// updates from multiple logs
+TEST_F(DBTest, RecoverCheckFileAmount) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Make 'nikitich' memtable to be flushed
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // 4 memtable are not flushed, 1 sst file
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+  // Memtable for 'nikitich' has flushed, new WAL file has opened
+  // 4 memtable still not flushed
+
+  // Write to new WAL file
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Fill up 'nikitich' one more time
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  // make it flush
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // There are still 4 memtable not flushed, and 2 sst tables
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
 
-      std::vector<uint64_t> log_files = ListLogFiles(env_, dbname_);
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
 
-      options.create_if_missing = false;
-      Reopen(&options);
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check, that records for 'default', 'dobrynia' and 'pikachu' from
+    // first, second and third WALs  went to the same SST.
+    // So, there is 6 SSTs: three  for 'nikitich', one for 'default', one for
+    // 'dobrynia', one for 'pikachu'
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+TEST_F(DBTest, SharedWriteBuffer) {
+  Options options = CurrentOptions();
+  options.db_write_buffer_size = 100000;  // this is the real limit
+  options.write_buffer_size    = 500000;  // this is never hit
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Trigger a flush on every CF
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(3, Key(1), DummyString(90000)));
+  ASSERT_OK(Put(2, Key(2), DummyString(20000)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+
+  // Flush 'dobrynia' and 'nikitich'
+  ASSERT_OK(Put(2, Key(2), DummyString(50000)));
+  ASSERT_OK(Put(3, Key(2), DummyString(40000)));
+  ASSERT_OK(Put(2, Key(3), DummyString(20000)));
+  ASSERT_OK(Put(3, Key(2), DummyString(40000)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Make 'dobrynia' and 'nikitich' both take up 40% of space
+  // When 'pikachu' puts us over 100%, all 3 flush.
+  ASSERT_OK(Put(2, Key(2), DummyString(40000)));
+  ASSERT_OK(Put(1, Key(2), DummyString(20000)));
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(3));
+  }
+
+  // Some remaining writes so 'default' and 'nikitich' flush on closure.
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(4));
+  }
+}
 
-      std::vector<uint64_t> logs = ListLogFiles(env_, archiveDir);
-      std::set<uint64_t> archivedFiles(logs.begin(), logs.end());
+TEST_F(DBTest, PurgeInfoLogs) {
+  Options options = CurrentOptions();
+  options.keep_log_file_num = 5;
+  options.create_if_missing = true;
+  for (int mode = 0; mode <= 1; mode++) {
+    if (mode == 1) {
+      options.db_log_dir = dbname_ + "_logs";
+      env_->CreateDirIfMissing(options.db_log_dir);
+    } else {
+      options.db_log_dir = "";
+    }
+    for (int i = 0; i < 8; i++) {
+      Reopen(options);
+    }
 
-      for (auto& log : log_files) {
-        ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end());
+    std::vector<std::string> files;
+    env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir,
+                      &files);
+    int info_log_count = 0;
+    for (std::string file : files) {
+      if (file.find("LOG") != std::string::npos) {
+        info_log_count++;
       }
     }
+    ASSERT_EQ(5, info_log_count);
 
-    std::vector<uint64_t> log_files = ListLogFiles(env_, archiveDir);
-    ASSERT_TRUE(log_files.size() > 0);
-
-    options.WAL_ttl_seconds = 1;
-    env_->SleepForMicroseconds(2 * 1000 * 1000);
-    Reopen(&options);
-
-    log_files = ListLogFiles(env_, archiveDir);
-    ASSERT_TRUE(log_files.empty());
-  } while (ChangeCompactOptions());
-}
-
-namespace {
-uint64_t GetLogDirSize(std::string dir_path, SpecialEnv* env) {
-  uint64_t dir_size = 0;
-  std::vector<std::string> files;
-  env->GetChildren(dir_path, &files);
-  for (auto& f : files) {
-    uint64_t number;
-    FileType type;
-    if (ParseFileName(f, &number, &type) && type == kLogFile) {
-      std::string const file_path = dir_path + "/" + f;
-      uint64_t file_size;
-      env->GetFileSize(file_path, &file_size);
-      dir_size += file_size;
+    Destroy(options);
+    // For mode (1), test DestroyDB() to delete all the logs under DB dir.
+    // For mode (2), no info log file should have been put under DB dir.
+    std::vector<std::string> db_files;
+    env_->GetChildren(dbname_, &db_files);
+    for (std::string file : db_files) {
+      ASSERT_TRUE(file.find("LOG") == std::string::npos);
     }
-  }
-  return dir_size;
-}
-}  // namespace
-
-TEST(DBTest, WALArchivalSizeLimit) {
-  do {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    options.WAL_ttl_seconds = 0;
-    options.WAL_size_limit_MB = 1000;
-
-    // TEST : Create DB with huge size limit and no ttl.
-    // Put some keys. Count the archived log files present in the DB
-    // just after insert. Assert that there are many enough.
-    // Change size limit. Re-open db.
-    // Assert that archive is not greater than WAL_size_limit_MB.
-    // Set ttl and time_to_check_ to small values. Re-open db.
-    // Assert that there are no archived logs left.
 
-    DestroyAndReopen(&options);
-    for (int i = 0; i < 128 * 128; ++i) {
-      ASSERT_OK(Put(Key(i), DummyString(1024)));
+    if (mode == 1) {
+      // Cleaning up
+      env_->GetChildren(options.db_log_dir, &files);
+      for (std::string file : files) {
+        env_->DeleteFile(options.db_log_dir + "/" + file);
+      }
+      env_->DeleteDir(options.db_log_dir);
     }
-    Reopen(&options);
-
-    std::string archive_dir = ArchivalDirectory(dbname_);
-    std::vector<std::uint64_t> log_files = ListLogFiles(env_, archive_dir);
-    ASSERT_TRUE(log_files.size() > 2);
-
-    options.WAL_size_limit_MB = 8;
-    Reopen(&options);
-    dbfull()->TEST_PurgeObsoleteteWAL();
-
-    uint64_t archive_size = GetLogDirSize(archive_dir, env_);
-    ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024);
-
-    options.WAL_ttl_seconds = 1;
-    dbfull()->TEST_SetDefaultTimeToCheck(1);
-    env_->SleepForMicroseconds(2 * 1000 * 1000);
-    Reopen(&options);
-    dbfull()->TEST_PurgeObsoleteteWAL();
-
-    log_files = ListLogFiles(env_, archive_dir);
-    ASSERT_TRUE(log_files.empty());
-  } while (ChangeCompactOptions());
+  }
 }
 
 namespace {
@@ -5530,10 +8111,10 @@ SequenceNumber ReadRecords(
   BatchResult res;
   while (iter->Valid()) {
     res = iter->GetBatch();
-    ASSERT_TRUE(res.sequence > lastSequence);
+    EXPECT_TRUE(res.sequence > lastSequence);
     ++count;
     lastSequence = res.sequence;
-    ASSERT_OK(iter->status());
+    EXPECT_OK(iter->status());
     iter->Next();
   }
   return res.sequence;
@@ -5548,11 +8129,11 @@ void ExpectRecords(
 }
 }  // namespace
 
-TEST(DBTest, TransactionLogIterator) {
+TEST_F(DBTest, TransactionLogIterator) {
   do {
     Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     Put(0, "key1", DummyString(1024));
     Put(1, "key2", DummyString(1024));
     Put(1, "key2", DummyString(1024));
@@ -5561,7 +8142,7 @@ TEST(DBTest, TransactionLogIterator) {
       auto iter = OpenTransactionLogIter(0);
       ExpectRecords(3, iter);
     }
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
     env_->SleepForMicroseconds(2 * 1000 * 1000);
     {
       Put(0, "key4", DummyString(1024));
@@ -5576,74 +8157,66 @@ TEST(DBTest, TransactionLogIterator) {
 }
 
 #ifndef NDEBUG // sync point is not included with DNDEBUG build
-TEST(DBTest, TransactionLogIteratorRace) {
-  // Setup sync point dependency to reproduce the race condition of
-  // a log file moved to archived dir, in the middle of GetSortedWalFiles
-  rocksdb::SyncPoint::GetInstance()->LoadDependency(
-    { { "DBImpl::GetSortedWalFiles:1", "DBImpl::PurgeObsoleteFiles:1" },
-      { "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalFiles:2" },
-    });
-
-  do {
-    rocksdb::SyncPoint::GetInstance()->ClearTrace();
-    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
-    Put("key1", DummyString(1024));
-    dbfull()->Flush(FlushOptions());
-    Put("key2", DummyString(1024));
-    dbfull()->Flush(FlushOptions());
-    Put("key3", DummyString(1024));
-    dbfull()->Flush(FlushOptions());
-    Put("key4", DummyString(1024));
-    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
-
-    {
-      auto iter = OpenTransactionLogIter(0);
-      ExpectRecords(4, iter);
-    }
+TEST_F(DBTest, TransactionLogIteratorRace) {
+  static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+  static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+      {"WalManager::GetSortedWalFiles:1",  "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+      {"WalManager::GetSortedWalsOfType:1",
+       "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2",
+       "WalManager::GetSortedWalsOfType:2"}};
+  for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Setup sync point dependency to reproduce the race condition of
+    // a log file moved to archived dir, in the middle of GetSortedWalFiles
+    rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      { { sync_points[test][0], sync_points[test][1] },
+        { sync_points[test][2], sync_points[test][3] },
+      });
+
+    do {
+      rocksdb::SyncPoint::GetInstance()->ClearTrace();
+      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+      Options options = OptionsForLogIterTest();
+      DestroyAndReopen(options);
+      Put("key1", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key2", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key3", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key4", DummyString(1024));
+      ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+
+      {
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(4, iter);
+      }
 
-    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-    // trigger async flush, and log move. Well, log move will
-    // wait until the GetSortedWalFiles:1 to reproduce the race
-    // condition
-    FlushOptions flush_options;
-    flush_options.wait = false;
-    dbfull()->Flush(flush_options);
-
-    // "key5" would be written in a new memtable and log
-    Put("key5", DummyString(1024));
-    {
-      // this iter would miss "key4" if not fixed
-      auto iter = OpenTransactionLogIter(0);
-      ExpectRecords(5, iter);
-    }
-  } while (ChangeCompactOptions());
+      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+      // trigger async flush, and log move. Well, log move will
+      // wait until the GetSortedWalFiles:1 to reproduce the race
+      // condition
+      FlushOptions flush_options;
+      flush_options.wait = false;
+      dbfull()->Flush(flush_options);
+
+      // "key5" would be written in a new memtable and log
+      Put("key5", DummyString(1024));
+      {
+        // this iter would miss "key4" if not fixed
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(5, iter);
+      }
+    } while (ChangeCompactOptions());
+  }
 }
 #endif
 
-TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) {
-  do {
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
-    // Do a plain Reopen.
-    Put(1, "key1", DummyString(1024));
-    // Two reopens should create a zero record WAL file.
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
-
-    Put(1, "key2", DummyString(1024));
-
-    auto iter = OpenTransactionLogIter(0);
-    ExpectRecords(2, iter);
-  } while (ChangeCompactOptions());
-}
-
-TEST(DBTest, TransactionLogIteratorStallAtLastRecord) {
+TEST_F(DBTest, TransactionLogIteratorStallAtLastRecord) {
   do {
     Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
+    DestroyAndReopen(options);
     Put("key1", DummyString(1024));
     auto iter = OpenTransactionLogIter(0);
     ASSERT_OK(iter->status());
@@ -5658,64 +8231,57 @@ TEST(DBTest, TransactionLogIteratorStallAtLastRecord) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, TransactionLogIteratorJustEmptyFile) {
-  do {
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
-    unique_ptr<TransactionLogIterator> iter;
-    Status status = dbfull()->GetUpdatesSince(0, &iter);
-    // Check that an empty iterator is returned
-    ASSERT_TRUE(!iter->Valid());
-  } while (ChangeCompactOptions());
-}
-
-TEST(DBTest, TransactionLogIteratorCheckAfterRestart) {
+TEST_F(DBTest, TransactionLogIteratorCheckAfterRestart) {
   do {
     Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
+    DestroyAndReopen(options);
     Put("key1", DummyString(1024));
     Put("key2", DummyString(1023));
     dbfull()->Flush(FlushOptions());
-    Reopen(&options);
+    Reopen(options);
     auto iter = OpenTransactionLogIter(0);
     ExpectRecords(2, iter);
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, TransactionLogIteratorCorruptedLog) {
+TEST_F(DBTest, TransactionLogIteratorCorruptedLog) {
   do {
     Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
+    DestroyAndReopen(options);
     for (int i = 0; i < 1024; i++) {
-      Put("key"+std::to_string(i), DummyString(10));
+      Put("key"+ToString(i), DummyString(10));
     }
     dbfull()->Flush(FlushOptions());
     // Corrupt this log to create a gap
     rocksdb::VectorLogPtr wal_files;
     ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
-    const auto logfilePath = dbname_ + "/" + wal_files.front()->PathName();
-    ASSERT_EQ(
-      0,
-      truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2));
+    const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+    if (mem_env_) {
+      mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2);
+    } else {
+      ASSERT_EQ(0, truncate(logfile_path.c_str(),
+                   wal_files.front()->SizeFileBytes() / 2));
+    }
+
     // Insert a new entry to a new log file
     Put("key1025", DummyString(10));
     // Try to read from the beginning. Should stop before the gap and read less
     // than 1025 entries
     auto iter = OpenTransactionLogIter(0);
     int count;
-    int last_sequence_read = ReadRecords(iter, count);
-    ASSERT_LT(last_sequence_read, 1025);
+    SequenceNumber last_sequence_read = ReadRecords(iter, count);
+    ASSERT_LT(last_sequence_read, 1025U);
     // Try to read past the gap, should be able to seek to key1025
     auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
     ExpectRecords(1, iter2);
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, TransactionLogIteratorBatchOperations) {
+TEST_F(DBTest, TransactionLogIteratorBatchOperations) {
   do {
     Options options = OptionsForLogIterTest();
-    DestroyAndReopen(&options);
-    CreateAndReopenWithCF({"pikachu"}, &options);
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     WriteBatch batch;
     batch.Put(handles_[1], "key1", DummyString(1024));
     batch.Put(handles_[0], "key2", DummyString(1024));
@@ -5724,17 +8290,17 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) {
     dbfull()->Write(WriteOptions(), &batch);
     Flush(1);
     Flush(0);
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
     Put(1, "key4", DummyString(1024));
     auto iter = OpenTransactionLogIter(3);
     ExpectRecords(2, iter);
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, TransactionLogIteratorBlobs) {
+TEST_F(DBTest, TransactionLogIteratorBlobs) {
   Options options = OptionsForLogIterTest();
-  DestroyAndReopen(&options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
   {
     WriteBatch batch;
     batch.Put(handles_[1], "key1", DummyString(1024));
@@ -5744,27 +8310,29 @@ TEST(DBTest, TransactionLogIteratorBlobs) {
     batch.PutLogData(Slice("blob2"));
     batch.Delete(handles_[0], "key2");
     dbfull()->Write(WriteOptions(), &batch);
-    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
   }
 
   auto res = OpenTransactionLogIter(0)->GetBatch();
   struct Handler : public WriteBatch::Handler {
     std::string seen;
-    virtual Status PutCF(uint32_t cf, const Slice& key, const Slice& value) {
-      seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " +
-              std::to_string(value.size()) + ")";
+    virtual Status PutCF(uint32_t cf, const Slice& key,
+                         const Slice& value) override {
+      seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " +
+              ToString(value.size()) + ")";
       return Status::OK();
     }
-    virtual Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) {
-      seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " +
-              std::to_string(value.size()) + ")";
+    virtual Status MergeCF(uint32_t cf, const Slice& key,
+                           const Slice& value) override {
+      seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " +
+              ToString(value.size()) + ")";
       return Status::OK();
     }
-    virtual void LogData(const Slice& blob) {
+    virtual void LogData(const Slice& blob) override {
       seen += "LogData(" + blob.ToString() + ")";
     }
-    virtual Status DeleteCF(uint32_t cf, const Slice& key) {
-      seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")";
+    virtual Status DeleteCF(uint32_t cf, const Slice& key) override {
+      seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")";
       return Status::OK();
     }
   } handler;
@@ -5779,107 +8347,6 @@ TEST(DBTest, TransactionLogIteratorBlobs) {
       handler.seen);
 }
 
-TEST(DBTest, ReadFirstRecordCache) {
-  Options options = CurrentOptions();
-  options.env = env_;
-  options.create_if_missing = true;
-  DestroyAndReopen(&options);
-
-  std::string path = dbname_ + "/000001.log";
-  unique_ptr<WritableFile> file;
-  ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions()));
-
-  SequenceNumber s;
-  ASSERT_OK(dbfull()->TEST_ReadFirstLine(path, &s));
-  ASSERT_EQ(s, 0U);
-
-  ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
-  ASSERT_EQ(s, 0U);
-
-  log::Writer writer(std::move(file));
-  WriteBatch batch;
-  batch.Put("foo", "bar");
-  WriteBatchInternal::SetSequence(&batch, 10);
-  writer.AddRecord(WriteBatchInternal::Contents(&batch));
-
-  env_->count_sequential_reads_ = true;
-  // sequential_read_counter_ sanity test
-  ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
-
-  ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
-  ASSERT_EQ(s, 10U);
-  // did a read
-  ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
-
-  ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
-  ASSERT_EQ(s, 10U);
-  // no new reads since the value is cached
-  ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
-}
-
-TEST(DBTest, ReadCompaction) {
-  std::string value(4096, '4'); // a string of size 4K
-  {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    options.max_open_files = 20; // only 10 file in file-cache
-    options.target_file_size_base = 512;
-    options.write_buffer_size = 64 * 1024;
-    options.filter_policy = nullptr;
-    options.block_size = 4096;
-    options.no_block_cache = true;
-    options.disable_seek_compaction = false;
-
-    CreateAndReopenWithCF({"pikachu"}, &options);
-
-    // Write 8MB (2000 values, each 4K)
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-    std::vector<std::string> values;
-    for (int i = 0; i < 2000; i++) {
-      ASSERT_OK(Put(1, Key(i), value));
-    }
-
-    // clear level 0 and 1 if necessary.
-    Flush(1);
-    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
-
-    // write some new keys into level 0
-    for (int i = 0; i < 2000; i = i + 16) {
-      ASSERT_OK(Put(1, Key(i), value));
-    }
-    Flush(1);
-
-    // Wait for any write compaction to finish
-    dbfull()->TEST_WaitForCompact();
-
-    // remember number of files in each level
-    int l1 = NumTableFilesAtLevel(0, 1);
-    int l2 = NumTableFilesAtLevel(1, 1);
-    int l3 = NumTableFilesAtLevel(2, 1);
-    ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
-    ASSERT_NE(NumTableFilesAtLevel(1, 1), 0);
-    ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
-
-    // read a bunch of times, trigger read compaction
-    for (int j = 0; j < 100; j++) {
-      for (int i = 0; i < 2000; i++) {
-        Get(1, Key(i));
-      }
-    }
-    // wait for read compaction to finish
-    env_->SleepForMicroseconds(1000000);
-
-    // verify that the number of files have decreased
-    // in some level, indicating that there was a compaction
-    ASSERT_TRUE(NumTableFilesAtLevel(0, 1) < l1 ||
-                NumTableFilesAtLevel(1, 1) < l2 ||
-                NumTableFilesAtLevel(2, 1) < l3);
-  }
-}
-
 // Multi-threaded test:
 namespace {
 
@@ -5890,9 +8357,9 @@ static const int kNumKeys = 1000;
 
 struct MTState {
   DBTest* test;
-  port::AtomicPointer stop;
-  port::AtomicPointer counter[kNumThreads];
-  port::AtomicPointer thread_done[kNumThreads];
+  std::atomic<bool> stop;
+  std::atomic<int> counter[kNumThreads];
+  std::atomic<bool> thread_done[kNumThreads];
 };
 
 struct MTThread {
@@ -5904,12 +8371,12 @@ static void MTThreadBody(void* arg) {
   MTThread* t = reinterpret_cast<MTThread*>(arg);
   int id = t->id;
   DB* db = t->state->test->db_;
-  uintptr_t counter = 0;
+  int counter = 0;
   fprintf(stderr, "... starting thread %d\n", id);
   Random rnd(1000 + id);
   char valbuf[1500];
-  while (t->state->stop.Acquire_Load() == nullptr) {
-    t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter));
+  while (t->state->stop.load(std::memory_order_acquire) == false) {
+    t->state->counter[id].store(counter, std::memory_order_release);
 
     int key = rnd.Uniform(kNumKeys);
     char keybuf[20];
@@ -5920,13 +8387,26 @@ static void MTThreadBody(void* arg) {
       // into each of the CFs
       // We add some padding for force compactions.
       int unique_id = rnd.Uniform(1000000);
-      WriteBatch batch;
-      for (int cf = 0; cf < kColumnFamilies; ++cf) {
-        snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
-                 static_cast<int>(counter), cf, unique_id);
-        batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+
+      // Half of the time directly use WriteBatch. Half of the time use
+      // WriteBatchWithIndex.
+      if (rnd.OneIn(2)) {
+        WriteBatch batch;
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), &batch));
+      } else {
+        WriteBatchWithIndex batch(db->GetOptions().comparator);
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
       }
-      ASSERT_OK(db->Write(WriteOptions(), &batch));
     } else {
       // Read a value and verify that it matches the pattern written above
       // and that writes to all column families were atomic (unique_id is the
@@ -5956,8 +8436,7 @@ static void MTThreadBody(void* arg) {
           ASSERT_EQ(k, key);
           ASSERT_GE(w, 0);
           ASSERT_LT(w, kNumThreads);
-          ASSERT_LE((unsigned int)c, reinterpret_cast<uintptr_t>(
-                                         t->state->counter[w].Acquire_Load()));
+          ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
           ASSERT_EQ(cf, i);
           if (i == 0) {
             unique_id = u;
@@ -5971,50 +8450,70 @@ static void MTThreadBody(void* arg) {
     }
     counter++;
   }
-  t->state->thread_done[id].Release_Store(t);
+  t->state->thread_done[id].store(true, std::memory_order_release);
   fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
 }
 
 }  // namespace
 
-TEST(DBTest, MultiThreaded) {
-  do {
-    std::vector<std::string> cfs;
-    for (int i = 1; i < kColumnFamilies; ++i) {
-      cfs.push_back(std::to_string(i));
-    }
-    CreateAndReopenWithCF(cfs);
-    // Initialize state
-    MTState mt;
-    mt.test = this;
-    mt.stop.Release_Store(0);
-    for (int id = 0; id < kNumThreads; id++) {
-      mt.counter[id].Release_Store(0);
-      mt.thread_done[id].Release_Store(0);
+class MultiThreadedDBTest : public DBTest,
+                            public ::testing::WithParamInterface<int> {
+ public:
+  virtual void SetUp() override { option_config_ = GetParam(); }
+
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> optionConfigs;
+    for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+      // skip as HashCuckooRep does not support snapshot
+      if (optionConfig != kHashCuckoo) {
+        optionConfigs.push_back(optionConfig);
+      }
     }
+    return optionConfigs;
+  }
+};
 
-    // Start threads
-    MTThread thread[kNumThreads];
-    for (int id = 0; id < kNumThreads; id++) {
-      thread[id].state = &mt;
-      thread[id].id = id;
-      env_->StartThread(MTThreadBody, &thread[id]);
-    }
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  std::vector<std::string> cfs;
+  for (int i = 1; i < kColumnFamilies; ++i) {
+    cfs.push_back(ToString(i));
+  }
+  CreateAndReopenWithCF(cfs, CurrentOptions(options_override));
+  // Initialize state
+  MTState mt;
+  mt.test = this;
+  mt.stop.store(false, std::memory_order_release);
+  for (int id = 0; id < kNumThreads; id++) {
+    mt.counter[id].store(0, std::memory_order_release);
+    mt.thread_done[id].store(false, std::memory_order_release);
+  }
 
-    // Let them run for a while
-    env_->SleepForMicroseconds(kTestSeconds * 1000000);
+  // Start threads
+  MTThread thread[kNumThreads];
+  for (int id = 0; id < kNumThreads; id++) {
+    thread[id].state = &mt;
+    thread[id].id = id;
+    env_->StartThread(MTThreadBody, &thread[id]);
+  }
 
-    // Stop the threads and wait for them to finish
-    mt.stop.Release_Store(&mt);
-    for (int id = 0; id < kNumThreads; id++) {
-      while (mt.thread_done[id].Acquire_Load() == nullptr) {
-        env_->SleepForMicroseconds(100000);
-      }
+  // Let them run for a while
+  env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+  // Stop the threads and wait for them to finish
+  mt.stop.store(true, std::memory_order_release);
+  for (int id = 0; id < kNumThreads; id++) {
+    while (mt.thread_done[id].load(std::memory_order_acquire) == false) {
+      env_->SleepForMicroseconds(100000);
     }
-    // skip as HashCuckooRep does not support snapshot
-  } while (ChangeOptions(kSkipHashCuckoo));
+  }
 }
 
+INSTANTIATE_TEST_CASE_P(
+    MultiThreaded, MultiThreadedDBTest,
+    ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()));
+
 // Group commit test:
 namespace {
 
@@ -6034,7 +8533,7 @@ static void GCThreadBody(void* arg) {
   WriteOptions wo;
 
   for (int i = 0; i < kGCNumKeys; ++i) {
-    std::string kv(std::to_string(i + id * kGCNumKeys));
+    std::string kv(ToString(i + id * kGCNumKeys));
     ASSERT_OK(db->Put(wo, kv, kv));
   }
   t->done = true;
@@ -6042,11 +8541,13 @@ static void GCThreadBody(void* arg) {
 
 }  // namespace
 
-TEST(DBTest, GroupCommitTest) {
+TEST_F(DBTest, GroupCommitTest) {
   do {
     Options options = CurrentOptions();
+    options.env = env_;
+    env_->log_write_slowdown_.store(100);
     options.statistics = rocksdb::CreateDBStatistics();
-    Reopen(&options);
+    Reopen(options);
 
     // Start threads
     GCThread thread[kGCNumThreads];
@@ -6062,11 +8563,13 @@ TEST(DBTest, GroupCommitTest) {
         env_->SleepForMicroseconds(100000);
       }
     }
+    env_->log_write_slowdown_.store(0);
+
     ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
 
     std::vector<std::string> expected_db;
     for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
-      expected_db.push_back(std::to_string(i));
+      expected_db.push_back(ToString(i));
     }
     sort(expected_db.begin(), expected_db.end());
 
@@ -6093,33 +8596,39 @@ class ModelDB: public DB {
   class ModelSnapshot : public Snapshot {
    public:
     KVMap map_;
+
+    virtual SequenceNumber GetSequenceNumber() const override {
+      // no need to call this
+      assert(false);
+      return 0;
+    }
   };
 
   explicit ModelDB(const Options& options) : options_(options) {}
   using DB::Put;
   virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf,
-                     const Slice& k, const Slice& v) {
+                     const Slice& k, const Slice& v) override {
     WriteBatch batch;
     batch.Put(cf, k, v);
     return Write(o, &batch);
   }
   using DB::Merge;
   virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf,
-                       const Slice& k, const Slice& v) {
+                       const Slice& k, const Slice& v) override {
     WriteBatch batch;
     batch.Merge(cf, k, v);
     return Write(o, &batch);
   }
   using DB::Delete;
   virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
-                        const Slice& key) {
+                        const Slice& key) override {
     WriteBatch batch;
     batch.Delete(cf, key);
     return Write(o, &batch);
   }
   using DB::Get;
   virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf,
-                     const Slice& key, std::string* value) {
+                     const Slice& key, std::string* value) override {
     return Status::NotSupported(key);
   }
 
@@ -6127,22 +8636,25 @@ class ModelDB: public DB {
   virtual std::vector<Status> MultiGet(
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_family,
-      const std::vector<Slice>& keys, std::vector<std::string>* values) {
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
     std::vector<Status> s(keys.size(),
                           Status::NotSupported("Not implemented."));
     return s;
   }
 
   using DB::GetPropertiesOfAllTables;
-  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
-                                          TablePropertiesCollection* props) {
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override {
     return Status();
   }
 
   using DB::KeyMayExist;
   virtual bool KeyMayExist(const ReadOptions& options,
                            ColumnFamilyHandle* column_family, const Slice& key,
-                           std::string* value, bool* value_found = nullptr) {
+                           std::string* value,
+                           bool* value_found = nullptr) override {
     if (value_found != nullptr) {
       *value_found = false;
     }
@@ -6150,7 +8662,7 @@ class ModelDB: public DB {
   }
   using DB::NewIterator;
   virtual Iterator* NewIterator(const ReadOptions& options,
-                                ColumnFamilyHandle* column_family) {
+                                ColumnFamilyHandle* column_family) override {
     if (options.snapshot == nullptr) {
       KVMap* saved = new KVMap;
       *saved = map_;
@@ -6164,31 +8676,32 @@ class ModelDB: public DB {
   virtual Status NewIterators(
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_family,
-      std::vector<Iterator*>* iterators) {
+      std::vector<Iterator*>* iterators) override {
     return Status::NotSupported("Not supported yet");
   }
-  virtual const Snapshot* GetSnapshot() {
+  virtual const Snapshot* GetSnapshot() override {
     ModelSnapshot* snapshot = new ModelSnapshot;
     snapshot->map_ = map_;
     return snapshot;
   }
 
-  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
     delete reinterpret_cast<const ModelSnapshot*>(snapshot);
   }
 
-  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* batch) override {
     class Handler : public WriteBatch::Handler {
      public:
       KVMap* map_;
-      virtual void Put(const Slice& key, const Slice& value) {
+      virtual void Put(const Slice& key, const Slice& value) override {
         (*map_)[key.ToString()] = value.ToString();
       }
-      virtual void Merge(const Slice& key, const Slice& value) {
+      virtual void Merge(const Slice& key, const Slice& value) override {
         // ignore merge for now
         //(*map_)[key.ToString()] = value.ToString();
       }
-      virtual void Delete(const Slice& key) {
+      virtual void Delete(const Slice& key) override {
         map_->erase(key.ToString());
       }
     };
@@ -6199,12 +8712,18 @@ class ModelDB: public DB {
 
   using DB::GetProperty;
   virtual bool GetProperty(ColumnFamilyHandle* column_family,
-                           const Slice& property, std::string* value) {
+                           const Slice& property, std::string* value) override {
+    return false;
+  }
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) override {
     return false;
   }
   using DB::GetApproximateSizes;
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n, uint64_t* sizes) {
+                                   const Range* range, int n,
+                                   uint64_t* sizes) override {
     for (int i = 0; i < n; i++) {
       sizes[i] = 0;
     }
@@ -6212,77 +8731,91 @@ class ModelDB: public DB {
   using DB::CompactRange;
   virtual Status CompactRange(ColumnFamilyHandle* column_family,
                               const Slice* start, const Slice* end,
-                              bool reduce_level, int target_level) {
+                              bool reduce_level, int target_level,
+                              uint32_t output_path_id) override {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) override {
     return Status::NotSupported("Not supported operation.");
   }
 
   using DB::NumberLevels;
-  virtual int NumberLevels(ColumnFamilyHandle* column_family) { return 1; }
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+    return 1;
+  }
 
   using DB::MaxMemCompactionLevel;
-  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
+  virtual int MaxMemCompactionLevel(
+      ColumnFamilyHandle* column_family) override {
     return 1;
   }
 
   using DB::Level0StopWriteTrigger;
-  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+  virtual int Level0StopWriteTrigger(
+      ColumnFamilyHandle* column_family) override {
     return -1;
   }
 
-  virtual const std::string& GetName() const {
-    return name_;
-  }
+  virtual const std::string& GetName() const override { return name_; }
 
-  virtual Env* GetEnv() const {
-    return nullptr;
-  }
+  virtual Env* GetEnv() const override { return nullptr; }
 
   using DB::GetOptions;
-  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const {
+  virtual const Options& GetOptions(
+      ColumnFamilyHandle* column_family) const override {
     return options_;
   }
 
+  using DB::GetDBOptions;
+  virtual const DBOptions& GetDBOptions() const override { return options_; }
+
   using DB::Flush;
   virtual Status Flush(const rocksdb::FlushOptions& options,
-                       ColumnFamilyHandle* column_family) {
+                       ColumnFamilyHandle* column_family) override {
     Status ret;
     return ret;
   }
 
-  virtual Status DisableFileDeletions() {
-    return Status::OK();
-  }
-  virtual Status EnableFileDeletions(bool force) {
+  virtual Status DisableFileDeletions() override { return Status::OK(); }
+  virtual Status EnableFileDeletions(bool force) override {
     return Status::OK();
   }
   virtual Status GetLiveFiles(std::vector<std::string>&, uint64_t* size,
-                              bool flush_memtable = true) {
+                              bool flush_memtable = true) override {
     return Status::OK();
   }
 
-  virtual Status GetSortedWalFiles(VectorLogPtr& files) {
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
     return Status::OK();
   }
 
-  virtual Status DeleteFile(std::string name) {
-    return Status::OK();
-  }
+  virtual Status DeleteFile(std::string name) override { return Status::OK(); }
 
-  virtual Status GetDbIdentity(std::string& identity) {
+  virtual Status GetDbIdentity(std::string& identity) override {
     return Status::OK();
   }
 
-  virtual SequenceNumber GetLatestSequenceNumber() const {
-    return 0;
-  }
+  virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; }
   virtual Status GetUpdatesSince(
       rocksdb::SequenceNumber, unique_ptr<rocksdb::TransactionLogIterator>*,
       const TransactionLogIterator::ReadOptions&
-          read_options = TransactionLogIterator::ReadOptions()) {
+          read_options = TransactionLogIterator::ReadOptions()) override {
     return Status::NotSupported("Not supported in Model DB");
   }
 
-  virtual ColumnFamilyHandle* DefaultColumnFamily() const { return nullptr; }
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return nullptr;
+  }
+
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) override {}
 
  private:
   class ModelIter: public Iterator {
@@ -6293,23 +8826,31 @@ class ModelDB: public DB {
     ~ModelIter() {
       if (owned_) delete map_;
     }
-    virtual bool Valid() const { return iter_ != map_->end(); }
-    virtual void SeekToFirst() { iter_ = map_->begin(); }
-    virtual void SeekToLast() {
+    virtual bool Valid() const override { return iter_ != map_->end(); }
+    virtual void SeekToFirst() override { iter_ = map_->begin(); }
+    virtual void SeekToLast() override {
       if (map_->empty()) {
         iter_ = map_->end();
       } else {
         iter_ = map_->find(map_->rbegin()->first);
       }
     }
-    virtual void Seek(const Slice& k) {
+    virtual void Seek(const Slice& k) override {
       iter_ = map_->lower_bound(k.ToString());
     }
-    virtual void Next() { ++iter_; }
-    virtual void Prev() { --iter_; }
-    virtual Slice key() const { return iter_->first; }
-    virtual Slice value() const { return iter_->second; }
-    virtual Status status() const { return Status::OK(); }
+    virtual void Next() override { ++iter_; }
+    virtual void Prev() override {
+      if (iter_ == map_->begin()) {
+        iter_ = map_->end();
+        return;
+      }
+      --iter_;
+    }
+
+    virtual Slice key() const override { return iter_->first; }
+    virtual Slice value() const override { return iter_->second; }
+    virtual Status status() const override { return Status::OK(); }
+
    private:
     const KVMap* const map_;
     const bool owned_;  // Do we own map_
@@ -6377,10 +8918,12 @@ static bool CompareIterators(int step,
   return ok;
 }
 
-TEST(DBTest, Randomized) {
+TEST_F(DBTest, Randomized) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   Random rnd(test::RandomSeed());
   do {
-    ModelDB model(CurrentOptions());
+    ModelDB model(CurrentOptions(options_override));
     const int N = 10000;
     const Snapshot* model_snap = nullptr;
     const Snapshot* db_snap = nullptr;
@@ -6434,8 +8977,14 @@ TEST(DBTest, Randomized) {
       }
 
       if ((step % 100) == 0) {
-        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
-        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        // For DB instances that use the hash index + block-based table, the
+        // iterator will be invalid right when seeking a non-existent key, right
+        // than return a key that is close to it.
+        if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+            option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+          ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+          ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        }
 
         // Save a snapshot from each DB this time that we'll use next
         // time we compare things, to make sure the current state is
@@ -6443,23 +8992,31 @@ TEST(DBTest, Randomized) {
         if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
         if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
 
-        Reopen();
+
+        auto options = CurrentOptions(options_override);
+        Reopen(options);
         ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
 
         model_snap = model.GetSnapshot();
         db_snap = db_->GetSnapshot();
       }
+
+      if ((step % 2000) == 0) {
+        fprintf(stderr,
+                "DBTest.Randomized, option ID: %d, step: %d out of %d\n",
+                option_config_, step, N);
+      }
     }
     if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
     if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
     // skip cuckoo hash as it does not support snapshot.
-  } while (ChangeOptions(kSkipDeletesFilterFirst |
-                         kSkipNoSeekToLast | kSkipHashCuckoo));
+  } while (ChangeOptions(kSkipDeletesFilterFirst | kSkipNoSeekToLast |
+                         kSkipHashCuckoo));
 }
 
-TEST(DBTest, MultiGetSimple) {
+TEST_F(DBTest, MultiGetSimple) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     ASSERT_OK(Put(1, "k1", "v1"));
     ASSERT_OK(Put(1, "k2", "v2"));
     ASSERT_OK(Put(1, "k3", "v3"));
@@ -6489,9 +9046,9 @@ TEST(DBTest, MultiGetSimple) {
   } while (ChangeCompactOptions());
 }
 
-TEST(DBTest, MultiGetEmpty) {
+TEST_F(DBTest, MultiGetEmpty) {
   do {
-    CreateAndReopenWithCF({"pikachu"});
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
     // Empty Key Set
     std::vector<Slice> keys;
     std::vector<std::string> values;
@@ -6500,8 +9057,10 @@ TEST(DBTest, MultiGetEmpty) {
     ASSERT_EQ(s.size(), 0U);
 
     // Empty Database, Empty Key Set
-    DestroyAndReopen();
-    CreateAndReopenWithCF({"pikachu"});
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
     s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
     ASSERT_EQ(s.size(), 0U);
 
@@ -6556,7 +9115,6 @@ void PrefixScanInit(DBTest *dbtest) {
 
   // GROUP 2
   for (int i = 1; i <= big_range_sstfiles; i++) {
-    std::string keystr;
     snprintf(buf, sizeof(buf), "%02d______:start", 0);
     keystr = std::string(buf);
     ASSERT_OK(dbtest->Put(keystr, keystr));
@@ -6569,50 +9127,56 @@ void PrefixScanInit(DBTest *dbtest) {
 }
 }  // namespace
 
-TEST(DBTest, PrefixScan) {
-  int count;
-  Slice prefix;
-  Slice key;
-  char buf[100];
-  Iterator* iter;
-  snprintf(buf, sizeof(buf), "03______:");
-  prefix = Slice(buf, 8);
-  key = Slice(buf, 9);
-  // db configs
-  env_->count_random_reads_ = true;
-  Options options = CurrentOptions();
-  options.env = env_;
-  options.no_block_cache = true;
-  options.filter_policy = NewBloomFilterPolicy(10);
-  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
-  options.whole_key_filtering = false;
-  options.disable_auto_compactions = true;
-  options.max_background_compactions = 2;
-  options.create_if_missing = true;
-  options.disable_seek_compaction = true;
-  options.memtable_factory.reset(NewHashSkipListRepFactory());
+TEST_F(DBTest, PrefixScan) {
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
+             kSkipNoPrefix);
+  while (ChangeFilterOptions()) {
+    int count;
+    Slice prefix;
+    Slice key;
+    char buf[100];
+    Iterator* iter;
+    snprintf(buf, sizeof(buf), "03______:");
+    prefix = Slice(buf, 8);
+    key = Slice(buf, 9);
+    // db configs
+    env_->count_random_reads_ = true;
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.disable_auto_compactions = true;
+    options.max_background_compactions = 2;
+    options.create_if_missing = true;
+    options.memtable_factory.reset(NewHashSkipListRepFactory(16));
 
-  // 11 RAND I/Os
-  DestroyAndReopen(&options);
-  PrefixScanInit(this);
-  count = 0;
-  env_->random_read_counter_.Reset();
-  iter = db_->NewIterator(ReadOptions());
-  for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
-    if (! iter->key().starts_with(prefix)) {
-      break;
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+    table_options.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    // 11 RAND I/Os
+    DestroyAndReopen(options);
+    PrefixScanInit(this);
+    count = 0;
+    env_->random_read_counter_.Reset();
+    iter = db_->NewIterator(ReadOptions());
+    for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+      if (! iter->key().starts_with(prefix)) {
+        break;
+      }
+      count++;
     }
-    count++;
-  }
-  ASSERT_OK(iter->status());
-  delete iter;
-  ASSERT_EQ(count, 2);
-  ASSERT_EQ(env_->random_read_counter_.Read(), 2);
-  Close();
-  delete options.filter_policy;
+    ASSERT_OK(iter->status());
+    delete iter;
+    ASSERT_EQ(count, 2);
+    ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+    Close();
+  }  // end of while
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
 }
 
-TEST(DBTest, TailingIteratorSingle) {
+TEST_F(DBTest, TailingIteratorSingle) {
   ReadOptions read_options;
   read_options.tailing = true;
 
@@ -6630,8 +9194,8 @@ TEST(DBTest, TailingIteratorSingle) {
   ASSERT_TRUE(!iter->Valid());
 }
 
-TEST(DBTest, TailingIteratorKeepAdding) {
-  CreateAndReopenWithCF({"pikachu"});
+TEST_F(DBTest, TailingIteratorKeepAdding) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
   ReadOptions read_options;
   read_options.tailing = true;
 
@@ -6652,8 +9216,55 @@ TEST(DBTest, TailingIteratorKeepAdding) {
   }
 }
 
-TEST(DBTest, TailingIteratorDeletes) {
-  CreateAndReopenWithCF({"pikachu"});
+TEST_F(DBTest, TailingIteratorSeekToNext) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 1000;
+  for (int i = 1; i < num_records; ++i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+  for (int i = 2 * num_records; i > 0; --i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTest, TailingIteratorDeletes) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
   ReadOptions read_options;
   read_options.tailing = true;
 
@@ -6691,7 +9302,9 @@ TEST(DBTest, TailingIteratorDeletes) {
   ASSERT_EQ(count, num_records);
 }
 
-TEST(DBTest, TailingIteratorPrefixSeek) {
+TEST_F(DBTest, TailingIteratorPrefixSeek) {
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
+             kSkipNoPrefix);
   ReadOptions read_options;
   read_options.tailing = true;
 
@@ -6700,9 +9313,9 @@ TEST(DBTest, TailingIteratorPrefixSeek) {
   options.create_if_missing = true;
   options.disable_auto_compactions = true;
   options.prefix_extractor.reset(NewFixedPrefixTransform(2));
-  options.memtable_factory.reset(NewHashSkipListRepFactory());
-  DestroyAndReopen(&options);
-  CreateAndReopenWithCF({"pikachu"}, &options);
+  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
   std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
   ASSERT_OK(Put(1, "0101", "test"));
@@ -6721,29 +9334,338 @@ TEST(DBTest, TailingIteratorPrefixSeek) {
 
   iter->Next();
   ASSERT_TRUE(!iter->Valid());
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
 }
 
-TEST(DBTest, ChecksumTest) {
-  BlockBasedTableOptions table_options;
-  Options options = CurrentOptions();
+TEST_F(DBTest, TailingIteratorIncomplete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.read_tier = kBlockCacheTier;
 
-  table_options.checksum = kCRC32c;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  Reopen(&options);
+  std::string key("key");
+  std::string value("value");
+
+  ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  // we either see the entry or it's not in cache
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+  iter->SeekToFirst();
+  // should still be true after compaction
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_F(DBTest, TailingIteratorSeekToSame) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 1000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  const int NROWS = 10000;
+  // Write rows with keys 00000, 00002, 00004 etc.
+  for (int i = 0; i < NROWS; ++i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%05d", 2*i);
+    std::string key(buf);
+    std::string value("value");
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  // Seek to 00001.  We expect to find 00002.
+  std::string start_key = "00001";
+  iter->Seek(start_key);
+  ASSERT_TRUE(iter->Valid());
+
+  std::string found = iter->key().ToString();
+  ASSERT_EQ("00002", found);
+
+  // Now seek to the same key.  The iterator should remain in the same
+  // position.
+  iter->Seek(found);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(found, iter->key().ToString());
+}
+
+TEST_F(DBTest, ManagedTailingIteratorSingle) {
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  ASSERT_TRUE(!iter->Valid());
+
+  // add a record and check that iter can see it
+  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "mirko");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(DBTest, ManagedTailingIteratorKeepAdding) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 10000;
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%016d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTest, ManagedTailingIteratorSeekToNext) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 1000;
+  for (int i = 1; i < num_records; ++i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+  for (int i = 2 * num_records; i > 0; --i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTest, ManagedTailingIteratorDeletes) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+
+  // write a single record, read it using the iterator, then delete it
+  ASSERT_OK(Put(1, "0test", "test"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0test");
+  ASSERT_OK(Delete(1, "0test"));
+
+  // write many more records
+  const int num_records = 10000;
+  std::string value(1024, 'A');
+
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "1%015d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // force a flush to make sure that no records are read from memtable
+  ASSERT_OK(Flush(1));
+
+  // skip "0test"
+  iter->Next();
+
+  // make sure we can read all new records using the existing iterator
+  int count = 0;
+  for (; iter->Valid(); iter->Next(), ++count) {
+  }
+
+  ASSERT_EQ(count, num_records);
+}
+
+TEST_F(DBTest, ManagedTailingIteratorPrefixSeek) {
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
+             kSkipNoPrefix);
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(Put(1, "0101", "test"));
+
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(Put(1, "0202", "test"));
+
+  // Seek(0102) shouldn't find any records since 0202 has a different prefix
+  iter->Seek("0102");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("0202");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0202");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
+}
+
+TEST_F(DBTest, ManagedTailingIteratorIncomplete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+  read_options.read_tier = kBlockCacheTier;
+
+  std::string key = "key";
+  std::string value = "value";
+
+  ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  // we either see the entry or it's not in cache
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+  iter->SeekToFirst();
+  // should still be true after compaction
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_F(DBTest, ManagedTailingIteratorSeekToSame) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 1000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  const int NROWS = 10000;
+  // Write rows with keys 00000, 00002, 00004 etc.
+  for (int i = 0; i < NROWS; ++i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%05d", 2 * i);
+    std::string key(buf);
+    std::string value("value");
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  // Seek to 00001.  We expect to find 00002.
+  std::string start_key = "00001";
+  iter->Seek(start_key);
+  ASSERT_TRUE(iter->Valid());
+
+  std::string found = iter->key().ToString();
+  ASSERT_EQ("00002", found);
+
+  // Now seek to the same key.  The iterator should remain in the same
+  // position.
+  iter->Seek(found);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(found, iter->key().ToString());
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+
+  Reopen(options);
+  ASSERT_OK(Put("k1", "v1"));
+  Flush();
+  ASSERT_OK(Put("k2", "v2"));
+
+  // Reopen it without prefix extractor, make sure everything still works.
+  // RocksDB should just fall back to the binary index.
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset();
+
+  Reopen(options);
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+}
+
+TEST_F(DBTest, ChecksumTest) {
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
   ASSERT_OK(Put("a", "b"));
   ASSERT_OK(Put("c", "d"));
   ASSERT_OK(Flush());  // table with crc checksum
 
   table_options.checksum = kxxHash;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  Reopen(&options);
+  Reopen(options);
   ASSERT_OK(Put("e", "f"));
   ASSERT_OK(Put("g", "h"));
   ASSERT_OK(Flush());  // table with xxhash checksum
 
   table_options.checksum = kCRC32c;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  Reopen(&options);
+  Reopen(options);
   ASSERT_EQ("b", Get("a"));
   ASSERT_EQ("d", Get("c"));
   ASSERT_EQ("f", Get("e"));
@@ -6751,14 +9673,3331 @@ TEST(DBTest, ChecksumTest) {
 
   table_options.checksum = kCRC32c;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  Reopen(&options);
+  Reopen(options);
   ASSERT_EQ("b", Get("a"));
   ASSERT_EQ("d", Get("c"));
   ASSERT_EQ("f", Get("e"));
   ASSERT_EQ("h", Get("g"));
 }
+
+TEST_F(DBTest, FIFOCompactionTest) {
+  for (int iter = 0; iter < 2; ++iter) {
+    // first iteration -- auto compaction
+    // second iteration -- manual compaction
+    Options options;
+    options.compaction_style = kCompactionStyleFIFO;
+    options.write_buffer_size = 100 << 10;                             // 100KB
+    options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    if (iter == 1) {
+      options.disable_auto_compactions = true;
+    }
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 6; ++i) {
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 1024)));
+      }
+      // flush should happen here
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    if (iter == 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    } else {
+      ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+    }
+    // only 5 files should survive
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+    for (int i = 0; i < 50; ++i) {
+      // these keys should be deleted in previous compaction
+      ASSERT_EQ("NOT_FOUND", Get(ToString(i)));
+    }
+  }
+}
+
+TEST_F(DBTest, SimpleWriteTimeoutTest) {
+  // Block compaction thread, which will also block the flushes because
+  // max_background_flushes == 0, so flushes are getting executed by the
+  // compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.write_buffer_size = 100000;
+  options.max_background_flushes = 0;
+  options.max_write_buffer_number = 2;
+  options.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  WriteOptions write_opt;
+  write_opt.timeout_hint_us = 0;
+  DestroyAndReopen(options);
+  // fill the two write buffers
+  ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt));
+  ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt));
+  // As the only two write buffers are full in this moment, the third
+  // Put is expected to be timed-out.
+  write_opt.timeout_hint_us = 50;
+  ASSERT_TRUE(
+      Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut());
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+// Multi-threaded Timeout Test
+namespace {
+
+static const int kValueSize = 1000;
+static const int kWriteBufferSize = 100000;
+
+struct TimeoutWriterState {
+  int id;
+  DB* db;
+  std::atomic<bool> done;
+  std::map<int, std::string> success_kvs;
+};
+
+static void RandomTimeoutWriter(void* arg) {
+  TimeoutWriterState* state = reinterpret_cast<TimeoutWriterState*>(arg);
+  static const uint64_t kTimerBias = 50;
+  int thread_id = state->id;
+  DB* db = state->db;
+
+  Random rnd(1000 + thread_id);
+  WriteOptions write_opt;
+  write_opt.timeout_hint_us = 500;
+  int timeout_count = 0;
+  int num_keys = kNumKeys * 5;
+
+  for (int k = 0; k < num_keys; ++k) {
+    int key = k + thread_id * num_keys;
+    std::string value = RandomString(&rnd, kValueSize);
+    // only the second-half is randomized
+    if (k > num_keys / 2) {
+      switch (rnd.Next() % 5) {
+        case 0:
+          write_opt.timeout_hint_us = 500 * thread_id;
+          break;
+        case 1:
+          write_opt.timeout_hint_us = num_keys - k;
+          break;
+        case 2:
+          write_opt.timeout_hint_us = 1;
+          break;
+        default:
+          write_opt.timeout_hint_us = 0;
+          state->success_kvs.insert({key, value});
+      }
+    }
+
+    uint64_t time_before_put = db->GetEnv()->NowMicros();
+    Status s = db->Put(write_opt, Key(key), value);
+    uint64_t put_duration = db->GetEnv()->NowMicros() - time_before_put;
+    if (write_opt.timeout_hint_us == 0 ||
+        put_duration + kTimerBias < write_opt.timeout_hint_us) {
+      ASSERT_OK(s);
+    }
+    if (s.IsTimedOut()) {
+      timeout_count++;
+      ASSERT_GT(put_duration + kTimerBias, write_opt.timeout_hint_us);
+    }
+  }
+
+  state->done = true;
+}
+
+TEST_F(DBTest, MTRandomTimeoutTest) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+  options.level0_slowdown_writes_trigger = 10;
+  options.level0_stop_writes_trigger = 20;
+  options.write_buffer_size = kWriteBufferSize;
+  DestroyAndReopen(options);
+
+  TimeoutWriterState thread_states[kNumThreads];
+  for (int tid = 0; tid < kNumThreads; ++tid) {
+    thread_states[tid].id = tid;
+    thread_states[tid].db = db_;
+    thread_states[tid].done = false;
+    env_->StartThread(RandomTimeoutWriter, &thread_states[tid]);
+  }
+
+  for (int tid = 0; tid < kNumThreads; ++tid) {
+    while (thread_states[tid].done == false) {
+      env_->SleepForMicroseconds(100000);
+    }
+  }
+
+  Flush();
+
+  for (int tid = 0; tid < kNumThreads; ++tid) {
+    auto& success_kvs = thread_states[tid].success_kvs;
+    for (auto it = success_kvs.begin(); it != success_kvs.end(); ++it) {
+      ASSERT_EQ(Get(Key(it->first)), it->second);
+    }
+  }
+}
+
+TEST_F(DBTest, Level0StopWritesTest) {
+  Options options = CurrentOptions();
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.disable_auto_compactions = true;
+  options.max_mem_compaction_level = 0;
+  Reopen(options);
+
+  // create 4 level0 tables
+  for (int i = 0; i < 4; ++i) {
+    Put("a", "b");
+    Flush();
+  }
+
+  WriteOptions woptions;
+  woptions.timeout_hint_us = 30 * 1000;  // 30 ms
+  Status s = Put("a", "b", woptions);
+  ASSERT_TRUE(s.IsTimedOut());
+}
+
+}  // anonymous namespace
+
+/*
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ */
+TEST_F(DBTest, RateLimitingTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1 << 20;         // 1MB
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 1 << 20;     // 1MB
+  options.max_bytes_for_level_base = 4 << 20;  // 4MB
+  options.max_bytes_for_level_multiplier = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.IncreaseParallelism(4);
+  DestroyAndReopen(options);
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  // # no rate limiting
+  Random rnd(301);
+  uint64_t start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(RandomString(&rnd, 32),
+                  RandomString(&rnd, (1 << 10) + 1), wo));
+  }
+  uint64_t elapsed = env_->NowMicros() - start;
+  double raw_rate = env_->bytes_written_ * 1000000 / elapsed;
+  Close();
+
+  // # rate limiting with 0.7 x threshold
+  options.rate_limiter.reset(
+    NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
+
+  start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(RandomString(&rnd, 32),
+                  RandomString(&rnd, (1 << 10) + 1), wo));
+  }
+  elapsed = env_->NowMicros() - start;
+  Close();
+  ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() ==
+              env_->bytes_written_);
+  double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+  ASSERT_TRUE(ratio < 0.8);
+
+  // # rate limiting with half of the raw_rate
+  options.rate_limiter.reset(
+    NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
+
+  start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(RandomString(&rnd, 32),
+                  RandomString(&rnd, (1 << 10) + 1), wo));
+  }
+  elapsed = env_->NowMicros() - start;
+  Close();
+  ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() ==
+              env_->bytes_written_);
+  ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+  ASSERT_TRUE(ratio < 0.6);
+}
+
+namespace {
+  bool HaveOverlappingKeyRanges(
+      const Comparator* c,
+      const SstFileMetaData& a, const SstFileMetaData& b) {
+    if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+      if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+        // b.smallestkey <= a.smallestkey <= b.largestkey
+        return true;
+      }
+    } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+      // a.smallestkey < b.smallestkey <= a.largestkey
+      return true;
+    }
+    if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+      if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+        // b.smallestkey <= a.largestkey <= b.largestkey
+        return true;
+      }
+    } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+      // a.smallestkey <= b.largestkey < a.largestkey
+      return true;
+    }
+    return false;
+  }
+
+  // Identifies all files between level "min_level" and "max_level"
+  // which has overlapping key range with "input_file_meta".
+  void GetOverlappingFileNumbersForLevelCompaction(
+      const ColumnFamilyMetaData& cf_meta,
+      const Comparator* comparator,
+      int min_level, int max_level,
+      const SstFileMetaData* input_file_meta,
+      std::set<std::string>* overlapping_file_names) {
+    std::set<const SstFileMetaData*> overlapping_files;
+    overlapping_files.insert(input_file_meta);
+    for (int m = min_level; m <= max_level; ++m) {
+      for (auto& file : cf_meta.levels[m].files) {
+        for (auto* included_file : overlapping_files) {
+          if (HaveOverlappingKeyRanges(
+                  comparator, *included_file, file)) {
+            overlapping_files.insert(&file);
+            overlapping_file_names->insert(file.name);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  void VerifyCompactionResult(
+      const ColumnFamilyMetaData& cf_meta,
+      const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+    for (auto& level : cf_meta.levels) {
+      for (auto& file : level.files) {
+        assert(overlapping_file_numbers.find(file.name) ==
+               overlapping_file_numbers.end());
+      }
+    }
+#endif
+  }
+
+  const SstFileMetaData* PickFileRandomly(
+      const ColumnFamilyMetaData& cf_meta,
+      Random* rand,
+      int* level = nullptr) {
+    auto file_id = rand->Uniform(static_cast<int>(
+        cf_meta.file_count)) + 1;
+    for (auto& level_meta : cf_meta.levels) {
+      if (file_id <= level_meta.files.size()) {
+        if (level != nullptr) {
+          *level = level_meta.level;
+        }
+        auto result = rand->Uniform(file_id);
+        return &(level_meta.files[result]);
+      }
+      file_id -= level_meta.files.size();
+    }
+    assert(false);
+    return nullptr;
+  }
+}  // namespace
+
+// TODO t6534343 -- Don't run two level 0 CompactFiles concurrently
+TEST_F(DBTest, DISABLED_CompactFilesOnLevelCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.level0_stop_writes_trigger = 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForCompact();
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+  for (int file_picked = 5; file_picked > 0; --file_picked) {
+    std::set<std::string> overlapping_file_names;
+    std::vector<std::string> compaction_input_file_names;
+    for (int f = 0; f < file_picked; ++f) {
+      int level;
+      auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+      compaction_input_file_names.push_back(file_meta->name);
+      GetOverlappingFileNumbersForLevelCompaction(
+          cf_meta, options.comparator, level, output_level,
+          file_meta, &overlapping_file_names);
+    }
+
+    ASSERT_OK(dbfull()->CompactFiles(
+        CompactionOptions(), handles_[1],
+        compaction_input_file_names,
+        output_level));
+
+    // Make sure all overlapping files do not exist after compaction
+    dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+    VerifyCompactionResult(cf_meta, overlapping_file_names);
+  }
+
+  // make sure all key-values are still there.
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND");
+  }
+}
+
+TEST_F(DBTest, CompactFilesOnUniversalCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 10;
+
+  ChangeCompactOptions();
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.num_levels = 1;
+  options.target_file_size_base = options.write_buffer_size;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+  Random rnd(301);
+  for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForCompact();
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  std::vector<std::string> compaction_input_file_names;
+  for (auto file : cf_meta.levels[0].files) {
+    if (rnd.OneIn(2)) {
+      compaction_input_file_names.push_back(file.name);
+    }
+  }
+
+  if (compaction_input_file_names.size() == 0) {
+    compaction_input_file_names.push_back(
+        cf_meta.levels[0].files[0].name);
+  }
+
+  // expect fail since universal compaction only allow L0 output
+  ASSERT_TRUE(!dbfull()->CompactFiles(
+      CompactionOptions(), handles_[1],
+      compaction_input_file_names, 1).ok());
+
+  // expect ok and verify the compacted files no longer exist.
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), handles_[1],
+      compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  VerifyCompactionResult(
+      cf_meta,
+      std::set<std::string>(compaction_input_file_names.begin(),
+          compaction_input_file_names.end()));
+
+  compaction_input_file_names.clear();
+
+  // Pick the first and the last file, expect everything is
+  // compacted into one single file.
+  compaction_input_file_names.push_back(
+      cf_meta.levels[0].files[0].name);
+  compaction_input_file_names.push_back(
+      cf_meta.levels[0].files[
+          cf_meta.levels[0].files.size() - 1].name);
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), handles_[1],
+      compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+  options.table_factory.reset(new PlainTableFactory());
+  options.prefix_extractor.reset(NewNoopTransform());
+  Destroy(options);
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+  // Test for check of prefix_extractor when hash index is used for
+  // block-based table
+  BlockBasedTableOptions to;
+  to.index_type = BlockBasedTableOptions::kHashSearch;
+  options = CurrentOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(to));
+  ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBTest, SanitizeNumThreads) {
+  for (int attempt = 0; attempt < 2; attempt++) {
+    const size_t kTotalTasks = 8;
+    SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+    Options options = CurrentOptions();
+    if (attempt == 0) {
+      options.max_background_compactions = 3;
+      options.max_background_flushes = 2;
+    }
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+
+    for (size_t i = 0; i < kTotalTasks; i++) {
+      // Insert 5 tasks to low priority queue and 5 tasks to high priority queue
+      env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_tasks[i],
+                     (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+    }
+
+    // Wait 100 milliseconds for they are scheduled.
+    env_->SleepForMicroseconds(100000);
+
+    // pool size 3, total task 4. Queue size should be 1.
+    ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+    // pool size 2, total task 4. Queue size should be 2.
+    ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+    for (size_t i = 0; i < kTotalTasks; i++) {
+      sleeping_tasks[i].WakeUp();
+      sleeping_tasks[i].WaitUntilDone();
+    }
+
+    ASSERT_OK(Put("abc", "def"));
+    ASSERT_EQ("def", Get("abc"));
+    Flush();
+    ASSERT_EQ("def", Get("abc"));
+  }
+}
+
+TEST_F(DBTest, DBIteratorBoundTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing basic case with no iterate_upper_bound and no prefix_extractor
+  {
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+  }
+
+  // testing iterate_upper_bound and forward iterator
+  // to make sure it stops at bound
+  {
+    ReadOptions ro;
+    // iterate_upper_bound points beyond the last expected entry
+    Slice prefix("foo2");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("foo1")), 0);
+
+    iter->Next();
+    // should stop here...
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // prefix is the first letter of the key
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing with iterate_upper_bound and prefix_extractor
+  // Seek target and iterate_upper_bound are not is same prefix
+  // This should be an error
+  {
+    ReadOptions ro;
+    Slice prefix("g1");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  // testing that iterate_upper_bound prevents iterating over deleted items
+  // if the bound has already reached
+  {
+    options.prefix_extractor = nullptr;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("a", "0"));
+    ASSERT_OK(Put("b", "0"));
+    ASSERT_OK(Put("b1", "0"));
+    ASSERT_OK(Put("c", "0"));
+    ASSERT_OK(Put("d", "0"));
+    ASSERT_OK(Put("e", "0"));
+    ASSERT_OK(Delete("c"));
+    ASSERT_OK(Delete("d"));
+
+    // base case with no bound
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    perf_context.Reset();
+    iter->Next();
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(static_cast<int>(perf_context.internal_delete_skipped_count), 2);
+
+    // now testing with iterate_bound
+    Slice prefix("c");
+    ro.iterate_upper_bound = &prefix;
+
+    iter.reset(db_->NewIterator(ro));
+
+    perf_context.Reset();
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    iter->Next();
+    // the iteration should stop as soon as the the bound key is reached
+    // even though the key is deleted
+    // hence internal_delete_skipped_count should be 0
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_EQ(static_cast<int>(perf_context.internal_delete_skipped_count), 0);
+  }
+}
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+  std::vector<std::thread> threads;
+  dbfull()->TEST_LockMutex();
+  auto w = dbfull()->TEST_BeginWrite();
+  threads.emplace_back([&] { Put("a", "b"); });
+  env_->SleepForMicroseconds(10000);
+  threads.emplace_back([&] { Flush(); });
+  env_->SleepForMicroseconds(10000);
+  dbfull()->TEST_UnlockMutex();
+  dbfull()->TEST_LockMutex();
+  dbfull()->TEST_EndWrite(w);
+  dbfull()->TEST_UnlockMutex();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+TEST_F(DBTest, DisableDataSyncTest) {
+  env_->sync_counter_.store(0);
+  // iter 0 -- no sync
+  // iter 1 -- sync
+  for (int iter = 0; iter < 2; ++iter) {
+    Options options = CurrentOptions();
+    options.disableDataSync = iter == 0;
+    options.create_if_missing = true;
+    options.env = env_;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    MakeTables(10, "a", "z");
+    Compact("a", "z");
+
+    if (iter == 0) {
+      ASSERT_EQ(env_->sync_counter_.load(), 0);
+    } else {
+      ASSERT_GT(env_->sync_counter_.load(), 0);
+    }
+    Destroy(options);
+  }
+}
+
+TEST_F(DBTest, DynamicMemtableOptions) {
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k5KB = 5 * 1024;
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.max_background_compactions = 1;
+  options.max_mem_compaction_level = 0;
+  options.write_buffer_size = k64KB;
+  options.max_write_buffer_number = 2;
+  // Don't trigger compact/slowdown/stop
+  options.level0_file_num_compaction_trigger = 1024;
+  options.level0_slowdown_writes_trigger = 1024;
+  options.level0_stop_writes_trigger = 1024;
+  DestroyAndReopen(options);
+
+  auto gen_l0_kb = [this](int size) {
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+  };
+
+  // Test write_buffer_size
+  gen_l0_kb(64);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+  ASSERT_GT(SizeAtLevel(0), k64KB - k5KB);
+
+  // Clean up L0
+  dbfull()->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // Increase buffer size
+  ASSERT_OK(dbfull()->SetOptions({
+    {"write_buffer_size", "131072"},
+  }));
+
+  // The existing memtable is still 64KB in size, after it becomes immutable,
+  // the next memtable will be 128KB in size. Write 256KB total, we should
+  // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data
+  gen_l0_kb(256);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 2 * k5KB);
+
+  // Test max_write_buffer_number
+  // Block compaction thread, which will also block the flushes because
+  // max_background_flushes == 0, so flushes are getting executed by the
+  // compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  SleepingBackgroundTask sleeping_task_low1;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1,
+                 Env::Priority::LOW);
+  // Start from scratch and disable compaction/flush. Flush can only happen
+  // during compaction but trigger is pretty high
+  options.max_background_flushes = 0;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Put until timeout, bounded by 256 puts. We should see timeout at ~128KB
+  int count = 0;
+  Random rnd(301);
+  WriteOptions wo;
+  wo.timeout_hint_us = 100000;  // Reasonabley long timeout to make sure sleep
+                                // triggers but not forever.
+
+  std::atomic<int> sleep_count(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:TimedWait",
+      [&](void* arg) { sleep_count.fetch_add(1); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 256) {
+    count++;
+  }
+  ASSERT_GT(sleep_count.load(), 0);
+  ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+
+  sleeping_task_low1.WakeUp();
+  sleeping_task_low1.WaitUntilDone();
+
+  // Increase
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_write_buffer_number", "8"},
+  }));
+  // Clean up memtable and L0
+  dbfull()->CompactRange(nullptr, nullptr);
+
+  SleepingBackgroundTask sleeping_task_low2;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2,
+                 Env::Priority::LOW);
+  count = 0;
+  sleep_count.store(0);
+  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) {
+    count++;
+  }
+  ASSERT_GT(sleep_count.load(), 0);
+  ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+  sleeping_task_low2.WakeUp();
+  sleeping_task_low2.WaitUntilDone();
+
+  // Decrease
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_write_buffer_number", "4"},
+  }));
+  // Clean up memtable and L0
+  dbfull()->CompactRange(nullptr, nullptr);
+
+  SleepingBackgroundTask sleeping_task_low3;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low3,
+                 Env::Priority::LOW);
+
+  count = 0;
+  sleep_count.store(0);
+  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) {
+    count++;
+  }
+  ASSERT_GT(sleep_count.load(), 0);
+  ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 266 * 1.2);
+  sleeping_task_low3.WakeUp();
+  sleeping_task_low3.WaitUntilDone();
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#if ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+                          int expected_count) {
+  int op_count = 0;
+  std::vector<ThreadStatus> thread_list;
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  for (auto thread : thread_list) {
+    if (thread.operation_type == op_type) {
+      op_count++;
+    }
+  }
+  ASSERT_EQ(op_count, expected_count);
+}
+}  // namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  TryReopen(options);
+
+  std::vector<ThreadStatus> thread_list;
+  Status s = env_->GetThreadList(&thread_list);
+
+  for (int i = 0; i < 2; ++i) {
+    // repeat the test with differet number of high / low priority threads
+    const int kTestCount = 3;
+    const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+    const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+    for (int test = 0; test < kTestCount; ++test) {
+      // Change the number of threads in high / low priority pool.
+      env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+      env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+      // Wait to ensure the all threads has been registered
+      env_->SleepForMicroseconds(100000);
+      s = env_->GetThreadList(&thread_list);
+      ASSERT_OK(s);
+      unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+      memset(thread_type_counts, 0, sizeof(thread_type_counts));
+      for (auto thread : thread_list) {
+        ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+        thread_type_counts[thread.thread_type]++;
+      }
+      // Verify the total number of threades
+      ASSERT_EQ(
+          thread_type_counts[ThreadStatus::HIGH_PRIORITY] +
+              thread_type_counts[ThreadStatus::LOW_PRIORITY],
+          kHighPriCounts[test] + kLowPriCounts[test]);
+      // Verify the number of high-priority threads
+      ASSERT_EQ(
+          thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+          kHighPriCounts[test]);
+      // Verify the number of low-priority threads
+      ASSERT_EQ(
+          thread_type_counts[ThreadStatus::LOW_PRIORITY],
+          kLowPriCounts[test]);
+    }
+    if (i == 0) {
+      // repeat the test with multiple column families
+      CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+      env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+          handles_, true);
+    }
+  }
+  db_->DropColumnFamily(handles_[2]);
+  delete handles_[2];
+  handles_.erase(handles_.begin() + 2);
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+      handles_, true);
+  Close();
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+      handles_, true);
+}
+
+TEST_F(DBTest, DisableThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = false;
+  TryReopen(options);
+  CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify non of the column family info exists
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+      handles_, false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+  Options options;
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.enable_thread_tracking = true;
+  options = CurrentOptions(options);
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+      {"DBTest::ThreadStatusFlush:2", "FlushJob::~FlushJob()"},
+  });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+  ASSERT_OK(Put(1, "foo", "v1"));
+  ASSERT_EQ("v1", Get(1, "foo"));
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+  Put(1, "k1", std::string(100000, 'x'));  // Fill memtable
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+  Put(1, "k2", std::string(100000, 'y'));  // Trigger flush
+  // wait for flush to be scheduled
+  env_->SleepForMicroseconds(250000);
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, ThreadStatusSingleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  const int kNumL0Files = 4;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+      {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+      {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+  });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int tests = 0; tests < 2; ++tests) {
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    // The Put Phase.
+    for (int file = 0; file < kNumL0Files; ++file) {
+      for (int key = 0; key < kEntriesPerBuffer; ++key) {
+        ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer),
+                      RandomString(&rnd, kTestValueSize)));
+      }
+      Flush();
+    }
+    // This makes sure a compaction won't be scheduled until
+    // we have done with the above Put Phase.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+    ASSERT_GE(NumTableFilesAtLevel(0),
+              options.level0_file_num_compaction_trigger);
+
+    // This makes sure at least one compaction is running.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+
+    if (options.enable_thread_tracking) {
+      // expecting one single L0 to L1 compaction
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+    } else {
+      // If thread tracking is not enabled, compaction count should be 0.
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+    }
+    // TODO(yhchiang): adding assert to verify each compaction stage.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+
+    // repeat the test with disabling thread tracking.
+    options.enable_thread_tracking = false;
+  }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, PreShutdownManualCompaction) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 0;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
+      << "Need to update this test to match kMaxMemCompactLevel";
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p1", "p9");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("0,1,2", FilesPerLevel(1));
+    CancelAllBackgroundWork(db_);
+    db_->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ("0,1,2", FilesPerLevel(1));
+
+    if (iter == 0) {
+      options = CurrentOptions();
+      options.max_background_flushes = 0;
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+TEST_F(DBTest, PreShutdownMultipleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 40;
+  const int kNumL0Files = 4;
+
+  const int kHighPriCount = 3;
+  const int kLowPriCount = 5;
+  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * kNumL0Files;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_background_compactions = kLowPriCount;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+
+  TryReopen(options);
+  Random rnd(301);
+
+  std::vector<ThreadStatus> thread_list;
+  // Delay both flush and compaction
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+       {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+        {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+       {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+        "CompactionJob::Run():End"},
+       {"CompactionJob::Run():End",
+        "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Make rocksdb busy
+  int key = 0;
+  // check how many threads are doing compaction using GetThreadList
+  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+  for (int file = 0; file < 16 * kNumL0Files; ++file) {
+    for (int k = 0; k < kEntriesPerBuffer; ++k) {
+      ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+    }
+
+    Status s = env_->GetThreadList(&thread_list);
+    for (auto thread : thread_list) {
+      operation_count[thread.operation_type]++;
+    }
+
+    // Speed up the test
+    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+        operation_count[ThreadStatus::OP_COMPACTION] >
+            0.6 * options.max_background_compactions) {
+      break;
+    }
+    if (file == 15 * kNumL0Files) {
+      TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+    }
+  }
+
+  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+  CancelAllBackgroundWork(db_);
+  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+  dbfull()->TEST_WaitForCompact();
+  // Record the number of compactions at a time.
+  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+    operation_count[i] = 0;
+  }
+  Status s = env_->GetThreadList(&thread_list);
+  for (auto thread : thread_list) {
+    operation_count[thread.operation_type]++;
+  }
+  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+TEST_F(DBTest, PreShutdownCompactionMiddle) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 40;
+  const int kNumL0Files = 4;
+
+  const int kHighPriCount = 3;
+  const int kLowPriCount = 5;
+  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * kNumL0Files;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_background_compactions = kLowPriCount;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+
+  TryReopen(options);
+  Random rnd(301);
+
+  std::vector<ThreadStatus> thread_list;
+  // Delay both flush and compaction
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
+        "CompactionJob::Run():Inprogress"},
+        {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
+       {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
+       {"CompactionJob::Run():End",
+        "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Make rocksdb busy
+  int key = 0;
+  // check how many threads are doing compaction using GetThreadList
+  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+  for (int file = 0; file < 16 * kNumL0Files; ++file) {
+    for (int k = 0; k < kEntriesPerBuffer; ++k) {
+      ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+    }
+
+    Status s = env_->GetThreadList(&thread_list);
+    for (auto thread : thread_list) {
+      operation_count[thread.operation_type]++;
+    }
+
+    // Speed up the test
+    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+        operation_count[ThreadStatus::OP_COMPACTION] >
+            0.6 * options.max_background_compactions) {
+      break;
+    }
+    if (file == 15 * kNumL0Files) {
+      TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
+    }
+  }
+
+  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+  CancelAllBackgroundWork(db_);
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+  dbfull()->TEST_WaitForCompact();
+  // Record the number of compactions at a time.
+  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+    operation_count[i] = 0;
+  }
+  Status s = env_->GetThreadList(&thread_list);
+  for (auto thread : thread_list) {
+    operation_count[thread.operation_type]++;
+  }
+  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+TEST_F(DBTest, DynamicLevelMaxBytesBase) {
+  // Use InMemoryEnv, or it would be too slow.
+  unique_ptr<Env> env(new MockEnv(env_));
+
+  const int kNKeys = 1000;
+  int keys[kNKeys];
+
+  auto verify_func = [&]() {
+    for (int i = 0; i < kNKeys; i++) {
+      ASSERT_NE("NOT_FOUND", Get(Key(i)));
+      ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+      if (i < kNKeys / 10) {
+        ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      } else {
+        ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      }
+    }
+  };
+
+  Random rnd(301);
+  for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+    for (int i = 0; i < kNKeys; i++) {
+      keys[i] = i;
+    }
+    if (ordered_insert == 0) {
+      std::random_shuffle(std::begin(keys), std::end(keys));
+    }
+    for (int max_background_compactions = 1; max_background_compactions < 4;
+         max_background_compactions += 2) {
+      Options options;
+      options.env = env.get();
+      options.create_if_missing = true;
+      options.db_write_buffer_size = 2048;
+      options.write_buffer_size = 2048;
+      options.max_write_buffer_number = 2;
+      options.level0_file_num_compaction_trigger = 2;
+      options.level0_slowdown_writes_trigger = 2;
+      options.level0_stop_writes_trigger = 2;
+      options.target_file_size_base = 2048;
+      options.level_compaction_dynamic_level_bytes = true;
+      options.max_bytes_for_level_base = 10240;
+      options.max_bytes_for_level_multiplier = 4;
+      options.hard_rate_limit = 1.1;
+      options.max_background_compactions = max_background_compactions;
+      options.num_levels = 5;
+
+      options.compression_per_level.resize(3);
+      options.compression_per_level[0] = kNoCompression;
+      options.compression_per_level[1] = kLZ4Compression;
+      options.compression_per_level[2] = kSnappyCompression;
+
+      DestroyAndReopen(options);
+
+      for (int i = 0; i < kNKeys; i++) {
+        int key = keys[i];
+        ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102)));
+        ASSERT_OK(Put(Key(key), RandomString(&rnd, 102)));
+        ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102)));
+        ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+        env_->SleepForMicroseconds(5000);
+      }
+
+      uint64_t int_prop;
+      ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+      ASSERT_EQ(0U, int_prop);
+
+      // Verify DB
+      for (int j = 0; j < 2; j++) {
+        verify_func();
+        if (j == 0) {
+          Reopen(options);
+        }
+      }
+
+      // Test compact range works
+      dbfull()->CompactRange(nullptr, nullptr);
+      // All data should be in the last level.
+      ColumnFamilyMetaData cf_meta;
+      db_->GetColumnFamilyMetaData(&cf_meta);
+      ASSERT_EQ(5U, cf_meta.levels.size());
+      for (int i = 0; i < 4; i++) {
+        ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+      }
+      ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+      verify_func();
+
+      Close();
+    }
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTest, DynamicLevelMaxBytesBase2) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 2048;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+  options.expanded_compaction_factor = 0;  // Force not expanding in compactions
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initial base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 7K to L0
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Insert extra about 3.5K to L0. After they are compacted to L4, base level
+  // should be changed to L3.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  // Trigger parallel compaction, and the first one would change the base
+  // level.
+  // Hold compaction jobs to make sure
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start",
+      [&](void* arg) { env_->SleepForMicroseconds(100000); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 10K more
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  // Wait for 200 milliseconds before proceeding compactions to make sure two
+  // parallel ones are executed.
+  env_->SleepForMicroseconds(200000);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Trigger a condition that the compaction changes base level and L0->Lbase
+  // happens at the same time.
+  // We try to make last levels' targets to be 10K, 40K, 160K, add triggers
+  // another compaction from 40K->160K.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 150K more
+  for (int i = 0; i < 1350; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(2U, int_prop);
+
+  // Keep Writing data until base level changed 2->1. There will be L0->L2
+  // compaction going on at the same time.
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  for (int attempt = 0; attempt <= 20; attempt++) {
+    // Write about 5K more data with two flushes. It should be flush to level 2
+    // but when it is applied, base level is already 1.
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                    RandomString(&rnd, 80)));
+    }
+    Flush();
+
+    ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+    if (int_prop == 2U) {
+      env_->SleepForMicroseconds(50000);
+    } else {
+      break;
+    }
+  }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  env_->SleepForMicroseconds(200000);
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(1U, int_prop);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTest, DynamicLevelMaxBytesCompactRange) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 2048;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  const int kNumLevels = 5;
+  options.num_levels = kNumLevels;
+  options.expanded_compaction_factor = 0;  // Force not expanding in compactions
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Compact against empty DB
+  dbfull()->CompactRange(nullptr, nullptr);
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initial base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 7K to L0
+  for (int i = 0; i < 140; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  if (NumTableFilesAtLevel(0) == 0) {
+    // Make sure level 0 is not empty
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+    Flush();
+  }
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  std::set<int> output_levels;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::CompactRange:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        output_levels.insert(compaction->output_level());
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(output_levels.size(), 2);
+  ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+  ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  // Base level is still level 3.
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+}
+
+TEST_F(DBTest, DynamicLevelMaxBytesBaseInc) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 2048;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.hard_rate_limit = 1.1;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+
+  DestroyAndReopen(options);
+
+  int non_trivial = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  const int total_keys = 3000;
+  const int random_part_size = 100;
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = RandomString(&rnd, random_part_size);
+    PutFixed32(&value, static_cast<uint32_t>(i));
+    ASSERT_OK(Put(Key(i), value));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_EQ(non_trivial, 0);
+
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = Get(Key(i));
+    ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+              static_cast<uint32_t>(i));
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNKeys = 120;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 20480;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 102400;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kNoCompression;
+  options.compression_per_level[2] = kSnappyCompression;
+
+  DestroyAndReopen(options);
+
+  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+  // be compressed, so total data size should be more than 80K.
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U);
+
+  // Insert 400KB. Some data will be compressed
+  for (int i = 21; i < 120; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U);
+  // Make sure data in files in L3 is not compacted by removing all files
+  // in L4 and calculate number of rows
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+  for (auto file : cf_meta.levels[4].files) {
+    ASSERT_OK(dbfull()->DeleteFile(file.name));
+  }
+  int num_keys = 0;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    num_keys++;
+  }
+  ASSERT_OK(iter->status());
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+  const int kNKeys = 500;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 6000;
+  options.write_buffer_size = 6000;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.hard_rate_limit = 1.1;
+
+  // Use file size to distinguish levels
+  // L1: 10, L2: 20, L3 40, L4 80
+  // L0 is less than 30
+  options.target_file_size_base = 10;
+  options.target_file_size_multiplier = 2;
+
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 200;
+  options.max_bytes_for_level_multiplier = 8;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+  options.table_factory = mtf;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kLZ4Compression;
+  options.compression_per_level[2] = kZlibCompression;
+
+  DestroyAndReopen(options);
+  // When base level is L4, L4 is LZ4.
+  std::atomic<int> num_zlib(0);
+  std::atomic<int> num_lz4(0);
+  std::atomic<int> num_no(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4) {
+          ASSERT_TRUE(compaction->OutputCompressionType() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), 0);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+  // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
+  num_lz4.store(0);
+  num_no.store(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+          ASSERT_TRUE(compaction->OutputCompressionType() == kZlibCompression);
+          num_zlib.fetch_add(1);
+        } else {
+          ASSERT_TRUE(compaction->OutputCompressionType() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 101; i < 500; i++) {
+    ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200)));
+    if (i % 100 == 99) {
+      Flush();
+      dbfull()->TEST_WaitForCompact();
+    }
+  }
+
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GT(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  ASSERT_GT(num_zlib.load(), 0);
+}
+
+TEST_F(DBTest, DynamicCompactionOptions) {
+  // minimum write buffer size is enforced at 64KB
+  const uint64_t k32KB = 1 << 15;
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k1MB = 1 << 20;
+  const uint64_t k4KB = 1 << 12;
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.hard_rate_limit = 1.1;
+  options.write_buffer_size = k64KB;
+  options.max_write_buffer_number = 2;
+  // Compaction related options
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 8;
+  options.max_grandparent_overlap_factor = 10;
+  options.expanded_compaction_factor = 25;
+  options.source_compaction_factor = 1;
+  options.target_file_size_base = k64KB;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = k128KB;
+  options.max_bytes_for_level_multiplier = 4;
+
+  // Block flush thread and disable compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  DestroyAndReopen(options);
+
+  auto gen_l0_kb = [this](int start, int size, int stride) {
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024)));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+  };
+
+  // Write 3 files that have the same key range.
+  // Since level0_file_num_compaction_trigger is 3, compaction should be
+  // triggered. The compaction should result in one L1 file
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  gen_l0_kb(0, 64, 1);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,1", FilesPerLevel());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(1U, metadata.size());
+  ASSERT_LE(metadata[0].size, k64KB + k4KB);
+  ASSERT_GE(metadata[0].size, k64KB - k4KB);
+
+  // Test compaction trigger and target_file_size_base
+  // Reduce compaction trigger to 2, and reduce L1 file size to 32KB.
+  // Writing to 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, compaction merge them and should
+  // result in 2 32KB L1 files.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"level0_file_num_compaction_trigger", "2"},
+    {"target_file_size_base", ToString(k32KB) }
+  }));
+
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ("1,1", FilesPerLevel());
+  gen_l0_kb(0, 64, 1);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,2", FilesPerLevel());
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_LE(metadata[0].size, k32KB + k4KB);
+  ASSERT_GE(metadata[0].size, k32KB - k4KB);
+  ASSERT_LE(metadata[1].size, k32KB + k4KB);
+  ASSERT_GE(metadata[1].size, k32KB - k4KB);
+
+  // Test max_bytes_for_level_base
+  // Increase level base size to 256KB and write enough data that will
+  // fill L1 and L2. L1 size should be around 256KB while L2 size should be
+  // around 256KB x 4.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_bytes_for_level_base", ToString(k1MB) }
+  }));
+
+  // writing 96 x 64KB => 6 * 1024KB
+  // (L1 + L2) = (1 + 4) * 1024KB
+  for (int i = 0; i < 96; ++i) {
+    gen_l0_kb(i, 64, 96);
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+  ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+
+  // Within (0.5, 1.5) of 4MB.
+  ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+  ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+
+  // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now, reduce both mulitplier and level base,
+  // After filling enough data that can fit in L1 - L3, we should see L1 size
+  // reduces to 128KB from 256KB which was asserted previously. Same for L2.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_bytes_for_level_multiplier", "2"},
+    {"max_bytes_for_level_base", ToString(k128KB) }
+  }));
+
+  // writing 20 x 64KB = 10 x 128KB
+  // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+  for (int i = 0; i < 20; ++i) {
+    gen_l0_kb(i, 64, 32);
+  }
+  dbfull()->TEST_WaitForCompact();
+  uint64_t total_size =
+    SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
+  ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+
+  // Test level0_stop_writes_trigger.
+  // Clean up memtable and L0. Block compaction threads. If continue to write
+  // and flush memtables. We should see put timeout after 8 memtable flushes
+  // since level0_stop_writes_trigger = 8
+  dbfull()->CompactRange(nullptr, nullptr);
+  // Block compaction
+  SleepingBackgroundTask sleeping_task_low1;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1,
+                 Env::Priority::LOW);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  int count = 0;
+  Random rnd(301);
+  WriteOptions wo;
+  wo.timeout_hint_us = 10000;
+  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) {
+    dbfull()->TEST_FlushMemTable(true);
+    count++;
+  }
+  // Stop trigger = 8
+  ASSERT_EQ(count, 8);
+  // Unblock
+  sleeping_task_low1.WakeUp();
+  sleeping_task_low1.WaitUntilDone();
+
+  // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0.
+  // Block compaction thread again. Perform the put and memtable flushes
+  // until we see timeout after 6 memtable flushes.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"level0_stop_writes_trigger", "6"}
+  }));
+  dbfull()->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // Block compaction
+  SleepingBackgroundTask sleeping_task_low2;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2,
+                 Env::Priority::LOW);
+  count = 0;
+  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) {
+    dbfull()->TEST_FlushMemTable(true);
+    count++;
+  }
+  ASSERT_EQ(count, 6);
+  // Unblock
+  sleeping_task_low2.WakeUp();
+  sleeping_task_low2.WaitUntilDone();
+
+  // Test disable_auto_compactions
+  // Compaction thread is unblocked but auto compaction is disabled. Write
+  // 4 L0 files and compaction should be triggered. If auto compaction is
+  // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of
+  // L0 files do not change after the call.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"disable_auto_compactions", "true"}
+  }));
+  dbfull()->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+    // Wait for compaction so that put won't timeout
+    dbfull()->TEST_FlushMemTable(true);
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+
+  // Enable auto compaction and perform the same test, # of L0 files should be
+  // reduced after compaction.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"disable_auto_compactions", "false"}
+  }));
+  dbfull()->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+    // Wait for compaction so that put won't timeout
+    dbfull()->TEST_FlushMemTable(true);
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_LT(NumTableFilesAtLevel(0), 4);
+
+  // Test for hard_rate_limit.
+  // First change max_bytes_for_level_base to a big value and populate
+  // L1 - L3. Then thrink max_bytes_for_level_base and disable auto compaction
+  // at the same time, we should see some level with score greater than 2.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_bytes_for_level_base", ToString(k1MB) }
+  }));
+  // writing 40 x 64KB = 10 x 256KB
+  // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB
+  for (int i = 0; i < 40; ++i) {
+    gen_l0_kb(i, 64, 32);
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE((SizeAtLevel(1) > k1MB * 0.8 &&
+               SizeAtLevel(1) < k1MB * 1.2) ||
+              (SizeAtLevel(2) > 2 * k1MB * 0.8 &&
+               SizeAtLevel(2) < 2 * k1MB * 1.2) ||
+              (SizeAtLevel(3) > 4 * k1MB * 0.8 &&
+               SizeAtLevel(3) < 4 * k1MB * 1.2));
+  // Reduce max_bytes_for_level_base and disable compaction at the same time
+  // This should cause score to increase
+  ASSERT_OK(dbfull()->SetOptions({
+    {"disable_auto_compactions", "true"},
+    {"max_bytes_for_level_base", "65536"},
+  }));
+  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024)));
+  dbfull()->TEST_FlushMemTable(true);
+
+  // Check score is above 2
+  ASSERT_TRUE(SizeAtLevel(1) / k64KB > 2 ||
+              SizeAtLevel(2) / k64KB > 4 ||
+              SizeAtLevel(3) / k64KB > 8);
+
+  // Enfoce hard rate limit. Now set hard_rate_limit to 2,
+  // we should start to see put delay (1000 us) and timeout as a result
+  // (L0 score is not regulated by this limit).
+  ASSERT_OK(dbfull()->SetOptions({
+    {"hard_rate_limit", "2"},
+    {"level0_slowdown_writes_trigger", "18"},
+    {"level0_stop_writes_trigger", "20"}
+  }));
+  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024)));
+  dbfull()->TEST_FlushMemTable(true);
+
+  std::atomic<int> sleep_count(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Sleep", [&](void* arg) { sleep_count.fetch_add(1); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Hard rate limit slow down for 1000 us, so default 10ms should be ok
+  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+  sleep_count.store(0);
+  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+  ASSERT_GT(sleep_count.load(), 0);
+
+  // Lift the limit and no timeout
+  ASSERT_OK(dbfull()->SetOptions({
+    {"hard_rate_limit", "200"},
+  }));
+  dbfull()->TEST_FlushMemTable(true);
+  sleep_count.store(0);
+  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+  // Technically, time out is still possible for timing issue.
+  ASSERT_EQ(sleep_count.load(), 0);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+
+  // Test max_mem_compaction_level.
+  // Destroy DB and start from scratch
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 0;
+  options.max_mem_compaction_level = 2;
+  DestroyAndReopen(options);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+
+  ASSERT_OK(Put("max_mem_compaction_level_key", RandomString(&rnd, 8)));
+  dbfull()->TEST_FlushMemTable(true);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+
+  ASSERT_TRUE(Put("max_mem_compaction_level_key",
+              RandomString(&rnd, 8)).ok());
+  // Set new value and it becomes effective in this flush
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_mem_compaction_level", "1"}
+  }));
+  dbfull()->TEST_FlushMemTable(true);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+
+  ASSERT_TRUE(Put("max_mem_compaction_level_key",
+              RandomString(&rnd, 8)).ok());
+  // Set new value and it becomes effective in this flush
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_mem_compaction_level", "0"}
+  }));
+  dbfull()->TEST_FlushMemTable(true);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+}
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.target_file_size_base = 200000;
+  options.max_bytes_for_level_base = 1000000;
+  options.max_bytes_for_level_multiplier = 2;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kTestSize = kCDTKeysPerBuffer * 4096;
+  const int kTotalIteration = 100;
+  // the second half of the test involves in random failure
+  // of file creation.
+  const int kRandomFailureTest = kTotalIteration / 2;
+  std::vector<std::string> values;
+  for (int i = 0; i < kTestSize; ++i) {
+    values.push_back("NOT_FOUND");
+  }
+  for (int j = 0; j < kTotalIteration; ++j) {
+    if (j == kRandomFailureTest) {
+      env_->non_writeable_rate_.store(90);
+    }
+    for (int k = 0; k < kTestSize; ++k) {
+      // here we expect some of the Put fails.
+      std::string value = RandomString(&rnd, 100);
+      Status s = Put(Key(k), Slice(value));
+      if (s.ok()) {
+        // update the latest successful put
+        values[k] = value;
+      }
+      // But everything before we simulate the failure-test should succeed.
+      if (j < kRandomFailureTest) {
+        ASSERT_OK(s);
+      }
+    }
+  }
+
+  // If rocksdb does not do the correct job, internal assert will fail here.
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  // verify we have the latest successful update
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+
+  // reopen and reverify we have the latest successful update
+  env_->non_writeable_rate_.store(0);
+  Reopen(options);
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+}
+
+TEST_F(DBTest, PartialCompactionFailure) {
+  Options options;
+  const int kKeySize = 16;
+  const int kKvSize = 1000;
+  const int kKeysPerBuffer = 100;
+  const int kNumL1Files = 5;
+  options.create_if_missing = true;
+  options.write_buffer_size = kKeysPerBuffer * kKvSize;
+  options.max_write_buffer_number = 2;
+  options.target_file_size_base =
+      options.write_buffer_size *
+      (options.max_write_buffer_number - 1);
+  options.level0_file_num_compaction_trigger = kNumL1Files;
+  options.max_bytes_for_level_base =
+      options.level0_file_num_compaction_trigger *
+      options.target_file_size_base;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  // stop the compaction thread until we simulate the file creation failure.
+  SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  const int kNumInsertedKeys =
+      options.level0_file_num_compaction_trigger *
+      (options.max_write_buffer_number - 1) *
+      kKeysPerBuffer;
+
+  Random rnd(301);
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    keys.emplace_back(RandomString(&rnd, kKeySize));
+    values.emplace_back(RandomString(&rnd, kKvSize - kKeySize));
+    ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+  }
+
+  dbfull()->TEST_FlushMemTable(true);
+  // Make sure the number of L0 files can trigger compaction.
+  ASSERT_GE(NumTableFilesAtLevel(0),
+            options.level0_file_num_compaction_trigger);
+
+  auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+  // Fail the first file creation.
+  env_->non_writable_count_ = 1;
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  // Expect compaction to fail here as one file will fail its
+  // creation.
+  ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+  // Verify L0 -> L1 compaction does fail.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Verify all L0 files are still there.
+  ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+  // All key-values must exist after compaction fails.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+
+  env_->non_writable_count_ = 0;
+
+  // Make sure RocksDB will not get into corrupted state.
+  Reopen(options);
+
+  // Verify again after reopen.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+}
+
+TEST_F(DBTest, DynamicMiscOptions) {
+  // Test max_sequential_skip_in_iterations
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 16;
+  options.compression = kNoCompression;
+  options.statistics = rocksdb::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+    int key0 = key_start;
+    int key1 = key_start + 1;
+    int key2 = key_start + 2;
+    Random rnd(301);
+    ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8)));
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8)));
+    }
+    ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8)));
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(key1));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+    ASSERT_EQ(num_reseek,
+              TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+  };
+  // No reseek
+  assert_reseek_count(100, 0);
+
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_sequential_skip_in_iterations", "4"}
+  }));
+  // Clear memtable and make new option effective
+  dbfull()->TEST_FlushMemTable(true);
+  // Trigger reseek
+  assert_reseek_count(200, 1);
+
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_sequential_skip_in_iterations", "16"}
+  }));
+  // Clear memtable and make new option effective
+  dbfull()->TEST_FlushMemTable(true);
+  // No reseek
+  assert_reseek_count(300, 1);
+}
+
+TEST_F(DBTest, DontDeletePendingOutputs) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Every time we write to a table file, call FOF/POF with full DB scan. This
+  // will make sure our pending_outputs_ protection work correctly
+  std::function<void()> purge_obsolete_files_function = [&]() {
+    JobContext job_context(0);
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+    dbfull()->TEST_UnlockMutex();
+    dbfull()->PurgeObsoleteFiles(job_context);
+  };
+
+  env_->table_write_callback_ = &purge_obsolete_files_function;
+
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(Put("a", "begin"));
+    ASSERT_OK(Put("z", "end"));
+    ASSERT_OK(Flush());
+  }
+
+  // If pending output guard does not work correctly, PurgeObsoleteFiles() will
+  // delete the file that Compaction is trying to create, causing this: error
+  // db/db_test.cc:975: IO error:
+  // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+  Compact("a", "b");
+}
+
+TEST_F(DBTest, DontDeleteMovedFile) {
+  // This test triggers move compaction and verifies that the file is not
+  // deleted when it's part of move compaction
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // If the moved file is actually deleted (the move-safeguard in
+  // ~Version::Version() is not there), we get this failure:
+  // Corruption: Can't access /000009.sst
+  Reopen(options);
+}
+
+TEST_F(DBTest, DeleteMovedFileAfterCompaction) {
+  // iter 1 -- delete_obsolete_files_period_micros == 0
+  for (int iter = 0; iter < 2; ++iter) {
+    // This test triggers move compaction and verifies that the file is not
+    // deleted when it's part of move compaction
+    Options options = CurrentOptions();
+    options.env = env_;
+    if (iter == 1) {
+      options.delete_obsolete_files_period_micros = 0;
+    }
+    options.create_if_missing = true;
+    options.level0_file_num_compaction_trigger =
+        2;  // trigger compaction when we have 2 files
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    // Create two 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute L0->L1
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+
+    // block compactions
+    SleepingBackgroundTask sleeping_task;
+    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                   Env::Priority::LOW);
+
+    options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+    Reopen(options);
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+    // let compactions go
+    sleeping_task.WakeUp();
+    sleeping_task.WaitUntilDone();
+
+    // this should execute L1->L2 (move)
+    dbfull()->TEST_WaitForCompact();
+
+    ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    ASSERT_EQ(metadata.size(), 1U);
+    auto moved_file_name = metadata[0].name;
+
+    // Create two more 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute both L0->L1 and L1->L2 (merge with previous file)
+    dbfull()->TEST_WaitForCompact();
+
+    ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+    // iterator is holding the file
+    ASSERT_TRUE(env_->FileExists(dbname_ + "/" + moved_file_name));
+
+    iterator.reset();
+
+    // this file should have been compacted away
+    ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + moved_file_name));
+  }
+}
+
+TEST_F(DBTest, OptimizeFiltersForHits) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 256 * 1024;
+  options.target_file_size_base = 256 * 1024;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.max_bytes_for_level_base = 256 * 1024;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 8;
+  options.max_background_flushes = 8;
+  options.compaction_style = kCompactionStyleLevel;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.optimize_filters_for_hits = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  CreateAndReopenWithCF({"mypikachu"}, options);
+
+  int numkeys = 200000;
+  for (int i = 0; i < 20; i += 2) {
+    for (int j = i; j < numkeys; j += 20) {
+      ASSERT_OK(Put(1, Key(j), "val"));
+    }
+  }
+
+
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  for (int i = 1; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+  }
+
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // When the skip_filters_on_last_level is ON, the last level which has
+  // most of the keys does not use bloom filters. We end up using
+  // bloom filters in a very small number of cases. Without the flag.
+  // this number would be close to 150000 (all the key at the last level) +
+  // some use in the upper levels
+  //
+  ASSERT_GT(90000, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+
+  for (int i = 0; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "val");
+  }
+}
+
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 32 * 1024;
+  options.target_file_size_base = 32 * 1024;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.max_bytes_for_level_base = 64 * 1024;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 8;
+  options.max_background_flushes = 8;
+  options.statistics = rocksdb::CreateDBStatistics();
+  CreateAndReopenWithCF({"mypikachu"}, options);
+
+  int numkeys = 20000;
+  for (int i = 0; i < numkeys; i++) {
+    ASSERT_OK(Put(1, Key(i), "val"));
+  }
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  for (int i = 0; i < numkeys; i++) {
+    ASSERT_EQ(Get(1, Key(i)), "val");
+  }
+
+  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100);
+  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100);
+  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100);
+
+  ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) +
+                         TestGetTickerCount(options, GET_HIT_L1) +
+                         TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
+
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+  // iter 0 -- zlib
+  // iter 1 -- bzip2
+  // iter 2 -- lz4
+  // iter 3 -- lz4HC
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression,  kLZ4HCCompression};
+  for (int iter = 0; iter < 4; ++iter) {
+    // first_table_version 1 -- generate with table_version == 1, read with
+    // table_version == 2
+    // first_table_version 2 -- generate with table_version == 2, read with
+    // table_version == 1
+    for (int first_table_version = 1; first_table_version <= 2;
+         ++first_table_version) {
+      BlockBasedTableOptions table_options;
+      table_options.format_version = first_table_version;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      Options options = CurrentOptions();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      options.create_if_missing = true;
+      options.compression = compressions[iter];
+      DestroyAndReopen(options);
+
+      int kNumKeysWritten = 100000;
+
+      Random rnd(301);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        // compressible string
+        ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+      }
+
+      table_options.format_version = first_table_version == 1 ? 2 : 1;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      Reopen(options);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        auto r = Get(Key(i));
+        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+      }
+    }
+  }
+}
+
+TEST_F(DBTest, MutexWaitStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  const int64_t kMutexWaitDelay = 100;
+  ThreadStatusUtil::TEST_SetStateDelay(
+      ThreadStatus::STATE_MUTEX_WAIT, kMutexWaitDelay);
+  ASSERT_OK(Put("hello", "rocksdb"));
+  ASSERT_GE(TestGetTickerCount(
+            options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+  ThreadStatusUtil::TEST_SetStateDelay(
+      ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+// This reproduces a bug where we don't delete a file because when it was
+// supposed to be deleted, it was blocked by pending_outputs
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
+// is deleted from obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
+TEST_F(DBTest, DeleteObsoleteFilesPendingOutputs) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 2 * 1024 * 1024;     // 2 MB
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
+  options.max_background_flushes = 2;
+  options.max_background_compactions = 2;
+  Reopen(options);
+
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  SleepingBackgroundTask blocking_thread;
+  port::Mutex mutex_;
+  bool already_blocked(false);
+
+  // block the flush
+  std::function<void()> block_first_time = [&]() {
+    bool blocking = false;
+    {
+      MutexLock l(&mutex_);
+      if (!already_blocked) {
+        blocking = true;
+        already_blocked = true;
+      }
+    }
+    if (blocking) {
+      blocking_thread.DoSleep();
+    }
+  };
+  env_->table_write_callback_ = &block_first_time;
+  // Create 1MB sst file
+  for (int j = 0; j < 256; ++j) {
+    ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024)));
+  }
+  // this should trigger a flush, which is blocked with block_first_time
+  // pending_file is protecting all the files created after
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  auto file_on_L2 = metadata[0].name;
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+  // finish the flush!
+  blocking_thread.WakeUp();
+  blocking_thread.WaitUntilDone();
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0));
+
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 2U);
+
+  // This file should have been deleted
+  ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + file_on_L2));
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_write_buffer_number = 16;
+
+  // Block background threads
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high,
+                 Env::Priority::HIGH);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(dbname_ + "/" + filenames[i]);
+  }
+  env_->DeleteDir(dbname_);
+  DestroyAndReopen(options);
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to level 2
+  // After that, (100K, 200K)
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+  }
+
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  Close();
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Unblock background threads
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  Destroy(options);
+}
+
+class DelayedMergeOperator : public AssociativeMergeOperator {
+ private:
+  DBTest* db_test_;
+
+ public:
+  explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+  virtual bool Merge(const Slice& key, const Slice* existing_value,
+                     const Slice& value, std::string* new_value,
+                     Logger* logger) const override {
+    db_test_->env_->addon_time_ += 1000;
+    return true;
+  }
+
+  virtual const char* Name() const override { return "DelayedMergeOperator"; }
+};
+
+TEST_F(DBTest, MergeTestTime) {
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  // Enable time profiling
+  SetPerfLevel(kEnableTime);
+  this->env_->addon_time_ = 0;
+  Options options;
+  options = CurrentOptions(options);
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  DestroyAndReopen(options);
+
+  ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+  db_->Put(WriteOptions(), "foo", one);
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+  ASSERT_OK(Flush());
+
+  ReadOptions opt;
+  opt.verify_checksums = true;
+  opt.snapshot = nullptr;
+  std::string result;
+  db_->Get(opt, "foo", &result);
+
+  ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 2800000);
+  ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 1200000);
+
+  ReadOptions read_options;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+
+  ASSERT_EQ(1, count);
+
+  ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 6000000);
+  ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 3200000);
+}
+
+TEST_F(DBTest, MergeCompactionTimeTest) {
+  SetPerfLevel(kEnableTime);
+  Options options;
+  options = CurrentOptions(options);
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  options.compaction_style = kCompactionStyleUniversal;
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 1000; i++) {
+    ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+    ASSERT_OK(Flush());
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+}
+
+TEST_F(DBTest, FilterCompactionTimeTest) {
+  Options options;
+  options.compaction_filter_factory =
+      std::make_shared<DelayFilterFactory>(this);
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+  }
+
+  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0);
+  delete itr;
+}
+
+TEST_F(DBTest, TestLogCleanup) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 64 * 1024;  // very small
+  // only two memtables allowed ==> only two log files
+  options.max_write_buffer_number = 2;
+  Reopen(options);
+
+  for (int i = 0; i < 100000; ++i) {
+    Put(Key(i), "val");
+    // only 2 memtables will be alive, so logs_to_free needs to always be below
+    // 2
+    ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+  }
+}
+
+TEST_F(DBTest, EmptyCompactedDB) {
+  Options options;
+  options.max_open_files = -1;
+  options = CurrentOptions(options);
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_TRUE(s.IsNotSupported());
+  Close();
+}
+
+TEST_F(DBTest, CompressLevelCompaction) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  // First two levels have no compression, so that a trivial move between
+  // them will be allowed. Level 2 has Zlib compression so that a trivial
+  // move to level 3 will not be allowed
+  options.compression_per_level = {kNoCompression, kNoCompression,
+                                   kZlibCompression};
+  int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:Matches",
+      [&](void* arg) { matches++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:DidntMatch",
+      [&](void* arg) { didnt_match++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are going to level 0
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to fill up level 0
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+  ASSERT_EQ(matches, 12);
+  // Currently, the test relies on the number of calls to
+  // InputCompressionMatchesOutput() per compaction.
+  const int kCallsToInputCompressionMatch = 2;
+  ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+  ASSERT_EQ(trivial_move, 12);
+  ASSERT_EQ(non_trivial, 8);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  }
+
+  Destroy(options);
+}
+
+TEST_F(DBTest, SuggestCompactRangeTest) {
+  class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+   public:
+    virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& context) override {
+      saved_context = context;
+      std::unique_ptr<CompactionFilter> empty_filter;
+      return empty_filter;
+    }
+    const char* Name() const override {
+      return "CompactionFilterFactoryGetContext";
+    }
+    static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+      return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+          compaction_filter_factory)->saved_context.is_manual_compaction;
+    }
+    CompactionFilter::Context saved_context;
+  };
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.compaction_filter_factory.reset(
+      new CompactionFilterFactoryGetContext());
+  options.write_buffer_size = 110 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_grandparent_overlap_factor = 1 << 20;  // inf
+
+  Reopen(options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < 3; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4", FilesPerLevel(0));
+  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+                   options.compaction_filter_factory.get()));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("2,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("3,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("2,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("3,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4,8", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+  // compact it three times
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+    dbfull()->TEST_WaitForCompact();
+  }
+
+  ASSERT_EQ("0,0,13", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,0,13", FilesPerLevel(0));
+
+  // nonoverlapping with the file on level 0
+  Slice start("a"), end("b");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  dbfull()->TEST_WaitForCompact();
+
+  // should not compact the level 0 file
+  ASSERT_EQ("1,0,13", FilesPerLevel(0));
+
+  start = Slice("j");
+  end = Slice("m");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
+
+  // now it should compact the level 0 file
+  ASSERT_EQ("0,1,13", FilesPerLevel(0));
+}
+
+TEST_F(DBTest, PromoteL0) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
+
+  // non overlapping ranges
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {81, 160}, {0, 80}, {161, 240}, {241, 320}};
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (const auto& range : ranges) {
+    for (int32_t j = range.first; j < range.second; j++) {
+      values[j] = RandomString(&rnd, value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
+
+  // Promote L0 level to L2.
+  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+  // We expect that all the files were trivially moved from L0 to L2
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+
+  for (const auto& kv : values) {
+    ASSERT_EQ(Get(Key(kv.first)), kv.second);
+  }
+}
+
+TEST_F(DBTest, PromoteL0Failure) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
+
+  // Produce two L0 files with overlapping ranges.
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(Put(Key(3), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), ""));
+  ASSERT_OK(Flush());
+
+  Status status;
+  // Fails because L0 has overlapping files.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+  // Now there is a file in L1.
+  ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+
+  ASSERT_OK(Put(Key(5), ""));
+  ASSERT_OK(Flush());
+  // Fails because L1 is non-empty.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, HugeNumberOfLevels) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 2 * 1024 * 1024;         // 2MB
+  options.max_bytes_for_level_base = 2 * 1024 * 1024;  // 2MB
+  options.num_levels = 12;
+  options.max_background_compactions = 10;
+  options.max_bytes_for_level_multiplier = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 300000; ++i) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+  }
+
+  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+}
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+  Options options;
+  options.env = env_;
+  options = CurrentOptions(options);
+  options.write_buffer_size = 100000;  // Small write buffer
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int64_t j = 0;
+  for (int i = 0; i < 5; i++) {
+    for (int pass = 1; pass <= 3; pass++) {
+      WriteBatch batch;
+      size_t write_size = 1024 * 1024 * (5 + i);
+      fprintf(stderr, "prepare: %ld MB, pass:%d\n", (write_size / 1024 / 1024),
+              pass);
+      for (;;) {
+        std::string data(3000, j++ % 127 + 20);
+        data += std::to_string(j);
+        batch.Put(handles_[0], Slice(data), Slice(data));
+        if (batch.GetDataSize() > write_size) {
+          break;
+        }
+      }
+      fprintf(stderr, "write: %ld MB\n", (batch.GetDataSize() / 1024 / 1024));
+      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+      fprintf(stderr, "done\n");
+    }
+  }
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+  // iter == 0 -- leveled
+  // iter == 1 -- leveled, but throw in a flush between two levels compacting
+  // iter == 2 -- universal
+  for (int iter = 0; iter < 3; ++iter) {
+    Options options = CurrentOptions();
+    if (iter < 2) {
+      options.compaction_style = kCompactionStyleLevel;
+    } else {
+      options.compaction_style = kCompactionStyleUniversal;
+    }
+    options.write_buffer_size = 110 << 10;
+    options.level0_file_num_compaction_trigger = 4;
+    options.num_levels = 4;
+    options.compression = kNoCompression;
+    options.max_bytes_for_level_base = 450 << 10;
+    options.target_file_size_base = 98 << 10;
+    options.max_write_buffer_number = 2;
+
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int num = 0; num < 14; num++) {
+      GenerateNewRandomFile(&rnd);
+    }
+
+    if (iter == 1) {
+    rocksdb::SyncPoint::GetInstance()->LoadDependency(
+        {{"DBImpl::RunManualCompaction()::1",
+          "DBTest::FlushesInParallelWithCompactRange:1"},
+         {"DBTest::FlushesInParallelWithCompactRange:2",
+          "DBImpl::RunManualCompaction()::2"}});
+    } else {
+      rocksdb::SyncPoint::GetInstance()->LoadDependency(
+          {{"CompactionJob::Run():Start",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "CompactionJob::Run():End"}});
+    }
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+    std::vector<std::thread> threads;
+    threads.emplace_back([&]() { Compact("a", "z"); });
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // this has to start a flush. if flushes are blocked, this will try to
+    // create
+    // 3 memtables, and that will fail because max_write_buffer_number is 2
+    for (int num = 0; num < 3; num++) {
+      GenerateNewRandomFile(&rnd, /* nowait */ true);
+    }
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+    for (auto& t : threads) {
+      t.join();
+    }
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+// This tests for a bug that could cause two level0 compactions running
+// concurrently
+TEST_F(DBTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 2;
+
+  DestroyAndReopen(options);
+
+  // fill up the DB
+  Random rnd(301);
+  for (int num = 0; num < 10; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  db_->CompactRange(nullptr, nullptr);
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"CompactionJob::Run():Start",
+        "DBTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+       {"DBTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+        "CompactionJob::Run():End"}});
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // trigger L0 compaction
+  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+       num++) {
+    GenerateNewRandomFile(&rnd, /* nowait */ true);
+  }
+
+  TEST_SYNC_POINT("DBTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+  GenerateNewRandomFile(&rnd, /* nowait */ true);
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+       num++) {
+    GenerateNewRandomFile(&rnd, /* nowait */ true);
+  }
+
+  TEST_SYNC_POINT("DBTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
index e53d16d..f0bd9d0 100644
--- a/src/rocksdb/db/dbformat.cc
+++ b/src/rocksdb/db/dbformat.cc
@@ -127,28 +127,8 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
   }
 }
 
-const char* InternalFilterPolicy::Name() const {
-  return user_policy_->Name();
-}
-
-void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
-                                        std::string* dst) const {
-  // We rely on the fact that the code in table.cc does not mind us
-  // adjusting keys[].
-  Slice* mkey = const_cast<Slice*>(keys);
-  for (int i = 0; i < n; i++) {
-    mkey[i] = ExtractUserKey(keys[i]);
-    // TODO(sanjay): Suppress dups?
-  }
-  user_policy_->CreateFilter(keys, n, dst);
-}
-
-bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
-  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
-}
-
-LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
-  size_t usize = user_key.size();
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) {
+  size_t usize = _user_key.size();
   size_t needed = usize + 13;  // A conservative estimate
   char* dst;
   if (needed <= sizeof(space_)) {
@@ -157,9 +137,10 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
     dst = new char[needed];
   }
   start_ = dst;
-  dst = EncodeVarint32(dst, usize + 8);
+  // NOTE: We don't support users keys of more than 2GB :)
+  dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + 8));
   kstart_ = dst;
-  memcpy(dst, user_key.data(), usize);
+  memcpy(dst, _user_key.data(), usize);
   dst += usize;
   EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
   dst += 8;
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
index 1647661..f15a8c0 100644
--- a/src/rocksdb/db/dbformat.h
+++ b/src/rocksdb/db/dbformat.h
@@ -9,6 +9,7 @@
 
 #pragma once
 #include <stdio.h>
+#include <string>
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
 #include "rocksdb/filter_policy.h"
@@ -110,12 +111,11 @@ class InternalKeyComparator : public Comparator {
   }
   virtual ~InternalKeyComparator() {}
 
-  virtual const char* Name() const;
-  virtual int Compare(const Slice& a, const Slice& b) const;
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const;
-  virtual void FindShortSuccessor(std::string* key) const;
+  virtual const char* Name() const override;
+  virtual int Compare(const Slice& a, const Slice& b) const override;
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override;
+  virtual void FindShortSuccessor(std::string* key) const override;
 
   const Comparator* user_comparator() const { return user_comparator_; }
 
@@ -123,17 +123,6 @@ class InternalKeyComparator : public Comparator {
   int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
 };
 
-// Filter policy wrapper that converts from internal keys to user keys
-class InternalFilterPolicy : public FilterPolicy {
- private:
-  const FilterPolicy* const user_policy_;
- public:
-  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
-  virtual const char* Name() const;
-  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
-  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
-};
-
 // Modules in this directory should keep internal keys wrapped inside
 // the following class instead of plain strings so that we do not
 // incorrectly use string comparisons instead of an InternalKeyComparator.
@@ -142,8 +131,27 @@ class InternalKey {
   std::string rep_;
  public:
   InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
-  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
-    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+  }
+
+  // sets the internal key to be bigger or equal to all internal keys with this
+  // user key
+  void SetMaxPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+                                               kValueTypeForSeek));
+  }
+
+  // sets the internal key to be smaller or equal to all internal keys with this
+  // user key
+  void SetMinPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(
+        &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+  }
+
+  bool Valid() const {
+    ParsedInternalKey parsed;
+    return ParseInternalKey(Slice(rep_), &parsed);
   }
 
   void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
@@ -206,18 +214,24 @@ class LookupKey {
  public:
   // Initialize *this for looking up user_key at a snapshot with
   // the specified sequence number.
-  LookupKey(const Slice& user_key, SequenceNumber sequence);
+  LookupKey(const Slice& _user_key, SequenceNumber sequence);
 
   ~LookupKey();
 
   // Return a key suitable for lookup in a MemTable.
-  Slice memtable_key() const { return Slice(start_, end_ - start_); }
+  Slice memtable_key() const {
+    return Slice(start_, static_cast<size_t>(end_ - start_));
+  }
 
   // Return an internal key (suitable for passing to an internal iterator)
-  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+  Slice internal_key() const {
+    return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
+  }
 
   // Return the user key
-  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+  Slice user_key() const {
+    return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
+  }
 
  private:
   // We construct a char array of the form:
@@ -249,26 +263,84 @@ class IterKey {
 
   Slice GetKey() const { return Slice(key_, key_size_); }
 
+  size_t Size() { return key_size_; }
+
   void Clear() { key_size_ = 0; }
 
-  void SetUserKey(const Slice& user_key) {
-    size_t size = user_key.size();
+  // Append "non_shared_data" to its back, from "shared_len"
+  // This function is used in Block::Iter::ParseNextKey
+  // shared_len: bytes in [0, shard_len-1] would be remained
+  // non_shared_data: data to be append, its length must be >= non_shared_len
+  void TrimAppend(const size_t shared_len, const char* non_shared_data,
+                  const size_t non_shared_len) {
+    assert(shared_len <= key_size_);
+
+    size_t total_size = shared_len + non_shared_len;
+    if (total_size <= buf_size_) {
+      key_size_ = total_size;
+    } else {
+      // Need to allocate space, delete previous space
+      char* p = new char[total_size];
+      memcpy(p, key_, shared_len);
+
+      if (key_ != nullptr && key_ != space_) {
+        delete[] key_;
+      }
+
+      key_ = p;
+      key_size_ = total_size;
+      buf_size_ = total_size;
+    }
+
+    memcpy(key_ + shared_len, non_shared_data, non_shared_len);
+  }
+
+  void SetKey(const Slice& key) {
+    size_t size = key.size();
     EnlargeBufferIfNeeded(size);
-    memcpy(key_, user_key.data(), size);
+    memcpy(key_, key.data(), size);
     key_size_ = size;
   }
 
-  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+  void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+                      SequenceNumber s,
                       ValueType value_type = kValueTypeForSeek) {
+    size_t psize = key_prefix.size();
     size_t usize = user_key.size();
-    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
-    memcpy(key_, user_key.data(), usize);
-    EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
-    key_size_ = usize + sizeof(uint64_t);
+    EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t));
+    if (psize > 0) {
+      memcpy(key_, key_prefix.data(), psize);
+    }
+    memcpy(key_ + psize, user_key.data(), usize);
+    EncodeFixed64(key_ + usize + psize, PackSequenceAndType(s, value_type));
+    key_size_ = psize + usize + sizeof(uint64_t);
+  }
+
+  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek) {
+    SetInternalKey(Slice(), user_key, s, value_type);
+  }
+
+  void Reserve(size_t size) {
+    EnlargeBufferIfNeeded(size);
+    key_size_ = size;
   }
 
   void SetInternalKey(const ParsedInternalKey& parsed_key) {
-    SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
+    SetInternalKey(Slice(), parsed_key);
+  }
+
+  void SetInternalKey(const Slice& key_prefix,
+                      const ParsedInternalKey& parsed_key_suffix) {
+    SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+                   parsed_key_suffix.sequence, parsed_key_suffix.type);
+  }
+
+  void EncodeLengthPrefixedKey(const Slice& key) {
+    auto size = key.size();
+    EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+    char* ptr = EncodeVarint32(key_, static_cast<uint32_t>(size));
+    memcpy(ptr, key.data(), size);
   }
 
  private:
@@ -312,19 +384,19 @@ class InternalKeySliceTransform : public SliceTransform {
   explicit InternalKeySliceTransform(const SliceTransform* transform)
       : transform_(transform) {}
 
-  virtual const char* Name() const { return transform_->Name(); }
+  virtual const char* Name() const override { return transform_->Name(); }
 
-  virtual Slice Transform(const Slice& src) const {
+  virtual Slice Transform(const Slice& src) const override {
     auto user_key = ExtractUserKey(src);
     return transform_->Transform(user_key);
   }
 
-  virtual bool InDomain(const Slice& src) const {
+  virtual bool InDomain(const Slice& src) const override {
     auto user_key = ExtractUserKey(src);
     return transform_->InDomain(user_key);
   }
 
-  virtual bool InRange(const Slice& dst) const {
+  virtual bool InRange(const Slice& dst) const override {
     auto user_key = ExtractUserKey(dst);
     return transform_->InRange(user_key);
   }
@@ -337,4 +409,12 @@ class InternalKeySliceTransform : public SliceTransform {
   const SliceTransform* const transform_;
 };
 
+// Read record from a write batch piece from input.
+// tag, column_family, key, value and blob are return values. Callers own the
+// Slice they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                       uint32_t* column_family, Slice* key,
+                                       Slice* value, Slice* blob);
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc
index b520f3c..56e2927 100644
--- a/src/rocksdb/db/dbformat_test.cc
+++ b/src/rocksdb/db/dbformat_test.cc
@@ -49,9 +49,9 @@ static void TestKey(const std::string& key,
   ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
 }
 
-class FormatTest { };
+class FormatTest : public testing::Test {};
 
-TEST(FormatTest, InternalKey_EncodeDecode) {
+TEST_F(FormatTest, InternalKey_EncodeDecode) {
   const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
   const uint64_t seq[] = {
     1, 2, 3,
@@ -67,7 +67,7 @@ TEST(FormatTest, InternalKey_EncodeDecode) {
   }
 }
 
-TEST(FormatTest, InternalKeyShortSeparator) {
+TEST_F(FormatTest, InternalKeyShortSeparator) {
   // When user keys are same
   ASSERT_EQ(IKey("foo", 100, kTypeValue),
             Shorten(IKey("foo", 100, kTypeValue),
@@ -103,15 +103,55 @@ TEST(FormatTest, InternalKeyShortSeparator) {
                     IKey("foo", 200, kTypeValue)));
 }
 
-TEST(FormatTest, InternalKeyShortestSuccessor) {
+TEST_F(FormatTest, InternalKeyShortestSuccessor) {
   ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
             ShortSuccessor(IKey("foo", 100, kTypeValue)));
   ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
             ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
 }
 
+TEST_F(FormatTest, IterKeyOperation) {
+  IterKey k;
+  const char p[] = "abcdefghijklmnopqrstuvwxyz";
+  const char q[] = "0123456789";
+
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string(""));
+
+  k.TrimAppend(0, p, 3);
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string("abc"));
+
+  k.TrimAppend(1, p, 3);
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string("aabc"));
+
+  k.TrimAppend(0, p, 26);
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz"));
+
+  k.TrimAppend(26, q, 10);
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz0123456789"));
+
+  k.TrimAppend(36, q, 1);
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz01234567890"));
+
+  k.TrimAppend(26, q, 1);
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz0"));
+
+  // Size going up, memory allocation is triggered
+  k.TrimAppend(27, p, 26);
+  ASSERT_EQ(std::string(k.GetKey().data(), k.GetKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz0"
+              "abcdefghijklmnopqrstuvwxyz"));
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc
index 14f0324..83d7b0f 100644
--- a/src/rocksdb/db/deletefile_test.cc
+++ b/src/rocksdb/db/deletefile_test.cc
@@ -12,6 +12,7 @@
 #include "db/filename.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 #include "rocksdb/env.h"
@@ -23,7 +24,7 @@
 
 namespace rocksdb {
 
-class DeleteFileTest {
+class DeleteFileTest : public testing::Test {
  public:
   std::string dbname_;
   Options options_;
@@ -34,6 +35,8 @@ class DeleteFileTest {
   DeleteFileTest() {
     db_ = nullptr;
     env_ = Env::Default();
+    options_.enable_thread_tracking = true;
+    options_.max_background_flushes = 0;
     options_.write_buffer_size = 1024*1024*1000;
     options_.target_file_size_base = 1024*1024*1000;
     options_.max_bytes_for_level_base = 1024*1024*1000;
@@ -55,7 +58,7 @@ class DeleteFileTest {
 
     DestroyDB(dbname_, options_);
     numlevels_ = 7;
-    ASSERT_OK(ReopenDB(true));
+    EXPECT_OK(ReopenDB(true));
   }
 
   Status ReopenDB(bool create) {
@@ -77,7 +80,7 @@ class DeleteFileTest {
     options.sync = false;
     ReadOptions roptions;
     for (int i = startkey; i < (numkeys + startkey) ; i++) {
-      std::string temp = std::to_string(i);
+      std::string temp = ToString(i);
       Slice key(temp);
       Slice value(temp);
       ASSERT_OK(db_->Put(options, key, value));
@@ -144,10 +147,9 @@ class DeleteFileTest {
 
 };
 
-TEST(DeleteFileTest, AddKeysAndQueryLevels) {
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
   CreateTwoLevels();
   std::vector<LiveFileMetaData> metadata;
-  std::vector<int> keysinlevel;
   db_->GetLiveFilesMetaData(&metadata);
 
   std::string level1file = "";
@@ -191,7 +193,7 @@ TEST(DeleteFileTest, AddKeysAndQueryLevels) {
   CloseDB();
 }
 
-TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
   CreateTwoLevels();
   // there should be only one (empty) log file because CreateTwoLevels()
   // flushes the memtables to disk
@@ -219,7 +221,7 @@ TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
   CloseDB();
 }
 
-TEST(DeleteFileTest, DeleteFileWithIterator) {
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
   CreateTwoLevels();
   ReadOptions options;
   Iterator* it = db_->NewIterator(options);
@@ -250,7 +252,7 @@ TEST(DeleteFileTest, DeleteFileWithIterator) {
   CloseDB();
 }
 
-TEST(DeleteFileTest, DeleteLogFiles) {
+TEST_F(DeleteFileTest, DeleteLogFiles) {
   AddKeys(10, 0);
   VectorLogPtr logfiles;
   db_->GetSortedWalFiles(logfiles);
@@ -287,9 +289,79 @@ TEST(DeleteFileTest, DeleteLogFiles) {
   CloseDB();
 }
 
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+  CloseDB();
+  DBOptions db_options;
+  db_options.create_if_missing = true;
+  db_options.create_missing_column_families = true;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.emplace_back();
+  column_families.emplace_back("new_cf", ColumnFamilyOptions());
+
+  std::vector<rocksdb::ColumnFamilyHandle*> handles;
+  rocksdb::DB* db;
+  ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db));
+
+  Random rnd(5);
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10),
+                      test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db->Flush(FlushOptions(), handles[1]));
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10),
+                      test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db->Flush(FlushOptions(), handles[1]));
+
+  std::vector<LiveFileMetaData> metadata;
+  db->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_EQ("new_cf", metadata[0].column_family_name);
+  ASSERT_EQ("new_cf", metadata[1].column_family_name);
+  auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument());
+  ASSERT_OK(db->DeleteFile(old_file));
+
+  {
+    std::unique_ptr<Iterator> itr(db->NewIterator(ReadOptions(), handles[1]));
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+
+  delete handles[0];
+  delete handles[1];
+  delete db;
+
+  ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db));
+  {
+    std::unique_ptr<Iterator> itr(db->NewIterator(ReadOptions(), handles[1]));
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+
+  delete handles[0];
+  delete handles[1];
+  delete db;
+}
+
 } //namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
 
diff --git a/src/rocksdb/db/event_logger_helpers.cc b/src/rocksdb/db/event_logger_helpers.cc
new file mode 100644
index 0000000..521b684
--- /dev/null
+++ b/src/rocksdb/db/event_logger_helpers.cc
@@ -0,0 +1,46 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/event_logger_helpers.h"
+
+namespace rocksdb {
+
+namespace {
+inline double SafeDivide(double a, double b) { return b == 0.0 ? 0 : a / b; }
+}  // namespace
+
+void EventLoggerHelpers::LogTableFileCreation(
+    EventLogger* event_logger, int job_id, uint64_t file_number,
+    uint64_t file_size, const TableProperties& table_properties) {
+  auto stream = event_logger->Log();
+  stream << "job" << job_id << "event"
+         << "table_file_creation"
+         << "file_number" << file_number << "file_size" << file_size
+         << "table_properties";
+  stream.StartObject();
+
+  // basic properties:
+  stream << "data_size" << table_properties.data_size
+         << "index_size" << table_properties.index_size
+         << "filter_size" << table_properties.filter_size
+         << "raw_key_size" << table_properties.raw_key_size
+         << "raw_average_key_size" << SafeDivide(table_properties.raw_key_size,
+             table_properties.num_entries)
+         << "raw_value_size" << table_properties.raw_value_size
+         << "raw_average_value_size" << SafeDivide(
+             table_properties.raw_value_size, table_properties.num_entries)
+         << "num_data_blocks" << table_properties.num_data_blocks
+         << "num_entries" << table_properties.num_entries
+         << "filter_policy_name" << table_properties.filter_policy_name;
+
+  // user collected properties
+  for (const auto& prop : table_properties.user_collected_properties) {
+    stream << prop.first << prop.second;
+  }
+
+  stream.EndObject();
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/event_logger_helpers.h b/src/rocksdb/db/event_logger_helpers.h
new file mode 100644
index 0000000..86e9adc
--- /dev/null
+++ b/src/rocksdb/db/event_logger_helpers.h
@@ -0,0 +1,18 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include "util/event_logger.h"
+#include "rocksdb/table_properties.h"
+
+namespace rocksdb {
+
+class EventLoggerHelpers {
+ public:
+  static void LogTableFileCreation(EventLogger* event_logger, int job_id,
+                                   uint64_t file_number, uint64_t file_size,
+                                   const TableProperties& table_properties);
+};
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 0000000..0b5018a
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,51 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl.h"
+
+namespace rocksdb {
+namespace experimental {
+
+#ifndef ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end) {
+  auto dbimpl = dynamic_cast<DBImpl*>(db);
+  if (dbimpl == nullptr) {
+    return Status::InvalidArgument("Didn't recognize DB object");
+  }
+
+  return dbimpl->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+  auto dbimpl = dynamic_cast<DBImpl*>(db);
+  if (dbimpl == nullptr) {
+    return Status::InvalidArgument("Didn't recognize DB object");
+  }
+  return dbimpl->PromoteL0(column_family, target_level);
+}
+
+#else  // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end) {
+  return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+  return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+#endif  // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+  return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
+
+}  // namespace experimental
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 0000000..6926e24
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,804 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include <map>
+#include <set>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "util/logging.h"
+#include "util/mock_env.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+class TestWritableFile;
+class FaultInjectionTestEnv;
+
+namespace {
+
+// Assume a filename, and not a directory name like "/foo/bar/"
+static std::string GetDirName(const std::string filename) {
+  size_t found = filename.find_last_of("/\\");
+  if (found == std::string::npos) {
+    return "";
+  } else {
+    return filename.substr(0, found);
+  }
+}
+
+// Trim the tailing "/" in the end of `str`
+static std::string TrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+static std::pair<std::string, std::string> GetDirAndName(
+    const std::string& name) {
+  std::string dirname = GetDirName(name);
+  std::string fname = name.substr(dirname.size() + 1);
+  return std::make_pair(dirname, fname);
+}
+
+// A basic file truncation function suitable for this test.
+Status Truncate(Env* env, const std::string& filename, uint64_t length) {
+  unique_ptr<SequentialFile> orig_file;
+  const EnvOptions options;
+  Status s = env->NewSequentialFile(filename, &orig_file, options);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+    return s;
+  }
+
+  char* scratch = new char[length];
+  rocksdb::Slice result;
+  s = orig_file->Read(length, &result, scratch);
+  if (s.ok()) {
+    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+    unique_ptr<WritableFile> tmp_file;
+    s = env->NewWritableFile(tmp_name, &tmp_file, options);
+    if (s.ok()) {
+      s = tmp_file->Append(result);
+      if (s.ok()) {
+        s = env->RenameFile(tmp_name, filename);
+      } else {
+        fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(),
+                filename.c_str(), s.ToString().c_str());
+        env->DeleteFile(tmp_name);
+      }
+    }
+  }
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+  }
+
+  delete[] scratch;
+
+  return s;
+}
+
+struct FileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+
+  explicit FileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) { }
+
+  FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  Status DropUnsyncedData(Env* env) const;
+
+  Status DropRandomUnsyncedData(Env* env, Random* rand) const;
+};
+
+}  // anonymous namespace
+
+// A wrapper around WritableFile which informs another Env whenever this file
+// is written to or sync'ed.
+class TestWritableFile : public WritableFile {
+ public:
+  explicit TestWritableFile(const std::string& fname,
+                            unique_ptr<WritableFile>&& f,
+                            FaultInjectionTestEnv* env);
+  virtual ~TestWritableFile();
+  virtual Status Append(const Slice& data) override;
+  virtual Status Close() override;
+  virtual Status Flush() override;
+  virtual Status Sync() override;
+
+ private:
+  FileState state_;
+  unique_ptr<WritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestEnv* env_;
+};
+
+class TestDirectory : public Directory {
+ public:
+  explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname,
+                         Directory* dir)
+      : env_(env), dirname_(dirname), dir_(dir) {}
+  ~TestDirectory() {}
+
+  virtual Status Fsync() override;
+
+ private:
+  FaultInjectionTestEnv* env_;
+  std::string dirname_;
+  unique_ptr<Directory> dir_;
+};
+
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+  explicit FaultInjectionTestEnv(Env* base)
+      : EnvWrapper(base),
+        filesystem_active_(true) {}
+  virtual ~FaultInjectionTestEnv() { }
+
+  Status NewDirectory(const std::string& name,
+                      unique_ptr<Directory>* result) override {
+    unique_ptr<Directory> r;
+    Status s = target()->NewDirectory(name, &r);
+    EXPECT_OK(s);
+    if (!s.ok()) {
+      return s;
+    }
+    result->reset(new TestDirectory(this, TrimDirname(name), r.release()));
+    return Status::OK();
+  }
+
+  Status NewWritableFile(const std::string& fname,
+                         unique_ptr<WritableFile>* result,
+                         const EnvOptions& soptions) override {
+    Status s = target()->NewWritableFile(fname, result, soptions);
+    if (s.ok()) {
+      result->reset(new TestWritableFile(fname, std::move(*result), this));
+      // WritableFile doesn't append to files, so if the same file is opened
+      // again then it will be truncated - so forget our saved state.
+      UntrackFile(fname);
+      MutexLock l(&mutex_);
+      open_files_.insert(fname);
+      auto dir_and_name = GetDirAndName(fname);
+      auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+      list.insert(dir_and_name.second);
+    }
+    return s;
+  }
+
+  virtual Status DeleteFile(const std::string& f) override {
+    Status s = EnvWrapper::DeleteFile(f);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(),
+              s.ToString().c_str());
+    }
+    EXPECT_OK(s);
+    if (s.ok()) {
+      UntrackFile(f);
+    }
+    return s;
+  }
+
+  virtual Status RenameFile(const std::string& s,
+                            const std::string& t) override {
+    Status ret = EnvWrapper::RenameFile(s, t);
+
+    if (ret.ok()) {
+      MutexLock l(&mutex_);
+      if (db_file_state_.find(s) != db_file_state_.end()) {
+        db_file_state_[t] = db_file_state_[s];
+        db_file_state_.erase(s);
+      }
+
+      auto sdn = GetDirAndName(s);
+      auto tdn = GetDirAndName(t);
+      if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) {
+        auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+        assert(tlist.find(tdn.second) == tlist.end());
+        tlist.insert(tdn.second);
+      }
+    }
+
+    return ret;
+  }
+
+  void WritableFileClosed(const FileState& state) {
+    MutexLock l(&mutex_);
+    if (open_files_.find(state.filename_) != open_files_.end()) {
+      db_file_state_[state.filename_] = state;
+      open_files_.erase(state.filename_);
+    }
+  }
+
+  // For every file that is not fully synced, make a call to `func` with
+  // FileState of the file as the parameter.
+  Status DropFileData(std::function<Status(Env*, FileState)> func) {
+    Status s;
+    MutexLock l(&mutex_);
+    for (std::map<std::string, FileState>::const_iterator it =
+             db_file_state_.begin();
+         s.ok() && it != db_file_state_.end(); ++it) {
+      const FileState& state = it->second;
+      if (!state.IsFullySynced()) {
+        s = func(target(), state);
+      }
+    }
+    return s;
+  }
+
+  Status DropUnsyncedFileData() {
+    return DropFileData([&](Env* env, const FileState& state) {
+      return state.DropUnsyncedData(env);
+    });
+  }
+
+  Status DropRandomUnsyncedFileData(Random* rnd) {
+    return DropFileData([&](Env* env, const FileState& state) {
+      return state.DropRandomUnsyncedData(env, rnd);
+    });
+  }
+
+  Status DeleteFilesCreatedAfterLastDirSync() {
+    // Because DeleteFile access this container make a copy to avoid deadlock
+    std::map<std::string, std::set<std::string>> map_copy;
+    {
+      MutexLock l(&mutex_);
+      map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+                      dir_to_new_files_since_last_sync_.end());
+    }
+
+    for (auto& pair : map_copy) {
+      for (std::string name : pair.second) {
+        Status s = DeleteFile(pair.first + "/" + name);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    }
+    return Status::OK();
+  }
+  void ResetState() {
+    MutexLock l(&mutex_);
+    db_file_state_.clear();
+    dir_to_new_files_since_last_sync_.clear();
+    SetFilesystemActiveNoLock(true);
+  }
+
+  void UntrackFile(const std::string& f) {
+    MutexLock l(&mutex_);
+    auto dir_and_name = GetDirAndName(f);
+    dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+        dir_and_name.second);
+    db_file_state_.erase(f);
+    open_files_.erase(f);
+  }
+
+  void SyncDir(const std::string& dirname) {
+    MutexLock l(&mutex_);
+    dir_to_new_files_since_last_sync_.erase(dirname);
+  }
+
+  // Setting the filesystem to inactive is the test equivalent to simulating a
+  // system reset. Setting to inactive will freeze our saved filesystem state so
+  // that it will stop being recorded. It can then be reset back to the state at
+  // the time of the reset.
+  bool IsFilesystemActive() {
+    MutexLock l(&mutex_);
+    return filesystem_active_;
+  }
+  void SetFilesystemActiveNoLock(bool active) { filesystem_active_ = active; }
+  void SetFilesystemActive(bool active) {
+    MutexLock l(&mutex_);
+    SetFilesystemActiveNoLock(active);
+  }
+  void AssertNoOpenFile() { ASSERT_TRUE(open_files_.empty()); }
+
+ private:
+  port::Mutex mutex_;
+  std::map<std::string, FileState> db_file_state_;
+  std::set<std::string> open_files_;
+  std::unordered_map<std::string, std::set<std::string>>
+      dir_to_new_files_since_last_sync_;
+  bool filesystem_active_;  // Record flushes, syncs, writes
+};
+
+Status FileState::DropUnsyncedData(Env* env) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  return Truncate(env, filename_, sync_pos);
+}
+
+Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  assert(pos_ >= sync_pos);
+  int range = static_cast<int>(pos_ - sync_pos);
+  uint64_t truncated_size =
+      static_cast<uint64_t>(sync_pos) + rand->Uniform(range);
+  return Truncate(env, filename_, truncated_size);
+}
+
+Status TestDirectory::Fsync() {
+  env_->SyncDir(dirname_);
+  return dir_->Fsync();
+}
+
+TestWritableFile::TestWritableFile(const std::string& fname,
+                                   unique_ptr<WritableFile>&& f,
+                                   FaultInjectionTestEnv* env)
+      : state_(fname),
+        target_(std::move(f)),
+        writable_file_opened_(true),
+        env_(env) {
+  assert(target_ != nullptr);
+  state_.pos_ = 0;
+}
+
+TestWritableFile::~TestWritableFile() {
+  if (writable_file_opened_) {
+    Close();
+  }
+}
+
+Status TestWritableFile::Append(const Slice& data) {
+  Status s = target_->Append(data);
+  if (s.ok() && env_->IsFilesystemActive()) {
+    state_.pos_ += data.size();
+  }
+  return s;
+}
+
+Status TestWritableFile::Close() {
+  writable_file_opened_ = false;
+  Status s = target_->Close();
+  if (s.ok()) {
+    env_->WritableFileClosed(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Flush() {
+  Status s = target_->Flush();
+  if (s.ok() && env_->IsFilesystemActive()) {
+    state_.pos_at_last_flush_ = state_.pos_;
+  }
+  return s;
+}
+
+Status TestWritableFile::Sync() {
+  if (!env_->IsFilesystemActive()) {
+    return Status::OK();
+  }
+  // No need to actually sync.
+  state_.pos_at_last_sync_ = state_.pos_;
+  return Status::OK();
+}
+
+class FaultInjectionTest : public testing::Test {
+ protected:
+  enum OptionConfig {
+    kDefault,
+    kDifferentDataDir,
+    kWalDir,
+    kSyncWal,
+    kWalDirSyncWal,
+    kMultiLevels,
+    kEnd,
+  };
+  int option_config_;
+  // When need to make sure data is persistent, sync WAL
+  bool sync_use_wal_;
+  // When need to make sure data is persistent, call DB::CompactRange()
+  bool sync_use_compact_;
+
+ protected:
+ public:
+  enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
+  enum ResetMethod {
+    kResetDropUnsyncedData,
+    kResetDropRandomUnsyncedData,
+    kResetDeleteUnsyncedFiles,
+    kResetDropAndDeleteUnsynced
+  };
+
+  std::unique_ptr<Env> base_env_;
+  FaultInjectionTestEnv* env_;
+  std::string dbname_;
+  shared_ptr<Cache> tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  FaultInjectionTest()
+      : option_config_(kDefault),
+        sync_use_wal_(false),
+        sync_use_compact_(true),
+        base_env_(nullptr),
+        env_(NULL),
+        db_(NULL) {
+  }
+
+  bool ChangeOptions() {
+    option_config_++;
+    if (option_config_ >= kEnd) {
+      return false;
+    } else {
+      if (option_config_ == kMultiLevels) {
+        base_env_.reset(new MockEnv(Env::Default()));
+      }
+      return true;
+    }
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    sync_use_wal_ = false;
+    sync_use_compact_ = true;
+    Options options;
+    switch (option_config_) {
+      case kWalDir:
+        options.wal_dir = test::TmpDir(env_) + "/fault_test_wal";
+        break;
+      case kDifferentDataDir:
+        options.db_paths.emplace_back(test::TmpDir(env_) + "/fault_test_data",
+                                      1000000U);
+        break;
+      case kSyncWal:
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      case kWalDirSyncWal:
+        options.wal_dir = test::TmpDir(env_) + "/fault_test_wal";
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      case kMultiLevels:
+        options.write_buffer_size = 64 * 1024;
+        options.target_file_size_base = 64 * 1024;
+        options.level0_file_num_compaction_trigger = 2;
+        options.level0_slowdown_writes_trigger = 2;
+        options.level0_stop_writes_trigger = 4;
+        options.max_bytes_for_level_base = 128 * 1024;
+        options.max_write_buffer_number = 2;
+        options.max_background_compactions = 8;
+        options.max_background_flushes = 8;
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      default:
+        break;
+    }
+    return options;
+  }
+
+  Status NewDB() {
+    assert(db_ == NULL);
+    assert(tiny_cache_ == nullptr);
+    assert(env_ == NULL);
+
+    env_ =
+        new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default());
+
+    options_ = CurrentOptions();
+    options_.env = env_;
+    options_.paranoid_checks = true;
+
+    BlockBasedTableOptions table_options;
+    tiny_cache_ = NewLRUCache(100);
+    table_options.block_cache = tiny_cache_;
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    dbname_ = test::TmpDir() + "/fault_test";
+
+    EXPECT_OK(DestroyDB(dbname_, options_));
+
+    options_.create_if_missing = true;
+    Status s = OpenDB();
+    options_.create_if_missing = false;
+    return s;
+  }
+
+  void SetUp() override { ASSERT_OK(NewDB()); }
+
+  void TearDown() override {
+    CloseDB();
+
+    Status s = DestroyDB(dbname_, options_);
+
+    delete env_;
+    env_ = NULL;
+
+    tiny_cache_.reset();
+
+    ASSERT_OK(s);
+  }
+
+  void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = start_idx; i < start_idx + num_vals; i++) {
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(write_options, &batch));
+    }
+  }
+
+  Status ReadValue(int i, std::string* val) const {
+    std::string key_space, value_space;
+    Slice key = Key(i, &key_space);
+    Value(i, &value_space);
+    ReadOptions options;
+    return db_->Get(options, key, val);
+  }
+
+  Status Verify(int start_idx, int num_vals,
+                ExpectedVerifResult expected) const {
+    std::string val;
+    std::string value_space;
+    Status s;
+    for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+      Value(i, &value_space);
+      s = ReadValue(i, &val);
+      if (s.ok()) {
+        EXPECT_EQ(value_space, val);
+      }
+      if (expected == kValExpectFound) {
+        if (!s.ok()) {
+          fprintf(stderr, "Error when read %dth record (expect found): %s\n", i,
+                  s.ToString().c_str());
+          return s;
+        }
+      } else if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "Error when read %dth record: %s\n", i,
+                s.ToString().c_str());
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) const {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) const {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+
+  Status OpenDB() {
+    delete db_;
+    db_ = NULL;
+    env_->ResetState();
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+    db_ = NULL;
+  }
+
+  void DeleteAllData() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    WriteOptions options;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+    }
+
+    delete iter;
+
+    FlushOptions flush_options;
+    flush_options.wait = true;
+    db_->Flush(flush_options);
+  }
+
+  // rnd cannot be null for kResetDropRandomUnsyncedData
+  void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
+    env_->AssertNoOpenFile();
+    switch (reset_method) {
+      case kResetDropUnsyncedData:
+        ASSERT_OK(env_->DropUnsyncedFileData());
+        break;
+      case kResetDropRandomUnsyncedData:
+        ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
+        break;
+      case kResetDeleteUnsyncedFiles:
+        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+        break;
+      case kResetDropAndDeleteUnsynced:
+        ASSERT_OK(env_->DropUnsyncedFileData());
+        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+    DeleteAllData();
+
+    WriteOptions write_options;
+    write_options.sync = sync_use_wal_;
+
+    Build(write_options, 0, num_pre_sync);
+    if (sync_use_compact_) {
+      db_->CompactRange(nullptr, nullptr);
+    }
+    write_options.sync = false;
+    Build(write_options, num_pre_sync, num_post_sync);
+  }
+
+  void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+                                         int num_pre_sync, int num_post_sync,
+                                         Random* rnd = nullptr) {
+    env_->SetFilesystemActive(false);
+    CloseDB();
+    ResetDBState(reset_method, rnd);
+    ASSERT_OK(OpenDB());
+    ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+    ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+                     FaultInjectionTest::kValExpectNoError));
+  }
+
+  void NoWriteTestPreFault() {
+  }
+
+  void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+    CloseDB();
+    ResetDBState(reset_method);
+    ASSERT_OK(OpenDB());
+  }
+};
+
+TEST_F(FaultInjectionTest, FaultTest) {
+  do {
+    Random rnd(301);
+
+    for (size_t idx = 0; idx < kNumIterations; idx++) {
+      int num_pre_sync = rnd.Uniform(kMaxNumValues);
+      int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
+                                        num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
+                                        num_pre_sync, num_post_sync, &rnd);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      // Setting a separate data path won't pass the test as we don't sync
+      // it after creating new files.
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
+                                        num_pre_sync, num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files created so we expect all values since no files will be
+      // dropped.
+      PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
+                                        num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
+    }
+  } while (ChangeOptions());
+}
+
+class SleepingBackgroundTask {
+ public:
+  SleepingBackgroundTask()
+      : bg_cv_(&mutex_), should_sleep_(true), done_with_sleep_(false) {}
+  void DoSleep() {
+    MutexLock l(&mutex_);
+    while (should_sleep_) {
+      bg_cv_.Wait();
+    }
+    done_with_sleep_ = true;
+    bg_cv_.SignalAll();
+  }
+  void WakeUp() {
+    MutexLock l(&mutex_);
+    should_sleep_ = false;
+    bg_cv_.SignalAll();
+    while (!done_with_sleep_) {
+      bg_cv_.Wait();
+    }
+  }
+
+  static void DoSleepTask(void* arg) {
+    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
+  }
+
+ private:
+  port::Mutex mutex_;
+  port::CondVar bg_cv_;  // Signalled when background work finishes
+  bool should_sleep_;
+  bool done_with_sleep_;
+};
+
+// Disable the test because it is not passing.
+// Previous log file is not fsynced if sync is forced after log rolling.
+// TODO(FB internal task#6730880) Fix the bug
+TEST_F(FaultInjectionTest, DISABLED_WriteOptionSyncTest) {
+  SleepingBackgroundTask sleeping_task_low;
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  // Block the job queue to prevent flush job from running.
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::HIGH);
+
+  WriteOptions write_options;
+  write_options.sync = false;
+
+  std::string key_space, value_space;
+  ASSERT_OK(
+      db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(db_->Flush(flush_options));
+  write_options.sync = true;
+  ASSERT_OK(
+      db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+
+  env_->SetFilesystemActive(false);
+  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+  sleeping_task_low.WakeUp();
+
+  ASSERT_OK(OpenDB());
+  std::string val;
+  Value(2, &value_space);
+  ASSERT_OK(ReadValue(2, &val));
+  ASSERT_EQ(value_space, val);
+
+  Value(1, &value_space);
+  ASSERT_OK(ReadValue(1, &val));
+  ASSERT_EQ(value_space, val);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc
index 2de7660..222cca9 100644
--- a/src/rocksdb/db/file_indexer.cc
+++ b/src/rocksdb/db/file_indexer.cc
@@ -14,26 +14,22 @@
 
 namespace rocksdb {
 
-FileIndexer::FileIndexer(const uint32_t num_levels,
-                         const Comparator* ucmp)
-  : num_levels_(num_levels),
-    ucmp_(ucmp),
-    next_level_index_(num_levels),
-    level_rb_(num_levels, -1) {
-}
-
+FileIndexer::FileIndexer(const Comparator* ucmp)
+    : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}
 
-uint32_t FileIndexer::NumLevelIndex() {
-  return next_level_index_.size();
-}
+size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }
 
-uint32_t FileIndexer::LevelIndexSize(uint32_t level) {
-  return next_level_index_[level].size();
+size_t FileIndexer::LevelIndexSize(size_t level) const {
+  if (level >= next_level_index_.size()) {
+    return 0;
+  }
+  return next_level_index_[level].num_index;
 }
 
-void FileIndexer::GetNextLevelIndex(
-    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
-    const int cmp_largest, int32_t* left_bound, int32_t* right_bound) {
+void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
+                                    const int cmp_smallest,
+                                    const int cmp_largest, int32_t* left_bound,
+                                    int32_t* right_bound) const {
   assert(level > 0);
 
   // Last level, no hint
@@ -46,11 +42,13 @@ void FileIndexer::GetNextLevelIndex(
   assert(level < num_levels_ - 1);
   assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
 
-  const auto& index = next_level_index_[level][file_index];
+  const IndexUnit* index_units = next_level_index_[level].index_units;
+  const auto& index = index_units[file_index];
 
   if (cmp_smallest < 0) {
-    *left_bound = (level > 0 && file_index > 0) ?
-      next_level_index_[level][file_index - 1].largest_lb : 0;
+    *left_bound = (level > 0 && file_index > 0)
+                      ? index_units[file_index - 1].largest_lb
+                      : 0;
     *right_bound = index.smallest_rb;
   } else if (cmp_smallest == 0) {
     *left_bound = index.smallest_lb;
@@ -73,75 +71,86 @@ void FileIndexer::GetNextLevelIndex(
   assert(*right_bound <= level_rb_[level + 1]);
 }
 
-void FileIndexer::ClearIndex() {
-  for (uint32_t level = 1; level < num_levels_; ++level) {
-    next_level_index_[level].clear();
-  }
-}
-
-void FileIndexer::UpdateIndex(std::vector<FileMetaData*>* const files) {
+void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
+                              std::vector<FileMetaData*>* const files) {
   if (files == nullptr) {
     return;
   }
+  if (num_levels == 0) {  // unsigned (0 - 1) would underflow and cause bad behavior
+    num_levels_ = num_levels;
+    return;
+  }
+  assert(level_rb_ == nullptr);  // level_rb_ should be init here
+
+  num_levels_ = num_levels;
+  next_level_index_.resize(num_levels);
+
+  char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t));
+  level_rb_ = new (mem) int32_t[num_levels_];
+  for (size_t i = 0; i < num_levels_; i++) {
+    level_rb_[i] = -1;
+  }
 
   // L1 - Ln-1
-  for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
+  for (size_t level = 1; level < num_levels_ - 1; ++level) {
     const auto& upper_files = files[level];
-    const int32_t upper_size = upper_files.size();
+    const int32_t upper_size = static_cast<int32_t>(upper_files.size());
     const auto& lower_files = files[level + 1];
-    level_rb_[level] = upper_files.size() - 1;
+    level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
     if (upper_size == 0) {
       continue;
     }
-    auto& index = next_level_index_[level];
-    index.resize(upper_size);
-
-    CalculateLB(upper_files, lower_files, &index,
-        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+    IndexLevel& index_level = next_level_index_[level];
+    index_level.num_index = upper_size;
+    mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
+    index_level.index_units = new (mem) IndexUnit[upper_size];
+
+    CalculateLB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData * a, const FileMetaData * b)->int {
           return ucmp_->Compare(a->smallest.user_key(), b->largest.user_key());
         },
-        [](IndexUnit* index, int32_t f_idx) {
-          index->smallest_lb = f_idx;
-        });
-    CalculateLB(upper_files, lower_files, &index,
-        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+        [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; });
+    CalculateLB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData * a, const FileMetaData * b)->int {
           return ucmp_->Compare(a->largest.user_key(), b->largest.user_key());
         },
-        [](IndexUnit* index, int32_t f_idx) {
-          index->largest_lb = f_idx;
-        });
-    CalculateRB(upper_files, lower_files, &index,
-        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+        [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; });
+    CalculateRB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData * a, const FileMetaData * b)->int {
           return ucmp_->Compare(a->smallest.user_key(), b->smallest.user_key());
         },
-        [](IndexUnit* index, int32_t f_idx) {
-          index->smallest_rb = f_idx;
-        });
-    CalculateRB(upper_files, lower_files, &index,
-        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+        [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; });
+    CalculateRB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData * a, const FileMetaData * b)->int {
           return ucmp_->Compare(a->largest.user_key(), b->smallest.user_key());
         },
-        [](IndexUnit* index, int32_t f_idx) {
-          index->largest_rb = f_idx;
-        });
+        [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
   }
-  level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
+
+  level_rb_[num_levels_ - 1] =
+      static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
 }
 
-void FileIndexer::CalculateLB(const std::vector<FileMetaData*>& upper_files,
-    const std::vector<FileMetaData*>& lower_files,
-    std::vector<IndexUnit>* index,
+void FileIndexer::CalculateLB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
     std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
     std::function<void(IndexUnit*, int32_t)> set_index) {
-  const int32_t upper_size = upper_files.size();
-  const int32_t lower_size = lower_files.size();
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
   int32_t upper_idx = 0;
   int32_t lower_idx = 0;
+
+  IndexUnit* index = index_level->index_units;
   while (upper_idx < upper_size && lower_idx < lower_size) {
     int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
 
     if (cmp == 0) {
-      set_index(&(*index)[upper_idx], lower_idx);
+      set_index(&index[upper_idx], lower_idx);
       ++upper_idx;
       ++lower_idx;
     } else if (cmp > 0) {
@@ -151,7 +160,7 @@ void FileIndexer::CalculateLB(const std::vector<FileMetaData*>& upper_files,
     } else {
       // Lower level's file becomes larger, update the index, and
       // move to the next upper file
-      set_index(&(*index)[upper_idx], lower_idx);
+      set_index(&index[upper_idx], lower_idx);
       ++upper_idx;
     }
   }
@@ -159,25 +168,27 @@ void FileIndexer::CalculateLB(const std::vector<FileMetaData*>& upper_files,
   while (upper_idx < upper_size) {
     // Lower files are exhausted, that means the remaining upper files are
     // greater than any lower files. Set the index to be the lower level size.
-    set_index(&(*index)[upper_idx], lower_size);
+    set_index(&index[upper_idx], lower_size);
     ++upper_idx;
   }
 }
 
-void FileIndexer::CalculateRB(const std::vector<FileMetaData*>& upper_files,
-    const std::vector<FileMetaData*>& lower_files,
-    std::vector<IndexUnit>* index,
+void FileIndexer::CalculateRB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
     std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
     std::function<void(IndexUnit*, int32_t)> set_index) {
-  const int32_t upper_size = upper_files.size();
-  const int32_t lower_size = lower_files.size();
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
   int32_t upper_idx = upper_size - 1;
   int32_t lower_idx = lower_size - 1;
+
+  IndexUnit* index = index_level->index_units;
   while (upper_idx >= 0 && lower_idx >= 0) {
     int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
 
     if (cmp == 0) {
-      set_index(&(*index)[upper_idx], lower_idx);
+      set_index(&index[upper_idx], lower_idx);
       --upper_idx;
       --lower_idx;
     } else if (cmp < 0) {
@@ -187,14 +198,14 @@ void FileIndexer::CalculateRB(const std::vector<FileMetaData*>& upper_files,
     } else {
       // Lower level's file becomes smaller, update the index, and move to
       // the next the upper file
-      set_index(&(*index)[upper_idx], lower_idx);
+      set_index(&index[upper_idx], lower_idx);
       --upper_idx;
     }
   }
   while (upper_idx >= 0) {
     // Lower files are exhausted, that means the remaining upper files are
     // smaller than any lower files. Set it to -1.
-    set_index(&(*index)[upper_idx], -1);
+    set_index(&index[upper_idx], -1);
     --upper_idx;
   }
 }
diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h
index 5e405df..e673499 100644
--- a/src/rocksdb/db/file_indexer.h
+++ b/src/rocksdb/db/file_indexer.h
@@ -12,11 +12,15 @@
 #include <functional>
 #include <limits>
 #include <vector>
+#include "util/arena.h"
+#include "util/autovector.h"
 
 namespace rocksdb {
 
 class Comparator;
 struct FileMetaData;
+struct FdWithKeyRange;
+struct FileLevel;
 
 // The file tree structure in Version is prebuilt and the range of each file
 // is known. On Version::Get(), it uses binary search to find a potential file
@@ -36,30 +40,29 @@ struct FileMetaData;
 // naive approach.
 class FileIndexer {
  public:
-  FileIndexer(const uint32_t num_levels, const Comparator* ucmp);
+  explicit FileIndexer(const Comparator* ucmp);
 
-  uint32_t NumLevelIndex();
+  size_t NumLevelIndex() const;
 
-  uint32_t LevelIndexSize(uint32_t level);
+  size_t LevelIndexSize(size_t level) const;
 
   // Return a file index range in the next level to search for a key based on
   // smallest and largest key comparision for the current file specified by
   // level and file_index. When *left_index < *right_index, both index should
   // be valid and fit in the vector size.
-  void GetNextLevelIndex(
-    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
-    const int cmp_largest, int32_t* left_bound, int32_t* right_bound);
+  void GetNextLevelIndex(const size_t level, const size_t file_index,
+                         const int cmp_smallest, const int cmp_largest,
+                         int32_t* left_bound, int32_t* right_bound) const;
 
-  void ClearIndex();
-
-  void UpdateIndex(std::vector<FileMetaData*>* const files);
+  void UpdateIndex(Arena* arena, const size_t num_levels,
+                   std::vector<FileMetaData*>* const files);
 
   enum {
     kLevelMaxIndex = std::numeric_limits<int32_t>::max()
   };
 
  private:
-  const uint32_t num_levels_;
+  size_t num_levels_;
   const Comparator* ucmp_;
 
   struct IndexUnit {
@@ -110,20 +113,28 @@ class FileIndexer {
     int32_t largest_rb;
   };
 
-  void CalculateLB(const std::vector<FileMetaData*>& upper_files,
-    const std::vector<FileMetaData*>& lower_files,
-    std::vector<IndexUnit>* index,
-    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
-    std::function<void(IndexUnit*, int32_t)> set_index);
+  // Data structure to store IndexUnits in a whole level
+  struct IndexLevel {
+    size_t num_index;
+    IndexUnit* index_units;
+
+    IndexLevel() : num_index(0), index_units(nullptr) {}
+  };
+
+  void CalculateLB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
 
-  void CalculateRB(const std::vector<FileMetaData*>& upper_files,
-    const std::vector<FileMetaData*>& lower_files,
-    std::vector<IndexUnit>* index,
-    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
-    std::function<void(IndexUnit*, int32_t)> set_index);
+  void CalculateRB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
 
-  std::vector<std::vector<IndexUnit>> next_level_index_;
-  std::vector<int32_t> level_rb_;
+  autovector<IndexLevel> next_level_index_;
+  int32_t* level_rb_;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc
index 14d67f4..98fea47 100644
--- a/src/rocksdb/db/file_indexer_test.cc
+++ b/src/rocksdb/db/file_indexer_test.cc
@@ -11,6 +11,7 @@
 #include "db/file_indexer.h"
 #include "db/dbformat.h"
 #include "db/version_edit.h"
+#include "port/stack_trace.h"
 #include "rocksdb/comparator.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -19,32 +20,35 @@ namespace rocksdb {
 
 class IntComparator : public Comparator {
  public:
-  int Compare(const Slice& a, const Slice& b) const {
+  int Compare(const Slice& a, const Slice& b) const override {
     assert(a.size() == 8);
     assert(b.size() == 8);
-    return *reinterpret_cast<const int64_t*>(a.data()) -
-      *reinterpret_cast<const int64_t*>(b.data());
+    int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
+                   *reinterpret_cast<const int64_t*>(b.data());
+    if (diff < 0) {
+      return -1;
+    } else if (diff == 0) {
+      return 0;
+    } else {
+      return 1;
+    }
   }
 
-  const char* Name() const {
-    return "IntComparator";
-  }
+  const char* Name() const override { return "IntComparator"; }
 
-  void FindShortestSeparator(std::string* start, const Slice& limit) const {}
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {}
 
-  void FindShortSuccessor(std::string* key) const {}
+  void FindShortSuccessor(std::string* key) const override {}
 };
 
-
-struct FileIndexerTest {
+class FileIndexerTest : public testing::Test {
  public:
-  FileIndexerTest() :
-    kNumLevels(4), indexer(kNumLevels, &ucmp),
-    files(new std::vector<FileMetaData*>[kNumLevels]) {
-  }
+  FileIndexerTest()
+      : kNumLevels(4), files(new std::vector<FileMetaData*>[kNumLevels]) {}
 
   ~FileIndexerTest() {
-    Reset();
+    ClearFiles();
     delete[] files;
   }
 
@@ -59,14 +63,13 @@ struct FileIndexerTest {
     return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
   }
 
-  void Reset() {
+  void ClearFiles() {
     for (uint32_t i = 0; i < kNumLevels; ++i) {
       for (auto* f : files[i]) {
         delete f;
       }
       files[i].clear();
     }
-    indexer.ClearIndex();
   }
 
   void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
@@ -74,23 +77,31 @@ struct FileIndexerTest {
       int32_t* right_index) {
     *left_index = 100;
     *right_index = 100;
-    indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
-        left_index, right_index);
+    indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+                               left_index, right_index);
   }
 
+  int32_t left = 100;
+  int32_t right = 100;
   const uint32_t kNumLevels;
   IntComparator ucmp;
-  FileIndexer indexer;
+  FileIndexer* indexer;
 
   std::vector<FileMetaData*>* files;
 };
 
-TEST(FileIndexerTest, next_level_hint) {
-  for (uint32_t i = 0; i < kNumLevels; ++i) {
-    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
-  }
+// Case 0: Empty
+TEST_F(FileIndexerTest, Empty) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
+  indexer->UpdateIndex(&arena, 0, files);
+  delete indexer;
+}
 
-  // Case 1: no overlap, files are on the left of next level files
+// Case 1: no overlap, files are on the left of next level files
+TEST_F(FileIndexerTest, no_overlap_left) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
   // level 1
   AddFile(1, 100, 200);
   AddFile(1, 300, 400);
@@ -103,9 +114,7 @@ TEST(FileIndexerTest, next_level_hint) {
   AddFile(3, 2500, 2600);
   AddFile(3, 2601, 2699);
   AddFile(3, 2700, 2800);
-  indexer.UpdateIndex(files);
-  int32_t left = 100;
-  int32_t right = 100;
+  indexer->UpdateIndex(&arena, kNumLevels, files);
   for (uint32_t level = 1; level < 3; ++level) {
     for (uint32_t f = 0; f < 3; ++f) {
       GetNextLevelIndex(level, f, -1, -1, &left, &right);
@@ -125,12 +134,14 @@ TEST(FileIndexerTest, next_level_hint) {
       ASSERT_EQ(2, right);
     }
   }
+  delete indexer;
+  ClearFiles();
+}
 
-  // Case 2: no overlap, files are on the right of next level files
-  Reset();
-  for (uint32_t i = 1; i < kNumLevels; ++i) {
-    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
-  }
+// Case 2: no overlap, files are on the right of next level files
+TEST_F(FileIndexerTest, no_overlap_right) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
   // level 1
   AddFile(1, 2100, 2200);
   AddFile(1, 2300, 2400);
@@ -143,7 +154,7 @@ TEST(FileIndexerTest, next_level_hint) {
   AddFile(3, 500, 600);
   AddFile(3, 501, 699);
   AddFile(3, 700, 800);
-  indexer.UpdateIndex(files);
+  indexer->UpdateIndex(&arena, kNumLevels, files);
   for (uint32_t level = 1; level < 3; ++level) {
     for (uint32_t f = 0; f < 3; ++f) {
       GetNextLevelIndex(level, f, -1, -1, &left, &right);
@@ -166,11 +177,15 @@ TEST(FileIndexerTest, next_level_hint) {
       ASSERT_EQ(2, right);
     }
   }
+  delete indexer;
+}
 
-  // Case 3: empty L2
-  Reset();
+// Case 3: empty L2
+TEST_F(FileIndexerTest, empty_L2) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
   for (uint32_t i = 1; i < kNumLevels; ++i) {
-    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
+    ASSERT_EQ(0U, indexer->LevelIndexSize(i));
   }
   // level 1
   AddFile(1, 2100, 2200);
@@ -180,7 +195,7 @@ TEST(FileIndexerTest, next_level_hint) {
   AddFile(3, 500, 600);
   AddFile(3, 501, 699);
   AddFile(3, 700, 800);
-  indexer.UpdateIndex(files);
+  indexer->UpdateIndex(&arena, kNumLevels, files);
   for (uint32_t f = 0; f < 3; ++f) {
     GetNextLevelIndex(1, f, -1, -1, &left, &right);
     ASSERT_EQ(0, left);
@@ -201,13 +216,14 @@ TEST(FileIndexerTest, next_level_hint) {
     ASSERT_EQ(0, left);
     ASSERT_EQ(-1, right);
   }
+  delete indexer;
+  ClearFiles();
+}
 
-
-  // Case 4: mixed
-  Reset();
-  for (uint32_t i = 1; i < kNumLevels; ++i) {
-    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
-  }
+// Case 4: mixed
+TEST_F(FileIndexerTest, mixed) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
   // level 1
   AddFile(1, 100, 200);
   AddFile(1, 250, 400);
@@ -222,7 +238,7 @@ TEST(FileIndexerTest, next_level_hint) {
   AddFile(3, 0, 50);
   AddFile(3, 100, 200);
   AddFile(3, 201, 250);
-  indexer.UpdateIndex(files);
+  indexer->UpdateIndex(&arena, kNumLevels, files);
   // level 1, 0
   GetNextLevelIndex(1, 0, -1, -1, &left, &right);
   ASSERT_EQ(0, left);
@@ -321,10 +337,14 @@ TEST(FileIndexerTest, next_level_hint) {
     ASSERT_EQ(3, left);
     ASSERT_EQ(2, right);
   }
+  delete indexer;
+  ClearFiles();
 }
 
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/filename.cc b/src/rocksdb/db/filename.cc
index 4b3ac8e..160005d 100644
--- a/src/rocksdb/db/filename.cc
+++ b/src/rocksdb/db/filename.cc
@@ -6,26 +6,34 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
 
 #include "db/filename.h"
+#include <inttypes.h>
 
 #include <ctype.h>
 #include <stdio.h>
+#include <vector>
 #include "db/dbformat.h"
 #include "rocksdb/env.h"
 #include "util/logging.h"
+#include "util/stop_watch.h"
 
 namespace rocksdb {
 
 // Given a path, flatten the path name by replacing all chars not in
-// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
+// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end.
 // Return the number of chars stored in dest not including the trailing '\0'.
-static int FlattenPath(const std::string& path, char* dest, int len) {
-  int write_idx = 0;
-  int i = 0;
-  int src_len = path.size();
+static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) {
+  const char suffix[] = "_LOG";
 
-  while (i < src_len && write_idx < len - 1) {
+  size_t write_idx = 0;
+  size_t i = 0;
+  size_t src_len = path.size();
+
+  while (i < src_len && write_idx < len - sizeof(suffix)) {
     if ((path[i] >= 'a' && path[i] <= 'z') ||
         (path[i] >= '0' && path[i] <= '9') ||
         (path[i] >= 'A' && path[i] <= 'Z') ||
@@ -39,8 +47,10 @@ static int FlattenPath(const std::string& path, char* dest, int len) {
     }
     i++;
   }
-
-  dest[write_idx] = '\0';
+  assert(sizeof(suffix) <= len - write_idx);
+  // "\0" is automatically added by snprintf
+  snprintf(dest + write_idx, len - write_idx, suffix);
+  write_idx += sizeof(suffix) - 1;
   return write_idx;
 }
 
@@ -66,9 +76,45 @@ std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
   return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
 }
 
-std::string TableFileName(const std::string& name, uint64_t number) {
+std::string MakeTableFileName(const std::string& path, uint64_t number) {
+  return MakeFileName(path, number, "sst");
+}
+
+uint64_t TableFileNameToNumber(const std::string& name) {
+  uint64_t number = 0;
+  uint64_t base = 1;
+  int pos = static_cast<int>(name.find_last_of('.'));
+  while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
+    number += (name[pos] - '0') * base;
+    base *= 10;
+  }
+  return number;
+}
+
+std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
+                          uint32_t path_id) {
   assert(number > 0);
-  return MakeFileName(name, number, "sst");
+  std::string path;
+  if (path_id >= db_paths.size()) {
+    path = db_paths.back().path;
+  } else {
+    path = db_paths[path_id].path;
+  }
+  return MakeTableFileName(path, number);
+}
+
+const size_t kFormatFileNumberBufSize = 38;
+
+void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+                      size_t out_buf_size) {
+  if (path_id == 0) {
+    snprintf(out_buf, out_buf_size, "%" PRIu64, number);
+  } else {
+    snprintf(out_buf, out_buf_size, "%" PRIu64
+                                    "(path "
+                                    "%" PRIu32 ")",
+             number, path_id);
+  }
 }
 
 std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
@@ -91,14 +137,26 @@ std::string TempFileName(const std::string& dbname, uint64_t number) {
   return MakeFileName(dbname, number, "dbtmp");
 }
 
+InfoLogPrefix::InfoLogPrefix(bool has_log_dir,
+                             const std::string& db_absolute_path) {
+  if (!has_log_dir) {
+    const char kInfoLogPrefix[] = "LOG";
+    // "\0" is automatically added to the end
+    snprintf(buf, sizeof(buf), kInfoLogPrefix);
+    prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1);
+  } else {
+    size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf));
+    prefix = Slice(buf, len);
+  }
+}
+
 std::string InfoLogFileName(const std::string& dbname,
     const std::string& db_path, const std::string& log_dir) {
   if (log_dir.empty())
     return dbname + "/LOG";
 
-  char flatten_db_path[256];
-  FlattenPath(db_path, flatten_db_path, 256);
-  return log_dir + "/" + flatten_db_path + "_LOG";
+  InfoLogPrefix info_log_prefix(true, db_path);
+  return log_dir + "/" + info_log_prefix.buf;
 }
 
 // Return the name of the old info log file for "dbname".
@@ -110,9 +168,8 @@ std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
   if (log_dir.empty())
     return dbname + "/LOG.old." + buf;
 
-  char flatten_db_path[256];
-  FlattenPath(db_path, flatten_db_path, 256);
-  return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
+  InfoLogPrefix info_log_prefix(true, db_path);
+  return log_dir + "/" + info_log_prefix.buf + ".old." + buf;
 }
 
 std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
@@ -130,8 +187,8 @@ std::string IdentityFileName(const std::string& dbname) {
 //    dbname/IDENTITY
 //    dbname/CURRENT
 //    dbname/LOCK
-//    dbname/LOG
-//    dbname/LOG.old.[0-9]+
+//    dbname/<info_log_name_prefix>
+//    dbname/<info_log_name_prefix>.old.[0-9]+
 //    dbname/MANIFEST-[0-9]+
 //    dbname/[0-9]+.(log|sst)
 //    dbname/METADB-[0-9]+
@@ -140,6 +197,12 @@ bool ParseFileName(const std::string& fname,
                    uint64_t* number,
                    FileType* type,
                    WalFileType* log_type) {
+  return ParseFileName(fname, number, "", type, log_type);
+}
+
+bool ParseFileName(const std::string& fname, uint64_t* number,
+                   const Slice& info_log_name_prefix, FileType* type,
+                   WalFileType* log_type) {
   Slice rest(fname);
   if (fname.length() > 1 && fname[0] == '/') {
     rest.remove_prefix(1);
@@ -153,18 +216,22 @@ bool ParseFileName(const std::string& fname,
   } else if (rest == "LOCK") {
     *number = 0;
     *type = kDBLockFile;
-  } else if (rest == "LOG" || rest == "LOG.old") {
-    *number = 0;
-    *type = kInfoLogFile;
-  } else if (rest.starts_with("LOG.old.")) {
-    uint64_t ts_suffix;
-    // sizeof also counts the trailing '\0'.
-    rest.remove_prefix(sizeof("LOG.old.") - 1);
-    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
-      return false;
+  } else if (info_log_name_prefix.size() > 0 &&
+             rest.starts_with(info_log_name_prefix)) {
+    rest.remove_prefix(info_log_name_prefix.size());
+    if (rest == "" || rest == ".old") {
+      *number = 0;
+      *type = kInfoLogFile;
+    } else if (rest.starts_with(".old.")) {
+      uint64_t ts_suffix;
+      // sizeof also counts the trailing '\0'.
+      rest.remove_prefix(sizeof(".old.") - 1);
+      if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+        return false;
+      }
+      *number = ts_suffix;
+      *type = kInfoLogFile;
     }
-    *number = ts_suffix;
-    *type = kInfoLogFile;
   } else if (rest.starts_with("MANIFEST-")) {
     rest.remove_prefix(strlen("MANIFEST-"));
     uint64_t num;
@@ -226,7 +293,8 @@ bool ParseFileName(const std::string& fname,
 }
 
 Status SetCurrentFile(Env* env, const std::string& dbname,
-                      uint64_t descriptor_number) {
+                      uint64_t descriptor_number,
+                      Directory* directory_to_fsync) {
   // Remove leading "dbname/" and add newline to manifest file name
   std::string manifest = DescriptorFileName(dbname, descriptor_number);
   Slice contents = manifest;
@@ -237,7 +305,11 @@ Status SetCurrentFile(Env* env, const std::string& dbname,
   if (s.ok()) {
     s = env->RenameFile(tmp, CurrentFileName(dbname));
   }
-  if (!s.ok()) {
+  if (s.ok()) {
+    if (directory_to_fsync != nullptr) {
+      directory_to_fsync->Fsync();
+    }
+  } else {
     env->DeleteFile(tmp);
   }
   return s;
@@ -258,4 +330,16 @@ Status SetIdentityFile(Env* env, const std::string& dbname) {
   return s;
 }
 
+Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) {
+  if (db_options->disableDataSync) {
+    return Status::OK();
+  } else if (db_options->use_fsync) {
+    StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
+    return file->Fsync();
+  } else {
+    StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
+    return file->Sync();
+  }
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/filename.h b/src/rocksdb/db/filename.h
index 8e55f11..33f5ace 100644
--- a/src/rocksdb/db/filename.h
+++ b/src/rocksdb/db/filename.h
@@ -11,15 +11,21 @@
 
 #pragma once
 #include <stdint.h>
+#include <unordered_map>
 #include <string>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
 #include "rocksdb/transaction_log.h"
-#include "port/port.h"
 
 namespace rocksdb {
 
 class Env;
+class Directory;
+class WritableFile;
 
 enum FileType {
   kLogFile,
@@ -47,10 +53,23 @@ extern std::string ArchivalDirectory(const std::string& dbname);
 extern std::string ArchivedLogFileName(const std::string& dbname,
                                        uint64_t num);
 
+extern std::string MakeTableFileName(const std::string& name, uint64_t number);
+
+// the reverse function of MakeTableFileName
+// TODO(yhchiang): could merge this function with ParseFileName()
+extern uint64_t TableFileNameToNumber(const std::string& name);
+
 // Return the name of the sstable with the specified number
 // in the db named by "dbname".  The result will be prefixed with
 // "dbname".
-extern std::string TableFileName(const std::string& dbname, uint64_t number);
+extern std::string TableFileName(const std::vector<DbPath>& db_paths,
+                                 uint64_t number, uint32_t path_id);
+
+// Sufficient buffer size for FormatFileNumber.
+extern const size_t kFormatFileNumberBufSize;
+
+extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+                             size_t out_buf_size);
 
 // Return the name of the descriptor file for the db named by
 // "dbname" and the specified incarnation number.  The result will be
@@ -71,6 +90,16 @@ extern std::string LockFileName(const std::string& dbname);
 // The result will be prefixed with "dbname".
 extern std::string TempFileName(const std::string& dbname, uint64_t number);
 
+// A helper structure for prefix of info log names.
+struct InfoLogPrefix {
+  char buf[260];
+  Slice prefix;
+  // Prefix with DB absolute path encoded
+  explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+  // Default Prefix
+  explicit InfoLogPrefix();
+};
+
 // Return the name of the info log file for "dbname".
 extern std::string InfoLogFileName(const std::string& dbname,
     const std::string& db_path="", const std::string& log_dir="");
@@ -92,17 +121,25 @@ extern std::string IdentityFileName(const std::string& dbname);
 // If filename is a rocksdb file, store the type of the file in *type.
 // The number encoded in the filename is stored in *number.  If the
 // filename was successfully parsed, returns true.  Else return false.
-extern bool ParseFileName(const std::string& filename,
-                          uint64_t* number,
-                          FileType* type,
+// info_log_name_prefix is the path of info logs.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          const Slice& info_log_name_prefix, FileType* type,
                           WalFileType* log_type = nullptr);
+// Same as previous function, but skip info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          FileType* type, WalFileType* log_type = nullptr);
 
 // Make the CURRENT file point to the descriptor file with the
 // specified number.
 extern Status SetCurrentFile(Env* env, const std::string& dbname,
-                             uint64_t descriptor_number);
+                             uint64_t descriptor_number,
+                             Directory* directory_to_fsync);
 
 // Make the IDENTITY file for the db
 extern Status SetIdentityFile(Env* env, const std::string& dbname);
 
+// Sync manifest file `file`.
+extern Status SyncManifest(Env* env, const DBOptions* db_options,
+                           WritableFile* file);
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc
index 0baa7fd..2eafd52 100644
--- a/src/rocksdb/db/filename_test.cc
+++ b/src/rocksdb/db/filename_test.cc
@@ -16,37 +16,57 @@
 
 namespace rocksdb {
 
-class FileNameTest { };
+class FileNameTest : public testing::Test {};
 
-TEST(FileNameTest, Parse) {
+TEST_F(FileNameTest, Parse) {
   Slice db;
   FileType type;
   uint64_t number;
 
+  char kDefautInfoLogDir = 1;
+  char kDifferentInfoLogDir = 2;
+  char kNoCheckLogDir = 4;
+  char kAllMode = kDefautInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir;
+
   // Successful parses
   static struct {
     const char* fname;
     uint64_t number;
     FileType type;
+    char mode;
   } cases[] = {
-    { "100.log",            100,   kLogFile },
-    { "0.log",              0,     kLogFile },
-    { "0.sst",              0,     kTableFile },
-    { "CURRENT",            0,     kCurrentFile },
-    { "LOCK",               0,     kDBLockFile },
-    { "MANIFEST-2",         2,     kDescriptorFile },
-    { "MANIFEST-7",         7,     kDescriptorFile },
-    { "METADB-2",           2,     kMetaDatabase },
-    { "METADB-7",           7,     kMetaDatabase },
-    { "LOG",                0,     kInfoLogFile },
-    { "LOG.old",            0,     kInfoLogFile },
-    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
-  };
-  for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
-    std::string f = cases[i].fname;
-    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
-    ASSERT_EQ(cases[i].type, type) << f;
-    ASSERT_EQ(cases[i].number, number) << f;
+        {"100.log", 100, kLogFile, kAllMode},
+        {"0.log", 0, kLogFile, kAllMode},
+        {"0.sst", 0, kTableFile, kAllMode},
+        {"CURRENT", 0, kCurrentFile, kAllMode},
+        {"LOCK", 0, kDBLockFile, kAllMode},
+        {"MANIFEST-2", 2, kDescriptorFile, kAllMode},
+        {"MANIFEST-7", 7, kDescriptorFile, kAllMode},
+        {"METADB-2", 2, kMetaDatabase, kAllMode},
+        {"METADB-7", 7, kMetaDatabase, kAllMode},
+        {"LOG", 0, kInfoLogFile, kDefautInfoLogDir},
+        {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir},
+        {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir},
+        {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir},
+        {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir},
+        {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir},
+        {"18446744073709551615.log", 18446744073709551615ull, kLogFile,
+         kAllMode}, };
+  for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) {
+    for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+      InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir");
+      if (cases[i].mode & mode) {
+        std::string f = cases[i].fname;
+        if (mode == kNoCheckLogDir) {
+          ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+        } else {
+          ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type))
+              << f;
+        }
+        ASSERT_EQ(cases[i].type, type) << f;
+        ASSERT_EQ(cases[i].number, number) << f;
+      }
+    }
   }
 
   // Errors
@@ -85,7 +105,23 @@ TEST(FileNameTest, Parse) {
   };
 }
 
-TEST(FileNameTest, Construction) {
+TEST_F(FileNameTest, InfoLogFileName) {
+  std::string dbname = ("/data/rocksdb");
+  std::string db_absolute_path;
+  Env::Default()->GetAbsolutePath(dbname, &db_absolute_path);
+
+  ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, ""));
+  ASSERT_EQ("/data/rocksdb/LOG.old.666",
+            OldInfoLogFileName(dbname, 666u, db_absolute_path, ""));
+
+  ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG",
+            InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log"));
+  ASSERT_EQ(
+      "/data/rocksdb_log/data_rocksdb_LOG.old.666",
+      OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log"));
+}
+
+TEST_F(FileNameTest, Construction) {
   uint64_t number;
   FileType type;
   std::string fname;
@@ -108,7 +144,10 @@ TEST(FileNameTest, Construction) {
   ASSERT_EQ(192U, number);
   ASSERT_EQ(kLogFile, type);
 
-  fname = TableFileName("bar", 200);
+  fname = TableFileName({DbPath("bar", 0)}, 200, 0);
+  std::string fname1 =
+      TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1);
+  ASSERT_EQ(fname, fname1);
   ASSERT_EQ("bar/", std::string(fname.data(), 4));
   ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
   ASSERT_EQ(200U, number);
@@ -136,5 +175,6 @@ TEST(FileNameTest, Construction) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc
new file mode 100644
index 0000000..0f6c85f
--- /dev/null
+++ b/src/rocksdb/db/flush_job.cc
@@ -0,0 +1,305 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/flush_job.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/event_logger_helpers.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "port/likely.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/block_based_table_factory.h"
+#include "table/merger.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/event_logger.h"
+#include "util/file_util.h"
+#include "util/logging.h"
+#include "util/log_buffer.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/iostats_context_imp.h"
+#include "util/stop_watch.h"
+#include "util/sync_point.h"
+#include "util/thread_status_util.h"
+
+namespace rocksdb {
+
+FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+                   const DBOptions& db_options,
+                   const MutableCFOptions& mutable_cf_options,
+                   const EnvOptions& env_options, VersionSet* versions,
+                   InstrumentedMutex* db_mutex,
+                   std::atomic<bool>* shutting_down,
+                   SequenceNumber newest_snapshot, JobContext* job_context,
+                   LogBuffer* log_buffer, Directory* db_directory,
+                   Directory* output_file_directory,
+                   CompressionType output_compression, Statistics* stats,
+                   EventLogger* event_logger)
+    : dbname_(dbname),
+      cfd_(cfd),
+      db_options_(db_options),
+      mutable_cf_options_(mutable_cf_options),
+      env_options_(env_options),
+      versions_(versions),
+      db_mutex_(db_mutex),
+      shutting_down_(shutting_down),
+      newest_snapshot_(newest_snapshot),
+      job_context_(job_context),
+      log_buffer_(log_buffer),
+      db_directory_(db_directory),
+      output_file_directory_(output_file_directory),
+      output_compression_(output_compression),
+      stats_(stats),
+      event_logger_(event_logger) {
+  // Update the thread status to indicate flush.
+  ReportStartedFlush();
+  TEST_SYNC_POINT("FlushJob::FlushJob()");
+}
+
+FlushJob::~FlushJob() {
+  TEST_SYNC_POINT("FlushJob::~FlushJob()");
+  ThreadStatusUtil::ResetThreadStatus();
+}
+
+void FlushJob::ReportStartedFlush() {
+  ThreadStatusUtil::SetColumnFamily(cfd_);
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_JOB_ID,
+      job_context_->job_id);
+  IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
+  uint64_t input_size = 0;
+  for (auto* mem : mems) {
+    input_size += mem->ApproximateMemoryUsage();
+  }
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::FLUSH_BYTES_MEMTABLES,
+      input_size);
+}
+
+void FlushJob::RecordFlushIOStats() {
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+  IOSTATS_RESET(bytes_written);
+}
+
+Status FlushJob::Run(uint64_t* file_number) {
+  AutoThreadOperationStageUpdater stage_run(
+      ThreadStatus::STAGE_FLUSH_RUN);
+  // Save the contents of the earliest memtable as a new Table
+  uint64_t fn;
+  autovector<MemTable*> mems;
+  cfd_->imm()->PickMemtablesToFlush(&mems);
+  if (mems.empty()) {
+    LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush",
+                cfd_->GetName().c_str());
+    return Status::OK();
+  }
+
+  ReportFlushInputSize(mems);
+
+  // entries mems are (implicitly) sorted in ascending order by their created
+  // time. We will use the first memtable's `edit` to keep the meta info for
+  // this flush.
+  MemTable* m = mems[0];
+  VersionEdit* edit = m->GetEdits();
+  edit->SetPrevLogNumber(0);
+  // SetLogNumber(log_num) indicates logs with number smaller than log_num
+  // will no longer be picked up for recovery.
+  edit->SetLogNumber(mems.back()->GetNextLogNumber());
+  edit->SetColumnFamily(cfd_->GetID());
+
+  // This will release and re-acquire the mutex.
+  Status s = WriteLevel0Table(mems, edit, &fn);
+
+  if (s.ok() &&
+      (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) {
+    s = Status::ShutdownInProgress(
+        "Database shutdown or Column family drop during flush");
+  }
+
+  if (!s.ok()) {
+    cfd_->imm()->RollbackMemtableFlush(mems, fn);
+  } else {
+    // Replace immutable memtable with the generated Table
+    s = cfd_->imm()->InstallMemtableFlushResults(
+        cfd_, mutable_cf_options_, mems, versions_, db_mutex_, fn,
+        &job_context_->memtables_to_free, db_directory_, log_buffer_);
+  }
+
+  if (s.ok() && file_number != nullptr) {
+    *file_number = fn;
+  }
+  RecordFlushIOStats();
+
+  auto stream = event_logger_->LogToBuffer(log_buffer_);
+  stream << "job" << job_context_->job_id << "event"
+         << "flush_finished";
+  stream << "lsm_state";
+  stream.StartArray();
+  auto vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  return s;
+}
+
+Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
+                                  VersionEdit* edit, uint64_t* filenumber) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_FLUSH_WRITE_L0);
+  db_mutex_->AssertHeld();
+  const uint64_t start_micros = db_options_.env->NowMicros();
+  FileMetaData meta;
+  // path 0 for level 0 file.
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+  *filenumber = meta.fd.GetNumber();
+
+  const SequenceNumber earliest_seqno_in_memtable =
+      mems[0]->GetFirstSequenceNumber();
+  Version* base = cfd_->current();
+  base->Ref();  // it is likely that we do not need this reference
+  Status s;
+  {
+    db_mutex_->Unlock();
+    if (log_buffer_) {
+      log_buffer_->FlushBufferToLog();
+    }
+    std::vector<Iterator*> memtables;
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    Arena arena;
+    uint64_t total_num_entries = 0, total_num_deletes = 0;
+    size_t total_memory_usage = 0;
+    for (MemTable* m : mems) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+          cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+      memtables.push_back(m->NewIterator(ro, &arena));
+      total_num_entries += m->num_entries();
+      total_num_deletes += m->num_deletes();
+      total_memory_usage += m->ApproximateMemoryUsage();
+    }
+
+    event_logger_->Log() << "job" << job_context_->job_id << "event"
+                         << "flush_started"
+                         << "num_memtables" << mems.size() << "num_entries"
+                         << total_num_entries << "num_deletes"
+                         << total_num_deletes << "memory_usage"
+                         << total_memory_usage;
+    TableProperties table_properties;
+    {
+      ScopedArenaIterator iter(
+          NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
+                             static_cast<int>(memtables.size()), &arena));
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+          cfd_->GetName().c_str(), job_context_->job_id, meta.fd.GetNumber());
+
+      TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+                               &output_compression_);
+      s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
+                     cfd_->table_cache(), iter.get(), &meta,
+                     cfd_->internal_comparator(),
+                     cfd_->int_tbl_prop_collector_factories(), newest_snapshot_,
+                     earliest_seqno_in_memtable, output_compression_,
+                     cfd_->ioptions()->compression_opts,
+                     mutable_cf_options_.paranoid_file_checks, Env::IO_HIGH,
+                     &table_properties);
+      LogFlush(db_options_.info_log);
+    }
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s",
+        cfd_->GetName().c_str(), job_context_->job_id, meta.fd.GetNumber(),
+        meta.fd.GetFileSize(), s.ToString().c_str());
+
+    // output to event logger
+    if (s.ok()) {
+      EventLoggerHelpers::LogTableFileCreation(
+          event_logger_, job_context_->job_id, meta.fd.GetNumber(),
+          meta.fd.GetFileSize(), table_properties);
+    }
+
+    if (!db_options_.disableDataSync && output_file_directory_ != nullptr) {
+      output_file_directory_->Fsync();
+    }
+    db_mutex_->Lock();
+  }
+  base->Unref();
+
+  // re-acquire the most current version
+  base = cfd_->current();
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.fd.GetFileSize() > 0) {
+    const Slice min_user_key = meta.smallest.user_key();
+    const Slice max_user_key = meta.largest.user_key();
+    // if we have more than 1 background thread, then we cannot
+    // insert files directly into higher levels because some other
+    // threads could be concurrently producing compacted files for
+    // that key range.
+    if (base != nullptr && db_options_.max_background_compactions <= 1 &&
+        db_options_.max_background_flushes == 0 &&
+        cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+      level = base->storage_info()->PickLevelForMemTableOutput(
+          mutable_cf_options_, min_user_key, max_user_key);
+      // If level does not match path id, reset level back to 0
+      uint32_t fdpath = LevelCompactionPicker::GetPathId(
+          *cfd_->ioptions(), mutable_cf_options_, level);
+      if (fdpath != 0) {
+        level = 0;
+      }
+    }
+    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
+                  meta.smallest_seqno, meta.largest_seqno);
+  }
+
+  InternalStats::CompactionStats stats(1);
+  stats.micros = db_options_.env->NowMicros() - start_micros;
+  stats.bytes_written = meta.fd.GetFileSize();
+  cfd_->internal_stats()->AddCompactionStats(level, stats);
+  cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+                                     meta.fd.GetFileSize());
+  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+  return s;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
new file mode 100644
index 0000000..c504b14
--- /dev/null
+++ b/src/rocksdb/db/flush_job.h
@@ -0,0 +1,93 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "db/memtable_list.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "util/autovector.h"
+#include "util/event_logger.h"
+#include "util/instrumented_mutex.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+#include "util/scoped_arena_iterator.h"
+#include "db/internal_stats.h"
+#include "db/write_controller.h"
+#include "db/flush_scheduler.h"
+#include "db/write_thread.h"
+#include "db/job_context.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+// FlushJob bundles the state needed to flush a set of immutable memtables
+// of one column family into a table file.  The implementation lives in
+// flush_job.cc; WriteLevel0Table() there builds the file and records it in
+// a VersionEdit.
+class FlushJob {
+ public:
+  // TODO(icanadi) make effort to reduce number of parameters here
+  // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+  FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+           const DBOptions& db_options,
+           const MutableCFOptions& mutable_cf_options,
+           const EnvOptions& env_options, VersionSet* versions,
+           InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+           SequenceNumber newest_snapshot, JobContext* job_context,
+           LogBuffer* log_buffer, Directory* db_directory,
+           Directory* output_file_directory, CompressionType output_compression,
+           Statistics* stats, EventLogger* event_logger);
+
+  ~FlushJob();
+
+  // Runs the flush.  If file_number is non-null it presumably receives the
+  // number of the newly written table file -- confirm in flush_job.cc.
+  Status Run(uint64_t* file_number = nullptr);
+
+ private:
+  void ReportStartedFlush();
+  void ReportFlushInputSize(const autovector<MemTable*>& mems);
+  void RecordFlushIOStats();
+  // Writes the contents of 'mems' to a new level-0 table file, adds the
+  // file to *edit and returns its number through *filenumber.
+  Status WriteLevel0Table(const autovector<MemTable*>& mems, VersionEdit* edit,
+                          uint64_t* filenumber);
+  // All reference/pointer members below are non-owning; their referents
+  // must outlive the job (see the IMPORTANT note on the constructor).
+  const std::string& dbname_;
+  ColumnFamilyData* cfd_;
+  const DBOptions& db_options_;
+  const MutableCFOptions& mutable_cf_options_;
+  const EnvOptions& env_options_;
+  VersionSet* versions_;
+  InstrumentedMutex* db_mutex_;
+  std::atomic<bool>* shutting_down_;
+  SequenceNumber newest_snapshot_;
+  JobContext* job_context_;
+  LogBuffer* log_buffer_;
+  Directory* db_directory_;
+  Directory* output_file_directory_;
+  CompressionType output_compression_;
+  Statistics* stats_;
+  EventLogger* event_logger_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
new file mode 100644
index 0000000..6946ae0
--- /dev/null
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -0,0 +1,130 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <string>
+
+#include "db/flush_job.h"
+#include "db/column_family.h"
+#include "db/version_set.h"
+#include "db/writebuffer.h"
+#include "rocksdb/cache.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "table/mock_table.h"
+
+namespace rocksdb {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+// Test fixture that builds a minimal on-disk DB (manifest + CURRENT file)
+// and a VersionSet holding the default column family, backed by a
+// MockTableFactory, so a FlushJob can run without a full DBImpl.
+class FlushJobTest : public testing::Test {
+ public:
+  FlushJobTest()
+      : env_(Env::Default()),
+        dbname_(test::TmpDir() + "/flush_job_test"),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_(db_options_.db_write_buffer_size),
+        versions_(new VersionSet(dbname_, &db_options_, env_options_,
+                                 table_cache_.get(), &write_buffer_,
+                                 &write_controller_)),
+        shutting_down_(false),
+        mock_table_factory_(new mock::MockTableFactory()) {
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    // TODO(icanadi) Remove this once we mock out VersionSet
+    NewDB();
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+    EXPECT_OK(versions_->Recover(column_families, false));
+  }
+
+  // Writes MANIFEST-000001 containing an empty VersionEdit and points the
+  // CURRENT file at it, so VersionSet::Recover() has something to read.
+  void NewDB() {
+    VersionEdit new_db;
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    unique_ptr<WritableFile> file;
+    Status s = env_->NewWritableFile(
+        manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+    ASSERT_OK(s);
+    {
+      log::Writer log(std::move(file));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1, nullptr);
+    // BUGFIX: this status was previously assigned but never checked; a
+    // failure to write CURRENT would surface later as a confusing
+    // Recover() error instead of failing here.
+    ASSERT_OK(s);
+  }
+
+  Env* env_;
+  std::string dbname_;
+  EnvOptions env_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  DBOptions db_options_;  // must stay declared before write_buffer_ (init order)
+  WriteBuffer write_buffer_;
+  ColumnFamilyOptions cf_options_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+// Flushing a column family with no immutable memtables should succeed
+// trivially.
+TEST_F(FlushJobTest, Empty) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  EventLogger event_logger(db_options_.info_log.get());
+  // NOTE(review): unlike NonEmpty below, Run() is invoked here without
+  // holding mutex_ -- confirm FlushJob::Run's locking contract.
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, nullptr, &event_logger);
+  ASSERT_OK(flush_job.Run());
+  job_context.Clean();
+}
+
+// Fills one memtable with ~10k keys, flushes it, and verifies through the
+// mock table factory that exactly one file containing all inserted keys
+// was produced.
+TEST_F(FlushJobTest, NonEmpty) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions());
+  new_mem->Ref();
+  std::map<std::string, std::string> inserted_keys;
+  for (int i = 1; i < 10000; ++i) {
+    std::string key(ToString(i));
+    std::string value("value" + ToString(i));
+    new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
+    // Keys are remembered in encoded internal-key form for the final
+    // file-content comparison.
+    InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+    inserted_keys.insert({internal_key.Encode().ToString(), value});
+  }
+  // Hand the filled memtable over to the immutable memtable list.
+  cfd->imm()->Add(new_mem);
+
+  EventLogger event_logger(db_options_.info_log.get());
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, nullptr, &event_logger);
+  mutex_.Lock();
+  ASSERT_OK(flush_job.Run());
+  mutex_.Unlock();
+  mock_table_factory_->AssertSingleFile(inserted_keys);
+  job_context.Clean();
+}
+
+}  // namespace rocksdb
+
+// gtest entry point.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc
new file mode 100644
index 0000000..5681615
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.cc
@@ -0,0 +1,63 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace rocksdb {
+
+// Queues cfd for flushing, taking a reference that is released by
+// GetNextColumnFamily() or Clear().  The debug-only set asserts that the
+// same column family is never queued twice concurrently.
+void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+  assert(column_families_set_.find(cfd) == column_families_set_.end());
+  column_families_set_.insert(cfd);
+#endif  // NDEBUG
+  cfd->Ref();
+  column_families_.push_back(cfd);
+}
+
+// Pops queued column families until a live (not dropped) one is found and
+// returns it, still holding the Ref() taken in ScheduleFlush() -- the
+// caller must Unref().  Dropped entries are unreferenced (deleted when the
+// last reference goes away) and skipped.  Returns nullptr when the queue
+// is exhausted.
+ColumnFamilyData* FlushScheduler::GetNextColumnFamily() {
+  ColumnFamilyData* cfd = nullptr;
+  while (column_families_.size() > 0) {
+    cfd = column_families_.front();
+    column_families_.pop_front();
+    if (cfd->IsDropped()) {
+      if (cfd->Unref()) {
+        delete cfd;
+      }
+      // BUGFIX: clear cfd unconditionally.  Previously the reset happened
+      // only inside the Unref() branch, so a dropped column family whose
+      // Unref() returned false (another ref still held) could be returned
+      // to the caller once the queue ran empty.
+      cfd = nullptr;
+    } else {
+      break;
+    }
+  }
+#ifndef NDEBUG
+  if (cfd != nullptr) {
+    auto itr = column_families_set_.find(cfd);
+    assert(itr != column_families_set_.end());
+    column_families_set_.erase(itr);
+  }
+#endif  // NDEBUG
+  return cfd;
+}
+
+// True when no column families are queued for flushing.
+bool FlushScheduler::Empty() { return column_families_.empty(); }
+
+// Drops every queued entry, releasing the reference taken at schedule time
+// (deleting the column family when that was the last reference).
+void FlushScheduler::Clear() {
+  for (auto cfd : column_families_) {
+#ifndef NDEBUG
+    auto itr = column_families_set_.find(cfd);
+    assert(itr != column_families_set_.end());
+    column_families_set_.erase(itr);
+#endif  // NDEBUG
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
+  column_families_.clear();
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 0000000..0c96709
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,40 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stdint.h>
+#include <deque>
+#include <set>
+#include <vector>
+
+namespace rocksdb {
+
+class ColumnFamilyData;
+
+// This class is thread-compatible.  It should only be accessed from a
+// single write thread (between BeginWrite() and EndWrite()).
+class FlushScheduler {
+ public:
+  FlushScheduler() = default;
+  ~FlushScheduler() = default;
+
+  // Takes a Ref() on cfd and appends it to the flush queue.
+  void ScheduleFlush(ColumnFamilyData* cfd);
+  // Returns Ref()-ed column family. Client needs to Unref()
+  // REQUIRES: db mutex is held (exception is single-threaded recovery)
+  ColumnFamilyData* GetNextColumnFamily();
+
+  // True when no flushes are queued.
+  bool Empty();
+
+  // Discards all queued entries, releasing their references.
+  void Clear();
+
+ private:
+  std::deque<ColumnFamilyData*> column_families_;
+#ifndef NDEBUG
+  // Debug-only mirror of column_families_, used to assert that a column
+  // family is never scheduled twice at the same time.
+  std::set<ColumnFamilyData*> column_families_set_;
+#endif  // NDEBUG
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 0000000..b441019
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,541 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/job_context.h"
+#include "db/db_impl.h"
+#include "db/db_iter.h"
+#include "db/column_family.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merger.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+// Usage:
+//     LevelIterator iter;
+//     iter.SetFileIndex(file_index);
+//     iter.Seek(target);
+//     iter.Next()
+//
+// Forward-only iterator over the files of one level (>= 1).  Backward
+// operations (SeekToLast/Prev) are not supported.  Next() transparently
+// advances into the next file of the level when the current one is
+// exhausted; an Incomplete status from the underlying table iterator
+// invalidates this iterator instead.
+class LevelIterator : public Iterator {
+ public:
+  LevelIterator(const ColumnFamilyData* const cfd,
+      const ReadOptions& read_options,
+      const std::vector<FileMetaData*>& files)
+    : cfd_(cfd), read_options_(read_options), files_(files), valid_(false),
+      file_index_(std::numeric_limits<uint32_t>::max()) {}
+
+  // Selects the file to iterate over; the table iterator is re-created
+  // only when the index actually changes.  Leaves the iterator invalid
+  // until the next Seek()/SeekToFirst().
+  void SetFileIndex(uint32_t file_index) {
+    assert(file_index < files_.size());
+    if (file_index != file_index_) {
+      file_index_ = file_index;
+      Reset();
+    }
+    valid_ = false;
+  }
+  // (Re)creates the table iterator for the currently selected file.
+  void Reset() {
+    assert(file_index_ < files_.size());
+    file_iter_.reset(cfd_->table_cache()->NewIterator(
+        read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+        files_[file_index_]->fd, nullptr /* table_reader_ptr */, false));
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("LevelIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("LevelIterator::Prev()");
+    valid_ = false;
+  }
+  bool Valid() const override {
+    return valid_;
+  }
+  void SeekToFirst() override {
+    SetFileIndex(0);
+    file_iter_->SeekToFirst();
+    valid_ = file_iter_->Valid();
+  }
+  // REQUIRES: SetFileIndex() has been called at least once.
+  void Seek(const Slice& internal_key) override {
+    assert(file_iter_ != nullptr);
+    file_iter_->Seek(internal_key);
+    valid_ = file_iter_->Valid();
+  }
+  // Advances within the current file, falling through to the first key of
+  // the next file when the current one is exhausted.  An Incomplete status
+  // makes the iterator invalid (see Valid()/status()).
+  void Next() override {
+    assert(valid_);
+    file_iter_->Next();
+    for (;;) {
+      if (file_iter_->status().IsIncomplete() || file_iter_->Valid()) {
+        valid_ = !file_iter_->status().IsIncomplete();
+        return;
+      }
+      if (file_index_ + 1 >= files_.size()) {
+        valid_ = false;
+        return;
+      }
+      SetFileIndex(file_index_ + 1);
+      file_iter_->SeekToFirst();
+    }
+  }
+  Slice key() const override {
+    assert(valid_);
+    return file_iter_->key();
+  }
+  Slice value() const override {
+    assert(valid_);
+    return file_iter_->value();
+  }
+  // Own status (e.g. NotSupported) takes precedence over the table
+  // iterator's status.
+  Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    } else if (file_iter_ && !file_iter_->status().ok()) {
+      return file_iter_->status();
+    }
+    return Status::OK();
+  }
+
+ private:
+  const ColumnFamilyData* const cfd_;
+  const ReadOptions& read_options_;
+  // Files of this level; assumed ordered by key range -- TODO confirm for
+  // the caller's level >= 1 usage.
+  const std::vector<FileMetaData*>& files_;
+
+  bool valid_;
+  uint32_t file_index_;
+  Status status_;
+  // Iterator over the currently selected file.
+  std::unique_ptr<Iterator> file_iter_;
+};
+
+// If current_sv is supplied, the iterator adopts that superversion
+// reference and builds its child iterators immediately; otherwise sv_
+// stays null and everything is built lazily by the first
+// Seek()/SeekToFirst() (which calls RebuildIterators(true)).
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+    ColumnFamilyData* cfd, SuperVersion* current_sv)
+    : db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      prefix_extractor_(cfd->ioptions()->prefix_extractor),
+      user_comparator_(cfd->user_comparator()),
+      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+      sv_(current_sv),
+      mutable_iter_(nullptr),
+      current_(nullptr),
+      status_(Status::OK()),
+      immutable_status_(Status::OK()),
+      valid_(false),
+      is_prev_set_(false),
+      is_prev_inclusive_(false) {
+  if (sv_) {
+    RebuildIterators(false);
+  }
+}
+
+ForwardIterator::~ForwardIterator() {
+  Cleanup(true);
+}
+
+// Destroys all child iterators.  The memtable iterators live in arena_
+// (see RebuildIterators) and are destroyed in place via explicit
+// destructor calls rather than delete; L0 and level iterators are
+// heap-allocated and deleted.  When release_sv is set, the superversion
+// reference is dropped as well, and any files that became obsolete as a
+// result are purged.
+void ForwardIterator::Cleanup(bool release_sv) {
+  if (mutable_iter_ != nullptr) {
+    mutable_iter_->~Iterator();
+  }
+  for (auto* m : imm_iters_) {
+    m->~Iterator();
+  }
+  imm_iters_.clear();
+  for (auto* f : l0_iters_) {
+    delete f;
+  }
+  l0_iters_.clear();
+  for (auto* l : level_iters_) {
+    delete l;
+  }
+  level_iters_.clear();
+
+  if (release_sv) {
+    if (sv_ != nullptr && sv_->Unref()) {
+      // Job id == 0 means that this is not our background process, but rather
+      // user thread
+      JobContext job_context(0);
+      db_->mutex_.Lock();
+      sv_->Cleanup();
+      db_->FindObsoleteFiles(&job_context, false, true);
+      db_->mutex_.Unlock();
+      delete sv_;
+      if (job_context.HaveSomethingToDelete()) {
+        db_->PurgeObsoleteFiles(job_context);
+      }
+    }
+  }
+}
+
+bool ForwardIterator::Valid() const {
+  return valid_;
+}
+
+// Both seek entry points first refresh stale state: a superversion change
+// forces a full rebuild of the child iterators, and child iterators left
+// Incomplete by a previous operation are re-created.
+void ForwardIterator::SeekToFirst() {
+  if (sv_ == nullptr ||
+      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
+    RebuildIterators(true);
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+  SeekInternal(Slice(), true);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+  if (sv_ == nullptr ||
+      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
+    RebuildIterators(true);
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+  SeekInternal(internal_key, false);
+}
+
+// Positions every child iterator at internal_key (or at the very first key
+// when seek_to_first is set), rebuilds the immutable min-heap, and selects
+// current_ via UpdateCurrent().  For levels >= 1, FileIndexer hints
+// (search_left_bound/search_right_bound) progressively narrow the
+// binary-search range for the file that may contain the target key.
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+                                   bool seek_to_first) {
+  assert(mutable_iter_);
+  // mutable
+  seek_to_first ? mutable_iter_->SeekToFirst() :
+                  mutable_iter_->Seek(internal_key);
+
+  // immutable
+  // TODO(ljin): NeedToSeekImmutable has negative impact on performance
+  // if it turns to need to seek immutable often. We probably want to have
+  // an option to turn it off.
+  if (seek_to_first || NeedToSeekImmutable(internal_key)) {
+    immutable_status_ = Status::OK();
+    {
+      auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+      immutable_min_heap_.swap(tmp);
+    }
+    for (auto* m : imm_iters_) {
+      seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+      if (!m->status().ok()) {
+        immutable_status_ = m->status();
+      } else if (m->Valid()) {
+        immutable_min_heap_.push(m);
+      }
+    }
+
+    Slice user_key;
+    if (!seek_to_first) {
+      user_key = ExtractUserKey(internal_key);
+    }
+    const VersionStorageInfo* vstorage = sv_->current->storage_info();
+    const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+    for (uint32_t i = 0; i < l0.size(); ++i) {
+      if (seek_to_first) {
+        l0_iters_[i]->SeekToFirst();
+      } else {
+        // If the target key passes over the largest key, we are sure Next()
+        // won't go over this file.
+        if (user_comparator_->Compare(user_key,
+              l0[i]->largest.user_key()) > 0) {
+          continue;
+        }
+        l0_iters_[i]->Seek(internal_key);
+      }
+
+      if (!l0_iters_[i]->status().ok()) {
+        immutable_status_ = l0_iters_[i]->status();
+      } else if (l0_iters_[i]->Valid()) {
+        immutable_min_heap_.push(l0_iters_[i]);
+      }
+    }
+
+    int32_t search_left_bound = 0;
+    int32_t search_right_bound = FileIndexer::kLevelMaxIndex;
+    for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+      const std::vector<FileMetaData*>& level_files =
+          vstorage->LevelFiles(level);
+      if (level_files.empty()) {
+        search_left_bound = 0;
+        search_right_bound = FileIndexer::kLevelMaxIndex;
+        continue;
+      }
+      assert(level_iters_[level - 1] != nullptr);
+      uint32_t f_idx = 0;
+      const auto& indexer = vstorage->file_indexer();
+      if (!seek_to_first) {
+        if (search_left_bound == search_right_bound) {
+          f_idx = search_left_bound;
+        } else if (search_left_bound < search_right_bound) {
+          f_idx =
+              FindFileInRange(level_files, internal_key, search_left_bound,
+                              search_right_bound == FileIndexer::kLevelMaxIndex
+                                  ? static_cast<uint32_t>(level_files.size())
+                                  : search_right_bound);
+        } else {
+          // search_left_bound > search_right_bound
+          // There are only 2 cases this can happen:
+          // (1) target key is smaller than left most file
+          // (2) target key is larger than right most file
+          assert(search_left_bound == (int32_t)level_files.size() ||
+                 search_right_bound == -1);
+          if (search_right_bound == -1) {
+            assert(search_left_bound == 0);
+            f_idx = 0;
+          } else {
+            indexer.GetNextLevelIndex(
+                level, level_files.size() - 1,
+                1, 1, &search_left_bound, &search_right_bound);
+            continue;
+          }
+        }
+
+        // Prepare hints for the next level
+        if (f_idx < level_files.size()) {
+          int cmp_smallest = user_comparator_->Compare(
+              user_key, level_files[f_idx]->smallest.user_key());
+          int cmp_largest = -1;
+          if (cmp_smallest >= 0) {
+            // BUGFIX: this branch used to recompute cmp_smallest against
+            // smallest.user_key(), leaving cmp_largest stuck at -1 and
+            // feeding GetNextLevelIndex() a wrong position hint.  Compare
+            // against the file's largest key instead.
+            cmp_largest = user_comparator_->Compare(
+                user_key, level_files[f_idx]->largest.user_key());
+          }
+          indexer.GetNextLevelIndex(level, f_idx,
+              cmp_smallest, cmp_largest,
+              &search_left_bound, &search_right_bound);
+        } else {
+          indexer.GetNextLevelIndex(
+              level, level_files.size() - 1,
+              1, 1, &search_left_bound, &search_right_bound);
+        }
+      }
+
+      // Seek
+      if (f_idx < level_files.size()) {
+        level_iters_[level - 1]->SetFileIndex(f_idx);
+        seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
+                        level_iters_[level - 1]->Seek(internal_key);
+
+        if (!level_iters_[level - 1]->status().ok()) {
+          immutable_status_ = level_iters_[level - 1]->status();
+        } else if (level_iters_[level - 1]->Valid()) {
+          immutable_min_heap_.push(level_iters_[level - 1]);
+        }
+      }
+    }
+
+    if (seek_to_first) {
+      is_prev_set_ = false;
+    } else {
+      // Remember the seek target so NeedToSeekImmutable() can skip
+      // redundant immutable seeks inside the (prev_key_, heap-top) gap.
+      prev_key_.SetKey(internal_key);
+      is_prev_set_ = true;
+      is_prev_inclusive_ = true;
+    }
+  } else if (current_ && current_ != mutable_iter_) {
+    // current_ is one of immutable iterators, push it back to the heap
+    immutable_min_heap_.push(current_);
+  }
+
+  UpdateCurrent();
+}
+
+// Advances to the next entry.  Three cases:
+//  - superversion changed: rebuild children and re-seek to the current
+//    key; if that exact key no longer exists we are already positioned at
+//    its successor and can return.
+//  - advancing an immutable iterator: record its key in prev_key_ (with a
+//    prefix extractor, only while the prefix is unchanged) so
+//    NeedToSeekImmutable() stays accurate, then advance it and push it
+//    back into the heap if still valid.
+//  - advancing the mutable iterator: just advance it.
+void ForwardIterator::Next() {
+  assert(valid_);
+
+  if (sv_ == nullptr ||
+      sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    // Copy the key out first: RebuildIterators() destroys the child
+    // iterator that currently backs key().
+    std::string current_key = key().ToString();
+    Slice old_key(current_key.data(), current_key.size());
+
+    RebuildIterators(true);
+    SeekInternal(old_key, false);
+    if (!valid_ || key().compare(old_key) != 0) {
+      return;
+    }
+  } else if (current_ != mutable_iter_) {
+    // It is going to advance immutable iterator
+
+    bool update_prev_key = true;
+    if (is_prev_set_ && prefix_extractor_) {
+      // advance prev_key_ to current_ only if they share the same prefix
+      update_prev_key =
+        prefix_extractor_->Transform(prev_key_.GetKey()).compare(
+          prefix_extractor_->Transform(current_->key())) == 0;
+    }
+
+    if (update_prev_key) {
+      prev_key_.SetKey(current_->key());
+      is_prev_set_ = true;
+      is_prev_inclusive_ = false;
+    }
+  }
+
+  current_->Next();
+  if (current_ != mutable_iter_) {
+    if (!current_->status().ok()) {
+      immutable_status_ = current_->status();
+    } else if (current_->Valid()) {
+      immutable_min_heap_.push(current_);
+    }
+  }
+
+  UpdateCurrent();
+}
+
+// key()/value() delegate to whichever child iterator currently holds the
+// smallest key (current_).  REQUIRES: Valid().
+Slice ForwardIterator::key() const {
+  assert(valid_);
+  return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+  assert(valid_);
+  return current_->value();
+}
+
+// Precedence: this iterator's own status (e.g. NotSupported set by
+// SeekToLast()/Prev()), then the mutable iterator's status, then the
+// sticky status collected from the immutable child iterators.
+Status ForwardIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  } else if (!mutable_iter_->status().ok()) {
+    return mutable_iter_->status();
+  }
+
+  return immutable_status_;
+}
+
+// Drops all child iterators and recreates them from sv_ (refreshing sv_
+// first when refresh_sv is set).  Memtable iterators are allocated from
+// arena_; one heap-allocated iterator is created per L0 file and one
+// LevelIterator per non-empty level >= 1 (empty levels get a nullptr
+// placeholder so the level_iters_[level - 1] indexing stays aligned).
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
+  // Clean up
+  Cleanup(refresh_sv);
+  if (refresh_sv) {
+    // New
+    sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
+  }
+  mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+  sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+
+  const auto* vstorage = sv_->current->storage_info();
+  const auto& l0_files = vstorage->LevelFiles(0);
+  l0_iters_.reserve(l0_files.size());
+  for (const auto* l0 : l0_files) {
+    l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd));
+  }
+  level_iters_.reserve(vstorage->num_levels() - 1);
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    const auto& level_files = vstorage->LevelFiles(level);
+
+    if (level_files.empty()) {
+      level_iters_.push_back(nullptr);
+    } else {
+      level_iters_.push_back(
+          new LevelIterator(cfd_, read_options_, level_files));
+    }
+  }
+
+  current_ = nullptr;
+  is_prev_set_ = false;
+}
+
+// Recreates only those child iterators whose last operation ended with
+// Status::Incomplete, leaving all other iterators untouched.  Used by the
+// seek paths as a cheaper alternative to a full RebuildIterators().
+void ForwardIterator::ResetIncompleteIterators() {
+  const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
+  for (uint32_t i = 0; i < l0_iters_.size(); ++i) {
+    assert(i < l0_files.size());
+    if (!l0_iters_[i]->status().IsIncomplete()) {
+      continue;
+    }
+    delete l0_iters_[i];
+    l0_iters_[i] = cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+        l0_files[i]->fd);
+  }
+
+  for (auto* level_iter : level_iters_) {
+    if (level_iter && level_iter->status().IsIncomplete()) {
+      level_iter->Reset();
+    }
+  }
+
+  current_ = nullptr;
+  is_prev_set_ = false;
+}
+
+// Sets current_ to the child iterator with the smallest key -- either the
+// mutable memtable iterator or the top of the immutable min-heap -- and
+// derives valid_ from the result.  Also clears any error left in status_
+// (e.g. NotSupported set by SeekToLast()/Prev()).
+void ForwardIterator::UpdateCurrent() {
+  if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+    current_ = nullptr;
+  } else if (immutable_min_heap_.empty()) {
+    current_ = mutable_iter_;
+  } else if (!mutable_iter_->Valid()) {
+    current_ = immutable_min_heap_.top();
+    immutable_min_heap_.pop();
+  } else {
+    current_ = immutable_min_heap_.top();
+    assert(current_ != nullptr);
+    assert(current_->Valid());
+    int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+        mutable_iter_->key(), current_->key());
+    // NOTE(review): equal internal keys across the mutable and immutable
+    // sources are assumed impossible here (the assert relies on it) --
+    // confirm the uniqueness invariant.
+    assert(cmp != 0);
+    if (cmp > 0) {
+      immutable_min_heap_.pop();
+    } else {
+      current_ = mutable_iter_;
+    }
+  }
+  valid_ = (current_ != nullptr);
+  if (!status_.ok()) {
+    status_ = Status::OK();
+  }
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+  // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+  // such that there are no records with keys within that range in
+  // immutable_min_heap_. Since immutable structures (SST files and immutable
+  // memtables) can't change in this version, we don't need to do a seek if
+  // 'target' belongs to that interval (immutable_min_heap_.top() is already
+  // at the correct position).
+
+  if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+    return true;
+  }
+  Slice prev_key = prev_key_.GetKey();
+  // With a prefix extractor configured, a different prefix invalidates the
+  // tracked interval entirely.
+  if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+    prefix_extractor_->Transform(prev_key)) != 0) {
+    return true;
+  }
+  // target must lie after prev_key (strictly, unless the previous seek was
+  // inclusive) for the interval to apply.
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+        prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+    return true;
+  }
+
+  if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+    // Nothing to seek on.
+    return false;
+  }
+  // If target is past the smallest immutable key we know about, the heap
+  // top is no longer correctly positioned and a real seek is required.
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+        target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+                                          : current_->key()) > 0) {
+    return true;
+  }
+  return false;
+}
+
+// Binary search over files[left, right): returns the smallest index whose
+// file's largest key is >= internal_key, or 'right' when no such file
+// exists.  Assumes 'files' is ordered by key range (levels >= 1).
+uint32_t ForwardIterator::FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right) {
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FileMetaData* f = files[mid];
+    if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          f->largest.Encode(), internal_key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 0000000..f72c9cb
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,110 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include <queue>
+
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "db/dbformat.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class LevelIterator;
+struct FileMetaData;
+
+// Orders iterators by their current key.  operator() returns true when
+// a's key is greater than b's, which turns std::priority_queue (a max-heap
+// by default) into a min-heap over keys -- see MinIterHeap below.
+class MinIterComparator {
+ public:
+  explicit MinIterComparator(const Comparator* comparator) :
+    comparator_(comparator) {}
+
+  bool operator()(Iterator* a, Iterator* b) {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+ private:
+  const Comparator* comparator_;
+};
+
+typedef std::priority_queue<Iterator*,
+          std::vector<Iterator*>,
+          MinIterComparator> MinIterHeap;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called; subsequent Next() calls do not observe values
+ * written after that point.
+ */
+class ForwardIterator : public Iterator {
+ public:
+  ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr);
+  virtual ~ForwardIterator();
+
+  // Backward iteration is not supported; both operations only record a
+  // NotSupported status and invalidate the iterator.
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("ForwardIterator::Prev");
+    valid_ = false;
+  }
+
+  virtual bool Valid() const override;
+  void SeekToFirst() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+
+ private:
+  void Cleanup(bool release_sv);
+  void RebuildIterators(bool refresh_sv);
+  void ResetIncompleteIterators();
+  void SeekInternal(const Slice& internal_key, bool seek_to_first);
+  void UpdateCurrent();
+  bool NeedToSeekImmutable(const Slice& internal_key);
+  uint32_t FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right);
+
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  const SliceTransform* const prefix_extractor_;
+  const Comparator* user_comparator_;
+  // Min-heap over the currently valid immutable child iterators.
+  MinIterHeap immutable_min_heap_;
+
+  SuperVersion* sv_;  // referenced superversion; may be nullptr before seek
+  Iterator* mutable_iter_;  // arena-allocated memtable iterator
+  std::vector<Iterator*> imm_iters_;  // arena-allocated, one per imm memtable
+  std::vector<Iterator*> l0_iters_;  // heap-allocated, one per L0 file
+  std::vector<LevelIterator*> level_iters_;  // per level >= 1; may be nullptr
+  Iterator* current_;  // child iterator currently holding the smallest key
+  // internal iterator status
+  Status status_;
+  Status immutable_status_;  // sticky error collected from immutable children
+  bool valid_;
+
+  // Last seek target / last-advanced immutable key; used by
+  // NeedToSeekImmutable() to avoid redundant immutable seeks.
+  IterKey prev_key_;
+  bool is_prev_set_;
+  bool is_prev_inclusive_;
+  Arena arena_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc
index e8b22a7..e6eb9fb 100644
--- a/src/rocksdb/db/internal_stats.cc
+++ b/src/rocksdb/db/internal_stats.cc
@@ -7,48 +7,242 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "db/internal_stats.h"
-#include "db/column_family.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
 #include <vector>
+#include "db/column_family.h"
+
+#include "db/db_impl.h"
+#include "util/string_util.h"
 
 namespace rocksdb {
 
-DBPropertyType GetPropertyType(const Slice& property) {
+#ifndef ROCKSDB_LITE
+namespace {
+const double kMB = 1048576.0;
+const double kGB = kMB * 1024;
+
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) {
+  snprintf(
+      buf, len,
+      "\n** Compaction Stats [%s] **\n"
+      "Level    Files   Size(MB) Score Read(GB)  Rn(GB) Rnp1(GB) "
+      "Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) "
+      "Comp(sec) Comp(cnt) Avg(sec) "
+      "Stall(cnt)  KeyIn KeyDrop\n"
+      "--------------------------------------------------------------------"
+      "-----------------------------------------------------------"
+      "--------------------------------------\n",
+      cf_name.c_str());
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+    int num_files, int being_compacted, double total_file_size, double score,
+    double w_amp, uint64_t stalls,
+    const InternalStats::CompactionStats& stats) {
+  uint64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1;
+  int64_t bytes_new = stats.bytes_written - stats.bytes_readnp1;
+  double elapsed = (stats.micros + 1) / 1000000.0;
+  std::string num_input_records = NumberToHumanString(stats.num_input_records);
+  std::string num_dropped_records =
+      NumberToHumanString(stats.num_dropped_records);
+
+  snprintf(buf, len,
+           "%4s %6d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */
+           "%8.1f "                    /* Read(GB) */
+           "%7.1f "                    /* Rn(GB) */
+           "%8.1f "                    /* Rnp1(GB) */
+           "%9.1f "                    /* Write(GB) */
+           "%8.1f "                    /* Wnew(GB) */
+           "%9.1f "                    /* Moved(GB) */
+           "%5.1f "                    /* W-Amp */
+           "%8.1f "                    /* Rd(MB/s) */
+           "%8.1f "                    /* Wr(MB/s) */
+           "%9.0f "                    /* Comp(sec) */
+           "%9d "                      /* Comp(cnt) */
+           "%8.3f "                    /* Avg(sec) */
+           "%10" PRIu64
+           " "      /* Stall(cnt) */
+           "%7s "   /* KeyIn */
+           "%6s\n", /* KeyDrop */
+           name.c_str(), num_files, being_compacted, total_file_size / kMB,
+           score, bytes_read / kGB, stats.bytes_readn / kGB,
+           stats.bytes_readnp1 / kGB, stats.bytes_written / kGB,
+           bytes_new / kGB, stats.bytes_moved / kGB,
+           w_amp, bytes_read / kMB / elapsed,
+           stats.bytes_written / kMB / elapsed, stats.micros / 1000000.0,
+           stats.count,
+           stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count,
+           stalls,
+           num_input_records.c_str(), num_dropped_records.c_str());
+}
+}
+
+static const std::string rocksdb_prefix = "rocksdb.";
+
+static const std::string num_files_at_level_prefix = "num-files-at-level";
+static const std::string allstats = "stats";
+static const std::string sstables = "sstables";
+static const std::string cfstats = "cfstats";
+static const std::string dbstats = "dbstats";
+static const std::string levelstats = "levelstats";
+static const std::string num_immutable_mem_table = "num-immutable-mem-table";
+static const std::string mem_table_flush_pending = "mem-table-flush-pending";
+static const std::string compaction_pending = "compaction-pending";
+static const std::string background_errors = "background-errors";
+static const std::string cur_size_active_mem_table =
+                          "cur-size-active-mem-table";
+static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables";
+static const std::string num_entries_active_mem_table =
+                          "num-entries-active-mem-table";
+static const std::string num_entries_imm_mem_tables =
+                          "num-entries-imm-mem-tables";
+static const std::string num_deletes_active_mem_table =
+                          "num-deletes-active-mem-table";
+static const std::string num_deletes_imm_mem_tables =
+                          "num-deletes-imm-mem-tables";
+static const std::string estimate_num_keys = "estimate-num-keys";
+static const std::string estimate_table_readers_mem =
+                          "estimate-table-readers-mem";
+static const std::string is_file_deletions_enabled =
+                          "is-file-deletions-enabled";
+static const std::string num_snapshots = "num-snapshots";
+static const std::string oldest_snapshot_time = "oldest-snapshot-time";
+static const std::string num_live_versions = "num-live-versions";
+static const std::string base_level = "base-level";
+
+const std::string DB::Properties::kNumFilesAtLevelPrefix =
+                      rocksdb_prefix + num_files_at_level_prefix;
+const std::string DB::Properties::kStats = rocksdb_prefix + allstats;
+const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables;
+const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats;
+const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats;
+const std::string DB::Properties::kNumImmutableMemTable =
+                      rocksdb_prefix + num_immutable_mem_table;
+const std::string DB::Properties::kMemTableFlushPending =
+                      rocksdb_prefix + mem_table_flush_pending;
+const std::string DB::Properties::kCompactionPending =
+                      rocksdb_prefix + compaction_pending;
+const std::string DB::Properties::kBackgroundErrors =
+                      rocksdb_prefix + background_errors;
+const std::string DB::Properties::kCurSizeActiveMemTable =
+                      rocksdb_prefix + cur_size_active_mem_table;
+const std::string DB::Properties::kCurSizeAllMemTables =
+                      rocksdb_prefix + cur_size_all_mem_tables;
+const std::string DB::Properties::kNumEntriesActiveMemTable =
+                      rocksdb_prefix + num_entries_active_mem_table;
+const std::string DB::Properties::kNumEntriesImmMemTables =
+                      rocksdb_prefix + num_entries_imm_mem_tables;
+const std::string DB::Properties::kNumDeletesActiveMemTable =
+                      rocksdb_prefix + num_deletes_active_mem_table;
+const std::string DB::Properties::kNumDeletesImmMemTables =
+                      rocksdb_prefix + num_deletes_imm_mem_tables;
+const std::string DB::Properties::kEstimateNumKeys =
+                      rocksdb_prefix + estimate_num_keys;
+const std::string DB::Properties::kEstimateTableReadersMem =
+                      rocksdb_prefix + estimate_table_readers_mem;
+const std::string DB::Properties::kIsFileDeletionsEnabled =
+                      rocksdb_prefix + is_file_deletions_enabled;
+const std::string DB::Properties::kNumSnapshots =
+                      rocksdb_prefix + num_snapshots;
+const std::string DB::Properties::kOldestSnapshotTime =
+                      rocksdb_prefix + oldest_snapshot_time;
+const std::string DB::Properties::kNumLiveVersions =
+                      rocksdb_prefix + num_live_versions;
+
+DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
+                               bool* need_out_of_mutex) {
+  assert(is_int_property != nullptr);
+  assert(need_out_of_mutex != nullptr);
   Slice in = property;
-  Slice prefix("rocksdb.");
-  if (!in.starts_with(prefix)) return kUnknown;
+  Slice prefix(rocksdb_prefix);
+  *need_out_of_mutex = false;
+  *is_int_property = false;
+  if (!in.starts_with(prefix)) {
+    return kUnknown;
+  }
   in.remove_prefix(prefix.size());
 
-  if (in.starts_with("num-files-at-level")) {
+  if (in.starts_with(num_files_at_level_prefix)) {
     return kNumFilesAtLevel;
-  } else if (in == "levelstats") {
+  } else if (in == levelstats) {
     return kLevelStats;
-  } else if (in == "stats") {
+  } else if (in == allstats) {
     return kStats;
-  } else if (in == "sstables") {
+  } else if (in == cfstats) {
+    return kCFStats;
+  } else if (in == dbstats) {
+    return kDBStats;
+  } else if (in == sstables) {
     return kSsTables;
-  } else if (in == "num-immutable-mem-table") {
+  }
+
+  *is_int_property = true;
+  if (in == num_immutable_mem_table) {
     return kNumImmutableMemTable;
-  } else if (in == "mem-table-flush-pending") {
+  } else if (in == mem_table_flush_pending) {
     return kMemtableFlushPending;
-  } else if (in == "compaction-pending") {
+  } else if (in == compaction_pending) {
     return kCompactionPending;
-  } else if (in == "background-errors") {
+  } else if (in == background_errors) {
     return kBackgroundErrors;
-  } else if (in == "cur-size-active-mem-table") {
+  } else if (in == cur_size_active_mem_table) {
     return kCurSizeActiveMemTable;
-  } else if (in == "num-entries-active-mem-table") {
+  } else if (in == cur_size_all_mem_tables) {
+    return kCurSizeAllMemTables;
+  } else if (in == num_entries_active_mem_table) {
     return kNumEntriesInMutableMemtable;
-  } else if (in == "num-entries-imm-mem-tables") {
+  } else if (in == num_entries_imm_mem_tables) {
     return kNumEntriesInImmutableMemtable;
+  } else if (in == num_deletes_active_mem_table) {
+    return kNumDeletesInMutableMemtable;
+  } else if (in == num_deletes_imm_mem_tables) {
+    return kNumDeletesInImmutableMemtable;
+  } else if (in == estimate_num_keys) {
+    return kEstimatedNumKeys;
+  } else if (in == estimate_table_readers_mem) {
+    *need_out_of_mutex = true;
+    return kEstimatedUsageByTableReaders;
+  } else if (in == is_file_deletions_enabled) {
+    return kIsFileDeletionEnabled;
+  } else if (in == num_snapshots) {
+    return kNumSnapshots;
+  } else if (in == oldest_snapshot_time) {
+    return kOldestSnapshotTime;
+  } else if (in == num_live_versions) {
+    return kNumLiveVersions;
+  } else if (in == base_level) {
+    return kBaseLevel;
   }
   return kUnknown;
 }
 
-bool InternalStats::GetProperty(DBPropertyType property_type,
-                                const Slice& property, std::string* value,
-                                ColumnFamilyData* cfd) {
-  Version* current = cfd->current();
+bool InternalStats::GetIntPropertyOutOfMutex(DBPropertyType property_type,
+                                             Version* version,
+                                             uint64_t* value) const {
+  assert(value != nullptr);
+  if (property_type != kEstimatedUsageByTableReaders) {
+    return false;
+  }
+  if (version == nullptr) {
+    *value = 0;
+  } else {
+    *value = version->GetMemoryUsageByTableReaders();
+  }
+  return true;
+}
+
+bool InternalStats::GetStringProperty(DBPropertyType property_type,
+                                      const Slice& property,
+                                      std::string* value) {
+  assert(value != nullptr);
+  auto* current = cfd_->current();
+  const auto* vstorage = current->storage_info();
   Slice in = property;
 
   switch (property_type) {
@@ -61,7 +255,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
       } else {
         char buf[100];
         snprintf(buf, sizeof(buf), "%d",
-                 current->NumLevelFiles(static_cast<int>(level)));
+                 vstorage->NumLevelFiles(static_cast<int>(level)));
         *value = buf;
         return true;
       }
@@ -75,295 +269,387 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
 
       for (int level = 0; level < number_levels_; level++) {
         snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
-                 current->NumLevelFiles(level),
-                 current->NumLevelBytes(level) / 1048576.0);
+                 vstorage->NumLevelFiles(level),
+                 vstorage->NumLevelBytes(level) / kMB);
         value->append(buf);
       }
       return true;
     }
     case kStats: {
-      char buf[1000];
-
-      uint64_t wal_bytes = 0;
-      uint64_t wal_synced = 0;
-      uint64_t user_bytes_written = 0;
-      uint64_t write_other = 0;
-      uint64_t write_self = 0;
-      uint64_t write_with_wal = 0;
-      uint64_t total_bytes_written = 0;
-      uint64_t total_bytes_read = 0;
-      uint64_t micros_up = env_->NowMicros() - started_at_;
-      // Add "+1" to make sure seconds_up is > 0 and avoid NaN later
-      double seconds_up = (micros_up + 1) / 1000000.0;
-      uint64_t total_slowdown = 0;
-      uint64_t total_slowdown_count = 0;
-      uint64_t interval_bytes_written = 0;
-      uint64_t interval_bytes_read = 0;
-      uint64_t interval_bytes_new = 0;
-      double interval_seconds_up = 0;
-
-      if (statistics_) {
-        wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES);
-        wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED);
-        user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN);
-        write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER);
-        write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF);
-        write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
+      if (!GetStringProperty(kCFStats, DB::Properties::kCFStats, value)) {
+        return false;
       }
-
-      snprintf(
-          buf, sizeof(buf),
-          "                               Compactions\n"
-          "Level  Files Size(MB) Score Time(sec)  Read(MB) Write(MB)    Rn(MB) "
-          " "
-          "Rnp1(MB)  Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s)      Rn     "
-          "Rnp1 "
-          "    Wnp1     NewW    Count   msComp   msStall  Ln-stall Stall-cnt\n"
-          "--------------------------------------------------------------------"
-          "--"
-          "--------------------------------------------------------------------"
-          "--"
-          "----------------------------------------------------------------\n");
-      value->append(buf);
-      for (int level = 0; level < number_levels_; level++) {
-        int files = current->NumLevelFiles(level);
-        if (compaction_stats_[level].micros > 0 || files > 0) {
-          int64_t bytes_read = compaction_stats_[level].bytes_readn +
-                               compaction_stats_[level].bytes_readnp1;
-          int64_t bytes_new = compaction_stats_[level].bytes_written -
-                              compaction_stats_[level].bytes_readnp1;
-          double amplify =
-              (compaction_stats_[level].bytes_readn == 0)
-                  ? 0.0
-                  : (compaction_stats_[level].bytes_written +
-                     compaction_stats_[level].bytes_readnp1 +
-                     compaction_stats_[level].bytes_readn) /
-                        (double)compaction_stats_[level].bytes_readn;
-
-          total_bytes_read += bytes_read;
-          total_bytes_written += compaction_stats_[level].bytes_written;
-
-          uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] +
-                                          stall_counts_[LEVEL0_NUM_FILES] +
-                                          stall_counts_[MEMTABLE_COMPACTION])
-                                       : stall_leveln_slowdown_count_[level];
-
-          double stall_us = level == 0 ? (stall_micros_[LEVEL0_SLOWDOWN] +
-                                          stall_micros_[LEVEL0_NUM_FILES] +
-                                          stall_micros_[MEMTABLE_COMPACTION])
-                                       : stall_leveln_slowdown_[level];
-
-          snprintf(buf, sizeof(buf),
-                   "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f "
-                   "%10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %8d %9.1f %9.1f "
-                   "%9lu\n",
-                   level, files, current->NumLevelBytes(level) / 1048576.0,
-                   current->NumLevelBytes(level) /
-                       cfd->compaction_picker()->MaxBytesForLevel(level),
-                   compaction_stats_[level].micros / 1e6,
-                   bytes_read / 1048576.0,
-                   compaction_stats_[level].bytes_written / 1048576.0,
-                   compaction_stats_[level].bytes_readn / 1048576.0,
-                   compaction_stats_[level].bytes_readnp1 / 1048576.0,
-                   bytes_new / 1048576.0, amplify,
-                   // +1 to avoid division by 0
-                   (bytes_read / 1048576.0) /
-                       ((compaction_stats_[level].micros + 1) / 1000000.0),
-                   (compaction_stats_[level].bytes_written / 1048576.0) /
-                       ((compaction_stats_[level].micros + 1) / 1000000.0),
-                   compaction_stats_[level].files_in_leveln,
-                   compaction_stats_[level].files_in_levelnp1,
-                   compaction_stats_[level].files_out_levelnp1,
-                   compaction_stats_[level].files_out_levelnp1 -
-                       compaction_stats_[level].files_in_levelnp1,
-                   compaction_stats_[level].count,
-                   (int)((double)compaction_stats_[level].micros / 1000.0 /
-                         (compaction_stats_[level].count + 1)),
-                   (double)stall_us / 1000.0 / (stalls + 1),
-                   stall_us / 1000000.0, (unsigned long)stalls);
-          total_slowdown += stall_leveln_slowdown_[level];
-          total_slowdown_count += stall_leveln_slowdown_count_[level];
-          value->append(buf);
-        }
+      if (!GetStringProperty(kDBStats, DB::Properties::kDBStats, value)) {
+        return false;
       }
-
-      interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_;
-      interval_bytes_read =
-          total_bytes_read - last_stats_.compaction_bytes_read_;
-      interval_bytes_written =
-          total_bytes_written - last_stats_.compaction_bytes_written_;
-      interval_seconds_up = seconds_up - last_stats_.seconds_up_;
-
-      snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
-               seconds_up, interval_seconds_up);
-      value->append(buf);
-
-      snprintf(buf, sizeof(buf),
-               "Writes cumulative: %llu total, %llu batches, "
-               "%.1f per batch, %.2f ingest GB\n",
-               (unsigned long long)(write_other + write_self),
-               (unsigned long long)write_self,
-               (write_other + write_self) / (double)(write_self + 1),
-               user_bytes_written / (1048576.0 * 1024));
-      value->append(buf);
-
-      snprintf(buf, sizeof(buf),
-               "WAL cumulative: %llu WAL writes, %llu WAL syncs, "
-               "%.2f writes per sync, %.2f GB written\n",
-               (unsigned long long)write_with_wal,
-               (unsigned long long)wal_synced,
-               write_with_wal / (double)(wal_synced + 1),
-               wal_bytes / (1048576.0 * 1024));
-      value->append(buf);
-
-      snprintf(buf, sizeof(buf),
-               "Compaction IO cumulative (GB): "
-               "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
-               user_bytes_written / (1048576.0 * 1024),
-               total_bytes_read / (1048576.0 * 1024),
-               total_bytes_written / (1048576.0 * 1024),
-               (total_bytes_read + total_bytes_written) / (1048576.0 * 1024));
-      value->append(buf);
-
-      snprintf(
-          buf, sizeof(buf),
-          "Compaction IO cumulative (MB/sec): "
-          "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
-          user_bytes_written / 1048576.0 / seconds_up,
-          total_bytes_read / 1048576.0 / seconds_up,
-          total_bytes_written / 1048576.0 / seconds_up,
-          (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up);
-      value->append(buf);
-
-      // +1 to avoid divide by 0 and NaN
-      snprintf(
-          buf, sizeof(buf),
-          "Amplification cumulative: %.1f write, %.1f compaction\n",
-          (double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1),
-          (double)(total_bytes_written + total_bytes_read + wal_bytes) /
-              (user_bytes_written + 1));
-      value->append(buf);
-
-      uint64_t interval_write_other = write_other - last_stats_.write_other_;
-      uint64_t interval_write_self = write_self - last_stats_.write_self_;
-
-      snprintf(buf, sizeof(buf),
-               "Writes interval: %llu total, %llu batches, "
-               "%.1f per batch, %.1f ingest MB\n",
-               (unsigned long long)(interval_write_other + interval_write_self),
-               (unsigned long long)interval_write_self,
-               (double)(interval_write_other + interval_write_self) /
-                   (interval_write_self + 1),
-               (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0);
-      value->append(buf);
-
-      uint64_t interval_write_with_wal =
-          write_with_wal - last_stats_.write_with_wal_;
-
-      uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_;
-      uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_;
-
-      snprintf(buf, sizeof(buf),
-               "WAL interval: %llu WAL writes, %llu WAL syncs, "
-               "%.2f writes per sync, %.2f MB written\n",
-               (unsigned long long)interval_write_with_wal,
-               (unsigned long long)interval_wal_synced,
-               interval_write_with_wal / (double)(interval_wal_synced + 1),
-               interval_wal_bytes / (1048576.0 * 1024));
-      value->append(buf);
-
-      snprintf(buf, sizeof(buf),
-               "Compaction IO interval (MB): "
-               "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
-               interval_bytes_new / 1048576.0, interval_bytes_read / 1048576.0,
-               interval_bytes_written / 1048576.0,
-               (interval_bytes_read + interval_bytes_written) / 1048576.0);
-      value->append(buf);
-
-      snprintf(buf, sizeof(buf),
-               "Compaction IO interval (MB/sec): "
-               "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
-               interval_bytes_new / 1048576.0 / interval_seconds_up,
-               interval_bytes_read / 1048576.0 / interval_seconds_up,
-               interval_bytes_written / 1048576.0 / interval_seconds_up,
-               (interval_bytes_read + interval_bytes_written) / 1048576.0 /
-                   interval_seconds_up);
-      value->append(buf);
-
-      // +1 to avoid divide by 0 and NaN
-      snprintf(
-          buf, sizeof(buf),
-          "Amplification interval: %.1f write, %.1f compaction\n",
-          (double)(interval_bytes_written + wal_bytes) /
-              (interval_bytes_new + 1),
-          (double)(interval_bytes_written + interval_bytes_read + wal_bytes) /
-              (interval_bytes_new + 1));
-      value->append(buf);
-
-      snprintf(buf, sizeof(buf),
-               "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, "
-               "%.3f memtable_compaction, %.3f leveln_slowdown\n",
-               stall_micros_[LEVEL0_SLOWDOWN] / 1000000.0,
-               stall_micros_[LEVEL0_NUM_FILES] / 1000000.0,
-               stall_micros_[MEMTABLE_COMPACTION] / 1000000.0,
-               total_slowdown / 1000000.0);
-      value->append(buf);
-
-      snprintf(buf, sizeof(buf),
-               "Stalls(count): %lu level0_slowdown, %lu level0_numfiles, "
-               "%lu memtable_compaction, %lu leveln_slowdown\n",
-               (unsigned long)stall_counts_[LEVEL0_SLOWDOWN],
-               (unsigned long)stall_counts_[LEVEL0_NUM_FILES],
-               (unsigned long)stall_counts_[MEMTABLE_COMPACTION],
-               (unsigned long)total_slowdown_count);
-      value->append(buf);
-
-      last_stats_.compaction_bytes_read_ = total_bytes_read;
-      last_stats_.compaction_bytes_written_ = total_bytes_written;
-      last_stats_.ingest_bytes_ = user_bytes_written;
-      last_stats_.seconds_up_ = seconds_up;
-      last_stats_.wal_bytes_ = wal_bytes;
-      last_stats_.wal_synced_ = wal_synced;
-      last_stats_.write_with_wal_ = write_with_wal;
-      last_stats_.write_other_ = write_other;
-      last_stats_.write_self_ = write_self;
-
+      return true;
+    }
+    case kCFStats: {
+      DumpCFStats(value);
+      return true;
+    }
+    case kDBStats: {
+      DumpDBStats(value);
       return true;
     }
     case kSsTables:
       *value = current->DebugString();
       return true;
+    default:
+      return false;
+  }
+}
+
+bool InternalStats::GetIntProperty(DBPropertyType property_type,
+                                   uint64_t* value, DBImpl* db) const {
+  db->mutex_.AssertHeld();
+  const auto* vstorage = cfd_->current()->storage_info();
+
+  switch (property_type) {
     case kNumImmutableMemTable:
-      *value = std::to_string(cfd->imm()->size());
+      *value = cfd_->imm()->size();
       return true;
     case kMemtableFlushPending:
       // Return number of mem tables that are ready to flush (made immutable)
-      *value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
+      *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
       return true;
     case kCompactionPending:
       // 1 if the system already determines at least one compacdtion is needed.
       // 0 otherwise,
-      *value = std::to_string(current->NeedsCompaction() ? 1 : 0);
+      *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
       return true;
     case kBackgroundErrors:
       // Accumulated number of  errors in background flushes or compactions.
-      *value = std::to_string(GetBackgroundErrorCount());
+      *value = GetBackgroundErrorCount();
       return true;
     case kCurSizeActiveMemTable:
       // Current size of the active memtable
-      *value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
+      *value = cfd_->mem()->ApproximateMemoryUsage();
+      return true;
+    case kCurSizeAllMemTables:
+      // Current size of the active memtable + immutable memtables
+      *value = cfd_->mem()->ApproximateMemoryUsage() +
+               cfd_->imm()->ApproximateMemoryUsage();
       return true;
     case kNumEntriesInMutableMemtable:
-      // Current size of the active memtable
-      *value = std::to_string(cfd->mem()->GetNumEntries());
+      // Current number of entries in the active memtable
+      *value = cfd_->mem()->num_entries();
       return true;
     case kNumEntriesInImmutableMemtable:
-      // Current size of the active memtable
-      *value = std::to_string(cfd->imm()->current()->GetTotalNumEntries());
+      // Current number of entries in the immutable memtables
+      *value = cfd_->imm()->current()->GetTotalNumEntries();
+      return true;
+    case kNumDeletesInMutableMemtable:
+      // Current number of deletes in the active memtable
+      *value = cfd_->mem()->num_deletes();
+      return true;
+    case kNumDeletesInImmutableMemtable:
+      // Current number of entries in the immutable memtables
+      *value = cfd_->imm()->current()->GetTotalNumDeletes();
+      return true;
+    case kEstimatedNumKeys:
+      // Estimate number of entries in the column family:
+      // Use estimated entries in tables + total entries in memtables.
+      *value = cfd_->mem()->num_entries() +
+               cfd_->imm()->current()->GetTotalNumEntries() -
+               (cfd_->mem()->num_deletes() +
+                cfd_->imm()->current()->GetTotalNumDeletes()) *
+                   2 +
+               vstorage->GetEstimatedActiveKeys();
+      return true;
+    case kNumSnapshots:
+      *value = db->snapshots().count();
+      return true;
+    case kOldestSnapshotTime:
+      *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+      return true;
+    case kNumLiveVersions:
+      *value = cfd_->GetNumLiveVersions();
+      return true;
+#ifndef ROCKSDB_LITE
+    case kIsFileDeletionEnabled:
+      *value = db->IsFileDeletionsEnabled();
+      return true;
+#endif
+    case kBaseLevel:
+      *value = vstorage->base_level();
       return true;
     default:
       return false;
   }
 }
 
+void InternalStats::DumpDBStats(std::string* value) {
+  char buf[1000];
+  // DB-level stats, only available from default column family
+  double seconds_up = (env_->NowMicros() - started_at_ + 1) / 1000000.0;
+  double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+  snprintf(buf, sizeof(buf),
+           "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+           seconds_up, interval_seconds_up);
+  value->append(buf);
+  // Cumulative
+  uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN];
+  uint64_t num_keys_written = db_stats_[InternalStats::NUMBER_KEYS_WRITTEN];
+  uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER];
+  uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF];
+  uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES];
+  uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED];
+  uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL];
+  uint64_t write_stall_micros = db_stats_[InternalStats::WRITE_STALL_MICROS];
+  uint64_t compact_bytes_read = 0;
+  uint64_t compact_bytes_write = 0;
+  uint64_t compact_micros = 0;
+
+  const int kHumanMicrosLen = 32;
+  char human_micros[kHumanMicrosLen];
+
+  // Data
+  // writes: total number of write requests.
+  // keys: total number of key updates issued by all the write requests
+  // batches: number of group commits issued to the DB. Each group can contain
+  //          one or more writes.
+  // so writes/keys is the average number of put in multi-put or put
+  // writes/batches is the average group commit size.
+  //
+  // The format is the same for interval stats.
+  snprintf(buf, sizeof(buf),
+           "Cumulative writes: %s writes, %s keys, %s batches, "
+           "%.1f writes per batch, ingest: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_other + write_self).c_str(),
+           NumberToHumanString(num_keys_written).c_str(),
+           NumberToHumanString(write_self).c_str(),
+           (write_other + write_self) / static_cast<double>(write_self + 1),
+           user_bytes_written / kGB, user_bytes_written / kMB / seconds_up);
+  value->append(buf);
+  // WAL
+  snprintf(buf, sizeof(buf),
+           "Cumulative WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_with_wal).c_str(),
+           NumberToHumanString(wal_synced).c_str(),
+           write_with_wal / static_cast<double>(wal_synced + 1),
+           wal_bytes / kGB, wal_bytes / kMB / seconds_up);
+  value->append(buf);
+  // Compact
+  for (int level = 0; level < number_levels_; level++) {
+    compact_bytes_read += comp_stats_[level].bytes_readnp1 +
+                          comp_stats_[level].bytes_readn;
+    compact_bytes_write += comp_stats_[level].bytes_written;
+    compact_micros += comp_stats_[level].micros;
+  }
+  snprintf(buf, sizeof(buf),
+           "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+           "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+           compact_bytes_write / kGB,
+           compact_bytes_write / kMB / seconds_up,
+           compact_bytes_read / kGB,
+           compact_bytes_read / kMB / seconds_up,
+           compact_micros / 1000000.0);
+  value->append(buf);
+  // Stall
+  AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf),
+           "Cumulative stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+  value->append(buf);
+
+  // Interval
+  uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+  uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+  uint64_t interval_num_keys_written =
+      num_keys_written - db_stats_snapshot_.num_keys_written;
+  snprintf(buf, sizeof(buf),
+           "Interval writes: %s writes, %s keys, %s batches, "
+           "%.1f writes per batch, ingest: %.2f MB, %.2f MB/s\n",
+           NumberToHumanString(
+               interval_write_other + interval_write_self).c_str(),
+           NumberToHumanString(interval_num_keys_written).c_str(),
+           NumberToHumanString(interval_write_self).c_str(),
+           static_cast<double>(interval_write_other + interval_write_self) /
+               (interval_write_self + 1),
+           (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+           (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+               std::max(interval_seconds_up, 0.001)),
+  value->append(buf);
+
+  uint64_t interval_write_with_wal =
+      write_with_wal - db_stats_snapshot_.write_with_wal;
+  uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+  uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+  snprintf(buf, sizeof(buf),
+           "Interval WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n",
+           NumberToHumanString(interval_write_with_wal).c_str(),
+           NumberToHumanString(interval_wal_synced).c_str(),
+           interval_write_with_wal /
+              static_cast<double>(interval_wal_synced + 1),
+           interval_wal_bytes / kGB,
+           interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  // Compaction
+  uint64_t interval_compact_bytes_write =
+      compact_bytes_write - db_stats_snapshot_.compact_bytes_write;
+  uint64_t interval_compact_bytes_read =
+      compact_bytes_read - db_stats_snapshot_.compact_bytes_read;
+  uint64_t interval_compact_micros =
+      compact_micros - db_stats_snapshot_.compact_micros;
+
+  snprintf(buf, sizeof(buf),
+           "Interval compaction: %.2f GB write, %.2f MB/s write, "
+           "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+           interval_compact_bytes_write / kGB,
+           interval_compact_bytes_write / kMB /
+               std::max(interval_seconds_up, 0.001),
+           interval_compact_bytes_read / kGB,
+           interval_compact_bytes_read / kMB /
+               std::max(interval_seconds_up, 0.001),
+           interval_compact_micros / 1000000.0);
+  value->append(buf);
+
+  // Stall
+  AppendHumanMicros(
+      write_stall_micros - db_stats_snapshot_.write_stall_micros,
+      human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf),
+           "Interval stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+               10000.0 / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  db_stats_snapshot_.seconds_up = seconds_up;
+  db_stats_snapshot_.ingest_bytes = user_bytes_written;
+  db_stats_snapshot_.write_other = write_other;
+  db_stats_snapshot_.write_self = write_self;
+  db_stats_snapshot_.num_keys_written = num_keys_written;
+  db_stats_snapshot_.wal_bytes = wal_bytes;
+  db_stats_snapshot_.wal_synced = wal_synced;
+  db_stats_snapshot_.write_with_wal = write_with_wal;
+  db_stats_snapshot_.write_stall_micros = write_stall_micros;
+  db_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+  db_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+  db_stats_snapshot_.compact_micros = compact_micros;
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+  const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+
+  int num_levels_to_check =
+      (cfd_->ioptions()->compaction_style != kCompactionStyleUniversal &&
+       cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+          ? vstorage->num_levels() - 1
+          : 1;
+
+  // Compaction scores are sorted base on its value. Restore them to the
+  // level order
+  std::vector<double> compaction_score(number_levels_, 0);
+  for (int i = 0; i < num_levels_to_check; ++i) {
+    compaction_score[vstorage->CompactionScoreLevel(i)] =
+        vstorage->CompactionScore(i);
+  }
+  // Count # of files being compacted for each level
+  std::vector<int> files_being_compacted(number_levels_, 0);
+  for (int level = 0; level < num_levels_to_check; ++level) {
+    for (auto* f : vstorage->LevelFiles(level)) {
+      if (f->being_compacted) {
+        ++files_being_compacted[level];
+      }
+    }
+  }
+
+  char buf[1000];
+  // Per-ColumnFamily stats
+  PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName());
+  value->append(buf);
+
+  CompactionStats stats_sum(0);
+  int total_files = 0;
+  int total_files_being_compacted = 0;
+  double total_file_size = 0;
+  uint64_t total_slowdown_count_soft = 0;
+  uint64_t total_slowdown_count_hard = 0;
+  uint64_t total_stall_count = 0;
+  for (int level = 0; level < number_levels_; level++) {
+    int files = vstorage->NumLevelFiles(level);
+    total_files += files;
+    total_files_being_compacted += files_being_compacted[level];
+    if (comp_stats_[level].micros > 0 || files > 0) {
+      uint64_t stalls = level == 0 ?
+        (cf_stats_count_[LEVEL0_SLOWDOWN] +
+         cf_stats_count_[LEVEL0_NUM_FILES] +
+         cf_stats_count_[MEMTABLE_COMPACTION])
+        : (stall_leveln_slowdown_count_soft_[level] +
+           stall_leveln_slowdown_count_hard_[level]);
+
+      stats_sum.Add(comp_stats_[level]);
+      total_file_size += vstorage->NumLevelBytes(level);
+      total_stall_count += stalls;
+      total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level];
+      total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level];
+      double w_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0
+          : comp_stats_[level].bytes_written /
+            static_cast<double>(comp_stats_[level].bytes_readn);
+      PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files,
+                      files_being_compacted[level],
+                      vstorage->NumLevelBytes(level), compaction_score[level],
+                      w_amp, stalls, comp_stats_[level]);
+      value->append(buf);
+    }
+  }
+  uint64_t curr_ingest = cf_stats_value_[BYTES_FLUSHED];
+  // Cumulative summary
+  double w_amp = stats_sum.bytes_written / static_cast<double>(curr_ingest + 1);
+  // Stats summary across levels
+  PrintLevelStats(buf, sizeof(buf), "Sum", total_files,
+      total_files_being_compacted, total_file_size, 0, w_amp,
+      total_stall_count, stats_sum);
+  value->append(buf);
+  // Interval summary
+  uint64_t interval_ingest =
+      curr_ingest - cf_stats_snapshot_.ingest_bytes + 1;
+  CompactionStats interval_stats(stats_sum);
+  interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+  w_amp = interval_stats.bytes_written / static_cast<double>(interval_ingest);
+  PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0,
+      w_amp, total_stall_count - cf_stats_snapshot_.stall_count,
+      interval_stats);
+  value->append(buf);
+
+  snprintf(buf, sizeof(buf),
+           "Flush(GB): cumulative %.3f, interval %.3f\n",
+           curr_ingest / kGB, interval_ingest / kGB);
+  value->append(buf);
+
+  snprintf(buf, sizeof(buf),
+           "Stalls(count): %" PRIu64 " level0_slowdown, "
+           "%" PRIu64 " level0_numfiles, %" PRIu64 " memtable_compaction, "
+           "%" PRIu64 " leveln_slowdown_soft, "
+           "%" PRIu64 " leveln_slowdown_hard\n",
+           cf_stats_count_[LEVEL0_SLOWDOWN],
+           cf_stats_count_[LEVEL0_NUM_FILES],
+           cf_stats_count_[MEMTABLE_COMPACTION],
+           total_slowdown_count_soft, total_slowdown_count_hard);
+  value->append(buf);
+
+  cf_stats_snapshot_.ingest_bytes = curr_ingest;
+  cf_stats_snapshot_.comp_stats = stats_sum;
+  cf_stats_snapshot_.stall_count = total_stall_count;
+}
+
+
+#else
+
+DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
+                               bool* need_out_of_mutex) {
+  return kUnknown;
+}
+
+#endif  // !ROCKSDB_LITE
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
index 2a74359..55f1467 100644
--- a/src/rocksdb/db/internal_stats.h
+++ b/src/rocksdb/db/internal_stats.h
@@ -9,8 +9,6 @@
 //
 
 #pragma once
-#include "rocksdb/statistics.h"
-#include "util/statistics.h"
 #include "db/version_set.h"
 
 #include <vector>
@@ -23,60 +21,113 @@ namespace rocksdb {
 class MemTableList;
 class DBImpl;
 
-enum DBPropertyType {
+// IMPORTANT: If you add a new property here, also add it to the list in
+//            include/rocksdb/db.h
+enum DBPropertyType : uint32_t {
+  kUnknown,
   kNumFilesAtLevel,  // Number of files at a specific level
   kLevelStats,       // Return number of files and total sizes of each level
-  kStats,            // Return general statitistics of DB
+  kCFStats,          // Return general statitistics of CF
+  kDBStats,          // Return general statitistics of DB
+  kStats,            // Return general statitistics of both DB and CF
   kSsTables,         // Return a human readable string of current SST files
+  kStartIntTypes,    // ---- Dummy value to indicate the start of integer values
   kNumImmutableMemTable,   // Return number of immutable mem tables
   kMemtableFlushPending,   // Return 1 if mem table flushing is pending,
                            // otherwise 0.
   kCompactionPending,      // Return 1 if a compaction is pending. Otherwise 0.
   kBackgroundErrors,       // Return accumulated background errors encountered.
   kCurSizeActiveMemTable,  // Return current size of the active memtable
-  kNumEntriesInMutableMemtable,    // Return number of entries in the mutable
+  kCurSizeAllMemTables,    // Return current size of all (active + immutable)
+                           // memtables
+  kNumEntriesInMutableMemtable,    // Return number of deletes in the mutable
                                    // memtable.
   kNumEntriesInImmutableMemtable,  // Return sum of number of entries in all
                                    // the immutable mem tables.
-  kUnknown,
+  kNumDeletesInMutableMemtable,    // Return number of entries in the mutable
+                                   // memtable.
+  kNumDeletesInImmutableMemtable,  // Return sum of number of deletes in all
+                                   // the immutable mem tables.
+  kEstimatedNumKeys,  // Estimated total number of keys in the database.
+  kEstimatedUsageByTableReaders,  // Estimated memory by table readers.
+  kIsFileDeletionEnabled,         // Equals disable_delete_obsolete_files_,
+                                  // 0 means file deletions enabled
+  kNumSnapshots,                  // Number of snapshots in the system
+  kOldestSnapshotTime,            // Unix timestamp of the first snapshot
+  kNumLiveVersions,
+  kBaseLevel,  // The level that L0 data is compacted to
 };
 
-extern DBPropertyType GetPropertyType(const Slice& property);
+extern DBPropertyType GetPropertyType(const Slice& property,
+                                      bool* is_int_property,
+                                      bool* need_out_of_mutex);
 
+
+#ifndef ROCKSDB_LITE
 class InternalStats {
  public:
-  enum WriteStallType {
+  enum InternalCFStatsType {
     LEVEL0_SLOWDOWN,
     MEMTABLE_COMPACTION,
     LEVEL0_NUM_FILES,
     WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    WAL_FILE_BYTES,
+    WAL_FILE_SYNCED,
+    BYTES_WRITTEN,
+    NUMBER_KEYS_WRITTEN,
+    WRITE_DONE_BY_OTHER,
+    WRITE_DONE_BY_SELF,
+    WRITE_WITH_WAL,
+    WRITE_STALL_MICROS,
+    INTERNAL_DB_STATS_ENUM_MAX,
   };
 
-  InternalStats(int num_levels, Env* env, Statistics* statistics)
-      : compaction_stats_(num_levels),
-        stall_micros_(WRITE_STALLS_ENUM_MAX, 0),
-        stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
-        stall_leveln_slowdown_(num_levels, 0),
-        stall_leveln_slowdown_count_(num_levels, 0),
+  InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd)
+      : db_stats_(INTERNAL_DB_STATS_ENUM_MAX),
+        cf_stats_value_(INTERNAL_CF_STATS_ENUM_MAX),
+        cf_stats_count_(INTERNAL_CF_STATS_ENUM_MAX),
+        comp_stats_(num_levels),
+        stall_leveln_slowdown_count_hard_(num_levels),
+        stall_leveln_slowdown_count_soft_(num_levels),
         bg_error_count_(0),
         number_levels_(num_levels),
-        statistics_(statistics),
         env_(env),
-        started_at_(env->NowMicros()) {}
+        cfd_(cfd),
+        started_at_(env->NowMicros()) {
+    for (int i = 0; i< INTERNAL_DB_STATS_ENUM_MAX; ++i) {
+      db_stats_[i] = 0;
+    }
+    for (int i = 0; i< INTERNAL_CF_STATS_ENUM_MAX; ++i) {
+      cf_stats_value_[i] = 0;
+      cf_stats_count_[i] = 0;
+    }
+    for (int i = 0; i < num_levels; ++i) {
+      stall_leveln_slowdown_count_hard_[i] = 0;
+      stall_leveln_slowdown_count_soft_[i] = 0;
+    }
+  }
 
-  // Per level compaction stats.  compaction_stats_[level] stores the stats for
+  // Per level compaction stats.  comp_stats_[level] stores the stats for
   // compactions that produced data for the specified "level".
   struct CompactionStats {
     uint64_t micros;
 
     // Bytes read from level N during compaction between levels N and N+1
-    int64_t bytes_readn;
+    uint64_t bytes_readn;
 
     // Bytes read from level N+1 during compaction between levels N and N+1
-    int64_t bytes_readnp1;
+    uint64_t bytes_readnp1;
 
     // Total bytes written during compaction between levels N and N+1
-    int64_t bytes_written;
+    uint64_t bytes_written;
+
+    // Total bytes moved to this level
+    uint64_t bytes_moved;
 
     // Files read from level N during compaction between levels N and N+1
     int files_in_leveln;
@@ -87,89 +138,174 @@ class InternalStats {
     // Files written during compaction between levels N and N+1
     int files_out_levelnp1;
 
+    // Total incoming entries during compaction between levels N and N+1
+    uint64_t num_input_records;
+
+    // Accumulated diff number of entries
+    // (num input entries - num output entires) for compaction  levels N and N+1
+    uint64_t num_dropped_records;
+
     // Number of compactions done
     int count;
 
-    CompactionStats()
+    explicit CompactionStats(int _count = 0)
         : micros(0),
           bytes_readn(0),
           bytes_readnp1(0),
           bytes_written(0),
+          bytes_moved(0),
           files_in_leveln(0),
           files_in_levelnp1(0),
           files_out_levelnp1(0),
-          count(0) {}
+          num_input_records(0),
+          num_dropped_records(0),
+          count(_count) {}
+
+    explicit CompactionStats(const CompactionStats& c)
+        : micros(c.micros),
+          bytes_readn(c.bytes_readn),
+          bytes_readnp1(c.bytes_readnp1),
+          bytes_written(c.bytes_written),
+          bytes_moved(c.bytes_moved),
+          files_in_leveln(c.files_in_leveln),
+          files_in_levelnp1(c.files_in_levelnp1),
+          files_out_levelnp1(c.files_out_levelnp1),
+          num_input_records(c.num_input_records),
+          num_dropped_records(c.num_dropped_records),
+          count(c.count) {}
 
     void Add(const CompactionStats& c) {
       this->micros += c.micros;
       this->bytes_readn += c.bytes_readn;
       this->bytes_readnp1 += c.bytes_readnp1;
       this->bytes_written += c.bytes_written;
+      this->bytes_moved += c.bytes_moved;
       this->files_in_leveln += c.files_in_leveln;
       this->files_in_levelnp1 += c.files_in_levelnp1;
       this->files_out_levelnp1 += c.files_out_levelnp1;
-      this->count += 1;
+      this->num_input_records += c.num_input_records;
+      this->num_dropped_records += c.num_dropped_records;
+      this->count += c.count;
+    }
+
+    void Subtract(const CompactionStats& c) {
+      this->micros -= c.micros;
+      this->bytes_readn -= c.bytes_readn;
+      this->bytes_readnp1 -= c.bytes_readnp1;
+      this->bytes_written -= c.bytes_written;
+      this->bytes_moved -= c.bytes_moved;
+      this->files_in_leveln -= c.files_in_leveln;
+      this->files_in_levelnp1 -= c.files_in_levelnp1;
+      this->files_out_levelnp1 -= c.files_out_levelnp1;
+      this->num_input_records -= c.num_input_records;
+      this->num_dropped_records -= c.num_dropped_records;
+      this->count -= c.count;
     }
   };
 
   void AddCompactionStats(int level, const CompactionStats& stats) {
-    compaction_stats_[level].Add(stats);
+    comp_stats_[level].Add(stats);
   }
 
-  void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) {
-    stall_micros_[write_stall_type] += micros;
-    stall_counts_[write_stall_type]++;
+  void IncBytesMoved(int level, uint64_t amount) {
+    comp_stats_[level].bytes_moved += amount;
   }
 
-  void RecordLevelNSlowdown(int level, uint64_t micros) {
-    stall_leveln_slowdown_[level] += micros;
-    stall_leveln_slowdown_count_[level] += micros;
+  void RecordLevelNSlowdown(int level, bool soft) {
+    if (soft) {
+      ++stall_leveln_slowdown_count_soft_[level];
+    } else {
+      ++stall_leveln_slowdown_count_hard_[level];
+    }
+  }
+
+  void AddCFStats(InternalCFStatsType type, uint64_t value) {
+    cf_stats_value_[type] += value;
+    ++cf_stats_count_[type];
+  }
+
+  void AddDBStats(InternalDBStatsType type, uint64_t value) {
+    db_stats_[type] += value;
   }
 
   uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
 
   uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
 
-  bool GetProperty(DBPropertyType property_type, const Slice& property,
-                   std::string* value, ColumnFamilyData* cfd);
+  bool GetStringProperty(DBPropertyType property_type, const Slice& property,
+                         std::string* value);
+
+  bool GetIntProperty(DBPropertyType property_type, uint64_t* value,
+                      DBImpl* db) const;
+
+  bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version,
+                                uint64_t* value) const;
 
  private:
-  std::vector<CompactionStats> compaction_stats_;
+  void DumpDBStats(std::string* value);
+  void DumpCFStats(std::string* value);
+
+  // Per-DB stats
+  std::vector<uint64_t> db_stats_;
+  // Per-ColumnFamily stats
+  std::vector<uint64_t> cf_stats_value_;
+  std::vector<uint64_t> cf_stats_count_;
+  // Per-ColumnFamily/level compaction stats
+  std::vector<CompactionStats> comp_stats_;
+  // These count the number of microseconds for which MakeRoomForWrite stalls.
+  std::vector<uint64_t> stall_leveln_slowdown_count_hard_;
+  std::vector<uint64_t> stall_leveln_slowdown_count_soft_;
 
   // Used to compute per-interval statistics
-  struct StatsSnapshot {
-    uint64_t compaction_bytes_read_;     // Bytes read by compaction
-    uint64_t compaction_bytes_written_;  // Bytes written by compaction
-    uint64_t ingest_bytes_;              // Bytes written by user
-    uint64_t wal_bytes_;                 // Bytes written to WAL
-    uint64_t wal_synced_;                // Number of times WAL is synced
-    uint64_t write_with_wal_;            // Number of writes that request WAL
+  struct CFStatsSnapshot {
+    // ColumnFamily-level stats
+    CompactionStats comp_stats;
+    uint64_t ingest_bytes;            // Bytes written to L0
+    uint64_t stall_count;             // Stall count
+
+    CFStatsSnapshot()
+        : comp_stats(0),
+          ingest_bytes(0),
+          stall_count(0) {}
+  } cf_stats_snapshot_;
+
+  struct DBStatsSnapshot {
+    // DB-level stats
+    uint64_t ingest_bytes;            // Bytes written by user
+    uint64_t wal_bytes;               // Bytes written to WAL
+    uint64_t wal_synced;              // Number of times WAL is synced
+    uint64_t write_with_wal;          // Number of writes that request WAL
     // These count the number of writes processed by the calling thread or
     // another thread.
-    uint64_t write_other_;
-    uint64_t write_self_;
-    double seconds_up_;
-
-    StatsSnapshot()
-        : compaction_bytes_read_(0),
-          compaction_bytes_written_(0),
-          ingest_bytes_(0),
-          wal_bytes_(0),
-          wal_synced_(0),
-          write_with_wal_(0),
-          write_other_(0),
-          write_self_(0),
-          seconds_up_(0) {}
-  };
-
-  // Counters from the previous time per-interval stats were computed
-  StatsSnapshot last_stats_;
-
-  // These count the number of microseconds for which MakeRoomForWrite stalls.
-  std::vector<uint64_t> stall_micros_;
-  std::vector<uint64_t> stall_counts_;
-  std::vector<uint64_t> stall_leveln_slowdown_;
-  std::vector<uint64_t> stall_leveln_slowdown_count_;
+    uint64_t write_other;
+    uint64_t write_self;
+    // Stats from compaction jobs - bytes written, bytes read, duration.
+    uint64_t compact_bytes_write;
+    uint64_t compact_bytes_read;
+    uint64_t compact_micros;
+    // Total number of keys written. write_self and write_other measure number
+    // of write requests written, Each of the write request can contain updates
+    // to multiple keys. num_keys_written is total number of keys updated by all
+    // those writes.
+    uint64_t num_keys_written;
+    // Total time writes delayed by stalls.
+    uint64_t write_stall_micros;
+    double seconds_up;
+
+    DBStatsSnapshot()
+        : ingest_bytes(0),
+          wal_bytes(0),
+          wal_synced(0),
+          write_with_wal(0),
+          write_other(0),
+          write_self(0),
+          compact_bytes_write(0),
+          compact_bytes_read(0),
+          compact_micros(0),
+          num_keys_written(0),
+          write_stall_micros(0),
+          seconds_up(0) {}
+  } db_stats_snapshot_;
 
   // Total number of background errors encountered. Every time a flush task
   // or compaction task fails, this counter is incremented. The failure can
@@ -178,10 +314,84 @@ class InternalStats {
   // or compaction will cause the counter to increase too.
   uint64_t bg_error_count_;
 
-  int number_levels_;
-  Statistics* statistics_;
+  const int number_levels_;
   Env* env_;
-  uint64_t started_at_;
+  ColumnFamilyData* cfd_;
+  const uint64_t started_at_;
+};
+
+#else
+
+class InternalStats {
+ public:
+  enum InternalCFStatsType {
+    LEVEL0_SLOWDOWN,
+    MEMTABLE_COMPACTION,
+    LEVEL0_NUM_FILES,
+    WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    WAL_FILE_BYTES,
+    WAL_FILE_SYNCED,
+    BYTES_WRITTEN,
+    NUMBER_KEYS_WRITTEN,
+    WRITE_DONE_BY_OTHER,
+    WRITE_DONE_BY_SELF,
+    WRITE_WITH_WAL,
+    WRITE_STALL_MICROS,
+    INTERNAL_DB_STATS_ENUM_MAX,
+  };
+
+  InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) {}
+
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t bytes_readn;
+    uint64_t bytes_readnp1;
+    uint64_t bytes_written;
+    uint64_t bytes_moved;
+    int files_in_leveln;
+    int files_in_levelnp1;
+    int files_out_levelnp1;
+    uint64_t num_input_records;
+    uint64_t num_dropped_records;
+    int count;
+
+    explicit CompactionStats(int _count = 0) {}
+
+    explicit CompactionStats(const CompactionStats& c) {}
+
+    void Add(const CompactionStats& c) {}
+
+    void Subtract(const CompactionStats& c) {}
+  };
+
+  void AddCompactionStats(int level, const CompactionStats& stats) {}
+
+  void IncBytesMoved(int level, uint64_t amount) {}
+
+  void RecordLevelNSlowdown(int level, bool soft) {}
+
+  void AddCFStats(InternalCFStatsType type, uint64_t value) {}
+
+  void AddDBStats(InternalDBStatsType type, uint64_t value) {}
+
+  uint64_t GetBackgroundErrorCount() const { return 0; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+  bool GetStringProperty(DBPropertyType property_type, const Slice& property,
+                         std::string* value) { return false; }
+
+  bool GetIntProperty(DBPropertyType property_type, uint64_t* value,
+                      DBImpl* db) const { return false; }
+
+  bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version,
+                                uint64_t* value) const { return false; }
 };
+#endif  // !ROCKSDB_LITE
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 0000000..d028144
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,115 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/log_writer.h"
+
+namespace rocksdb {
+
+class MemTable;
+
+struct JobContext {
+  inline bool HaveSomethingToDelete() const {
+    return full_scan_candidate_files.size() || sst_delete_files.size() ||
+           log_delete_files.size() || new_superversion != nullptr ||
+           superversions_to_free.size() > 0 || memtables_to_free.size() > 0 ||
+           logs_to_free.size() > 0;
+  }
+
+  // Structure to store information for candidate files to delete.
+  struct CandidateFileInfo {
+    std::string file_name;
+    uint32_t path_id;
+    CandidateFileInfo(std::string name, uint32_t path)
+        : file_name(std::move(name)), path_id(path) {}
+    bool operator==(const CandidateFileInfo& other) const {
+      return file_name == other.file_name && path_id == other.path_id;
+    }
+  };
+
+  // Unique job id
+  int job_id;
+
+  // a list of all files that we'll consider deleting
+  // (every once in a while this is filled up with all files
+  // in the DB directory)
+  // (filled only if we're doing full scan)
+  std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+  // the list of all live sst files that cannot be deleted
+  std::vector<FileDescriptor> sst_live;
+
+  // a list of sst files that we need to delete
+  std::vector<FileMetaData*> sst_delete_files;
+
+  // a list of log files that we need to delete
+  std::vector<uint64_t> log_delete_files;
+
+  // a list of memtables to be free
+  autovector<MemTable*> memtables_to_free;
+
+  autovector<SuperVersion*> superversions_to_free;
+
+  autovector<log::Writer*> logs_to_free;
+
+  SuperVersion* new_superversion;  // if nullptr no new superversion
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that corresponds to the set of files in 'live'.
+  uint64_t manifest_file_number;
+  uint64_t pending_manifest_file_number;
+  uint64_t log_number;
+  uint64_t prev_log_number;
+
+  uint64_t min_pending_output = 0;
+
+  explicit JobContext(int _job_id, bool create_superversion = false) {
+    job_id = _job_id;
+    manifest_file_number = 0;
+    pending_manifest_file_number = 0;
+    log_number = 0;
+    prev_log_number = 0;
+    new_superversion = create_superversion ? new SuperVersion() : nullptr;
+  }
+
+  void Clean() {
+    // free pending memtables
+    for (auto m : memtables_to_free) {
+      delete m;
+    }
+    // free superversions
+    for (auto s : superversions_to_free) {
+      delete s;
+    }
+    for (auto l : logs_to_free) {
+      delete l;
+    }
+    // if new_superversion was not used, it will be non-nullptr and needs
+    // to be freed here
+    delete new_superversion;
+
+    memtables_to_free.clear();
+    superversions_to_free.clear();
+    logs_to_free.clear();
+    new_superversion = nullptr;
+  }
+
+  ~JobContext() {
+    assert(memtables_to_free.size() == 0);
+    assert(superversions_to_free.size() == 0);
+    assert(new_superversion == nullptr);
+  }
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
new file mode 100644
index 0000000..a605bff
--- /dev/null
+++ b/src/rocksdb/db/listener_test.cc
@@ -0,0 +1,407 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
+#include "util/hash.h"
+#include "util/hash_linklist_rep.h"
+#include "utilities/merge_operators.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/statistics.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/sync_point.h"
+#include "util/testutil.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace rocksdb {
+
+class EventListenerTest : public testing::Test {
+ public:
+  EventListenerTest() {
+    dbname_ = test::TmpDir() + "/listener_test";
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~EventListenerTest() {
+    Close();
+    Options options;
+    options.db_paths.emplace_back(dbname_, 0);
+    options.db_paths.emplace_back(dbname_ + "_2", 0);
+    options.db_paths.emplace_back(dbname_ + "_3", 0);
+    options.db_paths.emplace_back(dbname_ + "_4", 0);
+    EXPECT_OK(DestroyDB(dbname_, options));
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const ColumnFamilyOptions* options = nullptr) {
+    ColumnFamilyOptions cf_opts;
+    cf_opts = ColumnFamilyOptions(Options());
+    size_t cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options* options = nullptr) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options* options = nullptr) {
+    Close();
+    Options opts = (options == nullptr) ? Options() : *options;
+    std::vector<const Options*> v_opts(cfs.size(), &opts);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  Status TryReopenWithColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<const Options*>& options) {
+    Close();
+    EXPECT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i]));
+    }
+    DBOptions db_opts = DBOptions(*options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    Close();
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts.create_if_missing = true;
+    }
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options* options = nullptr) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+
+  Status Flush(int cf = 0) {
+    FlushOptions opt = FlushOptions();
+    opt.wait = true;
+    if (cf == 0) {
+      return db_->Flush(opt);
+    } else {
+      return db_->Flush(opt, handles_[cf]);
+    }
+  }
+
+  DB* db_;
+  std::string dbname_;
+  std::vector<ColumnFamilyHandle*> handles_;
+};
+
+class TestCompactionListener : public EventListener {
+ public:
+  void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    compacted_dbs_.push_back(db);
+    ASSERT_GT(ci.input_files.size(), 0U);
+    ASSERT_GT(ci.output_files.size(), 0U);
+  }
+
+  std::vector<DB*> compacted_dbs_;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  const int kNumL0Files = 4;
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+
+  TestCompactionListener* listener = new TestCompactionListener();
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+  CreateAndReopenWithCF(cf_names, &options);
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (size_t i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(static_cast<int>(i)));
+    const Slice kStart = "a";
+    const Slice kEnd = "z";
+    ASSERT_OK(dbfull()->CompactRange(handles_[i], &kStart, &kEnd));
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+
+  ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->compacted_dbs_[i], db_);
+  }
+}
+
+class TestFlushListener : public EventListener {
+ public:
+  void OnFlushCompleted(
+      DB* db, const std::string& name,
+      const std::string& file_path,
+      bool triggered_writes_slowdown,
+      bool triggered_writes_stop) override {
+    flushed_dbs_.push_back(db);
+    flushed_column_family_names_.push_back(name);
+    if (triggered_writes_slowdown) {
+      slowdown_count++;
+    }
+    if (triggered_writes_stop) {
+      stop_count++;
+    }
+  }
+
+  std::vector<std::string> flushed_column_family_names_;
+  std::vector<DB*> flushed_dbs_;
+  int slowdown_count;
+  int stop_count;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+  Options options;
+  options.write_buffer_size = 100000;
+  TestFlushListener* listener = new TestFlushListener();
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+  CreateAndReopenWithCF(cf_names, &options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (size_t i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(static_cast<int>(i)));
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure call-back functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+  Options options;
+  options.write_buffer_size = 100000;
+  TestFlushListener* listener = new TestFlushListener();
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+  CreateAndReopenWithCF(cf_names, &options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (size_t i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(static_cast<int>(i)));
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure call-back functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); i++) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+  std::vector<TestFlushListener*> listeners;
+  const int kNumDBs = 5;
+  const int kNumListeners = 10;
+  for (int i = 0; i < kNumListeners; ++i) {
+    listeners.emplace_back(new TestFlushListener());
+  }
+
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+
+  Options options;
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumListeners; ++i) {
+    options.listeners.emplace_back(listeners[i]);
+  }
+  DBOptions db_opts(options);
+  ColumnFamilyOptions cf_opts(options);
+
+  std::vector<DB*> dbs;
+  std::vector<std::vector<ColumnFamilyHandle *>> vec_handles;
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    ASSERT_OK(DestroyDB(dbname_ + ToString(d), options));
+    DB* db;
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db));
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ColumnFamilyHandle* handle;
+      db->CreateColumnFamily(cf_opts, cf_names[c], &handle);
+      handles.push_back(handle);
+    }
+
+    vec_handles.push_back(std::move(handles));
+    dbs.push_back(db);
+  }
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c],
+                cf_names[c], cf_names[c]));
+    }
+  }
+
+  for (size_t c = 0; c < cf_names.size(); ++c) {
+    for (int d = 0; d < kNumDBs; ++d) {
+      ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+      reinterpret_cast<DBImpl*>(dbs[d])->TEST_WaitForFlushMemTable();
+    }
+  }
+
+  for (auto* listener : listeners) {
+    int pos = 0;
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      for (int d = 0; d < kNumDBs; ++d) {
+        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+        ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+        pos++;
+      }
+    }
+  }
+
+  for (auto handles : vec_handles) {
+    for (auto h : handles) {
+      delete h;
+    }
+    handles.clear();
+  }
+  vec_handles.clear();
+
+  for (auto db : dbs) {
+    delete db;
+  }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+  Options options;
+  TestFlushListener* listener = new TestFlushListener();
+  const int kSlowdownTrigger = 5;
+  const int kStopTrigger = 10;
+  options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+  options.level0_stop_writes_trigger = kStopTrigger;
+  options.listeners.emplace_back(listener);
+  // BG compaction is disabled.  Number of L0 files will simply keeps
+  // increasing in this test.
+  options.compaction_style = kCompactionStyleNone;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 100000;  // Small write buffer
+
+  CreateAndReopenWithCF({"pikachu"}, &options);
+  WriteOptions wopts;
+  wopts.timeout_hint_us = 100000;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  // keep writing until writes are forced to stop.
+  for (int i = 0; static_cast<int>(cf_meta.file_count) < kStopTrigger; ++i) {
+    Put(1, ToString(i), std::string(100000, 'x'), wopts);
+    db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  }
+  ASSERT_GE(listener->slowdown_count, kStopTrigger - kSlowdownTrigger);
+  ASSERT_GE(listener->stop_count, 1);
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/src/rocksdb/db/log_and_apply_bench.cc b/src/rocksdb/db/log_and_apply_bench.cc
deleted file mode 100644
index ab9716d..0000000
--- a/src/rocksdb/db/log_and_apply_bench.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-
-#include <vector>
-
-#include "util/testharness.h"
-#include "util/benchharness.h"
-#include "db/version_set.h"
-#include "util/mutexlock.h"
-
-namespace rocksdb {
-
-std::string MakeKey(unsigned int num) {
-  char buf[30];
-  snprintf(buf, sizeof(buf), "%016u", num);
-  return std::string(buf);
-}
-
-void BM_LogAndApply(int iters, int num_base_files) {
-  VersionSet* vset;
-  ColumnFamilyData* default_cfd;
-  uint64_t fnum = 1;
-  port::Mutex mu;
-  MutexLock l(&mu);
-
-  BENCHMARK_SUSPEND {
-    std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
-    ASSERT_OK(DestroyDB(dbname, Options()));
-
-    DB* db = nullptr;
-    Options opts;
-    opts.create_if_missing = true;
-    Status s = DB::Open(opts, dbname, &db);
-    ASSERT_OK(s);
-    ASSERT_TRUE(db != nullptr);
-
-    delete db;
-    db = nullptr;
-
-    Options options;
-    EnvOptions sopt;
-    vset = new VersionSet(dbname, &options, sopt, nullptr);
-    std::vector<ColumnFamilyDescriptor> dummy;
-    dummy.push_back(ColumnFamilyDescriptor());
-    ASSERT_OK(vset->Recover(dummy));
-    default_cfd = vset->GetColumnFamilySet()->GetDefault();
-    VersionEdit vbase;
-    for (int i = 0; i < num_base_files; i++) {
-      InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
-      InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
-      vbase.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
-    }
-    ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu));
-  }
-
-  for (int i = 0; i < iters; i++) {
-    VersionEdit vedit;
-    vedit.DeleteFile(2, fnum);
-    InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
-    InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
-    vedit.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
-    vset->LogAndApply(default_cfd, &vedit, &mu);
-  }
-}
-
-BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)
-BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_100_files, 1000, 100)
-BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_10000_files, 1000, 10000)
-BENCHMARK_NAMED_PARAM(BM_LogAndApply, 100_iters_100000_files, 100, 100000)
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  rocksdb::benchmark::RunBenchmarks();
-  return 0;
-}
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
index be1fb8c..f6514cf 100644
--- a/src/rocksdb/db/log_reader.cc
+++ b/src/rocksdb/db/log_reader.cc
@@ -20,9 +20,9 @@ namespace log {
 Reader::Reporter::~Reporter() {
 }
 
-Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+Reader::Reader(unique_ptr<SequentialFile>&& _file, Reporter* reporter,
                bool checksum, uint64_t initial_offset)
-    : file_(std::move(file)),
+    : file_(std::move(_file)),
       reporter_(reporter),
       checksum_(checksum),
       backing_store_(new char[kBlockSize]),
@@ -32,20 +32,18 @@ Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
       eof_offset_(0),
       last_record_offset_(0),
       end_of_buffer_offset_(0),
-      initial_offset_(initial_offset) {
-}
+      initial_offset_(initial_offset) {}
 
 Reader::~Reader() {
   delete[] backing_store_;
 }
 
 bool Reader::SkipToInitialBlock() {
-  size_t offset_in_block = initial_offset_ % kBlockSize;
-  uint64_t block_start_location = initial_offset_ - offset_in_block;
+  size_t initial_offset_in_block = initial_offset_ % kBlockSize;
+  uint64_t block_start_location = initial_offset_ - initial_offset_in_block;
 
   // Don't search a block if we'd be in the trailer
-  if (offset_in_block > kBlockSize - 6) {
-    offset_in_block = 0;
+  if (initial_offset_in_block > kBlockSize - 6) {
     block_start_location += kBlockSize;
   }
 
@@ -55,7 +53,7 @@ bool Reader::SkipToInitialBlock() {
   if (block_start_location > 0) {
     Status skip_status = file_->Skip(block_start_location);
     if (!skip_status.ok()) {
-      ReportDrop(block_start_location, skip_status);
+      ReportDrop(static_cast<size_t>(block_start_location), skip_status);
       return false;
     }
   }
@@ -83,16 +81,12 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
     const unsigned int record_type = ReadPhysicalRecord(&fragment);
     switch (record_type) {
       case kFullType:
-        if (in_fragmented_record) {
+        if (in_fragmented_record && !scratch->empty()) {
           // Handle bug in earlier versions of log::Writer where
           // it could emit an empty kFirstType record at the tail end
           // of a block followed by a kFullType or kFirstType record
           // at the beginning of the next block.
-          if (scratch->empty()) {
-            in_fragmented_record = false;
-          } else {
-            ReportCorruption(scratch->size(), "partial record without end(1)");
-          }
+          ReportCorruption(scratch->size(), "partial record without end(1)");
         }
         prospective_record_offset = physical_record_offset;
         scratch->clear();
@@ -101,16 +95,12 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
         return true;
 
       case kFirstType:
-        if (in_fragmented_record) {
+        if (in_fragmented_record && !scratch->empty()) {
           // Handle bug in earlier versions of log::Writer where
           // it could emit an empty kFirstType record at the tail end
           // of a block followed by a kFullType or kFirstType record
           // at the beginning of the next block.
-          if (scratch->empty()) {
-            in_fragmented_record = false;
-          } else {
-            ReportCorruption(scratch->size(), "partial record without end(2)");
-          }
+          ReportCorruption(scratch->size(), "partial record without end(2)");
         }
         prospective_record_offset = physical_record_offset;
         scratch->assign(fragment.data(), fragment.size());
diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h
index 81d334d..a7cf45b 100644
--- a/src/rocksdb/db/log_reader.h
+++ b/src/rocksdb/db/log_reader.h
@@ -22,6 +22,12 @@ using std::unique_ptr;
 
 namespace log {
 
+/**
+ * Reader is a general purpose log stream reader implementation. The actual job
+ * of reading from the device is implemented by the SequentialFile interface.
+ *
+ * Please see Writer for details on the file and record layout.
+ */
 class Reader {
  public:
   // Interface for reporting errors.
diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc
index 6577a6a..816e38d 100644
--- a/src/rocksdb/db/log_test.cc
+++ b/src/rocksdb/db/log_test.cc
@@ -41,7 +41,7 @@ static std::string RandomSkewedString(int i, Random* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class LogTest {
+class LogTest : public testing::Test {
  private:
   class StringDest : public WritableFile {
    public:
@@ -55,9 +55,9 @@ class LogTest {
       reader_contents_ = Slice(contents_.data(), 0);
     };
 
-    virtual Status Close() { return Status::OK(); }
-    virtual Status Flush() {
-      ASSERT_TRUE(reader_contents_.size() <= last_flush_);
+    virtual Status Close() override { return Status::OK(); }
+    virtual Status Flush() override {
+      EXPECT_TRUE(reader_contents_.size() <= last_flush_);
       size_t offset = last_flush_ - reader_contents_.size();
       reader_contents_ = Slice(
           contents_.data() + offset,
@@ -66,8 +66,8 @@ class LogTest {
 
       return Status::OK();
     }
-    virtual Status Sync() { return Status::OK(); }
-    virtual Status Append(const Slice& slice) {
+    virtual Status Sync() override { return Status::OK(); }
+    virtual Status Append(const Slice& slice) override {
       contents_.append(slice.data(), slice.size());
       return Status::OK();
     }
@@ -99,8 +99,8 @@ class LogTest {
       force_eof_position_(0),
       returned_partial_(false) { }
 
-    virtual Status Read(size_t n, Slice* result, char* scratch) {
-      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+    virtual Status Read(size_t n, Slice* result, char* scratch) override {
+      EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
 
       if (force_error_) {
         if (force_error_position_ >= n) {
@@ -138,7 +138,7 @@ class LogTest {
       return Status::OK();
     }
 
-    virtual Status Skip(uint64_t n) {
+    virtual Status Skip(uint64_t n) override {
       if (n > contents_.size()) {
         contents_.clear();
         return Status::NotFound("in-memory file skipepd past end");
@@ -156,7 +156,7 @@ class LogTest {
     std::string message_;
 
     ReportCollector() : dropped_bytes_(0) { }
-    virtual void Corruption(size_t bytes, const Status& status) {
+    virtual void Corruption(size_t bytes, const Status& status) override {
       dropped_bytes_ += bytes;
       message_.append(status.ToString());
     }
@@ -329,12 +329,9 @@ uint64_t LogTest::initial_offset_last_record_offsets_[] =
      2 * (kHeaderSize + 10000) +
          (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
 
+TEST_F(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
 
-TEST(LogTest, Empty) {
-  ASSERT_EQ("EOF", Read());
-}
-
-TEST(LogTest, ReadWrite) {
+TEST_F(LogTest, ReadWrite) {
   Write("foo");
   Write("bar");
   Write("");
@@ -347,7 +344,7 @@ TEST(LogTest, ReadWrite) {
   ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
 }
 
-TEST(LogTest, ManyBlocks) {
+TEST_F(LogTest, ManyBlocks) {
   for (int i = 0; i < 100000; i++) {
     Write(NumberString(i));
   }
@@ -357,7 +354,7 @@ TEST(LogTest, ManyBlocks) {
   ASSERT_EQ("EOF", Read());
 }
 
-TEST(LogTest, Fragmentation) {
+TEST_F(LogTest, Fragmentation) {
   Write("small");
   Write(BigString("medium", 50000));
   Write(BigString("large", 100000));
@@ -367,7 +364,7 @@ TEST(LogTest, Fragmentation) {
   ASSERT_EQ("EOF", Read());
 }
 
-TEST(LogTest, MarginalTrailer) {
+TEST_F(LogTest, MarginalTrailer) {
   // Make a trailer that is exactly the same length as an empty record.
   const int n = kBlockSize - 2*kHeaderSize;
   Write(BigString("foo", n));
@@ -380,7 +377,7 @@ TEST(LogTest, MarginalTrailer) {
   ASSERT_EQ("EOF", Read());
 }
 
-TEST(LogTest, MarginalTrailer2) {
+TEST_F(LogTest, MarginalTrailer2) {
   // Make a trailer that is exactly the same length as an empty record.
   const int n = kBlockSize - 2*kHeaderSize;
   Write(BigString("foo", n));
@@ -393,7 +390,7 @@ TEST(LogTest, MarginalTrailer2) {
   ASSERT_EQ("", ReportMessage());
 }
 
-TEST(LogTest, ShortTrailer) {
+TEST_F(LogTest, ShortTrailer) {
   const int n = kBlockSize - 2*kHeaderSize + 4;
   Write(BigString("foo", n));
   ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
@@ -405,7 +402,7 @@ TEST(LogTest, ShortTrailer) {
   ASSERT_EQ("EOF", Read());
 }
 
-TEST(LogTest, AlignedEof) {
+TEST_F(LogTest, AlignedEof) {
   const int n = kBlockSize - 2*kHeaderSize + 4;
   Write(BigString("foo", n));
   ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
@@ -413,7 +410,7 @@ TEST(LogTest, AlignedEof) {
   ASSERT_EQ("EOF", Read());
 }
 
-TEST(LogTest, RandomRead) {
+TEST_F(LogTest, RandomRead) {
   const int N = 500;
   Random write_rnd(301);
   for (int i = 0; i < N; i++) {
@@ -428,7 +425,7 @@ TEST(LogTest, RandomRead) {
 
 // Tests of all the error paths in log_reader.cc follow:
 
-TEST(LogTest, ReadError) {
+TEST_F(LogTest, ReadError) {
   Write("foo");
   ForceError();
   ASSERT_EQ("EOF", Read());
@@ -436,7 +433,7 @@ TEST(LogTest, ReadError) {
   ASSERT_EQ("OK", MatchError("read error"));
 }
 
-TEST(LogTest, BadRecordType) {
+TEST_F(LogTest, BadRecordType) {
   Write("foo");
   // Type is stored in header[6]
   IncrementByte(6, 100);
@@ -446,7 +443,7 @@ TEST(LogTest, BadRecordType) {
   ASSERT_EQ("OK", MatchError("unknown record type"));
 }
 
-TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
+TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) {
   Write("foo");
   ShrinkSize(4);   // Drop all payload as well as a header byte
   ASSERT_EQ("EOF", Read());
@@ -455,7 +452,7 @@ TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
   ASSERT_EQ("", ReportMessage());
 }
 
-TEST(LogTest, BadLength) {
+TEST_F(LogTest, BadLength) {
   const int kPayloadSize = kBlockSize - kHeaderSize;
   Write(BigString("bar", kPayloadSize));
   Write("foo");
@@ -466,7 +463,7 @@ TEST(LogTest, BadLength) {
   ASSERT_EQ("OK", MatchError("bad record length"));
 }
 
-TEST(LogTest, BadLengthAtEndIsIgnored) {
+TEST_F(LogTest, BadLengthAtEndIsIgnored) {
   Write("foo");
   ShrinkSize(1);
   ASSERT_EQ("EOF", Read());
@@ -474,7 +471,7 @@ TEST(LogTest, BadLengthAtEndIsIgnored) {
   ASSERT_EQ("", ReportMessage());
 }
 
-TEST(LogTest, ChecksumMismatch) {
+TEST_F(LogTest, ChecksumMismatch) {
   Write("foo");
   IncrementByte(0, 10);
   ASSERT_EQ("EOF", Read());
@@ -482,7 +479,7 @@ TEST(LogTest, ChecksumMismatch) {
   ASSERT_EQ("OK", MatchError("checksum mismatch"));
 }
 
-TEST(LogTest, UnexpectedMiddleType) {
+TEST_F(LogTest, UnexpectedMiddleType) {
   Write("foo");
   SetByte(6, kMiddleType);
   FixChecksum(0, 3);
@@ -491,7 +488,7 @@ TEST(LogTest, UnexpectedMiddleType) {
   ASSERT_EQ("OK", MatchError("missing start"));
 }
 
-TEST(LogTest, UnexpectedLastType) {
+TEST_F(LogTest, UnexpectedLastType) {
   Write("foo");
   SetByte(6, kLastType);
   FixChecksum(0, 3);
@@ -500,7 +497,7 @@ TEST(LogTest, UnexpectedLastType) {
   ASSERT_EQ("OK", MatchError("missing start"));
 }
 
-TEST(LogTest, UnexpectedFullType) {
+TEST_F(LogTest, UnexpectedFullType) {
   Write("foo");
   Write("bar");
   SetByte(6, kFirstType);
@@ -511,7 +508,7 @@ TEST(LogTest, UnexpectedFullType) {
   ASSERT_EQ("OK", MatchError("partial record without end"));
 }
 
-TEST(LogTest, UnexpectedFirstType) {
+TEST_F(LogTest, UnexpectedFirstType) {
   Write("foo");
   Write(BigString("bar", 100000));
   SetByte(6, kFirstType);
@@ -522,7 +519,7 @@ TEST(LogTest, UnexpectedFirstType) {
   ASSERT_EQ("OK", MatchError("partial record without end"));
 }
 
-TEST(LogTest, MissingLastIsIgnored) {
+TEST_F(LogTest, MissingLastIsIgnored) {
   Write(BigString("bar", kBlockSize));
   // Remove the LAST block, including header.
   ShrinkSize(14);
@@ -531,7 +528,7 @@ TEST(LogTest, MissingLastIsIgnored) {
   ASSERT_EQ(0U, DroppedBytes());
 }
 
-TEST(LogTest, PartialLastIsIgnored) {
+TEST_F(LogTest, PartialLastIsIgnored) {
   Write(BigString("bar", kBlockSize));
   // Cause a bad record length in the LAST block.
   ShrinkSize(1);
@@ -540,7 +537,7 @@ TEST(LogTest, PartialLastIsIgnored) {
   ASSERT_EQ(0U, DroppedBytes());
 }
 
-TEST(LogTest, ErrorJoinsRecords) {
+TEST_F(LogTest, ErrorJoinsRecords) {
   // Consider two fragmented records:
   //    first(R1) last(R1) first(R2) last(R2)
   // where the middle two fragments disappear.  We do not want
@@ -558,66 +555,48 @@ TEST(LogTest, ErrorJoinsRecords) {
 
   ASSERT_EQ("correct", Read());
   ASSERT_EQ("EOF", Read());
-  const unsigned int dropped = DroppedBytes();
-  ASSERT_LE(dropped, 2*kBlockSize + 100);
-  ASSERT_GE(dropped, 2*kBlockSize);
+  size_t dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2 * kBlockSize + 100);
+  ASSERT_GE(dropped, 2 * kBlockSize);
 }
 
-TEST(LogTest, ReadStart) {
-  CheckInitialOffsetRecord(0, 0);
-}
+TEST_F(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); }
 
-TEST(LogTest, ReadSecondOneOff) {
-  CheckInitialOffsetRecord(1, 1);
-}
+TEST_F(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); }
 
-TEST(LogTest, ReadSecondTenThousand) {
-  CheckInitialOffsetRecord(10000, 1);
-}
+TEST_F(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); }
 
-TEST(LogTest, ReadSecondStart) {
-  CheckInitialOffsetRecord(10007, 1);
-}
+TEST_F(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); }
 
-TEST(LogTest, ReadThirdOneOff) {
-  CheckInitialOffsetRecord(10008, 2);
-}
+TEST_F(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); }
 
-TEST(LogTest, ReadThirdStart) {
-  CheckInitialOffsetRecord(20014, 2);
-}
+TEST_F(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); }
 
-TEST(LogTest, ReadFourthOneOff) {
-  CheckInitialOffsetRecord(20015, 3);
-}
+TEST_F(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); }
 
-TEST(LogTest, ReadFourthFirstBlockTrailer) {
+TEST_F(LogTest, ReadFourthFirstBlockTrailer) {
   CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
 }
 
-TEST(LogTest, ReadFourthMiddleBlock) {
+TEST_F(LogTest, ReadFourthMiddleBlock) {
   CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
 }
 
-TEST(LogTest, ReadFourthLastBlock) {
+TEST_F(LogTest, ReadFourthLastBlock) {
   CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
 }
 
-TEST(LogTest, ReadFourthStart) {
+TEST_F(LogTest, ReadFourthStart) {
   CheckInitialOffsetRecord(
       2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
       3);
 }
 
-TEST(LogTest, ReadEnd) {
-  CheckOffsetPastEndReturnsNoRecords(0);
-}
+TEST_F(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
-TEST(LogTest, ReadPastEnd) {
-  CheckOffsetPastEndReturnsNoRecords(5);
-}
+TEST_F(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
-TEST(LogTest, ClearEofSingleBlock) {
+TEST_F(LogTest, ClearEofSingleBlock) {
   Write("foo");
   Write("bar");
   ForceEOF(3 + kHeaderSize + 2);
@@ -632,7 +611,7 @@ TEST(LogTest, ClearEofSingleBlock) {
   ASSERT_TRUE(IsEOF());
 }
 
-TEST(LogTest, ClearEofMultiBlock) {
+TEST_F(LogTest, ClearEofMultiBlock) {
   size_t num_full_blocks = 5;
   size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25;
   Write(BigString("foo", n));
@@ -649,7 +628,7 @@ TEST(LogTest, ClearEofMultiBlock) {
   ASSERT_TRUE(IsEOF());
 }
 
-TEST(LogTest, ClearEofError) {
+TEST_F(LogTest, ClearEofError) {
   // If an error occurs during Read() in UnmarkEOF(), the records contained
   // in the buffer should be returned on subsequent calls of ReadRecord()
   // until no more full records are left, whereafter ReadRecord() should return
@@ -667,7 +646,7 @@ TEST(LogTest, ClearEofError) {
   ASSERT_EQ("EOF", Read());
 }
 
-TEST(LogTest, ClearEofError2) {
+TEST_F(LogTest, ClearEofError2) {
   Write("foo");
   Write("bar");
   UnmarkEOF();
@@ -685,5 +664,6 @@ TEST(LogTest, ClearEofError2) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc
index df601a4..d78de5e 100644
--- a/src/rocksdb/db/log_writer.cc
+++ b/src/rocksdb/db/log_writer.cc
@@ -52,7 +52,7 @@ Status Writer::AddRecord(const Slice& slice) {
     }
 
     // Invariant: we never leave < kHeaderSize bytes in a block.
-    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+    assert(static_cast<int>(kBlockSize) - block_offset_ >= kHeaderSize);
 
     const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
     const size_t fragment_length = (left < avail) ? left : avail;
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
index d7b7aff..46226ec 100644
--- a/src/rocksdb/db/log_writer.h
+++ b/src/rocksdb/db/log_writer.h
@@ -22,6 +22,40 @@ using std::unique_ptr;
 
 namespace log {
 
+/**
+ * Writer is a general purpose log stream writer. It provides an append-only
+ * abstraction for writing data. The details of the how the data is written is
+ * handled by the WriteableFile sub-class implementation.
+ *
+ * File format:
+ *
+ * File is broken down into variable sized records. The format of each record
+ * is described below.
+ *       +-----+-------------+--+----+----------+------+-- ... ----+
+ * File  | r0  |        r1   |P | r2 |    r3    |  r4  |           |
+ *       +-----+-------------+--+----+----------+------+-- ... ----+
+ *       <--- kBlockSize ------>|<-- kBlockSize ------>|
+ *  rn = variable size records
+ *  P = Padding
+ *
+ * Data is written out in kBlockSize chunks. If next record does not fit
+ * into the space left, the leftover space will be padded with \0.
+ *
+ * Record format:
+ *
+ * +---------+-----------+-----------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Payload   |
+ * +---------+-----------+-----------+--- ... ---+
+ *
+ * CRC = 32bit hash computed over the payload using CRC
+ * Size = Length of the payload data
+ * Type = Type of record
+ *        (kZeroType, kFullType, kFirstType, kLastType, kMiddleType )
+ *        The type is used to group a bunch of records together to represent
+ *        blocks that are larger than kBlockSize
+ * Payload = Byte stream as long as specified by the payload size
+ *
+ */
 class Writer {
  public:
   // Create a writer that will append data to "*dest".
diff --git a/src/rocksdb/db/managed_iterator.cc b/src/rocksdb/db/managed_iterator.cc
new file mode 100644
index 0000000..8dd5f4d
--- /dev/null
+++ b/src/rocksdb/db/managed_iterator.cc
@@ -0,0 +1,256 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/managed_iterator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merger.h"
+#include "util/xfunc.h"
+
+namespace rocksdb {
+
+namespace {
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+//   void MyClass::MyMethod() {
+//     MILock l(&mu_);       // mu_ is an instance variable
+//     ... some complex code, possibly with multiple return paths ...
+//   }
+
+class MILock {
+ public:
+  explicit MILock(std::mutex* mu, ManagedIterator* mi) : mu_(mu), mi_(mi) {
+    this->mu_->lock();
+  }
+  ~MILock() {
+    this->mu_->unlock();
+    XFUNC_TEST("managed_xftest_release", "managed_unlock", managed_unlock1,
+               xf_manage_release, mi_);
+  }
+  ManagedIterator* GetManagedIterator() { return mi_; }
+
+ private:
+  std::mutex* const mu_;
+  ManagedIterator* mi_;
+  // No copying allowed
+  MILock(const MILock&) = delete;
+  void operator=(const MILock&) = delete;
+};
+}  // anonymous namespace
+
+//
+// Synchronization between modifiers, releasers, creators
+// If iterator operation, wait till (!in_use), set in_use, do op, reset in_use
+//  if modifying mutable_iter, atomically exchange in_use:
+//  return if in_use set / otherwise set in use,
+//  atomically replace new iter with old , reset in use
+//  The releaser is the new operation and it holds a lock for a very short time
+//  The existing non-const iterator operations are supposed to be single
+//  threaded and hold the lock for the duration of the operation
+//  The existing const iterator operations use the cached key/values
+//  and don't do any locking.
+ManagedIterator::ManagedIterator(DBImpl* db, const ReadOptions& read_options,
+                                 ColumnFamilyData* cfd)
+    : db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      svnum_(cfd->GetSuperVersionNumber()),
+      mutable_iter_(nullptr),
+      valid_(false),
+      snapshot_created_(false),
+      release_supported_(true) {
+  read_options_.managed = false;
+  if ((!read_options_.tailing) && (read_options_.snapshot == nullptr)) {
+    assert(read_options_.snapshot = db_->GetSnapshot());
+    snapshot_created_ = true;
+  }
+  cfh_.SetCFD(cfd);
+  mutable_iter_ = unique_ptr<Iterator>(db->NewIterator(read_options_, &cfh_));
+  XFUNC_TEST("managed_xftest_dropold", "managed_create", xf_managed_create1,
+             xf_manage_create, this);
+}
+
+ManagedIterator::~ManagedIterator() {
+  Lock();
+  if (snapshot_created_) {
+    db_->ReleaseSnapshot(read_options_.snapshot);
+    snapshot_created_ = false;
+    read_options_.snapshot = nullptr;
+  }
+}
+
+bool ManagedIterator::Valid() const { return valid_; }
+
+void ManagedIterator::SeekToLast() {
+  MILock l(&in_use_, this);
+  if (NeedToRebuild()) {
+    RebuildIterator();
+  }
+  assert(mutable_iter_ != nullptr);
+  mutable_iter_->SeekToLast();
+  if (mutable_iter_->status().ok()) {
+    UpdateCurrent();
+  }
+}
+
+void ManagedIterator::SeekToFirst() {
+  MILock l(&in_use_, this);
+  SeekInternal(Slice(), true);
+}
+
+void ManagedIterator::Seek(const Slice& user_key) {
+  MILock l(&in_use_, this);
+  SeekInternal(user_key, false);
+}
+
+void ManagedIterator::SeekInternal(const Slice& user_key, bool seek_to_first) {
+  if (NeedToRebuild()) {
+    RebuildIterator();
+  }
+  assert(mutable_iter_ != nullptr);
+  if (seek_to_first) {
+    mutable_iter_->SeekToFirst();
+  } else {
+    mutable_iter_->Seek(user_key);
+  }
+  UpdateCurrent();
+}
+
+void ManagedIterator::Prev() {
+  if (!valid_) {
+    status_ = Status::InvalidArgument("Iterator value invalid");
+    return;
+  }
+  MILock l(&in_use_, this);
+  if (NeedToRebuild()) {
+    std::string current_key = key().ToString();
+    Slice old_key(current_key);
+    RebuildIterator();
+    SeekInternal(old_key, false);
+    UpdateCurrent();
+    if (!valid_) {
+      return;
+    }
+    if (key().compare(old_key) != 0) {
+      valid_ = false;
+      status_ = Status::Incomplete("Cannot do Prev now");
+      return;
+    }
+  }
+  mutable_iter_->Prev();
+  if (mutable_iter_->status().ok()) {
+    UpdateCurrent();
+    status_ = Status::OK();
+  } else {
+    status_ = mutable_iter_->status();
+  }
+}
+
+void ManagedIterator::Next() {
+  if (!valid_) {
+    status_ = Status::InvalidArgument("Iterator value invalid");
+    return;
+  }
+  MILock l(&in_use_, this);
+  if (NeedToRebuild()) {
+    std::string current_key = key().ToString();
+    Slice old_key(current_key.data(), cached_key_.Size());
+    RebuildIterator();
+    SeekInternal(old_key, false);
+    UpdateCurrent();
+    if (!valid_) {
+      return;
+    }
+    if (key().compare(old_key) != 0) {
+      valid_ = false;
+      status_ = Status::Incomplete("Cannot do Next now");
+      return;
+    }
+  }
+  mutable_iter_->Next();
+  UpdateCurrent();
+}
+
+Slice ManagedIterator::key() const {
+  assert(valid_);
+  return cached_key_.GetKey();
+}
+
+Slice ManagedIterator::value() const {
+  assert(valid_);
+  return cached_value_.GetKey();
+}
+
+Status ManagedIterator::status() const { return status_; }
+
+void ManagedIterator::RebuildIterator() {
+  svnum_ = cfd_->GetSuperVersionNumber();
+  mutable_iter_ = unique_ptr<Iterator>(db_->NewIterator(read_options_, &cfh_));
+}
+
+void ManagedIterator::UpdateCurrent() {
+  assert(mutable_iter_ != nullptr);
+
+  if (!(valid_ = mutable_iter_->Valid())) {
+    status_ = mutable_iter_->status();
+    return;
+  }
+
+  status_ = Status::OK();
+  cached_key_.SetKey(mutable_iter_->key());
+  cached_value_.SetKey(mutable_iter_->value());
+}
+
+void ManagedIterator::ReleaseIter(bool only_old) {
+  if ((mutable_iter_ == nullptr) || (!release_supported_)) {
+    return;
+  }
+  if (svnum_ != cfd_->GetSuperVersionNumber() || !only_old) {
+    if (!TryLock()) {  // Don't release iter if in use
+      return;
+    }
+    mutable_iter_ = nullptr;  // in_use for a very short time
+    UnLock();
+  }
+}
+
+bool ManagedIterator::NeedToRebuild() {
+  if ((mutable_iter_ == nullptr) || (status_.IsIncomplete()) ||
+      (!only_drop_old_ && (svnum_ != cfd_->GetSuperVersionNumber()))) {
+    return true;
+  }
+  return false;
+}
+
+void ManagedIterator::Lock() {
+  in_use_.lock();
+  return;
+}
+
+bool ManagedIterator::TryLock() { return in_use_.try_lock(); }
+
+void ManagedIterator::UnLock() {
+  in_use_.unlock();
+  XFUNC_TEST("managed_xftest_release", "managed_unlock", managed_unlock1,
+             xf_manage_release, this);
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/managed_iterator.h b/src/rocksdb/db/managed_iterator.h
new file mode 100644
index 0000000..00f56ae
--- /dev/null
+++ b/src/rocksdb/db/managed_iterator.h
@@ -0,0 +1,84 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+
+class DBImpl;
+struct SuperVersion;
+class ColumnFamilyData;
+
+/**
+ * ManagedIterator is a special type of iterator that supports freeing the
+ * underlying iterator and still being able to access the current key/value
+ * pair.  This is done by copying the key/value pair so that clients can
+ * continue to access the data without getting a SIGSEGV.
+ * The underlying iterator can be freed manually through the  call to
+ * ReleaseIter or automatically (as needed on space pressure or age.)
+ * The iterator is recreated using the saved original arguments.
+ */
+class ManagedIterator : public Iterator {
+ public:
+  ManagedIterator(DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd);
+  virtual ~ManagedIterator();
+
+  virtual void SeekToLast() override;
+  virtual void Prev() override;
+  virtual bool Valid() const override;
+  void SeekToFirst() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+  void ReleaseIter(bool only_old);
+  void SetDropOld(bool only_old) {
+    only_drop_old_ = read_options_.tailing || only_old;
+  }
+
+ private:
+  void RebuildIterator();
+  void UpdateCurrent();
+  void SeekInternal(const Slice& user_key, bool seek_to_first);
+  bool NeedToRebuild();
+  void Lock();
+  bool TryLock();
+  void UnLock();
+  DBImpl* const db_;
+  ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  ColumnFamilyHandleInternal cfh_;
+
+  uint64_t svnum_;
+  std::unique_ptr<Iterator> mutable_iter_;
+  // internal iterator status
+  Status status_;
+  bool valid_;
+
+  IterKey cached_key_;
+  IterKey cached_value_;
+
+  bool only_drop_old_ = true;
+  bool snapshot_created_;
+  bool release_supported_;
+  std::mutex in_use_;  // is managed iterator in use
+};
+
+}  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc
index f95ad3c..76392d6 100644
--- a/src/rocksdb/db/memtable.cc
+++ b/src/rocksdb/db/memtable.cc
@@ -15,11 +15,13 @@
 
 #include "db/dbformat.h"
 #include "db/merge_context.h"
+#include "db/writebuffer.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice_transform.h"
+#include "table/merger.h"
 #include "util/arena.h"
 #include "util/coding.h"
 #include "util/murmurhash.h"
@@ -30,38 +32,66 @@
 
 namespace rocksdb {
 
-MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
+MemTableOptions::MemTableOptions(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options)
+  : write_buffer_size(mutable_cf_options.write_buffer_size),
+    arena_block_size(mutable_cf_options.arena_block_size),
+    memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits),
+    memtable_prefix_bloom_probes(
+        mutable_cf_options.memtable_prefix_bloom_probes),
+    memtable_prefix_bloom_huge_page_tlb_size(
+        mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size),
+    inplace_update_support(ioptions.inplace_update_support),
+    inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+    inplace_callback(ioptions.inplace_callback),
+    max_successive_merges(mutable_cf_options.max_successive_merges),
+    filter_deletes(mutable_cf_options.filter_deletes),
+    statistics(ioptions.statistics),
+    merge_operator(ioptions.merge_operator),
+    info_log(ioptions.info_log) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+                   const ImmutableCFOptions& ioptions,
+                   const MutableCFOptions& mutable_cf_options,
+                   WriteBuffer* write_buffer)
     : comparator_(cmp),
+      moptions_(ioptions, mutable_cf_options),
       refs_(0),
-      kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
-      kWriteBufferSize(options.write_buffer_size),
-      arena_(options.arena_block_size),
-      table_(options.memtable_factory->CreateMemTableRep(
-          comparator_, &arena_, options.prefix_extractor.get())),
+      kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+      arena_(moptions_.arena_block_size),
+      allocator_(&arena_, write_buffer),
+      table_(ioptions.memtable_factory->CreateMemTableRep(
+          comparator_, &allocator_, ioptions.prefix_extractor,
+          ioptions.info_log)),
       num_entries_(0),
+      num_deletes_(0),
       flush_in_progress_(false),
       flush_completed_(false),
       file_number_(0),
       first_seqno_(0),
       mem_next_logfile_number_(0),
-      locks_(options.inplace_update_support ? options.inplace_update_num_locks
-                                            : 0),
-      prefix_extractor_(options.prefix_extractor.get()),
-      should_flush_(ShouldFlushNow()) {
+      locks_(moptions_.inplace_update_support
+                 ? moptions_.inplace_update_num_locks
+                 : 0),
+      prefix_extractor_(ioptions.prefix_extractor),
+      should_flush_(ShouldFlushNow()),
+      flush_scheduled_(false),
+      env_(ioptions.env) {
   // if should_flush_ == true without an entry inserted, something must have
   // gone wrong already.
   assert(!should_flush_);
-  if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
+  if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) {
     prefix_bloom_.reset(new DynamicBloom(
-        options.memtable_prefix_bloom_bits, options.bloom_locality,
-        options.memtable_prefix_bloom_probes, nullptr,
-        options.memtable_prefix_bloom_huge_page_tlb_size));
+        &allocator_,
+        moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality,
+        moptions_.memtable_prefix_bloom_probes, nullptr,
+        moptions_.memtable_prefix_bloom_huge_page_tlb_size,
+        ioptions.info_log));
   }
 }
 
-MemTable::~MemTable() {
-  assert(refs_ == 0);
-}
+MemTable::~MemTable() { assert(refs_ == 0); }
 
 size_t MemTable::ApproximateMemoryUsage() {
   size_t arena_usage = arena_.ApproximateMemoryUsage();
@@ -93,14 +123,16 @@ bool MemTable::ShouldFlushNow() const {
   // if we can still allocate one more block without exceeding the
   // over-allocation ratio, then we should not flush.
   if (allocated_memory + kArenaBlockSize <
-      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+      moptions_.write_buffer_size +
+      kArenaBlockSize * kAllowOverAllocationRatio) {
     return false;
   }
 
-  // if user keeps adding entries that exceeds kWriteBufferSize, we need to
-  // flush earlier even though we still have much available memory left.
-  if (allocated_memory >
-      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+  // if user keeps adding entries that exceeds moptions.write_buffer_size,
+  // we need to flush earlier even though we still have much available
+  // memory left.
+  if (allocated_memory > moptions_.write_buffer_size +
+      kArenaBlockSize * kAllowOverAllocationRatio) {
     return true;
   }
 
@@ -154,7 +186,7 @@ Slice MemTableRep::UserKey(const char* key) const {
 }
 
 KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
-  *buf = arena_->Allocate(len);
+  *buf = allocator_->Allocate(len);
   return static_cast<KeyHandle>(*buf);
 }
 
@@ -163,28 +195,39 @@ KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
 // into this scratch space.
 const char* EncodeKey(std::string* scratch, const Slice& target) {
   scratch->clear();
-  PutVarint32(scratch, target.size());
+  PutVarint32(scratch, static_cast<uint32_t>(target.size()));
   scratch->append(target.data(), target.size());
   return scratch->data();
 }
 
 class MemTableIterator: public Iterator {
  public:
-  MemTableIterator(const MemTable& mem, const ReadOptions& options,
-                   bool enforce_total_order)
+  MemTableIterator(
+      const MemTable& mem, const ReadOptions& read_options, Arena* arena)
       : bloom_(nullptr),
         prefix_extractor_(mem.prefix_extractor_),
-        valid_(false) {
-    if (prefix_extractor_ != nullptr && !enforce_total_order) {
+        valid_(false),
+        arena_mode_(arena != nullptr) {
+    if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
       bloom_ = mem.prefix_bloom_.get();
-      iter_.reset(mem.table_->GetDynamicPrefixIterator());
+      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+    } else {
+      iter_ = mem.table_->GetIterator(arena);
+    }
+  }
+
+  ~MemTableIterator() {
+    if (arena_mode_) {
+      iter_->~Iterator();
     } else {
-      iter_.reset(mem.table_->GetIterator());
+      delete iter_;
     }
   }
 
-  virtual bool Valid() const { return valid_; }
-  virtual void Seek(const Slice& k) {
+  virtual bool Valid() const override { return valid_; }
+  virtual void Seek(const Slice& k) override {
+    PERF_TIMER_GUARD(seek_on_memtable_time);
+    PERF_COUNTER_ADD(seek_on_memtable_count, 1);
     if (bloom_ != nullptr &&
         !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
       valid_ = false;
@@ -193,50 +236,52 @@ class MemTableIterator: public Iterator {
     iter_->Seek(k, nullptr);
     valid_ = iter_->Valid();
   }
-  virtual void SeekToFirst() {
+  virtual void SeekToFirst() override {
     iter_->SeekToFirst();
     valid_ = iter_->Valid();
   }
-  virtual void SeekToLast() {
+  virtual void SeekToLast() override {
     iter_->SeekToLast();
     valid_ = iter_->Valid();
   }
-  virtual void Next() {
+  virtual void Next() override {
     assert(Valid());
     iter_->Next();
     valid_ = iter_->Valid();
   }
-  virtual void Prev() {
+  virtual void Prev() override {
     assert(Valid());
     iter_->Prev();
     valid_ = iter_->Valid();
   }
-  virtual Slice key() const {
+  virtual Slice key() const override {
     assert(Valid());
     return GetLengthPrefixedSlice(iter_->key());
   }
-  virtual Slice value() const {
+  virtual Slice value() const override {
     assert(Valid());
     Slice key_slice = GetLengthPrefixedSlice(iter_->key());
     return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
   }
 
-  virtual Status status() const { return Status::OK(); }
+  virtual Status status() const override { return Status::OK(); }
 
  private:
   DynamicBloom* bloom_;
   const SliceTransform* const prefix_extractor_;
-  std::unique_ptr<MemTableRep::Iterator> iter_;
+  MemTableRep::Iterator* iter_;
   bool valid_;
+  bool arena_mode_;
 
   // No copying allowed
   MemTableIterator(const MemTableIterator&);
   void operator=(const MemTableIterator&);
 };
 
-Iterator* MemTable::NewIterator(const ReadOptions& options,
-    bool enforce_total_order) {
-  return new MemTableIterator(*this, options, enforce_total_order);
+Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) {
+  assert(arena != nullptr);
+  auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+  return new (mem) MemTableIterator(*this, read_options, arena);
 }
 
 port::RWMutex* MemTable::GetLock(const Slice& key) {
@@ -252,12 +297,12 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   //  key bytes    : char[internal_key.size()]
   //  value_size   : varint32 of value.size()
   //  value bytes  : char[value.size()]
-  size_t key_size = key.size();
-  size_t val_size = value.size();
-  size_t internal_key_size = key_size + 8;
-  const size_t encoded_len =
-      VarintLength(internal_key_size) + internal_key_size +
-      VarintLength(val_size) + val_size;
+  uint32_t key_size = static_cast<uint32_t>(key.size());
+  uint32_t val_size = static_cast<uint32_t>(value.size());
+  uint32_t internal_key_size = key_size + 8;
+  const uint32_t encoded_len = VarintLength(internal_key_size) +
+                               internal_key_size + VarintLength(val_size) +
+                               val_size;
   char* buf = nullptr;
   KeyHandle handle = table_->Allocate(encoded_len, &buf);
   assert(buf != nullptr);
@@ -271,6 +316,9 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
   table_->Insert(handle);
   num_entries_++;
+  if (type == kTypeDeletion) {
+    num_deletes_++;
+  }
 
   if (prefix_bloom_) {
     assert(prefix_extractor_);
@@ -302,6 +350,7 @@ struct Saver {
   Logger* logger;
   Statistics* statistics;
   bool inplace_update_support;
+  Env* env_;
 };
 }  // namespace
 
@@ -336,9 +385,17 @@ static bool SaveValue(void* arg, const char* entry) {
         *(s->status) = Status::OK();
         if (*(s->merge_in_progress)) {
           assert(merge_operator);
-          if (!merge_operator->FullMerge(s->key->user_key(), &v,
-                                         merge_context->GetOperands(), s->value,
-                                         s->logger)) {
+          bool merge_success = false;
+          {
+            StopWatchNano timer(s->env_, s->statistics != nullptr);
+            PERF_TIMER_GUARD(merge_operator_time_nanos);
+            merge_success = merge_operator->FullMerge(
+                s->key->user_key(), &v, merge_context->GetOperands(), s->value,
+                s->logger);
+            RecordTick(s->statistics, MERGE_OPERATION_TOTAL_TIME,
+                       timer.ElapsedNanos());
+          }
+          if (!merge_success) {
             RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
             *(s->status) =
                 Status::Corruption("Error: Could not perform merge.");
@@ -347,7 +404,7 @@ static bool SaveValue(void* arg, const char* entry) {
           s->value->assign(v.data(), v.size());
         }
         if (s->inplace_update_support) {
-          s->mem->GetLock(s->key->user_key())->Unlock();
+          s->mem->GetLock(s->key->user_key())->ReadUnlock();
         }
         *(s->found_final_value) = true;
         return false;
@@ -356,9 +413,17 @@ static bool SaveValue(void* arg, const char* entry) {
         if (*(s->merge_in_progress)) {
           assert(merge_operator);
           *(s->status) = Status::OK();
-          if (!merge_operator->FullMerge(s->key->user_key(), nullptr,
-                                         merge_context->GetOperands(), s->value,
-                                         s->logger)) {
+          bool merge_success = false;
+          {
+            StopWatchNano timer(s->env_, s->statistics != nullptr);
+            PERF_TIMER_GUARD(merge_operator_time_nanos);
+            merge_success = merge_operator->FullMerge(
+                s->key->user_key(), nullptr, merge_context->GetOperands(),
+                s->value, s->logger);
+            RecordTick(s->statistics, MERGE_OPERATION_TOTAL_TIME,
+                       timer.ElapsedNanos());
+          }
+          if (!merge_success) {
             RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
             *(s->status) =
                 Status::Corruption("Error: Could not perform merge.");
@@ -370,7 +435,16 @@ static bool SaveValue(void* arg, const char* entry) {
         return false;
       }
       case kTypeMerge: {
-        std::string merge_result;  // temporary area for merge results later
+        if (!merge_operator) {
+          *(s->status) = Status::InvalidArgument(
+              "merge_operator is not properly initialized.");
+          // Normally we continue the loop (return true) when we see a merge
+          // operand.  But in case of an error, we should stop the loop
+          // immediately and pretend we have found the value to stop further
+          // seek.  Otherwise, the later call will override this error status.
+          *(s->found_final_value) = true;
+          return false;
+        }
         Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
         *(s->merge_in_progress) = true;
         merge_context->PushOperand(v);
@@ -387,8 +461,13 @@ static bool SaveValue(void* arg, const char* entry) {
 }
 
 bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
-                   MergeContext& merge_context, const Options& options) {
-  PERF_TIMER_AUTO(get_from_memtable_time);
+                   MergeContext* merge_context) {
+  // The sequence number is updated synchronously in version_set.h
+  if (IsEmpty()) {
+    // Avoiding recording stats for speed.
+    return false;
+  }
+  PERF_TIMER_GUARD(get_from_memtable_time);
 
   Slice user_key = key.user_key();
   bool found_final_value = false;
@@ -406,11 +485,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
     saver.value = value;
     saver.status = s;
     saver.mem = this;
-    saver.merge_context = &merge_context;
-    saver.merge_operator = options.merge_operator.get();
-    saver.logger = options.info_log.get();
-    saver.inplace_update_support = options.inplace_update_support;
-    saver.statistics = options.statistics.get();
+    saver.merge_context = merge_context;
+    saver.merge_operator = moptions_.merge_operator;
+    saver.logger = moptions_.info_log;
+    saver.inplace_update_support = moptions_.inplace_update_support;
+    saver.statistics = moptions_.statistics;
+    saver.env_ = env_;
     table_->Get(key, &saver, SaveValue);
   }
 
@@ -418,7 +498,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
   if (!found_final_value && merge_in_progress) {
     *s = Status::MergeInProgress("");
   }
-  PERF_TIMER_STOP(get_from_memtable_time);
   PERF_COUNTER_ADD(get_from_memtable_count, 1);
   return found_final_value;
 }
@@ -430,7 +509,7 @@ void MemTable::Update(SequenceNumber seq,
   Slice mem_key = lkey.memtable_key();
 
   std::unique_ptr<MemTableRep::Iterator> iter(
-    table_->GetIterator(lkey.user_key()));
+      table_->GetDynamicPrefixIterator());
   iter->Seek(lkey.internal_key(), mem_key.data());
 
   if (iter->Valid()) {
@@ -453,8 +532,8 @@ void MemTable::Update(SequenceNumber seq,
       switch (static_cast<ValueType>(tag & 0xff)) {
         case kTypeValue: {
           Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
-          uint32_t prev_size = prev_value.size();
-          uint32_t new_size = value.size();
+          uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+          uint32_t new_size = static_cast<uint32_t>(value.size());
 
           // Update value, if new value size  <= previous value size
           if (new_size <= prev_size ) {
@@ -483,13 +562,12 @@ void MemTable::Update(SequenceNumber seq,
 
 bool MemTable::UpdateCallback(SequenceNumber seq,
                               const Slice& key,
-                              const Slice& delta,
-                              const Options& options) {
+                              const Slice& delta) {
   LookupKey lkey(key, seq);
   Slice memkey = lkey.memtable_key();
 
   std::unique_ptr<MemTableRep::Iterator> iter(
-    table_->GetIterator(lkey.user_key()));
+      table_->GetDynamicPrefixIterator());
   iter->Seek(lkey.internal_key(), memkey.data());
 
   if (iter->Valid()) {
@@ -512,15 +590,15 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
       switch (static_cast<ValueType>(tag & 0xff)) {
         case kTypeValue: {
           Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
-          uint32_t  prev_size = prev_value.size();
+          uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
 
           char* prev_buffer = const_cast<char*>(prev_value.data());
-          uint32_t  new_prev_size = prev_size;
+          uint32_t new_prev_size = prev_size;
 
           std::string str_value;
           WriteLock wl(GetLock(lkey.user_key()));
-          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
-                                                    delta, &str_value);
+          auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+                                                   delta, &str_value);
           if (status == UpdateStatus::UPDATED_INPLACE) {
             // Value already updated by callback.
             assert(new_prev_size <= prev_size);
@@ -533,12 +611,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
                 memcpy(p, prev_buffer, new_prev_size);
               }
             }
-            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
+            RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
             should_flush_ = ShouldFlushNow();
             return true;
           } else if (status == UpdateStatus::UPDATED) {
             Add(seq, kTypeValue, key, Slice(str_value));
-            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
+            RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
             should_flush_ = ShouldFlushNow();
             return true;
           } else if (status == UpdateStatus::UPDATE_FAILED) {
@@ -564,7 +642,7 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
   // reps). By passing in the user key, we allow efficient iterator creation.
   // The iterator only needs to be ordered within the same user key.
   std::unique_ptr<MemTableRep::Iterator> iter(
-      table_->GetIterator(key.user_key()));
+      table_->GetDynamicPrefixIterator());
   iter->Seek(key.internal_key(), memkey.data());
 
   size_t num_successive_merges = 0;
@@ -591,7 +669,7 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
 
 void MemTableRep::Get(const LookupKey& k, void* callback_args,
                       bool (*callback_func)(void* arg, const char* entry)) {
-  auto iter = GetIterator(k.user_key());
+  auto iter = GetDynamicPrefixIterator();
   for (iter->Seek(k.internal_key(), k.memtable_key().data());
        iter->Valid() && callback_func(callback_args, iter->key());
        iter->Next()) {
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
index 7e9af35..aa26b32 100644
--- a/src/rocksdb/db/memtable.h
+++ b/src/rocksdb/db/memtable.h
@@ -10,28 +10,70 @@
 #pragma once
 #include <string>
 #include <memory>
+#include <functional>
 #include <deque>
+#include <vector>
 #include "db/dbformat.h"
 #include "db/skiplist.h"
 #include "db/version_edit.h"
 #include "rocksdb/db.h"
+#include "rocksdb/env.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/immutable_options.h"
+#include "db/memtable_allocator.h"
 #include "util/arena.h"
 #include "util/dynamic_bloom.h"
+#include "util/mutable_cf_options.h"
 
 namespace rocksdb {
 
 class Mutex;
 class MemTableIterator;
 class MergeContext;
+class WriteBuffer;
+
+struct MemTableOptions {
+  explicit MemTableOptions(
+      const ImmutableCFOptions& ioptions,
+      const MutableCFOptions& mutable_cf_options);
+  size_t write_buffer_size;
+  size_t arena_block_size;
+  uint32_t memtable_prefix_bloom_bits;
+  uint32_t memtable_prefix_bloom_probes;
+  size_t memtable_prefix_bloom_huge_page_tlb_size;
+  bool inplace_update_support;
+  size_t inplace_update_num_locks;
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+  size_t max_successive_merges;
+  bool filter_deletes;
+  Statistics* statistics;
+  MergeOperator* merge_operator;
+  Logger* info_log;
+};
 
+// Note:  Many of the methods in this class have comments indicating that
+// external synchromization is required as these methods are not thread-safe.
+// It is up to higher layers of code to decide how to prevent concurrent
+// invokation of these methods.  This is usually done by acquiring either
+// the db mutex or the single writer thread.
+//
+// Some of these methods are documented to only require external
+// synchronization if this memtable is immutable.  Calling MarkImmutable() is
+// not sufficient to guarantee immutability.  It is up to higher layers of
+// code to determine if this MemTable can still be modified by other threads.
+// Eg: The Superversion stores a pointer to the current MemTable (that can
+// be modified) and a separate list of the MemTables that can no longer be
+// written to (aka the 'immutable memtables').
 class MemTable {
  public:
   struct KeyComparator : public MemTableRep::KeyComparator {
     const InternalKeyComparator comparator;
     explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
     virtual int operator()(const char* prefix_len_key1,
-                           const char* prefix_len_key2) const;
+                           const char* prefix_len_key2) const override;
     virtual int operator()(const char* prefix_len_key,
                            const Slice& key) const override;
   };
@@ -39,15 +81,22 @@ class MemTable {
   // MemTables are reference counted.  The initial reference count
   // is zero and the caller must call Ref() at least once.
   explicit MemTable(const InternalKeyComparator& comparator,
-                    const Options& options);
+                    const ImmutableCFOptions& ioptions,
+                    const MutableCFOptions& mutable_cf_options,
+                    WriteBuffer* write_buffer);
 
+  // Do not delete this MemTable unless Unref() indicates it not in use.
   ~MemTable();
 
   // Increase reference count.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   void Ref() { ++refs_; }
 
   // Drop reference count.
-  // If the refcount goes to zero return this memtable, otherwise return null
+  // If the refcount goes to zero return this memtable, otherwise return null.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   MemTable* Unref() {
     --refs_;
     assert(refs_ >= 0);
@@ -61,12 +110,16 @@ class MemTable {
   // data structure.
   //
   // REQUIRES: external synchronization to prevent simultaneous
-  // operations on the same MemTable.
+  // operations on the same MemTable (unless this Memtable is immutable).
   size_t ApproximateMemoryUsage();
 
   // This method heuristically determines if the memtable should continue to
   // host more data.
-  bool ShouldFlush() const { return should_flush_; }
+  bool ShouldScheduleFlush() const {
+    return flush_scheduled_ == false && should_flush_;
+  }
+
+  void MarkFlushScheduled() { flush_scheduled_ = true; }
 
   // Return an iterator that yields the contents of the memtable.
   //
@@ -77,12 +130,17 @@ class MemTable {
   //
   // By default, it returns an iterator for prefix seek if prefix_extractor
   // is configured in Options.
-  Iterator* NewIterator(const ReadOptions& options,
-                        bool enforce_total_order = false);
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        Calling ~Iterator of the iterator will destroy all the states but
+  //        those allocated in arena.
+  Iterator* NewIterator(const ReadOptions& read_options, Arena* arena);
 
   // Add an entry into memtable that maps key to value at the
   // specified sequence number and with the specified type.
   // Typically value will be empty if type==kTypeDeletion.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   void Add(SequenceNumber seq, ValueType type,
            const Slice& key,
            const Slice& value);
@@ -96,7 +154,7 @@ class MemTable {
   //   store MergeInProgress in s, and return false.
   // Else, return false.
   bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext& merge_context, const Options& options);
+           MergeContext* merge_context);
 
   // Attempts to update the new_value inplace, else does normal Add
   // Pseudocode
@@ -105,6 +163,9 @@ class MemTable {
   //       update inplace
   //     else add(key, new_value)
   //   else add(key, new_value)
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   void Update(SequenceNumber seq,
               const Slice& key,
               const Slice& value);
@@ -118,10 +179,12 @@ class MemTable {
   //       update inplace
   //     else add(key, new_value)
   //   else return false
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   bool UpdateCallback(SequenceNumber seq,
                       const Slice& key,
-                      const Slice& delta,
-                      const Options& options);
+                      const Slice& delta);
 
   // Returns the number of successive merge entries starting from the newest
   // entry for the key up to the last non-merge entry or last entry for the
@@ -129,25 +192,50 @@ class MemTable {
   size_t CountSuccessiveMergeEntries(const LookupKey& key);
 
   // Get total number of entries in the mem table.
-  uint64_t GetNumEntries() const { return num_entries_; }
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  uint64_t num_entries() const { return num_entries_; }
+
+  // Get total number of deletes in the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  uint64_t num_deletes() const { return num_deletes_; }
 
   // Returns the edits area that is needed for flushing the memtable
   VersionEdit* GetEdits() { return &edit_; }
 
+  // Returns if there is no entry inserted to the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  bool IsEmpty() const { return first_seqno_ == 0; }
+
   // Returns the sequence number of the first element that was inserted
-  // into the memtable
+  // into the memtable.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
   SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
 
   // Returns the next active logfile number when this memtable is about to
   // be flushed to storage
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
 
   // Sets the next active logfile number when this memtable is about to
   // be flushed to storage
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
 
-  // Notify the underlying storage that no more items will be added
-  void MarkImmutable() { table_->MarkReadOnly(); }
+  // Notify the underlying storage that no more items will be added.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  // After MarkImmutable() is called, you should not attempt to
+  // write anything to this MemTable().  (Ie. do not call Add() or Update()).
+  void MarkImmutable() {
+    table_->MarkReadOnly();
+    allocator_.DoneAllocating();
+  }
 
   // return true if the current MemTableRep supports merge operator.
   bool IsMergeOperatorSupported() const {
@@ -155,7 +243,10 @@ class MemTable {
   }
 
   // return true if the current MemTableRep supports snapshots.
-  bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
+  // inplace update prevents snapshots,
+  bool IsSnapshotSupported() const {
+    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+  }
 
   // Get the lock associated for the key
   port::RWMutex* GetLock(const Slice& key);
@@ -164,10 +255,10 @@ class MemTable {
     return comparator_.comparator;
   }
 
-  const Arena& TEST_GetArena() const { return arena_; }
+  const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
 
  private:
-  // Dynamically check if we can add more incoming entries.
+  // Dynamically check if we can add more incoming entries
   bool ShouldFlushNow() const;
 
   friend class MemTableIterator;
@@ -175,13 +266,15 @@ class MemTable {
   friend class MemTableList;
 
   KeyComparator comparator_;
+  const MemTableOptions moptions_;
   int refs_;
   const size_t kArenaBlockSize;
-  const size_t kWriteBufferSize;
   Arena arena_;
+  MemTableAllocator allocator_;
   unique_ptr<MemTableRep> table_;
 
   uint64_t num_entries_;
+  uint64_t num_deletes_;
 
   // These are used to manage memtable flushes to storage
   bool flush_in_progress_; // started the flush
@@ -210,6 +303,10 @@ class MemTable {
 
   // a flag indicating if a memtable has met the criteria to flush
   bool should_flush_;
+
+  // a flag indicating if flush has been scheduled
+  bool flush_scheduled_;
+  Env* env_;
 };
 
 extern const char* EncodeKey(std::string* scratch, const Slice& target);
diff --git a/src/rocksdb/db/memtable_allocator.cc b/src/rocksdb/db/memtable_allocator.cc
new file mode 100644
index 0000000..d3ecea2
--- /dev/null
+++ b/src/rocksdb/db/memtable_allocator.cc
@@ -0,0 +1,52 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <assert.h>
+
+#include "db/memtable_allocator.h"
+#include "db/writebuffer.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+
+MemTableAllocator::MemTableAllocator(Arena* arena, WriteBuffer* write_buffer)
+    : arena_(arena), write_buffer_(write_buffer), bytes_allocated_(0) {
+}
+
+MemTableAllocator::~MemTableAllocator() {
+  DoneAllocating();
+}
+
+char* MemTableAllocator::Allocate(size_t bytes) {
+  assert(write_buffer_ != nullptr);
+  bytes_allocated_ += bytes;
+  write_buffer_->ReserveMem(bytes);
+  return arena_->Allocate(bytes);
+}
+
+char* MemTableAllocator::AllocateAligned(size_t bytes, size_t huge_page_size,
+                                         Logger* logger) {
+  assert(write_buffer_ != nullptr);
+  bytes_allocated_ += bytes;
+  write_buffer_->ReserveMem(bytes);
+  return arena_->AllocateAligned(bytes, huge_page_size, logger);
+}
+
+void MemTableAllocator::DoneAllocating() {
+  if (write_buffer_ != nullptr) {
+    write_buffer_->FreeMem(bytes_allocated_);
+    write_buffer_ = nullptr;
+  }
+}
+
+size_t MemTableAllocator::BlockSize() const {
+  return arena_->BlockSize();
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/memtable_allocator.h b/src/rocksdb/db/memtable_allocator.h
new file mode 100644
index 0000000..fa8ee12
--- /dev/null
+++ b/src/rocksdb/db/memtable_allocator.h
@@ -0,0 +1,47 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This is used by the MemTable to allocate write buffer memory. It connects
+// to WriteBuffer so we can track and enforce overall write buffer limits.
+
+#pragma once
+#include "util/allocator.h"
+
+namespace rocksdb {
+
+class Arena;
+class Logger;
+class WriteBuffer;
+
+class MemTableAllocator : public Allocator {
+ public:
+  explicit MemTableAllocator(Arena* arena, WriteBuffer* write_buffer);
+  ~MemTableAllocator();
+
+  // Allocator interface
+  char* Allocate(size_t bytes) override;
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr) override;
+  size_t BlockSize() const override;
+
+  // Call when we're finished allocating memory so we can free it from
+  // the write buffer's limit.
+  void DoneAllocating();
+
+ private:
+  Arena* arena_;
+  WriteBuffer* write_buffer_;
+  size_t bytes_allocated_;
+
+  // No copying allowed
+  MemTableAllocator(const MemTableAllocator&);
+  void operator=(const MemTableAllocator&);
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
index 2354219..54473dc 100644
--- a/src/rocksdb/db/memtable_list.cc
+++ b/src/rocksdb/db/memtable_list.cc
@@ -5,14 +5,21 @@
 //
 #include "db/memtable_list.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <string>
 #include "rocksdb/db.h"
 #include "db/memtable.h"
 #include "db/version_set.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
+#include "table/merger.h"
 #include "util/coding.h"
 #include "util/log_buffer.h"
+#include "util/thread_status_util.h"
 
 namespace rocksdb {
 
@@ -61,10 +68,9 @@ int MemTableList::size() const {
 // Return the most recent value found, if any.
 // Operands stores the list of merge operations to apply, so far.
 bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
-                              Status* s, MergeContext& merge_context,
-                              const Options& options) {
+                              Status* s, MergeContext* merge_context) {
   for (auto& memtable : memlist_) {
-    if (memtable->Get(key, value, s, merge_context, options)) {
+    if (memtable->Get(key, value, s, merge_context)) {
       return true;
     }
   }
@@ -72,16 +78,33 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
 }
 
 void MemTableListVersion::AddIterators(const ReadOptions& options,
-                                       std::vector<Iterator*>* iterator_list) {
+                                       std::vector<Iterator*>* iterator_list,
+                                       Arena* arena) {
+  for (auto& m : memlist_) {
+    iterator_list->push_back(m->NewIterator(options, arena));
+  }
+}
+
+void MemTableListVersion::AddIterators(
+    const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
   for (auto& m : memlist_) {
-    iterator_list->push_back(m->NewIterator(options));
+    merge_iter_builder->AddIterator(
+        m->NewIterator(options, merge_iter_builder->GetArena()));
   }
 }
 
 uint64_t MemTableListVersion::GetTotalNumEntries() const {
   uint64_t total_num = 0;
   for (auto& m : memlist_) {
-    total_num += m->GetNumEntries();
+    total_num += m->num_entries();
+  }
+  return total_num;
+}
+
+uint64_t MemTableListVersion::GetTotalNumDeletes() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->num_deletes();
   }
   return total_num;
 }
@@ -105,7 +128,7 @@ void MemTableListVersion::Remove(MemTable* m) {
 bool MemTableList::IsFlushPending() const {
   if ((flush_requested_ && num_flush_not_started_ >= 1) ||
       (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
-    assert(imm_flush_needed.NoBarrier_Load() != nullptr);
+    assert(imm_flush_needed.load(std::memory_order_relaxed));
     return true;
   }
   return false;
@@ -113,6 +136,8 @@ bool MemTableList::IsFlushPending() const {
 
 // Returns the memtables that need to be flushed.
 void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
   const auto& memlist = current_->memlist_;
   for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
     MemTable* m = *it;
@@ -120,7 +145,7 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
       assert(!m->flush_completed_);
       num_flush_not_started_--;
       if (num_flush_not_started_ == 0) {
-        imm_flush_needed.Release_Store(nullptr);
+        imm_flush_needed.store(false, std::memory_order_release);
       }
       m->flush_in_progress_ = true;  // flushing will start very soon
       ret->push_back(m);
@@ -130,12 +155,13 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
 }
 
 void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
-                                         uint64_t file_number,
-                                         std::set<uint64_t>* pending_outputs) {
+                                         uint64_t file_number) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
   assert(!mems.empty());
 
   // If the flush was not successful, then just reset state.
-  // Maybe a suceeding attempt to flush will be successful.
+  // Maybe a succeeding attempt to flush will be successful.
   for (MemTable* m : mems) {
     assert(m->flush_in_progress_);
     assert(m->file_number_ == 0);
@@ -145,19 +171,20 @@ void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
     m->edit_.Clear();
     num_flush_not_started_++;
   }
-  pending_outputs->erase(file_number);
-  imm_flush_needed.Release_Store(reinterpret_cast<void *>(1));
+  imm_flush_needed.store(true, std::memory_order_release);
 }
 
 // Record a successful flush in the manifest file
 Status MemTableList::InstallMemtableFlushResults(
-    ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
-    port::Mutex* mu, Logger* info_log, uint64_t file_number,
-    std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    const autovector<MemTable*>& mems, VersionSet* vset, InstrumentedMutex* mu,
+    uint64_t file_number, autovector<MemTable*>* to_delete,
     Directory* db_directory, LogBuffer* log_buffer) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
   mu->AssertHeld();
 
-  // flush was sucessful
+  // flush was successful
   for (size_t i = 0; i < mems.size(); ++i) {
     // All the edits are associated with the first memtable of this batch.
     assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
@@ -166,7 +193,7 @@ Status MemTableList::InstallMemtableFlushResults(
     mems[i]->file_number_ = file_number;
   }
 
-  // if some other thread is already commiting, then return
+  // if some other thread is already committing, then return
   Status s;
   if (commit_in_progress_) {
     return s;
@@ -184,11 +211,11 @@ Status MemTableList::InstallMemtableFlushResults(
       break;
     }
 
-    LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
-                cfd->GetName().c_str(), (unsigned long)m->file_number_);
+    LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started",
+                cfd->GetName().c_str(), m->file_number_);
 
     // this can release and reacquire the mutex.
-    s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
+    s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory);
 
     // we will be changing the version in the next code path,
     // so we better create a new one, since versions are immutable
@@ -199,34 +226,26 @@ Status MemTableList::InstallMemtableFlushResults(
     uint64_t mem_id = 1;  // how many memtables has been flushed.
     do {
       if (s.ok()) { // commit new state
-        LogToBuffer(log_buffer,
-                    "[%s] Level-0 commit table #%lu: memtable #%lu done",
-                    cfd->GetName().c_str(), (unsigned long)m->file_number_,
-                    (unsigned long)mem_id);
+        LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64
+                                ": memtable #%" PRIu64 " done",
+                    cfd->GetName().c_str(), m->file_number_, mem_id);
         current_->Remove(m);
         assert(m->file_number_ > 0);
 
-        // pending_outputs can be cleared only after the newly created file
-        // has been written to a committed version so that other concurrently
-        // executing compaction threads do not mistakenly assume that this
-        // file is not live.
-        pending_outputs.erase(m->file_number_);
         if (m->Unref() != nullptr) {
           to_delete->push_back(m);
         }
       } else {
         //commit failed. setup state so that we can flush again.
-        Log(info_log,
-            "Level-0 commit table #%lu: memtable #%lu failed",
-            (unsigned long)m->file_number_,
-            (unsigned long)mem_id);
+        LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64
+                                ": memtable #%" PRIu64 " failed",
+                    m->file_number_, mem_id);
         m->flush_completed_ = false;
         m->flush_in_progress_ = false;
         m->edit_.Clear();
         num_flush_not_started_++;
-        pending_outputs.erase(m->file_number_);
         m->file_number_ = 0;
-        imm_flush_needed.Release_Store((void *)1);
+        imm_flush_needed.store(true, std::memory_order_release);
       }
       ++mem_id;
     } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
@@ -249,17 +268,17 @@ void MemTableList::Add(MemTable* m) {
   m->MarkImmutable();
   num_flush_not_started_++;
   if (num_flush_not_started_ == 1) {
-    imm_flush_needed.Release_Store((void *)1);
+    imm_flush_needed.store(true, std::memory_order_release);
   }
 }
 
 // Returns an estimate of the number of bytes of data in use.
 size_t MemTableList::ApproximateMemoryUsage() {
-  size_t size = 0;
+  size_t total_size = 0;
   for (auto& memtable : current_->memlist_) {
-    size += memtable->ApproximateMemoryUsage();
+    total_size += memtable->ApproximateMemoryUsage();
   }
-  return size;
+  return total_size;
 }
 
 void MemTableList::InstallNewVersion() {
diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h
index d85380b..7b75dfa 100644
--- a/src/rocksdb/db/memtable_list.h
+++ b/src/rocksdb/db/memtable_list.h
@@ -15,23 +15,29 @@
 #include "rocksdb/iterator.h"
 
 #include "db/dbformat.h"
+#include "db/filename.h"
 #include "db/skiplist.h"
 #include "db/memtable.h"
 #include "rocksdb/db.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "util/autovector.h"
+#include "util/instrumented_mutex.h"
 #include "util/log_buffer.h"
 
 namespace rocksdb {
 
 class ColumnFamilyData;
 class InternalKeyComparator;
-class Mutex;
+class InstrumentedMutex;
+class MergeIteratorBuilder;
 
 // keeps a list of immutable memtables in a vector. the list is immutable
 // if refcount is bigger than one. It is used as a state for Get() and
 // Iterator code paths
+//
+// This class is not thread-safe.  External synchronization is required
+// (such as holding the db mutex or being on the write thread).
 class MemTableListVersion {
  public:
   explicit MemTableListVersion(MemTableListVersion* old = nullptr);
@@ -44,13 +50,18 @@ class MemTableListVersion {
   // Search all the memtables starting from the most recent one.
   // Return the most recent value found, if any.
   bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext& merge_context, const Options& options);
+           MergeContext* merge_context);
+
+  void AddIterators(const ReadOptions& options,
+                    std::vector<Iterator*>* iterator_list, Arena* arena);
 
   void AddIterators(const ReadOptions& options,
-                    std::vector<Iterator*>* iterator_list);
+                    MergeIteratorBuilder* merge_iter_builder);
 
   uint64_t GetTotalNumEntries() const;
 
+  uint64_t GetTotalNumDeletes() const;
+
  private:
   // REQUIRE: m is mutable memtable
   void Add(MemTable* m);
@@ -69,25 +80,33 @@ class MemTableListVersion {
 // flushes can occur concurrently.  However, they are 'committed'
 // to the manifest in FIFO order to maintain correctness and
 // recoverability from a crash.
+//
+//
+// Other than imm_flush_needed, this class is not thread-safe and requires
+// external synchronization (such as holding the db mutex or being on the
+// write thread.)
 class MemTableList {
  public:
   // A list of memtables.
   explicit MemTableList(int min_write_buffer_number_to_merge)
-      : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+      : imm_flush_needed(false),
+        min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
         current_(new MemTableListVersion()),
         num_flush_not_started_(0),
         commit_in_progress_(false),
         flush_requested_(false) {
-    imm_flush_needed.Release_Store(nullptr);
     current_->Ref();
   }
+
+  // Should not delete MemTableList without making sure MemTableList::current()
+  // is Unref()'d.
   ~MemTableList() {}
 
   MemTableListVersion* current() { return current_; }
 
   // so that background threads can detect non-nullptr pointer to
   // determine whether there is anything more to start flushing.
-  port::AtomicPointer imm_flush_needed;
+  std::atomic<bool> imm_flush_needed;
 
   // Returns the total number of memtables in the list
   int size() const;
@@ -103,18 +122,14 @@ class MemTableList {
   // Reset status of the given memtable list back to pending state so that
   // they can get picked up again on the next round of flush.
   void RollbackMemtableFlush(const autovector<MemTable*>& mems,
-                             uint64_t file_number,
-                             std::set<uint64_t>* pending_outputs);
+                             uint64_t file_number);
 
   // Commit a successful flush in the manifest file
-  Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
-                                     const autovector<MemTable*>& m,
-                                     VersionSet* vset, port::Mutex* mu,
-                                     Logger* info_log, uint64_t file_number,
-                                     std::set<uint64_t>& pending_outputs,
-                                     autovector<MemTable*>* to_delete,
-                                     Directory* db_directory,
-                                     LogBuffer* log_buffer);
+  Status InstallMemtableFlushResults(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      const autovector<MemTable*>& m, VersionSet* vset, InstrumentedMutex* mu,
+      uint64_t file_number, autovector<MemTable*>* to_delete,
+      Directory* db_directory, LogBuffer* log_buffer);
 
   // New memtables are inserted at the front of the list.
   // Takes ownership of the referenced held on *m by the caller of Add().
@@ -123,7 +138,11 @@ class MemTableList {
   // Returns an estimate of the number of bytes of data in use.
   size_t ApproximateMemoryUsage();
 
-  // Request a flush of all existing memtables to storage
+  // Request a flush of all existing memtables to storage.  This will
+  // cause future calls to IsFlushPending() to return true if this list is
+  // non-empty (regardless of the min_write_buffer_number_to_merge
+  // parameter). This flush request will persist until the next time
+  // PickMemtablesToFlush() is called.
   void FlushRequested() { flush_requested_ = true; }
 
   // Copying allowed
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
new file mode 100644
index 0000000..fc4e948
--- /dev/null
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -0,0 +1,414 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <string>
+#include <vector>
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "db/writebuffer.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class DumbLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {}
+  virtual size_t GetLogFileSize() const override { return 0; }
+};
+
+class MemTableListTest : public testing::Test {
+ public:
+  std::string dbname;
+  DB* db;
+  Options options;
+
+  MemTableListTest() : db(nullptr) {
+    dbname = test::TmpDir() + "/memtable_list_test";
+  }
+
+  // Create a test db if not yet created
+  void CreateDB() {
+    if (db == nullptr) {
+      options.create_if_missing = true;
+      DestroyDB(dbname, options);
+      Status s = DB::Open(options, dbname, &db);
+      EXPECT_OK(s);
+    }
+  }
+
+  ~MemTableListTest() {
+    if (db) {
+      delete db;
+      DestroyDB(dbname, options);
+    }
+  }
+
+  // Calls MemTableList::InstallMemtableFlushResults() and sets up all
+  // structures needed to call this function.
+  Status Mock_InstallMemtableFlushResults(
+      MemTableList* list, const MutableCFOptions& mutable_cf_options,
+      const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
+    // Create a mock Logger
+    DumbLogger logger;
+    LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+    // Create a mock VersionSet
+    DBOptions db_options;
+    EnvOptions env_options;
+    shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+    WriteBuffer write_buffer(db_options.db_write_buffer_size);
+    WriteController write_controller;
+
+    CreateDB();
+    VersionSet versions(dbname, &db_options, env_options, table_cache.get(),
+                        &write_buffer, &write_controller);
+
+    // Create mock default ColumnFamilyData
+    ColumnFamilyOptions cf_options;
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+    EXPECT_OK(versions.Recover(column_families, false));
+
+    auto column_family_set = versions.GetColumnFamilySet();
+    auto cfd = column_family_set->GetColumnFamily(0);
+    EXPECT_TRUE(cfd != nullptr);
+
+    // Create dummy mutex.
+    InstrumentedMutex mutex;
+    InstrumentedMutexLock l(&mutex);
+
+    return list->InstallMemtableFlushResults(cfd, mutable_cf_options, m,
+                                             &versions, &mutex, 1, to_delete,
+                                             nullptr, &log_buffer);
+  }
+};
+
+TEST_F(MemTableListTest, Empty) {
+  // Create an empty MemTableList and validate basic functions.
+  MemTableList list(1);
+
+  ASSERT_EQ(0, list.size());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_FALSE(list.IsFlushPending());
+
+  autovector<MemTable*> mems;
+  list.PickMemtablesToFlush(&mems);
+  ASSERT_EQ(0, mems.size());
+
+  autovector<MemTable*> to_delete;
+  list.current()->Unref(&to_delete);
+  ASSERT_EQ(0, to_delete.size());
+}
+
+TEST_F(MemTableListTest, GetTest) {
+  // Create MemTableList
+  int min_write_buffer_number_to_merge = 2;
+  MemTableList list(min_write_buffer_number_to_merge);
+
+  SequenceNumber seq = 1;
+  std::string value;
+  Status s;
+  MergeContext merge_context;
+
+  LookupKey lkey("key1", seq);
+  bool found = list.current()->Get(lkey, &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableCFOptions ioptions(options);
+
+  WriteBuffer wb(options.db_write_buffer_size);
+  MemTable* mem =
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb);
+  mem->Ref();
+
+  // Write some keys to this memtable.
+  mem->Add(++seq, kTypeDeletion, "key1", "");
+  mem->Add(++seq, kTypeValue, "key2", "value2");
+  mem->Add(++seq, kTypeValue, "key1", "value1");
+  mem->Add(++seq, kTypeValue, "key2", "value2.2");
+
+  // Fetch the newly written keys
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value1");
+
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key1", 2), &value, &s, &merge_context);
+  // MemTable found out that this key is *not* found (at this sequence#)
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value2.2");
+
+  ASSERT_EQ(4, mem->num_entries());
+  ASSERT_EQ(1, mem->num_deletes());
+
+  // Add memtable to list
+  list.Add(mem);
+
+  SequenceNumber saved_seq = seq;
+
+  // Create another memtable and write some keys to it
+  WriteBuffer wb2(options.db_write_buffer_size);
+  MemTable* mem2 =
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb2);
+  mem2->Ref();
+
+  mem2->Add(++seq, kTypeDeletion, "key1", "");
+  mem2->Add(++seq, kTypeValue, "key2", "value2.3");
+
+  // Add second memtable to list
+  list.Add(mem2);
+
+  // Fetch keys via MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context);
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = list.current()->Get(LookupKey("key1", saved_seq), &value, &s,
+                              &merge_context);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ("value1", value);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value2.3");
+
+  merge_context.Clear();
+  found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  ASSERT_EQ(2, list.size());
+
+  autovector<MemTable*> to_delete;
+  list.current()->Unref(&to_delete);
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+}
+
+TEST_F(MemTableListTest, FlushPendingTest) {
+  const int num_tables = 5;
+  SequenceNumber seq = 1;
+  Status s;
+
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableCFOptions ioptions(options);
+  InternalKeyComparator cmp(BytewiseComparator());
+  WriteBuffer wb(options.db_write_buffer_size);
+
+  // Create MemTableList
+  int min_write_buffer_number_to_merge = 3;
+  MemTableList list(min_write_buffer_number_to_merge);
+
+  // Create some MemTables
+  std::vector<MemTable*> tables;
+  MutableCFOptions mutable_cf_options(options, ioptions);
+  for (int i = 0; i < num_tables; i++) {
+    MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb);
+    mem->Ref();
+
+    std::string value;
+    MergeContext merge_context;
+
+    mem->Add(++seq, kTypeValue, "key1", std::to_string(i));
+    mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i), "valueN");
+    mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value");
+    mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i), "valueM");
+    mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "");
+
+    tables.push_back(mem);
+  }
+
+  // Nothing to flush
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  autovector<MemTable*> to_flush;
+  list.PickMemtablesToFlush(&to_flush);
+  ASSERT_EQ(0, to_flush.size());
+
+  // Request a flush even though there is nothing to flush
+  list.FlushRequested();
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Attempt to 'flush' to clear request for flush
+  list.PickMemtablesToFlush(&to_flush);
+  ASSERT_EQ(0, to_flush.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Request a flush again
+  list.FlushRequested();
+  // No flush pending since the list is empty.
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Add 2 tables
+  list.Add(tables[0]);
+  list.Add(tables[1]);
+  ASSERT_EQ(2, list.size());
+
+  // Even though we have less than the minimum to flush, a flush is
+  // pending since we had previously requested a flush and never called
+  // PickMemtablesToFlush() to clear the flush.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(&to_flush);
+  ASSERT_EQ(2, to_flush.size());
+  ASSERT_EQ(2, list.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Revert flush
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another table
+  list.Add(tables[2]);
+  // We now have the minimum to flush regardles of whether FlushRequested()
+  // was called.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(&to_flush);
+  ASSERT_EQ(3, to_flush.size());
+  ASSERT_EQ(3, list.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush2;
+  list.PickMemtablesToFlush(&to_flush2);
+  ASSERT_EQ(0, to_flush2.size());
+  ASSERT_EQ(3, list.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Add another table
+  list.Add(tables[3]);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Request a flush again
+  list.FlushRequested();
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  list.PickMemtablesToFlush(&to_flush2);
+  ASSERT_EQ(1, to_flush2.size());
+  ASSERT_EQ(4, list.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Rollback first pick of tables
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another tables
+  list.Add(tables[4]);
+  ASSERT_EQ(5, list.size());
+  // We now have the minimum to flush regardles of whether FlushRequested()
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(&to_flush);
+  // Should pick 4 of 5 since 1 table has been picked in to_flush2
+  ASSERT_EQ(4, to_flush.size());
+  ASSERT_EQ(5, list.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush3;
+  ASSERT_EQ(0, to_flush3.size());  // nothing not in progress of being flushed
+  ASSERT_EQ(5, list.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  autovector<MemTable*> to_delete;
+
+  // Flush the 4 memtables that were picked in to_flush
+  s = Mock_InstallMemtableFlushResults(
+      &list, MutableCFOptions(options, ioptions), to_flush, &to_delete);
+  ASSERT_OK(s);
+
+  // Note:  now to_flush contains tables[0,1,2,4].  to_flush2 contains
+  // tables[3].
+  // Current implementation will only commit memtables in the order they were
+  // created.  So InstallMemtableFlushResults will install the first 3 tables
+  // in to_flush and stop when it encounters a table not yet flushed.
+  ASSERT_EQ(3, to_delete.size());
+  ASSERT_EQ(2, list.size());
+
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling InstallMemtableFlushResults.
+    // Verify this, by Ref'ing then UnRef'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
+
+  // Request a flush again. Should be nothing to flush
+  list.FlushRequested();
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Flush the 1 memtable that was picked in to_flush2
+  s = MemTableListTest::Mock_InstallMemtableFlushResults(
+      &list, MutableCFOptions(options, ioptions), to_flush2, &to_delete);
+  ASSERT_OK(s);
+
+  // This will actually intall 2 tables.  The 1 we told it to flush, and also
+  // tables[4] which has been waiting for tables[3] to commit.
+  ASSERT_EQ(2, to_delete.size());
+  ASSERT_EQ(0, list.size());
+
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling InstallMemtableFlushResults.
+    // Verify this, by Ref'ing then UnRef'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
+
+  list.current()->Unref(&to_delete);
+  ASSERT_EQ(0, to_delete.size());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/memtablerep_bench.cc b/src/rocksdb/db/memtablerep_bench.cc
new file mode 100644
index 0000000..feb3723
--- /dev/null
+++ b/src/rocksdb/db/memtablerep_bench.cc
@@ -0,0 +1,694 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#define __STDC_FORMAT_MACROS
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <gflags/gflags.h>
+
+#include <atomic>
+#include <iostream>
+#include <memory>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/writebuffer.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice_transform.h"
+#include "util/arena.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/testutil.h"
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::RegisterFlagValidator;
+using GFLAGS::SetUsageMessage;
+
+DEFINE_string(benchmarks, "fillrandom",
+              "Comma-separated list of benchmarks to run. Options:\n"
+              "\tfillrandom             -- write N random values\n"
+              "\tfillseq                -- write N values in sequential order\n"
+              "\treadrandom             -- read N values in random order\n"
+              "\treadseq                -- scan the DB\n"
+              "\treadwrite              -- 1 thread writes while N - 1 threads "
+              "do random\n"
+              "\t                          reads\n"
+              "\tseqreadwrite           -- 1 thread writes while N - 1 threads "
+              "do scans\n");
+
+DEFINE_string(memtablerep, "skiplist",
+              "Which implementation of memtablerep to use. See "
+              "include/memtablerep.h for\n"
+              "  more details. Options:\n"
+              "\tskiplist            -- backed by a skiplist\n"
+              "\tvector              -- backed by an std::vector\n"
+              "\thashskiplist        -- backed by a hash skip list\n"
+              "\thashlinklist        -- backed by a hash linked list\n"
+              "\tcuckoo              -- backed by a cuckoo hash table");
+
+DEFINE_int64(bucket_count, 1000000,
+             "bucket_count parameter to pass into NewHashSkiplistRepFactory or "
+             "NewHashLinkListRepFactory");
+
+DEFINE_int32(
+    hashskiplist_height, 4,
+    "skiplist_height parameter to pass into NewHashSkiplistRepFactory");
+
+DEFINE_int32(
+    hashskiplist_branching_factor, 4,
+    "branching_factor parameter to pass into NewHashSkiplistRepFactory");
+
+DEFINE_int32(
+    huge_page_tlb_size, 0,
+    "huge_page_tlb_size parameter to pass into NewHashLinkListRepFactory");
+
+DEFINE_int32(bucket_entries_logging_threshold, 4096,
+             "bucket_entries_logging_threshold parameter to pass into "
+             "NewHashLinkListRepFactory");
+
+DEFINE_bool(if_log_bucket_dist_when_flash, true,
+            "if_log_bucket_dist_when_flash parameter to pass into "
+            "NewHashLinkListRepFactory");
+
+DEFINE_int32(
+    threshold_use_skiplist, 256,
+    "threshold_use_skiplist parameter to pass into NewHashLinkListRepFactory");
+
+DEFINE_int64(
+    write_buffer_size, 256,
+    "write_buffer_size parameter to pass into NewHashCuckooRepFactory");
+
+DEFINE_int64(
+    average_data_size, 64,
+    "average_data_size parameter to pass into NewHashCuckooRepFactory");
+
+DEFINE_int64(
+    hash_function_count, 4,
+    "hash_function_count parameter to pass into NewHashCuckooRepFactory");
+
+DEFINE_int32(
+    num_threads, 1,
+    "Number of concurrent threads to run. If the benchmark includes writes,\n"
+    "then at most one thread will be a writer");
+
+DEFINE_int32(num_operations, 1000000,
+             "Number of operations to do for write and random read benchmarks");
+
+DEFINE_int32(num_scans, 10,
+             "Number of times for each thread to scan the memtablerep for "
+             "sequential read "
+             "benchmarks");
+
+DEFINE_int32(item_size, 100, "Number of bytes each item should be");
+
+DEFINE_int32(prefix_length, 8,
+             "Prefix length to pass into NewFixedPrefixTransform");
+
+/* VectorRep settings */
+DEFINE_int64(vectorrep_count, 0,
+             "Number of entries to reserve on VectorRep initialization");
+
+DEFINE_int64(seed, 0,
+             "Seed base for random number generators. "
+             "When 0 it is deterministic.");
+
+namespace rocksdb {
+
+namespace {
+struct CallbackVerifyArgs {
+  bool found;
+  LookupKey* key;
+  MemTableRep* table;
+  InternalKeyComparator* comparator;
+};
+}  // namespace
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+
+ public:
+  RandomGenerator() {
+    Random rnd(301);
+    auto size = (unsigned)std::max(1048576, FLAGS_item_size);
+    test::RandomString(&rnd, size, &data_);
+    pos_ = 0;
+  }
+
+  Slice Generate(unsigned int len) {
+    assert(len <= data_.size());
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+
+enum WriteMode { SEQUENTIAL, RANDOM, UNIQUE_RANDOM };
+
+class KeyGenerator {
+ public:
+  KeyGenerator(Random64* rand, WriteMode mode, uint64_t num)
+      : rand_(rand), mode_(mode), num_(num), next_(0) {
+    if (mode_ == UNIQUE_RANDOM) {
+      // NOTE: if memory consumption of this approach becomes a concern,
+      // we can either break it into pieces and only random shuffle a section
+      // each time. Alternatively, use a bit map implementation
+      // (https://reviews.facebook.net/differential/diff/54627/)
+      values_.resize(num_);
+      for (uint64_t i = 0; i < num_; ++i) {
+        values_[i] = i;
+      }
+      std::shuffle(
+          values_.begin(), values_.end(),
+          std::default_random_engine(static_cast<unsigned int>(FLAGS_seed)));
+    }
+  }
+
+  uint64_t Next() {
+    switch (mode_) {
+      case SEQUENTIAL:
+        return next_++;
+      case RANDOM:
+        return rand_->Next() % num_;
+      case UNIQUE_RANDOM:
+        return values_[next_++];
+    }
+    assert(false);
+    return std::numeric_limits<uint64_t>::max();
+  }
+
+ private:
+  Random64* rand_;
+  WriteMode mode_;
+  const uint64_t num_;
+  uint64_t next_;
+  std::vector<uint64_t> values_;
+};
+
+class BenchmarkThread {
+ public:
+  explicit BenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                           uint64_t* bytes_written, uint64_t* bytes_read,
+                           uint64_t* sequence, uint64_t num_ops,
+                           uint64_t* read_hits)
+      : table_(table),
+        key_gen_(key_gen),
+        bytes_written_(bytes_written),
+        bytes_read_(bytes_read),
+        sequence_(sequence),
+        num_ops_(num_ops),
+        read_hits_(read_hits) {}
+
+  virtual void operator()() = 0;
+  virtual ~BenchmarkThread() {}
+
+ protected:
+  MemTableRep* table_;
+  KeyGenerator* key_gen_;
+  uint64_t* bytes_written_;
+  uint64_t* bytes_read_;
+  uint64_t* sequence_;
+  uint64_t num_ops_;
+  uint64_t* read_hits_;
+  RandomGenerator generator_;
+};
+
+class FillBenchmarkThread : public BenchmarkThread {
+ public:
+  FillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                      uint64_t* bytes_written, uint64_t* bytes_read,
+                      uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  void FillOne() {
+    char* buf = nullptr;
+    auto internal_key_size = 16;
+    auto encoded_len =
+        FLAGS_item_size + VarintLength(internal_key_size) + internal_key_size;
+    KeyHandle handle = table_->Allocate(encoded_len, &buf);
+    assert(buf != nullptr);
+    char* p = EncodeVarint32(buf, internal_key_size);
+    auto key = key_gen_->Next();
+    EncodeFixed64(p, key);
+    p += 8;
+    EncodeFixed64(p, ++(*sequence_));
+    p += 8;
+    Slice bytes = generator_.Generate(FLAGS_item_size);
+    memcpy(p, bytes.data(), FLAGS_item_size);
+    p += FLAGS_item_size;
+    assert(p == buf + encoded_len);
+    table_->Insert(handle);
+    *bytes_written_ += encoded_len;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      FillOne();
+    }
+  }
+};
+
+class ConcurrentFillBenchmarkThread : public FillBenchmarkThread {
+ public:
+  ConcurrentFillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                uint64_t* bytes_written, uint64_t* bytes_read,
+                                uint64_t* sequence, uint64_t num_ops,
+                                uint64_t* read_hits,
+                                std::atomic_int* threads_done)
+      : FillBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                            num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    // # of read threads will be total threads - write threads (always 1). Loop
+    // while all reads complete.
+    while ((*threads_done_).load() < (FLAGS_num_threads - 1)) {
+      FillOne();
+    }
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class ReadBenchmarkThread : public BenchmarkThread {
+ public:
+  ReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                      uint64_t* bytes_written, uint64_t* bytes_read,
+                      uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  static bool callback(void* arg, const char* entry) {
+    CallbackVerifyArgs* callback_args = static_cast<CallbackVerifyArgs*>(arg);
+    assert(callback_args != nullptr);
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if ((callback_args->comparator)->user_comparator()->Compare(
+            Slice(key_ptr, key_length - 8), callback_args->key->user_key()) ==
+        0) {
+      callback_args->found = true;
+    }
+    return false;
+  }
+
+  void ReadOne() {
+    std::string user_key;
+    auto key = key_gen_->Next();
+    PutFixed64(&user_key, key);
+    LookupKey lookup_key(user_key, *sequence_);
+    InternalKeyComparator internal_key_comp(BytewiseComparator());
+    CallbackVerifyArgs verify_args;
+    verify_args.found = false;
+    verify_args.key = &lookup_key;
+    verify_args.table = table_;
+    verify_args.comparator = &internal_key_comp;
+    table_->Get(lookup_key, &verify_args, callback);
+    if (verify_args.found) {
+      *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
+      ++*read_hits_;
+    }
+  }
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOne();
+    }
+  }
+};
+
+class SeqReadBenchmarkThread : public BenchmarkThread {
+ public:
+  SeqReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* bytes_written, uint64_t* bytes_read,
+                         uint64_t* sequence, uint64_t num_ops,
+                         uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  void ReadOneSeq() {
+    std::unique_ptr<MemTableRep::Iterator> iter(table_->GetIterator());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      // pretend to read the value
+      *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
+    }
+    ++*read_hits_;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      { ReadOneSeq(); }
+    }
+  }
+};
+
+class ConcurrentReadBenchmarkThread : public ReadBenchmarkThread {
+ public:
+  ConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                uint64_t* bytes_written, uint64_t* bytes_read,
+                                uint64_t* sequence, uint64_t num_ops,
+                                uint64_t* read_hits,
+                                std::atomic_int* threads_done)
+      : ReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                            num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOne();
+    }
+    ++*threads_done_;
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class SeqConcurrentReadBenchmarkThread : public SeqReadBenchmarkThread {
+ public:
+  SeqConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                   uint64_t* bytes_written,
+                                   uint64_t* bytes_read, uint64_t* sequence,
+                                   uint64_t num_ops, uint64_t* read_hits,
+                                   std::atomic_int* threads_done)
+      : SeqReadBenchmarkThread(table, key_gen, bytes_written, bytes_read,
+                               sequence, num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOneSeq();
+    }
+    ++*threads_done_;
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class Benchmark {
+ public:
+  explicit Benchmark(MemTableRep* table, KeyGenerator* key_gen,
+                     uint64_t* sequence, uint32_t num_threads)
+      : table_(table),
+        key_gen_(key_gen),
+        sequence_(sequence),
+        num_threads_(num_threads) {}
+
+  virtual ~Benchmark() {}
+  virtual void Run() {
+    std::cout << "Number of threads: " << num_threads_ << std::endl;
+    std::vector<std::thread> threads;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_read = 0;
+    uint64_t read_hits = 0;
+    StopWatchNano timer(Env::Default(), true);
+    RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits);
+    auto elapsed_time = static_cast<double>(timer.ElapsedNanos() / 1000);
+    std::cout << "Elapsed time: " << static_cast<int>(elapsed_time) << " us"
+              << std::endl;
+
+    if (bytes_written > 0) {
+      auto MiB_written = static_cast<double>(bytes_written) / (1 << 20);
+      auto write_throughput = MiB_written / (elapsed_time / 1000000);
+      std::cout << "Total bytes written: " << MiB_written << " MiB"
+                << std::endl;
+      std::cout << "Write throughput: " << write_throughput << " MiB/s"
+                << std::endl;
+      auto us_per_op = elapsed_time / num_write_ops_per_thread_;
+      std::cout << "write us/op: " << us_per_op << std::endl;
+    }
+    if (bytes_read > 0) {
+      auto MiB_read = static_cast<double>(bytes_read) / (1 << 20);
+      auto read_throughput = MiB_read / (elapsed_time / 1000000);
+      std::cout << "Total bytes read: " << MiB_read << " MiB" << std::endl;
+      std::cout << "Read throughput: " << read_throughput << " MiB/s"
+                << std::endl;
+      auto us_per_op = elapsed_time / num_read_ops_per_thread_;
+      std::cout << "read us/op: " << us_per_op << std::endl;
+    }
+  }
+
+  virtual void RunThreads(std::vector<std::thread>* threads,
+                          uint64_t* bytes_written, uint64_t* bytes_read,
+                          bool write, uint64_t* read_hits) = 0;
+
+ protected:
+  MemTableRep* table_;
+  KeyGenerator* key_gen_;
+  uint64_t* sequence_;
+  uint64_t num_write_ops_per_thread_;
+  uint64_t num_read_ops_per_thread_;
+  const uint32_t num_threads_;
+};
+
+class FillBenchmark : public Benchmark {
+ public:
+  explicit FillBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, 1) {
+    num_write_ops_per_thread_ = FLAGS_num_operations;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_,
+                        num_write_ops_per_thread_, read_hits)();
+  }
+};
+
+class ReadBenchmark : public Benchmark {
+ public:
+  explicit ReadBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ = FLAGS_num_operations / FLAGS_num_threads;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(
+          ReadBenchmarkThread(table_, key_gen_, bytes_written, bytes_read,
+                              sequence_, num_read_ops_per_thread_, read_hits));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+    std::cout << "read hit%: "
+              << (static_cast<double>(*read_hits) / FLAGS_num_operations) * 100
+              << std::endl;
+  }
+};
+
+class SeqReadBenchmark : public Benchmark {
+ public:
+  explicit SeqReadBenchmark(MemTableRep* table, uint64_t* sequence)
+      : Benchmark(table, nullptr, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ = FLAGS_num_scans;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(SeqReadBenchmarkThread(
+          table_, key_gen_, bytes_written, bytes_read, sequence_,
+          num_read_ops_per_thread_, read_hits));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+  }
+};
+
+template <class ReadThreadType>
+class ReadWriteBenchmark : public Benchmark {
+ public:
+  explicit ReadWriteBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                              uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ =
+        FLAGS_num_threads <= 1
+            ? 0
+            : (FLAGS_num_operations / (FLAGS_num_threads - 1));
+    num_write_ops_per_thread_ = FLAGS_num_operations;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    std::atomic_int threads_done;
+    threads_done.store(0);
+    threads->emplace_back(ConcurrentFillBenchmarkThread(
+        table_, key_gen_, bytes_written, bytes_read, sequence_,
+        num_write_ops_per_thread_, read_hits, &threads_done));
+    for (int i = 1; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(
+          ReadThreadType(table_, key_gen_, bytes_written, bytes_read, sequence_,
+                         num_read_ops_per_thread_, read_hits, &threads_done));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+  }
+};
+
+}  // namespace rocksdb
+
+void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+  fprintf(stdout,
+          "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+  fprintf(stdout,
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+}
+
+int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  PrintWarnings();
+
+  rocksdb::Options options;
+
+  std::unique_ptr<rocksdb::MemTableRepFactory> factory;
+  if (FLAGS_memtablerep == "skiplist") {
+    factory.reset(new rocksdb::SkipListFactory);
+  } else if (FLAGS_memtablerep == "vector") {
+    factory.reset(new rocksdb::VectorRepFactory);
+  } else if (FLAGS_memtablerep == "hashskiplist") {
+    factory.reset(rocksdb::NewHashSkipListRepFactory(
+        FLAGS_bucket_count, FLAGS_hashskiplist_height,
+        FLAGS_hashskiplist_branching_factor));
+    options.prefix_extractor.reset(
+        rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
+  } else if (FLAGS_memtablerep == "hashlinklist") {
+    factory.reset(rocksdb::NewHashLinkListRepFactory(
+        FLAGS_bucket_count, FLAGS_huge_page_tlb_size,
+        FLAGS_bucket_entries_logging_threshold,
+        FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist));
+    options.prefix_extractor.reset(
+        rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
+  } else if (FLAGS_memtablerep == "cuckoo") {
+    factory.reset(rocksdb::NewHashCuckooRepFactory(
+        FLAGS_write_buffer_size, FLAGS_average_data_size,
+        static_cast<uint32_t>(FLAGS_hash_function_count)));
+    options.prefix_extractor.reset(
+        rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
+  } else {
+    fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str());
+    exit(1);
+  }
+
+  rocksdb::InternalKeyComparator internal_key_comp(
+      rocksdb::BytewiseComparator());
+  rocksdb::MemTable::KeyComparator key_comp(internal_key_comp);
+  rocksdb::Arena arena;
+  rocksdb::WriteBuffer wb(FLAGS_write_buffer_size);
+  rocksdb::MemTableAllocator memtable_allocator(&arena, &wb);
+  uint64_t sequence;
+  auto createMemtableRep = [&] {
+    sequence = 0;
+    return factory->CreateMemTableRep(key_comp, &memtable_allocator,
+                                      options.prefix_extractor.get(),
+                                      options.info_log.get());
+  };
+  std::unique_ptr<rocksdb::MemTableRep> memtablerep;
+  rocksdb::Random64 rng(FLAGS_seed);
+  const char* benchmarks = FLAGS_benchmarks.c_str();
+  while (benchmarks != nullptr) {
+    std::unique_ptr<rocksdb::KeyGenerator> key_gen;
+    const char* sep = strchr(benchmarks, ',');
+    rocksdb::Slice name;
+    if (sep == nullptr) {
+      name = benchmarks;
+      benchmarks = nullptr;
+    } else {
+      name = rocksdb::Slice(benchmarks, sep - benchmarks);
+      benchmarks = sep + 1;
+    }
+    std::unique_ptr<rocksdb::Benchmark> benchmark;
+    if (name == rocksdb::Slice("fillseq")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(),
+                                                 key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("fillrandom")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::UNIQUE_RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(),
+                                                 key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("readrandom")) {
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::ReadBenchmark(memtablerep.get(),
+                                                 key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("readseq")) {
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL,
+                                              FLAGS_num_operations));
+      benchmark.reset(
+          new rocksdb::SeqReadBenchmark(memtablerep.get(), &sequence));
+    } else if (name == rocksdb::Slice("readwrite")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::ReadWriteBenchmark<
+          rocksdb::ConcurrentReadBenchmarkThread>(memtablerep.get(),
+                                                  key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("seqreadwrite")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::ReadWriteBenchmark<
+          rocksdb::SeqConcurrentReadBenchmarkThread>(memtablerep.get(),
+                                                     key_gen.get(), &sequence));
+    } else {
+      std::cout << "WARNING: skipping unknown benchmark '" << name.ToString()
+                << std::endl;
+      continue;
+    }
+    std::cout << "Running " << name.ToString() << std::endl;
+    benchmark->Run();
+  }
+
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
index bf483a8..f8609da 100644
--- a/src/rocksdb/db/merge_context.h
+++ b/src/rocksdb/db/merge_context.h
@@ -66,4 +66,3 @@ private:
 };
 
 } // namespace rocksdb
-
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
index 0e36f6a..cd4d456 100644
--- a/src/rocksdb/db/merge_helper.cc
+++ b/src/rocksdb/db/merge_helper.cc
@@ -11,9 +11,45 @@
 #include "util/statistics.h"
 #include <string>
 #include <stdio.h>
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
 
 namespace rocksdb {
 
+// TODO(agiardullo): Clean up merge callsites to use this func
+Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value,
+                                   const std::deque<std::string>& operands,
+                                   const MergeOperator* merge_operator,
+                                   Statistics* statistics, Env* env,
+                                   Logger* logger, std::string* result) {
+  if (operands.size() == 0) {
+    result->assign(value->data(), value->size());
+    return Status::OK();
+  }
+
+  if (merge_operator == nullptr) {
+    return Status::NotSupported("Provide a merge_operator when opening DB");
+  }
+
+  // Setup to time the merge
+  StopWatchNano timer(env, statistics != nullptr);
+  PERF_TIMER_GUARD(merge_operator_time_nanos);
+
+  // Do the merge
+  bool success =
+      merge_operator->FullMerge(key, value, operands, result, logger);
+
+  RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
+             env != nullptr ? timer.ElapsedNanos() : 0);
+
+  if (!success) {
+    RecordTick(statistics, NUMBER_MERGE_FAILURES);
+    return Status::Corruption("Error: Could not perform merge.");
+  }
+
+  return Status::OK();
+}
+
 // PRE:  iter points to the first merge type entry
 // POST: iter points to the first entry beyond the merge process (or the end)
 //       keys_, operands_ are updated to reflect the merge result.
@@ -21,13 +57,16 @@ namespace rocksdb {
 //       operands_ stores the list of merge operands encountered while merging.
 //       keys_[i] corresponds to operands_[i] for each i.
 void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
-                             bool at_bottom, Statistics* stats, int* steps) {
+                             bool at_bottom, Statistics* stats, int* steps,
+                             Env* env_) {
   // Get a copy of the internal key, before it's invalidated by iter->Next()
   // Also maintain the list of merge operands seen.
+  assert(HasOperator());
   keys_.clear();
   operands_.clear();
   keys_.push_front(iter->key().ToString());
   operands_.push_front(iter->value().ToString());
+  assert(user_merge_operator_);
 
   success_ = false;   // Will become true if we hit Put/Delete or bottom
 
@@ -76,20 +115,20 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
       //   => store result in operands_.back() (and update keys_.back())
       //   => change the entry type to kTypeValue for keys_.back()
       // We are done! Return a success if the merge passes.
-      success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
-                                                 operands_, &merge_result,
-                                                 logger_);
+
+      Status s = TimedFullMerge(ikey.user_key, nullptr, operands_,
+                                user_merge_operator_, stats, env_, logger_,
+                                &merge_result);
 
       // We store the result in keys_.back() and operands_.back()
       // if nothing went wrong (i.e.: no operand corruption on disk)
-      if (success_) {
-        std::string& key = keys_.back();  // The original key encountered
+      if (s.ok()) {
+        std::string& original_key =
+            keys_.back();  // The original key encountered
         orig_ikey.type = kTypeValue;
-        UpdateInternalKey(&key[0], key.size(),
+        UpdateInternalKey(&original_key[0], original_key.size(),
                           orig_ikey.sequence, orig_ikey.type);
         swap(operands_.back(), merge_result);
-      } else {
-        RecordTick(stats, NUMBER_MERGE_FAILURES);
       }
 
       // move iter to the next entry (before doing anything else)
@@ -106,21 +145,20 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
       //   => store result in operands_.back() (and update keys_.back())
       //   => change the entry type to kTypeValue for keys_.back()
       // We are done! Success!
-      const Slice value = iter->value();
-      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
-                                                 operands_, &merge_result,
-                                                 logger_);
+      const Slice val = iter->value();
+      Status s =
+          TimedFullMerge(ikey.user_key, &val, operands_, user_merge_operator_,
+                         stats, env_, logger_, &merge_result);
 
       // We store the result in keys_.back() and operands_.back()
       // if nothing went wrong (i.e.: no operand corruption on disk)
-      if (success_) {
-        std::string& key = keys_.back();  // The original key encountered
+      if (s.ok()) {
+        std::string& original_key =
+            keys_.back();  // The original key encountered
         orig_ikey.type = kTypeValue;
-        UpdateInternalKey(&key[0], key.size(),
+        UpdateInternalKey(&original_key[0], original_key.size(),
                           orig_ikey.sequence, orig_ikey.type);
         swap(operands_.back(), merge_result);
-      } else {
-        RecordTick(stats, NUMBER_MERGE_FAILURES);
       }
 
       // move iter to the next entry
@@ -170,14 +208,18 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
     assert(kTypeMerge == orig_ikey.type);
     assert(operands_.size() >= 1);
     assert(operands_.size() == keys_.size());
-    success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
-                                               operands_, &merge_result,
-                                               logger_);
-
+    {
+      StopWatchNano timer(env_, stats != nullptr);
+      PERF_TIMER_GUARD(merge_operator_time_nanos);
+      success_ = user_merge_operator_->FullMerge(
+          orig_ikey.user_key, nullptr, operands_, &merge_result, logger_);
+      RecordTick(stats, MERGE_OPERATION_TOTAL_TIME,
+                 env_ != nullptr ? timer.ElapsedNanos() : 0);
+    }
     if (success_) {
-      std::string& key = keys_.back();  // The original key encountered
+      std::string& original_key = keys_.back();  // The original key encountered
       orig_ikey.type = kTypeValue;
-      UpdateInternalKey(&key[0], key.size(),
+      UpdateInternalKey(&original_key[0], original_key.size(),
                         orig_ikey.sequence, orig_ikey.type);
 
       // The final value() is always stored in operands_.back()
@@ -192,16 +234,25 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
     // merge the stacked merge operands into a single operand.
 
     if (operands_.size() >= 2 &&
-        operands_.size() >= min_partial_merge_operands_ &&
-        user_merge_operator_->PartialMergeMulti(
+        operands_.size() >= min_partial_merge_operands_) {
+      bool merge_success = false;
+      {
+        StopWatchNano timer(env_, stats != nullptr);
+        PERF_TIMER_GUARD(merge_operator_time_nanos);
+        merge_success = user_merge_operator_->PartialMergeMulti(
             orig_ikey.user_key,
             std::deque<Slice>(operands_.begin(), operands_.end()),
-            &merge_result, logger_)) {
-      // Merging of operands (associative merge) was successful.
-      // Replace operands with the merge result
-      operands_.clear();
-      operands_.push_front(std::move(merge_result));
-      keys_.erase(keys_.begin(), keys_.end() - 1);
+            &merge_result, logger_);
+        RecordTick(stats, MERGE_OPERATION_TOTAL_TIME,
+                   env_ != nullptr ? timer.ElapsedNanos() : 0);
+      }
+      if (merge_success) {
+        // Merging of operands (associative merge) was successful.
+        // Replace operands with the merge result
+        operands_.clear();
+        operands_.push_front(std::move(merge_result));
+        keys_.erase(keys_.begin(), keys_.end() - 1);
+      }
     }
   }
 }
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
index fef153e..7722446 100644
--- a/src/rocksdb/db/merge_helper.h
+++ b/src/rocksdb/db/merge_helper.h
@@ -10,6 +10,7 @@
 #include "rocksdb/slice.h"
 #include <string>
 #include <deque>
+#include "rocksdb/env.h"
 
 namespace rocksdb {
 
@@ -34,6 +35,15 @@ class MergeHelper {
         operands_(),
         success_(false) {}
 
+  // Wrapper around MergeOperator::FullMerge() that records perf statistics.
+  // Result of merge will be written to result if status returned is OK.
+  // If operands is empty, the value will simply be copied to result.
+  static Status TimedFullMerge(const Slice& key, const Slice* value,
+                               const std::deque<std::string>& operands,
+                               const MergeOperator* merge_operator,
+                               Statistics* statistics, Env* env, Logger* logger,
+                               std::string* result);
+
   // Merge entries until we hit
   //     - a corrupted key
   //     - a Put/Delete,
@@ -48,7 +58,7 @@ class MergeHelper {
   //                   we could reach the start of the history of this user key.
   void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
                   bool at_bottom = false, Statistics* stats = nullptr,
-                  int* steps = nullptr);
+                  int* steps = nullptr, Env* env_ = nullptr);
 
   // Query the merge result
   // These are valid until the next MergeUntil call
@@ -78,13 +88,16 @@ class MergeHelper {
   //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
   //                So keys().back() was the first key seen by iterator.
   // TODO: Re-style this comment to be like the first one
-  bool IsSuccess() { return success_; }
-  Slice key() { assert(success_); return Slice(keys_.back()); }
-  Slice value() { assert(success_); return Slice(operands_.back()); }
-  const std::deque<std::string>& keys() { assert(!success_); return keys_; }
-  const std::deque<std::string>& values() {
+  bool IsSuccess() const { return success_; }
+  Slice key() const { assert(success_); return Slice(keys_.back()); }
+  Slice value() const { assert(success_); return Slice(operands_.back()); }
+  const std::deque<std::string>& keys() const {
+    assert(!success_); return keys_;
+  }
+  const std::deque<std::string>& values() const {
     assert(!success_); return operands_;
   }
+  bool HasOperator() const { return user_merge_operator_ != nullptr; }
 
  private:
   const Comparator* user_comparator_;
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
index 9bdf543..2fa7fae 100644
--- a/src/rocksdb/db/merge_test.cc
+++ b/src/rocksdb/db/merge_test.cc
@@ -12,26 +12,22 @@
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
 #include "db/dbformat.h"
 #include "db/db_impl.h"
 #include "db/write_batch_internal.h"
 #include "utilities/merge_operators.h"
 #include "util/testharness.h"
-#include "utilities/db_ttl.h"
 
 using namespace std;
 using namespace rocksdb;
 
 namespace {
-  int numMergeOperatorCalls;
-  void resetNumMergeOperatorCalls() {
-    numMergeOperatorCalls = 0;
-  }
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
 
-  int num_partial_merge_calls;
-  void resetNumPartialMergeCalls() {
-    num_partial_merge_calls = 0;
-  }
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
 }
 
 class CountMergeOperator : public AssociativeMergeOperator {
@@ -45,7 +41,7 @@ class CountMergeOperator : public AssociativeMergeOperator {
                      const Slice& value,
                      std::string* new_value,
                      Logger* logger) const override {
-    ++numMergeOperatorCalls;
+    ++num_merge_operator_calls;
     if (existing_value == nullptr) {
       new_value->assign(value.data(), value.size());
       return true;
@@ -61,7 +57,8 @@ class CountMergeOperator : public AssociativeMergeOperator {
 
   virtual bool PartialMergeMulti(const Slice& key,
                                  const std::deque<Slice>& operand_list,
-                                 std::string* new_value, Logger* logger) const {
+                                 std::string* new_value,
+                                 Logger* logger) const override {
     ++num_partial_merge_calls;
     return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
                                              logger);
@@ -212,7 +209,7 @@ class Counters {
   void assert_add(const string& key, uint64_t value) {
     int result = add(key, value);
     assert(result);
-    if (result == 0) exit(1); // Disable unused variable warning. 
+    if (result == 0) exit(1); // Disable unused variable warning.
   }
 };
 
@@ -307,31 +304,31 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) {
   }
 }
 
-void testSuccessiveMerge(
-    Counters& counters, int max_num_merges, int num_merges) {
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+                         size_t num_merges) {
 
   counters.assert_remove("z");
   uint64_t sum = 0;
 
-  for (int i = 1; i <= num_merges; ++i) {
+  for (size_t i = 1; i <= num_merges; ++i) {
     resetNumMergeOperatorCalls();
     counters.assert_add("z", i);
     sum += i;
 
     if (i % (max_num_merges + 1) == 0) {
-      assert(numMergeOperatorCalls == max_num_merges + 1);
+      assert(num_merge_operator_calls == max_num_merges + 1);
     } else {
-      assert(numMergeOperatorCalls == 0);
+      assert(num_merge_operator_calls == 0);
     }
 
     resetNumMergeOperatorCalls();
     assert(counters.assert_get("z") == sum);
-    assert(numMergeOperatorCalls == i % (max_num_merges + 1));
+    assert(num_merge_operator_calls == i % (max_num_merges + 1));
   }
 }
 
-void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
-                      int count) {
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+                      size_t min_merge, size_t count) {
   FlushOptions o;
   o.wait = true;
 
@@ -339,7 +336,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
   //              operands exceeds the threshold.
   uint64_t tmp_sum = 0;
   resetNumPartialMergeCalls();
-  for (int i = 1; i <= count; i++) {
+  for (size_t i = 1; i <= count; i++) {
     counters->assert_add("b", i);
     tmp_sum += i;
   }
@@ -348,7 +345,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
   ASSERT_EQ(tmp_sum, counters->assert_get("b"));
   if (count > max_merge) {
     // in this case, FullMerge should be called instead.
-    ASSERT_EQ(num_partial_merge_calls, 0);
+    ASSERT_EQ(num_partial_merge_calls, 0U);
   } else {
     // if count >= min_merge, then partial merge should be called once.
     ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
@@ -358,20 +355,18 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
   resetNumPartialMergeCalls();
   tmp_sum = 0;
   db->Put(rocksdb::WriteOptions(), "c", "10");
-  for (int i = 1; i <= count; i++) {
+  for (size_t i = 1; i <= count; i++) {
     counters->assert_add("c", i);
     tmp_sum += i;
   }
   db->Flush(o);
   db->CompactRange(nullptr, nullptr);
   ASSERT_EQ(tmp_sum, counters->assert_get("c"));
-  ASSERT_EQ(num_partial_merge_calls, 0);
+  ASSERT_EQ(num_partial_merge_calls, 0U);
 }
 
-void testSingleBatchSuccessiveMerge(
-    DB* db,
-    int max_num_merges,
-    int num_merges) {
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+                                    size_t num_merges) {
   assert(num_merges > max_num_merges);
 
   Slice key("BatchSuccessiveMerge");
@@ -380,7 +375,7 @@ void testSingleBatchSuccessiveMerge(
 
   // Create the batch
   WriteBatch batch;
-  for (int i = 0; i < num_merges; ++i) {
+  for (size_t i = 0; i < num_merges; ++i) {
     batch.Merge(key, merge_value_slice);
   }
 
@@ -390,8 +385,9 @@ void testSingleBatchSuccessiveMerge(
     Status s = db->Write(WriteOptions(), &batch);
     assert(s.ok());
   }
-  assert(numMergeOperatorCalls ==
-      num_merges - (num_merges % (max_num_merges + 1)));
+  ASSERT_EQ(
+      num_merge_operator_calls,
+      static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
 
   // Get the value
   resetNumMergeOperatorCalls();
@@ -403,18 +399,11 @@ void testSingleBatchSuccessiveMerge(
   assert(get_value_str.size() == sizeof(uint64_t));
   uint64_t get_value = DecodeFixed64(&get_value_str[0]);
   ASSERT_EQ(get_value, num_merges * merge_value);
-  ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
+  ASSERT_EQ(num_merge_operator_calls,
+            static_cast<size_t>((num_merges % (max_num_merges + 1))));
 }
 
 void runTest(int argc, const string& dbname, const bool use_ttl = false) {
-  auto db = OpenDb(dbname, use_ttl);
-
-  {
-    cout << "Test read-modify-write counters... \n";
-    Counters counters(db, 0);
-    testCounters(counters, db.get(), true);
-  }
-
   bool compact = false;
   if (argc > 1) {
     compact = true;
@@ -422,13 +411,22 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) {
   }
 
   {
-    cout << "Test merge-based counters... \n";
-    MergeBasedCounters counters(db, 0);
-    testCounters(counters, db.get(), compact);
+    auto db = OpenDb(dbname, use_ttl);
+
+    {
+      cout << "Test read-modify-write counters... \n";
+      Counters counters(db, 0);
+      testCounters(counters, db.get(), true);
+    }
+
+    {
+      cout << "Test merge-based counters... \n";
+      MergeBasedCounters counters(db, 0);
+      testCounters(counters, db.get(), compact);
+    }
   }
 
   DestroyDB(dbname, Options());
-  db.reset();
 
   {
     cout << "Test merge in memtable... \n";
@@ -460,6 +458,41 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) {
       }
     }
   }
+
+  {
+    cout << "Test merge-operator not set after reopen\n";
+    {
+      auto db = OpenDb(dbname);
+      MergeBasedCounters counters(db, 0);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      db->CompactRange(nullptr, nullptr);
+    }
+
+    DB* reopen_db;
+    ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
+    std::string value;
+    ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok()));
+    delete reopen_db;
+    DestroyDB(dbname, Options());
+  }
+
+  /* Temporary remove this test
+  {
+    cout << "Test merge-operator not set after reopen (recovery case)\n";
+    {
+      auto db = OpenDb(dbname);
+      MergeBasedCounters counters(db, 0);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+    }
+
+    DB* reopen_db;
+    ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
+  }
+  */
 }
 }  // namespace
 
diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc
index a182fb5..359562a 100644
--- a/src/rocksdb/db/perf_context_test.cc
+++ b/src/rocksdb/db/perf_context_test.cc
@@ -6,7 +6,6 @@
 #include <algorithm>
 #include <iostream>
 #include <vector>
-#include "/usr/include/valgrind/callgrind.h"
 
 #include "rocksdb/db.h"
 #include "rocksdb/perf_context.h"
@@ -15,6 +14,8 @@
 #include "util/histogram.h"
 #include "util/stop_watch.h"
 #include "util/testharness.h"
+#include "util/thread_status_util.h"
+#include "util/string_util.h"
 
 
 bool FLAGS_random_key = false;
@@ -23,13 +24,14 @@ int FLAGS_total_keys = 100;
 int FLAGS_write_buffer_size = 1000000000;
 int FLAGS_max_write_buffer_number = 8;
 int FLAGS_min_write_buffer_number_to_merge = 7;
+bool FLAGS_verbose = false;
 
 // Path to the database on file system
 const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
 
 namespace rocksdb {
 
-std::shared_ptr<DB> OpenDb() {
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
     DB* db;
     Options options;
     options.create_if_missing = true;
@@ -39,44 +41,51 @@ std::shared_ptr<DB> OpenDb() {
       FLAGS_min_write_buffer_number_to_merge;
 
     if (FLAGS_use_set_based_memetable) {
-      auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
-      options.memtable_factory.reset(
-          NewHashSkipListRepFactory(prefix_extractor));
+#ifndef ROCKSDB_LITE
+      options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(0));
+      options.memtable_factory.reset(NewHashSkipListRepFactory());
+#endif  // ROCKSDB_LITE
     }
 
-    Status s = DB::Open(options, kDbName,  &db);
-    ASSERT_OK(s);
+    Status s;
+    if (!read_only) {
+      s = DB::Open(options, kDbName, &db);
+    } else {
+      s = DB::OpenForReadOnly(options, kDbName, &db);
+    }
+    EXPECT_OK(s);
     return std::shared_ptr<DB>(db);
 }
 
-class PerfContextTest { };
+class PerfContextTest : public testing::Test {};
 
-TEST(PerfContextTest, SeekIntoDeletion) {
+TEST_F(PerfContextTest, SeekIntoDeletion) {
   DestroyDB(kDbName, Options());
   auto db = OpenDb();
   WriteOptions write_options;
   ReadOptions read_options;
 
   for (int i = 0; i < FLAGS_total_keys; ++i) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
 
     db->Put(write_options, key, value);
   }
 
   for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
-    std::string key = "k" + std::to_string(i);
+    std::string key = "k" + ToString(i);
     db->Delete(write_options, key);
   }
 
   HistogramImpl hist_get;
   HistogramImpl hist_get_time;
   for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
-    std::string key = "k" + std::to_string(i);
+    std::string key = "k" + ToString(i);
     std::string value;
 
     perf_context.Reset();
-    StopWatchNano timer(Env::Default(), true);
+    StopWatchNano timer(Env::Default());
+    timer.Start();
     auto status = db->Get(read_options, key, &value);
     auto elapsed_nanos = timer.ElapsedNanos();
     ASSERT_TRUE(status.IsNotFound());
@@ -84,51 +93,67 @@ TEST(PerfContextTest, SeekIntoDeletion) {
     hist_get_time.Add(elapsed_nanos);
   }
 
-  std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
-            << "Get time: \n" << hist_get_time.ToString();
+  if (FLAGS_verbose) {
+    std::cout << "Get user key comparison: \n" << hist_get.ToString()
+              << "Get time: \n" << hist_get_time.ToString();
+  }
 
-  HistogramImpl hist_seek_to_first;
-  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  {
+    HistogramImpl hist_seek_to_first;
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
 
-  perf_context.Reset();
-  StopWatchNano timer(Env::Default(), true);
-  iter->SeekToFirst();
-  hist_seek_to_first.Add(perf_context.user_key_comparison_count);
-  auto elapsed_nanos = timer.ElapsedNanos();
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->SeekToFirst();
+    hist_seek_to_first.Add(perf_context.user_key_comparison_count);
+    auto elapsed_nanos = timer.ElapsedNanos();
 
-  std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
-            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
-            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
-            << "elapsed: " << elapsed_nanos << "\n";
+    if (FLAGS_verbose) {
+      std::cout << "SeekToFirst uesr key comparison: \n"
+                << hist_seek_to_first.ToString()
+                << "ikey skipped: " << perf_context.internal_key_skipped_count
+                << "\n"
+                << "idelete skipped: "
+                << perf_context.internal_delete_skipped_count << "\n"
+                << "elapsed: " << elapsed_nanos << "\n";
+    }
+  }
 
   HistogramImpl hist_seek;
   for (int i = 0; i < FLAGS_total_keys; ++i) {
     std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
-    std::string key = "k" + std::to_string(i);
+    std::string key = "k" + ToString(i);
 
     perf_context.Reset();
     StopWatchNano timer(Env::Default(), true);
     iter->Seek(key);
     auto elapsed_nanos = timer.ElapsedNanos();
     hist_seek.Add(perf_context.user_key_comparison_count);
-    std::cout << "seek cmp: " << perf_context.user_key_comparison_count
-              << " ikey skipped " << perf_context.internal_key_skipped_count
-              << " idelete skipped " << perf_context.internal_delete_skipped_count
-              << " elapsed: " << elapsed_nanos << "ns\n";
+    if (FLAGS_verbose) {
+      std::cout << "seek cmp: " << perf_context.user_key_comparison_count
+                << " ikey skipped " << perf_context.internal_key_skipped_count
+                << " idelete skipped "
+                << perf_context.internal_delete_skipped_count
+                << " elapsed: " << elapsed_nanos << "ns\n";
+    }
 
     perf_context.Reset();
     ASSERT_TRUE(iter->Valid());
     StopWatchNano timer2(Env::Default(), true);
     iter->Next();
     auto elapsed_nanos2 = timer2.ElapsedNanos();
-    std::cout << "next cmp: " << perf_context.user_key_comparison_count
-              << "elapsed: " << elapsed_nanos2 << "ns\n";
+    if (FLAGS_verbose) {
+      std::cout << "next cmp: " << perf_context.user_key_comparison_count
+                << "elapsed: " << elapsed_nanos2 << "ns\n";
+    }
   }
 
-  std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
+  if (FLAGS_verbose) {
+    std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
+  }
 }
 
-TEST(PerfContextTest, StopWatchNanoOverhead) {
+TEST_F(PerfContextTest, StopWatchNanoOverhead) {
   // profile the timer cost by itself!
   const int kTotalIterations = 1000000;
   std::vector<uint64_t> timings(kTotalIterations);
@@ -143,17 +168,20 @@ TEST(PerfContextTest, StopWatchNanoOverhead) {
     histogram.Add(timing);
   }
 
-  std::cout << histogram.ToString();
+  if (FLAGS_verbose) {
+    std::cout << histogram.ToString();
+  }
 }
 
-TEST(PerfContextTest, StopWatchOverhead) {
+TEST_F(PerfContextTest, StopWatchOverhead) {
   // profile the timer cost by itself!
   const int kTotalIterations = 1000000;
+  uint64_t elapsed = 0;
   std::vector<uint64_t> timings(kTotalIterations);
 
-  StopWatch timer(Env::Default());
+  StopWatch timer(Env::Default(), nullptr, 0, &elapsed);
   for (auto& timing : timings) {
-    timing = timer.ElapsedMicros();
+    timing = elapsed;
   }
 
   HistogramImpl histogram;
@@ -163,10 +191,12 @@ TEST(PerfContextTest, StopWatchOverhead) {
     prev_timing = timing;
   }
 
-  std::cout << histogram.ToString();
+  if (FLAGS_verbose) {
+    std::cout << histogram.ToString();
+  }
 }
 
-void ProfileKeyComparison() {
+void ProfileQueries(bool enabled_time = false) {
   DestroyDB(kDbName, Options());    // Start this test with a fresh DB
 
   auto db = OpenDb();
@@ -175,74 +205,254 @@ void ProfileKeyComparison() {
   ReadOptions read_options;
 
   HistogramImpl hist_put;
+
   HistogramImpl hist_get;
   HistogramImpl hist_get_snapshot;
   HistogramImpl hist_get_memtable;
+  HistogramImpl hist_get_files;
   HistogramImpl hist_get_post_process;
   HistogramImpl hist_num_memtable_checked;
+
+  HistogramImpl hist_mget;
+  HistogramImpl hist_mget_snapshot;
+  HistogramImpl hist_mget_memtable;
+  HistogramImpl hist_mget_files;
+  HistogramImpl hist_mget_post_process;
+  HistogramImpl hist_mget_num_memtable_checked;
+
   HistogramImpl hist_write_pre_post;
   HistogramImpl hist_write_wal_time;
   HistogramImpl hist_write_memtable_time;
 
-  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  uint64_t total_db_mutex_nanos = 0;
+
+  if (FLAGS_verbose) {
+    std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  }
 
   std::vector<int> keys;
+  const int kFlushFlag = -1;
   for (int i = 0; i < FLAGS_total_keys; ++i) {
     keys.push_back(i);
+    if (i == FLAGS_total_keys / 2) {
+      // Issuing a flush in the middle.
+      keys.push_back(kFlushFlag);
+    }
   }
 
   if (FLAGS_random_key) {
     std::random_shuffle(keys.begin(), keys.end());
   }
-
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+  int num_mutex_waited = 0;
   for (const int i : keys) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    if (i == kFlushFlag) {
+      FlushOptions fo;
+      db->Flush(fo);
+      continue;
+    }
+
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::vector<std::string> values;
 
     perf_context.Reset();
     db->Put(write_options, key, value);
+    if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+      ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+    }
     hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
     hist_write_wal_time.Add(perf_context.write_wal_time);
     hist_write_memtable_time.Add(perf_context.write_memtable_time);
     hist_put.Add(perf_context.user_key_comparison_count);
+    total_db_mutex_nanos += perf_context.db_mutex_lock_nanos;
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+  for (const int i : keys) {
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
 
     perf_context.Reset();
     db->Get(read_options, key, &value);
     hist_get_snapshot.Add(perf_context.get_snapshot_time);
     hist_get_memtable.Add(perf_context.get_from_memtable_time);
+    hist_get_files.Add(perf_context.get_from_output_files_time);
     hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
     hist_get_post_process.Add(perf_context.get_post_process_time);
     hist_get.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->MultiGet(read_options, multiget_keys, &values);
+    hist_mget_snapshot.Add(perf_context.get_snapshot_time);
+    hist_mget_memtable.Add(perf_context.get_from_memtable_time);
+    hist_mget_files.Add(perf_context.get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_mget_post_process.Add(perf_context.get_post_process_time);
+    hist_mget.Add(perf_context.user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
+              << "Get uesr key comparison: \n" << hist_get.ToString()
+              << "MultiGet uesr key comparison: \n" << hist_get.ToString();
+    std::cout << "Put(): Pre and Post Process Time: \n"
+              << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+              << hist_write_wal_time.ToString() << "\n"
+              << " Writing Mem Table time: \n"
+              << hist_write_memtable_time.ToString() << "\n"
+              << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n";
+
+    std::cout << "Get(): Time to get snapshot: \n"
+              << hist_get_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_get_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_get_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n" << hist_get_post_process.ToString()
+              << "\n";
+
+    std::cout << "MultiGet(): Time to get snapshot: \n"
+              << hist_mget_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_mget_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_mget_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_mget_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_mget_post_process.ToString() << "\n";
+  }
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_snapshot.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_get_post_process.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+#ifndef NDEBUG
+    ASSERT_GT(total_db_mutex_nanos, 2000U);
+#endif
+  }
+
+  db.reset();
+  db = OpenDb(true);
+
+  hist_get.Clear();
+  hist_get_snapshot.Clear();
+  hist_get_memtable.Clear();
+  hist_get_files.Clear();
+  hist_get_post_process.Clear();
+  hist_num_memtable_checked.Clear();
+
+  hist_mget.Clear();
+  hist_mget_snapshot.Clear();
+  hist_mget_memtable.Clear();
+  hist_mget_files.Clear();
+  hist_mget_post_process.Clear();
+  hist_mget_num_memtable_checked.Clear();
+
+  for (const int i : keys) {
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    perf_context.Reset();
+    db->Get(read_options, key, &value);
+    hist_get_snapshot.Add(perf_context.get_snapshot_time);
+    hist_get_memtable.Add(perf_context.get_from_memtable_time);
+    hist_get_files.Add(perf_context.get_from_output_files_time);
+    hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_get_post_process.Add(perf_context.get_post_process_time);
+    hist_get.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->MultiGet(read_options, multiget_keys, &values);
+    hist_mget_snapshot.Add(perf_context.get_snapshot_time);
+    hist_mget_memtable.Add(perf_context.get_from_memtable_time);
+    hist_mget_files.Add(perf_context.get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_mget_post_process.Add(perf_context.get_post_process_time);
+    hist_mget.Add(perf_context.user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString()
+              << "ReadOnly MultiGet uesr key comparison: \n"
+              << hist_mget.ToString();
+
+    std::cout << "ReadOnly Get(): Time to get snapshot: \n"
+              << hist_get_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_get_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_get_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n" << hist_get_post_process.ToString()
+              << "\n";
+
+    std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+              << hist_mget_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_mget_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_mget_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_mget_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_mget_post_process.ToString() << "\n";
   }
 
-  std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
-            << "Get uesr key comparison: \n" << hist_get.ToString();
-  std::cout << "Put(): Pre and Post Process Time: \n"
-            << hist_write_pre_post.ToString()
-            << " Writing WAL time: \n"
-            << hist_write_wal_time.ToString() << "\n"
-            << " Writing Mem Table time: \n"
-            << hist_write_memtable_time.ToString() << "\n";
-
-  std::cout << "Get(): Time to get snapshot: \n"
-            << hist_get_snapshot.ToString()
-            << " Time to get value from memtables: \n"
-            << hist_get_memtable.ToString() << "\n"
-            << " Number of memtables checked: \n"
-            << hist_num_memtable_checked.ToString() << "\n"
-            << " Time to post process: \n"
-            << hist_get_post_process.ToString() << "\n";
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+    // In read-only mode Get(), no super version operation is needed
+    ASSERT_EQ(hist_get_post_process.Average(), 0);
+    ASSERT_EQ(hist_get_snapshot.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+  }
 }
 
-TEST(PerfContextTest, KeyComparisonCount) {
+TEST_F(PerfContextTest, KeyComparisonCount) {
   SetPerfLevel(kEnableCount);
-  ProfileKeyComparison();
+  ProfileQueries();
 
   SetPerfLevel(kDisable);
-  ProfileKeyComparison();
+  ProfileQueries();
 
   SetPerfLevel(kEnableTime);
-  ProfileKeyComparison();
+  ProfileQueries(true);
 }
 
 // make perf_context_test
@@ -257,13 +467,15 @@ TEST(PerfContextTest, KeyComparisonCount) {
 // memtable. When there are two memtables, even the avg Seek Key comparison
 // starts to become linear to the input size.
 
-TEST(PerfContextTest, SeekKeyComparison) {
+TEST_F(PerfContextTest, SeekKeyComparison) {
   DestroyDB(kDbName, Options());
   auto db = OpenDb();
   WriteOptions write_options;
   ReadOptions read_options;
 
-  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  if (FLAGS_verbose) {
+    std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  }
 
   std::vector<int> keys;
   for (int i = 0; i < FLAGS_total_keys; ++i) {
@@ -281,8 +493,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
   SetPerfLevel(kEnableTime);
   StopWatchNano timer(Env::Default());
   for (const int i : keys) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
 
     perf_context.Reset();
     timer.Start();
@@ -293,16 +505,18 @@ TEST(PerfContextTest, SeekKeyComparison) {
     hist_time_diff.Add(put_time - perf_context.write_wal_time);
   }
 
-  std::cout << "Put time:\n" << hist_put_time.ToString()
-            << "WAL time:\n" << hist_wal_time.ToString()
-            << "time diff:\n" << hist_time_diff.ToString();
+  if (FLAGS_verbose) {
+    std::cout << "Put time:\n" << hist_put_time.ToString() << "WAL time:\n"
+              << hist_wal_time.ToString() << "time diff:\n"
+              << hist_time_diff.ToString();
+  }
 
   HistogramImpl hist_seek;
   HistogramImpl hist_next;
 
   for (int i = 0; i < FLAGS_total_keys; ++i) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
 
     std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
     perf_context.Reset();
@@ -319,13 +533,16 @@ TEST(PerfContextTest, SeekKeyComparison) {
     hist_next.Add(perf_context.user_key_comparison_count);
   }
 
-  std::cout << "Seek:\n" << hist_seek.ToString()
-            << "Next:\n" << hist_next.ToString();
+  if (FLAGS_verbose) {
+    std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n"
+              << hist_next.ToString();
+  }
 }
 
 }
 
 int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
 
   for (int i = 1; i < argc; i++) {
     int n;
@@ -349,10 +566,15 @@ int main(int argc, char** argv) {
       FLAGS_use_set_based_memetable = n;
     }
 
+    if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_verbose = n;
+    }
   }
 
-  std::cout << kDbName << "\n";
+  if (FLAGS_verbose) {
+    std::cout << kDbName << "\n";
+  }
 
-  rocksdb::test::RunAllTests();
-  return 0;
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
index 17e3e61..edcfde7 100644
--- a/src/rocksdb/db/plain_table_db_test.cc
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -23,11 +23,13 @@
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
 #include "table/meta_blocks.h"
+#include "table/bloom_block.h"
 #include "table/plain_table_factory.h"
 #include "table/plain_table_reader.h"
 #include "util/hash.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 #include "utilities/merge_operators.h"
@@ -36,7 +38,7 @@ using std::unique_ptr;
 
 namespace rocksdb {
 
-class PlainTableDBTest {
+class PlainTableDBTest : public testing::Test {
  protected:
  private:
   std::string dbname_;
@@ -48,20 +50,33 @@ class PlainTableDBTest {
  public:
   PlainTableDBTest() : env_(Env::Default()) {
     dbname_ = test::TmpDir() + "/plain_table_db_test";
-    ASSERT_OK(DestroyDB(dbname_, Options()));
+    EXPECT_OK(DestroyDB(dbname_, Options()));
     db_ = nullptr;
     Reopen();
   }
 
   ~PlainTableDBTest() {
     delete db_;
-    ASSERT_OK(DestroyDB(dbname_, Options()));
+    EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
   // Return the current option configuration.
   Options CurrentOptions() {
     Options options;
-    options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3));
+
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 0;
+    plain_table_options.bloom_bits_per_key = 2;
+    plain_table_options.hash_table_ratio = 0.8;
+    plain_table_options.index_sparseness = 3;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPrefix;
+    plain_table_options.full_scan_mode = false;
+    plain_table_options.store_index_in_file = false;
+
+    options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
     options.prefix_extractor.reset(NewFixedPrefixTransform(8));
     options.allow_mmap_reads = true;
     return options;
@@ -135,16 +150,15 @@ class PlainTableDBTest {
 
   int NumTableFilesAtLevel(int level) {
     std::string property;
-    ASSERT_TRUE(
-        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
-                         &property));
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(level), &property));
     return atoi(property.c_str());
   }
 
   // Return spread of files per level
   std::string FilesPerLevel() {
     std::string result;
-    int last_non_zero_offset = 0;
+    size_t last_non_zero_offset = 0;
     for (int level = 0; level < db_->NumberLevels(); level++) {
       int f = NumTableFilesAtLevel(level);
       char buf[100];
@@ -169,26 +183,48 @@ class PlainTableDBTest {
   }
 };
 
-TEST(PlainTableDBTest, Empty) {
+TEST_F(PlainTableDBTest, Empty) {
   ASSERT_TRUE(dbfull() != nullptr);
   ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
 }
 
+extern const uint64_t kPlainTableMagicNumber;
+
 class TestPlainTableReader : public PlainTableReader {
  public:
-  TestPlainTableReader(const EnvOptions& storage_options,
+  TestPlainTableReader(const EnvOptions& env_options,
                        const InternalKeyComparator& icomparator,
-                       uint64_t file_size, int bloom_bits_per_key,
-                       double hash_table_ratio, size_t index_sparseness,
+                       EncodingType encoding_type, uint64_t file_size,
+                       int bloom_bits_per_key, double hash_table_ratio,
+                       size_t index_sparseness,
                        const TableProperties* table_properties,
                        unique_ptr<RandomAccessFile>&& file,
-                       const Options& options, bool* expect_bloom_not_match)
-      : PlainTableReader(options, std::move(file), storage_options, icomparator,
-                         file_size, bloom_bits_per_key, hash_table_ratio,
-                         index_sparseness, table_properties, 2 * 1024 * 1024),
+                       const ImmutableCFOptions& ioptions,
+                       bool* expect_bloom_not_match,
+                       bool store_index_in_file)
+      : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
+                         encoding_type, file_size, table_properties),
         expect_bloom_not_match_(expect_bloom_not_match) {
-    Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
-    ASSERT_TRUE(s.ok());
+    Status s = MmapDataFile();
+    EXPECT_TRUE(s.ok());
+
+    s = PopulateIndex(const_cast<TableProperties*>(table_properties),
+                      bloom_bits_per_key, hash_table_ratio, index_sparseness,
+                      2 * 1024 * 1024);
+    EXPECT_TRUE(s.ok());
+
+    TableProperties* props = const_cast<TableProperties*>(table_properties);
+    if (store_index_in_file) {
+      auto bloom_version_ptr = props->user_collected_properties.find(
+          PlainTablePropertyNames::kBloomVersion);
+      EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
+      EXPECT_EQ(bloom_version_ptr->second, std::string("1"));
+      if (ioptions.bloom_locality > 0) {
+        auto num_blocks_ptr = props->user_collected_properties.find(
+            PlainTablePropertyNames::kNumBloomBlocks);
+        EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
+      }
+    }
   }
 
   virtual ~TestPlainTableReader() {}
@@ -196,7 +232,11 @@ class TestPlainTableReader : public PlainTableReader {
  private:
   virtual bool MatchBloom(uint32_t hash) const override {
     bool ret = PlainTableReader::MatchBloom(hash);
-    ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
+    if (*expect_bloom_not_match_) {
+      EXPECT_TRUE(!ret);
+    } else {
+      EXPECT_TRUE(ret);
+    }
     return ret;
   }
   bool* expect_bloom_not_match_;
@@ -206,30 +246,50 @@ extern const uint64_t kPlainTableMagicNumber;
 class TestPlainTableFactory : public PlainTableFactory {
  public:
   explicit TestPlainTableFactory(bool* expect_bloom_not_match,
-                                 uint32_t user_key_len, int bloom_bits_per_key,
-                                 double hash_table_ratio,
-                                 size_t index_sparseness,
-                                 size_t huge_page_tlb_size)
-      : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
-                          index_sparseness, huge_page_tlb_size),
-        bloom_bits_per_key_(bloom_bits_per_key),
-        hash_table_ratio_(hash_table_ratio),
-        index_sparseness_(index_sparseness),
+                                 const PlainTableOptions& options)
+      : PlainTableFactory(options),
+        bloom_bits_per_key_(options.bloom_bits_per_key),
+        hash_table_ratio_(options.hash_table_ratio),
+        index_sparseness_(options.index_sparseness),
+        store_index_in_file_(options.store_index_in_file),
         expect_bloom_not_match_(expect_bloom_not_match) {}
 
-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                        const EnvOptions& env_options,
                         const InternalKeyComparator& internal_comparator,
                         unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                         unique_ptr<TableReader>* table) const override {
     TableProperties* props = nullptr;
     auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
-                                 options.env, options.info_log.get(), &props);
-    ASSERT_TRUE(s.ok());
+                                 ioptions.env, ioptions.info_log, &props);
+    EXPECT_TRUE(s.ok());
+
+    if (store_index_in_file_) {
+      BlockHandle bloom_block_handle;
+      s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+                        ioptions.env, BloomBlockBuilder::kBloomBlock,
+                        &bloom_block_handle);
+      EXPECT_TRUE(s.ok());
+
+      BlockHandle index_block_handle;
+      s = FindMetaBlock(
+          file.get(), file_size, kPlainTableMagicNumber, ioptions.env,
+          PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
+      EXPECT_TRUE(s.ok());
+    }
+
+    auto& user_props = props->user_collected_properties;
+    auto encoding_type_prop =
+        user_props.find(PlainTablePropertyNames::kEncodingType);
+    assert(encoding_type_prop != user_props.end());
+    EncodingType encoding_type = static_cast<EncodingType>(
+        DecodeFixed32(encoding_type_prop->second.c_str()));
 
     std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
-        soptions, internal_comparator, file_size, bloom_bits_per_key_,
-        hash_table_ratio_, index_sparseness_, props, std::move(file), options,
-        expect_bloom_not_match_));
+        env_options, internal_comparator, encoding_type, file_size,
+        bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
+        std::move(file), ioptions, expect_bloom_not_match_,
+        store_index_in_file_));
 
     *table = std::move(new_reader);
     return s;
@@ -239,69 +299,136 @@ class TestPlainTableFactory : public PlainTableFactory {
   int bloom_bits_per_key_;
   double hash_table_ratio_;
   size_t index_sparseness_;
+  bool store_index_in_file_;
   bool* expect_bloom_not_match_;
 };
 
-TEST(PlainTableDBTest, Flush) {
+TEST_F(PlainTableDBTest, Flush) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (EncodingType encoding_type : {kPlain, kPrefix}) {
     for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
       for (int total_order = 0; total_order <= 1; total_order++) {
-        Options options = CurrentOptions();
-        options.create_if_missing = true;
-        // Set only one bucket to force bucket conflict.
-        // Test index interval for the same prefix to be 1, 2 and 4
-        if (total_order) {
-          options.table_factory.reset(NewTotalOrderPlainTableFactory(
-              16, bloom_bits, 2, huge_page_tlb_size));
-        } else {
-          options.table_factory.reset(NewPlainTableFactory(
-              16, bloom_bits, 0.75, 16, huge_page_tlb_size));
-        }
-        DestroyAndReopen(&options);
-
-        ASSERT_OK(Put("1000000000000foo", "v1"));
-        ASSERT_OK(Put("0000000000000bar", "v2"));
-        ASSERT_OK(Put("1000000000000foo", "v3"));
-        dbfull()->TEST_FlushMemTable();
+        for (int store_index_in_file = 0; store_index_in_file <= 1;
+             ++store_index_in_file) {
+          if (!bloom_bits && store_index_in_file) {
+            continue;
+          }
 
-        TablePropertiesCollection ptc;
-        reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
-        ASSERT_EQ(1U, ptc.size());
-        auto row = ptc.begin();
-        auto tp = row->second;
-        ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
-                                                "plain_table_hash_table_size"));
-        ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
-                                               "plain_table_sub_index_size"));
-
-        ASSERT_EQ("v3", Get("1000000000000foo"));
-        ASSERT_EQ("v2", Get("0000000000000bar"));
+          Options options = CurrentOptions();
+          options.create_if_missing = true;
+          // Set only one bucket to force bucket conflict.
+          // Test index interval for the same prefix to be 1, 2 and 4
+          if (total_order) {
+            options.prefix_extractor.reset();
+
+            PlainTableOptions plain_table_options;
+            plain_table_options.user_key_len = 0;
+            plain_table_options.bloom_bits_per_key = bloom_bits;
+            plain_table_options.hash_table_ratio = 0;
+            plain_table_options.index_sparseness = 2;
+            plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+            plain_table_options.encoding_type = encoding_type;
+            plain_table_options.full_scan_mode = false;
+            plain_table_options.store_index_in_file = store_index_in_file;
+
+            options.table_factory.reset(
+                NewPlainTableFactory(plain_table_options));
+          } else {
+            PlainTableOptions plain_table_options;
+            plain_table_options.user_key_len = 0;
+            plain_table_options.bloom_bits_per_key = bloom_bits;
+            plain_table_options.hash_table_ratio = 0.75;
+            plain_table_options.index_sparseness = 16;
+            plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+            plain_table_options.encoding_type = encoding_type;
+            plain_table_options.full_scan_mode = false;
+            plain_table_options.store_index_in_file = store_index_in_file;
+
+            options.table_factory.reset(
+                NewPlainTableFactory(plain_table_options));
+          }
+          DestroyAndReopen(&options);
+          uint64_t int_num;
+          ASSERT_TRUE(dbfull()->GetIntProperty(
+              "rocksdb.estimate-table-readers-mem", &int_num));
+          ASSERT_EQ(int_num, 0U);
+
+          ASSERT_OK(Put("1000000000000foo", "v1"));
+          ASSERT_OK(Put("0000000000000bar", "v2"));
+          ASSERT_OK(Put("1000000000000foo", "v3"));
+          dbfull()->TEST_FlushMemTable();
+
+          ASSERT_TRUE(dbfull()->GetIntProperty(
+              "rocksdb.estimate-table-readers-mem", &int_num));
+          ASSERT_GT(int_num, 0U);
+
+          TablePropertiesCollection ptc;
+          reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+          ASSERT_EQ(1U, ptc.size());
+          auto row = ptc.begin();
+          auto tp = row->second;
+
+          if (!store_index_in_file) {
+            ASSERT_EQ(total_order ? "4" : "12",
+                      (tp->user_collected_properties)
+                          .at("plain_table_hash_table_size"));
+            ASSERT_EQ("0", (tp->user_collected_properties)
+                               .at("plain_table_sub_index_size"));
+          } else {
+            ASSERT_EQ("0", (tp->user_collected_properties)
+                               .at("plain_table_hash_table_size"));
+            ASSERT_EQ("0", (tp->user_collected_properties)
+                               .at("plain_table_sub_index_size"));
+          }
+          ASSERT_EQ("v3", Get("1000000000000foo"));
+          ASSERT_EQ("v2", Get("0000000000000bar"));
+        }
+        }
       }
     }
   }
 }
 
-TEST(PlainTableDBTest, Flush2) {
+TEST_F(PlainTableDBTest, Flush2) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (EncodingType encoding_type : {kPlain, kPrefix}) {
     for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
       for (int total_order = 0; total_order <= 1; total_order++) {
+        for (int store_index_in_file = 0; store_index_in_file <= 1;
+             ++store_index_in_file) {
+          if (encoding_type == kPrefix && total_order) {
+            continue;
+          }
+          if (!bloom_bits && store_index_in_file) {
+            continue;
+          }
+          if (total_order && store_index_in_file) {
+          continue;
+        }
         bool expect_bloom_not_match = false;
         Options options = CurrentOptions();
         options.create_if_missing = true;
         // Set only one bucket to force bucket conflict.
         // Test index interval for the same prefix to be 1, 2 and 4
+        PlainTableOptions plain_table_options;
         if (total_order) {
           options.prefix_extractor = nullptr;
-          options.table_factory.reset(
-              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
-                                        0, 2, huge_page_tlb_size));
+          plain_table_options.hash_table_ratio = 0;
+          plain_table_options.index_sparseness = 2;
         } else {
-          options.table_factory.reset(
-              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
-                                        0.75, 16, huge_page_tlb_size));
+          plain_table_options.hash_table_ratio = 0.75;
+          plain_table_options.index_sparseness = 16;
         }
+        plain_table_options.user_key_len = kPlainTableVariableLength;
+        plain_table_options.bloom_bits_per_key = bloom_bits;
+        plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+        plain_table_options.encoding_type = encoding_type;
+        plain_table_options.store_index_in_file = store_index_in_file;
+        options.table_factory.reset(new TestPlainTableFactory(
+            &expect_bloom_not_match, plain_table_options));
+
         DestroyAndReopen(&options);
         ASSERT_OK(Put("0000000000000bar", "b"));
         ASSERT_OK(Put("1000000000000foo", "v1"));
@@ -329,7 +456,6 @@ TEST(PlainTableDBTest, Flush2) {
           // Neither key nor value should exist.
           expect_bloom_not_match = true;
           ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
-
           // Key doesn't exist any more but prefix exists.
           if (total_order) {
             ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
@@ -338,15 +464,21 @@ TEST(PlainTableDBTest, Flush2) {
           expect_bloom_not_match = false;
         }
       }
+      }
+    }
     }
   }
 }
 
-TEST(PlainTableDBTest, Iterator) {
+TEST_F(PlainTableDBTest, Iterator) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (EncodingType encoding_type : {kPlain, kPrefix}) {
     for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
       for (int total_order = 0; total_order <= 1; total_order++) {
+        if (encoding_type == kPrefix && total_order == 1) {
+          continue;
+        }
         bool expect_bloom_not_match = false;
         Options options = CurrentOptions();
         options.create_if_missing = true;
@@ -354,13 +486,28 @@ TEST(PlainTableDBTest, Iterator) {
         // Test index interval for the same prefix to be 1, 2 and 4
         if (total_order) {
           options.prefix_extractor = nullptr;
-          options.table_factory.reset(
-              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
-                                        0, 2, huge_page_tlb_size));
+
+          PlainTableOptions plain_table_options;
+          plain_table_options.user_key_len = 16;
+          plain_table_options.bloom_bits_per_key = bloom_bits;
+          plain_table_options.hash_table_ratio = 0;
+          plain_table_options.index_sparseness = 2;
+          plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+          plain_table_options.encoding_type = encoding_type;
+
+          options.table_factory.reset(new TestPlainTableFactory(
+              &expect_bloom_not_match, plain_table_options));
         } else {
-          options.table_factory.reset(
-              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
-                                        0.75, 16, huge_page_tlb_size));
+          PlainTableOptions plain_table_options;
+          plain_table_options.user_key_len = 16;
+          plain_table_options.bloom_bits_per_key = bloom_bits;
+          plain_table_options.hash_table_ratio = 0.75;
+          plain_table_options.index_sparseness = 16;
+          plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+          plain_table_options.encoding_type = encoding_type;
+
+          options.table_factory.reset(new TestPlainTableFactory(
+              &expect_bloom_not_match, plain_table_options));
         }
         DestroyAndReopen(&options);
 
@@ -447,6 +594,7 @@ TEST(PlainTableDBTest, Iterator) {
         delete iter;
       }
     }
+    }
   }
 }
 
@@ -456,9 +604,15 @@ std::string MakeLongKey(size_t length, char c) {
 }
 }  // namespace
 
-TEST(PlainTableDBTest, IteratorLargeKeys) {
+TEST_F(PlainTableDBTest, IteratorLargeKeys) {
   Options options = CurrentOptions();
-  options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
+
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 0;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
   options.create_if_missing = true;
   options.prefix_extractor.reset();
   DestroyAndReopen(&options);
@@ -474,7 +628,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) {
   };
 
   for (size_t i = 0; i < 7; i++) {
-    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+    ASSERT_OK(Put(key_list[i], ToString(i)));
   }
 
   dbfull()->TEST_FlushMemTable();
@@ -485,7 +639,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) {
   for (size_t i = 0; i < 7; i++) {
     ASSERT_TRUE(iter->Valid());
     ASSERT_EQ(key_list[i], iter->key().ToString());
-    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    ASSERT_EQ(ToString(i), iter->value().ToString());
     iter->Next();
   }
 
@@ -494,40 +648,60 @@ TEST(PlainTableDBTest, IteratorLargeKeys) {
   delete iter;
 }
 
-// A test comparator which compare two strings in this way:
-// (1) first compare prefix of 8 bytes in alphabet order,
-// (2) if two strings share the same prefix, sort the other part of the string
-//     in the reverse alphabet order.
-class SimpleSuffixReverseComparator : public Comparator {
- public:
-  SimpleSuffixReverseComparator() {}
+namespace {
+std::string MakeLongKeyWithPrefix(size_t length, char c) {
+  return "00000000" + std::string(length - 8, c);
+}
+}  // namespace
 
-  virtual const char* Name() const { return "SimpleSuffixReverseComparator"; }
+TEST_F(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+  Options options = CurrentOptions();
 
-  virtual int Compare(const Slice& a, const Slice& b) const {
-    Slice prefix_a = Slice(a.data(), 8);
-    Slice prefix_b = Slice(b.data(), 8);
-    int prefix_comp = prefix_a.compare(prefix_b);
-    if (prefix_comp != 0) {
-      return prefix_comp;
-    } else {
-      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
-      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
-      return -(suffix_a.compare(suffix_b));
-    }
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 16;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0.8;
+  plain_table_options.index_sparseness = 3;
+  plain_table_options.huge_page_tlb_size = 0;
+  plain_table_options.encoding_type = kPrefix;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  std::string key_list[] = {
+      MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'),
+      MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'),
+      MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'),
+      MakeLongKeyWithPrefix(26, '6')};
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_OK(Put(key_list[i], ToString(i)));
   }
-  virtual void FindShortestSeparator(std::string* start,
-                                     const Slice& limit) const {}
 
-  virtual void FindShortSuccessor(std::string* key) const {}
-};
+  dbfull()->TEST_FlushMemTable();
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek(key_list[0]);
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(key_list[i], iter->key().ToString());
+    ASSERT_EQ(ToString(i), iter->value().ToString());
+    iter->Next();
+  }
+
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
 
-TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
+TEST_F(PlainTableDBTest, IteratorReverseSuffixComparator) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   // Set only one bucket to force bucket conflict.
   // Test index interval for the same prefix to be 1, 2 and 4
-  SimpleSuffixReverseComparator comp;
+  test::SimpleSuffixReverseComparator comp;
   options.comparator = ∁
   DestroyAndReopen(&options);
 
@@ -591,7 +765,7 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
   delete iter;
 }
 
-TEST(PlainTableDBTest, HashBucketConflict) {
+TEST_F(PlainTableDBTest, HashBucketConflict) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
     for (unsigned char i = 1; i <= 3; i++) {
@@ -599,8 +773,16 @@ TEST(PlainTableDBTest, HashBucketConflict) {
       options.create_if_missing = true;
       // Set only one bucket to force bucket conflict.
       // Test index interval for the same prefix to be 1, 2 and 4
-      options.table_factory.reset(
-          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = 16;
+      plain_table_options.bloom_bits_per_key = 0;
+      plain_table_options.hash_table_ratio = 0;
+      plain_table_options.index_sparseness = 2 ^ i;
+      plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+      options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
       DestroyAndReopen(&options);
       ASSERT_OK(Put("5000000000000fo0", "v1"));
       ASSERT_OK(Put("5000000000000fo1", "v2"));
@@ -676,18 +858,25 @@ TEST(PlainTableDBTest, HashBucketConflict) {
   }
 }
 
-TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+TEST_F(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
     for (unsigned char i = 1; i <= 3; i++) {
       Options options = CurrentOptions();
       options.create_if_missing = true;
-      SimpleSuffixReverseComparator comp;
+      test::SimpleSuffixReverseComparator comp;
       options.comparator = ∁
       // Set only one bucket to force bucket conflict.
       // Test index interval for the same prefix to be 1, 2 and 4
-      options.table_factory.reset(
-          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = 16;
+      plain_table_options.bloom_bits_per_key = 0;
+      plain_table_options.hash_table_ratio = 0;
+      plain_table_options.index_sparseness = 2 ^ i;
+      plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+      options.table_factory.reset(NewPlainTableFactory(plain_table_options));
       DestroyAndReopen(&options);
       ASSERT_OK(Put("5000000000000fo0", "v1"));
       ASSERT_OK(Put("5000000000000fo1", "v2"));
@@ -762,12 +951,18 @@ TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
   }
 }
 
-TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+TEST_F(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   // Set only one bucket to force bucket conflict.
   // Test index interval for the same prefix to be 1, 2 and 4
-  options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5));
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 16;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0;
+  plain_table_options.index_sparseness = 5;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
   DestroyAndReopen(&options);
   ASSERT_OK(Put("5000000000000fo0", "v1"));
   ASSERT_OK(Put("5000000000000fo1", "v2"));
@@ -812,7 +1007,7 @@ static std::string RandomString(Random* rnd, int len) {
   return r;
 }
 
-TEST(PlainTableDBTest, CompactionTrigger) {
+TEST_F(PlainTableDBTest, CompactionTrigger) {
   Options options = CurrentOptions();
   options.write_buffer_size = 100 << 10; //100KB
   options.num_levels = 3;
@@ -846,8 +1041,52 @@ TEST(PlainTableDBTest, CompactionTrigger) {
   ASSERT_EQ(NumTableFilesAtLevel(1), 1);
 }
 
+TEST_F(PlainTableDBTest, AdaptiveTable) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+
+  options.table_factory.reset(NewPlainTableFactory());
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000000foo", "v1"));
+  ASSERT_OK(Put("0000000000000bar", "v2"));
+  ASSERT_OK(Put("1000000000000foo", "v3"));
+  dbfull()->TEST_FlushMemTable();
+
+  options.create_if_missing = false;
+  std::shared_ptr<TableFactory> dummy_factory;
+  std::shared_ptr<TableFactory> block_based_factory(
+      NewBlockBasedTableFactory());
+  options.table_factory.reset(NewAdaptiveTableFactory(
+      block_based_factory, dummy_factory, dummy_factory));
+  Reopen(&options);
+  ASSERT_EQ("v3", Get("1000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+
+  ASSERT_OK(Put("2000000000000foo", "v4"));
+  ASSERT_OK(Put("3000000000000bar", "v5"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v4", Get("2000000000000foo"));
+  ASSERT_EQ("v5", Get("3000000000000bar"));
+
+  Reopen(&options);
+  ASSERT_EQ("v3", Get("1000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+  ASSERT_EQ("v4", Get("2000000000000foo"));
+  ASSERT_EQ("v5", Get("3000000000000bar"));
+
+  options.table_factory.reset(NewBlockBasedTableFactory());
+  Reopen(&options);
+  ASSERT_NE("v3", Get("1000000000000foo"));
+
+  options.table_factory.reset(NewPlainTableFactory());
+  Reopen(&options);
+  ASSERT_NE("v5", Get("3000000000000bar"));
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc
index 3a88fc8..3cc4e89 100644
--- a/src/rocksdb/db/prefix_test.cc
+++ b/src/rocksdb/db/prefix_test.cc
@@ -3,6 +3,14 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
 #include <algorithm>
 #include <iostream>
 #include <vector>
@@ -15,18 +23,21 @@
 #include "rocksdb/memtablerep.h"
 #include "util/histogram.h"
 #include "util/stop_watch.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 
+using GFLAGS::ParseCommandLineFlags;
+
 DEFINE_bool(trigger_deadlock, false,
             "issue delete in range scan to trigger PrefixHashMap deadlock");
-DEFINE_uint64(bucket_count, 100000, "number of buckets");
+DEFINE_int32(bucket_count, 100000, "number of buckets");
 DEFINE_uint64(num_locks, 10001, "number of locks");
 DEFINE_bool(random_prefix, false, "randomize prefix");
 DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
 DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
 DEFINE_int64(write_buffer_size, 33554432, "");
-DEFINE_int64(max_write_buffer_number, 2, "");
-DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(max_write_buffer_number, 2, "");
+DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
 DEFINE_int32(skiplist_height, 4, "");
 DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
 DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
@@ -42,7 +53,8 @@ struct TestKey {
   uint64_t prefix;
   uint64_t sorted;
 
-  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
+  TestKey(uint64_t _prefix, uint64_t _sorted)
+      : prefix(_prefix), sorted(_sorted) {}
 };
 
 // return a slice backed by test_key
@@ -59,20 +71,20 @@ class TestKeyComparator : public Comparator {
 
   // Compare needs to be aware of the possibility of a and/or b is
   // prefix only
-  virtual int Compare(const Slice& a, const Slice& b) const {
+  virtual int Compare(const Slice& a, const Slice& b) const override {
     const TestKey* key_a = SliceToTestKey(a);
     const TestKey* key_b = SliceToTestKey(b);
     if (key_a->prefix != key_b->prefix) {
       if (key_a->prefix < key_b->prefix) return -1;
       if (key_a->prefix > key_b->prefix) return 1;
     } else {
-      ASSERT_TRUE(key_a->prefix == key_b->prefix);
+      EXPECT_TRUE(key_a->prefix == key_b->prefix);
       // note, both a and b could be prefix only
       if (a.size() != b.size()) {
         // one of them is prefix
-        ASSERT_TRUE(
-          (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
-          (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+        EXPECT_TRUE(
+            (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+            (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
         if (a.size() < b.size()) return -1;
         if (a.size() > b.size()) return 1;
       } else {
@@ -82,7 +94,7 @@ class TestKeyComparator : public Comparator {
         }
 
         // both a and b are whole key
-        ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+        EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
         if (key_a->sorted < key_b->sorted) return -1;
         if (key_a->sorted > key_b->sorted) return 1;
         if (key_a->sorted == key_b->sorted) return 0;
@@ -95,13 +107,10 @@ class TestKeyComparator : public Comparator {
     return "TestKeyComparator";
   }
 
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const {
-  }
-
-  virtual void FindShortSuccessor(std::string* key) const {}
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {}
 
+  virtual void FindShortSuccessor(std::string* key) const override {}
 };
 
 namespace {
@@ -136,7 +145,7 @@ std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
 }
 }  // namespace
 
-class PrefixTest {
+class PrefixTest : public testing::Test {
  public:
   std::shared_ptr<DB> OpenDb() {
     DB* db;
@@ -153,7 +162,7 @@ class PrefixTest {
         FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
 
     Status s = DB::Open(options, kDbName,  &db);
-    ASSERT_OK(s);
+    EXPECT_OK(s);
     return std::shared_ptr<DB>(db);
   }
 
@@ -179,6 +188,10 @@ class PrefixTest {
           options.memtable_factory.reset(
               NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
           return true;
+        case kHashLinkListTriggerSkipList:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count, 0, 3));
+          return true;
         default:
           return false;
       }
@@ -198,13 +211,14 @@ class PrefixTest {
     kHashSkipList,
     kHashLinkList,
     kHashLinkListHugePageTlb,
+    kHashLinkListTriggerSkipList,
     kEnd
   };
   int option_config_;
   Options options;
 };
 
-TEST(PrefixTest, TestResult) {
+TEST_F(PrefixTest, TestResult) {
   for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
     FirstOption();
     while (NextOptions(num_buckets)) {
@@ -377,7 +391,7 @@ TEST(PrefixTest, TestResult) {
   }
 }
 
-TEST(PrefixTest, DynamicPrefixIterator) {
+TEST_F(PrefixTest, DynamicPrefixIterator) {
   while (NextOptions(FLAGS_bucket_count)) {
     std::cout << "*** Mem table: " << options.memtable_factory->Name()
         << std::endl;
@@ -426,7 +440,7 @@ TEST(PrefixTest, DynamicPrefixIterator) {
     for (auto prefix : prefixes) {
       TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
       Slice key = TestKeyToSlice(test_key);
-      std::string value = "v" + std::to_string(0);
+      std::string value = "v" + ToString(0);
 
       perf_context.Reset();
       StopWatchNano timer(Env::Default(), true);
@@ -479,9 +493,11 @@ TEST(PrefixTest, DynamicPrefixIterator) {
 }
 
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
   std::cout << kDbName << "\n";
 
-  rocksdb::test::RunAllTests();
-  return 0;
+  return RUN_ALL_TESTS();
 }
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
index 8ae64b2..8b15eaa 100644
--- a/src/rocksdb/db/repair.cc
+++ b/src/rocksdb/db/repair.cc
@@ -7,30 +7,64 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 //
-// We recover the contents of the descriptor from the other files we find.
-// (1) Any log files are first converted to tables
-// (2) We scan every table to compute
-//     (a) smallest/largest for the table
-//     (b) largest sequence number in the table
-// (3) We generate descriptor contents:
-//      - log number is set to zero
-//      - next-file-number is set to 1 + largest file number we found
-//      - last-sequence-number is set to largest sequence# found across
-//        all tables (see 2c)
-//      - compaction pointers are cleared
-//      - every table file is added at level 0
+// Repairer does best effort recovery to recover as much data as possible after
+// a disaster without compromising consistency. It does not guarantee bringing
+// the database to a time consistent state.
+//
+// Repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory, and classifies them
+// based on their file name. Any file that cannot be identified by name will be
+// ignored.
+//
+// (b) Convert logs to table
+//
+// Every log file that is active is replayed. All sections of the file where the
+// checksum does not match is skipped over. We intentionally give preference to
+// data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest for the table
+// (2) largest sequence number in the table
+//
+// If we are unable to scan the file, then we ignore the table.
+//
+// (d) Write Descriptor
+//
+// We generate descriptor contents:
+//  - log number is set to zero
+//  - next-file-number is set to 1 + largest file number we found
+//  - last-sequence-number is set to largest sequence# found across
+//    all tables (see 2c)
+//  - compaction pointers are cleared
+//  - every table file is added at level 0
 //
 // Possible optimization 1:
 //   (a) Compute total size and use to pick appropriate max-level M
 //   (b) Sort tables by largest sequence# in the table
 //   (c) For each table: if it overlaps earlier table, place in level-0,
 //       else place in level-M.
+//   (d) We can provide options for time consistent recovery and unsafe recovery
+//       (ignore checksum failure when applicable)
 // Possible optimization 2:
 //   Store per-table metadata (smallest, largest, largest-seq#, ...)
 //   in the table's meta section to speed up ScanTable.
 
 #ifndef ROCKSDB_LITE
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include "db/builder.h"
 #include "db/db_impl.h"
 #include "db/dbformat.h"
@@ -40,10 +74,14 @@
 #include "db/memtable.h"
 #include "db/table_cache.h"
 #include "db/version_edit.h"
+#include "db/writebuffer.h"
 #include "db/write_batch_internal.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
+#include "util/scoped_arena_iterator.h"
 
 namespace rocksdb {
 
@@ -55,16 +93,17 @@ class Repairer {
       : dbname_(dbname),
         env_(options.env),
         icmp_(options.comparator),
-        ipolicy_(options.filter_policy),
-        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+        options_(SanitizeOptions(dbname, &icmp_, options)),
+        ioptions_(options_),
         raw_table_cache_(
             // TableCache can be small since we expect each table to be opened
             // once.
-            NewLRUCache(10, options_.table_cache_numshardbits,
-                        options_.table_cache_remove_scan_count_limit)),
+            NewLRUCache(10, options_.table_cache_numshardbits)),
         next_file_number_(1) {
-    table_cache_ = new TableCache(dbname_, &options_, storage_options_,
-                                  raw_table_cache_.get());
+    GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories_);
+
+    table_cache_ =
+        new TableCache(ioptions_, env_options_, raw_table_cache_.get());
     edit_ = new VersionEdit();
   }
 
@@ -82,18 +121,17 @@ class Repairer {
       status = WriteDescriptor();
     }
     if (status.ok()) {
-      unsigned long long bytes = 0;
+      uint64_t bytes = 0;
       for (size_t i = 0; i < tables_.size(); i++) {
-        bytes += tables_[i].meta.file_size;
+        bytes += tables_[i].meta.fd.GetFileSize();
       }
-      Log(options_.info_log,
+      Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
           "**** Repaired rocksdb %s; "
-          "recovered %d files; %llu bytes. "
+          "recovered %zu files; %" PRIu64
+          "bytes. "
           "Some data may have been lost. "
           "****",
-          dbname_.c_str(),
-          static_cast<int>(tables_.size()),
-          bytes);
+          dbname_.c_str(), tables_.size(), bytes);
     }
     return status;
   }
@@ -107,51 +145,62 @@ class Repairer {
 
   std::string const dbname_;
   Env* const env_;
-  InternalKeyComparator const icmp_;
-  InternalFilterPolicy const ipolicy_;
-  Options const options_;
+  const InternalKeyComparator icmp_;
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories_;
+  const Options options_;
+  const ImmutableCFOptions ioptions_;
   std::shared_ptr<Cache> raw_table_cache_;
   TableCache* table_cache_;
   VersionEdit* edit_;
 
   std::vector<std::string> manifests_;
-  std::vector<uint64_t> table_numbers_;
+  std::vector<FileDescriptor> table_fds_;
   std::vector<uint64_t> logs_;
   std::vector<TableInfo> tables_;
   uint64_t next_file_number_;
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;
 
   Status FindFiles() {
     std::vector<std::string> filenames;
-    Status status = env_->GetChildren(dbname_, &filenames);
-    if (!status.ok()) {
-      return status;
-    }
-    if (filenames.empty()) {
-      return Status::Corruption(dbname_, "repair found no files");
-    }
+    bool found_file = false;
+    for (uint32_t path_id = 0; path_id < options_.db_paths.size(); path_id++) {
+      Status status =
+          env_->GetChildren(options_.db_paths[path_id].path, &filenames);
+      if (!status.ok()) {
+        return status;
+      }
+      if (!filenames.empty()) {
+        found_file = true;
+      }
 
-    uint64_t number;
-    FileType type;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &type)) {
-        if (type == kDescriptorFile) {
-          manifests_.push_back(filenames[i]);
-        } else {
-          if (number + 1 > next_file_number_) {
-            next_file_number_ = number + 1;
-          }
-          if (type == kLogFile) {
-            logs_.push_back(number);
-          } else if (type == kTableFile) {
-            table_numbers_.push_back(number);
+      uint64_t number;
+      FileType type;
+      for (size_t i = 0; i < filenames.size(); i++) {
+        if (ParseFileName(filenames[i], &number, &type)) {
+          if (type == kDescriptorFile) {
+            assert(path_id == 0);
+            manifests_.push_back(filenames[i]);
           } else {
-            // Ignore other files
+            if (number + 1 > next_file_number_) {
+              next_file_number_ = number + 1;
+            }
+            if (type == kLogFile) {
+              assert(path_id == 0);
+              logs_.push_back(number);
+            } else if (type == kTableFile) {
+              table_fds_.emplace_back(number, path_id, 0);
+            } else {
+              // Ignore other files
+            }
           }
         }
       }
     }
-    return status;
+    if (!found_file) {
+      return Status::Corruption(dbname_, "repair found no files");
+    }
+    return Status::OK();
   }
 
   void ConvertLogFilesToTables() {
@@ -159,8 +208,8 @@ class Repairer {
       std::string logname = LogFileName(dbname_, logs_[i]);
       Status status = ConvertLogToTable(logs_[i]);
       if (!status.ok()) {
-        Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
-            (unsigned long long) logs_[i],
+        Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
+            "Log #%" PRIu64 ": ignoring conversion error: %s", logs_[i],
             status.ToString().c_str());
       }
       ArchiveFile(logname);
@@ -172,19 +221,18 @@ class Repairer {
       Env* env;
       std::shared_ptr<Logger> info_log;
       uint64_t lognum;
-      virtual void Corruption(size_t bytes, const Status& s) {
+      virtual void Corruption(size_t bytes, const Status& s) override {
         // We print error messages for corruption, but continue repairing.
-        Log(info_log, "Log #%llu: dropping %d bytes; %s",
-            (unsigned long long) lognum,
-            static_cast<int>(bytes),
-            s.ToString().c_str());
+        Log(InfoLogLevel::ERROR_LEVEL, info_log,
+            "Log #%" PRIu64 ": dropping %d bytes; %s", lognum,
+            static_cast<int>(bytes), s.ToString().c_str());
       }
     };
 
     // Open the log file
     std::string logname = LogFileName(dbname_, log);
     unique_ptr<SequentialFile> lfile;
-    Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
+    Status status = env_->NewSequentialFile(logname, &lfile, env_options_);
     if (!status.ok()) {
       return status;
     }
@@ -198,15 +246,17 @@ class Repairer {
     // corruptions cause entire commits to be skipped instead of
     // propagating bad information (like overly large sequence
     // numbers).
-    log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
+    log::Reader reader(std::move(lfile), &reporter, true /*enable checksum*/,
                        0/*initial_offset*/);
 
     // Read all the records and add to a memtable
     std::string scratch;
     Slice record;
     WriteBatch batch;
-    MemTable* mem = new MemTable(icmp_, options_);
-    auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
+    WriteBuffer wb(options_.db_write_buffer_size);
+    MemTable* mem = new MemTable(icmp_, ioptions_,
+                                 MutableCFOptions(options_, ioptions_), &wb);
+    auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem);
     mem->Ref();
     int counter = 0;
     while (reader.ReadRecord(&record, &scratch)) {
@@ -220,8 +270,8 @@ class Repairer {
       if (status.ok()) {
         counter += WriteBatchInternal::Count(&batch);
       } else {
-        Log(options_.info_log, "Log #%llu: ignoring %s",
-            (unsigned long long) log,
+        Log(InfoLogLevel::WARN_LEVEL,
+            options_.info_log, "Log #%" PRIu64 ": ignoring %s", log,
             status.ToString().c_str());
         status = Status::OK();  // Keep going with rest of file
       }
@@ -230,37 +280,44 @@ class Repairer {
     // Do not record a version edit for this conversion to a Table
     // since ExtractMetaData() will also generate edits.
     FileMetaData meta;
-    meta.number = next_file_number_++;
-    ReadOptions ro;
-    Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */);
-    status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
-                        iter, &meta, icmp_, 0, 0, kNoCompression);
-    delete iter;
+    meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+    {
+      ReadOptions ro;
+      ro.total_order_seek = true;
+      Arena arena;
+      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+      status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_,
+                          iter.get(), &meta, icmp_,
+                          &int_tbl_prop_collector_factories_, 0, 0,
+                          kNoCompression, CompressionOptions(), false);
+    }
     delete mem->Unref();
     delete cf_mems_default;
     mem = nullptr;
     if (status.ok()) {
-      if (meta.file_size > 0) {
-        table_numbers_.push_back(meta.number);
+      if (meta.fd.GetFileSize() > 0) {
+        table_fds_.push_back(meta.fd);
       }
     }
-    Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
-        (unsigned long long) log,
-        counter,
-        (unsigned long long) meta.number,
-        status.ToString().c_str());
+    Log(InfoLogLevel::INFO_LEVEL, options_.info_log,
+        "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+        log, counter, meta.fd.GetNumber(), status.ToString().c_str());
     return status;
   }
 
   void ExtractMetaData() {
-    for (size_t i = 0; i < table_numbers_.size(); i++) {
+    for (size_t i = 0; i < table_fds_.size(); i++) {
       TableInfo t;
-      t.meta.number = table_numbers_[i];
+      t.meta.fd = table_fds_[i];
       Status status = ScanTable(&t);
       if (!status.ok()) {
-        std::string fname = TableFileName(dbname_, table_numbers_[i]);
-        Log(options_.info_log, "Table #%llu: ignoring %s",
-            (unsigned long long) table_numbers_[i],
+        std::string fname = TableFileName(
+            options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
+        char file_num_buf[kFormatFileNumberBufSize];
+        FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+                         file_num_buf, sizeof(file_num_buf));
+        Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
+            "Table #%s: ignoring %s", file_num_buf,
             status.ToString().c_str());
         ArchiveFile(fname);
       } else {
@@ -270,13 +327,16 @@ class Repairer {
   }
 
   Status ScanTable(TableInfo* t) {
-    std::string fname = TableFileName(dbname_, t->meta.number);
+    std::string fname = TableFileName(options_.db_paths, t->meta.fd.GetNumber(),
+                                      t->meta.fd.GetPathId());
     int counter = 0;
-    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    uint64_t file_size;
+    Status status = env_->GetFileSize(fname, &file_size);
+    t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
+                                file_size);
     if (status.ok()) {
-      FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
       Iterator* iter = table_cache_->NewIterator(
-          ReadOptions(), storage_options_, icmp_, dummy_meta);
+          ReadOptions(), env_options_, icmp_, t->meta.fd);
       bool empty = true;
       ParsedInternalKey parsed;
       t->min_sequence = 0;
@@ -284,9 +344,9 @@ class Repairer {
       for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
         Slice key = iter->key();
         if (!ParseInternalKey(key, &parsed)) {
-          Log(options_.info_log, "Table #%llu: unparsable key %s",
-              (unsigned long long) t->meta.number,
-              EscapeString(key).c_str());
+          Log(InfoLogLevel::ERROR_LEVEL,
+              options_.info_log, "Table #%" PRIu64 ": unparsable key %s",
+              t->meta.fd.GetNumber(), EscapeString(key).c_str());
           continue;
         }
 
@@ -308,10 +368,9 @@ class Repairer {
       }
       delete iter;
     }
-    Log(options_.info_log, "Table #%llu: %d entries %s",
-        (unsigned long long) t->meta.number,
-        counter,
-        status.ToString().c_str());
+    Log(InfoLogLevel::INFO_LEVEL,
+        options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+        t->meta.fd.GetNumber(), counter, status.ToString().c_str());
     return status;
   }
 
@@ -319,7 +378,7 @@ class Repairer {
     std::string tmp = TempFileName(dbname_, 1);
     unique_ptr<WritableFile> file;
     Status status = env_->NewWritableFile(
-        tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
+        tmp, &file, env_->OptimizeForManifestWrite(env_options_));
     if (!status.ok()) {
       return status;
     }
@@ -339,9 +398,9 @@ class Repairer {
     for (size_t i = 0; i < tables_.size(); i++) {
       // TODO(opt): separate out into multiple levels
       const TableInfo& t = tables_[i];
-      edit_->AddFile(0, t.meta.number, t.meta.file_size,
-                    t.meta.smallest, t.meta.largest,
-                    t.min_sequence, t.max_sequence);
+      edit_->AddFile(0, t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+                     t.meta.fd.GetFileSize(), t.meta.smallest, t.meta.largest,
+                     t.min_sequence, t.max_sequence);
     }
 
     //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
@@ -363,7 +422,7 @@ class Repairer {
       // Install new manifest
       status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
       if (status.ok()) {
-        status = SetCurrentFile(env_, dbname_, 1);
+        status = SetCurrentFile(env_, dbname_, 1, nullptr);
       } else {
         env_->DeleteFile(tmp);
       }
@@ -387,7 +446,8 @@ class Repairer {
     new_file.append("/");
     new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
     Status s = env_->RenameFile(fname, new_file);
-    Log(options_.info_log, "Archiving %s: %s\n",
+    Log(InfoLogLevel::INFO_LEVEL,
+        options_.info_log, "Archiving %s: %s\n",
         fname.c_str(), s.ToString().c_str());
   }
 };
diff --git a/src/rocksdb/db/simple_table_db_test.cc b/src/rocksdb/db/simple_table_db_test.cc
deleted file mode 100644
index affa614..0000000
--- a/src/rocksdb/db/simple_table_db_test.cc
+++ /dev/null
@@ -1,794 +0,0 @@
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-#include <algorithm>
-#include <set>
-
-#include "rocksdb/db.h"
-#include "rocksdb/filter_policy.h"
-#include "db/db_impl.h"
-#include "db/filename.h"
-#include "db/version_set.h"
-#include "db/write_batch_internal.h"
-#include "rocksdb/statistics.h"
-#include "rocksdb/cache.h"
-#include "rocksdb/compaction_filter.h"
-#include "rocksdb/env.h"
-#include "rocksdb/table.h"
-#include "rocksdb/table_properties.h"
-#include "table/table_builder.h"
-#include "util/hash.h"
-#include "util/logging.h"
-#include "util/mutexlock.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-#include "utilities/merge_operators.h"
-
-using std::unique_ptr;
-
-// IS THIS FILE STILL NEEDED?
-namespace rocksdb {
-
-// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
-// as production quality.
-// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
-// be longer than 150000 bytes and stored data on disk in this format:
-// +--------------------------------------------+  <= key1 offset
-// | key1            | value_size (4 bytes) |   |
-// +----------------------------------------+   |
-// | value1                                     |
-// |                                            |
-// +----------------------------------------+---+  <= key2 offset
-// | key2            | value_size (4 bytes) |   |
-// +----------------------------------------+   |
-// | value2                                     |
-// |                                            |
-// |        ......                              |
-// +-----------------+--------------------------+   <= index_block_offset
-// | key1            | key1 offset (8 bytes)    |
-// +-----------------+--------------------------+
-// | key2            | key2 offset (8 bytes)    |
-// +-----------------+--------------------------+
-// | key3            | key3 offset (8 bytes)    |
-// +-----------------+--------------------------+
-// |        ......                              |
-// +-----------------+------------+-------------+
-// | index_block_offset (8 bytes) |
-// +------------------------------+
-
-// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
-// as production quality.
-class SimpleTableReader: public TableReader {
-public:
-  // Attempt to open the table that is stored in bytes [0..file_size)
-  // of "file", and read the metadata entries necessary to allow
-  // retrieving data from the table.
-  //
-  // If successful, returns ok and sets "*table" to the newly opened
-  // table.  The client should delete "*table" when no longer needed.
-  // If there was an error while initializing the table, sets "*table"
-  // to nullptr and returns a non-ok status.  Does not take ownership of
-  // "*source", but the client must ensure that "source" remains live
-  // for the duration of the returned table's lifetime.
-  //
-  // *file must remain live while this Table is in use.
-  static Status Open(const Options& options, const EnvOptions& soptions,
-                     unique_ptr<RandomAccessFile> && file, uint64_t file_size,
-                     unique_ptr<TableReader>* table_reader);
-
-  Iterator* NewIterator(const ReadOptions&) override;
-
-  Status Get(const ReadOptions&, const Slice& key, void* arg,
-             bool (*handle_result)(void* arg, const ParsedInternalKey& k,
-                                   const Slice& v, bool),
-             void (*mark_key_may_exist)(void*) = nullptr) override;
-
-  uint64_t ApproximateOffsetOf(const Slice& key) override;
-
-  void SetupForCompaction() override;
-
-  std::shared_ptr<const TableProperties> GetTableProperties() const override;
-
-  ~SimpleTableReader();
-
-private:
-  struct Rep;
-  Rep* rep_;
-
-  explicit SimpleTableReader(Rep* rep) {
-    rep_ = rep;
-  }
-  friend class TableCache;
-  friend class SimpleTableIterator;
-
-  Status GetOffset(const Slice& target, uint64_t* offset);
-
-  // No copying allowed
-  explicit SimpleTableReader(const TableReader&) = delete;
-  void operator=(const TableReader&) = delete;
-};
-
-// Iterator to iterate SimpleTable
-class SimpleTableIterator: public Iterator {
-public:
-  explicit SimpleTableIterator(SimpleTableReader* table);
-  ~SimpleTableIterator();
-
-  bool Valid() const;
-
-  void SeekToFirst();
-
-  void SeekToLast();
-
-  void Seek(const Slice& target);
-
-  void Next();
-
-  void Prev();
-
-  Slice key() const;
-
-  Slice value() const;
-
-  Status status() const;
-
-private:
-  SimpleTableReader* table_;
-  uint64_t offset_;
-  uint64_t next_offset_;
-  Slice key_;
-  Slice value_;
-  char tmp_str_[4];
-  char* key_str_;
-  char* value_str_;
-  int value_str_len_;
-  Status status_;
-  // No copying allowed
-  SimpleTableIterator(const SimpleTableIterator&) = delete;
-  void operator=(const Iterator&) = delete;
-};
-
-struct SimpleTableReader::Rep {
-  ~Rep() {
-  }
-  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
-      int num_entries) :
-      soptions(storage_options), index_start_offset(index_start_offset),
-      num_entries(num_entries) {
-  }
-
-  Options options;
-  const EnvOptions& soptions;
-  Status status;
-  unique_ptr<RandomAccessFile> file;
-  uint64_t index_start_offset;
-  int num_entries;
-  std::shared_ptr<TableProperties> table_properties;
-
-  const static int user_key_size = 16;
-  const static int offset_length = 8;
-  const static int key_footer_len = 8;
-
-  static int GetInternalKeyLength() {
-    return user_key_size + key_footer_len;
-  }
-};
-
-SimpleTableReader::~SimpleTableReader() {
-  delete rep_;
-}
-
-Status SimpleTableReader::Open(const Options& options,
-                               const EnvOptions& soptions,
-                               unique_ptr<RandomAccessFile> && file,
-                               uint64_t size,
-                               unique_ptr<TableReader>* table_reader) {
-  char footer_space[Rep::offset_length];
-  Slice footer_input;
-  Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
-                        &footer_input, footer_space);
-  if (s.ok()) {
-    uint64_t index_start_offset = DecodeFixed64(footer_space);
-
-    int num_entries = (size - Rep::offset_length - index_start_offset)
-        / (Rep::GetInternalKeyLength() + Rep::offset_length);
-    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
-                                                             index_start_offset,
-                                                             num_entries);
-
-    rep->file = std::move(file);
-    rep->options = options;
-    table_reader->reset(new SimpleTableReader(rep));
-  }
-  return s;
-}
-
-void SimpleTableReader::SetupForCompaction() {
-}
-
-std::shared_ptr<const TableProperties> SimpleTableReader::GetTableProperties()
-    const {
-  return rep_->table_properties;
-}
-
-Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) {
-  return new SimpleTableIterator(this);
-}
-
-Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
-  uint32_t left = 0;
-  uint32_t right = rep_->num_entries - 1;
-  char key_chars[Rep::GetInternalKeyLength()];
-  Slice tmp_slice;
-
-  uint32_t target_offset = 0;
-  while (left <= right) {
-    uint32_t mid = (left + right + 1) / 2;
-
-    uint64_t offset_to_read = rep_->index_start_offset
-        + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
-    Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
-                                &tmp_slice, key_chars);
-    if (!s.ok()) {
-      return s;
-    }
-
-    InternalKeyComparator ikc(rep_->options.comparator);
-    int compare_result = ikc.Compare(tmp_slice, target);
-
-    if (compare_result < 0) {
-      if (left == right) {
-        target_offset = right + 1;
-        break;
-      }
-      left = mid;
-    } else {
-      if (left == right) {
-        target_offset = left;
-        break;
-      }
-      right = mid - 1;
-    }
-  }
-
-  if (target_offset >= (uint32_t) rep_->num_entries) {
-    *offset = rep_->index_start_offset;
-    return Status::OK();
-  }
-
-  char value_offset_chars[Rep::offset_length];
-
-  int64_t offset_for_value_offset = rep_->index_start_offset
-      + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
-      + Rep::GetInternalKeyLength();
-  Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
-                              &tmp_slice, value_offset_chars);
-  if (s.ok()) {
-    *offset = DecodeFixed64(value_offset_chars);
-  }
-  return s;
-}
-
-Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
-                              void* arg,
-                              bool (*saver)(void*, const ParsedInternalKey&,
-                                            const Slice&, bool),
-                              void (*mark_key_may_exist)(void*)) {
-  Status s;
-  SimpleTableIterator* iter = new SimpleTableIterator(this);
-  for (iter->Seek(k); iter->Valid(); iter->Next()) {
-    ParsedInternalKey parsed_key;
-    if (!ParseInternalKey(iter->key(), &parsed_key)) {
-      return Status::Corruption(Slice());
-    }
-
-    if (!(*saver)(arg, parsed_key, iter->value(), true)) {
-      break;
-    }
-  }
-  s = iter->status();
-  delete iter;
-  return s;
-}
-
-uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
-  return 0;
-}
-
-SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
-    table_(table) {
-  key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
-  value_str_len_ = -1;
-  SeekToFirst();
-}
-
-SimpleTableIterator::~SimpleTableIterator() {
- delete[] key_str_;
- if (value_str_len_ >= 0) {
-   delete[] value_str_;
- }
-}
-
-bool SimpleTableIterator::Valid() const {
-  return offset_ < table_->rep_->index_start_offset;
-}
-
-void SimpleTableIterator::SeekToFirst() {
-  next_offset_ = 0;
-  Next();
-}
-
-void SimpleTableIterator::SeekToLast() {
-  assert(false);
-}
-
-void SimpleTableIterator::Seek(const Slice& target) {
-  Status s = table_->GetOffset(target, &next_offset_);
-  if (!s.ok()) {
-    status_ = s;
-  }
-  Next();
-}
-
-void SimpleTableIterator::Next() {
-  offset_ = next_offset_;
-  if (offset_ >= table_->rep_->index_start_offset) {
-    return;
-  }
-  Slice result;
-  int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
-
-  Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
-                                      key_str_);
-  next_offset_ += internal_key_size;
-  key_ = result;
-
-  Slice value_size_slice;
-  s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
-  next_offset_ += 4;
-  uint32_t value_size = DecodeFixed32(tmp_str_);
-
-  Slice value_slice;
-  if ((int) value_size > value_str_len_) {
-    if (value_str_len_ >= 0) {
-      delete[] value_str_;
-    }
-    value_str_ = new char[value_size];
-    value_str_len_ = value_size;
-  }
-  s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
-                               value_str_);
-  next_offset_ += value_size;
-  value_ = value_slice;
-}
-
-void SimpleTableIterator::Prev() {
-  assert(false);
-}
-
-Slice SimpleTableIterator::key() const {
-  Log(table_->rep_->options.info_log, "key!!!!");
-  return key_;
-}
-
-Slice SimpleTableIterator::value() const {
-  return value_;
-}
-
-Status SimpleTableIterator::status() const {
-  return status_;
-}
-
-class SimpleTableBuilder: public TableBuilder {
-public:
-  // Create a builder that will store the contents of the table it is
-  // building in *file.  Does not close the file.  It is up to the
-  // caller to close the file after calling Finish(). The output file
-  // will be part of level specified by 'level'.  A value of -1 means
-  // that the caller does not know which level the output file will reside.
-  SimpleTableBuilder(const Options& options, WritableFile* file,
-                     CompressionType compression_type);
-
-  // REQUIRES: Either Finish() or Abandon() has been called.
-  ~SimpleTableBuilder();
-
-  // Add key,value to the table being constructed.
-  // REQUIRES: key is after any previously added key according to comparator.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value) override;
-
-  // Return non-ok iff some error has been detected.
-  Status status() const override;
-
-  // Finish building the table.  Stops using the file passed to the
-  // constructor after this function returns.
-  // REQUIRES: Finish(), Abandon() have not been called
-  Status Finish() override;
-
-  // Indicate that the contents of this builder should be abandoned.  Stops
-  // using the file passed to the constructor after this function returns.
-  // If the caller is not going to call Finish(), it must call Abandon()
-  // before destroying this builder.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Abandon() override;
-
-  // Number of calls to Add() so far.
-  uint64_t NumEntries() const override;
-
-  // Size of the file generated so far.  If invoked after a successful
-  // Finish() call, returns the size of the final generated file.
-  uint64_t FileSize() const override;
-
-private:
-  struct Rep;
-  Rep* rep_;
-
-  // No copying allowed
-  SimpleTableBuilder(const SimpleTableBuilder&) = delete;
-  void operator=(const SimpleTableBuilder&) = delete;
-};
-
-struct SimpleTableBuilder::Rep {
-  Options options;
-  WritableFile* file;
-  uint64_t offset = 0;
-  Status status;
-
-  uint64_t num_entries = 0;
-
-  bool closed = false;  // Either Finish() or Abandon() has been called.
-
-  const static int user_key_size = 16;
-  const static int offset_length = 8;
-  const static int key_footer_len = 8;
-
-  static int GetInternalKeyLength() {
-    return user_key_size + key_footer_len;
-  }
-
-  std::string index;
-
-  Rep(const Options& opt, WritableFile* f) :
-      options(opt), file(f) {
-  }
-  ~Rep() {
-  }
-};
-
-SimpleTableBuilder::SimpleTableBuilder(const Options& options,
-                                       WritableFile* file,
-                                       CompressionType compression_type) :
-    rep_(new SimpleTableBuilder::Rep(options, file)) {
-}
-
-SimpleTableBuilder::~SimpleTableBuilder() {
-  delete (rep_);
-}
-
-void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
-  assert((int ) key.size() == Rep::GetInternalKeyLength());
-
-  // Update index
-  rep_->index.append(key.data(), key.size());
-  PutFixed64(&(rep_->index), rep_->offset);
-
-  // Write key-value pair
-  rep_->file->Append(key);
-  rep_->offset += Rep::GetInternalKeyLength();
-
-  std::string size;
-  int value_size = value.size();
-  PutFixed32(&size, value_size);
-  Slice sizeSlice(size);
-  rep_->file->Append(sizeSlice);
-  rep_->file->Append(value);
-  rep_->offset += value_size + 4;
-
-  rep_->num_entries++;
-}
-
-Status SimpleTableBuilder::status() const {
-  return Status::OK();
-}
-
-Status SimpleTableBuilder::Finish() {
-  Rep* r = rep_;
-  assert(!r->closed);
-  r->closed = true;
-
-  uint64_t index_offset = rep_->offset;
-  Slice index_slice(rep_->index);
-  rep_->file->Append(index_slice);
-  rep_->offset += index_slice.size();
-
-  std::string index_offset_str;
-  PutFixed64(&index_offset_str, index_offset);
-  Slice foot_slice(index_offset_str);
-  rep_->file->Append(foot_slice);
-  rep_->offset += foot_slice.size();
-
-  return Status::OK();
-}
-
-void SimpleTableBuilder::Abandon() {
-  rep_->closed = true;
-}
-
-uint64_t SimpleTableBuilder::NumEntries() const {
-  return rep_->num_entries;
-}
-
-uint64_t SimpleTableBuilder::FileSize() const {
-  return rep_->offset;
-}
-
-class SimpleTableFactory: public TableFactory {
-public:
-  ~SimpleTableFactory() {
-  }
-  SimpleTableFactory() {
-  }
-  const char* Name() const override {
-    return "SimpleTable";
-  }
-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
-                        const InternalKeyComparator& internal_key,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                        unique_ptr<TableReader>* table_reader) const;
-
-  TableBuilder* NewTableBuilder(const Options& options,
-                                const InternalKeyComparator& internal_key,
-                                WritableFile* file,
-                                CompressionType compression_type) const;
-};
-
-Status SimpleTableFactory::NewTableReader(
-    const Options& options, const EnvOptions& soptions,
-    const InternalKeyComparator& internal_key,
-    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-    unique_ptr<TableReader>* table_reader) const {
-
-  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
-                                 table_reader);
-}
-
-TableBuilder* SimpleTableFactory::NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_key,
-    WritableFile* file, CompressionType compression_type) const {
-  return new SimpleTableBuilder(options, file, compression_type);
-}
-
-class SimpleTableDBTest {
-protected:
-public:
-  std::string dbname_;
-  Env* env_;
-  DB* db_;
-
-  Options last_options_;
-
-  SimpleTableDBTest() :
-      env_(Env::Default()) {
-    dbname_ = test::TmpDir() + "/simple_table_db_test";
-    ASSERT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
-    Reopen();
-  }
-
-  ~SimpleTableDBTest() {
-    delete db_;
-    ASSERT_OK(DestroyDB(dbname_, Options()));
-  }
-
-  // Return the current option configuration.
-  Options CurrentOptions() {
-    Options options;
-    options.table_factory.reset(new SimpleTableFactory());
-    return options;
-  }
-
-  DBImpl* dbfull() {
-    return reinterpret_cast<DBImpl*>(db_);
-  }
-
-  void Reopen(Options* options = nullptr) {
-    ASSERT_OK(TryReopen(options));
-  }
-
-  void Close() {
-    delete db_;
-    db_ = nullptr;
-  }
-
-  void DestroyAndReopen(Options* options = nullptr) {
-    //Destroy using last options
-    Destroy(&last_options_);
-    ASSERT_OK(TryReopen(options));
-  }
-
-  void Destroy(Options* options) {
-    delete db_;
-    db_ = nullptr;
-    ASSERT_OK(DestroyDB(dbname_, *options));
-  }
-
-  Status PureReopen(Options* options, DB** db) {
-    return DB::Open(*options, dbname_, db);
-  }
-
-  Status TryReopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
-    Options opts;
-    if (options != nullptr) {
-      opts = *options;
-    } else {
-      opts = CurrentOptions();
-      opts.create_if_missing = true;
-    }
-    last_options_ = opts;
-
-    return DB::Open(opts, dbname_, &db_);
-  }
-
-  Status Put(const Slice& k, const Slice& v) {
-    return db_->Put(WriteOptions(), k, v);
-  }
-
-  Status Delete(const std::string& k) {
-    return db_->Delete(WriteOptions(), k);
-  }
-
-  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
-    ReadOptions options;
-    options.snapshot = snapshot;
-    std::string result;
-    Status s = db_->Get(options, k, &result);
-    if (s.IsNotFound()) {
-      result = "NOT_FOUND";
-    } else if (!s.ok()) {
-      result = s.ToString();
-    }
-    return result;
-  }
-
-
-  int NumTableFilesAtLevel(int level) {
-    std::string property;
-    ASSERT_TRUE(
-        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
-                         &property));
-    return atoi(property.c_str());
-  }
-
-  // Return spread of files per level
-  std::string FilesPerLevel() {
-    std::string result;
-    int last_non_zero_offset = 0;
-    for (int level = 0; level < db_->NumberLevels(); level++) {
-      int f = NumTableFilesAtLevel(level);
-      char buf[100];
-      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
-      result += buf;
-      if (f > 0) {
-        last_non_zero_offset = result.size();
-      }
-    }
-    result.resize(last_non_zero_offset);
-    return result;
-  }
-
-  std::string IterStatus(Iterator* iter) {
-    std::string result;
-    if (iter->Valid()) {
-      result = iter->key().ToString() + "->" + iter->value().ToString();
-    } else {
-      result = "(invalid)";
-    }
-    return result;
-  }
-};
-
-TEST(SimpleTableDBTest, Empty) {
-  ASSERT_TRUE(db_ != nullptr);
-  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
-}
-
-TEST(SimpleTableDBTest, ReadWrite) {
-  ASSERT_OK(Put("0000000000000foo", "v1"));
-  ASSERT_EQ("v1", Get("0000000000000foo"));
-  ASSERT_OK(Put("0000000000000bar", "v2"));
-  ASSERT_OK(Put("0000000000000foo", "v3"));
-  ASSERT_EQ("v3", Get("0000000000000foo"));
-  ASSERT_EQ("v2", Get("0000000000000bar"));
-}
-
-TEST(SimpleTableDBTest, Flush) {
-  ASSERT_OK(Put("0000000000000foo", "v1"));
-  ASSERT_OK(Put("0000000000000bar", "v2"));
-  ASSERT_OK(Put("0000000000000foo", "v3"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v3", Get("0000000000000foo"));
-  ASSERT_EQ("v2", Get("0000000000000bar"));
-}
-
-TEST(SimpleTableDBTest, Flush2) {
-  ASSERT_OK(Put("0000000000000bar", "b"));
-  ASSERT_OK(Put("0000000000000foo", "v1"));
-  dbfull()->TEST_FlushMemTable();
-
-  ASSERT_OK(Put("0000000000000foo", "v2"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v2", Get("0000000000000foo"));
-
-  ASSERT_OK(Put("0000000000000eee", "v3"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v3", Get("0000000000000eee"));
-
-  ASSERT_OK(Delete("0000000000000bar"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
-
-  ASSERT_OK(Put("0000000000000eee", "v5"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v5", Get("0000000000000eee"));
-}
-
-static std::string Key(int i) {
-  char buf[100];
-  snprintf(buf, sizeof(buf), "key_______%06d", i);
-  return std::string(buf);
-}
-
-static std::string RandomString(Random* rnd, int len) {
-  std::string r;
-  test::RandomString(rnd, len, &r);
-  return r;
-}
-
-TEST(SimpleTableDBTest, CompactionTrigger) {
-  Options options = CurrentOptions();
-  options.write_buffer_size = 100 << 10; //100KB
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  options.level0_file_num_compaction_trigger = 3;
-  Reopen(&options);
-
-  Random rnd(301);
-
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
-      num++) {
-    std::vector<std::string> values;
-    // Write 120KB (12 values, each 10K)
-    for (int i = 0; i < 12; i++) {
-      values.push_back(RandomString(&rnd, 10000));
-      ASSERT_OK(Put(Key(i), values[i]));
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
-  }
-
-  //generate one more file in level-0, and should trigger level-0 compaction
-  std::vector<std::string> values;
-  for (int i = 0; i < 12; i++) {
-    values.push_back(RandomString(&rnd, 10000));
-    ASSERT_OK(Put(Key(i), values[i]));
-  }
-  dbfull()->TEST_WaitForCompact();
-
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
-}
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
-}
diff --git a/src/rocksdb/db/skiplist.h b/src/rocksdb/db/skiplist.h
index 751f7c3..c1e3750 100644
--- a/src/rocksdb/db/skiplist.h
+++ b/src/rocksdb/db/skiplist.h
@@ -32,10 +32,10 @@
 
 #pragma once
 #include <assert.h>
+#include <atomic>
 #include <stdlib.h>
-#include "util/arena.h"
 #include "port/port.h"
-#include "util/arena.h"
+#include "util/allocator.h"
 #include "util/random.h"
 
 namespace rocksdb {
@@ -47,9 +47,9 @@ class SkipList {
 
  public:
   // Create a new SkipList object that will use "cmp" for comparing keys,
-  // and will allocate memory using "*arena".  Objects allocated in the arena
-  // must remain allocated for the lifetime of the skiplist object.
-  explicit SkipList(Comparator cmp, Arena* arena,
+  // and will allocate memory using "*allocator".  Objects allocated in the
+  // allocator must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Allocator* allocator,
                     int32_t max_height = 12, int32_t branching_factor = 4);
 
   // Insert key into the list.
@@ -109,21 +109,20 @@ class SkipList {
 
   // Immutable after construction
   Comparator const compare_;
-  Arena* const arena_;    // Arena used for allocations of nodes
+  Allocator* const allocator_;    // Allocator used for allocations of nodes
 
   Node* const head_;
 
   // Modified only by Insert().  Read racily by readers, but stale
   // values are ok.
-  port::AtomicPointer max_height_;   // Height of the entire list
+  std::atomic<int> max_height_;  // Height of the entire list
 
   // Used for optimizing sequential insert patterns
   Node** prev_;
   int32_t prev_height_;
 
   inline int GetMaxHeight() const {
-    return static_cast<int>(
-        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+    return max_height_.load(std::memory_order_relaxed);
   }
 
   // Read/written only by Insert().
@@ -169,35 +168,35 @@ struct SkipList<Key, Comparator>::Node {
     assert(n >= 0);
     // Use an 'acquire load' so that we observe a fully initialized
     // version of the returned Node.
-    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+    return (next_[n].load(std::memory_order_acquire));
   }
   void SetNext(int n, Node* x) {
     assert(n >= 0);
     // Use a 'release store' so that anybody who reads through this
     // pointer observes a fully initialized version of the inserted node.
-    next_[n].Release_Store(x);
+    next_[n].store(x, std::memory_order_release);
   }
 
   // No-barrier variants that can be safely used in a few locations.
   Node* NoBarrier_Next(int n) {
     assert(n >= 0);
-    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+    return next_[n].load(std::memory_order_relaxed);
   }
   void NoBarrier_SetNext(int n, Node* x) {
     assert(n >= 0);
-    next_[n].NoBarrier_Store(x);
+    next_[n].store(x, std::memory_order_relaxed);
   }
 
  private:
   // Array of length equal to the node height.  next_[0] is lowest level link.
-  port::AtomicPointer next_[1];
+  std::atomic<Node*> next_[1];
 };
 
 template<typename Key, class Comparator>
 typename SkipList<Key, Comparator>::Node*
 SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
-  char* mem = arena_->AllocateAligned(
-      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  char* mem = allocator_->AllocateAligned(
+      sizeof(Node) + sizeof(std::atomic<Node*>) * (height - 1));
   return new (mem) Node(key);
 }
 
@@ -356,23 +355,24 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
 }
 
 template<typename Key, class Comparator>
-SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
+SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
                                    int32_t max_height,
                                    int32_t branching_factor)
     : kMaxHeight_(max_height),
       kBranching_(branching_factor),
       compare_(cmp),
-      arena_(arena),
+      allocator_(allocator),
       head_(NewNode(0 /* any key will do */, max_height)),
-      max_height_(reinterpret_cast<void*>(1)),
+      max_height_(1),
       prev_height_(1),
       rnd_(0xdeadbeef) {
   assert(kMaxHeight_ > 0);
   assert(kBranching_ > 0);
-  // Allocate the prev_ Node* array, directly from the passed-in arena.
+  // Allocate the prev_ Node* array, directly from the passed-in allocator.
   // prev_ does not need to be freed, as its life cycle is tied up with
-  // the arena as a whole.
-  prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
+  // the allocator as a whole.
+  prev_ = reinterpret_cast<Node**>(
+            allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_));
   for (int i = 0; i < kMaxHeight_; i++) {
     head_->SetNext(i, nullptr);
     prev_[i] = head_;
@@ -402,7 +402,7 @@ void SkipList<Key, Comparator>::Insert(const Key& key) {
     // the loop below.  In the former case the reader will
     // immediately drop to the next level since nullptr sorts after all
     // keys.  In the latter case the reader will use the new node.
-    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+    max_height_.store(height, std::memory_order_relaxed);
   }
 
   x = NewNode(key, height);
diff --git a/src/rocksdb/db/skiplist_test.cc b/src/rocksdb/db/skiplist_test.cc
index b87ddcb..3d14186 100644
--- a/src/rocksdb/db/skiplist_test.cc
+++ b/src/rocksdb/db/skiplist_test.cc
@@ -31,9 +31,9 @@ struct TestComparator {
   }
 };
 
-class SkipTest { };
+class SkipTest : public testing::Test {};
 
-TEST(SkipTest, Empty) {
+TEST_F(SkipTest, Empty) {
   Arena arena;
   TestComparator cmp;
   SkipList<Key, TestComparator> list(cmp, &arena);
@@ -49,7 +49,7 @@ TEST(SkipTest, Empty) {
   ASSERT_TRUE(!iter.Valid());
 }
 
-TEST(SkipTest, InsertAndLookup) {
+TEST_F(SkipTest, InsertAndLookup) {
   const int N = 2000;
   const int R = 5000;
   Random rnd(1000);
@@ -191,13 +191,11 @@ class ConcurrentTest {
 
   // Per-key generation
   struct State {
-    port::AtomicPointer generation[K];
-    void Set(int k, intptr_t v) {
-      generation[k].Release_Store(reinterpret_cast<void*>(v));
-    }
-    intptr_t Get(int k) {
-      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    std::atomic<int> generation[K];
+    void Set(int k, int v) {
+      generation[k].store(v, std::memory_order_release);
     }
+    int Get(int k) { return generation[k].load(std::memory_order_acquire); }
 
     State() {
       for (unsigned int k = 0; k < K; k++) {
@@ -221,9 +219,9 @@ class ConcurrentTest {
   // REQUIRES: External synchronization
   void WriteStep(Random* rnd) {
     const uint32_t k = rnd->Next() % K;
-    const intptr_t g = current_.Get(k) + 1;
-    const Key key = MakeKey(k, g);
-    list_.Insert(key);
+    const int g = current_.Get(k) + 1;
+    const Key new_key = MakeKey(k, g);
+    list_.Insert(new_key);
     current_.Set(k, g);
   }
 
@@ -255,11 +253,10 @@ class ConcurrentTest {
         // Note that generation 0 is never inserted, so it is ok if
         // <*,0,*> is missing.
         ASSERT_TRUE((gen(pos) == 0U) ||
-                    (gen(pos) > (uint64_t)initial_state.Get(key(pos)))
-                    ) << "key: " << key(pos)
-                      << "; gen: " << gen(pos)
-                      << "; initgen: "
-                      << initial_state.Get(key(pos));
+                    (gen(pos) > static_cast<uint64_t>(initial_state.Get(
+                                    static_cast<int>(key(pos))))))
+            << "key: " << key(pos) << "; gen: " << gen(pos)
+            << "; initgen: " << initial_state.Get(static_cast<int>(key(pos)));
 
         // Advance to next key in the valid key space
         if (key(pos) < key(current)) {
@@ -290,7 +287,7 @@ const uint32_t ConcurrentTest::K;
 
 // Simple test that does single-threaded testing of the ConcurrentTest
 // scaffolding.
-TEST(SkipTest, ConcurrentWithoutThreads) {
+TEST_F(SkipTest, ConcurrentWithoutThreads) {
   ConcurrentTest test;
   Random rnd(test::RandomSeed());
   for (int i = 0; i < 10000; i++) {
@@ -303,7 +300,7 @@ class TestState {
  public:
   ConcurrentTest t_;
   int seed_;
-  port::AtomicPointer quit_flag_;
+  std::atomic<bool> quit_flag_;
 
   enum ReaderState {
     STARTING,
@@ -312,10 +309,7 @@ class TestState {
   };
 
   explicit TestState(int s)
-      : seed_(s),
-        quit_flag_(nullptr),
-        state_(STARTING),
-        state_cv_(&mu_) {}
+      : seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {}
 
   void Wait(ReaderState s) {
     mu_.Lock();
@@ -343,7 +337,7 @@ static void ConcurrentReader(void* arg) {
   Random rnd(state->seed_);
   int64_t reads = 0;
   state->Change(TestState::RUNNING);
-  while (!state->quit_flag_.Acquire_Load()) {
+  while (!state->quit_flag_.load(std::memory_order_acquire)) {
     state->t_.ReadStep(&rnd);
     ++reads;
   }
@@ -362,22 +356,23 @@ static void RunConcurrent(int run) {
     TestState state(seed + 1);
     Env::Default()->Schedule(ConcurrentReader, &state);
     state.Wait(TestState::RUNNING);
-    for (int i = 0; i < kSize; i++) {
+    for (int k = 0; k < kSize; k++) {
       state.t_.WriteStep(&rnd);
     }
-    state.quit_flag_.Release_Store(&state);  // Any non-nullptr arg will do
+    state.quit_flag_.store(true, std::memory_order_release);
     state.Wait(TestState::DONE);
   }
 }
 
-TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
-TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
-TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
-TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
-TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+TEST_F(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST_F(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST_F(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST_F(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST_F(SkipTest, Concurrent5) { RunConcurrent(5); }
 
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/slice.cc b/src/rocksdb/db/slice.cc
new file mode 100644
index 0000000..7e7245d
--- /dev/null
+++ b/src/rocksdb/db/slice.cc
@@ -0,0 +1,24 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+Slice::Slice(const SliceParts& parts, std::string* buf) {
+  size_t length = 0;
+  for (int i = 0; i < parts.num_parts; ++i) {
+    length += parts.parts[i].size();
+  }
+  buf->reserve(length);
+
+  for (int i = 0; i < parts.num_parts; ++i) {
+    buf->append(parts.parts[i].data(), parts.parts[i].size());
+  }
+  data_ = buf->data();
+  size_ = buf->size();
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/snapshot.h b/src/rocksdb/db/snapshot.h
index 2c2e3ea..c6852f5 100644
--- a/src/rocksdb/db/snapshot.h
+++ b/src/rocksdb/db/snapshot.h
@@ -8,6 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <vector>
+
 #include "rocksdb/db.h"
 
 namespace rocksdb {
@@ -20,6 +22,8 @@ class SnapshotImpl : public Snapshot {
  public:
   SequenceNumber number_;  // const after creation
 
+  virtual SequenceNumber GetSequenceNumber() const override { return number_; }
+
  private:
   friend class SnapshotList;
 
@@ -28,6 +32,8 @@ class SnapshotImpl : public Snapshot {
   SnapshotImpl* next_;
 
   SnapshotList* list_;                 // just for sanity checks
+
+  int64_t unix_time_;
 };
 
 class SnapshotList {
@@ -36,20 +42,23 @@ class SnapshotList {
     list_.prev_ = &list_;
     list_.next_ = &list_;
     list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+    count_ = 0;
   }
 
   bool empty() const { return list_.next_ == &list_; }
   SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
   SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
 
-  const SnapshotImpl* New(SequenceNumber seq) {
+  const SnapshotImpl* New(SequenceNumber seq, uint64_t unix_time) {
     SnapshotImpl* s = new SnapshotImpl;
     s->number_ = seq;
+    s->unix_time_ = unix_time;
     s->list_ = this;
     s->next_ = &list_;
     s->prev_ = list_.prev_;
     s->prev_->next_ = s;
     s->next_->prev_ = s;
+    count_++;
     return s;
   }
 
@@ -57,30 +66,46 @@ class SnapshotList {
     assert(s->list_ == this);
     s->prev_->next_ = s->next_;
     s->next_->prev_ = s->prev_;
+    count_--;
     delete s;
   }
 
   // retrieve all snapshot numbers. They are sorted in ascending order.
-  void getAll(std::vector<SequenceNumber>& ret) {
-    if (empty()) return;
+  std::vector<SequenceNumber> GetAll() {
+    std::vector<SequenceNumber> ret;
+    if (empty()) {
+      return ret;
+    }
     SnapshotImpl* s = &list_;
     while (s->next_ != &list_) {
       ret.push_back(s->next_->number_);
-      s = s ->next_;
+      s = s->next_;
     }
+    return ret;
   }
 
   // get the sequence number of the most recent snapshot
-  const SequenceNumber GetNewest() {
+  SequenceNumber GetNewest() {
     if (empty()) {
       return 0;
     }
     return newest()->number_;
   }
 
+  int64_t GetOldestSnapshotTime() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->unix_time_;
+    }
+  }
+
+  uint64_t count() const { return count_; }
+
  private:
   // Dummy head of doubly-linked list of snapshots
   SnapshotImpl list_;
+  uint64_t count_;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
index 2321d03..e1b0ca8 100644
--- a/src/rocksdb/db/table_cache.cc
+++ b/src/rocksdb/db/table_cache.cc
@@ -13,7 +13,9 @@
 #include "db/version_edit.h"
 
 #include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
 #include "table/table_reader.h"
+#include "table/get_context.h"
 #include "util/coding.h"
 #include "util/stop_watch.h"
 
@@ -30,17 +32,15 @@ static void UnrefEntry(void* arg1, void* arg2) {
   cache->Release(h);
 }
 
-static Slice GetSliceForFileNumber(uint64_t* file_number) {
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
   return Slice(reinterpret_cast<const char*>(file_number),
                sizeof(*file_number));
 }
 
-TableCache::TableCache(const std::string& dbname, const Options* options,
-                       const EnvOptions& storage_options, Cache* const cache)
-    : env_(options->env),
-      dbname_(dbname),
-      options_(options),
-      storage_options_(storage_options),
+TableCache::TableCache(const ImmutableCFOptions& ioptions,
+                       const EnvOptions& env_options, Cache* const cache)
+    : ioptions_(ioptions),
+      env_options_(env_options),
       cache_(cache) {}
 
 TableCache::~TableCache() {
@@ -54,43 +54,40 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
   cache_->Release(handle);
 }
 
-Status TableCache::FindTable(const EnvOptions& toptions,
+Status TableCache::FindTable(const EnvOptions& env_options,
                              const InternalKeyComparator& internal_comparator,
-                             uint64_t file_number, uint64_t file_size,
-                             Cache::Handle** handle, bool* table_io,
+                             const FileDescriptor& fd, Cache::Handle** handle,
                              const bool no_io) {
   Status s;
-  Slice key = GetSliceForFileNumber(&file_number);
+  uint64_t number = fd.GetNumber();
+  Slice key = GetSliceForFileNumber(&number);
   *handle = cache_->Lookup(key);
   if (*handle == nullptr) {
     if (no_io) { // Dont do IO and return a not-found status
       return Status::Incomplete("Table not found in table_cache, no_io is set");
     }
-    if (table_io != nullptr) {
-      *table_io = true;    // we had to do IO from storage
-    }
-    std::string fname = TableFileName(dbname_, file_number);
+    std::string fname =
+        TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
     unique_ptr<RandomAccessFile> file;
     unique_ptr<TableReader> table_reader;
-    s = env_->NewRandomAccessFile(fname, &file, toptions);
-    RecordTick(options_->statistics.get(), NO_FILE_OPENS);
+    s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
+    RecordTick(ioptions_.statistics, NO_FILE_OPENS);
     if (s.ok()) {
-      if (options_->advise_random_on_open) {
+      if (ioptions_.advise_random_on_open) {
         file->Hint(RandomAccessFile::RANDOM);
       }
-      StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
-      s = options_->table_factory->NewTableReader(
-          *options_, toptions, internal_comparator, std::move(file), file_size,
-          &table_reader);
+      StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
+      s = ioptions_.table_factory->NewTableReader(
+          ioptions_, env_options, internal_comparator, std::move(file),
+          fd.GetFileSize(), &table_reader);
     }
 
     if (!s.ok()) {
       assert(table_reader == nullptr);
-      RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
+      RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
       // We do not cache error results so that if the error is transient,
       // or somebody repairs the file, we recover automatically.
     } else {
-      assert(file.get() == nullptr);
       *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
     }
   }
@@ -98,27 +95,27 @@ Status TableCache::FindTable(const EnvOptions& toptions,
 }
 
 Iterator* TableCache::NewIterator(const ReadOptions& options,
-                                  const EnvOptions& toptions,
+                                  const EnvOptions& env_options,
                                   const InternalKeyComparator& icomparator,
-                                  const FileMetaData& file_meta,
+                                  const FileDescriptor& fd,
                                   TableReader** table_reader_ptr,
-                                  bool for_compaction) {
+                                  bool for_compaction, Arena* arena) {
   if (table_reader_ptr != nullptr) {
     *table_reader_ptr = nullptr;
   }
-  TableReader* table_reader = file_meta.table_reader;
+  TableReader* table_reader = fd.table_reader;
   Cache::Handle* handle = nullptr;
   Status s;
   if (table_reader == nullptr) {
-    s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
-                  &handle, nullptr, options.read_tier == kBlockCacheTier);
+    s = FindTable(env_options, icomparator, fd, &handle,
+                  options.read_tier == kBlockCacheTier);
     if (!s.ok()) {
-      return NewErrorIterator(s);
+      return NewErrorIterator(s, arena);
     }
     table_reader = GetTableReaderFromHandle(handle);
   }
 
-  Iterator* result = table_reader->NewIterator(options);
+  Iterator* result = table_reader->NewIterator(options, arena);
   if (handle != nullptr) {
     result->RegisterCleanup(&UnrefEntry, cache_, handle);
   }
@@ -135,40 +132,37 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
 
 Status TableCache::Get(const ReadOptions& options,
                        const InternalKeyComparator& internal_comparator,
-                       const FileMetaData& file_meta, const Slice& k, void* arg,
-                       bool (*saver)(void*, const ParsedInternalKey&,
-                                     const Slice&, bool),
-                       bool* table_io, void (*mark_key_may_exist)(void*)) {
-  TableReader* t = file_meta.table_reader;
+                       const FileDescriptor& fd, const Slice& k,
+                       GetContext* get_context) {
+  TableReader* t = fd.table_reader;
   Status s;
   Cache::Handle* handle = nullptr;
   if (!t) {
-    s = FindTable(storage_options_, internal_comparator, file_meta.number,
-                  file_meta.file_size, &handle, table_io,
+    s = FindTable(env_options_, internal_comparator, fd, &handle,
                   options.read_tier == kBlockCacheTier);
     if (s.ok()) {
       t = GetTableReaderFromHandle(handle);
     }
   }
   if (s.ok()) {
-    s = t->Get(options, k, arg, saver, mark_key_may_exist);
+    s = t->Get(options, k, get_context);
     if (handle != nullptr) {
       ReleaseHandle(handle);
     }
   } else if (options.read_tier && s.IsIncomplete()) {
     // Couldnt find Table in cache but treat as kFound if no_io set
-    (*mark_key_may_exist)(arg);
+    get_context->MarkKeyMayExist();
     return Status::OK();
   }
   return s;
 }
+
 Status TableCache::GetTableProperties(
-    const EnvOptions& toptions,
-    const InternalKeyComparator& internal_comparator,
-    const FileMetaData& file_meta,
+    const EnvOptions& env_options,
+    const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
     std::shared_ptr<const TableProperties>* properties, bool no_io) {
   Status s;
-  auto table_reader = file_meta.table_reader;
+  auto table_reader = fd.table_reader;
   // table already been pre-loaded?
   if (table_reader) {
     *properties = table_reader->GetTableProperties();
@@ -176,10 +170,8 @@ Status TableCache::GetTableProperties(
     return s;
   }
 
-  bool table_io;
   Cache::Handle* table_handle = nullptr;
-  s = FindTable(toptions, internal_comparator, file_meta.number,
-                file_meta.file_size, &table_handle, &table_io, no_io);
+  s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io);
   if (!s.ok()) {
     return s;
   }
@@ -190,6 +182,29 @@ Status TableCache::GetTableProperties(
   return s;
 }
 
+size_t TableCache::GetMemoryUsageByTableReader(
+    const EnvOptions& env_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileDescriptor& fd) {
+  Status s;
+  auto table_reader = fd.table_reader;
+  // table already been pre-loaded?
+  if (table_reader) {
+    return table_reader->ApproximateMemoryUsage();
+  }
+
+  Cache::Handle* table_handle = nullptr;
+  s = FindTable(env_options, internal_comparator, fd, &table_handle, true);
+  if (!s.ok()) {
+    return 0;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  auto ret = table->ApproximateMemoryUsage();
+  ReleaseHandle(table_handle);
+  return ret;
+}
+
 void TableCache::Evict(Cache* cache, uint64_t file_number) {
   cache->Erase(GetSliceForFileNumber(&file_number));
 }
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
index e8cd7ea..76bb1c0 100644
--- a/src/rocksdb/db/table_cache.h
+++ b/src/rocksdb/db/table_cache.h
@@ -11,6 +11,7 @@
 
 #pragma once
 #include <string>
+#include <vector>
 #include <stdint.h>
 
 #include "db/dbformat.h"
@@ -18,18 +19,19 @@
 #include "rocksdb/cache.h"
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include "rocksdb/options.h"
 #include "table/table_reader.h"
 
 namespace rocksdb {
 
 class Env;
-struct FileMetaData;
+class Arena;
+struct FileDescriptor;
+class GetContext;
 
-// TODO(sdong): try to come up with a better API to pass the file information
-//              other than simply passing FileMetaData.
 class TableCache {
  public:
-  TableCache(const std::string& dbname, const Options* options,
+  TableCache(const ImmutableCFOptions& ioptions,
              const EnvOptions& storage_options, Cache* cache);
   ~TableCache();
 
@@ -42,19 +44,17 @@ class TableCache {
   // returned iterator is live.
   Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions,
                         const InternalKeyComparator& internal_comparator,
-                        const FileMetaData& file_meta,
+                        const FileDescriptor& file_fd,
                         TableReader** table_reader_ptr = nullptr,
-                        bool for_compaction = false);
+                        bool for_compaction = false, Arena* arena = nullptr);
 
   // If a seek to internal key "k" in specified file finds an entry,
   // call (*handle_result)(arg, found_key, found_value) repeatedly until
   // it returns false.
   Status Get(const ReadOptions& options,
              const InternalKeyComparator& internal_comparator,
-             const FileMetaData& file_meta, const Slice& k, void* arg,
-             bool (*handle_result)(void*, const ParsedInternalKey&,
-                                   const Slice&, bool),
-             bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);
+             const FileDescriptor& file_fd, const Slice& k,
+             GetContext* get_context);
 
   // Evict any entry for the specified file number
   static void Evict(Cache* cache, uint64_t file_number);
@@ -62,8 +62,8 @@ class TableCache {
   // Find table reader
   Status FindTable(const EnvOptions& toptions,
                    const InternalKeyComparator& internal_comparator,
-                   uint64_t file_number, uint64_t file_size, Cache::Handle**,
-                   bool* table_io = nullptr, const bool no_io = false);
+                   const FileDescriptor& file_fd, Cache::Handle**,
+                   const bool no_io = false);
 
   // Get TableReader from a cache handle.
   TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
@@ -76,18 +76,23 @@ class TableCache {
   //            we set `no_io` to be true.
   Status GetTableProperties(const EnvOptions& toptions,
                             const InternalKeyComparator& internal_comparator,
-                            const FileMetaData& file_meta,
+                            const FileDescriptor& file_meta,
                             std::shared_ptr<const TableProperties>* properties,
                             bool no_io = false);
 
+  // Return total memory usage of the table reader of the file.
+  // 0 if the table reader of the file is not loaded.
+  size_t GetMemoryUsageByTableReader(
+      const EnvOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileDescriptor& fd);
+
   // Release the handle from a cache
   void ReleaseHandle(Cache::Handle* handle);
 
  private:
-  Env* const env_;
-  const std::string dbname_;
-  const Options* options_;
-  const EnvOptions& storage_options_;
+  const ImmutableCFOptions& ioptions_;
+  const EnvOptions& env_options_;
   Cache* const cache_;
 };
 
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
index 25bd700..2e0a679 100644
--- a/src/rocksdb/db/table_properties_collector.cc
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -7,11 +7,13 @@
 
 #include "db/dbformat.h"
 #include "util/coding.h"
+#include "util/string_util.h"
 
 namespace rocksdb {
 
-Status InternalKeyPropertiesCollector::Add(
-    const Slice& key, const Slice& value) {
+Status InternalKeyPropertiesCollector::InternalAdd(const Slice& key,
+                                                   const Slice& value,
+                                                   uint64_t file_size) {
   ParsedInternalKey ikey;
   if (!ParseInternalKey(key, &ikey)) {
     return Status::InvalidArgument("Invalid internal key");
@@ -40,19 +42,35 @@ Status InternalKeyPropertiesCollector::Finish(
 UserCollectedProperties
 InternalKeyPropertiesCollector::GetReadableProperties() const {
   return {
-    { "kDeletedKeys", std::to_string(deleted_keys_) }
+    { "kDeletedKeys", ToString(deleted_keys_) }
   };
 }
 
+namespace {
+EntryType GetEntryType(ValueType value_type) {
+  switch (value_type) {
+    case kTypeValue:
+      return kEntryPut;
+    case kTypeDeletion:
+      return kEntryDelete;
+    case kTypeMerge:
+      return kEntryMerge;
+    default:
+      return kEntryOther;
+  }
+}
+}  // namespace
 
-Status UserKeyTablePropertiesCollector::Add(
-    const Slice& key, const Slice& value) {
+Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
+                                                    const Slice& value,
+                                                    uint64_t file_size) {
   ParsedInternalKey ikey;
   if (!ParseInternalKey(key, &ikey)) {
     return Status::InvalidArgument("Invalid internal key");
   }
 
-  return collector_->Add(ikey.user_key, value);
+  return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type),
+                                ikey.sequence, file_size);
 }
 
 Status UserKeyTablePropertiesCollector::Finish(
diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h
index 6cf5629..79bf132 100644
--- a/src/rocksdb/db/table_properties_collector.h
+++ b/src/rocksdb/db/table_properties_collector.h
@@ -18,11 +18,39 @@ struct InternalKeyTablePropertiesNames {
   static const std::string kDeletedKeys;
 };
 
+// Base class for internal table properties collector.
+class IntTblPropCollector {
+ public:
+  virtual ~IntTblPropCollector() {}
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  virtual const char* Name() const = 0;
+
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) = 0;
+
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+};
+
+// Factory for internal table properties collector.
+class IntTblPropCollectorFactory {
+ public:
+  virtual ~IntTblPropCollectorFactory() {}
+  // has to be thread-safe
+  virtual IntTblPropCollector* CreateIntTblPropCollector() = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+};
+
 // Collecting the statistics for internal keys. Visible only by internal
 // rocksdb modules.
-class InternalKeyPropertiesCollector : public TablePropertiesCollector {
+class InternalKeyPropertiesCollector : public IntTblPropCollector {
  public:
-  virtual Status Add(const Slice& key, const Slice& value) override;
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) override;
 
   virtual Status Finish(UserCollectedProperties* properties) override;
 
@@ -36,28 +64,33 @@ class InternalKeyPropertiesCollector : public TablePropertiesCollector {
   uint64_t deleted_keys_ = 0;
 };
 
+class InternalKeyPropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  virtual IntTblPropCollector* CreateIntTblPropCollector() override {
+    return new InternalKeyPropertiesCollector();
+  }
+
+  virtual const char* Name() const override {
+    return "InternalKeyPropertiesCollectorFactory";
+  }
+};
+
 // When rocksdb creates a new table, it will encode all "user keys" into
 // "internal keys", which contains meta information of a given entry.
 //
 // This class extracts user key from the encoded internal key when Add() is
 // invoked.
-class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
  public:
-  explicit UserKeyTablePropertiesCollector(
-      TablePropertiesCollector* collector) :
-      UserKeyTablePropertiesCollector(
-        std::shared_ptr<TablePropertiesCollector>(collector)
-    ) {
-  }
+  // transfer of ownership
+  explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+      : collector_(collector) {}
 
-  explicit UserKeyTablePropertiesCollector(
-      std::shared_ptr<TablePropertiesCollector> collector) :
-      collector_(collector) {
-  }
+  virtual ~UserKeyTablePropertiesCollector() {}
 
-  virtual ~UserKeyTablePropertiesCollector() { }
-
-  virtual Status Add(const Slice& key, const Slice& value) override;
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) override;
 
   virtual Status Finish(UserCollectedProperties* properties) override;
 
@@ -66,7 +99,26 @@ class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
   UserCollectedProperties GetReadableProperties() const override;
 
  protected:
-  std::shared_ptr<TablePropertiesCollector> collector_;
+  std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  explicit UserKeyTablePropertiesCollectorFactory(
+      std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+      : user_collector_factory_(user_collector_factory) {}
+  virtual IntTblPropCollector* CreateIntTblPropCollector() override {
+    return new UserKeyTablePropertiesCollector(
+        user_collector_factory_->CreateTablePropertiesCollector());
+  }
+
+  virtual const char* Name() const override {
+    return user_collector_factory_->Name();
+  }
+
+ private:
+  std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
index ea15260..6f1a8d9 100644
--- a/src/rocksdb/db/table_properties_collector_test.cc
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -6,11 +6,13 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <vector>
 
 #include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "db/table_properties_collector.h"
 #include "rocksdb/table.h"
+#include "rocksdb/immutable_options.h"
 #include "table/block_based_table_factory.h"
 #include "table/meta_blocks.h"
 #include "table/plain_table_factory.h"
@@ -21,7 +23,12 @@
 
 namespace rocksdb {
 
-class TablePropertiesTest {
+class TablePropertiesTest : public testing::Test,
+                            public testing::WithParamInterface<bool> {
+ public:
+  virtual void SetUp() override { backward_mode_ = GetParam(); }
+
+  bool backward_mode_;
 };
 
 // TODO(kailiu) the following classes should be moved to some more general
@@ -34,11 +41,11 @@ class FakeWritableFile : public WritableFile {
 
   const std::string& contents() const { return contents_; }
 
-  virtual Status Close() { return Status::OK(); }
-  virtual Status Flush() { return Status::OK(); }
-  virtual Status Sync() { return Status::OK(); }
+  virtual Status Close() override { return Status::OK(); }
+  virtual Status Flush() override { return Status::OK(); }
+  virtual Status Sync() override { return Status::OK(); }
 
-  virtual Status Append(const Slice& data) {
+  virtual Status Append(const Slice& data) override {
     contents_.append(data.data(), data.size());
     return Status::OK();
   }
@@ -59,7 +66,7 @@ class FakeRandomeAccessFile : public RandomAccessFile {
   uint64_t Size() const { return contents_.size(); }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                       char* scratch) const {
+                      char* scratch) const override {
     if (offset > contents_.size()) {
       return Status::InvalidArgument("invalid Read offset");
     }
@@ -78,83 +85,229 @@ class FakeRandomeAccessFile : public RandomAccessFile {
 
 class DumbLogger : public Logger {
  public:
-  virtual void Logv(const char* format, va_list ap) { }
-  virtual size_t GetLogFileSize() const { return 0; }
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {}
+  virtual size_t GetLogFileSize() const override { return 0; }
 };
 
 // Utilities test functions
 namespace {
-void MakeBuilder(const Options& options,
+void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
                  const InternalKeyComparator& internal_comparator,
+                 const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+                     int_tbl_prop_collector_factories,
                  std::unique_ptr<FakeWritableFile>* writable,
                  std::unique_ptr<TableBuilder>* builder) {
   writable->reset(new FakeWritableFile);
-  builder->reset(options.table_factory->NewTableBuilder(
-      options, internal_comparator, writable->get(), options.compression));
+  builder->reset(NewTableBuilder(
+      ioptions, internal_comparator, int_tbl_prop_collector_factories,
+      writable->get(), options.compression, options.compression_opts));
 }
 }  // namespace
 
 // Collects keys that starts with "A" in a table.
 class RegularKeysStartWithA: public TablePropertiesCollector {
  public:
-   const char* Name() const { return "RegularKeysStartWithA"; }
+  const char* Name() const override { return "RegularKeysStartWithA"; }
 
-   Status Finish(UserCollectedProperties* properties) {
+  Status Finish(UserCollectedProperties* properties) override {
      std::string encoded;
+     std::string encoded_num_puts;
+     std::string encoded_num_deletes;
+     std::string encoded_num_size_changes;
      PutVarint32(&encoded, count_);
-     *properties = UserCollectedProperties {
-       { "TablePropertiesTest", "Rocksdb" },
-       { "Count", encoded }
+     PutVarint32(&encoded_num_puts, num_puts_);
+     PutVarint32(&encoded_num_deletes, num_deletes_);
+     PutVarint32(&encoded_num_size_changes, num_size_changes_);
+     *properties = UserCollectedProperties{
+         {"TablePropertiesTest", message_},
+         {"Count", encoded},
+         {"NumPuts", encoded_num_puts},
+         {"NumDeletes", encoded_num_deletes},
+         {"NumSizeChanges", encoded_num_size_changes},
      };
      return Status::OK();
-   }
+  }
 
-   Status Add(const Slice& user_key, const Slice& value) {
-     // simply asssume all user keys are not empty.
-     if (user_key.data()[0] == 'A') {
-       ++count_;
-     }
-     return Status::OK();
-   }
+  Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type,
+                    SequenceNumber seq, uint64_t file_size) override {
+    // simply asssume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    if (type == kEntryPut) {
+      num_puts_++;
+    } else if (type == kEntryDelete) {
+      num_deletes_++;
+    }
+    if (file_size < file_size_) {
+      message_ = "File size should not decrease.";
+    } else if (file_size != file_size_) {
+      num_size_changes_++;
+    }
 
-  virtual UserCollectedProperties GetReadableProperties() const {
+    return Status::OK();
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const override {
     return UserCollectedProperties{};
   }
 
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+  uint32_t num_puts_ = 0;
+  uint32_t num_deletes_ = 0;
+  uint32_t num_size_changes_ = 0;
+  uint64_t file_size_ = 0;
+};
+
+// Collects keys that starts with "A" in a table. Backward compatible mode
+// It is also used to test internal key table property collector
+class RegularKeysStartWithABackwardCompatible
+    : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status Add(const Slice& user_key, const Slice& value) override {
+    // simply asssume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
 
  private:
   uint32_t count_ = 0;
 };
 
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status InternalAdd(const Slice& user_key, const Slice& value,
+                     uint64_t file_size) override {
+    // simply asssume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
+                                     public TablePropertiesCollectorFactory {
+ public:
+  explicit RegularKeysStartWithAFactory(bool backward_mode)
+      : backward_mode_(backward_mode) {}
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
+    if (!backward_mode_) {
+      return new RegularKeysStartWithA();
+    } else {
+      return new RegularKeysStartWithABackwardCompatible();
+    }
+  }
+  virtual IntTblPropCollector* CreateIntTblPropCollector() override {
+    return new RegularKeysStartWithAInternal();
+  }
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  bool backward_mode_;
+};
+
+class FlushBlockEveryThreePolicy : public FlushBlockPolicy {
+ public:
+  virtual bool Update(const Slice& key, const Slice& value) override {
+    return (++count_ % 3U == 0);
+  }
+
+ private:
+  uint64_t count_ = 0;
+};
+
+class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit FlushBlockEveryThreePolicyFactory() {}
+
+  const char* Name() const override {
+    return "FlushBlockEveryThreePolicyFactory";
+  }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const override {
+    return new FlushBlockEveryThreePolicy;
+  }
+};
+
 extern uint64_t kBlockBasedTableMagicNumber;
 extern uint64_t kPlainTableMagicNumber;
 namespace {
 void TestCustomizedTablePropertiesCollector(
-    uint64_t magic_number, bool encode_as_internal, const Options& options,
-    const InternalKeyComparator& internal_comparator) {
+    bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
+    const Options& options, const InternalKeyComparator& internal_comparator) {
+  const std::string kDeleteFlag = "D";
   // make sure the entries will be inserted with order.
   std::map<std::string, std::string> kvs = {
-    {"About   ", "val5"},  // starts with 'A'
-    {"Abstract", "val2"},  // starts with 'A'
-    {"Around  ", "val7"},  // starts with 'A'
-    {"Beyond  ", "val3"},
-    {"Builder ", "val1"},
-    {"Cancel  ", "val4"},
-    {"Find    ", "val6"},
+      {"About   ", "val5"},  // starts with 'A'
+      {"Abstract", "val2"},  // starts with 'A'
+      {"Around  ", "val7"},  // starts with 'A'
+      {"Beyond  ", "val3"},
+      {"Builder ", "val1"},
+      {"Love    ", kDeleteFlag},
+      {"Cancel  ", "val4"},
+      {"Find    ", "val6"},
+      {"Rocks   ", kDeleteFlag},
   };
 
   // -- Step 1: build table
   std::unique_ptr<TableBuilder> builder;
   std::unique_ptr<FakeWritableFile> writable;
-  MakeBuilder(options, internal_comparator, &writable, &builder);
+  const ImmutableCFOptions ioptions(options);
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
+  if (test_int_tbl_prop_collector) {
+    int_tbl_prop_collector_factories.emplace_back(
+        new RegularKeysStartWithAFactory(backward_mode));
+  } else {
+    GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories);
+  }
+  MakeBuilder(options, ioptions, internal_comparator,
+              &int_tbl_prop_collector_factories, &writable, &builder);
 
+  SequenceNumber seqNum = 0U;
   for (const auto& kv : kvs) {
-    if (encode_as_internal) {
-      InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
-      builder->Add(ikey.Encode(), kv.second);
-    } else {
-      builder->Add(kv.first, kv.second);
-    }
+    InternalKey ikey(kv.first, seqNum++, (kv.second != kDeleteFlag)
+                                             ? ValueType::kTypeValue
+                                             : ValueType::kTypeDeletion);
+    builder->Add(ikey.Encode(), kv.second);
   }
   ASSERT_OK(builder->Finish());
 
@@ -174,58 +327,88 @@ void TestCustomizedTablePropertiesCollector(
 
   auto user_collected = props->user_collected_properties;
 
+  ASSERT_TRUE(user_collected.find("TablePropertiesTest") !=
+              user_collected.end());
   ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
 
   uint32_t starts_with_A = 0;
+  ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
   Slice key(user_collected.at("Count"));
   ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
   ASSERT_EQ(3u, starts_with_A);
+
+  if (!backward_mode && !test_int_tbl_prop_collector) {
+    uint32_t num_deletes;
+    ASSERT_TRUE(user_collected.find("NumDeletes") != user_collected.end());
+    Slice key_deletes(user_collected.at("NumDeletes"));
+    ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+    ASSERT_EQ(2u, num_deletes);
+
+    uint32_t num_puts;
+    ASSERT_TRUE(user_collected.find("NumPuts") != user_collected.end());
+    Slice key_puts(user_collected.at("NumPuts"));
+    ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+    ASSERT_EQ(7u, num_puts);
+
+    uint32_t num_size_changes;
+    ASSERT_TRUE(user_collected.find("NumSizeChanges") != user_collected.end());
+    Slice key_size_changes(user_collected.at("NumSizeChanges"));
+    ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
+    ASSERT_GE(num_size_changes, 2u);
+  }
 }
 }  // namespace
 
-TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
   // Test properties collectors with internal keys or regular keys
   // for block based table
   for (bool encode_as_internal : { true, false }) {
-    Options options;
-    auto collector = new RegularKeysStartWithA();
-    if (encode_as_internal) {
-      options.table_properties_collectors = {
-        std::make_shared<UserKeyTablePropertiesCollector>(collector)
-      };
-    } else {
-      options.table_properties_collectors.resize(1);
-      options.table_properties_collectors[0].reset(collector);
+    if (!backward_mode_ && !encode_as_internal) {
+      continue;
     }
+
+    Options options;
+    BlockBasedTableOptions table_options;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryThreePolicyFactory>();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
     test::PlainInternalKeyComparator ikc(options.comparator);
-    TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber,
+    std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+        new RegularKeysStartWithAFactory(backward_mode_));
+    options.table_properties_collector_factories.resize(1);
+    options.table_properties_collector_factories[0] = collector_factory;
+
+    TestCustomizedTablePropertiesCollector(backward_mode_,
+                                           kBlockBasedTableMagicNumber,
                                            encode_as_internal, options, ikc);
-  }
 
-  // test plain table
-  Options options;
-  options.table_properties_collectors.push_back(
-      std::make_shared<RegularKeysStartWithA>()
-  );
-  options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0);
-  test::PlainInternalKeyComparator ikc(options.comparator);
-  TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options,
-                                         ikc);
+    // test plain table
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 8;
+    plain_table_options.bloom_bits_per_key = 8;
+    plain_table_options.hash_table_ratio = 0;
+
+    options.table_factory =
+        std::make_shared<PlainTableFactory>(plain_table_options);
+    TestCustomizedTablePropertiesCollector(backward_mode_,
+                                           kPlainTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+  }
 }
 
 namespace {
 void TestInternalKeyPropertiesCollector(
-    uint64_t magic_number,
-    bool sanitized,
+    bool backward_mode, uint64_t magic_number, bool sanitized,
     std::shared_ptr<TableFactory> table_factory) {
   InternalKey keys[] = {
-    InternalKey("A       ", 0, ValueType::kTypeValue),
-    InternalKey("B       ", 0, ValueType::kTypeValue),
-    InternalKey("C       ", 0, ValueType::kTypeValue),
-    InternalKey("W       ", 0, ValueType::kTypeDeletion),
-    InternalKey("X       ", 0, ValueType::kTypeDeletion),
-    InternalKey("Y       ", 0, ValueType::kTypeDeletion),
-    InternalKey("Z       ", 0, ValueType::kTypeDeletion),
+      InternalKey("A       ", 0, ValueType::kTypeValue),
+      InternalKey("B       ", 1, ValueType::kTypeValue),
+      InternalKey("C       ", 2, ValueType::kTypeValue),
+      InternalKey("W       ", 3, ValueType::kTypeDeletion),
+      InternalKey("X       ", 4, ValueType::kTypeDeletion),
+      InternalKey("Y       ", 5, ValueType::kTypeDeletion),
+      InternalKey("Z       ", 6, ValueType::kTypeDeletion),
   };
 
   std::unique_ptr<TableBuilder> builder;
@@ -233,11 +416,12 @@ void TestInternalKeyPropertiesCollector(
   Options options;
   test::PlainInternalKeyComparator pikc(options.comparator);
 
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
   options.table_factory = table_factory;
   if (sanitized) {
-    options.table_properties_collectors = {
-      std::make_shared<RegularKeysStartWithA>()
-    };
+    options.table_properties_collector_factories.emplace_back(
+        new RegularKeysStartWithAFactory(backward_mode));
     // with sanitization, even regular properties collector will be able to
     // handle internal keys.
     auto comparator = options.comparator;
@@ -245,68 +429,91 @@ void TestInternalKeyPropertiesCollector(
     // SanitizeOptions().
     options.info_log = std::make_shared<DumbLogger>();
     options = SanitizeOptions("db",            // just a place holder
-                              &pikc, nullptr,  // don't care filter policy
+                              &pikc,
                               options);
+    GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories);
     options.comparator = comparator;
   } else {
-    options.table_properties_collectors = {
-      std::make_shared<InternalKeyPropertiesCollector>()
-    };
+    int_tbl_prop_collector_factories.emplace_back(
+        new InternalKeyPropertiesCollectorFactory);
   }
+  const ImmutableCFOptions ioptions(options);
 
-  MakeBuilder(options, pikc, &writable, &builder);
-  for (const auto& k : keys) {
-    builder->Add(k.Encode(), "val");
-  }
-
-  ASSERT_OK(builder->Finish());
-
-  FakeRandomeAccessFile readable(writable->contents());
-  TableProperties* props;
-  Status s = ReadTableProperties(
-      &readable,
-      writable->contents().size(),
-      magic_number,
-      Env::Default(),
-      nullptr,
-      &props
-  );
-  ASSERT_OK(s);
-
-  std::unique_ptr<TableProperties> props_guard(props);
-  auto user_collected = props->user_collected_properties;
-  uint64_t deleted = GetDeletedKeys(user_collected);
-  ASSERT_EQ(4u, deleted);
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeBuilder(options, ioptions, pikc, &int_tbl_prop_collector_factories,
+                &writable, &builder);
+    for (const auto& k : keys) {
+      builder->Add(k.Encode(), "val");
+    }
 
-  if (sanitized) {
-    uint32_t starts_with_A = 0;
-    Slice key(user_collected.at("Count"));
-    ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
-    ASSERT_EQ(1u, starts_with_A);
+    ASSERT_OK(builder->Finish());
+
+    FakeRandomeAccessFile readable(writable->contents());
+    TableProperties* props;
+    Status s =
+        ReadTableProperties(&readable, writable->contents().size(),
+                            magic_number, Env::Default(), nullptr, &props);
+    ASSERT_OK(s);
+
+    std::unique_ptr<TableProperties> props_guard(props);
+    auto user_collected = props->user_collected_properties;
+    uint64_t deleted = GetDeletedKeys(user_collected);
+    ASSERT_EQ(4u, deleted);
+
+    if (sanitized) {
+      uint32_t starts_with_A = 0;
+      ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+      Slice key(user_collected.at("Count"));
+      ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+      ASSERT_EQ(1u, starts_with_A);
+
+      if (!backward_mode) {
+        uint32_t num_deletes;
+        ASSERT_TRUE(user_collected.find("NumDeletes") != user_collected.end());
+        Slice key_deletes(user_collected.at("NumDeletes"));
+        ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+        ASSERT_EQ(4u, num_deletes);
+
+        uint32_t num_puts;
+        ASSERT_TRUE(user_collected.find("NumPuts") != user_collected.end());
+        Slice key_puts(user_collected.at("NumPuts"));
+        ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+        ASSERT_EQ(3u, num_puts);
+      }
+    }
   }
 }
 }  // namespace
 
-TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
+TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
   TestInternalKeyPropertiesCollector(
-      kBlockBasedTableMagicNumber,
-      true /* sanitize */,
-      std::make_shared<BlockBasedTableFactory>()
-  );
-  TestInternalKeyPropertiesCollector(
-      kBlockBasedTableMagicNumber,
-      true /* not sanitize */,
-      std::make_shared<BlockBasedTableFactory>()
-  );
+      backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */,
+      std::make_shared<BlockBasedTableFactory>());
+  if (backward_mode_) {
+    TestInternalKeyPropertiesCollector(
+        backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */,
+        std::make_shared<BlockBasedTableFactory>());
+  }
+
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 8;
+  plain_table_options.bloom_bits_per_key = 8;
+  plain_table_options.hash_table_ratio = 0;
+
   TestInternalKeyPropertiesCollector(
-      kPlainTableMagicNumber,
-      false /* not sanitize */,
-      std::make_shared<PlainTableFactory>(8, 8, 0)
-  );
+      backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
+      std::make_shared<PlainTableFactory>(plain_table_options));
 }
 
+INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
+                        ::testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest,
+                        ::testing::Bool());
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/tailing_iter.cc b/src/rocksdb/db/tailing_iter.cc
deleted file mode 100644
index 67b59b2..0000000
--- a/src/rocksdb/db/tailing_iter.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#ifndef ROCKSDB_LITE
-#include "db/tailing_iter.h"
-
-#include <string>
-#include <utility>
-#include <vector>
-#include "db/db_impl.h"
-#include "db/db_iter.h"
-#include "db/column_family.h"
-#include "rocksdb/env.h"
-#include "rocksdb/slice.h"
-#include "rocksdb/slice_transform.h"
-#include "table/merger.h"
-
-namespace rocksdb {
-
-TailingIterator::TailingIterator(Env* const env, DBImpl* db,
-    const ReadOptions& read_options, ColumnFamilyData* cfd)
-    : env_(env),
-      db_(db),
-      read_options_(read_options),
-      cfd_(cfd),
-      super_version_(nullptr),
-      current_(nullptr),
-      status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
-
-TailingIterator::~TailingIterator() {
-  Cleanup();
-}
-
-bool TailingIterator::Valid() const {
-  return current_ != nullptr;
-}
-
-void TailingIterator::SeekToFirst() {
-  if (!IsCurrentVersion()) {
-    CreateIterators();
-  }
-
-  mutable_->SeekToFirst();
-  immutable_->SeekToFirst();
-  UpdateCurrent();
-}
-
-void TailingIterator::Seek(const Slice& target) {
-  if (!IsCurrentVersion()) {
-    CreateIterators();
-  }
-
-  mutable_->Seek(target);
-
-  // We maintain the interval (prev_key_, immutable_->key()] such that there
-  // are no records with keys within that range in immutable_ other than
-  // immutable_->key(). Since immutable_ can't change in this version, we don't
-  // need to do a seek if 'target' belongs to that interval (i.e. immutable_ is
-  // already at the correct position)!
-  //
-  // If prefix seek is used and immutable_ is not valid, seek if target has a
-  // different prefix than prev_key.
-  //
-  // prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to
-  // 'target' -- in this case, prev_key_ is included in the interval, so
-  // prev_inclusive_ has to be set.
-
-  const Comparator* cmp = cfd_->user_comparator();
-  if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
-      (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
-      (cfd_->options()->prefix_extractor != nullptr && !IsSamePrefix(target))) {
-    SeekImmutable(target);
-  }
-
-  UpdateCurrent();
-}
-
-void TailingIterator::Next() {
-  assert(Valid());
-
-  if (!IsCurrentVersion()) {
-    // save the current key, create new iterators and then seek
-    std::string current_key = key().ToString();
-    Slice key_slice(current_key.data(), current_key.size());
-
-    CreateIterators();
-    Seek(key_slice);
-
-    if (!Valid() || key().compare(key_slice) != 0) {
-      // record with current_key no longer exists
-      return;
-    }
-
-  } else if (current_ == immutable_.get()) {
-    // immutable iterator is advanced -- update prev_key_
-    prev_key_ = key().ToString();
-    is_prev_inclusive_ = false;
-    is_prev_set_ = true;
-  }
-
-  current_->Next();
-  UpdateCurrent();
-}
-
-Slice TailingIterator::key() const {
-  assert(Valid());
-  return current_->key();
-}
-
-Slice TailingIterator::value() const {
-  assert(Valid());
-  return current_->value();
-}
-
-Status TailingIterator::status() const {
-  if (!status_.ok()) {
-    return status_;
-  } else if (!mutable_->status().ok()) {
-    return mutable_->status();
-  } else {
-    return immutable_->status();
-  }
-}
-
-void TailingIterator::Prev() {
-  status_ = Status::NotSupported("This iterator doesn't support Prev()");
-}
-
-void TailingIterator::SeekToLast() {
-  status_ = Status::NotSupported("This iterator doesn't support SeekToLast()");
-}
-
-void TailingIterator::Cleanup() {
-  // Release old super version if necessary
-  mutable_.reset();
-  immutable_.reset();
-  if (super_version_ != nullptr && super_version_->Unref()) {
-    DBImpl::DeletionState deletion_state;
-    db_->mutex_.Lock();
-    super_version_->Cleanup();
-    db_->FindObsoleteFiles(deletion_state, false, true);
-    db_->mutex_.Unlock();
-    delete super_version_;
-    if (deletion_state.HaveSomethingToDelete()) {
-      db_->PurgeObsoleteFiles(deletion_state);
-    }
-  }
-}
-
-void TailingIterator::CreateIterators() {
-  Cleanup();
-  super_version_= cfd_->GetReferencedSuperVersion(&(db_->mutex_));
-
-  Iterator* mutable_iter = super_version_->mem->NewIterator(read_options_);
-  // create a DBIter that only uses memtable content; see NewIterator()
-  mutable_.reset(
-      NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
-                    mutable_iter, kMaxSequenceNumber));
-
-  std::vector<Iterator*> list;
-  super_version_->imm->AddIterators(read_options_, &list);
-  super_version_->current->AddIterators(
-      read_options_, *cfd_->soptions(), &list);
-  Iterator* immutable_iter =
-      NewMergingIterator(&cfd_->internal_comparator(), &list[0], list.size());
-
-  // create a DBIter that only uses memtable content; see NewIterator()
-  immutable_.reset(
-      NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
-                    immutable_iter, kMaxSequenceNumber));
-
-  current_ = nullptr;
-  is_prev_set_ = false;
-}
-
-void TailingIterator::UpdateCurrent() {
-  current_ = nullptr;
-
-  if (mutable_->Valid()) {
-    current_ = mutable_.get();
-  }
-  const Comparator* cmp = cfd_->user_comparator();
-  if (immutable_->Valid() &&
-      (current_ == nullptr ||
-       cmp->Compare(immutable_->key(), current_->key()) < 0)) {
-    current_ = immutable_.get();
-  }
-
-  if (!status_.ok()) {
-    // reset status that was set by Prev() or SeekToLast()
-    status_ = Status::OK();
-  }
-}
-
-bool TailingIterator::IsCurrentVersion() const {
-  return super_version_ != nullptr &&
-         super_version_->version_number == cfd_->GetSuperVersionNumber();
-}
-
-bool TailingIterator::IsSamePrefix(const Slice& target) const {
-  const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
-
-  assert(extractor);
-  assert(is_prev_set_);
-
-  return extractor->Transform(target)
-    .compare(extractor->Transform(prev_key_)) == 0;
-}
-
-void TailingIterator::SeekImmutable(const Slice& target) {
-  prev_key_ = target.ToString();
-  is_prev_inclusive_ = true;
-  is_prev_set_ = true;
-
-  immutable_->Seek(target);
-}
-
-}  // namespace rocksdb
-#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/tailing_iter.h b/src/rocksdb/db/tailing_iter.h
deleted file mode 100644
index 6b9c513..0000000
--- a/src/rocksdb/db/tailing_iter.h
+++ /dev/null
@@ -1,97 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-#pragma once
-
-#ifndef ROCKSDB_LITE
-
-#include <string>
-
-#include "rocksdb/db.h"
-#include "rocksdb/iterator.h"
-#include "rocksdb/options.h"
-
-namespace rocksdb {
-
-class DBImpl;
-class Env;
-struct SuperVersion;
-class ColumnFamilyData;
-
-/**
- * TailingIterator is a special type of iterator that doesn't use an (implicit)
- * snapshot. In other words, it can be used to read data that was added to the
- * db after the iterator had been created.
- *
- * TailingIterator is optimized for sequential reading. It doesn't support
- * Prev() and SeekToLast() operations.
- */
-class TailingIterator : public Iterator {
- public:
-  TailingIterator(Env* const env, DBImpl* db, const ReadOptions& read_options,
-                  ColumnFamilyData* cfd);
-  virtual ~TailingIterator();
-
-  virtual bool Valid() const override;
-  virtual void SeekToFirst() override;
-  virtual void SeekToLast() override;
-  virtual void Seek(const Slice& target) override;
-  virtual void Next() override;
-  virtual void Prev() override;
-  virtual Slice key() const override;
-  virtual Slice value() const override;
-  virtual Status status() const override;
-
- private:
-  void Cleanup();
-
-  Env* const env_;
-  DBImpl* const db_;
-  const ReadOptions read_options_;
-  ColumnFamilyData* const cfd_;
-  SuperVersion* super_version_;
-
-  // TailingIterator merges the contents of the two iterators below (one using
-  // mutable memtable contents only, other over SSTs and immutable memtables).
-  // See DBIter::GetTailingIteratorPair().
-  std::unique_ptr<Iterator> mutable_;
-  std::unique_ptr<Iterator> immutable_;
-
-  // points to either mutable_ or immutable_
-  Iterator* current_;
-
-  // key that precedes immutable iterator's current key
-  std::string prev_key_;
-
-  // unless prev_set is true, prev_key/prev_head is not valid and shouldn't be
-  // used; reset by createIterators()
-  bool is_prev_set_;
-
-  // prev_key_ was set by SeekImmutable(), which means that the interval of
-  // keys covered by immutable_ is [prev_key_, current], i.e. it includes the
-  // left endpoint
-  bool is_prev_inclusive_;
-
-  // internal iterator status
-  Status status_;
-
-  // check if this iterator's version matches DB's version
-  bool IsCurrentVersion() const;
-
-  // check if SeekImmutable() is needed due to target having a different prefix
-  // than prev_key_ (used when in prefix seek mode)
-  bool IsSamePrefix(const Slice& target) const;
-
-  // creates mutable_ and immutable_ iterators and updates version_number_
-  void CreateIterators();
-
-  // set current_ to be one of the iterators with the smallest key
-  void UpdateCurrent();
-
-  // seek on immutable_ and update prev_key
-  void SeekImmutable(const Slice& target);
-};
-
-}  // namespace rocksdb
-#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
index 82e58f1..b0bf6e4 100644
--- a/src/rocksdb/db/transaction_log_impl.cc
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -4,6 +4,11 @@
 //  of patent rights can be found in the PATENTS file in the same directory.
 
 #ifndef ROCKSDB_LITE
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include "db/transaction_log_impl.h"
 #include "db/write_batch_internal.h"
 
@@ -13,7 +18,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
     const std::string& dir, const DBOptions* options,
     const TransactionLogIterator::ReadOptions& read_options,
     const EnvOptions& soptions, const SequenceNumber seq,
-    std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
+    std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions)
     : dir_(dir),
       options_(options),
       read_options_(read_options),
@@ -25,9 +30,9 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
       currentFileIndex_(0),
       currentBatchSeq_(0),
       currentLastSeq_(0),
-      dbimpl_(dbimpl) {
+      versions_(versions) {
   assert(files_ != nullptr);
-  assert(dbimpl_ != nullptr);
+  assert(versions_ != nullptr);
 
   reporter_.env = options_->env;
   reporter_.info_log = options_->info_log.get();
@@ -43,14 +48,14 @@ Status TransactionLogIteratorImpl::OpenLogFile(
     return env->NewSequentialFile(fname, file, soptions_);
   } else {
     std::string fname = LogFileName(dir_, logFile->LogNumber());
-    Status status = env->NewSequentialFile(fname, file, soptions_);
-    if (!status.ok()) {
+    Status s = env->NewSequentialFile(fname, file, soptions_);
+    if (!s.ok()) {
       //  If cannot open file in DB directory.
       //  Try the archive dir, as it could have moved in the meanwhile.
       fname = ArchivedLogFileName(dir_, logFile->LogNumber());
-      status = env->NewSequentialFile(fname, file, soptions_);
+      s = env->NewSequentialFile(fname, file, soptions_);
     }
-    return status;
+    return s;
   }
 }
 
@@ -74,7 +79,7 @@ bool TransactionLogIteratorImpl::RestrictedRead(
     Slice* record,
     std::string* scratch) {
   // Don't read if no more complete entries to read from logs
-  if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
+  if (currentLastSeq_ >= versions_->LastSequence()) {
     return false;
   }
   return currentLogReader_->ReadRecord(record, scratch);
@@ -93,6 +98,7 @@ void TransactionLogIteratorImpl::SeekToStartSequence(
   Status s = OpenLogReader(files_->at(startFileIndex).get());
   if (!s.ok()) {
     currentStatus_ = s;
+    reporter_.Info(currentStatus_.ToString().c_str());
     return;
   }
   while (RestrictedRead(&record, &scratch)) {
@@ -176,15 +182,15 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) {
     // Open the next file
     if (currentFileIndex_ < files_->size() - 1) {
       ++currentFileIndex_;
-      Status status =OpenLogReader(files_->at(currentFileIndex_).get());
-      if (!status.ok()) {
+      Status s = OpenLogReader(files_->at(currentFileIndex_).get());
+      if (!s.ok()) {
         isValid_ = false;
-        currentStatus_ = status;
+        currentStatus_ = s;
         return;
       }
     } else {
       isValid_ = false;
-      if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
+      if (currentLastSeq_ == versions_->LastSequence()) {
         currentStatus_ = Status::OK();
       } else {
         currentStatus_ = Status::Corruption("NO MORE DATA LEFT");
@@ -202,12 +208,10 @@ bool TransactionLogIteratorImpl::IsBatchExpected(
   if (batchSeq != expectedSeq) {
     char buf[200];
     snprintf(buf, sizeof(buf),
-             "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
-             "Last flushed seq=%lu.Log iterator will reseek the correct "
-             "batch.",
-             (unsigned long)batchSeq,
-             (unsigned long)expectedSeq,
-             (unsigned long)dbimpl_->GetLatestSequenceNumber());
+             "Discontinuity in log records. Got seq=%" PRIu64
+             ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+             ".Log iterator will reseek the correct batch.",
+             batchSeq, expectedSeq, versions_->LastSequence());
     reporter_.Info(buf);
     return false;
   }
@@ -239,7 +243,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
   currentLastSeq_ = currentBatchSeq_ +
                     WriteBatchInternal::Count(batch.get()) - 1;
   // currentBatchSeq_ can only change here
-  assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
+  assert(currentLastSeq_ <= versions_->LastSequence());
 
   currentBatch_ = move(batch);
   isValid_ = true;
@@ -248,9 +252,9 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
 
 Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
   unique_ptr<SequentialFile> file;
-  Status status = OpenLogFile(logFile, &file);
-  if (!status.ok()) {
-    return status;
+  Status s = OpenLogFile(logFile, &file);
+  if (!s.ok()) {
+    return s;
   }
   assert(file);
   currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
index 319b01c..af06154 100644
--- a/src/rocksdb/db/transaction_log_impl.h
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -11,23 +11,12 @@
 #include "rocksdb/options.h"
 #include "rocksdb/types.h"
 #include "rocksdb/transaction_log.h"
-#include "db/db_impl.h"
+#include "db/version_set.h"
 #include "db/log_reader.h"
 #include "db/filename.h"
 
 namespace rocksdb {
 
-struct LogReporter : public log::Reader::Reporter {
-  Env* env;
-  Logger* info_log;
-  virtual void Corruption(size_t bytes, const Status& s) {
-    Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
-  }
-  virtual void Info(const char* s) {
-    Log(info_log, "%s", s);
-  }
-};
-
 class LogFileImpl : public LogFile {
  public:
   LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
@@ -38,20 +27,20 @@ class LogFileImpl : public LogFile {
     sizeFileBytes_(sizeBytes) {
   }
 
-  std::string PathName() const {
+  std::string PathName() const override {
     if (type_ == kArchivedLogFile) {
       return ArchivedLogFileName("", logNumber_);
     }
     return LogFileName("", logNumber_);
   }
 
-  uint64_t LogNumber() const { return logNumber_; }
+  uint64_t LogNumber() const override { return logNumber_; }
 
-  WalFileType Type() const { return type_; }
+  WalFileType Type() const override { return type_; }
 
-  SequenceNumber StartSequence() const { return startSequence_; }
+  SequenceNumber StartSequence() const override { return startSequence_; }
 
-  uint64_t SizeFileBytes() const { return sizeFileBytes_; }
+  uint64_t SizeFileBytes() const override { return sizeFileBytes_; }
 
   bool operator < (const LogFile& that) const {
     return LogNumber() < that.LogNumber();
@@ -71,15 +60,15 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
       const std::string& dir, const DBOptions* options,
       const TransactionLogIterator::ReadOptions& read_options,
       const EnvOptions& soptions, const SequenceNumber seqNum,
-      std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
+      std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions);
 
-  virtual bool Valid();
+  virtual bool Valid() override;
 
-  virtual void Next();
+  virtual void Next() override;
 
-  virtual Status status();
+  virtual Status status() override;
 
-  virtual BatchResult GetBatch();
+  virtual BatchResult GetBatch() override;
 
  private:
   const std::string& dir_;
@@ -95,10 +84,24 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
   std::unique_ptr<WriteBatch> currentBatch_;
   unique_ptr<log::Reader> currentLogReader_;
   Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
-  LogReporter reporter_;
+
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    virtual void Corruption(size_t bytes, const Status& s) override {
+      Log(InfoLogLevel::ERROR_LEVEL, info_log, "dropping %zu bytes; %s", bytes,
+          s.ToString().c_str());
+    }
+    virtual void Info(const char* s) {
+      Log(InfoLogLevel::INFO_LEVEL, info_log, "%s", s);
+    }
+  } reporter_;
+
   SequenceNumber currentBatchSeq_; // sequence number at start of current batch
   SequenceNumber currentLastSeq_; // last sequence in the current batch
-  DBImpl const * const dbimpl_; // The db on whose log files this iterates
+  // Used only to get latest seq. num
+  // TODO(icanadi) can this be just a callback?
+  VersionSet const* const versions_;
 
   // Reads from transaction log only if the writebatch record has been written
   bool RestrictedRead(Slice* record, std::string* scratch);
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
new file mode 100644
index 0000000..c010ee4
--- /dev/null
+++ b/src/rocksdb/db/version_builder.cc
@@ -0,0 +1,330 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "table/table_reader.h"
+
+namespace rocksdb {
+
+bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+  if (a->smallest_seqno != b->smallest_seqno) {
+    return a->smallest_seqno > b->smallest_seqno;
+  }
+  if (a->largest_seqno != b->largest_seqno) {
+    return a->largest_seqno > b->largest_seqno;
+  }
+  // Break ties by file number
+  return a->fd.GetNumber() > b->fd.GetNumber();
+}
+
+namespace {
+bool BySmallestKey(FileMetaData* a, FileMetaData* b,
+                   const InternalKeyComparator* cmp) {
+  int r = cmp->Compare(a->smallest, b->smallest);
+  if (r != 0) {
+    return (r < 0);
+  }
+  // Break ties by file number
+  return (a->fd.GetNumber() < b->fd.GetNumber());
+}
+}  // namespace
+
+class VersionBuilder::Rep {
+ private:
+  // Helper to sort files_ in v
+  // kLevel0 -- NewestFirstBySeqNo
+  // kLevelNon0 -- BySmallestKey
+  struct FileComparator {
+    enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method;
+    const InternalKeyComparator* internal_comparator;
+
+    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+      switch (sort_method) {
+        case kLevel0:
+          return NewestFirstBySeqNo(f1, f2);
+        case kLevelNon0:
+          return BySmallestKey(f1, f2, internal_comparator);
+      }
+      assert(false);
+      return false;
+    }
+  };
+
+  struct LevelState {
+    std::unordered_set<uint64_t> deleted_files;
+    // Map from file number to file meta data.
+    std::unordered_map<uint64_t, FileMetaData*> added_files;
+  };
+
+  const EnvOptions& env_options_;
+  TableCache* table_cache_;
+  VersionStorageInfo* base_vstorage_;
+  LevelState* levels_;
+  FileComparator level_zero_cmp_;
+  FileComparator level_nonzero_cmp_;
+
+ public:
+  Rep(const EnvOptions& env_options, TableCache* table_cache,
+      VersionStorageInfo* base_vstorage)
+      : env_options_(env_options),
+        table_cache_(table_cache),
+        base_vstorage_(base_vstorage) {
+    levels_ = new LevelState[base_vstorage_->num_levels()];
+    level_zero_cmp_.sort_method = FileComparator::kLevel0;
+    level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
+    level_nonzero_cmp_.internal_comparator =
+        base_vstorage_->InternalComparator();
+  }
+
+  ~Rep() {
+    for (int level = 0; level < base_vstorage_->num_levels(); level++) {
+      const auto& added = levels_[level].added_files;
+      for (auto& pair : added) {
+        UnrefFile(pair.second);
+      }
+    }
+
+    delete[] levels_;
+  }
+
+  void UnrefFile(FileMetaData* f) {
+    f->refs--;
+    if (f->refs <= 0) {
+      if (f->table_reader_handle) {
+        assert(table_cache_ != nullptr);
+        table_cache_->ReleaseHandle(f->table_reader_handle);
+        f->table_reader_handle = nullptr;
+      }
+      delete f;
+    }
+  }
+
+  void CheckConsistency(VersionStorageInfo* vstorage) {
+#ifndef NDEBUG
+    // make sure the files are sorted correctly
+    for (int level = 0; level < vstorage->num_levels(); level++) {
+      auto& level_files = vstorage->LevelFiles(level);
+      for (size_t i = 1; i < level_files.size(); i++) {
+        auto f1 = level_files[i - 1];
+        auto f2 = level_files[i];
+        if (level == 0) {
+          assert(level_zero_cmp_(f1, f2));
+          assert(f1->largest_seqno > f2->largest_seqno);
+        } else {
+          assert(level_nonzero_cmp_(f1, f2));
+
+          // Make sure there is no overlap in levels > 0
+          if (vstorage->InternalComparator()->Compare(f1->largest,
+                                                      f2->smallest) >= 0) {
+            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
+                    (f1->largest).DebugString().c_str(),
+                    (f2->smallest).DebugString().c_str());
+            abort();
+          }
+        }
+      }
+    }
+#endif
+  }
+
+  void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
+                                  int level) {
+#ifndef NDEBUG
+    // a file to be deleted better exist in the previous version
+    bool found = false;
+    for (int l = 0; !found && l < base_vstorage_->num_levels(); l++) {
+      const std::vector<FileMetaData*>& base_files =
+          base_vstorage_->LevelFiles(l);
+      for (unsigned int i = 0; i < base_files.size(); i++) {
+        FileMetaData* f = base_files[i];
+        if (f->fd.GetNumber() == number) {
+          found = true;
+          break;
+        }
+      }
+    }
+    // if the file did not exist in the previous version, then it
+    // is possibly moved from lower level to higher level in current
+    // version
+    for (int l = level + 1; !found && l < base_vstorage_->num_levels(); l++) {
+      auto& level_added = levels_[l].added_files;
+      auto got = level_added.find(number);
+      if (got != level_added.end()) {
+        found = true;
+        break;
+      }
+    }
+
+    // maybe this file was added in a previous edit that was Applied
+    if (!found) {
+      auto& level_added = levels_[level].added_files;
+      auto got = level_added.find(number);
+      if (got != level_added.end()) {
+        found = true;
+      }
+    }
+    if (!found) {
+      fprintf(stderr, "not found %" PRIu64 "\n", number);
+    }
+    assert(found);
+#endif
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  void Apply(VersionEdit* edit) {
+    CheckConsistency(base_vstorage_);
+
+    // Delete files
+    const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles();
+    for (const auto& del_file : del) {
+      const auto level = del_file.first;
+      const auto number = del_file.second;
+      levels_[level].deleted_files.insert(number);
+      CheckConsistencyForDeletes(edit, number, level);
+
+      auto exising = levels_[level].added_files.find(number);
+      if (exising != levels_[level].added_files.end()) {
+        UnrefFile(exising->second);
+        levels_[level].added_files.erase(number);
+      }
+    }
+
+    // Add new files
+    for (const auto& new_file : edit->GetNewFiles()) {
+      const int level = new_file.first;
+      FileMetaData* f = new FileMetaData(new_file.second);
+      f->refs = 1;
+
+      assert(levels_[level].added_files.find(f->fd.GetNumber()) ==
+             levels_[level].added_files.end());
+      levels_[level].deleted_files.erase(f->fd.GetNumber());
+      levels_[level].added_files[f->fd.GetNumber()] = f;
+    }
+  }
+
+  // Save the current state in *v.
+  void SaveTo(VersionStorageInfo* vstorage) {
+    CheckConsistency(base_vstorage_);
+    CheckConsistency(vstorage);
+
+    for (int level = 0; level < base_vstorage_->num_levels(); level++) {
+      const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
+      // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files.  Store the result in *v.
+      const auto& base_files = base_vstorage_->LevelFiles(level);
+      auto base_iter = base_files.begin();
+      auto base_end = base_files.end();
+      const auto& unordered_added_files = levels_[level].added_files;
+      vstorage->Reserve(level,
+                        base_files.size() + unordered_added_files.size());
+
+      // Sort added files for the level.
+      std::vector<FileMetaData*> added_files;
+      added_files.reserve(unordered_added_files.size());
+      for (const auto& pair : unordered_added_files) {
+        added_files.push_back(pair.second);
+      }
+      std::sort(added_files.begin(), added_files.end(), cmp);
+
+#ifndef NDEBUG
+      FileMetaData* prev_file = nullptr;
+#endif
+
+      for (const auto& added : added_files) {
+#ifndef NDEBUG
+        if (level > 0 && prev_file != nullptr) {
+          assert(base_vstorage_->InternalComparator()->Compare(
+                     prev_file->smallest, added->smallest) <= 0);
+        }
+        prev_file = added;
+#endif
+
+        // Add all smaller files listed in base_
+        for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp);
+             base_iter != bpos; ++base_iter) {
+          MaybeAddFile(vstorage, level, *base_iter);
+        }
+
+        MaybeAddFile(vstorage, level, added);
+      }
+
+      // Add remaining base files
+      for (; base_iter != base_end; ++base_iter) {
+        MaybeAddFile(vstorage, level, *base_iter);
+      }
+    }
+
+    CheckConsistency(vstorage);
+  }
+
+  void LoadTableHandlers() {
+    assert(table_cache_ != nullptr);
+    for (int level = 0; level < base_vstorage_->num_levels(); level++) {
+      for (auto& file_meta_pair : levels_[level].added_files) {
+        auto* file_meta = file_meta_pair.second;
+        assert(!file_meta->table_reader_handle);
+        table_cache_->FindTable(
+            env_options_, *(base_vstorage_->InternalComparator()),
+            file_meta->fd, &file_meta->table_reader_handle, false);
+        if (file_meta->table_reader_handle != nullptr) {
+          // Load table_reader
+          file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+              file_meta->table_reader_handle);
+        }
+      }
+    }
+  }
+
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) {
+    if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
+      // File is deleted: do nothing
+    } else {
+      vstorage->AddFile(level, f);
+    }
+  }
+};
+
+VersionBuilder::VersionBuilder(const EnvOptions& env_options,
+                               TableCache* table_cache,
+                               VersionStorageInfo* base_vstorage)
+    : rep_(new Rep(env_options, table_cache, base_vstorage)) {}
+VersionBuilder::~VersionBuilder() { delete rep_; }
+void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
+  rep_->CheckConsistency(vstorage);
+}
+void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit,
+                                                uint64_t number, int level) {
+  rep_->CheckConsistencyForDeletes(edit, number, level);
+}
+void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); }
+void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
+  rep_->SaveTo(vstorage);
+}
+void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); }
+void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
+                                  FileMetaData* f) {
+  rep_->MaybeAddFile(vstorage, level, f);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 0000000..452604f
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,42 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+  VersionBuilder(const EnvOptions& env_options, TableCache* table_cache,
+                 VersionStorageInfo* base_vstorage);
+  ~VersionBuilder();
+  void CheckConsistency(VersionStorageInfo* vstorage);
+  void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
+                                  int level);
+  void Apply(VersionEdit* edit);
+  void SaveTo(VersionStorageInfo* vstorage);
+  void LoadTableHandlers();
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f);
+
+ private:
+  class Rep;
+  Rep* rep_;
+};
+
+extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b);
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 0000000..099bb78
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,304 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <string>
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::vector<uint64_t> size_being_compacted_;
+
+  VersionBuilderTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_, ioptions_),
+        vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+                 nullptr),
+        file_num_(1) {
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    size_being_compacted_.resize(options_.num_levels);
+  }
+
+  ~VersionBuilderTest() {
+    for (int i = 0; i < vstorage_.num_levels(); i++) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+           uint64_t num_entries = 0, uint64_t num_deletions = 0,
+           bool sampled = false, SequenceNumber smallest_seqno = 0,
+           SequenceNumber largest_seqno = 0) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData;
+    f->fd = FileDescriptor(file_number, path_id, file_size);
+    f->smallest = GetInternalKey(smallest, smallest_seq);
+    f->largest = GetInternalKey(largest, largest_seq);
+    f->smallest_seqno = smallest_seqno;
+    f->largest_seqno = largest_seqno;
+    f->compensated_file_size = file_size;
+    f->refs = 0;
+    f->num_entries = num_entries;
+    f->num_deletions = num_deletions;
+    vstorage_.AddFile(level, f);
+    if (sampled) {
+      f->init_stats_from_file = true;
+      vstorage_.UpdateAccumulatedStats(f);
+    }
+  }
+
+  void UpdateVersionStorageInfo() {
+    vstorage_.UpdateFilesBySize();
+    vstorage_.UpdateNumNonEmptyLevels();
+    vstorage_.GenerateFileIndexer();
+    vstorage_.GenerateLevelFilesBrief();
+    vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+    vstorage_.SetFinalized();
+  }
+};
+
+void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) {
+  for (int i = 0; i < new_vstorage->num_levels(); i++) {
+    for (auto* f : new_vstorage->LevelFiles(i)) {
+      if (--f->refs == 0) {
+        delete f;
+      }
+    }
+  }
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
+  Add(0, 1U, "150", "200", 100U);
+
+  Add(1, 66U, "150", "200", 100U);
+  Add(1, 88U, "201", "300", 100U);
+
+  Add(2, 6U, "150", "179", 100U);
+  Add(2, 7U, "180", "220", 100U);
+  Add(2, 8U, "221", "300", 100U);
+
+  Add(3, 26U, "150", "170", 100U);
+  Add(3, 27U, "171", "179", 100U);
+  Add(3, 28U, "191", "220", 100U);
+  Add(3, 29U, "221", "300", 100U);
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.DeleteFile(3, 27U);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+
+  Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+  Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+  Add(4, 6U, "150", "179", 100U);
+  Add(4, 7U, "180", "220", 100U);
+  Add(4, 8U, "221", "300", 100U);
+
+  Add(5, 26U, "150", "170", 100U);
+  Add(5, 27U, "171", "179", 100U);
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.DeleteFile(0, 1U);
+  version_edit.DeleteFile(0, 88U);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+  ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3));
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4));
+  ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+
+  Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+  Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+  Add(4, 6U, "150", "179", 100U);
+  Add(4, 7U, "180", "220", 100U);
+  Add(4, 8U, "221", "300", 100U);
+
+  Add(5, 26U, "150", "170", 100U);
+  Add(5, 27U, "171", "179", 100U);
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.DeleteFile(0, 1U);
+  version_edit.DeleteFile(0, 88U);
+  version_edit.DeleteFile(4, 6U);
+  version_edit.DeleteFile(4, 7U);
+  version_edit.DeleteFile(4, 8U);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+  ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4));
+  ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+                       GetInternalKey("450"), 200, 200);
+  version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+                       GetInternalKey("650"), 200, 200);
+  version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+                       GetInternalKey("550"), 200, 200);
+  version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+                       GetInternalKey("750"), 200, 200);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+                       GetInternalKey("450"), 200, 200);
+  version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+                       GetInternalKey("650"), 200, 200);
+  version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+                       GetInternalKey("550"), 200, 200);
+  version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+                       GetInternalKey("750"), 200, 200);
+  version_builder.Apply(&version_edit);
+
+  VersionEdit version_edit2;
+  version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
+                       GetInternalKey("950"), 200, 200);
+  version_edit2.DeleteFile(2, 616);
+  version_edit2.DeleteFile(2, 636);
+  version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
+                       GetInternalKey("850"), 200, 200);
+  version_builder.Apply(&version_edit2);
+
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
+  const uint32_t kTotalSamples = 20;
+  const uint32_t kNumLevels = 5;
+  const uint32_t kFilesPerLevel = 8;
+  const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+  const uint32_t kEntriesPerFile = 1000;
+  const uint32_t kDeletionsPerFile = 100;
+  for (uint32_t i = 0; i < kNumFiles; ++i) {
+    Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+        ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(),
+        100U,  0, 100, 100,
+        kEntriesPerFile, kDeletionsPerFile,
+        (i < kTotalSamples));
+  }
+  // minus 2X for the number of deletion entries because:
+  // 1x for deletion entry does not count as a data entry.
+  // 1x for each deletion entry will actually remove one data entry.
+  ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+            (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
index 24d7f0d..f7b2888 100644
--- a/src/rocksdb/db/version_edit.cc
+++ b/src/rocksdb/db/version_edit.cc
@@ -18,25 +18,30 @@ namespace rocksdb {
 // Tag numbers for serialized VersionEdit.  These numbers are written to
 // disk and should not be changed.
 enum Tag {
-  kComparator           = 1,
-  kLogNumber            = 2,
-  kNextFileNumber       = 3,
-  kLastSequence         = 4,
-  kCompactPointer       = 5,
-  kDeletedFile          = 6,
-  kNewFile              = 7,
+  kComparator = 1,
+  kLogNumber = 2,
+  kNextFileNumber = 3,
+  kLastSequence = 4,
+  kCompactPointer = 5,
+  kDeletedFile = 6,
+  kNewFile = 7,
   // 8 was used for large value refs
-  kPrevLogNumber        = 9,
+  kPrevLogNumber = 9,
 
   // these are new formats divergent from open source leveldb
-  kNewFile2             = 100,  // store smallest & largest seqno
-
-  kColumnFamily         = 200,  // specify column family for version edit
-  kColumnFamilyAdd      = 201,
-  kColumnFamilyDrop     = 202,
-  kMaxColumnFamily      = 203,
+  kNewFile2 = 100,
+  kNewFile3 = 102,
+  kColumnFamily = 200,  // specify column family for version edit
+  kColumnFamilyAdd = 201,
+  kColumnFamilyDrop = 202,
+  kMaxColumnFamily = 203,
 };
 
+uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
+  assert(number <= kFileNumberMask);
+  return number | (path_id * (kFileNumberMask + 1));
+}
+
 void VersionEdit::Clear() {
   comparator_.clear();
   max_level_ = 0;
@@ -59,7 +64,7 @@ void VersionEdit::Clear() {
   column_family_name_.clear();
 }
 
-void VersionEdit::EncodeTo(std::string* dst) const {
+bool VersionEdit::EncodeTo(std::string* dst) const {
   if (has_comparator_) {
     PutVarint32(dst, kComparator);
     PutLengthPrefixedSlice(dst, comparator_);
@@ -93,10 +98,22 @@ void VersionEdit::EncodeTo(std::string* dst) const {
 
   for (size_t i = 0; i < new_files_.size(); i++) {
     const FileMetaData& f = new_files_[i].second;
-    PutVarint32(dst, kNewFile2);
+    if (!f.smallest.Valid() || !f.largest.Valid()) {
+      return false;
+    }
+    if (f.fd.GetPathId() == 0) {
+      // Use older format to make sure user can roll back the build if they
+      // don't config multiple DB paths.
+      PutVarint32(dst, kNewFile2);
+    } else {
+      PutVarint32(dst, kNewFile3);
+    }
     PutVarint32(dst, new_files_[i].first);  // level
-    PutVarint64(dst, f.number);
-    PutVarint64(dst, f.file_size);
+    PutVarint64(dst, f.fd.GetNumber());
+    if (f.fd.GetPathId() != 0) {
+      PutVarint32(dst, f.fd.GetPathId());
+    }
+    PutVarint64(dst, f.fd.GetFileSize());
     PutLengthPrefixedSlice(dst, f.smallest.Encode());
     PutLengthPrefixedSlice(dst, f.largest.Encode());
     PutVarint64(dst, f.smallest_seqno);
@@ -117,13 +134,14 @@ void VersionEdit::EncodeTo(std::string* dst) const {
   if (is_column_family_drop_) {
     PutVarint32(dst, kColumnFamilyDrop);
   }
+  return true;
 }
 
 static bool GetInternalKey(Slice* input, InternalKey* dst) {
   Slice str;
   if (GetLengthPrefixedSlice(input, &str)) {
     dst->DecodeFrom(str);
-    return true;
+    return dst->Valid();
   } else {
     return false;
   }
@@ -150,7 +168,6 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
 
   // Temporary storage for parsing
   int level;
-  uint64_t number;
   FileMetaData f;
   Slice str;
   InternalKey key;
@@ -219,9 +236,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         }
         break;
 
-      case kDeletedFile:
-        if (GetLevel(&input, &level, &msg) &&
-            GetVarint64(&input, &number)) {
+      case kDeletedFile: {
+        uint64_t number;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
           deleted_files_.insert(std::make_pair(level, number));
         } else {
           if (!msg) {
@@ -229,13 +246,16 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
           }
         }
         break;
+      }
 
-      case kNewFile:
-        if (GetLevel(&input, &level, &msg) &&
-            GetVarint64(&input, &f.number) &&
-            GetVarint64(&input, &f.file_size) &&
+      case kNewFile: {
+        uint64_t number;
+        uint64_t file_size;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest)) {
+          f.fd = FileDescriptor(number, 0, file_size);
           new_files_.push_back(std::make_pair(level, f));
         } else {
           if (!msg) {
@@ -243,15 +263,17 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
           }
         }
         break;
-
-      case kNewFile2:
-        if (GetLevel(&input, &level, &msg) &&
-            GetVarint64(&input, &f.number) &&
-            GetVarint64(&input, &f.file_size) &&
+      }
+      case kNewFile2: {
+        uint64_t number;
+        uint64_t file_size;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest) &&
             GetVarint64(&input, &f.smallest_seqno) &&
-            GetVarint64(&input, &f.largest_seqno) ) {
+            GetVarint64(&input, &f.largest_seqno)) {
+          f.fd = FileDescriptor(number, 0, file_size);
           new_files_.push_back(std::make_pair(level, f));
         } else {
           if (!msg) {
@@ -259,6 +281,27 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
           }
         }
         break;
+      }
+
+      case kNewFile3: {
+        uint64_t number;
+        uint32_t path_id;
+        uint64_t file_size;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &f.smallest_seqno) &&
+            GetVarint64(&input, &f.largest_seqno)) {
+          f.fd = FileDescriptor(number, path_id, file_size);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file3 entry";
+          }
+        }
+        break;
+      }
 
       case kColumnFamily:
         if (!GetVarint32(&input, &column_family_)) {
@@ -336,9 +379,9 @@ std::string VersionEdit::DebugString(bool hex_key) const {
     r.append("\n  AddFile: ");
     AppendNumberTo(&r, new_files_[i].first);
     r.append(" ");
-    AppendNumberTo(&r, f.number);
+    AppendNumberTo(&r, f.fd.GetNumber());
     r.append(" ");
-    AppendNumberTo(&r, f.file_size);
+    AppendNumberTo(&r, f.fd.GetFileSize());
     r.append(" ");
     r.append(f.smallest.DebugString(hex_key));
     r.append(" .. ");
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
index acaec8a..6da4f5b 100644
--- a/src/rocksdb/db/version_edit.h
+++ b/src/rocksdb/db/version_edit.h
@@ -8,42 +8,127 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <algorithm>
 #include <set>
 #include <utility>
 #include <vector>
 #include <string>
 #include "rocksdb/cache.h"
 #include "db/dbformat.h"
+#include "util/arena.h"
+#include "util/autovector.h"
 
 namespace rocksdb {
 
 class VersionSet;
 
+const uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
+
+// A copyable structure contains information needed to read data from an SST
+// file. It can contains a pointer to a table reader opened for the file, or
+// file number and size, which can be used to create a new table reader for it.
+// The behavior is undefined when a copied of the structure is used when the
+// file is not in any live version any more.
+struct FileDescriptor {
+  // Table reader in table_reader_handle
+  TableReader* table_reader;
+  uint64_t packed_number_and_path_id;
+  uint64_t file_size;  // File size in bytes
+
+  FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+      : table_reader(nullptr),
+        packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+        file_size(_file_size) {}
+
+  FileDescriptor& operator=(const FileDescriptor& fd) {
+    table_reader = fd.table_reader;
+    packed_number_and_path_id = fd.packed_number_and_path_id;
+    file_size = fd.file_size;
+    return *this;
+  }
+
+  uint64_t GetNumber() const {
+    return packed_number_and_path_id & kFileNumberMask;
+  }
+  uint32_t GetPathId() const {
+    return packed_number_and_path_id / (kFileNumberMask + 1);
+  }
+  uint64_t GetFileSize() const { return file_size; }
+};
+
 struct FileMetaData {
   int refs;
-  int allowed_seeks;          // Seeks allowed until compaction
-  uint64_t number;
-  uint64_t file_size;         // File size in bytes
-  InternalKey smallest;       // Smallest internal key served by table
-  InternalKey largest;        // Largest internal key served by table
-  bool being_compacted;       // Is this file undergoing compaction?
-  SequenceNumber smallest_seqno;// The smallest seqno in this file
-  SequenceNumber largest_seqno; // The largest seqno in this file
+  FileDescriptor fd;
+  InternalKey smallest;            // Smallest internal key served by table
+  InternalKey largest;             // Largest internal key served by table
+  bool being_compacted;            // Is this file undergoing compaction?
+  SequenceNumber smallest_seqno;   // The smallest seqno in this file
+  SequenceNumber largest_seqno;    // The largest seqno in this file
 
   // Needs to be disposed when refs becomes 0.
   Cache::Handle* table_reader_handle;
-  // Table reader in table_reader_handle
-  TableReader* table_reader;
 
-  FileMetaData(uint64_t number, uint64_t file_size)
+  // Stats for compensating deletion entries during compaction
+
+  // File size compensated by deletion entry.
+  // This is updated in Version::UpdateAccumulatedStats() first time when the
+  // file is created or loaded.  After it is updated (!= 0), it is immutable.
+  uint64_t compensated_file_size;
+  // These values can mutate, but they can only be read or written from
+  // single-threaded LogAndApply thread
+  uint64_t num_entries;            // the number of entries.
+  uint64_t num_deletions;          // the number of deletion entries.
+  uint64_t raw_key_size;           // total uncompressed key size.
+  uint64_t raw_value_size;         // total uncompressed value size.
+  bool init_stats_from_file;   // true if the data-entry stats of this file
+                               // has initialized from file.
+
+  bool marked_for_compaction;  // True if client asked us nicely to compact this
+                               // file.
+
+  FileMetaData()
       : refs(0),
-        allowed_seeks(1 << 30),
-        number(number),
-        file_size(file_size),
         being_compacted(false),
         table_reader_handle(nullptr),
-        table_reader(nullptr) {}
-  FileMetaData() : FileMetaData(0, 0) {}
+        compensated_file_size(0),
+        num_entries(0),
+        num_deletions(0),
+        raw_key_size(0),
+        raw_value_size(0),
+        init_stats_from_file(false),
+        marked_for_compaction(false) {}
+};
+
+// A compressed copy of file meta data that just contain
+// smallest and largest key's slice
+struct FdWithKeyRange {
+  FileDescriptor fd;
+  Slice smallest_key;    // slice that contain smallest key
+  Slice largest_key;     // slice that contain largest key
+
+  FdWithKeyRange()
+      : fd(),
+        smallest_key(),
+        largest_key() {
+  }
+
+  FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key)
+      : fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level
+// Actual data is guaranteed to be stored closely
+struct LevelFilesBrief {
+  size_t num_files;
+  FdWithKeyRange* files;
+  LevelFilesBrief() {
+    num_files = 0;
+    files = nullptr;
+  }
 };
 
 class VersionEdit {
@@ -81,16 +166,13 @@ class VersionEdit {
   // Add the specified file at the specified number.
   // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
   // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
-  void AddFile(int level, uint64_t file,
-               uint64_t file_size,
-               const InternalKey& smallest,
-               const InternalKey& largest,
-               const SequenceNumber& smallest_seqno,
+  void AddFile(int level, uint64_t file, uint32_t file_path_id,
+               uint64_t file_size, const InternalKey& smallest,
+               const InternalKey& largest, const SequenceNumber& smallest_seqno,
                const SequenceNumber& largest_seqno) {
     assert(smallest_seqno <= largest_seqno);
     FileMetaData f;
-    f.number = file;
-    f.file_size = file_size;
+    f.fd = FileDescriptor(file, file_path_id, file_size);
     f.smallest = smallest;
     f.largest = largest;
     f.smallest_seqno = smallest_seqno;
@@ -104,9 +186,7 @@ class VersionEdit {
   }
 
   // Number of edits
-  int NumEntries() {
-    return new_files_.size() + deleted_files_.size();
-  }
+  size_t NumEntries() { return new_files_.size() + deleted_files_.size(); }
 
   bool IsColumnFamilyManipulation() {
     return is_column_family_add_ || is_column_family_drop_;
@@ -133,15 +213,22 @@ class VersionEdit {
     is_column_family_drop_ = true;
   }
 
-  void EncodeTo(std::string* dst) const;
+  // return true on success.
+  bool EncodeTo(std::string* dst) const;
   Status DecodeFrom(const Slice& src);
 
+  typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
+
+  const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }
+  const std::vector<std::pair<int, FileMetaData>>& GetNewFiles() {
+    return new_files_;
+  }
+
   std::string DebugString(bool hex_key = false) const;
 
  private:
   friend class VersionSet;
-
-  typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
+  friend class Version;
 
   bool GetLevel(Slice* input, int* level, const char** msg);
 
diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc
index 7842b32..8b7b31b 100644
--- a/src/rocksdb/db/version_edit_test.cc
+++ b/src/rocksdb/db/version_edit_test.cc
@@ -22,19 +22,19 @@ static void TestEncodeDecode(const VersionEdit& edit) {
   ASSERT_EQ(encoded, encoded2);
 }
 
-class VersionEditTest { };
+class VersionEditTest : public testing::Test {};
 
-TEST(VersionEditTest, EncodeDecode) {
+TEST_F(VersionEditTest, EncodeDecode) {
   static const uint64_t kBig = 1ull << 50;
+  static const uint32_t kBig32Bit = 1ull << 30;
 
   VersionEdit edit;
   for (int i = 0; i < 4; i++) {
     TestEncodeDecode(edit);
-    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+    edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
                  InternalKey("foo", kBig + 500 + i, kTypeValue),
                  InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
-                 kBig + 500 + i,
-                 kBig + 600 + i);
+                 kBig + 500 + i, kBig + 600 + i);
     edit.DeleteFile(4, kBig + 700 + i);
   }
 
@@ -45,7 +45,17 @@ TEST(VersionEditTest, EncodeDecode) {
   TestEncodeDecode(edit);
 }
 
-TEST(VersionEditTest, ColumnFamilyTest) {
+TEST_F(VersionEditTest, EncodeEmptyFile) {
+  VersionEdit edit;
+  edit.AddFile(0, 0, 0, 0,
+               InternalKey(),
+               InternalKey(),
+               0, 0);
+  std::string buffer;
+  ASSERT_TRUE(!edit.EncodeTo(&buffer));
+}
+
+TEST_F(VersionEditTest, ColumnFamilyTest) {
   VersionEdit edit;
   edit.SetColumnFamily(2);
   edit.AddColumnFamily("column_family");
@@ -61,5 +71,6 @@ TEST(VersionEditTest, ColumnFamilyTest) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc
index 00d9caf..7cf010a 100644
--- a/src/rocksdb/db/version_set.cc
+++ b/src/rocksdb/db/version_set.cc
@@ -7,16 +7,21 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#define __STDC_FORMAT_MACROS
 #include "db/version_set.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
+#include <stdio.h>
 #include <algorithm>
 #include <map>
 #include <set>
 #include <climits>
 #include <unordered_map>
-#include <stdio.h>
+#include <vector>
+#include <string>
 
 #include "db/filename.h"
 #include "db/log_reader.h"
@@ -25,6 +30,8 @@
 #include "db/merge_context.h"
 #include "db/table_cache.h"
 #include "db/compaction.h"
+#include "db/version_builder.h"
+#include "db/writebuffer.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
 #include "table/table_reader.h"
@@ -33,20 +40,264 @@
 #include "table/format.h"
 #include "table/plain_table_factory.h"
 #include "table/meta_blocks.h"
+#include "table/get_context.h"
 #include "util/coding.h"
 #include "util/logging.h"
 #include "util/stop_watch.h"
 
 namespace rocksdb {
 
-static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
-  uint64_t sum = 0;
-  for (size_t i = 0; i < files.size() && files[i]; i++) {
-    sum += files[i]->file_size;
+namespace {
+
+// Find File in LevelFilesBrief data structure
+// Within an index range defined by left and right
+int FindFileInRange(const InternalKeyComparator& icmp,
+    const LevelFilesBrief& file_level,
+    const Slice& key,
+    uint32_t left,
+    uint32_t right) {
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FdWithKeyRange& f = file_level.files[mid];
+    if (icmp.InternalKeyComparator::Compare(f.largest_key, key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
   }
-  return sum;
+  return right;
 }
 
+// Class to help choose the next file to search for the particular key.
+// Searches and returns files level by level.
+// We can search level-by-level since entries never hop across
+// levels. Therefore we are guaranteed that if we find data
+// in a smaller level, later levels are irrelevant (unless we
+// are MergeInProgress).
+class FilePicker {
+ public:
+  FilePicker(
+      std::vector<FileMetaData*>* files,
+      const Slice& user_key,
+      const Slice& ikey,
+      autovector<LevelFilesBrief>* file_levels,
+      unsigned int num_levels,
+      FileIndexer* file_indexer,
+      const Comparator* user_comparator,
+      const InternalKeyComparator* internal_comparator)
+      : num_levels_(num_levels),
+        curr_level_(-1),
+        hit_file_level_(-1),
+        search_left_bound_(0),
+        search_right_bound_(FileIndexer::kLevelMaxIndex),
+#ifndef NDEBUG
+        files_(files),
+#endif
+        level_files_brief_(file_levels),
+        user_key_(user_key),
+        ikey_(ikey),
+        file_indexer_(file_indexer),
+        user_comparator_(user_comparator),
+        internal_comparator_(internal_comparator) {
+    // Setup member variables to search first level.
+    search_ended_ = !PrepareNextLevel();
+    if (!search_ended_) {
+      // Prefetch Level 0 table data to avoid cache miss if possible.
+      for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+        auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+        if (r) {
+          r->Prepare(ikey);
+        }
+      }
+    }
+  }
+
+  FdWithKeyRange* GetNextFile() {
+    while (!search_ended_) {  // Loops over different levels.
+      while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
+        // Loops over all files in current level.
+        FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_];
+        hit_file_level_ = curr_level_;
+        int cmp_largest = -1;
+
+        // Do key range filtering of files or/and fractional cascading if:
+        // (1) not all the files are in level 0, or
+        // (2) there are more than 3 Level 0 files
+        // If there are only 3 or less level 0 files in the system, we skip
+        // the key range filtering. In this case, more likely, the system is
+        // highly tuned to minimize number of tables queried by each query,
+        // so it is unlikely that key range filtering is more efficient than
+        // querying the files.
+        if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+          // Check if key is within a file's range. If search left bound and
+          // right bound point to the same find, we are sure key falls in
+          // range.
+          assert(
+              curr_level_ == 0 ||
+              curr_index_in_curr_level_ == start_index_in_curr_level_ ||
+              user_comparator_->Compare(user_key_,
+                ExtractUserKey(f->smallest_key)) <= 0);
+
+          int cmp_smallest = user_comparator_->Compare(user_key_,
+              ExtractUserKey(f->smallest_key));
+          if (cmp_smallest >= 0) {
+            cmp_largest = user_comparator_->Compare(user_key_,
+                ExtractUserKey(f->largest_key));
+          }
+
+          // Setup file search bound for the next level based on the
+          // comparison results
+          if (curr_level_ > 0) {
+            file_indexer_->GetNextLevelIndex(curr_level_,
+                                            curr_index_in_curr_level_,
+                                            cmp_smallest, cmp_largest,
+                                            &search_left_bound_,
+                                            &search_right_bound_);
+          }
+          // Key falls out of current file's range
+          if (cmp_smallest < 0 || cmp_largest > 0) {
+            if (curr_level_ == 0) {
+              ++curr_index_in_curr_level_;
+              continue;
+            } else {
+              // Search next level.
+              break;
+            }
+          }
+        }
+#ifndef NDEBUG
+        // Sanity check to make sure that the files are correctly sorted
+        if (prev_file_) {
+          if (curr_level_ != 0) {
+            int comp_sign = internal_comparator_->Compare(
+                prev_file_->largest_key, f->smallest_key);
+            assert(comp_sign < 0);
+          } else {
+            // level == 0, the current file cannot be newer than the previous
+            // one. Use compressed data structure, has no attribute seqNo
+            assert(curr_index_in_curr_level_ > 0);
+            assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_],
+                  files_[0][curr_index_in_curr_level_-1]));
+          }
+        }
+        prev_file_ = f;
+#endif
+        if (curr_level_ > 0 && cmp_largest < 0) {
+          // No more files to search in this level.
+          search_ended_ = !PrepareNextLevel();
+        } else {
+          ++curr_index_in_curr_level_;
+        }
+        return f;
+      }
+      // Start searching next level.
+      search_ended_ = !PrepareNextLevel();
+    }
+    // Search ended.
+    return nullptr;
+  }
+
+  // getter for current file level
+  // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+  unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ private:
+  unsigned int num_levels_;
+  unsigned int curr_level_;
+  unsigned int hit_file_level_;
+  int32_t search_left_bound_;
+  int32_t search_right_bound_;
+#ifndef NDEBUG
+  std::vector<FileMetaData*>* files_;
+#endif
+  autovector<LevelFilesBrief>* level_files_brief_;
+  bool search_ended_;
+  LevelFilesBrief* curr_file_level_;
+  unsigned int curr_index_in_curr_level_;
+  unsigned int start_index_in_curr_level_;
+  Slice user_key_;
+  Slice ikey_;
+  FileIndexer* file_indexer_;
+  const Comparator* user_comparator_;
+  const InternalKeyComparator* internal_comparator_;
+#ifndef NDEBUG
+  FdWithKeyRange* prev_file_;
+#endif
+
+  // Setup local variables to search next level.
+  // Returns false if there are no more levels to search.
+  bool PrepareNextLevel() {
+    curr_level_++;
+    while (curr_level_ < num_levels_) {
+      curr_file_level_ = &(*level_files_brief_)[curr_level_];
+      if (curr_file_level_->num_files == 0) {
+        // When current level is empty, the search bound generated from upper
+        // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+        // also empty.
+        assert(search_left_bound_ == 0);
+        assert(search_right_bound_ == -1 ||
+               search_right_bound_ == FileIndexer::kLevelMaxIndex);
+        // Since current level is empty, it will need to search all files in
+        // the next level
+        search_left_bound_ = 0;
+        search_right_bound_ = FileIndexer::kLevelMaxIndex;
+        curr_level_++;
+        continue;
+      }
+
+      // Some files may overlap each other. We find
+      // all files that overlap user_key and process them in order from
+      // newest to oldest. In the context of merge-operator, this can occur at
+      // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+      // are always compacted into a single entry).
+      int32_t start_index;
+      if (curr_level_ == 0) {
+        // On Level-0, we read through all files to check for overlap.
+        start_index = 0;
+      } else {
+        // On Level-n (n>=1), files are sorted. Binary search to find the
+        // earliest file whose largest key >= ikey. Search left bound and
+        // right bound are used to narrow the range.
+        if (search_left_bound_ == search_right_bound_) {
+          start_index = search_left_bound_;
+        } else if (search_left_bound_ < search_right_bound_) {
+          if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
+            search_right_bound_ =
+                static_cast<int32_t>(curr_file_level_->num_files) - 1;
+          }
+          start_index =
+              FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
+                              static_cast<uint32_t>(search_left_bound_),
+                              static_cast<uint32_t>(search_right_bound_));
+        } else {
+          // search_left_bound > search_right_bound, key does not exist in
+          // this level. Since no comparison is done in this level, it will
+          // need to search all files in the next level.
+          search_left_bound_ = 0;
+          search_right_bound_ = FileIndexer::kLevelMaxIndex;
+          curr_level_++;
+          continue;
+        }
+      }
+      start_index_in_curr_level_ = start_index;
+      curr_index_in_curr_level_ = start_index;
+#ifndef NDEBUG
+      prev_file_ = nullptr;
+#endif
+      return true;
+    }
+    // curr_level_ = num_levels_. So, no more levels to search.
+    return false;
+  }
+};
+}  // anonymous namespace
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
 Version::~Version() {
   assert(refs_ == 0);
 
@@ -55,9 +306,9 @@ Version::~Version() {
   next_->prev_ = prev_;
 
   // Drop references to files
-  for (int level = 0; level < num_levels_; level++) {
-    for (size_t i = 0; i < files_[level].size(); i++) {
-      FileMetaData* f = files_[level][i];
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+      FileMetaData* f = storage_info_.files_[level][i];
       assert(f->refs > 0);
       f->refs--;
       if (f->refs <= 0) {
@@ -69,61 +320,69 @@ Version::~Version() {
       }
     }
   }
-  delete[] files_;
-}
-
-int FindFileInRange(const InternalKeyComparator& icmp,
-    const std::vector<FileMetaData*>& files,
-    const Slice& key,
-    uint32_t left,
-    uint32_t right) {
-  while (left < right) {
-    uint32_t mid = (left + right) / 2;
-    const FileMetaData* f = files[mid];
-    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
-      // Key at "mid.largest" is < "target".  Therefore all
-      // files at or before "mid" are uninteresting.
-      left = mid + 1;
-    } else {
-      // Key at "mid.largest" is >= "target".  Therefore all files
-      // after "mid" are uninteresting.
-      right = mid;
-    }
-  }
-  return right;
 }
 
 int FindFile(const InternalKeyComparator& icmp,
-             const std::vector<FileMetaData*>& files,
+             const LevelFilesBrief& file_level,
              const Slice& key) {
-  return FindFileInRange(icmp, files, key, 0, files.size());
+  return FindFileInRange(icmp, file_level, key, 0,
+                         static_cast<uint32_t>(file_level.num_files));
+}
+
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+        const std::vector<FileMetaData*>& files,
+        Arena* arena) {
+  assert(file_level);
+  assert(arena);
+
+  size_t num = files.size();
+  file_level->num_files = num;
+  char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+  file_level->files = new (mem)FdWithKeyRange[num];
+
+  for (size_t i = 0; i < num; i++) {
+    Slice smallest_key = files[i]->smallest.Encode();
+    Slice largest_key = files[i]->largest.Encode();
+
+    // Copy key slice to sequential memory
+    size_t smallest_size = smallest_key.size();
+    size_t largest_size = largest_key.size();
+    mem = arena->AllocateAligned(smallest_size + largest_size);
+    memcpy(mem, smallest_key.data(), smallest_size);
+    memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+    FdWithKeyRange& f = file_level->files[i];
+    f.fd = files[i]->fd;
+    f.smallest_key = Slice(mem, smallest_size);
+    f.largest_key = Slice(mem + smallest_size, largest_size);
+  }
 }
 
 static bool AfterFile(const Comparator* ucmp,
-                      const Slice* user_key, const FileMetaData* f) {
+                      const Slice* user_key, const FdWithKeyRange* f) {
   // nullptr user_key occurs before all keys and is therefore never after *f
   return (user_key != nullptr &&
-          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
+          ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0);
 }
 
 static bool BeforeFile(const Comparator* ucmp,
-                       const Slice* user_key, const FileMetaData* f) {
+                       const Slice* user_key, const FdWithKeyRange* f) {
   // nullptr user_key occurs after all keys and is therefore never before *f
   return (user_key != nullptr &&
-          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
+          ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0);
 }
 
 bool SomeFileOverlapsRange(
     const InternalKeyComparator& icmp,
     bool disjoint_sorted_files,
-    const std::vector<FileMetaData*>& files,
+    const LevelFilesBrief& file_level,
     const Slice* smallest_user_key,
     const Slice* largest_user_key) {
   const Comparator* ucmp = icmp.user_comparator();
   if (!disjoint_sorted_files) {
     // Need to check against all files
-    for (size_t i = 0; i < files.size(); i++) {
-      const FileMetaData* f = files[i];
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      const FdWithKeyRange* f = &(file_level.files[i]);
       if (AfterFile(ucmp, smallest_user_key, f) ||
           BeforeFile(ucmp, largest_user_key, f)) {
         // No overlap
@@ -138,87 +397,79 @@ bool SomeFileOverlapsRange(
   uint32_t index = 0;
   if (smallest_user_key != nullptr) {
     // Find the earliest possible internal key for smallest_user_key
-    InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
-    index = FindFile(icmp, files, small.Encode());
+    InternalKey small;
+    small.SetMaxPossibleForUserKey(*smallest_user_key);
+    index = FindFile(icmp, file_level, small.Encode());
   }
 
-  if (index >= files.size()) {
+  if (index >= file_level.num_files) {
     // beginning of range is after all files, so no overlap.
     return false;
   }
 
-  return !BeforeFile(ucmp, largest_user_key, files[index]);
+  return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
 }
 
 namespace {
-// Used for LevelFileNumIterator to pass "block handle" value,
-// which actually means file information in this iterator.
-// It contains subset of fields of FileMetaData, that is sufficient
-// for table cache to use.
-struct EncodedFileMetaData {
-  uint64_t number;   // file number
-  uint64_t file_size;   // file size
-  TableReader* table_reader;   // cached table reader
-};
-}  // namespace
 
 // An internal iterator.  For a given version/level pair, yields
 // information about the files in the level.  For a given entry, key()
 // is the largest key that occurs in the file, and value() is an
 // 16-byte value containing the file number and file size, both
 // encoded using EncodeFixed64.
-class Version::LevelFileNumIterator : public Iterator {
+class LevelFileNumIterator : public Iterator {
  public:
   LevelFileNumIterator(const InternalKeyComparator& icmp,
-                       const std::vector<FileMetaData*>* flist)
+                       const LevelFilesBrief* flevel)
       : icmp_(icmp),
-        flist_(flist),
-        index_(flist->size()) {        // Marks as invalid
-  }
-  virtual bool Valid() const {
-    return index_ < flist_->size();
+        flevel_(flevel),
+        index_(static_cast<uint32_t>(flevel->num_files)),
+        current_value_(0, 0, 0) {  // Marks as invalid
   }
-  virtual void Seek(const Slice& target) {
-    index_ = FindFile(icmp_, *flist_, target);
+  virtual bool Valid() const override { return index_ < flevel_->num_files; }
+  virtual void Seek(const Slice& target) override {
+    index_ = FindFile(icmp_, *flevel_, target);
   }
-  virtual void SeekToFirst() { index_ = 0; }
-  virtual void SeekToLast() {
-    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  virtual void SeekToFirst() override { index_ = 0; }
+  virtual void SeekToLast() override {
+    index_ = (flevel_->num_files == 0)
+                 ? 0
+                 : static_cast<uint32_t>(flevel_->num_files) - 1;
   }
-  virtual void Next() {
+  virtual void Next() override {
     assert(Valid());
     index_++;
   }
-  virtual void Prev() {
+  virtual void Prev() override {
     assert(Valid());
     if (index_ == 0) {
-      index_ = flist_->size();  // Marks as invalid
+      index_ = static_cast<uint32_t>(flevel_->num_files);  // Marks as invalid
     } else {
       index_--;
     }
   }
-  Slice key() const {
+  Slice key() const override {
     assert(Valid());
-    return (*flist_)[index_]->largest.Encode();
+    return flevel_->files[index_].largest_key;
   }
-  Slice value() const {
+  Slice value() const override {
     assert(Valid());
-    auto* file_meta = (*flist_)[index_];
-    current_value_.number = file_meta->number;
-    current_value_.file_size = file_meta->file_size;
-    current_value_.table_reader = file_meta->table_reader;
+
+    auto file_meta = flevel_->files[index_];
+    current_value_ = file_meta.fd;
     return Slice(reinterpret_cast<const char*>(&current_value_),
-                 sizeof(EncodedFileMetaData));
+                 sizeof(FileDescriptor));
   }
-  virtual Status status() const { return Status::OK(); }
+  virtual Status status() const override { return Status::OK(); }
+
  private:
   const InternalKeyComparator icmp_;
-  const std::vector<FileMetaData*>* const flist_;
+  const LevelFilesBrief* flevel_;
   uint32_t index_;
-  mutable EncodedFileMetaData current_value_;
+  mutable FileDescriptor current_value_;
 };
 
-class Version::LevelFileIteratorState : public TwoLevelIteratorState {
+class LevelFileIteratorState : public TwoLevelIteratorState {
  public:
   LevelFileIteratorState(TableCache* table_cache,
     const ReadOptions& read_options, const EnvOptions& env_options,
@@ -230,17 +481,15 @@ class Version::LevelFileIteratorState : public TwoLevelIteratorState {
       for_compaction_(for_compaction) {}
 
   Iterator* NewSecondaryIterator(const Slice& meta_handle) override {
-    if (meta_handle.size() != sizeof(EncodedFileMetaData)) {
+    if (meta_handle.size() != sizeof(FileDescriptor)) {
       return NewErrorIterator(
           Status::Corruption("FileReader invoked with unexpected value"));
     } else {
-      const EncodedFileMetaData* encoded_meta =
-          reinterpret_cast<const EncodedFileMetaData*>(meta_handle.data());
-      FileMetaData meta(encoded_meta->number, encoded_meta->file_size);
-      meta.table_reader = encoded_meta->table_reader;
-      return table_cache_->NewIterator(read_options_, env_options_,
-          icomparator_, meta, nullptr /* don't need reference to table*/,
-          for_compaction_);
+      const FileDescriptor* fd =
+          reinterpret_cast<const FileDescriptor*>(meta_handle.data());
+      return table_cache_->NewIterator(
+          read_options_, env_options_, icomparator_, *fd,
+          nullptr /* don't need reference to table*/, for_compaction_);
     }
   }
 
@@ -256,430 +505,345 @@ class Version::LevelFileIteratorState : public TwoLevelIteratorState {
   bool for_compaction_;
 };
 
-Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+// A wrapper of version builder which references the current version in
+// constructor and unref it in the destructor.
+// Both of the constructor and destructor need to be called inside DB Mutex.
+class BaseReferencedVersionBuilder {
+ public:
+  explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd)
+      : version_builder_(new VersionBuilder(
+            cfd->current()->version_set()->env_options(), cfd->table_cache(),
+            cfd->current()->storage_info())),
+        version_(cfd->current()) {
+    version_->Ref();
+  }
+  ~BaseReferencedVersionBuilder() {
+    delete version_builder_;
+    version_->Unref();
+  }
+  VersionBuilder* version_builder() { return version_builder_; }
+
+ private:
+  VersionBuilder* version_builder_;
+  Version* version_;
+};
+}  // anonymous namespace
+
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                                   const FileMetaData* file_meta,
+                                   const std::string* fname) {
   auto table_cache = cfd_->table_cache();
-  auto options = cfd_->options();
-  for (int level = 0; level < num_levels_; level++) {
-    for (const auto& file_meta : files_[level]) {
-      auto fname = TableFileName(vset_->dbname_, file_meta->number);
+  auto ioptions = cfd_->ioptions();
+  Status s = table_cache->GetTableProperties(
+      vset_->env_options_, cfd_->internal_comparator(), file_meta->fd,
+      tp, true /* no io */);
+  if (s.ok()) {
+    return s;
+  }
+
+  // We only ignore error type `Incomplete` since it's by design that we
+  // disallow table when it's not in table cache.
+  if (!s.IsIncomplete()) {
+    return s;
+  }
+
+  // 2. Table is not present in table cache, we'll read the table properties
+  // directly from the properties block in the file.
+  std::unique_ptr<RandomAccessFile> file;
+  if (fname != nullptr) {
+    s = ioptions->env->NewRandomAccessFile(
+        *fname, &file, vset_->env_options_);
+  } else {
+    s = ioptions->env->NewRandomAccessFile(
+        TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
+                      file_meta->fd.GetPathId()),
+        &file, vset_->env_options_);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  TableProperties* raw_table_properties;
+  // By setting the magic number to kInvalidTableMagicNumber, we can by
+  // pass the magic number check in the footer.
+  s = ReadTableProperties(
+      file.get(), file_meta->fd.GetFileSize(),
+      Footer::kInvalidTableMagicNumber /* table's magic number */,
+      vset_->env_, ioptions->info_log, &raw_table_properties);
+  if (!s.ok()) {
+    return s;
+  }
+  RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+
+  *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
+  return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.files_[level]) {
+      auto fname =
+          TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
+                        file_meta->fd.GetPathId());
       // 1. If the table is already present in table cache, load table
       // properties from there.
       std::shared_ptr<const TableProperties> table_properties;
-      Status s = table_cache->GetTableProperties(
-          vset_->storage_options_, cfd_->internal_comparator(), *file_meta,
-          &table_properties, true /* no io */);
+      Status s = GetTableProperties(&table_properties, file_meta, &fname);
       if (s.ok()) {
         props->insert({fname, table_properties});
-        continue;
-      }
-
-      // We only ignore error type `Incomplete` since it's by design that we
-      // disallow table when it's not in table cache.
-      if (!s.IsIncomplete()) {
-        return s;
-      }
-
-      // 2. Table is not present in table cache, we'll read the table properties
-      // directly from the properties block in the file.
-      std::unique_ptr<RandomAccessFile> file;
-      s = options->env->NewRandomAccessFile(fname, &file,
-                                            vset_->storage_options_);
-      if (!s.ok()) {
-        return s;
-      }
-
-      TableProperties* raw_table_properties;
-      // By setting the magic number to kInvalidTableMagicNumber, we can by
-      // pass the magic number check in the footer.
-      s = ReadTableProperties(
-          file.get(), file_meta->file_size,
-          Footer::kInvalidTableMagicNumber /* table's magic number */,
-          vset_->env_, options->info_log.get(), &raw_table_properties);
-      if (!s.ok()) {
+      } else {
         return s;
       }
-      RecordTick(options->statistics.get(),
-                 NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
-
-      props->insert({fname, std::shared_ptr<const TableProperties>(
-                                raw_table_properties)});
     }
   }
 
   return Status::OK();
 }
 
-void Version::AddIterators(const ReadOptions& read_options,
-                           const EnvOptions& soptions,
-                           std::vector<Iterator*>* iters) {
-  // Merge all level zero files together since they may overlap
-  for (const FileMetaData* file : files_[0]) {
-    iters->push_back(cfd_->table_cache()->NewIterator(
-        read_options, soptions, cfd_->internal_comparator(), *file));
-  }
-
-  // For levels > 0, we can use a concatenating iterator that sequentially
-  // walks through the non-overlapping files in the level, opening them
-  // lazily.
-  for (int level = 1; level < num_levels_; level++) {
-    if (!files_[level].empty()) {
-      iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState(
-          cfd_->table_cache(), read_options, soptions,
-          cfd_->internal_comparator(), false /* for_compaction */,
-          cfd_->options()->prefix_extractor != nullptr),
-        new LevelFileNumIterator(cfd_->internal_comparator(), &files_[level])));
+size_t Version::GetMemoryUsageByTableReaders() {
+  size_t total_usage = 0;
+  for (auto& file_level : storage_info_.level_files_brief_) {
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+          vset_->env_options_, cfd_->internal_comparator(),
+          file_level.files[i].fd);
     }
   }
+  return total_usage;
 }
 
-// Callback from TableCache::Get()
-namespace {
-enum SaverState {
-  kNotFound,
-  kFound,
-  kDeleted,
-  kCorrupt,
-  kMerge // saver contains the current merge result (the operands)
-};
-struct Saver {
-  SaverState state;
-  const Comparator* ucmp;
-  Slice user_key;
-  bool* value_found; // Is value set correctly? Used by KeyMayExist
-  std::string* value;
-  const MergeOperator* merge_operator;
-  // the merge operations encountered;
-  MergeContext* merge_context;
-  Logger* logger;
-  bool didIO;    // did we do any disk io?
-  Statistics* statistics;
-};
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+  assert(cf_meta);
+  assert(cfd_);
+
+  cf_meta->name = cfd_->GetName();
+  cf_meta->size = 0;
+  cf_meta->file_count = 0;
+  cf_meta->levels.clear();
+
+  auto* ioptions = cfd_->ioptions();
+  auto* vstorage = storage_info();
+
+  for (int level = 0; level < cfd_->NumberLevels(); level++) {
+    uint64_t level_size = 0;
+    cf_meta->file_count += vstorage->LevelFiles(level).size();
+    std::vector<SstFileMetaData> files;
+    for (const auto& file : vstorage->LevelFiles(level)) {
+      uint32_t path_id = file->fd.GetPathId();
+      std::string file_path;
+      if (path_id < ioptions->db_paths.size()) {
+        file_path = ioptions->db_paths[path_id].path;
+      } else {
+        assert(!ioptions->db_paths.empty());
+        file_path = ioptions->db_paths.back().path;
+      }
+      files.emplace_back(
+          MakeTableFileName("", file->fd.GetNumber()),
+          file_path,
+          file->fd.GetFileSize(),
+          file->smallest_seqno,
+          file->largest_seqno,
+          file->smallest.user_key().ToString(),
+          file->largest.user_key().ToString(),
+          file->being_compacted);
+      level_size += file->fd.GetFileSize();
+    }
+    cf_meta->levels.emplace_back(
+        level, level_size, std::move(files));
+    cf_meta->size += level_size;
+  }
 }
 
-// Called from TableCache::Get and Table::Get when file/block in which
-// key may  exist are not there in TableCache/BlockCache respectively. In this
-// case we  can't guarantee that key does not exist and are not permitted to do
-// IO to be  certain.Set the status=kFound and value_found=false to let the
-// caller know that key may exist but is not there in memory
-static void MarkKeyMayExist(void* arg) {
-  Saver* s = reinterpret_cast<Saver*>(arg);
-  s->state = kFound;
-  if (s->value_found != nullptr) {
-    *(s->value_found) = false;
-  }
-}
-
-static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
-                      const Slice& v, bool didIO) {
-  Saver* s = reinterpret_cast<Saver*>(arg);
-  MergeContext* merge_contex = s->merge_context;
-  std::string merge_result;  // temporary area for merge results later
-
-  assert(s != nullptr && merge_contex != nullptr);
-
-  // TODO: didIO and Merge?
-  s->didIO = didIO;
-  if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
-    // Key matches. Process it
-    switch (parsed_key.type) {
-      case kTypeValue:
-        if (kNotFound == s->state) {
-          s->state = kFound;
-          s->value->assign(v.data(), v.size());
-        } else if (kMerge == s->state) {
-          assert(s->merge_operator != nullptr);
-          s->state = kFound;
-          if (!s->merge_operator->FullMerge(s->user_key, &v,
-                                            merge_contex->GetOperands(),
-                                            s->value, s->logger)) {
-            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
-            s->state = kCorrupt;
-          }
-        } else {
-          assert(false);
-        }
-        return false;
-
-      case kTypeDeletion:
-        if (kNotFound == s->state) {
-          s->state = kDeleted;
-        } else if (kMerge == s->state) {
-          s->state = kFound;
-          if (!s->merge_operator->FullMerge(s->user_key, nullptr,
-                                            merge_contex->GetOperands(),
-                                            s->value, s->logger)) {
-            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
-            s->state = kCorrupt;
-          }
-        } else {
-          assert(false);
-        }
-        return false;
 
-      case kTypeMerge:
-        assert(s->state == kNotFound || s->state == kMerge);
-        s->state = kMerge;
-        merge_contex->PushOperand(v);
-        return true;
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+  // Estimation will be inaccurate when:
+  // (1) there exist merge keys
+  // (2) keys are directly overwritten
+  // (3) deletion on non-existing keys
+  // (4) low number of samples
+  if (num_samples_ == 0) {
+    return 0;
+  }
 
-      default:
-        assert(false);
-        break;
-    }
+  if (accumulated_num_non_deletions_ <= accumulated_num_deletions_) {
+    return 0;
   }
 
-  // s->state could be Corrupt, merge or notfound
+  uint64_t est = accumulated_num_non_deletions_ - accumulated_num_deletions_;
 
-  return false;
-}
+  uint64_t file_count = 0;
+  for (int level = 0; level < num_levels_; ++level) {
+    file_count += files_[level].size();
+  }
 
-namespace {
-bool NewestFirst(FileMetaData* a, FileMetaData* b) {
-  return a->number > b->number;
+  if (num_samples_ < file_count) {
+    // casting to avoid overflowing
+    return (est * static_cast<double>(file_count) / num_samples_);
+  } else {
+    return est;
+  }
 }
-bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
-  if (a->smallest_seqno != b->smallest_seqno) {
-    return a->smallest_seqno > b->smallest_seqno;
+
+void Version::AddIterators(const ReadOptions& read_options,
+                           const EnvOptions& soptions,
+                           MergeIteratorBuilder* merge_iter_builder) {
+  assert(storage_info_.finalized_);
+
+  if (storage_info_.num_non_empty_levels() == 0) {
+    // No file in the Version.
+    return;
   }
-  if (a->largest_seqno != b->largest_seqno) {
-    return a->largest_seqno > b->largest_seqno;
+
+  // Merge all level zero files together since they may overlap
+  for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+    const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+    merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
+        read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr,
+        false, merge_iter_builder->GetArena()));
   }
-  // Break ties by file number
-  return NewestFirst(a, b);
-}
-bool BySmallestKey(FileMetaData* a, FileMetaData* b,
-                   const InternalKeyComparator* cmp) {
-  int r = cmp->Compare(a->smallest, b->smallest);
-  if (r != 0) {
-    return (r < 0);
+
+  // For levels > 0, we can use a concatenating iterator that sequentially
+  // walks through the non-overlapping files in the level, opening them
+  // lazily.
+  for (int level = 1; level < storage_info_.num_non_empty_levels(); level++) {
+    if (storage_info_.LevelFilesBrief(level).num_files != 0) {
+      merge_iter_builder->AddIterator(NewTwoLevelIterator(
+          new LevelFileIteratorState(
+              cfd_->table_cache(), read_options, soptions,
+              cfd_->internal_comparator(), false /* for_compaction */,
+              cfd_->ioptions()->prefix_extractor != nullptr),
+          new LevelFileNumIterator(cfd_->internal_comparator(),
+                                   &storage_info_.LevelFilesBrief(level)),
+          merge_iter_builder->GetArena()));
+    }
   }
-  // Break ties by file number
-  return (a->number < b->number);
 }
-}  // anonymous namespace
 
-Version::Version(ColumnFamilyData* cfd, VersionSet* vset,
-                 uint64_t version_number)
-    : cfd_(cfd),
-      internal_comparator_((cfd == nullptr) ? nullptr
-                                            : &cfd->internal_comparator()),
-      user_comparator_((cfd == nullptr)
-                           ? nullptr
-                           : internal_comparator_->user_comparator()),
-      table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()),
-      merge_operator_((cfd == nullptr) ? nullptr
-                                       : cfd->options()->merge_operator.get()),
-      info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()),
-      db_statistics_((cfd == nullptr) ? nullptr
-                                      : cfd->options()->statistics.get()),
-      vset_(vset),
-      next_(this),
-      prev_(this),
-      refs_(0),
+VersionStorageInfo::VersionStorageInfo(
+    const InternalKeyComparator* internal_comparator,
+    const Comparator* user_comparator, int levels,
+    CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage)
+    : internal_comparator_(internal_comparator),
+      user_comparator_(user_comparator),
       // cfd is nullptr if Version is dummy
-      num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()),
+      num_levels_(levels),
+      num_non_empty_levels_(0),
+      file_indexer_(user_comparator),
+      compaction_style_(compaction_style),
       files_(new std::vector<FileMetaData*>[num_levels_]),
+      base_level_(num_levels_ == 1 ? -1 : 1),
       files_by_size_(num_levels_),
       next_file_to_compact_by_size_(num_levels_),
-      file_to_compact_(nullptr),
-      file_to_compact_level_(-1),
       compaction_score_(num_levels_),
       compaction_level_(num_levels_),
-      version_number_(version_number),
-      file_indexer_(num_levels_, cfd == nullptr ?  nullptr
-          : cfd->internal_comparator().user_comparator()) {
+      l0_delay_trigger_count_(0),
+      accumulated_file_size_(0),
+      accumulated_raw_key_size_(0),
+      accumulated_raw_value_size_(0),
+      accumulated_num_non_deletions_(0),
+      accumulated_num_deletions_(0),
+      num_samples_(0),
+      finalized_(false) {
+  if (ref_vstorage != nullptr) {
+    accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+    accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+    accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+    accumulated_num_non_deletions_ =
+        ref_vstorage->accumulated_num_non_deletions_;
+    accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+    num_samples_ = ref_vstorage->num_samples_;
+  }
 }
 
-void Version::Get(const ReadOptions& options,
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+                 uint64_t version_number)
+    : env_(vset->env_),
+      cfd_(column_family_data),
+      info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log),
+      db_statistics_((cfd_ == nullptr) ? nullptr
+                                       : cfd_->ioptions()->statistics),
+      table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+      merge_operator_((cfd_ == nullptr) ? nullptr
+                                        : cfd_->ioptions()->merge_operator),
+      storage_info_((cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+                    (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+                    cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+                    cfd_ == nullptr ? kCompactionStyleLevel
+                                    : cfd_->ioptions()->compaction_style,
+                    (cfd_ == nullptr || cfd_->current() == nullptr)
+                        ? nullptr
+                        : cfd_->current()->storage_info()),
+      vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      version_number_(version_number) {}
+
+void Version::Get(const ReadOptions& read_options,
                   const LookupKey& k,
                   std::string* value,
                   Status* status,
                   MergeContext* merge_context,
-                  GetStats* stats,
                   bool* value_found) {
   Slice ikey = k.internal_key();
   Slice user_key = k.user_key();
 
   assert(status->ok() || status->IsMergeInProgress());
-  Saver saver;
-  saver.state = status->ok()? kNotFound : kMerge;
-  saver.ucmp = user_comparator_;
-  saver.user_key = user_key;
-  saver.value_found = value_found;
-  saver.value = value;
-  saver.merge_operator = merge_operator_;
-  saver.merge_context = merge_context;
-  saver.logger = info_log_;
-  saver.didIO = false;
-  saver.statistics = db_statistics_;
-
-  stats->seek_file = nullptr;
-  stats->seek_file_level = -1;
-  FileMetaData* last_file_read = nullptr;
-  int last_file_read_level = -1;
-
-  // We can search level-by-level since entries never hop across
-  // levels. Therefore we are guaranteed that if we find data
-  // in an smaller level, later levels are irrelevant (unless we
-  // are MergeInProgress).
-
-  int32_t search_left_bound = 0;
-  int32_t search_right_bound = FileIndexer::kLevelMaxIndex;
-  for (int level = 0; level < num_levels_; ++level) {
-    int num_files = files_[level].size();
-    if (num_files == 0) {
-      // When current level is empty, the search bound generated from upper
-      // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
-      // also empty.
-      assert(search_left_bound == 0);
-      assert(search_right_bound == -1 ||
-             search_right_bound == FileIndexer::kLevelMaxIndex);
-      // Since current level is empty, it will need to search all files in the
-      // next level
-      search_left_bound = 0;
-      search_right_bound = FileIndexer::kLevelMaxIndex;
-      continue;
-    }
-
-    // Get the list of files to search in this level
-    FileMetaData* const* files = &files_[level][0];
-
-    // Some files may overlap each other. We find
-    // all files that overlap user_key and process them in order from
-    // newest to oldest. In the context of merge-operator,
-    // this can occur at any level. Otherwise, it only occurs
-    // at Level-0 (since Put/Deletes are always compacted into a single entry).
-    int32_t start_index;
-    if (level == 0) {
-      // On Level-0, we read through all files to check for overlap.
-      start_index = 0;
-    } else {
-      // On Level-n (n>=1), files are sorted. Binary search to find the earliest
-      // file whose largest key >= ikey. Search left bound and right bound are
-      // used to narrow the range.
-      if (search_left_bound == search_right_bound) {
-        start_index = search_left_bound;
-      } else if (search_left_bound < search_right_bound) {
-        if (search_right_bound == FileIndexer::kLevelMaxIndex) {
-          search_right_bound = num_files - 1;
-        }
-        start_index = FindFileInRange(cfd_->internal_comparator(),
-            files_[level], ikey, search_left_bound, search_right_bound);
-      } else {
-        // search_left_bound > search_right_bound, key does not exist in this
-        // level. Since no comparision is done in this level, it will need to
-        // search all files in the next level.
-        search_left_bound = 0;
-        search_right_bound = FileIndexer::kLevelMaxIndex;
-        continue;
-      }
-    }
-    // Traverse each relevant file to find the desired key
-#ifndef NDEBUG
-    FileMetaData* prev_file = nullptr;
-#endif
-
-    for (int32_t i = start_index; i < num_files;) {
-      FileMetaData* f = files[i];
-      // Check if key is within a file's range. If search left bound and right
-      // bound point to the same find, we are sure key falls in range.
-      assert(level == 0 || i == start_index ||
-             user_comparator_->Compare(user_key, f->smallest.user_key()) <= 0);
-
-      int cmp_smallest = user_comparator_->Compare(user_key, f->smallest.user_key());
-      int cmp_largest = -1;
-      if (cmp_smallest >= 0) {
-        cmp_largest = user_comparator_->Compare(user_key, f->largest.user_key());
-      }
-
-      // Setup file search bound for the next level based on the comparison
-      // results
-      if (level > 0) {
-        file_indexer_.GetNextLevelIndex(level, i, cmp_smallest, cmp_largest,
-            &search_left_bound, &search_right_bound);
-      }
-      // Key falls out of current file's range
-      if (cmp_smallest < 0 || cmp_largest > 0) {
-        if (level == 0) {
-          ++i;
-          continue;
-        } else {
-          break;
-        }
-      }
 
-#ifndef NDEBUG
-      // Sanity check to make sure that the files are correctly sorted
-      if (prev_file) {
-        if (level != 0) {
-          int comp_sign =
-              internal_comparator_->Compare(prev_file->largest, f->smallest);
-          assert(comp_sign < 0);
-        } else {
-          // level == 0, the current file cannot be newer than the previous one.
-          if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
-            assert(!NewestFirstBySeqNo(f, prev_file));
-          } else {
-            assert(!NewestFirst(f, prev_file));
-          }
+  GetContext get_context(
+      user_comparator(), merge_operator_, info_log_, db_statistics_,
+      status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+      value, value_found, merge_context, this->env_);
+
+  FilePicker fp(
+      storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_,
+      storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_,
+      user_comparator(), internal_comparator());
+  FdWithKeyRange* f = fp.GetNextFile();
+  while (f != nullptr) {
+    *status = table_cache_->Get(read_options, *internal_comparator(), f->fd,
+                                ikey, &get_context);
+    // TODO: examine the behavior for corrupted key
+    if (!status->ok()) {
+      return;
+    }
+
+    switch (get_context.State()) {
+      case GetContext::kNotFound:
+        // Keep searching in other files
+        break;
+      case GetContext::kFound:
+        if (fp.GetHitFileLevel() == 0) {
+          RecordTick(db_statistics_, GET_HIT_L0);
+        } else if (fp.GetHitFileLevel() == 1) {
+          RecordTick(db_statistics_, GET_HIT_L1);
+        } else if (fp.GetHitFileLevel() >= 2) {
+          RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
         }
-      }
-      prev_file = f;
-#endif
-      bool tableIO = false;
-      *status = table_cache_->Get(options, *internal_comparator_, *f, ikey,
-                                  &saver, SaveValue, &tableIO, MarkKeyMayExist);
-      // TODO: examine the behavior for corrupted key
-      if (!status->ok()) {
         return;
-      }
-
-      if (last_file_read != nullptr && stats->seek_file == nullptr) {
-        // We have had more than one seek for this read.  Charge the 1st file.
-        stats->seek_file = last_file_read;
-        stats->seek_file_level = last_file_read_level;
-      }
-
-      // If we did any IO as part of the read, then we remember it because
-      // it is a possible candidate for seek-based compaction. saver.didIO
-      // is true if the block had to be read in from storage and was not
-      // pre-exisiting in the block cache. Also, if this file was not pre-
-      // existing in the table cache and had to be freshly opened that needed
-      // the index blocks to be read-in, then tableIO is true. One thing
-      // to note is that the index blocks are not part of the block cache.
-      if (saver.didIO || tableIO) {
-        last_file_read = f;
-        last_file_read_level = level;
-      }
-
-      switch (saver.state) {
-        case kNotFound:
-          break;      // Keep searching in other files
-        case kFound:
-          return;
-        case kDeleted:
-          *status = Status::NotFound();  // Use empty error message for speed
-          return;
-        case kCorrupt:
-          *status = Status::Corruption("corrupted key for ", user_key);
-          return;
-        case kMerge:
-          break;
-      }
-      if (level > 0 && cmp_largest < 0) {
+      case GetContext::kDeleted:
+        // Use empty error message for speed
+        *status = Status::NotFound();
+        return;
+      case GetContext::kCorrupt:
+        *status = Status::Corruption("corrupted key for ", user_key);
+        return;
+      case GetContext::kMerge:
         break;
-      } else {
-        ++i;
-      }
     }
+    f = fp.GetNextFile();
   }
 
-
-  if (kMerge == saver.state) {
+  if (GetContext::kMerge == get_context.State()) {
+    if (!merge_operator_) {
+      *status =  Status::InvalidArgument(
+          "merge_operator is not properly initialized.");
+      return;
+    }
     // merge_operands are in saver and we hit the beginning of the key history
     // do a final merge of nullptr and operands;
     if (merge_operator_->FullMerge(user_key, nullptr,
-                                   saver.merge_context->GetOperands(), value,
+                                   merge_context->GetOperands(), value,
                                    info_log_)) {
       *status = Status::OK();
     } else {
@@ -692,30 +856,145 @@ void Version::Get(const ReadOptions& options,
   }
 }
 
-bool Version::UpdateStats(const GetStats& stats) {
-  FileMetaData* f = stats.seek_file;
-  if (f != nullptr) {
-    f->allowed_seeks--;
-    if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) {
-      file_to_compact_ = f;
-      file_to_compact_level_ = stats.seek_file_level;
-      return true;
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+  level_files_brief_.resize(num_non_empty_levels_);
+  for (int level = 0; level < num_non_empty_levels_; level++) {
+    DoGenerateLevelFilesBrief(
+        &level_files_brief_[level], files_[level], &arena_);
+  }
+}
+
+void Version::PrepareApply(const MutableCFOptions& mutable_cf_options) {
+  UpdateAccumulatedStats();
+  storage_info_.UpdateNumNonEmptyLevels();
+  storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
+  storage_info_.UpdateFilesBySize();
+  storage_info_.GenerateFileIndexer();
+  storage_info_.GenerateLevelFilesBrief();
+}
+
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file ||
+      file_meta->compensated_file_size > 0) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  Status s = GetTableProperties(&tp, file_meta);
+  file_meta->init_stats_from_file = true;
+  if (!s.ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, vset_->db_options_->info_log,
+        "Unable to load table properties for file %" PRIu64 " --- %s\n",
+        file_meta->fd.GetNumber(), s.ToString().c_str());
+    return false;
+  }
+  if (tp.get() == nullptr) return false;
+  file_meta->num_entries = tp->num_entries;
+  file_meta->num_deletions = GetDeletedKeys(tp->user_collected_properties);
+  file_meta->raw_value_size = tp->raw_value_size;
+  file_meta->raw_key_size = tp->raw_key_size;
+
+  return true;
+}
+
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+  assert(file_meta->init_stats_from_file);
+  accumulated_file_size_ += file_meta->fd.GetFileSize();
+  accumulated_raw_key_size_ += file_meta->raw_key_size;
+  accumulated_raw_value_size_ += file_meta->raw_value_size;
+  accumulated_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  accumulated_num_deletions_ += file_meta->num_deletions;
+  num_samples_++;
+}
+
+void Version::UpdateAccumulatedStats() {
+  // maximum number of table properties loaded from files.
+  const int kMaxInitCount = 20;
+  int init_count = 0;
+  // here only the first kMaxInitCount files which haven't been
+  // initialized from file will be updated with num_deletions.
+  // The motivation here is to cap the maximum I/O per Version creation.
+  // The reason for choosing files from lower-level instead of higher-level
+  // is that such design is able to propagate the initialization from
+  // lower-level to higher-level:  When the num_deletions of lower-level
+  // files are updated, it will make the lower-level files have accurate
+  // compensated_file_size, making lower-level to higher-level compaction
+  // will be triggered, which creates higher-level files whose num_deletions
+  // will be updated here.
+  for (int level = 0;
+       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+       ++level) {
+    for (auto* file_meta : storage_info_.files_[level]) {
+      if (MaybeInitializeFileMetaData(file_meta)) {
+        // each FileMeta will be initialized only once.
+        storage_info_.UpdateAccumulatedStats(file_meta);
+        if (++init_count >= kMaxInitCount) {
+          break;
+        }
+      }
     }
   }
-  return false;
+  // In case all sampled-files contain only deletion entries, then we
+  // load the table-property of a file in higher-level to initialize
+  // that value.
+  for (int level = storage_info_.num_levels_ - 1;
+       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
+    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+      if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+      }
+    }
+  }
+
+  storage_info_.ComputeCompensatedSizes();
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+  static const int kDeletionWeightOnCompaction = 2;
+  uint64_t average_value_size = GetAverageValueSize();
+
+  // compute the compensated size
+  for (int level = 0; level < num_levels_; level++) {
+    for (auto* file_meta : files_[level]) {
+      // Here we only compute compensated_file_size for those file_meta
+      // which compensated_file_size is uninitialized (== 0). This is true only
+      // for files that have been created right now and no other thread has
+      // access to them. That's why we can safely mutate compensated_file_size.
+      if (file_meta->compensated_file_size == 0) {
+        file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+        // Here we only boost the size of deletion entries of a file only
+        // when the number of deletion entries is greater than the number of
+        // non-deletion entries in the file.  The motivation here is that in
+        // a stable workload, the number of deletion entries should be roughly
+        // equal to the number of non-deletion entries.  If we compensate the
+        // size of deletion entries in a stable workload, the deletion
+        // compensation logic might introduce unwanted effet which changes the
+        // shape of LSM tree.
+        if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+          file_meta->compensated_file_size +=
+              (file_meta->num_deletions * 2 - file_meta->num_entries)
+              * average_value_size * kDeletionWeightOnCompaction;
+        }
+      }
+    }
+  }
+}
+
+int VersionStorageInfo::MaxInputLevel() const {
+  if (compaction_style_ == kCompactionStyleLevel) {
+    return num_levels() - 2;
+  }
+  return 0;
 }
 
-void Version::ComputeCompactionScore(
-    std::vector<uint64_t>& size_being_compacted) {
+void VersionStorageInfo::ComputeCompactionScore(
+    const MutableCFOptions& mutable_cf_options,
+    const CompactionOptionsFIFO& compaction_options_fifo) {
   double max_score = 0;
   int max_score_level = 0;
 
-  int num_levels_to_check =
-      (cfd_->options()->compaction_style != kCompactionStyleUniversal)
-          ? NumberLevels() - 1
-          : 1;
-
-  for (int level = 0; level < num_levels_to_check; level++) {
+  for (int level = 0; level <= MaxInputLevel(); level++) {
     double score;
     if (level == 0) {
       // We treat level-0 specially by bounding the number of files
@@ -729,28 +1008,49 @@ void Version::ComputeCompactionScore(
       // file size is small (perhaps because of a small write-buffer
       // setting, or very high compression ratios, or lots of
       // overwrites/deletions).
-      int numfiles = 0;
-      for (unsigned int i = 0; i < files_[level].size(); i++) {
-        if (!files_[level][i]->being_compacted) {
-          numfiles++;
+      int num_sorted_runs = 0;
+      uint64_t total_size = 0;
+      for (auto* f : files_[level]) {
+        if (!f->being_compacted) {
+          total_size += f->compensated_file_size;
+          num_sorted_runs++;
+        }
+      }
+      if (compaction_style_ == kCompactionStyleUniversal) {
+        // For universal compaction, we use level0 score to indicate
+        // compaction score for the whole DB. Adding other levels as if
+        // they are L0 files.
+        for (int i = 1; i < num_levels(); i++) {
+          if (!files_[i].empty() && !files_[i][0]->being_compacted) {
+            num_sorted_runs++;
+          }
         }
       }
 
-      // If we are slowing down writes, then we better compact that first
-      if (numfiles >= cfd_->options()->level0_stop_writes_trigger) {
+      if (compaction_style_ == kCompactionStyleFIFO) {
+        score = static_cast<double>(total_size) /
+                compaction_options_fifo.max_table_files_size;
+      } else if (num_sorted_runs >=
+                 mutable_cf_options.level0_stop_writes_trigger) {
+        // If we are slowing down writes, then we better compact that first
         score = 1000000;
-      } else if (numfiles >= cfd_->options()->level0_slowdown_writes_trigger) {
+      } else if (num_sorted_runs >=
+                 mutable_cf_options.level0_slowdown_writes_trigger) {
         score = 10000;
       } else {
-        score = static_cast<double>(numfiles) /
-                cfd_->options()->level0_file_num_compaction_trigger;
+        score = static_cast<double>(num_sorted_runs) /
+                mutable_cf_options.level0_file_num_compaction_trigger;
       }
     } else {
       // Compute the ratio of current size to size limit.
-      const uint64_t level_bytes =
-          TotalFileSize(files_[level]) - size_being_compacted[level];
-      score = static_cast<double>(level_bytes) /
-              cfd_->compaction_picker()->MaxBytesForLevel(level);
+      uint64_t level_bytes_no_compacting = 0;
+      for (auto f : files_[level]) {
+        if (!f->being_compacted) {
+          level_bytes_no_compacting += f->compensated_file_size;
+        }
+      }
+      score = static_cast<double>(level_bytes_no_compacting) /
+              MaxBytesForLevel(level);
       if (max_score < score) {
         max_score = score;
         max_score_level = level;
@@ -766,8 +1066,8 @@ void Version::ComputeCompactionScore(
 
   // sort all the levels based on their score. Higher scores get listed
   // first. Use bubble sort because the number of entries are small.
-  for (int i = 0; i < NumberLevels() - 2; i++) {
-    for (int j = i + 1; j < NumberLevels() - 1; j++) {
+  for (int i = 0; i < num_levels() - 2; i++) {
+    for (int j = i + 1; j < num_levels() - 1; j++) {
       if (compaction_score_[i] < compaction_score_[j]) {
         double score = compaction_score_[i];
         int level = compaction_level_[i];
@@ -778,40 +1078,114 @@ void Version::ComputeCompactionScore(
       }
     }
   }
+  ComputeFilesMarkedForCompaction();
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
+  files_marked_for_compaction_.clear();
+  for (int level = 0; level <= MaxInputLevel(); level++) {
+    for (auto* f : files_[level]) {
+      if (!f->being_compacted && f->marked_for_compaction) {
+        files_marked_for_compaction_.emplace_back(level, f);
+      }
+    }
+  }
 }
 
 namespace {
 
+// used to sort files by size
+struct Fsize {
+  int index;
+  FileMetaData* file;
+};
+
 // Compator that is used to sort files based on their size
 // In normal mode: descending size
-bool CompareSizeDescending(const Version::Fsize& first,
-                           const Version::Fsize& second) {
-  return (first.file->file_size > second.file->file_size);
-}
-// A static compator used to sort files based on their seqno
-// In universal style : descending seqno
-bool CompareSeqnoDescending(const Version::Fsize& first,
-                            const Version::Fsize& second) {
-  if (first.file->smallest_seqno > second.file->smallest_seqno) {
-    assert(first.file->largest_seqno > second.file->largest_seqno);
-    return true;
-  }
-  assert(first.file->largest_seqno <= second.file->largest_seqno);
-  return false;
+bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
+  return (first.file->compensated_file_size >
+      second.file->compensated_file_size);
 }
 
 } // anonymous namespace
 
-void Version::UpdateFilesBySize() {
-  // No need to sort the highest level because it is never compacted.
-  int max_level =
-      (cfd_->options()->compaction_style == kCompactionStyleUniversal)
-          ? NumberLevels()
-          : NumberLevels() - 1;
+void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
+  assert(level < num_levels());
+  auto* level_files = &files_[level];
+  // Must not overlap
+  assert(level <= 0 || level_files->empty() ||
+         internal_comparator_->Compare(
+             (*level_files)[level_files->size() - 1]->largest, f->smallest) <
+             0);
+  f->refs++;
+  level_files->push_back(f);
+}
+
+// Version::PrepareApply() need to be called before calling the function, or
+// following functions called:
+// 1. UpdateNumNonEmptyLevels();
+// 2. CalculateBaseBytes();
+// 3. UpdateFilesBySize();
+// 4. GenerateFileIndexer();
+// 5. GenerateLevelFilesBrief();
+void VersionStorageInfo::SetFinalized() {
+  finalized_ = true;
+#ifndef NDEBUG
+  if (compaction_style_ != kCompactionStyleLevel) {
+    // Not level based compaction.
+    return;
+  }
+  assert(base_level_ < 0 || num_levels() == 1 ||
+         (base_level_ >= 1 && base_level_ < num_levels()));
+  // Verify all levels newer than base_level are empty except L0
+  for (int level = 1; level < base_level(); level++) {
+    assert(NumLevelBytes(level) == 0);
+  }
+  uint64_t max_bytes_prev_level = 0;
+  for (int level = base_level(); level < num_levels() - 1; level++) {
+    if (LevelFiles(level).size() == 0) {
+      continue;
+    }
+    assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
+    max_bytes_prev_level = MaxBytesForLevel(level);
+  }
+  int num_empty_non_l0_level = 0;
+  for (int level = 0; level < num_levels(); level++) {
+    assert(LevelFiles(level).size() == 0 ||
+           LevelFiles(level).size() == LevelFilesBrief(level).num_files);
+    if (level > 0 && NumLevelBytes(level) > 0) {
+      num_empty_non_l0_level++;
+    }
+    if (LevelFiles(level).size() > 0) {
+      assert(level < num_non_empty_levels());
+    }
+  }
+  assert(compaction_level_.size() > 0);
+  assert(compaction_level_.size() == compaction_score_.size());
+#endif
+}
 
-  for (int level = 0; level < max_level; level++) {
+void VersionStorageInfo::UpdateNumNonEmptyLevels() {
+  num_non_empty_levels_ = num_levels_;
+  for (int i = num_levels_ - 1; i >= 0; i--) {
+    if (files_[i].size() != 0) {
+      return;
+    } else {
+      num_non_empty_levels_ = i;
+    }
+  }
+}
+
+void VersionStorageInfo::UpdateFilesBySize() {
+  if (compaction_style_ == kCompactionStyleFIFO ||
+      compaction_style_ == kCompactionStyleUniversal) {
+    // don't need this
+    return;
+  }
+  // No need to sort the highest level because it is never compacted.
+  for (int level = 0; level < num_levels() - 1; level++) {
     const std::vector<FileMetaData*>& files = files_[level];
-    std::vector<int>& files_by_size = files_by_size_[level];
+    auto& files_by_size = files_by_size_[level];
     assert(files_by_size.size() == 0);
 
     // populate a temp vector for sorting based on size
@@ -822,18 +1196,12 @@ void Version::UpdateFilesBySize() {
     }
 
     // sort the top number_of_files_to_sort_ based on file size
-    if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
-      int num = temp.size();
-      std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
-                        CompareSeqnoDescending);
-    } else {
-      int num = Version::number_of_files_to_sort_;
-      if (num > (int)temp.size()) {
-        num = temp.size();
-      }
-      std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
-                        CompareSizeDescending);
+    size_t num = VersionStorageInfo::kNumberFilesToSort;
+    if (num > temp.size()) {
+      num = temp.size();
     }
+    std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+                      CompareCompensatedSizeDescending);
     assert(temp.size() == files.size());
 
     // initialize files_by_size_
@@ -859,49 +1227,31 @@ bool Version::Unref() {
   return false;
 }
 
-bool Version::NeedsCompaction() const {
-  if (file_to_compact_ != nullptr) {
-    return true;
-  }
-  // In universal compaction case, this check doesn't really
-  // check the compaction condition, but checks num of files threshold
-  // only. We are not going to miss any compaction opportunity
-  // but it's likely that more compactions are scheduled but
-  // ending up with nothing to do. We can improve it later.
-  // TODO(sdong): improve this function to be accurate for universal
-  //              compactions.
-  int num_levels_to_check =
-      (cfd_->options()->compaction_style != kCompactionStyleUniversal)
-          ? NumberLevels() - 1
-          : 1;
-  for (int i = 0; i < num_levels_to_check; i++) {
-    if (compaction_score_[i] >= 1) {
-      return true;
-    }
+bool VersionStorageInfo::OverlapInLevel(int level,
+                                        const Slice* smallest_user_key,
+                                        const Slice* largest_user_key) {
+  if (level >= num_non_empty_levels_) {
+    // empty level, no overlap
+    return false;
   }
-  return false;
-}
-
-bool Version::OverlapInLevel(int level,
-                             const Slice* smallest_user_key,
-                             const Slice* largest_user_key) {
-  return SomeFileOverlapsRange(cfd_->internal_comparator(), (level > 0),
-                               files_[level], smallest_user_key,
+  return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
+                               level_files_brief_[level], smallest_user_key,
                                largest_user_key);
 }
 
-int Version::PickLevelForMemTableOutput(
-    const Slice& smallest_user_key,
+int VersionStorageInfo::PickLevelForMemTableOutput(
+    const MutableCFOptions& mutable_cf_options, const Slice& smallest_user_key,
     const Slice& largest_user_key) {
   int level = 0;
   if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
     // Push to next level if there is no overlap in next level,
     // and the #bytes overlapping in the level after that are limited.
-    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey start;
+    start.SetMaxPossibleForUserKey(smallest_user_key);
     InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
     std::vector<FileMetaData*> overlaps;
-    int max_mem_compact_level = cfd_->options()->max_mem_compaction_level;
-    while (max_mem_compact_level > 0 && level < max_mem_compact_level) {
+    while (mutable_cf_options.max_mem_compaction_level > 0 &&
+           level < mutable_cf_options.max_mem_compaction_level) {
       if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
         break;
       }
@@ -911,7 +1261,7 @@ int Version::PickLevelForMemTableOutput(
       }
       GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
       const uint64_t sum = TotalFileSize(overlaps);
-      if (sum > cfd_->compaction_picker()->MaxGrandParentOverlapBytes(level)) {
+      if (sum > mutable_cf_options.MaxGrandParentOverlapBytes(level)) {
         break;
       }
       level++;
@@ -925,12 +1275,14 @@ int Version::PickLevelForMemTableOutput(
 // If hint_index is specified, then it points to a file in the
 // overlapping range.
 // The file_index returns a pointer to any file in an overlapping range.
-void Version::GetOverlappingInputs(int level,
-                                   const InternalKey* begin,
-                                   const InternalKey* end,
-                                   std::vector<FileMetaData*>* inputs,
-                                   int hint_index,
-                                   int* file_index) {
+void VersionStorageInfo::GetOverlappingInputs(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) {
+  if (level >= num_non_empty_levels_) {
+    // this level is empty, no overlapping inputs
+    return;
+  }
+
   inputs->clear();
   Slice user_begin, user_end;
   if (begin != nullptr) {
@@ -942,22 +1294,22 @@ void Version::GetOverlappingInputs(int level,
   if (file_index) {
     *file_index = -1;
   }
-  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
+  const Comparator* user_cmp = user_comparator_;
   if (begin != nullptr && end != nullptr && level > 0) {
     GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs,
       hint_index, file_index);
     return;
   }
-  for (size_t i = 0; i < files_[level].size(); ) {
-    FileMetaData* f = files_[level][i++];
-    const Slice file_start = f->smallest.user_key();
-    const Slice file_limit = f->largest.user_key();
+  for (size_t i = 0; i < level_files_brief_[level].num_files; ) {
+    FdWithKeyRange* f = &(level_files_brief_[level].files[i++]);
+    const Slice file_start = ExtractUserKey(f->smallest_key);
+    const Slice file_limit = ExtractUserKey(f->largest_key);
     if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
       // "f" is completely before specified range; skip it
     } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
       // "f" is completely after specified range; skip it
     } else {
-      inputs->push_back(f);
+      inputs->push_back(files_[level][i-1]);
       if (level == 0) {
         // Level-0 files may overlap each other.  So check if the newly
         // added file has expanded the range.  If so, restart search.
@@ -972,7 +1324,7 @@ void Version::GetOverlappingInputs(int level,
           i = 0;
         }
       } else if (file_index) {
-        *file_index = i-1;
+        *file_index = static_cast<int>(i) - 1;
       }
     }
   }
@@ -982,19 +1334,15 @@ void Version::GetOverlappingInputs(int level,
 // Employ binary search to find at least one file that overlaps the
 // specified range. From that file, iterate backwards and
 // forwards to find all overlapping files.
-void Version::GetOverlappingInputsBinarySearch(
-    int level,
-    const Slice& user_begin,
-    const Slice& user_end,
-    std::vector<FileMetaData*>* inputs,
-    int hint_index,
-    int* file_index) {
+void VersionStorageInfo::GetOverlappingInputsBinarySearch(
+    int level, const Slice& user_begin, const Slice& user_end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) {
   assert(level > 0);
   int min = 0;
   int mid = 0;
-  int max = files_[level].size() -1;
+  int max = static_cast<int>(files_[level].size()) - 1;
   bool foundOverlap = false;
-  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
+  const Comparator* user_cmp = user_comparator_;
 
   // if the caller already knows the index of a file that has overlap,
   // then we can skip the binary search.
@@ -1005,9 +1353,9 @@ void Version::GetOverlappingInputsBinarySearch(
 
   while (!foundOverlap && min <= max) {
     mid = (min + max)/2;
-    FileMetaData* f = files_[level][mid];
-    const Slice file_start = f->smallest.user_key();
-    const Slice file_limit = f->largest.user_key();
+    FdWithKeyRange* f = &(level_files_brief_[level].files[mid]);
+    const Slice file_start = ExtractUserKey(f->smallest_key);
+    const Slice file_limit = ExtractUserKey(f->largest_key);
     if (user_cmp->Compare(file_limit, user_begin) < 0) {
       min = mid + 1;
     } else if (user_cmp->Compare(user_end, file_start) < 0) {
@@ -1033,21 +1381,20 @@ void Version::GetOverlappingInputsBinarySearch(
 // The midIndex specifies the index of at least one file that
 // overlaps the specified range. From that file, iterate backward
 // and forward to find all overlapping files.
-void Version::ExtendOverlappingInputs(
-    int level,
-    const Slice& user_begin,
-    const Slice& user_end,
-    std::vector<FileMetaData*>* inputs,
-    unsigned int midIndex) {
-
-  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
+// Use FileLevel in searching, make it faster
+void VersionStorageInfo::ExtendOverlappingInputs(
+    int level, const Slice& user_begin, const Slice& user_end,
+    std::vector<FileMetaData*>* inputs, unsigned int midIndex) {
+
+  const Comparator* user_cmp = user_comparator_;
+  const FdWithKeyRange* files = level_files_brief_[level].files;
 #ifndef NDEBUG
   {
     // assert that the file at midIndex overlaps with the range
-    assert(midIndex < files_[level].size());
-    FileMetaData* f = files_[level][midIndex];
-    const Slice fstart = f->smallest.user_key();
-    const Slice flimit = f->largest.user_key();
+    assert(midIndex < level_files_brief_[level].num_files);
+    const FdWithKeyRange* f = &files[midIndex];
+    const Slice fstart = ExtractUserKey(f->smallest_key);
+    const Slice flimit = ExtractUserKey(f->largest_key);
     if (user_cmp->Compare(fstart, user_begin) >= 0) {
       assert(user_cmp->Compare(fstart, user_end) <= 0);
     } else {
@@ -1061,8 +1408,8 @@ void Version::ExtendOverlappingInputs(
 
   // check backwards from 'mid' to lower indices
   for (int i = midIndex; i >= 0 ; i--) {
-    FileMetaData* f = files_[level][i];
-    const Slice file_limit = f->largest.user_key();
+    const FdWithKeyRange* f = &files[i];
+    const Slice file_limit = ExtractUserKey(f->largest_key);
     if (user_cmp->Compare(file_limit, user_begin) >= 0) {
       startIndex = i;
       assert((count++, true));
@@ -1071,9 +1418,10 @@ void Version::ExtendOverlappingInputs(
     }
   }
   // check forward from 'mid+1' to higher indices
-  for (unsigned int i = midIndex+1; i < files_[level].size(); i++) {
-    FileMetaData* f = files_[level][i];
-    const Slice file_start = f->smallest.user_key();
+  for (unsigned int i = midIndex+1;
+       i < level_files_brief_[level].num_files; i++) {
+    const FdWithKeyRange* f = &files[i];
+    const Slice file_start = ExtractUserKey(f->smallest_key);
     if (user_cmp->Compare(file_start, user_end) <= 0) {
       assert((count++, true));
       endIndex = i;
@@ -1094,9 +1442,8 @@ void Version::ExtendOverlappingInputs(
 // an overlapping user key to the file "just outside" of it (i.e.
 // just after the last file, or just before the first file)
 // REQUIRES: "*inputs" is a sorted list of non-overlapping files
-bool Version::HasOverlappingUserKey(
-    const std::vector<FileMetaData*>* inputs,
-    int level) {
+bool VersionStorageInfo::HasOverlappingUserKey(
+    const std::vector<FileMetaData*>* inputs, int level) {
 
   // If inputs empty, there is no overlap.
   // If level == 0, it is assumed that all needed files were already included.
@@ -1104,17 +1451,20 @@ bool Version::HasOverlappingUserKey(
     return false;
   }
 
-  const Comparator* user_cmp = cfd_->internal_comparator().user_comparator();
-  const std::vector<FileMetaData*>& files = files_[level];
-  const size_t kNumFiles = files.size();
+  const Comparator* user_cmp = user_comparator_;
+  const rocksdb::LevelFilesBrief& file_level = level_files_brief_[level];
+  const FdWithKeyRange* files = level_files_brief_[level].files;
+  const size_t kNumFiles = file_level.num_files;
 
   // Check the last file in inputs against the file after it
-  size_t last_file = FindFile(cfd_->internal_comparator(), files,
+  size_t last_file = FindFile(*internal_comparator_, file_level,
                               inputs->back()->largest.Encode());
-  assert(0 <= last_file && last_file < kNumFiles);  // File should exist!
+  assert(last_file < kNumFiles);  // File should exist!
   if (last_file < kNumFiles-1) {                    // If not the last file
-    const Slice last_key_in_input = files[last_file]->largest.user_key();
-    const Slice first_key_after = files[last_file+1]->smallest.user_key();
+    const Slice last_key_in_input = ExtractUserKey(
+        files[last_file].largest_key);
+    const Slice first_key_after = ExtractUserKey(
+        files[last_file+1].smallest_key);
     if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) {
       // The last user key in input overlaps with the next file's first key
       return true;
@@ -1122,12 +1472,14 @@ bool Version::HasOverlappingUserKey(
   }
 
   // Check the first file in inputs against the file just before it
-  size_t first_file = FindFile(cfd_->internal_comparator(), files,
+  size_t first_file = FindFile(*internal_comparator_, file_level,
                                inputs->front()->smallest.Encode());
-  assert(0 <= first_file && first_file <= last_file);   // File should exist!
+  assert(first_file <= last_file);   // File should exist!
   if (first_file > 0) {                                 // If not first file
-    const Slice& first_key_in_input = files[first_file]->smallest.user_key();
-    const Slice& last_key_before = files[first_file-1]->largest.user_key();
+    const Slice& first_key_in_input = ExtractUserKey(
+        files[first_file].smallest_key);
+    const Slice& last_key_before = ExtractUserKey(
+        files[first_file-1].largest_key);
     if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) {
       // The first user key in input overlaps with the previous file's last key
       return true;
@@ -1137,47 +1489,65 @@ bool Version::HasOverlappingUserKey(
   return false;
 }
 
-int64_t Version::NumLevelBytes(int level) const {
+uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
   assert(level >= 0);
-  assert(level < NumberLevels());
+  assert(level < num_levels());
   return TotalFileSize(files_[level]);
 }
 
-const char* Version::LevelSummary(LevelSummaryStorage* scratch) const {
-  int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files[");
-  for (int i = 0; i < NumberLevels(); i++) {
+const char* VersionStorageInfo::LevelSummary(
+    LevelSummaryStorage* scratch) const {
+  int len = 0;
+  if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+    assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
+    len = snprintf(scratch->buffer, sizeof(scratch->buffer),
+                   "base level %d max bytes base %" PRIu64 " ", base_level_,
+                   level_max_bytes_[base_level_]);
+  }
+  len +=
+      snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
+  for (int i = 0; i < num_levels(); i++) {
     int sz = sizeof(scratch->buffer) - len;
     int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
     if (ret < 0 || ret >= sz) break;
     len += ret;
   }
-  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+  if (len > 0) {
+    // overwrite the last space
+    --len;
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+           "] max score %.2f", compaction_score_[0]);
   return scratch->buffer;
 }
 
-const char* Version::LevelFileSummary(FileSummaryStorage* scratch,
-                                      int level) const {
+const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
+                                                 int level) const {
   int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
   for (const auto& f : files_[level]) {
     int sz = sizeof(scratch->buffer) - len;
+    char sztxt[16];
+    AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
     int ret = snprintf(scratch->buffer + len, sz,
-                       "#%lu(seq=%lu,sz=%lu,%lu) ",
-                       (unsigned long)f->number,
-                       (unsigned long)f->smallest_seqno,
-                       (unsigned long)f->file_size,
-                       (unsigned long)f->being_compacted);
+                       "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
+                       f->fd.GetNumber(), f->smallest_seqno, sztxt,
+                       static_cast<int>(f->being_compacted));
     if (ret < 0 || ret >= sz)
       break;
     len += ret;
   }
+  // overwrite the last space (only if files_[level].size() is non-zero)
+  if (files_[level].size() && len > 0) {
+    --len;
+  }
   snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
   return scratch->buffer;
 }
 
-int64_t Version::MaxNextLevelOverlappingBytes() {
+int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
   uint64_t result = 0;
   std::vector<FileMetaData*> overlaps;
-  for (int level = 1; level < NumberLevels() - 1; level++) {
+  for (int level = 1; level < num_levels() - 1; level++) {
     for (const auto& f : files_[level]) {
       GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
       const uint64_t sum = TotalFileSize(overlaps);
@@ -1189,18 +1559,141 @@ int64_t Version::MaxNextLevelOverlappingBytes() {
   return result;
 }
 
-void Version::AddLiveFiles(std::set<uint64_t>* live) {
-  for (int level = 0; level < NumberLevels(); level++) {
-    const std::vector<FileMetaData*>& files = files_[level];
+uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  assert(level >= 0);
+  assert(level < static_cast<int>(level_max_bytes_.size()));
+  return level_max_bytes_[level];
+}
+
+void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
+                                            const MutableCFOptions& options) {
+  // Special logic to set number of sorted runs.
+  // It is to match the previous behavior when all files are in L0.
+  int num_l0_count = static_cast<int>(files_[0].size());
+  if (compaction_style_ == kCompactionStyleUniversal) {
+    // For universal compaction, we use level0 score to indicate
+    // compaction score for the whole DB. Adding other levels as if
+    // they are L0 files.
+    for (int i = 1; i < num_levels(); i++) {
+      if (!files_[i].empty()) {
+        num_l0_count++;
+      }
+    }
+  }
+  set_l0_delay_trigger_count(num_l0_count);
+
+  level_max_bytes_.resize(ioptions.num_levels);
+  if (!ioptions.level_compaction_dynamic_level_bytes) {
+    base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
+
+    // Calculate for static bytes base case
+    for (int i = 0; i < ioptions.num_levels; ++i) {
+      if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+        level_max_bytes_[i] = options.max_bytes_for_level_base;
+      } else if (i > 1) {
+        level_max_bytes_[i] = MultiplyCheckOverflow(
+            MultiplyCheckOverflow(level_max_bytes_[i - 1],
+                                  options.max_bytes_for_level_multiplier),
+            options.MaxBytesMultiplerAdditional(i - 1));
+      } else {
+        level_max_bytes_[i] = options.max_bytes_for_level_base;
+      }
+    }
+  } else {
+    uint64_t max_level_size = 0;
+
+    int first_non_empty_level = -1;
+    // Find size of non-L0 level of most data.
+    // Cannot use the size of the last level because it can be empty or less
+    // than previous levels after compaction.
+    for (int i = 1; i < num_levels_; i++) {
+      uint64_t total_size = 0;
+      for (const auto& f : files_[i]) {
+        total_size += f->fd.GetFileSize();
+      }
+      if (total_size > 0 && first_non_empty_level == -1) {
+        first_non_empty_level = i;
+      }
+      if (total_size > max_level_size) {
+        max_level_size = total_size;
+      }
+    }
+
+    // Prefill every level's max bytes to disallow compaction from there.
+    for (int i = 0; i < num_levels_; i++) {
+      level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
+    }
+
+    if (max_level_size == 0) {
+      // No data for L1 and up. L0 compacts to last level directly.
+      // No compaction from L1+ needs to be scheduled.
+      base_level_ = num_levels_ - 1;
+    } else {
+      uint64_t base_bytes_max = options.max_bytes_for_level_base;
+      uint64_t base_bytes_min =
+          base_bytes_max / options.max_bytes_for_level_multiplier;
+
+      // Try whether we can make last level's target size to be max_level_size
+      uint64_t cur_level_size = max_level_size;
+      for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+        // Round up after dividing
+        cur_level_size /= options.max_bytes_for_level_multiplier;
+      }
+
+      // Calculate base level and its size.
+      uint64_t base_level_size;
+      if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make target size of last level to be max_level_size,
+        // target size of the first non-empty level would be smaller than
+        // base_bytes_min. We set it be base_bytes_min.
+        base_level_size = base_bytes_min + 1U;
+        base_level_ = first_non_empty_level;
+        Warn(ioptions.info_log,
+             "More existing levels in DB than needed. "
+             "max_bytes_for_level_multiplier may not be guaranteed.");
+      } else {
+        // Find base level (where L0 data is compacted to).
+        base_level_ = first_non_empty_level;
+        while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+          --base_level_;
+          cur_level_size =
+              cur_level_size / options.max_bytes_for_level_multiplier;
+        }
+        if (cur_level_size > base_bytes_max) {
+          // Even L1 will be too large
+          assert(base_level_ == 1);
+          base_level_size = base_bytes_max;
+        } else {
+          base_level_size = cur_level_size;
+        }
+      }
+
+      uint64_t level_size = base_level_size;
+      for (int i = base_level_; i < num_levels_; i++) {
+        if (i > base_level_) {
+          level_size = MultiplyCheckOverflow(
+              level_size, options.max_bytes_for_level_multiplier);
+        }
+        level_max_bytes_[i] = level_size;
+      }
+    }
+  }
+}
+
+void Version::AddLiveFiles(std::vector<FileDescriptor>* live) {
+  for (int level = 0; level < storage_info_.num_levels(); level++) {
+    const std::vector<FileMetaData*>& files = storage_info_.files_[level];
     for (const auto& file : files) {
-      live->insert(file->number);
+      live->push_back(file->fd);
     }
   }
 }
 
 std::string Version::DebugString(bool hex) const {
   std::string r;
-  for (int level = 0; level < num_levels_; level++) {
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
     // E.g.,
     //   --- level 1 ---
     //   17:123['a' .. 'd']
@@ -1210,12 +1703,12 @@ std::string Version::DebugString(bool hex) const {
     r.append(" --- version# ");
     AppendNumberTo(&r, version_number_);
     r.append(" ---\n");
-    const std::vector<FileMetaData*>& files = files_[level];
+    const std::vector<FileMetaData*>& files = storage_info_.files_[level];
     for (size_t i = 0; i < files.size(); i++) {
       r.push_back(' ');
-      AppendNumberTo(&r, files[i]->number);
+      AppendNumberTo(&r, files[i]->fd.GetNumber());
       r.push_back(':');
-      AppendNumberTo(&r, files[i]->file_size);
+      AppendNumberTo(&r, files[i]->fd.GetFileSize());
       r.append("[");
       r.append(files[i]->smallest.DebugString(hex));
       r.append(" .. ");
@@ -1230,296 +1723,25 @@ std::string Version::DebugString(bool hex) const {
 struct VersionSet::ManifestWriter {
   Status status;
   bool done;
-  port::CondVar cv;
+  InstrumentedCondVar cv;
   ColumnFamilyData* cfd;
   VersionEdit* edit;
 
-  explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* cfd,
+  explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd,
                           VersionEdit* e)
-      : done(false), cv(mu), cfd(cfd), edit(e) {}
+      : done(false), cv(mu), cfd(_cfd), edit(e) {}
 };
 
-// A helper class so we can efficiently apply a whole sequence
-// of edits to a particular state without creating intermediate
-// Versions that contain full copies of the intermediate state.
-class VersionSet::Builder {
- private:
-  // Helper to sort v->files_
-  // kLevel0LevelCompaction -- NewestFirst
-  // kLevel0UniversalCompaction -- NewestFirstBySeqNo
-  // kLevelNon0 -- BySmallestKey
-  struct FileComparator {
-    enum SortMethod {
-      kLevel0LevelCompaction = 0,
-      kLevel0UniversalCompaction = 1,
-      kLevelNon0 = 2,
-    } sort_method;
-    const InternalKeyComparator* internal_comparator;
-
-    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
-      switch (sort_method) {
-        case kLevel0LevelCompaction:
-          return NewestFirst(f1, f2);
-        case kLevel0UniversalCompaction:
-          return NewestFirstBySeqNo(f1, f2);
-        case kLevelNon0:
-          return BySmallestKey(f1, f2, internal_comparator);
-      }
-      assert(false);
-      return false;
-    }
-  };
-
-  typedef std::set<FileMetaData*, FileComparator> FileSet;
-  struct LevelState {
-    std::set<uint64_t> deleted_files;
-    FileSet* added_files;
-  };
-
-  ColumnFamilyData* cfd_;
-  Version* base_;
-  LevelState* levels_;
-  FileComparator level_zero_cmp_;
-  FileComparator level_nonzero_cmp_;
-
- public:
-  Builder(ColumnFamilyData* cfd) : cfd_(cfd), base_(cfd->current()) {
-    base_->Ref();
-    levels_ = new LevelState[base_->NumberLevels()];
-    level_zero_cmp_.sort_method =
-        (cfd_->options()->compaction_style == kCompactionStyleUniversal)
-            ? FileComparator::kLevel0UniversalCompaction
-            : FileComparator::kLevel0LevelCompaction;
-    level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
-    level_nonzero_cmp_.internal_comparator = &cfd->internal_comparator();
-
-    levels_[0].added_files = new FileSet(level_zero_cmp_);
-    for (int level = 1; level < base_->NumberLevels(); level++) {
-        levels_[level].added_files = new FileSet(level_nonzero_cmp_);
-    }
-  }
-
-  ~Builder() {
-    for (int level = 0; level < base_->NumberLevels(); level++) {
-      const FileSet* added = levels_[level].added_files;
-      std::vector<FileMetaData*> to_unref;
-      to_unref.reserve(added->size());
-      for (FileSet::const_iterator it = added->begin();
-          it != added->end(); ++it) {
-        to_unref.push_back(*it);
-      }
-      delete added;
-      for (uint32_t i = 0; i < to_unref.size(); i++) {
-        FileMetaData* f = to_unref[i];
-        f->refs--;
-        if (f->refs <= 0) {
-          if (f->table_reader_handle) {
-            cfd_->table_cache()->ReleaseHandle(f->table_reader_handle);
-            f->table_reader_handle = nullptr;
-          }
-          delete f;
-        }
-      }
-    }
-
-    delete[] levels_;
-    base_->Unref();
-  }
-
-  void CheckConsistency(Version* v) {
-#ifndef NDEBUG
-    // make sure the files are sorted correctly
-    for (int level = 0; level < v->NumberLevels(); level++) {
-      for (size_t i = 1; i < v->files_[level].size(); i++) {
-        auto f1 = v->files_[level][i - 1];
-        auto f2 = v->files_[level][i];
-        if (level == 0) {
-          assert(level_zero_cmp_(f1, f2));
-          if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
-            assert(f1->largest_seqno > f2->largest_seqno);
-          }
-        } else {
-          assert(level_nonzero_cmp_(f1, f2));
-
-          // Make sure there is no overlap in levels > 0
-          if (cfd_->internal_comparator().Compare(f1->largest, f2->smallest) >=
-              0) {
-            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
-                    (f1->largest).DebugString().c_str(),
-                    (f2->smallest).DebugString().c_str());
-            abort();
-          }
-        }
-      }
-    }
-#endif
-  }
-
-  void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number,
-                                  int level) {
-#ifndef NDEBUG
-      // a file to be deleted better exist in the previous version
-      bool found = false;
-      for (int l = 0; !found && l < base_->NumberLevels(); l++) {
-        const std::vector<FileMetaData*>& base_files = base_->files_[l];
-        for (unsigned int i = 0; i < base_files.size(); i++) {
-          FileMetaData* f = base_files[i];
-          if (f->number == number) {
-            found =  true;
-            break;
-          }
-        }
-      }
-      // if the file did not exist in the previous version, then it
-      // is possibly moved from lower level to higher level in current
-      // version
-      for (int l = level+1; !found && l < base_->NumberLevels(); l++) {
-        const FileSet* added = levels_[l].added_files;
-        for (FileSet::const_iterator added_iter = added->begin();
-             added_iter != added->end(); ++added_iter) {
-          FileMetaData* f = *added_iter;
-          if (f->number == number) {
-            found = true;
-            break;
-          }
-        }
-      }
-
-      // maybe this file was added in a previous edit that was Applied
-      if (!found) {
-        const FileSet* added = levels_[level].added_files;
-        for (FileSet::const_iterator added_iter = added->begin();
-             added_iter != added->end(); ++added_iter) {
-          FileMetaData* f = *added_iter;
-          if (f->number == number) {
-            found = true;
-            break;
-          }
-        }
-      }
-      assert(found);
-#endif
-  }
-
-  // Apply all of the edits in *edit to the current state.
-  void Apply(VersionEdit* edit) {
-    CheckConsistency(base_);
-
-    // Delete files
-    const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
-    for (const auto& del_file : del) {
-      const auto level = del_file.first;
-      const auto number = del_file.second;
-      levels_[level].deleted_files.insert(number);
-      CheckConsistencyForDeletes(edit, number, level);
-    }
-
-    // Add new files
-    for (const auto& new_file : edit->new_files_) {
-      const int level = new_file.first;
-      FileMetaData* f = new FileMetaData(new_file.second);
-      f->refs = 1;
-
-      // We arrange to automatically compact this file after
-      // a certain number of seeks.  Let's assume:
-      //   (1) One seek costs 10ms
-      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
-      //   (3) A compaction of 1MB does 25MB of IO:
-      //         1MB read from this level
-      //         10-12MB read from next level (boundaries may be misaligned)
-      //         10-12MB written to next level
-      // This implies that 25 seeks cost the same as the compaction
-      // of 1MB of data.  I.e., one seek costs approximately the
-      // same as the compaction of 40KB of data.  We are a little
-      // conservative and allow approximately one seek for every 16KB
-      // of data before triggering a compaction.
-      f->allowed_seeks = (f->file_size / 16384);
-      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
-
-      levels_[level].deleted_files.erase(f->number);
-      levels_[level].added_files->insert(f);
-    }
-  }
-
-  // Save the current state in *v.
-  void SaveTo(Version* v) {
-    CheckConsistency(base_);
-    CheckConsistency(v);
-
-    for (int level = 0; level < base_->NumberLevels(); level++) {
-      const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
-      // Merge the set of added files with the set of pre-existing files.
-      // Drop any deleted files.  Store the result in *v.
-      const auto& base_files = base_->files_[level];
-      auto base_iter = base_files.begin();
-      auto base_end = base_files.end();
-      const auto& added_files = *levels_[level].added_files;
-      v->files_[level].reserve(base_files.size() + added_files.size());
-
-      for (const auto& added : added_files) {
-        // Add all smaller files listed in base_
-        for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp);
-             base_iter != bpos;
-             ++base_iter) {
-          MaybeAddFile(v, level, *base_iter);
-        }
-
-        MaybeAddFile(v, level, added);
-      }
-
-      // Add remaining base files
-      for (; base_iter != base_end; ++base_iter) {
-        MaybeAddFile(v, level, *base_iter);
-      }
-    }
-
-    CheckConsistency(v);
-
-    v->file_indexer_.UpdateIndex(v->files_);
-  }
-
-  void LoadTableHandlers() {
-    for (int level = 0; level < cfd_->NumberLevels(); level++) {
-      for (auto& file_meta : *(levels_[level].added_files)) {
-        assert (!file_meta->table_reader_handle);
-        bool table_io;
-        cfd_->table_cache()->FindTable(
-            base_->vset_->storage_options_, cfd_->internal_comparator(),
-            file_meta->number, file_meta->file_size,
-            &file_meta->table_reader_handle, &table_io, false);
-        if (file_meta->table_reader_handle != nullptr) {
-          // Load table_reader
-          file_meta->table_reader =
-              cfd_->table_cache()->GetTableReaderFromHandle(
-                  file_meta->table_reader_handle);
-        }
-      }
-    }
-  }
-
-  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
-    if (levels_[level].deleted_files.count(f->number) > 0) {
-      // File is deleted: do nothing
-    } else {
-      auto* files = &v->files_[level];
-      if (level > 0 && !files->empty()) {
-        // Must not overlap
-        assert(cfd_->internal_comparator().Compare(
-                   (*files)[files->size() - 1]->largest, f->smallest) < 0);
-      }
-      f->refs++;
-      files->push_back(f);
-    }
-  }
-};
-
-VersionSet::VersionSet(const std::string& dbname, const DBOptions* options,
-                       const EnvOptions& storage_options, Cache* table_cache)
-    : column_family_set_(new ColumnFamilySet(dbname, options, storage_options,
-                                             table_cache)),
-      env_(options->env),
+VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options,
+                       const EnvOptions& storage_options, Cache* table_cache,
+                       WriteBuffer* write_buffer,
+                       WriteController* write_controller)
+    : column_family_set_(new ColumnFamilySet(
+          dbname, db_options, storage_options, table_cache,
+          write_buffer, write_controller)),
+      env_(db_options->env),
       dbname_(dbname),
-      options_(options),
+      db_options_(db_options),
       next_file_number_(2),
       manifest_file_number_(0),  // Filled by Recover()
       pending_manifest_file_number_(0),
@@ -1527,8 +1749,8 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* options,
       prev_log_number_(0),
       current_version_number_(0),
       manifest_file_size_(0),
-      storage_options_(storage_options),
-      storage_options_compactions_(storage_options_) {}
+      env_options_(storage_options),
+      env_options_compactions_(env_options_) {}
 
 VersionSet::~VersionSet() {
   // we need to delete column_family_set_ because its destructor depends on
@@ -1542,6 +1764,14 @@ VersionSet::~VersionSet() {
 
 void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
                                Version* v) {
+  // compute new compaction score
+  v->storage_info()->ComputeCompactionScore(
+      *column_family_data->GetLatestMutableCFOptions(),
+      column_family_data->ioptions()->compaction_options_fifo);
+
+  // Mark v finalized
+  v->storage_info_.SetFinalized();
+
   // Make "v" current
   assert(v->refs_ == 0);
   Version* current = column_family_data->current();
@@ -1561,16 +1791,17 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
 }
 
 Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
-                               VersionEdit* edit, port::Mutex* mu,
+                               const MutableCFOptions& mutable_cf_options,
+                               VersionEdit* edit, InstrumentedMutex* mu,
                                Directory* db_directory, bool new_descriptor_log,
-                               const ColumnFamilyOptions* options) {
+                               const ColumnFamilyOptions* new_cf_options) {
   mu->AssertHeld();
 
   // column_family_data can be nullptr only if this is column_family_add.
   // in that case, we also need to specify ColumnFamilyOptions
   if (column_family_data == nullptr) {
     assert(edit->is_column_family_add_);
-    assert(options != nullptr);
+    assert(new_cf_options != nullptr);
   }
 
   // queue our request
@@ -1595,7 +1826,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
 
   std::vector<VersionEdit*> batch_edits;
   Version* v = nullptr;
-  std::unique_ptr<Builder> builder(nullptr);
+  std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(nullptr);
 
   // process all requests in the queue
   ManifestWriter* last_writer = &w;
@@ -1607,7 +1838,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
     batch_edits.push_back(edit);
   } else {
     v = new Version(column_family_data, this, current_version_number_++);
-    builder.reset(new Builder(column_family_data));
+    builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data));
+    auto* builder = builder_guard->version_builder();
     for (const auto& writer : manifest_writers_) {
       if (writer->edit->IsColumnFamilyManipulation() ||
           writer->cfd->GetID() != column_family_data->GetID()) {
@@ -1616,11 +1848,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
         break;
       }
       last_writer = writer;
-      LogAndApplyHelper(column_family_data, builder.get(), v, last_writer->edit,
-                        mu);
+      LogAndApplyHelper(column_family_data, builder, v, last_writer->edit, mu);
       batch_edits.push_back(last_writer->edit);
     }
-    builder->SaveTo(v);
+    builder->SaveTo(v->storage_info());
   }
 
   // Initialize new descriptor log file if necessary by creating
@@ -1630,9 +1861,9 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
 
   assert(pending_manifest_file_number_ == 0);
   if (!descriptor_log_ ||
-      manifest_file_size_ > options_->max_manifest_file_size) {
+      manifest_file_size_ > db_options_->max_manifest_file_size) {
     pending_manifest_file_number_ = NewFileNumber();
-    batch_edits.back()->SetNextFile(next_file_number_);
+    batch_edits.back()->SetNextFile(next_file_number_.load());
     new_descriptor_log = true;
   } else {
     pending_manifest_file_number_ = manifest_file_number_;
@@ -1648,78 +1879,75 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
   // Unlock during expensive operations. New writes cannot get here
   // because &w is ensuring that all new writes get queued.
   {
-    std::vector<uint64_t> size_being_compacted;
-    if (!edit->IsColumnFamilyManipulation()) {
-      size_being_compacted.resize(v->NumberLevels() - 1);
-      // calculate the amount of data being compacted at every level
-      column_family_data->compaction_picker()->SizeBeingCompacted(
-          size_being_compacted);
-    }
 
     mu->Unlock();
 
-    if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) {
+    if (!edit->IsColumnFamilyManipulation() &&
+        db_options_->max_open_files == -1) {
       // unlimited table cache. Pre-load table handle now.
       // Need to do it out of the mutex.
-      builder->LoadTableHandlers();
+      builder_guard->version_builder()->LoadTableHandlers();
     }
 
     // This is fine because everything inside of this block is serialized --
     // only one thread can be here at the same time
     if (new_descriptor_log) {
+      // create manifest file
+      Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+          "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_);
       unique_ptr<WritableFile> descriptor_file;
       s = env_->NewWritableFile(
           DescriptorFileName(dbname_, pending_manifest_file_number_),
-          &descriptor_file, env_->OptimizeForManifestWrite(storage_options_));
+          &descriptor_file, env_->OptimizeForManifestWrite(env_options_));
       if (s.ok()) {
         descriptor_file->SetPreallocationBlockSize(
-            options_->manifest_preallocation_size);
+            db_options_->manifest_preallocation_size);
         descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
         s = WriteSnapshot(descriptor_log_.get());
       }
     }
 
     if (!edit->IsColumnFamilyManipulation()) {
-      // The calls to ComputeCompactionScore and UpdateFilesBySize are cpu-heavy
-      // and is best called outside the mutex.
-      v->ComputeCompactionScore(size_being_compacted);
-      v->UpdateFilesBySize();
+      // This is cpu-heavy operations, which should be called outside mutex.
+      v->PrepareApply(mutable_cf_options);
     }
 
     // Write new record to MANIFEST log
     if (s.ok()) {
       for (auto& e : batch_edits) {
         std::string record;
-        e->EncodeTo(&record);
+        if (!e->EncodeTo(&record)) {
+          s = Status::Corruption(
+              "Unable to Encode VersionEdit:" + e->DebugString(true));
+          break;
+        }
         s = descriptor_log_->AddRecord(record);
         if (!s.ok()) {
           break;
         }
       }
       if (s.ok()) {
-        if (options_->use_fsync) {
-          StopWatch sw(env_, options_->statistics.get(),
-                       MANIFEST_FILE_SYNC_MICROS);
-          s = descriptor_log_->file()->Fsync();
-        } else {
-          StopWatch sw(env_, options_->statistics.get(),
-                       MANIFEST_FILE_SYNC_MICROS);
-          s = descriptor_log_->file()->Sync();
-        }
+        s = SyncManifest(env_, db_options_, descriptor_log_->file());
       }
       if (!s.ok()) {
-        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
+        Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log,
+            "MANIFEST write: %s\n", s.ToString().c_str());
         bool all_records_in = true;
         for (auto& e : batch_edits) {
           std::string record;
-          e->EncodeTo(&record);
+          if (!e->EncodeTo(&record)) {
+            s = Status::Corruption(
+                "Unable to Encode VersionEdit:" + e->DebugString(true));
+            all_records_in = false;
+            break;
+          }
           if (!ManifestContains(pending_manifest_file_number_, record)) {
             all_records_in = false;
             break;
           }
         }
         if (all_records_in) {
-          Log(options_->info_log,
+          Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log,
               "MANIFEST contains log record despite error; advancing to new "
               "version to prevent mismatch between in-memory and logged state"
               " If paranoid is set, then the db is now in readonly mode.");
@@ -1731,19 +1959,17 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
     // If we just created a new descriptor file, install it by writing a
     // new CURRENT file that points to it.
     if (s.ok() && new_descriptor_log) {
-      s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_);
+      s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_,
+                         db_options_->disableDataSync ? nullptr : db_directory);
       if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) {
         // delete old manifest file
-        Log(options_->info_log,
+        Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
             "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
             manifest_file_number_, pending_manifest_file_number_);
         // we don't care about an error here, PurgeObsoleteFiles will take care
         // of it later
         env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_));
       }
-      if (!options_->disableDataSync && db_directory != nullptr) {
-        db_directory->Fsync();
-      }
     }
 
     if (s.ok()) {
@@ -1751,7 +1977,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
       new_manifest_file_size = descriptor_log_->file()->GetFileSize();
     }
 
-    LogFlush(options_->info_log);
+    LogFlush(db_options_->info_log);
     mu->Lock();
   }
 
@@ -1760,8 +1986,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
     if (edit->is_column_family_add_) {
       // no group commit on column family add
       assert(batch_edits.size() == 1);
-      assert(options != nullptr);
-      CreateColumnFamily(*options, edit);
+      assert(new_cf_options != nullptr);
+      CreateColumnFamily(*new_cf_options, edit);
     } else if (edit->is_column_family_drop_) {
       assert(batch_edits.size() == 1);
       column_family_data->SetDropped();
@@ -1787,11 +2013,15 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
     manifest_file_size_ = new_manifest_file_size;
     prev_log_number_ = edit->prev_log_number_;
   } else {
-    Log(options_->info_log, "Error in committing version %lu to [%s]",
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log,
+        "Error in committing version %lu to [%s]",
         (unsigned long)v->GetVersionNumber(),
         column_family_data->GetName().c_str());
     delete v;
     if (new_descriptor_log) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+        "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
+        manifest_file_number_, pending_manifest_file_number_);
       descriptor_log_.reset();
       env_->DeleteFile(
           DescriptorFileName(dbname_, pending_manifest_file_number_));
@@ -1819,7 +2049,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
 
 void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
   assert(edit->IsColumnFamilyManipulation());
-  edit->SetNextFile(next_file_number_);
+  edit->SetNextFile(next_file_number_.load());
   edit->SetLastSequence(last_sequence_);
   if (edit->is_column_family_drop_) {
     // if we drop column family, we have to make sure to save max column family,
@@ -1828,21 +2058,21 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
   }
 }
 
-void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, Builder* builder,
-                                   Version* v, VersionEdit* edit,
-                                   port::Mutex* mu) {
+void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+                                   VersionBuilder* builder, Version* v,
+                                   VersionEdit* edit, InstrumentedMutex* mu) {
   mu->AssertHeld();
   assert(!edit->IsColumnFamilyManipulation());
 
   if (edit->has_log_number_) {
     assert(edit->log_number_ >= cfd->GetLogNumber());
-    assert(edit->log_number_ < next_file_number_);
+    assert(edit->log_number_ < next_file_number_.load());
   }
 
   if (!edit->has_prev_log_number_) {
     edit->SetPrevLogNumber(prev_log_number_);
   }
-  edit->SetNextFile(next_file_number_);
+  edit->SetNextFile(next_file_number_.load());
   edit->SetLastSequence(last_sequence_);
 
   builder->Apply(edit);
@@ -1881,18 +2111,19 @@ Status VersionSet::Recover(
     return Status::Corruption("CURRENT file corrupted");
   }
 
-  Log(options_->info_log, "Recovering from manifest file: %s\n",
+  Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+      "Recovering from manifest file: %s\n",
       manifest_filename.c_str());
 
   manifest_filename = dbname_ + "/" + manifest_filename;
   unique_ptr<SequentialFile> manifest_file;
   s = env_->NewSequentialFile(manifest_filename, &manifest_file,
-                              storage_options_);
+                              env_options_);
   if (!s.ok()) {
     return s;
   }
-  uint64_t manifest_file_size;
-  s = env_->GetFileSize(manifest_filename, &manifest_file_size);
+  uint64_t current_manifest_file_size;
+  s = env_->GetFileSize(manifest_filename, &current_manifest_file_size);
   if (!s.ok()) {
     return s;
   }
@@ -1904,9 +2135,9 @@ Status VersionSet::Recover(
   uint64_t next_file = 0;
   uint64_t last_sequence = 0;
   uint64_t log_number = 0;
-  uint64_t prev_log_number = 0;
+  uint64_t previous_log_number = 0;
   uint32_t max_column_family = 0;
-  std::unordered_map<uint32_t, Builder*> builders;
+  std::unordered_map<uint32_t, BaseReferencedVersionBuilder*> builders;
 
   // add default column family
   auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
@@ -1918,7 +2149,7 @@ Status VersionSet::Recover(
   default_cf_edit.SetColumnFamily(0);
   ColumnFamilyData* default_cfd =
       CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
-  builders.insert({0, new Builder(default_cfd)});
+  builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)});
 
   {
     VersionSet::LogReporter reporter;
@@ -1964,7 +2195,8 @@ Status VersionSet::Recover(
               {edit.column_family_, edit.column_family_name_});
         } else {
           cfd = CreateColumnFamily(cf_options->second, &edit);
-          builders.insert({edit.column_family_, new Builder(cfd)});
+          builders.insert(
+              {edit.column_family_, new BaseReferencedVersionBuilder(cfd)});
         }
       } else if (edit.is_column_family_drop_) {
         if (cf_in_builders) {
@@ -1997,7 +2229,7 @@ Status VersionSet::Recover(
         cfd = column_family_set_->GetColumnFamily(edit.column_family_);
         // this should never happen since cf_in_builders is true
         assert(cfd != nullptr);
-        if (edit.max_level_ >= cfd->current()->NumberLevels()) {
+        if (edit.max_level_ >= cfd->current()->storage_info()->num_levels()) {
           s = Status::InvalidArgument(
               "db has more levels than options.num_levels");
           break;
@@ -2008,13 +2240,13 @@ Status VersionSet::Recover(
         // to builder
         auto builder = builders.find(edit.column_family_);
         assert(builder != builders.end());
-        builder->second->Apply(&edit);
+        builder->second->version_builder()->Apply(&edit);
       }
 
       if (cfd != nullptr) {
         if (edit.has_log_number_) {
           if (cfd->GetLogNumber() > edit.log_number_) {
-            Log(options_->info_log,
+            Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log,
                 "MANIFEST corruption detected, but ignored - Log numbers in "
                 "records NOT monotonically increasing");
           } else {
@@ -2032,7 +2264,7 @@ Status VersionSet::Recover(
       }
 
       if (edit.has_prev_log_number_) {
-        prev_log_number = edit.prev_log_number_;
+        previous_log_number = edit.prev_log_number_;
         have_prev_log_number = true;
       }
 
@@ -2062,18 +2294,18 @@ Status VersionSet::Recover(
     }
 
     if (!have_prev_log_number) {
-      prev_log_number = 0;
+      previous_log_number = 0;
     }
 
     column_family_set_->UpdateMaxColumnFamily(max_column_family);
 
-    MarkFileNumberUsed(prev_log_number);
-    MarkFileNumberUsed(log_number);
+    MarkFileNumberUsedDuringRecovery(previous_log_number);
+    MarkFileNumberUsedDuringRecovery(log_number);
   }
 
   // there were some column families in the MANIFEST that weren't specified
   // in the argument. This is OK in read_only mode
-  if (read_only == false && column_families_not_found.size() > 0) {
+  if (read_only == false && !column_families_not_found.empty()) {
     std::string list_of_not_found;
     for (const auto& cf : column_families_not_found) {
       list_of_not_found += ", " + cf.second;
@@ -2086,47 +2318,48 @@ Status VersionSet::Recover(
 
   if (s.ok()) {
     for (auto cfd : *column_family_set_) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
       auto builders_iter = builders.find(cfd->GetID());
       assert(builders_iter != builders.end());
-      auto builder = builders_iter->second;
+      auto* builder = builders_iter->second->version_builder();
 
-      if (options_->max_open_files == -1) {
+      if (db_options_->max_open_files == -1) {
       // unlimited table cache. Pre-load table handle now.
       // Need to do it out of the mutex.
         builder->LoadTableHandlers();
       }
 
       Version* v = new Version(cfd, this, current_version_number_++);
-      builder->SaveTo(v);
+      builder->SaveTo(v->storage_info());
 
       // Install recovered version
-      std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
-      cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted);
-      v->ComputeCompactionScore(size_being_compacted);
-      v->UpdateFilesBySize();
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
       AppendVersion(cfd, v);
     }
 
-    manifest_file_size_ = manifest_file_size;
-    next_file_number_ = next_file + 1;
+    manifest_file_size_ = current_manifest_file_size;
+    next_file_number_.store(next_file + 1);
     last_sequence_ = last_sequence;
-    prev_log_number_ = prev_log_number;
+    prev_log_number_ = previous_log_number;
 
-    Log(options_->info_log, "Recovered from manifest file:%s succeeded,"
+    Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+        "Recovered from manifest file:%s succeeded,"
         "manifest_file_number is %lu, next_file_number is %lu, "
         "last_sequence is %lu, log_number is %lu,"
         "prev_log_number is %lu,"
         "max_column_family is %u\n",
-        manifest_filename.c_str(),
-        (unsigned long)manifest_file_number_,
-        (unsigned long)next_file_number_,
-        (unsigned long)last_sequence_,
-        (unsigned long)log_number,
-        (unsigned long)prev_log_number_,
+        manifest_filename.c_str(), (unsigned long)manifest_file_number_,
+        (unsigned long)next_file_number_.load(), (unsigned long)last_sequence_,
+        (unsigned long)log_number, (unsigned long)prev_log_number_,
         column_family_set_->GetMaxColumnFamily());
 
     for (auto cfd : *column_family_set_) {
-      Log(options_->info_log,
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
           "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
           cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
     }
@@ -2209,7 +2442,7 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
 #ifndef ROCKSDB_LITE
 Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
                                         const Options* options,
-                                        const EnvOptions& storage_options,
+                                        const EnvOptions& env_options,
                                         int new_levels) {
   if (new_levels <= 1) {
     return Status::InvalidArgument(
@@ -2217,10 +2450,11 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
   }
 
   ColumnFamilyOptions cf_options(*options);
-  std::shared_ptr<Cache> tc(NewLRUCache(
-      options->max_open_files - 10, options->table_cache_numshardbits,
-      options->table_cache_remove_scan_count_limit));
-  VersionSet versions(dbname, options, storage_options, tc.get());
+  std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
+                                        options->table_cache_numshardbits));
+  WriteController wc;
+  WriteBuffer wb(options->db_write_buffer_size);
+  VersionSet versions(dbname, options, env_options, tc.get(), &wb, &wc);
   Status status;
 
   std::vector<ColumnFamilyDescriptor> dummy;
@@ -2234,7 +2468,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
 
   Version* current_version =
       versions.GetColumnFamilySet()->GetDefault()->current();
-  int current_levels = current_version->NumberLevels();
+  auto* vstorage = current_version->storage_info();
+  int current_levels = vstorage->num_levels();
 
   if (current_levels <= new_levels) {
     return Status::OK();
@@ -2245,7 +2480,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
   int first_nonempty_level = -1;
   int first_nonempty_level_filenum = 0;
   for (int i = new_levels - 1; i < current_levels; i++) {
-    int file_num = current_version->NumLevelFiles(i);
+    int file_num = vstorage->NumLevelFiles(i);
     if (file_num != 0) {
       if (first_nonempty_level < 0) {
         first_nonempty_level = i;
@@ -2262,36 +2497,37 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
     }
   }
 
-  std::vector<FileMetaData*>* old_files_list = current_version->files_;
   // we need to allocate an array with the old number of levels size to
   // avoid SIGSEGV in WriteSnapshot()
   // however, all levels bigger or equal to new_levels will be empty
   std::vector<FileMetaData*>* new_files_list =
       new std::vector<FileMetaData*>[current_levels];
   for (int i = 0; i < new_levels - 1; i++) {
-    new_files_list[i] = old_files_list[i];
+    new_files_list[i] = vstorage->LevelFiles(i);
   }
 
   if (first_nonempty_level > 0) {
-    new_files_list[new_levels - 1] = old_files_list[first_nonempty_level];
+    new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level);
   }
 
-  delete[] current_version->files_;
-  current_version->files_ = new_files_list;
-  current_version->num_levels_ = new_levels;
+  delete[] vstorage -> files_;
+  vstorage->files_ = new_files_list;
+  vstorage->num_levels_ = new_levels;
 
+  MutableCFOptions mutable_cf_options(*options, ImmutableCFOptions(*options));
   VersionEdit ve;
-  port::Mutex dummy_mutex;
-  MutexLock l(&dummy_mutex);
-  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), &ve,
-                              &dummy_mutex, nullptr, true);
+  InstrumentedMutex dummy_mutex;
+  InstrumentedMutexLock l(&dummy_mutex);
+  return versions.LogAndApply(
+      versions.GetColumnFamilySet()->GetDefault(),
+      mutable_cf_options, &ve, &dummy_mutex, nullptr, true);
 }
 
 Status VersionSet::DumpManifest(Options& options, std::string& dscname,
                                 bool verbose, bool hex) {
   // Open the specified manifest file.
   unique_ptr<SequentialFile> file;
-  Status s = options.env->NewSequentialFile(dscname, &file, storage_options_);
+  Status s = options.env->NewSequentialFile(dscname, &file, env_options_);
   if (!s.ok()) {
     return s;
   }
@@ -2301,10 +2537,10 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
   bool have_last_sequence = false;
   uint64_t next_file = 0;
   uint64_t last_sequence = 0;
-  uint64_t prev_log_number = 0;
+  uint64_t previous_log_number = 0;
   int count = 0;
   std::unordered_map<uint32_t, std::string> comparators;
-  std::unordered_map<uint32_t, Builder*> builders;
+  std::unordered_map<uint32_t, BaseReferencedVersionBuilder*> builders;
 
   // add default column family
   VersionEdit default_cf_edit;
@@ -2312,7 +2548,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
   default_cf_edit.SetColumnFamily(0);
   ColumnFamilyData* default_cfd =
       CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit);
-  builders.insert({0, new Builder(default_cfd)});
+  builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)});
 
   {
     VersionSet::LogReporter reporter;
@@ -2351,7 +2587,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
           break;
         }
         cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit);
-        builders.insert({edit.column_family_, new Builder(cfd)});
+        builders.insert(
+            {edit.column_family_, new BaseReferencedVersionBuilder(cfd)});
       } else if (edit.is_column_family_drop_) {
         if (!cf_in_builders) {
           s = Status::Corruption(
@@ -2383,7 +2620,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
         // to builder
         auto builder = builders.find(edit.column_family_);
         assert(builder != builders.end());
-        builder->second->Apply(&edit);
+        builder->second->version_builder()->Apply(&edit);
       }
 
       if (cfd != nullptr && edit.has_log_number_) {
@@ -2391,7 +2628,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
       }
 
       if (edit.has_prev_log_number_) {
-        prev_log_number = edit.prev_log_number_;
+        previous_log_number = edit.prev_log_number_;
         have_prev_log_number = true;
       }
 
@@ -2422,23 +2659,22 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
     }
 
     if (!have_prev_log_number) {
-      prev_log_number = 0;
+      previous_log_number = 0;
     }
   }
 
   if (s.ok()) {
     for (auto cfd : *column_family_set_) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
       auto builders_iter = builders.find(cfd->GetID());
       assert(builders_iter != builders.end());
-      auto builder = builders_iter->second;
+      auto builder = builders_iter->second->version_builder();
 
       Version* v = new Version(cfd, this, current_version_number_++);
-      builder->SaveTo(v);
-      std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
-      cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted);
-      v->ComputeCompactionScore(size_being_compacted);
-      v->UpdateFilesBySize();
-      delete builder;
+      builder->SaveTo(v->storage_info());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
 
       printf("--------------- Column family \"%s\"  (ID %u) --------------\n",
              cfd->GetName().c_str(), (unsigned int)cfd->GetID());
@@ -2453,15 +2689,20 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
       delete v;
     }
 
-    next_file_number_ = next_file + 1;
+    // Free builders
+    for (auto& builder : builders) {
+      delete builder.second;
+    }
+
+    next_file_number_.store(next_file + 1);
     last_sequence_ = last_sequence;
-    prev_log_number_ = prev_log_number;
+    prev_log_number_ = previous_log_number;
 
     printf(
         "next_file_number %lu last_sequence "
         "%lu  prev_log_number %lu max_column_family %u\n",
-        (unsigned long)next_file_number_, (unsigned long)last_sequence,
-        (unsigned long)prev_log_number,
+        (unsigned long)next_file_number_.load(), (unsigned long)last_sequence,
+        (unsigned long)previous_log_number,
         column_family_set_->GetMaxColumnFamily());
   }
 
@@ -2469,9 +2710,11 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
 }
 #endif  // ROCKSDB_LITE
 
-void VersionSet::MarkFileNumberUsed(uint64_t number) {
-  if (next_file_number_ <= number) {
-    next_file_number_ = number + 1;
+void VersionSet::MarkFileNumberUsedDuringRecovery(uint64_t number) {
+  // only called during recovery which is single threaded, so this works because
+  // there can't be concurrent calls
+  if (next_file_number_.load(std::memory_order_relaxed) <= number) {
+    next_file_number_.store(number + 1, std::memory_order_relaxed);
   }
 }
 
@@ -2484,6 +2727,9 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
   // LogAndApply. Column family manipulations can only happen within LogAndApply
   // (the same single thread), so we're safe to iterate.
   for (auto cfd : *column_family_set_) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
     {
       // Store column family info
       VersionEdit edit;
@@ -2496,7 +2742,10 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
       edit.SetComparatorName(
           cfd->internal_comparator().user_comparator()->Name());
       std::string record;
-      edit.EncodeTo(&record);
+      if (!edit.EncodeTo(&record)) {
+        return Status::Corruption(
+            "Unable to Encode VersionEdit:" + edit.DebugString(true));
+      }
       Status s = log->AddRecord(record);
       if (!s.ok()) {
         return s;
@@ -2509,19 +2758,19 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
       edit.SetColumnFamily(cfd->GetID());
 
       for (int level = 0; level < cfd->NumberLevels(); level++) {
-        for (const auto& f : cfd->current()->files_[level]) {
-          edit.AddFile(level,
-                       f->number,
-                       f->file_size,
-                       f->smallest,
-                       f->largest,
-                       f->smallest_seqno,
-                       f->largest_seqno);
+        for (const auto& f :
+             cfd->current()->storage_info()->LevelFiles(level)) {
+          edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+                       f->fd.GetFileSize(), f->smallest, f->largest,
+                       f->smallest_seqno, f->largest_seqno);
         }
       }
       edit.SetLogNumber(cfd->GetLogNumber());
       std::string record;
-      edit.EncodeTo(&record);
+      if (!edit.EncodeTo(&record)) {
+        return Status::Corruption(
+            "Unable to Encode VersionEdit:" + edit.DebugString(true));
+      }
       Status s = log->AddRecord(record);
       if (!s.ok()) {
         return s;
@@ -2534,16 +2783,17 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
 
 // Opens the mainfest file and reads all records
 // till it finds the record we are looking for.
-bool VersionSet::ManifestContains(uint64_t manifest_file_number,
+bool VersionSet::ManifestContains(uint64_t manifest_file_num,
                                   const std::string& record) const {
-  std::string fname =
-      DescriptorFileName(dbname_, manifest_file_number);
-  Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
+  std::string fname = DescriptorFileName(dbname_, manifest_file_num);
+  Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+      "ManifestContains: checking %s\n", fname.c_str());
   unique_ptr<SequentialFile> file;
-  Status s = env_->NewSequentialFile(fname, &file, storage_options_);
+  Status s = env_->NewSequentialFile(fname, &file, env_options_);
   if (!s.ok()) {
-    Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
-    Log(options_->info_log,
+    Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+        "ManifestContains: %s\n", s.ToString().c_str());
+    Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
         "ManifestContains: is unable to reopen the manifest file  %s",
         fname.c_str());
     return false;
@@ -2558,72 +2808,142 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number,
       break;
     }
   }
-  Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
+  Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+      "ManifestContains: result = %d\n", result ? 1 : 0);
   return result;
 }
 
+uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
+                                     const Slice& end) {
+  // pre-condition
+  assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);
 
-uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
-  uint64_t result = 0;
-  for (int level = 0; level < v->NumberLevels(); level++) {
-    const std::vector<FileMetaData*>& files = v->files_[level];
-    for (size_t i = 0; i < files.size(); i++) {
-      if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <=
-          0) {
-        // Entire file is before "ikey", so just add the file size
-        result += files[i]->file_size;
-      } else if (v->cfd_->internal_comparator().Compare(files[i]->smallest,
-                                                        ikey) > 0) {
-        // Entire file is after "ikey", so ignore
-        if (level > 0) {
-          // Files other than level 0 are sorted by meta->smallest, so
-          // no further files in this level will contain data for
-          // "ikey".
-          break;
-        }
-      } else {
-        // "ikey" falls in the range for this table.  Add the
-        // approximate offset of "ikey" within the table.
-        TableReader* table_reader_ptr;
-        Iterator* iter = v->cfd_->table_cache()->NewIterator(
-            ReadOptions(), storage_options_, v->cfd_->internal_comparator(),
-            *(files[i]), &table_reader_ptr);
-        if (table_reader_ptr != nullptr) {
-          result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
-        }
-        delete iter;
+  uint64_t size = 0;
+  const auto* vstorage = v->storage_info();
+
+  for (int level = 0; level < vstorage->num_non_empty_levels(); level++) {
+    const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
+    if (!files_brief.num_files) {
+      // empty level, skip exploration
+      continue;
+    }
+
+    if (!level) {
+      // level 0 data is sorted order, handle the use case explicitly
+      size += ApproximateSizeLevel0(v, files_brief, start, end);
+      continue;
+    }
+
+    assert(level > 0);
+    assert(files_brief.num_files > 0);
+
+    // identify the file position for starting key
+    const uint64_t idx_start = FindFileInRange(
+        v->cfd_->internal_comparator(), files_brief, start,
+        /*start=*/0, static_cast<uint32_t>(files_brief.num_files - 1));
+    assert(idx_start < files_brief.num_files);
+
+    // scan all files from the starting position until the ending position
+    // inferred from the sorted order
+    for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
+      uint64_t val;
+      val = ApproximateSize(v, files_brief.files[i], end);
+      if (!val) {
+        // the files after this will not have the range
+        break;
+      }
+
+      size += val;
+
+      if (i == idx_start) {
+        // subtract the bytes needed to be scanned to get to the starting
+        // key
+        val = ApproximateSize(v, files_brief.files[i], start);
+        assert(size >= val);
+        size -= val;
       }
     }
   }
+
+  return size;
+}
+
+uint64_t VersionSet::ApproximateSizeLevel0(Version* v,
+                                           const LevelFilesBrief& files_brief,
+                                           const Slice& key_start,
+                                           const Slice& key_end) {
+  // level 0 files are not in sorted order, we need to iterate through
+  // the list to compute the total bytes that require scanning
+  uint64_t size = 0;
+  for (size_t i = 0; i < files_brief.num_files; i++) {
+    const uint64_t start = ApproximateSize(v, files_brief.files[i], key_start);
+    const uint64_t end = ApproximateSize(v, files_brief.files[i], key_end);
+    assert(end >= start);
+    size += end - start;
+  }
+  return size;
+}
+
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+                                     const Slice& key) {
+  // pre-condition
+  assert(v);
+
+  uint64_t result = 0;
+  if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) {
+    // Entire file is before "key", so just add the file size
+    result = f.fd.GetFileSize();
+  } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) {
+    // Entire file is after "key", so ignore
+    result = 0;
+  } else {
+    // "key" falls in the range for this table.  Add the
+    // approximate offset of "key" within the table.
+    TableReader* table_reader_ptr;
+    Iterator* iter = v->cfd_->table_cache()->NewIterator(
+        ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd,
+        &table_reader_ptr);
+    if (table_reader_ptr != nullptr) {
+      result = table_reader_ptr->ApproximateOffsetOf(key);
+    }
+    delete iter;
+  }
   return result;
 }
 
-void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_list) {
+void VersionSet::AddLiveFiles(std::vector<FileDescriptor>* live_list) {
   // pre-calculate space requirement
   int64_t total_files = 0;
   for (auto cfd : *column_family_set_) {
     Version* dummy_versions = cfd->dummy_versions();
     for (Version* v = dummy_versions->next_; v != dummy_versions;
          v = v->next_) {
-      for (int level = 0; level < v->NumberLevels(); level++) {
-        total_files += v->files_[level].size();
+      const auto* vstorage = v->storage_info();
+      for (int level = 0; level < vstorage->num_levels(); level++) {
+        total_files += vstorage->LevelFiles(level).size();
       }
     }
   }
 
   // just one time extension to the right size
-  live_list->reserve(live_list->size() + total_files);
+  live_list->reserve(live_list->size() + static_cast<size_t>(total_files));
 
   for (auto cfd : *column_family_set_) {
+    auto* current = cfd->current();
+    bool found_current = false;
     Version* dummy_versions = cfd->dummy_versions();
     for (Version* v = dummy_versions->next_; v != dummy_versions;
          v = v->next_) {
-      for (int level = 0; level < v->NumberLevels(); level++) {
-        for (const auto& f : v->files_[level]) {
-          live_list->push_back(f->number);
-        }
+      v->AddLiveFiles(live_list);
+      if (v == current) {
+        found_current = true;
       }
     }
+    if (!found_current && current != nullptr) {
+      // Should never happen unless it is a bug.
+      assert(false);
+      current->AddLiveFiles(live_list);
+    }
   }
 }
 
@@ -2631,38 +2951,42 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
   auto cfd = c->column_family_data();
   ReadOptions read_options;
   read_options.verify_checksums =
-    cfd->options()->verify_checksums_in_compaction;
+    c->mutable_cf_options()->verify_checksums_in_compaction;
   read_options.fill_cache = false;
 
   // Level-0 files have to be merged together.  For other levels,
   // we will make a concatenating iterator per level.
   // TODO(opt): use concatenating iterator for level-0 if there is no overlap
-  const int space = (c->level() == 0 ? c->inputs(0)->size() + 1 : 2);
-  Iterator** list = new Iterator*[space];
-  int num = 0;
-  for (int which = 0; which < 2; which++) {
-    if (!c->inputs(which)->empty()) {
-      if (c->level() + which == 0) {
-        for (const auto& file : *c->inputs(which)) {
+  const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
+                                              c->num_input_levels() - 1
+                                        : c->num_input_levels());
+  Iterator** list = new Iterator* [space];
+  size_t num = 0;
+  for (size_t which = 0; which < c->num_input_levels(); which++) {
+    if (c->input_levels(which)->num_files != 0) {
+      if (c->level(which) == 0) {
+        const LevelFilesBrief* flevel = c->input_levels(which);
+        for (size_t i = 0; i < flevel->num_files; i++) {
           list[num++] = cfd->table_cache()->NewIterator(
-              read_options, storage_options_compactions_,
-              cfd->internal_comparator(), *file, nullptr,
+              read_options, env_options_compactions_,
+              cfd->internal_comparator(), flevel->files[i].fd, nullptr,
               true /* for compaction */);
         }
       } else {
         // Create concatenating iterator for the files from this level
-        list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState(
-              cfd->table_cache(), read_options, storage_options_,
+        list[num++] = NewTwoLevelIterator(new LevelFileIteratorState(
+              cfd->table_cache(), read_options, env_options_,
               cfd->internal_comparator(), true /* for_compaction */,
               false /* prefix enabled */),
-            new Version::LevelFileNumIterator(cfd->internal_comparator(),
-                                              c->inputs(which)));
+            new LevelFileNumIterator(cfd->internal_comparator(),
+                                     c->input_levels(which)));
       }
     }
   }
   assert(num <= space);
-  Iterator* result = NewMergingIterator(
-      &c->column_family_data()->internal_comparator(), list, num);
+  Iterator* result =
+      NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
+                         static_cast<int>(num));
   delete[] list;
   return result;
 }
@@ -2672,46 +2996,43 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
 bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
 #ifndef NDEBUG
   Version* version = c->column_family_data()->current();
+  const VersionStorageInfo* vstorage = version->storage_info();
   if (c->input_version() != version) {
-    Log(options_->info_log,
-        "[%s] VerifyCompactionFileConsistency version mismatch",
+    Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+        "[%s] compaction output being applied to a different base version from"
+        " input version",
         c->column_family_data()->GetName().c_str());
-  }
 
-  // verify files in level
-  int level = c->level();
-  for (int i = 0; i < c->num_input_files(0); i++) {
-    uint64_t number = c->input(0,i)->number;
-
-    // look for this file in the current version
-    bool found = false;
-    for (unsigned int j = 0; j < version->files_[level].size(); j++) {
-      FileMetaData* f = version->files_[level][j];
-      if (f->number == number) {
-        found = true;
-        break;
+    if (vstorage->compaction_style_ == kCompactionStyleLevel &&
+        c->start_level() == 0 && c->num_input_levels() > 2U) {
+      // We are doing a L0->base_level compaction. The assumption is if
+      // base level is not L1, levels from L1 to base_level - 1 is empty.
+      // This is ensured by having one compaction from L0 going on at the
+      // same time in level-based compaction. So that during the time, no
+      // compaction/flush can put files to those levels.
+      for (int l = c->start_level() + 1; l < c->output_level(); l++) {
+        if (vstorage->NumLevelFiles(l) != 0) {
+          return false;
+        }
       }
     }
-    if (!found) {
-      return false; // input files non existant in current version
-    }
   }
-  // verify level+1 files
-  level++;
-  for (int i = 0; i < c->num_input_files(1); i++) {
-    uint64_t number = c->input(1,i)->number;
 
-    // look for this file in the current version
-    bool found = false;
-    for (unsigned int j = 0; j < version->files_[level].size(); j++) {
-      FileMetaData* f = version->files_[level][j];
-      if (f->number == number) {
-        found = true;
-        break;
+  for (size_t input = 0; input < c->num_input_levels(); ++input) {
+    int level = c->level(input);
+    for (size_t i = 0; i < c->num_input_files(input); ++i) {
+      uint64_t number = c->input(input, i)->fd.GetNumber();
+      bool found = false;
+      for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) {
+        FileMetaData* f = vstorage->files_[level][j];
+        if (f->fd.GetNumber() == number) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        return false;  // input files non existent in current version
       }
-    }
-    if (!found) {
-      return false; // input files non existant in current version
     }
   }
 #endif
@@ -2723,9 +3044,10 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
                                       ColumnFamilyData** cfd) {
   for (auto cfd_iter : *column_family_set_) {
     Version* version = cfd_iter->current();
-    for (int level = 0; level < version->NumberLevels(); level++) {
-      for (const auto& file : version->files_[level]) {
-        if (file->number == number) {
+    const auto* vstorage = version->storage_info();
+    for (int level = 0; level < vstorage->num_levels(); level++) {
+      for (const auto& file : vstorage->LevelFiles(level)) {
+        if (file->fd.GetNumber() == number) {
           *meta = file;
           *filelevel = level;
           *cfd = cfd_iter;
@@ -2739,13 +3061,24 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
 
 void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
   for (auto cfd : *column_family_set_) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
     for (int level = 0; level < cfd->NumberLevels(); level++) {
-      for (const auto& file : cfd->current()->files_[level]) {
+      for (const auto& file :
+           cfd->current()->storage_info()->LevelFiles(level)) {
         LiveFileMetaData filemetadata;
         filemetadata.column_family_name = cfd->GetName();
-        filemetadata.name = TableFileName("", file->number);
+        uint32_t path_id = file->fd.GetPathId();
+        if (path_id < db_options_->db_paths.size()) {
+          filemetadata.db_path = db_options_->db_paths[path_id].path;
+        } else {
+          assert(!db_options_->db_paths.empty());
+          filemetadata.db_path = db_options_->db_paths.back().path;
+        }
+        filemetadata.name = MakeTableFileName("", file->fd.GetNumber());
         filemetadata.level = level;
-        filemetadata.size = file->file_size;
+        filemetadata.size = file->fd.GetFileSize();
         filemetadata.smallestkey = file->smallest.user_key().ToString();
         filemetadata.largestkey = file->largest.user_key().ToString();
         filemetadata.smallest_seqno = file->smallest_seqno;
@@ -2756,25 +3089,50 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
   }
 }
 
-void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files) {
-  files->insert(files->end(), obsolete_files_.begin(), obsolete_files_.end());
-  obsolete_files_.clear();
+void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files,
+                                  uint64_t min_pending_output) {
+  std::vector<FileMetaData*> pending_files;
+  for (auto f : obsolete_files_) {
+    if (f->fd.GetNumber() < min_pending_output) {
+      files->push_back(f);
+    } else {
+      pending_files.push_back(f);
+    }
+  }
+  obsolete_files_.swap(pending_files);
 }
 
 ColumnFamilyData* VersionSet::CreateColumnFamily(
-    const ColumnFamilyOptions& options, VersionEdit* edit) {
+    const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
   assert(edit->is_column_family_add_);
 
   Version* dummy_versions = new Version(nullptr, this);
+  // Ref() dummy version once so that later we can call Unref() to delete it
+  // by avoiding calling "delete" explicitly (~Version is private)
+  dummy_versions->Ref();
   auto new_cfd = column_family_set_->CreateColumnFamily(
-      edit->column_family_name_, edit->column_family_, dummy_versions, options);
+      edit->column_family_name_, edit->column_family_, dummy_versions,
+      cf_options);
 
   Version* v = new Version(new_cfd, this, current_version_number_++);
 
+  // Fill level target base information.
+  v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(),
+                                        *new_cfd->GetLatestMutableCFOptions());
   AppendVersion(new_cfd, v);
-  new_cfd->CreateNewMemtable();
+  // GetLatestMutableCFOptions() is safe here without mutex since the
+  // cfd is not available to client
+  new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions());
   new_cfd->SetLogNumber(edit->log_number_);
   return new_cfd;
 }
 
+uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
+  uint64_t count = 0;
+  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+    count++;
+  }
+  return count;
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
index 13a1383..5c5f1fc 100644
--- a/src/rocksdb/db/version_set.h
+++ b/src/rocksdb/db/version_set.h
@@ -18,14 +18,17 @@
 // synchronization on all accesses.
 
 #pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
 #include <map>
 #include <memory>
 #include <set>
+#include <utility>
 #include <vector>
-#include <deque>
-#include <atomic>
-#include <limits>
+
 #include "db/dbformat.h"
+#include "db/version_builder.h"
 #include "db/version_edit.h"
 #include "port/port.h"
 #include "db/table_cache.h"
@@ -34,120 +37,141 @@
 #include "db/column_family.h"
 #include "db/log_reader.h"
 #include "db/file_indexer.h"
+#include "db/write_controller.h"
+#include "rocksdb/env.h"
+#include "util/instrumented_mutex.h"
 
 namespace rocksdb {
 
-namespace log { class Writer; }
+namespace log {
+class Writer;
+}
 
 class Compaction;
-class CompactionPicker;
 class Iterator;
 class LogBuffer;
 class LookupKey;
 class MemTable;
 class Version;
 class VersionSet;
+class WriteBuffer;
 class MergeContext;
 class ColumnFamilyData;
 class ColumnFamilySet;
 class TableCache;
+class MergeIteratorBuilder;
 
-// Return the smallest index i such that files[i]->largest >= key.
-// Return files.size() if there is no such file.
-// REQUIRES: "files" contains a sorted list of non-overlapping files.
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
 extern int FindFile(const InternalKeyComparator& icmp,
-                    const std::vector<FileMetaData*>& files,
-                    const Slice& key);
+                    const LevelFilesBrief& file_level, const Slice& key);
 
 // Returns true iff some file in "files" overlaps the user key range
 // [*smallest,*largest].
 // smallest==nullptr represents a key smaller than all keys in the DB.
 // largest==nullptr represents a key largest than all keys in the DB.
-// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
-//           in sorted order.
-extern bool SomeFileOverlapsRange(
-    const InternalKeyComparator& icmp,
-    bool disjoint_sorted_files,
-    const std::vector<FileMetaData*>& files,
-    const Slice* smallest_user_key,
-    const Slice* largest_user_key);
-
-class Version {
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+                                  bool disjoint_sorted_files,
+                                  const LevelFilesBrief& file_level,
+                                  const Slice* smallest_user_key,
+                                  const Slice* largest_user_key);
+
+// Generate LevelFilesBrief from vector<FdWithKeyRange*>
+// Would copy smallest_key and largest_key data to sequential memory
+// arena: Arena used to allocate the memory
+extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+                                      const std::vector<FileMetaData*>& files,
+                                      Arena* arena);
+
+class VersionStorageInfo {
  public:
-  // Append to *iters a sequence of iterators that will
-  // yield the contents of this Version when merged together.
-  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
-  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
-                    std::vector<Iterator*>* iters);
+  VersionStorageInfo(const InternalKeyComparator* internal_comparator,
+                     const Comparator* user_comparator, int num_levels,
+                     CompactionStyle compaction_style,
+                     VersionStorageInfo* src_vstorage);
+  ~VersionStorageInfo();
 
-  // Lookup the value for key.  If found, store it in *val and
-  // return OK.  Else return a non-OK status.  Fills *stats.
-  // Uses *operands to store merge_operator operations to apply later
-  // REQUIRES: lock is not held
-  struct GetStats {
-    FileMetaData* seek_file;
-    int seek_file_level;
-  };
-  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
-           Status* status, MergeContext* merge_context, GetStats* stats,
-           bool* value_found = nullptr);
+  void Reserve(int level, size_t size) { files_[level].reserve(size); }
 
-  // Adds "stats" into the current state.  Returns true if a new
-  // compaction may need to be triggered, false otherwise.
-  // REQUIRES: lock is held
-  bool UpdateStats(const GetStats& stats);
+  void AddFile(int level, FileMetaData* f);
+
+  void SetFinalized();
+
+  // Update num_non_empty_levels_.
+  void UpdateNumNonEmptyLevels();
+
+  void GenerateFileIndexer() {
+    file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+  }
+
+  // Update the accumulated stats from a file-meta.
+  void UpdateAccumulatedStats(FileMetaData* file_meta);
+
+  void ComputeCompensatedSizes();
 
   // Updates internal structures that keep track of compaction scores
   // We use compaction scores to figure out which compaction to do next
-  // REQUIRES: If Version is not yet saved to current_, it can be called without
-  // a lock. Once a version is saved to current_, call only with mutex held
-  void ComputeCompactionScore(std::vector<uint64_t>& size_being_compacted);
-
-  // Reference count management (so Versions do not disappear out from
-  // under live iterators)
-  void Ref();
-  // Decrease reference count. Delete the object if no reference left
-  // and return true. Otherwise, return false.
-  bool Unref();
+  // REQUIRES: db_mutex held!!
+  // TODO find a better way to pass compaction_options_fifo.
+  void ComputeCompactionScore(
+      const MutableCFOptions& mutable_cf_options,
+      const CompactionOptionsFIFO& compaction_options_fifo);
+
+  // This computes files_marked_for_compaction_ and is called by
+  // ComputeCompactionScore()
+  void ComputeFilesMarkedForCompaction();
+
+  // Generate level_files_brief_ from files_
+  void GenerateLevelFilesBrief();
+  // Sort all files for this version based on their file size and
+  // record results in files_by_size_. The largest files are listed first.
+  void UpdateFilesBySize();
 
-  // Returns true iff some level needs a compaction.
-  bool NeedsCompaction() const;
+  int MaxInputLevel() const;
 
   // Returns the maxmimum compaction score for levels 1 to max
-  double MaxCompactionScore() const { return max_compaction_score_; }
+  double max_compaction_score() const { return max_compaction_score_; }
 
   // See field declaration
-  int MaxCompactionScoreLevel() const { return max_compaction_score_level_; }
+  int max_compaction_score_level() const { return max_compaction_score_level_; }
+
+  // Return level number that has idx'th highest score
+  int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
+
+  // Return idx'th highest score
+  double CompactionScore(int idx) const { return compaction_score_[idx]; }
 
   void GetOverlappingInputs(
-      int level,
-      const InternalKey* begin,         // nullptr means before all keys
-      const InternalKey* end,           // nullptr means after all keys
+      int level, const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,               // nullptr means after all keys
       std::vector<FileMetaData*>* inputs,
-      int hint_index = -1,              // index of overlap file
-      int* file_index = nullptr);          // return index of overlap file
+      int hint_index = -1,         // index of overlap file
+      int* file_index = nullptr);  // return index of overlap file
 
   void GetOverlappingInputsBinarySearch(
       int level,
-      const Slice& begin,         // nullptr means before all keys
-      const Slice& end,           // nullptr means after all keys
+      const Slice& begin,  // nullptr means before all keys
+      const Slice& end,    // nullptr means after all keys
       std::vector<FileMetaData*>* inputs,
-      int hint_index,             // index of overlap file
-      int* file_index);           // return index of overlap file
+      int hint_index,    // index of overlap file
+      int* file_index);  // return index of overlap file
 
   void ExtendOverlappingInputs(
       int level,
-      const Slice& begin,         // nullptr means before all keys
-      const Slice& end,           // nullptr means after all keys
+      const Slice& begin,  // nullptr means before all keys
+      const Slice& end,    // nullptr means after all keys
       std::vector<FileMetaData*>* inputs,
-      unsigned int index);                 // start extending from this index
+      unsigned int index);  // start extending from this index
 
   // Returns true iff some file in the specified level overlaps
   // some part of [*smallest_user_key,*largest_user_key].
   // smallest_user_key==NULL represents a key smaller than all keys in the DB.
   // largest_user_key==NULL represents a key largest than all keys in the DB.
-  bool OverlapInLevel(int level,
-                      const Slice* smallest_user_key,
+  bool OverlapInLevel(int level, const Slice* smallest_user_key,
                       const Slice* largest_user_key);
 
   // Returns true iff the first or last file in inputs contains
@@ -157,27 +181,95 @@ class Version {
   bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
                              int level);
 
-
   // Return the level at which we should place a new memtable compaction
   // result that covers the range [smallest_user_key,largest_user_key].
-  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+  int PickLevelForMemTableOutput(const MutableCFOptions& mutable_cf_options,
+                                 const Slice& smallest_user_key,
                                  const Slice& largest_user_key);
 
-  int NumberLevels() const { return num_levels_; }
+  int num_levels() const { return num_levels_; }
 
-  // REQUIRES: lock is held
-  int NumLevelFiles(int level) const { return files_[level].size(); }
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  int num_non_empty_levels() const {
+    assert(finalized_);
+    return num_non_empty_levels_;
+  }
+
+  // REQUIRES: This version has been finalized.
+  // (CalculateBaseBytes() is called)
+  // This may or may not return number of level files. It is to keep backward
+  // compatible behavior in universal compaction.
+  int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
+
+  void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
+
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  int NumLevelFiles(int level) const {
+    assert(finalized_);
+    return static_cast<int>(files_[level].size());
+  }
 
   // Return the combined file size of all files at the specified level.
-  int64_t NumLevelBytes(int level) const;
+  uint64_t NumLevelBytes(int level) const;
+
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  const std::vector<FileMetaData*>& LevelFiles(int level) const {
+    return files_[level];
+  }
+
+  const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const {
+    assert(level < static_cast<int>(level_files_brief_.size()));
+    return level_files_brief_[level];
+  }
+
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  const std::vector<int>& FilesBySize(int level) const {
+    assert(finalized_);
+    return files_by_size_[level];
+  }
+
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
+      const {
+    assert(finalized_);
+    return files_marked_for_compaction_;
+  }
+
+  int base_level() const { return base_level_; }
+
+  // REQUIRES: lock is held
+  // Set the index that is used to offset into files_by_size_ to find
+  // the next compaction candidate file.
+  void SetNextCompactionIndex(int level, int index) {
+    next_file_to_compact_by_size_[level] = index;
+  }
+
+  // REQUIRES: lock is held
+  int NextCompactionIndex(int level) const {
+    return next_file_to_compact_by_size_[level];
+  }
+
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  const FileIndexer& file_indexer() const {
+    assert(finalized_);
+    return file_indexer_;
+  }
+
+  // Only the first few entries of files_by_size_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const size_t kNumberFilesToSort = 50;
 
   // Return a human-readable short (single-line) summary of the number
   // of files per level.  Uses *scratch as backing store.
   struct LevelSummaryStorage {
-    char buffer[100];
+    char buffer[1000];
   };
   struct FileSummaryStorage {
-    char buffer[1000];
+    char buffer[3000];
   };
   const char* LevelSummary(LevelSummaryStorage* scratch) const;
   // Return a human-readable short (single-line) summary of files
@@ -188,63 +280,63 @@ class Version {
   // file at a level >= 1.
   int64_t MaxNextLevelOverlappingBytes();
 
-  // Add all files listed in the current version to *live.
-  void AddLiveFiles(std::set<uint64_t>* live);
-
   // Return a human readable string that describes this version's contents.
   std::string DebugString(bool hex = false) const;
 
-  // Returns the version nuber of this version
-  uint64_t GetVersionNumber() const { return version_number_; }
-
-  // REQUIRES: lock is held
-  // On success, *props will be populated with all SSTables' table properties.
-  // The keys of `props` are the sst file name, the values of `props` are the
-  // tables' propertis, represented as shared_ptr.
-  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+  uint64_t GetAverageValueSize() const {
+    if (accumulated_num_non_deletions_ == 0) {
+      return 0;
+    }
+    assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+    assert(accumulated_file_size_ > 0);
+    return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+           accumulated_file_size_ /
+           (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+  }
 
-  // used to sort files by size
-  struct Fsize {
-    int index;
-    FileMetaData* file;
-  };
+  uint64_t GetEstimatedActiveKeys() const;
 
- private:
-  friend class Compaction;
-  friend class VersionSet;
-  friend class DBImpl;
-  friend class ColumnFamilyData;
-  friend class CompactionPicker;
-  friend class LevelCompactionPicker;
-  friend class UniversalCompactionPicker;
+  // re-initializes the index that is used to offset into files_by_size_
+  // to find the next compaction candidate file.
+  void ResetNextCompactionIndex(int level) {
+    next_file_to_compact_by_size_[level] = 0;
+  }
 
-  class LevelFileNumIterator;
-  class LevelFileIteratorState;
+  const InternalKeyComparator* InternalComparator() {
+    return internal_comparator_;
+  }
 
-  bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter,
-                      const Slice& internal_prefix) const;
+  // Returns maximum total bytes of data on a given level.
+  uint64_t MaxBytesForLevel(int level) const;
 
-  // Sort all files for this version based on their file size and
-  // record results in files_by_size_. The largest files are listed first.
-  void UpdateFilesBySize();
+  // Must be called after any change to MutableCFOptions.
+  void CalculateBaseBytes(const ImmutableCFOptions& ioptions,
+                          const MutableCFOptions& options);
 
-  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
+ private:
   const InternalKeyComparator* internal_comparator_;
   const Comparator* user_comparator_;
-  TableCache* table_cache_;
-  const MergeOperator* merge_operator_;
-  Logger* info_log_;
-  Statistics* db_statistics_;
-  VersionSet* vset_;            // VersionSet to which this Version belongs
-  Version* next_;               // Next version in linked list
-  Version* prev_;               // Previous version in linked list
-  int refs_;                    // Number of live refs to this version
-  int num_levels_;              // Number of levels
+  int num_levels_;            // Number of levels
+  int num_non_empty_levels_;  // Number of levels. Any level larger than it
+                              // is guaranteed to be empty.
+  // Per-level max bytes
+  std::vector<uint64_t> level_max_bytes_;
+
+  // A short brief metadata of files per level
+  autovector<rocksdb::LevelFilesBrief> level_files_brief_;
+  FileIndexer file_indexer_;
+  Arena arena_;  // Used to allocate space for file_levels_
+
+  CompactionStyle compaction_style_;
 
   // List of files per level, files in each level are arranged
   // in increasing order of keys
   std::vector<FileMetaData*>* files_;
 
+  // Level that L0 data should be compacted to. All levels < base_level_ should
+  // be empty. -1 if it is not level-compaction so it's not applicable.
+  int base_level_;
+
   // A list for the same set of files that are stored in files_,
   // but files in each level are now sorted based on file
   // size. The file with the largest size is at the front.
@@ -260,11 +352,12 @@ class Version {
   // that on a running system, we need to look at only the first
   // few largest files because a new version is created every few
   // seconds/minutes (because of concurrent compactions).
-  static const int number_of_files_to_sort_ = 50;
+  static const size_t number_of_files_to_sort_ = 50;
 
-  // Next file to compact based on seek stats.
-  FileMetaData* file_to_compact_;
-  int file_to_compact_level_;
+  // This vector contains list of files marked for compaction and also not
+  // currently being compacted. It is protected by DB mutex. It is calculated in
+  // ComputeCompactionScore()
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
 
   // Level that should be compacted next and its compaction score.
   // Score < 1 means compaction is not strictly needed.  These fields
@@ -273,24 +366,151 @@ class Version {
   // These are used to pick the best compaction level
   std::vector<double> compaction_score_;
   std::vector<int> compaction_level_;
-  double max_compaction_score_; // max score in l1 to ln-1
-  int max_compaction_score_level_; // level on which max score occurs
+  double max_compaction_score_ = 0.0;   // max score in l1 to ln-1
+  int max_compaction_score_level_ = 0;  // level on which max score occurs
+  int l0_delay_trigger_count_ = 0;  // Count used to trigger slow down and stop
+                                    // for number of L0 files.
+
+  // the following are the sampled temporary stats.
+  // the current accumulated size of sampled files.
+  uint64_t accumulated_file_size_;
+  // the current accumulated size of all raw keys based on the sampled files.
+  uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw keys based on the sampled files.
+  uint64_t accumulated_raw_value_size_;
+  // total number of non-deletion entries
+  uint64_t accumulated_num_non_deletions_;
+  // total number of deletion entries
+  uint64_t accumulated_num_deletions_;
+  // the number of samples
+  uint64_t num_samples_;
+
+  bool finalized_;
+
+  friend class Version;
+  friend class VersionSet;
+  // No copying allowed
+  VersionStorageInfo(const VersionStorageInfo&) = delete;
+  void operator=(const VersionStorageInfo&) = delete;
+};
+
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
+                    MergeIteratorBuilder* merger_iter_builder);
+
+  // Lookup the value for key.  If found, store it in *val and
+  // return OK.  Else return a non-OK status.
+  // Uses *operands to store merge_operator operations to apply later
+  // REQUIRES: lock is not held
+  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
+           Status* status, MergeContext* merge_context,
+           bool* value_found = nullptr);
+
+  // Loads some stats information from files. Call without mutex held. It needs
+  // to be called before applying the version to the version set.
+  void PrepareApply(const MutableCFOptions& mutable_cf_options);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
+
+  // Add all files listed in the current version to *live.
+  void AddLiveFiles(std::vector<FileDescriptor>* live);
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  // Returns the version nuber of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contains the table properties of the file
+  // specified in "file_meta".  If the file name of "file_meta" is
+  // known ahread, passing it by a non-null "fname" can save a
+  // file-name conversion.
+  Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                            const FileMetaData* file_meta,
+                            const std::string* fname = nullptr);
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the sst file name, the values of `props` are the
+  // tables' propertis, represented as shared_ptr.
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+
+  uint64_t GetEstimatedActiveKeys() {
+    return storage_info_.GetEstimatedActiveKeys();
+  }
+
+  size_t GetMemoryUsageByTableReaders();
+
+  ColumnFamilyData* cfd() const { return cfd_; }
+
+  // Return the next Version in the linked list. Used for debug only
+  Version* TEST_Next() const {
+    return next_;
+  }
+
+  VersionStorageInfo* storage_info() { return &storage_info_; }
+
+  VersionSet* version_set() { return vset_; }
+
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+ private:
+  Env* env_;
+  friend class VersionSet;
+
+  const InternalKeyComparator* internal_comparator() const {
+    return storage_info_.internal_comparator_;
+  }
+  const Comparator* user_comparator() const {
+    return storage_info_.user_comparator_;
+  }
+
+  bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter,
+                      const Slice& internal_prefix) const;
+
+  // The helper function of UpdateAccumulatedStats, which may fill the missing
+  // fields of file_mata from its associated TableProperties.
+  // Returns true if it does initialize FileMetaData.
+  bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+  // Update the accumulated stats associated with the current version.
+  // This accumulated stats will be used in compaction.
+  void UpdateAccumulatedStats();
+
+  // Sort all files for this version based on their file size and
+  // record results in files_by_size_. The largest files are listed first.
+  void UpdateFilesBySize();
+
+  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
+  Logger* info_log_;
+  Statistics* db_statistics_;
+  TableCache* table_cache_;
+  const MergeOperator* merge_operator_;
+
+  VersionStorageInfo storage_info_;
+  VersionSet* vset_;            // VersionSet to which this Version belongs
+  Version* next_;               // Next version in linked list
+  Version* prev_;               // Previous version in linked list
+  int refs_;                    // Number of live refs to this version
 
   // A version number that uniquely represents this version. This is
   // used for debugging and logging purposes only.
   uint64_t version_number_;
 
   Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
-  FileIndexer file_indexer_;
 
   ~Version();
 
-  // re-initializes the index that is used to offset into files_by_size_
-  // to find the next compaction candidate file.
-  void ResetNextCompactionIndex(int level) {
-    next_file_to_compact_by_size_[level] = 0;
-  }
-
   // No copying allowed
   Version(const Version&);
   void operator=(const Version&);
@@ -298,8 +518,9 @@ class Version {
 
 class VersionSet {
  public:
-  VersionSet(const std::string& dbname, const DBOptions* options,
-             const EnvOptions& storage_options, Cache* table_cache);
+  VersionSet(const std::string& dbname, const DBOptions* db_options,
+             const EnvOptions& env_options, Cache* table_cache,
+             WriteBuffer* write_buffer, WriteController* write_controller);
   ~VersionSet();
 
   // Apply *edit to the current version to form a new descriptor that
@@ -308,11 +529,12 @@ class VersionSet {
   // column_family_options has to be set if edit is column family add
   // REQUIRES: *mu is held on entry.
   // REQUIRES: no other thread concurrently calls LogAndApply()
-  Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
-                     port::Mutex* mu, Directory* db_directory = nullptr,
-                     bool new_descriptor_log = false,
-                     const ColumnFamilyOptions* column_family_options =
-                         nullptr);
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+      InstrumentedMutex* mu, Directory* db_directory = nullptr,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr);
 
   // Recover the last saved descriptor from persistent storage.
   // If read_only == true, Recover() will not complain if some column families
@@ -337,7 +559,7 @@ class VersionSet {
   // among [4-6] contains files.
   static Status ReduceNumberOfLevels(const std::string& dbname,
                                      const Options* options,
-                                     const EnvOptions& storage_options,
+                                     const EnvOptions& env_options,
                                      int new_levels);
 
   // printf contents (for debugging)
@@ -347,23 +569,16 @@ class VersionSet {
 #endif  // ROCKSDB_LITE
 
   // Return the current manifest file number
-  uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+  uint64_t manifest_file_number() const { return manifest_file_number_; }
 
-  uint64_t PendingManifestFileNumber() const {
+  uint64_t pending_manifest_file_number() const {
     return pending_manifest_file_number_;
   }
 
+  uint64_t current_next_file_number() const { return next_file_number_.load(); }
+
   // Allocate and return a new file number
-  uint64_t NewFileNumber() { return next_file_number_++; }
-
-  // Arrange to reuse "file_number" unless a newer file number has
-  // already been allocated.
-  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
-  void ReuseFileNumber(uint64_t file_number) {
-    if (next_file_number_ == file_number + 1) {
-      next_file_number_ = file_number;
-    }
-  }
+  uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
 
   // Return the last sequence number.
   uint64_t LastSequence() const {
@@ -377,11 +592,12 @@ class VersionSet {
   }
 
   // Mark the specified file number as used.
-  void MarkFileNumberUsed(uint64_t number);
+  // REQUIRED: this is only called during single-threaded recovery
+  void MarkFileNumberUsedDuringRecovery(uint64_t number);
 
   // Return the log file number for the log file that is currently
   // being compacted, or zero if there is no such log file.
-  uint64_t PrevLogNumber() const { return prev_log_number_; }
+  uint64_t prev_log_number() const { return prev_log_number_; }
 
   // Returns the minimum log number such that all
   // log numbers less than or equal to it can be deleted
@@ -400,14 +616,13 @@ class VersionSet {
   Iterator* MakeInputIterator(Compaction* c);
 
   // Add all files listed in any live version to *live.
-  void AddLiveFiles(std::vector<uint64_t>* live_list);
+  void AddLiveFiles(std::vector<FileDescriptor>* live_list);
 
-  // Return the approximate offset in the database of the data for
-  // "key" as of version "v".
-  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+  // Return the approximate size of data to be scanned for range [start, end)
+  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end);
 
   // Return the size of the current manifest file
-  uint64_t ManifestFileSize() const { return manifest_file_size_; }
+  uint64_t manifest_file_size() const { return manifest_file_size_; }
 
   // verify that the files that we started with for a compaction
   // still exist in the current version and in the same original level.
@@ -418,26 +633,36 @@ class VersionSet {
   Status GetMetadataForFile(uint64_t number, int* filelevel,
                             FileMetaData** metadata, ColumnFamilyData** cfd);
 
-  void GetLiveFilesMetaData(
-    std::vector<LiveFileMetaData> *metadata);
+  void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
 
-  void GetObsoleteFiles(std::vector<FileMetaData*>* files);
+  void GetObsoleteFiles(std::vector<FileMetaData*>* files,
+                        uint64_t min_pending_output);
 
   ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+  const EnvOptions& env_options() { return env_options_; }
+
+  static uint64_t GetNumLiveVersions(Version* dummy_versions);
 
  private:
-  class Builder;
   struct ManifestWriter;
 
   friend class Version;
+  friend class DBImpl;
 
   struct LogReporter : public log::Reader::Reporter {
     Status* status;
-    virtual void Corruption(size_t bytes, const Status& s) {
+    virtual void Corruption(size_t bytes, const Status& s) override {
       if (this->status->ok()) *this->status = s;
     }
   };
 
+  // ApproximateSize helper
+  uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
+                                 const Slice& start, const Slice& end);
+
+  uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+                           const Slice& key);
+
   // Save current contents to *log
   Status WriteSnapshot(log::Writer* log);
 
@@ -446,15 +671,15 @@ class VersionSet {
   bool ManifestContains(uint64_t manifest_file_number,
                         const std::string& record) const;
 
-  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
                                        VersionEdit* edit);
 
   std::unique_ptr<ColumnFamilySet> column_family_set_;
 
   Env* const env_;
   const std::string dbname_;
-  const DBOptions* const options_;
-  uint64_t next_file_number_;
+  const DBOptions* const db_options_;
+  std::atomic<uint64_t> next_file_number_;
   uint64_t manifest_file_number_;
   uint64_t pending_manifest_file_number_;
   std::atomic<uint64_t> last_sequence_;
@@ -474,20 +699,20 @@ class VersionSet {
 
   std::vector<FileMetaData*> obsolete_files_;
 
-  // storage options for all reads and writes except compactions
-  const EnvOptions& storage_options_;
+  // env options for all reads and writes except compactions
+  const EnvOptions& env_options_;
 
-  // storage options used for compactions. This is a copy of
-  // storage_options_ but with readaheads set to readahead_compactions_.
-  const EnvOptions storage_options_compactions_;
+  // env options used for compactions. This is a copy of
+  // env_options_ but with readaheads set to readahead_compactions_.
+  const EnvOptions env_options_compactions_;
 
   // No copying allowed
   VersionSet(const VersionSet&);
   void operator=(const VersionSet&);
 
   void LogAndApplyCFHelper(VersionEdit* edit);
-  void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
-                         VersionEdit* edit, port::Mutex* mu);
+  void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v,
+                         VersionEdit* edit, InstrumentedMutex* mu);
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
index 1af95dd..202bb1c 100644
--- a/src/rocksdb/db/version_set_test.cc
+++ b/src/rocksdb/db/version_set_test.cc
@@ -14,14 +14,15 @@
 
 namespace rocksdb {
 
-class FindFileTest {
+class GenerateLevelFilesBriefTest : public testing::Test {
  public:
   std::vector<FileMetaData*> files_;
-  bool disjoint_sorted_files_;
+  LevelFilesBrief file_level_;
+  Arena arena_;
 
-  FindFileTest() : disjoint_sorted_files_(true) { }
+  GenerateLevelFilesBriefTest() { }
 
-  ~FindFileTest() {
+  ~GenerateLevelFilesBriefTest() {
     for (unsigned int i = 0; i < files_.size(); i++) {
       delete files_[i];
     }
@@ -31,29 +32,269 @@ class FindFileTest {
            SequenceNumber smallest_seq = 100,
            SequenceNumber largest_seq = 100) {
     FileMetaData* f = new FileMetaData;
-    f->number = files_.size() + 1;
+    f->fd = FileDescriptor(files_.size() + 1, 0, 0);
     f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
     f->largest = InternalKey(largest, largest_seq, kTypeValue);
     files_.push_back(f);
   }
 
+  int Compare() {
+    int diff = 0;
+    for (size_t i = 0; i < files_.size(); i++) {
+      if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
+        diff++;
+      }
+    }
+    return diff;
+  }
+};
+
+TEST_F(GenerateLevelFilesBriefTest, Empty) {
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(0u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Single) {
+  Add("p", "q");
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(1u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Multiple) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(4u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+class CountingLogger : public Logger {
+ public:
+  CountingLogger() : log_count(0) {}
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override { log_count++; }
+  int log_count;
+};
+
+Options GetOptionsWithNumLevels(int num_levels,
+                                std::shared_ptr<CountingLogger> logger) {
+  Options opt;
+  opt.num_levels = num_levels;
+  opt.info_log = logger;
+  return opt;
+}
+
+class VersionStorageInfoTest : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  std::shared_ptr<CountingLogger> logger_;
+  Options options_;
+  ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  VersionStorageInfoTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        logger_(new CountingLogger()),
+        options_(GetOptionsWithNumLevels(6, logger_)),
+        ioptions_(options_),
+        mutable_cf_options_(options_, ioptions_),
+        vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, nullptr) {}
+
+  ~VersionStorageInfoTest() {
+    for (int i = 0; i < vstorage_.num_levels(); i++) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData;
+    f->fd = FileDescriptor(file_number, 0, file_size);
+    f->smallest = GetInternalKey(smallest, 0);
+    f->largest = GetInternalKey(largest, 0);
+    f->compensated_file_size = file_size;
+    f->refs = 0;
+    f->num_entries = 0;
+    f->num_deletions = 0;
+    vstorage_.AddFile(level, f);
+  }
+};
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.max_bytes_for_level_base = 10;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+  Add(4, 100U, "1", "2");
+  Add(5, 101U, "1", "2");
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U);
+
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+  Add(5, 1U, "1", "2", 500U);
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(vstorage_.base_level(), 5);
+
+  Add(5, 2U, "3", "4", 550U);
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 210U);
+  ASSERT_EQ(vstorage_.base_level(), 4);
+
+  Add(4, 3U, "3", "4", 550U);
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 210U);
+  ASSERT_EQ(vstorage_.base_level(), 4);
+
+  Add(3, 4U, "3", "4", 250U);
+  Add(3, 5U, "5", "7", 300U);
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(1, logger_->log_count);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 201U);
+  ASSERT_EQ(vstorage_.base_level(), 3);
+
+  Add(1, 6U, "3", "4", 5U);
+  Add(1, 7U, "8", "9", 5U);
+  logger_->log_count = 0;
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(1, logger_->log_count);
+  ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
+  ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 201U);
+  ASSERT_EQ(vstorage_.base_level(), 1);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 100;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 2;
+  Add(0, 1U, "1", "2", 50U);
+  Add(1, 2U, "1", "2", 50U);
+  Add(2, 3U, "1", "2", 500U);
+  Add(3, 4U, "1", "2", 500U);
+  Add(4, 5U, "1", "2", 1700U);
+  Add(5, 6U, "1", "2", 500U);
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
+  ASSERT_EQ(vstorage_.base_level(), 1);
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
+  uint64_t kOneGB = 1000U * 1000U * 1000U;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  Add(0, 1U, "1", "2", 50U);
+  Add(3, 4U, "1", "2", 32U * kOneGB);
+  Add(4, 5U, "1", "2", 500U * kOneGB);
+  Add(5, 6U, "1", "2", 3000U * kOneGB);
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 3U * kOneGB);
+  ASSERT_EQ(vstorage_.base_level(), 2);
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+class FindLevelFileTest : public testing::Test {
+ public:
+  LevelFilesBrief file_level_;
+  bool disjoint_sorted_files_;
+  Arena arena_;
+
+  FindLevelFileTest() : disjoint_sorted_files_(true) { }
+
+  ~FindLevelFileTest() {
+  }
+
+  void LevelFileInit(size_t num = 0) {
+    char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange));
+    file_level_.files = new (mem)FdWithKeyRange[num];
+    file_level_.num_files = 0;
+  }
+
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue);
+    InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue);
+
+    Slice smallest_slice = smallest_key.Encode();
+    Slice largest_slice = largest_key.Encode();
+
+    char* mem = arena_.AllocateAligned(
+        smallest_slice.size() + largest_slice.size());
+    memcpy(mem, smallest_slice.data(), smallest_slice.size());
+    memcpy(mem + smallest_slice.size(), largest_slice.data(),
+        largest_slice.size());
+
+    // add to file_level_
+    size_t num = file_level_.num_files;
+    auto& file = file_level_.files[num];
+    file.fd = FileDescriptor(num + 1, 0, 0);
+    file.smallest_key = Slice(mem, smallest_slice.size());
+    file.largest_key = Slice(mem + smallest_slice.size(),
+        largest_slice.size());
+    file_level_.num_files++;
+  }
+
   int Find(const char* key) {
     InternalKey target(key, 100, kTypeValue);
     InternalKeyComparator cmp(BytewiseComparator());
-    return FindFile(cmp, files_, target.Encode());
+    return FindFile(cmp, file_level_, target.Encode());
   }
 
   bool Overlaps(const char* smallest, const char* largest) {
     InternalKeyComparator cmp(BytewiseComparator());
     Slice s(smallest != nullptr ? smallest : "");
     Slice l(largest != nullptr ? largest : "");
-    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_,
                                  (smallest != nullptr ? &s : nullptr),
                                  (largest != nullptr ? &l : nullptr));
   }
 };
 
-TEST(FindFileTest, Empty) {
+TEST_F(FindLevelFileTest, LevelEmpty) {
+  LevelFileInit(0);
+
   ASSERT_EQ(0, Find("foo"));
   ASSERT_TRUE(! Overlaps("a", "z"));
   ASSERT_TRUE(! Overlaps(nullptr, "z"));
@@ -61,7 +302,9 @@ TEST(FindFileTest, Empty) {
   ASSERT_TRUE(! Overlaps(nullptr, nullptr));
 }
 
-TEST(FindFileTest, Single) {
+TEST_F(FindLevelFileTest, LevelSingle) {
+  LevelFileInit(1);
+
   Add("p", "q");
   ASSERT_EQ(0, Find("a"));
   ASSERT_EQ(0, Find("p"));
@@ -91,8 +334,9 @@ TEST(FindFileTest, Single) {
   ASSERT_TRUE(Overlaps(nullptr, nullptr));
 }
 
+TEST_F(FindLevelFileTest, LevelMultiple) {
+  LevelFileInit(4);
 
-TEST(FindFileTest, Multiple) {
   Add("150", "200");
   Add("200", "250");
   Add("300", "350");
@@ -130,7 +374,9 @@ TEST(FindFileTest, Multiple) {
   ASSERT_TRUE(Overlaps("450", "500"));
 }
 
-TEST(FindFileTest, MultipleNullBoundaries) {
+TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) {
+  LevelFileInit(4);
+
   Add("150", "200");
   Add("200", "250");
   Add("300", "350");
@@ -150,7 +396,9 @@ TEST(FindFileTest, MultipleNullBoundaries) {
   ASSERT_TRUE(Overlaps("450", nullptr));
 }
 
-TEST(FindFileTest, OverlapSequenceChecks) {
+TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) {
+  LevelFileInit(1);
+
   Add("200", "200", 5000, 3000);
   ASSERT_TRUE(! Overlaps("199", "199"));
   ASSERT_TRUE(! Overlaps("201", "300"));
@@ -159,7 +407,9 @@ TEST(FindFileTest, OverlapSequenceChecks) {
   ASSERT_TRUE(Overlaps("200", "210"));
 }
 
-TEST(FindFileTest, OverlappingFiles) {
+TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
+  LevelFileInit(2);
+
   Add("150", "600");
   Add("400", "500");
   disjoint_sorted_files_ = false;
@@ -180,5 +430,6 @@ TEST(FindFileTest, OverlappingFiles) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc
new file mode 100644
index 0000000..5651bae
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.cc
@@ -0,0 +1,470 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <vector>
+#include <memory>
+
+#include "db/filename.h"
+#include "db/transaction_log_impl.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/sync_point.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in db dir, then get sorted files from archived
+  // dir, to avoid a race condition where a log file is moved to archived
+  // dir in between.
+  Status s;
+  // list wal files in main db dir.
+  VectorLogPtr logs;
+  s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reproduce the race condition where a log file is moved
+  // to archived dir, between these two sync points, used in
+  // (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+  files.clear();
+  // list wal files in archive dir.
+  std::string archivedir = ArchivalDirectory(db_options_.wal_dir);
+  if (env_->FileExists(archivedir)) {
+    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t latest_archived_log_number = 0;
+  if (!files.empty()) {
+    latest_archived_log_number = files.back()->LogNumber();
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "Latest Archived log: %" PRIu64,
+        latest_archived_log_number);
+  }
+
+  files.reserve(files.size() + logs.size());
+  for (auto& log : logs) {
+    if (log->LogNumber() > latest_archived_log_number) {
+      files.push_back(std::move(log));
+    } else {
+      // When the race condition happens, we could see the
+      // same log in both db dir and archived dir. Simply
+      // ignore the one in db dir. Note that, if we read
+      // archived dir first, we would have missed the log file.
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+          "%s already moved to archive", log->PathName().c_str());
+    }
+  }
+
+  return s;
+}
+
+Status WalManager::GetUpdatesSince(
+    SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options,
+    VersionSet* version_set) {
+
+  //  Get all sorted Wal Files.
+  //  Do binary search and open files and find the seq number.
+
+  std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+  Status s = GetSortedWalFiles(*wal_files);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = RetainProbableWalFiles(*wal_files, seq);
+  if (!s.ok()) {
+    return s;
+  }
+  iter->reset(new TransactionLogIteratorImpl(
+      db_options_.wal_dir, &db_options_, read_options, env_options_, seq,
+      std::move(wal_files), version_set));
+  return (*iter)->status();
+}
+
+// 1. Go through all archived files and
+//    a. if ttl is enabled, delete outdated files
+//    b. if archive size limit is enabled, delete empty files,
+//        compute file number and size.
+// 2. If size limit is enabled:
+//    a. compute how many files should be deleted
+//    b. get sorted non-empty archived logs
+//    c. delete what should be deleted
+void WalManager::PurgeObsoleteWALFiles() {
+  bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0;
+  bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0;
+  if (!ttl_enabled && !size_limit_enabled) {
+    return;
+  }
+
+  int64_t current_time;
+  Status s = env_->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "Can't get current time: %s", s.ToString().c_str());
+    assert(false);
+    return;
+  }
+  uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+  uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
+                                     ? db_options_.WAL_ttl_seconds / 2
+                                     : kDefaultIntervalToDeleteObsoleteWAL;
+
+  if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+    return;
+  }
+
+  purge_wal_files_last_run_ = now_seconds;
+
+  std::string archival_dir = ArchivalDirectory(db_options_.wal_dir);
+  std::vector<std::string> files;
+  s = env_->GetChildren(archival_dir, &files);
+  if (!s.ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "Can't get archive files: %s", s.ToString().c_str());
+    assert(false);
+    return;
+  }
+
+  size_t log_files_num = 0;
+  uint64_t log_file_size = 0;
+
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = archival_dir + "/" + f;
+      if (ttl_enabled) {
+        uint64_t file_m_time;
+        s = env_->GetFileModificationTime(file_path, &file_m_time);
+        if (!s.ok()) {
+          Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+              "Can't get file mod time: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          continue;
+        }
+        if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
+          s = env_->DeleteFile(file_path);
+          if (!s.ok()) {
+            Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+                "Can't delete file: %s: %s",
+                file_path.c_str(), s.ToString().c_str());
+            continue;
+          } else {
+            MutexLock l(&read_first_record_cache_mutex_);
+            read_first_record_cache_.erase(number);
+          }
+          continue;
+        }
+      }
+
+      if (size_limit_enabled) {
+        uint64_t file_size;
+        s = env_->GetFileSize(file_path, &file_size);
+        if (!s.ok()) {
+          Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+              "Unable to get file size: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          return;
+        } else {
+          if (file_size > 0) {
+            log_file_size = std::max(log_file_size, file_size);
+            ++log_files_num;
+          } else {
+            s = env_->DeleteFile(file_path);
+            if (!s.ok()) {
+              Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+                  "Unable to delete file: %s: %s",
+                  file_path.c_str(), s.ToString().c_str());
+              continue;
+            } else {
+              MutexLock l(&read_first_record_cache_mutex_);
+              read_first_record_cache_.erase(number);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (0 == log_files_num || !size_limit_enabled) {
+    return;
+  }
+
+  size_t const files_keep_num =
+      db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size;
+  if (log_files_num <= files_keep_num) {
+    return;
+  }
+
+  size_t files_del_num = log_files_num - files_keep_num;
+  VectorLogPtr archived_logs;
+  GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+
+  if (files_del_num > archived_logs.size()) {
+    Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+        "Trying to delete more archived log files than "
+        "exist. Deleting all");
+    files_del_num = archived_logs.size();
+  }
+
+  for (size_t i = 0; i < files_del_num; ++i) {
+    std::string const file_path = archived_logs[i]->PathName();
+    s = env_->DeleteFile(db_options_.wal_dir + "/" + file_path);
+    if (!s.ok()) {
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+          "Unable to delete file: %s: %s", file_path.c_str(),
+          s.ToString().c_str());
+      continue;
+    } else {
+      MutexLock l(&read_first_record_cache_mutex_);
+      read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+    }
+  }
+}
+
+void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
+  auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number);
+  // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
+  Status s = env_->RenameFile(fname, archived_log_name);
+  // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
+  Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+      "Move log file %s to %s -- %s\n", fname.c_str(),
+      archived_log_name.c_str(), s.ToString().c_str());
+}
+
+namespace {
+struct CompareLogByPointer {
+  bool operator()(const std::unique_ptr<LogFile>& a,
+                  const std::unique_ptr<LogFile>& b) {
+    LogFileImpl* a_impl = dynamic_cast<LogFileImpl*>(a.get());
+    LogFileImpl* b_impl = dynamic_cast<LogFileImpl*>(b.get());
+    return *a_impl < *b_impl;
+  }
+};
+}
+
+Status WalManager::GetSortedWalsOfType(const std::string& path,
+                                       VectorLogPtr& log_files,
+                                       WalFileType log_type) {
+  std::vector<std::string> all_files;
+  const Status status = env_->GetChildren(path, &all_files);
+  if (!status.ok()) {
+    return status;
+  }
+  log_files.reserve(all_files.size());
+  for (const auto& f : all_files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      SequenceNumber sequence;
+      Status s = ReadFirstRecord(log_type, number, &sequence);
+      if (!s.ok()) {
+        return s;
+      }
+      if (sequence == 0) {
+        // empty file
+        continue;
+      }
+
+      // Reproduce the race condition where a log file is moved
+      // to archived dir, between these two sync points, used in
+      // (DBTest,TransactionLogIteratorRace)
+      TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1");
+      TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2");
+
+      uint64_t size_bytes;
+      s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+      // re-try in case the alive log file has been moved to archive.
+      std::string archived_file = ArchivedLogFileName(path, number);
+      if (!s.ok() && log_type == kAliveLogFile &&
+          env_->FileExists(archived_file)) {
+        s = env_->GetFileSize(archived_file, &size_bytes);
+        if (!s.ok() && !env_->FileExists(archived_file)) {
+          // oops, the file just got deleted from archived dir! move on
+          s = Status::OK();
+          continue;
+        }
+      }
+      if (!s.ok()) {
+        return s;
+      }
+
+      log_files.push_back(std::move(std::unique_ptr<LogFile>(
+          new LogFileImpl(number, log_type, sequence, size_bytes))));
+    }
+  }
+  CompareLogByPointer compare_log_files;
+  std::sort(log_files.begin(), log_files.end(), compare_log_files);
+  return status;
+}
+
+Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                          const SequenceNumber target) {
+  int64_t start = 0;  // signed to avoid overflow when target is < first file.
+  int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+  // Binary Search. avoid opening all files.
+  while (end >= start) {
+    int64_t mid = start + (end - start) / 2;  // Avoid overflow.
+    SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence();
+    if (current_seq_num == target) {
+      end = mid;
+      break;
+    } else if (current_seq_num < target) {
+      start = mid + 1;
+    } else {
+      end = mid - 1;
+    }
+  }
+  // end could be -ve.
+  size_t start_index = std::max(static_cast<int64_t>(0), end);
+  // The last wal file is always included
+  all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+  return Status::OK();
+}
+
+Status WalManager::ReadFirstRecord(const WalFileType type,
+                                   const uint64_t number,
+                                   SequenceNumber* sequence) {
+  *sequence = 0;
+  if (type != kAliveLogFile && type != kArchivedLogFile) {
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "[WalManger] Unknown file type %s", ToString(type).c_str());
+    return Status::NotSupported(
+        "File Type Not Known " + ToString(type));
+  }
+  {
+    MutexLock l(&read_first_record_cache_mutex_);
+    auto itr = read_first_record_cache_.find(number);
+    if (itr != read_first_record_cache_.end()) {
+      *sequence = itr->second;
+      return Status::OK();
+    }
+  }
+  Status s;
+  if (type == kAliveLogFile) {
+    std::string fname = LogFileName(db_options_.wal_dir, number);
+    s = ReadFirstLine(fname, sequence);
+    if (env_->FileExists(fname) && !s.ok()) {
+      // return any error that is not caused by non-existing file
+      return s;
+    }
+  }
+
+  if (type == kArchivedLogFile || !s.ok()) {
+    //  check if the file got moved to archive.
+    std::string archived_file =
+        ArchivedLogFileName(db_options_.wal_dir, number);
+    s = ReadFirstLine(archived_file, sequence);
+    // maybe the file was deleted from archive dir. If that's the case, return
+    // Status::OK(). The caller with identify this as empty file because
+    // *sequence == 0
+    if (!s.ok() && !env_->FileExists(archived_file)) {
+      return Status::OK();
+    }
+  }
+
+  if (s.ok() && *sequence != 0) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.insert({number, *sequence});
+  }
+  return s;
+}
+
+// the function returns status.ok() and sequence == 0 if the file exists, but is
+// empty
+Status WalManager::ReadFirstLine(const std::string& fname,
+                                 SequenceNumber* sequence) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+
+    Status* status;
+    bool ignore_error;  // true if db_options_.paranoid_checks==false
+    virtual void Corruption(size_t bytes, const Status& s) override {
+      Log(InfoLogLevel::WARN_LEVEL, info_log,
+          "[WalManager] %s%s: dropping %d bytes; %s",
+          (this->ignore_error ? "(ignoring error) " : ""), fname,
+          static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status->ok()) {
+        // only keep the first error
+        *this->status = s;
+      }
+    }
+  };
+
+  std::unique_ptr<SequentialFile> file;
+  Status status = env_->NewSequentialFile(fname, &file, env_options_);
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = db_options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = &status;
+  reporter.ignore_error = !db_options_.paranoid_checks;
+  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+                     0 /*initial_offset*/);
+  std::string scratch;
+  Slice record;
+
+  if (reader.ReadRecord(&record, &scratch) &&
+      (status.ok() || !db_options_.paranoid_checks)) {
+    if (record.size() < 12) {
+      reporter.Corruption(record.size(),
+                          Status::Corruption("log record too small"));
+      // TODO read record's till the first no corrupt entry?
+    } else {
+      WriteBatch batch;
+      WriteBatchInternal::SetContents(&batch, record);
+      *sequence = WriteBatchInternal::Sequence(&batch);
+      return Status::OK();
+    }
+  }
+
+  // ReadRecord returns false on EOF, which means that the log file is empty. we
+  // return status.ok() in that case and set sequence number to 0
+  *sequence = 0;
+  return status;
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 0000000..fc04863
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,95 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+#include <memory>
+
+#include "port/port.h"
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/status.h"
+
+#include "db/version_set.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+class WalManager {
+ public:
+  WalManager(const DBOptions& db_options, const EnvOptions& env_options)
+      : db_options_(db_options),
+        env_options_(env_options),
+        env_(db_options.env),
+        purge_wal_files_last_run_(0) {}
+
+  Status GetSortedWalFiles(VectorLogPtr& files);
+
+  Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options,
+      VersionSet* version_set);
+
+  void PurgeObsoleteWALFiles();
+
+  void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+                              SequenceNumber* sequence) {
+    return ReadFirstRecord(type, number, sequence);
+  }
+
+  Status TEST_ReadFirstLine(const std::string& fname,
+                            SequenceNumber* sequence) {
+    return ReadFirstLine(fname, sequence);
+  }
+
+ private:
+  Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+                             WalFileType type);
+  // Requires: all_logs should be sorted with earliest log file first
+  // Retains all log files in all_logs which contain updates with seq no.
+  // Greater Than or Equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         SequenceNumber* sequence);
+
+  Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
+
+  // ------- state from DBImpl ------
+  const DBOptions& db_options_;
+  const EnvOptions& env_options_;
+  Env* env_;
+
+  // ------- WalManager state -------
+  // cache for ReadFirstRecord() calls
+  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+  port::Mutex read_first_record_cache_mutex_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  // obsolete files will be deleted every this seconds if ttl deletion is
+  // enabled and archive size_limit is disabled.
+  static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+};
+
+#endif  // ROCKSDB_LITE
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 0000000..325f0d9
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,289 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/write_batch.h"
+
+#include "db/wal_manager.h"
+#include "db/log_writer.h"
+#include "db/column_family.h"
+#include "db/version_set.h"
+#include "db/writebuffer.h"
+#include "util/mock_env.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "table/mock_table.h"
+#include "db/db_impl.h"
+
+namespace rocksdb {
+
+// TODO(icanadi) mock out VersionSet
+// TODO(icanadi) move other WalManager-specific tests from db_test here
+class WalManagerTest : public testing::Test {
+ public:
+  WalManagerTest()
+      : env_(new MockEnv(Env::Default())),
+        dbname_(test::TmpDir() + "/wal_manager_test"),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_(db_options_.db_write_buffer_size),
+        current_log_number_(0) {
+    DestroyDB(dbname_, Options());
+  }
+
+  void Init() {
+    ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+    ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    db_options_.wal_dir = dbname_;
+    db_options_.env = env_.get();
+
+    versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+                                   table_cache_.get(), &write_buffer_,
+                                   &write_controller_));
+
+    wal_manager_.reset(new WalManager(db_options_, env_options_));
+  }
+
+  void Reopen() {
+    wal_manager_.reset(new WalManager(db_options_, env_options_));
+  }
+
+  // NOT thread safe
+  void Put(const std::string& key, const std::string& value) {
+    assert(current_log_writer_.get() != nullptr);
+    uint64_t seq =  versions_->LastSequence() + 1;
+    WriteBatch batch;
+    batch.Put(key, value);
+    WriteBatchInternal::SetSequence(&batch, seq);
+    current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch));
+    versions_->SetLastSequence(seq);
+  }
+
+  // NOT thread safe
+  void RollTheLog(bool archived) {
+    current_log_number_++;
+    std::string fname = ArchivedLogFileName(dbname_, current_log_number_);
+    unique_ptr<WritableFile> file;
+    ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_));
+    current_log_writer_.reset(new log::Writer(std::move(file)));
+  }
+
+  void CreateArchiveLogs(int num_logs, int entries_per_log) {
+    for (int i = 1; i <= num_logs; ++i) {
+      RollTheLog(true);
+      for (int k = 0; k < entries_per_log; ++k) {
+        Put(ToString(k), std::string(1024, 'a'));
+      }
+    }
+  }
+
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    unique_ptr<TransactionLogIterator> iter;
+    Status status = wal_manager_->GetUpdatesSince(
+        seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get());
+    EXPECT_OK(status);
+    return std::move(iter);
+  }
+
+  std::unique_ptr<MockEnv> env_;
+  std::string dbname_;
+  WriteController write_controller_;
+  EnvOptions env_options_;
+  std::shared_ptr<Cache> table_cache_;
+  DBOptions db_options_;
+  WriteBuffer write_buffer_;
+  std::unique_ptr<VersionSet> versions_;
+  std::unique_ptr<WalManager> wal_manager_;
+
+  std::unique_ptr<log::Writer> current_log_writer_;
+  uint64_t current_log_number_;
+};
+
+TEST_F(WalManagerTest, ReadFirstRecordCache) {
+  Init();
+  std::string path = dbname_ + "/000001.log";
+  unique_ptr<WritableFile> file;
+  ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions()));
+
+  SequenceNumber s;
+  ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, &s));
+  ASSERT_EQ(s, 0U);
+
+  ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 0U);
+
+  log::Writer writer(std::move(file));
+  WriteBatch batch;
+  batch.Put("foo", "bar");
+  WriteBatchInternal::SetSequence(&batch, 10);
+  writer.AddRecord(WriteBatchInternal::Contents(&batch));
+
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here.
+  // Waiting for lei to finish with db_test
+  // env_->count_sequential_reads_ = true;
+  // sequential_read_counter_ sanity test
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+  ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // did a read
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+  ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // no new reads since the value is cached
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, Env* env) {
+  uint64_t dir_size = 0;
+  std::vector<std::string> files;
+  env->GetChildren(dir_path, &files);
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = dir_path + "/" + f;
+      uint64_t file_size;
+      env->GetFileSize(file_path, &file_size);
+      dir_size += file_size;
+    }
+  }
+  return dir_size;
+}
+std::vector<std::uint64_t> ListSpecificFiles(
+    Env* env, const std::string& path, const FileType expected_file_type) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> file_numbers;
+  env->GetChildren(path, &files);
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == expected_file_type) {
+        file_numbers.push_back(number);
+      }
+    }
+  }
+  return std::move(file_numbers);
+}
+
+int CountRecords(TransactionLogIterator* iter) {
+  int count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    EXPECT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    EXPECT_OK(iter->status());
+    iter->Next();
+  }
+  return count;
+}
+}  // namespace
+
+TEST_F(WalManagerTest, WALArchivalSizeLimit) {
+  db_options_.WAL_ttl_seconds = 0;
+  db_options_.WAL_size_limit_MB = 1000;
+  Init();
+
+  // TEST : Create WalManager with huge size limit and no ttl.
+  // Create some archived files and call PurgeObsoleteWALFiles().
+  // Count the archived log files that survived.
+  // Assert that all of them did.
+  // Change size limit. Re-open WalManager.
+  // Assert that archive is not greater than WAL_size_limit_MB after
+  // PurgeObsoleteWALFiles()
+  // Set ttl and time_to_check_ to small values. Re-open db.
+  // Assert that there are no archived logs left.
+
+  std::string archive_dir = ArchivalDirectory(dbname_);
+  CreateArchiveLogs(20, 5000);
+
+  std::vector<std::uint64_t> log_files =
+      ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_EQ(log_files.size(), 20U);
+
+  db_options_.WAL_size_limit_MB = 8;
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
+  ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024);
+
+  db_options_.WAL_ttl_seconds = 1;
+  env_->FakeSleepForMicroseconds(2 * 1000 * 1000);
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, WALArchivalTtl) {
+  db_options_.WAL_ttl_seconds = 1000;
+  Init();
+
+  // TEST : Create WalManager with a ttl and no size limit.
+  // Create some archived log files and call PurgeObsoleteWALFiles().
+  // Assert that files are not deleted
+  // Reopen db with small ttl.
+  // Assert that all archived logs was removed.
+
+  std::string archive_dir = ArchivalDirectory(dbname_);
+  CreateArchiveLogs(20, 5000);
+
+  std::vector<uint64_t> log_files =
+      ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_GT(log_files.size(), 0U);
+
+  db_options_.WAL_ttl_seconds = 1;
+  env_->FakeSleepForMicroseconds(3 * 1000 * 1000);
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
+  Init();
+  RollTheLog(false);
+  Put("key1", std::string(1024, 'a'));
+  // Create a zero record WAL file.
+  RollTheLog(false);
+  RollTheLog(false);
+
+  Put("key2", std::string(1024, 'a'));
+
+  auto iter = OpenTransactionLogIter(0);
+  ASSERT_EQ(2, CountRecords(iter.get()));
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
+  Init();
+  RollTheLog(false);
+  auto iter = OpenTransactionLogIter(0);
+  // Check that an empty iterator is returned
+  ASSERT_TRUE(!iter->Valid());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
index 734d1e3..52956f8 100644
--- a/src/rocksdb/db/write_batch.cc
+++ b/src/rocksdb/db/write_batch.cc
@@ -23,16 +23,17 @@
 //    data: uint8[len]
 
 #include "rocksdb/write_batch.h"
-#include "rocksdb/options.h"
 #include "rocksdb/merge_operator.h"
 #include "db/dbformat.h"
 #include "db/db_impl.h"
+#include "db/column_family.h"
 #include "db/memtable.h"
 #include "db/snapshot.h"
 #include "db/write_batch_internal.h"
 #include "util/coding.h"
 #include "util/statistics.h"
 #include <stdexcept>
+#include "util/perf_context_imp.h"
 
 namespace rocksdb {
 
@@ -48,20 +49,6 @@ WriteBatch::~WriteBatch() { }
 
 WriteBatch::Handler::~Handler() { }
 
-void WriteBatch::Handler::Put(const Slice& key, const Slice& value) {
-  // you need to either implement Put or PutCF
-  throw std::runtime_error("Handler::Put not implemented!");
-}
-
-void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
-  throw std::runtime_error("Handler::Merge not implemented!");
-}
-
-void WriteBatch::Handler::Delete(const Slice& key) {
-  // you need to either implement Delete or DeleteCF
-  throw std::runtime_error("Handler::Delete not implemented!");
-}
-
 void WriteBatch::Handler::LogData(const Slice& blob) {
   // If the user has not specified something to do with blobs, then we ignore
   // them.
@@ -80,6 +67,58 @@ int WriteBatch::Count() const {
   return WriteBatchInternal::Count(this);
 }
 
+Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                uint32_t* column_family, Slice* key,
+                                Slice* value, Slice* blob) {
+  assert(key != nullptr && value != nullptr);
+  *tag = (*input)[0];
+  input->remove_prefix(1);
+  *column_family = 0;  // default
+  switch (*tag) {
+    case kTypeColumnFamilyValue:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Put");
+      }
+    // intentional fallthrough
+    case kTypeValue:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch Put");
+      }
+      break;
+    case kTypeColumnFamilyDeletion:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Delete");
+      }
+    // intentional fallthrough
+    case kTypeDeletion:
+      if (!GetLengthPrefixedSlice(input, key)) {
+        return Status::Corruption("bad WriteBatch Delete");
+      }
+      break;
+    case kTypeColumnFamilyMerge:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Merge");
+      }
+    // intentional fallthrough
+    case kTypeMerge:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch Merge");
+      }
+      break;
+    case kTypeLogData:
+      assert(blob != nullptr);
+      if (!GetLengthPrefixedSlice(input, blob)) {
+        return Status::Corruption("bad WriteBatch Blob");
+      }
+      break;
+    default:
+      return Status::Corruption("unknown WriteBatch tag");
+  }
+  return Status::OK();
+}
+
 Status WriteBatch::Iterate(Handler* handler) const {
   Slice input(rep_);
   if (input.size() < kHeader) {
@@ -91,57 +130,33 @@ Status WriteBatch::Iterate(Handler* handler) const {
   int found = 0;
   Status s;
   while (s.ok() && !input.empty() && handler->Continue()) {
-    char tag = input[0];
-    input.remove_prefix(1);
+    char tag = 0;
     uint32_t column_family = 0;  // default
+
+    s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+                                 &blob);
+    if (!s.ok()) {
+      return s;
+    }
+
     switch (tag) {
       case kTypeColumnFamilyValue:
-        if (!GetVarint32(&input, &column_family)) {
-          return Status::Corruption("bad WriteBatch Put");
-        }
-      // intentional fallthrough
       case kTypeValue:
-        if (GetLengthPrefixedSlice(&input, &key) &&
-            GetLengthPrefixedSlice(&input, &value)) {
-          s = handler->PutCF(column_family, key, value);
-          found++;
-        } else {
-          return Status::Corruption("bad WriteBatch Put");
-        }
+        s = handler->PutCF(column_family, key, value);
+        found++;
         break;
       case kTypeColumnFamilyDeletion:
-        if (!GetVarint32(&input, &column_family)) {
-          return Status::Corruption("bad WriteBatch Delete");
-        }
-      // intentional fallthrough
       case kTypeDeletion:
-        if (GetLengthPrefixedSlice(&input, &key)) {
-          s = handler->DeleteCF(column_family, key);
-          found++;
-        } else {
-          return Status::Corruption("bad WriteBatch Delete");
-        }
+        s = handler->DeleteCF(column_family, key);
+        found++;
         break;
       case kTypeColumnFamilyMerge:
-        if (!GetVarint32(&input, &column_family)) {
-          return Status::Corruption("bad WriteBatch Merge");
-        }
-      // intentional fallthrough
       case kTypeMerge:
-        if (GetLengthPrefixedSlice(&input, &key) &&
-            GetLengthPrefixedSlice(&input, &value)) {
-          s = handler->MergeCF(column_family, key, value);
-          found++;
-        } else {
-          return Status::Corruption("bad WriteBatch Merge");
-        }
+        s = handler->MergeCF(column_family, key, value);
+        found++;
         break;
       case kTypeLogData:
-        if (GetLengthPrefixedSlice(&input, &blob)) {
-          handler->LogData(blob);
-        } else {
-          return Status::Corruption("bad WriteBatch Blob");
-        }
+        handler->LogData(blob);
         break;
       default:
         return Status::Corruption("unknown WriteBatch tag");
@@ -186,17 +201,6 @@ void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
   PutLengthPrefixedSlice(&b->rep_, value);
 }
 
-namespace {
-inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
-  uint32_t column_family_id = 0;
-  if (column_family != nullptr) {
-    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
-    column_family_id = cfh->GetID();
-  }
-  return column_family_id;
-}
-}  // namespace
-
 void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
                      const Slice& value) {
   WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value);
@@ -236,6 +240,23 @@ void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
   WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), key);
 }
 
+void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+                                const SliceParts& key) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+}
+
+void WriteBatch::Delete(ColumnFamilyHandle* column_family,
+                        const SliceParts& key) {
+  WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), key);
+}
+
 void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
                                const Slice& key, const Slice& value) {
   WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
@@ -260,21 +281,23 @@ void WriteBatch::PutLogData(const Slice& blob) {
 }
 
 namespace {
+// This class can *only* be used from a single-threaded write thread, because it
+// calls ColumnFamilyMemTablesImpl::Seek()
 class MemTableInserter : public WriteBatch::Handler {
  public:
   SequenceNumber sequence_;
   ColumnFamilyMemTables* cf_mems_;
-  bool recovery_;
+  bool ignore_missing_column_families_;
   uint64_t log_number_;
   DBImpl* db_;
   const bool dont_filter_deletes_;
 
   MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
-                   bool recovery, uint64_t log_number, DB* db,
-                   const bool dont_filter_deletes)
+                   bool ignore_missing_column_families, uint64_t log_number,
+                   DB* db, const bool dont_filter_deletes)
       : sequence_(sequence),
         cf_mems_(cf_mems),
-        recovery_(recovery),
+        ignore_missing_column_families_(ignore_missing_column_families),
         log_number_(log_number),
         db_(reinterpret_cast<DBImpl*>(db)),
         dont_filter_deletes_(dont_filter_deletes) {
@@ -285,13 +308,21 @@ class MemTableInserter : public WriteBatch::Handler {
   }
 
   bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+    // We are only allowed to call this from a single-threaded write thread
+    // (or while holding DB mutex)
     bool found = cf_mems_->Seek(column_family_id);
-    if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) {
-      // if in recovery envoronment:
-      // * If column family was not found, it might mean that the WAL write
-      // batch references to the column family that was dropped after the
-      // insert. We don't want to fail the whole write batch in that case -- we
-      // just ignore the update.
+    if (!found) {
+      if (ignore_missing_column_families_) {
+        *s = Status::OK();
+      } else {
+        *s = Status::InvalidArgument(
+            "Invalid column family specified in write batch");
+      }
+      return false;
+    }
+    if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) {
+      // This is true only in recovery environment (log_number_ is always 0 in
+      // non-recovery, regular write code-path)
       // * If log_number_ < cf_mems_->GetLogNumber(), this means that column
       // family already contains updates from this log. We can't apply updates
       // twice because of update-in-place or merge workloads -- ignore the
@@ -299,34 +330,24 @@ class MemTableInserter : public WriteBatch::Handler {
       *s = Status::OK();
       return false;
     }
-    if (!found) {
-      assert(!recovery_);
-      // If the column family was not found in non-recovery enviornment
-      // (client's write code-path), we have to fail the write and return
-      // the failure status to the client.
-      *s = Status::InvalidArgument(
-          "Invalid column family specified in write batch");
-      return false;
-    }
     return true;
   }
-
   virtual Status PutCF(uint32_t column_family_id, const Slice& key,
-                       const Slice& value) {
+                       const Slice& value) override {
     Status seek_status;
     if (!SeekToColumnFamily(column_family_id, &seek_status)) {
       ++sequence_;
       return seek_status;
     }
     MemTable* mem = cf_mems_->GetMemTable();
-    const Options* options = cf_mems_->GetOptions();
-    if (!options->inplace_update_support) {
+    auto* moptions = mem->GetMemTableOptions();
+    if (!moptions->inplace_update_support) {
       mem->Add(sequence_, kTypeValue, key, value);
-    } else if (options->inplace_callback == nullptr) {
+    } else if (moptions->inplace_callback == nullptr) {
       mem->Update(sequence_, key, value);
-      RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
+      RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED);
     } else {
-      if (mem->UpdateCallback(sequence_, key, value, *options)) {
+      if (mem->UpdateCallback(sequence_, key, value)) {
       } else {
         // key not found in memtable. Do sst get, update, add
         SnapshotImpl read_from_snapshot;
@@ -344,18 +365,18 @@ class MemTableInserter : public WriteBatch::Handler {
         Status s = db_->Get(ropts, cf_handle, key, &prev_value);
 
         char* prev_buffer = const_cast<char*>(prev_value.c_str());
-        uint32_t prev_size = prev_value.size();
-        auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr,
-                                                s.ok() ? &prev_size : nullptr,
-                                                value, &merged_value);
+        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+        auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
+                                                 s.ok() ? &prev_size : nullptr,
+                                                 value, &merged_value);
         if (status == UpdateStatus::UPDATED_INPLACE) {
           // prev_value is updated in-place with final value.
           mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
-          RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
+          RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
         } else if (status == UpdateStatus::UPDATED) {
           // merged_value contains the final value.
           mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
-          RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
+          RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
         }
       }
     }
@@ -363,28 +384,29 @@ class MemTableInserter : public WriteBatch::Handler {
     // sequence number. Even if the update eventually fails and does not result
     // in memtable add/update.
     sequence_++;
+    cf_mems_->CheckMemtableFull();
     return Status::OK();
   }
 
   virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
-                         const Slice& value) {
+                         const Slice& value) override {
     Status seek_status;
     if (!SeekToColumnFamily(column_family_id, &seek_status)) {
       ++sequence_;
       return seek_status;
     }
     MemTable* mem = cf_mems_->GetMemTable();
-    const Options* options = cf_mems_->GetOptions();
+    auto* moptions = mem->GetMemTableOptions();
     bool perform_merge = false;
 
-    if (options->max_successive_merges > 0 && db_ != nullptr) {
+    if (moptions->max_successive_merges > 0 && db_ != nullptr) {
       LookupKey lkey(key, sequence_);
 
       // Count the number of successive merges at the head
       // of the key in the memtable
       size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
 
-      if (num_merges >= options->max_successive_merges) {
+      if (num_merges >= moptions->max_successive_merges) {
         perform_merge = true;
       }
     }
@@ -408,19 +430,28 @@ class MemTableInserter : public WriteBatch::Handler {
       Slice get_value_slice = Slice(get_value);
 
       // 2) Apply this merge
-      auto merge_operator = options->merge_operator.get();
+      auto merge_operator = moptions->merge_operator;
       assert(merge_operator);
 
       std::deque<std::string> operands;
       operands.push_front(value.ToString());
       std::string new_value;
-      if (!merge_operator->FullMerge(key, &get_value_slice, operands,
-                                     &new_value, options->info_log.get())) {
+      bool merge_success = false;
+      {
+        StopWatchNano timer(Env::Default(), moptions->statistics != nullptr);
+        PERF_TIMER_GUARD(merge_operator_time_nanos);
+        merge_success = merge_operator->FullMerge(
+            key, &get_value_slice, operands, &new_value, moptions->info_log);
+        RecordTick(moptions->statistics, MERGE_OPERATION_TOTAL_TIME,
+                   timer.ElapsedNanos());
+      }
+
+      if (!merge_success) {
           // Failed to merge!
-        RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES);
+        RecordTick(moptions->statistics, NUMBER_MERGE_FAILURES);
 
-          // Store the delta in memtable
-          perform_merge = false;
+        // Store the delta in memtable
+        perform_merge = false;
       } else {
         // 3) Add value to memtable
         mem->Add(sequence_, kTypeValue, key, new_value);
@@ -433,18 +464,20 @@ class MemTableInserter : public WriteBatch::Handler {
     }
 
     sequence_++;
+    cf_mems_->CheckMemtableFull();
     return Status::OK();
   }
 
-  virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+  virtual Status DeleteCF(uint32_t column_family_id,
+                          const Slice& key) override {
     Status seek_status;
     if (!SeekToColumnFamily(column_family_id, &seek_status)) {
       ++sequence_;
       return seek_status;
     }
     MemTable* mem = cf_mems_->GetMemTable();
-    const Options* options = cf_mems_->GetOptions();
-    if (!dont_filter_deletes_ && options->filter_deletes) {
+    auto* moptions = mem->GetMemTableOptions();
+    if (!dont_filter_deletes_ && moptions->filter_deletes) {
       SnapshotImpl read_from_snapshot;
       read_from_snapshot.number_ = sequence_;
       ReadOptions ropts;
@@ -455,23 +488,31 @@ class MemTableInserter : public WriteBatch::Handler {
         cf_handle = db_->DefaultColumnFamily();
       }
       if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
-        RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES);
+        RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES);
         return Status::OK();
       }
     }
     mem->Add(sequence_, kTypeDeletion, key, Slice());
     sequence_++;
+    cf_mems_->CheckMemtableFull();
     return Status::OK();
   }
 };
 }  // namespace
 
+// This function can only be called in these conditions:
+// 1) During Recovery()
+// 2) during Write(), in a single-threaded write thread
+// The reason is that it calles ColumnFamilyMemTablesImpl::Seek(), which needs
+// to be called from a single-threaded write thread (or while holding DB mutex)
 Status WriteBatchInternal::InsertInto(const WriteBatch* b,
                                       ColumnFamilyMemTables* memtables,
-                                      bool recovery, uint64_t log_number,
-                                      DB* db, const bool dont_filter_deletes) {
+                                      bool ignore_missing_column_families,
+                                      uint64_t log_number, DB* db,
+                                      const bool dont_filter_deletes) {
   MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
-                            recovery, log_number, db, dont_filter_deletes);
+                            ignore_missing_column_families, log_number, db,
+                            dont_filter_deletes);
   return b->Iterate(&inserter);
 }
 
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 0000000..5e3f5f0
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,46 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+// Simple implementation of SlicePart variants of Put().  Child classes
+// can override these methods with more performant solutions if they choose.
+void WriteBatchBase::Put(ColumnFamilyHandle* column_family,
+                         const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  Put(column_family, key_slice, value_slice);
+}
+
+void WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  Put(key_slice, value_slice);
+}
+
+void WriteBatchBase::Delete(ColumnFamilyHandle* column_family,
+                            const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  Delete(column_family, key_slice);
+}
+
+void WriteBatchBase::Delete(const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  Delete(key_slice);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
index 85e85b3..793c0d4 100644
--- a/src/rocksdb/db/write_batch_internal.h
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -26,14 +26,14 @@ class ColumnFamilyMemTables {
   // been processed)
   virtual uint64_t GetLogNumber() const = 0;
   virtual MemTable* GetMemTable() const = 0;
-  virtual const Options* GetOptions() const = 0;
   virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+  virtual void CheckMemtableFull() = 0;
 };
 
 class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
  public:
-  ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options)
-      : ok_(false), mem_(mem), options_(options) {}
+  explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+      : ok_(false), mem_(mem) {}
 
   bool Seek(uint32_t column_family_id) override {
     ok_ = (column_family_id == 0);
@@ -47,17 +47,13 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
     return mem_;
   }
 
-  const Options* GetOptions() const override {
-    assert(ok_);
-    return options_;
-  }
-
   ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
 
+  void CheckMemtableFull() override {}
+
  private:
   bool ok_;
   MemTable* mem_;
-  const Options* const options_;
 };
 
 // WriteBatchInternal provides static methods for manipulating a
@@ -72,6 +68,9 @@ class WriteBatchInternal {
                   const SliceParts& key, const SliceParts& value);
 
   static void Delete(WriteBatch* batch, uint32_t column_family_id,
+                     const SliceParts& key);
+
+  static void Delete(WriteBatch* batch, uint32_t column_family_id,
                      const Slice& key);
 
   static void Merge(WriteBatch* batch, uint32_t column_family_id,
@@ -103,18 +102,18 @@ class WriteBatchInternal {
   // Inserts batch entries into memtable
   // If dont_filter_deletes is false AND options.filter_deletes is true,
   // then --> Drops deletes in batch if db->KeyMayExist returns false
-  // If recovery == true, this means InsertInto is executed on a recovery
-  // code-path. WriteBatch referencing a dropped column family can be
-  // found on a recovery code-path and should be ignored (recovery should not
-  // fail). Additionally, the memtable will be updated only if
+  // If ignore_missing_column_families == true. WriteBatch referencing
+  // non-existing column family should be ignored.
+  // However, if ignore_missing_column_families == false, any WriteBatch
+  // referencing non-existing column family will return a InvalidArgument()
+  // failure.
+  //
+  // If log_number is non-zero, the memtable will be updated only if
   // memtables->GetLogNumber() >= log_number
-  // However, if recovery == false, any WriteBatch referencing
-  // non-existing column family will return a failure. Also, log_number is
-  // ignored in that case
   static Status InsertInto(const WriteBatch* batch,
                            ColumnFamilyMemTables* memtables,
-                           bool recovery = false, uint64_t log_number = 0,
-                           DB* db = nullptr,
+                           bool ignore_missing_column_families = false,
+                           uint64_t log_number = 0, DB* db = nullptr,
                            const bool dont_filter_deletes = true);
 
   static void Append(WriteBatch* dst, const WriteBatch* src);
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
index febd35c..649fb89 100644
--- a/src/rocksdb/db/write_batch_test.cc
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -13,10 +13,14 @@
 #include "db/memtable.h"
 #include "db/column_family.h"
 #include "db/write_batch_internal.h"
+#include "db/writebuffer.h"
 #include "rocksdb/env.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
 #include "util/logging.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
+#include "util/scoped_arena_iterator.h"
 
 namespace rocksdb {
 
@@ -25,17 +29,21 @@ static std::string PrintContents(WriteBatch* b) {
   auto factory = std::make_shared<SkipListFactory>();
   Options options;
   options.memtable_factory = factory;
-  MemTable* mem = new MemTable(cmp, options);
+  ImmutableCFOptions ioptions(options);
+  WriteBuffer wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions,
+                               MutableCFOptions(options, ioptions), &wb);
   mem->Ref();
   std::string state;
-  ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
+  ColumnFamilyMemTablesDefault cf_mems_default(mem);
   Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default);
   int count = 0;
-  Iterator* iter = mem->NewIterator(ReadOptions());
+  Arena arena;
+  ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena));
   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
     ParsedInternalKey ikey;
     memset((void *)&ikey, 0, sizeof(ikey));
-    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+    EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey));
     switch (ikey.type) {
       case kTypeValue:
         state.append("Put(");
@@ -66,7 +74,6 @@ static std::string PrintContents(WriteBatch* b) {
     state.append("@");
     state.append(NumberToString(ikey.sequence));
   }
-  delete iter;
   if (!s.ok()) {
     state.append(s.ToString());
   } else if (count != WriteBatchInternal::Count(b)) {
@@ -76,16 +83,16 @@ static std::string PrintContents(WriteBatch* b) {
   return state;
 }
 
-class WriteBatchTest { };
+class WriteBatchTest : public testing::Test {};
 
-TEST(WriteBatchTest, Empty) {
+TEST_F(WriteBatchTest, Empty) {
   WriteBatch batch;
   ASSERT_EQ("", PrintContents(&batch));
   ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
   ASSERT_EQ(0, batch.Count());
 }
 
-TEST(WriteBatchTest, Multiple) {
+TEST_F(WriteBatchTest, Multiple) {
   WriteBatch batch;
   batch.Put(Slice("foo"), Slice("bar"));
   batch.Delete(Slice("box"));
@@ -100,7 +107,7 @@ TEST(WriteBatchTest, Multiple) {
   ASSERT_EQ(3, batch.Count());
 }
 
-TEST(WriteBatchTest, Corruption) {
+TEST_F(WriteBatchTest, Corruption) {
   WriteBatch batch;
   batch.Put(Slice("foo"), Slice("bar"));
   batch.Delete(Slice("box"));
@@ -113,7 +120,7 @@ TEST(WriteBatchTest, Corruption) {
             PrintContents(&batch));
 }
 
-TEST(WriteBatchTest, Append) {
+TEST_F(WriteBatchTest, Append) {
   WriteBatch b1, b2;
   WriteBatchInternal::SetSequence(&b1, 200);
   WriteBatchInternal::SetSequence(&b2, 300);
@@ -147,33 +154,34 @@ namespace {
   struct TestHandler : public WriteBatch::Handler {
     std::string seen;
     virtual Status PutCF(uint32_t column_family_id, const Slice& key,
-                       const Slice& value) {
+                         const Slice& value) override {
       if (column_family_id == 0) {
         seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
       } else {
-        seen += "PutCF(" + std::to_string(column_family_id) + ", " +
+        seen += "PutCF(" + ToString(column_family_id) + ", " +
                 key.ToString() + ", " + value.ToString() + ")";
       }
       return Status::OK();
     }
     virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
-                         const Slice& value) {
+                           const Slice& value) override {
       if (column_family_id == 0) {
         seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
       } else {
-        seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
+        seen += "MergeCF(" + ToString(column_family_id) + ", " +
                 key.ToString() + ", " + value.ToString() + ")";
       }
       return Status::OK();
     }
-    virtual void LogData(const Slice& blob) {
+    virtual void LogData(const Slice& blob) override {
       seen += "LogData(" + blob.ToString() + ")";
     }
-    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+    virtual Status DeleteCF(uint32_t column_family_id,
+                            const Slice& key) override {
       if (column_family_id == 0) {
         seen += "Delete(" + key.ToString() + ")";
       } else {
-        seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
+        seen += "DeleteCF(" + ToString(column_family_id) + ", " +
                 key.ToString() + ")";
       }
       return Status::OK();
@@ -181,7 +189,40 @@ namespace {
   };
 }
 
-TEST(WriteBatchTest, Blob) {
+TEST_F(WriteBatchTest, MergeNotImplemented) {
+  WriteBatch batch;
+  batch.Merge(Slice("foo"), Slice("bar"));
+  ASSERT_EQ(1, batch.Count());
+  ASSERT_EQ("Merge(foo, bar)@0",
+            PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, PutNotImplemented) {
+  WriteBatch batch;
+  batch.Put(Slice("k1"), Slice("v1"));
+  ASSERT_EQ(1, batch.Count());
+  ASSERT_EQ("Put(k1, v1)@0",
+            PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, DeleteNotImplemented) {
+  WriteBatch batch;
+  batch.Delete(Slice("k2"));
+  ASSERT_EQ(1, batch.Count());
+  ASSERT_EQ("Delete(k2)@0",
+            PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, Blob) {
   WriteBatch batch;
   batch.Put(Slice("k1"), Slice("v1"));
   batch.Put(Slice("k2"), Slice("v2"));
@@ -211,26 +252,27 @@ TEST(WriteBatchTest, Blob) {
             handler.seen);
 }
 
-TEST(WriteBatchTest, Continue) {
+TEST_F(WriteBatchTest, Continue) {
   WriteBatch batch;
 
   struct Handler : public TestHandler {
     int num_seen = 0;
     virtual Status PutCF(uint32_t column_family_id, const Slice& key,
-                       const Slice& value) {
+                         const Slice& value) override {
       ++num_seen;
       return TestHandler::PutCF(column_family_id, key, value);
     }
     virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
-                         const Slice& value) {
+                           const Slice& value) override {
       ++num_seen;
       return TestHandler::MergeCF(column_family_id, key, value);
     }
-    virtual void LogData(const Slice& blob) {
+    virtual void LogData(const Slice& blob) override {
       ++num_seen;
       TestHandler::LogData(blob);
     }
-    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+    virtual Status DeleteCF(uint32_t column_family_id,
+                            const Slice& key) override {
       ++num_seen;
       return TestHandler::DeleteCF(column_family_id, key);
     }
@@ -252,7 +294,7 @@ TEST(WriteBatchTest, Continue) {
             handler.seen);
 }
 
-TEST(WriteBatchTest, PutGatherSlices) {
+TEST_F(WriteBatchTest, PutGatherSlices) {
   WriteBatch batch;
   batch.Put(Slice("foo"), Slice("bar"));
 
@@ -286,13 +328,16 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
   explicit ColumnFamilyHandleImplDummy(int id)
       : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
   uint32_t GetID() const override { return id_; }
+  const Comparator* user_comparator() const override {
+    return BytewiseComparator();
+  }
 
  private:
   uint32_t id_;
 };
 }  // namespace anonymous
 
-TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
+TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
   WriteBatch batch;
   ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
   batch.Put(&zero, Slice("foo"), Slice("bar"));
@@ -316,8 +361,91 @@ TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
       handler.seen);
 }
 
+TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+  WriteBatchWithIndex batch;
+  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+  batch.Put(&zero, Slice("foo"), Slice("bar"));
+  batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+  batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+  batch.Delete(&eight, Slice("eightfoo"));
+  batch.Merge(&three, Slice("threethree"), Slice("3three"));
+  batch.Put(&zero, Slice("foo"), Slice("bar"));
+  batch.Merge(Slice("omom"), Slice("nom"));
+
+  std::unique_ptr<WBWIIterator> iter;
+
+  iter.reset(batch.NewIterator(&eight));
+  iter->Seek("eightfoo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  iter.reset(batch.NewIterator());
+  iter->Seek("gggg");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+  ASSERT_EQ("omom", iter->Entry().key.ToString());
+  ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  iter.reset(batch.NewIterator(&zero));
+  iter->Seek("foo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("foo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("foo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+  ASSERT_EQ("omom", iter->Entry().key.ToString());
+  ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  TestHandler handler;
+  batch.GetWriteBatch()->Iterate(&handler);
+  ASSERT_EQ(
+      "Put(foo, bar)"
+      "PutCF(2, twofoo, bar2)"
+      "PutCF(8, eightfoo, bar8)"
+      "DeleteCF(8, eightfoo)"
+      "MergeCF(3, threethree, 3three)"
+      "Put(foo, bar)"
+      "Merge(omom, nom)",
+      handler.seen);
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 0000000..bb6f8ec
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,37 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/write_controller.h"
+
+#include <cassert>
+
+namespace rocksdb {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+  ++total_stopped_;
+  return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+    uint64_t delay_us) {
+  total_delay_us_ += delay_us;
+  return std::unique_ptr<WriteControllerToken>(
+      new DelayWriteToken(this, delay_us));
+}
+
+bool WriteController::IsStopped() const { return total_stopped_ > 0; }
+uint64_t WriteController::GetDelay() const { return total_delay_us_; }
+
+StopWriteToken::~StopWriteToken() {
+  assert(controller_->total_stopped_ >= 1);
+  --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+  assert(controller_->total_delay_us_ >= delay_us_);
+  controller_->total_delay_us_ -= delay_us_;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 0000000..32e1d58
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+
+namespace rocksdb {
+
+class WriteControllerToken;
+
+// WriteController is controlling write stalls in our write code-path. Write
+// stalls happen when compaction can't keep up with write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding DB mutex
+class WriteController {
+ public:
+  WriteController() : total_stopped_(0), total_delay_us_(0) {}
+  ~WriteController() = default;
+
+  // When an actor (column family) requests a stop token, all writes will be
+  // stopped until the stop token is released (deleted)
+  std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, total delay for all
+  // writes will be increased by delay_us. The delay will last until delay token
+  // is released
+  std::unique_ptr<WriteControllerToken> GetDelayToken(uint64_t delay_us);
+
+  // these two methods query the state of the WriteController
+  bool IsStopped() const;
+  uint64_t GetDelay() const;
+
+ private:
+  friend class WriteControllerToken;
+  friend class StopWriteToken;
+  friend class DelayWriteToken;
+
+  int total_stopped_;
+  uint64_t total_delay_us_;
+};
+
+class WriteControllerToken {
+ public:
+  explicit WriteControllerToken(WriteController* controller)
+      : controller_(controller) {}
+  virtual ~WriteControllerToken() {}
+
+ protected:
+  WriteController* controller_;
+
+ private:
+  // no copying allowed
+  WriteControllerToken(const WriteControllerToken&) = delete;
+  void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+  explicit StopWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+  DelayWriteToken(WriteController* controller, uint64_t delay_us)
+      : WriteControllerToken(controller), delay_us_(delay_us) {}
+  virtual ~DelayWriteToken();
+
+ private:
+  uint64_t delay_us_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 0000000..41f8313
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,43 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/write_controller.h"
+
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class WriteControllerTest : public testing::Test {};
+
+TEST_F(WriteControllerTest, SanityTest) {
+  WriteController controller;
+  auto stop_token_1 = controller.GetStopToken();
+  auto stop_token_2 = controller.GetStopToken();
+
+  ASSERT_TRUE(controller.IsStopped());
+  stop_token_1.reset();
+  ASSERT_TRUE(controller.IsStopped());
+  stop_token_2.reset();
+  ASSERT_FALSE(controller.IsStopped());
+
+  auto delay_token_1 = controller.GetDelayToken(5);
+  ASSERT_EQ(static_cast<uint64_t>(5), controller.GetDelay());
+  auto delay_token_2 = controller.GetDelayToken(8);
+  ASSERT_EQ(static_cast<uint64_t>(13), controller.GetDelay());
+
+  delay_token_2.reset();
+  ASSERT_EQ(static_cast<uint64_t>(5), controller.GetDelay());
+  delay_token_1.reset();
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay());
+  delay_token_1.reset();
+  ASSERT_FALSE(controller.IsStopped());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 0000000..052e120
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,147 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/write_thread.h"
+
+namespace rocksdb {
+
+Status WriteThread::EnterWriteThread(WriteThread::Writer* w,
+                                     uint64_t expiration_time) {
+  // the following code block pushes the current writer "w" into the writer
+  // queue "writers_" and wait until one of the following conditions met:
+  // 1. the job of "w" has been done by some other writers.
+  // 2. "w" becomes the first writer in "writers_"
+  // 3. "w" timed-out.
+  writers_.push_back(w);
+
+  bool timed_out = false;
+  while (!w->done && w != writers_.front()) {
+    if (expiration_time == 0) {
+      w->cv.Wait();
+    } else if (w->cv.TimedWait(expiration_time)) {
+      if (w->in_batch_group) {
+        // then it means the front writer is currently doing the
+        // write on behalf of this "timed-out" writer.  Then it
+        // should wait until the write completes.
+        expiration_time = 0;
+      } else {
+        timed_out = true;
+        break;
+      }
+    }
+  }
+
+  if (timed_out) {
+#ifndef NDEBUG
+    bool found = false;
+#endif
+    for (auto iter = writers_.begin(); iter != writers_.end(); iter++) {
+      if (*iter == w) {
+        writers_.erase(iter);
+#ifndef NDEBUG
+        found = true;
+#endif
+        break;
+      }
+    }
+#ifndef NDEBUG
+    assert(found);
+#endif
+    // writers_.front() might still be in cond_wait without a time-out.
+    // As a result, we need to signal it to wake it up.  Otherwise no
+    // one else will wake it up, and RocksDB will hang.
+    if (!writers_.empty()) {
+      writers_.front()->cv.Signal();
+    }
+    return Status::TimedOut();
+  }
+  return Status::OK();
+}
+
+void WriteThread::ExitWriteThread(WriteThread::Writer* w,
+                                  WriteThread::Writer* last_writer,
+                                  Status status) {
+  // Pop out the current writer and all writers being pushed before the
+  // current writer from the writer queue.
+  while (!writers_.empty()) {
+    Writer* ready = writers_.front();
+    writers_.pop_front();
+    if (ready != w) {
+      ready->status = status;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+
+  // Notify new head of write queue
+  if (!writers_.empty()) {
+    writers_.front()->cv.Signal();
+  }
+}
+
+// This function will be called only when the first writer succeeds.
+// All writers in the to-be-built batch group will be processed.
+//
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-nullptr batch
+void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer,
+                                  autovector<WriteBatch*>* write_batch_group) {
+  assert(!writers_.empty());
+  Writer* first = writers_.front();
+  assert(first->batch != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(first->batch);
+  write_batch_group->push_back(first->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = 1 << 20;
+  if (size <= (128<<10)) {
+    max_size = size + (128<<10);
+  }
+
+  *last_writer = first;
+  std::deque<Writer*>::iterator iter = writers_.begin();
+  ++iter;  // Advance past "first"
+  for (; iter != writers_.end(); ++iter) {
+    Writer* w = *iter;
+    if (w->sync && !first->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (!w->disableWAL && first->disableWAL) {
+      // Do not include a write that needs WAL into a batch that has
+      // WAL disabled.
+      break;
+    }
+
+    if (w->timeout_hint_us < first->timeout_hint_us) {
+      // Do not include those writes with shorter timeout.  Otherwise, we might
+      // execute a write that should instead be aborted because of timeout.
+      break;
+    }
+
+    if (w->batch == nullptr) {
+      // Do not include those writes with nullptr batch. Those are not writes,
+      // those are something else. They want to be alone
+      break;
+    }
+
+    size += WriteBatchInternal::ByteSize(w->batch);
+    if (size > max_size) {
+      // Do not make batch too big
+      break;
+    }
+
+    write_batch_group->push_back(w->batch);
+    w->in_batch_group = true;
+    *last_writer = w;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 0000000..db35202
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,81 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stdint.h>
+#include <deque>
+#include <limits>
+#include "rocksdb/status.h"
+#include "db/write_batch_internal.h"
+#include "util/autovector.h"
+#include "port/port.h"
+#include "util/instrumented_mutex.h"
+
+namespace rocksdb {
+
+class WriteThread {
+ public:
+  static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
+  // Information kept for every waiting writer
+  struct Writer {
+    Status status;
+    WriteBatch* batch;
+    bool sync;
+    bool disableWAL;
+    bool in_batch_group;
+    bool done;
+    uint64_t timeout_hint_us;
+    InstrumentedCondVar cv;
+
+    explicit Writer(InstrumentedMutex* mu)
+        : batch(nullptr),
+          sync(false),
+          disableWAL(false),
+          in_batch_group(false),
+          done(false),
+          timeout_hint_us(kNoTimeOut),
+          cv(mu) {}
+  };
+
+  WriteThread() = default;
+  ~WriteThread() = default;
+
+  // Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
+  // thread should grab the mutex_ and be the first on writers queue.
+  // EnterWriteThread is used for it.
+  // Be aware! Writer's job can be done by other thread (see DBImpl::Write
+  // for examples), so check it via w.done before applying changes.
+  //
+  // Writer* w:                writer to be placed in the queue
+  // uint64_t expiration_time: maximum time to be in the queue
+  // See also: ExitWriteThread
+  // REQUIRES: db mutex held
+  Status EnterWriteThread(Writer* w, uint64_t expiration_time);
+
+  // After doing write job, we need to remove already used writers from
+  // writers_ queue and notify head of the queue about it.
+  // ExitWriteThread is used for this.
+  //
+  // Writer* w:           Writer, that was added by EnterWriteThread function
+  // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
+  //                      does)
+  //                      we should pass last_writer as a parameter to
+  //                      ExitWriteThread
+  //                      (if you don't touch other writers, just pass w)
+  // Status status:       Status of write operation
+  // See also: EnterWriteThread
+  // REQUIRES: db mutex held
+  void ExitWriteThread(Writer* w, Writer* last_writer, Status status);
+
+  void BuildBatchGroup(Writer** last_writer,
+                       autovector<WriteBatch*>* write_batch_group);
+
+ private:
+  // Queue of writers.
+  std::deque<Writer*> writers_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/writebuffer.h b/src/rocksdb/db/writebuffer.h
new file mode 100644
index 0000000..7047a92
--- /dev/null
+++ b/src/rocksdb/db/writebuffer.h
@@ -0,0 +1,44 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBuffer is for managing memory allocation for one or more MemTables.
+
+#pragma once
+
+namespace rocksdb {
+
+class WriteBuffer {
+ public:
+  explicit WriteBuffer(size_t _buffer_size)
+    : buffer_size_(_buffer_size), memory_used_(0) {}
+
+  ~WriteBuffer() {}
+
+  size_t memory_usage() const { return memory_used_; }
+  size_t buffer_size() const { return buffer_size_; }
+
+  // Should only be called from write thread
+  bool ShouldFlush() const {
+    return buffer_size() > 0 && memory_usage() >= buffer_size();
+  }
+
+  // Should only be called from write thread
+  void ReserveMem(size_t mem) { memory_used_ += mem; }
+  void FreeMem(size_t mem) { memory_used_ -= mem; }
+
+ private:
+  const size_t buffer_size_;
+  size_t memory_used_;
+
+  // No copying allowed
+  WriteBuffer(const WriteBuffer&);
+  void operator=(const WriteBuffer&);
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/doc/index.html b/src/rocksdb/doc/index.html
index 71f515e..94f7cb8 100644
--- a/src/rocksdb/doc/index.html
+++ b/src/rocksdb/doc/index.html
@@ -642,10 +642,6 @@ Default:1, i.e. pick maxfilesize amount of data as the source of a compaction.
 <li> <code>Options::max_grandparent_overlap_factor</code> -   Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
 stop building a single file in a level->level+1 compaction.
 <p>
-<li> <code>Options::disable_seek_compaction</code> -  Disable compaction triggered by seek.
-With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache
-(which is true if max_open_files is large).
-<p>
 <li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs, submitted to
 the default LOW priority thread pool
 </ul>
diff --git a/src/rocksdb/examples/.gitignore b/src/rocksdb/examples/.gitignore
new file mode 100644
index 0000000..5cb04d4
--- /dev/null
+++ b/src/rocksdb/examples/.gitignore
@@ -0,0 +1,4 @@
+column_families_example
+simple_example
+c_simple_example
+compact_files_example
diff --git a/src/rocksdb/examples/Makefile b/src/rocksdb/examples/Makefile
new file mode 100644
index 0000000..7bd88fb
--- /dev/null
+++ b/src/rocksdb/examples/Makefile
@@ -0,0 +1,23 @@
+include ../make_config.mk
+
+.PHONY: clean
+
+all: simple_example column_families_example compact_files_example c_simple_example
+
+simple_example: simple_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+column_families_example: column_families_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+compact_files_example: compact_files_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+.c.o:
+	$(CC) $(CFLAGS) -c $< -o $@ -I../include
+
+c_simple_example: c_simple_example.o
+	$(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS)
+
+clean:
+	rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o
diff --git a/src/rocksdb/examples/README.md b/src/rocksdb/examples/README.md
new file mode 100644
index 0000000..b07b390
--- /dev/null
+++ b/src/rocksdb/examples/README.md
@@ -0,0 +1 @@
+Compile RocksDB first by executing `make static_lib` in parent dir
diff --git a/src/rocksdb/examples/c_simple_example.c b/src/rocksdb/examples/c_simple_example.c
new file mode 100644
index 0000000..7a63827
--- /dev/null
+++ b/src/rocksdb/examples/c_simple_example.c
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "rocksdb/c.h"
+
+#include <unistd.h>  // sysconf() - get CPU count
+
+const char DBPath[] = "/tmp/rocksdb_simple_example";
+const char DBBackupPath[] = "/tmp/rocksdb_simple_example_backup";
+
+int main(int argc, char **argv) {
+  rocksdb_t *db;
+  rocksdb_backup_engine_t *be;
+  rocksdb_options_t *options = rocksdb_options_create();
+  // Optimize RocksDB. This is the easiest way to
+  // get RocksDB to perform well
+  long cpus = sysconf(_SC_NPROCESSORS_ONLN);  // get # of online cores
+  rocksdb_options_increase_parallelism(options, (int)(cpus));
+  rocksdb_options_optimize_level_style_compaction(options, 0);
+  // create the DB if it's not already present
+  rocksdb_options_set_create_if_missing(options, 1);
+
+  // open DB
+  char *err = NULL;
+  db = rocksdb_open(options, DBPath, &err);
+  assert(!err);
+
+  // open Backup Engine that we will use for backing up or database
+  be = rocksdb_backup_engine_open(options, DBBackupPath, &err);
+  assert(!err);
+
+  // Put key-value
+  rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create();
+  const char key[] = "key";
+  const char *value = "value";
+  rocksdb_put(db, writeoptions, key, strlen(key), value, strlen(value) + 1,
+              &err);
+  assert(!err);
+  // Get value
+  rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create();
+  size_t len;
+  char *returned_value =
+      rocksdb_get(db, readoptions, key, strlen(key), &len, &err);
+  assert(!err);
+  assert(strcmp(returned_value, "value") == 0);
+  free(returned_value);
+
+  // create new backup in a directory specified by DBBackupPath
+  rocksdb_backup_engine_create_new_backup(be, db, &err);
+  assert(!err);
+
+  rocksdb_close(db);
+
+  // If something is wrong, you might want to restore data from last backup
+  rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create();
+  rocksdb_backup_engine_restore_db_from_latest_backup(be, DBPath, DBPath,
+                                                      restore_options, &err);
+  assert(!err);
+  rocksdb_restore_options_destroy(restore_options);
+
+  db = rocksdb_open(options, DBPath, &err);
+  assert(!err);
+
+  // cleanup
+  rocksdb_writeoptions_destroy(writeoptions);
+  rocksdb_readoptions_destroy(readoptions);
+  rocksdb_options_destroy(options);
+  rocksdb_backup_engine_close(be);
+  rocksdb_close(db);
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/column_families_example.cc b/src/rocksdb/examples/column_families_example.cc
new file mode 100644
index 0000000..3ffac06
--- /dev/null
+++ b/src/rocksdb/examples/column_families_example.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/options.h"
+
+using namespace rocksdb;
+
+std::string kDBPath = "/tmp/rocksdb_column_families_example";
+
+int main() {
+  // open DB
+  Options options;
+  options.create_if_missing = true;
+  DB* db;
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  // create column family
+  ColumnFamilyHandle* cf;
+  s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf", &cf);
+  assert(s.ok());
+
+  // close DB
+  delete cf;
+  delete db;
+
+  // open DB with two column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(ColumnFamilyDescriptor(
+      kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new one, too
+  column_families.push_back(ColumnFamilyDescriptor(
+      "new_cf", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  s = DB::Open(DBOptions(), kDBPath, column_families, &handles, &db);
+  assert(s.ok());
+
+  // put and get from non-default column family
+  s = db->Put(WriteOptions(), handles[1], Slice("key"), Slice("value"));
+  assert(s.ok());
+  std::string value;
+  s = db->Get(ReadOptions(), handles[1], Slice("key"), &value);
+  assert(s.ok());
+
+  // atomic write
+  WriteBatch batch;
+  batch.Put(handles[0], Slice("key2"), Slice("value2"));
+  batch.Put(handles[1], Slice("key3"), Slice("value3"));
+  batch.Delete(handles[0], Slice("key"));
+  s = db->Write(WriteOptions(), &batch);
+  assert(s.ok());
+
+  // drop column family
+  s = db->DropColumnFamily(handles[1]);
+  assert(s.ok());
+
+  // close db
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db;
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/compact_files_example.cc b/src/rocksdb/examples/compact_files_example.cc
new file mode 100644
index 0000000..3e7638b
--- /dev/null
+++ b/src/rocksdb/examples/compact_files_example.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// An example code demonstrating how to use CompactFiles, EventListener,
+// and GetColumnFamilyMetaData APIs to implement custom compaction algorithm.
+
+#include <mutex>
+#include <string>
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+
+using namespace rocksdb;
+std::string kDBPath = "/tmp/rocksdb_compact_files_example";
+class CompactionTask;
+
+// This is an example interface of external-compaction algorithm.
+// Compaction algorithm can be implemented outside the core-RocksDB
+// code by using the pluggable compaction APIs that RocksDb provides.
+class Compactor : public EventListener {
+ public:
+  // Picks and returns a compaction task given the specified DB
+  // and column family.  It is the caller's responsibility to
+  // destroy the returned CompactionTask.  Returns "nullptr"
+  // if it cannot find a proper compaction task.
+  virtual CompactionTask* PickCompaction(
+      DB* db, const std::string& cf_name) = 0;
+
+  // Schedule and run the specified compaction task in background.
+  virtual void ScheduleCompaction(CompactionTask *task) = 0;
+};
+
+// Example structure that describes a compaction task.
+struct CompactionTask {
+  CompactionTask(
+      DB* db, Compactor* compactor,
+      const std::string& column_family_name,
+      const std::vector<std::string>& input_file_names,
+      const int output_level,
+      const CompactionOptions& compact_options,
+      bool retry_on_fail)
+          : db(db),
+            compactor(compactor),
+            column_family_name(column_family_name),
+            input_file_names(input_file_names),
+            output_level(output_level),
+            compact_options(compact_options),
+            retry_on_fail(false) {}
+  DB* db;
+  Compactor* compactor;
+  const std::string& column_family_name;
+  std::vector<std::string> input_file_names;
+  int output_level;
+  CompactionOptions compact_options;
+  bool retry_on_fail;
+};
+
+// A simple compaction algorithm that always compacts everything
+// to the highest level whenever possible.
+class FullCompactor : public Compactor {
+ public:
+  explicit FullCompactor(const Options options) : options_(options) {
+    compact_options_.compression = options_.compression;
+    compact_options_.output_file_size_limit =
+        options_.target_file_size_base;
+  }
+
+  // When flush happens, it determins whether to trigger compaction.
+  // If triggered_writes_stop is true, it will also set the retry
+  // flag of compaction-task to true.
+  void OnFlushCompleted(
+      DB* db, const std::string& cf_name,
+      const std::string& file_path,
+      bool triggered_writes_slowdown,
+      bool triggered_writes_stop) override {
+    CompactionTask* task = PickCompaction(db, cf_name);
+    if (task != nullptr) {
+      if (triggered_writes_stop) {
+        task->retry_on_fail = true;
+      }
+      // Schedule compaction in a different thread.
+      ScheduleCompaction(task);
+    }
+  }
+
+  // Always pick a compaction which includes all files whenever possible.
+  CompactionTask* PickCompaction(
+      DB* db, const std::string& cf_name) override {
+    ColumnFamilyMetaData cf_meta;
+    db->GetColumnFamilyMetaData(&cf_meta);
+
+    std::vector<std::string> input_file_names;
+    for (auto level : cf_meta.levels) {
+      for (auto file : level.files) {
+        if (file.being_compacted) {
+          return nullptr;
+        }
+        input_file_names.push_back(file.name);
+      }
+    }
+    return new CompactionTask(
+        db, this, cf_name, input_file_names,
+        options_.num_levels - 1, compact_options_, false);
+  }
+
+  // Schedule the specified compaction task in background.
+  void ScheduleCompaction(CompactionTask* task) override {
+    options_.env->Schedule(&FullCompactor::CompactFiles, task);
+  }
+
+  static void CompactFiles(void* arg) {
+    CompactionTask* task = reinterpret_cast<CompactionTask*>(arg);
+    assert(task);
+    assert(task->db);
+    Status s = task->db->CompactFiles(
+        task->compact_options,
+        task->input_file_names,
+        task->output_level);
+    printf("CompactFiles() finished with status %s\n", s.ToString().c_str());
+    if (!s.ok() && !s.IsIOError() && task->retry_on_fail) {
+      // If a compaction task with its retry_on_fail=true failed,
+      // try to schedule another compaction in case the reason
+      // is not an IO error.
+      CompactionTask* new_task = task->compactor->PickCompaction(
+          task->db, task->column_family_name);
+      task->compactor->ScheduleCompaction(new_task);
+    }
+    // release the task
+    delete task;
+  }
+
+ private:
+  Options options_;
+  CompactionOptions compact_options_;
+};
+
+int main() {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  // Small slowdown and stop trigger for experimental purpose.
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 5;
+  options.IncreaseParallelism(5);
+  options.listeners.emplace_back(new FullCompactor(options));
+
+  DB* db = nullptr;
+  DestroyDB(kDBPath, options);
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+  assert(db);
+
+  // if background compaction is not working, write will stall
+  // because of options.level0_stop_writes_trigger
+  for (int i = 1000; i < 99999; ++i) {
+    db->Put(WriteOptions(), std::to_string(i),
+                            std::string(500, 'a' + (i % 26)));
+  }
+
+  // verify the values are still there
+  std::string value;
+  for (int i = 1000; i < 99999; ++i) {
+    db->Get(ReadOptions(), std::to_string(i),
+                           &value);
+    assert(value == std::string(500, 'a' + (i % 26)));
+  }
+
+  // close the db.
+  delete db;
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/simple_example.cc b/src/rocksdb/examples/simple_example.cc
new file mode 100644
index 0000000..28a7c9e
--- /dev/null
+++ b/src/rocksdb/examples/simple_example.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#include <cstdio>
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/options.h"
+
+using namespace rocksdb;
+
+std::string kDBPath = "/tmp/rocksdb_simple_example";
+
+int main() {
+  DB* db;
+  Options options;
+  // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
+  options.IncreaseParallelism();
+  options.OptimizeLevelStyleCompaction();
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // open DB
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  // Put key-value
+  s = db->Put(WriteOptions(), "key1", "value");
+  assert(s.ok());
+  std::string value;
+  // get value
+  s = db->Get(ReadOptions(), "key1", &value);
+  assert(s.ok());
+  assert(value == "value");
+
+  // atomically apply a set of updates
+  {
+    WriteBatch batch;
+    batch.Delete("key1");
+    batch.Put("key2", value);
+    s = db->Write(WriteOptions(), &batch);
+  }
+
+  s = db->Get(ReadOptions(), "key1", &value);
+  assert(s.IsNotFound());
+
+  db->Get(ReadOptions(), "key2", &value);
+  assert(value == "value");
+
+  delete db;
+
+  return 0;
+}
diff --git a/src/rocksdb/hdfs/README b/src/rocksdb/hdfs/README
index 9b7d0a6..f4f1106 100644
--- a/src/rocksdb/hdfs/README
+++ b/src/rocksdb/hdfs/README
@@ -1,19 +1,16 @@
 This directory contains the hdfs extensions needed to make rocksdb store
 files in HDFS.
 
-The hdfs.h file is copied from the Apache Hadoop 1.0 source code. 
-It defines the libhdfs library
-(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access 
-data in HDFS.  The libhdfs.a is copied from the Apache Hadoop 1.0 build. 
-It implements the API defined in hdfs.h. If your hadoop cluster is running
-a different hadoop release, then install these two files manually from your
-hadoop distribution and then recompile rocksdb.
+It has been compiled and testing against CDH 4.4 (2.0.0+1475-1.cdh4.4.0.p0.23~precise-cdh4.4.0).
+
+The configuration assumes that packages libhdfs0, libhdfs0-dev are 
+installed which basically means that hdfs.h is in /usr/include and libhdfs in /usr/lib
 
 The env_hdfs.h file defines the rocksdb objects that are needed to talk to an
 underlying filesystem. 
 
 If you want to compile rocksdb with hdfs support, please set the following
-enviroment variables appropriately:
+enviroment variables appropriately (also defined in setup.sh for convenience)
    USE_HDFS=1
    JAVA_HOME=/usr/local/jdk-6u22-64
    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs
diff --git a/src/rocksdb/hdfs/env_hdfs.h b/src/rocksdb/hdfs/env_hdfs.h
index 303cd81..cc94d52 100644
--- a/src/rocksdb/hdfs/env_hdfs.h
+++ b/src/rocksdb/hdfs/env_hdfs.h
@@ -14,13 +14,10 @@
 #include "rocksdb/status.h"
 
 #ifdef USE_HDFS
-#include "hdfs/hdfs.h"
+#include <hdfs.h>
 
 namespace rocksdb {
 
-static const std::string kProto = "hdfs://";
-static const std::string pathsep = "/";
-
 // Thrown during execution when there is an issue with the supplied
 // arguments.
 class HdfsUsageException : public std::exception { };
@@ -58,20 +55,23 @@ class HdfsEnv : public Env {
   }
 
   virtual Status NewSequentialFile(const std::string& fname,
-                                   SequentialFile** result);
+                                   std::unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options);
 
   virtual Status NewRandomAccessFile(const std::string& fname,
-                                     RandomAccessFile** result);
+                                     std::unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options);
 
   virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result);
+                                 std::unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options);
 
   virtual Status NewRandomRWFile(const std::string& fname,
-                                 unique_ptr<RandomRWFile>* result,
+                                 std::unique_ptr<RandomRWFile>* result,
                                  const EnvOptions& options);
 
   virtual Status NewDirectory(const std::string& name,
-                              unique_ptr<Directory>* result);
+                              std::unique_ptr<Directory>* result);
 
   virtual bool FileExists(const std::string& fname);
 
@@ -93,15 +93,22 @@ class HdfsEnv : public Env {
 
   virtual Status RenameFile(const std::string& src, const std::string& target);
 
+  virtual Status LinkFile(const std::string& src, const std::string& target);
+
   virtual Status LockFile(const std::string& fname, FileLock** lock);
 
   virtual Status UnlockFile(FileLock* lock);
 
-  virtual Status NewLogger(const std::string& fname, Logger** result);
+  virtual Status NewLogger(const std::string& fname,
+                           std::shared_ptr<Logger>* result);
 
   virtual void Schedule(void (*function)(void* arg), void* arg,
-                        Priority pri = LOW) {
-    posixEnv->Schedule(function, arg, pri);
+                        Priority pri = LOW, void* tag = nullptr) {
+    posixEnv->Schedule(function, arg, pri, tag);
+  }
+
+  virtual int UnSchedule(void* tag, Priority pri) {
+    posixEnv->UnSchedule(tag, pri);
   }
 
   virtual void StartThread(void (*function)(void* arg), void* arg) {
@@ -144,6 +151,10 @@ class HdfsEnv : public Env {
     posixEnv->SetBackgroundThreads(number, pri);
   }
 
+  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override {
+    posixEnv->IncBackgroundThreadsIfNeeded(number, pri);
+  }
+
   virtual std::string TimeToString(uint64_t number) {
     return posixEnv->TimeToString(number);
   }
@@ -161,6 +172,9 @@ class HdfsEnv : public Env {
                         // object here so that we can use posix timers,
                         // posix threads, etc.
 
+  static const std::string kProto;
+  static const std::string pathsep;
+
   /**
    * If the URI is specified of the form hdfs://server:port/path,
    * then connect to the specified cluster
@@ -228,7 +242,7 @@ class HdfsEnv : public Env {
   explicit HdfsEnv(const std::string& fsname) {
     fprintf(stderr, "You have not build rocksdb with HDFS support\n");
     fprintf(stderr, "Please see hdfs/README for details\n");
-    throw std::exception();
+    abort();
   }
 
   virtual ~HdfsEnv() {
@@ -236,87 +250,116 @@ class HdfsEnv : public Env {
 
   virtual Status NewSequentialFile(const std::string& fname,
                                    unique_ptr<SequentialFile>* result,
-                                   const EnvOptions& options);
+                                   const EnvOptions& options) override;
 
   virtual Status NewRandomAccessFile(const std::string& fname,
                                      unique_ptr<RandomAccessFile>* result,
-                                     const EnvOptions& options) {
+                                     const EnvOptions& options) override {
     return notsup;
   }
 
   virtual Status NewWritableFile(const std::string& fname,
                                  unique_ptr<WritableFile>* result,
-                                 const EnvOptions& options) {
+                                 const EnvOptions& options) override {
     return notsup;
   }
 
   virtual Status NewRandomRWFile(const std::string& fname,
                                  unique_ptr<RandomRWFile>* result,
-                                 const EnvOptions& options) {
+                                 const EnvOptions& options) override {
     return notsup;
   }
 
   virtual Status NewDirectory(const std::string& name,
-                              unique_ptr<Directory>* result) {
+                              unique_ptr<Directory>* result) override {
     return notsup;
   }
 
-  virtual bool FileExists(const std::string& fname){return false;}
+  virtual bool FileExists(const std::string& fname) override { return false; }
 
   virtual Status GetChildren(const std::string& path,
-                             std::vector<std::string>* result){return notsup;}
+                             std::vector<std::string>* result) override {
+    return notsup;
+  }
 
-  virtual Status DeleteFile(const std::string& fname){return notsup;}
+  virtual Status DeleteFile(const std::string& fname) override {
+    return notsup;
+  }
 
-  virtual Status CreateDir(const std::string& name){return notsup;}
+  virtual Status CreateDir(const std::string& name) override { return notsup; }
 
-  virtual Status CreateDirIfMissing(const std::string& name){return notsup;}
+  virtual Status CreateDirIfMissing(const std::string& name) override {
+    return notsup;
+  }
 
-  virtual Status DeleteDir(const std::string& name){return notsup;}
+  virtual Status DeleteDir(const std::string& name) override { return notsup; }
 
-  virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;}
+  virtual Status GetFileSize(const std::string& fname,
+                             uint64_t* size) override {
+    return notsup;
+  }
 
   virtual Status GetFileModificationTime(const std::string& fname,
-                                         uint64_t* time) {
+                                         uint64_t* time) override {
     return notsup;
   }
 
-  virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;}
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) override {
+    return notsup;
+  }
 
-  virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;}
+  virtual Status LinkFile(const std::string& src,
+                          const std::string& target) override {
+    return notsup;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) override {
+    return notsup;
+  }
 
-  virtual Status UnlockFile(FileLock* lock){return notsup;}
+  virtual Status UnlockFile(FileLock* lock) override { return notsup; }
 
   virtual Status NewLogger(const std::string& fname,
-                           shared_ptr<Logger>* result){return notsup;}
+                           shared_ptr<Logger>* result) override {
+    return notsup;
+  }
 
   virtual void Schedule(void (*function)(void* arg), void* arg,
-                        Priority pri = LOW) {}
+                        Priority pri = LOW, void* tag = nullptr) override {}
+
+  virtual int UnSchedule(void* tag, Priority pri) override { return 0; }
 
-  virtual void StartThread(void (*function)(void* arg), void* arg) {}
+  virtual void StartThread(void (*function)(void* arg), void* arg) override {}
 
-  virtual void WaitForJoin() {}
+  virtual void WaitForJoin() override {}
 
-  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+  virtual unsigned int GetThreadPoolQueueLen(
+      Priority pri = LOW) const override {
     return 0;
   }
 
-  virtual Status GetTestDirectory(std::string* path) {return notsup;}
+  virtual Status GetTestDirectory(std::string* path) override { return notsup; }
 
-  virtual uint64_t NowMicros() {return 0;}
+  virtual uint64_t NowMicros() override { return 0; }
 
-  virtual void SleepForMicroseconds(int micros) {}
+  virtual void SleepForMicroseconds(int micros) override {}
 
-  virtual Status GetHostName(char* name, uint64_t len) {return notsup;}
+  virtual Status GetHostName(char* name, uint64_t len) override {
+    return notsup;
+  }
 
-  virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;}
+  virtual Status GetCurrentTime(int64_t* unix_time) override { return notsup; }
 
   virtual Status GetAbsolutePath(const std::string& db_path,
-      std::string* outputpath) {return notsup;}
-
-  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {}
+                                 std::string* outputpath) override {
+    return notsup;
+  }
 
-  virtual std::string TimeToString(uint64_t number) { return "";}
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) override {}
+  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override {
+  }
+  virtual std::string TimeToString(uint64_t number) override { return ""; }
 };
 }
 
diff --git a/src/rocksdb/hdfs/hdfs.h b/src/rocksdb/hdfs/hdfs.h
deleted file mode 100644
index 8e8dfec..0000000
--- a/src/rocksdb/hdfs/hdfs.h
+++ /dev/null
@@ -1,477 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-#ifndef LIBHDFS_HDFS_H
-#define LIBHDFS_HDFS_H
-
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdlib.h>
-#include <time.h>
-#include <errno.h>
-
-#include <jni.h>
-
-#ifndef O_RDONLY
-#define O_RDONLY 1
-#endif
-
-#ifndef O_WRONLY 
-#define O_WRONLY 2
-#endif
-
-#ifndef EINTERNAL
-#define EINTERNAL 255 
-#endif
-
-
-/** All APIs set errno to meaningful values */
-#ifdef __cplusplus
-extern  "C" {
-#endif
-
-    /**
-     * Some utility decls used in libhdfs.
-     */
-
-    typedef int32_t   tSize; /// size of data for read/write io ops 
-    typedef time_t    tTime; /// time type in seconds
-    typedef int64_t   tOffset;/// offset within the file
-    typedef uint16_t  tPort; /// port
-    typedef enum tObjectKind {
-        kObjectKindFile = 'F',
-        kObjectKindDirectory = 'D',
-    } tObjectKind;
-
-
-    /**
-     * The C reflection of org.apache.org.hadoop.FileSystem .
-     */
-    typedef void* hdfsFS;
-
-    
-    /**
-     * The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream .
-     */
-    enum hdfsStreamType
-    {
-        UNINITIALIZED = 0,
-        INPUT = 1,
-        OUTPUT = 2,
-    };
-
-    
-    /**
-     * The 'file-handle' to a file in hdfs.
-     */
-    struct hdfsFile_internal {
-        void* file;
-        enum hdfsStreamType type;
-    };
-    typedef struct hdfsFile_internal* hdfsFile;
-      
-
-    /** 
-     * hdfsConnectAsUser - Connect to a hdfs file system as a specific user
-     * Connect to the hdfs.
-     * @param host A string containing either a host name, or an ip address
-     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
-     * you want to connect to local filesystem. 'host' should be passed as
-     * 'default' (and port as 0) to used the 'configured' filesystem
-     * (core-site/core-default.xml).
-     * @param port The port on which the server is listening.
-     * @param user the user name (this is hadoop domain user). Or NULL is equivelant to hhdfsConnect(host, port)
-     * @param groups the groups (these are hadoop domain groups)
-     * @return Returns a handle to the filesystem or NULL on error.
-     */
-     hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
-
-
-    /** 
-     * hdfsConnect - Connect to a hdfs file system.
-     * Connect to the hdfs.
-     * @param host A string containing either a host name, or an ip address
-     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
-     * you want to connect to local filesystem. 'host' should be passed as
-     * 'default' (and port as 0) to used the 'configured' filesystem
-     * (core-site/core-default.xml).
-     * @param port The port on which the server is listening.
-     * @return Returns a handle to the filesystem or NULL on error.
-     */
-     hdfsFS hdfsConnect(const char* host, tPort port);
-
-
-    /**
-     * This are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle.
-     * Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance.
-     */
-     hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
-     hdfsFS hdfsConnectNewInstance(const char* host, tPort port);
-     hdfsFS hdfsConnectPath(const char* uri);
-
-    /** 
-     * hdfsDisconnect - Disconnect from the hdfs file system.
-     * Disconnect from hdfs.
-     * @param fs The configured filesystem handle.
-     * @return Returns 0 on success, -1 on error.  
-     */
-    int hdfsDisconnect(hdfsFS fs);
-        
-
-    /** 
-     * hdfsOpenFile - Open a hdfs file in given mode.
-     * @param fs The configured filesystem handle.
-     * @param path The full path to the file.
-     * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNCAT), 
-     * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP.
-     * @param bufferSize Size of buffer for read/write - pass 0 if you want
-     * to use the default configured values.
-     * @param replication Block replication - pass 0 if you want to use
-     * the default configured values.
-     * @param blocksize Size of block - pass 0 if you want to use the
-     * default configured values.
-     * @return Returns the handle to the open file or NULL on error.
-     */
-    hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
-                          int bufferSize, short replication, tSize blocksize);
-
-
-    /** 
-     * hdfsCloseFile - Close an open file. 
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @return Returns 0 on success, -1 on error.  
-     */
-    int hdfsCloseFile(hdfsFS fs, hdfsFile file);
-
-
-    /** 
-     * hdfsExists - Checks if a given path exsits on the filesystem 
-     * @param fs The configured filesystem handle.
-     * @param path The path to look for
-     * @return Returns 0 on exists, 1 on non-exists, -1/-2 on error.  
-     */
-    int hdfsExists(hdfsFS fs, const char *path);
-
-
-    /** 
-     * hdfsSeek - Seek to given offset in file. 
-     * This works only for files opened in read-only mode. 
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @param desiredPos Offset into the file to seek into.
-     * @return Returns 0 on success, -1 on error.  
-     */
-    int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos); 
-
-
-    /** 
-     * hdfsTell - Get the current offset in the file, in bytes.
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @return Current offset, -1 on error.
-     */
-    tOffset hdfsTell(hdfsFS fs, hdfsFile file);
-
-
-    /** 
-     * hdfsRead - Read data from an open file.
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @param buffer The buffer to copy read bytes into.
-     * @param length The length of the buffer.
-     * @return Returns the number of bytes actually read, possibly less
-     * than than length;-1 on error.
-     */
-    tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
-
-
-    /** 
-     * hdfsPread - Positional read of data from an open file.
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @param position Position from which to read
-     * @param buffer The buffer to copy read bytes into.
-     * @param length The length of the buffer.
-     * @return Returns the number of bytes actually read, possibly less than
-     * than length;-1 on error.
-     */
-    tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
-                    void* buffer, tSize length);
-
-
-    /** 
-     * hdfsWrite - Write data into an open file.
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @param buffer The data.
-     * @param length The no. of bytes to write. 
-     * @return Returns the number of bytes written, -1 on error.
-     */
-    tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
-                    tSize length);
-
-
-    /** 
-     * hdfsWrite - Flush the data. 
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsFlush(hdfsFS fs, hdfsFile file);
-
-    /**
-     * hdfsSync - Sync the data to persistent store.
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @return Returns 0 on success, -1 on error.
-     */
-    int hdfsSync(hdfsFS fs, hdfsFile file);
-
-    /**
-     * hdfsGetNumReplicasInPipeline - get number of remaining replicas in 
-     * pipeline
-     * @param fs The configured filesystem handle
-     * @param file the file handle
-     * @return returns the # of datanodes in the write pipeline; -1 on error
-     */
-   int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file);
-
-    /**
-     * hdfsAvailable - Number of bytes that can be read from this
-     * input stream without blocking.
-     * @param fs The configured filesystem handle.
-     * @param file The file handle.
-     * @return Returns available bytes; -1 on error. 
-     */
-    int hdfsAvailable(hdfsFS fs, hdfsFile file);
-
-
-    /**
-     * hdfsCopy - Copy file from one filesystem to another.
-     * @param srcFS The handle to source filesystem.
-     * @param src The path of source file. 
-     * @param dstFS The handle to destination filesystem.
-     * @param dst The path of destination file. 
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
-
-
-    /**
-     * hdfsMove - Move file from one filesystem to another.
-     * @param srcFS The handle to source filesystem.
-     * @param src The path of source file. 
-     * @param dstFS The handle to destination filesystem.
-     * @param dst The path of destination file. 
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
-
-
-    /**
-     * hdfsDelete - Delete file. 
-     * @param fs The configured filesystem handle.
-     * @param path The path of the file. 
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsDelete(hdfsFS fs, const char* path);
-
-
-    /**
-     * hdfsRename - Rename file. 
-     * @param fs The configured filesystem handle.
-     * @param oldPath The path of the source file. 
-     * @param newPath The path of the destination file. 
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
-
-
-    /** 
-     * hdfsGetWorkingDirectory - Get the current working directory for
-     * the given filesystem.
-     * @param fs The configured filesystem handle.
-     * @param buffer The user-buffer to copy path of cwd into. 
-     * @param bufferSize The length of user-buffer.
-     * @return Returns buffer, NULL on error.
-     */
-    char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
-
-
-    /** 
-     * hdfsSetWorkingDirectory - Set the working directory. All relative
-     * paths will be resolved relative to it.
-     * @param fs The configured filesystem handle.
-     * @param path The path of the new 'cwd'. 
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsSetWorkingDirectory(hdfsFS fs, const char* path);
-
-
-    /** 
-     * hdfsCreateDirectory - Make the given file and all non-existent
-     * parents into directories.
-     * @param fs The configured filesystem handle.
-     * @param path The path of the directory. 
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsCreateDirectory(hdfsFS fs, const char* path);
-
-
-    /** 
-     * hdfsSetReplication - Set the replication of the specified
-     * file to the supplied value
-     * @param fs The configured filesystem handle.
-     * @param path The path of the file. 
-     * @return Returns 0 on success, -1 on error. 
-     */
-    int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication);
-
-
-    /** 
-     * hdfsFileInfo - Information about a file/directory.
-     */
-    typedef struct  {
-        tObjectKind mKind;   /* file or directory */
-        char *mName;         /* the name of the file */
-        tTime mLastMod;      /* the last modification time for the file in seconds */
-        tOffset mSize;       /* the size of the file in bytes */
-        short mReplication;    /* the count of replicas */
-        tOffset mBlockSize;  /* the block size for the file */
-        char *mOwner;        /* the owner of the file */
-        char *mGroup;        /* the group associated with the file */
-        short mPermissions;  /* the permissions associated with the file */
-        tTime mLastAccess;    /* the last access time for the file in seconds */
-    } hdfsFileInfo;
-
-
-    /** 
-     * hdfsListDirectory - Get list of files/directories for a given
-     * directory-path. hdfsFreeFileInfo should be called to deallocate memory if
-     * the function returns non-NULL value.
-     * @param fs The configured filesystem handle.
-     * @param path The path of the directory. 
-     * @param numEntries Set to the number of files/directories in path.
-     * @return Returns a dynamically-allocated array of hdfsFileInfo
-     * objects; NULL if empty or on error.
-     * on error, numEntries will be -1.
-     */
-    hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path,
-                                    int *numEntries);
-
-
-    /** 
-     * hdfsGetPathInfo - Get information about a path as a (dynamically
-     * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
-     * called when the pointer is no longer needed.
-     * @param fs The configured filesystem handle.
-     * @param path The path of the file. 
-     * @return Returns a dynamically-allocated hdfsFileInfo object;
-     * NULL on error.
-     */
-    hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
-
-
-    /** 
-     * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields) 
-     * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
-     * objects.
-     * @param numEntries The size of the array.
-     */
-    void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
-
-
-    /** 
-     * hdfsGetHosts - Get hostnames where a particular block (determined by
-     * pos & blocksize) of a file is stored. The last element in the array
-     * is NULL. Due to replication, a single block could be present on
-     * multiple hosts.
-     * @param fs The configured filesystem handle.
-     * @param path The path of the file. 
-     * @param start The start of the block.
-     * @param length The length of the block.
-     * @return Returns a dynamically-allocated 2-d array of blocks-hosts;
-     * NULL on error.
-     */
-    char*** hdfsGetHosts(hdfsFS fs, const char* path, 
-            tOffset start, tOffset length);
-
-
-    /** 
-     * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
-     * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
-     * objects.
-     * @param numEntries The size of the array.
-     */
-    void hdfsFreeHosts(char ***blockHosts);
-
-
-    /** 
-     * hdfsGetDefaultBlockSize - Get the optimum blocksize.
-     * @param fs The configured filesystem handle.
-     * @return Returns the blocksize; -1 on error. 
-     */
-    tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
-
-
-    /** 
-     * hdfsGetCapacity - Return the raw capacity of the filesystem.  
-     * @param fs The configured filesystem handle.
-     * @return Returns the raw-capacity; -1 on error. 
-     */
-    tOffset hdfsGetCapacity(hdfsFS fs);
-
-
-    /** 
-     * hdfsGetUsed - Return the total raw size of all files in the filesystem.
-     * @param fs The configured filesystem handle.
-     * @return Returns the total-size; -1 on error. 
-     */
-    tOffset hdfsGetUsed(hdfsFS fs);
-
-    /** 
-     * hdfsChown 
-     * @param fs The configured filesystem handle.
-     * @param path the path to the file or directory
-     * @param owner this is a string in Hadoop land. Set to null or "" if only setting group
-     * @param group  this is a string in Hadoop land. Set to null or "" if only setting user
-     * @return 0 on success else -1
-     */
-    int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group);
-
-    /** 
-     * hdfsChmod
-     * @param fs The configured filesystem handle.
-     * @param path the path to the file or directory
-     * @param mode the bitmask to set it to
-     * @return 0 on success else -1
-     */
-      int hdfsChmod(hdfsFS fs, const char* path, short mode);
-
-    /** 
-     * hdfsUtime
-     * @param fs The configured filesystem handle.
-     * @param path the path to the file or directory
-     * @param mtime new modification time or 0 for only set access time in seconds
-     * @param atime new access time or 0 for only set modification time in seconds
-     * @return 0 on success else -1
-     */
-    int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
-    
-#ifdef __cplusplus
-}
-#endif
-
-#endif /*LIBHDFS_HDFS_H*/
-
-/**
- * vim: ts=4: sw=4: et
- */
diff --git a/src/rocksdb/hdfs/libhdfs.a b/src/rocksdb/hdfs/libhdfs.a
deleted file mode 100644
index 4d1f19f..0000000
Binary files a/src/rocksdb/hdfs/libhdfs.a and /dev/null differ
diff --git a/src/rocksdb/hdfs/setup.sh b/src/rocksdb/hdfs/setup.sh
new file mode 100644
index 0000000..ac69b52
--- /dev/null
+++ b/src/rocksdb/hdfs/setup.sh
@@ -0,0 +1,7 @@
+export USE_HDFS=1
+export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64/server:$JAVA_HOME/jre/lib/amd64:/usr/lib/hadoop/lib/native
+
+export CLASSPATH=
+for f in `find /usr/lib/hadoop-hdfs | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
+for f in `find /usr/lib/hadoop | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
+for f in `find /usr/lib/hadoop/client | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
diff --git a/src/rocksdb/helpers/memenv/memenv.cc b/src/rocksdb/helpers/memenv/memenv.cc
deleted file mode 100644
index 185e7d8..0000000
--- a/src/rocksdb/helpers/memenv/memenv.cc
+++ /dev/null
@@ -1,395 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "rocksdb/env.h"
-#include "rocksdb/status.h"
-#include "port/port.h"
-#include "util/mutexlock.h"
-#include <map>
-#include <string.h>
-#include <string>
-#include <vector>
-
-namespace rocksdb {
-
-namespace {
-
-class FileState {
- public:
-  // FileStates are reference counted. The initial reference count is zero
-  // and the caller must call Ref() at least once.
-  FileState() : refs_(0), size_(0) {}
-
-  // Increase the reference count.
-  void Ref() {
-    MutexLock lock(&refs_mutex_);
-    ++refs_;
-  }
-
-  // Decrease the reference count. Delete if this is the last reference.
-  void Unref() {
-    bool do_delete = false;
-
-    {
-      MutexLock lock(&refs_mutex_);
-      --refs_;
-      assert(refs_ >= 0);
-      if (refs_ <= 0) {
-        do_delete = true;
-      }
-    }
-
-    if (do_delete) {
-      delete this;
-    }
-  }
-
-  uint64_t Size() const { return size_; }
-
-  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
-    if (offset > size_) {
-      return Status::IOError("Offset greater than file size.");
-    }
-    const uint64_t available = size_ - offset;
-    if (n > available) {
-      n = available;
-    }
-    if (n == 0) {
-      *result = Slice();
-      return Status::OK();
-    }
-
-    size_t block = offset / kBlockSize;
-    size_t block_offset = offset % kBlockSize;
-
-    if (n <= kBlockSize - block_offset) {
-      // The requested bytes are all in the first block.
-      *result = Slice(blocks_[block] + block_offset, n);
-      return Status::OK();
-    }
-
-    size_t bytes_to_copy = n;
-    char* dst = scratch;
-
-    while (bytes_to_copy > 0) {
-      size_t avail = kBlockSize - block_offset;
-      if (avail > bytes_to_copy) {
-        avail = bytes_to_copy;
-      }
-      memcpy(dst, blocks_[block] + block_offset, avail);
-
-      bytes_to_copy -= avail;
-      dst += avail;
-      block++;
-      block_offset = 0;
-    }
-
-    *result = Slice(scratch, n);
-    return Status::OK();
-  }
-
-  Status Append(const Slice& data) {
-    const char* src = data.data();
-    size_t src_len = data.size();
-
-    while (src_len > 0) {
-      size_t avail;
-      size_t offset = size_ % kBlockSize;
-
-      if (offset != 0) {
-        // There is some room in the last block.
-        avail = kBlockSize - offset;
-      } else {
-        // No room in the last block; push new one.
-        blocks_.push_back(new char[kBlockSize]);
-        avail = kBlockSize;
-      }
-
-      if (avail > src_len) {
-        avail = src_len;
-      }
-      memcpy(blocks_.back() + offset, src, avail);
-      src_len -= avail;
-      src += avail;
-      size_ += avail;
-    }
-
-    return Status::OK();
-  }
-
- private:
-  // Private since only Unref() should be used to delete it.
-  ~FileState() {
-    for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
-         ++i) {
-      delete [] *i;
-    }
-  }
-
-  // No copying allowed.
-  FileState(const FileState&);
-  void operator=(const FileState&);
-
-  port::Mutex refs_mutex_;
-  int refs_;  // Protected by refs_mutex_;
-
-  // The following fields are not protected by any mutex. They are only mutable
-  // while the file is being written, and concurrent access is not allowed
-  // to writable files.
-  std::vector<char*> blocks_;
-  uint64_t size_;
-
-  enum { kBlockSize = 8 * 1024 };
-};
-
-class SequentialFileImpl : public SequentialFile {
- public:
-  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
-    file_->Ref();
-  }
-
-  ~SequentialFileImpl() {
-    file_->Unref();
-  }
-
-  virtual Status Read(size_t n, Slice* result, char* scratch) {
-    Status s = file_->Read(pos_, n, result, scratch);
-    if (s.ok()) {
-      pos_ += result->size();
-    }
-    return s;
-  }
-
-  virtual Status Skip(uint64_t n) {
-    if (pos_ > file_->Size()) {
-      return Status::IOError("pos_ > file_->Size()");
-    }
-    const size_t available = file_->Size() - pos_;
-    if (n > available) {
-      n = available;
-    }
-    pos_ += n;
-    return Status::OK();
-  }
-
- private:
-  FileState* file_;
-  size_t pos_;
-};
-
-class RandomAccessFileImpl : public RandomAccessFile {
- public:
-  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
-    file_->Ref();
-  }
-
-  ~RandomAccessFileImpl() {
-    file_->Unref();
-  }
-
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const {
-    return file_->Read(offset, n, result, scratch);
-  }
-
- private:
-  FileState* file_;
-};
-
-class WritableFileImpl : public WritableFile {
- public:
-  WritableFileImpl(FileState* file) : file_(file) {
-    file_->Ref();
-  }
-
-  ~WritableFileImpl() {
-    file_->Unref();
-  }
-
-  virtual Status Append(const Slice& data) {
-    return file_->Append(data);
-  }
-
-  virtual Status Close() { return Status::OK(); }
-  virtual Status Flush() { return Status::OK(); }
-  virtual Status Sync() { return Status::OK(); }
-
- private:
-  FileState* file_;
-};
-
-class InMemoryDirectory : public Directory {
- public:
-  virtual Status Fsync() { return Status::OK(); }
-};
-
-class InMemoryEnv : public EnvWrapper {
- public:
-  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
-
-  virtual ~InMemoryEnv() {
-    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
-      i->second->Unref();
-    }
-  }
-
-  // Partial implementation of the Env interface.
-  virtual Status NewSequentialFile(const std::string& fname,
-                                   unique_ptr<SequentialFile>* result,
-                                   const EnvOptions& soptions) {
-    MutexLock lock(&mutex_);
-    if (file_map_.find(fname) == file_map_.end()) {
-      *result = NULL;
-      return Status::IOError(fname, "File not found");
-    }
-
-    result->reset(new SequentialFileImpl(file_map_[fname]));
-    return Status::OK();
-  }
-
-  virtual Status NewRandomAccessFile(const std::string& fname,
-                                     unique_ptr<RandomAccessFile>* result,
-                                     const EnvOptions& soptions) {
-    MutexLock lock(&mutex_);
-    if (file_map_.find(fname) == file_map_.end()) {
-      *result = NULL;
-      return Status::IOError(fname, "File not found");
-    }
-
-    result->reset(new RandomAccessFileImpl(file_map_[fname]));
-    return Status::OK();
-  }
-
-  virtual Status NewWritableFile(const std::string& fname,
-                                 unique_ptr<WritableFile>* result,
-                                 const EnvOptions& soptions) {
-    MutexLock lock(&mutex_);
-    if (file_map_.find(fname) != file_map_.end()) {
-      DeleteFileInternal(fname);
-    }
-
-    FileState* file = new FileState();
-    file->Ref();
-    file_map_[fname] = file;
-
-    result->reset(new WritableFileImpl(file));
-    return Status::OK();
-  }
-
-  virtual Status NewDirectory(const std::string& name,
-                              unique_ptr<Directory>* result) {
-    result->reset(new InMemoryDirectory());
-    return Status::OK();
-  }
-
-  virtual bool FileExists(const std::string& fname) {
-    MutexLock lock(&mutex_);
-    return file_map_.find(fname) != file_map_.end();
-  }
-
-  virtual Status GetChildren(const std::string& dir,
-                             std::vector<std::string>* result) {
-    MutexLock lock(&mutex_);
-    result->clear();
-
-    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
-      const std::string& filename = i->first;
-
-      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
-          Slice(filename).starts_with(Slice(dir))) {
-        result->push_back(filename.substr(dir.size() + 1));
-      }
-    }
-
-    return Status::OK();
-  }
-
-  void DeleteFileInternal(const std::string& fname) {
-    if (file_map_.find(fname) == file_map_.end()) {
-      return;
-    }
-
-    file_map_[fname]->Unref();
-    file_map_.erase(fname);
-  }
-
-  virtual Status DeleteFile(const std::string& fname) {
-    MutexLock lock(&mutex_);
-    if (file_map_.find(fname) == file_map_.end()) {
-      return Status::IOError(fname, "File not found");
-    }
-
-    DeleteFileInternal(fname);
-    return Status::OK();
-  }
-
-  virtual Status CreateDir(const std::string& dirname) {
-    return Status::OK();
-  }
-
-  virtual Status CreateDirIfMissing(const std::string& dirname) {
-    return Status::OK();
-  }
-
-  virtual Status DeleteDir(const std::string& dirname) {
-    return Status::OK();
-  }
-
-  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
-    MutexLock lock(&mutex_);
-    if (file_map_.find(fname) == file_map_.end()) {
-      return Status::IOError(fname, "File not found");
-    }
-
-    *file_size = file_map_[fname]->Size();
-    return Status::OK();
-  }
-
-  virtual Status GetFileModificationTime(const std::string& fname,
-                                         uint64_t* time) {
-    return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
-  }
-
-  virtual Status RenameFile(const std::string& src,
-                            const std::string& target) {
-    MutexLock lock(&mutex_);
-    if (file_map_.find(src) == file_map_.end()) {
-      return Status::IOError(src, "File not found");
-    }
-
-    DeleteFileInternal(target);
-    file_map_[target] = file_map_[src];
-    file_map_.erase(src);
-    return Status::OK();
-  }
-
-  virtual Status LockFile(const std::string& fname, FileLock** lock) {
-    *lock = new FileLock;
-    return Status::OK();
-  }
-
-  virtual Status UnlockFile(FileLock* lock) {
-    delete lock;
-    return Status::OK();
-  }
-
-  virtual Status GetTestDirectory(std::string* path) {
-    *path = "/test";
-    return Status::OK();
-  }
-
- private:
-  // Map from filenames to FileState objects, representing a simple file system.
-  typedef std::map<std::string, FileState*> FileSystem;
-  port::Mutex mutex_;
-  FileSystem file_map_;  // Protected by mutex_.
-};
-
-}  // namespace
-
-Env* NewMemEnv(Env* base_env) {
-  return new InMemoryEnv(base_env);
-}
-
-}  // namespace rocksdb
diff --git a/src/rocksdb/helpers/memenv/memenv_test.cc b/src/rocksdb/helpers/memenv/memenv_test.cc
deleted file mode 100644
index ea3ed61..0000000
--- a/src/rocksdb/helpers/memenv/memenv_test.cc
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "db/db_impl.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
-#include "util/testharness.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace rocksdb {
-
-class MemEnvTest {
- public:
-  Env* env_;
-  const EnvOptions soptions_;
-
-  MemEnvTest()
-      : env_(NewMemEnv(Env::Default())) {
-  }
-  ~MemEnvTest() {
-    delete env_;
-  }
-};
-
-TEST(MemEnvTest, Basics) {
-  uint64_t file_size;
-  unique_ptr<WritableFile> writable_file;
-  std::vector<std::string> children;
-
-  ASSERT_OK(env_->CreateDir("/dir"));
-
-  // Check that the directory is empty.
-  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
-  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
-  ASSERT_OK(env_->GetChildren("/dir", &children));
-  ASSERT_EQ(0U, children.size());
-
-  // Create a file.
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
-  writable_file.reset();
-
-  // Check that the file exists.
-  ASSERT_TRUE(env_->FileExists("/dir/f"));
-  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
-  ASSERT_EQ(0U, file_size);
-  ASSERT_OK(env_->GetChildren("/dir", &children));
-  ASSERT_EQ(1U, children.size());
-  ASSERT_EQ("f", children[0]);
-
-  // Write to the file.
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
-  ASSERT_OK(writable_file->Append("abc"));
-  writable_file.reset();
-
-  // Check for expected size.
-  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
-  ASSERT_EQ(3U, file_size);
-
-  // Check that renaming works.
-  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
-  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/f"));
-  ASSERT_TRUE(env_->FileExists("/dir/g"));
-  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
-  ASSERT_EQ(3U, file_size);
-
-  // Check that opening non-existent file fails.
-  unique_ptr<SequentialFile> seq_file;
-  unique_ptr<RandomAccessFile> rand_file;
-  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
-                                       soptions_).ok());
-  ASSERT_TRUE(!seq_file);
-  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
-                                         soptions_).ok());
-  ASSERT_TRUE(!rand_file);
-
-  // Check that deleting works.
-  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
-  ASSERT_OK(env_->DeleteFile("/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/g"));
-  ASSERT_OK(env_->GetChildren("/dir", &children));
-  ASSERT_EQ(0U, children.size());
-  ASSERT_OK(env_->DeleteDir("/dir"));
-}
-
-TEST(MemEnvTest, ReadWrite) {
-  unique_ptr<WritableFile> writable_file;
-  unique_ptr<SequentialFile> seq_file;
-  unique_ptr<RandomAccessFile> rand_file;
-  Slice result;
-  char scratch[100];
-
-  ASSERT_OK(env_->CreateDir("/dir"));
-
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
-  ASSERT_OK(writable_file->Append("hello "));
-  ASSERT_OK(writable_file->Append("world"));
-  writable_file.reset();
-
-  // Read sequentially.
-  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
-  ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
-  ASSERT_EQ(0, result.compare("hello"));
-  ASSERT_OK(seq_file->Skip(1));
-  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
-  ASSERT_EQ(0, result.compare("world"));
-  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
-  ASSERT_EQ(0U, result.size());
-  ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
-  ASSERT_OK(seq_file->Read(1000, &result, scratch));
-  ASSERT_EQ(0U, result.size());
-
-  // Random reads.
-  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
-  ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
-  ASSERT_EQ(0, result.compare("world"));
-  ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
-  ASSERT_EQ(0, result.compare("hello"));
-  ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
-  ASSERT_EQ(0, result.compare("d"));
-
-  // Too high offset.
-  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
-}
-
-TEST(MemEnvTest, Locks) {
-  FileLock* lock;
-
-  // These are no-ops, but we test they return success.
-  ASSERT_OK(env_->LockFile("some file", &lock));
-  ASSERT_OK(env_->UnlockFile(lock));
-}
-
-TEST(MemEnvTest, Misc) {
-  std::string test_dir;
-  ASSERT_OK(env_->GetTestDirectory(&test_dir));
-  ASSERT_TRUE(!test_dir.empty());
-
-  unique_ptr<WritableFile> writable_file;
-  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
-
-  // These are no-ops, but we test they return success.
-  ASSERT_OK(writable_file->Sync());
-  ASSERT_OK(writable_file->Flush());
-  ASSERT_OK(writable_file->Close());
-  writable_file.reset();
-}
-
-TEST(MemEnvTest, LargeWrite) {
-  const size_t kWriteSize = 300 * 1024;
-  char* scratch = new char[kWriteSize * 2];
-
-  std::string write_data;
-  for (size_t i = 0; i < kWriteSize; ++i) {
-    write_data.append(1, static_cast<char>(i));
-  }
-
-  unique_ptr<WritableFile> writable_file;
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
-  ASSERT_OK(writable_file->Append("foo"));
-  ASSERT_OK(writable_file->Append(write_data));
-  writable_file.reset();
-
-  unique_ptr<SequentialFile> seq_file;
-  Slice result;
-  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
-  ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
-  ASSERT_EQ(0, result.compare("foo"));
-
-  size_t read = 0;
-  std::string read_data;
-  while (read < kWriteSize) {
-    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
-    read_data.append(result.data(), result.size());
-    read += result.size();
-  }
-  ASSERT_TRUE(write_data == read_data);
-  delete [] scratch;
-}
-
-TEST(MemEnvTest, DBTest) {
-  Options options;
-  options.create_if_missing = true;
-  options.env = env_;
-  DB* db;
-
-  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
-  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
-
-  ASSERT_OK(DB::Open(options, "/dir/db", &db));
-  for (size_t i = 0; i < 3; ++i) {
-    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
-  }
-
-  for (size_t i = 0; i < 3; ++i) {
-    std::string res;
-    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
-    ASSERT_TRUE(res == vals[i]);
-  }
-
-  Iterator* iterator = db->NewIterator(ReadOptions());
-  iterator->SeekToFirst();
-  for (size_t i = 0; i < 3; ++i) {
-    ASSERT_TRUE(iterator->Valid());
-    ASSERT_TRUE(keys[i] == iterator->key());
-    ASSERT_TRUE(vals[i] == iterator->value());
-    iterator->Next();
-  }
-  ASSERT_TRUE(!iterator->Valid());
-  delete iterator;
-
-  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
-
-  for (size_t i = 0; i < 3; ++i) {
-    std::string res;
-    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
-    ASSERT_TRUE(res == vals[i]);
-  }
-
-  delete db;
-}
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
-}
diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h
index 013ee5d..9b92068 100644
--- a/src/rocksdb/include/rocksdb/c.h
+++ b/src/rocksdb/include/rocksdb/c.h
@@ -6,8 +6,8 @@
   Use of this source code is governed by a BSD-style license that can be
   found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-  C bindings for leveldb.  May be useful as a stable ABI that can be
-  used by programs that keep leveldb in a shared library, or for
+  C bindings for rocksdb.  May be useful as a stable ABI that can be
+  used by programs that keep rocksdb in a shared library, or for
   a JNI api.
 
   Does not support:
@@ -55,9 +55,22 @@ extern "C" {
 /* Exported types */
 
 typedef struct rocksdb_t                 rocksdb_t;
+typedef struct rocksdb_backup_engine_t   rocksdb_backup_engine_t;
+typedef struct rocksdb_backup_engine_info_t   rocksdb_backup_engine_info_t;
+typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
 typedef struct rocksdb_cache_t           rocksdb_cache_t;
+typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
+typedef struct rocksdb_compactionfiltercontext_t
+    rocksdb_compactionfiltercontext_t;
+typedef struct rocksdb_compactionfilterfactory_t
+    rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_compactionfilterv2_t
+    rocksdb_compactionfilterv2_t;
+typedef struct rocksdb_compactionfilterfactoryv2_t
+    rocksdb_compactionfilterfactoryv2_t;
 typedef struct rocksdb_comparator_t      rocksdb_comparator_t;
 typedef struct rocksdb_env_t             rocksdb_env_t;
+typedef struct rocksdb_fifo_compaction_options_t rocksdb_fifo_compaction_options_t;
 typedef struct rocksdb_filelock_t        rocksdb_filelock_t;
 typedef struct rocksdb_filterpolicy_t    rocksdb_filterpolicy_t;
 typedef struct rocksdb_flushoptions_t    rocksdb_flushoptions_t;
@@ -65,6 +78,10 @@ typedef struct rocksdb_iterator_t        rocksdb_iterator_t;
 typedef struct rocksdb_logger_t          rocksdb_logger_t;
 typedef struct rocksdb_mergeoperator_t   rocksdb_mergeoperator_t;
 typedef struct rocksdb_options_t         rocksdb_options_t;
+typedef struct rocksdb_block_based_table_options_t
+    rocksdb_block_based_table_options_t;
+typedef struct rocksdb_cuckoo_table_options_t
+    rocksdb_cuckoo_table_options_t;
 typedef struct rocksdb_randomfile_t      rocksdb_randomfile_t;
 typedef struct rocksdb_readoptions_t     rocksdb_readoptions_t;
 typedef struct rocksdb_seqfile_t         rocksdb_seqfile_t;
@@ -75,6 +92,7 @@ typedef struct rocksdb_writebatch_t      rocksdb_writebatch_t;
 typedef struct rocksdb_writeoptions_t    rocksdb_writeoptions_t;
 typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
 typedef struct rocksdb_livefiles_t     rocksdb_livefiles_t;
+typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
 
 /* DB operations */
 
@@ -89,6 +107,95 @@ extern rocksdb_t* rocksdb_open_for_read_only(
     unsigned char error_if_log_file_exist,
     char** errptr);
 
+extern rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+    const rocksdb_options_t* options,
+    const char* path,
+    char** errptr);
+
+extern void rocksdb_backup_engine_create_new_backup(
+    rocksdb_backup_engine_t* be,
+    rocksdb_t* db,
+    char** errptr);
+
+extern rocksdb_restore_options_t* rocksdb_restore_options_create();
+extern void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt);
+extern void rocksdb_restore_options_set_keep_log_files(
+    rocksdb_restore_options_t* opt, int v);
+
+extern void rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t *be,
+    const char* db_dir,
+    const char* wal_dir,
+    const rocksdb_restore_options_t *restore_options,
+    char** errptr);
+
+extern const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+    rocksdb_backup_engine_t* be);
+
+extern int rocksdb_backup_engine_info_count(
+    const rocksdb_backup_engine_info_t* info);
+
+extern int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info,
+    int index);
+
+extern uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info,
+    int index);
+
+extern uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info,
+    int index);
+
+extern uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info,
+    int index);
+
+extern void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t *info);
+
+extern void rocksdb_backup_engine_close(
+    rocksdb_backup_engine_t* be);
+
+extern rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* options,
+    const char* name,
+    int num_column_families,
+    const char** column_family_names,
+    const rocksdb_options_t** column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    char** errptr);
+
+extern rocksdb_t* rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* options,
+    const char* name,
+    int num_column_families,
+    const char** column_family_names,
+    const rocksdb_options_t** column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    unsigned char error_if_log_file_exist,
+    char** errptr);
+
+char** rocksdb_list_column_families(
+    const rocksdb_options_t* options,
+    const char* name,
+    size_t* lencf,
+    char** errptr);
+void rocksdb_list_column_families_destroy(char** list, size_t len);
+
+extern rocksdb_column_family_handle_t* rocksdb_create_column_family(
+    rocksdb_t* db,
+    const rocksdb_options_t* column_family_options,
+    const char* column_family_name,
+    char** errptr);
+
+extern void rocksdb_drop_column_family(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* handle,
+    char** errptr);
+
+extern void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t*);
+
 extern void rocksdb_close(rocksdb_t* db);
 
 extern void rocksdb_put(
@@ -98,12 +205,27 @@ extern void rocksdb_put(
     const char* val, size_t vallen,
     char** errptr);
 
+extern void rocksdb_put_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
 extern void rocksdb_delete(
     rocksdb_t* db,
     const rocksdb_writeoptions_t* options,
     const char* key, size_t keylen,
     char** errptr);
 
+void rocksdb_delete_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    char** errptr);
+
 extern void rocksdb_merge(
     rocksdb_t* db,
     const rocksdb_writeoptions_t* options,
@@ -111,6 +233,14 @@ extern void rocksdb_merge(
     const char* val, size_t vallen,
     char** errptr);
 
+extern void rocksdb_merge_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
 extern void rocksdb_write(
     rocksdb_t* db,
     const rocksdb_writeoptions_t* options,
@@ -126,10 +256,23 @@ extern char* rocksdb_get(
     size_t* vallen,
     char** errptr);
 
+extern char* rocksdb_get_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr);
+
 extern rocksdb_iterator_t* rocksdb_create_iterator(
     rocksdb_t* db,
     const rocksdb_readoptions_t* options);
 
+extern rocksdb_iterator_t* rocksdb_create_iterator_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family);
+
 extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
     rocksdb_t* db);
 
@@ -143,6 +286,11 @@ extern char* rocksdb_property_value(
     rocksdb_t* db,
     const char* propname);
 
+extern char* rocksdb_property_value_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* propname);
+
 extern void rocksdb_approximate_sizes(
     rocksdb_t* db,
     int num_ranges,
@@ -150,11 +298,25 @@ extern void rocksdb_approximate_sizes(
     const char* const* range_limit_key, const size_t* range_limit_key_len,
     uint64_t* sizes);
 
+extern void rocksdb_approximate_sizes_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes);
+
 extern void rocksdb_compact_range(
     rocksdb_t* db,
     const char* start_key, size_t start_key_len,
     const char* limit_key, size_t limit_key_len);
 
+extern void rocksdb_compact_range_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
 extern void rocksdb_delete_file(
     rocksdb_t* db,
     const char* name);
@@ -204,6 +366,8 @@ extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
 /* Write batch */
 
 extern rocksdb_writebatch_t* rocksdb_writebatch_create();
+extern rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+                                                            size_t size);
 extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
 extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
 extern int rocksdb_writebatch_count(rocksdb_writebatch_t*);
@@ -211,13 +375,27 @@ extern void rocksdb_writebatch_put(
     rocksdb_writebatch_t*,
     const char* key, size_t klen,
     const char* val, size_t vlen);
+extern void rocksdb_writebatch_put_cf(
+    rocksdb_writebatch_t*,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
 extern void rocksdb_writebatch_merge(
     rocksdb_writebatch_t*,
     const char* key, size_t klen,
     const char* val, size_t vlen);
+extern void rocksdb_writebatch_merge_cf(
+    rocksdb_writebatch_t*,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
 extern void rocksdb_writebatch_delete(
     rocksdb_writebatch_t*,
     const char* key, size_t klen);
+extern void rocksdb_writebatch_delete_cf(
+    rocksdb_writebatch_t*,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen);
 extern void rocksdb_writebatch_iterate(
     rocksdb_writebatch_t*,
     void* state,
@@ -225,24 +403,87 @@ extern void rocksdb_writebatch_iterate(
     void (*deleted)(void*, const char* k, size_t klen));
 extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
 
+/* Block based table options */
+
+extern rocksdb_block_based_table_options_t*
+    rocksdb_block_based_options_create();
+extern void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options);
+extern void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size);
+extern void rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation);
+extern void rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval);
+extern void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy);
+extern void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char no_block_cache);
+extern void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache);
+extern void rocksdb_block_based_options_set_block_cache_compressed(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache_compressed);
+extern void rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options);
+
+/* Cuckoo table options */
+
+extern rocksdb_cuckoo_table_options_t*
+    rocksdb_cuckoo_options_create();
+extern void rocksdb_cuckoo_options_destroy(
+    rocksdb_cuckoo_table_options_t* options);
+extern void rocksdb_cuckoo_options_set_hash_ratio(
+    rocksdb_cuckoo_table_options_t* options, double v);
+extern void rocksdb_cuckoo_options_set_max_search_depth(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern void rocksdb_cuckoo_options_set_cuckoo_block_size(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern void rocksdb_cuckoo_options_set_identity_as_first_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern void rocksdb_cuckoo_options_set_use_module_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t *opt, rocksdb_cuckoo_table_options_t* table_options);
+
 /* Options */
 
 extern rocksdb_options_t* rocksdb_options_create();
 extern void rocksdb_options_destroy(rocksdb_options_t*);
+extern void rocksdb_options_increase_parallelism(
+    rocksdb_options_t* opt, int total_threads);
+extern void rocksdb_options_optimize_for_point_lookup(
+    rocksdb_options_t* opt, uint64_t block_cache_size_mb);
+extern void rocksdb_options_optimize_level_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern void rocksdb_options_optimize_universal_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern void rocksdb_options_set_compaction_filter(
+    rocksdb_options_t*,
+    rocksdb_compactionfilter_t*);
+extern void rocksdb_options_set_compaction_filter_factory(
+    rocksdb_options_t*, rocksdb_compactionfilterfactory_t*);
+extern void rocksdb_options_set_compaction_filter_factory_v2(
+    rocksdb_options_t*,
+    rocksdb_compactionfilterfactoryv2_t*);
 extern void rocksdb_options_set_comparator(
     rocksdb_options_t*,
     rocksdb_comparator_t*);
-extern void rocksdb_options_set_merge_operator(rocksdb_options_t*,
-                                               rocksdb_mergeoperator_t*);
+extern void rocksdb_options_set_merge_operator(
+    rocksdb_options_t*,
+    rocksdb_mergeoperator_t*);
 extern void rocksdb_options_set_compression_per_level(
   rocksdb_options_t* opt,
   int* level_values,
   size_t num_levels);
-extern void rocksdb_options_set_filter_policy(
-    rocksdb_options_t*,
-    rocksdb_filterpolicy_t*);
 extern void rocksdb_options_set_create_if_missing(
     rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_create_missing_column_families(
+    rocksdb_options_t*, unsigned char);
 extern void rocksdb_options_set_error_if_exists(
     rocksdb_options_t*, unsigned char);
 extern void rocksdb_options_set_paranoid_checks(
@@ -252,13 +493,9 @@ extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
 extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
 extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
 extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
-extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
-extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
-extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n);
 extern void rocksdb_options_set_compression_options(
     rocksdb_options_t*, int, int, int);
-extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
 extern void rocksdb_options_set_prefix_extractor(
     rocksdb_options_t*, rocksdb_slicetransform_t*);
 extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
@@ -309,8 +546,6 @@ extern void rocksdb_options_set_arena_block_size(
     rocksdb_options_t*, size_t);
 extern void rocksdb_options_set_use_fsync(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_db_stats_log_interval(
-    rocksdb_options_t*, int);
 extern void rocksdb_options_set_db_log_dir(
     rocksdb_options_t*, const char*);
 extern void rocksdb_options_set_wal_dir(
@@ -353,7 +588,6 @@ extern void rocksdb_options_set_max_sequential_skip_in_iterations(
     rocksdb_options_t*, uint64_t);
 extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
 extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
-extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
 extern void rocksdb_options_set_delete_obsolete_files_period_micros(
     rocksdb_options_t*, uint64_t);
 extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
@@ -363,9 +597,6 @@ extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, i
 extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
 extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
 
-extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
-extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
-
 extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level);
 
 extern void rocksdb_options_set_memtable_prefix_bloom_bits(
@@ -378,8 +609,6 @@ extern void rocksdb_options_set_min_partial_merge_operands(
     rocksdb_options_t*, uint32_t);
 extern void rocksdb_options_set_bloom_locality(
     rocksdb_options_t*, uint32_t);
-extern void rocksdb_options_set_allow_thread_local(
-    rocksdb_options_t*, unsigned char);
 extern void rocksdb_options_set_inplace_update_support(
     rocksdb_options_t*, unsigned char);
 extern void rocksdb_options_set_inplace_update_num_locks(
@@ -397,10 +626,77 @@ extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
 
 enum {
   rocksdb_level_compaction = 0,
-  rocksdb_universal_compaction = 1
+  rocksdb_universal_compaction = 1,
+  rocksdb_fifo_compaction = 2
 };
 extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
 extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+extern void rocksdb_options_set_fifo_compaction_options(rocksdb_options_t* opt,
+    rocksdb_fifo_compaction_options_t* fifo);
+
+/* Compaction Filter */
+
+extern rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
+    void* state,
+    void (*destructor)(void*),
+    unsigned char (*filter)(
+        void*,
+        int level,
+        const char* key, size_t key_length,
+        const char* existing_value, size_t value_length,
+        char** new_value, size_t *new_value_length,
+        unsigned char* value_changed),
+    const char* (*name)(void*));
+extern void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t*);
+
+/* Compaction Filter Context */
+
+extern unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+    rocksdb_compactionfiltercontext_t* context);
+
+extern unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+    rocksdb_compactionfiltercontext_t* context);
+
+/* Compaction Filter Factory */
+
+extern rocksdb_compactionfilterfactory_t*
+    rocksdb_compactionfilterfactory_create(
+        void* state, void (*destructor)(void*),
+        rocksdb_compactionfilter_t* (*create_compaction_filter)(
+            void*, rocksdb_compactionfiltercontext_t* context),
+        const char* (*name)(void*));
+extern void rocksdb_compactionfilterfactory_destroy(
+    rocksdb_compactionfilterfactory_t*);
+
+/* Compaction Filter V2 */
+
+extern rocksdb_compactionfilterv2_t* rocksdb_compactionfilterv2_create(
+    void* state,
+    void (*destructor)(void*),
+    // num_keys specifies the number of array entries in every *list parameter.
+    // New values added to the new_values_list should be malloc'd and will be
+    // freed by the caller. Specify true in the to_delete_list to remove an
+    // entry during compaction; false to keep it.
+    void (*filter)(
+        void*, int level, size_t num_keys,
+        const char* const* keys_list, const size_t* keys_list_sizes,
+        const char* const* existing_values_list, const size_t* existing_values_list_sizes,
+        char** new_values_list, size_t* new_values_list_sizes,
+        unsigned char* to_delete_list),
+    const char* (*name)(void*));
+extern void rocksdb_compactionfilterv2_destroy(rocksdb_compactionfilterv2_t*);
+
+/* Compaction Filter Factory V2 */
+
+extern rocksdb_compactionfilterfactoryv2_t* rocksdb_compactionfilterfactoryv2_create(
+    void* state,
+    rocksdb_slicetransform_t* prefix_extractor,
+    void (*destructor)(void*),
+    rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2)(
+        void*, const rocksdb_compactionfiltercontext_t* context),
+    const char* (*name)(void*));
+extern void rocksdb_compactionfilterfactoryv2_destroy(rocksdb_compactionfilterfactoryv2_t*);
+
 /* Comparator */
 
 extern rocksdb_comparator_t* rocksdb_comparator_create(
@@ -472,6 +768,10 @@ extern void rocksdb_readoptions_set_fill_cache(
 extern void rocksdb_readoptions_set_snapshot(
     rocksdb_readoptions_t*,
     const rocksdb_snapshot_t*);
+extern void rocksdb_readoptions_set_iterate_upper_bound(
+    rocksdb_readoptions_t*,
+    const char* key,
+    size_t keylen);
 extern void rocksdb_readoptions_set_read_tier(
     rocksdb_readoptions_t*, int);
 extern void rocksdb_readoptions_set_tailing(
@@ -546,6 +846,12 @@ extern void rocksdb_universal_compaction_options_set_stop_style(
 extern void rocksdb_universal_compaction_options_destroy(
   rocksdb_universal_compaction_options_t*);
 
+extern rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create();
+extern void rocksdb_fifo_compaction_options_set_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
+extern void rocksdb_fifo_compaction_options_destroy(
+    rocksdb_fifo_compaction_options_t* fifo_opts);
+
 extern int rocksdb_livefiles_count(
   const rocksdb_livefiles_t*);
 extern const char* rocksdb_livefiles_name(
diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h
index 65d44b6..c5c7f01 100644
--- a/src/rocksdb/include/rocksdb/cache.h
+++ b/src/rocksdb/include/rocksdb/cache.h
@@ -34,20 +34,11 @@ class Cache;
 
 // Create a new cache with a fixed size capacity. The cache is sharded
 // to 2^numShardBits shards, by hash of the key. The total capacity
-// is divided and evenly assigned to each shard. Inside each shard,
-// the eviction is done in two passes: first try to free spaces by
-// evicting entries that are among the most least used removeScanCountLimit
-// entries and do not have reference other than by the cache itself, in
-// the least-used order. If not enough space is freed, further free the
-// entries in least used order.
+// is divided and evenly assigned to each shard.
 //
-// The functions without parameter numShardBits and/or removeScanCountLimit
-// use default values. removeScanCountLimit's default value is 0, which
-// means a strict LRU order inside each shard.
+// The functions without parameter numShardBits uses default value, which is 4
 extern shared_ptr<Cache> NewLRUCache(size_t capacity);
 extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
-extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
-                                     int removeScanCountLimit);
 
 class Cache {
  public:
@@ -101,6 +92,12 @@ class Cache {
   // its cache keys.
   virtual uint64_t NewId() = 0;
 
+  // sets the maximum configured capacity of the cache. When the new
+  // capacity is less than the old capacity and the existing usage is
+  // greater than new capacity, the implementation will do its best job to
+  // purge the released entries from the cache in order to lower the usage
+  virtual void SetCapacity(size_t capacity) = 0;
+
   // returns the maximum configured capacity of the cache
   virtual size_t GetCapacity() const = 0;
 
@@ -127,9 +124,6 @@ class Cache {
   void LRU_Append(Handle* e);
   void Unref(Handle* e);
 
-  struct Rep;
-  Rep* rep_;
-
   // No copying allowed
   Cache(const Cache&);
   void operator=(const Cache&);
diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h
index 59b0509..dce69d2 100644
--- a/src/rocksdb/include/rocksdb/compaction_filter.h
+++ b/src/rocksdb/include/rocksdb/compaction_filter.h
@@ -9,6 +9,7 @@
 #ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
 #define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -86,7 +87,7 @@ class CompactionFilterV2 {
   //
   // Each entry in the return vector indicates if the corresponding kv should
   // be preserved in the output of this compaction run. The application can
-  // inspect the exisitng values of the keys and make decision based on it.
+  // inspect the existing values of the keys and make decision based on it.
   //
   // When a value is to be preserved, the application has the option
   // to modify the entry in existing_values and pass it back through an entry
@@ -108,7 +109,7 @@ class CompactionFilterV2 {
 };
 
 // Each compaction will create a new CompactionFilter allowing the
-// application to know about different campactions
+// application to know about different compactions
 class CompactionFilterFactory {
  public:
   virtual ~CompactionFilterFactory() { }
@@ -120,7 +121,7 @@ class CompactionFilterFactory {
   virtual const char* Name() const = 0;
 };
 
-// Default implementaion of CompactionFilterFactory which does not
+// Default implementation of CompactionFilterFactory which does not
 // return any filter
 class DefaultCompactionFilterFactory : public CompactionFilterFactory {
  public:
@@ -175,7 +176,7 @@ class CompactionFilterFactoryV2 {
   const SliceTransform* prefix_extractor_;
 };
 
-// Default implementaion of CompactionFilterFactoryV2 which does not
+// Default implementation of CompactionFilterFactoryV2 which does not
 // return any filter
 class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
  public:
diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h
index f3a8499..5b7dc10 100644
--- a/src/rocksdb/include/rocksdb/comparator.h
+++ b/src/rocksdb/include/rocksdb/comparator.h
@@ -62,6 +62,10 @@ class Comparator {
 // must not be deleted.
 extern const Comparator* BytewiseComparator();
 
+// Return a builtin comparator that uses reverse lexicographic byte-wise
+// ordering.
+extern const Comparator* ReverseBytewiseComparator();
+
 }  // namespace rocksdb
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h
index e743b4c..e5b4838 100644
--- a/src/rocksdb/include/rocksdb/db.h
+++ b/src/rocksdb/include/rocksdb/db.h
@@ -15,19 +15,36 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
+#include "rocksdb/metadata.h"
 #include "rocksdb/version.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/types.h"
 #include "rocksdb/transaction_log.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/thread_status.h"
 
 namespace rocksdb {
 
+struct Options;
+struct DBOptions;
+struct ColumnFamilyOptions;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+struct CompactionOptions;
+struct TableProperties;
+class WriteBatch;
+class Env;
+class EventListener;
+
 using std::unique_ptr;
 
 class ColumnFamilyHandle {
  public:
   virtual ~ColumnFamilyHandle() {}
+  virtual const std::string& GetName() const = 0;
+  virtual uint32_t GetID() const = 0;
 };
 extern const std::string kDefaultColumnFamilyName;
 
@@ -44,30 +61,14 @@ struct ColumnFamilyDescriptor {
 static const int kMajorVersion = __ROCKSDB_MAJOR__;
 static const int kMinorVersion = __ROCKSDB_MINOR__;
 
-struct Options;
-struct ReadOptions;
-struct WriteOptions;
-struct FlushOptions;
-struct TableProperties;
-class WriteBatch;
-class Env;
-
-// Metadata associated with each SST file.
-struct LiveFileMetaData {
-  std::string column_family_name;  // Name of the column family
-  std::string name;                // Name of the file
-  int level;               // Level at which this file resides.
-  size_t size;             // File size in bytes.
-  std::string smallestkey; // Smallest user defined key in the file.
-  std::string largestkey;  // Largest user defined key in the file.
-  SequenceNumber smallest_seqno; // smallest seqno in file
-  SequenceNumber largest_seqno;  // largest seqno in file
-};
-
 // Abstract handle to particular state of a DB.
 // A Snapshot is an immutable object and can therefore be safely
 // accessed from multiple threads without any external synchronization.
 class Snapshot {
+ public:
+  // returns Snapshot's sequence number
+  virtual SequenceNumber GetSequenceNumber() const = 0;
+
  protected:
   virtual ~Snapshot();
 };
@@ -105,6 +106,9 @@ class DB {
   // that modify data, like put/delete, will return error.
   // If the db is opened in read only mode, then no compactions
   // will happen.
+  //
+  // Not supported in ROCKSDB_LITE, in which case the function will
+  // return Status::NotSupported.
   static Status OpenForReadOnly(const Options& options,
       const std::string& name, DB** dbptr,
       bool error_if_log_file_exist = false);
@@ -114,6 +118,9 @@ class DB {
   // database that should be opened. However, you always need to specify default
   // column family. The default column family name is 'default' and it's stored
   // in rocksdb::kDefaultColumnFamilyName
+  //
+  // Not supported in ROCKSDB_LITE, in which case the function will
+  // return Status::NotSupported.
   static Status OpenForReadOnly(
       const DBOptions& db_options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
@@ -122,7 +129,7 @@ class DB {
 
   // Open DB with column families.
   // db_options specify database specific options
-  // column_families is the vector of all column families in the databse,
+  // column_families is the vector of all column families in the database,
   // containing column family name and options. You need to open ALL column
   // families in the database. To get the list of column families, you can use
   // ListColumnFamilies(). Also, you can open only a subset of column families
@@ -159,6 +166,7 @@ class DB {
   virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
 
   // Set the database entry for "key" to "value".
+  // If "key" already exists, it will be overwritten.
   // Returns OK on success, and a non-OK status on error.
   // Note: consider setting options.sync = true.
   virtual Status Put(const WriteOptions& options,
@@ -193,6 +201,8 @@ class DB {
   }
 
   // Apply the specified updates to the database.
+  // If `updates` contains no update, WAL will still be synced if
+  // options.sync=true.
   // Returns OK on success, non-OK on failure.
   // Note: consider setting options.sync = true.
   virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
@@ -299,12 +309,85 @@ class DB {
   //     about the internal operation of the DB.
   //  "rocksdb.sstables" - returns a multi-line string that describes all
   //     of the sstables that make up the db contents.
+  //  "rocksdb.cfstats"
+  //  "rocksdb.dbstats"
+  //  "rocksdb.num-immutable-mem-table"
+  //  "rocksdb.mem-table-flush-pending"
+  //  "rocksdb.compaction-pending" - 1 if at least one compaction is pending
+  //  "rocksdb.background-errors" - accumulated number of background errors
+  //  "rocksdb.cur-size-active-mem-table"
+  //  "rocksdb.cur-size-all-mem-tables"
+  //  "rocksdb.num-entries-active-mem-table"
+  //  "rocksdb.num-entries-imm-mem-tables"
+  //  "rocksdb.num-deletes-active-mem-table"
+  //  "rocksdb.num-deletes-imm-mem-tables"
+  //  "rocksdb.estimate-num-keys" - estimated keys in the column family
+  //  "rocksdb.estimate-table-readers-mem" - estimated memory used for reding
+  //      SST tables, that is not counted as a part of block cache.
+  //  "rocksdb.is-file-deletions-enabled"
+  //  "rocksdb.num-snapshots"
+  //  "rocksdb.oldest-snapshot-time"
+  //  "rocksdb.num-live-versions" - `version` is an internal data structure.
+  //      See version_set.h for details. More live versions often mean more SST
+  //      files are held from being deleted, by iterators or unfinished
+  //      compactions.
+#ifndef ROCKSDB_LITE
+  struct Properties {
+    static const std::string kNumFilesAtLevelPrefix;
+    static const std::string kStats;
+    static const std::string kSSTables;
+    static const std::string kCFStats;
+    static const std::string kDBStats;
+    static const std::string kNumImmutableMemTable;
+    static const std::string kMemTableFlushPending;
+    static const std::string kCompactionPending;
+    static const std::string kBackgroundErrors;
+    static const std::string kCurSizeActiveMemTable;
+    static const std::string kCurSizeAllMemTables;
+    static const std::string kNumEntriesActiveMemTable;
+    static const std::string kNumEntriesImmMemTables;
+    static const std::string kNumDeletesActiveMemTable;
+    static const std::string kNumDeletesImmMemTables;
+    static const std::string kEstimateNumKeys;
+    static const std::string kEstimateTableReadersMem;
+    static const std::string kIsFileDeletionsEnabled;
+    static const std::string kNumSnapshots;
+    static const std::string kOldestSnapshotTime;
+    static const std::string kNumLiveVersions;
+  };
+#endif /* ROCKSDB_LITE */
+
   virtual bool GetProperty(ColumnFamilyHandle* column_family,
                            const Slice& property, std::string* value) = 0;
   virtual bool GetProperty(const Slice& property, std::string* value) {
     return GetProperty(DefaultColumnFamily(), property, value);
   }
 
+  // Similar to GetProperty(), but only works for a subset of properties whose
+  // return value is an integer. Return the value by integer. Supported
+  // properties:
+  //  "rocksdb.num-immutable-mem-table"
+  //  "rocksdb.mem-table-flush-pending"
+  //  "rocksdb.compaction-pending"
+  //  "rocksdb.background-errors"
+  //  "rocksdb.cur-size-active-mem-table"
+  //  "rocksdb.cur-size-all-mem-tables"
+  //  "rocksdb.num-entries-active-mem-table"
+  //  "rocksdb.num-entries-imm-mem-tables"
+  //  "rocksdb.num-deletes-active-mem-table"
+  //  "rocksdb.num-deletes-imm-mem-tables"
+  //  "rocksdb.estimate-num-keys"
+  //  "rocksdb.estimate-table-readers-mem"
+  //  "rocksdb.is-file-deletions-enabled"
+  //  "rocksdb.num-snapshots"
+  //  "rocksdb.oldest-snapshot-time"
+  //  "rocksdb.num-live-versions"
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) = 0;
+  virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
+    return GetIntProperty(DefaultColumnFamily(), property, value);
+  }
+
   // For each i in [0,n-1], store in "sizes[i]", the approximate
   // file system space used by keys in "[range[i].start .. range[i].limit)".
   //
@@ -337,17 +420,47 @@ class DB {
   // hosting all the files. In this case, client could set reduce_level
   // to true, to move the files back to the minimum level capable of holding
   // the data set or a given level (specified by non-negative target_level).
+  // Compaction outputs should be placed in options.db_paths[target_path_id].
+  // Behavior is undefined if target_path_id is out of range.
   virtual Status CompactRange(ColumnFamilyHandle* column_family,
                               const Slice* begin, const Slice* end,
-                              bool reduce_level = false,
-                              int target_level = -1) = 0;
+                              bool reduce_level = false, int target_level = -1,
+                              uint32_t target_path_id = 0) = 0;
   virtual Status CompactRange(const Slice* begin, const Slice* end,
-                              bool reduce_level = false,
-                              int target_level = -1) {
+                              bool reduce_level = false, int target_level = -1,
+                              uint32_t target_path_id = 0) {
     return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
-                        target_level);
+                        target_level, target_path_id);
+  }
+  virtual Status SetOptions(ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& new_options) {
+    return Status::NotSupported("Not implemented");
+  }
+  virtual Status SetOptions(
+      const std::unordered_map<std::string, std::string>& new_options) {
+    return SetOptions(DefaultColumnFamily(), new_options);
   }
 
+  // CompactFiles() inputs a list of files specified by file numbers
+  // and compacts them to the specified level.  Note that the behavior
+  // is different from CompactRange in that CompactFiles() will
+  // perform the compaction job using the CURRENT thread.
+  //
+  // @see GetDataBaseMetaData
+  // @see GetColumnFamilyMetaData
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) = 0;
+
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) {
+    return CompactFiles(compact_options, DefaultColumnFamily(),
+                        input_file_names, output_level, output_path_id);
+  }
   // Number of levels used for this DB.
   virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
   virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
@@ -372,13 +485,18 @@ class DB {
   // Get Env object from the DB
   virtual Env* GetEnv() const = 0;
 
-  // Get DB Options that we use
+  // Get DB Options that we use.  During the process of opening the
+  // column family, the options provided when calling DB::Open() or
+  // DB::CreateColumnFamily() will have been "sanitized" and transformed
+  // in an implementation-defined manner.
   virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
       const = 0;
   virtual const Options& GetOptions() const {
     return GetOptions(DefaultColumnFamily());
   }
 
+  virtual const DBOptions& GetDBOptions() const = 0;
+
   // Flush all mem-table data.
   virtual Status Flush(const FlushOptions& options,
                        ColumnFamilyHandle* column_family) = 0;
@@ -396,7 +514,7 @@ class DB {
   // times have the same effect as calling it once.
   virtual Status DisableFileDeletions() = 0;
 
-  // Allow compactions to delete obselete files.
+  // Allow compactions to delete obsolete files.
   // If force == true, the call to EnableFileDeletions() will guarantee that
   // file deletions are enabled after the call, even if DisableFileDeletions()
   // was called multiple times before.
@@ -409,8 +527,6 @@ class DB {
 
   // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
 
-  // THIS METHOD IS DEPRECATED. Use the GetLiveFilesMetaData to get more
-  // detailed information on the live files.
   // Retrieve the list of all files in the database. The files are
   // relative to the dbname and are not absolute paths. The valid size of the
   // manifest file is returned in manifest_file_size. The manifest file is an
@@ -454,6 +570,21 @@ class DB {
   // and end key
   virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {}
 
+  // Obtains the meta data of the specified column family of the DB.
+  // Status::NotFound() will be returned if the current DB does not have
+  // any column family match the specified name.
+  //
+  // If cf_name is not specified, then the metadata of the default
+  // column family will be returned.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) {}
+
+  // Get the metadata of the default column family.
+  void GetColumnFamilyMetaData(
+      ColumnFamilyMetaData* metadata) {
+    GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
+  }
 #endif  // ROCKSDB_LITE
 
   // Sets the globally unique ID created at database creation time by invoking
diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h
index 6a96351..2fb9242 100644
--- a/src/rocksdb/include/rocksdb/env.h
+++ b/src/rocksdb/include/rocksdb/env.h
@@ -20,9 +20,11 @@
 #include <cstdarg>
 #include <string>
 #include <memory>
+#include <limits>
 #include <vector>
 #include <stdint.h>
 #include "rocksdb/status.h"
+#include "rocksdb/thread_status.h"
 
 namespace rocksdb {
 
@@ -35,6 +37,8 @@ class WritableFile;
 class RandomRWFile;
 class Directory;
 struct DBOptions;
+class RateLimiter;
+class ThreadStatusUpdater;
 
 using std::unique_ptr;
 using std::shared_ptr;
@@ -74,11 +78,15 @@ struct EnvOptions {
   // write. By default, we set it to true for MANIFEST writes and false for
   // WAL writes
   bool fallocate_with_keep_size = true;
+
+  // If not nullptr, write rate limiting is enabled for flush and compaction
+  RateLimiter* rate_limiter = nullptr;
 };
 
 class Env {
  public:
-  Env() { }
+  Env() : thread_status_updater_(nullptr) {}
+
   virtual ~Env();
 
   // Return a default environment suitable for the current operating
@@ -173,6 +181,11 @@ class Env {
   virtual Status RenameFile(const std::string& src,
                             const std::string& target) = 0;
 
+  // Hard Link file src to target.
+  virtual Status LinkFile(const std::string& src, const std::string& target) {
+    return Status::NotSupported("LinkFile is not supported for this Env");
+  }
+
   // Lock the specified file.  Used to prevent concurrent access to
   // the same db by multiple processes.  On failure, stores nullptr in
   // *lock and returns non-OK.
@@ -194,8 +207,16 @@ class Env {
   // REQUIRES: lock has not already been unlocked.
   virtual Status UnlockFile(FileLock* lock) = 0;
 
+  // Priority for scheduling job in thread pool
   enum Priority { LOW, HIGH, TOTAL };
 
+  // Priority for requesting bytes in rate limiter scheduler
+  enum IOPriority {
+    IO_LOW = 0,
+    IO_HIGH = 1,
+    IO_TOTAL = 2
+  };
+
   // Arrange to run "(*function)(arg)" once in a background thread, in
   // the thread pool specified by pri. By default, jobs go to the 'LOW'
   // priority thread pool.
@@ -204,10 +225,12 @@ class Env {
   // added to the same Env may run concurrently in different threads.
   // I.e., the caller may not assume that background work items are
   // serialized.
-  virtual void Schedule(
-      void (*function)(void* arg),
-      void* arg,
-      Priority pri = LOW) = 0;
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW, void* tag = nullptr) = 0;
+
+  // Arrange to remove jobs for given arg from the queue_ if they are not
+  // already scheduled. Caller is expected to have exclusive lock on arg.
+  virtual int UnSchedule(void* arg, Priority pri) { return 0; }
 
   // Start a new thread, invoking "function(arg)" within the new thread.
   // When "function(arg)" returns, the thread will be destroyed.
@@ -260,6 +283,14 @@ class Env {
   // default number: 1
   virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
 
+  // Enlarge number of background worker threads of a specific thread pool
+  // for this environment if it is smaller than specified. 'LOW' is the default
+  // pool.
+  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0;
+
+  // Lower IO priority for threads from the specified pool.
+  virtual void LowerThreadPoolIOPriority(Priority pool = LOW) {}
+
   // Converts seconds-since-Jan-01-1970 to a printable string
   virtual std::string TimeToString(uint64_t time) = 0;
 
@@ -269,19 +300,42 @@ class Env {
   // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
   // the EnvOptions in the parameters, but is optimized for writing log files.
   // Default implementation returns the copy of the same object.
-  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                         const DBOptions& db_options) const;
   // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
   // of the EnvOptions in the parameters, but is optimized for writing manifest
   // files. Default implementation returns the copy of the same object.
   virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
       const;
 
+  // Returns the status of all threads that belong to the current Env.
+  virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list) {
+    return Status::NotSupported("Not supported.");
+  }
+
+  // Returns the pointer to ThreadStatusUpdater.  This function will be
+  // used in RocksDB internally to update thread status and supports
+  // GetThreadList().
+  virtual ThreadStatusUpdater* GetThreadStatusUpdater() const {
+    return thread_status_updater_;
+  }
+
+ protected:
+  // The pointer to an internal structure that will update the
+  // status of each thread.
+  ThreadStatusUpdater* thread_status_updater_;
+
  private:
   // No copying allowed
   Env(const Env&);
   void operator=(const Env&);
 };
 
+// The factory function to construct a ThreadStatusUpdater.  Any Env
+// that supports GetThreadList() feature should call this function in its
+// constructor to initialize thread_status_updater_.
+ThreadStatusUpdater* CreateThreadStatusUpdater();
+
 // A file abstraction for reading sequentially through a file
 class SequentialFile {
  public:
@@ -371,7 +425,10 @@ class RandomAccessFile {
 // at a time to the file.
 class WritableFile {
  public:
-  WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
+  WritableFile()
+    : last_preallocated_block_(0),
+      preallocation_block_size_(0),
+      io_priority_(Env::IO_TOTAL) {
   }
   virtual ~WritableFile();
 
@@ -391,6 +448,14 @@ class WritableFile {
   }
 
   /*
+   * Change the priority in rate limiter if rate limiting is enabled.
+   * If rate limiting is not enabled, this call has no effect.
+   */
+  virtual void SetIOPriority(Env::IOPriority pri) {
+    io_priority_ = pri;
+  }
+
+  /*
    * Get the size of valid data in the file.
    */
   virtual uint64_t GetFileSize() {
@@ -445,8 +510,8 @@ class WritableFile {
     if (new_last_preallocated_block > last_preallocated_block_) {
       size_t num_spanned_blocks =
         new_last_preallocated_block - last_preallocated_block_;
-      Allocate(block_size * last_preallocated_block_,
-               block_size * num_spanned_blocks);
+      Allocate(static_cast<off_t>(block_size * last_preallocated_block_),
+               static_cast<off_t>(block_size * num_spanned_blocks));
       last_preallocated_block_ = new_last_preallocated_block;
     }
   }
@@ -468,12 +533,17 @@ class WritableFile {
     return Status::OK();
   }
 
+  size_t preallocation_block_size() { return preallocation_block_size_; }
+
  private:
   size_t last_preallocated_block_;
   size_t preallocation_block_size_;
   // No copying allowed
   WritableFile(const WritableFile&);
   void operator=(const WritableFile&);
+
+ protected:
+  Env::IOPriority io_priority_;
 };
 
 // A file abstraction for random reading and writing.
@@ -546,11 +616,21 @@ enum InfoLogLevel : unsigned char {
 // An interface for writing log messages.
 class Logger {
  public:
-  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
+  size_t kDoNotSupportGetLogFileSize = std::numeric_limits<size_t>::max();
+
   explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
       : log_level_(log_level) {}
   virtual ~Logger();
 
+  // Write a header to the log file with the specified format
+  // It is recommended that you log all header information at the start of the
+  // application. But it is not enforced.
+  virtual void LogHeader(const char* format, va_list ap) {
+    // Default implementation does a simple INFO level log write.
+    // Please override as per the logger class requirement.
+    Logv(format, ap);
+  }
+
   // Write an entry to the log file with the specified format.
   virtual void Logv(const char* format, va_list ap) = 0;
 
@@ -558,7 +638,7 @@ class Logger {
   // and format.  Any log with level under the internal log level
   // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
   // printed.
-  void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
+  virtual void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
     static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
                                                 "ERROR", "FATAL"};
     if (log_level < log_level_) {
@@ -579,9 +659,7 @@ class Logger {
       Logv(new_format, ap);
     }
   }
-  virtual size_t GetLogFileSize() const {
-    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
-  }
+  virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
   // Flush to the OS buffers
   virtual void Flush() {}
   virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
@@ -614,6 +692,7 @@ extern void Log(const InfoLogLevel log_level,
                 const shared_ptr<Logger>& info_log, const char* format, ...);
 
 // a set of log functions with different log levels.
+extern void Header(const shared_ptr<Logger>& info_log, const char* format, ...);
 extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
 extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
 extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
@@ -641,6 +720,7 @@ extern void Log(Logger* info_log, const char* format, ...)
     ;
 
 // a set of log functions with different log levels.
+extern void Header(Logger* info_log, const char* format, ...);
 extern void Debug(Logger* info_log, const char* format, ...);
 extern void Info(Logger* info_log, const char* format, ...);
 extern void Warn(Logger* info_log, const char* format, ...);
@@ -669,94 +749,131 @@ class EnvWrapper : public Env {
   Env* target() const { return target_; }
 
   // The following text is boilerplate that forwards all methods to target()
-  Status NewSequentialFile(const std::string& f,
-                           unique_ptr<SequentialFile>* r,
-                           const EnvOptions& options) {
+  Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override {
     return target_->NewSequentialFile(f, r, options);
   }
   Status NewRandomAccessFile(const std::string& f,
                              unique_ptr<RandomAccessFile>* r,
-                             const EnvOptions& options) {
+                             const EnvOptions& options) override {
     return target_->NewRandomAccessFile(f, r, options);
   }
   Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
-                         const EnvOptions& options) {
+                         const EnvOptions& options) override {
     return target_->NewWritableFile(f, r, options);
   }
   Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
-                         const EnvOptions& options) {
+                         const EnvOptions& options) override {
     return target_->NewRandomRWFile(f, r, options);
   }
   virtual Status NewDirectory(const std::string& name,
-                              unique_ptr<Directory>* result) {
+                              unique_ptr<Directory>* result) override {
     return target_->NewDirectory(name, result);
   }
-  bool FileExists(const std::string& f) { return target_->FileExists(f); }
-  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+  bool FileExists(const std::string& f) override {
+    return target_->FileExists(f);
+  }
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
     return target_->GetChildren(dir, r);
   }
-  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
-  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
-  Status CreateDirIfMissing(const std::string& d) {
+  Status DeleteFile(const std::string& f) override {
+    return target_->DeleteFile(f);
+  }
+  Status CreateDir(const std::string& d) override {
+    return target_->CreateDir(d);
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
     return target_->CreateDirIfMissing(d);
   }
-  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
-  Status GetFileSize(const std::string& f, uint64_t* s) {
+  Status DeleteDir(const std::string& d) override {
+    return target_->DeleteDir(d);
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
     return target_->GetFileSize(f, s);
   }
 
   Status GetFileModificationTime(const std::string& fname,
-                                 uint64_t* file_mtime) {
+                                 uint64_t* file_mtime) override {
     return target_->GetFileModificationTime(fname, file_mtime);
   }
 
-  Status RenameFile(const std::string& s, const std::string& t) {
+  Status RenameFile(const std::string& s, const std::string& t) override {
     return target_->RenameFile(s, t);
   }
-  Status LockFile(const std::string& f, FileLock** l) {
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    return target_->LinkFile(s, t);
+  }
+
+  Status LockFile(const std::string& f, FileLock** l) override {
     return target_->LockFile(f, l);
   }
-  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
-  void Schedule(void (*f)(void*), void* a, Priority pri) {
-    return target_->Schedule(f, a, pri);
+
+  Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); }
+
+  void Schedule(void (*f)(void* arg), void* a, Priority pri,
+                void* tag = nullptr) override {
+    return target_->Schedule(f, a, pri, tag);
+  }
+
+  int UnSchedule(void* tag, Priority pri) override {
+    return target_->UnSchedule(tag, pri);
   }
-  void StartThread(void (*f)(void*), void* a) {
+
+  void StartThread(void (*f)(void*), void* a) override {
     return target_->StartThread(f, a);
   }
-  void WaitForJoin() { return target_->WaitForJoin(); }
-  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+  void WaitForJoin() override { return target_->WaitForJoin(); }
+  virtual unsigned int GetThreadPoolQueueLen(
+      Priority pri = LOW) const override {
     return target_->GetThreadPoolQueueLen(pri);
   }
-  virtual Status GetTestDirectory(std::string* path) {
+  virtual Status GetTestDirectory(std::string* path) override {
     return target_->GetTestDirectory(path);
   }
   virtual Status NewLogger(const std::string& fname,
-                           shared_ptr<Logger>* result) {
+                           shared_ptr<Logger>* result) override {
     return target_->NewLogger(fname, result);
   }
-  uint64_t NowMicros() {
-    return target_->NowMicros();
-  }
-  void SleepForMicroseconds(int micros) {
+  uint64_t NowMicros() override { return target_->NowMicros(); }
+  void SleepForMicroseconds(int micros) override {
     target_->SleepForMicroseconds(micros);
   }
-  Status GetHostName(char* name, uint64_t len) {
+  Status GetHostName(char* name, uint64_t len) override {
     return target_->GetHostName(name, len);
   }
-  Status GetCurrentTime(int64_t* unix_time) {
+  Status GetCurrentTime(int64_t* unix_time) override {
     return target_->GetCurrentTime(unix_time);
   }
   Status GetAbsolutePath(const std::string& db_path,
-      std::string* output_path) {
+                         std::string* output_path) override {
     return target_->GetAbsolutePath(db_path, output_path);
   }
-  void SetBackgroundThreads(int num, Priority pri) {
+  void SetBackgroundThreads(int num, Priority pri) override {
     return target_->SetBackgroundThreads(num, pri);
   }
-  std::string TimeToString(uint64_t time) {
+
+  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    return target_->IncBackgroundThreadsIfNeeded(num, pri);
+  }
+
+  void LowerThreadPoolIOPriority(Priority pool = LOW) override {
+    target_->LowerThreadPoolIOPriority(pool);
+  }
+
+  std::string TimeToString(uint64_t time) override {
     return target_->TimeToString(time);
   }
 
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+    return target_->GetThreadList(thread_list);
+  }
+
+  ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+    return target_->GetThreadStatusUpdater();
+  }
+
  private:
   Env* target_;
 };
diff --git a/src/rocksdb/include/rocksdb/experimental.h b/src/rocksdb/include/rocksdb/experimental.h
new file mode 100644
index 0000000..1d02e02
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/experimental.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+namespace experimental {
+
+// Supported only for Leveled compaction
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end);
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
+
+// Move all L0 files to target_level skipping compaction.
+// This operation succeeds only if the files in L0 have disjoint ranges; this
+// is guaranteed to happen, for instance, if keys are inserted in sorted
+// order. Furthermore, all levels between 1 and target_level must be empty.
+// If any of the above condition is violated, InvalidArgument will be
+// returned.
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family,
+                 int target_level = 1);
+
+}  // namespace experimental
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/filter_policy.h b/src/rocksdb/include/rocksdb/filter_policy.h
index fa44db4..90aefb3 100644
--- a/src/rocksdb/include/rocksdb/filter_policy.h
+++ b/src/rocksdb/include/rocksdb/filter_policy.h
@@ -21,11 +21,52 @@
 #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
 
 #include <string>
+#include <memory>
 
 namespace rocksdb {
 
 class Slice;
 
+// A class that takes a bunch of keys, then generates filter
+class FilterBitsBuilder {
+ public:
+  virtual ~FilterBitsBuilder() {}
+
+  // Add Key to filter, you could use any way to store the key.
+  // Such as: storing hashes or original keys
+  // Keys are in sorted order and duplicated keys are possible.
+  virtual void AddKey(const Slice& key) = 0;
+
+  // Generate the filter using the keys that are added
+  // The return value of this function would be the filter bits,
+  // The ownership of actual data is set to buf
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0;
+};
+
+// A class that checks if a key can be in filter
+// It should be initialized by Slice generated by BitsBuilder
+class FilterBitsReader {
+ public:
+  virtual ~FilterBitsReader() {}
+
+  // Check if the entry match the bits in filter
+  virtual bool MayMatch(const Slice& entry) = 0;
+};
+
+// We add a new format of filter block called full filter block
+// This new interface gives you more space of customization
+//
+// For the full filter block, you can plug in your version by implement
+// the FilterBitsBuilder and FilterBitsReader
+//
+// There are two sets of interface in FilterPolicy
+// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter
+// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for
+// full filter.
+// Set 1 MUST be implemented correctly, Set 2 is optional
+// RocksDB would first try using functions in Set 2. if they return nullptr,
+// it would use Set 1 instead.
+// You can choose filter type in NewBloomFilterPolicy
 class FilterPolicy {
  public:
   virtual ~FilterPolicy();
@@ -51,11 +92,28 @@ class FilterPolicy {
   // This method may return true or false if the key was not on the
   // list, but it should aim to return false with a high probability.
   virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+
+  // Get the FilterBitsBuilder, which is ONLY used for full filter block
+  // It contains interface to take individual key, then generate filter
+  virtual FilterBitsBuilder* GetFilterBitsBuilder() const {
+    return nullptr;
+  }
+
+  // Get the FilterBitsReader, which is ONLY used for full filter block
+  // It contains interface to tell if key can be in filter
+  // The input slice should NOT be deleted by FilterPolicy
+  virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const {
+    return nullptr;
+  }
 };
 
 // Return a new filter policy that uses a bloom filter with approximately
-// the specified number of bits per key.  A good value for bits_per_key
+// the specified number of bits per key.
+//
+// bits_per_key: bits per key in bloom filter. A good value for bits_per_key
 // is 10, which yields a filter with ~ 1% false positive rate.
+// use_block_based_builder: use block based filter rather than full fiter.
+// If you want to builder full filter, it needs to be set to false.
 //
 // Callers must delete the result after any database that is using the
 // result has been closed.
@@ -67,8 +125,8 @@ class FilterPolicy {
 // ignores trailing spaces, it would be incorrect to use a
 // FilterPolicy (like NewBloomFilterPolicy) that does not ignore
 // trailing spaces in keys.
-extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
-
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
+    bool use_block_based_builder = true);
 }
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
diff --git a/src/rocksdb/include/rocksdb/flush_block_policy.h b/src/rocksdb/include/rocksdb/flush_block_policy.h
index 8340ad6..939725c 100644
--- a/src/rocksdb/include/rocksdb/flush_block_policy.h
+++ b/src/rocksdb/include/rocksdb/flush_block_policy.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include <string>
+#include "rocksdb/table.h"
 
 namespace rocksdb {
 
@@ -37,7 +38,8 @@ class FlushBlockPolicyFactory {
   // Callers must delete the result after any database that is using the
   // result has been closed.
   virtual FlushBlockPolicy* NewFlushBlockPolicy(
-      const Options& options, const BlockBuilder& data_block_builder) const = 0;
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const = 0;
 
   virtual ~FlushBlockPolicyFactory() { }
 };
@@ -51,7 +53,7 @@ class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
   }
 
   virtual FlushBlockPolicy* NewFlushBlockPolicy(
-      const Options& options,
+      const BlockBasedTableOptions& table_options,
       const BlockBuilder& data_block_builder) const override;
 };
 
diff --git a/src/rocksdb/include/rocksdb/immutable_options.h b/src/rocksdb/include/rocksdb/immutable_options.h
new file mode 100644
index 0000000..1551d26
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/immutable_options.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a
+// subset of Options that should not be changed during the entire lifetime
+// of DB. You shouldn't need to access this data structure unless you are
+// implementing a new TableFactory. Raw pointers defined in this struct do
+// not have ownership to the data they point to. Options contains shared_ptr
+// to these data.
+struct ImmutableCFOptions {
+  explicit ImmutableCFOptions(const Options& options);
+
+  CompactionStyle compaction_style;
+
+  CompactionOptionsUniversal compaction_options_universal;
+  CompactionOptionsFIFO compaction_options_fifo;
+
+  const SliceTransform* prefix_extractor;
+
+  const Comparator* comparator;
+
+  MergeOperator* merge_operator;
+
+  const CompactionFilter* compaction_filter;
+
+  CompactionFilterFactory* compaction_filter_factory;
+
+  CompactionFilterFactoryV2* compaction_filter_factory_v2;
+
+  bool inplace_update_support;
+
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+
+  Logger* info_log;
+
+  Statistics* statistics;
+
+  InfoLogLevel info_log_level;
+
+  Env* env;
+
+  // Allow the OS to mmap file for reading sst tables. Default: false
+  bool allow_mmap_reads;
+
+  // Allow the OS to mmap file for writing. Default: false
+  bool allow_mmap_writes;
+
+  std::vector<DbPath> db_paths;
+
+  MemTableRepFactory* memtable_factory;
+
+  TableFactory* table_factory;
+
+  Options::TablePropertiesCollectorFactories
+    table_properties_collector_factories;
+
+  bool advise_random_on_open;
+
+  // This options is required by PlainTableReader. May need to move it
+  // to PlainTalbeOptions just like bloom_bits_per_key
+  uint32_t bloom_locality;
+
+  bool purge_redundant_kvs_while_flush;
+
+  uint32_t min_partial_merge_operands;
+
+  bool disable_data_sync;
+
+  bool use_fsync;
+
+  CompressionType compression;
+
+  std::vector<CompressionType> compression_per_level;
+
+  CompressionOptions compression_opts;
+
+  bool level_compaction_dynamic_level_bytes;
+
+  Options::AccessHint access_hint_on_compaction_start;
+
+  int num_levels;
+
+  bool optimize_filters_for_hits;
+
+#ifndef ROCKSDB_LITE
+  // A vector of EventListeners which call-back functions will be called
+  // when specific RocksDB event happens.
+  std::vector<std::shared_ptr<EventListener>> listeners;
+#endif  // ROCKSDB_LITE
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h
new file mode 100644
index 0000000..e06ee17
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iostats_context.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
+#define INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
+
+#include <stdint.h>
+#include <string>
+
+// A thread local context for gathering io-stats efficiently and transparently.
+namespace rocksdb {
+
+struct IOStatsContext {
+  // reset all io-stats counter to zero
+  void Reset();
+
+  std::string ToString() const;
+
+  // the thread pool id
+  uint64_t thread_pool_id;
+
+  // number of bytes that has been written.
+  uint64_t bytes_written;
+  // number of bytes that has been read.
+  uint64_t bytes_read;
+};
+
+#ifndef IOS_CROSS_COMPILE
+extern __thread IOStatsContext iostats_context;
+#endif  // IOS_CROSS_COMPILE
+
+}  // namespace rocksdb
+
+#endif  // INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
diff --git a/src/rocksdb/include/rocksdb/ldb_tool.h b/src/rocksdb/include/rocksdb/ldb_tool.h
index 46bacc8..1b1c64b 100644
--- a/src/rocksdb/include/rocksdb/ldb_tool.h
+++ b/src/rocksdb/include/rocksdb/ldb_tool.h
@@ -4,13 +4,32 @@
 // of patent rights can be found in the PATENTS file in the same directory.
 #ifndef ROCKSDB_LITE
 #pragma once
+#include <string>
 #include "rocksdb/options.h"
 
 namespace rocksdb {
 
+// An interface for converting a slice to a readable string
+class SliceFormatter {
+ public:
+  virtual ~SliceFormatter() {}
+  virtual std::string Format(const Slice& s) const = 0;
+};
+
+// Options for customizing ldb tool (beyond the DB Options)
+struct LDBOptions {
+  // Create LDBOptions with default values for all fields
+  LDBOptions();
+
+  // Key formatter that converts a slice to a readable string.
+  // Default: Slice::ToString()
+  std::shared_ptr<SliceFormatter> key_formatter;
+};
+
 class LDBTool {
  public:
-  void Run(int argc, char** argv, Options = Options());
+  void Run(int argc, char** argv, Options db_options= Options(),
+           const LDBOptions& ldb_options = LDBOptions());
 };
 
 } // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h
new file mode 100644
index 0000000..7f70d1c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/listener.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class DB;
+class Status;
+
+struct CompactionJobInfo {
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+  // the status indicating whether the compaction was successful or not.
+  Status status;
+  // the output level of the compaction.
+  int output_level;
+  // the names of the compaction input files.
+  std::vector<std::string> input_files;
+  // the names of the compaction output files.
+  std::vector<std::string> output_files;
+};
+
+// EventListener class contains a set of call-back functions that will
+// be called when specific RocksDB event happens such as flush.  It can
+// be used as a building block for developing custom features such as
+// stats-collector or external compaction algorithm.
+//
+// Note that call-back functions should not run for an extended period of
+// time before the function returns, otherwise RocksDB may be blocked.
+// For example, it is not suggested to do DB::CompactFiles() (as it may
+// run for a long while) or issue many of DB::Put() (as Put may be blocked
+// in certain cases) in the same thread in the EventListener callback.
+// However, doing DB::CompactFiles() and DB::Put() in another thread is
+// considered safe.
+//
+// [Threading] All EventListener callback will be called using the
+// actual thread that involves in that specific event.   For example, it
+// is the RocksDB background flush thread that does the actual flush to
+// call EventListener::OnFlushCompleted().
+//
+// [Locking] All EventListener callbacks are designed to be called without
+// the current thread holding any DB mutex. This is to prevent potential
+// deadlock and performance issue when using EventListener callback
+// in a complex way. However, all EventListener call-back functions
+// should not run for an extended period of time before the function
+// returns, otherwise RocksDB may be blocked. For example, it is not
+// suggested to do DB::CompactFiles() (as it may run for a long while)
+// or issue many of DB::Put() (as Put may be blocked in certain cases)
+// in the same thread in the EventListener callback. However, doing
+// DB::CompactFiles() and DB::Put() in a thread other than the
+// EventListener callback thread is considered safe.
+class EventListener {
+ public:
+  // A call-back function to RocksDB which will be called whenever a
+  // registered RocksDB flushes a file.  The default implementation is
+  // no-op.
+  //
+  // Note that the this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  //
+  // @param db a pointer to the rocksdb instance which just flushed
+  //     a memtable to disk.
+  // @param column_family_id the id of the flushed column family.
+  // @param file_path the path to the newly created file.
+  // @param triggered_writes_slowdown true when rocksdb is currently
+  //     slowing-down all writes to prevent creating too many Level 0
+  //     files as compaction seems not able to catch up the write request
+  //     speed.  This indicates that there're too many files in Level 0.
+  // @param triggered_writes_stop true when rocksdb is currently blocking
+  //     any writes to prevent creating more L0 files.  This indicates that
+  //     there're too many files in level 0.  Compactions should try to
+  //     compact L0 files down to lower levels as soon as possible.
+  virtual void OnFlushCompleted(
+      DB* db, const std::string& column_family_name,
+      const std::string& file_path,
+      bool triggered_writes_slowdown,
+      bool triggered_writes_stop) {}
+
+  // A call-back function for RocksDB which will be called whenever
+  // a registered RocksDB compacts a file. The default implementation
+  // is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  //
+  // @param db a pointer to the rocksdb instance which just compacted
+  //   a file.
+  // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
+  //  after this function is returned, and must be copied if it is needed
+  //  outside of this function.
+  virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) {}
+  virtual ~EventListener() {}
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h
index be15a60..c369e88 100644
--- a/src/rocksdb/include/rocksdb/memtablerep.h
+++ b/src/rocksdb/include/rocksdb/memtablerep.h
@@ -14,8 +14,8 @@
 //  (4) Items are never deleted.
 // The liberal use of assertions is encouraged to enforce (1).
 //
-// The factory will be passed an Arena object when a new MemTableRep is
-// requested. The API for this object is in rocksdb/arena.h.
+// The factory will be passed an MemTableAllocator object when a new MemTableRep
+// is requested.
 //
 // Users can implement their own memtable representations. We include three
 // types built in:
@@ -41,9 +41,11 @@
 namespace rocksdb {
 
 class Arena;
+class MemTableAllocator;
 class LookupKey;
 class Slice;
 class SliceTransform;
+class Logger;
 
 typedef void* KeyHandle;
 
@@ -64,7 +66,7 @@ class MemTableRep {
     virtual ~KeyComparator() { }
   };
 
-  explicit MemTableRep(Arena* arena) : arena_(arena) {}
+  explicit MemTableRep(MemTableAllocator* allocator) : allocator_(allocator) {}
 
   // Allocate a buf of len size for storing key. The idea is that a specific
   // memtable representation knows its underlying data structure better. By
@@ -82,7 +84,9 @@ class MemTableRep {
   virtual bool Contains(const char* key) const = 0;
 
   // Notify this table rep that it will no longer be added to. By default, does
-  // nothing.
+  // nothing.  After MarkReadOnly() is called, this table rep will not be
+  // written to (ie No more calls to Allocate(), Insert(), or any writes done
+  // directly to entries accessed through the iterator.)
   virtual void MarkReadOnly() { }
 
   // Look up key from the mem table, since the first key in the mem table whose
@@ -100,7 +104,7 @@ class MemTableRep {
                    bool (*callback_func)(void* arg, const char* entry));
 
   // Report an approximation of how much memory has been used other than memory
-  // that was allocated through the arena.
+  // that was allocated through the allocator.
   virtual size_t ApproximateMemoryUsage() = 0;
 
   virtual ~MemTableRep() { }
@@ -141,16 +145,21 @@ class MemTableRep {
   };
 
   // Return an iterator over the keys in this representation.
-  virtual Iterator* GetIterator() = 0;
-
-  // Return an iterator over at least the keys with the specified user key. The
-  // iterator may also allow access to other keys, but doesn't have to. Default:
-  // GetIterator().
-  virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); }
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
 
   // Return an iterator that has a special Seek semantics. The result of
   // a Seek might only include keys with the same prefix as the target key.
-  virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); }
+  // arena: If not null, the arena is used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+    return GetIterator(arena);
+  }
 
   // Return true if the current MemTableRep supports merge operator.
   // Default: true
@@ -165,7 +174,7 @@ class MemTableRep {
   // user key.
   virtual Slice UserKey(const char* key) const;
 
-  Arena* arena_;
+  MemTableAllocator* allocator_;
 };
 
 // This is the base class for all factories that are used by RocksDB to create
@@ -174,17 +183,31 @@ class MemTableRepFactory {
  public:
   virtual ~MemTableRepFactory() {}
   virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
-      Arena*, const SliceTransform*) = 0;
+                                         MemTableAllocator*,
+                                         const SliceTransform*,
+                                         Logger* logger) = 0;
   virtual const char* Name() const = 0;
 };
 
 // This uses a skip list to store keys. It is the default.
+//
+// Parameters:
+//   lookahead: If non-zero, each iterator's seek operation will start the
+//     search from the previously visited record (doing at most 'lookahead'
+//     steps). This is an optimization for the access pattern including many
+//     seeks with consecutive keys.
 class SkipListFactory : public MemTableRepFactory {
  public:
+  explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {}
+
   virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
-                                         Arena*,
-                                         const SliceTransform*) override;
+                                         MemTableAllocator*,
+                                         const SliceTransform*,
+                                         Logger* logger) override;
   virtual const char* Name() const override { return "SkipListFactory"; }
+
+ private:
+  const size_t lookahead_;
 };
 
 #ifndef ROCKSDB_LITE
@@ -201,9 +224,10 @@ class VectorRepFactory : public MemTableRepFactory {
 
  public:
   explicit VectorRepFactory(size_t count = 0) : count_(count) { }
-  virtual MemTableRep* CreateMemTableRep(
-      const MemTableRep::KeyComparator&, Arena*,
-      const SliceTransform*) override;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         MemTableAllocator*,
+                                         const SliceTransform*,
+                                         Logger* logger) override;
   virtual const char* Name() const override {
     return "VectorRepFactory";
   }
@@ -220,17 +244,27 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
     int32_t skiplist_branching_factor = 4
 );
 
-// The factory is to create memtables with a hashed linked list:
-// it contains a fixed array of buckets, each pointing to a sorted single
-// linked list (null if the bucket is empty).
+// The factory is to create memtables based on a hash table:
+// it contains a fixed array of buckets, each pointing to either a linked list
+// or a skip list if number of entries inside the bucket exceeds
+// threshold_use_skiplist.
 // @bucket_count: number of fixed array buckets
 // @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
 //                      Otherwise from huge page TLB. The user needs to reserve
 //                      huge pages for it to be allocated, like:
 //                          sysctl -w vm.nr_hugepages=20
 //                      See linux doc Documentation/vm/hugetlbpage.txt
+// @bucket_entries_logging_threshold: if number of entries in one bucket
+//                                    exceeds this number, log about it.
+// @if_log_bucket_dist_when_flash: if true, log distribution of number of
+//                                 entries when flushing.
+// @threshold_use_skiplist: a bucket switches to skip list if number of
+//                          entries exceed this parameter.
 extern MemTableRepFactory* NewHashLinkListRepFactory(
-    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0);
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
+    int bucket_entries_logging_threshold = 4096,
+    bool if_log_bucket_dist_when_flash = true,
+    uint32_t threshold_use_skiplist = 256);
 
 // This factory creates a cuckoo-hashing based mem-table representation.
 // Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h
new file mode 100644
index 0000000..e026fa9
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/metadata.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/types.h"
+
+#pragma once
+
+namespace rocksdb {
+struct ColumnFamilyMetaData;
+struct LevelMetaData;
+struct SstFileMetaData;
+
+// The metadata that describes a column family.
+struct ColumnFamilyMetaData {
+  ColumnFamilyMetaData() : size(0), name("") {}
+  ColumnFamilyMetaData(const std::string& _name, uint64_t _size,
+                       const std::vector<LevelMetaData>&& _levels) :
+      size(_size), name(_name), levels(_levels) {}
+
+  // The size of this column family in bytes, which is equal to the sum of
+  // the file size of its "levels".
+  uint64_t size;
+  // The number of files in this column family.
+  size_t file_count;
+  // The name of the column family.
+  std::string name;
+  // The metadata of all levels in this column family.
+  std::vector<LevelMetaData> levels;
+};
+
+// The metadata that describes a level.
+struct LevelMetaData {
+  LevelMetaData(int _level, uint64_t _size,
+                const std::vector<SstFileMetaData>&& _files) :
+      level(_level), size(_size),
+      files(_files) {}
+
+  // The level which this meta data describes.
+  const int level;
+  // The size of this level in bytes, which is equal to the sum of
+  // the file size of its "files".
+  const uint64_t size;
+  // The metadata of all sst files in this level.
+  const std::vector<SstFileMetaData> files;
+};
+
+// The metadata that describes a SST file.
+struct SstFileMetaData {
+  SstFileMetaData() {}
+  SstFileMetaData(const std::string& _file_name,
+                  const std::string& _path, uint64_t _size,
+                  SequenceNumber _smallest_seqno,
+                  SequenceNumber _largest_seqno,
+                  const std::string& _smallestkey,
+                  const std::string& _largestkey,
+                  bool _being_compacted) :
+    size(_size), name(_file_name),
+    db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno),
+    smallestkey(_smallestkey), largestkey(_largestkey),
+    being_compacted(_being_compacted) {}
+
+  // File size in bytes.
+  uint64_t size;
+  // The name of the file.
+  std::string name;
+  // The full path where the file locates.
+  std::string db_path;
+
+  SequenceNumber smallest_seqno;  // Smallest sequence number in file.
+  SequenceNumber largest_seqno;   // Largest sequence number in file.
+  std::string smallestkey;     // Smallest user defined key in the file.
+  std::string largestkey;      // Largest user defined key in the file.
+  bool being_compacted;  // true if the file is currently being compacted.
+};
+
+// The full set of metadata associated with each SST file.
+struct LiveFileMetaData : SstFileMetaData {
+  std::string column_family_name;  // Name of the column family
+  int level;               // Level at which this file resides.
+};
+
+
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h
index 93dbf0d..ea11b81 100644
--- a/src/rocksdb/include/rocksdb/options.h
+++ b/src/rocksdb/include/rocksdb/options.h
@@ -10,12 +10,16 @@
 #define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
 
 #include <stddef.h>
+#include <stdint.h>
 #include <string>
 #include <memory>
 #include <vector>
+#include <limits>
 #include <stdint.h>
+#include <unordered_map>
 
 #include "rocksdb/version.h"
+#include "rocksdb/listener.h"
 #include "rocksdb/universal_compaction.h"
 
 namespace rocksdb {
@@ -33,14 +37,13 @@ class MergeOperator;
 class Snapshot;
 class TableFactory;
 class MemTableRepFactory;
-class TablePropertiesCollector;
+class TablePropertiesCollectorFactory;
+class RateLimiter;
 class Slice;
 class SliceTransform;
 class Statistics;
 class InternalKeyComparator;
 
-using std::shared_ptr;
-
 // DB contents are stored in a set of blocks, each of which holds a
 // sequence of key,value pairs.  Each block may be compressed before
 // being stored in a file.  The following enum describes which
@@ -52,9 +55,34 @@ enum CompressionType : char {
   kBZip2Compression = 0x3, kLZ4Compression = 0x4, kLZ4HCCompression = 0x5
 };
 
+// returns true if RocksDB was correctly linked with compression library and
+// supports the compression type
+extern bool CompressionTypeSupported(CompressionType compression_type);
+// Returns a human-readable name of the compression type
+extern const char* CompressionTypeToString(CompressionType compression_type);
+
 enum CompactionStyle : char {
-  kCompactionStyleLevel = 0x0,     // level based compaction style
-  kCompactionStyleUniversal = 0x1  // Universal compaction style
+  // level based compaction style
+  kCompactionStyleLevel = 0x0,
+  // Universal compaction style
+  // Not supported in ROCKSDB_LITE.
+  kCompactionStyleUniversal = 0x1,
+  // FIFO compaction style
+  // Not supported in ROCKSDB_LITE
+  kCompactionStyleFIFO = 0x2,
+  // Disable background compaction. Compaction jobs are submitted
+  // via CompactFiles().
+  // Not supported in ROCKSDB_LITE
+  kCompactionStyleNone = 0x3,
+};
+
+struct CompactionOptionsFIFO {
+  // once the total sum of table files reaches this, we will delete the oldest
+  // table file
+  // Default: 1GB
+  uint64_t max_table_files_size;
+
+  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
 };
 
 // Compression options for different compression algorithms like Zlib
@@ -73,9 +101,47 @@ enum UpdateStatus {    // Return status For inplace update callback
   UPDATED         = 2, // No inplace update. Merged value set
 };
 
+struct DbPath {
+  std::string path;
+  uint64_t target_size;  // Target size of total files under the path, in byte.
+
+  DbPath() : target_size(0) {}
+  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
+};
+
 struct Options;
 
 struct ColumnFamilyOptions {
+  // Some functions that make it easier to optimize RocksDB
+
+  // Use this if you don't need to keep the data sorted, i.e. you'll never use
+  // an iterator, only Put() and Get() API calls
+  //
+  // Not supported in ROCKSDB_LITE
+  ColumnFamilyOptions* OptimizeForPointLookup(
+      uint64_t block_cache_size_mb);
+
+  // Default values for some parameters in ColumnFamilyOptions are not
+  // optimized for heavy workloads and big datasets, which means you might
+  // observe write stalls under some conditions. As a starting point for tuning
+  // RocksDB options, use the following two functions:
+  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+  // Universal style compaction is focused on reducing Write Amplification
+  // Factor for big data sets, but increases Space Amplification. You can learn
+  // more about the different styles here:
+  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+  // Make sure to also call IncreaseParallelism(), which will provide the
+  // biggest performance gains.
+  // Note: we might use more memory than memtable_memory_budget during high
+  // write rate period
+  //
+  // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
+  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+
   // -------------------
   // Parameters that affect behavior
 
@@ -97,7 +163,7 @@ struct ColumnFamilyOptions {
   // for the first time. It's necessary to specify a merge operator when
   // openning the DB in this case.
   // Default: nullptr
-  shared_ptr<MergeOperator> merge_operator;
+  std::shared_ptr<MergeOperator> merge_operator;
 
   // A single CompactionFilter instance to call into during compaction.
   // Allows an application to modify/delete a key-value during background
@@ -145,13 +211,22 @@ struct ColumnFamilyOptions {
   // Also, a larger write buffer will result in a longer recovery time
   // the next time the database is opened.
   //
+  // Note that write_buffer_size is enforced per column family.
+  // See db_write_buffer_size for sharing memory across column families.
+  //
   // Default: 4MB
+  //
+  // Dynamically changeable through SetOptions() API
   size_t write_buffer_size;
 
   // The maximum number of write buffers that are built up in memory.
-  // The default is 2, so that when 1 write buffer is being flushed to
-  // storage, new writes can continue to the other write buffer.
+  // The default and the minimum number is 2, so that when 1 write buffer
+  // is being flushed to storage, new writes can continue to the other
+  // write buffer.
+  //
   // Default: 2
+  //
+  // Dynamically changeable through SetOptions() API
   int max_write_buffer_number;
 
   // The minimum number of write buffers that will be merged together
@@ -163,34 +238,6 @@ struct ColumnFamilyOptions {
   // individual write buffers.  Default: 1
   int min_write_buffer_number_to_merge;
 
-  // Control over blocks (user data is stored in a set of blocks, and
-  // a block is the unit of reading from disk).
-
-  // If non-NULL use the specified cache for blocks.
-  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
-  // Default: nullptr
-  shared_ptr<Cache> block_cache;
-
-  // If non-NULL use the specified cache for compressed blocks.
-  // If NULL, rocksdb will not use a compressed block cache.
-  // Default: nullptr
-  shared_ptr<Cache> block_cache_compressed;
-
-  // Approximate size of user data packed per block.  Note that the
-  // block size specified here corresponds to uncompressed data.  The
-  // actual size of the unit read from disk may be smaller if
-  // compression is enabled.  This parameter can be changed dynamically.
-  //
-  // Default: 4K
-  size_t block_size;
-
-  // Number of keys between restart points for delta encoding of keys.
-  // This parameter can be changed dynamically.  Most clients should
-  // leave this parameter alone.
-  //
-  // Default: 16
-  int block_restart_interval;
-
   // Compress blocks using the specified compression algorithm.  This
   // parameter can be changed dynamically.
   //
@@ -208,29 +255,31 @@ struct ColumnFamilyOptions {
   CompressionType compression;
 
   // Different levels can have different compression policies. There
-  // are cases where most lower levels would like to quick compression
-  // algorithm while the higher levels (which have more data) use
+  // are cases where most lower levels would like to use quick compression
+  // algorithms while the higher levels (which have more data) use
   // compression algorithms that have better compression but could
-  // be slower. This array, if non nullptr, should have an entry for
-  // each level of the database. This array, if non nullptr, overides the
-  // value specified in the previous field 'compression'. The caller is
-  // reponsible for allocating memory and initializing the values in it
-  // before invoking Open(). The caller is responsible for freeing this
-  // array and it could be freed anytime after the return from Open().
-  // This could have been a std::vector but that makes the equivalent
-  // java/C api hard to construct.
+  // be slower. This array, if non-empty, should have an entry for
+  // each level of the database; these override the value specified in
+  // the previous field 'compression'.
+  //
+  // NOTICE if level_compaction_dynamic_level_bytes=true,
+  // compression_per_level[0] still determines L0, but other elements
+  // of the array are based on base level (the level L0 files are merged
+  // to), and may not match the level users see from info log for metadata.
+  // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
+  // determines compaction type for level n+i-1.
+  // For example, if we have three 5 levels, and we determine to merge L0
+  // data to L4 (which means L1..L3 will be empty), then the new files go to
+  // L4 uses compression type compression_per_level[1].
+  // If now L0 is merged to L2. Data goes to L2 will be compressed
+  // according to compression_per_level[1], L3 using compression_per_level[2]
+  // and L4 using compression_per_level[3]. Compaction for each level can
+  // change when data grows.
   std::vector<CompressionType> compression_per_level;
 
   // different options for compression algorithms
   CompressionOptions compression_opts;
 
-  // If non-nullptr, use the specified filter policy to reduce disk reads.
-  // Many applications will benefit from passing the result of
-  // NewBloomFilterPolicy() here.
-  //
-  // Default: nullptr
-  const FilterPolicy* filter_policy;
-
   // If non-nullptr, use the specified function to determine the
   // prefixes for keys.  These prefixes will be placed in the filter.
   // Depending on the workload, this can reduce the number of read-IOP
@@ -247,12 +296,6 @@ struct ColumnFamilyOptions {
   // Default: nullptr
   std::shared_ptr<const SliceTransform> prefix_extractor;
 
-  // If true, place whole keys in the filter (not just prefixes).
-  // This must generally be true for gets to be efficient.
-  //
-  // Default: true
-  bool whole_key_filtering;
-
   // Number of levels for this database
   int num_levels;
 
@@ -260,14 +303,20 @@ struct ColumnFamilyOptions {
   // level-0 compaction will not be triggered by number of files at all.
   //
   // Default: 4
+  //
+  // Dynamically changeable through SetOptions() API
   int level0_file_num_compaction_trigger;
 
   // Soft limit on number of level-0 files. We start slowing down writes at this
   // point. A value <0 means that no writing slow down will be triggered by
   // number of files in level-0.
+  //
+  // Dynamically changeable through SetOptions() API
   int level0_slowdown_writes_trigger;
 
   // Maximum number of level-0 files.  We stop writes at this point.
+  //
+  // Dynamically changeable through SetOptions() API
   int level0_stop_writes_trigger;
 
   // Maximum level to which a new compacted memtable is pushed if it
@@ -276,6 +325,8 @@ struct ColumnFamilyOptions {
   // expensive manifest file operations.  We do not push all the way to
   // the largest level since that can generate a lot of wasted disk
   // space if the same key space is being repeatedly overwritten.
+  //
+  // Dynamically changeable through SetOptions() API
   int max_mem_compaction_level;
 
   // Target file size for compaction.
@@ -286,11 +337,16 @@ struct ColumnFamilyOptions {
   // target_file_size_multiplier is 10, then each file on level-1 will
   // be 2MB, and each file on level 2 will be 20MB,
   // and each file on level-3 will be 200MB.
+  //
+  // Default: 2MB.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t target_file_size_base;
 
-  // by default target_file_size_base is 2MB.
-  int target_file_size_base;
-  // by default target_file_size_multiplier is 1, which means
+  // By default target_file_size_multiplier is 1, which means
   // by default files in different levels will have similar size.
+  //
+  // Dynamically changeable through SetOptions() API
   int target_file_size_multiplier;
 
   // Control maximum total data size for a level.
@@ -301,22 +357,91 @@ struct ColumnFamilyOptions {
   // max_bytes_for_level_multiplier is 10, total data size for level-1
   // will be 20MB, total file size for level-2 will be 200MB,
   // and total file size for level-3 will be 2GB.
-
-  // by default 'max_bytes_for_level_base' is 10MB.
+  //
+  // Default: 10MB.
+  //
+  // Dynamically changeable through SetOptions() API
   uint64_t max_bytes_for_level_base;
-  // by default 'max_bytes_for_level_base' is 10.
+
+  // If true, RocksDB will pick target size of each level dynamically.
+  // We will pick a base level b >= 1. L0 will be directly merged into level b,
+  // instead of always into level 1. Level 1 to b-1 need to be empty.
+  // We try to pick b and its target size so that
+  // 1. target size is in the range of
+  //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+  //    max_bytes_for_level_base]
+  // 2. target size of the last level (level num_levels-1) equals to extra size
+  //    of the level.
+  // At the same time max_bytes_for_level_multiplier and
+  // max_bytes_for_level_multiplier_additional are still satisfied.
+  //
+  // With this option on, from an empty DB, we make last level the base level,
+  // which means merging L0 data into the last level, until it exceeds
+  // max_bytes_for_level_base. And then we make the second last level to be
+  // base level, to start to merge L0 data to second last level, with its
+  // target size to be 1/max_bytes_for_level_multiplier of the last level's
+  // extra size. After the data accumulates more so that we need to move the
+  // base level to the third last one, and so on.
+  //
+  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
+  // and max_bytes_for_level_base=10MB.
+  // Target sizes of level 1 to 5 starts with:
+  // [- - - - 10MB]
+  // with the base level being level 5. Target sizes of level 1 to 4 are not applicable
+  // because they will not be used.
+  // Until the size of Level 5 grows to more than 10MB, say 11MB, we make
+  // base target to level 4 and now the targets looks like:
+  // [- - - 1.1MB 11MB]
+  // While data are accumulated, size targets are tuned based on actual data
+  // of level 5. When level 5 has 50MB of data, the target is like:
+  // [- - - 5MB 50MB]
+  // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep
+  // level 4 to be the base level, its target size needs to be 10.1MB, which
+  // doesn't satisfy the target size range. So now we make level 3 the target
+  // size and the target sizes of the levels look like:
+  // [- - 1.01MB 10.1MB 101MB]
+  // In the same way, while level 5 further grows, all levels' targets grow,
+  // like
+  // [- - 5MB 50MB 500MB]
+  // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
+  // base level and make levels' target sizes like this:
+  // [- 1.001MB 10.01MB 100.1MB 1001MB]
+  // and go on...
+  //
+  // By doing it, we give max_bytes_for_level_multiplier a priority against
+  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
+  // useful to limit worse case space amplification.
+  //
+  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
+  //
+  // Turning this feature on or off for an existing DB can cause unexpected
+  // LSM tree structure so it's not recommended.
+  //
+  // NOTE: this option is experimental
+  //
+  // Default: false
+  bool level_compaction_dynamic_level_bytes;
+
+  // Default: 10.
+  //
+  // Dynamically changeable through SetOptions() API
   int max_bytes_for_level_multiplier;
 
   // Different max-size multipliers for different levels.
   // These are multiplied by max_bytes_for_level_multiplier to arrive
   // at the max-size of each level.
+  //
   // Default: 1
+  //
+  // Dynamically changeable through SetOptions() API
   std::vector<int> max_bytes_for_level_multiplier_additional;
 
   // Maximum number of bytes in all compacted files.  We avoid expanding
   // the lower level file set of a compaction if it would make the
   // total compaction cover more than
   // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+  //
+  // Dynamically changeable through SetOptions() API
   int expanded_compaction_factor;
 
   // Maximum number of bytes in all source files to be compacted in a
@@ -326,41 +451,37 @@ struct ColumnFamilyOptions {
   // (source_compaction_factor * targetFileSizeLevel()) many bytes.
   // Default:1, i.e. pick maxfilesize amount of data as the source of
   // a compaction.
+  //
+  // Dynamically changeable through SetOptions() API
   int source_compaction_factor;
 
   // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
   // stop building a single file in a level->level+1 compaction.
+  //
+  // Dynamically changeable through SetOptions() API
   int max_grandparent_overlap_factor;
 
-  // Disable compaction triggered by seek.
-  // With bloomfilter and fast storage, a miss on one level
-  // is very cheap if the file handle is cached in table cache
-  // (which is true if max_open_files is large).
-  bool disable_seek_compaction;
-
   // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
   // soft_rate_limit. This is ignored when == 0.0.
   // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
   // hold, RocksDB will set soft_rate_limit = hard_rate_limit
+  //
   // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
   double soft_rate_limit;
 
   // Puts are delayed 1ms at a time when any level has a compaction score that
   // exceeds hard_rate_limit. This is ignored when <= 1.0.
+  //
   // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
   double hard_rate_limit;
 
-  // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
-  // there is no limit.
-  // Default: 1000
+  // DEPRECATED -- this option is no longer used
   unsigned int rate_limit_delay_max_milliseconds;
 
-  // Disable block cache. If this is set to true,
-  // then no block cache should be used, and the block_cache should
-  // point to a nullptr object.
-  // Default: false
-  bool no_block_cache;
-
   // size of one block in arena memory allocation.
   // If <= 0, a proper value is automatically calculated (usually 1/10 of
   // writer_buffer_size).
@@ -374,47 +495,55 @@ struct ColumnFamilyOptions {
   // conforms to the restrictions.
   //
   // Default: 0
+  //
+  // Dynamically changeable through SetOptions() API
   size_t arena_block_size;
 
   // Disable automatic compactions. Manual compactions can still
   // be issued on this column family
+  //
+  // Dynamically changeable through SetOptions() API
   bool disable_auto_compactions;
 
   // Purge duplicate/deleted keys when a memtable is flushed to storage.
   // Default: true
   bool purge_redundant_kvs_while_flush;
 
-  // This is used to close a block before it reaches the configured
-  // 'block_size'. If the percentage of free space in the current block is less
-  // than this specified number and adding a new record to the block will
-  // exceed the configured block size, then this block will be closed and the
-  // new record will be written to the next block.
-  // Default is 10.
-  int block_size_deviation;
-
   // The compaction style. Default: kCompactionStyleLevel
   CompactionStyle compaction_style;
 
   // If true, compaction will verify checksum on every read that happens
   // as part of compaction
+  //
   // Default: true
+  //
+  // Dynamically changeable through SetOptions() API
   bool verify_checksums_in_compaction;
 
   // The options needed to support Universal Style compactions
   CompactionOptionsUniversal compaction_options_universal;
 
+  // The options for FIFO compaction style
+  CompactionOptionsFIFO compaction_options_fifo;
+
   // Use KeyMayExist API to filter deletes when this is true.
   // If KeyMayExist returns false, i.e. the key definitely does not exist, then
   // the delete is a noop. KeyMayExist only incurs in-memory look up.
   // This optimization avoids writing the delete to storage when appropriate.
+  //
   // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
   bool filter_deletes;
 
   // An iteration->Next() sequentially skips over keys with the same
   // user-key unless this option is set. This number specifies the number
   // of keys (with the same userkey) that will be sequentially
   // skipped before a reseek is issued.
+  //
   // Default: 8
+  //
+  // Dynamically changeable through SetOptions() API
   uint64_t max_sequential_skip_in_iterations;
 
   // This is a factory that provides MemTableRep objects.
@@ -423,19 +552,36 @@ struct ColumnFamilyOptions {
   std::shared_ptr<MemTableRepFactory> memtable_factory;
 
   // This is a factory that provides TableFactory objects.
-  // Default: a factory that provides a default implementation of
-  // Table and TableBuilder.
+  // Default: a block-based table factory that provides a default
+  // implementation of TableBuilder and TableReader with default
+  // BlockBasedTableOptions.
   std::shared_ptr<TableFactory> table_factory;
 
+  // Block-based table related options are moved to BlockBasedTableOptions.
+  // Related options that were originally here but now moved include:
+  //   no_block_cache
+  //   block_cache
+  //   block_cache_compressed
+  //   block_size
+  //   block_size_deviation
+  //   block_restart_interval
+  //   filter_policy
+  //   whole_key_filtering
+  // If you'd like to customize some of these options, you will need to
+  // use NewBlockBasedTableFactory() to construct a new table factory.
+
   // This option allows user to to collect their own interested statistics of
   // the tables.
-  // Default: emtpy vector -- no user-defined statistics collection will be
+  // Default: empty vector -- no user-defined statistics collection will be
   // performed.
-  typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
-      TablePropertiesCollectors;
-  TablePropertiesCollectors table_properties_collectors;
-
-  // Allows thread-safe inplace updates.
+  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      TablePropertiesCollectorFactories;
+  TablePropertiesCollectorFactories table_properties_collector_factories;
+
+  // Allows thread-safe inplace updates. If this is true, there is no way to
+  // achieve point-in-time consistency using snapshot or iterator (assuming
+  // concurrent updates). Hence iterator and multi-get will return results
+  // which are not consistent as of any point-in-time.
   // If inplace_callback function is not set,
   //   Put(key, new_value) will update inplace the existing_value iff
   //   * key exists in current memtable
@@ -447,6 +593,8 @@ struct ColumnFamilyOptions {
 
   // Number of locks used for inplace update
   // Default: 10000, if inplace_update_support = true, else 0.
+  //
+  // Dynamically changeable through SetOptions() API
   size_t inplace_update_num_locks;
 
   // existing_value - pointer to previous value (from both memtable and sst).
@@ -493,9 +641,13 @@ struct ColumnFamilyOptions {
 
   // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
   // for memtable
+  //
+  // Dynamically changeable through SetOptions() API
   uint32_t memtable_prefix_bloom_bits;
 
   // number of hash probes per key
+  //
+  // Dynamically changeable through SetOptions() API
   uint32_t memtable_prefix_bloom_probes;
 
   // Page size for huge page TLB for bloom in memtable. If <=0, not allocate
@@ -503,17 +655,15 @@ struct ColumnFamilyOptions {
   // Need to reserve huge pages for it to be allocated. For example:
   //      sysctl -w vm.nr_hugepages=20
   // See linux doc Documentation/vm/hugetlbpage.txt
-
+  //
+  // Dynamically changeable through SetOptions() API
   size_t memtable_prefix_bloom_huge_page_tlb_size;
 
   // Control locality of bloom filter probes to improve cache miss rate.
   // This option only applies to memtable prefix bloom and plaintable
-  // prefix bloom. It essentially limits the max number of cache lines each
-  // bloom filter check can touch.
-  // This optimization is turned off when set to 0. The number should never
-  // be greater than number of probes. This option can boost performance
-  // for in-memory workload but should use with care since it can cause
-  // higher false positive rate.
+  // prefix bloom. It essentially limits every bloom filter check to one cache
+  // line. This optimization is turned off when set to 0; any positive number
+  // turns it on.
   // Default: 0
   uint32_t bloom_locality;
 
@@ -526,6 +676,8 @@ struct ColumnFamilyOptions {
   // operations in the memtable.
   //
   // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
   size_t max_successive_merges;
 
   // The number of partial merge operands to accumulate before partial
@@ -537,6 +689,32 @@ struct ColumnFamilyOptions {
   // Default: 2
   uint32_t min_partial_merge_operands;
 
+  // This flag specifies that the implementation should optimize the filters
+  // mainly for cases where keys are found rather than also optimize for keys
+  // missed. This would be used in cases where the application knows that
+  // there are very few misses or the performance in the case of misses is not
+  // important.
+  //
+  // For now, this flag allows us to not store filters for the last level i.e
+  // the largest level which contains data of the LSM store. For keys which
+  // are hits, the filters in this level are not useful because we will search
+  // for the data anyway. NOTE: the filters in other levels are still useful
+  // even for key hit because they tell us whether to look in that level or go
+  // to the higher level.
+  //
+  // Default: false
+  bool optimize_filters_for_hits;
+
+  // After writing every SST file, reopen it and read all the keys.
+  // Default: false
+  bool paranoid_file_checks;
+
+#ifndef ROCKSDB_LITE
+  // A vector of EventListeners which call-back functions will be called
+  // when specific RocksDB event happens.
+  std::vector<std::shared_ptr<EventListener>> listeners;
+#endif  // ROCKSDB_LITE
+
   // Create ColumnFamilyOptions with default values for all fields
   ColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
@@ -546,22 +724,34 @@ struct ColumnFamilyOptions {
 };
 
 struct DBOptions {
+  // Some functions that make it easier to optimize RocksDB
+
+#ifndef ROCKSDB_LITE
+  // By default, RocksDB uses only one background thread for flush and
+  // compaction. Calling this function will set it up such that total of
+  // `total_threads` is used. Good value for `total_threads` is the number of
+  // cores. You almost definitely want to call this function if your system is
+  // bottlenecked by RocksDB.
+  DBOptions* IncreaseParallelism(int total_threads = 16);
+#endif  // ROCKSDB_LITE
+
   // If true, the database will be created if it is missing.
   // Default: false
   bool create_if_missing;
 
+  // If true, missing column families will be automatically created.
+  // Default: false
+  bool create_missing_column_families;
+
   // If true, an error is raised if the database already exists.
   // Default: false
   bool error_if_exists;
 
-  // If true, the implementation will do aggressive checking of the
-  // data it is processing and will stop early if it detects any
-  // errors.  This may have unforeseen ramifications: for example, a
-  // corruption of one DB entry may cause a large number of entries to
-  // become unreadable or for the entire DB to become unopenable.
-  // If any of the  writes to the database fails (Put, Delete, Merge, Write),
-  // the database will switch to read-only mode and fail all other
+  // If true, RocksDB will aggressively check consistency of the data.
+  // Also, if any of the  writes to the database fails (Put, Delete, Merge,
+  // Write), the database will switch to read-only mode and fail all other
   // Write operations.
+  // In most cases you want this to be set to true.
   // Default: true
   bool paranoid_checks;
 
@@ -570,11 +760,17 @@ struct DBOptions {
   // Default: Env::Default()
   Env* env;
 
+  // Use to control write rate of flush and compaction. Flush has higher
+  // priority than compaction. Rate limiting is disabled if nullptr.
+  // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> rate_limiter;
+
   // Any internal progress/error information generated by the db will
   // be written to info_log if it is non-nullptr, or to a file stored
   // in the same directory as the DB contents if info_log is nullptr.
   // Default: nullptr
-  shared_ptr<Logger> info_log;
+  std::shared_ptr<Logger> info_log;
 
   InfoLogLevel info_log_level;
 
@@ -590,16 +786,16 @@ struct DBOptions {
   // column families whose memtables are backed by the oldest live WAL file
   // (i.e. the ones that are causing all the space amplification). If set to 0
   // (default), we will dynamically choose the WAL size limit to be
-  // [sum of all write_buffer_size * max_write_buffer_number] * 2
+  // [sum of all write_buffer_size * max_write_buffer_number] * 4
   // Default: 0
   uint64_t max_total_wal_size;
 
   // If non-null, then we should collect metrics about database operations
   // Statistics objects should not be shared between DB instances as
   // it does not use any locks to prevent concurrent updates.
-  shared_ptr<Statistics> statistics;
+  std::shared_ptr<Statistics> statistics;
 
-  // If true, then the contents of data files are not synced
+  // If true, then the contents of manifest and data files are not synced
   // to stable storage. Their contents remain in the OS buffers till the
   // OS decides to flush them. This option is good for bulk-loading
   // of data. Once the bulk-loading is complete, please issue a
@@ -614,11 +810,31 @@ struct DBOptions {
   // Default: false
   bool use_fsync;
 
-  // This number controls how often a new scribe log about
-  // db deploy stats is written out.
-  // -1 indicates no logging at all.
-  // Default value is 1800 (half an hour).
-  int db_stats_log_interval;
+  // A list of paths where SST files can be put into, with its target size.
+  // Newer data is placed into paths specified earlier in the vector while
+  // older data gradually moves to paths specified later in the vector.
+  //
+  // For example, you have a flash device with 10GB allocated for the DB,
+  // as well as a hard drive of 2TB, you should config it to be:
+  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+  //
+  // The system will try to guarantee data under each path is close to but
+  // not larger than the target size. But current and future file sizes used
+  // by determining where to place a file are based on best-effort estimation,
+  // which means there is a chance that the actual size under the directory
+  // is slightly more than target size under some workloads. User should give
+  // some buffer room for those cases.
+  //
+  // If none of the paths has sufficient room to place a file, the file will
+  // be placed in the last path anyway, despite the target size.
+  //
+  // Placing newer data in earlier paths is also best-effort. User should
+  // expect user files to be placed in higher levels in some extreme cases.
+  //
+  // If left empty, only one path will be used, which is db_name passed when
+  // opening the DB.
+  // Default: empty
+  std::vector<DbPath> db_paths;
 
   // This specifies the info LOG dir.
   // If it is empty, the log files will be in the same dir as data.
@@ -691,14 +907,8 @@ struct DBOptions {
   // Number of shards used for table cache.
   int table_cache_numshardbits;
 
-  // During data eviction of table's LRU cache, it would be inefficient
-  // to strictly follow LRU because this piece of memory will not really
-  // be released unless its refcount falls to zero. Instead, make two
-  // passes: the first pass will release items with refcount = 1,
-  // and if not enough space releases after scanning the number of
-  // elements specified by this parameter, we will remove items in LRU
-  // order.
-  int table_cache_remove_scan_count_limit;
+  // DEPRECATED
+  // int table_cache_remove_scan_count_limit;
 
   // The following two fields affect how archived logs will be deleted.
   // 1. If both set to 0, logs will be deleted asap and will not get into
@@ -734,9 +944,7 @@ struct DBOptions {
   // Disable child process inherit open files. Default: true
   bool is_fd_close_on_exec;
 
-  // Skip log corruption error on recovery (If client is ok with
-  // losing most recent changes)
-  // Default: false
+  // DEPRECATED -- this option is no longer used
   bool skip_log_error_on_recovery;
 
   // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
@@ -748,15 +956,28 @@ struct DBOptions {
   // Default: true
   bool advise_random_on_open;
 
+  // Amount of data to build up in memtables across all column
+  // families before writing to disk.
+  //
+  // This is distinct from write_buffer_size, which enforces a limit
+  // for a single memtable.
+  //
+  // This feature is disabled by default. Specify a non-zero value
+  // to enable it.
+  //
+  // Default: 0 (disabled)
+  size_t db_write_buffer_size;
+
   // Specify the file access pattern once a compaction is started.
   // It will be applied to all input files of a compaction.
   // Default: NORMAL
-  enum {
-    NONE,
-    NORMAL,
-    SEQUENTIAL,
-    WILLNEED
-  } access_hint_on_compaction_start;
+  enum AccessHint {
+      NONE,
+      NORMAL,
+      SEQUENTIAL,
+      WILLNEED
+  };
+  AccessHint access_hint_on_compaction_start;
 
   // Use adaptive mutex, which spins in the user space before resorting
   // to kernel. This could reduce context switch when the mutex is not
@@ -765,22 +986,34 @@ struct DBOptions {
   // Default: false
   bool use_adaptive_mutex;
 
+  // Create DBOptions with default values for all fields
+  DBOptions();
+  // Create DBOptions from Options
+  explicit DBOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+
   // Allows OS to incrementally sync files to disk while they are being
   // written, asynchronously, in the background.
   // Issue one request for every bytes_per_sync written. 0 turns it off.
   // Default: 0
+  //
+  // You may consider using rate_limiter to regulate write rate to device.
+  // When rate limiter is enabled, it automatically enables bytes_per_sync
+  // to 1MB.
+  //
+  // This option applies to table files
   uint64_t bytes_per_sync;
 
-  // Allow RocksDB to use thread local storage to optimize performance.
-  // Default: true
-  bool allow_thread_local;
+  // Same as bytes_per_sync, but applies to WAL files
+  // Default: 0, turned off
+  uint64_t wal_bytes_per_sync;
 
-  // Create DBOptions with default values for all fields
-  DBOptions();
-  // Create DBOptions from Options
-  explicit DBOptions(const Options& options);
-
-  void Dump(Logger* log) const;
+  // If true, then the status of the threads involved in this DB will
+  // be tracked and available via GetThreadList() API.
+  //
+  // Default: false
+  bool enable_thread_tracking;
 };
 
 // Options to control the behavior of a database (passed to DB::Open)
@@ -861,6 +1094,18 @@ struct ReadOptions {
   // ! DEPRECATED
   // const Slice* prefix;
 
+  // "iterate_upper_bound" defines the extent up to which the forward iterator
+  // can return entries. Once the bound is reached, Valid() will be false.
+  // "iterate_upper_bound" is exclusive ie the bound value is
+  // not a valid entry.  If iterator_extractor is not null, the Seek target
+  // and iterator_upper_bound need to have the same prefix.
+  // This is because ordering is not guaranteed outside of prefix domain.
+  // There is no lower bound on the iterator. If needed, that can be easily
+  // implemented
+  //
+  // Default: nullptr
+  const Slice* iterate_upper_bound;
+
   // Specify if this read request should process data that ALREADY
   // resides on a particular cache. If the required data is not
   // found at the specified cache, then Status::Incomplete is returned.
@@ -875,18 +1120,20 @@ struct ReadOptions {
   // Not supported in ROCKSDB_LITE mode!
   bool tailing;
 
-  ReadOptions()
-      : verify_checksums(true),
-        fill_cache(true),
-        snapshot(nullptr),
-        read_tier(kReadAllTier),
-        tailing(false) {}
-  ReadOptions(bool cksum, bool cache)
-      : verify_checksums(cksum),
-        fill_cache(cache),
-        snapshot(nullptr),
-        read_tier(kReadAllTier),
-        tailing(false) {}
+  // Specify to create a managed iterator -- a special iterator that
+  // uses less resources by having the ability to free its underlying
+  // resources on request.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool managed;
+
+  // Enable a total order seek regardless of index format (e.g. hash index)
+  // used in the table. Some table format (e.g. plain table) may not support
+  // this option.
+  bool total_order_seek;
+
+  ReadOptions();
+  ReadOptions(bool cksum, bool cache);
 };
 
 // Options that control write operations
@@ -913,7 +1160,28 @@ struct WriteOptions {
   // and the write may got lost after a crash.
   bool disableWAL;
 
-  WriteOptions() : sync(false), disableWAL(false) {}
+  // If non-zero, then associated write waiting longer than the specified
+  // time MAY be aborted and returns Status::TimedOut. A write that takes
+  // less than the specified time is guaranteed to not fail with
+  // Status::TimedOut.
+  //
+  // The number of times a write call encounters a timeout is recorded in
+  // Statistics.WRITE_TIMEDOUT
+  //
+  // Default: 0
+  uint64_t timeout_hint_us;
+
+  // If true and if user is trying to write to column families that don't exist
+  // (they were dropped),  ignore the write (don't return an error). If there
+  // are multiple writes in a WriteBatch, other writes will succeed.
+  // Default: false
+  bool ignore_missing_column_families;
+
+  WriteOptions()
+      : sync(false),
+        disableWAL(false),
+        timeout_hint_us(0),
+        ignore_missing_column_families(false) {}
 };
 
 // Options that control flush operations
@@ -925,6 +1193,30 @@ struct FlushOptions {
   FlushOptions() : wait(true) {}
 };
 
+// Get options based on some guidelines. Now only tune parameter based on
+// flush/compaction and fill default parameters for other parameters.
+// total_write_buffer_limit: budget for memory spent for mem tables
+// read_amplification_threshold: comfortable value of read amplification
+// write_amplification_threshold: comfortable value of write amplification.
+// target_db_size: estimated total DB size.
+extern Options GetOptions(size_t total_write_buffer_limit,
+                          int read_amplification_threshold = 8,
+                          int write_amplification_threshold = 32,
+                          uint64_t target_db_size = 68719476736 /* 64GB */);
+
+// CompactionOptions are used in CompactFiles() call.
+struct CompactionOptions {
+  // Compaction output compression type
+  // Default: snappy
+  CompressionType compression;
+  // Compaction will create files of size `output_file_size_limit`.
+  // Default: MAX, which means that compaction will create a single file
+  uint64_t output_file_size_limit;
+
+  CompactionOptions()
+      : compression(kSnappyCompression),
+        output_file_size_limit(std::numeric_limits<uint64_t>::max()) {}
+};
 }  // namespace rocksdb
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h
index 0704ea2..3b8145a 100644
--- a/src/rocksdb/include/rocksdb/perf_context.h
+++ b/src/rocksdb/include/rocksdb/perf_context.h
@@ -20,6 +20,9 @@ enum PerfLevel {
 // set the perf stats level
 void SetPerfLevel(PerfLevel level);
 
+// get current perf stats level
+PerfLevel GetPerfLevel();
+
 // A thread local context for gathering performance counter efficiently
 // and transparently.
 
@@ -48,6 +51,10 @@ struct PerfContext {
   // total time spent after Get() finds a key
   uint64_t get_post_process_time;
   uint64_t get_from_output_files_time; // total time reading from output files
+  // total time spent on seeking memtable
+  uint64_t seek_on_memtable_time;
+  // number of seeks issued on memtable
+  uint64_t seek_on_memtable_count;
   // total time spent on seeking child iters
   uint64_t seek_child_seek_time;
   // number of seek issued in child iterators
@@ -62,6 +69,11 @@ struct PerfContext {
   uint64_t write_wal_time;            // total time spent on writing to WAL
   // total time spent on writing to mem tables
   uint64_t write_memtable_time;
+  uint64_t db_mutex_lock_nanos;      // time spent on acquiring DB mutex.
+  // Time spent on waiting with a condition variable created with DB mutex.
+  uint64_t db_condition_wait_nanos;
+  // Time spent on merge operator.
+  uint64_t merge_operator_time_nanos;
 };
 
 #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h
new file mode 100644
index 0000000..44c1bdf
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rate_limiter.h
@@ -0,0 +1,64 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+class RateLimiter {
+ public:
+  virtual ~RateLimiter() {}
+
+  // This API allows user to dynamically change rate limiter's bytes per second.
+  // REQUIRED: bytes_per_second > 0
+  virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0;
+
+  // Request for token to write bytes. If this request can not be satisfied,
+  // the call is blocked. Caller is responsible to make sure
+  // bytes < GetSingleBurstBytes()
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri) = 0;
+
+  // Max bytes can be granted in a single burst
+  virtual int64_t GetSingleBurstBytes() const = 0;
+
+  // Total bytes that go though rate limiter
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+  // Total # of requests that go though rate limiter
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+};
+
+// Create a RateLimiter object, which can be shared among RocksDB instances to
+// control write rate of flush and compaction.
+// @rate_bytes_per_sec: this is the only parameter you want to set most of the
+// time. It controls the total write rate of compaction and flush in bytes per
+// second. Currently, RocksDB does not enforce rate limit for anything other
+// than flush and compaction, e.g. write to WAL.
+// @refill_period_us: this controls how often tokens are refilled. For example,
+// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to
+// burstier writes while smaller value introduces more CPU overhead.
+// The default should work for most cases.
+// @fairness: RateLimiter accepts high-pri requests and low-pri requests.
+// A low-pri request is usually blocked in favor of hi-pri request. Currently,
+// RocksDB assigns low-pri to request from compaction and high-pri to request
+// from flush. Low-pri requests can get blocked if flush requests come in
+// continuouly. This fairness parameter grants low-pri requests permission by
+// 1/fairness chance even though high-pri requests exist to avoid starvation.
+// You should be good by leaving it at default 10.
+extern RateLimiter* NewGenericRateLimiter(
+    int64_t rate_bytes_per_sec,
+    int64_t refill_period_us = 100 * 1000,
+    int32_t fairness = 10);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h
index 2253715..7019c90 100644
--- a/src/rocksdb/include/rocksdb/slice.h
+++ b/src/rocksdb/include/rocksdb/slice.h
@@ -20,6 +20,7 @@
 #define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
 
 #include <assert.h>
+#include <cstdio>
 #include <stddef.h>
 #include <string.h>
 #include <string>
@@ -42,6 +43,10 @@ class Slice {
   /* implicit */
   Slice(const char* s) : data_(s), size_(strlen(s)) { }
 
+  // Create a single slice from SliceParts using buf as storage.
+  // buf must exist as long as the returned Slice exists.
+  Slice(const struct SliceParts& parts, std::string* buf);
+
   // Return a pointer to the beginning of the referenced data
   const char* data() const { return data_; }
 
@@ -107,6 +112,7 @@ class Slice {
 struct SliceParts {
   SliceParts(const Slice* _parts, int _num_parts) :
       parts(_parts), num_parts(_num_parts) { }
+  SliceParts() : parts(nullptr), num_parts(0) {}
 
   const Slice* parts;
   int num_parts;
@@ -122,7 +128,7 @@ inline bool operator!=(const Slice& x, const Slice& y) {
 }
 
 inline int Slice::compare(const Slice& b) const {
-  const int min_len = (size_ < b.size_) ? size_ : b.size_;
+  const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
   int r = memcmp(data_, b.data_, min_len);
   if (r == 0) {
     if (size_ < b.size_) r = -1;
diff --git a/src/rocksdb/include/rocksdb/slice_transform.h b/src/rocksdb/include/rocksdb/slice_transform.h
index a784550..3694c58 100644
--- a/src/rocksdb/include/rocksdb/slice_transform.h
+++ b/src/rocksdb/include/rocksdb/slice_transform.h
@@ -36,10 +36,39 @@ class SliceTransform {
 
   // determine whether dst=Transform(src) for some src
   virtual bool InRange(const Slice& dst) const = 0;
+
+  // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
+  //
+  // This function is not used by RocksDB, but for users. If users pass
+  // Options by string to RocksDB, they might not know what prefix extractor
+  // they are using. This function is to help users determine:
+  //   if they want to iterate all keys prefixing `prefix`, whether it is
+  //   safe to use prefix bloom filter and seek to key `prefix`.
+  // If this function returns true, this means a user can Seek() to a prefix
+  // using the bloom filter. Otherwise, user needs to skip the bloom filter
+  // by setting ReadOptions.total_order_seek = true.
+  //
+  // Here is an example: Suppose we implement a slice transform that returns
+  // the first part of the string after splitting it using delimiter ",":
+  // 1. SameResultWhenAppended("abc,") should return true. If applying prefix
+  //    bloom filter using it, all slices matching "abc:.*" will be extracted
+  //    to "abc,", so any SST file or memtable containing any of those key
+  //    will not be filtered out.
+  // 2. SameResultWhenAppended("abc") should return false. A user will not
+  //    guaranteed to see all the keys matching "abc.*" if a user seek to "abc"
+  //    against a DB with the same setting. If one SST file only contains
+  //    "abcd,e", the file can be filtered out and the key will be invisible.
+  //
+  // i.e., an implementation always returning false is safe.
+  virtual bool SameResultWhenAppended(const Slice& prefix) const {
+    return false;
+  }
 };
 
 extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
 
+extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len);
+
 extern const SliceTransform* NewNoopTransform();
 
 }
diff --git a/src/rocksdb/include/rocksdb/sst_dump_tool.h b/src/rocksdb/include/rocksdb/sst_dump_tool.h
new file mode 100644
index 0000000..39bfb51
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_dump_tool.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef ROCKSDB_LITE
+#pragma once
+
+namespace rocksdb {
+
+class SSTDumpTool {
+ public:
+  int Run(int argc, char** argv);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h
index dcd82f6..4e06bf6 100644
--- a/src/rocksdb/include/rocksdb/statistics.h
+++ b/src/rocksdb/include/rocksdb/statistics.h
@@ -20,12 +20,12 @@ namespace rocksdb {
  *  1. Any ticker should be added before TICKER_ENUM_MAX.
  *  2. Add a readable string in TickersNameMap below for the newly added ticker.
  */
-enum Tickers {
+enum Tickers : uint32_t {
   // total block cache misses
   // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
   //                               BLOCK_CACHE_FILTER_MISS +
   //                               BLOCK_CACHE_DATA_MISS;
-  BLOCK_CACHE_MISS,
+  BLOCK_CACHE_MISS = 0,
   // total block cache hit
   // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
   //                              BLOCK_CACHE_FILTER_HIT +
@@ -53,6 +53,13 @@ enum Tickers {
   // # of memtable misses.
   MEMTABLE_MISS,
 
+  // # of Get() queries served by L0
+  GET_HIT_L0,
+  // # of Get() queries served by L1
+  GET_HIT_L1,
+  // # of Get() queries served by L2 and up
+  GET_HIT_L2_AND_UP,
+
   /**
    * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
    * There are 3 reasons currently.
@@ -73,12 +80,16 @@ enum Tickers {
   NO_FILE_CLOSES,
   NO_FILE_OPENS,
   NO_FILE_ERRORS,
-  // Time system had to wait to do LO-L1 compactions
+  // DEPRECATED Time system had to wait to do LO-L1 compactions
   STALL_L0_SLOWDOWN_MICROS,
-  // Time system had to wait to move memtable to L1.
+  // DEPRECATED Time system had to wait to move memtable to L1.
   STALL_MEMTABLE_COMPACTION_MICROS,
-  // write throttle because of too many files in L0
+  // DEPRECATED write throttle because of too many files in L0
   STALL_L0_NUM_FILES_MICROS,
+  // Writer has to wait for compaction or flush to finish.
+  STALL_MICROS,
+  // The wait time for db mutex.
+  DB_MUTEX_WAIT_MICROS,
   RATE_LIMIT_DELAY_MILLIS,
   NO_ITERATORS,  // number of iterators currently open
 
@@ -115,9 +126,11 @@ enum Tickers {
   // head of the writers queue.
   WRITE_DONE_BY_SELF,
   WRITE_DONE_BY_OTHER,
+  WRITE_TIMEDOUT,       // Number of writes ending up with timed-out.
   WRITE_WITH_WAL,       // Number of Write calls that request WAL
   COMPACT_READ_BYTES,   // Bytes read during compaction
   COMPACT_WRITE_BYTES,  // Bytes written during compaction
+  FLUSH_WRITE_BYTES,    // Bytes written during flush
 
   // Number of table's properties loaded directly from file, without creating
   // table reader object.
@@ -125,6 +138,9 @@ enum Tickers {
   NUMBER_SUPERVERSION_ACQUIRES,
   NUMBER_SUPERVERSION_RELEASES,
   NUMBER_SUPERVERSION_CLEANUPS,
+  NUMBER_BLOCK_NOT_COMPRESSED,
+  MERGE_OPERATION_TOTAL_TIME,
+  FILTER_OPERATION_TOTAL_TIME,
   TICKER_ENUM_MAX
 };
 
@@ -143,6 +159,9 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
     {MEMTABLE_HIT, "rocksdb.memtable.hit"},
     {MEMTABLE_MISS, "rocksdb.memtable.miss"},
+    {GET_HIT_L0, "rocksdb.l0.hit"},
+    {GET_HIT_L1, "rocksdb.l1.hit"},
+    {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"},
     {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
     {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
     {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
@@ -157,6 +176,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
     {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
     {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
+    {STALL_MICROS, "rocksdb.stall.micros"},
+    {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"},
     {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
     {NO_ITERATORS, "rocksdb.num.iterators"},
     {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
@@ -175,7 +196,9 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {WAL_FILE_BYTES, "rocksdb.wal.bytes"},
     {WRITE_DONE_BY_SELF, "rocksdb.write.self"},
     {WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
+    {WRITE_TIMEDOUT, "rocksdb.write.timedout"},
     {WRITE_WITH_WAL, "rocksdb.write.wal"},
+    {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
     {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
     {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
     {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
@@ -183,6 +206,9 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
     {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
     {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
+    {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"},
+    {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"},
+    {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
 };
 
 /**
@@ -192,8 +218,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
  * Add a string representation in HistogramsNameMap below
  * And increment HISTOGRAM_ENUM_MAX
  */
-enum Histograms {
-  DB_GET,
+enum Histograms : uint32_t {
+  DB_GET = 0,
   DB_WRITE,
   COMPACTION_TIME,
   TABLE_SYNC_MICROS,
@@ -206,14 +232,15 @@ enum Histograms {
   READ_BLOCK_COMPACTION_MICROS,
   READ_BLOCK_GET_MICROS,
   WRITE_RAW_BLOCK_MICROS,
-
   STALL_L0_SLOWDOWN_COUNT,
   STALL_MEMTABLE_COMPACTION_COUNT,
   STALL_L0_NUM_FILES_COUNT,
   HARD_RATE_LIMIT_DELAY_COUNT,
   SOFT_RATE_LIMIT_DELAY_COUNT,
   NUM_FILES_IN_SINGLE_COMPACTION,
-  HISTOGRAM_ENUM_MAX,
+  DB_SEEK,
+  WRITE_STALL,
+  HISTOGRAM_ENUM_MAX,  // TODO(ldemailly): enforce HistogramsNameMap match
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
@@ -235,6 +262,8 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
   { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
   { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
   { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
+  { DB_SEEK, "rocksdb.db.seek.micros" },
+  { WRITE_STALL, "rocksdb.db.write.stall" },
 };
 
 struct HistogramData {
@@ -250,14 +279,24 @@ class Statistics {
  public:
   virtual ~Statistics() {}
 
-  virtual long getTickerCount(Tickers tickerType) = 0;
-  virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
-  virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
-  virtual void measureTime(Histograms histogramType, uint64_t time) = 0;
+  virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
+  virtual void histogramData(uint32_t type,
+                             HistogramData* const data) const = 0;
+
+  virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
+  virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
+  virtual void measureTime(uint32_t histogramType, uint64_t time) = 0;
 
-  virtual void histogramData(Histograms type, HistogramData* const data) = 0;
   // String representation of the statistic object.
-  std::string ToString();
+  virtual std::string ToString() const {
+    // Do nothing by default
+    return std::string("ToString(): not implemented");
+  }
+
+  // Override this function to disable particular histogram collection
+  virtual bool HistEnabledForType(uint32_t type) const {
+    return type < HISTOGRAM_ENUM_MAX;
+  }
 };
 
 // Create a concrete DBStatistics object
diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h
index dbd41fc..177d705 100644
--- a/src/rocksdb/include/rocksdb/status.h
+++ b/src/rocksdb/include/rocksdb/status.h
@@ -61,10 +61,25 @@ class Status {
   static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kIncomplete, msg, msg2);
   }
+  static Status ShutdownInProgress() {
+    return Status(kShutdownInProgress);
+  }
   static Status ShutdownInProgress(const Slice& msg,
                                    const Slice& msg2 = Slice()) {
     return Status(kShutdownInProgress, msg, msg2);
   }
+  static Status TimedOut() {
+    return Status(kTimedOut);
+  }
+  static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kTimedOut, msg, msg2);
+  }
+  static Status Aborted() {
+    return Status(kAborted);
+  }
+  static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kAborted, msg, msg2);
+  }
 
   // Returns true iff the status indicates success.
   bool ok() const { return code() == kOk; }
@@ -90,14 +105,17 @@ class Status {
   // Returns true iff the status indicates Incomplete
   bool IsIncomplete() const { return code() == kIncomplete; }
 
-  // Returns true iff the status indicates Incomplete
+  // Returns true iff the status indicates Shutdown In progress
   bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
 
+  bool IsTimedOut() const { return code() == kTimedOut; }
+
+  bool IsAborted() const { return code() == kAborted; }
+
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
   std::string ToString() const;
 
- private:
   enum Code {
     kOk = 0,
     kNotFound = 1,
@@ -107,9 +125,15 @@ class Status {
     kIOError = 5,
     kMergeInProgress = 6,
     kIncomplete = 7,
-    kShutdownInProgress = 8
+    kShutdownInProgress = 8,
+    kTimedOut = 9,
+    kAborted = 10
   };
 
+  Code code() const {
+    return code_;
+  }
+ private:
   // A nullptr state_ (which is always the case for OK) means the message
   // is empty.
   // of the following form:
@@ -118,11 +142,8 @@ class Status {
   Code code_;
   const char* state_;
 
-  Code code() const {
-    return code_;
-  }
-  explicit Status(Code code) : code_(code), state_(nullptr) { }
-  Status(Code code, const Slice& msg, const Slice& msg2);
+  explicit Status(Code _code) : code_(_code), state_(nullptr) {}
+  Status(Code _code, const Slice& msg, const Slice& msg2);
   static const char* CopyState(const char* s);
 };
 
diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h
index 11adfec..b84363a 100644
--- a/src/rocksdb/include/rocksdb/table.h
+++ b/src/rocksdb/include/rocksdb/table.h
@@ -23,6 +23,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
 #include "rocksdb/status.h"
 
 namespace rocksdb {
@@ -30,6 +31,7 @@ namespace rocksdb {
 // -- Block-based Table
 class FlushBlockPolicyFactory;
 class RandomAccessFile;
+struct TableBuilderOptions;
 class TableBuilder;
 class TableReader;
 class WritableFile;
@@ -74,16 +76,82 @@ struct BlockBasedTableOptions {
 
   IndexType index_type = kBinarySearch;
 
+  // Influence the behavior when kHashSearch is used.
+  // if false, stores a precise prefix to block range mapping
+  // if true, does not store prefix and allows prefix hash collision
+  // (less memory consumption)
+  bool hash_index_allow_collision = true;
+
   // Use the specified checksum type. Newly created table files will be
   // protected with this checksum type. Old table files will still be readable,
   // even though they have different checksum type.
   ChecksumType checksum = kCRC32c;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  bool no_block_cache = false;
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  std::shared_ptr<Cache> block_cache = nullptr;
+
+  // If non-NULL use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  std::shared_ptr<Cache> block_cache_compressed = nullptr;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  size_t block_size = 4 * 1024;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  int block_size_deviation = 10;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  int block_restart_interval = 16;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  bool whole_key_filtering = true;
+
+  // We currently have three versions:
+  // 0 -- This version is currently written out by all RocksDB's versions by
+  // default.  Can be read by really old RocksDB's. Doesn't support changing
+  // checksum (default is CRC32).
+  // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
+  // checksum, like xxHash. It is written by RocksDB when
+  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
+  // 0 is silently upconverted)
+  // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
+  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
+  // don't plan to run RocksDB before version 3.10, you should probably use
+  // this.
+  // This option only affects newly written tables. When reading existing tables,
+  // the information about version is read from the footer.
+  uint32_t format_version = 0;
 };
 
 // Table Properties that are specific to block-based table properties.
 struct BlockBasedTablePropertyNames {
   // value of this propertis is a fixed int32 number.
   static const std::string kIndexType;
+  // value is "1" for true and "0" for false.
+  static const std::string kWholeKeyFiltering;
+  // value is "1" for true and "0" for false.
+  static const std::string kPrefixFiltering;
 };
 
 // Create default block based table factory.
@@ -91,57 +159,159 @@ extern TableFactory* NewBlockBasedTableFactory(
     const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
 
 #ifndef ROCKSDB_LITE
+
+enum EncodingType : char {
+  // Always write full keys without any special encoding.
+  kPlain,
+  // Find opportunity to write the same prefix once for multiple rows.
+  // In some cases, when a key follows a previous key with the same prefix,
+  // instead of writing out the full key, it just writes out the size of the
+  // shared prefix, as well as other bytes, to save some bytes.
+  //
+  // When using this option, the user is required to use the same prefix
+  // extractor to make sure the same prefix will be extracted from the same key.
+  // The Name() value of the prefix extractor will be stored in the file. When
+  // reopening the file, the name of the options.prefix_extractor given will be
+  // bitwise compared to the prefix extractors stored in the file. An error
+  // will be returned if the two don't match.
+  kPrefix,
+};
+
+// Table Properties that are specific to plain table properties.
+struct PlainTablePropertyNames {
+  static const std::string kPrefixExtractorName;
+  static const std::string kEncodingType;
+  static const std::string kBloomVersion;
+  static const std::string kNumBloomBlocks;
+};
+
+const uint32_t kPlainTableVariableLength = 0;
+
+struct PlainTableOptions {
+  // @user_key_len: plain table has optimization for fix-sized keys, which can
+  //                be specified via user_key_len.  Alternatively, you can pass
+  //                `kPlainTableVariableLength` if your keys have variable
+  //                lengths.
+  uint32_t user_key_len = kPlainTableVariableLength;
+
+  // @bloom_bits_per_key: the number of bits used for bloom filter per prefix.
+  //                      You may disable it by passing a zero.
+  int bloom_bits_per_key = 10;
+
+  // @hash_table_ratio: the desired utilization of the hash table used for
+  //                    prefix hashing.
+  //                    hash_table_ratio = number of prefixes / #buckets in the
+  //                    hash table
+  double hash_table_ratio = 0.75;
+
+  // @index_sparseness: inside each prefix, need to build one index record for
+  //                    how many keys for binary search inside each hash bucket.
+  //                    For encoding type kPrefix, the value will be used when
+  //                    writing to determine an interval to rewrite the full
+  //                    key. It will also be used as a suggestion and satisfied
+  //                    when possible.
+  size_t index_sparseness = 16;
+
+  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+  //                      Otherwise from huge page TLB. The user needs to
+  //                      reserve huge pages for it to be allocated, like:
+  //                          sysctl -w vm.nr_hugepages=20
+  //                      See linux doc Documentation/vm/hugetlbpage.txt
+  size_t huge_page_tlb_size = 0;
+
+  // @encoding_type: how to encode the keys. See enum EncodingType above for
+  //                 the choices. The value will determine how to encode keys
+  //                 when writing to a new SST file. This value will be stored
+  //                 inside the SST file which will be used when reading from
+  //                 the file, which makes it possible for users to choose
+  //                 different encoding type when reopening a DB. Files with
+  //                 different encoding types can co-exist in the same DB and
+  //                 can be read.
+  EncodingType encoding_type = kPlain;
+
+  // @full_scan_mode: mode for reading the whole file one record by one without
+  //                  using the index.
+  bool full_scan_mode = false;
+
+  // @store_index_in_file: compute plain table index and bloom filter during
+  //                       file building and store it in file. When reading
+  //                       file, index will be mmaped instead of recomputation.
+  bool store_index_in_file = false;
+};
+
 // -- Plain Table with prefix-only seek
 // For this factory, you need to set Options.prefix_extrator properly to make it
 // work. Look-up will starts with prefix hash lookup for key prefix. Inside the
 // hash bucket found, a binary search is executed for hash conflicts. Finally,
 // a linear search is used.
-// @user_key_len: plain table has optimization for fix-sized keys, which can be
-//                specified via user_key_len.  Alternatively, you can pass
-//                `kPlainTableVariableLength` if your keys have variable
-//                lengths.
-// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You
-//                      may disable it by passing a zero.
-// @hash_table_ratio: the desired utilization of the hash table used for prefix
-//                    hashing. hash_table_ratio = number of prefixes / #buckets
-//                    in the hash table
-// @index_sparseness: inside each prefix, need to build one index record for how
-//                    many keys for binary search inside each hash bucket.
-// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
-//                      Otherwise from huge page TLB. The user needs to reserve
-//                      huge pages for it to be allocated, like:
-//                          sysctl -w vm.nr_hugepages=20
-//                      See linux doc Documentation/vm/hugetlbpage.txt
 
-const uint32_t kPlainTableVariableLength = 0;
-extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
-                                              kPlainTableVariableLength,
-                                          int bloom_bits_per_prefix = 10,
-                                          double hash_table_ratio = 0.75,
-                                          size_t index_sparseness = 16,
-                                          size_t huge_page_tlb_size = 0);
-
-// -- Plain Table
-// This factory of plain table ignores Options.prefix_extractor and assumes no
-// hashable prefix available to the key structure. Lookup will be based on
-// binary search index only. Total order seek() can be issued.
-// @user_key_len: plain table has optimization for fix-sized keys, which can be
-//                specified via user_key_len.  Alternatively, you can pass
-//                `kPlainTableVariableLength` if your keys have variable
-//                lengths.
-// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may
-//                  disable it by passing a zero.
-// @index_sparseness: need to build one index record for how many keys for
-//                    binary search.
-// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
-//                      Otherwise from huge page TLB. The user needs to reserve
-//                      huge pages for it to be allocated, like:
-//                          sysctl -w vm.nr_hugepages=20
-//                      See linux doc Documentation/vm/hugetlbpage.txt
-extern TableFactory* NewTotalOrderPlainTableFactory(
-    uint32_t user_key_len = kPlainTableVariableLength,
-    int bloom_bits_per_key = 0, size_t index_sparseness = 16,
-    size_t huge_page_tlb_size = 0);
+extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
+                                              PlainTableOptions());
+
+struct CuckooTablePropertyNames {
+  // The key that is used to fill empty buckets.
+  static const std::string kEmptyKey;
+  // Fixed length of value.
+  static const std::string kValueLength;
+  // Number of hash functions used in Cuckoo Hash.
+  static const std::string kNumHashFunc;
+  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
+  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
+  // where starting bucket id is given by the hash function on the key. In case
+  // of a collision during inserting the key, the builder tries to insert the
+  // key in other locations of the cuckoo block before using the next hash
+  // function. This reduces cache miss during read operation in case of
+  // collision.
+  static const std::string kCuckooBlockSize;
+  // Size of the hash table. Use this number to compute the modulo of hash
+  // function. The actual number of buckets will be kMaxHashTableSize +
+  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
+  // accommodate the Cuckoo Block from end of hash table, due to cache friendly
+  // implementation.
+  static const std::string kHashTableSize;
+  // Denotes if the key sorted in the file is Internal Key (if false)
+  // or User Key only (if true).
+  static const std::string kIsLastLevel;
+  // Indicate if using identity function for the first hash function.
+  static const std::string kIdentityAsFirstHash;
+  // Indicate if using module or bit and to calculate hash value
+  static const std::string kUseModuleHash;
+  // Fixed user key length
+  static const std::string kUserKeyLength;
+};
+
+struct CuckooTableOptions {
+  // Determines the utilization of hash tables. Smaller values
+  // result in larger hash tables with fewer collisions.
+  double hash_table_ratio = 0.9;
+  // A property used by builder to determine the depth to go to
+  // to search for a path to displace elements in case of
+  // collision. See Builder.MakeSpaceForKey method. Higher
+  // values result in more efficient hash tables with fewer
+  // lookups but take more time to build.
+  uint32_t max_search_depth = 100;
+  // In case of collision while inserting, the builder
+  // attempts to insert in the next cuckoo_block_size
+  // locations before skipping over to the next Cuckoo hash
+  // function. This makes lookups more cache friendly in case
+  // of collisions.
+  uint32_t cuckoo_block_size = 5;
+  // If this option is enabled, user key is treated as uint64_t and its value
+  // is used as hash value directly. This option changes builder's behavior.
+  // Reader ignore this option and behave according to what specified in table
+  // property.
+  bool identity_as_first_hash = false;
+  // If this option is set to true, module is used during hash calculation.
+  // This often yields better space efficiency at the cost of performance.
+  // If this option is set to false, # of entries in table is constrained to be
+  // power of two, and bit and is used to calculate hash, which is faster in
+  // general.
+  bool use_module_hash = true;
+};
+
+// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
+extern TableFactory* NewCuckooTableFactory(
+    const CuckooTableOptions& table_options = CuckooTableOptions());
 
 #endif  // ROCKSDB_LITE
 
@@ -168,14 +338,15 @@ class TableFactory {
   //     and cache the table object returned.
   // (1) SstFileReader (for SST Dump) opens the table and dump the table
+  //     contents using the iterator of the table.
-  // options and soptions are options. options is the general options.
+  // ImmutableCFOptions is a subset of Options that can not be altered.
+  // EnvOptions is a subset of Options that will be used by Env.
+  // Multiple configurations can be accessed from there, including but not
   // limited to block cache and key comparators.
   // file is a file handler to handle the file for the table
   // file_size is the physical file size of the file
   // table_reader is the output table reader
   virtual Status NewTableReader(
-      const Options& options, const EnvOptions& soptions,
+      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
       const InternalKeyComparator& internal_comparator,
       unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
       unique_ptr<TableReader>* table_reader) const = 0;
@@ -193,14 +364,44 @@ class TableFactory {
   // (4) When running Repairer, it creates a table builder to convert logs to
   //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
   //
-  // options is the general options. Multiple configured can be acceseed from
-  // there, including and not limited to compression options.
-  // file is a handle of a writable file. It is the caller's responsibility to
-  // keep the file open and close the file after closing the table builder.
-  // compression_type is the compression type to use in this table.
+  // ImmutableCFOptions is a subset of Options that can not be altered.
+  // Multiple configurations can be accessed from there, including but not limited
+  // to compression options. file is a handle of a writable file.
+  // It is the caller's responsibility to keep the file open and close the file
+  // after closing the table builder. compression_type is the compression type
+  // to use in this table.
   virtual TableBuilder* NewTableBuilder(
-      const Options& options, const InternalKeyComparator& internal_comparator,
-      WritableFile* file, CompressionType compression_type) const = 0;
+      const TableBuilderOptions& table_builder_options,
+      WritableFile* file) const = 0;
+
+  // Sanitizes the specified DB Options and ColumnFamilyOptions.
+  //
+  // If the function cannot find a way to sanitize the input DB Options,
+  // a non-ok Status will be returned.
+  virtual Status SanitizeOptions(
+      const DBOptions& db_opts,
+      const ColumnFamilyOptions& cf_opts) const = 0;
+
+  // Return a string that contains printable format of table configurations.
+  // RocksDB prints configurations at DB Open().
+  virtual std::string GetPrintableTableOptions() const = 0;
 };
 
+#ifndef ROCKSDB_LITE
+// Create a special table factory that can open either of the supported
+// table formats, based on settings inside the SST files. It should be used to
+// convert a DB from one table format to another.
+// @table_factory_to_write: the table factory used when writing to new files.
+// @block_based_table_factory:  block based table factory to use. If NULL, use
+//                              a default one.
+// @plain_table_factory: plain table factory to use. If NULL, use a default one.
+// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default one.
+extern TableFactory* NewAdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
+    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
+    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
+    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
+
+#endif  // ROCKSDB_LITE
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h
index aa8b8a0..8572021 100644
--- a/src/rocksdb/include/rocksdb/table_properties.h
+++ b/src/rocksdb/include/rocksdb/table_properties.h
@@ -6,6 +6,7 @@
 #include <string>
 #include <map>
 #include "rocksdb/status.h"
+#include "rocksdb/types.h"
 
 namespace rocksdb {
 
@@ -77,17 +78,44 @@ struct TablePropertiesNames {
 
 extern const std::string kPropertiesBlock;
 
+enum EntryType {
+  kEntryPut,
+  kEntryDelete,
+  kEntryMerge,
+  kEntryOther,
+};
+
 // `TablePropertiesCollector` provides the mechanism for users to collect
 // their own interested properties. This class is essentially a collection
-//  of callback functions that will be invoked during table building.
+// of callback functions that will be invoked during table building.
+// It is constructed with TablePropertiesCollectorFactory. The methods don't
+// need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially
 class TablePropertiesCollector {
  public:
   virtual ~TablePropertiesCollector() {}
 
+  // DEPRECATED: User-defined collectors should implement AddUserKey(), though
+  //             this old function still works for backward compatibility.
   // Add() will be called when a new key/value pair is inserted into the table.
-  // @params key    the original key that is inserted into the table.
-  // @params value  the original value that is inserted into the table.
-  virtual Status Add(const Slice& key, const Slice& value) = 0;
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  virtual Status Add(const Slice& key, const Slice& value) {
+    return Status::InvalidArgument(
+        "TablePropertiesCollector::Add() deprecated.");
+  }
+
+  // AddUserKey() will be called when a new key/value pair is inserted into the
+  // table.
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  // @params file_size  file size up to now
+  virtual Status AddUserKey(const Slice& key, const Slice& value,
+                            EntryType type, SequenceNumber seq,
+                            uint64_t file_size) {
+    // For backward compatibility.
+    return Add(key, value);
+  }
 
   // Finish() will be called when a table has already been built and is ready
   // for writing the properties block.
@@ -95,12 +123,24 @@ class TablePropertiesCollector {
   // `properties`.
   virtual Status Finish(UserCollectedProperties* properties) = 0;
 
-  // The name of the properties collector can be used for debugging purpose.
-  virtual const char* Name() const = 0;
-
   // Return the human-readable properties, where the key is property name and
   // the value is the human-readable form of value.
   virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+};
+
+// Constructs TablePropertiesCollector. Internals create a new
+// TablePropertiesCollector for each new table
+class TablePropertiesCollectorFactory {
+ public:
+  virtual ~TablePropertiesCollectorFactory() {}
+  // has to be thread-safe
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
 };
 
 // Extra properties
diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h
new file mode 100644
index 0000000..67346b8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/thread_status.h
@@ -0,0 +1,191 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file defines the structures for exposing run-time status of any
+// rocksdb-related thread.  Such run-time status can be obtained via
+// GetThreadList() API.
+//
+// Note that all thread-status features are still under-development, and
+// thus APIs and class definitions might be subject to change at this point.
+// Will remove this comment once the APIs have been finalized.
+
+#pragma once
+
+#include <cstddef>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#ifndef ROCKSDB_USING_THREAD_STATUS
+#define ROCKSDB_USING_THREAD_STATUS \
+    !defined(ROCKSDB_LITE) && \
+    !defined(NROCKSDB_THREAD_STATUS) && \
+    !defined(OS_MACOSX) && \
+    !defined(IOS_CROSS_COMPILE)
+#endif
+
+namespace rocksdb {
+
+// TODO(yhchiang): remove this function once c++14 is available
+//                 as std::max will be able to cover this.
+constexpr int constexpr_max(int a, int b) { return a > b ? a : b; }
+
+// A structure that describes the current status of a thread.
+// The status of active threads can be fetched using
+// rocksdb::GetThreadList().
+struct ThreadStatus {
+  // The type of a thread.
+  enum ThreadType : int {
+    HIGH_PRIORITY = 0,  // RocksDB BG thread in high-pri thread pool
+    LOW_PRIORITY,  // RocksDB BG thread in low-pri thread pool
+    USER,  // User thread (Non-RocksDB BG thread)
+    NUM_THREAD_TYPES
+  };
+
+  // The type used to refer to a thread operation.
+  // A thread operation describes high-level action of a thread.
+  // Examples include compaction and flush.
+  enum OperationType : int {
+    OP_UNKNOWN = 0,
+    OP_COMPACTION,
+    OP_FLUSH,
+    NUM_OP_TYPES
+  };
+
+  enum OperationStage : int {
+    STAGE_UNKNOWN = 0,
+    STAGE_FLUSH_RUN,
+    STAGE_FLUSH_WRITE_L0,
+    STAGE_COMPACTION_PREPARE,
+    STAGE_COMPACTION_RUN,
+    STAGE_COMPACTION_PROCESS_KV,
+    STAGE_COMPACTION_FILTER_V2,
+    STAGE_COMPACTION_INSTALL,
+    STAGE_COMPACTION_SYNC_FILE,
+    STAGE_PICK_MEMTABLES_TO_FLUSH,
+    STAGE_MEMTABLE_ROLLBACK,
+    STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+    NUM_OP_STAGES
+  };
+
+  enum CompactionPropertyType : int {
+    COMPACTION_JOB_ID = 0,
+    COMPACTION_INPUT_OUTPUT_LEVEL,
+    COMPACTION_PROP_FLAGS,
+    COMPACTION_TOTAL_INPUT_BYTES,
+    COMPACTION_BYTES_READ,
+    COMPACTION_BYTES_WRITTEN,
+    NUM_COMPACTION_PROPERTIES
+  };
+
+  enum FlushPropertyType : int {
+    FLUSH_JOB_ID = 0,
+    FLUSH_BYTES_MEMTABLES,
+    FLUSH_BYTES_WRITTEN,
+    NUM_FLUSH_PROPERTIES
+  };
+
+  // The maximum number of properties of an operation.
+  // This number should be set to the biggest NUM_XXX_PROPERTIES.
+  static const int kNumOperationProperties =
+      constexpr_max(NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES);
+
+  // The type used to refer to a thread state.
+  // A state describes lower-level action of a thread
+  // such as reading / writing a file or waiting for a mutex.
+  enum StateType : int {
+    STATE_UNKNOWN = 0,
+    STATE_MUTEX_WAIT = 1,
+    NUM_STATE_TYPES
+  };
+
+  ThreadStatus(const uint64_t _id,
+               const ThreadType _thread_type,
+               const std::string& _db_name,
+               const std::string& _cf_name,
+               const OperationType _operation_type,
+               const uint64_t _op_elapsed_micros,
+               const OperationStage _operation_stage,
+               const uint64_t _op_props[],
+               const StateType _state_type) :
+      thread_id(_id), thread_type(_thread_type),
+      db_name(_db_name),
+      cf_name(_cf_name),
+      operation_type(_operation_type),
+      op_elapsed_micros(_op_elapsed_micros),
+      operation_stage(_operation_stage),
+      state_type(_state_type) {
+    for (int i = 0; i < kNumOperationProperties; ++i) {
+      op_properties[i] = _op_props[i];
+    }
+  }
+
+  // An unique ID for the thread.
+  const uint64_t thread_id;
+
+  // The type of the thread, it could be HIGH_PRIORITY,
+  // LOW_PRIORITY, and USER
+  const ThreadType thread_type;
+
+  // The name of the DB instance where the thread is currently
+  // involved with.  It would be set to empty string if the thread
+  // does not involve in any DB operation.
+  const std::string db_name;
+
+  // The name of the column family that the thread is currently
+  // involved with.  It would be set to empty string if the thread
+  // is not involved with any column family.
+  const std::string cf_name;
+
+  // The operation (high-level action) that the current thread is involved.
+  const OperationType operation_type;
+
+  // The elapsed time in micros of the current thread operation.
+  const uint64_t op_elapsed_micros;
+
+  // An integer showing the current stage where the thread is involved
+  // in the current operation.
+  const OperationStage operation_stage;
+
+  // A list of properties that describe some details about the current
+  // operation.  Same field in op_properties[] might have different
+  // meanings for different operations.
+  uint64_t op_properties[kNumOperationProperties];
+
+  // The state (lower-level action) that the current thread is involved.
+  const StateType state_type;
+
+  // The followings are a set of utility functions for interpreting
+  // the information of ThreadStatus
+
+  static const std::string& GetThreadTypeName(ThreadType thread_type);
+
+  // Obtain the name of an operation given its type.
+  static const std::string& GetOperationName(OperationType op_type);
+
+  static const std::string MicrosToString(uint64_t op_elapsed_time);
+
+  // Obtain a human-readable string describing the specified operation stage.
+  static const std::string& GetOperationStageName(
+      OperationStage stage);
+
+  // Obtain the name of the "i"th operation property of the
+  // specified operation.
+  static const std::string& GetOperationPropertyName(
+      OperationType op_type, int i);
+
+  // Translate the "i"th property of the specified operation given
+  // a property value.
+  static std::map<std::string, uint64_t>
+      InterpretOperationProperties(
+          OperationType op_type, const uint64_t* op_properties);
+
+  // Obtain the name of a state given its type.
+  static const std::string& GetStateName(StateType state_type);
+};
+
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h
index eaf47e5..229e50b 100644
--- a/src/rocksdb/include/rocksdb/universal_compaction.h
+++ b/src/rocksdb/include/rocksdb/universal_compaction.h
@@ -8,6 +8,7 @@
 
 #include <stdint.h>
 #include <climits>
+#include <vector>
 
 namespace rocksdb {
 
@@ -61,6 +62,7 @@ class CompactionOptionsUniversal {
   // well as  the total size of C1...Ct as total_C, the compaction output file
   // will be compressed iff
   //   total_C / total_size < this percentage
+  // Default: -1
   int compression_size_percent;
 
   // The algorithm used to stop picking files into a single compaction run
@@ -68,14 +70,13 @@ class CompactionOptionsUniversal {
   CompactionStopStyle stop_style;
 
   // Default set of parameters
-  CompactionOptionsUniversal() :
-    size_ratio(1),
-    min_merge_width(2),
-    max_merge_width(UINT_MAX),
-    max_size_amplification_percent(200),
-    compression_size_percent(-1),
-    stop_style(kCompactionStopStyleTotalSize) {
-  }
+  CompactionOptionsUniversal()
+      : size_ratio(1),
+        min_merge_width(2),
+        max_merge_width(UINT_MAX),
+        max_size_amplification_percent(200),
+        compression_size_percent(-1),
+        stop_style(kCompactionStopStyleTotalSize) {}
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/utilities/backupable_db.h b/src/rocksdb/include/rocksdb/utilities/backupable_db.h
new file mode 100644
index 0000000..956ab3d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/backupable_db.h
@@ -0,0 +1,316 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <string>
+#include <map>
+#include <vector>
+
+#include "rocksdb/utilities/stackable_db.h"
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+struct BackupableDBOptions {
+  // Where to keep the backup files. Has to be different than dbname_
+  // Best to set this to dbname_ + "/backups"
+  // Required
+  std::string backup_dir;
+
+  // Backup Env object. It will be used for backup file I/O. If it's
+  // nullptr, backups will be written out using DBs Env. If it's
+  // non-nullptr, backup's I/O will be performed using this object.
+  // If you want to have backups on HDFS, use HDFS Env here!
+  // Default: nullptr
+  Env* backup_env;
+
+  // If share_table_files == true, backup will assume that table files with
+  // same name have the same contents. This enables incremental backups and
+  // avoids unnecessary data copies.
+  // If share_table_files == false, each backup will be on its own and will
+  // not share any data with other backups.
+  // default: true
+  bool share_table_files;
+
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  // If sync == true, we can guarantee you'll get consistent backup even
+  // on a machine crash/reboot. Backup process is slower with sync enabled.
+  // If sync == false, we don't guarantee anything on machine reboot. However,
+  // chances are some of the backups are consistent.
+  // Default: true
+  bool sync;
+
+  // If true, it will delete whatever backups there are already
+  // Default: false
+  bool destroy_old_data;
+
+  // If false, we won't backup log files. This option can be useful for backing
+  // up in-memory databases where log file are persisted, but table files are in
+  // memory.
+  // Default: true
+  bool backup_log_files;
+
+  // Max bytes that can be transferred in a second during backup.
+  // If 0, go as fast as you can
+  // Default: 0
+  uint64_t backup_rate_limit;
+
+  // Max bytes that can be transferred in a second during restore.
+  // If 0, go as fast as you can
+  // Default: 0
+  uint64_t restore_rate_limit;
+
+  // Only used if share_table_files is set to true. If true, will consider that
+  // backups can come from different databases, hence a sst is not uniquely
+  // identified by its name, but by the triple (file name, crc32, file length)
+  // Default: false
+  // Note: this is an experimental option, and you'll need to set it manually
+  // *turn it on only if you know what you're doing*
+  bool share_files_with_checksum;
+
+  void Dump(Logger* logger) const;
+
+  explicit BackupableDBOptions(const std::string& _backup_dir,
+                               Env* _backup_env = nullptr,
+                               bool _share_table_files = true,
+                               Logger* _info_log = nullptr, bool _sync = true,
+                               bool _destroy_old_data = false,
+                               bool _backup_log_files = true,
+                               uint64_t _backup_rate_limit = 0,
+                               uint64_t _restore_rate_limit = 0)
+      : backup_dir(_backup_dir),
+        backup_env(_backup_env),
+        share_table_files(_share_table_files),
+        info_log(_info_log),
+        sync(_sync),
+        destroy_old_data(_destroy_old_data),
+        backup_log_files(_backup_log_files),
+        backup_rate_limit(_backup_rate_limit),
+        restore_rate_limit(_restore_rate_limit),
+        share_files_with_checksum(false) {
+    assert(share_table_files || !share_files_with_checksum);
+  }
+};
+
+struct RestoreOptions {
+  // If true, restore won't overwrite the existing log files in wal_dir. It will
+  // also move all log files from archive directory to wal_dir. Use this option
+  // in combination with BackupableDBOptions::backup_log_files = false for
+  // persisting in-memory databases.
+  // Default: false
+  bool keep_log_files;
+
+  explicit RestoreOptions(bool _keep_log_files = false)
+      : keep_log_files(_keep_log_files) {}
+};
+
+typedef uint32_t BackupID;
+
+struct BackupInfo {
+  BackupID backup_id;
+  int64_t timestamp;
+  uint64_t size;
+
+  uint32_t number_files;
+
+  BackupInfo() {}
+
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
+             uint32_t _number_files)
+      : backup_id(_backup_id), timestamp(_timestamp), size(_size),
+        number_files(_number_files) {}
+};
+
+class BackupStatistics {
+ public:
+  BackupStatistics() {
+    number_success_backup = 0;
+    number_fail_backup = 0;
+  }
+
+  BackupStatistics(uint32_t _number_success_backup,
+                   uint32_t _number_fail_backup)
+      : number_success_backup(_number_success_backup),
+        number_fail_backup(_number_fail_backup) {}
+
+  ~BackupStatistics() {}
+
+  void IncrementNumberSuccessBackup();
+  void IncrementNumberFailBackup();
+
+  uint32_t GetNumberSuccessBackup() const;
+  uint32_t GetNumberFailBackup() const;
+
+  std::string ToString() const;
+
+ private:
+  uint32_t number_success_backup;
+  uint32_t number_fail_backup;
+};
+
+class BackupEngineReadOnly {
+ public:
+  virtual ~BackupEngineReadOnly() {}
+
+  static BackupEngineReadOnly* NewReadOnlyBackupEngine(
+      Env* db_env, const BackupableDBOptions& options)
+      __attribute__((deprecated("Please use Open() instead")));
+
+  static Status Open(Env* db_env, const BackupableDBOptions& options,
+                     BackupEngineReadOnly** backup_engine_ptr);
+
+  // You can GetBackupInfo safely, even with other BackupEngine performing
+  // backups on the same directory
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+  virtual void GetCorruptedBackups(
+      std::vector<BackupID>* corrupt_backup_ids) = 0;
+
+  // Restoring DB from backup is NOT safe when there is another BackupEngine
+  // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's
+  // responsibility to synchronize the operation, i.e. don't delete the backup
+  // when you're restoring from it
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+};
+
+// Please see the documentation in BackupableDB and RestoreBackupableDB
+class BackupEngine {
+ public:
+  virtual ~BackupEngine() {}
+
+  static BackupEngine* NewBackupEngine(Env* db_env,
+                                       const BackupableDBOptions& options)
+    __attribute__((deprecated("Please use Open() instead")));
+
+  static Status Open(Env* db_env,
+                     const BackupableDBOptions& options,
+                     BackupEngine** backup_engine_ptr);
+
+  virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0;
+  virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+  virtual Status DeleteBackup(BackupID backup_id) = 0;
+  virtual void StopBackup() = 0;
+
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+  virtual void GetCorruptedBackups(
+      std::vector<BackupID>* corrupt_backup_ids) = 0;
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+
+  virtual Status GarbageCollect() = 0;
+};
+
+// Stack your DB with BackupableDB to be able to backup the DB
+class BackupableDB : public StackableDB {
+ public:
+  // BackupableDBOptions have to be the same as the ones used in a previous
+  // incarnation of the DB
+  //
+  // BackupableDB owns the pointer `DB* db` now. You should not delete it or
+  // use it after the invocation of BackupableDB
+  BackupableDB(DB* db, const BackupableDBOptions& options);
+  virtual ~BackupableDB();
+
+  // Captures the state of the database in the latest backup
+  // NOT a thread safe call
+  Status CreateNewBackup(bool flush_before_backup = false);
+  // Returns info about backups in backup_info
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  // Returns info about corrupt backups in corrupt_backups
+  void GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids);
+  // deletes old backups, keeping latest num_backups_to_keep alive
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  // deletes a specific backup
+  Status DeleteBackup(BackupID backup_id);
+  // Call this from another thread if you want to stop the backup
+  // that is currently happening. It will return immediately, will
+  // not wait for the backup to stop.
+  // The backup will stop ASAP and the call to CreateNewBackup will
+  // return Status::Incomplete(). It will not clean up after itself, but
+  // the state will remain consistent. The state will be cleaned up
+  // next time you create BackupableDB or RestoreBackupableDB.
+  void StopBackup();
+
+  // Will delete all the files we don't need anymore
+  // It will do the full scan of the files/ directory and delete all the
+  // files that are not referenced.
+  Status GarbageCollect();
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+// Use this class to access information about backups and restore from them
+class RestoreBackupableDB {
+ public:
+  RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
+  ~RestoreBackupableDB();
+
+  // Returns info about backups in backup_info
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  // Returns info about corrupt backups in corrupt_backups
+  void GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids);
+
+  // restore from backup with backup_id
+  // IMPORTANT -- if options_.share_table_files == true and you restore DB
+  // from some backup that is not the latest, and you start creating new
+  // backups from the new DB, they will probably fail
+  //
+  // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+  // If you add new data to the DB and try creating a new backup now, the
+  // database will diverge from backups 4 and 5 and the new backup will fail.
+  // If you want to create new backup, you will first have to delete backups 4
+  // and 5.
+  Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
+                             const std::string& wal_dir,
+                             const RestoreOptions& restore_options =
+                                 RestoreOptions());
+
+  // restore from the latest backup
+  Status RestoreDBFromLatestBackup(const std::string& db_dir,
+                                   const std::string& wal_dir,
+                                   const RestoreOptions& restore_options =
+                                       RestoreOptions());
+  // deletes old backups, keeping latest num_backups_to_keep alive
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  // deletes a specific backup
+  Status DeleteBackup(BackupID backup_id);
+
+  // Will delete all the files we don't need anymore
+  // It will do the full scan of the files/ directory and delete all the
+  // files that are not referenced.
+  Status GarbageCollect();
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
new file mode 100644
index 0000000..b60f4eb
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// A checkpoint is an openable snapshot of a database at a point in time.
+
+#pragma once
+
+#include <string>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class DB;
+
+class Checkpoint {
+ public:
+  // Creates a Checkpoint object to be used for creating openable snapshots
+  static Status Create(DB* db, Checkpoint** checkpoint_ptr);
+
+  // Builds an openable snapshot of RocksDB on the same disk, which
+  // accepts an output directory on the same disk, and under the directory
+  // (1) hard-linked SST files pointing to existing live SST files
+  // SST files will be copied if output directory is on a different filesystem
+  // (2) copied manifest files and other files
+  // The directory should not already exist and will be created by this API.
+  // The directory will be an absolute path
+  virtual Status CreateCheckpoint(const std::string& checkpoint_dir);
+
+  virtual ~Checkpoint() {}
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h
new file mode 100644
index 0000000..1c1057d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/convenience.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <unordered_map>
+#include <string>
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+// Take a map of option name and option value, apply them into the
+// base_options, and return the new options as a result
+Status GetColumnFamilyOptionsFromMap(
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromMap(
+    const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options);
+
+Status GetBlockBasedTableOptionsFromMap(
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options);
+
+// Take a string representation of option names and values, apply them into the
+// base_options, and return the new options as a result. The string has the
+// following format:
+//   "write_buffer_size=1024;max_write_buffer_number=2"
+// Nested options config is also possible. For example, you can define
+// BlockBasedTableOptions as part of the string for block-based table factory:
+//   "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
+//   "max_write_buffer_num=2"
+Status GetColumnFamilyOptionsFromString(
+    const ColumnFamilyOptions& base_options,
+    const std::string& opts_str,
+    ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromString(
+    const DBOptions& base_options,
+    const std::string& opts_str,
+    DBOptions* new_options);
+
+Status GetBlockBasedTableOptionsFromString(
+    const BlockBasedTableOptions& table_options,
+    const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options);
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+
+// Request stopping background work; if wait is true, wait until it is done
+void CancelAllBackgroundWork(DB* db, bool wait = false);
+#endif  // ROCKSDB_LITE
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/utilities/db_ttl.h b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
new file mode 100644
index 0000000..4534e1f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
@@ -0,0 +1,68 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+//  meant to be removed from the db in a non-strict 'ttl' amount of time.
+//  Therefore, this guarantees that key-values inserted will remain in the
+//  db for >= ttl amount of time and the db will make efforts to remove the
+//  key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
+// Get/Iterator may return expired entries(compaction not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+//          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+//  triggered(neither manual nor automatic), so no expired entries removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will get
+//  corrupt values(timestamp suffixed) and no ttl effect will be there
+//  during the second Open, so use this API consistently to open the db
+// Be careful when passing ttl with a small positive value because the
+//  whole database may be deleted in a small amount of time
+
+class DBWithTTL : public StackableDB {
+ public:
+  // Create a column family whose entries expire 'ttl' seconds after
+  // insertion (expiry is best-effort; see BEHAVIOUR notes above).
+  virtual Status CreateColumnFamilyWithTtl(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      ColumnFamilyHandle** handle, int ttl) = 0;
+
+  // Open (or create, per 'options') the db at 'dbname', returned via *dbptr.
+  // A non-positive ttl behaves like TTL = infinity (see CONSTRAINTS above).
+  static Status Open(const Options& options, const std::string& dbname,
+                     DBWithTTL** dbptr, int32_t ttl = 0,
+                     bool read_only = false);
+
+  // Column-family-aware Open; 'ttls' presumably supplies one TTL per entry
+  // of 'column_families', in the same order -- confirm against the
+  // implementation.
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     DBWithTTL** dbptr, std::vector<int32_t> ttls,
+                     bool read_only = false);
+
+ protected:
+  // Takes ownership of 'db' (StackableDB deletes it on destruction).
+  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/document_db.h b/src/rocksdb/include/rocksdb/utilities/document_db.h
new file mode 100644
index 0000000..7fde5ec
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/document_db.h
@@ -0,0 +1,149 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/json_document.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// IMPORTANT: DocumentDB is a work in progress. It is unstable and we might
+// change the API without warning. Talk to RocksDB team before using this in
+// production ;)
+
+// DocumentDB is a layer on top of RocksDB that provides a very simple JSON API.
+// When creating a DB, you specify a list of indexes you want to keep on your
+// data. You can insert a JSON document to the DB, which is automatically
+// indexed. Every document added to the DB needs to have an "_id" field, which
+// is automatically indexed and is a unique primary key. All other indexes are
+// non-unique.
+
+// NOTE: field names in the JSON are NOT allowed to start with '$' or
+// contain '.'. We do not currently enforce this rule, but documents that
+// violate it may behave badly.
+
+// Cursor is what you get as a result of executing a query. To get all
+// results from a query, call Next() on a Cursor while Valid() returns true.
+class Cursor {
+ public:
+  Cursor() = default;
+  virtual ~Cursor() {}
+
+  // True while the cursor points at a result document.
+  virtual bool Valid() const = 0;
+  // Advance to the next result document.
+  virtual void Next() = 0;
+  // Lifecycle of the returned JSONDocument is until the next Next() call
+  virtual const JSONDocument& document() const = 0;
+  // Error state of the cursor/query, if any.
+  virtual Status status() const = 0;
+
+ private:
+  // No copying allowed
+  Cursor(const Cursor&);
+  void operator=(const Cursor&);
+};
+
+struct DocumentDBOptions {
+  // NOTE(review): what these threads are used for is not documented here
+  // (presumably background flushes/compactions) -- confirm.
+  int background_threads = 4;
+  uint64_t memtable_size = 128 * 1024 * 1024;    // 128 MB
+  uint64_t cache_size = 1 * 1024 * 1024 * 1024;  // 1 GB
+};
+
+// TODO(icanadi) Add `JSONDocument* info` parameter to all calls that can be
+// used by the caller to get more information about the call execution (number
+// of dropped records, number of updated records, etc.)
+class DocumentDB : public StackableDB {
+ public:
+  struct IndexDescriptor {
+    // Currently, you can only define an index on a single field. To specify an
+    // index on a field X, set index description to JSON "{X: 1}"
+    // Currently the value needs to be 1, which means ascending.
+    // In the future, we plan to also support indexes on multiple keys, where
+    // you could mix ascending sorting (1) with descending sorting indexes (-1)
+    // NOTE(review): ownership of 'description' is not documented here --
+    // confirm whether the caller or DocumentDB must free it.
+    JSONDocument* description;
+    std::string name;
+  };
+
+  // Open DocumentDB with specified indexes. The list of indexes has to be
+  // complete, i.e. include all indexes present in the DB, except the primary
+  // key index.
+  // Otherwise, Open() will return an error
+  static Status Open(const DocumentDBOptions& options, const std::string& name,
+                     const std::vector<IndexDescriptor>& indexes,
+                     DocumentDB** db, bool read_only = false);
+
+  // Takes ownership of 'db' (StackableDB deletes it on destruction).
+  explicit DocumentDB(DB* db) : StackableDB(db) {}
+
+  // Create a new index. It will stop all writes for the duration of the call.
+  // All current documents in the DB are scanned and corresponding index entries
+  // are created
+  virtual Status CreateIndex(const WriteOptions& write_options,
+                             const IndexDescriptor& index) = 0;
+
+  // Drop an index. Client is responsible to make sure that index is not being
+  // used by currently executing queries
+  virtual Status DropIndex(const std::string& name) = 0;
+
+  // Insert a document to the DB. The document needs to have a primary key "_id"
+  // which can either be a string or an integer. Otherwise the write will fail
+  // with InvalidArgument.
+  virtual Status Insert(const WriteOptions& options,
+                        const JSONDocument& document) = 0;
+
+  // Deletes all documents matching a filter atomically
+  virtual Status Remove(const ReadOptions& read_options,
+                        const WriteOptions& write_options,
+                        const JSONDocument& query) = 0;
+
+  // Does this sequence of operations:
+  // 1. Find all documents matching a filter
+  // 2. For all documents, atomically:
+  // 2.1. apply the update operators
+  // 2.2. update the secondary indexes
+  //
+  // Currently only $set update operator is supported.
+  // Syntax is: {$set: {key1: value1, key2: value2, etc...}}
+  // This operator will change a document's key1 field to value1, key2 to
+  // value2, etc. New values will be set even if a document didn't have an entry
+  // for the specified key.
+  //
+  // You can not change a primary key of a document.
+  //
+  // Update example: Update({id: {$gt: 5}, $index: id}, {$set: {enabled: true}})
+  virtual Status Update(const ReadOptions& read_options,
+                        const WriteOptions& write_options,
+                        const JSONDocument& filter,
+                        const JSONDocument& updates) = 0;
+
+  // query has to be an array in which every element is an operator. Currently
+  // only $filter operator is supported. Syntax of $filter operator is:
+  // {$filter: {key1: condition1, key2: condition2, etc.}} where conditions can
+  // be either:
+  // 1) a single value in which case the condition is equality condition, or
+  // 2) a defined operators, like {$gt: 4}, which will match all documents that
+  // have key greater than 4.
+  //
+  // Supported operators are:
+  // 1) $gt -- greater than
+  // 2) $gte -- greater than or equal
+  // 3) $lt -- less than
+  // 4) $lte -- less than or equal
+  // If you want the filter to use an index, you need to specify it like this:
+  // {$filter: {...(conditions)..., $index: index_name}}
+  //
+  // Example query:
+  // * [{$filter: {name: John, age: {$gte: 18}, $index: age}}]
+  // will return all Johns whose age is greater or equal to 18 and it will use
+  // index "age" to satisfy the query.
+  // NOTE(review): the returned Cursor appears to be caller-owned -- confirm,
+  // and delete it when done.
+  virtual Cursor* Query(const ReadOptions& read_options,
+                        const JSONDocument& query) = 0;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/flashcache.h b/src/rocksdb/include/rocksdb/utilities/flashcache.h
new file mode 100644
index 0000000..7bb7609
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/flashcache.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+// This API is experimental. We will mark it stable once we run it in production
+// for a while.
+// NewFlashcacheAwareEnv() creates an Env that blacklists all background
+// threads (used for flush and compaction) from using flashcache to cache their
+// reads. Reads from compaction thread don't need to be cached because they are
+// going to be soon made obsolete (due to nature of compaction)
+// Usually you would pass Env::Default() as base.
+// cachedev_fd is a file descriptor of the flashcache device. Caller has to
+// open flashcache device before calling this API.
+extern std::unique_ptr<Env> NewFlashcacheAwareEnv(
+    Env* base, const int cachedev_fd);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/utilities/geo_db.h b/src/rocksdb/include/rocksdb/utilities/geo_db.h
new file mode 100644
index 0000000..41c0f14
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/geo_db.h
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+//
+// Configurable options needed for setting up a Geo database
+//
+struct GeoDBOptions {
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  // NOTE(review): info_log is presumably not owned by GeoDB; the caller must
+  // keep it alive for the DB's lifetime -- confirm.
+  Logger* info_log;
+
+  explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { }
+};
+
+//
+// A position in the earth's geoid
+//
+class GeoPosition {
+ public:
+  // Coordinates on the geoid; units are not stated in this header
+  // (presumably degrees -- confirm against the implementation).
+  double latitude;
+  double longitude;
+
+  explicit GeoPosition(double la = 0, double lo = 0) :
+    latitude(la), longitude(lo) {
+  }
+};
+
+//
+// Description of an object on the Geoid. It is located by a GPS location,
+// and is identified by the id. The value associated with this object is
+// an opaque string 'value'. Different objects identified by unique id's
+// can have the same gps-location associated with them.
+//
+class GeoObject {
+ public:
+  GeoPosition position;
+  std::string id;     // unique identifier (see class comment above)
+  std::string value;  // opaque payload associated with the object
+
+  GeoObject() {}
+
+  // Construct from a position, a unique id and an opaque value.
+  GeoObject(const GeoPosition& pos, const std::string& i,
+            const std::string& val) :
+    position(pos), id(i), value(val) {
+  }
+};
+
+//
+// Stack your DB with GeoDB to be able to get geo-spatial support
+//
+class GeoDB : public StackableDB {
+ public:
+  // GeoDBOptions have to be the same as the ones used in a previous
+  // incarnation of the DB
+  //
+  // GeoDB owns the pointer `DB* db` now. You should not delete it or
+  // use it after the invocation of GeoDB
+  GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
+  virtual ~GeoDB() {}
+
+  // Insert a new object into the location database. The object is
+  // uniquely identified by the id. If an object with the same id already
+  // exists in the db, then the old one is overwritten by the new
+  // object being inserted here.
+  virtual Status Insert(const GeoObject& object) = 0;
+
+  // Retrieve the value of the object located at the specified GPS
+  // location and is identified by the 'id'.
+  virtual Status GetByPosition(const GeoPosition& pos,
+                               const Slice& id, std::string* value) = 0;
+
+  // Retrieve the value of the object identified by the 'id'. This method
+  // could be potentially slower than GetByPosition
+  virtual Status GetById(const Slice& id, GeoObject*  object) = 0;
+
+  // Delete the specified object
+  virtual Status Remove(const Slice& id) = 0;
+
+  // Returns a list of all items within a circular radius from the
+  // specified gps location. If 'number_of_values' is specified,
+  // then this call returns at most that many number of objects.
+  // The radius is specified in 'meters'.
+  // NOTE(review): INT_MAX requires <climits>/<limits.h>, which this header
+  // does not include (it relies on a transitive include) -- confirm.
+  virtual Status SearchRadial(const GeoPosition& pos,
+                              double radius,
+                              std::vector<GeoObject>* values,
+                              int number_of_values = INT_MAX) = 0;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/json_document.h b/src/rocksdb/include/rocksdb/utilities/json_document.h
new file mode 100644
index 0000000..a5e3ab2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/json_document.h
@@ -0,0 +1,195 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <deque>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+// We use JSONDocument for DocumentDB API
+// Implementation inspired by folly::dynamic, rapidjson and fbson
+
+namespace fbson {
+  class FbsonValue;
+  class ObjectVal;
+  template <typename T>
+  class FbsonWriterT;
+  class FbsonOutStream;
+  typedef FbsonWriterT<FbsonOutStream> FbsonWriter;
+}  // namespace fbson
+
+namespace rocksdb {
+
+// NOTE: none of this is thread-safe
+class JSONDocument {
+ public:
+  // return nullptr on parse failure
+  static JSONDocument* ParseJSON(const char* json);
+
+  enum Type {
+    kNull,
+    kArray,
+    kBool,
+    kDouble,
+    kInt64,
+    kObject,
+    kString,
+  };
+
+  /* implicit */ JSONDocument();  // null
+  /* implicit */ JSONDocument(bool b);
+  /* implicit */ JSONDocument(double d);
+  /* implicit */ JSONDocument(int8_t i);
+  /* implicit */ JSONDocument(int16_t i);
+  /* implicit */ JSONDocument(int32_t i);
+  /* implicit */ JSONDocument(int64_t i);
+  /* implicit */ JSONDocument(const std::string& s);
+  /* implicit */ JSONDocument(const char* s);
+  // constructs JSONDocument of specific type with default value
+  explicit JSONDocument(Type _type);
+
+  JSONDocument(const JSONDocument& json_document);
+
+  JSONDocument(JSONDocument&& json_document);
+
+  Type type() const;
+
+  // REQUIRES: IsObject()
+  bool Contains(const std::string& key) const;
+  // REQUIRES: IsObject()
+  // Returns non-owner object
+  JSONDocument operator[](const std::string& key) const;
+
+  // REQUIRES: IsArray() == true || IsObject() == true
+  size_t Count() const;
+
+  // REQUIRES: IsArray()
+  // Returns non-owner object
+  JSONDocument operator[](size_t i) const;
+
+  // Assignment takes its argument by value (copy-and-swap style, presumably;
+  // a single overload serving both copy- and move-assignment).
+  JSONDocument& operator=(JSONDocument jsonDocument);
+
+  bool IsNull() const;
+  bool IsArray() const;
+  bool IsBool() const;
+  bool IsDouble() const;
+  bool IsInt64() const;
+  bool IsObject() const;
+  bool IsString() const;
+
+  // REQUIRES: IsBool() == true
+  bool GetBool() const;
+  // REQUIRES: IsDouble() == true
+  double GetDouble() const;
+  // REQUIRES: IsInt64() == true
+  int64_t GetInt64() const;
+  // REQUIRES: IsString() == true
+  std::string GetString() const;
+
+  bool operator==(const JSONDocument& rhs) const;
+
+  bool operator!=(const JSONDocument& rhs) const;
+
+  JSONDocument Copy() const;
+
+  bool IsOwner() const;
+
+  std::string DebugString() const;
+
+ private:
+  class ItemsIteratorGenerator;
+
+ public:
+  // REQUIRES: IsObject()
+  ItemsIteratorGenerator Items() const;
+
+  // appends serialized object to dst
+  void Serialize(std::string* dst) const;
+  // returns nullptr if Slice doesn't represent valid serialized JSONDocument
+  static JSONDocument* Deserialize(const Slice& src);
+
+ private:
+  friend class JSONDocumentBuilder;
+
+  JSONDocument(fbson::FbsonValue* val, bool makeCopy);
+
+  void InitFromValue(const fbson::FbsonValue* val);
+
+  // iteration on objects
+  class const_item_iterator {
+   private:
+    class Impl;
+   public:
+    typedef std::pair<std::string, JSONDocument> value_type;
+    explicit const_item_iterator(Impl* impl);
+    const_item_iterator(const_item_iterator&&);
+    const_item_iterator& operator++();
+    bool operator!=(const const_item_iterator& other);
+    value_type operator*();
+    ~const_item_iterator();
+   private:
+    friend class ItemsIteratorGenerator;
+    std::unique_ptr<Impl> it_;
+  };
+
+  class ItemsIteratorGenerator {
+   public:
+    explicit ItemsIteratorGenerator(const fbson::ObjectVal& object);
+    const_item_iterator begin() const;
+
+    const_item_iterator end() const;
+
+   private:
+    const fbson::ObjectVal& object_;
+  };
+
+  // NOTE(review): data_ appears to hold owned serialized storage while value_
+  // points into it (or into external storage for non-owner views -- see
+  // IsOwner()); confirm against the implementation.
+  std::unique_ptr<char[]> data_;
+  mutable fbson::FbsonValue* value_;
+
+  // Our serialization format's first byte specifies the encoding version. That
+  // way, we can easily change our format while providing backwards
+  // compatibility. This constant specifies the current version of the
+  // serialization format
+  static const char kSerializationFormatVersion;
+};
+
+class JSONDocumentBuilder {
+ public:
+  JSONDocumentBuilder();
+
+  // NOTE(review): presumably directs serialized output into the supplied
+  // out-stream instead of an internal one -- confirm.
+  explicit JSONDocumentBuilder(fbson::FbsonOutStream* out);
+
+  void Reset();
+
+  // The Write* methods return a bool (presumably false on failure --
+  // confirm against the implementation).
+  bool WriteStartArray();
+
+  bool WriteEndArray();
+
+  bool WriteStartObject();
+
+  bool WriteEndObject();
+
+  bool WriteKeyValue(const std::string& key, const JSONDocument& value);
+
+  bool WriteJSONDocument(const JSONDocument& value);
+
+  JSONDocument GetJSONDocument();
+
+  ~JSONDocumentBuilder();
+
+ private:
+  std::unique_ptr<fbson::FbsonWriter> writer_;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/leveldb_options.h b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
new file mode 100644
index 0000000..8e2c3a1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
@@ -0,0 +1,144 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+
+namespace rocksdb {
+
+class Cache;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+struct Options;
+class Snapshot;
+
+enum CompressionType : char;
+
+// Options to control the behavior of a database (passed to
+// DB::Open). A LevelDBOptions object can be initialized as though
+// it were a LevelDB Options object, and then it can be converted into
+// a RocksDB Options object.
+struct LevelDBOptions {
+  // The defaults documented below are applied by the LevelDBOptions()
+  // constructor declared at the end of this struct.
+
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // Default: false
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-NULL, or to a file stored
+  // in the same directory as the DB contents if info_log is NULL.
+  // Default: NULL
+  Logger* info_log;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to two write buffers may be held in memory at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set (budget
+  // one open file per 2MB of working set).
+  //
+  // Default: 1000
+  int max_open_files;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-NULL, use the specified cache for blocks.
+  // If NULL, leveldb will automatically create and use an 8MB internal cache.
+  // Default: NULL
+  Cache* block_cache;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  size_t block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // If non-NULL, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: NULL
+  const FilterPolicy* filter_policy;
+
+  // Create a LevelDBOptions object with default values for all fields.
+  LevelDBOptions();
+};
+
+// Converts a LevelDBOptions object into a RocksDB Options object.
+Options ConvertOptions(const LevelDBOptions& leveldb_options);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/utilities/spatial_db.h b/src/rocksdb/include/rocksdb/utilities/spatial_db.h
new file mode 100644
index 0000000..1beb5c7
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/spatial_db.h
@@ -0,0 +1,238 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace rocksdb {
+namespace spatial {
+
+// NOTE: SpatialDB is experimental and we might change its API without warning.
+// Please talk to us before developing against SpatialDB API.
+//
+// SpatialDB is a support for spatial indexes built on top of RocksDB.
+// When creating a new SpatialDB, clients specify a list of spatial indexes to
+// build on their data. Each spatial index is defined by the area and
+// granularity. If you're storing map data, different spatial index
+// granularities can be used for different zoom levels.
+//
+// Each element inserted into SpatialDB has:
+// * a bounding box, which determines how will the element be indexed
+// * string blob, which will usually be WKB representation of the polygon
+// (http://en.wikipedia.org/wiki/Well-known_text)
+// * feature set, which is a map of key-value pairs, where value can be null,
+// int, double, bool, string
+// * a list of indexes to insert the element in
+//
+// Each query is executed on a single spatial index. Query guarantees that it
+// will return all elements intersecting the specified bounding box, but it
+// might also return some extra non-intersecting elements.
+
+// Variant is a class that can be many things: null, bool, int, double or string
+// It is used to store different value types in FeatureSet (see below)
+struct Variant {
+  // Don't change the values here, they are persisted on disk
+  enum Type {
+    kNull = 0x0,
+    kBool = 0x1,
+    kInt = 0x2,
+    kDouble = 0x3,
+    kString = 0x4,
+  };
+
+  Variant() : type_(kNull) {}
+  /* implicit */ Variant(bool b) : type_(kBool) { data_.b = b; }
+  /* implicit */ Variant(uint64_t i) : type_(kInt) { data_.i = i; }
+  /* implicit */ Variant(double d) : type_(kDouble) { data_.d = d; }
+  /* implicit */ Variant(const std::string& s) : type_(kString) {
+    // Placement-new constructs the string member of the union in place;
+    // the destructor below tears it down for the kString case.
+    new (&data_.s) std::string(s);
+  }
+
+  Variant(const Variant& v);
+  // NOTE(review): a copy constructor is declared but no copy-assignment
+  // operator; the implicitly-generated assignment is likely deleted because
+  // of the union's std::string member (rule of three) -- confirm upstream.
+
+  ~Variant() {
+    if (type_ == kString) {
+      using std::string;
+      (&data_.s)->~string();
+    }
+  }
+
+  Type type() const { return type_; }
+  // REQUIRES: type() matches the accessor being called (the union holds only
+  // the active member).
+  bool get_bool() const { return data_.b; }
+  uint64_t get_int() const { return data_.i; }
+  double get_double() const { return data_.d; }
+  const std::string& get_string() const { return data_.s; }
+
+  // NOTE(review): these comparison operators are non-const and therefore
+  // cannot be called on const Variant objects.
+  bool operator==(const Variant& other);
+  bool operator!=(const Variant& other);
+
+ private:
+  Type type_;
+  union Data {
+    Data() {}
+    ~Data() {}
+    bool b;
+    uint64_t i;
+    double d;
+    std::string s;
+  } data_;
+};
+
+// FeatureSet is a map of key-value pairs. One feature set is associated with
+// each element in SpatialDB. It can be used to add rich data about the element.
+class FeatureSet {
+ private:
+  // NOTE(review): std::unordered_map is used but this header does not
+  // include <unordered_map> (it relies on a transitive include) -- confirm.
+  typedef std::unordered_map<std::string, Variant> map;
+
+ public:
+  class iterator {
+   public:
+    /* implicit */ iterator(const map::const_iterator itr) : itr_(itr) {}
+    iterator& operator++() {
+      ++itr_;
+      return *this;
+    }
+    bool operator!=(const iterator& other) { return itr_ != other.itr_; }
+    bool operator==(const iterator& other) { return itr_ == other.itr_; }
+    map::value_type operator*() { return *itr_; }
+
+   private:
+    map::const_iterator itr_;
+  };
+  FeatureSet() = default;
+
+  // Returns a FeatureSet* (presumably 'this', enabling call chaining --
+  // confirm against the implementation).
+  FeatureSet* Set(const std::string& key, const Variant& value);
+  bool Contains(const std::string& key) const;
+  // REQUIRES: Contains(key)
+  const Variant& Get(const std::string& key) const;
+  iterator Find(const std::string& key) const;
+
+  iterator begin() const { return map_.begin(); }
+  iterator end() const { return map_.end(); }
+
+  void Clear();
+  size_t Size() const { return map_.size(); }
+
+  void Serialize(std::string* output) const;
+  // REQUIRED: empty FeatureSet
+  bool Deserialize(const Slice& input);
+
+  std::string DebugString() const;
+
+ private:
+  map map_;
+};
+
+// BoundingBox is a helper structure for defining rectangles representing
+// bounding boxes of spatial elements.
+template <typename T>
+struct BoundingBox {
+  T min_x, min_y, max_x, max_y;
+  // NOTE(review): the default constructor leaves all coordinates
+  // uninitialized.
+  BoundingBox() = default;
+  BoundingBox(T _min_x, T _min_y, T _max_x, T _max_y)
+      : min_x(_min_x), min_y(_min_y), max_x(_max_x), max_y(_max_y) {}
+
+  // Closed-interval overlap test: boxes that merely touch at an edge are
+  // reported as intersecting (all comparisons are strict).
+  bool Intersects(const BoundingBox<T>& a) const {
+    return !(min_x > a.max_x || min_y > a.max_y || a.min_x > max_x ||
+             a.min_y > max_y);
+  }
+};
+
+struct SpatialDBOptions {
+  uint64_t cache_size = 1 * 1024 * 1024 * 1024LL;  // 1GB
+  // NOTE(review): what these threads are used for is not documented here
+  // (presumably background flushes/compactions) -- confirm.
+  int num_threads = 16;
+  // See Compact() below: bulk-load mode benefits from a compaction pass
+  // after inserting.
+  bool bulk_load = true;
+};
+
+// Cursor is used to return data from the query to the client. To get all the
+// data from the query, just call Next() while Valid() is true
+class Cursor {
+ public:
+  Cursor() = default;
+  virtual ~Cursor() {}
+
+  // True while the cursor points at a result element.
+  virtual bool Valid() const = 0;
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Lifetime of the underlying storage until the next call to Next()
+  // REQUIRES: Valid()
+  virtual const Slice blob() = 0;
+  // Lifetime of the underlying storage until the next call to Next()
+  // REQUIRES: Valid()
+  virtual const FeatureSet& feature_set() = 0;
+
+  // Error state of the query, if any.
+  virtual Status status() const = 0;
+
+ private:
+  // No copying allowed
+  Cursor(const Cursor&);
+  void operator=(const Cursor&);
+};
+
+// SpatialIndexOptions defines a spatial index that will be built on the data
+struct SpatialIndexOptions {
+  // Spatial indexes are referenced by names
+  std::string name;
+  // An area that is indexed. If the element is not intersecting with spatial
+  // index's bbox, it will not be inserted into the index
+  BoundingBox<double> bbox;
+  // tile_bits control the granularity of the spatial index. Each dimension of
+  // the bbox will be split into (1 << tile_bits) tiles, so there will be a
+  // total of (1 << tile_bits)^2 tiles. It is recommended to configure a size of
+  // each  tile to be approximately the size of the query on that spatial index
+  uint32_t tile_bits;
+  // NOTE(review): the default constructor leaves bbox and tile_bits
+  // uninitialized.
+  SpatialIndexOptions() {}
+  SpatialIndexOptions(const std::string& _name,
+                      const BoundingBox<double>& _bbox, uint32_t _tile_bits)
+      : name(_name), bbox(_bbox), tile_bits(_tile_bits) {}
+};
+
+class SpatialDB : public StackableDB {
+ public:
+  // Creates the SpatialDB with specified list of indexes.
+  // REQUIRED: db doesn't exist
+  static Status Create(const SpatialDBOptions& options, const std::string& name,
+                       const std::vector<SpatialIndexOptions>& spatial_indexes);
+
+  // Open the existing SpatialDB.  The resulting db object will be returned
+  // through db parameter.
+  // REQUIRED: db was created using SpatialDB::Create
+  static Status Open(const SpatialDBOptions& options, const std::string& name,
+                     SpatialDB** db, bool read_only = false);
+
+  // Takes ownership of 'db' (StackableDB deletes it on destruction).
+  explicit SpatialDB(DB* db) : StackableDB(db) {}
+
+  // Insert the element into the DB. Element will be inserted into specified
+  // spatial_indexes, based on specified bbox.
+  // REQUIRES: spatial_indexes.size() > 0
+  virtual Status Insert(const WriteOptions& write_options,
+                        const BoundingBox<double>& bbox, const Slice& blob,
+                        const FeatureSet& feature_set,
+                        const std::vector<std::string>& spatial_indexes) = 0;
+
+  // Calling Compact() after inserting a bunch of elements should speed up
+  // reading. This is especially useful if you use SpatialDBOptions::bulk_load
+  // Num threads determines how many threads we'll use for compactions. Setting
+  // this to bigger number will use more IO and CPU, but finish faster
+  virtual Status Compact(int num_threads = 1) = 0;
+
+  // Query the specified spatial_index. Query will return all elements that
+  // intersect bbox, but it may also return some extra elements.
+  // NOTE(review): the returned Cursor appears to be caller-owned -- confirm,
+  // and delete it when done.
+  virtual Cursor* Query(const ReadOptions& read_options,
+                        const BoundingBox<double>& bbox,
+                        const std::string& spatial_index) = 0;
+};
+
+}  // namespace spatial
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
new file mode 100644
index 0000000..158aa32
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
@@ -0,0 +1,260 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d
+class StackableDB : public DB {
+ public:
+  // StackableDB is the owner of db now!
+  explicit StackableDB(DB* db) : db_(db) {}
+
+  ~StackableDB() {
+    delete db_;
+  }
+
+  virtual DB* GetBaseDB() {
+    return db_;
+  }
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) override {
+    return db_->CreateColumnFamily(options, column_family_name, handle);
+  }
+
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override {
+    return db_->DropColumnFamily(column_family);
+  }
+
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override {
+    return db_->Put(options, column_family, key, val);
+  }
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) override {
+    return db_->Get(options, column_family, key, value);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return db_->MultiGet(options, column_family, keys, values);
+  }
+
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override {
+    return db_->KeyMayExist(options, column_family, key, value, value_found);
+  }
+
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& wopts,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    return db_->Delete(wopts, column_family, key);
+  }
+
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override {
+    return db_->Merge(options, column_family, key, value);
+  }
+
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
+    override {
+      return db_->Write(opts, updates);
+  }
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override {
+    return db_->NewIterator(opts, column_family);
+  }
+
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override {
+    return db_->NewIterators(options, column_families, iterators);
+  }
+
+
+  virtual const Snapshot* GetSnapshot() override {
+    return db_->GetSnapshot();
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+    return db_->ReleaseSnapshot(snapshot);
+  }
+
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) override {
+    return db_->GetProperty(column_family, property, value);
+  }
+
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) override {
+    return db_->GetIntProperty(column_family, property, value);
+  }
+
+  using DB::GetApproximateSizes;
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* r, int n,
+                                   uint64_t* sizes) override {
+      return db_->GetApproximateSizes(column_family, r, n, sizes);
+  }
+
+  using DB::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false, int target_level = -1,
+                              uint32_t target_path_id = 0) override {
+    return db_->CompactRange(column_family, begin, end, reduce_level,
+                             target_level, target_path_id);
+  }
+
+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) override {
+    return db_->CompactFiles(
+        compact_options, column_family, input_file_names,
+        output_level, output_path_id);
+  }
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+    return db_->NumberLevels(column_family);
+  }
+
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
+      override {
+    return db_->MaxMemCompactionLevel(column_family);
+  }
+
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
+      override {
+    return db_->Level0StopWriteTrigger(column_family);
+  }
+
+  virtual const std::string& GetName() const override {
+    return db_->GetName();
+  }
+
+  virtual Env* GetEnv() const override {
+    return db_->GetEnv();
+  }
+
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
+      override {
+    return db_->GetOptions(column_family);
+  }
+
+  using DB::GetDBOptions;
+  virtual const DBOptions& GetDBOptions() const override {
+    return db_->GetDBOptions();
+  }
+
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& fopts,
+                       ColumnFamilyHandle* column_family) override {
+    return db_->Flush(fopts, column_family);
+  }
+
+#ifndef ROCKSDB_LITE
+
+  virtual Status DisableFileDeletions() override {
+    return db_->DisableFileDeletions();
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
+    return db_->EnableFileDeletions(force);
+  }
+
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* metadata) override {
+    db_->GetLiveFilesMetaData(metadata);
+  }
+
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle *column_family,
+      ColumnFamilyMetaData* cf_meta) override {
+    db_->GetColumnFamilyMetaData(column_family, cf_meta);
+  }
+
+#endif  // ROCKSDB_LITE
+
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+      return db_->GetLiveFiles(vec, mfs, flush_memtable);
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override {
+    return db_->GetLatestSequenceNumber();
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    return db_->GetSortedWalFiles(files);
+  }
+
+  virtual Status DeleteFile(std::string name) override {
+    return db_->DeleteFile(name);
+  }
+
+  virtual Status GetDbIdentity(std::string& identity) override {
+    return db_->GetDbIdentity(identity);
+  }
+
+  using DB::SetOptions;
+  virtual Status SetOptions(
+    const std::unordered_map<std::string, std::string>& new_options) override {
+    return db_->SetOptions(new_options);
+  }
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override {
+    return db_->GetPropertiesOfAllTables(column_family, props);
+  }
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options) override {
+    return db_->GetUpdatesSince(seq_number, iter, read_options);
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return db_->DefaultColumnFamily();
+  }
+
+ protected:
+  DB* db_;
+};
+
+} //  namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/utilities/utility_db.h b/src/rocksdb/include/rocksdb/utilities/utility_db.h
new file mode 100644
index 0000000..f4db665
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/utility_db.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <vector>
+#include <string>
+
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// Please don't use this class. It's deprecated
+class UtilityDB {
+ public:
+  // This function is here only for backwards compatibility. Please use the
+  // functions defined in DBWithTTl (rocksdb/utilities/db_ttl.h)
+  // (deprecated)
+  __attribute__((deprecated)) static Status OpenTtlDB(const Options& options,
+                                                      const std::string& name,
+                                                      StackableDB** dbptr,
+                                                      int32_t ttl = 0,
+                                                      bool read_only = false);
+};
+
+} //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
new file mode 100644
index 0000000..7c17534
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
@@ -0,0 +1,162 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace rocksdb {
+
+class ColumnFamilyHandle;
+class Comparator;
+class DB;
+struct ReadOptions;
+struct DBOptions;
+
+enum WriteType { kPutRecord, kMergeRecord, kDeleteRecord, kLogDataRecord };
+
+// an entry for Put, Merge or Delete entry for write batches. Used in
+// WBWIIterator.
+struct WriteEntry {
+  WriteType type;
+  Slice key;
+  Slice value;
+};
+
+// Iterator of one column family out of a WriteBatchWithIndex.
+class WBWIIterator {
+ public:
+  virtual ~WBWIIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  virtual void SeekToFirst() = 0;
+
+  virtual void SeekToLast() = 0;
+
+  virtual void Seek(const Slice& key) = 0;
+
+  virtual void Next() = 0;
+
+  virtual void Prev() = 0;
+
+  virtual const WriteEntry& Entry() const = 0;
+
+  virtual Status status() const = 0;
+};
+
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+// In Put(), Merge() or Delete(), the same function of the wrapped will be
+// called. At the same time, indexes will be built.
+// By calling GetWriteBatch(), a user will get the WriteBatch for the data
+// they inserted, which can be used for DB::Write().
+// A user can call NewIterator() to create an iterator.
+class WriteBatchWithIndex : public WriteBatchBase {
+ public:
+  // backup_index_comparator: the backup comparator used to compare keys
+  // within the same column family, if column family is not given in the
+  // interface, or we can't find a column family from the column family handle
+  // passed in, backup_index_comparator will be used for the column family.
+  // reserved_bytes: reserved bytes in underlying WriteBatch
+  // overwrite_key: if true, overwrite the key in the index when inserting
+  //                the same key as previously, so iterator will never
+  //                show two entries with the same key.
+  explicit WriteBatchWithIndex(
+      const Comparator* backup_index_comparator = BytewiseComparator(),
+      size_t reserved_bytes = 0, bool overwrite_key = false);
+  virtual ~WriteBatchWithIndex();
+
+  using WriteBatchBase::Put;
+  void Put(ColumnFamilyHandle* column_family, const Slice& key,
+           const Slice& value) override;
+
+  void Put(const Slice& key, const Slice& value) override;
+
+  using WriteBatchBase::Merge;
+  void Merge(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value) override;
+
+  void Merge(const Slice& key, const Slice& value) override;
+
+  using WriteBatchBase::Delete;
+  void Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  void Delete(const Slice& key) override;
+
+  using WriteBatchBase::PutLogData;
+  void PutLogData(const Slice& blob) override;
+
+  using WriteBatchBase::Clear;
+  void Clear() override;
+
+  using WriteBatchBase::GetWriteBatch;
+  WriteBatch* GetWriteBatch() override;
+
+  // Create an iterator of a column family. User can call iterator.Seek() to
+  // search to the next entry of or after a key. Keys will be iterated in the
+  // order given by index_comparator. For multiple updates on the same key,
+  // each update will be returned as a separate entry, in the order of update
+  // time.
+  WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
+  // Create an iterator of the default column family.
+  WBWIIterator* NewIterator();
+
+  // Will create a new Iterator that will use WBWIIterator as a delta and
+  // base_iterator as base
+  Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family,
+                                Iterator* base_iterator);
+  // default column family
+  Iterator* NewIteratorWithBase(Iterator* base_iterator);
+
+  // Similar to DB::Get() but will only read the key from this batch.
+  // If the batch does not have enough data to resolve Merge operations,
+  // MergeInProgress status may be returned.
+  Status GetFromBatch(ColumnFamilyHandle* column_family,
+                      const DBOptions& options, const Slice& key,
+                      std::string* value);
+
+  // Similar to previous function but does not require a column_family.
+  // Note:  An InvalidArgument status will be returned if there are any Merge
+  // operators for this key.
+  Status GetFromBatch(const DBOptions& options, const Slice& key,
+                      std::string* value) {
+    return GetFromBatch(nullptr, options, key, value);
+  }
+
+  // Similar to DB::Get() but will also read writes from this batch.
+  //
+  // This function will query both this batch and the DB and then merge
+  // the results using the DB's merge operator (if the batch contains any
+  // merge requests).
+  //
+  // Setting read_options.snapshot will affect what is read from the DB
+  // but will NOT change which keys are read from the batch (the keys in
+  // this batch do not yet belong to any snapshot and will be fetched
+  // regardless).
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           const Slice& key, std::string* value);
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value);
+
+ private:
+  struct Rep;
+  Rep* rep;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h
index 6aeabc2..26086b2 100644
--- a/src/rocksdb/include/rocksdb/version.h
+++ b/src/rocksdb/include/rocksdb/version.h
@@ -1,6 +1,16 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
 #pragma once
 
-// Also update Makefile if you change these
-#define __ROCKSDB_MAJOR__ 3
-#define __ROCKSDB_MINOR__ 0
-#define __ROCKSDB_PATCH__ 0
+#define ROCKSDB_MAJOR 3
+#define ROCKSDB_MINOR 11
+#define ROCKSDB_PATCH 2
+
+// Do not use these. We made the mistake of declaring macros starting with
+// double underscore. Now we have to live with our choice. We'll deprecate these
+// at some point
+#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
+#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
+#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h
index 74ee2ad..c096ae1 100644
--- a/src/rocksdb/include/rocksdb/write_batch.h
+++ b/src/rocksdb/include/rocksdb/write_batch.h
@@ -27,6 +27,7 @@
 
 #include <string>
 #include "rocksdb/status.h"
+#include "rocksdb/write_batch_base.h"
 
 namespace rocksdb {
 
@@ -34,15 +35,16 @@ class Slice;
 class ColumnFamilyHandle;
 struct SliceParts;
 
-class WriteBatch {
+class WriteBatch : public WriteBatchBase {
  public:
   explicit WriteBatch(size_t reserved_bytes = 0);
   ~WriteBatch();
 
+  using WriteBatchBase::Put;
   // Store the mapping "key->value" in the database.
   void Put(ColumnFamilyHandle* column_family, const Slice& key,
-           const Slice& value);
-  void Put(const Slice& key, const Slice& value) {
+           const Slice& value) override;
+  void Put(const Slice& key, const Slice& value) override {
     Put(nullptr, key, value);
   }
 
@@ -50,23 +52,31 @@ class WriteBatch {
   // that will be written to the database are concatentations of arrays of
   // slices.
   void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
-           const SliceParts& value);
-  void Put(const SliceParts& key, const SliceParts& value) {
+           const SliceParts& value) override;
+  void Put(const SliceParts& key, const SliceParts& value) override {
     Put(nullptr, key, value);
   }
 
+  using WriteBatchBase::Merge;
   // Merge "value" with the existing value of "key" in the database.
   // "key->merge(existing, value)"
   void Merge(ColumnFamilyHandle* column_family, const Slice& key,
-             const Slice& value);
-  void Merge(const Slice& key, const Slice& value) {
+             const Slice& value) override;
+  void Merge(const Slice& key, const Slice& value) override {
     Merge(nullptr, key, value);
   }
 
+  using WriteBatchBase::Delete;
   // If the database contains a mapping for "key", erase it.  Else do nothing.
-  void Delete(ColumnFamilyHandle* column_family, const Slice& key);
-  void Delete(const Slice& key) { Delete(nullptr, key); }
+  void Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  void Delete(const Slice& key) override { Delete(nullptr, key); }
 
+  // variant that takes SliceParts
+  void Delete(ColumnFamilyHandle* column_family,
+              const SliceParts& key) override;
+  void Delete(const SliceParts& key) override { Delete(nullptr, key); }
+
+  using WriteBatchBase::PutLogData;
   // Append a blob of arbitrary size to the records in this batch. The blob will
   // be stored in the transaction log but not in any other file. In particular,
   // it will not be persisted to the SST files. When iterating over this
@@ -77,10 +87,11 @@ class WriteBatch {
   //
   // Example application: add timestamps to the transaction log for use in
   // replication.
-  void PutLogData(const Slice& blob);
+  void PutLogData(const Slice& blob) override;
 
+  using WriteBatchBase::Clear;
   // Clear all updates buffered in this batch.
-  void Clear();
+  void Clear() override;
 
   // Support for iterating over the contents of a batch.
   class Handler {
@@ -101,10 +112,11 @@ class WriteBatch {
       return Status::InvalidArgument(
           "non-default column family and PutCF not implemented");
     }
-    virtual void Put(const Slice& key, const Slice& value);
+    virtual void Put(const Slice& key, const Slice& value) {}
+
     // Merge and LogData are not pure virtual. Otherwise, we would break
     // existing clients of Handler on a source code level. The default
-    // implementation of Merge simply throws a runtime exception.
+    // implementation of Merge does nothing.
     virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                            const Slice& value) {
       if (column_family_id == 0) {
@@ -114,7 +126,8 @@ class WriteBatch {
       return Status::InvalidArgument(
           "non-default column family and MergeCF not implemented");
     }
-    virtual void Merge(const Slice& key, const Slice& value);
+    virtual void Merge(const Slice& key, const Slice& value) {}
+
     // The default implementation of LogData does nothing.
     virtual void LogData(const Slice& blob);
     virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
@@ -125,7 +138,8 @@ class WriteBatch {
       return Status::InvalidArgument(
           "non-default column family and DeleteCF not implemented");
     }
-    virtual void Delete(const Slice& key);
+    virtual void Delete(const Slice& key) {}
+
     // Continue is called by WriteBatch::Iterate. If it returns false,
     // iteration is halted. Otherwise, it continues iterating. The default
     // implementation always returns true.
@@ -142,12 +156,16 @@ class WriteBatch {
   // Returns the number of updates in the batch
   int Count() const;
 
+  using WriteBatchBase::GetWriteBatch;
+  WriteBatch* GetWriteBatch() override { return this; }
+
   // Constructor with a serialized string object
   explicit WriteBatch(std::string rep): rep_(rep) {}
 
  private:
   friend class WriteBatchInternal;
 
+ protected:
   std::string rep_;  // See comment in write_batch.cc for the format of rep_
 
   // Intentionally copyable
diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h
new file mode 100644
index 0000000..a218cc1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch_base.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+namespace rocksdb {
+
+class Slice;
+class ColumnFamilyHandle;
+class WriteBatch;
+struct SliceParts;
+
+// Abstract base class that defines the basic interface for a write batch.
+// See WriteBatch for a basic implementation and WrithBatchWithIndex for an
+// indexed implemenation.
+class WriteBatchBase {
+ public:
+  virtual ~WriteBatchBase() {}
+
+  // Store the mapping "key->value" in the database.
+  virtual void Put(ColumnFamilyHandle* column_family, const Slice& key,
+                   const Slice& value) = 0;
+  virtual void Put(const Slice& key, const Slice& value) = 0;
+
+  // Variant of Put() that gathers output like writev(2).  The key and value
+  // that will be written to the database are concatentations of arrays of
+  // slices.
+  virtual void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                   const SliceParts& value);
+  virtual void Put(const SliceParts& key, const SliceParts& value);
+
+  // Merge "value" with the existing value of "key" in the database.
+  // "key->merge(existing, value)"
+  virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual void Merge(const Slice& key, const Slice& value) = 0;
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key) = 0;
+  virtual void Delete(const Slice& key) = 0;
+
+  // variant that takes SliceParts
+  virtual void Delete(ColumnFamilyHandle* column_family, const SliceParts& key);
+  virtual void Delete(const SliceParts& key);
+
+  // Append a blob of arbitrary size to the records in this batch. The blob will
+  // be stored in the transaction log but not in any other file. In particular,
+  // it will not be persisted to the SST files. When iterating over this
+  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+  // encountered in the same order in thich they were inserted. The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the batch
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  virtual void PutLogData(const Slice& blob) = 0;
+
+  // Clear all updates buffered in this batch.
+  virtual void Clear() = 0;
+
+  // Covert this batch into a WriteBatch.  This is an abstracted way of
+  // converting any WriteBatchBase(eg WriteBatchWithIndex) into a basic
+  // WriteBatch.
+  virtual WriteBatch* GetWriteBatch() = 0;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/utilities/backupable_db.h b/src/rocksdb/include/utilities/backupable_db.h
index 617fe8a..43d5a5c 100644
--- a/src/rocksdb/include/utilities/backupable_db.h
+++ b/src/rocksdb/include/utilities/backupable_db.h
@@ -8,244 +8,5 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
-#ifndef ROCKSDB_LITE
-
-#define __STDC_FORMAT_MACROS
-#include <inttypes.h>
-#include <string>
-#include <map>
-#include <vector>
-
-#include "utilities/stackable_db.h"
-#include "rocksdb/env.h"
-#include "rocksdb/status.h"
-
-namespace rocksdb {
-
-struct BackupableDBOptions {
-  // Where to keep the backup files. Has to be different than dbname_
-  // Best to set this to dbname_ + "/backups"
-  // Required
-  std::string backup_dir;
-
-  // Backup Env object. It will be used for backup file I/O. If it's
-  // nullptr, backups will be written out using DBs Env. If it's
-  // non-nullptr, backup's I/O will be performed using this object.
-  // If you want to have backups on HDFS, use HDFS Env here!
-  // Default: nullptr
-  Env* backup_env;
-
-  // If share_table_files == true, backup will assume that table files with
-  // same name have the same contents. This enables incremental backups and
-  // avoids unnecessary data copies.
-  // If share_table_files == false, each backup will be on its own and will
-  // not share any data with other backups.
-  // default: true
-  bool share_table_files;
-
-  // Backup info and error messages will be written to info_log
-  // if non-nullptr.
-  // Default: nullptr
-  Logger* info_log;
-
-  // If sync == true, we can guarantee you'll get consistent backup even
-  // on a machine crash/reboot. Backup process is slower with sync enabled.
-  // If sync == false, we don't guarantee anything on machine reboot. However,
-  // chances are some of the backups are consistent.
-  // Default: true
-  bool sync;
-
-  // If true, it will delete whatever backups there are already
-  // Default: false
-  bool destroy_old_data;
-
-  // If false, we won't backup log files. This option can be useful for backing
-  // up in-memory databases where log file are persisted, but table files are in
-  // memory.
-  // Default: true
-  bool backup_log_files;
-
-  // Max bytes that can be transferred in a second during backup.
-  // If 0, go as fast as you can
-  // Default: 0
-  uint64_t backup_rate_limit;
-
-  // Max bytes that can be transferred in a second during restore.
-  // If 0, go as fast as you can
-  // Default: 0
-  uint64_t restore_rate_limit;
-
-  // Only used if share_table_files is set to true. If true, will consider that
-  // backups can come from different databases, hence a sst is not uniquely
-  // identifed by its name, but by the triple (file name, crc32, file length)
-  // Default: false
-  // Note: this is an experimental option, and you'll need to set it manually
-  // *turn it on only if you know what you're doing*
-  bool share_files_with_checksum;
-
-  void Dump(Logger* logger) const;
-
-  explicit BackupableDBOptions(const std::string& _backup_dir,
-                               Env* _backup_env = nullptr,
-                               bool _share_table_files = true,
-                               Logger* _info_log = nullptr, bool _sync = true,
-                               bool _destroy_old_data = false,
-                               bool _backup_log_files = true,
-                               uint64_t _backup_rate_limit = 0,
-                               uint64_t _restore_rate_limit = 0)
-      : backup_dir(_backup_dir),
-        backup_env(_backup_env),
-        share_table_files(_share_table_files),
-        info_log(_info_log),
-        sync(_sync),
-        destroy_old_data(_destroy_old_data),
-        backup_log_files(_backup_log_files),
-        backup_rate_limit(_backup_rate_limit),
-        restore_rate_limit(_restore_rate_limit),
-        share_files_with_checksum(false) {
-    assert(share_table_files || !share_files_with_checksum);
-  }
-};
-
-struct RestoreOptions {
-  // If true, restore won't overwrite the existing log files in wal_dir. It will
-  // also move all log files from archive directory to wal_dir. Use this option
-  // in combination with BackupableDBOptions::backup_log_files = false for
-  // persisting in-memory databases.
-  // Default: false
-  bool keep_log_files;
-
-  explicit RestoreOptions(bool _keep_log_files = false)
-      : keep_log_files(_keep_log_files) {}
-};
-
-typedef uint32_t BackupID;
-
-struct BackupInfo {
-  BackupID backup_id;
-  int64_t timestamp;
-  uint64_t size;
-
-  BackupInfo() {}
-  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
-      : backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
-};
-
-class BackupEngineReadOnly {
- public:
-  virtual ~BackupEngineReadOnly() {}
-
-  static BackupEngineReadOnly* NewReadOnlyBackupEngine(
-      Env* db_env, const BackupableDBOptions& options);
-
-  // You can GetBackupInfo safely, even with other BackupEngine performing
-  // backups on the same directory
-  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
-
-  // Restoring DB from backup is NOT safe when there is another BackupEngine
-  // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's
-  // responsibility to synchronize the operation, i.e. don't delete the backup
-  // when you're restoring from it
-  virtual Status RestoreDBFromBackup(
-      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
-      const RestoreOptions& restore_options = RestoreOptions()) = 0;
-  virtual Status RestoreDBFromLatestBackup(
-      const std::string& db_dir, const std::string& wal_dir,
-      const RestoreOptions& restore_options = RestoreOptions()) = 0;
-};
-
-// Please see the documentation in BackupableDB and RestoreBackupableDB
-class BackupEngine {
- public:
-  virtual ~BackupEngine() {}
-
-  static BackupEngine* NewBackupEngine(Env* db_env,
-                                       const BackupableDBOptions& options);
-
-  virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0;
-  virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
-  virtual Status DeleteBackup(BackupID backup_id) = 0;
-  virtual void StopBackup() = 0;
-
-  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
-  virtual Status RestoreDBFromBackup(
-      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
-      const RestoreOptions& restore_options = RestoreOptions()) = 0;
-  virtual Status RestoreDBFromLatestBackup(
-      const std::string& db_dir, const std::string& wal_dir,
-      const RestoreOptions& restore_options = RestoreOptions()) = 0;
-};
-
-// Stack your DB with BackupableDB to be able to backup the DB
-class BackupableDB : public StackableDB {
- public:
-  // BackupableDBOptions have to be the same as the ones used in a previous
-  // incarnation of the DB
-  //
-  // BackupableDB ownes the pointer `DB* db` now. You should not delete it or
-  // use it after the invocation of BackupableDB
-  BackupableDB(DB* db, const BackupableDBOptions& options);
-  virtual ~BackupableDB();
-
-  // Captures the state of the database in the latest backup
-  // NOT a thread safe call
-  Status CreateNewBackup(bool flush_before_backup = false);
-  // Returns info about backups in backup_info
-  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
-  // deletes old backups, keeping latest num_backups_to_keep alive
-  Status PurgeOldBackups(uint32_t num_backups_to_keep);
-  // deletes a specific backup
-  Status DeleteBackup(BackupID backup_id);
-  // Call this from another thread if you want to stop the backup
-  // that is currently happening. It will return immediatelly, will
-  // not wait for the backup to stop.
-  // The backup will stop ASAP and the call to CreateNewBackup will
-  // return Status::Incomplete(). It will not clean up after itself, but
-  // the state will remain consistent. The state will be cleaned up
-  // next time you create BackupableDB or RestoreBackupableDB.
-  void StopBackup();
-
- private:
-  BackupEngine* backup_engine_;
-};
-
-// Use this class to access information about backups and restore from them
-class RestoreBackupableDB {
- public:
-  RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
-  ~RestoreBackupableDB();
-
-  // Returns info about backups in backup_info
-  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
-
-  // restore from backup with backup_id
-  // IMPORTANT -- if options_.share_table_files == true and you restore DB
-  // from some backup that is not the latest, and you start creating new
-  // backups from the new DB, they will probably fail
-  //
-  // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
-  // If you add new data to the DB and try creating a new backup now, the
-  // database will diverge from backups 4 and 5 and the new backup will fail.
-  // If you want to create new backup, you will first have to delete backups 4
-  // and 5.
-  Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
-                             const std::string& wal_dir,
-                             const RestoreOptions& restore_options =
-                                 RestoreOptions());
-
-  // restore from the latest backup
-  Status RestoreDBFromLatestBackup(const std::string& db_dir,
-                                   const std::string& wal_dir,
-                                   const RestoreOptions& restore_options =
-                                       RestoreOptions());
-  // deletes old backups, keeping latest num_backups_to_keep alive
-  Status PurgeOldBackups(uint32_t num_backups_to_keep);
-  // deletes a specific backup
-  Status DeleteBackup(BackupID backup_id);
-
- private:
-  BackupEngine* backup_engine_;
-};
-
-}  // namespace rocksdb
-#endif  // ROCKSDB_LITE
+#warning This file was moved to rocksdb/utilities/backupable_db.h
+#include "rocksdb/utilities/backupable_db.h"
diff --git a/src/rocksdb/include/utilities/db_ttl.h b/src/rocksdb/include/utilities/db_ttl.h
index e99744d..c3d5c2b 100644
--- a/src/rocksdb/include/utilities/db_ttl.h
+++ b/src/rocksdb/include/utilities/db_ttl.h
@@ -4,65 +4,5 @@
 //  of patent rights can be found in the PATENTS file in the same directory.
 
 #pragma once
-#ifndef ROCKSDB_LITE
-
-#include <string>
-#include <vector>
-
-#include "utilities/stackable_db.h"
-#include "rocksdb/db.h"
-
-namespace rocksdb {
-
-// Database with TTL support.
-//
-// USE-CASES:
-// This API should be used to open the db when key-values inserted are
-//  meant to be removed from the db in a non-strict 'ttl' amount of time
-//  Therefore, this guarantees that key-values inserted will remain in the
-//  db for >= ttl amount of time and the db will make efforts to remove the
-//  key-values as soon as possible after ttl seconds of their insertion.
-//
-// BEHAVIOUR:
-// TTL is accepted in seconds
-// (int32_t)Timestamp(creation) is suffixed to values in Put internally
-// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
-// Get/Iterator may return expired entries(compaction not run on them yet)
-// Different TTL may be used during different Opens
-// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
-//          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
-// read_only=true opens in the usual read-only mode. Compactions will not be
-//  triggered(neither manual nor automatic), so no expired entries removed
-//
-// CONSTRAINTS:
-// Not specifying/passing or non-positive TTL behaves like TTL = infinity
-//
-// !!!WARNING!!!:
-// Calling DB::Open directly to re-open a db created by this API will get
-//  corrupt values(timestamp suffixed) and no ttl effect will be there
-//  during the second Open, so use this API consistently to open the db
-// Be careful when passing ttl with a small positive value because the
-//  whole database may be deleted in a small amount of time
-
-class DBWithTTL : public StackableDB {
- public:
-  virtual Status CreateColumnFamilyWithTtl(
-      const ColumnFamilyOptions& options, const std::string& column_family_name,
-      ColumnFamilyHandle** handle, int ttl) = 0;
-
-  static Status Open(const Options& options, const std::string& dbname,
-                     DBWithTTL** dbptr, int32_t ttl = 0,
-                     bool read_only = false);
-
-  static Status Open(const DBOptions& db_options, const std::string& dbname,
-                     const std::vector<ColumnFamilyDescriptor>& column_families,
-                     std::vector<ColumnFamilyHandle*>* handles,
-                     DBWithTTL** dbptr, std::vector<int32_t> ttls,
-                     bool read_only = false);
-
- protected:
-  explicit DBWithTTL(DB* db) : StackableDB(db) {}
-};
-
-}  // namespace rocksdb
-#endif  // ROCKSDB_LITE
+#warning This file was moved to rocksdb/utilities/db_ttl.h
+#include "rocksdb/utilities/db_ttl.h"
diff --git a/src/rocksdb/include/utilities/document_db.h b/src/rocksdb/include/utilities/document_db.h
new file mode 100644
index 0000000..1d1330b
--- /dev/null
+++ b/src/rocksdb/include/utilities/document_db.h
@@ -0,0 +1,8 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#warning This file was moved to rocksdb/utilities/document_db.h
+#include "rocksdb/utilities/document_db.h"
diff --git a/src/rocksdb/include/utilities/geo_db.h b/src/rocksdb/include/utilities/geo_db.h
index 87ff5e6..48957d4 100644
--- a/src/rocksdb/include/utilities/geo_db.h
+++ b/src/rocksdb/include/utilities/geo_db.h
@@ -2,104 +2,7 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-//
 
-#ifndef ROCKSDB_LITE
 #pragma once
-#include <string>
-#include <vector>
-
-#include "utilities/stackable_db.h"
-#include "rocksdb/status.h"
-
-namespace rocksdb {
-
-//
-// Configurable options needed for setting up a Geo database
-//
-struct GeoDBOptions {
-  // Backup info and error messages will be written to info_log
-  // if non-nullptr.
-  // Default: nullptr
-  Logger* info_log;
-
-  explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { }
-};
-
-//
-// A position in the earth's geoid
-//
-class GeoPosition {
- public:
-  double latitude;
-  double longitude;
-
-  explicit GeoPosition(double la = 0, double lo = 0) :
-    latitude(la), longitude(lo) {
-  }
-};
-
-//
-// Description of an object on the Geoid. It is located by a GPS location,
-// and is identified by the id. The value associated with this object is
-// an opaque string 'value'. Different objects identified by unique id's
-// can have the same gps-location associated with them.
-//
-class GeoObject {
- public:
-  GeoPosition position;
-  std::string id;
-  std::string value;
-
-  GeoObject() {}
-
-  GeoObject(const GeoPosition& pos, const std::string& i,
-            const std::string& val) :
-    position(pos), id(i), value(val) {
-  }
-};
-
-//
-// Stack your DB with GeoDB to be able to get geo-spatial support
-//
-class GeoDB : public StackableDB {
- public:
-  // GeoDBOptions have to be the same as the ones used in a previous
-  // incarnation of the DB
-  //
-  // GeoDB owns the pointer `DB* db` now. You should not delete it or
-  // use it after the invocation of GeoDB
-  // GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
-  GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
-  virtual ~GeoDB() {}
-
-  // Insert a new object into the location database. The object is
-  // uniquely identified by the id. If an object with the same id already
-  // exists in the db, then the old one is overwritten by the new
-  // object being inserted here.
-  virtual Status Insert(const GeoObject& object) = 0;
-
-  // Retrieve the value of the object located at the specified GPS
-  // location and is identified by the 'id'.
-  virtual Status GetByPosition(const GeoPosition& pos,
-                               const Slice& id, std::string* value) = 0;
-
-  // Retrieve the value of the object identified by the 'id'. This method
-  // could be potentially slower than GetByPosition
-  virtual Status GetById(const Slice& id, GeoObject*  object) = 0;
-
-  // Delete the specified object
-  virtual Status Remove(const Slice& id) = 0;
-
-  // Returns a list of all items within a circular radius from the
-  // specified gps location. If 'number_of_values' is specified,
-  // then this call returns at most that many number of objects.
-  // The radius is specified in 'meters'.
-  virtual Status SearchRadial(const GeoPosition& pos,
-                              double radius,
-                              std::vector<GeoObject>* values,
-                              int number_of_values = INT_MAX) = 0;
-};
-
-}  // namespace rocksdb
-#endif  // ROCKSDB_LITE
+#warning This file was moved to rocksdb/utilities/geo_db.h
+#include "rocksdb/utilities/geo_db.h"
diff --git a/src/rocksdb/include/utilities/json_document.h b/src/rocksdb/include/utilities/json_document.h
new file mode 100644
index 0000000..f3f9396
--- /dev/null
+++ b/src/rocksdb/include/utilities/json_document.h
@@ -0,0 +1,7 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+#warning This file was moved to rocksdb/utilities/json_document.h
+#include "rocksdb/utilities/json_document.h"
diff --git a/src/rocksdb/include/utilities/stackable_db.h b/src/rocksdb/include/utilities/stackable_db.h
index 7927c2a..435818d 100644
--- a/src/rocksdb/include/utilities/stackable_db.h
+++ b/src/rocksdb/include/utilities/stackable_db.h
@@ -3,213 +3,5 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
-#include "rocksdb/db.h"
-
-namespace rocksdb {
-
-// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d
-class StackableDB : public DB {
- public:
-  // StackableDB is the owner of db now!
-  explicit StackableDB(DB* db) : db_(db) {}
-
-  ~StackableDB() {
-    delete db_;
-  }
-
-  virtual DB* GetBaseDB() {
-    return db_;
-  }
-
-  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
-                                    const std::string& column_family_name,
-                                    ColumnFamilyHandle** handle) {
-    return db_->CreateColumnFamily(options, column_family_name, handle);
-  }
-
-  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) {
-    return db_->DropColumnFamily(column_family);
-  }
-
-  using DB::Put;
-  virtual Status Put(const WriteOptions& options,
-                     ColumnFamilyHandle* column_family, const Slice& key,
-                     const Slice& val) override {
-    return db_->Put(options, column_family, key, val);
-  }
-
-  using DB::Get;
-  virtual Status Get(const ReadOptions& options,
-                     ColumnFamilyHandle* column_family, const Slice& key,
-                     std::string* value) override {
-    return db_->Get(options, column_family, key, value);
-  }
-
-  using DB::MultiGet;
-  virtual std::vector<Status> MultiGet(
-      const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>& column_family,
-      const std::vector<Slice>& keys,
-      std::vector<std::string>* values) override {
-    return db_->MultiGet(options, column_family, keys, values);
-  }
-
-  using DB::KeyMayExist;
-  virtual bool KeyMayExist(const ReadOptions& options,
-                           ColumnFamilyHandle* column_family, const Slice& key,
-                           std::string* value,
-                           bool* value_found = nullptr) override {
-    return db_->KeyMayExist(options, column_family, key, value, value_found);
-  }
-
-  using DB::Delete;
-  virtual Status Delete(const WriteOptions& wopts,
-                        ColumnFamilyHandle* column_family,
-                        const Slice& key) override {
-    return db_->Delete(wopts, column_family, key);
-  }
-
-  using DB::Merge;
-  virtual Status Merge(const WriteOptions& options,
-                       ColumnFamilyHandle* column_family, const Slice& key,
-                       const Slice& value) override {
-    return db_->Merge(options, column_family, key, value);
-  }
-
-
-  virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
-    override {
-      return db_->Write(opts, updates);
-  }
-
-  using DB::NewIterator;
-  virtual Iterator* NewIterator(const ReadOptions& opts,
-                                ColumnFamilyHandle* column_family) override {
-    return db_->NewIterator(opts, column_family);
-  }
-
-  virtual Status NewIterators(
-      const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>& column_families,
-      std::vector<Iterator*>* iterators) {
-    return db_->NewIterators(options, column_families, iterators);
-  }
-
-
-  virtual const Snapshot* GetSnapshot() override {
-    return db_->GetSnapshot();
-  }
-
-  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
-    return db_->ReleaseSnapshot(snapshot);
-  }
-
-  using DB::GetProperty;
-  virtual bool GetProperty(ColumnFamilyHandle* column_family,
-                           const Slice& property, std::string* value) override {
-      return db_->GetProperty(column_family, property, value);
-  }
-
-  using DB::GetApproximateSizes;
-  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* r, int n,
-                                   uint64_t* sizes) override {
-      return db_->GetApproximateSizes(column_family, r, n, sizes);
-  }
-
-  using DB::CompactRange;
-  virtual Status CompactRange(ColumnFamilyHandle* column_family,
-                              const Slice* begin, const Slice* end,
-                              bool reduce_level = false,
-                              int target_level = -1) override {
-    return db_->CompactRange(column_family, begin, end, reduce_level,
-                             target_level);
-  }
-
-  using DB::NumberLevels;
-  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
-    return db_->NumberLevels(column_family);
-  }
-
-  using DB::MaxMemCompactionLevel;
-  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
-      override {
-    return db_->MaxMemCompactionLevel(column_family);
-  }
-
-  using DB::Level0StopWriteTrigger;
-  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
-      override {
-    return db_->Level0StopWriteTrigger(column_family);
-  }
-
-  virtual const std::string& GetName() const override {
-    return db_->GetName();
-  }
-
-  virtual Env* GetEnv() const override {
-    return db_->GetEnv();
-  }
-
-  using DB::GetOptions;
-  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
-      override {
-    return db_->GetOptions(column_family);
-  }
-
-  using DB::Flush;
-  virtual Status Flush(const FlushOptions& fopts,
-                       ColumnFamilyHandle* column_family) override {
-    return db_->Flush(fopts, column_family);
-  }
-
-  virtual Status DisableFileDeletions() override {
-    return db_->DisableFileDeletions();
-  }
-
-  virtual Status EnableFileDeletions(bool force) override {
-    return db_->EnableFileDeletions(force);
-  }
-
-  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
-                              bool flush_memtable = true) override {
-      return db_->GetLiveFiles(vec, mfs, flush_memtable);
-  }
-
-  virtual SequenceNumber GetLatestSequenceNumber() const override {
-    return db_->GetLatestSequenceNumber();
-  }
-
-  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
-    return db_->GetSortedWalFiles(files);
-  }
-
-  virtual Status DeleteFile(std::string name) override {
-    return db_->DeleteFile(name);
-  }
-
-  virtual Status GetDbIdentity(std::string& identity) {
-    return db_->GetDbIdentity(identity);
-  }
-
-  using DB::GetPropertiesOfAllTables;
-  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
-                                          TablePropertiesCollection* props) {
-    return db_->GetPropertiesOfAllTables(column_family, props);
-  }
-
-  virtual Status GetUpdatesSince(
-      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
-      const TransactionLogIterator::ReadOptions& read_options) override {
-    return db_->GetUpdatesSince(seq_number, iter, read_options);
-  }
-
-  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
-    return db_->DefaultColumnFamily();
-  }
-
- protected:
-  DB* db_;
-};
-
-} //  namespace rocksdb
+#warning This file was moved to rocksdb/utilities/stackable_db.h
+#include "rocksdb/utilities/stackable_db.h"
diff --git a/src/rocksdb/include/utilities/utility_db.h b/src/rocksdb/include/utilities/utility_db.h
index f2b99ce..4a8bbae 100644
--- a/src/rocksdb/include/utilities/utility_db.h
+++ b/src/rocksdb/include/utilities/utility_db.h
@@ -3,28 +3,5 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
-#ifndef ROCKSDB_LITE
-#include <vector>
-#include <string>
-
-#include "utilities/stackable_db.h"
-#include "utilities/db_ttl.h"
-#include "rocksdb/db.h"
-
-namespace rocksdb {
-
-// Please don't use this class. It's deprecated
-class UtilityDB {
- public:
-  // This function is here only for backwards compatibility. Please use the
-  // functions defined in DBWithTTl (utilities/db_ttl.h)
-  // (deprecated)
-  __attribute__((deprecated)) static Status OpenTtlDB(const Options& options,
-                                                      const std::string& name,
-                                                      StackableDB** dbptr,
-                                                      int32_t ttl = 0,
-                                                      bool read_only = false);
-};
-
-} //  namespace rocksdb
-#endif  // ROCKSDB_LITE
+#warning This file was moved to rocksdb/utilities/utility_db.h
+#include "rocksdb/utilities/utility_db.h"
diff --git a/src/rocksdb/java/Makefile b/src/rocksdb/java/Makefile
deleted file mode 100644
index 9d21b57..0000000
--- a/src/rocksdb/java/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.Iterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org [...]
-NATIVE_INCLUDE = ./include
-ROCKSDB_JAR = rocksdbjni.jar
-
-clean:
-	-find . -name "*.class" -exec rm {} \;
-	-find . -name "hs*.log" -exec rm {} \;
-	rm -f $(ROCKSDB_JAR)
-
-java:
-	javac org/rocksdb/util/*.java org/rocksdb/*.java
-	jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
-	javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES)
-
-sample: java
-	javac -cp $(ROCKSDB_JAR) RocksDBSample.java
-	@rm -rf /tmp/rocksdbjni
-	@rm -rf /tmp/rocksdbjni_not_found
-	java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBSample /tmp/rocksdbjni
-	@rm -rf /tmp/rocksdbjni
-	@rm -rf /tmp/rocksdbjni_not_found
-
-test: java
-	javac org/rocksdb/test/*.java
-	java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest
-	java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest
-	java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest
-	java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest
-
-db_bench: java
-	javac org/rocksdb/benchmark/*.java
diff --git a/src/rocksdb/java/RocksDBSample.java b/src/rocksdb/java/RocksDBSample.java
deleted file mode 100644
index 5d11b1a..0000000
--- a/src/rocksdb/java/RocksDBSample.java
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.ArrayList;
-import org.rocksdb.*;
-import org.rocksdb.util.SizeUnit;
-import java.io.IOException;
-
-public class RocksDBSample {
-  static {
-    RocksDB.loadLibrary();
-  }
-
-  public static void main(String[] args) {
-    if (args.length < 1) {
-      System.out.println("usage: RocksDBSample db_path");
-      return;
-    }
-    String db_path = args[0];
-    String db_path_not_found = db_path + "_not_found";
-
-    System.out.println("RocksDBSample");
-    RocksDB db = null;
-    Options options = new Options();
-    try {
-      db = RocksDB.open(options, db_path_not_found);
-      assert(false);
-    } catch (RocksDBException e) {
-      System.out.format("caught the expceted exception -- %s\n", e);
-      assert(db == null);
-    }
-
-    Filter filter = new BloomFilter(10);
-    options.setCreateIfMissing(true)
-        .createStatistics()
-        .setWriteBufferSize(8 * SizeUnit.KB)
-        .setMaxWriteBufferNumber(3)
-        .setDisableSeekCompaction(true)
-        .setBlockSize(64 * SizeUnit.KB)
-        .setMaxBackgroundCompactions(10)
-        .setFilter(filter);
-    Statistics stats = options.statisticsPtr();
-
-    assert(options.createIfMissing() == true);
-    assert(options.writeBufferSize() == 8 * SizeUnit.KB);
-    assert(options.maxWriteBufferNumber() == 3);
-    assert(options.disableSeekCompaction() == true);
-    assert(options.blockSize() == 64 * SizeUnit.KB);
-    assert(options.maxBackgroundCompactions() == 10);
-
-    assert(options.memTableFactoryName().equals("SkipListFactory"));
-    options.setMemTableConfig(
-        new HashSkipListMemTableConfig()
-            .setHeight(4)
-            .setBranchingFactor(4)
-            .setBucketCount(2000000));
-    assert(options.memTableFactoryName().equals("HashSkipListRepFactory"));
-
-    options.setMemTableConfig(
-        new HashLinkedListMemTableConfig()
-            .setBucketCount(100000));
-    assert(options.memTableFactoryName().equals("HashLinkedListRepFactory"));
-
-    options.setMemTableConfig(
-        new VectorMemTableConfig().setReservedSize(10000));
-    assert(options.memTableFactoryName().equals("VectorRepFactory"));
-
-    options.setMemTableConfig(new SkipListMemTableConfig());
-    assert(options.memTableFactoryName().equals("SkipListFactory"));
-
-    options.setTableFormatConfig(new PlainTableConfig());
-    assert(options.tableFactoryName().equals("PlainTable"));
-
-    try {
-      db = RocksDB.open(options, db_path_not_found);
-      db.put("hello".getBytes(), "world".getBytes());
-      byte[] value = db.get("hello".getBytes());
-      assert("world".equals(new String(value)));
-    } catch (RocksDBException e) {
-      System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e);
-      assert(db == null);
-      assert(false);
-    }
-    // be sure to release the c++ pointer
-    db.close();
-
-    ReadOptions readOptions = new ReadOptions();
-    readOptions.setFillCache(false);
-
-    try {
-      db = RocksDB.open(options, db_path);
-      db.put("hello".getBytes(), "world".getBytes());
-      byte[] value = db.get("hello".getBytes());
-      System.out.format("Get('hello') = %s\n",
-          new String(value));
-
-      for (int i = 1; i <= 9; ++i) {
-        for (int j = 1; j <= 9; ++j) {
-          db.put(String.format("%dx%d", i, j).getBytes(),
-                 String.format("%d", i * j).getBytes());
-        }
-      }
-
-      for (int i = 1; i <= 9; ++i) {
-        for (int j = 1; j <= 9; ++j) {
-          System.out.format("%s ", new String(db.get(
-              String.format("%dx%d", i, j).getBytes())));
-        }
-        System.out.println("");
-      }
-
-      value = db.get("1x1".getBytes());
-      assert(value != null);
-      value = db.get("world".getBytes());
-      assert(value == null);
-      value = db.get(readOptions, "world".getBytes());
-      assert(value == null);
-
-      byte[] testKey = "asdf".getBytes();
-      byte[] testValue =
-          "asdfghjkl;'?><MNBVCXZQWERTYUIOP{+_)(*&^%$#@".getBytes();
-      db.put(testKey, testValue);
-      byte[] testResult = db.get(testKey);
-      assert(testResult != null);
-      assert(Arrays.equals(testValue, testResult));
-      assert(new String(testValue).equals(new String(testResult)));
-      testResult = db.get(readOptions, testKey);
-      assert(testResult != null);
-      assert(Arrays.equals(testValue, testResult));
-      assert(new String(testValue).equals(new String(testResult)));
-
-      byte[] insufficientArray = new byte[10];
-      byte[] enoughArray = new byte[50];
-      int len;
-      len = db.get(testKey, insufficientArray);
-      assert(len > insufficientArray.length);
-      len = db.get("asdfjkl;".getBytes(), enoughArray);
-      assert(len == RocksDB.NOT_FOUND);
-      len = db.get(testKey, enoughArray);
-      assert(len == testValue.length);
-
-      len = db.get(readOptions, testKey, insufficientArray);
-      assert(len > insufficientArray.length);
-      len = db.get(readOptions, "asdfjkl;".getBytes(), enoughArray);
-      assert(len == RocksDB.NOT_FOUND);
-      len = db.get(readOptions, testKey, enoughArray);
-      assert(len == testValue.length);
-
-      db.remove(testKey);
-      len = db.get(testKey, enoughArray);
-      assert(len == RocksDB.NOT_FOUND);
-
-      // repeat the test with WriteOptions
-      WriteOptions writeOpts = new WriteOptions();
-      writeOpts.setSync(true);
-      writeOpts.setDisableWAL(true);
-      db.put(writeOpts, testKey, testValue);
-      len = db.get(testKey, enoughArray);
-      assert(len == testValue.length);
-      assert(new String(testValue).equals(
-          new String(enoughArray, 0, len)));
-      writeOpts.dispose();
-
-      try {
-        for (TickerType statsType : TickerType.values()) {
-          stats.getTickerCount(statsType);
-        }
-        System.out.println("getTickerCount() passed.");
-      } catch (Exception e) {
-        System.out.println("Failed in call to getTickerCount()");
-        assert(false); //Should never reach here.
-      }
-
-      try {
-        for (HistogramType histogramType : HistogramType.values()) {
-          HistogramData data = stats.geHistogramData(histogramType);
-        }
-        System.out.println("geHistogramData() passed.");
-      } catch (Exception e) {
-        System.out.println("Failed in call to geHistogramData()");
-        assert(false); //Should never reach here.
-      }
-
-      Iterator iterator = db.newIterator();
-
-      boolean seekToFirstPassed = false;
-      for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) {
-        iterator.status();
-        assert(iterator.key() != null);
-        assert(iterator.value() != null);
-        seekToFirstPassed = true;
-      }
-      if(seekToFirstPassed) {
-        System.out.println("iterator seekToFirst tests passed.");
-      }
-
-      boolean seekToLastPassed = false;
-      for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
-        iterator.status();
-        assert(iterator.key() != null);
-        assert(iterator.value() != null);
-        seekToLastPassed = true;
-      }
-
-      if(seekToLastPassed) {
-        System.out.println("iterator seekToLastPassed tests passed.");
-      }
-
-      iterator.seekToFirst();
-      iterator.seek(iterator.key());
-      assert(iterator.key() != null);
-      assert(iterator.value() != null);
-
-      System.out.println("iterator seek test passed.");
-
-      iterator.dispose();
-      System.out.println("iterator tests passed.");
-
-      iterator = db.newIterator();
-      List<byte[]> keys = new ArrayList<byte[]>();
-      for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
-        keys.add(iterator.key());
-      }
-      iterator.dispose();
-
-      Map<byte[], byte[]> values = db.multiGet(keys);
-      assert(values.size() == keys.size());
-      for(byte[] value1 : values.values()) {
-        assert(value1 != null);
-      }
-
-      values = db.multiGet(new ReadOptions(), keys);
-      assert(values.size() == keys.size());
-      for(byte[] value1 : values.values()) {
-        assert(value1 != null);
-      }
-    } catch (RocksDBException e) {
-      System.err.println(e);
-    }
-    if (db != null) {
-      db.close();
-    }
-    // be sure to dispose c++ pointers
-    options.dispose();
-    readOptions.dispose();
-    filter.dispose();
-  }
-}
diff --git a/src/rocksdb/java/jdb_bench.sh b/src/rocksdb/java/jdb_bench.sh
deleted file mode 100755
index dba7dbd..0000000
--- a/src/rocksdb/java/jdb_bench.sh
+++ /dev/null
@@ -1 +0,0 @@
-java -server -d64 -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=.:../ -cp "rocksdbjni.jar:.:./*" org.rocksdb.benchmark.DbBenchmark $@
diff --git a/src/rocksdb/java/org/rocksdb/BackupableDB.java b/src/rocksdb/java/org/rocksdb/BackupableDB.java
deleted file mode 100644
index 91607d4..0000000
--- a/src/rocksdb/java/org/rocksdb/BackupableDB.java
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * A subclass of RocksDB which supports backup-related operations.
- *
- * @see BackupableDBOptions
- */
-public class BackupableDB extends RocksDB {
-  /**
-   * Open a BackupableDB under the specified path.
-   * Note that the backup path should be set properly in the
-   * input BackupableDBOptions.
-   *
-   * @param opt options for db.
-   * @param bopt backup related options.
-   * @param the db path for storing data.  The path for storing
-   *     backup should be specified in the BackupableDBOptions.
-   * @return reference to the opened BackupableDB.
-   */
-  public static BackupableDB open(
-      Options opt, BackupableDBOptions bopt, String db_path)
-      throws RocksDBException {
-    // since BackupableDB c++ will handle the life cycle of
-    // the returned RocksDB of RocksDB.open(), here we store
-    // it as a BackupableDB member variable to avoid GC.
-    BackupableDB bdb = new BackupableDB(RocksDB.open(opt, db_path));
-    bdb.open(bdb.db_.nativeHandle_, bopt.nativeHandle_);
-
-    return bdb;
-  }
-
-  /**
-   * Captures the state of the database in the latest backup.
-   * Note that this function is not thread-safe.
-   *
-   * @param flushBeforeBackup if true, then all data will be flushed
-   *     before creating backup.
-   */
-  public void createNewBackup(boolean flushBeforeBackup) {
-    createNewBackup(nativeHandle_, flushBeforeBackup);
-  }
-
-
-  /**
-   * Close the BackupableDB instance and release resource.
-   *
-   * Internally, BackupableDB owns the rocksdb::DB pointer to its
-   * associated RocksDB.  The release of that RocksDB pointer is
-   * handled in the destructor of the c++ rocksdb::BackupableDB and
-   * should be transparent to Java developers.
-   */
-  @Override public synchronized void close() {
-    if (isInitialized()) {
-      super.close();
-    }
-  }
-
-  /**
-   * A protected construction that will be used in the static factory
-   * method BackupableDB.open().
-   */
-  protected BackupableDB(RocksDB db) {
-    super();
-    db_ = db;
-  }
-
-  @Override protected void finalize() {
-    close();
-  }
-
-  protected native void open(long rocksDBHandle, long backupDBOptionsHandle);
-  protected native void createNewBackup(long handle, boolean flag);
-
-  private final RocksDB db_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/BackupableDBOptions.java b/src/rocksdb/java/org/rocksdb/BackupableDBOptions.java
deleted file mode 100644
index 2c64b60..0000000
--- a/src/rocksdb/java/org/rocksdb/BackupableDBOptions.java
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * BackupableDBOptions to control the behavior of a backupable database.
- * It will be used during the creation of a BackupableDB.
- *
- * Note that dispose() must be called before an Options instance
- * become out-of-scope to release the allocated memory in c++.
- */
-public class BackupableDBOptions extends RocksObject {
-  public BackupableDBOptions(String path) {
-    super();
-    newBackupableDBOptions(path);
-  }
-
-  /**
-   * Returns the path to the BackupableDB directory.
-   *
-   * @return the path to the BackupableDB directory.
-   */
-  public String backupDir() {
-    assert(isInitialized());
-    return backupDir(nativeHandle_);
-  }
-
-  /**
-   * Release the memory allocated for the current instance
-   * in the c++ side.
-   */
-  @Override public synchronized void dispose() {
-    if (isInitialized()) {
-      dispose(nativeHandle_);
-    }
-  }
-
-  private native void newBackupableDBOptions(String path);
-  private native String backupDir(long handle);
-  private native void dispose(long handle);
-}
diff --git a/src/rocksdb/java/org/rocksdb/BloomFilter.java b/src/rocksdb/java/org/rocksdb/BloomFilter.java
deleted file mode 100644
index 9c4913a..0000000
--- a/src/rocksdb/java/org/rocksdb/BloomFilter.java
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * This class creates a new filter policy that uses a bloom filter
- * with approximately the specified number of bits per key.
- * A good value for bitsPerKey is 10, which yields a filter
- * with ~ 1% false positive rate.
- *
- * Default value of bits per key is 10.
- */
-public class BloomFilter extends Filter {
-  private static final int DEFAULT_BITS_PER_KEY = 10;
-  private final int bitsPerKey_;
-
-  public BloomFilter() {
-    this(DEFAULT_BITS_PER_KEY);
-  }
-
-  public BloomFilter(int bitsPerKey) {
-    super();
-    bitsPerKey_ = bitsPerKey;
-
-    createNewFilter();
-  }
-
-  @Override
-  protected void createNewFilter() {
-    createNewFilter0(bitsPerKey_);
-  }
-
-  private native void createNewFilter0(int bitsKeyKey);
-}
diff --git a/src/rocksdb/java/org/rocksdb/Filter.java b/src/rocksdb/java/org/rocksdb/Filter.java
deleted file mode 100644
index 3a01ad4..0000000
--- a/src/rocksdb/java/org/rocksdb/Filter.java
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * Filters are stored in rocksdb and are consulted automatically
- * by rocksdb to decide whether or not to read some
- * information from disk. In many cases, a filter can cut down the
- * number of disk seeks form a handful to a single disk seek per
- * DB::Get() call.
- */
-public abstract class Filter extends RocksObject {
-  protected abstract void createNewFilter();
-
-  /**
-   * Deletes underlying C++ filter pointer.
-   *
-   * Note that this function should be called only after all
-   * RocksDB instances referencing the filter are closed.
-   * Otherwise an undefined behavior will occur.
-   */
-  @Override public synchronized void dispose() {
-    if (isInitialized()) {
-      dispose0(nativeHandle_);
-    }
-  }
-
-  private native void dispose0(long handle);
-}
diff --git a/src/rocksdb/java/org/rocksdb/HashLinkedListMemTableConfig.java b/src/rocksdb/java/org/rocksdb/HashLinkedListMemTableConfig.java
deleted file mode 100644
index 24fcd8b..0000000
--- a/src/rocksdb/java/org/rocksdb/HashLinkedListMemTableConfig.java
+++ /dev/null
@@ -1,52 +0,0 @@
-package org.rocksdb;
-
-/**
- * The config for hash linked list memtable representation
- * Such memtable contains a fix-sized array of buckets, where
- * each bucket points to a sorted singly-linked
- * list (or null if the bucket is empty).
- *
- * Note that since this mem-table representation relies on the
- * key prefix, it is required to invoke one of the usePrefixExtractor
- * functions to specify how to extract key prefix given a key.
- * If proper prefix-extractor is not set, then RocksDB will
- * use the default memtable representation (SkipList) instead
- * and post a warning in the LOG.
- */
-public class HashLinkedListMemTableConfig extends MemTableConfig {
-  public static final long DEFAULT_BUCKET_COUNT = 50000;
-
-  public HashLinkedListMemTableConfig() {
-    bucketCount_ = DEFAULT_BUCKET_COUNT;
-  }
-
-  /**
-   * Set the number of buckets in the fixed-size array used
-   * in the hash linked-list mem-table.
-   *
-   * @param count the number of hash buckets.
-   * @return the reference to the current HashLinkedListMemTableConfig.
-   */
-  public HashLinkedListMemTableConfig setBucketCount(long count) {
-    bucketCount_ = count;
-    return this;
-  }
-
-  /**
-   * Returns the number of buckets that will be used in the memtable
-   * created based on this config.
-   *
-   * @return the number of buckets
-   */
-  public long bucketCount() {
-    return bucketCount_;
-  }
-
-  @Override protected long newMemTableFactoryHandle() {
-    return newMemTableFactoryHandle(bucketCount_);
-  }
-
-  private native long newMemTableFactoryHandle(long bucketCount);
-
-  private long bucketCount_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/HashSkipListMemTableConfig.java b/src/rocksdb/java/org/rocksdb/HashSkipListMemTableConfig.java
deleted file mode 100644
index 74fb0db..0000000
--- a/src/rocksdb/java/org/rocksdb/HashSkipListMemTableConfig.java
+++ /dev/null
@@ -1,97 +0,0 @@
-package org.rocksdb;
-
-/**
- * The config for hash skip-list mem-table representation.
- * Such mem-table representation contains a fix-sized array of
- * buckets, where each bucket points to a skiplist (or null if the
- * bucket is empty).
- *
- * Note that since this mem-table representation relies on the
- * key prefix, it is required to invoke one of the usePrefixExtractor
- * functions to specify how to extract key prefix given a key.
- * If proper prefix-extractor is not set, then RocksDB will
- * use the default memtable representation (SkipList) instead
- * and post a warning in the LOG.
- */
-public class HashSkipListMemTableConfig extends MemTableConfig {
-  public static final int DEFAULT_BUCKET_COUNT = 1000000;
-  public static final int DEFAULT_BRANCHING_FACTOR = 4;
-  public static final int DEFAULT_HEIGHT = 4;
-
-  public HashSkipListMemTableConfig() {
-    bucketCount_ = DEFAULT_BUCKET_COUNT;
-    branchingFactor_ = DEFAULT_BRANCHING_FACTOR;
-    height_ = DEFAULT_HEIGHT;
-  }
-
-  /**
-   * Set the number of hash buckets used in the hash skiplist memtable.
-   * Default = 1000000.
-   *
-   * @param count the number of hash buckets used in the hash
-   *    skiplist memtable.
-   * @return the reference to the current HashSkipListMemTableConfig.
-   */
-  public HashSkipListMemTableConfig setBucketCount(long count) {
-    bucketCount_ = count;
-    return this;
-  }
-
-  /**
-   * @return the number of hash buckets
-   */
-  public long bucketCount() {
-    return bucketCount_;
-  }
-
-  /**
-   * Set the height of the skip list.  Default = 4.
-   *
-   * @return the reference to the current HashSkipListMemTableConfig.
-   */
-  public HashSkipListMemTableConfig setHeight(int height) {
-    height_ = height;
-    return this;
-  }
-
-  /**
-   * @return the height of the skip list.
-   */
-  public int height() {
-    return height_;
-  }
-
-  /**
-   * Set the branching factor used in the hash skip-list memtable.
-   * This factor controls the probabilistic size ratio between adjacent
-   * links in the skip list.
-   *
-   * @param bf the probabilistic size ratio between adjacent link
-   *     lists in the skip list.
-   * @return the reference to the current HashSkipListMemTableConfig.
-   */
-  public HashSkipListMemTableConfig setBranchingFactor(int bf) {
-    branchingFactor_ = bf;
-    return this;
-  }
-
-  /**
-   * @return branching factor, the probabilistic size ratio between
-   *     adjacent links in the skip list.
-   */
-  public int branchingFactor() {
-    return branchingFactor_;
-  }
-
-  @Override protected long newMemTableFactoryHandle() {
-    return newMemTableFactoryHandle(
-        bucketCount_, height_, branchingFactor_);
-  }
-
-  private native long newMemTableFactoryHandle(
-      long bucketCount, int height, int branchingFactor);
-
-  private long bucketCount_;
-  private int branchingFactor_;
-  private int height_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/HistogramData.java b/src/rocksdb/java/org/rocksdb/HistogramData.java
deleted file mode 100644
index 3b2e295..0000000
--- a/src/rocksdb/java/org/rocksdb/HistogramData.java
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-public class HistogramData {
-  private final double median_;
-  private final double percentile95_;
-  private final double percentile99_;
-  private final double average_;
-  private final double standardDeviation_;
-
-  public HistogramData(double median, double percentile95,
-      double percentile99, double average, double standardDeviation) {
-    median_ = median;
-    percentile95_ = percentile95;
-    percentile99_ = percentile99;
-    average_ = average;
-    standardDeviation_ = standardDeviation;
-  }
-
-  public double getMedian() {
-    return median_;
-  }
-
-  public double getPercentile95() {
-    return percentile95_;
-  }
-
-  public double getPercentile99() {
-    return percentile99_;
-  }
-
-  public double getAverage() {
-    return average_;
-  }
-
-  public double getStandardDeviation() {
-    return standardDeviation_;
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/HistogramType.java b/src/rocksdb/java/org/rocksdb/HistogramType.java
deleted file mode 100644
index 751c03a..0000000
--- a/src/rocksdb/java/org/rocksdb/HistogramType.java
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-public enum HistogramType {
-  DB_GET(0),
-  DB_WRITE(1),
-  COMPACTION_TIME(2),
-  TABLE_SYNC_MICROS(3),
-  COMPACTION_OUTFILE_SYNC_MICROS(4),
-  WAL_FILE_SYNC_MICROS(5),
-  MANIFEST_FILE_SYNC_MICROS(6),
-  // TIME SPENT IN IO DURING TABLE OPEN
-  TABLE_OPEN_IO_MICROS(7),
-  DB_MULTIGET(8),
-  READ_BLOCK_COMPACTION_MICROS(9),
-  READ_BLOCK_GET_MICROS(10),
-  WRITE_RAW_BLOCK_MICROS(11),
-
-  STALL_L0_SLOWDOWN_COUNT(12),
-  STALL_MEMTABLE_COMPACTION_COUNT(13),
-  STALL_L0_NUM_FILES_COUNT(14),
-  HARD_RATE_LIMIT_DELAY_COUNT(15),
-  SOFT_RATE_LIMIT_DELAY_COUNT(16),
-  NUM_FILES_IN_SINGLE_COMPACTION(17);
-
-  private final int value_;
-
-  private HistogramType(int value) {
-    value_ = value;
-  }
-
-  public int getValue() {
-    return value_;
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/Iterator.java b/src/rocksdb/java/org/rocksdb/Iterator.java
deleted file mode 100644
index 3c745a4..0000000
--- a/src/rocksdb/java/org/rocksdb/Iterator.java
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * An iterator yields a sequence of key/value pairs from a source.
- * The following class defines the interface.  Multiple implementations
- * are provided by this library.  In particular, iterators are provided
- * to access the contents of a Table or a DB.
- *
- * Multiple threads can invoke const methods on an Iterator without
- * external synchronization, but if any of the threads may call a
- * non-const method, all threads accessing the same Iterator must use
- * external synchronization.
- */
-public class Iterator extends RocksObject {
-  public Iterator(long nativeHandle) {
-    super();
-    nativeHandle_ = nativeHandle;
-  }
-
-  /**
-   * An iterator is either positioned at a key/value pair, or
-   * not valid.  This method returns true iff the iterator is valid.
-   * @return true if iterator is valid.
-   */
-  public boolean isValid() {
-    assert(isInitialized());
-    return isValid0(nativeHandle_);
-  }
-
-  /**
-   * Position at the first key in the source.  The iterator is Valid()
-   * after this call iff the source is not empty.
-   */
-  public void seekToFirst() {
-    assert(isInitialized());
-    seekToFirst0(nativeHandle_);
-  }
-
-  /**
-   * Position at the last key in the source.  The iterator is
-   * Valid() after this call iff the source is not empty.
-   */
-  public void seekToLast() {
-    assert(isInitialized());
-    seekToLast0(nativeHandle_);
-  }
-
-  /**
-   * Moves to the next entry in the source.  After this call, Valid() is
-   * true iff the iterator was not positioned at the last entry in the source.
-   * REQUIRES: Valid()
-   */
-  public void next() {
-    assert(isInitialized());
-    next0(nativeHandle_);
-  }
-
-  /**
-   * Moves to the previous entry in the source.  After this call, Valid() is
-   * true iff the iterator was not positioned at the first entry in source.
-   * REQUIRES: Valid()
-   */
-  public void prev() {
-    assert(isInitialized());
-    prev0(nativeHandle_);
-  }
-
-  /**
-   * Return the key for the current entry.  The underlying storage for
-   * the returned slice is valid only until the next modification of
-   * the iterator.
-   * REQUIRES: Valid()
-   * @return key for the current entry.
-   */
-  public byte[] key() {
-    assert(isInitialized());
-    return key0(nativeHandle_);
-  }
-
-  /**
-   * Return the value for the current entry.  The underlying storage for
-   * the returned slice is valid only until the next modification of
-   * the iterator.
-   * REQUIRES: !AtEnd() && !AtStart()
-   * @return value for the current entry.
-   */
-  public byte[] value() {
-    assert(isInitialized());
-    return value0(nativeHandle_);
-  }
-
-  /**
-   * Position at the first key in the source that at or past target
-   * The iterator is Valid() after this call iff the source contains
-   * an entry that comes at or past target.
-   */
-  public void seek(byte[] target) {
-    assert(isInitialized());
-    seek0(nativeHandle_, target, target.length);
-  }
-
-  /**
-   * If an error has occurred, return it.  Else return an ok status.
-   * If non-blocking IO is requested and this operation cannot be
-   * satisfied without doing some IO, then this returns Status::Incomplete().
-   *
-   */
-  public void status() throws RocksDBException {
-    assert(isInitialized());
-    status0(nativeHandle_);
-  }
-
-  /**
-   * Deletes underlying C++ iterator pointer.
-   */
-  @Override public synchronized void dispose() {
-    if(isInitialized()) {
-      dispose(nativeHandle_);
-      nativeHandle_ = 0;
-    }
-  }
-
-  private native boolean isValid0(long handle);
-  private native void dispose(long handle);
-  private native void seekToFirst0(long handle);
-  private native void seekToLast0(long handle);
-  private native void next0(long handle);
-  private native void prev0(long handle);
-  private native byte[] key0(long handle);
-  private native byte[] value0(long handle);
-  private native void seek0(long handle, byte[] target, int targetLen);
-  private native void status0(long handle);
-}
diff --git a/src/rocksdb/java/org/rocksdb/MemTableConfig.java b/src/rocksdb/java/org/rocksdb/MemTableConfig.java
deleted file mode 100644
index a473c25..0000000
--- a/src/rocksdb/java/org/rocksdb/MemTableConfig.java
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-package org.rocksdb;
-
-/**
- * MemTableConfig is used to config the internal mem-table of a RocksDB.
- * It is required for each memtable to have one such sub-class to allow
- * Java developers to use it.
- *
- * To make a RocksDB to use a specific MemTable format, its associated
- * MemTableConfig should be properly set and passed into Options
- * via Options.setMemTableFactory() and open the db using that Options.
- *
- * @see Options
- */
-public abstract class MemTableConfig {
-  /**
-   * This function should only be called by Options.setMemTableConfig(),
-   * which will create a c++ shared-pointer to the c++ MemTableRepFactory
-   * that associated with the Java MemTableConfig.
-   *
-   * @see Options.setMemTableFactory()
-   */
-  abstract protected long newMemTableFactoryHandle();
-}
diff --git a/src/rocksdb/java/org/rocksdb/Options.java b/src/rocksdb/java/org/rocksdb/Options.java
deleted file mode 100644
index 02d3e20..0000000
--- a/src/rocksdb/java/org/rocksdb/Options.java
+++ /dev/null
@@ -1,2355 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * Options to control the behavior of a database.  It will be used
- * during the creation of a RocksDB (i.e., RocksDB.open()).
- *
- * Note that dispose() must be called before an Options instance
- * become out-of-scope to release the allocated memory in c++.
- */
-public class Options extends RocksObject {
-  static final long DEFAULT_CACHE_SIZE = 8 << 20;
-  /**
-   * Construct options for opening a RocksDB.
-   *
-   * This constructor will create (by allocating a block of memory)
-   * an rocksdb::Options in the c++ side.
-   */
-  public Options() {
-    super();
-    cacheSize_ = DEFAULT_CACHE_SIZE;
-    newOptions();
-  }
-
-  /**
-   * If this value is set to true, then the database will be created
-   * if it is missing during RocksDB.open().
-   * Default: false
-   *
-   * @param flag a flag indicating whether to create a database the
-   *     specified database in RocksDB.open() operation is missing.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setCreateIfMissing(boolean flag) {
-    assert(isInitialized());
-    setCreateIfMissing(nativeHandle_, flag);
-    return this;
-  }
-
-  /**
-   * Return true if the create_if_missing flag is set to true.
-   * If true, the database will be created if it is missing.
-   *
-   * @return true if the createIfMissing option is set to true.
-   * @see setCreateIfMissing()
-   */
-  public boolean createIfMissing() {
-    assert(isInitialized());
-    return createIfMissing(nativeHandle_);
-  }
-
-  /**
-   * Amount of data to build up in memory (backed by an unsorted log
-   * on disk) before converting to a sorted on-disk file.
-   *
-   * Larger values increase performance, especially during bulk loads.
-   * Up to max_write_buffer_number write buffers may be held in memory
-   * at the same time, so you may wish to adjust this parameter
-   * to control memory usage.
-   *
-   * Also, a larger write buffer will result in a longer recovery time
-   * the next time the database is opened.
-   *
-   * Default: 4MB
-   * @param writeBufferSize the size of write buffer.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setWriteBufferSize(long writeBufferSize) {
-    assert(isInitialized());
-    setWriteBufferSize(nativeHandle_, writeBufferSize);
-    return this;
-  }
-
-  /**
-   * Return size of write buffer size.
-   *
-   * @return size of write buffer.
-   * @see setWriteBufferSize()
-   */
-  public long writeBufferSize()  {
-    assert(isInitialized());
-    return writeBufferSize(nativeHandle_);
-  }
-
-  /**
-   * The maximum number of write buffers that are built up in memory.
-   * The default is 2, so that when 1 write buffer is being flushed to
-   * storage, new writes can continue to the other write buffer.
-   * Default: 2
-   *
-   * @param maxWriteBufferNumber maximum number of write buffers.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setMaxWriteBufferNumber(int maxWriteBufferNumber) {
-    assert(isInitialized());
-    setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber);
-    return this;
-  }
-
-  /**
-   * Returns maximum number of write buffers.
-   *
-   * @return maximum number of write buffers.
-   * @see setMaxWriteBufferNumber()
-   */
-  public int maxWriteBufferNumber() {
-    assert(isInitialized());
-    return maxWriteBufferNumber(nativeHandle_);
-  }
-
-  /*
-   * Approximate size of user data packed per block.  Note that the
-   * block size specified here corresponds to uncompressed data.  The
-   * actual size of the unit read from disk may be smaller if
-   * compression is enabled.  This parameter can be changed dynamically.
-   *
-   * Default: 4K
-   *
-   * @param blockSize the size of each block in bytes.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setBlockSize(long blockSize) {
-    assert(isInitialized());
-    setBlockSize(nativeHandle_, blockSize);
-    return this;
-  }
-
-  /*
-   * Returns the size of a block in bytes.
-   *
-   * @return block size.
-   * @see setBlockSize()
-   */
-  public long blockSize() {
-    assert(isInitialized());
-    return blockSize(nativeHandle_);
-  }
-
-  /**
-   * Use the specified filter policy to reduce disk reads.
-   *
-   * Note that the caller should not dispose the input filter as
-   * Options.dispose() will dispose this filter.
-   *
-   * @param Filter policy java instance.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setFilter(Filter filter) {
-    assert(isInitialized());
-    setFilterHandle(nativeHandle_, filter.nativeHandle_);
-    filter_ = filter;
-    return this;
-  }
-  private native void setFilterHandle(long optHandle, long filterHandle);
-
-  /*
-   * Disable compaction triggered by seek.
-   * With bloomfilter and fast storage, a miss on one level
-   * is very cheap if the file handle is cached in table cache
-   * (which is true if max_open_files is large).
-   * Default: true
-   *
-   * @param disableSeekCompaction a boolean value to specify whether
-   *     to disable seek compaction.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setDisableSeekCompaction(boolean disableSeekCompaction) {
-    assert(isInitialized());
-    setDisableSeekCompaction(nativeHandle_, disableSeekCompaction);
-    return this;
-  }
-
-  /*
-   * Returns true if disable seek compaction is set to true.
-   *
-   * @return true if disable seek compaction is set to true.
-   * @see setDisableSeekCompaction()
-   */
-  public boolean disableSeekCompaction() {
-    assert(isInitialized());
-    return disableSeekCompaction(nativeHandle_);
-  }
-
-  /**
-   * Set the amount of cache in bytes that will be used by RocksDB.
-   * If cacheSize is non-positive, then cache will not be used.
-   *
-   * DEFAULT: 8M
-   */
-  public Options setCacheSize(long cacheSize) {
-    cacheSize_ = cacheSize;
-    return this;
-  }
-
-  /**
-   * @return the amount of cache in bytes that will be used by RocksDB.
-   */
-  public long cacheSize() {
-    return cacheSize_;
-  }
-
-  /**
-   * If true, an error will be thrown during RocksDB.open() if the
-   * database already exists.
-   *
-   * @return if true, an error is raised when the specified database
-   *    already exists before open.
-   */
-  public boolean errorIfExists() {
-    assert(isInitialized());
-    return errorIfExists(nativeHandle_);
-  }
-  private native boolean errorIfExists(long handle);
-
-  /**
-   * If true, an error will be thrown during RocksDB.open() if the
-   * database already exists.
-   * Default: false
-   *
-   * @param errorIfExists if true, an exception will be thrown
-   *     during RocksDB.open() if the database already exists.
-   * @return the reference to the current option.
-   * @see RocksDB.open()
-   */
-  public Options setErrorIfExists(boolean errorIfExists) {
-    assert(isInitialized());
-    setErrorIfExists(nativeHandle_, errorIfExists);
-    return this;
-  }
-  private native void setErrorIfExists(long handle, boolean errorIfExists);
-
-  /**
-   * If true, the implementation will do aggressive checking of the
-   * data it is processing and will stop early if it detects any
-   * errors.  This may have unforeseen ramifications: for example, a
-   * corruption of one DB entry may cause a large number of entries to
-   * become unreadable or for the entire DB to become unopenable.
-   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
-   * the database will switch to read-only mode and fail all other
-   * Write operations.
-   *
-   * @return a boolean indicating whether paranoid-check is on.
-   */
-  public boolean paranoidChecks() {
-    assert(isInitialized());
-    return paranoidChecks(nativeHandle_);
-  }
-  private native boolean paranoidChecks(long handle);
-
-  /**
-   * If true, the implementation will do aggressive checking of the
-   * data it is processing and will stop early if it detects any
-   * errors.  This may have unforeseen ramifications: for example, a
-   * corruption of one DB entry may cause a large number of entries to
-   * become unreadable or for the entire DB to become unopenable.
-   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
-   * the database will switch to read-only mode and fail all other
-   * Write operations.
-   * Default: true
-   *
-   * @param paranoidChecks a flag to indicate whether paranoid-check
-   *     is on.
-   * @return the reference to the current option.
-   */
-  public Options setParanoidChecks(boolean paranoidChecks) {
-    assert(isInitialized());
-    setParanoidChecks(nativeHandle_, paranoidChecks);
-    return this;
-  }
-  private native void setParanoidChecks(
-      long handle, boolean paranoidChecks);
-
-  /**
-   * Number of open files that can be used by the DB.  You may need to
-   * increase this if your database has a large working set. Value -1 means
-   * files opened are always kept open. You can estimate number of files based
-   * on target_file_size_base and target_file_size_multiplier for level-based
-   * compaction. For universal-style compaction, you can usually set it to -1.
-   *
-   * @return the maximum number of open files.
-   */
-  public int maxOpenFiles() {
-    assert(isInitialized());
-    return maxOpenFiles(nativeHandle_);
-  }
-  private native int maxOpenFiles(long handle);
-
-  /**
-   * Number of open files that can be used by the DB.  You may need to
-   * increase this if your database has a large working set. Value -1 means
-   * files opened are always kept open. You can estimate number of files based
-   * on target_file_size_base and target_file_size_multiplier for level-based
-   * compaction. For universal-style compaction, you can usually set it to -1.
-   * Default: 5000
-   *
-   * @param maxOpenFiles the maximum number of open files.
-   * @return the reference to the current option.
-   */
-  public Options setMaxOpenFiles(int maxOpenFiles) {
-    assert(isInitialized());
-    setMaxOpenFiles(nativeHandle_, maxOpenFiles);
-    return this;
-  }
-  private native void setMaxOpenFiles(long handle, int maxOpenFiles);
-
-  /**
-   * If true, then the contents of data files are not synced
-   * to stable storage. Their contents remain in the OS buffers till the
-   * OS decides to flush them. This option is good for bulk-loading
-   * of data. Once the bulk-loading is complete, please issue a
-   * sync to the OS to flush all dirty buffesrs to stable storage.
-   *
-   * @return if true, then data-sync is disabled.
-   */
-  public boolean disableDataSync() {
-    assert(isInitialized());
-    return disableDataSync(nativeHandle_);
-  }
-  private native boolean disableDataSync(long handle);
-
-  /**
-   * If true, then the contents of data files are not synced
-   * to stable storage. Their contents remain in the OS buffers till the
-   * OS decides to flush them. This option is good for bulk-loading
-   * of data. Once the bulk-loading is complete, please issue a
-   * sync to the OS to flush all dirty buffesrs to stable storage.
-   * Default: false
-   *
-   * @param disableDataSync a boolean flag to specify whether to
-   *     disable data sync.
-   * @return the reference to the current option.
-   */
-  public Options setDisableDataSync(boolean disableDataSync) {
-    assert(isInitialized());
-    setDisableDataSync(nativeHandle_, disableDataSync);
-    return this;
-  }
-  private native void setDisableDataSync(long handle, boolean disableDataSync);
-
-  /**
-   * If true, then every store to stable storage will issue a fsync.
-   * If false, then every store to stable storage will issue a fdatasync.
-   * This parameter should be set to true while storing data to
-   * filesystem like ext3 that can lose files after a reboot.
-   *
-   * @return true if fsync is used.
-   */
-  public boolean useFsync() {
-    assert(isInitialized());
-    return useFsync(nativeHandle_);
-  }
-  private native boolean useFsync(long handle);
-
-  /**
-   * If true, then every store to stable storage will issue a fsync.
-   * If false, then every store to stable storage will issue a fdatasync.
-   * This parameter should be set to true while storing data to
-   * filesystem like ext3 that can lose files after a reboot.
-   * Default: false
-   *
-   * @param useFsync a boolean flag to specify whether to use fsync
-   * @return the reference to the current option.
-   */
-  public Options setUseFsync(boolean useFsync) {
-    assert(isInitialized());
-    setUseFsync(nativeHandle_, useFsync);
-    return this;
-  }
-  private native void setUseFsync(long handle, boolean useFsync);
-
-  /**
-   * The time interval in seconds between each two consecutive stats logs.
-   * This number controls how often a new scribe log about
-   * db deploy stats is written out.
-   * -1 indicates no logging at all.
-   *
-   * @return the time interval in seconds between each two consecutive
-   *     stats logs.
-   */
-  public int dbStatsLogInterval() {
-    assert(isInitialized());
-    return dbStatsLogInterval(nativeHandle_);
-  }
-  private native int dbStatsLogInterval(long handle);
-
-  /**
-   * The time interval in seconds between each two consecutive stats logs.
-   * This number controls how often a new scribe log about
-   * db deploy stats is written out.
-   * -1 indicates no logging at all.
-   * Default value is 1800 (half an hour).
-   *
-   * @param dbStatsLogInterval the time interval in seconds between each
-   *     two consecutive stats logs.
-   * @return the reference to the current option.
-   */
-  public Options setDbStatsLogInterval(int dbStatsLogInterval) {
-    assert(isInitialized());
-    setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval);
-    return this;
-  }
-  private native void setDbStatsLogInterval(
-      long handle, int dbStatsLogInterval);
-
-  /**
-   * Returns the directory of info log.
-   *
-   * If it is empty, the log files will be in the same dir as data.
-   * If it is non empty, the log files will be in the specified dir,
-   * and the db data dir's absolute path will be used as the log file
-   * name's prefix.
-   *
-   * @return the path to the info log directory
-   */
-  public String dbLogDir() {
-    assert(isInitialized());
-    return dbLogDir(nativeHandle_);
-  }
-  private native String dbLogDir(long handle);
-
-  /**
-   * This specifies the info LOG dir.
-   * If it is empty, the log files will be in the same dir as data.
-   * If it is non empty, the log files will be in the specified dir,
-   * and the db data dir's absolute path will be used as the log file
-   * name's prefix.
-   *
-   * @param dbLogDir the path to the info log directory
-   * @return the reference to the current option.
-   */
-  public Options setDbLogDir(String dbLogDir) {
-    assert(isInitialized());
-    setDbLogDir(nativeHandle_, dbLogDir);
-    return this;
-  }
-  private native void setDbLogDir(long handle, String dbLogDir);
-
-  /**
-   * Returns the path to the write-ahead-logs (WAL) directory.
-   *
-   * If it is empty, the log files will be in the same dir as data,
-   *   dbname is used as the data dir by default
-   * If it is non empty, the log files will be in kept the specified dir.
-   * When destroying the db,
-   *   all log files in wal_dir and the dir itself is deleted
-   *
-   * @return the path to the write-ahead-logs (WAL) directory.
-   */
-  public String walDir() {
-    assert(isInitialized());
-    return walDir(nativeHandle_);
-  }
-  private native String walDir(long handle);
-
-  /**
-   * This specifies the absolute dir path for write-ahead logs (WAL).
-   * If it is empty, the log files will be in the same dir as data,
-   *   dbname is used as the data dir by default
-   * If it is non empty, the log files will be in kept the specified dir.
-   * When destroying the db,
-   *   all log files in wal_dir and the dir itself is deleted
-   *
-   * @param walDir the path to the write-ahead-log directory.
-   * @return the reference to the current option.
-   */
-  public Options setWalDir(String walDir) {
-    assert(isInitialized());
-    setWalDir(nativeHandle_, walDir);
-    return this;
-  }
-  private native void setWalDir(long handle, String walDir);
-
-  /**
-   * The periodicity when obsolete files get deleted. The default
-   * value is 6 hours. The files that get out of scope by compaction
-   * process will still get automatically delete on every compaction,
-   * regardless of this setting
-   *
-   * @return the time interval in micros when obsolete files will be deleted.
-   */
-  public long deleteObsoleteFilesPeriodMicros() {
-    assert(isInitialized());
-    return deleteObsoleteFilesPeriodMicros(nativeHandle_);
-  }
-  private native long deleteObsoleteFilesPeriodMicros(long handle);
-
-  /**
-   * The periodicity when obsolete files get deleted. The default
-   * value is 6 hours. The files that get out of scope by compaction
-   * process will still get automatically delete on every compaction,
-   * regardless of this setting
-   *
-   * @param micros the time interval in micros
-   * @return the reference to the current option.
-   */
-  public Options setDeleteObsoleteFilesPeriodMicros(long micros) {
-    assert(isInitialized());
-    setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros);
-    return this;
-  }
-  private native void setDeleteObsoleteFilesPeriodMicros(
-      long handle, long micros);
-
-  /**
-   * Returns the maximum number of concurrent background compaction jobs,
-   * submitted to the default LOW priority thread pool.
-   * When increasing this number, we may also want to consider increasing
-   * number of threads in LOW priority thread pool.
-   * Default: 1
-   *
-   * @return the maximum number of concurrent background compaction jobs.
-   * @see Env.setBackgroundThreads()
-   */
-  public int maxBackgroundCompactions() {
-    assert(isInitialized());
-    return maxBackgroundCompactions(nativeHandle_);
-  }
-
-  /**
-   * Creates statistics object which collects metrics about database operations.
-     Statistics objects should not be shared between DB instances as
-     it does not use any locks to prevent concurrent updates.
-   *
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options createStatistics() {
-    assert(isInitialized());
-    createStatistics(nativeHandle_);
-    return this;
-  }
-
-  /**
-   * Returns statistics object. Calls createStatistics() if
-   * C++ returns NULL pointer for statistics.
-   *
-   * @return the instance of the statistics object.
-   * @see createStatistics()
-   */
-  public Statistics statisticsPtr() {
-    assert(isInitialized());
-
-    long statsPtr = statisticsPtr(nativeHandle_);
-    if(statsPtr == 0) {
-      createStatistics();
-      statsPtr = statisticsPtr(nativeHandle_);
-    }
-
-    return new Statistics(statsPtr);
-  }
-
-  /**
-   * Specifies the maximum number of concurrent background compaction jobs,
-   * submitted to the default LOW priority thread pool.
-   * If you're increasing this, also consider increasing number of threads in
-   * LOW priority thread pool. For more information, see
-   * Default: 1
-   *
-   * @param maxBackgroundCompactions the maximum number of background
-   *     compaction jobs.
-   * @return the reference to the current option.
-   *
-   * @see Env.setBackgroundThreads()
-   * @see maxBackgroundFlushes()
-   */
-  public Options setMaxBackgroundCompactions(int maxBackgroundCompactions) {
-    assert(isInitialized());
-    setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions);
-    return this;
-  }
-
-  /**
-   * Returns the maximum number of concurrent background flush jobs.
-   * If you're increasing this, also consider increasing number of threads in
-   * HIGH priority thread pool. For more information, see
-   * Default: 1
-   *
-   * @return the maximum number of concurrent background flush jobs.
-   * @see Env.setBackgroundThreads()
-   */
-  public int maxBackgroundFlushes() {
-    assert(isInitialized());
-    return maxBackgroundFlushes(nativeHandle_);
-  }
-  private native int maxBackgroundFlushes(long handle);
-
-  /**
-   * Specifies the maximum number of concurrent background flush jobs.
-   * If you're increasing this, also consider increasing number of threads in
-   * HIGH priority thread pool. For more information, see
-   * Default: 1
-   *
-   * @param maxBackgroundFlushes
-   * @return the reference to the current option.
-   *
-   * @see Env.setBackgroundThreads()
-   * @see maxBackgroundCompactions()
-   */
-  public Options setMaxBackgroundFlushes(int maxBackgroundFlushes) {
-    assert(isInitialized());
-    setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes);
-    return this;
-  }
-  private native void setMaxBackgroundFlushes(
-      long handle, int maxBackgroundFlushes);
-
-  /**
-   * Returns the maximum size of a info log file. If the current log file
-   * is larger than this size, a new info log file will be created.
-   * If 0, all logs will be written to one log file.
-   *
-   * @return the maximum size of the info log file.
-   */
-  public long maxLogFileSize() {
-    assert(isInitialized());
-    return maxLogFileSize(nativeHandle_);
-  }
-  private native long maxLogFileSize(long handle);
-
-  /**
-   * Specifies the maximum size of a info log file. If the current log file
-   * is larger than `max_log_file_size`, a new info log file will
-   * be created.
-   * If 0, all logs will be written to one log file.
-   *
-   * @param maxLogFileSize the maximum size of a info log file.
-   * @return the reference to the current option.
-   */
-  public Options setMaxLogFileSize(long maxLogFileSize) {
-    assert(isInitialized());
-    setMaxLogFileSize(nativeHandle_, maxLogFileSize);
-    return this;
-  }
-  private native void setMaxLogFileSize(long handle, long maxLogFileSize);
-
-  /**
-   * Returns the time interval for the info log file to roll (in seconds).
-   * If specified with non-zero value, log file will be rolled
-   * if it has been active longer than `log_file_time_to_roll`.
-   * Default: 0 (disabled)
-   *
-   * @return the time interval in seconds.
-   */
-  public long logFileTimeToRoll() {
-    assert(isInitialized());
-    return logFileTimeToRoll(nativeHandle_);
-  }
-  private native long logFileTimeToRoll(long handle);
-
-  /**
-   * Specifies the time interval for the info log file to roll (in seconds).
-   * If specified with non-zero value, log file will be rolled
-   * if it has been active longer than `log_file_time_to_roll`.
-   * Default: 0 (disabled)
-   *
-   * @param logFileTimeToRoll the time interval in seconds.
-   * @return the reference to the current option.
-   */
-  public Options setLogFileTimeToRoll(long logFileTimeToRoll) {
-    assert(isInitialized());
-    setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll);
-    return this;
-  }
-  private native void setLogFileTimeToRoll(
-      long handle, long logFileTimeToRoll);
-
-  /**
-   * Returns the maximum number of info log files to be kept.
-   * Default: 1000
-   *
-   * @return the maximum number of info log files to be kept.
-   */
-  public long keepLogFileNum() {
-    assert(isInitialized());
-    return keepLogFileNum(nativeHandle_);
-  }
-  private native long keepLogFileNum(long handle);
-
-  /**
-   * Specifies the maximum number of info log files to be kept.
-   * Default: 1000
-   *
-   * @param keepLogFileNum the maximum number of info log files to be kept.
-   * @return the reference to the current option.
-   */
-  public Options setKeepLogFileNum(long keepLogFileNum) {
-    assert(isInitialized());
-    setKeepLogFileNum(nativeHandle_, keepLogFileNum);
-    return this;
-  }
-  private native void setKeepLogFileNum(long handle, long keepLogFileNum);
-
-  /**
-   * Manifest file is rolled over on reaching this limit.
-   * The older manifest file be deleted.
-   * The default value is MAX_INT so that roll-over does not take place.
-   *
-   * @return the size limit of a manifest file.
-   */
-  public long maxManifestFileSize() {
-    assert(isInitialized());
-    return maxManifestFileSize(nativeHandle_);
-  }
-  private native long maxManifestFileSize(long handle);
-
-  /**
-   * Manifest file is rolled over on reaching this limit.
-   * The older manifest file be deleted.
-   * The default value is MAX_INT so that roll-over does not take place.
-   *
-   * @param maxManifestFileSize the size limit of a manifest file.
-   * @return the reference to the current option.
-   */
-  public Options setMaxManifestFileSize(long maxManifestFileSize) {
-    assert(isInitialized());
-    setMaxManifestFileSize(nativeHandle_, maxManifestFileSize);
-    return this;
-  }
-  private native void setMaxManifestFileSize(
-      long handle, long maxManifestFileSize);
-
-  /**
-   * Number of shards used for table cache.
-   *
-   * @return the number of shards used for table cache.
-   */
-  public int tableCacheNumshardbits() {
-    assert(isInitialized());
-    return tableCacheNumshardbits(nativeHandle_);
-  }
-  private native int tableCacheNumshardbits(long handle);
-
-  /**
-   * Number of shards used for table cache.
-   *
-   * @param tableCacheNumshardbits the number of chards
-   * @return the reference to the current option.
-   */
-  public Options setTableCacheNumshardbits(int tableCacheNumshardbits) {
-    assert(isInitialized());
-    setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits);
-    return this;
-  }
-  private native void setTableCacheNumshardbits(
-      long handle, int tableCacheNumshardbits);
-
-  /**
-   * During data eviction of table's LRU cache, it would be inefficient
-   * to strictly follow LRU because this piece of memory will not really
-   * be released unless its refcount falls to zero. Instead, make two
-   * passes: the first pass will release items with refcount = 1,
-   * and if not enough space releases after scanning the number of
-   * elements specified by this parameter, we will remove items in LRU
-   * order.
-   *
-   * @return scan count limit
-   */
-  public int tableCacheRemoveScanCountLimit() {
-    assert(isInitialized());
-    return tableCacheRemoveScanCountLimit(nativeHandle_);
-  }
-  private native int tableCacheRemoveScanCountLimit(long handle);
-
-  /**
-   * During data eviction of table's LRU cache, it would be inefficient
-   * to strictly follow LRU because this piece of memory will not really
-   * be released unless its refcount falls to zero. Instead, make two
-   * passes: the first pass will release items with refcount = 1,
-   * and if not enough space releases after scanning the number of
-   * elements specified by this parameter, we will remove items in LRU
-   * order.
-   *
-   * @param limit scan count limit
-   * @return the reference to the current option.
-   */
-  public Options setTableCacheRemoveScanCountLimit(int limit) {
-    assert(isInitialized());
-    setTableCacheRemoveScanCountLimit(nativeHandle_, limit);
-    return this;
-  }
-  private native void setTableCacheRemoveScanCountLimit(
-      long handle, int limit);
-
-  /**
-   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
-   * will be deleted.
-   * 1. If both set to 0, logs will be deleted asap and will not get into
-   *    the archive.
-   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
-   *    WAL files will be checked every 10 min and if total size is greater
-   *    then WAL_size_limit_MB, they will be deleted starting with the
-   *    earliest until size_limit is met. All empty files will be deleted.
-   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-   *    WAL files will be checked every WAL_ttl_secondsi / 2 and those that
-   *    are older than WAL_ttl_seconds will be deleted.
-   * 4. If both are not 0, WAL files will be checked every 10 min and both
-   *    checks will be performed with ttl being first.
-   *
-   * @return the wal-ttl seconds
-   * @see walSizeLimitMB()
-   */
-  public long walTtlSeconds() {
-    assert(isInitialized());
-    return walTtlSeconds(nativeHandle_);
-  }
-  private native long walTtlSeconds(long handle);
-
-  /**
-   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
-   * will be deleted.
-   * 1. If both set to 0, logs will be deleted asap and will not get into
-   *    the archive.
-   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
-   *    WAL files will be checked every 10 min and if total size is greater
-   *    then WAL_size_limit_MB, they will be deleted starting with the
-   *    earliest until size_limit is met. All empty files will be deleted.
-   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-   *    WAL files will be checked every WAL_ttl_secondsi / 2 and those that
-   *    are older than WAL_ttl_seconds will be deleted.
-   * 4. If both are not 0, WAL files will be checked every 10 min and both
-   *    checks will be performed with ttl being first.
-   *
-   * @param walTtlSeconds the ttl seconds
-   * @return the reference to the current option.
-   * @see setWalSizeLimitMB()
-   */
-  public Options setWalTtlSeconds(long walTtlSeconds) {
-    assert(isInitialized());
-    setWalTtlSeconds(nativeHandle_, walTtlSeconds);
-    return this;
-  }
-  private native void setWalTtlSeconds(long handle, long walTtlSeconds);
-
-  /**
-   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
-   * will be deleted.
-   * 1. If both set to 0, logs will be deleted asap and will not get into
-   *    the archive.
-   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
-   *    WAL files will be checked every 10 min and if total size is greater
-   *    then WAL_size_limit_MB, they will be deleted starting with the
-   *    earliest until size_limit is met. All empty files will be deleted.
-   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-   *    WAL files will be checked every WAL_ttl_secondsi / 2 and those that
-   *    are older than WAL_ttl_seconds will be deleted.
-   * 4. If both are not 0, WAL files will be checked every 10 min and both
-   *    checks will be performed with ttl being first.
-   *
-   * @return size limit in mega-bytes.
-   * @see walSizeLimitMB()
-   */
-  public long walSizeLimitMB() {
-    assert(isInitialized());
-    return walSizeLimitMB(nativeHandle_);
-  }
-  private native long walSizeLimitMB(long handle);
-
-  /**
-   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
-   * will be deleted.
-   * 1. If both set to 0, logs will be deleted asap and will not get into
-   *    the archive.
-   * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
-   *    WAL files will be checked every 10 min and if total size is greater
-   *    then WAL_size_limit_MB, they will be deleted starting with the
-   *    earliest until size_limit is met. All empty files will be deleted.
-   * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-   *    WAL files will be checked every WAL_ttl_secondsi / 2 and those that
-   *    are older than WAL_ttl_seconds will be deleted.
-   * 4. If both are not 0, WAL files will be checked every 10 min and both
-   *    checks will be performed with ttl being first.
-   *
-   * @param sizeLimitMB size limit in mega-bytes.
-   * @return the reference to the current option.
-   * @see setWalSizeLimitMB()
-   */
-  public Options setWalSizeLimitMB(long sizeLimitMB) {
-    assert(isInitialized());
-    setWalSizeLimitMB(nativeHandle_, sizeLimitMB);
-    return this;
-  }
-  private native void setWalSizeLimitMB(long handle, long sizeLimitMB);
-
-  /**
-   * Number of bytes to preallocate (via fallocate) the manifest
-   * files.  Default is 4mb, which is reasonable to reduce random IO
-   * as well as prevent overallocation for mounts that preallocate
-   * large amounts of data (such as xfs's allocsize option).
-   *
-   * @return size in bytes.
-   */
-  public long manifestPreallocationSize() {
-    assert(isInitialized());
-    return manifestPreallocationSize(nativeHandle_);
-  }
-  private native long manifestPreallocationSize(long handle);
-
-  /**
-   * Number of bytes to preallocate (via fallocate) the manifest
-   * files.  Default is 4mb, which is reasonable to reduce random IO
-   * as well as prevent overallocation for mounts that preallocate
-   * large amounts of data (such as xfs's allocsize option).
-   *
-   * @param size the size in byte
-   * @return the reference to the current option.
-   */
-  public Options setManifestPreallocationSize(long size) {
-    assert(isInitialized());
-    setManifestPreallocationSize(nativeHandle_, size);
-    return this;
-  }
-  private native void setManifestPreallocationSize(
-      long handle, long size);
-
-  /**
-   * Data being read from file storage may be buffered in the OS
-   * Default: true
-   *
-   * @return if true, then OS buffering is allowed.
-   */
-  public boolean allowOsBuffer() {
-    assert(isInitialized());
-    return allowOsBuffer(nativeHandle_);
-  }
-  private native boolean allowOsBuffer(long handle);
-
-  /**
-   * Data being read from file storage may be buffered in the OS
-   * Default: true
-   *
-   * @param allowOsBufferif true, then OS buffering is allowed.
-   * @return the reference to the current option.
-   */
-  public Options setAllowOsBuffer(boolean allowOsBuffer) {
-    assert(isInitialized());
-    setAllowOsBuffer(nativeHandle_, allowOsBuffer);
-    return this;
-  }
-  private native void setAllowOsBuffer(
-      long handle, boolean allowOsBuffer);
-
-  /**
-   * Allow the OS to mmap file for reading sst tables.
-   * Default: false
-   *
-   * @return true if mmap reads are allowed.
-   */
-  public boolean allowMmapReads() {
-    assert(isInitialized());
-    return allowMmapReads(nativeHandle_);
-  }
-  private native boolean allowMmapReads(long handle);
-
-  /**
-   * Allow the OS to mmap file for reading sst tables.
-   * Default: false
-   *
-   * @param allowMmapReads true if mmap reads are allowed.
-   * @return the reference to the current option.
-   */
-  public Options setAllowMmapReads(boolean allowMmapReads) {
-    assert(isInitialized());
-    setAllowMmapReads(nativeHandle_, allowMmapReads);
-    return this;
-  }
-  private native void setAllowMmapReads(
-      long handle, boolean allowMmapReads);
-
-  /**
-   * Allow the OS to mmap file for writing. Default: false
-   *
-   * @return true if mmap writes are allowed.
-   */
-  public boolean allowMmapWrites() {
-    assert(isInitialized());
-    return allowMmapWrites(nativeHandle_);
-  }
-  private native boolean allowMmapWrites(long handle);
-
-  /**
-   * Allow the OS to mmap file for writing. Default: false
-   *
-   * @param allowMmapWrites true if mmap writes are allowd.
-   * @return the reference to the current option.
-   */
-  public Options setAllowMmapWrites(boolean allowMmapWrites) {
-    assert(isInitialized());
-    setAllowMmapWrites(nativeHandle_, allowMmapWrites);
-    return this;
-  }
-  private native void setAllowMmapWrites(
-      long handle, boolean allowMmapWrites);
-
-  /**
-   * Disable child process inherit open files. Default: true
-   *
-   * @return true if child process inheriting open files is disabled.
-   */
-  public boolean isFdCloseOnExec() {
-    assert(isInitialized());
-    return isFdCloseOnExec(nativeHandle_);
-  }
-  private native boolean isFdCloseOnExec(long handle);
-
-  /**
-   * Disable child process inherit open files. Default: true
-   *
-   * @param isFdCloseOnExec true if child process inheriting open
-   *     files is disabled.
-   * @return the reference to the current option.
-   */
-  public Options setIsFdCloseOnExec(boolean isFdCloseOnExec) {
-    assert(isInitialized());
-    setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec);
-    return this;
-  }
-  private native void setIsFdCloseOnExec(
-      long handle, boolean isFdCloseOnExec);
-
-  /**
-   * Skip log corruption error on recovery (If client is ok with
-   * losing most recent changes)
-   * Default: false
-   *
-   * @return true if log corruption errors are skipped during recovery.
-   */
-  public boolean skipLogErrorOnRecovery() {
-    assert(isInitialized());
-    return skipLogErrorOnRecovery(nativeHandle_);
-  }
-  private native boolean skipLogErrorOnRecovery(long handle);
-
-  /**
-   * Skip log corruption error on recovery (If client is ok with
-   * losing most recent changes)
-   * Default: false
-   *
-   * @param skip true if log corruption errors are skipped during recovery.
-   * @return the reference to the current option.
-   */
-  public Options setSkipLogErrorOnRecovery(boolean skip) {
-    assert(isInitialized());
-    setSkipLogErrorOnRecovery(nativeHandle_, skip);
-    return this;
-  }
-  private native void setSkipLogErrorOnRecovery(
-      long handle, boolean skip);
-
-  /**
-   * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
-   * Default: 3600 (1 hour)
-   *
-   * @return time interval in seconds.
-   */
-  public int statsDumpPeriodSec() {
-    assert(isInitialized());
-    return statsDumpPeriodSec(nativeHandle_);
-  }
-  private native int statsDumpPeriodSec(long handle);
-
-  /**
-   * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
-   * Default: 3600 (1 hour)
-   *
-   * @param statsDumpPeriodSec time interval in seconds.
-   * @return the reference to the current option.
-   */
-  public Options setStatsDumpPeriodSec(int statsDumpPeriodSec) {
-    assert(isInitialized());
-    setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec);
-    return this;
-  }
-  private native void setStatsDumpPeriodSec(
-      long handle, int statsDumpPeriodSec);
-
-  /**
-   * If set true, will hint the underlying file system that the file
-   * access pattern is random, when a sst file is opened.
-   * Default: true
-   *
-   * @return true if hinting random access is on.
-   */
-  public boolean adviseRandomOnOpen() {
-    return adviseRandomOnOpen(nativeHandle_);
-  }
-  private native boolean adviseRandomOnOpen(long handle);
-
-  /**
-   * If set true, will hint the underlying file system that the file
-   * access pattern is random, when a sst file is opened.
-   * Default: true
-   *
-   * @param adviseRandomOnOpen true if hinting random access is on.
-   * @return the reference to the current option.
-   */
-  public Options setAdviseRandomOnOpen(boolean adviseRandomOnOpen) {
-    assert(isInitialized());
-    setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen);
-    return this;
-  }
-  private native void setAdviseRandomOnOpen(
-      long handle, boolean adviseRandomOnOpen);
-
-  /**
-   * Use adaptive mutex, which spins in the user space before resorting
-   * to kernel. This could reduce context switch when the mutex is not
-   * heavily contended. However, if the mutex is hot, we could end up
-   * wasting spin time.
-   * Default: false
-   *
-   * @return true if adaptive mutex is used.
-   */
-  public boolean useAdaptiveMutex() {
-    assert(isInitialized());
-    return useAdaptiveMutex(nativeHandle_);
-  }
-  private native boolean useAdaptiveMutex(long handle);
-
-  /**
-   * Use adaptive mutex, which spins in the user space before resorting
-   * to kernel. This could reduce context switch when the mutex is not
-   * heavily contended. However, if the mutex is hot, we could end up
-   * wasting spin time.
-   * Default: false
-   *
-   * @param useAdaptiveMutex true if adaptive mutex is used.
-   * @return the reference to the current option.
-   */
-  public Options setUseAdaptiveMutex(boolean useAdaptiveMutex) {
-    assert(isInitialized());
-    setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex);
-    return this;
-  }
-  private native void setUseAdaptiveMutex(
-      long handle, boolean useAdaptiveMutex);
-
-  /**
-   * Allows OS to incrementally sync files to disk while they are being
-   * written, asynchronously, in the background.
-   * Issue one request for every bytes_per_sync written. 0 turns it off.
-   * Default: 0
-   *
-   * @return size in bytes
-   */
-  public long bytesPerSync() {
-    return bytesPerSync(nativeHandle_);
-  }
-  private native long bytesPerSync(long handle);
-
-  /**
-   * Allows OS to incrementally sync files to disk while they are being
-   * written, asynchronously, in the background.
-   * Issue one request for every bytes_per_sync written. 0 turns it off.
-   * Default: 0
-   *
-   * @param bytesPerSync size in bytes
-   * @return the reference to the current option.
-   */
-  public Options setBytesPerSync(long bytesPerSync) {
-    assert(isInitialized());
-    setBytesPerSync(nativeHandle_, bytesPerSync);
-    return this;
-  }
-  private native void setBytesPerSync(
-      long handle, long bytesPerSync);
-
-  /**
-   * Allow RocksDB to use thread local storage to optimize performance.
-   * Default: true
-   *
-   * @return true if thread-local storage is allowed
-   */
-  public boolean allowThreadLocal() {
-    assert(isInitialized());
-    return allowThreadLocal(nativeHandle_);
-  }
-  private native boolean allowThreadLocal(long handle);
-
-  /**
-   * Allow RocksDB to use thread local storage to optimize performance.
-   * Default: true
-   *
-   * @param allowThreadLocal true if thread-local storage is allowed.
-   * @return the reference to the current option.
-   */
-  public Options setAllowThreadLocal(boolean allowThreadLocal) {
-    assert(isInitialized());
-    setAllowThreadLocal(nativeHandle_, allowThreadLocal);
-    return this;
-  }
-  private native void setAllowThreadLocal(
-      long handle, boolean allowThreadLocal);
-
-  /**
-   * Set the config for mem-table.
-   *
-   * @param config the mem-table config.
-   * @return the instance of the current Options.
-   */
-  public Options setMemTableConfig(MemTableConfig config) {
-    setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle());
-    return this;
-  }
-
-  /**
-   * Returns the name of the current mem table representation.
-   * Memtable format can be set using setTableFormatConfig.
-   *
-   * @return the name of the currently-used memtable factory.
-   * @see setTableFormatConfig()
-   */
-  public String memTableFactoryName() {
-    assert(isInitialized());
-    return memTableFactoryName(nativeHandle_);
-  }
-
-  /**
-   * Set the config for table format.
-   *
-   * @param config the table format config.
-   * @return the reference of the current Options.
-   */
-  public Options setTableFormatConfig(TableFormatConfig config) {
-    setTableFactory(nativeHandle_, config.newTableFactoryHandle());
-    return this;
-  }
-
-  /**
-   * @return the name of the currently used table factory.
-   */
-  public String tableFactoryName() {
-    assert(isInitialized());
-    return tableFactoryName(nativeHandle_);
-  }
-
-  /**
-   * This prefix-extractor uses the first n bytes of a key as its prefix.
-   *
-   * In some hash-based memtable representation such as HashLinkedList
-   * and HashSkipList, prefixes are used to partition the keys into
-   * several buckets.  Prefix extractor is used to specify how to
-   * extract the prefix given a key.
-   *
-   * @param n use the first n bytes of a key as its prefix.
-   */
-  public Options useFixedLengthPrefixExtractor(int n) {
-    assert(isInitialized());
-    useFixedLengthPrefixExtractor(nativeHandle_, n);
-    return this;
-  }
-
-///////////////////////////////////////////////////////////////////////
-  /**
-   * Number of keys between restart points for delta encoding of keys.
-   * This parameter can be changed dynamically.  Most clients should
-   * leave this parameter alone.
-   * Default: 16
-   *
-   * @return the number of keys between restart points.
-   */
-  public int blockRestartInterval() {
-    return blockRestartInterval(nativeHandle_);
-  }
-  private native int blockRestartInterval(long handle);
-
-  /**
-   * Number of keys between restart points for delta encoding of keys.
-   * This parameter can be changed dynamically.  Most clients should
-   * leave this parameter alone.
-   * Default: 16
-   *
-   * @param blockRestartInterval the number of keys between restart points.
-   * @return the reference to the current option.
-   */
-  public Options setBlockRestartInterval(int blockRestartInterval) {
-    setBlockRestartInterval(nativeHandle_, blockRestartInterval);
-    return this;
-  }
-  private native void setBlockRestartInterval(
-      long handle, int blockRestartInterval);
-
-  /**
-   * If true, place whole keys in the filter (not just prefixes).
-   * This must generally be true for gets to be efficient.
-   * Default: true
-   *
-   * @return if true, then whole-key-filtering is on.
-   */
-  public boolean wholeKeyFiltering() {
-    return wholeKeyFiltering(nativeHandle_);
-  }
-  private native boolean wholeKeyFiltering(long handle);
-
-  /**
-   * If true, place whole keys in the filter (not just prefixes).
-   * This must generally be true for gets to be efficient.
-   * Default: true
-   *
-   * @param wholeKeyFiltering if true, then whole-key-filtering is on.
-   * @return the reference to the current option.
-   */
-  public Options setWholeKeyFiltering(boolean wholeKeyFiltering) {
-    setWholeKeyFiltering(nativeHandle_, wholeKeyFiltering);
-    return this;
-  }
-  private native void setWholeKeyFiltering(
-      long handle, boolean wholeKeyFiltering);
-
-  /**
-   * If level-styled compaction is used, then this number determines
-   * the total number of levels.
-   *
-   * @return the number of levels.
-   */
-  public int numLevels() {
-    return numLevels(nativeHandle_);
-  }
-  private native int numLevels(long handle);
-
-  /**
-   * Set the number of levels for this database
-   * If level-styled compaction is used, then this number determines
-   * the total number of levels.
-   *
-   * @param numLevels the number of levels.
-   * @return the reference to the current option.
-   */
-  public Options setNumLevels(int numLevels) {
-    setNumLevels(nativeHandle_, numLevels);
-    return this;
-  }
-  private native void setNumLevels(
-      long handle, int numLevels);
-
-  /**
-   * The number of files in leve 0 to trigger compaction from level-0 to
-   * level-1.  A value < 0 means that level-0 compaction will not be
-   * triggered by number of files at all.
-   * Default: 4
-   *
-   * @return the number of files in level 0 to trigger compaction.
-   */
-  public int levelZeroFileNumCompactionTrigger() {
-    return levelZeroFileNumCompactionTrigger(nativeHandle_);
-  }
-  private native int levelZeroFileNumCompactionTrigger(long handle);
-
-  /**
-   * Number of files to trigger level-0 compaction. A value <0 means that
-   * level-0 compaction will not be triggered by number of files at all.
-   * Default: 4
-   *
-   * @param numFiles the number of files in level-0 to trigger compaction.
-   * @return the reference to the current option.
-   */
-  public Options setLevelZeroFileNumCompactionTrigger(
-      int numFiles) {
-    setLevelZeroFileNumCompactionTrigger(
-        nativeHandle_, numFiles);
-    return this;
-  }
-  private native void setLevelZeroFileNumCompactionTrigger(
-      long handle, int numFiles);
-
-  /**
-   * Soft limit on the number of level-0 files. We start slowing down writes
-   * at this point. A value < 0 means that no writing slow down will be
-   * triggered by number of files in level-0.
-   *
-   * @return the soft limit on the number of level-0 files.
-   */
-  public int levelZeroSlowdownWritesTrigger() {
-    return levelZeroSlowdownWritesTrigger(nativeHandle_);
-  }
-  private native int levelZeroSlowdownWritesTrigger(long handle);
-
-  /**
-   * Soft limit on number of level-0 files. We start slowing down writes at this
-   * point. A value <0 means that no writing slow down will be triggered by
-   * number of files in level-0.
-   *
-   * @param numFiles soft limit on number of level-0 files.
-   * @return the reference to the current option.
-   */
-  public Options setLevelZeroSlowdownWritesTrigger(
-      int numFiles) {
-    setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles);
-    return this;
-  }
-  private native void setLevelZeroSlowdownWritesTrigger(
-      long handle, int numFiles);
-
-  /**
-   * Maximum number of level-0 files.  We stop writes at this point.
-   *
-   * @return the hard limit of the number of level-0 file.
-   */
-  public int levelZeroStopWritesTrigger() {
-    return levelZeroStopWritesTrigger(nativeHandle_);
-  }
-  private native int levelZeroStopWritesTrigger(long handle);
-
-  /**
-   * Maximum number of level-0 files.  We stop writes at this point.
-   *
-   * @param numFiles the hard limit of the number of level-0 files.
-   * @return the reference to the current option.
-   */
-  public Options setLevelZeroStopWritesTrigger(int numFiles) {
-    setLevelZeroStopWritesTrigger(nativeHandle_, numFiles);
-    return this;
-  }
-  private native void setLevelZeroStopWritesTrigger(
-      long handle, int numFiles);
-
-  /**
-   * The highest level to which a new compacted memtable is pushed if it
-   * does not create overlap.  We try to push to level 2 to avoid the
-   * relatively expensive level 0=>1 compactions and to avoid some
-   * expensive manifest file operations.  We do not push all the way to
-   * the largest level since that can generate a lot of wasted disk
-   * space if the same key space is being repeatedly overwritten.
-   *
-   * @return the highest level where a new compacted memtable will be pushed.
-   */
-  public int maxMemCompactionLevel() {
-    return maxMemCompactionLevel(nativeHandle_);
-  }
-  private native int maxMemCompactionLevel(long handle);
-
-  /**
-   * The highest level to which a new compacted memtable is pushed if it
-   * does not create overlap.  We try to push to level 2 to avoid the
-   * relatively expensive level 0=>1 compactions and to avoid some
-   * expensive manifest file operations.  We do not push all the way to
-   * the largest level since that can generate a lot of wasted disk
-   * space if the same key space is being repeatedly overwritten.
-   *
-   * @param maxMemCompactionLevel the highest level to which a new compacted
-   *     mem-table will be pushed.
-   * @return the reference to the current option.
-   */
-  public Options setMaxMemCompactionLevel(int maxMemCompactionLevel) {
-    setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel);
-    return this;
-  }
-  private native void setMaxMemCompactionLevel(
-      long handle, int maxMemCompactionLevel);
-
-  /**
-   * The target file size for compaction.
-   * This targetFileSizeBase determines a level-1 file size.
-   * Target file size for level L can be calculated by
-   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
-   * For example, if targetFileSizeBase is 2MB and
-   * target_file_size_multiplier is 10, then each file on level-1 will
-   * be 2MB, and each file on level 2 will be 20MB,
-   * and each file on level-3 will be 200MB.
-   * by default targetFileSizeBase is 2MB.
-   *
-   * @return the target size of a level-0 file.
-   *
-   * @see targetFileSizeMultiplier()
-   */
-  public int targetFileSizeBase() {
-    return targetFileSizeBase(nativeHandle_);
-  }
-  private native int targetFileSizeBase(long handle);
-
-  /**
-   * The target file size for compaction.
-   * This targetFileSizeBase determines a level-1 file size.
-   * Target file size for level L can be calculated by
-   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
-   * For example, if targetFileSizeBase is 2MB and
-   * target_file_size_multiplier is 10, then each file on level-1 will
-   * be 2MB, and each file on level 2 will be 20MB,
-   * and each file on level-3 will be 200MB.
-   * by default targetFileSizeBase is 2MB.
-   *
-   * @param targetFileSizeBase the target size of a level-0 file.
-   * @return the reference to the current option.
-   *
-   * @see setTargetFileSizeMultiplier()
-   */
-  public Options setTargetFileSizeBase(int targetFileSizeBase) {
-    setTargetFileSizeBase(nativeHandle_, targetFileSizeBase);
-    return this;
-  }
-  private native void setTargetFileSizeBase(
-      long handle, int targetFileSizeBase);
-
-  /**
-   * targetFileSizeMultiplier defines the size ratio between a
-   * level-(L+1) file and level-L file.
-   * By default targetFileSizeMultiplier is 1, meaning
-   * files in different levels have the same target.
-   *
-   * @return the size ratio between a level-(L+1) file and level-L file.
-   */
-  public int targetFileSizeMultiplier() {
-    return targetFileSizeMultiplier(nativeHandle_);
-  }
-  private native int targetFileSizeMultiplier(long handle);
-
-  /**
-   * targetFileSizeMultiplier defines the size ratio between a
-   * level-L file and level-(L+1) file.
-   * By default target_file_size_multiplier is 1, meaning
-   * files in different levels have the same target.
-   *
-   * @param multiplier the size ratio between a level-(L+1) file
-   *     and level-L file.
-   * @return the reference to the current option.
-   */
-  public Options setTargetFileSizeMultiplier(int multiplier) {
-    setTargetFileSizeMultiplier(nativeHandle_, multiplier);
-    return this;
-  }
-  private native void setTargetFileSizeMultiplier(
-      long handle, int multiplier);
-
-  /**
-   * The upper-bound of the total size of level-1 files in bytes.
-   * Maximum number of bytes for level L can be calculated as
-   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
-   * For example, if maxBytesForLevelBase is 20MB, and if
-   * max_bytes_for_level_multiplier is 10, total data size for level-1
-   * will be 20MB, total file size for level-2 will be 200MB,
-   * and total file size for level-3 will be 2GB.
-   * by default 'maxBytesForLevelBase' is 10MB.
-   *
-   * @return the upper-bound of the total size of leve-1 files in bytes.
-   * @see maxBytesForLevelMultiplier()
-   */
-  public long maxBytesForLevelBase() {
-    return maxBytesForLevelBase(nativeHandle_);
-  }
-  private native long maxBytesForLevelBase(long handle);
-
-  /**
-   * The upper-bound of the total size of level-1 files in bytes.
-   * Maximum number of bytes for level L can be calculated as
-   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
-   * For example, if maxBytesForLevelBase is 20MB, and if
-   * max_bytes_for_level_multiplier is 10, total data size for level-1
-   * will be 20MB, total file size for level-2 will be 200MB,
-   * and total file size for level-3 will be 2GB.
-   * by default 'maxBytesForLevelBase' is 10MB.
-   *
-   * @return maxBytesForLevelBase the upper-bound of the total size of
-   *     leve-1 files in bytes.
-   * @return the reference to the current option.
-   * @see setMaxBytesForLevelMultiplier()
-   */
-  public Options setMaxBytesForLevelBase(long maxBytesForLevelBase) {
-    setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase);
-    return this;
-  }
-  private native void setMaxBytesForLevelBase(
-      long handle, long maxBytesForLevelBase);
-
-  /**
-   * The ratio between the total size of level-(L+1) files and the total
-   * size of level-L files for all L.
-   * DEFAULT: 10
-   *
-   * @return the ratio between the total size of level-(L+1) files and
-   *     the total size of level-L files for all L.
-   * @see maxBytesForLevelBase()
-   */
-  public int maxBytesForLevelMultiplier() {
-    return maxBytesForLevelMultiplier(nativeHandle_);
-  }
-  private native int maxBytesForLevelMultiplier(long handle);
-
-  /**
-   * The ratio between the total size of level-(L+1) files and the total
-   * size of level-L files for all L.
-   * DEFAULT: 10
-   *
-   * @param multiplier the ratio between the total size of level-(L+1)
-   *     files and the total size of level-L files for all L.
-   * @return the reference to the current option.
-   * @see setMaxBytesForLevelBase()
-   */
-  public Options setMaxBytesForLevelMultiplier(int multiplier) {
-    setMaxBytesForLevelMultiplier(nativeHandle_, multiplier);
-    return this;
-  }
-  private native void setMaxBytesForLevelMultiplier(
-      long handle, int multiplier);
-
-  /**
-   * Maximum number of bytes in all compacted files.  We avoid expanding
-   * the lower level file set of a compaction if it would make the
-   * total compaction cover more than
-   * (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
-   *
-   * @return the maximum number of bytes in all compacted files.
-   * @see sourceCompactionFactor()
-   */
-  public int expandedCompactionFactor() {
-    return expandedCompactionFactor(nativeHandle_);
-  }
-  private native int expandedCompactionFactor(long handle);
-
-  /**
-   * Maximum number of bytes in all compacted files.  We avoid expanding
-   * the lower level file set of a compaction if it would make the
-   * total compaction cover more than
-   * (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
-   *
-   * @param expandedCompactionFactor the maximum number of bytes in all
-   *     compacted files.
-   * @return the reference to the current option.
-   * @see setSourceCompactionFactor()
-   */
-  public Options setExpandedCompactionFactor(int expandedCompactionFactor) {
-    setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor);
-    return this;
-  }
-  private native void setExpandedCompactionFactor(
-      long handle, int expandedCompactionFactor);
-
-  /**
-   * Maximum number of bytes in all source files to be compacted in a
-   * single compaction run. We avoid picking too many files in the
-   * source level so that we do not exceed the total source bytes
-   * for compaction to exceed
-   * (source_compaction_factor * targetFileSizeLevel()) many bytes.
-   * Default:1, i.e. pick maxfilesize amount of data as the source of
-   * a compaction.
-   *
-   * @return the maximum number of bytes in all source files to be compactedo.
-   * @see expendedCompactionFactor()
-   */
-  public int sourceCompactionFactor() {
-    return sourceCompactionFactor(nativeHandle_);
-  }
-  private native int sourceCompactionFactor(long handle);
-
-  /**
-   * Maximum number of bytes in all source files to be compacted in a
-   * single compaction run. We avoid picking too many files in the
-   * source level so that we do not exceed the total source bytes
-   * for compaction to exceed
-   * (source_compaction_factor * targetFileSizeLevel()) many bytes.
-   * Default:1, i.e. pick maxfilesize amount of data as the source of
-   * a compaction.
-   *
-   * @param sourceCompactionFactor the maximum number of bytes in all
-   *     source files to be compacted in a single compaction run.
-   * @return the reference to the current option.
-   * @see setExpendedCompactionFactor()
-   */
-  public Options setSourceCompactionFactor(int sourceCompactionFactor) {
-    setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor);
-    return this;
-  }
-  private native void setSourceCompactionFactor(
-      long handle, int sourceCompactionFactor);
-
-  /**
-   * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
-   * stop building a single file in a level->level+1 compaction.
-   *
-   * @return maximum bytes of overlaps in "grandparent" level.
-   */
-  public int maxGrandparentOverlapFactor() {
-    return maxGrandparentOverlapFactor(nativeHandle_);
-  }
-  private native int maxGrandparentOverlapFactor(long handle);
-
-  /**
-   * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
-   * stop building a single file in a level->level+1 compaction.
-   *
-   * @param maxGrandparentOverlapFactor maximum bytes of overlaps in
-   *     "grandparent" level.
-   * @return the reference to the current option.
-   */
-  public Options setMaxGrandparentOverlapFactor(
-      int maxGrandparentOverlapFactor) {
-    setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor);
-    return this;
-  }
-  private native void setMaxGrandparentOverlapFactor(
-      long handle, int maxGrandparentOverlapFactor);
-
-  /**
-   * Puts are delayed 0-1 ms when any level has a compaction score that exceeds
-   * soft_rate_limit. This is ignored when == 0.0.
-   * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
-   * hold, RocksDB will set soft_rate_limit = hard_rate_limit
-   * Default: 0 (disabled)
-   *
-   * @return soft-rate-limit for put delay.
-   */
-  public double softRateLimit() {
-    return softRateLimit(nativeHandle_);
-  }
-  private native double softRateLimit(long handle);
-
-  /**
-   * Puts are delayed 0-1 ms when any level has a compaction score that exceeds
-   * soft_rate_limit. This is ignored when == 0.0.
-   * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
-   * hold, RocksDB will set soft_rate_limit = hard_rate_limit
-   * Default: 0 (disabled)
-   *
-   * @param softRateLimit the soft-rate-limit of a compaction score
-   *     for put delay.
-   * @return the reference to the current option.
-   */
-  public Options setSoftRateLimit(double softRateLimit) {
-    setSoftRateLimit(nativeHandle_, softRateLimit);
-    return this;
-  }
-  private native void setSoftRateLimit(
-      long handle, double softRateLimit);
-
-  /**
-   * Puts are delayed 1ms at a time when any level has a compaction score that
-   * exceeds hard_rate_limit. This is ignored when <= 1.0.
-   * Default: 0 (disabled)
-   *
-   * @return the hard-rate-limit of a compaction score for put delay.
-   */
-  public double hardRateLimit() {
-    return hardRateLimit(nativeHandle_);
-  }
-  private native double hardRateLimit(long handle);
-
-  /**
-   * Puts are delayed 1ms at a time when any level has a compaction score that
-   * exceeds hard_rate_limit. This is ignored when <= 1.0.
-   * Default: 0 (disabled)
-   *
-   * @param hardRateLimit the hard-rate-limit of a compaction score for put
-   *     delay.
-   * @return the reference to the current option.
-   */
-  public Options setHardRateLimit(double hardRateLimit) {
-    setHardRateLimit(nativeHandle_, hardRateLimit);
-    return this;
-  }
-  private native void setHardRateLimit(
-      long handle, double hardRateLimit);
-
-  /**
-   * The maximum time interval a put will be stalled when hard_rate_limit
-   * is enforced.  If 0, then there is no limit.
-   * Default: 1000
-   *
-   * @return the maximum time interval a put will be stalled when
-   *     hard_rate_limit is enforced.
-   */
-  public int rateLimitDelayMaxMilliseconds() {
-    return rateLimitDelayMaxMilliseconds(nativeHandle_);
-  }
-  private native int rateLimitDelayMaxMilliseconds(long handle);
-
-  /**
-   * The maximum time interval a put will be stalled when hard_rate_limit
-   * is enforced. If 0, then there is no limit.
-   * Default: 1000
-   *
-   * @param rateLimitDelayMaxMilliseconds the maximum time interval a put
-   *     will be stalled.
-   * @return the reference to the current option.
-   */
-  public Options setRateLimitDelayMaxMilliseconds(
-      int rateLimitDelayMaxMilliseconds) {
-    setRateLimitDelayMaxMilliseconds(
-        nativeHandle_, rateLimitDelayMaxMilliseconds);
-    return this;
-  }
-  private native void setRateLimitDelayMaxMilliseconds(
-      long handle, int rateLimitDelayMaxMilliseconds);
-
-  /**
-   * Disable block cache. If this is set to true,
-   * then no block cache should be used, and the block_cache should
-   * point to a nullptr object.
-   * Default: false
-   *
-   * @return true if block cache is disabled.
-   */
-  public boolean noBlockCache() {
-    return noBlockCache(nativeHandle_);
-  }
-  private native boolean noBlockCache(long handle);
-
-  /**
-   * Disable block cache. If this is set to true,
-   * then no block cache should be used, and the block_cache should
-   * point to a nullptr object.
-   * Default: false
-   *
-   * @param noBlockCache true if block-cache is disabled.
-   * @return the reference to the current option.
-   */
-  public Options setNoBlockCache(boolean noBlockCache) {
-    setNoBlockCache(nativeHandle_, noBlockCache);
-    return this;
-  }
-  private native void setNoBlockCache(
-      long handle, boolean noBlockCache);
-
-  /**
-   * The size of one block in arena memory allocation.
-   * If <= 0, a proper value is automatically calculated (usually 1/10 of
-   * writer_buffer_size).
-   *
-   * There are two additonal restriction of the The specified size:
-   * (1) size should be in the range of [4096, 2 << 30] and
-   * (2) be the multiple of the CPU word (which helps with the memory
-   * alignment).
-   *
-   * We'll automatically check and adjust the size number to make sure it
-   * conforms to the restrictions.
-   * Default: 0
-   *
-   * @return the size of an arena block
-   */
-  public long arenaBlockSize() {
-    return arenaBlockSize(nativeHandle_);
-  }
-  private native long arenaBlockSize(long handle);
-
-  /**
-   * The size of one block in arena memory allocation.
-   * If <= 0, a proper value is automatically calculated (usually 1/10 of
-   * writer_buffer_size).
-   *
-   * There are two additonal restriction of the The specified size:
-   * (1) size should be in the range of [4096, 2 << 30] and
-   * (2) be the multiple of the CPU word (which helps with the memory
-   * alignment).
-   *
-   * We'll automatically check and adjust the size number to make sure it
-   * conforms to the restrictions.
-   * Default: 0
-   *
-   * @param arenaBlockSize the size of an arena block
-   * @return the reference to the current option.
-   */
-  public Options setArenaBlockSize(long arenaBlockSize) {
-    setArenaBlockSize(nativeHandle_, arenaBlockSize);
-    return this;
-  }
-  private native void setArenaBlockSize(
-      long handle, long arenaBlockSize);
-
-  /**
-   * Disable automatic compactions. Manual compactions can still
-   * be issued on this column family
-   *
-   * @return true if auto-compactions are disabled.
-   */
-  public boolean disableAutoCompactions() {
-    return disableAutoCompactions(nativeHandle_);
-  }
-  private native boolean disableAutoCompactions(long handle);
-
-  /**
-   * Disable automatic compactions. Manual compactions can still
-   * be issued on this column family
-   *
-   * @param disableAutoCompactions true if auto-compactions are disabled.
-   * @return the reference to the current option.
-   */
-  public Options setDisableAutoCompactions(boolean disableAutoCompactions) {
-    setDisableAutoCompactions(nativeHandle_, disableAutoCompactions);
-    return this;
-  }
-  private native void setDisableAutoCompactions(
-      long handle, boolean disableAutoCompactions);
-
-  /**
-   * Purge duplicate/deleted keys when a memtable is flushed to storage.
-   * Default: true
-   *
-   * @return true if purging keys is disabled.
-   */
-  public boolean purgeRedundantKvsWhileFlush() {
-    return purgeRedundantKvsWhileFlush(nativeHandle_);
-  }
-  private native boolean purgeRedundantKvsWhileFlush(long handle);
-
-  /**
-   * Purge duplicate/deleted keys when a memtable is flushed to storage.
-   * Default: true
-   *
-   * @param purgeRedundantKvsWhileFlush true if purging keys is disabled.
-   * @return the reference to the current option.
-   */
-  public Options setPurgeRedundantKvsWhileFlush(
-      boolean purgeRedundantKvsWhileFlush) {
-    setPurgeRedundantKvsWhileFlush(
-        nativeHandle_, purgeRedundantKvsWhileFlush);
-    return this;
-  }
-  private native void setPurgeRedundantKvsWhileFlush(
-      long handle, boolean purgeRedundantKvsWhileFlush);
-
-  /**
-   * This is used to close a block before it reaches the configured
-   * 'block_size'. If the percentage of free space in the current block is less
-   * than this specified number and adding a new record to the block will
-   * exceed the configured block size, then this block will be closed and the
-   * new record will be written to the next block.
-   * Default is 10.
-   *
-   * @return the target block size
-   */
-  public int blockSizeDeviation() {
-    return blockSizeDeviation(nativeHandle_);
-  }
-  private native int blockSizeDeviation(long handle);
-
-  /**
-   * This is used to close a block before it reaches the configured
-   * 'block_size'. If the percentage of free space in the current block is less
-   * than this specified number and adding a new record to the block will
-   * exceed the configured block size, then this block will be closed and the
-   * new record will be written to the next block.
-   * Default is 10.
-   *
-   * @param blockSizeDeviation the target block size
-   * @return the reference to the current option.
-   */
-  public Options setBlockSizeDeviation(int blockSizeDeviation) {
-    setBlockSizeDeviation(nativeHandle_, blockSizeDeviation);
-    return this;
-  }
-  private native void setBlockSizeDeviation(
-      long handle, int blockSizeDeviation);
-
-  /**
-   * If true, compaction will verify checksum on every read that happens
-   * as part of compaction
-   * Default: true
-   *
-   * @return true if compaction verifies checksum on every read.
-   */
-  public boolean verifyChecksumsInCompaction() {
-    return verifyChecksumsInCompaction(nativeHandle_);
-  }
-  private native boolean verifyChecksumsInCompaction(long handle);
-
-  /**
-   * If true, compaction will verify checksum on every read that happens
-   * as part of compaction
-   * Default: true
-   *
-   * @param verifyChecksumsInCompaction true if compaction verifies
-   *     checksum on every read.
-   * @return the reference to the current option.
-   */
-  public Options setVerifyChecksumsInCompaction(
-      boolean verifyChecksumsInCompaction) {
-    setVerifyChecksumsInCompaction(
-        nativeHandle_, verifyChecksumsInCompaction);
-    return this;
-  }
-  private native void setVerifyChecksumsInCompaction(
-      long handle, boolean verifyChecksumsInCompaction);
-
-  /**
-   * Use KeyMayExist API to filter deletes when this is true.
-   * If KeyMayExist returns false, i.e. the key definitely does not exist, then
-   * the delete is a noop. KeyMayExist only incurs in-memory look up.
-   * This optimization avoids writing the delete to storage when appropriate.
-   * Default: false
-   *
-   * @return true if filter-deletes behavior is on.
-   */
-  public boolean filterDeletes() {
-    return filterDeletes(nativeHandle_);
-  }
-  private native boolean filterDeletes(long handle);
-
-  /**
-   * Use KeyMayExist API to filter deletes when this is true.
-   * If KeyMayExist returns false, i.e. the key definitely does not exist, then
-   * the delete is a noop. KeyMayExist only incurs in-memory look up.
-   * This optimization avoids writing the delete to storage when appropriate.
-   * Default: false
-   *
-   * @param filterDeletes true if filter-deletes behavior is on.
-   * @return the reference to the current option.
-   */
-  public Options setFilterDeletes(boolean filterDeletes) {
-    setFilterDeletes(nativeHandle_, filterDeletes);
-    return this;
-  }
-  private native void setFilterDeletes(
-      long handle, boolean filterDeletes);
-
-  /**
-   * An iteration->Next() sequentially skips over keys with the same
-   * user-key unless this option is set. This number specifies the number
-   * of keys (with the same userkey) that will be sequentially
-   * skipped before a reseek is issued.
-   * Default: 8
-   *
-   * @return the number of keys could be skipped in a iteration.
-   */
-  public long maxSequentialSkipInIterations() {
-    return maxSequentialSkipInIterations(nativeHandle_);
-  }
-  private native long maxSequentialSkipInIterations(long handle);
-
-  /**
-   * An iteration->Next() sequentially skips over keys with the same
-   * user-key unless this option is set. This number specifies the number
-   * of keys (with the same userkey) that will be sequentially
-   * skipped before a reseek is issued.
-   * Default: 8
-   *
-   * @param maxSequentialSkipInIterations the number of keys could
-   *     be skipped in a iteration.
-   * @return the reference to the current option.
-   */
-  public Options setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) {
-    setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations);
-    return this;
-  }
-  private native void setMaxSequentialSkipInIterations(
-      long handle, long maxSequentialSkipInIterations);
-
-  /**
-   * Allows thread-safe inplace updates.
-   * If inplace_callback function is not set,
-   *   Put(key, new_value) will update inplace the existing_value iff
-   *   * key exists in current memtable
-   *   * new sizeof(new_value) <= sizeof(existing_value)
-   *   * existing_value for that key is a put i.e. kTypeValue
-   * If inplace_callback function is set, check doc for inplace_callback.
-   * Default: false.
-   *
-   * @return true if thread-safe inplace updates are allowed.
-   */
-  public boolean inplaceUpdateSupport() {
-    return inplaceUpdateSupport(nativeHandle_);
-  }
-  private native boolean inplaceUpdateSupport(long handle);
-
-  /**
-   * Allows thread-safe inplace updates.
-   * If inplace_callback function is not set,
-   *   Put(key, new_value) will update inplace the existing_value iff
-   *   * key exists in current memtable
-   *   * new sizeof(new_value) <= sizeof(existing_value)
-   *   * existing_value for that key is a put i.e. kTypeValue
-   * If inplace_callback function is set, check doc for inplace_callback.
-   * Default: false.
-   *
-   * @param inplaceUpdateSupport true if thread-safe inplace updates
-   *     are allowed.
-   * @return the reference to the current option.
-   */
-  public Options setInplaceUpdateSupport(boolean inplaceUpdateSupport) {
-    setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport);
-    return this;
-  }
-  private native void setInplaceUpdateSupport(
-      long handle, boolean inplaceUpdateSupport);
-
-  /**
-   * Number of locks used for inplace update
-   * Default: 10000, if inplace_update_support = true, else 0.
-   *
-   * @return the number of locks used for inplace update.
-   */
-  public long inplaceUpdateNumLocks() {
-    return inplaceUpdateNumLocks(nativeHandle_);
-  }
-  private native long inplaceUpdateNumLocks(long handle);
-
-  /**
-   * Number of locks used for inplace update
-   * Default: 10000, if inplace_update_support = true, else 0.
-   *
-   * @param inplaceUpdateNumLocks the number of locks used for
-   *     inplace updates.
-   * @return the reference to the current option.
-   */
-  public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) {
-    setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks);
-    return this;
-  }
-  private native void setInplaceUpdateNumLocks(
-      long handle, long inplaceUpdateNumLocks);
-
-  /**
-   * Returns the number of bits used in the prefix bloom filter.
-   *
-   * This value will be used only when a prefix-extractor is specified.
-   *
-   * @return the number of bloom-bits.
-   * @see useFixedLengthPrefixExtractor()
-   */
-  public int memtablePrefixBloomBits() {
-    return memtablePrefixBloomBits(nativeHandle_);
-  }
-  private native int memtablePrefixBloomBits(long handle);
-
-  /**
-   * Sets the number of bits used in the prefix bloom filter.
-   *
-   * This value will be used only when a prefix-extractor is specified.
-   *
-   * @param memtablePrefixBloomBits the number of bits used in the
-   *     prefix bloom filter.
-   * @return the reference to the current option.
-   */
-  public Options setMemtablePrefixBloomBits(int memtablePrefixBloomBits) {
-    setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits);
-    return this;
-  }
-  private native void setMemtablePrefixBloomBits(
-      long handle, int memtablePrefixBloomBits);
-
-  /**
-   * The number of hash probes per key used in the mem-table.
-   *
-   * @return the number of hash probes per key.
-   */
-  public int memtablePrefixBloomProbes() {
-    return memtablePrefixBloomProbes(nativeHandle_);
-  }
-  private native int memtablePrefixBloomProbes(long handle);
-
-  /**
-   * The number of hash probes per key used in the mem-table.
-   *
-   * @param memtablePrefixBloomProbes the number of hash probes per key.
-   * @return the reference to the current option.
-   */
-  public Options setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) {
-    setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes);
-    return this;
-  }
-  private native void setMemtablePrefixBloomProbes(
-      long handle, int memtablePrefixBloomProbes);
-
-  /**
-   * Control locality of bloom filter probes to improve cache miss rate.
-   * This option only applies to memtable prefix bloom and plaintable
-   * prefix bloom. It essentially limits the max number of cache lines each
-   * bloom filter check can touch.
-   * This optimization is turned off when set to 0. The number should never
-   * be greater than number of probes. This option can boost performance
-   * for in-memory workload but should use with care since it can cause
-   * higher false positive rate.
-   * Default: 0
-   *
-   * @return the level of locality of bloom-filter probes.
-   * @see setMemTablePrefixBloomProbes
-   */
-  public int bloomLocality() {
-    return bloomLocality(nativeHandle_);
-  }
-  private native int bloomLocality(long handle);
-
-  /**
-   * Control locality of bloom filter probes to improve cache miss rate.
-   * This option only applies to memtable prefix bloom and plaintable
-   * prefix bloom. It essentially limits the max number of cache lines each
-   * bloom filter check can touch.
-   * This optimization is turned off when set to 0. The number should never
-   * be greater than number of probes. This option can boost performance
-   * for in-memory workload but should use with care since it can cause
-   * higher false positive rate.
-   * Default: 0
-   *
-   * @param bloomLocality the level of locality of bloom-filter probes.
-   * @return the reference to the current option.
-   */
-  public Options setBloomLocality(int bloomLocality) {
-    setBloomLocality(nativeHandle_, bloomLocality);
-    return this;
-  }
-  private native void setBloomLocality(
-      long handle, int bloomLocality);
-
-  /**
-   * Maximum number of successive merge operations on a key in the memtable.
-   *
-   * When a merge operation is added to the memtable and the maximum number of
-   * successive merges is reached, the value of the key will be calculated and
-   * inserted into the memtable instead of the merge operation. This will
-   * ensure that there are never more than max_successive_merges merge
-   * operations in the memtable.
-   *
-   * Default: 0 (disabled)
-   *
-   * @return the maximum number of successive merges.
-   */
-  public long maxSuccessiveMerges() {
-    return maxSuccessiveMerges(nativeHandle_);
-  }
-  private native long maxSuccessiveMerges(long handle);
-
-  /**
-   * Maximum number of successive merge operations on a key in the memtable.
-   *
-   * When a merge operation is added to the memtable and the maximum number of
-   * successive merges is reached, the value of the key will be calculated and
-   * inserted into the memtable instead of the merge operation. This will
-   * ensure that there are never more than max_successive_merges merge
-   * operations in the memtable.
-   *
-   * Default: 0 (disabled)
-   *
-   * @param maxSuccessiveMerges the maximum number of successive merges.
-   * @return the reference to the current option.
-   */
-  public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) {
-    setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges);
-    return this;
-  }
-  private native void setMaxSuccessiveMerges(
-      long handle, long maxSuccessiveMerges);
-
-  /**
-   * The minimum number of write buffers that will be merged together
-   * before writing to storage.  If set to 1, then
-   * all write buffers are fushed to L0 as individual files and this increases
-   * read amplification because a get request has to check in all of these
-   * files. Also, an in-memory merge may result in writing lesser
-   * data to storage if there are duplicate records in each of these
-   * individual write buffers.  Default: 1
-   *
-   * @return the minimum number of write buffers that will be merged together.
-   */
-  public int minWriteBufferNumberToMerge() {
-    return minWriteBufferNumberToMerge(nativeHandle_);
-  }
-  private native int minWriteBufferNumberToMerge(long handle);
-
-  /**
-   * The minimum number of write buffers that will be merged together
-   * before writing to storage.  If set to 1, then
-   * all write buffers are fushed to L0 as individual files and this increases
-   * read amplification because a get request has to check in all of these
-   * files. Also, an in-memory merge may result in writing lesser
-   * data to storage if there are duplicate records in each of these
-   * individual write buffers.  Default: 1
-   *
-   * @param minWriteBufferNumberToMerge the minimum number of write buffers
-   *     that will be merged together.
-   * @return the reference to the current option.
-   */
-  public Options setMinWriteBufferNumberToMerge(int minWriteBufferNumberToMerge) {
-    setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge);
-    return this;
-  }
-  private native void setMinWriteBufferNumberToMerge(
-      long handle, int minWriteBufferNumberToMerge);
-
-  /**
-   * The number of partial merge operands to accumulate before partial
-   * merge will be performed. Partial merge will not be called
-   * if the list of values to merge is less than min_partial_merge_operands.
-   *
-   * If min_partial_merge_operands < 2, then it will be treated as 2.
-   *
-   * Default: 2
-   *
-   * @return
-   */
-  public int minPartialMergeOperands() {
-    return minPartialMergeOperands(nativeHandle_);
-  }
-  private native int minPartialMergeOperands(long handle);
-
-  /**
-   * The number of partial merge operands to accumulate before partial
-   * merge will be performed. Partial merge will not be called
-   * if the list of values to merge is less than min_partial_merge_operands.
-   *
-   * If min_partial_merge_operands < 2, then it will be treated as 2.
-   *
-   * Default: 2
-   *
-   * @param minPartialMergeOperands
-   * @return the reference to the current option.
-   */
-  public Options setMinPartialMergeOperands(int minPartialMergeOperands) {
-    setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands);
-    return this;
-  }
-  private native void setMinPartialMergeOperands(
-      long handle, int minPartialMergeOperands);
-
-  /**
-   * Release the memory allocated for the current instance
-   * in the c++ side.
-   */
-  @Override public synchronized void dispose() {
-    if (isInitialized()) {
-      dispose0();
-    }
-  }
-
-  static final int DEFAULT_PLAIN_TABLE_BLOOM_BITS_PER_KEY = 10;
-  static final double DEFAULT_PLAIN_TABLE_HASH_TABLE_RATIO = 0.75;
-  static final int DEFAULT_PLAIN_TABLE_INDEX_SPARSENESS = 16;
-
-  private native void newOptions();
-  private native void dispose0();
-  private native void setCreateIfMissing(long handle, boolean flag);
-  private native boolean createIfMissing(long handle);
-  private native void setWriteBufferSize(long handle, long writeBufferSize);
-  private native long writeBufferSize(long handle);
-  private native void setMaxWriteBufferNumber(
-      long handle, int maxWriteBufferNumber);
-  private native int maxWriteBufferNumber(long handle);
-  private native void setBlockSize(long handle, long blockSize);
-  private native long blockSize(long handle);
-  private native void setDisableSeekCompaction(
-      long handle, boolean disableSeekCompaction);
-  private native boolean disableSeekCompaction(long handle);
-  private native void setMaxBackgroundCompactions(
-      long handle, int maxBackgroundCompactions);
-  private native int maxBackgroundCompactions(long handle);
-  private native void createStatistics(long optHandle);
-  private native long statisticsPtr(long optHandle);
-
-  private native void setMemTableFactory(long handle, long factoryHandle);
-  private native String memTableFactoryName(long handle);
-
-  private native void setTableFactory(long handle, long factoryHandle);
-  private native String tableFactoryName(long handle);
-
-  private native void useFixedLengthPrefixExtractor(
-      long handle, int prefixLength);
-
-  long cacheSize_;
-  Filter filter_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/PlainTableConfig.java b/src/rocksdb/java/org/rocksdb/PlainTableConfig.java
deleted file mode 100644
index 554ce38..0000000
--- a/src/rocksdb/java/org/rocksdb/PlainTableConfig.java
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-package org.rocksdb;
-
-/**
- * The config for plain table sst format.
- *
- * PlainTable is a RocksDB's SST file format optimized for low query latency
- * on pure-memory or really low-latency media.  It also support prefix
- * hash feature.
- */
-public class PlainTableConfig extends TableFormatConfig {
-  public static final int VARIABLE_LENGTH = 0;
-  public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10;
-  public static final double DEFAULT_HASH_TABLE_RATIO = 0.75;
-  public static final int DEFAULT_INDEX_SPARSENESS = 16;
-
-  public PlainTableConfig() {
-    keySize_ = VARIABLE_LENGTH;
-    bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY;
-    hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO;
-    indexSparseness_ = DEFAULT_INDEX_SPARSENESS;
-  }
-
-  /**
-   * Set the length of the user key. If it is set to be VARIABLE_LENGTH,
-   * then it indicates the user keys are variable-lengthed.  Otherwise,
-   * all the keys need to have the same length in byte.
-   * DEFAULT: VARIABLE_LENGTH
-   *
-   * @param keySize the length of the user key.
-   * @return the reference to the current config.
-   */
-  public PlainTableConfig setKeySize(int keySize) {
-    keySize_ = keySize;
-    return this;
-  }
-
-  /**
-   * @return the specified size of the user key.  If VARIABLE_LENGTH,
-   *     then it indicates variable-length key.
-   */
-  public int keySize() {
-    return keySize_;
-  }
-
-  /**
-   * Set the number of bits per key used by the internal bloom filter
-   * in the plain table sst format.
-   *
-   * @param bitsPerKey the number of bits per key for bloom filer.
-   * @return the reference to the current config.
-   */
-  public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) {
-    bloomBitsPerKey_ = bitsPerKey;
-    return this;
-  }
-
-  /**
-   * @return the number of bits per key used for the bloom filter.
-   */
-  public int bloomBitsPerKey() {
-    return bloomBitsPerKey_;
-  }
-
-  /**
-   * hashTableRatio is the desired utilization of the hash table used
-   * for prefix hashing.  The ideal ratio would be the number of
-   * prefixes / the number of hash buckets.  If this value is set to
-   * zero, then hash table will not be used.
-   *
-   * @param ratio the hash table ratio.
-   * @return the reference to the current config.
-   */
-  public PlainTableConfig setHashTableRatio(double ratio) {
-    hashTableRatio_ = ratio;
-    return this;
-  }
-
-  /**
-   * @return the hash table ratio.
-   */
-  public double hashTableRatio() {
-    return hashTableRatio_;
-  }
-
-  /**
-   * Index sparseness determines the index interval for keys inside the
-   * same prefix.  This number is equal to the maximum number of linear
-   * search required after hash and binary search.  If it's set to 0,
-   * then each key will be indexed.
-   *
-   * @param sparseness the index sparseness.
-   * @return the reference to the current config.
-   */
-  public PlainTableConfig setIndexSparseness(int sparseness) {
-    indexSparseness_ = sparseness;
-    return this;
-  }
-
-  /**
-   * @return the index sparseness.
-   */
-  public int indexSparseness() {
-    return indexSparseness_;
-  }
-
-  @Override protected long newTableFactoryHandle() {
-    return newTableFactoryHandle(keySize_, bloomBitsPerKey_,
-        hashTableRatio_, indexSparseness_);
-  }
-
-  private native long newTableFactoryHandle(
-      int keySize, int bloomBitsPerKey,
-      double hashTableRatio, int indexSparseness);
-
-  private int keySize_;
-  private int bloomBitsPerKey_;
-  private double hashTableRatio_;
-  private int indexSparseness_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/ReadOptions.java b/src/rocksdb/java/org/rocksdb/ReadOptions.java
deleted file mode 100644
index 23250fc..0000000
--- a/src/rocksdb/java/org/rocksdb/ReadOptions.java
+++ /dev/null
@@ -1,130 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * The class that controls the get behavior.
- *
- * Note that dispose() must be called before an Options instance
- * become out-of-scope to release the allocated memory in c++.
- */
-public class ReadOptions extends RocksObject {
-  public ReadOptions() {
-    super();
-    newReadOptions();
-  }
-  private native void newReadOptions();
-
-  /**
-   * Release the memory allocated for the current instance
-   * in the c++ side.
-   *
-   * Calling other methods after dispose() leads to undefined behavior.
-   */
-  @Override public synchronized void dispose() {
-    if (isInitialized()) {
-      dispose(nativeHandle_);
-    }
-  }
-  private native void dispose(long handle);
-
-  /**
-   * If true, all data read from underlying storage will be
-   * verified against corresponding checksums.
-   * Default: true
-   *
-   * @return true if checksum verification is on.
-   */
-  public boolean verifyChecksums() {
-    assert(isInitialized());
-    return verifyChecksums(nativeHandle_);
-  }
-  private native boolean verifyChecksums(long handle);
-
-  /**
-   * If true, all data read from underlying storage will be
-   * verified against corresponding checksums.
-   * Default: true
-   *
-   * @param verifyChecksums if true, then checksum verification
-   *     will be performed on every read.
-   * @return the reference to the current ReadOptions.
-   */
-  public ReadOptions setVerifyChecksums(boolean verifyChecksums) {
-    assert(isInitialized());
-    setVerifyChecksums(nativeHandle_, verifyChecksums);
-    return this;
-  }
-  private native void setVerifyChecksums(
-      long handle, boolean verifyChecksums);
-
-  // TODO(yhchiang): this option seems to be block-based table only.
-  //                 move this to a better place?
-  /**
-   * Fill the cache when loading the block-based sst formated db.
-   * Callers may wish to set this field to false for bulk scans.
-   * Default: true
-   *
-   * @return true if the fill-cache behavior is on.
-   */
-  public boolean fillCache() {
-    assert(isInitialized());
-    return fillCache(nativeHandle_);
-  }
-  private native boolean fillCache(long handle);
-
-  /**
-   * Fill the cache when loading the block-based sst formated db.
-   * Callers may wish to set this field to false for bulk scans.
-   * Default: true
-   *
-   * @param fillCache if true, then fill-cache behavior will be
-   *     performed.
-   * @return the reference to the current ReadOptions.
-   */
-  public ReadOptions setFillCache(boolean fillCache) {
-    assert(isInitialized());
-    setFillCache(nativeHandle_, fillCache);
-    return this;
-  }
-  private native void setFillCache(
-      long handle, boolean fillCache);
-
-  /**
-   * Specify to create a tailing iterator -- a special iterator that has a
-   * view of the complete database (i.e. it can also be used to read newly
-   * added data) and is optimized for sequential reads. It will return records
-   * that were inserted into the database after the creation of the iterator.
-   * Default: false
-   * Not supported in ROCKSDB_LITE mode!
-   *
-   * @return true if tailing iterator is enabled.
-   */
-  public boolean tailing() {
-    assert(isInitialized());
-    return tailing(nativeHandle_);
-  }
-  private native boolean tailing(long handle);
-
-  /**
-   * Specify to create a tailing iterator -- a special iterator that has a
-   * view of the complete database (i.e. it can also be used to read newly
-   * added data) and is optimized for sequential reads. It will return records
-   * that were inserted into the database after the creation of the iterator.
-   * Default: false
-   * Not supported in ROCKSDB_LITE mode!
-   *
-   * @param tailing if true, then tailing iterator will be enabled.
-   * @return the reference to the current ReadOptions.
-   */
-  public ReadOptions setTailing(boolean tailing) {
-    assert(isInitialized());
-    setTailing(nativeHandle_, tailing);
-    return this;
-  }
-  private native void setTailing(
-      long handle, boolean tailing);
-}
diff --git a/src/rocksdb/java/org/rocksdb/RocksDB.java b/src/rocksdb/java/org/rocksdb/RocksDB.java
deleted file mode 100644
index e92acea..0000000
--- a/src/rocksdb/java/org/rocksdb/RocksDB.java
+++ /dev/null
@@ -1,376 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-import java.util.List;
-import java.util.Map;
-import java.util.HashMap;
-import java.io.Closeable;
-import java.io.IOException;
-import org.rocksdb.util.Environment;
-
-/**
- * A RocksDB is a persistent ordered map from keys to values.  It is safe for
- * concurrent access from multiple threads without any external synchronization.
- * All methods of this class could potentially throw RocksDBException, which
- * indicates sth wrong at the rocksdb library side and the call failed.
- */
-public class RocksDB extends RocksObject {
-  public static final int NOT_FOUND = -1;
-  private static final String[] compressionLibs_ = {
-      "snappy", "zlib", "bzip2", "lz4", "lz4hc"};
-
-  /**
-   * Loads the necessary library files.
-   * Calling this method twice will have no effect.
-   */
-  public static synchronized void loadLibrary() {
-    // loading possibly necessary libraries.
-    for (String lib : compressionLibs_) {
-      try {
-      System.loadLibrary(lib);
-      } catch (UnsatisfiedLinkError e) {
-        // since it may be optional, we ignore its loading failure here.
-      }
-    }
-    // However, if any of them is required.  We will see error here.
-    System.loadLibrary("rocksdbjni");
-  }
-
-  /**
-   * Tries to load the necessary library files from the given list of
-   * directories.
-   *
-   * @param paths a list of strings where each describes a directory
-   *     of a library.
-   */
-  public static synchronized void loadLibrary(List<String> paths) {
-    for (String lib : compressionLibs_) {
-      for (String path : paths) {
-        try {
-          System.load(path + "/" + Environment.getSharedLibraryName(lib));
-          break;
-        } catch (UnsatisfiedLinkError e) {
-          // since they are optional, we ignore loading fails.
-        }
-      }
-    }
-    boolean success = false;
-    UnsatisfiedLinkError err = null;
-    for (String path : paths) {
-      try {
-        System.load(path + "/" + Environment.getJniLibraryName("rocksdbjni"));
-        success = true;
-        break;
-      } catch (UnsatisfiedLinkError e) {
-        err = e;
-      }
-    }
-    if (success == false) {
-      throw err;
-    }
-  }
-
-  /**
-   * The factory constructor of RocksDB that opens a RocksDB instance given
-   * the path to the database using the default options w/ createIfMissing
-   * set to true.
-   *
-   * @param path the path to the rocksdb.
-   * @param status an out value indicating the status of the Open().
-   * @return a rocksdb instance on success, null if the specified rocksdb can
-   *     not be opened.
-   *
-   * @see Options.setCreateIfMissing()
-   * @see Options.createIfMissing()
-   */
-  public static RocksDB open(String path) throws RocksDBException {
-    RocksDB db = new RocksDB();
-
-    // This allows to use the rocksjni default Options instead of
-    // the c++ one.
-    Options options = new Options();
-    db.open(options.nativeHandle_, options.cacheSize_, path);
-    db.transferCppRawPointersOwnershipFrom(options);
-    options.dispose();
-    return db;
-  }
-
-  /**
-   * The factory constructor of RocksDB that opens a RocksDB instance given
-   * the path to the database using the specified options and db path.
-   */
-  public static RocksDB open(Options options, String path)
-      throws RocksDBException {
-    // when non-default Options is used, keeping an Options reference
-    // in RocksDB can prevent Java to GC during the life-time of
-    // the currently-created RocksDB.
-    RocksDB db = new RocksDB();
-    db.open(options.nativeHandle_, options.cacheSize_, path);
-    db.transferCppRawPointersOwnershipFrom(options);
-    return db;
-  }
-
-  @Override public synchronized void dispose() {
-    if (isInitialized()) {
-      dispose(nativeHandle_);
-      nativeHandle_ = 0;
-    }
-  }
-
-  /**
-   * Close the RocksDB instance.
-   * This function is equivalent to dispose().
-   */
-  public void close() {
-    dispose();
-  }
-
-  /**
-   * Set the database entry for "key" to "value".
-   *
-   * @param key the specified key to be inserted.
-   * @param value the value associated with the specified key.
-   */
-  public void put(byte[] key, byte[] value) throws RocksDBException {
-    put(nativeHandle_, key, key.length, value, value.length);
-  }
-
-  /**
-   * Set the database entry for "key" to "value".
-   *
-   * @param key the specified key to be inserted.
-   * @param value the value associated with the specified key.
-   */
-  public void put(WriteOptions writeOpts, byte[] key, byte[] value)
-      throws RocksDBException {
-    put(nativeHandle_, writeOpts.nativeHandle_,
-        key, key.length, value, value.length);
-  }
-
-  /**
-   * Apply the specified updates to the database.
-   */
-  public void write(WriteOptions writeOpts, WriteBatch updates)
-      throws RocksDBException {
-    write(writeOpts.nativeHandle_, updates.nativeHandle_);
-  }
-
-  /**
-   * Get the value associated with the specified key.
-   *
-   * @param key the key to retrieve the value.
-   * @param value the out-value to receive the retrieved value.
-   * @return The size of the actual value that matches the specified
-   *     {@code key} in byte.  If the return value is greater than the
-   *     length of {@code value}, then it indicates that the size of the
-   *     input buffer {@code value} is insufficient and partial result will
-   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
-   *     found.
-   */
-  public int get(byte[] key, byte[] value) throws RocksDBException {
-    return get(nativeHandle_, key, key.length, value, value.length);
-  }
-
-  /**
-   * Get the value associated with the specified key.
-   *
-   * @param key the key to retrieve the value.
-   * @param value the out-value to receive the retrieved value.
-   * @return The size of the actual value that matches the specified
-   *     {@code key} in byte.  If the return value is greater than the
-   *     length of {@code value}, then it indicates that the size of the
-   *     input buffer {@code value} is insufficient and partial result will
-   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
-   *     found.
-   */
-  public int get(ReadOptions opt, byte[] key, byte[] value)
-      throws RocksDBException {
-    return get(nativeHandle_, opt.nativeHandle_,
-               key, key.length, value, value.length);
-  }
-
-  /**
-   * The simplified version of get which returns a new byte array storing
-   * the value associated with the specified input key if any.  null will be
-   * returned if the specified key is not found.
-   *
-   * @param key the key retrieve the value.
-   * @return a byte array storing the value associated with the input key if
-   *     any.  null if it does not find the specified key.
-   *
-   * @see RocksDBException
-   */
-  public byte[] get(byte[] key) throws RocksDBException {
-    return get(nativeHandle_, key, key.length);
-  }
-
-  /**
-   * The simplified version of get which returns a new byte array storing
-   * the value associated with the specified input key if any.  null will be
-   * returned if the specified key is not found.
-   *
-   * @param key the key retrieve the value.
-   * @param opt Read options.
-   * @return a byte array storing the value associated with the input key if
-   *     any.  null if it does not find the specified key.
-   *
-   * @see RocksDBException
-   */
-  public byte[] get(ReadOptions opt, byte[] key) throws RocksDBException {
-    return get(nativeHandle_, opt.nativeHandle_, key, key.length);
-  }
-
-  /**
-   * Returns a map of keys for which values were found in DB.
-   *
-   * @param keys List of keys for which values need to be retrieved.
-   * @return Map where key of map is the key passed by user and value for map
-   * entry is the corresponding value in DB.
-   *
-   * @see RocksDBException
-   */
-  public Map<byte[], byte[]> multiGet(List<byte[]> keys)
-      throws RocksDBException {
-    assert(keys.size() != 0);
-
-    List<byte[]> values = multiGet(
-        nativeHandle_, keys, keys.size());
-
-    Map<byte[], byte[]> keyValueMap = new HashMap<byte[], byte[]>();
-    for(int i = 0; i < values.size(); i++) {
-      if(values.get(i) == null) {
-        continue;
-      }
-
-      keyValueMap.put(keys.get(i), values.get(i));
-    }
-
-    return keyValueMap;
-  }
-
-
-  /**
-   * Returns a map of keys for which values were found in DB.
-   *
-   * @param List of keys for which values need to be retrieved.
-   * @param opt Read options.
-   * @return Map where key of map is the key passed by user and value for map
-   * entry is the corresponding value in DB.
-   *
-   * @see RocksDBException
-   */
-  public Map<byte[], byte[]> multiGet(ReadOptions opt, List<byte[]> keys)
-      throws RocksDBException {
-    assert(keys.size() != 0);
-
-    List<byte[]> values = multiGet(
-        nativeHandle_, opt.nativeHandle_, keys, keys.size());
-
-    Map<byte[], byte[]> keyValueMap = new HashMap<byte[], byte[]>();
-    for(int i = 0; i < values.size(); i++) {
-      if(values.get(i) == null) {
-        continue;
-      }
-
-      keyValueMap.put(keys.get(i), values.get(i));
-    }
-
-    return keyValueMap;
-  }
-
-  /**
-   * Remove the database entry (if any) for "key".  Returns OK on
-   * success, and a non-OK status on error.  It is not an error if "key"
-   * did not exist in the database.
-   */
-  public void remove(byte[] key) throws RocksDBException {
-    remove(nativeHandle_, key, key.length);
-  }
-
-  /**
-   * Remove the database entry (if any) for "key".  Returns OK on
-   * success, and a non-OK status on error.  It is not an error if "key"
-   * did not exist in the database.
-   */
-  public void remove(WriteOptions writeOpt, byte[] key)
-      throws RocksDBException {
-    remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length);
-  }
-
-  /**
-   * Return a heap-allocated iterator over the contents of the database.
-   * The result of newIterator() is initially invalid (caller must
-   * call one of the Seek methods on the iterator before using it).
-   *
-   * Caller should close the iterator when it is no longer needed.
-   * The returned iterator should be closed before this db is closed.
-   *
-   * @return instance of iterator object.
-   */
-  public Iterator newIterator() {
-    return new Iterator(iterator0(nativeHandle_));
-  }
-
-  @Override protected void finalize() {
-    close();
-  }
-
-  /**
-   * Private constructor.
-   */
-  protected RocksDB() {
-    super();
-  }
-
-  /**
-   * Transfer the ownership of all c++ raw-pointers from Options
-   * to RocksDB to ensure the life-time of those raw-pointers
-   * will be at least as long as the life-time of any RocksDB
-   * that uses these raw-pointers.
-   */
-  protected void transferCppRawPointersOwnershipFrom(Options opt) {
-    filter_ = opt.filter_;
-    opt.filter_ = null;
-  }
-
-  // native methods
-  protected native void open(
-      long optionsHandle, long cacheSize, String path) throws RocksDBException;
-  protected native void put(
-      long handle, byte[] key, int keyLen,
-      byte[] value, int valueLen) throws RocksDBException;
-  protected native void put(
-      long handle, long writeOptHandle,
-      byte[] key, int keyLen,
-      byte[] value, int valueLen) throws RocksDBException;
-  protected native void write(
-      long writeOptHandle, long batchHandle) throws RocksDBException;
-  protected native int get(
-      long handle, byte[] key, int keyLen,
-      byte[] value, int valueLen) throws RocksDBException;
-  protected native int get(
-      long handle, long readOptHandle, byte[] key, int keyLen,
-      byte[] value, int valueLen) throws RocksDBException;
-  protected native List<byte[]> multiGet(
-      long dbHandle, List<byte[]> keys, int keysCount);
-  protected native List<byte[]> multiGet(
-      long dbHandle, long rOptHandle, List<byte[]> keys, int keysCount);
-  protected native byte[] get(
-      long handle, byte[] key, int keyLen) throws RocksDBException;
-  protected native byte[] get(
-      long handle, long readOptHandle,
-      byte[] key, int keyLen) throws RocksDBException;
-  protected native void remove(
-      long handle, byte[] key, int keyLen) throws RocksDBException;
-  protected native void remove(
-      long handle, long writeOptHandle,
-      byte[] key, int keyLen) throws RocksDBException;
-  protected native long iterator0(long optHandle);
-  protected native void dispose(long handle);
-
-  protected Filter filter_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/RocksDBException.java b/src/rocksdb/java/org/rocksdb/RocksDBException.java
deleted file mode 100644
index acc9366..0000000
--- a/src/rocksdb/java/org/rocksdb/RocksDBException.java
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-import java.util.*;
-
-/**
- * A RocksDBException encapsulates the error of an operation.  This exception
- * type is used to describe an internal error from the c++ rocksdb library.
- */
-public class RocksDBException extends Exception {
-  /**
-   * The private construct used by a set of public static factory method.
-   *
-   * @param msg the specified error message.
-   */
-  public RocksDBException(String msg) {
-    super(msg);
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/RocksObject.java b/src/rocksdb/java/org/rocksdb/RocksObject.java
deleted file mode 100644
index 6e36cba..0000000
--- a/src/rocksdb/java/org/rocksdb/RocksObject.java
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * RocksObject is the base-class of all RocksDB related class that has
- * a pointer to some c++ rocksdb object.  Although RocksObject
- * will release its c++ resource on its finalize() once it has been
- * garbage-collected, it is suggested to call dispose() manually to
- * release its c++ resource once an instance of RocksObject is no
- * longer used.
- */
-public abstract class RocksObject {
-  protected RocksObject() {
-    nativeHandle_ = 0;
-  }
-
-  /**
-   * Release the c++ object pointed by the native handle.
-   */
-  public abstract void dispose();
-
-  protected boolean isInitialized() {
-    return (nativeHandle_ != 0);
-  }
-
-  @Override protected void finalize() {
-    dispose();
-  }
-
-  protected long nativeHandle_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/SkipListMemTableConfig.java b/src/rocksdb/java/org/rocksdb/SkipListMemTableConfig.java
deleted file mode 100644
index 7f9f5cb..0000000
--- a/src/rocksdb/java/org/rocksdb/SkipListMemTableConfig.java
+++ /dev/null
@@ -1,15 +0,0 @@
-package org.rocksdb;
-
-/**
- * The config for skip-list memtable representation.
- */
-public class SkipListMemTableConfig extends MemTableConfig {
-  public SkipListMemTableConfig() {
-  }
-
-  @Override protected long newMemTableFactoryHandle() {
-    return newMemTableFactoryHandle0();
-  }
-
-  private native long newMemTableFactoryHandle0();
-}
diff --git a/src/rocksdb/java/org/rocksdb/Statistics.java b/src/rocksdb/java/org/rocksdb/Statistics.java
deleted file mode 100644
index bed2b88..0000000
--- a/src/rocksdb/java/org/rocksdb/Statistics.java
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * Statistics to analyze the performance of a db. Pointer for statistics object
- * is managed by Options class.
- */
-public class Statistics {
-
-  private final long statsHandle_;
-
-  public Statistics(long statsHandle) {
-    statsHandle_ = statsHandle;
-  }
-
-  public long getTickerCount(TickerType tickerType) {
-    assert(isInitialized());
-    return getTickerCount0(tickerType.getValue(), statsHandle_);
-  }
-
-  public HistogramData geHistogramData(HistogramType histogramType) {
-    assert(isInitialized());
-    HistogramData hist = geHistogramData0(
-        histogramType.getValue(), statsHandle_);
-    return hist;
-  }
-
-  private boolean isInitialized() {
-    return (statsHandle_ != 0);
-  }
-
-  private native long getTickerCount0(int tickerType, long handle);
-  private native HistogramData geHistogramData0(int histogramType, long handle);
-}
diff --git a/src/rocksdb/java/org/rocksdb/TableFormatConfig.java b/src/rocksdb/java/org/rocksdb/TableFormatConfig.java
deleted file mode 100644
index e5c6341..0000000
--- a/src/rocksdb/java/org/rocksdb/TableFormatConfig.java
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-package org.rocksdb;
-
-/**
- * TableFormatConfig is used to config the internal Table format of a RocksDB.
- * To make a RocksDB to use a specific Table format, its associated
- * TableFormatConfig should be properly set and passed into Options via
- * Options.setTableFormatConfig() and open the db using that Options.
- */
-public abstract class TableFormatConfig {
-  /**
-   * This function should only be called by Options.setTableFormatConfig(),
-   * which will create a c++ shared-pointer to the c++ TableFactory
-   * that associated with the Java TableFormatConfig.
-   */
-  abstract protected long newTableFactoryHandle();
-}
diff --git a/src/rocksdb/java/org/rocksdb/TickerType.java b/src/rocksdb/java/org/rocksdb/TickerType.java
deleted file mode 100644
index 5ad714d..0000000
--- a/src/rocksdb/java/org/rocksdb/TickerType.java
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-public enum TickerType {
-  // total block cache misses
-  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
-  //                               BLOCK_CACHE_FILTER_MISS +
-  //                               BLOCK_CACHE_DATA_MISS;
-  BLOCK_CACHE_MISS(0),
-  // total block cache hit
-  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
-  //                              BLOCK_CACHE_FILTER_HIT +
-  //                              BLOCK_CACHE_DATA_HIT;
-  BLOCK_CACHE_HIT(1),
-  // # of blocks added to block cache.
-  BLOCK_CACHE_ADD(2),
-  // # of times cache miss when accessing index block from block cache.
-  BLOCK_CACHE_INDEX_MISS(3),
-  // # of times cache hit when accessing index block from block cache.
-  BLOCK_CACHE_INDEX_HIT(4),
-  // # of times cache miss when accessing filter block from block cache.
-  BLOCK_CACHE_FILTER_MISS(5),
-  // # of times cache hit when accessing filter block from block cache.
-  BLOCK_CACHE_FILTER_HIT(6),
-  // # of times cache miss when accessing data block from block cache.
-  BLOCK_CACHE_DATA_MISS(7),
-  // # of times cache hit when accessing data block from block cache.
-  BLOCK_CACHE_DATA_HIT(8),
-  // # of times bloom filter has avoided file reads.
-  BLOOM_FILTER_USEFUL(9),
-
-  // # of memtable hits.
-  MEMTABLE_HIT(10),
-  // # of memtable misses.
-  MEMTABLE_MISS(11),
-
-  /**
-   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
-   * There are 3 reasons currently.
-   */
-  COMPACTION_KEY_DROP_NEWER_ENTRY(12),  // key was written with a newer value.
-  COMPACTION_KEY_DROP_OBSOLETE(13),     // The key is obsolete.
-  COMPACTION_KEY_DROP_USER(14),  // user compaction function has dropped the key.
-
-  // Number of keys written to the database via the Put and Write call's
-  NUMBER_KEYS_WRITTEN(15),
-  // Number of Keys read,
-  NUMBER_KEYS_READ(16),
-  // Number keys updated, if inplace update is enabled
-  NUMBER_KEYS_UPDATED(17),
-  // Bytes written / read
-  BYTES_WRITTEN(18),
-  BYTES_READ(19),
-  NO_FILE_CLOSES(20),
-  NO_FILE_OPENS(21),
-  NO_FILE_ERRORS(22),
-  // Time system had to wait to do LO-L1 compactions
-  STALL_L0_SLOWDOWN_MICROS(23),
-  // Time system had to wait to move memtable to L1.
-  STALL_MEMTABLE_COMPACTION_MICROS(24),
-  // write throttle because of too many files in L0
-  STALL_L0_NUM_FILES_MICROS(25),
-  RATE_LIMIT_DELAY_MILLIS(26),
-  NO_ITERATORS(27),  // number of iterators currently open
-
-  // Number of MultiGet calls, keys read, and bytes read
-  NUMBER_MULTIGET_CALLS(28),
-  NUMBER_MULTIGET_KEYS_READ(29),
-  NUMBER_MULTIGET_BYTES_READ(30),
-
-  // Number of deletes records that were not required to be
-  // written to storage because key does not exist
-  NUMBER_FILTERED_DELETES(31),
-  NUMBER_MERGE_FAILURES(32),
-  SEQUENCE_NUMBER(33),
-
-  // number of times bloom was checked before creating iterator on a
-  // file, and the number of times the check was useful in avoiding
-  // iterator creation (and thus likely IOPs).
-  BLOOM_FILTER_PREFIX_CHECKED(34),
-  BLOOM_FILTER_PREFIX_USEFUL(35),
-
-  // Number of times we had to reseek inside an iteration to skip
-  // over large number of keys with same userkey.
-  NUMBER_OF_RESEEKS_IN_ITERATION(36),
-
-  // Record the number of calls to GetUpadtesSince. Useful to keep track of
-  // transaction log iterator refreshes
-  GET_UPDATES_SINCE_CALLS(37),
-  BLOCK_CACHE_COMPRESSED_MISS(38),  // miss in the compressed block cache
-  BLOCK_CACHE_COMPRESSED_HIT(39),   // hit in the compressed block cache
-  WAL_FILE_SYNCED(40),              // Number of times WAL sync is done
-  WAL_FILE_BYTES(41),               // Number of bytes written to WAL
-
-  // Writes can be processed by requesting thread or by the thread at the
-  // head of the writers queue.
-  WRITE_DONE_BY_SELF(42),
-  WRITE_DONE_BY_OTHER(43),
-  WRITE_WITH_WAL(44),       // Number of Write calls that request WAL
-  COMPACT_READ_BYTES(45),   // Bytes read during compaction
-  COMPACT_WRITE_BYTES(46),  // Bytes written during compaction
-
-  // Number of table's properties loaded directly from file, without creating
-  // table reader object.
-  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES(47),
-  NUMBER_SUPERVERSION_ACQUIRES(48),
-  NUMBER_SUPERVERSION_RELEASES(49),
-  NUMBER_SUPERVERSION_CLEANUPS(50);
-
-  private final int value_;
-
-  private TickerType(int value) {
-    value_ = value;
-  }
-
-  public int getValue() {
-    return value_;
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/VectorMemTableConfig.java b/src/rocksdb/java/org/rocksdb/VectorMemTableConfig.java
deleted file mode 100644
index b7a413f..0000000
--- a/src/rocksdb/java/org/rocksdb/VectorMemTableConfig.java
+++ /dev/null
@@ -1,40 +0,0 @@
-package org.rocksdb;
-
-/**
- * The config for vector memtable representation.
- */
-public class VectorMemTableConfig extends MemTableConfig {
-  public static final int DEFAULT_RESERVED_SIZE = 0;
-  public VectorMemTableConfig() {
-    reservedSize_ = DEFAULT_RESERVED_SIZE;
-  }
-
-  /**
-   * Set the initial size of the vector that will be used
-   * by the memtable created based on this config.
-   *
-   * @param size the initial size of the vector.
-   * @return the reference to the current config.
-   */
-  public VectorMemTableConfig setReservedSize(int size) {
-    reservedSize_ = size;
-    return this;
-  }
-
-  /**
-   * Returns the initial size of the vector used by the memtable
-   * created based on this config.
-   *
-   * @return the initial size of the vector.
-   */
-  public int reservedSize() {
-    return reservedSize_;
-  }
-
-  @Override protected long newMemTableFactoryHandle() {
-    return newMemTableFactoryHandle(reservedSize_);
-  }
-
-  private native long newMemTableFactoryHandle(long reservedSize);
-  private int reservedSize_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/WriteBatch.java b/src/rocksdb/java/org/rocksdb/WriteBatch.java
deleted file mode 100644
index 1ddbd44..0000000
--- a/src/rocksdb/java/org/rocksdb/WriteBatch.java
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-import java.util.*;
-
-/**
- * WriteBatch holds a collection of updates to apply atomically to a DB.
- *
- * The updates are applied in the order in which they are added
- * to the WriteBatch.  For example, the value of "key" will be "v3"
- * after the following batch is written:
- *
- *    batch.put("key", "v1");
- *    batch.remove("key");
- *    batch.put("key", "v2");
- *    batch.put("key", "v3");
- *
- * Multiple threads can invoke const methods on a WriteBatch without
- * external synchronization, but if any of the threads may call a
- * non-const method, all threads accessing the same WriteBatch must use
- * external synchronization.
- */
-public class WriteBatch extends RocksObject {
-  public WriteBatch() {
-    super();
-    newWriteBatch(0);
-  }
-
-  public WriteBatch(int reserved_bytes) {
-    nativeHandle_ = 0;
-    newWriteBatch(reserved_bytes);
-  }
-
-  /**
-   * Returns the number of updates in the batch.
-   */
-  public native int count();
-
-  /**
-   * Store the mapping "key->value" in the database.
-   */
-  public void put(byte[] key, byte[] value) {
-    put(key, key.length, value, value.length);
-  }
-
-  /**
-   * Merge "value" with the existing value of "key" in the database.
-   * "key->merge(existing, value)"
-   */
-  public void merge(byte[] key, byte[] value) {
-    merge(key, key.length, value, value.length);
-  }
-
-  /**
-   * If the database contains a mapping for "key", erase it.  Else do nothing.
-   */
-  public void remove(byte[] key) {
-    remove(key, key.length);
-  }
-
-  /**
-   * Append a blob of arbitrary size to the records in this batch. The blob will
-   * be stored in the transaction log but not in any other file. In particular,
-   * it will not be persisted to the SST files. When iterating over this
-   * WriteBatch, WriteBatch::Handler::LogData will be called with the contents
-   * of the blob as it is encountered. Blobs, puts, deletes, and merges will be
-   * encountered in the same order in thich they were inserted. The blob will
-   * NOT consume sequence number(s) and will NOT increase the count of the batch
-   *
-   * Example application: add timestamps to the transaction log for use in
-   * replication.
-   */
-  public void putLogData(byte[] blob) {
-    putLogData(blob, blob.length);
-  }
-
-  /**
-   * Clear all updates buffered in this batch
-   */
-  public native void clear();
-
-  /**
-   * Delete the c++ side pointer.
-   */
-  @Override public synchronized void dispose() {
-    if (isInitialized()) {
-      dispose0();
-    }
-  }
-
-  private native void newWriteBatch(int reserved_bytes);
-  private native void put(byte[] key, int keyLen,
-                          byte[] value, int valueLen);
-  private native void merge(byte[] key, int keyLen,
-                            byte[] value, int valueLen);
-  private native void remove(byte[] key, int keyLen);
-  private native void putLogData(byte[] blob, int blobLen);
-  private native void dispose0();
-}
-
-/**
- * Package-private class which provides java api to access
- * c++ WriteBatchInternal.
- */
-class WriteBatchInternal {
-  static native void setSequence(WriteBatch batch, long sn);
-  static native long sequence(WriteBatch batch);
-  static native void append(WriteBatch b1, WriteBatch b2);
-}
diff --git a/src/rocksdb/java/org/rocksdb/WriteBatchTest.java b/src/rocksdb/java/org/rocksdb/WriteBatchTest.java
deleted file mode 100644
index 03a8663..0000000
--- a/src/rocksdb/java/org/rocksdb/WriteBatchTest.java
+++ /dev/null
@@ -1,124 +0,0 @@
-//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-package org.rocksdb;
-
-import java.util.*;
-import java.io.UnsupportedEncodingException;
-
-/**
- * This class mimics the db/write_batch_test.cc in the c++ rocksdb library.
- */
-public class WriteBatchTest {
-  static {
-    RocksDB.loadLibrary();
-  }
-
-  public static void main(String args[]) {
-    System.out.println("Testing WriteBatchTest.Empty ===");
-    Empty();
-
-    System.out.println("Testing WriteBatchTest.Multiple ===");
-    Multiple();
-
-    System.out.println("Testing WriteBatchTest.Append ===");
-    Append();
-
-    System.out.println("Testing WriteBatchTest.Blob ===");
-    Blob();
-
-    // The following tests have not yet ported.
-    // Continue();
-    // PutGatherSlices();
-
-    System.out.println("Passed all WriteBatchTest!");
-  }
-
-  static void Empty() {
-    WriteBatch batch = new WriteBatch();
-    assert(batch.count() == 0);
-  }
-
-  static void Multiple() {
-    try {
-      WriteBatch batch =  new WriteBatch();
-      batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
-      batch.remove("box".getBytes("US-ASCII"));
-      batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII"));
-      WriteBatchInternal.setSequence(batch, 100);
-      assert(100 == WriteBatchInternal.sequence(batch));
-      assert(3 == batch.count());
-      assert(new String("Put(baz, boo)@102" +
-                        "Delete(box)@101" +
-                        "Put(foo, bar)@100")
-                .equals(new String(getContents(batch), "US-ASCII")));
-    } catch (UnsupportedEncodingException e) {
-      System.err.println(e);
-      assert(false);
-    }
-  }
-
-  static void Append() {
-    WriteBatch b1 = new WriteBatch();
-    WriteBatch b2 = new WriteBatch();
-    WriteBatchInternal.setSequence(b1, 200);
-    WriteBatchInternal.setSequence(b2, 300);
-    WriteBatchInternal.append(b1, b2);
-    assert(getContents(b1).length == 0);
-    assert(b1.count() == 0);
-    try {
-      b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII"));
-      WriteBatchInternal.append(b1, b2);
-      assert("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII")));
-      assert(1 == b1.count());
-      b2.clear();
-      b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII"));
-      WriteBatchInternal.append(b1, b2);
-      assert(new String("Put(a, va)@200" +
-                        "Put(b, vb)@201")
-                .equals(new String(getContents(b1), "US-ASCII")));
-      assert(2 == b1.count());
-      b2.remove("foo".getBytes("US-ASCII"));
-      WriteBatchInternal.append(b1, b2);
-      assert(new String("Put(a, va)@200" +
-                        "Put(b, vb)@202" +
-                        "Put(b, vb)@201" +
-                        "Delete(foo)@203")
-                 .equals(new String(getContents(b1), "US-ASCII")));
-      assert(4 == b1.count());
-    } catch (UnsupportedEncodingException e) {
-      System.err.println(e);
-      assert(false);
-    }
-  }
-
-  static void Blob() {
-    WriteBatch batch = new WriteBatch();
-    try {
-      batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII"));
-      batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII"));
-      batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII"));
-      batch.putLogData("blob1".getBytes("US-ASCII"));
-      batch.remove("k2".getBytes("US-ASCII"));
-      batch.putLogData("blob2".getBytes("US-ASCII"));
-      batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
-      assert(5 == batch.count());
-      assert(new String("Merge(foo, bar)@4" +
-                        "Put(k1, v1)@0" +
-                        "Delete(k2)@3" +
-                        "Put(k2, v2)@1" +
-                        "Put(k3, v3)@2")
-                .equals(new String(getContents(batch), "US-ASCII")));
-    } catch (UnsupportedEncodingException e) {
-      System.err.println(e);
-      assert(false);
-    }
-  }
-
-  static native byte[] getContents(WriteBatch batch);
-}
diff --git a/src/rocksdb/java/org/rocksdb/WriteOptions.java b/src/rocksdb/java/org/rocksdb/WriteOptions.java
deleted file mode 100644
index f4a1d6a..0000000
--- a/src/rocksdb/java/org/rocksdb/WriteOptions.java
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb;
-
-/**
- * Options that control write operations.
- *
- * Note that developers should call WriteOptions.dispose() to release the
- * c++ side memory before a WriteOptions instance runs out of scope.
- */
-public class WriteOptions extends RocksObject {
-  public WriteOptions() {
-    super();
-    newWriteOptions();
-  }
-
-  @Override public synchronized void dispose() {
-    if (isInitialized()) {
-      dispose0(nativeHandle_);
-    }
-  }
-
-  /**
-   * If true, the write will be flushed from the operating system
-   * buffer cache (by calling WritableFile::Sync()) before the write
-   * is considered complete.  If this flag is true, writes will be
-   * slower.
-   *
-   * If this flag is false, and the machine crashes, some recent
-   * writes may be lost.  Note that if it is just the process that
-   * crashes (i.e., the machine does not reboot), no writes will be
-   * lost even if sync==false.
-   *
-   * In other words, a DB write with sync==false has similar
-   * crash semantics as the "write()" system call.  A DB write
-   * with sync==true has similar crash semantics to a "write()"
-   * system call followed by "fdatasync()".
-   *
-   * Default: false
-   *
-   * @param flag a boolean flag to indicate whether a write
-   *     should be synchronized.
-   * @return the instance of the current WriteOptions.
-   */
-  public WriteOptions setSync(boolean flag) {
-    setSync(nativeHandle_, flag);
-    return this;
-  }
-
-  /**
-   * If true, the write will be flushed from the operating system
-   * buffer cache (by calling WritableFile::Sync()) before the write
-   * is considered complete.  If this flag is true, writes will be
-   * slower.
-   *
-   * If this flag is false, and the machine crashes, some recent
-   * writes may be lost.  Note that if it is just the process that
-   * crashes (i.e., the machine does not reboot), no writes will be
-   * lost even if sync==false.
-   *
-   * In other words, a DB write with sync==false has similar
-   * crash semantics as the "write()" system call.  A DB write
-   * with sync==true has similar crash semantics to a "write()"
-   * system call followed by "fdatasync()".
-   */
-  public boolean sync() {
-    return sync(nativeHandle_);
-  }
-
-  /**
-   * If true, writes will not first go to the write ahead log,
-   * and the write may got lost after a crash.
-   *
-   * @param flag a boolean flag to specify whether to disable
-   *     write-ahead-log on writes.
-   * @return the instance of the current WriteOptions.
-   */
-  public WriteOptions setDisableWAL(boolean flag) {
-    setDisableWAL(nativeHandle_, flag);
-    return this;
-  }
-
-  /**
-   * If true, writes will not first go to the write ahead log,
-   * and the write may got lost after a crash.
-   */
-  public boolean disableWAL() {
-    return disableWAL(nativeHandle_);
-  }
-
-  private native void newWriteOptions();
-  private native void setSync(long handle, boolean flag);
-  private native boolean sync(long handle);
-  private native void setDisableWAL(long handle, boolean flag);
-  private native boolean disableWAL(long handle);
-  private native void dispose0(long handle);
-}
diff --git a/src/rocksdb/java/org/rocksdb/benchmark/DbBenchmark.java b/src/rocksdb/java/org/rocksdb/benchmark/DbBenchmark.java
deleted file mode 100644
index 5404b72..0000000
--- a/src/rocksdb/java/org/rocksdb/benchmark/DbBenchmark.java
+++ /dev/null
@@ -1,1577 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-/**
- * Copyright (C) 2011 the original author or authors.
- * See the notice.md file distributed with this work for additional
- * information regarding copyright ownership.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.rocksdb.benchmark;
-
-import java.lang.Runnable;
-import java.io.File;
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Date;
-import java.util.EnumMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.concurrent.TimeUnit;
-import java.util.Arrays;
-import java.util.ArrayList;
-import java.util.concurrent.Callable;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-import org.rocksdb.*;
-import org.rocksdb.util.SizeUnit;
-
-class Stats {
-  int id_;
-  long start_;
-  long finish_;
-  double seconds_;
-  long done_;
-  long found_;
-  long lastOpTime_;
-  long nextReport_;
-  long bytes_;
-  StringBuilder message_;
-  boolean excludeFromMerge_;
-
-  // TODO(yhchiang): use the following arguments:
-  //   (Long)Flag.stats_interval
-  //   (Integer)Flag.stats_per_interval
-
-  Stats(int id) {
-    id_ = id;
-    nextReport_ = 100;
-    done_ = 0;
-    bytes_ = 0;
-    seconds_ = 0;
-    start_ = System.nanoTime();
-    lastOpTime_ = start_;
-    finish_ = start_;
-    found_ = 0;
-    message_ = new StringBuilder("");
-    excludeFromMerge_ = false;
-  }
-
-  void merge(final Stats other) {
-    if (other.excludeFromMerge_) {
-      return;
-    }
-
-    done_ += other.done_;
-    found_ += other.found_;
-    bytes_ += other.bytes_;
-    seconds_ += other.seconds_;
-    if (other.start_ < start_) start_ = other.start_;
-    if (other.finish_ > finish_) finish_ = other.finish_;
-
-    // Just keep the messages from one thread
-    if (message_.length() == 0) {
-      message_ = other.message_;
-    }
-  }
-
-  void stop() {
-    finish_ = System.nanoTime();
-    seconds_ = (double) (finish_ - start_) / 1000000;
-  }
-
-  void addMessage(String msg) {
-    if (message_.length() > 0) {
-      message_.append(" ");
-    }
-    message_.append(msg);
-  }
-
-  void setId(int id) { id_ = id; }
-  void setExcludeFromMerge() { excludeFromMerge_ = true; }
-
-  void finishedSingleOp(int bytes) {
-    done_++;
-    lastOpTime_ = System.nanoTime();
-    bytes_ += bytes;
-    if (done_ >= nextReport_) {
-      if (nextReport_ < 1000) {
-        nextReport_ += 100;
-      } else if (nextReport_ < 5000) {
-        nextReport_ += 500;
-      } else if (nextReport_ < 10000) {
-        nextReport_ += 1000;
-      } else if (nextReport_ < 50000) {
-        nextReport_ += 5000;
-      } else if (nextReport_ < 100000) {
-        nextReport_ += 10000;
-      } else if (nextReport_ < 500000) {
-        nextReport_ += 50000;
-      } else {
-        nextReport_ += 100000;
-      }
-      System.err.printf("... Task %s finished %d ops%30s\r", id_, done_, "");
-    }
-  }
-
-  void report(String name) {
-    // Pretend at least one op was done in case we are running a benchmark
-    // that does not call FinishedSingleOp().
-    if (done_ < 1) done_ = 1;
-
-    StringBuilder extra = new StringBuilder("");
-    if (bytes_ > 0) {
-      // Rate is computed on actual elapsed time, not the sum of per-thread
-      // elapsed times.
-      double elapsed = (finish_ - start_) * 1e-6;
-      extra.append(String.format("%6.1f MB/s", (bytes_ / 1048576.0) / elapsed));
-    }
-    extra.append(message_.toString());
-    double elapsed = (finish_ - start_) * 1e-6;
-    double throughput = (double) done_ / elapsed;
-
-    System.out.format("%-12s : %11.3f micros/op %d ops/sec;%s%s\n",
-            name, elapsed * 1e6 / done_,
-            (long) throughput, (extra.length() == 0 ? "" : " "), extra.toString());
-  }
-}
-
-public class DbBenchmark {
-  enum Order {
-    SEQUENTIAL,
-    RANDOM
-  }
-
-  enum DBState {
-    FRESH,
-    EXISTING
-  }
-
-  enum CompressionType {
-    NONE,
-    SNAPPY,
-    ZLIB,
-    BZIP2,
-    LZ4,
-    LZ4HC
-  }
-
-  static {
-    RocksDB.loadLibrary();
-  }
-
-  abstract class BenchmarkTask implements Callable<Stats> {
-    // TODO(yhchiang): use (Integer)Flag.perf_level.
-    public BenchmarkTask(
-        int tid, long randSeed, long numEntries, long keyRange) {
-      tid_ = tid;
-      rand_ = new Random(randSeed + tid * 1000);
-      numEntries_ = numEntries;
-      keyRange_ = keyRange;
-      stats_ = new Stats(tid);
-    }
-
-    @Override public Stats call() throws RocksDBException {
-      stats_.start_ = System.nanoTime();
-      runTask();
-      stats_.finish_ = System.nanoTime();
-      return stats_;
-    }
-
-    abstract protected void runTask() throws RocksDBException;
-
-    protected int tid_;
-    protected Random rand_;
-    protected long numEntries_;
-    protected long keyRange_;
-    protected Stats stats_;
-
-    protected void getFixedKey(byte[] key, long sn) {
-      generateKeyFromLong(key, sn);
-    }
-
-    protected void getRandomKey(byte[] key, long range) {
-      generateKeyFromLong(key, Math.abs(rand_.nextLong() % range));
-    }
-  }
-
-  abstract class WriteTask extends BenchmarkTask {
-    public WriteTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch) {
-      super(tid, randSeed, numEntries, keyRange);
-      writeOpt_ = writeOpt;
-      entriesPerBatch_ = entriesPerBatch;
-      maxWritesPerSecond_ = -1;
-    }
-
-    public WriteTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch, long maxWritesPerSecond) {
-      super(tid, randSeed, numEntries, keyRange);
-      writeOpt_ = writeOpt;
-      entriesPerBatch_ = entriesPerBatch;
-      maxWritesPerSecond_ = maxWritesPerSecond;
-    }
-
-    @Override public void runTask() throws RocksDBException {
-      if (numEntries_ != DbBenchmark.this.num_) {
-        stats_.message_.append(String.format(" (%d ops)", numEntries_));
-      }
-      byte[] key = new byte[keySize_];
-      byte[] value = new byte[valueSize_];
-
-      try {
-        if (entriesPerBatch_ == 1) {
-          for (long i = 0; i < numEntries_; ++i) {
-            getKey(key, i, keyRange_);
-            db_.put(writeOpt_, key, DbBenchmark.this.gen_.generate(valueSize_));
-            stats_.finishedSingleOp(keySize_ + valueSize_);
-            writeRateControl(i);
-            if (isFinished()) {
-              return;
-            }
-          }
-        } else {
-          for (long i = 0; i < numEntries_; i += entriesPerBatch_) {
-            WriteBatch batch = new WriteBatch();
-            for (long j = 0; j < entriesPerBatch_; j++) {
-              getKey(key, i + j, keyRange_);
-              batch.put(key, DbBenchmark.this.gen_.generate(valueSize_));
-              stats_.finishedSingleOp(keySize_ + valueSize_);
-            }
-            db_.write(writeOpt_, batch);
-            batch.dispose();
-            writeRateControl(i);
-            if (isFinished()) {
-              return;
-            }
-          }
-        }
-      } catch (InterruptedException e) {
-        // thread has been terminated.
-      }
-    }
-
-    protected void writeRateControl(long writeCount)
-        throws InterruptedException {
-      if (maxWritesPerSecond_ <= 0) return;
-      long minInterval =
-          writeCount * TimeUnit.SECONDS.toNanos(1) / maxWritesPerSecond_;
-      long interval = System.nanoTime() - stats_.start_;
-      if (minInterval - interval > TimeUnit.MILLISECONDS.toNanos(1)) {
-        TimeUnit.NANOSECONDS.sleep(minInterval - interval);
-      }
-    }
-
-    abstract protected void getKey(byte[] key, long id, long range);
-    protected WriteOptions writeOpt_;
-    protected long entriesPerBatch_;
-    protected long maxWritesPerSecond_;
-  }
-
-  class WriteSequentialTask extends WriteTask {
-    public WriteSequentialTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch) {
-      super(tid, randSeed, numEntries, keyRange,
-            writeOpt, entriesPerBatch);
-    }
-    public WriteSequentialTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch,
-        long maxWritesPerSecond) {
-      super(tid, randSeed, numEntries, keyRange,
-            writeOpt, entriesPerBatch,
-            maxWritesPerSecond);
-    }
-    @Override protected void getKey(byte[] key, long id, long range) {
-      getFixedKey(key, id);
-    }
-  }
-
-  class WriteRandomTask extends WriteTask {
-    public WriteRandomTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch) {
-      super(tid, randSeed, numEntries, keyRange,
-            writeOpt, entriesPerBatch);
-    }
-    public WriteRandomTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch,
-        long maxWritesPerSecond) {
-      super(tid, randSeed, numEntries, keyRange,
-            writeOpt, entriesPerBatch,
-            maxWritesPerSecond);
-    }
-    @Override protected void getKey(byte[] key, long id, long range) {
-      getRandomKey(key, range);
-    }
-  }
-
-  class WriteUniqueRandomTask extends WriteTask {
-    static final int MAX_BUFFER_SIZE = 10000000;
-    public WriteUniqueRandomTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch) {
-      super(tid, randSeed, numEntries, keyRange,
-            writeOpt, entriesPerBatch);
-      initRandomKeySequence();
-    }
-    public WriteUniqueRandomTask(
-        int tid, long randSeed, long numEntries, long keyRange,
-        WriteOptions writeOpt, long entriesPerBatch,
-        long maxWritesPerSecond) {
-      super(tid, randSeed, numEntries, keyRange,
-            writeOpt, entriesPerBatch,
-            maxWritesPerSecond);
-      initRandomKeySequence();
-    }
-    @Override protected void getKey(byte[] key, long id, long range) {
-      generateKeyFromLong(key, nextUniqueRandom());
-    }
-
-    protected void initRandomKeySequence() {
-      bufferSize_ = MAX_BUFFER_SIZE;
-      if (bufferSize_ > keyRange_) {
-        bufferSize_ = (int) keyRange_;
-      }
-      currentKeyCount_ = bufferSize_;
-      keyBuffer_ = new long[MAX_BUFFER_SIZE];
-      for (int k = 0; k < bufferSize_; ++k) {
-        keyBuffer_[k] = k;
-      }
-    }
-
-    /**
-     * Semi-randomly return the next unique key.  It is guaranteed to be
-     * fully random if keyRange_ <= MAX_BUFFER_SIZE.
-     */
-    long nextUniqueRandom() {
-      if (bufferSize_ == 0) {
-        System.err.println("bufferSize_ == 0.");
-        return 0;
-      }
-      int r = rand_.nextInt(bufferSize_);
-      // randomly pick one from the keyBuffer
-      long randKey = keyBuffer_[r];
-      if (currentKeyCount_ < keyRange_) {
-        // if we have not yet inserted all keys, insert next new key to [r].
-        keyBuffer_[r] = currentKeyCount_++;
-      } else {
-        // move the last element to [r] and decrease the size by 1.
-        keyBuffer_[r] = keyBuffer_[--bufferSize_];
-      }
-      return randKey;
-    }
-
-    int bufferSize_;
-    long currentKeyCount_;
-    long[] keyBuffer_;
-  }
-
-  class ReadRandomTask extends BenchmarkTask {
-    public ReadRandomTask(
-        int tid, long randSeed, long numEntries, long keyRange) {
-      super(tid, randSeed, numEntries, keyRange);
-    }
-    @Override public void runTask() throws RocksDBException {
-      byte[] key = new byte[keySize_];
-      byte[] value = new byte[valueSize_];
-      for (long i = 0; i < numEntries_; i++) {
-        getRandomKey(key, numEntries_);
-        int len = db_.get(key, value);
-        if (len != RocksDB.NOT_FOUND) {
-          stats_.found_++;
-          stats_.finishedSingleOp(keySize_ + valueSize_);
-        } else {
-          stats_.finishedSingleOp(keySize_);
-        }
-        if (isFinished()) {
-          return;
-        }
-      }
-    }
-  }
-
-  class ReadSequentialTask extends BenchmarkTask {
-    public ReadSequentialTask(
-        int tid, long randSeed, long numEntries, long keyRange) {
-      super(tid, randSeed, numEntries, keyRange);
-    }
-    @Override public void runTask() throws RocksDBException {
-      org.rocksdb.Iterator iter = db_.newIterator();
-      long i;
-      for (iter.seekToFirst(), i = 0;
-           iter.isValid() && i < numEntries_;
-           iter.next(), ++i) {
-        stats_.found_++;
-        stats_.finishedSingleOp(iter.key().length + iter.value().length);
-        if (isFinished()) {
-          return;
-        }
-      }
-    }
-  }
-
-  public DbBenchmark(Map<Flag, Object> flags) throws Exception {
-    benchmarks_ = (List<String>) flags.get(Flag.benchmarks);
-    num_ = (Integer) flags.get(Flag.num);
-    threadNum_ = (Integer) flags.get(Flag.threads);
-    reads_ = (Integer) (flags.get(Flag.reads) == null ?
-        flags.get(Flag.num) : flags.get(Flag.reads));
-    keySize_ = (Integer) flags.get(Flag.key_size);
-    valueSize_ = (Integer) flags.get(Flag.value_size);
-    compressionRatio_ = (Double) flags.get(Flag.compression_ratio);
-    useExisting_ = (Boolean) flags.get(Flag.use_existing_db);
-    randSeed_ = (Long) flags.get(Flag.seed);
-    databaseDir_ = (String) flags.get(Flag.db);
-    writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second);
-    cacheSize_ = (Long) flags.get(Flag.cache_size);
-    memtable_ = (String) flags.get(Flag.memtablerep);
-    maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number);
-    prefixSize_ = (Integer) flags.get(Flag.prefix_size);
-    keysPerPrefix_ = (Integer) flags.get(Flag.keys_per_prefix);
-    hashBucketCount_ = (Long) flags.get(Flag.hash_bucket_count);
-    usePlainTable_ = (Boolean) flags.get(Flag.use_plain_table);
-    flags_ = flags;
-    finishLock_ = new Object();
-    // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size));
-    // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix));
-    compressionType_ = (String) flags.get(Flag.compression_type);
-    compression_ = CompressionType.NONE;
-    try {
-      if (compressionType_.equals("snappy")) {
-        System.loadLibrary("snappy");
-      } else if (compressionType_.equals("zlib")) {
-        System.loadLibrary("zlib");
-      } else if (compressionType_.equals("bzip2")) {
-        System.loadLibrary("bzip2");
-      } else if (compressionType_.equals("lz4")) {
-        System.loadLibrary("lz4");
-      } else if (compressionType_.equals("lz4hc")) {
-        System.loadLibrary("lz4hc");
-      }
-    } catch (UnsatisfiedLinkError e) {
-      System.err.format("Unable to load %s library:%s%n" +
-                        "No compression is used.%n",
-          compressionType_, e.toString());
-      compressionType_ = "none";
-      compressionRatio_ = 1.0;
-    }
-    gen_ = new RandomGenerator(randSeed_, compressionRatio_);
-  }
-
-  private void prepareReadOptions(ReadOptions options) {
-    options.setVerifyChecksums((Boolean)flags_.get(Flag.verify_checksum));
-    options.setTailing((Boolean)flags_.get(Flag.use_tailing_iterator));
-  }
-
-  private void prepareWriteOptions(WriteOptions options) {
-    options.setSync((Boolean)flags_.get(Flag.sync));
-    options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal));
-  }
-
-  private void prepareOptions(Options options) {
-    options.setCacheSize(cacheSize_);
-    if (!useExisting_) {
-      options.setCreateIfMissing(true);
-    } else {
-      options.setCreateIfMissing(false);
-    }
-    if (memtable_.equals("skip_list")) {
-      options.setMemTableConfig(new SkipListMemTableConfig());
-    } else if (memtable_.equals("vector")) {
-      options.setMemTableConfig(new VectorMemTableConfig());
-    } else if (memtable_.equals("hash_linkedlist")) {
-      options.setMemTableConfig(
-          new HashLinkedListMemTableConfig()
-              .setBucketCount(hashBucketCount_));
-      options.useFixedLengthPrefixExtractor(prefixSize_);
-    } else if (memtable_.equals("hash_skiplist") ||
-               memtable_.equals("prefix_hash")) {
-      options.setMemTableConfig(
-          new HashSkipListMemTableConfig()
-              .setBucketCount(hashBucketCount_));
-      options.useFixedLengthPrefixExtractor(prefixSize_);
-    } else {
-      System.err.format(
-          "unable to detect the specified memtable, " +
-          "use the default memtable factory %s%n",
-          options.memTableFactoryName());
-    }
-    if (usePlainTable_) {
-      options.setTableFormatConfig(
-          new PlainTableConfig().setKeySize(keySize_));
-    }
-    options.setWriteBufferSize(
-        (Long)flags_.get(Flag.write_buffer_size));
-    options.setMaxWriteBufferNumber(
-        (Integer)flags_.get(Flag.max_write_buffer_number));
-    options.setMaxBackgroundCompactions(
-        (Integer)flags_.get(Flag.max_background_compactions));
-    options.setMaxBackgroundFlushes(
-        (Integer)flags_.get(Flag.max_background_flushes));
-    options.setCacheSize(
-        (Long)flags_.get(Flag.cache_size));
-    options.setBlockSize(
-        (Long)flags_.get(Flag.block_size));
-    options.setMaxOpenFiles(
-        (Integer)flags_.get(Flag.open_files));
-    options.setCreateIfMissing(
-        !(Boolean)flags_.get(Flag.use_existing_db));
-    options.setTableCacheRemoveScanCountLimit(
-        (Integer)flags_.get(Flag.cache_remove_scan_count_limit));
-    options.setDisableDataSync(
-        (Boolean)flags_.get(Flag.disable_data_sync));
-    options.setUseFsync(
-        (Boolean)flags_.get(Flag.use_fsync));
-    options.setWalDir(
-        (String)flags_.get(Flag.wal_dir));
-    options.setDisableSeekCompaction(
-        (Boolean)flags_.get(Flag.disable_seek_compaction));
-    options.setDeleteObsoleteFilesPeriodMicros(
-        (Integer)flags_.get(Flag.delete_obsolete_files_period_micros));
-    options.setTableCacheNumshardbits(
-        (Integer)flags_.get(Flag.table_cache_numshardbits));
-    options.setAllowMmapReads(
-        (Boolean)flags_.get(Flag.mmap_read));
-    options.setAllowMmapWrites(
-        (Boolean)flags_.get(Flag.mmap_write));
-    options.setAdviseRandomOnOpen(
-        (Boolean)flags_.get(Flag.advise_random_on_open));
-    options.setUseAdaptiveMutex(
-        (Boolean)flags_.get(Flag.use_adaptive_mutex));
-    options.setBytesPerSync(
-        (Long)flags_.get(Flag.bytes_per_sync));
-    options.setBloomLocality(
-        (Integer)flags_.get(Flag.bloom_locality));
-    options.setMinWriteBufferNumberToMerge(
-        (Integer)flags_.get(Flag.min_write_buffer_number_to_merge));
-    options.setMemtablePrefixBloomBits(
-        (Integer)flags_.get(Flag.memtable_bloom_bits));
-    options.setNumLevels(
-        (Integer)flags_.get(Flag.num_levels));
-    options.setTargetFileSizeBase(
-        (Integer)flags_.get(Flag.target_file_size_base));
-    options.setTargetFileSizeMultiplier(
-        (Integer)flags_.get(Flag.target_file_size_multiplier));
-    options.setMaxBytesForLevelBase(
-        (Integer)flags_.get(Flag.max_bytes_for_level_base));
-    options.setMaxBytesForLevelMultiplier(
-        (Integer)flags_.get(Flag.max_bytes_for_level_multiplier));
-    options.setLevelZeroStopWritesTrigger(
-        (Integer)flags_.get(Flag.level0_stop_writes_trigger));
-    options.setLevelZeroSlowdownWritesTrigger(
-        (Integer)flags_.get(Flag.level0_slowdown_writes_trigger));
-    options.setLevelZeroFileNumCompactionTrigger(
-        (Integer)flags_.get(Flag.level0_file_num_compaction_trigger));
-    options.setSoftRateLimit(
-        (Double)flags_.get(Flag.soft_rate_limit));
-    options.setHardRateLimit(
-        (Double)flags_.get(Flag.hard_rate_limit));
-    options.setRateLimitDelayMaxMilliseconds(
-        (Integer)flags_.get(Flag.rate_limit_delay_max_milliseconds));
-    options.setMaxGrandparentOverlapFactor(
-        (Integer)flags_.get(Flag.max_grandparent_overlap_factor));
-    options.setDisableAutoCompactions(
-        (Boolean)flags_.get(Flag.disable_auto_compactions));
-    options.setSourceCompactionFactor(
-        (Integer)flags_.get(Flag.source_compaction_factor));
-    options.setFilterDeletes(
-        (Boolean)flags_.get(Flag.filter_deletes));
-    options.setMaxSuccessiveMerges(
-        (Integer)flags_.get(Flag.max_successive_merges));
-    options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds));
-    options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB));
-    int bloomBits = (Integer)flags_.get(Flag.bloom_bits);
-    if (bloomBits > 0) {
-      // Internally, options will keep a reference to this BloomFilter.
-      // This will disallow Java to GC this BloomFilter.  In addition,
-      // options.dispose() will release the c++ object of this BloomFilter.
-      // As a result, the caller should not directly call
-      // BloomFilter.dispose().
-      options.setFilter(new BloomFilter(bloomBits));
-    }
-    /* TODO(yhchiang): enable the following parameters
-    options.setCompressionType((String)flags_.get(Flag.compression_type));
-    options.setCompressionLevel((Integer)flags_.get(Flag.compression_level));
-    options.setMinLevelToCompress((Integer)flags_.get(Flag.min_level_to_compress));
-    options.setHdfs((String)flags_.get(Flag.hdfs)); // env
-    options.setCacheNumshardbits((Integer)flags_.get(Flag.cache_numshardbits));
-    options.setStatistics((Boolean)flags_.get(Flag.statistics));
-    options.setUniversalSizeRatio(
-        (Integer)flags_.get(Flag.universal_size_ratio));
-    options.setUniversalMinMergeWidth(
-        (Integer)flags_.get(Flag.universal_min_merge_width));
-    options.setUniversalMaxMergeWidth(
-        (Integer)flags_.get(Flag.universal_max_merge_width));
-    options.setUniversalMaxSizeAmplificationPercent(
-        (Integer)flags_.get(Flag.universal_max_size_amplification_percent));
-    options.setUniversalCompressionSizePercent(
-        (Integer)flags_.get(Flag.universal_compression_size_percent));
-    // TODO(yhchiang): add RocksDB.openForReadOnly() to enable Flag.readonly
-    // TODO(yhchiang): enable Flag.merge_operator by switch
-    options.setAccessHintOnCompactionStart(
-        (String)flags_.get(Flag.compaction_fadvice));
-    // available values of fadvice are "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" for fadvice
-    */
-  }
-
-  private void run() throws RocksDBException {
-    if (!useExisting_) {
-      destroyDb();
-    }
-    Options options = new Options();
-    prepareOptions(options);
-    open(options);
-
-    printHeader(options);
-
-    for (String benchmark : benchmarks_) {
-      List<Callable<Stats>> tasks = new ArrayList<Callable<Stats>>();
-      List<Callable<Stats>> bgTasks = new ArrayList<Callable<Stats>>();
-      WriteOptions writeOpt = new WriteOptions();
-      prepareWriteOptions(writeOpt);
-      ReadOptions readOpt = new ReadOptions();
-      prepareReadOptions(readOpt);
-      int currentTaskId = 0;
-      boolean known = true;
-
-      if (benchmark.equals("fillseq")) {
-        tasks.add(new WriteSequentialTask(
-            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
-      } else if (benchmark.equals("fillbatch")) {
-        tasks.add(new WriteRandomTask(
-            currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000));
-      } else if (benchmark.equals("fillrandom")) {
-        tasks.add(new WriteRandomTask(
-            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
-      } else if (benchmark.equals("filluniquerandom")) {
-        tasks.add(new WriteUniqueRandomTask(
-            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
-      } else if (benchmark.equals("fillsync")) {
-        writeOpt.setSync(true);
-        tasks.add(new WriteRandomTask(
-            currentTaskId++, randSeed_, num_ / 1000, num_ / 1000,
-            writeOpt, 1));
-      } else if (benchmark.equals("readseq")) {
-        for (int t = 0; t < threadNum_; ++t) {
-          tasks.add(new ReadSequentialTask(
-              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
-        }
-      } else if (benchmark.equals("readrandom")) {
-        for (int t = 0; t < threadNum_; ++t) {
-          tasks.add(new ReadRandomTask(
-              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
-        }
-      } else if (benchmark.equals("readwhilewriting")) {
-        WriteTask writeTask = new WriteRandomTask(
-            -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_);
-        writeTask.stats_.setExcludeFromMerge();
-        bgTasks.add(writeTask);
-        for (int t = 0; t < threadNum_; ++t) {
-          tasks.add(new ReadRandomTask(
-              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
-        }
-      } else if (benchmark.equals("readhot")) {
-        for (int t = 0; t < threadNum_; ++t) {
-          tasks.add(new ReadRandomTask(
-              currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100));
-        }
-      } else if (benchmark.equals("delete")) {
-        destroyDb();
-        open(options);
-      } else {
-        known = false;
-        System.err.println("Unknown benchmark: " + benchmark);
-      }
-      if (known) {
-        ExecutorService executor = Executors.newCachedThreadPool();
-        ExecutorService bgExecutor = Executors.newCachedThreadPool();
-        try {
-          // measure only the main executor time
-          List<Future<Stats>> bgResults = new ArrayList<Future<Stats>>();
-          for (Callable bgTask : bgTasks) {
-            bgResults.add(bgExecutor.submit(bgTask));
-          }
-          start();
-          List<Future<Stats>> results = executor.invokeAll(tasks);
-          executor.shutdown();
-          boolean finished = executor.awaitTermination(10, TimeUnit.SECONDS);
-          if (!finished) {
-            System.out.format(
-                "Benchmark %s was not finished before timeout.",
-                benchmark);
-            executor.shutdownNow();
-          }
-          setFinished(true);
-          bgExecutor.shutdown();
-          finished = bgExecutor.awaitTermination(10, TimeUnit.SECONDS);
-          if (!finished) {
-            System.out.format(
-                "Benchmark %s was not finished before timeout.",
-                benchmark);
-            bgExecutor.shutdownNow();
-          }
-
-          stop(benchmark, results, currentTaskId);
-        } catch (InterruptedException e) {
-          System.err.println(e);
-        }
-      }
-      writeOpt.dispose();
-      readOpt.dispose();
-    }
-    options.dispose();
-    db_.close();
-  }
-
-  private void printHeader(Options options) {
-    int kKeySize = 16;
-    System.out.printf("Keys:     %d bytes each\n", kKeySize);
-    System.out.printf("Values:   %d bytes each (%d bytes after compression)\n",
-        valueSize_,
-        (int) (valueSize_ * compressionRatio_ + 0.5));
-    System.out.printf("Entries:  %d\n", num_);
-    System.out.printf("RawSize:  %.1f MB (estimated)\n",
-        ((double)(kKeySize + valueSize_) * num_) / SizeUnit.MB);
-    System.out.printf("FileSize:   %.1f MB (estimated)\n",
-        (((kKeySize + valueSize_ * compressionRatio_) * num_) / SizeUnit.MB));
-    System.out.format("Memtable Factory: %s%n", options.memTableFactoryName());
-    System.out.format("Prefix:   %d bytes%n", prefixSize_);
-    System.out.format("Compression: %s%n", compressionType_);
-    printWarnings();
-    System.out.printf("------------------------------------------------\n");
-  }
-
-  void printWarnings() {
-    boolean assertsEnabled = false;
-    assert assertsEnabled = true; // Intentional side effect!!!
-    if (assertsEnabled) {
-      System.out.printf(
-          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
-    }
-  }
-
-  private void open(Options options) throws RocksDBException {
-    db_ = RocksDB.open(options, databaseDir_);
-  }
-
-  private void start() {
-    setFinished(false);
-    startTime_ = System.nanoTime();
-  }
-
-  private void stop(
-      String benchmark, List<Future<Stats>> results, int concurrentThreads) {
-    long endTime = System.nanoTime();
-    double elapsedSeconds =
-        1.0d * (endTime - startTime_) / TimeUnit.SECONDS.toNanos(1);
-
-    Stats stats = new Stats(-1);
-    int taskFinishedCount = 0;
-    for (Future<Stats> result : results) {
-      if (result.isDone()) {
-        try {
-          Stats taskStats = result.get(3, TimeUnit.SECONDS);
-          if (!result.isCancelled()) {
-            taskFinishedCount++;
-          }
-          stats.merge(taskStats);
-        } catch (Exception e) {
-          // then it's not successful, the output will indicate this
-        }
-      }
-    }
-
-    System.out.printf(
-        "%-16s : %11.5f micros/op; %6.1f MB/s; %d / %d task(s) finished.\n",
-        benchmark, (double) elapsedSeconds / stats.done_ * 1e6,
-        (stats.bytes_ / 1048576.0) / elapsedSeconds,
-        taskFinishedCount, concurrentThreads);
-  }
-
-  public void generateKeyFromLong(byte[] slice, long n) {
-    assert(n >= 0);
-    int startPos = 0;
-
-    if (keysPerPrefix_ > 0) {
-      long numPrefix = (num_ + keysPerPrefix_ - 1) / keysPerPrefix_;
-      long prefix = n % numPrefix;
-      int bytesToFill = Math.min(prefixSize_, 8);
-      for (int i = 0; i < bytesToFill; ++i) {
-        slice[i] = (byte) (prefix % 256);
-        prefix /= 256;
-      }
-      for (int i = 8; i < bytesToFill; ++i) {
-        slice[i] = '0';
-      }
-      startPos = bytesToFill;
-    }
-
-    for (int i = slice.length - 1; i >= startPos; --i) {
-      slice[i] = (byte) ('0' + (n % 10));
-      n /= 10;
-    }
-  }
-
-  private void destroyDb() {
-    if (db_ != null) {
-      db_.close();
-    }
-    // TODO(yhchiang): develop our own FileUtil
-    // FileUtil.deleteDir(databaseDir_);
-  }
-
-  private void printStats() {
-  }
-
-  static void printHelp() {
-    System.out.println("usage:");
-    for (Flag flag : Flag.values()) {
-      System.out.format("  --%s%n\t%s%n",
-          flag.name(),
-          flag.desc());
-      if (flag.getDefaultValue() != null) {
-        System.out.format("\tDEFAULT: %s%n",
-            flag.getDefaultValue().toString());
-      }
-    }
-  }
-
-  public static void main(String[] args) throws Exception {
-    Map<Flag, Object> flags = new EnumMap<Flag, Object>(Flag.class);
-    for (Flag flag : Flag.values()) {
-      if (flag.getDefaultValue() != null) {
-        flags.put(flag, flag.getDefaultValue());
-      }
-    }
-    for (String arg : args) {
-      boolean valid = false;
-      if (arg.equals("--help") || arg.equals("-h")) {
-        printHelp();
-        System.exit(0);
-      }
-      if (arg.startsWith("--")) {
-        try {
-          String[] parts = arg.substring(2).split("=");
-          if (parts.length >= 1) {
-            Flag key = Flag.valueOf(parts[0]);
-            if (key != null) {
-              Object value = null;
-              if (parts.length >= 2) {
-                value = key.parseValue(parts[1]);
-              }
-              flags.put(key, value);
-              valid = true;
-            }
-          }
-        }
-        catch (Exception e) {
-        }
-      }
-      if (!valid) {
-        System.err.println("Invalid argument " + arg);
-        System.exit(1);
-      }
-    }
-    new DbBenchmark(flags).run();
-  }
-
-  private enum Flag {
-    benchmarks(
-        Arrays.asList(
-            "fillseq",
-            "readrandom",
-            "fillrandom"),
-        "Comma-separated list of operations to run in the specified order\n" +
-        "\tActual benchmarks:\n" +
-        "\t\tfillseq          -- write N values in sequential key order in async mode.\n" +
-        "\t\tfillrandom       -- write N values in random key order in async mode.\n" +
-        "\t\tfillbatch        -- write N/1000 batch where each batch has 1000 values\n" +
-        "\t\t                   in random key order in sync mode.\n" +
-        "\t\tfillsync         -- write N/100 values in random key order in sync mode.\n" +
-        "\t\tfill100K         -- write N/1000 100K values in random order in async mode.\n" +
-        "\t\treadseq          -- read N times sequentially.\n" +
-        "\t\treadrandom       -- read N times in random order.\n" +
-        "\t\treadhot          -- read N times in random order from 1% section of DB.\n" +
-        "\t\treadwhilewriting -- measure the read performance of multiple readers\n" +
-        "\t\t                   with a bg single writer.  The write rate of the bg\n" +
-        "\t\t                   is capped by --writes_per_second.\n" +
-        "\tMeta Operations:\n" +
-        "\t\tdelete            -- delete DB") {
-      @Override public Object parseValue(String value) {
-        return new ArrayList<String>(Arrays.asList(value.split(",")));
-      }
-    },
-    compression_ratio(0.5d,
-        "Arrange to generate values that shrink to this fraction of\n" +
-        "\ttheir original size after compression.") {
-      @Override public Object parseValue(String value) {
-        return Double.parseDouble(value);
-      }
-    },
-    use_existing_db(false,
-        "If true, do not destroy the existing database.  If you set this\n" +
-        "\tflag and also specify a benchmark that wants a fresh database,\n" +
-        "\tthat benchmark will fail.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    num(1000000,
-        "Number of key/values to place in database.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    threads(1,
-        "Number of concurrent threads to run.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    reads(null,
-        "Number of read operations to do.  If negative, do --nums reads.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    key_size(16,
-        "The size of each key in bytes.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    value_size(100,
-        "The size of each value in bytes.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    write_buffer_size(4 * SizeUnit.MB,
-        "Number of bytes to buffer in memtable before compacting\n" +
-        "\t(initialized to default value by 'main'.)") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    max_write_buffer_number(2,
-             "The number of in-memory memtables. Each memtable is of size\n" +
-             "\twrite_buffer_size.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    prefix_size(0, "Controls the prefix size for HashSkipList, HashLinkedList,\n" +
-                   "\tand plain table.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    keys_per_prefix(0, "Controls the average number of keys generated\n" +
-             "\tper prefix, 0 means no special handling of the prefix,\n" +
-             "\ti.e. use the prefix comes with the generated random number.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    memtablerep("skip_list",
-        "The memtable format.  Available options are\n" +
-        "\tskip_list,\n" +
-        "\tvector,\n" +
-        "\thash_linkedlist,\n" +
-        "\thash_skiplist (prefix_hash.)") {
-      @Override public Object parseValue(String value) {
-        return value;
-      }
-    },
-    hash_bucket_count(SizeUnit.MB,
-        "The number of hash buckets used in the hash-bucket-based\n" +
-        "\tmemtables.  Memtables that currently support this argument are\n" +
-        "\thash_linkedlist and hash_skiplist.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    writes_per_second(10000,
-        "The write-rate of the background writer used in the\n" +
-        "\t`readwhilewriting` benchmark.  Non-positive number indicates\n" +
-        "\tusing an unbounded write-rate in `readwhilewriting` benchmark.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    use_plain_table(false,
-        "Use plain-table sst format.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    cache_size(-1L,
-        "Number of bytes to use as a cache of uncompressed data.\n" +
-        "\tNegative means use default settings.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    seed(0L,
-        "Seed base for random number generators.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    num_levels(7,
-        "The total number of levels.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    numdistinct(1000,
-        "Number of distinct keys to use. Used in RandomWithVerify to\n" +
-        "\tread/write on fewer keys so that gets are more likely to find the\n" +
-        "\tkey and puts are more likely to update the same key.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    merge_keys(-1,
-        "Number of distinct keys to use for MergeRandom and\n" +
-        "\tReadRandomMergeRandom.\n" +
-        "\tIf negative, there will be FLAGS_num keys.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    bloom_locality(0,"Control bloom filter probes locality.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    duration(0,"Time in seconds for the random-ops tests to run.\n" +
-        "\tWhen 0 then num & reads determine the test duration.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    num_multi_db(0,
-        "Number of DBs used in the benchmark. 0 means single DB.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    histogram(false,"Print histogram of operation timings.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    min_write_buffer_number_to_merge(
-        defaultOptions_.minWriteBufferNumberToMerge(),
-        "The minimum number of write buffers that will be merged together\n" +
-        "\tbefore writing to storage. This is cheap because it is an\n" +
-        "\tin-memory merge. If this feature is not enabled, then all these\n" +
-        "\twrite buffers are flushed to L0 as separate files and this\n" +
-        "\tincreases read amplification because a get request has to check\n" +
-        "\tin all of these files. Also, an in-memory merge may result in\n" +
-        "\twriting less data to storage if there are duplicate records\n" +
-        "\tin each of these individual write buffers.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    max_background_compactions(
-        defaultOptions_.maxBackgroundCompactions(),
-        "The maximum number of concurrent background compactions\n" +
-        "\tthat can occur in parallel.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    max_background_flushes(
-        defaultOptions_.maxBackgroundFlushes(),
-        "The maximum number of concurrent background flushes\n" +
-        "\tthat can occur in parallel.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    /* TODO(yhchiang): enable the following
-    compaction_style((int32_t) defaultOptions_.compactionStyle(),
-        "style of compaction: level-based vs universal.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },*/
-    universal_size_ratio(0,
-        "Percentage flexibility while comparing file size\n" +
-        "\t(for universal compaction only).") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    universal_min_merge_width(0,"The minimum number of files in a\n" +
-        "\tsingle compaction run (for universal compaction only).") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    universal_max_merge_width(0,"The max number of files to compact\n" +
-        "\tin universal style compaction.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    universal_max_size_amplification_percent(0,
-        "The max size amplification for universal style compaction.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    universal_compression_size_percent(-1,
-        "The percentage of the database to compress for universal\n" +
-        "\tcompaction. -1 means compress everything.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    block_size(defaultOptions_.blockSize(),
-        "Number of bytes in a block.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    compressed_cache_size(-1,
-        "Number of bytes to use as a cache of compressed data.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    open_files(defaultOptions_.maxOpenFiles(),
-        "Maximum number of files to keep open at the same time\n" +
-        "\t(use default if == 0)") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    bloom_bits(-1,"Bloom filter bits per key. Negative means\n" +
-        "\tuse default settings.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    memtable_bloom_bits(0,"Bloom filter bits per key for memtable.\n" +
-        "\tNegative means no bloom filter.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    cache_numshardbits(-1,"Number of shards for the block cache\n" +
-        "\tis 2 ** cache_numshardbits. Negative means use default settings.\n" +
-        "\tThis is applied only if FLAGS_cache_size is non-negative.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    cache_remove_scan_count_limit(32,"") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    verify_checksum(false,"Verify checksum for every block read\n" +
-        "\tfrom storage.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    statistics(false,"Database statistics.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    writes(-1,"Number of write operations to do. If negative, do\n" +
-        "\t--num reads.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    sync(false,"Sync all writes to disk.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    disable_data_sync(false,"If true, do not wait until data is\n" +
-        "\tsynced to disk.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    use_fsync(false,"If true, issue fsync instead of fdatasync.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    disable_wal(false,"If true, do not write WAL for write.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    wal_dir("", "If not empty, use the given dir for WAL.") {
-      @Override public Object parseValue(String value) {
-        return value;
-      }
-    },
-    target_file_size_base(2 * 1048576,"Target file size at level-1") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    target_file_size_multiplier(1,
-        "A multiplier to compute target level-N file size (N >= 2)") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    max_bytes_for_level_base(10 * 1048576,
-      "Max bytes for level-1") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    max_bytes_for_level_multiplier(10,
-        "A multiplier to compute max bytes for level-N (N >= 2)") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    level0_stop_writes_trigger(12,"Number of files in level-0\n" +
-        "\tthat will trigger put stop.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    level0_slowdown_writes_trigger(8,"Number of files in level-0\n" +
-        "\tthat will slow down writes.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    level0_file_num_compaction_trigger(4,"Number of files in level-0\n" +
-        "\twhen compactions start.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    readwritepercent(90,"Ratio of reads to reads/writes (expressed\n" +
-        "\tas percentage) for the ReadRandomWriteRandom workload. The\n" +
-        "\tdefault value 90 means 90% operations out of all reads and writes\n" +
-        "\toperations are reads. In other words, 9 gets for every 1 put.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    mergereadpercent(70,"Ratio of merges to merges&reads (expressed\n" +
-        "\tas percentage) for the ReadRandomMergeRandom workload. The\n" +
-        "\tdefault value 70 means 70% out of all read and merge operations\n" +
-        "\tare merges. In other words, 7 merges for every 3 gets.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    deletepercent(2,"Percentage of deletes out of reads/writes/\n" +
-        "\tdeletes (used in RandomWithVerify only). RandomWithVerify\n" +
-        "\tcalculates writepercent as (100 - FLAGS_readwritepercent -\n" +
-        "\tdeletepercent), so deletepercent must be smaller than (100 -\n" +
-        "\tFLAGS_readwritepercent)") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    disable_seek_compaction(false,"Option to disable compaction\n" +
-        "\ttriggered by read.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    delete_obsolete_files_period_micros(0,"Option to delete\n" +
-        "\tobsolete files periodically. 0 means that obsolete files are\n" +
-        "\tdeleted after every compaction run.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    compression_type("snappy",
-        "Algorithm used to compress the database.") {
-      @Override public Object parseValue(String value) {
-        return value;
-      }
-    },
-    compression_level(-1,
-        "Compression level. For zlib this should be -1 for the\n" +
-        "\tdefault level, or between 0 and 9.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    min_level_to_compress(-1,"If non-negative, compression starts\n" +
-        "\tfrom this level. Levels with number < min_level_to_compress are\n" +
-        "\tnot compressed. Otherwise, apply compression_type to\n" +
-        "\tall levels.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    table_cache_numshardbits(4,"") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    stats_interval(0,"Stats are reported every N operations when\n" +
-        "\tthis is greater than zero. When 0 the interval grows over time.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    stats_per_interval(0,"Reports additional stats per interval when\n" +
-        "\tthis is greater than 0.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    perf_level(0,"Level of perf collection.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    soft_rate_limit(0.0,"") {
-      @Override public Object parseValue(String value) {
-        return Double.parseDouble(value);
-      }
-    },
-    hard_rate_limit(0.0,"When not equal to 0 this make threads\n" +
-        "\tsleep at each stats reporting interval until the compaction\n" +
-        "\tscore for all levels is less than or equal to this value.") {
-      @Override public Object parseValue(String value) {
-        return Double.parseDouble(value);
-      }
-    },
-    rate_limit_delay_max_milliseconds(1000,
-        "When hard_rate_limit is set then this is the max time a put will\n" +
-        "\tbe stalled.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    max_grandparent_overlap_factor(10,"Control maximum bytes of\n" +
-        "\toverlaps in grandparent (i.e., level+2) before we stop building a\n" +
-        "\tsingle file in a level->level+1 compaction.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    readonly(false,"Run read only benchmarks.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    disable_auto_compactions(false,"Do not auto trigger compactions.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    source_compaction_factor(1,"Cap the size of data in level-K for\n" +
-        "\ta compaction run that compacts Level-K with Level-(K+1) (for\n" +
-        "\tK >= 1)") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    wal_ttl_seconds(0L,"Set the TTL for the WAL Files in seconds.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    wal_size_limit_MB(0L,"Set the size limit for the WAL Files\n" +
-        "\tin MB.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    /* TODO(yhchiang): enable the following
-    bufferedio(rocksdb::EnvOptions().use_os_buffer,
-        "Allow buffered io using OS buffers.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    */
-    mmap_read(false,
-        "Allow reads to occur via mmap-ing files.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    mmap_write(false,
-        "Allow writes to occur via mmap-ing files.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    advise_random_on_open(defaultOptions_.adviseRandomOnOpen(),
-        "Advise random access on table file open.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    compaction_fadvice("NORMAL",
-      "Access pattern advice when a file is compacted.") {
-      @Override public Object parseValue(String value) {
-        return value;
-      }
-    },
-    use_tailing_iterator(false,
-        "Use tailing iterator to access a series of keys instead of get.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    use_adaptive_mutex(defaultOptions_.useAdaptiveMutex(),
-        "Use adaptive mutex.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    bytes_per_sync(defaultOptions_.bytesPerSync(),
-        "Allows OS to incrementally sync files to disk while they are\n" +
-        "\tbeing written, in the background. Issue one request for every\n" +
-        "\tbytes_per_sync written. 0 turns it off.") {
-      @Override public Object parseValue(String value) {
-        return Long.parseLong(value);
-      }
-    },
-    filter_deletes(false," On true, deletes use bloom-filter and drop\n" +
-        "\tthe delete if key not present.") {
-      @Override public Object parseValue(String value) {
-        return Boolean.parseBoolean(value);
-      }
-    },
-    max_successive_merges(0,"Maximum number of successive merge\n" +
-        "\toperations on a key in the memtable.") {
-      @Override public Object parseValue(String value) {
-        return Integer.parseInt(value);
-      }
-    },
-    db("/tmp/rocksdbjni-bench",
-       "Use the db with the following name.") {
-      @Override public Object parseValue(String value) {
-        return value;
-      }
-    };
-
-    private Flag(Object defaultValue, String desc) {
-      defaultValue_ = defaultValue;
-      desc_ = desc;
-    }
-
-    protected abstract Object parseValue(String value);
-
-    public Object getDefaultValue() {
-      return defaultValue_;
-    }
-
-    public String desc() {
-      return desc_;
-    }
-
-    private final Object defaultValue_;
-    private final String desc_;
-  }
-
-  private static class RandomGenerator {
-    private final byte[] data_;
-    private int dataLength_;
-    private int position_;
-    Random rand_;
-
-    private RandomGenerator(long seed, double compressionRatio) {
-      // We use a limited amount of data over and over again and ensure
-      // that it is larger than the compression window (32KB), and also
-      // large enough to serve all typical value sizes we want to write.
-      rand_ = new Random(seed);
-      dataLength_ = 1048576 + 100;
-      data_ = new byte[dataLength_];
-      // TODO(yhchiang): mimic test::CompressibleString?
-      for (int i = 0; i < dataLength_; ++i) {
-        data_[i] = (byte) (' ' + rand_.nextInt(95));
-      }
-    }
-
-    private byte[] generate(int length) {
-      position_ = rand_.nextInt(data_.length - length);
-      return Arrays.copyOfRange(data_, position_, position_ + length);
-    }
-  }
-
-  boolean isFinished() {
-    synchronized(finishLock_) {
-      return isFinished_;
-    }
-  }
-
-  void setFinished(boolean flag) {
-    synchronized(finishLock_) {
-      isFinished_ = flag;
-    }
-  }
-
-  RocksDB db_;
-  final List<String> benchmarks_;
-  final int num_;
-  final int reads_;
-  final int keySize_;
-  final int valueSize_;
-  final int threadNum_;
-  final int writesPerSeconds_;
-  final long randSeed_;
-  final long cacheSize_;
-  final boolean useExisting_;
-  final String databaseDir_;
-  double compressionRatio_;
-  RandomGenerator gen_;
-  long startTime_;
-
-  // memtable related
-  final int maxWriteBufferNumber_;
-  final int prefixSize_;
-  final int keysPerPrefix_;
-  final String memtable_;
-  final long hashBucketCount_;
-
-  // sst format related
-  boolean usePlainTable_;
-
-  Object finishLock_;
-  boolean isFinished_;
-  Map<Flag, Object> flags_;
-  // as the scope of a static member equals to the scope of the problem,
-  // we let its c++ pointer to be disposed in its finalizer.
-  static Options defaultOptions_ = new Options();
-  String compressionType_;
-  CompressionType compression_;
-}
diff --git a/src/rocksdb/java/org/rocksdb/test/BackupableDBTest.java b/src/rocksdb/java/org/rocksdb/test/BackupableDBTest.java
deleted file mode 100644
index f0fc3d5..0000000
--- a/src/rocksdb/java/org/rocksdb/test/BackupableDBTest.java
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb.test;
-
-import org.rocksdb.*;
-
-public class BackupableDBTest {
-  static final String db_path = "/tmp/backupablejni_db";
-  static final String backup_path = "/tmp/backupablejni_db_backup";
-  static {
-    RocksDB.loadLibrary();
-  }
-  public static void main(String[] args) {
-
-    Options opt = new Options();
-    opt.setCreateIfMissing(true);
-
-    BackupableDBOptions bopt = new BackupableDBOptions(backup_path);
-    BackupableDB bdb = null;
-
-    try {
-      bdb = BackupableDB.open(opt, bopt, db_path);
-      bdb.put("hello".getBytes(), "BackupableDB".getBytes());
-      bdb.createNewBackup(true);
-      byte[] value = bdb.get("hello".getBytes());
-      assert(new String(value).equals("BackupableDB"));
-    } catch (RocksDBException e) {
-      System.err.format("[ERROR]: %s%n", e);
-      e.printStackTrace();
-    } finally {
-      opt.dispose();
-      bopt.dispose();
-      if (bdb != null) {
-        bdb.close();
-      }
-    }
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/test/OptionsTest.java b/src/rocksdb/java/org/rocksdb/test/OptionsTest.java
deleted file mode 100644
index e1e0e05..0000000
--- a/src/rocksdb/java/org/rocksdb/test/OptionsTest.java
+++ /dev/null
@@ -1,424 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb.test;
-
-import java.util.Random;
-import org.rocksdb.RocksDB;
-import org.rocksdb.Options;
-
-public class OptionsTest {
-  static {
-    RocksDB.loadLibrary();
-  }
-  public static void main(String[] args) {
-    Options opt = new Options();
-    Random rand = new Random();
-    { // CreateIfMissing test
-      boolean boolValue = rand.nextBoolean();
-      opt.setCreateIfMissing(boolValue);
-      assert(opt.createIfMissing() == boolValue);
-    }
-
-    { // ErrorIfExists test
-      boolean boolValue = rand.nextBoolean();
-      opt.setErrorIfExists(boolValue);
-      assert(opt.errorIfExists() == boolValue);
-    }
-
-    { // ParanoidChecks test
-      boolean boolValue = rand.nextBoolean();
-      opt.setParanoidChecks(boolValue);
-      assert(opt.paranoidChecks() == boolValue);
-    }
-
-    { // MaxOpenFiles test
-      int intValue = rand.nextInt();
-      opt.setMaxOpenFiles(intValue);
-      assert(opt.maxOpenFiles() == intValue);
-    }
-
-    { // DisableDataSync test
-      boolean boolValue = rand.nextBoolean();
-      opt.setDisableDataSync(boolValue);
-      assert(opt.disableDataSync() == boolValue);
-    }
-
-    { // UseFsync test
-      boolean boolValue = rand.nextBoolean();
-      opt.setUseFsync(boolValue);
-      assert(opt.useFsync() == boolValue);
-    }
-
-    { // DbStatsLogInterval test
-      int intValue = rand.nextInt();
-      opt.setDbStatsLogInterval(intValue);
-      assert(opt.dbStatsLogInterval() == intValue);
-    }
-
-    { // DbLogDir test
-      String str = "path/to/DbLogDir";
-      opt.setDbLogDir(str);
-      assert(opt.dbLogDir().equals(str));
-    }
-
-    { // WalDir test
-      String str = "path/to/WalDir";
-      opt.setWalDir(str);
-      assert(opt.walDir().equals(str));
-    }
-
-    { // DeleteObsoleteFilesPeriodMicros test
-      long longValue = rand.nextLong();
-      opt.setDeleteObsoleteFilesPeriodMicros(longValue);
-      assert(opt.deleteObsoleteFilesPeriodMicros() == longValue);
-    }
-
-    { // MaxBackgroundCompactions test
-      int intValue = rand.nextInt();
-      opt.setMaxBackgroundCompactions(intValue);
-      assert(opt.maxBackgroundCompactions() == intValue);
-    }
-
-    { // MaxBackgroundFlushes test
-      int intValue = rand.nextInt();
-      opt.setMaxBackgroundFlushes(intValue);
-      assert(opt.maxBackgroundFlushes() == intValue);
-    }
-
-    { // MaxLogFileSize test
-      long longValue = rand.nextLong();
-      opt.setMaxLogFileSize(longValue);
-      assert(opt.maxLogFileSize() == longValue);
-    }
-
-    { // LogFileTimeToRoll test
-      long longValue = rand.nextLong();
-      opt.setLogFileTimeToRoll(longValue);
-      assert(opt.logFileTimeToRoll() == longValue);
-    }
-
-    { // KeepLogFileNum test
-      long longValue = rand.nextLong();
-      opt.setKeepLogFileNum(longValue);
-      assert(opt.keepLogFileNum() == longValue);
-    }
-
-    { // MaxManifestFileSize test
-      long longValue = rand.nextLong();
-      opt.setMaxManifestFileSize(longValue);
-      assert(opt.maxManifestFileSize() == longValue);
-    }
-
-    { // TableCacheNumshardbits test
-      int intValue = rand.nextInt();
-      opt.setTableCacheNumshardbits(intValue);
-      assert(opt.tableCacheNumshardbits() == intValue);
-    }
-
-    { // TableCacheRemoveScanCountLimit test
-      int intValue = rand.nextInt();
-      opt.setTableCacheRemoveScanCountLimit(intValue);
-      assert(opt.tableCacheRemoveScanCountLimit() == intValue);
-    }
-
-    { // WalTtlSeconds test
-      long longValue = rand.nextLong();
-      opt.setWalTtlSeconds(longValue);
-      assert(opt.walTtlSeconds() == longValue);
-    }
-
-    { // ManifestPreallocationSize test
-      long longValue = rand.nextLong();
-      opt.setManifestPreallocationSize(longValue);
-      assert(opt.manifestPreallocationSize() == longValue);
-    }
-
-    { // AllowOsBuffer test
-      boolean boolValue = rand.nextBoolean();
-      opt.setAllowOsBuffer(boolValue);
-      assert(opt.allowOsBuffer() == boolValue);
-    }
-
-    { // AllowMmapReads test
-      boolean boolValue = rand.nextBoolean();
-      opt.setAllowMmapReads(boolValue);
-      assert(opt.allowMmapReads() == boolValue);
-    }
-
-    { // AllowMmapWrites test
-      boolean boolValue = rand.nextBoolean();
-      opt.setAllowMmapWrites(boolValue);
-      assert(opt.allowMmapWrites() == boolValue);
-    }
-
-    { // IsFdCloseOnExec test
-      boolean boolValue = rand.nextBoolean();
-      opt.setIsFdCloseOnExec(boolValue);
-      assert(opt.isFdCloseOnExec() == boolValue);
-    }
-
-    { // SkipLogErrorOnRecovery test
-      boolean boolValue = rand.nextBoolean();
-      opt.setSkipLogErrorOnRecovery(boolValue);
-      assert(opt.skipLogErrorOnRecovery() == boolValue);
-    }
-
-    { // StatsDumpPeriodSec test
-      int intValue = rand.nextInt();
-      opt.setStatsDumpPeriodSec(intValue);
-      assert(opt.statsDumpPeriodSec() == intValue);
-    }
-
-    { // AdviseRandomOnOpen test
-      boolean boolValue = rand.nextBoolean();
-      opt.setAdviseRandomOnOpen(boolValue);
-      assert(opt.adviseRandomOnOpen() == boolValue);
-    }
-
-    { // UseAdaptiveMutex test
-      boolean boolValue = rand.nextBoolean();
-      opt.setUseAdaptiveMutex(boolValue);
-      assert(opt.useAdaptiveMutex() == boolValue);
-    }
-
-    { // BytesPerSync test
-      long longValue = rand.nextLong();
-      opt.setBytesPerSync(longValue);
-      assert(opt.bytesPerSync() == longValue);
-    }
-
-    { // AllowThreadLocal test
-      boolean boolValue = rand.nextBoolean();
-      opt.setAllowThreadLocal(boolValue);
-      assert(opt.allowThreadLocal() == boolValue);
-    }
-
-    { // WriteBufferSize test
-      long longValue = rand.nextLong();
-      opt.setWriteBufferSize(longValue);
-      assert(opt.writeBufferSize() == longValue);
-    }
-
-    { // MaxWriteBufferNumber test
-      int intValue = rand.nextInt();
-      opt.setMaxWriteBufferNumber(intValue);
-      assert(opt.maxWriteBufferNumber() == intValue);
-    }
-
-    { // MinWriteBufferNumberToMerge test
-      int intValue = rand.nextInt();
-      opt.setMinWriteBufferNumberToMerge(intValue);
-      assert(opt.minWriteBufferNumberToMerge() == intValue);
-    }
-
-    { // BlockSize test
-      long longValue = rand.nextLong();
-      opt.setBlockSize(longValue);
-      assert(opt.blockSize() == longValue);
-    }
-
-    { // BlockRestartInterval test
-      int intValue = rand.nextInt();
-      opt.setBlockRestartInterval(intValue);
-      assert(opt.blockRestartInterval() == intValue);
-    }
-
-    { // WholeKeyFiltering test
-      boolean boolValue = rand.nextBoolean();
-      opt.setWholeKeyFiltering(boolValue);
-      assert(opt.wholeKeyFiltering() == boolValue);
-    }
-
-    { // NumLevels test
-      int intValue = rand.nextInt();
-      opt.setNumLevels(intValue);
-      assert(opt.numLevels() == intValue);
-    }
-
-    { // LevelFileNumCompactionTrigger test
-      int intValue = rand.nextInt();
-      opt.setLevelZeroFileNumCompactionTrigger(intValue);
-      assert(opt.levelZeroFileNumCompactionTrigger() == intValue);
-    }
-
-    { // LevelSlowdownWritesTrigger test
-      int intValue = rand.nextInt();
-      opt.setLevelZeroSlowdownWritesTrigger(intValue);
-      assert(opt.levelZeroSlowdownWritesTrigger() == intValue);
-    }
-
-    { // LevelStopWritesTrigger test
-      int intValue = rand.nextInt();
-      opt.setLevelZeroStopWritesTrigger(intValue);
-      assert(opt.levelZeroStopWritesTrigger() == intValue);
-    }
-
-    { // MaxMemCompactionLevel test
-      int intValue = rand.nextInt();
-      opt.setMaxMemCompactionLevel(intValue);
-      assert(opt.maxMemCompactionLevel() == intValue);
-    }
-
-    { // TargetFileSizeBase test
-      int intValue = rand.nextInt();
-      opt.setTargetFileSizeBase(intValue);
-      assert(opt.targetFileSizeBase() == intValue);
-    }
-
-    { // TargetFileSizeMultiplier test
-      int intValue = rand.nextInt();
-      opt.setTargetFileSizeMultiplier(intValue);
-      assert(opt.targetFileSizeMultiplier() == intValue);
-    }
-
-    { // MaxBytesForLevelBase test
-      long longValue = rand.nextLong();
-      opt.setMaxBytesForLevelBase(longValue);
-      assert(opt.maxBytesForLevelBase() == longValue);
-    }
-
-    { // MaxBytesForLevelMultiplier test
-      int intValue = rand.nextInt();
-      opt.setMaxBytesForLevelMultiplier(intValue);
-      assert(opt.maxBytesForLevelMultiplier() == intValue);
-    }
-
-    { // ExpandedCompactionFactor test
-      int intValue = rand.nextInt();
-      opt.setExpandedCompactionFactor(intValue);
-      assert(opt.expandedCompactionFactor() == intValue);
-    }
-
-    { // SourceCompactionFactor test
-      int intValue = rand.nextInt();
-      opt.setSourceCompactionFactor(intValue);
-      assert(opt.sourceCompactionFactor() == intValue);
-    }
-
-    { // MaxGrandparentOverlapFactor test
-      int intValue = rand.nextInt();
-      opt.setMaxGrandparentOverlapFactor(intValue);
-      assert(opt.maxGrandparentOverlapFactor() == intValue);
-    }
-
-    { // DisableSeekCompaction test
-      boolean boolValue = rand.nextBoolean();
-      opt.setDisableSeekCompaction(boolValue);
-      assert(opt.disableSeekCompaction() == boolValue);
-    }
-
-    { // SoftRateLimit test
-      double doubleValue = rand.nextDouble();
-      opt.setSoftRateLimit(doubleValue);
-      assert(opt.softRateLimit() == doubleValue);
-    }
-
-    { // HardRateLimit test
-      double doubleValue = rand.nextDouble();
-      opt.setHardRateLimit(doubleValue);
-      assert(opt.hardRateLimit() == doubleValue);
-    }
-
-    { // RateLimitDelayMaxMilliseconds test
-      int intValue = rand.nextInt();
-      opt.setRateLimitDelayMaxMilliseconds(intValue);
-      assert(opt.rateLimitDelayMaxMilliseconds() == intValue);
-    }
-
-    { // NoBlockCache test
-      boolean boolValue = rand.nextBoolean();
-      opt.setNoBlockCache(boolValue);
-      assert(opt.noBlockCache() == boolValue);
-    }
-
-    { // ArenaBlockSize test
-      long longValue = rand.nextLong();
-      opt.setArenaBlockSize(longValue);
-      assert(opt.arenaBlockSize() == longValue);
-    }
-
-    { // DisableAutoCompactions test
-      boolean boolValue = rand.nextBoolean();
-      opt.setDisableAutoCompactions(boolValue);
-      assert(opt.disableAutoCompactions() == boolValue);
-    }
-
-    { // PurgeRedundantKvsWhileFlush test
-      boolean boolValue = rand.nextBoolean();
-      opt.setPurgeRedundantKvsWhileFlush(boolValue);
-      assert(opt.purgeRedundantKvsWhileFlush() == boolValue);
-    }
-
-    { // BlockSizeDeviation test
-      int intValue = rand.nextInt();
-      opt.setBlockSizeDeviation(intValue);
-      assert(opt.blockSizeDeviation() == intValue);
-    }
-
-    { // VerifyChecksumsInCompaction test
-      boolean boolValue = rand.nextBoolean();
-      opt.setVerifyChecksumsInCompaction(boolValue);
-      assert(opt.verifyChecksumsInCompaction() == boolValue);
-    }
-
-    { // FilterDeletes test
-      boolean boolValue = rand.nextBoolean();
-      opt.setFilterDeletes(boolValue);
-      assert(opt.filterDeletes() == boolValue);
-    }
-
-    { // MaxSequentialSkipInIterations test
-      long longValue = rand.nextLong();
-      opt.setMaxSequentialSkipInIterations(longValue);
-      assert(opt.maxSequentialSkipInIterations() == longValue);
-    }
-
-    { // InplaceUpdateSupport test
-      boolean boolValue = rand.nextBoolean();
-      opt.setInplaceUpdateSupport(boolValue);
-      assert(opt.inplaceUpdateSupport() == boolValue);
-    }
-
-    { // InplaceUpdateNumLocks test
-      long longValue = rand.nextLong();
-      opt.setInplaceUpdateNumLocks(longValue);
-      assert(opt.inplaceUpdateNumLocks() == longValue);
-    }
-
-    { // MemtablePrefixBloomBits test
-      int intValue = rand.nextInt();
-      opt.setMemtablePrefixBloomBits(intValue);
-      assert(opt.memtablePrefixBloomBits() == intValue);
-    }
-
-    { // MemtablePrefixBloomProbes test
-      int intValue = rand.nextInt();
-      opt.setMemtablePrefixBloomProbes(intValue);
-      assert(opt.memtablePrefixBloomProbes() == intValue);
-    }
-
-    { // BloomLocality test
-      int intValue = rand.nextInt();
-      opt.setBloomLocality(intValue);
-      assert(opt.bloomLocality() == intValue);
-    }
-
-    { // MaxSuccessiveMerges test
-      long longValue = rand.nextLong();
-      opt.setMaxSuccessiveMerges(longValue);
-      assert(opt.maxSuccessiveMerges() == longValue);
-    }
-
-    { // MinPartialMergeOperands test
-      int intValue = rand.nextInt();
-      opt.setMinPartialMergeOperands(intValue);
-      assert(opt.minPartialMergeOperands() == intValue);
-    }
-
-    opt.dispose();
-    System.out.println("Passed OptionsTest");
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/test/ReadOptionsTest.java b/src/rocksdb/java/org/rocksdb/test/ReadOptionsTest.java
deleted file mode 100644
index b3b5b26..0000000
--- a/src/rocksdb/java/org/rocksdb/test/ReadOptionsTest.java
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb.test;
-
-import java.util.Random;
-import org.rocksdb.RocksDB;
-import org.rocksdb.ReadOptions;
-
-public class ReadOptionsTest {
-  static {
-    RocksDB.loadLibrary();
-  }
-  public static void main(String[] args) {
-    ReadOptions opt = new ReadOptions();
-    Random rand = new Random();
-    { // VerifyChecksums test
-      boolean boolValue = rand.nextBoolean();
-      opt.setVerifyChecksums(boolValue);
-      assert(opt.verifyChecksums() == boolValue);
-    }
-
-    { // FillCache test
-      boolean boolValue = rand.nextBoolean();
-      opt.setFillCache(boolValue);
-      assert(opt.fillCache() == boolValue);
-    }
-
-    { // Tailing test
-      boolean boolValue = rand.nextBoolean();
-      opt.setTailing(boolValue);
-      assert(opt.tailing() == boolValue);
-    }
-
-    opt.dispose();
-    System.out.println("Passed ReadOptionsTest");
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/util/Environment.java b/src/rocksdb/java/org/rocksdb/util/Environment.java
deleted file mode 100644
index c2e3bc0..0000000
--- a/src/rocksdb/java/org/rocksdb/util/Environment.java
+++ /dev/null
@@ -1,37 +0,0 @@
-package org.rocksdb.util;
-
-public class Environment {
-  private static String OS = System.getProperty("os.name").toLowerCase();
-
-  public static boolean isWindows() {
-    return (OS.indexOf("win") >= 0);
-  }
-
-  public static boolean isMac() {
-    return (OS.indexOf("mac") >= 0);
-  }
-
-  public static boolean isUnix() {
-    return (OS.indexOf("nix") >= 0 ||
-            OS.indexOf("nux") >= 0 ||
-            OS.indexOf("aix") >= 0);
-  }
-
-  public static String getSharedLibraryName(String name) {
-    if (isUnix()) {
-      return String.format("lib%s.so", name);
-    } else if (isMac()) {
-      return String.format("lib%s.dylib", name);
-    }
-    throw new UnsupportedOperationException();
-  }
-
-  public static String getJniLibraryName(String name) {
-    if (isUnix()) {
-      return String.format("lib%s.so", name);
-    } else if (isMac()) {
-      return String.format("lib%s.jnilib", name);
-    }
-    throw new UnsupportedOperationException();
-  }
-}
diff --git a/src/rocksdb/java/org/rocksdb/util/SizeUnit.java b/src/rocksdb/java/org/rocksdb/util/SizeUnit.java
deleted file mode 100644
index 8d50cd1..0000000
--- a/src/rocksdb/java/org/rocksdb/util/SizeUnit.java
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-package org.rocksdb.util;
-
-public class SizeUnit {
-  public static final long KB = 1024L;
-  public static final long MB = KB * KB;
-  public static final long GB = KB * MB;
-  public static final long TB = KB * GB;
-  public static final long PB = KB * TB;
-
-  private SizeUnit() {}
-}
diff --git a/src/rocksdb/java/rocksjni/backupablejni.cc b/src/rocksdb/java/rocksjni/backupablejni.cc
deleted file mode 100644
index 8b57a0c..0000000
--- a/src/rocksdb/java/rocksjni/backupablejni.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ and enables
-// calling c++ rocksdb::DB methods from Java side.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <jni.h>
-#include <string>
-
-#include "include/org_rocksdb_BackupableDB.h"
-#include "include/org_rocksdb_BackupableDBOptions.h"
-#include "rocksjni/portal.h"
-#include "utilities/backupable_db.h"
-
-/*
- * Class:     org_rocksdb_BackupableDB
- * Method:    open
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_BackupableDB_open(
-    JNIEnv* env, jobject jbdb, jlong jdb_handle, jlong jopt_handle) {
-  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto opt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jopt_handle);
-  auto bdb = new rocksdb::BackupableDB(db, *opt);
-
-  // as BackupableDB extends RocksDB on the java side, we can reuse
-  // the RocksDB portal here.
-  rocksdb::RocksDBJni::setHandle(env, jbdb, bdb);
-}
-
-/*
- * Class:     org_rocksdb_BackupableDB
- * Method:    createNewBackup
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_BackupableDB_createNewBackup(
-    JNIEnv* env, jobject jbdb, jlong jhandle, jboolean jflag) {
-  reinterpret_cast<rocksdb::BackupableDB*>(jhandle)->CreateNewBackup(jflag);
-}
-
-///////////////////////////////////////////////////////////////////////////
-// BackupDBOptions
-
-/*
- * Class:     org_rocksdb_BackupableDBOptions
- * Method:    newBackupableDBOptions
- * Signature: (Ljava/lang/String;)V
- */
-void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions(
-    JNIEnv* env, jobject jobj, jstring jpath) {
-  const char* cpath = env->GetStringUTFChars(jpath, 0);
-  auto bopt = new rocksdb::BackupableDBOptions(cpath);
-  env->ReleaseStringUTFChars(jpath, cpath);
-
-  rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt);
-}
-
-/*
- * Class:     org_rocksdb_BackupableDBOptions
- * Method:    backupDir
- * Signature: (J)Ljava/lang/String;
- */
-jstring Java_org_rocksdb_BackupableDBOptions_backupDir(
-    JNIEnv* env, jobject jopt, jlong jhandle, jstring jpath) {
-  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
-  return env->NewStringUTF(bopt->backup_dir.c_str());
-}
-
-/*
- * Class:     org_rocksdb_BackupableDBOptions
- * Method:    dispose
- * Signature: (J)V
- */
-void Java_org_rocksdb_BackupableDBOptions_dispose(
-    JNIEnv* env, jobject jopt, jlong jhandle) {
-  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
-  assert(bopt);
-  delete bopt;
-
-  rocksdb::BackupableDBOptionsJni::setHandle(env, jopt, nullptr);
-}
diff --git a/src/rocksdb/java/rocksjni/filter.cc b/src/rocksdb/java/rocksjni/filter.cc
deleted file mode 100644
index 7ef9598..0000000
--- a/src/rocksdb/java/rocksjni/filter.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ for
-// rocksdb::FilterPolicy.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <jni.h>
-#include <string>
-
-#include "include/org_rocksdb_Filter.h"
-#include "include/org_rocksdb_BloomFilter.h"
-#include "rocksjni/portal.h"
-#include "rocksdb/filter_policy.h"
-
-/*
- * Class:     org_rocksdb_BloomFilter
- * Method:    createNewFilter0
- * Signature: (I)V
- */
-void Java_org_rocksdb_BloomFilter_createNewFilter0(
-    JNIEnv* env, jobject jobj, jint bits_per_key) {
-  const rocksdb::FilterPolicy* fp = rocksdb::NewBloomFilterPolicy(bits_per_key);
-  rocksdb::FilterJni::setHandle(env, jobj, fp);
-}
-
-/*
- * Class:     org_rocksdb_Filter
- * Method:    dispose0
- * Signature: (J)V
- */
-void Java_org_rocksdb_Filter_dispose0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  auto fp = reinterpret_cast<rocksdb::FilterPolicy*>(handle);
-  delete fp;
-
-  rocksdb::FilterJni::setHandle(env, jobj, nullptr);
-}
diff --git a/src/rocksdb/java/rocksjni/iterator.cc b/src/rocksdb/java/rocksjni/iterator.cc
deleted file mode 100644
index a7ea97d..0000000
--- a/src/rocksdb/java/rocksjni/iterator.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ and enables
-// calling c++ rocksdb::Iterator methods from Java side.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <jni.h>
-
-#include "include/org_rocksdb_Iterator.h"
-#include "rocksjni/portal.h"
-#include "rocksdb/iterator.h"
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    isValid0
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Iterator_isValid0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  return reinterpret_cast<rocksdb::Iterator*>(handle)->Valid();
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    seekToFirst0
- * Signature: (J)V
- */
-void Java_org_rocksdb_Iterator_seekToFirst0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToFirst();
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    seekToFirst0
- * Signature: (J)V
- */
-void Java_org_rocksdb_Iterator_seekToLast0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToLast();
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    seekToLast0
- * Signature: (J)V
- */
-void Java_org_rocksdb_Iterator_next0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  reinterpret_cast<rocksdb::Iterator*>(handle)->Next();
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    next0
- * Signature: (J)V
- */
-void Java_org_rocksdb_Iterator_prev0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  reinterpret_cast<rocksdb::Iterator*>(handle)->Prev();
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    prev0
- * Signature: (J)V
- */
-jbyteArray Java_org_rocksdb_Iterator_key0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
-  rocksdb::Slice key_slice = it->key();
-
-  jbyteArray jkey = env->NewByteArray(key_slice.size());
-  env->SetByteArrayRegion(
-      jkey, 0, key_slice.size(),
-      reinterpret_cast<const jbyte*>(key_slice.data()));
-  return jkey;
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    key0
- * Signature: (J)[B
- */
-jbyteArray Java_org_rocksdb_Iterator_value0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
-  rocksdb::Slice value_slice = it->value();
-
-  jbyteArray jvalue = env->NewByteArray(value_slice.size());
-  env->SetByteArrayRegion(
-      jvalue, 0, value_slice.size(),
-      reinterpret_cast<const jbyte*>(value_slice.data()));
-  return jvalue;
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    value0
- * Signature: (J)[B
- */
-void Java_org_rocksdb_Iterator_seek0(
-    JNIEnv* env, jobject jobj, jlong handle,
-    jbyteArray jtarget, jint jtarget_len) {
-  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
-  jbyte* target = env->GetByteArrayElements(jtarget, 0);
-  rocksdb::Slice target_slice(
-      reinterpret_cast<char*>(target), jtarget_len);
-
-  it->Seek(target_slice);
-
-  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    seek0
- * Signature: (J[BI)V
- */
-void Java_org_rocksdb_Iterator_status0(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
-  rocksdb::Status s = it->status();
-
-  if (s.ok()) {
-    return;
-  }
-
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-}
-
-/*
- * Class:     org_rocksdb_Iterator
- * Method:    dispose
- * Signature: (J)V
- */
-void Java_org_rocksdb_Iterator_dispose(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
-  delete it;
-}
diff --git a/src/rocksdb/java/rocksjni/memtablejni.cc b/src/rocksdb/java/rocksjni/memtablejni.cc
deleted file mode 100644
index a0d50f5..0000000
--- a/src/rocksdb/java/rocksjni/memtablejni.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ for MemTables.
-
-#include "include/org_rocksdb_HashSkipListMemTableConfig.h"
-#include "include/org_rocksdb_HashLinkedListMemTableConfig.h"
-#include "include/org_rocksdb_VectorMemTableConfig.h"
-#include "include/org_rocksdb_SkipListMemTableConfig.h"
-#include "rocksdb/memtablerep.h"
-
-/*
- * Class:     org_rocksdb_HashSkipListMemTableConfig
- * Method:    newMemTableFactoryHandle
- * Signature: (JII)J
- */
-jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
-    JNIEnv* env, jobject jobj, jlong jbucket_count,
-    jint jheight, jint jbranching_factor) {
-  return reinterpret_cast<jlong>(rocksdb::NewHashSkipListRepFactory(
-      static_cast<size_t>(jbucket_count),
-      static_cast<int32_t>(jheight),
-      static_cast<int32_t>(jbranching_factor)));
-}
-
-/*
- * Class:     org_rocksdb_HashLinkedListMemTableConfig
- * Method:    newMemTableFactoryHandle
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
-    JNIEnv* env, jobject jobj, jlong jbucket_count) {
-  return reinterpret_cast<jlong>(rocksdb::NewHashLinkListRepFactory(
-       static_cast<size_t>(jbucket_count)));
-}
-
-/*
- * Class:     org_rocksdb_VectorMemTableConfig
- * Method:    newMemTableFactoryHandle
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
-    JNIEnv* env, jobject jobj, jlong jreserved_size) {
-  return reinterpret_cast<jlong>(new rocksdb::VectorRepFactory(
-      static_cast<size_t>(jreserved_size)));
-}
-
-/*
- * Class:     org_rocksdb_SkipListMemTableConfig
- * Method:    newMemTableFactoryHandle0
- * Signature: ()J
- */
-jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0(
-    JNIEnv* env, jobject jobj) {
-  return reinterpret_cast<jlong>(new rocksdb::SkipListFactory());
-}
diff --git a/src/rocksdb/java/rocksjni/options.cc b/src/rocksdb/java/rocksjni/options.cc
deleted file mode 100644
index c5849ce..0000000
--- a/src/rocksdb/java/rocksjni/options.cc
+++ /dev/null
@@ -1,1807 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ for rocksdb::Options.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <jni.h>
-#include <string>
-#include <memory>
-
-#include "include/org_rocksdb_Options.h"
-#include "include/org_rocksdb_WriteOptions.h"
-#include "include/org_rocksdb_ReadOptions.h"
-#include "rocksjni/portal.h"
-#include "rocksdb/db.h"
-#include "rocksdb/options.h"
-#include "rocksdb/statistics.h"
-#include "rocksdb/memtablerep.h"
-#include "rocksdb/table.h"
-#include "rocksdb/slice_transform.h"
-#include "rocksdb/filter_policy.h"
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    newOptions
- * Signature: ()V
- */
-void Java_org_rocksdb_Options_newOptions(JNIEnv* env, jobject jobj) {
-  rocksdb::Options* op = new rocksdb::Options();
-  rocksdb::OptionsJni::setHandle(env, jobj, op);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    dispose0
- * Signature: ()V
- */
-void Java_org_rocksdb_Options_dispose0(JNIEnv* env, jobject jobj) {
-  rocksdb::Options* op = rocksdb::OptionsJni::getHandle(env, jobj);
-  delete op;
-
-  rocksdb::OptionsJni::setHandle(env, jobj, nullptr);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setCreateIfMissing
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setCreateIfMissing(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing = flag;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    createIfMissing
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_createIfMissing(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setWriteBufferSize
- * Signature: (JJ)I
- */
-void Java_org_rocksdb_Options_setWriteBufferSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size =
-          static_cast<size_t>(jwrite_buffer_size);
-}
-
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    writeBufferSize
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_writeBufferSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxWriteBufferNumber
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxWriteBufferNumber(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number =
-          jmax_write_buffer_number;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    createStatistics
- * Signature: (J)V
- */
-void Java_org_rocksdb_Options_createStatistics(
-    JNIEnv* env, jobject jobj, jlong jOptHandle) {
-  reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics =
-      rocksdb::CreateDBStatistics();
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    statisticsPtr
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_statisticsPtr(
-    JNIEnv* env, jobject jobj, jlong jOptHandle) {
-  auto st = reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics.get();
-  return reinterpret_cast<jlong>(st);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setFilterHandle
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setFilterHandle(
-    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jfilter_handle) {
-  reinterpret_cast<rocksdb::Options*>(jopt_handle)->filter_policy =
-      reinterpret_cast<rocksdb::FilterPolicy*>(jfilter_handle);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxWriteBufferNumber
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxWriteBufferNumber(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setBlockSize
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setBlockSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jblock_size) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->block_size =
-          static_cast<size_t>(jblock_size);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    blockSize
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_blockSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_size;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDisableSeekCompaction
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setDisableSeekCompaction(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jboolean jdisable_seek_compaction) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->disable_seek_compaction =
-         jdisable_seek_compaction;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    disableSeekCompaction
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_disableSeekCompaction(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->disable_seek_compaction;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    errorIfExists
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_errorIfExists(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setErrorIfExists
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setErrorIfExists(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists =
-      static_cast<bool>(error_if_exists);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    paranoidChecks
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_paranoidChecks(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setParanoidChecks
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setParanoidChecks(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks =
-      static_cast<bool>(paranoid_checks);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxOpenFiles
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxOpenFiles(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxOpenFiles
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxOpenFiles(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files =
-      static_cast<int>(max_open_files);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    disableDataSync
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_disableDataSync(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->disableDataSync;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDisableDataSync
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setDisableDataSync(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean disableDataSync) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->disableDataSync =
-      static_cast<bool>(disableDataSync);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    useFsync
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_useFsync(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->use_fsync;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setUseFsync
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setUseFsync(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->use_fsync =
-      static_cast<bool>(use_fsync);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    dbStatsLogInterval
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_dbStatsLogInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->db_stats_log_interval;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDbStatsLogInterval
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setDbStatsLogInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint db_stats_log_interval) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->db_stats_log_interval =
-      static_cast<int>(db_stats_log_interval);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    dbLogDir
- * Signature: (J)Ljava/lang/String
- */
-jstring Java_org_rocksdb_Options_dbLogDir(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return env->NewStringUTF(
-      reinterpret_cast<rocksdb::Options*>(jhandle)->db_log_dir.c_str());
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDbLogDir
- * Signature: (JLjava/lang/String)V
- */
-void Java_org_rocksdb_Options_setDbLogDir(
-    JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) {
-  const char* log_dir = env->GetStringUTFChars(jdb_log_dir, 0);
-  reinterpret_cast<rocksdb::Options*>(jhandle)->db_log_dir.assign(log_dir);
-  env->ReleaseStringUTFChars(jdb_log_dir, log_dir);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    walDir
- * Signature: (J)Ljava/lang/String
- */
-jstring Java_org_rocksdb_Options_walDir(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return env->NewStringUTF(
-      reinterpret_cast<rocksdb::Options*>(jhandle)->wal_dir.c_str());
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setWalDir
- * Signature: (JLjava/lang/String)V
- */
-void Java_org_rocksdb_Options_setWalDir(
-    JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) {
-  const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
-  reinterpret_cast<rocksdb::Options*>(jhandle)->wal_dir.assign(wal_dir);
-  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    deleteObsoleteFilesPeriodMicros
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)
-      ->delete_obsolete_files_period_micros;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDeleteObsoleteFilesPeriodMicros
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)
-      ->delete_obsolete_files_period_micros =
-          static_cast<int64_t>(micros);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxBackgroundCompactions
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxBackgroundCompactions(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_background_compactions;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxBackgroundCompactions
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxBackgroundCompactions(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint max) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)
-      ->max_background_compactions = static_cast<int>(max);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxBackgroundFlushes
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxBackgroundFlushes(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_background_flushes;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxBackgroundFlushes
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxBackgroundFlushes(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->max_background_flushes =
-      static_cast<int>(max_background_flushes);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxLogFileSize
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_maxLogFileSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxLogFileSize
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setMaxLogFileSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size =
-      static_cast<size_t>(max_log_file_size);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    logFileTimeToRoll
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_logFileTimeToRoll(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setLogFileTimeToRoll
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setLogFileTimeToRoll(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll =
-      static_cast<size_t>(log_file_time_to_roll);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    keepLogFileNum
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_keepLogFileNum(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setKeepLogFileNum
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setKeepLogFileNum(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num =
-      static_cast<size_t>(keep_log_file_num);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxManifestFileSize
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_maxManifestFileSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_manifest_file_size;
-}
-
-/*
- * Method:    memTableFactoryName
- * Signature: (J)Ljava/lang/String
- */
-jstring Java_org_rocksdb_Options_memTableFactoryName(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  auto opt = reinterpret_cast<rocksdb::Options*>(jhandle);
-  rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get();
-
-  // Should never be nullptr.
-  // Default memtable factory is SkipListFactory
-  assert(tf);
-
-  // temporarly fix for the historical typo
-  if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) {
-    return env->NewStringUTF("HashLinkedListRepFactory");
-  }
-
-  return env->NewStringUTF(tf->Name());
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxManifestFileSize
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setMaxManifestFileSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->max_manifest_file_size =
-      static_cast<int64_t>(max_manifest_file_size);
-}
-
-/*
- * Method:    setMemTableFactory
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setMemTableFactory(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->memtable_factory.reset(
-      reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle));
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    tableCacheNumshardbits
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_tableCacheNumshardbits(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setTableCacheNumshardbits
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setTableCacheNumshardbits(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits =
-      static_cast<int>(table_cache_numshardbits);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    tableCacheRemoveScanCountLimit
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_tableCacheRemoveScanCountLimit(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->table_cache_remove_scan_count_limit;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setTableCacheRemoveScanCountLimit
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint limit) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->table_cache_remove_scan_count_limit = static_cast<int>(limit);
-}
-
-/*
- * Method:    useFixedLengthPrefixExtractor
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset(
-      rocksdb::NewFixedPrefixTransform(static_cast<size_t>(jprefix_length)));
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    walTtlSeconds
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_walTtlSeconds(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setWalTtlSeconds
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setWalTtlSeconds(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds =
-      static_cast<int64_t>(WAL_ttl_seconds);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    walTtlSeconds
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_walSizeLimitMB(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setWalSizeLimitMB
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setWalSizeLimitMB(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB =
-      static_cast<int64_t>(WAL_size_limit_MB);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    manifestPreallocationSize
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_manifestPreallocationSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)
-      ->manifest_preallocation_size;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setManifestPreallocationSize
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setManifestPreallocationSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->manifest_preallocation_size =
-      static_cast<size_t>(preallocation_size);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    allowOsBuffer
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_allowOsBuffer(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setAllowOsBuffer
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setAllowOsBuffer(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer =
-      static_cast<bool>(allow_os_buffer);
-}
-
-/*
- * Method:    setTableFactory
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setTableFactory(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->table_factory.reset(
-      reinterpret_cast<rocksdb::TableFactory*>(jfactory_handle));
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    allowMmapReads
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_allowMmapReads(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_reads;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setAllowMmapReads
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setAllowMmapReads(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_reads =
-      static_cast<bool>(allow_mmap_reads);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    allowMmapWrites
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_allowMmapWrites(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_writes;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setAllowMmapWrites
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setAllowMmapWrites(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_writes =
-      static_cast<bool>(allow_mmap_writes);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    isFdCloseOnExec
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_isFdCloseOnExec(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->is_fd_close_on_exec;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setIsFdCloseOnExec
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setIsFdCloseOnExec(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->is_fd_close_on_exec =
-      static_cast<bool>(is_fd_close_on_exec);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    skipLogErrorOnRecovery
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_skipLogErrorOnRecovery(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)
-      ->skip_log_error_on_recovery;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setSkipLogErrorOnRecovery
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setSkipLogErrorOnRecovery(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean skip) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->skip_log_error_on_recovery =
-      static_cast<bool>(skip);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    statsDumpPeriodSec
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_statsDumpPeriodSec(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->stats_dump_period_sec;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setStatsDumpPeriodSec
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setStatsDumpPeriodSec(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->stats_dump_period_sec =
-      static_cast<int>(stats_dump_period_sec);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    adviseRandomOnOpen
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->advise_random_on_open;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setAdviseRandomOnOpen
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setAdviseRandomOnOpen(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->advise_random_on_open =
-      static_cast<bool>(advise_random_on_open);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    useAdaptiveMutex
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_useAdaptiveMutex(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->use_adaptive_mutex;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setUseAdaptiveMutex
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setUseAdaptiveMutex(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->use_adaptive_mutex =
-      static_cast<bool>(use_adaptive_mutex);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    bytesPerSync
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_bytesPerSync(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->bytes_per_sync;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setBytesPerSync
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setBytesPerSync(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->bytes_per_sync =
-      static_cast<int64_t>(bytes_per_sync);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    allowThreadLocal
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_allowThreadLocal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_thread_local;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setAllowThreadLocal
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setAllowThreadLocal(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_thread_local) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_thread_local =
-      static_cast<bool>(allow_thread_local);
-}
-
-/*
- * Method:    tableFactoryName
- * Signature: (J)Ljava/lang/String
- */
-jstring Java_org_rocksdb_Options_tableFactoryName(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  auto opt = reinterpret_cast<rocksdb::Options*>(jhandle);
-  rocksdb::TableFactory* tf = opt->table_factory.get();
-
-  // Should never be nullptr.
-  // Default memtable factory is SkipListFactory
-  assert(tf);
-
-  return env->NewStringUTF(tf->Name());
-}
-
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    minWriteBufferNumberToMerge
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->min_write_buffer_number_to_merge;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMinWriteBufferNumberToMerge
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmin_write_buffer_number_to_merge) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->min_write_buffer_number_to_merge =
-          static_cast<int>(jmin_write_buffer_number_to_merge);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    blockRestartInterval
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_blockRestartInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setBlockRestartInterval
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setBlockRestartInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jblock_restart_interval) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval =
-      static_cast<int>(jblock_restart_interval);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    wholeKeyFiltering
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_wholeKeyFiltering(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setWholeKeyFiltering
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setWholeKeyFiltering(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwhole_key_filtering) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering =
-      static_cast<bool>(jwhole_key_filtering);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    numLevels
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_numLevels(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setNumLevels
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setNumLevels(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels =
-      static_cast<int>(jnum_levels);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    levelZeroFileNumCompactionTrigger
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->level0_file_num_compaction_trigger;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setLevelZeroFileNumCompactionTrigger
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jlevel0_file_num_compaction_trigger) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->level0_file_num_compaction_trigger =
-          static_cast<int>(jlevel0_file_num_compaction_trigger);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    levelZeroSlowdownWritesTrigger
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->level0_slowdown_writes_trigger;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setLevelSlowdownWritesTrigger
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jlevel0_slowdown_writes_trigger) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->level0_slowdown_writes_trigger =
-          static_cast<int>(jlevel0_slowdown_writes_trigger);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    levelZeroStopWritesTrigger
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->level0_stop_writes_trigger;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setLevelStopWritesTrigger
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jlevel0_stop_writes_trigger) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->level0_stop_writes_trigger =
-      static_cast<int>(jlevel0_stop_writes_trigger);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxMemCompactionLevel
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxMemCompactionLevel(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_mem_compaction_level;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxMemCompactionLevel
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxMemCompactionLevel(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmax_mem_compaction_level) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->max_mem_compaction_level =
-      static_cast<int>(jmax_mem_compaction_level);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    targetFileSizeBase
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_targetFileSizeBase(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setTargetFileSizeBase
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setTargetFileSizeBase(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jtarget_file_size_base) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base =
-      static_cast<int>(jtarget_file_size_base);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    targetFileSizeMultiplier
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_targetFileSizeMultiplier(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->target_file_size_multiplier;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setTargetFileSizeMultiplier
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setTargetFileSizeMultiplier(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jtarget_file_size_multiplier) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->target_file_size_multiplier =
-          static_cast<int>(jtarget_file_size_multiplier);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxBytesForLevelBase
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_maxBytesForLevelBase(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_bytes_for_level_base;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxBytesForLevelBase
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setMaxBytesForLevelBase(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jlong jmax_bytes_for_level_base) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_bytes_for_level_base =
-          static_cast<int64_t>(jmax_bytes_for_level_base);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxBytesForLevelMultiplier
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxBytesForLevelMultiplier(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_bytes_for_level_multiplier;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxBytesForLevelMultiplier
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmax_bytes_for_level_multiplier) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_bytes_for_level_multiplier =
-          static_cast<int>(jmax_bytes_for_level_multiplier);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    expandedCompactionFactor
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_expandedCompactionFactor(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->expanded_compaction_factor;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setExpandedCompactionFactor
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setExpandedCompactionFactor(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jexpanded_compaction_factor) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->expanded_compaction_factor =
-          static_cast<int>(jexpanded_compaction_factor);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    sourceCompactionFactor
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_sourceCompactionFactor(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->source_compaction_factor;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setSourceCompactionFactor
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setSourceCompactionFactor(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-        jint jsource_compaction_factor) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->source_compaction_factor =
-          static_cast<int>(jsource_compaction_factor);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxGrandparentOverlapFactor
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxGrandparentOverlapFactor(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_grandparent_overlap_factor;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxGrandparentOverlapFactor
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxGrandparentOverlapFactor(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmax_grandparent_overlap_factor) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_grandparent_overlap_factor =
-          static_cast<int>(jmax_grandparent_overlap_factor);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    softRateLimit
- * Signature: (J)D
- */
-jdouble Java_org_rocksdb_Options_softRateLimit(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->soft_rate_limit;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setSoftRateLimit
- * Signature: (JD)V
- */
-void Java_org_rocksdb_Options_setSoftRateLimit(
-    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->soft_rate_limit =
-      static_cast<double>(jsoft_rate_limit);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    hardRateLimit
- * Signature: (J)D
- */
-jdouble Java_org_rocksdb_Options_hardRateLimit(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->hard_rate_limit;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setHardRateLimit
- * Signature: (JD)V
- */
-void Java_org_rocksdb_Options_setHardRateLimit(
-    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->hard_rate_limit =
-      static_cast<double>(jhard_rate_limit);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    rateLimitDelayMaxMilliseconds
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_rateLimitDelayMaxMilliseconds(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->rate_limit_delay_max_milliseconds;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setRateLimitDelayMaxMilliseconds
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jrate_limit_delay_max_milliseconds) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->rate_limit_delay_max_milliseconds =
-          static_cast<int>(jrate_limit_delay_max_milliseconds);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    noBlockCache
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_noBlockCache(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->no_block_cache;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setNoBlockCache
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setNoBlockCache(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jno_block_cache) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->no_block_cache =
-      static_cast<bool>(jno_block_cache);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    arenaBlockSize
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_arenaBlockSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setArenaBlockSize
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setArenaBlockSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size =
-      static_cast<size_t>(jarena_block_size);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    disableAutoCompactions
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_disableAutoCompactions(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->disable_auto_compactions;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDisableAutoCompactions
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setDisableAutoCompactions(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jboolean jdisable_auto_compactions) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->disable_auto_compactions =
-          static_cast<bool>(jdisable_auto_compactions);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    purgeRedundantKvsWhileFlush
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_purgeRedundantKvsWhileFlush(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->purge_redundant_kvs_while_flush;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setPurgeRedundantKvsWhileFlush
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jboolean jpurge_redundant_kvs_while_flush) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->purge_redundant_kvs_while_flush =
-          static_cast<bool>(jpurge_redundant_kvs_while_flush);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    blockSizeDeviation
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_blockSizeDeviation(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_size_deviation;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setBlockSizeDeviation
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setBlockSizeDeviation(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jblock_size_deviation) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->block_size_deviation =
-      static_cast<int>(jblock_size_deviation);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    verifyChecksumsInCompaction
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_verifyChecksumsInCompaction(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->verify_checksums_in_compaction;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setVerifyChecksumsInCompaction
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setVerifyChecksumsInCompaction(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jboolean jverify_checksums_in_compaction) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->verify_checksums_in_compaction =
-          static_cast<bool>(jverify_checksums_in_compaction);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    filterDeletes
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_filterDeletes(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->filter_deletes;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setFilterDeletes
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setFilterDeletes(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->filter_deletes =
-      static_cast<bool>(jfilter_deletes);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxSequentialSkipInIterations
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_sequential_skip_in_iterations;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxSequentialSkipInIterations
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jlong jmax_sequential_skip_in_iterations) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->max_sequential_skip_in_iterations =
-          static_cast<int64_t>(jmax_sequential_skip_in_iterations);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    inplaceUpdateSupport
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->inplace_update_support;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setInplaceUpdateSupport
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setInplaceUpdateSupport(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jboolean jinplace_update_support) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->inplace_update_support =
-          static_cast<bool>(jinplace_update_support);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    inplaceUpdateNumLocks
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->inplace_update_num_locks;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setInplaceUpdateNumLocks
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setInplaceUpdateNumLocks(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jlong jinplace_update_num_locks) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->inplace_update_num_locks =
-          static_cast<size_t>(jinplace_update_num_locks);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    memtablePrefixBloomBits
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_memtablePrefixBloomBits(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->memtable_prefix_bloom_bits;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMemtablePrefixBloomBits
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMemtablePrefixBloomBits(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmemtable_prefix_bloom_bits) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->memtable_prefix_bloom_bits =
-          static_cast<int32_t>(jmemtable_prefix_bloom_bits);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    memtablePrefixBloomProbes
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_memtablePrefixBloomProbes(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->memtable_prefix_bloom_probes;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMemtablePrefixBloomProbes
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMemtablePrefixBloomProbes(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmemtable_prefix_bloom_probes) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->memtable_prefix_bloom_probes =
-          static_cast<int32_t>(jmemtable_prefix_bloom_probes);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    bloomLocality
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_bloomLocality(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setBloomLocality
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setBloomLocality(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality =
-      static_cast<int32_t>(jbloom_locality);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxSuccessiveMerges
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_maxSuccessiveMerges(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxSuccessiveMerges
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setMaxSuccessiveMerges(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jlong jmax_successive_merges) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges =
-      static_cast<size_t>(jmax_successive_merges);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    minPartialMergeOperands
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_minPartialMergeOperands(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(
-      jhandle)->min_partial_merge_operands;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMinPartialMergeOperands
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMinPartialMergeOperands(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmin_partial_merge_operands) {
-  reinterpret_cast<rocksdb::Options*>(
-      jhandle)->min_partial_merge_operands =
-          static_cast<int32_t>(jmin_partial_merge_operands);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// WriteOptions
-
-/*
- * Class:     org_rocksdb_WriteOptions
- * Method:    newWriteOptions
- * Signature: ()V
- */
-void Java_org_rocksdb_WriteOptions_newWriteOptions(
-    JNIEnv* env, jobject jwrite_options) {
-  rocksdb::WriteOptions* op = new rocksdb::WriteOptions();
-  rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op);
-}
-
-/*
- * Class:     org_rocksdb_WriteOptions
- * Method:    dispose0
- * Signature: ()V
- */
-void Java_org_rocksdb_WriteOptions_dispose0(
-    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
-  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(jhandle);
-  delete write_options;
-
-  rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr);
-}
-
-/*
- * Class:     org_rocksdb_WriteOptions
- * Method:    setSync
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_WriteOptions_setSync(
-  JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) {
-  reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->sync = jflag;
-}
-
-/*
- * Class:     org_rocksdb_WriteOptions
- * Method:    sync
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_WriteOptions_sync(
-    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
-  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->sync;
-}
-
-/*
- * Class:     org_rocksdb_WriteOptions
- * Method:    setDisableWAL
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_WriteOptions_setDisableWAL(
-    JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) {
-  reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL = jflag;
-}
-
-/*
- * Class:     org_rocksdb_WriteOptions
- * Method:    disableWAL
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_WriteOptions_disableWAL(
-    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
-  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL;
-}
-
-/////////////////////////////////////////////////////////////////////
-// rocksdb::ReadOptions
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    newReadOptions
- * Signature: ()V
- */
-void Java_org_rocksdb_ReadOptions_newReadOptions(
-    JNIEnv* env, jobject jobj) {
-  auto read_opt = new rocksdb::ReadOptions();
-  rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt);
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    dispose
- * Signature: (J)V
- */
-void Java_org_rocksdb_ReadOptions_dispose(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  delete reinterpret_cast<rocksdb::ReadOptions*>(jhandle);
-  rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr);
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    verifyChecksums
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::ReadOptions*>(
-      jhandle)->verify_checksums;
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    setVerifyChecksums
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_ReadOptions_setVerifyChecksums(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jboolean jverify_checksums) {
-  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->verify_checksums =
-      static_cast<bool>(jverify_checksums);
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    fillCache
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_ReadOptions_fillCache(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache;
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    setFillCache
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_ReadOptions_setFillCache(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) {
-  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache =
-      static_cast<bool>(jfill_cache);
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    tailing
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_ReadOptions_tailing(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing;
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    setTailing
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_ReadOptions_setTailing(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtailing) {
-  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing =
-      static_cast<bool>(jtailing);
-}
diff --git a/src/rocksdb/java/rocksjni/portal.h b/src/rocksdb/java/rocksjni/portal.h
deleted file mode 100644
index 7d70eec..0000000
--- a/src/rocksdb/java/rocksjni/portal.h
+++ /dev/null
@@ -1,383 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-// This file is designed for caching those frequently used IDs and provide
-// efficient portal (i.e, a set of static functions) to access java code
-// from c++.
-
-#ifndef JAVA_ROCKSJNI_PORTAL_H_
-#define JAVA_ROCKSJNI_PORTAL_H_
-
-#include <jni.h>
-#include "rocksdb/db.h"
-#include "rocksdb/filter_policy.h"
-#include "utilities/backupable_db.h"
-
-namespace rocksdb {
-
-// The portal class for org.rocksdb.RocksDB
-class RocksDBJni {
- public:
-  // Get the java class id of org.rocksdb.RocksDB.
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/RocksDB");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the field id of the member variable of org.rocksdb.RocksDB
-  // that stores the pointer to rocksdb::DB.
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::DB of the specified org.rocksdb.RocksDB.
-  static rocksdb::DB* getHandle(JNIEnv* env, jobject jdb) {
-    return reinterpret_cast<rocksdb::DB*>(
-        env->GetLongField(jdb, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::DB pointer to the java side.
-  static void setHandle(JNIEnv* env, jobject jdb, rocksdb::DB* db) {
-    env->SetLongField(
-        jdb, getHandleFieldID(env),
-        reinterpret_cast<jlong>(db));
-  }
-};
-
-// The portal class for org.rocksdb.RocksDBException
-class RocksDBExceptionJni {
- public:
-  // Get the jclass of org.rocksdb.RocksDBException
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/RocksDBException");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Create and throw a java exception by converting the input
-  // Status to an RocksDBException.
-  //
-  // In case s.ok() is true, then this function will not throw any
-  // exception.
-  static void ThrowNew(JNIEnv* env, Status s) {
-    if (s.ok()) {
-      return;
-    }
-    jstring msg = env->NewStringUTF(s.ToString().c_str());
-    // get the constructor id of org.rocksdb.RocksDBException
-    static jmethodID mid = env->GetMethodID(
-        getJClass(env), "<init>", "(Ljava/lang/String;)V");
-    assert(mid != nullptr);
-
-    env->Throw((jthrowable)env->NewObject(getJClass(env), mid, msg));
-  }
-};
-
-class OptionsJni {
- public:
-  // Get the java class id of org.rocksdb.Options.
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/Options");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the field id of the member variable of org.rocksdb.Options
-  // that stores the pointer to rocksdb::Options
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::Options
-  static rocksdb::Options* getHandle(JNIEnv* env, jobject jobj) {
-    return reinterpret_cast<rocksdb::Options*>(
-        env->GetLongField(jobj, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::Options pointer to the java side.
-  static void setHandle(JNIEnv* env, jobject jobj, rocksdb::Options* op) {
-    env->SetLongField(
-        jobj, getHandleFieldID(env),
-        reinterpret_cast<jlong>(op));
-  }
-};
-
-class WriteOptionsJni {
- public:
-  // Get the java class id of org.rocksdb.WriteOptions.
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/WriteOptions");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the field id of the member variable of org.rocksdb.WriteOptions
-  // that stores the pointer to rocksdb::WriteOptions
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::WriteOptions
-  static rocksdb::WriteOptions* getHandle(JNIEnv* env, jobject jobj) {
-    return reinterpret_cast<rocksdb::WriteOptions*>(
-        env->GetLongField(jobj, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::WriteOptions pointer to the java side.
-  static void setHandle(JNIEnv* env, jobject jobj, rocksdb::WriteOptions* op) {
-    env->SetLongField(
-        jobj, getHandleFieldID(env),
-        reinterpret_cast<jlong>(op));
-  }
-};
-
-
-class ReadOptionsJni {
- public:
-  // Get the java class id of org.rocksdb.ReadOptions.
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/ReadOptions");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the field id of the member variable of org.rocksdb.ReadOptions
-  // that stores the pointer to rocksdb::ReadOptions
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::ReadOptions
-  static rocksdb::ReadOptions* getHandle(JNIEnv* env, jobject jobj) {
-    return reinterpret_cast<rocksdb::ReadOptions*>(
-        env->GetLongField(jobj, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::ReadOptions pointer to the java side.
-  static void setHandle(JNIEnv* env, jobject jobj,
-                        rocksdb::ReadOptions* op) {
-    env->SetLongField(
-        jobj, getHandleFieldID(env),
-        reinterpret_cast<jlong>(op));
-  }
-};
-
-
-class WriteBatchJni {
- public:
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/WriteBatch");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::WriteBatch of the specified
-  // org.rocksdb.WriteBatch.
-  static rocksdb::WriteBatch* getHandle(JNIEnv* env, jobject jwb) {
-    return reinterpret_cast<rocksdb::WriteBatch*>(
-        env->GetLongField(jwb, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::WriteBatch pointer to the java side.
-  static void setHandle(JNIEnv* env, jobject jwb, rocksdb::WriteBatch* wb) {
-    env->SetLongField(
-        jwb, getHandleFieldID(env),
-        reinterpret_cast<jlong>(wb));
-  }
-};
-
-class HistogramDataJni {
- public:
-  static jmethodID getConstructorMethodId(JNIEnv* env, jclass jclazz) {
-    static jmethodID mid = env->GetMethodID(
-        jclazz, "<init>", "(DDDDD)V");
-    assert(mid != nullptr);
-    return mid;
-  }
-};
-class BackupableDBOptionsJni {
- public:
-  // Get the java class id of org.rocksdb.BackupableDBOptions.
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/BackupableDBOptions");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the field id of the member variable of org.rocksdb.BackupableDBOptions
-  // that stores the pointer to rocksdb::BackupableDBOptions
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::BackupableDBOptions
-  static rocksdb::BackupableDBOptions* getHandle(JNIEnv* env, jobject jobj) {
-    return reinterpret_cast<rocksdb::BackupableDBOptions*>(
-        env->GetLongField(jobj, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::BackupableDBOptions pointer to the java side.
-  static void setHandle(
-      JNIEnv* env, jobject jobj, rocksdb::BackupableDBOptions* op) {
-    env->SetLongField(
-        jobj, getHandleFieldID(env),
-        reinterpret_cast<jlong>(op));
-  }
-};
-
-class IteratorJni {
- public:
-  // Get the java class id of org.rocksdb.Iteartor.
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/Iterator");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the field id of the member variable of org.rocksdb.Iterator
-  // that stores the pointer to rocksdb::Iterator.
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::Iterator.
-  static rocksdb::Iterator* getHandle(JNIEnv* env, jobject jobj) {
-    return reinterpret_cast<rocksdb::Iterator*>(
-        env->GetLongField(jobj, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::Iterator pointer to the java side.
-  static void setHandle(
-      JNIEnv* env, jobject jobj, rocksdb::Iterator* op) {
-    env->SetLongField(
-        jobj, getHandleFieldID(env),
-        reinterpret_cast<jlong>(op));
-  }
-};
-
-class FilterJni {
- public:
-  // Get the java class id of org.rocksdb.FilterPolicy.
-  static jclass getJClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("org/rocksdb/Filter");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the field id of the member variable of org.rocksdb.Filter
-  // that stores the pointer to rocksdb::FilterPolicy.
-  static jfieldID getHandleFieldID(JNIEnv* env) {
-    static jfieldID fid = env->GetFieldID(
-        getJClass(env), "nativeHandle_", "J");
-    assert(fid != nullptr);
-    return fid;
-  }
-
-  // Get the pointer to rocksdb::FilterPolicy.
-  static rocksdb::FilterPolicy* getHandle(JNIEnv* env, jobject jobj) {
-    return reinterpret_cast<rocksdb::FilterPolicy*>(
-        env->GetLongField(jobj, getHandleFieldID(env)));
-  }
-
-  // Pass the rocksdb::FilterPolicy pointer to the java side.
-  static void setHandle(
-      JNIEnv* env, jobject jobj, const rocksdb::FilterPolicy* op) {
-    env->SetLongField(
-        jobj, getHandleFieldID(env),
-        reinterpret_cast<jlong>(op));
-  }
-};
-
-class ListJni {
- public:
-  // Get the java class id of java.util.List.
-  static jclass getListClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("java/util/List");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the java class id of java.util.ArrayList.
-  static jclass getArrayListClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("java/util/ArrayList");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the java class id of java.util.Iterator.
-  static jclass getIteratorClass(JNIEnv* env) {
-    static jclass jclazz = env->FindClass("java/util/Iterator");
-    assert(jclazz != nullptr);
-    return jclazz;
-  }
-
-  // Get the java method id of java.util.List.iterator().
-  static jmethodID getIteratorMethod(JNIEnv* env) {
-    static jmethodID mid = env->GetMethodID(
-        getListClass(env), "iterator", "()Ljava/util/Iterator;");
-    assert(mid != nullptr);
-    return mid;
-  }
-
-  // Get the java method id of java.util.Iterator.hasNext().
-  static jmethodID getHasNextMethod(JNIEnv* env) {
-    static jmethodID mid = env->GetMethodID(
-        getIteratorClass(env), "hasNext", "()Z");
-    assert(mid != nullptr);
-    return mid;
-  }
-
-  // Get the java method id of java.util.Iterator.next().
-  static jmethodID getNextMethod(JNIEnv* env) {
-    static jmethodID mid = env->GetMethodID(
-        getIteratorClass(env), "next", "()Ljava/lang/Object;");
-    assert(mid != nullptr);
-    return mid;
-  }
-
-  // Get the java method id of arrayList constructor.
-  static jmethodID getArrayListConstructorMethodId(JNIEnv* env, jclass jclazz) {
-    static jmethodID mid = env->GetMethodID(
-        jclazz, "<init>", "(I)V");
-    assert(mid != nullptr);
-    return mid;
-  }
-
-  // Get the java method id of java.util.List.add().
-  static jmethodID getListAddMethodId(JNIEnv* env) {
-    static jmethodID mid = env->GetMethodID(
-        getListClass(env), "add", "(Ljava/lang/Object;)Z");
-    assert(mid != nullptr);
-    return mid;
-  }
-};
-}  // namespace rocksdb
-#endif  // JAVA_ROCKSJNI_PORTAL_H_
diff --git a/src/rocksdb/java/rocksjni/rocksjni.cc b/src/rocksdb/java/rocksjni/rocksjni.cc
deleted file mode 100644
index 4595f3f..0000000
--- a/src/rocksdb/java/rocksjni/rocksjni.cc
+++ /dev/null
@@ -1,438 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ and enables
-// calling c++ rocksdb::DB methods from Java side.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <jni.h>
-#include <string>
-#include <vector>
-
-#include "include/org_rocksdb_RocksDB.h"
-#include "rocksjni/portal.h"
-#include "rocksdb/db.h"
-#include "rocksdb/cache.h"
-
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::Open
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    open
- * Signature: (JLjava/lang/String;)V
- */
-void Java_org_rocksdb_RocksDB_open(
-    JNIEnv* env, jobject jdb, jlong jopt_handle,
-    jlong jcache_size, jstring jdb_path) {
-  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
-  if (jcache_size > 0) {
-    opt->no_block_cache = false;
-    opt->block_cache = rocksdb::NewLRUCache(jcache_size);
-  } else {
-    opt->no_block_cache = true;
-    opt->block_cache = nullptr;
-  }
-
-  rocksdb::DB* db = nullptr;
-  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
-  rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db);
-  env->ReleaseStringUTFChars(jdb_path, db_path);
-
-  if (s.ok()) {
-    rocksdb::RocksDBJni::setHandle(env, jdb, db);
-    return;
-  }
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::Put
-
-void rocksdb_put_helper(
-    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-
-  jbyte* key = env->GetByteArrayElements(jkey, 0);
-  jbyte* value = env->GetByteArrayElements(jvalue, 0);
-  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
-  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
-
-  rocksdb::Status s = db->Put(write_options, key_slice, value_slice);
-
-  // trigger java unref on key and value.
-  // by passing JNI_ABORT, it will simply release the reference without
-  // copying the result back to the java byte array.
-  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
-  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
-
-  if (s.ok()) {
-    return;
-  }
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    put
- * Signature: (J[BI[BI)V
- */
-void Java_org_rocksdb_RocksDB_put__J_3BI_3BI(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  static const rocksdb::WriteOptions default_write_options =
-      rocksdb::WriteOptions();
-
-  rocksdb_put_helper(env, db, default_write_options,
-                     jkey, jkey_len,
-                     jvalue, jvalue_len);
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    put
- * Signature: (JJ[BI[BI)V
- */
-void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI(
-    JNIEnv* env, jobject jdb,
-    jlong jdb_handle, jlong jwrite_options_handle,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(
-      jwrite_options_handle);
-
-  rocksdb_put_helper(env, db, *write_options,
-                     jkey, jkey_len,
-                     jvalue, jvalue_len);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::Write
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    write
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_RocksDB_write(
-    JNIEnv* env, jobject jdb,
-    jlong jwrite_options_handle, jlong jbatch_handle) {
-  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
-  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(
-      jwrite_options_handle);
-  auto batch = reinterpret_cast<rocksdb::WriteBatch*>(jbatch_handle);
-
-  rocksdb::Status s = db->Write(*write_options, batch);
-
-  if (!s.ok()) {
-    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::Get
-
-jbyteArray rocksdb_get_helper(
-    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt,
-    jbyteArray jkey, jint jkey_len) {
-  jboolean isCopy;
-  jbyte* key = env->GetByteArrayElements(jkey, &isCopy);
-  rocksdb::Slice key_slice(
-      reinterpret_cast<char*>(key), jkey_len);
-
-  std::string value;
-  rocksdb::Status s = db->Get(
-      read_opt, key_slice, &value);
-
-  // trigger java unref on key.
-  // by passing JNI_ABORT, it will simply release the reference without
-  // copying the result back to the java byte array.
-  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
-
-  if (s.IsNotFound()) {
-    return nullptr;
-  }
-
-  if (s.ok()) {
-    jbyteArray jvalue = env->NewByteArray(value.size());
-    env->SetByteArrayRegion(
-        jvalue, 0, value.size(),
-        reinterpret_cast<const jbyte*>(value.c_str()));
-    return jvalue;
-  }
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-
-  return nullptr;
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    get
- * Signature: (J[BI)[B
- */
-jbyteArray Java_org_rocksdb_RocksDB_get__J_3BI(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jbyteArray jkey, jint jkey_len) {
-  return rocksdb_get_helper(env,
-      reinterpret_cast<rocksdb::DB*>(jdb_handle),
-      rocksdb::ReadOptions(),
-      jkey, jkey_len);
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    get
- * Signature: (JJ[BI)[B
- */
-jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BI(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
-    jbyteArray jkey, jint jkey_len) {
-  return rocksdb_get_helper(env,
-      reinterpret_cast<rocksdb::DB*>(jdb_handle),
-      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle),
-      jkey, jkey_len);
-}
-
-jint rocksdb_get_helper(
-    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-  static const int kNotFound = -1;
-  static const int kStatusError = -2;
-
-  jbyte* key = env->GetByteArrayElements(jkey, 0);
-  rocksdb::Slice key_slice(
-      reinterpret_cast<char*>(key), jkey_len);
-
-  // TODO(yhchiang): we might save one memory allocation here by adding
-  // a DB::Get() function which takes preallocated jbyte* as input.
-  std::string cvalue;
-  rocksdb::Status s = db->Get(
-      read_options, key_slice, &cvalue);
-
-  // trigger java unref on key.
-  // by passing JNI_ABORT, it will simply release the reference without
-  // copying the result back to the java byte array.
-  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
-
-  if (s.IsNotFound()) {
-    return kNotFound;
-  } else if (!s.ok()) {
-    // Here since we are throwing a Java exception from c++ side.
-    // As a result, c++ does not know calling this function will in fact
-    // throwing an exception.  As a result, the execution flow will
-    // not stop here, and codes after this throw will still be
-    // executed.
-    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-
-    // Return a dummy const value to avoid compilation error, although
-    // java side might not have a chance to get the return value :)
-    return kStatusError;
-  }
-
-  int cvalue_len = static_cast<int>(cvalue.size());
-  int length = std::min(jvalue_len, cvalue_len);
-
-  env->SetByteArrayRegion(
-      jvalue, 0, length,
-      reinterpret_cast<const jbyte*>(cvalue.c_str()));
-  return cvalue_len;
-}
-
-jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db,
-    const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count) {
-  std::vector<rocksdb::Slice> keys;
-  std::vector<jbyte*> keys_to_free;
-
-  // get iterator
-  jobject iteratorObj = env->CallObjectMethod(
-      jkey_list, rocksdb::ListJni::getIteratorMethod(env));
-
-  // iterate over keys and convert java byte array to slice
-  while(env->CallBooleanMethod(
-      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
-    jbyteArray jkey = (jbyteArray) env->CallObjectMethod(
-       iteratorObj, rocksdb::ListJni::getNextMethod(env));
-    jint key_length = env->GetArrayLength(jkey);
-
-    jbyte* key = new jbyte[key_length];
-    env->GetByteArrayRegion(jkey, 0, key_length, key);
-    // store allocated jbyte to free it after multiGet call
-    keys_to_free.push_back(key);
-
-    rocksdb::Slice key_slice(
-      reinterpret_cast<char*>(key), key_length);
-    keys.push_back(key_slice);
-  }
-
-  std::vector<std::string> values;
-  std::vector<rocksdb::Status> s = db->MultiGet(rOpt, keys, &values);
-
-  // Don't reuse class pointer
-  jclass jclazz = env->FindClass("java/util/ArrayList");
-  jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(
-      env, jclazz);
-  jobject jvalue_list = env->NewObject(jclazz, mid, jkeys_count);
-
-  // insert in java list
-  for(std::vector<rocksdb::Status>::size_type i = 0; i != s.size(); i++) {
-    if(s[i].ok()) {
-      jbyteArray jvalue = env->NewByteArray(values[i].size());
-      env->SetByteArrayRegion(
-          jvalue, 0, values[i].size(),
-          reinterpret_cast<const jbyte*>(values[i].c_str()));
-      env->CallBooleanMethod(
-          jvalue_list, rocksdb::ListJni::getListAddMethodId(env), jvalue);
-    }
-    else {
-      env->CallBooleanMethod(
-          jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr);
-    }
-  }
-
-  // free up allocated byte arrays
-  for(std::vector<jbyte*>::size_type i = 0; i != keys_to_free.size(); i++) {
-    delete[] keys_to_free[i];
-  }
-  keys_to_free.clear();
-
-  return jvalue_list;
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    multiGet
- * Signature: (JLjava/util/List;I)Ljava/util/List;
- */
-jobject Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2I(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jobject jkey_list, jint jkeys_count) {
-  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
-      rocksdb::ReadOptions(), jkey_list, jkeys_count);
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    multiGet
- * Signature: (JJLjava/util/List;I)Ljava/util/List;
- */
-jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jlong jropt_handle, jobject jkey_list, jint jkeys_count) {
-  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
-      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle), jkey_list,
-      jkeys_count);
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    get
- * Signature: (J[BI[BI)I
- */
-jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-  return rocksdb_get_helper(env,
-      reinterpret_cast<rocksdb::DB*>(jdb_handle),
-      rocksdb::ReadOptions(),
-      jkey, jkey_len, jvalue, jvalue_len);
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    get
- * Signature: (JJ[BI[BI)I
- */
-jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-  return rocksdb_get_helper(env,
-      reinterpret_cast<rocksdb::DB*>(jdb_handle),
-      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle),
-      jkey, jkey_len, jvalue, jvalue_len);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::Delete()
-void rocksdb_remove_helper(
-    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
-    jbyteArray jkey, jint jkey_len) {
-  jbyte* key = env->GetByteArrayElements(jkey, 0);
-  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
-
-  rocksdb::Status s = db->Delete(write_options, key_slice);
-
-  // trigger java unref on key and value.
-  // by passing JNI_ABORT, it will simply release the reference without
-  // copying the result back to the java byte array.
-  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
-
-  if (!s.ok()) {
-    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-  }
-  return;
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    remove
- * Signature: (J[BI)V
- */
-void Java_org_rocksdb_RocksDB_remove__J_3BI(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jbyteArray jkey, jint jkey_len) {
-  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  static const rocksdb::WriteOptions default_write_options =
-      rocksdb::WriteOptions();
-
-  rocksdb_remove_helper(env, db, default_write_options, jkey, jkey_len);
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    remove
- * Signature: (JJ[BI)V
- */
-void Java_org_rocksdb_RocksDB_remove__JJ_3BI(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jlong jwrite_options, jbyteArray jkey, jint jkey_len) {
-  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options);
-
-  rocksdb_remove_helper(env, db, *write_options, jkey, jkey_len);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::~DB()
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    dispose
- * Signature: (J)V
- */
-void Java_org_rocksdb_RocksDB_dispose(
-    JNIEnv* env, jobject java_db, jlong jhandle) {
-  auto db = reinterpret_cast<rocksdb::DB*>(jhandle);
-  assert(db != nullptr);
-  delete db;
-}
-
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    iterator0
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_RocksDB_iterator0(
-    JNIEnv* env, jobject jdb, jlong db_handle) {
-  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions());
-  return reinterpret_cast<jlong>(iterator);
-}
diff --git a/src/rocksdb/java/rocksjni/statistics.cc b/src/rocksdb/java/rocksjni/statistics.cc
deleted file mode 100644
index bf170c6..0000000
--- a/src/rocksdb/java/rocksjni/statistics.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ and enables
-// calling c++ rocksdb::Statistics methods from Java side.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <jni.h>
-
-#include "include/org_rocksdb_Statistics.h"
-#include "rocksjni/portal.h"
-#include "rocksdb/statistics.h"
-
-/*
- * Class:     org_rocksdb_Statistics
- * Method:    getTickerCount0
- * Signature: (IJ)J
- */
-jlong Java_org_rocksdb_Statistics_getTickerCount0(
-    JNIEnv* env, jobject jobj, int tickerType, jlong handle) {
-  auto st = reinterpret_cast<rocksdb::Statistics*>(handle);
-  assert(st != nullptr);
-
-  return st->getTickerCount(static_cast<rocksdb::Tickers>(tickerType));
-}
-
-/*
- * Class:     org_rocksdb_Statistics
- * Method:    geHistogramData0
- * Signature: (IJ)Lorg/rocksdb/HistogramData;
- */
-jobject Java_org_rocksdb_Statistics_geHistogramData0(
-  JNIEnv* env, jobject jobj, int histogramType, jlong handle) {
-  auto st = reinterpret_cast<rocksdb::Statistics*>(handle);
-  assert(st != nullptr);
-
-  rocksdb::HistogramData data;
-  st->histogramData(static_cast<rocksdb::Histograms>(histogramType),
-    &data);
-
-  // Don't reuse class pointer
-  jclass jclazz = env->FindClass("org/rocksdb/HistogramData");
-  jmethodID mid = rocksdb::HistogramDataJni::getConstructorMethodId(
-      env, jclazz);
-  return env->NewObject(jclazz, mid, data.median, data.percentile95,
-      data.percentile99, data.average, data.standard_deviation);
-}
diff --git a/src/rocksdb/java/rocksjni/table.cc b/src/rocksdb/java/rocksjni/table.cc
deleted file mode 100644
index c21501b..0000000
--- a/src/rocksdb/java/rocksjni/table.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ for rocksdb::Options.
-
-#include <jni.h>
-#include "include/org_rocksdb_PlainTableConfig.h"
-#include "rocksdb/table.h"
-
-/*
- * Class:     org_rocksdb_PlainTableConfig
- * Method:    newTableFactoryHandle
- * Signature: (IIDI)J
- */
-jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
-    JNIEnv* env, jobject jobj, jint jkey_size, jint jbloom_bits_per_key,
-    jdouble jhash_table_ratio, jint jindex_sparseness) {
-  return reinterpret_cast<jlong>(rocksdb::NewPlainTableFactory(
-          static_cast<uint32_t>(jkey_size),
-          static_cast<int>(jbloom_bits_per_key),
-          static_cast<double>(jhash_table_ratio),
-          static_cast<size_t>(jindex_sparseness)));
-}
diff --git a/src/rocksdb/java/rocksjni/write_batch.cc b/src/rocksdb/java/rocksjni/write_batch.cc
deleted file mode 100644
index 035b35f..0000000
--- a/src/rocksdb/java/rocksjni/write_batch.cc
+++ /dev/null
@@ -1,264 +0,0 @@
-// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
-// This file implements the "bridge" between Java and C++ and enables
-// calling c++ rocksdb::WriteBatch methods from Java side.
-#include <memory>
-
-#include "include/org_rocksdb_WriteBatch.h"
-#include "include/org_rocksdb_WriteBatchInternal.h"
-#include "include/org_rocksdb_WriteBatchTest.h"
-#include "rocksjni/portal.h"
-#include "rocksdb/db.h"
-#include "db/memtable.h"
-#include "rocksdb/write_batch.h"
-#include "db/write_batch_internal.h"
-#include "rocksdb/env.h"
-#include "rocksdb/memtablerep.h"
-#include "util/logging.h"
-#include "util/testharness.h"
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    newWriteBatch
- * Signature: (I)V
- */
-void Java_org_rocksdb_WriteBatch_newWriteBatch(
-    JNIEnv* env, jobject jobj, jint jreserved_bytes) {
-  rocksdb::WriteBatch* wb = new rocksdb::WriteBatch(
-      static_cast<size_t>(jreserved_bytes));
-
-  rocksdb::WriteBatchJni::setHandle(env, jobj, wb);
-}
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    count
- * Signature: ()I
- */
-jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  return static_cast<jint>(wb->Count());
-}
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    clear
- * Signature: ()V
- */
-void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  wb->Clear();
-}
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    put
- * Signature: ([BI[BI)V
- */
-void Java_org_rocksdb_WriteBatch_put(
-    JNIEnv* env, jobject jobj,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
-  jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
-  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
-  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
-  wb->Put(key_slice, value_slice);
-  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
-  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
-}
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    merge
- * Signature: ([BI[BI)V
- */
-JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge(
-    JNIEnv* env, jobject jobj,
-    jbyteArray jkey, jint jkey_len,
-    jbyteArray jvalue, jint jvalue_len) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
-  jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
-  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
-  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
-  wb->Merge(key_slice, value_slice);
-  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
-  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
-}
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    remove
- * Signature: ([BI)V
- */
-JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_remove(
-    JNIEnv* env, jobject jobj,
-    jbyteArray jkey, jint jkey_len) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
-  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
-  wb->Delete(key_slice);
-  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
-}
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    putLogData
- * Signature: ([BI)V
- */
-void Java_org_rocksdb_WriteBatch_putLogData(
-    JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  jbyte* blob = env->GetByteArrayElements(jblob, nullptr);
-  rocksdb::Slice blob_slice(reinterpret_cast<char*>(blob), jblob_len);
-  wb->PutLogData(blob_slice);
-  env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT);
-}
-
-/*
- * Class:     org_rocksdb_WriteBatch
- * Method:    dispose0
- * Signature: ()V
- */
-void Java_org_rocksdb_WriteBatch_dispose0(JNIEnv* env, jobject jobj) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-  delete wb;
-
-  rocksdb::WriteBatchJni::setHandle(env, jobj, nullptr);
-}
-
-/*
- * Class:     org_rocksdb_WriteBatchInternal
- * Method:    setSequence
- * Signature: (Lorg/rocksdb/WriteBatch;J)V
- */
-void Java_org_rocksdb_WriteBatchInternal_setSequence(
-    JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  rocksdb::WriteBatchInternal::SetSequence(
-      wb, static_cast<rocksdb::SequenceNumber>(jsn));
-}
-
-/*
- * Class:     org_rocksdb_WriteBatchInternal
- * Method:    sequence
- * Signature: (Lorg/rocksdb/WriteBatch;)J
- */
-jlong Java_org_rocksdb_WriteBatchInternal_sequence(
-    JNIEnv* env, jclass jclazz, jobject jobj) {
-  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(wb != nullptr);
-
-  return static_cast<jlong>(rocksdb::WriteBatchInternal::Sequence(wb));
-}
-
-/*
- * Class:     org_rocksdb_WriteBatchInternal
- * Method:    append
- * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V
- */
-void Java_org_rocksdb_WriteBatchInternal_append(
-    JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) {
-  rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1);
-  assert(wb1 != nullptr);
-  rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2);
-  assert(wb2 != nullptr);
-
-  rocksdb::WriteBatchInternal::Append(wb1, wb2);
-}
-
-/*
- * Class:     org_rocksdb_WriteBatchTest
- * Method:    getContents
- * Signature: (Lorg/rocksdb/WriteBatch;)[B
- */
-jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
-    JNIEnv* env, jclass jclazz, jobject jobj) {
-  rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj);
-  assert(b != nullptr);
-
-  // todo: Currently the following code is directly copied from
-  // db/write_bench_test.cc.  It could be implemented in java once
-  // all the necessary components can be accessed via jni api.
-
-  rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator());
-  auto factory = std::make_shared<rocksdb::SkipListFactory>();
-  rocksdb::Options options;
-  options.memtable_factory = factory;
-  rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options);
-  mem->Ref();
-  std::string state;
-  rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
-  rocksdb::Status s =
-      rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
-  int count = 0;
-  rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions());
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    rocksdb::ParsedInternalKey ikey;
-    memset(reinterpret_cast<void*>(&ikey), 0, sizeof(ikey));
-    ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey));
-    switch (ikey.type) {
-      case rocksdb::kTypeValue:
-        state.append("Put(");
-        state.append(ikey.user_key.ToString());
-        state.append(", ");
-        state.append(iter->value().ToString());
-        state.append(")");
-        count++;
-        break;
-      case rocksdb::kTypeMerge:
-        state.append("Merge(");
-        state.append(ikey.user_key.ToString());
-        state.append(", ");
-        state.append(iter->value().ToString());
-        state.append(")");
-        count++;
-        break;
-      case rocksdb::kTypeDeletion:
-        state.append("Delete(");
-        state.append(ikey.user_key.ToString());
-        state.append(")");
-        count++;
-        break;
-      default:
-        assert(false);
-        break;
-    }
-    state.append("@");
-    state.append(rocksdb::NumberToString(ikey.sequence));
-  }
-  delete iter;
-  if (!s.ok()) {
-    state.append(s.ToString());
-  } else if (count != rocksdb::WriteBatchInternal::Count(b)) {
-    state.append("CountMismatch()");
-  }
-  delete mem->Unref();
-
-  jbyteArray jstate = env->NewByteArray(state.size());
-  env->SetByteArrayRegion(
-      jstate, 0, state.size(),
-      reinterpret_cast<const jbyte*>(state.c_str()));
-
-  return jstate;
-}
diff --git a/src/rocksdb/linters/__phutil_library_init__.php b/src/rocksdb/linters/__phutil_library_init__.php
deleted file mode 100644
index 4b8d3d1..0000000
--- a/src/rocksdb/linters/__phutil_library_init__.php
+++ /dev/null
@@ -1,3 +0,0 @@
-<?php
-
-phutil_register_library('linters', __FILE__);
diff --git a/src/rocksdb/linters/__phutil_library_map__.php b/src/rocksdb/linters/__phutil_library_map__.php
deleted file mode 100644
index 7808dc1..0000000
--- a/src/rocksdb/linters/__phutil_library_map__.php
+++ /dev/null
@@ -1,27 +0,0 @@
-<?php
-
-/**
- * This file is automatically generated. Use 'arc liberate' to rebuild it.
- * @generated
- * @phutil-library-version 2
- */
-
-phutil_register_library_map(array(
-  '__library_version__' => 2,
-  'class' =>
-  array(
-    'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php',
-    'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php',
-    'PfffCppLinter' => 'cpp_linter/PfffCppLinter.php',
-    'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php',
-  ),
-  'function' =>
-  array(
-  ),
-  'xmap' =>
-  array(
-    'FacebookFbcodeLintEngine' => 'ArcanistLintEngine',
-    'FbcodeCppLinter' => 'ArcanistLinter',
-    'PfffCppLinter' => 'ArcanistLinter',
-  ),
-));
diff --git a/src/rocksdb/linters/cpp_linter/ArcanistCpplintLinter.php b/src/rocksdb/linters/cpp_linter/ArcanistCpplintLinter.php
deleted file mode 100644
index b9c4137..0000000
--- a/src/rocksdb/linters/cpp_linter/ArcanistCpplintLinter.php
+++ /dev/null
@@ -1,88 +0,0 @@
-<?php
-
-/**
- * Uses google's cpplint.py to check code. RocksDB team forked this file from
- * phabricator's /src/lint/linter/ArcanistCpplintLinter.php, and customized it
- * for its own use.
- *
- * You can get it here:
- * http://google-styleguide.googlecode.com/svn/trunk/cpplint/cpplint.py
- * @group linter
- */
-final class ArcanistCpplintLinter extends ArcanistLinter {
-
-  public function willLintPaths(array $paths) {
-    return;
-  }
-
-  public function getLinterName() {
-    return 'cpplint.py';
-  }
-
-  public function getLintPath() {
-    $bin = 'cpplint.py';
-    // Search under current dir
-    list($err) = exec_manual('which %s/%s', $this->linterDir(), $bin);
-    if (!$err) {
-      return $this->linterDir().'/'.$bin;
-    }
-
-    // Look for globally installed cpplint.py
-    list($err) = exec_manual('which %s', $bin);
-    if ($err) {
-      throw new ArcanistUsageException(
-        "cpplint.py does not appear to be installed on this system. Install ".
-        "it (e.g., with 'wget \"http://google-styleguide.googlecode.com/".
-        "svn/trunk/cpplint/cpplint.py\"') ".
-        "in your .arcconfig to point to the directory where it resides. ".
-        "Also don't forget to chmod a+x cpplint.py!");
-    }
-
-    return $bin;
-  }
-
-  public function lintPath($path) {
-    $bin = $this->getLintPath();
-    $path = $this->rocksdbDir().'/'.$path;
-
-    $f = new ExecFuture("%C $path", $bin);
-
-    list($err, $stdout, $stderr) = $f->resolve();
-
-    if ($err === 2) {
-      throw new Exception("cpplint failed to run correctly:\n".$stderr);
-    }
-
-    $lines = explode("\n", $stderr);
-    $messages = array();
-    foreach ($lines as $line) {
-      $line = trim($line);
-      $matches = null;
-      $regex = '/^[^:]+:(\d+):\s*(.*)\s*\[(.*)\] \[(\d+)\]$/';
-      if (!preg_match($regex, $line, $matches)) {
-        continue;
-      }
-      foreach ($matches as $key => $match) {
-        $matches[$key] = trim($match);
-      }
-      $message = new ArcanistLintMessage();
-      $message->setPath($path);
-      $message->setLine($matches[1]);
-      $message->setCode($matches[3]);
-      $message->setName($matches[3]);
-      $message->setDescription($matches[2]);
-      $message->setSeverity(ArcanistLintSeverity::SEVERITY_WARNING);
-      $this->addLintMessage($message);
-    }
-  }
-
-  // The path of this linter
-  private function linterDir() {
-    return dirname(__FILE__);
-  }
-
-  // TODO(kaili) a quick and dirty way to figure out rocksdb's root dir.
-  private function rocksdbDir() {
-    return $this->linterDir()."/../..";
-  }
-}
diff --git a/src/rocksdb/linters/cpp_linter/FbcodeCppLinter.php b/src/rocksdb/linters/cpp_linter/FbcodeCppLinter.php
deleted file mode 100644
index e62d3bb..0000000
--- a/src/rocksdb/linters/cpp_linter/FbcodeCppLinter.php
+++ /dev/null
@@ -1,99 +0,0 @@
-<?php
-
-class FbcodeCppLinter extends ArcanistLinter {
-  const CPPLINT      = "/home/engshare/tools/cpplint";
-  const LINT_ERROR   = 1;
-  const LINT_WARNING = 2;
-  const C_FLAG = "--c_mode=true";
-  private $rawLintOutput = array();
-
-  public function willLintPaths(array $paths) {
-    $futures = array();
-    $ret_value = 0;
-    $last_line = system("which cpplint", $ret_value);
-    $CPP_LINT = false;
-    if ($ret_value == 0) {
-      $CPP_LINT = $last_line;
-    } else if (file_exists(self::CPPLINT)) {
-      $CPP_LINT = self::CPPLINT;
-    }
-
-    if ($CPP_LINT) {
-      foreach ($paths as $p) {
-        $lpath = $this->getEngine()->getFilePathOnDisk($p);
-        $lpath_file = file($lpath);
-        if (preg_match('/\.(c)$/', $lpath) ||
-            preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) ||
-            preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0])
-            ) {
-          $futures[$p] = new ExecFuture("%s %s %s 2>&1",
-                             $CPP_LINT, self::C_FLAG,
-                             $this->getEngine()->getFilePathOnDisk($p));
-        } else {
-          $futures[$p] = new ExecFuture("%s %s 2>&1",
-            self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p));
-        }
-      }
-
-      foreach (Futures($futures)->limit(8) as $p => $f) {
-        $this->rawLintOutput[$p] = $f->resolvex();
-      }
-    }
-    return;
-  }
-
-  public function getLinterName() {
-    return "FBCPP";
-  }
-
-  public function lintPath($path) {
-    $msgs = $this->getCppLintOutput($path);
-    foreach ($msgs as $m) {
-      $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']);
-    }
-  }
-
-  public function getLintSeverityMap() {
-    return array(
-      self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING,
-      self::LINT_ERROR   => ArcanistLintSeverity::SEVERITY_ERROR
-    );
-  }
-
-  public function getLintNameMap() {
-    return array(
-      self::LINT_WARNING => "CppLint Warning",
-      self::LINT_ERROR   => "CppLint Error"
-    );
-  }
-
-  private function getCppLintOutput($path) {
-    list($output) = $this->rawLintOutput[$path];
-
-    $msgs = array();
-    $current = null;
-    foreach (explode("\n", $output) as $line) {
-      if (preg_match('/[^:]*\((\d+)\):(.*)$/', $line, $matches)) {
-        if ($current) {
-          $msgs[] = $current;
-        }
-        $line = $matches[1];
-        $text = $matches[2];
-        $sev  = preg_match('/.*Warning.*/', $text)
-                  ? self::LINT_WARNING
-                  : self::LINT_ERROR;
-        $current = array('line'     => $line,
-                         'msg'      => $text,
-                         'severity' => $sev);
-      } else if ($current) {
-        $current['msg'] .= ' ' . $line;
-      }
-    }
-    if ($current) {
-      $msgs[] = $current;
-    }
-
-    return $msgs;
-  }
-}
-
diff --git a/src/rocksdb/linters/cpp_linter/PfffCppLinter.php b/src/rocksdb/linters/cpp_linter/PfffCppLinter.php
deleted file mode 100644
index 6736614..0000000
--- a/src/rocksdb/linters/cpp_linter/PfffCppLinter.php
+++ /dev/null
@@ -1,68 +0,0 @@
-<?php
-// Copyright 2004-present Facebook.  All rights reserved.
-
-class PfffCppLinter extends ArcanistLinter {
-  const PROGRAM      = "/home/engshare/tools/checkCpp";
-
-  public function getLinterName() {
-    return "checkCpp";
-  }
-  public function getLintNameMap() {
-    return array(
-    );
-  }
-
-  public function getLintSeverityMap() {
-    return array(
-    );
-  }
-
-  public function willLintPaths(array $paths) {
-    $program = false;
-    $ret_value = 0;
-    $last_line = system("which checkCpp", $ret_value);
-    if ($ret_value == 0) {
-      $program = $last_line;
-    } else if (file_exists(self::PROGRAM)) {
-      $program = self::PROGRAM;
-    }
-    if ($program) {
-      $futures = array();
-      foreach ($paths as $p) {
-        $futures[$p] = new ExecFuture("%s --lint %s 2>&1",
-          $program, $this->getEngine()->getFilePathOnDisk($p));
-      }
-      foreach (Futures($futures)->limit(8) as $p => $f) {
-
-        list($stdout, $stderr) = $f->resolvex();
-        $raw = json_decode($stdout, true);
-        if (!is_array($raw)) {
-          throw new Exception(
-            "checkCpp returned invalid JSON!".
-            "Stdout: {$stdout} Stderr: {$stderr}"
-          );
-        }
-        foreach($raw as $err) {
-          $this->addLintMessage(
-            ArcanistLintMessage::newFromDictionary(
-              array(
-                'path' => $err['file'],
-                'line' => $err['line'],
-                'char' => 0,
-                'name' => $err['name'],
-                'description' => $err['info'],
-                'code' => $this->getLinterName(),
-                'severity' => ArcanistLintSeverity::SEVERITY_WARNING,
-              )
-            )
-          );
-        }
-      }
-    }
-    return;
-  }
-
-  public function lintPath($path) {
-    return;
-  }
-}
diff --git a/src/rocksdb/linters/cpp_linter/cpplint.py b/src/rocksdb/linters/cpp_linter/cpplint.py
deleted file mode 100755
index d264b00..0000000
--- a/src/rocksdb/linters/cpp_linter/cpplint.py
+++ /dev/null
@@ -1,4767 +0,0 @@
-#!/usr/bin/python
-# Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree. An additional grant
-# of patent rights can be found in the PATENTS file in the same directory.
-# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file. See the AUTHORS file for names of contributors.
-#
-# Copyright (c) 2009 Google Inc. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#    * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#    * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#    * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-"""Does google-lint on c++ files.
-
-The goal of this script is to identify places in the code that *may*
-be in non-compliance with google style.  It does not attempt to fix
-up these problems -- the point is to educate.  It does also not
-attempt to find all problems, or to ensure that everything it does
-find is legitimately a problem.
-
-In particular, we can get very confused by /* and // inside strings!
-We do a small hack, which is to ignore //'s with "'s after them on the
-same line, but it is far from perfect (in either direction).
-"""
-
-import codecs
-import copy
-import getopt
-import math  # for log
-import os
-import re
-import sre_compile
-import string
-import sys
-import unicodedata
-
-
-_USAGE = """
-Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
-                   [--counting=total|toplevel|detailed] [--root=subdir]
-                   [--linelength=digits]
-        <file> [file] ...
-
-  The style guidelines this tries to follow are those in
-    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
-
-  Every problem is given a confidence score from 1-5, with 5 meaning we are
-  certain of the problem, and 1 meaning it could be a legitimate construct.
-  This will miss some errors, and is not a substitute for a code review.
-
-  To suppress false-positive errors of a certain category, add a
-  'NOLINT(category)' comment to the line.  NOLINT or NOLINT(*)
-  suppresses errors of all categories on that line.
-
-  The files passed in will be linted; at least one file must be provided.
-  Default linted extensions are .cc, .cpp, .cu, .cuh and .h.  Change the
-  extensions with the --extensions flag.
-
-  Flags:
-
-    output=vs7
-      By default, the output is formatted to ease emacs parsing.  Visual Studio
-      compatible output (vs7) may also be used.  Other formats are unsupported.
-
-    verbose=#
-      Specify a number 0-5 to restrict errors to certain verbosity levels.
-
-    filter=-x,+y,...
-      Specify a comma-separated list of category-filters to apply: only
-      error messages whose category names pass the filters will be printed.
-      (Category names are printed with the message and look like
-      "[whitespace/indent]".)  Filters are evaluated left to right.
-      "-FOO" and "FOO" means "do not print categories that start with FOO".
-      "+FOO" means "do print categories that start with FOO".
-
-      Examples: --filter=-whitespace,+whitespace/braces
-                --filter=whitespace,runtime/printf,+runtime/printf_format
-                --filter=-,+build/include_what_you_use
-
-      To see a list of all the categories used in cpplint, pass no arg:
-         --filter=
-
-    counting=total|toplevel|detailed
-      The total number of errors found is always printed. If
-      'toplevel' is provided, then the count of errors in each of
-      the top-level categories like 'build' and 'whitespace' will
-      also be printed. If 'detailed' is provided, then a count
-      is provided for each category like 'build/class'.
-
-    root=subdir
-      The root directory used for deriving header guard CPP variable.
-      By default, the header guard CPP variable is calculated as the relative
-      path to the directory that contains .git, .hg, or .svn.  When this flag
-      is specified, the relative path is calculated from the specified
-      directory. If the specified directory does not exist, this flag is
-      ignored.
-
-      Examples:
-        Assuing that src/.git exists, the header guard CPP variables for
-        src/chrome/browser/ui/browser.h are:
-
-        No flag => CHROME_BROWSER_UI_BROWSER_H_
-        --root=chrome => BROWSER_UI_BROWSER_H_
-        --root=chrome/browser => UI_BROWSER_H_
-
-    linelength=digits
-      This is the allowed line length for the project. The default value is
-      80 characters.
-
-      Examples:
-        --linelength=120
-
-    extensions=extension,extension,...
-      The allowed file extensions that cpplint will check
-
-      Examples:
-        --extensions=hpp,cpp
-"""
-
-# We categorize each error message we print.  Here are the categories.
-# We want an explicit list so we can list them all in cpplint --filter=.
-# If you add a new error message with a new category, add it to the list
-# here!  cpplint_unittest.py should tell you if you forget to do this.
-_ERROR_CATEGORIES = [
-  'build/class',
-  'build/deprecated',
-  'build/endif_comment',
-  'build/explicit_make_pair',
-  'build/forward_decl',
-  'build/header_guard',
-  'build/include',
-  'build/include_alpha',
-  'build/include_order',
-  'build/include_what_you_use',
-  'build/namespaces',
-  'build/printf_format',
-  'build/storage_class',
-  'legal/copyright',
-  'readability/alt_tokens',
-  'readability/braces',
-  'readability/casting',
-  'readability/check',
-  'readability/constructors',
-  'readability/fn_size',
-  'readability/function',
-  'readability/multiline_comment',
-  'readability/multiline_string',
-  'readability/namespace',
-  'readability/nolint',
-  'readability/nul',
-  'readability/streams',
-  'readability/todo',
-  'readability/utf8',
-  'runtime/arrays',
-  'runtime/casting',
-  'runtime/explicit',
-  'runtime/int',
-  'runtime/init',
-  'runtime/invalid_increment',
-  'runtime/member_string_references',
-  'runtime/memset',
-  'runtime/operator',
-  'runtime/printf',
-  'runtime/printf_format',
-  'runtime/references',
-  'runtime/string',
-  'runtime/threadsafe_fn',
-  'runtime/vlog',
-  'whitespace/blank_line',
-  'whitespace/braces',
-  'whitespace/comma',
-  'whitespace/comments',
-  'whitespace/empty_conditional_body',
-  'whitespace/empty_loop_body',
-  'whitespace/end_of_line',
-  'whitespace/ending_newline',
-  'whitespace/forcolon',
-  'whitespace/indent',
-  'whitespace/line_length',
-  'whitespace/newline',
-  'whitespace/operators',
-  'whitespace/parens',
-  'whitespace/semicolon',
-  'whitespace/tab',
-  'whitespace/todo'
-  ]
-
-# The default state of the category filter. This is overrided by the --filter=
-# flag. By default all errors are on, so only add here categories that should be
-# off by default (i.e., categories that must be enabled by the --filter= flags).
-# All entries here should start with a '-' or '+', as in the --filter= flag.
-_DEFAULT_FILTERS = ['-build/include_alpha']
-
-# We used to check for high-bit characters, but after much discussion we
-# decided those were OK, as long as they were in UTF-8 and didn't represent
-# hard-coded international strings, which belong in a separate i18n file.
-
-
-# C++ headers
-_CPP_HEADERS = frozenset([
-    # Legacy
-    'algobase.h',
-    'algo.h',
-    'alloc.h',
-    'builtinbuf.h',
-    'bvector.h',
-    'complex.h',
-    'defalloc.h',
-    'deque.h',
-    'editbuf.h',
-    'fstream.h',
-    'function.h',
-    'hash_map',
-    'hash_map.h',
-    'hash_set',
-    'hash_set.h',
-    'hashtable.h',
-    'heap.h',
-    'indstream.h',
-    'iomanip.h',
-    'iostream.h',
-    'istream.h',
-    'iterator.h',
-    'list.h',
-    'map.h',
-    'multimap.h',
-    'multiset.h',
-    'ostream.h',
-    'pair.h',
-    'parsestream.h',
-    'pfstream.h',
-    'procbuf.h',
-    'pthread_alloc',
-    'pthread_alloc.h',
-    'rope',
-    'rope.h',
-    'ropeimpl.h',
-    'set.h',
-    'slist',
-    'slist.h',
-    'stack.h',
-    'stdiostream.h',
-    'stl_alloc.h',
-    'stl_relops.h',
-    'streambuf.h',
-    'stream.h',
-    'strfile.h',
-    'strstream.h',
-    'tempbuf.h',
-    'tree.h',
-    'type_traits.h',
-    'vector.h',
-    # 17.6.1.2 C++ library headers
-    'algorithm',
-    'array',
-    'atomic',
-    'bitset',
-    'chrono',
-    'codecvt',
-    'complex',
-    'condition_variable',
-    'deque',
-    'exception',
-    'forward_list',
-    'fstream',
-    'functional',
-    'future',
-    'initializer_list',
-    'iomanip',
-    'ios',
-    'iosfwd',
-    'iostream',
-    'istream',
-    'iterator',
-    'limits',
-    'list',
-    'locale',
-    'map',
-    'memory',
-    'mutex',
-    'new',
-    'numeric',
-    'ostream',
-    'queue',
-    'random',
-    'ratio',
-    'regex',
-    'set',
-    'sstream',
-    'stack',
-    'stdexcept',
-    'streambuf',
-    'string',
-    'strstream',
-    'system_error',
-    'thread',
-    'tuple',
-    'typeindex',
-    'typeinfo',
-    'type_traits',
-    'unordered_map',
-    'unordered_set',
-    'utility',
-    'valarray',
-    'vector',
-    # 17.6.1.2 C++ headers for C library facilities
-    'cassert',
-    'ccomplex',
-    'cctype',
-    'cerrno',
-    'cfenv',
-    'cfloat',
-    'cinttypes',
-    'ciso646',
-    'climits',
-    'clocale',
-    'cmath',
-    'csetjmp',
-    'csignal',
-    'cstdalign',
-    'cstdarg',
-    'cstdbool',
-    'cstddef',
-    'cstdint',
-    'cstdio',
-    'cstdlib',
-    'cstring',
-    'ctgmath',
-    'ctime',
-    'cuchar',
-    'cwchar',
-    'cwctype',
-    ])
-
-# Assertion macros.  These are defined in base/logging.h and
-# testing/base/gunit.h.  Note that the _M versions need to come first
-# for substring matching to work.
-_CHECK_MACROS = [
-    'DCHECK', 'CHECK',
-    'EXPECT_TRUE_M', 'EXPECT_TRUE',
-    'ASSERT_TRUE_M', 'ASSERT_TRUE',
-    'EXPECT_FALSE_M', 'EXPECT_FALSE',
-    'ASSERT_FALSE_M', 'ASSERT_FALSE',
-    ]
-
-# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
-_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS])
-
-for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
-                        ('>=', 'GE'), ('>', 'GT'),
-                        ('<=', 'LE'), ('<', 'LT')]:
-  _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement
-  _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
-  _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
-  _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
-  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
-  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
-
-for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
-                            ('>=', 'LT'), ('>', 'LE'),
-                            ('<=', 'GT'), ('<', 'GE')]:
-  _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
-  _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
-  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
-  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
-
-# Alternative tokens and their replacements.  For full list, see section 2.5
-# Alternative tokens [lex.digraph] in the C++ standard.
-#
-# Digraphs (such as '%:') are not included here since it's a mess to
-# match those on a word boundary.
-_ALT_TOKEN_REPLACEMENT = {
-    'and': '&&',
-    'bitor': '|',
-    'or': '||',
-    'xor': '^',
-    'compl': '~',
-    'bitand': '&',
-    'and_eq': '&=',
-    'or_eq': '|=',
-    'xor_eq': '^=',
-    'not': '!',
-    'not_eq': '!='
-    }
-
-# Compile regular expression that matches all the above keywords.  The "[ =()]"
-# bit is meant to avoid matching these keywords outside of boolean expressions.
-#
-# False positives include C-style multi-line comments and multi-line strings
-# but those have always been troublesome for cpplint.
-_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
-    r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
-
-
-# These constants define types of headers for use with
-# _IncludeState.CheckNextIncludeOrder().
-_C_SYS_HEADER = 1
-_CPP_SYS_HEADER = 2
-_LIKELY_MY_HEADER = 3
-_POSSIBLE_MY_HEADER = 4
-_OTHER_HEADER = 5
-
-# These constants define the current inline assembly state
-_NO_ASM = 0       # Outside of inline assembly block
-_INSIDE_ASM = 1   # Inside inline assembly block
-_END_ASM = 2      # Last line of inline assembly block
-_BLOCK_ASM = 3    # The whole block is an inline assembly block
-
-# Match start of assembly blocks
-_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
-                        r'(?:\s+(volatile|__volatile__))?'
-                        r'\s*[{(]')
-
-
-_regexp_compile_cache = {}
-
-# Finds occurrences of NOLINT or NOLINT(...).
-_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
-
-# {str, set(int)}: a map from error categories to sets of linenumbers
-# on which those errors are expected and should be suppressed.
-_error_suppressions = {}
-
-# The root directory used for deriving header guard CPP variable.
-# This is set by --root flag.
-_root = None
-
-# The allowed line length of files.
-# This is set by --linelength flag.
-_line_length = 80
-
-# The allowed extensions for file names
-# This is set by --extensions flag.
-_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
-
-def ParseNolintSuppressions(filename, raw_line, linenum, error):
-  """Updates the global list of error-suppressions.
-
-  Parses any NOLINT comments on the current line, updating the global
-  error_suppressions store.  Reports an error if the NOLINT comment
-  was malformed.
-
-  Args:
-    filename: str, the name of the input file.
-    raw_line: str, the line of input text, with comments.
-    linenum: int, the number of the current line.
-    error: function, an error handler.
-  """
-  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
-  matched = _RE_SUPPRESSION.search(raw_line)
-  if matched:
-    category = matched.group(1)
-    if category in (None, '(*)'):  # => "suppress all"
-      _error_suppressions.setdefault(None, set()).add(linenum)
-    else:
-      if category.startswith('(') and category.endswith(')'):
-        category = category[1:-1]
-        if category in _ERROR_CATEGORIES:
-          _error_suppressions.setdefault(category, set()).add(linenum)
-        else:
-          error(filename, linenum, 'readability/nolint', 5,
-                'Unknown NOLINT error category: %s' % category)
-
-
-def ResetNolintSuppressions():
-  "Resets the set of NOLINT suppressions to empty."
-  _error_suppressions.clear()
-
-
-def IsErrorSuppressedByNolint(category, linenum):
-  """Returns true if the specified error category is suppressed on this line.
-
-  Consults the global error_suppressions map populated by
-  ParseNolintSuppressions/ResetNolintSuppressions.
-
-  Args:
-    category: str, the category of the error.
-    linenum: int, the current line number.
-  Returns:
-    bool, True iff the error should be suppressed due to a NOLINT comment.
-  """
-  return (linenum in _error_suppressions.get(category, set()) or
-          linenum in _error_suppressions.get(None, set()))
-
-def Match(pattern, s):
-  """Matches the string with the pattern, caching the compiled regexp."""
-  # The regexp compilation caching is inlined in both Match and Search for
-  # performance reasons; factoring it out into a separate function turns out
-  # to be noticeably expensive.
-  if pattern not in _regexp_compile_cache:
-    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
-  return _regexp_compile_cache[pattern].match(s)
-
-
-def ReplaceAll(pattern, rep, s):
-  """Replaces instances of pattern in a string with a replacement.
-
-  The compiled regex is kept in a cache shared by Match and Search.
-
-  Args:
-    pattern: regex pattern
-    rep: replacement text
-    s: search string
-
-  Returns:
-    string with replacements made (or original string if no replacements)
-  """
-  if pattern not in _regexp_compile_cache:
-    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
-  return _regexp_compile_cache[pattern].sub(rep, s)
-
-
-def Search(pattern, s):
-  """Searches the string for the pattern, caching the compiled regexp."""
-  if pattern not in _regexp_compile_cache:
-    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
-  return _regexp_compile_cache[pattern].search(s)
-
-
-class _IncludeState(dict):
-  """Tracks line numbers for includes, and the order in which includes appear.
-
-  As a dict, an _IncludeState object serves as a mapping between include
-  filename and line number on which that file was included.
-
-  Call CheckNextIncludeOrder() once for each header in the file, passing
-  in the type constants defined above. Calls in an illegal order will
-  raise an _IncludeError with an appropriate error message.
-
-  """
-  # self._section will move monotonically through this set. If it ever
-  # needs to move backwards, CheckNextIncludeOrder will raise an error.
-  _INITIAL_SECTION = 0
-  _MY_H_SECTION = 1
-  _C_SECTION = 2
-  _CPP_SECTION = 3
-  _OTHER_H_SECTION = 4
-
-  _TYPE_NAMES = {
-      _C_SYS_HEADER: 'C system header',
-      _CPP_SYS_HEADER: 'C++ system header',
-      _LIKELY_MY_HEADER: 'header this file implements',
-      _POSSIBLE_MY_HEADER: 'header this file may implement',
-      _OTHER_HEADER: 'other header',
-      }
-  _SECTION_NAMES = {
-      _INITIAL_SECTION: "... nothing. (This can't be an error.)",
-      _MY_H_SECTION: 'a header this file implements',
-      _C_SECTION: 'C system header',
-      _CPP_SECTION: 'C++ system header',
-      _OTHER_H_SECTION: 'other header',
-      }
-
-  def __init__(self):
-    dict.__init__(self)
-    self.ResetSection()
-
-  def ResetSection(self):
-    # The name of the current section.
-    self._section = self._INITIAL_SECTION
-    # The path of last found header.
-    self._last_header = ''
-
-  def SetLastHeader(self, header_path):
-    self._last_header = header_path
-
-  def CanonicalizeAlphabeticalOrder(self, header_path):
-    """Returns a path canonicalized for alphabetical comparison.
-
-    - replaces "-" with "_" so they both cmp the same.
-    - removes '-inl' since we don't require them to be after the main header.
-    - lowercase everything, just in case.
-
-    Args:
-      header_path: Path to be canonicalized.
-
-    Returns:
-      Canonicalized path.
-    """
-    return header_path.replace('-inl.h', '.h').replace('-', '_').lower()
-
-  def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
-    """Check if a header is in alphabetical order with the previous header.
-
-    Args:
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      header_path: Canonicalized header to be checked.
-
-    Returns:
-      Returns true if the header is in alphabetical order.
-    """
-    # If previous section is different from current section, _last_header will
-    # be reset to empty string, so it's always less than current header.
-    #
-    # If previous line was a blank line, assume that the headers are
-    # intentionally sorted the way they are.
-    if (self._last_header > header_path and
-        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
-      return False
-    return True
-
-  def CheckNextIncludeOrder(self, header_type):
-    """Returns a non-empty error message if the next header is out of order.
-
-    This function also updates the internal state to be ready to check
-    the next include.
-
-    Args:
-      header_type: One of the _XXX_HEADER constants defined above.
-
-    Returns:
-      The empty string if the header is in the right order, or an
-      error message describing what's wrong.
-
-    """
-    error_message = ('Found %s after %s' %
-                     (self._TYPE_NAMES[header_type],
-                      self._SECTION_NAMES[self._section]))
-
-    last_section = self._section
-
-    if header_type == _C_SYS_HEADER:
-      if self._section <= self._C_SECTION:
-        self._section = self._C_SECTION
-      else:
-        self._last_header = ''
-        return error_message
-    elif header_type == _CPP_SYS_HEADER:
-      if self._section <= self._CPP_SECTION:
-        self._section = self._CPP_SECTION
-      else:
-        self._last_header = ''
-        return error_message
-    elif header_type == _LIKELY_MY_HEADER:
-      if self._section <= self._MY_H_SECTION:
-        self._section = self._MY_H_SECTION
-      else:
-        self._section = self._OTHER_H_SECTION
-    elif header_type == _POSSIBLE_MY_HEADER:
-      if self._section <= self._MY_H_SECTION:
-        self._section = self._MY_H_SECTION
-      else:
-        # This will always be the fallback because we're not sure
-        # enough that the header is associated with this file.
-        self._section = self._OTHER_H_SECTION
-    else:
-      assert header_type == _OTHER_HEADER
-      self._section = self._OTHER_H_SECTION
-
-    if last_section != self._section:
-      self._last_header = ''
-
-    return ''
-
-
-class _CppLintState(object):
-  """Maintains module-wide state.."""
-
-  def __init__(self):
-    self.verbose_level = 1  # global setting.
-    self.error_count = 0    # global count of reported errors
-    # filters to apply when emitting error messages
-    self.filters = _DEFAULT_FILTERS[:]
-    self.counting = 'total'  # In what way are we counting errors?
-    self.errors_by_category = {}  # string to int dict storing error counts
-
-    # output format:
-    # "emacs" - format that emacs can parse (default)
-    # "vs7" - format that Microsoft Visual Studio 7 can parse
-    self.output_format = 'emacs'
-
-  def SetOutputFormat(self, output_format):
-    """Sets the output format for errors."""
-    self.output_format = output_format
-
-  def SetVerboseLevel(self, level):
-    """Sets the module's verbosity, and returns the previous setting."""
-    last_verbose_level = self.verbose_level
-    self.verbose_level = level
-    return last_verbose_level
-
-  def SetCountingStyle(self, counting_style):
-    """Sets the module's counting options."""
-    self.counting = counting_style
-
-  def SetFilters(self, filters):
-    """Sets the error-message filters.
-
-    These filters are applied when deciding whether to emit a given
-    error message.
-
-    Args:
-      filters: A string of comma-separated filters (eg "+whitespace/indent").
-               Each filter should start with + or -; else we die.
-
-    Raises:
-      ValueError: The comma-separated filters did not all start with '+' or '-'.
-                  E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter"
-    """
-    # Default filters always have less priority than the flag ones.
-    self.filters = _DEFAULT_FILTERS[:]
-    for filt in filters.split(','):
-      clean_filt = filt.strip()
-      if clean_filt:
-        self.filters.append(clean_filt)
-    for filt in self.filters:
-      if not (filt.startswith('+') or filt.startswith('-')):
-        raise ValueError('Every filter in --filters must start with + or -'
-                         ' (%s does not)' % filt)
-
-  def ResetErrorCounts(self):
-    """Sets the module's error statistic back to zero."""
-    self.error_count = 0
-    self.errors_by_category = {}
-
-  def IncrementErrorCount(self, category):
-    """Bumps the module's error statistic."""
-    self.error_count += 1
-    if self.counting in ('toplevel', 'detailed'):
-      if self.counting != 'detailed':
-        category = category.split('/')[0]
-      if category not in self.errors_by_category:
-        self.errors_by_category[category] = 0
-      self.errors_by_category[category] += 1
-
-  def PrintErrorCounts(self):
-    """Print a summary of errors by category, and the total."""
-    for category, count in self.errors_by_category.iteritems():
-      sys.stderr.write('Category \'%s\' errors found: %d\n' %
-                       (category, count))
-    sys.stderr.write('Total errors found: %d\n' % self.error_count)
-
-_cpplint_state = _CppLintState()
-
-
-def _OutputFormat():
-  """Gets the module's output format."""
-  return _cpplint_state.output_format
-
-
-def _SetOutputFormat(output_format):
-  """Sets the module's output format."""
-  _cpplint_state.SetOutputFormat(output_format)
-
-
-def _VerboseLevel():
-  """Returns the module's verbosity setting."""
-  return _cpplint_state.verbose_level
-
-
-def _SetVerboseLevel(level):
-  """Sets the module's verbosity, and returns the previous setting."""
-  return _cpplint_state.SetVerboseLevel(level)
-
-
-def _SetCountingStyle(level):
-  """Sets the module's counting options."""
-  _cpplint_state.SetCountingStyle(level)
-
-
-def _Filters():
-  """Returns the module's list of output filters, as a list."""
-  return _cpplint_state.filters
-
-
-def _SetFilters(filters):
-  """Sets the module's error-message filters.
-
-  These filters are applied when deciding whether to emit a given
-  error message.
-
-  Args:
-    filters: A string of comma-separated filters (eg "whitespace/indent").
-             Each filter should start with + or -; else we die.
-  """
-  _cpplint_state.SetFilters(filters)
-
-
-class _FunctionState(object):
-  """Tracks current function name and the number of lines in its body."""
-
-  _NORMAL_TRIGGER = 250  # for --v=0, 500 for --v=1, etc.
-  _TEST_TRIGGER = 400    # about 50% more than _NORMAL_TRIGGER.
-
-  def __init__(self):
-    self.in_a_function = False
-    self.lines_in_function = 0
-    self.current_function = ''
-
-  def Begin(self, function_name):
-    """Start analyzing function body.
-
-    Args:
-      function_name: The name of the function being tracked.
-    """
-    self.in_a_function = True
-    self.lines_in_function = 0
-    self.current_function = function_name
-
-  def Count(self):
-    """Count line in current function body."""
-    if self.in_a_function:
-      self.lines_in_function += 1
-
-  def Check(self, error, filename, linenum):
-    """Report if too many lines in function body.
-
-    Args:
-      error: The function to call with any errors found.
-      filename: The name of the current file.
-      linenum: The number of the line to check.
-    """
-    if Match(r'T(EST|est)', self.current_function):
-      base_trigger = self._TEST_TRIGGER
-    else:
-      base_trigger = self._NORMAL_TRIGGER
-    trigger = base_trigger * 2**_VerboseLevel()
-
-    if self.lines_in_function > trigger:
-      error_level = int(math.log(self.lines_in_function / base_trigger, 2))
-      # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ...
-      if error_level > 5:
-        error_level = 5
-      error(filename, linenum, 'readability/fn_size', error_level,
-            'Small and focused functions are preferred:'
-            ' %s has %d non-comment lines'
-            ' (error triggered by exceeding %d lines).'  % (
-                self.current_function, self.lines_in_function, trigger))
-
-  def End(self):
-    """Stop analyzing function body."""
-    self.in_a_function = False
-
-
-class _IncludeError(Exception):
-  """Indicates a problem with the include order in a file."""
-  pass
-
-
-class FileInfo:
-  """Provides utility functions for filenames.
-
-  FileInfo provides easy access to the components of a file's path
-  relative to the project root.
-  """
-
-  def __init__(self, filename):
-    self._filename = filename
-
-  def FullName(self):
-    """Make Windows paths like Unix."""
-    return os.path.abspath(self._filename).replace('\\', '/')
-
-  def RepositoryName(self):
-    """FullName after removing the local path to the repository.
-
-    If we have a real absolute path name here we can try to do something smart:
-    detecting the root of the checkout and truncating /path/to/checkout from
-    the name so that we get header guards that don't include things like
-    "C:\Documents and Settings\..." or "/home/username/..." in them and thus
-    people on different computers who have checked the source out to different
-    locations won't see bogus errors.
-    """
-    fullname = self.FullName()
-
-    if os.path.exists(fullname):
-      project_dir = os.path.dirname(fullname)
-
-      if os.path.exists(os.path.join(project_dir, ".svn")):
-        # If there's a .svn file in the current directory, we recursively look
-        # up the directory tree for the top of the SVN checkout
-        root_dir = project_dir
-        one_up_dir = os.path.dirname(root_dir)
-        while os.path.exists(os.path.join(one_up_dir, ".svn")):
-          root_dir = os.path.dirname(root_dir)
-          one_up_dir = os.path.dirname(one_up_dir)
-
-        prefix = os.path.commonprefix([root_dir, project_dir])
-        return fullname[len(prefix) + 1:]
-
-      # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
-      # searching up from the current path.
-      root_dir = os.path.dirname(fullname)
-      while (root_dir != os.path.dirname(root_dir) and
-             not os.path.exists(os.path.join(root_dir, ".git")) and
-             not os.path.exists(os.path.join(root_dir, ".hg")) and
-             not os.path.exists(os.path.join(root_dir, ".svn"))):
-        root_dir = os.path.dirname(root_dir)
-
-      if (os.path.exists(os.path.join(root_dir, ".git")) or
-          os.path.exists(os.path.join(root_dir, ".hg")) or
-          os.path.exists(os.path.join(root_dir, ".svn"))):
-        prefix = os.path.commonprefix([root_dir, project_dir])
-        return fullname[len(prefix) + 1:]
-
-    # Don't know what to do; header guard warnings may be wrong...
-    return fullname
-
-  def Split(self):
-    """Splits the file into the directory, basename, and extension.
-
-    For 'chrome/browser/browser.cc', Split() would
-    return ('chrome/browser', 'browser', '.cc')
-
-    Returns:
-      A tuple of (directory, basename, extension).
-    """
-
-    googlename = self.RepositoryName()
-    project, rest = os.path.split(googlename)
-    return (project,) + os.path.splitext(rest)
-
-  def BaseName(self):
-    """File base name - text after the final slash, before the final period."""
-    return self.Split()[1]
-
-  def Extension(self):
-    """File extension - text following the final period."""
-    return self.Split()[2]
-
-  def NoExtension(self):
-    """File has no source file extension."""
-    return '/'.join(self.Split()[0:2])
-
-  def IsSource(self):
-    """File has a source file extension."""
-    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
-
-
-def _ShouldPrintError(category, confidence, linenum):
-  """If confidence >= verbose, category passes filter and is not suppressed."""
-
-  # There are three ways we might decide not to print an error message:
-  # a "NOLINT(category)" comment appears in the source,
-  # the verbosity level isn't high enough, or the filters filter it out.
-  if IsErrorSuppressedByNolint(category, linenum):
-    return False
-  if confidence < _cpplint_state.verbose_level:
-    return False
-
-  is_filtered = False
-  for one_filter in _Filters():
-    if one_filter.startswith('-'):
-      if category.startswith(one_filter[1:]):
-        is_filtered = True
-    elif one_filter.startswith('+'):
-      if category.startswith(one_filter[1:]):
-        is_filtered = False
-    else:
-      assert False  # should have been checked for in SetFilter.
-  if is_filtered:
-    return False
-
-  return True
-
-
-def Error(filename, linenum, category, confidence, message):
-  """Logs the fact we've found a lint error.
-
-  We log where the error was found, and also our confidence in the error,
-  that is, how certain we are this is a legitimate style regression, and
-  not a misidentification or a use that's sometimes justified.
-
-  False positives can be suppressed by the use of
-  "cpplint(category)"  comments on the offending line.  These are
-  parsed into _error_suppressions.
-
-  Args:
-    filename: The name of the file containing the error.
-    linenum: The number of the line containing the error.
-    category: A string used to describe the "category" this bug
-      falls under: "whitespace", say, or "runtime".  Categories
-      may have a hierarchy separated by slashes: "whitespace/indent".
-    confidence: A number from 1-5 representing a confidence score for
-      the error, with 5 meaning that we are certain of the problem,
-      and 1 meaning that it could be a legitimate construct.
-    message: The error message.
-  """
-  if _ShouldPrintError(category, confidence, linenum):
-    _cpplint_state.IncrementErrorCount(category)
-    if _cpplint_state.output_format == 'vs7':
-      sys.stderr.write('%s(%s):  %s  [%s] [%d]\n' % (
-          filename, linenum, message, category, confidence))
-    elif _cpplint_state.output_format == 'eclipse':
-      sys.stderr.write('%s:%s: warning: %s  [%s] [%d]\n' % (
-          filename, linenum, message, category, confidence))
-    else:
-      sys.stderr.write('%s:%s:  %s  [%s] [%d]\n' % (
-          filename, linenum, message, category, confidence))
-
-
-# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
-_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
-    r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
-# Matches strings.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
-# Matches characters.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
-# Matches multi-line C++ comments.
-# This RE is a little bit more complicated than one might expect, because we
-# have to take care of space removals tools so we can handle comments inside
-# statements better.
-# The current rule is: We only clear spaces from both sides when we're at the
-# end of the line. Otherwise, we try to remove spaces from the right side,
-# if this doesn't work we try on left side but only if there's a non-character
-# on the right.
-_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
-    r"""(\s*/\*.*\*/\s*$|
-            /\*.*\*/\s+|
-         \s+/\*.*\*/(?=\W)|
-            /\*.*\*/)""", re.VERBOSE)
-
-
-def IsCppString(line):
-  """Does line terminate so, that the next symbol is in string constant.
-
-  This function does not consider single-line nor multi-line comments.
-
-  Args:
-    line: is a partial line of code starting from the 0..n.
-
-  Returns:
-    True, if next character appended to 'line' is inside a
-    string constant.
-  """
-
-  line = line.replace(r'\\', 'XX')  # after this, \\" does not match to \"
-  return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
-
-
-def CleanseRawStrings(raw_lines):
-  """Removes C++11 raw strings from lines.
-
-    Before:
-      static const char kData[] = R"(
-          multi-line string
-          )";
-
-    After:
-      static const char kData[] = ""
-          (replaced by blank line)
-          "";
-
-  Args:
-    raw_lines: list of raw lines.
-
-  Returns:
-    list of lines with C++11 raw strings replaced by empty strings.
-  """
-
-  delimiter = None
-  lines_without_raw_strings = []
-  for line in raw_lines:
-    if delimiter:
-      # Inside a raw string, look for the end
-      end = line.find(delimiter)
-      if end >= 0:
-        # Found the end of the string, match leading space for this
-        # line and resume copying the original lines, and also insert
-        # a "" on the last line.
-        leading_space = Match(r'^(\s*)\S', line)
-        line = leading_space.group(1) + '""' + line[end + len(delimiter):]
-        delimiter = None
-      else:
-        # Haven't found the end yet, append a blank line.
-        line = ''
-
-    else:
-      # Look for beginning of a raw string.
-      # See 2.14.15 [lex.string] for syntax.
-      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
-      if matched:
-        delimiter = ')' + matched.group(2) + '"'
-
-        end = matched.group(3).find(delimiter)
-        if end >= 0:
-          # Raw string ended on same line
-          line = (matched.group(1) + '""' +
-                  matched.group(3)[end + len(delimiter):])
-          delimiter = None
-        else:
-          # Start of a multi-line raw string
-          line = matched.group(1) + '""'
-
-    lines_without_raw_strings.append(line)
-
-  # TODO(unknown): if delimiter is not None here, we might want to
-  # emit a warning for unterminated string.
-  return lines_without_raw_strings
-
-
-def FindNextMultiLineCommentStart(lines, lineix):
-  """Find the beginning marker for a multiline comment."""
-  while lineix < len(lines):
-    if lines[lineix].strip().startswith('/*'):
-      # Only return this marker if the comment goes beyond this line
-      if lines[lineix].strip().find('*/', 2) < 0:
-        return lineix
-    lineix += 1
-  return len(lines)
-
-
-def FindNextMultiLineCommentEnd(lines, lineix):
-  """We are inside a comment, find the end marker."""
-  while lineix < len(lines):
-    if lines[lineix].strip().endswith('*/'):
-      return lineix
-    lineix += 1
-  return len(lines)
-
-
-def RemoveMultiLineCommentsFromRange(lines, begin, end):
-  """Clears a range of lines for multi-line comments."""
-  # Having // dummy comments makes the lines non-empty, so we will not get
-  # unnecessary blank line warnings later in the code.
-  for i in range(begin, end):
-    lines[i] = '// dummy'
-
-
-def RemoveMultiLineComments(filename, lines, error):
-  """Removes multiline (c-style) comments from lines."""
-  lineix = 0
-  while lineix < len(lines):
-    lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
-    if lineix_begin >= len(lines):
-      return
-    lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
-    if lineix_end >= len(lines):
-      error(filename, lineix_begin + 1, 'readability/multiline_comment', 5,
-            'Could not find end of multi-line comment')
-      return
-    RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
-    lineix = lineix_end + 1
-
-
-def CleanseComments(line):
-  """Removes //-comments and single-line C-style /* */ comments.
-
-  Args:
-    line: A line of C++ source.
-
-  Returns:
-    The line with single-line comments removed.
-  """
-  commentpos = line.find('//')
-  if commentpos != -1 and not IsCppString(line[:commentpos]):
-    line = line[:commentpos].rstrip()
-  # get rid of /* ... */
-  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
-
-
-class CleansedLines(object):
-  """Holds 3 copies of all lines with different preprocessing applied to them.
-
-  1) elided member contains lines without strings and comments,
-  2) lines member contains lines without comments, and
-  3) raw_lines member contains all the lines without processing.
-  All these three members are of <type 'list'>, and of the same length.
-  """
-
-  def __init__(self, lines):
-    self.elided = []
-    self.lines = []
-    self.raw_lines = lines
-    self.num_lines = len(lines)
-    self.lines_without_raw_strings = CleanseRawStrings(lines)
-    for linenum in range(len(self.lines_without_raw_strings)):
-      self.lines.append(CleanseComments(
-          self.lines_without_raw_strings[linenum]))
-      elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
-      self.elided.append(CleanseComments(elided))
-
-  def NumLines(self):
-    """Returns the number of lines represented."""
-    return self.num_lines
-
-  @staticmethod
-  def _CollapseStrings(elided):
-    """Collapses strings and chars on a line to simple "" or '' blocks.
-
-    We nix strings first so we're not fooled by text like '"http://"'
-
-    Args:
-      elided: The line being processed.
-
-    Returns:
-      The line with collapsed strings.
-    """
-    if not _RE_PATTERN_INCLUDE.match(elided):
-      # Remove escaped characters first to make quote/single quote collapsing
-      # basic.  Things that look like escaped characters shouldn't occur
-      # outside of strings and chars.
-      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
-    return elided
-
-
-def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
-  """Find the position just after the matching endchar.
-
-  Args:
-    line: a CleansedLines line.
-    startpos: start searching at this position.
-    depth: nesting level at startpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
-
-  Returns:
-    On finding matching endchar: (index just after matching endchar, 0)
-    Otherwise: (-1, new depth at end of this line)
-  """
-  for i in xrange(startpos, len(line)):
-    if line[i] == startchar:
-      depth += 1
-    elif line[i] == endchar:
-      depth -= 1
-      if depth == 0:
-        return (i + 1, 0)
-  return (-1, depth)
-
-
-def CloseExpression(clean_lines, linenum, pos):
-  """If input points to ( or { or [ or <, finds the position that closes it.
-
-  If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
-  linenum/pos that correspond to the closing of the expression.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    pos: A position on the line.
-
-  Returns:
-    A tuple (line, linenum, pos) pointer *past* the closing brace, or
-    (line, len(lines), -1) if we never find a close.  Note we ignore
-    strings and comments when matching; and the line we return is the
-    'cleansed' line at linenum.
-  """
-
-  line = clean_lines.elided[linenum]
-  startchar = line[pos]
-  if startchar not in '({[<':
-    return (line, clean_lines.NumLines(), -1)
-  if startchar == '(': endchar = ')'
-  if startchar == '[': endchar = ']'
-  if startchar == '{': endchar = '}'
-  if startchar == '<': endchar = '>'
-
-  # Check first line
-  (end_pos, num_open) = FindEndOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
-  if end_pos > -1:
-    return (line, linenum, end_pos)
-
-  # Continue scanning forward
-  while linenum < clean_lines.NumLines() - 1:
-    linenum += 1
-    line = clean_lines.elided[linenum]
-    (end_pos, num_open) = FindEndOfExpressionInLine(
-        line, 0, num_open, startchar, endchar)
-    if end_pos > -1:
-      return (line, linenum, end_pos)
-
-  # Did not find endchar before end of file, give up
-  return (line, clean_lines.NumLines(), -1)
-
-
-def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
-  """Find position at the matching startchar.
-
-  This is almost the reverse of FindEndOfExpressionInLine, but note
-  that the input position and returned position differs by 1.
-
-  Args:
-    line: a CleansedLines line.
-    endpos: start searching at this position.
-    depth: nesting level at endpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
-
-  Returns:
-    On finding matching startchar: (index at matching startchar, 0)
-    Otherwise: (-1, new depth at beginning of this line)
-  """
-  for i in xrange(endpos, -1, -1):
-    if line[i] == endchar:
-      depth += 1
-    elif line[i] == startchar:
-      depth -= 1
-      if depth == 0:
-        return (i, 0)
-  return (-1, depth)
-
-
-def ReverseCloseExpression(clean_lines, linenum, pos):
-  """If input points to ) or } or ] or >, finds the position that opens it.
-
-  If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the
-  linenum/pos that correspond to the opening of the expression.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    pos: A position on the line.
-
-  Returns:
-    A tuple (line, linenum, pos) pointer *at* the opening brace, or
-    (line, 0, -1) if we never find the matching opening brace.  Note
-    we ignore strings and comments when matching; and the line we
-    return is the 'cleansed' line at linenum.
-  """
-  line = clean_lines.elided[linenum]
-  endchar = line[pos]
-  if endchar not in ')}]>':
-    return (line, 0, -1)
-  if endchar == ')': startchar = '('
-  if endchar == ']': startchar = '['
-  if endchar == '}': startchar = '{'
-  if endchar == '>': startchar = '<'
-
-  # Check last line
-  (start_pos, num_open) = FindStartOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
-  if start_pos > -1:
-    return (line, linenum, start_pos)
-
-  # Continue scanning backward
-  while linenum > 0:
-    linenum -= 1
-    line = clean_lines.elided[linenum]
-    (start_pos, num_open) = FindStartOfExpressionInLine(
-        line, len(line) - 1, num_open, startchar, endchar)
-    if start_pos > -1:
-      return (line, linenum, start_pos)
-
-  # Did not find startchar before beginning of file, give up
-  return (line, 0, -1)
-
-
-def CheckForCopyright(filename, lines, error):
-  """Logs an error if no Copyright message appears at the top of the file."""
-
-  # We'll say it should occur by line 10. Don't forget there's a
-  # dummy line at the front.
-  for line in xrange(1, min(len(lines), 11)):
-    if re.search(r'Copyright', lines[line], re.I): break
-  else:                       # means no copyright line was found
-    error(filename, 0, 'legal/copyright', 5,
-          'No copyright message found.  '
-          'You should have a line: "Copyright [year] <Copyright Owner>"')
-
-
-def GetHeaderGuardCPPVariable(filename):
-  """Returns the CPP variable that should be used as a header guard.
-
-  Args:
-    filename: The name of a C++ header file.
-
-  Returns:
-    The CPP variable that should be used as a header guard in the
-    named file.
-
-  """
-
-  # Restores original filename in case that cpplint is invoked from Emacs's
-  # flymake.
-  filename = re.sub(r'_flymake\.h$', '.h', filename)
-  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
-
-  fileinfo = FileInfo(filename)
-  file_path_from_root = fileinfo.RepositoryName()
-  if _root:
-    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
-  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
-
-
-def CheckForHeaderGuard(filename, lines, error):
-  """Checks that the file contains a header guard.
-
-  Logs an error if no #ifndef header guard is present.  For other
-  headers, checks that the full pathname is used.
-
-  Args:
-    filename: The name of the C++ header file.
-    lines: An array of strings, each representing a line of the file.
-    error: The function to call with any errors found.
-  """
-
-  cppvar = GetHeaderGuardCPPVariable(filename)
-
-  ifndef = None
-  ifndef_linenum = 0
-  define = None
-  endif = None
-  endif_linenum = 0
-  for linenum, line in enumerate(lines):
-    # Already been well guarded, no need for further checking.
-    if line.strip() == "#pragma once":
-        return
-    linesplit = line.split()
-    if len(linesplit) >= 2:
-      # find the first occurrence of #ifndef and #define, save arg
-      if not ifndef and linesplit[0] == '#ifndef':
-        # set ifndef to the header guard presented on the #ifndef line.
-        ifndef = linesplit[1]
-        ifndef_linenum = linenum
-      if not define and linesplit[0] == '#define':
-        define = linesplit[1]
-    # find the last occurrence of #endif, save entire line
-    if line.startswith('#endif'):
-      endif = line
-      endif_linenum = linenum
-
-  if not ifndef:
-    error(filename, 0, 'build/header_guard', 5,
-          'No #ifndef header guard found, suggested CPP variable is: %s' %
-          cppvar)
-    return
-
-  if not define:
-    error(filename, 0, 'build/header_guard', 5,
-          'No #define header guard found, suggested CPP variable is: %s' %
-          cppvar)
-    return
-
-  # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
-  # for backward compatibility.
-  if ifndef != cppvar:
-    error_level = 0
-    if ifndef != cppvar + '_':
-      error_level = 5
-
-    ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
-                            error)
-    error(filename, ifndef_linenum, 'build/header_guard', error_level,
-          '#ifndef header guard has wrong style, please use: %s' % cppvar)
-
-  if define != ifndef:
-    error(filename, 0, 'build/header_guard', 5,
-          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
-          cppvar)
-    return
-
-  if endif != ('#endif  // %s' % cppvar):
-    error_level = 0
-    if endif != ('#endif  // %s' % (cppvar + '_')):
-      error_level = 5
-
-    ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
-                            error)
-    error(filename, endif_linenum, 'build/header_guard', error_level,
-          '#endif line should be "#endif  // %s"' % cppvar)
-
-
-def CheckForBadCharacters(filename, lines, error):
-  """Logs an error for each line containing bad characters.
-
-  Two kinds of bad characters:
-
-  1. Unicode replacement characters: These indicate that either the file
-  contained invalid UTF-8 (likely) or Unicode replacement characters (which
-  it shouldn't).  Note that it's possible for this to throw off line
-  numbering if the invalid UTF-8 occurred adjacent to a newline.
-
-  2. NUL bytes.  These are problematic for some tools.
-
-  Args:
-    filename: The name of the current file.
-    lines: An array of strings, each representing a line of the file.
-    error: The function to call with any errors found.
-  """
-  for linenum, line in enumerate(lines):
-    if u'\ufffd' in line:
-      error(filename, linenum, 'readability/utf8', 5,
-            'Line contains invalid UTF-8 (or Unicode replacement character).')
-    if '\0' in line:
-      error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.')
-
-
-def CheckForNewlineAtEOF(filename, lines, error):
-  """Logs an error if there is no newline char at the end of the file.
-
-  Args:
-    filename: The name of the current file.
-    lines: An array of strings, each representing a line of the file.
-    error: The function to call with any errors found.
-  """
-
-  # The array lines() was created by adding two newlines to the
-  # original file (go figure), then splitting on \n.
-  # To verify that the file ends in \n, we just have to make sure the
-  # last-but-two element of lines() exists and is empty.
-  if len(lines) < 3 or lines[-2]:
-    error(filename, len(lines) - 2, 'whitespace/ending_newline', 5,
-          'Could not find a newline character at the end of the file.')
-
-
-def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
-  """Logs an error if we see /* ... */ or "..." that extend past one line.
-
-  /* ... */ comments are legit inside macros, for one line.
-  Otherwise, we prefer // comments, so it's ok to warn about the
-  other.  Likewise, it's ok for strings to extend across multiple
-  lines, as long as a line continuation character (backslash)
-  terminates each line. Although not currently prohibited by the C++
-  style guide, it's ugly and unnecessary. We don't do well with either
-  in this lint program, so we warn about both.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]
-
-  # Remove all \\ (escaped backslashes) from the line. They are OK, and the
-  # second (escaped) slash may trigger later \" detection erroneously.
-  line = line.replace('\\\\', '')
-
-  if line.count('/*') > line.count('*/'):
-    error(filename, linenum, 'readability/multiline_comment', 5,
-          'Complex multi-line /*...*/-style comment found. '
-          'Lint may give bogus warnings.  '
-          'Consider replacing these with //-style comments, '
-          'with #if 0...#endif, '
-          'or with more clearly structured multi-line comments.')
-
-  if (line.count('"') - line.count('\\"')) % 2:
-    error(filename, linenum, 'readability/multiline_string', 5,
-          'Multi-line string ("...") found.  This lint script doesn\'t '
-          'do well with such strings, and may give bogus warnings.  '
-          'Use C++11 raw strings or concatenation instead.')
-
-
-threading_list = (
-    ('asctime(', 'asctime_r('),
-    ('ctime(', 'ctime_r('),
-    ('getgrgid(', 'getgrgid_r('),
-    ('getgrnam(', 'getgrnam_r('),
-    ('getlogin(', 'getlogin_r('),
-    ('getpwnam(', 'getpwnam_r('),
-    ('getpwuid(', 'getpwuid_r('),
-    ('gmtime(', 'gmtime_r('),
-    ('localtime(', 'localtime_r('),
-    ('rand(', 'rand_r('),
-    ('strtok(', 'strtok_r('),
-    ('ttyname(', 'ttyname_r('),
-    )
-
-
-def CheckPosixThreading(filename, clean_lines, linenum, error):
-  """Checks for calls to thread-unsafe functions.
-
-  Much code has been originally written without consideration of
-  multi-threading. Also, engineers are relying on their old experience;
-  they have learned posix before threading extensions were added. These
-  tests guide the engineers to use thread-safe functions (when using
-  posix directly).
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]
-  for single_thread_function, multithread_safe_function in threading_list:
-    ix = line.find(single_thread_function)
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and
-                                line[ix - 1] not in ('_', '.', '>'))):
-      error(filename, linenum, 'runtime/threadsafe_fn', 2,
-            'Consider using ' + multithread_safe_function +
-            '...) instead of ' + single_thread_function +
-            '...) for improved thread safety.')
-
-
-def CheckVlogArguments(filename, clean_lines, linenum, error):
-  """Checks that VLOG() is only used for defining a logging level.
-
-  For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and
-  VLOG(FATAL) are not.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]
-  if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line):
-    error(filename, linenum, 'runtime/vlog', 5,
-          'VLOG() should be used with numeric verbosity level.  '
-          'Use LOG() if you want symbolic severity levels.')
-
-
-# Matches invalid increment: *count++, which moves pointer instead of
-# incrementing a value.
-_RE_PATTERN_INVALID_INCREMENT = re.compile(
-    r'^\s*\*\w+(\+\+|--);')
-
-
-def CheckInvalidIncrement(filename, clean_lines, linenum, error):
-  """Checks for invalid increment *count++.
-
-  For example following function:
-  void increment_counter(int* count) {
-    *count++;
-  }
-  is invalid, because it effectively does count++, moving pointer, and should
-  be replaced with ++*count, (*count)++ or *count += 1.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]
-  if _RE_PATTERN_INVALID_INCREMENT.match(line):
-    error(filename, linenum, 'runtime/invalid_increment', 5,
-          'Changing pointer instead of value (or unused value of operator*).')
-
-
-class _BlockInfo(object):
-  """Stores information about a generic block of code."""
-
-  def __init__(self, seen_open_brace):
-    self.seen_open_brace = seen_open_brace
-    self.open_parentheses = 0
-    self.inline_asm = _NO_ASM
-
-  def CheckBegin(self, filename, clean_lines, linenum, error):
-    """Run checks that applies to text up to the opening brace.
-
-    This is mostly for checking the text after the class identifier
-    and the "{", usually where the base class is specified.  For other
-    blocks, there isn't much to check, so we always pass.
-
-    Args:
-      filename: The name of the current file.
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      error: The function to call with any errors found.
-    """
-    pass
-
-  def CheckEnd(self, filename, clean_lines, linenum, error):
-    """Run checks that applies to text after the closing brace.
-
-    This is mostly used for checking end of namespace comments.
-
-    Args:
-      filename: The name of the current file.
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      error: The function to call with any errors found.
-    """
-    pass
-
-
-class _ClassInfo(_BlockInfo):
-  """Stores information about a class."""
-
-  def __init__(self, name, class_or_struct, clean_lines, linenum):
-    _BlockInfo.__init__(self, False)
-    self.name = name
-    self.starting_linenum = linenum
-    self.is_derived = False
-    if class_or_struct == 'struct':
-      self.access = 'public'
-      self.is_struct = True
-    else:
-      self.access = 'private'
-      self.is_struct = False
-
-    # Remember initial indentation level for this class.  Using raw_lines here
-    # instead of elided to account for leading comments.
-    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
-    if initial_indent:
-      self.class_indent = len(initial_indent.group(1))
-    else:
-      self.class_indent = 0
-
-    # Try to find the end of the class.  This will be confused by things like:
-    #   class A {
-    #   } *x = { ...
-    #
-    # But it's still good enough for CheckSectionSpacing.
-    self.last_line = 0
-    depth = 0
-    for i in range(linenum, clean_lines.NumLines()):
-      line = clean_lines.elided[i]
-      depth += line.count('{') - line.count('}')
-      if not depth:
-        self.last_line = i
-        break
-
-  def CheckBegin(self, filename, clean_lines, linenum, error):
-    # Look for a bare ':'
-    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
-      self.is_derived = True
-
-  def CheckEnd(self, filename, clean_lines, linenum, error):
-    # Check that closing brace is aligned with beginning of the class.
-    # Only do this if the closing brace is indented by only whitespaces.
-    # This means we will not check single-line class definitions.
-    indent = Match(r'^( *)\}', clean_lines.elided[linenum])
-    if indent and len(indent.group(1)) != self.class_indent:
-      if self.is_struct:
-        parent = 'struct ' + self.name
-      else:
-        parent = 'class ' + self.name
-      error(filename, linenum, 'whitespace/indent', 3,
-            'Closing brace should be aligned with beginning of %s' % parent)
-
-
-class _NamespaceInfo(_BlockInfo):
-  """Stores information about a namespace."""
-
-  def __init__(self, name, linenum):
-    _BlockInfo.__init__(self, False)
-    self.name = name or ''
-    self.starting_linenum = linenum
-
-  def CheckEnd(self, filename, clean_lines, linenum, error):
-    """Check end of namespace comments."""
-    line = clean_lines.raw_lines[linenum]
-
-    # Check how many lines is enclosed in this namespace.  Don't issue
-    # warning for missing namespace comments if there aren't enough
-    # lines.  However, do apply checks if there is already an end of
-    # namespace comment and it's incorrect.
-    #
-    # TODO(unknown): We always want to check end of namespace comments
-    # if a namespace is large, but sometimes we also want to apply the
-    # check if a short namespace contained nontrivial things (something
-    # other than forward declarations).  There is currently no logic on
-    # deciding what these nontrivial things are, so this check is
-    # triggered by namespace size only, which works most of the time.
-    if (linenum - self.starting_linenum < 10
-        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
-      return
-
-    # Look for matching comment at end of namespace.
-    #
-    # Note that we accept C style "/* */" comments for terminating
-    # namespaces, so that code that terminate namespaces inside
-    # preprocessor macros can be cpplint clean.
-    #
-    # We also accept stuff like "// end of namespace <name>." with the
-    # period at the end.
-    #
-    # Besides these, we don't accept anything else, otherwise we might
-    # get false negatives when existing comment is a substring of the
-    # expected namespace.
-    if self.name:
-      # Named namespace
-      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
-                    r'[\*/\.\\\s]*$'),
-                   line):
-        error(filename, linenum, 'readability/namespace', 5,
-              'Namespace should be terminated with "// namespace %s"' %
-              self.name)
-    else:
-      # Anonymous namespace
-      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
-        error(filename, linenum, 'readability/namespace', 5,
-              'Namespace should be terminated with "// namespace"')
-
-
-class _PreprocessorInfo(object):
-  """Stores checkpoints of nesting stacks when #if/#else is seen."""
-
-  def __init__(self, stack_before_if):
-    # The entire nesting stack before #if
-    self.stack_before_if = stack_before_if
-
-    # The entire nesting stack up to #else
-    self.stack_before_else = []
-
-    # Whether we have already seen #else or #elif
-    self.seen_else = False
-
-
-class _NestingState(object):
-  """Holds states related to parsing braces."""
-
-  def __init__(self):
-    # Stack for tracking all braces.  An object is pushed whenever we
-    # see a "{", and popped when we see a "}".  Only 3 types of
-    # objects are possible:
-    # - _ClassInfo: a class or struct.
-    # - _NamespaceInfo: a namespace.
-    # - _BlockInfo: some other type of block.
-    self.stack = []
-
-    # Stack of _PreprocessorInfo objects.
-    self.pp_stack = []
-
-  def SeenOpenBrace(self):
-    """Check if we have seen the opening brace for the innermost block.
-
-    Returns:
-      True if we have seen the opening brace, False if the innermost
-      block is still expecting an opening brace.
-    """
-    return (not self.stack) or self.stack[-1].seen_open_brace
-
-  def InNamespaceBody(self):
-    """Check if we are currently one level inside a namespace body.
-
-    Returns:
-      True if top of the stack is a namespace block, False otherwise.
-    """
-    return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
-
-  def UpdatePreprocessor(self, line):
-    """Update preprocessor stack.
-
-    We need to handle preprocessors due to classes like this:
-      #ifdef SWIG
-      struct ResultDetailsPageElementExtensionPoint {
-      #else
-      struct ResultDetailsPageElementExtensionPoint : public Extension {
-      #endif
-
-    We make the following assumptions (good enough for most files):
-    - Preprocessor condition evaluates to true from #if up to first
-      #else/#elif/#endif.
-
-    - Preprocessor condition evaluates to false from #else/#elif up
-      to #endif.  We still perform lint checks on these lines, but
-      these do not affect nesting stack.
-
-    Args:
-      line: current line to check.
-    """
-    if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
-      # Beginning of #if block, save the nesting stack here.  The saved
-      # stack will allow us to restore the parsing state in the #else case.
-      self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
-    elif Match(r'^\s*#\s*(else|elif)\b', line):
-      # Beginning of #else block
-      if self.pp_stack:
-        if not self.pp_stack[-1].seen_else:
-          # This is the first #else or #elif block.  Remember the
-          # whole nesting stack up to this point.  This is what we
-          # keep after the #endif.
-          self.pp_stack[-1].seen_else = True
-          self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)
-
-        # Restore the stack to how it was before the #if
-        self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
-      else:
-        # TODO(unknown): unexpected #else, issue warning?
-        pass
-    elif Match(r'^\s*#\s*endif\b', line):
-      # End of #if or #else blocks.
-      if self.pp_stack:
-        # If we saw an #else, we will need to restore the nesting
-        # stack to its former state before the #else, otherwise we
-        # will just continue from where we left off.
-        if self.pp_stack[-1].seen_else:
-          # Here we can just use a shallow copy since we are the last
-          # reference to it.
-          self.stack = self.pp_stack[-1].stack_before_else
-        # Drop the corresponding #if
-        self.pp_stack.pop()
-      else:
-        # TODO(unknown): unexpected #endif, issue warning?
-        pass
-
-  def Update(self, filename, clean_lines, linenum, error):
-    """Update nesting state with current line.
-
-    Args:
-      filename: The name of the current file.
-      clean_lines: A CleansedLines instance containing the file.
-      linenum: The number of the line to check.
-      error: The function to call with any errors found.
-    """
-    line = clean_lines.elided[linenum]
-
-    # Update pp_stack first
-    self.UpdatePreprocessor(line)
-
-    # Count parentheses.  This is to avoid adding struct arguments to
-    # the nesting stack.
-    if self.stack:
-      inner_block = self.stack[-1]
-      depth_change = line.count('(') - line.count(')')
-      inner_block.open_parentheses += depth_change
-
-      # Also check if we are starting or ending an inline assembly block.
-      if inner_block.inline_asm in (_NO_ASM, _END_ASM):
-        if (depth_change != 0 and
-            inner_block.open_parentheses == 1 and
-            _MATCH_ASM.match(line)):
-          # Enter assembly block
-          inner_block.inline_asm = _INSIDE_ASM
-        else:
-          # Not entering assembly block.  If previous line was _END_ASM,
-          # we will now shift to _NO_ASM state.
-          inner_block.inline_asm = _NO_ASM
-      elif (inner_block.inline_asm == _INSIDE_ASM and
-            inner_block.open_parentheses == 0):
-        # Exit assembly block
-        inner_block.inline_asm = _END_ASM
-
-    # Consume namespace declaration at the beginning of the line.  Do
-    # this in a loop so that we catch same line declarations like this:
-    #   namespace proto2 { namespace bridge { class MessageSet; } }
-    while True:
-      # Match start of namespace.  The "\b\s*" below catches namespace
-      # declarations even if it weren't followed by a whitespace, this
-      # is so that we don't confuse our namespace checker.  The
-      # missing spaces will be flagged by CheckSpacing.
-      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
-      if not namespace_decl_match:
-        break
-
-      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
-      self.stack.append(new_namespace)
-
-      line = namespace_decl_match.group(2)
-      if line.find('{') != -1:
-        new_namespace.seen_open_brace = True
-        line = line[line.find('{') + 1:]
-
-    # Look for a class declaration in whatever is left of the line
-    # after parsing namespaces.  The regexp accounts for decorated classes
-    # such as in:
-    #   class LOCKABLE API Object {
-    #   };
-    #
-    # Templates with class arguments may confuse the parser, for example:
-    #   template <class T
-    #             class Comparator = less<T>,
-    #             class Vector = vector<T> >
-    #   class HeapQueue {
-    #
-    # Because this parser has no nesting state about templates, by the
-    # time it saw "class Comparator", it may think that it's a new class.
-    # Nested templates have a similar problem:
-    #   template <
-    #       typename ExportedType,
-    #       typename TupleType,
-    #       template <typename, typename> class ImplTemplate>
-    #
-    # To avoid these cases, we ignore classes that are followed by '=' or '>'
-    class_decl_match = Match(
-        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
-        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
-    if (class_decl_match and
-        (not self.stack or self.stack[-1].open_parentheses == 0)):
-      self.stack.append(_ClassInfo(
-          class_decl_match.group(4), class_decl_match.group(2),
-          clean_lines, linenum))
-      line = class_decl_match.group(5)
-
-    # If we have not yet seen the opening brace for the innermost block,
-    # run checks here.
-    if not self.SeenOpenBrace():
-      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
-
-    # Update access control if we are inside a class/struct
-    if self.stack and isinstance(self.stack[-1], _ClassInfo):
-      classinfo = self.stack[-1]
-      access_match = Match(
-          r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
-          r':(?:[^:]|$)',
-          line)
-      if access_match:
-        classinfo.access = access_match.group(2)
-
-        # Check that access keywords are indented +1 space.  Skip this
-        # check if the keywords are not preceded by whitespaces.
-        indent = access_match.group(1)
-        if (len(indent) != classinfo.class_indent + 1 and
-            Match(r'^\s*$', indent)):
-          if classinfo.is_struct:
-            parent = 'struct ' + classinfo.name
-          else:
-            parent = 'class ' + classinfo.name
-          slots = ''
-          if access_match.group(3):
-            slots = access_match.group(3)
-          error(filename, linenum, 'whitespace/indent', 3,
-                '%s%s: should be indented +1 space inside %s' % (
-                    access_match.group(2), slots, parent))
-
-    # Consume braces or semicolons from what's left of the line
-    while True:
-      # Match first brace, semicolon, or closed parenthesis.
-      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
-      if not matched:
-        break
-
-      token = matched.group(1)
-      if token == '{':
-        # If namespace or class hasn't seen a opening brace yet, mark
-        # namespace/class head as complete.  Push a new block onto the
-        # stack otherwise.
-        if not self.SeenOpenBrace():
-          self.stack[-1].seen_open_brace = True
-        else:
-          self.stack.append(_BlockInfo(True))
-          if _MATCH_ASM.match(line):
-            self.stack[-1].inline_asm = _BLOCK_ASM
-      elif token == ';' or token == ')':
-        # If we haven't seen an opening brace yet, but we already saw
-        # a semicolon, this is probably a forward declaration.  Pop
-        # the stack for these.
-        #
-        # Similarly, if we haven't seen an opening brace yet, but we
-        # already saw a closing parenthesis, then these are probably
-        # function arguments with extra "class" or "struct" keywords.
-        # Also pop these stack for these.
-        if not self.SeenOpenBrace():
-          self.stack.pop()
-      else:  # token == '}'
-        # Perform end of block checks and pop the stack.
-        if self.stack:
-          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
-          self.stack.pop()
-      line = matched.group(2)
-
-  def InnermostClass(self):
-    """Get class info on the top of the stack.
-
-    Returns:
-      A _ClassInfo object if we are inside a class, or None otherwise.
-    """
-    for i in range(len(self.stack), 0, -1):
-      classinfo = self.stack[i - 1]
-      if isinstance(classinfo, _ClassInfo):
-        return classinfo
-    return None
-
-  def CheckCompletedBlocks(self, filename, error):
-    """Checks that all classes and namespaces have been completely parsed.
-
-    Call this when all lines in a file have been processed.
-    Args:
-      filename: The name of the current file.
-      error: The function to call with any errors found.
-    """
-    # Note: This test can result in false positives if #ifdef constructs
-    # get in the way of brace matching. See the testBuildClass test in
-    # cpplint_unittest.py for an example of this.
-    for obj in self.stack:
-      if isinstance(obj, _ClassInfo):
-        error(filename, obj.starting_linenum, 'build/class', 5,
-              'Failed to find complete declaration of class %s' %
-              obj.name)
-      elif isinstance(obj, _NamespaceInfo):
-        error(filename, obj.starting_linenum, 'build/namespaces', 5,
-              'Failed to find complete declaration of namespace %s' %
-              obj.name)
-
-
-def CheckForNonStandardConstructs(filename, clean_lines, linenum,
-                                  nesting_state, error):
-  r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
-
-  Complain about several constructs which gcc-2 accepts, but which are
-  not standard C++.  Warning about these in lint is one way to ease the
-  transition to new compilers.
-  - put storage class first (e.g. "static const" instead of "const static").
-  - "%lld" instead of %qd" in printf-type functions.
-  - "%1$d" is non-standard in printf-type functions.
-  - "\%" is an undefined character escape sequence.
-  - text after #endif is not allowed.
-  - invalid inner-style forward declaration.
-  - >? and <? operators, and their >?= and <?= cousins.
-
-  Additionally, check for constructor/destructor style violations and reference
-  members, as it is very convenient to do so while checking for
-  gcc-2 compliance.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: A callable to which errors are reported, which takes 4 arguments:
-           filename, line number, error level, and message
-  """
-
-  # Remove comments from the line, but leave in strings for now.
-  line = clean_lines.lines[linenum]
-
-  if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
-    error(filename, linenum, 'runtime/printf_format', 3,
-          '%q in format strings is deprecated.  Use %ll instead.')
-
-  if Search(r'printf\s*\(.*".*%\d+\$', line):
-    error(filename, linenum, 'runtime/printf_format', 2,
-          '%N$ formats are unconventional.  Try rewriting to avoid them.')
-
-  # Remove escaped backslashes before looking for undefined escapes.
-  line = line.replace('\\\\', '')
-
-  if Search(r'("|\').*\\(%|\[|\(|{)', line):
-    error(filename, linenum, 'build/printf_format', 3,
-          '%, [, (, and { are undefined character escapes.  Unescape them.')
-
-  # For the rest, work with both comments and strings removed.
-  line = clean_lines.elided[linenum]
-
-  if Search(r'\b(const|volatile|void|char|short|int|long'
-            r'|float|double|signed|unsigned'
-            r'|schar|u?int8|u?int16|u?int32|u?int64)'
-            r'\s+(register|static|extern|typedef)\b',
-            line):
-    error(filename, linenum, 'build/storage_class', 5,
-          'Storage class (static, extern, typedef, etc) should be first.')
-
-  if Match(r'\s*#\s*endif\s*[^/\s]+', line):
-    error(filename, linenum, 'build/endif_comment', 5,
-          'Uncommented text after #endif is non-standard.  Use a comment.')
-
-  if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
-    error(filename, linenum, 'build/forward_decl', 5,
-          'Inner-style forward declarations are invalid.  Remove this line.')
-
-  if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
-            line):
-    error(filename, linenum, 'build/deprecated', 3,
-          '>? and <? (max and min) operators are non-standard and deprecated.')
-
-  if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
-    # TODO(unknown): Could it be expanded safely to arbitrary references,
-    # without triggering too many false positives? The first
-    # attempt triggered 5 warnings for mostly benign code in the regtest, hence
-    # the restriction.
-    # Here's the original regexp, for the reference:
-    # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
-    # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
-    error(filename, linenum, 'runtime/member_string_references', 2,
-          'const string& members are dangerous. It is much better to use '
-          'alternatives, such as pointers or simple constants.')
-
-  # Everything else in this function operates on class declarations.
-  # Return early if the top of the nesting stack is not a class, or if
-  # the class head is not completed yet.
-  classinfo = nesting_state.InnermostClass()
-  if not classinfo or not classinfo.seen_open_brace:
-    return
-
-  # The class may have been declared with namespace or classname qualifiers.
-  # The constructor and destructor will not have those qualifiers.
-  base_classname = classinfo.name.split('::')[-1]
-
-  # Look for single-argument constructors that aren't marked explicit.
-  # Technically a valid construct, but against style.
-  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
-               % re.escape(base_classname),
-               line)
-  if (args and
-      args.group(1) != 'void' and
-      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
-                % re.escape(base_classname), args.group(1).strip())):
-    error(filename, linenum, 'runtime/explicit', 5,
-          'Single-argument constructors should be marked explicit.')
-
-
-def CheckSpacingForFunctionCall(filename, line, linenum, error):
-  """Checks for the correctness of various spacing around function calls.
-
-  Args:
-    filename: The name of the current file.
-    line: The text of the line to check.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-  # Since function calls often occur inside if/for/while/switch
-  # expressions - which have their own, more liberal conventions - we
-  # first see if we should be looking inside such an expression for a
-  # function call, to which we can apply more strict standards.
-  fncall = line    # if there's no control flow construct, look at whole line
-  for pattern in (r'\bif\s*\((.*)\)\s*{',
-                  r'\bfor\s*\((.*)\)\s*{',
-                  r'\bwhile\s*\((.*)\)\s*[{;]',
-                  r'\bswitch\s*\((.*)\)\s*{'):
-    match = Search(pattern, line)
-    if match:
-      fncall = match.group(1)    # look inside the parens for function calls
-      break
-
-  # Except in if/for/while/switch, there should never be space
-  # immediately inside parens (eg "f( 3, 4 )").  We make an exception
-  # for nested parens ( (a+b) + c ).  Likewise, there should never be
-  # a space before a ( when it's a function argument.  I assume it's a
-  # function argument when the char before the whitespace is legal in
-  # a function name (alnum + _) and we're not starting a macro. Also ignore
-  # pointers and references to arrays and functions coz they're too tricky:
-  # we use a very simple way to recognize these:
-  # " (something)(maybe-something)" or
-  # " (something)(maybe-something," or
-  # " (something)[something]"
-  # Note that we assume the contents of [] to be short enough that
-  # they'll never need to wrap.
-  if (  # Ignore control structures.
-      not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b',
-                 fncall) and
-      # Ignore pointers/references to functions.
-      not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and
-      # Ignore pointers/references to arrays.
-      not Search(r' \([^)]+\)\[[^\]]+\]', fncall)):
-    if Search(r'\w\s*\(\s(?!\s*\\$)', fncall):      # a ( used for a fn call
-      error(filename, linenum, 'whitespace/parens', 4,
-            'Extra space after ( in function call')
-    elif Search(r'\(\s+(?!(\s*\\)|\()', fncall):
-      error(filename, linenum, 'whitespace/parens', 2,
-            'Extra space after (')
-    if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall) and
-        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
-      error(filename, linenum, 'whitespace/parens', 4,
-            'Extra space before ( in function call')
-    # If the ) is followed only by a newline or a { + newline, assume it's
-    # part of a control statement (if/while/etc), and don't complain
-    if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
-      # If the closing parenthesis is preceded by only whitespaces,
-      # try to give a more descriptive error message.
-      if Search(r'^\s+\)', fncall):
-        error(filename, linenum, 'whitespace/parens', 2,
-              'Closing ) should be moved to the previous line')
-      else:
-        error(filename, linenum, 'whitespace/parens', 2,
-              'Extra space before )')
-
-
-def IsBlankLine(line):
-  """Returns true if the given line is blank.
-
-  We consider a line to be blank if the line is empty or consists of
-  only white spaces.
-
-  Args:
-    line: A line of a string.
-
-  Returns:
-    True, if the given line is blank.
-  """
-  return not line or line.isspace()
-
-
-def CheckForFunctionLengths(filename, clean_lines, linenum,
-                            function_state, error):
-  """Reports for long function bodies.
-
-  For an overview why this is done, see:
-  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
-
-  Uses a simplistic algorithm assuming other style guidelines
-  (especially spacing) are followed.
-  Only checks unindented functions, so class members are unchecked.
-  Trivial bodies are unchecked, so constructors with huge initializer lists
-  may be missed.
-  Blank/comment lines are not counted so as to avoid encouraging the removal
-  of vertical space and comments just to get through a lint check.
-  NOLINT *on the last line of a function* disables this check.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    function_state: Current function name and lines in body so far.
-    error: The function to call with any errors found.
-  """
-  lines = clean_lines.lines
-  line = lines[linenum]
-  raw = clean_lines.raw_lines
-  raw_line = raw[linenum]
-  joined_line = ''
-
-  starting_func = False
-  regexp = r'(\w(\w|::|\*|\&|\s)*)\('  # decls * & space::name( ...
-  match_result = Match(regexp, line)
-  if match_result:
-    # If the name is all caps and underscores, figure it's a macro and
-    # ignore it, unless it's TEST or TEST_F.
-    function_name = match_result.group(1).split()[-1]
-    if function_name == 'TEST' or function_name == 'TEST_F' or (
-        not Match(r'[A-Z_]+$', function_name)):
-      starting_func = True
-
-  if starting_func:
-    body_found = False
-    for start_linenum in xrange(linenum, clean_lines.NumLines()):
-      start_line = lines[start_linenum]
-      joined_line += ' ' + start_line.lstrip()
-      if Search(r'(;|})', start_line):  # Declarations and trivial functions
-        body_found = True
-        break                              # ... ignore
-      elif Search(r'{', start_line):
-        body_found = True
-        function = Search(r'((\w|:)*)\(', line).group(1)
-        if Match(r'TEST', function):    # Handle TEST... macros
-          parameter_regexp = Search(r'(\(.*\))', joined_line)
-          if parameter_regexp:             # Ignore bad syntax
-            function += parameter_regexp.group(1)
-        else:
-          function += '()'
-        function_state.Begin(function)
-        break
-    if not body_found:
-      # No body for the function (or evidence of a non-function) was found.
-      error(filename, linenum, 'readability/fn_size', 5,
-            'Lint failed to find start of function body.')
-  elif Match(r'^\}\s*$', line):  # function end
-    function_state.Check(error, filename, linenum)
-    function_state.End()
-  elif not Match(r'^\s*$', line):
-    function_state.Count()  # Count non-blank/non-comment lines.
-
-
-_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
-
-
-def CheckComment(comment, filename, linenum, error):
-  """Checks for common mistakes in TODO comments.
-
-  Args:
-    comment: The text of the comment from the line in question.
-    filename: The name of the current file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  match = _RE_PATTERN_TODO.match(comment)
-  if match:
-    # One whitespace is correct; zero whitespace is handled elsewhere.
-    leading_whitespace = match.group(1)
-    if len(leading_whitespace) > 1:
-      error(filename, linenum, 'whitespace/todo', 2,
-            'Too many spaces before TODO')
-
-    username = match.group(2)
-    if not username:
-      error(filename, linenum, 'readability/todo', 2,
-            'Missing username in TODO; it should look like '
-            '"// TODO(my_username): Stuff."')
-
-    middle_whitespace = match.group(3)
-    # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
-    if middle_whitespace != ' ' and middle_whitespace != '':
-      error(filename, linenum, 'whitespace/todo', 2,
-            'TODO(my_username) should be followed by a space')
-
-def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
-  """Checks for improper use of DISALLOW* macros.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
-                   r'DISALLOW_EVIL_CONSTRUCTORS|'
-                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
-  if not matched:
-    return
-  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
-    if nesting_state.stack[-1].access != 'private':
-      error(filename, linenum, 'readability/constructors', 3,
-            '%s must be in the private: section' % matched.group(1))
-
-  else:
-    # Found DISALLOW* macro outside a class declaration, or perhaps it
-    # was used inside a function when it should have been part of the
-    # class declaration.  We could issue a warning here, but it
-    # probably resulted in a compiler error already.
-    pass
-
-
-def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
-  """Find the corresponding > to close a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_suffix: Remainder of the current line after the initial <.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_suffix
-  nesting_stack = ['<']
-  while True:
-    # Find the next operator that can tell us whether < is used as an
-    # opening bracket or as a less-than operator.  We only want to
-    # warn on the latter case.
-    #
-    # We could also check all other operators and terminate the search
-    # early, e.g. if we got something like this "a<b+c", the "<" is
-    # most likely a less-than operator, but then we will get false
-    # positives for default arguments and other template expressions.
-    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(1)
-      line = match.group(2)
-
-      if nesting_stack[-1] == '<':
-        # Expecting closing angle bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator == '>':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma after a bracket, this is most likely a template
-          # argument.  We have not seen a closing angle bracket yet, but
-          # it's probably a few lines later if we look for it, so just
-          # return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting closing parenthesis or closing bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator in (')', ']'):
-          # We don't bother checking for matching () or [].  If we got
-          # something like (] or [), it would have been a syntax error.
-          nesting_stack.pop()
-
-    else:
-      # Scan the next line
-      linenum += 1
-      if linenum >= len(clean_lines.elided):
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all remaining lines and still no matching angle bracket.
-  # Most likely the input was incomplete, otherwise we should have
-  # seen a semicolon and returned early.
-  return True
-
-
-def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
-  """Find the corresponding < that started a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_prefix: Part of the current line before the initial >.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_prefix
-  nesting_stack = ['>']
-  while True:
-    # Find the previous operator
-    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(2)
-      line = match.group(1)
-
-      if nesting_stack[-1] == '>':
-        # Expecting opening angle bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator == '<':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma before a bracket, this is most likely a
-          # template argument.  The opening angle bracket is probably
-          # there if we look for it, so just return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting opening parenthesis or opening bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator in ('(', '['):
-          nesting_stack.pop()
-
-    else:
-      # Scan the previous line
-      linenum -= 1
-      if linenum < 0:
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all earlier lines and still no matching angle bracket.
-  return False
-
-
-def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
-  """Checks for the correctness of various spacing issues in the code.
-
-  Things we check for: spaces around operators, spaces after
-  if/for/while/switch, no spaces around parens in function calls, two
-  spaces between code and comment, don't start a block with a blank
-  line, don't end a function with a blank line, don't add a blank line
-  after public/protected/private, don't have too many blank lines in a row.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-
-  # Don't use "elided" lines here, otherwise we can't check commented lines.
-  # Don't want to use "raw" either, because we don't want to check inside C++11
-  # raw strings,
-  raw = clean_lines.lines_without_raw_strings
-  line = raw[linenum]
-
-  # Before nixing comments, check if the line is blank for no good
-  # reason.  This includes the first line after a block is opened, and
-  # blank lines at the end of a function (ie, right before a line like '}'
-  #
-  # Skip all the blank line checks if we are immediately inside a
-  # namespace body.  In other words, don't issue blank line warnings
-  # for this block:
-  #   namespace {
-  #
-  #   }
-  #
-  # A warning about missing end of namespace comments will be issued instead.
-  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
-    elided = clean_lines.elided
-    prev_line = elided[linenum - 1]
-    prevbrace = prev_line.rfind('{')
-    # TODO(unknown): Don't complain if line before blank line, and line after,
-    #                both start with alnums and are indented the same amount.
-    #                This ignores whitespace at the start of a namespace block
-    #                because those are not usually indented.
-    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
-      # OK, we have a blank line at the start of a code block.  Before we
-      # complain, we check if it is an exception to the rule: The previous
-      # non-empty line has the parameters of a function header that are indented
-      # 4 spaces (because they did not fit in a 80 column line when placed on
-      # the same line as the function name).  We also check for the case where
-      # the previous line is indented 6 spaces, which may happen when the
-      # initializers of a constructor do not fit into a 80 column line.
-      exception = False
-      if Match(r' {6}\w', prev_line):  # Initializer list?
-        # We are looking for the opening column of initializer list, which
-        # should be indented 4 spaces to cause 6 space indentation afterwards.
-        search_position = linenum-2
-        while (search_position >= 0
-               and Match(r' {6}\w', elided[search_position])):
-          search_position -= 1
-        exception = (search_position >= 0
-                     and elided[search_position][:5] == '    :')
-      else:
-        # Search for the function arguments or an initializer list.  We use a
-        # simple heuristic here: If the line is indented 4 spaces; and we have a
-        # closing paren, without the opening paren, followed by an opening brace
-        # or colon (for initializer lists) we assume that it is the last line of
-        # a function header.  If we have a colon indented 4 spaces, it is an
-        # initializer list.
-        exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)',
-                           prev_line)
-                     or Match(r' {4}:', prev_line))
-
-      if not exception:
-        error(filename, linenum, 'whitespace/blank_line', 2,
-              'Redundant blank line at the start of a code block '
-              'should be deleted.')
-    # Ignore blank lines at the end of a block in a long if-else
-    # chain, like this:
-    #   if (condition1) {
-    #     // Something followed by a blank line
-    #
-    #   } else if (condition2) {
-    #     // Something else
-    #   }
-    if linenum + 1 < clean_lines.NumLines():
-      next_line = raw[linenum + 1]
-      if (next_line
-          and Match(r'\s*}', next_line)
-          and next_line.find('} else ') == -1):
-        error(filename, linenum, 'whitespace/blank_line', 3,
-              'Redundant blank line at the end of a code block '
-              'should be deleted.')
-
-    matched = Match(r'\s*(public|protected|private):', prev_line)
-    if matched:
-      error(filename, linenum, 'whitespace/blank_line', 3,
-            'Do not leave a blank line after "%s:"' % matched.group(1))
-
-  # Next, we complain if there's a comment too near the text
-  commentpos = line.find('//')
-  if commentpos != -1:
-    # Check if the // may be in quotes.  If so, ignore it
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if (line.count('"', 0, commentpos) -
-        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
-      # Allow one space for new scopes, two spaces otherwise:
-      if (not Match(r'^\s*{ //', line) and
-          ((commentpos >= 1 and
-            line[commentpos-1] not in string.whitespace) or
-           (commentpos >= 2 and
-            line[commentpos-2] not in string.whitespace))):
-        error(filename, linenum, 'whitespace/comments', 2,
-              'At least two spaces is best between code and comments')
-      # There should always be a space between the // and the comment
-      commentend = commentpos + 2
-      if commentend < len(line) and not line[commentend] == ' ':
-        # but some lines are exceptions -- e.g. if they're big
-        # comment delimiters like:
-        # //----------------------------------------------------------
-        # or are an empty C++ style Doxygen comment, like:
-        # ///
-        # or C++ style Doxygen comments placed after the variable:
-        # ///<  Header comment
-        # //!<  Header comment
-        # or they begin with multiple slashes followed by a space:
-        # //////// Header comment
-        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
-                 Search(r'^/$', line[commentend:]) or
-                 Search(r'^!< ', line[commentend:]) or
-                 Search(r'^/< ', line[commentend:]) or
-                 Search(r'^/+ ', line[commentend:]))
-        if not match:
-          error(filename, linenum, 'whitespace/comments', 4,
-                'Should have a space between // and comment')
-      CheckComment(line[commentpos:], filename, linenum, error)
-
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  # Don't try to do spacing checks for operator methods
-  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
-
-  # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
-  # Otherwise not.  Note we only check for non-spaces on *both* sides;
-  # sometimes people put non-spaces on one side when aligning ='s among
-  # many lines (not that this is behavior that I approve of...)
-  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
-    error(filename, linenum, 'whitespace/operators', 4,
-          'Missing spaces around =')
-
-  # It's ok not to have spaces around binary operators like + - * /, but if
-  # there's too little whitespace, we get concerned.  It's hard to tell,
-  # though, so we punt on this one for now.  TODO.
-
-  # You should always have whitespace around binary operators.
-  #
-  # Check <= and >= first to avoid false positives with < and >, then
-  # check non-include lines for spacing around < and >.
-  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
-  if match:
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << when used like this: 10<<20, but
-  # not otherwise (particularly, not when used as streams)
-  # Also ignore using ns::operator<<;
-  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
-  if (match and
-      not (match.group(1).isdigit() and match.group(2).isdigit()) and
-      not (match.group(1) == 'operator' and match.group(2) == ';')):
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around <<')
-  elif not Match(r'#.*include', line):
-    # Avoid false positives on ->
-    reduced_line = line.replace('->', '')
-
-    # Look for < that is not surrounded by spaces.  This is only
-    # triggered if both sides are missing spaces, even though
-    # technically should should flag if at least one side is missing a
-    # space.  This is done to avoid some false positives with shifts.
-    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
-    if (match and
-        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around <')
-
-    # Look for > that is not surrounded by spaces.  Similar to the
-    # above, we only trigger if both sides are missing spaces to avoid
-    # false positives with shifts.
-    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
-    if (match and
-        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
-                                             match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around >')
-
-  # We allow no-spaces around >> for almost anything.  This is because
-  # C++11 allows ">>" to close nested templates, which accounts for
-  # most cases when ">>" is not followed by a space.
-  #
-  # We still warn on ">>" followed by alpha character, because that is
-  # likely due to ">>" being used for right shifts, e.g.:
-  #   value >> alpha
-  #
-  # When ">>" is used to close templates, the alphanumeric letter that
-  # follows would be part of an identifier, and there should still be
-  # a space separating the template type and the identifier.
-  #   type<type<type>> alpha
-  match = Search(r'>>[a-zA-Z_]', line)
-  if match:
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around >>')
-
-  # There shouldn't be space around unary operators
-  match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
-  if match:
-    error(filename, linenum, 'whitespace/operators', 4,
-          'Extra space for operator %s' % match.group(1))
-
-  # A pet peeve of mine: no spaces after an if, while, switch, or for
-  match = Search(r' (if\(|for\(|while\(|switch\()', line)
-  if match:
-    error(filename, linenum, 'whitespace/parens', 5,
-          'Missing space before ( in %s' % match.group(1))
-
-  # For if/for/while/switch, the left and right parens should be
-  # consistent about how many spaces are inside the parens, and
-  # there should either be zero or one spaces inside the parens.
-  # We don't want: "if ( foo)" or "if ( foo   )".
-  # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
-  match = Search(r'\b(if|for|while|switch)\s*'
-                 r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$',
-                 line)
-  if match:
-    if len(match.group(2)) != len(match.group(4)):
-      if not (match.group(3) == ';' and
-              len(match.group(2)) == 1 + len(match.group(4)) or
-              not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)):
-        error(filename, linenum, 'whitespace/parens', 5,
-              'Mismatching spaces inside () in %s' % match.group(1))
-    if len(match.group(2)) not in [0, 1]:
-      error(filename, linenum, 'whitespace/parens', 5,
-            'Should have zero or one spaces inside ( and ) in %s' %
-            match.group(1))
-
-  # You should always have a space after a comma (either as fn arg or operator)
-  #
-  # This does not apply when the non-space character following the
-  # comma is another comma, since the only time when that happens is
-  # for empty macro arguments.
-  #
-  # We run this check in two passes: first pass on elided lines to
-  # verify that lines contain missing whitespaces, second pass on raw
-  # lines to confirm that those missing whitespaces are not due to
-  # elided comments.
-  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
-    error(filename, linenum, 'whitespace/comma', 3,
-          'Missing space after ,')
-
-  # You should always have a space after a semicolon
-  # except for few corner cases
-  # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more
-  # space after ;
-  if Search(r';[^\s};\\)/]', line):
-    error(filename, linenum, 'whitespace/semicolon', 3,
-          'Missing space after ;')
-
-  # Next we will look for issues with function calls.
-  CheckSpacingForFunctionCall(filename, line, linenum, error)
-
-  # Except after an opening paren, or after another opening brace (in case of
-  # an initializer list, for instance), you should have spaces before your
-  # braces. And since you should never have braces at the beginning of a line,
-  # this is an easy test.
-  match = Match(r'^(.*[^ ({]){', line)
-  if match:
-    # Try a bit harder to check for brace initialization.  This
-    # happens in one of the following forms:
-    #   Constructor() : initializer_list_{} { ... }
-    #   Constructor{}.MemberFunction()
-    #   Type variable{};
-    #   FunctionCall(type{}, ...);
-    #   LastArgument(..., type{});
-    #   LOG(INFO) << type{} << " ...";
-    #   map_of_type[{...}] = ...;
-    #
-    # We check for the character following the closing brace, and
-    # silence the warning if it's one of those listed above, i.e.
-    # "{.;,)<]".
-    #
-    # To account for nested initializer list, we allow any number of
-    # closing braces up to "{;,)<".  We can't simply silence the
-    # warning on first sight of closing brace, because that would
-    # cause false negatives for things that are not initializer lists.
-    #   Silence this:         But not this:
-    #     Outer{                if (...) {
-    #       Inner{...}            if (...){  // Missing space before {
-    #     };                    }
-    #
-    # There is a false negative with this approach if people inserted
-    # spurious semicolons, e.g. "if (cond){};", but we will catch the
-    # spurious semicolon with a separate check.
-    (endline, endlinenum, endpos) = CloseExpression(
-        clean_lines, linenum, len(match.group(1)))
-    trailing_text = ''
-    if endpos > -1:
-      trailing_text = endline[endpos:]
-    for offset in xrange(endlinenum + 1,
-                         min(endlinenum + 3, clean_lines.NumLines() - 1)):
-      trailing_text += clean_lines.elided[offset]
-    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
-      error(filename, linenum, 'whitespace/braces', 5,
-            'Missing space before {')
-
-  # Make sure '} else {' has spaces.
-  if Search(r'}else', line):
-    error(filename, linenum, 'whitespace/braces', 5,
-          'Missing space before else')
-
-  # You shouldn't have spaces before your brackets, except maybe after
-  # 'delete []' or 'new char * []'.
-  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
-    error(filename, linenum, 'whitespace/braces', 5,
-          'Extra space before [')
-
-  # You shouldn't have a space before a semicolon at the end of the line.
-  # There's a special case for "for" since the style guide allows space before
-  # the semicolon there.
-  if Search(r':\s*;\s*$', line):
-    error(filename, linenum, 'whitespace/semicolon', 5,
-          'Semicolon defining empty statement. Use {} instead.')
-  elif Search(r'^\s*;\s*$', line):
-    error(filename, linenum, 'whitespace/semicolon', 5,
-          'Line contains only semicolon. If this should be an empty statement, '
-          'use {} instead.')
-  elif (Search(r'\s+;\s*$', line) and
-        not Search(r'\bfor\b', line)):
-    error(filename, linenum, 'whitespace/semicolon', 5,
-          'Extra space before last semicolon. If this should be an empty '
-          'statement, use {} instead.')
-
-  # In range-based for, we wanted spaces before and after the colon, but
-  # not around "::" tokens that might appear.
-  if (Search('for *\(.*[^:]:[^: ]', line) or
-      Search('for *\(.*[^: ]:[^:]', line)):
-    error(filename, linenum, 'whitespace/forcolon', 2,
-          'Missing space around colon in range-based for loop')
-
-
-def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
-  """Checks for additional blank line issues related to sections.
-
-  Currently the only thing checked here is blank line before protected/private.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    class_info: A _ClassInfo objects.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  # Skip checks if the class is small, where small means 25 lines or less.
-  # 25 lines seems like a good cutoff since that's the usual height of
-  # terminals, and any class that can't fit in one screen can't really
-  # be considered "small".
-  #
-  # Also skip checks if we are on the first line.  This accounts for
-  # classes that look like
-  #   class Foo { public: ... };
-  #
-  # If we didn't find the end of the class, last_line would be zero,
-  # and the check will be skipped by the first condition.
-  if (class_info.last_line - class_info.starting_linenum <= 24 or
-      linenum <= class_info.starting_linenum):
-    return
-
-  matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
-  if matched:
-    # Issue warning if the line before public/protected/private was
-    # not a blank line, but don't do this if the previous line contains
-    # "class" or "struct".  This can happen two ways:
-    #  - We are at the beginning of the class.
-    #  - We are forward-declaring an inner class that is semantically
-    #    private, but needed to be public for implementation reasons.
-    # Also ignores cases where the previous line ends with a backslash as can be
-    # common when defining classes in C macros.
-    prev_line = clean_lines.lines[linenum - 1]
-    if (not IsBlankLine(prev_line) and
-        not Search(r'\b(class|struct)\b', prev_line) and
-        not Search(r'\\$', prev_line)):
-      # Try a bit harder to find the beginning of the class.  This is to
-      # account for multi-line base-specifier lists, e.g.:
-      #   class Derived
-      #       : public Base {
-      end_class_head = class_info.starting_linenum
-      for i in range(class_info.starting_linenum, linenum):
-        if Search(r'\{\s*$', clean_lines.lines[i]):
-          end_class_head = i
-          break
-      if end_class_head < linenum - 1:
-        error(filename, linenum, 'whitespace/blank_line', 3,
-              '"%s:" should be preceded by a blank line' % matched.group(1))
-
-
-def GetPreviousNonBlankLine(clean_lines, linenum):
-  """Return the most recent non-blank line and its line number.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file contents.
-    linenum: The number of the line to check.
-
-  Returns:
-    A tuple with two elements.  The first element is the contents of the last
-    non-blank line before the current line, or the empty string if this is the
-    first non-blank line.  The second is the line number of that line, or -1
-    if this is the first non-blank line.
-  """
-
-  prevlinenum = linenum - 1
-  while prevlinenum >= 0:
-    prevline = clean_lines.elided[prevlinenum]
-    if not IsBlankLine(prevline):     # if not a blank line...
-      return (prevline, prevlinenum)
-    prevlinenum -= 1
-  return ('', -1)
-
-
-def CheckBraces(filename, clean_lines, linenum, error):
-  """Looks for misplaced braces (e.g. at the end of line).
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-  line = clean_lines.elided[linenum]        # get rid of comments and strings
-
-  if Match(r'\s*{\s*$', line):
-    # We allow an open brace to start a line in the case where someone is using
-    # braces in a block to explicitly create a new scope, which is commonly used
-    # to control the lifetime of stack-allocated variables.  Braces are also
-    # used for brace initializers inside function calls.  We don't detect this
-    # perfectly: we just don't complain if the last non-whitespace character on
-    # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
-    # previous line starts a preprocessor block.
-    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-    if (not Search(r'[,;:}{(]\s*$', prevline) and
-        not Match(r'\s*#', prevline)):
-      error(filename, linenum, 'whitespace/braces', 4,
-            '{ should almost always be at the end of the previous line')
-
-  # An else clause should be on the same line as the preceding closing brace.
-  if Match(r'\s*else\s*', line):
-    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-    if Match(r'\s*}\s*$', prevline):
-      error(filename, linenum, 'whitespace/newline', 4,
-            'An else should appear on the same line as the preceding }')
-
-  # If braces come on one side of an else, they should be on both.
-  # However, we have to worry about "else if" that spans multiple lines!
-  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
-    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
-      # find the ( after the if
-      pos = line.find('else if')
-      pos = line.find('(', pos)
-      if pos > 0:
-        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
-        if endline[endpos:].find('{') == -1:    # must be brace after if
-          error(filename, linenum, 'readability/braces', 5,
-                'If an else has a brace on one side, it should have it on both')
-    else:            # common case: else not followed by a multi-line if
-      error(filename, linenum, 'readability/braces', 5,
-            'If an else has a brace on one side, it should have it on both')
-
-  # Likewise, an else should never have the else clause on the same line
-  if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
-    error(filename, linenum, 'whitespace/newline', 4,
-          'Else clause should never be on same line as else (use 2 lines)')
-
-  # In the same way, a do/while should never be on one line
-  if Match(r'\s*do [^\s{]', line):
-    error(filename, linenum, 'whitespace/newline', 4,
-          'do/while clauses should not be on a single line')
-
-  # Block bodies should not be followed by a semicolon.  Due to C++11
-  # brace initialization, there are more places where semicolons are
-  # required than not, so we use a whitelist approach to check these
-  # rather than a blacklist.  These are the places where "};" should
-  # be replaced by just "}":
-  # 1. Some flavor of block following closing parenthesis:
-  #    for (;;) {};
-  #    while (...) {};
-  #    switch (...) {};
-  #    Function(...) {};
-  #    if (...) {};
-  #    if (...) else if (...) {};
-  #
-  # 2. else block:
-  #    if (...) else {};
-  #
-  # 3. const member function:
-  #    Function(...) const {};
-  #
-  # 4. Block following some statement:
-  #    x = 42;
-  #    {};
-  #
-  # 5. Block at the beginning of a function:
-  #    Function(...) {
-  #      {};
-  #    }
-  #
-  #    Note that naively checking for the preceding "{" will also match
-  #    braces inside multi-dimensional arrays, but this is fine since
-  #    that expression will not contain semicolons.
-  #
-  # 6. Block following another block:
-  #    while (true) {}
-  #    {};
-  #
-  # 7. End of namespaces:
-  #    namespace {};
-  #
-  #    These semicolons seems far more common than other kinds of
-  #    redundant semicolons, possibly due to people converting classes
-  #    to namespaces.  For now we do not warn for this case.
-  #
-  # Try matching case 1 first.
-  match = Match(r'^(.*\)\s*)\{', line)
-  if match:
-    # Matched closing parenthesis (case 1).  Check the token before the
-    # matching opening parenthesis, and don't warn if it looks like a
-    # macro.  This avoids these false positives:
-    #  - macro that defines a base class
-    #  - multi-line macro that defines a base class
-    #  - macro that defines the whole class-head
-    #
-    # But we still issue warnings for macros that we know are safe to
-    # warn, specifically:
-    #  - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P
-    #  - TYPED_TEST
-    #  - INTERFACE_DEF
-    #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
-    #
-    # We implement a whitelist of safe macros instead of a blacklist of
-    # unsafe macros, even though the latter appears less frequently in
-    # google code and would have been easier to implement.  This is because
-    # the downside for getting the whitelist wrong means some extra
-    # semicolons, while the downside for getting the blacklist wrong
-    # would result in compile errors.
-    #
-    # In addition to macros, we also don't want to warn on compound
-    # literals.
-    closing_brace_pos = match.group(1).rfind(')')
-    opening_parenthesis = ReverseCloseExpression(
-        clean_lines, linenum, closing_brace_pos)
-    if opening_parenthesis[2] > -1:
-      line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
-      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
-      if ((macro and
-           macro.group(1) not in (
-               'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
-               'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
-               'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
-          Search(r'\s+=\s*$', line_prefix)):
-        match = None
-    # Whitelist lambda function definition which also requires a ";" after
-    # closing brace
-    if match:
-        if Match(r'^.*\[.*\]\s*(.*\)\s*)\{', line):
-            match = None
-
-  else:
-    # Try matching cases 2-3.
-    match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line)
-    if not match:
-      # Try matching cases 4-6.  These are always matched on separate lines.
-      #
-      # Note that we can't simply concatenate the previous line to the
-      # current line and do a single match, otherwise we may output
-      # duplicate warnings for the blank line case:
-      #   if (cond) {
-      #     // blank line
-      #   }
-      prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-      if prevline and Search(r'[;{}]\s*$', prevline):
-        match = Match(r'^(\s*)\{', line)
-
-  # Check matching closing brace
-  if match:
-    (endline, endlinenum, endpos) = CloseExpression(
-        clean_lines, linenum, len(match.group(1)))
-    if endpos > -1 and Match(r'^\s*;', endline[endpos:]):
-      # Current {} pair is eligible for semicolon check, and we have found
-      # the redundant semicolon, output warning here.
-      #
-      # Note: because we are scanning forward for opening braces, and
-      # outputting warnings for the matching closing brace, if there are
-      # nested blocks with trailing semicolons, we will get the error
-      # messages in reversed order.
-      error(filename, endlinenum, 'readability/braces', 4,
-            "You don't need a ; after a }")
-
-
-def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
-  """Look for empty loop/conditional body with only a single semicolon.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-  # Search for loop keywords at the beginning of the line.  Because only
-  # whitespaces are allowed before the keywords, this will also ignore most
-  # do-while-loops, since those lines should start with closing brace.
-  #
-  # We also check "if" blocks here, since an empty conditional block
-  # is likely an error.
-  line = clean_lines.elided[linenum]
-  matched = Match(r'\s*(for|while|if)\s*\(', line)
-  if matched:
-    # Find the end of the conditional expression
-    (end_line, end_linenum, end_pos) = CloseExpression(
-        clean_lines, linenum, line.find('('))
-
-    # Output warning if what follows the condition expression is a semicolon.
-    # No warning for all other cases, including whitespace or newline, since we
-    # have a separate check for semicolons preceded by whitespace.
-    if end_pos >= 0 and Match(r';', end_line[end_pos:]):
-      if matched.group(1) == 'if':
-        error(filename, end_linenum, 'whitespace/empty_conditional_body', 5,
-              'Empty conditional bodies should use {}')
-      else:
-        error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
-              'Empty loop bodies should use {} or continue')
-
-
-def CheckCheck(filename, clean_lines, linenum, error):
-  """Checks the use of CHECK and EXPECT macros.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-
-  # Decide the set of replacement macros that should be suggested
-  lines = clean_lines.elided
-  check_macro = None
-  start_pos = -1
-  for macro in _CHECK_MACROS:
-    i = lines[linenum].find(macro)
-    if i >= 0:
-      check_macro = macro
-
-      # Find opening parenthesis.  Do a regular expression match here
-      # to make sure that we are matching the expected CHECK macro, as
-      # opposed to some other macro that happens to contain the CHECK
-      # substring.
-      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
-      if not matched:
-        continue
-      start_pos = len(matched.group(1))
-      break
-  if not check_macro or start_pos < 0:
-    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
-    return
-
-  # Find end of the boolean expression by matching parentheses
-  (last_line, end_line, end_pos) = CloseExpression(
-      clean_lines, linenum, start_pos)
-  if end_pos < 0:
-    return
-  if linenum == end_line:
-    expression = lines[linenum][start_pos + 1:end_pos - 1]
-  else:
-    expression = lines[linenum][start_pos + 1:]
-    for i in xrange(linenum + 1, end_line):
-      expression += lines[i]
-    expression += last_line[0:end_pos - 1]
-
-  # Parse expression so that we can take parentheses into account.
-  # This avoids false positives for inputs like "CHECK((a < 4) == b)",
-  # which is not replaceable by CHECK_LE.
-  lhs = ''
-  rhs = ''
-  operator = None
-  while expression:
-    matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||'
-                    r'==|!=|>=|>|<=|<|\()(.*)$', expression)
-    if matched:
-      token = matched.group(1)
-      if token == '(':
-        # Parenthesized operand
-        expression = matched.group(2)
-        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
-        if end < 0:
-          return  # Unmatched parenthesis
-        lhs += '(' + expression[0:end]
-        expression = expression[end:]
-      elif token in ('&&', '||'):
-        # Logical and/or operators.  This means the expression
-        # contains more than one term, for example:
-        #   CHECK(42 < a && a < b);
-        #
-        # These are not replaceable with CHECK_LE, so bail out early.
-        return
-      elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'):
-        # Non-relational operator
-        lhs += token
-        expression = matched.group(2)
-      else:
-        # Relational operator
-        operator = token
-        rhs = matched.group(2)
-        break
-    else:
-      # Unparenthesized operand.  Instead of appending to lhs one character
-      # at a time, we do another regular expression match to consume several
-      # characters at once if possible.  Trivial benchmark shows that this
-      # is more efficient when the operands are longer than a single
-      # character, which is generally the case.
-      matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression)
-      if not matched:
-        matched = Match(r'^(\s*\S)(.*)$', expression)
-        if not matched:
-          break
-      lhs += matched.group(1)
-      expression = matched.group(2)
-
-  # Only apply checks if we got all parts of the boolean expression
-  if not (lhs and operator and rhs):
-    return
-
-  # Check that rhs do not contain logical operators.  We already know
-  # that lhs is fine since the loop above parses out && and ||.
-  if rhs.find('&&') > -1 or rhs.find('||') > -1:
-    return
-
-  # At least one of the operands must be a constant literal.  This is
-  # to avoid suggesting replacements for unprintable things like
-  # CHECK(variable != iterator)
-  #
-  # The following pattern matches decimal, hex integers, strings, and
-  # characters (in that order).
-  lhs = lhs.strip()
-  rhs = rhs.strip()
-  match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
-  if Match(match_constant, lhs) or Match(match_constant, rhs):
-    # Note: since we know both lhs and rhs, we can provide a more
-    # descriptive error message like:
-    #   Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42)
-    # Instead of:
-    #   Consider using CHECK_EQ instead of CHECK(a == b)
-    #
-    # We are still keeping the less descriptive message because if lhs
-    # or rhs gets long, the error message might become unreadable.
-    error(filename, linenum, 'readability/check', 2,
-          'Consider using %s instead of %s(a %s b)' % (
-              _CHECK_REPLACEMENT[check_macro][operator],
-              check_macro, operator))
-
-
-def CheckAltTokens(filename, clean_lines, linenum, error):
-  """Check alternative keywords being used in boolean expressions.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]
-
-  # Avoid preprocessor lines
-  if Match(r'^\s*#', line):
-    return
-
-  # Last ditch effort to avoid multi-line comments.  This will not help
-  # if the comment started before the current line or ended after the
-  # current line, but it catches most of the false positives.  At least,
-  # it provides a way to workaround this warning for people who use
-  # multi-line comments in preprocessor macros.
-  #
-  # TODO(unknown): remove this once cpplint has better support for
-  # multi-line comments.
-  if line.find('/*') >= 0 or line.find('*/') >= 0:
-    return
-
-  for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
-    error(filename, linenum, 'readability/alt_tokens', 2,
-          'Use operator %s instead of %s' % (
-              _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1)))
-
-
-def GetLineWidth(line):
-  """Determines the width of the line in column positions.
-
-  Args:
-    line: A string, which may be a Unicode string.
-
-  Returns:
-    The width of the line in column positions, accounting for Unicode
-    combining characters and wide characters.
-  """
-  if isinstance(line, unicode):
-    width = 0
-    for uc in unicodedata.normalize('NFC', line):
-      if unicodedata.east_asian_width(uc) in ('W', 'F'):
-        width += 2
-      elif not unicodedata.combining(uc):
-        width += 1
-    return width
-  else:
-    return len(line)
-
-
-def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
-               error):
-  """Checks rules from the 'C++ style rules' section of cppguide.html.
-
-  Most of these rules are hard to test (naming, comment style), but we
-  do what we can.  In particular we check for 2-space indents, line lengths,
-  tab usage, spaces inside code, etc.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    file_extension: The extension (without the dot) of the filename.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-
-  # Don't use "elided" lines here, otherwise we can't check commented lines.
-  # Don't want to use "raw" either, because we don't want to check inside C++11
-  # raw strings,
-  raw_lines = clean_lines.lines_without_raw_strings
-  line = raw_lines[linenum]
-
-  if line.find('\t') != -1:
-    error(filename, linenum, 'whitespace/tab', 1,
-          'Tab found; better to use spaces')
-
-  # One or three blank spaces at the beginning of the line is weird; it's
-  # hard to reconcile that with 2-space indents.
-  # NOTE: here are the conditions rob pike used for his tests.  Mine aren't
-  # as sophisticated, but it may be worth becoming so:  RLENGTH==initial_spaces
-  # if(RLENGTH > 20) complain = 0;
-  # if(match($0, " +(error|private|public|protected):")) complain = 0;
-  # if(match(prev, "&& *$")) complain = 0;
-  # if(match(prev, "\\|\\| *$")) complain = 0;
-  # if(match(prev, "[\",=><] *$")) complain = 0;
-  # if(match($0, " <<")) complain = 0;
-  # if(match(prev, " +for \\(")) complain = 0;
-  # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
-  initial_spaces = 0
-  cleansed_line = clean_lines.elided[linenum]
-  while initial_spaces < len(line) and line[initial_spaces] == ' ':
-    initial_spaces += 1
-  if line and line[-1].isspace():
-    error(filename, linenum, 'whitespace/end_of_line', 4,
-          'Line ends in whitespace.  Consider deleting these extra spaces.')
-  # There are certain situations we allow one space, notably for section labels
-  elif ((initial_spaces == 1 or initial_spaces == 3) and
-        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
-    error(filename, linenum, 'whitespace/indent', 3,
-          'Weird number of spaces at line-start.  '
-          'Are you using a 2-space indent?')
-
-  # Check if the line is a header guard.
-  is_header_guard = False
-  if file_extension == 'h':
-    cppvar = GetHeaderGuardCPPVariable(filename)
-    if (line.startswith('#ifndef %s' % cppvar) or
-        line.startswith('#define %s' % cppvar) or
-        line.startswith('#endif  // %s' % cppvar)):
-      is_header_guard = True
-  # #include lines and header guards can be long, since there's no clean way to
-  # split them.
-  #
-  # URLs can be long too.  It's possible to split these, but it makes them
-  # harder to cut&paste.
-  #
-  # The "$Id:...$" comment may also get very long without it being the
-  # developers fault.
-  if (not line.startswith('#include') and not is_header_guard and
-      not Match(r'^\s*//.*http(s?)://\S*$', line) and
-      not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
-    line_width = GetLineWidth(line)
-    extended_length = int((_line_length * 1.25))
-    if line_width > extended_length:
-      error(filename, linenum, 'whitespace/line_length', 4,
-            'Lines should very rarely be longer than %i characters' %
-            extended_length)
-    elif line_width > _line_length:
-      error(filename, linenum, 'whitespace/line_length', 2,
-            'Lines should be <= %i characters long' % _line_length)
-
-  if (cleansed_line.count(';') > 1 and
-      # for loops are allowed two ;'s (and may run over two lines).
-      cleansed_line.find('for') == -1 and
-      (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or
-       GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and
-      # It's ok to have many commands in a switch case that fits in 1 line
-      not ((cleansed_line.find('case ') != -1 or
-            cleansed_line.find('default:') != -1) and
-           cleansed_line.find('break;') != -1)):
-    error(filename, linenum, 'whitespace/newline', 0,
-          'More than one command on the same line')
-
-  # Some more style checks
-  CheckBraces(filename, clean_lines, linenum, error)
-  CheckEmptyBlockBody(filename, clean_lines, linenum, error)
-  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
-  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
-  CheckCheck(filename, clean_lines, linenum, error)
-  CheckAltTokens(filename, clean_lines, linenum, error)
-  classinfo = nesting_state.InnermostClass()
-  if classinfo:
-    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
-
-
-_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
-_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
-# Matches the first component of a filename delimited by -s and _s. That is:
-#  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
-#  _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo'
-#  _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo'
-#  _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo'
-_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+')
-
-
-def _DropCommonSuffixes(filename):
-  """Drops common suffixes like _test.cc or -inl.h from filename.
-
-  For example:
-    >>> _DropCommonSuffixes('foo/foo-inl.h')
-    'foo/foo'
-    >>> _DropCommonSuffixes('foo/bar/foo.cc')
-    'foo/bar/foo'
-    >>> _DropCommonSuffixes('foo/foo_internal.h')
-    'foo/foo'
-    >>> _DropCommonSuffixes('foo/foo_unusualinternal.h')
-    'foo/foo_unusualinternal'
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    The filename with the common suffix removed.
-  """
-  for suffix in ('test.cc', 'regtest.cc', 'unittest.cc',
-                 'inl.h', 'impl.h', 'internal.h'):
-    if (filename.endswith(suffix) and len(filename) > len(suffix) and
-        filename[-len(suffix) - 1] in ('-', '_')):
-      return filename[:-len(suffix) - 1]
-  return os.path.splitext(filename)[0]
-
-
-def _IsTestFilename(filename):
-  """Determines if the given filename has a suffix that identifies it as a test.
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    True if 'filename' looks like a test, False otherwise.
-  """
-  if (filename.endswith('_test.cc') or
-      filename.endswith('_unittest.cc') or
-      filename.endswith('_regtest.cc')):
-    return True
-  else:
-    return False
-
-
-def _ClassifyInclude(fileinfo, include, is_system):
-  """Figures out what kind of header 'include' is.
-
-  Args:
-    fileinfo: The current file cpplint is running over. A FileInfo instance.
-    include: The path to a #included file.
-    is_system: True if the #include used <> rather than "".
-
-  Returns:
-    One of the _XXX_HEADER constants.
-
-  For example:
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True)
-    _C_SYS_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True)
-    _CPP_SYS_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False)
-    _LIKELY_MY_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'),
-    ...                  'bar/foo_other_ext.h', False)
-    _POSSIBLE_MY_HEADER
-    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False)
-    _OTHER_HEADER
-  """
-  # This is a list of all standard c++ header files, except
-  # those already checked for above.
-  is_cpp_h = include in _CPP_HEADERS
-
-  if is_system:
-    if is_cpp_h:
-      return _CPP_SYS_HEADER
-    else:
-      return _C_SYS_HEADER
-
-  # If the target file and the include we're checking share a
-  # basename when we drop common extensions, and the include
-  # lives in . , then it's likely to be owned by the target file.
-  target_dir, target_base = (
-      os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName())))
-  include_dir, include_base = os.path.split(_DropCommonSuffixes(include))
-  if target_base == include_base and (
-      include_dir == target_dir or
-      include_dir == os.path.normpath(target_dir + '/../public')):
-    return _LIKELY_MY_HEADER
-
-  # If the target and include share some initial basename
-  # component, it's possible the target is implementing the
-  # include, so it's allowed to be first, but we'll never
-  # complain if it's not there.
-  target_first_component = _RE_FIRST_COMPONENT.match(target_base)
-  include_first_component = _RE_FIRST_COMPONENT.match(include_base)
-  if (target_first_component and include_first_component and
-      target_first_component.group(0) ==
-      include_first_component.group(0)):
-    return _POSSIBLE_MY_HEADER
-
-  return _OTHER_HEADER
-
-
-
-def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
-  """Check rules that are applicable to #include lines.
-
-  Strings on #include lines are NOT removed from elided line, to make
-  certain tasks easier. However, to prevent false positives, checks
-  applicable to #include lines in CheckLanguage must be put here.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    include_state: An _IncludeState instance in which the headers are inserted.
-    error: The function to call with any errors found.
-  """
-  fileinfo = FileInfo(filename)
-
-  line = clean_lines.lines[linenum]
-
-  # "include" should use the new style "foo/bar.h" instead of just "bar.h"
-  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
-    error(filename, linenum, 'build/include', 4,
-          'Include the directory when naming .h files')
-
-  # we shouldn't include a file more than once. actually, there are a
-  # handful of instances where doing so is okay, but in general it's
-  # not.
-  match = _RE_PATTERN_INCLUDE.search(line)
-  if match:
-    include = match.group(2)
-    is_system = (match.group(1) == '<')
-    if include in include_state:
-      error(filename, linenum, 'build/include', 4,
-            '"%s" already included at %s:%s' %
-            (include, filename, include_state[include]))
-    else:
-      include_state[include] = linenum
-
-      # We want to ensure that headers appear in the right order:
-      # 1) for foo.cc, foo.h  (preferred location)
-      # 2) c system files
-      # 3) cpp system files
-      # 4) for foo.cc, foo.h  (deprecated location)
-      # 5) other google headers
-      #
-      # We classify each include statement as one of those 5 types
-      # using a number of techniques. The include_state object keeps
-      # track of the highest type seen, and complains if we see a
-      # lower type after that.
-      error_message = include_state.CheckNextIncludeOrder(
-          _ClassifyInclude(fileinfo, include, is_system))
-      if error_message:
-        error(filename, linenum, 'build/include_order', 4,
-              '%s. Should be: %s.h, c system, c++ system, other.' %
-              (error_message, fileinfo.BaseName()))
-      canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
-      if not include_state.IsInAlphabeticalOrder(
-          clean_lines, linenum, canonical_include):
-        error(filename, linenum, 'build/include_alpha', 4,
-              'Include "%s" not in alphabetical order' % include)
-      include_state.SetLastHeader(canonical_include)
-
-  # Look for any of the stream classes that are part of standard C++.
-  match = _RE_PATTERN_INCLUDE.match(line)
-  if match:
-    include = match.group(2)
-    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
-      # Many unit tests use cout, so we exempt them.
-      if not _IsTestFilename(filename):
-        error(filename, linenum, 'readability/streams', 3,
-              'Streams are highly discouraged.')
-
-
-def _GetTextInside(text, start_pattern):
-  r"""Retrieves all the text between matching open and close parentheses.
-
-  Given a string of lines and a regular expression string, retrieve all the text
-  following the expression and between opening punctuation symbols like
-  (, [, or {, and the matching close-punctuation symbol. This properly nested
-  occurrences of the punctuations, so for the text like
-    printf(a(), b(c()));
-  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
-  start_pattern must match string having an open punctuation symbol at the end.
-
-  Args:
-    text: The lines to extract text. Its comments and strings must be elided.
-           It can be single line and can span multiple lines.
-    start_pattern: The regexp string indicating where to start extracting
-                   the text.
-  Returns:
-    The extracted text.
-    None if either the opening string or ending punctuation could not be found.
-  """
-  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
-  # rewritten to use _GetTextInside (and use inferior regexp matching today).
-
-  # Give opening punctuations to get the matching close-punctuations.
-  matching_punctuation = {'(': ')', '{': '}', '[': ']'}
-  closing_punctuation = set(matching_punctuation.itervalues())
-
-  # Find the position to start extracting text.
-  match = re.search(start_pattern, text, re.M)
-  if not match:  # start_pattern not found in text.
-    return None
-  start_position = match.end(0)
-
-  assert start_position > 0, (
-      'start_pattern must ends with an opening punctuation.')
-  assert text[start_position - 1] in matching_punctuation, (
-      'start_pattern must ends with an opening punctuation.')
-  # Stack of closing punctuations we expect to have in text after position.
-  punctuation_stack = [matching_punctuation[text[start_position - 1]]]
-  position = start_position
-  while punctuation_stack and position < len(text):
-    if text[position] == punctuation_stack[-1]:
-      punctuation_stack.pop()
-    elif text[position] in closing_punctuation:
-      # A closing punctuation without matching opening punctuations.
-      return None
-    elif text[position] in matching_punctuation:
-      punctuation_stack.append(matching_punctuation[text[position]])
-    position += 1
-  if punctuation_stack:
-    # Opening punctuations left without matching close-punctuations.
-    return None
-  # punctuations match.
-  return text[start_position:position - 1]
-
-
-# Patterns for matching call-by-reference parameters.
-#
-# Supports nested templates up to 2 levels deep using this messy pattern:
-#   < (?: < (?: < [^<>]*
-#               >
-#           |   [^<>] )*
-#         >
-#     |   [^<>] )*
-#   >
-_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [[:alpha:]][[:alnum:]]*
-_RE_PATTERN_TYPE = (
-    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
-    r'(?:\w|'
-    r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
-    r'::)+')
-# A call-by-reference parameter ends with '& identifier'.
-_RE_PATTERN_REF_PARAM = re.compile(
-    r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
-    r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
-# A call-by-const-reference parameter either ends with 'const& identifier'
-# or looks like 'const type& identifier' when 'type' is atomic.
-_RE_PATTERN_CONST_REF_PARAM = (
-    r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
-    r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
-
-
-def CheckLanguage(filename, clean_lines, linenum, file_extension,
-                  include_state, nesting_state, error):
-  """Checks rules from the 'C++ language rules' section of cppguide.html.
-
-  Some of these rules are hard to test (function overloading, using
-  uint32 inappropriately), but we do the best we can.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    file_extension: The extension (without the dot) of the filename.
-    include_state: An _IncludeState instance in which the headers are inserted.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-  # If the line is empty or consists of entirely a comment, no need to
-  # check it.
-  line = clean_lines.elided[linenum]
-  if not line:
-    return
-
-  match = _RE_PATTERN_INCLUDE.search(line)
-  if match:
-    CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
-    return
-
-  # Reset include state across preprocessor directives.  This is meant
-  # to silence warnings for conditional includes.
-  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
-    include_state.ResetSection()
-
-  # Make Windows paths like Unix.
-  fullname = os.path.abspath(filename).replace('\\', '/')
-
-  # TODO(unknown): figure out if they're using default arguments in fn proto.
-
-  # Check to see if they're using an conversion function cast.
-  # I just try to capture the most common basic types, though there are more.
-  # Parameterless conversion functions, such as bool(), are allowed as they are
-  # probably a member operator declaration or default constructor.
-  match = Search(
-      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
-      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
-      r'(\([^)].*)', line)
-  if match:
-    matched_new = match.group(1)
-    matched_type = match.group(2)
-    matched_funcptr = match.group(3)
-
-    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
-    # where type may be float(), int(string), etc.  Without context they are
-    # virtually indistinguishable from int(x) casts. Likewise, gMock's
-    # MockCallback takes a template parameter of the form return_type(arg_type),
-    # which looks much like the cast we're trying to detect.
-    #
-    # std::function<> wrapper has a similar problem.
-    #
-    # Return types for function pointers also look like casts if they
-    # don't have an extra space.
-    if (matched_new is None and  # If new operator, then this isn't a cast
-        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-             Search(r'\bMockCallback<.*>', line) or
-             Search(r'\bstd::function<.*>', line)) and
-        not (matched_funcptr and
-             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
-                   matched_funcptr))):
-      # Try a bit harder to catch gmock lines: the only place where
-      # something looks like an old-style cast is where we declare the
-      # return type of the mocked method, and the only time when we
-      # are missing context is if MOCK_METHOD was split across
-      # multiple lines.  The missing MOCK_METHOD is usually one or two
-      # lines back, so scan back one or two lines.
-      #
-      # It's not possible for gmock macros to appear in the first 2
-      # lines, since the class head + section name takes up 2 lines.
-      if (linenum < 2 or
-          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
-                     clean_lines.elided[linenum - 1]) or
-               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
-                     clean_lines.elided[linenum - 2]))):
-        error(filename, linenum, 'readability/casting', 4,
-              'Using deprecated casting style.  '
-              'Use static_cast<%s>(...) instead' %
-              matched_type)
-
-  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                  'static_cast',
-                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
-
-  # This doesn't catch all cases. Consider (const char * const)"hello".
-  #
-  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
-  # compile).
-  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
-    pass
-  else:
-    # Check pointer casts for other than string constants
-    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
-
-  # In addition, we look for people taking the address of a cast.  This
-  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
-  # point where you think.
-  match = Search(
-      r'(?:&\(([^)]+)\)[\w(])|'
-      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
-  if match and match.group(1) != '*':
-    error(filename, linenum, 'runtime/casting', 4,
-          ('Are you taking an address of a cast?  '
-           'This is dangerous: could be a temp var.  '
-           'Take the address before doing the cast, rather than after'))
-
-  # Create an extended_line, which is the concatenation of the current and
-  # next lines, for more effective checking of code that may span more than one
-  # line.
-  if linenum + 1 < clean_lines.NumLines():
-    extended_line = line + clean_lines.elided[linenum + 1]
-  else:
-    extended_line = line
-
-  # Check for people declaring static/global STL strings at the top level.
-  # This is dangerous because the C++ language does not guarantee that
-  # globals with constructors are initialized before the first access.
-  match = Match(
-      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
-      line)
-  # Make sure it's not a function.
-  # Function template specialization looks like: "string foo<Type>(...".
-  # Class template definitions look like: "string Foo<Type>::Method(...".
-  #
-  # Also ignore things that look like operators.  These are matched separately
-  # because operator names cross non-word boundaries.  If we change the pattern
-  # above, we would decrease the accuracy of matching identifiers.
-  if (match and
-      not Search(r'\boperator\W', line) and
-      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
-    error(filename, linenum, 'runtime/string', 4,
-          'For a static/global string constant, use a C style string instead: '
-          '"%schar %s[]".' %
-          (match.group(1), match.group(2)))
-
-  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
-    error(filename, linenum, 'runtime/init', 4,
-          'You seem to be initializing a member variable with itself.')
-
-  if file_extension == 'h':
-    # TODO(unknown): check that 1-arg constructors are explicit.
-    #                How to tell it's a constructor?
-    #                (handled in CheckForNonStandardConstructs for now)
-    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
-    #                (level 1 error)
-    pass
-
-  # Check if people are using the verboten C basic types.  The only exception
-  # we regularly allow is "unsigned short port" for port.
-  if Search(r'\bshort port\b', line):
-    if not Search(r'\bunsigned short port\b', line):
-      error(filename, linenum, 'runtime/int', 4,
-            'Use "unsigned short" for ports, not "short"')
-  else:
-    match = Search(r'\b(short|long(?! +double)|long long)\b', line)
-    if match:
-      error(filename, linenum, 'runtime/int', 4,
-            'Use int16/int64/etc, rather than the C type %s' % match.group(1))
-
-  # When snprintf is used, the second argument shouldn't be a literal.
-  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
-  if match and match.group(2) != '0':
-    # If 2nd arg is zero, snprintf is used to calculate size.
-    error(filename, linenum, 'runtime/printf', 3,
-          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
-          'to snprintf.' % (match.group(1), match.group(2)))
-
-  # Check if some verboten C functions are being used.
-  if Search(r'\bsprintf\b', line):
-    error(filename, linenum, 'runtime/printf', 5,
-          'Never use sprintf.  Use snprintf instead.')
-  match = Search(r'\b(strcpy|strcat)\b', line)
-  if match:
-    error(filename, linenum, 'runtime/printf', 4,
-          'Almost always, snprintf is better than %s' % match.group(1))
-
-  # Check if some verboten operator overloading is going on
-  # TODO(unknown): catch out-of-line unary operator&:
-  #   class X {};
-  #   int operator&(const X& x) { return 42; }  // unary operator&
-  # The trick is it's hard to tell apart from binary operator&:
-  #   class Y { int operator&(const Y& x) { return 23; } }; // binary operator&
-  if Search(r'\boperator\s*&\s*\(\s*\)', line):
-    error(filename, linenum, 'runtime/operator', 4,
-          'Unary operator& is dangerous.  Do not use it.')
-
-  # Check for suspicious usage of "if" like
-  # } if (a == b) {
-  if Search(r'\}\s*if\s*\(', line):
-    error(filename, linenum, 'readability/braces', 4,
-          'Did you mean "else if"? If not, start a new line for "if".')
-
-  # Check for potential format string bugs like printf(foo).
-  # We constrain the pattern not to pick things like DocidForPrintf(foo).
-  # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
-  # TODO(sugawarayu): Catch the following case. Need to change the calling
-  # convention of the whole function to process multiple line to handle it.
-  #   printf(
-  #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
-  printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
-  if printf_args:
-    match = Match(r'([\w.\->()]+)$', printf_args)
-    if match and match.group(1) != '__VA_ARGS__':
-      function_name = re.search(r'\b((?:string)?printf)\s*\(',
-                                line, re.I).group(1)
-      error(filename, linenum, 'runtime/printf', 4,
-            'Potential format string bug. Do %s("%%s", %s) instead.'
-            % (function_name, match.group(1)))
-
-  # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
-  match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
-  if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)):
-    error(filename, linenum, 'runtime/memset', 4,
-          'Did you mean "memset(%s, 0, %s)"?'
-          % (match.group(1), match.group(2)))
-
-  if Search(r'\busing namespace\b', line):
-    error(filename, linenum, 'build/namespaces', 5,
-          'Do not use namespace using-directives.  '
-          'Use using-declarations instead.')
-
-  # Detect variable-length arrays.
-  match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
-  if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
-      match.group(3).find(']') == -1):
-    # Split the size using space and arithmetic operators as delimiters.
-    # If any of the resulting tokens are not compile time constants then
-    # report the error.
-    tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3))
-    is_const = True
-    skip_next = False
-    for tok in tokens:
-      if skip_next:
-        skip_next = False
-        continue
-
-      if Search(r'sizeof\(.+\)', tok): continue
-      if Search(r'arraysize\(\w+\)', tok): continue
-
-      tok = tok.lstrip('(')
-      tok = tok.rstrip(')')
-      if not tok: continue
-      if Match(r'\d+', tok): continue
-      if Match(r'0[xX][0-9a-fA-F]+', tok): continue
-      if Match(r'k[A-Z0-9]\w*', tok): continue
-      if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
-      if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
-      # A catch all for tricky sizeof cases, including 'sizeof expression',
-      # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)'
-      # requires skipping the next token because we split on ' ' and '*'.
-      if tok.startswith('sizeof'):
-        skip_next = True
-        continue
-      is_const = False
-      break
-    if not is_const:
-      error(filename, linenum, 'runtime/arrays', 1,
-            'Do not use variable-length arrays.  Use an appropriately named '
-            "('k' followed by CamelCase) compile-time constant for the size.")
-
-  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
-  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
-  # in the class declaration.
-  match = Match(
-      (r'\s*'
-       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
-       r'\(.*\);$'),
-      line)
-  if match and linenum + 1 < clean_lines.NumLines():
-    next_line = clean_lines.elided[linenum + 1]
-    # We allow some, but not all, declarations of variables to be present
-    # in the statement that defines the class.  The [\w\*,\s]* fragment of
-    # the regular expression below allows users to declare instances of
-    # the class or pointers to instances, but not less common types such
-    # as function pointers or arrays.  It's a tradeoff between allowing
-    # reasonable code and avoiding trying to parse more C++ using regexps.
-    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
-      error(filename, linenum, 'readability/constructors', 3,
-            match.group(1) + ' should be the last thing in the class')
-
-  # Check for use of unnamed namespaces in header files.  Registration
-  # macros are typically OK, so we allow use of "namespace {" on lines
-  # that end with backslashes.
-  if (file_extension == 'h'
-      and Search(r'\bnamespace\s*{', line)
-      and line[-1] != '\\'):
-    error(filename, linenum, 'build/namespaces', 4,
-          'Do not use unnamed namespaces in header files.  See '
-          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
-          ' for more information.')
-
-def CheckForNonConstReference(filename, clean_lines, linenum,
-                              nesting_state, error):
-  """Check for non-const references.
-
-  Separate from CheckLanguage since it scans backwards from current
-  line, instead of scanning forward.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-  # Do nothing if there is no '&' on current line.
-  line = clean_lines.elided[linenum]
-  if '&' not in line:
-    return
-
-  # Long type names may be broken across multiple lines, usually in one
-  # of these forms:
-  #   LongType
-  #       ::LongTypeContinued &identifier
-  #   LongType::
-  #       LongTypeContinued &identifier
-  #   LongType<
-  #       ...>::LongTypeContinued &identifier
-  #
-  # If we detected a type split across two lines, join the previous
-  # line to current line so that we can match const references
-  # accordingly.
-  #
-  # Note that this only scans back one line, since scanning back
-  # arbitrary number of lines would be expensive.  If you have a type
-  # that spans more than 2 lines, please use a typedef.
-  if linenum > 1:
-    previous = None
-    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
-      # previous_line\n + ::current_line
-      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
-                        clean_lines.elided[linenum - 1])
-    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
-      # previous_line::\n + current_line
-      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
-                        clean_lines.elided[linenum - 1])
-    if previous:
-      line = previous.group(1) + line.lstrip()
-    else:
-      # Check for templated parameter that is split across multiple lines
-      endpos = line.rfind('>')
-      if endpos > -1:
-        (_, startline, startpos) = ReverseCloseExpression(
-            clean_lines, linenum, endpos)
-        if startpos > -1 and startline < linenum:
-          # Found the matching < on an earlier line, collect all
-          # pieces up to current line.
-          line = ''
-          for i in xrange(startline, linenum + 1):
-            line += clean_lines.elided[i].strip()
-
-  # Check for non-const references in function parameters.  A single '&' may
-  # found in the following places:
-  #   inside expression: binary & for bitwise AND
-  #   inside expression: unary & for taking the address of something
-  #   inside declarators: reference parameter
-  # We will exclude the first two cases by checking that we are not inside a
-  # function body, including one that was just introduced by a trailing '{'.
-  # TODO(unknwon): Doesn't account for preprocessor directives.
-  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
-  check_params = False
-  if not nesting_state.stack:
-    check_params = True  # top level
-  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
-        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
-    check_params = True  # within class or namespace
-  elif Match(r'.*{\s*$', line):
-    if (len(nesting_state.stack) == 1 or
-        isinstance(nesting_state.stack[-2], _ClassInfo) or
-        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
-      check_params = True  # just opened global/class/namespace block
-  # We allow non-const references in a few standard places, like functions
-  # called "swap()" or iostream operators like "<<" or ">>".  Do not check
-  # those function parameters.
-  #
-  # We also accept & in static_assert, which looks like a function but
-  # it's actually a declaration expression.
-  whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
-                           r'operator\s*[<>][<>]|'
-                           r'static_assert|COMPILE_ASSERT'
-                           r')\s*\(')
-  if Search(whitelisted_functions, line):
-    check_params = False
-  elif not Search(r'\S+\([^)]*$', line):
-    # Don't see a whitelisted function on this line.  Actually we
-    # didn't see any function name on this line, so this is likely a
-    # multi-line parameter list.  Try a bit harder to catch this case.
-    for i in xrange(2):
-      if (linenum > i and
-          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
-        check_params = False
-        break
-
-  if check_params:
-    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
-    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
-      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
-        error(filename, linenum, 'runtime/references', 2,
-              'Is this a non-const reference? '
-              'If so, make const or use a pointer: ' +
-              ReplaceAll(' *<', '<', parameter))
-
-
-def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
-                    error):
-  """Checks for a C-style cast by looking for the pattern.
-
-  Args:
-    filename: The name of the current file.
-    linenum: The number of the line to check.
-    line: The line of code to check.
-    raw_line: The raw line of code to check, with comments.
-    cast_type: The string for the C++ cast to recommend.  This is either
-      reinterpret_cast, static_cast, or const_cast, depending.
-    pattern: The regular expression used to find C-style casts.
-    error: The function to call with any errors found.
-
-  Returns:
-    True if an error was emitted.
-    False otherwise.
-  """
-  match = Search(pattern, line)
-  if not match:
-    return False
-
-  # Exclude lines with sizeof, since sizeof looks like a cast.
-  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
-  if sizeof_match:
-    return False
-
-  # operator++(int) and operator--(int)
-  if (line[0:match.start(1) - 1].endswith(' operator++') or
-      line[0:match.start(1) - 1].endswith(' operator--')):
-    return False
-
-  # A single unnamed argument for a function tends to look like old
-  # style cast.  If we see those, don't issue warnings for deprecated
-  # casts, instead issue warnings for unnamed arguments where
-  # appropriate.
-  #
-  # These are things that we want warnings for, since the style guide
-  # explicitly require all parameters to be named:
-  #   Function(int);
-  #   Function(int) {
-  #   ConstMember(int) const;
-  #   ConstMember(int) const {
-  #   ExceptionMember(int) throw (...);
-  #   ExceptionMember(int) throw (...) {
-  #   PureVirtual(int) = 0;
-  #
-  # These are functions of some sort, where the compiler would be fine
-  # if they had named parameters, but people often omit those
-  # identifiers to reduce clutter:
-  #   (FunctionPointer)(int);
-  #   (FunctionPointer)(int) = value;
-  #   Function((function_pointer_arg)(int))
-  #   <TemplateArgument(int)>;
-  #   <(FunctionPointerTemplateArgument)(int)>;
-  remainder = line[match.end(0):]
-  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
-    # Looks like an unnamed parameter.
-
-    # Don't warn on any kind of template arguments.
-    if Match(r'^\s*>', remainder):
-      return False
-
-    # Don't warn on assignments to function pointers, but keep warnings for
-    # unnamed parameters to pure virtual functions.  Note that this pattern
-    # will also pass on assignments of "0" to function pointers, but the
-    # preferred values for those would be "nullptr" or "NULL".
-    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
-    if matched_zero and matched_zero.group(1) != '0':
-      return False
-
-    # Don't warn on function pointer declarations.  For this we need
-    # to check what came before the "(type)" string.
-    if Match(r'.*\)\s*$', line[0:match.start(0)]):
-      return False
-
-    # Don't warn if the parameter is named with block comments, e.g.:
-    #  Function(int /*unused_param*/);
-    if '/*' in raw_line:
-      return False
-
-    # Passed all filters, issue warning here.
-    error(filename, linenum, 'readability/function', 3,
-          'All parameters should be named in a function')
-    return True
-
-  # At this point, all that should be left is actual casts.
-  error(filename, linenum, 'readability/casting', 4,
-        'Using C-style cast.  Use %s<%s>(...) instead' %
-        (cast_type, match.group(1)))
-
-  return True
-
-
-_HEADERS_CONTAINING_TEMPLATES = (
-    ('<deque>', ('deque',)),
-    ('<functional>', ('unary_function', 'binary_function',
-                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
-                      'negate',
-                      'equal_to', 'not_equal_to', 'greater', 'less',
-                      'greater_equal', 'less_equal',
-                      'logical_and', 'logical_or', 'logical_not',
-                      'unary_negate', 'not1', 'binary_negate', 'not2',
-                      'bind1st', 'bind2nd',
-                      'pointer_to_unary_function',
-                      'pointer_to_binary_function',
-                      'ptr_fun',
-                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
-                      'mem_fun_ref_t',
-                      'const_mem_fun_t', 'const_mem_fun1_t',
-                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
-                      'mem_fun_ref',
-                     )),
-    ('<limits>', ('numeric_limits',)),
-    ('<list>', ('list',)),
-    ('<map>', ('map', 'multimap',)),
-    ('<memory>', ('allocator',)),
-    ('<queue>', ('queue', 'priority_queue',)),
-    ('<set>', ('set', 'multiset',)),
-    ('<stack>', ('stack',)),
-    ('<string>', ('char_traits', 'basic_string',)),
-    ('<utility>', ('pair',)),
-    ('<vector>', ('vector',)),
-
-    # gcc extensions.
-    # Note: std::hash is their hash, ::hash is our hash
-    ('<hash_map>', ('hash_map', 'hash_multimap',)),
-    ('<hash_set>', ('hash_set', 'hash_multiset',)),
-    ('<slist>', ('slist',)),
-    )
-
-_RE_PATTERN_STRING = re.compile(r'\bstring\b')
-
-_re_pattern_algorithm_header = []
-for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
-                  'transform'):
-  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-  # type::max().
-  _re_pattern_algorithm_header.append(
-      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
-       _template,
-       '<algorithm>'))
-
-_re_pattern_templates = []
-for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
-  for _template in _templates:
-    _re_pattern_templates.append(
-        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
-         _template + '<>',
-         _header))
-
-
-def FilesBelongToSameModule(filename_cc, filename_h):
-  """Check if these two filenames belong to the same module.
-
-  The concept of a 'module' here is a as follows:
-  foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
-  same 'module' if they are in the same directory.
-  some/path/public/xyzzy and some/path/internal/xyzzy are also considered
-  to belong to the same module here.
-
-  If the filename_cc contains a longer path than the filename_h, for example,
-  '/absolute/path/to/base/sysinfo.cc', and this file would include
-  'base/sysinfo.h', this function also produces the prefix needed to open the
-  header. This is used by the caller of this function to more robustly open the
-  header file. We don't have access to the real include paths in this context,
-  so we need this guesswork here.
-
-  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
-  according to this implementation. Because of this, this function gives
-  some false positives. This should be sufficiently rare in practice.
-
-  Args:
-    filename_cc: is the path for the .cc file
-    filename_h: is the path for the header path
-
-  Returns:
-    Tuple with a bool and a string:
-    bool: True if filename_cc and filename_h belong to the same module.
-    string: the additional prefix needed to open the header file.
-  """
-
-  if not filename_cc.endswith('.cc'):
-    return (False, '')
-  filename_cc = filename_cc[:-len('.cc')]
-  if filename_cc.endswith('_unittest'):
-    filename_cc = filename_cc[:-len('_unittest')]
-  elif filename_cc.endswith('_test'):
-    filename_cc = filename_cc[:-len('_test')]
-  filename_cc = filename_cc.replace('/public/', '/')
-  filename_cc = filename_cc.replace('/internal/', '/')
-
-  if not filename_h.endswith('.h'):
-    return (False, '')
-  filename_h = filename_h[:-len('.h')]
-  if filename_h.endswith('-inl'):
-    filename_h = filename_h[:-len('-inl')]
-  filename_h = filename_h.replace('/public/', '/')
-  filename_h = filename_h.replace('/internal/', '/')
-
-  files_belong_to_same_module = filename_cc.endswith(filename_h)
-  common_path = ''
-  if files_belong_to_same_module:
-    common_path = filename_cc[:-len(filename_h)]
-  return files_belong_to_same_module, common_path
-
-
-def UpdateIncludeState(filename, include_state, io=codecs):
-  """Fill up the include_state with new includes found from the file.
-
-  Args:
-    filename: the name of the header to read.
-    include_state: an _IncludeState instance in which the headers are inserted.
-    io: The io factory to use to read the file. Provided for testability.
-
-  Returns:
-    True if a header was successfully added. False otherwise.
-  """
-  headerfile = None
-  try:
-    headerfile = io.open(filename, 'r', 'utf8', 'replace')
-  except IOError:
-    return False
-  linenum = 0
-  for line in headerfile:
-    linenum += 1
-    clean_line = CleanseComments(line)
-    match = _RE_PATTERN_INCLUDE.search(clean_line)
-    if match:
-      include = match.group(2)
-      # The value formatting is cute, but not really used right now.
-      # What matters here is that the key is in include_state.
-      include_state.setdefault(include, '%s:%d' % (filename, linenum))
-  return True
-
-
-def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
-                              io=codecs):
-  """Reports for missing stl includes.
-
-  This function will output warnings to make sure you are including the headers
-  necessary for the stl containers and functions that you use. We only give one
-  reason to include a header. For example, if you use both equal_to<> and
-  less<> in a .h file, only one (the latter in the file) of these will be
-  reported as a reason to include the <functional>.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    include_state: An _IncludeState instance.
-    error: The function to call with any errors found.
-    io: The IO factory to use to read the header file. Provided for unittest
-        injection.
-  """
-  required = {}  # A map of header name to linenumber and the template entity.
-                 # Example of required: { '<functional>': (1219, 'less<>') }
-
-  for linenum in xrange(clean_lines.NumLines()):
-    line = clean_lines.elided[linenum]
-    if not line or line[0] == '#':
-      continue
-
-    # String is special -- it is a non-templatized type in STL.
-    matched = _RE_PATTERN_STRING.search(line)
-    if matched:
-      # Don't warn about strings in non-STL namespaces:
-      # (We check only the first match per line; good enough.)
-      prefix = line[:matched.start()]
-      if prefix.endswith('std::') or not prefix.endswith('::'):
-        required['<string>'] = (linenum, 'string')
-
-    for pattern, template, header in _re_pattern_algorithm_header:
-      if pattern.search(line):
-        required[header] = (linenum, template)
-
-    # The following function is just a speed up, no semantics are changed.
-    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
-      continue
-
-    for pattern, template, header in _re_pattern_templates:
-      if pattern.search(line):
-        required[header] = (linenum, template)
-
-  # The policy is that if you #include something in foo.h you don't need to
-  # include it again in foo.cc. Here, we will look at possible includes.
-  # Let's copy the include_state so it is only messed up within this function.
-  include_state = include_state.copy()
-
-  # Did we find the header for this file (if any) and successfully load it?
-  header_found = False
-
-  # Use the absolute path so that matching works properly.
-  abs_filename = FileInfo(filename).FullName()
-
-  # For Emacs's flymake.
-  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
-  # by flymake and that file name might end with '_flymake.cc'. In that case,
-  # restore original file name here so that the corresponding header file can be
-  # found.
-  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
-  # instead of 'foo_flymake.h'
-  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
-
-  # include_state is modified during iteration, so we iterate over a copy of
-  # the keys.
-  header_keys = include_state.keys()
-  for header in header_keys:
-    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
-    fullpath = common_path + header
-    if same_module and UpdateIncludeState(fullpath, include_state, io):
-      header_found = True
-
-  # If we can't find the header file for a .cc, assume it's because we don't
-  # know where to look. In that case we'll give up as we're not sure they
-  # didn't include it in the .h file.
-  # TODO(unknown): Do a better job of finding .h files so we are confident that
-  # not having the .h file means there isn't one.
-  if filename.endswith('.cc') and not header_found:
-    return
-
-  # All the lines have been processed, report the errors found.
-  for required_header_unstripped in required:
-    template = required[required_header_unstripped][1]
-    if required_header_unstripped.strip('<>"') not in include_state:
-      error(filename, required[required_header_unstripped][0],
-            'build/include_what_you_use', 4,
-            'Add #include ' + required_header_unstripped + ' for ' + template)
-
-
-_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
-
-
-def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
-  """Check that make_pair's template arguments are deduced.
-
-  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
-  specified explicitly, and such use isn't intended in any case.
-
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]
-  match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line)
-  if match:
-    error(filename, linenum, 'build/explicit_make_pair',
-          4,  # 4 = high confidence
-          'For C++11-compatibility, omit template arguments from make_pair'
-          ' OR use pair directly OR if appropriate, construct a pair directly')
-
-
-def ProcessLine(filename, file_extension, clean_lines, line,
-                include_state, function_state, nesting_state, error,
-                extra_check_functions=[]):
-  """Processes a single line in the file.
-
-  Args:
-    filename: Filename of the file that is being processed.
-    file_extension: The extension (dot not included) of the file.
-    clean_lines: An array of strings, each representing a line of the file,
-                 with comments stripped.
-    line: Number of line being processed.
-    include_state: An _IncludeState instance in which the headers are inserted.
-    function_state: A _FunctionState instance which counts function lines, etc.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: A callable to which errors are reported, which takes 4 arguments:
-           filename, line number, error level, and message
-    extra_check_functions: An array of additional check functions that will be
-                           run on each source line. Each function takes 4
-                           arguments: filename, clean_lines, line, error
-  """
-  raw_lines = clean_lines.raw_lines
-  ParseNolintSuppressions(filename, raw_lines[line], line, error)
-  nesting_state.Update(filename, clean_lines, line, error)
-  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
-    return
-  CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
-  CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
-  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
-  CheckLanguage(filename, clean_lines, line, file_extension, include_state,
-                nesting_state, error)
-  CheckForNonConstReference(filename, clean_lines, line, nesting_state, error)
-  CheckForNonStandardConstructs(filename, clean_lines, line,
-                                nesting_state, error)
-  CheckVlogArguments(filename, clean_lines, line, error)
-  CheckPosixThreading(filename, clean_lines, line, error)
-  CheckInvalidIncrement(filename, clean_lines, line, error)
-  CheckMakePairUsesDeduction(filename, clean_lines, line, error)
-  for check_fn in extra_check_functions:
-    check_fn(filename, clean_lines, line, error)
-
-def ProcessFileData(filename, file_extension, lines, error,
-                    extra_check_functions=[]):
-  """Performs lint checks and reports any errors to the given error function.
-
-  Args:
-    filename: Filename of the file that is being processed.
-    file_extension: The extension (dot not included) of the file.
-    lines: An array of strings, each representing a line of the file, with the
-           last element being empty if the file is terminated with a newline.
-    error: A callable to which errors are reported, which takes 4 arguments:
-           filename, line number, error level, and message
-    extra_check_functions: An array of additional check functions that will be
-                           run on each source line. Each function takes 4
-                           arguments: filename, clean_lines, line, error
-  """
-  lines = (['// marker so line numbers and indices both start at 1'] + lines +
-           ['// marker so line numbers end in a known way'])
-
-  include_state = _IncludeState()
-  function_state = _FunctionState()
-  nesting_state = _NestingState()
-
-  ResetNolintSuppressions()
-
-  CheckForCopyright(filename, lines, error)
-
-  if file_extension == 'h':
-    CheckForHeaderGuard(filename, lines, error)
-
-  RemoveMultiLineComments(filename, lines, error)
-  clean_lines = CleansedLines(lines)
-  for line in xrange(clean_lines.NumLines()):
-    ProcessLine(filename, file_extension, clean_lines, line,
-                include_state, function_state, nesting_state, error,
-                extra_check_functions)
-  nesting_state.CheckCompletedBlocks(filename, error)
-
-  CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
-
-  # We check here rather than inside ProcessLine so that we see raw
-  # lines rather than "cleaned" lines.
-  CheckForBadCharacters(filename, lines, error)
-
-  CheckForNewlineAtEOF(filename, lines, error)
-
-def ProcessFile(filename, vlevel, extra_check_functions=[]):
-  """Does google-lint on a single file.
-
-  Args:
-    filename: The name of the file to parse.
-
-    vlevel: The level of errors to report.  Every error of confidence
-    >= verbose_level will be reported.  0 is a good default.
-
-    extra_check_functions: An array of additional check functions that will be
-                           run on each source line. Each function takes 4
-                           arguments: filename, clean_lines, line, error
-  """
-
-  _SetVerboseLevel(vlevel)
-
-  try:
-    # Support the UNIX convention of using "-" for stdin.  Note that
-    # we are not opening the file with universal newline support
-    # (which codecs doesn't support anyway), so the resulting lines do
-    # contain trailing '\r' characters if we are reading a file that
-    # has CRLF endings.
-    # If after the split a trailing '\r' is present, it is removed
-    # below. If it is not expected to be present (i.e. os.linesep !=
-    # '\r\n' as in Windows), a warning is issued below if this file
-    # is processed.
-
-    if filename == '-':
-      lines = codecs.StreamReaderWriter(sys.stdin,
-                                        codecs.getreader('utf8'),
-                                        codecs.getwriter('utf8'),
-                                        'replace').read().split('\n')
-    else:
-      lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
-
-    carriage_return_found = False
-    # Remove trailing '\r'.
-    for linenum in range(len(lines)):
-      if lines[linenum].endswith('\r'):
-        lines[linenum] = lines[linenum].rstrip('\r')
-        carriage_return_found = True
-
-  except IOError:
-    sys.stderr.write(
-        "Skipping input '%s': Can't open for reading\n" % filename)
-    return
-
-  # Note, if no dot is found, this will give the entire filename as the ext.
-  file_extension = filename[filename.rfind('.') + 1:]
-
-  # When reading from stdin, the extension is unknown, so no cpplint tests
-  # should rely on the extension.
-  if filename != '-' and file_extension not in _valid_extensions:
-    sys.stderr.write('Ignoring %s; not a valid file name '
-                     '(%s)\n' % (filename, ', '.join(_valid_extensions)))
-  else:
-    ProcessFileData(filename, file_extension, lines, Error,
-                    extra_check_functions)
-    if carriage_return_found and os.linesep != '\r\n':
-      # Use 0 for linenum since outputting only one error for potentially
-      # several lines.
-      Error(filename, 0, 'whitespace/newline', 1,
-            'One or more unexpected \\r (^M) found;'
-            'better to use only a \\n')
-
-  sys.stderr.write('Done processing %s\n' % filename)
-
-
-def PrintUsage(message):
-  """Prints a brief usage string and exits, optionally with an error message.
-
-  Args:
-    message: The optional error message.
-  """
-  sys.stderr.write(_USAGE)
-  if message:
-    sys.exit('\nFATAL ERROR: ' + message)
-  else:
-    sys.exit(1)
-
-
-def PrintCategories():
-  """Prints a list of all the error-categories used by error messages.
-
-  These are the categories used to filter messages via --filter.
-  """
-  sys.stderr.write(''.join('  %s\n' % cat for cat in _ERROR_CATEGORIES))
-  sys.exit(0)
-
-
-def ParseArguments(args):
-  """Parses the command line arguments.
-
-  This may set the output format and verbosity level as side-effects.
-
-  Args:
-    args: The command line arguments:
-
-  Returns:
-    The list of filenames to lint.
-  """
-  try:
-    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
-                                                 'counting=',
-                                                 'filter=',
-                                                 'root=',
-                                                 'linelength=',
-                                                 'extensions='])
-  except getopt.GetoptError:
-    PrintUsage('Invalid arguments.')
-
-  verbosity = _VerboseLevel()
-  output_format = _OutputFormat()
-  filters = ''
-  counting_style = ''
-
-  for (opt, val) in opts:
-    if opt == '--help':
-      PrintUsage(None)
-    elif opt == '--output':
-      if val not in ('emacs', 'vs7', 'eclipse'):
-        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
-      output_format = val
-    elif opt == '--verbose':
-      verbosity = int(val)
-    elif opt == '--filter':
-      filters = val
-      if not filters:
-        PrintCategories()
-    elif opt == '--counting':
-      if val not in ('total', 'toplevel', 'detailed'):
-        PrintUsage('Valid counting options are total, toplevel, and detailed')
-      counting_style = val
-    elif opt == '--root':
-      global _root
-      _root = val
-    elif opt == '--linelength':
-      global _line_length
-      try:
-          _line_length = int(val)
-      except ValueError:
-          PrintUsage('Line length must be digits.')
-    elif opt == '--extensions':
-      global _valid_extensions
-      try:
-          _valid_extensions = set(val.split(','))
-      except ValueError:
-          PrintUsage('Extensions must be comma separated list.')
-
-  if not filenames:
-    PrintUsage('No files were specified.')
-
-  _SetOutputFormat(output_format)
-  _SetVerboseLevel(verbosity)
-  _SetFilters(filters)
-  _SetCountingStyle(counting_style)
-
-  return filenames
-
-
-def main():
-  filenames = ParseArguments(sys.argv[1:])
-
-  # Change stderr to write with replacement characters so we don't die
-  # if we try to print something containing non-ASCII characters.
-  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
-                                         codecs.getreader('utf8'),
-                                         codecs.getwriter('utf8'),
-                                         'replace')
-
-  _cpplint_state.ResetErrorCounts()
-  for filename in filenames:
-    ProcessFile(filename, _cpplint_state.verbose_level)
-  _cpplint_state.PrintErrorCounts()
-
-  sys.exit(_cpplint_state.error_count > 0)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/src/rocksdb/linters/lint_engine/FacebookFbcodeLintEngine.php b/src/rocksdb/linters/lint_engine/FacebookFbcodeLintEngine.php
deleted file mode 100644
index cb9cf9b..0000000
--- a/src/rocksdb/linters/lint_engine/FacebookFbcodeLintEngine.php
+++ /dev/null
@@ -1,147 +0,0 @@
-<?php
-// Copyright 2004-present Facebook.  All rights reserved.
-
-class FacebookFbcodeLintEngine extends ArcanistLintEngine {
-
-  public function buildLinters() {
-    $linters = array();
-    $paths = $this->getPaths();
-
-    // Remove all deleted files, which are not checked by the
-    // following linters.
-    foreach ($paths as $key => $path) {
-      if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) {
-        unset($paths[$key]);
-      }
-    }
-
-    $generated_linter = new ArcanistGeneratedLinter();
-    $linters[] = $generated_linter;
-
-    $nolint_linter = new ArcanistNoLintLinter();
-    $linters[] = $nolint_linter;
-
-    $text_linter = new ArcanistTextLinter();
-    $text_linter->setCustomSeverityMap(array(
-      ArcanistTextLinter::LINT_LINE_WRAP
-        => ArcanistLintSeverity::SEVERITY_ADVICE,
-    ));
-    $linters[] = $text_linter;
-
-    $java_text_linter = new ArcanistTextLinter();
-    $java_text_linter->setMaxLineLength(100);
-    $java_text_linter->setCustomSeverityMap(array(
-      ArcanistTextLinter::LINT_LINE_WRAP
-        => ArcanistLintSeverity::SEVERITY_ADVICE,
-    ));
-    $linters[] = $java_text_linter;
-
-    $pep8_options = $this->getPEP8WithTextOptions().',E302';
-
-    $python_linter = new ArcanistPEP8Linter();
-    $python_linter->setConfig(array('options' => $pep8_options));
-    $linters[] = $python_linter;
-
-    $python_2space_linter = new ArcanistPEP8Linter();
-    $python_2space_linter->setConfig(array('options' => $pep8_options.',E111'));
-    $linters[] = $python_2space_linter;
-
-   // Currently we can't run cpplint in commit hook mode, because it
-    // depends on having access to the working directory.
-    if (!$this->getCommitHookMode()) {
-      $cpp_linters = array();
-      $google_linter = new ArcanistCpplintLinter();
-      $google_linter->setConfig(array(
-        'lint.cpplint.prefix' => '',
-        'lint.cpplint.bin' => 'cpplint',
-      ));
-      $cpp_linters[] = $linters[] = $google_linter;
-      $cpp_linters[] = $linters[] = new FbcodeCppLinter();
-      $cpp_linters[] = $linters[] = new PfffCppLinter();
-    }
-
-    $spelling_linter = new ArcanistSpellingLinter();
-    $linters[] = $spelling_linter;
-
-    foreach ($paths as $path) {
-      $is_text = false;
-
-      $text_extensions = (
-        '/\.('.
-        'cpp|cxx|c|cc|h|hpp|hxx|tcc|'.
-        'py|rb|hs|pl|pm|tw|'.
-        'php|phpt|css|js|'.
-        'java|'.
-        'thrift|'.
-        'lua|'.
-        'siv|'.
-        'txt'.
-        ')$/'
-      );
-      if (preg_match($text_extensions, $path)) {
-        $is_text = true;
-      }
-      if ($is_text) {
-        $nolint_linter->addPath($path);
-
-        $generated_linter->addPath($path);
-        $generated_linter->addData($path, $this->loadData($path));
-
-        if (preg_match('/\.java$/', $path)) {
-          $java_text_linter->addPath($path);
-          $java_text_linter->addData($path, $this->loadData($path));
-        } else {
-          $text_linter->addPath($path);
-          $text_linter->addData($path, $this->loadData($path));
-        }
-
-        $spelling_linter->addPath($path);
-        $spelling_linter->addData($path, $this->loadData($path));
-      }
-      if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) {
-        foreach ($cpp_linters as &$linter) {
-          $linter->addPath($path);
-          $linter->addData($path, $this->loadData($path));
-        }
-      }
-
-      // Match *.py and contbuild config files
-      if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/',
-                    $path)) {
-        $space_count = 4;
-        $real_path = $this->getFilePathOnDisk($path);
-        $dir = dirname($real_path);
-        do {
-          if (file_exists($dir.'/.python2space')) {
-            $space_count = 2;
-            break;
-          }
-          $dir = dirname($dir);
-        } while ($dir != '/' && $dir != '.');
-
-        if ($space_count == 4) {
-          $cur_path_linter = $python_linter;
-        } else {
-          $cur_path_linter = $python_2space_linter;
-        }
-        $cur_path_linter->addPath($path);
-        $cur_path_linter->addData($path, $this->loadData($path));
-
-        if (preg_match('/\.tw$/', $path)) {
-          $cur_path_linter->setCustomSeverityMap(array(
-            'E251' => ArcanistLintSeverity::SEVERITY_DISABLED,
-          ));
-        }
-      }
-    }
-
-    $name_linter = new ArcanistFilenameLinter();
-    $linters[] = $name_linter;
-    foreach ($paths as $path) {
-      $name_linter->addPath($path);
-    }
-
-    return $linters;
-  }
-
-}
diff --git a/src/rocksdb/port/atomic_pointer.h b/src/rocksdb/port/atomic_pointer.h
deleted file mode 100644
index db3580b..0000000
--- a/src/rocksdb/port/atomic_pointer.h
+++ /dev/null
@@ -1,157 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-// AtomicPointer provides storage for a lock-free pointer.
-// Platform-dependent implementation of AtomicPointer:
-// - If the platform provides a cheap barrier, we use it with raw pointers
-// - If cstdatomic is present (on newer versions of gcc, it is), we use
-//   a cstdatomic-based AtomicPointer.  However we prefer the memory
-//   barrier based version, because at least on a gcc 4.4 32-bit build
-//   on linux, we have encountered a buggy <cstdatomic>
-//   implementation.  Also, some <cstdatomic> implementations are much
-//   slower than a memory-barrier based implementation (~16ns for
-//   <cstdatomic> based acquire-load vs. ~1ns for a barrier based
-//   acquire-load).
-// This code is based on atomicops-internals-* in Google's perftools:
-// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase
-
-#ifndef PORT_ATOMIC_POINTER_H_
-#define PORT_ATOMIC_POINTER_H_
-
-#include <stdint.h>
-#ifdef ROCKSDB_ATOMIC_PRESENT
-#include <atomic>
-#endif
-#ifdef OS_WIN
-#include <windows.h>
-#endif
-#ifdef OS_MACOSX
-#include <libkern/OSAtomic.h>
-#endif
-
-#if defined(_M_X64) || defined(__x86_64__)
-#define ARCH_CPU_X86_FAMILY 1
-#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
-#define ARCH_CPU_X86_FAMILY 1
-#elif defined(__ARMEL__)
-#define ARCH_CPU_ARM_FAMILY 1
-#endif
-
-namespace rocksdb {
-namespace port {
-
-// Define MemoryBarrier() if available
-// Windows on x86
-#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY)
-// windows.h already provides a MemoryBarrier(void) macro
-// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
-#define ROCKSDB_HAVE_MEMORY_BARRIER
-
-// Gcc on x86
-#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
-inline void MemoryBarrier() {
-  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
-  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
-  __asm__ __volatile__("" : : : "memory");
-}
-#define ROCKSDB_HAVE_MEMORY_BARRIER
-
-// Sun Studio
-#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC)
-inline void MemoryBarrier() {
-  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
-  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
-  asm volatile("" : : : "memory");
-}
-#define ROCKSDB_HAVE_MEMORY_BARRIER
-
-// Mac OS
-#elif defined(OS_MACOSX)
-inline void MemoryBarrier() {
-  OSMemoryBarrier();
-}
-#define ROCKSDB_HAVE_MEMORY_BARRIER
-
-// ARM Linux
-#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
-typedef void (*LinuxKernelMemoryBarrierFunc)(void);
-// The Linux ARM kernel provides a highly optimized device-specific memory
-// barrier function at a fixed memory address that is mapped in every
-// user-level process.
-//
-// This beats using CPU-specific instructions which are, on single-core
-// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more
-// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking
-// shows that the extra function call cost is completely negligible on
-// multi-core devices.
-//
-inline void MemoryBarrier() {
-  (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)();
-}
-#define ROCKSDB_HAVE_MEMORY_BARRIER
-
-#endif
-
-// AtomicPointer built using platform-specific MemoryBarrier()
-#if defined(ROCKSDB_HAVE_MEMORY_BARRIER)
-class AtomicPointer {
- private:
-  void* rep_;
- public:
-  AtomicPointer() { }
-  explicit AtomicPointer(void* p) : rep_(p) {}
-  inline void* NoBarrier_Load() const { return rep_; }
-  inline void NoBarrier_Store(void* v) { rep_ = v; }
-  inline void* Acquire_Load() const {
-    void* result = rep_;
-    MemoryBarrier();
-    return result;
-  }
-  inline void Release_Store(void* v) {
-    MemoryBarrier();
-    rep_ = v;
-  }
-};
-
-// AtomicPointer based on <atomic>
-#elif defined(ROCKSDB_ATOMIC_PRESENT)
-class AtomicPointer {
- private:
-  std::atomic<void*> rep_;
- public:
-  AtomicPointer() { }
-  explicit AtomicPointer(void* v) : rep_(v) { }
-  inline void* Acquire_Load() const {
-    return rep_.load(std::memory_order_acquire);
-  }
-  inline void Release_Store(void* v) {
-    rep_.store(v, std::memory_order_release);
-  }
-  inline void* NoBarrier_Load() const {
-    return rep_.load(std::memory_order_relaxed);
-  }
-  inline void NoBarrier_Store(void* v) {
-    rep_.store(v, std::memory_order_relaxed);
-  }
-};
-
-// We have neither MemoryBarrier(), nor <cstdatomic>
-#else
-#error Please implement AtomicPointer for this platform.
-
-#endif
-
-#undef ROCKSDB_HAVE_MEMORY_BARRIER
-#undef ARCH_CPU_X86_FAMILY
-#undef ARCH_CPU_ARM_FAMILY
-
-}  // namespace port
-}  // namespace rocksdb
-
-#endif  // PORT_ATOMIC_POINTER_H_
diff --git a/src/rocksdb/port/port.h b/src/rocksdb/port/port.h
index 2dc9a0f..bc4b6a1 100644
--- a/src/rocksdb/port/port.h
+++ b/src/rocksdb/port/port.h
@@ -7,16 +7,14 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#ifndef STORAGE_LEVELDB_PORT_PORT_H_
-#define STORAGE_LEVELDB_PORT_PORT_H_
+#pragma once
 
-#include <string.h>
+#include <string>
 
 // Include the appropriate platform specific file below.  If you are
 // porting to a new platform, see "port_example.h" for documentation
 // of what the new port_<platform>.h file must provide.
 #if defined(ROCKSDB_PLATFORM_POSIX)
-#  include "port/port_posix.h"
+#include "port/port_posix.h"
 #endif
 
-#endif  // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/src/rocksdb/port/port_example.h b/src/rocksdb/port/port_example.h
index f124abb..ba14618 100644
--- a/src/rocksdb/port/port_example.h
+++ b/src/rocksdb/port/port_example.h
@@ -75,35 +75,6 @@ typedef intptr_t OnceType;
 #define LEVELDB_ONCE_INIT 0
 extern void InitOnce(port::OnceType*, void (*initializer)());
 
-// A type that holds a pointer that can be read or written atomically
-// (i.e., without word-tearing.)
-class AtomicPointer {
- private:
-  intptr_t rep_;
- public:
-  // Initialize to arbitrary value
-  AtomicPointer();
-
-  // Initialize to hold v
-  explicit AtomicPointer(void* v) : rep_(v) { }
-
-  // Read and return the stored pointer with the guarantee that no
-  // later memory access (read or write) by this thread can be
-  // reordered ahead of this read.
-  void* Acquire_Load() const;
-
-  // Set v as the stored pointer with the guarantee that no earlier
-  // memory access (read or write) by this thread can be reordered
-  // after this store.
-  void Release_Store(void* v);
-
-  // Read the stored pointer with no ordering guarantees.
-  void* NoBarrier_Load() const;
-
-  // Set va as the stored pointer with no ordering guarantees.
-  void NoBarrier_Store(void* v);
-};
-
 // ------------------ Compression -------------------
 
 // Store the snappy compression of "input[0,input_length-1]" in *output.
diff --git a/src/rocksdb/port/port_posix.cc b/src/rocksdb/port/port_posix.cc
index 911cebd..a8cffcc 100644
--- a/src/rocksdb/port/port_posix.cc
+++ b/src/rocksdb/port/port_posix.cc
@@ -9,26 +9,29 @@
 
 #include "port/port_posix.h"
 
-#include <cstdlib>
 #include <stdio.h>
 #include <assert.h>
+#include <errno.h>
+#include <sys/time.h>
 #include <string.h>
+#include <cstdlib>
 #include "util/logging.h"
 
 namespace rocksdb {
 namespace port {
 
-static void PthreadCall(const char* label, int result) {
-  if (result != 0) {
+static int PthreadCall(const char* label, int result) {
+  if (result != 0 && result != ETIMEDOUT) {
     fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
     abort();
   }
+  return result;
 }
 
 Mutex::Mutex(bool adaptive) {
 #ifdef OS_LINUX
   if (!adaptive) {
-    PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL));
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr));
   } else {
     pthread_mutexattr_t mutex_attr;
     PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr));
@@ -40,7 +43,7 @@ Mutex::Mutex(bool adaptive) {
                 pthread_mutexattr_destroy(&mutex_attr));
   }
 #else // ignore adaptive for non-linux platform
-  PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL));
+  PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr));
 #endif // OS_LINUX
 }
 
@@ -68,7 +71,7 @@ void Mutex::AssertHeld() {
 
 CondVar::CondVar(Mutex* mu)
     : mu_(mu) {
-    PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+    PthreadCall("init cv", pthread_cond_init(&cv_, nullptr));
 }
 
 CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
@@ -83,6 +86,27 @@ void CondVar::Wait() {
 #endif
 }
 
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+  struct timespec ts;
+  ts.tv_sec = static_cast<time_t>(abs_time_us / 1000000);
+  ts.tv_nsec = static_cast<suseconds_t>((abs_time_us % 1000000) * 1000);
+
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  int err = pthread_cond_timedwait(&cv_, &mu_->mu_, &ts);
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+  if (err == ETIMEDOUT) {
+    return true;
+  }
+  if (err != 0) {
+    PthreadCall("timedwait", err);
+  }
+  return false;
+}
+
 void CondVar::Signal() {
   PthreadCall("signal", pthread_cond_signal(&cv_));
 }
@@ -91,7 +115,9 @@ void CondVar::SignalAll() {
   PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
 }
 
-RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); }
+RWMutex::RWMutex() {
+  PthreadCall("init mutex", pthread_rwlock_init(&mu_, nullptr));
+}
 
 RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); }
 
@@ -99,7 +125,9 @@ void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_))
 
 void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); }
 
-void RWMutex::Unlock() { PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); }
+void RWMutex::ReadUnlock() { PthreadCall("read unlock", pthread_rwlock_unlock(&mu_)); }
+
+void RWMutex::WriteUnlock() { PthreadCall("write unlock", pthread_rwlock_unlock(&mu_)); }
 
 void InitOnce(OnceType* once, void (*initializer)()) {
   PthreadCall("once", pthread_once(once, initializer));
diff --git a/src/rocksdb/port/port_posix.h b/src/rocksdb/port/port_posix.h
index d20a5df..dbb6e17 100644
--- a/src/rocksdb/port/port_posix.h
+++ b/src/rocksdb/port/port_posix.h
@@ -9,8 +9,7 @@
 //
 // See port_example.h for documentation for the following types/functions.
 
-#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
-#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+#pragma once
 
 #undef PLATFORM_IS_LITTLE_ENDIAN
 #if defined(OS_MACOSX)
@@ -26,7 +25,11 @@
   #else
     #define PLATFORM_IS_LITTLE_ENDIAN false
   #endif
-#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
+#elif defined(OS_FREEBSD)
+  #include <sys/endian.h>
+  #include <sys/types.h>
+  #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN)
+#elif defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
       defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
   #include <sys/types.h>
   #include <sys/endian.h>
@@ -34,28 +37,10 @@
   #include <endian.h>
 #endif
 #include <pthread.h>
-#ifdef SNAPPY
-#include <snappy.h>
-#endif
-
-#ifdef ZLIB
-#include <zlib.h>
-#endif
-
-#ifdef BZIP2
-#include <bzlib.h>
-#endif
-
-#if defined(LZ4)
-#include <lz4.h>
-#include <lz4hc.h>
-#endif
 
 #include <stdint.h>
 #include <string>
 #include <string.h>
-#include "rocksdb/options.h"
-#include "port/atomic_pointer.h"
 
 #ifndef PLATFORM_IS_LITTLE_ENDIAN
 #define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
@@ -120,7 +105,8 @@ class RWMutex {
 
   void ReadLock();
   void WriteLock();
-  void Unlock();
+  void ReadUnlock();
+  void WriteUnlock();
   void AssertHeld() { }
 
  private:
@@ -136,6 +122,8 @@ class CondVar {
   explicit CondVar(Mutex* mu);
   ~CondVar();
   void Wait();
+  // Timed condition wait.  Returns true if timeout occurred.
+  bool TimedWait(uint64_t abs_time_us);
   void Signal();
   void SignalAll();
  private:
@@ -147,342 +135,10 @@ typedef pthread_once_t OnceType;
 #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT
 extern void InitOnce(OnceType* once, void (*initializer)());
 
-inline bool Snappy_Compress(const CompressionOptions& opts, const char* input,
-                            size_t length, ::std::string* output) {
-#ifdef SNAPPY
-  output->resize(snappy::MaxCompressedLength(length));
-  size_t outlen;
-  snappy::RawCompress(input, length, &(*output)[0], &outlen);
-  output->resize(outlen);
-  return true;
-#endif
-
-  return false;
-}
-
-inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
-                                         size_t* result) {
-#ifdef SNAPPY
-  return snappy::GetUncompressedLength(input, length, result);
-#else
-  return false;
-#endif
-}
-
-inline bool Snappy_Uncompress(const char* input, size_t length,
-                              char* output) {
-#ifdef SNAPPY
-  return snappy::RawUncompress(input, length, output);
-#else
-  return false;
-#endif
-}
-
-inline bool Zlib_Compress(const CompressionOptions& opts, const char* input,
-                          size_t length, ::std::string* output) {
-#ifdef ZLIB
-  // The memLevel parameter specifies how much memory should be allocated for
-  // the internal compression state.
-  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
-  // memLevel=9 uses maximum memory for optimal speed.
-  // The default value is 8. See zconf.h for more details.
-  static const int memLevel = 8;
-  z_stream _stream;
-  memset(&_stream, 0, sizeof(z_stream));
-  int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits,
-                        memLevel, opts.strategy);
-  if (st != Z_OK) {
-    return false;
-  }
-
-  // Resize output to be the plain data length.
-  // This may not be big enough if the compression actually expands data.
-  output->resize(length);
-
-  // Compress the input, and put compressed data in output.
-  _stream.next_in = (Bytef *)input;
-  _stream.avail_in = length;
-
-  // Initialize the output size.
-  _stream.avail_out = length;
-  _stream.next_out = (Bytef *)&(*output)[0];
-
-  int old_sz =0, new_sz =0, new_sz_delta =0;
-  bool done = false;
-  while (!done) {
-    int st = deflate(&_stream, Z_FINISH);
-    switch (st) {
-      case Z_STREAM_END:
-        done = true;
-        break;
-      case Z_OK:
-        // No output space. Increase the output space by 20%.
-        // (Should we fail the compression since it expands the size?)
-        old_sz = output->size();
-        new_sz_delta = (int)(output->size() * 0.2);
-        new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta);
-        output->resize(new_sz);
-        // Set more output.
-        _stream.next_out = (Bytef *)&(*output)[old_sz];
-        _stream.avail_out = new_sz - old_sz;
-        break;
-      case Z_BUF_ERROR:
-      default:
-        deflateEnd(&_stream);
-        return false;
-    }
-  }
-
-  output->resize(output->size() - _stream.avail_out);
-  deflateEnd(&_stream);
-  return true;
-#endif
-  return false;
-}
-
-inline char* Zlib_Uncompress(const char* input_data, size_t input_length,
-    int* decompress_size, int windowBits = -14) {
-#ifdef ZLIB
-  z_stream _stream;
-  memset(&_stream, 0, sizeof(z_stream));
-
-  // For raw inflate, the windowBits should be -8..-15.
-  // If windowBits is bigger than zero, it will use either zlib
-  // header or gzip header. Adding 32 to it will do automatic detection.
-  int st = inflateInit2(&_stream,
-      windowBits > 0 ? windowBits + 32 : windowBits);
-  if (st != Z_OK) {
-    return nullptr;
-  }
-
-  _stream.next_in = (Bytef *)input_data;
-  _stream.avail_in = input_length;
-
-  // Assume the decompressed data size will 5x of compressed size.
-  int output_len = input_length * 5;
-  char* output = new char[output_len];
-  int old_sz = output_len;
-
-  _stream.next_out = (Bytef *)output;
-  _stream.avail_out = output_len;
-
-  char* tmp = nullptr;
-  int output_len_delta;
-  bool done = false;
-
-  //while(_stream.next_in != nullptr && _stream.avail_in != 0) {
-  while (!done) {
-    int st = inflate(&_stream, Z_SYNC_FLUSH);
-    switch (st) {
-      case Z_STREAM_END:
-        done = true;
-        break;
-      case Z_OK:
-        // No output space. Increase the output space by 20%.
-        old_sz = output_len;
-        output_len_delta = (int)(output_len * 0.2);
-        output_len += output_len_delta < 10 ? 10 : output_len_delta;
-        tmp = new char[output_len];
-        memcpy(tmp, output, old_sz);
-        delete[] output;
-        output = tmp;
-
-        // Set more output.
-        _stream.next_out = (Bytef *)(output + old_sz);
-        _stream.avail_out = output_len - old_sz;
-        break;
-      case Z_BUF_ERROR:
-      default:
-        delete[] output;
-        inflateEnd(&_stream);
-        return nullptr;
-    }
-  }
-
-  *decompress_size = output_len - _stream.avail_out;
-  inflateEnd(&_stream);
-  return output;
-#endif
-
-  return nullptr;
-}
-
-inline bool BZip2_Compress(const CompressionOptions& opts, const char* input,
-                           size_t length, ::std::string* output) {
-#ifdef BZIP2
-  bz_stream _stream;
-  memset(&_stream, 0, sizeof(bz_stream));
-
-  // Block size 1 is 100K.
-  // 0 is for silent.
-  // 30 is the default workFactor
-  int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
-  if (st != BZ_OK) {
-    return false;
-  }
-
-  // Resize output to be the plain data length.
-  // This may not be big enough if the compression actually expands data.
-  output->resize(length);
-
-  // Compress the input, and put compressed data in output.
-  _stream.next_in = (char *)input;
-  _stream.avail_in = length;
-
-  // Initialize the output size.
-  _stream.next_out = (char *)&(*output)[0];
-  _stream.avail_out = length;
-
-  int old_sz =0, new_sz =0;
-  while(_stream.next_in != nullptr && _stream.avail_in != 0) {
-    int st = BZ2_bzCompress(&_stream, BZ_FINISH);
-    switch (st) {
-      case BZ_STREAM_END:
-        break;
-      case BZ_FINISH_OK:
-        // No output space. Increase the output space by 20%.
-        // (Should we fail the compression since it expands the size?)
-        old_sz = output->size();
-        new_sz = (int)(output->size() * 1.2);
-        output->resize(new_sz);
-        // Set more output.
-        _stream.next_out = (char *)&(*output)[old_sz];
-        _stream.avail_out = new_sz - old_sz;
-        break;
-      case BZ_SEQUENCE_ERROR:
-      default:
-        BZ2_bzCompressEnd(&_stream);
-        return false;
-    }
-  }
-
-  output->resize(output->size() - _stream.avail_out);
-  BZ2_bzCompressEnd(&_stream);
-  return true;
-#endif
-  return false;
-}
-
-inline char* BZip2_Uncompress(const char* input_data, size_t input_length,
-                              int* decompress_size) {
-#ifdef BZIP2
-  bz_stream _stream;
-  memset(&_stream, 0, sizeof(bz_stream));
-
-  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
-  if (st != BZ_OK) {
-    return nullptr;
-  }
-
-  _stream.next_in = (char *)input_data;
-  _stream.avail_in = input_length;
-
-  // Assume the decompressed data size will be 5x of compressed size.
-  int output_len = input_length * 5;
-  char* output = new char[output_len];
-  int old_sz = output_len;
-
-  _stream.next_out = (char *)output;
-  _stream.avail_out = output_len;
-
-  char* tmp = nullptr;
-
-  while(_stream.next_in != nullptr && _stream.avail_in != 0) {
-    int st = BZ2_bzDecompress(&_stream);
-    switch (st) {
-      case BZ_STREAM_END:
-        break;
-      case BZ_OK:
-        // No output space. Increase the output space by 20%.
-        old_sz = output_len;
-        output_len = (int)(output_len * 1.2);
-        tmp = new char[output_len];
-        memcpy(tmp, output, old_sz);
-        delete[] output;
-        output = tmp;
-
-        // Set more output.
-        _stream.next_out = (char *)(output + old_sz);
-        _stream.avail_out = output_len - old_sz;
-        break;
-      default:
-        delete[] output;
-        BZ2_bzDecompressEnd(&_stream);
-        return nullptr;
-    }
-  }
-
-  *decompress_size = output_len - _stream.avail_out;
-  BZ2_bzDecompressEnd(&_stream);
-  return output;
-#endif
-  return nullptr;
-}
-
-inline bool LZ4_Compress(const CompressionOptions &opts, const char *input,
-                         size_t length, ::std::string* output) {
-#ifdef LZ4
-  int compressBound = LZ4_compressBound(length);
-  output->resize(8 + compressBound);
-  char *p = const_cast<char *>(output->c_str());
-  memcpy(p, &length, sizeof(length));
-  size_t outlen;
-  outlen = LZ4_compress_limitedOutput(input, p + 8, length, compressBound);
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(8 + outlen);
-  return true;
-#endif
-  return false;
-}
-
-inline char* LZ4_Uncompress(const char* input_data, size_t input_length,
-                            int* decompress_size) {
-#ifdef LZ4
-  if (input_length < 8) {
-    return nullptr;
-  }
-  int output_len;
-  memcpy(&output_len, input_data, sizeof(output_len));
-  char *output = new char[output_len];
-  *decompress_size = LZ4_decompress_safe_partial(
-      input_data + 8, output, input_length - 8, output_len, output_len);
-  if (*decompress_size < 0) {
-    delete[] output;
-    return nullptr;
-  }
-  return output;
-#endif
-  return nullptr;
-}
-
-inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input,
-                           size_t length, ::std::string* output) {
-#ifdef LZ4
-  int compressBound = LZ4_compressBound(length);
-  output->resize(8 + compressBound);
-  char *p = const_cast<char *>(output->c_str());
-  memcpy(p, &length, sizeof(length));
-  size_t outlen;
-#ifdef LZ4_VERSION_MAJOR  // they only started defining this since r113
-  outlen = LZ4_compressHC2_limitedOutput(input, p + 8, length, compressBound,
-                                         opts.level);
-#else
-  outlen = LZ4_compressHC_limitedOutput(input, p + 8, length, compressBound);
-#endif
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(8 + outlen);
-  return true;
-#endif
-  return false;
-}
-
 #define CACHE_LINE_SIZE 64U
 
+#define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
+
 } // namespace port
 } // namespace rocksdb
 
-#endif  // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
diff --git a/src/rocksdb/port/stack_trace.cc b/src/rocksdb/port/stack_trace.cc
index 76866e6..e2211e9 100644
--- a/src/rocksdb/port/stack_trace.cc
+++ b/src/rocksdb/port/stack_trace.cc
@@ -5,15 +5,17 @@
 //
 #include "port/stack_trace.h"
 
-namespace rocksdb {
-namespace port {
-
-#if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX))
+#if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX)) || \
+    defined(CYGWIN)
 
 // noop
 
+namespace rocksdb {
+namespace port {
 void InstallStackTraceHandler() {}
 void PrintStack(int first_frames_to_skip) {}
+}  // namespace port
+}  // namespace rocksdb
 
 #else
 
@@ -25,6 +27,9 @@ void PrintStack(int first_frames_to_skip) {}
 #include <unistd.h>
 #include <cxxabi.h>
 
+namespace rocksdb {
+namespace port {
+
 namespace {
 
 #ifdef OS_LINUX
@@ -33,7 +38,7 @@ const char* GetExecutableName() {
 
   char link[1024];
   snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
-  auto read = readlink(link, name, sizeof(name));
+  auto read = readlink(link, name, sizeof(name) - 1);
   if (-1 == read) {
     return nullptr;
   } else {
@@ -67,7 +72,7 @@ void PrintStackTraceLine(const char* symbol, void* frame) {
 
   fprintf(stderr, "\n");
 }
-#elif OS_MACOSX
+#elif defined(OS_MACOSX)
 
 void PrintStackTraceLine(const char* symbol, void* frame) {
   static int pid = getpid();
@@ -126,7 +131,7 @@ void InstallStackTraceHandler() {
   signal(SIGABRT, StackTraceHandler);
 }
 
-#endif
-
 }  // namespace port
 }  // namespace rocksdb
+
+#endif
diff --git a/src/rocksdb/table/adaptive_table_factory.cc b/src/rocksdb/table/adaptive_table_factory.cc
new file mode 100644
index 0000000..dcc8406
--- /dev/null
+++ b/src/rocksdb/table/adaptive_table_factory.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/adaptive_table_factory.h"
+
+#include "table/format.h"
+
+namespace rocksdb {
+
+AdaptiveTableFactory::AdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write,
+    std::shared_ptr<TableFactory> block_based_table_factory,
+    std::shared_ptr<TableFactory> plain_table_factory,
+    std::shared_ptr<TableFactory> cuckoo_table_factory)
+    : table_factory_to_write_(table_factory_to_write),
+      block_based_table_factory_(block_based_table_factory),
+      plain_table_factory_(plain_table_factory),
+      cuckoo_table_factory_(cuckoo_table_factory) {
+  if (!table_factory_to_write_) {
+    table_factory_to_write_ = block_based_table_factory_;
+  }
+  if (!plain_table_factory_) {
+    plain_table_factory_.reset(NewPlainTableFactory());
+  }
+  if (!block_based_table_factory_) {
+    block_based_table_factory_.reset(NewBlockBasedTableFactory());
+  }
+  if (!cuckoo_table_factory_) {
+    cuckoo_table_factory_.reset(NewCuckooTableFactory());
+  }
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kCuckooTableMagicNumber;
+
+Status AdaptiveTableFactory::NewTableReader(
+    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+    const InternalKeyComparator& icomp, unique_ptr<RandomAccessFile>&& file,
+    uint64_t file_size, unique_ptr<TableReader>* table) const {
+  Footer footer;
+  auto s = ReadFooterFromFile(file.get(), file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+  if (footer.table_magic_number() == kPlainTableMagicNumber ||
+      footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
+    return plain_table_factory_->NewTableReader(
+        ioptions, env_options, icomp, std::move(file), file_size, table);
+  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
+      footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
+    return block_based_table_factory_->NewTableReader(
+        ioptions, env_options, icomp, std::move(file), file_size, table);
+  } else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
+    return cuckoo_table_factory_->NewTableReader(
+        ioptions, env_options, icomp, std::move(file), file_size, table);
+  } else {
+    return Status::NotSupported("Unidentified table format");
+  }
+}
+
+TableBuilder* AdaptiveTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFile* file) const {
+  return table_factory_to_write_->NewTableBuilder(table_builder_options, file);
+}
+
+std::string AdaptiveTableFactory::GetPrintableTableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  if (!table_factory_to_write_) {
+    snprintf(buffer, kBufferSize, "  write factory (%s) options:\n%s\n",
+             table_factory_to_write_->Name(),
+             table_factory_to_write_->GetPrintableTableOptions().c_str());
+    ret.append(buffer);
+  }
+  if (!plain_table_factory_) {
+    snprintf(buffer, kBufferSize, "  %s options:\n%s\n",
+             plain_table_factory_->Name(),
+             plain_table_factory_->GetPrintableTableOptions().c_str());
+    ret.append(buffer);
+  }
+  if (!block_based_table_factory_) {
+    snprintf(buffer, kBufferSize, "  %s options:\n%s\n",
+             block_based_table_factory_->Name(),
+             block_based_table_factory_->GetPrintableTableOptions().c_str());
+    ret.append(buffer);
+  }
+  if (!cuckoo_table_factory_) {
+    snprintf(buffer, kBufferSize, "  %s options:\n%s\n",
+             cuckoo_table_factory_->Name(),
+             cuckoo_table_factory_->GetPrintableTableOptions().c_str());
+    ret.append(buffer);
+  }
+  return ret;
+}
+
+extern TableFactory* NewAdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write,
+    std::shared_ptr<TableFactory> block_based_table_factory,
+    std::shared_ptr<TableFactory> plain_table_factory,
+    std::shared_ptr<TableFactory> cuckoo_table_factory) {
+  return new AdaptiveTableFactory(table_factory_to_write,
+      block_based_table_factory, plain_table_factory, cuckoo_table_factory);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/adaptive_table_factory.h b/src/rocksdb/table/adaptive_table_factory.h
new file mode 100644
index 0000000..aa0f827
--- /dev/null
+++ b/src/rocksdb/table/adaptive_table_factory.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+struct EnvOptions;
+
+using std::unique_ptr;
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+class AdaptiveTableFactory : public TableFactory {
+ public:
+  ~AdaptiveTableFactory() {}
+
+  explicit AdaptiveTableFactory(
+      std::shared_ptr<TableFactory> table_factory_to_write,
+      std::shared_ptr<TableFactory> block_based_table_factory,
+      std::shared_ptr<TableFactory> plain_table_factory,
+      std::shared_ptr<TableFactory> cuckoo_table_factory);
+
+  const char* Name() const override { return "AdaptiveTableFactory"; }
+
+  Status NewTableReader(
+      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+      const InternalKeyComparator& internal_comparator,
+      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+      unique_ptr<TableReader>* table) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFile* file) const override;
+
+  // Sanitizes the specified DB Options.
+  Status SanitizeOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override {
+    if (db_opts.allow_mmap_reads == false) {
+      return Status::NotSupported(
+          "AdaptiveTable with allow_mmap_reads == false is not supported.");
+    }
+    return Status::OK();
+  }
+
+  std::string GetPrintableTableOptions() const override;
+
+ private:
+  std::shared_ptr<TableFactory> table_factory_to_write_;
+  std::shared_ptr<TableFactory> block_based_table_factory_;
+  std::shared_ptr<TableFactory> plain_table_factory_;
+  std::shared_ptr<TableFactory> cuckoo_table_factory_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/block.cc b/src/rocksdb/table/block.cc
index 6a6751c..6a5ede6 100644
--- a/src/rocksdb/table/block.cc
+++ b/src/rocksdb/table/block.cc
@@ -17,42 +17,14 @@
 #include <vector>
 
 #include "rocksdb/comparator.h"
-#include "table/block_hash_index.h"
 #include "table/format.h"
+#include "table/block_hash_index.h"
+#include "table/block_prefix_index.h"
 #include "util/coding.h"
 #include "util/logging.h"
 
 namespace rocksdb {
 
-uint32_t Block::NumRestarts() const {
-  assert(size_ >= 2*sizeof(uint32_t));
-  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
-}
-
-Block::Block(const BlockContents& contents)
-    : data_(contents.data.data()),
-      size_(contents.data.size()),
-      owned_(contents.heap_allocated),
-      cachable_(contents.cachable),
-      compression_type_(contents.compression_type) {
-  if (size_ < sizeof(uint32_t)) {
-    size_ = 0;  // Error marker
-  } else {
-    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
-    if (restart_offset_ > size_ - sizeof(uint32_t)) {
-      // The size is too small for NumRestarts() and therefore
-      // restart_offset_ wrapped around.
-      size_ = 0;
-    }
-  }
-}
-
-Block::~Block() {
-  if (owned_) {
-    delete[] data_;
-  }
-}
-
 // Helper routine: decode the next block entry starting at "p",
 // storing the number of shared key bytes, non_shared key bytes,
 // and the length of the value in "*shared", "*non_shared", and
@@ -83,134 +55,85 @@ static inline const char* DecodeEntry(const char* p, const char* limit,
   return p;
 }
 
-class Block::Iter : public Iterator {
- private:
-  const Comparator* const comparator_;
-  const char* const data_;      // underlying block contents
-  uint32_t const restarts_;     // Offset of restart array (list of fixed32)
-  uint32_t const num_restarts_; // Number of uint32_t entries in restart array
-
-  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
-  uint32_t current_;
-  uint32_t restart_index_;  // Index of restart block in which current_ falls
-  std::string key_;
-  Slice value_;
-  Status status_;
-  BlockHashIndex* hash_index_;
-
-  inline int Compare(const Slice& a, const Slice& b) const {
-    return comparator_->Compare(a, b);
-  }
-
-  // Return the offset in data_ just past the end of the current entry.
-  inline uint32_t NextEntryOffset() const {
-    return (value_.data() + value_.size()) - data_;
-  }
-
-  uint32_t GetRestartPoint(uint32_t index) {
-    assert(index < num_restarts_);
-    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
-  }
+void BlockIter::Next() {
+  assert(Valid());
+  ParseNextKey();
+}
 
-  void SeekToRestartPoint(uint32_t index) {
-    key_.clear();
-    restart_index_ = index;
-    // current_ will be fixed by ParseNextKey();
+void BlockIter::Prev() {
+  assert(Valid());
 
-    // ParseNextKey() starts at the end of value_, so set value_ accordingly
-    uint32_t offset = GetRestartPoint(index);
-    value_ = Slice(data_ + offset, 0);
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
   }
 
- public:
-  Iter(const Comparator* comparator, const char* data, uint32_t restarts,
-       uint32_t num_restarts, BlockHashIndex* hash_index)
-      : comparator_(comparator),
-        data_(data),
-        restarts_(restarts),
-        num_restarts_(num_restarts),
-        current_(restarts_),
-        restart_index_(num_restarts_),
-        hash_index_(hash_index) {
-    assert(num_restarts_ > 0);
-  }
+  SeekToRestartPoint(restart_index_);
+  do {
+    // Loop until end of current entry hits the start of original entry
+  } while (ParseNextKey() && NextEntryOffset() < original);
+}
 
-  virtual bool Valid() const { return current_ < restarts_; }
-  virtual Status status() const { return status_; }
-  virtual Slice key() const {
-    assert(Valid());
-    return key_;
+void BlockIter::Seek(const Slice& target) {
+  if (data_ == nullptr) {  // Not init yet
+    return;
   }
-  virtual Slice value() const {
-    assert(Valid());
-    return value_;
+  uint32_t index = 0;
+  bool ok = false;
+  if (prefix_index_) {
+    ok = PrefixSeek(target, &index);
+  } else {
+    ok = hash_index_ ? HashSeek(target, &index)
+      : BinarySeek(target, 0, num_restarts_ - 1, &index);
   }
 
-  virtual void Next() {
-    assert(Valid());
-    ParseNextKey();
+  if (!ok) {
+    return;
   }
+  SeekToRestartPoint(index);
+  // Linear search (within restart block) for first key >= target
 
-  virtual void Prev() {
-    assert(Valid());
-
-    // Scan backwards to a restart point before current_
-    const uint32_t original = current_;
-    while (GetRestartPoint(restart_index_) >= original) {
-      if (restart_index_ == 0) {
-        // No more entries
-        current_ = restarts_;
-        restart_index_ = num_restarts_;
-        return;
-      }
-      restart_index_--;
-    }
-
-    SeekToRestartPoint(restart_index_);
-    do {
-      // Loop until end of current entry hits the start of original entry
-    } while (ParseNextKey() && NextEntryOffset() < original);
-  }
-
-  virtual void Seek(const Slice& target) {
-    uint32_t index = 0;
-    bool ok = hash_index_ ? HashSeek(target, &index)
-                          : BinarySeek(target, 0, num_restarts_ - 1, &index);
-
-    if (!ok) {
+  while (true) {
+    if (!ParseNextKey() || Compare(key_.GetKey(), target) >= 0) {
       return;
     }
-    SeekToRestartPoint(index);
-    // Linear search (within restart block) for first key >= target
-
-    while (true) {
-      if (!ParseNextKey() || Compare(key_, target) >= 0) {
-        return;
-      }
-    }
-  }
-  virtual void SeekToFirst() {
-    SeekToRestartPoint(0);
-    ParseNextKey();
   }
+}
 
-  virtual void SeekToLast() {
-    SeekToRestartPoint(num_restarts_ - 1);
-    while (ParseNextKey() && NextEntryOffset() < restarts_) {
-      // Keep skipping
-    }
+void BlockIter::SeekToFirst() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
   }
+  SeekToRestartPoint(0);
+  ParseNextKey();
+}
 
- private:
-  void CorruptionError() {
-    current_ = restarts_;
-    restart_index_ = num_restarts_;
-    status_ = Status::Corruption("bad entry in block");
-    key_.clear();
-    value_.clear();
+void BlockIter::SeekToLast() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(num_restarts_ - 1);
+  while (ParseNextKey() && NextEntryOffset() < restarts_) {
+    // Keep skipping
   }
+}
+
+void BlockIter::CorruptionError() {
+  current_ = restarts_;
+  restart_index_ = num_restarts_;
+  status_ = Status::Corruption("bad entry in block");
+  key_.Clear();
+  value_.clear();
+}
 
-  bool ParseNextKey() {
+bool BlockIter::ParseNextKey() {
     current_ = NextEntryOffset();
     const char* p = data_ + current_;
     const char* limit = data_ + restarts_;  // Restarts come right after data
@@ -224,12 +147,11 @@ class Block::Iter : public Iterator {
     // Decode next entry
     uint32_t shared, non_shared, value_length;
     p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
-    if (p == nullptr || key_.size() < shared) {
+    if (p == nullptr || key_.Size() < shared) {
       CorruptionError();
       return false;
     } else {
-      key_.resize(shared);
-      key_.append(p, non_shared);
+      key_.TrimAppend(shared, p, non_shared);
       value_ = Slice(p + non_shared, value_length);
       while (restart_index_ + 1 < num_restarts_ &&
              GetRestartPoint(restart_index_ + 1) < current_) {
@@ -238,70 +160,213 @@ class Block::Iter : public Iterator {
       return true;
     }
   }
-  // Binary search in restart array to find the first restart point
-  // with a key >= target
-  bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+
+// Binary search in restart array to find the first restart point
+// with a key >= target (TODO: this comment is inaccurate)
+bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                   uint32_t* index) {
-    assert(left <= right);
-
-    while (left < right) {
-      uint32_t mid = (left + right + 1) / 2;
-      uint32_t region_offset = GetRestartPoint(mid);
-      uint32_t shared, non_shared, value_length;
-      const char* key_ptr =
-          DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
-                      &non_shared, &value_length);
-      if (key_ptr == nullptr || (shared != 0)) {
-        CorruptionError();
-        return false;
-      }
-      Slice mid_key(key_ptr, non_shared);
-      if (Compare(mid_key, target) < 0) {
-        // Key at "mid" is smaller than "target". Therefore all
-        // blocks before "mid" are uninteresting.
-        left = mid;
-      } else {
-        // Key at "mid" is >= "target". Therefore all blocks at or
-        // after "mid" are uninteresting.
-        right = mid - 1;
-      }
+  assert(left <= right);
+
+  while (left < right) {
+    uint32_t mid = (left + right + 1) / 2;
+    uint32_t region_offset = GetRestartPoint(mid);
+    uint32_t shared, non_shared, value_length;
+    const char* key_ptr =
+        DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
+                    &non_shared, &value_length);
+    if (key_ptr == nullptr || (shared != 0)) {
+      CorruptionError();
+      return false;
     }
+    Slice mid_key(key_ptr, non_shared);
+    int cmp = Compare(mid_key, target);
+    if (cmp < 0) {
+      // Key at "mid" is smaller than "target". Therefore all
+      // blocks before "mid" are uninteresting.
+      left = mid;
+    } else if (cmp > 0) {
+      // Key at "mid" is >= "target". Therefore all blocks at or
+      // after "mid" are uninteresting.
+      right = mid - 1;
+    } else {
+      left = right = mid;
+    }
+  }
 
-    *index = left;
-    return true;
+  *index = left;
+  return true;
+}
+
+// Compare target key and the block key of the block of `block_index`.
+// Return -1 if error.
+int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
+  uint32_t region_offset = GetRestartPoint(block_index);
+  uint32_t shared, non_shared, value_length;
+  const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
+                                    &shared, &non_shared, &value_length);
+  if (key_ptr == nullptr || (shared != 0)) {
+    CorruptionError();
+    return 1;  // Return target is smaller
+  }
+  Slice block_key(key_ptr, non_shared);
+  return Compare(block_key, target);
+}
+
+// Binary search in block_ids to find the first block
+// with a key >= target
+bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
+                          uint32_t left, uint32_t right,
+                          uint32_t* index) {
+  assert(left <= right);
+  uint32_t left_bound = left;
+
+  while (left <= right) {
+    uint32_t mid = (left + right) / 2;
+
+    int cmp = CompareBlockKey(block_ids[mid], target);
+    if (!status_.ok()) {
+      return false;
+    }
+    if (cmp < 0) {
+      // Key at "target" is larger than "mid". Therefore all
+      // blocks before or at "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "target" is <= "mid". Therefore all blocks
+      // after "mid" are uninteresting.
+      // If there is only one block left, we found it.
+      if (left == right) break;
+      right = mid;
+    }
   }
 
-  bool HashSeek(const Slice& target, uint32_t* index) {
-    assert(hash_index_);
-    auto restart_index = hash_index_->GetRestartIndex(target);
-    if (restart_index == nullptr) {
+  if (left == right) {
+    // In one of the two following cases:
+    // (1) left is the first one of block_ids
+    // (2) there is a gap of blocks between block of `left` and `left-1`.
+    // we can further distinguish the case of key in the block or key not
+    // existing, by comparing the target key and the key of the previous
+    // block to the left of the block found.
+    if (block_ids[left] > 0 &&
+        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
+        CompareBlockKey(block_ids[left] - 1, target) > 0) {
       current_ = restarts_;
-      return 0;
+      return false;
     }
 
-    // the elements in restart_array[index : index + num_blocks]
-    // are all with same prefix. We'll do binary search in that small range.
-    auto left = restart_index->first_index;
-    auto right = restart_index->first_index + restart_index->num_blocks - 1;
-    return BinarySeek(target, left, right, index);
+    *index = block_ids[left];
+    return true;
+  } else {
+    assert(left > right);
+    // Mark iterator invalid
+    current_ = restarts_;
+    return false;
+  }
+}
+
+bool BlockIter::HashSeek(const Slice& target, uint32_t* index) {
+  assert(hash_index_);
+  auto restart_index = hash_index_->GetRestartIndex(target);
+  if (restart_index == nullptr) {
+    current_ = restarts_;
+    return false;
   }
-};
 
-Iterator* Block::NewIterator(const Comparator* cmp) {
+  // the elements in restart_array[index : index + num_blocks]
+  // are all with same prefix. We'll do binary search in that small range.
+  auto left = restart_index->first_index;
+  auto right = restart_index->first_index + restart_index->num_blocks - 1;
+  return BinarySeek(target, left, right, index);
+}
+
+bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
+  assert(prefix_index_);
+  uint32_t* block_ids = nullptr;
+  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
+
+  if (num_blocks == 0) {
+    current_ = restarts_;
+    return false;
+  } else  {
+    return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
+  }
+}
+
+uint32_t Block::NumRestarts() const {
+  assert(size_ >= 2*sizeof(uint32_t));
+  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+}
+
+Block::Block(BlockContents&& contents)
+    : contents_(std::move(contents)),
+      data_(contents_.data.data()),
+      size_(contents_.data.size()) {
+  if (size_ < sizeof(uint32_t)) {
+    size_ = 0;  // Error marker
+  } else {
+    restart_offset_ =
+        static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t);
+    if (restart_offset_ > size_ - sizeof(uint32_t)) {
+      // The size is too small for NumRestarts() and therefore
+      // restart_offset_ wrapped around.
+      size_ = 0;
+    }
+  }
+}
+
+Iterator* Block::NewIterator(
+    const Comparator* cmp, BlockIter* iter, bool total_order_seek) {
   if (size_ < 2*sizeof(uint32_t)) {
-    return NewErrorIterator(Status::Corruption("bad block contents"));
+    if (iter != nullptr) {
+      iter->SetStatus(Status::Corruption("bad block contents"));
+      return iter;
+    } else {
+      return NewErrorIterator(Status::Corruption("bad block contents"));
+    }
   }
   const uint32_t num_restarts = NumRestarts();
   if (num_restarts == 0) {
-    return NewEmptyIterator();
+    if (iter != nullptr) {
+      iter->SetStatus(Status::OK());
+      return iter;
+    } else {
+      return NewEmptyIterator();
+    }
   } else {
-    return new Iter(cmp, data_, restart_offset_, num_restarts,
-                    hash_index_.get());
+    BlockHashIndex* hash_index_ptr =
+        total_order_seek ? nullptr : hash_index_.get();
+    BlockPrefixIndex* prefix_index_ptr =
+        total_order_seek ? nullptr : prefix_index_.get();
+
+    if (iter != nullptr) {
+      iter->Initialize(cmp, data_, restart_offset_, num_restarts,
+                    hash_index_ptr, prefix_index_ptr);
+    } else {
+      iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
+                           hash_index_ptr, prefix_index_ptr);
+    }
   }
+
+  return iter;
 }
 
 void Block::SetBlockHashIndex(BlockHashIndex* hash_index) {
   hash_index_.reset(hash_index);
 }
 
+void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
+  prefix_index_.reset(prefix_index);
+}
+
+size_t Block::ApproximateMemoryUsage() const {
+  size_t usage = size();
+  if (hash_index_) {
+    usage += hash_index_->ApproximateMemoryUsage();
+  }
+  if (prefix_index_) {
+    usage += prefix_index_->ApproximateMemoryUsage();
+  }
+  return usage;
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block.h b/src/rocksdb/table/block.h
index b363d62..0187489 100644
--- a/src/rocksdb/table/block.h
+++ b/src/rocksdb/table/block.h
@@ -13,25 +13,34 @@
 
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
+#include "db/dbformat.h"
+#include "table/block_prefix_index.h"
+#include "table/block_hash_index.h"
+
+#include "format.h"
 
 namespace rocksdb {
 
 struct BlockContents;
 class Comparator;
+class BlockIter;
 class BlockHashIndex;
+class BlockPrefixIndex;
 
 class Block {
  public:
   // Initialize the block with the specified contents.
-  explicit Block(const BlockContents& contents);
+  explicit Block(BlockContents&& contents);
 
-  ~Block();
+  ~Block() = default;
 
   size_t size() const { return size_; }
   const char* data() const { return data_; }
-  bool cachable() const { return cachable_; }
+  bool cachable() const { return contents_.cachable; }
   uint32_t NumRestarts() const;
-  CompressionType compression_type() const { return compression_type_; }
+  CompressionType compression_type() const {
+    return contents_.compression_type;
+  }
 
   // If hash index lookup is enabled and `use_hash_index` is true. This block
   // will do hash lookup for the key prefix.
@@ -39,23 +48,153 @@ class Block {
   // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
   // the iterator will simply be set as "invalid", rather than returning
   // the key that is just pass the target key.
-  Iterator* NewIterator(const Comparator* comparator);
+  //
+  // If iter is null, return new Iterator
+  // If iter is not null, update this one and return it as Iterator*
+  //
+  // If total_order_seek is true, hash_index_ and prefix_index_ are ignored.
+  // This option only applies for index block. For data block, hash_index_
+  // and prefix_index_ are null, so this option does not matter.
+  Iterator* NewIterator(const Comparator* comparator,
+      BlockIter* iter = nullptr, bool total_order_seek = true);
   void SetBlockHashIndex(BlockHashIndex* hash_index);
+  void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
+
+  // Report an approximation of how much memory has been used.
+  size_t ApproximateMemoryUsage() const;
 
  private:
-  const char* data_;
-  size_t size_;
+  BlockContents contents_;
+  const char* data_;            // contents_.data.data()
+  size_t size_;                 // contents_.data.size()
   uint32_t restart_offset_;     // Offset in data_ of restart array
-  bool owned_;                  // Block owns data_[]
-  bool cachable_;
-  CompressionType compression_type_;
   std::unique_ptr<BlockHashIndex> hash_index_;
+  std::unique_ptr<BlockPrefixIndex> prefix_index_;
 
   // No copying allowed
   Block(const Block&);
   void operator=(const Block&);
+};
+
+class BlockIter : public Iterator {
+ public:
+  BlockIter()
+      : comparator_(nullptr),
+        data_(nullptr),
+        restarts_(0),
+        num_restarts_(0),
+        current_(0),
+        restart_index_(0),
+        status_(Status::OK()),
+        hash_index_(nullptr),
+        prefix_index_(nullptr) {}
+
+  BlockIter(const Comparator* comparator, const char* data, uint32_t restarts,
+       uint32_t num_restarts, BlockHashIndex* hash_index,
+       BlockPrefixIndex* prefix_index)
+      : BlockIter() {
+    Initialize(comparator, data, restarts, num_restarts,
+        hash_index, prefix_index);
+  }
+
+  void Initialize(const Comparator* comparator, const char* data,
+      uint32_t restarts, uint32_t num_restarts, BlockHashIndex* hash_index,
+      BlockPrefixIndex* prefix_index) {
+    assert(data_ == nullptr);           // Ensure it is called only once
+    assert(num_restarts > 0);           // Ensure the param is valid
+
+    comparator_ = comparator;
+    data_ = data;
+    restarts_ = restarts;
+    num_restarts_ = num_restarts;
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    hash_index_ = hash_index;
+    prefix_index_ = prefix_index;
+  }
+
+  void SetStatus(Status s) {
+    status_ = s;
+  }
+
+  virtual bool Valid() const override { return current_ < restarts_; }
+  virtual Status status() const override { return status_; }
+  virtual Slice key() const override {
+    assert(Valid());
+    return key_.GetKey();
+  }
+  virtual Slice value() const override {
+    assert(Valid());
+    return value_;
+  }
+
+  virtual void Next() override;
+
+  virtual void Prev() override;
+
+  virtual void Seek(const Slice& target) override;
+
+  virtual void SeekToFirst() override;
+
+  virtual void SeekToLast() override;
+
+ private:
+  const Comparator* comparator_;
+  const char* data_;       // underlying block contents
+  uint32_t restarts_;      // Offset of restart array (list of fixed32)
+  uint32_t num_restarts_;  // Number of uint32_t entries in restart array
+
+  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
+  uint32_t current_;
+  uint32_t restart_index_;  // Index of restart block in which current_ falls
+  IterKey key_;
+  Slice value_;
+  Status status_;
+  BlockHashIndex* hash_index_;
+  BlockPrefixIndex* prefix_index_;
+
+  inline int Compare(const Slice& a, const Slice& b) const {
+    return comparator_->Compare(a, b);
+  }
+
+  // Return the offset in data_ just past the end of the current entry.
+  inline uint32_t NextEntryOffset() const {
+    // NOTE: We don't support files bigger than 2GB
+    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
+  }
+
+  uint32_t GetRestartPoint(uint32_t index) {
+    assert(index < num_restarts_);
+    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+  }
+
+  void SeekToRestartPoint(uint32_t index) {
+    key_.Clear();
+    restart_index_ = index;
+    // current_ will be fixed by ParseNextKey();
+
+    // ParseNextKey() starts at the end of value_, so set value_ accordingly
+    uint32_t offset = GetRestartPoint(index);
+    value_ = Slice(data_ + offset, 0);
+  }
+
+  void CorruptionError();
+
+  bool ParseNextKey();
+
+  bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+                  uint32_t* index);
+
+  int CompareBlockKey(uint32_t block_index, const Slice& target);
+
+  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
+                            uint32_t left, uint32_t right,
+                            uint32_t* index);
+
+  bool HashSeek(const Slice& target, uint32_t* index);
+
+  bool PrefixSeek(const Slice& target, uint32_t* index);
 
-  class Iter;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block_based_filter_block.cc b/src/rocksdb/table/block_based_filter_block.cc
new file mode 100644
index 0000000..cd56028
--- /dev/null
+++ b/src/rocksdb/table/block_based_filter_block.cc
@@ -0,0 +1,255 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include "table/block_based_filter_block.h"
+
+#include "db/dbformat.h"
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+namespace {
+bool SamePrefix(const SliceTransform* prefix_extractor,
+                const Slice& key1, const Slice& key2) {
+  if (!prefix_extractor->InDomain(key1) &&
+      !prefix_extractor->InDomain(key2)) {
+    return true;
+  } else if (!prefix_extractor->InDomain(key1) ||
+             !prefix_extractor->InDomain(key2)) {
+    return false;
+  } else {
+    return (prefix_extractor->Transform(key1) ==
+            prefix_extractor->Transform(key2));
+  }
+}
+
+void AppendItem(std::string* props, const std::string& key,
+                const std::string& value) {
+  char cspace = ' ';
+  std::string value_str("");
+  size_t i = 0;
+  const size_t dataLength = 64;
+  const size_t tabLength = 2;
+  const size_t offLength = 16;
+
+  value_str.append(&value[i], std::min(size_t(dataLength), value.size()));
+  i += dataLength;
+  while (i < value.size()) {
+    value_str.append("\n");
+    value_str.append(offLength, cspace);
+    value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i));
+    i += dataLength;
+  }
+
+  std::string result("");
+  if (key.size() < (offLength - tabLength))
+    result.append(size_t((offLength - tabLength)) - key.size(), cspace);
+  result.append(key);
+
+  props->append(result + ": " + value_str + "\n");
+}
+
+template <class TKey>
+void AppendItem(std::string* props, const TKey& key, const std::string& value) {
+  std::string key_str = rocksdb::ToString(key);
+  AppendItem(props, key_str, value);
+}
+}  // namespace
+
+
+// See doc/table_format.txt for an explanation of the filter block format.
+
+// Generate new filter every 2KB of data
+static const size_t kFilterBaseLg = 11;
+static const size_t kFilterBase = 1 << kFilterBaseLg;
+
+BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder(
+    const SliceTransform* prefix_extractor,
+    const BlockBasedTableOptions& table_opt)
+    : policy_(table_opt.filter_policy.get()),
+      prefix_extractor_(prefix_extractor),
+      whole_key_filtering_(table_opt.whole_key_filtering) {
+  assert(policy_);
+}
+
+void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
+  uint64_t filter_index = (block_offset / kFilterBase);
+  assert(filter_index >= filter_offsets_.size());
+  while (filter_index > filter_offsets_.size()) {
+    GenerateFilter();
+  }
+}
+
+void BlockBasedFilterBlockBuilder::Add(const Slice& key) {
+  added_to_start_ = 0;
+  if (whole_key_filtering_) {
+    AddKey(key);
+    added_to_start_ = 1;
+  }
+  if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
+    AddPrefix(key);
+  }
+}
+
+// Add key to filter if needed
+inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
+  start_.push_back(entries_.size());
+  entries_.append(key.data(), key.size());
+}
+
+// Add prefix to filter if needed
+inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {
+  // get slice for most recently added entry
+  Slice prev;
+  if (start_.size() > added_to_start_) {
+    size_t prev_start = start_[start_.size() - 1 - added_to_start_];
+    const char* base = entries_.data() + prev_start;
+    size_t length = entries_.size() - prev_start;
+    prev = Slice(base, length);
+  }
+
+  // this assumes prefix(prefix(key)) == prefix(key), as the last
+  // entry in entries_ may be either a key or prefix, and we use
+  // prefix(last entry) to get the prefix of the last key.
+  if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) {
+    Slice prefix = prefix_extractor_->Transform(key);
+    start_.push_back(entries_.size());
+    entries_.append(prefix.data(), prefix.size());
+  }
+}
+
+Slice BlockBasedFilterBlockBuilder::Finish() {
+  if (!start_.empty()) {
+    GenerateFilter();
+  }
+
+  // Append array of per-filter offsets
+  const uint32_t array_offset = static_cast<uint32_t>(result_.size());
+  for (size_t i = 0; i < filter_offsets_.size(); i++) {
+    PutFixed32(&result_, filter_offsets_[i]);
+  }
+
+  PutFixed32(&result_, array_offset);
+  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
+  return Slice(result_);
+}
+
+void BlockBasedFilterBlockBuilder::GenerateFilter() {
+  const size_t num_entries = start_.size();
+  if (num_entries == 0) {
+    // Fast path if there are no keys for this filter
+    filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
+    return;
+  }
+
+  // Make list of keys from flattened key structure
+  start_.push_back(entries_.size());  // Simplify length computation
+  tmp_entries_.resize(num_entries);
+  for (size_t i = 0; i < num_entries; i++) {
+    const char* base = entries_.data() + start_[i];
+    size_t length = start_[i + 1] - start_[i];
+    tmp_entries_[i] = Slice(base, length);
+  }
+
+  // Generate filter for current set of keys and append to result_.
+  filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
+  policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries),
+                        &result_);
+
+  tmp_entries_.clear();
+  entries_.clear();
+  start_.clear();
+}
+
+BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
+    const SliceTransform* prefix_extractor,
+    const BlockBasedTableOptions& table_opt, bool whole_key_filtering,
+    BlockContents&& contents)
+    : policy_(table_opt.filter_policy.get()),
+      prefix_extractor_(prefix_extractor),
+      whole_key_filtering_(whole_key_filtering),
+      data_(nullptr),
+      offset_(nullptr),
+      num_(0),
+      base_lg_(0),
+      contents_(std::move(contents)) {
+  assert(policy_);
+  size_t n = contents_.data.size();
+  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
+  base_lg_ = contents_.data[n - 1];
+  uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5);
+  if (last_word > n - 5) return;
+  data_ = contents_.data.data();
+  offset_ = data_ + last_word;
+  num_ = (n - 5 - last_word) / 4;
+}
+
+bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key,
+                                              uint64_t block_offset) {
+  assert(block_offset != kNotValid);
+  if (!whole_key_filtering_) {
+    return true;
+  }
+  return MayMatch(key, block_offset);
+}
+
+bool BlockBasedFilterBlockReader::PrefixMayMatch(const Slice& prefix,
+                                                 uint64_t block_offset) {
+  assert(block_offset != kNotValid);
+  if (!prefix_extractor_) {
+    return true;
+  }
+  return MayMatch(prefix, block_offset);
+}
+
+bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
+                                           uint64_t block_offset) {
+  uint64_t index = block_offset >> base_lg_;
+  if (index < num_) {
+    uint32_t start = DecodeFixed32(offset_ + index * 4);
+    uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
+    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
+      Slice filter = Slice(data_ + start, limit - start);
+      return policy_->KeyMayMatch(entry, filter);
+    } else if (start == limit) {
+      // Empty filters do not match any entries
+      return false;
+    }
+  }
+  return true;  // Errors are treated as potential matches
+}
+
+size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
+  return num_ * 4 + 5 + (offset_ - data_);
+}
+
+std::string BlockBasedFilterBlockReader::ToString() const {
+  std::string result, filter_meta;
+  result.reserve(1024);
+
+  std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks");
+  AppendItem(&result, s_fb, rocksdb::ToString(num_));
+  AppendItem(&result, s_bo, s_hd);
+
+  for (size_t index = 0; index < num_; index++) {
+    uint32_t start = DecodeFixed32(offset_ + index * 4);
+    uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
+
+    if (start != limit) {
+      result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n");
+      Slice filter = Slice(data_ + start, limit - start);
+      AppendItem(&result, start, filter.ToString(true));
+    }
+  }
+  return result;
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/block_based_filter_block.h b/src/rocksdb/table/block_based_filter_block.h
new file mode 100644
index 0000000..d339ac6
--- /dev/null
+++ b/src/rocksdb/table/block_based_filter_block.h
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file.  It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/filter_block.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+
+// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp:
+//      (StartBlock Add*)* Finish
+class BlockBasedFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+  BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor,
+      const BlockBasedTableOptions& table_opt);
+
+  virtual bool IsBlockBased() override { return true; }
+  virtual void StartBlock(uint64_t block_offset) override;
+  virtual void Add(const Slice& key) override;
+  virtual Slice Finish() override;
+
+ private:
+  void AddKey(const Slice& key);
+  void AddPrefix(const Slice& key);
+  void GenerateFilter();
+
+  // important: all of these might point to invalid addresses
+  // at the time of destruction of this filter block. destructor
+  // should NOT dereference them.
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+
+  std::string entries_;             // Flattened entry contents
+  std::vector<size_t> start_;       // Starting index in entries_ of each entry
+  uint32_t added_to_start_;         // To indicate if key is added
+  std::string result_;              // Filter data computed so far
+  std::vector<Slice> tmp_entries_;  // policy_->CreateFilter() argument
+  std::vector<uint32_t> filter_offsets_;
+
+  // No copying allowed
+  BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&);
+  void operator=(const BlockBasedFilterBlockBuilder&);
+};
+
+// A FilterBlockReader is used to parse filter from SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking
+class BlockBasedFilterBlockReader : public FilterBlockReader {
+ public:
+  // REQUIRES: "contents" and *policy must stay live while *this is live.
+  BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor,
+                              const BlockBasedTableOptions& table_opt,
+                              bool whole_key_filtering,
+                              BlockContents&& contents);
+  virtual bool IsBlockBased() override { return true; }
+  virtual bool KeyMayMatch(const Slice& key,
+                           uint64_t block_offset = kNotValid) override;
+  virtual bool PrefixMayMatch(const Slice& prefix,
+                              uint64_t block_offset = kNotValid) override;
+  virtual size_t ApproximateMemoryUsage() const override;
+
+  // convert this object to a human readable form
+  std::string ToString() const override;
+
+ private:
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const char* data_;    // Pointer to filter data (at block-start)
+  const char* offset_;  // Pointer to beginning of offset array (at block-end)
+  size_t num_;          // Number of entries in offset array
+  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
+  BlockContents contents_;
+
+  bool MayMatch(const Slice& entry, uint64_t block_offset);
+
+  // No copying allowed
+  BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&);
+  void operator=(const BlockBasedFilterBlockReader&);
+};
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/block_based_filter_block_test.cc b/src/rocksdb/table/block_based_filter_block_test.cc
new file mode 100644
index 0000000..017de59
--- /dev/null
+++ b/src/rocksdb/table/block_based_filter_block_test.cc
@@ -0,0 +1,248 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based_filter_block.h"
+
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// For testing: emit an array with one hash value per key
+class TestHashFilter : public FilterPolicy {
+ public:
+  virtual const char* Name() const override { return "TestHashFilter"; }
+
+  virtual void CreateFilter(const Slice* keys, int n,
+                            std::string* dst) const override {
+    for (int i = 0; i < n; i++) {
+      uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+      PutFixed32(dst, h);
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key,
+                           const Slice& filter) const override {
+    uint32_t h = Hash(key.data(), key.size(), 1);
+    for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+      if (h == DecodeFixed32(filter.data() + i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
+class FilterBlockTest : public testing::Test {
+ public:
+  TestHashFilter policy_;
+  BlockBasedTableOptions table_options_;
+
+  FilterBlockTest() {
+    table_options_.filter_policy.reset(new TestHashFilter());
+  }
+};
+
+TEST_F(FilterBlockTest, EmptyBuilder) {
+  BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+  BlockContents block(builder.Finish(), false, kNoCompression);
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
+  BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
+                                     std::move(block));
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 100000));
+}
+
+TEST_F(FilterBlockTest, SingleChunk) {
+  BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+  builder.StartBlock(100);
+  builder.Add("foo");
+  builder.Add("bar");
+  builder.Add("box");
+  builder.StartBlock(200);
+  builder.Add("box");
+  builder.StartBlock(300);
+  builder.Add("hello");
+  BlockContents block(builder.Finish(), false, kNoCompression);
+  BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
+                                     std::move(block));
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("bar", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("box", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("hello", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 100));
+  ASSERT_TRUE(!reader.KeyMayMatch("missing", 100));
+  ASSERT_TRUE(!reader.KeyMayMatch("other", 100));
+}
+
+TEST_F(FilterBlockTest, MultiChunk) {
+  BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+
+  // First filter
+  builder.StartBlock(0);
+  builder.Add("foo");
+  builder.StartBlock(2000);
+  builder.Add("bar");
+
+  // Second filter
+  builder.StartBlock(3100);
+  builder.Add("box");
+
+  // Third filter is empty
+
+  // Last filter
+  builder.StartBlock(9000);
+  builder.Add("box");
+  builder.Add("hello");
+
+  BlockContents block(builder.Finish(), false, kNoCompression);
+  BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
+                                     std::move(block));
+
+  // Check first filter
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader.KeyMayMatch("bar", 2000));
+  ASSERT_TRUE(!reader.KeyMayMatch("box", 0));
+  ASSERT_TRUE(!reader.KeyMayMatch("hello", 0));
+
+  // Check second filter
+  ASSERT_TRUE(reader.KeyMayMatch("box", 3100));
+  ASSERT_TRUE(!reader.KeyMayMatch("foo", 3100));
+  ASSERT_TRUE(!reader.KeyMayMatch("bar", 3100));
+  ASSERT_TRUE(!reader.KeyMayMatch("hello", 3100));
+
+  // Check third filter (empty)
+  ASSERT_TRUE(!reader.KeyMayMatch("foo", 4100));
+  ASSERT_TRUE(!reader.KeyMayMatch("bar", 4100));
+  ASSERT_TRUE(!reader.KeyMayMatch("box", 4100));
+  ASSERT_TRUE(!reader.KeyMayMatch("hello", 4100));
+
+  // Check last filter
+  ASSERT_TRUE(reader.KeyMayMatch("box", 9000));
+  ASSERT_TRUE(reader.KeyMayMatch("hello", 9000));
+  ASSERT_TRUE(!reader.KeyMayMatch("foo", 9000));
+  ASSERT_TRUE(!reader.KeyMayMatch("bar", 9000));
+}
+
+// Test for block based filter block
+// use new interface in FilterPolicy to create filter builder/reader
+class BlockBasedFilterBlockTest : public testing::Test {
+ public:
+  BlockBasedTableOptions table_options_;
+
+  BlockBasedFilterBlockTest() {
+    table_options_.filter_policy.reset(NewBloomFilterPolicy(10));
+  }
+
+  ~BlockBasedFilterBlockTest() {}
+};
+
+TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) {
+  FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder(
+      nullptr, table_options_);
+  BlockContents block(builder->Finish(), false, kNoCompression);
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
+  FilterBlockReader* reader = new BlockBasedFilterBlockReader(
+      nullptr, table_options_, true, std::move(block));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 100000));
+
+  delete builder;
+  delete reader;
+}
+
+TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) {
+  FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder(
+      nullptr, table_options_);
+  builder->StartBlock(100);
+  builder->Add("foo");
+  builder->Add("bar");
+  builder->Add("box");
+  builder->StartBlock(200);
+  builder->Add("box");
+  builder->StartBlock(300);
+  builder->Add("hello");
+  BlockContents block(builder->Finish(), false, kNoCompression);
+  FilterBlockReader* reader = new BlockBasedFilterBlockReader(
+      nullptr, table_options_, true, std::move(block));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("bar", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("box", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("hello", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 100));
+  ASSERT_TRUE(!reader->KeyMayMatch("missing", 100));
+  ASSERT_TRUE(!reader->KeyMayMatch("other", 100));
+
+  delete builder;
+  delete reader;
+}
+
+TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) {
+  FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder(
+      nullptr, table_options_);
+
+  // First filter
+  builder->StartBlock(0);
+  builder->Add("foo");
+  builder->StartBlock(2000);
+  builder->Add("bar");
+
+  // Second filter
+  builder->StartBlock(3100);
+  builder->Add("box");
+
+  // Third filter is empty
+
+  // Last filter
+  builder->StartBlock(9000);
+  builder->Add("box");
+  builder->Add("hello");
+
+  BlockContents block(builder->Finish(), false, kNoCompression);
+  FilterBlockReader* reader = new BlockBasedFilterBlockReader(
+      nullptr, table_options_, true, std::move(block));
+
+  // Check first filter
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader->KeyMayMatch("bar", 2000));
+  ASSERT_TRUE(!reader->KeyMayMatch("box", 0));
+  ASSERT_TRUE(!reader->KeyMayMatch("hello", 0));
+
+  // Check second filter
+  ASSERT_TRUE(reader->KeyMayMatch("box", 3100));
+  ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100));
+  ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100));
+  ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100));
+
+  // Check third filter (empty)
+  ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100));
+  ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100));
+  ASSERT_TRUE(!reader->KeyMayMatch("box", 4100));
+  ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100));
+
+  // Check last filter
+  ASSERT_TRUE(reader->KeyMayMatch("box", 9000));
+  ASSERT_TRUE(reader->KeyMayMatch("hello", 9000));
+  ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000));
+  ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000));
+
+  delete builder;
+  delete reader;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based_table_builder.cc b/src/rocksdb/table/block_based_table_builder.cc
index c6469a2..201f128 100644
--- a/src/rocksdb/table/block_based_table_builder.cc
+++ b/src/rocksdb/table/block_based_table_builder.cc
@@ -15,6 +15,9 @@
 
 #include <map>
 #include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
 
 #include "db/dbformat.h"
 
@@ -23,25 +26,30 @@
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/flush_block_policy.h"
-#include "rocksdb/options.h"
 #include "rocksdb/table.h"
 
 #include "table/block.h"
 #include "table/block_based_table_reader.h"
 #include "table/block_builder.h"
 #include "table/filter_block.h"
+#include "table/block_based_filter_block.h"
+#include "table/block_based_table_factory.h"
+#include "table/full_filter_block.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/table_builder.h"
 
+#include "util/string_util.h"
 #include "util/coding.h"
+#include "util/compression.h"
 #include "util/crc32c.h"
 #include "util/stop_watch.h"
 #include "util/xxhash.h"
 
 namespace rocksdb {
 
-namespace {
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
 
 typedef BlockBasedTableOptions::IndexType IndexType;
 
@@ -57,6 +65,14 @@ typedef BlockBasedTableOptions::IndexType IndexType;
 // design that just works.
 class IndexBuilder {
  public:
+  // Index builder will construct a set of blocks which contain:
+  //  1. One primary index block.
+  //  2. (Optional) a set of metablocks that contains the metadata of the
+  //     primary index.
+  struct IndexBlocks {
+    Slice index_block_contents;
+    std::unordered_map<std::string, Slice> meta_blocks;
+  };
   explicit IndexBuilder(const Comparator* comparator)
       : comparator_(comparator) {}
 
@@ -72,15 +88,19 @@ class IndexBuilder {
   //                           the last one in the table
   //
   // REQUIRES: Finish() has not yet been called.
-  virtual void AddEntry(std::string* last_key_in_current_block,
-                        const Slice* first_key_in_next_block,
-                        const BlockHandle& block_handle) = 0;
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) = 0;
+
+  // This method will be called whenever a key is added. The subclasses may
+  // override OnKeyAdded() if they need to collect additional information.
+  virtual void OnKeyAdded(const Slice& key) {}
 
   // Inform the index builder that all entries has been written. Block builder
   // may therefore perform any operation required for block finalization.
   //
   // REQUIRES: Finish() has not yet been called.
-  virtual Slice Finish() = 0;
+  virtual Status Finish(IndexBlocks* index_blocks) = 0;
 
   // Get the estimated size for index block.
   virtual size_t EstimatedSize() const = 0;
@@ -101,11 +121,11 @@ class ShortenedIndexBuilder : public IndexBuilder {
  public:
   explicit ShortenedIndexBuilder(const Comparator* comparator)
       : IndexBuilder(comparator),
-        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+        index_block_builder_(1 /* block_restart_interval == 1 */) {}
 
-  virtual void AddEntry(std::string* last_key_in_current_block,
-                        const Slice* first_key_in_next_block,
-                        const BlockHandle& block_handle) override {
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override {
     if (first_key_in_next_block != nullptr) {
       comparator_->FindShortestSeparator(last_key_in_current_block,
                                          *first_key_in_next_block);
@@ -118,9 +138,12 @@ class ShortenedIndexBuilder : public IndexBuilder {
     index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
   }
 
-  virtual Slice Finish() override { return index_block_builder_.Finish(); }
+  virtual Status Finish(IndexBlocks* index_blocks) override {
+    index_blocks->index_block_contents = index_block_builder_.Finish();
+    return Status::OK();
+  }
 
-  virtual size_t EstimatedSize() const {
+  virtual size_t EstimatedSize() const override {
     return index_block_builder_.CurrentSizeEstimate();
   }
 
@@ -128,38 +151,129 @@ class ShortenedIndexBuilder : public IndexBuilder {
   BlockBuilder index_block_builder_;
 };
 
-// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
-// ShortenedIndexBuilder, but preserves the full key instead the substitude key.
-class FullKeyIndexBuilder : public IndexBuilder {
+// HashIndexBuilder contains a binary-searchable primary index and the
+// metadata for secondary hash index construction.
+// The metadata for hash index consists two parts:
+//  - a metablock that compactly contains a sequence of prefixes. All prefixes
+//    are stored consectively without any metadata (like, prefix sizes) being
+//    stored, which is kept in the other metablock.
+//  - a metablock contains the metadata of the prefixes, including prefix size,
+//    restart index and number of block it spans. The format looks like:
+//
+// +-----------------+---------------------------+---------------------+ <=prefix 1
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+ <=prefix 2
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+// |                                                                   |
+// | ....                                                              |
+// |                                                                   |
+// +-----------------+---------------------------+---------------------+ <=prefix n
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+//
+// The reason of separating these two metablocks is to enable the efficiently
+// reuse the first metablock during hash index construction without unnecessary
+// data copy or small heap allocations for prefixes.
+class HashIndexBuilder : public IndexBuilder {
  public:
-  explicit FullKeyIndexBuilder(const Comparator* comparator)
+  explicit HashIndexBuilder(const Comparator* comparator,
+                            const SliceTransform* hash_key_extractor)
       : IndexBuilder(comparator),
-        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+        primary_index_builder_(comparator),
+        hash_key_extractor_(hash_key_extractor) {}
+
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override {
+    ++current_restart_index_;
+    primary_index_builder_.AddIndexEntry(last_key_in_current_block,
+                                        first_key_in_next_block, block_handle);
+  }
 
-  virtual void AddEntry(std::string* last_key_in_current_block,
-                        const Slice* first_key_in_next_block,
-                        const BlockHandle& block_handle) override {
-    std::string handle_encoding;
-    block_handle.EncodeTo(&handle_encoding);
-    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+  virtual void OnKeyAdded(const Slice& key) override {
+    auto key_prefix = hash_key_extractor_->Transform(key);
+    bool is_first_entry = pending_block_num_ == 0;
+
+    // Keys may share the prefix
+    if (is_first_entry || pending_entry_prefix_ != key_prefix) {
+      if (!is_first_entry) {
+        FlushPendingPrefix();
+      }
+
+      // need a hard copy otherwise the underlying data changes all the time.
+      // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
+      // copy.
+      pending_entry_prefix_ = key_prefix.ToString();
+      pending_block_num_ = 1;
+      pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
+    } else {
+      // entry number increments when keys share the prefix reside in
+      // different data blocks.
+      auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
+      assert(last_restart_index <= current_restart_index_);
+      if (last_restart_index != current_restart_index_) {
+        ++pending_block_num_;
+      }
+    }
   }
 
-  virtual Slice Finish() override { return index_block_builder_.Finish(); }
+  virtual Status Finish(IndexBlocks* index_blocks) override {
+    FlushPendingPrefix();
+    primary_index_builder_.Finish(index_blocks);
+    index_blocks->meta_blocks.insert(
+        {kHashIndexPrefixesBlock.c_str(), prefix_block_});
+    index_blocks->meta_blocks.insert(
+        {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
+    return Status::OK();
+  }
 
-  virtual size_t EstimatedSize() const {
-    return index_block_builder_.CurrentSizeEstimate();
+  virtual size_t EstimatedSize() const override {
+    return primary_index_builder_.EstimatedSize() + prefix_block_.size() +
+           prefix_meta_block_.size();
   }
 
  private:
-  BlockBuilder index_block_builder_;
+  void FlushPendingPrefix() {
+    prefix_block_.append(pending_entry_prefix_.data(),
+                         pending_entry_prefix_.size());
+    PutVarint32(&prefix_meta_block_,
+                static_cast<uint32_t>(pending_entry_prefix_.size()));
+    PutVarint32(&prefix_meta_block_, pending_entry_index_);
+    PutVarint32(&prefix_meta_block_, pending_block_num_);
+  }
+
+  ShortenedIndexBuilder primary_index_builder_;
+  const SliceTransform* hash_key_extractor_;
+
+  // stores a sequence of prefixes
+  std::string prefix_block_;
+  // stores the metadata of prefixes
+  std::string prefix_meta_block_;
+
+  // The following 3 variables keeps unflushed prefix and its metadata.
+  // The details of block_num and entry_index can be found in
+  // "block_hash_index.{h,cc}"
+  uint32_t pending_block_num_ = 0;
+  uint32_t pending_entry_index_ = 0;
+  std::string pending_entry_prefix_;
+
+  uint64_t current_restart_index_ = 0;
 };
 
+// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
+namespace {
+
 // Create a index builder based on its type.
-IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
+IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator,
+                                 const SliceTransform* prefix_extractor) {
   switch (type) {
     case BlockBasedTableOptions::kBinarySearch: {
       return new ShortenedIndexBuilder(comparator);
     }
+    case BlockBasedTableOptions::kHashSearch: {
+      return new HashIndexBuilder(comparator, prefix_extractor);
+    }
     default: {
       assert(!"Do not recognize the index type ");
       return nullptr;
@@ -170,14 +284,32 @@ IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
   return nullptr;
 }
 
+// Create a index builder based on its type.
+FilterBlockBuilder* CreateFilterBlockBuilder(const ImmutableCFOptions& opt,
+    const BlockBasedTableOptions& table_opt) {
+  if (table_opt.filter_policy == nullptr) return nullptr;
+
+  FilterBitsBuilder* filter_bits_builder =
+      table_opt.filter_policy->GetFilterBitsBuilder();
+  if (filter_bits_builder == nullptr) {
+    return new BlockBasedFilterBlockBuilder(opt.prefix_extractor, table_opt);
+  } else {
+    return new FullFilterBlockBuilder(opt.prefix_extractor,
+                                      table_opt.whole_key_filtering,
+                                      filter_bits_builder);
+  }
+}
+
 bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
   // Check to see if compressed less than 12.5%
   return compressed_size < raw_size - (raw_size / 8u);
 }
 
+// format_version is the block format as defined in include/rocksdb/table.h
 Slice CompressBlock(const Slice& raw,
                     const CompressionOptions& compression_options,
-                    CompressionType* type, std::string* compressed_output) {
+                    CompressionType* type, uint32_t format_version,
+                    std::string* compressed_output) {
   if (*type == kNoCompression) {
     return raw;
   }
@@ -186,36 +318,44 @@ Slice CompressBlock(const Slice& raw,
   // supported in this platform and (2) the compression rate is "good enough".
   switch (*type) {
     case kSnappyCompression:
-      if (port::Snappy_Compress(compression_options, raw.data(), raw.size(),
-                                compressed_output) &&
+      if (Snappy_Compress(compression_options, raw.data(), raw.size(),
+                          compressed_output) &&
           GoodCompressionRatio(compressed_output->size(), raw.size())) {
         return *compressed_output;
       }
       break;  // fall back to no compression.
     case kZlibCompression:
-      if (port::Zlib_Compress(compression_options, raw.data(), raw.size(),
-                              compressed_output) &&
+      if (Zlib_Compress(
+              compression_options,
+              GetCompressFormatForVersion(kZlibCompression, format_version),
+              raw.data(), raw.size(), compressed_output) &&
           GoodCompressionRatio(compressed_output->size(), raw.size())) {
         return *compressed_output;
       }
       break;  // fall back to no compression.
     case kBZip2Compression:
-      if (port::BZip2_Compress(compression_options, raw.data(), raw.size(),
-                               compressed_output) &&
+      if (BZip2_Compress(
+              compression_options,
+              GetCompressFormatForVersion(kBZip2Compression, format_version),
+              raw.data(), raw.size(), compressed_output) &&
           GoodCompressionRatio(compressed_output->size(), raw.size())) {
         return *compressed_output;
       }
       break;  // fall back to no compression.
     case kLZ4Compression:
-      if (port::LZ4_Compress(compression_options, raw.data(), raw.size(),
-                             compressed_output) &&
+      if (LZ4_Compress(
+              compression_options,
+              GetCompressFormatForVersion(kLZ4Compression, format_version),
+              raw.data(), raw.size(), compressed_output) &&
           GoodCompressionRatio(compressed_output->size(), raw.size())) {
         return *compressed_output;
       }
       break;  // fall back to no compression.
     case kLZ4HCCompression:
-      if (port::LZ4HC_Compress(compression_options, raw.data(), raw.size(),
-                               compressed_output) &&
+      if (LZ4HC_Compress(
+              compression_options,
+              GetCompressFormatForVersion(kLZ4HCCompression, format_version),
+              raw.data(), raw.size(), compressed_output) &&
           GoodCompressionRatio(compressed_output->size(), raw.size())) {
         return *compressed_output;
       }
@@ -229,7 +369,7 @@ Slice CompressBlock(const Slice& raw,
   return raw;
 }
 
-}  // anonymous namespace
+}  // namespace
 
 // kBlockBasedTableMagicNumber was picked by running
 //    echo rocksdb.table.block_based | sha1sum
@@ -244,59 +384,71 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
 // A collector that collects properties of interest to block-based table.
 // For now this class looks heavy-weight since we only write one additional
 // property.
-// But in the forseeable future, we will add more and more properties that are
+// But in the foreseeable future, we will add more and more properties that are
 // specific to block-based table.
 class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
-    : public TablePropertiesCollector {
+    : public IntTblPropCollector {
  public:
-  BlockBasedTablePropertiesCollector(
-      BlockBasedTableOptions::IndexType index_type)
-      : index_type_(index_type) {}
-
-  virtual Status Add(const Slice& key, const Slice& value) {
+  explicit BlockBasedTablePropertiesCollector(
+      BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
+      bool prefix_filtering)
+      : index_type_(index_type),
+        whole_key_filtering_(whole_key_filtering),
+        prefix_filtering_(prefix_filtering) {}
+
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) override {
     // Intentionally left blank. Have no interest in collecting stats for
     // individual key/value pairs.
     return Status::OK();
   }
 
-  virtual Status Finish(UserCollectedProperties* properties) {
+  virtual Status Finish(UserCollectedProperties* properties) override {
     std::string val;
     PutFixed32(&val, static_cast<uint32_t>(index_type_));
     properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
-
+    properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering,
+                        whole_key_filtering_ ? kPropTrue : kPropFalse});
+    properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
+                        prefix_filtering_ ? kPropTrue : kPropFalse});
     return Status::OK();
   }
 
   // The name of the properties collector can be used for debugging purpose.
-  virtual const char* Name() const {
+  virtual const char* Name() const override {
     return "BlockBasedTablePropertiesCollector";
   }
 
-  virtual UserCollectedProperties GetReadableProperties() const {
+  virtual UserCollectedProperties GetReadableProperties() const override {
     // Intentionally left blank.
     return UserCollectedProperties();
   }
 
  private:
   BlockBasedTableOptions::IndexType index_type_;
+  bool whole_key_filtering_;
+  bool prefix_filtering_;
 };
 
 struct BlockBasedTableBuilder::Rep {
-  Options options;
+  const ImmutableCFOptions ioptions;
+  const BlockBasedTableOptions table_options;
   const InternalKeyComparator& internal_comparator;
   WritableFile* file;
   uint64_t offset = 0;
   Status status;
   BlockBuilder data_block;
+
+  InternalKeySliceTransform internal_prefix_transform;
   std::unique_ptr<IndexBuilder> index_builder;
 
   std::string last_key;
-  CompressionType compression_type;
-  ChecksumType checksum_type;
+  const CompressionType compression_type;
+  const CompressionOptions compression_opts;
   TableProperties props;
 
   bool closed = false;  // Either Finish() or Abandon() has been called.
-  FilterBlockBuilder* filter_block;
+  std::unique_ptr<FilterBlockBuilder> filter_block;
   char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize];
   size_t compressed_cache_key_prefix_size;
 
@@ -305,45 +457,71 @@ struct BlockBasedTableBuilder::Rep {
   std::string compressed_output;
   std::unique_ptr<FlushBlockPolicy> flush_block_policy;
 
-  Rep(const Options& opt, const InternalKeyComparator& icomparator,
-      WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
-      CompressionType compression_type, IndexType index_block_type,
-      ChecksumType checksum_type)
-      : options(opt),
+  std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors;
+
+  Rep(const ImmutableCFOptions& _ioptions,
+      const BlockBasedTableOptions& table_opt,
+      const InternalKeyComparator& icomparator,
+      const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+          int_tbl_prop_collector_factories,
+      WritableFile* f, const CompressionType _compression_type,
+      const CompressionOptions& _compression_opts, const bool skip_filters)
+      : ioptions(_ioptions),
+        table_options(table_opt),
         internal_comparator(icomparator),
         file(f),
-        data_block(options, &internal_comparator),
-        index_builder(
-            CreateIndexBuilder(index_block_type, &internal_comparator)),
-        compression_type(compression_type),
-        checksum_type(checksum_type),
-        filter_block(opt.filter_policy == nullptr
-                         ? nullptr
-                         : new FilterBlockBuilder(opt, &internal_comparator)),
-        flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy(
-            options, data_block)) {
-    options.table_properties_collectors.push_back(
-        std::make_shared<BlockBasedTablePropertiesCollector>(index_block_type));
+        data_block(table_options.block_restart_interval),
+        internal_prefix_transform(_ioptions.prefix_extractor),
+        index_builder(CreateIndexBuilder(table_options.index_type,
+                                         &internal_comparator,
+                                         &this->internal_prefix_transform)),
+        compression_type(_compression_type),
+        compression_opts(_compression_opts),
+        filter_block(skip_filters ? nullptr : CreateFilterBlockBuilder(
+                                                  _ioptions, table_options)),
+        flush_block_policy(
+            table_options.flush_block_policy_factory->NewFlushBlockPolicy(
+                table_options, data_block)) {
+    for (auto& collector_factories : *int_tbl_prop_collector_factories) {
+      table_properties_collectors.emplace_back(
+          collector_factories->CreateIntTblPropCollector());
+    }
+    table_properties_collectors.emplace_back(
+        new BlockBasedTablePropertiesCollector(
+            table_options.index_type, table_options.whole_key_filtering,
+            _ioptions.prefix_extractor != nullptr));
   }
 };
 
-// TODO(sdong): Currently only write out binary search index. In
-// BlockBasedTableReader, Hash index will be built using binary search index.
 BlockBasedTableBuilder::BlockBasedTableBuilder(
-    const Options& options, const BlockBasedTableOptions& table_options,
-    const InternalKeyComparator& internal_comparator, WritableFile* file,
-    CompressionType compression_type)
-    : rep_(new Rep(options, internal_comparator, file,
-                   table_options.flush_block_policy_factory.get(),
-                   compression_type,
-                   BlockBasedTableOptions::IndexType::kBinarySearch,
-                   table_options.checksum)) {
+    const ImmutableCFOptions& ioptions,
+    const BlockBasedTableOptions& table_options,
+    const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    WritableFile* file, const CompressionType compression_type,
+    const CompressionOptions& compression_opts, const bool skip_filters) {
+  BlockBasedTableOptions sanitized_table_options(table_options);
+  if (sanitized_table_options.format_version == 0 &&
+      sanitized_table_options.checksum != kCRC32c) {
+    Log(InfoLogLevel::WARN_LEVEL, ioptions.info_log,
+        "Silently converting format_version to 1 because checksum is "
+        "non-default");
+    // silently convert format_version to 1 to keep consistent with current
+    // behavior
+    sanitized_table_options.format_version = 1;
+  }
+
+  rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator,
+                 int_tbl_prop_collector_factories, file, compression_type,
+                 compression_opts, skip_filters);
+
   if (rep_->filter_block != nullptr) {
     rep_->filter_block->StartBlock(0);
   }
-  if (options.block_cache_compressed.get() != nullptr) {
+  if (table_options.block_cache_compressed.get() != nullptr) {
     BlockBasedTable::GenerateCachePrefix(
-        options.block_cache_compressed.get(), file,
+        table_options.block_cache_compressed.get(), file,
         &rep_->compressed_cache_key_prefix[0],
         &rep_->compressed_cache_key_prefix_size);
   }
@@ -351,7 +529,6 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
 
 BlockBasedTableBuilder::~BlockBasedTableBuilder() {
   assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
-  delete rep_->filter_block;
   delete rep_;
 }
 
@@ -377,12 +554,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
     // entries in the first block and < all entries in subsequent
     // blocks.
     if (ok()) {
-      r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
+      r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
     }
   }
 
   if (r->filter_block != nullptr) {
-    r->filter_block->AddKey(key);
+    r->filter_block->Add(ExtractUserKey(key));
   }
 
   r->last_key.assign(key.data(), key.size());
@@ -391,12 +568,10 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
   r->props.raw_key_size += key.size();
   r->props.raw_value_size += value.size();
 
-  NotifyCollectTableCollectorsOnAdd(
-      key,
-      value,
-      r->options.table_properties_collectors,
-      r->options.info_log.get()
-  );
+  r->index_builder->OnKeyAdded(key);
+  NotifyCollectTableCollectorsOnAdd(key, value, r->offset,
+                                    r->table_properties_collectors,
+                                    r->ioptions.info_log);
 }
 
 void BlockBasedTableBuilder::Flush() {
@@ -431,9 +606,16 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
   Rep* r = rep_;
 
   auto type = r->compression_type;
-  auto block_contents =
-      CompressBlock(raw_block_contents, r->options.compression_opts, &type,
-                    &r->compressed_output);
+  Slice block_contents;
+  if (raw_block_contents.size() < kCompressionSizeLimit) {
+    block_contents =
+        CompressBlock(raw_block_contents, r->compression_opts, &type,
+                      r->table_options.format_version, &r->compressed_output);
+  } else {
+    RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED);
+    type = kNoCompression;
+    block_contents = raw_block_contents;
+  }
   WriteRawBlock(block_contents, type, handle);
   r->compressed_output.clear();
 }
@@ -442,8 +624,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
                                            CompressionType type,
                                            BlockHandle* handle) {
   Rep* r = rep_;
-  StopWatch sw(r->options.env, r->options.statistics.get(),
-               WRITE_RAW_BLOCK_MICROS);
+  StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS);
   handle->set_offset(r->offset);
   handle->set_size(block_contents.size());
   r->status = r->file->Append(block_contents);
@@ -451,7 +632,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
     char trailer[kBlockTrailerSize];
     trailer[0] = type;
     char* trailer_without_type = trailer + 1;
-    switch (r->checksum_type) {
+    switch (r->table_options.checksum) {
       case kNoChecksum:
         // we don't support no checksum yet
         assert(false);
@@ -464,7 +645,8 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
       }
       case kxxHash: {
         void* xxh = XXH32_init(0);
-        XXH32_update(xxh, block_contents.data(), block_contents.size());
+        XXH32_update(xxh, block_contents.data(),
+                     static_cast<uint32_t>(block_contents.size()));
         XXH32_update(xxh, trailer, 1);  // Extend  to cover block type
         EncodeFixed32(trailer_without_type, XXH32_digest(xxh));
         break;
@@ -494,27 +676,23 @@ static void DeleteCachedBlock(const Slice& key, void* value) {
 // Make a copy of the block contents and insert into compressed block cache
 //
 Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
-                                 const CompressionType type,
-                                 const BlockHandle* handle) {
+                                                  const CompressionType type,
+                                                  const BlockHandle* handle) {
   Rep* r = rep_;
-  Cache* block_cache_compressed = r->options.block_cache_compressed.get();
+  Cache* block_cache_compressed = r->table_options.block_cache_compressed.get();
 
   if (type != kNoCompression && block_cache_compressed != nullptr) {
 
     Cache::Handle* cache_handle = nullptr;
     size_t size = block_contents.size();
 
-    char* ubuf = new char[size];             // make a new copy
-    memcpy(ubuf, block_contents.data(), size);
+    std::unique_ptr<char[]> ubuf(new char[size + 1]);
+    memcpy(ubuf.get(), block_contents.data(), size);
+    ubuf[size] = type;
 
-    BlockContents results;
-    Slice sl(ubuf, size);
-    results.data = sl;
-    results.cachable = true; // XXX
-    results.heap_allocated = true;
-    results.compression_type = type;
+    BlockContents results(std::move(ubuf), size, true, type);
 
-    Block* block = new Block(results);
+    Block* block = new Block(std::move(results));
 
     // make cache key by appending the file offset to the cache prefix id
     char* end = EncodeVarint64(
@@ -530,7 +708,7 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
     block_cache_compressed->Release(cache_handle);
 
     // Invalidate OS cache.
-    r->file->InvalidateCache(r->offset, size);
+    r->file->InvalidateCache(static_cast<size_t>(r->offset), size);
   }
   return Status::OK();
 }
@@ -542,10 +720,7 @@ Status BlockBasedTableBuilder::Finish() {
   assert(!r->closed);
   r->closed = true;
 
-  BlockHandle filter_block_handle,
-              metaindex_block_handle,
-              index_block_handle;
-
+  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
   // Write filter block
   if (ok() && r->filter_block != nullptr) {
     auto filter_contents = r->filter_block->Finish();
@@ -557,32 +732,48 @@ Status BlockBasedTableBuilder::Finish() {
   // block, we will finish writing all index entries here and flush them
   // to storage after metaindex block is written.
   if (ok() && !empty_data_block) {
-    r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
-                               r->pending_handle);
+    r->index_builder->AddIndexEntry(
+        &r->last_key, nullptr /* no next data block */, r->pending_handle);
+  }
+
+  IndexBuilder::IndexBlocks index_blocks;
+  auto s = r->index_builder->Finish(&index_blocks);
+  if (!s.ok()) {
+    return s;
   }
 
   // Write meta blocks and metaindex block with the following order.
   //    1. [meta block: filter]
-  //    2. [meta block: properties]
-  //    3. [metaindex block]
-  if (ok()) {
-    MetaIndexBuilder meta_index_builer;
+  //    2. [other meta blocks]
+  //    3. [meta block: properties]
+  //    4. [metaindex block]
+  // write meta blocks
+  MetaIndexBuilder meta_index_builder;
+  for (const auto& item : index_blocks.meta_blocks) {
+    BlockHandle block_handle;
+    WriteBlock(item.second, &block_handle);
+    meta_index_builder.Add(item.first, block_handle);
+  }
 
-    // Write filter block.
+  if (ok()) {
     if (r->filter_block != nullptr) {
       // Add mapping from "<filter_block_prefix>.Name" to location
       // of filter data.
-      std::string key = BlockBasedTable::kFilterBlockPrefix;
-      key.append(r->options.filter_policy->Name());
-      meta_index_builer.Add(key, filter_block_handle);
+      std::string key;
+      if (r->filter_block->IsBlockBased()) {
+        key = BlockBasedTable::kFilterBlockPrefix;
+      } else {
+        key = BlockBasedTable::kFullFilterBlockPrefix;
+      }
+      key.append(r->table_options.filter_policy->Name());
+      meta_index_builder.Add(key, filter_block_handle);
     }
 
     // Write properties block.
     {
       PropertyBlockBuilder property_block_builder;
-      std::vector<std::string> failed_user_prop_collectors;
-      r->props.filter_policy_name = r->options.filter_policy != nullptr ?
-          r->options.filter_policy->Name() : "";
+      r->props.filter_policy_name = r->table_options.filter_policy != nullptr ?
+          r->table_options.filter_policy->Name() : "";
       r->props.index_size =
           r->index_builder->EstimatedSize() + kBlockTrailerSize;
 
@@ -590,11 +781,9 @@ Status BlockBasedTableBuilder::Finish() {
       property_block_builder.AddTableProperty(r->props);
 
       // Add use collected properties
-      NotifyCollectTableCollectorsOnFinish(
-          r->options.table_properties_collectors,
-          r->options.info_log.get(),
-          &property_block_builder
-      );
+      NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors,
+                                           r->ioptions.info_log,
+                                           &property_block_builder);
 
       BlockHandle properties_block_handle;
       WriteRawBlock(
@@ -603,20 +792,16 @@ Status BlockBasedTableBuilder::Finish() {
           &properties_block_handle
       );
 
-      meta_index_builer.Add(kPropertiesBlock,
-                            properties_block_handle);
+      meta_index_builder.Add(kPropertiesBlock, properties_block_handle);
     }  // end of properties block writing
-
-    WriteRawBlock(
-        meta_index_builer.Finish(),
-        kNoCompression,
-        &metaindex_block_handle
-    );
-  }  // meta blocks and metaindex block.
+  }    // meta blocks
 
   // Write index block
   if (ok()) {
-    WriteBlock(r->index_builder->Finish(), &index_block_handle);
+    // flush the meta index block
+    WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
+                  &metaindex_block_handle);
+    WriteBlock(index_blocks.index_block_contents, &index_block_handle);
   }
 
   // Write footer
@@ -628,12 +813,16 @@ Status BlockBasedTableBuilder::Finish() {
     // TODO(icanadi) at some point in the future, when we're absolutely sure
     // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
     // number and always write new table files with new magic number
-    bool legacy = (r->checksum_type == kCRC32c);
+    bool legacy = (r->table_options.format_version == 0);
+    // this is guaranteed by BlockBasedTableBuilder's constructor
+    assert(r->table_options.checksum == kCRC32c ||
+           r->table_options.format_version != 0);
     Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber
-                         : kBlockBasedTableMagicNumber);
+                         : kBlockBasedTableMagicNumber,
+                  r->table_options.format_version);
     footer.set_metaindex_handle(metaindex_block_handle);
     footer.set_index_handle(index_block_handle);
-    footer.set_checksum(r->checksum_type);
+    footer.set_checksum(r->table_options.checksum);
     std::string footer_encoding;
     footer.EncodeTo(&footer_encoding);
     r->status = r->file->Append(footer_encoding);
@@ -642,30 +831,6 @@ Status BlockBasedTableBuilder::Finish() {
     }
   }
 
-  // Print out the table stats
-  if (ok()) {
-    // user collected properties
-    std::string user_collected;
-    user_collected.reserve(1024);
-    for (auto collector : r->options.table_properties_collectors) {
-      for (const auto& prop : collector->GetReadableProperties()) {
-        user_collected.append(prop.first);
-        user_collected.append("=");
-        user_collected.append(prop.second);
-        user_collected.append("; ");
-      }
-    }
-
-    Log(
-        r->options.info_log,
-        "Table was constructed:\n"
-        "  [basic properties]: %s\n"
-        "  [user collected properties]: %s",
-        r->props.ToString().c_str(),
-        user_collected.c_str()
-    );
-  }
-
   return r->status;
 }
 
@@ -683,7 +848,16 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
   return rep_->offset;
 }
 
-const std::string BlockBasedTable::kFilterBlockPrefix =
-    "filter.";
+TableProperties BlockBasedTableBuilder::GetTableProperties() const {
+  TableProperties ret = rep_->props;
+  for (const auto& collector : rep_->table_properties_collectors) {
+    for (const auto& prop : collector->GetReadableProperties()) {
+      ret.user_collected_properties.insert(prop);
+    }
+  }
+  return ret;
+}
 
+const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
+const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter.";
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block_based_table_builder.h b/src/rocksdb/table/block_based_table_builder.h
index 5871427..716a4e9 100644
--- a/src/rocksdb/table/block_based_table_builder.h
+++ b/src/rocksdb/table/block_based_table_builder.h
@@ -9,6 +9,10 @@
 
 #pragma once
 #include <stdint.h>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
 
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/options.h"
@@ -27,10 +31,14 @@ class BlockBasedTableBuilder : public TableBuilder {
   // Create a builder that will store the contents of the table it is
   // building in *file.  Does not close the file.  It is up to the
   // caller to close the file after calling Finish().
-  BlockBasedTableBuilder(const Options& options,
-                         const BlockBasedTableOptions& table_options,
-                         const InternalKeyComparator& internal_comparator,
-                         WritableFile* file, CompressionType compression_type);
+  BlockBasedTableBuilder(
+      const ImmutableCFOptions& ioptions,
+      const BlockBasedTableOptions& table_options,
+      const InternalKeyComparator& internal_comparator,
+      const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+          int_tbl_prop_collector_factories,
+      WritableFile* file, const CompressionType compression_type,
+      const CompressionOptions& compression_opts, const bool skip_filters);
 
   // REQUIRES: Either Finish() or Abandon() has been called.
   ~BlockBasedTableBuilder();
@@ -62,6 +70,9 @@ class BlockBasedTableBuilder : public TableBuilder {
   // Finish() call, returns the size of the final generated file.
   uint64_t FileSize() const override;
 
+  // Get table properties
+  TableProperties GetTableProperties() const override;
+
  private:
   bool ok() const { return status().ok(); }
   // Call block's Finish() method and then write the finalize block contents to
@@ -74,6 +85,7 @@ class BlockBasedTableBuilder : public TableBuilder {
                             const CompressionType type,
                             const BlockHandle* handle);
   struct Rep;
+  class BlockBasedTablePropertiesCollectorFactory;
   class BlockBasedTablePropertiesCollector;
   Rep* rep_;
 
@@ -83,6 +95,10 @@ class BlockBasedTableBuilder : public TableBuilder {
   // REQUIRES: Finish(), Abandon() have not been called
   void Flush();
 
+  // Some compression libraries fail when the raw size is bigger than int. If
+  // uncompressed size is bigger than kCompressionSizeLimit, don't compress it
+  const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max();
+
   // No copying allowed
   BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete;
   void operator=(const BlockBasedTableBuilder&) = delete;
diff --git a/src/rocksdb/table/block_based_table_factory.cc b/src/rocksdb/table/block_based_table_factory.cc
index 822adee..f87660c 100644
--- a/src/rocksdb/table/block_based_table_factory.cc
+++ b/src/rocksdb/table/block_based_table_factory.cc
@@ -14,10 +14,12 @@
 #include <string>
 #include <stdint.h>
 
+#include "port/port.h"
 #include "rocksdb/flush_block_policy.h"
+#include "rocksdb/cache.h"
 #include "table/block_based_table_builder.h"
 #include "table/block_based_table_reader.h"
-#include "port/port.h"
+#include "table/format.h"
 
 namespace rocksdb {
 
@@ -28,27 +30,128 @@ BlockBasedTableFactory::BlockBasedTableFactory(
     table_options_.flush_block_policy_factory.reset(
         new FlushBlockBySizePolicyFactory());
   }
+  if (table_options_.no_block_cache) {
+    table_options_.block_cache.reset();
+  } else if (table_options_.block_cache == nullptr) {
+    table_options_.block_cache = NewLRUCache(8 << 20);
+  }
+  if (table_options_.block_size_deviation < 0 ||
+      table_options_.block_size_deviation > 100) {
+    table_options_.block_size_deviation = 0;
+  }
 }
 
 Status BlockBasedTableFactory::NewTableReader(
-    const Options& options, const EnvOptions& soptions,
+    const ImmutableCFOptions& ioptions, const EnvOptions& soptions,
     const InternalKeyComparator& internal_comparator,
     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-    unique_ptr<TableReader>* table_reader) const {
-  return BlockBasedTable::Open(options, soptions, table_options_,
+    unique_ptr<TableReader>* table_reader, const bool prefetch_enabled) const {
+  return BlockBasedTable::Open(ioptions, soptions, table_options_,
                                internal_comparator, std::move(file), file_size,
-                               table_reader);
+                               table_reader, prefetch_enabled);
 }
 
 TableBuilder* BlockBasedTableFactory::NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_comparator,
-    WritableFile* file, CompressionType compression_type) const {
+    const TableBuilderOptions& table_builder_options,
+    WritableFile* file) const {
   auto table_builder = new BlockBasedTableBuilder(
-      options, table_options_, internal_comparator, file, compression_type);
+      table_builder_options.ioptions, table_options_,
+      table_builder_options.internal_comparator,
+      table_builder_options.int_tbl_prop_collector_factories, file,
+      table_builder_options.compression_type,
+      table_builder_options.compression_opts,
+      table_builder_options.skip_filters);
 
   return table_builder;
 }
 
+Status BlockBasedTableFactory::SanitizeOptions(
+    const DBOptions& db_opts,
+    const ColumnFamilyOptions& cf_opts) const {
+  if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+      cf_opts.prefix_extractor == nullptr) {
+    return Status::InvalidArgument("Hash index is specified for block-based "
+        "table, but prefix_extractor is not given");
+  }
+  if (table_options_.cache_index_and_filter_blocks &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument("Enable cache_index_and_filter_blocks, "
+        ", but block cache is disabled");
+  }
+  if (!BlockBasedTableSupportedVersion(table_options_.format_version)) {
+    return Status::InvalidArgument(
+        "Unsupported BlockBasedTable format_version. Please check "
+        "include/rocksdb/table.h for more info");
+  }
+  return Status::OK();
+}
+
+std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  flush_block_policy_factory: %s (%p)\n",
+           table_options_.flush_block_policy_factory->Name(),
+           table_options_.flush_block_policy_factory.get());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  cache_index_and_filter_blocks: %d\n",
+           table_options_.cache_index_and_filter_blocks);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  index_type: %d\n",
+           table_options_.index_type);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  hash_index_allow_collision: %d\n",
+           table_options_.hash_index_allow_collision);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  checksum: %d\n",
+           table_options_.checksum);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  no_block_cache: %d\n",
+           table_options_.no_block_cache);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  block_cache: %p\n",
+           table_options_.block_cache.get());
+  ret.append(buffer);
+  if (table_options_.block_cache) {
+    snprintf(buffer, kBufferSize, "  block_cache_size: %zd\n",
+             table_options_.block_cache->GetCapacity());
+    ret.append(buffer);
+  }
+  snprintf(buffer, kBufferSize, "  block_cache_compressed: %p\n",
+           table_options_.block_cache_compressed.get());
+  ret.append(buffer);
+  if (table_options_.block_cache_compressed) {
+    snprintf(buffer, kBufferSize, "  block_cache_compressed_size: %zd\n",
+             table_options_.block_cache_compressed->GetCapacity());
+    ret.append(buffer);
+  }
+  snprintf(buffer, kBufferSize, "  block_size: %zd\n",
+           table_options_.block_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  block_size_deviation: %d\n",
+           table_options_.block_size_deviation);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  block_restart_interval: %d\n",
+           table_options_.block_restart_interval);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  filter_policy: %s\n",
+           table_options_.filter_policy == nullptr ?
+             "nullptr" : table_options_.filter_policy->Name());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  whole_key_filtering: %d\n",
+           table_options_.whole_key_filtering);
+  snprintf(buffer, kBufferSize, "  format_version: %d\n",
+           table_options_.format_version);
+  ret.append(buffer);
+  return ret;
+}
+
+const BlockBasedTableOptions& BlockBasedTableFactory::GetTableOptions() const {
+  return table_options_;
+}
+
 TableFactory* NewBlockBasedTableFactory(
     const BlockBasedTableOptions& table_options) {
   return new BlockBasedTableFactory(table_options);
@@ -56,5 +159,14 @@ TableFactory* NewBlockBasedTableFactory(
 
 const std::string BlockBasedTablePropertyNames::kIndexType =
     "rocksdb.block.based.table.index.type";
+const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
+    "rocksdb.block.based.table.whole.key.filtering";
+const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
+    "rocksdb.block.based.table.prefix.filtering";
+const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
+const std::string kHashIndexPrefixesMetadataBlock =
+    "rocksdb.hashindex.metadata";
+const std::string kPropTrue = "1";
+const std::string kPropFalse = "0";
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block_based_table_factory.h b/src/rocksdb/table/block_based_table_factory.h
index 492349c..6394926 100644
--- a/src/rocksdb/table/block_based_table_factory.h
+++ b/src/rocksdb/table/block_based_table_factory.h
@@ -8,16 +8,17 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
-#include <memory>
 #include <stdint.h>
 
+#include <memory>
+#include <string>
+
 #include "rocksdb/flush_block_policy.h"
-#include "rocksdb/options.h"
 #include "rocksdb/table.h"
+#include "db/dbformat.h"
 
 namespace rocksdb {
 
-struct Options;
 struct EnvOptions;
 
 using std::unique_ptr;
@@ -32,17 +33,44 @@ class BlockBasedTableFactory : public TableFactory {
 
   const char* Name() const override { return "BlockBasedTable"; }
 
-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                        const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const override {
+    return NewTableReader(ioptions, soptions, internal_comparator,
+                          std::move(file), file_size, table_reader,
+                          /*prefetch_index_and_filter=*/true);
+  }
+
+  // This is a variant of virtual member function NewTableReader function with
+  // added capability to disable pre-fetching of blocks on BlockBasedTable::Open
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                        const EnvOptions& soptions,
                         const InternalKeyComparator& internal_comparator,
                         unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                        unique_ptr<TableReader>* table_reader) const override;
+                        unique_ptr<TableReader>* table_reader,
+                        bool prefetch_index_and_filter) const;
 
   TableBuilder* NewTableBuilder(
-      const Options& options, const InternalKeyComparator& internal_comparator,
-      WritableFile* file, CompressionType compression_type) const override;
+      const TableBuilderOptions& table_builder_options,
+      WritableFile* file) const override;
+
+  // Sanitizes the specified DB Options.
+  Status SanitizeOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+
+  std::string GetPrintableTableOptions() const override;
+
+  const BlockBasedTableOptions& GetTableOptions() const;
 
  private:
   BlockBasedTableOptions table_options_;
 };
 
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+extern const std::string kPropTrue;
+extern const std::string kPropFalse;
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block_based_table_reader.cc b/src/rocksdb/table/block_based_table_reader.cc
index 9538007..ed7fb0b 100644
--- a/src/rocksdb/table/block_based_table_reader.cc
+++ b/src/rocksdb/table/block_based_table_reader.cc
@@ -26,24 +26,31 @@
 
 #include "table/block.h"
 #include "table/filter_block.h"
+#include "table/block_based_filter_block.h"
+#include "table/block_based_table_factory.h"
+#include "table/full_filter_block.h"
 #include "table/block_hash_index.h"
+#include "table/block_prefix_index.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/two_level_iterator.h"
+#include "table/get_context.h"
 
 #include "util/coding.h"
 #include "util/perf_context_imp.h"
 #include "util/stop_watch.h"
+#include "util/string_util.h"
 
 namespace rocksdb {
 
 extern const uint64_t kBlockBasedTableMagicNumber;
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
 using std::unique_ptr;
 
 typedef BlockBasedTable::IndexReader IndexReader;
 
 namespace {
-
 // The longest the prefix of the cache key used to identify blocks can be.
 // We are using the fact that we know for Posix files the unique ID is three
 // varints.
@@ -53,23 +60,19 @@ const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
 
 // Read the block identified by "handle" from "file".
 // The only relevant option is options.verify_checksums for now.
-// Set *didIO to true if didIO is not null.
 // On failure return non-OK.
 // On success fill *result and return OK - caller owns *result
 Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer,
                          const ReadOptions& options, const BlockHandle& handle,
-                         Block** result, Env* env, bool* didIO = nullptr,
+                         std::unique_ptr<Block>* result, Env* env,
                          bool do_uncompress = true) {
   BlockContents contents;
   Status s = ReadBlockContents(file, footer, options, handle, &contents, env,
                                do_uncompress);
   if (s.ok()) {
-    *result = new Block(contents);
+    result->reset(new Block(std::move(contents)));
   }
 
-  if (didIO != nullptr) {
-    *didIO = true;
-  }
   return s;
 }
 
@@ -137,11 +140,18 @@ class BlockBasedTable::IndexReader {
   virtual ~IndexReader() {}
 
   // Create an iterator for index access.
-  virtual Iterator* NewIterator() = 0;
+  // An iter is passed in, if it is not null, update this one and return it
+  // If it is null, create a new Iterator
+  virtual Iterator* NewIterator(
+      BlockIter* iter = nullptr, bool total_order_seek = true) = 0;
 
   // The size of the index.
   virtual size_t size() const = 0;
 
+  // Report an approximation of how much memory has been used other than memory
+  // that was allocated in block cache.
+  virtual size_t ApproximateMemoryUsage() const = 0;
+
  protected:
   const Comparator* comparator_;
 };
@@ -159,26 +169,34 @@ class BinarySearchIndexReader : public IndexReader {
                        const BlockHandle& index_handle, Env* env,
                        const Comparator* comparator,
                        IndexReader** index_reader) {
-    Block* index_block = nullptr;
+    std::unique_ptr<Block> index_block;
     auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
                                &index_block, env);
 
     if (s.ok()) {
-      *index_reader = new BinarySearchIndexReader(comparator, index_block);
+      *index_reader =
+          new BinarySearchIndexReader(comparator, std::move(index_block));
     }
 
     return s;
   }
 
-  virtual Iterator* NewIterator() override {
-    return index_block_->NewIterator(comparator_);
+  virtual Iterator* NewIterator(
+      BlockIter* iter = nullptr, bool dont_care = true) override {
+    return index_block_->NewIterator(comparator_, iter, true);
   }
 
   virtual size_t size() const override { return index_block_->size(); }
 
+  virtual size_t ApproximateMemoryUsage() const override {
+    assert(index_block_);
+    return index_block_->ApproximateMemoryUsage();
+  }
+
  private:
-  BinarySearchIndexReader(const Comparator* comparator, Block* index_block)
-      : IndexReader(comparator), index_block_(index_block) {
+  BinarySearchIndexReader(const Comparator* comparator,
+                          std::unique_ptr<Block>&& index_block)
+      : IndexReader(comparator), index_block_(std::move(index_block)) {
     assert(index_block_ != nullptr);
   }
   std::unique_ptr<Block> index_block_;
@@ -186,20 +204,15 @@ class BinarySearchIndexReader : public IndexReader {
 
 // Index that leverages an internal hash table to quicken the lookup for a given
 // key.
-// @param data_iter_gen, equavalent to BlockBasedTable::NewIterator(). But that
-// functions requires index to be initalized. To avoid this problem external
-// caller will pass a function that can create the iterator over the entries
-// without the table to be fully initialized.
 class HashIndexReader : public IndexReader {
  public:
-  static Status Create(RandomAccessFile* file, const Footer& footer,
-                       const BlockHandle& index_handle, Env* env,
+  static Status Create(const SliceTransform* hash_key_extractor,
+                       const Footer& footer, RandomAccessFile* file, Env* env,
                        const Comparator* comparator,
-                       std::function<Iterator*(Iterator*)> data_iter_gen,
-                       const SliceTransform* prefix_extractor,
-                       IndexReader** index_reader) {
-    assert(prefix_extractor);
-    Block* index_block = nullptr;
+                       const BlockHandle& index_handle,
+                       Iterator* meta_index_iter, IndexReader** index_reader,
+                       bool hash_index_allow_collision) {
+    std::unique_ptr<Block> index_block;
     auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
                                &index_block, env);
 
@@ -207,39 +220,123 @@ class HashIndexReader : public IndexReader {
       return s;
     }
 
-    *index_reader = new HashIndexReader(comparator, index_block);
-    std::unique_ptr<Iterator> index_iter(index_block->NewIterator(nullptr));
-    std::unique_ptr<Iterator> data_iter(
-        data_iter_gen(index_block->NewIterator(nullptr)));
-    auto hash_index = CreateBlockHashIndex(index_iter.get(), data_iter.get(),
-                                           index_block->NumRestarts(),
-                                           comparator, prefix_extractor);
-    index_block->SetBlockHashIndex(hash_index);
-    return s;
+    // Note, failure to create prefix hash index does not need to be a
+    // hard error. We can still fall back to the original binary search index.
+    // So, Create will succeed regardless, from this point on.
+
+    auto new_index_reader =
+        new HashIndexReader(comparator, std::move(index_block));
+    *index_reader = new_index_reader;
+
+    // Get prefixes block
+    BlockHandle prefixes_handle;
+    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
+                      &prefixes_handle);
+    if (!s.ok()) {
+      // TODO: log error
+      return Status::OK();
+    }
+
+    // Get index metadata block
+    BlockHandle prefixes_meta_handle;
+    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
+                      &prefixes_meta_handle);
+    if (!s.ok()) {
+      // TODO: log error
+      return Status::OK();
+    }
+
+    // Read contents for the blocks
+    BlockContents prefixes_contents;
+    s = ReadBlockContents(file, footer, ReadOptions(), prefixes_handle,
+                          &prefixes_contents, env, true /* do decompression */);
+    if (!s.ok()) {
+      return s;
+    }
+    BlockContents prefixes_meta_contents;
+    s = ReadBlockContents(file, footer, ReadOptions(), prefixes_meta_handle,
+                          &prefixes_meta_contents, env,
+                          true /* do decompression */);
+    if (!s.ok()) {
+      // TODO: log error
+      return Status::OK();
+    }
+
+    if (!hash_index_allow_collision) {
+      // TODO: deprecate once hash_index_allow_collision proves to be stable.
+      BlockHashIndex* hash_index = nullptr;
+      s = CreateBlockHashIndex(hash_key_extractor,
+                               prefixes_contents.data,
+                               prefixes_meta_contents.data,
+                               &hash_index);
+      // TODO: log error
+      if (s.ok()) {
+        new_index_reader->index_block_->SetBlockHashIndex(hash_index);
+        new_index_reader->OwnPrefixesContents(std::move(prefixes_contents));
+      }
+    } else {
+      BlockPrefixIndex* prefix_index = nullptr;
+      s = BlockPrefixIndex::Create(hash_key_extractor,
+                                   prefixes_contents.data,
+                                   prefixes_meta_contents.data,
+                                   &prefix_index);
+      // TODO: log error
+      if (s.ok()) {
+        new_index_reader->index_block_->SetBlockPrefixIndex(prefix_index);
+      }
+    }
+
+    return Status::OK();
   }
 
-  virtual Iterator* NewIterator() override {
-    return index_block_->NewIterator(comparator_);
+  virtual Iterator* NewIterator(
+      BlockIter* iter = nullptr, bool total_order_seek = true) override {
+    return index_block_->NewIterator(comparator_, iter, total_order_seek);
   }
 
   virtual size_t size() const override { return index_block_->size(); }
 
+  virtual size_t ApproximateMemoryUsage() const override {
+    assert(index_block_);
+    return index_block_->ApproximateMemoryUsage() +
+           prefixes_contents_.data.size();
+  }
+
  private:
-  HashIndexReader(const Comparator* comparator, Block* index_block)
-      : IndexReader(comparator), index_block_(index_block) {
+  HashIndexReader(const Comparator* comparator,
+                  std::unique_ptr<Block>&& index_block)
+      : IndexReader(comparator), index_block_(std::move(index_block)) {
     assert(index_block_ != nullptr);
   }
+
+  ~HashIndexReader() {
+  }
+
+  void OwnPrefixesContents(BlockContents&& prefixes_contents) {
+    prefixes_contents_ = std::move(prefixes_contents);
+  }
+
   std::unique_ptr<Block> index_block_;
+  BlockContents prefixes_contents_;
 };
 
 
 struct BlockBasedTable::Rep {
-  Rep(const EnvOptions& storage_options,
-      const InternalKeyComparator& internal_comparator)
-      : soptions(storage_options), internal_comparator(internal_comparator) {}
-
-  Options options;
-  const EnvOptions& soptions;
+  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
+      const BlockBasedTableOptions& _table_opt,
+      const InternalKeyComparator& _internal_comparator)
+      : ioptions(_ioptions),
+        env_options(_env_options),
+        table_options(_table_opt),
+        filter_policy(_table_opt.filter_policy.get()),
+        internal_comparator(_internal_comparator),
+        whole_key_filtering(_table_opt.whole_key_filtering),
+        prefix_filtering(true) {}
+
+  const ImmutableCFOptions& ioptions;
+  const EnvOptions& env_options;
+  const BlockBasedTableOptions& table_options;
+  const FilterPolicy* const filter_policy;
   const InternalKeyComparator& internal_comparator;
   Status status;
   unique_ptr<RandomAccessFile> file;
@@ -258,6 +355,9 @@ struct BlockBasedTable::Rep {
 
   std::shared_ptr<const TableProperties> table_properties;
   BlockBasedTableOptions::IndexType index_type;
+  bool hash_index_allow_collision;
+  bool whole_key_filtering;
+  bool prefix_filtering;
   // TODO(kailiu) It is very ugly to use internal key in table, since table
   // module should not be relying on db module. However to make things easier
   // and compatible with existing code, we introduce a wrapper that allows
@@ -275,11 +375,9 @@ BlockBasedTable::~BlockBasedTable() {
 //    was not read from cache, `cache_handle` will be nullptr.
 template <class TValue>
 struct BlockBasedTable::CachableEntry {
-  CachableEntry(TValue* value, Cache::Handle* cache_handle)
-    : value(value)
-    , cache_handle(cache_handle) {
-  }
-  CachableEntry(): CachableEntry(nullptr, nullptr) { }
+  CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
+      : value(_value), cache_handle(_cache_handle) {}
+  CachableEntry() : CachableEntry(nullptr, nullptr) {}
   void Release(Cache* cache) {
     if (cache_handle) {
       cache->Release(cache_handle);
@@ -298,13 +396,13 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
   assert(kMaxCacheKeyPrefixSize >= 10);
   rep->cache_key_prefix_size = 0;
   rep->compressed_cache_key_prefix_size = 0;
-  if (rep->options.block_cache != nullptr) {
-    GenerateCachePrefix(rep->options.block_cache.get(), rep->file.get(),
+  if (rep->table_options.block_cache != nullptr) {
+    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file.get(),
                         &rep->cache_key_prefix[0],
                         &rep->cache_key_prefix_size);
   }
-  if (rep->options.block_cache_compressed != nullptr) {
-    GenerateCachePrefix(rep->options.block_cache_compressed.get(),
+  if (rep->table_options.block_cache_compressed != nullptr) {
+    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                         rep->file.get(), &rep->compressed_cache_key_prefix[0],
                         &rep->compressed_cache_key_prefix_size);
   }
@@ -338,25 +436,57 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc,
   }
 }
 
-Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
+namespace {
+// Return True if table_properties has `user_prop_name` has a `true` value
+// or it doesn't contain this property (for backward compatible).
+bool IsFeatureSupported(const TableProperties& table_properties,
+                        const std::string& user_prop_name, Logger* info_log) {
+  auto& props = table_properties.user_collected_properties;
+  auto pos = props.find(user_prop_name);
+  // Older version doesn't have this value set. Skip this check.
+  if (pos != props.end()) {
+    if (pos->second == kPropFalse) {
+      return false;
+    } else if (pos->second != kPropTrue) {
+      Log(InfoLogLevel::WARN_LEVEL, info_log,
+          "Property %s has invalidate value %s", user_prop_name.c_str(),
+          pos->second.c_str());
+    }
+  }
+  return true;
+}
+}  // namespace
+
+Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
+                             const EnvOptions& env_options,
                              const BlockBasedTableOptions& table_options,
                              const InternalKeyComparator& internal_comparator,
                              unique_ptr<RandomAccessFile>&& file,
                              uint64_t file_size,
-                             unique_ptr<TableReader>* table_reader) {
+                             unique_ptr<TableReader>* table_reader,
+                             const bool prefetch_index_and_filter) {
   table_reader->reset();
 
-  Footer footer(kBlockBasedTableMagicNumber);
-  auto s = ReadFooterFromFile(file.get(), file_size, &footer);
-  if (!s.ok()) return s;
+  Footer footer;
+  auto s = ReadFooterFromFile(file.get(), file_size, &footer,
+                              kBlockBasedTableMagicNumber);
+  if (!s.ok()) {
+    return s;
+  }
+  if (!BlockBasedTableSupportedVersion(footer.version())) {
+    return Status::Corruption(
+        "Unknown Footer version. Maybe this file was created with newer "
+        "version of RocksDB?");
+  }
 
   // We've successfully read the footer and the index block: we're
   // ready to serve requests.
-  Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator);
-  rep->options = options;
+  Rep* rep = new BlockBasedTable::Rep(
+      ioptions, env_options, table_options, internal_comparator);
   rep->file = std::move(file);
   rep->footer = footer;
   rep->index_type = table_options.index_type;
+  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
   SetupCacheKeyPrefix(rep);
   unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));
 
@@ -364,67 +494,82 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
   std::unique_ptr<Block> meta;
   std::unique_ptr<Iterator> meta_iter;
   s = ReadMetaBlock(rep, &meta, &meta_iter);
+  if (!s.ok()) {
+    return s;
+  }
 
   // Read the properties
   bool found_properties_block = true;
   s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
 
-  if (found_properties_block) {
+  if (!s.ok()) {
+    Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
+        "Cannot seek to properties block from file: %s",
+        s.ToString().c_str());
+  } else if (found_properties_block) {
     s = meta_iter->status();
     TableProperties* table_properties = nullptr;
     if (s.ok()) {
       s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer,
-                         rep->options.env, rep->options.info_log.get(),
+                         rep->ioptions.env, rep->ioptions.info_log,
                          &table_properties);
     }
 
     if (!s.ok()) {
-      auto err_msg =
-        "[Warning] Encountered error while reading data from properties "
-        "block " + s.ToString();
-      Log(rep->options.info_log, "%s", err_msg.c_str());
+      Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
+        "Encountered error while reading data from properties "
+        "block %s", s.ToString().c_str());
     } else {
       rep->table_properties.reset(table_properties);
     }
   } else {
-    Log(WARN_LEVEL, rep->options.info_log,
+    Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log,
         "Cannot find Properties block from file.");
   }
 
-  // Will use block cache for index/filter blocks access?
-  if (options.block_cache && table_options.cache_index_and_filter_blocks) {
-    // Hack: Call NewIndexIterator() to implicitly add index to the block_cache
-    unique_ptr<Iterator> iter(new_table->NewIndexIterator(ReadOptions()));
-    s = iter->status();
+  // Determine whether whole key filtering is supported.
+  if (rep->table_properties) {
+    rep->whole_key_filtering &=
+        IsFeatureSupported(*(rep->table_properties),
+                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
+                           rep->ioptions.info_log);
+    rep->prefix_filtering &= IsFeatureSupported(
+        *(rep->table_properties),
+        BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log);
+  }
 
-    if (s.ok()) {
-      // Hack: Call GetFilter() to implicitly add filter to the block_cache
-      auto filter_entry = new_table->GetFilter();
-      filter_entry.Release(options.block_cache.get());
-    }
-  } else {
-    // If we don't use block cache for index/filter blocks access, we'll
-    // pre-load these blocks, which will kept in member variables in Rep
-    // and with a same life-time as this table object.
-    IndexReader* index_reader = nullptr;
-    // TODO: we never really verify check sum for index block
-    s = new_table->CreateIndexReader(&index_reader);
+  if (prefetch_index_and_filter) {
+    // pre-fetching of blocks is turned on
+    // Will use block cache for index/filter blocks access?
+    if (table_options.cache_index_and_filter_blocks) {
+      assert(table_options.block_cache != nullptr);
+      // Hack: Call NewIndexIterator() to implicitly add index to the
+      // block_cache
+      unique_ptr<Iterator> iter(new_table->NewIndexIterator(ReadOptions()));
+      s = iter->status();
 
-    if (s.ok()) {
-      rep->index_reader.reset(index_reader);
+      if (s.ok()) {
+        // Hack: Call GetFilter() to implicitly add filter to the block_cache
+        auto filter_entry = new_table->GetFilter();
+        filter_entry.Release(table_options.block_cache.get());
+      }
+    } else {
+      // If we don't use block cache for index/filter blocks access, we'll
+      // pre-load these blocks, which will kept in member variables in Rep
+      // and with a same life-time as this table object.
+      IndexReader* index_reader = nullptr;
+      s = new_table->CreateIndexReader(&index_reader, meta_iter.get());
 
-      // Set filter block
-      if (rep->options.filter_policy) {
-        std::string key = kFilterBlockPrefix;
-        key.append(rep->options.filter_policy->Name());
-        meta_iter->Seek(key);
+      if (s.ok()) {
+        rep->index_reader.reset(index_reader);
 
-        if (meta_iter->Valid() && meta_iter->key() == Slice(key)) {
-          rep->filter.reset(ReadFilter(meta_iter->value(), rep));
+        // Set filter block
+        if (rep->filter_policy) {
+          rep->filter.reset(ReadFilter(rep, meta_iter.get(), nullptr));
         }
+      } else {
+        delete index_reader;
       }
-    } else {
-      delete index_reader;
     }
   }
 
@@ -436,7 +581,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
 }
 
 void BlockBasedTable::SetupForCompaction() {
-  switch (rep_->options.access_hint_on_compaction_start) {
+  switch (rep_->ioptions.access_hint_on_compaction_start) {
     case Options::NONE:
       break;
     case Options::NORMAL:
@@ -459,6 +604,17 @@ std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
   return rep_->table_properties;
 }
 
+size_t BlockBasedTable::ApproximateMemoryUsage() const {
+  size_t usage = 0;
+  if (rep_->filter) {
+    usage += rep_->filter->ApproximateMemoryUsage();
+  }
+  if (rep_->index_reader) {
+    usage += rep_->index_reader->ApproximateMemoryUsage();
+  }
+  return usage;
+}
+
 // Load the meta-block from the file. On success, return the loaded meta block
 // and its iterator.
 Status BlockBasedTable::ReadMetaBlock(
@@ -468,29 +624,25 @@ Status BlockBasedTable::ReadMetaBlock(
   // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
   // it is an empty block.
   //  TODO: we never really verify check sum for meta index block
-  Block* meta = nullptr;
+  std::unique_ptr<Block> meta;
   Status s = ReadBlockFromFile(
       rep->file.get(),
       rep->footer,
       ReadOptions(),
       rep->footer.metaindex_handle(),
       &meta,
-      rep->options.env);
+      rep->ioptions.env);
 
-    if (!s.ok()) {
-      auto err_msg =
-        "[Warning] Encountered error while reading data from properties"
-        "block " + s.ToString();
-      Log(rep->options.info_log, "%s", err_msg.c_str());
-    }
   if (!s.ok()) {
-    delete meta;
+    Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log,
+        "Encountered error while reading data from properties"
+        " block %s", s.ToString().c_str());
     return s;
   }
 
-  meta_block->reset(meta);
+  *meta_block = std::move(meta);
   // meta block uses bytewise comparator.
-  iter->reset(meta->NewIterator(BytewiseComparator()));
+  iter->reset(meta_block->get()->NewIterator(BytewiseComparator()));
   return Status::OK();
 }
 
@@ -498,7 +650,7 @@ Status BlockBasedTable::GetDataBlockFromCache(
     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
     Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
     const ReadOptions& read_options,
-    BlockBasedTable::CachableEntry<Block>* block) {
+    BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version) {
   Status s;
   Block* compressed_block = nullptr;
   Cache::Handle* block_cache_compressed_handle = nullptr;
@@ -541,11 +693,12 @@ Status BlockBasedTable::GetDataBlockFromCache(
   // Retrieve the uncompressed contents into a new buffer
   BlockContents contents;
   s = UncompressBlockContents(compressed_block->data(),
-                              compressed_block->size(), &contents);
+                              compressed_block->size(), &contents,
+                              format_version);
 
   // Insert uncompressed block into block cache
   if (s.ok()) {
-    block->value = new Block(contents);  // uncompressed block
+    block->value = new Block(std::move(contents));  // uncompressed block
     assert(block->value->compression_type() == kNoCompression);
     if (block_cache != nullptr && block->value->cachable() &&
         read_options.fill_cache) {
@@ -566,7 +719,7 @@ Status BlockBasedTable::PutDataBlockToCache(
     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
     Cache* block_cache, Cache* block_cache_compressed,
     const ReadOptions& read_options, Statistics* statistics,
-    CachableEntry<Block>* block, Block* raw_block) {
+    CachableEntry<Block>* block, Block* raw_block, uint32_t format_version) {
   assert(raw_block->compression_type() == kNoCompression ||
          block_cache_compressed != nullptr);
 
@@ -574,8 +727,8 @@ Status BlockBasedTable::PutDataBlockToCache(
   // Retrieve the uncompressed contents into a new buffer
   BlockContents contents;
   if (raw_block->compression_type() != kNoCompression) {
-    s = UncompressBlockContents(raw_block->data(), raw_block->size(),
-                                &contents);
+    s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents,
+                                format_version);
   }
   if (!s.ok()) {
     delete raw_block;
@@ -583,7 +736,7 @@ Status BlockBasedTable::PutDataBlockToCache(
   }
 
   if (raw_block->compression_type() != kNoCompression) {
-    block->value = new Block(contents);  // uncompressed block
+    block->value = new Block(std::move(contents));  // uncompressed block
   } else {
     block->value = raw_block;
     raw_block = nullptr;
@@ -617,64 +770,79 @@ Status BlockBasedTable::PutDataBlockToCache(
   return s;
 }
 
-FilterBlockReader* BlockBasedTable::ReadFilter (
-    const Slice& filter_handle_value,
-    BlockBasedTable::Rep* rep,
-    size_t* filter_size) {
-  Slice v = filter_handle_value;
-  BlockHandle filter_handle;
-  if (!filter_handle.DecodeFrom(&v).ok()) {
-    return nullptr;
-  }
-
+FilterBlockReader* BlockBasedTable::ReadFilter(
+    Rep* rep, Iterator* meta_index_iter, size_t* filter_size) {
   // TODO: We might want to unify with ReadBlockFromFile() if we start
   // requiring checksum verification in Table::Open.
-  ReadOptions opt;
-  BlockContents block;
-  if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle,
-                         &block, rep->options.env, false).ok()) {
-    return nullptr;
-  }
+  for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) {
+    std::string filter_block_key = prefix;
+    filter_block_key.append(rep->filter_policy->Name());
+    BlockHandle handle;
+    if (FindMetaBlock(meta_index_iter, filter_block_key, &handle).ok()) {
+      BlockContents block;
+      if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(),
+                             handle, &block, rep->ioptions.env, false).ok()) {
+        // Error reading the block
+        return nullptr;
+      }
 
-  if (filter_size) {
-    *filter_size = block.data.size();
-  }
+      if (filter_size) {
+        *filter_size = block.data.size();
+      }
 
-  return new FilterBlockReader(
-       rep->options, block.data, block.heap_allocated);
+      assert(rep->filter_policy);
+      if (kFilterBlockPrefix == prefix) {
+        return new BlockBasedFilterBlockReader(
+            rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
+            rep->table_options, rep->whole_key_filtering, std::move(block));
+      } else if (kFullFilterBlockPrefix == prefix) {
+        auto filter_bits_reader = rep->filter_policy->
+            GetFilterBitsReader(block.data);
+        if (filter_bits_reader != nullptr) {
+          return new FullFilterBlockReader(
+              rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
+              rep->whole_key_filtering, std::move(block), filter_bits_reader);
+        }
+      } else {
+        assert(false);
+        return nullptr;
+      }
+    }
+  }
+  return nullptr;
 }
 
 BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
-    bool no_io) const {
-  // filter pre-populated
-  if (rep_->filter != nullptr) {
+                                                          bool no_io) const {
+  // If cache_index_and_filter_blocks is false, filter should be pre-populated.
+  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
+  // read fails at Open() time. We don't want to reload again since it will
+  // most probably fail again.
+  if (!rep_->table_options.cache_index_and_filter_blocks) {
     return {rep_->filter.get(), nullptr /* cache handle */};
   }
 
-  if (rep_->options.filter_policy == nullptr /* do not use filter at all */ ||
-      rep_->options.block_cache == nullptr /* no block cache at all */) {
+  Cache* block_cache = rep_->table_options.block_cache.get();
+  if (rep_->filter_policy == nullptr /* do not use filter */ ||
+      block_cache == nullptr /* no block cache at all */) {
     return {nullptr /* filter */, nullptr /* cache handle */};
   }
 
   // Fetching from the cache
-  Cache* block_cache = rep_->options.block_cache.get();
   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
-  auto key = GetCacheKey(
-      rep_->cache_key_prefix,
-      rep_->cache_key_prefix_size,
-      rep_->footer.metaindex_handle(),
-      cache_key
-  );
-
-  Statistics* statistics = rep_->options.statistics.get();
+  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
+                         rep_->footer.metaindex_handle(),
+                         cache_key);
+
+  Statistics* statistics = rep_->ioptions.statistics;
   auto cache_handle =
       GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS,
                         BLOCK_CACHE_FILTER_HIT, statistics);
 
   FilterBlockReader* filter = nullptr;
   if (cache_handle != nullptr) {
-     filter = reinterpret_cast<FilterBlockReader*>(
-         block_cache->Value(cache_handle));
+    filter = reinterpret_cast<FilterBlockReader*>(
+        block_cache->Value(cache_handle));
   } else if (no_io) {
     // Do not invoke any io.
     return CachableEntry<FilterBlockReader>();
@@ -685,15 +853,9 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
     auto s = ReadMetaBlock(rep_, &meta, &iter);
 
     if (s.ok()) {
-      std::string filter_block_key = kFilterBlockPrefix;
-      filter_block_key.append(rep_->options.filter_policy->Name());
-      iter->Seek(filter_block_key);
-
-      if (iter->Valid() && iter->key() == Slice(filter_block_key)) {
-        filter = ReadFilter(iter->value(), rep_, &filter_size);
-        assert(filter);
+      filter = ReadFilter(rep_, iter.get(), &filter_size);
+      if (filter != nullptr) {
         assert(filter_size > 0);
-
         cache_handle = block_cache->Insert(
             key, filter, filter_size, &DeleteCachedEntry<FilterBlockReader>);
         RecordTick(statistics, BLOCK_CACHE_ADD);
@@ -704,24 +866,31 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
   return { filter, cache_handle };
 }
 
-Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
+Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
+        BlockIter* input_iter) {
   // index reader has already been pre-populated.
   if (rep_->index_reader) {
-    return rep_->index_reader->NewIterator();
+    return rep_->index_reader->NewIterator(
+        input_iter, read_options.total_order_seek);
   }
 
   bool no_io = read_options.read_tier == kBlockCacheTier;
-  Cache* block_cache = rep_->options.block_cache.get();
+  Cache* block_cache = rep_->table_options.block_cache.get();
   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
   auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
                          rep_->footer.index_handle(), cache_key);
-  Statistics* statistics = rep_->options.statistics.get();
+  Statistics* statistics = rep_->ioptions.statistics;
   auto cache_handle =
       GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS,
                         BLOCK_CACHE_INDEX_HIT, statistics);
 
   if (cache_handle == nullptr && no_io) {
-    return NewErrorIterator(Status::Incomplete("no blocking io"));
+    if (input_iter != nullptr) {
+      input_iter->SetStatus(Status::Incomplete("no blocking io"));
+      return input_iter;
+    } else {
+      return NewErrorIterator(Status::Incomplete("no blocking io"));
+    }
   }
 
   IndexReader* index_reader = nullptr;
@@ -736,7 +905,12 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
     if (!s.ok()) {
       // make sure if something goes wrong, index_reader shall remain intact.
       assert(index_reader == nullptr);
-      return NewErrorIterator(s);
+      if (input_iter != nullptr) {
+        input_iter->SetStatus(s);
+        return input_iter;
+      } else {
+        return NewErrorIterator(s);
+      }
     }
 
     cache_handle = block_cache->Insert(key, index_reader, index_reader->size(),
@@ -745,20 +919,23 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
   }
 
   assert(cache_handle);
-  auto iter = index_reader->NewIterator();
+  auto* iter = index_reader->NewIterator(
+      input_iter, read_options.total_order_seek);
   iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
-
   return iter;
 }
 
 // Convert an index iterator value (i.e., an encoded BlockHandle)
 // into an iterator over the contents of the corresponding block.
+// If input_iter is null, new a iterator
+// If input_iter is not null, update this iter and return it
 Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
-    const ReadOptions& ro, bool* didIO, const Slice& index_value) {
+    const ReadOptions& ro, const Slice& index_value,
+    BlockIter* input_iter) {
   const bool no_io = (ro.read_tier == kBlockCacheTier);
-  Cache* block_cache = rep->options.block_cache.get();
-  Cache* block_cache_compressed = rep->options.
-                                    block_cache_compressed.get();
+  Cache* block_cache = rep->table_options.block_cache.get();
+  Cache* block_cache_compressed =
+      rep->table_options.block_cache_compressed.get();
   CachableEntry<Block> block;
 
   BlockHandle handle;
@@ -768,12 +945,17 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
   Status s = handle.DecodeFrom(&input);
 
   if (!s.ok()) {
-    return NewErrorIterator(s);
+    if (input_iter != nullptr) {
+      input_iter->SetStatus(s);
+      return input_iter;
+    } else {
+      return NewErrorIterator(s);
+    }
   }
 
   // If either block cache is enabled, we'll try to read from it.
   if (block_cache != nullptr || block_cache_compressed != nullptr) {
-    Statistics* statistics = rep->options.statistics.get();
+    Statistics* statistics = rep->ioptions.statistics;
     char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
     char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
     Slice key, /* key to the block cache */
@@ -781,8 +963,8 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
 
     // create key for block cache
     if (block_cache != nullptr) {
-      key = GetCacheKey(rep->cache_key_prefix,
-                        rep->cache_key_prefix_size, handle, cache_key);
+      key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size,
+                        handle, cache_key);
     }
 
     if (block_cache_compressed != nullptr) {
@@ -792,21 +974,22 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
     }
 
     s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
-                      statistics, ro, &block);
+                              statistics, ro, &block,
+                              rep->table_options.format_version);
 
     if (block.value == nullptr && !no_io && ro.fill_cache) {
-      Histograms histogram = READ_BLOCK_GET_MICROS;
-      Block* raw_block = nullptr;
+      std::unique_ptr<Block> raw_block;
       {
-        StopWatch sw(rep->options.env, statistics, histogram);
+        StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
         s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
-                              &raw_block, rep->options.env, didIO,
+                              &raw_block, rep->ioptions.env,
                               block_cache_compressed == nullptr);
       }
 
       if (s.ok()) {
         s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
-                                ro, statistics, &block, raw_block);
+                                ro, statistics, &block, raw_block.release(),
+                                rep->table_options.format_version);
       }
     }
   }
@@ -815,23 +998,37 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
   if (block.value == nullptr) {
     if (no_io) {
       // Could not read from block_cache and can't do IO
-      return NewErrorIterator(Status::Incomplete("no blocking io"));
+      if (input_iter != nullptr) {
+        input_iter->SetStatus(Status::Incomplete("no blocking io"));
+        return input_iter;
+      } else {
+        return NewErrorIterator(Status::Incomplete("no blocking io"));
+      }
     }
+    std::unique_ptr<Block> block_value;
     s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
-                          &block.value, rep->options.env, didIO);
+                          &block_value, rep->ioptions.env);
+    if (s.ok()) {
+      block.value = block_value.release();
+    }
   }
 
   Iterator* iter;
   if (block.value != nullptr) {
-    iter = block.value->NewIterator(&rep->internal_comparator);
+    iter = block.value->NewIterator(&rep->internal_comparator, input_iter);
     if (block.cache_handle != nullptr) {
       iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
-                            block.cache_handle);
+          block.cache_handle);
     } else {
       iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
     }
   } else {
-    iter = NewErrorIterator(s);
+    if (input_iter != nullptr) {
+      input_iter->SetStatus(s);
+      iter = input_iter;
+    } else {
+      iter = NewErrorIterator(s);
+    }
   }
   return iter;
 }
@@ -839,16 +1036,20 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
 class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
  public:
   BlockEntryIteratorState(BlockBasedTable* table,
-      const ReadOptions& read_options, bool* did_io)
-    : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr),
-      table_(table), read_options_(read_options), did_io_(did_io) {}
+                          const ReadOptions& read_options)
+      : TwoLevelIteratorState(
+          table->rep_->ioptions.prefix_extractor != nullptr),
+        table_(table),
+        read_options_(read_options) {}
 
   Iterator* NewSecondaryIterator(const Slice& index_value) override {
-    return NewDataBlockIterator(table_->rep_, read_options_, did_io_,
-                                index_value);
+    return NewDataBlockIterator(table_->rep_, read_options_, index_value);
   }
 
   bool PrefixMayMatch(const Slice& internal_key) override {
+    if (read_options_.total_order_seek) {
+      return true;
+    }
     return table_->PrefixMayMatch(internal_key);
   }
 
@@ -856,14 +1057,12 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
   // Don't own table_
   BlockBasedTable* table_;
   const ReadOptions read_options_;
-  // Don't own did_io_
-  bool* did_io_;
 };
 
 // This will be broken if the user specifies an unusual implementation
 // of Options.comparator, or if the user specifies an unusual
-// definition of prefixes in Options.filter_policy.  In particular, we
-// require the following three properties:
+// definition of prefixes in BlockBasedTableOptions.filter_policy.
+// In particular, we require the following three properties:
 //
 // 1) key.starts_with(prefix(key))
 // 2) Compare(prefix(key), key) <= 0.
@@ -873,8 +1072,12 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
 //
 // REQUIRES: this method shouldn't be called while the DB lock is held.
 bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
-  assert(rep_->options.prefix_extractor != nullptr);
-  auto prefix = rep_->options.prefix_extractor->Transform(
+  if (!rep_->filter_policy) {
+    return true;
+  }
+
+  assert(rep_->ioptions.prefix_extractor != nullptr);
+  auto prefix = rep_->ioptions.prefix_extractor->Transform(
       ExtractUserKey(internal_key));
   InternalKey internal_key_prefix(prefix, 0, kTypeValue);
   auto internal_prefix = internal_key_prefix.Encode();
@@ -882,148 +1085,236 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
   bool may_match = true;
   Status s;
 
-  if (!rep_->options.filter_policy) {
-    return true;
-  }
-
   // To prevent any io operation in this method, we set `read_tier` to make
   // sure we always read index or filter only when they have already been
   // loaded to memory.
   ReadOptions no_io_read_options;
   no_io_read_options.read_tier = kBlockCacheTier;
-  unique_ptr<Iterator> iiter(NewIndexIterator(no_io_read_options));
-  iiter->Seek(internal_prefix);
-
-  if (!iiter->Valid()) {
-    // we're past end of file
-    // if it's incomplete, it means that we avoided I/O
-    // and we're not really sure that we're past the end
-    // of the file
-    may_match = iiter->status().IsIncomplete();
-  } else if (ExtractUserKey(iiter->key()).starts_with(
-              ExtractUserKey(internal_prefix))) {
-    // we need to check for this subtle case because our only
-    // guarantee is that "the key is a string >= last key in that data
-    // block" according to the doc/table_format.txt spec.
-    //
-    // Suppose iiter->key() starts with the desired prefix; it is not
-    // necessarily the case that the corresponding data block will
-    // contain the prefix, since iiter->key() need not be in the
-    // block.  However, the next data block may contain the prefix, so
-    // we return true to play it safe.
-    may_match = true;
-  } else {
-    // iiter->key() does NOT start with the desired prefix.  Because
-    // Seek() finds the first key that is >= the seek target, this
-    // means that iiter->key() > prefix.  Thus, any data blocks coming
-    // after the data block corresponding to iiter->key() cannot
-    // possibly contain the key.  Thus, the corresponding data block
-    // is the only one which could potentially contain the prefix.
-    Slice handle_value = iiter->value();
-    BlockHandle handle;
-    s = handle.DecodeFrom(&handle_value);
-    assert(s.ok());
-    auto filter_entry = GetFilter(true /* no io */);
-    may_match =
-      filter_entry.value == nullptr ||
-      filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix);
-    filter_entry.Release(rep_->options.block_cache.get());
+
+  // First, try check with full filter
+  auto filter_entry = GetFilter(true /* no io */);
+  FilterBlockReader* filter = filter_entry.value;
+  if (filter != nullptr && !filter->IsBlockBased()) {
+    may_match = filter->PrefixMayMatch(prefix);
   }
 
-  Statistics* statistics = rep_->options.statistics.get();
+  // Then, try find it within each block
+  if (may_match) {
+    unique_ptr<Iterator> iiter(NewIndexIterator(no_io_read_options));
+    iiter->Seek(internal_prefix);
+
+    if (!iiter->Valid()) {
+      // we're past end of file
+      // if it's incomplete, it means that we avoided I/O
+      // and we're not really sure that we're past the end
+      // of the file
+      may_match = iiter->status().IsIncomplete();
+    } else if (ExtractUserKey(iiter->key()).starts_with(
+                ExtractUserKey(internal_prefix))) {
+      // we need to check for this subtle case because our only
+      // guarantee is that "the key is a string >= last key in that data
+      // block" according to the doc/table_format.txt spec.
+      //
+      // Suppose iiter->key() starts with the desired prefix; it is not
+      // necessarily the case that the corresponding data block will
+      // contain the prefix, since iiter->key() need not be in the
+      // block.  However, the next data block may contain the prefix, so
+      // we return true to play it safe.
+      may_match = true;
+    } else if (filter != nullptr && filter->IsBlockBased()) {
+      // iiter->key() does NOT start with the desired prefix.  Because
+      // Seek() finds the first key that is >= the seek target, this
+      // means that iiter->key() > prefix.  Thus, any data blocks coming
+      // after the data block corresponding to iiter->key() cannot
+      // possibly contain the key.  Thus, the corresponding data block
+      // is the only on could potentially contain the prefix.
+      Slice handle_value = iiter->value();
+      BlockHandle handle;
+      s = handle.DecodeFrom(&handle_value);
+      assert(s.ok());
+      may_match = filter->PrefixMayMatch(prefix, handle.offset());
+    }
+  }
+
+  Statistics* statistics = rep_->ioptions.statistics;
   RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
   if (!may_match) {
     RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
   }
 
+  filter_entry.Release(rep_->table_options.block_cache.get());
   return may_match;
 }
 
-Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options) {
-  return NewTwoLevelIterator(new BlockEntryIteratorState(this, read_options,
-                                                         nullptr),
-                             NewIndexIterator(read_options));
+Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options,
+                                       Arena* arena) {
+  return NewTwoLevelIterator(new BlockEntryIteratorState(this, read_options),
+                             NewIndexIterator(read_options), arena);
+}
+
+bool BlockBasedTable::FullFilterKeyMayMatch(FilterBlockReader* filter,
+                                            const Slice& internal_key) const {
+  if (filter == nullptr || filter->IsBlockBased()) {
+    return true;
+  }
+  Slice user_key = ExtractUserKey(internal_key);
+  if (!filter->KeyMayMatch(user_key)) {
+    return false;
+  }
+  if (rep_->ioptions.prefix_extractor &&
+      !filter->PrefixMayMatch(
+          rep_->ioptions.prefix_extractor->Transform(user_key))) {
+    return false;
+  }
+  return true;
 }
 
 Status BlockBasedTable::Get(
-    const ReadOptions& read_options, const Slice& key, void* handle_context,
-    bool (*result_handler)(void* handle_context, const ParsedInternalKey& k,
-                           const Slice& v, bool didIO),
-    void (*mark_key_may_exist_handler)(void* handle_context)) {
+    const ReadOptions& read_options, const Slice& key,
+    GetContext* get_context) {
   Status s;
-  Iterator* iiter = NewIndexIterator(read_options);
   auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
   FilterBlockReader* filter = filter_entry.value;
-  bool done = false;
-  for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
-    Slice handle_value = iiter->value();
 
-    BlockHandle handle;
-    bool may_not_exist_in_filter =
-      filter != nullptr &&
-      handle.DecodeFrom(&handle_value).ok() &&
-      !filter->KeyMayMatch(handle.offset(), key);
-
-    if (may_not_exist_in_filter) {
-      // Not found
-      // TODO: think about interaction with Merge. If a user key cannot
-      // cross one data block, we should be fine.
-      RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL);
-      break;
-    } else {
-      bool didIO = false;
-      unique_ptr<Iterator> block_iter(
-          NewDataBlockIterator(rep_, read_options, &didIO, iiter->value()));
-
-      if (read_options.read_tier && block_iter->status().IsIncomplete()) {
-        // couldn't get block from block_cache
-        // Update Saver.state to Found because we are only looking for whether
-        // we can guarantee the key is not there when "no_io" is set
-        (*mark_key_may_exist_handler)(handle_context);
+  // First check the full filter
+  // If full filter not useful, Then go into each block
+  if (!FullFilterKeyMayMatch(filter, key)) {
+    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
+  } else {
+    BlockIter iiter;
+    NewIndexIterator(read_options, &iiter);
+
+    bool done = false;
+    for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) {
+      Slice handle_value = iiter.value();
+
+      BlockHandle handle;
+      bool not_exist_in_filter =
+          filter != nullptr && filter->IsBlockBased() == true &&
+          handle.DecodeFrom(&handle_value).ok() &&
+          !filter->KeyMayMatch(ExtractUserKey(key), handle.offset());
+
+      if (not_exist_in_filter) {
+        // Not found
+        // TODO: think about interaction with Merge. If a user key cannot
+        // cross one data block, we should be fine.
+        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
         break;
-      }
-
-      // Call the *saver function on each entry/block until it returns false
-      for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) {
-        ParsedInternalKey parsed_key;
-        if (!ParseInternalKey(block_iter->key(), &parsed_key)) {
-          s = Status::Corruption(Slice());
+      } else {
+        BlockIter biter;
+        NewDataBlockIterator(rep_, read_options, iiter.value(), &biter);
+
+        if (read_options.read_tier && biter.status().IsIncomplete()) {
+          // couldn't get block from block_cache
+          // Update Saver.state to Found because we are only looking for whether
+          // we can guarantee the key is not there when "no_io" is set
+          get_context->MarkKeyMayExist();
+          break;
         }
-
-        if (!(*result_handler)(handle_context, parsed_key, block_iter->value(),
-                               didIO)) {
-          done = true;
+        if (!biter.status().ok()) {
+          s = biter.status();
           break;
         }
+
+        // Call the *saver function on each entry/block until it returns false
+        for (biter.Seek(key); biter.Valid(); biter.Next()) {
+          ParsedInternalKey parsed_key;
+          if (!ParseInternalKey(biter.key(), &parsed_key)) {
+            s = Status::Corruption(Slice());
+          }
+
+          if (!get_context->SaveValue(parsed_key, biter.value())) {
+            done = true;
+            break;
+          }
+        }
+        s = biter.status();
       }
-      s = block_iter->status();
+    }
+    if (s.ok()) {
+      s = iiter.status();
     }
   }
 
-  filter_entry.Release(rep_->options.block_cache.get());
-  if (s.ok()) {
-    s = iiter->status();
-  }
-  delete iiter;
+  filter_entry.Release(rep_->table_options.block_cache.get());
   return s;
 }
 
-namespace {
-bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
-               bool didIO) {
-  *reinterpret_cast<bool*>(arg) = didIO;
-  return false;
+Status BlockBasedTable::Prefetch(const Slice* const begin,
+                                 const Slice* const end) {
+  auto& comparator = rep_->internal_comparator;
+  // pre-condition
+  if (begin && end && comparator.Compare(*begin, *end) > 0) {
+    return Status::InvalidArgument(*begin, *end);
+  }
+
+  BlockIter iiter;
+  NewIndexIterator(ReadOptions(), &iiter);
+
+  if (!iiter.status().ok()) {
+    // error opening index iterator
+    return iiter.status();
+  }
+
+  // indicates if we are on the last page that need to be pre-fetched
+  bool prefetching_boundary_page = false;
+
+  for (begin ? iiter.Seek(*begin) : iiter.SeekToFirst(); iiter.Valid();
+       iiter.Next()) {
+    Slice block_handle = iiter.value();
+
+    if (end && comparator.Compare(iiter.key(), *end) >= 0) {
+      if (prefetching_boundary_page) {
+        break;
+      }
+
+      // The index entry represents the last key in the data block.
+      // We should load this page into memory as well, but no more
+      prefetching_boundary_page = true;
+    }
+
+    // Load the block specified by the block_handle into the block cache
+    BlockIter biter;
+    NewDataBlockIterator(rep_, ReadOptions(), block_handle, &biter);
+
+    if (!biter.status().ok()) {
+      // there was an unexpected error while pre-fetching
+      return biter.status();
+    }
+  }
+
+  return Status::OK();
 }
-}  // namespace
 
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                       const Slice& key) {
-  // We use Get() as it has logic that checks whether we read the
-  // block from the disk or not.
-  bool didIO = false;
-  Status s = Get(options, key, &didIO, SaveDidIO);
+  std::unique_ptr<Iterator> iiter(NewIndexIterator(options));
+  iiter->Seek(key);
+  assert(iiter->Valid());
+  CachableEntry<Block> block;
+
+  BlockHandle handle;
+  Slice input = iiter->value();
+  Status s = handle.DecodeFrom(&input);
   assert(s.ok());
-  return !didIO;
+  Cache* block_cache = rep_->table_options.block_cache.get();
+  assert(block_cache != nullptr);
+
+  char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+  Slice cache_key =
+      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
+                  handle, cache_key_storage);
+  Slice ckey;
+
+  s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr,
+                            options, &block,
+                            rep_->table_options.format_version);
+  assert(s.ok());
+  bool in_cache = block.value != nullptr;
+  if (in_cache) {
+    ReleaseCachedEntry(block_cache, block.cache_handle);
+  }
+  return in_cache;
 }
 
 // REQUIRES: The following fields of rep_ should have already been populated:
@@ -1032,7 +1323,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
 //  3. options
 //  4. internal_comparator
 //  5. index_type
-Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) {
+Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader,
+                                          Iterator* preloaded_meta_index_iter) {
   // Some old version of block-based tables don't have index type present in
   // table properties. If that's the case we can safely use the kBinarySearch.
   auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
@@ -1045,41 +1337,55 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) {
     }
   }
 
-  // TODO(sdong): Currently binary index is the only index type we support in
-  // files. Hash index is built on top of binary index too.
-  if (index_type_on_file != BlockBasedTableOptions::kBinarySearch) {
-    return Status::NotSupported("File Contains not supported index type: ",
-                                std::to_string(index_type_on_file));
-  }
-
   auto file = rep_->file.get();
-  auto env = rep_->options.env;
+  auto env = rep_->ioptions.env;
   auto comparator = &rep_->internal_comparator;
   const Footer& footer = rep_->footer;
 
-  switch (rep_->index_type) {
+  if (index_type_on_file == BlockBasedTableOptions::kHashSearch &&
+      rep_->ioptions.prefix_extractor == nullptr) {
+    Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log,
+        "BlockBasedTableOptions::kHashSearch requires "
+        "options.prefix_extractor to be set."
+        " Fall back to binary search index.");
+    index_type_on_file = BlockBasedTableOptions::kBinarySearch;
+  }
+
+  switch (index_type_on_file) {
     case BlockBasedTableOptions::kBinarySearch: {
       return BinarySearchIndexReader::Create(
           file, footer, footer.index_handle(), env, comparator, index_reader);
     }
     case BlockBasedTableOptions::kHashSearch: {
+      std::unique_ptr<Block> meta_guard;
+      std::unique_ptr<Iterator> meta_iter_guard;
+      auto meta_index_iter = preloaded_meta_index_iter;
+      if (meta_index_iter == nullptr) {
+        auto s = ReadMetaBlock(rep_, &meta_guard, &meta_iter_guard);
+        if (!s.ok()) {
+          // we simply fall back to binary search in case there is any
+          // problem with prefix hash index loading.
+          Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log,
+              "Unable to read the metaindex block."
+              " Fall back to binary search index.");
+          return BinarySearchIndexReader::Create(
+            file, footer, footer.index_handle(), env, comparator, index_reader);
+        }
+        meta_index_iter = meta_iter_guard.get();
+      }
+
       // We need to wrap data with internal_prefix_transform to make sure it can
       // handle prefix correctly.
       rep_->internal_prefix_transform.reset(
-          new InternalKeySliceTransform(rep_->options.prefix_extractor.get()));
+          new InternalKeySliceTransform(rep_->ioptions.prefix_extractor));
       return HashIndexReader::Create(
-          file, footer, footer.index_handle(), env, comparator,
-          [&](Iterator* index_iter) {
-            return NewTwoLevelIterator(new BlockEntryIteratorState(this,
-                ReadOptions(), nullptr), index_iter);
-          },
-          rep_->internal_prefix_transform.get(), index_reader);
+          rep_->internal_prefix_transform.get(), footer, file, env, comparator,
+          footer.index_handle(), meta_index_iter, index_reader,
+          rep_->hash_index_allow_collision);
     }
     default: {
       std::string error_message =
-          "Unrecognized index type: " + std::to_string(rep_->index_type);
-      // equivalent to assert(false), but more informative.
-      assert(!error_message.c_str());
+          "Unrecognized index type: " + ToString(rep_->index_type);
       return Status::InvalidArgument(error_message.c_str());
     }
   }
@@ -1126,4 +1432,217 @@ bool BlockBasedTable::TEST_index_reader_preloaded() const {
   return rep_->index_reader != nullptr;
 }
 
+Status BlockBasedTable::DumpTable(WritableFile* out_file) {
+  // Output Footer
+  out_file->Append(
+      "Footer Details:\n"
+      "--------------------------------------\n"
+      "  ");
+  out_file->Append(rep_->footer.ToString().c_str());
+  out_file->Append("\n");
+
+  // Output MetaIndex
+  out_file->Append(
+      "Metaindex Details:\n"
+      "--------------------------------------\n");
+  std::unique_ptr<Block> meta;
+  std::unique_ptr<Iterator> meta_iter;
+  Status s = ReadMetaBlock(rep_, &meta, &meta_iter);
+  if (s.ok()) {
+    for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
+      s = meta_iter->status();
+      if (!s.ok()) {
+        return s;
+      }
+      if (meta_iter->key() == rocksdb::kPropertiesBlock) {
+        out_file->Append("  Properties block handle: ");
+        out_file->Append(meta_iter->value().ToString(true).c_str());
+        out_file->Append("\n");
+      } else if (strstr(meta_iter->key().ToString().c_str(),
+                        "filter.rocksdb.") != nullptr) {
+        out_file->Append("  Filter block handle: ");
+        out_file->Append(meta_iter->value().ToString(true).c_str());
+        out_file->Append("\n");
+      }
+    }
+    out_file->Append("\n");
+  } else {
+    return s;
+  }
+
+  // Output TableProperties
+  const rocksdb::TableProperties* table_properties;
+  table_properties = rep_->table_properties.get();
+
+  if (table_properties != nullptr) {
+    out_file->Append(
+        "Table Properties:\n"
+        "--------------------------------------\n"
+        "  ");
+    out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
+    out_file->Append("\n");
+  }
+
+  // Output Filter blocks
+  if (!rep_->filter && !table_properties->filter_policy_name.empty()) {
+    // Support only BloomFilter as off now
+    rocksdb::BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
+    if (table_properties->filter_policy_name.compare(
+            table_options.filter_policy->Name()) == 0) {
+      std::string filter_block_key = kFilterBlockPrefix;
+      filter_block_key.append(table_properties->filter_policy_name);
+      BlockHandle handle;
+      if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
+        BlockContents block;
+        if (ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(),
+                              handle, &block, rep_->ioptions.env, false).ok()) {
+          rep_->filter.reset(new BlockBasedFilterBlockReader(
+              rep_->ioptions.prefix_extractor, table_options,
+              table_options.whole_key_filtering, std::move(block)));
+        }
+      }
+    }
+  }
+  if (rep_->filter) {
+    out_file->Append(
+        "Filter Details:\n"
+        "--------------------------------------\n"
+        "  ");
+    out_file->Append(rep_->filter->ToString().c_str());
+    out_file->Append("\n");
+  }
+
+  // Output Index block
+  s = DumpIndexBlock(out_file);
+  if (!s.ok()) {
+    return s;
+  }
+  // Output Data blocks
+  s = DumpDataBlocks(out_file);
+
+  return s;
+}
+
+Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
+  out_file->Append(
+      "Index Details:\n"
+      "--------------------------------------\n");
+
+  std::unique_ptr<Iterator> blockhandles_iter(NewIndexIterator(ReadOptions()));
+  Status s = blockhandles_iter->status();
+  if (!s.ok()) {
+    out_file->Append("Can not read Index Block \n\n");
+    return s;
+  }
+
+  out_file->Append("  Block key hex dump: Data block handle\n");
+  out_file->Append("  Block key ascii\n\n");
+  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+       blockhandles_iter->Next()) {
+    s = blockhandles_iter->status();
+    if (!s.ok()) {
+      break;
+    }
+    Slice key = blockhandles_iter->key();
+    InternalKey ikey;
+    ikey.DecodeFrom(key);
+
+    out_file->Append("  HEX    ");
+    out_file->Append(ikey.user_key().ToString(true).c_str());
+    out_file->Append(": ");
+    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
+    out_file->Append("\n");
+
+    std::string str_key = ikey.user_key().ToString();
+    std::string res_key("");
+    char cspace = ' ';
+    for (size_t i = 0; i < str_key.size(); i++) {
+      res_key.append(&str_key[i], 1);
+      res_key.append(1, cspace);
+    }
+    out_file->Append("  ASCII  ");
+    out_file->Append(res_key.c_str());
+    out_file->Append("\n  ------\n");
+  }
+  out_file->Append("\n");
+  return Status::OK();
+}
+
+Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
+  std::unique_ptr<Iterator> blockhandles_iter(NewIndexIterator(ReadOptions()));
+  Status s = blockhandles_iter->status();
+  if (!s.ok()) {
+    out_file->Append("Can not read Index Block \n\n");
+    return s;
+  }
+
+  size_t block_id = 1;
+  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+       block_id++, blockhandles_iter->Next()) {
+    s = blockhandles_iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    out_file->Append("Data Block # ");
+    out_file->Append(rocksdb::ToString(block_id));
+    out_file->Append(" @ ");
+    out_file->Append(blockhandles_iter->value().ToString(true).c_str());
+    out_file->Append("\n");
+    out_file->Append("--------------------------------------\n");
+
+    std::unique_ptr<Iterator> datablock_iter;
+    datablock_iter.reset(
+        NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value()));
+    s = datablock_iter->status();
+
+    if (!s.ok()) {
+      out_file->Append("Error reading the block - Skipped \n\n");
+      continue;
+    }
+
+    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
+         datablock_iter->Next()) {
+      s = datablock_iter->status();
+      if (!s.ok()) {
+        out_file->Append("Error reading the block - Skipped \n");
+        break;
+      }
+      Slice key = datablock_iter->key();
+      Slice value = datablock_iter->value();
+      InternalKey ikey, iValue;
+      ikey.DecodeFrom(key);
+      iValue.DecodeFrom(value);
+
+      out_file->Append("  HEX    ");
+      out_file->Append(ikey.user_key().ToString(true).c_str());
+      out_file->Append(": ");
+      out_file->Append(iValue.user_key().ToString(true).c_str());
+      out_file->Append("\n");
+
+      std::string str_key = ikey.user_key().ToString();
+      std::string str_value = iValue.user_key().ToString();
+      std::string res_key(""), res_value("");
+      char cspace = ' ';
+      for (size_t i = 0; i < str_key.size(); i++) {
+        res_key.append(&str_key[i], 1);
+        res_key.append(1, cspace);
+      }
+      for (size_t i = 0; i < str_value.size(); i++) {
+        res_value.append(&str_value[i], 1);
+        res_value.append(1, cspace);
+      }
+
+      out_file->Append("  ASCII  ");
+      out_file->Append(res_key.c_str());
+      out_file->Append(": ");
+      out_file->Append(res_value.c_str());
+      out_file->Append("\n  ------\n");
+    }
+    out_file->Append("\n");
+  }
+  return Status::OK();
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block_based_table_reader.h b/src/rocksdb/table/block_based_table_reader.h
index f68d642..727a0d6 100644
--- a/src/rocksdb/table/block_based_table_reader.h
+++ b/src/rocksdb/table/block_based_table_reader.h
@@ -14,18 +14,23 @@
 #include <utility>
 #include <string>
 
+#include "rocksdb/options.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
 #include "table/table_reader.h"
+#include "table/table_properties_internal.h"
 #include "util/coding.h"
 
 namespace rocksdb {
 
 class Block;
+class BlockIter;
 class BlockHandle;
 class Cache;
 class FilterBlockReader;
+class BlockBasedFilterBlockReader;
+class FullFilterBlockReader;
 class Footer;
 class InternalKeyComparator;
 class Iterator;
@@ -35,8 +40,8 @@ class TableReader;
 class WritableFile;
 struct BlockBasedTableOptions;
 struct EnvOptions;
-struct Options;
 struct ReadOptions;
+class GetContext;
 
 using std::unique_ptr;
 
@@ -46,6 +51,7 @@ using std::unique_ptr;
 class BlockBasedTable : public TableReader {
  public:
   static const std::string kFilterBlockPrefix;
+  static const std::string kFullFilterBlockPrefix;
 
   // Attempt to open the table that is stored in bytes [0..file_size)
   // of "file", and read the metadata entries necessary to allow
@@ -57,26 +63,30 @@ class BlockBasedTable : public TableReader {
   // to nullptr and returns a non-ok status.
   //
   // *file must remain live while this Table is in use.
-  static Status Open(const Options& db_options, const EnvOptions& env_options,
+  // *prefetch_blocks can be used to disable prefetching of index and filter
+  //  blocks at statup
+  static Status Open(const ImmutableCFOptions& ioptions,
+                     const EnvOptions& env_options,
                      const BlockBasedTableOptions& table_options,
                      const InternalKeyComparator& internal_key_comparator,
                      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                     unique_ptr<TableReader>* table_reader);
+                     unique_ptr<TableReader>* table_reader,
+                     bool prefetch_index_and_filter = true);
 
   bool PrefixMayMatch(const Slice& internal_key);
 
   // Returns a new iterator over the table contents.
   // The result of NewIterator() is initially invalid (caller must
   // call one of the Seek methods on the iterator before using it).
-  Iterator* NewIterator(const ReadOptions&) override;
+  Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override;
 
   Status Get(const ReadOptions& readOptions, const Slice& key,
-             void* handle_context,
-             bool (*result_handler)(void* handle_context,
-                                    const ParsedInternalKey& k, const Slice& v,
-                                    bool didIO),
-             void (*mark_key_may_exist_handler)(void* handle_context) =
-                 nullptr) override;
+             GetContext* get_context) override;
+
+  // Pre-fetch the disk blocks that correspond to the key range specified by
+  // (kbegin, kend). The call will return return error status in the event of
+  // IO or iteration error.
+  Status Prefetch(const Slice* begin, const Slice* end) override;
 
   // Given a key, return an approximate byte offset in the file where
   // the data for that key begins (or would begin if the key were
@@ -87,7 +97,7 @@ class BlockBasedTable : public TableReader {
   uint64_t ApproximateOffsetOf(const Slice& key) override;
 
   // Returns true if the block for the specified key is in cache.
-  // REQUIRES: key is in this table.
+  // REQUIRES: key is in this table && block cache enabled
   bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
 
   // Set up the table for Compaction. Might change some parameters with
@@ -96,6 +106,11 @@ class BlockBasedTable : public TableReader {
 
   std::shared_ptr<const TableProperties> GetTableProperties() const override;
 
+  size_t ApproximateMemoryUsage() const override;
+
+  // convert SST file to a human readable form
+  Status DumpTable(WritableFile* out_file) override;
+
   ~BlockBasedTable();
 
   bool TEST_filter_block_preloaded() const;
@@ -112,8 +127,10 @@ class BlockBasedTable : public TableReader {
   bool compaction_optimized_;
 
   class BlockEntryIteratorState;
+  // input_iter: if it is not null, update this one and return it as Iterator
   static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
-      bool* didIO, const Slice& index_value);
+                                        const Slice& index_value,
+                                        BlockIter* input_iter = nullptr);
 
   // For the following two functions:
   // if `no_io == true`, we will not try to read filter/index from sst file
@@ -121,6 +138,8 @@ class BlockBasedTable : public TableReader {
   CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
 
   // Get the iterator from the index reader.
+  // If input_iter is not set, return new Iterator
+  // If input_iter is set, update it and return it as Iterator
   //
   // Note: ErrorIterator with Status::Incomplete shall be returned if all the
   // following conditions are met:
@@ -128,7 +147,8 @@ class BlockBasedTable : public TableReader {
   //  2. index is not present in block cache.
   //  3. We disallowed any io to be performed, that is, read_options ==
   //     kBlockCacheTier
-  Iterator* NewIndexIterator(const ReadOptions& read_options);
+  Iterator* NewIndexIterator(const ReadOptions& read_options,
+                             BlockIter* input_iter = nullptr);
 
   // Read block cache from block caches (if set): block_cache and
   // block_cache_compressed.
@@ -138,7 +158,7 @@ class BlockBasedTable : public TableReader {
       const Slice& block_cache_key, const Slice& compressed_block_cache_key,
       Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
       const ReadOptions& read_options,
-      BlockBasedTable::CachableEntry<Block>* block);
+      BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version);
   // Put a raw block (maybe compressed) to the corresponding block caches.
   // This method will perform decompression against raw_block if needed and then
   // populate the block caches.
@@ -151,7 +171,7 @@ class BlockBasedTable : public TableReader {
       const Slice& block_cache_key, const Slice& compressed_block_cache_key,
       Cache* block_cache, Cache* block_cache_compressed,
       const ReadOptions& read_options, Statistics* statistics,
-      CachableEntry<Block>* block, Block* raw_block);
+      CachableEntry<Block>* block, Block* raw_block, uint32_t format_version);
 
   // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
   // after a call to Seek(key), until handle_result returns false.
@@ -160,8 +180,16 @@ class BlockBasedTable : public TableReader {
   friend class BlockBasedTableBuilder;
 
   void ReadMeta(const Footer& footer);
-  void ReadFilter(const Slice& filter_handle_value);
-  Status CreateIndexReader(IndexReader** index_reader);
+
+  // Create a index reader based on the index type stored in the table.
+  // Optionally, user can pass a preloaded meta_index_iter for the index that
+  // need to access extra meta blocks for index construction. This parameter
+  // helps avoid re-reading meta index block if caller already created one.
+  Status CreateIndexReader(IndexReader** index_reader,
+                           Iterator* preloaded_meta_index_iter = nullptr);
+
+  bool FullFilterKeyMayMatch(FilterBlockReader* filter,
+                             const Slice& user_key) const;
 
   // Read the meta block from sst.
   static Status ReadMetaBlock(
@@ -171,8 +199,8 @@ class BlockBasedTable : public TableReader {
 
   // Create the filter from the filter block.
   static FilterBlockReader* ReadFilter(
-      const Slice& filter_handle_value,
       Rep* rep,
+      Iterator* meta_index_iter,
       size_t* filter_size = nullptr);
 
   static void SetupCacheKeyPrefix(Rep* rep);
@@ -190,6 +218,10 @@ class BlockBasedTable : public TableReader {
   // For Posix files the unique ID is three varints.
   static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
 
+  // Helper functions for DumpTable()
+  Status DumpIndexBlock(WritableFile* out_file);
+  Status DumpDataBlocks(WritableFile* out_file);
+
   // No copying allowed
   explicit BlockBasedTable(const TableReader&) = delete;
   void operator=(const TableReader&) = delete;
diff --git a/src/rocksdb/table/block_builder.cc b/src/rocksdb/table/block_builder.cc
index f812dba..1eee96d 100644
--- a/src/rocksdb/table/block_builder.cc
+++ b/src/rocksdb/table/block_builder.cc
@@ -41,10 +41,8 @@
 
 namespace rocksdb {
 
-BlockBuilder::BlockBuilder(int block_restart_interval,
-                           const Comparator* comparator)
+BlockBuilder::BlockBuilder(int block_restart_interval)
     : block_restart_interval_(block_restart_interval),
-      comparator_(comparator),
       restarts_(),
       counter_(0),
       finished_(false) {
@@ -52,9 +50,6 @@ BlockBuilder::BlockBuilder(int block_restart_interval,
   restarts_.push_back(0);       // First restart point is at offset 0
 }
 
-BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator)
-    : BlockBuilder(options.block_restart_interval, comparator) {}
-
 void BlockBuilder::Reset() {
   buffer_.clear();
   restarts_.clear();
@@ -90,7 +85,7 @@ Slice BlockBuilder::Finish() {
   for (size_t i = 0; i < restarts_.size(); i++) {
     PutFixed32(&buffer_, restarts_[i]);
   }
-  PutFixed32(&buffer_, restarts_.size());
+  PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
   finished_ = true;
   return Slice(buffer_);
 }
@@ -99,8 +94,6 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) {
   Slice last_key_piece(last_key_);
   assert(!finished_);
   assert(counter_ <= block_restart_interval_);
-  assert(buffer_.empty() // No values yet?
-         || comparator_->Compare(key, last_key_piece) > 0);
   size_t shared = 0;
   if (counter_ < block_restart_interval_) {
     // See how much sharing to do with previous string
@@ -110,15 +103,15 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) {
     }
   } else {
     // Restart compression
-    restarts_.push_back(buffer_.size());
+    restarts_.push_back(static_cast<uint32_t>(buffer_.size()));
     counter_ = 0;
   }
   const size_t non_shared = key.size() - shared;
 
   // Add "<shared><non_shared><value_size>" to buffer_
-  PutVarint32(&buffer_, shared);
-  PutVarint32(&buffer_, non_shared);
-  PutVarint32(&buffer_, value.size());
+  PutVarint32(&buffer_, static_cast<uint32_t>(shared));
+  PutVarint32(&buffer_, static_cast<uint32_t>(non_shared));
+  PutVarint32(&buffer_, static_cast<uint32_t>(value.size()));
 
   // Add string delta to buffer_ followed by value
   buffer_.append(key.data() + shared, non_shared);
diff --git a/src/rocksdb/table/block_builder.h b/src/rocksdb/table/block_builder.h
index ed2f290..c01a23b 100644
--- a/src/rocksdb/table/block_builder.h
+++ b/src/rocksdb/table/block_builder.h
@@ -15,13 +15,12 @@
 
 namespace rocksdb {
 
-struct Options;
-class Comparator;
-
 class BlockBuilder {
  public:
-  BlockBuilder(int block_builder, const Comparator* comparator);
-  explicit BlockBuilder(const Options& options, const Comparator* comparator);
+  BlockBuilder(const BlockBuilder&) = delete;
+  void operator=(const BlockBuilder&) = delete;
+
+  explicit BlockBuilder(int block_restart_interval);
 
   // Reset the contents as if the BlockBuilder was just constructed.
   void Reset();
@@ -49,17 +48,12 @@ class BlockBuilder {
 
  private:
   const int          block_restart_interval_;
-  const Comparator*  comparator_;
 
   std::string           buffer_;    // Destination buffer
   std::vector<uint32_t> restarts_;  // Restart points
   int                   counter_;   // Number of entries emitted since restart
   bool                  finished_;  // Has Finish() been called?
   std::string           last_key_;
-
-  // No copying allowed
-  BlockBuilder(const BlockBuilder&);
-  void operator=(const BlockBuilder&);
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block_hash_index.cc b/src/rocksdb/table/block_hash_index.cc
index 0c9674c..02ebcbc 100644
--- a/src/rocksdb/table/block_hash_index.cc
+++ b/src/rocksdb/table/block_hash_index.cc
@@ -3,22 +3,63 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 
+#include "table/block_hash_index.h"
+
 #include <algorithm>
 
-#include "table/block_hash_index.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/slice_transform.h"
+#include "util/coding.h"
 
 namespace rocksdb {
 
-BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
-                                     const uint32_t num_restarts,
-                                     const Comparator* comparator,
-                                     const SliceTransform* hash_key_extractor) {
+Status CreateBlockHashIndex(const SliceTransform* hash_key_extractor,
+                            const Slice& prefixes, const Slice& prefix_meta,
+                            BlockHashIndex** hash_index) {
+  uint64_t pos = 0;
+  auto meta_pos = prefix_meta;
+  Status s;
+  *hash_index = new BlockHashIndex(
+      hash_key_extractor,
+      false /* external module manages memory space for prefixes */);
+
+  while (!meta_pos.empty()) {
+    uint32_t prefix_size = 0;
+    uint32_t entry_index = 0;
+    uint32_t num_blocks = 0;
+    if (!GetVarint32(&meta_pos, &prefix_size) ||
+        !GetVarint32(&meta_pos, &entry_index) ||
+        !GetVarint32(&meta_pos, &num_blocks)) {
+      s = Status::Corruption(
+          "Corrupted prefix meta block: unable to read from it.");
+      break;
+    }
+    Slice prefix(prefixes.data() + pos, prefix_size);
+    (*hash_index)->Add(prefix, entry_index, num_blocks);
+
+    pos += prefix_size;
+  }
+
+  if (s.ok() && pos != prefixes.size()) {
+    s = Status::Corruption("Corrupted prefix meta block");
+  }
+
+  if (!s.ok()) {
+    delete *hash_index;
+  }
+
+  return s;
+}
+
+BlockHashIndex* CreateBlockHashIndexOnTheFly(
+    Iterator* index_iter, Iterator* data_iter, const uint32_t num_restarts,
+    const Comparator* comparator, const SliceTransform* hash_key_extractor) {
   assert(hash_key_extractor);
-  auto hash_index = new BlockHashIndex(hash_key_extractor);
-  uint64_t current_restart_index = 0;
+  auto hash_index = new BlockHashIndex(
+      hash_key_extractor,
+      true /* hash_index will copy prefix when Add() is called */);
+  uint32_t current_restart_index = 0;
 
   std::string pending_entry_prefix;
   // pending_block_num == 0 also implies there is no entry inserted at all.
@@ -57,7 +98,7 @@ BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
         pending_entry_index = current_restart_index;
       } else {
         // entry number increments when keys share the prefix reside in
-        // differnt data blocks.
+        // different data blocks.
         auto last_restart_index = pending_entry_index + pending_block_num - 1;
         assert(last_restart_index <= current_restart_index);
         if (last_restart_index != current_restart_index) {
@@ -88,12 +129,16 @@ BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
 
 bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index,
                          uint32_t num_blocks) {
-  auto prefix_ptr = arena_.Allocate(prefix.size());
-  std::copy(prefix.data() /* begin */, prefix.data() + prefix.size() /* end */,
-            prefix_ptr /* destination */);
-  auto result =
-      restart_indices_.insert({Slice(prefix_ptr, prefix.size()),
-                               RestartIndex(restart_index, num_blocks)});
+  auto prefix_to_insert = prefix;
+  if (kOwnPrefixes) {
+    auto prefix_ptr = arena_.Allocate(prefix.size());
+    std::copy(prefix.data() /* begin */,
+              prefix.data() + prefix.size() /* end */,
+              prefix_ptr /* destination */);
+    prefix_to_insert = Slice(prefix_ptr, prefix.size());
+  }
+  auto result = restart_indices_.insert(
+      {prefix_to_insert, RestartIndex(restart_index, num_blocks)});
   return result.second;
 }
 
diff --git a/src/rocksdb/table/block_hash_index.h b/src/rocksdb/table/block_hash_index.h
index 0ff65b4..5829107 100644
--- a/src/rocksdb/table/block_hash_index.h
+++ b/src/rocksdb/table/block_hash_index.h
@@ -7,6 +7,7 @@
 #include <string>
 #include <unordered_map>
 
+#include "rocksdb/status.h"
 #include "util/arena.h"
 #include "util/murmurhash.h"
 
@@ -24,8 +25,8 @@ class BlockHashIndex {
  public:
   // Represents a restart index in the index block's restart array.
   struct RestartIndex {
-    explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1)
-        : first_index(first_index), num_blocks(num_blocks) {}
+    explicit RestartIndex(uint32_t _first_index, uint32_t _num_blocks = 1)
+        : first_index(_first_index), num_blocks(_num_blocks) {}
 
     // For a given prefix, what is the restart index for the first data block
     // that contains it.
@@ -35,8 +36,12 @@ class BlockHashIndex {
     uint32_t num_blocks = 1;
   };
 
-  explicit BlockHashIndex(const SliceTransform* hash_key_extractor)
-      : hash_key_extractor_(hash_key_extractor) {}
+  // @params own_prefixes indicate if we should take care the memory space for
+  // the `key_prefix`
+  // passed by Add()
+  explicit BlockHashIndex(const SliceTransform* hash_key_extractor,
+                          bool own_prefixes)
+      : hash_key_extractor_(hash_key_extractor), kOwnPrefixes(own_prefixes) {}
 
   // Maps a key to its restart first_index.
   // Returns nullptr if the restart first_index is found
@@ -52,9 +57,18 @@ class BlockHashIndex {
  private:
   const SliceTransform* hash_key_extractor_;
   std::unordered_map<Slice, RestartIndex, murmur_hash> restart_indices_;
+
   Arena arena_;
+  bool kOwnPrefixes;
 };
 
+// Create hash index by reading from the metadata blocks.
+// @params prefixes: a sequence of prefixes.
+// @params prefix_meta: contains the "metadata" to of the prefixes.
+Status CreateBlockHashIndex(const SliceTransform* hash_key_extractor,
+                            const Slice& prefixes, const Slice& prefix_meta,
+                            BlockHashIndex** hash_index);
+
 // Create hash index by scanning the entries in index as well as the whole
 // dataset.
 // @params index_iter: an iterator with the pointer to the first entry in a
@@ -64,9 +78,8 @@ class BlockHashIndex {
 // @params num_restarts: used for correctness verification.
 // @params hash_key_extractor: extract the hashable part of a given key.
 // On error, nullptr will be returned.
-BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter,
-                                     const uint32_t num_restarts,
-                                     const Comparator* comparator,
-                                     const SliceTransform* hash_key_extractor);
+BlockHashIndex* CreateBlockHashIndexOnTheFly(
+    Iterator* index_iter, Iterator* data_iter, const uint32_t num_restarts,
+    const Comparator* comparator, const SliceTransform* hash_key_extractor);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/block_hash_index_test.cc b/src/rocksdb/table/block_hash_index_test.cc
index f4c0ac4..b001c20 100644
--- a/src/rocksdb/table/block_hash_index_test.cc
+++ b/src/rocksdb/table/block_hash_index_test.cc
@@ -22,37 +22,37 @@ class MapIterator : public Iterator {
  public:
   explicit MapIterator(const Data& data) : data_(data), pos_(data_.end()) {}
 
-  virtual bool Valid() const { return pos_ != data_.end(); }
+  virtual bool Valid() const override { return pos_ != data_.end(); }
 
-  virtual void SeekToFirst() { pos_ = data_.begin(); }
+  virtual void SeekToFirst() override { pos_ = data_.begin(); }
 
-  virtual void SeekToLast() {
+  virtual void SeekToLast() override {
     pos_ = data_.end();
     --pos_;
   }
 
-  virtual void Seek(const Slice& target) {
+  virtual void Seek(const Slice& target) override {
     pos_ = data_.find(target.ToString());
   }
 
-  virtual void Next() { ++pos_; }
+  virtual void Next() override { ++pos_; }
 
-  virtual void Prev() { --pos_; }
+  virtual void Prev() override { --pos_; }
 
-  virtual Slice key() const { return pos_->first; }
+  virtual Slice key() const override { return pos_->first; }
 
-  virtual Slice value() const { return pos_->second; }
+  virtual Slice value() const override { return pos_->second; }
 
-  virtual Status status() const { return Status::OK(); }
+  virtual Status status() const override { return Status::OK(); }
 
  private:
   const Data& data_;
   Data::const_iterator pos_;
 };
 
-class BlockTest {};
+class BlockTest : public testing::Test {};
 
-TEST(BlockTest, BasicTest) {
+TEST_F(BlockTest, BasicTest) {
   const size_t keys_per_block = 4;
   const size_t prefix_size = 2;
   std::vector<std::string> keys = {/* block 1 */
@@ -81,9 +81,9 @@ TEST(BlockTest, BasicTest) {
   MapIterator index_iter(index_entries);
 
   auto prefix_extractor = NewFixedPrefixTransform(prefix_size);
-  std::unique_ptr<BlockHashIndex> block_hash_index(
-      CreateBlockHashIndex(&index_iter, &data_iter, index_entries.size(),
-                           BytewiseComparator(), prefix_extractor));
+  std::unique_ptr<BlockHashIndex> block_hash_index(CreateBlockHashIndexOnTheFly(
+      &index_iter, &data_iter, static_cast<uint32_t>(index_entries.size()),
+      BytewiseComparator(), prefix_extractor));
 
   std::map<std::string, BlockHashIndex::RestartIndex> expected = {
       {"01xx", BlockHashIndex::RestartIndex(0, 1)},
@@ -114,4 +114,7 @@ TEST(BlockTest, BasicTest) {
 
 }  // namespace rocksdb
 
-int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_prefix_index.cc b/src/rocksdb/table/block_prefix_index.cc
new file mode 100644
index 0000000..147bcf5
--- /dev/null
+++ b/src/rocksdb/table/block_prefix_index.cc
@@ -0,0 +1,236 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/block_prefix_index.h"
+
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/arena.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+inline uint32_t Hash(const Slice& s) {
+  return rocksdb::Hash(s.data(), s.size(), 0);
+}
+
+inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
+  return Hash(prefix) % num_buckets;
+}
+
+// The prefix block index is simply a bucket array, with each entry pointing to
+// the blocks that span the prefixes hashed to this bucket.
+//
+// To reduce memory footprint, if there is only one block per bucket, the entry
+// stores the block id directly. If there are more than one blocks per bucket,
+// because of hash collision or a single prefix spanning multiple blocks,
+// the entry points to an array of block ids. The block array is an array of
+// uint32_t's. The first uint32_t indicates the total number of blocks, followed
+// by the block ids.
+//
+// To differentiate the two cases, the high order bit of the entry indicates
+// whether it is a 'pointer' into a separate block array.
+// 0x7FFFFFFF is reserved for empty bucket.
+
+const uint32_t kNoneBlock = 0x7FFFFFFF;
+const uint32_t kBlockArrayMask = 0x80000000;
+
+inline bool IsNone(uint32_t block_id) {
+  return block_id == kNoneBlock;
+}
+
+inline bool IsBlockId(uint32_t block_id) {
+  return (block_id & kBlockArrayMask) == 0;
+}
+
+inline uint32_t DecodeIndex(uint32_t block_id) {
+  uint32_t index = block_id ^ kBlockArrayMask;
+  assert(index < kBlockArrayMask);
+  return index;
+}
+
+inline uint32_t EncodeIndex(uint32_t index) {
+  assert(index < kBlockArrayMask);
+  return index | kBlockArrayMask;
+}
+
+// temporary storage for prefix information during index building
+struct PrefixRecord {
+  Slice prefix;
+  uint32_t start_block;
+  uint32_t end_block;
+  uint32_t num_blocks;
+  PrefixRecord* next;
+};
+
+class BlockPrefixIndex::Builder {
+ public:
+  explicit Builder(const SliceTransform* internal_prefix_extractor)
+      : internal_prefix_extractor_(internal_prefix_extractor) {}
+
+  void Add(const Slice& key_prefix, uint32_t start_block,
+           uint32_t num_blocks) {
+    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
+      arena_.AllocateAligned(sizeof(PrefixRecord)));
+    record->prefix = key_prefix;
+    record->start_block = start_block;
+    record->end_block = start_block + num_blocks - 1;
+    record->num_blocks = num_blocks;
+    prefixes_.push_back(record);
+  }
+
+  BlockPrefixIndex* Finish() {
+    // For now, use roughly 1:1 prefix to bucket ratio.
+    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
+
+    // Collect prefix records that hash to the same bucket, into a single
+    // linklist.
+    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
+    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
+    for (PrefixRecord* current : prefixes_) {
+      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
+      // merge the prefix block span if the first block of this prefix is
+      // connected to the last block of the previous prefix.
+      PrefixRecord* prev = prefixes_per_bucket[bucket];
+      if (prev) {
+        assert(current->start_block >= prev->end_block);
+        auto distance = current->start_block - prev->end_block;
+        if (distance <= 1) {
+          prev->end_block = current->end_block;
+          prev->num_blocks = prev->end_block - prev->start_block + 1;
+          num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
+          continue;
+        }
+      }
+      current->next = prev;
+      prefixes_per_bucket[bucket] = current;
+      num_blocks_per_bucket[bucket] += current->num_blocks;
+    }
+
+    // Calculate the block array buffer size
+    uint32_t total_block_array_entries = 0;
+    for (uint32_t i = 0; i < num_buckets; i++) {
+      uint32_t num_blocks = num_blocks_per_bucket[i];
+      if (num_blocks > 1) {
+        total_block_array_entries += (num_blocks + 1);
+      }
+    }
+
+    // Populate the final prefix block index
+    uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
+    uint32_t* buckets = new uint32_t[num_buckets];
+    uint32_t offset = 0;
+    for (uint32_t i = 0; i < num_buckets; i++) {
+      uint32_t num_blocks = num_blocks_per_bucket[i];
+      if (num_blocks == 0) {
+        assert(prefixes_per_bucket[i] == nullptr);
+        buckets[i] = kNoneBlock;
+      } else if (num_blocks == 1) {
+        assert(prefixes_per_bucket[i] != nullptr);
+        assert(prefixes_per_bucket[i]->next == nullptr);
+        buckets[i] = prefixes_per_bucket[i]->start_block;
+      } else {
+        assert(prefixes_per_bucket[i] != nullptr);
+        buckets[i] = EncodeIndex(offset);
+        block_array_buffer[offset] = num_blocks;
+        uint32_t* last_block = &block_array_buffer[offset + num_blocks];
+        auto current = prefixes_per_bucket[i];
+        // populate block ids from largest to smallest
+        while (current != nullptr) {
+          for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
+            *last_block = current->end_block - iter;
+            last_block--;
+          }
+          current = current->next;
+        }
+        assert(last_block == &block_array_buffer[offset]);
+        offset += (num_blocks + 1);
+      }
+    }
+
+    assert(offset == total_block_array_entries);
+
+    return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
+                                buckets, total_block_array_entries,
+                                block_array_buffer);
+  }
+
+ private:
+  const SliceTransform* internal_prefix_extractor_;
+
+  std::vector<PrefixRecord*> prefixes_;
+  Arena arena_;
+};
+
+
+Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
+                                const Slice& prefixes, const Slice& prefix_meta,
+                                BlockPrefixIndex** prefix_index) {
+  uint64_t pos = 0;
+  auto meta_pos = prefix_meta;
+  Status s;
+  Builder builder(internal_prefix_extractor);
+
+  while (!meta_pos.empty()) {
+    uint32_t prefix_size = 0;
+    uint32_t entry_index = 0;
+    uint32_t num_blocks = 0;
+    if (!GetVarint32(&meta_pos, &prefix_size) ||
+        !GetVarint32(&meta_pos, &entry_index) ||
+        !GetVarint32(&meta_pos, &num_blocks)) {
+      s = Status::Corruption(
+          "Corrupted prefix meta block: unable to read from it.");
+      break;
+    }
+    if (pos + prefix_size > prefixes.size()) {
+      s = Status::Corruption(
+        "Corrupted prefix meta block: size inconsistency.");
+      break;
+    }
+    Slice prefix(prefixes.data() + pos, prefix_size);
+    builder.Add(prefix, entry_index, num_blocks);
+
+    pos += prefix_size;
+  }
+
+  if (s.ok() && pos != prefixes.size()) {
+    s = Status::Corruption("Corrupted prefix meta block");
+  }
+
+  if (s.ok()) {
+    *prefix_index = builder.Finish();
+  }
+
+  return s;
+}
+
+uint32_t BlockPrefixIndex::GetBlocks(const Slice& key,
+                                     uint32_t** blocks) {
+  Slice prefix = internal_prefix_extractor_->Transform(key);
+
+  uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
+  uint32_t block_id = buckets_[bucket];
+
+  if (IsNone(block_id)) {
+    return 0;
+  } else if (IsBlockId(block_id)) {
+    *blocks = &buckets_[bucket];
+    return 1;
+  } else {
+    uint32_t index = DecodeIndex(block_id);
+    assert(index < num_block_array_buffer_entries_);
+    *blocks = &block_array_buffer_[index+1];
+    uint32_t num_blocks = block_array_buffer_[index];
+    assert(num_blocks > 1);
+    assert(index + num_blocks < num_block_array_buffer_entries_);
+    return num_blocks;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/block_prefix_index.h b/src/rocksdb/table/block_prefix_index.h
new file mode 100644
index 0000000..662bc09
--- /dev/null
+++ b/src/rocksdb/table/block_prefix_index.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockHashIndex accepts a key and, if found, returns its restart index within
+// that index block.
+class BlockPrefixIndex {
+ public:
+
+  // Maps a key to a list of data blocks that could potentially contain
+  // the key, based on the prefix.
+  // Returns the total number of relevant blocks, 0 means the key does
+  // not exist.
+  uint32_t GetBlocks(const Slice& key, uint32_t** blocks);
+
+  size_t ApproximateMemoryUsage() const {
+    return sizeof(BlockPrefixIndex) +
+      (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t);
+  }
+
+  // Create hash index by reading from the metadata blocks.
+  // @params prefixes: a sequence of prefixes.
+  // @params prefix_meta: contains the "metadata" to of the prefixes.
+  static Status Create(const SliceTransform* hash_key_extractor,
+                       const Slice& prefixes, const Slice& prefix_meta,
+                       BlockPrefixIndex** prefix_index);
+
+  ~BlockPrefixIndex() {
+    delete[] buckets_;
+    delete[] block_array_buffer_;
+  }
+
+ private:
+  class Builder;
+  friend Builder;
+
+  BlockPrefixIndex(const SliceTransform* internal_prefix_extractor,
+                   uint32_t num_buckets,
+                   uint32_t* buckets,
+                   uint32_t num_block_array_buffer_entries,
+                   uint32_t* block_array_buffer)
+      : internal_prefix_extractor_(internal_prefix_extractor),
+        num_buckets_(num_buckets),
+        num_block_array_buffer_entries_(num_block_array_buffer_entries),
+        buckets_(buckets),
+        block_array_buffer_(block_array_buffer) {}
+
+  const SliceTransform* internal_prefix_extractor_;
+  uint32_t num_buckets_;
+  uint32_t num_block_array_buffer_entries_;
+  uint32_t* buckets_;
+  uint32_t* block_array_buffer_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/block_test.cc b/src/rocksdb/table/block_test.cc
index fdba8e9..c86f38d 100644
--- a/src/rocksdb/table/block_test.cc
+++ b/src/rocksdb/table/block_test.cc
@@ -65,10 +65,10 @@ void GenerateRandomKVs(std::vector<std::string> *keys,
   }
 }
 
-class BlockTest {};
+class BlockTest : public testing::Test {};
 
 // block test
-TEST(BlockTest, SimpleTest) {
+TEST_F(BlockTest, SimpleTest) {
   Random rnd(301);
   Options options = Options();
   std::unique_ptr<InternalKeyComparator> ic;
@@ -76,7 +76,7 @@ TEST(BlockTest, SimpleTest) {
 
   std::vector<std::string> keys;
   std::vector<std::string> values;
-  BlockBuilder builder(options, ic.get());
+  BlockBuilder builder(16);
   int num_records = 100000;
 
   GenerateRandomKVs(&keys, &values, 0, num_records);
@@ -92,8 +92,7 @@ TEST(BlockTest, SimpleTest) {
   BlockContents contents;
   contents.data = rawblock;
   contents.cachable = false;
-  contents.heap_allocated = false;
-  Block reader(contents);
+  Block reader(std::move(contents));
 
   // read contents of block sequentially
   int count = 0;
@@ -132,8 +131,7 @@ BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
                                const std::vector<std::string> &keys,
                                const std::vector<std::string> &values,
                                const int prefix_group_size = 1) {
-  builder->reset(
-      new BlockBuilder(1 /* restart interval */, BytewiseComparator()));
+  builder->reset(new BlockBuilder(1 /* restart interval */));
 
   // Add only half of the keys
   for (size_t i = 0; i < keys.size(); ++i) {
@@ -144,7 +142,6 @@ BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
   BlockContents contents;
   contents.data = rawblock;
   contents.cachable = false;
-  contents.heap_allocated = false;
 
   return contents;
 }
@@ -154,8 +151,10 @@ void CheckBlockContents(BlockContents contents, const int max_key,
                         const std::vector<std::string> &values) {
   const size_t prefix_size = 6;
   // create block reader
-  Block reader1(contents);
-  Block reader2(contents);
+  BlockContents contents_ref(contents.data, contents.cachable,
+                             contents.compression_type);
+  Block reader1(std::move(contents));
+  Block reader2(std::move(contents_ref));
 
   std::unique_ptr<const SliceTransform> prefix_extractor(
       NewFixedPrefixTransform(prefix_size));
@@ -163,16 +162,16 @@ void CheckBlockContents(BlockContents contents, const int max_key,
   {
     auto iter1 = reader1.NewIterator(nullptr);
     auto iter2 = reader1.NewIterator(nullptr);
-    reader1.SetBlockHashIndex(CreateBlockHashIndex(iter1, iter2, keys.size(),
-                                                   BytewiseComparator(),
-                                                   prefix_extractor.get()));
+    reader1.SetBlockHashIndex(CreateBlockHashIndexOnTheFly(
+        iter1, iter2, static_cast<uint32_t>(keys.size()), BytewiseComparator(),
+        prefix_extractor.get()));
 
     delete iter1;
     delete iter2;
   }
 
   std::unique_ptr<Iterator> hash_iter(
-      reader1.NewIterator(BytewiseComparator()));
+      reader1.NewIterator(BytewiseComparator(), nullptr, false));
 
   std::unique_ptr<Iterator> regular_iter(
       reader2.NewIterator(BytewiseComparator()));
@@ -202,7 +201,7 @@ void CheckBlockContents(BlockContents contents, const int max_key,
 }
 
 // In this test case, no two key share same prefix.
-TEST(BlockTest, SimpleIndexHash) {
+TEST_F(BlockTest, SimpleIndexHash) {
   const int kMaxKey = 100000;
   std::vector<std::string> keys;
   std::vector<std::string> values;
@@ -213,10 +212,10 @@ TEST(BlockTest, SimpleIndexHash) {
   std::unique_ptr<BlockBuilder> builder;
   auto contents = GetBlockContents(&builder, keys, values);
 
-  CheckBlockContents(contents, kMaxKey, keys, values);
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
 }
 
-TEST(BlockTest, IndexHashWithSharedPrefix) {
+TEST_F(BlockTest, IndexHashWithSharedPrefix) {
   const int kMaxKey = 100000;
   // for each prefix, there will be 5 keys starts with it.
   const int kPrefixGroup = 5;
@@ -232,11 +231,12 @@ TEST(BlockTest, IndexHashWithSharedPrefix) {
   std::unique_ptr<BlockBuilder> builder;
   auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
 
-  CheckBlockContents(contents, kMaxKey, keys, values);
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
 }
 
 }  // namespace rocksdb
 
-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/table/bloom_block.cc b/src/rocksdb/table/bloom_block.cc
new file mode 100644
index 0000000..cfea8a2
--- /dev/null
+++ b/src/rocksdb/table/bloom_block.cc
@@ -0,0 +1,23 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/bloom_block.h"
+
+#include <string>
+#include "rocksdb/slice.h"
+#include "util/dynamic_bloom.h"
+
+namespace rocksdb {
+
+void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t>& keys_hashes) {
+  for (auto hash : keys_hashes) {
+    bloom_.AddHash(hash);
+  }
+}
+
+Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
+
+const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/bloom_block.h b/src/rocksdb/table/bloom_block.h
new file mode 100644
index 0000000..5b60d2b
--- /dev/null
+++ b/src/rocksdb/table/bloom_block.h
@@ -0,0 +1,38 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <vector>
+#include <string>
+#include "util/dynamic_bloom.h"
+
+namespace rocksdb {
+class Logger;
+
+class BloomBlockBuilder {
+ public:
+  static const std::string kBloomBlock;
+
+  explicit BloomBlockBuilder(uint32_t num_probes = 6)
+      : bloom_(num_probes, nullptr) {}
+
+  void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                    uint32_t locality, size_t huge_page_tlb_size,
+                    Logger* logger) {
+    bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size,
+                        logger);
+  }
+
+  uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }
+
+  void AddKeysHashes(const std::vector<uint32_t>& keys_hashes);
+
+  Slice Finish();
+
+ private:
+  DynamicBloom bloom_;
+};
+
+};  // namespace rocksdb
diff --git a/src/rocksdb/table/cuckoo_table_builder.cc b/src/rocksdb/table/cuckoo_table_builder.cc
new file mode 100644
index 0000000..1aa1e07
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_builder.cc
@@ -0,0 +1,511 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo_table_builder.h"
+
+#include <assert.h>
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "table/block_builder.h"
+#include "table/cuckoo_table_factory.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/autovector.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+const std::string CuckooTablePropertyNames::kEmptyKey =
+      "rocksdb.cuckoo.bucket.empty.key";
+const std::string CuckooTablePropertyNames::kNumHashFunc =
+      "rocksdb.cuckoo.hash.num";
+const std::string CuckooTablePropertyNames::kHashTableSize =
+      "rocksdb.cuckoo.hash.size";
+const std::string CuckooTablePropertyNames::kValueLength =
+      "rocksdb.cuckoo.value.length";
+const std::string CuckooTablePropertyNames::kIsLastLevel =
+      "rocksdb.cuckoo.file.islastlevel";
+const std::string CuckooTablePropertyNames::kCuckooBlockSize =
+      "rocksdb.cuckoo.hash.cuckooblocksize";
+const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
+      "rocksdb.cuckoo.hash.identityfirst";
+const std::string CuckooTablePropertyNames::kUseModuleHash =
+      "rocksdb.cuckoo.hash.usemodule";
+const std::string CuckooTablePropertyNames::kUserKeyLength =
+      "rocksdb.cuckoo.hash.userkeylength";
+
+// Obtained by running echo rocksdb.table.cuckoo | sha1sum
+extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
+
+CuckooTableBuilder::CuckooTableBuilder(
+    WritableFile* file, double max_hash_table_ratio,
+    uint32_t max_num_hash_table, uint32_t max_search_depth,
+    const Comparator* user_comparator, uint32_t cuckoo_block_size,
+    bool use_module_hash, bool identity_as_first_hash,
+    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
+    : num_hash_func_(2),
+      file_(file),
+      max_hash_table_ratio_(max_hash_table_ratio),
+      max_num_hash_func_(max_num_hash_table),
+      max_search_depth_(max_search_depth),
+      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
+      hash_table_size_(use_module_hash ? 0 : 2),
+      is_last_level_file_(false),
+      has_seen_first_key_(false),
+      has_seen_first_value_(false),
+      key_size_(0),
+      value_size_(0),
+      num_entries_(0),
+      num_values_(0),
+      ucomp_(user_comparator),
+      use_module_hash_(use_module_hash),
+      identity_as_first_hash_(identity_as_first_hash),
+      get_slice_hash_(get_slice_hash),
+      closed_(false) {
+  // Data is in a huge block.
+  properties_.num_data_blocks = 1;
+  properties_.index_size = 0;
+  properties_.filter_size = 0;
+}
+
+void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
+  if (num_entries_ >= kMaxVectorIdx - 1) {
+    status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
+    return;
+  }
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    status_ = Status::Corruption("Unable to parse key into inernal key.");
+    return;
+  }
+  if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
+    status_ = Status::NotSupported("Unsupported key type " +
+                                   ToString(ikey.type));
+    return;
+  }
+
+  // Determine if we can ignore the sequence number and value type from
+  // internal keys by looking at sequence number from first key. We assume
+  // that if first key has a zero sequence number, then all the remaining
+  // keys will have zero seq. no.
+  if (!has_seen_first_key_) {
+    is_last_level_file_ = ikey.sequence == 0;
+    has_seen_first_key_ = true;
+    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+    key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
+  }
+  if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
+    status_ = Status::NotSupported("all keys have to be the same size");
+    return;
+  }
+  // Even if one sequence number is non-zero, then it is not last level.
+  assert(!is_last_level_file_ || ikey.sequence == 0);
+
+  if (ikey.type == kTypeValue) {
+    if (!has_seen_first_value_) {
+      has_seen_first_value_ = true;
+      value_size_ = value.size();
+    }
+    if (value_size_ != value.size()) {
+      status_ = Status::NotSupported("all values have to be the same size");
+      return;
+    }
+
+    if (is_last_level_file_) {
+      kvs_.append(ikey.user_key.data(), ikey.user_key.size());
+    } else {
+      kvs_.append(key.data(), key.size());
+    }
+    kvs_.append(value.data(), value.size());
+    ++num_values_;
+  } else {
+    if (is_last_level_file_) {
+      deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
+    } else {
+      deleted_keys_.append(key.data(), key.size());
+    }
+  }
+  ++num_entries_;
+
+  // In order to fill the empty buckets in the hash table, we identify a
+  // key which is not used so far (unused_user_key). We determine this by
+  // maintaining smallest and largest keys inserted so far in bytewise order
+  // and use them to find a key outside this range in Finish() operation.
+  // Note that this strategy is independent of user comparator used here.
+  if (ikey.user_key.compare(smallest_user_key_) < 0) {
+    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+  } else if (ikey.user_key.compare(largest_user_key_) > 0) {
+    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+  }
+  if (!use_module_hash_) {
+    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
+      hash_table_size_ *= 2;
+    }
+  }
+}
+
+bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
+  assert(closed_);
+  return idx >= num_values_;
+}
+
+Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
+  assert(closed_);
+  if (IsDeletedKey(idx)) {
+    return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_);
+  }
+  return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_);
+}
+
+Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
+  assert(closed_);
+  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
+}
+
+Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
+  assert(closed_);
+  if (IsDeletedKey(idx)) {
+    static std::string empty_value(value_size_, 'a');
+    return Slice(empty_value);
+  }
+  return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_);
+}
+
+Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
+  buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1);
+  uint32_t make_space_for_key_call_id = 0;
+  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
+    uint64_t bucket_id;
+    bool bucket_found = false;
+    autovector<uint64_t> hash_vals;
+    Slice user_key = GetUserKey(vector_idx);
+    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
+        ++hash_cnt) {
+      uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_,
+          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+      // If there is a collision, check next cuckoo_block_size_ locations for
+      // empty locations. While checking, if we reach end of the hash table,
+      // stop searching and proceed for next hash function.
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+          ++block_idx, ++hash_val) {
+        if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) {
+          bucket_id = hash_val;
+          bucket_found = true;
+          break;
+        } else {
+          if (ucomp_->Compare(user_key,
+                GetUserKey((*buckets)[hash_val].vector_idx)) == 0) {
+            return Status::NotSupported("Same key is being inserted again.");
+          }
+          hash_vals.push_back(hash_val);
+        }
+      }
+    }
+    while (!bucket_found && !MakeSpaceForKey(hash_vals,
+          ++make_space_for_key_call_id, buckets, &bucket_id)) {
+      // Rehash by increasing the number of hash functions.
+      if (num_hash_func_ >= max_num_hash_func_) {
+        return Status::NotSupported("Too many collisions. Unable to hash.");
+      }
+      // We don't really need to rehash the entire table because old hashes are
+      // still valid and we only increased the number of hash functions.
+      uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
+          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+      ++num_hash_func_;
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+          ++block_idx, ++hash_val) {
+        if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) {
+          bucket_found = true;
+          bucket_id = hash_val;
+          break;
+        } else {
+          hash_vals.push_back(hash_val);
+        }
+      }
+    }
+    (*buckets)[bucket_id].vector_idx = vector_idx;
+  }
+  return Status::OK();
+}
+
+Status CuckooTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+  std::vector<CuckooBucket> buckets;
+  Status s;
+  std::string unused_bucket;
+  if (num_entries_ > 0) {
+    // Calculate the real hash size if module hash is enabled.
+    if (use_module_hash_) {
+      hash_table_size_ = num_entries_ / max_hash_table_ratio_;
+    }
+    s = MakeHashTable(&buckets);
+    if (!s.ok()) {
+      return s;
+    }
+    // Determine unused_user_key to fill empty buckets.
+    std::string unused_user_key = smallest_user_key_;
+    int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+    while (curr_pos >= 0) {
+      --unused_user_key[curr_pos];
+      if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
+        break;
+      }
+      --curr_pos;
+    }
+    if (curr_pos < 0) {
+      // Try using the largest key to identify an unused key.
+      unused_user_key = largest_user_key_;
+      curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+      while (curr_pos >= 0) {
+        ++unused_user_key[curr_pos];
+        if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
+          break;
+        }
+        --curr_pos;
+      }
+    }
+    if (curr_pos < 0) {
+      return Status::Corruption("Unable to find unused key");
+    }
+    if (is_last_level_file_) {
+      unused_bucket = unused_user_key;
+    } else {
+      ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
+      AppendInternalKey(&unused_bucket, ikey);
+    }
+  }
+  properties_.num_entries = num_entries_;
+  properties_.fixed_key_len = key_size_;
+  properties_.user_collected_properties[
+        CuckooTablePropertyNames::kValueLength].assign(
+        reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
+
+  uint64_t bucket_size = key_size_ + value_size_;
+  unused_bucket.resize(bucket_size, 'a');
+  // Write the table.
+  uint32_t num_added = 0;
+  for (auto& bucket : buckets) {
+    if (bucket.vector_idx == kMaxVectorIdx) {
+      s = file_->Append(Slice(unused_bucket));
+    } else {
+      ++num_added;
+      s = file_->Append(GetKey(bucket.vector_idx));
+      if (s.ok()) {
+        if (value_size_ > 0) {
+          s = file_->Append(GetValue(bucket.vector_idx));
+        }
+      }
+    }
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  assert(num_added == NumEntries());
+  properties_.raw_key_size = num_added * properties_.fixed_key_len;
+  properties_.raw_value_size = num_added * value_size_;
+
+  uint64_t offset = buckets.size() * bucket_size;
+  properties_.data_size = offset;
+  unused_bucket.resize(properties_.fixed_key_len);
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kNumHashFunc].assign(
+        reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));
+
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kHashTableSize].assign(
+        reinterpret_cast<const char*>(&hash_table_size_),
+        sizeof(hash_table_size_));
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kIsLastLevel].assign(
+        reinterpret_cast<const char*>(&is_last_level_file_),
+        sizeof(is_last_level_file_));
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kCuckooBlockSize].assign(
+        reinterpret_cast<const char*>(&cuckoo_block_size_),
+        sizeof(cuckoo_block_size_));
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kIdentityAsFirstHash].assign(
+        reinterpret_cast<const char*>(&identity_as_first_hash_),
+        sizeof(identity_as_first_hash_));
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kUseModuleHash].assign(
+        reinterpret_cast<const char*>(&use_module_hash_),
+        sizeof(use_module_hash_));
+  uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
+  properties_.user_collected_properties[
+    CuckooTablePropertyNames::kUserKeyLength].assign(
+        reinterpret_cast<const char*>(&user_key_len),
+        sizeof(user_key_len));
+
+  // Write meta blocks.
+  MetaIndexBuilder meta_index_builder;
+  PropertyBlockBuilder property_block_builder;
+
+  property_block_builder.AddTableProperty(properties_);
+  property_block_builder.Add(properties_.user_collected_properties);
+  Slice property_block = property_block_builder.Finish();
+  BlockHandle property_block_handle;
+  property_block_handle.set_offset(offset);
+  property_block_handle.set_size(property_block.size());
+  s = file_->Append(property_block);
+  offset += property_block.size();
+  if (!s.ok()) {
+    return s;
+  }
+
+  meta_index_builder.Add(kPropertiesBlock, property_block_handle);
+  Slice meta_index_block = meta_index_builder.Finish();
+
+  BlockHandle meta_index_block_handle;
+  meta_index_block_handle.set_offset(offset);
+  meta_index_block_handle.set_size(meta_index_block.size());
+  s = file_->Append(meta_index_block);
+  if (!s.ok()) {
+    return s;
+  }
+
+  Footer footer(kCuckooTableMagicNumber, 1);
+  footer.set_metaindex_handle(meta_index_block_handle);
+  footer.set_index_handle(BlockHandle::NullBlockHandle());
+  std::string footer_encoding;
+  footer.EncodeTo(&footer_encoding);
+  s = file_->Append(footer_encoding);
+  return s;
+}
+
+void CuckooTableBuilder::Abandon() {
+  assert(!closed_);
+  closed_ = true;
+}
+
+uint64_t CuckooTableBuilder::NumEntries() const {
+  return num_entries_;
+}
+
+uint64_t CuckooTableBuilder::FileSize() const {
+  if (closed_) {
+    return file_->GetFileSize();
+  } else if (num_entries_ == 0) {
+    return 0;
+  }
+
+  if (use_module_hash_) {
+    return (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_;
+  } else {
+    // Account for buckets being a power of two.
+    // As elements are added, file size remains constant for a while and
+    // doubles its size. Since compaction algorithm stops adding elements
+    // only after it exceeds the file limit, we account for the extra element
+    // being added here.
+    uint64_t expected_hash_table_size = hash_table_size_;
+    if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
+      expected_hash_table_size *= 2;
+    }
+    return (key_size_ + value_size_) * expected_hash_table_size - 1;
+  }
+}
+
+// This method is invoked when there is no place to insert the target key.
+// It searches for a set of elements that can be moved to accommodate target
+// key. The search is a BFS graph traversal with first level (hash_vals)
+// being all the buckets target key could go to.
+// Then, from each node (curr_node), we find all the buckets that curr_node
+// could go to. They form the children of curr_node in the tree.
+// We continue the traversal until we find an empty bucket, in which case, we
+// move all elements along the path from first level to this empty bucket, to
+// make space for target key which is inserted at first level (*bucket_id).
+// If tree depth exceeds max depth, we return false indicating failure.
+bool CuckooTableBuilder::MakeSpaceForKey(
+    const autovector<uint64_t>& hash_vals,
+    const uint32_t make_space_for_key_call_id,
+    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
+  struct CuckooNode {
+    uint64_t bucket_id;
+    uint32_t depth;
+    uint32_t parent_pos;
+    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
+        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
+  };
+  // This is BFS search tree that is stored simply as a vector.
+  // Each node stores the index of parent node in the vector.
+  std::vector<CuckooNode> tree;
+  // We want to identify already visited buckets in the current method call so
+  // that we don't add same buckets again for exploration in the tree.
+  // We do this by maintaining a count of current method call in
+  // make_space_for_key_call_id, which acts as a unique id for this invocation
+  // of the method. We store this number into the nodes that we explore in
+  // current method call.
+  // It is unlikely for the increment operation to overflow because the maximum
+  // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
+  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+    uint64_t bid = hash_vals[hash_cnt];
+    (*buckets)[bid].make_space_for_key_call_id = make_space_for_key_call_id;
+    tree.push_back(CuckooNode(bid, 0, 0));
+  }
+  bool null_found = false;
+  uint32_t curr_pos = 0;
+  while (!null_found && curr_pos < tree.size()) {
+    CuckooNode& curr_node = tree[curr_pos];
+    uint32_t curr_depth = curr_node.depth;
+    if (curr_depth >= max_search_depth_) {
+      break;
+    }
+    CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id];
+    for (uint32_t hash_cnt = 0;
+        hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
+      uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
+          hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
+          get_slice_hash_);
+      // Iterate inside Cuckoo Block.
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+          ++block_idx, ++child_bucket_id) {
+        if ((*buckets)[child_bucket_id].make_space_for_key_call_id ==
+            make_space_for_key_call_id) {
+          continue;
+        }
+        (*buckets)[child_bucket_id].make_space_for_key_call_id =
+          make_space_for_key_call_id;
+        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1,
+              curr_pos));
+        if ((*buckets)[child_bucket_id].vector_idx == kMaxVectorIdx) {
+          null_found = true;
+          break;
+        }
+      }
+    }
+    ++curr_pos;
+  }
+
+  if (null_found) {
+    // There is an empty node in tree.back(). Now, traverse the path from this
+    // empty node to top of the tree and at every node in the path, replace
+    // child with the parent. Stop when first level is reached in the tree
+    // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
+    // this location in first level for target key to be inserted.
+    uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
+    while (bucket_to_replace_pos >= num_hash_func_) {
+      CuckooNode& curr_node = tree[bucket_to_replace_pos];
+      (*buckets)[curr_node.bucket_id] =
+        (*buckets)[tree[curr_node.parent_pos].bucket_id];
+      bucket_to_replace_pos = curr_node.parent_pos;
+    }
+    *bucket_id = tree[bucket_to_replace_pos].bucket_id;
+  }
+  return null_found;
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo_table_builder.h b/src/rocksdb/table/cuckoo_table_builder.h
new file mode 100644
index 0000000..6b5a180
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_builder.h
@@ -0,0 +1,123 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+#include "rocksdb/status.h"
+#include "table/table_builder.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+class CuckooTableBuilder: public TableBuilder {
+ public:
+  CuckooTableBuilder(
+      WritableFile* file, double max_hash_table_ratio,
+      uint32_t max_num_hash_func, uint32_t max_search_depth,
+      const Comparator* user_comparator, uint32_t cuckoo_block_size,
+      bool use_module_hash, bool identity_as_first_hash,
+      uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t));
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~CuckooTableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return status_; }
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+  TableProperties GetTableProperties() const override { return properties_; }
+
+ private:
+  struct CuckooBucket {
+    CuckooBucket()
+      : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
+    uint32_t vector_idx;
+    // This number will not exceed kvs_.size() + max_num_hash_func_.
+    // We assume number of items is <= 2^32.
+    uint32_t make_space_for_key_call_id;
+  };
+  static const uint32_t kMaxVectorIdx = std::numeric_limits<int32_t>::max();
+
+  bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals,
+                       const uint32_t call_id,
+                       std::vector<CuckooBucket>* buckets, uint64_t* bucket_id);
+  Status MakeHashTable(std::vector<CuckooBucket>* buckets);
+
+  inline bool IsDeletedKey(uint64_t idx) const;
+  inline Slice GetKey(uint64_t idx) const;
+  inline Slice GetUserKey(uint64_t idx) const;
+  inline Slice GetValue(uint64_t idx) const;
+
+  uint32_t num_hash_func_;
+  WritableFile* file_;
+  const double max_hash_table_ratio_;
+  const uint32_t max_num_hash_func_;
+  const uint32_t max_search_depth_;
+  const uint32_t cuckoo_block_size_;
+  uint64_t hash_table_size_;
+  bool is_last_level_file_;
+  bool has_seen_first_key_;
+  bool has_seen_first_value_;
+  uint64_t key_size_;
+  uint64_t value_size_;
+  // A list of fixed-size key-value pairs concatenating into a string.
+  // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific
+  // key / value given an index
+  std::string kvs_;
+  std::string deleted_keys_;
+  // Number of key-value pairs stored in kvs_ + number of deleted keys
+  uint64_t num_entries_;
+  // Number of keys that contain value (non-deletion op)
+  uint64_t num_values_;
+  Status status_;
+  TableProperties properties_;
+  const Comparator* ucomp_;
+  bool use_module_hash_;
+  bool identity_as_first_hash_;
+  uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+    uint64_t max_num_buckets);
+  std::string largest_user_key_ = "";
+  std::string smallest_user_key_ = "";
+
+  bool closed_;  // Either Finish() or Abandon() has been called.
+
+  // No copying allowed
+  CuckooTableBuilder(const CuckooTableBuilder&) = delete;
+  void operator=(const CuckooTableBuilder&) = delete;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo_table_builder_test.cc b/src/rocksdb/table/cuckoo_table_builder_test.cc
new file mode 100644
index 0000000..cab5daf
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_builder_test.cc
@@ -0,0 +1,521 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <vector>
+#include <string>
+#include <map>
+#include <utility>
+
+#include "table/meta_blocks.h"
+#include "table/cuckoo_table_builder.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+extern const uint64_t kCuckooTableMagicNumber;
+
+namespace {
+// Maps a user key to its precomputed sequence of bucket indices; each test
+// populates this before adding keys so hash placement is deterministic.
+std::unordered_map<std::string, std::vector<uint64_t>> hash_map;
+
+// Mock hash injected into the builder: returns the index-th precomputed
+// bucket for key s from hash_map (max_num_buckets is ignored).
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+    uint64_t max_num_buckets) {
+  return hash_map[s.ToString()][index];
+}
+}  // namespace
+
+class CuckooBuilderTest : public testing::Test {
+ public:
+  CuckooBuilderTest() {
+    env_ = Env::Default();
+    Options options;
+    // Readback via ReadTableProperties/Read below uses mmap-style reads.
+    options.allow_mmap_reads = true;
+    env_options_ = EnvOptions(options);
+  }
+
+  // Reads the file at fname back and verifies (a) the stored table
+  // properties (empty-key bucket, value length, table size, number of hash
+  // functions, cuckoo block size, last-level flag, entry counts) and
+  // (b) the bucket layout: keys[i] must sit at expected_locations[i] and
+  // every other bucket must equal expected_unused_bucket.
+  void CheckFileContents(const std::vector<std::string>& keys,
+      const std::vector<std::string>& values,
+      const std::vector<uint64_t>& expected_locations,
+      std::string expected_unused_bucket, uint64_t expected_table_size,
+      uint32_t expected_num_hash_func, bool expected_is_last_level,
+      uint32_t expected_cuckoo_block_size = 1) {
+    // Read file
+    unique_ptr<RandomAccessFile> read_file;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_));
+    uint64_t read_file_size;
+    ASSERT_OK(env_->GetFileSize(fname, &read_file_size));
+
+    // Assert Table Properties.
+    TableProperties* props = nullptr;
+    ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size,
+          kCuckooTableMagicNumber, env_, nullptr, &props));
+    // Check unused bucket.
+    std::string unused_key = props->user_collected_properties[
+      CuckooTablePropertyNames::kEmptyKey];
+    ASSERT_EQ(expected_unused_bucket.substr(0,
+          props->fixed_key_len), unused_key);
+
+    // Each property below is stored as raw little-endian bytes; decode by
+    // reinterpreting the property string's data pointer.
+    uint32_t value_len_found =
+      *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
+                CuckooTablePropertyNames::kValueLength].data());
+    ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found);
+    ASSERT_EQ(props->raw_value_size, values.size()*value_len_found);
+    const uint64_t table_size =
+      *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
+                CuckooTablePropertyNames::kHashTableSize].data());
+    ASSERT_EQ(expected_table_size, table_size);
+    const uint32_t num_hash_func_found =
+      *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
+                CuckooTablePropertyNames::kNumHashFunc].data());
+    ASSERT_EQ(expected_num_hash_func, num_hash_func_found);
+    const uint32_t cuckoo_block_size =
+      *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
+                CuckooTablePropertyNames::kCuckooBlockSize].data());
+    ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size);
+    const bool is_last_level_found =
+      *reinterpret_cast<const bool*>(props->user_collected_properties[
+                CuckooTablePropertyNames::kIsLastLevel].data());
+    ASSERT_EQ(expected_is_last_level, is_last_level_found);
+
+    ASSERT_EQ(props->num_entries, keys.size());
+    ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size());
+    // Data section holds table_size + cuckoo_block_size - 1 fixed buckets.
+    ASSERT_EQ(props->data_size, expected_unused_bucket.size() *
+        (expected_table_size + expected_cuckoo_block_size - 1));
+    ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len);
+    delete props;
+
+    // Check contents of the bucket.
+    std::vector<bool> keys_found(keys.size(), false);
+    size_t bucket_size = expected_unused_bucket.size();
+    for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) {
+      Slice read_slice;
+      ASSERT_OK(read_file->Read(i*bucket_size, bucket_size,
+            &read_slice, nullptr));
+      // Linear search for which key was expected at bucket i; yields
+      // keys.size() when no key claims this bucket.
+      size_t key_idx =
+          std::find(expected_locations.begin(), expected_locations.end(), i) -
+          expected_locations.begin();
+      if (key_idx == keys.size()) {
+        // i is not one of the expected locations. Empty bucket.
+        ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0);
+      } else {
+        keys_found[key_idx] = true;
+        ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0);
+      }
+    }
+    for (auto key_found : keys_found) {
+      // Check that all keys were found.
+      ASSERT_TRUE(key_found);
+    }
+  }
+
+  // Wraps user_key into an internal key of type kTypeValue; sequence
+  // number is 0 when zero_seqno is set, 1000 otherwise.
+  std::string GetInternalKey(Slice user_key, bool zero_seqno) {
+    IterKey ikey;
+    ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, kTypeValue);
+    return ikey.GetKey().ToString();
+  }
+
+  // Returns the smallest power of two strictly greater than num.
+  uint64_t NextPowOf2(uint64_t num) {
+    uint64_t n = 2;
+    while (n <= num) {
+      n *= 2;
+    }
+    return n;
+  }
+
+  Env* env_;
+  EnvOptions env_options_;
+  std::string fname;  // Path of the table file built by the current test.
+  const double kHashTableRatio = 0.9;  // Target hash-table occupancy.
+};
+
+// Finishing a builder with zero entries must still write a valid,
+// readable (if trivial) table file.
+TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) {
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/EmptyFile";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      4, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  ASSERT_EQ(0UL, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  CheckFileContents({}, {}, {}, "", 2, 2, false);
+}
+
+// All keys have distinct first-choice buckets, so everything lands on the
+// first hash function and only 2 hash functions are recorded.
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  hash_map = {
+    {user_keys[0], {0, 1, 2, 3}},
+    {user_keys[1], {1, 2, 3, 4}},
+    {user_keys[2], {2, 3, 4, 5}},
+    {user_keys[3], {3, 4, 5, 6}}
+  };
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/NoCollisionFullKey";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 2, false);
+}
+
+// Every key shares the same 4 candidate buckets, forcing each successive
+// key onto a later hash function; all 4 hash functions end up used.
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  hash_map = {
+    {user_keys[0], {0, 1, 2, 3}},
+    {user_keys[1], {0, 1, 2, 3}},
+    {user_keys[2], {0, 1, 2, 3}},
+    {user_keys[3], {0, 1, 2, 3}},
+  };
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/WithCollisionFullKey";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 4, false);
+}
+
+// Same fully-colliding keys, but a cuckoo block size of 2 lets adjacent
+// buckets absorb collisions, so only 3 hash functions are needed.
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  hash_map = {
+    {user_keys[0], {0, 1, 2, 3}},
+    {user_keys[1], {0, 1, 2, 3}},
+    {user_keys[2], {0, 1, 2, 3}},
+    {user_keys[3], {0, 1, 2, 3}},
+  };
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  uint32_t cuckoo_block_size = 2;
+  fname = test::TmpDir() + "/WithCollisionFullKey2";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size,
+      false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 3, false, cuckoo_block_size);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
+  // Have two hash functions. Insert elements with overlapping hashes.
+  // Finally insert an element with hash value somewhere in the middle
+  // so that it displaces all the elements after that.
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03",
+    "key04", "key05"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+  hash_map = {
+    {user_keys[0], {0, 1}},
+    {user_keys[1], {1, 2}},
+    {user_keys[2], {2, 3}},
+    {user_keys[3], {3, 4}},
+    {user_keys[4], {0, 2}},
+  };
+  std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/WithCollisionPathFullKey";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 2, false);
+}
+
+// Displacement-path scenario as above, but with a cuckoo block size of 2
+// so the search also considers each candidate's neighboring bucket.
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03",
+    "key04", "key05"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+  hash_map = {
+    {user_keys[0], {0, 1}},
+    {user_keys[1], {1, 2}},
+    {user_keys[2], {3, 4}},
+    {user_keys[3], {4, 5}},
+    {user_keys[4], {0, 3}},
+  };
+  std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 2, false, 2);
+}
+
+// Zero-seqno keys let the builder store bare user keys (last-level file);
+// no collisions, so only 2 hash functions are recorded.
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  hash_map = {
+    {user_keys[0], {0, 1, 2, 3}},
+    {user_keys[1], {1, 2, 3, 4}},
+    {user_keys[2], {2, 3, 4, 5}},
+    {user_keys[3], {3, 4, 5, 6}}
+  };
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/NoCollisionUserKey";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = user_keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize())
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = "key00";
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(user_keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 2, true);
+}
+
+// Last-level (user-key) variant of the full-collision case; all 4 hash
+// functions end up used.
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  hash_map = {
+    {user_keys[0], {0, 1, 2, 3}},
+    {user_keys[1], {0, 1, 2, 3}},
+    {user_keys[2], {0, 1, 2, 3}},
+    {user_keys[3], {0, 1, 2, 3}},
+  };
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/WithCollisionUserKey";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = user_keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = "key00";
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(user_keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 4, true);
+}
+
+// Last-level (user-key) variant of the displacement-path case, with the
+// search depth capped at 2.
+TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03",
+    "key04", "key05"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+  hash_map = {
+    {user_keys[0], {0, 1}},
+    {user_keys[1], {1, 2}},
+    {user_keys[2], {2, 3}},
+    {user_keys[3], {3, 4}},
+    {user_keys[4], {0, 2}},
+  };
+  std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+  uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/WithCollisionPathUserKey";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = user_keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(writable_file->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = "key00";
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(user_keys, values, expected_locations,
+      expected_unused_bucket, expected_table_size, 2, true);
+}
+
+TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
+  // Have two hash functions. Insert elements with overlapping hashes.
+  // Finally try inserting an element with hash value somewhere in the middle
+  // and it should fail because the no. of elements to displace is too high.
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03",
+    "key04", "key05"};
+  hash_map = {
+    {user_keys[0], {0, 1}},
+    {user_keys[1], {1, 2}},
+    {user_keys[2], {2, 3}},
+    {user_keys[3], {3, 4}},
+    {user_keys[4], {0, 1}},
+  };
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/WithCollisionPathUserKey";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value"));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  // Add() only buffers; the failure surfaces when Finish() builds the table.
+  ASSERT_TRUE(builder.Finish().IsNotSupported());
+  ASSERT_OK(writable_file->Close());
+}
+
+// Two entries with the same user key (differing only in seqno) are not
+// supported by the cuckoo format; Finish() must report NotSupported.
+TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) {
+  hash_map = {{"repeatedkey", {0, 1, 2, 3}}};
+  uint32_t num_hash_fun = 4;
+  std::string user_key = "repeatedkey";
+
+  unique_ptr<WritableFile> writable_file;
+  fname = test::TmpDir() + "/FailWhenSameKeyInserted";
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
+      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  ASSERT_OK(builder.status());
+
+  builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1"));
+  ASSERT_EQ(builder.NumEntries(), 1u);
+  ASSERT_OK(builder.status());
+  builder.Add(Slice(GetInternalKey(user_key, true)), Slice("value2"));
+  ASSERT_EQ(builder.NumEntries(), 2u);
+  ASSERT_OK(builder.status());
+
+  ASSERT_TRUE(builder.Finish().IsNotSupported());
+  ASSERT_OK(writable_file->Close());
+}
+}  // namespace rocksdb
+
+// Test driver: registers command-line flags with gtest and runs all cases.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/cuckoo_table_factory.cc b/src/rocksdb/table/cuckoo_table_factory.cc
new file mode 100644
index 0000000..17aa1d7
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_factory.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo_table_factory.h"
+
+#include "db/dbformat.h"
+#include "table/cuckoo_table_builder.h"
+#include "table/cuckoo_table_reader.h"
+
+namespace rocksdb {
+
+// Opens an existing cuckoo table file as a TableReader. Ownership of the
+// file is transferred to the reader; *table is only populated when the
+// reader constructed successfully, otherwise its error status is returned.
+Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions,
+    const EnvOptions& env_options, const InternalKeyComparator& icomp,
+    std::unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table) const {
+  // nullptr: use the real hash function, not a test-injected one.
+  std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(ioptions,
+      std::move(file), file_size, icomp.user_comparator(), nullptr));
+  Status s = new_reader->status();
+  if (s.ok()) {
+    *table = std::move(new_reader);
+  }
+  return s;
+}
+
+// Creates a builder that writes the cuckoo format into `file`, configured
+// from the factory's stored CuckooTableOptions. Caller owns the result.
+TableBuilder* CuckooTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFile* file) const {
+  // Ignore the skip_filters flag. Does not apply to this file format.
+
+  // TODO: change builder to take the option struct
+  // 64 = cap on the number of hash functions the builder may try;
+  // nullptr = use the real hash function (test hook unused in production).
+  return new CuckooTableBuilder(
+      file, table_options_.hash_table_ratio, 64,
+      table_options_.max_search_depth,
+      table_builder_options.internal_comparator.user_comparator(),
+      table_options_.cuckoo_block_size, table_options_.use_module_hash,
+      table_options_.identity_as_first_hash, nullptr);
+}
+
+// Renders the factory's table options as a human-readable, multi-line
+// string (one "  name: value" line per option) for logging/diagnostics.
+std::string CuckooTableFactory::GetPrintableTableOptions() const {
+  std::string ret;
+  ret.reserve(2000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  hash_table_ratio: %lf\n",
+           table_options_.hash_table_ratio);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  max_search_depth: %u\n",
+           table_options_.max_search_depth);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  cuckoo_block_size: %u\n",
+           table_options_.cuckoo_block_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  identity_as_first_hash: %d\n",
+           table_options_.identity_as_first_hash);
+  ret.append(buffer);
+  return ret;
+}
+
+// Public entry point: heap-allocates a CuckooTableFactory with the given
+// options. Caller (typically Options::table_factory) takes ownership.
+TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) {
+  return new CuckooTableFactory(table_options);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo_table_factory.h b/src/rocksdb/table/cuckoo_table_factory.h
new file mode 100644
index 0000000..0b3729e
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_factory.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include "rocksdb/table.h"
+#include "util/murmurhash.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+// Seed multiplier so each hash_cnt yields an independent Murmur seed.
+const uint32_t kCuckooMurmurSeedMultiplier = 816922183;
+// Computes the hash_cnt-th candidate bucket for user_key.
+// use_module_hash selects value % table_size (arbitrary table sizes);
+// otherwise value & (table_size - 1) is used, which assumes table_size_
+// is a power of two.
+static inline uint64_t CuckooHash(
+    const Slice& user_key, uint32_t hash_cnt, bool use_module_hash,
+    uint64_t table_size_, bool identity_as_first_hash,
+    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) {
+#ifndef NDEBUG
+  // This part is used only in unit tests.
+  if (get_slice_hash != nullptr) {
+    return get_slice_hash(user_key, hash_cnt, table_size_);
+  }
+#endif
+  uint64_t value = 0;
+  if (hash_cnt == 0 && identity_as_first_hash) {
+    // Reads the key's leading bytes directly as an int64 "identity" hash.
+    // NOTE(review): assumes the key holds at least 8 suitably-aligned
+    // bytes -- confirm callers guarantee this.
+    value = (*reinterpret_cast<const int64_t*>(user_key.data()));
+  } else {
+    value = MurmurHash(user_key.data(), static_cast<int>(user_key.size()),
+                       kCuckooMurmurSeedMultiplier * hash_cnt);
+  }
+  if (use_module_hash) {
+    return value % table_size_;
+  } else {
+    return value & (table_size_ - 1);
+  }
+}
+
+// Cuckoo Table is designed for applications that require fast point lookups
+// but not fast range scans.
+//
+// Some assumptions:
+// - Key length and Value length are fixed.
+// - Does not support Snapshot.
+// - Does not support Merge operations.
+class CuckooTableFactory : public TableFactory {
+ public:
+  explicit CuckooTableFactory(const CuckooTableOptions& table_options)
+    : table_options_(table_options) {}
+  ~CuckooTableFactory() {}
+
+  const char* Name() const override { return "CuckooTable"; }
+
+  // Opens an existing cuckoo table file; see cuckoo_table_factory.cc.
+  Status NewTableReader(
+      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+      const InternalKeyComparator& internal_comparator,
+      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+      unique_ptr<TableReader>* table) const override;
+
+  // Creates a builder writing the cuckoo format; caller owns the result.
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFile* file) const override;
+
+  // Sanitizes the specified DB Options.
+  // No cuckoo-specific constraints: always accepts the options.
+  Status SanitizeOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override {
+    return Status::OK();
+  }
+
+  // One "  name: value" line per option, for logging.
+  std::string GetPrintableTableOptions() const override;
+
+ private:
+  const CuckooTableOptions table_options_;  // Immutable after construction.
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo_table_reader.cc b/src/rocksdb/table/cuckoo_table_reader.cc
new file mode 100644
index 0000000..7f017ec
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_reader.cc
@@ -0,0 +1,377 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo_table_reader.h"
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/cuckoo_table_factory.h"
+#include "table/get_context.h"
+#include "util/arena.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+namespace {
+const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
+const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
+}
+
+extern const uint64_t kCuckooTableMagicNumber;
+
+CuckooTableReader::CuckooTableReader(
+    const ImmutableCFOptions& ioptions,
+    std::unique_ptr<RandomAccessFile>&& file,
+    uint64_t file_size,
+    const Comparator* comparator,
+    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
+    : file_(std::move(file)),
+      ucomp_(comparator),
+      get_slice_hash_(get_slice_hash) {
+  if (!ioptions.allow_mmap_reads) {
+    status_ = Status::InvalidArgument("File is not mmaped");
+  }
+  TableProperties* props = nullptr;
+  status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
+      ioptions.env, ioptions.info_log, &props);
+  if (!status_.ok()) {
+    return;
+  }
+  table_props_.reset(props);
+  auto& user_props = props->user_collected_properties;
+  auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
+  if (hash_funs == user_props.end()) {
+    status_ = Status::Corruption("Number of hash functions not found");
+    return;
+  }
+  num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
+  auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
+  if (unused_key == user_props.end()) {
+    status_ = Status::Corruption("Empty bucket value not found");
+    return;
+  }
+  unused_key_ = unused_key->second;
+
+  key_length_ = static_cast<uint32_t>(props->fixed_key_len);
+  auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
+  if (user_key_len == user_props.end()) {
+    status_ = Status::Corruption("User key length not found");
+    return;
+  }
+  user_key_length_ = *reinterpret_cast<const uint32_t*>(
+      user_key_len->second.data());
+
+  auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
+  if (value_length == user_props.end()) {
+    status_ = Status::Corruption("Value length not found");
+    return;
+  }
+  value_length_ = *reinterpret_cast<const uint32_t*>(
+      value_length->second.data());
+  bucket_length_ = key_length_ + value_length_;
+
+  auto hash_table_size = user_props.find(
+      CuckooTablePropertyNames::kHashTableSize);
+  if (hash_table_size == user_props.end()) {
+    status_ = Status::Corruption("Hash table size not found");
+    return;
+  }
+  table_size_ = *reinterpret_cast<const uint64_t*>(
+      hash_table_size->second.data());
+
+  auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
+  if (is_last_level == user_props.end()) {
+    status_ = Status::Corruption("Is last level not found");
+    return;
+  }
+  is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
+
+  auto identity_as_first_hash = user_props.find(
+      CuckooTablePropertyNames::kIdentityAsFirstHash);
+  if (identity_as_first_hash == user_props.end()) {
+    status_ = Status::Corruption("identity as first hash not found");
+    return;
+  }
+  identity_as_first_hash_ = *reinterpret_cast<const bool*>(
+      identity_as_first_hash->second.data());
+
+  auto use_module_hash = user_props.find(
+      CuckooTablePropertyNames::kUseModuleHash);
+  if (use_module_hash == user_props.end()) {
+    status_ = Status::Corruption("hash type is not found");
+    return;
+  }
+  use_module_hash_ = *reinterpret_cast<const bool*>(
+      use_module_hash->second.data());
+  auto cuckoo_block_size = user_props.find(
+      CuckooTablePropertyNames::kCuckooBlockSize);
+  if (cuckoo_block_size == user_props.end()) {
+    status_ = Status::Corruption("Cuckoo block size not found");
+    return;
+  }
+  cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
+      cuckoo_block_size->second.data());
+  cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
+  status_ = file_->Read(0, file_size, &file_data_, nullptr);
+}
+
+Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key,
+                              GetContext* get_context) {
+  assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
+  Slice user_key = ExtractUserKey(key);
+  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+    uint64_t offset = bucket_length_ * CuckooHash(
+        user_key, hash_cnt, use_module_hash_, table_size_,
+        identity_as_first_hash_, get_slice_hash_);
+    const char* bucket = &file_data_.data()[offset];
+    for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+         ++block_idx, bucket += bucket_length_) {
+      if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()),
+                          Slice(bucket, user_key.size())) == 0) {
+        return Status::OK();
+      }
+      // Here, we compare only the user key part as we support only one entry
+      // per user key and we don't support snapshot.
+      if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) {
+        Slice value(bucket + key_length_, value_length_);
+        if (is_last_level_) {
+          get_context->SaveValue(value);
+        } else {
+          Slice full_key(bucket, key_length_);
+          ParsedInternalKey found_ikey;
+          ParseInternalKey(full_key, &found_ikey);
+          get_context->SaveValue(found_ikey, value);
+        }
+        // We don't support merge operations. So, we return here.
+        return Status::OK();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void CuckooTableReader::Prepare(const Slice& key) {
+  // Prefetch the first Cuckoo Block.
+  Slice user_key = ExtractUserKey(key);
+  uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
+    bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
+                                identity_as_first_hash_, nullptr);
+  uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
+  for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
+    PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
+  }
+}
+
+class CuckooTableIterator : public Iterator {
+ public:
+  explicit CuckooTableIterator(CuckooTableReader* reader);
+  ~CuckooTableIterator() {}
+  bool Valid() const override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Seek(const Slice& target) override;
+  void Next() override;
+  void Prev() override;
+  Slice key() const override;
+  Slice value() const override;
+  Status status() const override { return status_; }
+  void InitIfNeeded();
+
+ private:
+  struct BucketComparator {
+    BucketComparator(const Slice& file_data, const Comparator* ucomp,
+                     uint32_t bucket_len, uint32_t user_key_len,
+                     const Slice& target = Slice())
+      : file_data_(file_data),
+        ucomp_(ucomp),
+        bucket_len_(bucket_len),
+        user_key_len_(user_key_len),
+        target_(target) {}
+    bool operator()(const uint32_t first, const uint32_t second) const {
+      const char* first_bucket =
+        (first == kInvalidIndex) ? target_.data() :
+                                   &file_data_.data()[first * bucket_len_];
+      const char* second_bucket =
+        (second == kInvalidIndex) ? target_.data() :
+                                    &file_data_.data()[second * bucket_len_];
+      return ucomp_->Compare(Slice(first_bucket, user_key_len_),
+                             Slice(second_bucket, user_key_len_)) < 0;
+    }
+   private:
+    const Slice file_data_;
+    const Comparator* ucomp_;
+    const uint32_t bucket_len_;
+    const uint32_t user_key_len_;
+    const Slice target_;
+  };
+
+  const BucketComparator bucket_comparator_;
+  void PrepareKVAtCurrIdx();
+  CuckooTableReader* reader_;
+  bool initialized_;
+  Status status_;
+  // Contains a map of keys to bucket_id sorted in key order.
+  std::vector<uint32_t> sorted_bucket_ids_;
+  // We assume that the number of items can be stored in uint32 (4 Billion).
+  uint32_t curr_key_idx_;
+  Slice curr_value_;
+  IterKey curr_key_;
+  // No copying allowed
+  CuckooTableIterator(const CuckooTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
+CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
+  : bucket_comparator_(reader->file_data_, reader->ucomp_,
+                       reader->bucket_length_, reader->user_key_length_),
+    reader_(reader),
+    initialized_(false),
+    curr_key_idx_(kInvalidIndex) {
+  sorted_bucket_ids_.clear();
+  curr_value_.clear();
+  curr_key_.Clear();
+}
+
+void CuckooTableIterator::InitIfNeeded() {
+  if (initialized_) {
+    return;
+  }
+  sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries);
+  uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
+  assert(num_buckets < kInvalidIndex);
+  const char* bucket = reader_->file_data_.data();
+  for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
+    if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
+      sorted_bucket_ids_.push_back(bucket_id);
+    }
+    bucket += reader_->bucket_length_;
+  }
+  assert(sorted_bucket_ids_.size() ==
+      reader_->GetTableProperties()->num_entries);
+  std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+            bucket_comparator_);
+  curr_key_idx_ = kInvalidIndex;
+  initialized_ = true;
+}
+
+void CuckooTableIterator::SeekToFirst() {
+  InitIfNeeded();
+  curr_key_idx_ = 0;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekToLast() {
+  InitIfNeeded();
+  curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Seek(const Slice& target) {
+  InitIfNeeded();
+  const BucketComparator seek_comparator(
+      reader_->file_data_, reader_->ucomp_,
+      reader_->bucket_length_, reader_->user_key_length_,
+      ExtractUserKey(target));
+  auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
+      sorted_bucket_ids_.end(),
+      kInvalidIndex,
+      seek_comparator);
+  curr_key_idx_ =
+      static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
+  PrepareKVAtCurrIdx();
+}
+
+bool CuckooTableIterator::Valid() const {
+  return curr_key_idx_ < sorted_bucket_ids_.size();
+}
+
+void CuckooTableIterator::PrepareKVAtCurrIdx() {
+  if (!Valid()) {
+    curr_value_.clear();
+    curr_key_.Clear();
+    return;
+  }
+  uint32_t id = sorted_bucket_ids_[curr_key_idx_];
+  const char* offset = reader_->file_data_.data() +
+                       id * reader_->bucket_length_;
+  if (reader_->is_last_level_) {
+    // Always return internal key.
+    curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
+                             0, kTypeValue);
+  } else {
+    curr_key_.SetKey(Slice(offset, reader_->key_length_));
+  }
+  curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
+}
+
+void CuckooTableIterator::Next() {
+  if (!Valid()) {
+    curr_value_.clear();
+    curr_key_.Clear();
+    return;
+  }
+  ++curr_key_idx_;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Prev() {
+  if (curr_key_idx_ == 0) {
+    curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
+  }
+  if (!Valid()) {
+    curr_value_.clear();
+    curr_key_.Clear();
+    return;
+  }
+  --curr_key_idx_;
+  PrepareKVAtCurrIdx();
+}
+
+Slice CuckooTableIterator::key() const {
+  assert(Valid());
+  return curr_key_.GetKey();
+}
+
+Slice CuckooTableIterator::value() const {
+  assert(Valid());
+  return curr_value_;
+}
+
+extern Iterator* NewErrorIterator(const Status& status, Arena* arena);
+
+Iterator* CuckooTableReader::NewIterator(
+    const ReadOptions& read_options, Arena* arena) {
+  if (!status().ok()) {
+    return NewErrorIterator(
+        Status::Corruption("CuckooTableReader status is not okay."), arena);
+  }
+  if (read_options.total_order_seek) {
+    return NewErrorIterator(
+        Status::InvalidArgument("total_order_seek is not supported."), arena);
+  }
+  CuckooTableIterator* iter;
+  if (arena == nullptr) {
+    iter = new CuckooTableIterator(this);
+  } else {
+    auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
+    iter = new (iter_mem) CuckooTableIterator(this);
+  }
+  return iter;
+}
+
+size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
+
+}  // namespace rocksdb
+#endif
diff --git a/src/rocksdb/table/cuckoo_table_reader.h b/src/rocksdb/table/cuckoo_table_reader.h
new file mode 100644
index 0000000..4f00a9e
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_reader.h
@@ -0,0 +1,82 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "table/table_reader.h"
+
+namespace rocksdb {
+
+class Arena;
+class TableReader;
+
+class CuckooTableReader: public TableReader {
+ public:
+  CuckooTableReader(
+      const ImmutableCFOptions& ioptions,
+      std::unique_ptr<RandomAccessFile>&& file,
+      uint64_t file_size,
+      const Comparator* user_comparator,
+      uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t));
+  ~CuckooTableReader() {}
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override {
+    return table_props_;
+  }
+
+  Status status() const { return status_; }
+
+  Status Get(const ReadOptions& read_options, const Slice& key,
+             GetContext* get_context) override;
+
+  Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override;
+  void Prepare(const Slice& target) override;
+
+  // Report an approximation of how much memory has been used.
+  size_t ApproximateMemoryUsage() const override;
+
+  // Following methods are not implemented for Cuckoo Table Reader
+  uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; }
+  void SetupForCompaction() override {}
+  // End of methods not implemented.
+
+ private:
+  friend class CuckooTableIterator;
+  void LoadAllKeys(std::vector<std::pair<Slice, uint32_t>>* key_to_bucket_id);
+  std::unique_ptr<RandomAccessFile> file_;
+  Slice file_data_;
+  bool is_last_level_;
+  bool identity_as_first_hash_;
+  bool use_module_hash_;
+  std::shared_ptr<const TableProperties> table_props_;
+  Status status_;
+  uint32_t num_hash_func_;
+  std::string unused_key_;
+  uint32_t key_length_;
+  uint32_t user_key_length_;
+  uint32_t value_length_;
+  uint32_t bucket_length_;
+  uint32_t cuckoo_block_size_;
+  uint32_t cuckoo_block_bytes_minus_one_;
+  uint64_t table_size_;
+  const Comparator* ucomp_;
+  uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+      uint64_t max_num_buckets);
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo_table_reader_test.cc b/src/rocksdb/table/cuckoo_table_reader_test.cc
new file mode 100644
index 0000000..660261a
--- /dev/null
+++ b/src/rocksdb/table/cuckoo_table_reader_test.cc
@@ -0,0 +1,546 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run this test\n");
+  return 1;
+}
+#else
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <gflags/gflags.h>
+#include <vector>
+#include <string>
+#include <map>
+
+#include "table/meta_blocks.h"
+#include "table/cuckoo_table_builder.h"
+#include "table/cuckoo_table_reader.h"
+#include "table/cuckoo_table_factory.h"
+#include "table/get_context.h"
+#include "util/arena.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::SetUsageMessage;
+
+DEFINE_string(file_dir, "", "Directory where the files will be created"
+    " for benchmark. Added for using tmpfs.");
+DEFINE_bool(enable_perf, false, "Run Benchmark Tests too.");
+DEFINE_bool(write, false,
+    "Should write new values to file in performance tests?");
+DEFINE_bool(identity_as_first_hash, true, "use identity as first hash");
+
+namespace rocksdb {
+
+namespace {
+const uint32_t kNumHashFunc = 10;
+// Methods, variables related to Hash functions.
+std::unordered_map<std::string, std::vector<uint64_t>> hash_map;
+
+void AddHashLookups(const std::string& s, uint64_t bucket_id,
+        uint32_t num_hash_fun) {
+  std::vector<uint64_t> v;
+  for (uint32_t i = 0; i < num_hash_fun; i++) {
+    v.push_back(bucket_id + i);
+  }
+  hash_map[s] = v;
+}
+
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+    uint64_t max_num_buckets) {
+  return hash_map[s.ToString()][index];
+}
+
+}  // namespace
+
+class CuckooReaderTest : public testing::Test {
+ public:
+  using testing::Test::SetUp;
+
+  CuckooReaderTest() {
+    options.allow_mmap_reads = true;
+    env = options.env;
+    env_options = EnvOptions(options);
+  }
+
+  void SetUp(int num) {
+    num_items = num;
+    hash_map.clear();
+    keys.clear();
+    keys.resize(num_items);
+    user_keys.clear();
+    user_keys.resize(num_items);
+    values.clear();
+    values.resize(num_items);
+  }
+
+  std::string NumToStr(int64_t i) {
+    return std::string(reinterpret_cast<char*>(&i), sizeof(i));
+  }
+
+  void CreateCuckooFileAndCheckReader(
+      const Comparator* ucomp = BytewiseComparator()) {
+    std::unique_ptr<WritableFile> writable_file;
+    ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
+    CuckooTableBuilder builder(
+        writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2,
+        false, false, GetSliceHash);
+    ASSERT_OK(builder.status());
+    for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) {
+      builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
+      ASSERT_OK(builder.status());
+      ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+    }
+    ASSERT_OK(builder.Finish());
+    ASSERT_EQ(num_items, builder.NumEntries());
+    file_size = builder.FileSize();
+    ASSERT_OK(writable_file->Close());
+
+    // Check reader now.
+    std::unique_ptr<RandomAccessFile> read_file;
+    ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+    const ImmutableCFOptions ioptions(options);
+    CuckooTableReader reader(
+        ioptions,
+        std::move(read_file),
+        file_size,
+        ucomp,
+        GetSliceHash);
+    ASSERT_OK(reader.status());
+    // Assume no merge/deletion
+    for (uint32_t i = 0; i < num_items; ++i) {
+      std::string value;
+      GetContext get_context(ucomp, nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, Slice(user_keys[i]), &value,
+                             nullptr, nullptr, nullptr);
+      ASSERT_OK(reader.Get(ReadOptions(), Slice(keys[i]), &get_context));
+      ASSERT_EQ(values[i], value);
+    }
+  }
+  void UpdateKeys(bool with_zero_seqno) {
+    for (uint32_t i = 0; i < num_items; i++) {
+      ParsedInternalKey ikey(user_keys[i],
+          with_zero_seqno ? 0 : i + 1000, kTypeValue);
+      keys[i].clear();
+      AppendInternalKey(&keys[i], ikey);
+    }
+  }
+
+  void CheckIterator(const Comparator* ucomp = BytewiseComparator()) {
+    std::unique_ptr<RandomAccessFile> read_file;
+    ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+    const ImmutableCFOptions ioptions(options);
+    CuckooTableReader reader(
+        ioptions,
+        std::move(read_file),
+        file_size,
+        ucomp,
+        GetSliceHash);
+    ASSERT_OK(reader.status());
+    Iterator* it = reader.NewIterator(ReadOptions(), nullptr);
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(!it->Valid());
+    it->SeekToFirst();
+    int cnt = 0;
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+      ASSERT_TRUE(Slice(values[cnt]) == it->value());
+      ++cnt;
+      it->Next();
+    }
+    ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+
+    it->SeekToLast();
+    cnt = static_cast<int>(num_items) - 1;
+    ASSERT_TRUE(it->Valid());
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+      ASSERT_TRUE(Slice(values[cnt]) == it->value());
+      --cnt;
+      it->Prev();
+    }
+    ASSERT_EQ(cnt, -1);
+
+    cnt = static_cast<int>(num_items) / 2;
+    it->Seek(keys[cnt]);
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+      ASSERT_TRUE(Slice(values[cnt]) == it->value());
+      ++cnt;
+      it->Next();
+    }
+    ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+    delete it;
+
+    Arena arena;
+    it = reader.NewIterator(ReadOptions(), &arena);
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(!it->Valid());
+    it->Seek(keys[num_items/2]);
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(keys[num_items/2] == it->key());
+    ASSERT_TRUE(values[num_items/2] == it->value());
+    ASSERT_OK(it->status());
+    it->~Iterator();
+  }
+
+  std::vector<std::string> keys;
+  std::vector<std::string> user_keys;
+  std::vector<std::string> values;
+  uint64_t num_items;
+  std::string fname;
+  uint64_t file_size;
+  Options options;
+  Env* env;
+  EnvOptions env_options;
+};
+
+TEST_F(CuckooReaderTest, WhenKeyExists) {
+  SetUp(kNumHashFunc);
+  fname = test::TmpDir() + "/CuckooReader_WhenKeyExists";
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i] = "key" + NumToStr(i);
+    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values.
+    AddHashLookups(user_keys[i], i, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader();
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader();
+  // Test with collision. Make all hash values collide.
+  hash_map.clear();
+  for (uint32_t i = 0; i < num_items; i++) {
+    AddHashLookups(user_keys[i], 0, kNumHashFunc);
+  }
+  UpdateKeys(false);
+  CreateCuckooFileAndCheckReader();
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader();
+}
+
+TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) {
+  SetUp(kNumHashFunc);
+  fname = test::TmpDir() + "/CuckooReaderUint64_WhenKeyExists";
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i].resize(8);
+    memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values.
+    AddHashLookups(user_keys[i], i, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  // Test with collision. Make all hash values collide.
+  hash_map.clear();
+  for (uint32_t i = 0; i < num_items; i++) {
+    AddHashLookups(user_keys[i], 0, kNumHashFunc);
+  }
+  UpdateKeys(false);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, CheckIterator) {
+  SetUp(2*kNumHashFunc);
+  fname = test::TmpDir() + "/CuckooReader_CheckIterator";
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i] = "key" + NumToStr(i);
+    ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values, in reverse order.
+    AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader();
+  CheckIterator();
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader();
+  CheckIterator();
+}
+
+TEST_F(CuckooReaderTest, CheckIteratorUint64) {
+  SetUp(2*kNumHashFunc);
+  fname = test::TmpDir() + "/CuckooReader_CheckIterator";
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i].resize(8);
+    memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+    ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values, in reverse order.
+    AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  CheckIterator(test::Uint64Comparator());
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  CheckIterator(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, WhenKeyNotFound) {
+  // Add keys with colliding hash values.
+  SetUp(kNumHashFunc);
+  fname = test::TmpDir() + "/CuckooReader_WhenKeyNotFound";
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i] = "key" + NumToStr(i);
+    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Make all hash values collide.
+    AddHashLookups(user_keys[i], 0, kNumHashFunc);
+  }
+  auto* ucmp = BytewiseComparator();
+  CreateCuckooFileAndCheckReader();
+  std::unique_ptr<RandomAccessFile> read_file;
+  ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+  const ImmutableCFOptions ioptions(options);
+  CuckooTableReader reader(
+      ioptions,
+      std::move(read_file),
+      file_size,
+      ucmp,
+      GetSliceHash);
+  ASSERT_OK(reader.status());
+  // Search for a key with colliding hash values.
+  std::string not_found_user_key = "key" + NumToStr(num_items);
+  std::string not_found_key;
+  AddHashLookups(not_found_user_key, 0, kNumHashFunc);
+  ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue);
+  AppendInternalKey(&not_found_key, ikey);
+  std::string value;
+  GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound,
+                         Slice(not_found_key), &value, nullptr, nullptr,
+                         nullptr);
+  ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key), &get_context));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+  // Search for a key with an independent hash value.
+  std::string not_found_user_key2 = "key" + NumToStr(num_items + 1);
+  AddHashLookups(not_found_user_key2, kNumHashFunc, kNumHashFunc);
+  ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue);
+  std::string not_found_key2;
+  AppendInternalKey(&not_found_key2, ikey2);
+  GetContext get_context2(ucmp, nullptr, nullptr, nullptr,
+                          GetContext::kNotFound, Slice(not_found_key2), &value,
+                          nullptr, nullptr, nullptr);
+  ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+
+  // Test read when key is unused key.
+  std::string unused_key =
+    reader.GetTableProperties()->user_collected_properties.at(
+    CuckooTablePropertyNames::kEmptyKey);
+  // Add hash values that map to empty buckets.
+  AddHashLookups(ExtractUserKey(unused_key).ToString(),
+      kNumHashFunc, kNumHashFunc);
+  GetContext get_context3(ucmp, nullptr, nullptr, nullptr,
+                          GetContext::kNotFound, Slice(unused_key), &value,
+                          nullptr, nullptr, nullptr);
+  ASSERT_OK(reader.Get(ReadOptions(), Slice(unused_key), &get_context3));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+}
+
+// Performance tests
+namespace {
+void GetKeys(uint64_t num, std::vector<std::string>* keys) {
+  keys->clear();
+  IterKey k;
+  k.SetInternalKey("", 0, kTypeValue);
+  std::string internal_key_suffix = k.GetKey().ToString();
+  ASSERT_EQ(static_cast<size_t>(8), internal_key_suffix.size());
+  for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+    uint64_t value = 2 * key_idx;
+    std::string new_key(reinterpret_cast<char*>(&value), sizeof(value));
+    new_key += internal_key_suffix;
+    keys->push_back(new_key);
+  }
+}
+
+std::string GetFileName(uint64_t num) {
+  if (FLAGS_file_dir.empty()) {
+    FLAGS_file_dir = test::TmpDir();
+  }
+  return FLAGS_file_dir + "/cuckoo_read_benchmark" +
+    ToString(num/1000000) + "Mkeys";
+}
+
+// Create last level file as we are interested in measuring performance of
+// last level file only.
+void WriteFile(const std::vector<std::string>& keys,
+    const uint64_t num, double hash_ratio) {
+  Options options;
+  options.allow_mmap_reads = true;
+  Env* env = options.env;
+  EnvOptions env_options = EnvOptions(options);
+  std::string fname = GetFileName(num);
+
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
+  CuckooTableBuilder builder(
+      writable_file.get(), hash_ratio,
+      64, 1000, test::Uint64Comparator(), 5,
+      false, FLAGS_identity_as_first_hash, nullptr);
+  ASSERT_OK(builder.status());
+  for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+    // Value is just a part of key.
+    builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4));
+    ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+    ASSERT_OK(builder.status());
+  }
+  ASSERT_OK(builder.Finish());
+  ASSERT_EQ(num, builder.NumEntries());
+  ASSERT_OK(writable_file->Close());
+
+  uint64_t file_size;
+  env->GetFileSize(fname, &file_size);
+  std::unique_ptr<RandomAccessFile> read_file;
+  ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+
+  const ImmutableCFOptions ioptions(options);
+  CuckooTableReader reader(
+      ioptions, std::move(read_file), file_size,
+      test::Uint64Comparator(), nullptr);
+  ASSERT_OK(reader.status());
+  ReadOptions r_options;
+  std::string value;
+  // Assume only the fast path is triggered
+  GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, Slice(), &value, nullptr,
+                         nullptr, nullptr);
+  for (uint64_t i = 0; i < num; ++i) {
+    value.clear();
+    ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context));
+    ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4));
+  }
+}
+
+void ReadKeys(uint64_t num, uint32_t batch_size) {
+  Options options;
+  options.allow_mmap_reads = true;
+  Env* env = options.env;
+  EnvOptions env_options = EnvOptions(options);
+  std::string fname = GetFileName(num);
+
+  uint64_t file_size;
+  env->GetFileSize(fname, &file_size);
+  std::unique_ptr<RandomAccessFile> read_file;
+  ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+
+  const ImmutableCFOptions ioptions(options);
+  CuckooTableReader reader(
+      ioptions, std::move(read_file), file_size, test::Uint64Comparator(),
+      nullptr);
+  ASSERT_OK(reader.status());
+  const UserCollectedProperties user_props =
+    reader.GetTableProperties()->user_collected_properties;
+  const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>(
+      user_props.at(CuckooTablePropertyNames::kNumHashFunc).data());
+  const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
+      user_props.at(CuckooTablePropertyNames::kHashTableSize).data());
+  fprintf(stderr, "With %" PRIu64 " items, utilization is %.2f%%, number of"
+      " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun);
+  ReadOptions r_options;
+
+  std::vector<uint64_t> keys;
+  keys.reserve(num);
+  for (uint64_t i = 0; i < num; ++i) {
+    keys.push_back(2 * i);
+  }
+  std::random_shuffle(keys.begin(), keys.end());
+
+  std::string value;
+  // Assume only the fast path is triggered
+  GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, Slice(), &value, nullptr,
+                         nullptr, nullptr);
+  uint64_t start_time = env->NowMicros();
+  if (batch_size > 0) {
+    for (uint64_t i = 0; i < num; i += batch_size) {
+      for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
+        reader.Prepare(Slice(reinterpret_cast<char*>(&keys[j]), 16));
+      }
+      for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
+        reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[j]), 16),
+                   &get_context);
+      }
+    }
+  } else {
+    for (uint64_t i = 0; i < num; i++) {
+      reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[i]), 16),
+                 &get_context);
+    }
+  }
+  float time_per_op = (env->NowMicros() - start_time) * 1.0 / num;
+  fprintf(stderr,
+      "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n",
+      time_per_op, 1.0 / time_per_op, batch_size);
+}
+}  // namespace.
+
+TEST_F(CuckooReaderTest, TestReadPerformance) {
+  if (!FLAGS_enable_perf) {
+    return;
+  }
+  double hash_ratio = 0.95;
+  // These numbers are chosen to have a hash utilization % close to
+  // 0.9, 0.75, 0.6 and 0.5 respectively.
+  // They all create 128 M buckets.
+  std::vector<uint64_t> nums = {120*1024*1024, 100*1024*1024, 80*1024*1024,
+    70*1024*1024};
+#ifndef NDEBUG
+  fprintf(stdout,
+      "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n");
+#endif
+  for (uint64_t num : nums) {
+    if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) {
+      std::vector<std::string> all_keys;
+      GetKeys(num, &all_keys);
+      WriteFile(all_keys, num, hash_ratio);
+    }
+    ReadKeys(num, 0);
+    ReadKeys(num, 10);
+    ReadKeys(num, 25);
+    ReadKeys(num, 50);
+    ReadKeys(num, 100);
+    fprintf(stderr, "\n");
+  }
+}
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
+
+#endif  // GFLAGS.
diff --git a/src/rocksdb/table/filter_block.cc b/src/rocksdb/table/filter_block.cc
deleted file mode 100644
index 3651a7d..0000000
--- a/src/rocksdb/table/filter_block.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "table/filter_block.h"
-
-#include "db/dbformat.h"
-#include "rocksdb/filter_policy.h"
-#include "util/coding.h"
-
-namespace rocksdb {
-
-// See doc/table_format.txt for an explanation of the filter block format.
-
-// Generate new filter every 2KB of data
-static const size_t kFilterBaseLg = 11;
-static const size_t kFilterBase = 1 << kFilterBaseLg;
-
-FilterBlockBuilder::FilterBlockBuilder(const Options& opt,
-                                       const Comparator* internal_comparator)
-    : policy_(opt.filter_policy),
-      prefix_extractor_(opt.prefix_extractor.get()),
-      whole_key_filtering_(opt.whole_key_filtering),
-      comparator_(internal_comparator) {}
-
-void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
-  uint64_t filter_index = (block_offset / kFilterBase);
-  assert(filter_index >= filter_offsets_.size());
-  while (filter_index > filter_offsets_.size()) {
-    GenerateFilter();
-  }
-}
-
-bool FilterBlockBuilder::SamePrefix(const Slice &key1,
-                                    const Slice &key2) const {
-  if (!prefix_extractor_->InDomain(key1) &&
-      !prefix_extractor_->InDomain(key2)) {
-    return true;
-  } else if (!prefix_extractor_->InDomain(key1) ||
-             !prefix_extractor_->InDomain(key2)) {
-    return false;
-  } else {
-    return (prefix_extractor_->Transform(key1) ==
-            prefix_extractor_->Transform(key2));
-  }
-}
-
-void FilterBlockBuilder::AddKey(const Slice& key) {
-  // get slice for most recently added entry
-  Slice prev;
-  size_t added_to_start = 0;
-
-  // add key to filter if needed
-  if (whole_key_filtering_) {
-    start_.push_back(entries_.size());
-    ++added_to_start;
-    entries_.append(key.data(), key.size());
-  }
-
-  if (start_.size() > added_to_start) {
-    size_t prev_start = start_[start_.size() - 1 - added_to_start];
-    const char* base = entries_.data() + prev_start;
-    size_t length = entries_.size() - prev_start;
-    prev = Slice(base, length);
-  }
-
-  // add prefix to filter if needed
-  if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) {
-    // If prefix_extractor_, this filter_block layer assumes we only
-    // operate on internal keys.
-    Slice user_key = ExtractUserKey(key);
-    // this assumes prefix(prefix(key)) == prefix(key), as the last
-    // entry in entries_ may be either a key or prefix, and we use
-    // prefix(last entry) to get the prefix of the last key.
-    if (prev.size() == 0 ||
-        !SamePrefix(user_key, ExtractUserKey(prev))) {
-      Slice prefix = prefix_extractor_->Transform(user_key);
-      InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
-      Slice internal_prefix = internal_prefix_tmp.Encode();
-      start_.push_back(entries_.size());
-      entries_.append(internal_prefix.data(), internal_prefix.size());
-    }
-  }
-}
-
-Slice FilterBlockBuilder::Finish() {
-  if (!start_.empty()) {
-    GenerateFilter();
-  }
-
-  // Append array of per-filter offsets
-  const uint32_t array_offset = result_.size();
-  for (size_t i = 0; i < filter_offsets_.size(); i++) {
-    PutFixed32(&result_, filter_offsets_[i]);
-  }
-
-  PutFixed32(&result_, array_offset);
-  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
-  return Slice(result_);
-}
-
-void FilterBlockBuilder::GenerateFilter() {
-  const size_t num_entries = start_.size();
-  if (num_entries == 0) {
-    // Fast path if there are no keys for this filter
-    filter_offsets_.push_back(result_.size());
-    return;
-  }
-
-  // Make list of keys from flattened key structure
-  start_.push_back(entries_.size());  // Simplify length computation
-  tmp_entries_.resize(num_entries);
-  for (size_t i = 0; i < num_entries; i++) {
-    const char* base = entries_.data() + start_[i];
-    size_t length = start_[i+1] - start_[i];
-    tmp_entries_[i] = Slice(base, length);
-  }
-
-  // Generate filter for current set of keys and append to result_.
-  filter_offsets_.push_back(result_.size());
-  policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_);
-
-  tmp_entries_.clear();
-  entries_.clear();
-  start_.clear();
-}
-
-FilterBlockReader::FilterBlockReader(
-    const Options& opt, const Slice& contents, bool delete_contents_after_use)
-    : policy_(opt.filter_policy),
-      prefix_extractor_(opt.prefix_extractor.get()),
-      whole_key_filtering_(opt.whole_key_filtering),
-      data_(nullptr),
-      offset_(nullptr),
-      num_(0),
-      base_lg_(0) {
-  size_t n = contents.size();
-  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
-  base_lg_ = contents[n-1];
-  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
-  if (last_word > n - 5) return;
-  data_ = contents.data();
-  offset_ = data_ + last_word;
-  num_ = (n - 5 - last_word) / 4;
-  if (delete_contents_after_use) {
-    filter_data.reset(contents.data());
-  }
-}
-
-bool FilterBlockReader::KeyMayMatch(uint64_t block_offset,
-                                    const Slice& key) {
-  if (!whole_key_filtering_) {
-    return true;
-  }
-  return MayMatch(block_offset, key);
-}
-
-bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset,
-                                       const Slice& prefix) {
-  if (!prefix_extractor_) {
-    return true;
-  }
-  return MayMatch(block_offset, prefix);
-}
-
-bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
-  uint64_t index = block_offset >> base_lg_;
-  if (index < num_) {
-    uint32_t start = DecodeFixed32(offset_ + index*4);
-    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
-    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
-      Slice filter = Slice(data_ + start, limit - start);
-      return policy_->KeyMayMatch(entry, filter);
-    } else if (start == limit) {
-      // Empty filters do not match any entries
-      return false;
-    }
-  }
-  return true;  // Errors are treated as potential matches
-}
-
-}
diff --git a/src/rocksdb/table/filter_block.h b/src/rocksdb/table/filter_block.h
index 05c2bb9..855a231 100644
--- a/src/rocksdb/table/filter_block.h
+++ b/src/rocksdb/table/filter_block.h
@@ -10,6 +10,11 @@
 // A filter block is stored near the end of a Table file.  It contains
 // filters (e.g., bloom filters) for all data blocks in the table combined
 // into a single filter block.
+//
+// It is a base class for BlockBasedFilter and FullFilter.
+// These two are both used in BlockBasedTable. The first one contain filter
+// For a part of keys in sst file, the second contain filter for all keys
+// in sst file.
 
 #pragma once
 
@@ -21,10 +26,13 @@
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
 #include "util/hash.h"
+#include "format.h"
 
 namespace rocksdb {
 
+const uint64_t kNotValid = ULLONG_MAX;
 class FilterPolicy;
 
 // A FilterBlockBuilder is used to construct all of the filters for a
@@ -32,61 +40,51 @@ class FilterPolicy;
 // a special block in the Table.
 //
 // The sequence of calls to FilterBlockBuilder must match the regexp:
-//      (StartBlock AddKey*)* Finish
+//      (StartBlock Add*)* Finish
+//
+// BlockBased/Full FilterBlock would be called in the same way.
 class FilterBlockBuilder {
  public:
-  explicit FilterBlockBuilder(const Options& opt,
-                              const Comparator* internal_comparator);
+  explicit FilterBlockBuilder() {}
+  virtual ~FilterBlockBuilder() {}
 
-  void StartBlock(uint64_t block_offset);
-  void AddKey(const Slice& key);
-  Slice Finish();
+  virtual bool IsBlockBased() = 0;                    // If is blockbased filter
+  virtual void StartBlock(uint64_t block_offset) = 0;  // Start new block filter
+  virtual void Add(const Slice& key) = 0;      // Add a key to current filter
+  virtual Slice Finish() = 0;                     // Generate Filter
 
  private:
-  bool SamePrefix(const Slice &key1, const Slice &key2) const;
-  void GenerateFilter();
-
-  // important: all of these might point to invalid addresses
-  // at the time of destruction of this filter block. destructor
-  // should NOT dereference them.
-  const FilterPolicy* policy_;
-  const SliceTransform* prefix_extractor_;
-  bool whole_key_filtering_;
-  const Comparator* comparator_;
-
-  std::string entries_;         // Flattened entry contents
-  std::vector<size_t> start_;   // Starting index in entries_ of each entry
-  std::string result_;          // Filter data computed so far
-  std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument
-  std::vector<uint32_t> filter_offsets_;
-
   // No copying allowed
   FilterBlockBuilder(const FilterBlockBuilder&);
   void operator=(const FilterBlockBuilder&);
 };
 
+// A FilterBlockReader is used to parse filter from SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking
+//
+// BlockBased/Full FilterBlock would be called in the same way.
 class FilterBlockReader {
  public:
- // REQUIRES: "contents" and *policy must stay live while *this is live.
-  FilterBlockReader(
-    const Options& opt,
-    const Slice& contents,
-    bool delete_contents_after_use = false);
-  bool KeyMayMatch(uint64_t block_offset, const Slice& key);
-  bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix);
+  explicit FilterBlockReader() {}
+  virtual ~FilterBlockReader() {}
 
- private:
-  const FilterPolicy* policy_;
-  const SliceTransform* prefix_extractor_;
-  bool whole_key_filtering_;
-  const char* data_;    // Pointer to filter data (at block-start)
-  const char* offset_;  // Pointer to beginning of offset array (at block-end)
-  size_t num_;          // Number of entries in offset array
-  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
-  std::unique_ptr<const char[]> filter_data;
+  virtual bool IsBlockBased() = 0;  // If is blockbased filter
+  virtual bool KeyMayMatch(const Slice& key,
+                           uint64_t block_offset = kNotValid) = 0;
+  virtual bool PrefixMayMatch(const Slice& prefix,
+                              uint64_t block_offset = kNotValid) = 0;
+  virtual size_t ApproximateMemoryUsage() const = 0;
 
+  // convert this object to a human readable form
+  virtual std::string ToString() const {
+    std::string error_msg("Unsupported filter \n");
+    return error_msg;
+  }
 
-  bool MayMatch(uint64_t block_offset, const Slice& entry);
+ private:
+  // No copying allowed
+  FilterBlockReader(const FilterBlockReader&);
+  void operator=(const FilterBlockReader&);
 };
 
-}
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/filter_block_test.cc b/src/rocksdb/table/filter_block_test.cc
deleted file mode 100644
index 1703d59..0000000
--- a/src/rocksdb/table/filter_block_test.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "table/filter_block.h"
-
-#include "rocksdb/filter_policy.h"
-#include "util/coding.h"
-#include "util/hash.h"
-#include "util/logging.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-
-namespace rocksdb {
-
-// For testing: emit an array with one hash value per key
-class TestHashFilter : public FilterPolicy {
- public:
-  virtual const char* Name() const {
-    return "TestHashFilter";
-  }
-
-  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
-    for (int i = 0; i < n; i++) {
-      uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
-      PutFixed32(dst, h);
-    }
-  }
-
-  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
-    uint32_t h = Hash(key.data(), key.size(), 1);
-    for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
-      if (h == DecodeFixed32(filter.data() + i)) {
-        return true;
-      }
-    }
-    return false;
-  }
-};
-
-class FilterBlockTest {
- public:
-  TestHashFilter policy_;
-  Options options_;
-
-  FilterBlockTest() {
-    options_ = Options();
-    options_.filter_policy = &policy_;
-  }
-};
-
-TEST(FilterBlockTest, EmptyBuilder) {
-  FilterBlockBuilder builder(options_, options_.comparator);
-  Slice block = builder.Finish();
-  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
-  FilterBlockReader reader(options_, block);
-  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
-  ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
-}
-
-TEST(FilterBlockTest, SingleChunk) {
-  FilterBlockBuilder builder(options_, options_.comparator);
-  builder.StartBlock(100);
-  builder.AddKey("foo");
-  builder.AddKey("bar");
-  builder.AddKey("box");
-  builder.StartBlock(200);
-  builder.AddKey("box");
-  builder.StartBlock(300);
-  builder.AddKey("hello");
-  Slice block = builder.Finish();
-  FilterBlockReader reader(options_, block);
-  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
-  ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
-  ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
-  ASSERT_TRUE(reader.KeyMayMatch(100, "hello"));
-  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
-  ASSERT_TRUE(! reader.KeyMayMatch(100, "missing"));
-  ASSERT_TRUE(! reader.KeyMayMatch(100, "other"));
-}
-
-TEST(FilterBlockTest, MultiChunk) {
-  FilterBlockBuilder builder(options_, options_.comparator);
-
-  // First filter
-  builder.StartBlock(0);
-  builder.AddKey("foo");
-  builder.StartBlock(2000);
-  builder.AddKey("bar");
-
-  // Second filter
-  builder.StartBlock(3100);
-  builder.AddKey("box");
-
-  // Third filter is empty
-
-  // Last filter
-  builder.StartBlock(9000);
-  builder.AddKey("box");
-  builder.AddKey("hello");
-
-  Slice block = builder.Finish();
-  FilterBlockReader reader(options_, block);
-
-  // Check first filter
-  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
-  ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
-  ASSERT_TRUE(! reader.KeyMayMatch(0, "box"));
-  ASSERT_TRUE(! reader.KeyMayMatch(0, "hello"));
-
-  // Check second filter
-  ASSERT_TRUE(reader.KeyMayMatch(3100, "box"));
-  ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo"));
-  ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar"));
-  ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello"));
-
-  // Check third filter (empty)
-  ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo"));
-  ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar"));
-  ASSERT_TRUE(! reader.KeyMayMatch(4100, "box"));
-  ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello"));
-
-  // Check last filter
-  ASSERT_TRUE(reader.KeyMayMatch(9000, "box"));
-  ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
-  ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo"));
-  ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar"));
-}
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
-}
diff --git a/src/rocksdb/table/flush_block_policy.cc b/src/rocksdb/table/flush_block_policy.cc
index 4e22352..4c12b30 100644
--- a/src/rocksdb/table/flush_block_policy.cc
+++ b/src/rocksdb/table/flush_block_policy.cc
@@ -62,9 +62,11 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
 };
 
 FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
-    const Options& options, const BlockBuilder& data_block_builder) const {
+    const BlockBasedTableOptions& table_options,
+    const BlockBuilder& data_block_builder) const {
   return new FlushBlockBySizePolicy(
-      options.block_size, options.block_size_deviation, data_block_builder);
+      table_options.block_size, table_options.block_size_deviation,
+      data_block_builder);
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/format.cc b/src/rocksdb/table/format.cc
index e9229dc..ccc345f 100644
--- a/src/rocksdb/table/format.cc
+++ b/src/rocksdb/table/format.cc
@@ -12,12 +12,13 @@
 #include <string>
 #include <inttypes.h>
 
-#include "port/port.h"
 #include "rocksdb/env.h"
 #include "table/block.h"
 #include "util/coding.h"
+#include "util/compression.h"
 #include "util/crc32c.h"
 #include "util/perf_context_imp.h"
+#include "util/string_util.h"
 #include "util/xxhash.h"
 
 namespace rocksdb {
@@ -33,6 +34,7 @@ extern const uint64_t kPlainTableMagicNumber;
 const uint64_t kLegacyPlainTableMagicNumber = 0;
 const uint64_t kPlainTableMagicNumber = 0;
 #endif
+const uint32_t DefaultStackBufferSize = 5000;
 
 void BlockHandle::EncodeTo(std::string* dst) const {
   // Sanity check that all fields have been set
@@ -50,8 +52,44 @@ Status BlockHandle::DecodeFrom(Slice* input) {
     return Status::Corruption("bad block handle");
   }
 }
+
+// Return a string that contains the copy of handle.
+std::string BlockHandle::ToString(bool hex) const {
+  std::string handle_str;
+  EncodeTo(&handle_str);
+  if (hex) {
+    std::string result;
+    char buf[10];
+    for (size_t i = 0; i < handle_str.size(); i++) {
+      snprintf(buf, sizeof(buf), "%02X",
+               static_cast<unsigned char>(handle_str[i]));
+      result += buf;
+    }
+    return result;
+  } else {
+    return handle_str;
+  }
+}
+
 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
 
+namespace {
+inline bool IsLegacyFooterFormat(uint64_t magic_number) {
+  return magic_number == kLegacyBlockBasedTableMagicNumber ||
+         magic_number == kLegacyPlainTableMagicNumber;
+}
+inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
+  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
+    return kBlockBasedTableMagicNumber;
+  }
+  if (magic_number == kLegacyPlainTableMagicNumber) {
+    return kPlainTableMagicNumber;
+  }
+  assert(false);
+  return 0;
+}
+}  // namespace
+
 // legacy footer format:
 //    metaindex handle (varint64 offset, varint64 size)
 //    index handle     (varint64 offset, varint64 size)
@@ -65,7 +103,8 @@ const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
 //    footer version (4 bytes)
 //    table_magic_number (8 bytes)
 void Footer::EncodeTo(std::string* dst) const {
-  if (version() == kLegacyFooter) {
+  assert(HasInitializedTableMagicNumber());
+  if (IsLegacyFooterFormat(table_magic_number())) {
     // has to be default checksum with legacy footer
     assert(checksum_ == kCRC32c);
     const size_t original_size = dst->size();
@@ -80,39 +119,24 @@ void Footer::EncodeTo(std::string* dst) const {
     dst->push_back(static_cast<char>(checksum_));
     metaindex_handle_.EncodeTo(dst);
     index_handle_.EncodeTo(dst);
-    dst->resize(original_size + kVersion1EncodedLength - 12);  // Padding
-    PutFixed32(dst, kFooterVersion);
+    dst->resize(original_size + kNewVersionsEncodedLength - 12);  // Padding
+    PutFixed32(dst, version());
     PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
     PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
-    assert(dst->size() == original_size + kVersion1EncodedLength);
-  }
-}
-
-namespace {
-inline bool IsLegacyFooterFormat(uint64_t magic_number) {
-  return magic_number == kLegacyBlockBasedTableMagicNumber ||
-         magic_number == kLegacyPlainTableMagicNumber;
-}
-
-inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
-  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
-    return kBlockBasedTableMagicNumber;
-  }
-  if (magic_number == kLegacyPlainTableMagicNumber) {
-    return kPlainTableMagicNumber;
+    assert(dst->size() == original_size + kNewVersionsEncodedLength);
   }
-  assert(false);
-  return 0;
 }
-}  // namespace
 
-Footer::Footer(uint64_t table_magic_number)
-    : version_(IsLegacyFooterFormat(table_magic_number) ? kLegacyFooter
-                                                        : kFooterVersion),
+Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
+    : version_(_version),
       checksum_(kCRC32c),
-      table_magic_number_(table_magic_number) {}
+      table_magic_number_(_table_magic_number) {
+  // This should be guaranteed by constructor callers
+  assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
+}
 
 Status Footer::DecodeFrom(Slice* input) {
+  assert(!HasInitializedTableMagicNumber());
   assert(input != nullptr);
   assert(input->size() >= kMinEncodedLength);
 
@@ -128,42 +152,29 @@ Status Footer::DecodeFrom(Slice* input) {
   if (legacy) {
     magic = UpconvertLegacyFooterFormat(magic);
   }
-  if (HasInitializedTableMagicNumber()) {
-    if (magic != table_magic_number()) {
-      char buffer[80];
-      snprintf(buffer, sizeof(buffer) - 1,
-               "not an sstable (bad magic number --- %lx)",
-               (long)magic);
-      return Status::InvalidArgument(buffer);
-    }
-  } else {
-    set_table_magic_number(magic);
-  }
+  set_table_magic_number(magic);
 
   if (legacy) {
     // The size is already asserted to be at least kMinEncodedLength
     // at the beginning of the function
     input->remove_prefix(input->size() - kVersion0EncodedLength);
-    version_ = kLegacyFooter;
+    version_ = 0 /* legacy */;
     checksum_ = kCRC32c;
   } else {
     version_ = DecodeFixed32(magic_ptr - 4);
-    if (version_ != kFooterVersion) {
-      return Status::Corruption("bad footer version");
-    }
-    // Footer version 1 will always occupy exactly this many bytes.
+    // Footer version 1 and higher will always occupy exactly this many bytes.
     // It consists of the checksum type, two block handles, padding,
     // a version number, and a magic number
-    if (input->size() < kVersion1EncodedLength) {
-      return Status::InvalidArgument("input is too short to be an sstable");
+    if (input->size() < kNewVersionsEncodedLength) {
+      return Status::Corruption("input is too short to be an sstable");
     } else {
-      input->remove_prefix(input->size() - kVersion1EncodedLength);
+      input->remove_prefix(input->size() - kNewVersionsEncodedLength);
     }
-    uint32_t checksum;
-    if (!GetVarint32(input, &checksum)) {
+    uint32_t chksum;
+    if (!GetVarint32(input, &chksum)) {
       return Status::Corruption("bad checksum type");
     }
-    checksum_ = static_cast<ChecksumType>(checksum);
+    checksum_ = static_cast<ChecksumType>(chksum);
   }
 
   Status result = metaindex_handle_.DecodeFrom(input);
@@ -178,18 +189,39 @@ Status Footer::DecodeFrom(Slice* input) {
   return result;
 }
 
-Status ReadFooterFromFile(RandomAccessFile* file,
-                          uint64_t file_size,
-                          Footer* footer) {
+std::string Footer::ToString() const {
+  std::string result, handle_;
+  result.reserve(1024);
+
+  bool legacy = IsLegacyFooterFormat(table_magic_number_);
+  if (legacy) {
+    result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n  ");
+    result.append("index handle: " + index_handle_.ToString() + "\n  ");
+    result.append("table_magic_number: " +
+                  rocksdb::ToString(table_magic_number_) + "\n  ");
+  } else {
+    result.append("checksum: " + rocksdb::ToString(checksum_) + "\n  ");
+    result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n  ");
+    result.append("index handle: " + index_handle_.ToString() + "\n  ");
+    result.append("footer version: " + rocksdb::ToString(version_) + "\n  ");
+    result.append("table_magic_number: " +
+                  rocksdb::ToString(table_magic_number_) + "\n  ");
+  }
+  return result;
+}
+
+Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size,
+                          Footer* footer, uint64_t enforce_table_magic_number) {
   if (file_size < Footer::kMinEncodedLength) {
-    return Status::InvalidArgument("file is too short to be an sstable");
+    return Status::Corruption("file is too short to be an sstable");
   }
 
   char footer_space[Footer::kMaxEncodedLength];
   Slice footer_input;
-  size_t read_offset = (file_size > Footer::kMaxEncodedLength)
-                           ? (file_size - Footer::kMaxEncodedLength)
-                           : 0;
+  size_t read_offset =
+      (file_size > Footer::kMaxEncodedLength)
+          ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
+          : 0;
   Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
                         footer_space);
   if (!s.ok()) return s;
@@ -197,47 +229,51 @@ Status ReadFooterFromFile(RandomAccessFile* file,
   // Check that we actually read the whole footer from the file. It may be
   // that size isn't correct.
   if (footer_input.size() < Footer::kMinEncodedLength) {
-    return Status::InvalidArgument("file is too short to be an sstable");
+    return Status::Corruption("file is too short to be an sstable");
   }
 
-  return footer->DecodeFrom(&footer_input);
+  s = footer->DecodeFrom(&footer_input);
+  if (!s.ok()) {
+    return s;
+  }
+  if (enforce_table_magic_number != 0 &&
+      enforce_table_magic_number != footer->table_magic_number()) {
+    return Status::Corruption("Bad table magic number");
+  }
+  return Status::OK();
 }
 
-Status ReadBlockContents(RandomAccessFile* file,
-                         const Footer& footer,
-                         const ReadOptions& options,
-                         const BlockHandle& handle,
-                         BlockContents* result,
-                         Env* env,
-                         bool do_uncompress) {
-  result->data = Slice();
-  result->cachable = false;
-  result->heap_allocated = false;
-
-  // Read the block contents as well as the type/crc footer.
-  // See table_builder.cc for the code that built this structure.
+// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
+namespace {
+
+// Read a block and check its CRC
+// contents is the result of reading.
+// According to the implementation of file->Read, contents may not point to buf
+Status ReadBlock(RandomAccessFile* file, const Footer& footer,
+                  const ReadOptions& options, const BlockHandle& handle,
+                  Slice* contents,  /* result of reading */ char* buf) {
   size_t n = static_cast<size_t>(handle.size());
-  char* buf = new char[n + kBlockTrailerSize];
-  Slice contents;
+  Status s;
+
+  {
+    PERF_TIMER_GUARD(block_read_time);
+    s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf);
+  }
 
-  PERF_TIMER_AUTO(block_read_time);
-  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
-  PERF_TIMER_MEASURE(block_read_time);
   PERF_COUNTER_ADD(block_read_count, 1);
   PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
 
   if (!s.ok()) {
-    delete[] buf;
     return s;
   }
-  if (contents.size() != n + kBlockTrailerSize) {
-    delete[] buf;
+  if (contents->size() != n + kBlockTrailerSize) {
     return Status::Corruption("truncated block read");
   }
 
   // Check the crc of the type and the block contents
-  const char* data = contents.data();    // Pointer to where Read put the data
+  const char* data = contents->data();  // Pointer to where Read put the data
   if (options.verify_checksums) {
+    PERF_TIMER_GUARD(block_checksum_time);
     uint32_t value = DecodeFixed32(data + n + 1);
     uint32_t actual = 0;
     switch (footer.checksum()) {
@@ -246,7 +282,7 @@ Status ReadBlockContents(RandomAccessFile* file,
         actual = crc32c::Value(data, n + 1);
         break;
       case kxxHash:
-        actual = XXH32(data, n + 1, 0);
+        actual = XXH32(data, static_cast<int>(n) + 1, 0);
         break;
       default:
         s = Status::Corruption("unknown checksum type");
@@ -255,37 +291,62 @@ Status ReadBlockContents(RandomAccessFile* file,
       s = Status::Corruption("block checksum mismatch");
     }
     if (!s.ok()) {
-      delete[] buf;
       return s;
     }
-    PERF_TIMER_MEASURE(block_checksum_time);
   }
+  return s;
+}
 
-  rocksdb::CompressionType compression_type =
-      static_cast<rocksdb::CompressionType>(data[n]);
-  // If the caller has requested that the block not be uncompressed
-  if (!do_uncompress || compression_type == kNoCompression) {
-    if (data != buf) {
-      // File implementation gave us pointer to some other data.
-      // Use it directly under the assumption that it will be live
-      // while the file is open.
-      delete[] buf;
-      result->data = Slice(data, n);
-      result->heap_allocated = false;
-      result->cachable = false;  // Do not double-cache
-    } else {
-      result->data = Slice(buf, n);
-      result->heap_allocated = true;
-      result->cachable = true;
-    }
-    result->compression_type = compression_type;
-    s = Status::OK();
+}  // namespace
+
+Status ReadBlockContents(RandomAccessFile* file, const Footer& footer,
+                         const ReadOptions& options, const BlockHandle& handle,
+                         BlockContents* contents, Env* env,
+                         bool decompression_requested) {
+  Status status;
+  Slice slice;
+  size_t n = static_cast<size_t>(handle.size());
+  std::unique_ptr<char[]> heap_buf;
+  char stack_buf[DefaultStackBufferSize];
+  char* used_buf = nullptr;
+  rocksdb::CompressionType compression_type;
+
+  if (decompression_requested &&
+      n + kBlockTrailerSize < DefaultStackBufferSize) {
+    // If we've got a small enough hunk of data, read it in to the
+    // trivially allocated stack buffer instead of needing a full malloc()
+    used_buf = &stack_buf[0];
   } else {
-    s = UncompressBlockContents(data, n, result);
-    delete[] buf;
+    heap_buf = std::unique_ptr<char[]>(new char[n + kBlockTrailerSize]);
+    used_buf = heap_buf.get();
   }
-  PERF_TIMER_STOP(block_decompress_time);
-  return s;
+
+  status = ReadBlock(file, footer, options, handle, &slice, used_buf);
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  PERF_TIMER_GUARD(block_decompress_time);
+
+  compression_type = static_cast<rocksdb::CompressionType>(slice.data()[n]);
+
+  if (decompression_requested && compression_type != kNoCompression) {
+    return UncompressBlockContents(slice.data(), n, contents, footer.version());
+  }
+
+  if (slice.data() != used_buf) {
+    *contents = BlockContents(Slice(slice.data(), n), false, compression_type);
+    return status;
+  }
+
+  if (used_buf == &stack_buf[0]) {
+    heap_buf = std::unique_ptr<char[]>(new char[n]);
+    memcpy(heap_buf.get(), stack_buf, n);
+  }
+
+  *contents = BlockContents(std::move(heap_buf), n, true, compression_type);
+  return status;
 }
 
 //
@@ -294,9 +355,11 @@ Status ReadBlockContents(RandomAccessFile* file,
 // contents are uncompresed into this buffer. This
 // buffer is returned via 'result' and it is upto the caller to
 // free this buffer.
+// format_version is the block format as defined in include/rocksdb/table.h
 Status UncompressBlockContents(const char* data, size_t n,
-                               BlockContents* result) {
-  char* ubuf = nullptr;
+                               BlockContents* contents,
+                               uint32_t format_version) {
+  std::unique_ptr<char[]> ubuf;
   int decompress_size = 0;
   assert(data[n] != kNoCompression);
   switch (data[n]) {
@@ -304,67 +367,67 @@ Status UncompressBlockContents(const char* data, size_t n,
       size_t ulength = 0;
       static char snappy_corrupt_msg[] =
         "Snappy not supported or corrupted Snappy compressed block contents";
-      if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) {
+      if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
         return Status::Corruption(snappy_corrupt_msg);
       }
-      ubuf = new char[ulength];
-      if (!port::Snappy_Uncompress(data, n, ubuf)) {
-        delete[] ubuf;
+      ubuf = std::unique_ptr<char[]>(new char[ulength]);
+      if (!Snappy_Uncompress(data, n, ubuf.get())) {
         return Status::Corruption(snappy_corrupt_msg);
       }
-      result->data = Slice(ubuf, ulength);
-      result->heap_allocated = true;
-      result->cachable = true;
+      *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression);
       break;
     }
     case kZlibCompression:
-      ubuf = port::Zlib_Uncompress(data, n, &decompress_size);
-      static char zlib_corrupt_msg[] =
-        "Zlib not supported or corrupted Zlib compressed block contents";
+      ubuf = std::unique_ptr<char[]>(Zlib_Uncompress(
+          data, n, &decompress_size,
+          GetCompressFormatForVersion(kZlibCompression, format_version)));
       if (!ubuf) {
+        static char zlib_corrupt_msg[] =
+          "Zlib not supported or corrupted Zlib compressed block contents";
         return Status::Corruption(zlib_corrupt_msg);
       }
-      result->data = Slice(ubuf, decompress_size);
-      result->heap_allocated = true;
-      result->cachable = true;
+      *contents =
+          BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
       break;
     case kBZip2Compression:
-      ubuf = port::BZip2_Uncompress(data, n, &decompress_size);
-      static char bzip2_corrupt_msg[] =
-        "Bzip2 not supported or corrupted Bzip2 compressed block contents";
+      ubuf = std::unique_ptr<char[]>(BZip2_Uncompress(
+          data, n, &decompress_size,
+          GetCompressFormatForVersion(kBZip2Compression, format_version)));
       if (!ubuf) {
+        static char bzip2_corrupt_msg[] =
+          "Bzip2 not supported or corrupted Bzip2 compressed block contents";
         return Status::Corruption(bzip2_corrupt_msg);
       }
-      result->data = Slice(ubuf, decompress_size);
-      result->heap_allocated = true;
-      result->cachable = true;
+      *contents =
+          BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
       break;
     case kLZ4Compression:
-      ubuf = port::LZ4_Uncompress(data, n, &decompress_size);
-      static char lz4_corrupt_msg[] =
-          "LZ4 not supported or corrupted LZ4 compressed block contents";
+      ubuf = std::unique_ptr<char[]>(LZ4_Uncompress(
+          data, n, &decompress_size,
+          GetCompressFormatForVersion(kLZ4Compression, format_version)));
       if (!ubuf) {
+        static char lz4_corrupt_msg[] =
+          "LZ4 not supported or corrupted LZ4 compressed block contents";
         return Status::Corruption(lz4_corrupt_msg);
       }
-      result->data = Slice(ubuf, decompress_size);
-      result->heap_allocated = true;
-      result->cachable = true;
+      *contents =
+          BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
       break;
     case kLZ4HCCompression:
-      ubuf = port::LZ4_Uncompress(data, n, &decompress_size);
-      static char lz4hc_corrupt_msg[] =
-          "LZ4HC not supported or corrupted LZ4HC compressed block contents";
+      ubuf = std::unique_ptr<char[]>(LZ4_Uncompress(
+          data, n, &decompress_size,
+          GetCompressFormatForVersion(kLZ4HCCompression, format_version)));
       if (!ubuf) {
+        static char lz4hc_corrupt_msg[] =
+          "LZ4HC not supported or corrupted LZ4HC compressed block contents";
         return Status::Corruption(lz4hc_corrupt_msg);
       }
-      result->data = Slice(ubuf, decompress_size);
-      result->heap_allocated = true;
-      result->cachable = true;
+      *contents =
+          BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
       break;
     default:
       return Status::Corruption("bad block type");
   }
-  result->compression_type = kNoCompression;  // not compressed any more
   return Status::OK();
 }
 
diff --git a/src/rocksdb/table/format.h b/src/rocksdb/table/format.h
index a971c1a..900a071 100644
--- a/src/rocksdb/table/format.h
+++ b/src/rocksdb/table/format.h
@@ -33,15 +33,18 @@ class BlockHandle {
 
   // The offset of the block in the file.
   uint64_t offset() const { return offset_; }
-  void set_offset(uint64_t offset) { offset_ = offset; }
+  void set_offset(uint64_t _offset) { offset_ = _offset; }
 
   // The size of the stored block
   uint64_t size() const { return size_; }
-  void set_size(uint64_t size) { size_ = size; }
+  void set_size(uint64_t _size) { size_ = _size; }
 
   void EncodeTo(std::string* dst) const;
   Status DecodeFrom(Slice* input);
 
+  // Return a string that contains the copy of handle.
+  std::string ToString(bool hex = true) const;
+
   // if the block handle's offset and size are both "0", we will view it
   // as a null block handle that points to no where.
   bool IsNull() const {
@@ -62,6 +65,21 @@ class BlockHandle {
   static const BlockHandle kNullBlockHandle;
 };
 
+inline uint32_t GetCompressFormatForVersion(CompressionType compression_type,
+                                            uint32_t version) {
+  // snappy is not versioned
+  assert(compression_type != kSnappyCompression &&
+         compression_type != kNoCompression);
+  // As of version 2, we encode compressed block with
+  // compress_format_version == 2. Before that, the version is 1.
+  // DO NOT CHANGE THIS FUNCTION, it affects disk format
+  return version >= 2 ? 2 : 1;
+}
+
+inline bool BlockBasedTableSupportedVersion(uint32_t version) {
+  return version <= 2;
+}
+
 // Footer encapsulates the fixed information stored at the tail
 // end of every table file.
 class Footer {
@@ -69,12 +87,13 @@ class Footer {
   // Constructs a footer without specifying its table magic number.
   // In such case, the table magic number of such footer should be
   // initialized via @ReadFooterFromFile().
-  Footer() : Footer(kInvalidTableMagicNumber) {}
+  // Use this when you plan to load Footer with DecodeFrom(). Never use this
+  // when you plan to EncodeTo.
+  Footer() : Footer(kInvalidTableMagicNumber, 0) {}
 
-  // @table_magic_number serves two purposes:
-  //  1. Identify different types of the tables.
-  //  2. Help us to identify if a given file is a valid sst.
-  explicit Footer(uint64_t table_magic_number);
+  // Use this constructor when you plan to write out the footer using
+  // EncodeTo(). Never use this constructor with DecodeFrom().
+  Footer(uint64_t table_magic_number, uint32_t version);
 
   // The version of the footer in this file
   uint32_t version() const { return version_; }
@@ -94,20 +113,13 @@ class Footer {
 
   uint64_t table_magic_number() const { return table_magic_number_; }
 
-  // The version of Footer we encode
-  enum {
-    kLegacyFooter = 0,
-    kFooterVersion = 1,
-  };
-
   void EncodeTo(std::string* dst) const;
 
-  // Set the current footer based on the input slice.  If table_magic_number_
-  // is not set (i.e., HasInitializedTableMagicNumber() is true), then this
-  // function will also initialize table_magic_number_.  Otherwise, this
-  // function will verify whether the magic number specified in the input
-  // slice matches table_magic_number_ and update the current footer only
-  // when the test passes.
+  // Set the current footer based on the input slice.
+  //
+  // REQUIRES: table_magic_number_ is not set (i.e.,
+  // HasInitializedTableMagicNumber() is true). The function will initialize the
+  // magic number
   Status DecodeFrom(Slice* input);
 
   // Encoded length of a Footer.  Note that the serialization of a Footer will
@@ -118,17 +130,19 @@ class Footer {
     // Footer version 0 (legacy) will always occupy exactly this many bytes.
     // It consists of two block handles, padding, and a magic number.
     kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
-    // Footer version 1 will always occupy exactly this many bytes.
-    // It consists of the checksum type, two block handles, padding,
-    // a version number, and a magic number
-    kVersion1EncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
-
+    // Footer of versions 1 and higher will always occupy exactly this many
+    // bytes. It consists of the checksum type, two block handles, padding,
+    // a version number (bigger than 1), and a magic number
+    kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
     kMinEncodedLength = kVersion0EncodedLength,
-    kMaxEncodedLength = kVersion1EncodedLength
+    kMaxEncodedLength = kNewVersionsEncodedLength,
   };
 
   static const uint64_t kInvalidTableMagicNumber = 0;
 
+  // convert this object to a human readable form
+  std::string ToString() const;
+
  private:
   // REQUIRES: magic number wasn't initialized.
   void set_table_magic_number(uint64_t magic_number) {
@@ -150,9 +164,11 @@ class Footer {
 };
 
 // Read the footer from file
-Status ReadFooterFromFile(RandomAccessFile* file,
-                          uint64_t file_size,
-                          Footer* footer);
+// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
+// corruption if table_magic number is not equal to enforce_table_magic_number
+Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size,
+                          Footer* footer,
+                          uint64_t enforce_table_magic_number = 0);
 
 // 1-byte type + 32-bit crc
 static const size_t kBlockTrailerSize = 5;
@@ -160,18 +176,29 @@ static const size_t kBlockTrailerSize = 5;
 struct BlockContents {
   Slice data;           // Actual contents of data
   bool cachable;        // True iff data can be cached
-  bool heap_allocated;  // True iff caller should delete[] data.data()
   CompressionType compression_type;
+  std::unique_ptr<char[]> allocation;
+
+  BlockContents() : cachable(false), compression_type(kNoCompression) {}
+
+  BlockContents(const Slice& _data, bool _cachable,
+                CompressionType _compression_type)
+      : data(_data), cachable(_cachable), compression_type(_compression_type) {}
+
+  BlockContents(std::unique_ptr<char[]>&& _data, size_t _size, bool _cachable,
+                CompressionType _compression_type)
+      : data(_data.get(), _size),
+        cachable(_cachable),
+        compression_type(_compression_type),
+        allocation(std::move(_data)) {}
 };
 
 // Read the block identified by "handle" from "file".  On failure
 // return non-OK.  On success fill *result and return OK.
-extern Status ReadBlockContents(RandomAccessFile* file,
-                                const Footer& footer,
+extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer,
                                 const ReadOptions& options,
                                 const BlockHandle& handle,
-                                BlockContents* result,
-                                Env* env,
+                                BlockContents* contents, Env* env,
                                 bool do_uncompress);
 
 // The 'data' points to the raw block contents read in from file.
@@ -179,9 +206,11 @@ extern Status ReadBlockContents(RandomAccessFile* file,
 // contents are uncompresed into this buffer. This buffer is
 // returned via 'result' and it is upto the caller to
 // free this buffer.
-extern Status UncompressBlockContents(const char* data,
-                                      size_t n,
-                                      BlockContents* result);
+// For description of compress_format_version and possible values, see
+// util/compression.h
+extern Status UncompressBlockContents(const char* data, size_t n,
+                                      BlockContents* contents,
+                                      uint32_t compress_format_version);
 
 // Implementation details follow.  Clients should ignore,
 
@@ -190,9 +219,7 @@ inline BlockHandle::BlockHandle()
                   ~static_cast<uint64_t>(0)) {
 }
 
-inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size)
-    : offset_(offset),
-      size_(size) {
-}
+inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
+    : offset_(_offset), size_(_size) {}
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/full_filter_block.cc b/src/rocksdb/table/full_filter_block.cc
new file mode 100644
index 0000000..b3afdac
--- /dev/null
+++ b/src/rocksdb/table/full_filter_block.cc
@@ -0,0 +1,100 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/full_filter_block.h"
+
+#include "rocksdb/filter_policy.h"
+#include "port/port.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+FullFilterBlockBuilder::FullFilterBlockBuilder(
+    const SliceTransform* prefix_extractor, bool whole_key_filtering,
+    FilterBitsBuilder* filter_bits_builder)
+    : prefix_extractor_(prefix_extractor),
+      whole_key_filtering_(whole_key_filtering),
+      num_added_(0) {
+  assert(filter_bits_builder != nullptr);
+  filter_bits_builder_.reset(filter_bits_builder);
+}
+
+void FullFilterBlockBuilder::Add(const Slice& key) {
+  if (whole_key_filtering_) {
+    AddKey(key);
+  }
+  if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
+    AddPrefix(key);
+  }
+}
+
+// Add key to filter if needed
+inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
+  filter_bits_builder_->AddKey(key);
+  num_added_++;
+}
+
+// Add prefix to filter if needed
+inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
+  Slice prefix = prefix_extractor_->Transform(key);
+  filter_bits_builder_->AddKey(prefix);
+  num_added_++;
+}
+
+Slice FullFilterBlockBuilder::Finish() {
+  if (num_added_ != 0) {
+    num_added_ = 0;
+    return filter_bits_builder_->Finish(&filter_data_);
+  }
+  return Slice();
+}
+
+FullFilterBlockReader::FullFilterBlockReader(
+    const SliceTransform* prefix_extractor, bool whole_key_filtering,
+    const Slice& contents, FilterBitsReader* filter_bits_reader)
+    : prefix_extractor_(prefix_extractor),
+      whole_key_filtering_(whole_key_filtering),
+      contents_(contents) {
+  assert(filter_bits_reader != nullptr);
+  filter_bits_reader_.reset(filter_bits_reader);
+}
+
+FullFilterBlockReader::FullFilterBlockReader(
+    const SliceTransform* prefix_extractor, bool whole_key_filtering,
+    BlockContents&& contents, FilterBitsReader* filter_bits_reader)
+    : FullFilterBlockReader(prefix_extractor, whole_key_filtering,
+                            contents.data, filter_bits_reader) {
+  block_contents_ = std::move(contents);
+}
+
+bool FullFilterBlockReader::KeyMayMatch(const Slice& key,
+    uint64_t block_offset) {
+  assert(block_offset == kNotValid);
+  if (!whole_key_filtering_) {
+    return true;
+  }
+  return MayMatch(key);
+}
+
+bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix,
+                                           uint64_t block_offset) {
+  assert(block_offset == kNotValid);
+  if (!prefix_extractor_) {
+    return true;
+  }
+  return MayMatch(prefix);
+}
+
+bool FullFilterBlockReader::MayMatch(const Slice& entry) {
+  if (contents_.size() != 0)  {
+    return filter_bits_reader_->MayMatch(entry);
+  }
+  return true;  // remain the same with block_based filter
+}
+
+size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
+  return contents_.size();
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/full_filter_block.h b/src/rocksdb/table/full_filter_block.h
new file mode 100644
index 0000000..1ecc07a
--- /dev/null
+++ b/src/rocksdb/table/full_filter_block.h
@@ -0,0 +1,111 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "db/dbformat.h"
+#include "util/hash.h"
+#include "table/filter_block.h"
+
+namespace rocksdb {
+
+class FilterPolicy;
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// A FullFilterBlockBuilder is used to construct a full filter for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+// The format of full filter block is:
+// +----------------------------------------------------------------+
+// |              full filter for all keys in sst file              |
+// +----------------------------------------------------------------+
+// The full filter can be very large. At the end of it, we put
+// num_probes: how many hash functions are used in bloom filter
+//
+class FullFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+  explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor,
+                                  bool whole_key_filtering,
+                                  FilterBitsBuilder* filter_bits_builder);
+  // bits_builder is created in filter_policy, it should be passed in here
+  // directly. and be deleted here
+  ~FullFilterBlockBuilder() {}
+
+  virtual bool IsBlockBased() override { return false; }
+  virtual void StartBlock(uint64_t block_offset) override {}
+  virtual void Add(const Slice& key) override;
+  virtual Slice Finish() override;
+
+ private:
+  // important: all of these might point to invalid addresses
+  // at the time of destruction of this filter block. destructor
+  // should NOT dereference them.
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+
+  uint32_t num_added_;
+  std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
+  std::unique_ptr<const char[]> filter_data_;
+
+  void AddKey(const Slice& key);
+  void AddPrefix(const Slice& key);
+
+  // No copying allowed
+  FullFilterBlockBuilder(const FullFilterBlockBuilder&);
+  void operator=(const FullFilterBlockBuilder&);
+};
+
+// A FilterBlockReader is used to parse filter from SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking
+class FullFilterBlockReader : public FilterBlockReader {
+ public:
+  // REQUIRES: "contents" and filter_bits_reader must stay live
+  // while *this is live.
+  explicit FullFilterBlockReader(const SliceTransform* prefix_extractor,
+                                 bool whole_key_filtering,
+                                 const Slice& contents,
+                                 FilterBitsReader* filter_bits_reader);
+  explicit FullFilterBlockReader(const SliceTransform* prefix_extractor,
+                                 bool whole_key_filtering,
+                                 BlockContents&& contents,
+                                 FilterBitsReader* filter_bits_reader);
+
+  // bits_reader is created in filter_policy, it should be passed in here
+  // directly. and be deleted here
+  ~FullFilterBlockReader() {}
+
+  virtual bool IsBlockBased() override { return false; }
+  virtual bool KeyMayMatch(const Slice& key,
+                           uint64_t block_offset = kNotValid) override;
+  virtual bool PrefixMayMatch(const Slice& prefix,
+                              uint64_t block_offset = kNotValid) override;
+  virtual size_t ApproximateMemoryUsage() const override;
+
+ private:
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+
+  std::unique_ptr<FilterBitsReader> filter_bits_reader_;
+  Slice contents_;
+  BlockContents block_contents_;
+  std::unique_ptr<const char[]> filter_data_;
+
+  bool MayMatch(const Slice& entry);
+
+  // No copying allowed
+  FullFilterBlockReader(const FullFilterBlockReader&);
+  void operator=(const FullFilterBlockReader&);
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/full_filter_block_test.cc b/src/rocksdb/table/full_filter_block_test.cc
new file mode 100644
index 0000000..0275a6c
--- /dev/null
+++ b/src/rocksdb/table/full_filter_block_test.cc
@@ -0,0 +1,189 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/full_filter_block.h"
+
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class TestFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+  explicit TestFilterBitsBuilder() {}
+
+  // Add Key to filter
+  virtual void AddKey(const Slice& key) override {
+    hash_entries_.push_back(Hash(key.data(), key.size(), 1));
+  }
+
+  // Generate the filter using the keys that are added
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
+    uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4;
+    char* data = new char[len];
+    for (size_t i = 0; i < hash_entries_.size(); i++) {
+      EncodeFixed32(data + i * 4, hash_entries_[i]);
+    }
+    const char* const_data = data;
+    buf->reset(const_data);
+    return Slice(data, len);
+  }
+
+ private:
+  std::vector<uint32_t> hash_entries_;
+};
+
+class TestFilterBitsReader : public FilterBitsReader {
+ public:
+  explicit TestFilterBitsReader(const Slice& contents)
+      : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {}
+
+  virtual bool MayMatch(const Slice& entry) override {
+    uint32_t h = Hash(entry.data(), entry.size(), 1);
+    for (size_t i = 0; i + 4 <= len_; i += 4) {
+      if (h == DecodeFixed32(data_ + i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  const char* data_;
+  uint32_t len_;
+};
+
+
+class TestHashFilter : public FilterPolicy {
+ public:
+  virtual const char* Name() const override { return "TestHashFilter"; }
+
+  virtual void CreateFilter(const Slice* keys, int n,
+                            std::string* dst) const override {
+    for (int i = 0; i < n; i++) {
+      uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+      PutFixed32(dst, h);
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key,
+                           const Slice& filter) const override {
+    uint32_t h = Hash(key.data(), key.size(), 1);
+    for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+      if (h == DecodeFixed32(filter.data() + i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  virtual FilterBitsBuilder* GetFilterBitsBuilder() const override {
+    return new TestFilterBitsBuilder();
+  }
+
+  virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents)
+      const override {
+    return new TestFilterBitsReader(contents);
+  }
+};
+
+class PluginFullFilterBlockTest : public testing::Test {
+ public:
+  BlockBasedTableOptions table_options_;
+
+  PluginFullFilterBlockTest() {
+    table_options_.filter_policy.reset(new TestHashFilter());
+  }
+};
+
+TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) {
+  FullFilterBlockBuilder builder(
+      nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder());
+  Slice block = builder.Finish();
+  ASSERT_EQ("", EscapeString(block));
+
+  FullFilterBlockReader reader(
+      nullptr, true, block,
+      table_options_.filter_policy->GetFilterBitsReader(block));
+  // Remain same symantic with blockbased filter
+  ASSERT_TRUE(reader.KeyMayMatch("foo"));
+}
+
+TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) {
+  FullFilterBlockBuilder builder(
+      nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder());
+  builder.Add("foo");
+  builder.Add("bar");
+  builder.Add("box");
+  builder.Add("box");
+  builder.Add("hello");
+  Slice block = builder.Finish();
+  FullFilterBlockReader reader(
+      nullptr, true, block,
+      table_options_.filter_policy->GetFilterBitsReader(block));
+  ASSERT_TRUE(reader.KeyMayMatch("foo"));
+  ASSERT_TRUE(reader.KeyMayMatch("bar"));
+  ASSERT_TRUE(reader.KeyMayMatch("box"));
+  ASSERT_TRUE(reader.KeyMayMatch("hello"));
+  ASSERT_TRUE(reader.KeyMayMatch("foo"));
+  ASSERT_TRUE(!reader.KeyMayMatch("missing"));
+  ASSERT_TRUE(!reader.KeyMayMatch("other"));
+}
+
+class FullFilterBlockTest : public testing::Test {
+ public:
+  BlockBasedTableOptions table_options_;
+
+  FullFilterBlockTest() {
+    table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  }
+
+  ~FullFilterBlockTest() {}
+};
+
+TEST_F(FullFilterBlockTest, EmptyBuilder) {
+  FullFilterBlockBuilder builder(
+      nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder());
+  Slice block = builder.Finish();
+  ASSERT_EQ("", EscapeString(block));
+
+  FullFilterBlockReader reader(
+      nullptr, true, block,
+      table_options_.filter_policy->GetFilterBitsReader(block));
+  // Remain same symantic with blockbased filter
+  ASSERT_TRUE(reader.KeyMayMatch("foo"));
+}
+
+TEST_F(FullFilterBlockTest, SingleChunk) {
+  FullFilterBlockBuilder builder(
+      nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder());
+  builder.Add("foo");
+  builder.Add("bar");
+  builder.Add("box");
+  builder.Add("box");
+  builder.Add("hello");
+  Slice block = builder.Finish();
+  FullFilterBlockReader reader(
+      nullptr, true, block,
+      table_options_.filter_policy->GetFilterBitsReader(block));
+  ASSERT_TRUE(reader.KeyMayMatch("foo"));
+  ASSERT_TRUE(reader.KeyMayMatch("bar"));
+  ASSERT_TRUE(reader.KeyMayMatch("box"));
+  ASSERT_TRUE(reader.KeyMayMatch("hello"));
+  ASSERT_TRUE(reader.KeyMayMatch("foo"));
+  ASSERT_TRUE(!reader.KeyMayMatch("missing"));
+  ASSERT_TRUE(!reader.KeyMayMatch("other"));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/get_context.cc b/src/rocksdb/table/get_context.cc
new file mode 100644
index 0000000..e83aa1d
--- /dev/null
+++ b/src/rocksdb/table/get_context.cc
@@ -0,0 +1,119 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/get_context.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "util/perf_context_imp.h"
+#include "util/statistics.h"
+
+namespace rocksdb {
+
+GetContext::GetContext(const Comparator* ucmp,
+                       const MergeOperator* merge_operator, Logger* logger,
+                       Statistics* statistics, GetState init_state,
+                       const Slice& user_key, std::string* ret_value,
+                       bool* value_found, MergeContext* merge_context, Env* env)
+    : ucmp_(ucmp),
+      merge_operator_(merge_operator),
+      logger_(logger),
+      statistics_(statistics),
+      state_(init_state),
+      user_key_(user_key),
+      value_(ret_value),
+      value_found_(value_found),
+      merge_context_(merge_context),
+      env_(env) {}
+
+// Called from TableCache::Get and Table::Get when file/block in which
+// key may exist are not there in TableCache/BlockCache respectively. In this
+// case we can't guarantee that key does not exist and are not permitted to do
+// IO to be certain.Set the status=kFound and value_found=false to let the
+// caller know that key may exist but is not there in memory
+void GetContext::MarkKeyMayExist() {
+  state_ = kFound;
+  if (value_found_ != nullptr) {
+    *value_found_ = false;
+  }
+}
+
+void GetContext::SaveValue(const Slice& value) {
+  state_ = kFound;
+  value_->assign(value.data(), value.size());
+}
+
+bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
+                           const Slice& value) {
+  assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
+         merge_context_ != nullptr);
+  if (ucmp_->Compare(parsed_key.user_key, user_key_) == 0) {
+    // Key matches. Process it
+    switch (parsed_key.type) {
+      case kTypeValue:
+        assert(state_ == kNotFound || state_ == kMerge);
+        if (kNotFound == state_) {
+          state_ = kFound;
+          value_->assign(value.data(), value.size());
+        } else if (kMerge == state_) {
+          assert(merge_operator_ != nullptr);
+          state_ = kFound;
+          bool merge_success = false;
+          {
+            StopWatchNano timer(env_, statistics_ != nullptr);
+            PERF_TIMER_GUARD(merge_operator_time_nanos);
+            merge_success = merge_operator_->FullMerge(
+                user_key_, &value, merge_context_->GetOperands(), value_,
+                logger_);
+            RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
+                       env_ != nullptr ? timer.ElapsedNanos() : 0);
+          }
+          if (!merge_success) {
+            RecordTick(statistics_, NUMBER_MERGE_FAILURES);
+            state_ = kCorrupt;
+          }
+        }
+        return false;
+
+      case kTypeDeletion:
+        assert(state_ == kNotFound || state_ == kMerge);
+        if (kNotFound == state_) {
+          state_ = kDeleted;
+        } else if (kMerge == state_) {
+          state_ = kFound;
+          bool merge_success = false;
+          {
+            StopWatchNano timer(env_, statistics_ != nullptr);
+            PERF_TIMER_GUARD(merge_operator_time_nanos);
+            merge_success = merge_operator_->FullMerge(
+                user_key_, nullptr, merge_context_->GetOperands(), value_,
+                logger_);
+            RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
+                       env_ != nullptr ? timer.ElapsedNanos() : 0);
+          }
+          if (!merge_success) {
+            RecordTick(statistics_, NUMBER_MERGE_FAILURES);
+            state_ = kCorrupt;
+          }
+        }
+        return false;
+
+      case kTypeMerge:
+        assert(state_ == kNotFound || state_ == kMerge);
+        state_ = kMerge;
+        merge_context_->PushOperand(value);
+        return true;
+
+      default:
+        assert(false);
+        break;
+    }
+  }
+
+  // state_ could be Corrupt, merge or notfound
+  return false;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/get_context.h b/src/rocksdb/table/get_context.h
new file mode 100644
index 0000000..700f23a
--- /dev/null
+++ b/src/rocksdb/table/get_context.h
@@ -0,0 +1,49 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#include <string>
+#include "db/merge_context.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+class MergeContext;
+
+class GetContext {
+ public:
+  enum GetState {
+    kNotFound,
+    kFound,
+    kDeleted,
+    kCorrupt,
+    kMerge  // saver contains the current merge result (the operands)
+  };
+
+  GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+             Logger* logger, Statistics* statistics, GetState init_state,
+             const Slice& user_key, std::string* ret_value, bool* value_found,
+             MergeContext* merge_context, Env* env_);
+
+  void MarkKeyMayExist();
+  void SaveValue(const Slice& value);
+  bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value);
+  GetState State() const { return state_; }
+
+ private:
+  const Comparator* ucmp_;
+  const MergeOperator* merge_operator_;
+  // the merge operations encountered;
+  Logger* logger_;
+  Statistics* statistics_;
+
+  GetState state_;
+  Slice user_key_;
+  std::string* value_;
+  bool* value_found_;  // Is value set correctly? Used by KeyMayExist
+  MergeContext* merge_context_;
+  Env* env_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/iterator.cc b/src/rocksdb/table/iterator.cc
index a3d4f63..f97879a 100644
--- a/src/rocksdb/table/iterator.cc
+++ b/src/rocksdb/table/iterator.cc
@@ -8,6 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "rocksdb/iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/arena.h"
 
 namespace rocksdb {
 
@@ -47,15 +49,22 @@ namespace {
 class EmptyIterator : public Iterator {
  public:
   explicit EmptyIterator(const Status& s) : status_(s) { }
-  virtual bool Valid() const { return false; }
-  virtual void Seek(const Slice& target) { }
-  virtual void SeekToFirst() { }
-  virtual void SeekToLast() { }
-  virtual void Next() { assert(false); }
-  virtual void Prev() { assert(false); }
-  Slice key() const { assert(false); return Slice(); }
-  Slice value() const { assert(false); return Slice(); }
-  virtual Status status() const { return status_; }
+  virtual bool Valid() const override { return false; }
+  virtual void Seek(const Slice& target) override {}
+  virtual void SeekToFirst() override {}
+  virtual void SeekToLast() override {}
+  virtual void Next() override { assert(false); }
+  virtual void Prev() override { assert(false); }
+  Slice key() const override {
+    assert(false);
+    return Slice();
+  }
+  Slice value() const override {
+    assert(false);
+    return Slice();
+  }
+  virtual Status status() const override { return status_; }
+
  private:
   Status status_;
 };
@@ -65,8 +74,26 @@ Iterator* NewEmptyIterator() {
   return new EmptyIterator(Status::OK());
 }
 
+Iterator* NewEmptyIterator(Arena* arena) {
+  if (arena == nullptr) {
+    return NewEmptyIterator();
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(EmptyIterator));
+    return new (mem) EmptyIterator(Status::OK());
+  }
+}
+
 Iterator* NewErrorIterator(const Status& status) {
   return new EmptyIterator(status);
 }
 
+Iterator* NewErrorIterator(const Status& status, Arena* arena) {
+  if (arena == nullptr) {
+    return NewErrorIterator(status);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(EmptyIterator));
+    return new (mem) EmptyIterator(status);
+  }
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/iterator_wrapper.h b/src/rocksdb/table/iterator_wrapper.h
index cb8520b..d64047b 100644
--- a/src/rocksdb/table/iterator_wrapper.h
+++ b/src/rocksdb/table/iterator_wrapper.h
@@ -8,6 +8,9 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+
+#include "rocksdb/iterator.h"
+
 namespace rocksdb {
 
 // A internal wrapper class with an interface similar to Iterator that
@@ -17,17 +20,15 @@ namespace rocksdb {
 class IteratorWrapper {
  public:
   IteratorWrapper(): iter_(nullptr), valid_(false) { }
-  explicit IteratorWrapper(Iterator* iter): iter_(nullptr) {
-    Set(iter);
-  }
-  ~IteratorWrapper() { delete iter_; }
+  explicit IteratorWrapper(Iterator* _iter) : iter_(nullptr) { Set(_iter); }
+  ~IteratorWrapper() {}
   Iterator* iter() const { return iter_; }
 
   // Takes ownership of "iter" and will delete it when destroyed, or
   // when Set() is invoked again.
-  void Set(Iterator* iter) {
+  void Set(Iterator* _iter) {
     delete iter_;
-    iter_ = iter;
+    iter_ = _iter;
     if (iter_ == nullptr) {
       valid_ = false;
     } else {
@@ -35,6 +36,13 @@ class IteratorWrapper {
     }
   }
 
+  void DeleteIter(bool is_arena_mode) {
+    if (!is_arena_mode) {
+      delete iter_;
+    } else {
+      iter_->~Iterator();
+    }
+  }
 
   // Iterator interface methods
   bool Valid() const        { return valid_; }
@@ -61,4 +69,11 @@ class IteratorWrapper {
   Slice key_;
 };
 
+class Arena;
+// Return an empty iterator (yields nothing) allocated from arena.
+extern Iterator* NewEmptyIterator(Arena* arena);
+
+// Return an empty iterator with the specified status, allocated arena.
+extern Iterator* NewErrorIterator(const Status& status, Arena* arena);
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/merger.cc b/src/rocksdb/table/merger.cc
index b829f71..b418b88 100644
--- a/src/rocksdb/table/merger.cc
+++ b/src/rocksdb/table/merger.cc
@@ -17,42 +17,45 @@
 #include "rocksdb/options.h"
 #include "table/iter_heap.h"
 #include "table/iterator_wrapper.h"
+#include "util/arena.h"
 #include "util/stop_watch.h"
 #include "util/perf_context_imp.h"
+#include "util/autovector.h"
 
 namespace rocksdb {
+// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
 namespace {
+typedef std::priority_queue<IteratorWrapper*, std::vector<IteratorWrapper*>,
+                            MaxIteratorComparator> MergerMaxIterHeap;
 
-typedef std::priority_queue<
-          IteratorWrapper*,
-          std::vector<IteratorWrapper*>,
-          MaxIteratorComparator> MaxIterHeap;
-
-typedef std::priority_queue<
-          IteratorWrapper*,
-          std::vector<IteratorWrapper*>,
-          MinIteratorComparator> MinIterHeap;
+typedef std::priority_queue<IteratorWrapper*, std::vector<IteratorWrapper*>,
+                            MinIteratorComparator> MergerMinIterHeap;
 
 // Return's a new MaxHeap of IteratorWrapper's using the provided Comparator.
-MaxIterHeap NewMaxIterHeap(const Comparator* comparator) {
-  return MaxIterHeap(MaxIteratorComparator(comparator));
+MergerMaxIterHeap NewMergerMaxIterHeap(const Comparator* comparator) {
+  return MergerMaxIterHeap(MaxIteratorComparator(comparator));
 }
 
 // Return's a new MinHeap of IteratorWrapper's using the provided Comparator.
-MinIterHeap NewMinIterHeap(const Comparator* comparator) {
-  return MinIterHeap(MinIteratorComparator(comparator));
+MergerMinIterHeap NewMergerMinIterHeap(const Comparator* comparator) {
+  return MergerMinIterHeap(MinIteratorComparator(comparator));
 }
+}  // namespace
+
+const size_t kNumIterReserve = 4;
 
 class MergingIterator : public Iterator {
  public:
-  MergingIterator(const Comparator* comparator, Iterator** children, int n)
-      : comparator_(comparator),
-        children_(n),
+  MergingIterator(const Comparator* comparator, Iterator** children, int n,
+                  bool is_arena_mode)
+      : is_arena_mode_(is_arena_mode),
+        comparator_(comparator),
         current_(nullptr),
         use_heap_(true),
         direction_(kForward),
-        maxHeap_(NewMaxIterHeap(comparator_)),
-        minHeap_(NewMinIterHeap(comparator_)) {
+        maxHeap_(NewMergerMaxIterHeap(comparator_)),
+        minHeap_(NewMergerMinIterHeap(comparator_)) {
+    children_.resize(n);
     for (int i = 0; i < n; i++) {
       children_[i].Set(children[i]);
     }
@@ -63,13 +66,24 @@ class MergingIterator : public Iterator {
     }
   }
 
-  virtual ~MergingIterator() { }
+  virtual void AddIterator(Iterator* iter) {
+    assert(direction_ == kForward);
+    children_.emplace_back(iter);
+    auto new_wrapper = children_.back();
+    if (new_wrapper.Valid()) {
+      minHeap_.push(&new_wrapper);
+    }
+  }
 
-  virtual bool Valid() const {
-    return (current_ != nullptr);
+  virtual ~MergingIterator() {
+    for (auto& child : children_) {
+      child.DeleteIter(is_arena_mode_);
+    }
   }
 
-  virtual void SeekToFirst() {
+  virtual bool Valid() const override { return (current_ != nullptr); }
+
+  virtual void SeekToFirst() override {
     ClearHeaps();
     for (auto& child : children_) {
       child.SeekToFirst();
@@ -81,7 +95,7 @@ class MergingIterator : public Iterator {
     direction_ = kForward;
   }
 
-  virtual void SeekToLast() {
+  virtual void SeekToLast() override {
     ClearHeaps();
     for (auto& child : children_) {
       child.SeekToLast();
@@ -93,16 +107,16 @@ class MergingIterator : public Iterator {
     direction_ = kReverse;
   }
 
-  virtual void Seek(const Slice& target) {
+  virtual void Seek(const Slice& target) override {
     // Invalidate the heap.
     use_heap_ = false;
     IteratorWrapper* first_child = nullptr;
-    PERF_TIMER_DECLARE();
 
     for (auto& child : children_) {
-      PERF_TIMER_START(seek_child_seek_time);
-      child.Seek(target);
-      PERF_TIMER_STOP(seek_child_seek_time);
+      {
+        PERF_TIMER_GUARD(seek_child_seek_time);
+        child.Seek(target);
+      }
       PERF_COUNTER_ADD(seek_child_seek_count, 1);
 
       if (child.Valid()) {
@@ -115,24 +129,21 @@ class MergingIterator : public Iterator {
           } else {
             // We have more than one children with valid keys. Initialize
             // the heap and put the first child into the heap.
-            PERF_TIMER_START(seek_min_heap_time);
+            PERF_TIMER_GUARD(seek_min_heap_time);
             ClearHeaps();
             minHeap_.push(first_child);
-            PERF_TIMER_STOP(seek_min_heap_time);
           }
         }
         if (use_heap_) {
-          PERF_TIMER_START(seek_min_heap_time);
+          PERF_TIMER_GUARD(seek_min_heap_time);
           minHeap_.push(&child);
-          PERF_TIMER_STOP(seek_min_heap_time);
         }
       }
     }
     if (use_heap_) {
       // If heap is valid, need to put the smallest key to curent_.
-      PERF_TIMER_START(seek_min_heap_time);
+      PERF_TIMER_GUARD(seek_min_heap_time);
       FindSmallest();
-      PERF_TIMER_STOP(seek_min_heap_time);
     } else {
       // The heap is not valid, then the current_ iterator is the first
       // one, or null if there is no first child.
@@ -141,7 +152,7 @@ class MergingIterator : public Iterator {
     direction_ = kForward;
   }
 
-  virtual void Next() {
+  virtual void Next() override {
     assert(Valid());
 
     // Ensure that all children are positioned after key().
@@ -179,7 +190,7 @@ class MergingIterator : public Iterator {
     }
   }
 
-  virtual void Prev() {
+  virtual void Prev() override {
     assert(Valid());
     // Ensure that all children are positioned before key().
     // If we are moving in the reverse direction, it is already
@@ -213,25 +224,25 @@ class MergingIterator : public Iterator {
     FindLargest();
   }
 
-  virtual Slice key() const {
+  virtual Slice key() const override {
     assert(Valid());
     return current_->key();
   }
 
-  virtual Slice value() const {
+  virtual Slice value() const override {
     assert(Valid());
     return current_->value();
   }
 
-  virtual Status status() const {
-    Status status;
+  virtual Status status() const override {
+    Status s;
     for (auto& child : children_) {
-      status = child.status();
-      if (!status.ok()) {
+      s = child.status();
+      if (!s.ok()) {
         break;
       }
     }
-    return status;
+    return s;
   }
 
  private:
@@ -239,8 +250,9 @@ class MergingIterator : public Iterator {
   void FindLargest();
   void ClearHeaps();
 
+  bool is_arena_mode_;
   const Comparator* comparator_;
-  std::vector<IteratorWrapper> children_;
+  autovector<IteratorWrapper, kNumIterReserve> children_;
   IteratorWrapper* current_;
   // If the value is true, both of iterators in the heap and current_
   // contain valid rows. If it is false, only current_ can possibly contain
@@ -254,8 +266,8 @@ class MergingIterator : public Iterator {
     kReverse
   };
   Direction direction_;
-  MaxIterHeap maxHeap_;
-  MinIterHeap minHeap_;
+  MergerMaxIterHeap maxHeap_;
+  MergerMinIterHeap minHeap_;
 };
 
 void MergingIterator::FindSmallest() {
@@ -282,19 +294,54 @@ void MergingIterator::FindLargest() {
 
 void MergingIterator::ClearHeaps() {
   use_heap_ = true;
-  maxHeap_ = NewMaxIterHeap(comparator_);
-  minHeap_ = NewMinIterHeap(comparator_);
+  maxHeap_ = NewMergerMaxIterHeap(comparator_);
+  minHeap_ = NewMergerMinIterHeap(comparator_);
 }
-}  // namespace
 
-Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n,
+                             Arena* arena) {
   assert(n >= 0);
   if (n == 0) {
-    return NewEmptyIterator();
+    return NewEmptyIterator(arena);
   } else if (n == 1) {
     return list[0];
   } else {
-    return new MergingIterator(cmp, list, n);
+    if (arena == nullptr) {
+      return new MergingIterator(cmp, list, n, false);
+    } else {
+      auto mem = arena->AllocateAligned(sizeof(MergingIterator));
+      return new (mem) MergingIterator(cmp, list, n, true);
+    }
+  }
+}
+
+MergeIteratorBuilder::MergeIteratorBuilder(const Comparator* comparator,
+                                           Arena* a)
+    : first_iter(nullptr), use_merging_iter(false), arena(a) {
+
+  auto mem = arena->AllocateAligned(sizeof(MergingIterator));
+  merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true);
+}
+
+void MergeIteratorBuilder::AddIterator(Iterator* iter) {
+  if (!use_merging_iter && first_iter != nullptr) {
+    merge_iter->AddIterator(first_iter);
+    use_merging_iter = true;
+  }
+  if (use_merging_iter) {
+    merge_iter->AddIterator(iter);
+  } else {
+    first_iter = iter;
+  }
+}
+
+Iterator* MergeIteratorBuilder::Finish() {
+  if (!use_merging_iter) {
+    return first_iter;
+  } else {
+    auto ret = merge_iter;
+    merge_iter = nullptr;
+    return ret;
   }
 }
 
diff --git a/src/rocksdb/table/merger.h b/src/rocksdb/table/merger.h
index 3a1a4fe..7dcf2af 100644
--- a/src/rocksdb/table/merger.h
+++ b/src/rocksdb/table/merger.h
@@ -9,11 +9,14 @@
 
 #pragma once
 
+#include "rocksdb/types.h"
+
 namespace rocksdb {
 
 class Comparator;
 class Iterator;
 class Env;
+class Arena;
 
 // Return an iterator that provided the union of the data in
 // children[0,n-1].  Takes ownership of the child iterators and
@@ -24,6 +27,34 @@ class Env;
 //
 // REQUIRES: n >= 0
 extern Iterator* NewMergingIterator(const Comparator* comparator,
-                                    Iterator** children, int n);
+                                    Iterator** children, int n,
+                                    Arena* arena = nullptr);
+
+class MergingIterator;
+
+// A builder class to build a merging iterator by adding iterators one by one.
+class MergeIteratorBuilder {
+ public:
+  // comparator: the comparator used in merging comparator
+  // arena: where the merging iterator needs to be allocated from.
+  explicit MergeIteratorBuilder(const Comparator* comparator, Arena* arena);
+  ~MergeIteratorBuilder() {}
+
+  // Add iter to the merging iterator.
+  void AddIterator(Iterator* iter);
+
+  // Get arena used to build the merging iterator. It is called one a child
+  // iterator needs to be allocated.
+  Arena* GetArena() { return arena; }
+
+  // Return the result merging iterator.
+  Iterator* Finish();
+
+ private:
+  MergingIterator* merge_iter;
+  Iterator* first_iter;
+  bool use_merging_iter;
+  Arena* arena;
+};
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/merger_test.cc b/src/rocksdb/table/merger_test.cc
new file mode 100644
index 0000000..1085ce4
--- /dev/null
+++ b/src/rocksdb/table/merger_test.cc
@@ -0,0 +1,201 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+#include "rocksdb/iterator.h"
+#include "table/merger.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class VectorIterator : public Iterator {
+ public:
+  explicit VectorIterator(const std::vector<std::string>& keys)
+      : keys_(keys), current_(keys.size()) {
+    std::sort(keys_.begin(), keys_.end());
+  }
+
+  virtual bool Valid() const override { return current_ < keys_.size(); }
+
+  virtual void SeekToFirst() override { current_ = 0; }
+  virtual void SeekToLast() override { current_ = keys_.size() - 1; }
+
+  virtual void Seek(const Slice& target) override {
+    current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
+               keys_.begin();
+  }
+
+  virtual void Next() override { current_++; }
+  virtual void Prev() override { current_--; }
+
+  virtual Slice key() const override { return Slice(keys_[current_]); }
+  virtual Slice value() const override { return Slice(); }
+
+  virtual Status status() const override { return Status::OK(); }
+
+ private:
+  std::vector<std::string> keys_;
+  size_t current_;
+};
+
+class MergerTest : public testing::Test {
+ public:
+  MergerTest()
+      : rnd_(3), merging_iterator_(nullptr), single_iterator_(nullptr) {}
+  ~MergerTest() = default;
+  std::vector<std::string> GenerateStrings(size_t len, int string_len) {
+    std::vector<std::string> ret;
+    for (size_t i = 0; i < len; ++i) {
+      ret.push_back(test::RandomHumanReadableString(&rnd_, string_len));
+    }
+    return ret;
+  }
+
+  void AssertEquivalence() {
+    auto a = merging_iterator_.get();
+    auto b = single_iterator_.get();
+    if (!a->Valid()) {
+      ASSERT_TRUE(!b->Valid());
+    } else {
+      ASSERT_TRUE(b->Valid());
+      ASSERT_EQ(b->key().ToString(), a->key().ToString());
+      ASSERT_EQ(b->value().ToString(), a->value().ToString());
+    }
+  }
+
+  void SeekToRandom() { Seek(test::RandomHumanReadableString(&rnd_, 5)); }
+
+  void Seek(std::string target) {
+    merging_iterator_->Seek(target);
+    single_iterator_->Seek(target);
+  }
+
+  void SeekToFirst() {
+    merging_iterator_->SeekToFirst();
+    single_iterator_->SeekToFirst();
+  }
+
+  void SeekToLast() {
+    merging_iterator_->SeekToLast();
+    single_iterator_->SeekToLast();
+  }
+
+  void Next(int times) {
+    for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+      AssertEquivalence();
+      merging_iterator_->Next();
+      single_iterator_->Next();
+    }
+    AssertEquivalence();
+  }
+
+  void Prev(int times) {
+    for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+      AssertEquivalence();
+      merging_iterator_->Prev();
+      single_iterator_->Prev();
+    }
+    AssertEquivalence();
+  }
+
+  void NextAndPrev(int times) {
+    for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+      AssertEquivalence();
+      if (rnd_.OneIn(2)) {
+        merging_iterator_->Prev();
+        single_iterator_->Prev();
+      } else {
+        merging_iterator_->Next();
+        single_iterator_->Next();
+      }
+    }
+    AssertEquivalence();
+  }
+
+  void Generate(size_t num_iterators, size_t strings_per_iterator,
+                int letters_per_string) {
+    std::vector<Iterator*> small_iterators;
+    for (size_t i = 0; i < num_iterators; ++i) {
+      auto strings = GenerateStrings(strings_per_iterator, letters_per_string);
+      small_iterators.push_back(new VectorIterator(strings));
+      all_keys_.insert(all_keys_.end(), strings.begin(), strings.end());
+    }
+
+    merging_iterator_.reset(
+        NewMergingIterator(BytewiseComparator(), &small_iterators[0],
+                           static_cast<int>(small_iterators.size())));
+    single_iterator_.reset(new VectorIterator(all_keys_));
+  }
+
+  Random rnd_;
+  std::unique_ptr<Iterator> merging_iterator_;
+  std::unique_ptr<Iterator> single_iterator_;
+  std::vector<std::string> all_keys_;
+};
+
+TEST_F(MergerTest, SeekToRandomNextTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    Next(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToRandomNextSmallStringsTest) {
+  Generate(1000, 50, 2);
+  for (int i = 0; i < 10; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    Next(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToRandomPrevTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    Prev(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToRandomRandomTest) {
+  Generate(200, 50, 50);
+  for (int i = 0; i < 3; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    NextAndPrev(5000);
+  }
+}
+
+TEST_F(MergerTest, SeekToFirstTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToFirst();
+    AssertEquivalence();
+    Next(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToLastTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToLast();
+    AssertEquivalence();
+    Prev(50000);
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/meta_blocks.cc b/src/rocksdb/table/meta_blocks.cc
index f28b44d..6fad808 100644
--- a/src/rocksdb/table/meta_blocks.cc
+++ b/src/rocksdb/table/meta_blocks.cc
@@ -7,18 +7,18 @@
 #include <map>
 #include <string>
 
+#include "db/table_properties_collector.h"
+#include "table/block.h"
 #include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
-#include "table/block.h"
 #include "table/format.h"
+#include "table/table_properties_internal.h"
 #include "util/coding.h"
 
 namespace rocksdb {
 
 MetaIndexBuilder::MetaIndexBuilder()
-    : meta_index_block_(
-        new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
-}
+    : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
 
 void MetaIndexBuilder::Add(const std::string& key,
                            const BlockHandle& handle) {
@@ -35,9 +35,7 @@ Slice MetaIndexBuilder::Finish() {
 }
 
 PropertyBlockBuilder::PropertyBlockBuilder()
-  : properties_block_(
-      new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
-}
+    : properties_block_(new BlockBuilder(1 /* restart interval */)) {}
 
 void PropertyBlockBuilder::Add(const std::string& name,
                                const std::string& val) {
@@ -90,19 +88,18 @@ void LogPropertiesCollectionError(
   assert(method == "Add" || method == "Finish");
 
   std::string msg =
-    "[Warning] encountered error when calling TablePropertiesCollector::" +
+    "Encountered error when calling TablePropertiesCollector::" +
     method + "() with collector name: " + name;
-  Log(info_log, "%s", msg.c_str());
+  Log(InfoLogLevel::ERROR_LEVEL, info_log, "%s", msg.c_str());
 }
 
 bool NotifyCollectTableCollectorsOnAdd(
-    const Slice& key,
-    const Slice& value,
-    const Options::TablePropertiesCollectors& collectors,
+    const Slice& key, const Slice& value, uint64_t file_size,
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
     Logger* info_log) {
   bool all_succeeded = true;
-  for (auto collector : collectors) {
-    Status s = collector->Add(key, value);
+  for (auto& collector : collectors) {
+    Status s = collector->InternalAdd(key, value, file_size);
     all_succeeded = all_succeeded && s.ok();
     if (!s.ok()) {
       LogPropertiesCollectionError(info_log, "Add" /* method */,
@@ -113,11 +110,10 @@ bool NotifyCollectTableCollectorsOnAdd(
 }
 
 bool NotifyCollectTableCollectorsOnFinish(
-    const Options::TablePropertiesCollectors& collectors,
-    Logger* info_log,
-    PropertyBlockBuilder* builder) {
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder) {
   bool all_succeeded = true;
-  for (auto collector : collectors) {
+  for (auto& collector : collectors) {
     UserCollectedProperties user_collected_properties;
     Status s = collector->Finish(&user_collected_properties);
 
@@ -147,14 +143,15 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
   BlockContents block_contents;
   ReadOptions read_options;
   read_options.verify_checksums = false;
-  Status s = ReadBlockContents(file, footer, read_options, handle,
-                               &block_contents, env, false);
+  Status s;
+  s = ReadBlockContents(file, footer, read_options, handle, &block_contents,
+                        env, false);
 
   if (!s.ok()) {
     return s;
   }
 
-  Block properties_block(block_contents);
+  Block properties_block(std::move(block_contents));
   std::unique_ptr<Iterator> iter(
       properties_block.NewIterator(BytewiseComparator()));
 
@@ -197,9 +194,9 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
       if (!GetVarint64(&raw_val, &val)) {
         // skip malformed value
         auto error_msg =
-          "[Warning] detect malformed value in properties meta-block:"
+          "Detect malformed value in properties meta-block:"
           "\tkey: " + key + "\tval: " + raw_val.ToString();
-        Log(logger, "%s", error_msg.c_str());
+        Log(InfoLogLevel::ERROR_LEVEL, logger, "%s", error_msg.c_str());
         continue;
       }
       *(pos->second) = val;
@@ -224,8 +221,8 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
                            uint64_t table_magic_number, Env* env,
                            Logger* info_log, TableProperties** properties) {
   // -- Read metaindex block
-  Footer footer(table_magic_number);
-  auto s = ReadFooterFromFile(file, file_size, &footer);
+  Footer footer;
+  auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number);
   if (!s.ok()) {
     return s;
   }
@@ -239,7 +236,7 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
   if (!s.ok()) {
     return s;
   }
-  Block metaindex_block(metaindex_contents);
+  Block metaindex_block(std::move(metaindex_contents));
   std::unique_ptr<Iterator> meta_iter(
       metaindex_block.NewIterator(BytewiseComparator()));
 
@@ -255,12 +252,90 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
     s = ReadProperties(meta_iter->value(), file, footer, env, info_log,
                        properties);
   } else {
-    s = Status::Corruption("Unable to read the property block.");
-    Log(WARN_LEVEL, info_log,
-        "Cannot find Properties block from file.");
+    s = Status::NotFound();
   }
 
   return s;
 }
 
+Status FindMetaBlock(Iterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle) {
+  meta_index_iter->Seek(meta_block_name);
+  if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&
+      meta_index_iter->key() == meta_block_name) {
+    Slice v = meta_index_iter->value();
+    return block_handle->DecodeFrom(&v);
+  } else {
+    return Status::Corruption("Cannot find the meta block", meta_block_name);
+  }
+}
+
+Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
+                     uint64_t table_magic_number, Env* env,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle) {
+  Footer footer;
+  auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number);
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  s = ReadBlockContents(file, footer, read_options, metaindex_handle,
+                        &metaindex_contents, env, false);
+  if (!s.ok()) {
+    return s;
+  }
+  Block metaindex_block(std::move(metaindex_contents));
+
+  std::unique_ptr<Iterator> meta_iter;
+  meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
+
+  return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
+}
+
+Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
+                     uint64_t table_magic_number, Env* env,
+                     const std::string& meta_block_name,
+                     BlockContents* contents) {
+  Status status;
+  Footer footer;
+  status = ReadFooterFromFile(file, file_size, &footer, table_magic_number);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Reading metaindex block
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  status = ReadBlockContents(file, footer, read_options, metaindex_handle,
+                             &metaindex_contents, env, false);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Finding metablock
+  Block metaindex_block(std::move(metaindex_contents));
+
+  std::unique_ptr<Iterator> meta_iter;
+  meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
+
+  BlockHandle block_handle;
+  status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Reading metablock
+  return ReadBlockContents(file, footer, read_options, block_handle, contents,
+                           env, false);
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/meta_blocks.h b/src/rocksdb/table/meta_blocks.h
index 2ac8903..7ac3cb0 100644
--- a/src/rocksdb/table/meta_blocks.h
+++ b/src/rocksdb/table/meta_blocks.h
@@ -6,14 +6,16 @@
 
 #include <map>
 #include <memory>
+#include <vector>
 #include <string>
 
 #include "db/builder.h"
+#include "db/table_properties_collector.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
-#include "rocksdb/table_properties.h"
 #include "table/block_builder.h"
+#include "table/format.h"
 
 namespace rocksdb {
 
@@ -91,17 +93,15 @@ void LogPropertiesCollectionError(
 // NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
 // property collectors.
 bool NotifyCollectTableCollectorsOnAdd(
-    const Slice& key,
-    const Slice& value,
-    const Options::TablePropertiesCollectors& collectors,
+    const Slice& key, const Slice& value, uint64_t file_size,
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
     Logger* info_log);
 
 // NotifyCollectTableCollectorsOnAdd() triggers the `Finish` event for all
 // property collectors. The collected properties will be added to `builder`.
 bool NotifyCollectTableCollectorsOnFinish(
-    const Options::TablePropertiesCollectors& collectors,
-    Logger* info_log,
-    PropertyBlockBuilder* builder);
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder);
 
 // Read the properties from the table.
 // @returns a status to indicate if the operation succeeded. On success,
@@ -119,9 +119,24 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
                            uint64_t table_magic_number, Env* env,
                            Logger* info_log, TableProperties** properties);
 
-// Seek to the properties block.
-// If it successfully seeks to the properties block, "is_found" will be
-// set to true.
-extern Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found);
+
+// Find the meta block from the meta index block.
+Status FindMetaBlock(Iterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle);
+
+// Find the meta block
+Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
+                     uint64_t table_magic_number, Env* env,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle);
+
+// Read the specified meta block with name meta_block_name
+// from `file` and initialize `contents` with contents of this block.
+// Return Status::OK in case of success.
+Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
+                     uint64_t table_magic_number, Env* env,
+                     const std::string& meta_block_name,
+                     BlockContents* contents);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/mock_table.cc b/src/rocksdb/table/mock_table.cc
new file mode 100644
index 0000000..90e2079
--- /dev/null
+++ b/src/rocksdb/table/mock_table.cc
@@ -0,0 +1,114 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/table_properties.h"
+#include "table/mock_table.h"
+#include "table/get_context.h"
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+namespace mock {
+
+Iterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena) {
+  return new MockTableIterator(table_);
+}
+
+Status MockTableReader::Get(const ReadOptions&, const Slice& key,
+                            GetContext* get_context) {
+  std::unique_ptr<MockTableIterator> iter(new MockTableIterator(table_));
+  for (iter->Seek(key); iter->Valid(); iter->Next()) {
+    ParsedInternalKey parsed_key;
+    if (!ParseInternalKey(iter->key(), &parsed_key)) {
+      return Status::Corruption(Slice());
+    }
+
+    if (!get_context->SaveValue(parsed_key, iter->value())) {
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties()
+    const {
+  return std::shared_ptr<const TableProperties>(new TableProperties());
+}
+
+MockTableFactory::MockTableFactory() : next_id_(1) {}
+
+Status MockTableFactory::NewTableReader(
+    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+    const InternalKeyComparator& internal_key,
+    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) const {
+  uint32_t id = GetIDFromFile(file.get());
+
+  MutexLock lock_guard(&file_system_.mutex);
+
+  auto it = file_system_.files.find(id);
+  if (it == file_system_.files.end()) {
+    return Status::IOError("Mock file not found");
+  }
+
+  table_reader->reset(new MockTableReader(it->second));
+
+  return Status::OK();
+}
+
+TableBuilder* MockTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFile* file) const {
+  uint32_t id = GetAndWriteNextID(file);
+
+  return new MockTableBuilder(id, &file_system_);
+}
+
+Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname,
+                                         MockFileContents file_contents) {
+  std::unique_ptr<WritableFile> file;
+  auto s = env->NewWritableFile(fname, &file, EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  uint32_t id = GetAndWriteNextID(file.get());
+  file_system_.files.insert({id, std::move(file_contents)});
+  return Status::OK();
+}
+
+uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const {
+  uint32_t next_id = next_id_.fetch_add(1);
+  char buf[4];
+  EncodeFixed32(buf, next_id);
+  file->Append(Slice(buf, 4));
+  return next_id;
+}
+
+uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const {
+  char buf[4];
+  Slice result;
+  file->Read(0, 4, &result, buf);
+  assert(result.size() == 4);
+  return DecodeFixed32(buf);
+}
+
+void MockTableFactory::AssertSingleFile(const MockFileContents& file_contents) {
+  ASSERT_EQ(file_system_.files.size(), 1U);
+  ASSERT_TRUE(file_contents == file_system_.files.begin()->second);
+}
+
+void MockTableFactory::AssertLatestFile(const MockFileContents& file_contents) {
+  ASSERT_GE(file_system_.files.size(), 1U);
+  auto latest = file_system_.files.end();
+  --latest;
+  ASSERT_TRUE(file_contents == latest->second);
+}
+
+}  // namespace mock
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/mock_table.h b/src/rocksdb/table/mock_table.h
new file mode 100644
index 0000000..ef38575
--- /dev/null
+++ b/src/rocksdb/table/mock_table.h
@@ -0,0 +1,181 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+#include <algorithm>
+#include <set>
+#include <memory>
+#include <atomic>
+#include <map>
+#include <string>
+
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "table/table_builder.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+namespace mock {
+
+typedef std::map<std::string, std::string> MockFileContents;
+// NOTE this currently only supports bitwise comparator
+
+struct MockTableFileSystem {
+  port::Mutex mutex;
+  std::map<uint32_t, MockFileContents> files;
+};
+
+class MockTableReader : public TableReader {
+ public:
+  explicit MockTableReader(const MockFileContents& table) : table_(table) {}
+
+  Iterator* NewIterator(const ReadOptions&, Arena* arena) override;
+
+  Status Get(const ReadOptions&, const Slice& key,
+             GetContext* get_context) override;
+
+  uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; }
+
+  virtual size_t ApproximateMemoryUsage() const override { return 0; }
+
+  void SetupForCompaction() override {}
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  ~MockTableReader() {}
+
+ private:
+  const MockFileContents& table_;
+};
+
+class MockTableIterator : public Iterator {
+ public:
+  explicit MockTableIterator(const MockFileContents& table) : table_(table) {
+    itr_ = table_.end();
+  }
+
+  bool Valid() const override { return itr_ != table_.end(); }
+
+  void SeekToFirst() override { itr_ = table_.begin(); }
+
+  void SeekToLast() override {
+    itr_ = table_.end();
+    --itr_;
+  }
+
+  void Seek(const Slice& target) override {
+    std::string str_target(target.data(), target.size());
+    itr_ = table_.lower_bound(str_target);
+  }
+
+  void Next() override { ++itr_; }
+
+  void Prev() override {
+    if (itr_ == table_.begin()) {
+      itr_ = table_.end();
+    } else {
+      --itr_;
+    }
+  }
+
+  Slice key() const override { return Slice(itr_->first); }
+
+  Slice value() const override { return Slice(itr_->second); }
+
+  Status status() const override { return Status::OK(); }
+
+ private:
+  const MockFileContents& table_;
+  MockFileContents::const_iterator itr_;
+};
+
+class MockTableBuilder : public TableBuilder {
+ public:
+  MockTableBuilder(uint32_t id, MockTableFileSystem* file_system)
+      : id_(id), file_system_(file_system) {}
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~MockTableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override {
+    table_.insert({key.ToString(), value.ToString()});
+  }
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return Status::OK(); }
+
+  Status Finish() override {
+    MutexLock lock_guard(&file_system_->mutex);
+    file_system_->files.insert({id_, table_});
+    return Status::OK();
+  }
+
+  void Abandon() override {}
+
+  uint64_t NumEntries() const override { return table_.size(); }
+
+  uint64_t FileSize() const override { return table_.size(); }
+
+  TableProperties GetTableProperties() const override {
+    return TableProperties();
+  }
+
+ private:
+  uint32_t id_;
+  MockTableFileSystem* file_system_;
+  MockFileContents table_;
+};
+
+class MockTableFactory : public TableFactory {
+ public:
+  MockTableFactory();
+  const char* Name() const override { return "MockTable"; }
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                               const EnvOptions& env_options,
+                               const InternalKeyComparator& internal_key,
+                               unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                               unique_ptr<TableReader>* table_reader) const override;
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFile* file) const override;
+
+  // This function will directly create mock table instead of going through
+  // MockTableBuilder. MockFileContents has to have a format of <internal_key,
+  // value>. Those key-value pairs will then be inserted into the mock table
+  Status CreateMockTable(Env* env, const std::string& fname,
+                         MockFileContents file_contents);
+
+  virtual Status SanitizeOptions(
+      const DBOptions& db_opts,
+      const ColumnFamilyOptions& cf_opts) const override {
+    return Status::OK();
+  }
+
+  virtual std::string GetPrintableTableOptions() const override {
+    return std::string();
+  }
+
+  // This function will assert that only a single file exists and that the
+  // contents are equal to file_contents
+  void AssertSingleFile(const MockFileContents& file_contents);
+  void AssertLatestFile(const MockFileContents& file_contents);
+
+ private:
+  uint32_t GetAndWriteNextID(WritableFile* file) const;
+  uint32_t GetIDFromFile(RandomAccessFile* file) const;
+
+  mutable MockTableFileSystem file_system_;
+  mutable std::atomic<uint32_t> next_id_;
+};
+
+}  // namespace mock
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/plain_table_builder.cc b/src/rocksdb/table/plain_table_builder.cc
index d76f0b2..25e1b85 100644
--- a/src/rocksdb/table/plain_table_builder.cc
+++ b/src/rocksdb/table/plain_table_builder.cc
@@ -1,21 +1,27 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
 
 #ifndef ROCKSDB_LITE
 #include "table/plain_table_builder.h"
 
 #include <assert.h>
+
+#include <string>
+#include <limits>
 #include <map>
 
 #include "rocksdb/comparator.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/options.h"
+#include "rocksdb/table.h"
 #include "table/plain_table_factory.h"
 #include "db/dbformat.h"
 #include "table/block_builder.h"
-#include "table/filter_block.h"
+#include "table/bloom_block.h"
+#include "table/plain_table_index.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "util/coding.h"
@@ -52,70 +58,105 @@ Status WriteBlock(
 extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
 extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
 
-PlainTableBuilder::PlainTableBuilder(const Options& options,
-                                     WritableFile* file,
-                                     uint32_t user_key_len) :
-    options_(options), file_(file), user_key_len_(user_key_len) {
+PlainTableBuilder::PlainTableBuilder(
+    const ImmutableCFOptions& ioptions,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    WritableFile* file, uint32_t user_key_len, EncodingType encoding_type,
+    size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes,
+    size_t huge_page_tlb_size, double hash_table_ratio,
+    bool store_index_in_file)
+    : ioptions_(ioptions),
+      bloom_block_(num_probes),
+      file_(file),
+      bloom_bits_per_key_(bloom_bits_per_key),
+      huge_page_tlb_size_(huge_page_tlb_size),
+      encoder_(encoding_type, user_key_len, ioptions.prefix_extractor,
+               index_sparseness),
+      store_index_in_file_(store_index_in_file),
+      prefix_extractor_(ioptions.prefix_extractor) {
+  // Build index block and save it in the file if hash_table_ratio > 0
+  if (store_index_in_file_) {
+    assert(hash_table_ratio > 0 || IsTotalOrderMode());
+    index_builder_.reset(
+        new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness,
+                                   hash_table_ratio, huge_page_tlb_size_));
+    assert(bloom_bits_per_key_ > 0);
+    properties_.user_collected_properties
+        [PlainTablePropertyNames::kBloomVersion] = "1";  // For future use
+  }
+
   properties_.fixed_key_len = user_key_len;
 
   // for plain table, we put all the data in a big chuck.
   properties_.num_data_blocks = 1;
-  // emphasize that currently plain table doesn't have persistent index or
-  // filter block.
+  // Fill it later if store_index_in_file_ == true
   properties_.index_size = 0;
   properties_.filter_size = 0;
-  properties_.format_version = 0;
+  // To support roll-back to previous version, now still use version 0 for
+  // plain encoding.
+  properties_.format_version = (encoding_type == kPlain) ? 0 : 1;
+
+  if (ioptions_.prefix_extractor) {
+    properties_.user_collected_properties
+        [PlainTablePropertyNames::kPrefixExtractorName] =
+        ioptions_.prefix_extractor->Name();
+  }
+
+  std::string val;
+  PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType()));
+  properties_.user_collected_properties
+      [PlainTablePropertyNames::kEncodingType] = val;
+
+  for (auto& collector_factories : *int_tbl_prop_collector_factories) {
+    table_properties_collectors_.emplace_back(
+        collector_factories->CreateIntTblPropCollector());
+  }
 }
 
 PlainTableBuilder::~PlainTableBuilder() {
 }
 
 void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
-  size_t user_key_size = key.size() - 8;
-  assert(user_key_len_ == 0 || user_key_size == user_key_len_);
-
-  if (!IsFixedLength()) {
-    // Write key length
-    char key_size_buf[5];  // tmp buffer for key size as varint32
-    char* ptr = EncodeVarint32(key_size_buf, user_key_size);
-    assert(ptr <= key_size_buf + sizeof(key_size_buf));
-    auto len = ptr - key_size_buf;
-    file_->Append(Slice(key_size_buf, len));
-    offset_ += len;
+  // temp buffer for metadata bytes between key and value.
+  char meta_bytes_buf[6];
+  size_t meta_bytes_buf_size = 0;
+
+  ParsedInternalKey internal_key;
+  ParseInternalKey(key, &internal_key);
+
+  // Store key hash
+  if (store_index_in_file_) {
+    if (ioptions_.prefix_extractor == nullptr) {
+      keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
+    } else {
+      Slice prefix =
+          ioptions_.prefix_extractor->Transform(internal_key.user_key);
+      keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
+    }
   }
 
-  // Write key
-  ParsedInternalKey parsed_key;
-  if (!ParseInternalKey(key, &parsed_key)) {
-    status_ = Status::Corruption(Slice());
-    return;
-  }
-  // For value size as varint32 (up to 5 bytes).
-  // If the row is of value type with seqId 0, flush the special flag together
-  // in this buffer to safe one file append call, which takes 1 byte.
-  char value_size_buf[6];
-  size_t value_size_buf_size = 0;
-  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
-    file_->Append(Slice(key.data(), user_key_size));
-    offset_ += user_key_size;
-    value_size_buf[0] = PlainTableFactory::kValueTypeSeqId0;
-    value_size_buf_size = 1;
-  } else {
-    file_->Append(key);
-    offset_ += key.size();
+  // Write value
+  assert(offset_ <= std::numeric_limits<uint32_t>::max());
+  auto prev_offset = static_cast<uint32_t>(offset_);
+  // Write out the key
+  encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
+                     &meta_bytes_buf_size);
+  if (SaveIndexInFile()) {
+    index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
   }
 
   // Write value length
-  int value_size = value.size();
+  uint32_t value_size = static_cast<uint32_t>(value.size());
   char* end_ptr =
-      EncodeVarint32(value_size_buf + value_size_buf_size, value_size);
-  assert(end_ptr <= value_size_buf + sizeof(value_size_buf));
-  value_size_buf_size = end_ptr - value_size_buf;
-  file_->Append(Slice(value_size_buf, value_size_buf_size));
+      EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size);
+  assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf));
+  meta_bytes_buf_size = end_ptr - meta_bytes_buf;
+  file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size));
 
   // Write value
   file_->Append(value);
-  offset_ += value_size + value_size_buf_size;
+  offset_ += value_size + meta_bytes_buf_size;
 
   properties_.num_entries++;
   properties_.raw_key_size += key.size();
@@ -123,11 +164,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
 
   // notify property collectors
   NotifyCollectTableCollectorsOnAdd(
-      key,
-      value,
-      options_.table_properties_collectors,
-      options_.info_log.get()
-  );
+      key, value, offset_, table_properties_collectors_, ioptions_.info_log);
 }
 
 Status PlainTableBuilder::status() const { return status_; }
@@ -138,22 +175,63 @@ Status PlainTableBuilder::Finish() {
 
   properties_.data_size = offset_;
 
-  // Write the following blocks
-  //  1. [meta block: properties]
-  //  2. [metaindex block]
-  //  3. [footer]
+  //  Write the following blocks
+  //  1. [meta block: bloom] - optional
+  //  2. [meta block: index] - optional
+  //  3. [meta block: properties]
+  //  4. [metaindex block]
+  //  5. [footer]
+
   MetaIndexBuilder meta_index_builer;
 
+  if (store_index_in_file_ && (properties_.num_entries > 0)) {
+    assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max());
+    bloom_block_.SetTotalBits(
+        &arena_,
+        static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_,
+        ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log);
+
+    PutVarint32(&properties_.user_collected_properties
+                     [PlainTablePropertyNames::kNumBloomBlocks],
+                bloom_block_.GetNumBlocks());
+
+    bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
+    BlockHandle bloom_block_handle;
+    auto finish_result = bloom_block_.Finish();
+
+    properties_.filter_size = finish_result.size();
+    auto s = WriteBlock(finish_result, file_, &offset_, &bloom_block_handle);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    BlockHandle index_block_handle;
+    finish_result = index_builder_->Finish();
+
+    properties_.index_size = finish_result.size();
+    s = WriteBlock(finish_result, file_, &offset_, &index_block_handle);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
+    meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
+                          index_block_handle);
+  }
+
+  // Calculate bloom block size and index block size
   PropertyBlockBuilder property_block_builder;
   // -- Add basic properties
   property_block_builder.AddTableProperty(properties_);
 
+  property_block_builder.Add(properties_.user_collected_properties);
+
   // -- Add user collected properties
-  NotifyCollectTableCollectorsOnFinish(
-      options_.table_properties_collectors,
-      options_.info_log.get(),
-      &property_block_builder
-  );
+  NotifyCollectTableCollectorsOnFinish(table_properties_collectors_,
+                                       ioptions_.info_log,
+                                       &property_block_builder);
 
   // -- Write property block
   BlockHandle property_block_handle;
@@ -182,7 +260,7 @@ Status PlainTableBuilder::Finish() {
 
   // Write Footer
   // no need to write out new footer if we're using default checksum
-  Footer footer(kLegacyPlainTableMagicNumber);
+  Footer footer(kLegacyPlainTableMagicNumber, 0);
   footer.set_metaindex_handle(metaindex_block_handle);
   footer.set_index_handle(BlockHandle::NullBlockHandle());
   std::string footer_encoding;
diff --git a/src/rocksdb/table/plain_table_builder.h b/src/rocksdb/table/plain_table_builder.h
index 7bc388b..f542d2f 100644
--- a/src/rocksdb/table/plain_table_builder.h
+++ b/src/rocksdb/table/plain_table_builder.h
@@ -1,17 +1,20 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built
-// as production quality.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
 
-#ifndef ROCKSDB_LITE
 #pragma once
+#ifndef ROCKSDB_LITE
 #include <stdint.h>
+#include <vector>
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
 #include "table/table_builder.h"
+#include "table/plain_table_key_coding.h"
+#include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
+#include "table/bloom_block.h"
+#include "table/plain_table_index.h"
 
 namespace rocksdb {
 
@@ -21,14 +24,20 @@ class WritableFile;
 class TableBuilder;
 
 class PlainTableBuilder: public TableBuilder {
-public:
+ public:
   // Create a builder that will store the contents of the table it is
   // building in *file.  Does not close the file.  It is up to the
   // caller to close the file after calling Finish(). The output file
   // will be part of level specified by 'level'.  A value of -1 means
   // that the caller does not know which level the output file will reside.
-  PlainTableBuilder(const Options& options, WritableFile* file,
-                    uint32_t user_key_size);
+  PlainTableBuilder(
+      const ImmutableCFOptions& ioptions,
+      const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+          int_tbl_prop_collector_factories,
+      WritableFile* file, uint32_t user_key_size, EncodingType encoding_type,
+      size_t index_sparseness, uint32_t bloom_bits_per_key,
+      uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+      double hash_table_ratio = 0, bool store_index_in_file = false);
 
   // REQUIRES: Either Finish() or Abandon() has been called.
   ~PlainTableBuilder();
@@ -60,20 +69,61 @@ public:
   // Finish() call, returns the size of the final generated file.
   uint64_t FileSize() const override;
 
-private:
-  Options options_;
+  TableProperties GetTableProperties() const override { return properties_; }
+
+  bool SaveIndexInFile() const { return store_index_in_file_; }
+
+ private:
+  Arena arena_;
+  const ImmutableCFOptions& ioptions_;
+  std::vector<std::unique_ptr<IntTblPropCollector>>
+      table_properties_collectors_;
+
+  BloomBlockBuilder bloom_block_;
+  std::unique_ptr<PlainTableIndexBuilder> index_builder_;
+
   WritableFile* file_;
   uint64_t offset_ = 0;
+  uint32_t bloom_bits_per_key_;
+  size_t huge_page_tlb_size_;
   Status status_;
   TableProperties properties_;
+  PlainTableKeyEncoder encoder_;
 
-  const size_t user_key_len_;
+  bool store_index_in_file_;
+
+  std::vector<uint32_t> keys_or_prefixes_hashes_;
   bool closed_ = false;  // Either Finish() or Abandon() has been called.
 
-  bool IsFixedLength() const {
-    return user_key_len_ > 0;
+  const SliceTransform* prefix_extractor_;
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(GetUserKey(target));
   }
 
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetUserKey(const Slice& key) const {
+    return Slice(key.data(), key.size() - 8);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case,
+      // it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
+  bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+
   // No copying allowed
   PlainTableBuilder(const PlainTableBuilder&) = delete;
   void operator=(const PlainTableBuilder&) = delete;
diff --git a/src/rocksdb/table/plain_table_factory.cc b/src/rocksdb/table/plain_table_factory.cc
index f9d88e9..5f19c3b 100644
--- a/src/rocksdb/table/plain_table_factory.cc
+++ b/src/rocksdb/table/plain_table_factory.cc
@@ -14,41 +14,80 @@
 
 namespace rocksdb {
 
-Status PlainTableFactory::NewTableReader(const Options& options,
-                                         const EnvOptions& soptions,
+Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions,
+                                         const EnvOptions& env_options,
                                          const InternalKeyComparator& icomp,
                                          unique_ptr<RandomAccessFile>&& file,
                                          uint64_t file_size,
                                          unique_ptr<TableReader>* table) const {
-  return PlainTableReader::Open(options, soptions, icomp, std::move(file),
+  return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file),
                                 file_size, table, bloom_bits_per_key_,
                                 hash_table_ratio_, index_sparseness_,
-                                huge_page_tlb_size_);
+                                huge_page_tlb_size_, full_scan_mode_);
 }
 
 TableBuilder* PlainTableFactory::NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_comparator,
-    WritableFile* file, CompressionType compression_type) const {
-  return new PlainTableBuilder(options, file, user_key_len_);
+    const TableBuilderOptions& table_builder_options,
+    WritableFile* file) const {
+  // Ignore the skip_filters flag. PlainTable format is optimized for small
+  // in-memory dbs. The skip_filters optimization is not useful for plain
+  // tables
+  //
+  return new PlainTableBuilder(
+      table_builder_options.ioptions,
+      table_builder_options.int_tbl_prop_collector_factories, file,
+      user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6,
+      huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_);
 }
 
-extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
-                                          int bloom_bits_per_key,
-                                          double hash_table_ratio,
-                                          size_t index_sparseness,
-                                          size_t huge_page_tlb_size) {
-  return new PlainTableFactory(user_key_len, bloom_bits_per_key,
-                               hash_table_ratio, index_sparseness,
-                               huge_page_tlb_size);
+std::string PlainTableFactory::GetPrintableTableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  user_key_len: %u\n",
+           user_key_len_);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  bloom_bits_per_key: %d\n",
+           bloom_bits_per_key_);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  hash_table_ratio: %lf\n",
+           hash_table_ratio_);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  index_sparseness: %zu\n",
+           index_sparseness_);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  huge_page_tlb_size: %zu\n",
+           huge_page_tlb_size_);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  encoding_type: %d\n",
+           encoding_type_);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  full_scan_mode: %d\n",
+           full_scan_mode_);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  store_index_in_file: %d\n",
+           store_index_in_file_);
+  ret.append(buffer);
+  return ret;
 }
 
-extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
-                                                    int bloom_bits_per_key,
-                                                    size_t index_sparseness,
-                                                    size_t huge_page_tlb_size) {
-  return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
-                               index_sparseness, huge_page_tlb_size);
+extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
+  return new PlainTableFactory(options);
 }
 
+const std::string PlainTablePropertyNames::kPrefixExtractorName =
+    "rocksdb.prefix.extractor.name";
+
+const std::string PlainTablePropertyNames::kEncodingType =
+    "rocksdb.plain.table.encoding.type";
+
+const std::string PlainTablePropertyNames::kBloomVersion =
+    "rocksdb.plain.table.bloom.version";
+
+const std::string PlainTablePropertyNames::kNumBloomBlocks =
+    "rocksdb.plain.table.bloom.numblocks";
+
 }  // namespace rocksdb
 #endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain_table_factory.h b/src/rocksdb/table/plain_table_factory.h
index 06ddbf4..730e134 100644
--- a/src/rocksdb/table/plain_table_factory.h
+++ b/src/rocksdb/table/plain_table_factory.h
@@ -6,6 +6,7 @@
 
 #ifndef ROCKSDB_LITE
 #include <memory>
+#include <string>
 #include <stdint.h>
 
 #include "rocksdb/options.h"
@@ -13,7 +14,6 @@
 
 namespace rocksdb {
 
-struct Options;
 struct EnvOptions;
 
 using std::unique_ptr;
@@ -27,24 +27,107 @@ class TableBuilder;
 // parameter of the factory class. Output file format:
 // +-------------+-----------------+
 // | version     | user_key_length |
-// +------------++------------------------------+  <= key1 offset
-// | [key_size] |  key1       | value_size  |   |
+// +------------++------------+-----------------+  <= key1 offset
+// |  encoded key1            | value_size  |   |
 // +------------+-------------+-------------+   |
 // | value1                                     |
 // |                                            |
-// +----------------------------------------+---+  <= key2 offset
-// | [key_size] |  key2       | value_size  |   |
+// +--------------------------+-------------+---+  <= key2 offset
+// | encoded key2             | value_size  |   |
 // +------------+-------------+-------------+   |
 // | value2                                     |
 // |                                            |
 // |        ......                              |
 // +-----------------+--------------------------+
-// If user_key_length = kPlainTableVariableLength, it means the key is variable
-// length, there will be an extra field for key size encoded before every key.
+//
+// When the key encoding type is kPlain. Key part is encoded as:
+// +------------+--------------------+
+// | [key_size] |  internal key      |
+// +------------+--------------------+
+// for the case of user_key_len = kPlainTableVariableLength case,
+// and simply:
+// +----------------------+
+// |  internal key        |
+// +----------------------+
+// for user_key_len != kPlainTableVariableLength case.
+//
+// If key encoding type is kPrefix. Keys are encoding in this format.
+// There are three ways to encode a key:
+// (1) Full Key
+// +---------------+---------------+-------------------+
+// | Full Key Flag | Full Key Size | Full Internal Key |
+// +---------------+---------------+-------------------+
+// which simply encodes a full key
+//
+// (2) A key shared the same prefix as the previous key, which is encoded as
+//     format of (1).
+// +-------------+-------------+-------------+-------------+------------+
+// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
+// +-------------+-------------+-------------+-------------+------------+
+// where key is the suffix part of the key, including the internal bytes.
+// the actual key will be constructed by concatenating prefix part of the
+// previous key, with the suffix part of the key here, with sizes given here.
+//
+// (3) A key shared the same prefix as the previous key, which is encoded as
+//     the format of (2).
+// +-----------------+-----------------+------------------------+
+// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
+// +-----------------+-----------------+------------------------+
+// The key will be constructed by concatenating previous key's prefix (which is
+// also a prefix which the last key encoded in the format of (1)) and the
+// key given here.
+//
+// For example, we for following keys (prefix and suffix are separated by
+// spaces):
+//   0000 0001
+//   0000 00021
+//   0000 0002
+//   00011 00
+//   0002 0001
+// Will be encoded like this:
+//   FK 8 00000001
+//   PF 4 SF 5 00021
+//   SF 4 0002
+//   FK 7 0001100
+//   FK 8 00020001
+// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
+//
+// All those "key flag + key size" shown above are in this format:
+// The 8 bits of the first byte:
+// +----+----+----+----+----+----+----+----+
+// |  Type   |            Size             |
+// +----+----+----+----+----+----+----+----+
+// Type indicates: full key, prefix, or suffix.
+// The last 6 bits are for size. If the size bits are not all 1, it means the
+// size of the key. Otherwise, varint32 is read after this byte. This varint
+// value + 0x3F (the value of all 1) will be the key size.
+//
+// For example, full key with length 16 will be encoded as (binary):
+//     00 010000
+// (00 means full key)
+// and a prefix with 100 bytes will be encoded as:
+//     01 111111    00100101
+//         (63)       (37)
+// (01 means key suffix)
+//
+// All the internal keys above (including kPlain and kPrefix) are encoded in
+// this format:
+// There are two types:
+// (1) normal internal key format
+// +----------- ...... -------------+----+---+---+---+---+---+---+---+
+// |       user key                 |type|      sequence ID          |
+// +----------- ..... --------------+----+---+---+---+---+---+---+---+
+// (2) Special case for keys whose sequence ID is 0 and is value type
+// +----------- ...... -------------+----+
+// |       user key                 |0x80|
+// +----------- ..... --------------+----+
+// To save 7 bytes for the special case where sequence ID = 0.
+//
+//
 class PlainTableFactory : public TableFactory {
  public:
   ~PlainTableFactory() {}
-  // user_key_size is the length of the user key. If it is set to be
+  // user_key_len is the length of the user key. If it is set to be
   // kPlainTableVariableLength, then it means variable length. Otherwise, all
   // the keys need to have the fix length of this value. bloom_bits_per_key is
   // number of bits used for bloom filer per key. hash_table_ratio is
@@ -59,35 +142,49 @@ class PlainTableFactory : public TableFactory {
   // huge_page_tlb_size determines whether to allocate hash indexes from huge
   // page TLB and the page size if allocating from there. See comments of
   // Arena::AllocateAligned() for details.
-  explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
-                             int bloom_bits_per_key = 0,
-                             double hash_table_ratio = 0.75,
-                             size_t index_sparseness = 16,
-                             size_t huge_page_tlb_size = 0)
-      : user_key_len_(user_key_len),
-        bloom_bits_per_key_(bloom_bits_per_key),
-        hash_table_ratio_(hash_table_ratio),
-        index_sparseness_(index_sparseness),
-        huge_page_tlb_size_(huge_page_tlb_size) {}
+  explicit PlainTableFactory(const PlainTableOptions& options =
+                                 PlainTableOptions())
+      : user_key_len_(options.user_key_len),
+        bloom_bits_per_key_(options.bloom_bits_per_key),
+        hash_table_ratio_(options.hash_table_ratio),
+        index_sparseness_(options.index_sparseness),
+        huge_page_tlb_size_(options.huge_page_tlb_size),
+        encoding_type_(options.encoding_type),
+        full_scan_mode_(options.full_scan_mode),
+        store_index_in_file_(options.store_index_in_file) {}
   const char* Name() const override { return "PlainTable"; }
-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
-                        const InternalKeyComparator& internal_comparator,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                        unique_ptr<TableReader>* table) const override;
-  TableBuilder* NewTableBuilder(const Options& options,
-                                const InternalKeyComparator& icomparator,
-                                WritableFile* file,
-                                CompressionType compression_type) const
-      override;
+  Status NewTableReader(
+      const ImmutableCFOptions& options, const EnvOptions& soptions,
+      const InternalKeyComparator& internal_comparator,
+      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+      unique_ptr<TableReader>* table) const override;
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFile* file) const override;
+
+  std::string GetPrintableTableOptions() const override;
 
   static const char kValueTypeSeqId0 = 0xFF;
 
+  // Sanitizes the specified DB Options.
+  Status SanitizeOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override {
+    if (db_opts.allow_mmap_reads == false) {
+      return Status::NotSupported(
+          "PlainTable with allow_mmap_reads == false is not supported.");
+    }
+    return Status::OK();
+  }
+
  private:
   uint32_t user_key_len_;
   int bloom_bits_per_key_;
   double hash_table_ratio_;
   size_t index_sparseness_;
   size_t huge_page_tlb_size_;
+  EncodingType encoding_type_;
+  bool full_scan_mode_;
+  bool store_index_in_file_;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/plain_table_index.cc b/src/rocksdb/table/plain_table_index.cc
new file mode 100644
index 0000000..7ca451e
--- /dev/null
+++ b/src/rocksdb/table/plain_table_index.cc
@@ -0,0 +1,215 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+
+#include "table/plain_table_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+  assert(num_buckets > 0);
+  return hash % num_buckets;
+}
+}
+
+Status PlainTableIndex::InitFromRawData(Slice data) {
+  if (!GetVarint32(&data, &index_size_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  assert(index_size_ > 0);
+  if (!GetVarint32(&data, &num_prefixes_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  sub_index_size_ =
+      static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
+
+  char* index_data_begin = const_cast<char*>(data.data());
+  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
+  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
+  return Status::OK();
+}
+
+PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
+    uint32_t prefix_hash, uint32_t* bucket_value) const {
+  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+  *bucket_value = index_[bucket];
+  if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
+    *bucket_value ^= kSubIndexMask;
+    return kSubindex;
+  }
+  if (*bucket_value >= kMaxFileSize) {
+    return kNoPrefixForBucket;
+  } else {
+    // point directly to the file
+    return kDirectToFile;
+  }
+}
+
+void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
+                                                        uint32_t offset) {
+  if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+    current_group_ = AllocateNewGroup();
+    num_records_in_current_group_ = 0;
+  }
+  auto& new_record = current_group_[num_records_in_current_group_++];
+  new_record.hash = hash;
+  new_record.offset = offset;
+  new_record.next = nullptr;
+}
+
+void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
+                                          uint32_t key_offset) {
+  if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
+    ++num_prefixes_;
+    if (!is_first_record_) {
+      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+    }
+    num_keys_per_prefix_ = 0;
+    prev_key_prefix_ = key_prefix_slice.ToString();
+    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
+    due_index_ = true;
+  }
+
+  if (due_index_) {
+    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
+    due_index_ = false;
+  }
+
+  num_keys_per_prefix_++;
+  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
+    due_index_ = true;
+  }
+  is_first_record_ = false;
+}
+
+Slice PlainTableIndexBuilder::Finish() {
+  AllocateIndex();
+  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+  BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
+
+  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+  Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
+      "Number of Keys per prefix Histogram: %s",
+      keys_per_prefix_hist_.ToString().c_str());
+
+  // From the temp data structure, populate indexes.
+  return FillIndexes(hash_to_offsets, entries_per_bucket);
+}
+
+void PlainTableIndexBuilder::AllocateIndex() {
+  if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+    // Fall back to pure binary search if the user fails to specify a prefix
+    // extractor.
+    index_size_ = 1;
+  } else {
+    double hash_table_size_multipier = 1.0 / hash_table_ratio_;
+    index_size_ = num_prefixes_ * hash_table_size_multipier + 1;
+    assert(index_size_ > 0);
+  }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+    std::vector<IndexRecord*>* hash_to_offsets,
+    std::vector<uint32_t>* entries_per_bucket) {
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list_.GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list_.At(i);
+    uint32_t cur_hash = index_record->hash;
+    if (first || prev_hash != cur_hash) {
+      prev_hash = cur_hash;
+      first = false;
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;
+    (*hash_to_offsets)[bucket] = index_record;
+    (*entries_per_bucket)[bucket]++;
+  }
+
+  sub_index_size_ = 0;
+  for (auto entry_count : *entries_per_bucket) {
+    if (entry_count <= 1) {
+      continue;
+    }
+    // Only buckets with more than 1 entry will have subindex.
+    sub_index_size_ += VarintLength(entry_count);
+    // total bytes needed to store these entries' in-file offsets.
+    sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+  }
+}
+
+Slice PlainTableIndexBuilder::FillIndexes(
+    const std::vector<IndexRecord*>& hash_to_offsets,
+    const std::vector<uint32_t>& entries_per_bucket) {
+  Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log,
+      "Reserving %" PRIu32 " bytes for plain table's sub_index",
+      sub_index_size_);
+  auto total_allocate_size = GetTotalSize();
+  char* allocated = arena_->AllocateAligned(
+      total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);
+
+  auto temp_ptr = EncodeVarint32(allocated, index_size_);
+  uint32_t* index =
+      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+  char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+  uint32_t sub_index_offset = 0;
+  for (uint32_t i = 0; i < index_size_; i++) {
+    uint32_t num_keys_for_bucket = entries_per_bucket[i];
+    switch (num_keys_for_bucket) {
+      case 0:
+        // No key for bucket
+        index[i] = PlainTableIndex::kMaxFileSize;
+        break;
+      case 1:
+        // point directly to the file offset
+        index[i] = hash_to_offsets[i]->offset;
+        break;
+      default:
+        // point to second level indexes.
+        index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask;
+        char* prev_ptr = &sub_index[sub_index_offset];
+        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+        sub_index_offset += (cur_ptr - prev_ptr);
+        char* sub_index_pos = &sub_index[sub_index_offset];
+        IndexRecord* record = hash_to_offsets[i];
+        int j;
+        for (j = num_keys_for_bucket - 1; j >= 0 && record;
+             j--, record = record->next) {
+          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+        }
+        assert(j == -1 && record == nullptr);
+        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+        assert(sub_index_offset <= sub_index_size_);
+        break;
+    }
+  }
+  assert(sub_index_offset == sub_index_size_);
+
+  Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log,
+      "hash table size: %d, suffix_map length %zu",
+      index_size_, sub_index_size_);
+  return Slice(allocated, GetTotalSize());
+}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+    "PlainTableIndexBlock";
+};  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain_table_index.h b/src/rocksdb/table/plain_table_index.h
new file mode 100644
index 0000000..be8ad16
--- /dev/null
+++ b/src/rocksdb/table/plain_table_index.h
@@ -0,0 +1,225 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/options.h"
+#include "util/murmurhash.h"
+#include "util/hash.h"
+#include "util/arena.h"
+#include "util/histogram.h"
+
+namespace rocksdb {
+
+// PlainTableIndex contains buckets size of index_size_, each is a
+// 32-bit integer. The lower 31 bits contain an offset value (explained below)
+// and the first bit of the integer indicates type of the offset.
+//
+// +--------------+------------------------------------------------------+
+// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
+// +--------------+------------------------------------------------------+
+//
+// Explanation for the "flag bit":
+//
+// 0 indicates that the bucket contains only one prefix (no conflict when
+//   hashing this prefix), whose first row starts from this offset of the
+// file.
+// 1 indicates that the bucket contains more than one prefixes, or there
+//   are too many rows for one prefix so we need a binary search for it. In
+//   this case, the offset indicates the offset of sub_index_ holding the
+//   binary search indexes of keys for those rows. Those binary search indexes
+//   are organized in this way:
+//
+// The first 4 bytes, indicate how many indexes (N) are stored after it. After
+// it, there are N 32-bit integers, each points of an offset of the file,
+// which
+// points to starting of a row. Those offsets need to be guaranteed to be in
+// ascending order so the keys they are pointing to are also in ascending
+// order
+// to make sure we can use them to do binary searches. Below is visual
+// presentation of a bucket.
+//
+// <begin>
+//   number_of_records:  varint32
+//   record 1 file offset:  fixedint32
+//   record 2 file offset:  fixedint32
+//    ....
+//   record N file offset:  fixedint32
+// <end>
+class PlainTableIndex {
+ public:
+  enum IndexSearchResult {
+    kNoPrefixForBucket = 0,
+    kDirectToFile = 1,
+    kSubindex = 2
+  };
+
+  explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
+
+  PlainTableIndex()
+      : index_size_(0),
+        sub_index_size_(0),
+        num_prefixes_(0),
+        index_(nullptr),
+        sub_index_(nullptr) {}
+
+  IndexSearchResult GetOffset(uint32_t prefix_hash,
+                              uint32_t* bucket_value) const;
+
+  Status InitFromRawData(Slice data);
+
+  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
+                                              uint32_t* upper_bound) const {
+    const char* index_ptr = &sub_index_[offset];
+    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
+  }
+
+  uint32_t GetIndexSize() const { return index_size_; }
+
+  uint32_t GetSubIndexSize() const { return sub_index_size_; }
+
+  uint32_t GetNumPrefixes() const { return num_prefixes_; }
+
+  static const uint64_t kMaxFileSize = (1u << 31) - 1;
+  static const uint32_t kSubIndexMask = 0x80000000;
+  static const size_t kOffsetLen = sizeof(uint32_t);
+
+ private:
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+  uint32_t num_prefixes_;
+
+  uint32_t* index_;
+  char* sub_index_;
+};
+
+// PlainTableIndexBuilder is used to create plain table index.
+// After calling Finish(), it returns Slice, which is usually
+// used either to initialize PlainTableIndex or
+// to save index to sst file.
+// For more details about the  index, please refer to:
+// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+// #wiki-in-memory-index-format
+class PlainTableIndexBuilder {
+ public:
+  PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
+                         size_t index_sparseness, double hash_table_ratio,
+                         size_t huge_page_tlb_size)
+      : arena_(arena),
+        ioptions_(ioptions),
+        record_list_(kRecordsPerGroup),
+        is_first_record_(true),
+        due_index_(false),
+        num_prefixes_(0),
+        num_keys_per_prefix_(0),
+        prev_key_prefix_hash_(0),
+        index_sparseness_(index_sparseness),
+        prefix_extractor_(ioptions.prefix_extractor),
+        hash_table_ratio_(hash_table_ratio),
+        huge_page_tlb_size_(huge_page_tlb_size) {}
+
+  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
+
+  Slice Finish();
+
+  uint32_t GetTotalSize() const {
+    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
+           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
+  }
+
+  static const std::string kPlainTableIndexBlock;
+
+ private:
+  struct IndexRecord {
+    uint32_t hash;    // hash of the prefix
+    uint32_t offset;  // offset of a row
+    IndexRecord* next;
+  };
+
+  // Helper class to track all the index records
+  class IndexRecordList {
+   public:
+    explicit IndexRecordList(size_t num_records_per_group)
+        : kNumRecordsPerGroup(num_records_per_group),
+          current_group_(nullptr),
+          num_records_in_current_group_(num_records_per_group) {}
+
+    ~IndexRecordList() {
+      for (size_t i = 0; i < groups_.size(); i++) {
+        delete[] groups_[i];
+      }
+    }
+
+    void AddRecord(uint32_t hash, uint32_t offset);
+
+    size_t GetNumRecords() const {
+      return (groups_.size() - 1) * kNumRecordsPerGroup +
+             num_records_in_current_group_;
+    }
+    IndexRecord* At(size_t index) {
+      return &(groups_[index / kNumRecordsPerGroup]
+                      [index % kNumRecordsPerGroup]);
+    }
+
+   private:
+    IndexRecord* AllocateNewGroup() {
+      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+      groups_.push_back(result);
+      return result;
+    }
+
+    // Each group in `groups_` contains fix-sized records (determined by
+    // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
+    // occurs.
+    const size_t kNumRecordsPerGroup;
+    IndexRecord* current_group_;
+    // List of arrays allocated
+    std::vector<IndexRecord*> groups_;
+    size_t num_records_in_current_group_;
+  };
+
+  void AllocateIndex();
+
+  // Internal helper function to bucket index record list to hash buckets.
+  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
+                        std::vector<uint32_t>* entries_per_bucket);
+
+  // Internal helper class to fill the indexes and bloom filters to internal
+  // data structures.
+  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
+                    const std::vector<uint32_t>& entries_per_bucket);
+
+  Arena* arena_;
+  const ImmutableCFOptions ioptions_;
+  HistogramImpl keys_per_prefix_hist_;
+  IndexRecordList record_list_;
+  bool is_first_record_;
+  bool due_index_;
+  uint32_t num_prefixes_;
+  uint32_t num_keys_per_prefix_;
+
+  uint32_t prev_key_prefix_hash_;
+  size_t index_sparseness_;
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+
+  const SliceTransform* prefix_extractor_;
+  double hash_table_ratio_;
+  size_t huge_page_tlb_size_;
+
+  std::string prev_key_prefix_;
+
+  static const size_t kRecordsPerGroup = 256;
+};
+
+};  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain_table_key_coding.cc b/src/rocksdb/table/plain_table_key_coding.cc
new file mode 100644
index 0000000..4f09b50
--- /dev/null
+++ b/src/rocksdb/table/plain_table_key_coding.cc
@@ -0,0 +1,323 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain_table_key_coding.h"
+
+#include "table/plain_table_factory.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+namespace {
+
+enum PlainTableEntryType : unsigned char {
+  kFullKey = 0,
+  kPrefixFromPreviousKey = 1,
+  kKeySuffix = 2,
+};
+
+// Control byte:
+// First two bits indicate type of entry
+// Other bytes are inlined sizes. If all bits are 1 (0x03F), overflow bytes
+// are used. key_size-0x3F will be encoded as a variint32 after this bytes.
+
+const unsigned char kSizeInlineLimit = 0x3F;
+
+// Return 0 for error
+size_t EncodeSize(PlainTableEntryType type, uint32_t key_size,
+                  char* out_buffer) {
+  out_buffer[0] = type << 6;
+
+  if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) {
+    // size inlined
+    out_buffer[0] |= static_cast<char>(key_size);
+    return 1;
+  } else {
+    out_buffer[0] |= kSizeInlineLimit;
+    char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit);
+    return ptr - out_buffer;
+  }
+}
+
+// Return position after the size byte(s). nullptr means error
+const char* DecodeSize(const char* offset, const char* limit,
+                       PlainTableEntryType* entry_type, uint32_t* key_size) {
+  assert(offset < limit);
+  *entry_type = static_cast<PlainTableEntryType>(
+      (static_cast<unsigned char>(offset[0]) & ~kSizeInlineLimit) >> 6);
+  char inline_key_size = offset[0] & kSizeInlineLimit;
+  if (inline_key_size < kSizeInlineLimit) {
+    *key_size = inline_key_size;
+    return offset + 1;
+  } else {
+    uint32_t extra_size;
+    const char* ptr = GetVarint32Ptr(offset + 1, limit, &extra_size);
+    if (ptr == nullptr) {
+      return nullptr;
+    }
+    *key_size = kSizeInlineLimit + extra_size;
+    return ptr;
+  }
+}
+}  // namespace
+
+Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file,
+                                       uint64_t* offset, char* meta_bytes_buf,
+                                       size_t* meta_bytes_buf_size) {
+  ParsedInternalKey parsed_key;
+  if (!ParseInternalKey(key, &parsed_key)) {
+    return Status::Corruption(Slice());
+  }
+
+  Slice key_to_write = key;  // Portion of internal key to write out.
+
+  uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8);
+  if (encoding_type_ == kPlain) {
+    if (fixed_user_key_len_ == kPlainTableVariableLength) {
+      // Write key length
+      char key_size_buf[5];  // tmp buffer for key size as varint32
+      char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+      assert(ptr <= key_size_buf + sizeof(key_size_buf));
+      auto len = ptr - key_size_buf;
+      Status s = file->Append(Slice(key_size_buf, len));
+      if (!s.ok()) {
+        return s;
+      }
+      *offset += len;
+    }
+  } else {
+    assert(encoding_type_ == kPrefix);
+    char size_bytes[12];
+    size_t size_bytes_pos = 0;
+
+    Slice prefix =
+        prefix_extractor_->Transform(Slice(key.data(), user_key_size));
+    if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetKey() ||
+        key_count_for_prefix_ % index_sparseness_ == 0) {
+      key_count_for_prefix_ = 1;
+      pre_prefix_.SetKey(prefix);
+      size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes);
+      Status s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!s.ok()) {
+        return s;
+      }
+      *offset += size_bytes_pos;
+    } else {
+      key_count_for_prefix_++;
+      if (key_count_for_prefix_ == 2) {
+        // For second key within a prefix, need to encode prefix length
+        size_bytes_pos +=
+            EncodeSize(kPrefixFromPreviousKey,
+                       static_cast<uint32_t>(pre_prefix_.GetKey().size()),
+                       size_bytes + size_bytes_pos);
+      }
+      uint32_t prefix_len = static_cast<uint32_t>(pre_prefix_.GetKey().size());
+      size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len,
+                                   size_bytes + size_bytes_pos);
+      Status s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!s.ok()) {
+        return s;
+      }
+      *offset += size_bytes_pos;
+      key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len);
+    }
+  }
+
+  // Encode full key
+  // For value size as varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to safe one file append call, which takes 1 byte.
+  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+    Status s =
+        file->Append(Slice(key_to_write.data(), key_to_write.size() - 8));
+    if (!s.ok()) {
+      return s;
+    }
+    *offset += key_to_write.size() - 8;
+    meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0;
+    *meta_bytes_buf_size += 1;
+  } else {
+    file->Append(key_to_write);
+    *offset += key_to_write.size();
+  }
+
+  return Status::OK();
+}
+
+namespace {
+Status ReadInternalKey(const char* key_ptr, const char* limit,
+                       uint32_t user_key_size, ParsedInternalKey* parsed_key,
+                       size_t* bytes_read, bool* internal_key_valid,
+                       Slice* internal_key) {
+  if (key_ptr + user_key_size + 1 >= limit) {
+    return Status::Corruption("Unexpected EOF when reading the next key");
+  }
+  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
+    // Special encoding for the row with seqID=0
+    parsed_key->user_key = Slice(key_ptr, user_key_size);
+    parsed_key->sequence = 0;
+    parsed_key->type = kTypeValue;
+    *bytes_read += user_key_size + 1;
+    *internal_key_valid = false;
+  } else {
+    if (key_ptr + user_key_size + 8 >= limit) {
+      return Status::Corruption(
+          "Unexpected EOF when reading internal bytes of the next key");
+    }
+    *internal_key_valid = true;
+    *internal_key = Slice(key_ptr, user_key_size + 8);
+    if (!ParseInternalKey(*internal_key, parsed_key)) {
+      return Status::Corruption(
+          Slice("Incorrect value type found when reading the next key"));
+    }
+    *bytes_read += user_key_size + 8;
+  }
+  return Status::OK();
+}
+}  // namespace
+
+Status PlainTableKeyDecoder::NextPlainEncodingKey(
+    const char* start, const char* limit, ParsedInternalKey* parsed_key,
+    Slice* internal_key, size_t* bytes_read, bool* seekable) {
+  const char* key_ptr = start;
+  uint32_t user_key_size = 0;
+  if (fixed_user_key_len_ != kPlainTableVariableLength) {
+    user_key_size = fixed_user_key_len_;
+    key_ptr = start;
+  } else {
+    uint32_t tmp_size = 0;
+    key_ptr = GetVarint32Ptr(start, limit, &tmp_size);
+    if (key_ptr == nullptr) {
+      return Status::Corruption(
+          "Unexpected EOF when reading the next key's size");
+    }
+    user_key_size = tmp_size;
+    *bytes_read = key_ptr - start;
+  }
+  // dummy initial value to avoid compiler complain
+  bool decoded_internal_key_valid = true;
+  Slice decoded_internal_key;
+  Status s =
+      ReadInternalKey(key_ptr, limit, user_key_size, parsed_key, bytes_read,
+                      &decoded_internal_key_valid, &decoded_internal_key);
+  if (!s.ok()) {
+    return s;
+  }
+  if (internal_key != nullptr) {
+    if (decoded_internal_key_valid) {
+      *internal_key = decoded_internal_key;
+    } else {
+      // Need to copy out the internal key
+      cur_key_.SetInternalKey(*parsed_key);
+      *internal_key = cur_key_.GetKey();
+    }
+  }
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPrefixEncodingKey(
+    const char* start, const char* limit, ParsedInternalKey* parsed_key,
+    Slice* internal_key, size_t* bytes_read, bool* seekable) {
+  const char* key_ptr = start;
+  PlainTableEntryType entry_type;
+
+  bool expect_suffix = false;
+  do {
+    uint32_t size = 0;
+    // dummy initial value to avoid compiler complain
+    bool decoded_internal_key_valid = true;
+    const char* pos = DecodeSize(key_ptr, limit, &entry_type, &size);
+    if (pos == nullptr) {
+      return Status::Corruption("Unexpected EOF when reading size of the key");
+    }
+    *bytes_read += pos - key_ptr;
+    key_ptr = pos;
+
+    switch (entry_type) {
+      case kFullKey: {
+        expect_suffix = false;
+        Slice decoded_internal_key;
+        Status s =
+            ReadInternalKey(key_ptr, limit, size, parsed_key, bytes_read,
+                            &decoded_internal_key_valid, &decoded_internal_key);
+        if (!s.ok()) {
+          return s;
+        }
+        saved_user_key_ = parsed_key->user_key;
+        if (internal_key != nullptr) {
+          if (decoded_internal_key_valid) {
+            *internal_key = decoded_internal_key;
+          } else {
+            cur_key_.SetInternalKey(*parsed_key);
+            *internal_key = cur_key_.GetKey();
+          }
+        }
+        break;
+      }
+      case kPrefixFromPreviousKey: {
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+        prefix_len_ = size;
+        assert(prefix_extractor_ == nullptr ||
+               prefix_extractor_->Transform(saved_user_key_).size() ==
+                   prefix_len_);
+        // Need read another size flag for suffix
+        expect_suffix = true;
+        break;
+      }
+      case kKeySuffix: {
+        expect_suffix = false;
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+        cur_key_.Reserve(prefix_len_ + size);
+
+        Slice tmp_slice;
+        Status s = ReadInternalKey(key_ptr, limit, size, parsed_key, bytes_read,
+                                   &decoded_internal_key_valid, &tmp_slice);
+        if (!s.ok()) {
+          return s;
+        }
+        cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
+                                *parsed_key);
+        assert(
+            prefix_extractor_ == nullptr ||
+            prefix_extractor_->Transform(ExtractUserKey(cur_key_.GetKey())) ==
+                Slice(saved_user_key_.data(), prefix_len_));
+        parsed_key->user_key = ExtractUserKey(cur_key_.GetKey());
+        if (internal_key != nullptr) {
+          *internal_key = cur_key_.GetKey();
+        }
+        break;
+      }
+      default:
+        return Status::Corruption("Identified size flag.");
+    }
+  } while (expect_suffix);  // Another round if suffix is expected.
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextKey(const char* start, const char* limit,
+                                     ParsedInternalKey* parsed_key,
+                                     Slice* internal_key, size_t* bytes_read,
+                                     bool* seekable) {
+  *bytes_read = 0;
+  if (seekable != nullptr) {
+    *seekable = true;
+  }
+  if (encoding_type_ == kPlain) {
+    return NextPlainEncodingKey(start, limit, parsed_key, internal_key,
+                                bytes_read, seekable);
+  } else {
+    assert(encoding_type_ == kPrefix);
+    return NextPrefixEncodingKey(start, limit, parsed_key, internal_key,
+                                 bytes_read, seekable);
+  }
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain_table_key_coding.h b/src/rocksdb/table/plain_table_key_coding.h
new file mode 100644
index 0000000..9047087
--- /dev/null
+++ b/src/rocksdb/table/plain_table_key_coding.h
@@ -0,0 +1,97 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/slice.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+class WritableFile;
+struct ParsedInternalKey;
+
+// Helper class to write out a key to an output file
+// Actual data format of the key is documented in plain_table_factory.h
+class PlainTableKeyEncoder {
+ public:
+  explicit PlainTableKeyEncoder(EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor,
+                                size_t index_sparseness)
+      : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
+        key_count_for_prefix_(0) {}
+  // key: the key to write out, in the format of internal key.
+  // file: the output file to write out
+  // offset: offset in the file. Needs to be updated after appending bytes
+  //         for the key
+  // meta_bytes_buf: buffer for extra meta bytes
+  // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
+  //                      if meta_bytes_buf is updated.
+  Status AppendKey(const Slice& key, WritableFile* file, uint64_t* offset,
+                   char* meta_bytes_buf, size_t* meta_bytes_buf_size);
+
+  // Return actual encoding type to be picked
+  EncodingType GetEncodingType() { return encoding_type_; }
+
+ private:
+  EncodingType encoding_type_;
+  uint32_t fixed_user_key_len_;
+  const SliceTransform* prefix_extractor_;
+  const size_t index_sparseness_;
+  size_t key_count_for_prefix_;
+  IterKey pre_prefix_;
+};
+
+// A helper class to decode keys from input buffer
+// Actual data format of the key is documented in plain_table_factory.h
+class PlainTableKeyDecoder {
+ public:
+  explicit PlainTableKeyDecoder(EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor)
+      : encoding_type_(encoding_type),
+        prefix_len_(0),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        in_prefix_(false) {}
+  // Find the next key.
+  // start: char array where the key starts.
+  // limit: boundary of the char array
+  // parsed_key: the output of the result key
+  // internal_key: if not null, fill with the output of the result key in
+  //               un-parsed format
+  // bytes_read: how many bytes read from start. Output
+  // seekable: whether key can be read from this place. Used when building
+  //           indexes. Output.
+  Status NextKey(const char* start, const char* limit,
+                 ParsedInternalKey* parsed_key, Slice* internal_key,
+                 size_t* bytes_read, bool* seekable = nullptr);
+  EncodingType encoding_type_;
+  uint32_t prefix_len_;
+  uint32_t fixed_user_key_len_;
+  Slice saved_user_key_;
+  IterKey cur_key_;
+  const SliceTransform* prefix_extractor_;
+  bool in_prefix_;
+
+ private:
+  Status NextPlainEncodingKey(const char* start, const char* limit,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, size_t* bytes_read,
+                              bool* seekable = nullptr);
+  Status NextPrefixEncodingKey(const char* start, const char* limit,
+                               ParsedInternalKey* parsed_key,
+                               Slice* internal_key, size_t* bytes_read,
+                               bool* seekable = nullptr);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain_table_reader.cc b/src/rocksdb/table/plain_table_reader.cc
index f1cb3db..c409204 100644
--- a/src/rocksdb/table/plain_table_reader.cc
+++ b/src/rocksdb/table/plain_table_reader.cc
@@ -3,6 +3,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #ifndef ROCKSDB_LITE
+
 #include "table/plain_table_reader.h"
 
 #include <string>
@@ -18,11 +19,14 @@
 #include "rocksdb/statistics.h"
 
 #include "table/block.h"
+#include "table/bloom_block.h"
 #include "table/filter_block.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/two_level_iterator.h"
 #include "table/plain_table_factory.h"
+#include "table/plain_table_key_coding.h"
+#include "table/get_context.h"
 
 #include "util/arena.h"
 #include "util/coding.h"
@@ -32,26 +36,18 @@
 #include "util/murmurhash.h"
 #include "util/perf_context_imp.h"
 #include "util/stop_watch.h"
+#include "util/string_util.h"
 
 
 namespace rocksdb {
 
 namespace {
 
-inline uint32_t GetSliceHash(const Slice& s) {
-  return Hash(s.data(), s.size(), 397) ;
-}
-
-inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
-  return hash % num_buckets;
-}
-
 // Safely getting a uint32_t element from a char array, where, starting from
 // `base`, every 4 bytes are considered as an fixed 32 bit integer.
 inline uint32_t GetFixed32Element(const char* base, size_t offset) {
   return DecodeFixed32(base + offset * sizeof(uint32_t));
 }
-
 }  // namespace
 
 // Iterator to iterate IndexedTable
@@ -60,30 +56,31 @@ class PlainTableIterator : public Iterator {
   explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
   ~PlainTableIterator();
 
-  bool Valid() const;
+  bool Valid() const override;
 
-  void SeekToFirst();
+  void SeekToFirst() override;
 
-  void SeekToLast();
+  void SeekToLast() override;
 
-  void Seek(const Slice& target);
+  void Seek(const Slice& target) override;
 
-  void Next();
+  void Next() override;
 
-  void Prev();
+  void Prev() override;
 
-  Slice key() const;
+  Slice key() const override;
 
-  Slice value() const;
+  Slice value() const override;
 
-  Status status() const;
+  Status status() const override;
 
  private:
   PlainTableReader* table_;
+  PlainTableKeyDecoder decoder_;
   bool use_prefix_seek_;
   uint32_t offset_;
   uint32_t next_offset_;
-  IterKey key_;
+  Slice key_;
   Slice value_;
   Status status_;
   // No copying allowed
@@ -92,63 +89,97 @@ class PlainTableIterator : public Iterator {
 };
 
 extern const uint64_t kPlainTableMagicNumber;
-PlainTableReader::PlainTableReader(
-    const Options& options, unique_ptr<RandomAccessFile>&& file,
-    const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
-    uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
-    size_t index_sparseness, const TableProperties* table_properties,
-    size_t huge_page_tlb_size)
-    : options_(options),
-      soptions_(storage_options),
+PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions,
+                                   unique_ptr<RandomAccessFile>&& file,
+                                   const EnvOptions& storage_options,
+                                   const InternalKeyComparator& icomparator,
+                                   EncodingType encoding_type,
+                                   uint64_t file_size,
+                                   const TableProperties* table_properties)
+    : internal_comparator_(icomparator),
+      encoding_type_(encoding_type),
+      full_scan_mode_(false),
+      data_end_offset_(static_cast<uint32_t>(table_properties->data_size)),
+      user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
+      prefix_extractor_(ioptions.prefix_extractor),
+      enable_bloom_(false),
+      bloom_(6, nullptr),
+      ioptions_(ioptions),
       file_(std::move(file)),
-      internal_comparator_(icomparator),
       file_size_(file_size),
-      kHashTableRatio(hash_table_ratio),
-      kBloomBitsPerKey(bloom_bits_per_key),
-      kIndexIntervalForSamePrefixKeys(index_sparseness),
-      table_properties_(nullptr),
-      data_end_offset_(table_properties->data_size),
-      user_key_len_(table_properties->fixed_key_len),
-      huge_page_tlb_size_(huge_page_tlb_size) {
-  assert(kHashTableRatio >= 0.0);
-}
+      table_properties_(nullptr) {}
 
 PlainTableReader::~PlainTableReader() {
 }
 
-Status PlainTableReader::Open(const Options& options,
-                              const EnvOptions& soptions,
+Status PlainTableReader::Open(const ImmutableCFOptions& ioptions,
+                              const EnvOptions& env_options,
                               const InternalKeyComparator& internal_comparator,
                               unique_ptr<RandomAccessFile>&& file,
                               uint64_t file_size,
                               unique_ptr<TableReader>* table_reader,
                               const int bloom_bits_per_key,
                               double hash_table_ratio, size_t index_sparseness,
-                              size_t huge_page_tlb_size) {
-  assert(options.allow_mmap_reads);
-
-  if (file_size > kMaxFileSize) {
+                              size_t huge_page_tlb_size, bool full_scan_mode) {
+  assert(ioptions.allow_mmap_reads);
+  if (file_size > PlainTableIndex::kMaxFileSize) {
     return Status::NotSupported("File is too large for PlainTableReader!");
   }
 
   TableProperties* props = nullptr;
   auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
-                               options.env, options.info_log.get(), &props);
+                               ioptions.env, ioptions.info_log, &props);
   if (!s.ok()) {
     return s;
   }
 
+  assert(hash_table_ratio >= 0.0);
+  auto& user_props = props->user_collected_properties;
+  auto prefix_extractor_in_file =
+      user_props.find(PlainTablePropertyNames::kPrefixExtractorName);
+
+  if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) {
+    if (!ioptions.prefix_extractor) {
+      return Status::InvalidArgument(
+          "Prefix extractor is missing when opening a PlainTable built "
+          "using a prefix extractor");
+    } else if (prefix_extractor_in_file->second.compare(
+                   ioptions.prefix_extractor->Name()) != 0) {
+      return Status::InvalidArgument(
+          "Prefix extractor given doesn't match the one used to build "
+          "PlainTable");
+    }
+  }
+
+  EncodingType encoding_type = kPlain;
+  auto encoding_type_prop =
+      user_props.find(PlainTablePropertyNames::kEncodingType);
+  if (encoding_type_prop != user_props.end()) {
+    encoding_type = static_cast<EncodingType>(
+        DecodeFixed32(encoding_type_prop->second.c_str()));
+  }
+
   std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
-      options, std::move(file), soptions, internal_comparator, file_size,
-      bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
-      huge_page_tlb_size));
+      ioptions, std::move(file), env_options, internal_comparator,
+      encoding_type, file_size, props));
 
-  // -- Populate Index
-  s = new_reader->PopulateIndex(props);
+  s = new_reader->MmapDataFile();
   if (!s.ok()) {
     return s;
   }
 
+  if (!full_scan_mode) {
+    s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio,
+                                  index_sparseness, huge_page_tlb_size);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    // Flag to indicate it is a full scan mode so that none of the indexes
+    // can be used.
+    new_reader->full_scan_mode_ = true;
+  }
+
   *table_reader = std::move(new_reader);
   return s;
 }
@@ -156,276 +187,209 @@ Status PlainTableReader::Open(const Options& options,
 void PlainTableReader::SetupForCompaction() {
 }
 
-Iterator* PlainTableReader::NewIterator(const ReadOptions& options) {
-  return new PlainTableIterator(this, options_.prefix_extractor != nullptr);
-}
-
-struct PlainTableReader::IndexRecord {
-  uint32_t hash; // hash of the prefix
-  uint32_t offset; // offset of a row
-  IndexRecord* next;
-};
-
-// Helper class to track all the index records
-class PlainTableReader::IndexRecordList {
- public:
-  explicit IndexRecordList(size_t num_records_per_group)
-      : kNumRecordsPerGroup(num_records_per_group),
-        current_group_(nullptr),
-        num_records_in_current_group_(num_records_per_group) {}
-
-  ~IndexRecordList() {
-    for (size_t i = 0; i < groups_.size(); i++) {
-      delete[] groups_[i];
-    }
-  }
-
-  void AddRecord(murmur_t hash, uint32_t offset) {
-    if (num_records_in_current_group_ == kNumRecordsPerGroup) {
-      current_group_ = AllocateNewGroup();
-      num_records_in_current_group_ = 0;
-    }
-    auto& new_record = current_group_[num_records_in_current_group_++];
-    new_record.hash = hash;
-    new_record.offset = offset;
-    new_record.next = nullptr;
+Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
+                                        Arena* arena) {
+  if (options.total_order_seek && !IsTotalOrderMode()) {
+    return NewErrorIterator(
+        Status::InvalidArgument("total_order_seek not supported"), arena);
   }
-
-  size_t GetNumRecords() const {
-    return (groups_.size() - 1) * kNumRecordsPerGroup +
-           num_records_in_current_group_;
-  }
-  IndexRecord* At(size_t index) {
-    return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+  if (arena == nullptr) {
+    return new PlainTableIterator(this, prefix_extractor_ != nullptr);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
+    return new (mem) PlainTableIterator(this, prefix_extractor_ != nullptr);
   }
+}
 
- private:
-  IndexRecord* AllocateNewGroup() {
-    IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
-    groups_.push_back(result);
-    return result;
-  }
-
-  // Each group in `groups_` contains fix-sized records (determined by
-  // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
-  // occurs.
-  const size_t kNumRecordsPerGroup;
-  IndexRecord* current_group_;
-  // List of arrays allocated
-  std::vector<IndexRecord*> groups_;
-  size_t num_records_in_current_group_;
-};
-
-Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
-                                                 int* num_prefixes) const {
+Status PlainTableReader::PopulateIndexRecordList(
+    PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
   Slice prev_key_prefix_slice;
-  uint32_t prev_key_prefix_hash = 0;
   uint32_t pos = data_start_offset_;
-  int num_keys_per_prefix = 0;
-  bool is_first_record = true;
-  HistogramImpl keys_per_prefix_hist;
-  // Need map to be ordered to make sure sub indexes generated
-  // are in order.
 
-  *num_prefixes = 0;
+  bool is_first_record = true;
+  Slice key_prefix_slice;
+  PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
+                               ioptions_.prefix_extractor);
   while (pos < data_end_offset_) {
     uint32_t key_offset = pos;
     ParsedInternalKey key;
     Slice value_slice;
-    Status s = Next(&pos, &key, &value_slice);
+    bool seekable = false;
+    Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
     if (!s.ok()) {
       return s;
     }
-    if (bloom_) {
-      // total order mode and bloom filter is enabled.
-      bloom_->AddHash(GetSliceHash(key.user_key));
-    }
-    Slice key_prefix_slice = GetPrefix(key);
 
-    if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
-      ++(*num_prefixes);
-      if (!is_first_record) {
-        keys_per_prefix_hist.Add(num_keys_per_prefix);
+    key_prefix_slice = GetPrefix(key);
+    if (enable_bloom_) {
+      bloom_.AddHash(GetSliceHash(key.user_key));
+    } else {
+      if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+        if (!is_first_record) {
+          prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
+        }
+        prev_key_prefix_slice = key_prefix_slice;
       }
-      num_keys_per_prefix = 0;
-      prev_key_prefix_slice = key_prefix_slice;
-      prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
     }
 
-    if (kIndexIntervalForSamePrefixKeys == 0 ||
-        num_keys_per_prefix++ % kIndexIntervalForSamePrefixKeys == 0) {
-      // Add an index key for every kIndexIntervalForSamePrefixKeys keys
-      record_list->AddRecord(prev_key_prefix_hash, key_offset);
+    index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
+
+    if (!seekable && is_first_record) {
+      return Status::Corruption("Key for a prefix is not seekable");
     }
+
     is_first_record = false;
   }
 
-  keys_per_prefix_hist.Add(num_keys_per_prefix);
-  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
-      keys_per_prefix_hist.ToString().c_str());
-
-  return Status::OK();
+  prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
+  auto s = index_.InitFromRawData(index_builder->Finish());
+  return s;
 }
 
-void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
-  if (options_.prefix_extractor.get() != nullptr) {
-    uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
+void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key,
+                                            int num_prefixes,
+                                            size_t huge_page_tlb_size,
+                                            vector<uint32_t>* prefix_hashes) {
+  if (!IsTotalOrderMode()) {
+    uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key;
     if (bloom_total_bits > 0) {
-      bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
-                                    6, nullptr, huge_page_tlb_size_));
+      enable_bloom_ = true;
+      bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
+                          huge_page_tlb_size, ioptions_.info_log);
+      FillBloom(prefix_hashes);
     }
   }
-
-  if (options_.prefix_extractor.get() == nullptr || kHashTableRatio <= 0) {
-    // Fall back to pure binary search if the user fails to specify a prefix
-    // extractor.
-    index_size_ = 1;
-  } else {
-    double hash_table_size_multipier = 1.0 / kHashTableRatio;
-    index_size_ = num_prefixes * hash_table_size_multipier + 1;
-  }
 }
 
-size_t PlainTableReader::BucketizeIndexesAndFillBloom(
-    IndexRecordList* record_list, std::vector<IndexRecord*>* hash_to_offsets,
-    std::vector<uint32_t>* entries_per_bucket) {
-  bool first = true;
-  uint32_t prev_hash = 0;
-  size_t num_records = record_list->GetNumRecords();
-  for (size_t i = 0; i < num_records; i++) {
-    IndexRecord* index_record = record_list->At(i);
-    uint32_t cur_hash = index_record->hash;
-    if (first || prev_hash != cur_hash) {
-      prev_hash = cur_hash;
-      first = false;
-      if (bloom_ && !IsTotalOrderMode()) {
-        bloom_->AddHash(cur_hash);
-      }
-    }
-    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
-    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
-    index_record->next = prev_bucket_head;
-    (*hash_to_offsets)[bucket] = index_record;
-    (*entries_per_bucket)[bucket]++;
-  }
-  size_t sub_index_size = 0;
-  for (auto entry_count : *entries_per_bucket) {
-    if (entry_count <= 1) {
-      continue;
-    }
-    // Only buckets with more than 1 entry will have subindex.
-    sub_index_size += VarintLength(entry_count);
-    // total bytes needed to store these entries' in-file offsets.
-    sub_index_size += entry_count * kOffsetLen;
+void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
+  assert(bloom_.IsInitialized());
+  for (auto prefix_hash : *prefix_hashes) {
+    bloom_.AddHash(prefix_hash);
   }
-  return sub_index_size;
 }
 
-void PlainTableReader::FillIndexes(
-    const size_t kSubIndexSize,
-    const std::vector<IndexRecord*>& hash_to_offsets,
-    const std::vector<uint32_t>& entries_per_bucket) {
-  Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
-      kSubIndexSize);
-  auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
-  char* allocated =
-      arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
-  index_ = reinterpret_cast<uint32_t*>(allocated);
-  sub_index_ = allocated + sizeof(uint32_t) * index_size_;
-
-  size_t sub_index_offset = 0;
-  for (int i = 0; i < index_size_; i++) {
-    uint32_t num_keys_for_bucket = entries_per_bucket[i];
-    switch (num_keys_for_bucket) {
-    case 0:
-      // No key for bucket
-      index_[i] = data_end_offset_;
-      break;
-    case 1:
-      // point directly to the file offset
-      index_[i] = hash_to_offsets[i]->offset;
-      break;
-    default:
-      // point to second level indexes.
-      index_[i] = sub_index_offset | kSubIndexMask;
-      char* prev_ptr = &sub_index_[sub_index_offset];
-      char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
-      sub_index_offset += (cur_ptr - prev_ptr);
-      char* sub_index_pos = &sub_index_[sub_index_offset];
-      IndexRecord* record = hash_to_offsets[i];
-      int j;
-      for (j = num_keys_for_bucket - 1; j >= 0 && record;
-           j--, record = record->next) {
-        EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
-      }
-      assert(j == -1 && record == nullptr);
-      sub_index_offset += kOffsetLen * num_keys_for_bucket;
-      assert(sub_index_offset <= kSubIndexSize);
-      break;
-    }
-  }
-  assert(sub_index_offset == kSubIndexSize);
-
-  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
-      index_size_, kSubIndexSize);
+Status PlainTableReader::MmapDataFile() {
+  // Get mmapped memory to file_data_.
+  return file_->Read(0, file_size_, &file_data_, nullptr);
 }
 
-Status PlainTableReader::PopulateIndex(TableProperties* props) {
+Status PlainTableReader::PopulateIndex(TableProperties* props,
+                                       int bloom_bits_per_key,
+                                       double hash_table_ratio,
+                                       size_t index_sparseness,
+                                       size_t huge_page_tlb_size) {
   assert(props != nullptr);
   table_properties_.reset(props);
 
-  // options.prefix_extractor is requried for a hash-based look-up.
-  if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) {
-    return Status::NotSupported(
-        "PlainTable requires a prefix extractor enable prefix hash mode.");
+  BlockContents bloom_block_contents;
+  auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
+                         ioptions_.env, BloomBlockBuilder::kBloomBlock,
+                         &bloom_block_contents);
+  bool index_in_file = s.ok();
+
+  BlockContents index_block_contents;
+  s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
+      ioptions_.env, PlainTableIndexBuilder::kPlainTableIndexBlock,
+      &index_block_contents);
+
+  index_in_file &= s.ok();
+
+  Slice* bloom_block;
+  if (index_in_file) {
+    bloom_block = &bloom_block_contents.data;
+  } else {
+    bloom_block = nullptr;
   }
 
-  // Get mmapped memory to file_data_.
-  Status s = file_->Read(0, file_size_, &file_data_, nullptr);
-  if (!s.ok()) {
-    return s;
+  // index_in_file == true only if there are kBloomBlock and
+  // kPlainTableIndexBlock
+  // in file
+
+  Slice* index_block;
+  if (index_in_file) {
+    index_block = &index_block_contents.data;
+  } else {
+    index_block = nullptr;
+  }
+
+  if ((ioptions_.prefix_extractor == nullptr) &&
+      (hash_table_ratio != 0)) {
+    // ioptions.prefix_extractor is requried for a hash-based look-up.
+    return Status::NotSupported(
+        "PlainTable requires a prefix extractor enable prefix hash mode.");
   }
 
-  IndexRecordList record_list(kRecordsPerGroup);
   // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
   // for a prefix (starting from the first one), generate a record of (hash,
   // offset) and append it to IndexRecordList, which is a data structure created
   // to store them.
-  int num_prefixes;
 
-  // Allocate bloom filter here for total order mode.
-  if (IsTotalOrderMode()) {
-    uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
-    if (num_bloom_bits > 0) {
-      bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
-                                    nullptr, huge_page_tlb_size_));
+  if (!index_in_file) {
+    // Allocate bloom filter here for total order mode.
+    if (IsTotalOrderMode()) {
+      uint32_t num_bloom_bits =
+          static_cast<uint32_t>(table_properties_->num_entries) *
+          bloom_bits_per_key;
+      if (num_bloom_bits > 0) {
+        enable_bloom_ = true;
+        bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality,
+                            huge_page_tlb_size, ioptions_.info_log);
+      }
     }
+  } else {
+    enable_bloom_ = true;
+    auto num_blocks_property = props->user_collected_properties.find(
+        PlainTablePropertyNames::kNumBloomBlocks);
+
+    uint32_t num_blocks = 0;
+    if (num_blocks_property != props->user_collected_properties.end()) {
+      Slice temp_slice(num_blocks_property->second);
+      if (!GetVarint32(&temp_slice, &num_blocks)) {
+        num_blocks = 0;
+      }
+    }
+    // cast away const qualifier, because bloom_ won't be changed
+    bloom_.SetRawData(
+        const_cast<unsigned char*>(
+            reinterpret_cast<const unsigned char*>(bloom_block->data())),
+        static_cast<uint32_t>(bloom_block->size()) * 8, num_blocks);
   }
 
-  s = PopulateIndexRecordList(&record_list, &num_prefixes);
-  if (!s.ok()) {
-    return s;
+  PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness,
+                                       hash_table_ratio, huge_page_tlb_size);
+
+  std::vector<uint32_t> prefix_hashes;
+  if (!index_in_file) {
+    s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    s = index_.InitFromRawData(*index_block);
+    if (!s.ok()) {
+      return s;
+    }
   }
-  // Calculated hash table and bloom filter size and allocate memory for indexes
-  // and bloom filter based on the number of prefixes.
-  AllocateIndexAndBloom(num_prefixes);
 
-  // Bucketize all the index records to a temp data structure, in which for
-  // each bucket, we generate a linked list of IndexRecord, in reversed order.
-  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
-  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
-  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
-      &record_list, &hash_to_offsets, &entries_per_bucket);
-  // From the temp data structure, populate indexes.
-  FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket);
+  if (!index_in_file) {
+    // Calculated bloom filter size and allocate memory for
+    // bloom filter based on the number of prefixes, then fill it.
+    AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+                         huge_page_tlb_size, &prefix_hashes);
+  }
 
   // Fill two table properties.
-  // TODO(sdong): after we have the feature of storing index in file, this
-  // properties need to be populated to index_size instead.
-  props->user_collected_properties["plain_table_hash_table_size"] =
-      std::to_string(index_size_ * 4U);
-  props->user_collected_properties["plain_table_sub_index_size"] =
-      std::to_string(sub_index_size_needed);
+  if (!index_in_file) {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        ToString(index_.GetSubIndexSize());
+  } else {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        ToString(0);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        ToString(0);
+  }
 
   return Status::OK();
 }
@@ -434,24 +398,21 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
                                    uint32_t prefix_hash, bool& prefix_matched,
                                    uint32_t* offset) const {
   prefix_matched = false;
-  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
-  uint32_t bucket_value = index_[bucket];
-  if (bucket_value == data_end_offset_) {
+  uint32_t prefix_index_offset;
+  auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+  if (res == PlainTableIndex::kNoPrefixForBucket) {
     *offset = data_end_offset_;
     return Status::OK();
-  } else if ((bucket_value & kSubIndexMask) == 0) {
-    // point directly to the file
-    *offset = bucket_value;
+  } else if (res == PlainTableIndex::kDirectToFile) {
+    *offset = prefix_index_offset;
     return Status::OK();
   }
 
   // point to sub-index, need to do a binary search
+  uint32_t upper_bound;
+  const char* base_ptr =
+      index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
   uint32_t low = 0;
-  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
-
-  const char* index_ptr = &sub_index_[prefix_index_offset];
-  uint32_t upper_bound = 0;
-  const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
   uint32_t high = upper_bound;
   ParsedInternalKey mid_key;
   ParsedInternalKey parsed_target;
@@ -464,7 +425,11 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
     uint32_t mid = (high + low) / 2;
     uint32_t file_offset = GetFixed32Element(base_ptr, mid);
     size_t tmp;
-    Status s = ReadKey(file_data_.data() + file_offset, &mid_key, &tmp);
+    Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_,
+                                    ioptions_.prefix_extractor)
+                   .NextKey(file_data_.data() + file_offset,
+                            file_data_.data() + data_end_offset_, &mid_key,
+                            nullptr, &tmp);
     if (!s.ok()) {
       return s;
     }
@@ -489,7 +454,15 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
   ParsedInternalKey low_key;
   size_t tmp;
   uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
-  Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, &tmp);
+  Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_,
+                                  ioptions_.prefix_extractor)
+                 .NextKey(file_data_.data() + low_key_offset,
+                          file_data_.data() + data_end_offset_, &low_key,
+                          nullptr, &tmp);
+  if (!s.ok()) {
+    return s;
+  }
+
   if (GetPrefix(low_key) == prefix) {
     prefix_matched = true;
     *offset = low_key_offset;
@@ -506,59 +479,14 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
 }
 
 bool PlainTableReader::MatchBloom(uint32_t hash) const {
-  return bloom_.get() == nullptr || bloom_->MayContainHash(hash);
-}
-
-Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
-  return GetPrefixFromUserKey(target.user_key);
+  return !enable_bloom_ || bloom_.MayContainHash(hash);
 }
 
-Status PlainTableReader::ReadKey(const char* start, ParsedInternalKey* key,
-                                 size_t* bytes_read) const {
-  const char* key_ptr = nullptr;
-  *bytes_read = 0;
-  size_t user_key_size = 0;
-  if (IsFixedLength()) {
-    user_key_size = user_key_len_;
-    key_ptr = start;
-  } else {
-    uint32_t tmp_size = 0;
-    key_ptr =
-        GetVarint32Ptr(start, file_data_.data() + data_end_offset_, &tmp_size);
-    if (key_ptr == nullptr) {
-      return Status::Corruption(
-          "Unexpected EOF when reading the next key's size");
-    }
-    user_key_size = (size_t)tmp_size;
-    *bytes_read = key_ptr - start;
-  }
-  if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) {
-    return Status::Corruption("Unexpected EOF when reading the next key");
-  }
-
-  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
-    // Special encoding for the row with seqID=0
-    key->user_key = Slice(key_ptr, user_key_size);
-    key->sequence = 0;
-    key->type = kTypeValue;
-    *bytes_read += user_key_size + 1;
-  } else {
-    if (start + user_key_size + 8 >= file_data_.data() + data_end_offset_) {
-      return Status::Corruption(
-          "Unexpected EOF when reading internal bytes of the next key");
-    }
-    if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) {
-      return Status::Corruption(
-          Slice("Incorrect value type found when reading the next key"));
-    }
-    *bytes_read += user_key_size + 8;
-  }
-
-  return Status::OK();
-}
 
-Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key,
-                              Slice* value) const {
+Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, Slice* value,
+                              bool* seekable) const {
   if (*offset == data_end_offset_) {
     *offset = data_end_offset_;
     return Status::OK();
@@ -570,7 +498,9 @@ Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key,
 
   const char* start = file_data_.data() + *offset;
   size_t bytes_for_key;
-  Status s = ReadKey(start, key, &bytes_for_key);
+  Status s =
+      decoder->NextKey(start, file_data_.data() + data_end_offset_, parsed_key,
+                       internal_key, &bytes_for_key, seekable);
   if (!s.ok()) {
     return s;
   }
@@ -581,7 +511,7 @@ Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key,
     return Status::Corruption(
         "Unexpected EOF when reading the next value's size.");
   }
-  *offset = *offset + (value_ptr - start) + value_size;
+  *offset = *offset + static_cast<uint32_t>(value_ptr - start) + value_size;
   if (*offset > data_end_offset_) {
     return Status::Corruption("Unexpected EOF when reading the next value. ");
   }
@@ -590,15 +520,23 @@ Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key,
   return Status::OK();
 }
 
+void PlainTableReader::Prepare(const Slice& target) {
+  if (enable_bloom_) {
+    uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
+    bloom_.Prefetch(prefix_hash);
+  }
+}
+
 Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
-                             void* arg,
-                             bool (*saver)(void*, const ParsedInternalKey&,
-                                           const Slice&, bool),
-                             void (*mark_key_may_exist)(void*)) {
+                             GetContext* get_context) {
   // Check bloom filter first.
   Slice prefix_slice;
   uint32_t prefix_hash;
   if (IsTotalOrderMode()) {
+    if (full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Get() is not allowed in full scan mode.");
+    }
     // Match whole user key for bloom filter check.
     if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
       return Status::OK();
@@ -626,10 +564,11 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
   if (!ParseInternalKey(target, &parsed_target)) {
     return Status::Corruption(Slice());
   }
-
   Slice found_value;
+  PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
+                               ioptions_.prefix_extractor);
   while (offset < data_end_offset_) {
-    Status s = Next(&offset, &found_key, &found_value);
+    s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
     if (!s.ok()) {
       return s;
     }
@@ -641,8 +580,10 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
       }
       prefix_match = true;
     }
+    // TODO(ljin): since we know the key comparison result here,
+    // can we enable the fast path?
     if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
-      if (!(*saver)(arg, found_key, found_value, true)) {
+      if (!get_context->SaveValue(found_key, found_value)) {
         break;
       }
     }
@@ -656,7 +597,10 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) {
 
 PlainTableIterator::PlainTableIterator(PlainTableReader* table,
                                        bool use_prefix_seek)
-    : table_(table), use_prefix_seek_(use_prefix_seek) {
+    : table_(table),
+      decoder_(table_->encoding_type_, table_->user_key_len_,
+               table_->prefix_extractor_),
+      use_prefix_seek_(use_prefix_seek) {
   next_offset_ = offset_ = table_->data_end_offset_;
 }
 
@@ -685,12 +629,20 @@ void PlainTableIterator::SeekToLast() {
 void PlainTableIterator::Seek(const Slice& target) {
   // If the user doesn't set prefix seek option and we are not able to do a
   // total Seek(). assert failure.
-  if (!use_prefix_seek_ && table_->index_size_ > 1) {
-    assert(false);
-    status_ = Status::NotSupported(
-        "PlainTable cannot issue non-prefix seek unless in total order mode.");
-    offset_ = next_offset_ = table_->data_end_offset_;
-    return;
+  if (!use_prefix_seek_) {
+    if (table_->full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Seek() is not allowed in full scan mode.");
+      offset_ = next_offset_ = table_->data_end_offset_;
+      return;
+    } else if (table_->GetIndexSize() > 1) {
+      assert(false);
+      status_ = Status::NotSupported(
+          "PlainTable cannot issue non-prefix seek unless in total order "
+          "mode.");
+      offset_ = next_offset_ = table_->data_end_offset_;
+      return;
+    }
   }
 
   Slice prefix_slice = table_->GetPrefix(target);
@@ -735,11 +687,9 @@ void PlainTableIterator::Next() {
   if (offset_ < table_->data_end_offset_) {
     Slice tmp_slice;
     ParsedInternalKey parsed_key;
-    status_ = table_->Next(&next_offset_, &parsed_key, &value_);
-    if (status_.ok()) {
-      // Make a copy in this case. TODO optimize.
-      key_.SetInternalKey(parsed_key);
-    } else {
+    status_ =
+        table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
+    if (!status_.ok()) {
       offset_ = next_offset_ = table_->data_end_offset_;
     }
   }
@@ -751,7 +701,7 @@ void PlainTableIterator::Prev() {
 
 Slice PlainTableIterator::key() const {
   assert(Valid());
-  return key_.GetKey();
+  return key_;
 }
 
 Slice PlainTableIterator::value() const {
diff --git a/src/rocksdb/table/plain_table_reader.h b/src/rocksdb/table/plain_table_reader.h
index e6373dc..b4f68a0 100644
--- a/src/rocksdb/table/plain_table_reader.h
+++ b/src/rocksdb/table/plain_table_reader.h
@@ -19,11 +19,14 @@
 #include "rocksdb/table_properties.h"
 #include "table/table_reader.h"
 #include "table/plain_table_factory.h"
+#include "table/plain_table_index.h"
 #include "util/arena.h"
+#include "util/dynamic_bloom.h"
 
 namespace rocksdb {
 
 class Block;
+struct BlockContents;
 class BlockHandle;
 class Footer;
 struct Options;
@@ -31,11 +34,13 @@ class RandomAccessFile;
 struct ReadOptions;
 class TableCache;
 class TableReader;
-class DynamicBloom;
 class InternalKeyComparator;
+class PlainTableKeyDecoder;
+class GetContext;
 
 using std::unique_ptr;
 using std::unordered_map;
+using std::vector;
 extern const uint32_t kPlainTableVariableLength;
 
 // Based on following output file format shown in plain_table_factory.h
@@ -48,35 +53,41 @@ extern const uint32_t kPlainTableVariableLength;
 // The implementation of IndexedTableReader requires output file is mmaped
 class PlainTableReader: public TableReader {
  public:
-  static Status Open(const Options& options, const EnvOptions& soptions,
+  static Status Open(const ImmutableCFOptions& ioptions,
+                     const EnvOptions& env_options,
                      const InternalKeyComparator& internal_comparator,
                      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                      unique_ptr<TableReader>* table,
                      const int bloom_bits_per_key, double hash_table_ratio,
-                     size_t index_sparseness, size_t huge_page_tlb_size);
+                     size_t index_sparseness, size_t huge_page_tlb_size,
+                     bool full_scan_mode);
 
-  Iterator* NewIterator(const ReadOptions&);
+  Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override;
 
-  Status Get(const ReadOptions&, const Slice& key, void* arg,
-             bool (*result_handler)(void* arg, const ParsedInternalKey& k,
-                                    const Slice& v, bool),
-             void (*mark_key_may_exist)(void*) = nullptr);
+  void Prepare(const Slice& target) override;
 
-  uint64_t ApproximateOffsetOf(const Slice& key);
+  Status Get(const ReadOptions&, const Slice& key,
+             GetContext* get_context) override;
 
-  void SetupForCompaction();
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
 
-  std::shared_ptr<const TableProperties> GetTableProperties() const {
+  uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override {
     return table_properties_;
   }
 
-  PlainTableReader(const Options& options, unique_ptr<RandomAccessFile>&& file,
-                   const EnvOptions& storage_options,
+  virtual size_t ApproximateMemoryUsage() const override {
+    return arena_.MemoryAllocatedBytes();
+  }
+
+  PlainTableReader(const ImmutableCFOptions& ioptions,
+                   unique_ptr<RandomAccessFile>&& file,
+                   const EnvOptions& env_options,
                    const InternalKeyComparator& internal_comparator,
-                   uint64_t file_size, int bloom_num_bits,
-                   double hash_table_ratio, size_t index_sparseness,
-                   const TableProperties* table_properties,
-                   size_t huge_page_tlb_size);
+                   EncodingType encoding_type, uint64_t file_size,
+                   const TableProperties* table_properties);
   virtual ~PlainTableReader();
 
  protected:
@@ -91,91 +102,41 @@ class PlainTableReader: public TableReader {
   // props: the table properties object that need to be stored. Ownership of
   //        the object will be passed.
   //
-  // index_ contains buckets size of index_size_, each is a
-  // 32-bit integer. The lower 31 bits contain an offset value (explained below)
-  // and the first bit of the integer indicates type of the offset.
-  //
-  // +--------------+------------------------------------------------------+
-  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
-  // +--------------+------------------------------------------------------+
-  //
-  // Explanation for the "flag bit":
-  //
-  // 0 indicates that the bucket contains only one prefix (no conflict when
-  //   hashing this prefix), whose first row starts from this offset of the
-  // file.
-  // 1 indicates that the bucket contains more than one prefixes, or there
-  //   are too many rows for one prefix so we need a binary search for it. In
-  //   this case, the offset indicates the offset of sub_index_ holding the
-  //   binary search indexes of keys for those rows. Those binary search indexes
-  //   are organized in this way:
-  //
-  // The first 4 bytes, indicate how many indexes (N) are stored after it. After
-  // it, there are N 32-bit integers, each points of an offset of the file,
-  // which
-  // points to starting of a row. Those offsets need to be guaranteed to be in
-  // ascending order so the keys they are pointing to are also in ascending
-  // order
-  // to make sure we can use them to do binary searches. Below is visual
-  // presentation of a bucket.
-  //
-  // <begin>
-  //   number_of_records:  varint32
-  //   record 1 file offset:  fixedint32
-  //   record 2 file offset:  fixedint32
-  //    ....
-  //   record N file offset:  fixedint32
-  // <end>
-  Status PopulateIndex(TableProperties* props);
 
- private:
-  struct IndexRecord;
-  class IndexRecordList;
-
-  // Plain table maintains an index and a sub index.
-  // index is implemented by a hash table.
-  // subindex is a big of memory array.
-  // For more details about the in-memory index, please refer to:
-  // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
-  // #wiki-in-memory-index-format
-  uint32_t* index_;
-  int index_size_ = 0;
-  char* sub_index_;
-
-  Options options_;
-  const EnvOptions& soptions_;
-  unique_ptr<RandomAccessFile> file_;
+  Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
+                       double hash_table_ratio, size_t index_sparseness,
+                       size_t huge_page_tlb_size);
 
+  Status MmapDataFile();
+
+ private:
   const InternalKeyComparator internal_comparator_;
+  EncodingType encoding_type_;
   // represents plain table's current status.
   Status status_;
-
   Slice file_data_;
-  uint32_t file_size_;
-
-  const double kHashTableRatio;
-  const int kBloomBitsPerKey;
-  // To speed up the search for keys with same prefix, we'll add index key for
-  // every N keys, where the "N" is determined by
-  // kIndexIntervalForSamePrefixKeys
-  const size_t kIndexIntervalForSamePrefixKeys = 16;
-  // Bloom filter is used to rule out non-existent key
-  unique_ptr<DynamicBloom> bloom_;
-  Arena arena_;
 
-  std::shared_ptr<const TableProperties> table_properties_;
+  PlainTableIndex index_;
+  bool full_scan_mode_;
+
   // data_start_offset_ and data_end_offset_ defines the range of the
   // sst file that stores data.
   const uint32_t data_start_offset_ = 0;
   const uint32_t data_end_offset_;
-  const size_t user_key_len_;
-  const size_t huge_page_tlb_size_;
+  const uint32_t user_key_len_;
+  const SliceTransform* prefix_extractor_;
 
   static const size_t kNumInternalBytes = 8;
-  static const uint32_t kSubIndexMask = 0x80000000;
-  static const size_t kOffsetLen = sizeof(uint32_t);
-  static const uint64_t kMaxFileSize = 1u << 31;
-  static const size_t kRecordsPerGroup = 256;
+
+  // Bloom filter is used to rule out non-existent key
+  bool enable_bloom_;
+  DynamicBloom bloom_;
+  Arena arena_;
+
+  const ImmutableCFOptions& ioptions_;
+  unique_ptr<RandomAccessFile> file_;
+  uint64_t file_size_;
+  std::shared_ptr<const TableProperties> table_properties_;
 
   bool IsFixedLength() const {
     return user_key_len_ != kPlainTableVariableLength;
@@ -185,6 +146,31 @@ class PlainTableReader: public TableReader {
     return user_key_len_ + kNumInternalBytes;
   }
 
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(GetUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetUserKey(const Slice& key) const {
+    return Slice(key.data(), key.size() - 8);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case,
+      // it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
   friend class TableCache;
   friend class PlainTableIterator;
 
@@ -192,38 +178,27 @@ class PlainTableReader: public TableReader {
   // the rows, which contains index records as a list.
   // If bloom_ is not null, all the keys' full-key hash will be added to the
   // bloom filter.
-  Status PopulateIndexRecordList(IndexRecordList* record_list,
-                                 int* num_prefixes) const;
-
-  // Internal helper function to allocate memory for indexes and bloom filters
-  void AllocateIndexAndBloom(int num_prefixes);
-
-  // Internal helper function to bucket index record list to hash buckets.
-  // bucket_header is a vector of size hash_table_size_, with each entry
-  // containing a linklist of IndexRecord hashed to the same bucket, in reverse
-  // order.
-  // of offsets for the hash, in reversed order.
-  // entries_per_bucket is sized of index_size_. The value is how many index
-  // records are there in bucket_headers for the same bucket.
-  size_t BucketizeIndexesAndFillBloom(
-      IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
-      std::vector<uint32_t>* entries_per_bucket);
-
-  // Internal helper class to fill the indexes and bloom filters to internal
-  // data structures. bucket_headers and entries_per_bucket are bucketized
-  // indexes and counts generated by BucketizeIndexesAndFillBloom().
-  void FillIndexes(const size_t kSubIndexSize,
-                   const std::vector<IndexRecord*>& bucket_headers,
-                   const std::vector<uint32_t>& entries_per_bucket);
-
-  // Read a plain table key from the position `start`. The read content
-  // will be written to `key` and the size of read bytes will be populated
-  // in `bytes_read`.
-  Status ReadKey(const char* row_ptr, ParsedInternalKey* key,
-                 size_t* bytes_read) const;
-  // Read the key and value at `offset` to parameters `key` and `value`.
+  Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+                                 vector<uint32_t>* prefix_hashes);
+
+  // Internal helper function to allocate memory for bloom filter and fill it
+  void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
+                            size_t huge_page_tlb_size,
+                            vector<uint32_t>* prefix_hashes);
+
+  void FillBloom(vector<uint32_t>* prefix_hashes);
+
+  // Read the key and value at `offset` to parameters for keys, the and
+  // `seekable`.
   // On success, `offset` will be updated as the offset for the next key.
-  Status Next(uint32_t* offset, ParsedInternalKey* key, Slice* value) const;
+  // `parsed_key` will be key in parsed format.
+  // if `internal_key` is not empty, it will be filled with key with slice
+  // format.
+  // if `seekable` is not null, it will return whether we can directly read
+  // data using this offset.
+  Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+              ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
+              bool* seekable = nullptr) const;
   // Get file offset for key target.
   // return value prefix_matched is set to true if the offset is confirmed
   // for a key with the same prefix as target.
@@ -231,31 +206,7 @@ class PlainTableReader: public TableReader {
                    uint32_t prefix_hash, bool& prefix_matched,
                    uint32_t* offset) const;
 
-  Slice GetUserKey(const Slice& key) const {
-    return Slice(key.data(), key.size() - 8);
-  }
-
-  Slice GetPrefix(const Slice& target) const {
-    assert(target.size() >= 8);  // target is internal key
-    return GetPrefixFromUserKey(GetUserKey(target));
-  }
-
-  inline Slice GetPrefix(const ParsedInternalKey& target) const;
-
-  Slice GetPrefixFromUserKey(const Slice& user_key) const {
-    if (!IsTotalOrderMode()) {
-      return options_.prefix_extractor->Transform(user_key);
-    } else {
-      // Use empty slice as prefix if prefix_extractor is not set. In that case,
-      // it falls back to pure binary search and total iterator seek is
-      // supported.
-      return Slice();
-    }
-  }
-
-  bool IsTotalOrderMode() const {
-    return (options_.prefix_extractor.get() == nullptr);
-  }
+  bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
 
   // No copying allowed
   explicit PlainTableReader(const TableReader&) = delete;
diff --git a/src/rocksdb/table/table_builder.h b/src/rocksdb/table/table_builder.h
index ee32cff..19da4c2 100644
--- a/src/rocksdb/table/table_builder.h
+++ b/src/rocksdb/table/table_builder.h
@@ -9,11 +9,42 @@
 
 #pragma once
 
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/table_properties_collector.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "util/mutable_cf_options.h"
+
 namespace rocksdb {
 
 class Slice;
 class Status;
 
+struct TableBuilderOptions {
+  TableBuilderOptions(
+      const ImmutableCFOptions& _ioptions,
+      const InternalKeyComparator& _internal_comparator,
+      const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+          _int_tbl_prop_collector_factories,
+      CompressionType _compression_type,
+      const CompressionOptions& _compression_opts, bool _skip_filters)
+      : ioptions(_ioptions),
+        internal_comparator(_internal_comparator),
+        int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories),
+        compression_type(_compression_type),
+        compression_opts(_compression_opts),
+        skip_filters(_skip_filters) {}
+  const ImmutableCFOptions& ioptions;
+  const InternalKeyComparator& internal_comparator;
+  const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+      int_tbl_prop_collector_factories;
+  CompressionType compression_type;
+  const CompressionOptions& compression_opts;
+  bool skip_filters = false;
+};
+
 // TableBuilder provides the interface used to build a Table
 // (an immutable and sorted map from keys to values).
 //
@@ -50,6 +81,9 @@ class TableBuilder {
   // Size of the file generated so far.  If invoked after a successful
   // Finish() call, returns the size of the final generated file.
   virtual uint64_t FileSize() const = 0;
+
+  // Returns table properties
+  virtual TableProperties GetTableProperties() const = 0;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/table_properties.cc b/src/rocksdb/table/table_properties.cc
index c7e1419..1ee34a6 100644
--- a/src/rocksdb/table/table_properties.cc
+++ b/src/rocksdb/table/table_properties.cc
@@ -3,9 +3,12 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#include "table/table_properties_internal.h"
 #include "rocksdb/table_properties.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/string_util.h"
 
 namespace rocksdb {
 
@@ -30,7 +33,7 @@ namespace {
       const std::string& prop_delim,
       const std::string& kv_delim) {
     AppendProperty(
-        props, key, std::to_string(value), prop_delim, kv_delim
+        props, key, ToString(value), prop_delim, kv_delim
     );
   }
 }
diff --git a/src/rocksdb/table/table_properties_internal.h b/src/rocksdb/table/table_properties_internal.h
new file mode 100644
index 0000000..9ef8ad4
--- /dev/null
+++ b/src/rocksdb/table/table_properties_internal.h
@@ -0,0 +1,18 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
+// Seek to the properties block.
+// If it successfully seeks to the properties block, "is_found" will be
+// set to true.
+Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/table_reader.h b/src/rocksdb/table/table_reader.h
index 02a2d16..2058b86 100644
--- a/src/rocksdb/table/table_reader.h
+++ b/src/rocksdb/table/table_reader.h
@@ -15,8 +15,10 @@ namespace rocksdb {
 class Iterator;
 struct ParsedInternalKey;
 class Slice;
+class Arena;
 struct ReadOptions;
 struct TableProperties;
+class GetContext;
 
 // A Table is a sorted map from strings to strings.  Tables are
 // immutable and persistent.  A Table may be safely accessed from
@@ -28,7 +30,11 @@ class TableReader {
   // Returns a new iterator over the table contents.
   // The result of NewIterator() is initially invalid (caller must
   // call one of the Seek methods on the iterator before using it).
-  virtual Iterator* NewIterator(const ReadOptions&) = 0;
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) = 0;
 
   // Given a key, return an approximate byte offset in the file where
   // the data for that key begins (or would begin if the key were
@@ -44,23 +50,40 @@ class TableReader {
 
   virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
 
-  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
-  // the entry found after a call to Seek(key), until result_handler returns
-  // false, where k is the actual internal key for a row found and v as the
-  // value of the key. didIO is true if I/O is involved in the operation. May
-  // not make such a call if filter policy says that key is not present.
+  // Prepare work that can be done before the real Get()
+  virtual void Prepare(const Slice& target) {}
+
+  // Report an approximation of how much memory has been used.
+  virtual size_t ApproximateMemoryUsage() const = 0;
+
+  // Calls get_context->SaveValue() repeatedly, starting with
+  // the entry found after a call to Seek(key), until it returns false.
+  // May not make such a call if filter policy says that key is not present.
   //
-  // mark_key_may_exist_handler needs to be called when it is configured to be
-  // memory only and the key is not found in the block cache, with
-  // the parameter to be handle_context.
+  // get_context->MarkKeyMayExist needs to be called when it is configured to be
+  // memory only and the key is not found in the block cache.
   //
   // readOptions is the options for the read
   // key is the key to search for
-  virtual Status Get(
-      const ReadOptions& readOptions, const Slice& key, void* handle_context,
-      bool (*result_handler)(void* arg, const ParsedInternalKey& k,
-                             const Slice& v, bool didIO),
-      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
+  virtual Status Get(const ReadOptions& readOptions, const Slice& key,
+                     GetContext* get_context) = 0;
+
+  // Prefetch data corresponding to a give range of keys
+  // Typically this functionality is required for table implementations that
+  // persists the data on a non volatile storage medium like disk/SSD
+  virtual Status Prefetch(const Slice* begin = nullptr,
+                          const Slice* end = nullptr) {
+    (void) begin;
+    (void) end;
+    // Default implementation is NOOP.
+    // The child class should implement functionality when applicable
+    return Status::OK();
+  }
+
+  // convert db file to a human readable form
+  virtual Status DumpTable(WritableFile* out_file) {
+    return Status::NotSupported("DumpTable() not supported");
+  }
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/table_reader_bench.cc b/src/rocksdb/table/table_reader_bench.cc
index a0ff0d7..b4039aa 100644
--- a/src/rocksdb/table/table_reader_bench.cc
+++ b/src/rocksdb/table/table_reader_bench.cc
@@ -3,6 +3,14 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
 #include <gflags/gflags.h>
 
 #include "rocksdb/db.h"
@@ -10,14 +18,17 @@
 #include "rocksdb/table.h"
 #include "db/db_impl.h"
 #include "db/dbformat.h"
-#include "port/atomic_pointer.h"
 #include "table/block_based_table_factory.h"
 #include "table/plain_table_factory.h"
 #include "table/table_builder.h"
+#include "table/get_context.h"
 #include "util/histogram.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::SetUsageMessage;
+
 namespace rocksdb {
 
 namespace {
@@ -37,11 +48,6 @@ static std::string MakeKey(int i, int j, bool through_db) {
   return key.Encode().ToString();
 }
 
-static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey,
-                           const Slice& v, bool didIO) {
-  return false;
-}
-
 uint64_t Now(Env* env, bool measured_by_nanosecond) {
   return measured_by_nanosecond ? env->NowNanos() : env->NowMicros();
 }
@@ -77,10 +83,18 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   TableBuilder* tb = nullptr;
   DB* db = nullptr;
   Status s;
+  const ImmutableCFOptions ioptions(opts);
   if (!through_db) {
     env->NewWritableFile(file_name, &file, env_options);
-    tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(),
-                                             CompressionType::kNoCompression);
+
+    std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
+        int_tbl_prop_collector_factories;
+
+    tb = opts.table_factory->NewTableBuilder(
+        TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
+                            CompressionType::kNoCompression,
+                            CompressionOptions(), false),
+        file.get());
   } else {
     s = DB::Open(opts, dbname, &db);
     ASSERT_OK(s);
@@ -107,18 +121,17 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   unique_ptr<TableReader> table_reader;
   unique_ptr<RandomAccessFile> raf;
   if (!through_db) {
-    Status s = env->NewRandomAccessFile(file_name, &raf, env_options);
+    s = env->NewRandomAccessFile(file_name, &raf, env_options);
     uint64_t file_size;
     env->GetFileSize(file_name, &file_size);
     s = opts.table_factory->NewTableReader(
-        opts, env_options, ikc, std::move(raf), file_size, &table_reader);
+        ioptions, env_options, ikc, std::move(raf), file_size, &table_reader);
   }
 
   Random rnd(301);
   std::string result;
   HistogramImpl hist;
 
-  void* arg = nullptr;
   for (int it = 0; it < num_iter; it++) {
     for (int i = 0; i < num_keys1; i++) {
       for (int j = 0; j < num_keys2; j++) {
@@ -133,14 +146,17 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
           // Query one existing key;
           std::string key = MakeKey(r1, r2, through_db);
           uint64_t start_time = Now(env, measured_by_nanosecond);
-          port::MemoryBarrier();
           if (!through_db) {
-            s = table_reader->Get(read_options, key, arg, DummySaveValue,
-                                  nullptr);
+            std::string value;
+            MergeContext merge_context;
+            GetContext get_context(ioptions.comparator, ioptions.merge_operator,
+                                   ioptions.info_log, ioptions.statistics,
+                                   GetContext::kNotFound, Slice(key), &value,
+                                   nullptr, &merge_context, env);
+            s = table_reader->Get(read_options, key, &get_context);
           } else {
             s = db->Get(read_options, key, &result);
           }
-          port::MemoryBarrier();
           hist.Add(Now(env, measured_by_nanosecond) - start_time);
         } else {
           int r2_len;
@@ -156,7 +172,6 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
           std::string end_key = MakeKey(r1, r2 + r2_len, through_db);
           uint64_t total_time = 0;
           uint64_t start_time = Now(env, measured_by_nanosecond);
-          port::MemoryBarrier();
           Iterator* iter;
           if (!through_db) {
             iter = table_reader->NewIterator(read_options);
@@ -169,7 +184,6 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
               break;
             }
             // verify key;
-            port::MemoryBarrier();
             total_time += Now(env, measured_by_nanosecond) - start_time;
             assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key());
             start_time = Now(env, measured_by_nanosecond);
@@ -184,7 +198,6 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
             assert(false);
           }
           delete iter;
-          port::MemoryBarrier();
           total_time += Now(env, measured_by_nanosecond) - start_time;
           hist.Add(total_time);
         }
@@ -226,17 +239,19 @@ DEFINE_bool(iterator, false, "For test iterator");
 DEFINE_bool(through_db, false, "If enable, a DB instance will be created and "
             "the query will be against DB. Otherwise, will be directly against "
             "a table reader.");
-DEFINE_bool(plain_table, false, "Use PlainTable");
+DEFINE_string(table_factory, "block_based",
+              "Table factory to use: `block_based` (default), `plain_table` or "
+              "`cuckoo_hash`.");
 DEFINE_string(time_unit, "microsecond",
               "The time unit used for measuring performance. User can specify "
               "`microsecond` (default) or `nanosecond`");
 
 int main(int argc, char** argv) {
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
-                          " [OPTIONS]...");
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
 
-  rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory();
+  std::shared_ptr<rocksdb::TableFactory> tf;
   rocksdb::Options options;
   if (FLAGS_prefix_len < 16) {
     options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(
@@ -247,25 +262,54 @@ int main(int argc, char** argv) {
   options.create_if_missing = true;
   options.compression = rocksdb::CompressionType::kNoCompression;
 
-  if (FLAGS_plain_table) {
+  if (FLAGS_table_factory == "cuckoo_hash") {
+#ifndef ROCKSDB_LITE
+    options.allow_mmap_reads = true;
+    env_options.use_mmap_reads = true;
+    rocksdb::CuckooTableOptions table_options;
+    table_options.hash_table_ratio = 0.75;
+    tf.reset(rocksdb::NewCuckooTableFactory(table_options));
+#else
+    fprintf(stderr, "Plain table is not supported in lite mode\n");
+    exit(1);
+#endif  // ROCKSDB_LITE
+  } else if (FLAGS_table_factory == "plain_table") {
+#ifndef ROCKSDB_LITE
     options.allow_mmap_reads = true;
     env_options.use_mmap_reads = true;
-    tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8,
-                                        0.75);
+
+    rocksdb::PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 16;
+    plain_table_options.bloom_bits_per_key = (FLAGS_prefix_len == 16) ? 0 : 8;
+    plain_table_options.hash_table_ratio = 0.75;
+
+    tf.reset(new rocksdb::PlainTableFactory(plain_table_options));
     options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(
         FLAGS_prefix_len));
+#else
+    fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
+    exit(1);
+#endif  // ROCKSDB_LITE
+  } else if (FLAGS_table_factory == "block_based") {
+    tf.reset(new rocksdb::BlockBasedTableFactory());
+  } else {
+    fprintf(stderr, "Invalid table type %s\n", FLAGS_table_factory.c_str());
+  }
+
+  if (tf) {
+    // if user provides invalid options, just fall back to microsecond.
+    bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond";
+
+    options.table_factory = tf;
+    rocksdb::TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
+                                  FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len,
+                                  FLAGS_query_empty, FLAGS_iterator,
+                                  FLAGS_through_db, measured_by_nanosecond);
   } else {
-    tf = new rocksdb::BlockBasedTableFactory();
+    return 1;
   }
-  // if user provides invalid options, just fall back to microsecond.
-  bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond";
 
-  options.table_factory =
-      std::shared_ptr<rocksdb::TableFactory>(tf);
-  rocksdb::TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
-                                FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len,
-                                FLAGS_query_empty, FLAGS_iterator,
-                                FLAGS_through_db, measured_by_nanosecond);
-  delete tf;
   return 0;
 }
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/table/table_test.cc b/src/rocksdb/table/table_test.cc
index dd81bae..6f7b4db 100644
--- a/src/rocksdb/table/table_test.cc
+++ b/src/rocksdb/table/table_test.cc
@@ -11,6 +11,7 @@
 #include <stdio.h>
 
 #include <algorithm>
+#include <iostream>
 #include <map>
 #include <string>
 #include <memory>
@@ -19,6 +20,7 @@
 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
+#include "db/writebuffer.h"
 
 #include "rocksdb/cache.h"
 #include "rocksdb/db.h"
@@ -36,11 +38,18 @@
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/plain_table_factory.h"
+#include "table/get_context.h"
 
+#include "util/compression.h"
 #include "util/random.h"
 #include "util/statistics.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
+#include "util/scoped_arena_iterator.h"
+
+using std::vector;
+using std::string;
 
 namespace rocksdb {
 
@@ -61,24 +70,23 @@ std::string Reverse(const Slice& key) {
 
 class ReverseKeyComparator : public Comparator {
  public:
-  virtual const char* Name() const {
+  virtual const char* Name() const override {
     return "rocksdb.ReverseBytewiseComparator";
   }
 
-  virtual int Compare(const Slice& a, const Slice& b) const {
+  virtual int Compare(const Slice& a, const Slice& b) const override {
     return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
   }
 
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const {
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {
     std::string s = Reverse(*start);
     std::string l = Reverse(limit);
     BytewiseComparator()->FindShortestSeparator(&s, l);
     *start = Reverse(s);
   }
 
-  virtual void FindShortSuccessor(std::string* key) const {
+  virtual void FindShortSuccessor(std::string* key) const override {
     std::string s = Reverse(*key);
     BytewiseComparator()->FindShortSuccessor(&s);
     *key = Reverse(s);
@@ -117,11 +125,11 @@ class StringSink: public WritableFile {
 
   const std::string& contents() const { return contents_; }
 
-  virtual Status Close() { return Status::OK(); }
-  virtual Status Flush() { return Status::OK(); }
-  virtual Status Sync() { return Status::OK(); }
+  virtual Status Close() override { return Status::OK(); }
+  virtual Status Flush() override { return Status::OK(); }
+  virtual Status Sync() override { return Status::OK(); }
 
-  virtual Status Append(const Slice& data) {
+  virtual Status Append(const Slice& data) override {
     contents_.append(data.data(), data.size());
     return Status::OK();
   }
@@ -143,7 +151,7 @@ class StringSource: public RandomAccessFile {
   uint64_t Size() const { return contents_.size(); }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                       char* scratch) const {
+                      char* scratch) const override {
     if (offset > contents_.size()) {
       return Status::InvalidArgument("invalid Read offset");
     }
@@ -159,7 +167,7 @@ class StringSource: public RandomAccessFile {
     return Status::OK();
   }
 
-  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
     if (max_size < 20) {
       return 0;
     }
@@ -193,6 +201,8 @@ class Constructor {
   // been added so far.  Returns the keys in sorted order in "*keys"
   // and stores the key/value pairs in "*kvmap"
   void Finish(const Options& options,
+              const ImmutableCFOptions& ioptions,
+              const BlockBasedTableOptions& table_options,
               const InternalKeyComparator& internal_comparator,
               std::vector<std::string>* keys, KVMap* kvmap) {
     last_internal_key_ = &internal_comparator;
@@ -204,12 +214,15 @@ class Constructor {
       keys->push_back(it->first);
     }
     data_.clear();
-    Status s = FinishImpl(options, internal_comparator, *kvmap);
+    Status s = FinishImpl(options, ioptions, table_options,
+                          internal_comparator, *kvmap);
     ASSERT_TRUE(s.ok()) << s.ToString();
   }
 
   // Construct the data structure from the data in "data"
   virtual Status FinishImpl(const Options& options,
+                            const ImmutableCFOptions& ioptions,
+                            const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
                             const KVMap& data) = 0;
 
@@ -217,8 +230,12 @@ class Constructor {
 
   virtual const KVMap& data() { return data_; }
 
+  virtual bool IsArenaMode() const { return false; }
+
   virtual DB* db() const { return nullptr; }  // Overridden in DBConstructor
 
+  virtual bool AnywayDeleteIterator() const { return false; }
+
  protected:
   const InternalKeyComparator* last_internal_key_;
 
@@ -236,27 +253,26 @@ class BlockConstructor: public Constructor {
     delete block_;
   }
   virtual Status FinishImpl(const Options& options,
+                            const ImmutableCFOptions& ioptions,
+                            const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& data) {
+                            const KVMap& kv_map) override {
     delete block_;
     block_ = nullptr;
-    BlockBuilder builder(options, &internal_comparator);
+    BlockBuilder builder(table_options.block_restart_interval);
 
-    for (KVMap::const_iterator it = data.begin();
-         it != data.end();
-         ++it) {
-      builder.Add(it->first, it->second);
+    for (const auto kv : kv_map) {
+      builder.Add(kv.first, kv.second);
     }
     // Open the block
     data_ = builder.Finish().ToString();
     BlockContents contents;
     contents.data = data_;
     contents.cachable = false;
-    contents.heap_allocated = false;
-    block_ = new Block(contents);
+    block_ = new Block(std::move(contents));
     return Status::OK();
   }
-  virtual Iterator* NewIterator() const {
+  virtual Iterator* NewIterator() const override {
     return block_->NewIterator(comparator_);
   }
 
@@ -271,38 +287,46 @@ class BlockConstructor: public Constructor {
 // A helper class that converts internal format keys into user keys
 class KeyConvertingIterator: public Iterator {
  public:
-  explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
-  virtual ~KeyConvertingIterator() { delete iter_; }
-  virtual bool Valid() const { return iter_->Valid(); }
-  virtual void Seek(const Slice& target) {
+  KeyConvertingIterator(Iterator* iter, bool arena_mode = false)
+      : iter_(iter), arena_mode_(arena_mode) {}
+  virtual ~KeyConvertingIterator() {
+    if (arena_mode_) {
+      iter_->~Iterator();
+    } else {
+      delete iter_;
+    }
+  }
+  virtual bool Valid() const override { return iter_->Valid(); }
+  virtual void Seek(const Slice& target) override {
     ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
     std::string encoded;
     AppendInternalKey(&encoded, ikey);
     iter_->Seek(encoded);
   }
-  virtual void SeekToFirst() { iter_->SeekToFirst(); }
-  virtual void SeekToLast() { iter_->SeekToLast(); }
-  virtual void Next() { iter_->Next(); }
-  virtual void Prev() { iter_->Prev(); }
+  virtual void SeekToFirst() override { iter_->SeekToFirst(); }
+  virtual void SeekToLast() override { iter_->SeekToLast(); }
+  virtual void Next() override { iter_->Next(); }
+  virtual void Prev() override { iter_->Prev(); }
 
-  virtual Slice key() const {
+  virtual Slice key() const override {
     assert(Valid());
-    ParsedInternalKey key;
-    if (!ParseInternalKey(iter_->key(), &key)) {
+    ParsedInternalKey parsed_key;
+    if (!ParseInternalKey(iter_->key(), &parsed_key)) {
       status_ = Status::Corruption("malformed internal key");
       return Slice("corrupted key");
     }
-    return key.user_key;
+    return parsed_key.user_key;
   }
 
-  virtual Slice value() const { return iter_->value(); }
-  virtual Status status() const {
+  virtual Slice value() const override { return iter_->value(); }
+  virtual Status status() const override {
     return status_.ok() ? iter_->status() : status_;
   }
 
  private:
   mutable Status status_;
   Iterator* iter_;
+  bool arena_mode_;
 
   // No copying allowed
   KeyConvertingIterator(const KeyConvertingIterator&);
@@ -318,42 +342,47 @@ class TableConstructor: public Constructor {
   ~TableConstructor() { Reset(); }
 
   virtual Status FinishImpl(const Options& options,
+                            const ImmutableCFOptions& ioptions,
+                            const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& data) {
+                            const KVMap& kv_map) override {
     Reset();
     sink_.reset(new StringSink());
     unique_ptr<TableBuilder> builder;
-    builder.reset(options.table_factory->NewTableBuilder(
-        options, internal_comparator, sink_.get(), options.compression));
-
-    for (KVMap::const_iterator it = data.begin();
-         it != data.end();
-         ++it) {
+    std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+        int_tbl_prop_collector_factories;
+    builder.reset(ioptions.table_factory->NewTableBuilder(
+        TableBuilderOptions(ioptions, internal_comparator,
+                            &int_tbl_prop_collector_factories,
+                            options.compression, CompressionOptions(), false),
+        sink_.get()));
+
+    for (const auto kv : kv_map) {
       if (convert_to_internal_key_) {
-        ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue);
+        ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
         std::string encoded;
         AppendInternalKey(&encoded, ikey);
-        builder->Add(encoded, it->second);
+        builder->Add(encoded, kv.second);
       } else {
-        builder->Add(it->first, it->second);
+        builder->Add(kv.first, kv.second);
       }
-      ASSERT_TRUE(builder->status().ok());
+      EXPECT_TRUE(builder->status().ok());
     }
     Status s = builder->Finish();
-    ASSERT_TRUE(s.ok()) << s.ToString();
+    EXPECT_TRUE(s.ok()) << s.ToString();
 
-    ASSERT_EQ(sink_->contents().size(), builder->FileSize());
+    EXPECT_EQ(sink_->contents().size(), builder->FileSize());
 
     // Open the table
     uniq_id_ = cur_uniq_id_++;
     source_.reset(new StringSource(sink_->contents(), uniq_id_,
-                                   options.allow_mmap_reads));
-    return options.table_factory->NewTableReader(
-        options, soptions, internal_comparator, std::move(source_),
+                                   ioptions.allow_mmap_reads));
+    return ioptions.table_factory->NewTableReader(
+        ioptions, soptions, internal_comparator, std::move(source_),
         sink_->contents().size(), &table_reader_);
   }
 
-  virtual Iterator* NewIterator() const {
+  virtual Iterator* NewIterator() const override {
     ReadOptions ro;
     Iterator* iter = table_reader_->NewIterator(ro);
     if (convert_to_internal_key_) {
@@ -367,19 +396,23 @@ class TableConstructor: public Constructor {
     return table_reader_->ApproximateOffsetOf(key);
   }
 
-  virtual Status Reopen(const Options& options) {
+  virtual Status Reopen(const ImmutableCFOptions& ioptions) {
     source_.reset(
         new StringSource(sink_->contents(), uniq_id_,
-                         options.allow_mmap_reads));
-    return options.table_factory->NewTableReader(
-        options, soptions, *last_internal_key_, std::move(source_),
+                         ioptions.allow_mmap_reads));
+    return ioptions.table_factory->NewTableReader(
+        ioptions, soptions, *last_internal_key_, std::move(source_),
         sink_->contents().size(), &table_reader_);
   }
 
-  virtual TableReader* table_reader() {
+  virtual TableReader* GetTableReader() {
     return table_reader_.get();
   }
 
+  virtual bool AnywayDeleteIterator() const override {
+    return convert_to_internal_key_;
+  }
+
  private:
   void Reset() {
     uniq_id_ = 0;
@@ -387,12 +420,12 @@ class TableConstructor: public Constructor {
     sink_.reset();
     source_.reset();
   }
-  bool convert_to_internal_key_;
 
   uint64_t uniq_id_;
   unique_ptr<StringSink> sink_;
   unique_ptr<StringSource> source_;
   unique_ptr<TableReader> table_reader_;
+  bool convert_to_internal_key_;
 
   TableConstructor();
 
@@ -403,41 +436,51 @@ uint64_t TableConstructor::cur_uniq_id_ = 1;
 
 class MemTableConstructor: public Constructor {
  public:
-  explicit MemTableConstructor(const Comparator* cmp)
+  explicit MemTableConstructor(const Comparator* cmp, WriteBuffer* wb)
       : Constructor(cmp),
         internal_comparator_(cmp),
+        write_buffer_(wb),
         table_factory_(new SkipListFactory) {
-    Options options;
-    options.memtable_factory = table_factory_;
-    memtable_ = new MemTable(internal_comparator_, options);
+    options_.memtable_factory = table_factory_;
+    ImmutableCFOptions ioptions(options_);
+    memtable_ = new MemTable(internal_comparator_, ioptions,
+                             MutableCFOptions(options_, ioptions), wb);
     memtable_->Ref();
   }
   ~MemTableConstructor() {
     delete memtable_->Unref();
   }
-  virtual Status FinishImpl(const Options& options,
+  virtual Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions,
+                            const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& data) {
+                            const KVMap& kv_map) override {
     delete memtable_->Unref();
-    Options memtable_options;
-    memtable_options.memtable_factory = table_factory_;
-    memtable_ = new MemTable(internal_comparator_, memtable_options);
+    ImmutableCFOptions mem_ioptions(ioptions);
+    memtable_ = new MemTable(internal_comparator_, mem_ioptions,
+                             MutableCFOptions(options_, mem_ioptions),
+                             write_buffer_);
     memtable_->Ref();
     int seq = 1;
-    for (KVMap::const_iterator it = data.begin();
-         it != data.end();
-         ++it) {
-      memtable_->Add(seq, kTypeValue, it->first, it->second);
+    for (const auto kv : kv_map) {
+      memtable_->Add(seq, kTypeValue, kv.first, kv.second);
       seq++;
     }
     return Status::OK();
   }
-  virtual Iterator* NewIterator() const {
-    return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions()));
+  virtual Iterator* NewIterator() const override {
+    return new KeyConvertingIterator(
+        memtable_->NewIterator(ReadOptions(), &arena_), true);
   }
 
+  virtual bool AnywayDeleteIterator() const override { return true; }
+
+  virtual bool IsArenaMode() const override { return true; }
+
  private:
+  mutable Arena arena_;
   InternalKeyComparator internal_comparator_;
+  Options options_;
+  WriteBuffer* write_buffer_;
   MemTable* memtable_;
   std::shared_ptr<SkipListFactory> table_factory_;
 };
@@ -454,25 +497,25 @@ class DBConstructor: public Constructor {
     delete db_;
   }
   virtual Status FinishImpl(const Options& options,
+                            const ImmutableCFOptions& ioptions,
+                            const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& data) {
+                            const KVMap& kv_map) override {
     delete db_;
     db_ = nullptr;
     NewDB();
-    for (KVMap::const_iterator it = data.begin();
-         it != data.end();
-         ++it) {
+    for (const auto kv : kv_map) {
       WriteBatch batch;
-      batch.Put(it->first, it->second);
-      ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
+      batch.Put(kv.first, kv.second);
+      EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok());
     }
     return Status::OK();
   }
-  virtual Iterator* NewIterator() const {
+  virtual Iterator* NewIterator() const override {
     return db_->NewIterator(ReadOptions());
   }
 
-  virtual DB* db() const { return db_; }
+  virtual DB* db() const override { return db_; }
 
  private:
   void NewDB() {
@@ -494,64 +537,6 @@ class DBConstructor: public Constructor {
   DB* db_;
 };
 
-static bool SnappyCompressionSupported() {
-#ifdef SNAPPY
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::Snappy_Compress(Options().compression_opts,
-                               in.data(), in.size(),
-                               &out);
-#else
-  return false;
-#endif
-}
-
-static bool ZlibCompressionSupported() {
-#ifdef ZLIB
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::Zlib_Compress(Options().compression_opts,
-                             in.data(), in.size(),
-                             &out);
-#else
-  return false;
-#endif
-}
-
-static bool BZip2CompressionSupported() {
-#ifdef BZIP2
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::BZip2_Compress(Options().compression_opts,
-                              in.data(), in.size(),
-                              &out);
-#else
-  return false;
-#endif
-}
-
-static bool LZ4CompressionSupported() {
-#ifdef LZ4
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::LZ4_Compress(Options().compression_opts, in.data(), in.size(),
-                            &out);
-#else
-  return false;
-#endif
-}
-
-static bool LZ4HCCompressionSupported() {
-#ifdef LZ4
-  std::string out;
-  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
-  return port::LZ4HC_Compress(Options().compression_opts, in.data(), in.size(),
-                              &out);
-#else
-  return false;
-#endif
-}
-
 enum TestType {
   BLOCK_BASED_TABLE_TEST,
   PLAIN_TABLE_SEMI_FIXED_PREFIX,
@@ -567,6 +552,7 @@ struct TestArgs {
   bool reverse_compare;
   int restart_interval;
   CompressionType compression;
+  uint32_t format_version;
 };
 
 static std::vector<TestArgs> GenerateArgList() {
@@ -580,22 +566,24 @@ static std::vector<TestArgs> GenerateArgList() {
   std::vector<int> restart_intervals = {16, 1, 1024};
 
   // Only add compression if it is supported
-  std::vector<CompressionType> compression_types;
-  compression_types.push_back(kNoCompression);
-  if (SnappyCompressionSupported()) {
-    compression_types.push_back(kSnappyCompression);
-  }
-  if (ZlibCompressionSupported()) {
-    compression_types.push_back(kZlibCompression);
+  std::vector<std::pair<CompressionType, bool>> compression_types;
+  compression_types.emplace_back(kNoCompression, false);
+  if (Snappy_Supported()) {
+    compression_types.emplace_back(kSnappyCompression, false);
   }
-  if (BZip2CompressionSupported()) {
-    compression_types.push_back(kBZip2Compression);
+  if (Zlib_Supported()) {
+    compression_types.emplace_back(kZlibCompression, false);
+    compression_types.emplace_back(kZlibCompression, true);
   }
-  if (LZ4CompressionSupported()) {
-    compression_types.push_back(kLZ4Compression);
+  if (BZip2_Supported()) {
+    compression_types.emplace_back(kBZip2Compression, false);
+    compression_types.emplace_back(kBZip2Compression, true);
   }
-  if (LZ4HCCompressionSupported()) {
-    compression_types.push_back(kLZ4HCCompression);
+  if (LZ4_Supported()) {
+    compression_types.emplace_back(kLZ4Compression, false);
+    compression_types.emplace_back(kLZ4Compression, true);
+    compression_types.emplace_back(kLZ4HCCompression, false);
+    compression_types.emplace_back(kLZ4HCCompression, true);
   }
 
   for (auto test_type : test_types) {
@@ -607,7 +595,7 @@ static std::vector<TestArgs> GenerateArgList() {
         one_arg.type = test_type;
         one_arg.reverse_compare = reverse_compare;
         one_arg.restart_interval = restart_intervals[0];
-        one_arg.compression = compression_types[0];
+        one_arg.compression = compression_types[0].first;
         test_args.push_back(one_arg);
         continue;
       }
@@ -618,7 +606,8 @@ static std::vector<TestArgs> GenerateArgList() {
           one_arg.type = test_type;
           one_arg.reverse_compare = reverse_compare;
           one_arg.restart_interval = restart_interval;
-          one_arg.compression = compression_type;
+          one_arg.compression = compression_type.first;
+          one_arg.format_version = compression_type.second ? 2 : 1;
           test_args.push_back(one_arg);
         }
       }
@@ -640,11 +629,9 @@ class FixedOrLessPrefixTransform : public SliceTransform {
       prefix_len_(prefix_len) {
   }
 
-  virtual const char* Name() const {
-    return "rocksdb.FixedPrefix";
-  }
+  virtual const char* Name() const override { return "rocksdb.FixedPrefix"; }
 
-  virtual Slice Transform(const Slice& src) const {
+  virtual Slice Transform(const Slice& src) const override {
     assert(InDomain(src));
     if (src.size() < prefix_len_) {
       return src;
@@ -652,29 +639,27 @@ class FixedOrLessPrefixTransform : public SliceTransform {
     return Slice(src.data(), prefix_len_);
   }
 
-  virtual bool InDomain(const Slice& src) const {
-    return true;
-  }
+  virtual bool InDomain(const Slice& src) const override { return true; }
 
-  virtual bool InRange(const Slice& dst) const {
+  virtual bool InRange(const Slice& dst) const override {
     return (dst.size() <= prefix_len_);
   }
 };
 
-class Harness {
+class HarnessTest : public testing::Test {
  public:
-  Harness() : constructor_(nullptr) { }
+  HarnessTest()
+      : ioptions_(options_),
+        constructor_(nullptr),
+        write_buffer_(options_.db_write_buffer_size) {}
 
   void Init(const TestArgs& args) {
     delete constructor_;
     constructor_ = nullptr;
     options_ = Options();
-
-    options_.block_restart_interval = args.restart_interval;
     options_.compression = args.compression;
     // Use shorter block size for tests to exercise block boundary
     // conditions more.
-    options_.block_size = 256;
     if (args.reverse_compare) {
       options_.comparator = &reverse_key_comparator;
     }
@@ -684,12 +669,15 @@ class Harness {
 
     support_prev_ = true;
     only_support_prefix_seek_ = false;
-    BlockBasedTableOptions table_options;
     switch (args.type) {
       case BLOCK_BASED_TABLE_TEST:
-        table_options.flush_block_policy_factory.reset(
+        table_options_.flush_block_policy_factory.reset(
             new FlushBlockBySizePolicyFactory());
-        options_.table_factory.reset(new BlockBasedTableFactory(table_options));
+        table_options_.block_size = 256;
+        table_options_.block_restart_interval = args.restart_interval;
+        table_options_.format_version = args.format_version;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
         constructor_ = new TableConstructor(options_.comparator);
         break;
       case PLAIN_TABLE_SEMI_FIXED_PREFIX:
@@ -717,26 +705,44 @@ class Harness {
         only_support_prefix_seek_ = false;
         options_.prefix_extractor = nullptr;
         options_.allow_mmap_reads = true;
-        options_.table_factory.reset(NewTotalOrderPlainTableFactory());
+
+        {
+          PlainTableOptions plain_table_options;
+          plain_table_options.user_key_len = kPlainTableVariableLength;
+          plain_table_options.bloom_bits_per_key = 0;
+          plain_table_options.hash_table_ratio = 0;
+
+          options_.table_factory.reset(
+              NewPlainTableFactory(plain_table_options));
+        }
         constructor_ = new TableConstructor(options_.comparator, true);
         internal_comparator_.reset(
             new InternalKeyComparator(options_.comparator));
         break;
       case BLOCK_TEST:
+        table_options_.block_size = 256;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
         constructor_ = new BlockConstructor(options_.comparator);
         break;
       case MEMTABLE_TEST:
-        constructor_ = new MemTableConstructor(options_.comparator);
+        table_options_.block_size = 256;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
+        constructor_ = new MemTableConstructor(options_.comparator,
+                                               &write_buffer_);
         break;
       case DB_TEST:
+        table_options_.block_size = 256;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
         constructor_ = new DBConstructor(options_.comparator);
         break;
     }
+    ioptions_ = ImmutableCFOptions(options_);
   }
 
-  ~Harness() {
-    delete constructor_;
-  }
+  ~HarnessTest() { delete constructor_; }
 
   void Add(const std::string& key, const std::string& value) {
     constructor_->Add(key, value);
@@ -745,7 +751,8 @@ class Harness {
   void Test(Random* rnd) {
     std::vector<std::string> keys;
     KVMap data;
-    constructor_->Finish(options_, *internal_comparator_, &keys, &data);
+    constructor_->Finish(options_, ioptions_, table_options_,
+                         *internal_comparator_, &keys, &data);
 
     TestForwardScan(keys, data);
     if (support_prev_) {
@@ -766,7 +773,11 @@ class Harness {
       iter->Next();
     }
     ASSERT_TRUE(!iter->Valid());
-    delete iter;
+    if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+      iter->~Iterator();
+    } else {
+      delete iter;
+    }
   }
 
   void TestBackwardScan(const std::vector<std::string>& keys,
@@ -781,7 +792,11 @@ class Harness {
       iter->Prev();
     }
     ASSERT_TRUE(!iter->Valid());
-    delete iter;
+    if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+      iter->~Iterator();
+    } else {
+      delete iter;
+    }
   }
 
   void TestRandomAccess(Random* rnd,
@@ -851,7 +866,11 @@ class Harness {
         }
       }
     }
-    delete iter;
+    if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+      iter->~Iterator();
+    } else {
+      delete iter;
+    }
   }
 
   std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
@@ -883,7 +902,7 @@ class Harness {
     if (keys.empty()) {
       return "foo";
     } else {
-      const int index = rnd->Uniform(keys.size());
+      const int index = rnd->Uniform(static_cast<int>(keys.size()));
       std::string result = keys[index];
       switch (rnd->Uniform(support_prev_ ? 3 : 1)) {
         case 0:
@@ -914,7 +933,10 @@ class Harness {
 
  private:
   Options options_ = Options();
+  ImmutableCFOptions ioptions_;
+  BlockBasedTableOptions table_options_ = BlockBasedTableOptions();
   Constructor* constructor_;
+  WriteBuffer write_buffer_;
   bool support_prev_;
   bool only_support_prefix_seek_;
   shared_ptr<InternalKeyComparator> internal_comparator_;
@@ -932,7 +954,7 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) {
 }
 
 // Tests against all kinds of tables
-class TableTest {
+class TableTest : public testing::Test {
  public:
   const InternalKeyComparator& GetPlainInternalComparator(
       const Comparator* comp) {
@@ -950,11 +972,11 @@ class TableTest {
 class GeneralTableTest : public TableTest {};
 class BlockBasedTableTest : public TableTest {};
 class PlainTableTest : public TableTest {};
-class TablePropertyTest {};
+class TablePropertyTest : public testing::Test {};
 
 // This test serves as the living tutorial for the prefix scan of user collected
 // properties.
-TEST(TablePropertyTest, PrefixScanTest) {
+TEST_F(TablePropertyTest, PrefixScanTest) {
   UserCollectedProperties props{{"num.111.1", "1"},
                                 {"num.111.2", "2"},
                                 {"num.111.3", "3"},
@@ -973,9 +995,9 @@ TEST(TablePropertyTest, PrefixScanTest) {
              pos->first.compare(0, prefix.size(), prefix) == 0;
          ++pos) {
       ++num;
-      auto key = prefix + "." + std::to_string(num);
+      auto key = prefix + "." + ToString(num);
       ASSERT_EQ(key, pos->first);
-      ASSERT_EQ(std::to_string(num), pos->second);
+      ASSERT_EQ(ToString(num), pos->second);
     }
     ASSERT_EQ(3, num);
   }
@@ -991,7 +1013,7 @@ TEST(TablePropertyTest, PrefixScanTest) {
 
 // This test include all the basic checks except those for index size and block
 // size, which will be conducted in separated unit tests.
-TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) {
+TEST_F(BlockBasedTableTest, BasicBlockBasedTableProperties) {
   TableConstructor c(BytewiseComparator());
 
   c.Add("a1", "val1");
@@ -1008,12 +1030,15 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) {
   KVMap kvmap;
   Options options;
   options.compression = kNoCompression;
-  options.block_restart_interval = 1;
+  BlockBasedTableOptions table_options;
+  table_options.block_restart_interval = 1;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
-  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
-           &kvmap);
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
 
-  auto& props = *c.table_reader()->GetTableProperties();
+  auto& props = *c.GetTableReader()->GetTableProperties();
   ASSERT_EQ(kvmap.size(), props.num_entries);
 
   auto raw_key_size = kvmap.size() * 2ul;
@@ -1025,7 +1050,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) {
   ASSERT_EQ("", props.filter_policy_name);  // no filter policy is used
 
   // Verify data size.
-  BlockBuilder block_builder(options, options.comparator);
+  BlockBuilder block_builder(1);
   for (const auto& item : kvmap) {
     block_builder.Add(item.first, item.second);
   }
@@ -1033,35 +1058,244 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) {
   ASSERT_EQ(content.size() + kBlockTrailerSize, props.data_size);
 }
 
-TEST(BlockBasedTableTest, FilterPolicyNameProperties) {
-  TableConstructor c(BytewiseComparator());
+TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) {
+  TableConstructor c(BytewiseComparator(), true);
   c.Add("a1", "val1");
   std::vector<std::string> keys;
   KVMap kvmap;
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
   Options options;
-  std::unique_ptr<const FilterPolicy> filter_policy(NewBloomFilterPolicy(10));
-  options.filter_policy = filter_policy.get();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
-  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
-           &kvmap);
-  auto& props = *c.table_reader()->GetTableProperties();
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  auto& props = *c.GetTableReader()->GetTableProperties();
   ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name);
 }
 
+//
+// BlockBasedTableTest::PrefetchTest
+//
+void AssertKeysInCache(BlockBasedTable* table_reader,
+                 const vector<string>& keys_in_cache,
+                 const vector<string>& keys_not_in_cache) {
+  for (auto key : keys_in_cache) {
+    ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
+  }
+
+  for (auto key : keys_not_in_cache) {
+    ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key));
+  }
+}
+
+void PrefetchRange(TableConstructor* c, Options* opt,
+                   BlockBasedTableOptions* table_options,
+                   const vector<std::string>& keys,
+                   const char* key_begin, const char* key_end,
+                   const vector<string>& keys_in_cache,
+                   const vector<string>& keys_not_in_cache,
+                   const Status expected_status = Status::OK()) {
+  // reset the cache and reopen the table
+  table_options->block_cache = NewLRUCache(16 * 1024 * 1024);
+  opt->table_factory.reset(NewBlockBasedTableFactory(*table_options));
+  const ImmutableCFOptions ioptions2(*opt);
+  ASSERT_OK(c->Reopen(ioptions2));
+
+  // prefetch
+  auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader());
+  // empty string replacement is a trick so we don't crash the test
+  Slice begin(key_begin ? key_begin : "");
+  Slice end(key_end ? key_end : "");
+  Status s = table_reader->Prefetch(key_begin ? &begin : nullptr,
+                                    key_end ? &end : nullptr);
+  ASSERT_TRUE(s.code() == expected_status.code());
+
+  // assert our expectation in cache warmup
+  AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache);
+}
+
+TEST_F(BlockBasedTableTest, PrefetchTest) {
+  // The purpose of this test is to test the prefetching operation built into
+  // BlockBasedTable.
+  Options opt;
+  unique_ptr<InternalKeyComparator> ikc;
+  ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+  opt.compression = kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  // big enough so we don't ever lose cached values.
+  table_options.block_cache = NewLRUCache(16 * 1024 * 1024);
+  opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  TableConstructor c(BytewiseComparator());
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  const ImmutableCFOptions ioptions(opt);
+  c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap);
+
+  // We get the following data spread :
+  //
+  // Data block         Index
+  // ========================
+  // [ k01 k02 k03 ]    k03
+  // [ k04         ]    k04
+  // [ k05         ]    k05
+  // [ k06 k07     ]    k07
+
+
+  // Simple
+  PrefetchRange(&c, &opt, &table_options, keys,
+                /*key_range=*/ "k01", "k05",
+                /*keys_in_cache=*/ {"k01", "k02", "k03", "k04", "k05"},
+                /*keys_not_in_cache=*/ {"k06", "k07"});
+  PrefetchRange(&c, &opt, &table_options, keys,
+                "k01", "k01",
+                {"k01", "k02", "k03"},
+                {"k04", "k05", "k06", "k07"});
+  // odd
+  PrefetchRange(&c, &opt, &table_options, keys,
+                "a", "z",
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"},
+                {});
+  PrefetchRange(&c, &opt, &table_options, keys,
+                "k00", "k00",
+                {"k01", "k02", "k03"},
+                {"k04", "k05", "k06", "k07"});
+  // Edge cases
+  PrefetchRange(&c, &opt, &table_options, keys,
+                "k00", "k06",
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"},
+                {});
+  PrefetchRange(&c, &opt, &table_options, keys,
+                "k00", "zzz",
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"},
+                {});
+  // null keys
+  PrefetchRange(&c, &opt, &table_options, keys,
+                nullptr, nullptr,
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"},
+                {});
+  PrefetchRange(&c, &opt, &table_options, keys,
+                "k04", nullptr,
+                {"k04", "k05", "k06", "k07"},
+                {"k01", "k02", "k03"});
+  PrefetchRange(&c, &opt, &table_options, keys,
+                nullptr, "k05",
+                {"k01", "k02", "k03", "k04", "k05"},
+                {"k06", "k07"});
+  // invalid
+  PrefetchRange(&c, &opt, &table_options, keys,
+                "k06", "k00", {}, {},
+                Status::InvalidArgument(Slice("k06 "), Slice("k07")));
+}
+
+TEST_F(BlockBasedTableTest, TotalOrderSeekOnHashIndex) {
+  BlockBasedTableOptions table_options;
+  for (int i = 0; i < 4; ++i) {
+    Options options;
+    // Make each key/value an individual block
+    table_options.block_size = 64;
+    switch (i) {
+    case 0:
+      // Binary search index
+      table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+      options.table_factory.reset(new BlockBasedTableFactory(table_options));
+      break;
+    case 1:
+      // Hash search index
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      options.table_factory.reset(new BlockBasedTableFactory(table_options));
+      options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+      break;
+    case 2:
+      // Hash search index with hash_index_allow_collision
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      table_options.hash_index_allow_collision = true;
+      options.table_factory.reset(new BlockBasedTableFactory(table_options));
+      options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+      break;
+    case 3:
+    default:
+      // Hash search index with filter policy
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      options.table_factory.reset(new BlockBasedTableFactory(table_options));
+      options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+      break;
+    }
+
+    TableConstructor c(BytewiseComparator(), true);
+    c.Add("aaaa1", std::string('a', 56));
+    c.Add("bbaa1", std::string('a', 56));
+    c.Add("cccc1", std::string('a', 56));
+    c.Add("bbbb1", std::string('a', 56));
+    c.Add("baaa1", std::string('a', 56));
+    c.Add("abbb1", std::string('a', 56));
+    c.Add("cccc2", std::string('a', 56));
+    std::vector<std::string> keys;
+    KVMap kvmap;
+    const ImmutableCFOptions ioptions(options);
+    c.Finish(options, ioptions, table_options,
+             GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+    auto props = c.GetTableReader()->GetTableProperties();
+    ASSERT_EQ(7u, props->num_data_blocks);
+    auto* reader = c.GetTableReader();
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    std::unique_ptr<Iterator> iter(reader->NewIterator(ro));
+
+    iter->Seek(InternalKey("b", 0, kTypeValue).Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
+
+    iter->Seek(InternalKey("bb", 0, kTypeValue).Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
+
+    iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString());
+  }
+}
+
 static std::string RandomString(Random* rnd, int len) {
   std::string r;
   test::RandomString(rnd, len, &r);
   return r;
 }
 
-void AddInternalKey(TableConstructor* c, const std::string prefix,
+void AddInternalKey(TableConstructor* c, const std::string& prefix,
                     int suffix_len = 800) {
   static Random rnd(1023);
   InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue);
   c->Add(k.Encode().ToString(), "v");
 }
 
-TEST(TableTest, HashIndexTest) {
+TEST_F(TableTest, HashIndexTest) {
   TableConstructor c(BytewiseComparator());
 
   // keys with prefix length 3, make sure the key/value is big enough to fill
@@ -1084,20 +1318,21 @@ TEST(TableTest, HashIndexTest) {
   std::vector<std::string> keys;
   KVMap kvmap;
   Options options;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
   BlockBasedTableOptions table_options;
   table_options.index_type = BlockBasedTableOptions::kHashSearch;
-  options.table_factory.reset(new BlockBasedTableFactory(table_options));
-
-  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
-  options.block_cache = NewLRUCache(1024);
-  options.block_size = 1700;
+  table_options.hash_index_allow_collision = true;
+  table_options.block_size = 1700;
+  table_options.block_cache = NewLRUCache(1024);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<InternalKeyComparator> comparator(
       new InternalKeyComparator(BytewiseComparator()));
-  c.Finish(options, *comparator, &keys, &kvmap);
-  auto reader = c.table_reader();
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap);
+  auto reader = c.GetTableReader();
 
-  auto props = c.table_reader()->GetTableProperties();
+  auto props = reader->GetTableProperties();
   ASSERT_EQ(5u, props->num_data_blocks);
 
   std::unique_ptr<Iterator> hash_iter(reader->NewIterator(ReadOptions()));
@@ -1160,14 +1395,20 @@ TEST(TableTest, HashIndexTest) {
     // regular_iter->Seek(prefix);
 
     ASSERT_OK(hash_iter->status());
-    ASSERT_TRUE(!hash_iter->Valid());
+    // Seek to non-existing prefixes should yield either invalid, or a
+    // key with prefix greater than the target.
+    if (hash_iter->Valid()) {
+      Slice ukey = ExtractUserKey(hash_iter->key());
+      Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
+      ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0);
+    }
   }
 }
 
 // It's very hard to figure out the index block size of a block accurately.
 // To make sure we get the index size, we just make sure as key number
 // grows, the filter block size also grows.
-TEST(BlockBasedTableTest, IndexSizeStat) {
+TEST_F(BlockBasedTableTest, IndexSizeStat) {
   uint64_t last_index_size = 0;
 
   // we need to use random keys since the pure human readable texts
@@ -1192,23 +1433,28 @@ TEST(BlockBasedTableTest, IndexSizeStat) {
     KVMap kvmap;
     Options options;
     options.compression = kNoCompression;
-    options.block_restart_interval = 1;
+    BlockBasedTableOptions table_options;
+    table_options.block_restart_interval = 1;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
-    c.Finish(options, GetPlainInternalComparator(options.comparator), &ks,
-             &kvmap);
-    auto index_size = c.table_reader()->GetTableProperties()->index_size;
+    const ImmutableCFOptions ioptions(options);
+    c.Finish(options, ioptions, table_options,
+             GetPlainInternalComparator(options.comparator), &ks, &kvmap);
+    auto index_size = c.GetTableReader()->GetTableProperties()->index_size;
     ASSERT_GT(index_size, last_index_size);
     last_index_size = index_size;
   }
 }
 
-TEST(BlockBasedTableTest, NumBlockStat) {
+TEST_F(BlockBasedTableTest, NumBlockStat) {
   Random rnd(test::RandomSeed());
   TableConstructor c(BytewiseComparator());
   Options options;
   options.compression = kNoCompression;
-  options.block_restart_interval = 1;
-  options.block_size = 1000;
+  BlockBasedTableOptions table_options;
+  table_options.block_restart_interval = 1;
+  table_options.block_size = 1000;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   for (int i = 0; i < 10; ++i) {
     // the key/val are slightly smaller than block size, so that each block
@@ -1218,10 +1464,11 @@ TEST(BlockBasedTableTest, NumBlockStat) {
 
   std::vector<std::string> ks;
   KVMap kvmap;
-  c.Finish(options, GetPlainInternalComparator(options.comparator), &ks,
-           &kvmap);
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options,
+           GetPlainInternalComparator(options.comparator), &ks, &kvmap);
   ASSERT_EQ(kvmap.size(),
-            c.table_reader()->GetTableProperties()->num_data_blocks);
+            c.GetTableReader()->GetTableProperties()->num_data_blocks);
 }
 
 // A simple tool that takes the snapshot of block cache statistics.
@@ -1239,31 +1486,32 @@ class BlockCachePropertiesSnapshot {
     filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
   }
 
-  void AssertIndexBlockStat(int64_t index_block_cache_miss,
-                            int64_t index_block_cache_hit) {
-    ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss);
-    ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit);
+  void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
+                            int64_t expected_index_block_cache_hit) {
+    ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
+    ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
   }
 
-  void AssertFilterBlockStat(int64_t filter_block_cache_miss,
-                             int64_t filter_block_cache_hit) {
-    ASSERT_EQ(filter_block_cache_miss, this->filter_block_cache_miss);
-    ASSERT_EQ(filter_block_cache_hit, this->filter_block_cache_hit);
+  void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss,
+                             int64_t expected_filter_block_cache_hit) {
+    ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss);
+    ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit);
   }
 
   // Check if the fetched props matches the expected ones.
   // TODO(kailiu) Use this only when you disabled filter policy!
-  void AssertEqual(int64_t index_block_cache_miss,
-                   int64_t index_block_cache_hit, int64_t data_block_cache_miss,
-                   int64_t data_block_cache_hit) const {
-    ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss);
-    ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit);
-    ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss);
-    ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit);
-    ASSERT_EQ(index_block_cache_miss + data_block_cache_miss,
-              this->block_cache_miss);
-    ASSERT_EQ(index_block_cache_hit + data_block_cache_hit,
-              this->block_cache_hit);
+  void AssertEqual(int64_t expected_index_block_cache_miss,
+                   int64_t expected_index_block_cache_hit,
+                   int64_t expected_data_block_cache_miss,
+                   int64_t expected_data_block_cache_hit) const {
+    ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
+    ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
+    ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss);
+    ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit);
+    ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss,
+              block_cache_miss);
+    ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit,
+              block_cache_hit);
   }
 
  private:
@@ -1279,27 +1527,25 @@ class BlockCachePropertiesSnapshot {
 
 // Make sure, by default, index/filter blocks were pre-loaded (meaning we won't
 // use block cache to store them).
-TEST(BlockBasedTableTest, BlockCacheDisabledTest) {
+TEST_F(BlockBasedTableTest, BlockCacheDisabledTest) {
   Options options;
   options.create_if_missing = true;
   options.statistics = CreateDBStatistics();
-  options.block_cache = NewLRUCache(1024);
-  std::unique_ptr<const FilterPolicy> filter_policy(NewBloomFilterPolicy(10));
-  options.filter_policy = filter_policy.get();
   BlockBasedTableOptions table_options;
-  // Intentionally commented out: table_options.cache_index_and_filter_blocks =
-  // true;
+  table_options.block_cache = NewLRUCache(1024);
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
   options.table_factory.reset(new BlockBasedTableFactory(table_options));
   std::vector<std::string> keys;
   KVMap kvmap;
 
-  TableConstructor c(BytewiseComparator());
+  TableConstructor c(BytewiseComparator(), true);
   c.Add("key", "value");
-  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
-           &kvmap);
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
 
   // preloading filter/index blocks is enabled.
-  auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
+  auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
   ASSERT_TRUE(reader->TEST_filter_block_preloaded());
   ASSERT_TRUE(reader->TEST_index_reader_preloaded());
 
@@ -1311,8 +1557,11 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) {
   }
 
   {
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, Slice(), nullptr, nullptr,
+                           nullptr, nullptr);
     // a hack that just to trigger BlockBasedTable::GetFilter.
-    reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr);
+    reader->Get(ReadOptions(), "non-exist-key", &get_context);
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertIndexBlockStat(0, 0);
     props.AssertFilterBlockStat(0, 0);
@@ -1321,15 +1570,15 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) {
 
 // Due to the difficulities of the intersaction between statistics, this test
 // only tests the case when "index block is put to block cache"
-TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
+TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
   // -- Table construction
   Options options;
   options.create_if_missing = true;
   options.statistics = CreateDBStatistics();
-  options.block_cache = NewLRUCache(1024);
 
   // Enable the cache for index/filter blocks
   BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1024);
   table_options.cache_index_and_filter_blocks = true;
   options.table_factory.reset(new BlockBasedTableFactory(table_options));
   std::vector<std::string> keys;
@@ -1337,10 +1586,11 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
 
   TableConstructor c(BytewiseComparator());
   c.Add("key", "value");
-  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
-           &kvmap);
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
   // preloading filter/index blocks is prohibited.
-  auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
+  auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
   ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
   ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
 
@@ -1386,32 +1636,20 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
   // release the iterator so that the block cache can reset correctly.
   iter.reset();
 
-  // -- PART 2: Open without block cache
-  options.block_cache.reset();
-  options.statistics = CreateDBStatistics();  // reset the stats
-  c.Reopen(options);
-
-  {
-    iter.reset(c.NewIterator());
-    iter->SeekToFirst();
-    ASSERT_EQ("key", iter->key().ToString());
-    BlockCachePropertiesSnapshot props(options.statistics.get());
-    // Nothing is affected at all
-    props.AssertEqual(0, 0, 0, 0);
-  }
-
-  // -- PART 3: Open with very small block cache
+  // -- PART 2: Open with very small block cache
   // In this test, no block will ever get hit since the block cache is
   // too small to fit even one entry.
-  options.block_cache = NewLRUCache(1);
-  c.Reopen(options);
+  table_options.block_cache = NewLRUCache(1);
+  options.statistics = CreateDBStatistics();
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  const ImmutableCFOptions ioptions2(options);
+  c.Reopen(ioptions2);
   {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(1,  // index block miss
                       0, 0, 0);
   }
 
-
   {
     // Both index and data block get accessed.
     // It first cache index block then data block. But since the cache size
@@ -1431,9 +1669,40 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
     props.AssertEqual(2, 0, 0 + 1,  // data block miss
                       0);
   }
+  iter.reset();
+
+  // -- PART 3: Open table with bloom filter enabled but not in SST file
+  table_options.block_cache = NewLRUCache(4096);
+  table_options.cache_index_and_filter_blocks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  TableConstructor c3(BytewiseComparator());
+  std::string user_key = "k01";
+  InternalKey internal_key(user_key, 0, kTypeValue);
+  c3.Add(internal_key.Encode().ToString(), "hello");
+  ImmutableCFOptions ioptions3(options);
+  // Generate table without filter policy
+  c3.Finish(options, ioptions3, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  // Open table with filter policy
+  table_options.filter_policy.reset(NewBloomFilterPolicy(1));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  options.statistics = CreateDBStatistics();
+  ImmutableCFOptions ioptions4(options);
+  ASSERT_OK(c3.Reopen(ioptions4));
+  reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader());
+  ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
+  std::string value;
+  GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, user_key, &value, nullptr,
+                         nullptr, nullptr);
+  ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context));
+  ASSERT_EQ(value, "hello");
+  BlockCachePropertiesSnapshot props(options.statistics.get());
+  props.AssertFilterBlockStat(0, 0);
 }
 
-TEST(BlockBasedTableTest, BlockCacheLeak) {
+TEST_F(BlockBasedTableTest, BlockCacheLeak) {
   // Check that when we reopen a table we don't lose access to blocks already
   // in the cache. This test checks whether the Table actually makes use of the
   // unique ID from the file.
@@ -1441,11 +1710,12 @@ TEST(BlockBasedTableTest, BlockCacheLeak) {
   Options opt;
   unique_ptr<InternalKeyComparator> ikc;
   ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
-  opt.block_size = 1024;
   opt.compression = kNoCompression;
-  opt.block_cache =
-      NewLRUCache(16 * 1024 * 1024);  // big enough so we don't ever
-                                      // lose cached values.
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  // big enough so we don't ever lose cached values.
+  table_options.block_cache = NewLRUCache(16 * 1024 * 1024);
+  opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   TableConstructor c(BytewiseComparator());
   c.Add("k01", "hello");
@@ -1457,7 +1727,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) {
   c.Add("k07", std::string(100000, 'x'));
   std::vector<std::string> keys;
   KVMap kvmap;
-  c.Finish(opt, *ikc, &keys, &kvmap);
+  const ImmutableCFOptions ioptions(opt);
+  c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap);
 
   unique_ptr<Iterator> iter(c.NewIterator());
   iter->SeekToFirst();
@@ -1468,20 +1739,41 @@ TEST(BlockBasedTableTest, BlockCacheLeak) {
   }
   ASSERT_OK(iter->status());
 
-  ASSERT_OK(c.Reopen(opt));
-  auto table_reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
+  const ImmutableCFOptions ioptions1(opt);
+  ASSERT_OK(c.Reopen(ioptions1));
+  auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
   for (const std::string& key : keys) {
     ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
   }
+
+  // rerun with different block cache
+  table_options.block_cache = NewLRUCache(16 * 1024 * 1024);
+  opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  const ImmutableCFOptions ioptions2(opt);
+  ASSERT_OK(c.Reopen(ioptions2));
+  table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+  for (const std::string& key : keys) {
+    ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key));
+  }
 }
 
-TEST(PlainTableTest, BasicPlainTableProperties) {
-  PlainTableFactory factory(8, 8, 0);
+TEST_F(PlainTableTest, BasicPlainTableProperties) {
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 8;
+  plain_table_options.bloom_bits_per_key = 8;
+  plain_table_options.hash_table_ratio = 0;
+
+  PlainTableFactory factory(plain_table_options);
   StringSink sink;
   Options options;
+  const ImmutableCFOptions ioptions(options);
   InternalKeyComparator ikc(options.comparator);
-  std::unique_ptr<TableBuilder> builder(
-      factory.NewTableBuilder(options, ikc, &sink, kNoCompression));
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
+  std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
+      TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
+                          kNoCompression, CompressionOptions(), false),
+      &sink));
 
   for (char c = 'a'; c <= 'z'; ++c) {
     std::string key(8, c);
@@ -1508,7 +1800,7 @@ TEST(PlainTableTest, BasicPlainTableProperties) {
   ASSERT_EQ(1ul, props->num_data_blocks);
 }
 
-TEST(GeneralTableTest, ApproximateOffsetOfPlain) {
+TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
   TableConstructor c(BytewiseComparator());
   c.Add("k01", "hello");
   c.Add("k02", "hello2");
@@ -1521,9 +1813,12 @@ TEST(GeneralTableTest, ApproximateOffsetOfPlain) {
   KVMap kvmap;
   Options options;
   test::PlainInternalKeyComparator internal_comparator(options.comparator);
-  options.block_size = 1024;
   options.compression = kNoCompression;
-  c.Finish(options, internal_comparator, &keys, &kvmap);
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options, internal_comparator,
+           &keys, &kvmap);
 
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
@@ -1550,9 +1845,11 @@ static void DoCompressionTest(CompressionType comp) {
   KVMap kvmap;
   Options options;
   test::PlainInternalKeyComparator ikc(options.comparator);
-  options.block_size = 1024;
   options.compression = comp;
-  c.Finish(options, ikc, &keys, &kvmap);
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  const ImmutableCFOptions ioptions(options);
+  c.Finish(options, ioptions, table_options, ikc, &keys, &kvmap);
 
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
@@ -1562,15 +1859,15 @@ static void DoCompressionTest(CompressionType comp) {
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),    4000,   6100));
 }
 
-TEST(GeneralTableTest, ApproximateOffsetOfCompressed) {
+TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) {
   std::vector<CompressionType> compression_state;
-  if (!SnappyCompressionSupported()) {
+  if (!Snappy_Supported()) {
     fprintf(stderr, "skipping snappy compression tests\n");
   } else {
     compression_state.push_back(kSnappyCompression);
   }
 
-  if (!ZlibCompressionSupported()) {
+  if (!Zlib_Supported()) {
     fprintf(stderr, "skipping zlib compression tests\n");
   } else {
     compression_state.push_back(kZlibCompression);
@@ -1578,22 +1875,17 @@ TEST(GeneralTableTest, ApproximateOffsetOfCompressed) {
 
   // TODO(kailiu) DoCompressionTest() doesn't work with BZip2.
   /*
-  if (!BZip2CompressionSupported()) {
+  if (!BZip2_Supported()) {
     fprintf(stderr, "skipping bzip2 compression tests\n");
   } else {
     compression_state.push_back(kBZip2Compression);
   }
   */
 
-  if (!LZ4CompressionSupported()) {
-    fprintf(stderr, "skipping lz4 compression tests\n");
+  if (!LZ4_Supported()) {
+    fprintf(stderr, "skipping lz4 and lz4hc compression tests\n");
   } else {
     compression_state.push_back(kLZ4Compression);
-  }
-
-  if (!LZ4HCCompressionSupported()) {
-    fprintf(stderr, "skipping lz4hc compression tests\n");
-  } else {
     compression_state.push_back(kLZ4HCCompression);
   }
 
@@ -1602,7 +1894,7 @@ TEST(GeneralTableTest, ApproximateOffsetOfCompressed) {
   }
 }
 
-TEST(Harness, Randomized) {
+TEST_F(HarnessTest, Randomized) {
   std::vector<TestArgs> args = GenerateArgList();
   for (unsigned int i = 0; i < args.size(); i++) {
     Init(args[i]);
@@ -1623,9 +1915,9 @@ TEST(Harness, Randomized) {
   }
 }
 
-TEST(Harness, RandomizedLongDB) {
+TEST_F(HarnessTest, RandomizedLongDB) {
   Random rnd(test::RandomSeed());
-  TestArgs args = { DB_TEST, false, 16, kNoCompression };
+  TestArgs args = { DB_TEST, false, 16, kNoCompression, 0 };
   Init(args);
   int num_entries = 100000;
   for (int e = 0; e < num_entries; e++) {
@@ -1647,14 +1939,17 @@ TEST(Harness, RandomizedLongDB) {
   ASSERT_GT(files, 0);
 }
 
-class MemTableTest { };
+class MemTableTest : public testing::Test {};
 
-TEST(MemTableTest, Simple) {
+TEST_F(MemTableTest, Simple) {
   InternalKeyComparator cmp(BytewiseComparator());
   auto table_factory = std::make_shared<SkipListFactory>();
   Options options;
   options.memtable_factory = table_factory;
-  MemTable* memtable = new MemTable(cmp, options);
+  ImmutableCFOptions ioptions(options);
+  WriteBuffer wb(options.db_write_buffer_size);
+  MemTable* memtable =
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb);
   memtable->Ref();
   WriteBatch batch;
   WriteBatchInternal::SetSequence(&batch, 100);
@@ -1662,10 +1957,11 @@ TEST(MemTableTest, Simple) {
   batch.Put(std::string("k2"), std::string("v2"));
   batch.Put(std::string("k3"), std::string("v3"));
   batch.Put(std::string("largekey"), std::string("vlarge"));
-  ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options);
+  ColumnFamilyMemTablesDefault cf_mems_default(memtable);
   ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok());
 
-  Iterator* iter = memtable->NewIterator(ReadOptions());
+  Arena arena;
+  ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena));
   iter->SeekToFirst();
   while (iter->Valid()) {
     fprintf(stderr, "key: '%s' -> '%s'\n",
@@ -1674,12 +1970,11 @@ TEST(MemTableTest, Simple) {
     iter->Next();
   }
 
-  delete iter;
   delete memtable->Unref();
 }
 
 // Test the empty key
-TEST(Harness, SimpleEmptyKey) {
+TEST_F(HarnessTest, SimpleEmptyKey) {
   auto args = GenerateArgList();
   for (const auto& arg : args) {
     Init(arg);
@@ -1689,7 +1984,7 @@ TEST(Harness, SimpleEmptyKey) {
   }
 }
 
-TEST(Harness, SimpleSingle) {
+TEST_F(HarnessTest, SimpleSingle) {
   auto args = GenerateArgList();
   for (const auto& arg : args) {
     Init(arg);
@@ -1699,7 +1994,7 @@ TEST(Harness, SimpleSingle) {
   }
 }
 
-TEST(Harness, SimpleMulti) {
+TEST_F(HarnessTest, SimpleMulti) {
   auto args = GenerateArgList();
   for (const auto& arg : args) {
     Init(arg);
@@ -1711,7 +2006,7 @@ TEST(Harness, SimpleMulti) {
   }
 }
 
-TEST(Harness, SimpleSpecialKey) {
+TEST_F(HarnessTest, SimpleSpecialKey) {
   auto args = GenerateArgList();
   for (const auto& arg : args) {
     Init(arg);
@@ -1721,11 +2016,11 @@ TEST(Harness, SimpleSpecialKey) {
   }
 }
 
-TEST(Harness, FooterTests) {
+TEST_F(HarnessTest, FooterTests) {
   {
     // upconvert legacy block based
     std::string encoded;
-    Footer footer(kLegacyBlockBasedTableMagicNumber);
+    Footer footer(kLegacyBlockBasedTableMagicNumber, 0);
     BlockHandle meta_index(10, 5), index(20, 15);
     footer.set_metaindex_handle(meta_index);
     footer.set_index_handle(index);
@@ -1739,11 +2034,12 @@ TEST(Harness, FooterTests) {
     ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
     ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
     ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+    ASSERT_EQ(decoded_footer.version(), 0U);
   }
   {
     // xxhash block based
     std::string encoded;
-    Footer footer(kBlockBasedTableMagicNumber);
+    Footer footer(kBlockBasedTableMagicNumber, 1);
     BlockHandle meta_index(10, 5), index(20, 15);
     footer.set_metaindex_handle(meta_index);
     footer.set_index_handle(index);
@@ -1758,11 +2054,12 @@ TEST(Harness, FooterTests) {
     ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
     ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
     ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+    ASSERT_EQ(decoded_footer.version(), 1U);
   }
   {
     // upconvert legacy plain table
     std::string encoded;
-    Footer footer(kLegacyPlainTableMagicNumber);
+    Footer footer(kLegacyPlainTableMagicNumber, 0);
     BlockHandle meta_index(10, 5), index(20, 15);
     footer.set_metaindex_handle(meta_index);
     footer.set_index_handle(index);
@@ -1776,11 +2073,12 @@ TEST(Harness, FooterTests) {
     ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
     ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
     ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+    ASSERT_EQ(decoded_footer.version(), 0U);
   }
   {
     // xxhash block based
     std::string encoded;
-    Footer footer(kPlainTableMagicNumber);
+    Footer footer(kPlainTableMagicNumber, 1);
     BlockHandle meta_index(10, 5), index(20, 15);
     footer.set_metaindex_handle(meta_index);
     footer.set_index_handle(index);
@@ -1795,11 +2093,32 @@ TEST(Harness, FooterTests) {
     ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
     ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
     ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+    ASSERT_EQ(decoded_footer.version(), 1U);
+  }
+  {
+    // version == 2
+    std::string encoded;
+    Footer footer(kBlockBasedTableMagicNumber, 2);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+    ASSERT_EQ(decoded_footer.version(), 2U);
   }
 }
 
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/table/two_level_iterator.cc b/src/rocksdb/table/two_level_iterator.cc
index 990f181..5d3e372 100644
--- a/src/rocksdb/table/two_level_iterator.cc
+++ b/src/rocksdb/table/two_level_iterator.cc
@@ -13,6 +13,7 @@
 #include "rocksdb/table.h"
 #include "table/block.h"
 #include "table/format.h"
+#include "util/arena.h"
 
 namespace rocksdb {
 
@@ -23,26 +24,27 @@ class TwoLevelIterator: public Iterator {
   explicit TwoLevelIterator(TwoLevelIteratorState* state,
       Iterator* first_level_iter);
 
-  virtual ~TwoLevelIterator() {}
+  virtual ~TwoLevelIterator() {
+    first_level_iter_.DeleteIter(false);
+    second_level_iter_.DeleteIter(false);
+  }
 
-  virtual void Seek(const Slice& target);
-  virtual void SeekToFirst();
-  virtual void SeekToLast();
-  virtual void Next();
-  virtual void Prev();
+  virtual void Seek(const Slice& target) override;
+  virtual void SeekToFirst() override;
+  virtual void SeekToLast() override;
+  virtual void Next() override;
+  virtual void Prev() override;
 
-  virtual bool Valid() const {
-    return second_level_iter_.Valid();
-  }
-  virtual Slice key() const {
+  virtual bool Valid() const override { return second_level_iter_.Valid(); }
+  virtual Slice key() const override {
     assert(Valid());
     return second_level_iter_.key();
   }
-  virtual Slice value() const {
+  virtual Slice value() const override {
     assert(Valid());
     return second_level_iter_.value();
   }
-  virtual Status status() const {
+  virtual Status status() const override {
     // It'd be nice if status() returned a const Status& instead of a Status
     if (!first_level_iter_.status().ok()) {
       return first_level_iter_.status();
@@ -168,8 +170,9 @@ void TwoLevelIterator::InitDataBlock() {
     SetSecondLevelIterator(nullptr);
   } else {
     Slice handle = first_level_iter_.value();
-    if (second_level_iter_.iter() != nullptr
-        && handle.compare(data_block_handle_) == 0) {
+    if (second_level_iter_.iter() != nullptr &&
+        !second_level_iter_.status().IsIncomplete() &&
+        handle.compare(data_block_handle_) == 0) {
       // second_level_iter is already constructed with this iterator, so
       // no need to change anything
     } else {
@@ -183,8 +186,13 @@ void TwoLevelIterator::InitDataBlock() {
 }  // namespace
 
 Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
-      Iterator* first_level_iter) {
-  return new TwoLevelIterator(state, first_level_iter);
+                              Iterator* first_level_iter, Arena* arena) {
+  if (arena == nullptr) {
+    return new TwoLevelIterator(state, first_level_iter);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(TwoLevelIterator));
+    return new (mem) TwoLevelIterator(state, first_level_iter);
+  }
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/table/two_level_iterator.h b/src/rocksdb/table/two_level_iterator.h
index b808338..0301935 100644
--- a/src/rocksdb/table/two_level_iterator.h
+++ b/src/rocksdb/table/two_level_iterator.h
@@ -16,10 +16,11 @@ namespace rocksdb {
 
 struct ReadOptions;
 class InternalKeyComparator;
+class Arena;
 
 struct TwoLevelIteratorState {
-  explicit TwoLevelIteratorState(bool check_prefix_may_match)
-    : check_prefix_may_match(check_prefix_may_match) {}
+  explicit TwoLevelIteratorState(bool _check_prefix_may_match)
+      : check_prefix_may_match(_check_prefix_may_match) {}
 
   virtual ~TwoLevelIteratorState() {}
   virtual Iterator* NewSecondaryIterator(const Slice& handle) = 0;
@@ -39,7 +40,11 @@ struct TwoLevelIteratorState {
 //
 // Uses a supplied function to convert an index_iter value into
 // an iterator over the contents of the corresponding block.
+// arena: If not null, the arena is used to allocate the Iterator.
+//        When destroying the iterator, the destructor will destroy
+//        all the states but those allocated in arena.
 extern Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
-      Iterator* first_level_iter);
+                                     Iterator* first_level_iter,
+                                     Arena* arena = nullptr);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/third-party/fbson/COMMIT.md b/src/rocksdb/third-party/fbson/COMMIT.md
new file mode 100644
index 0000000..bba88d5
--- /dev/null
+++ b/src/rocksdb/third-party/fbson/COMMIT.md
@@ -0,0 +1,2 @@
+fbson commit: 
+https://github.com/facebook/mysql-5.6/commit/55ef9ff25c934659a70b4094e9b406c48e9dd43d
diff --git a/src/rocksdb/third-party/fbson/FbsonDocument.h b/src/rocksdb/third-party/fbson/FbsonDocument.h
new file mode 100644
index 0000000..4d7c79a
--- /dev/null
+++ b/src/rocksdb/third-party/fbson/FbsonDocument.h
@@ -0,0 +1,887 @@
+/*
+ *  Copyright (c) 2014, Facebook, Inc.
+ *  All rights reserved.
+ *
+ *  This source code is licensed under the BSD-style license found in the
+ *  LICENSE file in the root directory of this source tree. An additional grant
+ *  of patent rights can be found in the PATENTS file in the same directory.
+ *
+ */
+
+/*
+ * This header defines FbsonDocument, FbsonKeyValue, and various value classes
+ * which are derived from FbsonValue, and a forward iterator for container
+ * values - essentially everything that is related to FBSON binary data
+ * structures.
+ *
+ * Implementation notes:
+ *
+ * None of the classes in this header file can be instantiated directly (i.e.
+ * you cannot create a FbsonKeyValue or FbsonValue object - all constructors
+ * are declared non-public). We use the classes as wrappers on the packed FBSON
+ * bytes (serialized), and cast the classes (types) to the underlying packed
+ * byte array.
+ *
+ * For the same reason, we cannot define any FBSON value class to be virtual,
+ * since we never call constructors, and will not instantiate vtbl and vptrs.
+ *
+ * Therefore, the classes are defined as packed structures (i.e. no data
+ * alignment and padding), and the private member variables of the classes are
+ * defined precisely in the same order as the FBSON spec. This ensures we
+ * access the packed FBSON bytes correctly.
+ *
+ * The packed structures are highly optimized for in-place operations with low
+ * overhead. The reads (and in-place writes) are performed directly on packed
+ * bytes. There is no memory allocation at all at runtime.
+ *
+ * For updates/writes of values that will expand the original FBSON size, the
+ * write will fail, and the caller needs to handle buffer increase.
+ *
+ * ** Iterator **
+ * Both ObjectVal class and ArrayVal class have iterator type that you can use
+ * to declare an iterator on a container object to go through the key-value
+ * pairs or value list. The iterator has both non-const and const types.
+ *
+ * Note: iterators are forward direction only.
+ *
+ * ** Query **
+ * Querying into containers is through the member functions find (for key/value
+ * pairs) and get (for array elements), and is in streaming style. We don't
+ * need to read/scan the whole FBSON packed bytes in order to return results.
+ * Once the key/index is found, we will stop search.  You can use text to query
+ * both objects and array (for array, text will be converted to integer index),
+ * and use index to retrieve from array. Array index is 0-based.
+ *
+ * ** External dictionary **
+ * During query processing, you can also pass a call-back function, so the
+ * search will first try to check if the key string exists in the dictionary.
+ * If so, search will be based on the id instead of the key string.
+ *
+ * @author Tian Xia <tianx at fb.com>
+ */
+
+#ifndef FBSON_FBSONDOCUMENT_H
+#define FBSON_FBSONDOCUMENT_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+namespace fbson {
+
+#pragma pack(push, 1)
+
+#define FBSON_VER 1
+
+// forward declaration
+class FbsonValue;
+class ObjectVal;
+
+/*
+ * FbsonDocument is the main object that accesses and queries FBSON packed
+ * bytes. NOTE: FbsonDocument only allows object container as the top level
+ * FBSON value. However, you can use the static method "createValue" to get any
+ * FbsonValue object from the packed bytes.
+ *
+ * FbsonDocument object also dereferences to an object container value
+ * (ObjectVal) once FBSON is loaded.
+ *
+ * ** Load **
+ * FbsonDocument is usable after loading packed bytes (memory location) into
+ * the object. We only need the header and first few bytes of the payload after
+ * header to verify the FBSON.
+ *
+ * Note: creating an FbsonDocument (through createDocument) does not allocate
+ * any memory. The document object is an efficient wrapper on the packed bytes
+ * which is accessed directly.
+ *
+ * ** Query **
+ * Query is through dereferencing into ObjectVal.
+ */
+class FbsonDocument {
+ public:
+  // create an FbsonDocument object from FBSON packed bytes
+  static FbsonDocument* createDocument(const char* pb, uint32_t size);
+
+  // create an FbsonValue from FBSON packed bytes
+  static FbsonValue* createValue(const char* pb, uint32_t size);
+
+  uint8_t version() { return header_.ver_; }
+
+  FbsonValue* getValue() { return ((FbsonValue*)payload_); }
+
+  ObjectVal* operator->() { return ((ObjectVal*)payload_); }
+
+  const ObjectVal* operator->() const { return ((const ObjectVal*)payload_); }
+
+ private:
+  /*
+   * FbsonHeader class defines FBSON header (internal to FbsonDocument).
+   *
+   * Currently it only contains version information (1-byte). We may expand the
+   * header to include checksum of the FBSON binary for more security.
+   */
+  struct FbsonHeader {
+    uint8_t ver_;
+  } header_;
+
+  char payload_[0];
+
+  FbsonDocument();
+};
+
+/*
+ * FbsonFwdIteratorT implements FBSON's iterator template.
+ *
+ * Note: it is an FORWARD iterator only due to the design of FBSON format.
+ */
+template <class Iter_Type, class Cont_Type>
+class FbsonFwdIteratorT {
+  typedef Iter_Type iterator;
+  typedef typename std::iterator_traits<Iter_Type>::pointer pointer;
+  typedef typename std::iterator_traits<Iter_Type>::reference reference;
+
+ public:
+  explicit FbsonFwdIteratorT(const iterator& i) : current_(i) {}
+
+  // allow non-const to const iterator conversion (same container type)
+  template <class Iter_Ty>
+  FbsonFwdIteratorT(const FbsonFwdIteratorT<Iter_Ty, Cont_Type>& rhs)
+      : current_(rhs.base()) {}
+
+  bool operator==(const FbsonFwdIteratorT& rhs) const {
+    return (current_ == rhs.current_);
+  }
+
+  bool operator!=(const FbsonFwdIteratorT& rhs) const {
+    return !operator==(rhs);
+  }
+
+  bool operator<(const FbsonFwdIteratorT& rhs) const {
+    return (current_ < rhs.current_);
+  }
+
+  bool operator>(const FbsonFwdIteratorT& rhs) const { return !operator<(rhs); }
+
+  FbsonFwdIteratorT& operator++() {
+    current_ = (iterator)(((char*)current_) + current_->numPackedBytes());
+    return *this;
+  }
+
+  FbsonFwdIteratorT operator++(int) {
+    auto tmp = *this;
+    current_ = (iterator)(((char*)current_) + current_->numPackedBytes());
+    return tmp;
+  }
+
+  explicit operator pointer() { return current_; }
+
+  reference operator*() const { return *current_; }
+
+  pointer operator->() const { return current_; }
+
+  iterator base() const { return current_; }
+
+ private:
+  iterator current_;
+};
+
+typedef int (*hDictInsert)(const char* key, unsigned len);
+typedef int (*hDictFind)(const char* key, unsigned len);
+
+/*
+ * FbsonType defines 10 primitive types and 2 container types, as described
+ * below.
+ *
+ * primitive_value ::=
+ *   0x00        //null value (0 byte)
+ * | 0x01        //boolean true (0 byte)
+ * | 0x02        //boolean false (0 byte)
+ * | 0x03 int8   //char/int8 (1 byte)
+ * | 0x04 int16  //int16 (2 bytes)
+ * | 0x05 int32  //int32 (4 bytes)
+ * | 0x06 int64  //int64 (8 bytes)
+ * | 0x07 double //floating point (8 bytes)
+ * | 0x08 string //variable length string
+ * | 0x09 binary //variable length binary
+ *
+ * container ::=
+ *   0x0A int32 key_value_list //object, int32 is the total bytes of the object
+ * | 0x0B int32 value_list     //array, int32 is the total bytes of the array
+ */
+enum class FbsonType : char {
+  T_Null = 0x00,
+  T_True = 0x01,
+  T_False = 0x02,
+  T_Int8 = 0x03,
+  T_Int16 = 0x04,
+  T_Int32 = 0x05,
+  T_Int64 = 0x06,
+  T_Double = 0x07,
+  T_String = 0x08,
+  T_Binary = 0x09,
+  T_Object = 0x0A,
+  T_Array = 0x0B,
+  NUM_TYPES,
+};
+
+typedef std::underlying_type<FbsonType>::type FbsonTypeUnder;
+
+/*
+ * FbsonKeyValue class defines FBSON key type, as described below.
+ *
+ * key ::=
+ *   0x00 int8    //1-byte dictionary id
+ * | int8 (byte*) //int8 (>0) is the size of the key string
+ *
+ * value ::= primitive_value | container
+ *
+ * FbsonKeyValue can be either an id mapping to the key string in an external
+ * dictionary, or it is the original key string. Whether to read an id or a
+ * string is decided by the first byte (size_).
+ *
+ * Note: a key object must be followed by a value object. Therefore, a key
+ * object implicitly refers to a key-value pair, and you can get the value
+ * object right after the key object. The function numPackedBytes hence
+ * indicates the total size of the key-value pair, so that we will be able go
+ * to next pair from the key.
+ *
+ * ** Dictionary size **
+ * By default, the dictionary size is 255 (1-byte). Users can define
+ * "USE_LARGE_DICT" to increase the dictionary size to 655535 (2-byte).
+ */
+class FbsonKeyValue {
+ public:
+#ifdef USE_LARGE_DICT
+  static const int sMaxKeyId = 65535;
+  typedef uint16_t keyid_type;
+#else
+  static const int sMaxKeyId = 255;
+  typedef uint8_t keyid_type;
+#endif // #ifdef USE_LARGE_DICT
+
+  static const uint8_t sMaxKeyLen = 64;
+
+  // size of the key. 0 indicates it is stored as id
+  uint8_t klen() const { return size_; }
+
+  // get the key string. Note the string may not be null terminated.
+  const char* getKeyStr() const { return key_.str_; }
+
+  keyid_type getKeyId() const { return key_.id_; }
+
+  unsigned int keyPackedBytes() const {
+    return size_ ? (sizeof(size_) + size_)
+                 : (sizeof(size_) + sizeof(keyid_type));
+  }
+
+  FbsonValue* value() const {
+    return (FbsonValue*)(((char*)this) + keyPackedBytes());
+  }
+
+  // size of the total packed bytes (key+value)
+  unsigned int numPackedBytes() const;
+
+ private:
+  uint8_t size_;
+
+  union key_ {
+    keyid_type id_;
+    char str_[1];
+  } key_;
+
+  FbsonKeyValue();
+};
+
+/*
+ * FbsonValue is the base class of all FBSON types. It contains only one member
+ * variable - type info, which can be retrieved by member functions is[Type]()
+ * or type().
+ */
+class FbsonValue {
+ public:
+  static const uint32_t sMaxValueLen = 1 << 24; // 16M
+
+  bool isNull() const { return (type_ == FbsonType::T_Null); }
+  bool isTrue() const { return (type_ == FbsonType::T_True); }
+  bool isFalse() const { return (type_ == FbsonType::T_False); }
+  bool isInt8() const { return (type_ == FbsonType::T_Int8); }
+  bool isInt16() const { return (type_ == FbsonType::T_Int16); }
+  bool isInt32() const { return (type_ == FbsonType::T_Int32); }
+  bool isInt64() const { return (type_ == FbsonType::T_Int64); }
+  bool isDouble() const { return (type_ == FbsonType::T_Double); }
+  bool isString() const { return (type_ == FbsonType::T_String); }
+  bool isBinary() const { return (type_ == FbsonType::T_Binary); }
+  bool isObject() const { return (type_ == FbsonType::T_Object); }
+  bool isArray() const { return (type_ == FbsonType::T_Array); }
+
+  FbsonType type() const { return type_; }
+
+  // size of the total packed bytes
+  unsigned int numPackedBytes() const;
+
+  // size of the value in bytes
+  unsigned int size() const;
+
+  // get the raw byte array of the value
+  const char* getValuePtr() const;
+
+  // find the FBSON value by a key path string (null terminated)
+  FbsonValue* findPath(const char* key_path,
+                       const char* delim = ".",
+                       hDictFind handler = nullptr) {
+    return findPath(key_path, (unsigned int)strlen(key_path), delim, handler);
+  }
+
+  // find the FBSON value by a key path string (with length)
+  FbsonValue* findPath(const char* key_path,
+                       unsigned int len,
+                       const char* delim,
+                       hDictFind handler);
+
+ protected:
+  FbsonType type_; // type info
+
+  FbsonValue();
+};
+
+/*
+ * NumerValT is the template class (derived from FbsonValue) of all number
+ * types (integers and double).
+ */
+template <class T>
+class NumberValT : public FbsonValue {
+ public:
+  T val() const { return num_; }
+
+  unsigned int numPackedBytes() const { return sizeof(FbsonValue) + sizeof(T); }
+
+  // catch all unknow specialization of the template class
+  bool setVal(T value) { return false; }
+
+ private:
+  T num_;
+
+  NumberValT();
+};
+
+typedef NumberValT<int8_t> Int8Val;
+
+// override setVal for Int8Val
+template <>
+inline bool Int8Val::setVal(int8_t value) {
+  if (!isInt8()) {
+    return false;
+  }
+
+  num_ = value;
+  return true;
+}
+
+typedef NumberValT<int16_t> Int16Val;
+
+// override setVal for Int16Val
+template <>
+inline bool Int16Val::setVal(int16_t value) {
+  if (!isInt16()) {
+    return false;
+  }
+
+  num_ = value;
+  return true;
+}
+
+typedef NumberValT<int32_t> Int32Val;
+
+// override setVal for Int32Val
+template <>
+inline bool Int32Val::setVal(int32_t value) {
+  if (!isInt32()) {
+    return false;
+  }
+
+  num_ = value;
+  return true;
+}
+
+typedef NumberValT<int64_t> Int64Val;
+
+// override setVal for Int64Val
+template <>
+inline bool Int64Val::setVal(int64_t value) {
+  if (!isInt64()) {
+    return false;
+  }
+
+  num_ = value;
+  return true;
+}
+
+typedef NumberValT<double> DoubleVal;
+
+// override setVal for DoubleVal
+template <>
+inline bool DoubleVal::setVal(double value) {
+  if (!isDouble()) {
+    return false;
+  }
+
+  num_ = value;
+  return true;
+}
+
+/*
+ * BlobVal is the base class (derived from FbsonValue) for string and binary
+ * types. The size_ indicates the total bytes of the payload_.
+ */
+class BlobVal : public FbsonValue {
+ public:
+  // size of the blob payload only
+  unsigned int getBlobLen() const { return size_; }
+
+  // return the blob as byte array
+  const char* getBlob() const { return payload_; }
+
+  // size of the total packed bytes
+  unsigned int numPackedBytes() const {
+    return sizeof(FbsonValue) + sizeof(size_) + size_;
+  }
+
+ protected:
+  uint32_t size_;
+  char payload_[0];
+
+  // set new blob bytes
+  bool internalSetVal(const char* blob, uint32_t blobSize) {
+    // if we cannot fit the new blob, fail the operation
+    if (blobSize > size_) {
+      return false;
+    }
+
+    memcpy(payload_, blob, blobSize);
+
+    // Set the reset of the bytes to 0.  Note we cannot change the size_ of the
+    // current payload, as all values are packed.
+    memset(payload_ + blobSize, 0, size_ - blobSize);
+
+    return true;
+  }
+
+  BlobVal();
+};
+
+/*
+ * Binary type
+ */
+class BinaryVal : public BlobVal {
+ public:
+  bool setVal(const char* blob, uint32_t blobSize) {
+    if (!isBinary()) {
+      return false;
+    }
+
+    return internalSetVal(blob, blobSize);
+  }
+
+ private:
+  BinaryVal();
+};
+
+/*
+ * String type
+ * Note: FBSON string may not be a c-string (NULL-terminated)
+ */
+class StringVal : public BlobVal {
+ public:
+  bool setVal(const char* str, uint32_t blobSize) {
+    if (!isString()) {
+      return false;
+    }
+
+    return internalSetVal(str, blobSize);
+  }
+
+ private:
+  StringVal();
+};
+
+/*
+ * ContainerVal is the base class (derived from FbsonValue) for object and
+ * array types. The size_ indicates the total bytes of the payload_.
+ */
+class ContainerVal : public FbsonValue {
+ public:
+  // size of the container payload only
+  unsigned int getContainerSize() const { return size_; }
+
+  // return the container payload as byte array
+  const char* getPayload() const { return payload_; }
+
+  // size of the total packed bytes
+  unsigned int numPackedBytes() const {
+    return sizeof(FbsonValue) + sizeof(size_) + size_;
+  }
+
+ protected:
+  uint32_t size_;
+  char payload_[0];
+
+  ContainerVal();
+};
+
+/*
+ * Object type
+ */
// Object value: a packed, back-to-back sequence of key/value pairs stored in
// payload_.  Lookup is a linear scan; a key is stored either as a short
// string or as a small dictionary id (see FbsonKeyValue).
class ObjectVal : public ContainerVal {
 public:
  // find the FBSON value by a key string (null terminated)
  FbsonValue* find(const char* key, hDictFind handler = nullptr) const {
    if (!key)
      return nullptr;

    return find(key, (unsigned int)strlen(key), handler);
  }

  // find the FBSON value by a key string (with length)
  // If a dictionary callback is supplied and maps the key to a valid id,
  // the id-based lookup is used; otherwise fall back to byte comparison.
  FbsonValue* find(const char* key,
                   unsigned int klen,
                   hDictFind handler = nullptr) const {
    if (!key || !klen)
      return nullptr;

    int key_id = -1;
    if (handler && (key_id = handler(key, klen)) >= 0) {
      return find(key_id);
    }

    return internalFind(key, klen);
  }

  // find the FBSON value by a key dictionary ID
  FbsonValue* find(int key_id) const {
    if (key_id < 0 || key_id > FbsonKeyValue::sMaxKeyId)
      return nullptr;

    const char* pch = payload_;
    const char* fence = payload_ + size_;

    // Linear scan over the packed pairs; klen() == 0 marks a pair whose key
    // is stored as a dictionary id rather than as a string.
    while (pch < fence) {
      FbsonKeyValue* pkey = (FbsonKeyValue*)(pch);
      if (!pkey->klen() && key_id == pkey->getKeyId()) {
        return pkey->value();
      }
      pch += pkey->numPackedBytes();
    }

    // A well-formed object ends exactly on the fence; anything else means
    // the packed bytes are inconsistent with size_.
    assert(pch == fence);

    return nullptr;
  }

  // STL-style forward iteration over the packed key/value pairs.
  typedef FbsonKeyValue value_type;
  typedef value_type* pointer;
  typedef const value_type* const_pointer;
  typedef FbsonFwdIteratorT<pointer, ObjectVal> iterator;
  typedef FbsonFwdIteratorT<const_pointer, ObjectVal> const_iterator;

  iterator begin() { return iterator((pointer)payload_); }

  const_iterator begin() const { return const_iterator((pointer)payload_); }

  iterator end() { return iterator((pointer)(payload_ + size_)); }

  const_iterator end() const {
    return const_iterator((pointer)(payload_ + size_));
  }

 private:
  // String-key lookup: scan pairs, comparing length first, then bytes.
  FbsonValue* internalFind(const char* key, unsigned int klen) const {
    const char* pch = payload_;
    const char* fence = payload_ + size_;

    while (pch < fence) {
      FbsonKeyValue* pkey = (FbsonKeyValue*)(pch);
      if (klen == pkey->klen() && strncmp(key, pkey->getKeyStr(), klen) == 0) {
        return pkey->value();
      }
      pch += pkey->numPackedBytes();
    }

    assert(pch == fence);

    return nullptr;
  }

 private:
  // Instances only ever come from casting packed bytes; never constructed.
  ObjectVal();
};
+
+/*
+ * Array type
+ */
// Array value: a packed, back-to-back sequence of FbsonValue elements
// stored in payload_.  Random access is a linear scan.
class ArrayVal : public ContainerVal {
 public:
  // get the FBSON value at index
  // Returns nullptr when idx is negative or >= the number of elements.
  FbsonValue* get(int idx) const {
    if (idx < 0)
      return nullptr;

    const char* pch = payload_;
    const char* fence = payload_ + size_;

    // Skip idx elements.  The decrement also fires on the final (failing)
    // test, so idx == -1 below iff we stopped exactly on the target element.
    while (pch < fence && idx-- > 0)
      pch += ((FbsonValue*)pch)->numPackedBytes();

    if (idx == -1)
      return (FbsonValue*)pch;
    else {
      // Ran off the end: the requested index is out of range.
      assert(pch == fence);
      return nullptr;
    }
  }

  // Get number of elements in array
  // O(n): walks the packed elements and counts them.
  unsigned int numElem() const {
    const char* pch = payload_;
    const char* fence = payload_ + size_;

    unsigned int num = 0;
    while (pch < fence) {
      ++num;
      pch += ((FbsonValue*)pch)->numPackedBytes();
    }

    // Packed element sizes must add up exactly to the container size.
    assert(pch == fence);

    return num;
  }

  // STL-style forward iteration over the packed elements.
  typedef FbsonValue value_type;
  typedef value_type* pointer;
  typedef const value_type* const_pointer;
  typedef FbsonFwdIteratorT<pointer, ArrayVal> iterator;
  typedef FbsonFwdIteratorT<const_pointer, ArrayVal> const_iterator;

  iterator begin() { return iterator((pointer)payload_); }

  const_iterator begin() const { return const_iterator((pointer)payload_); }

  iterator end() { return iterator((pointer)(payload_ + size_)); }

  const_iterator end() const {
    return const_iterator((pointer)(payload_ + size_));
  }

 private:
  // Instances only ever come from casting packed bytes; never constructed.
  ArrayVal();
};
+
+inline FbsonDocument* FbsonDocument::createDocument(const char* pb,
+                                                    uint32_t size) {
+  if (!pb || size < sizeof(FbsonHeader) + sizeof(FbsonValue)) {
+    return nullptr;
+  }
+
+  FbsonDocument* doc = (FbsonDocument*)pb;
+  if (doc->header_.ver_ != FBSON_VER) {
+    return nullptr;
+  }
+
+  FbsonValue* val = (FbsonValue*)doc->payload_;
+  if (!val->isObject() || size != sizeof(FbsonHeader) + val->numPackedBytes()) {
+    return nullptr;
+  }
+
+  return doc;
+}
+
+inline FbsonValue* FbsonDocument::createValue(const char* pb, uint32_t size) {
+  if (!pb || size < sizeof(FbsonHeader) + sizeof(FbsonValue)) {
+    return nullptr;
+  }
+
+  FbsonDocument* doc = (FbsonDocument*)pb;
+  if (doc->header_.ver_ != FBSON_VER) {
+    return nullptr;
+  }
+
+  FbsonValue* val = (FbsonValue*)doc->payload_;
+  if (size != sizeof(FbsonHeader) + val->numPackedBytes()) {
+    return nullptr;
+  }
+
+  return val;
+}
+
// Total packed size of a key/value pair: the packed key bytes followed
// immediately by the packed value bytes.
inline unsigned int FbsonKeyValue::numPackedBytes() const {
  unsigned int ks = keyPackedBytes();
  // The value is laid out directly after the key in the same byte stream.
  FbsonValue* val = (FbsonValue*)(((char*)this) + ks);
  return ks + val->numPackedBytes();
}
+
+// Poor man's "virtual" function FbsonValue::numPackedBytes
+inline unsigned int FbsonValue::numPackedBytes() const {
+  switch (type_) {
+  case FbsonType::T_Null:
+  case FbsonType::T_True:
+  case FbsonType::T_False: {
+    return sizeof(type_);
+  }
+
+  case FbsonType::T_Int8: {
+    return sizeof(type_) + sizeof(int8_t);
+  }
+  case FbsonType::T_Int16: {
+    return sizeof(type_) + sizeof(int16_t);
+  }
+  case FbsonType::T_Int32: {
+    return sizeof(type_) + sizeof(int32_t);
+  }
+  case FbsonType::T_Int64: {
+    return sizeof(type_) + sizeof(int64_t);
+  }
+  case FbsonType::T_Double: {
+    return sizeof(type_) + sizeof(double);
+  }
+  case FbsonType::T_String:
+  case FbsonType::T_Binary: {
+    return ((BlobVal*)(this))->numPackedBytes();
+  }
+
+  case FbsonType::T_Object:
+  case FbsonType::T_Array: {
+    return ((ContainerVal*)(this))->numPackedBytes();
+  }
+  default:
+    return 0;
+  }
+}
+
+inline unsigned int FbsonValue::size() const {
+  switch (type_) {
+  case FbsonType::T_Int8: {
+    return sizeof(int8_t);
+  }
+  case FbsonType::T_Int16: {
+    return sizeof(int16_t);
+  }
+  case FbsonType::T_Int32: {
+    return sizeof(int32_t);
+  }
+  case FbsonType::T_Int64: {
+    return sizeof(int64_t);
+  }
+  case FbsonType::T_Double: {
+    return sizeof(double);
+  }
+  case FbsonType::T_String:
+  case FbsonType::T_Binary: {
+    return ((BlobVal*)(this))->getBlobLen();
+  }
+
+  case FbsonType::T_Object:
+  case FbsonType::T_Array: {
+    return ((ContainerVal*)(this))->getContainerSize();
+  }
+  case FbsonType::T_Null:
+  case FbsonType::T_True:
+  case FbsonType::T_False:
+  default:
+    return 0;
+  }
+}
+
// Pointer to this value's raw payload bytes (just past the type byte), or
// nullptr for types that carry no payload.
inline const char* FbsonValue::getValuePtr() const {
  switch (type_) {
  // numeric scalars: payload starts immediately after the type byte
  case FbsonType::T_Int8:
  case FbsonType::T_Int16:
  case FbsonType::T_Int32:
  case FbsonType::T_Int64:
  case FbsonType::T_Double:
    return ((char*)this) + sizeof(FbsonType);

  // blobs: skip the length prefix and return the raw bytes
  case FbsonType::T_String:
  case FbsonType::T_Binary:
    return ((BlobVal*)(this))->getBlob();

  // containers: return the packed element/pair region
  case FbsonType::T_Object:
  case FbsonType::T_Array:
    return ((ContainerVal*)(this))->getPayload();

  case FbsonType::T_Null:
  case FbsonType::T_True:
  case FbsonType::T_False:
  default:
    return nullptr;
  }
}
+
// Walk a delimited key path (e.g. "a.b.2.c") starting at this value and
// return the value it names, or nullptr if any segment is missing or
// invalid.  Object segments are looked up as keys (optionally via the
// dictionary handler); array segments must be decimal indices.
// NOTE(review): only the FIRST character of `delim` is compared in the
// scan below, so delimiters are effectively single-character — confirm
// callers never rely on multi-character delimiters.
inline FbsonValue* FbsonValue::findPath(const char* key_path,
                                        unsigned int kp_len,
                                        const char* delim = ".",
                                        hDictFind handler = nullptr) {
  if (!key_path || !kp_len)
    return nullptr;

  if (!delim)
    delim = "."; // default delimiter

  FbsonValue* pval = this;
  const char* fence = key_path + kp_len;
  char idx_buf[21]; // buffer to parse array index (integer value)

  while (pval && key_path < fence) {
    const char* key = key_path;
    unsigned int klen = 0;
    // find the current key
    for (; key_path != fence && *key_path != *delim; ++key_path, ++klen)
      ;

    // empty segment: leading delimiter or two delimiters in a row
    if (!klen)
      return nullptr;

    switch (pval->type_) {
    case FbsonType::T_Object: {
      pval = ((ObjectVal*)pval)->find(key, klen, handler);
      break;
    }

    case FbsonType::T_Array: {
      // parse string into an integer (array index)
      if (klen >= sizeof(idx_buf))
        return nullptr;

      memcpy(idx_buf, key, klen);
      idx_buf[klen] = 0;

      char* end = nullptr;
      int index = (int)strtol(idx_buf, &end, 10);
      if (end && !*end)
        pval = ((fbson::ArrayVal*)pval)->get(index);
      else
        // incorrect index string
        return nullptr;
      break;
    }

    // scalars cannot be descended into
    default:
      return nullptr;
    }

    // skip the delimiter
    if (key_path < fence) {
      ++key_path;
      if (key_path == fence)
        // we have a trailing delimiter at the end
        return nullptr;
    }
  }

  return pval;
}
+
+#pragma pack(pop)
+
+} // namespace fbson
+
+#endif // FBSON_FBSONDOCUMENT_H
diff --git a/src/rocksdb/third-party/fbson/FbsonJsonParser.h b/src/rocksdb/third-party/fbson/FbsonJsonParser.h
new file mode 100644
index 0000000..3525b68
--- /dev/null
+++ b/src/rocksdb/third-party/fbson/FbsonJsonParser.h
@@ -0,0 +1,746 @@
+/*
+ *  Copyright (c) 2014, Facebook, Inc.
+ *  All rights reserved.
+ *
+ *  This source code is licensed under the BSD-style license found in the
+ *  LICENSE file in the root directory of this source tree. An additional grant
+ *  of patent rights can be found in the PATENTS file in the same directory.
+ *
+ */
+
+/*
+ * This file defines FbsonJsonParserT (template) and FbsonJsonParser.
+ *
+ * FbsonJsonParserT is a template class which implements a JSON parser.
+ * FbsonJsonParserT parses JSON text, and serialize it to FBSON binary format
+ * by using FbsonWriterT object. By default, FbsonJsonParserT creates a new
+ * FbsonWriterT object with an output stream object.  However, you can also
+ * pass in your FbsonWriterT or any stream object that implements some basic
+ * interface of std::ostream (see FbsonStream.h).
+ *
+ * FbsonJsonParser specializes FbsonJsonParserT with FbsonOutStream type (see
+ * FbsonStream.h). So unless you want to provide your own output stream
+ * type, use the FbsonJsonParser object.
+ *
+ * ** Parsing JSON **
+ * FbsonJsonParserT parses JSON string, and directly serializes into FBSON
+ * packed bytes. There are three ways to parse a JSON string: (1) using
+ * c-string, (2) using string with len, (3) using std::istream object. You can
+ * use a custom streambuf to redirect output. FbsonInBuffer is a streambuf
+ * used internally if the input is a raw character buffer.
+ *
+ * You can reuse an FbsonJsonParserT object to parse/serialize multiple JSON
+ * strings, and the previous FBSON will be overwritten.
+ *
+ * If parsing fails (returned false), the error code will be set to one of
+ * FbsonErrType, and can be retrieved by calling getErrorCode().
+ *
+ * ** External dictionary **
+ * During parsing a JSON string, you can pass a call-back function to map a key
+ * string to an id, and store the dictionary id in FBSON to save space. The
+ * purpose of using an external dictionary is more towards a collection of
+ * documents (which has common keys) rather than a single document, so that
+ * space saving will be significant.
+ *
+ * ** Endianness **
+ * Note: FBSON serialization doesn't assume endianness of the server. However
+ * you will need to ensure that the endianness at the reader side is the same
+ * as that at the writer side (if they are on different machines). Otherwise,
+ * proper conversion is needed when a number value is returned to the
+ * caller/writer.
+ *
+ * @author Tian Xia <tianx at fb.com>
+ */
+
+#ifndef FBSON_FBSONPARSER_H
+#define FBSON_FBSONPARSER_H
+
+#include <cmath>
+#include <limits>
+#include "FbsonDocument.h"
+#include "FbsonWriter.h"
+
+namespace fbson {
+
// Characters that terminate a JSON scalar token (value delimiters plus
// whitespace); kWhiteSpace is the subset skipped by the parser's trim().
const char* const kJsonDelim = " ,]}\t\r\n";
const char* const kWhiteSpace = " \t\n\r";
+
+/*
+ * Error codes
+ */
// Parser error codes; retrieve via FbsonJsonParserT::getErrorCode() after a
// failed parse.
enum class FbsonErrType {
  E_NONE = 0,
  E_INVALID_VER,       // FBSON version byte mismatch
  E_EMPTY_STR,         // null or zero-length input
  E_OUTPUT_FAIL,       // the output writer rejected a write
  E_INVALID_DOCU,      // root not an object/array, or trailing junk after it
  E_INVALID_VALUE,     // malformed null/true/false literal
  E_INVALID_KEY,       // malformed or missing object key
  E_INVALID_STR,       // unterminated string
  E_INVALID_OBJ,       // malformed object (missing ':', ',' or '}')
  E_INVALID_ARR,       // malformed array (missing ',' or ']')
  E_INVALID_HEX,       // non-hex digit in a 0x literal
  E_INVALID_OCTAL,     // non-octal digit in a 0-prefixed literal
  E_INVALID_DECIMAL,   // unexpected character in a decimal literal
  E_INVALID_EXPONENT,  // non-digit in an exponent
  E_HEX_OVERFLOW,      // more than 16 hex digits
  E_OCTAL_OVERFLOW,    // octal value overflows int64
  E_DECIMAL_OVERFLOW,  // integer literal too large for int64
  E_DOUBLE_OVERFLOW,   // double became inf/NaN after exponent scaling
  E_EXPONENT_OVERFLOW, // |exponent| > 308
};
+
+/*
+ * Template FbsonJsonParserT
+ */
+template <class OS_TYPE>
+class FbsonJsonParserT {
+ public:
+  FbsonJsonParserT() : err_(FbsonErrType::E_NONE) {}
+
+  explicit FbsonJsonParserT(OS_TYPE& os)
+      : writer_(os), err_(FbsonErrType::E_NONE) {}
+
+  // parse a UTF-8 JSON string
+  bool parse(const std::string& str, hDictInsert handler = nullptr) {
+    return parse(str.c_str(), (unsigned int)str.size(), handler);
+  }
+
+  // parse a UTF-8 JSON c-style string (NULL terminated)
+  bool parse(const char* c_str, hDictInsert handler = nullptr) {
+    return parse(c_str, (unsigned int)strlen(c_str), handler);
+  }
+
+  // parse a UTF-8 JSON string with length
+  bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) {
+    if (!pch || len == 0) {
+      err_ = FbsonErrType::E_EMPTY_STR;
+      return false;
+    }
+
+    FbsonInBuffer sb(pch, len);
+    std::istream in(&sb);
+    return parse(in, handler);
+  }
+
+  // parse UTF-8 JSON text from an input stream
+  bool parse(std::istream& in, hDictInsert handler = nullptr) {
+    bool res = false;
+
+    // reset output stream
+    writer_.reset();
+
+    trim(in);
+
+    if (in.peek() == '{') {
+      in.ignore();
+      res = parseObject(in, handler);
+    } else if (in.peek() == '[') {
+      in.ignore();
+      res = parseArray(in, handler);
+    } else {
+      err_ = FbsonErrType::E_INVALID_DOCU;
+    }
+
+    trim(in);
+    if (res && !in.eof()) {
+      err_ = FbsonErrType::E_INVALID_DOCU;
+      return false;
+    }
+
+    return res;
+  }
+
+  FbsonWriterT<OS_TYPE>& getWriter() { return writer_; }
+
+  FbsonErrType getErrorCode() { return err_; }
+
+  // clear error code
+  void clearErr() { err_ = FbsonErrType::E_NONE; }
+
+ private:
+  // parse a JSON object (comma-separated list of key-value pairs)
+  bool parseObject(std::istream& in, hDictInsert handler) {
+    if (!writer_.writeStartObject()) {
+      err_ = FbsonErrType::E_OUTPUT_FAIL;
+      return false;
+    }
+
+    trim(in);
+
+    if (in.peek() == '}') {
+      in.ignore();
+      // empty object
+      if (!writer_.writeEndObject()) {
+        err_ = FbsonErrType::E_OUTPUT_FAIL;
+        return false;
+      }
+      return true;
+    }
+
+    while (in.good()) {
+      if (in.get() != '"') {
+        err_ = FbsonErrType::E_INVALID_KEY;
+        return false;
+      }
+
+      if (!parseKVPair(in, handler)) {
+        return false;
+      }
+
+      trim(in);
+
+      char ch = in.get();
+      if (ch == '}') {
+        // end of the object
+        if (!writer_.writeEndObject()) {
+          err_ = FbsonErrType::E_OUTPUT_FAIL;
+          return false;
+        }
+        return true;
+      } else if (ch != ',') {
+        err_ = FbsonErrType::E_INVALID_OBJ;
+        return false;
+      }
+
+      trim(in);
+    }
+
+    err_ = FbsonErrType::E_INVALID_OBJ;
+    return false;
+  }
+
+  // parse a JSON array (comma-separated list of values)
+  bool parseArray(std::istream& in, hDictInsert handler) {
+    if (!writer_.writeStartArray()) {
+      err_ = FbsonErrType::E_OUTPUT_FAIL;
+      return false;
+    }
+
+    trim(in);
+
+    if (in.peek() == ']') {
+      in.ignore();
+      // empty array
+      if (!writer_.writeEndArray()) {
+        err_ = FbsonErrType::E_OUTPUT_FAIL;
+        return false;
+      }
+      return true;
+    }
+
+    while (in.good()) {
+      if (!parseValue(in, handler)) {
+        return false;
+      }
+
+      trim(in);
+
+      char ch = in.get();
+      if (ch == ']') {
+        // end of the array
+        if (!writer_.writeEndArray()) {
+          err_ = FbsonErrType::E_OUTPUT_FAIL;
+          return false;
+        }
+        return true;
+      } else if (ch != ',') {
+        err_ = FbsonErrType::E_INVALID_ARR;
+        return false;
+      }
+
+      trim(in);
+    }
+
+    err_ = FbsonErrType::E_INVALID_ARR;
+    return false;
+  }
+
+  // parse a key-value pair, separated by ":"
+  bool parseKVPair(std::istream& in, hDictInsert handler) {
+    if (parseKey(in, handler) && parseValue(in, handler)) {
+      return true;
+    }
+
+    return false;
+  }
+
+  // parse a key (must be string)
+  bool parseKey(std::istream& in, hDictInsert handler) {
+    char key[FbsonKeyValue::sMaxKeyLen];
+    int i = 0;
+    while (in.good() && in.peek() != '"' && i < FbsonKeyValue::sMaxKeyLen) {
+      key[i++] = in.get();
+    }
+
+    if (!in.good() || in.peek() != '"' || i == 0) {
+      err_ = FbsonErrType::E_INVALID_KEY;
+      return false;
+    }
+
+    in.ignore(); // discard '"'
+
+    int key_id = -1;
+    if (handler) {
+      key_id = handler(key, i);
+    }
+
+    if (key_id < 0) {
+      writer_.writeKey(key, i);
+    } else {
+      writer_.writeKey(key_id);
+    }
+
+    trim(in);
+
+    if (in.get() != ':') {
+      err_ = FbsonErrType::E_INVALID_OBJ;
+      return false;
+    }
+
+    return true;
+  }
+
+  // parse a value
+  bool parseValue(std::istream& in, hDictInsert handler) {
+    bool res = false;
+
+    trim(in);
+
+    switch (in.peek()) {
+    case 'N':
+    case 'n': {
+      in.ignore();
+      res = parseNull(in);
+      break;
+    }
+    case 'T':
+    case 't': {
+      in.ignore();
+      res = parseTrue(in);
+      break;
+    }
+    case 'F':
+    case 'f': {
+      in.ignore();
+      res = parseFalse(in);
+      break;
+    }
+    case '"': {
+      in.ignore();
+      res = parseString(in);
+      break;
+    }
+    case '{': {
+      in.ignore();
+      res = parseObject(in, handler);
+      break;
+    }
+    case '[': {
+      in.ignore();
+      res = parseArray(in, handler);
+      break;
+    }
+    default: {
+      res = parseNumber(in);
+      break;
+    }
+    }
+
+    return res;
+  }
+
+  // parse NULL value
+  bool parseNull(std::istream& in) {
+    if (tolower(in.get()) == 'u' && tolower(in.get()) == 'l' &&
+        tolower(in.get()) == 'l') {
+      writer_.writeNull();
+      return true;
+    }
+
+    err_ = FbsonErrType::E_INVALID_VALUE;
+    return false;
+  }
+
+  // parse TRUE value
+  bool parseTrue(std::istream& in) {
+    if (tolower(in.get()) == 'r' && tolower(in.get()) == 'u' &&
+        tolower(in.get()) == 'e') {
+      writer_.writeBool(true);
+      return true;
+    }
+
+    err_ = FbsonErrType::E_INVALID_VALUE;
+    return false;
+  }
+
+  // parse FALSE value
+  bool parseFalse(std::istream& in) {
+    if (tolower(in.get()) == 'a' && tolower(in.get()) == 'l' &&
+        tolower(in.get()) == 's' && tolower(in.get()) == 'e') {
+      writer_.writeBool(false);
+      return true;
+    }
+
+    err_ = FbsonErrType::E_INVALID_VALUE;
+    return false;
+  }
+
+  // parse a string
+  bool parseString(std::istream& in) {
+    if (!writer_.writeStartString()) {
+      err_ = FbsonErrType::E_OUTPUT_FAIL;
+      return false;
+    }
+
+    bool escaped = false;
+    char buffer[4096]; // write 4KB at a time
+    int nread = 0;
+    while (in.good()) {
+      char ch = in.get();
+      if (ch != '"' || escaped) {
+        buffer[nread++] = ch;
+        if (nread == 4096) {
+          // flush buffer
+          if (!writer_.writeString(buffer, nread)) {
+            err_ = FbsonErrType::E_OUTPUT_FAIL;
+            return false;
+          }
+          nread = 0;
+        }
+        // set/reset escape
+        if (ch == '\\' || escaped) {
+          escaped = !escaped;
+        }
+      } else {
+        // write all remaining bytes in the buffer
+        if (nread > 0) {
+          if (!writer_.writeString(buffer, nread)) {
+            err_ = FbsonErrType::E_OUTPUT_FAIL;
+            return false;
+          }
+        }
+        // end writing string
+        if (!writer_.writeEndString()) {
+          err_ = FbsonErrType::E_OUTPUT_FAIL;
+          return false;
+        }
+        return true;
+      }
+    }
+
+    err_ = FbsonErrType::E_INVALID_STR;
+    return false;
+  }
+
+  // parse a number
+  // Number format can be hex, octal, or decimal (including float).
+  // Only decimal can have (+/-) sign prefix.
+  bool parseNumber(std::istream& in) {
+    bool ret = false;
+    switch (in.peek()) {
+    case '0': {
+      in.ignore();
+
+      if (in.peek() == 'x' || in.peek() == 'X') {
+        in.ignore();
+        ret = parseHex(in);
+      } else if (in.peek() == '.') {
+        in.ignore();
+        ret = parseDouble(in, 0, 0, 1);
+      } else {
+        ret = parseOctal(in);
+      }
+
+      break;
+    }
+    case '-': {
+      in.ignore();
+      ret = parseDecimal(in, -1);
+      break;
+    }
+    case '+':
+      in.ignore();
+    // fall through
+    default:
+      ret = parseDecimal(in, 1);
+      break;
+    }
+
+    return ret;
+  }
+
+  // parse a number in hex format
+  bool parseHex(std::istream& in) {
+    uint64_t val = 0;
+    int num_digits = 0;
+    char ch = tolower(in.peek());
+    while (in.good() && !strchr(kJsonDelim, ch) && (++num_digits) <= 16) {
+      if (ch >= '0' && ch <= '9') {
+        val = (val << 4) + (ch - '0');
+      } else if (ch >= 'a' && ch <= 'f') {
+        val = (val << 4) + (ch - 'a' + 10);
+      } else { // unrecognized hex digit
+        err_ = FbsonErrType::E_INVALID_HEX;
+        return false;
+      }
+
+      in.ignore();
+      ch = tolower(in.peek());
+    }
+
+    int size = 0;
+    if (num_digits <= 2) {
+      size = writer_.writeInt8((int8_t)val);
+    } else if (num_digits <= 4) {
+      size = writer_.writeInt16((int16_t)val);
+    } else if (num_digits <= 8) {
+      size = writer_.writeInt32((int32_t)val);
+    } else if (num_digits <= 16) {
+      size = writer_.writeInt64(val);
+    } else {
+      err_ = FbsonErrType::E_HEX_OVERFLOW;
+      return false;
+    }
+
+    if (size == 0) {
+      err_ = FbsonErrType::E_OUTPUT_FAIL;
+      return false;
+    }
+
+    return true;
+  }
+
+  // parse a number in octal format
+  bool parseOctal(std::istream& in) {
+    int64_t val = 0;
+    char ch = in.peek();
+    while (in.good() && !strchr(kJsonDelim, ch)) {
+      if (ch >= '0' && ch <= '7') {
+        val = val * 8 + (ch - '0');
+      } else {
+        err_ = FbsonErrType::E_INVALID_OCTAL;
+        return false;
+      }
+
+      // check if the number overflows
+      if (val < 0) {
+        err_ = FbsonErrType::E_OCTAL_OVERFLOW;
+        return false;
+      }
+
+      in.ignore();
+      ch = in.peek();
+    }
+
+    int size = 0;
+    if (val <= std::numeric_limits<int8_t>::max()) {
+      size = writer_.writeInt8((int8_t)val);
+    } else if (val <= std::numeric_limits<int16_t>::max()) {
+      size = writer_.writeInt16((int16_t)val);
+    } else if (val <= std::numeric_limits<int32_t>::max()) {
+      size = writer_.writeInt32((int32_t)val);
+    } else { // val <= INT64_MAX
+      size = writer_.writeInt64(val);
+    }
+
+    if (size == 0) {
+      err_ = FbsonErrType::E_OUTPUT_FAIL;
+      return false;
+    }
+
+    return true;
+  }
+
+  // parse a number in decimal (including float)
+  bool parseDecimal(std::istream& in, int sign) {
+    int64_t val = 0;
+    int precision = 0;
+
+    char ch = 0;
+    while (in.good() && (ch = in.peek()) == '0')
+      in.ignore();
+
+    while (in.good() && !strchr(kJsonDelim, ch)) {
+      if (ch >= '0' && ch <= '9') {
+        val = val * 10 + (ch - '0');
+        ++precision;
+      } else if (ch == '.') {
+        // note we don't pop out '.'
+        return parseDouble(in, val, precision, sign);
+      } else {
+        err_ = FbsonErrType::E_INVALID_DECIMAL;
+        return false;
+      }
+
+      in.ignore();
+
+      // if the number overflows int64_t, first parse it as double iff we see a
+      // decimal point later. Otherwise, will treat it as overflow
+      if (val < 0 && val > std::numeric_limits<int64_t>::min()) {
+        return parseDouble(in, (uint64_t)val, precision, sign);
+      }
+
+      ch = in.peek();
+    }
+
+    if (sign < 0) {
+      val = -val;
+    }
+
+    int size = 0;
+    if (val >= std::numeric_limits<int8_t>::min() &&
+        val <= std::numeric_limits<int8_t>::max()) {
+      size = writer_.writeInt8((int8_t)val);
+    } else if (val >= std::numeric_limits<int16_t>::min() &&
+               val <= std::numeric_limits<int16_t>::max()) {
+      size = writer_.writeInt16((int16_t)val);
+    } else if (val >= std::numeric_limits<int32_t>::min() &&
+               val <= std::numeric_limits<int32_t>::max()) {
+      size = writer_.writeInt32((int32_t)val);
+    } else { // val <= INT64_MAX
+      size = writer_.writeInt64(val);
+    }
+
+    if (size == 0) {
+      err_ = FbsonErrType::E_OUTPUT_FAIL;
+      return false;
+    }
+
+    return true;
+  }
+
+  // parse IEEE745 double precision:
+  // Significand precision length - 15
+  // Maximum exponent value - 308
+  //
+  // "If a decimal string with at most 15 significant digits is converted to
+  // IEEE 754 double precision representation and then converted back to a
+  // string with the same number of significant digits, then the final string
+  // should match the original"
+  bool parseDouble(std::istream& in, double val, int precision, int sign) {
+    int integ = precision;
+    int frac = 0;
+    bool is_frac = false;
+
+    char ch = in.peek();
+    if (ch == '.') {
+      is_frac = true;
+      in.ignore();
+      ch = in.peek();
+    }
+
+    int exp = 0;
+    while (in.good() && !strchr(kJsonDelim, ch)) {
+      if (ch >= '0' && ch <= '9') {
+        if (precision < 15) {
+          val = val * 10 + (ch - '0');
+          if (is_frac) {
+            ++frac;
+          } else {
+            ++integ;
+          }
+          ++precision;
+        } else if (!is_frac) {
+          ++exp;
+        }
+      } else if (ch == 'e' || ch == 'E') {
+        in.ignore();
+        int exp2;
+        if (!parseExponent(in, exp2)) {
+          return false;
+        }
+
+        exp += exp2;
+        // check if exponent overflows
+        if (exp > 308 || exp < -308) {
+          err_ = FbsonErrType::E_EXPONENT_OVERFLOW;
+          return false;
+        }
+
+        is_frac = true;
+        break;
+      }
+
+      in.ignore();
+      ch = in.peek();
+    }
+
+    if (!is_frac) {
+      err_ = FbsonErrType::E_DECIMAL_OVERFLOW;
+      return false;
+    }
+
+    val *= std::pow(10, exp - frac);
+    if (std::isnan(val) || std::isinf(val)) {
+      err_ = FbsonErrType::E_DOUBLE_OVERFLOW;
+      return false;
+    }
+
+    if (sign < 0) {
+      val = -val;
+    }
+
+    if (writer_.writeDouble(val) == 0) {
+      err_ = FbsonErrType::E_OUTPUT_FAIL;
+      return false;
+    }
+
+    return true;
+  }
+
+  // parse the exponent part of a double number
+  bool parseExponent(std::istream& in, int& exp) {
+    bool neg = false;
+
+    char ch = in.peek();
+    if (ch == '+') {
+      in.ignore();
+      ch = in.peek();
+    } else if (ch == '-') {
+      neg = true;
+      in.ignore();
+      ch = in.peek();
+    }
+
+    exp = 0;
+    while (in.good() && !strchr(kJsonDelim, ch)) {
+      if (ch >= '0' && ch <= '9') {
+        exp = exp * 10 + (ch - '0');
+      } else {
+        err_ = FbsonErrType::E_INVALID_EXPONENT;
+        return false;
+      }
+
+      if (exp > 308) {
+        err_ = FbsonErrType::E_EXPONENT_OVERFLOW;
+        return false;
+      }
+
+      in.ignore();
+      ch = in.peek();
+    }
+
+    if (neg) {
+      exp = -exp;
+    }
+
+    return true;
+  }
+
+  void trim(std::istream& in) {
+    while (in.good() && strchr(kWhiteSpace, in.peek())) {
+      in.ignore();
+    }
+  }
+
+ private:
+  FbsonWriterT<OS_TYPE> writer_;
+  FbsonErrType err_;
+};
+
+typedef FbsonJsonParserT<FbsonOutStream> FbsonJsonParser;
+
+} // namespace fbson
+
+#endif // FBSON_FBSONPARSER_H
diff --git a/src/rocksdb/third-party/fbson/FbsonStream.h b/src/rocksdb/third-party/fbson/FbsonStream.h
new file mode 100644
index 0000000..6ac132b
--- /dev/null
+++ b/src/rocksdb/third-party/fbson/FbsonStream.h
@@ -0,0 +1,183 @@
+/*
+ *  Copyright (c) 2014, Facebook, Inc.
+ *  All rights reserved.
+ *
+ *  This source code is licensed under the BSD-style license found in the
+ *  LICENSE file in the root directory of this source tree. An additional grant
+ *  of patent rights can be found in the PATENTS file in the same directory.
+ *
+ */
+
+/*
+ * This header file defines FbsonInBuffer and FbsonOutStream classes.
+ *
+ * ** Input Buffer **
+ * FbsonInBuffer is a custom input buffer that wraps a raw character buffer.
+ * Its object instances are used to create std::istream objects internally.
+ *
+ * ** Output Stream **
+ * FbsonOutStream is a custom output stream class that contains the FBSON
+ * serialized binary. The class is conveniently used to specialize templates of
+ * FbsonParser and FbsonWriter.
+ *
+ * @author Tian Xia <tianx at fb.com>
+ */
+
+#ifndef FBSON_FBSONSTREAM_H
+#define FBSON_FBSONSTREAM_H
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <iostream>
+
+namespace fbson {
+
// lengths includes sign
// Worst-case decimal-string lengths used to reserve snprintf space in the
// FbsonOutStream::write() overloads below.
#define MAX_INT_DIGITS 11
#define MAX_INT64_DIGITS 20
#define MAX_DOUBLE_DIGITS 23 // 1(sign)+16(significant)+1(decimal)+5(exponent)
+
+/*
+ * FBSON's implementation of input buffer
+ */
class FbsonInBuffer : public std::streambuf {
 public:
  // Expose the read-only character range [str, str + len) as a stream
  // buffer, so it can back a std::istream without copying.
  FbsonInBuffer(const char* str, uint32_t len) {
    // setg() requires non-const pointers even though this buffer is only
    // ever read; the const_cast is safe because no writes occur.
    char* begin = const_cast<char*>(str);
    char* end = begin + len;
    setg(begin, begin, end);
  }
};
+
+/*
+ * FBSON's implementation of output stream.
+ *
+ * This is a wrapper of a char buffer. By default, the buffer capacity is 1024
+ * bytes. We will double the buffer if realloc is needed for writes.
+ */
+class FbsonOutStream : public std::ostream {
+ public:
+  explicit FbsonOutStream(uint32_t capacity = 1024)
+      : std::ostream(nullptr),
+        head_(nullptr),
+        size_(0),
+        capacity_(capacity),
+        alloc_(true) {
+    if (capacity_ == 0) {
+      capacity_ = 1024;
+    }
+
+    head_ = (char*)malloc(capacity_);
+  }
+
+  FbsonOutStream(char* buffer, uint32_t capacity)
+      : std::ostream(nullptr),
+        head_(buffer),
+        size_(0),
+        capacity_(capacity),
+        alloc_(false) {
+    assert(buffer && capacity_ > 0);
+  }
+
+  ~FbsonOutStream() {
+    if (alloc_) {
+      free(head_);
+    }
+  }
+
+  void put(char c) { write(&c, 1); }
+
+  void write(const char* c_str) { write(c_str, (uint32_t)strlen(c_str)); }
+
+  void write(const char* bytes, uint32_t len) {
+    if (len == 0)
+      return;
+
+    if (size_ + len > capacity_) {
+      realloc(len);
+    }
+
+    memcpy(head_ + size_, bytes, len);
+    size_ += len;
+  }
+
+  // write the integer to string
+  void write(int i) {
+    // snprintf automatically adds a NULL, so we need one more char
+    if (size_ + MAX_INT_DIGITS + 1 > capacity_) {
+      realloc(MAX_INT_DIGITS + 1);
+    }
+
+    int len = snprintf(head_ + size_, MAX_INT_DIGITS + 1, "%d", i);
+    assert(len > 0);
+    size_ += len;
+  }
+
+  // write the 64bit integer to string
+  void write(int64_t l) {
+    // snprintf automatically adds a NULL, so we need one more char
+    if (size_ + MAX_INT64_DIGITS + 1 > capacity_) {
+      realloc(MAX_INT64_DIGITS + 1);
+    }
+
+    int len = snprintf(head_ + size_, MAX_INT64_DIGITS + 1, "%" PRIi64, l);
+    assert(len > 0);
+    size_ += len;
+  }
+
+  // write the double to string
+  void write(double d) {
+    // snprintf automatically adds a NULL, so we need one more char
+    if (size_ + MAX_DOUBLE_DIGITS + 1 > capacity_) {
+      realloc(MAX_DOUBLE_DIGITS + 1);
+    }
+
+    int len = snprintf(head_ + size_, MAX_DOUBLE_DIGITS + 1, "%.15g", d);
+    assert(len > 0);
+    size_ += len;
+  }
+
+  pos_type tellp() const { return size_; }
+
+  void seekp(pos_type pos) { size_ = (uint32_t)pos; }
+
+  const char* getBuffer() const { return head_; }
+
+  pos_type getSize() const { return tellp(); }
+
+ private:
+  void realloc(uint32_t len) {
+    assert(capacity_ > 0);
+
+    capacity_ *= 2;
+    while (capacity_ < size_ + len) {
+      capacity_ *= 2;
+    }
+
+    if (alloc_) {
+      char* new_buf = (char*)::realloc(head_, capacity_);
+      assert(new_buf);
+      head_ = new_buf;
+    } else {
+      char* new_buf = (char*)::malloc(capacity_);
+      assert(new_buf);
+      memcpy(new_buf, head_, size_);
+      head_ = new_buf;
+      alloc_ = true;
+    }
+  }
+
+ private:
+  char* head_;
+  uint32_t size_;
+  uint32_t capacity_;
+  bool alloc_;
+};
+
+} // namespace fbson
+
+#endif // FBSON_FBSONSTREAM_H
diff --git a/src/rocksdb/third-party/fbson/FbsonUtil.h b/src/rocksdb/third-party/fbson/FbsonUtil.h
new file mode 100644
index 0000000..ab96563
--- /dev/null
+++ b/src/rocksdb/third-party/fbson/FbsonUtil.h
@@ -0,0 +1,168 @@
+/*
+ *  Copyright (c) 2014, Facebook, Inc.
+ *  All rights reserved.
+ *
+ *  This source code is licensed under the BSD-style license found in the
+ *  LICENSE file in the root directory of this source tree. An additional grant
+ *  of patent rights can be found in the PATENTS file in the same directory.
+ *
+ */
+
+/*
+ * This header file defines miscellaneous utility classes.
+ *
+ * @author Tian Xia <tianx at fb.com>
+ */
+
+#ifndef FBSON_FBSONUTIL_H
+#define FBSON_FBSONUTIL_H
+
+#include <sstream>
+#include "FbsonDocument.h"
+
+namespace fbson {
+
+#define OUT_BUF_SIZE 1024
+
+/*
+ * FbsonToJson converts an FbsonValue object to a JSON string.
+ */
+class FbsonToJson {
+ public:
+  FbsonToJson() : os_(buffer_, OUT_BUF_SIZE) {}
+
+  // get json string
+  const char* json(const FbsonValue* pval) {
+    os_.clear();
+    os_.seekp(0);
+
+    if (pval) {
+      intern_json(pval);
+    }
+
+    os_.put(0);
+    return os_.getBuffer();
+  }
+
+ private:
+  // recursively convert FbsonValue
+  void intern_json(const FbsonValue* val) {
+    switch (val->type()) {
+    case FbsonType::T_Null: {
+      os_.write("null", 4);
+      break;
+    }
+    case FbsonType::T_True: {
+      os_.write("true", 4);
+      break;
+    }
+    case FbsonType::T_False: {
+      os_.write("false", 5);
+      break;
+    }
+    case FbsonType::T_Int8: {
+      os_.write(((Int8Val*)val)->val());
+      break;
+    }
+    case FbsonType::T_Int16: {
+      os_.write(((Int16Val*)val)->val());
+      break;
+    }
+    case FbsonType::T_Int32: {
+      os_.write(((Int32Val*)val)->val());
+      break;
+    }
+    case FbsonType::T_Int64: {
+      os_.write(((Int64Val*)val)->val());
+      break;
+    }
+    case FbsonType::T_Double: {
+      os_.write(((DoubleVal*)val)->val());
+      break;
+    }
+    case FbsonType::T_String: {
+      os_.put('"');
+      os_.write(((StringVal*)val)->getBlob(), ((StringVal*)val)->getBlobLen());
+      os_.put('"');
+      break;
+    }
+    case FbsonType::T_Binary: {
+      os_.write("\"<BINARY>", 9);
+      os_.write(((BinaryVal*)val)->getBlob(), ((BinaryVal*)val)->getBlobLen());
+      os_.write("<BINARY>\"", 9);
+      break;
+    }
+    case FbsonType::T_Object: {
+      object_to_json((ObjectVal*)val);
+      break;
+    }
+    case FbsonType::T_Array: {
+      array_to_json((ArrayVal*)val);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
+  // convert object
+  void object_to_json(const ObjectVal* val) {
+    os_.put('{');
+
+    auto iter = val->begin();
+    auto iter_fence = val->end();
+
+    while (iter < iter_fence) {
+      // write key
+      if (iter->klen()) {
+        os_.put('"');
+        os_.write(iter->getKeyStr(), iter->klen());
+        os_.put('"');
+      } else {
+        os_.write(iter->getKeyId());
+      }
+      os_.put(':');
+
+      // convert value
+      intern_json(iter->value());
+
+      ++iter;
+      if (iter != iter_fence) {
+        os_.put(',');
+      }
+    }
+
+    assert(iter == iter_fence);
+
+    os_.put('}');
+  }
+
+  // convert array to json
+  void array_to_json(const ArrayVal* val) {
+    os_.put('[');
+
+    auto iter = val->begin();
+    auto iter_fence = val->end();
+
+    while (iter != iter_fence) {
+      // convert value
+      intern_json((const FbsonValue*)iter);
+      ++iter;
+      if (iter != iter_fence) {
+        os_.put(',');
+      }
+    }
+
+    assert(iter == iter_fence);
+
+    os_.put(']');
+  }
+
+ private:
+  FbsonOutStream os_;
+  char buffer_[OUT_BUF_SIZE];
+};
+
+} // namespace fbson
+
+#endif // FBSON_FBSONUTIL_H
diff --git a/src/rocksdb/third-party/fbson/FbsonWriter.h b/src/rocksdb/third-party/fbson/FbsonWriter.h
new file mode 100644
index 0000000..21bd6f2
--- /dev/null
+++ b/src/rocksdb/third-party/fbson/FbsonWriter.h
@@ -0,0 +1,435 @@
+/*
+ *  Copyright (c) 2014, Facebook, Inc.
+ *  All rights reserved.
+ *
+ *  This source code is licensed under the BSD-style license found in the
+ *  LICENSE file in the root directory of this source tree. An additional grant
+ *  of patent rights can be found in the PATENTS file in the same directory.
+ *
+ */
+
+/*
+ * This file defines FbsonWriterT (template) and FbsonWriter.
+ *
+ * FbsonWriterT is a template class which implements an FBSON serializer.
+ * Users call various write functions of FbsonWriterT object to write values
+ * directly to FBSON packed bytes. All write functions of value or key return
+ * the number of bytes written to FBSON, or 0 if there is an error. To write an
+ * object, an array, or a string, you must call writeStart[..] before writing
+ * values or key, and call writeEnd[..] after finishing at the end.
+ *
+ * By default, an FbsonWriterT object creates an output stream buffer.
+ * Alternatively, you can also pass any output stream object to a writer, as
+ * long as the stream object implements some basic functions of std::ostream
+ * (such as FbsonOutStream, see FbsonStream.h).
+ *
+ * FbsonWriter specializes FbsonWriterT with FbsonOutStream type (see
+ * FbsonStream.h). So unless you want to provide own a different output stream
+ * type, use FbsonParser object.
+ *
+ * @author Tian Xia <tianx at fb.com>
+ */
+
+#ifndef FBSON_FBSONWRITER_H
+#define FBSON_FBSONWRITER_H
+
+#include <stack>
+#include "FbsonDocument.h"
+#include "FbsonStream.h"
+
+namespace fbson {
+
+template <class OS_TYPE>
+class FbsonWriterT {
+ public:
+  FbsonWriterT()
+      : alloc_(true), hasHdr_(false), kvState_(WS_Value), str_pos_(0) {
+    os_ = new OS_TYPE();
+  }
+
+  explicit FbsonWriterT(OS_TYPE& os)
+      : os_(&os),
+        alloc_(false),
+        hasHdr_(false),
+        kvState_(WS_Value),
+        str_pos_(0) {}
+
+  ~FbsonWriterT() {
+    if (alloc_) {
+      delete os_;
+    }
+  }
+
+  void reset() {
+    os_->clear();
+    os_->seekp(0);
+    hasHdr_ = false;
+    kvState_ = WS_Value;
+    for (; !stack_.empty(); stack_.pop())
+      ;
+  }
+
+  // write a key string (or key id if an external dict is provided)
+  uint32_t writeKey(const char* key,
+                    uint8_t len,
+                    hDictInsert handler = nullptr) {
+    if (len && !stack_.empty() && verifyKeyState()) {
+      int key_id = -1;
+      if (handler) {
+        key_id = handler(key, len);
+      }
+
+      uint32_t size = sizeof(uint8_t);
+      if (key_id < 0) {
+        os_->put(len);
+        os_->write(key, len);
+        size += len;
+      } else if (key_id <= FbsonKeyValue::sMaxKeyId) {
+        FbsonKeyValue::keyid_type idx = key_id;
+        os_->put(0);
+        os_->write((char*)&idx, sizeof(FbsonKeyValue::keyid_type));
+        size += sizeof(FbsonKeyValue::keyid_type);
+      } else { // key id overflow
+        assert(0);
+        return 0;
+      }
+
+      kvState_ = WS_Key;
+      return size;
+    }
+
+    return 0;
+  }
+
+  // write a key id
+  uint32_t writeKey(FbsonKeyValue::keyid_type idx) {
+    if (!stack_.empty() && verifyKeyState()) {
+      os_->put(0);
+      os_->write((char*)&idx, sizeof(FbsonKeyValue::keyid_type));
+      kvState_ = WS_Key;
+      return sizeof(uint8_t) + sizeof(FbsonKeyValue::keyid_type);
+    }
+
+    return 0;
+  }
+
+  uint32_t writeNull() {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_Null);
+      kvState_ = WS_Value;
+      return sizeof(FbsonValue);
+    }
+
+    return 0;
+  }
+
+  uint32_t writeBool(bool b) {
+    if (!stack_.empty() && verifyValueState()) {
+      if (b) {
+        os_->put((FbsonTypeUnder)FbsonType::T_True);
+      } else {
+        os_->put((FbsonTypeUnder)FbsonType::T_False);
+      }
+
+      kvState_ = WS_Value;
+      return sizeof(FbsonValue);
+    }
+
+    return 0;
+  }
+
+  uint32_t writeInt8(int8_t v) {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_Int8);
+      os_->put(v);
+      kvState_ = WS_Value;
+      return sizeof(Int8Val);
+    }
+
+    return 0;
+  }
+
+  uint32_t writeInt16(int16_t v) {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_Int16);
+      os_->write((char*)&v, sizeof(int16_t));
+      kvState_ = WS_Value;
+      return sizeof(Int16Val);
+    }
+
+    return 0;
+  }
+
+  uint32_t writeInt32(int32_t v) {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_Int32);
+      os_->write((char*)&v, sizeof(int32_t));
+      kvState_ = WS_Value;
+      return sizeof(Int32Val);
+    }
+
+    return 0;
+  }
+
+  uint32_t writeInt64(int64_t v) {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_Int64);
+      os_->write((char*)&v, sizeof(int64_t));
+      kvState_ = WS_Value;
+      return sizeof(Int64Val);
+    }
+
+    return 0;
+  }
+
+  uint32_t writeDouble(double v) {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_Double);
+      os_->write((char*)&v, sizeof(double));
+      kvState_ = WS_Value;
+      return sizeof(DoubleVal);
+    }
+
+    return 0;
+  }
+
+  // must call writeStartString before writing a string val
+  bool writeStartString() {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_String);
+      str_pos_ = os_->tellp();
+
+      // fill the size bytes with 0 for now
+      uint32_t size = 0;
+      os_->write((char*)&size, sizeof(uint32_t));
+
+      kvState_ = WS_String;
+      return true;
+    }
+
+    return false;
+  }
+
+  // finish writing a string val
+  bool writeEndString() {
+    if (kvState_ == WS_String) {
+      std::streampos cur_pos = os_->tellp();
+      int32_t size = (int32_t)(cur_pos - str_pos_ - sizeof(uint32_t));
+      assert(size >= 0);
+
+      os_->seekp(str_pos_);
+      os_->write((char*)&size, sizeof(uint32_t));
+      os_->seekp(cur_pos);
+
+      kvState_ = WS_Value;
+      return true;
+    }
+
+    return false;
+  }
+
+  uint32_t writeString(const char* str, uint32_t len) {
+    if (kvState_ == WS_String) {
+      os_->write(str, len);
+      return len;
+    }
+
+    return 0;
+  }
+
+  uint32_t writeString(char ch) {
+    if (kvState_ == WS_String) {
+      os_->put(ch);
+      return 1;
+    }
+
+    return 0;
+  }
+
+  // must call writeStartBinary before writing a binary val
+  bool writeStartBinary() {
+    if (!stack_.empty() && verifyValueState()) {
+      os_->put((FbsonTypeUnder)FbsonType::T_Binary);
+      str_pos_ = os_->tellp();
+
+      // fill the size bytes with 0 for now
+      uint32_t size = 0;
+      os_->write((char*)&size, sizeof(uint32_t));
+
+      kvState_ = WS_Binary;
+      return true;
+    }
+
+    return false;
+  }
+
+  // finish writing a binary val
+  bool writeEndBinary() {
+    if (kvState_ == WS_Binary) {
+      std::streampos cur_pos = os_->tellp();
+      int32_t size = (int32_t)(cur_pos - str_pos_ - sizeof(uint32_t));
+      assert(size >= 0);
+
+      os_->seekp(str_pos_);
+      os_->write((char*)&size, sizeof(uint32_t));
+      os_->seekp(cur_pos);
+
+      kvState_ = WS_Value;
+      return true;
+    }
+
+    return false;
+  }
+
+  uint32_t writeBinary(const char* bin, uint32_t len) {
+    if (kvState_ == WS_Binary) {
+      os_->write(bin, len);
+      return len;
+    }
+
+    return 0;
+  }
+
+  // must call writeStartObject before writing an object val
+  bool writeStartObject() {
+    if (stack_.empty() || verifyValueState()) {
+      if (stack_.empty()) {
+        // if this is a new FBSON, write the header
+        if (!hasHdr_) {
+          writeHeader();
+        } else
+          return false;
+      }
+
+      os_->put((FbsonTypeUnder)FbsonType::T_Object);
+      // save the size position
+      stack_.push(WriteInfo({WS_Object, os_->tellp()}));
+
+      // fill the size bytes with 0 for now
+      uint32_t size = 0;
+      os_->write((char*)&size, sizeof(uint32_t));
+
+      kvState_ = WS_Value;
+      return true;
+    }
+
+    return false;
+  }
+
+  // finish writing an object val
+  bool writeEndObject() {
+    if (!stack_.empty() && stack_.top().state == WS_Object &&
+        kvState_ == WS_Value) {
+      WriteInfo& ci = stack_.top();
+      std::streampos cur_pos = os_->tellp();
+      int32_t size = (int32_t)(cur_pos - ci.sz_pos - sizeof(uint32_t));
+      assert(size >= 0);
+
+      os_->seekp(ci.sz_pos);
+      os_->write((char*)&size, sizeof(uint32_t));
+      os_->seekp(cur_pos);
+      stack_.pop();
+
+      return true;
+    }
+
+    return false;
+  }
+
+  // must call writeStartArray before writing an array val
+  bool writeStartArray() {
+    if (stack_.empty() || verifyValueState()) {
+      if (stack_.empty()) {
+        // if this is a new FBSON, write the header
+        if (!hasHdr_) {
+          writeHeader();
+        } else
+          return false;
+      }
+
+      os_->put((FbsonTypeUnder)FbsonType::T_Array);
+      // save the size position
+      stack_.push(WriteInfo({WS_Array, os_->tellp()}));
+
+      // fill the size bytes with 0 for now
+      uint32_t size = 0;
+      os_->write((char*)&size, sizeof(uint32_t));
+
+      kvState_ = WS_Value;
+      return true;
+    }
+
+    return false;
+  }
+
+  // finish writing an array val
+  bool writeEndArray() {
+    if (!stack_.empty() && stack_.top().state == WS_Array &&
+        kvState_ == WS_Value) {
+      WriteInfo& ci = stack_.top();
+      std::streampos cur_pos = os_->tellp();
+      int32_t size = (int32_t)(cur_pos - ci.sz_pos - sizeof(uint32_t));
+      assert(size >= 0);
+
+      os_->seekp(ci.sz_pos);
+      os_->write((char*)&size, sizeof(uint32_t));
+      os_->seekp(cur_pos);
+      stack_.pop();
+
+      return true;
+    }
+
+    return false;
+  }
+
+  OS_TYPE* getOutput() { return os_; }
+
+ private:
+  // verify we are in the right state before writing a value
+  bool verifyValueState() {
+    assert(!stack_.empty());
+    return (stack_.top().state == WS_Object && kvState_ == WS_Key) ||
+           (stack_.top().state == WS_Array && kvState_ == WS_Value);
+  }
+
+  // verify we are in the right state before writing a key
+  bool verifyKeyState() {
+    assert(!stack_.empty());
+    return stack_.top().state == WS_Object && kvState_ == WS_Value;
+  }
+
+  void writeHeader() {
+    os_->put(FBSON_VER);
+    hasHdr_ = true;
+  }
+
+ private:
+  enum WriteState {
+    WS_NONE,
+    WS_Array,
+    WS_Object,
+    WS_Key,
+    WS_Value,
+    WS_String,
+    WS_Binary,
+  };
+
+  struct WriteInfo {
+    WriteState state;
+    std::streampos sz_pos;
+  };
+
+ private:
+  OS_TYPE* os_;
+  bool alloc_;
+  bool hasHdr_;
+  WriteState kvState_; // key or value state
+  std::streampos str_pos_;
+  std::stack<WriteInfo> stack_;
+};
+
+typedef FbsonWriterT<FbsonOutStream> FbsonWriter;
+
+} // namespace fbson
+
+#endif // FBSON_FBSONWRITER_H
diff --git a/src/rocksdb/third-party/flashcache/flashcache_ioctl.h b/src/rocksdb/third-party/flashcache/flashcache_ioctl.h
new file mode 100644
index 0000000..af111ab
--- /dev/null
+++ b/src/rocksdb/third-party/flashcache/flashcache_ioctl.h
@@ -0,0 +1,55 @@
+/****************************************************************************
+ *  flashcache_ioctl.h
+ *  FlashCache: Device mapper target for block-level disk caching
+ *
+ *  Copyright 2010 Facebook, Inc.
+ *  Author: Mohan Srinivasan (mohan at facebook.com)
+ *
+ *  Based on DM-Cache:
+ *   Copyright (C) International Business Machines Corp., 2006
+ *   Author: Ming Zhao (mingzhao at ufl.edu)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; under version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#ifdef OS_LINUX
+#ifndef FLASHCACHE_IOCTL_H
+#define FLASHCACHE_IOCTL_H
+
+#include <linux/types.h>
+
+#define FLASHCACHE_IOCTL 0xfe
+
+enum {
+	FLASHCACHEADDNCPID_CMD=200,
+	FLASHCACHEDELNCPID_CMD,
+	FLASHCACHEDELNCALL_CMD,
+	FLASHCACHEADDWHITELIST_CMD,
+	FLASHCACHEDELWHITELIST_CMD,
+	FLASHCACHEDELWHITELISTALL_CMD,
+};
+
+#define FLASHCACHEADDNCPID	_IOW(FLASHCACHE_IOCTL, FLASHCACHEADDNCPID_CMD, pid_t)
+#define FLASHCACHEDELNCPID	_IOW(FLASHCACHE_IOCTL, FLASHCACHEDELNCPID_CMD, pid_t)
+#define FLASHCACHEDELNCALL	_IOW(FLASHCACHE_IOCTL, FLASHCACHEDELNCALL_CMD, pid_t)
+
+#define FLASHCACHEADDBLACKLIST		FLASHCACHEADDNCPID
+#define FLASHCACHEDELBLACKLIST		FLASHCACHEDELNCPID
+#define FLASHCACHEDELALLBLACKLIST	FLASHCACHEDELNCALL
+
+#define FLASHCACHEADDWHITELIST		_IOW(FLASHCACHE_IOCTL, FLASHCACHEADDWHITELIST_CMD, pid_t)
+#define FLASHCACHEDELWHITELIST		_IOW(FLASHCACHE_IOCTL, FLASHCACHEDELWHITELIST_CMD, pid_t)
+#define FLASHCACHEDELALLWHITELIST	_IOW(FLASHCACHE_IOCTL, FLASHCACHEDELWHITELISTALL_CMD, pid_t)
+
+#endif /* FLASHCACHE_IOCTL_H */
+#endif /* OS_LINUX */
diff --git a/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc b/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc
new file mode 100644
index 0000000..92c3a43
--- /dev/null
+++ b/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc
@@ -0,0 +1,10257 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule at google.com (Markus Heule)
+//
+// Google C++ Testing Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the second argument to the two arguments constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // The two possible mocking modes of this object.
+  enum InterceptMode {
+    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
+    INTERCEPT_ALL_THREADS           // Intercepts all failures.
+  };
+
+  // The c'tor sets this object as the test part result reporter used
+  // by Google Test.  The 'result' parameter specifies where to report the
+  // results. This reporter will only catch failures generated in the current
+  // thread. DEPRECATED
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+  // Same as above, but you can choose the interception scope of this object.
+  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+                                   TestPartResultArray* result);
+
+  // The d'tor restores the previous test part result reporter.
+  virtual ~ScopedFakeTestPartResultReporter();
+
+  // Appends the TestPartResult object to the TestPartResultArray
+  // received in the constructor.
+  //
+  // This method is from the TestPartResultReporterInterface
+  // interface.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+ private:
+  void Init();
+
+  const InterceptMode intercept_mode_;
+  TestPartResultReporterInterface* old_reporter_;
+  TestPartResultArray* const result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+  // The constructor remembers the arguments.
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type,
+                       const string& substr);
+  ~SingleFailureChecker();
+ private:
+  const TestPartResultArray* const results_;
+  const TestPartResult::Type type_;
+  const string substr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures.  It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ALL_THREADS, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures.  It asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <map>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// TODO(kenton at google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+# include <windows.h>  // NOLINT
+# undef min
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// TODO(kenton at google.com): Use autoconf to detect availability of
+//   gettimeofday().
+// TODO(kenton at google.com): There are other ways to get the time on
+//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
+//   supports these.  consider using them instead.
+#  define GTEST_HAS_GETTIMEOFDAY_ 1
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h>  // NOLINT
+# undef min
+
+#else
+
+// Assume other platforms have gettimeofday().
+// TODO(kenton at google.com): Use autoconf to detect availability of
+//   gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+# include <sys/socket.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+#endif
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// Author: wan at google.com (Zhanyong Wan)
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
+// part of Google Test's implementation; otherwise it's undefined.
+#if !GTEST_IMPLEMENTATION_
+// If this file is included from the user's code, just say no.
+# error "gtest-internal-inl.h is part of Google Test's internal implementation."
+# error "It must not be included except by Google Test itself."
+#endif  // GTEST_IMPLEMENTATION_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, Int32* value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+  const unsigned int raw_seed = (random_seed_flag == 0) ?
+      static_cast<unsigned int>(GetTimeInMillis()) :
+      static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %
+                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+  return normalized_seed;
+}
+
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+    color_ = GTEST_FLAG(color);
+    death_test_style_ = GTEST_FLAG(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    filter_ = GTEST_FLAG(filter);
+    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+    list_tests_ = GTEST_FLAG(list_tests);
+    output_ = GTEST_FLAG(output);
+    print_time_ = GTEST_FLAG(print_time);
+    random_seed_ = GTEST_FLAG(random_seed);
+    repeat_ = GTEST_FLAG(repeat);
+    shuffle_ = GTEST_FLAG(shuffle);
+    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  internal::Int32 random_seed_;
+  internal::Int32 repeat_;
+  bool shuffle_;
+  internal::Int32 stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error and
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected = begin + random->Generate(range_width);
+    std::swap((*v)[selected], (*v)[last_in_range]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+  // Returns true iff the test name of test property matches on key_.
+  bool operator()(const TestProperty& test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true iff the wildcard pattern matches the string.  The
+  // first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true iff the user-specified filter matches the test case
+  // name and the test name.
+  static bool FilterMatchesTest(const std::string &test_case_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code. It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() : caller_frame_(NULL) {}
+
+  virtual string CurrentStackTrace(int max_depth, int skip_count)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  Mutex mutex_;  // protects all internal state
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to CurrentStackTrace() from within the user code.
+  void* caller_frame_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char* file;
+  int line;
+  std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+  : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. Reports the test part
+  // result in the current test.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own repoter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result repoter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const {
+    return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[i];
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i) {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestCase with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_case_name: name of the test case
+  //   type_param:     the name of the test's type parameter, or NULL if
+  //                   this is not a typed or a type-parameterized test.
+  //   set_up_tc:      pointer to the function that sets up the test case
+  //   tear_down_tc:   pointer to the function that tears down the test case
+  TestCase* GetTestCase(const char* test_case_name,
+                        const char* type_param,
+                        Test::SetUpTestCaseFunc set_up_tc,
+                        Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  //   test_info:    the TestInfo object
+  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc,
+                   TestInfo* test_info) {
+    // In order to support thread-safe death tests, we need to
+    // remember the original working directory when the test program
+    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
+    // the user may have changed the current directory before calling
+    // RUN_ALL_TESTS().  Therefore we capture the current directory in
+    // AddTestInfo(), which is called to register a TEST or TEST_F
+    // before main() is reached.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    GetTestCase(test_info->test_case_name(),
+                test_info->type_param(),
+                set_up_tc,
+                tear_down_tc)->AddTestInfo(test_info);
+  }
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Sets the TestCase object for the test that's currently running.
+  void set_current_test_case(TestCase* a_current_test_case) {
+    current_test_case_ = a_current_test_case;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more then once; it has guards
+  // protecting from registering the tests more then once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test case, or to the global property set. If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty& test_property);
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test cases, and the tests within each test case,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test cases and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Index of the last death test case registered.  Initially -1.
+  int last_death_test_case_;
+
+  // This points to the TestCase for the currently running test.  It
+  // changes as Google Test goes through one test case after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestCase* current_test_case_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is current running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True iff PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+  // Reach the implementation object through the UnitTest singleton.
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  return unit_test->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter.  Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl at google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing. This class allow our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const string& message) = 0;
+
+    // Closes the socket.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const string& message) {
+      Send(message + "\n");
+    }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const string& host, const string& port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    virtual ~SocketWriter() {
+      if (sockfd_ != -1)
+        CloseConnection();
+    }
+
+    // Sends a string to the socket.
+    virtual void Send(const string& message) {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const int len = static_cast<int>(message.length());
+      if (write(sockfd_, message.c_str(), len) != len) {
+        GTEST_LOG_(WARNING)
+            << "stream_result_to: failed to stream to "
+            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.
+    void CloseConnection() {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const string host_name_;
+    const string port_num_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static string UrlEncode(const char* str);
+
+  StreamingListener(const string& host, const string& port)
+      : socket_writer_(new SocketWriter(host, port)) { Start(); }
+
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) { Start(); }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test currently only reports elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+    SendLn("event=TestIterationEnd&passed=" +
+           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+           StreamableToString(unit_test.elapsed_time()) + "ms");
+  }
+
+  void OnTestCaseStart(const TestCase& test_case) {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  void OnTestCaseEnd(const TestCase& test_case) {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
+           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
+           + "ms");
+  }
+
+  void OnTestStart(const TestInfo& test_info) {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo& test_info) {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) +
+           "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  void OnTestPartResult(const TestPartResult& test_part_result) {
+    const char* file_name = test_part_result.file_name();
+    if (file_name == NULL)
+      file_name = "";
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const string& message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const scoped_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#undef GTEST_IMPLEMENTATION_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output file for XML output.
+static const char kDefaultOutputFile[] = "test_detail.xml";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+bool g_help_flag = false;
+
+}  // namespace internal
+
+static const char* GetDefaultFilter() {
+  return kUniversalFilter;
+}
+
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    internal::BoolFromGTestEnv("break_on_failure", false),
+    "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+    catch_exceptions,
+    internal::BoolFromGTestEnv("catch_exceptions", true),
+    "True iff " GTEST_NAME_
+    " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+    filter,
+    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(list_tests, false,
+                   "List all tests without running them.");
+
+GTEST_DEFINE_string_(
+    output,
+    internal::StringFromGTestEnv("output", ""),
+    "A format (currently must be \"xml\"), optionally followed "
+    "by a colon and an output file name or directory. A directory "
+    "is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    print_time,
+    internal::BoolFromGTestEnv("print_time", true),
+    "True iff " GTEST_NAME_
+    " should display elapsed time in text output.");
+
+GTEST_DEFINE_int32_(
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    show_internal_stack_frames, false,
+    "True iff " GTEST_NAME_ " should include internal stack frames when "
+    "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+    shuffle,
+    internal::BoolFromGTestEnv("shuffle", false),
+    "True iff " GTEST_NAME_
+    " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise.");
+
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
+// than kMaxRange.
+UInt32 Random::Generate(UInt32 range) {
+  // These constants are the same as are used in glibc's rand(3).
+  // NOTE(review): state_ is advanced before the range checks below, so a
+  // call that fails the checks still perturbs the generator state.
+  state_ = (1103515245U*state_ + 12345U) % kMaxRange;
+
+  GTEST_CHECK_(range > 0)
+      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Converting via modulus introduces a bit of downward bias, but
+  // it's simple, and a linear congruential generator isn't too good
+  // to begin with.
+  return state_ % range;
+}
+
+// GTestIsInitialized() returns true iff the user has initialized
+// Google Test.  Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+//
+// A user must call testing::InitGoogleTest() to initialize Google
+// Test.  g_init_gtest_count is set to the number of times
+// InitGoogleTest() has been called.  We don't protect this variable
+// under a mutex as it is only accessed in the main thread.
+GTEST_API_ int g_init_gtest_count = 0;
+static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
+
+// Iterates over a vector of TestCases, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  // Accumulate the given per-TestCase metric over every element.
+  int total = 0;
+  typedef std::vector<TestCase*>::const_iterator Iter;
+  for (Iter it = case_list.begin(); it != case_list.end(); ++it) {
+    total += ((*it)->*method)();
+  }
+  return total;
+}
+
+// Returns true iff the test case passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Passed();
+}
+
+// Returns true iff the test case failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Failed();
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  return test_case->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+  delete data_;
+}
+
+// Message assignment, for assertion streaming support.  Invoked when the
+// user streams a message into a failing assertion; combines the stored
+// failure text with the streamed message and reports the result (plus a
+// stack trace) to the singleton UnitTest.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest::GetInstance()->
+    AddTestPartResult(data_->type, data_->file, data_->line,
+                      AppendUserMessage(data_->message, message),
+                      UnitTest::GetInstance()->impl()
+                      ->CurrentOsStackTraceExceptTop(1)
+                      // Skips the stack frame for this function itself.
+                      );  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// Application pathname gotten in InitGoogleTest.
+std::string g_executable_path;
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+#if GTEST_OS_WINDOWS
+  result.Set(FilePath(g_executable_path).RemoveExtension("exe"));
+#else
+  result.Set(FilePath(g_executable_path));
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  // GTEST_FLAG(output) is a std::string, so c_str() can never return
+  // NULL (an unset flag is the empty string); the previous NULL check
+  // was dead code.  The format is everything before the first ':', or
+  // the whole flag when no ':' is present.
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  const char* const colon = strchr(gtest_output_flag, ':');
+  return (colon == NULL) ?
+      std::string(gtest_output_flag) :
+      std::string(gtest_output_flag, colon - gtest_output_flag);
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  // GTEST_FLAG(output) is a std::string, so c_str() can never return
+  // NULL; the previous NULL check was dead code.
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+
+  // With no ':' in the flag, fall back to the default output file in the
+  // original working directory.
+  const char* const colon = strchr(gtest_output_flag, ':');
+  if (colon == NULL)
+    return internal::FilePath::ConcatPaths(
+        internal::FilePath(
+            UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(kDefaultOutputFile)).string();
+
+  internal::FilePath output_name(colon + 1);
+  if (!output_name.IsAbsolutePath())
+    // TODO(wan at google.com): on Windows \some\path is not an absolute
+    // path (as its meaning depends on the current drive), yet the
+    // following logic for turning it into an absolute path is wrong.
+    // Fix it.
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory())
+    return output_name.string();
+
+  // A trailing path separator means "directory": generate a unique file
+  // name inside it from the executable name and the output format.
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.string();
+}
+
+// Returns true iff the wildcard pattern matches the string.  The
+// first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  switch (*pattern) {
+    case '\0':
+    case ':':  // Either ':' or '\0' marks the end of the pattern.
+      return *str == '\0';
+    case '?':  // Matches any single character.
+      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+    case '*':  // Matches any string (possibly empty) of characters.
+      // Either consume one character of str with the '*' still active, or
+      // stop the '*' and match the remaining pattern.  This backtracking
+      // is exponential in the worst case, but test names are short.
+      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+          PatternMatchesString(pattern + 1, str);
+    default:  // Non-special character.  Matches itself.
+      return *pattern == *str &&
+          PatternMatchesString(pattern + 1, str + 1);
+  }
+}
+
+// Returns true iff name matches at least one of the ':'-separated glob
+// patterns in filter.
+bool UnitTestOptions::MatchesFilter(
+    const std::string& name, const char* filter) {
+  // Walk the filter one pattern at a time; each pattern ends at the next
+  // ':' separator or at the end of the string.
+  for (const char* cur_pattern = filter; cur_pattern != NULL; ) {
+    if (PatternMatchesString(cur_pattern, name.c_str())) {
+      return true;
+    }
+    const char* const next_sep = strchr(cur_pattern, ':');
+    // When no separator remains, every pattern has been tried.
+    cur_pattern = (next_sep == NULL) ? NULL : next_sep + 1;
+  }
+  return false;
+}
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+                                        const std::string &test_name) {
+  // The filter is matched against "TestCaseName.TestName".  (The operands
+  // are already std::strings, so the redundant c_str() calls the previous
+  // version made have been dropped; behavior is unchanged.)
+  const std::string& full_name = test_case_name + "." + test_name;
+
+  // Split --gtest_filter at '-', if there is one, to separate into
+  // positive filter and negative filter portions.
+  const char* const p = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(p, '-');
+  std::string positive;
+  std::string negative;
+  if (dash == NULL) {
+    positive = GTEST_FLAG(filter);  // Whole string is a positive filter.
+    negative = "";
+  } else {
+    positive = std::string(p, dash);   // Everything up to the dash.
+    negative = std::string(dash + 1);  // Everything after the dash.
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  }
+
+  // A filter is a colon-separated list of patterns.  A test is run iff it
+  // matches a positive pattern and matches no negative pattern.
+  return (MatchesFilter(full_name, positive.c_str()) &&
+          !MatchesFilter(full_name, negative.c_str()));
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test handles a SEH exception only when all of the following
+  // hold:
+  //   1. the user wants it to (--gtest_catch_exceptions), AND
+  //   2. it is not a breakpoint exception, AND
+  //   3. it is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  const bool should_handle =
+      GTEST_FLAG(catch_exceptions) &&
+      exception_code != EXCEPTION_BREAKPOINT &&
+      exception_code != kCxxExceptionCode;
+
+  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
// The c'tor sets this object as the test part result reporter used by
// Google Test.  The 'result' parameter specifies where to report the
// results. Intercepts only failures from the current thread.
// The actual registration is delegated to Init().
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
    TestPartResultArray* result)
    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
      result_(result) {
  Init();
}
+
// The c'tor sets this object as the test part result reporter used by
// Google Test.  The 'result' parameter specifies where to report the
// results.  'intercept_mode' selects whether only the current thread's
// failures or all threads' failures are intercepted.
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
    InterceptMode intercept_mode, TestPartResultArray* result)
    : intercept_mode_(intercept_mode),
      result_(result) {
  Init();
}
+
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    old_reporter_ = impl->GetGlobalTestPartResultReporter();
+    impl->SetGlobalTestPartResultReporter(this);
+  } else {
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+  }
+}
+
// The d'tor restores the test part result reporter used by Google Test
// before.  It restores whichever slot (global or per-thread) Init()
// replaced, using the same intercept_mode_ to decide.
ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
    impl->SetGlobalTestPartResultReporter(old_reporter_);
  } else {
    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
  }
}
+
// Increments the test part result count and remembers the result.
// This method is from the TestPartResultReporterInterface interface.
// The result is recorded in the array supplied at construction rather
// than being reported to Google Test.
void ScopedFakeTestPartResultReporter::ReportTestPartResult(
    const TestPartResult& result) {
  result_->Append(result);
}
+
+namespace internal {
+
// Returns the type ID of ::testing::Test.  We should always call this
// instead of GetTypeId< ::testing::Test>() to get the type ID of
// testing::Test.  This is to work around a suspected linker bug when
// using Google Test as a framework on Mac OS X.  The bug causes
// GetTypeId< ::testing::Test>() to return different values depending
// on whether the call is from the Google Test framework itself or
// from user test code.  GetTestTypeId() is guaranteed to always
// return the same value, as it always calls GetTypeId<>() from the
// gtest.cc, which is within the Google Test framework.
// (kTestTypeIdInGoogleTest below captures this value for testing.)
TypeId GetTestTypeId() {
  return GetTypeId<Test>();
}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains a test part
+// failure of the given type and that the failure message contains the
+// given substring.
+AssertionResult HasOneFailure(const char* /* results_expr */,
+                              const char* /* type_expr */,
+                              const char* /* substr_expr */,
+                              const TestPartResultArray& results,
+                              TestPartResult::Type type,
+                              const string& substr) {
+  const std::string expected(type == TestPartResult::kFatalFailure ?
+                        "1 fatal failure" :
+                        "1 non-fatal failure");
+  Message msg;
+  if (results.size() != 1) {
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << results.size() << " failures";
+    for (int i = 0; i < results.size(); i++) {
+      msg << "\n" << results.GetTestPartResult(i);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& r = results.GetTestPartResult(0);
+  if (r.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  if (strstr(r.message(), substr.c_str()) == NULL) {
+    return AssertionFailure() << "Expected: " << expected << " containing \""
+                              << substr << "\"\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker remembers where to look up
+// test part results, what type of failure we expect, and what
+// substring the failure message should contain.
+SingleFailureChecker:: SingleFailureChecker(
+    const TestPartResultArray* results,
+    TestPartResult::Type type,
+    const string& substr)
+    : results_(results),
+      type_(type),
+      substr_(substr) {}
+
// The destructor of SingleFailureChecker verifies that the given
// TestPartResultArray contains exactly one failure that has the given
// type and contains the given substring.  If that's not the case, a
// non-fatal failure will be generated.
// The check runs at scope exit so that all failures produced inside the
// guarded scope have already been recorded.
SingleFailureChecker::~SingleFailureChecker() {
  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
}
+
DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
    UnitTestImpl* unit_test) : unit_test_(unit_test) {}

// Records the result in the current test's result set and forwards it
// to the registered event listeners.
void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
    const TestPartResult& result) {
  unit_test_->current_test_result()->AddTestPartResult(result);
  unit_test_->listeners()->repeater()->OnTestPartResult(result);
}

DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
    UnitTestImpl* unit_test) : unit_test_(unit_test) {}

// Delegates to whatever global reporter is currently registered.
void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
    const TestPartResult& result) {
  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
}
+
// Returns the global test part result reporter.
// Access is guarded by a mutex since multiple threads may report results.
TestPartResultReporterInterface*
UnitTestImpl::GetGlobalTestPartResultReporter() {
  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
  // NOTE: "repoter" [sic] matches the member's declared name elsewhere.
  return global_test_part_result_repoter_;
}

// Sets the global test part result reporter.
void UnitTestImpl::SetGlobalTestPartResultReporter(
    TestPartResultReporterInterface* reporter) {
  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
  global_test_part_result_repoter_ = reporter;
}

// Returns the test part result reporter for the current thread.
// Per-thread storage; read without taking the global lock.
TestPartResultReporterInterface*
UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
  return per_thread_test_part_result_reporter_.get();
}

// Sets the test part result reporter for the current thread.
void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
    TestPartResultReporterInterface* reporter) {
  per_thread_test_part_result_reporter_.set(reporter);
}
+
// The accessors below aggregate over test_cases_ via the CountIf and
// SumOverTestCaseList helpers (predicates/members defined elsewhere in
// this file).

// Gets the number of successful test cases.
int UnitTestImpl::successful_test_case_count() const {
  return CountIf(test_cases_, TestCasePassed);
}

// Gets the number of failed test cases.
int UnitTestImpl::failed_test_case_count() const {
  return CountIf(test_cases_, TestCaseFailed);
}

// Gets the number of all test cases.
int UnitTestImpl::total_test_case_count() const {
  return static_cast<int>(test_cases_.size());
}

// Gets the number of all test cases that contain at least one test
// that should run.
int UnitTestImpl::test_case_to_run_count() const {
  return CountIf(test_cases_, ShouldRunTestCase);
}

// Gets the number of successful tests.
int UnitTestImpl::successful_test_count() const {
  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
}

// Gets the number of failed tests.
int UnitTestImpl::failed_test_count() const {
  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
}

// Gets the number of disabled tests that will be reported in the XML report.
int UnitTestImpl::reportable_disabled_test_count() const {
  return SumOverTestCaseList(test_cases_,
                             &TestCase::reportable_disabled_test_count);
}

// Gets the number of disabled tests.
int UnitTestImpl::disabled_test_count() const {
  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
}

// Gets the number of tests to be printed in the XML report.
int UnitTestImpl::reportable_test_count() const {
  return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
}

// Gets the number of all tests.
int UnitTestImpl::total_test_count() const {
  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
}

// Gets the number of tests that should run.
int UnitTestImpl::test_to_run_count() const {
  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
}
+
// Returns the current OS stack trace as an std::string.
//
// The maximum number of stack frames to be included is specified by
// the gtest_stack_trace_depth flag.  The skip_count parameter
// specifies the number of top frames to be skipped, which doesn't
// count against the number of frames to be included.
//
// For example, if Foo() calls Bar(), which in turn calls
// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
//
// NOTE: this build does not implement stack traces; the parameter is
// deliberately ignored and the result is always the empty string.
std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
  (void)skip_count;  // Suppresses the unused-parameter warning.
  return "";
}
+
// Returns the current time in milliseconds since the Unix epoch.
// The implementation is selected per platform at preprocessing time.
TimeInMillis GetTimeInMillis() {
#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
  // http://analogous.blogspot.com/2005/04/epoch.html
  const TimeInMillis kJavaEpochToWinFileTimeDelta =
    static_cast<TimeInMillis>(116444736UL) * 100000UL;
  const DWORD kTenthMicrosInMilliSecond = 10000;

  SYSTEMTIME now_systime;
  FILETIME now_filetime;
  ULARGE_INTEGER now_int64;
  // TODO(kenton at google.com): Shouldn't this just use
  //   GetSystemTimeAsFileTime()?
  GetSystemTime(&now_systime);
  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
    now_int64.LowPart = now_filetime.dwLowDateTime;
    now_int64.HighPart = now_filetime.dwHighDateTime;
    // Convert 100ns FILETIME ticks to ms, then shift from the Windows
    // epoch (1601) to the Unix epoch (1970).
    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
      kJavaEpochToWinFileTimeDelta;
    return now_int64.QuadPart;
  }
  // Conversion failed; report time zero rather than garbage.
  return 0;
#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
  __timeb64 now;

  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
  // (deprecated function) there.
  // TODO(kenton at google.com): Use GetTickCount()?  Or use
  //   SystemTimeToFileTime()
  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
  _ftime64(&now);
  GTEST_DISABLE_MSC_WARNINGS_POP_()

  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
#elif GTEST_HAS_GETTIMEOFDAY_
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
#else
# error "Don't know how to get the current time on your system."
#endif
}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
// Creates a UTF-16 wide string from the given ANSI string, allocating
// memory using new. The caller is responsible for deleting the return
// value using delete[]. Returns the wide string, or NULL if the
// input is NULL.
LPCWSTR String::AnsiToUtf16(const char* ansi) {
  if (!ansi) return NULL;
  // NOTE(review): strlen() returns size_t; the narrowing to int assumes
  // inputs shorter than INT_MAX characters.
  const int length = strlen(ansi);
  // First call with a NULL buffer computes the required output length.
  const int unicode_length =
      MultiByteToWideChar(CP_ACP, 0, ansi, length,
                          NULL, 0);
  WCHAR* unicode = new WCHAR[unicode_length + 1];
  MultiByteToWideChar(CP_ACP, 0, ansi, length,
                      unicode, unicode_length);
  unicode[unicode_length] = 0;  // Explicit NUL terminator.
  return unicode;
}
+
// Creates an ANSI string from the given wide string, allocating
// memory using new. The caller is responsible for deleting the return
// value using delete[]. Returns the ANSI string, or NULL if the
// input is NULL.
const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
  if (!utf16_str) return NULL;
  // Length -1 makes the API process the whole NUL-terminated string;
  // the first call (NULL buffer) computes the required output size.
  const int ansi_length =
      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
                          NULL, 0, NULL, NULL);
  char* ansi = new char[ansi_length + 1];
  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
                      ansi, ansi_length, NULL, NULL);
  ansi[ansi_length] = 0;  // Explicit NUL terminator.
  return ansi;
}
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings.  Returns true iff they have the same content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CStringEquals(const char * lhs, const char * rhs) {
+  if ( lhs == NULL ) return rhs == NULL;
+
+  if ( rhs == NULL ) return false;
+
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
// Converts an array of wide chars to a narrow string using the UTF-8
// encoding, and streams the result to the given Message object.
// Embedded L'\0' characters are streamed as '\0' so no part of the
// input is silently dropped.
static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
                                     Message* msg) {
  for (size_t i = 0; i != length; ) {  // NOLINT
    if (wstr[i] != L'\0') {
      // Convert the run starting at i (presumably WideStringToUtf8 stops
      // at the first L'\0' — the inner loop then skips the same run).
      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
      while (i != length && wstr[i] != L'\0')
        i++;
    } else {
      *msg << '\0';
      i++;
    }
  }
}
+
+#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+}  // namespace internal
+
// Constructs an empty Message.
// We allocate the stringstream separately because otherwise each use of
// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
// stack frame leading to huge stack frames in some cases; gcc does not reuse
// the stack space.
Message::Message() : ss_(new ::std::stringstream) {
  // By default, we want there to be enough precision when printing
  // a double to a Message (digits10 + 2 guard digits).
  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
}
+
// These two overloads allow streaming a wide C string to a Message
// using the UTF-8 encoding.  The non-const overload exists so that a
// mutable wchar_t* is not picked up by the generic pointer overload.
Message& Message::operator <<(const wchar_t* wide_c_str) {
  return *this << internal::String::ShowWideCString(wide_c_str);
}
Message& Message::operator <<(wchar_t* wide_c_str) {
  return *this << internal::String::ShowWideCString(wide_c_str);
}
+
#if GTEST_HAS_STD_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
// Embedded NULs are preserved by StreamWideCharsToMessage.
Message& Message::operator <<(const ::std::wstring& wstr) {
  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
  return *this;
}
#endif  // GTEST_HAS_STD_WSTRING
+
#if GTEST_HAS_GLOBAL_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
// Same as the ::std::wstring overload, for the global ::wstring type.
Message& Message::operator <<(const ::wstring& wstr) {
  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
  return *this;
}
#endif  // GTEST_HAS_GLOBAL_WSTRING
+
// Gets the text streamed to this object so far as an std::string.
// Each '\0' character in the buffer is replaced with "\\0"
// (handled by StringStreamToString).
std::string Message::GetString() const {
  return internal::StringStreamToString(ss_.get());
}
+
// AssertionResult constructors.
// Used in EXPECT_TRUE/FALSE(assertion_result).
// Deep-copies the message (if any) so the two results share no state.
AssertionResult::AssertionResult(const AssertionResult& other)
    : success_(other.success_),
      message_(other.message_.get() != NULL ?
               new ::std::string(*other.message_) :
               static_cast< ::std::string*>(NULL)) {
}
+
// Swaps two AssertionResults.  Member-wise swap; using-declaration
// enables ADL for the member types.
void AssertionResult::swap(AssertionResult& other) {
  using std::swap;
  swap(success_, other.success_);
  swap(message_, other.message_);
}
+
// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
// The success flag is inverted; the message (if any) is carried over.
AssertionResult AssertionResult::operator!() const {
  AssertionResult negation(!success_);
  if (message_.get() != NULL)
    negation << *message_;
  return negation;
}
+
// Makes a successful assertion result.
AssertionResult AssertionSuccess() {
  return AssertionResult(true);
}

// Makes a failed assertion result.  Callers typically stream a message
// into the returned object.
AssertionResult AssertionFailure() {
  return AssertionResult(false);
}

// Makes a failed assertion result with the given failure message.
// Deprecated; use AssertionFailure() << message.
AssertionResult AssertionFailure(const Message& message) {
  return AssertionFailure() << message;
}
+
+namespace internal {
+
+namespace edit_distance {
// Computes a minimum-cost edit script transforming 'left' into 'right'
// via classic dynamic programming (Levenshtein distance), in
// O(|left| * |right|) time and space.  Replacements carry a tiny extra
// cost so that adds/removes are preferred on ties.
std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
                                            const std::vector<size_t>& right) {
  // costs[l][r] = cheapest cost to edit left[0..l) into right[0..r);
  // best_move[l][r] = the edit that achieves it.
  std::vector<std::vector<double> > costs(
      left.size() + 1, std::vector<double>(right.size() + 1));
  std::vector<std::vector<EditType> > best_move(
      left.size() + 1, std::vector<EditType>(right.size() + 1));

  // Populate for empty right.
  for (size_t l_i = 0; l_i < costs.size(); ++l_i) {
    costs[l_i][0] = static_cast<double>(l_i);
    best_move[l_i][0] = kRemove;
  }
  // Populate for empty left.
  for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) {
    costs[0][r_i] = static_cast<double>(r_i);
    best_move[0][r_i] = kAdd;
  }

  for (size_t l_i = 0; l_i < left.size(); ++l_i) {
    for (size_t r_i = 0; r_i < right.size(); ++r_i) {
      if (left[l_i] == right[r_i]) {
        // Found a match. Consume it.
        costs[l_i + 1][r_i + 1] = costs[l_i][r_i];
        best_move[l_i + 1][r_i + 1] = kMatch;
        continue;
      }

      const double add = costs[l_i + 1][r_i];
      const double remove = costs[l_i][r_i + 1];
      const double replace = costs[l_i][r_i];
      if (add < remove && add < replace) {
        costs[l_i + 1][r_i + 1] = add + 1;
        best_move[l_i + 1][r_i + 1] = kAdd;
      } else if (remove < add && remove < replace) {
        costs[l_i + 1][r_i + 1] = remove + 1;
        best_move[l_i + 1][r_i + 1] = kRemove;
      } else {
        // We make replace a little more expensive than add/remove to lower
        // their priority.
        costs[l_i + 1][r_i + 1] = replace + 1.00001;
        best_move[l_i + 1][r_i + 1] = kReplace;
      }
    }
  }

  // Reconstruct the best path. We do it in reverse order.
  // kAdd consumes only a right element; kRemove only a left element.
  std::vector<EditType> best_path;
  for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) {
    EditType move = best_move[l_i][r_i];
    best_path.push_back(move);
    l_i -= move != kAdd;
    r_i -= move != kRemove;
  }
  std::reverse(best_path.begin(), best_path.end());
  return best_path;
}
+
+namespace {
+
// Helper class that maps strings to small integer ids, assigning ids in
// first-seen order and deduplicating repeated strings.
class InternalStrings {
 public:
  // Returns the id already assigned to 'str', or assigns and returns
  // the next unused id (0, 1, 2, ...).
  size_t GetId(const std::string& str) {
    const IdMap::const_iterator found = ids_.find(str);
    if (found != ids_.end()) {
      return found->second;
    }
    const size_t fresh_id = ids_.size();
    ids_[str] = fresh_id;
    return fresh_id;
  }

 private:
  typedef std::map<std::string, size_t> IdMap;
  IdMap ids_;
};
+
+}  // namespace
+
+std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right) {
+  std::vector<size_t> left_ids, right_ids;
+  {
+    InternalStrings intern_table;
+    for (size_t i = 0; i < left.size(); ++i) {
+      left_ids.push_back(intern_table.GetId(left[i]));
+    }
+    for (size_t i = 0; i < right.size(); ++i) {
+      right_ids.push_back(intern_table.GetId(right[i]));
+    }
+  }
+  return CalculateOptimalEdits(left_ids, right_ids);
+}
+
+namespace {
+
// Helper class that holds the state for one hunk and prints it out to the
// stream.
// It reorders adds/removes when possible to group all removes before all
// adds. It also adds the hunk header before printing into the stream.
class Hunk {
 public:
  Hunk(size_t left_start, size_t right_start)
      : left_start_(left_start),
        right_start_(right_start),
        adds_(),
        removes_(),
        common_() {}

  // Appends one diff line.  'edit' is ' ' (common), '-' (removed from
  // the left input) or '+' (added from the right input).  The line
  // pointer is stored, not copied — the caller keeps it alive.
  void PushLine(char edit, const char* line) {
    switch (edit) {
      case ' ':
        ++common_;
        FlushEdits();  // A common line ends the current remove/add group.
        hunk_.push_back(std::make_pair(' ', line));
        break;
      case '-':
        ++removes_;
        hunk_removes_.push_back(std::make_pair('-', line));
        break;
      case '+':
        ++adds_;
        hunk_adds_.push_back(std::make_pair('+', line));
        break;
    }
  }

  // Writes the hunk header followed by every buffered line to 'os'.
  void PrintTo(std::ostream* os) {
    PrintHeader(os);
    FlushEdits();
    for (std::list<std::pair<char, const char*> >::const_iterator it =
             hunk_.begin();
         it != hunk_.end(); ++it) {
      *os << it->first << it->second << "\n";
    }
  }

  // True iff the hunk contains at least one non-common line.
  bool has_edits() const { return adds_ || removes_; }

 private:
  // Moves pending removes, then pending adds, into the main list.
  void FlushEdits() {
    hunk_.splice(hunk_.end(), hunk_removes_);
    hunk_.splice(hunk_.end(), hunk_adds_);
  }

  // Print a unified diff header for one hunk.
  // The format is
  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
  // where the left/right parts are omitted if unnecessary.
  void PrintHeader(std::ostream* ss) const {
    *ss << "@@ ";
    if (removes_) {
      *ss << "-" << left_start_ << "," << (removes_ + common_);
    }
    if (removes_ && adds_) {
      *ss << " ";
    }
    if (adds_) {
      *ss << "+" << right_start_ << "," << (adds_ + common_);
    }
    *ss << " @@\n";
  }

  size_t left_start_, right_start_;
  size_t adds_, removes_, common_;
  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
};
+
+}  // namespace
+
// Create a list of diff hunks in Unified diff format.
// Each hunk has a header generated by PrintHeader above plus a body with
// lines prefixed with ' ' for no change, '-' for deletion and '+' for
// addition.
// 'context' represents the desired unchanged prefix/suffix around the diff.
// If two hunks are close enough that their contexts overlap, then they are
// joined into one hunk.
std::string CreateUnifiedDiff(const std::vector<std::string>& left,
                              const std::vector<std::string>& right,
                              size_t context) {
  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);

  // l_i/r_i index into left/right; edit_i walks the edit script.
  size_t l_i = 0, r_i = 0, edit_i = 0;
  std::stringstream ss;
  while (edit_i < edits.size()) {
    // Find first edit.
    while (edit_i < edits.size() && edits[edit_i] == kMatch) {
      ++l_i;
      ++r_i;
      ++edit_i;
    }

    // Find the first line to include in the hunk.
    // Hunk line numbers are 1-based, hence the "+ 1".
    const size_t prefix_context = std::min(l_i, context);
    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
    for (size_t i = prefix_context; i > 0; --i) {
      hunk.PushLine(' ', left[l_i - i].c_str());
    }

    // Iterate the edits until we found enough suffix for the hunk or the input
    // is over.
    size_t n_suffix = 0;
    for (; edit_i < edits.size(); ++edit_i) {
      if (n_suffix >= context) {
        // Continue only if the next hunk is very close.
        std::vector<EditType>::const_iterator it = edits.begin() + edit_i;
        while (it != edits.end() && *it == kMatch) ++it;
        if (it == edits.end() || (it - edits.begin()) - edit_i >= context) {
          // There is no next edit or it is too far away.
          break;
        }
      }

      EditType edit = edits[edit_i];
      // Reset count when a non match is found.
      n_suffix = edit == kMatch ? n_suffix + 1 : 0;

      // kReplace emits both a '-' (left) line and a '+' (right) line.
      if (edit == kMatch || edit == kRemove || edit == kReplace) {
        hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
      }
      if (edit == kAdd || edit == kReplace) {
        hunk.PushLine('+', right[r_i].c_str());
      }

      // Advance indices, depending on edit type.
      l_i += edit != kAdd;
      r_i += edit != kRemove;
    }

    if (!hunk.has_edits()) {
      // We are done. We don't want this hunk.
      break;
    }

    hunk.PrintTo(&ss);
  }
  return ss.str();
}
+
+}  // namespace edit_distance
+
+namespace {
+
// The string representation of the values received in EqFailure() are already
// escaped. Split them on escaped '\n' boundaries. Leave all other escaped
// characters the same.
std::vector<std::string> SplitEscapedString(const std::string& str) {
  std::vector<std::string> lines;
  size_t start = 0;
  size_t end = str.size();
  // Strip a surrounding pair of double quotes, if present.
  if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
    ++start;
    --end;
  }
  // Scan for the two-character sequence '\' 'n'; other escapes pass through.
  bool prev_was_backslash = false;
  for (size_t i = start; i + 1 < end; ++i) {
    if (!prev_was_backslash) {
      prev_was_backslash = (str[i] == '\\');
      continue;
    }
    prev_was_backslash = false;
    if (str[i] == 'n') {
      // Emit everything before the "\n" escape and restart after it.
      lines.push_back(str.substr(start, i - start - 1));
      start = i + 1;
    }
  }
  lines.push_back(str.substr(start, end - start));
  return lines;
}
+
+}  // namespace
+
// Constructs and returns the message for an equality assertion
// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
//
// The first four parameters are the expressions used in the assertion
// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
// where foo is 5 and bar is 6, we have:
//
//   expected_expression: "foo"
//   actual_expression:   "bar"
//   expected_value:      "5"
//   actual_value:        "6"
//
// The ignoring_case parameter is true iff the assertion is a
// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
// be inserted into the message.
AssertionResult EqFailure(const char* expected_expression,
                          const char* actual_expression,
                          const std::string& expected_value,
                          const std::string& actual_value,
                          bool ignoring_case) {
  Message msg;
  msg << "Value of: " << actual_expression;
  // Only show the value when it differs from the expression text.
  if (actual_value != actual_expression) {
    msg << "\n  Actual: " << actual_value;
  }

  msg << "\nExpected: " << expected_expression;
  if (ignoring_case) {
    msg << " (ignoring case)";
  }
  if (expected_value != expected_expression) {
    msg << "\nWhich is: " << expected_value;
  }

  // When either value spans multiple (escaped) lines, append a unified
  // diff of the two values to make the mismatch easier to spot.
  if (!expected_value.empty() && !actual_value.empty()) {
    const std::vector<std::string> expected_lines =
        SplitEscapedString(expected_value);
    const std::vector<std::string> actual_lines =
        SplitEscapedString(actual_value);
    if (expected_lines.size() > 1 || actual_lines.size() > 1) {
      msg << "\nWith diff:\n"
          << edit_distance::CreateUnifiedDiff(expected_lines, actual_lines);
    }
  }

  return AssertionFailure() << msg;
}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value) {
+  const char* actual_message = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text
+      << "\n  Actual: " << actual_predicate_value;
+  if (actual_message[0] != '\0')
+    msg << " (" << actual_message << ")";
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
// Helper function for implementing ASSERT_NEAR.
// Succeeds iff |val1 - val2| <= abs_error.  Note that if any argument
// is NaN the comparison below is false and the assertion fails.
AssertionResult DoubleNearPredFormat(const char* expr1,
                                     const char* expr2,
                                     const char* abs_error_expr,
                                     double val1,
                                     double val2,
                                     double abs_error) {
  const double diff = fabs(val1 - val2);
  if (diff <= abs_error) return AssertionSuccess();

  // TODO(wan): do not print the value of an expression if it's
  // already a literal.
  return AssertionFailure()
      << "The difference between " << expr1 << " and " << expr2
      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
      << expr1 << " evaluates to " << val1 << ",\n"
      << expr2 << " evaluates to " << val2 << ", and\n"
      << abs_error_expr << " evaluates to " << abs_error << ".";
}
+
+
// Helper template for implementing FloatLE() and DoubleLE().
// Succeeds iff val1 < val2 or the two values are "almost equal" per
// FloatingPoint<RawType>::AlmostEquals() (defined elsewhere).
template <typename RawType>
AssertionResult FloatingPointLE(const char* expr1,
                                const char* expr2,
                                RawType val1,
                                RawType val2) {
  // Returns success if val1 is less than val2,
  if (val1 < val2) {
    return AssertionSuccess();
  }

  // or if val1 is almost equal to val2.
  const FloatingPoint<RawType> lhs(val1), rhs(val2);
  if (lhs.AlmostEquals(rhs)) {
    return AssertionSuccess();
  }

  // Note that the above two checks will both fail if either val1 or
  // val2 is NaN, as the IEEE floating-point standard requires that
  // any predicate involving a NaN must return false.

  // Format both values with full precision for the failure message.
  ::std::stringstream val1_ss;
  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
          << val1;

  ::std::stringstream val2_ss;
  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
          << val2;

  return AssertionFailure()
      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
      << StringStreamToString(&val2_ss);
}
+
+}  // namespace internal
+
// Asserts that val1 is less than, or almost equal to, val2.  Fails
// otherwise.  In particular, it fails if either val1 or val2 is NaN.
AssertionResult FloatLE(const char* expr1, const char* expr2,
                        float val1, float val2) {
  // Thin wrapper over the shared floating-point comparison template.
  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
}

// Asserts that val1 is less than, or almost equal to, val2.  Fails
// otherwise.  In particular, it fails if either val1 or val2 is NaN.
AssertionResult DoubleLE(const char* expr1, const char* expr2,
                         double val1, double val2) {
  // Thin wrapper over the shared floating-point comparison template.
  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
}
+
+namespace internal {
+
// The helper function for {ASSERT|EXPECT}_EQ with int or enum
// arguments.
AssertionResult CmpHelperEQ(const char* expected_expression,
                            const char* actual_expression,
                            BiggestInt expected,
                            BiggestInt actual) {
  if (expected == actual) {
    return AssertionSuccess();
  }

  // Each value is formatted with the other as context (e.g. to decide
  // whether to also print a hex representation).
  return EqFailure(expected_expression,
                   actual_expression,
                   FormatForComparisonFailureMessage(expected, actual),
                   FormatForComparisonFailureMessage(actual, expected),
                   false);
}
+
// A macro for implementing the helper functions needed to implement
// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
// just to avoid copy-and-paste of similar code.
//
// The generated helper succeeds iff 'val1 op val2'; on failure it
// reports both expressions and both formatted values.  (No comments
// inside the macro body: they would break the line continuations.)
#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
                                   BiggestInt val1, BiggestInt val2) {\
  if (val1 op val2) {\
    return AssertionSuccess();\
  } else {\
    return AssertionFailure() \
        << "Expected: (" << expr1 << ") " #op " (" << expr2\
        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
  }\
}

// Implements the helper function for {ASSERT|EXPECT}_NE with int or
// enum arguments.
GTEST_IMPL_CMP_HELPER_(NE, !=)
// Implements the helper function for {ASSERT|EXPECT}_LE with int or
// enum arguments.
GTEST_IMPL_CMP_HELPER_(LE, <=)
// Implements the helper function for {ASSERT|EXPECT}_LT with int or
// enum arguments.
GTEST_IMPL_CMP_HELPER_(LT, < )
// Implements the helper function for {ASSERT|EXPECT}_GE with int or
// enum arguments.
GTEST_IMPL_CMP_HELPER_(GE, >=)
// Implements the helper function for {ASSERT|EXPECT}_GT with int or
// enum arguments.
GTEST_IMPL_CMP_HELPER_(GT, > )

#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const char* expected,
+                               const char* actual) {
+  if (String::CStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   PrintToString(expected),
+                   PrintToString(actual),
+                   false);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                   const char* actual_expression,
+                                   const char* expected,
+                                   const char* actual) {
+  if (String::CaseInsensitiveCStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   PrintToString(expected),
+                   PrintToString(actual),
+                   true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const char* s1,
+                               const char* s2) {
+  if (!String::CStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                              << s2_expression << "), actual: \""
+                              << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression,
+                                   const char* s1,
+                                   const char* s2) {
+  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << ") (ignoring case), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubString() and IsNotSubstring().
+
+// This group of overloaded functions return true iff needle is a
+// substring of haystack.  NULL is considered a substring of itself
+// only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return wcsstr(haystack, needle) != NULL;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle,
+                     const StringType& haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+    bool expected_to_be_substring,
+    const char* needle_expr, const char* haystack_expr,
+    const StringType& needle, const StringType& haystack) {
+  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+    return AssertionSuccess();
+
+  const bool is_wide_string = sizeof(needle[0]) > 1;
+  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  return AssertionFailure()
+      << "Value of: " << needle_expr << "\n"
+      << "  Actual: " << begin_string_quote << needle << "\"\n"
+      << "Expected: " << (expected_to_be_substring ? "" : "not ")
+      << "a substring of " << haystack_expr << "\n"
+      << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char* expr,
+                                     const char* expected,
+                                     long hr) {  // NOLINT
+# if GTEST_OS_WINDOWS_MOBILE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+# else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          hr,  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,  // buf size
+                                          NULL);  // no arguments for inserts
+  // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+          --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+      << "Expected: " << expr << " " << expected << ".\n"
+      << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  if (SUCCEEDED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  if (FAILED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern.  Returns the n
+// lowest bits.  As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+  *bits >>= n;
+  return low_bits;
+}
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(UInt32 code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
+  }
+
+  char str[5];  // Big enough for the largest valid code point.
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
+  }
+  return str;
+}
+
+// The following two functions only make sense if the the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  return sizeof(wchar_t) == 2 &&
+      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                    wchar_t second) {
+  const UInt32 mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2) ?
+      (((first & mask) << 10) | (second & mask)) + 0x10000 :
+      // This function should not be called when the condition is
+      // false, but we provide a sensible default in case it is.
+      static_cast<UInt32>(first);
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1)
+    num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    UInt32 unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+                                                                 str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<UInt32>(str[i]);
+    }
+
+    stream << CodePointToUtf8(unicode_code_point);
+  }
+  return StringStreamToString(&stream);
+}
+
+// Converts a wide C string to an std::string using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+  if (wide_c_str == NULL)  return "(null)";
+
+  return internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Compares two wide C strings.  Returns true iff they have the same
+// content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const wchar_t* expected,
+                               const wchar_t* actual) {
+  if (String::WideCStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   PrintToString(expected),
+                   PrintToString(actual),
+                   false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (!String::WideCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                            << s2_expression << "), actual: "
+                            << PrintToString(s1)
+                            << " vs " << PrintToString(s2);
+}
+
+// Compares two C strings, ignoring case.  Returns true iff they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s).  A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL)
+    return rhs == NULL;
+  if (rhs == NULL)
+    return false;
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  wint_t left, right;
+  do {
+    left = towlower(*lhs++);
+    right = towlower(*rhs++);
+  } while (left && left == right);
+  return left == right;
+#endif  // OS selector
+}
+
+// Returns true iff str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(
+    const std::string& str, const std::string& suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  return (str_len >= suffix_len) &&
+         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
+                                      suffix.c_str());
+}
+
+// Formats an int value as "%02d".
+std::string String::FormatIntWidth2(int value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+  std::stringstream ss;
+  ss << std::hex << std::uppercase << value;
+  return ss.str();
+}
+
+// Formats a byte as "%02X".
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
+     << static_cast<unsigned int>(value);
+  return ss.str();
+}
+
+// Converts the buffer in a stringstream to an std::string, converting NUL
+// bytes to "\\0" along the way.
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string& str = ss->str();
+  const char* const start = str.c_str();
+  const char* const end = start + str.length();
+
+  std::string result;
+  result.reserve(2 * (end - start));
+  for (const char* ch = start; ch != end; ++ch) {
+    if (*ch == '\0') {
+      result += "\\0";  // Replaces NUL with "\\0";
+    } else {
+      result += *ch;
+    }
+  }
+
+  return result;
+}
+
+// Appends the user-supplied message to the Google-Test-generated message.
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
+  // Appends the user message if it's non-empty.
+  const std::string user_msg_string = user_msg.GetString();
+  if (user_msg_string.empty()) {
+    return gtest_msg;
+  }
+
+  return gtest_msg + "\n" + user_msg_string;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+    : death_test_count_(0),
+      elapsed_time_(0) {
+}
+
+// D'tor.
+TestResult::~TestResult() {
+}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count())
+    internal::posix::Abort();
+  return test_part_results_.at(i);
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count())
+    internal::posix::Abort();
+  return test_properties_.at(i);
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator property_with_matching_key =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (property_with_matching_key == test_properties_.end()) {
+    test_properties_.push_back(test_property);
+    return;
+  }
+  property_with_matching_key->SetValue(test_property.value());
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char* const kReservedTestSuitesAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "random_seed",
+  "tests",
+  "time",
+  "timestamp"
+};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char* const kReservedTestSuiteAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "tests",
+  "time"
+};
+
+// The list of reserved attributes used in the <testcase> element of XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+  "classname",
+  "name",
+  "status",
+  "time",
+  "type_param",
+  "value_param"
+};
+
+template <int kSize>
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+  return std::vector<std::string>(array, array + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realizes that.
+  return std::vector<std::string>();
+}
+
+static std::string FormatWordList(const std::vector<std::string>& words) {
+  Message word_list;
+  for (size_t i = 0; i < words.size(); ++i) {
+    if (i > 0 && words.size() > 2) {
+      word_list << ", ";
+    }
+    if (i == words.size() - 1) {
+      word_list << "and ";
+    }
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+bool ValidateTestPropertyName(const std::string& property_name,
+                              const std::vector<std::string>& reserved_names) {
+  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+          reserved_names.end()) {
+    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                  << " (" << FormatWordList(reserved_names)
+                  << " are reserved by " << GTEST_NAME_ << ")";
+    return false;
+  }
+  return true;
+}
+
+// Adds a failure if the key is a reserved attribute of the element named
+// xml_element.  Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
+  return ValidateTestPropertyName(test_property.key(),
+                                  GetReservedAttributesForElement(xml_element));
+}
+
+// Clears the object.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true iff the test failed.
+bool TestResult::Failed() const {
+  for (int i = 0; i < total_part_count(); ++i) {
+    if (GetTestPartResult(i).failed())
+      return true;
+  }
+  return false;
+}
+
+// Returns true iff the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  return result.fatally_failed();
+}
+
+// Returns true iff the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true iff the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true iff the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts.  This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the values of all Google Test flags.
+Test::Test()
+    : gtest_flag_saver_(new internal::GTestFlagSaver) {
+}
+
+// The d'tor restores the values of all Google Test flags.
+Test::~Test() {
+  delete gtest_flag_saver_;
+}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {
+}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, int value) {
+  Message value_message;
+  value_message << value;
+  RecordProperty(key, value_message.GetString().c_str());
+}
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message) {
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type,
+      NULL,  // No info about the source file where the exception occurred.
+      -1,    // We have no info on which line caused the exception.
+      message,
+      "");   // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires all tests in the same test case to use the same test
+// fixture class.  This function checks if the current test has the
+// same fixture class as the first test in the current test case.  If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  const TestCase* const test_case = impl->current_test_case();
+
+  // Info about the first test in the current test case.
+  const TestInfo* const first_test_info = test_case->test_info_list()[0];
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const char* const first_test_name = first_test_info->name();
+
+  // Info about the current test.
+  const TestInfo* const this_test_info = impl->current_test_info();
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  const char* const this_test_name = this_test_info->name();
+
+  if (this_fixture_id != first_fixture_id) {
+    // Is the first test defined using TEST?
+    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+    // Is this test defined using TEST?
+    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+    if (first_is_TEST || this_is_TEST) {
+      // Both TEST and TEST_F appear in same test case, which is incorrect.
+      // Tell the user how to fix this.
+
+      // Gets the name of the TEST and the name of the TEST_F.  Note
+      // that first_is_TEST and this_is_TEST cannot both be true, as
+      // the fixture IDs are different for the two tests.
+      const char* const TEST_name =
+          first_is_TEST ? first_test_name : this_test_name;
+      const char* const TEST_F_name =
+          first_is_TEST ? this_test_name : first_test_name;
+
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test case is\n"
+          << "illegal.  In test case " << this_test_info->test_case_name()
+          << ",\n"
+          << "test " << TEST_F_name << " is defined using TEST_F but\n"
+          << "test " << TEST_name << " is defined using TEST.  You probably\n"
+          << "want to change the TEST to TEST_F or move it to another test\n"
+          << "case.";
+    } else {
+      // Two fixture classes with the same name appear in two different
+      // namespaces, which is not allowed. Tell the user how to fix this.
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class.  However, in test case "
+          << this_test_info->test_case_name() << ",\n"
+          << "you defined test " << first_test_name
+          << " and test " << this_test_name << "\n"
+          << "using two different test fixture classes.  This can happen if\n"
+          << "the two classes are from different namespaces or translation\n"
+          << "units and have the same name.  You should probably rename one\n"
+          << "of the classes to put the tests into different test cases.";
+    }
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test.  This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
+  Message message;
+  message << "SEH exception with code 0x" << std::setbase(16) <<
+    exception_code << std::setbase(10) << " thrown in " << location << ".";
+
+  return new std::string(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
+  Message message;
+  if (description != NULL) {
+    message << "C++ exception with description \"" << description << "\"";
+  } else {
+    message << "Unknown C++ exception";
+  }
+  message << " thrown in " << location << ".";
+
+  return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult& failure)
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception.  (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function.  Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string* exception_message = FormatSehExceptionMessage(
+        GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test.  For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
+      // This exception type can only be thrown by a failed Google
+      // Test assertion with the intention of letting another testing
+      // framework catch it.  Therefore we just re-throw it.
+      throw;
+    } catch (const std::exception& e) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(NULL, location));
+    }
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful.
+  if (!HasFatalFailure()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        this, &Test::TestBody, "the test body");
+  }
+
+  // However, we want to clean up as much as possible.  Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &Test::TearDown, "TearDown()");
+}
+
+// Returns true iff the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
+
+// Returns true iff the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->
+      HasNonfatalFailure();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+TestInfo::TestInfo(const std::string& a_test_case_name,
+                   const std::string& a_name,
+                   const char* a_type_param,
+                   const char* a_value_param,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase* factory)
+    : test_case_name_(a_test_case_name),
+      name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),
+      is_disabled_(false),
+      matches_filter_(false),
+      factory_(factory),
+      result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory) {
+  TestInfo* const test_info =
+      new TestInfo(test_case_name, name, type_param, value_param,
+                   fixture_class_id, factory);
+  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+  return test_info;
+}
+
+#if GTEST_HAS_PARAM_TEST
+void ReportInvalidTestCaseType(const char* test_case_name,
+                               const char* file, int line) {
+  Message errors;
+  errors
+      << "Attempted redefinition of test case " << test_case_name << ".\n"
+      << "All tests in the same test case must use the same test fixture\n"
+      << "class.  However, in test case " << test_case_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test cases.";
+
+  fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+          errors.GetString().c_str());
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+}  // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestCase class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+class TestNameIs {
+ public:
+  // Constructor.
+  //
+  // TestNameIs has NO default constructor.
+  explicit TestNameIs(const char* name)
+      : name_(name) {}
+
+  // Returns true iff the test name of test_info matches name_.
+  bool operator()(const TestInfo * test_info) const {
+    return test_info && test_info->name() == name_;
+  }
+
+ private:
+  std::string name_;
+};
+
+}  // namespace
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+#if GTEST_HAS_PARAM_TEST
+  if (!parameterized_tests_registered_) {
+    parameterized_test_registry_.RegisterTests();
+    parameterized_tests_registered_ = true;
+  }
+#endif
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test only if the test object was created and its
+  // constructor didn't generate a fatal failure.
+  if ((test != NULL) && !Test::HasFatalFailure()) {
+    // This doesn't throw as all user code that can throw are wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  // Deletes the test object.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      test, &Test::DeleteSelf_, "the test fixture's destructor");
+
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(NULL);
+}
+
+// class TestCase
+
+// Gets the number of successful tests in this test case.
+int TestCase::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of failed tests in this test case.
+int TestCase::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestCase::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test case.
+int TestCase::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestCase::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);
+}
+
+// Get the number of tests in this test case that should run.
+int TestCase::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestCase::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestCase with the given name.
+//
+// Arguments:
+//
+//   name:         name of the test case
+//   a_type_param: the name of the test case's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test case.
+//   set_up_tc:    pointer to the function that sets up the test case
+//   tear_down_tc: pointer to the function that tears down the test case
+TestCase::TestCase(const char* a_name, const char* a_type_param,
+                   Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),
+      elapsed_time_(0) {
+}
+
+// Destructor of TestCase.
+TestCase::~TestCase() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo* TestCase::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo* TestCase::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Adds a test to this test case.  Will delete the test upon
+// destruction of the TestCase object.
+void TestCase::AddTestInfo(TestInfo * test_info) {
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
+
+// Runs every test in this TestCase.
+void TestCase::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_case(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  repeater->OnTestCaseStart(*this);
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+
+  const internal::TimeInMillis start = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start;
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+
+  repeater->OnTestCaseEnd(*this);
+  impl->set_current_test_case(NULL);
+}
+
+// Clears the results of all tests in this test case.
+void TestCase::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test case.
+void TestCase::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestCase::UnshuffleTests() {
+  for (size_t i = 0; i < test_indices_.size(); i++) {
+    test_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Formats a countable noun.  Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count,
+                                       const char * singular_form,
+                                       const char * plural_form) {
+  return internal::StreamableToString(count) + " " +
+      (count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test cases.
+static std::string FormatTestCaseCount(int test_case_count) {
+  return FormatCountableNoun(test_case_count, "test case", "test cases");
+}
+
+// Converts a TestPartResult::Type enum to human-friendly string
+// representation.  Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+  switch (type) {
+    case TestPartResult::kSuccess:
+      return "Success";
+
+    case TestPartResult::kNonFatalFailure:
+    case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+      return "error: ";
+#else
+      return "Failure\n";
+#endif
+    default:
+      return "Unknown result type";
+  }
+}
+
+namespace internal {
+
+// Prints a TestPartResult to an std::string.
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  return (Message()
+          << internal::FormatFileLocation(test_part_result.file_name(),
+                                          test_part_result.line_number())
+          << " " << TestPartResultTypeToString(test_part_result.type())
+          << test_part_result.message()).GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string& result =
+      PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+
+enum GTestColor {
+  COLOR_DEFAULT,
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW
+};
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Returns the character attribute for the given color.
+WORD GetColorAttribute(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:    return FOREGROUND_RED;
+    case COLOR_GREEN:  return FOREGROUND_GREEN;
+    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:           return 0;
+  }
+}
+
+#else
+
+// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
+// an invalid input.
+const char* GetAnsiColorCode(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:     return "1";
+    case COLOR_GREEN:   return "2";
+    case COLOR_YELLOW:  return "3";
+    default:            return NULL;
+  };
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true iff Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char* const gtest_color = GTEST_FLAG(color).c_str();
+
+  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS
+    // On Windows the TERM variable is usually not set, but the
+    // console there does support colors.
+    return stdout_is_tty;
+#else
+    // On non-Windows platforms, we rely on the TERM variable.
+    const char* const term = posix::GetEnv("TERM");
+    const bool term_supports_color =
+        String::CStringEquals(term, "xterm") ||
+        String::CStringEquals(term, "xterm-color") ||
+        String::CStringEquals(term, "xterm-256color") ||
+        String::CStringEquals(term, "screen") ||
+        String::CStringEquals(term, "screen-256color") ||
+        String::CStringEquals(term, "linux") ||
+        String::CStringEquals(term, "cygwin");
+    return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+  }
+
+  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+      String::CStringEquals(gtest_color, "1");
+  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
+  // value is neither one of these nor "auto", we treat it as "no" to
+  // be conservative.
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || \
+    GTEST_OS_IOS || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+  const bool use_color = AlwaysFalse();
+#else
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
+  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gunit_list_tests
+// output to label the type parameter and value parameter for a test.
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+
+  if (type_param != NULL || value_param != NULL) {
+    printf(", where ");
+    if (type_param != NULL) {
+      printf("%s = %s", kTypeParamLabel, type_param);
+      if (value_param != NULL)
+        printf(" and ");
+    }
+    if (value_param != NULL) {
+      printf("%s = %s", kValueParamLabel, value_param);
+    }
+  }
+}
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char * test_case, const char * test) {
+    printf("%s.%s", test_case, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+
+ private:
+  static void PrintFailedTests(const UnitTest& unit_test);
+};
+
+  // Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  const char* const filter = GTEST_FLAG(filter).c_str();
+
+  // Prints the filter if it's not *.  This reminds the user that some
+  // tests may be skipped.
+  if (!String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case.name());
+  if (test_case.type_param() == NULL) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  // If the test part succeeded, we don't need to do anything.
+  if (result.type() == TestPartResult::kSuccess)
+    return;
+
+  // Print failure message from the assertion (e.g. expected this and got that).
+  PrintTestPartResult(result);
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Passed()) {
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else {
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  }
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  if (test_info.result()->Failed())
+    PrintFullTestCommentIfPresent(test_info);
+
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms)\n", internal::StreamableToString(
+           test_info.result()->elapsed_time()).c_str());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n",
+         counts.c_str(), test_case.name(),
+         internal::StreamableToString(test_case.elapsed_time()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  if (failed_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    const TestCase& test_case = *unit_test.GetTestCase(i);
+    if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_case.total_test_count(); ++j) {
+      const TestInfo& test_info = *test_case.GetTestInfo(j);
+      if (!test_info.should_run() || test_info.result()->Passed()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s.%s", test_case.name(), test_info.name());
+      PrintFullTestCommentIfPresent(test_info);
+      printf("\n");
+    }
+  }
+}
+
+// Called after a test iteration finishes.  Prints the run summary: total
+// tests/cases run, elapsed time (when --gtest_print_time), the PASSED
+// count, the list of failed tests, and a disabled-tests reminder.
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  // Fetch the failure count once; the original queried it twice into two
+  // separate variables (num_failures and failed_test_count).
+  const int num_failures = unit_test.failed_test_count();
+  if (!unit_test.Passed()) {
+    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
+    printf("%s, listed below:\n", FormatTestCount(num_failures).c_str());
+    PrintFailedTests(unit_test);
+    printf("\n%2d FAILED %s\n", num_failures,
+                        num_failures == 1 ? "TEST" : "TESTS");
+  }
+
+  const int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (!num_failures) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(COLOR_YELLOW,
+                  "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled,
+                  num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  // Appends listener to the list.  The repeater takes ownership and will
+  // delete it in the destructor unless it is Release()d first.
+  void Append(TestEventListener *listener);
+  // Removes listener from the list without deleting it; returns it, or
+  // NULL if it was not in the list.
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  // TestEventListener overrides.  Each one forwards its event to every
+  // registered listener; the *End events are forwarded in reverse order
+  // (see the GTEST_REVERSE_REPEATER_METHOD_ definitions below).
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.  Owned: the destructor
+  // deletes every remaining entry.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+// Destroys the repeater and every listener it still owns.
+TestEventRepeater::~TestEventRepeater() {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    delete listeners_[i];
+  }
+}
+
+// Takes ownership of listener and adds it to the end of the list.
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+// Removes listener from the list without deleting it.  Returns the
+// listener if it was found (ownership passes back to the caller), or
+// NULL otherwise.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  const std::vector<TestEventListener*>::iterator it =
+      std::find(listeners_.begin(), listeners_.end(), listener);
+  if (it == listeners_.end()) {
+    return NULL;
+  }
+  listeners_.erase(it);
+  return listener;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+
+// *Start events are forwarded in registration order; the matching *End
+// events are forwarded in reverse order, so listeners' start/end pairs
+// nest like a stack.
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+// Forwards OnTestIterationStart to the listeners in registration order.
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (!forwarding_enabled_) {
+    return;
+  }
+  typedef std::vector<TestEventListener*>::const_iterator Iter;
+  for (Iter it = listeners_.begin(); it != listeners_.end(); ++it) {
+    (*it)->OnTestIterationStart(unit_test, iteration);
+  }
+}
+
+// Forwards OnTestIterationEnd to the listeners in reverse order.
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (!forwarding_enabled_) {
+    return;
+  }
+  for (size_t i = listeners_.size(); i > 0; --i) {
+    listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration);
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  // output_file must be a non-empty path; the constructor aborts otherwise.
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  // Writes the complete XML report for the iteration to output_file_.
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  // NOTE(review): with a signed char, bytes >= 0x80 compare as negative and
+  // are therefore treated as invalid, so multi-byte UTF-8 sequences get
+  // stripped from the report -- confirm whether non-ASCII output matters.
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_case_name,
+                                const TestInfo& test_info);
+
+  // Prints an XML representation of a TestCase object
+  static void PrintXmlTestCase(::std::ostream* stream,
+                               const TestCase& test_case);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream* stream,
+                               const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the std::string is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter writing to output_file.
+// Aborts the program when the path is empty, since no report could be
+// produced.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  // std::string::c_str() can never return NULL, so the original
+  // "output_file_.c_str() == NULL" test was dead code; an empty path is
+  // the only invalid value reachable here.
+  if (output_file_.empty()) {
+    fprintf(stderr, "XML output file may not be null\n");
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Called after the unit test ends.  Renders the whole report in memory,
+// creates the output file's parent directories if needed, and writes the
+// XML in one shot.  Aborts the process if the file cannot be opened.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* xmlout = NULL;
+  FilePath output_file(output_file_);
+  FilePath output_dir(output_file.RemoveFileName());
+
+  // Only attempt to open the file once its directory exists.
+  if (output_dir.CreateDirectoriesRecursively()) {
+    xmlout = posix::FOpen(output_file_.c_str(), "w");
+  }
+  if (xmlout == NULL) {
+    // TODO(wan): report the reason of the failure.
+    //
+    // We don't do it for now as:
+    //
+    //   1. There is no urgent need for it.
+    //   2. It's a bit involved to make the errno variable thread-safe on
+    //      all three operating systems (Linux, Windows, and Mac OS).
+    //   3. To interpret the meaning of errno in a thread-safe way,
+    //      we need the strerror_r() function, which is not available on
+    //      Windows.
+    fprintf(stderr,
+            "Unable to open file \"%s\"\n",
+            output_file_.c_str());
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+  // Build the document in a stringstream first, then emit it as one write.
+  std::stringstream stream;
+  PrintXmlUnitTest(&stream, unit_test);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str.  If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+// TODO(wan): It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      // The escape replacements below had been destroyed by HTML-entity
+      // decoding (e.g. the '<' case emitted a literal "<", and the '"'
+      // case was the ill-formed token sequence  m << """; ).  Restored to
+      // the five predefined XML entities.
+      case '<':
+        m << "&lt;";
+        break;
+      case '>':
+        m << "&gt;";
+        break;
+      case '&':
+        m << "&amp;";
+        break;
+      case '\'':
+        // Apostrophes only need escaping inside attribute values.
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        // Double quotes only need escaping inside attribute values.
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(ch)) {
+          if (is_attribute && IsNormalizableWhitespace(ch))
+            // Keep tab/LF/CR in attributes as character references so XML
+            // parsers do not normalize them to plain spaces.
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        // Characters invalid in XML 1.0 are silently dropped.
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Invalid characters are simply dropped; an alternative would be to
+// replace each with a placeholder such as '.' or '?'.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string result;
+  result.reserve(str.size());
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    if (IsValidXmlCharacter(ch))
+      result += ch;
+  }
+  return result;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds (e.g. 1500 -> "1.5").
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  const double seconds = ms / 1000.0;
+  ::std::stringstream ss;
+  ss << seconds;
+  return ss.str();
+}
+
+// Converts seconds since the epoch into a broken-down local time in *out,
+// using whichever thread-safe localtime variant the platform provides.
+// Returns false when the conversion fails.
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
+#if defined(_MSC_VER)
+  // MSVC provides localtime_s (note the reversed argument order).
+  return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == NULL)
+    return false;
+  *out = *tm_ptr;
+  return true;
+#else
+  // POSIX: use the reentrant localtime_r.
+  return localtime_r(&seconds, out) != NULL;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format ("YYYY-MM-DDThh:mm:ss"), without timezone information.
+// Returns an empty string when local-time conversion fails.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // Every field except the year is zero-padded to two digits.
+  ::std::ostringstream iso8601;
+  iso8601 << (time_struct.tm_year + 1900) << '-' << std::setfill('0')
+          << std::setw(2) << (time_struct.tm_mon + 1) << '-'
+          << std::setw(2) << time_struct.tm_mday << 'T'
+          << std::setw(2) << time_struct.tm_hour << ':'
+          << std::setw(2) << time_struct.tm_min << ':'
+          << std::setw(2) << time_struct.tm_sec;
+  return iso8601.str();
+}
+
+// Streams an XML CDATA section.  A literal "]]>" inside the data would
+// terminate the section early, so each occurrence is split across two
+// adjacent CDATA sections.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  *stream << "<![CDATA[";
+  const char* remaining = data;
+  const char* terminator;
+  while ((terminator = strstr(remaining, "]]>")) != NULL) {
+    stream->write(remaining,
+                  static_cast<std::streamsize>(terminator - remaining));
+    // Close the current section on "]]>" and reopen a new one.
+    *stream << "]]>]]><![CDATA[";
+    remaining = terminator + 3;  // Skip past the "]]>" just handled.
+  }
+  *stream << remaining << "]]>";
+}
+
+// Verifies that the given attribute is allowed for the given element
+// (failing a GTEST_CHECK_ otherwise) and streams it as
+// ` name="escaped-value"`, including the leading space.
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value) {
+  // The set of legal attributes per element is fixed; rejecting unknown
+  // ones keeps the emitted XML schema-stable.
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object as a <testcase>
+// element: name/value_param/type_param/status/time/classname attributes
+// plus one <failure> child per failed test part.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestcase = "testcase";
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  if (test_info.value_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
+  }
+
+  OutputXmlAttribute(stream, kTestcase, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+  *stream << TestPropertiesAsXmlAttributes(result);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1) {
+        // Close the <testcase> start tag before the first <failure> child.
+        *stream << ">\n";
+      }
+      const string location = internal::FormatCompilerIndependentFileLocation(
+          part.file_name(), part.line_number());
+      const string summary = location + "\n" + part.summary();
+      // Pass the std::string directly; the original round-tripped it
+      // through c_str(), constructing a needless temporary copy.
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(summary)
+              << "\" type=\"\">";
+      const string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0)
+    *stream << " />\n";  // No failures: emit a self-closing element.
+  else
+    *stream << "    </testcase>\n";
+}
+
+// Prints an XML representation of a TestCase object as one <testsuite>
+// element with summary attributes and a <testcase> child per reportable
+// test.
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+                                                const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_case.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "failures",
+                     StreamableToString(test_case.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "disabled",
+      StreamableToString(test_case.reportable_disabled_test_count()));
+  // Hard-coded to "0"; nothing in this printer records errors separately
+  // from failures.
+  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
+          << ">\n";
+
+  // Only tests marked reportable appear in the output.
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    if (test_case.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to output stream out: the root
+// <testsuites> element with aggregate attributes and one <testsuite>
+// child per test case that has reportable tests.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+  // The shuffle seed is only recorded when shuffling was enabled.
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  // Skip test cases that contribute nothing to the report.
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
+      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as
+// space-delimited XML attributes (key="escaped value").  The string is
+// empty when there are no properties; otherwise each attribute is
+// preceded by a space so it can be appended after prior attributes.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  const int property_count = result.test_property_count();
+  for (int i = 0; i < property_count; ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "=\""
+               << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+  string encoded;
+  encoded.reserve(strlen(str) + 1);
+  for (const char* p = str; *p != '\0'; ++p) {
+    const char ch = *p;
+    // Only the four characters significant to the streaming protocol need
+    // escaping; everything else passes through unchanged.
+    if (ch == '%' || ch == '=' || ch == '&' || ch == '\n') {
+      encoded.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+    } else {
+      encoded.push_back(ch);
+    }
+  }
+  return encoded;
+}
+
+// Opens a TCP connection to host_name_:port_num_ and stores the socket
+// descriptor in sockfd_.  On failure sockfd_ remains -1 and a warning is
+// logged.  Must not be called while a connection is already open.
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(
+        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        // Connection refused/failed for this address: discard the socket
+        // and try the next candidate.
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  freeaddrinfo(servinfo);  // all done with this structure
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class StreamingListener
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+ScopedTrace::ScopedTrace(const char* file, int line, const Message& message)
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  // Capture the call site and the rendered message into a TraceInfo record.
+  TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message = message.GetString();
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor, restoring the previous trace stack.
+ScopedTrace::~ScopedTrace()
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+
+// class OsStackTraceGetter
+
+// Returns the current OS stack trace as an std::string.  Parameters:
+//
+//   max_depth  - the maximum number of stack frames to be included
+//                in the trace.
+//   skip_count - the number of top frames to be skipped; doesn't count
+//                against max_depth.
+//
+// This default implementation is a stub: it ignores both parameters and
+// always returns an empty trace.
+string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */,
+                                             int /* skip_count */)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  return "";
+}
+
+// A no-op in this stub implementation.
+void OsStackTraceGetter::UponLeavingGTest()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+}
+
+// Marker text substituted for stack frames elided from a trace.
+const char* const
+OsStackTraceGetter::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(premature_exit_filepath) {
+    // If a path to the premature-exit file is specified...
+    if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') {
+      // create the file with a single "0" character in it.  I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      // FOpen may fail (e.g. unwritable directory); the original wrote to
+      // the stream unconditionally, dereferencing NULL in that case.
+      if (pfile != NULL) {
+        fwrite("0", 1, 1, pfile);
+        fclose(pfile);
+      }
+    }
+  }
+
+  ~ScopedPrematureExitFile() {
+    // Best-effort removal; a NULL/empty path means no file was created.
+    if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') {
+      remove(premature_exit_filepath_);
+    }
+  }
+
+ private:
+  // Not owned; must outlive this object.
+  const char* const premature_exit_filepath_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+// Constructs an empty listener list with a fresh repeater and no default
+// result printer or XML generator installed yet.
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(NULL),
+      default_xml_generator_(NULL) {
+}
+
+// Destroys the repeater, which in turn deletes every listener it owns.
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Adds listener to the end of the list.  Ownership passes to the
+// TestEventListeners object (via the repeater) unless the listener is
+// later removed with Release().
+// NOTE(review): the comment previously attached here described
+// default_result_printer() behavior and appears to have been misplaced.
+void TestEventListeners::Append(TestEventListener* listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it.  It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  // Clear any default-role pointer that refers to the released listener so
+  // no dangling reference is kept.
+  if (listener == default_result_printer_)
+    default_result_printer_ = NULL;
+  else if (listener == default_xml_generator_)
+    default_xml_generator_ = NULL;
+  return repeater_->Release(listener);
+}
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  if (default_result_printer_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    // Release() returns NULL when default_result_printer_ is NULL, and
+    // deleting NULL is a no-op, so this is safe on first use.
+    delete Release(default_result_printer_);
+    default_result_printer_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener.  The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  if (default_xml_generator_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_xml_generator_);
+    default_xml_generator_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+// Disables event forwarding entirely; per TestEventRepeater's contract,
+// this is done in death-test child processes.
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+  // When compiled with MSVC 7.1 in optimized mode, destroying the
+  // UnitTest object upon exiting the program messes up the exit code,
+  // causing successful tests to appear failed.  We have to use a
+  // different implementation in this case to bypass the compiler bug.
+  // This implementation makes the compiler happy, at the cost of
+  // leaking the UnitTest object.
+
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+  // Deliberately leaked: the object is never destroyed, so the exit-code
+  // corruption described above cannot trigger.
+  static UnitTest* const instance = new UnitTest;
+  return instance;
+#else
+  static UnitTest instance;
+  return &instance;
+#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+}
+
+// The accessors below simply delegate to the UnitTestImpl object returned
+// by impl().
+
+// Gets the number of successful test cases.
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_case_count();
+}
+
+// Gets the number of failed test cases.
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_case_count();
+}
+
+// Gets the number of all test cases.
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_case_count();
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_case_to_run_count();
+}
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+    return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true iff the unit test passed (i.e. all test cases passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true iff the unit test failed (i.e. some test case failed
+// or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+const TestCase* UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test cases.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+  return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+  return impl()->GetMutableTestCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() {
+  return *impl()->listeners();
+}
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == NULL) {
+    return NULL;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+// from SetUpTestCase or TearDownTestCase, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process ?
+      NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_HAS_SEH
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif  // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    //
+    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+    // Users of prior VC versions shall suffer the agony and pain of
+    // clicking through the countless debug dialogs.
+    // TODO(vladl at google.com): find a way to suppress the abort dialog() in the
+    // debug mode when compiled with VC 7.1 or lower.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+# endif
+  }
+#endif  // GTEST_HAS_SEH
+
+  return internal::HandleExceptionsInMethodIfSupported(
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestCase object for the test that's currently running,
+// or NULL if no test is running.
+const TestCase* UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_case();
+}
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo* UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+#if GTEST_HAS_PARAM_TEST
+// Returns ParameterizedTestCaseRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestCaseRegistry&
+    UnitTest::parameterized_test_registry()
+        GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() {
+  delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+      GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+      GTEST_DISABLE_MSC_WARNINGS_POP_()
+      global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+#if GTEST_HAS_PARAM_TEST
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+#endif  // GTEST_HAS_PARAM_TEST
+      last_death_test_case_(-1),
+      current_test_case_(NULL),
+      current_test_info_(NULL),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(NULL),
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),  // Will be reseeded before first use.
+      start_timestamp_(0),
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestCase.
+  ForEach(test_cases_, internal::Delete<TestCase>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in a
+// context of a test, to current test case's ad_hoc_test_result when invoke
+// from SetUpTestCase/TearDownTestCase, or to the global property set
+// otherwise.  If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+  std::string xml_element;
+  TestResult* test_result;  // TestResult appropriate for property recording.
+
+  if (current_test_info_ != NULL) {
+    xml_element = "testcase";
+    test_result = &(current_test_info_->result_);
+  } else if (current_test_case_ != NULL) {
+    xml_element = "testsuite";
+    test_result = &(current_test_case_->ad_hoc_test_result_);
+  } else {
+    xml_element = "testsuites";
+    test_result = &ad_hoc_test_result_;
+  }
+  test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != NULL)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml") {
+    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format != "") {
+    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
+           output_format.c_str());
+    fflush(stdout);
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string& target = GTEST_FLAG(stream_result_to);
+  if (!target.empty()) {
+    const size_t pos = target.find(':');
+    if (pos != std::string::npos) {
+      listeners()->Append(new StreamingListener(target.substr(0, pos),
+                                                target.substr(pos+1)));
+    } else {
+      printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
+             target.c_str());
+      fflush(stdout);
+    }
+  }
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests.  Since this function can be
+// called more than once, it has to be idempotent.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Ensures that this function does not execute more than once.
+  if (!post_flag_parse_init_performed_) {
+    post_flag_parse_init_performed_ = true;
+
+#if GTEST_HAS_DEATH_TEST
+    InitDeathTestSubprocessControlInfo();
+    SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+    // Registers parameterized tests. This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output. This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+  }
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Constructor.
+  explicit TestCaseNameIs(const std::string& name)
+      : name_(name) {}
+
+  // Returns true iff the name of test_case matches name_.
+  bool operator()(const TestCase* test_case) const {
+    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestCase with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Can we find a TestCase with the given name?
+  const std::vector<TestCase*>::const_iterator test_case =
+      std::find_if(test_cases_.begin(), test_cases_.end(),
+                   TestCaseNameIs(test_case_name));
+
+  if (test_case != test_cases_.end())
+    return *test_case;
+
+  // No.  Let's create one.
+  TestCase* const new_test_case =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test case?
+  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                               kDeathTestCaseFilter)) {
+    // Yes.  Inserts the test case after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+                       new_test_case);
+  } else {
+    // No.  Appends to the end of the list.
+    test_cases_.push_back(new_test_case);
+  }
+
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // Makes sure InitGoogleTest() was called.
+  if (!GTestIsInitialized()) {
+    printf("%s",
+           "\nThis test program did NOT call ::testing::InitGoogleTest "
+           "before calling RUN_ALL_TESTS().  Please fix it.\n");
+    return false;
+  }
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag)
+    return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True iff we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run = FilterTests(should_shard
+                                              ? HONOR_SHARDING_PROTOCOL
+                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ = GTEST_FLAG(shuffle) ?
+      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True iff at least one test has failed.
+  bool failed = false;
+
+  TestEventListener* repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool forever = repeat < 0;
+  for (int i = 0; forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test cases and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(random_seed_);
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test case if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure during global
+      // set-up.
+      if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_case_count();
+             test_index++) {
+          GetMutableTestCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Gets the result and clears it.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  return !failed;
+}
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded() {
+  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != NULL) {
+    FILE* const file = posix::FOpen(test_shard_file, "w");
+    if (file == NULL) {
+      ColoredPrintf(COLOR_RED,
+                    "Could not write to the test shard status file \"%s\" "
+                    "specified by the %s environment variable.\n",
+                    test_shard_file, kTestShardStatusFile);
+      fflush(stdout);
+      exit(EXIT_FAILURE);
+    }
+    fclose(file);
+  }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+bool ShouldShard(const char* total_shards_env,
+                 const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestShardIndex << " = " << shard_index
+      << ", but have left " << kTestTotalShards << " unset.\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestTotalShards << " = " << total_shards
+      << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg = Message()
+      << "Invalid environment variables: we require 0 <= "
+      << kTestShardIndex << " < " << kTestTotalShards
+      << ", but you have " << kTestShardIndex << "=" << shard_index
+      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+  const char* str_val = posix::GetEnv(var);
+  if (str_val == NULL) {
+    return default_val;
+  }
+
+  Int32 result;
+  if (!ParseInt32(Message() << "The value of environment variable " << var,
+                  str_val, &result)) {
+    exit(EXIT_FAILURE);
+  }
+  return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return (test_id % total_shards) == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestCase and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
+// Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    TestCase* const test_case = test_cases_[i];
+    const std::string &test_case_name = test_case->name();
+    test_case->set_should_run(false);
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      TestInfo* const test_info = test_case->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test case name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled =
+          internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                                   kDisableTestFilter) ||
+          internal::UnitTestOptions::MatchesFilter(test_name,
+                                                   kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
+                                                       test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_selected = is_runnable &&
+          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
+           ShouldRunTestOnShard(total_shards, shard_index,
+                                num_runnable_tests));
+
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_case->set_should_run(test_case->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the given C-string on a single line by replacing all '\n'
+// characters with string "\\n".  If the output takes more than
+// max_length characters, only prints the first max_length characters
+// and "...".
+static void PrintOnOneLine(const char* str, int max_length) {
+  if (str != NULL) {
+    for (int i = 0; *str != '\0'; ++str) {
+      if (i >= max_length) {
+        printf("...");
+        break;
+      }
+      if (*str == '\n') {
+        printf("\\n");
+        i += 2;
+      } else {
+        printf("%c", *str);
+        ++i;
+      }
+    }
+  }
+}
+
+// Prints the names of the tests matching the user-specified filter flag.
void UnitTestImpl::ListTestsMatchingFilter() {
  // Print at most this many characters for each type/value parameter.
  const int kMaxParamLength = 250;

  for (size_t i = 0; i < test_cases_.size(); i++) {
    const TestCase* const test_case = test_cases_[i];
    // The case name is printed lazily so that a test case none of whose
    // tests match the filter produces no output at all.
    bool printed_test_case_name = false;

    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
      const TestInfo* const test_info =
          test_case->test_info_list()[j];
      if (test_info->matches_filter_) {
        if (!printed_test_case_name) {
          printed_test_case_name = true;
          printf("%s.", test_case->name());
          if (test_case->type_param() != NULL) {
            printf("  # %s = ", kTypeParamLabel);
            // We print the type parameter on a single line to make
            // the output easy to parse by a program.
            PrintOnOneLine(test_case->type_param(), kMaxParamLength);
          }
          printf("\n");
        }
        printf("  %s", test_info->name());
        if (test_info->value_param() != NULL) {
          printf("  # %s = ", kValueParamLabel);
          // We print the value parameter on a single line to make the
          // output easy to parse by a program.
          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
        }
        printf("\n");
      }
    }
  }
  // The listing is commonly consumed by tools; make sure it is flushed.
  fflush(stdout);
}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == NULL) {
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the TestResult for the test that's currently running, or
+// the TestResult for the ad hoc test if no test is running.
+TestResult* UnitTestImpl::current_test_result() {
+  return current_test_info_ ?
+      &(current_test_info_->result_) : &ad_hoc_test_result_;
+}
+
+// Shuffles all test cases, and the tests within each test case,
+// making sure that death tests are still run first.
void UnitTestImpl::ShuffleTests() {
  // Shuffles the death test cases.  The case index vector keeps all death
  // test cases in [0, last_death_test_case_], and the two disjoint
  // ShuffleRange calls preserve that partition so death tests still run
  // before the regular test cases.
  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);

  // Shuffles the non-death test cases.
  ShuffleRange(random(), last_death_test_case_ + 1,
               static_cast<int>(test_cases_.size()), &test_case_indices_);

  // Shuffles the tests inside each test case.
  for (size_t i = 0; i < test_cases_.size(); i++) {
    test_cases_[i]->ShuffleTests(random());
  }
}
+
+// Restores the test cases and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    // Unshuffles the tests in each test case.
+    test_cases_[i]->UnshuffleTests();
+    // Resets the index of each test case.
+    test_case_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
                                            int skip_count) {
  // The unit_test argument is unused: the singleton implementation
  // obtained via GetUnitTestImpl() is consulted instead.
  // We pass skip_count + 1 to skip this wrapper function in addition
  // to what the user really wants to skip.
  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
namespace {
// A type no other code can name; thrown (never actually) by AlwaysTrue().
class ClassUniqueToAlwaysTrue {};
}  // namespace

// Identity function on bool.  Routed through a real function call so the
// compiler cannot constant-fold the condition in AlwaysTrue() below.
bool IsTrue(bool condition) { return condition; }

bool AlwaysTrue() {
#if GTEST_HAS_EXCEPTIONS
  // Never taken at run time, but makes the compiler believe this function
  // may throw, which suppresses unreachable-code warnings at call sites.
  if (IsTrue(false))
    throw ClassUniqueToAlwaysTrue();
#endif  // GTEST_HAS_EXCEPTIONS
  return true;
}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
// Advances *pstr past `prefix` when it starts with it.  See the contract
// in the comment above: no argument may be NULL.
bool SkipPrefix(const char* prefix, const char** pstr) {
  const size_t prefix_len = strlen(prefix);
  if (strncmp(*pstr, prefix, prefix_len) != 0)
    return false;
  *pstr += prefix_len;
  return true;
}
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+const char* ParseFlagValue(const char* str,
+                           const char* flag,
+                           bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == NULL || flag == NULL) return NULL;
+
+  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return NULL;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag,
+                    value_str, value);
+}
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  return (SkipPrefix("--", &str) ||
+          SkipPrefix("-", &str) ||
+          SkipPrefix("/", &str)) &&
+         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
+
+// Prints a string containing code-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+// TODO(wan@google.com): Write tests for this once we add stdout
+// capturing to Google Test.
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = COLOR_DEFAULT;  // The current color.
+
+  // Conceptually, we split the string into segments divided by escape
+  // sequences.  Then we print one segment at a time.  At the end of
+  // each iteration, the str pointer advances to the beginning of the
+  // next segment.
+  for (;;) {
+    const char* p = strchr(str, '@');
+    if (p == NULL) {
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", std::string(str, p).c_str());
+
+    const char ch = p[1];
+    str = p + 2;
+    if (ch == '@') {
+      ColoredPrintf(color, "@");
+    } else if (ch == 'D') {
+      color = COLOR_DEFAULT;
+    } else if (ch == 'R') {
+      color = COLOR_RED;
+    } else if (ch == 'G') {
+      color = COLOR_GREEN;
+    } else if (ch == 'Y') {
+      color = COLOR_YELLOW;
+    } else {
+      --str;
+    }
+  }
+}
+
+static const char kColorEncodedHelpMessage[] =
+"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
+"following command line flags to control its behavior:\n"
+"\n"
+"Test Selection:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "list_tests at D\n"
+"      List the names of all tests instead of running them. The name of\n"
+"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+    "[@G- at YNEGATIVE_PATTERNS]@D\n"
+"      Run only the tests whose name matches one of the positive patterns but\n"
+"      none of the negative patterns. '?' matches any single character; '*'\n"
+"      matches any substring; ':' separates two patterns.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests at D\n"
+"      Run all disabled tests too.\n"
+"\n"
+"Test Execution:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
+"      Run the tests repeatedly; use a negative count to repeat forever.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "shuffle at D\n"
+"      Randomize tests' orders on every iteration.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
+"      Random number seed to use for shuffling test orders (between 1 and\n"
+"      99999, or 0 to use a seed based on the current time).\n"
+"\n"
+"Test Output:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes at Y|@Gno at Y|@Gauto at Y)@D\n"
+"      Enable/disable colored output. The default is @Gauto at D.\n"
+"  - at G-" GTEST_FLAG_PREFIX_ "print_time=0 at D\n"
+"      Don't print the elapsed time of each test.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "output=xml at Y[@G:@YDIRECTORY_PATH at G"
+    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
+"      Generate an XML report in the given directory or with the given file\n"
+"      name. @YFILE_PATH at D defaults to @Gtest_details.xml at D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST at G:@YPORT at D\n"
+"      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast at Y|@Gthreadsafe at Y)@D\n"
+"      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure at D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure at D\n"
+"      Turn assertion failures into C++ exceptions.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0 at D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests at D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no at D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR at D environment variable to @Gno at D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
template <typename CharType>
void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
  // argv[0] is the program name and is never a flag; start at index 1.
  for (int i = 1; i < *argc; i++) {
    const std::string arg_string = StreamableToString(argv[i]);
    const char* const arg = arg_string.c_str();

    using internal::ParseBoolFlag;
    using internal::ParseInt32Flag;
    using internal::ParseStringFlag;

    // Do we see a Google Test flag?  Each Parse*Flag call both matches
    // the argument and, on success, stores the parsed value directly
    // into the corresponding flag variable; the chain stops at the
    // first match.
    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
                      &GTEST_FLAG(also_run_disabled_tests)) ||
        ParseBoolFlag(arg, kBreakOnFailureFlag,
                      &GTEST_FLAG(break_on_failure)) ||
        ParseBoolFlag(arg, kCatchExceptionsFlag,
                      &GTEST_FLAG(catch_exceptions)) ||
        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
        ParseStringFlag(arg, kDeathTestStyleFlag,
                        &GTEST_FLAG(death_test_style)) ||
        ParseBoolFlag(arg, kDeathTestUseFork,
                      &GTEST_FLAG(death_test_use_fork)) ||
        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
        ParseStringFlag(arg, kInternalRunDeathTestFlag,
                        &GTEST_FLAG(internal_run_death_test)) ||
        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
        ParseInt32Flag(arg, kStackTraceDepthFlag,
                       &GTEST_FLAG(stack_trace_depth)) ||
        ParseStringFlag(arg, kStreamResultToFlag,
                        &GTEST_FLAG(stream_result_to)) ||
        ParseBoolFlag(arg, kThrowOnFailureFlag,
                      &GTEST_FLAG(throw_on_failure))
        ) {
      // Yes.  Shift the remainder of the argv list left by one.  Note
      // that argv has (*argc + 1) elements, the last one always being
      // NULL.  The following loop moves the trailing NULL element as
      // well.
      for (int j = i; j != *argc; j++) {
        argv[j] = argv[j + 1];
      }

      // Decrements the argument count.
      (*argc)--;

      // We also need to decrement the iterator as we just removed
      // an element.
      i--;
    } else if (arg_string == "--help" || arg_string == "-h" ||
               arg_string == "-?" || arg_string == "/?" ||
               HasGoogleTestFlagPrefix(arg)) {
      // Both help flag and unrecognized Google Test flags (excluding
      // internal ones) trigger help display.
      g_help_flag = true;
    }
  }

  if (g_help_flag) {
    // We print the help here instead of in RUN_ALL_TESTS(), as the
    // latter may not be called at all if the user is using Google
    // Test with another testing framework.
    PrintColorEncoded(kColorEncodedHelpMessage);
  }
}
+
// Parses the command line for Google Test flags, without initializing
// other parts of Google Test.  These are the two non-template entry
// points over ParseGoogleTestFlagsOnlyImpl, for narrow and wide
// (Windows UNICODE) command lines respectively.
void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
  ParseGoogleTestFlagsOnlyImpl(argc, argv);
}
void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
  ParseGoogleTestFlagsOnlyImpl(argc, argv);
}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
template <typename CharType>
void InitGoogleTestImpl(int* argc, CharType** argv) {
  g_init_gtest_count++;

  // We don't want to run the initialization code twice.
  if (g_init_gtest_count != 1) return;

  // An empty argv is unusable; leave all flags at their defaults.
  if (*argc <= 0) return;

  // Remember the program path before flag parsing rewrites argv.
  // NOTE(review): g_executable_path looks like a local addition over
  // stock Google Test -- confirm where it is declared.
  internal::g_executable_path = internal::StreamableToString(argv[0]);

#if GTEST_HAS_DEATH_TEST

  // Threadsafe-style death tests re-execute the binary, so keep a copy
  // of the complete original command line.
  g_argvs.clear();
  for (int i = 0; i != *argc; i++) {
    g_argvs.push_back(StreamableToString(argv[i]));
  }

#endif  // GTEST_HAS_DEATH_TEST

  ParseGoogleTestFlagsOnly(argc, argv);
  GetUnitTestImpl()->PostFlagParsingInit();
}
+
+}  // namespace internal
+
// Initializes Google Test.  This must be called before calling
// RUN_ALL_TESTS().  In particular, it parses a command line for the
// flags that Google Test recognizes.  Whenever a Google Test flag is
// seen, it is removed from argv, and *argc is decremented.
//
// No value is returned.  Instead, the Google Test flag variables are
// updated.
//
// Calling the function for the second time has no user-visible effect.
void InitGoogleTest(int* argc, char** argv) {
  internal::InitGoogleTestImpl(argc, argv);
}

// This overloaded version can be used in Windows programs compiled in
// UNICODE mode, where the command line arrives as wide strings.
void InitGoogleTest(int* argc, wchar_t** argv) {
  internal::InitGoogleTestImpl(argc, argv);
}
+
+}  // namespace testing
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
+//
+// This file implements death tests.
+
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
// Constants.

// The default death test style.
static const char kDefaultDeathTestStyle[] = "fast";

// --gtest_death_test_style: chooses how the child process for a death
// test is created (see the description string below for the semantics of
// "threadsafe" vs "fast").  Overridable via the environment.
GTEST_DEFINE_string_(
    death_test_style,
    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
    "Indicates how to run a death test in a forked child process: "
    "\"threadsafe\" (child process re-executes the test binary "
    "from the beginning, running only the specific death test) or "
    "\"fast\" (child process runs the death test immediately "
    "after forking).");

// --gtest_death_test_use_fork: forces fork() over clone(), mainly for
// running under tools like valgrind (see the description string below).
GTEST_DEFINE_bool_(
    death_test_use_fork,
    internal::BoolFromGTestEnv("death_test_use_fork", false),
    "Instructs to use fork()/_exit() instead of clone() in death tests. "
    "Ignored and always uses fork() on POSIX systems where clone() is not "
    "implemented. Useful when running under valgrind or similar tools if "
    "those do not support clone(). Valgrind 3.3.1 will just fail if "
    "it sees an unsupported combination of clone() flags. "
    "It is not recommended to use this flag w/o valgrind though it will "
    "work in 99% of the cases. Once valgrind is fixed, this flag will "
    "most likely be removed.");

namespace internal {
GTEST_DEFINE_string_(
    internal_run_death_test, "",
    "Indicates the file, line number, temporal index of "
    "the single death test to run, and a file descriptor to "
    "which a success code may be sent, all separated by "
    "the '|' characters.  This flag is specified if and only if the current "
    "process is a sub-process launched for running a thread-safe "
    "death test.  FOR INTERNAL USE ONLY.");
}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+static bool g_in_fast_death_test_child = false;
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+bool InDeathTestChild() {
+# if GTEST_OS_WINDOWS
+
+  // On Windows, death tests are thread-safe regardless of the value of the
+  // death_test_style flag.
+  return !GTEST_FLAG(internal_run_death_test).empty();
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe")
+    return !GTEST_FLAG(internal_run_death_test).empty();
+  else
+    return g_in_fast_death_test_child;
+#endif
+}
+
+}  // namespace internal
+
// ExitedWithCode constructor.  Remembers the exit code the predicate
// will match against.
ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
}

// ExitedWithCode function-call operator.  Returns true iff exit_status
// describes a process that terminated normally with the expected code.
bool ExitedWithCode::operator()(int exit_status) const {
# if GTEST_OS_WINDOWS

  // On Windows the status from the wait is the exit code itself.
  return exit_status == exit_code_;

# else

  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;

# endif  // GTEST_OS_WINDOWS
}
+
# if !GTEST_OS_WINDOWS
// KilledBySignal constructor.  Remembers the signal number the predicate
// will match against.
KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
}

// KilledBySignal function-call operator.  Returns true iff exit_status
// describes a process terminated by the expected signal.
bool KilledBySignal::operator()(int exit_status) const {
  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
}
# endif  // !GTEST_OS_WINDOWS
+
+namespace internal {
+
+// Utilities needed for death tests.
+
// Generates a textual description of a given exit code, in the format
// specified by wait(2).
static std::string ExitSummary(int exit_code) {
  Message m;

# if GTEST_OS_WINDOWS

  m << "Exited with exit status " << exit_code;

# else

  if (WIFEXITED(exit_code)) {
    m << "Exited with exit status " << WEXITSTATUS(exit_code);
  } else if (WIFSIGNALED(exit_code)) {
    m << "Terminated by signal " << WTERMSIG(exit_code);
  }
#  ifdef WCOREDUMP
  // WCOREDUMP is a non-POSIX extension; mention a core dump only on
  // platforms that can report one.
  if (WCOREDUMP(exit_code)) {
    m << " (core dumped)";
  }
#  endif
# endif  // GTEST_OS_WINDOWS

  return m.GetString();
}
+
// Returns true if exit_status describes a process that was terminated
// by a signal, or exited normally with a nonzero exit code.
bool ExitedUnsuccessfully(int exit_status) {
  // "Unsuccessful" is anything other than a clean normal exit with code 0.
  return !ExitedWithCode(0)(exit_status);
}
+
+# if !GTEST_OS_WINDOWS
+// Generates a textual failure message when a death test finds more than
+// one thread running, or cannot determine the number of threads, prior
+// to executing the given statement.  It is the responsibility of the
+// caller not to pass a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+  Message msg;
+  msg << "Death tests use fork(), which is unsafe particularly"
+      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count == 0)
+    msg << "couldn't detect the number of threads.";
+  else
+    msg << "detected " << thread_count << " threads.";
+  return msg.GetString();
+}
+# endif  // !GTEST_OS_WINDOWS
+
// Flag characters for reporting a death test that did not die.  One of
// these single-byte codes is sent to the parent over the status pipe.
static const char kDeathTestLived = 'L';
static const char kDeathTestReturned = 'R';
static const char kDeathTestThrew = 'T';
static const char kDeathTestInternalError = 'I';

// An enumeration describing all of the possible ways that a death test can
// conclude.  DIED means that the process died while executing the test
// code; LIVED means that process lived beyond the end of the test code;
// RETURNED means that the test statement attempted to execute a return
// statement, which is not allowed; THREW means that the test statement
// returned control by throwing an exception.  IN_PROGRESS means the test
// has not yet concluded.
// TODO(vladl@google.com): Unify names and possibly values for
// AbortReason, DeathTestOutcome, and flag characters above.
enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process.  Otherwise, the
+// message is simply printed to stderr.  In either case, the program
+// then exits with status 1.
void DeathTestAbort(const std::string& message) {
  // On a POSIX system, this function may be called from a threadsafe-style
  // death test child process, which operates on a very small stack.  Use
  // the heap for any additional non-minuscule memory requirements.
  const InternalRunDeathTestFlag* const flag =
      GetUnitTestImpl()->internal_run_death_test_flag();
  if (flag != NULL) {
    // We are in a death test child: report the failure to the parent over
    // the status pipe, tagged with the internal-error flag character.
    // NOTE(review): FDOpen's result is not NULL-checked -- confirm that
    // flag->write_fd() is always a valid descriptor here.
    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
    fputc(kDeathTestInternalError, parent);
    fprintf(parent, "%s", message.c_str());
    fflush(parent);
    _exit(1);
  } else {
    // Not a death test child: print to stderr and abort the program.
    fprintf(stderr, "%s", message.c_str());
    fflush(stderr);
    posix::Abort();
  }
}
+
// A replacement for CHECK that calls DeathTestAbort if the assertion
// fails.  The do { ... } while (AlwaysFalse()) wrapper makes the macro
// behave like a single statement that requires a trailing semicolon.
# define GTEST_DEATH_TEST_CHECK_(expression) \
  do { \
    if (!::testing::internal::IsTrue(expression)) { \
      DeathTestAbort( \
          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
          + ::testing::internal::StreamableToString(__LINE__) + ": " \
          + #expression); \
    } \
  } while (::testing::internal::AlwaysFalse())

// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
// evaluating any system call that fulfills two conditions: it must return
// -1 on failure, and set errno to EINTR when it is interrupted and
// should be tried again.  The macro expands to a loop that repeatedly
// evaluates the expression as long as it evaluates to -1 and sets
// errno to EINTR.  If the expression evaluates to -1 but errno is
// something other than EINTR, DeathTestAbort is called.
# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
  do { \
    int gtest_retval; \
    do { \
      gtest_retval = (expression); \
    } while (gtest_retval == -1 && errno == EINTR); \
    if (gtest_retval == -1) { \
      DeathTestAbort( \
          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
          + ::testing::internal::StreamableToString(__LINE__) + ": " \
          + #expression + " != -1"); \
    } \
  } while (::testing::internal::AlwaysFalse())
+
+// Returns the message describing the last system error in errno.
+std::string GetLastErrnoDescription() {
+    return errno == 0 ? "" : posix::StrError(errno);
+}
+
// This is called from a death test parent process to read a failure
// message from the death test child process and log it with the FATAL
// severity. On Windows, the message is read from a pipe handle. On other
// platforms, it is read from a file descriptor.
static void FailFromInternalError(int fd) {
  Message error;
  char buffer[256];
  int num_read;

  do {
    // Read at most 255 bytes at a time, leaving room for the
    // terminating NUL appended below.
    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
      buffer[num_read] = '\0';
      error << buffer;
    }
  } while (num_read == -1 && errno == EINTR);  // Retry interrupted reads.

  if (num_read == 0) {
    // Clean EOF: the child's entire message was captured; log it.
    GTEST_LOG_(FATAL) << error.GetString();
  } else {
    // A real read error: report it along with the errno value.
    const int last_error = errno;
    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
                      << GetLastErrnoDescription() << " [" << last_error << "]";
  }
}
+
// Death test constructor.  Verifies that a death test is being created
// inside a TEST/TEST_F body (i.e. there is a current test); otherwise it
// aborts the program, since death tests cannot work there.
// (The previous comment claimed a running-death-test count is
// incremented here; no counter is touched in this body.)
DeathTest::DeathTest() {
  TestInfo* const info = GetUnitTestImpl()->current_test_info();
  if (info == NULL) {
    DeathTestAbort("Cannot run a death test outside of a TEST or "
                   "TEST_F construct");
  }
}
+
// Creates and returns a death test by dispatching to the currently
// installed death test factory -- a seam that lets tests of the death
// test machinery substitute their own factory.
bool DeathTest::Create(const char* statement, const RE* regex,
                       const char* file, int line, DeathTest** test) {
  return GetUnitTestImpl()->death_test_factory()->Create(
      statement, regex, file, line, test);
}
+
// Returns the message recorded by the most recent death test, e.g. a
// diagnostic explaining why it failed.
const char* DeathTest::LastMessage() {
  return last_death_test_message_.c_str();
}

// Records the message of the most recent death test.
void DeathTest::set_last_death_test_message(const std::string& message) {
  last_death_test_message_ = message;
}

// Storage for the last death test message, shared by all death tests.
std::string DeathTest::last_death_test_message_;
+
// Provides cross platform implementation for some death functionality.
class DeathTestImpl : public DeathTest {
 protected:
  DeathTestImpl(const char* a_statement, const RE* a_regex)
      : statement_(a_statement),
        regex_(a_regex),
        spawned_(false),
        status_(-1),
        outcome_(IN_PROGRESS),
        read_fd_(-1),
        write_fd_(-1) {}

  // read_fd_ is expected to be closed and cleared by a derived class.
  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }

  // Reports the given abort reason and terminates; implemented by the
  // platform-specific code elsewhere in this file.
  void Abort(AbortReason reason);
  // Decides whether the death test passed, given whether the child's
  // exit status was acceptable; implemented elsewhere in this file.
  virtual bool Passed(bool status_ok);

  // Trivial accessors/mutators for the state shared with the
  // platform-specific subclasses.
  const char* statement() const { return statement_; }
  const RE* regex() const { return regex_; }
  bool spawned() const { return spawned_; }
  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
  int status() const { return status_; }
  void set_status(int a_status) { status_ = a_status; }
  DeathTestOutcome outcome() const { return outcome_; }
  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
  int read_fd() const { return read_fd_; }
  void set_read_fd(int fd) { read_fd_ = fd; }
  int write_fd() const { return write_fd_; }
  void set_write_fd(int fd) { write_fd_ = fd; }

  // Called in the parent process only. Reads the result code of the death
  // test child process via a pipe, interprets it to set the outcome_
  // member, and closes read_fd_.  Outputs diagnostics and terminates in
  // case of unexpected codes.
  void ReadAndInterpretStatusByte();

 private:
  // The textual content of the code this object is testing.  This class
  // doesn't own this string and should not attempt to delete it.
  const char* const statement_;
  // The regular expression which test output must match.  DeathTestImpl
  // doesn't own this object and should not attempt to delete it.
  const RE* const regex_;
  // True if the death test child process has been successfully spawned.
  bool spawned_;
  // The exit status of the child process.
  int status_;
  // How the death test concluded.
  DeathTestOutcome outcome_;
  // Descriptor to the read end of the pipe to the child process.  It is
  // always -1 in the child process.  The child keeps its write end of the
  // pipe in write_fd_.
  int read_fd_;
  // Descriptor to the child's write end of the pipe to the parent process.
  // It is always -1 in the parent process.  The parent keeps its end of the
  // pipe in read_fd_.
  int write_fd_;
};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_.  Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);  // Retry on interruption.
+
+  if (bytes_read == 0) {
+    // EOF: the child wrote nothing before its write end closed, which
+    // means the statement died as expected.
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    // A single status byte identifies how the statement misbehaved.
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    // bytes_read == -1 with a non-EINTR errno: the read itself failed.
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  // The parent is done with the pipe; close it and restore the sentinel.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent process considers the death test to be a failure if
+  // it finds any data in our pipe.  So, here we write a single flag byte
+  // to the pipe, then exit.  Any reason other than "didn't die" or
+  // "threw an exception" maps to kDeathTestReturned.
+  const char status_ch =
+      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
+      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // We are leaking the descriptor here because on some platforms (i.e.,
+  // when built as Windows DLL), destructors of global objects will still
+  // run after calling _exit(). On such systems, write_fd_ will be
+  // indirectly closed from the destructor of UnitTestImpl, causing double
+  // close if it is also closed here. On debug configurations, double close
+  // may assert. As there are no in-process buffers to flush here, we are
+  // relying on the OS to close the descriptor after the process terminates
+  // when the destructors are not run.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  ::std::string ret;
+  // Prefix every line -- including a final line with no trailing
+  // newline -- with a "[  DEATH   ] " marker.
+  for (size_t at = 0; ; ) {
+    const size_t line_end = output.find('\n', at);
+    ret += "[  DEATH   ] ";
+    if (line_end == ::std::string::npos) {
+      ret += output.substr(at);
+      break;
+    }
+    // Copy the line together with its trailing '\n'.
+    ret += output.substr(at, line_end + 1 - at);
+    at = line_end + 1;
+  }
+  return ret;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in the
+//             in the format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   regex:    A regular expression object to be applied to
+//             the test's captured standard error output; the death test
+//             fails if it does not match.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true iff all of the above conditions are met.  Otherwise, the
+// first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  // A test that never spawned a child cannot have passed.
+  if (!spawned())
+    return false;
+
+  // Everything the child wrote to stderr (capture was started by the
+  // platform-specific AssumeRole()).
+  const std::string error_message = GetCapturedStderr();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+        // Exit status is acceptable; the stderr output must also match
+        // the expected regular expression for the test to pass.
+        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
+        if (matched) {
+          success = true;
+        } else {
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << regex()->pattern() << "\n"
+                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+    default:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  // Record the diagnostic (or the bare header line, on success) for
+  // later retrieval via DeathTest::LastMessage().
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+# if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes:  Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads child's output through the pipe (outcome code and
+//    any possible error messages) from the pipe, and its stderr and then
+//    determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+// Implements a death test by spawning a new process running the same
+// executable; see the overview comment above for the pipe/event
+// handshake between parent and child.
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char* a_statement,
+                   const RE* a_regex,
+                   const char* file,
+                   int line)
+      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2,
+                                   wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1:
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  // Safe to call now: the pipe's reference count can drop to zero once
+  // the child terminates, so the read below will eventually return.
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it hasn't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(
+      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+                                             INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = {
+    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+                   0)  // Default buffer size.
+      != FALSE);
+  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+                                O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,    // Manual-reset event: stays signaled until explicitly reset.
+      FALSE,   // The initial state is non-signalled.
+      NULL));  // The event is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
+      info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
+      "=" + file_ + "|" + StreamableToString(line_) + "|" +
+      StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
+      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  GTEST_DEATH_TEST_CHECK_(
+      _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
+                                            executable_path,
+                                            _MAX_PATH));
+
+  std::string command_line =
+      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
+      internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(STARTUPINFO));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
+      executable_path,
+      const_cast<char*>(command_line.c_str()),
+      NULL,   // Returned process handle is not inheritable.
+      NULL,   // Returned thread handle is not inheritable.
+      TRUE,   // Child inherits all inheritable handles (for write_handle_).
+      0x0,    // Default creation flags.
+      NULL,   // Inherit the parent's environment.
+      UnitTest::GetInstance()->original_working_dir(),
+      &startup_info,
+      &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  // The thread handle is not needed; only the process handle is kept.
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+# else  // We are not on Windows.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface.  Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char* statement, const RE* regex);
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.  child_pid_ starts at -1 ("no child");
+// it is set via set_child_pid() once a child process exists.
+ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
+    : DeathTestImpl(a_statement, a_regex),
+      child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Read the status byte first: the read returns once the child writes
+  // a flag or closes its end of the pipe, so this does not deadlock.
+  ReadAndInterpretStatusByte();
+
+  int status_value;
+  // Reap the child and record its raw wait(2)-format exit status.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
+      ForkingDeathTest(a_statement, a_regex) { }
+  virtual TestRole AssumeRole();
+};
+
+// The AssumeRole process for a fork-and-run death test.  It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  // Warn when other threads are running: forking without exec'ing in a
+  // multi-threaded program is a known source of trouble.
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  // pipe_fd[0] is the parent's read end; pipe_fd[1] the child's write end.
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared.  We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files.  We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of event listener API must be shut
+    // down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char* a_statement, const RE* a_regex,
+                const char* file, int line) :
+      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
+  virtual TestRole AssumeRole();
+ private:
+  // Returns the base command-line arguments for the re-executed child.
+  static ::std::vector<testing::internal::string>
+  GetArgvsForDeathTestChildProcess() {
+    ::std::vector<testing::internal::string> args = GetInjectableArgvs();
+    return args;
+  }
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.  A trailing
+// NULL sentinel is maintained at all times so Argv() is always in the
+// NULL-terminated form expected by the exec family of functions.
+class Arguments {
+ public:
+  Arguments() {
+    args_.push_back(NULL);
+  }
+
+  // Frees the strings duplicated by AddArgument()/AddArguments().
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  // Appends a copy of argument just before the NULL sentinel.
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  // Appends copies of all strings in arguments, keeping the sentinel last.
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end();
+         ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  // Returns the NULL-terminated argument vector; storage stays owned here.
+  char* const* Argv() {
+    return &args_[0];
+  }
+
+ private:
+  std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+  char* const* argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#  if GTEST_OS_MAC
+// Returns the environment to pass to the child's exec call.
+inline char** GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();
+}
+#  else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char** environ;
+inline char** GetEnviron() { return environ; }
+#  endif  // GTEST_OS_MAC
+
+#  if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+  // Close the parent's read end of the status pipe; only the write end
+  // should survive across the exec.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execve() as it's a direct system call.  We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());
+  // execve() only returns on failure; report the error and bail out.
+  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " +
+                 GetLastErrnoDescription());
+  return EXIT_FAILURE;
+}
+#  endif  // !GTEST_OS_QNX
+
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// correct answer.
+void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
+// Sets *result to true iff a local in this (deeper) stack frame lives
+// at a lower address than ptr, which lives in the caller's frame.
+void StackLowerThanAddress(const void* ptr, bool* result) {
+  int dummy;
+  *result = (&dummy < ptr);
+}
+
+// Make sure AddressSanitizer does not tamper with the stack here.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+bool StackGrowsDown() {
+  int dummy;
+  bool result;
+  StackLowerThanAddress(&dummy, &result);
+  return result;
+}
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test.  The
+// implementation uses fork(2) + exec.  On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe.  On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead.  The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;
+
+#  if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    // NOTE(review): returns EXIT_FAILURE (1) as a pid_t; the caller only
+    // rejects -1, so this value would be treated as a valid child pid --
+    // confirm upstream intent.
+    return EXIT_FAILURE;
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
+                                        fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = {0};
+  // spawn is a system call.
+  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#  else   // GTEST_OS_QNX
+#   if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
+      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#   endif  // GTEST_OS_LINUX
+
+#   if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const size_t stack_size = getpagesize();
+    // MAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes:  For a downward-growing stack, this
+    // amount is subtracted from size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about.  As far as I know there is no ABI with stack alignment greater
+    // than 64.  We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void* const stack_top =
+        static_cast<char*>(stack) +
+            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
+        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    // The child execs immediately, so the parent can unmap the stack.
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#   else
+  const bool use_fork = true;
+#   endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+      ExecDeathTestChildMain(&args);
+      _exit(0);
+  }
+#  endif  // GTEST_OS_QNX
+#  if GTEST_OS_LINUX
+  // Restore the SIGPROF handler saved before the fork()/clone().
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, NULL));
+#  endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test.  It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  // A non-NULL flag means we are already inside the re-executed child.
+  if (flag != NULL) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  // Selects exactly this test in the child via --gtest_filter.
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
+      + info->test_case_name() + "." + info->name();
+  // Encodes file, line, death test index, and write fd, '|'-separated.
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|" + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index) + "|"
+      + StreamableToString(pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  // The parent keeps only the read end; the child owns the write end.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+# endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address.  If the test should be
+// skipped, sets that pointer to NULL.  Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  // Each death test within a single TEST body gets a distinct index.
+  const int death_test_index = impl->current_test_info()
+      ->increment_death_test_count();
+
+  // In a death test child process, flag identifies the single death test
+  // that should actually execute; all others are skipped (NULL *test).
+  if (flag != NULL) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index)
+          + ") somehow exceeded expected maximum ("
+          + StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = NULL;
+      return true;
+    }
+  }
+
+# if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, regex, file, line);
+  }
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, regex, file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, regex);
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message(
+        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
+        + "\" encountered");
+    return false;
+  }
+
+  return true;
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.  GTEST_HAS_DEATH_TEST implies that we have
+// ::std::string, so we can use it here.
+static void SplitString(const ::std::string& str, char delimiter,
+                        ::std::vector< ::std::string>* dest) {
+  // Collect the fields into a local vector first so that *dest is only
+  // touched once, via swap, at the very end.
+  ::std::vector< ::std::string> fields;
+  ::std::string::size_type start = 0;
+  while (::testing::internal::AlwaysTrue()) {
+    const ::std::string::size_type end = str.find(delimiter, start);
+    // No further delimiter: the remainder of the string is the last field.
+    if (end == ::std::string::npos) {
+      fields.push_back(str.substr(start));
+      break;
+    }
+    fields.push_back(str.substr(start, end - start));
+    start = end + 1;
+  }
+  dest->swap(fields);
+}
+
+# if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+int GetStatusFileDescriptor(unsigned int parent_process_id,
+                            size_t write_handle_as_size_t,
+                            size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
+  // ::OpenProcess returns NULL on failure, not INVALID_HANDLE_VALUE; the
+  // previous INVALID_HANDLE_VALUE-only check let a failed open slip through
+  // to a confusing DuplicateHandle failure below, so test for both values.
+  if (parent_process_handle.Get() == NULL ||
+      parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  // TODO(vladl at google.com): Replace the following check with a
+  // compile-time assertion when available.
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The handle value received from the command line is valid only in the
+  // parent process. To obtain one accessible within this child process, we
+  // need to use DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handler.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle,
+                         0x0,
+                         FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  // Wrap the duplicated pipe handle in a CRT file descriptor so the death
+  // test status can be written with ordinary file I/O.
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+# endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  // The flag value is "file|line|index|write_fd" on POSIX systems and
+  // "file|line|index|parent_pid|write_handle|event_handle" on Windows;
+  // fields[0] is always the source file name.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;
+
+# if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &parent_process_id)
+      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+  // On Windows the inherited handles must first be duplicated and turned
+  // into a CRT file descriptor usable in this child process.
+  write_fd = GetStatusFileDescriptor(parent_process_id,
+                                     write_handle_as_size_t,
+                                     event_handle_as_size_t);
+# else
+
+  if (fields.size() != 4
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: keith.ray at gmail.com (Keith Ray)
+
+
+#include <stdlib.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>
+#elif GTEST_OS_WINDOWS
+# include <direct.h>
+# include <io.h>
+#elif GTEST_OS_SYMBIAN
+// Symbian OpenC has PATH_MAX in sys/syslimits.h
+# include <sys/syslimits.h>
+#else
+# include <limits.h>
+# include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+# define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separators, or a mixture
+// of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+const char kAlternatePathSeparatorString[] = "/";
+# if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+# else
+const char kCurrentDirectoryString[] = ".\\";
+# endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+const char kCurrentDirectoryString[] = "./";
+#endif  // GTEST_OS_WINDOWS
+
+// Returns whether the given character is a valid path separator.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+  // Platforms with an alternate separator (see kAlternatePathSeparator
+  // above) accept either character.
+  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
+#else
+  return c == kPathSeparator;
+#endif
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+  // Windows CE doesn't have a current directory, so we just return
+  // something reasonable.
+  return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+  // An empty path signals failure to the caller.
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
+#else
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  char* result = getcwd(cwd, sizeof(cwd));
+# if GTEST_OS_NACL
+  // getcwd will likely fail in NaCl due to the sandbox, so return something
+  // reasonable. The user may have provided a shim implementation for getcwd,
+  // however, so fallback only when failure is detected.
+  return FilePath(result == NULL ? kCurrentDirectoryString : cwd);
+# endif  // GTEST_OS_NACL
+  // Unreachable when GTEST_OS_NACL is defined (the branch above always
+  // returns); on every other POSIX platform "" signals getcwd failure.
+  return FilePath(result == NULL ? "" : cwd);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns a copy of the FilePath with the case-insensitive extension removed.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file"). If a case-insensitive extension is not
+// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+  // Match ".<extension>" at the end of the path, ignoring case.
+  const std::string suffix = std::string(".") + extension;
+  if (!String::EndsWithCaseInsensitive(pathname_, suffix)) {
+    return *this;  // No such extension; return the path unchanged.
+  }
+  const std::string::size_type stem_length =
+      pathname_.length() - suffix.length();
+  return FilePath(pathname_.substr(0, stem_length));
+}
+
+// Returns a pointer to the last occurence of a valid path separator in
+// the FilePath. On Windows, for example, both '/' and '\' are valid path
+// separators. Returns NULL if no path separator was found.
+const char* FilePath::FindLastPathSeparator() const {
+  // Locate the last occurrence of each separator kind independently.
+  const char* const primary = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char* const alternate = strrchr(c_str(), kAlternatePathSeparator);
+  // Relational comparison against a NULL pointer is undefined behavior,
+  // so only compare positions when both separators were actually found.
+  if (alternate != NULL && (primary == NULL || alternate > primary)) {
+    return alternate;
+  }
+#endif
+  return primary;
+}
+
+// Returns a copy of the FilePath with the directory part removed.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file"). If there is no directory part ("just_a_file"), it returns
+// the FilePath unmodified. If there is no file part ("just_a_dir/") it
+// returns an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  // Everything after the last separator is the file part; with no
+  // separator there is no directory prefix to drop.
+  const char* const sep = FindLastPathSeparator();
+  if (sep == NULL) {
+    return *this;
+  }
+  return FilePath(sep + 1);
+}
+
+// RemoveFileName returns the directory path with the filename removed.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  // Keep everything up to and including the last separator; a path with
+  // no separator at all degenerates to the current directory ("./", or
+  // ".\\" on Windows).
+  const char* const sep = FindLastPathSeparator();
+  return sep != NULL
+      ? FilePath(std::string(c_str(), sep + 1 - c_str()))
+      : FilePath(kCurrentDirectoryString);
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml". If number is greater
+// than zero (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name,
+                                int number,
+                                const char* extension) {
+  // number == 0 produces "base.ext"; any other value is spliced in as
+  // "base_<number>.ext".
+  const std::string numbered_base =
+      (number == 0)
+          ? base_name.string()
+          : base_name.string() + "_" + StreamableToString(number);
+  return ConcatPaths(directory, FilePath(numbered_base + "." + extension));
+}
+
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  // An empty directory contributes nothing -- not even a separator.
+  if (directory.IsEmpty()) {
+    return relative_path;
+  }
+  const std::string head = directory.RemoveTrailingPathSeparator().string();
+  return FilePath(head + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  // AnsiToUtf16 allocates a wide-string buffer that must be released with
+  // delete[] after the GetFileAttributes call.
+  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;
+  return attributes != kInvalidFileAttributes;
+#else
+  // A successful stat (return value 0) is taken as proof of existence,
+  // whatever kind of file-system object the path names.
+  posix::StatStruct file_stat;
+  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+  bool result = false;
+#if GTEST_OS_WINDOWS
+  // Don't strip off trailing separator if path is a root directory on
+  // Windows (like "C:\\").
+  const FilePath& path(IsRootDirectory() ? *this :
+                                           RemoveTrailingPathSeparator());
+#else
+  const FilePath& path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  // The path must both exist and carry the directory attribute.
+  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;
+  if ((attributes != kInvalidFileAttributes) &&
+      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    result = true;
+  }
+#else
+  // The path must both stat successfully and be a directory.
+  posix::StatStruct file_stat;
+  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
+      posix::IsDir(file_stat);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  return result;
+}
+
+// Returns true if pathname describes a root directory. (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  // TODO(wan at google.com): on Windows a network share like
+  // \\server\share can be a root directory, although it cannot be the
+  // current directory.  Handle this properly.
+  // Length 3 combined with the absolute-path shape matches exactly a
+  // drive root such as "C:\" (or "C:/").
+  return pathname_.length() == 3 && IsAbsolutePath();
+#else
+  // On POSIX the only root is a single path separator ("/").
+  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns true if pathname describes an absolute path.
+bool FilePath::IsAbsolutePath() const {
+  const char* const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  // Absolute means a drive spec: a letter, a colon, then a separator
+  // (e.g. "C:\" or "c:/").  Explicit ASCII ranges are used rather than
+  // isalpha(), keeping the result independent of the current locale.
+  return pathname_.length() >= 3 &&
+     ((name[0] >= 'a' && name[0] <= 'z') ||
+      (name[0] >= 'A' && name[0] <= 'Z')) &&
+     name[1] == ':' &&
+     IsPathSeparator(name[2]);
+#else
+  // On POSIX any path starting with a separator is absolute.
+  return IsPathSeparator(name[0]);
+#endif
+}
+
+// Returns a pathname for a file that does not currently exist. The pathname
+// will be directory/base_name.extension or
+// directory/base_name_<number>.extension if directory/base_name.extension
+// already exists. The number will be incremented until a pathname is found
+// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
+  // Probe "base.ext" first, then "base_1.ext", "base_2.ext", ... until a
+  // name that does not yet exist is found.  Note the inherent race: another
+  // process may create the same name after the existence check.
+  int number = 0;
+  FilePath candidate(MakeFileName(directory, base_name, number, extension));
+  while (candidate.FileOrDirectoryExists()) {
+    candidate.Set(MakeFileName(directory, base_name, ++number, extension));
+  }
+  return candidate;
+}
+
+// Returns true if FilePath ends with a path separator, which indicates that
+// it is intended to represent a directory. Returns false otherwise.
+// This does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  // A path syntactically names a directory iff it is non-empty and ends
+  // in a path separator; no file-system check is performed.
+  if (pathname_.empty()) {
+    return false;
+  }
+  return IsPathSeparator(pathname_[pathname_.length() - 1]);
+}
+
+// Create directories so that path exists. Returns true if successful or if
+// the directories already exist; returns false if unable to create directories
+// for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  // Only paths that syntactically name a directory (trailing separator)
+  // can be created by this function.
+  if (!IsDirectory()) {
+    return false;
+  }
+  if (pathname_.empty() || DirectoryExists()) {
+    return true;
+  }
+  // Create the ancestors first, then this directory itself.
+  const FilePath parent(RemoveTrailingPathSeparator().RemoveFileName());
+  if (!parent.CreateDirectoriesRecursively()) {
+    return false;
+  }
+  return CreateFolder();
+}
+
+// Create the directory so that path exists. Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  // The wide-string buffer from AnsiToUtf16 must be freed with delete[].
+  FilePath removed_sep(this->RemoveTrailingPathSeparator());
+  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+  int result = CreateDirectory(unicode, NULL) ? 0 : -1;
+  delete [] unicode;
+#elif GTEST_OS_WINDOWS
+  int result = _mkdir(pathname_.c_str());
+#else
+  // Mode 0777 is requested; the effective permissions are further
+  // restricted by the process umask (standard mkdir semantics).
+  int result = mkdir(pathname_.c_str(), 0777);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  if (result == -1) {
+    return this->DirectoryExists();  // An error is OK if the directory exists.
+  }
+  return true;  // No error.
+}
+
+// If input name has a trailing separator character, remove it and return the
+// name, otherwise return the name string unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  // Strip exactly one trailing separator, if present; otherwise return
+  // the path unchanged.
+  if (!IsDirectory()) {
+    return *this;
+  }
+  return FilePath(pathname_.substr(0, pathname_.length() - 1));
+}
+
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+// TODO(wan at google.com): handle Windows network shares (e.g. \\server\share).
+void FilePath::Normalize() {
+  // Collapses runs of separators ("bar///foo" -> "bar/foo") and, when the
+  // platform has an alternate separator, canonicalizes it to kPathSeparator.
+  // Does not resolve "." or ".." components.
+  //
+  // std::string::c_str() can never return NULL, so the original NULL guard
+  // was dead code; building the result in a std::string also removes the
+  // manual new[]/delete[] buffer management of the original.
+  std::string normalized;
+  normalized.reserve(pathname_.length());
+  const char* src = pathname_.c_str();
+  while (*src != '\0') {
+    if (IsPathSeparator(*src)) {
+      // Emit one canonical separator and skip the rest of the run.  When
+      // there is no alternate separator, *src can only be kPathSeparator
+      // here, so this is identical to copying the character.
+      normalized += kPathSeparator;
+      while (IsPathSeparator(*src))
+        ++src;
+    } else {
+      normalized += *src;
+      ++src;
+    }
+  }
+  pathname_.swap(normalized);
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>
+# include <io.h>
+# include <sys/stat.h>
+# include <map>  // Used in ThreadLocal.
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <fcntl.h>
+# include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_MAC
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    // Any other status means the count could not be determined.
+    return 0;
+  }
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  // QNX exposes per-process information through devctl() on /proc/self/as.
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info process_info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL);
+  // The descriptor is no longer needed once process_info has been filled in.
+  close(fd);
+  if (status == EOK) {
+    return static_cast<size_t>(process_info.num_threads);
+  } else {
+    return 0;
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so we just
+  // return 0 to indicate that we cannot detect it.
+  return 0;
+}
+
+#endif  // GTEST_OS_MAC
+
+#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+void SleepMilliseconds(int n) {
+  // Thin wrapper over the Win32 Sleep() API; n is in milliseconds.
+  ::Sleep(n);
+}
+
+// AutoHandle owns a Windows HANDLE and closes it (when closeable) on
+// destruction or when Reset() replaces it with a different handle.
+AutoHandle::AutoHandle()
+    : handle_(INVALID_HANDLE_VALUE) {}
+
+AutoHandle::AutoHandle(Handle handle)
+    : handle_(handle) {}
+
+AutoHandle::~AutoHandle() {
+  // Closes the owned handle, if it is closeable.
+  Reset();
+}
+
+AutoHandle::Handle AutoHandle::Get() const {
+  return handle_;
+}
+
+void AutoHandle::Reset() {
+  Reset(INVALID_HANDLE_VALUE);
+}
+
+void AutoHandle::Reset(HANDLE handle) {
+  // Resetting with the same handle we already own is invalid.
+  if (handle_ != handle) {
+    if (IsCloseable()) {
+      ::CloseHandle(handle_);
+    }
+    handle_ = handle;
+  } else {
+    GTEST_CHECK_(!IsCloseable())
+        << "Resetting a valid handle to itself is likely a programmer error "
+            "and thus not allowed.";
+  }
+}
+
+bool AutoHandle::IsCloseable() const {
+  // Different Windows APIs may use either of these values to represent an
+  // invalid handle.
+  return handle_ != NULL && handle_ != INVALID_HANDLE_VALUE;
+}
+
+// Notification wraps a manual-reset Win32 event: Notify() sets it and
+// WaitForNotification() blocks until it has been set.
+Notification::Notification()
+    : event_(::CreateEvent(NULL,   // Default security attributes.
+                           TRUE,   // Do not reset automatically.
+                           FALSE,  // Initially unset.
+                           NULL)) {  // Anonymous event.
+  GTEST_CHECK_(event_.Get() != NULL);
+}
+
+void Notification::Notify() {
+  GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
+}
+
+void Notification::WaitForNotification() {
+  // WAIT_OBJECT_0 is the only acceptable outcome; a timeout cannot occur
+  // because INFINITE is passed.
+  GTEST_CHECK_(
+      ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
+}
+
+// A dynamic Mutex allocates and initializes its CRITICAL_SECTION eagerly
+// here in the constructor; static mutexes are initialized lazily instead
+// (see Mutex::ThreadSafeLazyInit below).
+Mutex::Mutex()
+    : type_(kDynamic),
+      owner_thread_id_(0),
+      critical_section_init_phase_(0),
+      critical_section_(new CRITICAL_SECTION) {
+  ::InitializeCriticalSection(critical_section_);
+}
+
+Mutex::~Mutex() {
+  // Static mutexes are leaked intentionally. It is not thread-safe to try
+  // to clean them up.
+  // TODO(yukawa): Switch to Slim Reader/Writer (SRW) Locks, which requires
+  // nothing to clean it up but is available only on Vista and later.
+  // http://msdn.microsoft.com/en-us/library/windows/desktop/aa904937.aspx
+  if (type_ == kDynamic) {
+    ::DeleteCriticalSection(critical_section_);
+    delete critical_section_;
+    critical_section_ = NULL;
+  }
+}
+
+void Mutex::Lock() {
+  ThreadSafeLazyInit();
+  ::EnterCriticalSection(critical_section_);
+  // Record the owner so that AssertHeld() can verify it later.
+  owner_thread_id_ = ::GetCurrentThreadId();
+}
+
+void Mutex::Unlock() {
+  ThreadSafeLazyInit();
+  // We don't protect writing to owner_thread_id_ here, as it's the
+  // caller's responsibility to ensure that the current thread holds the
+  // mutex when this is called.
+  owner_thread_id_ = 0;
+  ::LeaveCriticalSection(critical_section_);
+}
+
+// Does nothing if the current thread holds the mutex. Otherwise, crashes
+// with high probability.
+void Mutex::AssertHeld() {
+  ThreadSafeLazyInit();
+  GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId())
+      << "The current thread is not holding the mutex @" << this;
+}
+
+// Initializes owner_thread_id_ and critical_section_ in static mutexes.
+void Mutex::ThreadSafeLazyInit() {
+  // Dynamic mutexes are initialized in the constructor.
+  // critical_section_init_phase_ encodes the protocol state:
+  //   0 = uninitialized, 1 = initialization in progress, 2 = initialized.
+  if (type_ == kStatic) {
+    switch (
+        ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) {
+      case 0:
+        // If critical_section_init_phase_ was 0 before the exchange, we
+        // are the first to test it and need to perform the initialization.
+        owner_thread_id_ = 0;
+        critical_section_ = new CRITICAL_SECTION;
+        ::InitializeCriticalSection(critical_section_);
+        // Updates the critical_section_init_phase_ to 2 to signal
+        // initialization complete.
+        GTEST_CHECK_(::InterlockedCompareExchange(
+                          &critical_section_init_phase_, 2L, 1L) ==
+                      1L);
+        break;
+      case 1:
+        // Somebody else is already initializing the mutex; spin until they
+        // are done.
+        while (::InterlockedCompareExchange(&critical_section_init_phase_,
+                                            2L,
+                                            2L) != 2L) {
+          // Possibly yields the rest of the thread's time slice to other
+          // threads.
+          ::Sleep(0);
+        }
+        break;
+
+      case 2:
+        break;  // The mutex is already initialized and ready for use.
+
+      default:
+        GTEST_CHECK_(false)
+            << "Unexpected value of critical_section_init_phase_ "
+            << "while initializing a static mutex.";
+    }
+  }
+}
+
+namespace {
+
+// Helper for creating a Win32 thread that waits for a start notification
+// (when one is supplied) before running the given Runnable.
+class ThreadWithParamSupport : public ThreadWithParamBase {
+ public:
+  static HANDLE CreateThread(Runnable* runnable,
+                             Notification* thread_can_start) {
+    ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
+    DWORD thread_id;
+    // TODO(yukawa): Consider to use _beginthreadex instead.
+    HANDLE thread_handle = ::CreateThread(
+        NULL,    // Default security.
+        0,       // Default stack size.
+        &ThreadWithParamSupport::ThreadMain,
+        param,   // Parameter to ThreadMainStatic
+        0x0,     // Default creation flags.
+        &thread_id);  // Need a valid pointer for the call to work under Win98.
+    GTEST_CHECK_(thread_handle != NULL) << "CreateThread failed with error "
+                                        << ::GetLastError() << ".";
+    // NOTE(review): GTEST_CHECK_ above appears to abort on failure, which
+    // would make this cleanup branch unreachable -- confirm before relying
+    // on it.
+    if (thread_handle == NULL) {
+      delete param;
+    }
+    return thread_handle;
+  }
+
+ private:
+  struct ThreadMainParam {
+    ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
+        : runnable_(runnable),
+          thread_can_start_(thread_can_start) {
+    }
+    // Owned; deleted when the scoped_ptr in ThreadMain goes out of scope.
+    scoped_ptr<Runnable> runnable_;
+    // Does not own.
+    Notification* thread_can_start_;
+  };
+
+  static DWORD WINAPI ThreadMain(void* ptr) {
+    // Transfers ownership.
+    scoped_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
+    if (param->thread_can_start_ != NULL)
+      param->thread_can_start_->WaitForNotification();
+    param->runnable_->Run();
+    return 0;
+  }
+
+  // Prohibit instantiation.
+  ThreadWithParamSupport();
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+};
+
+}  // namespace
+
+// Starts the new thread immediately; the thread body may still block on
+// thread_can_start before the Runnable executes.
+ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+                                         Notification* thread_can_start)
+      : thread_(ThreadWithParamSupport::CreateThread(runnable,
+                                                     thread_can_start)) {
+}
+
+// Blocks until the underlying thread terminates.
+ThreadWithParamBase::~ThreadWithParamBase() {
+  Join();
+}
+
+// Waits (without a timeout) for the thread handle to become signalled,
+// i.e. for the thread to exit.
+void ThreadWithParamBase::Join() {
+  GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
+      << "Failed to join the thread with error " << ::GetLastError() << ".";
+}
+
+// Maps a thread to a set of ThreadIdToThreadLocals that have values
+// instantiated on that thread and notifies them when the thread exits.  A
+// ThreadLocal instance is expected to persist until all threads it has
+// values on have terminated.
+class ThreadLocalRegistryImpl {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance) {
+    DWORD current_thread = ::GetCurrentThreadId();
+    MutexLock lock(&mutex_);
+    ThreadIdToThreadLocals* const thread_to_thread_locals =
+        GetThreadLocalsMapLocked();
+    ThreadIdToThreadLocals::iterator thread_local_pos =
+        thread_to_thread_locals->find(current_thread);
+    if (thread_local_pos == thread_to_thread_locals->end()) {
+      // First ThreadLocal value on this thread: start a watcher thread so
+      // we are notified when this thread exits.
+      thread_local_pos = thread_to_thread_locals->insert(
+          std::make_pair(current_thread, ThreadLocalValues())).first;
+      StartWatcherThreadFor(current_thread);
+    }
+    ThreadLocalValues& thread_local_values = thread_local_pos->second;
+    ThreadLocalValues::iterator value_pos =
+        thread_local_values.find(thread_local_instance);
+    if (value_pos == thread_local_values.end()) {
+      // Lazily create this thread's value for the ThreadLocal instance.
+      value_pos =
+          thread_local_values
+              .insert(std::make_pair(
+                  thread_local_instance,
+                  linked_ptr<ThreadLocalValueHolderBase>(
+                      thread_local_instance->NewValueForCurrentThread())))
+              .first;
+    }
+    return value_pos->second.get();
+  }
+
+  // Called when a ThreadLocal instance is destroyed: removes its value
+  // from every thread that holds one.
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance) {
+    std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadLocalValues data structure while holding the lock, but
+    // defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      for (ThreadIdToThreadLocals::iterator it =
+          thread_to_thread_locals->begin();
+          it != thread_to_thread_locals->end();
+          ++it) {
+        ThreadLocalValues& thread_local_values = it->second;
+        ThreadLocalValues::iterator value_pos =
+            thread_local_values.find(thread_local_instance);
+        if (value_pos != thread_local_values.end()) {
+          value_holders.push_back(value_pos->second);
+          thread_local_values.erase(value_pos);
+          // This 'if' can only be successful at most once, so theoretically we
+          // could break out of the loop here, but we don't bother doing so.
+        }
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+  // Called by the watcher thread when thread_id exits: destroys every
+  // ThreadLocal value instantiated on that thread.
+  static void OnThreadExit(DWORD thread_id) {
+    GTEST_CHECK_(thread_id != 0) << ::GetLastError();
+    std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadIdToThreadLocals data structure while holding the
+    // lock, but defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      ThreadIdToThreadLocals::iterator thread_local_pos =
+          thread_to_thread_locals->find(thread_id);
+      if (thread_local_pos != thread_to_thread_locals->end()) {
+        ThreadLocalValues& thread_local_values = thread_local_pos->second;
+        for (ThreadLocalValues::iterator value_pos =
+            thread_local_values.begin();
+            value_pos != thread_local_values.end();
+            ++value_pos) {
+          value_holders.push_back(value_pos->second);
+        }
+        thread_to_thread_locals->erase(thread_local_pos);
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+ private:
+  // In a particular thread, maps a ThreadLocal object to its value.
+  typedef std::map<const ThreadLocalBase*,
+                   linked_ptr<ThreadLocalValueHolderBase> > ThreadLocalValues;
+  // Maps a thread's ID to the ThreadLocalValues instantiated on that
+  // thread.
+  typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
+
+  // Holds the thread id and thread handle that we pass from
+  // StartWatcherThreadFor to WatcherThreadFunc.
+  typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
+
+  static void StartWatcherThreadFor(DWORD thread_id) {
+    // The returned handle will be kept in thread_map and closed by
+    // watcher_thread in WatcherThreadFunc.
+    HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
+                                 FALSE,
+                                 thread_id);
+    GTEST_CHECK_(thread != NULL);
+    // We need to pass a valid thread ID pointer into CreateThread for it
+    // to work correctly under Win98.
+    DWORD watcher_thread_id;
+    HANDLE watcher_thread = ::CreateThread(
+        NULL,   // Default security.
+        0,      // Default stack size
+        &ThreadLocalRegistryImpl::WatcherThreadFunc,
+        reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
+        CREATE_SUSPENDED,
+        &watcher_thread_id);
+    GTEST_CHECK_(watcher_thread != NULL);
+    // Give the watcher thread the same priority as ours to avoid being
+    // blocked by it.
+    ::SetThreadPriority(watcher_thread,
+                        ::GetThreadPriority(::GetCurrentThread()));
+    ::ResumeThread(watcher_thread);
+    ::CloseHandle(watcher_thread);
+  }
+
+  // Monitors exit from a given thread and notifies those
+  // ThreadIdToThreadLocals about thread termination.
+  static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
+    const ThreadIdAndHandle* tah =
+        reinterpret_cast<const ThreadIdAndHandle*>(param);
+    GTEST_CHECK_(
+        ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    OnThreadExit(tah->first);
+    ::CloseHandle(tah->second);
+    delete tah;
+    return 0;
+  }
+
+  // Returns map of thread local instances.
+  static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
+    mutex_.AssertHeld();
+    static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals;
+    return map;
+  }
+
+  // Protects access to GetThreadLocalsMapLocked() and its return value.
+  static Mutex mutex_;
+  // NOTE(review): no GetThreadMapLocked() exists in this file and
+  // thread_map_mutex_ is not referenced by any code visible here;
+  // possibly vestigial.
+  static Mutex thread_map_mutex_;
+};
+
+// Definitions of the registry's static mutexes.  kStaticMutex selects the
+// static-initialization-safe construction mode (see Mutex in gtest-port.h).
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+
+// Public entry point: delegates to ThreadLocalRegistryImpl.
+ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance) {
+  return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
+      thread_local_instance);
+}
+
+// Public entry point: delegates to ThreadLocalRegistryImpl.
+void ThreadLocalRegistry::OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance) {
+  ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
+}
+
+#endif  // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+// Releases the compiled regexes (when compilation succeeded) and the
+// duplicated pattern string.
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined. Since the regex's are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+// Always false for an RE that failed to compile.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  // Only the success/failure result of regexec is used; the match
+  // offsets written into 'match' are ignored.
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).  Always false for an RE that failed to compile.
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  // Only the success/failure result of regexec is used; the match
+  // offsets written into 'match' are ignored.
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.  Compiles both an
+// anchored ("^(regex)$") and an unanchored form; a compile failure is
+// reported as a non-fatal test failure and leaves is_valid_ false.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: "^(" + regex + ")$" plus the terminating '\0' needs
+  // strlen(regex) + 6 bytes; +10 leaves headroom.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false.  Otherwise partial_regex_ may
+  // not be properly initialized and may cause trouble when it's
+  // freed.
+  //
+  // Some implementation of POSIX regex (e.g. on at least some
+  // versions of Cygwin) doesn't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true iff ch appears anywhere in str (excluding the
+// terminating '\0' character).  The ch != '\0' guard prevents strchr
+// from matching str's own terminator.
+bool IsInSet(char ch, const char* str) {
+  return ch != '\0' && strchr(str, ch) != NULL;
+}
+
+// Returns true iff ch belongs to the given classification.  Unlike
+// similar functions in <ctype.h>, these aren't affected by the
+// current locale: each one tests an explicit, fixed ASCII set.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+// Repetition meta characters recognized by the simple-RE engine.
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+      ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true iff "\\c" is a supported escape sequence: either an
+// escaped punctuation character or one of the character-class letters
+// handled by AtomMatchesChar().
+bool IsValidEscape(char c) {
+  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true iff the given atom (specified by escaped and pattern)
+// matches ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (escaped) {  // "\\p" where p is pattern_char.
+    switch (pattern_char) {
+      case 'd': return IsAsciiDigit(ch);
+      case 'D': return !IsAsciiDigit(ch);
+      case 'f': return ch == '\f';
+      case 'n': return ch == '\n';
+      case 'r': return ch == '\r';
+      case 's': return IsAsciiWhiteSpace(ch);
+      case 'S': return !IsAsciiWhiteSpace(ch);
+      case 't': return ch == '\t';
+      case 'v': return ch == '\v';
+      case 'w': return IsAsciiWordChar(ch);
+      case 'W': return !IsAsciiWordChar(ch);
+    }
+    // An escaped punctuation character matches itself literally.
+    return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+
+  // Unescaped: '.' matches anything except newline; everything else
+  // matches itself.
+  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+// The returned string ends with ": " so the caller can append details.
+std::string FormatRegexSyntaxError(const char* regex, int index) {
+  return (Message() << "Syntax error at index " << index
+          << " in simple regular expression \"" << regex << "\": ").GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.  Reports every error found rather than
+// stopping at the first (except for NULL input and a trailing '\\').
+bool ValidateRegex(const char* regex) {
+  if (regex == NULL) {
+    // TODO(wan at google.com): fix the source file location in the
+    // assertion failures to match where the regex is used in user
+    // code.
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool is_valid = true;
+
+  // True iff ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      // An escape sequence (even an invalid one) counts as a
+      // repeatable atom.
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      // Anchors and repetition characters are not themselves repeatable.
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char c, char repeat, const char* regex,
+    const char* str) {
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  const size_t max_count = (repeat == '?') ? 1 :
+      static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+
+  // i is the number of characters the atom has consumed so far.
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+      return false;
+  }
+  return false;
+}
+
+// Returns true iff regex matches a prefix of str.  regex must be a
+// valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$')
+    return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped)
+    ++regex;
+  // After skipping a backslash, regex[0] is the atom character and
+  // regex[1] is the candidate repetition character.
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(
+        escaped, regex[0], regex[1], regex + 2, str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition.  We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+        MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true iff regex matches any substring of str.  regex must be
+// a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.  In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's much faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == NULL || str == NULL)
+    return false;
+
+  // A leading '^' anchors the match to the start of str.
+  if (*regex == '^')
+    return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str.
+  do {
+    if (MatchRegexAtHead(regex, str))
+      return true;
+  } while (*str++ != '\0');
+  return false;
+}
+
+// Implements the RE class.
+
+// Frees the duplicated pattern strings; either may be NULL (free(NULL)
+// is a no-op).
+RE::~RE() {
+  free(const_cast<char*>(pattern_));
+  free(const_cast<char*>(full_pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+// full_pattern_ is anchored with '^' and '$' (see Init), so matching it
+// anywhere is equivalent to a full match.
+bool RE::FullMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).  Uses the unanchored pattern_ as given.
+bool RE::PartialMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.  On an invalid
+// regex (including NULL), is_valid_ is false and full_pattern_ stays
+// NULL; ValidateRegex() has already reported the failure.
+void RE::Init(const char* regex) {
+  pattern_ = full_pattern_ = NULL;
+  if (regex != NULL) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (!is_valid_) {
+    // No need to calculate the full pattern when the regex is invalid.
+    return;
+  }
+
+  const size_t len = strlen(regex);
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: we need space to prepend a '^', append a '$', and
+  // terminate the string with '\0'.
+  char* buffer = static_cast<char*>(malloc(len + 3));
+  full_pattern_ = buffer;
+
+  if (*regex != '^')
+    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(buffer, regex, len);
+  buffer += len;
+
+  if (len == 0 || regex[len - 1] != '$')
+    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
+
+  *buffer = '\0';
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+// Placeholder used when the file name is unavailable.
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code:
+// "file(line):" for MSVC, "file:line:" otherwise.  A negative line
+// number yields just "file:".
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  const std::string file_name(file == NULL ? kUnknownFile : file);
+
+  if (line < 0) {
+    return file_name + ":";
+  }
+#ifdef _MSC_VER
+  return file_name + "(" + StreamableToString(line) + "):";
+#else
+  return file_name + ":" + StreamableToString(line) + ":";
+#endif  // _MSC_VER
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+// A negative line number yields just the file name.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+    const char* file, int line) {
+  const std::string file_name(file == NULL ? kUnknownFile : file);
+
+  if (line < 0)
+    return file_name;
+  else
+    return file_name + ":" + StreamableToString(line);
+}
+
+
+// Writes a severity marker and the file:line prefix to the log stream;
+// the caller then streams the message body into this object.
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  const char* const marker =
+      severity == GTEST_INFO ?    "[  INFO ]" :
+      severity == GTEST_WARNING ? "[WARNING]" :
+      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl << marker << " "
+              << FormatFileLocation(file, line).c_str() << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+// The flush ensures the fatal message reaches stderr before the abort.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ == GTEST_FATAL) {
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr): the ctor
+// redirects the given file descriptor into a temporary file, and
+// GetCapturedString() restores the descriptor and returns what was
+// written while the capture was active.
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+# if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path,
+                                            "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+                                    << temp_file_path;
+    filename_ = temp_file_path;
+# else
+    // There's no guarantee that a test has write access to the current
+    // directory, so we create the temporary file in the /tmp directory
+    // instead. We use /tmp on most systems, and /sdcard on Android.
+    // That's because Android doesn't have /tmp.
+#  if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to call the framework's
+    // Context.getExternalStorageDirectory() method through JNI to get
+    // the location of the world-writable SD Card directory. However,
+    // this requires a Context handle, which cannot be retrieved
+    // globally from native code. Doing so also precludes running the
+    // code as part of a regular standalone executable, which doesn't
+    // run in a Dalvik process (e.g. when running it through 'adb shell').
+    //
+    // The location /sdcard is directly accessible from native code
+    // and is the only location (unofficially) supported by the Android
+    // team. It's generally a symlink to the real SD Card mount point
+    // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
+    // other OEM-customized locations. Never rely on these, and always
+    // use /sdcard.
+    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
+#  else
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+#  endif  // GTEST_OS_LINUX_ANDROID
+    const int captured_fd = mkstemp(name_template);
+    // NOTE(review): mkstemp's return value is not checked here; a -1
+    // would make the dup2 below fail silently.
+    filename_ = name_template;
+# endif  // GTEST_OS_WINDOWS
+    // Flush all streams before swapping the descriptor so buffered
+    // output written earlier is not captured by accident.
+    fflush(NULL);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  // Removes the temporary file backing the capture.
+  ~CapturedStream() {
+    remove(filename_.c_str());
+  }
+
+  // On first call, restores the original descriptor; returns everything
+  // written to the stream while the capture was active.
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(NULL);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  // Reads the entire content of a file as an std::string.
+  static std::string ReadEntireFile(FILE* file);
+
+  // Returns the size (in bytes) of a file.
+  static size_t GetFileSize(FILE* file);
+
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;  // dup of the original fd; -1 once restored.
+  // Name of the temporary file holding the captured output.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+// Returns the size (in bytes) of a file, by seeking to its end.
+// NOTE(review): fseek/ftell errors are not checked; ftell returns -1 on
+// failure, which would wrap to a huge size_t here.
+size_t CapturedStream::GetFileSize(FILE* file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));
+}
+
+// Reads the entire content of a file as a string.  The file position is
+// rewound to the start first, and at most GetFileSize(file) bytes are
+// read; a short read (bytes_read < file_size) is tolerated.
+std::string CapturedStream::ReadEntireFile(FILE* file) {
+  const size_t file_size = GetFileSize(file);
+  char* const buffer = new char[file_size];
+
+  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
+  size_t bytes_read = 0;       // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  do {
+    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+    bytes_read += bytes_last_read;
+  } while (bytes_last_read > 0 && bytes_read < file_size);
+
+  const std::string content(buffer, bytes_read);
+  delete[] buffer;
+
+  return content;
+}
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+// Module-level capture singletons; at most one capture per stream at a
+// time.  NULL when no capture is in progress.
+static CapturedStream* g_captured_stderr = NULL;
+static CapturedStream* g_captured_stdout = NULL;
+
+// Starts capturing an output stream (stdout/stderr).  Aborts with a
+// fatal log if a capture for the same stream is already active.
+void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
+  if (*stream != NULL) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+// Deletes the CapturedStream and resets the singleton pointer so a new
+// capture can be started afterwards.
+std::string GetCapturedStream(CapturedStream** captured_stream) {
+  const std::string content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = NULL;
+
+  return content;
+}
+
+// Thin wrappers binding CaptureStream()/GetCapturedStream() to the
+// stdout/stderr singletons.
+
+// Starts capturing stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+#if GTEST_HAS_DEATH_TEST
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+::std::vector<testing::internal::string> g_argvs;
+
+static const ::std::vector<testing::internal::string>* g_injected_test_argvs =
+                                        NULL;  // Owned.
+
+// Replaces the injected argv vector, taking ownership of argvs and
+// deleting the previously injected vector (unless it is the same one).
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) {
+  if (g_injected_test_argvs != argvs)
+    delete g_injected_test_argvs;
+  g_injected_test_argvs = argvs;
+}
+
+// Returns the injected argv vector if one was set, otherwise the real
+// command line arguments recorded in g_argvs.
+const ::std::vector<testing::internal::string>& GetInjectableArgvs() {
+  if (g_injected_test_argvs != NULL) {
+    return *g_injected_test_argvs;
+  }
+  return g_argvs;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+// Abort replacement used on Windows Mobile (guarded by
+// GTEST_OS_WINDOWS_MOBILE): break into a debugger if one is attached,
+// then terminate the process with a non-zero exit code.
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.  Built by prepending
+// GTEST_FLAG_PREFIX_ and upper-casing the result character by character.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (size_t i = 0; i != full_flag.length(); i++) {
+    env_var << ToUpper(full_flag.c_str()[i]);
+  }
+
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.  On failure, a warning naming src_text
+// is printed to stdout.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+  // Parses the environment variable as a decimal integer.
+  // (strtol accepts optional leading whitespace and sign, per the C
+  // standard.)
+  char* end = NULL;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?
+  const Int32 result = static_cast<Int32>(long_value);
+  if (long_value == LONG_MAX || long_value == LONG_MIN ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an Int32.
+      ) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true iff it's not "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == NULL ?
+      default_value : strcmp(string_value, "0") != 0;
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+// A malformed value also prints a notice (via ParseInt32) to stdout.
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == NULL) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  Int32 result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var,
+                  string_value, &result)) {
+    printf("The default value %s is used.\n",
+           (Message() << default_value).GetString().c_str());
+    fflush(stdout);
+    return default_value;
+  }
+
+  return result;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.  The returned
+// pointer aliases the environment (or default_value) and is not owned.
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value = posix::GetEnv(env_var.c_str());
+  return value == NULL ? default_value : value;
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.  A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include <ctype.h>
+#include <stdio.h>
+#include <cwchar>
+#include <ostream>  // NOLINT
+#include <string>
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints a segment of bytes in the given object.
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
+  char text[5] = "";
+  for (size_t i = 0; i != count; i++) {
+    const size_t j = start + i;
+    if (i != 0) {
+      // Organizes the bytes into groups of 2 for easy parsing by
+      // human.
+      if ((j % 2) == 0)
+        *os << ' ';
+      else
+        *os << '-';
+    }
+    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+    *os << text;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream.
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+  // If the object size is bigger than kThreshold, we'll have to omit
+  // some details by printing only the first and the last kChunkSize
+  // bytes.
+  // TODO(wan): let the user control the threshold using a flag.
+  if (count < kThreshold) {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
+  }
+  *os << ">";
+}
+
+}  // namespace
+
+namespace internal2 {
+
+// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
+// given object.  The delegation simplifies the implementation, which
+// uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+                          ostream* os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+}  // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexidecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+  kAsIs,
+  kHexEscape,
+  kSpecialEscape
+};
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+  return 0x20 <= c && c <= 0x7E;
+}
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
+    default:
+      if (IsPrintableAscii(c)) {
+        *os << static_cast<char>(c);
+        return kAsIs;
+      } else {
+        *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+        return kHexEscape;
+      }
+  }
+  return kSpecialEscape;
+}
+
+// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  switch (c) {
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo<wchar_t>(c, os);
+  }
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  return PrintAsStringLiteralTo(
+      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+}
+
+// Prints a wide or narrow character c and its code.  '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence.  The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  *os << ((sizeof(c) > 1) ? "L'" : "'");
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0)
+    return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexidecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  if (format == kHexEscape || (1 <= c && c <= 9)) {
+    // Do nothing.
+  } else {
+    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+  }
+  *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream.  CharType must be either
+// char or wchar_t.
+// The array starts at begin, the length is len, it may include '\0' characters
+// and may not be NUL-terminated.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static void PrintCharsAsStringTo(
+    const CharType* begin, size_t len, ostream* os) {
+  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
+  *os << kQuoteBegin;
+  bool is_previous_hex = false;
+  for (size_t index = 0; index < len; ++index) {
+    const CharType cur = begin[index];
+    if (is_previous_hex && IsXDigit(cur)) {
+      // Previous character is of '\x..' form and this character can be
+      // interpreted as another hexadecimal digit in its number. Break string to
+      // disambiguate.
+      *os << "\" " << kQuoteBegin;
+    }
+    is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+  }
+  *os << "\"";
+}
+
+// Prints a (const) char/wchar_t array of 'len' elements, starting at address
+// 'begin'.  CharType must be either char or wchar_t.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static void UniversalPrintCharArray(
+    const CharType* begin, size_t len, ostream* os) {
+  // The code
+  //   const char kFoo[] = "foo";
+  // generates an array of 4, not 3, elements, with the last one being '\0'.
+  //
+  // Therefore when printing a char array, we don't print the last element if
+  // it's '\0', such that the output matches the string literal as it's
+  // written in the source code.
+  if (len > 0 && begin[len - 1] == '\0') {
+    PrintCharsAsStringTo(begin, len - 1, os);
+    return;
+  }
+
+  // If, however, the last element in the array is not '\0', e.g.
+  //    const char kFoo[] = { 'f', 'o', 'o' };
+  // we must print the entire array.  We also print a message to indicate
+  // that the array is not NUL-terminated.
+  PrintCharsAsStringTo(begin, len, os);
+  *os << " (no terminating NUL)";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints the given C string to the ostream.
+void PrintTo(const char* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, strlen(s), os);
+  }
+}
+
+// MSVC compiler can be configured to define whar_t as a typedef
+// of unsigned short. Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the given wide C string to the ostream.
+void PrintTo(const wchar_t* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, std::wcslen(s), os);
+  }
+}
+#endif  // wchar_t is native
+
+// Prints a ::string object.
+#if GTEST_HAS_GLOBAL_STRING
+void PrintStringTo(const ::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+void PrintStringTo(const ::std::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+// Prints a ::wstring object.
+#if GTEST_HAS_GLOBAL_WSTRING
+void PrintWideStringTo(const ::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule at google.com (Markus Heule)
+//
+// The Google C++ Testing Framework (Google Test)
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Gets the summary of the failure message by omitting the stack trace
+// in it.
+std::string TestPartResult::ExtractSummary(const char* message) {
+  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
+  return stack_trace == NULL ? message :
+      std::string(message, stack_trace);
+}
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+  return os
+      << result.file_name() << ":" << result.line_number() << ": "
+      << (result.type() == TestPartResult::kSuccess ? "Success" :
+          result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
+          "Non-fatal failure") << ":\n"
+      << result.message() << std::endl;
+}
+
+// Appends a TestPartResult to the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+  array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based).
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+  if (index < 0 || index >= size()) {
+    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+    internal::posix::Abort();
+  }
+
+  return array_[index];
+}
+
+// Returns the number of TestPartResult objects in the array.
+int TestPartResultArray::size() const {
+  return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+    : has_new_fatal_failure_(false),
+      original_reporter_(GetUnitTestImpl()->
+                         GetTestPartResultReporterForCurrentThread()) {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+      original_reporter_);
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(
+    const TestPartResult& result) {
+  if (result.fatally_failed())
+    has_new_fatal_failure_ = true;
+  original_reporter_->ReportTestPartResult(result);
+}
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// Skips to the first non-space char in str. Returns an empty string if str
+// contains only whitespace characters.
+static const char* SkipSpaces(const char* str) {
+  while (IsSpace(*str))
+    str++;
+  return str;
+}
+
+static std::vector<std::string> SplitIntoTestNames(const char* src) {
+  std::vector<std::string> name_vec;
+  src = SkipSpaces(src);
+  for (; src != NULL; src = SkipComma(src)) {
+    name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src)));
+  }
+  return name_vec;
+}
+
+// Verifies that registered_tests match the test names in
+// defined_test_names_; returns registered_tests if successful, or
+// aborts the program otherwise.
+const char* TypedTestCasePState::VerifyRegisteredTestNames(
+    const char* file, int line, const char* registered_tests) {
+  typedef ::std::set<const char*>::const_iterator DefinedTestIter;
+  registered_ = true;
+
+  std::vector<std::string> name_vec = SplitIntoTestNames(registered_tests);
+
+  Message errors;
+
+  std::set<std::string> tests;
+  for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
+       name_it != name_vec.end(); ++name_it) {
+    const std::string& name = *name_it;
+    if (tests.count(name) != 0) {
+      errors << "Test " << name << " is listed more than once.\n";
+      continue;
+    }
+
+    bool found = false;
+    for (DefinedTestIter it = defined_test_names_.begin();
+         it != defined_test_names_.end();
+         ++it) {
+      if (name == *it) {
+        found = true;
+        break;
+      }
+    }
+
+    if (found) {
+      tests.insert(name);
+    } else {
+      errors << "No test named " << name
+             << " can be found in this test case.\n";
+    }
+  }
+
+  for (DefinedTestIter it = defined_test_names_.begin();
+       it != defined_test_names_.end();
+       ++it) {
+    if (tests.count(*it) == 0) {
+      errors << "You forgot to list test " << *it << ".\n";
+    }
+  }
+
+  const std::string& errors_str = errors.GetString();
+  if (errors_str != "") {
+    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+            errors_str.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+
+  return registered_tests;
+}
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
diff --git a/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h b/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h
new file mode 100644
index 0000000..2756b47
--- /dev/null
+++ b/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h
@@ -0,0 +1,20725 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for Google Test.  It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy at prologique.com)
+// easyUnit framework.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_H_
+
+#include <limits>
+#include <ostream>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan at google.com (Zhanyong Wan), eefacm at gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan at google.com (Zhanyong Wan)
+//
+// Low-level types and utilities for porting Google Test to various
+// platforms.  All macros ending with _ and symbols defined in an
+// internal namespace are subject to change without notice.  Code
+// outside Google Test MUST NOT USE THEM DIRECTLY.  Macros that don't
+// end with _ are part of Google Test's public API and can be used by
+// code outside Google Test.
+//
+// This file is fundamental to Google Test.  All other Google Test source
+// files are expected to #include this.  Therefore, it cannot #include
+// any other Google Test header.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// Environment-describing macros
+// -----------------------------
+//
+// Google Test can be used in many different environments.  Macros in
+// this section tell Google Test what kind of environment it is being
+// used in, such that Google Test can provide environment-specific
+// features and implementations.
+//
+// Google Test tries to automatically detect the properties of its
+// environment, so users usually don't need to worry about these
+// macros.  However, the automatic detection is not perfect.
+// Sometimes it's necessary for a user to define some of the following
+// macros in the build script to override Google Test's decisions.
+//
+// If the user doesn't define a macro in the list, Google Test will
+// provide a default definition.  After this header is #included, all
+// macros in this list will be defined to either 1 or 0.
+//
+// Notes to maintainers:
+//   - Each macro here is a user-tweakable knob; do not grow the list
+//     lightly.
+//   - Use #if to key off these macros.  Don't use #ifdef or "#if
+//     defined(...)", which will not work as these macros are ALWAYS
+//     defined.
+//
+//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
+//                              is/isn't available.
+//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
+//                              are enabled.
+//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
+//                              is/isn't available (some systems define
+//                              ::string, which is different to std::string).
+//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string
+//                              is/isn't available (some systems define
+//                              ::wstring, which is different to std::wstring).
+//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
+//                              expressions are/aren't available.
+//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
+//                              is/isn't available.
+//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
+//                              enabled.
+//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
+//                              std::wstring does/doesn't work (Google Test can
+//                              be used where std::wstring is unavailable).
+//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
+//                              is/isn't available.
+//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
+//                              compiler supports Microsoft's "Structured
+//                              Exception Handling".
+//   GTEST_HAS_STREAM_REDIRECTION
+//                            - Define it to 1/0 to indicate whether the
+//                              platform supports I/O stream redirection using
+//                              dup() and dup2().
+//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
+//                              Test's own tr1 tuple implementation should be
+//                              used.  Unused when the user sets
+//                              GTEST_HAS_TR1_TUPLE to 0.
+//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
+//                              is building in C++11/C++98 mode.
+//   GTEST_LINKED_AS_SHARED_LIBRARY
+//                            - Define to 1 when compiling tests that use
+//                              Google Test as a shared library (known as
+//                              DLL on Windows).
+//   GTEST_CREATE_SHARED_LIBRARY
+//                            - Define to 1 when compiling Google Test itself
+//                              as a shared library.
+
+// Platform-indicating macros
+// --------------------------
+//
+// Macros indicating the platform on which Google Test is being used
+// (a macro is defined to 1 if compiled on the given platform;
+// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
+// defines these macros automatically.  Code outside Google Test MUST
+// NOT define them.
+//
+//   GTEST_OS_AIX      - IBM AIX
+//   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_FREEBSD  - FreeBSD
+//   GTEST_OS_HPUX     - HP-UX
+//   GTEST_OS_LINUX    - Linux
+//     GTEST_OS_LINUX_ANDROID - Google Android
+//   GTEST_OS_MAC      - Mac OS X
+//     GTEST_OS_IOS    - iOS
+//   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_QNX      - QNX
+//   GTEST_OS_SOLARIS  - Sun Solaris
+//   GTEST_OS_SYMBIAN  - Symbian
+//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
+//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
+//     GTEST_OS_WINDOWS_MINGW    - MinGW
+//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
+//     GTEST_OS_WINDOWS_PHONE    - Windows Phone
+//     GTEST_OS_WINDOWS_RT       - Windows Store App/WinRT
+//   GTEST_OS_ZOS      - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support.  Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable.  If you notice any problems on your platform, please notify
+// googletestframework at googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// It is possible that none of the GTEST_OS_* macros are defined.
+
+// Feature-indicating macros
+// -------------------------
+//
+// Macros indicating which Google Test features are available (a macro
+// is defined to 1 if the corresponding feature is supported;
+// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
+// defines these macros automatically.  Code outside Google Test MUST
+// NOT define them.
+//
+// These macros are public so that portable tests can be written.
+// Such tests typically surround code using a feature with an #if
+// which controls that code.  For example:
+//
+// #if GTEST_HAS_DEATH_TEST
+//   EXPECT_DEATH(DoSomethingDeadly());
+// #endif
+//
+//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
+//                            tests)
+//   GTEST_HAS_DEATH_TEST   - death tests
+//   GTEST_HAS_PARAM_TEST   - value-parameterized tests
+//   GTEST_HAS_TYPED_TEST   - typed tests
+//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+//   GTEST_IS_THREADSAFE    - Google Test is thread-safe.
+//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
+//                            GTEST_HAS_POSIX_RE (see above) which users can
+//                            define themselves.
+//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
+//                            the above two are mutually exclusive.
+//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+
+// Misc public macros
+// ------------------
+//
+//   GTEST_FLAG(flag_name)  - references the variable corresponding to
+//                            the given Google Test flag.
+
+// Internal utilities
+// ------------------
+//
+// The following macros and utilities are for Google Test's INTERNAL
+// use only.  Code outside Google Test MUST NOT USE THEM DIRECTLY.
+//
+// Macros for basic C++ coding:
+//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
+//                              variable don't have to be used.
+//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
+//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
+//   GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
+//                                        suppressed (constant conditional).
+//   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
+//                                        is suppressed.
+//
+// C++11 feature wrappers:
+//
+//   testing::internal::move  - portability wrapper for std::move.
+//
+// Synchronization:
+//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
+//                            - synchronization primitives.
+//
+// Template meta programming:
+//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
+//   IteratorTraits - partial implementation of std::iterator_traits, which
+//                    is not available in libCstd when compiled with Sun C++.
+//
+// Smart pointers:
+//   scoped_ptr     - as in TR2.
+//
+// Regular expressions:
+//   RE             - a simple regular expression class using the POSIX
+//                    Extended Regular Expression syntax on UNIX-like
+//                    platforms, or a reduced regular expression syntax on
+//                    other platforms, including Windows.
+//
+// Logging:
+//   GTEST_LOG_()   - logs messages at the specified severity level.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+//   CaptureStdout()     - starts capturing stdout.
+//   GetCapturedStdout() - stops capturing stdout and returns the captured
+//                         string.
+//   CaptureStderr()     - starts capturing stderr.
+//   GetCapturedStderr() - stops capturing stderr and returns the captured
+//                         string.
+//
+// Integer types:
+//   TypeWithSize   - maps an integer to an int type.
+//   Int32, UInt32, Int64, UInt64, TimeInMillis
+//                  - integers of known sizes.
+//   BiggestInt     - the biggest signed integer type.
+//
+// Command-line utilities:
+//   GTEST_DECLARE_*()  - declares a flag.
+//   GTEST_DEFINE_*()   - defines a flag.
+//   GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+//   GetEnv()             - gets the value of an environment variable.
+//   BoolFromGTestEnv()   - parses a bool environment variable.
+//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   StringFromGTestEnv() - parses a string environment variable.
+
+#include <ctype.h>   // for isspace, etc
+#include <stddef.h>  // for ptrdiff_t
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32_WCE
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif  // !_WIN32_WCE
+
+#if defined __APPLE__
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
+#endif
+
+#include <algorithm>  // NOLINT
+#include <iostream>  // NOLINT
+#include <sstream>  // NOLINT
+#include <string>  // NOLINT
+#include <utility>
+
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
+
+// Determines the version of gcc that is used to compile this.
+#ifdef __GNUC__
+// 40302 means version 4.3.2.
+# define GTEST_GCC_VER_ \
+    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#endif  // __GNUC__
+
+// Determines the platform on which Google Test is compiled.
+#ifdef __CYGWIN__
+# define GTEST_OS_CYGWIN 1
+#elif defined __SYMBIAN32__
+# define GTEST_OS_SYMBIAN 1
+#elif defined _WIN32
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+#  define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(__MINGW__) || defined(__MINGW32__)
+#  define GTEST_OS_WINDOWS_MINGW 1
+# elif defined(WINAPI_FAMILY)
+#  include <winapifamily.h>
+#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#   define GTEST_OS_WINDOWS_DESKTOP 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#   define GTEST_OS_WINDOWS_PHONE 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#   define GTEST_OS_WINDOWS_RT 1
+#  else
+    // WINAPI_FAMILY defined but no known partition matched.
+    // Default to desktop.
+#   define GTEST_OS_WINDOWS_DESKTOP 1
+#  endif
+# else
+#  define GTEST_OS_WINDOWS_DESKTOP 1
+# endif  // _WIN32_WCE
+#elif defined __APPLE__
+# define GTEST_OS_MAC 1
+# if TARGET_OS_IPHONE
+#  define GTEST_OS_IOS 1
+# endif
+#elif defined __FreeBSD__
+# define GTEST_OS_FREEBSD 1
+#elif defined __linux__
+# define GTEST_OS_LINUX 1
+# if defined __ANDROID__
+#  define GTEST_OS_LINUX_ANDROID 1
+# endif
+#elif defined __MVS__
+# define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+# define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+# define GTEST_OS_AIX 1
+#elif defined(__hpux)
+# define GTEST_OS_HPUX 1
+#elif defined __native_client__
+# define GTEST_OS_NACL 1
+#elif defined __OpenBSD__
+# define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+# define GTEST_OS_QNX 1
+#endif  // __CYGWIN__
+
+// Macros for disabling Microsoft Visual C++ warnings.
+//
+//   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
+//   /* code that triggers warnings C4800 and C4385 */
+//   GTEST_DISABLE_MSC_WARNINGS_POP_()
+#if _MSC_VER >= 1500
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+    __pragma(warning(push))                        \
+    __pragma(warning(disable: warnings))
+# define GTEST_DISABLE_MSC_WARNINGS_POP_()          \
+    __pragma(warning(pop))
+#else
+// Older versions of MSVC don't have __pragma.
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+#ifndef GTEST_LANG_CXX11
+// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
+// -std={c,gnu}++{0x,11} is passed.  The C++11 standard specifies a
+// value for __cplusplus, and recent versions of clang, gcc, and
+// probably other compilers set that too in C++11 mode.
+# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
+// Compiling in at least C++11 mode.
+#  define GTEST_LANG_CXX11 1
+# else
+#  define GTEST_LANG_CXX11 0
+# endif
+#endif
+
+// Distinct from C++11 language support, some environments don't provide
+// proper C++11 library support. Notably, it's possible to build in
+// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++
+// with no C++11 support.
+//
+// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__
+// 20110325, but maintenance releases in the 4.4 and 4.5 series followed
+// this date, so check for those versions by their date stamps.
+// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning
+#if GTEST_LANG_CXX11 && \
+    (!defined(__GLIBCXX__) || ( \
+        __GLIBCXX__ >= 20110325ul &&  /* GCC >= 4.6.0 */ \
+        /* Blacklist of patch releases of older branches: */ \
+        __GLIBCXX__ != 20110416ul &&  /* GCC 4.4.6 */ \
+        __GLIBCXX__ != 20120313ul &&  /* GCC 4.4.7 */ \
+        __GLIBCXX__ != 20110428ul &&  /* GCC 4.5.3 */ \
+        __GLIBCXX__ != 20120702ul))   /* GCC 4.5.4 */
+# define GTEST_STDLIB_CXX11 1
+#endif
+
+// Only use C++11 library features if the library provides them.
+#if GTEST_STDLIB_CXX11
+# define GTEST_HAS_STD_BEGIN_AND_END_ 1
+# define GTEST_HAS_STD_FORWARD_LIST_ 1
+# define GTEST_HAS_STD_FUNCTION_ 1
+# define GTEST_HAS_STD_INITIALIZER_LIST_ 1
+# define GTEST_HAS_STD_MOVE_ 1
+# define GTEST_HAS_STD_UNIQUE_PTR_ 1
+#endif
+
+// C++11 specifies that <tuple> provides std::tuple.
+// Some platforms still might not have it, however.
+#if GTEST_LANG_CXX11
+# define GTEST_HAS_STD_TUPLE_ 1
+# if defined(__clang__)
+// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include
+#  if defined(__has_include) && !__has_include(<tuple>)
+#   undef GTEST_HAS_STD_TUPLE_
+#  endif
+# elif defined(_MSC_VER)
+// Inspired by boost/config/stdlib/dinkumware.hpp
+#  if defined(_CPPLIB_VER) && _CPPLIB_VER < 520
+#   undef GTEST_HAS_STD_TUPLE_
+#  endif
+# elif defined(__GLIBCXX__)
+// Inspired by boost/config/stdlib/libstdcpp3.hpp,
+// http://gcc.gnu.org/gcc-4.2/changes.html and
+// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x
+#  if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
+#   undef GTEST_HAS_STD_TUPLE_
+#  endif
+# endif
+#endif
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if GTEST_OS_WINDOWS
+# if !GTEST_OS_WINDOWS_MOBILE
+#  include <direct.h>
+#  include <io.h>
+# endif
+// In order to avoid having to include <windows.h>, use forward declaration
+// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
+// This assumption is verified by
+// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
+struct _RTL_CRITICAL_SECTION;
+#else
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+# include <unistd.h>
+# include <strings.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#  include <android/api-level.h>  // NOLINT
+#endif
+
+// Defines this to true iff Google Test can use POSIX regular expressions.
+#ifndef GTEST_HAS_POSIX_RE
+# if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else
+#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+# endif
+#endif
+
+#if GTEST_HAS_POSIX_RE
+
+// On some platforms, <regex.h> needs someone to define size_t, and
+// won't compile otherwise.  We can #include it here as we already
+// included <stdlib.h>, which is guaranteed to define size_t through
+// <stddef.h>.
+# include <regex.h>  // NOLINT
+
+# define GTEST_USES_POSIX_RE 1
+
+#elif GTEST_OS_WINDOWS
+
+// <regex.h> is not available on Windows.  Use our own simple regex
+// implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#else
+
+// <regex.h> may not be available on this platform.  Use our own
+// simple regex implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#endif  // GTEST_HAS_POSIX_RE
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#  ifndef _HAS_EXCEPTIONS
+#   define _HAS_EXCEPTIONS 1
+#  endif  // _HAS_EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__clang__)
+// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714,
+// but iff cleanups are enabled after that. In Obj-C++ files, there can be
+// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions
+// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++
+// exceptions starting at clang r206352, but which checked for cleanups prior to
+// that. To reliably check for C++ exception availability with clang, check for
+// __EXCEPTIONS && __has_feature(cxx_exceptions).
+#  define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+# elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions.  However, there is no compile-time way of
+// detecting whether they are enabled or not.  Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned off by the +noeh compiler option if desired.
+#  define GTEST_HAS_EXCEPTIONS 1
+# else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#  define GTEST_HAS_EXCEPTIONS 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#if !defined(GTEST_HAS_STD_STRING)
+// Even though we don't use this macro any longer, we keep it in case
+// some clients still depend on it.
+# define GTEST_HAS_STD_STRING 1
+#elif !GTEST_HAS_STD_STRING
+// The user told us that ::std::string isn't available.
+# error "Google Test cannot be used where ::std::string isn't available."
+#endif  // !defined(GTEST_HAS_STD_STRING)
+
+#ifndef GTEST_HAS_GLOBAL_STRING
+// The user didn't tell us whether ::string is available, so we need
+// to figure it out.
+
+# define GTEST_HAS_GLOBAL_STRING 0
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// TODO(wan at google.com): uses autoconf to detect whether ::std::wstring
+//   is available.
+
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either.  Android has
+// no support for it at least as recent as Froyo (2.2).
+# define GTEST_HAS_STD_WSTRING \
+    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+#ifndef GTEST_HAS_GLOBAL_WSTRING
+// The user didn't tell us whether ::wstring is available, so we need
+// to figure it out.
+# define GTEST_HAS_GLOBAL_WSTRING \
+    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+# ifdef _MSC_VER
+
+#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+
+#  ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if STL or toolchain bug,
+// so disable RTTI when detected.
+#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+       !defined(__EXCEPTIONS)
+#    define GTEST_HAS_RTTI 0
+#   else
+#    define GTEST_HAS_RTTI 1
+#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+# elif defined(__clang__)
+
+#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#  ifdef __RTTI_ALL__
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+# else
+
+// For all other compilers, we assume RTTI is enabled.
+#  define GTEST_HAS_RTTI 1
+
+# endif  // _MSC_VER
+
+#endif  // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+# include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we make reasonable assumptions about
+// which platforms have pthreads support.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
+    || GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL)
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+# include <pthread.h>  // NOLINT
+
+// For timespec and nanosleep, used below.
+# include <time.h>  // NOLINT
+#endif
+
+// Determines whether Google Test can use tr1/tuple.  You can define
+// this macro to 0 to prevent Google Test from using tuple (any
+// feature depending on tuple will be disabled in this mode).
+#ifndef GTEST_HAS_TR1_TUPLE
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+// STLport, provided with the Android NDK, has neither <tr1/tuple> or <tuple>.
+#  define GTEST_HAS_TR1_TUPLE 0
+# else
+// The user didn't tell us not to do it, so we assume it's OK.
+#  define GTEST_HAS_TR1_TUPLE 1
+# endif
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether Google Test's own tr1 tuple implementation
+// should be used.
+#ifndef GTEST_USE_OWN_TR1_TUPLE
+// The user didn't tell us, so we need to figure it out.
+
+// We use our own TR1 tuple if we aren't sure the user has an
+// implementation of it already.  At this time, libstdc++ 4.0.0+ and
+// MSVC 2010 are the only mainstream standard libraries that come
+// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
+// pretends to be GCC by defining __GNUC__ and friends, but cannot
+// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
+// tuple in a 323 MB Feature Pack download, which we cannot assume the
+// user has.  QNX's QCC compiler is a modified GCC but it doesn't
+// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
+// and it can be used with some compilers that define __GNUC__.
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
+      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
+#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
+# endif
+
+// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
+// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
+// can build with clang but need to use gcc4.2's libstdc++).
+# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
+#  define GTEST_ENV_HAS_STD_TUPLE_ 1
+# endif
+
+# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+#  define GTEST_USE_OWN_TR1_TUPLE 0
+# else
+#  define GTEST_USE_OWN_TR1_TUPLE 1
+# endif
+
+#endif  // GTEST_USE_OWN_TR1_TUPLE
+
+// To avoid conditional compilation everywhere, we make it
+// gtest-port.h's responsibility to #include the header implementing
+// tuple.
+#if GTEST_HAS_STD_TUPLE_
+# include <tuple>  // IWYU pragma: export
+# define GTEST_TUPLE_NAMESPACE_ ::std
+#endif  // GTEST_HAS_STD_TUPLE_
+
+// We include tr1::tuple even if std::tuple is available to define printers for
+// them.
+#if GTEST_HAS_TR1_TUPLE
+# ifndef GTEST_TUPLE_NAMESPACE_
+#  define GTEST_TUPLE_NAMESPACE_ ::std::tr1
+# endif  // GTEST_TUPLE_NAMESPACE_
+
+# if GTEST_USE_OWN_TR1_TUPLE
+// This file was GENERATED by command:
+//     pump.py gtest-tuple.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility>  // For ::std::pair.
+
+// The compiler used in Symbian has a bug that prevents us from declaring the
+// tuple template as a friend (it complains that tuple is redefined).  This
+// hack bypasses the bug by declaring the members that should otherwise be
+// private as public.
+// Sun Studio versions < 12 also have the above bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+   private:
+#endif
+
+// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
+// with our own definitions. Therefore using our own tuple does not work on
+// those compilers.
+#if defined(_MSC_VER) && _MSC_VER >= 1600  /* 1600 is Visual Studio 2010 */
+# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
+GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
+#endif
+
+// GTEST_n_TUPLE_(T) is the type of an n-tuple.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
+    void, void, void>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
+    void, void, void>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
+    void, void, void>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
+    void, void, void>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    void, void, void>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, void, void>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, void>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, T##9>
+
+// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8, typename T##9
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior.  We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+    typename T3 = void, typename T4 = void, typename T5 = void,
+    typename T6 = void, typename T7 = void, typename T8 = void,
+    typename T9 = void>
+class tuple;
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+// The primary template handles non-reference T (adding const&); the
+// T& partial specialization passes references through unchanged.
+template <typename T>
+struct ByRef { typedef const T& type; };  // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
+// is the same as tr1::add_reference<T>::type.  The T& partial
+// specialization avoids forming a reference-to-reference, which is
+// ill-formed in C++98.
+template <typename T>
+struct AddRef { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// A helper for implementing get<k>().
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>.  kIndexValid is true
+// iff k < the number of fields in tuple type T.
+//
+// Only the kIndexValid == true specializations below are defined; the
+// primary template is declared but never defined, so instantiating it
+// with an out-of-range index fails at compile time.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;
+
+// One explicit specialization per supported index (0..9); each maps
+// index k in a 10-field tuple to the tuple's k-th type parameter.
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
+  typedef T0 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
+  typedef T1 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
+  typedef T2 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
+  typedef T3 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
+  typedef T4 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
+  typedef T5 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
+  typedef T6 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
+  typedef T7 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
+  typedef T8 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
+  typedef T9 type;
+};
+
+}  // namespace gtest_internal
+
+// The empty (0-field) tuple: it carries no state, so copy
+// construction and assignment are no-ops.
+template <>
+class tuple<> {
+ public:
+  tuple() {}
+  tuple(const tuple& /* t */)  {}
+  tuple& operator=(const tuple& /* t */) { return *this; }
+};
+
+// The 1-field tuple.  GTEST_1_TUPLE_(T) expands to
+// tuple<T0, void, ..., void>, i.e. a partial specialization of the
+// 10-parameter primary template with the unused slots filled by void.
+template <GTEST_1_TYPENAMES_(T)>
+class GTEST_1_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  // Value-initializes the field.
+  tuple() : f0_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
+
+  tuple(const tuple& t) : f0_(t.f0_) {}
+
+  // Converting copy constructor from a 1-tuple with a different
+  // (convertible) element type.
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  // Shared implementation of both assignment operators.
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    return *this;
+  }
+
+  T0 f0_;
+};
+
+// The 2-field tuple.  In addition to the usual tuple operations it
+// interoperates with ::std::pair: it can be constructed from and
+// assigned from a pair with convertible member types.
+template <GTEST_2_TYPENAMES_(T)>
+class GTEST_2_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  // Value-initializes both fields.
+  tuple() : f0_(), f1_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
+      f1_(f1) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
+
+  // Converting copy constructor from a 2-tuple with different
+  // (convertible) element types.
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
+  // Conversion from ::std::pair: first -> field 0, second -> field 1.
+  template <typename U0, typename U1>
+  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+  // Assignment from ::std::pair, mirroring the pair constructor above.
+  template <typename U0, typename U1>
+  tuple& operator=(const ::std::pair<U0, U1>& p) {
+    f0_ = p.first;
+    f1_ = p.second;
+    return *this;
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  // Shared implementation of the tuple assignment operators.
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+class GTEST_3_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+class GTEST_4_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+class GTEST_5_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
+      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_) {}
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+class GTEST_6_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_) {}
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+class GTEST_7_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+class GTEST_8_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
+      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+class GTEST_9_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+class tuple {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
+      f9_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
+      f9_(t.f9_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    f9_ = t.f9_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+  T9 f9_;
+};
+
+// 6.1.3.2 Tuple creation functions.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
+// implement tie().
+
+inline tuple<> make_tuple() { return tuple<>(); }
+
+template <GTEST_1_TYPENAMES_(T)>
+inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
+  return GTEST_1_TUPLE_(T)(f0);
+}
+
+template <GTEST_2_TYPENAMES_(T)>
+inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
+  return GTEST_2_TUPLE_(T)(f0, f1);
+}
+
+template <GTEST_3_TYPENAMES_(T)>
+inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
+  return GTEST_3_TUPLE_(T)(f0, f1, f2);
+}
+
+template <GTEST_4_TYPENAMES_(T)>
+inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3) {
+  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
+}
+
+template <GTEST_5_TYPENAMES_(T)>
+inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4) {
+  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
+}
+
+template <GTEST_6_TYPENAMES_(T)>
+inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5) {
+  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
+}
+
+template <GTEST_7_TYPENAMES_(T)>
+inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
+  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
+}
+
+template <GTEST_8_TYPENAMES_(T)>
+inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
+  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
+}
+
+template <GTEST_9_TYPENAMES_(T)>
+inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8) {
+  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
+}
+
+template <GTEST_10_TYPENAMES_(T)>
+inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8, const T9& f9) {
+  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
+}
+
+// 6.1.3.3 Tuple helper classes.
+
+template <typename Tuple> struct tuple_size;
+
+template <GTEST_0_TYPENAMES_(T)>
+struct tuple_size<GTEST_0_TUPLE_(T) > {
+  static const int value = 0;
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+struct tuple_size<GTEST_1_TUPLE_(T) > {
+  static const int value = 1;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+struct tuple_size<GTEST_2_TUPLE_(T) > {
+  static const int value = 2;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+struct tuple_size<GTEST_3_TUPLE_(T) > {
+  static const int value = 3;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+struct tuple_size<GTEST_4_TUPLE_(T) > {
+  static const int value = 4;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+struct tuple_size<GTEST_5_TUPLE_(T) > {
+  static const int value = 5;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+struct tuple_size<GTEST_6_TUPLE_(T) > {
+  static const int value = 6;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+struct tuple_size<GTEST_7_TUPLE_(T) > {
+  static const int value = 7;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+struct tuple_size<GTEST_8_TUPLE_(T) > {
+  static const int value = 8;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+struct tuple_size<GTEST_9_TUPLE_(T) > {
+  static const int value = 9;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct tuple_size<GTEST_10_TUPLE_(T) > {
+  static const int value = 10;
+};
+
+template <int k, class Tuple>
+struct tuple_element {
+  typedef typename gtest_internal::TupleElement<
+      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
+};
+
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access.
+
+namespace gtest_internal {
+
+template <>
+class Get<0> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  Field(Tuple& t) { return t.f0_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  ConstField(const Tuple& t) { return t.f0_; }
+};
+
+template <>
+class Get<1> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  Field(Tuple& t) { return t.f1_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  ConstField(const Tuple& t) { return t.f1_; }
+};
+
+template <>
+class Get<2> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  Field(Tuple& t) { return t.f2_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  ConstField(const Tuple& t) { return t.f2_; }
+};
+
+template <>
+class Get<3> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  Field(Tuple& t) { return t.f3_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  ConstField(const Tuple& t) { return t.f3_; }
+};
+
+template <>
+class Get<4> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  Field(Tuple& t) { return t.f4_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  ConstField(const Tuple& t) { return t.f4_; }
+};
+
+template <>
+class Get<5> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  Field(Tuple& t) { return t.f5_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  ConstField(const Tuple& t) { return t.f5_; }
+};
+
+template <>
+class Get<6> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  Field(Tuple& t) { return t.f6_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  ConstField(const Tuple& t) { return t.f6_; }
+};
+
+template <>
+class Get<7> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  Field(Tuple& t) { return t.f7_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  ConstField(const Tuple& t) { return t.f7_; }
+};
+
+template <>
+class Get<8> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  Field(Tuple& t) { return t.f8_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  ConstField(const Tuple& t) { return t.f8_; }
+};
+
+template <>
+class Get<9> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  Field(Tuple& t) { return t.f9_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  ConstField(const Tuple& t) { return t.f9_; }
+};
+
+}  // namespace gtest_internal
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::Field(t);
+}
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
+get(const GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::ConstField(t);
+}
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
+// first k fields of t1 equals the first k fields of t2.
+// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
+// k1 != k2.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+    return true;
+  }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
+    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+  }
+};
+
+}  // namespace gtest_internal
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) {
+  return gtest_internal::SameSizeTuplePrefixComparator<
+      tuple_size<GTEST_10_TUPLE_(T) >::value,
+      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
+}
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+}  // namespace tr1
+}  // namespace std
+
+#undef GTEST_0_TUPLE_
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+# elif GTEST_ENV_HAS_STD_TUPLE_
+#  include <tuple>
+// C++11 puts its tuple into the ::std namespace rather than
+// ::std::tr1.  gtest expects tuple to live in ::std::tr1, so put it there.
+// This causes undefined behavior, but supported compilers react in
+// the way we intend.
+namespace std {
+namespace tr1 {
+using ::std::get;
+using ::std::make_tuple;
+using ::std::tuple;
+using ::std::tuple_element;
+using ::std::tuple_size;
+}
+}
+
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+#  ifdef BOOST_HAS_TR1_TUPLE
+#   undef BOOST_HAS_TR1_TUPLE
+#  endif  // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+#  include <tuple>  // IWYU pragma: export  // NOLINT
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled.  _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>.  Hence the following #define is a hack to prevent
+// <tr1/functional> from being included.
+#   define _TR1_FUNCTIONAL 1
+#   include <tr1/tuple>
+#   undef _TR1_FUNCTIONAL  // Allows the user to #include
+                        // <tr1/functional> if he chooses to.
+#  else
+#   include <tr1/tuple>  // NOLINT
+#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+# else
+// If the compiler is not GCC 4.0+, we assume the user is using a
+// spec-conforming TR1 implementation.
+#  include <tuple>  // IWYU pragma: export  // NOLINT
+# endif  // GTEST_USE_OWN_TR1_TUPLE
+
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+# if GTEST_OS_LINUX && !defined(__ia64__)
+#  if GTEST_OS_LINUX_ANDROID
+// On Android, clone() is only available on ARM starting with Gingerbread.
+#    if defined(__arm__) && __ANDROID_API__ >= 9
+#     define GTEST_HAS_CLONE 1
+#    else
+#     define GTEST_HAS_CLONE 0
+#    endif
+#  else
+#   define GTEST_HAS_CLONE 1
+#  endif
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) || \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
+     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+     GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD)
+# define GTEST_HAS_DEATH_TEST 1
+# include <vector>  // NOLINT
+#endif
+
+// We don't support MSVC 7.1 with exceptions disabled now.  Therefore
+// all the compilers we care about are adequate for supporting
+// value-parameterized tests.
+#define GTEST_HAS_PARAM_TEST 1
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled.  The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.  This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor.  Example:
+//
+//   struct Foo {
+//     Foo() { ... }
+//   } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#elif defined(__clang__)
+# if __has_attribute(unused)
+#  define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+# endif
+#endif
+#ifndef GTEST_ATTRIBUTE_UNUSED_
+# define GTEST_ATTRIBUTE_UNUSED_
+#endif
+
+// A macro to disallow operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_ASSIGN_(type)\
+  void operator=(type const &)
+
+// A macro to disallow copy constructor and operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
+  type(type const &);\
+  GTEST_DISALLOW_ASSIGN_(type)
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro.  The macro should be used on function declarations
+// following the argument list:
+//
+//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#else
+# define GTEST_MUST_USE_RESULT_
+#endif  // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
+
+// MS C++ compiler emits warning when a conditional expression is compile time
+// constant. In some contexts this warning is false positive and needs to be
+// suppressed. Use the following two macros in such cases:
+//
+// GTEST_INTENTIONAL_CONST_COND_PUSH_()
+// while (true) {
+// GTEST_INTENTIONAL_CONST_COND_POP_()
+// }
+# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+# define GTEST_INTENTIONAL_CONST_COND_POP_() \
+    GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling.  This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+#  define GTEST_HAS_SEH 1
+# else
+// Assume no SEH.
+#  define GTEST_HAS_SEH 0
+# endif
+
+#define GTEST_IS_THREADSAFE \
+    (0 \
+     || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \
+     || GTEST_HAS_PTHREAD)
+
+#endif  // GTEST_HAS_SEH
+
+#ifdef _MSC_VER
+
+# if GTEST_LINKED_AS_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllexport)
+# endif
+
+#endif  // _MSC_VER
+
+#ifndef GTEST_API_
+# define GTEST_API_
+#endif
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+# define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+# define GTEST_NO_INLINE_
+#endif
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
+# define GTEST_HAS_CXXABI_H_ 1
+#else
+# define GTEST_HAS_CXXABI_H_ 0
+#endif
+
+// A function level attribute to disable checking for use of uninitialized
+// memory when built with MemorySanitizer.
+#if defined(__clang__)
+# if __has_feature(memory_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
+       __attribute__((no_sanitize_memory))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+# endif  // __has_feature(memory_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif  // __clang__
+
+// A function level attribute to disable AddressSanitizer instrumentation.
+#if defined(__clang__)
+# if __has_feature(address_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+       __attribute__((no_sanitize_address))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+# endif  // __has_feature(address_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif  // __clang__
+
+// A function level attribute to disable ThreadSanitizer instrumentation.
+#if defined(__clang__)
+# if __has_feature(thread_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
+       __attribute__((no_sanitize_thread))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+# endif  // __has_feature(thread_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif  // __clang__
+
+namespace testing {
+
+class Message;
+
+#if defined(GTEST_TUPLE_NAMESPACE_)
+// Import tuple and friends into the ::testing namespace.
+// It is part of our interface, having them in ::testing allows us to change
+// their types as needed.
+using GTEST_TUPLE_NAMESPACE_::get;
+using GTEST_TUPLE_NAMESPACE_::make_tuple;
+using GTEST_TUPLE_NAMESPACE_::tuple;
+using GTEST_TUPLE_NAMESPACE_::tuple_size;
+using GTEST_TUPLE_NAMESPACE_::tuple_element;
+#endif  // defined(GTEST_TUPLE_NAMESPACE_)
+
+namespace internal {
+
+// A secret type that Google Test users don't know about.  It has no
+// definition on purpose.  Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
+//                         names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+#if GTEST_LANG_CXX11
+# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
+#else  // !GTEST_LANG_CXX11
+template <bool>
+  struct CompileAssert {
+};
+
+# define GTEST_COMPILE_ASSERT_(expr, msg) \
+  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+#endif  // !GTEST_LANG_CXX11
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// (In C++11, we simply use static_assert instead of the following)
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
+//                                      // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// This template is declared, but intentionally undefined.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+// Defined only for two identical type arguments, so referencing the
+// template with two different types fails to compile; that failure is
+// the assertion mechanism.
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {
+  enum { value = true };
+};
+
+// Evaluates to the number of elements in 'array'.
+#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0]))
+
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
+
+// This implementation of scoped_ptr is PARTIAL - it only contains
+// enough stuff to satisfy Google Test's need.
+template <typename T>
+class scoped_ptr {
+ public:
+  typedef T element_type;
+
+  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+  ~scoped_ptr() { reset(); }
+
+  T& operator*() const { return *ptr_; }
+  T* operator->() const { return ptr_; }
+  T* get() const { return ptr_; }
+
+  // Relinquishes ownership: returns the held pointer and clears the
+  // internal pointer without deleting the object.
+  T* release() {
+    T* const ptr = ptr_;
+    ptr_ = NULL;
+    return ptr;
+  }
+
+  // Deletes the owned object (if any) and takes ownership of p.  The
+  // self-reset guard (p != ptr_) makes reset(get()) a harmless no-op
+  // instead of deleting the still-held pointer.
+  void reset(T* p = NULL) {
+    if (p != ptr_) {
+      if (IsTrue(sizeof(T) > 0)) {  // Makes sure T is a complete type.
+        delete ptr_;
+      }
+      ptr_ = p;
+    }
+  }
+
+  // Swaps the owned pointers of two scoped_ptrs.
+  friend void swap(scoped_ptr& a, scoped_ptr& b) {
+    using std::swap;
+    swap(a.ptr_, b.ptr_);
+  }
+
+ private:
+  T* ptr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
+
+// Defines RE.
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  // Out-of-line destructor; releases whatever Init() allocated.
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true iff regular expression re matches
+  // the entire str.
+  // PartialMatch(str, re) returns true iff regular expression re
+  // matches a substring of str (including str itself).
+  //
+  // TODO(wan at google.com): make FullMatch() and PartialMatch() work
+  // when str contains NUL characters.
+  // The string overloads convert to C strings and delegate to the
+  // const char* overloads declared below.
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const ::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  void Init(const char* regex);
+
+  // We use a const char* instead of an std::string, as Google Test used to be
+  // used where std::string is not available.  TODO(wan at google.com): change to
+  // std::string.
+  // NOTE(review): pattern_ appears to be owned by this object (Init()
+  // presumably copies the input and ~RE() frees it) -- confirm in the
+  // out-of-line definitions.
+  const char* pattern_;
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+#endif
+
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity {
+  GTEST_INFO,
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  // Returns the stream all messages are written to -- always std::cerr,
+  // regardless of severity.
+  ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+  // Recorded so the destructor knows whether to abort (GTEST_FATAL).
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(NULL); }
+
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//  Synopsis:
+//    GTEST_CHECK_(boolean_condition);
+//     or
+//    GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//    This checks the condition and if the condition is not satisfied
+//    it prints message about the condition violation, including the
+//    condition itself, plus additional message streamed into it, if any,
+//    and then it aborts the program. It aborts the program irrespective of
+//    whether it is built in the debug mode or not.
+#define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
+                      << gtest_error
+
+#if GTEST_HAS_STD_MOVE_
+using std::move;
+#else  // GTEST_HAS_STD_MOVE_
+// Pre-C++11 fallback: with no move semantics available, "moving" simply
+// returns the const reference, so callers end up making a copy.
+template <typename T>
+const T& move(const T& t) {
+  return t;
+}
+#endif  // GTEST_HAS_STD_MOVE_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+// The conversion itself happens implicitly when the caller's argument is
+// bound to the To-typed parameter; the body just forwards the
+// already-converted value.
+template<typename To>
+inline To ImplicitCast_(To x) { return ::testing::internal::move(x); }
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (false) {
+  GTEST_INTENTIONAL_CONST_COND_POP_()
+    // Never executed: exists only to make the compiler verify that a To
+    // (a pointer type) converts implicitly to From*.
+    const To to = NULL;
+    ::testing::internal::ImplicitCast_<From*>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // RTTI: debug mode only!
+  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  // typeid equality requires that *base's dynamic type is EXACTLY Derived;
+  // a further-derived subclass fails this check (stricter than DownCast_).
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+  return dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  return static_cast<Derived*>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stderr capturer:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+
+#if GTEST_HAS_DEATH_TEST
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs();
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
+                             new_argvs);
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+extern ::std::vector<testing::internal::string> g_argvs;
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+# if GTEST_HAS_PTHREAD
+// Sleeps for (roughly) n milliseconds.  This function is only for testing
+// Google Test's own constructs.  Don't use it in user tests, either
+// directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  // NOTE(review): POSIX requires tv_nsec in [0, 999999999], so n >= 1000
+  // produces an invalid timespec (nanosleep fails with EINVAL and no sleep
+  // happens); n * 1000000 can also overflow a 32-bit long.  Adequate for
+  // the small delays Google Test uses -- confirm before reusing elsewhere.
+  const timespec time = {
+    0,                  // 0 seconds.
+    n * 1000L * 1000L,  // And n ms.
+  };
+  nanosleep(&time, NULL);
+}
+# endif  // GTEST_HAS_PTHREAD
+
+# if 0  // OS detection
+# elif GTEST_HAS_PTHREAD
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+  Notification() : notified_(false) {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+  }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  // Notifies all threads created with this notification to start. Must
+  // be called from the controller thread.
+  void Notify() {
+    pthread_mutex_lock(&mutex_);
+    notified_ = true;
+    pthread_mutex_unlock(&mutex_);
+  }
+
+  // Blocks until the controller thread notifies. Must be called from a test
+  // thread.
+  // Implemented by polling: the flag is re-read under the mutex every
+  // 10 ms rather than waiting on a condition variable.
+  void WaitForNotification() {
+    for (;;) {
+      pthread_mutex_lock(&mutex_);
+      const bool notified = notified_;
+      pthread_mutex_unlock(&mutex_);
+      if (notified)
+        break;
+      SleepMilliseconds(10);
+    }
+  }
+
+ private:
+  pthread_mutex_t mutex_;  // Protects notified_.
+  bool notified_;          // Set once by Notify(); never cleared.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+GTEST_API_ void SleepMilliseconds(int n);
+
+// Provides leak-safe Windows kernel handle ownership.
+// Used in death tests and in threading support.
+class GTEST_API_ AutoHandle {
+ public:
+  // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to
+  // avoid including <windows.h> in this header file. Including <windows.h> is
+  // undesirable because it defines a lot of symbols and macros that tend to
+  // conflict with client code. This assumption is verified by
+  // WindowsTypesTest.HANDLEIsVoidStar.
+  typedef void* Handle;
+  AutoHandle();
+  explicit AutoHandle(Handle handle);
+
+  // Presumably closes the owned handle if closeable (leak safety);
+  // definition is out of line.
+  ~AutoHandle();
+
+  Handle Get() const;
+  void Reset();
+  void Reset(Handle handle);
+
+ private:
+  // Returns true iff the handle is a valid handle object that can be closed.
+  bool IsCloseable() const;
+
+  Handle handle_;  // The owned kernel handle.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class GTEST_API_ Notification {
+ public:
+  Notification();
+  void Notify();
+  void WaitForNotification();
+
+ private:
+  // Owned kernel object used to signal the notification (presumably a
+  // Win32 event, given the name -- definitions are out of line).
+  AutoHandle event_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+# endif  // OS detection
+
+// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
+// defined, but we don't want to use MinGW's pthreads implementation, which
+// has conformance problems with some versions of the POSIX standard.
+# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  virtual ~ThreadWithParamBase() {}
+  // Invoked on the new thread; ThreadWithParam<T> supplies the body.
+  virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical.  Some compilers (for
+// example, SunStudio) treat them as different types.  Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  // 'thread' is really a ThreadWithParamBase*, as passed to
+  // pthread_create() by the ThreadWithParam constructor.
+  static_cast<ThreadWithParamBase*>(thread)->Run();
+  return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void UserThreadFunc(T);
+
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+  }
+  ~ThreadWithParam() { Join(); }
+
+  // Waits for the thread to finish; finished_ makes repeated calls
+  // (e.g. explicit Join() followed by the destructor's) no-ops.
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      finished_ = true;
+    }
+  }
+
+  // Runs on the new thread: optionally waits for the start signal, then
+  // invokes the user's function with the stored parameter.
+  virtual void Run() {
+    if (thread_can_start_ != NULL)
+      thread_can_start_->WaitForNotification();
+    func_(param_);
+  }
+
+ private:
+  UserThreadFunc* const func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true iff we know that the thread function has finished.
+  pthread_t thread_;  // The native thread object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+# endif  // GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+# if 0  // OS detection
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Mutex implements mutex on Windows platforms.  It is used in conjunction
+// with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the
+//                            // end of the current scope.
+//
+// A static Mutex *must* be defined or declared using one of the following
+// macros:
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// (A non-static Mutex is defined/declared in the usual way).
+class GTEST_API_ Mutex {
+ public:
+  enum MutexType { kStatic = 0, kDynamic = 1 };
+  // We rely on kStaticMutex being 0 as it is to what the linker initializes
+  // type_ in static mutexes.  critical_section_ will be initialized lazily
+  // in ThreadSafeLazyInit().
+  enum StaticConstructorSelector { kStaticMutex = 0 };
+
+  // This constructor intentionally does nothing.  It relies on type_ being
+  // statically initialized to 0 (effectively setting it to kStatic) and on
+  // ThreadSafeLazyInit() to lazily initialize the rest of the members.
+  explicit Mutex(StaticConstructorSelector /*dummy*/) {}
+
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+
+  void Unlock();
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld();
+
+ private:
+  // Initializes owner_thread_id_ and critical_section_ in static mutexes.
+  void ThreadSafeLazyInit();
+
+  // Per http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx,
+  // we assume that 0 is an invalid value for thread IDs.
+  unsigned int owner_thread_id_;
+
+  // For static mutexes, we rely on these members being initialized to zeros
+  // by the linker.
+  MutexType type_;
+  // NOTE(review): presumably advanced via interlocked operations to make
+  // ThreadSafeLazyInit() safe against concurrent first use -- confirm in
+  // the out-of-line definition.
+  long critical_section_init_phase_;  // NOLINT
+  // Stored as a pointer so <windows.h> need not be included here; the
+  // struct is only forward-referenced.
+  _RTL_CRITICAL_SECTION* critical_section_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+    extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+    ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  // RAII guard: acquires the mutex on construction...
+  explicit GTestMutexLock(Mutex* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
+
+  // ...and releases it when the guard goes out of scope.
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  Mutex* const mutex_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Base class for ValueHolder<T>.  Allows a caller to hold and delete a value
+// without knowing its type.
+class ThreadLocalValueHolderBase {
+ public:
+  // Virtual destructor lets a holder of any T be deleted via this base.
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Provides a way for a thread to send notifications to a ThreadLocal
+// regardless of its parameter type.
+class ThreadLocalBase {
+ public:
+  // Creates a new ValueHolder<T> object holding a default value passed to
+  // this ThreadLocal<T>'s constructor and returns it.  It is the caller's
+  // responsibility not to call this when the ThreadLocal<T> instance already
+  // has a value on the current thread.
+  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
+
+ protected:
+  // Protected ctor/dtor: instances exist only as bases of ThreadLocal<T>,
+  // and the protected destructor prevents outside code from deleting
+  // through a ThreadLocalBase pointer.
+  ThreadLocalBase() {}
+  virtual ~ThreadLocalBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
+};
+
+// Maps a thread to a set of ThreadLocals that have values instantiated on that
+// thread and notifies them when the thread exits.  A ThreadLocal instance is
+// expected to persist until all threads it has values on have terminated.
+// All members are static; the class merely groups the registry functions.
+class GTEST_API_ ThreadLocalRegistry {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance);
+
+  // Invoked when a ThreadLocal instance is destroyed.
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance);
+};
+
+class GTEST_API_ ThreadWithParamBase {
+ public:
+  // Blocks until the spawned thread terminates (definition out of line).
+  void Join();
+
+ protected:
+  // Type-erased body of the thread; ThreadWithParam<T>::RunnableImpl
+  // supplies the concrete Run().
+  class Runnable {
+   public:
+    virtual ~Runnable() {}
+    virtual void Run() = 0;
+  };
+
+  // Starts the thread, optionally gated on thread_can_start; presumably
+  // takes ownership of runnable -- confirm in the out-of-line definition.
+  ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+  virtual ~ThreadWithParamBase();
+
+ private:
+  AutoHandle thread_;  // Owned handle of the spawned thread.
+};
+
+// Helper class for testing Google Test's multi-threading constructs.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void UserThreadFunc(T);
+
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
+  }
+  virtual ~ThreadWithParam() {}
+
+ private:
+  // Adapts the user's function/parameter pair to the type-erased
+  // Runnable interface consumed by ThreadWithParamBase.
+  class RunnableImpl : public Runnable {
+   public:
+    RunnableImpl(UserThreadFunc* func, T param)
+        : func_(func),
+          param_(param) {
+    }
+    virtual ~RunnableImpl() {}
+    virtual void Run() {
+      func_(param_);
+    }
+
+   private:
+    UserThreadFunc* const func_;
+    const T param_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
+  };
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+
+// Implements thread-local storage on Windows systems.
+//
+//   // Thread 1
+//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
+//
+//   // Thread 2
+//   tl.set(150);  // Changes the value for thread 2 only.
+//   EXPECT_EQ(150, tl.get());
+//
+//   // Thread 1
+//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
+//   tl.set(200);
+//   EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// The users of a ThreadLocal instance have to make sure that all but one
+// threads (including the main one) using that instance have exited before
+// destroying it. Otherwise, the per-thread objects managed for them by the
+// ThreadLocal instance are not guaranteed to be destroyed on all platforms.
+//
+// Google Test only uses global ThreadLocal objects.  That means they
+// will die after main() has returned.  Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal : public ThreadLocalBase {
+ public:
+  ThreadLocal() : default_() {}
+  explicit ThreadLocal(const T& value) : default_(value) {}
+
+  // Notifies the registry that this instance is being destroyed.
+  ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+
+ private:
+  // Holds a value of T.  Can be deleted via its base class without the caller
+  // knowing the type of T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+  };
+
+
+  // Fetches this thread's holder from the registry; on first use the
+  // registry calls NewValueForCurrentThread() below to create it.
+  T* GetOrCreateValue() const {
+    return static_cast<ValueHolder*>(
+        ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+  }
+
+  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+    return new ValueHolder(default_);
+  }
+
+  const T default_;  // The default value for each thread.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# elif GTEST_HAS_PTHREAD
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms.
+class MutexBase {
+ public:
+  // Acquires this mutex.
+  void Lock() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+    // Ownership is recorded only after the lock is held, so AssertHeld()
+    // can trust owner_ whenever has_owner_ is true.
+    owner_ = pthread_self();
+    has_owner_ = true;
+  }
+
+  // Releases this mutex.
+  void Unlock() {
+    // Since the lock is being released the owner_ field should no longer be
+    // considered valid. We don't protect writing to has_owner_ here, as it's
+    // the caller's responsibility to ensure that the current thread holds the
+    // mutex when this is called.
+    has_owner_ = false;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+  }
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld() const {
+    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+        << "The current thread is not holding the mutex @" << this;
+  }
+
+  // A static mutex may be used before main() is entered.  It may even
+  // be used before the dynamic initialization stage.  Therefore we
+  // must be able to initialize a static mutex object at link time.
+  // This means MutexBase has to be a POD and its member variables
+  // have to be public.
+ public:
+  pthread_mutex_t mutex_;  // The underlying pthread mutex.
+  // has_owner_ indicates whether the owner_ field below contains a valid thread
+  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+  // accesses to the owner_ field should be protected by a check of this field.
+  // An alternative might be to memset() owner_ to all zeros, but there's no
+  // guarantee that a zero'd pthread_t is necessarily invalid or even different
+  // from pthread_self().
+  bool has_owner_;
+  pthread_t owner_;  // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+#  define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+     extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+// The initialization list here does not explicitly initialize each field,
+// instead relying on default initialization for the unspecified fields. In
+// particular, the owner_ field (a pthread_t) is not explicitly initialized.
+// This allows initialization to work whether pthread_t is a scalar or struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+#  define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+     ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false }
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+  Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    // MutexBase's fields are not value-initialized (it is a POD used for
+    // link-time-initialized static mutexes), so clear the flag explicitly.
+    has_owner_ = false;
+  }
+  ~Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  // RAII guard: acquires the mutex on construction...
+  explicit GTestMutexLock(MutexBase* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
+
+  // ...and releases it when the guard goes out of scope.
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  MutexBase* const mutex_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage.  Therefore it cannot be templatized to access
+// ThreadLocal<T>.  Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  // The virtual destructor of the base class ensures the derived
+  // ThreadLocal<T>::ValueHolder is destroyed correctly.
+  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+}
+
+// Implements thread-local storage on pthreads-based systems.
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : key_(CreateKey()),
+                  default_() {}
+  explicit ThreadLocal(const T& value) : key_(CreateKey()),
+                                         default_(value) {}
+
+  ~ThreadLocal() {
+    // Destroys the managed object for the current thread, if any.
+    DeleteThreadLocalValue(pthread_getspecific(key_));
+
+    // Releases resources associated with the key.  This will *not*
+    // delete managed objects for other threads.
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+  }
+
+  // Accessors act on the calling thread's copy of the value, which is
+  // created on first access (see GetOrCreateValue() below).
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+
+ private:
+  // Holds a value of type T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+  };
+
+  static pthread_key_t CreateKey() {
+    pthread_key_t key;
+    // When a thread exits, DeleteThreadLocalValue() will be called on
+    // the object managed for that thread.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_key_create(&key, &DeleteThreadLocalValue));
+    return key;
+  }
+
+  // Returns the calling thread's ValueHolder, lazily creating one
+  // (copy-constructed from default_) on this thread's first access.
+  T* GetOrCreateValue() const {
+    ThreadLocalValueHolderBase* const holder =
+        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+    if (holder != NULL) {
+      return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
+    }
+
+    ValueHolder* const new_holder = new ValueHolder(default_);
+    ThreadLocalValueHolderBase* const holder_base = new_holder;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+    return new_holder->pointer();
+  }
+
+  // A key pthreads uses for looking up per-thread values.
+  const pthread_key_t key_;
+  const T default_;  // The default value for each thread.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# endif  // OS detection
+
+#else  // GTEST_IS_THREADSAFE
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable).  Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {
+ public:
+  // All operations are intentional no-ops: on platforms without thread
+  // support, Google Test runs single-threaded (see the comment above).
+  Mutex() {}
+  void Lock() {}
+  void Unlock() {}
+  void AssertHeld() const {}
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  // No-op: there is nothing to acquire in the single-threaded build.
+  explicit GTestMutexLock(Mutex*) {}  // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Single-threaded fallback: one plain value stands in for per-thread
+// storage, which is equivalent when only one thread ever runs.
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : value_() {}
+  explicit ThreadLocal(const T& value) : value_(value) {}
+  T* pointer() { return &value_; }
+  const T* pointer() const { return &value_; }
+  const T& get() const { return value_; }
+  void set(const T& value) { value_ = value; }
+ private:
+  T value_;
+};
+
+#endif  // GTEST_IS_THREADSAFE
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+// Passing non-POD classes through ellipsis (...) crashes the ARM
+// compiler and generates a warning in Sun Studio.  The Nokia Symbian
+// and the IBM XL C/C++ compiler try to instantiate a copy constructor
+// for objects passed through ellipsis (...), failing for uncopyable
+// objects.  We define this to ensure that only POD is passed through
+// ellipsis on these systems.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC)
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1
+#else
+# define GTEST_CAN_COMPARE_NULL 1
+#endif
+
+// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
+// const T& and const T* in a function template.  These compilers
+// _can_ decide between class template specializations for T and T*,
+// so a tr1::type_traits-like is_pointer works.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
+# define GTEST_NEEDS_IS_POINTER_ 1
+#endif
+
+// A compile-time boolean constant, similar in spirit to
+// std::integral_constant<bool, B> (not available pre-C++11).
+template <bool bool_value>
+struct bool_constant {
+  typedef bool_constant<bool_value> type;
+  static const bool value = bool_value;
+};
+// Out-of-class definition of the static member, required by C++98/03.
+template <bool bool_value> const bool bool_constant<bool_value>::value;
+
+typedef bool_constant<false> false_type;
+typedef bool_constant<true> true_type;
+
+// is_pointer<T>::value is true iff T is a pointer type (via the
+// partial specialization below).
+template <typename T>
+struct is_pointer : public false_type {};
+
+template <typename T>
+struct is_pointer<T*> : public true_type {};
+
+// IteratorTraits supplies value_type for class-type iterators as well
+// as for raw and const raw pointers.
+template <typename Iterator>
+struct IteratorTraits {
+  typedef typename Iterator::value_type value_type;
+};
+
+template <typename T>
+struct IteratorTraits<T*> {
+  typedef T value_type;
+};
+
+template <typename T>
+struct IteratorTraits<const T*> {
+  typedef T value_type;
+};
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
+// The biggest signed integer type the compiler supports.
+typedef __int64 BiggestInt;
+#else
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
+typedef long long BiggestInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF.  char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+// Wide-char overload: only characters whose value fits in an unsigned
+// char can be hex digits (the ch == low_byte check rejects the rest).
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+// Removes trailing whitespace from the (by-value) copy of str and
+// returns it.  erase() returns the new end(), so the loop re-examines
+// the character now preceding the end on each iteration.
+inline std::string StripTrailingSpaces(std::string str) {
+  std::string::iterator it = str.end();
+  while (it != str.begin() && IsSpace(*--it))
+    it = str.erase(it);
+  return str;
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions.  These wrappers hide the differences between
+// Windows/MSVC and POSIX systems.  Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+# ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+# else  // !__BORLANDC__
+#  if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }
+#  else
+inline int IsATTY(int fd) { return _isatty(fd); }
+#  endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+# endif  // __BORLANDC__
+
+# if GTEST_OS_WINDOWS_MOBILE
+// NOTE(review): the cast suggests _fileno() does not return an int on
+// Windows CE — confirm against the CE C runtime headers.
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+# else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) {
+  return (_S_IFDIR & st.st_mode) != 0;
+}
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+#else
+
+// POSIX systems: thin pass-throughs to the standard functions.
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif  // GTEST_OS_WINDOWS
+
+// Functions deprecated by MSVC 8.0.
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */)
+
+// Like strncpy(): does not NUL-terminate dest if src is n chars or
+// longer; returns dest.
+inline const char* StrNCpy(char* dest, const char* src, size_t n) {
+  return strncpy(dest, src, n);
+}
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+  return fopen(path, mode);
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+  return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+// The casts narrow the native return type to int to keep one signature
+// across platforms.
+inline int Read(int fd, void* buf, unsigned int count) {
+  return static_cast<int>(read(fd, buf, count));
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+  return static_cast<int>(write(fd, buf, count));
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+// Returns the value of the environment variable 'name', or NULL when
+// it is unset (or, on the platforms noted below, empty).
+inline const char* GetEnv(const char* name) {
+// BUG FIX: the original used bitwise '|' instead of logical '||' between
+// GTEST_OS_WINDOWS_PHONE and GTEST_OS_WINDOWS_RT.  It happened to work
+// because both macros expand to 0 or 1, but '||' is the intended operator
+// (and is what the other GTEST_OS_* conditions in this file use).
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+  // We are on Windows CE, which has no environment variables.
+  static_cast<void>(name);  // To prevent 'unused argument' warning.
+  return NULL;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // Environment variables which we programmatically clear will be set to the
+  // empty string rather than unset (NULL).  Handle that case.
+  const char* const env = getenv(name);
+  return (env != NULL && env[0] != '\0') ? env : NULL;
+#else
+  return getenv(name);
+#endif
+}
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+void Abort();
+#else
+// Hosted platforms: delegate to the C library's abort().
+inline void Abort() { abort(); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+}  // namespace posix
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that.  We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
+// complain about _snprintf.
+# define GTEST_SNPRINTF_ _snprintf
+#else
+# define GTEST_SNPRINTF_ snprintf
+#endif
+
+// The maximum number a BiggestInt can represent.  This definition
+// works no matter BiggestInt is represented in one's complement or
+// two's complement.
+//
+// We cannot rely on numeric_limits in STL, as __int64 and long long
+// are not part of standard C++ and numeric_limits doesn't need to be
+// defined for them.
+const BiggestInt kMaxBiggestInt =
+    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
+
+// This template class serves as a compile-time function from size to
+// type.  It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+//   TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs.  Other types can be easily added in the future if need
+// arises.
+// Primary template: deliberately maps UInt to void so that using an
+// unsupported size is a compile-time error (see the comment inside).
+template <size_t size>
+class TypeWithSize {
+ public:
+  // This prevents the user from using TypeWithSize<N> with incorrect
+  // values of N.
+  typedef void UInt;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+  // unsigned int has size 4 in both gcc and MSVC.
+  //
+  // As base/basictypes.h doesn't compile on Windows, we cannot use
+  // uint32, uint64, and etc here.
+  typedef int Int;
+  typedef unsigned int UInt;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+#if GTEST_OS_WINDOWS
+  typedef __int64 Int;
+  typedef unsigned __int64 UInt;
+#else
+  typedef long long Int;  // NOLINT
+  typedef unsigned long long UInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+};
+
+// Integer types of known sizes.
+typedef TypeWithSize<4>::Int Int32;
+typedef TypeWithSize<4>::UInt UInt32;
+typedef TypeWithSize<8>::Int Int64;
+typedef TypeWithSize<8>::UInt UInt64;
+typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#define GTEST_FLAG(name) FLAGS_gtest_##name
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+#define GTEST_DECLARE_int32_(name) \
+    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+#define GTEST_DECLARE_string_(name) \
+    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+
+// Thread annotations
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+// TODO(chandlerc): Find a better way to refactor flag and environment parsing
+// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
+// function.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+
+#if GTEST_OS_LINUX
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <iomanip>
+#include <limits>
+#include <set>
+#include <string>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+//   1. You stream a bunch of values to a Message object.
+//      It will remember the text in a stringstream.
+//   2. Then you stream the Message object to an ostream.
+//      This causes the text in the Message to be streamed
+//      to the ostream.
+//
+// For example;
+//
+//   testing::Message foo;
+//   foo << 1 << " != " << 2;
+//   std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from.  In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC.  You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do).  The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+  // The type of basic IO manipulators (endl, ends, and flush) for
+  // narrow streams.
+  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+  // Constructs an empty Message.
+  Message();
+
+  // Copy constructor.
+  // Copies by streaming the source's accumulated text into a fresh
+  // stringstream buffer.
+  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
+    *ss_ << msg.GetString();
+  }
+
+  // Constructs a Message from a C-string.
+  explicit Message(const char* str) : ss_(new ::std::stringstream) {
+    *ss_ << str;
+  }
+
+#if GTEST_OS_SYMBIAN
+  // Streams a value (either a pointer or not) to this object.
+  template <typename T>
+  inline Message& operator <<(const T& value) {
+    StreamHelper(typename internal::is_pointer<T>::type(), value);
+    return *this;
+  }
+#else
+  // Streams a non-pointer value to this object.
+  template <typename T>
+  inline Message& operator <<(const T& val) {
+    // Some libraries overload << for STL containers.  These
+    // overloads are defined in the global namespace instead of ::std.
+    //
+    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+    // overloads are visible in either the std namespace or the global
+    // namespace, but not other namespaces, including the testing
+    // namespace which Google Test's Message class is in.
+    //
+    // To allow STL containers (and other types that has a << operator
+    // defined in the global namespace) to be used in Google Test
+    // assertions, testing::Message must access the custom << operator
+    // from the global namespace.  With this using declaration,
+    // overloads of << defined in the global namespace and those
+    // visible via Koenig lookup are both exposed in this function.
+    using ::operator <<;
+    *ss_ << val;
+    return *this;
+  }
+
+  // Streams a pointer value to this object.
+  //
+  // This function is an overload of the previous one.  When you
+  // stream a pointer to a Message, this definition will be used as it
+  // is more specialized.  (The C++ Standard, section
+  // [temp.func.order].)  If you stream a non-pointer, then the
+  // previous definition will be used.
+  //
+  // The reason for this overload is that streaming a NULL pointer to
+  // ostream is undefined behavior.  Depending on the compiler, you
+  // may get "0", "(nil)", "(null)", or an access violation.  To
+  // ensure consistent result across compilers, we always treat NULL
+  // as "(null)".
+  template <typename T>
+  inline Message& operator <<(T* const& pointer) {  // NOLINT
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+    return *this;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // Since the basic IO manipulators are overloaded for both narrow
+  // and wide streams, we have to provide this specialized definition
+  // of operator <<, even though its body is the same as the
+  // templatized version above.  Without this definition, streaming
+  // endl or other basic IO manipulators to Message will confuse the
+  // compiler.
+  Message& operator <<(BasicNarrowIoManip val) {
+    *ss_ << val;
+    return *this;
+  }
+
+  // Instead of 1/0, we want to see true/false for bool values.
+  Message& operator <<(bool b) {
+    return *this << (b ? "true" : "false");
+  }
+
+  // These two overloads allow streaming a wide C string to a Message
+  // using the UTF-8 encoding.
+  Message& operator <<(const wchar_t* wide_c_str);
+  Message& operator <<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::std::wstring& wstr);
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::wstring& wstr);
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+  // Gets the text streamed to this object so far as an std::string.
+  // Each '\0' character in the buffer is replaced with "\\0".
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  std::string GetString() const;
+
+ private:
+
+#if GTEST_OS_SYMBIAN
+  // These are needed as the Nokia Symbian Compiler cannot decide between
+  // const T& and const T* in a function template. The Nokia compiler _can_
+  // decide between class template specializations for T and T*, so a
+  // tr1::type_traits-like is_pointer works, and we can overload on that.
+  template <typename T>
+  inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+  }
+  template <typename T>
+  inline void StreamHelper(internal::false_type /*is_pointer*/,
+                           const T& value) {
+    // See the comments in Message& operator <<(const T&) above for why
+    // we need this using statement.
+    using ::operator <<;
+    *ss_ << value;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // We'll hold the text streamed to this object here.
+  const internal::scoped_ptr< ::std::stringstream> ss_;
+
+  // We declare (but don't implement) this to prevent the compiler
+  // from implementing the assignment operator.
+  void operator=(const Message&);
+};
+
+// Streams a Message to an ostream.  Enables e.g. `std::cout << msg;`
+// by forwarding the Message's accumulated text (via GetString()).
+inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+  return os << sb.GetString();
+}
+
+namespace internal {
+
+// Converts a streamable value to an std::string.  A NULL pointer is
+// converted to "(null)".  When the input value is a ::string,
+// ::std::string, ::wstring, or ::std::wstring object, each NUL
+// character in it is replaced with "\\0".
+// Implemented by streaming into a temporary Message, so it inherits
+// all of Message's operator<< behaviour documented above.
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+  return (Message() << streamable).GetString();
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan at google.com (Zhanyong Wan), eefacm at gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test.  They are subject to change without notice. They should not used
+// by code external to Google Test.
+//
+// This header file is #included by <gtest/internal/gtest-internal.h>.
+// It should not be #included by other files.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+# include <mem.h>
+#endif
+
+#include <string.h>
+#include <string>
+
+
+namespace testing {
+namespace internal {
+
+// String - an abstract class holding static string utilities.
+class GTEST_API_ String {
+ public:
+  // Static utility methods
+  // (The class is not instantiable -- see the private constructor at
+  // the bottom; it merely groups these helpers.)
+
+  // Clones a 0-terminated C string, allocating memory using new.  The
+  // caller is responsible for deleting the return value using
+  // delete[].  Returns the cloned string, or NULL if the input is
+  // NULL.
+  //
+  // This is different from strdup() in string.h, which allocates
+  // memory using malloc().
+  static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+  // able to pass strings to Win32 APIs on CE we need to convert them
+  // to 'Unicode', UTF-16.
+
+  // Creates a UTF-16 wide string from the given ANSI string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the wide string, or NULL if the
+  // input is NULL.
+  //
+  // The wide string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static LPCWSTR AnsiToUtf16(const char* c_str);
+
+  // Creates an ANSI string from the given wide string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the ANSI string, or NULL if the
+  // input is NULL.
+  //
+  // The returned string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+  // Compares two C strings.  Returns true iff they have the same content.
+  //
+  // Unlike strcmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CStringEquals(const char* lhs, const char* rhs);
+
+  // Converts a wide C string to a String using the UTF-8 encoding.
+  // NULL will be converted to "(null)".  If an error occurred during
+  // the conversion, "(failed to convert from wide string)" is
+  // returned.
+  static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+  // Compares two wide C strings.  Returns true iff they have the same
+  // content.
+  //
+  // Unlike wcscmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+  // Compares two C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike strcasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CaseInsensitiveCStringEquals(const char* lhs,
+                                           const char* rhs);
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                               const wchar_t* rhs);
+
+  // Returns true iff the given string ends with the given suffix, ignoring
+  // case. Any string is considered to end with an empty suffix.
+  static bool EndsWithCaseInsensitive(
+      const std::string& str, const std::string& suffix);
+
+  // Formats an int value as "%02d".
+  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
+
+  // Formats an int value as "%X".
+  static std::string FormatHexInt(int value);
+
+  // Formats a byte as "%02X".
+  static std::string FormatByte(unsigned char value);
+
+ private:
+  String();  // Not meant to be instantiated.
+};  // class String
+
+// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
+// character in the buffer is replaced with "\\0".
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: keith.ray at gmail.com (Keith Ray)
+//
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test.  They are subject to change without notice.
+//
+// This file is #included in <gtest/internal/gtest-internal.h>.
+// Do not include this header file separately!
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+
+namespace testing {
+namespace internal {
+
+// FilePath - a value class for manipulating file and directory pathnames
+// in a platform-aware way (most importantly, it knows the platform's path
+// separator).  Google Test uses it when composing the names of files for
+// XML output.
+// Apart from the Set methods, every member is const or static, so a
+// FilePath behaves as an immutable value object.  A value ending in a
+// path separator ("like/this/") is taken to denote a directory; any other
+// value is taken to denote a file.  Neither form is required to
+// correspond to anything that actually exists on disk, and no syntax
+// validation (illegal characters, malformed paths, etc.) is performed.
+
+class GTEST_API_ FilePath {
+ public:
+  // Constructs an empty path ("").
+  FilePath() : pathname_() {}
+
+  // Copy constructor.
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
+
+  // Constructs a FilePath holding the given pathname, normalized so that
+  // runs of consecutive separators are collapsed (see Normalize() below).
+  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+    Normalize();
+  }
+
+  // Copy assignment.
+  FilePath& operator=(const FilePath& rhs) {
+    pathname_ = rhs.pathname_;
+    return *this;
+  }
+
+  // Replaces this path's value with rhs's.
+  void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }
+
+  // Accessors for the underlying pathname.
+  const std::string& string() const { return pathname_; }
+  const char* c_str() const { return pathname_.c_str(); }
+
+  // Returns the current working directory, or "" on failure.
+  static FilePath GetCurrentDir();
+
+  // Builds "directory/base_name.extension" when number is 0, and
+  // "directory/base_name_<number>.extension" otherwise (e.g. number 12
+  // yields "dir/test_12.xml").  On Windows the separator is '\' rather
+  // than '/'.
+  static FilePath MakeFileName(const FilePath& directory,
+                               const FilePath& base_name,
+                               int number,
+                               const char* extension);
+
+  // Joins the two parts with a separator: ("dir", "test.xml") becomes
+  // "dir/test.xml" (or "dir\test.xml" on Windows).
+  static FilePath ConcatPaths(const FilePath& directory,
+                              const FilePath& relative_path);
+
+  // Returns a pathname that does not currently exist in the file system:
+  // directory/base_name.extension if that name is free, otherwise
+  // directory/base_name_<number>.extension where <number> is incremented
+  // until an unused name is found.
+  // Examples: 'dir/foo_test.xml', then 'dir/foo_test_1.xml'.
+  // NOTE: racy -- two or more processes calling this concurrently may
+  // both be handed the same name.
+  static FilePath GenerateUniqueFileName(const FilePath& directory,
+                                         const FilePath& base_name,
+                                         const char* extension);
+
+  // Returns true iff the held pathname is "".
+  bool IsEmpty() const { return pathname_.empty(); }
+
+  // Returns the path with a single trailing separator (if present)
+  // stripped; otherwise the value is returned unmodified.
+  // On Windows the separator is '\', elsewhere '/'.
+  FilePath RemoveTrailingPathSeparator() const;
+
+  // Returns the path with its directory portion stripped:
+  // "path/to/file" becomes "file".  A name without a directory part
+  // ("just_a_file") is returned unchanged, and a pure directory
+  // ("just_a_dir/") yields the empty FilePath ("").
+  // On Windows the separator is '\', elsewhere '/'.
+  FilePath RemoveDirectoryName() const;
+
+  // Returns the path with its file portion stripped: "path/to/file"
+  // becomes "path/to/".  "a_file" or "/a_file" yield FilePath("./")
+  // (FilePath(".\\") on Windows), and a pathname without a file part
+  // ("just/a/dir/") is returned unchanged.
+  // On Windows the separator is '\', elsewhere '/'.
+  FilePath RemoveFileName() const;
+
+  // Returns the path with the given extension stripped, comparing
+  // case-insensitively: RemoveExtension("EXE") turns "dir/file.exe" into
+  // FilePath("dir/file").  When the extension does not match, a copy of
+  // the original FilePath is returned.
+  FilePath RemoveExtension(const char* extension) const;
+
+  // Creates every missing directory along this path.  Returns true on
+  // success or when the directories already exist; returns false
+  // otherwise -- including when the path does not denote a directory
+  // (i.e. does not end with a path separator).
+  bool CreateDirectoriesRecursively() const;
+
+  // Creates the single directory named by this path.  Returns true on
+  // success or when it already exists; false otherwise, including when
+  // the parent directory is missing.  (Not named "CreateDirectory"
+  // because that is a macro on Windows.)
+  bool CreateFolder() const;
+
+  // Returns true when something (file, directory, or whatever) exists at
+  // this path in the file system.
+  bool FileOrDirectoryExists() const;
+
+  // Returns true when this pathname names an existing directory in the
+  // file system.
+  bool DirectoryExists() const;
+
+  // Returns true when the pathname ends with a path separator, i.e. is
+  // intended to denote a directory.  Does NOT consult the file system.
+  bool IsDirectory() const;
+
+  // Returns true when the pathname names a root directory.  (Windows has
+  // one root directory per disk drive.)
+  bool IsRootDirectory() const;
+
+  // Returns true when the pathname is absolute.
+  bool IsAbsolutePath() const;
+
+ private:
+  // Collapses runs of consecutive separators into a single separator
+  // ("bar///foo" becomes "bar/foo").  Other redundancies involving "."
+  // or ".." are left alone.
+  //
+  // Such runs typically arise when a script or API appends "/component"
+  // to a value that already carries a trailing separator, without
+  // checking for the separator already being there.  Several FilePath
+  // operations only work correctly on normalized input; in particular,
+  // RemoveTrailingPathSeparator() strips exactly one separator and is
+  // relied upon by CreateDirectoriesRecursively() to turn directory
+  // syntax (trailing separator) into filename syntax.
+  //
+  // On Windows this also rewrites the alternate separator '/' to the
+  // primary separator '\\', so that e.g. "bar\\/\\foo" becomes
+  // "bar\\foo".
+  void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator
+  // in the pathname (on Windows both '/' and '\' qualify), or NULL when
+  // the pathname contains none.
+  const char* FindLastPathSeparator() const;
+
+  std::string pathname_;  // The held pathname, always kept normalized.
+};  // class FilePath
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+// This file was GENERATED by command:
+//     pump.py gtest-type-util.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently we support at most 50 types in a list, and at most 50
+// type-parameterized tests in one type-parameterized test case.
+// Please contact googletestframework at googlegroups.com if you need
+// more.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+
+// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+# if GTEST_HAS_CXXABI_H_
+#  include <cxxabi.h>
+# elif defined(__HP_aCC)
+#  include <acxx_demangle.h>
+# endif  // GTEST_HAS_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// GetTypeName<T>() returns a human-readable name for the type T.
+// NB: Google Mock relies on this function as well, so it must stay
+// outside the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+
+  const char* const mangled = typeid(T).name();
+#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  // On these platforms typeid(T).name() yields a mangled name; run it
+  // through the ABI demangler to obtain something readable.
+  int demangle_status = 0;
+#   if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#   endif  // GTEST_HAS_CXXABI_H_
+  char* const demangled = __cxa_demangle(mangled, 0, 0, &demangle_status);
+  // Fall back to the mangled spelling when demangling fails.
+  const std::string result(demangle_status == 0 ? demangled : mangled);
+  free(demangled);
+  return result;
+#  else
+  return mangled;
+#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+
+  // Without RTTI there is no portable way to obtain a type's name.
+  return "<type>";
+
+# endif  // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertTypeEq<T1, T2>::type is defined only when T1 and T2 name the
+// same type, so mentioning it acts as a compile-time assertion that
+// two types are equal.
+
+template <typename T, typename U>
+struct AssertTypeEq;
+
+template <typename T>
+struct AssertTypeEq<T, T> {
+  typedef bool type;
+};
+
+// A unique sentinel type used as the default value for the arguments of
+// the class template Types.  This allows us to simulate variadic
+// templates (e.g. Types<int>, Types<int, double>, etc.), which this
+// dialect of C++ doesn't support directly.
+struct None {};
+
+// The structs below encode compile-time lists of types.  A list with N
+// elements is represented by TypesN<T1, ..., TN>, which exposes its
+// first element as the member type Head and the remaining N-1 elements
+// as the (shorter) list Tail.  Types0, the empty list, has neither
+// member.
+
+// The empty type list.
+struct Types0 {};
+
+// Non-empty type lists, one struct per supported length.
+
+template <typename T1>
+struct Types1 {
+  typedef T1 Head;
+  typedef Types0 Tail;
+};
+
+template <typename T1, typename T2>
+struct Types2 {
+  typedef T1 Head;
+  typedef Types1<T2> Tail;
+};
+
+template <typename T1, typename T2, typename T3>
+struct Types3 {
+  typedef T1 Head;
+  typedef Types2<T2, T3> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types4 {
+  typedef T1 Head;
+  typedef Types3<T2, T3, T4> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types5 {
+  typedef T1 Head;
+  typedef Types4<T2, T3, T4, T5> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+struct Types6 {
+  typedef T1 Head;
+  typedef Types5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7>
+struct Types7 {
+  typedef T1 Head;
+  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8>
+struct Types8 {
+  typedef T1 Head;
+  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9>
+struct Types9 {
+  typedef T1 Head;
+  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10>
+struct Types10 {
+  typedef T1 Head;
+  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11>
+struct Types11 {
+  typedef T1 Head;
+  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12>
+struct Types12 {
+  typedef T1 Head;
+  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13>
+struct Types13 {
+  typedef T1 Head;
+  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14>
+struct Types14 {
+  typedef T1 Head;
+  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15>
+struct Types15 {
+  typedef T1 Head;
+  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16>
+struct Types16 {
+  typedef T1 Head;
+  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17>
+struct Types17 {
+  typedef T1 Head;
+  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18>
+struct Types18 {
+  typedef T1 Head;
+  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19>
+struct Types19 {
+  typedef T1 Head;
+  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20>
+struct Types20 {
+  typedef T1 Head;
+  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21>
+struct Types21 {
+  typedef T1 Head;
+  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22>
+struct Types22 {
+  typedef T1 Head;
+  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23>
+struct Types23 {
+  typedef T1 Head;
+  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24>
+struct Types24 {
+  typedef T1 Head;
+  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25>
+struct Types25 {
+  typedef T1 Head;
+  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26>
+struct Types26 {
+  typedef T1 Head;
+  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27>
+struct Types27 {
+  typedef T1 Head;
+  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28>
+struct Types28 {
+  typedef T1 Head;
+  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29>
+struct Types29 {
+  typedef T1 Head;
+  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types30 {
+  typedef T1 Head;
+  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types31 {
+  typedef T1 Head;
+  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types32 {
+  typedef T1 Head;
+  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types33 {
+  typedef T1 Head;
+  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types34 {
+  typedef T1 Head;
+  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types35 {
+  typedef T1 Head;
+  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35, typename T36>
+struct Types36 {
+  typedef T1 Head;
+  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35, typename T36,
+    typename T37>
+struct Types37 {
+  typedef T1 Head;
+  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6,
+    typename T7, typename T8, typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16, typename T17, typename T18,
+    typename T19, typename T20, typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38>
+struct Types38 {
+  typedef T1 Head;
+  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types39 {
+  typedef T1 Head;
+  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types40 {
+  typedef T1 Head;
+  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types41 {
+  typedef T1 Head;
+  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types42 {
+  typedef T1 Head;
+  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types43 {
+  typedef T1 Head;
+  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types44 {
+  typedef T1 Head;
+  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types45 {
+  typedef T1 Head;
+  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types46 {
+  typedef T1 Head;
+  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types47 {
+  typedef T1 Head;
+  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types48 {
+  typedef T1 Head;
+  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types49 {
+  typedef T1 Head;
+  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct Types50 {
+  typedef T1 Head;
+  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+}  // namespace internal
+
+// We don't want to require the users to write TypesN<...> directly,
+// as that would require them to count the length.  Types<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Types<int>
+// will appear as Types<int, None, None, ..., None> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Types<T1, ..., TN>, and Google Test will translate
+// that to TypesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Types template.
+template <typename T1 = internal::None, typename T2 = internal::None,
+    typename T3 = internal::None, typename T4 = internal::None,
+    typename T5 = internal::None, typename T6 = internal::None,
+    typename T7 = internal::None, typename T8 = internal::None,
+    typename T9 = internal::None, typename T10 = internal::None,
+    typename T11 = internal::None, typename T12 = internal::None,
+    typename T13 = internal::None, typename T14 = internal::None,
+    typename T15 = internal::None, typename T16 = internal::None,
+    typename T17 = internal::None, typename T18 = internal::None,
+    typename T19 = internal::None, typename T20 = internal::None,
+    typename T21 = internal::None, typename T22 = internal::None,
+    typename T23 = internal::None, typename T24 = internal::None,
+    typename T25 = internal::None, typename T26 = internal::None,
+    typename T27 = internal::None, typename T28 = internal::None,
+    typename T29 = internal::None, typename T30 = internal::None,
+    typename T31 = internal::None, typename T32 = internal::None,
+    typename T33 = internal::None, typename T34 = internal::None,
+    typename T35 = internal::None, typename T36 = internal::None,
+    typename T37 = internal::None, typename T38 = internal::None,
+    typename T39 = internal::None, typename T40 = internal::None,
+    typename T41 = internal::None, typename T42 = internal::None,
+    typename T43 = internal::None, typename T44 = internal::None,
+    typename T45 = internal::None, typename T46 = internal::None,
+    typename T47 = internal::None, typename T48 = internal::None,
+    typename T49 = internal::None, typename T50 = internal::None>
+struct Types {
+  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
+
+template <>
+struct Types<internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types0 type;
+};
+template <typename T1>
+struct Types<T1, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types1<T1> type;
+};
+template <typename T1, typename T2>
+struct Types<T1, T2, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types2<T1, T2> type;
+};
+template <typename T1, typename T2, typename T3>
+struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types3<T1, T2, T3> type;
+};
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types4<T1, T2, T3, T4> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types5<T1, T2, T3, T4, T5> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, internal::None, internal::None, internal::None> {
+  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, internal::None, internal::None> {
+  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, T49, internal::None> {
+  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+namespace internal {
+
+# define GTEST_TEMPLATE_ template <typename T> class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>.  This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+  template <typename T>
+  struct Bind {
+    typedef Tmpl<T> type;
+  };
+};
+
+# define GTEST_BIND_(TmplSel, T) \
+  TmplSel::template Bind<T>::type
+
+// A unique struct template used as the default value for the
+// arguments of class template Templates.  This allows us to simulate
+// variadic templates (e.g. Templates<int>, Templates<int, double>,
+// and etc), which C++ doesn't support directly.
+template <typename T>
+struct NoneT {};
+
+// The following family of struct and struct templates are used to
+// represent template lists.  In particular, TemplatesN<T1, T2, ...,
+// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
+// for Templates0, every struct in the family has two member types:
+// Head for the selector of the first template in the list, and Tail
+// for the rest of the list.
+
+// The empty template list.
+struct Templates0 {};
+
+// Template lists of length 1, 2, 3, and so on.
+
+template <GTEST_TEMPLATE_ T1>
+struct Templates1 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates0 Tail;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates2 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates1<T2> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates3 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates2<T2, T3> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates4 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates3<T2, T3, T4> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates5 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates4<T2, T3, T4, T5> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates6 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates7 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates8 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates9 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates10 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates11 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates12 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates13 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates14 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates15 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates16 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates17 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates18 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates19 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates20 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates21 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates22 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates23 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates24 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates25 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates26 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates27 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates28 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates29 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates30 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates31 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates32 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates33 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates34 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates35 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates36 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates37 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates38 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates39 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates40 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates41 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates42 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates43 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates44 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates45 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates46 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates47 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates48 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates49 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
+struct Templates50 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+// We don't want to require the users to write TemplatesN<...> directly,
+// as that would require them to count the length.  Templates<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Templates<list>
+// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Templates<T1, ..., TN>, and Google Test will translate
+// that to TemplatesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Templates template.
+template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
+    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
+    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
+    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
+    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
+    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
+    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
+    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
+    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
+    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
+    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
+    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
+    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
+    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
+    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
+    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
+    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
+    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
+    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
+    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
+    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
+    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
+    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
+    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
+    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
+struct Templates {
+  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
+
+template <>
+struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates0 type;
+};
+template <GTEST_TEMPLATE_ T1>
+struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates1<T1> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates2<T1, T2> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates3<T1, T2, T3> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates4<T1, T2, T3, T4> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates5<T1, T2, T3, T4, T5> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45> type;
+};
+// Pump-generated specialization: 46 concrete template-template arguments
+// followed by four NoneT placeholders collapses to Templates46.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46> type;
+};
+// Pump-generated specialization: 47 concrete template-template arguments
+// followed by three NoneT placeholders collapses to Templates47.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, NoneT, NoneT, NoneT> {
+  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47> type;
+};
+// Pump-generated specialization: 48 concrete template-template arguments
+// followed by two NoneT placeholders collapses to Templates48.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, NoneT, NoneT> {
+  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48> type;
+};
+// Pump-generated specialization: 49 concrete template-template arguments
+// followed by one NoneT placeholder collapses to Templates49.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, T49, NoneT> {
+  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+// The TypeList template makes it possible to use either a single type
+// or a Types<...> list in TYPED_TEST_CASE() and
+// INSTANTIATE_TYPED_TEST_CASE_P().
+
+// Primary template: a bare type T is treated as a one-element type list.
+template <typename T>
+struct TypeList {
+  typedef Types1<T> type;
+};
+
+// Specialization for an explicit Types<...> list (internally padded to 50
+// slots): unwrap it to its canonical TypesN form via the nested ::type.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> > {
+  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__.  Writing
+//
+//   foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number.  For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+// The extra level of indirection forces 'foo' and 'bar' to be macro-expanded
+// before the ## pasting happens.
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+
+class ProtocolMessage;
+namespace proto2 { class Message; }
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;                 // Result of an assertion.
+class Message;                         // Represents a failure message.
+class Test;                            // Represents a test.
+class TestInfo;                        // Information about a test.
+class TestPartResult;                  // Result of a test part.
+class UnitTest;                        // A collection of test cases.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo;                      // Information about a trace point.
+class ScopedTrace;                     // Implements scoped trace.
+class TestInfoImpl;                    // Opaque implementation of TestInfo
+class UnitTestImpl;                    // Opaque implementation of UnitTest
+
+// How many times InitGoogleTest() has been called.
+GTEST_API_ extern int g_init_gtest_count;
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// Two overloaded helpers for checking at compile time whether an
+// expression is a null pointer literal (i.e. NULL or any 0-valued
+// compile-time integral constant).  Their return values have
+// different sizes, so we can use sizeof() to test which version is
+// picked by the compiler.  These helpers have no implementations, as
+// we only need their signatures.
+//
+// Given IsNullLiteralHelper(x), the compiler will pick the first
+// version if x can be implicitly converted to Secret*, and pick the
+// second version otherwise.  Since Secret is a secret and incomplete
+// type, the only expression a user can write that has type Secret* is
+// a null pointer literal.  Therefore, we know that x is a null
+// pointer literal if and only if the first version is picked by the
+// compiler.
+char IsNullLiteralHelper(Secret* p);
+char (&IsNullLiteralHelper(...))[2];  // NOLINT
+
+// A compile-time bool constant that is true if and only if x is a
+// null pointer literal (i.e. NULL or any 0-valued compile-time
+// integral constant).
+#ifdef GTEST_ELLIPSIS_NEEDS_POD_
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_IS_NULL_LITERAL_(x) false
+#else
+// sizeof() is 1 exactly when overload resolution picks the Secret*
+// overload of IsNullLiteralHelper, i.e. when x is a null pointer literal.
+# define GTEST_IS_NULL_LITERAL_(x) \
+    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
+#endif  // GTEST_ELLIPSIS_NEEDS_POD_
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(
+    const std::string& gtest_msg, const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled).  We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time.  Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  // 'explicit' prevents accidental implicit conversion from a
+  // TestPartResult.  (The constructor is defined out of line.)
+  explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// A helper class for creating scoped traces in user programs.
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+  ScopedTrace(const char* file, int line, const Message& message);
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  // A trace frame is tied to a single scope, so copying is disallowed.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner–Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<size_t>& left, const std::vector<size_t>& right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                                         const std::vector<std::string>& right,
+                                         size_t context = 2);
+
+}  // namespace edit_distance
+
+// Calculate the diff between 'left' and 'right' and return it in unified diff
+// format.
+// If not null, stores in 'total_line_count' the total number of lines found
+// in left + right.
+GTEST_API_ std::string DiffStrings(const std::string& left,
+                                   const std::string& right,
+                                   size_t* total_line_count);
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison.  (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly.  Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+//   The most-significant bit being the leftmost, an IEEE
+//   floating-point looks like
+//
+//     sign_bit exponent_bits fraction_bits
+//
+//   Here, sign_bit is a single bit that designates the sign of the
+//   number.
+//
+//   For float, there are 8 exponent bits and 23 fraction bits.
+//
+//   For double, there are 11 exponent bits and 52 fraction bits.
+//
+//   More details can be found at
+//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+  // Defines the unsigned integer type that has the same size as the
+  // floating point number.
+  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+  // Constants.
+
+  // # of bits in a number.
+  static const size_t kBitCount = 8*sizeof(RawType);
+
+  // # of fraction bits in a number.
+  static const size_t kFractionBitCount =
+    std::numeric_limits<RawType>::digits - 1;
+
+  // # of exponent bits in a number.
+  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+  // The mask for the sign bit.
+  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+  // The mask for the fraction bits.
+  static const Bits kFractionBitMask =
+    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+  // The mask for the exponent bits.
+  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+  // How many ULP's (Units in the Last Place) we want to tolerate when
+  // comparing two numbers.  The larger the value, the more error we
+  // allow.  A 0 value means that two numbers must be exactly the same
+  // to be considered equal.
+  //
+  // The maximum error of a single floating-point operation is 0.5
+  // units in the last place.  On Intel CPU's, all floating-point
+  // calculations are done with 80-bit precision, while double has 64
+  // bits.  Therefore, 4 should be enough for ordinary use.
+  //
+  // See the following article for more details on ULP:
+  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static const size_t kMaxUlps = 4;
+
+  // Constructs a FloatingPoint from a raw floating-point number.
+  //
+  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+  // around may change its bits, although the new value is guaranteed
+  // to be also a NAN.  Therefore, don't expect this constructor to
+  // preserve the bits in x when x is a NAN.
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+  // Static methods
+
+  // Reinterprets a bit pattern as a floating-point number.
+  //
+  // This function is needed to test the AlmostEquals() method.
+  static RawType ReinterpretBits(const Bits bits) {
+    FloatingPoint fp(0);
+    fp.u_.bits_ = bits;
+    return fp.u_.value_;
+  }
+
+  // Returns the floating-point number that represent positive infinity.
+  static RawType Infinity() {
+    // All exponent bits set with a zero fraction encodes +infinity.
+    return ReinterpretBits(kExponentBitMask);
+  }
+
+  // Returns the maximum representable finite floating-point number.
+  static RawType Max();
+
+  // Non-static methods
+
+  // Returns the bits that represents this number.
+  const Bits &bits() const { return u_.bits_; }
+
+  // Returns the exponent bits of this number.
+  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+  // Returns the fraction bits of this number.
+  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+  // Returns the sign bit of this number.
+  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+  // Returns true iff this is NAN (not a number).
+  bool is_nan() const {
+    // It's a NAN if the exponent bits are all ones and the fraction
+    // bits are not entirely zeros.
+    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+  }
+
+  // Returns true iff this number is at most kMaxUlps ULP's away from
+  // rhs.  In particular, this function:
+  //
+  //   - returns false if either number is (or both are) NAN.
+  //   - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 ULP's apart.
+  bool AlmostEquals(const FloatingPoint& rhs) const {
+    // The IEEE standard says that any comparison operation involving
+    // a NAN must return false.
+    if (is_nan() || rhs.is_nan()) return false;
+
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+        <= kMaxUlps;
+  }
+
+ private:
+  // The data type used to store the actual floating-point number.
+  // NOTE(review): writing one union member and reading the other is
+  // type-punning; formally undefined in standard C++, but relied on here
+  // and supported as a bit reinterpretation by the targeted compilers.
+  union FloatingPointUnion {
+    RawType value_;  // The raw floating-point number.
+    Bits bits_;      // The bits that represent the number.
+  };
+
+  // Converts an integer from the sign-and-magnitude representation to
+  // the biased representation.  More precisely, let N be 2 to the
+  // power of (kBitCount - 1), an integer x is represented by the
+  // unsigned number x + N.
+  //
+  // For instance,
+  //
+  //   -N + 1 (the most negative number representable using
+  //          sign-and-magnitude) is represented by 1;
+  //   0      is represented by N; and
+  //   N - 1  (the biggest number representable using
+  //          sign-and-magnitude) is represented by 2N - 1.
+  //
+  // Read http://en.wikipedia.org/wiki/Signed_number_representations
+  // for more details on signed number representations.
+  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+    if (kSignBitMask & sam) {
+      // sam represents a negative number.
+      return ~sam + 1;
+    } else {
+      // sam represents a positive number.
+      return kSignBitMask | sam;
+    }
+  }
+
+  // Given two numbers in the sign-and-magnitude representation,
+  // returns the distance between them as an unsigned number.
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+                                                     const Bits &sam2) {
+    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+  }
+
+  FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.  FLT_MAX and DBL_MAX are plain constants
+// from <cfloat>, so they are immune to that macro.
+template <>
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+template <>
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test case, we need to assign
+// unique IDs to fixture classes and compare them.  The TypeId type is
+// used to hold such IDs.  The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+// Helper whose sole purpose is to own one static variable per instantiating
+// type; that variable's address serves as the type's unique ID (see
+// GetTypeId below).
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type.  Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T.  Different values will be
+// returned for different types.  Calling the function twice with the
+// same type argument is guaranteed to return the same ID.  Because the
+// ID is the address of a static variable, it is stable for the lifetime
+// of the program.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template.  Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
+
+// Returns the type ID of ::testing::Test.  Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run().
+  virtual Test* CreateTest() = 0;
+
+ protected:
+  // Protected so the class can only be constructed through a subclass.
+  TestFactoryBase() {}
+
+ private:
+  // Factories are not copyable.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides an implementation of the TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  // Returns a new default-constructed TestClass; the caller takes ownership.
+  virtual Test* CreateTest() { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestCase() and TearDownTestCase() functions.
+typedef void (*SetUpTestCaseFunc)();
+typedef void (*TearDownTestCaseFunc)();
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param        the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param       text representation of the test's value parameter,
+//                     or NULL if this is not a type-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// State of the definition of a type-parameterized test case.
+class GTEST_API_ TypedTestCasePState {
+ public:
+  TypedTestCasePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and return true
+  // if the test case hasn't been registered; otherwise aborts the
+  // program.
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
+    if (registered_) {
+      fprintf(stderr, "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+              FormatFileLocation(file, line).c_str(), test_name, case_name);
+      fflush(stderr);
+      posix::Abort();
+    }
+    defined_test_names_.insert(test_name);
+    return true;
+  }
+
+  // Verifies that registered_tests match the test names in
+  // defined_test_names_; returns registered_tests if successful, or
+  // aborts the program otherwise.
+  const char* VerifyRegisteredTestNames(
+      const char* file, int line, const char* registered_tests);
+
+ private:
+  // True once REGISTER_TYPED_TEST_CASE_P() has run for this test case.
+  bool registered_;
+  // Names passed to AddTestName.  NOTE(review): the set holds raw
+  // const char* keys, so ordering/uniqueness is by pointer value, not
+  // string content — confirm callers only pass string literals.
+  ::std::set<const char*> defined_test_names_;
+};
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  if (comma == NULL) {
+    return NULL;
+  }
+  // Advance past the comma, then past any run of whitespace.
+  while (IsSpace(*(++comma))) {}
+  return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  // std::string(str, comma) copies the half-open range [str, comma).
+  return comma == NULL ? str : std::string(str, comma);
+}
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test.  The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter.  It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+  // 'index' is the index of the test in the type list 'Types'
+  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+  // Types).  Valid values for 'index' are [0, N - 1] where N is the
+  // length of Types.
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names, int index) {
+    typedef typename Types::Head Type;
+    typedef Fixture<Type> FixtureClass;
+    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+    // First, registers the first type-parameterized test in the type
+    // list.  The generated test case name has the form
+    // "<prefix>/<case_name>/<index>" (the leading "/" is omitted when
+    // prefix is empty).
+    MakeAndRegisterTestInfo(
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
+         + StreamableToString(index)).c_str(),
+        StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
+        GetTypeName<Type>().c_str(),
+        NULL,  // No value parameter.
+        GetTypeId<FixtureClass>(),
+        TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase,
+        new TestFactoryImpl<TestClass>);
+
+    // Next, recurses (at compile time) with the tail of the type list.
+    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
+        ::Register(prefix, case_name, test_names, index + 1);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, Types0> {
+ public:
+  // An empty type list means every type has been registered; succeed
+  // without doing anything.
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/, int /*index*/) {
+    return true;
+  }
+};
+
+// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test.  The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestCase {
+ public:
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names) {
+    // Head is the first test selector in the 'Tests' list.
+    typedef typename Tests::Head Head;
+
+    // First, register the first test in 'Test' for each type in 'Types'.
+    TypeParameterizedTest<Fixture, Head, Types>::Register(
+        prefix, case_name, test_names, 0);
+
+    // Next, recurses (at compile time) with the tail of the test list.
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
+        ::Register(prefix, case_name, SkipComma(test_names));
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+ public:
+  // An empty test list means every test has been registered; succeed
+  // without doing anything.
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/) {
+    return true;
+  }
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+    UnitTest* unit_test, int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.  Delegates to AlwaysTrue() so the value is not
+// a compile-time constant, which is the point of these helpers.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+  // Implicit on purpose: allows a string literal to initialize a
+  // ConstCharPtr declared inside an if-condition (see GTEST_TEST_THROW_).
+  ConstCharPtr(const char* str) : value(str) {}
+  // Always "true" so the branch declaring the variable is always taken.
+  operator bool() const { return true; }
+  const char* value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution.  Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+  // Upper bound (exclusive) on the ranges Generate() accepts: 2^31.
+  static const UInt32 kMaxRange = 1u << 31;
+
+  explicit Random(UInt32 seed) : state_(seed) {}
+
+  // Restarts the sequence from the given seed.
+  void Reseed(UInt32 seed) { state_ = seed; }
+
+  // Generates a random number from [0, range).  Crashes if 'range' is
+  // 0 or greater than kMaxRange.
+  UInt32 Generate(UInt32 range);
+
+ private:
+  UInt32 state_;  // Entire generator state.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+};
+
+// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
+// compiler error iff T1 and T2 are different types.
+template <typename T1, typename T2>
+struct CompileAssertTypesEqual;
+
+// Only the T == T case is complete; any mismatch instantiates the
+// incomplete primary template above and fails to compile.
+template <typename T>
+struct CompileAssertTypesEqual<T, T> {
+};
+
+// Removes the reference from a type if it is a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::remove_reference, which is not widely available yet.
+template <typename T>
+struct RemoveReference { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveReference<T&> { typedef T type; };  // NOLINT
+
+// A handy wrapper around RemoveReference that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_REFERENCE_(T) \
+    typename ::testing::internal::RemoveReference<T>::type
+
+// Removes const from a type if it is a const type, otherwise leaves
+// it unchanged.  This is the same as tr1::remove_const, which is not
+// widely available yet.
+template <typename T>
+struct RemoveConst { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveConst<const T> { typedef T type; };  // NOLINT
+
+// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
+// definition to fail to remove the const in 'const int[3]' and 'const
+// char[3][4]'.  The following specialization works around the bug.
+// It recurses on the element type so every dimension of a
+// multi-dimensional array is handled.
+template <typename T, size_t N>
+struct RemoveConst<const T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+
+#if defined(_MSC_VER) && _MSC_VER < 1400
+// This is the only specialization that allows VC++ 7.1 to remove const in
+// 'const int[3] and 'const int[3][4]'.  However, it causes trouble with GCC
+// and thus needs to be conditionally compiled.
+template <typename T, size_t N>
+struct RemoveConst<T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+#endif
+
+// A handy wrapper around RemoveConst that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_CONST_(T) \
+    typename ::testing::internal::RemoveConst<T>::type
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
+
+// Adds reference to a type if it is not a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::add_reference, which is not widely available yet.
+template <typename T>
+struct AddReference { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddReference<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper around AddReference that works when the argument T
+// depends on template parameters.
+#define GTEST_ADD_REFERENCE_(T) \
+    typename ::testing::internal::AddReference<T>::type
+
+// Adds a reference to const on top of T as necessary.  For example,
+// it transforms
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> const char&
+//   const char&  ==> const char&
+//
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
+
+// ImplicitlyConvertible<From, To>::value is a compile-time bool
+// constant that's true iff type From can be implicitly converted to
+// type To.
+template <typename From, typename To>
+class ImplicitlyConvertible {
+ private:
+  // We need the following helper functions only for their types.
+  // They have no implementations.
+
+  // MakeFrom() is an expression whose type is From.  We cannot simply
+  // use From(), as the type From may not have a public default
+  // constructor.
+  static typename AddReference<From>::type MakeFrom();
+
+  // These two functions are overloaded.  Given an expression
+  // Helper(x), the compiler will pick the first version if x can be
+  // implicitly converted to type To; otherwise it will pick the
+  // second version.
+  //
+  // The first version returns a value of size 1, and the second
+  // version returns a value of size 2.  Therefore, by checking the
+  // size of Helper(x), which can be done at compile time, we can tell
+  // which version of Helper() is used, and hence whether x can be
+  // implicitly converted to type To.
+  static char Helper(To);
+  static char (&Helper(...))[2];  // NOLINT
+
+  // We have to put the 'public' section after the 'private' section,
+  // or MSVC refuses to compile the code.
+ public:
+#if defined(__BORLANDC__)
+  // C++Builder cannot use member overload resolution during template
+  // instantiation.  The simplest workaround is to use its C++0x type traits
+  // functions (C++Builder 2009 and above only).
+  static const bool value = __is_convertible(From, To);
+#else
+  // MSVC warns about implicitly converting from double to int for
+  // possible loss of data, so we need to temporarily disable the
+  // warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244)
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif  // __BORLANDC__
+};
+// Out-of-class definition gives 'value' storage in case it is ODR-used.
+template <typename From, typename To>
+const bool ImplicitlyConvertible<From, To>::value;
+
+// IsAProtocolMessage<T>::value is a compile-time bool constant that's
+// true iff T is type ProtocolMessage, proto2::Message, or a subclass
+// of those.  Detection works by asking whether a const T* converts
+// implicitly to a pointer to either base class.
+template <typename T>
+struct IsAProtocolMessage
+    : public bool_constant<
+  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
+  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
+};
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.  Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// Note that we look for both C::iterator and C::const_iterator.  The
+// reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator').  If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+//
+// The two overloads return distinct types (int vs. char); call sites
+// presumably distinguish them via sizeof -- confirm at the point of
+// use, which is outside this header section.
+typedef int IsContainer;
+template <class C>
+IsContainer IsContainerTest(int /* dummy */,
+                            typename C::iterator* /* it */ = NULL,
+                            typename C::const_iterator* /* const_it */ = NULL) {
+  return 0;
+}
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+
+// EnableIf<condition>::type is void when 'Cond' is true, and
+// undefined when 'Cond' is false.  To use SFINAE to make a function
+// overload only apply when a particular expression is true, add
+// "typename EnableIf<expression>::type* = 0" as the last parameter.
+template<bool> struct EnableIf;
+template<> struct EnableIf<true> { typedef void type; };  // NOLINT
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0.  When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+
+// This overload is used when k >= 1.  The explicit internal::
+// qualification keeps the recursion within these overloads
+// (presumably guarding against unintended ADL matches -- confirm).
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+  return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+  for (size_t i = 0; i != size; i++) {
+    // Recurses one dimension down; bottoms out at the scalar overload.
+    if (!internal::ArrayEq(lhs[i], rhs[i]))
+      return false;
+  }
+  return true;
+}
+
+// Linear search over [begin, end) for the first element equal to
+// 'elem', comparing via internal::ArrayEq so that elements which are
+// themselves native arrays compare element-wise.  Returns 'end' when
+// no match exists.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+  Iter pos = begin;
+  while (pos != end) {
+    if (internal::ArrayEq(*pos, elem))
+      break;
+    ++pos;
+  }
+  return pos;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0.  When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) { *to = from; }
+
+// This overload is used when k >= 1.  'to' is a pointer to an array
+// (U(*)[N]) so that nested dimensions recurse through this overload.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+  internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+  for (size_t i = 0; i != size; i++) {
+    internal::CopyArray(from[i], to + i);
+  }
+}
+
+// The relation between an NativeArray object (see below) and the
+// native array it represents.
+// We use 2 different structs to allow non-copyable types to be used, as long
+// as RelationToSourceReference() is passed.
+struct RelationToSourceReference {};
+struct RelationToSourceCopy {};
+
+// Adapts a native array to a read-only STL-style container.  Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers.  New members
+// should be added as needed.  To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier).  It's the client's responsibility to satisfy
+// this requirement.  Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+  // STL-style container typedefs.
+  typedef Element value_type;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
+
+  // Constructs from a native array. References the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceReference) {
+    InitRef(array, count);
+  }
+
+  // Constructs from a native array. Copies the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
+    InitCopy(array, count);
+  }
+
+  // Copy constructor.  Re-runs whichever Init* the source used, so a
+  // referencing array stays a reference and a copying array deep-copies.
+  NativeArray(const NativeArray& rhs) {
+    (this->*rhs.clone_)(rhs.array_, rhs.size_);
+  }
+
+  // Frees the buffer only when this object owns it (i.e. it was
+  // initialized via InitCopy rather than InitRef).
+  ~NativeArray() {
+    if (clone_ != &NativeArray::InitRef)
+      delete[] array_;
+  }
+
+  // STL-style container methods.
+  size_t size() const { return size_; }
+  const_iterator begin() const { return array_; }
+  const_iterator end() const { return array_ + size_; }
+  bool operator==(const NativeArray& rhs) const {
+    return size() == rhs.size() &&
+        ArrayEq(begin(), size(), rhs.begin());
+  }
+
+ private:
+  // Compile-time check (via StaticAssertTypeEqHelper) that Element has
+  // no top-level const or reference modifier.
+  enum {
+    kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
+        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value,
+  };
+
+  // Initializes this object with a copy of the input.
+  void InitCopy(const Element* array, size_t a_size) {
+    Element* const copy = new Element[a_size];
+    CopyArray(array, a_size, copy);
+    array_ = copy;
+    size_ = a_size;
+    clone_ = &NativeArray::InitCopy;
+  }
+
+  // Initializes this object with a reference of the input.
+  void InitRef(const Element* array, size_t a_size) {
+    array_ = array;
+    size_ = a_size;
+    clone_ = &NativeArray::InitRef;
+  }
+
+  const Element* array_;
+  size_t size_;
+  // Records which Init* was used; doubles as the ownership flag read
+  // by the destructor and as the initializer for the copy constructor.
+  void (NativeArray::*clone_)(const Element*, size_t);
+
+  GTEST_DISALLOW_ASSIGN_(NativeArray);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+// Records a test-part result of the given type at the given source
+// location.  The trailing '= ::testing::Message()' lets callers append
+// a streamed message with '<<'.
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+  ::testing::internal::AssertHelper(result_type, file, line, message) \
+    = ::testing::Message()
+
+// Same as GTEST_MESSAGE_AT_, at the current file and line.
+#define GTEST_MESSAGE_(message, result_type) \
+  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
+
+// Reports a fatal failure and returns from the enclosing function.
+#define GTEST_FATAL_FAILURE_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+// Reports a non-fatal failure; execution continues.
+#define GTEST_NONFATAL_FAILURE_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+// Records a success result.
+#define GTEST_SUCCESS_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+// Suppresses MSVC warning 4702 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+  if (::testing::internal::AlwaysTrue()) { statement; }
+
+// Implements {ASSERT,EXPECT}_THROW.  The 'if' condition on a
+// ConstCharPtr is always true (see ConstCharPtr above); it exists only
+// to scope gtest_msg while keeping the macro usable as a single
+// statement.  On failure, control jumps to the label in the
+// dangling-else branch, where 'fail' streams gtest_msg.value.
+#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
+    bool gtest_caught_expected = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (expected_exception const&) { \
+      gtest_caught_expected = true; \
+    } \
+    catch (...) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws a different type."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+    if (!gtest_caught_expected) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws nothing."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
+      fail(gtest_msg.value)
+
+// Implements {ASSERT,EXPECT}_NO_THROW: fails (via the dangling-else
+// label) iff the statement throws anything at all.
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
+      fail("Expected: " #statement " doesn't throw an exception.\n" \
+           "  Actual: it throws.")
+
+// Implements {ASSERT,EXPECT}_ANY_THROW: fails (via the dangling-else
+// label) iff the statement completes without throwing.
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    bool gtest_caught_any = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      gtest_caught_any = true; \
+    } \
+    if (!gtest_caught_any) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
+      fail("Expected: " #statement " throws an exception.\n" \
+           "  Actual: it doesn't.")
+
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// representation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar_ = \
+      ::testing::AssertionResult(expression)) \
+    ; \
+  else \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(\
+        gtest_ar_, text, #actual, #expected).c_str())
+
+// Implements {ASSERT,EXPECT}_NO_FATAL_FAILURE: runs the statement under
+// a HasNewFatalFailureHelper and fails if it recorded any new fatal
+// failure in the current thread.
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
+      fail("Expected: " #statement " doesn't generate new fatal " \
+           "failures in the current thread.\n" \
+           "  Actual: it does.")
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+  test_case_name##_##test_name##_Test
+
+// Helper macro for defining tests.  Declares a class derived from
+// parent_class whose TestBody() the user writes right after the macro
+// invocation, and registers it by initializing the static test_info_
+// member at namespace scope (i.e. during static initialization,
+// before main() runs).
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
+class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
+ public:\
+  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
+ private:\
+  virtual void TestBody();\
+  static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
+};\
+\
+::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
+  ::test_info_ =\
+    ::testing::internal::MakeAndRegisterTestInfo(\
+        #test_case_name, #test_name, NULL, NULL, \
+        (parent_id), \
+        parent_class::SetUpTestCase, \
+        parent_class::TearDownTestCase, \
+        new ::testing::internal::TestFactoryImpl<\
+            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
+void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for death tests.  It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan at google.com (Zhanyong Wan), eefacm at gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+
+#include <stdio.h>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).  These
+// correspond to the --gtest_death_test_style, --gtest_death_test_use_fork
+// and --gtest_internal_run_death_test command-line flags.
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set.  If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char* statement, const RE* regex,
+                     const char* file, int line, DeathTest** test);
+  DeathTest();
+  virtual ~DeathTest() { }
+
+  // A helper class that aborts a death test when it's deleted.  An
+  // instance is placed on the stack around the test statement (see
+  // GTEST_DEATH_TEST_'s EXECUTE_TEST branch); if the statement returns
+  // instead of dying, the destructor reports
+  // TEST_ENCOUNTERED_RETURN_STATEMENT via Abort().
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+   private:
+    DeathTest* const test_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered.  EXECUTE means that the death test logic should
+  // be executed immediately.  OVERSEE means that the program should prepare
+  // the appropriate environment for a child process to execute the death
+  // test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable outcome message regarding the outcome of
+  // the last death test.
+  static const char* LastMessage();
+
+  static void set_last_death_test_message(const std::string& message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+// Factory interface for death tests.  May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() { }
+  // Same contract as DeathTest::Create() above.
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test);
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+// Only std::exception-derived exceptions get their what() message
+// printed to stderr; every escaping exception is reported via
+// Abort(TEST_THREW_EXCEPTION).
+# if GTEST_HAS_EXCEPTIONS
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  try { \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } catch (const ::std::exception& gtest_exception) { \
+    fprintf(\
+        stderr, \
+        "\n%s: Caught std::exception-derived exception escaping the " \
+        "death test statement. Exception message: %s\n", \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what()); \
+    fflush(stderr); \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) { \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+# else
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+# endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+//
+// Control flow: DeathTest::Create() picks a concrete DeathTest (a NULL
+// gtest_dt means the test is skipped).  In the OVERSEE_TEST role we
+// wait for the child and check its outcome via 'predicate' and
+// Passed(); in the EXECUTE_TEST role we run the statement under a
+// ReturnSentinel so a premature return is reported through Abort().
+// Any failure jumps to the label in the dangling-else branch, where
+// 'fail' streams DeathTest::LastMessage().
+# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    const ::testing::internal::RE& gtest_regex = (regex); \
+    ::testing::internal::DeathTest* gtest_dt; \
+    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
+        __FILE__, __LINE__, &gtest_dt)) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+    } \
+    if (gtest_dt != NULL) { \
+      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
+          gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) { \
+        case ::testing::internal::DeathTest::OVERSEE_TEST: \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+          } \
+          break; \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+          ::testing::internal::DeathTest::ReturnSentinel \
+              gtest_sentinel(gtest_dt); \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+          break; \
+        } \
+        default: \
+          break; \
+      } \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
+      fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed, the regex is
+// ignored, and the macro must accept a streamed message even though the message
+// is never printed.
+# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+     GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } else \
+    ::testing::Message()
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+  // Takes ownership of a_write_fd: the destructor closes it when >= 0.
+  InternalRunDeathTestFlag(const std::string& a_file,
+                           int a_line,
+                           int an_index,
+                           int a_write_fd)
+      : file_(a_file), line_(a_line), index_(an_index),
+        write_fd_(a_write_fd) {}
+
+  ~InternalRunDeathTestFlag() {
+    if (write_fd_ >= 0)
+      posix::Close(write_fd_);
+  }
+
+  // Read-only accessors for the parsed flag fields.
+  const std::string& file() const { return file_; }
+  int line() const { return line_; }
+  int index() const { return index_; }
+  int write_fd() const { return write_fd_; }
+
+ private:
+  std::string file_;
+  int line_;
+  int index_;
+  int write_fd_;  // closed by the destructor when >= 0
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#else  // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
+// systems that support death tests. This allows one to write such a macro
+// on a system that does not support death tests and be sure that it will
+// compile on a death-test supporting system.
+//
+// Parameters:
+//   statement -  A statement that a macro such as EXPECT_DEATH would test
+//                for program termination. This macro has to make sure this
+//                statement is compiled but not executed, to ensure that
+//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+//                parameter iff EXPECT_DEATH compiles with it.
+//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
+//                the output of statement.  This parameter has to be
+//                compiled but not evaluated by this macro, to ensure that
+//                this macro only accepts expressions that a macro such as
+//                EXPECT_DEATH would accept.
+//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+//                compile inside functions where ASSERT_DEATH doesn't
+//                compile.
+//
+//  The branch that has an always false condition is used to ensure that
+//  statement and regex are compiled (and thus syntactically correct) but
+//  never executed. The unreachable code macro protects the terminator
+//  statement from generating an 'unreachable code' warning in case
+//  statement unconditionally returns or throws. The Message constructor at
+//  the end allows the syntax of streaming additional messages into the
+//  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
+// The AlwaysFalse() branch below is never taken at run time; it exists
+// only so that statement, regex and terminator are compiled (see the
+// comment above for the role of each parameter).
+# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::AlwaysTrue()) { \
+      GTEST_LOG_(WARNING) \
+          << "Death tests are not supported on this platform.\n" \
+          << "Statement '" #statement "' cannot be verified."; \
+    } else if (::testing::internal::AlwaysFalse()) { \
+      ::testing::internal::RE::PartialMatch(".*", (regex)); \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+      terminator; \
+    } else \
+      ::testing::Message()
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+namespace testing {
+
+// This flag controls the style of death tests.  Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+}  // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+//   1. It generates a warning if there is more than one active
+//   thread.  This is because it's safe to fork() or clone() only
+//   when there is a single thread.
+//
+//   2. The parent process clone()s a sub-process and runs the death
+//   test in it; the sub-process exits with code 0 at the end of the
+//   death test, if it hasn't exited already.
+//
+//   3. The parent process waits for the sub-process to terminate.
+//
+//   4. The parent process checks the exit code and error message of
+//   the sub-process.
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//                  << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows), we only support a simple regex
+//   syntax implemented as part of Google Test.  This limited
+//   implementation should be enough most of the time when writing
+//   death tests; though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax.  For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support.  We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from.  In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure.  In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process.  For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process.  This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+//
+// TODO(wan at google.com): make thread-safe death tests search the PATH.
+
+// Asserts that a given statement causes the program to exit, with an
+// integer exit status that satisfies predicate, and emitting error output
+// that matches regex.
+// All four macros accept a streamed message, e.g.
+//   EXPECT_DEATH(stmt, re) << "context";
+# define ASSERT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+
+// Like ASSERT_EXIT, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given statement causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches regex.
+# define ASSERT_DEATH(statement, regex) \
+    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Like ASSERT_DEATH, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_DEATH(statement, regex) \
+    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit code describes a normal exit with a given exit code.
+class GTEST_API_ ExitedWithCode {
+ public:
+  explicit ExitedWithCode(int exit_code);
+  // Returns true iff exit_status describes a normal exit with exit_code_.
+  bool operator()(int exit_status) const;
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ExitedWithCode& other);
+
+  const int exit_code_;
+};
+
+# if !GTEST_OS_WINDOWS
+// Tests that an exit code describes an exit due to termination by a
+// given signal.
+class GTEST_API_ KilledBySignal {
+ public:
+  explicit KilledBySignal(int signum);
+  // Returns true iff exit_status describes termination by signal signum_.
+  bool operator()(int exit_status) const;
+ private:
+  const int signum_;
+};
+# endif  // !GTEST_OS_WINDOWS
+
+// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the sideeffects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+//   if (sideeffect) {
+//     *sideeffect = 12;
+//   }
+//   LOG(DFATAL) << "death";
+//   return 12;
+// }
+//
+// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
+//   int sideeffect = 0;
+//   // Only asserts in dbg.
+//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+//   // opt-mode has sideeffect visible.
+//   EXPECT_EQ(12, sideeffect);
+// #else
+//   // dbg-mode no visible sideeffect.
+//   EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugOr12() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects.  A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+//   // Side-effects here will have an effect after this statement in
+//   // opt mode, but none in debug mode.
+//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+# ifdef NDEBUG
+
+// In opt (NDEBUG) mode the statement still executes, for its side
+// effects, but no death is expected; the regex is compiled yet ignored.
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+# else
+
+// In dbg mode these are real death tests.
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+
+# endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // GTEST_HAS_DEATH_TEST
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported; otherwise they just issue a warning.  This is
+// useful when you are combining death test assertions with normal test
+// assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    EXPECT_DEATH(statement, regex)
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    ASSERT_DEATH(statement, regex)
+#else
+// The empty vs. 'return' third argument is the 'terminator' described
+// with GTEST_UNSUPPORTED_DEATH_TEST_: the ASSERT_ form must be able to
+// return from the enclosing function, the EXPECT_ form must not.
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
+#endif
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-test.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vladl at google.com (Vlad Losev)
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing Framework (Google Test)
+//
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where T is the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+  // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever way you prefer to think of it.
+
+TEST_P(FooTest, DoesBlah) {
+  // Inside a test, access the test parameter with the GetParam() method
+  // of the TestWithParam<T> class:
+  EXPECT_TRUE(foo.Blah(GetParam()));
+  ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+  ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a  summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+                        FooTest,
+                        Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern, (yes, you
+// can instantiate it more than once) the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
+//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
+//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and on the other hand,
+// give the user a chance to inspect the generated tests with Google Test
+// reflection API before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+  // You can inherit all the usual members for a non-parameterized test
+  // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+  // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+  // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+  // GetParam works just the same here as if you inherit from TestWithParam.
+  EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif  // 0
+
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl at google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2003 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Dan Egnor (egnor at google.com)
+//
+// A "smart" pointer type with reference tracking.  Every pointer to a
+// particular object is kept on a circular linked list.  When the last pointer
+// to an object is destroyed or reassigned, the object is deleted.
+//
+// Used properly, this deletes the object when the last reference goes away.
+// There are several caveats:
+// - Like all reference counting schemes, cycles lead to leaks.
+// - Each smart pointer is actually two pointers (8 bytes instead of 4).
+// - Every time a pointer is assigned, the entire list of pointers to that
+//   object is traversed.  This class is therefore NOT SUITABLE when there
+//   will often be more than two or three pointers to a particular object.
+// - References are only tracked as long as linked_ptr<> objects are copied.
+//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
+//   will happen (double deletion).
+//
+// A good use of this class is storing object references in STL containers.
+// You can safely put linked_ptr<> in a vector<>.
+// Other uses may not be as good.
+//
+// Note: If you use an incomplete type with linked_ptr<>, the class
+// *containing* linked_ptr<> must have a constructor and destructor (even
+// if they do nothing!).
+//
+// Bill Gibbons suggested we use something like this.
+//
+// Thread Safety:
+//   Unlike other linked_ptr implementations, in this implementation
+//   a linked_ptr object is thread-safe in the sense that:
+//     - it's safe to copy linked_ptr objects concurrently,
+//     - it's safe to copy *from* a linked_ptr and read its underlying
+//       raw pointer (e.g. via get()) concurrently, and
+//     - it's safe to write to two linked_ptrs that point to the same
+//       shared object concurrently.
+// TODO(wan at google.com): rename this to safe_linked_ptr to avoid
+// confusion with normal linked_ptr.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+
+#include <stdlib.h>
+#include <assert.h>
+
+
+namespace testing {
+namespace internal {
+
+// Protects copying of all linked_ptr objects.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// This is used internally by all instances of linked_ptr<>.  It needs to be
+// a non-template class because different types of linked_ptr<> can refer to
+// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
+// So, it needs to be possible for different types of linked_ptr to participate
+// in the same circular linked list, so we need a single class type here.
+//
+// DO NOT USE THIS CLASS DIRECTLY YOURSELF.  Use linked_ptr<T>.
+class linked_ptr_internal {
+ public:
+  // Create a new circle that includes only this instance.
+  void join_new() {
+    next_ = this;
+  }
+
+  // Many linked_ptr operations may change p.link_ for some linked_ptr
+  // variable p in the same circle as this object.  Therefore we need
+  // to prevent two such operations from occurring concurrently.
+  //
+  // Note that different types of linked_ptr objects can coexist in a
+  // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
+  // linked_ptr<Derived2>).  Therefore we must use a single mutex to
+  // protect all linked_ptr objects.  This can create serious
+  // contention in production code, but is acceptable in a testing
+  // framework.
+
+  // Join an existing circle.
+  void join(linked_ptr_internal const* ptr)
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    // O(n) walk of the ring to find ptr's predecessor, then splice
+    // this object in between that predecessor and ptr.
+    linked_ptr_internal const* p = ptr;
+    while (p->next_ != ptr) {
+      assert(p->next_ != this &&
+             "Trying to join() a linked ring we are already in. "
+             "Is GMock thread safety enabled?");
+      p = p->next_;
+    }
+    p->next_ = this;
+    next_ = ptr;
+  }
+
+  // Leave whatever circle we're part of.  Returns true if we were the
+  // last member of the circle.  Once this is done, you can join() another.
+  bool depart()
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    // Sole member: the ring dies with this object.
+    if (next_ == this) return true;
+    // Otherwise unlink by pointing our predecessor at our successor.
+    linked_ptr_internal const* p = next_;
+    while (p->next_ != this) {
+      assert(p->next_ != next_ &&
+             "Trying to depart() a linked ring we are not in. "
+             "Is GMock thread safety enabled?");
+      p = p->next_;
+    }
+    p->next_ = next_;
+    return false;
+  }
+
+ private:
+  // mutable so join()/depart() can relink through const pointers.
+  mutable linked_ptr_internal const* next_;
+};
+
+template <typename T>
+class linked_ptr {
+ public:
+  typedef T element_type;
+
+  // Take over ownership of a raw pointer.  This should happen as soon as
+  // possible after the object is created.
+  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
+  ~linked_ptr() { depart(); }
+
+  // Copy an existing linked_ptr<>, adding ourselves to the list of references.
+  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
+  linked_ptr(linked_ptr const& ptr) {  // NOLINT
+    // Copy-constructing from ourselves is a caller bug.
+    assert(&ptr != this);
+    copy(&ptr);
+  }
+
+  // Assignment releases the old value and acquires the new.
+  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  linked_ptr& operator=(linked_ptr const& ptr) {
+    // Self-assignment would depart() our own ring before copying, so guard it.
+    if (&ptr != this) {
+      depart();
+      copy(&ptr);
+    }
+    return *this;
+  }
+
+  // Smart pointer members.
+  void reset(T* ptr = NULL) {
+    depart();
+    capture(ptr);
+  }
+  T* get() const { return value_; }
+  T* operator->() const { return value_; }
+  T& operator*() const { return *value_; }
+
+  bool operator==(T* p) const { return value_ == p; }
+  bool operator!=(T* p) const { return value_ != p; }
+  template <typename U>
+  bool operator==(linked_ptr<U> const& ptr) const {
+    return value_ == ptr.get();
+  }
+  template <typename U>
+  bool operator!=(linked_ptr<U> const& ptr) const {
+    return value_ != ptr.get();
+  }
+
+ private:
+  template <typename U>
+  friend class linked_ptr;
+
+  T* value_;
+  linked_ptr_internal link_;
+
+  // Deletes the pointee iff this was the last reference in the ring.
+  void depart() {
+    if (link_.depart()) delete value_;
+  }
+
+  void capture(T* ptr) {
+    value_ = ptr;
+    link_.join_new();
+  }
+
+  // NULL pointers are not shared: each NULL linked_ptr gets its own ring.
+  template <typename U> void copy(linked_ptr<U> const* ptr) {
+    value_ = ptr->get();
+    if (value_)
+      link_.join(&ptr->link_);
+    else
+      link_.join_new();
+  }
+};
+
+template<typename T> inline
+bool operator==(T* ptr, const linked_ptr<T>& x) {
+  return ptr == x.get();
+}
+
+template<typename T> inline
+bool operator!=(T* ptr, const linked_ptr<T>& x) {
+  return ptr != x.get();
+}
+
+// A function to convert T* into linked_ptr<T>
+// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
+// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
+template <typename T>
+linked_ptr<T> make_linked_ptr(T* ptr) {
+  return linked_ptr<T>(ptr);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T.  More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+//   1. foo::PrintTo(const T&, ostream*)
+//   2. operator<<(ostream&, const T&) defined in either foo or the
+//      global namespace.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+//   // Prints a value to a string.  For a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   std::string ::testing::PrintToString(const T& value);
+//
+//   // Prints a value tersely: for a reference type, the referenced
+//   // value (but not the address) is printed; for a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+//   // Prints value using the type inferred by the compiler.  The difference
+//   // from UniversalTersePrint() is that this function prints both the
+//   // pointer and the NUL-terminated string for a (const or not) char pointer.
+//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+//   // Prints the fields of a tuple tersely to a string vector, one
+//   // element for each field. Tuple support must be enabled in
+//   // gtest-port.h.
+//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+//       const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container.  When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect.  In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator.  We'll fix this if there's an
+// actual need for it.  Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#if GTEST_HAS_STD_TUPLE_
+# include <tuple>
+#endif
+
+namespace testing {
+
+// Definitions in the 'internal' and 'internal2' name spaces are
+// subject to change without notice.  DO NOT USE THEM IN USER CODE!
+namespace internal2 {
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+
+// For selecting which printer to use when a given type has neither <<
+// nor PrintTo().
+enum TypeKind {
+  kProtobuf,              // a protobuf type
+  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
+                          // (e.g. a named or unnamed enum type)
+  kOtherType              // anything else
+};
+
+// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
+// by the universal printer to print a value of type T when neither
+// operator<< nor PrintTo() is defined for T, where kTypeKind is the
+// "kind" of T as defined by enum TypeKind.
+template <typename T, TypeKind kTypeKind>
+class TypeWithoutFormatter {
+ public:
+  // This default version is called when kTypeKind is kOtherType.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(reinterpret_cast<const unsigned char*>(&value),
+                         sizeof(value), os);
+  }
+};
+
+// We print a protobuf using its ShortDebugString() when the string
+// doesn't exceed this many characters; otherwise we print it using
+// DebugString() for better readability.
+const size_t kProtobufOneLinerMaxLength = 50;
+
+template <typename T>
+class TypeWithoutFormatter<T, kProtobuf> {
+ public:
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const ::testing::internal::string short_str = value.ShortDebugString();
+    const ::testing::internal::string pretty_str =
+        short_str.length() <= kProtobufOneLinerMaxLength ?
+        short_str : ("\n" + value.DebugString());
+    *os << ("<" + pretty_str + ">");
+  }
+};
+
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToInteger> {
+ public:
+  // Since T has no << operator or PrintTo() but can be implicitly
+  // converted to BiggestInt, we print it as a BiggestInt.
+  //
+  // Most likely T is an enum type (either named or unnamed), in which
+  // case printing it as an integer is the desired behavior.  In case
+  // T is not an enum, printing it as an integer is the best we can do
+  // given that it has no user-defined printer.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const internal::BiggestInt kBigInt = value;
+    *os << kBigInt;
+  }
+};
+
+// Prints the given value to the given ostream.  If the value is a
+// protocol message, its debug string is printed; if it's an enum or
+// of a type implicitly convertible to BiggestInt, it's printed as an
+// integer; otherwise the bytes in the value are printed.  This is
+// what UniversalPrinter<T>::Print() does when it knows nothing about
+// type T and T has neither << operator nor PrintTo().
+//
+// A user can override this behavior for a class type Foo by defining
+// a << operator in the namespace where Foo is defined.
+//
+// We put this operator in namespace 'internal2' instead of 'internal'
+// to simplify the implementation, as much code in 'internal' needs to
+// use << in STL, which would conflict with our own << were it defined
+// in 'internal'.
+//
+// Note that this operator<< takes a generic std::basic_ostream<Char,
+// CharTraits> type instead of the more restricted std::ostream.  If
+// we define it to take an std::ostream instead, we'll get an
+// "ambiguous overloads" compiler error when trying to print a type
+// Foo that supports streaming to std::basic_ostream<Char,
+// CharTraits>, as the compiler cannot tell whether
+// operator<<(std::ostream&, const T&) or
+// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
+// specific.
+template <typename Char, typename CharTraits, typename T>
+::std::basic_ostream<Char, CharTraits>& operator<<(
+    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
+  TypeWithoutFormatter<T,
+      (internal::IsAProtocolMessage<T>::value ? kProtobuf :
+       internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ?
+       kConvertibleToInteger : kOtherType)>::PrintValue(x, &os);
+  return os;
+}
+
+}  // namespace internal2
+}  // namespace testing
+
+// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
+// magic needed for implementing UniversalPrinter won't work.
+namespace testing_internal {
+
+// Used to print a value that is not an STL-style container when the
+// user doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+  // With the following statement, during unqualified name lookup,
+  // testing::internal2::operator<< appears as if it was declared in
+  // the nearest enclosing namespace that contains both
+  // ::testing_internal and ::testing::internal2, i.e. the global
+  // namespace.  For more details, refer to the C++ Standard section
+  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
+  // testing::internal2::operator<< in case T doesn't come with a <<
+  // operator.
+  //
+  // We cannot write 'using ::testing::internal2::operator<<;', which
+  // gcc 3.3 fails to compile due to a compiler bug.
+  using namespace ::testing::internal2;  // NOLINT
+
+  // Assuming T is defined in namespace foo, in the next statement,
+  // the compiler will consider all of:
+  //
+  //   1. foo::operator<< (thanks to Koenig look-up),
+  //   2. ::operator<< (as the current namespace is enclosed in ::),
+  //   3. testing::internal2::operator<< (thanks to the using statement above).
+  //
+  // The operator<< whose type matches T best will be picked.
+  //
+  // We deliberately allow #2 to be a candidate, as sometimes it's
+  // impossible to define #1 (e.g. when foo is ::std, defining
+  // anything in it is undefined behavior unless you are a compiler
+  // vendor.).
+  *os << value;
+}
+
+}  // namespace testing_internal
+
+namespace testing {
+namespace internal {
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream.  The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+template <typename C>
+void DefaultPrintTo(IsContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const C& container, ::std::ostream* os) {
+  const size_t kMaxCount = 32;  // The maximum number of elements to print.
+  *os << '{';
+  size_t count = 0;
+  for (typename C::const_iterator it = container.begin();
+       it != container.end(); ++it, ++count) {
+    if (count > 0) {
+      *os << ',';
+      if (count == kMaxCount) {  // Enough has been printed.
+        *os << " ...";
+        break;
+      }
+    }
+    *os << ' ';
+    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
+    // handle *it being a native array.
+    internal::UniversalPrint(*it, os);
+  }
+
+  if (count > 0) {
+    *os << ' ';
+  }
+  *os << '}';
+}
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    true_type /* is a pointer */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+  } else {
+    // C++ doesn't allow casting from a function pointer to any object
+    // pointer.
+    //
+    // IsTrue() silences warnings: "Condition is always true",
+    // "unreachable code".
+    if (IsTrue(ImplicitlyConvertible<T*, const void*>::value)) {
+      // T is not a function type.  We just call << to print p,
+      // relying on ADL to pick up user-defined << for their pointer
+      // types, if any.
+      *os << p;
+    } else {
+      // T is a function type, so '*os << p' doesn't do what we want
+      // (it just prints p as bool).  We want to print p as a const
+      // void*.  However, we cannot cast it to const void* directly,
+      // even using reinterpret_cast, as earlier versions of gcc
+      // (e.g. 3.4.5) cannot compile the cast when p is a function
+      // pointer.  Casting to UInt64 first solves the problem.
+      *os << reinterpret_cast<const void*>(
+          reinterpret_cast<internal::UInt64>(p));
+    }
+  }
+}
+
+// Used to print a non-container, non-pointer value when the user
+// doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const T& value, ::std::ostream* os) {
+  ::testing_internal::DefaultPrintNonContainerTo(value, os);
+}
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it.  This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined.  We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+  // DefaultPrintTo() is overloaded.  The type of its first two
+  // arguments determine which version will be picked.  If T is an
+  // STL-style container, the version for container will be called; if
+  // T is a pointer, the pointer version will be called; otherwise the
+  // generic version will be called.
+  //
+  // Note that we check for container types here, prior to we check
+  // for protocol message types in our operator<<.  The rationale is:
+  //
+  // For protocol messages, we want to give people a chance to
+  // override Google Mock's format by defining a PrintTo() or
+  // operator<<.  For STL containers, other formats can be
+  // incompatible with Google Mock's format for the container
+  // elements; therefore we check for container types here to ensure
+  // that our format is used.
+  //
+  // The second argument of DefaultPrintTo() is needed to bypass a bug
+  // in Symbian's C++ compiler that prevents it from picking the right
+  // overload between:
+  //
+  //   PrintTo(const T& x, ...);
+  //   PrintTo(T* x, ...);
+  DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+  // When printing a plain char, we always treat it as unsigned.  This
+  // way, the output won't be affected by whether the compiler thinks
+  // char is signed or not.
+  PrintTo(static_cast<unsigned char>(c), os);
+}
+
+// Overloads for other simple built-in types.
+inline void PrintTo(bool x, ::std::ostream* os) {
+  *os << (x ? "true" : "false");
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char*>(s), os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type.  When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+}
+#endif
+
+// Overload for C arrays.  Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  UniversalPrint(a[0], os);
+  for (size_t i = 1; i != count; i++) {
+    *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
+
+// Overloads for ::string and ::std::string.
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
+inline void PrintTo(const ::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+
+// Overloads for ::wstring and ::std::wstring.
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os);
+#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE
+// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
+// which are packed as tuples.
+
+// Overloaded PrintTo() for tuples of various arities.  We support
+// tuples of up-to 10 fields.  The following implementation works
+// regardless of whether tr1::tuple is implemented using the
+// non-standard variadic template feature or not.
+
+inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1>
+void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2>
+void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9, typename T10>
+void PrintTo(
+    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
+    ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_STD_TUPLE_
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+  *os << '(';
+  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+  // a reference type.  The same for printing value.second.
+  UniversalPrinter<T1>::Print(value.first, os);
+  *os << ", ";
+  UniversalPrinter<T2>::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T& value, ::std::ostream* os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  if (len == 0) {
+    *os << "{}";
+  } else {
+    *os << "{ ";
+    const size_t kThreshold = 18;
+    const size_t kChunkSize = 8;
+    // If the array has more than kThreshold elements, we'll have to
+    // omit some details by printing only the first and the last
+    // kChunkSize elements.
+    // TODO(wan at google.com): let the user control the threshold using a flag.
+    if (len <= kThreshold) {
+      PrintRawArrayTo(begin, len, os);
+    } else {
+      PrintRawArrayTo(begin, kChunkSize, os);
+      *os << ", ..., ";
+      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+    }
+    *os << " }";
+  }
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const char* begin, size_t len, ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const wchar_t* begin, size_t len, ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream* os) {
+    UniversalPrintArray(a, N, os);
+  }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  static void Print(const T& value, ::std::ostream* os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream* os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char*> {
+ public:
+  static void Print(char* str, ::std::ostream* os) {
+    UniversalTersePrinter<const char*>::Print(str, os);
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::wstring(str), os);
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workarond for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+typedef ::std::vector<string> Strings;
+
+// TuplePolicy<TupleT> must provide:
+// - tuple_size
+//     size of tuple TupleT.
+// - get<size_t I>(const TupleT& t)
+//     static function extracting element I of tuple TupleT.
+// - tuple_element<size_t I>::type
+//     type of element I of tuple TupleT.
+template <typename TupleT>
+struct TuplePolicy;
+
+#if GTEST_HAS_TR1_TUPLE
+template <typename TupleT>
+struct TuplePolicy {
+  typedef TupleT Tuple;
+  static const size_t tuple_size = ::std::tr1::tuple_size<Tuple>::value;
+
+  template <size_t I>
+  struct tuple_element : ::std::tr1::tuple_element<I, Tuple> {};
+
+  template <size_t I>
+  static typename AddReference<
+      const typename ::std::tr1::tuple_element<I, Tuple>::type>::type get(
+      const Tuple& tuple) {
+    return ::std::tr1::get<I>(tuple);
+  }
+};
+template <typename TupleT>
+const size_t TuplePolicy<TupleT>::tuple_size;
+#endif  // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+struct TuplePolicy< ::std::tuple<Types...> > {
+  typedef ::std::tuple<Types...> Tuple;
+  static const size_t tuple_size = ::std::tuple_size<Tuple>::value;
+
+  template <size_t I>
+  struct tuple_element : ::std::tuple_element<I, Tuple> {};
+
+  template <size_t I>
+  static const typename ::std::tuple_element<I, Tuple>::type& get(
+      const Tuple& tuple) {
+    return ::std::get<I>(tuple);
+  }
+};
+template <typename... Types>
+const size_t TuplePolicy< ::std::tuple<Types...> >::tuple_size;
+#endif  // GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// This helper template allows PrintTo() for tuples and
+// UniversalTersePrintTupleFieldsToStrings() to be defined by
+// induction on the number of tuple fields.  The idea is that
+// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
+// fields in tuple t, and can be defined in terms of
+// TuplePrefixPrinter<N - 1>.
+//
+// The inductive case.
+template <size_t N>
+struct TuplePrefixPrinter {
+  // Prints the first N fields of a tuple.
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
+    GTEST_INTENTIONAL_CONST_COND_PUSH_()
+    if (N > 1) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+      *os << ", ";
+    }
+    UniversalPrinter<
+        typename TuplePolicy<Tuple>::template tuple_element<N - 1>::type>
+        ::Print(TuplePolicy<Tuple>::template get<N - 1>(t), os);
+  }
+
+  // Tersely prints the first N fields of a tuple to a string vector,
+  // one element for each field.
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
+    ::std::stringstream ss;
+    UniversalTersePrint(TuplePolicy<Tuple>::template get<N - 1>(t), &ss);
+    strings->push_back(ss.str());
+  }
+};
+
+// Base case.
+template <>
+struct TuplePrefixPrinter<0> {
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
+
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
+};
+
+// Helper function for printing a tuple.
+// Tuple must be either std::tr1::tuple or std::tuple type.
+template <typename Tuple>
+void PrintTupleTo(const Tuple& t, ::std::ostream* os) {
+  *os << "(";
+  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::PrintPrefixTo(t, os);
+  *os << ")";
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  Strings result;
+  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::
+      TersePrintPrefixToStrings(value, &result);
+  return result;
+}
+#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+}  // namespace internal
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream ss;
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Outputs a message explaining invalid registration of different
+// fixture class for the same test case. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
+                                          const char* file, int line);
+
+template <typename> class ParamGeneratorInterface;
+template <typename> class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+  virtual ~ParamIteratorInterface() {}
+  // A pointer to the base generator instance.
+  // Used only for the purposes of iterator comparison
+  // to make sure that two iterators belong to the same generator.
+  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+  // Advances iterator to point to the next element
+  // provided by the generator. The caller is responsible
+  // for not calling Advance() on an iterator equal to
+  // BaseGenerator()->End().
+  virtual void Advance() = 0;
+  // Clones the iterator object. Used for implementing copy semantics
+  // of ParamIterator<T>.
+  virtual ParamIteratorInterface* Clone() const = 0;
+  // Dereferences the current iterator and provides (read-only) access
+  // to the pointed value. It is the caller's responsibility not to call
+  // Current() on an iterator equal to BaseGenerator()->End().
+  // Used for implementing ParamGenerator<T>::operator*().
+  virtual const T* Current() const = 0;
+  // Determines whether the given iterator and other point to the same
+  // element in the sequence generated by the generator.
+  // Used for implementing ParamGenerator<T>::operator==().
+  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+  typedef T value_type;
+  typedef const T& reference;
+  typedef ptrdiff_t difference_type;
+
+  // ParamIterator assumes ownership of the impl_ pointer.
+  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+  ParamIterator& operator=(const ParamIterator& other) {
+    if (this != &other)
+      impl_.reset(other.impl_->Clone());
+    return *this;
+  }
+
+  const T& operator*() const { return *impl_->Current(); }
+  const T* operator->() const { return impl_->Current(); }
+  // Prefix version of operator++.
+  ParamIterator& operator++() {
+    impl_->Advance();
+    return *this;
+  }
+  // Postfix version of operator++.
+  ParamIterator operator++(int /*unused*/) {
+    ParamIteratorInterface<T>* clone = impl_->Clone();
+    impl_->Advance();
+    return ParamIterator(clone);
+  }
+  bool operator==(const ParamIterator& other) const {
+    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+  }
+  bool operator!=(const ParamIterator& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  friend class ParamGenerator<T>;
+  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+  scoped_ptr<ParamIteratorInterface<T> > impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+  typedef T ParamType;
+
+  virtual ~ParamGeneratorInterface() {}
+
+  // Generator interface definition
+  virtual ParamIteratorInterface<T>* Begin() const = 0;
+  virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Wraps ParamGeneratorInterface<T> and provides general generator syntax
+// compatible with the STL Container concept.
+// This class implements copy initialization semantics and the contained
+// ParamGeneratorInterface<T> instance is shared among all copies
+// of the original object. This is possible because that instance is immutable.
+template<typename T>
+class ParamGenerator {
+ public:
+  typedef ParamIterator<T> iterator;
+
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+
+  ParamGenerator& operator=(const ParamGenerator& other) {
+    impl_ = other.impl_;
+    return *this;
+  }
+
+  iterator begin() const { return iterator(impl_->Begin()); }
+  iterator end() const { return iterator(impl_->End()); }
+
+ private:
+  linked_ptr<const ParamGeneratorInterface<T> > impl_;
+};
+
+// Generates values from a range of two comparable values. Can be used to
+// generate sequences of user-defined types that implement operator+() and
+// operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  RangeGenerator(T begin, T end, IncrementT step)
+      : begin_(begin), end_(end),
+        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+  virtual ~RangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, begin_, 0, step_);
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, end_, end_index_, step_);
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+             IncrementT step)
+        : base_(base), value_(value), index_(index), step_(step) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      value_ = value_ + step_;
+      index_++;
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const T* Current() const { return &value_; }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const int other_index =
+          CheckedDowncastToActualType<const Iterator>(&other)->index_;
+      return index_ == other_index;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_), value_(other.value_), index_(other.index_),
+          step_(other.step_) {}
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<T>* const base_;
+    T value_;
+    int index_;
+    const IncrementT step_;
+  };  // class RangeGenerator::Iterator
+
+  static int CalculateEndIndex(const T& begin,
+                               const T& end,
+                               const IncrementT& step) {
+    int end_index = 0;
+    for (T i = begin; i < end; i = i + step)
+      end_index++;
+    return end_index;
+  }
+
+  // No implementation - assignment is unsupported.
+  void operator=(const RangeGenerator& other);
+
+  const T begin_;
+  const T end_;
+  const IncrementT step_;
+  // The index for the end() iterator. All the elements in the generated
+  // sequence are indexed (0-based) to aid iterator comparison.
+  const int end_index_;
+};  // class RangeGenerator
+
+
+// Generates values from a pair of STL-style iterators. Used in the
+// ValuesIn() function. The elements are copied from the source range
+// since the source can be located on the stack, and the generator
+// is likely to persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  virtual ~ValuesInIteratorRangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, container_.begin());
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      ++iterator_;
+      value_.reset();
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    // We need to use cached value referenced by iterator_ because *iterator_
+    // can return a temporary object (and of type other then T), so just
+    // having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    virtual const T* Current() const {
+      if (value_.get() == NULL)
+        value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+          // The explicit constructor call suppresses a false warning
+          // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T>* const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of scoped_ptr helps manage cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable scoped_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator& other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter) :
+      parameter_(parameter) {}
+  virtual Test* CreateTest() {
+    TestClass::SetParam(&parameter_);
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
+// ownership of test factory pointer, same factory object cannot be passed
+// into that method twice. But ParameterizedTestCaseInfo is going to call
+// it for each Test/Parameter value combination. Thus it needs meta factory
+// creator class.
+template <class TestCase>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestCase::ParamType> {
+ public:
+  typedef typename TestCase::ParamType ParamType;
+
+  TestMetaFactory() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) {
+    return new ParameterizedTestFactory<TestCase>(parameter);
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfoBase is a generic interface
+// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
+// and uses that information to register all resulting test instances
+// in RegisterTests method. The ParameterizeTestCaseRegistry class holds
+// a collection of pointers to the ParameterizedTestCaseInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestCaseInfoBase {
+ public:
+  virtual ~ParameterizedTestCaseInfoBase() {}
+
+  // Base part of test case name for display purposes.
+  virtual const string& GetTestCaseName() const = 0;
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test case right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more then once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestCaseInfoBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test case and generators
+// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
+// test case. It registers tests with all values generated by all
+// generators when asked.
+template <class TestCase>
+class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestCaseInstantiation().
+  typedef typename TestCase::ParamType ParamType;
+  // A function that returns an instance of appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+
+  explicit ParameterizedTestCaseInfo(const char* name)
+      : test_case_name_(name) {}
+
+  // Test case base name for display purposes.
+  virtual const string& GetTestCaseName() const { return test_case_name_; }
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a LocalTestInfo structure.
+  // test_case_name is the base name of the test case (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
+  // test case base name and DoBar is test base name.
+  void AddTestPattern(const char* test_case_name,
+                      const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory) {
+    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
+                                                       test_base_name,
+                                                       meta_factory)));
+  }
+  // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information
+  // about a generator.
+  int AddTestCaseInstantiation(const string& instantiation_name,
+                               GeneratorCreationFunc* func,
+                               const char* /* file */,
+                               int /* line */) {
+    instantiations_.push_back(::std::make_pair(instantiation_name, func));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register tests in this test case
+  // test cases right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more then once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more then once.
+  virtual void RegisterTests() {
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      linked_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin(); gen_it != instantiations_.end();
+               ++gen_it) {
+        const string& instantiation_name = gen_it->first;
+        ParamGenerator<ParamType> generator((*gen_it->second)());
+
+        string test_case_name;
+        if ( !instantiation_name.empty() )
+          test_case_name = instantiation_name + "/";
+        test_case_name += test_info->test_case_base_name;
+
+        int i = 0;
+        for (typename ParamGenerator<ParamType>::iterator param_it =
+                 generator.begin();
+             param_it != generator.end(); ++param_it, ++i) {
+          Message test_name_stream;
+          test_name_stream << test_info->test_base_name << "/" << i;
+          MakeAndRegisterTestInfo(
+              test_case_name.c_str(),
+              test_name_stream.GetString().c_str(),
+              NULL,  // No type parameter.
+              PrintToString(*param_it).c_str(),
+              GetTestCaseTypeId(),
+              TestCase::SetUpTestCase,
+              TestCase::TearDownTestCase,
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }  // for gen_it
+    }  // for test_it
+  }  // RegisterTests
+
+ private:
+  // LocalTestInfo structure keeps information about a single test registered
+  // with TEST_P macro.
+  struct TestInfo {
+    TestInfo(const char* a_test_case_base_name,
+             const char* a_test_base_name,
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
+        test_case_base_name(a_test_case_base_name),
+        test_base_name(a_test_base_name),
+        test_meta_factory(a_test_meta_factory) {}
+
+    const string test_case_base_name;
+    const string test_base_name;
+    const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+  };
+  typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
+  // Keeps pairs of <Instantiation name, Sequence generator creation function>
+  // received from INSTANTIATE_TEST_CASE_P macros.
+  typedef ::std::vector<std::pair<string, GeneratorCreationFunc*> >
+      InstantiationContainer;
+
+  const string test_case_name_;
+  TestInfoContainer tests_;
+  InstantiationContainer instantiations_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
+};  // class ParameterizedTestCaseInfo
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase
+// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P
+// macros use it to locate their corresponding ParameterizedTestCaseInfo
+// descriptors.
+class ParameterizedTestCaseRegistry {
+ public:
+  ParameterizedTestCaseRegistry() {}
+  ~ParameterizedTestCaseRegistry() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      delete *it;
+    }
+  }
+
+  // Looks up or creates and returns a structure containing information about
+  // tests and instantiations of a particular test case.
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name,
+      const char* file,
+      int line) {
+    ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL;
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      if ((*it)->GetTestCaseName() == test_case_name) {
+        if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
+          // Complain about incorrect usage of Google Test facilities
+          // and terminate the program since we cannot guaranty correct
+          // test case setup and tear-down in this case.
+          ReportInvalidTestCaseType(test_case_name,  file, line);
+          posix::Abort();
+        } else {
+          // At this point we are sure that the object we found is of the same
+          // type we are looking for, so we downcast it to that type
+          // without further checks.
+          typed_test_info = CheckedDowncastToActualType<
+              ParameterizedTestCaseInfo<TestCase> >(*it);
+        }
+        break;
+      }
+    }
+    if (typed_test_info == NULL) {
+      typed_test_info = new ParameterizedTestCaseInfo<TestCase>(test_case_name);
+      test_case_infos_.push_back(typed_test_info);
+    }
+    return typed_test_info;
+  }
+  void RegisterTests() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      (*it)->RegisterTests();
+    }
+  }
+
+ private:
+  typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
+
+  TestCaseInfoContainer test_case_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-util-generated.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl at google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently Google Test supports at most 50 arguments in Values,
+// and at most 10 arguments in Combine. Please contact
+// googletestframework at googlegroups.com if you need more.
+// Please note that the number of arguments to Combine is limited
+// by the maximum arity of the implementation of tuple which is
+// currently set at 10.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end);
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container);
+
+namespace internal {
+
+// Used in the Values() function to provide polymorphic capabilities.
+template <typename T1>
+class ValueArray1 {
+ public:
+  explicit ValueArray1(T1 v1) : v1_(v1) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const { return ValuesIn(&v1_, &v1_ + 1); }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray1& other);
+
+  const T1 v1_;
+};
+
+template <typename T1, typename T2>
+class ValueArray2 {
+ public:
+  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray2& other);
+
+  const T1 v1_;
+  const T2 v2_;
+};
+
+template <typename T1, typename T2, typename T3>
+class ValueArray3 {
+ public:
+  ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray3& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class ValueArray4 {
+ public:
+  ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray4& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class ValueArray5 {
+ public:
+  ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray5& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class ValueArray6 {
+ public:
+  ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray6& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class ValueArray7 {
+ public:
+  ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray7& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class ValueArray8 {
+ public:
+  ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+      T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray8& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class ValueArray9 {
+ public:
+  ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray9& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class ValueArray10 {
+ public:
+  ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray10& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+class ValueArray11 {
+ public:
+  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray11& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+class ValueArray12 {
+ public:
+  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray12& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+class ValueArray13 {
+ public:
+  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray13& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+class ValueArray14 {
+ public:
+  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray14& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+class ValueArray15 {
+ public:
+  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray15& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+class ValueArray16 {
+ public:
+  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray16& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+class ValueArray17 {
+ public:
+  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray17& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+class ValueArray18 {
+ public:
+  ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray18& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+class ValueArray19 {
+ public:
+  ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray19& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+class ValueArray20 {
+ public:
+  ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray20& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+class ValueArray21 {
+ public:
+  ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray21& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+class ValueArray22 {
+ public:
+  ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray22& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+class ValueArray23 {
+ public:
+  ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray23& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+class ValueArray24 {
+ public:
+  ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray24& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+class ValueArray25 {
+ public:
+  ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray25& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+class ValueArray26 {
+ public:
+  ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray26& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+};
+
+// ValueArray27: pump-generated Google Test helper behind Values(v1..v27).
+// Stores 27 values, each of an independent type, and converts implicitly
+// to ParamGenerator<T> by static_cast-ing every stored value to T.
+// NOTE(review): machine-generated expansion -- prefer editing the .pump
+// generator source over hand-editing this file.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+class ValueArray27 {
+ public:
+  // Copies all 27 values into const members.
+  ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27) {}
+
+  // Materializes the stored values as a local T array and delegates to
+  // ValuesIn().  NOTE(review): assumes ValuesIn() copies the elements
+  // before the local array dies, per gtest's documented ValuesIn behavior.
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray27& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+};
+
+// ValueArray28: pump-generated gtest helper behind Values(v1..v28); stores
+// 28 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+class ValueArray28 {
+ public:
+  // Copies all 28 values into const members.
+  ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray28& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+};
+
+// ValueArray29: pump-generated gtest helper behind Values(v1..v29); stores
+// 29 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+class ValueArray29 {
+ public:
+  // Copies all 29 values into const members.
+  ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray29& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+};
+
+// ValueArray30: pump-generated gtest helper behind Values(v1..v30); stores
+// 30 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+class ValueArray30 {
+ public:
+  // Copies all 30 values into const members.
+  ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray30& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+};
+
+// ValueArray31: pump-generated gtest helper behind Values(v1..v31); stores
+// 31 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+class ValueArray31 {
+ public:
+  // Copies all 31 values into const members.
+  ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray31& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+};
+
+// ValueArray32: pump-generated gtest helper behind Values(v1..v32); stores
+// 32 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+class ValueArray32 {
+ public:
+  // Copies all 32 values into const members.
+  ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray32& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+};
+
+// ValueArray33: pump-generated gtest helper behind Values(v1..v33); stores
+// 33 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+class ValueArray33 {
+ public:
+  // Copies all 33 values into const members.
+  ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray33& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+};
+
+// ValueArray34: pump-generated gtest helper behind Values(v1..v34); stores
+// 34 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+class ValueArray34 {
+ public:
+  // Copies all 34 values into const members.
+  ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray34& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+};
+
+// ValueArray35: pump-generated gtest helper behind Values(v1..v35); stores
+// 35 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+class ValueArray35 {
+ public:
+  // Copies all 35 values into const members.
+  ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray35& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+};
+
+// ValueArray36: pump-generated gtest helper behind Values(v1..v36); stores
+// 36 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+class ValueArray36 {
+ public:
+  // Copies all 36 values into const members.
+  ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray36& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+};
+
+// ValueArray37: pump-generated gtest helper behind Values(v1..v37); stores
+// 37 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+class ValueArray37 {
+ public:
+  // Copies all 37 values into const members.
+  ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray37& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+};
+
+// ValueArray38: pump-generated gtest helper behind Values(v1..v38); stores
+// 38 independently-typed values and converts to ParamGenerator<T> by
+// static_cast-ing each one to T.  Machine-generated -- edit the generator.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+class ValueArray38 {
+ public:
+  // Copies all 38 values into const members.
+  ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
+
+  // Builds a T array from the stored values and delegates to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray38& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+class ValueArray39 {
+ public:
+  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray39& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+class ValueArray40 {
+ public:
+  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray40& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+class ValueArray41 {
+ public:
+  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray41& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+class ValueArray42 {
+ public:
+  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray42& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+class ValueArray43 {
+ public:
+  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
+      v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray43& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+class ValueArray44 {
+ public:
+  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+      v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+      v43_(v43), v44_(v44) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray44& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+class ValueArray45 {
+ public:
+  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
+      v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray45& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+class ValueArray46 {
+ public:
+  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray46& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+class ValueArray47 {
+ public:
+  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
+      v47_(v47) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray47& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+class ValueArray48 {
+ public:
+  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
+      v46_(v46), v47_(v47), v48_(v48) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray48& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+class ValueArray49 {
+ public:
+  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+      T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray49& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+class ValueArray50 {
+ public:
+  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
+      T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray50& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+  const T50 v50_;
+};
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates values from the Cartesian product of values produced
+// by the argument generators.
+//
+template <typename T1, typename T2>
+class CartesianProductGenerator2
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2> > {
+ public:
+  typedef ::testing::tuple<T1, T2> ParamType;
+
+  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2)
+      : g1_(g1), g2_(g2) {}
+  virtual ~CartesianProductGenerator2() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current2_;
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator2::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
+
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3> ParamType;
+
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current3_;
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator3::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator3& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+};  // class CartesianProductGenerator3
+
+
+template <typename T1, typename T2, typename T3, typename T4>
+class CartesianProductGenerator4
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4> ParamType;
+
+  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  virtual ~CartesianProductGenerator4() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current4_;
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator4::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator4& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+};  // class CartesianProductGenerator4
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class CartesianProductGenerator5
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5> ParamType;
+
+  CartesianProductGenerator5(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  virtual ~CartesianProductGenerator5() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current5_;
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator5::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator5& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+};  // class CartesianProductGenerator5
+
+
+// Generates values corresponding to the Cartesian product of the values
+// produced by its six component generators; each emitted element is a
+// ::testing::tuple<T1, ..., T6>.  Backs gtest's Combine().  NOTE: this
+// header appears machine-generated (pump-style N-ary expansion), so the
+// repetition across the CartesianProductGeneratorN variants is intentional.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class CartesianProductGenerator6
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5,
+        T6> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6> ParamType;
+
+  CartesianProductGenerator6(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  virtual ~CartesianProductGenerator6() {}
+
+  // Returns an iterator positioned at the first tuple of the product
+  // (every component iterator at begin()).
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
+  }
+  // Returns a past-the-end iterator (every component iterator at end()).
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
+  }
+
+ private:
+  // Traverses the six component ranges in row-major (odometer) order:
+  // current6_ varies fastest, current1_ slowest.
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      // Odometer-style increment: bump the fastest-varying iterator and
+      // carry each wrap-around one position to the left.
+      ++current6_;
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      // A past-the-end iterator has no defined value, so current_value_
+      // is left untouched in that case.
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator6::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator6& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+};  // class CartesianProductGenerator6
+
+
+// Seven-generator variant of the CartesianProductGeneratorN family: yields
+// every combination of its seven component generators' values as a
+// ::testing::tuple<T1, ..., T7>.  Structure mirrors the other arities;
+// the repetition is deliberate (pump-style generated code).
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class CartesianProductGenerator7
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
+
+  CartesianProductGenerator7(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  virtual ~CartesianProductGenerator7() {}
+
+  // Returns an iterator positioned at the first tuple of the product
+  // (every component iterator at begin()).
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin());
+  }
+  // Returns a past-the-end iterator (every component iterator at end()).
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
+  }
+
+ private:
+  // Traverses the seven component ranges in row-major (odometer) order:
+  // current7_ varies fastest, current1_ slowest.
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      // Odometer-style increment: bump the fastest-varying iterator and
+      // carry each wrap-around one position to the left.
+      ++current7_;
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      // A past-the-end iterator has no defined value, so current_value_
+      // is left untouched in that case.
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator7::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator7& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+};  // class CartesianProductGenerator7
+
+
+// Eight-generator variant of the CartesianProductGeneratorN family: yields
+// every combination of its eight component generators' values as a
+// ::testing::tuple<T1, ..., T8>.  Structure mirrors the other arities;
+// the repetition is deliberate (pump-style generated code).
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class CartesianProductGenerator8
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
+
+  CartesianProductGenerator8(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  virtual ~CartesianProductGenerator8() {}
+
+  // Returns an iterator positioned at the first tuple of the product
+  // (every component iterator at begin()).
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin());
+  }
+  // Returns a past-the-end iterator (every component iterator at end()).
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end());
+  }
+
+ private:
+  // Traverses the eight component ranges in row-major (odometer) order:
+  // current8_ varies fastest, current1_ slowest.
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      // Odometer-style increment: bump the fastest-varying iterator and
+      // carry each wrap-around one position to the left.
+      ++current8_;
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      // A past-the-end iterator has no defined value, so current_value_
+      // is left untouched in that case.
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator8::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator8& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+};  // class CartesianProductGenerator8
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class CartesianProductGenerator9
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
+
+  CartesianProductGenerator9(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  virtual ~CartesianProductGenerator9() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current9_;
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator9::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator9& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+};  // class CartesianProductGenerator9
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class CartesianProductGenerator10
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9, T10> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
+
+  CartesianProductGenerator10(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
+      const ParamGenerator<T10>& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  virtual ~CartesianProductGenerator10() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end(), g10_, g10_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9,
+      const ParamGenerator<T10>& g10,
+      const typename ParamGenerator<T10>::iterator& current10)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
+          begin10_(g10.begin()), end10_(g10.end()), current10_(current10)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current10_;
+      if (current10_ == end10_) {
+        current10_ = begin10_;
+        ++current9_;
+      }
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_ &&
+          current10_ == typed_other->current10_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_),
+        begin10_(other.begin10_),
+        end10_(other.end10_),
+        current10_(other.current10_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_, *current10_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_ ||
+          current10_ == end10_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    const typename ParamGenerator<T10>::iterator begin10_;
+    const typename ParamGenerator<T10>::iterator end10_;
+    typename ParamGenerator<T10>::iterator current10_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator10::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator10& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+  const ParamGenerator<T10> g10_;
+};  // class CartesianProductGenerator10
+
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
+// convertible to U.
+//
+template <class Generator1, class Generator2>
+class CartesianProductHolder2 {
+ public:
+CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
+      : g1_(g1), g2_(g2) {}
+  template <typename T1, typename T2>
+  operator ParamGenerator< ::testing::tuple<T1, T2> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2> >(
+        new CartesianProductGenerator2<T1, T2>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder2& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+};  // class CartesianProductHolder2
+
+template <class Generator1, class Generator2, class Generator3>
+class CartesianProductHolder3 {
+ public:
+CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  template <typename T1, typename T2, typename T3>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3> >(
+        new CartesianProductGenerator3<T1, T2, T3>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder3& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+};  // class CartesianProductHolder3
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4>
+class CartesianProductHolder4 {
+ public:
+CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  template <typename T1, typename T2, typename T3, typename T4>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >(
+        new CartesianProductGenerator4<T1, T2, T3, T4>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder4& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+};  // class CartesianProductHolder4
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5>
+class CartesianProductHolder5 {
+ public:
+CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >(
+        new CartesianProductGenerator5<T1, T2, T3, T4, T5>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder5& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+};  // class CartesianProductHolder5
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6>
+class CartesianProductHolder6 {
+ public:
+CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >(
+        new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder6& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+};  // class CartesianProductHolder6
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7>
+class CartesianProductHolder7 {
+ public:
+CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+      T7> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> >(
+        new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder7& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+};  // class CartesianProductHolder7
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8>
+class CartesianProductHolder8 {
+ public:
+CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7,
+      T8> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
+        new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder8& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+};  // class CartesianProductHolder8
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9>
+class CartesianProductHolder9 {
+ public:
+CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8,
+    const Generator9& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+        T9> >(
+        new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_),
+        static_cast<ParamGenerator<T9> >(g9_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder9& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+};  // class CartesianProductHolder9
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9, class Generator10>
+class CartesianProductHolder10 {
+ public:
+CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8,
+    const Generator9& g9, const Generator10& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9, typename T10>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+      T10> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+        T10> >(
+        new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+            T10>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_),
+        static_cast<ParamGenerator<T9> >(g9_),
+        static_cast<ParamGenerator<T10> >(g10_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder10& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+  const Generator10 g10_;
+};  // class CartesianProductHolder10
+
+# endif  // GTEST_HAS_COMBINE
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test case FooTest are instantiated
+// each three times with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+//   - returns a generator producing a sequence of values {start, start+1,
+//     start+2, ..., }.
+// Range(start, end, step)
+//   - returns a generator producing a sequence of values {start, start+step,
+//     start+step+step, ..., }.
+// Notes:
+//   * The generated sequences never include end. For example, Range(1, 5)
+//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+//     returns a generator producing {1, 3, 5, 7}.
+//   * start and end must have the same type. That type may be any integral or
+//     floating-point type or a user defined type satisfying these conditions:
+//     * It must be assignable (have operator=() defined).
+//     * It must have operator+() (operator+(int-compatible type) for
+//       two-operand version).
+//     * It must have operator<() defined.
+//     Elements in the resulting sequences will also have that type.
+//   * Condition start < end must be satisfied in order for resulting sequences
+//     to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+  return internal::ParamGenerator<T>(
+      new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+  return Range(start, end, 1);
+}
+
+// ValuesIn() function allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+//   - returns a generator producing sequences with elements from
+//     a C-style array.
+// ValuesIn(const Container& container)
+//   - returns a generator producing sequences with elements from
+//     an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+//   - returns a generator producing sequences with elements from
+//     a range [begin, end) defined by a pair of STL-style iterators. These
+//     iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("a");
+//   v.push_back("b");
+//   return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+//                         StlStringTest,
+//                         ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+//   ::std::list<char> list;
+//   list.push_back('a');
+//   list.push_back('b');
+//   return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+//                         CharTest,
+//                         ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
+      ::value_type ParamType;
+  return internal::ParamGenerator<ParamType>(
+      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+  return ValuesIn(array, array + N);
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container) {
+  return ValuesIn(container.begin(), container.end());
+}
+
+// Values() allows generating tests from explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+//   - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to 50 parameters.
+//
+template <typename T1>
+internal::ValueArray1<T1> Values(T1 v1) {
+  return internal::ValueArray1<T1>(v1);
+}
+
+template <typename T1, typename T2>
+internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
+  return internal::ValueArray2<T1, T2>(v1, v2);
+}
+
+template <typename T1, typename T2, typename T3>
+internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
+  return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
+  return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5) {
+  return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6) {
+  return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7) {
+  return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
+      v6, v7);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
+  return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
+      v5, v6, v7, v8);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
+  return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
+  return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11) {
+  return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+    T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12) {
+  return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+    T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13) {
+  return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
+  return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
+  return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16) {
+  return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17) {
+  return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18) {
+  return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
+  return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) {
+  return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) {
+  return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22) {
+  return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23) {
+  return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24) {
+  return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2,
+      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+      v19, v20, v21, v22, v23, v24);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
+  return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+      v18, v19, v20, v21, v22, v23, v24, v25);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+    T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26) {
+  return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+    T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27) {
+  return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+    T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28) {
+  return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+      v28);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29) {
+  return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+      v27, v28, v29);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) {
+  return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+      v26, v27, v28, v29, v30);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
+  return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32) {
+  return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33) {
+  return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34) {
+  return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) {
+  return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) {
+  return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37) {
+  return internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36, v37);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37, T38 v38) {
+  return internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38>(v1, v2,
+      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+      v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32,
+      v33, v34, v35, v36, v37, v38);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37, T38 v38, T39 v39) {
+  return internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+      v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+      v32, v33, v34, v35, v36, v37, v38, v39);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27,
+    T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
+  return internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
+      v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) {
+  return internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28,
+      v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+    T42> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42) {
+  return internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+      v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41,
+      v42);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+    T43> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42, T43 v43) {
+  return internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+      v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42, T43 v43, T44 v44) {
+  return internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+      v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39,
+      v40, v41, v42, v43, v44);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+    T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
+  return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38,
+      v39, v40, v41, v42, v43, v44, v45);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) {
+  return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) {
+  return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46, v47);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47,
+    T48 v48) {
+  return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36,
+      v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38,
+    T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46,
+    T47 v47, T48 v48, T49 v49) {
+  return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35,
+      v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37,
+    T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
+  return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+      v48, v49, v50);
+}
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+//   - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+//   virtual void SetUp() {
+//     external_flag = GetParam();
+//   }
+// }
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+  return Values(false, true);
+}
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+//   - returns a generator producing sequences with elements coming from
+//     the Cartesian product of elements from the sequences generated by
+//     gen1, gen2, ..., genN. The sequence elements will have a type of
+//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     of elements from sequences produces by gen1, gen2, ..., genN.
+//
+// Combine can have up to 10 arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate tests in test case AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+//                         Combine(Values("cat", "dog"),
+//                                 Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+//     : public testing::TestWithParam<tuple<bool, bool> > {
+//   virtual void SetUp() {
+//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
+//     tie(external_flag_1, external_flag_2) = GetParam();
+//   }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+//   // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+//                         Combine(Bool(), Bool()));
+//
+template <typename Generator1, typename Generator2>
+internal::CartesianProductHolder2<Generator1, Generator2> Combine(
+    const Generator1& g1, const Generator2& g2) {
+  return internal::CartesianProductHolder2<Generator1, Generator2>(
+      g1, g2);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3>
+internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3) {
+  return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
+      g1, g2, g3);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4>
+internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+    Generator4> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4) {
+  return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+      Generator4>(
+      g1, g2, g3, g4);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5>
+internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+    Generator4, Generator5> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5) {
+  return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+      Generator4, Generator5>(
+      g1, g2, g3, g4, g5);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6>
+internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6) {
+  return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6>(
+      g1, g2, g3, g4, g5, g6);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7>
+internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7) {
+  return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7>(
+      g1, g2, g3, g4, g5, g6, g7);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8>
+internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8) {
+  return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8>(
+      g1, g2, g3, g4, g5, g6, g7, g8);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9>
+internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8,
+    Generator9> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9) {
+  return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9);
+}
+
+// Combine() overload taking 10 generators (the maximum supported by this
+// generated family); yields 10-tuples via internal::CartesianProductHolder10.
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9,
+    typename Generator10>
+internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+    Generator10> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9,
+        const Generator10& g10) {
+  return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+      Generator10>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
+}
+# endif  // GTEST_HAS_COMBINE
+
+
+
+// Defines an individual value-parameterized test.  The expansion derives a
+// test class from the user-supplied fixture `test_case_name`, and the
+// static member gtest_registering_dummy_ is initialized by calling
+// AddToRegistry(), which records the test pattern in the global
+// parameterized-test registry before main() runs.  The macro invocation
+// is then followed by the body of the templated TestBody() definition.
+# define TEST_P(test_case_name, test_name) \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+      : public test_case_name { \
+   public: \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+    virtual void TestBody(); \
+   private: \
+    static int AddToRegistry() { \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
+                  #test_case_name, \
+                  #test_name, \
+                  new ::testing::internal::TestMetaFactory< \
+                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
+      return 0; \
+    } \
+    static int gtest_registering_dummy_; \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+  }; \
+  int GTEST_TEST_CLASS_NAME_(test_case_name, \
+                             test_name)::gtest_registering_dummy_ = \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+// Instantiates a value-parameterized test case with the given generator.
+// The expansion defines a function that evaluates `generator` on demand,
+// plus a dummy int whose static initializer registers the instantiation
+// (prefix, generator function, and source location) with the global
+// parameterized-test registry.
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
+  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+  int gtest_##prefix##test_case_name##_dummy_ = \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
+                  #prefix, \
+                  &gtest_##prefix##test_case_name##_EvalGenerator_, \
+                  __FILE__, __LINE__)
+
+}  // namespace testing
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+//
+// Google C++ Testing Framework definitions useful in production code.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.  For example:
+//
+// class MyClass {
+//  private:
+//   void MyMethod();
+//   FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+//   // Can call MyClass::MyMethod() here.
+// }
+
+// Expands to a friend declaration for the test class generated by
+// TEST(test_case_name, test_name), granting that test access to the
+// enclosing class's private and protected members.
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule at google.com (Markus Heule)
+//
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCEED()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+  // The possible outcomes of a test part (i.e. an assertion or an
+  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+  enum Type {
+    kSuccess,          // Succeeded.
+    kNonFatalFailure,  // Failed but the test can continue.
+    kFatalFailure      // Failed and the test should be terminated.
+  };
+
+  // C'tor.  TestPartResult does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestPartResult object.
+  // A NULL a_file_name is normalized to "" here and reported back as
+  // NULL by file_name().  The summary is computed eagerly from
+  // a_message via ExtractSummary().
+  TestPartResult(Type a_type,
+                 const char* a_file_name,
+                 int a_line_number,
+                 const char* a_message)
+      : type_(a_type),
+        file_name_(a_file_name == NULL ? "" : a_file_name),
+        line_number_(a_line_number),
+        summary_(ExtractSummary(a_message)),
+        message_(a_message) {
+  }
+
+  // Gets the outcome of the test part.
+  Type type() const { return type_; }
+
+  // Gets the name of the source file where the test part took place, or
+  // NULL if it's unknown.
+  const char* file_name() const {
+    return file_name_.empty() ? NULL : file_name_.c_str();
+  }
+
+  // Gets the line in the source file where the test part took place,
+  // or -1 if it's unknown.
+  int line_number() const { return line_number_; }
+
+  // Gets the summary of the failure message.
+  const char* summary() const { return summary_.c_str(); }
+
+  // Gets the message associated with the test part.
+  const char* message() const { return message_.c_str(); }
+
+  // Returns true iff the test part passed.
+  bool passed() const { return type_ == kSuccess; }
+
+  // Returns true iff the test part failed (fatally or non-fatally).
+  bool failed() const { return type_ != kSuccess; }
+
+  // Returns true iff the test part non-fatally failed.
+  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+  // Returns true iff the test part fatally failed.
+  bool fatally_failed() const { return type_ == kFatalFailure; }
+
+ private:
+  Type type_;
+
+  // Gets the summary of the failure message by omitting the stack
+  // trace in it.
+  static std::string ExtractSummary(const char* message);
+
+  // The name of the source file where the test part took place, or
+  // "" if the source file is unknown.
+  std::string file_name_;
+  // The line in the source file where the test part took place, or -1
+  // if the line number is unknown.
+  int line_number_;
+  std::string summary_;  // The test failure summary.
+  std::string message_;  // The test failure message.
+};
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// An array of TestPartResult objects.
+//
+// Don't inherit from TestPartResultArray as its destructor is not
+// virtual.
+class GTEST_API_ TestPartResultArray {
+ public:
+  TestPartResultArray() {}
+
+  // Appends the given TestPartResult to the array (a copy is stored).
+  void Append(const TestPartResult& result);
+
+  // Returns the TestPartResult at the given index (0-based).
+  const TestPartResult& GetTestPartResult(int index) const;
+
+  // Returns the number of TestPartResult objects in the array.
+  int size() const;
+
+ private:
+  // Backing storage; grows as results are appended.
+  std::vector<TestPartResult> array_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+};
+
+// This interface knows how to report a test part result.
+class TestPartResultReporterInterface {
+ public:
+  virtual ~TestPartResultReporterInterface() {}
+
+  // Called to report a single test part result (an assertion outcome or
+  // an explicit success/failure).
+  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
+
+namespace internal {
+
+// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
+// statement generates new fatal failures. To do so it registers itself as the
+// current test part result reporter. Besides checking if fatal failures were
+// reported, it only delegates the reporting to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+    : public TestPartResultReporterInterface {
+ public:
+  HasNewFatalFailureHelper();
+  virtual ~HasNewFatalFailureHelper();
+  virtual void ReportTestPartResult(const TestPartResult& result);
+  // True iff a fatal failure has been reported since construction.
+  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+ private:
+  bool has_new_fatal_failure_;
+  // The reporter that was current when this helper installed itself;
+  // reporting is delegated to it, and it is restored in the destructor.
+  TestPartResultReporterInterface* original_reporter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan at google.com (Zhanyong Wan)
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list.  You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+  ...
+  typedef std::list<T> List;
+  static T shared_;
+  T value_;
+};
+
+// Next, associate a list of types with the test case, which will be
+// repeated for each type in the list.  The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_CASE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   TYPED_TEST_CASE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test case as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  // Since we are inside a derived class template, C++ requires us to
+  // visit the members of FooTest via 'this'.
+  TypeParam n = this->value_;
+
+  // To visit static members of the fixture, add the TestFixture::
+  // prefix.
+  n += TestFixture::shared_;
+
+  // To refer to typedefs in the fixture, add the "typename
+  // TestFixture::" prefix.
+  typename TestFixture::List values;
+  values.push_back(n);
+  ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+#endif  // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type.  Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are.  The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have.  Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly.  Here's an example:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+  ...
+};
+
+// Next, declare that you will define a type-parameterized test case
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_CASE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test case as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  TypeParam n = 0;
+  ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them.  The first argument of the macro is the
+// test case name; the rest are the names of the tests in this test
+// case.
+REGISTER_TYPED_TEST_CASE_P(FooTest,
+                           DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want.  If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test case name.  Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
+
+#endif  // 0
+
+
+// Implements typed tests.
+
+#if GTEST_HAS_TYPED_TEST
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the typedef for the type parameters of the
+// given test case.
+# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define TYPED_TEST_CASE(CaseName, Types) \
+  typedef ::testing::internal::TypeList< Types >::type \
+      GTEST_TYPE_PARAMS_(CaseName)
+
+// Defines a typed test.  The expansion declares a test class template
+// parameterized on the type, registers it for every type in
+// GTEST_TYPE_PARAMS_(CaseName) via a static-initialization-time dummy
+// bool, and then opens the definition of the templated TestBody().
+# define TYPED_TEST(CaseName, TestName) \
+  template <typename gtest_TypeParam_> \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
+      : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTest< \
+          CaseName, \
+          ::testing::internal::TemplateSel< \
+              GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \
+          GTEST_TYPE_PARAMS_(CaseName)>::Register(\
+              "", #CaseName, #TestName, 0); \
+  template <typename gtest_TypeParam_> \
+  void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody()
+
+#endif  // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test case are defined in.  The exact
+// name of the namespace is subject to change without notice.
+# define GTEST_CASE_NAMESPACE_(TestCaseName) \
+  gtest_case_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test case.
+# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \
+  gtest_typed_test_case_p_state_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test case.
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
+  gtest_registered_test_names_##TestCaseName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
+# define TYPED_TEST_CASE_P(CaseName) \
+  static ::testing::internal::TypedTestCasePState \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
+
+// Defines one test in a type-parameterized test case.  The test class
+// template is placed in the per-case namespace
+// GTEST_CASE_NAMESPACE_(CaseName) to avoid name clashes, and its name
+// is recorded in the case's TypedTestCasePState so that
+// REGISTER_TYPED_TEST_CASE_P can later verify the registered names.
+# define TYPED_TEST_P(CaseName, TestName) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  template <typename gtest_TypeParam_> \
+  class TestName : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
+          __FILE__, __LINE__, #CaseName, #TestName); \
+  } \
+  template <typename gtest_TypeParam_> \
+  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+
+// Registers the tests previously defined with TYPED_TEST_P, collecting
+// them into the gtest_AllTests_ type list and verifying the supplied
+// names against those recorded by TYPED_TEST_P.
+# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
+  } \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\
+          __FILE__, __LINE__, #__VA_ARGS__)
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \
+  bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTestCase<CaseName, \
+          GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
+          ::testing::internal::TypeList< Types >::type>::Register(\
+              #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName))
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// Depending on the platform, different string classes are available.
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type to ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If ::std::string and ::string are the same class on your platform
+// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag brings up the debugger on an assertion failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag sets up the filter to select by name using a glob pattern
+// the tests to run. If the filter is not given all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag causes Google Test to list tests. None of the tests listed
+// are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flag controls whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. The default value
+// is 1. If the value is -1 the tests are repeated forever.
+GTEST_DECLARE_int32_(repeat);
+
+// This flag controls whether Google Test includes Google Test internal
+// stack frames in failure stack traces.
+GTEST_DECLARE_bool_(show_internal_stack_frames);
+
+// When this flag is specified, tests' order is randomized on every iteration.
+GTEST_DECLARE_bool_(shuffle);
+
+// This flag specifies the maximum number of stack frames to be
+// printed in a failure message.
+GTEST_DECLARE_int32_(stack_trace_depth);
+
+// When this flag is specified, a failed assertion will throw an
+// exception if exceptions are enabled, or exit the program with a
+// non-zero code otherwise.
+GTEST_DECLARE_bool_(throw_on_failure);
+
+// When this flag is set with a "host:port" string, on supported
+// platforms test results are streamed to the specified port on
+// the specified host machine.
+GTEST_DECLARE_string_(stream_result_to);
+
+// The upper limit for valid stack trace depths.
+const int kMaxStackTraceDepth = 100;
+
+namespace internal {
+
+class AssertHelper;
+class DefaultGlobalTestPartResultReporter;
+class ExecDeathTest;
+class NoExecDeathTest;
+class FinalSuccessChecker;
+class GTestFlagSaver;
+class StreamingListenerTest;
+class TestResultAccessor;
+class TestEventListenersAccessor;
+class TestEventRepeater;
+class UnitTestRecordPropertyTestHelper;
+class WindowsDeathTest;
+class UnitTestImpl* GetUnitTestImpl();
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message);
+
+}  // namespace internal
+
+// The friend relationship of some of these classes is cyclic.
+// If we don't forward declare them the compiler might confuse the classes
+// in friendship clauses with same named classes on the scope.
+class Test;
+class TestCase;
+class TestInfo;
+class UnitTest;
+
+// A class for indicating whether an assertion was successful.  When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+//   // Verifies that Foo() returns an even number.
+//   EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+//   testing::AssertionResult IsEven(const char* expr, int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure()
+//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
+//   }
+//
+// If Foo() returns 5, you will see the following message:
+//
+//   Expected: Foo() is even
+//     Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+  // Copy constructor.
+  // Used in EXPECT_TRUE/FALSE(assertion_result).
+  AssertionResult(const AssertionResult& other);
+
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+
+  // Used in the EXPECT_TRUE/FALSE(bool_expression).
+  //
+  // T must be contextually convertible to bool.
+  //
+  // The second parameter prevents this overload from being considered if
+  // the argument is implicitly convertible to AssertionResult. In that case
+  // we want AssertionResult's copy constructor to be used.
+  template <typename T>
+  explicit AssertionResult(
+      const T& success,
+      typename internal::EnableIf<
+          !internal::ImplicitlyConvertible<T, AssertionResult>::value>::type*
+          /*enabler*/ = NULL)
+      : success_(success) {}
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+  // Assignment operator.  Implemented with the copy-and-swap idiom:
+  // `other` is taken by value, so self-assignment is safe.
+  AssertionResult& operator=(AssertionResult other) {
+    swap(other);
+    return *this;
+  }
+
+  // Returns true iff the assertion succeeded.
+  operator bool() const { return success_; }  // NOLINT
+
+  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+  AssertionResult operator!() const;
+
+  // Returns the text streamed into this AssertionResult. Test assertions
+  // use it when they fail (i.e., the predicate's outcome doesn't match the
+  // assertion's expectation). When nothing has been streamed into the
+  // object, returns an empty string.
+  const char* message() const {
+    return message_.get() != NULL ?  message_->c_str() : "";
+  }
+  // TODO(vladl at google.com): Remove this after making sure no clients use it.
+  // Deprecated; please use message() instead.
+  const char* failure_message() const { return message(); }
+
+  // Streams a custom failure message into this object.
+  template <typename T> AssertionResult& operator<<(const T& value) {
+    AppendMessage(Message() << value);
+    return *this;
+  }
+
+  // Allows streaming basic output manipulators such as endl or flush into
+  // this object.
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+    AppendMessage(Message() << basic_manipulator);
+    return *this;
+  }
+
+ private:
+  // Appends the contents of message to message_, allocating the string
+  // lazily on first use.
+  void AppendMessage(const Message& a_message) {
+    if (message_.get() == NULL)
+      message_.reset(new ::std::string);
+    message_->append(a_message.GetString().c_str());
+  }
+
+  // Swap the contents of this AssertionResult with other.
+  void swap(AssertionResult& other);
+
+  // Stores result of the assertion predicate.
+  bool success_;
+  // Stores the message describing the condition in case the expectation
+  // construct is not satisfied with the predicate's outcome.
+  // Referenced via a pointer to avoid taking too much stack frame space
+  // with test assertions.
+  internal::scoped_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestCases, and
+// each TestCase contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in TEST_F.  For example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     void SetUp() override { ... }
+//     void TearDown() override { ... }
+//     ...
+//   };
+//
+//   TEST_F(FooTest, Bar) { ... }
+//   TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+  // TestInfo needs access to Test's private members (e.g. the private
+  // Run() method below) in order to instantiate and execute tests.
+  friend class TestInfo;
+
+  // Defines types for pointers to functions that set up and tear down
+  // a test case.
+  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
+  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
+
+  // The d'tor is virtual as we intend to inherit from Test.
+  virtual ~Test();
+
+  // Sets up the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::SetUpTestCase() before running the first
+  // test in test case Foo.  Hence a sub-class can define its own
+  // SetUpTestCase() method to shadow the one defined in the super
+  // class.  (Note: being static, a same-named method in a derived
+  // fixture shadows -- rather than overrides -- this one.)
+  static void SetUpTestCase() {}
+
+  // Tears down the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::TearDownTestCase() after running the last
+  // test in test case Foo.  Hence a sub-class can define its own
+  // TearDownTestCase() method to shadow the one defined in the super
+  // class.
+  static void TearDownTestCase() {}
+
+  // Returns true iff the current test has a fatal failure.
+  static bool HasFatalFailure();
+
+  // Returns true iff the current test has a non-fatal failure.
+  static bool HasNonfatalFailure();
+
+  // Returns true iff the current test has a (either fatal or
+  // non-fatal) failure.
+  static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+  // Logs a property for the current test, test case, or for the entire
+  // invocation of the test program when used outside of the context of a
+  // test case.  Only the last value for a given key is remembered.  These
+  // are public static so they can be called from utility functions that are
+  // not members of the test fixture.  Calls to RecordProperty made during
+  // lifespan of the test (from the moment its constructor starts to the
+  // moment its destructor finishes) will be output in XML as attributes of
+  // the <testcase> element.  Properties recorded from fixture's
+  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // corresponding <testsuite> element.  Calls to RecordProperty made in the
+  // global context (before or after invocation of RUN_ALL_TESTS and from
+  // SetUp/TearDown method of Environment objects registered with Google
+  // Test) will be output as attributes of the <testsuites> element.
+  static void RecordProperty(const std::string& key, const std::string& value);
+  static void RecordProperty(const std::string& key, int value);
+
+ protected:
+  // Creates a Test object.
+  Test();
+
+  // Sets up the test fixture.
+  virtual void SetUp();
+
+  // Tears down the test fixture.
+  virtual void TearDown();
+
+ private:
+  // Returns true iff the current test has the same fixture class as
+  // the first test in the current test case.
+  static bool HasSameFixtureClass();
+
+  // Runs the test after the test fixture has been set up.
+  //
+  // A sub-class must implement this to define the test logic.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+  // Instead, use the TEST or TEST_F macro.
+  virtual void TestBody() = 0;
+
+  // Sets up, executes, and tears down the test.
+  void Run();
+
+  // Deletes self.  We deliberately pick an unusual name for this
+  // internal method to avoid clashing with names used in user TESTs.
+  void DeleteSelf_() { delete this; }
+
+  // Uses a GTestFlagSaver to save and restore all Google Test flags.
+  const internal::GTestFlagSaver* const gtest_flag_saver_;
+
+  // Often a user misspells SetUp() as Setup() and spends a long time
+  // wondering why it is never called by Google Test.  The declaration of
+  // the following method is solely for catching such an error at
+  // compile time:
+  //
+  //   - The return type is deliberately chosen to be not void, so it
+  //   will be a conflict if void Setup() is declared in the user's
+  //   test fixture.
+  //
+  //   - This method is private, so it will be another compiler error
+  //   if the method is called from the user's test fixture.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION.
+  //
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+
+  // We disallow copying Tests.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+};
+
+typedef internal::TimeInMillis TimeInMillis;
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+  // C'tor.  TestProperty does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestProperty object.
+  TestProperty(const std::string& a_key, const std::string& a_value) :
+    key_(a_key), value_(a_value) {
+  }
+
+  // Gets the user supplied key.
+  const char* key() const {
+    return key_.c_str();
+  }
+
+  // Gets the user supplied value.
+  const char* value() const {
+    return value_.c_str();
+  }
+
+  // Sets a new value, overriding the one supplied in the constructor.
+  // Note that only the value is mutable; the key is fixed at
+  // construction time (there is no corresponding SetKey()).
+  void SetValue(const std::string& new_value) {
+    value_ = new_value;
+  }
+
+ private:
+  // The key supplied by the user.
+  std::string key_;
+  // The value supplied by the user.
+  std::string value_;
+};
+
+// The result of a single Test.  This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+  // Creates an empty TestResult.
+  TestResult();
+
+  // D'tor.  Do not inherit from TestResult.
+  ~TestResult();
+
+  // Gets the number of all test parts.  This is the sum of the number
+  // of successful test parts and the number of failed test parts.
+  int total_part_count() const;
+
+  // Returns the number of the test properties.
+  int test_property_count() const;
+
+  // Returns true iff the test passed (i.e. no test part failed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test failed.
+  bool Failed() const;
+
+  // Returns true iff the test fatally failed.
+  bool HasFatalFailure() const;
+
+  // Returns true iff the test has a non-fatal failure.
+  bool HasNonfatalFailure() const;
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test part result among all the results. i can range
+  // from 0 to total_part_count() - 1. If i is not in that range, aborts
+  // the program.
+  const TestPartResult& GetTestPartResult(int i) const;
+
+  // Returns the i-th test property. i can range from 0 to
+  // test_property_count() - 1. If i is not in that range, aborts the
+  // program.
+  const TestProperty& GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestCase;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult>& test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty>& test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list. The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names). If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key.  xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string& xml_element,
+                      const TestProperty& test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testcase tags.  Returns true if the property is valid.
+  // TODO(russr): Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string& xml_element,
+                                   const TestProperty& test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult& test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned
+  // properties, whose values may be updated.
+  // NOTE(review): the member name contains a historical misspelling
+  // ("properites"); it is kept verbatim because it is referenced by
+  // code outside this hunk.
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+  // We disallow copying TestResult.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+};  // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+//   Test case name
+//   Test name
+//   Whether the test should be run
+//   A function pointer that creates the test object when invoked
+//   Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+  // Destructs a TestInfo object.  This function is not virtual, so
+  // don't inherit from TestInfo.
+  ~TestInfo();
+
+  // Returns the test case name.
+  const char* test_case_name() const { return test_case_name_.c_str(); }
+
+  // Returns the test name.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a typed
+  // or a type-parameterized test.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns the text representation of the value parameter, or NULL if this
+  // is not a value-parameterized test.
+  const char* value_param() const {
+    if (value_param_.get() != NULL)
+      return value_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if this test should run, that is if the test is not
+  // disabled (or it is disabled but the also_run_disabled_tests flag has
+  // been specified) and its full name matches the user-specified filter.
+  //
+  // Google Test allows the user to filter the tests by their full names.
+  // The full name of a test Bar in test case Foo is defined as
+  // "Foo.Bar".  Only the tests that match the filter will run.
+  //
+  // A filter is a colon-separated list of glob (not regex) patterns,
+  // optionally followed by a '-' and a colon-separated list of
+  // negative patterns (tests to exclude).  A test is run if it
+  // matches one of the positive patterns and does not match any of
+  // the negative patterns.
+  //
+  // For example, *A*:Foo.* is a filter that matches any string that
+  // contains the character 'A' or starts with "Foo.".
+  bool should_run() const { return should_run_; }
+
+  // Returns true iff this test will appear in the XML report.
+  bool is_reportable() const {
+    // For now, the XML report includes all tests matching the filter.
+    // In the future, we may trim tests that are excluded because of
+    // sharding.
+    return matches_filter_;
+  }
+
+  // Returns the result of the test.
+  const TestResult* result() const { return &result_; }
+
+ private:
+  // The following friends need access to TestInfo's private constructor
+  // and internals in order to register and run tests.
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestCase;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo* internal::MakeAndRegisterTestInfo(
+      const char* test_case_name,
+      const char* name,
+      const char* type_param,
+      const char* value_param,
+      internal::TypeId fixture_class_id,
+      Test::SetUpTestCaseFunc set_up_tc,
+      Test::TearDownTestCaseFunc tear_down_tc,
+      internal::TestFactoryBase* factory);
+
+  // Constructs a TestInfo object. The newly constructed instance assumes
+  // ownership of the factory object.
+  TestInfo(const std::string& test_case_name,
+           const std::string& name,
+           const char* a_type_param,   // NULL if not a type-parameterized test
+           const char* a_value_param,  // NULL if not a value-parameterized test
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase* factory);
+
+  // Increments the number of death tests encountered in this test so
+  // far.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Creates the test object, runs it, records its result, and then
+  // deletes it.
+  void Run();
+
+  // Clears the result of the given test (needed before the test is run
+  // a second time; see the comment on result_ below).
+  static void ClearTestResult(TestInfo* test_info) {
+    test_info->result_.Clear();
+  }
+
+  // These fields are immutable properties of the test.
+  const std::string test_case_name_;     // Test case name
+  const std::string name_;               // Test name
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // Text representation of the value parameter, or NULL if this is not a
+  // value-parameterized test.
+  const internal::scoped_ptr<const ::std::string> value_param_;
+  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
+  bool should_run_;                 // True iff this test should run
+  bool is_disabled_;                // True iff this test is disabled
+  bool matches_filter_;             // True if this test matches the
+                                    // user-specified filter.
+  internal::TestFactoryBase* const factory_;  // The factory that creates
+                                              // the test object
+
+  // This field is mutable and needs to be reset before running the
+  // test for the second time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test case, which consists of a vector of TestInfos.
+//
+// TestCase is not copyable.
+class GTEST_API_ TestCase {
+ public:
+  // Creates a TestCase with the given name.
+  //
+  // TestCase does NOT have a default constructor.  Always use this
+  // constructor to create a TestCase object.
+  //
+  // Arguments:
+  //
+  //   name:         name of the test case
+  //   a_type_param: the name of the test's type parameter, or NULL if
+  //                 this is not a type-parameterized test.
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  TestCase(const char* name, const char* a_type_param,
+           Test::SetUpTestCaseFunc set_up_tc,
+           Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Destructor of TestCase.
+  virtual ~TestCase();
+
+  // Gets the name of the TestCase.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test case.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if any test in this test case should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test case.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests in this test case.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test case.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of tests in this test case that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test case.
+  int total_test_count() const;
+
+  // Returns true iff the test case passed.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test case failed.
+  bool Failed() const { return failed_test_count() > 0; }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  const TestInfo* GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestCase and TearDownTestCase.
+  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestCase.
+  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestCase.
+  const std::vector<TestInfo*>& test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  TestInfo* GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test case.  Will delete the TestInfo upon
+  // destruction of the TestCase object.
+  void AddTestInfo(TestInfo * test_info);
+
+  // Clears the results of all tests in this test case.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test case.
+  static void ClearTestCaseResult(TestCase* test_case) {
+    test_case->ClearResult();
+  }
+
+  // Runs every test in this TestCase.
+  void Run();
+
+  // Runs SetUpTestCase() for this TestCase.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestCase().
+  void RunSetUpTestCase() { (*set_up_tc_)(); }
+
+  // Runs TearDownTestCase() for this TestCase.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestCase().
+  void RunTearDownTestCase() { (*tear_down_tc_)(); }
+
+  // Returns true iff test passed.
+  static bool TestPassed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true iff test failed.
+  static bool TestFailed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true iff the test is disabled and will be reported in the XML
+  // report.
+  static bool TestReportableDisabled(const TestInfo* test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true iff test is disabled.
+  static bool TestDisabled(const TestInfo* test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true iff this test will appear in the XML report.
+  static bool TestReportable(const TestInfo* test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo* test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test case.
+  void ShuffleTests(internal::Random* random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test case.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestInfo*> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order.  The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test case.
+  Test::SetUpTestCaseFunc set_up_tc_;
+  // Pointer to the function that tears down the test case.
+  Test::TearDownTestCaseFunc tear_down_tc_;
+  // True iff any test in this test case should run.
+  bool should_run_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestCase and
+  // TearDownTestCase.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestCases.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment.  You should subclass this to define your own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor.  This is a problem
+//      as in some cases Google Test is used where exceptions are enabled, and
+//      we may want to implement ASSERT_* using exceptions where they are
+//      available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // The d'tor is virtual as we need to subclass Environment.
+  virtual ~Environment() {}
+
+  // Override this to define how to set up the environment.  The
+  // default implementation does nothing.
+  virtual void SetUp() {}
+
+  // Override this to define how to tear down the environment.  The
+  // default implementation does nothing.
+  virtual void TearDown() {}
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+};
+
+// The interface for tracing execution of tests. The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+  // The d'tor is virtual so listeners can be destroyed through a
+  // pointer to this base class.
+  virtual ~TestEventListener() {}
+
+  // Fired before any test activity starts.
+  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+
+  // Fired before each iteration of tests starts.  There may be more than
+  // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
+  // index, starting from 0.
+  virtual void OnTestIterationStart(const UnitTest& unit_test,
+                                    int iteration) = 0;
+
+  // Fired before environment set-up for each iteration of tests starts.
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment set-up for each iteration of tests ends.
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+
+  // Fired before the test case starts.
+  virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+
+  // Fired before the test starts.
+  virtual void OnTestStart(const TestInfo& test_info) = 0;
+
+  // Fired after a failed assertion or a SUCCEED() invocation.
+  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+
+  // Fired after the test ends.
+  virtual void OnTestEnd(const TestInfo& test_info) = 0;
+
+  // Fired after the test case ends.
+  virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+
+  // Fired before environment tear-down for each iteration of tests starts.
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment tear-down for each iteration of tests ends.
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+
+  // Fired after each iteration of tests finishes.
+  virtual void OnTestIterationEnd(const UnitTest& unit_test,
+                                  int iteration) = 0;
+
+  // Fired after all test activities have ended.
+  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+};
+
+// The convenience class for users who need to override just one or two
+// methods and are not concerned that a possible change to a signature of
+// the methods they override will not be caught during the build.  For
+// comments about each method please see the definition of TestEventListener
+// above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+  // All callbacks are deliberately implemented as no-ops so that
+  // subclasses need to override only the events they care about.
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                                    int /*iteration*/) {}
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+  virtual void OnTestStart(const TestInfo& /*test_info*/) {}
+  virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
+  virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
+  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+                                  int /*iteration*/) {}
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+};
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+  // Creates an empty listener list.
+  TestEventListeners();
+  // D'tor; see Append() for the ownership semantics of listeners still
+  // in the list.
+  ~TestEventListeners();
+
+  // Appends an event listener to the end of the list. Google Test assumes
+  // the ownership of the listener (i.e. it will delete the listener when
+  // the test program finishes).
+  void Append(TestEventListener* listener);
+
+  // Removes the given event listener from the list and returns it.  It then
+  // becomes the caller's responsibility to delete the listener. Returns
+  // NULL if the listener is not found in the list.
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Returns the standard listener responsible for the default console
+  // output.  Can be removed from the listeners list to shut down default
+  // console output.  Note that removing this object from the listener list
+  // with Release transfers its ownership to the caller and makes this
+  // function return NULL the next time.
+  TestEventListener* default_result_printer() const {
+    return default_result_printer_;
+  }
+
+  // Returns the standard listener responsible for the default XML output
+  // controlled by the --gtest_output=xml flag.  Can be removed from the
+  // listeners list by users who want to shut down the default XML output
+  // controlled by this flag and substitute it with custom one.  Note that
+  // removing this object from the listener list with Release transfers its
+  // ownership to the caller and makes this function return NULL the next
+  // time.
+  TestEventListener* default_xml_generator() const {
+    return default_xml_generator_;
+  }
+
+ private:
+  friend class TestCase;
+  friend class TestInfo;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::NoExecDeathTest;
+  friend class internal::TestEventListenersAccessor;
+  friend class internal::UnitTestImpl;
+
+  // Returns repeater that broadcasts the TestEventListener events to all
+  // subscribers.
+  TestEventListener* repeater();
+
+  // Sets the default_result_printer attribute to the provided listener.
+  // The listener is also added to the listener list and previous
+  // default_result_printer is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultResultPrinter(TestEventListener* listener);
+
+  // Sets the default_xml_generator attribute to the provided listener.  The
+  // listener is also added to the listener list and previous
+  // default_xml_generator is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultXmlGenerator(TestEventListener* listener);
+
+  // Controls whether events will be forwarded by the repeater to the
+  // listeners in the list.
+  bool EventForwardingEnabled() const;
+  void SuppressEventForwarding();
+
+  // The actual list of listeners.
+  internal::TestEventRepeater* repeater_;
+  // Listener responsible for the standard result output.
+  TestEventListener* default_result_printer_;
+  // Listener responsible for the creation of the XML output file.
+  TestEventListener* default_xml_generator_;
+
+  // We disallow copying TestEventListeners.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
+
+// A UnitTest consists of a vector of TestCases.
+//
+// This is a singleton class.  The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called.  This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+  // Gets the singleton UnitTest object.  The first time this method
+  // is called, a UnitTest object is constructed and returned.
+  // Consecutive calls will return the same object.
+  static UnitTest* GetInstance();
+
+  // Runs all tests in this UnitTest object and prints the result.
+  // Returns 0 if successful, or 1 otherwise.
+  //
+  // This method can only be called from the main thread.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  int Run() GTEST_MUST_USE_RESULT_;
+
+  // Returns the working directory when the first TEST() or TEST_F()
+  // was executed.  The UnitTest object owns the string.
+  const char* original_working_dir() const;
+
+  // Returns the TestCase object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestCase* current_test_case() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the TestInfo object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestInfo* current_test_info() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the random seed used at the start of the current test run.
+  int random_seed() const;
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns the ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const;
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const;
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const;
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const;
+
+  // Returns the TestResult containing information on test failures and
+  // properties logged outside of individual test cases.
+  const TestResult& ad_hoc_test_result() const;
+
+  // Returns the list of event listeners that can be used to track events
+  // inside Google Test.
+  TestEventListeners& listeners();
+
+ private:
+  // Registers and returns a global test environment.  When a test
+  // program is run, all global test environments will be set-up in
+  // the order they were registered.  After all tests in the program
+  // have finished, all global test environments will be torn-down in
+  // the *reverse* order they were registered.
+  //
+  // The UnitTest object takes ownership of the given environment.
+  //
+  // This method can only be called from the main thread.
+  Environment* AddEnvironment(Environment* env);
+
+  // Adds a TestPartResult to the current TestResult object.  All
+  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+  // eventually call this to report their results.  The user code
+  // should use the assertion macros instead of calling this directly.
+  void AddTestPartResult(TestPartResult::Type result_type,
+                         const char* file_name,
+                         int line_number,
+                         const std::string& message,
+                         const std::string& os_stack_trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Adds a TestProperty to the current TestResult object when invoked from
+  // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+  // from SetUpTestCase or TearDownTestCase, or to the global property set
+  // when invoked elsewhere.  If the result already contains a property with
+  // the same key, the value will be updated.
+  void RecordProperty(const std::string& key, const std::string& value);
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i);
+
+  // Accessors for the implementation object.
+  internal::UnitTestImpl* impl() { return impl_; }
+  const internal::UnitTestImpl* impl() const { return impl_; }
+
+  // These classes and functions are friends as they need to access private
+  // members of UnitTest.
+  friend class Test;
+  friend class internal::AssertHelper;
+  friend class internal::ScopedTrace;
+  friend class internal::StreamingListenerTest;
+  friend class internal::UnitTestRecordPropertyTestHelper;
+  friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+  friend void internal::ReportFailureInUnknownLocation(
+      TestPartResult::Type result_type,
+      const std::string& message);
+
+  // Creates an empty UnitTest.
+  UnitTest();
+
+  // Destructor.  Note that the singleton instance is never deleted in
+  // practice (see the class comment above).
+  virtual ~UnitTest();
+
+  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+  // Google Test trace stack.
+  void PushGTestTrace(const internal::TraceInfo& trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Pops a trace from the per-thread Google Test trace stack.
+  void PopGTestTrace()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Protects mutable state in *impl_.  This is mutable as some const
+  // methods need to lock it too.
+  mutable internal::Mutex mutex_;
+
+  // Opaque implementation object.  This field is never changed once
+  // the object is constructed.  We don't mark it as const here, as
+  // doing so will cause a warning in the constructor of UnitTest.
+  // Mutable state in *impl_ is protected by mutex_.
+  internal::UnitTestImpl* impl_;
+
+  // We disallow copying UnitTest.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
+
+// A convenient wrapper for adding an environment for the test
+// program.
+//
+// You should call this before RUN_ALL_TESTS() is called, probably in
+// main().  If you use gtest_main, you need to call this before main()
+// starts for it to take effect.  For example, you can define a global
+// variable like this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend you to write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+  // The singleton UnitTest takes ownership of env.
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  return unit_test->AddEnvironment(env);
+}
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+
+namespace internal {
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value.  In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object.  If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case: format the value with the general-purpose printer,
+// ::testing::PrintToString().
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+  static ::std::string Format(const ToPrint& value) {
+    return ::testing::PrintToString(value);
+  }
+};
+
+// Arrays: decay the array to a const pointer and reuse the pointer
+// formatting rules (including the C-string specializations below).
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  }
+};
+
+// By default, print C string as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
+  }
+
+// Instantiate for the plain and const narrow/wide character pointer types.
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+  template <>                                                           \
+  class FormatForComparison<CharType*, OtherStringType> {               \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(value);                           \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+
+// The global ::string / ::wstring variants exist only when the port
+// provides those types (guarded by the feature macros below).
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
+#endif
+
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
+#endif
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, etc.)
+// operand to be used in a failure message.  The type (but not value)
+// of the other operand may affect the format.  This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(
+    const T1& value, const T2& /* other_operand */) {
+  // Only the *type* of the second operand matters; its value is ignored.
+  return FormatForComparison<T1, T2>::Format(value);
+}
+
+// Builds the failure message for CmpHelperEQ.  Kept out of line so that
+// CmpHelperEQ itself has a minimal stack frame, which lowers the overhead
+// of EXPECT_*/ASSERT_* under some sanitizers in tight loops.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQFailure(const char* expected_expression,
+                                   const char* actual_expression,
+                                   const T1& expected, const T2& actual) {
+  // Each operand is formatted with the other operand's type in mind.
+  const ::std::string expected_text =
+      FormatForComparisonFailureMessage(expected, actual);
+  const ::std::string actual_text =
+      FormatForComparisonFailureMessage(actual, expected);
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   expected_text,
+                   actual_text,
+                   false);  // Not ignoring case.
+}
+
+// The helper function for {ASSERT|EXPECT}_EQ.  Returns success iff
+// expected == actual; otherwise delegates message construction to
+// CmpHelperEQFailure to keep this frequently-instantiated frame small.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            const T1& expected,
+                            const T2& actual) {
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4389 /* signed/unsigned mismatch */)
+  if (!(expected == actual)) {
+    return CmpHelperEQFailure(expected_expression, actual_expression,
+                              expected, actual);
+  }
+GTEST_DISABLE_MSC_WARNINGS_POP_()
+  return AssertionSuccess();
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression,
+                                       const char* actual_expression,
+                                       BiggestInt expected,
+                                       BiggestInt actual);
+
+// The helper class for {ASSERT|EXPECT}_EQ.  The template argument
+// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
+// is a null pointer literal.  The following default implementation is
+// for lhs_is_null_literal being false.
+// (The specialization for a null-literal left-hand side follows below.)
+template <bool lhs_is_null_literal>
+class EqHelper {
+ public:
+  // This templatized version is for the general case.
+  template <typename T1, typename T2>
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 const T1& expected,
+                                 const T2& actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly cast to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as it will make anonymous enums unhappy.
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 BiggestInt expected,
+                                 BiggestInt actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+};
+
+// This specialization is used when the first argument to ASSERT_EQ()
+// is a null pointer literal, like NULL, false, or 0.
+//
+// NOTE(review): overload resolution between the two Compare() overloads
+// below is deliberately delicate (EnableIf vs. the Secret* parameter);
+// read the comments on each overload before changing anything here.
+template <>
+class EqHelper<true> {
+ public:
+  // We define two overloaded versions of Compare().  The first
+  // version will be picked when the second argument to ASSERT_EQ() is
+  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
+  // EXPECT_EQ(false, a_bool).
+  template <typename T1, typename T2>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      const T1& expected,
+      const T2& actual,
+      // The following line prevents this overload from being considered if T2
+      // is not a pointer type.  We need this because ASSERT_EQ(NULL, my_ptr)
+      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
+      // to match the Secret* in the other overload, which would otherwise make
+      // this template match better.
+      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // This version will be picked when the second argument to ASSERT_EQ() is a
+  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
+  template <typename T>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      // We used to have a second template parameter instead of Secret*.  That
+      // template parameter would deduce to 'long', making this a better match
+      // than the first overload even without the first overload's EnableIf.
+      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
+      // non-pointer argument" (even a deduced integral argument), so the old
+      // implementation caused warnings in user code.
+      Secret* /* expected (NULL) */,
+      T* actual) {
+    // We already know that 'expected' is a null pointer.
+    return CmpHelperEQ(expected_expression, actual_expression,
+                       static_cast<T*>(NULL), actual);
+  }
+};
+
+// Builds the failure message for the CmpHelper?? functions generated by
+// GTEST_IMPL_CMP_HELPER_ below.  Kept out of line to reduce the stack
+// frame size of those helpers (lowers EXPECT_OP overhead under some
+// sanitizers in tight loops).
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
+                                   const T1& val1, const T2& val2,
+                                   const char* op) {
+  // Format each operand with the other operand's type in mind.
+  const ::std::string val1_text = FormatForComparisonFailureMessage(val1, val2);
+  const ::std::string val2_text = FormatForComparisonFailureMessage(val2, val1);
+  return AssertionFailure() << "Expected: (" << expr1 << ") " << op
+                            << " (" << expr2 << "), actual: " << val1_text
+                            << " vs " << val2_text;
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   const T1& val1, const T2& val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
+  }\
+}\
+GTEST_API_ AssertionResult CmpHelper##op_name(\
+    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+// (The macro deliberately ends in an unterminated declaration so that each
+// use below supplies the trailing semicolon.)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements the helper function for {ASSERT|EXPECT}_LE
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements the helper function for {ASSERT|EXPECT}_LT
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements the helper function for {ASSERT|EXPECT}_GE
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements the helper function for {ASSERT|EXPECT}_GT
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const char* expected,
+                                          const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                              const char* actual_expression,
+                                              const char* expected,
+                                              const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
+
+
+// Helper function for *_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const wchar_t* expected,
+                                          const wchar_t* actual);
+
+// Helper function for *_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
+
+}  // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves.  They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-points.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// Succeeds iff the two values compare almost equal as defined by
+// FloatingPoint<RawType>::AlmostEquals(); otherwise returns an EqFailure
+// with both values rendered at heightened precision.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression,
+                                         const char* actual_expression,
+                                         RawType expected,
+                                         RawType actual) {
+  const FloatingPoint<RawType> expected_fp(expected);
+  const FloatingPoint<RawType> actual_fp(actual);
+  if (expected_fp.AlmostEquals(actual_fp)) {
+    return AssertionSuccess();
+  }
+
+  // Render both values with extra significant digits (digits10 + 2) so
+  // close-but-unequal values remain distinguishable in the message.
+  const int kPrecision = std::numeric_limits<RawType>::digits10 + 2;
+
+  ::std::stringstream expected_ss;
+  expected_ss << std::setprecision(kPrecision) << expected;
+
+  ::std::stringstream actual_ss;
+  actual_ss << std::setprecision(kPrecision) << actual;
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   StringStreamToString(&expected_ss),
+                   StringStreamToString(&actual_ss),
+                   false);  // Not ignoring case.
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+                                                const char* expr2,
+                                                const char* abs_error_expr,
+                                                double val1,
+                                                double val2,
+                                                double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor.
+  AssertHelper(TestPartResult::Type type,
+               const char* file,
+               int line,
+               const char* message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message& message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t,
+                     const char* srcfile,
+                     int line_num,
+                     const char* msg)
+        : type(t), file(srcfile), line(line_num), message(msg) { }
+
+    TestPartResult::Type const type;
+    const char* const file;
+    int const line;
+    std::string const message;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  // NOTE(review): presumably allocated by the constructor and released by
+  // the destructor (both defined out of line) -- confirm in gtest.cc.
+  AssertHelperData* const data_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+}  // namespace internal
+
+#if GTEST_HAS_PARAM_TEST
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual ~FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void SetUp() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void TearDown() {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  // The type of the test parameter.
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // The current parameter value. Is also available in the test fixture's
+  // constructor. This member function is non-static, even though it only
+  // references static data, to reduce the opportunity for incorrect uses
+  // like writing 'WithParamInterface<bool>::GetParam()' for a test that
+  // uses a fixture whose parameter type is int.
+  const ParamType& GetParam() const {
+    GTEST_CHECK_(parameter_ != NULL)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value. The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType* parameter) {
+    parameter_ = parameter;
+  }
+
+  // Static value used for accessing parameter during a test lifetime.
+  // NULL outside of a value-parameterized test; GetParam() checks for this.
+  static const ParamType* parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass> friend class internal::ParameterizedTestFactory;
+};
+
+// Out-of-line definition of the static parameter pointer; starts as NULL.
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = NULL;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+// Convenience base class combining the Test fixture with the GetParam()
+// accessor provided by WithParamInterface<T>.
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Macros for indicating success/failure in test code.
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line) \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+# define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+# define SUCCEED() GTEST_SUCCEED()
+#endif
+
+// Macros for testing exceptions.
+//
+//    * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//         Tests that the statement throws the expected exception.
+//    * {ASSERT|EXPECT}_NO_THROW(statement):
+//         Tests that the statement doesn't throw any exception.
+//    * {ASSERT|EXPECT}_ANY_THROW(statement):
+//         Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
+
+// Boolean assertions. Condition can be either a Boolean expression or an
+// AssertionResult. For more information on how to use AssertionResult with
+// these macros see comments on that class.
+// (The *_FALSE variants pass the negated condition but still report the
+// original expression text via #condition.)
+#define EXPECT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
+
+// Includes the auto-generated header that implements a family of
+// generic predicate assertion macros.
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command
+// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
+//
+// Implements a family of generic predicate assertion macros.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Makes sure this header is not included before gtest.h.
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+# error Do not include gtest_pred_impl.h directly.  Include gtest.h instead.
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+//   ASSERT_PRED_FORMAT1(pred_format, v1)
+//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+//   ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult.  See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+//   ASSERT_PRED1(pred, v1)
+//   ASSERT_PRED2(pred, v1, v2)
+//   ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ..., must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+// Please email googletestframework at googlegroups.com if you need
+// support for higher arities.
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce.  Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ; \
+  else \
+    on_failure(gtest_ar.failure_message())
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+                                  const char* e1,
+                                  Pred pred,
+                                  const T1& v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, v1), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+                                             #v1, \
+                                             pred, \
+                                             v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             pred, \
+                                             v1, \
+                                             v2), on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3), on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4), on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  const char* e5,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4,
+                                  const T5& v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ", "
+                            << e5 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4
+                            << "\n" << e5 << " evaluates to " << v5;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             #v5, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4, \
+                                             v5), on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Macros for testing equalities and inequalities.
+//
+//    * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual
+//    * {ASSERT|EXPECT}_NE(v1, v2):           Tests that v1 != v2
+//    * {ASSERT|EXPECT}_LT(v1, v2):           Tests that v1 < v2
+//    * {ASSERT|EXPECT}_LE(v1, v2):           Tests that v1 <= v2
+//    * {ASSERT|EXPECT}_GT(v1, v2):           Tests that v1 > v2
+//    * {ASSERT|EXPECT}_GE(v1, v2):           Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use it
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to
+//   {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(5, Foo());
+//   EXPECT_EQ(NULL, a_pointer);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define EXPECT_NE(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
+
+// C-string Comparisons.  All tests treat NULL and any non-NULL string
+// as different.  Two NULLs are equal.
+//
+//    * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
+//    * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
+//    * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+//    * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define EXPECT_STRCASENE(s1, s2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define ASSERT_STRCASENE(s1, s2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+// Macros for comparing floating-point numbers.
+//
+//    * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual):
+//         Tests that two float values are almost equal.
+//    * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual):
+//         Tests that two double values are almost equal.
+//    * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+//         Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define EXPECT_DOUBLE_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define ASSERT_FLOAT_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define ASSERT_DOUBLE_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define EXPECT_NEAR(val1, val2, abs_error)\
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)\
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                                    double val1, double val2);
+
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success, these are only useful
+// on Windows, and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, as well as the
+// hex result code.
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define EXPECT_HRESULT_FAILED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+# define ASSERT_HRESULT_FAILED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope.  The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+#define SCOPED_TRACE(message) \
+  ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+    __FILE__, __LINE__, ::testing::Message() << (message))
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
+// the same type.  The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template.  This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated.  For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated.  Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+bool StaticAssertTypeEq() {
+  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test case, and the second
+// parameter is the name of the test within the test case.
+//
+// The convention is to end the test case name with "Test".  For
+// example, a test case for the Foo class can be named FooTest.
+//
+// Test code should appear between braces after an invocation of
+// this macro.  Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test.  This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X.  The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code.  GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_case_name, test_name)\
+  GTEST_TEST_(test_case_name, test_name, \
+              ::testing::Test, ::testing::internal::GetTestTypeId())
+
+// Define this macro to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test case name.  The second parameter is the
+// name of the test within the test case.
+//
+// A test fixture class must be declared earlier.  The user should put
+// his test code between braces after using this macro.  Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(0, a_.size());
+//     EXPECT_EQ(1, b_.size());
+//   }
+
+#define TEST_F(test_fixture, test_name)\
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+
+}  // namespace testing
+
+// Use this function in main() to run all tests.  It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+
+inline int RUN_ALL_TESTS() {
+  return ::testing::UnitTest::GetInstance()->Run();
+}
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
diff --git a/src/rocksdb/tools/auto_sanity_test.sh b/src/rocksdb/tools/auto_sanity_test.sh
deleted file mode 100755
index 2d63c0a..0000000
--- a/src/rocksdb/tools/auto_sanity_test.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-TMP_DIR="/tmp/rocksdb-sanity-test"
-
-if [ "$#" -lt 2 ]; then
-  echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
-  echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
-  recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
-  commit_new=`echo "$recent_commits" | head -n1`
-  commit_old=`echo "$recent_commits" | tail -n1`
-  echo "the most recent commits are:"
-  echo "$recent_commits"
-else
-  commit_new=$1
-  commit_old=$2
-fi
-
-if [ ! -d $TMP_DIR ]; then
-  mkdir $TMP_DIR
-fi
-dir_new="${TMP_DIR}/${commit_new}"
-dir_old="${TMP_DIR}/${commit_old}"
-
-function makestuff() {
-  echo "make clean"
-  make clean > /dev/null
-  echo "make db_sanity_test -j32"
-  make db_sanity_test -j32 > /dev/null
-  if [ $? -ne 0 ]; then
-    echo "[ERROR] Failed to perform 'make db_sanity_test'"
-    exit 1
-  fi
-}
-
-rm -r -f $dir_new
-rm -r -f $dir_old
-
-echo "Running db sanity check with commits $commit_new and $commit_old."
-
-echo "============================================================="
-echo "Making build $commit_new"
-makestuff
-mv db_sanity_test new_db_sanity_test
-echo "Creating db based on the new commit --- $commit_new"
-./new_db_sanity_test $dir_new create
-
-echo "============================================================="
-echo "Making build $commit_old"
-makestuff
-mv db_sanity_test old_db_sanity_test
-echo "Creating db based on the old commit --- $commit_old"
-./old_db_sanity_test $dir_old create
-
-echo "============================================================="
-echo "Verifying new db $dir_new using the old commit --- $commit_old"
-./old_db_sanity_test $dir_new verify
-if [ $? -ne 0 ]; then
-  echo "[ERROR] Verification of $dir_new using commit $commit_old failed."
-  exit 2
-fi
-
-echo "============================================================="
-echo "Verifying old db $dir_old using the new commit --- $commit_new"
-./new_db_sanity_test $dir_old verify
-if [ $? -ne 0 ]; then
-  echo "[ERROR] Verification of $dir_old using commit $commit_new failed."
-  exit 2
-fi
-
-rm old_db_sanity_test
-rm new_db_sanity_test
-
-echo "Auto sanity test passed!"
diff --git a/src/rocksdb/tools/blob_store_bench.cc b/src/rocksdb/tools/blob_store_bench.cc
deleted file mode 100644
index 60a0b84..0000000
--- a/src/rocksdb/tools/blob_store_bench.cc
+++ /dev/null
@@ -1,280 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#include <cstdio>
-#include <vector>
-#include <atomic>
-
-#include "rocksdb/env.h"
-#include "util/blob_store.h"
-#include "util/testutil.h"
-
-#define KB 1024LL
-#define MB 1024*1024LL
-// BlobStore does costly asserts to make sure it's running correctly, which
-// significantly impacts benchmark runtime.
-// NDEBUG will compile out those asserts.
-#ifndef NDEBUG
-#define NDEBUG
-#endif
-
-using namespace rocksdb;
-using namespace std;
-
-// used by all threads
-uint64_t timeout_sec;
-Env *env;
-BlobStore* bs;
-
-namespace {
-std::string RandomString(Random* rnd, uint64_t len) {
-  std::string r;
-  test::RandomString(rnd, len, &r);
-  return r;
-}
-}  // namespace
-
-struct Result {
-  uint32_t writes;
-  uint32_t reads;
-  uint32_t deletes;
-  uint64_t data_written;
-  uint64_t data_read;
-
-  void print() {
-    printf("Total writes = %u\n", writes);
-    printf("Total reads = %u\n", reads);
-    printf("Total deletes = %u\n", deletes);
-    printf("Write throughput = %lf MB/s\n",
-           (double)data_written / (1024*1024.0) / timeout_sec);
-    printf("Read throughput = %lf MB/s\n",
-           (double)data_read / (1024*1024.0) / timeout_sec);
-    printf("Total throughput = %lf MB/s\n",
-           (double)(data_read + data_written) / (1024*1024.0) / timeout_sec);
-  }
-
-  Result() {
-    writes = reads = deletes = data_read = data_written = 0;
-  }
-
-  Result (uint32_t writes, uint32_t reads, uint32_t deletes,
-          uint64_t data_written, uint64_t data_read) :
-    writes(writes), reads(reads), deletes(deletes),
-    data_written(data_written), data_read(data_read) {}
-
-};
-
-namespace {
-Result operator + (const Result &a, const Result &b) {
-  return Result(a.writes + b.writes, a.reads + b.reads,
-                a.deletes + b.deletes, a.data_written + b.data_written,
-                a.data_read + b.data_read);
-}
-}  // namespace
-
-struct WorkerThread {
-  uint64_t data_size_from, data_size_to;
-  double read_ratio;
-  uint64_t working_set_size; // start deleting once you reach this
-  Result result;
-  atomic<bool> stopped;
-
-  WorkerThread(uint64_t data_size_from, uint64_t data_size_to,
-                double read_ratio, uint64_t working_set_size) :
-    data_size_from(data_size_from), data_size_to(data_size_to),
-    read_ratio(read_ratio), working_set_size(working_set_size),
-    stopped(false) {}
-
-  WorkerThread(const WorkerThread& wt) :
-    data_size_from(wt.data_size_from), data_size_to(wt.data_size_to),
-    read_ratio(wt.read_ratio), working_set_size(wt.working_set_size),
-    stopped(false) {}
-};
-
-static void WorkerThreadBody(void* arg) {
-  WorkerThread* t = reinterpret_cast<WorkerThread*>(arg);
-  Random rnd(5);
-  string buf;
-  vector<pair<Blob, uint64_t>> blobs;
-  vector<string> random_strings;
-
-  for (int i = 0; i < 10; ++i) {
-    random_strings.push_back(RandomString(&rnd, t->data_size_to));
-  }
-
-  uint64_t total_size = 0;
-
-  uint64_t start_micros = env->NowMicros();
-  while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) {
-    if (blobs.size() && rand() < RAND_MAX * t->read_ratio) {
-      // read
-      int bi = rand() % blobs.size();
-      Status s = bs->Get(blobs[bi].first, &buf);
-      assert(s.ok());
-      t->result.data_read += buf.size();
-      t->result.reads++;
-    } else {
-      // write
-      uint64_t size = rand() % (t->data_size_to - t->data_size_from) +
-        t->data_size_from;
-      total_size += size;
-      string put_str = random_strings[rand() % random_strings.size()];
-      blobs.push_back(make_pair(Blob(), size));
-      Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first);
-      assert(s.ok());
-      t->result.data_written += size;
-      t->result.writes++;
-    }
-
-    while (total_size >= t->working_set_size) {
-      // delete random
-      int bi = rand() % blobs.size();
-      total_size -= blobs[bi].second;
-      bs->Delete(blobs[bi].first);
-      blobs.erase(blobs.begin() + bi);
-      t->result.deletes++;
-    }
-  }
-  t->stopped.store(true);
-}
-
-namespace {
-Result StartBenchmark(vector<WorkerThread*>& config) {
-  for (auto w : config) {
-    env->StartThread(WorkerThreadBody, w);
-  }
-
-  Result result;
-
-  for (auto w : config) {
-    while (!w->stopped.load());
-    result = result + w->result;
-  }
-
-  for (auto w : config) {
-    delete w;
-  }
-
-  delete bs;
-
-  return result;
-}
-
-vector<WorkerThread*> SetupBenchmarkBalanced() {
-  string test_path;
-  env->GetTestDirectory(&test_path);
-  test_path.append("/blob_store");
-
-  // config start
-  uint32_t block_size = 16*KB;
-  uint32_t file_size = 1*MB;
-  double read_write_ratio = 0.5;
-  uint64_t data_read_from = 16*KB;
-  uint64_t data_read_to = 32*KB;
-  int number_of_threads = 10;
-  uint64_t working_set_size = 5*MB;
-  timeout_sec = 5;
-  // config end
-
-  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
-
-  vector <WorkerThread*> config;
-
-  for (int i = 0; i < number_of_threads; ++i) {
-    config.push_back(new WorkerThread(data_read_from,
-                                      data_read_to,
-                                      read_write_ratio,
-                                      working_set_size));
-  };
-
-  return config;
-}
-
-vector<WorkerThread*> SetupBenchmarkWriteHeavy() {
-  string test_path;
-  env->GetTestDirectory(&test_path);
-  test_path.append("/blob_store");
-
-  // config start
-  uint32_t block_size = 16*KB;
-  uint32_t file_size = 1*MB;
-  double read_write_ratio = 0.1;
-  uint64_t data_read_from = 16*KB;
-  uint64_t data_read_to = 32*KB;
-  int number_of_threads = 10;
-  uint64_t working_set_size = 5*MB;
-  timeout_sec = 5;
-  // config end
-
-  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
-
-  vector <WorkerThread*> config;
-
-  for (int i = 0; i < number_of_threads; ++i) {
-    config.push_back(new WorkerThread(data_read_from,
-                                      data_read_to,
-                                      read_write_ratio,
-                                      working_set_size));
-  };
-
-  return config;
-}
-
-vector<WorkerThread*> SetupBenchmarkReadHeavy() {
-  string test_path;
-  env->GetTestDirectory(&test_path);
-  test_path.append("/blob_store");
-
-  // config start
-  uint32_t block_size = 16*KB;
-  uint32_t file_size = 1*MB;
-  double read_write_ratio = 0.9;
-  uint64_t data_read_from = 16*KB;
-  uint64_t data_read_to = 32*KB;
-  int number_of_threads = 10;
-  uint64_t working_set_size = 5*MB;
-  timeout_sec = 5;
-  // config end
-
-  bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env);
-
-  vector <WorkerThread*> config;
-
-  for (int i = 0; i < number_of_threads; ++i) {
-    config.push_back(new WorkerThread(data_read_from,
-                                      data_read_to,
-                                      read_write_ratio,
-                                      working_set_size));
-  };
-
-  return config;
-}
-}  // namespace
-
-int main(int argc, const char** argv) {
-  srand(33);
-  env = Env::Default();
-
-  {
-    printf("--- Balanced read/write benchmark ---\n");
-    vector <WorkerThread*> config = SetupBenchmarkBalanced();
-    Result r = StartBenchmark(config);
-    r.print();
-  }
-  {
-    printf("--- Write heavy benchmark ---\n");
-    vector <WorkerThread*> config = SetupBenchmarkWriteHeavy();
-    Result r = StartBenchmark(config);
-    r.print();
-  }
-  {
-    printf("--- Read heavy benchmark ---\n");
-    vector <WorkerThread*> config = SetupBenchmarkReadHeavy();
-    Result r = StartBenchmark(config);
-    r.print();
-  }
-
-  return 0;
-}
diff --git a/src/rocksdb/tools/db_crashtest.py b/src/rocksdb/tools/db_crashtest.py
deleted file mode 100644
index 3c93eca..0000000
--- a/src/rocksdb/tools/db_crashtest.py
+++ /dev/null
@@ -1,150 +0,0 @@
-#! /usr/bin/env python
-import os
-import re
-import sys
-import time
-import random
-import getopt
-import logging
-import tempfile
-import subprocess
-import shutil
-
-# This script runs and kills db_stress multiple times. It checks consistency
-# in case of unsafe crashes in RocksDB.
-
-def main(argv):
-    try:
-        opts, args = getopt.getopt(argv, "hd:t:i:o:b:")
-    except getopt.GetoptError:
-        print("db_crashtest.py -d <duration_test> -t <#threads> "
-              "-i <interval for one run> -o <ops_per_thread> "
-              "-b <write_buffer_size>\n")
-        sys.exit(2)
-
-    # default values, will be overridden by cmdline args
-    interval = 120  # time for one db_stress instance to run
-    duration = 6000  # total time for this script to test db_stress
-    threads = 32
-    # since we will be killing anyway, use large value for ops_per_thread
-    ops_per_thread = 100000000
-    write_buf_size = 4 * 1024 * 1024
-
-    for opt, arg in opts:
-        if opt == '-h':
-            print("db_crashtest.py -d <duration_test>"
-                  " -t <#threads> -i <interval for one run>"
-                  " -o <ops_per_thread> -b <write_buffer_size>\n")
-            sys.exit()
-        elif opt == "-d":
-            duration = int(arg)
-        elif opt == "-t":
-            threads = int(arg)
-        elif opt == "-i":
-            interval = int(arg)
-        elif opt == "-o":
-            ops_per_thread = int(arg)
-        elif opt == "-b":
-            write_buf_size = int(arg)
-        else:
-            print("db_crashtest.py -d <duration_test>"
-                  " -t <#threads> -i <interval for one run>"
-                  " -o <ops_per_thread> -b <write_buffer_size>\n")
-            sys.exit(2)
-
-    exit_time = time.time() + duration
-
-    print("Running blackbox-crash-test with \ninterval_between_crash="
-          + str(interval) + "\ntotal-duration=" + str(duration)
-          + "\nthreads=" + str(threads) + "\nops_per_thread="
-          + str(ops_per_thread) + "\nwrite_buffer_size="
-          + str(write_buf_size) + "\n")
-
-    dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
-
-    while time.time() < exit_time:
-        run_had_errors = False
-        killtime = time.time() + interval
-
-        cmd = re.sub('\s+', ' ', """
-            ./db_stress
-            --test_batches_snapshots=1
-            --ops_per_thread=%s
-            --threads=%s
-            --write_buffer_size=%s
-            --destroy_db_initially=0
-            --reopen=20
-            --readpercent=45
-            --prefixpercent=5
-            --writepercent=35
-            --delpercent=5
-            --iterpercent=10
-            --db=%s
-            --max_key=100000000
-            --disable_seek_compaction=%s
-            --mmap_read=%s
-            --block_size=16384
-            --cache_size=1048576
-            --open_files=500000
-            --verify_checksum=1
-            --sync=0
-            --progress_reports=0
-            --disable_wal=0
-            --disable_data_sync=1
-            --target_file_size_base=2097152
-            --target_file_size_multiplier=2
-            --max_write_buffer_number=3
-            --max_background_compactions=20
-            --max_bytes_for_level_base=10485760
-            --filter_deletes=%s
-            --memtablerep=prefix_hash
-            --prefix_size=7
-            """ % (ops_per_thread,
-                   threads,
-                   write_buf_size,
-                   dbname,
-                   random.randint(0, 1),
-                   random.randint(0, 1),
-                   random.randint(0, 1)))
-
-        child = subprocess.Popen([cmd],
-                                 stderr=subprocess.PIPE, shell=True)
-        print("Running db_stress with pid=%d: %s\n\n"
-              % (child.pid, cmd))
-
-        stop_early = False
-        while time.time() < killtime:
-            if child.poll() is not None:
-                print("WARNING: db_stress ended before kill: exitcode=%d\n"
-                      % child.returncode)
-                stop_early = True
-                break
-            time.sleep(1)
-
-        if not stop_early:
-            if child.poll() is not None:
-                print("WARNING: db_stress ended before kill: exitcode=%d\n"
-                      % child.returncode)
-            else:
-                child.kill()
-                print("KILLED %d\n" % child.pid)
-                time.sleep(1)  # time to stabilize after a kill
-
-        while True:
-            line = child.stderr.readline().strip()
-            if line != '':
-                run_had_errors = True
-                print('***' + line + '^')
-            else:
-                break
-
-        if run_had_errors:
-            sys.exit(2)
-
-        time.sleep(1)  # time to stabilize before the next run
-
-    # we need to clean up after ourselves -- only do this on test success
-    shutil.rmtree(dbname, True)
-
-if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
diff --git a/src/rocksdb/tools/db_crashtest2.py b/src/rocksdb/tools/db_crashtest2.py
deleted file mode 100644
index 0a12b5a..0000000
--- a/src/rocksdb/tools/db_crashtest2.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#! /usr/bin/env python
-import os
-import re
-import sys
-import time
-import random
-import getopt
-import logging
-import tempfile
-import subprocess
-import shutil
-
-# This python script runs db_stress multiple times. Some runs with
-# kill_random_test that causes rocksdb to crash at various points in code.
-
-def main(argv):
-    try:
-        opts, args = getopt.getopt(argv, "hd:t:k:o:b:")
-    except getopt.GetoptError:
-        print str(getopt.GetoptError)
-        print "db_crashtest2.py -d <duration_test> -t <#threads> " \
-              "-k <kills with prob 1/k> -o <ops_per_thread> "\
-              "-b <write_buffer_size>\n"
-        sys.exit(2)
-
-    # default values, will be overridden by cmdline args
-    kill_random_test = 97  # kill with probability 1/97 by default
-    duration = 10000  # total time for this script to test db_stress
-    threads = 32
-    ops_per_thread = 200000
-    write_buf_size = 4 * 1024 * 1024
-
-    for opt, arg in opts:
-        if opt == '-h':
-            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
-                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
-                  "-b <write_buffer_size>\n"
-            sys.exit()
-        elif opt == "-d":
-            duration = int(arg)
-        elif opt == "-t":
-            threads = int(arg)
-        elif opt == "-k":
-            kill_random_test = int(arg)
-        elif opt == "-o":
-            ops_per_thread = int(arg)
-        elif opt == "-b":
-            write_buf_size = int(arg)
-        else:
-            print "unrecognized option " + str(opt) + "\n"
-            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
-                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
-                  "-b <write_buffer_size>\n"
-            sys.exit(2)
-
-    exit_time = time.time() + duration
-
-    print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
-          + "\nthreads=" + str(threads) + "\nops_per_thread=" \
-          + str(ops_per_thread) + "\nwrite_buffer_size=" \
-          + str(write_buf_size) + "\n"
-
-    total_check_mode = 3
-    check_mode = 0
-
-    while time.time() < exit_time:
-        killoption = ""
-        if check_mode == 0:
-            # run with kill_random_test
-            killoption = " --kill_random_test=" + str(kill_random_test)
-            # use large ops per thread since we will kill it anyway
-            additional_opts = "--ops_per_thread=" + \
-                              str(100 * ops_per_thread) + killoption
-        elif check_mode == 1:
-            # normal run with universal compaction mode
-            additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
-                              " --compaction_style=1"
-        else:
-            # nomral run
-            additional_opts = "--ops_per_thread=" + str(ops_per_thread)
-
-        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
-        cmd = re.sub('\s+', ' ', """
-            ./db_stress
-            --test_batches_snapshots=%s
-            --threads=%s
-            --write_buffer_size=%s
-            --destroy_db_initially=0
-            --reopen=20
-            --readpercent=45
-            --prefixpercent=5
-            --writepercent=35
-            --delpercent=5
-            --iterpercent=10
-            --db=%s
-            --max_key=100000000
-            --disable_seek_compaction=%s
-            --mmap_read=%s
-            --block_size=16384
-            --cache_size=1048576
-            --open_files=500000
-            --verify_checksum=1
-            --sync=0
-            --progress_reports=0
-            --disable_wal=0
-            --disable_data_sync=1
-            --target_file_size_base=2097152
-            --target_file_size_multiplier=2
-            --max_write_buffer_number=3
-            --max_background_compactions=20
-            --max_bytes_for_level_base=10485760
-            --filter_deletes=%s
-            --memtablerep=prefix_hash
-            --prefix_size=7
-            %s
-            """ % (random.randint(0, 1),
-                   threads,
-                   write_buf_size,
-                   dbname,
-                   random.randint(0, 1),
-                   random.randint(0, 1),
-                   random.randint(0, 1),
-                   additional_opts))
-
-        print "Running:" + cmd + "\n"
-
-        popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
-                                 stderr=subprocess.STDOUT,
-                                 shell=True)
-        stdoutdata, stderrdata = popen.communicate()
-        retncode = popen.returncode
-        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
-               check_mode, killoption, retncode))
-        print msg
-        print stdoutdata
-
-        expected = False
-        if (killoption == '') and (retncode == 0):
-            # we expect zero retncode if no kill option
-            expected = True
-        elif killoption != '' and retncode < 0:
-            # we expect negative retncode if kill option was given
-            expected = True
-
-        if not expected:
-            print "TEST FAILED. See kill option and exit code above!!!\n"
-            sys.exit(1)
-
-        stdoutdata = stdoutdata.lower()
-        errorcount = (stdoutdata.count('error') -
-                      stdoutdata.count('got errors 0 times'))
-        print "#times error occurred in output is " + str(errorcount) + "\n"
-
-        if (errorcount > 0):
-            print "TEST FAILED. Output has 'error'!!!\n"
-            sys.exit(2)
-        if (stdoutdata.find('fail') >= 0):
-            print "TEST FAILED. Output has 'fail'!!!\n"
-            sys.exit(2)
-        # we need to clean up after ourselves -- only do this on test success
-        shutil.rmtree(dbname, True)
-
-        check_mode = (check_mode + 1) % total_check_mode
-
-        time.sleep(1)  # time to stabilize after a kill
-
-if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
diff --git a/src/rocksdb/tools/db_repl_stress.cc b/src/rocksdb/tools/db_repl_stress.cc
deleted file mode 100644
index 27cb6d5..0000000
--- a/src/rocksdb/tools/db_repl_stress.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-#include <cstdio>
-
-#include <gflags/gflags.h>
-
-#include "db/write_batch_internal.h"
-#include "rocksdb/db.h"
-#include "rocksdb/types.h"
-#include "port/atomic_pointer.h"
-#include "util/testutil.h"
-
-
-// Run a thread to perform Put's.
-// Another thread uses GetUpdatesSince API to keep getting the updates.
-// options :
-// --num_inserts = the num of inserts the first thread should perform.
-// --wal_ttl = the wal ttl for the run.
-
-using namespace rocksdb;
-
-struct DataPumpThread {
-  size_t no_records;
-  DB* db; // Assumption DB is Open'ed already.
-};
-
-static std::string RandomString(Random* rnd, int len) {
-  std::string r;
-  test::RandomString(rnd, len, &r);
-  return r;
-}
-
-static void DataPumpThreadBody(void* arg) {
-  DataPumpThread* t = reinterpret_cast<DataPumpThread*>(arg);
-  DB* db = t->db;
-  Random rnd(301);
-  size_t i = 0;
-  while(i++ < t->no_records) {
-    if(!db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)),
-                Slice(RandomString(&rnd, 500))).ok()) {
-      fprintf(stderr, "Error in put\n");
-      exit(1);
-    }
-  }
-}
-
-struct ReplicationThread {
-  port::AtomicPointer stop;
-  DB* db;
-  volatile size_t no_read;
-};
-
-static void ReplicationThreadBody(void* arg) {
-  ReplicationThread* t = reinterpret_cast<ReplicationThread*>(arg);
-  DB* db = t->db;
-  unique_ptr<TransactionLogIterator> iter;
-  SequenceNumber currentSeqNum = 1;
-  while (t->stop.Acquire_Load() != nullptr) {
-    iter.reset();
-    Status s;
-    while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) {
-      if (t->stop.Acquire_Load() == nullptr) {
-        return;
-      }
-    }
-    fprintf(stderr, "Refreshing iterator\n");
-    for(;iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) {
-      BatchResult res = iter->GetBatch();
-      if (res.sequence != currentSeqNum) {
-        fprintf(stderr,
-                "Missed a seq no. b/w %ld and %ld\n",
-                (long)currentSeqNum,
-                (long)res.sequence);
-        exit(1);
-      }
-    }
-  }
-}
-
-DEFINE_uint64(num_inserts, 1000, "the num of inserts the first thread should"
-              " perform.");
-DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)");
-DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run"
-              "(in MB)");
-
-int main(int argc, const char** argv) {
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
-    " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
-    " --wal_size_limit_MB=<WAL_size_limit_MB>");
-  google::ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
-
-  Env* env = Env::Default();
-  std::string default_db_path;
-  env->GetTestDirectory(&default_db_path);
-  default_db_path += "db_repl_stress";
-  Options options;
-  options.create_if_missing = true;
-  options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
-  options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
-  DB* db;
-  DestroyDB(default_db_path, options);
-
-  Status s = DB::Open(options, default_db_path, &db);
-
-  if (!s.ok()) {
-    fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str());
-    exit(1);
-  }
-
-  DataPumpThread dataPump;
-  dataPump.no_records = FLAGS_num_inserts;
-  dataPump.db = db;
-  env->StartThread(DataPumpThreadBody, &dataPump);
-
-  ReplicationThread replThread;
-  replThread.db = db;
-  replThread.no_read = 0;
-  replThread.stop.Release_Store(env); // store something to make it non-null.
-
-  env->StartThread(ReplicationThreadBody, &replThread);
-  while(replThread.no_read < FLAGS_num_inserts);
-  replThread.stop.Release_Store(nullptr);
-  if (replThread.no_read < dataPump.no_records) {
-    // no. read should be => than inserted.
-    fprintf(stderr, "No. of Record's written and read not same\nRead : %zu"
-            " Written : %zu\n", replThread.no_read, dataPump.no_records);
-    exit(1);
-  }
-  fprintf(stderr, "Successful!\n");
-  exit(0);
-}
diff --git a/src/rocksdb/tools/db_sanity_test.cc b/src/rocksdb/tools/db_sanity_test.cc
deleted file mode 100644
index e970f5e..0000000
--- a/src/rocksdb/tools/db_sanity_test.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#include <cstdio>
-#include <vector>
-#include <memory>
-
-#include "include/rocksdb/db.h"
-#include "include/rocksdb/options.h"
-#include "include/rocksdb/env.h"
-#include "include/rocksdb/slice.h"
-#include "include/rocksdb/status.h"
-#include "include/rocksdb/comparator.h"
-#include "include/rocksdb/table.h"
-#include "include/rocksdb/slice_transform.h"
-
-namespace rocksdb {
-
-class SanityTest {
- public:
-  explicit SanityTest(const std::string& path)
-      : env_(Env::Default()), path_(path) {
-    env_->CreateDirIfMissing(path);
-  }
-  virtual ~SanityTest() {}
-
-  virtual std::string Name() const = 0;
-  virtual Options GetOptions() const = 0;
-
-  Status Create() {
-    Options options = GetOptions();
-    options.create_if_missing = true;
-    std::string dbname = path_ + Name();
-    DestroyDB(dbname, options);
-    DB* db;
-    Status s = DB::Open(options, dbname, &db);
-    std::unique_ptr<DB> db_guard(db);
-    if (!s.ok()) {
-      return s;
-    }
-    for (int i = 0; i < 1000000; ++i) {
-      std::string k = "key" + std::to_string(i);
-      std::string v = "value" + std::to_string(i);
-      s = db->Put(WriteOptions(), Slice(k), Slice(v));
-      if (!s.ok()) {
-        return s;
-      }
-    }
-    return Status::OK();
-  }
-  Status Verify() {
-    DB* db;
-    std::string dbname = path_ + Name();
-    Status s = DB::Open(GetOptions(), dbname, &db);
-    std::unique_ptr<DB> db_guard(db);
-    if (!s.ok()) {
-      return s;
-    }
-    for (int i = 0; i < 1000000; ++i) {
-      std::string k = "key" + std::to_string(i);
-      std::string v = "value" + std::to_string(i);
-      std::string result;
-      s = db->Get(ReadOptions(), Slice(k), &result);
-      if (!s.ok()) {
-        return s;
-      }
-      if (result != v) {
-        return Status::Corruption("Unexpected value for key " + k);
-      }
-    }
-    return Status::OK();
-  }
-
- private:
-  Env* env_;
-  std::string const path_;
-};
-
-class SanityTestBasic : public SanityTest {
- public:
-  explicit SanityTestBasic(const std::string& path) : SanityTest(path) {}
-  virtual Options GetOptions() const {
-    Options options;
-    options.create_if_missing = true;
-    return options;
-  }
-  virtual std::string Name() const { return "Basic"; }
-};
-
-class SanityTestSpecialComparator : public SanityTest {
- public:
-  explicit SanityTestSpecialComparator(const std::string& path)
-      : SanityTest(path) {
-    options_.comparator = new NewComparator();
-  }
-  ~SanityTestSpecialComparator() { delete options_.comparator; }
-  virtual Options GetOptions() const { return options_; }
-  virtual std::string Name() const { return "SpecialComparator"; }
-
- private:
-  class NewComparator : public Comparator {
-   public:
-    virtual const char* Name() const { return "rocksdb.NewComparator"; }
-    virtual int Compare(const Slice& a, const Slice& b) const {
-      return BytewiseComparator()->Compare(a, b);
-    }
-    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
-      BytewiseComparator()->FindShortestSeparator(s, l);
-    }
-    virtual void FindShortSuccessor(std::string* key) const {
-      BytewiseComparator()->FindShortSuccessor(key);
-    }
-  };
-  Options options_;
-};
-
-class SanityTestZlibCompression : public SanityTest {
- public:
-  explicit SanityTestZlibCompression(const std::string& path)
-      : SanityTest(path) {
-    options_.compression = kZlibCompression;
-  }
-  virtual Options GetOptions() const { return options_; }
-  virtual std::string Name() const { return "ZlibCompression"; }
-
- private:
-  Options options_;
-};
-
-class SanityTestPlainTableFactory : public SanityTest {
- public:
-  explicit SanityTestPlainTableFactory(const std::string& path)
-      : SanityTest(path) {
-    options_.table_factory.reset(NewPlainTableFactory());
-    options_.prefix_extractor.reset(NewFixedPrefixTransform(2));
-    options_.allow_mmap_reads = true;
-  }
-  ~SanityTestPlainTableFactory() {}
-  virtual Options GetOptions() const { return options_; }
-  virtual std::string Name() const { return "PlainTable"; }
-
- private:
-  Options options_;
-};
-
-namespace {
-bool RunSanityTests(const std::string& command, const std::string& path) {
-  std::vector<SanityTest*> sanity_tests = {
-      new SanityTestBasic(path),
-      new SanityTestSpecialComparator(path),
-      new SanityTestZlibCompression(path),
-      new SanityTestPlainTableFactory(path)};
-
-  if (command == "create") {
-    fprintf(stderr, "Creating...\n");
-  } else {
-    fprintf(stderr, "Verifying...\n");
-  }
-  for (auto sanity_test : sanity_tests) {
-    Status s;
-    fprintf(stderr, "%s -- ", sanity_test->Name().c_str());
-    if (command == "create") {
-      s = sanity_test->Create();
-    } else {
-      assert(command == "verify");
-      s = sanity_test->Verify();
-    }
-    fprintf(stderr, "%s\n", s.ToString().c_str());
-    if (!s.ok()) {
-      fprintf(stderr, "FAIL\n");
-      return false;
-    }
-
-    delete sanity_test;
-  }
-  return true;
-}
-}  // namespace
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  std::string path, command;
-  bool ok = (argc == 3);
-  if (ok) {
-    path = std::string(argv[1]);
-    command = std::string(argv[2]);
-    ok = (command == "create" || command == "verify");
-  }
-  if (!ok) {
-    fprintf(stderr, "Usage: %s <path> [create|verify] \n", argv[0]);
-    exit(1);
-  }
-  if (path.back() != '/') {
-    path += "/";
-  }
-
-  bool sanity_ok = rocksdb::RunSanityTests(command, path);
-
-  return sanity_ok ? 0 : 1;
-}
diff --git a/src/rocksdb/tools/db_stress.cc b/src/rocksdb/tools/db_stress.cc
deleted file mode 100644
index c774171..0000000
--- a/src/rocksdb/tools/db_stress.cc
+++ /dev/null
@@ -1,1732 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// The test uses an array to compare against values written to the database.
-// Keys written to the array are in 1:1 correspondence to the actual values in
-// the database according to the formula in the function GenerateValue.
-
-// Space is reserved in the array from 0 to FLAGS_max_key and values are
-// randomly written/deleted/read from those positions. During verification we
-// compare all the positions in the array. To shorten/elongate the running
-// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
-// (sometimes also FLAGS_threads).
-//
-// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
-// different behavior. See comment of the flag for details.
-
-#include <sys/types.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <gflags/gflags.h>
-#include "db/db_impl.h"
-#include "db/version_set.h"
-#include "rocksdb/statistics.h"
-#include "rocksdb/cache.h"
-#include "utilities/db_ttl.h"
-#include "rocksdb/env.h"
-#include "rocksdb/write_batch.h"
-#include "rocksdb/slice.h"
-#include "rocksdb/slice_transform.h"
-#include "rocksdb/statistics.h"
-#include "port/port.h"
-#include "util/coding.h"
-#include "util/crc32c.h"
-#include "util/histogram.h"
-#include "util/mutexlock.h"
-#include "util/random.h"
-#include "util/testutil.h"
-#include "util/logging.h"
-#include "hdfs/env_hdfs.h"
-#include "utilities/merge_operators.h"
-
-static const long KB = 1024;
-
-
-static bool ValidateUint32Range(const char* flagname, uint64_t value) {
-  if (value > std::numeric_limits<uint32_t>::max()) {
-    fprintf(stderr,
-            "Invalid value for --%s: %lu, overflow\n",
-            flagname,
-            (unsigned long)value);
-    return false;
-  }
-  return true;
-}
-
-DEFINE_uint64(seed, 2341234, "Seed for PRNG");
-static const bool FLAGS_seed_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
-
-DEFINE_int64(max_key, 1 * KB* KB,
-             "Max number of key/values to place in database");
-
-DEFINE_int32(column_families, 10, "Number of column families");
-
-DEFINE_bool(test_batches_snapshots, false,
-            "If set, the test uses MultiGet(), Multiut() and MultiDelete()"
-            " which read/write/delete multiple keys in a batch. In this mode,"
-            " we do not verify db content by comparing the content with the "
-            "pre-allocated array. Instead, we do partial verification inside"
-            " MultiGet() by checking various values in a batch. Benefit of"
-            " this mode:\n"
-            "\t(a) No need to acquire mutexes during writes (less cache "
-            "flushes in multi-core leading to speed up)\n"
-            "\t(b) No long validation at the end (more speed up)\n"
-            "\t(c) Test snapshot and atomicity of batch writes");
-
-DEFINE_int32(threads, 32, "Number of concurrent threads to run.");
-
-DEFINE_int32(ttl, -1,
-             "Opens the db with this ttl value if this is not -1. "
-             "Carefully specify a large value such that verifications on "
-             "deleted values don't fail");
-
-DEFINE_int32(value_size_mult, 8,
-             "Size of value will be this number times rand_int(1,3) bytes");
-
-DEFINE_bool(verify_before_write, false, "Verify before write");
-
-DEFINE_bool(histogram, false, "Print histogram of operation timings");
-
-DEFINE_bool(destroy_db_initially, true,
-            "Destroys the database dir before start if this is true");
-
-DEFINE_bool(verbose, false, "Verbose");
-
-DEFINE_bool(progress_reports, true,
-            "If true, db_stress will report number of finished operations");
-
-DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
-             "Number of bytes to buffer in memtable before compacting");
-
-DEFINE_int32(max_write_buffer_number,
-             rocksdb::Options().max_write_buffer_number,
-             "The number of in-memory memtables. "
-             "Each memtable is of size FLAGS_write_buffer_size.");
-
-DEFINE_int32(min_write_buffer_number_to_merge,
-             rocksdb::Options().min_write_buffer_number_to_merge,
-             "The minimum number of write buffers that will be merged together "
-             "before writing to storage. This is cheap because it is an "
-             "in-memory merge. If this feature is not enabled, then all these "
-             "write buffers are flushed to L0 as separate files and this "
-             "increases read amplification because a get request has to check "
-             "in all of these files. Also, an in-memory merge may result in "
-             "writing less data to storage if there are duplicate records in"
-             " each of these individual write buffers.");
-
-DEFINE_int32(open_files, rocksdb::Options().max_open_files,
-             "Maximum number of files to keep open at the same time "
-             "(use default if == 0)");
-
-DEFINE_int64(compressed_cache_size, -1,
-             "Number of bytes to use as a cache of compressed data."
-             " Negative means use default settings.");
-
-DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, "");
-
-DEFINE_int32(level0_file_num_compaction_trigger,
-             rocksdb::Options().level0_file_num_compaction_trigger,
-             "Level0 compaction start trigger");
-
-DEFINE_int32(level0_slowdown_writes_trigger,
-             rocksdb::Options().level0_slowdown_writes_trigger,
-             "Number of files in level-0 that will slow down writes");
-
-DEFINE_int32(level0_stop_writes_trigger,
-             rocksdb::Options().level0_stop_writes_trigger,
-             "Number of files in level-0 that will trigger put stop.");
-
-DEFINE_int32(block_size, rocksdb::Options().block_size,
-             "Number of bytes in a block.");
-
-DEFINE_int32(max_background_compactions,
-             rocksdb::Options().max_background_compactions,
-             "The maximum number of concurrent background compactions "
-             "that can occur in parallel.");
-
-DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
-             "The maximum number of concurrent background flushes "
-             "that can occur in parallel.");
-
-DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
-             " compaction in universal style");
-
-DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to "
-             "compact in universal style compaction");
-
-DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
-             " in universal style compaction");
-
-DEFINE_int32(universal_max_size_amplification_percent, 0,
-             "The max size amplification for universal style compaction");
-
-DEFINE_int32(clear_column_family_one_in, 1000000,
-             "With a chance of 1/N, delete a column family and then recreate "
-             "it again. If N == 0, never drop/create column families. "
-             "When test_batches_snapshots is true, this flag has no effect");
-
-DEFINE_int64(cache_size, 2 * KB * KB * KB,
-             "Number of bytes to use as a cache of uncompressed data.");
-
-static bool ValidateInt32Positive(const char* flagname, int32_t value) {
-  if (value < 0) {
-    fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n",
-            flagname, value);
-    return false;
-  }
-  return true;
-}
-DEFINE_int32(reopen, 10, "Number of times database reopens");
-static const bool FLAGS_reopen_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
-
-DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
-             "Negative means use default settings.");
-
-DEFINE_string(db, "", "Use the db with the following name.");
-
-DEFINE_bool(verify_checksum, false,
-            "Verify checksum for every block read from storage");
-
-DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
-            "Allow reads to occur via mmap-ing files");
-
-// Database statistics
-static std::shared_ptr<rocksdb::Statistics> dbstats;
-DEFINE_bool(statistics, false, "Create database statistics");
-
-DEFINE_bool(sync, false, "Sync all writes to disk");
-
-DEFINE_bool(disable_data_sync, false,
-            "If true, do not wait until data is synced to disk.");
-
-DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
-
-DEFINE_int32(kill_random_test, 0,
-             "If non-zero, kill at various points in source code with "
-             "probability 1/this");
-static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_kill_random_test,
-                                  &ValidateInt32Positive);
-extern int rocksdb_kill_odds;
-
-DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
-
-DEFINE_int32(target_file_size_base, 64 * KB,
-             "Target level-1 file size for compaction");
-
-DEFINE_int32(target_file_size_multiplier, 1,
-             "A multiplier to compute targe level-N file size (N >= 2)");
-
-DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1");
-
-DEFINE_int32(max_bytes_for_level_multiplier, 2,
-             "A multiplier to compute max bytes for level-N (N >= 2)");
-
-static bool ValidateInt32Percent(const char* flagname, int32_t value) {
-  if (value < 0 || value>100) {
-    fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n",
-            flagname, value);
-    return false;
-  }
-  return true;
-}
-DEFINE_int32(readpercent, 10,
-             "Ratio of reads to total workload (expressed as a percentage)");
-static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
-
-DEFINE_int32(prefixpercent, 20,
-             "Ratio of prefix iterators to total workload (expressed as a"
-             " percentage)");
-static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
-
-DEFINE_int32(writepercent, 45,
-             " Ratio of deletes to total workload (expressed as a percentage)");
-static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
-
-DEFINE_int32(delpercent, 15,
-             "Ratio of deletes to total workload (expressed as a percentage)");
-static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
-
-DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
-             " (expressed as a percentage)");
-static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
-
-DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
-static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
-
-DEFINE_bool(disable_seek_compaction, false,
-            "Option to disable compation triggered by read.");
-
-namespace {
-enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
-  assert(ctype);
-
-  if (!strcasecmp(ctype, "none"))
-    return rocksdb::kNoCompression;
-  else if (!strcasecmp(ctype, "snappy"))
-    return rocksdb::kSnappyCompression;
-  else if (!strcasecmp(ctype, "zlib"))
-    return rocksdb::kZlibCompression;
-  else if (!strcasecmp(ctype, "bzip2"))
-    return rocksdb::kBZip2Compression;
-  else if (!strcasecmp(ctype, "lz4"))
-    return rocksdb::kLZ4Compression;
-  else if (!strcasecmp(ctype, "lz4hc"))
-    return rocksdb::kLZ4HCCompression;
-
-  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
-  return rocksdb::kSnappyCompression; //default value
-}
-}  // namespace
-
-DEFINE_string(compression_type, "snappy",
-              "Algorithm to use to compress the database");
-static enum rocksdb::CompressionType FLAGS_compression_type_e =
-    rocksdb::kSnappyCompression;
-
-DEFINE_string(hdfs, "", "Name of hdfs environment");
-// posix or hdfs environment
-static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
-
-DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
-static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
-
-DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
-static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
-                                  &ValidateUint32Range);
-
-DEFINE_int32(purge_redundant_percent, 50,
-             "Percentage of times we want to purge redundant keys in memory "
-             "before flushing");
-static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
-                                  &ValidateInt32Percent);
-
-DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
-            " the delete if key not present");
-
-enum RepFactory {
-  kSkipList,
-  kHashSkipList,
-  kVectorRep
-};
-
-namespace {
-enum RepFactory StringToRepFactory(const char* ctype) {
-  assert(ctype);
-
-  if (!strcasecmp(ctype, "skip_list"))
-    return kSkipList;
-  else if (!strcasecmp(ctype, "prefix_hash"))
-    return kHashSkipList;
-  else if (!strcasecmp(ctype, "vector"))
-    return kVectorRep;
-
-  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
-  return kSkipList;
-}
-}  // namespace
-
-static enum RepFactory FLAGS_rep_factory;
-DEFINE_string(memtablerep, "prefix_hash", "");
-
-static bool ValidatePrefixSize(const char* flagname, int32_t value) {
-  if (value < 0 || value > 8) {
-    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize <= 8\n",
-            flagname, value);
-    return false;
-  }
-  return true;
-}
-DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep");
-static const bool FLAGS_prefix_size_dummy =
-  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
-
-DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
-            "that behaves like a Put");
-
-
-namespace rocksdb {
-
-// convert long to a big-endian slice key
-static std::string Key(long val) {
-  std::string little_endian_key;
-  std::string big_endian_key;
-  PutFixed64(&little_endian_key, val);
-  assert(little_endian_key.size() == sizeof(val));
-  big_endian_key.resize(sizeof(val));
-  for (int i=0; i<(int)sizeof(val); i++) {
-    big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
-  }
-  return big_endian_key;
-}
-
-static std::string StringToHex(const std::string& str) {
-  std::string result = "0x";
-  char buf[10];
-  for (size_t i = 0; i < str.length(); i++) {
-    snprintf(buf, 10, "%02X", (unsigned char)str[i]);
-    result += buf;
-  }
-  return result;
-}
-
-
-class StressTest;
-namespace {
-
-class Stats {
- private:
-  double start_;
-  double finish_;
-  double seconds_;
-  long done_;
-  long gets_;
-  long prefixes_;
-  long writes_;
-  long deletes_;
-  long iterator_size_sums_;
-  long founds_;
-  long iterations_;
-  long errors_;
-  int next_report_;
-  size_t bytes_;
-  double last_op_finish_;
-  HistogramImpl hist_;
-
- public:
-  Stats() { }
-
-  void Start() {
-    next_report_ = 100;
-    hist_.Clear();
-    done_ = 0;
-    gets_ = 0;
-    prefixes_ = 0;
-    writes_ = 0;
-    deletes_ = 0;
-    iterator_size_sums_ = 0;
-    founds_ = 0;
-    iterations_ = 0;
-    errors_ = 0;
-    bytes_ = 0;
-    seconds_ = 0;
-    start_ = FLAGS_env->NowMicros();
-    last_op_finish_ = start_;
-    finish_ = start_;
-  }
-
-  void Merge(const Stats& other) {
-    hist_.Merge(other.hist_);
-    done_ += other.done_;
-    gets_ += other.gets_;
-    prefixes_ += other.prefixes_;
-    writes_ += other.writes_;
-    deletes_ += other.deletes_;
-    iterator_size_sums_ += other.iterator_size_sums_;
-    founds_ += other.founds_;
-    iterations_ += other.iterations_;
-    errors_ += other.errors_;
-    bytes_ += other.bytes_;
-    seconds_ += other.seconds_;
-    if (other.start_ < start_) start_ = other.start_;
-    if (other.finish_ > finish_) finish_ = other.finish_;
-  }
-
-  void Stop() {
-    finish_ = FLAGS_env->NowMicros();
-    seconds_ = (finish_ - start_) * 1e-6;
-  }
-
-  void FinishedSingleOp() {
-    if (FLAGS_histogram) {
-      double now = FLAGS_env->NowMicros();
-      double micros = now - last_op_finish_;
-      hist_.Add(micros);
-      if (micros > 20000) {
-        fprintf(stdout, "long op: %.1f micros%30s\r", micros, "");
-      }
-      last_op_finish_ = now;
-    }
-
-      done_++;
-    if (FLAGS_progress_reports) {
-      if (done_ >= next_report_) {
-        if      (next_report_ < 1000)   next_report_ += 100;
-        else if (next_report_ < 5000)   next_report_ += 500;
-        else if (next_report_ < 10000)  next_report_ += 1000;
-        else if (next_report_ < 50000)  next_report_ += 5000;
-        else if (next_report_ < 100000) next_report_ += 10000;
-        else if (next_report_ < 500000) next_report_ += 50000;
-        else                            next_report_ += 100000;
-        fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
-      }
-    }
-  }
-
-  void AddBytesForWrites(int nwrites, size_t nbytes) {
-    writes_ += nwrites;
-    bytes_ += nbytes;
-  }
-
-  void AddGets(int ngets, int nfounds) {
-    founds_ += nfounds;
-    gets_ += ngets;
-  }
-
-  void AddPrefixes(int nprefixes, int count) {
-    prefixes_ += nprefixes;
-    iterator_size_sums_ += count;
-  }
-
-  void AddIterations(int n) {
-    iterations_ += n;
-  }
-
-  void AddDeletes(int n) {
-    deletes_ += n;
-  }
-
-  void AddErrors(int n) {
-    errors_ += n;
-  }
-
-  void Report(const char* name) {
-    std::string extra;
-    if (bytes_ < 1 || done_ < 1) {
-      fprintf(stderr, "No writes or ops?\n");
-      return;
-    }
-
-    double elapsed = (finish_ - start_) * 1e-6;
-    double bytes_mb = bytes_ / 1048576.0;
-    double rate = bytes_mb / elapsed;
-    double throughput = (double)done_/elapsed;
-
-    fprintf(stdout, "%-12s: ", name);
-    fprintf(stdout, "%.3f micros/op %ld ops/sec\n",
-            seconds_ * 1e6 / done_, (long)throughput);
-    fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n",
-            "", bytes_mb, rate, (100*writes_)/done_, done_);
-    fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_);
-    fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_);
-    fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "",
-            gets_, founds_);
-    fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_);
-    fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "",
-            iterator_size_sums_);
-    fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_);
-    fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_);
-
-    if (FLAGS_histogram) {
-      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
-    }
-    fflush(stdout);
-  }
-};
-
-// State shared by all concurrent executions of the same benchmark.
-class SharedState {
- public:
-  static const uint32_t SENTINEL;
-
-  explicit SharedState(StressTest* stress_test)
-      : cv_(&mu_),
-        seed_(FLAGS_seed),
-        max_key_(FLAGS_max_key),
-        log2_keys_per_lock_(FLAGS_log2_keys_per_lock),
-        num_threads_(FLAGS_threads),
-        num_initialized_(0),
-        num_populated_(0),
-        vote_reopen_(0),
-        num_done_(0),
-        start_(false),
-        start_verify_(false),
-        stress_test_(stress_test),
-        verification_failure_(false) {
-    if (FLAGS_test_batches_snapshots) {
-      fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
-      return;
-    }
-    values_.resize(FLAGS_column_families);
-
-    for (int i = 0; i < FLAGS_column_families; ++i) {
-      values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
-    }
-
-    long num_locks = (max_key_ >> log2_keys_per_lock_);
-    if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
-      num_locks++;
-    }
-    fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
-    key_locks_.resize(FLAGS_column_families);
-    for (int i = 0; i < FLAGS_column_families; ++i) {
-      key_locks_[i] = std::vector<port::Mutex>(num_locks);
-    }
-  }
-
-  ~SharedState() {}
-
-  port::Mutex* GetMutex() {
-    return &mu_;
-  }
-
-  port::CondVar* GetCondVar() {
-    return &cv_;
-  }
-
-  StressTest* GetStressTest() const {
-    return stress_test_;
-  }
-
-  long GetMaxKey() const {
-    return max_key_;
-  }
-
-  uint32_t GetNumThreads() const {
-    return num_threads_;
-  }
-
-  void IncInitialized() {
-    num_initialized_++;
-  }
-
-  void IncOperated() {
-    num_populated_++;
-  }
-
-  void IncDone() {
-    num_done_++;
-  }
-
-  void IncVotedReopen() {
-    vote_reopen_ = (vote_reopen_ + 1) % num_threads_;
-  }
-
-  bool AllInitialized() const {
-    return num_initialized_ >= num_threads_;
-  }
-
-  bool AllOperated() const {
-    return num_populated_ >= num_threads_;
-  }
-
-  bool AllDone() const {
-    return num_done_ >= num_threads_;
-  }
-
-  bool AllVotedReopen() {
-    return (vote_reopen_ == 0);
-  }
-
-  void SetStart() {
-    start_ = true;
-  }
-
-  void SetStartVerify() {
-    start_verify_ = true;
-  }
-
-  bool Started() const {
-    return start_;
-  }
-
-  bool VerifyStarted() const {
-    return start_verify_;
-  }
-
-  void SetVerificationFailure() { verification_failure_.store(true); }
-
-  bool HasVerificationFailedYet() { return verification_failure_.load(); }
-
-  port::Mutex* GetMutexForKey(int cf, long key) {
-    return &key_locks_[cf][key >> log2_keys_per_lock_];
-  }
-
-  void LockColumnFamily(int cf) {
-    for (auto& mutex : key_locks_[cf]) {
-      mutex.Lock();
-    }
-  }
-
-  void UnlockColumnFamily(int cf) {
-    for (auto& mutex : key_locks_[cf]) {
-      mutex.Unlock();
-    }
-  }
-
-  void ClearColumnFamily(int cf) {
-    std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
-  }
-
-  void Put(int cf, long key, uint32_t value_base) {
-    values_[cf][key] = value_base;
-  }
-
-  uint32_t Get(int cf, long key) const { return values_[cf][key]; }
-
-  void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }
-
-  uint32_t GetSeed() const { return seed_; }
-
- private:
-  port::Mutex mu_;
-  port::CondVar cv_;
-  const uint32_t seed_;
-  const long max_key_;
-  const uint32_t log2_keys_per_lock_;
-  const int num_threads_;
-  long num_initialized_;
-  long num_populated_;
-  long vote_reopen_;
-  long num_done_;
-  bool start_;
-  bool start_verify_;
-  StressTest* stress_test_;
-  std::atomic<bool> verification_failure_;
-
-  std::vector<std::vector<uint32_t>> values_;
-  std::vector<std::vector<port::Mutex>> key_locks_;
-};
-
-const uint32_t SharedState::SENTINEL = 0xffffffff;
-
-// Per-thread state for concurrent executions of the same benchmark.
-struct ThreadState {
-  uint32_t tid; // 0..n-1
-  Random rand;  // Has different seeds for different threads
-  SharedState* shared;
-  Stats stats;
-
-  ThreadState(uint32_t index, SharedState *shared)
-      : tid(index),
-        rand(1000 + index + shared->GetSeed()),
-        shared(shared) {
-  }
-};
-
-}  // namespace
-
-class StressTest {
- public:
-  StressTest()
-      : cache_(NewLRUCache(FLAGS_cache_size)),
-        compressed_cache_(FLAGS_compressed_cache_size >= 0
-                              ? NewLRUCache(FLAGS_compressed_cache_size)
-                              : nullptr),
-        filter_policy_(FLAGS_bloom_bits >= 0
-                           ? NewBloomFilterPolicy(FLAGS_bloom_bits)
-                           : nullptr),
-        db_(nullptr),
-        new_column_family_name_(0),
-        num_times_reopened_(0) {
-    if (FLAGS_destroy_db_initially) {
-      std::vector<std::string> files;
-      FLAGS_env->GetChildren(FLAGS_db, &files);
-      for (unsigned int i = 0; i < files.size(); i++) {
-        if (Slice(files[i]).starts_with("heap-")) {
-          FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
-        }
-      }
-      DestroyDB(FLAGS_db, Options());
-    }
-  }
-
-  ~StressTest() {
-    for (auto cf : column_families_) {
-      delete cf;
-    }
-    column_families_.clear();
-    delete db_;
-    delete filter_policy_;
-  }
-
-  bool Run() {
-    PrintEnv();
-    Open();
-    SharedState shared(this);
-    uint32_t n = shared.GetNumThreads();
-
-    std::vector<ThreadState*> threads(n);
-    for (uint32_t i = 0; i < n; i++) {
-      threads[i] = new ThreadState(i, &shared);
-      FLAGS_env->StartThread(ThreadBody, threads[i]);
-    }
-    // Each thread goes through the following states:
-    // initializing -> wait for others to init -> read/populate/depopulate
-    // wait for others to operate -> verify -> done
-
-    {
-      MutexLock l(shared.GetMutex());
-      while (!shared.AllInitialized()) {
-        shared.GetCondVar()->Wait();
-      }
-
-      double now = FLAGS_env->NowMicros();
-      fprintf(stdout, "%s Starting database operations\n",
-              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
-
-      shared.SetStart();
-      shared.GetCondVar()->SignalAll();
-      while (!shared.AllOperated()) {
-        shared.GetCondVar()->Wait();
-      }
-
-      now = FLAGS_env->NowMicros();
-      if (FLAGS_test_batches_snapshots) {
-        fprintf(stdout, "%s Limited verification already done during gets\n",
-                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
-      } else {
-        fprintf(stdout, "%s Starting verification\n",
-                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
-      }
-
-      shared.SetStartVerify();
-      shared.GetCondVar()->SignalAll();
-      while (!shared.AllDone()) {
-        shared.GetCondVar()->Wait();
-      }
-    }
-
-    for (unsigned int i = 1; i < n; i++) {
-      threads[0]->stats.Merge(threads[i]->stats);
-    }
-    threads[0]->stats.Report("Stress Test");
-
-    for (unsigned int i = 0; i < n; i++) {
-      delete threads[i];
-      threads[i] = nullptr;
-    }
-    double now = FLAGS_env->NowMicros();
-    if (!FLAGS_test_batches_snapshots) {
-      fprintf(stdout, "%s Verification successful\n",
-              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
-    }
-    PrintStatistics();
-
-    if (shared.HasVerificationFailedYet()) {
-      printf("Verification failed :(\n");
-      return false;
-    }
-    return true;
-  }
-
- private:
-
-  static void ThreadBody(void* v) {
-    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
-    SharedState* shared = thread->shared;
-
-    {
-      MutexLock l(shared->GetMutex());
-      shared->IncInitialized();
-      if (shared->AllInitialized()) {
-        shared->GetCondVar()->SignalAll();
-      }
-      while (!shared->Started()) {
-        shared->GetCondVar()->Wait();
-      }
-    }
-    thread->shared->GetStressTest()->OperateDb(thread);
-
-    {
-      MutexLock l(shared->GetMutex());
-      shared->IncOperated();
-      if (shared->AllOperated()) {
-        shared->GetCondVar()->SignalAll();
-      }
-      while (!shared->VerifyStarted()) {
-        shared->GetCondVar()->Wait();
-      }
-    }
-
-    if (!FLAGS_test_batches_snapshots) {
-      thread->shared->GetStressTest()->VerifyDb(thread);
-    }
-
-    {
-      MutexLock l(shared->GetMutex());
-      shared->IncDone();
-      if (shared->AllDone()) {
-        shared->GetCondVar()->SignalAll();
-      }
-    }
-
-  }
-
-  // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
-  // ("9"+K, "9"+V) in DB atomically i.e in a single batch.
-  // Also refer MultiGet.
-  Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
-                  ColumnFamilyHandle* column_family, const Slice& key,
-                  const Slice& value, size_t sz) {
-    std::string keys[10] = {"9", "8", "7", "6", "5",
-                            "4", "3", "2", "1", "0"};
-    std::string values[10] = {"9", "8", "7", "6", "5",
-                              "4", "3", "2", "1", "0"};
-    Slice value_slices[10];
-    WriteBatch batch;
-    Status s;
-    for (int i = 0; i < 10; i++) {
-      keys[i] += key.ToString();
-      values[i] += value.ToString();
-      value_slices[i] = values[i];
-      if (FLAGS_use_merge) {
-        batch.Merge(column_family, keys[i], value_slices[i]);
-      } else {
-        batch.Put(column_family, keys[i], value_slices[i]);
-      }
-    }
-
-    s = db_->Write(writeoptions, &batch);
-    if (!s.ok()) {
-      fprintf(stderr, "multiput error: %s\n", s.ToString().c_str());
-      thread->stats.AddErrors(1);
-    } else {
-      // we did 10 writes each of size sz + 1
-      thread->stats.AddBytesForWrites(10, (sz + 1) * 10);
-    }
-
-    return s;
-  }
-
-  // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
-  // in DB atomically i.e in a single batch. Also refer MultiGet.
-  Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
-                     ColumnFamilyHandle* column_family, const Slice& key) {
-    std::string keys[10] = {"9", "7", "5", "3", "1",
-                            "8", "6", "4", "2", "0"};
-
-    WriteBatch batch;
-    Status s;
-    for (int i = 0; i < 10; i++) {
-      keys[i] += key.ToString();
-      batch.Delete(column_family, keys[i]);
-    }
-
-    s = db_->Write(writeoptions, &batch);
-    if (!s.ok()) {
-      fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str());
-      thread->stats.AddErrors(1);
-    } else {
-      thread->stats.AddDeletes(10);
-    }
-
-    return s;
-  }
-
-  // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K
-  // in the same snapshot, and verifies that all the values are of the form
-  // "0"+V, "1"+V,..."9"+V.
-  // ASSUMES that MultiPut was used to put (K, V) into the DB.
-  Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
-                  ColumnFamilyHandle* column_family, const Slice& key,
-                  std::string* value) {
-    std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
-    Slice key_slices[10];
-    std::string values[10];
-    ReadOptions readoptionscopy = readoptions;
-    readoptionscopy.snapshot = db_->GetSnapshot();
-    Status s;
-    for (int i = 0; i < 10; i++) {
-      keys[i] += key.ToString();
-      key_slices[i] = keys[i];
-      s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
-      if (!s.ok() && !s.IsNotFound()) {
-        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
-        values[i] = "";
-        thread->stats.AddErrors(1);
-        // we continue after error rather than exiting so that we can
-        // find more errors if any
-      } else if (s.IsNotFound()) {
-        values[i] = "";
-        thread->stats.AddGets(1, 0);
-      } else {
-        values[i] = *value;
-
-        char expected_prefix = (keys[i])[0];
-        char actual_prefix = (values[i])[0];
-        if (actual_prefix != expected_prefix) {
-          fprintf(stderr, "error expected prefix = %c actual = %c\n",
-                  expected_prefix, actual_prefix);
-        }
-        (values[i])[0] = ' '; // blank out the differing character
-        thread->stats.AddGets(1, 1);
-      }
-    }
-    db_->ReleaseSnapshot(readoptionscopy.snapshot);
-
-    // Now that we retrieved all values, check that they all match
-    for (int i = 1; i < 10; i++) {
-      if (values[i] != values[0]) {
-        fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
-                key.ToString(true).c_str(), StringToHex(values[0]).c_str(),
-                StringToHex(values[i]).c_str());
-      // we continue after error rather than exiting so that we can
-      // find more errors if any
-      }
-    }
-
-    return s;
-  }
-
-  // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
-  // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
-  // of the key. Each of these 10 scans returns a series of values;
-  // each series should be the same length, and it is verified for each
-  // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
-  // ASSUMES that MultiPut was used to put (K, V)
-  Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
-                         ColumnFamilyHandle* column_family,
-                         const Slice& key) {
-    std::string prefixes[10] = {"0", "1", "2", "3", "4",
-                                "5", "6", "7", "8", "9"};
-    Slice prefix_slices[10];
-    ReadOptions readoptionscopy[10];
-    const Snapshot* snapshot = db_->GetSnapshot();
-    Iterator* iters[10];
-    Status s = Status::OK();
-    for (int i = 0; i < 10; i++) {
-      prefixes[i] += key.ToString();
-      prefixes[i].resize(FLAGS_prefix_size);
-      prefix_slices[i] = Slice(prefixes[i]);
-      readoptionscopy[i] = readoptions;
-      readoptionscopy[i].snapshot = snapshot;
-      iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
-      iters[i]->Seek(prefix_slices[i]);
-    }
-
-    int count = 0;
-    while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) {
-      count++;
-      std::string values[10];
-      // get list of all values for this iteration
-      for (int i = 0; i < 10; i++) {
-        // no iterator should finish before the first one
-        assert(iters[i]->Valid() &&
-               iters[i]->key().starts_with(prefix_slices[i]));
-        values[i] = iters[i]->value().ToString();
-
-        char expected_first = (prefixes[i])[0];
-        char actual_first = (values[i])[0];
-
-        if (actual_first != expected_first) {
-          fprintf(stderr, "error expected first = %c actual = %c\n",
-                  expected_first, actual_first);
-        }
-        (values[i])[0] = ' '; // blank out the differing character
-      }
-      // make sure all values are equivalent
-      for (int i = 0; i < 10; i++) {
-        if (values[i] != values[0]) {
-          fprintf(stderr, "error : %d, inconsistent values for prefix %s: %s, %s\n",
-                  i, prefixes[i].c_str(), StringToHex(values[0]).c_str(),
-                  StringToHex(values[i]).c_str());
-          // we continue after error rather than exiting so that we can
-          // find more errors if any
-        }
-        iters[i]->Next();
-      }
-    }
-
-    // cleanup iterators and snapshot
-    for (int i = 0; i < 10; i++) {
-      // if the first iterator finished, they should have all finished
-      assert(!iters[i]->Valid() ||
-             !iters[i]->key().starts_with(prefix_slices[i]));
-      assert(iters[i]->status().ok());
-      delete iters[i];
-    }
-    db_->ReleaseSnapshot(snapshot);
-
-    if (s.ok()) {
-      thread->stats.AddPrefixes(1, count);
-    } else {
-      thread->stats.AddErrors(1);
-    }
-
-    return s;
-  }
-
-  // Given a key K, this creates an iterator which scans to K and then
-  // does a random sequence of Next/Prev operations.
-  Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions,
-                      ColumnFamilyHandle* column_family, const Slice& key) {
-    Status s;
-    const Snapshot* snapshot = db_->GetSnapshot();
-    ReadOptions readoptionscopy = readoptions;
-    readoptionscopy.snapshot = snapshot;
-    unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
-
-    iter->Seek(key);
-    for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
-      if (thread->rand.OneIn(2)) {
-        iter->Next();
-      } else {
-        iter->Prev();
-      }
-    }
-
-    if (s.ok()) {
-      thread->stats.AddIterations(1);
-    } else {
-      thread->stats.AddErrors(1);
-    }
-
-    db_->ReleaseSnapshot(snapshot);
-
-    return s;
-  }
-
-  void OperateDb(ThreadState* thread) {
-    ReadOptions read_opts(FLAGS_verify_checksum, true);
-    WriteOptions write_opts;
-    char value[100];
-    long max_key = thread->shared->GetMaxKey();
-    std::string from_db;
-    if (FLAGS_sync) {
-      write_opts.sync = true;
-    }
-    write_opts.disableWAL = FLAGS_disable_wal;
-    const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent;
-    const int writeBound = prefixBound + (int)FLAGS_writepercent;
-    const int delBound = writeBound + (int)FLAGS_delpercent;
-
-    thread->stats.Start();
-    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
-      if (thread->shared->HasVerificationFailedYet()) {
-        break;
-      }
-      if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) {
-        {
-          thread->stats.FinishedSingleOp();
-          MutexLock l(thread->shared->GetMutex());
-          thread->shared->IncVotedReopen();
-          if (thread->shared->AllVotedReopen()) {
-            thread->shared->GetStressTest()->Reopen();
-            thread->shared->GetCondVar()->SignalAll();
-          }
-          else {
-            thread->shared->GetCondVar()->Wait();
-          }
-          // Commenting this out as we don't want to reset stats on each open.
-          // thread->stats.Start();
-        }
-      }
-
-      if (!FLAGS_test_batches_snapshots &&
-          FLAGS_clear_column_family_one_in != 0) {
-        if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) {
-          // drop column family and then create it again (can't drop default)
-          int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
-          std::string new_name =
-              std::to_string(new_column_family_name_.fetch_add(1));
-          {
-            MutexLock l(thread->shared->GetMutex());
-            fprintf(
-                stdout,
-                "[CF %d] Dropping and recreating column family. new name: %s\n",
-                cf, new_name.c_str());
-          }
-          thread->shared->LockColumnFamily(cf);
-          Status s __attribute__((unused));
-          s = db_->DropColumnFamily(column_families_[cf]);
-          delete column_families_[cf];
-          assert(s.ok());
-          s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
-                                      &column_families_[cf]);
-          column_family_names_[cf] = new_name;
-          thread->shared->ClearColumnFamily(cf);
-          assert(s.ok());
-          thread->shared->UnlockColumnFamily(cf);
-        }
-      }
-
-      long rand_key = thread->rand.Next() % max_key;
-      int rand_column_family = thread->rand.Next() % FLAGS_column_families;
-      std::string keystr = Key(rand_key);
-      Slice key = keystr;
-      int prob_op = thread->rand.Uniform(100);
-      std::unique_ptr<MutexLock> l;
-      if (!FLAGS_test_batches_snapshots) {
-        l.reset(new MutexLock(
-            thread->shared->GetMutexForKey(rand_column_family, rand_key)));
-      }
-      auto column_family = column_families_[rand_column_family];
-
-      if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
-        // OPERATION read
-        if (!FLAGS_test_batches_snapshots) {
-          Status s = db_->Get(read_opts, column_family, key, &from_db);
-          if (s.ok()) {
-            // found case
-            thread->stats.AddGets(1, 1);
-          } else if (s.IsNotFound()) {
-            // not found case
-            thread->stats.AddGets(1, 0);
-          } else {
-            // errors case
-            thread->stats.AddErrors(1);
-          }
-        } else {
-          MultiGet(thread, read_opts, column_family, key, &from_db);
-        }
-      } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
-        // OPERATION prefix scan
-        // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are
-        // (8 - FLAGS_prefix_size) bytes besides the prefix. So there will
-        // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same
-        // prefix
-        if (!FLAGS_test_batches_snapshots) {
-          Slice prefix = Slice(key.data(), FLAGS_prefix_size);
-          Iterator* iter = db_->NewIterator(read_opts, column_family);
-          int64_t count = 0;
-          for (iter->Seek(prefix);
-               iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
-            ++count;
-          }
-          assert(count <=
-                 (static_cast<int64_t>(1) << ((8 - FLAGS_prefix_size) * 8)));
-          if (iter->status().ok()) {
-            thread->stats.AddPrefixes(1, count);
-          } else {
-            thread->stats.AddErrors(1);
-          }
-          delete iter;
-        } else {
-          MultiPrefixScan(thread, read_opts, column_family, key);
-        }
-      } else if (prefixBound <= prob_op && prob_op < writeBound) {
-        // OPERATION write
-        uint32_t value_base = thread->rand.Next();
-        size_t sz = GenerateValue(value_base, value, sizeof(value));
-        Slice v(value, sz);
-        if (!FLAGS_test_batches_snapshots) {
-          if (FLAGS_verify_before_write) {
-            std::string keystr2 = Key(rand_key);
-            Slice k = keystr2;
-            Status s = db_->Get(read_opts, column_family, k, &from_db);
-            if (VerifyValue(rand_column_family, rand_key, read_opts,
-                            thread->shared, from_db, s, true) == false) {
-              break;
-            }
-          }
-          thread->shared->Put(rand_column_family, rand_key, value_base);
-          if (FLAGS_use_merge) {
-            db_->Merge(write_opts, column_family, key, v);
-          } else {
-            db_->Put(write_opts, column_family, key, v);
-          }
-          thread->stats.AddBytesForWrites(1, sz);
-        } else {
-          MultiPut(thread, write_opts, column_family, key, v, sz);
-        }
-        PrintKeyValue(rand_column_family, rand_key, value, sz);
-      } else if (writeBound <= prob_op && prob_op < delBound) {
-        // OPERATION delete
-        if (!FLAGS_test_batches_snapshots) {
-          thread->shared->Delete(rand_column_family, rand_key);
-          db_->Delete(write_opts, column_family, key);
-          thread->stats.AddDeletes(1);
-        } else {
-          MultiDelete(thread, write_opts, column_family, key);
-        }
-      } else {
-        // OPERATION iterate
-        MultiIterate(thread, read_opts, column_family, key);
-      }
-      thread->stats.FinishedSingleOp();
-    }
-
-    thread->stats.Stop();
-  }
-
-  void VerifyDb(ThreadState* thread) const {
-    ReadOptions options(FLAGS_verify_checksum, true);
-    auto shared = thread->shared;
-    static const long max_key = shared->GetMaxKey();
-    static const long keys_per_thread = max_key / shared->GetNumThreads();
-    long start = keys_per_thread * thread->tid;
-    long end = start + keys_per_thread;
-    if (thread->tid == shared->GetNumThreads() - 1) {
-      end = max_key;
-    }
-    for (size_t cf = 0; cf < column_families_.size(); ++cf) {
-      if (thread->shared->HasVerificationFailedYet()) {
-        break;
-      }
-      if (!thread->rand.OneIn(2)) {
-        // Use iterator to verify this range
-        unique_ptr<Iterator> iter(
-            db_->NewIterator(options, column_families_[cf]));
-        iter->Seek(Key(start));
-        for (long i = start; i < end; i++) {
-          if (thread->shared->HasVerificationFailedYet()) {
-            break;
-          }
-          // TODO(ljin): update "long" to uint64_t
-          // Reseek when the prefix changes
-          if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) ==
-              0) {
-            iter->Seek(Key(i));
-          }
-          std::string from_db;
-          std::string keystr = Key(i);
-          Slice k = keystr;
-          Status s = iter->status();
-          if (iter->Valid()) {
-            if (iter->key().compare(k) > 0) {
-              s = Status::NotFound(Slice());
-            } else if (iter->key().compare(k) == 0) {
-              from_db = iter->value().ToString();
-              iter->Next();
-            } else if (iter->key().compare(k) < 0) {
-              VerificationAbort(shared, "An out of range key was found", cf, i);
-            }
-          } else {
-            // The iterator found no value for the key in question, so do not
-            // move to the next item in the iterator
-            s = Status::NotFound(Slice());
-          }
-          VerifyValue(cf, i, options, shared, from_db, s, true);
-          if (from_db.length()) {
-            PrintKeyValue(cf, i, from_db.data(), from_db.length());
-          }
-        }
-      } else {
-        // Use Get to verify this range
-        for (long i = start; i < end; i++) {
-          if (thread->shared->HasVerificationFailedYet()) {
-            break;
-          }
-          std::string from_db;
-          std::string keystr = Key(i);
-          Slice k = keystr;
-          Status s = db_->Get(options, column_families_[cf], k, &from_db);
-          VerifyValue(cf, i, options, shared, from_db, s, true);
-          if (from_db.length()) {
-            PrintKeyValue(cf, i, from_db.data(), from_db.length());
-          }
-        }
-      }
-    }
-  }
-
-  void VerificationAbort(SharedState* shared, std::string msg, int cf,
-                         long key) const {
-    printf("Verification failed for column family %d key %ld: %s\n", cf, key,
-           msg.c_str());
-    shared->SetVerificationFailure();
-  }
-
-  bool VerifyValue(int cf, long key, const ReadOptions& opts,
-                   SharedState* shared, const std::string& value_from_db,
-                   Status s, bool strict = false) const {
-    if (shared->HasVerificationFailedYet()) {
-      return false;
-    }
-    // compare value_from_db with the value in the shared state
-    char value[100];
-    uint32_t value_base = shared->Get(cf, key);
-    if (value_base == SharedState::SENTINEL && !strict) {
-      return true;
-    }
-
-    if (s.ok()) {
-      if (value_base == SharedState::SENTINEL) {
-        VerificationAbort(shared, "Unexpected value found", cf, key);
-        return false;
-      }
-      size_t sz = GenerateValue(value_base, value, sizeof(value));
-      if (value_from_db.length() != sz) {
-        VerificationAbort(shared, "Length of value read is not equal", cf, key);
-        return false;
-      }
-      if (memcmp(value_from_db.data(), value, sz) != 0) {
-        VerificationAbort(shared, "Contents of value read don't match", cf,
-                          key);
-        return false;
-      }
-    } else {
-      if (value_base != SharedState::SENTINEL) {
-        VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key);
-        return false;
-      }
-    }
-    return true;
-  }
-
-  static void PrintKeyValue(int cf, uint32_t key, const char* value,
-                            size_t sz) {
-    if (!FLAGS_verbose) {
-      return;
-    }
-    fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
-    for (size_t i = 0; i < sz; i++) {
-      fprintf(stdout, "%X", value[i]);
-    }
-    fprintf(stdout, "\n");
-  }
-
-  static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) {
-    size_t value_sz = ((rand % 3) + 1) * FLAGS_value_size_mult;
-    assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t));
-    *((uint32_t*)v) = rand;
-    for (size_t i=sizeof(uint32_t); i < value_sz; i++) {
-      v[i] = (char)(rand ^ i);
-    }
-    v[value_sz] = '\0';
-    return value_sz; // the size of the value set.
-  }
-
-  void PrintEnv() const {
-    fprintf(stdout, "RocksDB version     : %d.%d\n", kMajorVersion,
-            kMinorVersion);
-    fprintf(stdout, "Column families     : %d\n", FLAGS_column_families);
-    if (!FLAGS_test_batches_snapshots) {
-      fprintf(stdout, "Clear CFs one in    : %d\n",
-              FLAGS_clear_column_family_one_in);
-    }
-    fprintf(stdout, "Number of threads   : %d\n", FLAGS_threads);
-    fprintf(stdout,
-            "Ops per thread      : %lu\n",
-            (unsigned long)FLAGS_ops_per_thread);
-    std::string ttl_state("unused");
-    if (FLAGS_ttl > 0) {
-      ttl_state = NumberToString(FLAGS_ttl);
-    }
-    fprintf(stdout, "Time to live(sec)   : %s\n", ttl_state.c_str());
-    fprintf(stdout, "Read percentage     : %d%%\n", FLAGS_readpercent);
-    fprintf(stdout, "Prefix percentage   : %d%%\n", FLAGS_prefixpercent);
-    fprintf(stdout, "Write percentage    : %d%%\n", FLAGS_writepercent);
-    fprintf(stdout, "Delete percentage   : %d%%\n", FLAGS_delpercent);
-    fprintf(stdout, "Iterate percentage  : %d%%\n", FLAGS_iterpercent);
-    fprintf(stdout, "Write-buffer-size   : %d\n", FLAGS_write_buffer_size);
-    fprintf(stdout,
-            "Iterations          : %lu\n",
-            (unsigned long)FLAGS_num_iterations);
-    fprintf(stdout,
-            "Max key             : %lu\n",
-            (unsigned long)FLAGS_max_key);
-    fprintf(stdout, "Ratio #ops/#keys    : %f\n",
-            (1.0 * FLAGS_ops_per_thread * FLAGS_threads)/FLAGS_max_key);
-    fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen);
-    fprintf(stdout, "Batches/snapshots   : %d\n",
-            FLAGS_test_batches_snapshots);
-    fprintf(stdout, "Purge redundant %%   : %d\n",
-            FLAGS_purge_redundant_percent);
-    fprintf(stdout, "Deletes use filter  : %d\n",
-            FLAGS_filter_deletes);
-    fprintf(stdout, "Num keys per lock   : %d\n",
-            1 << FLAGS_log2_keys_per_lock);
-
-    const char* compression = "";
-    switch (FLAGS_compression_type_e) {
-      case rocksdb::kNoCompression:
-        compression = "none";
-        break;
-      case rocksdb::kSnappyCompression:
-        compression = "snappy";
-        break;
-      case rocksdb::kZlibCompression:
-        compression = "zlib";
-        break;
-      case rocksdb::kBZip2Compression:
-        compression = "bzip2";
-        break;
-      case rocksdb::kLZ4Compression:
-        compression = "lz4";
-      case rocksdb::kLZ4HCCompression:
-        compression = "lz4hc";
-        break;
-      }
-
-    fprintf(stdout, "Compression         : %s\n", compression);
-
-    const char* memtablerep = "";
-    switch (FLAGS_rep_factory) {
-      case kSkipList:
-        memtablerep = "skip_list";
-        break;
-      case kHashSkipList:
-        memtablerep = "prefix_hash";
-        break;
-      case kVectorRep:
-        memtablerep = "vector";
-        break;
-    }
-
-    fprintf(stdout, "Memtablerep         : %s\n", memtablerep);
-
-    fprintf(stdout, "------------------------------------------------\n");
-  }
-
-  void Open() {
-    assert(db_ == nullptr);
-    options_.block_cache = cache_;
-    options_.block_cache_compressed = compressed_cache_;
-    options_.write_buffer_size = FLAGS_write_buffer_size;
-    options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
-    options_.min_write_buffer_number_to_merge =
-        FLAGS_min_write_buffer_number_to_merge;
-    options_.max_background_compactions = FLAGS_max_background_compactions;
-    options_.max_background_flushes = FLAGS_max_background_flushes;
-    options_.compaction_style =
-        static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
-    options_.block_size = FLAGS_block_size;
-    options_.filter_policy = filter_policy_;
-    options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
-    options_.max_open_files = FLAGS_open_files;
-    options_.statistics = dbstats;
-    options_.env = FLAGS_env;
-    options_.disableDataSync = FLAGS_disable_data_sync;
-    options_.use_fsync = FLAGS_use_fsync;
-    options_.allow_mmap_reads = FLAGS_mmap_read;
-    rocksdb_kill_odds = FLAGS_kill_random_test;
-    options_.target_file_size_base = FLAGS_target_file_size_base;
-    options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
-    options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
-    options_.max_bytes_for_level_multiplier =
-        FLAGS_max_bytes_for_level_multiplier;
-    options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
-    options_.level0_slowdown_writes_trigger =
-        FLAGS_level0_slowdown_writes_trigger;
-    options_.level0_file_num_compaction_trigger =
-        FLAGS_level0_file_num_compaction_trigger;
-    options_.compression = FLAGS_compression_type_e;
-    options_.create_if_missing = true;
-    options_.disable_seek_compaction = FLAGS_disable_seek_compaction;
-    options_.max_manifest_file_size = 10 * 1024;
-    options_.filter_deletes = FLAGS_filter_deletes;
-    if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
-      fprintf(stderr,
-            "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
-      exit(1);
-    }
-    switch (FLAGS_rep_factory) {
-      case kHashSkipList:
-        options_.memtable_factory.reset(NewHashSkipListRepFactory());
-        break;
-      case kSkipList:
-        // no need to do anything
-        break;
-      case kVectorRep:
-        options_.memtable_factory.reset(new VectorRepFactory());
-        break;
-    }
-    static Random purge_percent(1000); // no benefit from non-determinism here
-    if (static_cast<int32_t>(purge_percent.Uniform(100)) <
-        FLAGS_purge_redundant_percent - 1) {
-      options_.purge_redundant_kvs_while_flush = false;
-    }
-
-    if (FLAGS_use_merge) {
-      options_.merge_operator = MergeOperators::CreatePutOperator();
-    }
-
-    // set universal style compaction configurations, if applicable
-    if (FLAGS_universal_size_ratio != 0) {
-      options_.compaction_options_universal.size_ratio =
-          FLAGS_universal_size_ratio;
-    }
-    if (FLAGS_universal_min_merge_width != 0) {
-      options_.compaction_options_universal.min_merge_width =
-          FLAGS_universal_min_merge_width;
-    }
-    if (FLAGS_universal_max_merge_width != 0) {
-      options_.compaction_options_universal.max_merge_width =
-          FLAGS_universal_max_merge_width;
-    }
-    if (FLAGS_universal_max_size_amplification_percent != 0) {
-      options_.compaction_options_universal.max_size_amplification_percent =
-          FLAGS_universal_max_size_amplification_percent;
-    }
-
-    fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
-
-    Status s;
-    if (FLAGS_ttl == -1) {
-      std::vector<std::string> existing_column_families;
-      s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
-                                 &existing_column_families);  // ignore errors
-      if (!s.ok()) {
-        // DB doesn't exist
-        assert(existing_column_families.empty());
-        assert(column_family_names_.empty());
-        column_family_names_.push_back(kDefaultColumnFamilyName);
-      } else if (column_family_names_.empty()) {
-        // this is the first call to the function Open()
-        column_family_names_ = existing_column_families;
-      } else {
-        // this is a reopen. just assert that existing column_family_names are
-        // equivalent to what we remember
-        auto sorted_cfn = column_family_names_;
-        sort(sorted_cfn.begin(), sorted_cfn.end());
-        sort(existing_column_families.begin(), existing_column_families.end());
-        if (sorted_cfn != existing_column_families) {
-          fprintf(stderr,
-                  "Expected column families differ from the existing:\n");
-          printf("Expected: {");
-          for (auto cf : sorted_cfn) {
-            printf("%s ", cf.c_str());
-          }
-          printf("}\n");
-          printf("Existing: {");
-          for (auto cf : existing_column_families) {
-            printf("%s ", cf.c_str());
-          }
-          printf("}\n");
-        }
-        assert(sorted_cfn == existing_column_families);
-      }
-      std::vector<ColumnFamilyDescriptor> cf_descriptors;
-      for (auto name : column_family_names_) {
-        if (name != kDefaultColumnFamilyName) {
-          new_column_family_name_ =
-              std::max(new_column_family_name_.load(), std::stoi(name) + 1);
-        }
-        cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
-      }
-      s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
-                   &column_families_, &db_);
-      if (s.ok()) {
-        while (s.ok() &&
-               column_families_.size() < (size_t)FLAGS_column_families) {
-          ColumnFamilyHandle* cf = nullptr;
-          std::string name = std::to_string(new_column_family_name_.load());
-          new_column_family_name_++;
-          s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), name, &cf);
-          column_families_.push_back(cf);
-          column_family_names_.push_back(name);
-        }
-      }
-      assert(!s.ok() || column_families_.size() ==
-                            static_cast<size_t>(FLAGS_column_families));
-    } else {
-      DBWithTTL* db_with_ttl;
-      s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
-      db_ = db_with_ttl;
-    }
-    if (!s.ok()) {
-      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
-      exit(1);
-    }
-  }
-
-  void Reopen() {
-    for (auto cf : column_families_) {
-      delete cf;
-    }
-    column_families_.clear();
-    delete db_;
-    db_ = nullptr;
-
-    num_times_reopened_++;
-    double now = FLAGS_env->NowMicros();
-    fprintf(stdout, "%s Reopening database for the %dth time\n",
-            FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
-            num_times_reopened_);
-    Open();
-  }
-
-  void PrintStatistics() {
-    if (dbstats) {
-      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
-    }
-  }
-
- private:
-  shared_ptr<Cache> cache_;
-  shared_ptr<Cache> compressed_cache_;
-  const FilterPolicy* filter_policy_;
-  DB* db_;
-  Options options_;
-  std::vector<ColumnFamilyHandle*> column_families_;
-  std::vector<std::string> column_family_names_;
-  std::atomic<int> new_column_family_name_;
-  int num_times_reopened_;
-};
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
-                          " [OPTIONS]...");
-  google::ParseCommandLineFlags(&argc, &argv, true);
-
-  if (FLAGS_statistics) {
-    dbstats = rocksdb::CreateDBStatistics();
-  }
-  FLAGS_compression_type_e =
-    StringToCompressionType(FLAGS_compression_type.c_str());
-  if (!FLAGS_hdfs.empty()) {
-    FLAGS_env  = new rocksdb::HdfsEnv(FLAGS_hdfs);
-  }
-  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
-
-  // The number of background threads should be at least as much the
-  // max number of concurrent compactions.
-  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
-
-  if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) {
-    fprintf(stderr,
-            "Error: prefixpercent is non-zero while prefix_size is "
-            "not positive!\n");
-    exit(1);
-  }
-  if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) {
-    fprintf(stderr,
-            "Error: please specify prefix_size for "
-            "test_batches_snapshots test!\n");
-    exit(1);
-  }
-  if ((FLAGS_readpercent + FLAGS_prefixpercent +
-       FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) {
-      fprintf(stderr,
-              "Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n");
-      exit(1);
-  }
-  if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) {
-      fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n");
-      exit(1);
-  }
-  if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) {
-      fprintf(stderr,
-              "Error: #DB-reopens should be < ops_per_thread\n"
-              "Provided reopens = %d and ops_per_thread = %lu\n",
-              FLAGS_reopen,
-              (unsigned long)FLAGS_ops_per_thread);
-      exit(1);
-  }
-
-  // Choose a location for the test database if none given with --db=<path>
-  if (FLAGS_db.empty()) {
-      std::string default_db_path;
-      rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
-      default_db_path += "/dbstress";
-      FLAGS_db = default_db_path;
-  }
-
-  rocksdb::StressTest stress;
-  if (stress.Run()) {
-    return 0;
-  } else {
-    return 1;
-  }
-}
diff --git a/src/rocksdb/tools/ldb.cc b/src/rocksdb/tools/ldb.cc
deleted file mode 100644
index 4581b80..0000000
--- a/src/rocksdb/tools/ldb.cc
+++ /dev/null
@@ -1,13 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-
-#include "rocksdb/ldb_tool.h"
-
-int main(int argc, char** argv) {
-  rocksdb::LDBTool tool;
-  tool.Run(argc, argv);
-  return 0;
-}
diff --git a/src/rocksdb/tools/ldb_test.py b/src/rocksdb/tools/ldb_test.py
deleted file mode 100644
index b4ef522..0000000
--- a/src/rocksdb/tools/ldb_test.py
+++ /dev/null
@@ -1,383 +0,0 @@
-import os
-import os.path
-import shutil
-import subprocess
-import time
-import unittest
-import tempfile
-
-def my_check_output(*popenargs, **kwargs):
-    """
-    If we had python 2.7, we should simply use subprocess.check_output.
-    This is a stop-gap solution for python 2.6
-    """
-    if 'stdout' in kwargs:
-        raise ValueError('stdout argument not allowed, it will be overridden.')
-    process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE,
-                               *popenargs, **kwargs)
-    output, unused_err = process.communicate()
-    retcode = process.poll()
-    if retcode:
-        cmd = kwargs.get("args")
-        if cmd is None:
-            cmd = popenargs[0]
-        raise Exception("Exit code is not 0.  It is %d.  Command: %s" %
-                (retcode, cmd))
-    return output
-
-def run_err_null(cmd):
-    return os.system(cmd + " 2>/dev/null ")
-
-class LDBTestCase(unittest.TestCase):
-    def setUp(self):
-        self.TMP_DIR  = tempfile.mkdtemp(prefix="ldb_test_")
-        self.DB_NAME = "testdb"
-
-    def tearDown(self):
-        assert(self.TMP_DIR.strip() != "/"
-                and self.TMP_DIR.strip() != "/tmp"
-                and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia
-
-        shutil.rmtree(self.TMP_DIR)
-
-    def dbParam(self, dbName):
-        return "--db=%s" % os.path.join(self.TMP_DIR, dbName)
-
-    def assertRunOKFull(self, params, expectedOutput, unexpected=False):
-        """
-        All command-line params must be specified.
-        Allows full flexibility in testing; for example: missing db param.
-
-        """
-
-        output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" %
-                            params, shell=True)
-        if not unexpected:
-            self.assertEqual(output.strip(), expectedOutput.strip())
-        else:
-            self.assertNotEqual(output.strip(), expectedOutput.strip())
-
-    def assertRunFAILFull(self, params):
-        """
-        All command-line params must be specified.
-        Allows full flexibility in testing; for example: missing db param.
-
-        """
-        try:
-
-            my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \
-                thread\"" % params, shell=True)
-        except Exception, e:
-            return
-        self.fail(
-            "Exception should have been raised for command with params: %s" %
-            params)
-
-    def assertRunOK(self, params, expectedOutput, unexpected=False):
-        """
-        Uses the default test db.
-
-        """
-        self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params),
-                             expectedOutput, unexpected)
-
-    def assertRunFAIL(self, params):
-        """
-        Uses the default test db.
-        """
-        self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params))
-
-    def testSimpleStringPutGet(self):
-        print "Running testSimpleStringPutGet..."
-        self.assertRunFAIL("put x1 y1")
-        self.assertRunOK("put --create_if_missing x1 y1", "OK")
-        self.assertRunOK("get x1", "y1")
-        self.assertRunFAIL("get x2")
-
-        self.assertRunOK("put x2 y2", "OK")
-        self.assertRunOK("get x1", "y1")
-        self.assertRunOK("get x2", "y2")
-        self.assertRunFAIL("get x3")
-
-        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2")
-        self.assertRunOK("put x3 y3", "OK")
-
-        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3")
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
-        self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3")
-
-        self.assertRunOK("scan --to=x2", "x1 : y1")
-        self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1")
-        self.assertRunOK("scan --from=x1 --to=z --max_keys=2",
-                "x1 : y1\nx2 : y2")
-
-        self.assertRunOK("scan --from=x1 --to=z --max_keys=3",
-                "x1 : y1\nx2 : y2\nx3 : y3")
-        self.assertRunOK("scan --from=x1 --to=z --max_keys=4",
-                "x1 : y1\nx2 : y2\nx3 : y3")
-        self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1")
-        self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3")
-        self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL
-        self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo")
-
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
-
-        self.assertRunOK("delete x1", "OK")
-        self.assertRunOK("scan", "x2 : y2\nx3 : y3")
-
-        self.assertRunOK("delete NonExistentKey", "OK")
-        # It is weird that GET and SCAN raise exception for
-        # non-existent key, while delete does not
-
-        self.assertRunOK("checkconsistency", "OK")
-
-    def dumpDb(self, params, dumpFile):
-        return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile))
-
-    def loadDb(self, params, dumpFile):
-        return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params))
-
-    def testStringBatchPut(self):
-        print "Running testStringBatchPut..."
-        self.assertRunOK("batchput x1 y1 --create_if_missing", "OK")
-        self.assertRunOK("scan", "x1 : y1")
-        self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK")
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz")
-        self.assertRunFAIL("batchput")
-        self.assertRunFAIL("batchput k1")
-        self.assertRunFAIL("batchput k1 v1 k2")
-
-    def testCountDelimDump(self):
-        print "Running testCountDelimDump..."
-        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
-        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
-        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
-        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
-        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
-        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
-
-    def testCountDelimIDump(self):
-        print "Running testCountDelimIDump..."
-        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
-        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
-        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
-        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
-        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
-        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
-
-    def testInvalidCmdLines(self):
-        print "Running testInvalidCmdLines..."
-        # db not specified
-        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
-        # No param called he
-        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
-        # max_keys is not applicable for put
-        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
-        # hex has invalid boolean value
-
-    def testHexPutGet(self):
-        print "Running testHexPutGet..."
-        self.assertRunOK("put a1 b1 --create_if_missing", "OK")
-        self.assertRunOK("scan", "a1 : b1")
-        self.assertRunOK("scan --hex", "0x6131 : 0x6231")
-        self.assertRunFAIL("put --hex 6132 6232")
-        self.assertRunOK("put --hex 0x6132 0x6232", "OK")
-        self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232")
-        self.assertRunOK("scan", "a1 : b1\na2 : b2")
-        self.assertRunOK("get a1", "b1")
-        self.assertRunOK("get --hex 0x6131", "0x6231")
-        self.assertRunOK("get a2", "b2")
-        self.assertRunOK("get --hex 0x6132", "0x6232")
-        self.assertRunOK("get --key_hex 0x6132", "b2")
-        self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232")
-        self.assertRunOK("get --value_hex a2", "0x6232")
-        self.assertRunOK("scan --key_hex --value_hex",
-                "0x6131 : 0x6231\n0x6132 : 0x6232")
-        self.assertRunOK("scan --hex --from=0x6131 --to=0x6133",
-                "0x6131 : 0x6231\n0x6132 : 0x6232")
-        self.assertRunOK("scan --hex --from=0x6131 --to=0x6132",
-                "0x6131 : 0x6231")
-        self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2")
-        self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232")
-        self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK")
-        self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4")
-        self.assertRunOK("delete --hex 0x6133", "OK")
-        self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4")
-        self.assertRunOK("checkconsistency", "OK")
-
-    def testTtlPutGet(self):
-        print "Running testTtlPutGet..."
-        self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK")
-        self.assertRunOK("scan --hex", "0x6131 : 0x6231", True)
-        self.assertRunOK("dump --ttl ", "a1 ==> b1", True)
-        self.assertRunOK("dump --hex --ttl ",
-                         "0x6131 ==> 0x6231\nKeys in range: 1")
-        self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231")
-        self.assertRunOK("get --value_hex a1", "0x6231", True)
-        self.assertRunOK("get --ttl a1", "b1")
-        self.assertRunOK("put a3 b3 --create_if_missing", "OK")
-        # fails because timstamp's length is greater than value's
-        self.assertRunFAIL("get --ttl a3")
-        self.assertRunOK("checkconsistency", "OK")
-
-    def testInvalidCmdLines(self):
-        print "Running testInvalidCmdLines..."
-        # db not specified
-        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
-        # No param called he
-        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
-        # max_keys is not applicable for put
-        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
-        # hex has invalid boolean value
-        self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing")
-
-    def testDumpLoad(self):
-        print "Running testDumpLoad..."
-        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
-                "OK")
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
-
-        # Dump and load without any additional params specified
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
-        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1")
-        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
-        self.assertTrue(self.loadDb(
-            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
-                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        # Dump and load in hex
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
-        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2")
-        self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath))
-        self.assertTrue(self.loadDb(
-            "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath))
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
-                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        # Dump only a portion of the key range
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump3")
-        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3")
-        self.assertTrue(self.dumpDb(
-            "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath))
-        self.assertTrue(self.loadDb(
-            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2")
-
-        # Dump upto max_keys rows
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump4")
-        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4")
-        self.assertTrue(self.dumpDb(
-            "--db=%s --max_keys=3" % origDbPath, dumpFilePath))
-        self.assertTrue(self.loadDb(
-            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
-                "x1 : y1\nx2 : y2\nx3 : y3")
-
-        # Load into an existing db, create_if_missing is not specified
-        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
-        self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath))
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
-                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        # Dump and load with WAL disabled
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump5")
-        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5")
-        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
-        self.assertTrue(self.loadDb(
-            "--db=%s --disable_wal --create_if_missing" % loadedDbPath,
-            dumpFilePath))
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
-                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        # Dump and load with lots of extra params specified
-        extraParams = " ".join(["--bloom_bits=14", "--compression_type=bzip2",
-                                "--block_size=1024", "--auto_compaction=true",
-                                "--write_buffer_size=4194304",
-                                "--file_size=2097152"])
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump6")
-        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6")
-        self.assertTrue(self.dumpDb(
-            "--db=%s %s" % (origDbPath, extraParams), dumpFilePath))
-        self.assertTrue(self.loadDb(
-            "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams),
-            dumpFilePath))
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
-                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        # Dump with count_only
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump7")
-        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7")
-        self.assertTrue(self.dumpDb(
-            "--db=%s --count_only" % origDbPath, dumpFilePath))
-        self.assertTrue(self.loadDb(
-            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
-        # DB should have atleast one value for scan to work
-        self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK")
-        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1")
-
-        # Dump command fails because of typo in params
-        dumpFilePath = os.path.join(self.TMP_DIR, "dump8")
-        self.assertFalse(self.dumpDb(
-            "--db=%s --create_if_missing" % origDbPath, dumpFilePath))
-
-    def testMiscAdminTask(self):
-        print "Running testMiscAdminTask..."
-        # These tests need to be improved; for example with asserts about
-        # whether compaction or level reduction actually took place.
-        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
-                "OK")
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
-
-        self.assertTrue(0 == run_err_null(
-            "./ldb compact --db=%s" % origDbPath))
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        self.assertTrue(0 == run_err_null(
-            "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath))
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        self.assertTrue(0 == run_err_null(
-            "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath))
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        self.assertTrue(0 == run_err_null(
-            "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath))
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        self.assertTrue(0 == run_err_null(
-            "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134"
-            % origDbPath))
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-        #TODO(dilip): Not sure what should be passed to WAL.Currently corrupted.
-        self.assertTrue(0 == run_err_null(
-            "./ldb dump_wal --db=%s --walfile=%s --header" % (
-                origDbPath, os.path.join(origDbPath, "LOG"))))
-        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
-
-    def testCheckConsistency(self):
-        print "Running testCheckConsistency..."
-
-        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
-        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
-        self.assertRunOK("put x2 y2", "OK")
-        self.assertRunOK("get x1", "y1")
-        self.assertRunOK("checkconsistency", "OK")
-
-        sstFilePath = my_check_output("ls %s" % os.path.join(dbPath, "*.sst"),
-                                      shell=True)
-
-        # Modify the file
-        my_check_output("echo 'evil' > %s" % sstFilePath, shell=True)
-        self.assertRunFAIL("checkconsistency")
-
-        # Delete the file
-        my_check_output("rm -f %s" % sstFilePath, shell=True)
-        self.assertRunFAIL("checkconsistency")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/src/rocksdb/tools/reduce_levels_test.cc b/src/rocksdb/tools/reduce_levels_test.cc
deleted file mode 100644
index b588b52..0000000
--- a/src/rocksdb/tools/reduce_levels_test.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-#include "rocksdb/db.h"
-#include "db/db_impl.h"
-#include "db/version_set.h"
-#include "util/logging.h"
-#include "util/testutil.h"
-#include "util/testharness.h"
-#include "util/ldb_cmd.h"
-
-namespace rocksdb {
-
-class ReduceLevelTest {
-public:
-  ReduceLevelTest() {
-    dbname_ = test::TmpDir() + "/db_reduce_levels_test";
-    DestroyDB(dbname_, Options());
-    db_ = nullptr;
-  }
-
-  Status OpenDB(bool create_if_missing, int levels,
-      int mem_table_compact_level);
-
-  Status Put(const std::string& k, const std::string& v) {
-    return db_->Put(WriteOptions(), k, v);
-  }
-
-  std::string Get(const std::string& k) {
-    ReadOptions options;
-    std::string result;
-    Status s = db_->Get(options, k, &result);
-    if (s.IsNotFound()) {
-      result = "NOT_FOUND";
-    } else if (!s.ok()) {
-      result = s.ToString();
-    }
-    return result;
-  }
-
-  Status CompactMemTable() {
-    if (db_ == nullptr) {
-      return Status::InvalidArgument("DB not opened.");
-    }
-    DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
-    return db_impl->TEST_FlushMemTable();
-  }
-
-  void CloseDB() {
-    if (db_ != nullptr) {
-      delete db_;
-      db_ = nullptr;
-    }
-  }
-
-  bool ReduceLevels(int target_level);
-
-  int FilesOnLevel(int level) {
-    std::string property;
-    ASSERT_TRUE(
-        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
-                         &property));
-    return atoi(property.c_str());
-  }
-
-private:
-  std::string dbname_;
-  DB* db_;
-};
-
-Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels,
-    int mem_table_compact_level) {
-  rocksdb::Options opt;
-  opt.num_levels = num_levels;
-  opt.create_if_missing = create_if_missing;
-  opt.max_mem_compaction_level = mem_table_compact_level;
-  rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_);
-  if (!st.ok()) {
-    fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str());
-  }
-  return st;
-}
-
-bool ReduceLevelTest::ReduceLevels(int target_level) {
-  std::vector<std::string> args = rocksdb::ReduceDBLevelsCommand::PrepareArgs(
-      dbname_, target_level, false);
-  LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(args);
-  level_reducer->Run();
-  bool is_succeed = level_reducer->GetExecuteState().IsSucceed();
-  delete level_reducer;
-  return is_succeed;
-}
-
-TEST(ReduceLevelTest, Last_Level) {
-  // create files on all levels;
-  ASSERT_OK(OpenDB(true, 4, 3));
-  ASSERT_OK(Put("aaaa", "11111"));
-  ASSERT_OK(CompactMemTable());
-  ASSERT_EQ(FilesOnLevel(3), 1);
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(3));
-  ASSERT_OK(OpenDB(true, 3, 1));
-  ASSERT_EQ(FilesOnLevel(2), 1);
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(2));
-  ASSERT_OK(OpenDB(true, 2, 1));
-  ASSERT_EQ(FilesOnLevel(1), 1);
-  CloseDB();
-}
-
-TEST(ReduceLevelTest, Top_Level) {
-  // create files on all levels;
-  ASSERT_OK(OpenDB(true, 5, 0));
-  ASSERT_OK(Put("aaaa", "11111"));
-  ASSERT_OK(CompactMemTable());
-  ASSERT_EQ(FilesOnLevel(0), 1);
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(4));
-  ASSERT_OK(OpenDB(true, 4, 0));
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(3));
-  ASSERT_OK(OpenDB(true, 3, 0));
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(2));
-  ASSERT_OK(OpenDB(true, 2, 0));
-  CloseDB();
-}
-
-TEST(ReduceLevelTest, All_Levels) {
-  // create files on all levels;
-  ASSERT_OK(OpenDB(true, 5, 1));
-  ASSERT_OK(Put("a", "a11111"));
-  ASSERT_OK(CompactMemTable());
-  ASSERT_EQ(FilesOnLevel(1), 1);
-  CloseDB();
-
-  ASSERT_OK(OpenDB(true, 5, 2));
-  ASSERT_OK(Put("b", "b11111"));
-  ASSERT_OK(CompactMemTable());
-  ASSERT_EQ(FilesOnLevel(1), 1);
-  ASSERT_EQ(FilesOnLevel(2), 1);
-  CloseDB();
-
-  ASSERT_OK(OpenDB(true, 5, 3));
-  ASSERT_OK(Put("c", "c11111"));
-  ASSERT_OK(CompactMemTable());
-  ASSERT_EQ(FilesOnLevel(1), 1);
-  ASSERT_EQ(FilesOnLevel(2), 1);
-  ASSERT_EQ(FilesOnLevel(3), 1);
-  CloseDB();
-
-  ASSERT_OK(OpenDB(true, 5, 4));
-  ASSERT_OK(Put("d", "d11111"));
-  ASSERT_OK(CompactMemTable());
-  ASSERT_EQ(FilesOnLevel(1), 1);
-  ASSERT_EQ(FilesOnLevel(2), 1);
-  ASSERT_EQ(FilesOnLevel(3), 1);
-  ASSERT_EQ(FilesOnLevel(4), 1);
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(4));
-  ASSERT_OK(OpenDB(true, 4, 0));
-  ASSERT_EQ("a11111", Get("a"));
-  ASSERT_EQ("b11111", Get("b"));
-  ASSERT_EQ("c11111", Get("c"));
-  ASSERT_EQ("d11111", Get("d"));
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(3));
-  ASSERT_OK(OpenDB(true, 3, 0));
-  ASSERT_EQ("a11111", Get("a"));
-  ASSERT_EQ("b11111", Get("b"));
-  ASSERT_EQ("c11111", Get("c"));
-  ASSERT_EQ("d11111", Get("d"));
-  CloseDB();
-
-  ASSERT_TRUE(ReduceLevels(2));
-  ASSERT_OK(OpenDB(true, 2, 0));
-  ASSERT_EQ("a11111", Get("a"));
-  ASSERT_EQ("b11111", Get("b"));
-  ASSERT_EQ("c11111", Get("c"));
-  ASSERT_EQ("d11111", Get("d"));
-  CloseDB();
-}
-
-}
-
-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
-}
diff --git a/src/rocksdb/tools/sst_dump.cc b/src/rocksdb/tools/sst_dump.cc
deleted file mode 100644
index 9a144bb..0000000
--- a/src/rocksdb/tools/sst_dump.cc
+++ /dev/null
@@ -1,367 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-
-#include <map>
-#include <string>
-#include <vector>
-#include <inttypes.h>
-
-#include "db/dbformat.h"
-#include "db/memtable.h"
-#include "db/write_batch_internal.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
-#include "rocksdb/iterator.h"
-#include "rocksdb/slice_transform.h"
-#include "rocksdb/table.h"
-#include "rocksdb/table_properties.h"
-#include "table/block_based_table_factory.h"
-#include "table/plain_table_factory.h"
-#include "table/meta_blocks.h"
-#include "table/block.h"
-#include "table/block_builder.h"
-#include "table/format.h"
-#include "util/ldb_cmd.h"
-#include "util/random.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-
-namespace rocksdb {
-
-class SstFileReader {
- public:
-  explicit SstFileReader(const std::string& file_name,
-                         bool verify_checksum,
-                         bool output_hex);
-
-  Status ReadSequential(bool print_kv,
-                        uint64_t read_num,
-                        bool has_from,
-                        const std::string& from_key,
-                        bool has_to,
-                        const std::string& to_key);
-
-  Status ReadTableProperties(
-      std::shared_ptr<const TableProperties>* table_properties);
-  uint64_t GetReadNumber() { return read_num_; }
-
- private:
-  Status NewTableReader(const std::string& file_path);
-  Status SetTableOptionsByMagicNumber(uint64_t table_magic_number,
-                                      RandomAccessFile* file,
-                                      uint64_t file_size);
-
-  std::string file_name_;
-  uint64_t read_num_;
-  bool verify_checksum_;
-  bool output_hex_;
-  EnvOptions soptions_;
-
-  Status init_result_;
-  unique_ptr<TableReader> table_reader_;
-  unique_ptr<RandomAccessFile> file_;
-  // options_ and internal_comparator_ will also be used in
-  // ReadSequential internally (specifically, seek-related operations)
-  Options options_;
-  InternalKeyComparator internal_comparator_;
-};
-
-SstFileReader::SstFileReader(const std::string& file_path,
-                             bool verify_checksum,
-                             bool output_hex)
-    :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum),
-    output_hex_(output_hex), internal_comparator_(BytewiseComparator()) {
-  fprintf(stdout, "Process %s\n", file_path.c_str());
-
-  init_result_ = NewTableReader(file_name_);
-}
-
-extern uint64_t kBlockBasedTableMagicNumber;
-extern uint64_t kPlainTableMagicNumber;
-
-Status SstFileReader::NewTableReader(const std::string& file_path) {
-  uint64_t magic_number;
-
-  // read table magic number
-  Footer footer;
-
-  unique_ptr<RandomAccessFile> file;
-  uint64_t file_size;
-  Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_);
-  if (s.ok()) {
-    s = options_.env->GetFileSize(file_path, &file_size);
-  }
-  if (s.ok()) {
-    s = ReadFooterFromFile(file_.get(), file_size, &footer);
-  }
-  if (s.ok()) {
-    magic_number = footer.table_magic_number();
-  }
-
-  if (s.ok()) {
-    if (magic_number == kPlainTableMagicNumber) {
-      soptions_.use_mmap_reads = true;
-    }
-    options_.comparator = &internal_comparator_;
-    s = SetTableOptionsByMagicNumber(magic_number, file_.get(), file_size);
-  }
-
-  if (s.ok()) {
-    s = options_.table_factory->NewTableReader(
-        options_, soptions_, internal_comparator_, std::move(file_), file_size,
-        &table_reader_);
-  }
-  return s;
-}
-
-Status SstFileReader::SetTableOptionsByMagicNumber(uint64_t table_magic_number,
-                                                   RandomAccessFile* file,
-                                                   uint64_t file_size) {
-  TableProperties* table_properties;
-  Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number,
-                                          options_.env, options_.info_log.get(),
-                                          &table_properties);
-  if (!s.ok()) {
-    return s;
-  }
-  std::unique_ptr<TableProperties> props_guard(table_properties);
-
-  if (table_magic_number == kBlockBasedTableMagicNumber) {
-    options_.table_factory = std::make_shared<BlockBasedTableFactory>();
-    fprintf(stdout, "Sst file format: block-based\n");
-  } else if (table_magic_number == kPlainTableMagicNumber) {
-    options_.allow_mmap_reads = true;
-    options_.table_factory = std::make_shared<PlainTableFactory>(
-        table_properties->fixed_key_len, 2, 0.8);
-    options_.prefix_extractor.reset(NewNoopTransform());
-    fprintf(stdout, "Sst file format: plain table\n");
-  } else {
-    char error_msg_buffer[80];
-    snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
-             "Unsupported table magic number --- %lx",
-             (long)table_magic_number);
-    return Status::InvalidArgument(error_msg_buffer);
-  }
-
-  return Status::OK();
-}
-
-Status SstFileReader::ReadSequential(bool print_kv,
-                                     uint64_t read_num,
-                                     bool has_from,
-                                     const std::string& from_key,
-                                     bool has_to,
-                                     const std::string& to_key) {
-  if (!table_reader_) {
-    return init_result_;
-  }
-
-  Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_,
-                                                         false));
-  uint64_t i = 0;
-  if (has_from) {
-    InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek);
-    iter->Seek(ikey.Encode());
-  } else {
-    iter->SeekToFirst();
-  }
-  for (; iter->Valid(); iter->Next()) {
-    Slice key = iter->key();
-    Slice value = iter->value();
-    ++i;
-    if (read_num > 0 && i > read_num)
-      break;
-
-    ParsedInternalKey ikey;
-    if (!ParseInternalKey(key, &ikey)) {
-      std::cerr << "Internal Key ["
-                << key.ToString(true /* in hex*/)
-                << "] parse error!\n";
-      continue;
-    }
-
-    // If end marker was specified, we stop before it
-    if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
-      break;
-    }
-
-    if (print_kv) {
-      fprintf(stdout, "%s => %s\n",
-          ikey.DebugString(output_hex_).c_str(),
-          value.ToString(output_hex_).c_str());
-    }
-  }
-
-  read_num_ += i;
-
-  Status ret = iter->status();
-  delete iter;
-  return ret;
-}
-
-Status SstFileReader::ReadTableProperties(
-    std::shared_ptr<const TableProperties>* table_properties) {
-  if (!table_reader_) {
-    return init_result_;
-  }
-
-  *table_properties = table_reader_->GetTableProperties();
-  return init_result_;
-}
-
-}  // namespace rocksdb
-
-static void print_help() {
-  fprintf(stderr,
-      "sst_dump [--command=check|scan] [--verify_checksum] "
-      "--file=data_dir_OR_sst_file"
-      " [--output_hex]"
-      " [--input_key_hex]"
-      " [--from=<user_key>]"
-      " [--to=<user_key>]"
-      " [--read_num=NUM]"
-      " [--show_properties]\n");
-}
-
-namespace {
-string HexToString(const string& str) {
-  string parsed;
-  if (str[0] != '0' || str[1] != 'x') {
-    fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
-            str.c_str());
-    throw "Invalid hex input";
-  }
-
-  for (unsigned int i = 2; i < str.length();) {
-    int c;
-    sscanf(str.c_str() + i, "%2X", &c);
-    parsed.push_back(c);
-    i += 2;
-  }
-  return parsed;
-}
-}  // namespace
-
-int main(int argc, char** argv) {
-  const char* dir_or_file = nullptr;
-  uint64_t read_num = -1;
-  std::string command;
-
-  char junk;
-  uint64_t n;
-  bool verify_checksum = false;
-  bool output_hex = false;
-  bool input_key_hex = false;
-  bool has_from = false;
-  bool has_to = false;
-  bool show_properties = false;
-  std::string from_key;
-  std::string to_key;
-  for (int i = 1; i < argc; i++) {
-    if (strncmp(argv[i], "--file=", 7) == 0) {
-      dir_or_file = argv[i] + 7;
-    } else if (strcmp(argv[i], "--output_hex") == 0) {
-      output_hex = true;
-    } else if (strcmp(argv[i], "--input_key_hex") == 0) {
-      input_key_hex = true;
-    } else if (sscanf(argv[i],
-               "--read_num=%lu%c",
-               (unsigned long*)&n, &junk) == 1) {
-      read_num = n;
-    } else if (strcmp(argv[i], "--verify_checksum") == 0) {
-      verify_checksum = true;
-    } else if (strncmp(argv[i], "--command=", 10) == 0) {
-      command = argv[i] + 10;
-    } else if (strncmp(argv[i], "--from=", 7) == 0) {
-      from_key = argv[i] + 7;
-      has_from = true;
-    } else if (strncmp(argv[i], "--to=", 5) == 0) {
-      to_key = argv[i] + 5;
-      has_to = true;
-    } else if (strcmp(argv[i], "--show_properties") == 0) {
-      show_properties = true;
-    } else {
-      print_help();
-      exit(1);
-    }
-  }
-
-
-  if (input_key_hex) {
-    if (has_from) {
-      from_key = HexToString(from_key);
-    }
-    if (has_to) {
-      to_key = HexToString(to_key);
-    }
-  }
-
-  if (dir_or_file == nullptr) {
-    print_help();
-    exit(1);
-  }
-
-  std::vector<std::string> filenames;
-  rocksdb::Env* env = rocksdb::Env::Default();
-  rocksdb::Status st = env->GetChildren(dir_or_file, &filenames);
-  bool dir = true;
-  if (!st.ok()) {
-    filenames.clear();
-    filenames.push_back(dir_or_file);
-    dir = false;
-  }
-
-  fprintf(stdout, "from [%s] to [%s]\n",
-      rocksdb::Slice(from_key).ToString(true).c_str(),
-      rocksdb::Slice(to_key).ToString(true).c_str());
-
-  uint64_t total_read = 0;
-  for (size_t i = 0; i < filenames.size(); i++) {
-    std::string filename = filenames.at(i);
-    if (filename.length() <= 4 ||
-        filename.rfind(".sst") != filename.length() - 4) {
-      // ignore
-      continue;
-    }
-    if (dir) {
-      filename = std::string(dir_or_file) + "/" + filename;
-    }
-    rocksdb::SstFileReader reader(filename, verify_checksum,
-                                  output_hex);
-    rocksdb::Status st;
-    // scan all files in give file path.
-    if (command == "" || command == "scan" || command == "check") {
-      st = reader.ReadSequential(command != "check",
-                                 read_num > 0 ? (read_num - total_read) :
-                                                read_num,
-                                 has_from, from_key, has_to, to_key);
-      if (!st.ok()) {
-        fprintf(stderr, "%s: %s\n", filename.c_str(),
-            st.ToString().c_str());
-      }
-      total_read += reader.GetReadNumber();
-      if (read_num > 0 && total_read > read_num) {
-        break;
-      }
-    }
-    if (show_properties) {
-      std::shared_ptr<const rocksdb::TableProperties> table_properties;
-      st = reader.ReadTableProperties(&table_properties);
-      if (!st.ok()) {
-        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
-      } else {
-        fprintf(stdout,
-                "Table Properties:\n"
-                "------------------------------\n"
-                "  %s",
-                table_properties->ToString("\n  ", ": ").c_str());
-        fprintf(stdout, "# deleted keys: %zd\n",
-                rocksdb::GetDeletedKeys(
-                    table_properties->user_collected_properties));
-      }
-    }
-  }
-}
diff --git a/src/rocksdb/util/allocator.h b/src/rocksdb/util/allocator.h
new file mode 100644
index 0000000..58bf0da
--- /dev/null
+++ b/src/rocksdb/util/allocator.h
@@ -0,0 +1,32 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Abstract interface for allocating memory in blocks. This memory is freed
+// when the allocator object is destroyed. See the Arena class for more info.
+
+#pragma once
+#include <cstddef>
+#include <cerrno>
+
+namespace rocksdb {
+
+class Logger;
+
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+
+  virtual char* Allocate(size_t bytes) = 0;
+  virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                                Logger* logger = nullptr) = 0;
+
+  virtual size_t BlockSize() const = 0;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/arena.cc b/src/rocksdb/util/arena.cc
index 0e36bb5..3f00f08 100644
--- a/src/rocksdb/util/arena.cc
+++ b/src/rocksdb/util/arena.cc
@@ -10,9 +10,11 @@
 #include "util/arena.h"
 #include <sys/mman.h>
 #include <algorithm>
+#include "rocksdb/env.h"
 
 namespace rocksdb {
 
+const size_t Arena::kInlineSize;
 const size_t Arena::kMinBlockSize = 4096;
 const size_t Arena::kMaxBlockSize = 2 << 30;
 static const int kAlignUnit = sizeof(void*);
@@ -30,9 +32,20 @@ size_t OptimizeBlockSize(size_t block_size) {
   return block_size;
 }
 
-Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) {
+Arena::Arena(size_t block_size, size_t huge_page_size)
+    : kBlockSize(OptimizeBlockSize(block_size)) {
   assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
          kBlockSize % kAlignUnit == 0);
+  alloc_bytes_remaining_ = sizeof(inline_block_);
+  blocks_memory_ += alloc_bytes_remaining_;
+  aligned_alloc_ptr_ = inline_block_;
+  unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_;
+#ifdef MAP_HUGETLB
+  hugetlb_size_ = huge_page_size;
+  if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
+    hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
+  }
+#endif
 }
 
 Arena::~Arena() {
@@ -56,39 +69,69 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   }
 
   // We waste the remaining space in the current block.
-  auto block_head = AllocateNewBlock(kBlockSize);
-  alloc_bytes_remaining_ = kBlockSize - bytes;
+  size_t size;
+  char* block_head = nullptr;
+  if (hugetlb_size_) {
+    size = hugetlb_size_;
+    block_head = AllocateFromHugePage(size);
+  }
+  if (!block_head) {
+    size = kBlockSize;
+    block_head = AllocateNewBlock(size);
+  }
+  alloc_bytes_remaining_ = size - bytes;
 
   if (aligned) {
     aligned_alloc_ptr_ = block_head + bytes;
-    unaligned_alloc_ptr_ = block_head + kBlockSize;
+    unaligned_alloc_ptr_ = block_head + size;
     return block_head;
   } else {
     aligned_alloc_ptr_ = block_head;
-    unaligned_alloc_ptr_ = block_head + kBlockSize - bytes;
+    unaligned_alloc_ptr_ = block_head + size - bytes;
     return unaligned_alloc_ptr_;
   }
 }
 
-char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) {
+char* Arena::AllocateFromHugePage(size_t bytes) {
+#ifdef MAP_HUGETLB
+  if (hugetlb_size_ == 0) {
+    return nullptr;
+  }
+
+  void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
+                    (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
+
+  if (addr == MAP_FAILED) {
+    return nullptr;
+  }
+  huge_blocks_.push_back(MmapInfo(addr, bytes));
+  blocks_memory_ += bytes;
+  return reinterpret_cast<char*>(addr);
+#else
+  return nullptr;
+#endif
+}
+
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
+                             Logger* logger) {
   assert((kAlignUnit & (kAlignUnit - 1)) ==
          0);  // Pointer size should be a power of 2
 
-#ifdef OS_LINUX
-  if (huge_page_tlb_size > 0 && bytes > 0) {
+#ifdef MAP_HUGETLB
+  if (huge_page_size > 0 && bytes > 0) {
     // Allocate from a huge page TBL table.
+    assert(logger != nullptr);  // logger need to be passed in.
     size_t reserved_size =
-        ((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size;
+        ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
     assert(reserved_size >= bytes);
-    void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
-                      (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
-    if (addr == MAP_FAILED) {
-      // TODO(sdong): Better handling
+
+    char* addr = AllocateFromHugePage(reserved_size);
+    if (addr == nullptr) {
+      Warn(logger, "AllocateAligned fail to allocate huge TLB pages: %s",
+           strerror(errno));
       // fail back to malloc
     } else {
-      blocks_memory_ += reserved_size;
-      huge_blocks_.push_back(MmapInfo(addr, reserved_size));
-      return reinterpret_cast<char*>(addr);
+      return addr;
     }
   }
 #endif
diff --git a/src/rocksdb/util/arena.h b/src/rocksdb/util/arena.h
index a4dff49..1ae50e2 100644
--- a/src/rocksdb/util/arena.h
+++ b/src/rocksdb/util/arena.h
@@ -7,41 +7,52 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-// Arena is an implementation of Arena class. For a request of small size,
+// Arena is an implementation of Allocator class. For a request of small size,
 // it allocates a block with pre-defined block size. For a request of big
 // size, it uses malloc to directly get the requested size.
 
 #pragma once
 #include <cstddef>
+#include <cerrno>
 #include <vector>
 #include <assert.h>
 #include <stdint.h>
-#include "util/arena.h"
+#include "util/allocator.h"
 
 namespace rocksdb {
 
-class Arena {
+class Arena : public Allocator {
  public:
   // No copying allowed
   Arena(const Arena&) = delete;
   void operator=(const Arena&) = delete;
 
+  static const size_t kInlineSize = 2048;
   static const size_t kMinBlockSize;
   static const size_t kMaxBlockSize;
 
-  explicit Arena(size_t block_size = kMinBlockSize);
+  // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the
+  // supported hugepage size of the system), block allocation will try huge
+  // page TLB first. If allocation fails, will fall back to normal case.
+  explicit Arena(size_t block_size = kMinBlockSize, size_t huge_page_size = 0);
   ~Arena();
 
-  char* Allocate(size_t bytes);
+  char* Allocate(size_t bytes) override;
 
-  // huge_page_tlb_size: if >0, allocate bytes from huge page TLB and the size
-  // of the huge page TLB. Bytes will be rounded up to multiple and 2MB and
-  // allocate huge pages through mmap anonymous option with huge page on.
-  // The extra  space allocated will be wasted. To enable it, need to reserve
-  // huge pages for it to be allocated, like:
+  // huge_page_size: if >0, will try to allocate from huage page TLB.
+  // The argument will be the size of the page size for huge page TLB. Bytes
+  // will be rounded up to multiple of the page size to allocate through mmap
+  // anonymous option with huge page on. The extra  space allocated will be
+  // wasted. If allocation fails, will fall back to normal case. To enable it,
+  // need to reserve huge pages for it to be allocated, like:
   //     sysctl -w vm.nr_hugepages=20
   // See linux doc Documentation/vm/hugetlbpage.txt for details.
-  char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0);
+  // huge page allocation can fail. In this case it will fail back to
+  // normal cases. The messages will be logged to logger. So when calling with
+  // huge_page_tlb_size > 0, we highly recommend a logger is passed in.
+  // Otherwise, the error message will be printed out to stderr directly.
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr) override;
 
   // Returns an estimate of the total memory usage of data allocated
   // by the arena (exclude the space allocated but not yet used for future
@@ -57,11 +68,12 @@ class Arena {
 
   // If an allocation is too big, we'll allocate an irregular block with the
   // same size of that allocation.
-  virtual size_t IrregularBlockNum() const { return irregular_block_num; }
+  size_t IrregularBlockNum() const { return irregular_block_num; }
 
-  size_t BlockSize() const { return kBlockSize; }
+  size_t BlockSize() const override { return kBlockSize; }
 
  private:
+  char inline_block_[kInlineSize];
   // Number of bytes allocated in one block
   const size_t kBlockSize;
   // Array of new[] allocated memory blocks
@@ -87,6 +99,8 @@ class Arena {
   // How many bytes left in currently active block?
   size_t alloc_bytes_remaining_ = 0;
 
+  size_t hugetlb_size_ = 0;
+  char* AllocateFromHugePage(size_t bytes);
   char* AllocateFallback(size_t bytes, bool aligned);
   char* AllocateNewBlock(size_t block_bytes);
 
diff --git a/src/rocksdb/util/arena_test.cc b/src/rocksdb/util/arena_test.cc
index 1b2b531..a3b96bb 100644
--- a/src/rocksdb/util/arena_test.cc
+++ b/src/rocksdb/util/arena_test.cc
@@ -13,17 +13,21 @@
 
 namespace rocksdb {
 
-class ArenaTest {};
+namespace {
+const size_t kHugePageSize = 2 * 1024 * 1024;
+}  // namespace
+class ArenaTest : public testing::Test {};
 
-TEST(ArenaTest, Empty) { Arena arena0; }
+TEST_F(ArenaTest, Empty) { Arena arena0; }
 
-TEST(ArenaTest, MemoryAllocatedBytes) {
+namespace {
+void MemoryAllocatedBytesTest(size_t huge_page_size) {
   const int N = 17;
   size_t req_sz;  // requested size
   size_t bsz = 8192;  // block size
   size_t expected_memory_allocated;
 
-  Arena arena(bsz);
+  Arena arena(bsz, huge_page_size);
 
   // requested size > quarter of a block:
   //   allocate requested size separately
@@ -31,9 +35,11 @@ TEST(ArenaTest, MemoryAllocatedBytes) {
   for (int i = 0; i < N; i++) {
     arena.Allocate(req_sz);
   }
-  expected_memory_allocated = req_sz * N;
+  expected_memory_allocated = req_sz * N + Arena::kInlineSize;
   ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
 
+  arena.Allocate(Arena::kInlineSize - 1);
+
   // requested size < quarter of a block:
   //   allocate a block with the default size, then try to use unused part
   //   of the block. So one new block will be allocated for the first
@@ -42,8 +48,15 @@ TEST(ArenaTest, MemoryAllocatedBytes) {
   for (int i = 0; i < N; i++) {
     arena.Allocate(req_sz);
   }
-  expected_memory_allocated += bsz;
-  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
+  if (huge_page_size) {
+    ASSERT_TRUE(arena.MemoryAllocatedBytes() ==
+                    expected_memory_allocated + bsz ||
+                arena.MemoryAllocatedBytes() ==
+                    expected_memory_allocated + huge_page_size);
+  } else {
+    expected_memory_allocated += bsz;
+    ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
+  }
 
   // requested size > quarter of a block:
   //   allocate requested size separately
@@ -52,24 +65,43 @@ TEST(ArenaTest, MemoryAllocatedBytes) {
     arena.Allocate(req_sz);
   }
   expected_memory_allocated += req_sz * N;
-  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
+  if (huge_page_size) {
+    ASSERT_TRUE(arena.MemoryAllocatedBytes() ==
+                    expected_memory_allocated + bsz ||
+                arena.MemoryAllocatedBytes() ==
+                    expected_memory_allocated + huge_page_size);
+  } else {
+    ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
+  }
 }
 
 // Make sure we didn't count the allocate but not used memory space in
 // Arena::ApproximateMemoryUsage()
-TEST(ArenaTest, ApproximateMemoryUsageTest) {
+static void ApproximateMemoryUsageTest(size_t huge_page_size) {
   const size_t kBlockSize = 4096;
   const size_t kEntrySize = kBlockSize / 8;
   const size_t kZero = 0;
-  Arena arena(kBlockSize);
+  Arena arena(kBlockSize, huge_page_size);
   ASSERT_EQ(kZero, arena.ApproximateMemoryUsage());
 
+  // allocate inline bytes
+  arena.AllocateAligned(8);
+  arena.AllocateAligned(Arena::kInlineSize / 2 - 16);
+  arena.AllocateAligned(Arena::kInlineSize / 2);
+  ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - 8);
+  ASSERT_EQ(arena.MemoryAllocatedBytes(), Arena::kInlineSize);
+
   auto num_blocks = kBlockSize / kEntrySize;
 
   // first allocation
   arena.AllocateAligned(kEntrySize);
   auto mem_usage = arena.MemoryAllocatedBytes();
-  ASSERT_EQ(mem_usage, kBlockSize);
+  if (huge_page_size) {
+    ASSERT_TRUE(mem_usage == kBlockSize + Arena::kInlineSize ||
+                mem_usage == huge_page_size + Arena::kInlineSize);
+  } else {
+    ASSERT_EQ(mem_usage, kBlockSize + Arena::kInlineSize);
+  }
   auto usage = arena.ApproximateMemoryUsage();
   ASSERT_LT(usage, mem_usage);
   for (size_t i = 1; i < num_blocks; ++i) {
@@ -78,12 +110,17 @@ TEST(ArenaTest, ApproximateMemoryUsageTest) {
     ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize);
     usage = arena.ApproximateMemoryUsage();
   }
-  ASSERT_GT(usage, mem_usage);
+  if (huge_page_size) {
+    ASSERT_TRUE(usage > mem_usage ||
+                usage + huge_page_size - kBlockSize == mem_usage);
+  } else {
+    ASSERT_GT(usage, mem_usage);
+  }
 }
 
-TEST(ArenaTest, Simple) {
+static void SimpleTest(size_t huge_page_size) {
   std::vector<std::pair<size_t, char*>> allocated;
-  Arena arena;
+  Arena arena(Arena::kMinBlockSize, huge_page_size);
   const int N = 100000;
   size_t bytes = 0;
   Random rnd(301);
@@ -127,7 +164,25 @@ TEST(ArenaTest, Simple) {
     }
   }
 }
+}  // namespace
+
+TEST_F(ArenaTest, MemoryAllocatedBytes) {
+  MemoryAllocatedBytesTest(0);
+  MemoryAllocatedBytesTest(kHugePageSize);
+}
 
+TEST_F(ArenaTest, ApproximateMemoryUsage) {
+  ApproximateMemoryUsageTest(0);
+  ApproximateMemoryUsageTest(kHugePageSize);
+}
+
+TEST_F(ArenaTest, Simple) {
+  SimpleTest(0);
+  SimpleTest(kHugePageSize);
+}
 }  // namespace rocksdb
 
-int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/auto_roll_logger.cc b/src/rocksdb/util/auto_roll_logger.cc
index 19c2b8c..684abfc 100644
--- a/src/rocksdb/util/auto_roll_logger.cc
+++ b/src/rocksdb/util/auto_roll_logger.cc
@@ -18,8 +18,7 @@ Status AutoRollLogger::ResetLogger() {
     return status_;
   }
 
-  if (logger_->GetLogFileSize() ==
-      (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) {
+  if (logger_->GetLogFileSize() == Logger::kDoNotSupportGetLogFileSize) {
     status_ = Status::NotSupported(
         "The underlying logger doesn't support GetLogFileSize()");
   }
@@ -38,6 +37,27 @@ void AutoRollLogger::RollLogFile() {
   env_->RenameFile(log_fname_, old_fname);
 }
 
+string AutoRollLogger::ValistToString(const char* format, va_list args) const {
+  // Any log messages longer than 1024 will get truncated.
+  // The user is responsible for chopping longer messages into multi line log
+  static const int MAXBUFFERSIZE = 1024;
+  char buffer[MAXBUFFERSIZE];
+
+  int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args);
+  (void) count;
+  assert(count >= 0);
+
+  return buffer;
+}
+
+void AutoRollLogger::LogInternal(const char* format, ...) {
+  mutex_.AssertHeld();
+  va_list args;
+  va_start(args, format);
+  logger_->Logv(format, args);
+  va_end(args);
+}
+
 void AutoRollLogger::Logv(const char* format, va_list ap) {
   assert(GetStatus().ok());
 
@@ -52,6 +72,8 @@ void AutoRollLogger::Logv(const char* format, va_list ap) {
         // can't really log the error if creating a new LOG file failed
         return;
       }
+
+      WriteHeaderInfo();
     }
 
     // pin down the current logger_ instance before releasing the mutex.
@@ -67,6 +89,29 @@ void AutoRollLogger::Logv(const char* format, va_list ap) {
   logger->Logv(format, ap);
 }
 
+void AutoRollLogger::WriteHeaderInfo() {
+  mutex_.AssertHeld();
+  for (auto header : headers_) {
+    LogInternal("%s", header.c_str());
+  }
+}
+
+void AutoRollLogger::LogHeader(const char* format, va_list args) {
+  // header message are to be retained in memory. Since we cannot make any
+  // assumptions about the data contained in va_list, we will retain them as
+  // strings
+  va_list tmp;
+  va_copy(tmp, args);
+  string data = ValistToString(format, tmp);
+  va_end(tmp);
+
+  MutexLock l(&mutex_);
+  headers_.push_back(data);
+
+  // Log the original message to the current log
+  logger_->Logv(format, args);
+}
+
 bool AutoRollLogger::LogExpired() {
   if (cached_now_access_count >= call_NowMicros_every_N_records_) {
     cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6);
@@ -87,6 +132,7 @@ Status CreateLoggerFromOptions(
   env->GetAbsolutePath(dbname, &db_absolute_path);
   std::string fname = InfoLogFileName(dbname, db_absolute_path, db_log_dir);
 
+  env->CreateDirIfMissing(dbname);  // In case it does not exist
   // Currently we only support roll by time-to-roll and log size
   if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) {
     AutoRollLogger* result = new AutoRollLogger(
@@ -102,7 +148,6 @@ Status CreateLoggerFromOptions(
     return s;
   } else {
     // Open a log file in the same directory as the db
-    env->CreateDir(dbname);  // In case it does not exist
     env->RenameFile(fname, OldInfoLogFileName(dbname, env->NowMicros(),
                                               db_absolute_path, db_log_dir));
     auto s = env->NewLogger(fname, logger);
diff --git a/src/rocksdb/util/auto_roll_logger.h b/src/rocksdb/util/auto_roll_logger.h
index c592d79..e8bb596 100644
--- a/src/rocksdb/util/auto_roll_logger.h
+++ b/src/rocksdb/util/auto_roll_logger.h
@@ -7,6 +7,8 @@
 // where enough posix functionality is available.
 
 #pragma once
+#include <list>
+
 #include "db/filename.h"
 #include "port/port.h"
 #include "util/posix_logger.h"
@@ -38,15 +40,24 @@ class AutoRollLogger : public Logger {
     ResetLogger();
   }
 
-  void Logv(const char* format, va_list ap);
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override;
+
+  // Write a header entry to the log. All header information will be written
+  // again every time the log rolls over.
+  virtual void LogHeader(const char* format, va_list ap) override;
 
   // check if the logger has encountered any problem.
   Status GetStatus() {
     return status_;
   }
 
-  size_t GetLogFileSize() const {
-    return logger_->GetLogFileSize();
+  size_t GetLogFileSize() const override { return logger_->GetLogFileSize(); }
+
+  void Flush() override {
+    if (logger_) {
+      logger_->Flush();
+    }
   }
 
   virtual ~AutoRollLogger() {
@@ -56,11 +67,21 @@ class AutoRollLogger : public Logger {
     call_NowMicros_every_N_records_ = call_NowMicros_every_N_records;
   }
 
- private:
+  // Expose the log file path for testing purpose
+  std::string TEST_log_fname() const {
+    return log_fname_;
+  }
 
+ private:
   bool LogExpired();
   Status ResetLogger();
   void RollLogFile();
+  // Log message to logger without rolling
+  void LogInternal(const char* format, ...);
+  // Serialize the va_list to a string
+  std::string ValistToString(const char* format, va_list args) const;
+  // Write the logs marked as headers to the new log file
+  void WriteHeaderInfo();
 
   std::string log_fname_; // Current active info log's file name.
   std::string dbname_;
@@ -72,6 +93,8 @@ class AutoRollLogger : public Logger {
   Status status_;
   const size_t kMaxLogFileSize;
   const size_t kLogFileTimeToRoll;
+  // header information
+  std::list<std::string> headers_;
   // to avoid frequent env->NowMicros() calls, we cached the current time
   uint64_t cached_now;
   uint64_t ctime_;
diff --git a/src/rocksdb/util/auto_roll_logger_test.cc b/src/rocksdb/util/auto_roll_logger_test.cc
old mode 100755
new mode 100644
index c49894f..6733a62
--- a/src/rocksdb/util/auto_roll_logger_test.cc
+++ b/src/rocksdb/util/auto_roll_logger_test.cc
@@ -4,6 +4,7 @@
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
 #include <string>
+#include <vector>
 #include <cmath>
 #include <iostream>
 #include <fstream>
@@ -19,7 +20,7 @@ using namespace std;
 
 namespace rocksdb {
 
-class AutoRollLoggerTest {
+class AutoRollLoggerTest : public testing::Test {
  public:
   static void InitTestDb() {
     string deleteCmd = "rm -rf " + kTestDir;
@@ -102,7 +103,7 @@ uint64_t AutoRollLoggerTest::RollLogFileByTimeTest(
   uint64_t expected_create_time;
   uint64_t actual_create_time;
   uint64_t total_log_size;
-  ASSERT_OK(env->GetFileSize(kLogFile, &total_log_size));
+  EXPECT_OK(env->GetFileSize(kLogFile, &total_log_size));
   GetFileCreateTime(kLogFile, &expected_create_time);
   logger->SetCallNowMicrosEveryNRecords(0);
 
@@ -110,31 +111,31 @@ uint64_t AutoRollLoggerTest::RollLogFileByTimeTest(
   // to be finished before time.
   for (int i = 0; i < 10; ++i) {
      LogMessage(logger, log_message.c_str());
-     ASSERT_OK(logger->GetStatus());
+     EXPECT_OK(logger->GetStatus());
      // Make sure we always write to the same log file (by
      // checking the create time);
      GetFileCreateTime(kLogFile, &actual_create_time);
 
      // Also make sure the log size is increasing.
-     ASSERT_EQ(expected_create_time, actual_create_time);
-     ASSERT_GT(logger->GetLogFileSize(), total_log_size);
+     EXPECT_EQ(expected_create_time, actual_create_time);
+     EXPECT_GT(logger->GetLogFileSize(), total_log_size);
      total_log_size = logger->GetLogFileSize();
   }
 
   // -- Make the log file expire
-  sleep(time);
+  sleep(static_cast<unsigned int>(time));
   LogMessage(logger, log_message.c_str());
 
   // At this time, the new log file should be created.
   GetFileCreateTime(kLogFile, &actual_create_time);
-  ASSERT_GT(actual_create_time, expected_create_time);
-  ASSERT_LT(logger->GetLogFileSize(), total_log_size);
+  EXPECT_GT(actual_create_time, expected_create_time);
+  EXPECT_LT(logger->GetLogFileSize(), total_log_size);
   expected_create_time = actual_create_time;
 
   return expected_create_time;
 }
 
-TEST(AutoRollLoggerTest, RollLogFileBySize) {
+TEST_F(AutoRollLoggerTest, RollLogFileBySize) {
     InitTestDb();
     size_t log_max_size = 1024 * 5;
 
@@ -144,21 +145,20 @@ TEST(AutoRollLoggerTest, RollLogFileBySize) {
                           kSampleMessage + ":RollLogFileBySize");
 }
 
-TEST(AutoRollLoggerTest, RollLogFileByTime) {
-    size_t time = 1;
+TEST_F(AutoRollLoggerTest, RollLogFileByTime) {
+    size_t time = 2;
     size_t log_size = 1024 * 5;
 
     InitTestDb();
     // -- Test the existence of file during the server restart.
     ASSERT_TRUE(!env->FileExists(kLogFile));
-    AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 1);
+    AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, time);
     ASSERT_TRUE(env->FileExists(kLogFile));
 
     RollLogFileByTimeTest(&logger, time, kSampleMessage + ":RollLogFileByTime");
 }
 
-TEST(AutoRollLoggerTest,
-     OpenLogFilesMultipleTimesWithOptionLog_max_size) {
+TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) {
   // If only 'log_max_size' options is specified, then every time
   // when rocksdb is restarted, a new empty log file will be created.
   InitTestDb();
@@ -183,8 +183,8 @@ TEST(AutoRollLoggerTest,
   delete logger;
 }
 
-TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
-  size_t time = 1, log_max_size = 1024 * 5;
+TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
+  size_t time = 2, log_max_size = 1024 * 5;
 
   InitTestDb();
 
@@ -200,7 +200,7 @@ TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
       kSampleMessage + ":CompositeRollByTimeAndSizeLogger");
 }
 
-TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
+TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) {
   DBOptions options;
   shared_ptr<Logger> logger;
 
@@ -222,7 +222,7 @@ TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
   // Only roll by Time
   InitTestDb();
   options.max_log_file_size = 0;
-  options.log_file_time_to_roll = 1;
+  options.log_file_time_to_roll = 2;
   ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
   auto_roll_logger =
     dynamic_cast<AutoRollLogger*>(logger.get());
@@ -233,7 +233,7 @@ TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
   // roll by both Time and size
   InitTestDb();
   options.max_log_file_size = 1024 * 5;
-  options.log_file_time_to_roll = 1;
+  options.log_file_time_to_roll = 2;
   ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger));
   auto_roll_logger =
     dynamic_cast<AutoRollLogger*>(logger.get());
@@ -245,7 +245,7 @@ TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
       kSampleMessage + ":CreateLoggerFromOptions - both");
 }
 
-TEST(AutoRollLoggerTest, InfoLogLevel) {
+TEST_F(AutoRollLoggerTest, InfoLogLevel) {
   InitTestDb();
 
   size_t log_size = 8192;
@@ -285,8 +285,103 @@ TEST(AutoRollLoggerTest, InfoLogLevel) {
   inFile.close();
 }
 
+// Test the logger Header function for roll over logs
+// We expect the new logs creates as roll over to carry the headers specified
+static list<string> GetOldFileNames(const string& path) {
+  const string dirname = path.substr(/*start=*/ 0, path.find_last_of("/"));
+  const string fname = path.substr(path.find_last_of("/") + 1);
+
+  vector<string> children;
+  Env::Default()->GetChildren(dirname, &children);
+
+  // We know that the old log files are named [path]<something>
+  // Return all entities that match the pattern
+  list<string> ret;
+  for (auto child : children) {
+    if (fname != child && child.find(fname) == 0) {
+      ret.push_back(dirname + "/" + child);
+    }
+  }
+
+  return ret;
+}
+
+// Return the number of lines where a given pattern was found in the file
+static size_t GetLinesCount(const string& fname, const string& pattern) {
+  stringstream ssbuf;
+  string line;
+  size_t count = 0;
+
+  ifstream inFile(fname.c_str());
+  ssbuf << inFile.rdbuf();
+
+  while (getline(ssbuf, line)) {
+    if (line.find(pattern) != std::string::npos) {
+      count++;
+    }
+  }
+
+  return count;
+}
+
+TEST_F(AutoRollLoggerTest, LogHeaderTest) {
+  static const size_t MAX_HEADERS = 10;
+  static const size_t LOG_MAX_SIZE = 1024 * 5;
+  static const std::string HEADER_STR = "Log header line";
+
+  InitTestDb();
+
+  AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "",
+                        LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0);
+
+  // log some headers
+  for (size_t i = 0; i < MAX_HEADERS; i++) {
+    Header(&logger, "%s %d", HEADER_STR.c_str(), i);
+  }
+
+  const string& newfname = logger.TEST_log_fname().c_str();
+
+  // log enough data to cause a roll over
+  int i = 0;
+  for (size_t iter = 0; iter < 2; iter++) {
+    while (logger.GetLogFileSize() < LOG_MAX_SIZE) {
+      Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i);
+      ++i;
+    }
+
+    Info(&logger, "Rollover");
+  }
+
+  // Flus the log for the latest file
+  LogFlush(&logger);
+
+  const list<string> oldfiles = GetOldFileNames(newfname);
+
+  ASSERT_EQ(oldfiles.size(), (size_t) 2);
+
+  for (auto oldfname : oldfiles) {
+    // verify that the files rolled over
+    ASSERT_NE(oldfname, newfname);
+    // verify that the old log contains all the header logs
+    ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS);
+  }
+}
+
+TEST_F(AutoRollLoggerTest, LogFileExistence) {
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  string deleteCmd = "rm -rf " + kTestDir;
+  ASSERT_EQ(system(deleteCmd.c_str()), 0);
+  options.max_log_file_size = 100 * 1024 * 1024;
+  options.create_if_missing = true;
+  ASSERT_OK(rocksdb::DB::Open(options, kTestDir, &db));
+  ASSERT_TRUE(env->FileExists(kLogFile));
+  delete db;
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/autovector.h b/src/rocksdb/util/autovector.h
index 212073e..c9befe9 100644
--- a/src/rocksdb/util/autovector.h
+++ b/src/rocksdb/util/autovector.h
@@ -24,7 +24,7 @@ class autovector : public std::vector<T> {};
 // full-fledged generic container.
 //
 // Currently we don't support:
-//  * reserve()/shrink_to_fit()/resize()
+//  * reserve()/shrink_to_fit()
 //     If used correctly, in most cases, people should not touch the
 //     underlying vector at all.
 //  * random insert()/erase(), please only use push_back()/pop_back().
@@ -67,26 +67,26 @@ class autovector {
     iterator_impl& operator=(const iterator_impl&) = default;
 
     // -- Advancement
-    // iterator++
+    // ++iterator
     self_type& operator++() {
       ++index_;
       return *this;
     }
 
-    // ++iterator
+    // iterator++
     self_type operator++(int) {
       auto old = *this;
       ++index_;
       return old;
     }
 
-    // iterator--
+    // --iterator
     self_type& operator--() {
       --index_;
       return *this;
     }
 
-    // --iterator
+    // iterator--
     self_type operator--(int) {
       auto old = *this;
       --index_;
@@ -176,29 +176,37 @@ class autovector {
 
   size_type size() const { return num_stack_items_ + vect_.size(); }
 
+  // resize does not guarantee anything about the contents of the newly
+  // available elements
+  void resize(size_type n) {
+    if (n > kSize) {
+      vect_.resize(n - kSize);
+      num_stack_items_ = kSize;
+    } else {
+      vect_.clear();
+      num_stack_items_ = n;
+    }
+  }
+
   bool empty() const { return size() == 0; }
 
-  // will not check boundry
   const_reference operator[](size_type n) const {
+    assert(n < size());
     return n < kSize ? values_[n] : vect_[n - kSize];
   }
 
   reference operator[](size_type n) {
+    assert(n < size());
     return n < kSize ? values_[n] : vect_[n - kSize];
   }
 
-  // will check boundry
   const_reference at(size_type n) const {
-    if (n >= size()) {
-      throw std::out_of_range("autovector: index out of range");
-    }
+    assert(n < size());
     return (*this)[n];
   }
 
   reference at(size_type n) {
-    if (n >= size()) {
-      throw std::out_of_range("autovector: index out of range");
-    }
+    assert(n < size());
     return (*this)[n];
   }
 
diff --git a/src/rocksdb/util/autovector_test.cc b/src/rocksdb/util/autovector_test.cc
index 88744cf..c597e36 100644
--- a/src/rocksdb/util/autovector_test.cc
+++ b/src/rocksdb/util/autovector_test.cc
@@ -5,9 +5,11 @@
 
 #include <atomic>
 #include <iostream>
+#include <utility>
 
 #include "rocksdb/env.h"
 #include "util/autovector.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
@@ -15,10 +17,10 @@ namespace rocksdb {
 
 using namespace std;
 
-class AutoVectorTest { };
+class AutoVectorTest : public testing::Test {};
 
 const unsigned long kSize = 8;
-TEST(AutoVectorTest, PushBackAndPopBack) {
+TEST_F(AutoVectorTest, PushBackAndPopBack) {
   autovector<size_t, kSize> vec;
   ASSERT_TRUE(vec.empty());
   ASSERT_EQ(0ul, vec.size());
@@ -47,12 +49,12 @@ TEST(AutoVectorTest, PushBackAndPopBack) {
   ASSERT_TRUE(vec.empty());
 }
 
-TEST(AutoVectorTest, EmplaceBack) {
-  typedef std::pair<size_t, std::string> ValueType;
-  autovector<ValueType, kSize> vec;
+TEST_F(AutoVectorTest, EmplaceBack) {
+  typedef std::pair<size_t, std::string> ValType;
+  autovector<ValType, kSize> vec;
 
   for (size_t i = 0; i < 1000 * kSize; ++i) {
-    vec.emplace_back(i, std::to_string(i + 123));
+    vec.emplace_back(i, ToString(i + 123));
     ASSERT_TRUE(!vec.empty());
     if (i < kSize) {
       ASSERT_TRUE(vec.only_in_stack());
@@ -62,7 +64,7 @@ TEST(AutoVectorTest, EmplaceBack) {
 
     ASSERT_EQ(i + 1, vec.size());
     ASSERT_EQ(i, vec[i].first);
-    ASSERT_EQ(std::to_string(i + 123), vec[i].second);
+    ASSERT_EQ(ToString(i + 123), vec[i].second);
   }
 
   vec.clear();
@@ -70,6 +72,28 @@ TEST(AutoVectorTest, EmplaceBack) {
   ASSERT_TRUE(!vec.only_in_stack());
 }
 
+TEST_F(AutoVectorTest, Resize) {
+  autovector<size_t, kSize> vec;
+
+  vec.resize(kSize);
+  ASSERT_TRUE(vec.only_in_stack());
+  for (size_t i = 0; i < kSize; ++i) {
+    vec[i] = i;
+  }
+
+  vec.resize(kSize * 2);
+  ASSERT_TRUE(!vec.only_in_stack());
+  for (size_t i = 0; i < kSize; ++i) {
+    ASSERT_EQ(vec[i], i);
+  }
+  for (size_t i = 0; i < kSize; ++i) {
+    vec[i + kSize] = i;
+  }
+
+  vec.resize(1);
+  ASSERT_EQ(1U, vec.size());
+}
+
 namespace {
 void AssertEqual(
     const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) {
@@ -82,7 +106,7 @@ void AssertEqual(
 }
 }  // namespace
 
-TEST(AutoVectorTest, CopyAndAssignment) {
+TEST_F(AutoVectorTest, CopyAndAssignment) {
   // Test both heap-allocated and stack-allocated cases.
   for (auto size : { kSize / 2, kSize * 1000 }) {
     autovector<size_t, kSize> vec;
@@ -103,10 +127,10 @@ TEST(AutoVectorTest, CopyAndAssignment) {
   }
 }
 
-TEST(AutoVectorTest, Iterators) {
+TEST_F(AutoVectorTest, Iterators) {
   autovector<std::string, kSize> vec;
   for (size_t i = 0; i < kSize * 1000; ++i) {
-    vec.push_back(std::to_string(i));
+    vec.push_back(ToString(i));
   }
 
   // basic operator test
@@ -168,7 +192,7 @@ vector<string> GetTestKeys(size_t size) {
 
   int index = 0;
   for (auto& key : keys) {
-    key = "item-" + to_string(index++);
+    key = "item-" + rocksdb::ToString(index++);
   }
   return keys;
 }
@@ -223,7 +247,7 @@ size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) {
 // This test case only reports the performance between std::vector<string>
 // and autovector<string>. We chose string for comparison because in most
 // o our use cases we used std::vector<string>.
-TEST(AutoVectorTest, PerfBench) {
+TEST_F(AutoVectorTest, PerfBench) {
   // We run same operations for kOps times in order to get a more fair result.
   size_t kOps = 100000;
 
@@ -290,5 +314,6 @@ TEST(AutoVectorTest, PerfBench) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/benchharness.cc b/src/rocksdb/util/benchharness.cc
deleted file mode 100644
index 8cd3700..0000000
--- a/src/rocksdb/util/benchharness.cc
+++ /dev/null
@@ -1,398 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// This code is derived from Benchmark.cpp implemented in Folly, the opensourced
-// Facebook C++ library available at https://github.com/facebook/folly
-// The code has removed any dependence on other folly and boost libraries
-
-#include "util/benchharness.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <string>
-#include <utility>
-#include <vector>
-
-using std::function;
-using std::get;
-using std::make_pair;
-using std::max;
-using std::min;
-using std::pair;
-using std::sort;
-using std::string;
-using std::tuple;
-using std::vector;
-
-DEFINE_bool(benchmark, false, "Run benchmarks.");
-
-DEFINE_int64(bm_min_usec, 100,
-             "Minimum # of microseconds we'll accept for each benchmark.");
-
-DEFINE_int64(bm_min_iters, 1,
-             "Minimum # of iterations we'll try for each benchmark.");
-
-DEFINE_int32(bm_max_secs, 1,
-             "Maximum # of seconds we'll spend on each benchmark.");
-
-
-namespace rocksdb {
-namespace benchmark {
-
-BenchmarkSuspender::NanosecondsSpent BenchmarkSuspender::nsSpent;
-
-typedef function<uint64_t(unsigned int)> BenchmarkFun;
-static vector<tuple<const char*, const char*, BenchmarkFun>> benchmarks;
-
-// Add the global baseline
-BENCHMARK(globalBenchmarkBaseline) {
-  asm volatile("");
-}
-
-void detail::AddBenchmarkImpl(const char* file, const char* name,
-                              BenchmarkFun fun) {
-  benchmarks.emplace_back(file, name, std::move(fun));
-}
-
-/**
- * Given a point, gives density at that point as a number 0.0 < x <=
- * 1.0. The result is 1.0 if all samples are equal to where, and
- * decreases near 0 if all points are far away from it. The density is
- * computed with the help of a radial basis function.
- */
-static double Density(const double * begin, const double *const end,
-                      const double where, const double bandwidth) {
-  assert(begin < end);
-  assert(bandwidth > 0.0);
-  double sum = 0.0;
-  for (auto i = begin; i < end; i++) {
-    auto d = (*i - where) / bandwidth;
-    sum += exp(- d * d);
-  }
-  return sum / (end - begin);
-}
-
-/**
- * Computes mean and variance for a bunch of data points. Note that
- * mean is currently not being used.
- */
-static pair<double, double>
-MeanVariance(const double * begin, const double *const end) {
-  assert(begin < end);
-  double sum = 0.0, sum2 = 0.0;
-  for (auto i = begin; i < end; i++) {
-    sum += *i;
-    sum2 += *i * *i;
-  }
-  auto const n = end - begin;
-  return make_pair(sum / n, sqrt((sum2 - sum * sum / n) / n));
-}
-
-/**
- * Computes the mode of a sample set through brute force. Assumes
- * input is sorted.
- */
-static double Mode(const double * begin, const double *const end) {
-  assert(begin < end);
-  // Lower bound and upper bound for result and their respective
-  // densities.
-  auto
-    result = 0.0,
-    bestDensity = 0.0;
-
-  // Get the variance so we pass it down to Density()
-  auto const sigma = MeanVariance(begin, end).second;
-  if (!sigma) {
-    // No variance means constant signal
-    return *begin;
-  }
-
-  for (auto i = begin; i < end; i++) {
-    assert(i == begin || *i >= i[-1]);
-    auto candidate = Density(begin, end, *i, sigma * sqrt(2.0));
-    if (candidate > bestDensity) {
-      // Found a new best
-      bestDensity = candidate;
-      result = *i;
-    } else {
-      // Density is decreasing... we could break here if we definitely
-      // knew this is unimodal.
-    }
-  }
-
-  return result;
-}
-
-/**
- * Given a bunch of benchmark samples, estimate the actual run time.
- */
-static double EstimateTime(double * begin, double * end) {
-  assert(begin < end);
-
-  // Current state of the art: get the minimum. After some
-  // experimentation, it seems taking the minimum is the best.
-
-  return *std::min_element(begin, end);
-
-  // What follows after estimates the time as the mode of the
-  // distribution.
-
-  // Select the awesomest (i.e. most frequent) result. We do this by
-  // sorting and then computing the longest run length.
-  sort(begin, end);
-
-  // Eliminate outliers. A time much larger than the minimum time is
-  // considered an outlier.
-  while (end[-1] > 2.0 * *begin) {
-    --end;
-    if (begin == end) {
-//      LOG(INFO) << *begin;
-    }
-    assert(begin < end);
-  }
-
-  double result = 0;
-
-  /* Code used just for comparison purposes */ {
-    unsigned bestFrequency = 0;
-    unsigned candidateFrequency = 1;
-    double candidateValue = *begin;
-    for (auto current = begin + 1; ; ++current) {
-      if (current == end || *current != candidateValue) {
-        // Done with the current run, see if it was best
-        if (candidateFrequency > bestFrequency) {
-          bestFrequency = candidateFrequency;
-          result = candidateValue;
-        }
-        if (current == end) {
-          break;
-        }
-        // Start a new run
-        candidateValue = *current;
-        candidateFrequency = 1;
-      } else {
-        // Cool, inside a run, increase the frequency
-        ++candidateFrequency;
-      }
-    }
-  }
-
-  result = Mode(begin, end);
-
-  return result;
-}
-
-static double RunBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
-                                            const double globalBaseline) {
-  // They key here is accuracy; too low numbers means the accuracy was
-  // coarse. We up the ante until we get to at least minNanoseconds
-  // timings.
-  static const auto minNanoseconds = FLAGS_bm_min_usec * 1000UL;
-
-  // We do measurements in several epochs and take the minimum, to
-  // account for jitter.
-  static const unsigned int epochs = 1000;
-  // We establish a total time budget as we don't want a measurement
-  // to take too long. This will curtail the number of actual epochs.
-  const uint64_t timeBudgetInNs = FLAGS_bm_max_secs * 1000000000;
-  auto env = Env::Default();
-  uint64_t global = env->NowNanos();
-
-  double epochResults[epochs] = { 0 };
-  size_t actualEpochs = 0;
-
-  for (; actualEpochs < epochs; ++actualEpochs) {
-    for (unsigned int n = FLAGS_bm_min_iters; n < (1UL << 30); n *= 2) {
-      auto const nsecs = fun(n);
-      if (nsecs < minNanoseconds) {
-        continue;
-      }
-      // We got an accurate enough timing, done. But only save if
-      // smaller than the current result.
-      epochResults[actualEpochs] = max(0.0,
-          static_cast<double>(nsecs) / n - globalBaseline);
-      // Done with the current epoch, we got a meaningful timing.
-      break;
-    }
-    uint64_t now = env->NowNanos();
-    if ((now - global) >= timeBudgetInNs) {
-      // No more time budget available.
-      ++actualEpochs;
-      break;
-    }
-  }
-
-  // If the benchmark was basically drowned in baseline noise, it's
-  // possible it became negative.
-  return max(0.0, EstimateTime(epochResults, epochResults + actualEpochs));
-}
-
-struct ScaleInfo {
-  double boundary;
-  const char* suffix;
-};
-
-static const ScaleInfo kTimeSuffixes[] {
-  { 365.25 * 24 * 3600, "years" },
-  { 24 * 3600, "days" },
-  { 3600, "hr" },
-  { 60, "min" },
-  { 1, "s" },
-  { 1E-3, "ms" },
-  { 1E-6, "us" },
-  { 1E-9, "ns" },
-  { 1E-12, "ps" },
-  { 1E-15, "fs" },
-  { 0, nullptr },
-};
-
-static const ScaleInfo kMetricSuffixes[] {
-  { 1E24, "Y" },  // yotta
-  { 1E21, "Z" },  // zetta
-  { 1E18, "X" },  // "exa" written with suffix 'X' so as to not create
-                  //   confusion with scientific notation
-  { 1E15, "P" },  // peta
-  { 1E12, "T" },  // terra
-  { 1E9, "G" },   // giga
-  { 1E6, "M" },   // mega
-  { 1E3, "K" },   // kilo
-  { 1, "" },
-  { 1E-3, "m" },  // milli
-  { 1E-6, "u" },  // micro
-  { 1E-9, "n" },  // nano
-  { 1E-12, "p" },  // pico
-  { 1E-15, "f" },  // femto
-  { 1E-18, "a" },  // atto
-  { 1E-21, "z" },  // zepto
-  { 1E-24, "y" },  // yocto
-  { 0, nullptr },
-};
-
-static string HumanReadable(double n, unsigned int decimals,
-                            const ScaleInfo* scales) {
-  if (std::isinf(n) || std::isnan(n)) {
-    return std::to_string(n);
-  }
-
-  const double absValue = fabs(n);
-  const ScaleInfo* scale = scales;
-  while (absValue < scale[0].boundary && scale[1].suffix != nullptr) {
-    ++scale;
-  }
-
-  const double scaledValue = n / scale->boundary;
-  char a[80];
-  snprintf(a, sizeof(a), "%.*f%s", decimals, scaledValue, scale->suffix);
-  return a;
-}
-
-static string ReadableTime(double n, unsigned int decimals) {
-  return HumanReadable(n, decimals, kTimeSuffixes);
-}
-
-static string MetricReadable(double n, unsigned int decimals) {
-  return HumanReadable(n, decimals, kMetricSuffixes);
-}
-
-static void PrintBenchmarkResultsAsTable(
-  const vector<tuple<const char*, const char*, double> >& data) {
-  // Width available
-  static const uint columns = 76;
-
-  // Compute the longest benchmark name
-  size_t longestName = 0;
-  for (size_t i = 1; i < benchmarks.size(); i++) {
-    longestName = max(longestName, strlen(get<1>(benchmarks[i])));
-  }
-
-  // Print a horizontal rule
-  auto separator = [&](char pad) {
-    puts(string(columns, pad).c_str());
-  };
-
-  // Print header for a file
-  auto header = [&](const char* file) {
-    separator('=');
-    printf("%-*srelative  time/iter  iters/s\n",
-           columns - 28, file);
-    separator('=');
-  };
-
-  double baselineNsPerIter = std::numeric_limits<double>::max();
-  const char* lastFile = "";
-
-  for (auto& datum : data) {
-    auto file = get<0>(datum);
-    if (strcmp(file, lastFile)) {
-      // New file starting
-      header(file);
-      lastFile = file;
-    }
-
-    string s = get<1>(datum);
-    if (s == "-") {
-      separator('-');
-      continue;
-    }
-    bool useBaseline /* = void */;
-    if (s[0] == '%') {
-      s.erase(0, 1);
-      useBaseline = true;
-    } else {
-      baselineNsPerIter = get<2>(datum);
-      useBaseline = false;
-    }
-    s.resize(columns - 29, ' ');
-    auto nsPerIter = get<2>(datum);
-    auto secPerIter = nsPerIter / 1E9;
-    auto itersPerSec = 1 / secPerIter;
-    if (!useBaseline) {
-      // Print without baseline
-      printf("%*s           %9s  %7s\n",
-             static_cast<int>(s.size()), s.c_str(),
-             ReadableTime(secPerIter, 2).c_str(),
-             MetricReadable(itersPerSec, 2).c_str());
-    } else {
-      // Print with baseline
-      auto rel = baselineNsPerIter / nsPerIter * 100.0;
-      printf("%*s %7.2f%%  %9s  %7s\n",
-             static_cast<int>(s.size()), s.c_str(),
-             rel,
-             ReadableTime(secPerIter, 2).c_str(),
-             MetricReadable(itersPerSec, 2).c_str());
-    }
-  }
-  separator('=');
-}
-
-void RunBenchmarks() {
-  ASSERT_TRUE(!benchmarks.empty());
-
-  vector<tuple<const char*, const char*, double>> results;
-  results.reserve(benchmarks.size() - 1);
-
-  // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
-
-  auto const globalBaseline = RunBenchmarkGetNSPerIteration(
-    get<2>(benchmarks.front()), 0);
-  for (size_t i = 1; i < benchmarks.size(); i++) {
-    double elapsed = 0.0;
-    if (strcmp(get<1>(benchmarks[i]), "-") != 0) {  // skip separators
-      elapsed = RunBenchmarkGetNSPerIteration(get<2>(benchmarks[i]),
-                                              globalBaseline);
-    }
-    results.emplace_back(get<0>(benchmarks[i]),
-                         get<1>(benchmarks[i]), elapsed);
-  }
-
-  // PLEASE MAKE NOISE. MEASUREMENTS DONE.
-
-  PrintBenchmarkResultsAsTable(results);
-}
-
-}  // namespace benchmark
-}  // namespace rocksdb
diff --git a/src/rocksdb/util/benchharness.h b/src/rocksdb/util/benchharness.h
deleted file mode 100644
index 4fdef52..0000000
--- a/src/rocksdb/util/benchharness.h
+++ /dev/null
@@ -1,357 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// This code is derived from Benchmark.h implemented in Folly, the opensourced
-// Facebook C++ library available at https://github.com/facebook/folly
-// The code has removed any dependence on other folly and boost libraries
-
-#pragma once
-
-#include <gflags/gflags.h>
-
-#include <cassert>
-#include <functional>
-#include <limits>
-
-#include "util/testharness.h"
-#include "rocksdb/env.h"
-
-namespace rocksdb {
-namespace benchmark {
-
-/**
- * Runs all benchmarks defined. Usually put in main().
- */
-void RunBenchmarks();
-
-namespace detail {
-
-/**
- * Adds a benchmark wrapped in a std::function. Only used
- * internally. Pass by value is intentional.
- */
-void AddBenchmarkImpl(const char* file,
-                      const char* name,
-                      std::function<uint64_t(unsigned int)>);
-
-}  // namespace detail
-
-
-/**
- * Supporting type for BENCHMARK_SUSPEND defined below.
- */
-struct BenchmarkSuspender {
-  BenchmarkSuspender() { start_ = Env::Default()->NowNanos(); }
-
-  BenchmarkSuspender(const BenchmarkSuspender&) = delete;
-  BenchmarkSuspender(BenchmarkSuspender && rhs) {
-    start_ = rhs.start_;
-    rhs.start_ = 0;
-  }
-
-  BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete;
-  BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) {
-    if (start_ > 0) {
-      tally();
-    }
-    start_ = rhs.start_;
-    rhs.start_ = 0;
-    return *this;
-  }
-
-  ~BenchmarkSuspender() {
-    if (start_ > 0) {
-      tally();
-    }
-  }
-
-  void Dismiss() {
-    assert(start_ > 0);
-    tally();
-    start_ = 0;
-  }
-
-  void Rehire() { start_ = Env::Default()->NowNanos(); }
-
-  /**
-   * This helps the macro definition. To get around the dangers of
-   * operator bool, returns a pointer to member (which allows no
-   * arithmetic).
-   */
-  /* implicit */
-  operator int BenchmarkSuspender::*() const { return nullptr; }
-
-  /**
-   * Accumulates nanoseconds spent outside benchmark.
-   */
-  typedef uint64_t NanosecondsSpent;
-  static NanosecondsSpent nsSpent;
-
- private:
-  void tally() {
-    uint64_t end = Env::Default()->NowNanos();
-    nsSpent += start_ - end;
-    start_ = end;
-  }
-
-  uint64_t start_;
-};
-
-/**
- * Adds a benchmark. Usually not called directly but instead through
- * the macro BENCHMARK defined below. The lambda function involved
- * must take exactly one parameter of type unsigned, and the benchmark
- * uses it with counter semantics (iteration occurs inside the
- * function).
- */
-template <typename Lambda>
-void
-AddBenchmark_n(const char* file, const char* name, Lambda&& lambda) {
-  auto execute = [=](unsigned int times) -> uint64_t {
-    BenchmarkSuspender::nsSpent = 0;
-    uint64_t start, end;
-    auto env = Env::Default();
-
-    // CORE MEASUREMENT STARTS
-    start = env->NowNanos();
-    lambda(times);
-    end = env->NowNanos();
-    // CORE MEASUREMENT ENDS
-    return (end - start) - BenchmarkSuspender::nsSpent;
-  };
-
-  detail::AddBenchmarkImpl(file, name,
-                           std::function<uint64_t(unsigned int)>(execute));
-}
-
-/**
- * Adds a benchmark. Usually not called directly but instead through
- * the macro BENCHMARK defined below. The lambda function involved
- * must take zero parameters, and the benchmark calls it repeatedly
- * (iteration occurs outside the function).
- */
-template <typename Lambda>
-void
-AddBenchmark(const char* file, const char* name, Lambda&& lambda) {
-  AddBenchmark_n(file, name, [=](unsigned int times) {
-      while (times-- > 0) {
-        lambda();
-      }
-    });
-}
-
-}  // namespace benchmark
-}  // namespace rocksdb
-
-/**
- * FB_ONE_OR_NONE(hello, world) expands to hello and
- * FB_ONE_OR_NONE(hello) expands to nothing. This macro is used to
- * insert or eliminate text based on the presence of another argument.
- */
-#define FB_ONE_OR_NONE(a, ...) FB_THIRD(a, ## __VA_ARGS__, a)
-#define FB_THIRD(a, b, ...) __VA_ARGS__
-
-#define FB_CONCATENATE_IMPL(s1, s2) s1##s2
-#define FB_CONCATENATE(s1, s2) FB_CONCATENATE_IMPL(s1, s2)
-
-#define FB_ANONYMOUS_VARIABLE(str) FB_CONCATENATE(str, __LINE__)
-
-#define FB_STRINGIZE(x) #x
-
-/**
- * Introduces a benchmark function. Used internally, see BENCHMARK and
- * friends below.
- */
-#define BENCHMARK_IMPL_N(funName, stringName, paramType, paramName)     \
-  static void funName(paramType);                                       \
-  static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = (         \
-    ::rocksdb::benchmark::AddBenchmark_n(__FILE__, stringName,          \
-      [](paramType paramName) { funName(paramName); }),                 \
-    true);                                                              \
-  static void funName(paramType paramName)
-
-#define BENCHMARK_IMPL(funName, stringName)                             \
-  static void funName();                                                \
-  static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = (         \
-    ::rocksdb::benchmark::AddBenchmark(__FILE__, stringName,            \
-      []() { funName(); }),                                             \
-    true);                                                              \
-  static void funName()
-
-/**
- * Introduces a benchmark function. Use with either one one or two
- * arguments. The first is the name of the benchmark. Use something
- * descriptive, such as insertVectorBegin. The second argument may be
- * missing, or could be a symbolic counter. The counter dictates how
- * many internal iteration the benchmark does. Example:
- *
- * BENCHMARK(vectorPushBack) {
- *   vector<int> v;
- *   v.push_back(42);
- * }
- *
- * BENCHMARK_N(insertVectorBegin, n) {
- *   vector<int> v;
- *   FOR_EACH_RANGE (i, 0, n) {
- *     v.insert(v.begin(), 42);
- *   }
- * }
- */
-#define BENCHMARK_N(name, ...)                                  \
-  BENCHMARK_IMPL_N(                                             \
-    name,                                                       \
-    FB_STRINGIZE(name),                                         \
-    FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
-    __VA_ARGS__)
-
-#define BENCHMARK(name)                                         \
-  BENCHMARK_IMPL(                                               \
-    name,                                                       \
-    FB_STRINGIZE(name))
-
-/**
- * Defines a benchmark that passes a parameter to another one. This is
- * common for benchmarks that need a "problem size" in addition to
- * "number of iterations". Consider:
- *
- * void pushBack(uint n, size_t initialSize) {
- *   vector<int> v;
- *   BENCHMARK_SUSPEND {
- *     v.resize(initialSize);
- *   }
- *   FOR_EACH_RANGE (i, 0, n) {
- *    v.push_back(i);
- *   }
- * }
- * BENCHMARK_PARAM(pushBack, 0)
- * BENCHMARK_PARAM(pushBack, 1000)
- * BENCHMARK_PARAM(pushBack, 1000000)
- *
- * The benchmark above estimates the speed of push_back at different
- * initial sizes of the vector. The framework will pass 0, 1000, and
- * 1000000 for initialSize, and the iteration count for n.
- */
-#define BENCHMARK_PARAM(name, param)                                    \
-  BENCHMARK_NAMED_PARAM(name, param, param)
-
-/*
- * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
- * parameter, rather than using the parameter value.
- *
- * Useful when the parameter value is not a valid token for string pasting,
- * of when you want to specify multiple parameter arguments.
- *
- * For example:
- *
- * void addValue(uint n, int64_t bucketSize, int64_t min, int64_t max) {
- *   Histogram<int64_t> hist(bucketSize, min, max);
- *   int64_t num = min;
- *   FOR_EACH_RANGE (i, 0, n) {
- *     hist.addValue(num);
- *     ++num;
- *     if (num > max) { num = min; }
- *   }
- * }
- *
- * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
- * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
- * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
- */
-#define BENCHMARK_NAMED_PARAM(name, param_name, ...)                    \
-  BENCHMARK_IMPL(                                                       \
-      FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
-      FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")") {            \
-    name(__VA_ARGS__);                                                  \
-  }
-
-#define BENCHMARK_NAMED_PARAM_N(name, param_name, ...)                  \
-  BENCHMARK_IMPL_N(                                                     \
-      FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
-      FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",              \
-      unsigned,                                                         \
-      iters) {                                                          \
-    name(iters, ## __VA_ARGS__);                                        \
-  }
-
-/**
- * Just like BENCHMARK, but prints the time relative to a
- * baseline. The baseline is the most recent BENCHMARK() seen in
- * lexical order. Example:
- *
- * // This is the baseline
- * BENCHMARK_N(insertVectorBegin, n) {
- *   vector<int> v;
- *   FOR_EACH_RANGE (i, 0, n) {
- *     v.insert(v.begin(), 42);
- *   }
- * }
- *
- * BENCHMARK_RELATIVE_N(insertListBegin, n) {
- *   list<int> s;
- *   FOR_EACH_RANGE (i, 0, n) {
- *     s.insert(s.begin(), 42);
- *   }
- * }
- *
- * Any number of relative benchmark can be associated with a
- * baseline. Another BENCHMARK() occurrence effectively establishes a
- * new baseline.
- */
-#define BENCHMARK_RELATIVE_N(name, ...)                         \
-  BENCHMARK_IMPL_N(                                             \
-    name,                                                       \
-    "%" FB_STRINGIZE(name),                                     \
-    FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
-    __VA_ARGS__)
-
-#define BENCHMARK_RELATIVE(name)                                \
-  BENCHMARK_IMPL(                                               \
-    name,                                                       \
-    "%" FB_STRINGIZE(name))
-
-/**
- * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
- */
-#define BENCHMARK_RELATIVE_PARAM(name, param)                   \
-  BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
-
-/**
- * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
- */
-#define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...)           \
-  BENCHMARK_IMPL_N(                                                     \
-      FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
-      "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",          \
-      unsigned,                                                         \
-      iters) {                                                          \
-    name(iters, ## __VA_ARGS__);                                        \
-  }
-
-/**
- * Draws a line of dashes.
- */
-#define BENCHMARK_DRAW_LINE()                                       \
-  static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = (     \
-    ::rocksdb::benchmark::AddBenchmark(__FILE__, "-", []() { }),    \
-    true);
-
-/**
- * Allows execution of code that doesn't count torward the benchmark's
- * time budget. Example:
- *
- * BENCHMARK_START_GROUP(insertVectorBegin, n) {
- *   vector<int> v;
- *   BENCHMARK_SUSPEND {
- *     v.reserve(n);
- *   }
- *   FOR_EACH_RANGE (i, 0, n) {
- *     v.insert(v.begin(), 42);
- *   }
- * }
- */
-#define BENCHMARK_SUSPEND                               \
-  if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) =   \
-      ::rocksdb::benchmark::BenchmarkSuspender()) {}    \
-  else
diff --git a/src/rocksdb/util/benchharness_test.cc b/src/rocksdb/util/benchharness_test.cc
deleted file mode 100644
index 75ff658..0000000
--- a/src/rocksdb/util/benchharness_test.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-
-#include "util/benchharness.h"
-#include <vector>
-
-namespace rocksdb {
-
-BENCHMARK(insertFrontVector) {
-  std::vector<int> v;
-  for (int i = 0; i < 100; i++) {
-    v.insert(v.begin(), i);
-  }
-}
-
-BENCHMARK_RELATIVE(insertBackVector) {
-  std::vector<int> v;
-  for (size_t i = 0; i < 100; i++) {
-    v.insert(v.end(), i);
-  }
-}
-
-BENCHMARK_N(insertFrontVector_n, n) {
-  std::vector<int> v;
-  for (size_t i = 0; i < n; i++) {
-    v.insert(v.begin(), i);
-  }
-}
-
-BENCHMARK_RELATIVE_N(insertBackVector_n, n) {
-  std::vector<int> v;
-  for (size_t i = 0; i < n; i++) {
-    v.insert(v.end(), i);
-  }
-}
-
-BENCHMARK_N(insertFrontEnd_n, n) {
-  std::vector<int> v;
-  for (size_t i = 0; i < n; i++) {
-    v.insert(v.begin(), i);
-  }
-  for (size_t i = 0; i < n; i++) {
-    v.insert(v.end(), i);
-  }
-}
-
-BENCHMARK_RELATIVE_N(insertFrontEndSuspend_n, n) {
-  std::vector<int> v;
-  for (size_t i = 0; i < n; i++) {
-    v.insert(v.begin(), i);
-  }
-  BENCHMARK_SUSPEND {
-    for (size_t i = 0; i < n; i++) {
-      v.insert(v.end(), i);
-    }
-  }
-}
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  rocksdb::benchmark::RunBenchmarks();
-  return 0;
-}
diff --git a/src/rocksdb/util/blob_store.cc b/src/rocksdb/util/blob_store.cc
deleted file mode 100644
index daaf4bc..0000000
--- a/src/rocksdb/util/blob_store.cc
+++ /dev/null
@@ -1,270 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef ROCKSDB_LITE
-#include "util/blob_store.h"
-
-namespace rocksdb {
-
-using namespace std;
-
-// BlobChunk
-bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const {
-  // overlapping!?
-  assert(!Overlap(chunk));
-  // size == 0 is a marker, not a block
-  return size != 0 &&
-    bucket_id == chunk.bucket_id &&
-    offset + size == chunk.offset;
-}
-
-bool BlobChunk::Overlap(const BlobChunk &chunk) const {
-  return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id &&
-    ((offset >= chunk.offset && offset < chunk.offset + chunk.size) ||
-     (chunk.offset >= offset && chunk.offset < offset + size));
-}
-
-// Blob
-string Blob::ToString() const {
-  string ret;
-  for (auto chunk : chunks) {
-    PutFixed32(&ret, chunk.bucket_id);
-    PutFixed32(&ret, chunk.offset);
-    PutFixed32(&ret, chunk.size);
-  }
-  return ret;
-}
-
-Blob::Blob(const std::string& blob) {
-  for (uint32_t i = 0; i < blob.size(); ) {
-    uint32_t t[3] = {0};
-    for (int j = 0; j < 3 && i + sizeof(uint32_t) - 1 < blob.size();
-                    ++j, i += sizeof(uint32_t)) {
-      t[j] = DecodeFixed32(blob.data() + i);
-    }
-    chunks.push_back(BlobChunk(t[0], t[1], t[2]));
-  }
-}
-
-// FreeList
-Status FreeList::Free(const Blob& blob) {
-  // add it back to the free list
-  for (auto chunk : blob.chunks) {
-    free_blocks_ += chunk.size;
-    if (fifo_free_chunks_.size() &&
-        fifo_free_chunks_.back().ImmediatelyBefore(chunk)) {
-      fifo_free_chunks_.back().size += chunk.size;
-    } else {
-      fifo_free_chunks_.push_back(chunk);
-    }
-  }
-
-  return Status::OK();
-}
-
-Status FreeList::Allocate(uint32_t blocks, Blob* blob) {
-  if (free_blocks_ < blocks) {
-    return Status::Incomplete("");
-  }
-
-  blob->chunks.clear();
-  free_blocks_ -= blocks;
-
-  while (blocks > 0) {
-    assert(fifo_free_chunks_.size() > 0);
-    auto& front = fifo_free_chunks_.front();
-    if (front.size > blocks) {
-      blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks));
-      front.offset += blocks;
-      front.size -= blocks;
-      blocks = 0;
-    } else {
-      blob->chunks.push_back(front);
-      blocks -= front.size;
-      fifo_free_chunks_.pop_front();
-    }
-  }
-  assert(blocks == 0);
-
-  return Status::OK();
-}
-
-bool FreeList::Overlap(const Blob &blob) const {
-  for (auto chunk : blob.chunks) {
-    for (auto itr = fifo_free_chunks_.begin();
-         itr != fifo_free_chunks_.end();
-         ++itr) {
-      if (itr->Overlap(chunk)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-// BlobStore
-BlobStore::BlobStore(const string& directory,
-                     uint64_t block_size,
-                     uint32_t blocks_per_bucket,
-                     uint32_t max_buckets,
-                     Env* env) :
-    directory_(directory),
-    block_size_(block_size),
-    blocks_per_bucket_(blocks_per_bucket),
-    env_(env),
-    max_buckets_(max_buckets) {
-  env_->CreateDirIfMissing(directory_);
-
-  storage_options_.use_mmap_writes = false;
-  storage_options_.use_mmap_reads = false;
-
-  buckets_size_ = 0;
-  buckets_ = new unique_ptr<RandomRWFile>[max_buckets_];
-
-  CreateNewBucket();
-}
-
-BlobStore::~BlobStore() {
-  // TODO we don't care about recovery for now
-  delete [] buckets_;
-}
-
-Status BlobStore::Put(const Slice& value, Blob* blob) {
-  // convert size to number of blocks
-  Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob);
-  if (!s.ok()) {
-    return s;
-  }
-  auto size_left = (uint64_t) value.size();
-
-  uint64_t offset = 0; // in bytes, not blocks
-  for (auto chunk : blob->chunks) {
-    uint64_t write_size = min(chunk.size * block_size_, size_left);
-    assert(chunk.bucket_id < buckets_size_);
-    s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_,
-                                               Slice(value.data() + offset,
-                                                     write_size));
-    if (!s.ok()) {
-      Delete(*blob);
-      return s;
-    }
-    offset += write_size;
-    size_left -= write_size;
-    if (write_size < chunk.size * block_size_) {
-      // if we have any space left in the block, fill it up with zeros
-      string zero_string(chunk.size * block_size_ - write_size, 0);
-      s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ +
-                                                    write_size,
-                                                 Slice(zero_string));
-    }
-  }
-
-  if (size_left > 0) {
-    Delete(*blob);
-    return Status::Corruption("Tried to write more data than fits in the blob");
-  }
-
-  return Status::OK();
-}
-
-Status BlobStore::Get(const Blob& blob,
-                      string* value) const {
-  {
-    // assert that it doesn't overlap with free list
-    // it will get compiled out for release
-    MutexLock l(&free_list_mutex_);
-    assert(!free_list_.Overlap(blob));
-  }
-
-  value->resize(blob.Size() * block_size_);
-
-  uint64_t offset = 0; // in bytes, not blocks
-  for (auto chunk : blob.chunks) {
-    Slice result;
-    assert(chunk.bucket_id < buckets_size_);
-    Status s;
-    s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_,
-                                              chunk.size * block_size_,
-                                              &result,
-                                              &value->at(offset));
-    if (!s.ok()) {
-      value->clear();
-      return s;
-    }
-    if (result.size() < chunk.size * block_size_) {
-      value->clear();
-      return Status::Corruption("Could not read in from file");
-    }
-    offset += chunk.size * block_size_;
-  }
-
-  // remove the '\0's at the end of the string
-  value->erase(find(value->begin(), value->end(), '\0'), value->end());
-
-  return Status::OK();
-}
-
-Status BlobStore::Delete(const Blob& blob) {
-  MutexLock l(&free_list_mutex_);
-  return free_list_.Free(blob);
-}
-
-Status BlobStore::Sync() {
-  for (size_t i = 0; i < buckets_size_; ++i) {
-    Status s = buckets_[i].get()->Sync();
-    if (!s.ok()) {
-      return s;
-    }
-  }
-  return Status::OK();
-}
-
-Status BlobStore::Allocate(uint32_t blocks, Blob* blob) {
-  MutexLock l(&free_list_mutex_);
-  Status s;
-
-  s = free_list_.Allocate(blocks, blob);
-  if (!s.ok()) {
-    s = CreateNewBucket();
-    if (!s.ok()) {
-      return s;
-    }
-    s = free_list_.Allocate(blocks, blob);
-  }
-
-  return s;
-}
-
-// called with free_list_mutex_ held
-Status BlobStore::CreateNewBucket() {
-  MutexLock l(&buckets_mutex_);
-
-  if (buckets_size_ >= max_buckets_) {
-    return Status::NotSupported("Max size exceeded\n");
-  }
-
-  int new_bucket_id = buckets_size_;
-
-  char fname[200];
-  sprintf(fname, "%s/%d.bs", directory_.c_str(), new_bucket_id);
-
-  Status s = env_->NewRandomRWFile(string(fname),
-                                   &buckets_[new_bucket_id],
-                                   storage_options_);
-  if (!s.ok()) {
-    return s;
-  }
-
-  // whether Allocate succeeds or not, does not affect the overall correctness
-  // of this function - calling Allocate is really optional
-  // (also, tmpfs does not support allocate)
-  buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_);
-
-  buckets_size_ = new_bucket_id + 1;
-
-  return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_));
-}
-
-} // namespace rocksdb
-#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/blob_store.h b/src/rocksdb/util/blob_store.h
deleted file mode 100644
index ce86337..0000000
--- a/src/rocksdb/util/blob_store.h
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef ROCKSDB_LITE
-#pragma once
-#include "rocksdb/env.h"
-#include "rocksdb/status.h"
-#include "port/port.h"
-#include "util/mutexlock.h"
-#include "util/coding.h"
-
-#include <list>
-#include <deque>
-#include <cstdint>
-#include <iostream>
-#include <stdexcept>
-#include <algorithm>
-#include <cstdio>
-
-namespace rocksdb {
-
-struct BlobChunk {
-  uint32_t bucket_id;
-  uint32_t offset; // in blocks
-  uint32_t size; // in blocks
-  BlobChunk() {}
-  BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) :
-    bucket_id(bucket_id), offset(offset), size(size) {}
-
-  // returns true if it's immediately before chunk
-  bool ImmediatelyBefore(const BlobChunk& chunk) const;
-  // returns true if chunks overlap
-  bool Overlap(const BlobChunk &chunk) const;
-};
-
-// We represent each Blob as a string in format:
-// bucket_id offset size|bucket_id offset size...
-// The string can be used to reference the Blob stored on external
-// device/file
-// Not thread-safe!
-struct Blob {
-  // Generates the string
-  std::string ToString() const;
-  // Parses the previously generated string
-  explicit Blob(const std::string& blob);
-  // Creates unfragmented Blob
-  Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) {
-    SetOneChunk(bucket_id, offset, size);
-  }
-  Blob() {}
-
-  void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) {
-    chunks.clear();
-    chunks.push_back(BlobChunk(bucket_id, offset, size));
-  }
-
-  uint32_t Size() const { // in blocks
-    uint32_t ret = 0;
-    for (auto chunk : chunks) {
-      ret += chunk.size;
-    }
-    assert(ret > 0);
-    return ret;
-  }
-
-  // bucket_id, offset, size
-  std::vector<BlobChunk> chunks;
-};
-
-// Keeps a list of free chunks
-// NOT thread-safe. Externally synchronized
-class FreeList {
- public:
-  FreeList() :
-    free_blocks_(0) {}
-  ~FreeList() {}
-
-  // Allocates a a blob. Stores the allocated blob in
-  // 'blob'. Returns non-OK status if it failed to allocate.
-  // Thread-safe
-  Status Allocate(uint32_t blocks, Blob* blob);
-  // Frees the blob for reuse. Thread-safe
-  Status Free(const Blob& blob);
-
-  // returns true if blob is overlapping with any of the
-  // chunks stored in free list
-  bool Overlap(const Blob &blob) const;
-
- private:
-  std::deque<BlobChunk> fifo_free_chunks_;
-  uint32_t free_blocks_;
-  mutable port::Mutex mutex_;
-};
-
-// thread-safe
-class BlobStore {
- public:
-   // directory - wherever the blobs should be stored. It will be created
-   //   if missing
-   // block_size - self explanatory
-   // blocks_per_bucket - how many blocks we want to keep in one bucket.
-   //   Bucket is a device or a file that we use to store the blobs.
-   //   If we don't have enough blocks to allocate a new blob, we will
-   //   try to create a new file or device.
-   // max_buckets - maximum number of buckets BlobStore will create
-   //   BlobStore max size in bytes is
-   //     max_buckets * blocks_per_bucket * block_size
-   // env - env for creating new files
-  BlobStore(const std::string& directory,
-            uint64_t block_size,
-            uint32_t blocks_per_bucket,
-            uint32_t max_buckets,
-            Env* env);
-  ~BlobStore();
-
-  // Allocates space for value.size bytes (rounded up to be multiple of
-  // block size) and writes value.size bytes from value.data to a backing store.
-  // Sets Blob blob that can than be used for addressing the
-  // stored value. Returns non-OK status on error.
-  Status Put(const Slice& value, Blob* blob);
-  // Value needs to have enough space to store all the loaded stuff.
-  // This function is thread safe!
-  Status Get(const Blob& blob, std::string* value) const;
-  // Frees the blob for reuse, but does not delete the data
-  // on the backing store.
-  Status Delete(const Blob& blob);
-  // Sync all opened files that are modified
-  Status Sync();
-
- private:
-  const std::string directory_;
-  // block_size_ is uint64_t because when we multiply with
-  // blocks_size_ we want the result to be uint64_t or
-  // we risk overflowing
-  const uint64_t block_size_;
-  const uint32_t blocks_per_bucket_;
-  Env* env_;
-  EnvOptions storage_options_;
-  // protected by free_list_mutex_
-  FreeList free_list_;
-  // free_list_mutex_ is locked BEFORE buckets_mutex_
-  mutable port::Mutex free_list_mutex_;
-  // protected by buckets_mutex_
-  // array of buckets
-  unique_ptr<RandomRWFile>* buckets_;
-  // number of buckets in the array
-  uint32_t buckets_size_;
-  uint32_t max_buckets_;
-  mutable port::Mutex buckets_mutex_;
-
-  // Calls FreeList allocate. If free list can't allocate
-  // new blob, creates new bucket and tries again
-  // Thread-safe
-  Status Allocate(uint32_t blocks, Blob* blob);
-
-  // Creates a new backing store and adds all the blocks
-  // from the new backing store to the free list
-  Status CreateNewBucket();
-};
-
-} // namespace rocksdb
-#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/blob_store_test.cc b/src/rocksdb/util/blob_store_test.cc
deleted file mode 100644
index f199f5d..0000000
--- a/src/rocksdb/util/blob_store_test.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "util/blob_store.h"
-
-#include "util/testharness.h"
-#include "util/testutil.h"
-#include "util/random.h"
-
-#include <cstdlib>
-#include <string>
-
-namespace rocksdb {
-
-using namespace std;
-
-class BlobStoreTest { };
-
-TEST(BlobStoreTest, RangeParseTest) {
-  Blob e;
-  for (int i = 0; i < 5; ++i) {
-    e.chunks.push_back(BlobChunk(rand(), rand(), rand()));
-  }
-  string x = e.ToString();
-  Blob nx(x);
-
-  ASSERT_EQ(nx.ToString(), x);
-}
-
-// make sure we're reusing the freed space
-TEST(BlobStoreTest, SanityTest) {
-  const uint64_t block_size = 10;
-  const uint32_t blocks_per_file = 20;
-  Random random(5);
-
-  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
-                       block_size,
-                       blocks_per_file,
-                       1000,
-                       Env::Default());
-
-  string buf;
-
-  // put string of size 170
-  test::RandomString(&random, 170, &buf);
-  Blob r1;
-  ASSERT_OK(blob_store.Put(Slice(buf), &r1));
-  // use the first file
-  for (size_t i = 0; i < r1.chunks.size(); ++i) {
-    ASSERT_EQ(r1.chunks[0].bucket_id, 0u);
-  }
-
-  // put string of size 30
-  test::RandomString(&random, 30, &buf);
-  Blob r2;
-  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
-  // use the first file
-  for (size_t i = 0; i < r2.chunks.size(); ++i) {
-    ASSERT_EQ(r2.chunks[0].bucket_id, 0u);
-  }
-
-  // delete blob of size 170
-  ASSERT_OK(blob_store.Delete(r1));
-
-  // put a string of size 100
-  test::RandomString(&random, 100, &buf);
-  Blob r3;
-  ASSERT_OK(blob_store.Put(Slice(buf), &r3));
-  // use the first file
-  for (size_t i = 0; i < r3.chunks.size(); ++i) {
-    ASSERT_EQ(r3.chunks[0].bucket_id, 0u);
-  }
-
-  // put a string of size 70
-  test::RandomString(&random, 70, &buf);
-  Blob r4;
-  ASSERT_OK(blob_store.Put(Slice(buf), &r4));
-  // use the first file
-  for (size_t i = 0; i < r4.chunks.size(); ++i) {
-    ASSERT_EQ(r4.chunks[0].bucket_id, 0u);
-  }
-
-  // put a string of size 5
-  test::RandomString(&random, 5, &buf);
-  Blob r5;
-  ASSERT_OK(blob_store.Put(Slice(buf), &r5));
-  // now you get to use the second file
-  for (size_t i = 0; i < r5.chunks.size(); ++i) {
-    ASSERT_EQ(r5.chunks[0].bucket_id, 1u);
-  }
-}
-
-TEST(BlobStoreTest, FragmentedChunksTest) {
-  const uint64_t block_size = 10;
-  const uint32_t blocks_per_file = 20;
-  Random random(5);
-
-  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
-                       block_size,
-                       blocks_per_file,
-                       1000,
-                       Env::Default());
-
-  string buf;
-
-  vector <Blob> r(4);
-
-  // put 4 strings of size 50
-  for (int k = 0; k < 4; ++k)  {
-    test::RandomString(&random, 50, &buf);
-    ASSERT_OK(blob_store.Put(Slice(buf), &r[k]));
-    // use the first file
-    for (size_t i = 0; i < r[k].chunks.size(); ++i) {
-      ASSERT_EQ(r[k].chunks[0].bucket_id, 0u);
-    }
-  }
-
-  // delete the first and third
-  ASSERT_OK(blob_store.Delete(r[0]));
-  ASSERT_OK(blob_store.Delete(r[2]));
-
-  // put string of size 100. it should reuse space that we deleting
-  // by deleting first and third strings of size 50
-  test::RandomString(&random, 100, &buf);
-  Blob r2;
-  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
-  // use the first file
-  for (size_t i = 0; i < r2.chunks.size(); ++i) {
-    ASSERT_EQ(r2.chunks[0].bucket_id, 0u);
-  }
-}
-
-TEST(BlobStoreTest, CreateAndStoreTest) {
-  const uint64_t block_size = 10;
-  const uint32_t blocks_per_file = 1000;
-  const int max_blurb_size = 300;
-  Random random(5);
-
-  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
-                       block_size,
-                       blocks_per_file,
-                       10000,
-                       Env::Default());
-  vector<pair<Blob, string>> ranges;
-
-  for (int i = 0; i < 2000; ++i) {
-    int decision = rand() % 5;
-    if (decision <= 2 || ranges.size() == 0) {
-      string buf;
-      int size_blocks = (rand() % max_blurb_size + 1);
-      int string_size = size_blocks * block_size - (rand() % block_size);
-      test::RandomString(&random, string_size, &buf);
-      Blob r;
-      ASSERT_OK(blob_store.Put(Slice(buf), &r));
-      ranges.push_back(make_pair(r, buf));
-    } else if (decision == 3) {
-      int ti = rand() % ranges.size();
-      string out_buf;
-      ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf));
-      ASSERT_EQ(ranges[ti].second, out_buf);
-    } else {
-      int ti = rand() % ranges.size();
-      ASSERT_OK(blob_store.Delete(ranges[ti].first));
-      ranges.erase(ranges.begin() + ti);
-    }
-  }
-  ASSERT_OK(blob_store.Sync());
-}
-
-TEST(BlobStoreTest, MaxSizeTest) {
-  const uint64_t block_size = 10;
-  const uint32_t blocks_per_file = 100;
-  const int max_buckets = 10;
-  Random random(5);
-
-  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
-                       block_size,
-                       blocks_per_file,
-                       max_buckets,
-                       Env::Default());
-  string buf;
-  for (int i = 0; i < max_buckets; ++i) {
-    test::RandomString(&random, 1000, &buf);
-    Blob r;
-    ASSERT_OK(blob_store.Put(Slice(buf), &r));
-  }
-
-  test::RandomString(&random, 1000, &buf);
-  Blob r;
-  // should fail because max size
-  Status s = blob_store.Put(Slice(buf), &r);
-  ASSERT_EQ(s.ok(), false);
-}
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
-}
diff --git a/src/rocksdb/util/bloom.cc b/src/rocksdb/util/bloom.cc
index 78ae04a..d3f3abd 100644
--- a/src/rocksdb/util/bloom.cc
+++ b/src/rocksdb/util/bloom.cc
@@ -1,4 +1,4 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
@@ -10,45 +10,268 @@
 #include "rocksdb/filter_policy.h"
 
 #include "rocksdb/slice.h"
+#include "table/block_based_filter_block.h"
+#include "table/full_filter_block.h"
 #include "util/hash.h"
+#include "util/coding.h"
 
 namespace rocksdb {
 
+class BlockBasedFilterBlockBuilder;
+class FullFilterBlockBuilder;
+
 namespace {
-static uint32_t BloomHash(const Slice& key) {
-  return Hash(key.data(), key.size(), 0xbc9f1d34);
-}
+class FullFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+  explicit FullFilterBitsBuilder(const size_t bits_per_key,
+                                 const size_t num_probes)
+      : bits_per_key_(bits_per_key),
+        num_probes_(num_probes) {
+    assert(bits_per_key_);
+  }
+
+  ~FullFilterBitsBuilder() {}
+
+  virtual void AddKey(const Slice& key) override {
+    uint32_t hash = BloomHash(key);
+    if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
+      hash_entries_.push_back(hash);
+    }
+  }
+
+  // Create a filter that for hashes [0, n-1], the filter is allocated here
+  // When creating filter, it is ensured that
+  // total_bits = num_lines * CACHE_LINE_SIZE * 8
+  // dst len is >= 5, 1 for num_probes, 4 for num_lines
+  // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated
+  // +----------------------------------------------------------------+
+  // |              filter data with length total_bits/8              |
+  // +----------------------------------------------------------------+
+  // |                                                                |
+  // | ...                                                            |
+  // |                                                                |
+  // +----------------------------------------------------------------+
+  // | ...                | num_probes : 1 byte | num_lines : 4 bytes |
+  // +----------------------------------------------------------------+
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
+    uint32_t total_bits, num_lines;
+    char* data = ReserveSpace(static_cast<int>(hash_entries_.size()),
+                              &total_bits, &num_lines);
+    assert(data);
+
+    if (total_bits != 0 && num_lines != 0) {
+      for (auto h : hash_entries_) {
+        AddHash(h, data, num_lines, total_bits);
+      }
+    }
+    data[total_bits/8] = static_cast<char>(num_probes_);
+    EncodeFixed32(data + total_bits/8 + 1, static_cast<uint32_t>(num_lines));
+
+    const char* const_data = data;
+    buf->reset(const_data);
+    hash_entries_.clear();
+
+    return Slice(data, total_bits / 8 + 5);
+  }
 
-class BloomFilterPolicy : public FilterPolicy {
  private:
   size_t bits_per_key_;
-  size_t k_;
-  uint32_t (*hash_func_)(const Slice& key);
+  size_t num_probes_;
+  std::vector<uint32_t> hash_entries_;
 
-  void initialize() {
-    // We intentionally round down to reduce probing cost a little bit
-    k_ = static_cast<size_t>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
-    if (k_ < 1) k_ = 1;
-    if (k_ > 30) k_ = 30;
+  // Get totalbits that optimized for cpu cache line
+  uint32_t GetTotalBitsForLocality(uint32_t total_bits);
+
+  // Reserve space for new filter
+  char* ReserveSpace(const int num_entry, uint32_t* total_bits,
+      uint32_t* num_lines);
+
+  // Assuming single threaded access to this function.
+  void AddHash(uint32_t h, char* data, uint32_t num_lines,
+      uint32_t total_bits);
+
+  // No Copy allowed
+  FullFilterBitsBuilder(const FullFilterBitsBuilder&);
+  void operator=(const FullFilterBitsBuilder&);
+};
+
+uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
+  uint32_t num_lines =
+      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+  // Make num_lines an odd number to make sure more bits are involved
+  // when determining which block.
+  if (num_lines % 2 == 0) {
+    num_lines++;
   }
+  return num_lines * (CACHE_LINE_SIZE * 8);
+}
+
+char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
+    uint32_t* total_bits, uint32_t* num_lines) {
+  assert(bits_per_key_);
+  char* data = nullptr;
+  if (num_entry != 0) {
+    uint32_t total_bits_tmp = num_entry * static_cast<uint32_t>(bits_per_key_);
+
+    *total_bits = GetTotalBitsForLocality(total_bits_tmp);
+    *num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
+    assert(*total_bits > 0 && *total_bits % 8 == 0);
+  } else {
+    // filter is empty, just leave space for metadata
+    *total_bits = 0;
+    *num_lines = 0;
+  }
+
+  // Reserve space for Filter
+  uint32_t sz = *total_bits / 8;
+  sz += 5;  // 4 bytes for num_lines, 1 byte for num_probes
+
+  data = new char[sz];
+  memset(data, 0, sz);
+  return data;
+}
+
+inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
+    uint32_t num_lines, uint32_t total_bits) {
+  assert(num_lines > 0 && total_bits > 0);
 
+  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+  uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8);
+
+  for (uint32_t i = 0; i < num_probes_; ++i) {
+    // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+    // to a simple operation by compiler.
+    const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
+    data[bitpos / 8] |= (1 << (bitpos % 8));
+
+    h += delta;
+  }
+}
+
+class FullFilterBitsReader : public FilterBitsReader {
  public:
-  explicit BloomFilterPolicy(int bits_per_key,
-                             uint32_t (*hash_func)(const Slice& key))
-      : bits_per_key_(bits_per_key), hash_func_(hash_func) {
-    initialize();
+  explicit FullFilterBitsReader(const Slice& contents)
+      : data_(const_cast<char*>(contents.data())),
+        data_len_(static_cast<uint32_t>(contents.size())),
+        num_probes_(0),
+        num_lines_(0) {
+    assert(data_);
+    GetFilterMeta(contents, &num_probes_, &num_lines_);
+    // Sanitize broken parameter
+    if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) {
+      num_lines_ = 0;
+      num_probes_ = 0;
+    }
+  }
+
+  ~FullFilterBitsReader() {}
+
+  virtual bool MayMatch(const Slice& entry) override {
+    if (data_len_ <= 5) {   // remain same with original filter
+      return false;
+    }
+    // Other Error params, including a broken filter, regarded as match
+    if (num_probes_ == 0 || num_lines_ == 0) return true;
+    uint32_t hash = BloomHash(entry);
+    return HashMayMatch(hash, Slice(data_, data_len_),
+                        num_probes_, num_lines_);
+  }
+
+ private:
+  // Filter meta data
+  char* data_;
+  uint32_t data_len_;
+  size_t num_probes_;
+  uint32_t num_lines_;
+
+  // Get num_probes, and num_lines from filter
+  // If filter format broken, set both to 0.
+  void GetFilterMeta(const Slice& filter, size_t* num_probes,
+                             uint32_t* num_lines);
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilterFromHash() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not on the
+  // list, but it should aim to return false with a high probability.
+  //
+  // hash: target to be checked
+  // filter: the whole filter, including meta data bytes
+  // num_probes: number of probes, read before hand
+  // num_lines: filter metadata, read before hand
+  // Before calling this function, need to ensure the input meta data
+  // is valid.
+  bool HashMayMatch(const uint32_t& hash, const Slice& filter,
+      const size_t& num_probes, const uint32_t& num_lines);
+
+  // No Copy allowed
+  FullFilterBitsReader(const FullFilterBitsReader&);
+  void operator=(const FullFilterBitsReader&);
+};
+
+void FullFilterBitsReader::GetFilterMeta(const Slice& filter,
+    size_t* num_probes, uint32_t* num_lines) {
+  uint32_t len = static_cast<uint32_t>(filter.size());
+  if (len <= 5) {
+    // filter is empty or broken
+    *num_probes = 0;
+    *num_lines = 0;
+    return;
+  }
+
+  *num_probes = filter.data()[len - 5];
+  *num_lines = DecodeFixed32(filter.data() + len - 4);
+}
+
+bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash,
+    const Slice& filter, const size_t& num_probes,
+    const uint32_t& num_lines) {
+  uint32_t len = static_cast<uint32_t>(filter.size());
+  if (len <= 5) return false;  // remain the same with original filter
+
+  // It is ensured the params are valid before calling it
+  assert(num_probes != 0);
+  assert(num_lines != 0 && (len - 5) % num_lines == 0);
+  uint32_t cache_line_size = (len - 5) / num_lines;
+  const char* data = filter.data();
+
+  uint32_t h = hash;
+  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+  uint32_t b = (h % num_lines) * (cache_line_size * 8);
+
+  for (uint32_t i = 0; i < num_probes; ++i) {
+    // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+    //  to a simple and operation by compiler.
+    const uint32_t bitpos = b + (h % (cache_line_size * 8));
+    if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
+      return false;
+    }
+
+    h += delta;
   }
-  explicit BloomFilterPolicy(int bits_per_key)
-      : bits_per_key_(bits_per_key) {
-    hash_func_ = BloomHash;
+
+  return true;
+}
+
+// An implementation of filter policy
+class BloomFilterPolicy : public FilterPolicy {
+ public:
+  explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder)
+      : bits_per_key_(bits_per_key), hash_func_(BloomHash),
+        use_block_based_builder_(use_block_based_builder) {
     initialize();
   }
 
-  virtual const char* Name() const {
+  ~BloomFilterPolicy() {
+  }
+
+  virtual const char* Name() const override {
     return "rocksdb.BuiltinBloomFilter";
   }
 
-  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+  virtual void CreateFilter(const Slice* keys, int n,
+                            std::string* dst) const override {
     // Compute bloom filter size (in both bits and bytes)
     size_t bits = n * bits_per_key_;
 
@@ -61,14 +284,14 @@ class BloomFilterPolicy : public FilterPolicy {
 
     const size_t init_size = dst->size();
     dst->resize(init_size + bytes, 0);
-    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
+    dst->push_back(static_cast<char>(num_probes_));  // Remember # of probes
     char* array = &(*dst)[init_size];
     for (size_t i = 0; i < (size_t)n; i++) {
       // Use double-hashing to generate a sequence of hash values.
       // See analysis in [Kirsch,Mitzenmacher 2006].
       uint32_t h = hash_func_(keys[i]);
       const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
-      for (size_t j = 0; j < k_; j++) {
+      for (size_t j = 0; j < num_probes_; j++) {
         const uint32_t bitpos = h % bits;
         array[bitpos/8] |= (1 << (bitpos % 8));
         h += delta;
@@ -76,7 +299,8 @@ class BloomFilterPolicy : public FilterPolicy {
     }
   }
 
-  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
+  virtual bool KeyMayMatch(const Slice& key,
+                           const Slice& bloom_filter) const override {
     const size_t len = bloom_filter.size();
     if (len < 2) return false;
 
@@ -101,11 +325,43 @@ class BloomFilterPolicy : public FilterPolicy {
     }
     return true;
   }
+
+  virtual FilterBitsBuilder* GetFilterBitsBuilder() const override {
+    if (use_block_based_builder_) {
+      return nullptr;
+    }
+
+    return new FullFilterBitsBuilder(bits_per_key_, num_probes_);
+  }
+
+  virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents)
+      const override {
+    return new FullFilterBitsReader(contents);
+  }
+
+  // If choose to use block based builder
+  bool UseBlockBasedBuilder() { return use_block_based_builder_; }
+
+ private:
+  size_t bits_per_key_;
+  size_t num_probes_;
+  uint32_t (*hash_func_)(const Slice& key);
+
+  const bool use_block_based_builder_;
+
+  void initialize() {
+    // We intentionally round down to reduce probing cost a little bit
+    num_probes_ = static_cast<size_t>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
+    if (num_probes_ < 1) num_probes_ = 1;
+    if (num_probes_ > 30) num_probes_ = 30;
+  }
 };
-}
 
-const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
-  return new BloomFilterPolicy(bits_per_key);
+}  // namespace
+
+const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
+                                         bool use_block_based_builder) {
+  return new BloomFilterPolicy(bits_per_key, use_block_based_builder);
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/bloom_test.cc b/src/rocksdb/util/bloom_test.cc
index 2c430e2..237bf7d 100644
--- a/src/rocksdb/util/bloom_test.cc
+++ b/src/rocksdb/util/bloom_test.cc
@@ -7,13 +7,24 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
 #include <gflags/gflags.h>
+#include <vector>
 
 #include "rocksdb/filter_policy.h"
-
 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
+#include "util/arena.h"
+
+using GFLAGS::ParseCommandLineFlags;
 
 DEFINE_int32(bits_per_key, 10, "");
 
@@ -26,14 +37,28 @@ static Slice Key(int i, char* buffer) {
   return Slice(buffer, sizeof(i));
 }
 
-class BloomTest {
+static int NextLength(int length) {
+  if (length < 10) {
+    length += 1;
+  } else if (length < 100) {
+    length += 10;
+  } else if (length < 1000) {
+    length += 100;
+  } else {
+    length += 1000;
+  }
+  return length;
+}
+
+class BloomTest : public testing::Test {
  private:
   const FilterPolicy* policy_;
   std::string filter_;
   std::vector<std::string> keys_;
 
  public:
-  BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { }
+  BloomTest() : policy_(
+      NewBloomFilterPolicy(FLAGS_bits_per_key)) {}
 
   ~BloomTest() {
     delete policy_;
@@ -54,7 +79,8 @@ class BloomTest {
       key_slices.push_back(Slice(keys_[i]));
     }
     filter_.clear();
-    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
+    policy_->CreateFilter(&key_slices[0], static_cast<int>(key_slices.size()),
+                          &filter_);
     keys_.clear();
     if (kVerbose >= 2) DumpFilter();
   }
@@ -93,12 +119,12 @@ class BloomTest {
   }
 };
 
-TEST(BloomTest, EmptyFilter) {
+TEST_F(BloomTest, EmptyFilter) {
   ASSERT_TRUE(! Matches("hello"));
   ASSERT_TRUE(! Matches("world"));
 }
 
-TEST(BloomTest, Small) {
+TEST_F(BloomTest, Small) {
   Add("hello");
   Add("world");
   ASSERT_TRUE(Matches("hello"));
@@ -107,20 +133,7 @@ TEST(BloomTest, Small) {
   ASSERT_TRUE(! Matches("foo"));
 }
 
-static int NextLength(int length) {
-  if (length < 10) {
-    length += 1;
-  } else if (length < 100) {
-    length += 10;
-  } else if (length < 1000) {
-    length += 100;
-  } else {
-    length += 1000;
-  }
-  return length;
-}
-
-TEST(BloomTest, VaryingLengths) {
+TEST_F(BloomTest, VaryingLengths) {
   char buffer[sizeof(int)];
 
   // Count number of filters that significantly exceed the false positive rate
@@ -161,10 +174,128 @@ TEST(BloomTest, VaryingLengths) {
 
 // Different bits-per-byte
 
+class FullBloomTest : public testing::Test {
+ private:
+  const FilterPolicy* policy_;
+  std::unique_ptr<FilterBitsBuilder> bits_builder_;
+  std::unique_ptr<FilterBitsReader> bits_reader_;
+  std::unique_ptr<const char[]> buf_;
+  size_t filter_size_;
+
+ public:
+  FullBloomTest() :
+      policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)),
+      filter_size_(0) {
+    Reset();
+  }
+
+  ~FullBloomTest() {
+    delete policy_;
+  }
+
+  void Reset() {
+    bits_builder_.reset(policy_->GetFilterBitsBuilder());
+    bits_reader_.reset(nullptr);
+    buf_.reset(nullptr);
+    filter_size_ = 0;
+  }
+
+  void Add(const Slice& s) {
+    bits_builder_->AddKey(s);
+  }
+
+  void Build() {
+    Slice filter = bits_builder_->Finish(&buf_);
+    bits_reader_.reset(policy_->GetFilterBitsReader(filter));
+    filter_size_ = filter.size();
+  }
+
+  size_t FilterSize() const {
+    return filter_size_;
+  }
+
+  bool Matches(const Slice& s) {
+    if (bits_reader_ == nullptr) {
+      Build();
+    }
+    return bits_reader_->MayMatch(s);
+  }
+
+  double FalsePositiveRate() {
+    char buffer[sizeof(int)];
+    int result = 0;
+    for (int i = 0; i < 10000; i++) {
+      if (Matches(Key(i + 1000000000, buffer))) {
+        result++;
+      }
+    }
+    return result / 10000.0;
+  }
+};
+
+TEST_F(FullBloomTest, FullEmptyFilter) {
+  // Empty filter is not match, at this level
+  ASSERT_TRUE(!Matches("hello"));
+  ASSERT_TRUE(!Matches("world"));
+}
+
+TEST_F(FullBloomTest, FullSmall) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_TRUE(!Matches("x"));
+  ASSERT_TRUE(!Matches("foo"));
+}
+
+TEST_F(FullBloomTest, FullVaryingLengths) {
+  char buffer[sizeof(int)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+
+  for (int length = 1; length <= 10000; length = NextLength(length)) {
+    Reset();
+    for (int i = 0; i < length; i++) {
+      Add(Key(i, buffer));
+    }
+    Build();
+
+    ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length;
+
+    // All added keys must match
+    for (int i = 0; i < length; i++) {
+      ASSERT_TRUE(Matches(Key(i, buffer)))
+          << "Length " << length << "; key " << i;
+    }
+
+    // Check false positive rate
+    double rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              rate*100.0, length, static_cast<int>(FilterSize()));
+    }
+    ASSERT_LE(rate, 0.02);   // Must not be over 2%
+    if (rate > 0.0125)
+      mediocre_filters++;  // Allowed, but not too often
+    else
+      good_filters++;
+  }
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n",
+            good_filters, mediocre_filters);
+  }
+  ASSERT_LE(mediocre_filters, good_filters/5);
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
 
-  return rocksdb::test::RunAllTests();
+  return RUN_ALL_TESTS();
 }
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/util/build_version.h b/src/rocksdb/util/build_version.h
index 2035a78..ca1dbf5 100644
--- a/src/rocksdb/util/build_version.h
+++ b/src/rocksdb/util/build_version.h
@@ -7,10 +7,9 @@
 #if !defined(IOS_CROSS_COMPILE)
 // if we compile with Xcode, we don't run build_detect_vesion, so we don't
 // generate these variables
-// these variables tell us about the git config and time
+// this variable tells us about the git revision
 extern const char* rocksdb_build_git_sha;
 
-// these variables tell us when the compilation occurred
-extern const char* rocksdb_build_compile_time;
+// Date on which the code was compiled:
 extern const char* rocksdb_build_compile_date;
 #endif
diff --git a/src/rocksdb/util/cache.cc b/src/rocksdb/util/cache.cc
index f1c48a8..781e870 100644
--- a/src/rocksdb/util/cache.cc
+++ b/src/rocksdb/util/cache.cc
@@ -26,8 +26,27 @@ namespace {
 
 // LRU cache implementation
 
-// An entry is a variable length heap-allocated structure.  Entries
-// are kept in a circular doubly linked list ordered by access time.
+// An entry is a variable length heap-allocated structure.
+// Entries are referenced by cache and/or by any external entity.
+// The cache keeps all its entries in table. Some elements
+// are also stored on LRU list.
+//
+// LRUHandle can be in these states:
+// 1. Referenced externally AND in hash table.
+//  In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
+// 2. Not referenced externally and in hash table. In that case the entry is
+// in the LRU and can be freed. (refs == 1 && in_cache == true)
+// 3. Referenced externally and not in hash table. In that case the entry is
+// not on the LRU list and not in table. (refs >= 1 && in_cache == false)
+//
+// All newly created LRUHandles are in state 1. If you call LRUCache::Release
+// on entry in state 1, it will go into state 2. To move from state 1 to
+// state 3, either call LRUCache::Erase or LRUCache::Insert with the same key.
+// To move from state 2 to state 1, use LRUCache::Lookup.
+// Before destruction, make sure that no handles are in state 1. This means
+// that any successful LRUCache::Lookup/LRUCache::Insert have a matching
+// LRUCache::Release (to move into state 2) or LRUCache::Erase (for state 3)
+
 struct LRUHandle {
   void* value;
   void (*deleter)(const Slice&, void* value);
@@ -36,7 +55,9 @@ struct LRUHandle {
   LRUHandle* prev;
   size_t charge;      // TODO(opt): Only allow uint32_t?
   size_t key_length;
-  uint32_t refs;
+  uint32_t refs;      // a number of refs to this entry
+                      // cache itself is counted as 1
+  bool in_cache;      // true, if this entry is referenced by the hash table
   uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
   char key_data[1];   // Beginning of key
 
@@ -49,6 +70,12 @@ struct LRUHandle {
       return Slice(key_data, key_length);
     }
   }
+
+  void Free() {
+    assert((refs == 1 && in_cache) || (refs == 0 && !in_cache));
+    (*deleter)(key(), value);
+    free(this);
+  }
 };
 
 // We provide our own simple hash table since it removes a whole bunch
@@ -59,7 +86,28 @@ struct LRUHandle {
 class HandleTable {
  public:
   HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
-  ~HandleTable() { delete[] list_; }
+
+  template <typename T>
+  void ApplyToAllCacheEntries(T func) {
+    for (uint32_t i = 0; i < length_; i++) {
+      LRUHandle* h = list_[i];
+      while (h != nullptr) {
+        auto n = h->next_hash;
+        assert(h->in_cache);
+        func(h);
+        h = n;
+      }
+    }
+  }
+
+  ~HandleTable() {
+    ApplyToAllCacheEntries([](LRUHandle* h) {
+      if (h->refs == 1) {
+        h->Free();
+      }
+    });
+    delete[] list_;
+  }
 
   LRUHandle* Lookup(const Slice& key, uint32_t hash) {
     return *FindPointer(key, hash);
@@ -144,10 +192,9 @@ class LRUCache {
   ~LRUCache();
 
   // Separate from constructor so caller can easily make an array of LRUCache
-  void SetCapacity(size_t capacity) { capacity_ = capacity; }
-  void SetRemoveScanCountLimit(size_t remove_scan_count_limit) {
-    remove_scan_count_limit_ = remove_scan_count_limit;
-  }
+  // if current usage is more than new capacity, the function will attempt to
+  // free the needed space
+  void SetCapacity(size_t capacity);
 
   // Like Cache methods, but with an extra "hash" parameter.
   Cache::Handle* Insert(const Slice& key, uint32_t hash,
@@ -173,12 +220,16 @@ class LRUCache {
   // Just reduce the reference count by 1.
   // Return true if last reference
   bool Unref(LRUHandle* e);
-  // Call deleter and free
-  void FreeEntry(LRUHandle* e);
+
+  // Free some space following strict LRU policy until enough space
+  // to hold (usage_ + charge) is freed or the lru list is empty
+  // This function is not thread safe - it needs to be executed while
+  // holding the mutex_
+  void EvictFromLRU(size_t charge,
+                    autovector<LRUHandle*>* deleted);
 
   // Initialized before use.
   size_t capacity_;
-  uint32_t remove_scan_count_limit_;
 
   // mutex_ protects the following state.
   // We don't count mutex_ as the cache's internal state so semantically we
@@ -188,6 +239,7 @@ class LRUCache {
 
   // Dummy head of LRU list.
   // lru.prev is newest entry, lru.next is oldest entry.
+  // The LRU list contains items which can be evicted, i.e. referenced only by the cache
   LRUHandle lru_;
 
   HandleTable table_;
@@ -200,16 +252,7 @@ LRUCache::LRUCache()
   lru_.prev = &lru_;
 }
 
-LRUCache::~LRUCache() {
-  for (LRUHandle* e = lru_.next; e != &lru_; ) {
-    LRUHandle* next = e->next;
-    assert(e->refs == 1);  // Error if caller has an unreleased handle
-    if (Unref(e)) {
-      FreeEntry(e);
-    }
-    e = next;
-  }
-}
+LRUCache::~LRUCache() {}
 
 bool LRUCache::Unref(LRUHandle* e) {
   assert(e->refs > 0);
@@ -217,47 +260,77 @@ bool LRUCache::Unref(LRUHandle* e) {
   return e->refs == 0;
 }
 
-void LRUCache::FreeEntry(LRUHandle* e) {
-  assert(e->refs == 0);
-  (*e->deleter)(e->key(), e->value);
-  free(e);
-}
+// Call deleter and free
 
 void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                       bool thread_safe) {
   if (thread_safe) {
     mutex_.Lock();
   }
-  for (auto e = lru_.next; e != &lru_; e = e->next) {
-    callback(e->value, e->charge);
-  }
+  table_.ApplyToAllCacheEntries([callback](LRUHandle* h) {
+    callback(h->value, h->charge);
+  });
   if (thread_safe) {
     mutex_.Unlock();
   }
 }
 
 void LRUCache::LRU_Remove(LRUHandle* e) {
+  assert(e->next != nullptr);
+  assert(e->prev != nullptr);
   e->next->prev = e->prev;
   e->prev->next = e->next;
-  usage_ -= e->charge;
+  e->prev = e->next = nullptr;
 }
 
 void LRUCache::LRU_Append(LRUHandle* e) {
   // Make "e" newest entry by inserting just before lru_
+  assert(e->next == nullptr);
+  assert(e->prev == nullptr);
   e->next = &lru_;
   e->prev = lru_.prev;
   e->prev->next = e;
   e->next->prev = e;
-  usage_ += e->charge;
+}
+
+void LRUCache::EvictFromLRU(size_t charge,
+                            autovector<LRUHandle*>* deleted) {
+  while (usage_ + charge > capacity_ && lru_.next != &lru_) {
+    LRUHandle* old = lru_.next;
+    assert(old->in_cache);
+    assert(old->refs == 1);  // LRU list contains elements which may be evicted
+    LRU_Remove(old);
+    table_.Remove(old->key(), old->hash);
+    old->in_cache = false;
+    Unref(old);
+    usage_ -= old->charge;
+    deleted->push_back(old);
+  }
+}
+
+void LRUCache::SetCapacity(size_t capacity) {
+  autovector<LRUHandle*> last_reference_list;
+  {
+    MutexLock l(&mutex_);
+    capacity_ = capacity;
+    EvictFromLRU(0, &last_reference_list);
+  }
+  // we free the entries here outside of mutex for
+  // performance reasons
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
 }
 
 Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
   MutexLock l(&mutex_);
   LRUHandle* e = table_.Lookup(key, hash);
   if (e != nullptr) {
+    assert(e->in_cache);
+    if (e->refs == 1) {
+      LRU_Remove(e);
+    }
     e->refs++;
-    LRU_Remove(e);
-    LRU_Append(e);
   }
   return reinterpret_cast<Cache::Handle*>(e);
 }
@@ -268,9 +341,31 @@ void LRUCache::Release(Cache::Handle* handle) {
   {
     MutexLock l(&mutex_);
     last_reference = Unref(e);
+    if (last_reference) {
+      usage_ -= e->charge;
+    }
+    if (e->refs == 1 && e->in_cache) {
+      // The item is still in cache, and nobody else holds a reference to it
+      if (usage_ > capacity_) {
+        // the cache is full
+        // The LRU list must be empty since the cache is full
+        assert(lru_.next == &lru_);
+        // take this opportunity and remove the item
+        table_.Remove(e->key(), e->hash);
+        e->in_cache = false;
+        Unref(e);
+        usage_ -= e->charge;
+        last_reference = true;
+      } else {
+        // put the item on the list to be potentially freed
+        LRU_Append(e);
+      }
+    }
   }
+
+  // free outside of mutex
   if (last_reference) {
-    FreeEntry(e);
+    e->Free();
   }
 }
 
@@ -278,8 +373,11 @@ Cache::Handle* LRUCache::Insert(
     const Slice& key, uint32_t hash, void* value, size_t charge,
     void (*deleter)(const Slice& key, void* value)) {
 
-  LRUHandle* e = reinterpret_cast<LRUHandle*>(
-      malloc(sizeof(LRUHandle)-1 + key.size()));
+  // Allocate the memory here outside of the mutex
+  // If the cache is full, we'll have to release it
+  // It shouldn't happen very often though.
+  LRUHandle* e =
+      reinterpret_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
   autovector<LRUHandle*> last_reference_list;
 
   e->value = value;
@@ -288,47 +386,29 @@ Cache::Handle* LRUCache::Insert(
   e->key_length = key.size();
   e->hash = hash;
   e->refs = 2;  // One from LRUCache, one for the returned handle
+  e->next = e->prev = nullptr;
+  e->in_cache = true;
   memcpy(e->key_data, key.data(), key.size());
 
   {
     MutexLock l(&mutex_);
 
-    LRU_Append(e);
+    // Free the space following strict LRU policy until enough space
+    // is freed or the lru list is empty
+    EvictFromLRU(charge, &last_reference_list);
 
+    // insert into the cache
+    // note that the cache might get larger than its capacity if not enough
+    // space was freed
     LRUHandle* old = table_.Insert(e);
+    usage_ += e->charge;
     if (old != nullptr) {
-      LRU_Remove(old);
-      if (Unref(old)) {
-        last_reference_list.push_back(old);
-      }
-    }
-
-    if (remove_scan_count_limit_ > 0) {
-      // Try to free the space by evicting the entries that are only
-      // referenced by the cache first.
-      LRUHandle* cur = lru_.next;
-      for (unsigned int scanCount = 0;
-           usage_ > capacity_ && cur != &lru_
-           && scanCount < remove_scan_count_limit_; scanCount++) {
-        LRUHandle* next = cur->next;
-        if (cur->refs <= 1) {
-          LRU_Remove(cur);
-          table_.Remove(cur->key(), cur->hash);
-          if (Unref(cur)) {
-            last_reference_list.push_back(cur);
-          }
-        }
-        cur = next;
-      }
-    }
-
-    // Free the space following strict LRU policy until enough space
-    // is freed.
-    while (usage_ > capacity_ && lru_.next != &lru_) {
-      LRUHandle* old = lru_.next;
-      LRU_Remove(old);
-      table_.Remove(old->key(), old->hash);
+      old->in_cache = false;
       if (Unref(old)) {
+        usage_ -= old->charge;
+        // old is on LRU because it's in cache and its reference count
+        // was just 1 (Unref returned 0)
+        LRU_Remove(old);
         last_reference_list.push_back(old);
       }
     }
@@ -337,7 +417,7 @@ Cache::Handle* LRUCache::Insert(
   // we free the entries here outside of mutex for
   // performance reasons
   for (auto entry : last_reference_list) {
-    FreeEntry(entry);
+    entry->Free();
   }
 
   return reinterpret_cast<Cache::Handle*>(e);
@@ -350,24 +430,31 @@ void LRUCache::Erase(const Slice& key, uint32_t hash) {
     MutexLock l(&mutex_);
     e = table_.Remove(key, hash);
     if (e != nullptr) {
-      LRU_Remove(e);
       last_reference = Unref(e);
+      if (last_reference) {
+        usage_ -= e->charge;
+      }
+      if (last_reference && e->in_cache) {
+        LRU_Remove(e);
+      }
+      e->in_cache = false;
     }
   }
+
   // mutex not held here
   // last_reference will only be true if e != nullptr
   if (last_reference) {
-    FreeEntry(e);
+    e->Free();
   }
 }
 
 static int kNumShardBits = 4;          // default values, can be overridden
-static int kRemoveScanCountLimit = 0; // default values, can be overridden
 
 class ShardedLRUCache : public Cache {
  private:
   LRUCache* shards_;
   port::Mutex id_mutex_;
+  port::Mutex capacity_mutex_;
   uint64_t last_id_;
   int num_shard_bits_;
   size_t capacity_;
@@ -381,60 +468,56 @@ class ShardedLRUCache : public Cache {
     return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
   }
 
-  void init(size_t capacity, int numbits, int removeScanCountLimit) {
-    num_shard_bits_ = numbits;
-    capacity_ = capacity;
+ public:
+  ShardedLRUCache(size_t capacity, int num_shard_bits)
+      : last_id_(0), num_shard_bits_(num_shard_bits), capacity_(capacity) {
     int num_shards = 1 << num_shard_bits_;
     shards_ = new LRUCache[num_shards];
     const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
     for (int s = 0; s < num_shards; s++) {
       shards_[s].SetCapacity(per_shard);
-      shards_[s].SetRemoveScanCountLimit(removeScanCountLimit);
     }
   }
-
- public:
-  explicit ShardedLRUCache(size_t capacity)
-      : last_id_(0) {
-    init(capacity, kNumShardBits, kRemoveScanCountLimit);
-  }
-  ShardedLRUCache(size_t capacity, int num_shard_bits,
-                  int removeScanCountLimit)
-     : last_id_(0) {
-    init(capacity, num_shard_bits, removeScanCountLimit);
-  }
   virtual ~ShardedLRUCache() {
     delete[] shards_;
   }
+  virtual void SetCapacity(size_t capacity) override {
+    int num_shards = 1 << num_shard_bits_;
+    const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
+    MutexLock l(&capacity_mutex_);
+    for (int s = 0; s < num_shards; s++) {
+      shards_[s].SetCapacity(per_shard);
+    }
+    capacity_ = capacity;
+  }
   virtual Handle* Insert(const Slice& key, void* value, size_t charge,
-                         void (*deleter)(const Slice& key, void* value)) {
+                         void (*deleter)(const Slice& key,
+                                         void* value)) override {
     const uint32_t hash = HashSlice(key);
     return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter);
   }
-  virtual Handle* Lookup(const Slice& key) {
+  virtual Handle* Lookup(const Slice& key) override {
     const uint32_t hash = HashSlice(key);
     return shards_[Shard(hash)].Lookup(key, hash);
   }
-  virtual void Release(Handle* handle) {
+  virtual void Release(Handle* handle) override {
     LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
     shards_[Shard(h->hash)].Release(handle);
   }
-  virtual void Erase(const Slice& key) {
+  virtual void Erase(const Slice& key) override {
     const uint32_t hash = HashSlice(key);
     shards_[Shard(hash)].Erase(key, hash);
   }
-  virtual void* Value(Handle* handle) {
+  virtual void* Value(Handle* handle) override {
     return reinterpret_cast<LRUHandle*>(handle)->value;
   }
-  virtual uint64_t NewId() {
+  virtual uint64_t NewId() override {
     MutexLock l(&id_mutex_);
     return ++(last_id_);
   }
-  virtual size_t GetCapacity() const {
-    return capacity_;
-  }
+  virtual size_t GetCapacity() const override { return capacity_; }
 
-  virtual size_t GetUsage() const {
+  virtual size_t GetUsage() const override {
     // We will not lock the cache when getting the usage from shards.
     // for (size_t i = 0; i < num_shard_bits_; ++i)
     int num_shards = 1 << num_shard_bits_;
@@ -445,9 +528,7 @@ class ShardedLRUCache : public Cache {
     return usage;
   }
 
-  virtual void DisownData() {
-    shards_ = nullptr;
-  }
+  virtual void DisownData() override { shards_ = nullptr; }
 
   virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                       bool thread_safe) override {
@@ -465,17 +546,10 @@ shared_ptr<Cache> NewLRUCache(size_t capacity) {
 }
 
 shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits) {
-  return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit);
-}
-
-shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits,
-                              int removeScanCountLimit) {
   if (num_shard_bits >= 20) {
     return nullptr;  // the cache cannot be sharded into too many fine pieces
   }
-  return std::make_shared<ShardedLRUCache>(capacity,
-                                           num_shard_bits,
-                                           removeScanCountLimit);
+  return std::make_shared<ShardedLRUCache>(capacity, num_shard_bits);
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/cache_bench.cc b/src/rocksdb/util/cache_bench.cc
new file mode 100644
index 0000000..92df772
--- /dev/null
+++ b/src/rocksdb/util/cache_bench.cc
@@ -0,0 +1,276 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <inttypes.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <gflags/gflags.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+using GFLAGS::ParseCommandLineFlags;
+
+static const uint32_t KB = 1024;
+
+DEFINE_int32(threads, 16, "Number of concurrent threads to run.");
+DEFINE_int64(cache_size, 8 * KB * KB,
+             "Number of bytes to use as a cache of uncompressed data.");
+DEFINE_int32(num_shard_bits, 4, "shard_bits.");
+
+DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache");
+DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
+
+DEFINE_bool(populate_cache, false, "Populate cache before operations");
+DEFINE_int32(insert_percent, 40,
+             "Ratio of insert to total workload (expressed as a percentage)");
+DEFINE_int32(lookup_percent, 50,
+             "Ratio of lookup to total workload (expressed as a percentage)");
+DEFINE_int32(erase_percent, 10,
+             "Ratio of erase to total workload (expressed as a percentage)");
+
+namespace rocksdb {
+
+class CacheBench;
+namespace {
+void deleter(const Slice& key, void* value) {
+    delete reinterpret_cast<char *>(value);
+}
+
+// State shared by all concurrent executions of the same benchmark.
+class SharedState {
+ public:
+  explicit SharedState(CacheBench* cache_bench)
+      : cv_(&mu_),
+        num_threads_(FLAGS_threads),
+        num_initialized_(0),
+        start_(false),
+        num_done_(0),
+        cache_bench_(cache_bench) {
+  }
+
+  ~SharedState() {}
+
+  port::Mutex* GetMutex() {
+    return &mu_;
+  }
+
+  port::CondVar* GetCondVar() {
+    return &cv_;
+  }
+
+  CacheBench* GetCacheBench() const {
+    return cache_bench_;
+  }
+
+  void IncInitialized() {
+    num_initialized_++;
+  }
+
+  void IncDone() {
+    num_done_++;
+  }
+
+  bool AllInitialized() const {
+    return num_initialized_ >= num_threads_;
+  }
+
+  bool AllDone() const {
+    return num_done_ >= num_threads_;
+  }
+
+  void SetStart() {
+    start_ = true;
+  }
+
+  bool Started() const {
+    return start_;
+  }
+
+ private:
+  port::Mutex mu_;
+  port::CondVar cv_;
+
+  const uint64_t num_threads_;
+  uint64_t num_initialized_;
+  bool start_;
+  uint64_t num_done_;
+
+  CacheBench* cache_bench_;
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  uint32_t tid;
+  Random rnd;
+  SharedState* shared;
+
+  ThreadState(uint32_t index, SharedState* _shared)
+      : tid(index), rnd(1000 + index), shared(_shared) {}
+};
+}  // namespace
+
+class CacheBench {
+ public:
+  CacheBench() :
+      cache_(NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits)),
+      num_threads_(FLAGS_threads) {}
+
+  ~CacheBench() {}
+
+  void PopulateCache() {
+    Random rnd(1);
+    for (int64_t i = 0; i < FLAGS_cache_size; i++) {
+      uint64_t rand_key = rnd.Next() % FLAGS_max_key;
+      // Cast uint64* to be char*, data would be copied to cache
+      Slice key(reinterpret_cast<char*>(&rand_key), 8);
+      // do insert
+      auto handle = cache_->Insert(key, new char[10], 1, &deleter);
+      cache_->Release(handle);
+    }
+  }
+
+  bool Run() {
+    rocksdb::Env* env = rocksdb::Env::Default();
+
+    PrintEnv();
+    SharedState shared(this);
+    std::vector<ThreadState*> threads(num_threads_);
+    for (uint32_t i = 0; i < num_threads_; i++) {
+      threads[i] = new ThreadState(i, &shared);
+      env->StartThread(ThreadBody, threads[i]);
+    }
+    {
+      MutexLock l(shared.GetMutex());
+      while (!shared.AllInitialized()) {
+        shared.GetCondVar()->Wait();
+      }
+      // Record start time
+      uint64_t start_time = env->NowMicros();
+
+      // Start all threads
+      shared.SetStart();
+      shared.GetCondVar()->SignalAll();
+
+      // Wait threads to complete
+      while (!shared.AllDone()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      // Record end time
+      uint64_t end_time = env->NowMicros();
+      double elapsed = static_cast<double>(end_time - start_time) * 1e-6;
+      uint32_t qps = static_cast<uint32_t>(
+          static_cast<double>(FLAGS_threads * FLAGS_ops_per_thread) / elapsed);
+      fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps);
+    }
+    return true;
+  }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  uint32_t num_threads_;
+
+  static void ThreadBody(void* v) {
+    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    SharedState* shared = thread->shared;
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncInitialized();
+      if (shared->AllInitialized()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->Started()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+    thread->shared->GetCacheBench()->OperateCache(thread);
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncDone();
+      if (shared->AllDone()) {
+        shared->GetCondVar()->SignalAll();
+      }
+    }
+  }
+
+  void OperateCache(ThreadState* thread) {
+    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
+      uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key;
+      // Cast uint64* to be char*, data would be copied to cache
+      Slice key(reinterpret_cast<char*>(&rand_key), 8);
+      int32_t prob_op = thread->rnd.Uniform(100);
+      if (prob_op >= 0 && prob_op < FLAGS_insert_percent) {
+        // do insert
+        auto handle = cache_->Insert(key, new char[10], 1, &deleter);
+        cache_->Release(handle);
+      } else if (prob_op -= FLAGS_insert_percent &&
+                 prob_op < FLAGS_lookup_percent) {
+        // do lookup
+        auto handle = cache_->Lookup(key);
+        if (handle) {
+          cache_->Release(handle);
+        }
+      } else if (prob_op -= FLAGS_lookup_percent &&
+                 prob_op < FLAGS_erase_percent) {
+        // do erase
+        cache_->Erase(key);
+      }
+    }
+  }
+
+  void PrintEnv() const {
+    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
+    printf("Number of threads   : %d\n", FLAGS_threads);
+    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
+    printf("Cache size          : %" PRIu64 "\n", FLAGS_cache_size);
+    printf("Num shard bits      : %d\n", FLAGS_num_shard_bits);
+    printf("Max key             : %" PRIu64 "\n", FLAGS_max_key);
+    printf("Populate cache      : %d\n", FLAGS_populate_cache);
+    printf("Insert percentage   : %d%%\n", FLAGS_insert_percent);
+    printf("Lookup percentage   : %d%%\n", FLAGS_lookup_percent);
+    printf("Erase percentage    : %d%%\n", FLAGS_erase_percent);
+    printf("----------------------------\n");
+  }
+};
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_threads <= 0) {
+    fprintf(stderr, "threads number <= 0\n");
+    exit(1);
+  }
+
+  rocksdb::CacheBench bench;
+  if (FLAGS_populate_cache) {
+    bench.PopulateCache();
+  }
+  if (bench.Run()) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/util/cache_test.cc b/src/rocksdb/util/cache_test.cc
index c12cdb7..6fba6a7 100644
--- a/src/rocksdb/util/cache_test.cc
+++ b/src/rocksdb/util/cache_test.cc
@@ -13,6 +13,7 @@
 #include <string>
 #include <iostream>
 #include "util/coding.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 
 namespace rocksdb {
@@ -28,9 +29,11 @@ static int DecodeKey(const Slice& k) {
   return DecodeFixed32(k.data());
 }
 static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
-static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
+static int DecodeValue(void* v) {
+  return static_cast<int>(reinterpret_cast<uintptr_t>(v));
+}
 
-class CacheTest {
+class CacheTest : public testing::Test {
  public:
   static CacheTest* current_;
 
@@ -41,11 +44,9 @@ class CacheTest {
 
   static const int kCacheSize = 1000;
   static const int kNumShardBits = 4;
-  static const int kRemoveScanCountLimit = 16;
 
   static const int kCacheSize2 = 100;
   static const int kNumShardBits2 = 2;
-  static const int kRemoveScanCountLimit2 = 200;
 
   std::vector<int> deleted_keys_;
   std::vector<int> deleted_values_;
@@ -53,9 +54,8 @@ class CacheTest {
   shared_ptr<Cache> cache2_;
 
   CacheTest() :
-      cache_(NewLRUCache(kCacheSize, kNumShardBits, kRemoveScanCountLimit)),
-      cache2_(NewLRUCache(kCacheSize2, kNumShardBits2,
-                          kRemoveScanCountLimit2)) {
+      cache_(NewLRUCache(kCacheSize, kNumShardBits)),
+      cache2_(NewLRUCache(kCacheSize2, kNumShardBits2)) {
     current_ = this;
   }
 
@@ -111,10 +111,10 @@ namespace {
 void dumbDeleter(const Slice& key, void* value) { }
 }  // namespace
 
-TEST(CacheTest, UsageTest) {
+TEST_F(CacheTest, UsageTest) {
   // cache is shared_ptr and will be automatically cleaned up.
   const uint64_t kCapacity = 100000;
-  auto cache = NewLRUCache(kCapacity, 8, 200);
+  auto cache = NewLRUCache(kCapacity, 8);
 
   size_t usage = 0;
   const char* value = "abcdef";
@@ -131,7 +131,7 @@ TEST(CacheTest, UsageTest) {
 
   // make sure the cache will be overloaded
   for (uint64_t i = 1; i < kCapacity; ++i) {
-    auto key = std::to_string(i);
+    auto key = ToString(i);
     cache->Release(
         cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter)
     );
@@ -142,7 +142,7 @@ TEST(CacheTest, UsageTest) {
   ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
 }
 
-TEST(CacheTest, HitAndMiss) {
+TEST_F(CacheTest, HitAndMiss) {
   ASSERT_EQ(-1, Lookup(100));
 
   Insert(100, 101);
@@ -165,7 +165,7 @@ TEST(CacheTest, HitAndMiss) {
   ASSERT_EQ(101, deleted_values_[0]);
 }
 
-TEST(CacheTest, Erase) {
+TEST_F(CacheTest, Erase) {
   Erase(200);
   ASSERT_EQ(0U, deleted_keys_.size());
 
@@ -184,32 +184,37 @@ TEST(CacheTest, Erase) {
   ASSERT_EQ(1U, deleted_keys_.size());
 }
 
-TEST(CacheTest, EntriesArePinned) {
+TEST_F(CacheTest, EntriesArePinned) {
   Insert(100, 101);
   Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
   ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+  ASSERT_EQ(1U, cache_->GetUsage());
 
   Insert(100, 102);
   Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
   ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
   ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(2U, cache_->GetUsage());
 
   cache_->Release(h1);
   ASSERT_EQ(1U, deleted_keys_.size());
   ASSERT_EQ(100, deleted_keys_[0]);
   ASSERT_EQ(101, deleted_values_[0]);
+  ASSERT_EQ(1U, cache_->GetUsage());
 
   Erase(100);
   ASSERT_EQ(-1, Lookup(100));
   ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(1U, cache_->GetUsage());
 
   cache_->Release(h2);
   ASSERT_EQ(2U, deleted_keys_.size());
   ASSERT_EQ(100, deleted_keys_[1]);
   ASSERT_EQ(102, deleted_values_[1]);
+  ASSERT_EQ(0U, cache_->GetUsage());
 }
 
-TEST(CacheTest, EvictionPolicy) {
+TEST_F(CacheTest, EvictionPolicy) {
   Insert(100, 101);
   Insert(200, 201);
 
@@ -223,7 +228,7 @@ TEST(CacheTest, EvictionPolicy) {
   ASSERT_EQ(-1, Lookup(200));
 }
 
-TEST(CacheTest, EvictionPolicyRef) {
+TEST_F(CacheTest, EvictionPolicyRef) {
   Insert(100, 101);
   Insert(101, 102);
   Insert(102, 103);
@@ -271,77 +276,29 @@ TEST(CacheTest, EvictionPolicyRef) {
   cache_->Release(h204);
 }
 
-TEST(CacheTest, EvictionPolicyRef2) {
-  std::vector<Cache::Handle*> handles;
-
-  Insert(100, 101);
-  // Insert entries much more than Cache capacity
-  for (int i = 0; i < kCacheSize + 100; i++) {
-    Insert(1000 + i, 2000 + i);
-    if (i < kCacheSize ) {
-      handles.push_back(cache_->Lookup(EncodeKey(1000 + i)));
-    }
-  }
-
-  // Make sure referenced keys are also possible to be deleted
-  // if there are not sufficient non-referenced keys
-  for (int i = 0; i < 5; i++) {
-    ASSERT_EQ(-1, Lookup(1000 + i));
-  }
+TEST_F(CacheTest, ErasedHandleState) {
+  // insert a key and get two handles
+  Insert(100, 1000);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(h1, h2);
+  ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000);
+  ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000);
 
-  for (int i = kCacheSize; i < kCacheSize + 100; i++) {
-    ASSERT_EQ(2000 + i, Lookup(1000 + i));
-  }
+  // delete the key from the cache
+  Erase(100);
+  // can no longer find in the cache
   ASSERT_EQ(-1, Lookup(100));
 
-  // Cleaning up all the handles
-  while (handles.size() > 0) {
-    cache_->Release(handles.back());
-    handles.pop_back();
-  }
-}
-
-TEST(CacheTest, EvictionPolicyRefLargeScanLimit) {
-  std::vector<Cache::Handle*> handles2;
-
-  // Cache2 has a cache RemoveScanCountLimit higher than cache size
-  // so it would trigger a boundary condition.
-
-  // Populate the cache with 10 more keys than its size.
-  // Reference all keys except one close to the end.
-  for (int i = 0; i < kCacheSize2 + 10; i++) {
-    Insert2(1000 + i, 2000+i);
-    if (i != kCacheSize2 ) {
-      handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i)));
-    }
-  }
-
-  // Make sure referenced keys are also possible to be deleted
-  // if there are not sufficient non-referenced keys
-  for (int i = 0; i < 3; i++) {
-    ASSERT_EQ(-1, Lookup2(1000 + i));
-  }
-  // The non-referenced value is deleted even if it's accessed
-  // recently.
-  ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2));
-  // Other values recently accessed are not deleted since they
-  // are referenced.
-  for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) {
-    if (i != kCacheSize2) {
-      ASSERT_EQ(2000 + i, Lookup2(1000 + i));
-    }
-  }
+  // release one handle
+  cache_->Release(h1);
+  // still can't find in cache
+  ASSERT_EQ(-1, Lookup(100));
 
-  // Cleaning up all the handles
-  while (handles2.size() > 0) {
-    cache2_->Release(handles2.back());
-    handles2.pop_back();
-  }
+  cache_->Release(h2);
 }
 
-
-
-TEST(CacheTest, HeavyEntries) {
+TEST_F(CacheTest, HeavyEntries) {
   // Add a bunch of light and heavy entries and then count the combined
   // size of items still in the cache, which must be approximately the
   // same as the total capacity.
@@ -368,7 +325,7 @@ TEST(CacheTest, HeavyEntries) {
   ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10);
 }
 
-TEST(CacheTest, NewId) {
+TEST_F(CacheTest, NewId) {
   uint64_t a = cache_->NewId();
   uint64_t b = cache_->NewId();
   ASSERT_NE(a, b);
@@ -377,21 +334,64 @@ TEST(CacheTest, NewId) {
 
 class Value {
  private:
-  int v_;
+  size_t v_;
  public:
-  explicit Value(int v) : v_(v) { }
+  explicit Value(size_t v) : v_(v) { }
 
   ~Value() { std::cout << v_ << " is destructed\n"; }
 };
 
 namespace {
 void deleter(const Slice& key, void* value) {
-  delete (Value *)value;
+  delete static_cast<Value *>(value);
 }
 }  // namespace
 
-TEST(CacheTest, BadEviction) {
-  int n = 10;
+TEST_F(CacheTest, SetCapacity) {
+  // test1: increase capacity
+  // lets create a cache with capacity 5,
+  // then, insert 5 elements, then increase capacity
+  // to 10, returned capacity should be 10, usage=5
+  std::shared_ptr<Cache> cache = NewLRUCache(5, 0);
+  std::vector<Cache::Handle*> handles(10);
+  // Insert 5 entries, but not releasing.
+  for (size_t i = 0; i < 5; i++) {
+    std::string key = ToString(i+1);
+    handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter);
+  }
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(5U, cache->GetUsage());
+  cache->SetCapacity(10);
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(5U, cache->GetUsage());
+
+  // test2: decrease capacity
+  // insert 5 more elements to cache, then release 5,
+  // then decrease capacity to 7, final capacity should be 7
+  // and usage should be 7
+  for (size_t i = 5; i < 10; i++) {
+    std::string key = ToString(i+1);
+    handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter);
+  }
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(10U, cache->GetUsage());
+  for (size_t i = 0; i < 5; i++) {
+    cache->Release(handles[i]);
+  }
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(10U, cache->GetUsage());
+  cache->SetCapacity(7);
+  ASSERT_EQ(7, cache->GetCapacity());
+  ASSERT_EQ(7, cache->GetUsage());
+
+  // release remaining 5 to keep valgrind happy
+  for (size_t i = 5; i < 10; i++) {
+    cache->Release(handles[i]);
+  }
+}
+
+TEST_F(CacheTest, OverCapacity) {
+  size_t n = 10;
 
   // a LRUCache with n entries and one shard only
   std::shared_ptr<Cache> cache = NewLRUCache(n, 0);
@@ -399,25 +399,42 @@ TEST(CacheTest, BadEviction) {
   std::vector<Cache::Handle*> handles(n+1);
 
   // Insert n+1 entries, but not releasing.
-  for (int i = 0; i < n+1; i++) {
-    std::string key = std::to_string(i+1);
+  for (size_t i = 0; i < n + 1; i++) {
+    std::string key = ToString(i+1);
     handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter);
   }
 
   // Guess what's in the cache now?
-  for (int i = 0; i < n+1; i++) {
-    std::string key = std::to_string(i+1);
+  for (size_t i = 0; i < n + 1; i++) {
+    std::string key = ToString(i+1);
     auto h = cache->Lookup(key);
     std::cout << key << (h?" found\n":" not found\n");
-    // Only the first entry should be missing
-    ASSERT_TRUE(h || i == 0);
+    ASSERT_TRUE(h != nullptr);
     if (h) cache->Release(h);
   }
 
-  for (int i = 0; i < n+1; i++) {
+  // the cache is over capacity since nothing could be evicted
+  ASSERT_EQ(n + 1U, cache->GetUsage());
+  for (size_t i = 0; i < n + 1; i++) {
     cache->Release(handles[i]);
   }
-  std::cout << "Poor entries\n";
+
+  // cache is under capacity now since elements were released
+  ASSERT_EQ(n, cache->GetUsage());
+
+  // element 0 is evicted and the rest is there
+  // This is consistent with the LRU policy since the element 0
+  // was released first
+  for (size_t i = 0; i < n + 1; i++) {
+    std::string key = ToString(i+1);
+    auto h = cache->Lookup(key);
+    if (h) {
+      ASSERT_NE(i, 0U);
+      cache->Release(h);
+    } else {
+      ASSERT_EQ(i, 0U);
+    }
+  }
 }
 
 namespace {
@@ -427,7 +444,7 @@ void callback(void* entry, size_t charge) {
 }
 };
 
-TEST(CacheTest, ApplyToAllCacheEntiresTest) {
+TEST_F(CacheTest, ApplyToAllCacheEntiresTest) {
   std::vector<std::pair<int, int>> inserted;
   callback_state.clear();
 
@@ -445,5 +462,6 @@ TEST(CacheTest, ApplyToAllCacheEntiresTest) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/coding.cc b/src/rocksdb/util/coding.cc
index 31ae0e3..f09e672 100644
--- a/src/rocksdb/util/coding.cc
+++ b/src/rocksdb/util/coding.cc
@@ -78,92 +78,4 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
   return nullptr;
 }
 
-void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
-                     uint32_t bits, uint64_t value) {
-  assert((offset + bits + 7)/8 <= dstlen);
-  assert(bits <= 64);
-
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-
-  size_t byteOffset = offset / 8;
-  size_t bitOffset = offset % 8;
-
-  // This prevents unused variable warnings when compiling.
-#ifndef NDEBUG
-  // Store truncated value.
-  uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value;
-  uint32_t origBits = bits;
-#endif
-
-  while (bits > 0) {
-    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
-    unsigned char mask = ((1 << bitsToGet) - 1);
-
-    ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) +
-                      ((value & mask) << bitOffset);
-
-    value >>= bitsToGet;
-    byteOffset += 1;
-    bitOffset = 0;
-    bits -= bitsToGet;
-  }
-
-  assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits));
-}
-
-uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
-                         uint32_t bits) {
-  assert((offset + bits + 7)/8 <= srclen);
-  assert(bits <= 64);
-
-  const unsigned char* ptr = reinterpret_cast<const unsigned char*>(src);
-
-  uint64_t result = 0;
-
-  size_t byteOffset = offset / 8;
-  size_t bitOffset = offset % 8;
-  size_t shift = 0;
-
-  while (bits > 0) {
-    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
-    unsigned char mask = ((1 << bitsToGet) - 1);
-
-    result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift;
-
-    shift += bitsToGet;
-    byteOffset += 1;
-    bitOffset = 0;
-    bits -= bitsToGet;
-  }
-
-  return result;
-}
-
-void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
-                     uint64_t value) {
-  assert((offset + bits + 7)/8 <= dst->size());
-
-  const size_t kTmpBufLen = sizeof(value) + 1;
-  char tmpBuf[kTmpBufLen];
-
-  // Number of bytes of tmpBuf being used
-  const size_t kUsedBytes = (offset%8 + bits)/8;
-
-  // Copy relevant parts of dst to tmpBuf
-  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
-    tmpBuf[idx] = (*dst)[offset/8 + idx];
-  }
-
-  BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value);
-
-  // Copy tmpBuf back to dst
-  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
-    (*dst)[offset/8 + idx] = tmpBuf[idx];
-  }
-
-  // Do the check here too as we are working with a buffer.
-  assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) ==
-         BitStreamGetInt(dst, offset, bits));
-}
-
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/coding.h b/src/rocksdb/util/coding.h
index 8ffba51..a72f7db 100644
--- a/src/rocksdb/util/coding.h
+++ b/src/rocksdb/util/coding.h
@@ -38,6 +38,7 @@ extern void PutLengthPrefixedSliceParts(std::string* dst,
 
 // Standard Get... routines parse a value from the beginning of a Slice
 // and advance the slice past the parsed value.
+extern bool GetFixed64(Slice* input, uint64_t* value);
 extern bool GetVarint32(Slice* input, uint32_t* value);
 extern bool GetVarint64(Slice* input, uint64_t* value);
 extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
@@ -114,32 +115,6 @@ inline const char* GetVarint32Ptr(const char* p,
   return GetVarint32PtrFallback(p, limit, value);
 }
 
-// Writes an unsigned integer with bits number of bits with its least
-// significant bit at offset.
-// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and
-// so on.
-// value is truncated to the bits number of least significant bits.
-// REQUIRES: (offset+bits+7)/8 <= dstlen
-// REQUIRES: bits <= 64
-extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
-                            uint32_t bits, uint64_t value);
-
-// Reads an unsigned integer with bits number of bits with its least
-// significant bit at offset.
-// Bits are numbered in the same way as ByteStreamPutInt().
-// REQUIRES: (offset+bits+7)/8 <= srclen
-// REQUIRES: bits <= 64
-extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
-                                uint32_t bits);
-
-// Convenience functions
-extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
-                            uint64_t value);
-extern uint64_t BitStreamGetInt(const std::string* src, size_t offset,
-                                uint32_t bits);
-extern uint64_t BitStreamGetInt(const Slice* src, size_t offset,
-                                uint32_t bits);
-
 // -- Implementation of the functions declared above
 inline void EncodeFixed32(char* buf, uint32_t value) {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
@@ -182,7 +157,7 @@ inline void PutFixed64(std::string* dst, uint64_t value) {
 inline void PutVarint32(std::string* dst, uint32_t v) {
   char buf[5];
   char* ptr = EncodeVarint32(buf, v);
-  dst->append(buf, ptr - buf);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
 }
 
 inline char* EncodeVarint64(char* dst, uint64_t v) {
@@ -199,11 +174,11 @@ inline char* EncodeVarint64(char* dst, uint64_t v) {
 inline void PutVarint64(std::string* dst, uint64_t v) {
   char buf[10];
   char* ptr = EncodeVarint64(buf, v);
-  dst->append(buf, ptr - buf);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
 }
 
 inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
-  PutVarint32(dst, value.size());
+  PutVarint32(dst, static_cast<uint32_t>(value.size()));
   dst->append(value.data(), value.size());
 }
 
@@ -228,6 +203,15 @@ inline int VarintLength(uint64_t v) {
   return len;
 }
 
+inline bool GetFixed64(Slice* input, uint64_t* value) {
+  if (input->size() < sizeof(uint64_t)) {
+    return false;
+  }
+  *value = DecodeFixed64(input->data());
+  input->remove_prefix(sizeof(uint64_t));
+  return true;
+}
+
 inline bool GetVarint32(Slice* input, uint32_t* value) {
   const char* p = input->data();
   const char* limit = p + input->size();
@@ -235,7 +219,7 @@ inline bool GetVarint32(Slice* input, uint32_t* value) {
   if (q == nullptr) {
     return false;
   } else {
-    *input = Slice(q, limit - q);
+    *input = Slice(q, static_cast<size_t>(limit - q));
     return true;
   }
 }
@@ -247,7 +231,7 @@ inline bool GetVarint64(Slice* input, uint64_t* value) {
   if (q == nullptr) {
     return false;
   } else {
-    *input = Slice(q, limit - q);
+    *input = Slice(q, static_cast<size_t>(limit - q));
     return true;
   }
 }
@@ -281,14 +265,4 @@ inline Slice GetSliceUntil(Slice* slice, char delimiter) {
   return ret;
 }
 
-inline uint64_t BitStreamGetInt(const std::string* src, size_t offset,
-                                uint32_t bits) {
-  return BitStreamGetInt(src->data(), src->size(), offset, bits);
-}
-
-inline uint64_t BitStreamGetInt(const Slice* src, size_t offset,
-                                uint32_t bits) {
-  return BitStreamGetInt(src->data(), src->size(), offset, bits);
-}
-
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/coding_test.cc b/src/rocksdb/util/coding_test.cc
index ed542d6..e3c265b 100644
--- a/src/rocksdb/util/coding_test.cc
+++ b/src/rocksdb/util/coding_test.cc
@@ -196,101 +196,9 @@ TEST(Coding, Strings) {
   ASSERT_EQ("", input.ToString());
 }
 
-TEST(Coding, BitStream) {
-  const int kNumBytes = 10;
-  char bytes[kNumBytes+1];
-  for (int i = 0; i < kNumBytes + 1; ++i) {
-      bytes[i] = '\0';
-  }
-
-  // Simple byte aligned test.
-  for (int i = 0; i < kNumBytes; ++i) {
-    BitStreamPutInt(bytes, kNumBytes, i*8, 8, 255-i);
-
-    ASSERT_EQ((unsigned char)bytes[i], (unsigned char)(255-i));
-  }
-  for (int i = 0; i < kNumBytes; ++i) {
-    ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*8, 8), (uint32_t)(255-i));
-  }
-  ASSERT_EQ(bytes[kNumBytes], '\0');
-
-  // Write and read back at strange offsets
-  for (int i = 0; i < kNumBytes + 1; ++i) {
-      bytes[i] = '\0';
-  }
-  for (int i = 0; i < kNumBytes; ++i) {
-    BitStreamPutInt(bytes, kNumBytes, i*5+1, 4, (i * 7) % (1 << 4));
-  }
-  for (int i = 0; i < kNumBytes; ++i) {
-    ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*5+1, 4),
-              (uint32_t)((i * 7) % (1 << 4)));
-  }
-  ASSERT_EQ(bytes[kNumBytes], '\0');
-
-  // Create 11011011 as a bit pattern
-  for (int i = 0; i < kNumBytes + 1; ++i) {
-      bytes[i] = '\0';
-  }
-  for (int i = 0; i < kNumBytes; ++i) {
-    BitStreamPutInt(bytes, kNumBytes, i*8, 2, 3);
-    BitStreamPutInt(bytes, kNumBytes, i*8+3, 2, 3);
-    BitStreamPutInt(bytes, kNumBytes, i*8+6, 2, 3);
-
-    ASSERT_EQ((unsigned char)bytes[i],
-              (unsigned char)(3 + (3 << 3) + (3 << 6)));
-  }
-  ASSERT_EQ(bytes[kNumBytes], '\0');
-
-
-  // Test large values
-  for (int i = 0; i < kNumBytes + 1; ++i) {
-      bytes[i] = '\0';
-  }
-  BitStreamPutInt(bytes, kNumBytes, 0, 64, (uint64_t)(-1));
-  for (int i = 0; i < 64/8; ++i) {
-    ASSERT_EQ((unsigned char)bytes[i],
-              (unsigned char)(255));
-  }
-  ASSERT_EQ(bytes[64/8], '\0');
-
-
-}
-
-TEST(Coding, BitStreamConvenienceFuncs) {
-  std::string bytes(1, '\0');
-
-  // Check that independent changes to byte are preserved.
-  BitStreamPutInt(&bytes, 0, 2, 3);
-  BitStreamPutInt(&bytes, 3, 2, 3);
-  BitStreamPutInt(&bytes, 6, 2, 3);
-  ASSERT_EQ((unsigned char)bytes[0], (unsigned char)(3 + (3 << 3) + (3 << 6)));
-  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 2), 3u);
-  ASSERT_EQ(BitStreamGetInt(&bytes, 3, 2), 3u);
-  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 2), 3u);
-  Slice slice(bytes);
-  ASSERT_EQ(BitStreamGetInt(&slice, 0, 2), 3u);
-  ASSERT_EQ(BitStreamGetInt(&slice, 3, 2), 3u);
-  ASSERT_EQ(BitStreamGetInt(&slice, 6, 2), 3u);
-
-  // Test overlapping crossing over byte boundaries
-  bytes = std::string(2, '\0');
-  BitStreamPutInt(&bytes, 6, 4, 15);
-  ASSERT_EQ((unsigned char)bytes[0], 3 << 6);
-  ASSERT_EQ((unsigned char)bytes[1], 3);
-  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 4), 15u);
-  slice = Slice(bytes);
-  ASSERT_EQ(BitStreamGetInt(&slice, 6, 4), 15u);
-
-  // Test 64-bit number
-  bytes = std::string(64/8, '\0');
-  BitStreamPutInt(&bytes, 0, 64, (uint64_t)(-1));
-  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 64), (uint64_t)(-1));
-  slice = Slice(bytes);
-  ASSERT_EQ(BitStreamGetInt(&slice, 0, 64), (uint64_t)(-1));
-}
-
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/comparator.cc b/src/rocksdb/util/comparator.cc
index adeacac..e606395 100644
--- a/src/rocksdb/util/comparator.cc
+++ b/src/rocksdb/util/comparator.cc
@@ -23,17 +23,16 @@ class BytewiseComparatorImpl : public Comparator {
  public:
   BytewiseComparatorImpl() { }
 
-  virtual const char* Name() const {
+  virtual const char* Name() const override {
     return "leveldb.BytewiseComparator";
   }
 
-  virtual int Compare(const Slice& a, const Slice& b) const {
+  virtual int Compare(const Slice& a, const Slice& b) const override {
     return a.compare(b);
   }
 
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const {
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {
     // Find length of common prefix
     size_t min_length = std::min(start->size(), limit.size());
     size_t diff_index = 0;
@@ -55,7 +54,7 @@ class BytewiseComparatorImpl : public Comparator {
     }
   }
 
-  virtual void FindShortSuccessor(std::string* key) const {
+  virtual void FindShortSuccessor(std::string* key) const override {
     // Find first character that can be incremented
     size_t n = key->size();
     for (size_t i = 0; i < n; i++) {
@@ -69,13 +68,29 @@ class BytewiseComparatorImpl : public Comparator {
     // *key is a run of 0xffs.  Leave it alone.
   }
 };
-}  // namespace
+
+class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
+ public:
+  ReverseBytewiseComparatorImpl() { }
+
+  virtual const char* Name() const override {
+    return "rocksdb.ReverseBytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    return -a.compare(b);
+  }
+};
+
+}// namespace
 
 static port::OnceType once = LEVELDB_ONCE_INIT;
 static const Comparator* bytewise;
+static const Comparator* rbytewise;
 
 static void InitModule() {
   bytewise = new BytewiseComparatorImpl;
+  rbytewise= new ReverseBytewiseComparatorImpl;
 }
 
 const Comparator* BytewiseComparator() {
@@ -83,4 +98,9 @@ const Comparator* BytewiseComparator() {
   return bytewise;
 }
 
+const Comparator* ReverseBytewiseComparator() {
+  port::InitOnce(&once, InitModule);
+  return rbytewise;
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/compression.h b/src/rocksdb/util/compression.h
new file mode 100644
index 0000000..36e36d5
--- /dev/null
+++ b/src/rocksdb/util/compression.h
@@ -0,0 +1,553 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <algorithm>
+#include <limits>
+
+#include "rocksdb/options.h"
+#include "util/coding.h"
+
+#ifdef SNAPPY
+#include <snappy.h>
+#endif
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef BZIP2
+#include <bzlib.h>
+#endif
+
+#if defined(LZ4)
+#include <lz4.h>
+#include <lz4hc.h>
+#endif
+
+namespace rocksdb {
+
+inline bool Snappy_Supported() {
+#ifdef SNAPPY
+  return true;
+#endif
+  return false;
+}
+
+inline bool Zlib_Supported() {
+#ifdef ZLIB
+  return true;
+#endif
+  return false;
+}
+
+inline bool BZip2_Supported() {
+#ifdef BZIP2
+  return true;
+#endif
+  return false;
+}
+
+inline bool LZ4_Supported() {
+#ifdef LZ4
+  return true;
+#endif
+  return false;
+}
+
+// compress_format_version can have two values:
+// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
+// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
+// way.
+// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
+// start of compressed block. Snappy format is the same as version 1.
+
+inline bool Snappy_Compress(const CompressionOptions& opts, const char* input,
+                            size_t length, ::std::string* output) {
+#ifdef SNAPPY
+  output->resize(snappy::MaxCompressedLength(length));
+  size_t outlen;
+  snappy::RawCompress(input, length, &(*output)[0], &outlen);
+  output->resize(outlen);
+  return true;
+#endif
+
+  return false;
+}
+
+inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result) {
+#ifdef SNAPPY
+  return snappy::GetUncompressedLength(input, length, result);
+#else
+  return false;
+#endif
+}
+
+inline bool Snappy_Uncompress(const char* input, size_t length,
+                              char* output) {
+#ifdef SNAPPY
+  return snappy::RawUncompress(input, length, output);
+#else
+  return false;
+#endif
+}
+
+namespace compression {
+// returns size
+inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) {
+  PutVarint32(output, length);
+  return output->size();
+}
+
+inline bool GetDecompressedSizeInfo(const char** input_data,
+                                    size_t* input_length,
+                                    uint32_t* output_len) {
+  auto new_input_data =
+      GetVarint32Ptr(*input_data, *input_data + *input_length, output_len);
+  if (new_input_data == nullptr) {
+    return false;
+  }
+  *input_length -= (new_input_data - *input_data);
+  *input_data = new_input_data;
+  return true;
+}
+}  // namespace compression
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline bool Zlib_Compress(const CompressionOptions& opts,
+                          uint32_t compress_format_version,
+                          const char* input, size_t length,
+                          ::std::string* output) {
+#ifdef ZLIB
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  }
+  // Resize output to be the plain data length.
+  // This may not be big enough if the compression actually expands data.
+  output->resize(output_header_len + length);
+
+  // The memLevel parameter specifies how much memory should be allocated for
+  // the internal compression state.
+  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
+  // memLevel=9 uses maximum memory for optimal speed.
+  // The default value is 8. See zconf.h for more details.
+  static const int memLevel = 8;
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+  int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits,
+                        memLevel, opts.strategy);
+  if (st != Z_OK) {
+    return false;
+  }
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (Bytef *)input;
+  _stream.avail_in = static_cast<unsigned int>(length);
+
+  // Initialize the output size.
+  _stream.avail_out = static_cast<unsigned int>(length);
+  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);
+
+  bool done = false;
+  while (!done) {
+    st = deflate(&_stream, Z_FINISH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK:
+        // No output space. This means the compression is bigger than
+        // decompressed size. Just fail the compression in that case.
+        // Intentional fallback (to failure case)
+      case Z_BUF_ERROR:
+      default:
+        deflateEnd(&_stream);
+        return false;
+    }
+  }
+
+  output->resize(output->size() - _stream.avail_out + output_header_len);
+  deflateEnd(&_stream);
+  return true;
+#endif
+  return false;
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline char* Zlib_Uncompress(const char* input_data, size_t input_length,
+                             int* decompress_size,
+                             uint32_t compress_format_version,
+                             int windowBits = -14) {
+#ifdef ZLIB
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data size will 5x of compressed size, but round
+    // to the page size
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st = inflateInit2(&_stream,
+      windowBits > 0 ? windowBits + 32 : windowBits);
+  if (st != Z_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (Bytef *)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  char* output = new char[output_len];
+
+  _stream.next_out = (Bytef *)output;
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = inflate(&_stream, Z_SYNC_FLUSH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        size_t old_sz = output_len;
+        size_t output_len_delta = static_cast<size_t>(output_len * 0.2);
+        output_len += output_len_delta < 10 ? 10 : output_len_delta;
+        char* tmp = new char[output_len];
+        memcpy(tmp, output, old_sz);
+        delete[] output;
+        output = tmp;
+
+        // Set more output.
+        _stream.next_out = (Bytef *)(output + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      case Z_BUF_ERROR:
+      default:
+        delete[] output;
+        inflateEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  *decompress_size = static_cast<int>(output_len - _stream.avail_out);
+  inflateEnd(&_stream);
+  return output;
+#endif
+
+  return nullptr;
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline bool BZip2_Compress(const CompressionOptions& opts,
+                           uint32_t compress_format_version,
+                           const char* input, size_t length,
+                           ::std::string* output) {
+#ifdef BZIP2
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  }
+  // Resize output to be the plain data length.
+  // This may not be big enough if the compression actually expands data.
+  output->resize(output_header_len + length);
+
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  // Block size 1 is 100K.
+  // 0 is for silent.
+  // 30 is the default workFactor
+  int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
+  if (st != BZ_OK) {
+    return false;
+  }
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (char *)input;
+  _stream.avail_in = static_cast<unsigned int>(length);
+
+  // Initialize the output size.
+  _stream.avail_out = static_cast<unsigned int>(length);
+  _stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]);
+
+  while (_stream.next_in != nullptr && _stream.avail_in != 0) {
+    st = BZ2_bzCompress(&_stream, BZ_FINISH);
+    switch (st) {
+      case BZ_STREAM_END:
+        break;
+      case BZ_FINISH_OK:
+        // No output space. This means the compressed output is bigger than
+        // the uncompressed input. Just fail the compression in that case.
+        // Intentional fallthrough (to the failure case)
+      case BZ_SEQUENCE_ERROR:
+      default:
+        BZ2_bzCompressEnd(&_stream);
+        return false;
+    }
+  }
+
+  output->resize(output->size() - _stream.avail_out + output_header_len);
+  BZ2_bzCompressEnd(&_stream);
+  return true;
+#endif
+  return false;
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline char* BZip2_Uncompress(const char* input_data, size_t input_length,
+                              int* decompress_size,
+                              uint32_t compress_format_version) {
+#ifdef BZIP2
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data size will be 5x the compressed size, but
+    // round up to the next page size
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char *)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  char* output = new char[output_len];
+
+  _stream.next_out = (char *)output;
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        done = true;
+        break;
+      case BZ_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        uint32_t old_sz = output_len;
+        output_len = output_len * 1.2;
+        char* tmp = new char[output_len];
+        memcpy(tmp, output, old_sz);
+        delete[] output;
+        output = tmp;
+
+        // Set more output.
+        _stream.next_out = (char *)(output + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      default:
+        delete[] output;
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  *decompress_size = static_cast<int>(output_len - _stream.avail_out);
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#endif
+  return nullptr;
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline bool LZ4_Compress(const CompressionOptions& opts,
+                         uint32_t compress_format_version, const char* input,
+                         size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+
+  int compressBound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compressBound));
+  int outlen =
+      LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
+                                 static_cast<int>(length), compressBound);
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(static_cast<size_t>(output_header_len + outlen));
+  return true;
+#endif
+  return false;
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline char* LZ4_Uncompress(const char* input_data, size_t input_length,
+                            int* decompress_size,
+                            uint32_t compress_format_version) {
+#ifdef LZ4
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    if (input_length < 8) {
+      return nullptr;
+    }
+    memcpy(&output_len, input_data, sizeof(output_len));
+    input_length -= 8;
+    input_data += 8;
+  }
+  char* output = new char[output_len];
+  *decompress_size =
+      LZ4_decompress_safe(input_data, output, static_cast<int>(input_length),
+                          static_cast<int>(output_len));
+  if (*decompress_size < 0) {
+    delete[] output;
+    return nullptr;
+  }
+  assert(*decompress_size == static_cast<int>(output_len));
+  return output;
+#endif
+  return nullptr;
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline bool LZ4HC_Compress(const CompressionOptions& opts,
+                           uint32_t compress_format_version, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+
+  int compressBound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compressBound));
+  int outlen;
+#ifdef LZ4_VERSION_MAJOR  // they only started defining this since r113
+  outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len],
+                                         static_cast<int>(length),
+                                         compressBound, opts.level);
+#else
+  outlen =
+      LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len],
+                                   static_cast<int>(length), compressBound);
+#endif
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(static_cast<size_t>(output_header_len + outlen));
+  return true;
+#endif
+  return false;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/crc32c.cc b/src/rocksdb/util/crc32c.cc
index d27fb4b..8f1a09e 100644
--- a/src/rocksdb/util/crc32c.cc
+++ b/src/rocksdb/util/crc32c.cc
@@ -298,14 +298,14 @@ static inline uint64_t LE_LOAD64(const uint8_t *p) {
 #endif
 
 static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
-  uint32_t c = *l ^ LE_LOAD32(*p);
+  uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
   *p += 4;
   *l = table3_[c & 0xff] ^
   table2_[(c >> 8) & 0xff] ^
   table1_[(c >> 16) & 0xff] ^
   table0_[c >> 24];
   // DO it twice.
-  c = *l ^ LE_LOAD32(*p);
+  c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
   *p += 4;
   *l = table3_[c & 0xff] ^
   table2_[(c >> 8) & 0xff] ^
@@ -362,7 +362,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
   }
 #undef STEP1
 #undef ALIGN
-  return l ^ 0xffffffffu;
+  return static_cast<uint32_t>(l ^ 0xffffffffu);
 }
 
 // Detect if SS42 or not.
diff --git a/src/rocksdb/util/crc32c_test.cc b/src/rocksdb/util/crc32c_test.cc
index 300c9d3..413302a 100644
--- a/src/rocksdb/util/crc32c_test.cc
+++ b/src/rocksdb/util/crc32c_test.cc
@@ -73,5 +73,6 @@ TEST(CRC, Mask) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/db_info_dumper.cc b/src/rocksdb/util/db_info_dumper.cc
new file mode 100644
index 0000000..9c70928
--- /dev/null
+++ b/src/rocksdb/util/db_info_dumper.cc
@@ -0,0 +1,130 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string>
+#include <algorithm>
+#include <vector>
+
+#include "db/filename.h"
+#include "rocksdb/options.h"
+#include "rocksdb/env.h"
+#include "util/db_info_dumper.h"
+
+namespace rocksdb {
+
+void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) {
+  if (options.info_log == nullptr) {
+    return;
+  }
+
+  auto* env = options.env;
+  uint64_t number = 0;
+  FileType type = kInfoLogFile;
+
+  std::vector<std::string> files;
+  uint64_t file_num = 0;
+  uint64_t file_size;
+  std::string file_info, wal_info;
+
+  Log(InfoLogLevel::INFO_LEVEL, options.info_log, "DB SUMMARY\n");
+  // Get files in dbname dir
+  if (!env->GetChildren(dbname, &files).ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL,
+        options.info_log, "Error when reading %s dir\n", dbname.c_str());
+  }
+  std::sort(files.begin(), files.end());
+  for (std::string file : files) {
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+    switch (type) {
+      case kCurrentFile:
+        Log(InfoLogLevel::INFO_LEVEL, options.info_log,
+            "CURRENT file:  %s\n", file.c_str());
+        break;
+      case kIdentityFile:
+        Log(InfoLogLevel::INFO_LEVEL, options.info_log,
+            "IDENTITY file:  %s\n", file.c_str());
+        break;
+      case kDescriptorFile:
+        env->GetFileSize(dbname + "/" + file, &file_size);
+        Log(InfoLogLevel::INFO_LEVEL, options.info_log,
+            "MANIFEST file:  %s size: %" PRIu64 " Bytes\n",
+            file.c_str(), file_size);
+        break;
+      case kLogFile:
+        env->GetFileSize(dbname + "/" + file, &file_size);
+        char str[8];
+        snprintf(str, sizeof(str), "%" PRIu64, file_size);
+        wal_info.append(file).append(" size: ").
+            append(str, sizeof(str)).append(" ;");
+        break;
+      case kTableFile:
+        if (++file_num < 10) {
+          file_info.append(file).append(" ");
+        }
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Get sst files in db_path dir
+  for (auto& db_path : options.db_paths) {
+    if (dbname.compare(db_path.path) != 0) {
+      if (!env->GetChildren(db_path.path, &files).ok()) {
+        Log(InfoLogLevel::ERROR_LEVEL, options.info_log,
+            "Error when reading %s dir\n",
+            db_path.path.c_str());
+        continue;
+      }
+      std::sort(files.begin(), files.end());
+      for (std::string file : files) {
+        if (ParseFileName(file, &number, &type)) {
+          if (type == kTableFile && ++file_num < 10) {
+            file_info.append(file).append(" ");
+          }
+        }
+      }
+    }
+    Log(InfoLogLevel::INFO_LEVEL, options.info_log,
+        "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+        db_path.path.c_str(), file_num, file_info.c_str());
+    file_num = 0;
+    file_info.clear();
+  }
+
+  // Get wal file in wal_dir
+  if (dbname.compare(options.wal_dir) != 0) {
+    if (!env->GetChildren(options.wal_dir, &files).ok()) {
+      Log(InfoLogLevel::ERROR_LEVEL, options.info_log,
+          "Error when reading %s dir\n",
+          options.wal_dir.c_str());
+      return;
+    }
+    wal_info.clear();
+    for (std::string file : files) {
+      if (ParseFileName(file, &number, &type)) {
+        if (type == kLogFile) {
+          env->GetFileSize(options.wal_dir + "/" + file, &file_size);
+          char str[8];
+          snprintf(str, sizeof(str), "%" PRIu64, file_size);
+          wal_info.append(file).append(" size: ").
+              append(str, sizeof(str)).append(" ;");
+        }
+      }
+    }
+  }
+  Log(InfoLogLevel::INFO_LEVEL, options.info_log,
+      "Write Ahead Log file in %s: %s\n",
+      options.wal_dir.c_str(), wal_info.c_str());
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/db_info_dumper.h b/src/rocksdb/util/db_info_dumper.h
new file mode 100644
index 0000000..ed0a63d
--- /dev/null
+++ b/src/rocksdb/util/db_info_dumper.h
@@ -0,0 +1,13 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <string>
+
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+void DumpDBFileSummary(const DBOptions& options, const std::string& dbname);
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/dynamic_bloom.cc b/src/rocksdb/util/dynamic_bloom.cc
index bc48b9f..ffe8157 100644
--- a/src/rocksdb/util/dynamic_bloom.cc
+++ b/src/rocksdb/util/dynamic_bloom.cc
@@ -9,42 +9,72 @@
 
 #include "port/port.h"
 #include "rocksdb/slice.h"
+#include "util/allocator.h"
 #include "util/hash.h"
 
 namespace rocksdb {
 
 namespace {
-static uint32_t BloomHash(const Slice& key) {
-  return Hash(key.data(), key.size(), 0xbc9f1d34);
+
+uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
+  uint32_t num_blocks =
+      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+  // Make num_blocks an odd number to make sure more bits are involved
+  // when determining which block.
+  if (num_blocks % 2 == 0) {
+    num_blocks++;
+  }
+
+  return num_blocks * (CACHE_LINE_SIZE * 8);
 }
 }
 
-DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
-                           uint32_t num_probes,
+DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
+                           uint32_t locality, uint32_t num_probes,
                            uint32_t (*hash_func)(const Slice& key),
-                           size_t huge_page_tlb_size)
-    : kBlocked(cl_per_block > 0),
-      kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
-      kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
-                                 kBitsPerBlock
-                           : total_bits + 7) /
-                 8 * 8),
-      kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
+                           size_t huge_page_tlb_size,
+                           Logger* logger)
+    : DynamicBloom(num_probes, hash_func) {
+  SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger);
+}
+
+DynamicBloom::DynamicBloom(uint32_t num_probes,
+                           uint32_t (*hash_func)(const Slice& key))
+    : kTotalBits(0),
+      kNumBlocks(0),
       kNumProbes(num_probes),
-      hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
-  assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
+      hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {}
+
+void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
+                              uint32_t num_blocks) {
+  data_ = raw_data;
+  kTotalBits = total_bits;
+  kNumBlocks = num_blocks;
+}
+
+void DynamicBloom::SetTotalBits(Allocator* allocator,
+                                uint32_t total_bits, uint32_t locality,
+                                size_t huge_page_tlb_size,
+                                Logger* logger) {
+  kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
+                              : (total_bits + 7) / 8 * 8;
+  kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+
+  assert(kNumBlocks > 0 || kTotalBits > 0);
   assert(kNumProbes > 0);
 
   uint32_t sz = kTotalBits / 8;
-  if (kBlocked) {
+  if (kNumBlocks > 0) {
     sz += CACHE_LINE_SIZE - 1;
   }
+  assert(allocator);
   raw_ = reinterpret_cast<unsigned char*>(
-      arena_.AllocateAligned(sz, huge_page_tlb_size));
+      allocator->AllocateAligned(sz, huge_page_tlb_size, logger));
   memset(raw_, 0, sz);
-  if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
+  if (kNumBlocks > 0 && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
     data_ = raw_ + CACHE_LINE_SIZE -
-      reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
+            reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
   } else {
     data_ = raw_;
   }
diff --git a/src/rocksdb/util/dynamic_bloom.h b/src/rocksdb/util/dynamic_bloom.h
index f91bb8f..a6e4d73 100644
--- a/src/rocksdb/util/dynamic_bloom.h
+++ b/src/rocksdb/util/dynamic_bloom.h
@@ -5,31 +5,46 @@
 
 #pragma once
 
+#include <string>
+
+#include "rocksdb/slice.h"
+
+#include "port/port_posix.h"
+
 #include <atomic>
 #include <memory>
 
-#include <util/arena.h>
-
 namespace rocksdb {
 
 class Slice;
+class Allocator;
+class Logger;
 
 class DynamicBloom {
  public:
+  // allocator: pass an allocator to the bloom filter so its memory usage is tracked
   // total_bits: fixed total bits for the bloom
   // num_probes: number of hash probes for a single key
-  // cl_per_block: block size in cache lines. When this is non-zero, a
-  //               query/set is done within a block to improve cache locality.
+  // locality:  If positive, optimize for cache line locality, 0 otherwise.
   // hash_func:  customized hash function
   // huge_page_tlb_size:  if >0, try to allocate bloom bytes from huge page TLB
   //                      withi this page size. Need to reserve huge pages for
   //                      it to be allocated, like:
   //                         sysctl -w vm.nr_hugepages=20
   //                     See linux doc Documentation/vm/hugetlbpage.txt
-  explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
+  explicit DynamicBloom(Allocator* allocator,
+                        uint32_t total_bits, uint32_t locality = 0,
                         uint32_t num_probes = 6,
                         uint32_t (*hash_func)(const Slice& key) = nullptr,
-                        size_t huge_page_tlb_size = 0);
+                        size_t huge_page_tlb_size = 0,
+                        Logger* logger = nullptr);
+
+  explicit DynamicBloom(uint32_t num_probes = 6,
+                        uint32_t (*hash_func)(const Slice& key) = nullptr);
+
+  void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                    uint32_t locality, size_t huge_page_tlb_size,
+                    Logger* logger);
 
   ~DynamicBloom() {}
 
@@ -40,40 +55,64 @@ class DynamicBloom {
   void AddHash(uint32_t hash);
 
   // Multithreaded access to this function is OK
-  bool MayContain(const Slice& key);
+  bool MayContain(const Slice& key) const;
 
   // Multithreaded access to this function is OK
-  bool MayContainHash(uint32_t hash);
+  bool MayContainHash(uint32_t hash) const;
+
+  void Prefetch(uint32_t h);
+
+  uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+  Slice GetRawData() const {
+    return Slice(reinterpret_cast<char*>(data_), GetTotalBits() / 8);
+  }
+
+  void SetRawData(unsigned char* raw_data, uint32_t total_bits,
+                  uint32_t num_blocks = 0);
+
+  uint32_t GetTotalBits() const { return kTotalBits; }
+
+  bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
 
  private:
-  const bool kBlocked;
-  const uint32_t kBitsPerBlock;
-  const uint32_t kTotalBits;
-  const uint32_t kNumBlocks;
+  uint32_t kTotalBits;
+  uint32_t kNumBlocks;
   const uint32_t kNumProbes;
 
   uint32_t (*hash_func_)(const Slice& key);
   unsigned char* data_;
   unsigned char* raw_;
-
-  Arena arena_;
 };
 
 inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
 
-inline bool DynamicBloom::MayContain(const Slice& key) {
+inline bool DynamicBloom::MayContain(const Slice& key) const {
   return (MayContainHash(hash_func_(key)));
 }
 
-inline bool DynamicBloom::MayContainHash(uint32_t h) {
+inline void DynamicBloom::Prefetch(uint32_t h) {
+  if (kNumBlocks != 0) {
+    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
+    PREFETCH(&(data_[b]), 0, 3);
+  }
+}
+
+inline bool DynamicBloom::MayContainHash(uint32_t h) const {
+  assert(IsInitialized());
   const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
-  if (kBlocked) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
+  if (kNumBlocks != 0) {
+    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
     for (uint32_t i = 0; i < kNumProbes; ++i) {
-      const uint32_t bitpos = b + h % kBitsPerBlock;
+      // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+      // to a simple AND operation by the compiler.
+      const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
       if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
         return false;
       }
+      // Rotate h so that we don't reuse the same bytes.
+      h = h / (CACHE_LINE_SIZE * 8) +
+          (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
       h += delta;
     }
   } else {
@@ -89,12 +128,18 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) {
 }
 
 inline void DynamicBloom::AddHash(uint32_t h) {
+  assert(IsInitialized());
   const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
-  if (kBlocked) {
-    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
+  if (kNumBlocks != 0) {
+    uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
     for (uint32_t i = 0; i < kNumProbes; ++i) {
-      const uint32_t bitpos = b + h % kBitsPerBlock;
+      // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+      // to a simple AND operation by the compiler.
+      const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
       data_[bitpos / 8] |= (1 << (bitpos % 8));
+      // Rotate h so that we don't reuse the same bytes.
+      h = h / (CACHE_LINE_SIZE * 8) +
+          (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
       h += delta;
     }
   } else {
diff --git a/src/rocksdb/util/dynamic_bloom_test.cc b/src/rocksdb/util/dynamic_bloom_test.cc
index 4a34d50..fb10d09 100644
--- a/src/rocksdb/util/dynamic_bloom_test.cc
+++ b/src/rocksdb/util/dynamic_bloom_test.cc
@@ -3,18 +3,32 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <algorithm>
 #include <gflags/gflags.h>
 
 #include "dynamic_bloom.h"
 #include "port/port.h"
+#include "util/arena.h"
 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 #include "util/stop_watch.h"
 
+using GFLAGS::ParseCommandLineFlags;
+
 DEFINE_int32(bits_per_key, 10, "");
 DEFINE_int32(num_probes, 6, "");
 DEFINE_bool(enable_perf, false, "");
@@ -26,21 +40,22 @@ static Slice Key(uint64_t i, char* buffer) {
   return Slice(buffer, sizeof(i));
 }
 
-class DynamicBloomTest {
-};
+class DynamicBloomTest : public testing::Test {};
 
-TEST(DynamicBloomTest, EmptyFilter) {
-  DynamicBloom bloom1(100, 0, 2);
+TEST_F(DynamicBloomTest, EmptyFilter) {
+  Arena arena;
+  DynamicBloom bloom1(&arena, 100, 0, 2);
   ASSERT_TRUE(!bloom1.MayContain("hello"));
   ASSERT_TRUE(!bloom1.MayContain("world"));
 
-  DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
   ASSERT_TRUE(!bloom2.MayContain("hello"));
   ASSERT_TRUE(!bloom2.MayContain("world"));
 }
 
-TEST(DynamicBloomTest, Small) {
-  DynamicBloom bloom1(100, 0, 2);
+TEST_F(DynamicBloomTest, Small) {
+  Arena arena;
+  DynamicBloom bloom1(&arena, 100, 0, 2);
   bloom1.Add("hello");
   bloom1.Add("world");
   ASSERT_TRUE(bloom1.MayContain("hello"));
@@ -48,7 +63,7 @@ TEST(DynamicBloomTest, Small) {
   ASSERT_TRUE(!bloom1.MayContain("x"));
   ASSERT_TRUE(!bloom1.MayContain("foo"));
 
-  DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
   bloom2.Add("hello");
   bloom2.Add("world");
   ASSERT_TRUE(bloom2.MayContain("hello"));
@@ -70,7 +85,7 @@ static uint32_t NextNum(uint32_t num) {
   return num;
 }
 
-TEST(DynamicBloomTest, VaryingLengths) {
+TEST_F(DynamicBloomTest, VaryingLengths) {
   char buffer[sizeof(uint64_t)];
 
   // Count number of filters that significantly exceed the false positive rate
@@ -81,17 +96,17 @@ TEST(DynamicBloomTest, VaryingLengths) {
   fprintf(stderr, "bits_per_key: %d  num_probes: %d\n",
           FLAGS_bits_per_key, num_probes);
 
-  for (uint32_t cl_per_block = 0; cl_per_block < num_probes;
-      ++cl_per_block) {
+  for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) {
     for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
       uint32_t bloom_bits = 0;
-      if (cl_per_block == 0) {
+      Arena arena;
+      if (enable_locality == 0) {
         bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
       } else {
         bloom_bits = std::max(num * FLAGS_bits_per_key,
-            cl_per_block * CACHE_LINE_SIZE * 8);
+                              enable_locality * CACHE_LINE_SIZE * 8);
       }
-      DynamicBloom bloom(bloom_bits, cl_per_block, num_probes);
+      DynamicBloom bloom(&arena, bloom_bits, enable_locality, num_probes);
       for (uint64_t i = 0; i < num; i++) {
         bloom.Add(Key(i, buffer));
         ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
@@ -113,8 +128,10 @@ TEST(DynamicBloomTest, VaryingLengths) {
       }
       double rate = result / 10000.0;
 
-      fprintf(stderr, "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
-              "cl per block = %u\n", rate*100.0, num, bloom_bits, cl_per_block);
+      fprintf(stderr,
+              "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
+              "enable locality?%u\n",
+              rate * 100.0, num, bloom_bits, enable_locality);
 
       if (rate > 0.0125)
         mediocre_filters++;  // Allowed, but not too often
@@ -128,7 +145,7 @@ TEST(DynamicBloomTest, VaryingLengths) {
   }
 }
 
-TEST(DynamicBloomTest, perf) {
+TEST_F(DynamicBloomTest, perf) {
   StopWatchNano timer(Env::Default());
   uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
 
@@ -136,14 +153,15 @@ TEST(DynamicBloomTest, perf) {
     return;
   }
 
-  for (uint64_t m = 1; m <= 8; ++m) {
-    const uint64_t num_keys = m * 8 * 1024 * 1024;
-    fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8);
+  for (uint32_t m = 1; m <= 8; ++m) {
+    Arena arena;
+    const uint32_t num_keys = m * 8 * 1024 * 1024;
+    fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
 
-    DynamicBloom std_bloom(num_keys * 10, 0, num_probes);
+    DynamicBloom std_bloom(&arena, num_keys * 10, 0, num_probes);
 
     timer.Start();
-    for (uint64_t i = 1; i <= num_keys; ++i) {
+    for (uint32_t i = 1; i <= num_keys; ++i) {
       std_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
     }
 
@@ -151,52 +169,55 @@ TEST(DynamicBloomTest, perf) {
     fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
             elapsed / num_keys);
 
-    uint64_t count = 0;
+    uint32_t count = 0;
     timer.Start();
-    for (uint64_t i = 1; i <= num_keys; ++i) {
+    for (uint32_t i = 1; i <= num_keys; ++i) {
       if (std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8))) {
         ++count;
       }
     }
+    ASSERT_EQ(count, num_keys);
     elapsed = timer.ElapsedNanos();
     fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
             elapsed / count);
-    ASSERT_TRUE(count == num_keys);
 
-    for (uint32_t cl_per_block = 1; cl_per_block <= num_probes;
-        ++cl_per_block) {
-      DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes);
+    // Locality enabled version
+    DynamicBloom blocked_bloom(&arena, num_keys * 10, 1, num_probes);
 
       timer.Start();
-      for (uint64_t i = 1; i <= num_keys; ++i) {
+      for (uint32_t i = 1; i <= num_keys; ++i) {
         blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
       }
 
-      uint64_t elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n",
-              cl_per_block, elapsed / num_keys);
+      elapsed = timer.ElapsedNanos();
+      fprintf(stderr,
+              "blocked bloom(enable locality), avg add latency %" PRIu64 "\n",
+              elapsed / num_keys);
 
-      uint64_t count = 0;
+      count = 0;
       timer.Start();
-      for (uint64_t i = 1; i <= num_keys; ++i) {
+      for (uint32_t i = 1; i <= num_keys; ++i) {
         if (blocked_bloom.MayContain(
-              Slice(reinterpret_cast<const char*>(&i), 8))) {
+                Slice(reinterpret_cast<const char*>(&i), 8))) {
           ++count;
         }
       }
 
       elapsed = timer.ElapsedNanos();
-      fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n",
-              cl_per_block, elapsed / count);
+      fprintf(stderr,
+              "blocked bloom(enable locality), avg query latency %" PRIu64 "\n",
+              elapsed / count);
       ASSERT_TRUE(count == num_keys);
     }
-  }
 }
 
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
 
-  return rocksdb::test::RunAllTests();
+  return RUN_ALL_TESTS();
 }
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/util/env.cc b/src/rocksdb/util/env.cc
index 1c0cae4..0695b55 100644
--- a/src/rocksdb/util/env.cc
+++ b/src/rocksdb/util/env.cc
@@ -41,7 +41,7 @@ void LogFlush(Logger *info_log) {
 }
 
 void Log(Logger* info_log, const char* format, ...) {
-  if (info_log) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) {
     va_list ap;
     va_start(ap, format);
     info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
@@ -51,7 +51,7 @@ void Log(Logger* info_log, const char* format, ...) {
 
 void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
          ...) {
-  if (info_log) {
+  if (info_log && info_log->GetInfoLogLevel() <= log_level) {
     va_list ap;
     va_start(ap, format);
     info_log->Logv(log_level, format, ap);
@@ -59,17 +59,26 @@ void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
   }
 }
 
-void Debug(Logger* info_log, const char* format, ...) {
+void Header(Logger* info_log, const char* format, ...) {
   if (info_log) {
     va_list ap;
     va_start(ap, format);
+    info_log->LogHeader(format, ap);
+    va_end(ap);
+  }
+}
+
+void Debug(Logger* info_log, const char* format, ...) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) {
+    va_list ap;
+    va_start(ap, format);
     info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap);
     va_end(ap);
   }
 }
 
 void Info(Logger* info_log, const char* format, ...) {
-  if (info_log) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) {
     va_list ap;
     va_start(ap, format);
     info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
@@ -78,7 +87,7 @@ void Info(Logger* info_log, const char* format, ...) {
 }
 
 void Warn(Logger* info_log, const char* format, ...) {
-  if (info_log) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) {
     va_list ap;
     va_start(ap, format);
     info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap);
@@ -86,7 +95,7 @@ void Warn(Logger* info_log, const char* format, ...) {
   }
 }
 void Error(Logger* info_log, const char* format, ...) {
-  if (info_log) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) {
     va_list ap;
     va_start(ap, format);
     info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap);
@@ -94,7 +103,7 @@ void Error(Logger* info_log, const char* format, ...) {
   }
 }
 void Fatal(Logger* info_log, const char* format, ...) {
-  if (info_log) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) {
     va_list ap;
     va_start(ap, format);
     info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap);
@@ -118,6 +127,15 @@ void Log(const InfoLogLevel log_level, const shared_ptr<Logger>& info_log,
   }
 }
 
+void Header(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->LogHeader(format, ap);
+    va_end(ap);
+  }
+}
+
 void Debug(const shared_ptr<Logger>& info_log, const char* format, ...) {
   if (info_log) {
     va_list ap;
@@ -226,12 +244,16 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
   env_options->use_mmap_writes = options.allow_mmap_writes;
   env_options->set_fd_cloexec = options.is_fd_close_on_exec;
   env_options->bytes_per_sync = options.bytes_per_sync;
+  env_options->rate_limiter = options.rate_limiter.get();
 }
 
 }
 
-EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const {
-  return env_options;
+EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options,
+                                    const DBOptions& db_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync;
+  return optimized_env_options;
 }
 
 EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
diff --git a/src/rocksdb/util/env_hdfs.cc b/src/rocksdb/util/env_hdfs.cc
index c724b23..298eb48 100644
--- a/src/rocksdb/util/env_hdfs.cc
+++ b/src/rocksdb/util/env_hdfs.cc
@@ -15,9 +15,12 @@
 #include <sstream>
 #include "rocksdb/env.h"
 #include "rocksdb/status.h"
-#include "hdfs/hdfs.h"
 #include "hdfs/env_hdfs.h"
 
+#define HDFS_EXISTS 0
+#define HDFS_DOESNT_EXIST -1
+#define HDFS_SUCCESS 0
+
 //
 // This file defines an HDFS environment for rocksdb. It uses the libhdfs
 // api to access HDFS. All HDFS files created by one instance of rocksdb
@@ -39,7 +42,8 @@ static Logger* mylog = nullptr;
 
 // Used for reading a file from HDFS. It implements both sequential-read
 // access methods as well as random read access methods.
-class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAccessFile {
+class HdfsReadableFile : virtual public SequentialFile,
+                         virtual public RandomAccessFile {
  private:
   hdfsFS fileSys_;
   std::string filename_;
@@ -48,18 +52,22 @@ class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAcce
  public:
   HdfsReadableFile(hdfsFS fileSys, const std::string& fname)
       : fileSys_(fileSys), filename_(fname), hfile_(nullptr) {
-    Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n",
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile opening file %s\n",
         filename_.c_str());
     hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0);
-    Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n",
-            filename_.c_str(), hfile_);
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n",
+        filename_.c_str(), hfile_);
   }
 
   virtual ~HdfsReadableFile() {
-    Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n",
-       filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile closing file %s\n",
+        filename_.c_str());
     hdfsCloseFile(fileSys_, hfile_);
-    Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n",
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile closed file %s\n",
         filename_.c_str());
     hfile_ = nullptr;
   }
@@ -71,19 +79,38 @@ class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAcce
   // sequential access, read data at current offset in file
   virtual Status Read(size_t n, Slice* result, char* scratch) {
     Status s;
-    Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n",
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile reading %s %ld\n",
         filename_.c_str(), n);
-    size_t bytes_read = hdfsRead(fileSys_, hfile_, scratch, (tSize)n);
-    Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str());
-    *result = Slice(scratch, bytes_read);
-    if (bytes_read < n) {
-      if (feof()) {
-        // We leave status as ok if we hit the end of the file
-      } else {
-        // A partial read with an error: return a non-ok status
-        s = IOError(filename_, errno);
+
+    char* buffer = scratch;
+    size_t total_bytes_read = 0;
+    tSize bytes_read = 0;
+    tSize remaining_bytes = (tSize)n;
+
+    // Read a total of n bytes repeatedly until we hit error or eof
+    while (remaining_bytes > 0) {
+      bytes_read = hdfsRead(fileSys_, hfile_, buffer, remaining_bytes);
+      if (bytes_read <= 0) {
+        break;
       }
+      assert(bytes_read <= remaining_bytes);
+
+      total_bytes_read += bytes_read;
+      remaining_bytes -= bytes_read;
+      buffer += bytes_read;
+    }
+    assert(total_bytes_read <= n);
+
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile read %s\n", filename_.c_str());
+
+    if (bytes_read < 0) {
+      s = IOError(filename_, errno);
+    } else {
+      *result = Slice(scratch, total_bytes_read);
     }
+
     return s;
   }
 
@@ -91,10 +118,12 @@ class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAcce
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
     Status s;
-    Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str());
     ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset,
                                    (void*)scratch, (tSize)n);
-    Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str());
     *result = Slice(scratch, (bytes_read < 0) ? 0 : bytes_read);
     if (bytes_read < 0) {
       // An error: return a non-ok status
@@ -104,7 +133,8 @@ class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAcce
   }
 
   virtual Status Skip(uint64_t n) {
-    Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str());
     // get current offset from file
     tOffset current = hdfsTell(fileSys_, hfile_);
     if (current < 0) {
@@ -123,7 +153,8 @@ class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAcce
 
   // returns true if we are at the end of file, false otherwise
   bool feof() {
-    Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str());
     if (hdfsTell(fileSys_, hfile_) == fileSize()) {
       return true;
     }
@@ -132,15 +163,15 @@ class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAcce
 
   // the current size of the file
   tOffset fileSize() {
-    Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str());
     hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str());
     tOffset size = 0L;
     if (pFileInfo != nullptr) {
       size = pFileInfo->mSize;
       hdfsFreeFileInfo(pFileInfo, 1);
     } else {
-      throw rocksdb::HdfsFatalException("fileSize on unknown file " +
-                                            filename_);
+      throw HdfsFatalException("fileSize on unknown file " + filename_);
     }
     return size;
   }
@@ -156,16 +187,20 @@ class HdfsWritableFile: public WritableFile {
  public:
   HdfsWritableFile(hdfsFS fileSys, const std::string& fname)
       : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) {
-    Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str());
     hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0);
-    Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str());
     assert(hfile_ != nullptr);
   }
   virtual ~HdfsWritableFile() {
     if (hfile_ != nullptr) {
-      Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
+      Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+          "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
       hdfsCloseFile(fileSys_, hfile_);
-      Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
+      Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+          "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
       hfile_ = nullptr;
     }
   }
@@ -182,11 +217,13 @@ class HdfsWritableFile: public WritableFile {
   }
 
   virtual Status Append(const Slice& data) {
-    Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str());
     const char* src = data.data();
     size_t left = data.size();
     size_t ret = hdfsWrite(fileSys_, hfile_, src, left);
-    Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str());
     if (ret != left) {
       return IOError(filename_, errno);
     }
@@ -199,14 +236,16 @@ class HdfsWritableFile: public WritableFile {
 
   virtual Status Sync() {
     Status s;
-    Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str());
     if (hdfsFlush(fileSys_, hfile_) == -1) {
       return IOError(filename_, errno);
     }
-    if (hdfsSync(fileSys_, hfile_) == -1) {
+    if (hdfsHSync(fileSys_, hfile_) == -1) {
       return IOError(filename_, errno);
     }
-    Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str());
     return Status::OK();
   }
 
@@ -219,11 +258,13 @@ class HdfsWritableFile: public WritableFile {
   }
 
   virtual Status Close() {
-    Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str());
     if (hdfsCloseFile(fileSys_, hfile_) != 0) {
       return IOError(filename_, errno);
     }
-    Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str());
     hfile_ = nullptr;
     return Status::OK();
   }
@@ -236,16 +277,17 @@ class HdfsLogger : public Logger {
   uint64_t (*gettid_)();  // Return the thread id for the current thread
 
  public:
-  HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)(),
-             const InfoLogLevel log_level = InfoLogLevel::ERROR)
-      : Logger(log_level), file_(f), gettid_(gettid) {
-    Log(mylog, "[hdfs] HdfsLogger opened %s\n",
-            file_->getName().c_str());
+  HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)())
+      : file_(f), gettid_(gettid) {
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsLogger opened %s\n",
+        file_->getName().c_str());
   }
 
   virtual ~HdfsLogger() {
-    Log(mylog, "[hdfs] HdfsLogger closed %s\n",
-            file_->getName().c_str());
+    Log(InfoLogLevel::DEBUG_LEVEL, mylog,
+        "[hdfs] HdfsLogger closed %s\n",
+        file_->getName().c_str());
     delete file_;
     if (mylog != nullptr && mylog == this) {
       mylog = nullptr;
@@ -324,40 +366,52 @@ class HdfsLogger : public Logger {
 
 // Finally, the hdfs environment
 
+const std::string HdfsEnv::kProto = "hdfs://";
+const std::string HdfsEnv::pathsep = "/";
+
 // open a file for sequential reading
 Status HdfsEnv::NewSequentialFile(const std::string& fname,
-                                 SequentialFile** result) {
+                                  unique_ptr<SequentialFile>* result,
+                                  const EnvOptions& options) {
+  result->reset();
   HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
-  if (f == nullptr) {
+  if (f == nullptr || !f->isValid()) {
+    delete f;
     *result = nullptr;
     return IOError(fname, errno);
   }
-  *result = dynamic_cast<SequentialFile*>(f);
+  result->reset(dynamic_cast<SequentialFile*>(f));
   return Status::OK();
 }
 
 // open a file for random reading
 Status HdfsEnv::NewRandomAccessFile(const std::string& fname,
-                                   RandomAccessFile** result) {
+                                    unique_ptr<RandomAccessFile>* result,
+                                    const EnvOptions& options) {
+  result->reset();
   HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
-  if (f == nullptr) {
+  if (f == nullptr || !f->isValid()) {
+    delete f;
     *result = nullptr;
     return IOError(fname, errno);
   }
-  *result = dynamic_cast<RandomAccessFile*>(f);
+  result->reset(dynamic_cast<RandomAccessFile*>(f));
   return Status::OK();
 }
 
 // create a new file for writing
 Status HdfsEnv::NewWritableFile(const std::string& fname,
-                               WritableFile** result) {
+                                unique_ptr<WritableFile>* result,
+                                const EnvOptions& options) {
+  result->reset();
   Status s;
   HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
   if (f == nullptr || !f->isValid()) {
+    delete f;
     *result = nullptr;
     return IOError(fname, errno);
   }
-  *result = dynamic_cast<WritableFile*>(f);
+  result->reset(dynamic_cast<WritableFile*>(f));
   return Status::OK();
 }
 
@@ -367,24 +421,55 @@ Status HdfsEnv::NewRandomRWFile(const std::string& fname,
   return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv");
 }
 
-virtual Status NewDirectory(const std::string& name,
-                            unique_ptr<Directory>* result) {
-  return Status::NotSupported("NewDirectory not yet supported on HdfsEnv");
+class HdfsDirectory : public Directory {
+ public:
+  explicit HdfsDirectory(int fd) : fd_(fd) {}
+  ~HdfsDirectory() {}
+
+  virtual Status Fsync() { return Status::OK(); }
+
+ private:
+  int fd_;
+};
+
+Status HdfsEnv::NewDirectory(const std::string& name,
+                             unique_ptr<Directory>* result) {
+  int value = hdfsExists(fileSys_, name.c_str());
+  switch (value) {
+    case HDFS_EXISTS:
+      result->reset(new HdfsDirectory(0));
+      return Status::OK();
+    default:  // fail if the directory doesn't exist
+      Log(InfoLogLevel::FATAL_LEVEL,
+          mylog, "NewDirectory hdfsExists call failed");
+      throw HdfsFatalException("hdfsExists call failed with error " +
+                               ToString(value) + " on path " + name +
+                               ".\n");
+  }
 }
 
 bool HdfsEnv::FileExists(const std::string& fname) {
+
   int value = hdfsExists(fileSys_, fname.c_str());
-  if (value == 0) {
+  switch (value) {
+    case HDFS_EXISTS:
     return true;
+    case HDFS_DOESNT_EXIST:
+      return false;
+    default:  // anything else should be an error
+      Log(InfoLogLevel::FATAL_LEVEL,
+          mylog, "FileExists hdfsExists call failed");
+      throw HdfsFatalException("hdfsExists call failed with error " +
+                               ToString(value) + " on path " + fname +
+                               ".\n");
   }
-  return false;
 }
 
 Status HdfsEnv::GetChildren(const std::string& path,
                             std::vector<std::string>* result) {
   int value = hdfsExists(fileSys_, path.c_str());
   switch (value) {
-  case 0: {
+    case HDFS_EXISTS: {  // directory exists
     int numEntries = 0;
     hdfsFileInfo* pHdfsFileInfo = 0;
     pHdfsFileInfo = hdfsListDirectory(fileSys_, path.c_str(), &numEntries);
@@ -401,22 +486,26 @@ Status HdfsEnv::GetChildren(const std::string& path,
       }
     } else {
       // numEntries < 0 indicates error
-      Log(mylog, "hdfsListDirectory call failed with error ");
-      throw HdfsFatalException("hdfsListDirectory call failed negative error.\n");
+      Log(InfoLogLevel::FATAL_LEVEL, mylog,
+          "hdfsListDirectory call failed with error ");
+      throw HdfsFatalException(
+          "hdfsListDirectory call failed negative error.\n");
     }
     break;
   }
-  case 1:           // directory does not exist, exit
+  case HDFS_DOESNT_EXIST:  // directory does not exist, exit
     break;
   default:          // anything else should be an error
-    Log(mylog, "hdfsListDirectory call failed with error ");
-    throw HdfsFatalException("hdfsListDirectory call failed with error.\n");
+    Log(InfoLogLevel::FATAL_LEVEL, mylog,
+        "GetChildren hdfsExists call failed");
+    throw HdfsFatalException("hdfsExists call failed with error " +
+                             ToString(value) + ".\n");
   }
   return Status::OK();
 }
 
 Status HdfsEnv::DeleteFile(const std::string& fname) {
-  if (hdfsDelete(fileSys_, fname.c_str()) == 0) {
+  if (hdfsDelete(fileSys_, fname.c_str(), 1) == 0) {
     return Status::OK();
   }
   return IOError(fname, errno);
@@ -432,10 +521,16 @@ Status HdfsEnv::CreateDir(const std::string& name) {
 Status HdfsEnv::CreateDirIfMissing(const std::string& name) {
   const int value = hdfsExists(fileSys_, name.c_str());
   //  Not atomic. state might change b/w hdfsExists and CreateDir.
-  if (value == 0) {
+  switch (value) {
+    case HDFS_EXISTS:
     return Status::OK();
-  } else {
+    case HDFS_DOESNT_EXIST:
     return CreateDir(name);
+    default:  // anything else should be an error
+      Log(InfoLogLevel::FATAL_LEVEL, mylog,
+          "CreateDirIfMissing hdfsExists call failed");
+      throw HdfsFatalException("hdfsExists call failed with error " +
+                               ToString(value) + ".\n");
   }
 };
 
@@ -467,10 +562,10 @@ Status HdfsEnv::GetFileModificationTime(const std::string& fname,
 }
 
 // The rename is not atomic. HDFS does not allow a renaming if the
-// target already exists. So, we delete the target before attemting the
+// target already exists. So, we delete the target before attempting the
 // rename.
 Status HdfsEnv::RenameFile(const std::string& src, const std::string& target) {
-  hdfsDelete(fileSys_, target.c_str());
+  hdfsDelete(fileSys_, target.c_str(), 1);
   if (hdfsRename(fileSys_, src.c_str(), target.c_str()) == 0) {
     return Status::OK();
   }
@@ -492,11 +587,12 @@ Status HdfsEnv::NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) {
   HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
   if (f == nullptr || !f->isValid()) {
+    delete f;
     *result = nullptr;
     return IOError(fname, errno);
   }
   HdfsLogger* h = new HdfsLogger(f, &HdfsEnv::gettid);
-  *result = h;
+  result->reset(h);
   if (mylog == nullptr) {
     // mylog = h; // uncomment this for detailed logging
   }
diff --git a/src/rocksdb/util/env_posix.cc b/src/rocksdb/util/env_posix.cc
index 63b9fc2..3cdd12b 100644
--- a/src/rocksdb/util/env_posix.cc
+++ b/src/rocksdb/util/env_posix.cc
@@ -21,6 +21,7 @@
 #include <sys/stat.h>
 #ifdef OS_LINUX
 #include <sys/statfs.h>
+#include <sys/syscall.h>
 #endif
 #include <sys/time.h>
 #include <sys/types.h>
@@ -28,11 +29,9 @@
 #include <unistd.h>
 #if defined(OS_LINUX)
 #include <linux/fs.h>
-#include <fcntl.h>
-#endif
-#if defined(LEVELDB_PLATFORM_ANDROID)
-#include <sys/stat.h>
 #endif
+#include <signal.h>
+#include <algorithm>
 #include "rocksdb/env.h"
 #include "rocksdb/slice.h"
 #include "port/port.h"
@@ -40,12 +39,19 @@
 #include "util/logging.h"
 #include "util/posix_logger.h"
 #include "util/random.h"
-#include <signal.h>
-
-// Get nano time for mach systems
-#ifdef __MACH__
+#include "util/iostats_context_imp.h"
+#include "util/rate_limiter.h"
+#include "util/sync_point.h"
+#include "util/thread_status_updater.h"
+#include "util/thread_status_util.h"
+
+// Get nano time includes
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
+#elif defined(__MACH__)
 #include <mach/clock.h>
 #include <mach/mach.h>
+#else
+#include <chrono>
 #endif
 
 #if !defined(TMPFS_MAGIC)
@@ -60,7 +66,7 @@
 
 // For non linux platform, the following macros are used only as place
 // holder.
-#ifndef OS_LINUX
+#if !(defined OS_LINUX) && !(defined CYGWIN)
 #define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
 #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
 #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
@@ -86,6 +92,10 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
 #endif
 }
 
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+  return new ThreadStatusUpdater();
+}
+
 // list of pathnames that are locked
 static std::set<std::string> lockedFiles;
 static port::Mutex mutex_lockedFiles;
@@ -172,12 +182,17 @@ class PosixSequentialFile: public SequentialFile {
   }
   virtual ~PosixSequentialFile() { fclose(file_); }
 
-  virtual Status Read(size_t n, Slice* result, char* scratch) {
+  virtual Status Read(size_t n, Slice* result, char* scratch) override {
     Status s;
     size_t r = 0;
     do {
+#ifndef CYGWIN
       r = fread_unlocked(scratch, 1, n, file_);
+#else
+      r = fread(scratch, 1, n, file_);
+#endif
     } while (r == 0 && ferror(file_) && errno == EINTR);
+    IOSTATS_ADD(bytes_read, r);
     *result = Slice(scratch, r);
     if (r < n) {
       if (feof(file_)) {
@@ -198,14 +213,14 @@ class PosixSequentialFile: public SequentialFile {
     return s;
   }
 
-  virtual Status Skip(uint64_t n) {
-    if (fseek(file_, n, SEEK_CUR)) {
+  virtual Status Skip(uint64_t n) override {
+    if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
       return IOError(filename_, errno);
     }
     return Status::OK();
   }
 
-  virtual Status InvalidateCache(size_t offset, size_t length) {
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
 #ifndef OS_LINUX
     return Status::OK();
 #else
@@ -230,18 +245,31 @@ class PosixRandomAccessFile: public RandomAccessFile {
   PosixRandomAccessFile(const std::string& fname, int fd,
                         const EnvOptions& options)
       : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
-    assert(!options.use_mmap_reads);
+    assert(!options.use_mmap_reads || sizeof(void*) < 8);
   }
   virtual ~PosixRandomAccessFile() { close(fd_); }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const {
+                      char* scratch) const override {
     Status s;
     ssize_t r = -1;
-    do {
-      r = pread(fd_, scratch, n, static_cast<off_t>(offset));
-    } while (r < 0 && errno == EINTR);
-    *result = Slice(scratch, (r < 0) ? 0 : r);
+    size_t left = n;
+    char* ptr = scratch;
+    while (left > 0) {
+      r = pread(fd_, ptr, left, static_cast<off_t>(offset));
+      if (r <= 0) {
+        if (errno == EINTR) {
+          continue;
+        }
+        break;
+      }
+      ptr += r;
+      offset += r;
+      left -= r;
+    }
+
+    IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
+    *result = Slice(scratch, (r < 0) ? 0 : n - left);
     if (r < 0) {
       // An error: return a non-ok status
       s = IOError(filename_, errno);
@@ -255,12 +283,12 @@ class PosixRandomAccessFile: public RandomAccessFile {
   }
 
 #ifdef OS_LINUX
-  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
     return GetUniqueIdFromFile(fd_, id, max_size);
   }
 #endif
 
-  virtual void Hint(AccessPattern pattern) {
+  virtual void Hint(AccessPattern pattern) override {
     switch(pattern) {
       case NORMAL:
         Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
@@ -283,7 +311,7 @@ class PosixRandomAccessFile: public RandomAccessFile {
     }
   }
 
-  virtual Status InvalidateCache(size_t offset, size_t length) {
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
 #ifndef OS_LINUX
     return Status::OK();
 #else
@@ -324,7 +352,7 @@ class PosixMmapReadableFile: public RandomAccessFile {
   }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const {
+                      char* scratch) const override {
     Status s;
     if (offset + n > length_) {
       *result = Slice();
@@ -334,7 +362,7 @@ class PosixMmapReadableFile: public RandomAccessFile {
     }
     return s;
   }
-  virtual Status InvalidateCache(size_t offset, size_t length) {
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
 #ifndef OS_LINUX
     return Status::OK();
 #else
@@ -380,16 +408,16 @@ class PosixMmapFile : public WritableFile {
     return s;
   }
 
-  bool UnmapCurrentRegion() {
-    bool result = true;
+  Status UnmapCurrentRegion() {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
     if (base_ != nullptr) {
       if (last_sync_ < limit_) {
         // Defer syncing this data until next Sync() call, if any
         pending_sync_ = true;
       }
-      if (munmap(base_, limit_ - base_) != 0) {
-        result = false;
+      int munmap_status = munmap(base_, limit_ - base_);
+      if (munmap_status != 0) {
+        return IOError(filename_, munmap_status);
       }
       file_offset_ += limit_ - base_;
       base_ = nullptr;
@@ -402,7 +430,7 @@ class PosixMmapFile : public WritableFile {
         map_size_ *= 2;
       }
     }
-    return result;
+    return Status::OK();
   }
 
   Status MapNewRegion() {
@@ -467,27 +495,30 @@ class PosixMmapFile : public WritableFile {
     }
   }
 
-  virtual Status Append(const Slice& data) {
+  virtual Status Append(const Slice& data) override {
     const char* src = data.data();
     size_t left = data.size();
     TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
-    PrepareWrite(GetFileSize(), left);
+    PrepareWrite(static_cast<size_t>(GetFileSize()), left);
     while (left > 0) {
       assert(base_ <= dst_);
       assert(dst_ <= limit_);
       size_t avail = limit_ - dst_;
       if (avail == 0) {
-        if (UnmapCurrentRegion()) {
-          Status s = MapNewRegion();
-          if (!s.ok()) {
-            return s;
-          }
-          TEST_KILL_RANDOM(rocksdb_kill_odds);
+        Status s = UnmapCurrentRegion();
+        if (!s.ok()) {
+          return s;
         }
+        s = MapNewRegion();
+        if (!s.ok()) {
+          return s;
+        }
+        TEST_KILL_RANDOM(rocksdb_kill_odds);
       }
 
       size_t n = (left <= avail) ? left : avail;
       memcpy(dst_, src, n);
+      IOSTATS_ADD(bytes_written, n);
       dst_ += n;
       src += n;
       left -= n;
@@ -496,13 +527,14 @@ class PosixMmapFile : public WritableFile {
     return Status::OK();
   }
 
-  virtual Status Close() {
+  virtual Status Close() override {
     Status s;
     size_t unused = limit_ - dst_;
 
     TEST_KILL_RANDOM(rocksdb_kill_odds);
 
-    if (!UnmapCurrentRegion()) {
+    s = UnmapCurrentRegion();
+    if (!s.ok()) {
       s = IOError(filename_, errno);
     } else if (unused > 0) {
       // Trim the extra space at the end of the file
@@ -525,12 +557,12 @@ class PosixMmapFile : public WritableFile {
     return s;
   }
 
-  virtual Status Flush() {
+  virtual Status Flush() override {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
     return Status::OK();
   }
 
-  virtual Status Sync() {
+  virtual Status Sync() override {
     Status s;
 
     if (pending_sync_) {
@@ -562,7 +594,7 @@ class PosixMmapFile : public WritableFile {
   /**
    * Flush data as well as metadata to stable storage.
    */
-  virtual Status Fsync() {
+  virtual Status Fsync() override {
     if (pending_sync_) {
       // Some unmapped data was not synced
       TEST_KILL_RANDOM(rocksdb_kill_odds);
@@ -582,12 +614,12 @@ class PosixMmapFile : public WritableFile {
    * size that is returned from the filesystem because we use mmap
    * to extend file by map_size every time.
    */
-  virtual uint64_t GetFileSize() {
+  virtual uint64_t GetFileSize() override {
     size_t used = dst_ - base_;
     return file_offset_ + used;
   }
 
-  virtual Status InvalidateCache(size_t offset, size_t length) {
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
 #ifndef OS_LINUX
     return Status::OK();
 #else
@@ -601,7 +633,7 @@ class PosixMmapFile : public WritableFile {
   }
 
 #ifdef ROCKSDB_FALLOCATE_PRESENT
-  virtual Status Allocate(off_t offset, off_t len) {
+  virtual Status Allocate(off_t offset, off_t len) override {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
     int alloc_status = fallocate(
         fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
@@ -630,6 +662,7 @@ class PosixWritableFile : public WritableFile {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
   bool fallocate_with_keep_size_;
 #endif
+  RateLimiter* rate_limiter_;
 
  public:
   PosixWritableFile(const std::string& fname, int fd, size_t capacity,
@@ -643,7 +676,8 @@ class PosixWritableFile : public WritableFile {
         pending_sync_(false),
         pending_fsync_(false),
         last_sync_size_(0),
-        bytes_per_sync_(options.bytes_per_sync) {
+        bytes_per_sync_(options.bytes_per_sync),
+        rate_limiter_(options.rate_limiter) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
     fallocate_with_keep_size_ = options.fallocate_with_keep_size;
 #endif
@@ -656,7 +690,7 @@ class PosixWritableFile : public WritableFile {
     }
   }
 
-  virtual Status Append(const Slice& data) {
+  virtual Status Append(const Slice& data) override {
     const char* src = data.data();
     size_t left = data.size();
     Status s;
@@ -665,7 +699,7 @@ class PosixWritableFile : public WritableFile {
 
     TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
 
-    PrepareWrite(GetFileSize(), left);
+    PrepareWrite(static_cast<size_t>(GetFileSize()), left);
     // if there is no space in the cache, then flush
     if (cursize_ + left > capacity_) {
       s = Flush();
@@ -687,13 +721,14 @@ class PosixWritableFile : public WritableFile {
       cursize_ += left;
     } else {
       while (left != 0) {
-        ssize_t done = write(fd_, src, left);
+        ssize_t done = write(fd_, src, RequestToken(left));
         if (done < 0) {
           if (errno == EINTR) {
             continue;
           }
           return IOError(filename_, errno);
         }
+        IOSTATS_ADD(bytes_written, done);
         TEST_KILL_RANDOM(rocksdb_kill_odds);
 
         left -= done;
@@ -704,7 +739,7 @@ class PosixWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  virtual Status Close() {
+  virtual Status Close() override {
     Status s;
     s = Flush(); // flush cache to OS
     if (!s.ok()) {
@@ -718,32 +753,48 @@ class PosixWritableFile : public WritableFile {
     GetPreallocationStatus(&block_size, &last_allocated_block);
     if (last_allocated_block > 0) {
       // trim the extra space preallocated at the end of the file
+      // NOTE(ljin): we probably don't want to surface failure as an IOError,
+      // but it will be nice to log these errors.
       int dummy __attribute__((unused));
-      dummy = ftruncate(fd_, filesize_);  // ignore errors
+      dummy = ftruncate(fd_, filesize_);
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+      // in some file systems, ftruncate only trims trailing space if the
+      // new file size is smaller than the current size. Calling fallocate
+      // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
+      // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
+      // filesystems:
+      //   XFS (since Linux 2.6.38)
+      //   ext4 (since Linux 3.0)
+      //   Btrfs (since Linux 3.7)
+      //   tmpfs (since Linux 3.5)
+      // We ignore error since failure of this operation does not affect
+      // correctness.
+      fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+                filesize_, block_size * last_allocated_block - filesize_);
+#endif
     }
 
     if (close(fd_) < 0) {
-      if (s.ok()) {
-        s = IOError(filename_, errno);
-      }
+      s = IOError(filename_, errno);
     }
     fd_ = -1;
     return s;
   }
 
   // write out the cached data to the OS cache
-  virtual Status Flush() {
+  virtual Status Flush() override {
     TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
     size_t left = cursize_;
     char* src = buf_.get();
     while (left != 0) {
-      ssize_t done = write(fd_, src, left);
+      ssize_t done = write(fd_, src, RequestToken(left));
       if (done < 0) {
         if (errno == EINTR) {
           continue;
         }
         return IOError(filename_, errno);
       }
+      IOSTATS_ADD(bytes_written, done);
       TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
       left -= done;
       src += done;
@@ -763,7 +814,7 @@ class PosixWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  virtual Status Sync() {
+  virtual Status Sync() override {
     Status s = Flush();
     if (!s.ok()) {
       return s;
@@ -777,7 +828,7 @@ class PosixWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  virtual Status Fsync() {
+  virtual Status Fsync() override {
     Status s = Flush();
     if (!s.ok()) {
       return s;
@@ -792,11 +843,9 @@ class PosixWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  virtual uint64_t GetFileSize() {
-    return filesize_;
-  }
+  virtual uint64_t GetFileSize() override { return filesize_; }
 
-  virtual Status InvalidateCache(size_t offset, size_t length) {
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
 #ifndef OS_LINUX
     return Status::OK();
 #else
@@ -810,7 +859,7 @@ class PosixWritableFile : public WritableFile {
   }
 
 #ifdef ROCKSDB_FALLOCATE_PRESENT
-  virtual Status Allocate(off_t offset, off_t len) {
+  virtual Status Allocate(off_t offset, off_t len) override {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
     int alloc_status = fallocate(
         fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
@@ -821,17 +870,27 @@ class PosixWritableFile : public WritableFile {
     }
   }
 
-  virtual Status RangeSync(off_t offset, off_t nbytes) {
+  virtual Status RangeSync(off_t offset, off_t nbytes) override {
     if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
       return Status::OK();
     } else {
       return IOError(filename_, errno);
     }
   }
-  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
     return GetUniqueIdFromFile(fd_, id, max_size);
   }
 #endif
+
+ private:
+  inline size_t RequestToken(size_t bytes) {
+    if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) {
+      bytes = std::min(bytes,
+          static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
+      rate_limiter_->Request(bytes, io_priority_);
+    }
+    return bytes;
+  }
 };
 
 class PosixRandomRWFile : public RandomRWFile {
@@ -862,7 +921,7 @@ class PosixRandomRWFile : public RandomRWFile {
     }
   }
 
-  virtual Status Write(uint64_t offset, const Slice& data) {
+  virtual Status Write(uint64_t offset, const Slice& data) override {
     const char* src = data.data();
     size_t left = data.size();
     Status s;
@@ -877,6 +936,7 @@ class PosixRandomRWFile : public RandomRWFile {
         }
         return IOError(filename_, errno);
       }
+      IOSTATS_ADD(bytes_written, done);
 
       left -= done;
       src += done;
@@ -887,17 +947,32 @@ class PosixRandomRWFile : public RandomRWFile {
   }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const {
+                      char* scratch) const override {
     Status s;
-    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
-    *result = Slice(scratch, (r < 0) ? 0 : r);
+    ssize_t r = -1;
+    size_t left = n;
+    char* ptr = scratch;
+    while (left > 0) {
+      r = pread(fd_, ptr, left, static_cast<off_t>(offset));
+      if (r <= 0) {
+        if (errno == EINTR) {
+          continue;
+        }
+        break;
+      }
+      ptr += r;
+      offset += r;
+      left -= r;
+    }
+    IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
+    *result = Slice(scratch, (r < 0) ? 0 : n - left);
     if (r < 0) {
       s = IOError(filename_, errno);
     }
     return s;
   }
 
-  virtual Status Close() {
+  virtual Status Close() override {
     Status s = Status::OK();
     if (fd_ >= 0 && close(fd_) < 0) {
       s = IOError(filename_, errno);
@@ -906,7 +981,7 @@ class PosixRandomRWFile : public RandomRWFile {
     return s;
   }
 
-  virtual Status Sync() {
+  virtual Status Sync() override {
     if (pending_sync_ && fdatasync(fd_) < 0) {
       return IOError(filename_, errno);
     }
@@ -914,7 +989,7 @@ class PosixRandomRWFile : public RandomRWFile {
     return Status::OK();
   }
 
-  virtual Status Fsync() {
+  virtual Status Fsync() override {
     if (pending_fsync_ && fsync(fd_) < 0) {
       return IOError(filename_, errno);
     }
@@ -924,7 +999,7 @@ class PosixRandomRWFile : public RandomRWFile {
   }
 
 #ifdef ROCKSDB_FALLOCATE_PRESENT
-  virtual Status Allocate(off_t offset, off_t len) {
+  virtual Status Allocate(off_t offset, off_t len) override {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
     int alloc_status = fallocate(
         fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
@@ -944,7 +1019,7 @@ class PosixDirectory : public Directory {
     close(fd_);
   }
 
-  virtual Status Fsync() {
+  virtual Status Fsync() override {
     if (fsync(fd_) == -1) {
       return IOError("directory", errno);
     }
@@ -999,24 +1074,27 @@ class PosixFileLock : public FileLock {
   std::string filename;
 };
 
-
-namespace {
 void PthreadCall(const char* label, int result) {
   if (result != 0) {
     fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
-    exit(1);
+    abort();
   }
 }
-}
 
 class PosixEnv : public Env {
  public:
   PosixEnv();
 
-  virtual ~PosixEnv(){
+  virtual ~PosixEnv() {
     for (const auto tid : threads_to_join_) {
       pthread_join(tid, nullptr);
     }
+    for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+      thread_pools_[pool_id].JoinAllThreads();
+    }
+    // All threads must be joined before the deletion of
+    // thread_status_updater_.
+    delete thread_status_updater_;
   }
 
   void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
@@ -1027,7 +1105,7 @@ class PosixEnv : public Env {
 
   virtual Status NewSequentialFile(const std::string& fname,
                                    unique_ptr<SequentialFile>* result,
-                                   const EnvOptions& options) {
+                                   const EnvOptions& options) override {
     result->reset();
     FILE* f = nullptr;
     do {
@@ -1046,7 +1124,7 @@ class PosixEnv : public Env {
 
   virtual Status NewRandomAccessFile(const std::string& fname,
                                      unique_ptr<RandomAccessFile>* result,
-                                     const EnvOptions& options) {
+                                     const EnvOptions& options) override {
     result->reset();
     Status s;
     int fd = open(fname.c_str(), O_RDONLY);
@@ -1077,7 +1155,7 @@ class PosixEnv : public Env {
 
   virtual Status NewWritableFile(const std::string& fname,
                                  unique_ptr<WritableFile>* result,
-                                 const EnvOptions& options) {
+                                 const EnvOptions& options) override {
     result->reset();
     Status s;
     int fd = -1;
@@ -1115,7 +1193,7 @@ class PosixEnv : public Env {
 
   virtual Status NewRandomRWFile(const std::string& fname,
                                  unique_ptr<RandomRWFile>* result,
-                                 const EnvOptions& options) {
+                                 const EnvOptions& options) override {
     result->reset();
     // no support for mmap yet
     if (options.use_mmap_writes || options.use_mmap_reads) {
@@ -1133,7 +1211,7 @@ class PosixEnv : public Env {
   }
 
   virtual Status NewDirectory(const std::string& name,
-                              unique_ptr<Directory>* result) {
+                              unique_ptr<Directory>* result) override {
     result->reset();
     const int fd = open(name.c_str(), 0);
     if (fd < 0) {
@@ -1144,12 +1222,12 @@ class PosixEnv : public Env {
     return Status::OK();
   }
 
-  virtual bool FileExists(const std::string& fname) {
+  virtual bool FileExists(const std::string& fname) override {
     return access(fname.c_str(), F_OK) == 0;
   }
 
   virtual Status GetChildren(const std::string& dir,
-                             std::vector<std::string>* result) {
+                             std::vector<std::string>* result) override {
     result->clear();
     DIR* d = opendir(dir.c_str());
     if (d == nullptr) {
@@ -1163,7 +1241,7 @@ class PosixEnv : public Env {
     return Status::OK();
   }
 
-  virtual Status DeleteFile(const std::string& fname) {
+  virtual Status DeleteFile(const std::string& fname) override {
     Status result;
     if (unlink(fname.c_str()) != 0) {
       result = IOError(fname, errno);
@@ -1171,7 +1249,7 @@ class PosixEnv : public Env {
     return result;
   };
 
-  virtual Status CreateDir(const std::string& name) {
+  virtual Status CreateDir(const std::string& name) override {
     Status result;
     if (mkdir(name.c_str(), 0755) != 0) {
       result = IOError(name, errno);
@@ -1179,7 +1257,7 @@ class PosixEnv : public Env {
     return result;
   };
 
-  virtual Status CreateDirIfMissing(const std::string& name) {
+  virtual Status CreateDirIfMissing(const std::string& name) override {
     Status result;
     if (mkdir(name.c_str(), 0755) != 0) {
       if (errno != EEXIST) {
@@ -1193,7 +1271,7 @@ class PosixEnv : public Env {
     return result;
   };
 
-  virtual Status DeleteDir(const std::string& name) {
+  virtual Status DeleteDir(const std::string& name) override {
     Status result;
     if (rmdir(name.c_str()) != 0) {
       result = IOError(name, errno);
@@ -1201,7 +1279,8 @@ class PosixEnv : public Env {
     return result;
   };
 
-  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+  virtual Status GetFileSize(const std::string& fname,
+                             uint64_t* size) override {
     Status s;
     struct stat sbuf;
     if (stat(fname.c_str(), &sbuf) != 0) {
@@ -1214,7 +1293,7 @@ class PosixEnv : public Env {
   }
 
   virtual Status GetFileModificationTime(const std::string& fname,
-                                         uint64_t* file_mtime) {
+                                         uint64_t* file_mtime) override {
     struct stat s;
     if (stat(fname.c_str(), &s) !=0) {
       return IOError(fname, errno);
@@ -1222,7 +1301,8 @@ class PosixEnv : public Env {
     *file_mtime = static_cast<uint64_t>(s.st_mtime);
     return Status::OK();
   }
-  virtual Status RenameFile(const std::string& src, const std::string& target) {
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) override {
     Status result;
     if (rename(src.c_str(), target.c_str()) != 0) {
       result = IOError(src, errno);
@@ -1230,7 +1310,19 @@ class PosixEnv : public Env {
     return result;
   }
 
-  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+  virtual Status LinkFile(const std::string& src,
+                          const std::string& target) override {
+    Status result;
+    if (link(src.c_str(), target.c_str()) != 0) {
+      if (errno == EXDEV) {
+        return Status::NotSupported("No cross FS links allowed");
+      }
+      result = IOError(src, errno);
+    }
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) override {
     *lock = nullptr;
     Status result;
     int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
@@ -1249,7 +1341,7 @@ class PosixEnv : public Env {
     return result;
   }
 
-  virtual Status UnlockFile(FileLock* lock) {
+  virtual Status UnlockFile(FileLock* lock) override {
     PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
     Status result;
     if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) {
@@ -1260,15 +1352,18 @@ class PosixEnv : public Env {
     return result;
   }
 
-  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW);
+  virtual void Schedule(void (*function)(void* arg1), void* arg,
+                        Priority pri = LOW, void* tag = nullptr) override;
 
-  virtual void StartThread(void (*function)(void* arg), void* arg);
+  virtual int UnSchedule(void* arg, Priority pri) override;
 
-  virtual void WaitForJoin();
+  virtual void StartThread(void (*function)(void* arg), void* arg) override;
+
+  virtual void WaitForJoin() override;
 
   virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
 
-  virtual Status GetTestDirectory(std::string* result) {
+  virtual Status GetTestDirectory(std::string* result) override {
     const char* env = getenv("TEST_TMPDIR");
     if (env && env[0] != '\0') {
       *result = env;
@@ -1282,15 +1377,25 @@ class PosixEnv : public Env {
     return Status::OK();
   }
 
-  static uint64_t gettid() {
-    pthread_t tid = pthread_self();
+  virtual Status GetThreadList(
+      std::vector<ThreadStatus>* thread_list) override {
+    assert(thread_status_updater_);
+    return thread_status_updater_->GetThreadList(thread_list);
+  }
+
+  static uint64_t gettid(pthread_t tid) {
     uint64_t thread_id = 0;
     memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
     return thread_id;
   }
 
+  static uint64_t gettid() {
+    pthread_t tid = pthread_self();
+    return gettid(tid);
+  }
+
   virtual Status NewLogger(const std::string& fname,
-                           shared_ptr<Logger>* result) {
+                           shared_ptr<Logger>* result) override {
     FILE* f = fopen(fname.c_str(), "w");
     if (f == nullptr) {
       result->reset();
@@ -1303,34 +1408,34 @@ class PosixEnv : public Env {
     }
   }
 
-  virtual uint64_t NowMicros() {
+  virtual uint64_t NowMicros() override {
     struct timeval tv;
-    // TODO(kailiu) MAC DON'T HAVE THIS
     gettimeofday(&tv, nullptr);
     return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
   }
 
-  virtual uint64_t NowNanos() {
-#ifdef OS_LINUX
+  virtual uint64_t NowNanos() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
-#elif __MACH__
+#elif defined(__MACH__)
     clock_serv_t cclock;
     mach_timespec_t ts;
     host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
     clock_get_time(cclock, &ts);
     mach_port_deallocate(mach_task_self(), cclock);
-#endif
     return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#else
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+       std::chrono::steady_clock::now().time_since_epoch()).count();
+#endif
   }
 
-  virtual void SleepForMicroseconds(int micros) {
-    usleep(micros);
-  }
+  virtual void SleepForMicroseconds(int micros) override { usleep(micros); }
 
-  virtual Status GetHostName(char* name, uint64_t len) {
-    int ret = gethostname(name, len);
+  virtual Status GetHostName(char* name, uint64_t len) override {
+    int ret = gethostname(name, static_cast<size_t>(len));
     if (ret < 0) {
       if (errno == EFAULT || errno == EINVAL)
         return Status::InvalidArgument(strerror(errno));
@@ -1340,7 +1445,7 @@ class PosixEnv : public Env {
     return Status::OK();
   }
 
-  virtual Status GetCurrentTime(int64_t* unix_time) {
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
     time_t ret = time(nullptr);
     if (ret == (time_t) -1) {
       return IOError("GetCurrentTime", errno);
@@ -1350,7 +1455,7 @@ class PosixEnv : public Env {
   }
 
   virtual Status GetAbsolutePath(const std::string& db_path,
-      std::string* output_path) {
+                                 std::string* output_path) override {
     if (db_path.find('/') == 0) {
       *output_path = db_path;
       return Status::OK();
@@ -1367,12 +1472,25 @@ class PosixEnv : public Env {
   }
 
   // Allow increasing the number of worker threads.
-  virtual void SetBackgroundThreads(int num, Priority pri) {
+  virtual void SetBackgroundThreads(int num, Priority pri) override {
     assert(pri >= Priority::LOW && pri <= Priority::HIGH);
     thread_pools_[pri].SetBackgroundThreads(num);
   }
 
-  virtual std::string TimeToString(uint64_t secondsSince1970) {
+  // Allow increasing the number of worker threads.
+  virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+    thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
+  }
+
+  virtual void LowerThreadPoolIOPriority(Priority pool = LOW) override {
+    assert(pool >= Priority::LOW && pool <= Priority::HIGH);
+#ifdef OS_LINUX
+    thread_pools_[pool].LowerIOPriority();
+#endif
+  }
+
+  virtual std::string TimeToString(uint64_t secondsSince1970) override {
     const time_t seconds = (time_t)secondsSince1970;
     struct tm t;
     int maxsize = 64;
@@ -1392,17 +1510,22 @@ class PosixEnv : public Env {
     return dummy;
   }
 
-  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const {
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                 const DBOptions& db_options) const override {
     EnvOptions optimized = env_options;
     optimized.use_mmap_writes = false;
+    optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
     // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
     // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
     // test and make this false
-    optimized.fallocate_with_keep_size = true;
+    // CEPH: we don't care about replication and want to avoid updating the
+    // inode... set this to false! [-sage]
+    optimized.fallocate_with_keep_size = false;  //true;
     return optimized;
   }
 
-  EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const {
+  EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const override {
     EnvOptions optimized = env_options;
     optimized.use_mmap_writes = false;
     optimized.fallocate_with_keep_size = true;
@@ -1454,12 +1577,18 @@ class PosixEnv : public Env {
           bgthreads_(0),
           queue_(),
           queue_len_(0),
-          exit_all_threads_(false) {
+          exit_all_threads_(false),
+          low_io_priority_(false),
+          env_(nullptr) {
       PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
       PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr));
     }
 
     ~ThreadPool() {
+      assert(bgthreads_.size() == 0U);
+    }
+
+    void JoinAllThreads() {
       PthreadCall("lock", pthread_mutex_lock(&mu_));
       assert(!exit_all_threads_);
       exit_all_threads_ = true;
@@ -1468,59 +1597,176 @@ class PosixEnv : public Env {
       for (const auto tid : bgthreads_) {
         pthread_join(tid, nullptr);
       }
+      bgthreads_.clear();
+    }
+
+    void SetHostEnv(Env* env) {
+      env_ = env;
+    }
+
+    void LowerIOPriority() {
+#ifdef OS_LINUX
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+      low_io_priority_ = true;
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+#endif
+    }
+
+    // Return true if there is at least one thread needs to terminate.
+    bool HasExcessiveThread() {
+      return static_cast<int>(bgthreads_.size()) > total_threads_limit_;
+    }
+
+    // Return true iff the current thread is the excessive thread to terminate.
+    // Always terminate the running thread that is added last, even if there are
+    // more than one thread to terminate.
+    bool IsLastExcessiveThread(size_t thread_id) {
+      return HasExcessiveThread() && thread_id == bgthreads_.size() - 1;
+    }
+
+    // Is one of the threads to terminate.
+    bool IsExcessiveThread(size_t thread_id) {
+      return static_cast<int>(thread_id) >= total_threads_limit_;
+    }
+
+    // Return the thread priority.
+    // This would allow its member-thread to know its priority.
+    Env::Priority GetThreadPriority() {
+      return priority_;
+    }
+
+    // Set the thread priority.
+    void SetThreadPriority(Env::Priority priority) {
+      priority_ = priority;
     }
 
-    void BGThread() {
+    void BGThread(size_t thread_id) {
+      bool low_io_priority = false;
       while (true) {
         // Wait until there is an item that is ready to run
         PthreadCall("lock", pthread_mutex_lock(&mu_));
-        while (queue_.empty() && !exit_all_threads_) {
+        // Stop waiting if the thread needs to do work or needs to terminate.
+        while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+               (queue_.empty() || IsExcessiveThread(thread_id))) {
           PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
         }
         if (exit_all_threads_) { // mechanism to let BG threads exit safely
           PthreadCall("unlock", pthread_mutex_unlock(&mu_));
           break;
         }
+        if (IsLastExcessiveThread(thread_id)) {
+          // Current thread is the last generated one and is excessive.
+          // We always terminate excessive thread in the reverse order of
+          // generation time.
+          auto terminating_thread = bgthreads_.back();
+          pthread_detach(terminating_thread);
+          bgthreads_.pop_back();
+          if (HasExcessiveThread()) {
+            // There is still at least more excessive thread to terminate.
+            WakeUpAllThreads();
+          }
+          PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+          break;
+        }
         void (*function)(void*) = queue_.front().function;
         void* arg = queue_.front().arg;
         queue_.pop_front();
-        queue_len_.store(queue_.size(), std::memory_order_relaxed);
+        queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                         std::memory_order_relaxed);
 
+        bool decrease_io_priority = (low_io_priority != low_io_priority_);
         PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+
+#ifdef OS_LINUX
+        if (decrease_io_priority) {
+          #define IOPRIO_CLASS_SHIFT               (13)
+          #define IOPRIO_PRIO_VALUE(class, data)   \
+              (((class) << IOPRIO_CLASS_SHIFT) | data)
+          // Put schedule into IOPRIO_CLASS_IDLE class (lowest)
+          // These system calls only have an effect when used in conjunction
+          // with an I/O scheduler that supports I/O priorities. As at
+          // kernel 2.6.17 the only such scheduler is the Completely
+          // Fair Queuing (CFQ) I/O scheduler.
+          // To change scheduler:
+          //  echo cfq > /sys/block/<device_name>/queue/schedule
+          // Tunables to consider:
+          //  /sys/block/<device_name>/queue/slice_idle
+          //  /sys/block/<device_name>/queue/slice_sync
+          syscall(SYS_ioprio_set,
+                  1,  // IOPRIO_WHO_PROCESS
+                  0,  // current thread
+                  IOPRIO_PRIO_VALUE(3, 0));
+          low_io_priority = true;
+        }
+#else
+        (void)decrease_io_priority; // avoid 'unused variable' error
+#endif
         (*function)(arg);
       }
     }
 
+    // Helper struct for passing arguments when creating threads.
+    struct BGThreadMetadata {
+      ThreadPool* thread_pool_;
+      size_t thread_id_;  // Thread count in the thread.
+      explicit BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id)
+          : thread_pool_(thread_pool), thread_id_(thread_id) {}
+    };
+
     static void* BGThreadWrapper(void* arg) {
-      reinterpret_cast<ThreadPool*>(arg)->BGThread();
+      BGThreadMetadata* meta = reinterpret_cast<BGThreadMetadata*>(arg);
+      size_t thread_id = meta->thread_id_;
+      ThreadPool* tp = meta->thread_pool_;
+#if ROCKSDB_USING_THREAD_STATUS
+      // for thread-status
+      ThreadStatusUtil::SetThreadType(tp->env_,
+          (tp->GetThreadPriority() == Env::Priority::HIGH ?
+              ThreadStatus::HIGH_PRIORITY :
+              ThreadStatus::LOW_PRIORITY));
+#endif
+      delete meta;
+      tp->BGThread(thread_id);
+#if ROCKSDB_USING_THREAD_STATUS
+      ThreadStatusUtil::UnregisterThread();
+#endif
       return nullptr;
     }
 
-    void SetBackgroundThreads(int num) {
-      PthreadCall("lock", pthread_mutex_lock(&mu_));
-      if (num > total_threads_limit_) {
-        total_threads_limit_ = num;
-      }
-      assert(total_threads_limit_ > 0);
-      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    void WakeUpAllThreads() {
+      PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
     }
 
-    void Schedule(void (*function)(void*), void* arg) {
+    void SetBackgroundThreadsInternal(int num, bool allow_reduce) {
       PthreadCall("lock", pthread_mutex_lock(&mu_));
-
       if (exit_all_threads_) {
         PthreadCall("unlock", pthread_mutex_unlock(&mu_));
         return;
       }
+      if (num > total_threads_limit_ ||
+          (num < total_threads_limit_ && allow_reduce)) {
+        total_threads_limit_ = std::max(1, num);
+        WakeUpAllThreads();
+        StartBGThreads();
+      }
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    }
+
+    void IncBackgroundThreadsIfNeeded(int num) {
+      SetBackgroundThreadsInternal(num, false);
+    }
+
+    void SetBackgroundThreads(int num) {
+      SetBackgroundThreadsInternal(num, true);
+    }
+
+    void StartBGThreads() {
       // Start background thread if necessary
       while ((int)bgthreads_.size() < total_threads_limit_) {
         pthread_t t;
         PthreadCall(
-          "create thread",
-          pthread_create(&t,
-                         nullptr,
-                         &ThreadPool::BGThreadWrapper,
-                         this));
+            "create thread",
+            pthread_create(&t, nullptr, &ThreadPool::BGThreadWrapper,
+                           new BGThreadMetadata(this, bgthreads_.size())));
 
         // Set the thread name to aid debugging
 #if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
@@ -1534,26 +1780,69 @@ class PosixEnv : public Env {
 
         bgthreads_.push_back(t);
       }
+    }
+
+    void Schedule(void (*function)(void* arg1), void* arg, void* tag) {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+      if (exit_all_threads_) {
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        return;
+      }
+
+      StartBGThreads();
 
       // Add to priority queue
       queue_.push_back(BGItem());
       queue_.back().function = function;
       queue_.back().arg = arg;
-      queue_len_.store(queue_.size(), std::memory_order_relaxed);
+      queue_.back().tag = tag;
+      queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                       std::memory_order_relaxed);
 
-      // always wake up at least one waiting thread.
-      PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+      if (!HasExcessiveThread()) {
+        // Wake up at least one waiting thread.
+        PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+      } else {
+        // Need to wake up all threads to make sure the one woken
+        // up is not the one to terminate.
+        WakeUpAllThreads();
+      }
 
       PthreadCall("unlock", pthread_mutex_unlock(&mu_));
     }
 
+    int UnSchedule(void* arg) {
+      int count = 0;
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+      // Remove from priority queue
+      BGQueue::iterator it = queue_.begin();
+      while (it != queue_.end()) {
+        if (arg == (*it).tag) {
+          it = queue_.erase(it);
+          count++;
+        } else {
+          it++;
+        }
+      }
+      queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                       std::memory_order_relaxed);
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+      return count;
+    }
+
     unsigned int GetQueueLen() const {
       return queue_len_.load(std::memory_order_relaxed);
     }
 
    private:
     // Entry per Schedule() call
-    struct BGItem { void* arg; void (*function)(void*); };
+    struct BGItem {
+      void* arg;
+      void (*function)(void*);
+      void* tag;
+    };
     typedef std::deque<BGItem> BGQueue;
 
     pthread_mutex_t mu_;
@@ -1563,6 +1852,9 @@ class PosixEnv : public Env {
     BGQueue queue_;
     std::atomic_uint queue_len_;  // Queue length. Used for stats reporting
     bool exit_all_threads_;
+    bool low_io_priority_;
+    Env::Priority priority_;
+    Env* env_;
   };
 
   std::vector<ThreadPool> thread_pools_;
@@ -1577,11 +1869,23 @@ PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
                        page_size_(getpagesize()),
                        thread_pools_(Priority::TOTAL) {
   PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+    thread_pools_[pool_id].SetThreadPriority(
+        static_cast<Env::Priority>(pool_id));
+    // This allows later initializing the thread-local-env of each thread.
+    thread_pools_[pool_id].SetHostEnv(this);
+  }
+  thread_status_updater_ = CreateThreadStatusUpdater();
 }
 
-void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) {
+void PosixEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri,
+                        void* tag) {
   assert(pri >= Priority::LOW && pri <= Priority::HIGH);
-  thread_pools_[pri].Schedule(function, arg);
+  thread_pools_[pri].Schedule(function, arg, tag);
+}
+
+int PosixEnv::UnSchedule(void* arg, Priority pri) {
+  return thread_pools_[pri].UnSchedule(arg);
 }
 
 unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
@@ -1589,12 +1893,11 @@ unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
   return thread_pools_[pri].GetQueueLen();
 }
 
-namespace {
 struct StartThreadState {
   void (*user_function)(void*);
   void* arg;
 };
-}
+
 static void* StartThreadWrapper(void* arg) {
   StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
   state->user_function(state->arg);
diff --git a/src/rocksdb/util/env_test.cc b/src/rocksdb/util/env_test.cc
index 1ac3773..081a10f 100644
--- a/src/rocksdb/util/env_test.cc
+++ b/src/rocksdb/util/env_test.cc
@@ -8,27 +8,38 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include <sys/types.h>
+#include <sys/ioctl.h>
 
 #include <iostream>
 #include <unordered_set>
+#include <atomic>
+#include <list>
 
 #ifdef OS_LINUX
+#include <linux/fs.h>
+#include <stdlib.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
 
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+#include <errno.h>
+#include <fcntl.h>
+#endif
+
 #include "rocksdb/env.h"
 #include "port/port.h"
 #include "util/coding.h"
 #include "util/log_buffer.h"
 #include "util/mutexlock.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 
 namespace rocksdb {
 
 static const int kDelayMicros = 100000;
 
-class EnvPosixTest {
+class EnvPosixTest : public testing::Test {
  private:
   port::Mutex mu_;
   std::string events_;
@@ -39,30 +50,106 @@ class EnvPosixTest {
 };
 
 static void SetBool(void* ptr) {
-  reinterpret_cast<port::AtomicPointer*>(ptr)->NoBarrier_Store(ptr);
+  reinterpret_cast<std::atomic<bool>*>(ptr)
+      ->store(true, std::memory_order_relaxed);
 }
 
-TEST(EnvPosixTest, RunImmediately) {
-  port::AtomicPointer called (nullptr);
+class SleepingBackgroundTask {
+ public:
+  explicit SleepingBackgroundTask()
+      : bg_cv_(&mutex_), should_sleep_(true), sleeping_(false) {}
+  void DoSleep() {
+    MutexLock l(&mutex_);
+    sleeping_ = true;
+    while (should_sleep_) {
+      bg_cv_.Wait();
+    }
+    sleeping_ = false;
+    bg_cv_.SignalAll();
+  }
+
+  void WakeUp() {
+    MutexLock l(&mutex_);
+    should_sleep_ = false;
+    bg_cv_.SignalAll();
+
+    while (sleeping_) {
+      bg_cv_.Wait();
+    }
+  }
+
+  bool IsSleeping() {
+    MutexLock l(&mutex_);
+    return sleeping_;
+  }
+
+  static void DoSleepTask(void* arg) {
+    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
+  }
+
+ private:
+  port::Mutex mutex_;
+  port::CondVar bg_cv_;  // Signalled when background work finishes
+  bool should_sleep_;
+  bool sleeping_;
+};
+
+TEST_F(EnvPosixTest, RunImmediately) {
+  std::atomic<bool> called(false);
   env_->Schedule(&SetBool, &called);
   Env::Default()->SleepForMicroseconds(kDelayMicros);
-  ASSERT_TRUE(called.NoBarrier_Load() != nullptr);
+  ASSERT_TRUE(called.load(std::memory_order_relaxed));
 }
 
-TEST(EnvPosixTest, RunMany) {
-  port::AtomicPointer last_id (nullptr);
+TEST_F(EnvPosixTest, UnSchedule) {
+  std::atomic<bool> called(false);
+  env_->SetBackgroundThreads(1, Env::LOW);
+
+  /* Block the low priority queue */
+  SleepingBackgroundTask sleeping_task, sleeping_task1;
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::LOW);
+
+  /* Schedule another task */
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task1,
+                 Env::Priority::LOW, &sleeping_task1);
+
+  /* Remove it with a different tag  */
+  ASSERT_EQ(0, env_->UnSchedule(&called, Env::Priority::LOW));
+
+  /* Remove it from the queue with the right tag */
+  ASSERT_EQ(1, env_->UnSchedule(&sleeping_task1, Env::Priority::LOW));
+
+  // Unblock background thread
+  sleeping_task.WakeUp();
+
+  /* Schedule another task */
+  env_->Schedule(&SetBool, &called);
+  for (int i = 0; i < kDelayMicros; i++) {
+    if (called.load(std::memory_order_relaxed)) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(1);
+  }
+  ASSERT_TRUE(called.load(std::memory_order_relaxed));
+
+  ASSERT_TRUE(!sleeping_task.IsSleeping() && !sleeping_task1.IsSleeping());
+}
+
+TEST_F(EnvPosixTest, RunMany) {
+  std::atomic<int> last_id(0);
 
   struct CB {
-    port::AtomicPointer* last_id_ptr;   // Pointer to shared slot
-    uintptr_t id;             // Order# for the execution of this callback
+    std::atomic<int>* last_id_ptr;  // Pointer to shared slot
+    int id;                         // Order# for the execution of this callback
 
-    CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { }
+    CB(std::atomic<int>* p, int i) : last_id_ptr(p), id(i) {}
 
     static void Run(void* v) {
       CB* cb = reinterpret_cast<CB*>(v);
-      void* cur = cb->last_id_ptr->NoBarrier_Load();
-      ASSERT_EQ(cb->id-1, reinterpret_cast<uintptr_t>(cur));
-      cb->last_id_ptr->Release_Store(reinterpret_cast<void*>(cb->id));
+      int cur = cb->last_id_ptr->load(std::memory_order_relaxed);
+      ASSERT_EQ(cb->id - 1, cur);
+      cb->last_id_ptr->store(cb->id, std::memory_order_release);
     }
   };
 
@@ -77,8 +164,8 @@ TEST(EnvPosixTest, RunMany) {
   env_->Schedule(&CB::Run, &cb4);
 
   Env::Default()->SleepForMicroseconds(kDelayMicros);
-  void* cur = last_id.Acquire_Load();
-  ASSERT_EQ(4U, reinterpret_cast<uintptr_t>(cur));
+  int cur = last_id.load(std::memory_order_acquire);
+  ASSERT_EQ(4, cur);
 }
 
 struct State {
@@ -95,7 +182,7 @@ static void ThreadBody(void* arg) {
   s->mu.Unlock();
 }
 
-TEST(EnvPosixTest, StartThread) {
+TEST_F(EnvPosixTest, StartThread) {
   State state;
   state.val = 0;
   state.num_running = 3;
@@ -114,8 +201,7 @@ TEST(EnvPosixTest, StartThread) {
   ASSERT_EQ(state.val, 3);
 }
 
-TEST(EnvPosixTest, TwoPools) {
-
+TEST_F(EnvPosixTest, TwoPools) {
   class CB {
    public:
     CB(const std::string& pool_name, int pool_size)
@@ -134,10 +220,8 @@ TEST(EnvPosixTest, TwoPools) {
       {
         MutexLock l(&mu_);
         num_running_++;
-        std::cout << "Pool " << pool_name_ << ": "
-                  << num_running_ << " running threads.\n";
         // make sure we don't have more than pool_size_ jobs running.
-        ASSERT_LE(num_running_, pool_size_);
+        ASSERT_LE(num_running_, pool_size_.load());
       }
 
       // sleep for 1 sec
@@ -155,11 +239,16 @@ TEST(EnvPosixTest, TwoPools) {
       return num_finished_;
     }
 
+    void Reset(int pool_size) {
+      pool_size_.store(pool_size);
+      num_finished_ = 0;
+    }
+
    private:
     port::Mutex mu_;
     int num_running_;
     int num_finished_;
-    int pool_size_;
+    std::atomic<int> pool_size_;
     std::string pool_name_;
   };
 
@@ -198,18 +287,193 @@ TEST(EnvPosixTest, TwoPools) {
 
   ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
   ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // call IncBackgroundThreadsIfNeeded to two pools. One increasing and
+  // the other decreasing
+  env_->IncBackgroundThreadsIfNeeded(kLowPoolSize - 1, Env::Priority::LOW);
+  env_->IncBackgroundThreadsIfNeeded(kHighPoolSize + 1, Env::Priority::HIGH);
+  high_pool_job.Reset(kHighPoolSize + 1);
+  low_pool_job.Reset(kLowPoolSize);
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+  // Wait a short while for the jobs to be dispatched.
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen());
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ((unsigned int)(kJobs - (kHighPoolSize + 1)),
+            env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+}
+
+TEST_F(EnvPosixTest, DecreaseNumBgThreads) {
+  std::vector<SleepingBackgroundTask> tasks(10);
+
+  // Set number of thread to 1 first.
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+
+  // Schedule 3 tasks. 0 running; Task 1, 2 waiting.
+  for (size_t i = 0; i < 3; i++) {
+    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 2 threads. Task 0, 1 running; 2 waiting
+  env_->SetBackgroundThreads(2, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Shrink back to 1 thread. Still task 0, 1 running, 2 waiting
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // The last task finishes. Task 0 running, 2 waiting.
+  tasks[1].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 5 threads. Task 0 and 2 running.
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)0, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[2].IsSleeping());
+
+  // Change number of threads a couple of times while there is no sufficient
+  // tasks.
+  env_->SetBackgroundThreads(7, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  tasks[2].WakeUp();
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros * 50);
+
+  // Enqueue 5 more tasks. Thread pool size now is 4.
+  // Task 0, 3, 4, 5 running;6, 7 waiting.
+  for (size_t i = 3; i < 8; i++) {
+    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+  }
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[3].IsSleeping());
+  ASSERT_TRUE(tasks[4].IsSleeping());
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Wake up task 0, 3 and 4. Task 5, 6, 7 running.
+  tasks[0].WakeUp();
+  tasks[3].WakeUp();
+  tasks[4].WakeUp();
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)0, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  for (size_t i = 5; i < 8; i++) {
+    ASSERT_TRUE(tasks[i].IsSleeping());
+  }
+
+  // Shrink back to 1 thread. Still task 5, 6, 7 running
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up task  6. Task 5, 7 running
+  tasks[6].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up threads 7. Task 5 running
+  tasks[7].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Enqueue thread 8 and 9. Task 5 running; one of 8, 9 might be running.
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[8],
+                 Env::Priority::HIGH);
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[9],
+                 Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_GT(env_->GetThreadPoolQueueLen(Env::Priority::HIGH), (unsigned int)0);
+  ASSERT_TRUE(!tasks[8].IsSleeping() || !tasks[9].IsSleeping());
+
+  // Increase to 4 threads. Task 5, 8, 9 running.
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)0, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[8].IsSleeping());
+  ASSERT_TRUE(tasks[9].IsSleeping());
+
+  // Shrink to 1 thread
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+
+  // Wake up thread 9.
+  tasks[9].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[9].IsSleeping());
+  ASSERT_TRUE(tasks[8].IsSleeping());
+
+  // Wake up thread 8
+  tasks[8].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[8].IsSleeping());
+
+  // Wake up the last thread
+  tasks[5].WakeUp();
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[5].IsSleeping());
 }
 
 #ifdef OS_LINUX
-// To make sure the Env::GetUniqueId() related tests work correctly, The files
-// should be stored in regular storage like "hard disk" or "flash device".
-// Otherwise we cannot get the correct id.
-//
-// The following function act as the replacement of test::TmpDir() that may be
-// customized by user to be on a storage that doesn't work with GetUniqueId().
-//
-// TODO(kailiu) This function still assumes /tmp/<test-dir> reside in regular
-// storage system.
+// Travis doesn't support fallocate or getting unique ID from files for whatever
+// reason.
+#ifndef TRAVIS
+
 namespace {
 bool IsSingleVarint(const std::string& s) {
   Slice slice(s);
@@ -229,22 +493,100 @@ bool IsUniqueIDValid(const std::string& s) {
 const size_t MAX_ID_SIZE = 100;
 char temp_id[MAX_ID_SIZE];
 
-std::string GetOnDiskTestDir() {
-  char base[100];
-  snprintf(base, sizeof(base), "/tmp/rocksdbtest-%d",
-           static_cast<int>(geteuid()));
-  // Directory may already exist
-  Env::Default()->CreateDirIfMissing(base);
 
-  return base;
-}
 }  // namespace
 
+// Determine whether we can use the FS_IOC_GETVERSION ioctl
+// on a file in directory DIR.  Create a temporary file therein,
+// try to apply the ioctl (save that result), cleanup and
+// return the result.  Return true if it is supported, and
+// false if anything fails.
+// Note that this function "knows" that dir has just been created
+// and is empty, so we create a simply-named test file: "f".
+bool ioctl_support__FS_IOC_GETVERSION(const std::string& dir) {
+  const std::string file = dir + "/f";
+  int fd;
+  do {
+    fd = open(file.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+  } while (fd < 0 && errno == EINTR);
+  long int version;
+  bool ok = (fd >= 0 && ioctl(fd, FS_IOC_GETVERSION, &version) >= 0);
+
+  close(fd);
+  unlink(file.c_str());
+
+  return ok;
+}
+
+// To ensure that Env::GetUniqueId-related tests work correctly, the files
+// should be stored in regular storage like "hard disk" or "flash device",
+// and not on a tmpfs file system (like /dev/shm and /tmp on some systems).
+// Otherwise we cannot get the correct id.
+//
+// This function serves as the replacement for test::TmpDir(), which may be
+// customized to be on a file system that doesn't work with GetUniqueId().
+
+class IoctlFriendlyTmpdir {
+ public:
+  explicit IoctlFriendlyTmpdir() {
+    char dir_buf[100];
+    std::list<std::string> candidate_dir_list = {"/var/tmp", "/tmp"};
+
+    const char *fmt = "%s/rocksdb.XXXXXX";
+    const char *tmp = getenv("TEST_IOCTL_FRIENDLY_TMPDIR");
+    // If $TEST_IOCTL_FRIENDLY_TMPDIR/rocksdb.XXXXXX fits, use
+    // $TEST_IOCTL_FRIENDLY_TMPDIR; subtract 2 for the "%s", and
+    // add 1 for the trailing NUL byte.
+    if (tmp && strlen(tmp) + strlen(fmt) - 2 + 1 <= sizeof dir_buf) {
+      // use $TEST_IOCTL_FRIENDLY_TMPDIR value
+      candidate_dir_list.push_front(tmp);
+    }
+
+    for (const std::string& d : candidate_dir_list) {
+      snprintf(dir_buf, sizeof dir_buf, fmt, d.c_str());
+      if (mkdtemp(dir_buf)) {
+        if (ioctl_support__FS_IOC_GETVERSION(dir_buf)) {
+          dir_ = dir_buf;
+          return;
+        } else {
+          // Diagnose ioctl-related failure only if this is the
+          // directory specified via that envvar.
+          if (tmp == d) {
+            fprintf(stderr, "TEST_IOCTL_FRIENDLY_TMPDIR-specified directory is "
+                    "not suitable: %s\n", d.c_str());
+          }
+          rmdir(dir_buf);  // ignore failure
+        }
+      } else {
+        // mkdtemp failed: diagnose it, but don't give up.
+        fprintf(stderr, "mkdtemp(%s/...) failed: %s\n", d.c_str(),
+                strerror(errno));
+      }
+    }
+
+    fprintf(stderr, "failed to find an ioctl-friendly temporary directory;"
+            " specify one via the TEST_IOCTL_FRIENDLY_TMPDIR envvar\n");
+    std::abort();
+  }
+
+  ~IoctlFriendlyTmpdir() {
+    rmdir(dir_.c_str());
+  }
+  const std::string& name() {
+    return dir_;
+  }
+
+ private:
+  std::string dir_;
+};
+
+
 // Only works in linux platforms
-TEST(EnvPosixTest, RandomAccessUniqueID) {
+TEST_F(EnvPosixTest, RandomAccessUniqueID) {
   // Create file.
   const EnvOptions soptions;
-  std::string fname = GetOnDiskTestDir() + "/" + "testfile";
+  IoctlFriendlyTmpdir ift;
+  std::string fname = ift.name() + "/testfile";
   unique_ptr<WritableFile> wfile;
   ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
 
@@ -282,8 +624,33 @@ TEST(EnvPosixTest, RandomAccessUniqueID) {
 
 // only works in linux platforms
 #ifdef ROCKSDB_FALLOCATE_PRESENT
-TEST(EnvPosixTest, AllocateTest) {
-  std::string fname = GetOnDiskTestDir() + "/preallocate_testfile";
+TEST_F(EnvPosixTest, AllocateTest) {
+  IoctlFriendlyTmpdir ift;
+  std::string fname = ift.name() + "/preallocate_testfile";
+
+  // Try fallocate in a file to see whether the target file system supports it.
+  // Skip the test if fallocate is not supported.
+  std::string fname_test_fallocate = ift.name() + "/preallocate_testfile_2";
+  int fd = -1;
+  do {
+    fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+  } while (fd < 0 && errno == EINTR);
+  ASSERT_GT(fd, 0);
+
+  int alloc_status = fallocate(fd, 0, 0, 1);
+
+  int err_number = 0;
+  if (alloc_status != 0) {
+    err_number = errno;
+    fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number));
+  }
+  close(fd);
+  ASSERT_OK(env_->DeleteFile(fname_test_fallocate));
+  if (alloc_status != 0 && err_number == EOPNOTSUPP) {
+    // The filesystem containing the file does not support fallocate
+    return;
+  }
+
   EnvOptions soptions;
   soptions.use_mmap_writes = false;
   unique_ptr<WritableFile> wfile;
@@ -292,7 +659,8 @@ TEST(EnvPosixTest, AllocateTest) {
   // allocate 100 MB
   size_t kPreallocateSize = 100 * 1024 * 1024;
   size_t kBlockSize = 512;
-  std::string data = "test";
+  size_t kPageSize = 4096;
+  std::string data(1024 * 1024, 'a');
   wfile->SetPreallocationBlockSize(kPreallocateSize);
   ASSERT_OK(wfile->Append(Slice(data)));
   ASSERT_OK(wfile->Flush());
@@ -305,8 +673,7 @@ TEST(EnvPosixTest, AllocateTest) {
   // we only require that number of allocated blocks is at least what we expect.
   // It looks like some FS give us more blocks that we asked for. That's fine.
   // It might be worth investigating further.
-  auto st_blocks = f_stat.st_blocks;
-  ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), st_blocks);
+  ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks);
 
   // close the file, should deallocate the blocks
   wfile.reset();
@@ -314,9 +681,11 @@ TEST(EnvPosixTest, AllocateTest) {
   stat(fname.c_str(), &f_stat);
   ASSERT_EQ((unsigned int)data.size(), f_stat.st_size);
   // verify that preallocated blocks were deallocated on file close
-  ASSERT_GT(st_blocks, f_stat.st_blocks);
+  // Because the FS might give us more blocks, we add a full page to the size
+  // and expect the number of blocks to be less or equal to that.
+  ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize, (unsigned int)f_stat.st_blocks);
 }
-#endif
+#endif  // ROCKSDB_FALLOCATE_PRESENT
 
 // Returns true if any of the strings in ss are the prefix of another string.
 bool HasPrefix(const std::unordered_set<std::string>& ss) {
@@ -334,14 +703,15 @@ bool HasPrefix(const std::unordered_set<std::string>& ss) {
 }
 
 // Only works in linux platforms
-TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
+TEST_F(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
   // Check whether a bunch of concurrently existing files have unique IDs.
   const EnvOptions soptions;
 
   // Create the files
+  IoctlFriendlyTmpdir ift;
   std::vector<std::string> fnames;
   for (int i = 0; i < 1000; ++i) {
-    fnames.push_back(GetOnDiskTestDir() + "/" + "testfile" + std::to_string(i));
+    fnames.push_back(ift.name() + "/" + "testfile" + ToString(i));
 
     // Create file.
     unique_ptr<WritableFile> wfile;
@@ -372,10 +742,11 @@ TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
 }
 
 // Only works in linux platforms
-TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) {
+TEST_F(EnvPosixTest, RandomAccessUniqueIDDeletes) {
   const EnvOptions soptions;
 
-  std::string fname = GetOnDiskTestDir() + "/" + "testfile";
+  IoctlFriendlyTmpdir ift;
+  std::string fname = ift.name() + "/" + "testfile";
 
   // Check that after file is deleted we don't get same ID again in a new file.
   std::unordered_set<std::string> ids;
@@ -408,7 +779,7 @@ TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) {
 }
 
 // Only works in linux platforms
-TEST(EnvPosixTest, InvalidateCache) {
+TEST_F(EnvPosixTest, InvalidateCache) {
   const EnvOptions soptions;
   std::string fname = test::TmpDir() + "/" + "testfile";
 
@@ -447,9 +818,10 @@ TEST(EnvPosixTest, InvalidateCache) {
   // Delete the file
   ASSERT_OK(env_->DeleteFile(fname));
 }
-#endif
+#endif  // not TRAVIS
+#endif  // OS_LINUX
 
-TEST(EnvPosixTest, PosixRandomRWFileTest) {
+TEST_F(EnvPosixTest, PosixRandomRWFileTest) {
   EnvOptions soptions;
   soptions.use_mmap_writes = soptions.use_mmap_reads = false;
   std::string fname = test::TmpDir() + "/" + "testfile";
@@ -473,6 +845,7 @@ TEST(EnvPosixTest, PosixRandomRWFileTest) {
 
 class TestLogger : public Logger {
  public:
+  using Logger::Logv;
   virtual void Logv(const char* format, va_list ap) override {
     log_count++;
 
@@ -506,7 +879,7 @@ class TestLogger : public Logger {
   int char_0_count;
 };
 
-TEST(EnvPosixTest, LogBufferTest) {
+TEST_F(EnvPosixTest, LogBufferTest) {
   TestLogger test_logger;
   test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
   test_logger.log_count = 0;
@@ -543,8 +916,75 @@ TEST(EnvPosixTest, LogBufferTest) {
   ASSERT_EQ(10, test_logger.char_x_count);
 }
 
+class TestLogger2 : public Logger {
+ public:
+  explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {}
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {
+    char new_format[2000];
+    std::fill_n(new_format, sizeof(new_format), '2');
+    {
+      va_list backup_ap;
+      va_copy(backup_ap, ap);
+      int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
+      // 48 bytes for extra information + bytes allocated
+      ASSERT_TRUE(
+          n <= 48 + static_cast<int>(max_log_size_ - sizeof(struct timeval)));
+      ASSERT_TRUE(n > static_cast<int>(max_log_size_ - sizeof(struct timeval)));
+      va_end(backup_ap);
+    }
+  }
+  size_t max_log_size_;
+};
+
+TEST_F(EnvPosixTest, LogBufferMaxSizeTest) {
+  char bytes9000[9000];
+  std::fill_n(bytes9000, sizeof(bytes9000), '1');
+  bytes9000[sizeof(bytes9000) - 1] = '\0';
+
+  for (size_t max_log_size = 256; max_log_size <= 1024;
+       max_log_size += 1024 - 256) {
+    TestLogger2 test_logger(max_log_size);
+    test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger);
+    LogToBuffer(&log_buffer, max_log_size, "%s", bytes9000);
+    log_buffer.FlushBufferToLog();
+  }
+}
+
+TEST_F(EnvPosixTest, Preallocation) {
+  const std::string src = test::TmpDir() + "/" + "testfile";
+  unique_ptr<WritableFile> srcfile;
+  const EnvOptions soptions;
+  ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
+  srcfile->SetPreallocationBlockSize(1024 * 1024);
+
+  // No writes should mean no preallocation
+  size_t block_size, last_allocated_block;
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 0UL);
+
+  // Small write should preallocate one block
+  srcfile->Append("test");
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 1UL);
+
+  // Write an entire preallocation block, make sure we increased by two.
+  std::string buf(block_size, ' ');
+  srcfile->Append(buf);
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 2UL);
+
+  // Write five more blocks at once, ensure we're where we need to be.
+  buf = std::string(block_size * 5, ' ');
+  srcfile->Append(buf);
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 7UL);
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/event_logger.cc b/src/rocksdb/util/event_logger.cc
new file mode 100644
index 0000000..fdecb8e
--- /dev/null
+++ b/src/rocksdb/util/event_logger.cc
@@ -0,0 +1,46 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "util/event_logger.h"
+
+#include <inttypes.h>
+#include <cassert>
+#include <sstream>
+#include <string>
+
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+const char* kEventLoggerPrefix = "EVENT_LOG_v1";
+
+EventLoggerStream::EventLoggerStream(Logger* logger)
+    : logger_(logger), log_buffer_(nullptr), json_writter_(nullptr) {}
+
+EventLoggerStream::EventLoggerStream(LogBuffer* log_buffer)
+    : logger_(nullptr), log_buffer_(log_buffer), json_writter_(nullptr) {}
+
+EventLoggerStream::~EventLoggerStream() {
+  if (json_writter_) {
+    json_writter_->EndObject();
+#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
+    printf("%s\n", json_writter_->Get().c_str());
+#else
+    if (logger_) {
+      Log(logger_, "%s %s", kEventLoggerPrefix, json_writter_->Get().c_str());
+    } else if (log_buffer_) {
+      LogToBuffer(log_buffer_, "%s %s", kEventLoggerPrefix,
+                  json_writter_->Get().c_str());
+    }
+#endif
+    delete json_writter_;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/event_logger.h b/src/rocksdb/util/event_logger.h
new file mode 100644
index 0000000..806b4e5
--- /dev/null
+++ b/src/rocksdb/util/event_logger.h
@@ -0,0 +1,170 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <chrono>
+
+#include "rocksdb/env.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+// JSONWritter doesn't support objects in arrays yet. There wasn't a need for
+// that.
+class JSONWritter {
+ public:
+  JSONWritter() : state_(kExpectKey), first_element_(true) { stream_ << "{"; }
+
+  void AddKey(const std::string& key) {
+    assert(state_ == kExpectKey);
+    if (!first_element_) {
+      stream_ << ", ";
+    }
+    stream_ << "\"" << key << "\": ";
+    state_ = kExpectValue;
+    first_element_ = false;
+  }
+
+  void AddValue(const char* value) {
+    assert(state_ == kExpectValue || state_ == kInArray);
+    if (state_ == kInArray && !first_element_) {
+      stream_ << ", ";
+    }
+    stream_ << "\"" << value << "\"";
+    if (state_ != kInArray) {
+      state_ = kExpectKey;
+    }
+    first_element_ = false;
+  }
+
+  template <typename T>
+  void AddValue(const T& value) {
+    assert(state_ == kExpectValue || state_ == kInArray);
+    if (state_ == kInArray && !first_element_) {
+      stream_ << ", ";
+    }
+    stream_ << value;
+    if (state_ != kInArray) {
+      state_ = kExpectKey;
+    }
+    first_element_ = false;
+  }
+
+  void StartArray() {
+    assert(state_ == kExpectValue);
+    state_ = kInArray;
+    stream_ << "[";
+    first_element_ = true;
+  }
+
+  void EndArray() {
+    assert(state_ == kInArray);
+    state_ = kExpectKey;
+    stream_ << "]";
+    first_element_ = false;
+  }
+
+  void StartObject() {
+    assert(state_ == kExpectValue);
+    state_ = kExpectKey;
+    stream_ << "{";
+    first_element_ = true;
+  }
+
+  void EndObject() {
+    assert(state_ == kExpectKey);
+    stream_ << "}";
+    first_element_ = false;
+  }
+
+  std::string Get() const { return stream_.str(); }
+
+  JSONWritter& operator<<(const char* val) {
+    if (state_ == kExpectKey) {
+      AddKey(val);
+    } else {
+      AddValue(val);
+    }
+    return *this;
+  }
+
+  JSONWritter& operator<<(const std::string& val) {
+    return *this << val.c_str();
+  }
+
+  template <typename T>
+  JSONWritter& operator<<(const T& val) {
+    assert(state_ != kExpectKey);
+    AddValue(val);
+    return *this;
+  }
+
+ private:
+  enum JSONWritterState {
+    kExpectKey,
+    kExpectValue,
+    kInArray,
+  };
+  JSONWritterState state_;
+  bool first_element_;
+  std::ostringstream stream_;
+};
+
+class EventLoggerStream {
+ public:
+  template <typename T>
+  EventLoggerStream& operator<<(const T& val) {
+    MakeStream();
+    *json_writter_ << val;
+    return *this;
+  }
+
+  void StartArray() { json_writter_->StartArray(); }
+  void EndArray() { json_writter_->EndArray(); }
+  void StartObject() { json_writter_->StartObject(); }
+  void EndObject() { json_writter_->EndObject(); }
+
+  ~EventLoggerStream();
+
+ private:
+  void MakeStream() {
+    if (!json_writter_) {
+      json_writter_ = new JSONWritter();
+      *this << "time_micros"
+            << std::chrono::duration_cast<std::chrono::microseconds>(
+                   std::chrono::system_clock::now().time_since_epoch()).count();
+    }
+  }
+  friend class EventLogger;
+  explicit EventLoggerStream(Logger* logger);
+  explicit EventLoggerStream(LogBuffer* log_buffer);
+  // exactly one is non-nullptr
+  Logger* const logger_;
+  LogBuffer* const log_buffer_;
+  // ownership
+  JSONWritter* json_writter_;
+};
+
+// here is an example of the output that will show up in the LOG:
+// 2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros":
+// 1421360005788015, "event": "table_file_creation", "file_number": 12,
+// "file_size": 1909699}
+class EventLogger {
+ public:
+  explicit EventLogger(Logger* logger) : logger_(logger) {}
+  EventLoggerStream Log() { return EventLoggerStream(logger_); }
+  EventLoggerStream LogToBuffer(LogBuffer* log_buffer) {
+    return EventLoggerStream(log_buffer);
+  }
+
+ private:
+  Logger* logger_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/event_logger_test.cc b/src/rocksdb/util/event_logger_test.cc
new file mode 100644
index 0000000..1aad0ac
--- /dev/null
+++ b/src/rocksdb/util/event_logger_test.cc
@@ -0,0 +1,43 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <string>
+
+#include "util/event_logger.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class EventLoggerTest : public testing::Test {};
+
+class StringLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {
+    vsnprintf(buffer_, sizeof(buffer_), format, ap);
+  }
+  char* buffer() { return buffer_; }
+
+ private:
+  char buffer_[1000];
+};
+
+TEST_F(EventLoggerTest, SimpleTest) {
+  StringLogger logger;
+  EventLogger event_logger(&logger);
+  event_logger.Log() << "id" << 5 << "event"
+                     << "just_testing";
+  std::string output(logger.buffer());
+  ASSERT_TRUE(output.find("\"event\": \"just_testing\"") != std::string::npos);
+  ASSERT_TRUE(output.find("\"id\": 5") != std::string::npos);
+  ASSERT_TRUE(output.find("\"time_micros\"") != std::string::npos);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/file_util.cc b/src/rocksdb/util/file_util.cc
new file mode 100644
index 0000000..c75d59c
--- /dev/null
+++ b/src/rocksdb/util/file_util.cc
@@ -0,0 +1,59 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <string>
+#include <algorithm>
+#include "util/file_util.h"
+#include "rocksdb/env.h"
+#include "db/filename.h"
+
+namespace rocksdb {
+
+// Utility function to copy a file up to a specified length
+Status CopyFile(Env* env, const std::string& source,
+                const std::string& destination, uint64_t size) {
+  const EnvOptions soptions;
+  unique_ptr<SequentialFile> srcfile;
+  Status s;
+  s = env->NewSequentialFile(source, &srcfile, soptions);
+  unique_ptr<WritableFile> destfile;
+  if (s.ok()) {
+    s = env->NewWritableFile(destination, &destfile, soptions);
+  } else {
+    return s;
+  }
+
+  if (size == 0) {
+    // default argument means copy everything
+    if (s.ok()) {
+      s = env->GetFileSize(source, &size);
+    } else {
+      return s;
+    }
+  }
+
+  char buffer[4096];
+  Slice slice;
+  while (size > 0) {
+    uint64_t bytes_to_read =
+        std::min(static_cast<uint64_t>(sizeof(buffer)), size);
+    if (s.ok()) {
+      s = srcfile->Read(bytes_to_read, &slice, buffer);
+    }
+    if (s.ok()) {
+      if (slice.size() == 0) {
+        return Status::Corruption("file too small");
+      }
+      s = destfile->Append(slice);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+    size -= slice.size();
+  }
+  return Status::OK();
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/file_util.h b/src/rocksdb/util/file_util.h
new file mode 100644
index 0000000..84b3734
--- /dev/null
+++ b/src/rocksdb/util/file_util.h
@@ -0,0 +1,18 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <string>
+
+#pragma once
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+extern Status CopyFile(Env* env, const std::string& source,
+                       const std::string& destination, uint64_t size = 0);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/filelock_test.cc b/src/rocksdb/util/filelock_test.cc
index a9e30a5..33362f8 100644
--- a/src/rocksdb/util/filelock_test.cc
+++ b/src/rocksdb/util/filelock_test.cc
@@ -12,7 +12,7 @@
 
 namespace rocksdb {
 
-class LockTest {
+class LockTest : public testing::Test {
  public:
   static LockTest* current_;
   std::string file_;
@@ -36,7 +36,7 @@ class LockTest {
 };
 LockTest* LockTest::current_;
 
-TEST(LockTest, LockBySameThread) {
+TEST_F(LockTest, LockBySameThread) {
   FileLock* lock1;
   FileLock* lock2;
 
@@ -54,5 +54,6 @@ TEST(LockTest, LockBySameThread) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/hash.cc b/src/rocksdb/util/hash.cc
index e38c186..427f0d1 100644
--- a/src/rocksdb/util/hash.cc
+++ b/src/rocksdb/util/hash.cc
@@ -18,7 +18,7 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
   const uint32_t m = 0xc6a4a793;
   const uint32_t r = 24;
   const char* limit = data + n;
-  uint32_t h = seed ^ (n * m);
+  uint32_t h = static_cast<uint32_t>(seed ^ (n * m));
 
   // Pick up four bytes at a time
   while (data + 4 <= limit) {
@@ -31,14 +31,26 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
 
   // Pick up remaining bytes
   switch (limit - data) {
+    // Note: It would be better if this was cast to unsigned char, but that
+    // would be a disk format change since we previously didn't have any cast
+    // at all (so gcc used signed char).
+    // To understand the difference between shifting unsigned and signed chars,
+    // let's use 250 as an example. unsigned char will be 250, while signed char
+    // will be -6. Bit-wise, they are equivalent: 11111010. However, when
+    // converting negative number (signed char) to int, it will be converted
+    // into negative int (of equivalent value, which is -6), while converting
+    // positive number (unsigned char) will be converted to 250. Bitwise,
+    // this looks like this:
+    // signed char 11111010 -> int 11111111111111111111111111111010
+    // unsigned char 11111010 -> int 00000000000000000000000011111010
     case 3:
-      h += data[2] << 16;
-      // fall through
+      h += static_cast<uint32_t>(static_cast<signed char>(data[2]) << 16);
+    // fall through
     case 2:
-      h += data[1] << 8;
-      // fall through
+      h += static_cast<uint32_t>(static_cast<signed char>(data[1]) << 8);
+    // fall through
     case 1:
-      h += data[0];
+      h += static_cast<uint32_t>(static_cast<signed char>(data[0]));
       h *= m;
       h ^= (h >> r);
       break;
diff --git a/src/rocksdb/util/hash.h b/src/rocksdb/util/hash.h
index c9eb659..cab8d46 100644
--- a/src/rocksdb/util/hash.h
+++ b/src/rocksdb/util/hash.h
@@ -17,4 +17,12 @@ namespace rocksdb {
 
 extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
 
+inline uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
 }
+
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/hash_cuckoo_rep.cc b/src/rocksdb/util/hash_cuckoo_rep.cc
index d10bc5d..3ac5ba7 100644
--- a/src/rocksdb/util/hash_cuckoo_rep.cc
+++ b/src/rocksdb/util/hash_cuckoo_rep.cc
@@ -1,4 +1,3 @@
-
 //  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
@@ -53,25 +52,26 @@ struct CuckooStep {
 class HashCuckooRep : public MemTableRep {
  public:
   explicit HashCuckooRep(const MemTableRep::KeyComparator& compare,
-                         Arena* arena, const size_t bucket_count,
+                         MemTableAllocator* allocator,
+                         const size_t bucket_count,
                          const unsigned int hash_func_count)
-      : MemTableRep(arena),
+      : MemTableRep(allocator),
         compare_(compare),
-        arena_(arena),
+        allocator_(allocator),
         bucket_count_(bucket_count),
         cuckoo_path_max_depth_(kDefaultCuckooPathMaxDepth),
         occupied_count_(0),
         hash_function_count_(hash_func_count),
         backup_table_(nullptr) {
     char* mem = reinterpret_cast<char*>(
-        arena_->Allocate(sizeof(std::atomic<const char*>) * bucket_count_));
+        allocator_->Allocate(sizeof(std::atomic<const char*>) * bucket_count_));
     cuckoo_array_ = new (mem) std::atomic<const char*>[bucket_count_];
     for (unsigned int bid = 0; bid < bucket_count_; ++bid) {
       cuckoo_array_[bid].store(nullptr, std::memory_order_relaxed);
     }
 
     cuckoo_path_ = reinterpret_cast<int*>(
-        arena_->Allocate(sizeof(int*) * (cuckoo_path_max_depth_ + 1)));
+        allocator_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1)));
     is_nearly_full_ = false;
   }
 
@@ -182,8 +182,8 @@ class HashCuckooRep : public MemTableRep {
 
  private:
   const MemTableRep::KeyComparator& compare_;
-  // the pointer to Arena to allocate memory, immutable after construction.
-  Arena* const arena_;
+  // the pointer to Allocator to allocate memory, immutable after construction.
+  MemTableAllocator* const allocator_;
   // the number of hash bucket in the hash table.
   const size_t bucket_count_;
   // the maxinum depth of the cuckoo path.
@@ -214,9 +214,10 @@ class HashCuckooRep : public MemTableRep {
     static const int kMurmurHashSeeds[HashCuckooRepFactory::kMaxHashCount] = {
         545609244,  1769731426, 763324157,  13099088,   592422103,
         1899789565, 248369300,  1984183468, 1613664382, 1491157517};
-    return MurmurHash(slice.data(), slice.size(),
-                      kMurmurHashSeeds[hash_func_id]) %
-           bucket_count_;
+    return static_cast<unsigned int>(
+        MurmurHash(slice.data(), static_cast<int>(slice.size()),
+                   kMurmurHashSeeds[hash_func_id]) %
+        bucket_count_);
   }
 
   // A cuckoo path is a sequence of bucket ids, where each id points to a
@@ -245,13 +246,11 @@ class HashCuckooRep : public MemTableRep {
   bool QuickInsert(const char* internal_key, const Slice& user_key,
                    int bucket_ids[], const int initial_hash_id);
 
-  // Unhide default implementations of GetIterator
-  using MemTableRep::GetIterator;
   // Returns the pointer to the internal iterator to the buckets where buckets
   // are sorted according to the user specified KeyComparator.  Note that
   // any insert after this function call may affect the sorted nature of
   // the returned iterator.
-  virtual MemTableRep::Iterator* GetIterator() override {
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena) override {
     std::vector<const char*> compact_buckets;
     for (unsigned int bid = 0; bid < bucket_count_; ++bid) {
       const char* bucket = cuckoo_array_[bid].load(std::memory_order_relaxed);
@@ -266,10 +265,18 @@ class HashCuckooRep : public MemTableRep {
         compact_buckets.push_back(iter->key());
       }
     }
-    return new Iterator(
-        std::shared_ptr<std::vector<const char*>>(
-            new std::vector<const char*>(std::move(compact_buckets))),
-        compare_);
+    if (arena == nullptr) {
+      return new Iterator(
+          std::shared_ptr<std::vector<const char*>>(
+              new std::vector<const char*>(std::move(compact_buckets))),
+          compare_);
+    } else {
+      auto mem = arena->AllocateAligned(sizeof(Iterator));
+      return new (mem) Iterator(
+          std::shared_ptr<std::vector<const char*>>(
+              new std::vector<const char*>(std::move(compact_buckets))),
+          compare_);
+    }
   }
 };
 
@@ -314,7 +321,8 @@ void HashCuckooRep::Insert(KeyHandle handle) {
     // immutable.
     if (backup_table_.get() == nullptr) {
       VectorRepFactory factory(10);
-      backup_table_.reset(factory.CreateMemTableRep(compare_, arena_, nullptr));
+      backup_table_.reset(
+          factory.CreateMemTableRep(compare_, allocator_, nullptr, nullptr));
       is_nearly_full_ = true;
     }
     backup_table_->Insert(key);
@@ -594,8 +602,8 @@ void HashCuckooRep::Iterator::SeekToLast() {
 }  // anom namespace
 
 MemTableRep* HashCuckooRepFactory::CreateMemTableRep(
-    const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform* transform) {
+    const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+    const SliceTransform* transform, Logger* logger) {
   // The estimated average fullness.  The write performance of any close hash
   // degrades as the fullness of the mem-table increases.  Setting kFullness
   // to a value around 0.7 can better avoid write performance degradation while
@@ -613,7 +621,8 @@ MemTableRep* HashCuckooRepFactory::CreateMemTableRep(
   if (hash_function_count > kMaxHashCount) {
     hash_function_count = kMaxHashCount;
   }
-  return new HashCuckooRep(compare, arena, bucket_count, hash_function_count);
+  return new HashCuckooRep(compare, allocator, bucket_count,
+                           hash_function_count);
 }
 
 MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size,
diff --git a/src/rocksdb/util/hash_cuckoo_rep.h b/src/rocksdb/util/hash_cuckoo_rep.h
index 8f97ed4..9f374a9 100644
--- a/src/rocksdb/util/hash_cuckoo_rep.h
+++ b/src/rocksdb/util/hash_cuckoo_rep.h
@@ -28,8 +28,8 @@ class HashCuckooRepFactory : public MemTableRepFactory {
   virtual ~HashCuckooRepFactory() {}
 
   virtual MemTableRep* CreateMemTableRep(
-      const MemTableRep::KeyComparator& compare, Arena* arena,
-      const SliceTransform* transform) override;
+      const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+      const SliceTransform* transform, Logger* logger) override;
 
   virtual const char* Name() const override { return "HashCuckooRepFactory"; }
 
diff --git a/src/rocksdb/util/hash_linklist_rep.cc b/src/rocksdb/util/hash_linklist_rep.cc
index acd78c5..ea4cd99 100644
--- a/src/rocksdb/util/hash_linklist_rep.cc
+++ b/src/rocksdb/util/hash_linklist_rep.cc
@@ -7,12 +7,14 @@
 #ifndef ROCKSDB_LITE
 #include "util/hash_linklist_rep.h"
 
+#include <algorithm>
+#include <atomic>
 #include "rocksdb/memtablerep.h"
 #include "util/arena.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "port/port.h"
-#include "port/atomic_pointer.h"
+#include "util/histogram.h"
 #include "util/murmurhash.h"
 #include "db/memtable.h"
 #include "db/skiplist.h"
@@ -21,6 +23,44 @@ namespace rocksdb {
 namespace {
 
 typedef const char* Key;
+typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList;
+typedef std::atomic<void*> Pointer;
+
+// A data structure used as the header of a link list of a hash bucket.
+struct BucketHeader {
+  Pointer next;
+  std::atomic<uint32_t> num_entries;
+
+  explicit BucketHeader(void* n, uint32_t count)
+      : next(n), num_entries(count) {}
+
+  bool IsSkipListBucket() {
+    return next.load(std::memory_order_relaxed) == this;
+  }
+
+  uint32_t GetNumEntries() const {
+    return num_entries.load(std::memory_order_relaxed);
+  }
+
+  // REQUIRES: called from single-threaded Insert()
+  void IncNumEntries() {
+    // Only one thread can do write at one time. No need to do atomic
+    // incremental. Update it with relaxed load and store.
+    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
+  }
+};
+
+// A data structure used as the header of a skip list of a hash bucket.
+struct SkipListBucketHeader {
+  BucketHeader Counting_header;
+  MemtableSkipList skip_list;
+
+  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
+                                MemTableAllocator* allocator, uint32_t count)
+      : Counting_header(this,  // Pointing to itself to indicate header type.
+                        count),
+        skip_list(cmp, allocator) {}
+};
 
 struct Node {
   // Accessors/mutators for links.  Wrapped in methods so we can
@@ -28,33 +68,98 @@ struct Node {
   Node* Next() {
     // Use an 'acquire load' so that we observe a fully initialized
     // version of the returned Node.
-    return reinterpret_cast<Node*>(next_.Acquire_Load());
+    return next_.load(std::memory_order_acquire);
   }
   void SetNext(Node* x) {
     // Use a 'release store' so that anybody who reads through this
     // pointer observes a fully initialized version of the inserted node.
-    next_.Release_Store(x);
+    next_.store(x, std::memory_order_release);
   }
   // No-barrier variants that can be safely used in a few locations.
   Node* NoBarrier_Next() {
-    return reinterpret_cast<Node*>(next_.NoBarrier_Load());
+    return next_.load(std::memory_order_relaxed);
   }
 
-  void NoBarrier_SetNext(Node* x) {
-    next_.NoBarrier_Store(x);
-  }
+  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }
 
  private:
-  port::AtomicPointer next_;
+  std::atomic<Node*> next_;
+
  public:
   char key[0];
 };
 
+// Memory structure of the mem table:
+// It is a hash table, each bucket points to one entry, a linked list or a
+// skip list. In order to track total number of records in a bucket to determine
+// whether should switch to skip list, a header is added just to indicate
+// number of entries in the bucket.
+//
+//
+//          +-----> NULL    Case 1. Empty bucket
+//          |
+//          |
+//          | +---> +-------+
+//          | |     | Next  +--> NULL
+//          | |     +-------+
+//  +-----+ | |     |       |  Case 2. One Entry in bucket.
+//  |     +-+ |     | Data  |          next pointer points to
+//  +-----+   |     |       |          NULL. All other cases
+//  |     |   |     |       |          next pointer is not NULL.
+//  +-----+   |     +-------+
+//  |     +---+
+//  +-----+     +-> +-------+  +> +-------+  +-> +-------+
+//  |     |     |   | Next  +--+  | Next  +--+   | Next  +-->NULL
+//  +-----+     |   +-------+     +-------+      +-------+
+//  |     +-----+   | Count |     |       |      |       |
+//  +-----+         +-------+     | Data  |      | Data  |
+//  |     |                       |       |      |       |
+//  +-----+          Case 3.      |       |      |       |
+//  |     |          A header     +-------+      +-------+
+//  +-----+          points to
+//  |     |          a linked list. Count indicates total number
+//  +-----+          of rows in this bucket.
+//  |     |
+//  +-----+    +-> +-------+ <--+
+//  |     |    |   | Next  +----+
+//  +-----+    |   +-------+   Case 4. A header points to a skip
+//  |     +----+   | Count |           list and next pointer points to
+//  +-----+        +-------+           itself, to distinguish case 3 or 4.
+//  |     |        |       |           Count still is kept to indicates total
+//  +-----+        | Skip +-->         of entries in the bucket for debugging
+//  |     |        | List  |   Data    purpose.
+//  |     |        |      +-->
+//  +-----+        |       |
+//  |     |        +-------+
+//  +-----+
+//
+// We don't have data race when changing cases because:
+// (1) When changing from case 2->3, we create a new bucket header, put the
+//     single node there first without changing the original node, and do a
+//     release store when changing the bucket pointer. In that case, a reader
+//     who sees a stale value of the bucket pointer will read this node, while
+//     a reader sees the correct value because of the release store.
+// (2) When changing case 3->4, a new header is created with skip list points
+//     to the data, before doing an acquire store to change the bucket pointer.
+//     The old header and nodes are never changed, so any reader sees any
+//     of those existing pointers will guarantee to be able to iterate to the
+//     end of the linked list.
+// (3) Header's next pointer in case 3 might change, but they are never equal
+//     to itself, so no matter a reader sees any stale or newer value, it will
+//     be able to correctly distinguish case 3 and 4.
+//
+// The reason that we use case 2 is we want to make the format to be efficient
+// when the utilization of buckets is relatively low. If we use case 3 for
+// single entry bucket, we will need to waste 12 bytes for every entry,
+// which can be significant decrease of memory utilization.
 class HashLinkListRep : public MemTableRep {
  public:
-  HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
-                  const SliceTransform* transform, size_t bucket_size,
-                  size_t huge_page_tlb_size);
+  HashLinkListRep(const MemTableRep::KeyComparator& compare,
+                  MemTableAllocator* allocator, const SliceTransform* transform,
+                  size_t bucket_size, uint32_t threshold_use_skiplist,
+                  size_t huge_page_tlb_size, Logger* logger,
+                  int bucket_entries_logging_threshold,
+                  bool if_log_bucket_dist_when_flash);
 
   virtual KeyHandle Allocate(const size_t len, char** buf) override;
 
@@ -70,42 +175,52 @@ class HashLinkListRep : public MemTableRep {
 
   virtual ~HashLinkListRep();
 
-  virtual MemTableRep::Iterator* GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
 
-  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
-
-  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator(
+       Arena* arena = nullptr) override;
 
  private:
   friend class DynamicIterator;
-  typedef SkipList<const char*, const MemTableRep::KeyComparator&> FullList;
 
   size_t bucket_size_;
 
   // Maps slices (which are transformed user keys) to buckets of keys sharing
   // the same transform.
-  port::AtomicPointer* buckets_;
+  Pointer* buckets_;
+
+  const uint32_t threshold_use_skiplist_;
 
   // The user-supplied transform whose domain is the user keys.
   const SliceTransform* transform_;
 
   const MemTableRep::KeyComparator& compare_;
 
-  bool BucketContains(Node* head, const Slice& key) const;
+  Logger* logger_;
+  int bucket_entries_logging_threshold_;
+  bool if_log_bucket_dist_when_flash_;
+
+  bool LinkListContains(Node* head, const Slice& key) const;
+
+  SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
+      const;
+
+  Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;
 
   Slice GetPrefix(const Slice& internal_key) const {
     return transform_->Transform(ExtractUserKey(internal_key));
   }
 
   size_t GetHash(const Slice& slice) const {
-    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
+    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0) %
+           bucket_size_;
   }
 
-  Node* GetBucket(size_t i) const {
-    return static_cast<Node*>(buckets_[i].Acquire_Load());
+  Pointer* GetBucket(size_t i) const {
+    return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
   }
 
-  Node* GetBucket(const Slice& slice) const {
+  Pointer* GetBucket(const Slice& slice) const {
     return GetBucket(GetHash(slice));
   }
 
@@ -113,7 +228,6 @@ class HashLinkListRep : public MemTableRep {
     return (compare_(b, a) == 0);
   }
 
-
   bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
 
   bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
@@ -131,40 +245,39 @@ class HashLinkListRep : public MemTableRep {
 
   class FullListIterator : public MemTableRep::Iterator {
    public:
-    explicit FullListIterator(FullList* list, Arena* arena)
-      : iter_(list), full_list_(list), arena_(arena) {}
+    explicit FullListIterator(MemtableSkipList* list, Allocator* allocator)
+        : iter_(list), full_list_(list), allocator_(allocator) {}
 
     virtual ~FullListIterator() {
     }
 
     // Returns true iff the iterator is positioned at a valid node.
-    virtual bool Valid() const {
-      return iter_.Valid();
-    }
+    virtual bool Valid() const override { return iter_.Valid(); }
 
     // Returns the key at the current position.
     // REQUIRES: Valid()
-    virtual const char* key() const {
+    virtual const char* key() const override {
       assert(Valid());
       return iter_.key();
     }
 
     // Advances to the next position.
     // REQUIRES: Valid()
-    virtual void Next() {
+    virtual void Next() override {
       assert(Valid());
       iter_.Next();
     }
 
     // Advances to the previous position.
     // REQUIRES: Valid()
-    virtual void Prev() {
+    virtual void Prev() override {
       assert(Valid());
       iter_.Prev();
     }
 
     // Advance to the first entry with a key >= target
-    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+    virtual void Seek(const Slice& internal_key,
+                      const char* memtable_key) override {
       const char* encoded_key =
           (memtable_key != nullptr) ?
               memtable_key : EncodeKey(&tmp_, internal_key);
@@ -173,69 +286,64 @@ class HashLinkListRep : public MemTableRep {
 
     // Position at the first entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToFirst() {
-      iter_.SeekToFirst();
-    }
+    virtual void SeekToFirst() override { iter_.SeekToFirst(); }
 
     // Position at the last entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToLast() {
-      iter_.SeekToLast();
-    }
+    virtual void SeekToLast() override { iter_.SeekToLast(); }
    private:
-    FullList::Iterator iter_;
+    MemtableSkipList::Iterator iter_;
     // To destruct with the iterator.
-    std::unique_ptr<FullList> full_list_;
-    std::unique_ptr<Arena> arena_;
+    std::unique_ptr<MemtableSkipList> full_list_;
+    std::unique_ptr<Allocator> allocator_;
     std::string tmp_;       // For passing to EncodeKey
   };
 
-  class Iterator : public MemTableRep::Iterator {
+  class LinkListIterator : public MemTableRep::Iterator {
    public:
-    explicit Iterator(const HashLinkListRep* const hash_link_list_rep,
-                      Node* head) :
-        hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) {
-    }
+    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
+                              Node* head)
+        : hash_link_list_rep_(hash_link_list_rep),
+          head_(head),
+          node_(nullptr) {}
 
-    virtual ~Iterator() {
-    }
+    virtual ~LinkListIterator() {}
 
     // Returns true iff the iterator is positioned at a valid node.
-    virtual bool Valid() const {
-      return node_ != nullptr;
-    }
+    virtual bool Valid() const override { return node_ != nullptr; }
 
     // Returns the key at the current position.
     // REQUIRES: Valid()
-    virtual const char* key() const {
+    virtual const char* key() const override {
       assert(Valid());
       return node_->key;
     }
 
     // Advances to the next position.
     // REQUIRES: Valid()
-    virtual void Next() {
+    virtual void Next() override {
       assert(Valid());
       node_ = node_->Next();
     }
 
     // Advances to the previous position.
     // REQUIRES: Valid()
-    virtual void Prev() {
+    virtual void Prev() override {
       // Prefix iterator does not support total order.
       // We simply set the iterator to invalid state
       Reset(nullptr);
     }
 
     // Advance to the first entry with a key >= target
-    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+    virtual void Seek(const Slice& internal_key,
+                      const char* memtable_key) override {
       node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_,
                                                               internal_key);
     }
 
     // Position at the first entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToFirst() {
+    virtual void SeekToFirst() override {
       // Prefix iterator does not support total order.
       // We simply set the iterator to invalid state
       Reset(nullptr);
@@ -243,7 +351,7 @@ class HashLinkListRep : public MemTableRep {
 
     // Position at the last entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToLast() {
+    virtual void SeekToLast() override {
       // Prefix iterator does not support total order.
       // We simply set the iterator to invalid state
       Reset(nullptr);
@@ -259,29 +367,74 @@ class HashLinkListRep : public MemTableRep {
     const HashLinkListRep* const hash_link_list_rep_;
     Node* head_;
     Node* node_;
-    std::string tmp_;       // For passing to EncodeKey
 
     virtual void SeekToHead() {
       node_ = head_;
     }
   };
 
-  class DynamicIterator : public HashLinkListRep::Iterator {
+  class DynamicIterator : public HashLinkListRep::LinkListIterator {
    public:
     explicit DynamicIterator(HashLinkListRep& memtable_rep)
-      : HashLinkListRep::Iterator(&memtable_rep, nullptr),
-        memtable_rep_(memtable_rep) {}
+        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
+          memtable_rep_(memtable_rep) {}
 
     // Advance to the first entry with a key >= target
-    virtual void Seek(const Slice& k, const char* memtable_key) {
+    virtual void Seek(const Slice& k, const char* memtable_key) override {
       auto transformed = memtable_rep_.GetPrefix(k);
-      Reset(memtable_rep_.GetBucket(transformed));
-      HashLinkListRep::Iterator::Seek(k, memtable_key);
+      auto* bucket = memtable_rep_.GetBucket(transformed);
+
+      SkipListBucketHeader* skip_list_header =
+          memtable_rep_.GetSkipListBucketHeader(bucket);
+      if (skip_list_header != nullptr) {
+        // The bucket is organized as a skip list
+        if (!skip_list_iter_) {
+          skip_list_iter_.reset(
+              new MemtableSkipList::Iterator(&skip_list_header->skip_list));
+        } else {
+          skip_list_iter_->SetList(&skip_list_header->skip_list);
+        }
+        if (memtable_key != nullptr) {
+          skip_list_iter_->Seek(memtable_key);
+        } else {
+          IterKey encoded_key;
+          encoded_key.EncodeLengthPrefixedKey(k);
+          skip_list_iter_->Seek(encoded_key.GetKey().data());
+        }
+      } else {
+        // The bucket is organized as a linked list
+        skip_list_iter_.reset();
+        Reset(memtable_rep_.GetLinkListFirstNode(bucket));
+        HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
+      }
+    }
+
+    virtual bool Valid() const override {
+      if (skip_list_iter_) {
+        return skip_list_iter_->Valid();
+      }
+      return HashLinkListRep::LinkListIterator::Valid();
+    }
+
+    virtual const char* key() const override {
+      if (skip_list_iter_) {
+        return skip_list_iter_->key();
+      }
+      return HashLinkListRep::LinkListIterator::key();
+    }
+
+    virtual void Next() override {
+      if (skip_list_iter_) {
+        skip_list_iter_->Next();
+      } else {
+        HashLinkListRep::LinkListIterator::Next();
+      }
     }
 
    private:
     // the underlying memtable
     const HashLinkListRep& memtable_rep_;
+    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
   };
 
   class EmptyIterator : public MemTableRep::Iterator {
@@ -289,36 +442,47 @@ class HashLinkListRep : public MemTableRep {
     // instantiating an empty bucket over which to iterate.
    public:
     EmptyIterator() { }
-    virtual bool Valid() const {
-      return false;
-    }
-    virtual const char* key() const {
+    virtual bool Valid() const override { return false; }
+    virtual const char* key() const override {
       assert(false);
       return nullptr;
     }
-    virtual void Next() { }
-    virtual void Prev() { }
-    virtual void Seek(const Slice& user_key, const char* memtable_key) { }
-    virtual void SeekToFirst() { }
-    virtual void SeekToLast() { }
+    virtual void Next() override {}
+    virtual void Prev() override {}
+    virtual void Seek(const Slice& user_key,
+                      const char* memtable_key) override {}
+    virtual void SeekToFirst() override {}
+    virtual void SeekToLast() override {}
+
    private:
   };
 };
 
 HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
-                                 Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size, size_t huge_page_tlb_size)
-    : MemTableRep(arena),
+                                 MemTableAllocator* allocator,
+                                 const SliceTransform* transform,
+                                 size_t bucket_size,
+                                 uint32_t threshold_use_skiplist,
+                                 size_t huge_page_tlb_size, Logger* logger,
+                                 int bucket_entries_logging_threshold,
+                                 bool if_log_bucket_dist_when_flash)
+    : MemTableRep(allocator),
       bucket_size_(bucket_size),
+      // Threshold to use skip list doesn't make sense if less than 3, so we
+      // force it to be minimum of 3 to simplify implementation.
+      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
       transform_(transform),
-      compare_(compare) {
-  char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
-                                      huge_page_tlb_size);
+      compare_(compare),
+      logger_(logger),
+      bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
+      if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {
+  char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size,
+                                      huge_page_tlb_size, logger);
 
-  buckets_ = new (mem) port::AtomicPointer[bucket_size];
+  buckets_ = new (mem) Pointer[bucket_size];
 
   for (size_t i = 0; i < bucket_size_; ++i) {
-    buckets_[i].NoBarrier_Store(nullptr);
+    buckets_[i].store(nullptr, std::memory_order_relaxed);
   }
 }
 
@@ -326,59 +490,173 @@ HashLinkListRep::~HashLinkListRep() {
 }
 
 KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
-  char* mem = arena_->AllocateAligned(sizeof(Node) + len);
+  char* mem = allocator_->AllocateAligned(sizeof(Node) + len);
   Node* x = new (mem) Node();
   *buf = x->key;
   return static_cast<void*>(x);
 }
 
+SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
+    Pointer* first_next_pointer) const {
+  if (first_next_pointer == nullptr) {
+    return nullptr;
+  }
+  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
+    // Single entry bucket
+    return nullptr;
+  }
+  // Counting header
+  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+  if (header->IsSkipListBucket()) {
+    assert(header->GetNumEntries() > threshold_use_skiplist_);
+    auto* skip_list_bucket_header =
+        reinterpret_cast<SkipListBucketHeader*>(header);
+    assert(skip_list_bucket_header->Counting_header.next.load(
+               std::memory_order_relaxed) == header);
+    return skip_list_bucket_header;
+  }
+  assert(header->GetNumEntries() <= threshold_use_skiplist_);
+  return nullptr;
+}
+
+Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
+  if (first_next_pointer == nullptr) {
+    return nullptr;
+  }
+  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
+    // Single entry bucket
+    return reinterpret_cast<Node*>(first_next_pointer);
+  }
+  // Counting header
+  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+  if (!header->IsSkipListBucket()) {
+    assert(header->GetNumEntries() <= threshold_use_skiplist_);
+    return reinterpret_cast<Node*>(
+        header->next.load(std::memory_order_acquire));
+  }
+  assert(header->GetNumEntries() > threshold_use_skiplist_);
+  return nullptr;
+}
+
 void HashLinkListRep::Insert(KeyHandle handle) {
   Node* x = static_cast<Node*>(handle);
   assert(!Contains(x->key));
   Slice internal_key = GetLengthPrefixedSlice(x->key);
   auto transformed = GetPrefix(internal_key);
   auto& bucket = buckets_[GetHash(transformed)];
-  Node* head = static_cast<Node*>(bucket.Acquire_Load());
+  Pointer* first_next_pointer =
+      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));
 
-  if (!head) {
+  if (first_next_pointer == nullptr) {
+    // Case 1. empty bucket
     // NoBarrier_SetNext() suffices since we will add a barrier when
     // we publish a pointer to "x" in prev[i].
     x->NoBarrier_SetNext(nullptr);
-    bucket.Release_Store(static_cast<void*>(x));
+    bucket.store(x, std::memory_order_release);
     return;
   }
 
-  Node* cur = head;
-  Node* prev = nullptr;
-  while (true) {
-    if (cur == nullptr) {
-      break;
-    }
-    Node* next = cur->Next();
-    // Make sure the lists are sorted.
-    // If x points to head_ or next points nullptr, it is trivially satisfied.
-    assert((cur == head) || (next == nullptr) ||
-           KeyIsAfterNode(next->key, cur));
-    if (KeyIsAfterNode(internal_key, cur)) {
-      // Keep searching in this list
-      prev = cur;
-      cur = next;
-    } else {
-      break;
+  BucketHeader* header = nullptr;
+  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
+    // Case 2. only one entry in the bucket
+    // Need to convert to a Counting bucket and turn to case 4.
+    Node* first = reinterpret_cast<Node*>(first_next_pointer);
+    // Need to add a bucket header.
+    // We have to first convert it to a bucket with header before inserting
+    // the new node. Otherwise, we might need to change next pointer of first.
+    // In that case, a reader might sees the next pointer is NULL and wrongly
+    // think the node is a bucket header.
+    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
+    header = new (mem) BucketHeader(first, 1);
+    bucket.store(header, std::memory_order_release);
+  } else {
+    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+    if (header->IsSkipListBucket()) {
+      // Case 4. Bucket is already a skip list
+      assert(header->GetNumEntries() > threshold_use_skiplist_);
+      auto* skip_list_bucket_header =
+          reinterpret_cast<SkipListBucketHeader*>(header);
+      // Only one thread can execute Insert() at one time. No need to do atomic
+      // incremental.
+      skip_list_bucket_header->Counting_header.IncNumEntries();
+      skip_list_bucket_header->skip_list.Insert(x->key);
+      return;
     }
   }
 
-  // Our data structure does not allow duplicate insertion
-  assert(cur == nullptr || !Equal(x->key, cur->key));
+  if (bucket_entries_logging_threshold_ > 0 &&
+      header->GetNumEntries() ==
+          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
+    Info(logger_,
+         "HashLinkedList bucket %zu has more than %d "
+         "entries. Key to insert: %s",
+         GetHash(transformed), header->GetNumEntries(),
+         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
+  }
 
-  // NoBarrier_SetNext() suffices since we will add a barrier when
-  // we publish a pointer to "x" in prev[i].
-  x->NoBarrier_SetNext(cur);
+  if (header->GetNumEntries() == threshold_use_skiplist_) {
+    // Case 3. number of entries reaches the threshold so need to convert to
+    // skip list.
+    LinkListIterator bucket_iter(
+        this, reinterpret_cast<Node*>(
+                  first_next_pointer->load(std::memory_order_relaxed)));
+    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
+    SkipListBucketHeader* new_skip_list_header = new (mem)
+        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
+    auto& skip_list = new_skip_list_header->skip_list;
+
+    // Add all current entries to the skip list
+    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
+      skip_list.Insert(bucket_iter.key());
+    }
 
-  if (prev) {
-    prev->SetNext(x);
+    // insert the new entry
+    skip_list.Insert(x->key);
+    // Set the bucket
+    bucket.store(new_skip_list_header, std::memory_order_release);
   } else {
-    bucket.Release_Store(static_cast<void*>(x));
+    // Case 5. Need to insert to the sorted linked list without changing the
+    // header.
+    Node* first =
+        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
+    assert(first != nullptr);
+    // Advance counter unless the bucket needs to be advanced to skip list.
+    // In that case, we need to make sure the previous count never exceeds
+    // threshold_use_skiplist_ to avoid readers to cast to wrong format.
+    header->IncNumEntries();
+
+    Node* cur = first;
+    Node* prev = nullptr;
+    while (true) {
+      if (cur == nullptr) {
+        break;
+      }
+      Node* next = cur->Next();
+      // Make sure the lists are sorted.
+      // If x points to head_ or next points nullptr, it is trivially satisfied.
+      assert((cur == first) || (next == nullptr) ||
+             KeyIsAfterNode(next->key, cur));
+      if (KeyIsAfterNode(internal_key, cur)) {
+        // Keep searching in this list
+        prev = cur;
+        cur = next;
+      } else {
+        break;
+      }
+    }
+
+    // Our data structure does not allow duplicate insertion
+    assert(cur == nullptr || !Equal(x->key, cur->key));
+
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(cur);
+
+    if (prev) {
+      prev->SetNext(x);
+    } else {
+      header->next.store(static_cast<void*>(x), std::memory_order_release);
+    }
   }
 }
 
@@ -390,11 +668,17 @@ bool HashLinkListRep::Contains(const char* key) const {
   if (bucket == nullptr) {
     return false;
   }
-  return BucketContains(bucket, internal_key);
+
+  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
+  if (skip_list_header != nullptr) {
+    return skip_list_header->skip_list.Contains(key);
+  } else {
+    return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
+  }
 }
 
 size_t HashLinkListRep::ApproximateMemoryUsage() {
-  // Memory is always allocated from the arena.
+  // Memory is always allocated from the allocator.
   return 0;
 }
 
@@ -402,44 +686,85 @@ void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
                           bool (*callback_func)(void* arg, const char* entry)) {
   auto transformed = transform_->Transform(k.user_key());
   auto bucket = GetBucket(transformed);
-  if (bucket != nullptr) {
-    Iterator iter(this, bucket);
-    for (iter.Seek(k.internal_key(), nullptr);
+
+  auto* skip_list_header = GetSkipListBucketHeader(bucket);
+  if (skip_list_header != nullptr) {
+    // Is a skip list
+    MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
+    for (iter.Seek(k.memtable_key().data());
          iter.Valid() && callback_func(callback_args, iter.key());
          iter.Next()) {
     }
+  } else {
+    auto* link_list_head = GetLinkListFirstNode(bucket);
+    if (link_list_head != nullptr) {
+      LinkListIterator iter(this, link_list_head);
+      for (iter.Seek(k.internal_key(), nullptr);
+           iter.Valid() && callback_func(callback_args, iter.key());
+           iter.Next()) {
+      }
+    }
   }
 }
 
-MemTableRep::Iterator* HashLinkListRep::GetIterator() {
+MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
   // allocate a new arena of similar size to the one currently in use
-  Arena* new_arena = new Arena(arena_->BlockSize());
-  auto list = new FullList(compare_, new_arena);
+  Arena* new_arena = new Arena(allocator_->BlockSize());
+  auto list = new MemtableSkipList(compare_, new_arena);
+  HistogramImpl keys_per_bucket_hist;
+
   for (size_t i = 0; i < bucket_size_; ++i) {
-    auto bucket = GetBucket(i);
+    int count = 0;
+    auto* bucket = GetBucket(i);
     if (bucket != nullptr) {
-      Iterator itr(this, bucket);
-      for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
-        list->Insert(itr.key());
+      auto* skip_list_header = GetSkipListBucketHeader(bucket);
+      if (skip_list_header != nullptr) {
+        // Is a skip list
+        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
+        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
+          list->Insert(itr.key());
+          count++;
+        }
+      } else {
+        auto* link_list_head = GetLinkListFirstNode(bucket);
+        if (link_list_head != nullptr) {
+          LinkListIterator itr(this, link_list_head);
+          for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
+            list->Insert(itr.key());
+            count++;
+          }
+        }
       }
     }
+    if (if_log_bucket_dist_when_flash_) {
+      keys_per_bucket_hist.Add(count);
+    }
+  }
+  if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) {
+    Info(logger_, "hashLinkedList Entry distribution among buckets: %s",
+         keys_per_bucket_hist.ToString().c_str());
   }
-  return new FullListIterator(list, new_arena);
-}
 
-MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
-  auto bucket = GetBucket(transform_->Transform(slice));
-  if (bucket == nullptr) {
-    return new EmptyIterator();
+  if (alloc_arena == nullptr) {
+    return new FullListIterator(list, new_arena);
+  } else {
+    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
+    return new (mem) FullListIterator(list, new_arena);
   }
-  return new Iterator(this, bucket);
 }
 
-MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() {
-  return new DynamicIterator(*this);
+MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
+    Arena* alloc_arena) {
+  if (alloc_arena == nullptr) {
+    return new DynamicIterator(*this);
+  } else {
+    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
+    return new (mem) DynamicIterator(*this);
+  }
 }
 
-bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
+bool HashLinkListRep::LinkListContains(Node* head,
+                                       const Slice& user_key) const {
   Node* x = FindGreaterOrEqualInBucket(head, user_key);
   return (x != nullptr && Equal(user_key, x->key));
 }
@@ -468,15 +793,21 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
 } // anon namespace
 
 MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
-    const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform* transform) {
-  return new HashLinkListRep(compare, arena, transform, bucket_count_,
-                             huge_page_tlb_size_);
+    const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+    const SliceTransform* transform, Logger* logger) {
+  return new HashLinkListRep(compare, allocator, transform, bucket_count_,
+                             threshold_use_skiplist_, huge_page_tlb_size_,
+                             logger, bucket_entries_logging_threshold_,
+                             if_log_bucket_dist_when_flash_);
 }
 
-MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count,
-                                              size_t huge_page_tlb_size) {
-  return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size);
+MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count, size_t huge_page_tlb_size,
+    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
+    uint32_t threshold_use_skiplist) {
+  return new HashLinkListRepFactory(
+      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
+      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
 }
 
 } // namespace rocksdb
diff --git a/src/rocksdb/util/hash_linklist_rep.h b/src/rocksdb/util/hash_linklist_rep.h
index 4a9fd00..6292723 100644
--- a/src/rocksdb/util/hash_linklist_rep.h
+++ b/src/rocksdb/util/hash_linklist_rep.h
@@ -16,14 +16,21 @@ namespace rocksdb {
 class HashLinkListRepFactory : public MemTableRepFactory {
  public:
   explicit HashLinkListRepFactory(size_t bucket_count,
-                                  size_t huge_page_tlb_size)
-      : bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {}
+                                  uint32_t threshold_use_skiplist,
+                                  size_t huge_page_tlb_size,
+                                  int bucket_entries_logging_threshold,
+                                  bool if_log_bucket_dist_when_flash)
+      : bucket_count_(bucket_count),
+        threshold_use_skiplist_(threshold_use_skiplist),
+        huge_page_tlb_size_(huge_page_tlb_size),
+        bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
+        if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {}
 
   virtual ~HashLinkListRepFactory() {}
 
   virtual MemTableRep* CreateMemTableRep(
-      const MemTableRep::KeyComparator& compare, Arena* arena,
-      const SliceTransform* transform) override;
+      const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+      const SliceTransform* transform, Logger* logger) override;
 
   virtual const char* Name() const override {
     return "HashLinkListRepFactory";
@@ -31,7 +38,10 @@ class HashLinkListRepFactory : public MemTableRepFactory {
 
  private:
   const size_t bucket_count_;
+  const uint32_t threshold_use_skiplist_;
   const size_t huge_page_tlb_size_;
+  int bucket_entries_logging_threshold_;
+  bool if_log_bucket_dist_when_flash_;
 };
 
 }
diff --git a/src/rocksdb/util/hash_skiplist_rep.cc b/src/rocksdb/util/hash_skiplist_rep.cc
index 21df9f6..142903d 100644
--- a/src/rocksdb/util/hash_skiplist_rep.cc
+++ b/src/rocksdb/util/hash_skiplist_rep.cc
@@ -7,12 +7,13 @@
 #ifndef ROCKSDB_LITE
 #include "util/hash_skiplist_rep.h"
 
+#include <atomic>
+
 #include "rocksdb/memtablerep.h"
 #include "util/arena.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "port/port.h"
-#include "port/atomic_pointer.h"
 #include "util/murmurhash.h"
 #include "db/memtable.h"
 #include "db/skiplist.h"
@@ -22,9 +23,10 @@ namespace {
 
 class HashSkipListRep : public MemTableRep {
  public:
-  HashSkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
-                  const SliceTransform* transform, size_t bucket_size,
-                  int32_t skiplist_height, int32_t skiplist_branching_factor);
+  HashSkipListRep(const MemTableRep::KeyComparator& compare,
+                  MemTableAllocator* allocator, const SliceTransform* transform,
+                  size_t bucket_size, int32_t skiplist_height,
+                  int32_t skiplist_branching_factor);
 
   virtual void Insert(KeyHandle handle) override;
 
@@ -38,11 +40,10 @@ class HashSkipListRep : public MemTableRep {
 
   virtual ~HashSkipListRep();
 
-  virtual MemTableRep::Iterator* GetIterator() override;
-
-  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
 
-  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator(
+      Arena* arena = nullptr) override;
 
  private:
   friend class DynamicIterator;
@@ -55,20 +56,21 @@ class HashSkipListRep : public MemTableRep {
 
   // Maps slices (which are transformed user keys) to buckets of keys sharing
   // the same transform.
-  port::AtomicPointer* buckets_;
+  std::atomic<Bucket*>* buckets_;
 
   // The user-supplied transform whose domain is the user keys.
   const SliceTransform* transform_;
 
   const MemTableRep::KeyComparator& compare_;
   // immutable after construction
-  Arena* const arena_;
+  MemTableAllocator* const allocator_;
 
   inline size_t GetHash(const Slice& slice) const {
-    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
+    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0) %
+           bucket_size_;
   }
   inline Bucket* GetBucket(size_t i) const {
-    return static_cast<Bucket*>(buckets_[i].Acquire_Load());
+    return buckets_[i].load(std::memory_order_acquire);
   }
   inline Bucket* GetBucket(const Slice& slice) const {
     return GetBucket(GetHash(slice));
@@ -92,33 +94,34 @@ class HashSkipListRep : public MemTableRep {
     }
 
     // Returns true iff the iterator is positioned at a valid node.
-    virtual bool Valid() const {
+    virtual bool Valid() const override {
       return list_ != nullptr && iter_.Valid();
     }
 
     // Returns the key at the current position.
     // REQUIRES: Valid()
-    virtual const char* key() const {
+    virtual const char* key() const override {
       assert(Valid());
       return iter_.key();
     }
 
     // Advances to the next position.
     // REQUIRES: Valid()
-    virtual void Next() {
+    virtual void Next() override {
       assert(Valid());
       iter_.Next();
     }
 
     // Advances to the previous position.
     // REQUIRES: Valid()
-    virtual void Prev() {
+    virtual void Prev() override {
       assert(Valid());
       iter_.Prev();
     }
 
     // Advance to the first entry with a key >= target
-    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+    virtual void Seek(const Slice& internal_key,
+                      const char* memtable_key) override {
       if (list_ != nullptr) {
         const char* encoded_key =
             (memtable_key != nullptr) ?
@@ -129,7 +132,7 @@ class HashSkipListRep : public MemTableRep {
 
     // Position at the first entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToFirst() {
+    virtual void SeekToFirst() override {
       if (list_ != nullptr) {
         iter_.SeekToFirst();
       }
@@ -137,7 +140,7 @@ class HashSkipListRep : public MemTableRep {
 
     // Position at the last entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToLast() {
+    virtual void SeekToLast() override {
       if (list_ != nullptr) {
         iter_.SeekToLast();
       }
@@ -171,7 +174,7 @@ class HashSkipListRep : public MemTableRep {
         memtable_rep_(memtable_rep) {}
 
     // Advance to the first entry with a key >= target
-    virtual void Seek(const Slice& k, const char* memtable_key) {
+    virtual void Seek(const Slice& k, const char* memtable_key) override {
       auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k));
       Reset(memtable_rep_.GetBucket(transformed));
       HashSkipListRep::Iterator::Seek(k, memtable_key);
@@ -179,7 +182,7 @@ class HashSkipListRep : public MemTableRep {
 
     // Position at the first entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToFirst() {
+    virtual void SeekToFirst() override {
       // Prefix iterator does not support total order.
       // We simply set the iterator to invalid state
       Reset(nullptr);
@@ -187,7 +190,7 @@ class HashSkipListRep : public MemTableRep {
 
     // Position at the last entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
-    virtual void SeekToLast() {
+    virtual void SeekToLast() override {
       // Prefix iterator does not support total order.
       // We simply set the iterator to invalid state
       Reset(nullptr);
@@ -202,43 +205,44 @@ class HashSkipListRep : public MemTableRep {
     // instantiating an empty bucket over which to iterate.
    public:
     EmptyIterator() { }
-    virtual bool Valid() const {
-      return false;
-    }
-    virtual const char* key() const {
+    virtual bool Valid() const override { return false; }
+    virtual const char* key() const override {
       assert(false);
       return nullptr;
     }
-    virtual void Next() { }
-    virtual void Prev() { }
+    virtual void Next() override {}
+    virtual void Prev() override {}
     virtual void Seek(const Slice& internal_key,
-                      const char* memtable_key) { }
-    virtual void SeekToFirst() { }
-    virtual void SeekToLast() { }
+                      const char* memtable_key) override {}
+    virtual void SeekToFirst() override {}
+    virtual void SeekToLast() override {}
+
    private:
   };
 };
 
 HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare,
-                                 Arena* arena, const SliceTransform* transform,
+                                 MemTableAllocator* allocator,
+                                 const SliceTransform* transform,
                                  size_t bucket_size, int32_t skiplist_height,
                                  int32_t skiplist_branching_factor)
-    : MemTableRep(arena),
+    : MemTableRep(allocator),
       bucket_size_(bucket_size),
       skiplist_height_(skiplist_height),
       skiplist_branching_factor_(skiplist_branching_factor),
       transform_(transform),
       compare_(compare),
-      arena_(arena) {
-  buckets_ = new port::AtomicPointer[bucket_size];
+      allocator_(allocator) {
+  auto mem = allocator->AllocateAligned(
+               sizeof(std::atomic<void*>) * bucket_size);
+  buckets_ = new (mem) std::atomic<Bucket*>[bucket_size];
 
   for (size_t i = 0; i < bucket_size_; ++i) {
-    buckets_[i].NoBarrier_Store(nullptr);
+    buckets_[i].store(nullptr, std::memory_order_relaxed);
   }
 }
 
 HashSkipListRep::~HashSkipListRep() {
-  delete[] buckets_;
 }
 
 HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
@@ -246,10 +250,10 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
   size_t hash = GetHash(transformed);
   auto bucket = GetBucket(hash);
   if (bucket == nullptr) {
-    auto addr = arena_->AllocateAligned(sizeof(Bucket));
-    bucket = new (addr) Bucket(compare_, arena_, skiplist_height_,
+    auto addr = allocator_->AllocateAligned(sizeof(Bucket));
+    bucket = new (addr) Bucket(compare_, allocator_, skiplist_height_,
                                skiplist_branching_factor_);
-    buckets_[hash].Release_Store(static_cast<void*>(bucket));
+    buckets_[hash].store(bucket, std::memory_order_release);
   }
   return bucket;
 }
@@ -272,7 +276,7 @@ bool HashSkipListRep::Contains(const char* key) const {
 }
 
 size_t HashSkipListRep::ApproximateMemoryUsage() {
-  return sizeof(buckets_);
+  return 0;
 }
 
 void HashSkipListRep::Get(const LookupKey& k, void* callback_args,
@@ -288,9 +292,9 @@ void HashSkipListRep::Get(const LookupKey& k, void* callback_args,
   }
 }
 
-MemTableRep::Iterator* HashSkipListRep::GetIterator() {
+MemTableRep::Iterator* HashSkipListRep::GetIterator(Arena* arena) {
   // allocate a new arena of similar size to the one currently in use
-  Arena* new_arena = new Arena(arena_->BlockSize());
+  Arena* new_arena = new Arena(allocator_->BlockSize());
   auto list = new Bucket(compare_, new_arena);
   for (size_t i = 0; i < bucket_size_; ++i) {
     auto bucket = GetBucket(i);
@@ -301,27 +305,29 @@ MemTableRep::Iterator* HashSkipListRep::GetIterator() {
       }
     }
   }
-  return new Iterator(list, true, new_arena);
-}
-
-MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) {
-  auto bucket = GetBucket(transform_->Transform(slice));
-  if (bucket == nullptr) {
-    return new EmptyIterator();
+  if (arena == nullptr) {
+    return new Iterator(list, true, new_arena);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(Iterator));
+    return new (mem) Iterator(list, true, new_arena);
   }
-  return new Iterator(bucket, false);
 }
 
-MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
-  return new DynamicIterator(*this);
+MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(Arena* arena) {
+  if (arena == nullptr) {
+    return new DynamicIterator(*this);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(DynamicIterator));
+    return new (mem) DynamicIterator(*this);
+  }
 }
 
 } // anon namespace
 
 MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
-    const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform* transform) {
-  return new HashSkipListRep(compare, arena, transform, bucket_count_,
+    const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+    const SliceTransform* transform, Logger* logger) {
+  return new HashSkipListRep(compare, allocator, transform, bucket_count_,
                              skiplist_height_, skiplist_branching_factor_);
 }
 
diff --git a/src/rocksdb/util/hash_skiplist_rep.h b/src/rocksdb/util/hash_skiplist_rep.h
index 16903c6..15d0fc7 100644
--- a/src/rocksdb/util/hash_skiplist_rep.h
+++ b/src/rocksdb/util/hash_skiplist_rep.h
@@ -26,8 +26,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
   virtual ~HashSkipListRepFactory() {}
 
   virtual MemTableRep* CreateMemTableRep(
-      const MemTableRep::KeyComparator& compare, Arena* arena,
-      const SliceTransform* transform) override;
+      const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+      const SliceTransform* transform, Logger* logger) override;
 
   virtual const char* Name() const override {
     return "HashSkipListRepFactory";
diff --git a/src/rocksdb/util/histogram.cc b/src/rocksdb/util/histogram.cc
index 968769c..5a875e5 100644
--- a/src/rocksdb/util/histogram.cc
+++ b/src/rocksdb/util/histogram.cc
@@ -19,7 +19,7 @@ namespace rocksdb {
 HistogramBucketMapper::HistogramBucketMapper()
     :
       // Add newer bucket index here.
-      // Should be alwyas added in sorted order.
+      // Should be always added in sorted order.
       // If you change this, you also need to change
       // size of array buckets_ in HistogramImpl
       bucketValues_(
@@ -53,14 +53,14 @@ HistogramBucketMapper::HistogramBucketMapper()
   }
 }
 
-const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const {
+size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const {
   if (value >= maxBucketValue_) {
     return bucketValues_.size() - 1;
   } else if ( value >= minBucketValue_ ) {
     std::map<uint64_t, uint64_t>::const_iterator lowerBound =
       valueIndexMap_.lower_bound(value);
     if (lowerBound != valueIndexMap_.end()) {
-      return lowerBound->second;
+      return static_cast<size_t>(lowerBound->second);
     } else {
       return 0;
     }
diff --git a/src/rocksdb/util/histogram.h b/src/rocksdb/util/histogram.h
index d95588d..77ed9be 100644
--- a/src/rocksdb/util/histogram.h
+++ b/src/rocksdb/util/histogram.h
@@ -23,10 +23,10 @@ class HistogramBucketMapper {
   HistogramBucketMapper();
 
   // converts a value to the bucket index.
-  const size_t IndexForValue(const uint64_t value) const;
+  size_t IndexForValue(const uint64_t value) const;
   // number of buckets required.
 
-  const size_t BucketCount() const {
+  size_t BucketCount() const {
     return bucketValues_.size();
   }
 
@@ -38,7 +38,7 @@ class HistogramBucketMapper {
     return minBucketValue_;
   }
 
-  uint64_t BucketLimit(const uint64_t bucketNumber) const {
+  uint64_t BucketLimit(const size_t bucketNumber) const {
     assert(bucketNumber < BucketCount());
     return bucketValues_[bucketNumber];
   }
@@ -65,6 +65,8 @@ class HistogramImpl {
   virtual double StandardDeviation() const;
   virtual void Data(HistogramData * const data) const;
 
+  virtual ~HistogramImpl() {}
+
  private:
   // To be able to use HistogramImpl as thread local variable, its constructor
   // has to be static. That's why we're using manually values from BucketMapper
diff --git a/src/rocksdb/util/histogram_test.cc b/src/rocksdb/util/histogram_test.cc
index 065f957..22ddb4b 100644
--- a/src/rocksdb/util/histogram_test.cc
+++ b/src/rocksdb/util/histogram_test.cc
@@ -9,10 +9,9 @@
 
 namespace rocksdb {
 
-class HistogramTest { };
-
-TEST(HistogramTest, BasicOperation) {
+class HistogramTest : public testing::Test {};
 
+TEST_F(HistogramTest, BasicOperation) {
   HistogramImpl histogram;
   for (uint64_t i = 1; i <= 100; i++) {
     histogram.Add(i);
@@ -34,17 +33,17 @@ TEST(HistogramTest, BasicOperation) {
     ASSERT_TRUE(percentile99 >= percentile85);
   }
 
-  ASSERT_EQ(histogram.Average(), 50.5); // avg is acurately caluclated.
+  ASSERT_EQ(histogram.Average(), 50.5); // avg is acurately calculated.
 }
 
-TEST(HistogramTest, EmptyHistogram) {
+TEST_F(HistogramTest, EmptyHistogram) {
   HistogramImpl histogram;
   ASSERT_EQ(histogram.Median(), 0.0);
   ASSERT_EQ(histogram.Percentile(85.0), 0.0);
   ASSERT_EQ(histogram.Average(), 0.0);
 }
 
-TEST(HistogramTest, ClearHistogram) {
+TEST_F(HistogramTest, ClearHistogram) {
   HistogramImpl histogram;
   for (uint64_t i = 1; i <= 100; i++) {
     histogram.Add(i);
@@ -58,5 +57,6 @@ TEST(HistogramTest, ClearHistogram) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/instrumented_mutex.cc b/src/rocksdb/util/instrumented_mutex.cc
new file mode 100644
index 0000000..2e240cc
--- /dev/null
+++ b/src/rocksdb/util/instrumented_mutex.cc
@@ -0,0 +1,76 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "util/perf_context_imp.h"
+#include "util/instrumented_mutex.h"
+#include "util/thread_status_util.h"
+
+namespace rocksdb {
+void InstrumentedMutex::Lock() {
+  PERF_TIMER_GUARD(db_mutex_lock_nanos);
+  uint64_t wait_time_micros = 0;
+  if (env_ != nullptr && stats_ != nullptr) {
+    {
+      StopWatch sw(env_, nullptr, 0, &wait_time_micros);
+      LockInternal();
+    }
+    RecordTick(stats_, stats_code_, wait_time_micros);
+  } else {
+    LockInternal();
+  }
+}
+
+void InstrumentedMutex::LockInternal() {
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT);
+#endif
+  mutex_.Lock();
+}
+
+void InstrumentedCondVar::Wait() {
+  PERF_TIMER_GUARD(db_condition_wait_nanos);
+  uint64_t wait_time_micros = 0;
+  if (env_ != nullptr && stats_ != nullptr) {
+    {
+      StopWatch sw(env_, nullptr, 0, &wait_time_micros);
+      WaitInternal();
+    }
+    RecordTick(stats_, stats_code_, wait_time_micros);
+  } else {
+    WaitInternal();
+  }
+}
+
+void InstrumentedCondVar::WaitInternal() {
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT);
+#endif
+  cond_.Wait();
+}
+
+bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) {
+  PERF_TIMER_GUARD(db_condition_wait_nanos);
+  uint64_t wait_time_micros = 0;
+  bool result = false;
+  if (env_ != nullptr && stats_ != nullptr) {
+    {
+      StopWatch sw(env_, nullptr, 0, &wait_time_micros);
+      result = TimedWaitInternal(abs_time_us);
+    }
+    RecordTick(stats_, stats_code_, wait_time_micros);
+  } else {
+    result = TimedWaitInternal(abs_time_us);
+  }
+  return result;
+}
+
+bool InstrumentedCondVar::TimedWaitInternal(uint64_t abs_time_us) {
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT);
+#endif
+  return cond_.TimedWait(abs_time_us);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/instrumented_mutex.h b/src/rocksdb/util/instrumented_mutex.h
new file mode 100644
index 0000000..3f23349
--- /dev/null
+++ b/src/rocksdb/util/instrumented_mutex.h
@@ -0,0 +1,98 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/thread_status.h"
+#include "util/statistics.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+class InstrumentedCondVar;
+
+// A wrapper class for port::Mutex that provides additional layer
+// for collecting stats and instrumentation.
+class InstrumentedMutex {
+ public:
+  explicit InstrumentedMutex(bool adaptive = false)
+      : mutex_(adaptive), stats_(nullptr), env_(nullptr),
+        stats_code_(0) {}
+
+  InstrumentedMutex(
+      Statistics* stats, Env* env,
+      int stats_code, bool adaptive = false)
+      : mutex_(adaptive), stats_(stats), env_(env),
+        stats_code_(stats_code) {}
+
+  void Lock();
+
+  void Unlock() {
+    mutex_.Unlock();
+  }
+
+  void AssertHeld() {
+    mutex_.AssertHeld();
+  }
+
+ private:
+  void LockInternal();
+  friend class InstrumentedCondVar;
+  port::Mutex mutex_;
+  Statistics* stats_;
+  Env* env_;
+  int stats_code_;
+};
+
+// A wrapper class for port::Mutex that provides additional layer
+// for collecting stats and instrumentation.
+class InstrumentedMutexLock {
+ public:
+  explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) {
+    mutex_->Lock();
+  }
+
+  ~InstrumentedMutexLock() {
+    mutex_->Unlock();
+  }
+
+ private:
+  InstrumentedMutex* const mutex_;
+  InstrumentedMutexLock(const InstrumentedMutexLock&) = delete;
+  void operator=(const InstrumentedMutexLock&) = delete;
+};
+
+class InstrumentedCondVar {
+ public:
+  explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex)
+      : cond_(&(instrumented_mutex->mutex_)),
+        stats_(instrumented_mutex->stats_),
+        env_(instrumented_mutex->env_),
+        stats_code_(instrumented_mutex->stats_code_) {}
+
+  void Wait();
+
+  bool TimedWait(uint64_t abs_time_us);
+
+  void Signal() {
+    cond_.Signal();
+  }
+
+  void SignalAll() {
+    cond_.SignalAll();
+  }
+
+ private:
+  void WaitInternal();
+  bool TimedWaitInternal(uint64_t abs_time_us);
+  port::CondVar cond_;
+  Statistics* stats_;
+  Env* env_;
+  int stats_code_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/iostats_context.cc b/src/rocksdb/util/iostats_context.cc
new file mode 100644
index 0000000..090813a
--- /dev/null
+++ b/src/rocksdb/util/iostats_context.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <sstream>
+#include "rocksdb/env.h"
+#include "util/iostats_context_imp.h"
+
+namespace rocksdb {
+
+#ifndef IOS_CROSS_COMPILE
+__thread IOStatsContext iostats_context;
+#endif  // IOS_CROSS_COMPILE
+
+void IOStatsContext::Reset() {
+  thread_pool_id = Env::Priority::TOTAL;
+  bytes_read = 0;
+  bytes_written = 0;
+}
+
+#define OUTPUT(counter) #counter << " = " << counter << ", "
+
+std::string IOStatsContext::ToString() const {
+  std::ostringstream ss;
+  ss << OUTPUT(thread_pool_id)
+     << OUTPUT(bytes_read)
+     << OUTPUT(bytes_written);
+  return ss.str();
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/iostats_context_imp.h b/src/rocksdb/util/iostats_context_imp.h
new file mode 100644
index 0000000..b271ddf
--- /dev/null
+++ b/src/rocksdb/util/iostats_context_imp.h
@@ -0,0 +1,46 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/iostats_context.h"
+
+#ifndef IOS_CROSS_COMPILE
+
+// increment a specific counter by the specified value
+#define IOSTATS_ADD(metric, value)     \
+  (iostats_context.metric += value)
+
+// Increase metric value only when it is positive
+#define IOSTATS_ADD_IF_POSITIVE(metric, value)   \
+  if (value > 0) { IOSTATS_ADD(metric, value); }
+
+// reset a specific counter to zero
+#define IOSTATS_RESET(metric)          \
+  (iostats_context.metric = 0)
+
+// reset all counters to zero
+#define IOSTATS_RESET_ALL()                        \
+  (iostats_context.Reset())
+
+#define IOSTATS_SET_THREAD_POOL_ID(value)      \
+  (iostats_context.thread_pool_id = value)
+
+#define IOSTATS_THREAD_POOL_ID()               \
+  (iostats_context.thread_pool_id)
+
+#define IOSTATS(metric)                        \
+  (iostats_context.metric)
+
+#else  // IOS_CROSS_COMPILE
+
+#define IOSTATS_ADD(metric, value)
+#define IOSTATS_ADD_IF_POSITIVE(metric, value)
+#define IOSTATS_RESET(metric)
+#define IOSTATS_RESET_ALL()
+#define IOSTATS_SET_THREAD_POOL_ID(value)
+#define IOSTATS_THREAD_POOL_ID()
+#define IOSTATS(metric) 0
+
+#endif  // IOS_CROSS_COMPILE
diff --git a/src/rocksdb/util/ldb_cmd.cc b/src/rocksdb/util/ldb_cmd.cc
index 597179f..e7b29d2 100644
--- a/src/rocksdb/util/ldb_cmd.cc
+++ b/src/rocksdb/util/ldb_cmd.cc
@@ -10,14 +10,21 @@
 #include "db/db_impl.h"
 #include "db/log_reader.h"
 #include "db/filename.h"
+#include "db/writebuffer.h"
 #include "db/write_batch_internal.h"
 #include "rocksdb/write_batch.h"
 #include "rocksdb/cache.h"
+#include "rocksdb/table_properties.h"
 #include "util/coding.h"
+#include "util/sst_dump_tool_imp.h"
+#include "util/string_util.h"
+#include "util/scoped_arena_iterator.h"
 #include "utilities/ttl/db_ttl_impl.h"
 
+#include <cstdlib>
 #include <ctime>
 #include <dirent.h>
+#include <limits>
 #include <sstream>
 #include <string>
 #include <stdexcept>
@@ -38,9 +45,11 @@ const string LDBCommand::ARG_FROM = "from";
 const string LDBCommand::ARG_TO = "to";
 const string LDBCommand::ARG_MAX_KEYS = "max_keys";
 const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits";
+const string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len";
 const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type";
 const string LDBCommand::ARG_BLOCK_SIZE = "block_size";
 const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction";
+const string LDBCommand::ARG_DB_WRITE_BUFFER_SIZE = "db_write_buffer_size";
 const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size";
 const string LDBCommand::ARG_FILE_SIZE = "file_size";
 const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing";
@@ -50,13 +59,14 @@ const char* LDBCommand::DELIM = " ==> ";
 LDBCommand* LDBCommand::InitFromCmdLineArgs(
   int argc,
   char** argv,
-  const Options& options
+  const Options& options,
+  const LDBOptions& ldb_options
 ) {
   vector<string> args;
   for (int i = 1; i < argc; i++) {
     args.push_back(argv[i]);
   }
-  return InitFromCmdLineArgs(args, options);
+  return InitFromCmdLineArgs(args, options, ldb_options);
 }
 
 /**
@@ -71,7 +81,8 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs(
  */
 LDBCommand* LDBCommand::InitFromCmdLineArgs(
   const vector<string>& args,
-  const Options& options
+  const Options& options,
+  const LDBOptions& ldb_options
 ) {
   // --x=y command line arguments are added as x->y map entries.
   map<string, string> option_map;
@@ -87,7 +98,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs(
 
   for (const auto& arg : args) {
     if (arg[0] == '-' && arg[1] == '-'){
-      vector<string> splits = stringSplit(arg, '=');
+      vector<string> splits = StringSplit(arg, '=');
       if (splits.size() == 2) {
         string optionKey = splits[0].substr(OPTION_PREFIX.size());
         option_map[optionKey] = splits[1];
@@ -115,7 +126,8 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs(
   );
 
   if (command) {
-    command->SetOptions(options);
+    command->SetDBOptions(options);
+    command->SetLDBOptions(ldb_options);
   }
   return command;
 }
@@ -157,6 +169,8 @@ LDBCommand* LDBCommand::SelectCommand(
     return new ManifestDumpCommand(cmdParams, option_map, flags);
   } else if (cmd == ListColumnFamiliesCommand::Name()) {
     return new ListColumnFamiliesCommand(cmdParams, option_map, flags);
+  } else if (cmd == DBFileDumperCommand::Name()) {
+    return new DBFileDumperCommand(cmdParams, option_map, flags);
   } else if (cmd == InternalDumpCommand::Name()) {
     return new InternalDumpCommand(cmdParams, option_map, flags);
   } else if (cmd == CheckConsistencyCommand::Name()) {
@@ -180,14 +194,18 @@ bool LDBCommand::ParseIntOption(const map<string, string>& options,
   map<string, string>::const_iterator itr = option_map_.find(option);
   if (itr != option_map_.end()) {
     try {
+#if defined(CYGWIN)
+      value = strtol(itr->second.c_str(), 0, 10);
+#else
       value = stoi(itr->second);
+#endif
       return true;
     } catch(const invalid_argument&) {
-      exec_state = LDBCommandExecuteResult::FAILED(option +
-                      " has an invalid value.");
+      exec_state =
+          LDBCommandExecuteResult::Failed(option + " has an invalid value.");
     } catch(const out_of_range&) {
-      exec_state = LDBCommandExecuteResult::FAILED(option +
-                      " has a value out-of-range.");
+      exec_state = LDBCommandExecuteResult::Failed(
+          option + " has a value out-of-range.");
     }
   }
   return false;
@@ -215,26 +233,34 @@ Options LDBCommand::PrepareOptionsForOpenDB() {
 
   map<string, string>::const_iterator itr;
 
+  BlockBasedTableOptions table_options;
+  bool use_table_options = false;
   int bits;
   if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) {
     if (bits > 0) {
-      opt.filter_policy = NewBloomFilterPolicy(bits);
+      use_table_options = true;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(bits));
     } else {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS +
-                      " must be > 0.");
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_BLOOM_BITS + " must be > 0.");
     }
   }
 
   int block_size;
   if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) {
     if (block_size > 0) {
-      opt.block_size = block_size;
+      use_table_options = true;
+      table_options.block_size = block_size;
     } else {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE +
-                      " must be > 0.");
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_BLOCK_SIZE + " must be > 0.");
     }
   }
 
+  if (use_table_options) {
+    opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+
   itr = option_map_.find(ARG_AUTO_COMPACTION);
   if (itr != option_map_.end()) {
     opt.disable_auto_compactions = ! StringToBool(itr->second);
@@ -257,8 +283,19 @@ Options LDBCommand::PrepareOptionsForOpenDB() {
       opt.compression = kLZ4HCCompression;
     } else {
       // Unknown compression.
-      exec_state_ = LDBCommandExecuteResult::FAILED(
-                      "Unknown compression level: " + comp);
+      exec_state_ =
+          LDBCommandExecuteResult::Failed("Unknown compression level: " + comp);
+    }
+  }
+
+  int db_write_buffer_size;
+  if (ParseIntOption(option_map_, ARG_DB_WRITE_BUFFER_SIZE,
+        db_write_buffer_size, exec_state_)) {
+    if (db_write_buffer_size >= 0) {
+      opt.db_write_buffer_size = db_write_buffer_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_DB_WRITE_BUFFER_SIZE +
+                                                    " must be >= 0.");
     }
   }
 
@@ -268,8 +305,8 @@ Options LDBCommand::PrepareOptionsForOpenDB() {
     if (write_buffer_size > 0) {
       opt.write_buffer_size = write_buffer_size;
     } else {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_WRITE_BUFFER_SIZE +
-                      " must be > 0.");
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_WRITE_BUFFER_SIZE +
+                                                    " must be > 0.");
     }
   }
 
@@ -278,8 +315,24 @@ Options LDBCommand::PrepareOptionsForOpenDB() {
     if (file_size > 0) {
       opt.target_file_size_base = file_size;
     } else {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FILE_SIZE +
-                      " must be > 0.");
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_FILE_SIZE + " must be > 0.");
+    }
+  }
+
+  if (opt.db_paths.size() == 0) {
+    opt.db_paths.emplace_back(db_path_, std::numeric_limits<uint64_t>::max());
+  }
+
+  int fix_prefix_len;
+  if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len,
+                     exec_state_)) {
+    if (fix_prefix_len > 0) {
+      opt.prefix_extractor.reset(
+          NewFixedPrefixTransform(static_cast<size_t>(fix_prefix_len)));
+    } else {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_FIX_PREFIX_LEN + " must be > 0.");
     }
   }
 
@@ -314,7 +367,7 @@ bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value,
 bool LDBCommand::ValidateCmdLineOptions() {
 
   for (map<string, string>::const_iterator itr = option_map_.begin();
-        itr != option_map_.end(); itr++) {
+        itr != option_map_.end(); ++itr) {
     if (find(valid_cmd_line_options_.begin(),
           valid_cmd_line_options_.end(), itr->first) ==
           valid_cmd_line_options_.end()) {
@@ -324,7 +377,7 @@ bool LDBCommand::ValidateCmdLineOptions() {
   }
 
   for (vector<string>::const_iterator itr = flags_.begin();
-        itr != flags_.end(); itr++) {
+        itr != flags_.end(); ++itr) {
     if (find(valid_cmd_line_options_.begin(),
           valid_cmd_line_options_.end(), *itr) ==
           valid_cmd_line_options_.end()) {
@@ -389,12 +442,14 @@ void CompactorCommand::DoCommand() {
   }
 
   db_->CompactRange(begin, end);
-  exec_state_ = LDBCommandExecuteResult::SUCCEED("");
+  exec_state_ = LDBCommandExecuteResult::Succeed("");
 
   delete begin;
   delete end;
 }
 
+// ----------------------------------------------------------------------------
+
 const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal";
 const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load";
 const string DBLoaderCommand::ARG_COMPACT = "compact";
@@ -470,6 +525,30 @@ void DBLoaderCommand::DoCommand() {
 
 // ----------------------------------------------------------------------------
 
+namespace {
+
+void DumpManifestFile(std::string file, bool verbose, bool hex) {
+  Options options;
+  EnvOptions sopt;
+  std::string dbname("dummy");
+  std::shared_ptr<Cache> tc(NewLRUCache(options.max_open_files - 10,
+                                        options.table_cache_numshardbits));
+  // Notice we are using the default options not through SanitizeOptions(),
+  // if VersionSet::DumpManifest() depends on any option done by
+  // SanitizeOptions(), we need to initialize it manually.
+  options.db_paths.emplace_back("dummy", 0);
+  WriteController wc;
+  WriteBuffer wb(options.db_write_buffer_size);
+  VersionSet versions(dbname, &options, sopt, tc.get(), &wb, &wc);
+  Status s = versions.DumpManifest(options, file, verbose, hex);
+  if (!s.ok()) {
+    printf("Error in processing file %s %s\n", file.c_str(),
+           s.ToString().c_str());
+  }
+}
+
+}  // namespace
+
 const string ManifestDumpCommand::ARG_VERBOSE = "verbose";
 const string ManifestDumpCommand::ARG_PATH    = "path";
 
@@ -494,7 +573,7 @@ ManifestDumpCommand::ManifestDumpCommand(const vector<string>& params,
   if (itr != options.end()) {
     path_ = itr->second;
     if (path_.empty()) {
-      exec_state_ = LDBCommandExecuteResult::FAILED("--path: missing pathname");
+      exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname");
     }
   }
 }
@@ -511,8 +590,8 @@ void ManifestDumpCommand::DoCommand() {
     // containing the db for files of the form MANIFEST_[0-9]+
     DIR* d = opendir(db_path_.c_str());
     if (d == nullptr) {
-      exec_state_ = LDBCommandExecuteResult::FAILED(
-        db_path_ + " is not a directory");
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(db_path_ + " is not a directory");
       return;
     }
     struct dirent* entry;
@@ -528,8 +607,9 @@ void ManifestDumpCommand::DoCommand() {
           manifestfile = db_path_ + "/" + std::string(entry->d_name);
           found = true;
         } else {
-          exec_state_ = LDBCommandExecuteResult::FAILED(
-            "Multiple MANIFEST files found; use --path to select one");
+          exec_state_ = LDBCommandExecuteResult::Failed(
+              "Multiple MANIFEST files found; use --path to select one");
+          closedir(d);
           return;
         }
       }
@@ -541,19 +621,7 @@ void ManifestDumpCommand::DoCommand() {
     printf("Processing Manifest file %s\n", manifestfile.c_str());
   }
 
-  Options options;
-  EnvOptions sopt;
-  std::string file(manifestfile);
-  std::string dbname("dummy");
-  std::shared_ptr<Cache> tc(NewLRUCache(
-      options.max_open_files - 10, options.table_cache_numshardbits,
-      options.table_cache_remove_scan_count_limit));
-  VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get());
-  Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_);
-  if (!s.ok()) {
-    printf("Error in processing file %s %s\n", manifestfile.c_str(),
-           s.ToString().c_str());
-  }
+  DumpManifestFile(manifestfile, verbose_, is_key_hex_);
   if (verbose_) {
     printf("Processing Manifest file %s done\n", manifestfile.c_str());
   }
@@ -574,7 +642,7 @@ ListColumnFamiliesCommand::ListColumnFamiliesCommand(
     : LDBCommand(options, flags, false, {}) {
 
   if (params.size() != 1) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
+    exec_state_ = LDBCommandExecuteResult::Failed(
         "dbname must be specified for the list_column_families command");
   } else {
     dbname_ = params[0];
@@ -717,7 +785,7 @@ void InternalDumpCommand::DoCommand() {
   // Cast as DBImpl to get internal iterator
   DBImpl* idb = dynamic_cast<DBImpl*>(db_);
   if (!idb) {
-    exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl");
+    exec_state_ = LDBCommandExecuteResult::Failed("DB is not DBImpl");
     return;
   }
   string rtype1,rtype2,row,val;
@@ -725,15 +793,17 @@ void InternalDumpCommand::DoCommand() {
   uint64_t c=0;
   uint64_t s1=0,s2=0;
   // Setup internal key iterator
-  auto iter = unique_ptr<Iterator>(idb->TEST_NewInternalIterator());
+  Arena arena;
+  ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena));
   Status st = iter->status();
   if (!st.ok()) {
-    exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:"
-                                                  + st.ToString());
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Iterator error:" + st.ToString());
   }
 
   if (has_from_) {
-    InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey ikey;
+    ikey.SetMaxPossibleForUserKey(from_);
     iter->Seek(ikey.Encode());
   } else {
     iter->SeekToFirst();
@@ -833,13 +903,17 @@ DBDumperCommand::DBDumperCommand(const vector<string>& params,
   itr = options.find(ARG_MAX_KEYS);
   if (itr != options.end()) {
     try {
+#if defined(CYGWIN)
+      max_keys_ = strtol(itr->second.c_str(), 0, 10);
+#else
       max_keys_ = stoi(itr->second);
+#endif
     } catch(const invalid_argument&) {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
-                        " has an invalid value");
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                    " has an invalid value");
     } catch(const out_of_range&) {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
-                        " has a value out-of-range");
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_MAX_KEYS + " has a value out-of-range");
     }
   }
   itr = options.find(ARG_COUNT_DELIM);
@@ -897,8 +971,8 @@ void DBDumperCommand::DoCommand() {
   Iterator* iter = db_->NewIterator(ReadOptions());
   Status st = iter->status();
   if (!st.ok()) {
-    exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error."
-        + st.ToString());
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Iterator error." + st.ToString());
   }
 
   if (!null_from_) {
@@ -934,8 +1008,9 @@ void DBDumperCommand::DoCommand() {
   uint64_t s1=0,s2=0;
 
   // At this point, bucket_size=0 => time_range=0
-  uint64_t num_buckets = (bucket_size >= time_range) ? 1 :
-    ((time_range + bucket_size - 1) / bucket_size);
+  int num_buckets = (bucket_size >= time_range)
+                        ? 1
+                        : ((time_range + bucket_size - 1) / bucket_size);
   vector<uint64_t> bucket_counts(num_buckets, 0);
   if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) {
     fprintf(stdout, "Dumping key-values from %s to %s\n",
@@ -1020,7 +1095,7 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags) :
     LDBCommand(options, flags, false,
                BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})),
-    old_levels_(1 << 16),
+    old_levels_(1 << 7),
     new_levels_(-1),
     print_old_levels_(false) {
 
@@ -1029,8 +1104,8 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector<string>& params,
   print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS);
 
   if(new_levels_ <= 0) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-           " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n");
   }
 }
 
@@ -1039,7 +1114,7 @@ vector<string> ReduceDBLevelsCommand::PrepareArgs(const string& db_path,
   vector<string> ret;
   ret.push_back("reduce_levels");
   ret.push_back("--" + ARG_DB + "=" + db_path);
-  ret.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels));
+  ret.push_back("--" + ARG_NEW_LEVELS + "=" + rocksdb::ToString(new_levels));
   if(print_old_level) {
     ret.push_back("--" + ARG_PRINT_OLD_LEVELS);
   }
@@ -1069,10 +1144,11 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
     int* levels) {
   EnvOptions soptions;
   std::shared_ptr<Cache> tc(
-      NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits,
-                  opt.table_cache_remove_scan_count_limit));
+      NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits));
   const InternalKeyComparator cmp(opt.comparator);
-  VersionSet versions(db_path_, &opt, soptions, tc.get());
+  WriteController wc;
+  WriteBuffer wb(opt.db_write_buffer_size);
+  VersionSet versions(db_path_, &opt, soptions, tc.get(), &wb, &wc);
   std::vector<ColumnFamilyDescriptor> dummy;
   ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
                                           ColumnFamilyOptions(opt));
@@ -1087,7 +1163,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
   int max = -1;
   auto default_cfd = versions.GetColumnFamilySet()->GetDefault();
   for (int i = 0; i < default_cfd->NumberLevels(); i++) {
-    if (default_cfd->current()->NumLevelFiles(i)) {
+    if (default_cfd->current()->storage_info()->NumLevelFiles(i)) {
       max = i;
     }
   }
@@ -1098,8 +1174,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
 
 void ReduceDBLevelsCommand::DoCommand() {
   if (new_levels_ <= 1) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-        "Invalid number of levels.\n");
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Invalid number of levels.\n");
     return;
   }
 
@@ -1108,7 +1184,7 @@ void ReduceDBLevelsCommand::DoCommand() {
   int old_level_num = -1;
   st = GetOldNumOfLevels(opt, &old_level_num);
   if (!st.ok()) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
     return;
   }
 
@@ -1134,7 +1210,7 @@ void ReduceDBLevelsCommand::DoCommand() {
   EnvOptions soptions;
   st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_);
   if (!st.ok()) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
     return;
   }
 }
@@ -1157,9 +1233,9 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
     exec_state_);
   if (old_compaction_style_ != kCompactionStyleLevel &&
      old_compaction_style_ != kCompactionStyleUniversal) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-      "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " +
-      "style. Check ldb help for proper compaction style value.\n");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " +
+        "style. Check ldb help for proper compaction style value.\n");
     return;
   }
 
@@ -1167,24 +1243,24 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
     exec_state_);
   if (new_compaction_style_ != kCompactionStyleLevel &&
      new_compaction_style_ != kCompactionStyleUniversal) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-      "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " +
-      "style. Check ldb help for proper compaction style value.\n");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " +
+        "style. Check ldb help for proper compaction style value.\n");
     return;
   }
 
   if (new_compaction_style_ == old_compaction_style_) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-      "Old compaction style is the same as new compaction style. "
-      "Nothing to do.\n");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Old compaction style is the same as new compaction style. "
+        "Nothing to do.\n");
     return;
   }
 
   if (old_compaction_style_ == kCompactionStyleUniversal &&
       new_compaction_style_ == kCompactionStyleLevel) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-      "Convert from universal compaction to level compaction. "
-      "Nothing to do.\n");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Convert from universal compaction to level compaction. "
+        "Nothing to do.\n");
     return;
   }
 }
@@ -1253,16 +1329,19 @@ void ChangeCompactionStyleCommand::DoCommand() {
 
     // level 0 should have only 1 file
     if (i == 0 && num_files != 1) {
-      exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at "
-        "level 0 after compaction is " + std::to_string(num_files) +
-        ", not 1.\n");
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Number of db files at "
+          "level 0 after compaction is " +
+          ToString(num_files) + ", not 1.\n");
       return;
     }
     // other levels should have no file
     if (i > 0 && num_files != 0) {
-      exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at "
-        "level " + std::to_string(i) + " after compaction is " +
-        std::to_string(num_files) + ", not 0.\n");
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Number of db files at "
+          "level " +
+          ToString(i) + " after compaction is " + ToString(num_files) +
+          ", not 0.\n");
       return;
     }
   }
@@ -1271,9 +1350,19 @@ void ChangeCompactionStyleCommand::DoCommand() {
           files_per_level.c_str());
 }
 
+// ----------------------------------------------------------------------------
+
+namespace {
+
+struct StdErrReporter : public log::Reader::Reporter {
+  virtual void Corruption(size_t bytes, const Status& s) override {
+    cerr << "Corruption detected in log file " << s.ToString() << "\n";
+  }
+};
+
 class InMemoryHandler : public WriteBatch::Handler {
  public:
-  InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) {
+  InMemoryHandler(stringstream& row, bool print_values) : Handler(), row_(row) {
     print_values_ = print_values;
   }
 
@@ -1288,28 +1377,78 @@ class InMemoryHandler : public WriteBatch::Handler {
     }
   }
 
-  virtual void Put(const Slice& key, const Slice& value) {
+  virtual void Put(const Slice& key, const Slice& value) override {
     row_ << "PUT : ";
     commonPutMerge(key, value);
   }
 
-  virtual void Merge(const Slice& key, const Slice& value) {
+  virtual void Merge(const Slice& key, const Slice& value) override {
     row_ << "MERGE : ";
     commonPutMerge(key, value);
   }
 
-  virtual void Delete(const Slice& key) {
+  virtual void Delete(const Slice& key) override {
     row_ <<",DELETE : ";
     row_ << LDBCommand::StringToHex(key.ToString()) << " ";
   }
 
-  virtual ~InMemoryHandler() { };
+  virtual ~InMemoryHandler() {}
 
  private:
   stringstream & row_;
   bool print_values_;
 };
 
+void DumpWalFile(std::string wal_file, bool print_header, bool print_values,
+                 LDBCommandExecuteResult* exec_state) {
+  unique_ptr<SequentialFile> file;
+  Env* env_ = Env::Default();
+  EnvOptions soptions;
+  Status status = env_->NewSequentialFile(wal_file, &file, soptions);
+  if (!status.ok()) {
+    if (exec_state) {
+      *exec_state = LDBCommandExecuteResult::Failed("Failed to open WAL file " +
+                                                    status.ToString());
+    } else {
+      cerr << "Error: Failed to open WAL file " << status.ToString()
+           << std::endl;
+    }
+  } else {
+    StdErrReporter reporter;
+    log::Reader reader(move(file), &reporter, true, 0);
+    string scratch;
+    WriteBatch batch;
+    Slice record;
+    stringstream row;
+    if (print_header) {
+      cout << "Sequence,Count,ByteSize,Physical Offset,Key(s)";
+      if (print_values) {
+        cout << " : value ";
+      }
+      cout << "\n";
+    }
+    while (reader.ReadRecord(&record, &scratch)) {
+      row.str("");
+      if (record.size() < 12) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+      } else {
+        WriteBatchInternal::SetContents(&batch, record);
+        row << WriteBatchInternal::Sequence(&batch) << ",";
+        row << WriteBatchInternal::Count(&batch) << ",";
+        row << WriteBatchInternal::ByteSize(&batch) << ",";
+        row << reader.LastRecordOffset() << ",";
+        InMemoryHandler handler(row, print_values);
+        batch.Iterate(&handler);
+        row << "\n";
+      }
+      cout << row.str();
+    }
+  }
+}
+
+}  // namespace
+
 const string WALDumperCommand::ARG_WAL_FILE = "walfile";
 const string WALDumperCommand::ARG_PRINT_VALUE = "print_value";
 const string WALDumperCommand::ARG_PRINT_HEADER = "header";
@@ -1332,8 +1471,8 @@ WALDumperCommand::WALDumperCommand(const vector<string>& params,
   print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER);
   print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE);
   if (wal_file_.empty()) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-                    "Argument " + ARG_WAL_FILE + " must be specified.");
+    exec_state_ = LDBCommandExecuteResult::Failed("Argument " + ARG_WAL_FILE +
+                                                  " must be specified.");
   }
 }
 
@@ -1347,53 +1486,10 @@ void WALDumperCommand::Help(string& ret) {
 }
 
 void WALDumperCommand::DoCommand() {
-  struct StdErrReporter : public log::Reader::Reporter {
-    virtual void Corruption(size_t bytes, const Status& s) {
-      cerr<<"Corruption detected in log file "<<s.ToString()<<"\n";
-    }
-  };
-
-  unique_ptr<SequentialFile> file;
-  Env* env_ = Env::Default();
-  EnvOptions soptions;
-  Status status = env_->NewSequentialFile(wal_file_, &file, soptions);
-  if (!status.ok()) {
-    exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " +
-      status.ToString());
-  } else {
-    StdErrReporter reporter;
-    log::Reader reader(move(file), &reporter, true, 0);
-    string scratch;
-    WriteBatch batch;
-    Slice record;
-    stringstream row;
-    if (print_header_) {
-      cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)";
-      if (print_values_) {
-        cout << " : value ";
-      }
-      cout << "\n";
-    }
-    while(reader.ReadRecord(&record, &scratch)) {
-      row.str("");
-      if (record.size() < 12) {
-        reporter.Corruption(
-            record.size(), Status::Corruption("log record too small"));
-      } else {
-        WriteBatchInternal::SetContents(&batch, record);
-        row<<WriteBatchInternal::Sequence(&batch)<<",";
-        row<<WriteBatchInternal::Count(&batch)<<",";
-        row<<WriteBatchInternal::ByteSize(&batch)<<",";
-        row<<reader.LastRecordOffset()<<",";
-        InMemoryHandler handler(row, print_values_);
-        batch.Iterate(&handler);
-        row<<"\n";
-      }
-      cout<<row.str();
-    }
-  }
+  DumpWalFile(wal_file_, print_header_, print_values_, &exec_state_);
 }
 
+// ----------------------------------------------------------------------------
 
 GetCommand::GetCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags) :
@@ -1402,8 +1498,8 @@ GetCommand::GetCommand(const vector<string>& params,
                                                         ARG_VALUE_HEX})) {
 
   if (params.size() != 1) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-                    "<key> must be specified for the get command");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "<key> must be specified for the get command");
   } else {
     key_ = params.at(0);
   }
@@ -1428,10 +1524,11 @@ void GetCommand::DoCommand() {
     fprintf(stdout, "%s\n",
               (is_value_hex_ ? StringToHex(value) : value).c_str());
   } else {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
   }
 }
 
+// ----------------------------------------------------------------------------
 
 ApproxSizeCommand::ApproxSizeCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags) :
@@ -1442,16 +1539,16 @@ ApproxSizeCommand::ApproxSizeCommand(const vector<string>& params,
   if (options.find(ARG_FROM) != options.end()) {
     start_key_ = options.find(ARG_FROM)->second;
   } else {
-    exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FROM +
-                    " must be specified for approxsize command");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_FROM + " must be specified for approxsize command");
     return;
   }
 
   if (options.find(ARG_TO) != options.end()) {
     end_key_ = options.find(ARG_TO)->second;
   } else {
-    exec_state_ = LDBCommandExecuteResult::FAILED(ARG_TO +
-                    " must be specified for approxsize command");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_TO + " must be specified for approxsize command");
     return;
   }
 
@@ -1478,11 +1575,12 @@ void ApproxSizeCommand::DoCommand() {
   /* Weird that GetApproximateSizes() returns void, although documentation
    * says that it returns a Status object.
   if (!st.ok()) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
   }
   */
 }
 
+// ----------------------------------------------------------------------------
 
 BatchPutCommand::BatchPutCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags) :
@@ -1491,10 +1589,10 @@ BatchPutCommand::BatchPutCommand(const vector<string>& params,
                                   ARG_CREATE_IF_MISSING})) {
 
   if (params.size() < 2) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
+    exec_state_ = LDBCommandExecuteResult::Failed(
         "At least one <key> <value> pair must be specified batchput.");
   } else if (params.size() % 2 != 0) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
+    exec_state_ = LDBCommandExecuteResult::Failed(
         "Equal number of <key>s and <value>s must be specified for batchput.");
   } else {
     for (size_t i = 0; i < params.size(); i += 2) {
@@ -1519,14 +1617,14 @@ void BatchPutCommand::DoCommand() {
   WriteBatch batch;
 
   for (vector<pair<string, string>>::const_iterator itr
-        = key_values_.begin(); itr != key_values_.end(); itr++) {
+        = key_values_.begin(); itr != key_values_.end(); ++itr) {
       batch.Put(itr->first, itr->second);
   }
   Status st = db_->Write(WriteOptions(), &batch);
   if (st.ok()) {
     fprintf(stdout, "OK\n");
   } else {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
   }
 }
 
@@ -1536,6 +1634,7 @@ Options BatchPutCommand::PrepareOptionsForOpenDB() {
   return opt;
 }
 
+// ----------------------------------------------------------------------------
 
 ScanCommand::ScanCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags) :
@@ -1567,13 +1666,17 @@ ScanCommand::ScanCommand(const vector<string>& params,
   itr = options.find(ARG_MAX_KEYS);
   if (itr != options.end()) {
     try {
+#if defined(CYGWIN)
+      max_keys_scanned_ = strtol(itr->second.c_str(), 0, 10);
+#else
       max_keys_scanned_ = stoi(itr->second);
+#endif
     } catch(const invalid_argument&) {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
-                        " has an invalid value");
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                    " has an invalid value");
     } catch(const out_of_range&) {
-      exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS +
-                        " has a value out-of-range");
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_MAX_KEYS + " has a value out-of-range");
     }
   }
 }
@@ -1619,7 +1722,7 @@ void ScanCommand::DoCommand() {
   for ( ;
         it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_);
         it->Next()) {
-    string key = it->key().ToString();
+    string key = ldb_options_.key_formatter->Format(it->key());
     if (is_db_ttl_) {
       TtlIterator* it_ttl = dynamic_cast<TtlIterator*>(it);
       assert(it_ttl);
@@ -1633,8 +1736,8 @@ void ScanCommand::DoCommand() {
     }
     string value = it->value().ToString();
     fprintf(stdout, "%s : %s\n",
-          (is_key_hex_ ? StringToHex(key) : key).c_str(),
-          (is_value_hex_ ? StringToHex(value) : value).c_str()
+            (is_key_hex_ ? "0x" + it->key().ToString(true) : key).c_str(),
+            (is_value_hex_ ? StringToHex(value) : value).c_str()
         );
     num_keys_scanned++;
     if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) {
@@ -1642,11 +1745,12 @@ void ScanCommand::DoCommand() {
     }
   }
   if (!it->status().ok()) {  // Check for any errors found during the scan
-    exec_state_ = LDBCommandExecuteResult::FAILED(it->status().ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(it->status().ToString());
   }
   delete it;
 }
 
+// ----------------------------------------------------------------------------
 
 DeleteCommand::DeleteCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags) :
@@ -1654,8 +1758,8 @@ DeleteCommand::DeleteCommand(const vector<string>& params,
              BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
 
   if (params.size() != 1) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-                    "KEY must be specified for the delete command");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "KEY must be specified for the delete command");
   } else {
     key_ = params.at(0);
     if (is_key_hex_) {
@@ -1675,7 +1779,7 @@ void DeleteCommand::DoCommand() {
   if (st.ok()) {
     fprintf(stdout, "OK\n");
   } else {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
   }
 }
 
@@ -1687,8 +1791,8 @@ PutCommand::PutCommand(const vector<string>& params,
                                   ARG_CREATE_IF_MISSING})) {
 
   if (params.size() != 2) {
-    exec_state_ = LDBCommandExecuteResult::FAILED(
-                    "<key> and <value> must be specified for the put command");
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "<key> and <value> must be specified for the put command");
   } else {
     key_ = params.at(0);
     value_ = params.at(1);
@@ -1716,7 +1820,7 @@ void PutCommand::DoCommand() {
   if (st.ok()) {
     fprintf(stdout, "OK\n");
   } else {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
   }
 }
 
@@ -1726,6 +1830,7 @@ Options PutCommand::PrepareOptionsForOpenDB() {
   return opt;
 }
 
+// ----------------------------------------------------------------------------
 
 const char* DBQuerierCommand::HELP_CMD = "help";
 const char* DBQuerierCommand::GET_CMD = "get";
@@ -1807,6 +1912,8 @@ void DBQuerierCommand::DoCommand() {
   }
 }
 
+// ----------------------------------------------------------------------------
+
 CheckConsistencyCommand::CheckConsistencyCommand(const vector<string>& params,
     const map<string, string>& options, const vector<string>& flags) :
   LDBCommand(options, flags, false,
@@ -1831,7 +1938,119 @@ void CheckConsistencyCommand::DoCommand() {
   if (st.ok()) {
     fprintf(stdout, "OK\n");
   } else {
-    exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+void DumpSstFile(std::string filename, bool output_hex, bool show_properties) {
+  std::string from_key;
+  std::string to_key;
+  if (filename.length() <= 4 ||
+      filename.rfind(".sst") != filename.length() - 4) {
+    std::cout << "Invalid sst file name." << std::endl;
+    return;
+  }
+  // no verification
+  rocksdb::SstFileReader reader(filename, false, output_hex);
+  Status st = reader.ReadSequential(true, -1, false,  // has_from
+                                    from_key, false,  // has_to
+                                    to_key);
+  if (!st.ok()) {
+    std::cerr << "Error in reading SST file " << filename << st.ToString()
+              << std::endl;
+    return;
+  }
+
+  if (show_properties) {
+    const rocksdb::TableProperties* table_properties;
+
+    std::shared_ptr<const rocksdb::TableProperties>
+        table_properties_from_reader;
+    st = reader.ReadTableProperties(&table_properties_from_reader);
+    if (!st.ok()) {
+      std::cerr << filename << ": " << st.ToString()
+                << ". Try to use initial table properties" << std::endl;
+      table_properties = reader.GetInitTableProperties();
+    } else {
+      table_properties = table_properties_from_reader.get();
+    }
+    if (table_properties != nullptr) {
+      std::cout << std::endl << "Table Properties:" << std::endl;
+      std::cout << table_properties->ToString("\n") << std::endl;
+      std::cout << "# deleted keys: "
+                << rocksdb::GetDeletedKeys(
+                       table_properties->user_collected_properties)
+                << std::endl;
+    }
+  }
+}
+
+}  // namespace
+
+DBFileDumperCommand::DBFileDumperCommand(const vector<string>& params,
+                                         const map<string, string>& options,
+                                         const vector<string>& flags)
+    : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {}
+
+void DBFileDumperCommand::Help(string& ret) {
+  ret.append("  ");
+  ret.append(DBFileDumperCommand::Name());
+  ret.append("\n");
+}
+
+void DBFileDumperCommand::DoCommand() {
+  if (!db_) {
+    return;
+  }
+  Status s;
+
+  std::cout << "Manifest File" << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::string manifest_filename;
+  s = ReadFileToString(db_->GetEnv(), CurrentFileName(db_->GetName()),
+                       &manifest_filename);
+  if (!s.ok() || manifest_filename.empty() ||
+      manifest_filename.back() != '\n') {
+    std::cerr << "Error when reading CURRENT file "
+              << CurrentFileName(db_->GetName()) << std::endl;
+  }
+  // remove the trailing '\n'
+  manifest_filename.resize(manifest_filename.size() - 1);
+  string manifest_filepath = db_->GetName() + "/" + manifest_filename;
+  std::cout << manifest_filepath << std::endl;
+  DumpManifestFile(manifest_filepath, false, false);
+  std::cout << std::endl;
+
+  std::cout << "SST Files" << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  for (auto& fileMetadata : metadata) {
+    std::string filename = fileMetadata.db_path + fileMetadata.name;
+    std::cout << filename << " level:" << fileMetadata.level << std::endl;
+    std::cout << "------------------------------" << std::endl;
+    DumpSstFile(filename, false, true);
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+  std::cout << "Write Ahead Log Files" << std::endl;
+  std::cout << "==============================" << std::endl;
+  rocksdb::VectorLogPtr wal_files;
+  s = db_->GetSortedWalFiles(wal_files);
+  if (!s.ok()) {
+    std::cerr << "Error when getting WAL files" << std::endl;
+  } else {
+    for (auto& wal : wal_files) {
+      // TODO(qyang): option.wal_dir should be passed into ldb command
+      std::string filename = db_->GetOptions().wal_dir + wal->PathName();
+      std::cout << filename << std::endl;
+      DumpWalFile(filename, true, true, &exec_state_);
+    }
   }
 }
 
diff --git a/src/rocksdb/util/ldb_cmd.h b/src/rocksdb/util/ldb_cmd.h
index 4f760e0..50de4de 100644
--- a/src/rocksdb/util/ldb_cmd.h
+++ b/src/rocksdb/util/ldb_cmd.h
@@ -4,22 +4,28 @@
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
 #pragma once
+
+#ifndef ROCKSDB_LITE
+
 #include <string>
 #include <iostream>
 #include <sstream>
 #include <stdlib.h>
 #include <algorithm>
 #include <stdio.h>
+#include <vector>
+#include <map>
 
 #include "db/version_set.h"
 #include "rocksdb/env.h"
-#include "rocksdb/options.h"
 #include "rocksdb/iterator.h"
+#include "rocksdb/ldb_tool.h"
+#include "rocksdb/options.h"
 #include "rocksdb/slice.h"
+#include "rocksdb/utilities/db_ttl.h"
 #include "util/logging.h"
 #include "util/ldb_cmd_execute_result.h"
 #include "util/string_util.h"
-#include "utilities/db_ttl.h"
 #include "utilities/ttl/db_ttl_impl.h"
 
 using std::string;
@@ -45,32 +51,40 @@ public:
   static const string ARG_TO;
   static const string ARG_MAX_KEYS;
   static const string ARG_BLOOM_BITS;
+  static const string ARG_FIX_PREFIX_LEN;
   static const string ARG_COMPRESSION_TYPE;
   static const string ARG_BLOCK_SIZE;
   static const string ARG_AUTO_COMPACTION;
+  static const string ARG_DB_WRITE_BUFFER_SIZE;
   static const string ARG_WRITE_BUFFER_SIZE;
   static const string ARG_FILE_SIZE;
   static const string ARG_CREATE_IF_MISSING;
 
   static LDBCommand* InitFromCmdLineArgs(
     const vector<string>& args,
-    const Options& options = Options()
+    const Options& options,
+    const LDBOptions& ldb_options
   );
 
   static LDBCommand* InitFromCmdLineArgs(
     int argc,
     char** argv,
-    const Options& options = Options()
+    const Options& options,
+    const LDBOptions& ldb_options
   );
 
   bool ValidateCmdLineOptions();
 
   virtual Options PrepareOptionsForOpenDB();
 
-  virtual void SetOptions(Options options) {
+  virtual void SetDBOptions(Options options) {
     options_ = options;
   }
 
+  void SetLDBOptions(const LDBOptions& ldb_options) {
+    ldb_options_ = ldb_options;
+  }
+
   virtual bool NoDBOpen() {
     return false;
   }
@@ -97,7 +111,7 @@ public:
 
     DoCommand();
     if (exec_state_.IsNotStarted()) {
-      exec_state_ = LDBCommandExecuteResult::SUCCEED("");
+      exec_state_ = LDBCommandExecuteResult::Succeed("");
     }
 
     if (db_ != nullptr) {
@@ -229,7 +243,7 @@ protected:
     }
     if (!st.ok()) {
       string msg = st.ToString();
-      exec_state_ = LDBCommandExecuteResult::FAILED(msg);
+      exec_state_ = LDBCommandExecuteResult::Failed(msg);
     }
 
     options_ = opt;
@@ -276,10 +290,11 @@ protected:
    * used by this command.  It includes the common options and the ones
    * passed in.
    */
-  vector<string> BuildCmdLineOptions(vector<string> options) {
-    vector<string> ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE,
-                          ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE,
-                          ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE};
+  static vector<string> BuildCmdLineOptions(vector<string> options) {
+    vector<string> ret = {ARG_DB,               ARG_BLOOM_BITS,
+                          ARG_BLOCK_SIZE,       ARG_AUTO_COMPACTION,
+                          ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE,
+                          ARG_FILE_SIZE,        ARG_FIX_PREFIX_LEN};
     ret.insert(ret.end(), options.begin(), options.end());
     return ret;
   }
@@ -291,6 +306,7 @@ protected:
                          const string& option, string* value);
 
   Options options_;
+  LDBOptions ldb_options_;
 
 private:
 
@@ -369,7 +385,7 @@ public:
 
   static void Help(string& ret);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
 private:
   bool null_from_;
@@ -378,6 +394,19 @@ private:
   string to_;
 };
 
+class DBFileDumperCommand : public LDBCommand {
+ public:
+  static string Name() { return "dump_live_files"; }
+
+  DBFileDumperCommand(const vector<string>& params,
+                      const map<string, string>& options,
+                      const vector<string>& flags);
+
+  static void Help(string& ret);
+
+  virtual void DoCommand() override;
+};
+
 class DBDumperCommand: public LDBCommand {
 public:
   static string Name() { return "dump"; }
@@ -387,7 +416,7 @@ public:
 
   static void Help(string& ret);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
 private:
   bool null_from_;
@@ -416,7 +445,7 @@ public:
 
   static void Help(string& ret);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
 private:
   bool has_from_;
@@ -447,9 +476,9 @@ public:
       const map<string, string>& options, const vector<string>& flags);
 
   static void Help(string& ret);
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
-  virtual Options PrepareOptionsForOpenDB();
+  virtual Options PrepareOptionsForOpenDB() override;
 
 private:
   bool create_if_missing_;
@@ -470,11 +499,9 @@ public:
       const map<string, string>& options, const vector<string>& flags);
 
   static void Help(string& ret);
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
-  virtual bool NoDBOpen() {
-    return true;
-  }
+  virtual bool NoDBOpen() override { return true; }
 
 private:
   bool verbose_;
@@ -493,9 +520,9 @@ class ListColumnFamiliesCommand : public LDBCommand {
                             const vector<string>& flags);
 
   static void Help(string& ret);
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
-  virtual bool NoDBOpen() { return true; }
+  virtual bool NoDBOpen() override { return true; }
 
  private:
   string dbname_;
@@ -508,13 +535,11 @@ public:
   ReduceDBLevelsCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags);
 
-  virtual Options PrepareOptionsForOpenDB();
+  virtual Options PrepareOptionsForOpenDB() override;
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
-  virtual bool NoDBOpen() {
-    return true;
-  }
+  virtual bool NoDBOpen() override { return true; }
 
   static void Help(string& msg);
 
@@ -539,9 +564,9 @@ public:
   ChangeCompactionStyleCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags);
 
-  virtual Options PrepareOptionsForOpenDB();
+  virtual Options PrepareOptionsForOpenDB() override;
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
   static void Help(string& msg);
 
@@ -560,12 +585,10 @@ public:
   WALDumperCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags);
 
-  virtual bool  NoDBOpen() {
-    return true;
-  }
+  virtual bool NoDBOpen() override { return true; }
 
   static void Help(string& ret);
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
 private:
   bool print_header_;
@@ -585,7 +608,7 @@ public:
   GetCommand(const vector<string>& params, const map<string, string>& options,
       const vector<string>& flags);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
   static void Help(string& ret);
 
@@ -600,7 +623,7 @@ public:
   ApproxSizeCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
   static void Help(string& ret);
 
@@ -616,11 +639,11 @@ public:
   BatchPutCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
   static void Help(string& ret);
 
-  virtual Options PrepareOptionsForOpenDB();
+  virtual Options PrepareOptionsForOpenDB() override;
 
 private:
   /**
@@ -636,7 +659,7 @@ public:
   ScanCommand(const vector<string>& params, const map<string, string>& options,
       const vector<string>& flags);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
   static void Help(string& ret);
 
@@ -655,7 +678,7 @@ public:
   DeleteCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
   static void Help(string& ret);
 
@@ -670,11 +693,11 @@ public:
   PutCommand(const vector<string>& params, const map<string, string>& options,
       const vector<string>& flags);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
   static void Help(string& ret);
 
-  virtual Options PrepareOptionsForOpenDB();
+  virtual Options PrepareOptionsForOpenDB() override;
 
 private:
   string key_;
@@ -694,7 +717,7 @@ public:
 
   static void Help(string& ret);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
 private:
   static const char* HELP_CMD;
@@ -710,13 +733,13 @@ public:
   CheckConsistencyCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags);
 
-  virtual void DoCommand();
+  virtual void DoCommand() override;
 
-  virtual bool NoDBOpen() {
-    return true;
-  }
+  virtual bool NoDBOpen() override { return true; }
 
   static void Help(string& ret);
 };
 
 } // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/ldb_cmd_execute_result.h b/src/rocksdb/util/ldb_cmd_execute_result.h
index b9121b2..35e9610 100644
--- a/src/rocksdb/util/ldb_cmd_execute_result.h
+++ b/src/rocksdb/util/ldb_cmd_execute_result.h
@@ -13,15 +13,10 @@ public:
     EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2,
   };
 
-  LDBCommandExecuteResult() {
-    state_ = EXEC_NOT_STARTED;
-    message_ = "";
-  }
+  LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {}
 
-  LDBCommandExecuteResult(State state, std::string& msg) {
-    state_ = state;
-    message_ = msg;
-  }
+  LDBCommandExecuteResult(State state, std::string& msg) :
+    state_(state), message_(msg) {}
 
   std::string ToString() {
     std::string ret;
@@ -57,11 +52,11 @@ public:
     return state_ == EXEC_FAILED;
   }
 
-  static LDBCommandExecuteResult SUCCEED(std::string msg) {
+  static LDBCommandExecuteResult Succeed(std::string msg) {
     return LDBCommandExecuteResult(EXEC_SUCCEED, msg);
   }
 
-  static LDBCommandExecuteResult FAILED(std::string msg) {
+  static LDBCommandExecuteResult Failed(std::string msg) {
     return LDBCommandExecuteResult(EXEC_FAILED, msg);
   }
 
diff --git a/src/rocksdb/util/ldb_tool.cc b/src/rocksdb/util/ldb_tool.cc
index 8439b63..fe84fa9 100644
--- a/src/rocksdb/util/ldb_tool.cc
+++ b/src/rocksdb/util/ldb_tool.cc
@@ -9,6 +9,17 @@
 
 namespace rocksdb {
 
+class DefaultSliceFormatter : public SliceFormatter {
+ public:
+  virtual std::string Format(const Slice& s) const override {
+    return s.ToString();
+  }
+};
+
+LDBOptions::LDBOptions()
+    : key_formatter(new DefaultSliceFormatter()) {
+}
+
 class LDBCommandRunner {
 public:
 
@@ -36,11 +47,14 @@ public:
         " with 'put','get','scan','dump','query','batchput'"
         " : DB supports ttl and value is internally timestamp-suffixed\n");
     ret.append("  --" + LDBCommand::ARG_BLOOM_BITS + "=<int,e.g.:14>\n");
+    ret.append("  --" + LDBCommand::ARG_FIX_PREFIX_LEN + "=<int,e.g.:14>\n");
     ret.append("  --" + LDBCommand::ARG_COMPRESSION_TYPE +
         "=<no|snappy|zlib|bzip2>\n");
     ret.append("  --" + LDBCommand::ARG_BLOCK_SIZE +
         "=<block_size_in_bytes>\n");
     ret.append("  --" + LDBCommand::ARG_AUTO_COMPACTION + "=<true|false>\n");
+    ret.append("  --" + LDBCommand::ARG_DB_WRITE_BUFFER_SIZE +
+        "=<int,e.g.:16777216>\n");
     ret.append("  --" + LDBCommand::ARG_WRITE_BUFFER_SIZE +
         "=<int,e.g.:4194304>\n");
     ret.append("  --" + LDBCommand::ARG_FILE_SIZE + "=<int,e.g.:2097152>\n");
@@ -66,18 +80,21 @@ public:
     DBLoaderCommand::Help(ret);
     ManifestDumpCommand::Help(ret);
     ListColumnFamiliesCommand::Help(ret);
+    DBFileDumperCommand::Help(ret);
     InternalDumpCommand::Help(ret);
 
     fprintf(stderr, "%s\n", ret.c_str());
   }
 
-  static void RunCommand(int argc, char** argv, Options options) {
+  static void RunCommand(int argc, char** argv, Options options,
+                         const LDBOptions& ldb_options) {
     if (argc <= 2) {
       PrintHelp(argv[0]);
       exit(1);
     }
 
-    LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options);
+    LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options,
+                                                         ldb_options);
     if (cmdObj == nullptr) {
       fprintf(stderr, "Unknown command\n");
       PrintHelp(argv[0]);
@@ -99,8 +116,9 @@ public:
 };
 
 
-void LDBTool::Run(int argc, char** argv, Options options) {
-  LDBCommandRunner::RunCommand(argc, argv, options);
+void LDBTool::Run(int argc, char** argv, Options options,
+                  const LDBOptions& ldb_options) {
+  LDBCommandRunner::RunCommand(argc, argv, options, ldb_options);
 }
 } // namespace rocksdb
 
diff --git a/src/rocksdb/util/log_buffer.cc b/src/rocksdb/util/log_buffer.cc
index 726c014..ddddaec 100644
--- a/src/rocksdb/util/log_buffer.cc
+++ b/src/rocksdb/util/log_buffer.cc
@@ -13,17 +13,17 @@ LogBuffer::LogBuffer(const InfoLogLevel log_level,
                      Logger*info_log)
     : log_level_(log_level), info_log_(info_log) {}
 
-void LogBuffer::AddLogToBuffer(const char* format, va_list ap) {
+void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format,
+                               va_list ap) {
   if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) {
     // Skip the level because of its level.
     return;
   }
 
-  const size_t kLogSizeLimit = 512;
-  char* alloc_mem = arena_.AllocateAligned(kLogSizeLimit);
+  char* alloc_mem = arena_.AllocateAligned(max_log_size);
   BufferedLog* buffered_log = new (alloc_mem) BufferedLog();
   char* p = buffered_log->message;
-  char* limit = alloc_mem + kLogSizeLimit - 1;
+  char* limit = alloc_mem + max_log_size - 1;
 
   // store the time
   gettimeofday(&(buffered_log->now_tv), nullptr);
@@ -61,11 +61,22 @@ void LogBuffer::FlushBufferToLog() {
   logs_.clear();
 }
 
+void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format,
+                 ...) {
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(max_log_size, format, ap);
+    va_end(ap);
+  }
+}
+
 void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) {
+  const size_t kDefaultMaxLogSize = 512;
   if (log_buffer != nullptr) {
     va_list ap;
     va_start(ap, format);
-    log_buffer->AddLogToBuffer(format, ap);
+    log_buffer->AddLogToBuffer(kDefaultMaxLogSize, format, ap);
     va_end(ap);
   }
 }
diff --git a/src/rocksdb/util/log_buffer.h b/src/rocksdb/util/log_buffer.h
index 8ebe92e..b5cf1d5 100644
--- a/src/rocksdb/util/log_buffer.h
+++ b/src/rocksdb/util/log_buffer.h
@@ -5,9 +5,11 @@
 
 #pragma once
 
+#include <sys/time.h>
 #include "rocksdb/env.h"
 #include "util/arena.h"
 #include "util/autovector.h"
+#include <ctime>
 
 namespace rocksdb {
 
@@ -20,8 +22,9 @@ class LogBuffer {
   // info_log:  logger to write the logs to
   LogBuffer(const InfoLogLevel log_level, Logger* info_log);
 
-  // Add a log entry to the buffer.
-  void AddLogToBuffer(const char* format, va_list ap);
+  // Add a log entry to the buffer. Use default max_log_size.
+  // max_log_size indicates maximize log size, including some metadata.
+  void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap);
 
   size_t IsEmpty() const { return logs_.empty(); }
 
@@ -43,6 +46,10 @@ class LogBuffer {
 
 // Add log to the LogBuffer for a delayed info logging. It can be used when
 // we want to add some logs inside a mutex.
+// max_log_size indicates maximize log size, including some metadata.
+extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size,
+                        const char* format, ...);
+// Same as previous function, but with default max log size.
 extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/log_write_bench.cc b/src/rocksdb/util/log_write_bench.cc
index 536d14f..16e7af7 100644
--- a/src/rocksdb/util/log_write_bench.cc
+++ b/src/rocksdb/util/log_write_bench.cc
@@ -3,6 +3,14 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
 #include <gflags/gflags.h>
 
 #include "rocksdb/env.h"
@@ -10,6 +18,9 @@
 #include "util/testharness.h"
 #include "util/testutil.h"
 
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::SetUsageMessage;
+
 // A simple benchmark to simulate transactional logs
 
 DEFINE_int32(num_records, 6000, "Number of records.");
@@ -60,10 +71,12 @@ void RunBenchmark() {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
-                          " [OPTIONS]...");
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
 
   rocksdb::RunBenchmark();
   return 0;
 }
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/util/logging.cc b/src/rocksdb/util/logging.cc
index 02e3560..8917d09 100644
--- a/src/rocksdb/util/logging.cc
+++ b/src/rocksdb/util/logging.cc
@@ -9,6 +9,12 @@
 
 #include "util/logging.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <cmath>
+#include <inttypes.h>
 #include <errno.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -18,9 +24,56 @@
 
 namespace rocksdb {
 
+// for micros < 10ms, print "XX us".
+// for micros < 10sec, print "XX ms".
+// for micros >= 10 sec, print "XX sec".
+// for micros <= 1 hour, print Y:X M:S".
+// for micros > 1 hour, print Z:Y:X H:M:S".
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format) {
+  if (micros < 10000 && !fixed_format) {
+    return snprintf(output, len, "%" PRIu64 " us", micros);
+  } else if (micros < 10000000 && !fixed_format) {
+    return snprintf(output, len, "%.3lf ms",
+                    static_cast<double>(micros) / 1000);
+  } else if (micros < 1000000l * 60 && !fixed_format) {
+    return snprintf(output, len, "%.3lf sec",
+                    static_cast<double>(micros) / 1000000);
+  } else if (micros < 1000000ll * 60 * 60 && !fixed_format) {
+    return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S",
+        micros / 1000000 / 60,
+        static_cast<double>(micros % 60000000) / 1000000);
+  } else {
+    return snprintf(output, len,
+        "%02" PRIu64 ":%02" PRIu64 ":%05.3f H:M:S",
+        micros / 1000000 / 3600,
+        (micros / 1000000 / 60) % 60,
+        static_cast<double>(micros % 60000000) / 1000000);
+  }
+}
+
+// for sizes >=10TB, print "XXTB"
+// for sizes >=10GB, print "XXGB"
+// etc.
+// append file size summary to output and return the len
+int AppendHumanBytes(uint64_t bytes, char* output, int len) {
+  const uint64_t ull10 = 10;
+  if (bytes >= ull10 << 40) {
+    return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40);
+  } else if (bytes >= ull10 << 30) {
+    return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30);
+  } else if (bytes >= ull10 << 20) {
+    return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20);
+  } else if (bytes >= ull10 << 10) {
+    return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10);
+  } else {
+    return snprintf(output, len, "%" PRIu64 "B", bytes);
+  }
+}
+
 void AppendNumberTo(std::string* str, uint64_t num) {
   char buf[30];
-  snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num);
+  snprintf(buf, sizeof(buf), "%" PRIu64, num);
   str->append(buf);
 }
 
@@ -44,6 +97,21 @@ std::string NumberToString(uint64_t num) {
   return r;
 }
 
+std::string NumberToHumanString(int64_t num) {
+  char buf[16];
+  int64_t absnum = num < 0 ? -num : num;
+  if (absnum < 10000) {
+    snprintf(buf, sizeof(buf), "%" PRIi64, num);
+  } else if (absnum < 10000000) {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000);
+  } else if (absnum < 10000000000LL) {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000);
+  } else {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000);
+  }
+  return std::string(buf);
+}
+
 std::string EscapeString(const Slice& value) {
   std::string r;
   AppendEscapedStringTo(&r, value);
diff --git a/src/rocksdb/util/logging.h b/src/rocksdb/util/logging.h
index d8ce45e..10801bb 100644
--- a/src/rocksdb/util/logging.h
+++ b/src/rocksdb/util/logging.h
@@ -19,7 +19,13 @@
 namespace rocksdb {
 
 class Slice;
-class WritableFile;
+
+// Append a human-readable time in micros.
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format);
+
+// Append a human-readable size in bytes
+int AppendHumanBytes(uint64_t bytes, char* output, int len);
 
 // Append a human-readable printout of "num" to *str
 extern void AppendNumberTo(std::string* str, uint64_t num);
@@ -28,9 +34,15 @@ extern void AppendNumberTo(std::string* str, uint64_t num);
 // Escapes any non-printable characters found in "value".
 extern void AppendEscapedStringTo(std::string* str, const Slice& value);
 
-// Return a human-readable printout of "num"
+// Return a string printout of "num"
 extern std::string NumberToString(uint64_t num);
 
+// Return a human-readable version of num.
+// for num >= 10.000, prints "xxK"
+// for num >= 10.000.000, prints "xxM"
+// for num >= 10.000.000.000, prints "xxG"
+extern std::string NumberToHumanString(int64_t num);
+
 // Return a human-readable version of "value".
 // Escapes any non-printable characters found in "value".
 extern std::string EscapeString(const Slice& value);
diff --git a/src/rocksdb/util/manual_compaction_test.cc b/src/rocksdb/util/manual_compaction_test.cc
index dd615f0..6eedd03 100644
--- a/src/rocksdb/util/manual_compaction_test.cc
+++ b/src/rocksdb/util/manual_compaction_test.cc
@@ -30,7 +30,7 @@ std::string Key2(int i) {
   return Key1(i) + "_xxx";
 }
 
-class ManualCompactionTest {
+class ManualCompactionTest : public testing::Test {
  public:
   ManualCompactionTest() {
     // Get rid of any state from an old run.
@@ -45,20 +45,18 @@ class DestroyAllCompactionFilter : public CompactionFilter {
  public:
   DestroyAllCompactionFilter() {}
 
-  virtual bool Filter(int level,
-                      const Slice& key,
-                      const Slice& existing_value,
+  virtual bool Filter(int level, const Slice& key, const Slice& existing_value,
                       std::string* new_value,
-                      bool* value_changed) const {
+                      bool* value_changed) const override {
     return existing_value.ToString() == "destroy";
   }
 
-  virtual const char* Name() const {
+  virtual const char* Name() const override {
     return "DestroyAllCompactionFilter";
   }
 };
 
-TEST(ManualCompactionTest, CompactTouchesAllKeys) {
+TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
   for (int iter = 0; iter < 2; ++iter) {
     DB* db;
     Options options;
@@ -94,8 +92,7 @@ TEST(ManualCompactionTest, CompactTouchesAllKeys) {
   }
 }
 
-TEST(ManualCompactionTest, Test) {
-
+TEST_F(ManualCompactionTest, Test) {
   // Open database.  Disable compression since it affects the creation
   // of layers and the code below is trying to test against a very
   // specific scenario.
@@ -152,5 +149,6 @@ TEST(ManualCompactionTest, Test) {
 }  // anonymous namespace
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/memenv.cc b/src/rocksdb/util/memenv.cc
new file mode 100644
index 0000000..c89411f
--- /dev/null
+++ b/src/rocksdb/util/memenv.cc
@@ -0,0 +1,433 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include <map>
+#include <string.h>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+
+namespace {
+
+std::string NormalizeFileName(const std::string fname) {
+  if (fname.find("//") == std::string::npos) {
+    return fname;
+  }
+  std::string out_name = "";
+  bool is_slash = false;
+  for (char c : fname) {
+    if (c == '/' && is_slash) {
+      continue;
+    }
+    out_name.append(1, c);
+    if (c == '/') {
+      is_slash = true;
+    } else {
+      is_slash = false;
+    }
+  }
+  return out_name;
+}
+
+class FileState {
+ public:
+  // FileStates are reference counted. The initial reference count is zero
+  // and the caller must call Ref() at least once.
+  FileState() : refs_(0), size_(0) {}
+
+  // Increase the reference count.
+  void Ref() {
+    MutexLock lock(&refs_mutex_);
+    ++refs_;
+  }
+
+  // Decrease the reference count. Delete if this is the last reference.
+  void Unref() {
+    bool do_delete = false;
+
+    {
+      MutexLock lock(&refs_mutex_);
+      --refs_;
+      assert(refs_ >= 0);
+      if (refs_ <= 0) {
+        do_delete = true;
+      }
+    }
+
+    if (do_delete) {
+      delete this;
+    }
+  }
+
+  uint64_t Size() const { return size_; }
+
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
+    if (offset > size_) {
+      return Status::IOError("Offset greater than file size.");
+    }
+    const uint64_t available = size_ - offset;
+    if (n > available) {
+      n = available;
+    }
+    if (n == 0) {
+      *result = Slice();
+      return Status::OK();
+    }
+
+    size_t block = offset / kBlockSize;
+    size_t block_offset = offset % kBlockSize;
+
+    if (n <= kBlockSize - block_offset) {
+      // The requested bytes are all in the first block.
+      *result = Slice(blocks_[block] + block_offset, n);
+      return Status::OK();
+    }
+
+    size_t bytes_to_copy = n;
+    char* dst = scratch;
+
+    while (bytes_to_copy > 0) {
+      size_t avail = kBlockSize - block_offset;
+      if (avail > bytes_to_copy) {
+        avail = bytes_to_copy;
+      }
+      memcpy(dst, blocks_[block] + block_offset, avail);
+
+      bytes_to_copy -= avail;
+      dst += avail;
+      block++;
+      block_offset = 0;
+    }
+
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+  Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t src_len = data.size();
+
+    while (src_len > 0) {
+      size_t avail;
+      size_t offset = size_ % kBlockSize;
+
+      if (offset != 0) {
+        // There is some room in the last block.
+        avail = kBlockSize - offset;
+      } else {
+        // No room in the last block; push new one.
+        blocks_.push_back(new char[kBlockSize]);
+        avail = kBlockSize;
+      }
+
+      if (avail > src_len) {
+        avail = src_len;
+      }
+      memcpy(blocks_.back() + offset, src, avail);
+      src_len -= avail;
+      src += avail;
+      size_ += avail;
+    }
+
+    return Status::OK();
+  }
+
+ private:
+  // Private since only Unref() should be used to delete it.
+  ~FileState() {
+    for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
+         ++i) {
+      delete [] *i;
+    }
+  }
+
+  // No copying allowed.
+  FileState(const FileState&);
+  void operator=(const FileState&);
+
+  port::Mutex refs_mutex_;
+  int refs_;  // Protected by refs_mutex_;
+
+  // The following fields are not protected by any mutex. They are only mutable
+  // while the file is being written, and concurrent access is not allowed
+  // to writable files.
+  std::vector<char*> blocks_;
+  uint64_t size_;
+
+  enum { kBlockSize = 8 * 1024 };
+};
+
+class SequentialFileImpl : public SequentialFile {
+ public:
+  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
+    file_->Ref();
+  }
+
+  ~SequentialFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) override {
+    Status s = file_->Read(pos_, n, result, scratch);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) override {
+    if (pos_ > file_->Size()) {
+      return Status::IOError("pos_ > file_->Size()");
+    }
+    const size_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += n;
+    return Status::OK();
+  }
+
+ private:
+  FileState* file_;
+  size_t pos_;
+};
+
+class RandomAccessFileImpl : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~RandomAccessFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const override {
+    return file_->Read(offset, n, result, scratch);
+  }
+
+ private:
+  FileState* file_;
+};
+
+class WritableFileImpl : public WritableFile {
+ public:
+  WritableFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~WritableFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Append(const Slice& data) override {
+    return file_->Append(data);
+  }
+
+  virtual Status Close() override { return Status::OK(); }
+  virtual Status Flush() override { return Status::OK(); }
+  virtual Status Sync() override { return Status::OK(); }
+
+ private:
+  FileState* file_;
+};
+
+class InMemoryDirectory : public Directory {
+ public:
+  virtual Status Fsync() override { return Status::OK(); }
+};
+
+class InMemoryEnv : public EnvWrapper {
+ public:
+  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
+
+  virtual ~InMemoryEnv() {
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      i->second->Unref();
+    }
+  }
+
+  // Partial implementation of the Env interface.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& soptions) override {
+    std::string nfname = NormalizeFileName(fname);
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new SequentialFileImpl(file_map_[nfname]));
+    return Status::OK();
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& soptions) override {
+    std::string nfname = NormalizeFileName(fname);
+    MutexLock lock(&mutex_);
+    if (file_map_.find(nfname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new RandomAccessFileImpl(file_map_[nfname]));
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) override {
+    std::string nfname = NormalizeFileName(fname);
+    MutexLock lock(&mutex_);
+    if (file_map_.find(nfname) != file_map_.end()) {
+      DeleteFileInternal(nfname);
+    }
+
+    FileState* file = new FileState();
+    file->Ref();
+    file_map_[nfname] = file;
+
+    result->reset(new WritableFileImpl(file));
+    return Status::OK();
+  }
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) override {
+    result->reset(new InMemoryDirectory());
+    return Status::OK();
+  }
+
+  virtual bool FileExists(const std::string& fname) override {
+    std::string nfname = NormalizeFileName(fname);
+    MutexLock lock(&mutex_);
+    return file_map_.find(nfname) != file_map_.end();
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) override {
+    MutexLock lock(&mutex_);
+    result->clear();
+
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      const std::string& filename = i->first;
+
+      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
+          Slice(filename).starts_with(Slice(dir))) {
+        result->push_back(filename.substr(dir.size() + 1));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  void DeleteFileInternal(const std::string& fname) {
+    if (file_map_.find(fname) == file_map_.end()) {
+      return;
+    }
+
+    file_map_[fname]->Unref();
+    file_map_.erase(fname);
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    std::string nfname = NormalizeFileName(fname);
+    MutexLock lock(&mutex_);
+    if (file_map_.find(nfname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    DeleteFileInternal(nfname);
+    return Status::OK();
+  }
+
+  virtual Status CreateDir(const std::string& dirname) override {
+    return Status::OK();
+  }
+
+  virtual Status CreateDirIfMissing(const std::string& dirname) override {
+    return Status::OK();
+  }
+
+  virtual Status DeleteDir(const std::string& dirname) override {
+    return Status::OK();
+  }
+
+  virtual Status GetFileSize(const std::string& fname,
+                             uint64_t* file_size) override {
+    std::string nfname = NormalizeFileName(fname);
+    MutexLock lock(&mutex_);
+
+    if (file_map_.find(nfname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    *file_size = file_map_[nfname]->Size();
+    return Status::OK();
+  }
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) override {
+    return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
+  }
+
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& dest) override {
+    std::string nsrc = NormalizeFileName(src);
+    std::string ndest = NormalizeFileName(dest);
+    MutexLock lock(&mutex_);
+    if (file_map_.find(nsrc) == file_map_.end()) {
+      return Status::IOError(src, "File not found");
+    }
+
+    DeleteFileInternal(dest);
+    file_map_[ndest] = file_map_[nsrc];
+    file_map_.erase(nsrc);
+    return Status::OK();
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) override {
+    *lock = new FileLock;
+    return Status::OK();
+  }
+
+  virtual Status UnlockFile(FileLock* lock) override {
+    delete lock;
+    return Status::OK();
+  }
+
+  virtual Status GetTestDirectory(std::string* path) override {
+    *path = "/test";
+    return Status::OK();
+  }
+
+ private:
+  // Map from filenames to FileState objects, representing a simple file system.
+  typedef std::map<std::string, FileState*> FileSystem;
+  port::Mutex mutex_;
+  FileSystem file_map_;  // Protected by mutex_.
+};
+
+}  // namespace
+
+Env* NewMemEnv(Env* base_env) {
+  return new InMemoryEnv(base_env);
+}
+
+#else  // ROCKSDB_LITE
+
+Env* NewMemEnv(Env* base_env) { return nullptr; }
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/memenv_test.cc b/src/rocksdb/util/memenv_test.cc
new file mode 100644
index 0000000..9222dc6
--- /dev/null
+++ b/src/rocksdb/util/memenv_test.cc
@@ -0,0 +1,241 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+class MemEnvTest : public testing::Test {
+ public:
+  Env* env_;
+  const EnvOptions soptions_;
+
+  MemEnvTest()
+      : env_(NewMemEnv(Env::Default())) {
+  }
+  ~MemEnvTest() {
+    delete env_;
+  }
+};
+
+TEST_F(MemEnvTest, Basics) {
+  uint64_t file_size;
+  unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  // Check that the directory is empty.
+  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+
+  // Write to the file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/f"));
+  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that opening non-existent file fails.
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
+                                       soptions_).ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
+                                         soptions_).ok());
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile("/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+TEST_F(MemEnvTest, ReadWrite) {
+  unique_ptr<WritableFile> writable_file;
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST_F(MemEnvTest, Locks) {
+  FileLock* lock;
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(env_->LockFile("some file", &lock));
+  ASSERT_OK(env_->UnlockFile(lock));
+}
+
+TEST_F(MemEnvTest, Misc) {
+  std::string test_dir;
+  ASSERT_OK(env_->GetTestDirectory(&test_dir));
+  ASSERT_TRUE(!test_dir.empty());
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+TEST_F(MemEnvTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete [] scratch;
+}
+
+TEST_F(MemEnvTest, DBTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+
+  options.create_if_missing = false;
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+  delete db;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/mock_env.cc b/src/rocksdb/util/mock_env.cc
new file mode 100644
index 0000000..26dffba
--- /dev/null
+++ b/src/rocksdb/util/mock_env.cc
@@ -0,0 +1,710 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/mock_env.h"
+#include <sys/time.h>
+#include <algorithm>
+#include <chrono>
+#include "util/rate_limiter.h"
+#include "util/random.h"
+#include "util/murmurhash.h"
+
+namespace rocksdb {
+
+class MemFile {
+ public:
+  explicit MemFile(Env* env, const std::string& fn, bool _is_lock_file = false)
+      : env_(env),
+        fn_(fn),
+        refs_(0),
+        is_lock_file_(_is_lock_file),
+        locked_(false),
+        size_(0),
+        modified_time_(Now()),
+        rnd_(static_cast<uint32_t>(
+            MurmurHash(fn.data(), static_cast<int>(fn.size()), 0))),
+        fsynced_bytes_(0) {}
+
+  void Ref() {
+    MutexLock lock(&mutex_);
+    ++refs_;
+  }
+
+  bool is_lock_file() const { return is_lock_file_; }
+
+  bool Lock() {
+    assert(is_lock_file_);
+    MutexLock lock(&mutex_);
+    if (locked_) {
+      return false;
+    } else {
+      locked_ = true;
+      return true;
+    }
+  }
+
+  void Unlock() {
+    assert(is_lock_file_);
+    MutexLock lock(&mutex_);
+    locked_ = false;
+  }
+
+  void Unref() {
+    bool do_delete = false;
+    {
+      MutexLock lock(&mutex_);
+      --refs_;
+      assert(refs_ >= 0);
+      if (refs_ <= 0) {
+        do_delete = true;
+      }
+    }
+
+    if (do_delete) {
+      delete this;
+    }
+  }
+
+  uint64_t Size() const {
+    return size_;
+  }
+
+  void Truncate(size_t size) {
+    MutexLock lock(&mutex_);
+    if (size < size_) {
+      data_.resize(size);
+      size_ = size;
+    }
+  }
+
+  void CorruptBuffer() {
+    if (fsynced_bytes_ >= size_) {
+      return;
+    }
+    uint64_t buffered_bytes = size_ - fsynced_bytes_;
+    uint64_t start =
+        fsynced_bytes_ + rnd_.Uniform(static_cast<int>(buffered_bytes));
+    uint64_t end = std::min(start + 512, size_.load());
+    MutexLock lock(&mutex_);
+    for (uint64_t pos = start; pos < end; ++pos) {
+      data_[pos] = static_cast<char>(rnd_.Uniform(256));
+    }
+  }
+
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
+    MutexLock lock(&mutex_);
+    if (offset > Size()) {
+      return Status::IOError("Offset greater than file size.");
+    }
+    const uint64_t available = Size() - offset;
+    if (n > available) {
+      n = available;
+    }
+    if (n == 0) {
+      *result = Slice();
+      return Status::OK();
+    }
+    if (scratch) {
+      memcpy(scratch, &(data_[offset]), n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&(data_[offset]), n);
+    }
+    return Status::OK();
+  }
+
+  Status Append(const Slice& data) {
+    MutexLock lock(&mutex_);
+    data_.append(data.data(), data.size());
+    size_ = data_.size();
+    modified_time_ = Now();
+    return Status::OK();
+  }
+
+  Status Fsync() {
+    fsynced_bytes_ = size_.load();
+    return Status::OK();
+  }
+
+  uint64_t ModifiedTime() const {
+    return modified_time_;
+  }
+
+ private:
+  uint64_t Now() {
+    int64_t unix_time;
+    auto s = env_->GetCurrentTime(&unix_time);
+    assert(s.ok());
+    return static_cast<uint64_t>(unix_time);
+  }
+
+  // Private since only Unref() should be used to delete it.
+  ~MemFile() {
+    assert(refs_ == 0);
+  }
+
+  // No copying allowed.
+  MemFile(const MemFile&);
+  void operator=(const MemFile&);
+
+  Env* env_;
+  const std::string fn_;
+  mutable port::Mutex mutex_;
+  int refs_;
+  bool is_lock_file_;
+  bool locked_;
+
+  // Data written into this file, all bytes before fsynced_bytes are
+  // persistent.
+  std::string data_;
+  std::atomic<uint64_t> size_;
+  std::atomic<uint64_t> modified_time_;
+
+  Random rnd_;
+  std::atomic<uint64_t> fsynced_bytes_;
+};
+
+namespace {
+
+class MockSequentialFile : public SequentialFile {
+ public:
+  explicit MockSequentialFile(MemFile* file) : file_(file), pos_(0) {
+    file_->Ref();
+  }
+
+  ~MockSequentialFile() {
+    file_->Unref();
+  }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) override {
+    Status s = file_->Read(pos_, n, result, scratch);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) override {
+    if (pos_ > file_->Size()) {
+      return Status::IOError("pos_ > file_->Size()");
+    }
+    const size_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += n;
+    return Status::OK();
+  }
+
+ private:
+  MemFile* file_;
+  size_t pos_;
+};
+
+class MockRandomAccessFile : public RandomAccessFile {
+ public:
+  explicit MockRandomAccessFile(MemFile* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~MockRandomAccessFile() {
+    file_->Unref();
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const override {
+    return file_->Read(offset, n, result, scratch);
+  }
+
+ private:
+  MemFile* file_;
+};
+
+class MockWritableFile : public WritableFile {
+ public:
+  MockWritableFile(MemFile* file, RateLimiter* rate_limiter)
+    : file_(file),
+      rate_limiter_(rate_limiter) {
+    file_->Ref();
+  }
+
+  ~MockWritableFile() {
+    file_->Unref();
+  }
+
+  virtual Status Append(const Slice& data) override {
+    uint64_t bytes_written = 0;
+    while (bytes_written < data.size()) {
+      auto bytes = RequestToken(data.size() - bytes_written);
+      Status s = file_->Append(Slice(data.data() + bytes_written, bytes));
+      if (!s.ok()) {
+        return s;
+      }
+      bytes_written += bytes;
+    }
+    return Status::OK();
+  }
+
+  virtual Status Close() override { return file_->Fsync(); }
+
+  virtual Status Flush() override { return Status::OK(); }
+
+  virtual Status Sync() override { return file_->Fsync(); }
+
+  virtual uint64_t GetFileSize() override { return file_->Size(); }
+
+ private:
+  inline size_t RequestToken(size_t bytes) {
+    if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) {
+      bytes = std::min(bytes,
+          static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
+      rate_limiter_->Request(bytes, io_priority_);
+    }
+    return bytes;
+  }
+
+  MemFile* file_;
+  RateLimiter* rate_limiter_;
+};
+
+class MockEnvDirectory : public Directory {
+ public:
+  virtual Status Fsync() override { return Status::OK(); }
+};
+
+class MockEnvFileLock : public FileLock {
+ public:
+  explicit MockEnvFileLock(const std::string& fname)
+    : fname_(fname) {}
+
+  std::string FileName() const {
+    return fname_;
+  }
+
+ private:
+  const std::string fname_;
+};
+
+class TestMemLogger : public Logger {
+ private:
+  std::unique_ptr<WritableFile> file_;
+  std::atomic_size_t log_size_;
+  static const uint64_t flush_every_seconds_ = 5;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  Env* env_;
+  bool flush_pending_;
+
+ public:
+  TestMemLogger(std::unique_ptr<WritableFile> f, Env* env,
+                const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL)
+      : Logger(log_level),
+        file_(std::move(f)),
+        log_size_(0),
+        last_flush_micros_(0),
+        env_(env),
+        flush_pending_(false) {}
+  virtual ~TestMemLogger() {
+  }
+
+  virtual void Flush() override {
+    if (flush_pending_) {
+      flush_pending_ = false;
+    }
+    last_flush_micros_ = env_->NowMicros();
+  }
+
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      struct timeval now_tv;
+      gettimeofday(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      const size_t write_size = p - base;
+
+      file_->Append(Slice(base, write_size));
+      flush_pending_ = true;
+      log_size_ += write_size;
+      uint64_t now_micros = static_cast<uint64_t>(now_tv.tv_sec) * 1000000 +
+        now_tv.tv_usec;
+      if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+        flush_pending_ = false;
+        last_flush_micros_ = now_micros;
+      }
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+  size_t GetLogFileSize() const override { return log_size_; }
+};
+
+}  // Anonymous namespace
+
+MockEnv::MockEnv(Env* base_env) : EnvWrapper(base_env), fake_sleep_micros_(0) {}
+
+MockEnv::~MockEnv() {
+  for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i) {
+    i->second->Unref();
+  }
+}
+
+  // Partial implementation of the Env interface.
+Status MockEnv::NewSequentialFile(const std::string& fname,
+                                     unique_ptr<SequentialFile>* result,
+                                     const EnvOptions& soptions) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = NULL;
+    return Status::IOError(fn, "File not found");
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return Status::InvalidArgument(fn, "Cannot open a lock file.");
+  }
+  result->reset(new MockSequentialFile(f));
+  return Status::OK();
+}
+
+Status MockEnv::NewRandomAccessFile(const std::string& fname,
+                                       unique_ptr<RandomAccessFile>* result,
+                                       const EnvOptions& soptions) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = NULL;
+    return Status::IOError(fn, "File not found");
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return Status::InvalidArgument(fn, "Cannot open a lock file.");
+  }
+  result->reset(new MockRandomAccessFile(f));
+  return Status::OK();
+}
+
+Status MockEnv::NewWritableFile(const std::string& fname,
+                                   unique_ptr<WritableFile>* result,
+                                   const EnvOptions& env_options) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) != file_map_.end()) {
+    DeleteFileInternal(fn);
+  }
+  MemFile* file = new MemFile(this, fn, false);
+  file->Ref();
+  file_map_[fn] = file;
+
+  result->reset(new MockWritableFile(file, env_options.rate_limiter));
+  return Status::OK();
+}
+
+Status MockEnv::NewRandomRWFile(const std::string& fname,
+                                   unique_ptr<RandomRWFile>* result,
+                                   const EnvOptions& options) {
+  return Status::OK();
+}
+
+Status MockEnv::NewDirectory(const std::string& name,
+                                unique_ptr<Directory>* result) {
+  result->reset(new MockEnvDirectory());
+  return Status::OK();
+}
+
+bool MockEnv::FileExists(const std::string& fname) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) != file_map_.end()) {
+    // File exists
+    return true;
+  }
+  // Now also check if fn exists as a dir
+  for (const auto& iter : file_map_) {
+    const std::string& filename = iter.first;
+    if (filename.size() >= fn.size() + 1 &&
+        filename[fn.size()] == '/' &&
+        Slice(filename).starts_with(Slice(fn))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+Status MockEnv::GetChildren(const std::string& dir,
+                               std::vector<std::string>* result) {
+  auto d = NormalizePath(dir);
+  {
+    MutexLock lock(&mutex_);
+    result->clear();
+    for (const auto& iter : file_map_) {
+      const std::string& filename = iter.first;
+
+      if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' &&
+          Slice(filename).starts_with(Slice(d))) {
+        size_t next_slash = filename.find('/', d.size() + 1);
+        if (next_slash != std::string::npos) {
+          result->push_back(filename.substr(
+                d.size() + 1, next_slash - d.size() - 1));
+        } else {
+          result->push_back(filename.substr(d.size() + 1));
+        }
+      }
+    }
+  }
+  result->erase(std::unique(result->begin(), result->end()), result->end());
+  return Status::OK();
+}
+
+void MockEnv::DeleteFileInternal(const std::string& fname) {
+  assert(fname == NormalizePath(fname));
+  const auto& pair = file_map_.find(fname);
+  if (pair != file_map_.end()) {
+    pair->second->Unref();
+    file_map_.erase(fname);
+  }
+}
+
+Status MockEnv::DeleteFile(const std::string& fname) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+
+  DeleteFileInternal(fn);
+  return Status::OK();
+}
+
+Status MockEnv::CreateDir(const std::string& dirname) {
+  return Status::OK();
+}
+
+Status MockEnv::CreateDirIfMissing(const std::string& dirname) {
+  return Status::OK();
+}
+
+Status MockEnv::DeleteDir(const std::string& dirname) {
+  return Status::OK();
+}
+
+Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+
+  *file_size = iter->second->Size();
+  return Status::OK();
+}
+
+Status MockEnv::GetFileModificationTime(const std::string& fname,
+                                           uint64_t* time) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+  *time = iter->second->ModifiedTime();
+  return Status::OK();
+}
+
+Status MockEnv::RenameFile(const std::string& src, const std::string& dest) {
+  auto s = NormalizePath(src);
+  auto t = NormalizePath(dest);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(s) == file_map_.end()) {
+    return Status::IOError(s, "File not found");
+  }
+
+  DeleteFileInternal(t);
+  file_map_[t] = file_map_[s];
+  file_map_.erase(s);
+  return Status::OK();
+}
+
+Status MockEnv::LinkFile(const std::string& src, const std::string& dest) {
+  auto s = NormalizePath(src);
+  auto t = NormalizePath(dest);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(s) == file_map_.end()) {
+    return Status::IOError(s, "File not found");
+  }
+
+  DeleteFileInternal(t);
+  file_map_[t] = file_map_[s];
+  return Status::OK();
+}
+
+Status MockEnv::NewLogger(const std::string& fname,
+                             shared_ptr<Logger>* result) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  MemFile* file = nullptr;
+  if (iter == file_map_.end()) {
+    file = new MemFile(this, fn, false);
+    file->Ref();
+    file_map_[fn] = file;
+  } else {
+    file = iter->second;
+  }
+  std::unique_ptr<WritableFile> f(new MockWritableFile(file, nullptr));
+  result->reset(new TestMemLogger(std::move(f), this));
+  return Status::OK();
+}
+
+Status MockEnv::LockFile(const std::string& fname, FileLock** flock) {
+  auto fn = NormalizePath(fname);
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return Status::InvalidArgument(fname, "Not a lock file.");
+      }
+      if (!file_map_[fn]->Lock()) {
+        return Status::IOError(fn, "Lock is already held.");
+      }
+    } else {
+      auto* file = new MemFile(this, fn, true);
+      file->Ref();
+      file->Lock();
+      file_map_[fn] = file;
+    }
+  }
+  *flock = new MockEnvFileLock(fn);
+  return Status::OK();
+}
+
+Status MockEnv::UnlockFile(FileLock* flock) {
+  std::string fn = dynamic_cast<MockEnvFileLock*>(flock)->FileName();
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return Status::InvalidArgument(fn, "Not a lock file.");
+      }
+      file_map_[fn]->Unlock();
+    }
+  }
+  delete flock;
+  return Status::OK();
+}
+
+Status MockEnv::GetTestDirectory(std::string* path) {
+  *path = "/test";
+  return Status::OK();
+}
+
+Status MockEnv::GetCurrentTime(int64_t* unix_time) {
+  auto s = EnvWrapper::GetCurrentTime(unix_time);
+  *unix_time += fake_sleep_micros_.load() / (1000 * 1000);
+  return s;
+}
+
+uint64_t MockEnv::NowMicros() {
+  return EnvWrapper::NowMicros() + fake_sleep_micros_.load();
+}
+
+uint64_t MockEnv::NowNanos() {
+  return EnvWrapper::NowNanos() + fake_sleep_micros_.load() * 1000;
+}
+
+// Non-virtual functions, specific to MockEnv
+Status MockEnv::Truncate(const std::string& fname, size_t size) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+  iter->second->Truncate(size);
+  return Status::OK();
+}
+
+Status MockEnv::CorruptBuffer(const std::string& fname) {
+  auto fn = NormalizePath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+  iter->second->CorruptBuffer();
+  return Status::OK();
+}
+
+std::string MockEnv::NormalizePath(const std::string path) {
+  std::string dst;
+  for (auto c : path) {
+    if (!dst.empty() && c == '/' && dst.back() == '/') {
+      continue;
+    }
+    dst.push_back(c);
+  }
+  return dst;
+}
+
+void MockEnv::FakeSleepForMicroseconds(int64_t micros) {
+  fake_sleep_micros_.fetch_add(micros);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/mock_env.h b/src/rocksdb/util/mock_env.h
new file mode 100644
index 0000000..55ef24b
--- /dev/null
+++ b/src/rocksdb/util/mock_env.h
@@ -0,0 +1,110 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <map>
+#include <string>
+#include <vector>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+class MemFile;
+class MockEnv : public EnvWrapper {
+ public:
+  explicit MockEnv(Env* base_env);
+
+  virtual ~MockEnv();
+
+  // Partial implementation of the Env interface.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& soptions) override;
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& soptions) override;
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& env_options) override;
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) override;
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) override;
+
+  virtual bool FileExists(const std::string& fname) override;
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) override;
+
+  void DeleteFileInternal(const std::string& fname);
+
+  virtual Status DeleteFile(const std::string& fname) override;
+
+  virtual Status CreateDir(const std::string& dirname) override;
+
+  virtual Status CreateDirIfMissing(const std::string& dirname) override;
+
+  virtual Status DeleteDir(const std::string& dirname) override;
+
+  virtual Status GetFileSize(const std::string& fname,
+                             uint64_t* file_size) override;
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) override;
+
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) override;
+
+  virtual Status LinkFile(const std::string& src,
+                          const std::string& target) override;
+
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) override;
+
+  virtual Status LockFile(const std::string& fname, FileLock** flock) override;
+
+  virtual Status UnlockFile(FileLock* flock) override;
+
+  virtual Status GetTestDirectory(std::string* path) override;
+
+  // Results of these can be affected by FakeSleepForMicroseconds()
+  virtual Status GetCurrentTime(int64_t* unix_time) override;
+  virtual uint64_t NowMicros() override;
+  virtual uint64_t NowNanos() override;
+
+  // Non-virtual functions, specific to MockEnv
+  Status Truncate(const std::string& fname, size_t size);
+
+  Status CorruptBuffer(const std::string& fname);
+
+  // Doesn't really sleep, just affects output of GetCurrentTime(), NowMicros()
+  // and NowNanos()
+  void FakeSleepForMicroseconds(int64_t micros);
+
+ private:
+  std::string NormalizePath(const std::string path);
+
+  // Map from filenames to MemFile objects, representing a simple file system.
+  typedef std::map<std::string, MemFile*> FileSystem;
+  port::Mutex mutex_;
+  FileSystem file_map_;  // Protected by mutex_.
+
+  std::atomic<int64_t> fake_sleep_micros_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/mock_env_test.cc b/src/rocksdb/util/mock_env_test.cc
new file mode 100644
index 0000000..e3d4970
--- /dev/null
+++ b/src/rocksdb/util/mock_env_test.cc
@@ -0,0 +1,285 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "util/mock_env.h"
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class MockEnvTest : public testing::Test {
+ public:
+  MockEnv* env_;
+  const EnvOptions soptions_;
+
+  MockEnvTest()
+      : env_(new MockEnv(Env::Default())) {
+  }
+  ~MockEnvTest() {
+    delete env_;
+  }
+};
+
+TEST_F(MockEnvTest, Basics) {
+  uint64_t file_size;
+  unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  // Check that the directory is empty.
+  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+
+  // Write to the file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/f"));
+  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that opening non-existent file fails.
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
+                                       soptions_).ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
+                                         soptions_).ok());
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile("/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+TEST_F(MockEnvTest, ReadWrite) {
+  unique_ptr<WritableFile> writable_file;
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100));  // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch));  // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST_F(MockEnvTest, Locks) {
+  FileLock* lock;
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(env_->LockFile("some file", &lock));
+  ASSERT_OK(env_->UnlockFile(lock));
+}
+
+TEST_F(MockEnvTest, Misc) {
+  std::string test_dir;
+  ASSERT_OK(env_->GetTestDirectory(&test_dir));
+  ASSERT_TRUE(!test_dir.empty());
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+TEST_F(MockEnvTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch));  // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete [] scratch;
+}
+
+TEST_F(MockEnvTest, Corrupt) {
+  const std::string kGood = "this is a good string, synced to disk";
+  const std::string kCorrupted = "this part may be corrupted";
+  const std::string kFileName = "/dir/f";
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append(kGood));
+  ASSERT_TRUE(writable_file->GetFileSize() == kGood.size());
+
+  std::string scratch;
+  scratch.resize(kGood.size() + kCorrupted.size() + 16);
+  Slice result;
+  unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0])));
+  ASSERT_EQ(result.compare(kGood), 0);
+
+  // Sync + corrupt => no change
+  ASSERT_OK(writable_file->Fsync());
+  ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName));
+  result.clear();
+  ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0])));
+  ASSERT_EQ(result.compare(kGood), 0);
+
+  // Add new data and corrupt it
+  ASSERT_OK(writable_file->Append(kCorrupted));
+  ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size());
+  result.clear();
+  ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(),
+            &result, &(scratch[0])));
+  ASSERT_EQ(result.compare(kCorrupted), 0);
+  // Corrupted
+  ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName));
+  result.clear();
+  ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(),
+            &result, &(scratch[0])));
+  ASSERT_NE(result.compare(kCorrupted), 0);
+}
+
+TEST_F(MockEnvTest, DBTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+}
+
+TEST_F(MockEnvTest, FakeSleeping) {
+  int64_t now = 0;
+  auto s = env_->GetCurrentTime(&now);
+  ASSERT_OK(s);
+  env_->FakeSleepForMicroseconds(3 * 1000 * 1000);
+  int64_t after_sleep = 0;
+  s = env_->GetCurrentTime(&after_sleep);
+  ASSERT_OK(s);
+  auto delta = after_sleep - now;
+  // this will be true unless test runs for 2 seconds
+  ASSERT_TRUE(delta == 3 || delta == 4);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/murmurhash.h b/src/rocksdb/util/murmurhash.h
index faa8655..40ee357 100644
--- a/src/rocksdb/util/murmurhash.h
+++ b/src/rocksdb/util/murmurhash.h
@@ -36,7 +36,7 @@ typedef unsigned int murmur_t;
 namespace rocksdb {
 struct murmur_hash {
   size_t operator()(const Slice& slice) const {
-    return MurmurHash(slice.data(), slice.size(), 0);
+    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0);
   }
 };
 }  // rocksdb
diff --git a/src/rocksdb/util/mutable_cf_options.cc b/src/rocksdb/util/mutable_cf_options.cc
new file mode 100644
index 0000000..187a97a
--- /dev/null
+++ b/src/rocksdb/util/mutable_cf_options.cc
@@ -0,0 +1,121 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <limits>
+#include <cassert>
+#include <string>
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
+#include "util/mutable_cf_options.h"
+
+namespace rocksdb {
+
+// Multiply two operands. If they overflow, return op1.
+uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
+  if (op1 == 0) {
+    return 0;
+  }
+  if (op2 <= 0) {
+    return op1;
+  }
+  uint64_t casted_op2 = (uint64_t) op2;
+  if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) {
+    return op1;
+  }
+  return op1 * casted_op2;
+}
+
+void MutableCFOptions::RefreshDerivedOptions(
+    const ImmutableCFOptions& ioptions) {
+  max_file_size.resize(ioptions.num_levels);
+  for (int i = 0; i < ioptions.num_levels; ++i) {
+    if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+      max_file_size[i] = ULLONG_MAX;
+    } else if (i > 1) {
+      max_file_size[i] = MultiplyCheckOverflow(max_file_size[i - 1],
+                                               target_file_size_multiplier);
+    } else {
+      max_file_size[i] = target_file_size_base;
+    }
+  }
+}
+
+uint64_t MutableCFOptions::MaxFileSizeForLevel(int level) const {
+  assert(level >= 0);
+  assert(level < (int)max_file_size.size());
+  return max_file_size[level];
+}
+uint64_t MutableCFOptions::MaxGrandParentOverlapBytes(int level) const {
+  return MaxFileSizeForLevel(level) * max_grandparent_overlap_factor;
+}
+uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const {
+  return MaxFileSizeForLevel(level) * expanded_compaction_factor;
+}
+
+void MutableCFOptions::Dump(Logger* log) const {
+  // Memtable related options
+  Log(log, "                        write_buffer_size: %zu", write_buffer_size);
+  Log(log, "                  max_write_buffer_number: %d",
+      max_write_buffer_number);
+  Log(log, "                         arena_block_size: %zu", arena_block_size);
+  Log(log, "               memtable_prefix_bloom_bits: %" PRIu32,
+      memtable_prefix_bloom_bits);
+  Log(log, "             memtable_prefix_bloom_probes: %" PRIu32,
+      memtable_prefix_bloom_probes);
+  Log(log, " memtable_prefix_bloom_huge_page_tlb_size: %zu",
+      memtable_prefix_bloom_huge_page_tlb_size);
+  Log(log, "                    max_successive_merges: %zu",
+      max_successive_merges);
+  Log(log, "                           filter_deletes: %d",
+      filter_deletes);
+  Log(log, "                 disable_auto_compactions: %d",
+      disable_auto_compactions);
+  Log(log, "                          soft_rate_limit: %lf",
+      soft_rate_limit);
+  Log(log, "                          hard_rate_limit: %lf",
+      hard_rate_limit);
+  Log(log, "       level0_file_num_compaction_trigger: %d",
+      level0_file_num_compaction_trigger);
+  Log(log, "           level0_slowdown_writes_trigger: %d",
+      level0_slowdown_writes_trigger);
+  Log(log, "               level0_stop_writes_trigger: %d",
+      level0_stop_writes_trigger);
+  Log(log, "           max_grandparent_overlap_factor: %d",
+      max_grandparent_overlap_factor);
+  Log(log, "               expanded_compaction_factor: %d",
+      expanded_compaction_factor);
+  Log(log, "                 source_compaction_factor: %d",
+      source_compaction_factor);
+  Log(log, "                    target_file_size_base: %" PRIu64,
+      target_file_size_base);
+  Log(log, "              target_file_size_multiplier: %d",
+      target_file_size_multiplier);
+  Log(log, "                 max_bytes_for_level_base: %" PRIu64,
+      max_bytes_for_level_base);
+  Log(log, "           max_bytes_for_level_multiplier: %d",
+      max_bytes_for_level_multiplier);
+  std::string result;
+  char buf[10];
+  for (const auto m : max_bytes_for_level_multiplier_additional) {
+    snprintf(buf, sizeof(buf), "%d, ", m);
+    result += buf;
+  }
+  result.resize(result.size() - 2);
+  Log(log, "max_bytes_for_level_multiplier_additional: %s", result.c_str());
+  Log(log, "                 max_mem_compaction_level: %d",
+      max_mem_compaction_level);
+  Log(log, "           verify_checksums_in_compaction: %d",
+      verify_checksums_in_compaction);
+  Log(log, "        max_sequential_skip_in_iterations: %" PRIu64,
+      max_sequential_skip_in_iterations);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/mutable_cf_options.h b/src/rocksdb/util/mutable_cf_options.h
new file mode 100644
index 0000000..20845d9
--- /dev/null
+++ b/src/rocksdb/util/mutable_cf_options.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
+
+namespace rocksdb {
+
+struct MutableCFOptions {
+  MutableCFOptions(const Options& options, const ImmutableCFOptions& ioptions)
+    : write_buffer_size(options.write_buffer_size),
+      max_write_buffer_number(options.max_write_buffer_number),
+      arena_block_size(options.arena_block_size),
+      memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
+      memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
+      memtable_prefix_bloom_huge_page_tlb_size(
+          options.memtable_prefix_bloom_huge_page_tlb_size),
+      max_successive_merges(options.max_successive_merges),
+      filter_deletes(options.filter_deletes),
+      inplace_update_num_locks(options.inplace_update_num_locks),
+      disable_auto_compactions(options.disable_auto_compactions),
+      soft_rate_limit(options.soft_rate_limit),
+      hard_rate_limit(options.hard_rate_limit),
+      level0_file_num_compaction_trigger(
+          options.level0_file_num_compaction_trigger),
+      level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
+      level0_stop_writes_trigger(options.level0_stop_writes_trigger),
+      max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
+      expanded_compaction_factor(options.expanded_compaction_factor),
+      source_compaction_factor(options.source_compaction_factor),
+      target_file_size_base(options.target_file_size_base),
+      target_file_size_multiplier(options.target_file_size_multiplier),
+      max_bytes_for_level_base(options.max_bytes_for_level_base),
+      max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
+      max_bytes_for_level_multiplier_additional(
+          options.max_bytes_for_level_multiplier_additional),
+      max_mem_compaction_level(options.max_mem_compaction_level),
+      verify_checksums_in_compaction(options.verify_checksums_in_compaction),
+      max_sequential_skip_in_iterations(
+          options.max_sequential_skip_in_iterations),
+      paranoid_file_checks(options.paranoid_file_checks)
+  {
+    RefreshDerivedOptions(ioptions);
+  }
+  MutableCFOptions()
+    : write_buffer_size(0),
+      max_write_buffer_number(0),
+      arena_block_size(0),
+      memtable_prefix_bloom_bits(0),
+      memtable_prefix_bloom_probes(0),
+      memtable_prefix_bloom_huge_page_tlb_size(0),
+      max_successive_merges(0),
+      filter_deletes(false),
+      inplace_update_num_locks(0),
+      disable_auto_compactions(false),
+      soft_rate_limit(0),
+      hard_rate_limit(0),
+      level0_file_num_compaction_trigger(0),
+      level0_slowdown_writes_trigger(0),
+      level0_stop_writes_trigger(0),
+      max_grandparent_overlap_factor(0),
+      expanded_compaction_factor(0),
+      source_compaction_factor(0),
+      target_file_size_base(0),
+      target_file_size_multiplier(0),
+      max_bytes_for_level_base(0),
+      max_bytes_for_level_multiplier(0),
+      max_mem_compaction_level(0),
+      verify_checksums_in_compaction(false),
+      max_sequential_skip_in_iterations(0),
+      paranoid_file_checks(false)
+  {}
+
+  // Must be called after any change to MutableCFOptions
+  void RefreshDerivedOptions(const ImmutableCFOptions& ioptions);
+
+  // Get the max file size in a given level.
+  uint64_t MaxFileSizeForLevel(int level) const;
+  // Returns maximum total overlap bytes with grandparent
+  // level (i.e., level+2) before we stop building a single
+  // file in level->level+1 compaction.
+  uint64_t MaxGrandParentOverlapBytes(int level) const;
+  uint64_t ExpandedCompactionByteSizeLimit(int level) const;
+  int MaxBytesMultiplerAdditional(int level) const {
+    if (level >=
+        static_cast<int>(max_bytes_for_level_multiplier_additional.size())) {
+      return 1;
+    }
+    return max_bytes_for_level_multiplier_additional[level];
+  }
+
+  void Dump(Logger* log) const;
+
+  // Memtable related options
+  size_t write_buffer_size;
+  int max_write_buffer_number;
+  size_t arena_block_size;
+  uint32_t memtable_prefix_bloom_bits;
+  uint32_t memtable_prefix_bloom_probes;
+  size_t memtable_prefix_bloom_huge_page_tlb_size;
+  size_t max_successive_merges;
+  bool filter_deletes;
+  size_t inplace_update_num_locks;
+
+  // Compaction related options
+  bool disable_auto_compactions;
+  double soft_rate_limit;
+  double hard_rate_limit;
+  int level0_file_num_compaction_trigger;
+  int level0_slowdown_writes_trigger;
+  int level0_stop_writes_trigger;
+  int max_grandparent_overlap_factor;
+  int expanded_compaction_factor;
+  int source_compaction_factor;
+  uint64_t target_file_size_base;
+  int target_file_size_multiplier;
+  uint64_t max_bytes_for_level_base;
+  int max_bytes_for_level_multiplier;
+  std::vector<int> max_bytes_for_level_multiplier_additional;
+  int max_mem_compaction_level;
+  bool verify_checksums_in_compaction;
+
+  // Misc options
+  uint64_t max_sequential_skip_in_iterations;
+  bool paranoid_file_checks;
+
+  // Derived options
+  // Per-level target file size.
+  std::vector<uint64_t> max_file_size;
+};
+
+uint64_t MultiplyCheckOverflow(uint64_t op1, int op2);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/mutexlock.h b/src/rocksdb/util/mutexlock.h
index 0f4e5c8..6121ec1 100644
--- a/src/rocksdb/util/mutexlock.h
+++ b/src/rocksdb/util/mutexlock.h
@@ -46,7 +46,7 @@ class ReadLock {
   explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
     this->mu_->ReadLock();
   }
-  ~ReadLock() { this->mu_->Unlock(); }
+  ~ReadLock() { this->mu_->ReadUnlock(); }
 
  private:
   port::RWMutex *const mu_;
@@ -66,7 +66,7 @@ class WriteLock {
   explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
     this->mu_->WriteLock();
   }
-  ~WriteLock() { this->mu_->Unlock(); }
+  ~WriteLock() { this->mu_->WriteUnlock(); }
 
  private:
   port::RWMutex *const mu_;
diff --git a/src/rocksdb/util/options.cc b/src/rocksdb/util/options.cc
index c8d1e38..6bb462a 100644
--- a/src/rocksdb/util/options.cc
+++ b/src/rocksdb/util/options.cc
@@ -8,16 +8,20 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
 
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <limits>
 
+#include "db/writebuffer.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/env.h"
-#include "rocksdb/filter_policy.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice.h"
@@ -25,9 +29,57 @@
 #include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
 #include "table/block_based_table_factory.h"
+#include "util/compression.h"
+#include "util/statistics.h"
+#include "util/xfunc.h"
 
 namespace rocksdb {
 
+ImmutableCFOptions::ImmutableCFOptions(const Options& options)
+    : compaction_style(options.compaction_style),
+      compaction_options_universal(options.compaction_options_universal),
+      compaction_options_fifo(options.compaction_options_fifo),
+      prefix_extractor(options.prefix_extractor.get()),
+      comparator(options.comparator),
+      merge_operator(options.merge_operator.get()),
+      compaction_filter(options.compaction_filter),
+      compaction_filter_factory(options.compaction_filter_factory.get()),
+      compaction_filter_factory_v2(options.compaction_filter_factory_v2.get()),
+      inplace_update_support(options.inplace_update_support),
+      inplace_callback(options.inplace_callback),
+      info_log(options.info_log.get()),
+      statistics(options.statistics.get()),
+      env(options.env),
+      allow_mmap_reads(options.allow_mmap_reads),
+      allow_mmap_writes(options.allow_mmap_writes),
+      db_paths(options.db_paths),
+      memtable_factory(options.memtable_factory.get()),
+      table_factory(options.table_factory.get()),
+      table_properties_collector_factories(
+          options.table_properties_collector_factories),
+      advise_random_on_open(options.advise_random_on_open),
+      bloom_locality(options.bloom_locality),
+      purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
+      min_partial_merge_operands(options.min_partial_merge_operands),
+      disable_data_sync(options.disableDataSync),
+      use_fsync(options.use_fsync),
+      compression(options.compression),
+      compression_per_level(options.compression_per_level),
+      compression_opts(options.compression_opts),
+      level_compaction_dynamic_level_bytes(
+          options.level_compaction_dynamic_level_bytes),
+      access_hint_on_compaction_start(options.access_hint_on_compaction_start),
+      num_levels(options.num_levels),
+      optimize_filters_for_hits(options.optimize_filters_for_hits)
+#ifndef ROCKSDB_LITE
+      ,
+      listeners(options.listeners) {
+}
+#else  // ROCKSDB_LITE
+{
+}
+#endif  // ROCKSDB_LITE
+
 ColumnFamilyOptions::ColumnFamilyOptions()
     : comparator(BytewiseComparator()),
       merge_operator(nullptr),
@@ -38,14 +90,8 @@ ColumnFamilyOptions::ColumnFamilyOptions()
       write_buffer_size(4 << 20),
       max_write_buffer_number(2),
       min_write_buffer_number_to_merge(1),
-      block_cache(nullptr),
-      block_cache_compressed(nullptr),
-      block_size(4096),
-      block_restart_interval(16),
       compression(kSnappyCompression),
-      filter_policy(nullptr),
       prefix_extractor(nullptr),
-      whole_key_filtering(true),
       num_levels(7),
       level0_file_num_compaction_trigger(4),
       level0_slowdown_writes_trigger(20),
@@ -54,20 +100,18 @@ ColumnFamilyOptions::ColumnFamilyOptions()
       target_file_size_base(2 * 1048576),
       target_file_size_multiplier(1),
       max_bytes_for_level_base(10 * 1048576),
+      level_compaction_dynamic_level_bytes(false),
       max_bytes_for_level_multiplier(10),
       max_bytes_for_level_multiplier_additional(num_levels, 1),
       expanded_compaction_factor(25),
       source_compaction_factor(1),
       max_grandparent_overlap_factor(10),
-      disable_seek_compaction(true),
       soft_rate_limit(0.0),
       hard_rate_limit(0.0),
       rate_limit_delay_max_milliseconds(1000),
-      no_block_cache(false),
       arena_block_size(0),
       disable_auto_compactions(false),
       purge_redundant_kvs_while_flush(true),
-      block_size_deviation(10),
       compaction_style(kCompactionStyleLevel),
       verify_checksums_in_compaction(true),
       filter_deletes(false),
@@ -83,7 +127,15 @@ ColumnFamilyOptions::ColumnFamilyOptions()
       memtable_prefix_bloom_huge_page_tlb_size(0),
       bloom_locality(0),
       max_successive_merges(0),
-      min_partial_merge_operands(2) {
+      min_partial_merge_operands(2),
+      optimize_filters_for_hits(false),
+      paranoid_file_checks(false)
+#ifndef ROCKSDB_LITE
+      ,
+      listeners() {
+#else  // ROCKSDB_LITE
+{
+#endif  // ROCKSDB_LITE
   assert(memtable_factory.get() != nullptr);
 }
 
@@ -97,16 +149,10 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
       max_write_buffer_number(options.max_write_buffer_number),
       min_write_buffer_number_to_merge(
           options.min_write_buffer_number_to_merge),
-      block_cache(options.block_cache),
-      block_cache_compressed(options.block_cache_compressed),
-      block_size(options.block_size),
-      block_restart_interval(options.block_restart_interval),
       compression(options.compression),
       compression_per_level(options.compression_per_level),
       compression_opts(options.compression_opts),
-      filter_policy(options.filter_policy),
       prefix_extractor(options.prefix_extractor),
-      whole_key_filtering(options.whole_key_filtering),
       num_levels(options.num_levels),
       level0_file_num_compaction_trigger(
           options.level0_file_num_compaction_trigger),
@@ -116,31 +162,32 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
       target_file_size_base(options.target_file_size_base),
       target_file_size_multiplier(options.target_file_size_multiplier),
       max_bytes_for_level_base(options.max_bytes_for_level_base),
+      level_compaction_dynamic_level_bytes(
+          options.level_compaction_dynamic_level_bytes),
       max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
       max_bytes_for_level_multiplier_additional(
           options.max_bytes_for_level_multiplier_additional),
       expanded_compaction_factor(options.expanded_compaction_factor),
       source_compaction_factor(options.source_compaction_factor),
       max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
-      disable_seek_compaction(options.disable_seek_compaction),
       soft_rate_limit(options.soft_rate_limit),
       hard_rate_limit(options.hard_rate_limit),
       rate_limit_delay_max_milliseconds(
           options.rate_limit_delay_max_milliseconds),
-      no_block_cache(options.no_block_cache),
       arena_block_size(options.arena_block_size),
       disable_auto_compactions(options.disable_auto_compactions),
       purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
-      block_size_deviation(options.block_size_deviation),
       compaction_style(options.compaction_style),
       verify_checksums_in_compaction(options.verify_checksums_in_compaction),
       compaction_options_universal(options.compaction_options_universal),
+      compaction_options_fifo(options.compaction_options_fifo),
       filter_deletes(options.filter_deletes),
       max_sequential_skip_in_iterations(
           options.max_sequential_skip_in_iterations),
       memtable_factory(options.memtable_factory),
       table_factory(options.table_factory),
-      table_properties_collectors(options.table_properties_collectors),
+      table_properties_collector_factories(
+          options.table_properties_collector_factories),
       inplace_update_support(options.inplace_update_support),
       inplace_update_num_locks(options.inplace_update_num_locks),
       inplace_callback(options.inplace_callback),
@@ -150,23 +197,40 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
           options.memtable_prefix_bloom_huge_page_tlb_size),
       bloom_locality(options.bloom_locality),
       max_successive_merges(options.max_successive_merges),
-      min_partial_merge_operands(options.min_partial_merge_operands) {
+      min_partial_merge_operands(options.min_partial_merge_operands),
+      optimize_filters_for_hits(options.optimize_filters_for_hits),
+      paranoid_file_checks(options.paranoid_file_checks)
+#ifndef ROCKSDB_LITE
+      ,
+      listeners(options.listeners) {
+#else   // ROCKSDB_LITE
+{
+#endif  // ROCKSDB_LITE
   assert(memtable_factory.get() != nullptr);
+  if (max_bytes_for_level_multiplier_additional.size() <
+      static_cast<unsigned int>(num_levels)) {
+    max_bytes_for_level_multiplier_additional.resize(num_levels, 1);
+  }
 }
 
 DBOptions::DBOptions()
     : create_if_missing(false),
+      create_missing_column_families(false),
       error_if_exists(false),
       paranoid_checks(true),
       env(Env::Default()),
+      rate_limiter(nullptr),
       info_log(nullptr),
+#ifdef NDEBUG
       info_log_level(INFO_LEVEL),
+#else
+      info_log_level(DEBUG_LEVEL),
+#endif  // NDEBUG
       max_open_files(5000),
       max_total_wal_size(0),
       statistics(nullptr),
       disableDataSync(false),
       use_fsync(false),
-      db_stats_log_interval(1800),
       db_log_dir(""),
       wal_dir(""),
       delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),
@@ -177,7 +241,6 @@ DBOptions::DBOptions()
       keep_log_file_num(1000),
       max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
       table_cache_numshardbits(4),
-      table_cache_remove_scan_count_limit(16),
       WAL_ttl_seconds(0),
       WAL_size_limit_MB(0),
       manifest_preallocation_size(4 * 1024 * 1024),
@@ -188,16 +251,21 @@ DBOptions::DBOptions()
       skip_log_error_on_recovery(false),
       stats_dump_period_sec(3600),
       advise_random_on_open(true),
+      db_write_buffer_size(0),
       access_hint_on_compaction_start(NORMAL),
       use_adaptive_mutex(false),
       bytes_per_sync(0),
-      allow_thread_local(true) {}
+      wal_bytes_per_sync(0),
+      enable_thread_tracking(false) {
+}
 
 DBOptions::DBOptions(const Options& options)
     : create_if_missing(options.create_if_missing),
+      create_missing_column_families(options.create_missing_column_families),
       error_if_exists(options.error_if_exists),
       paranoid_checks(options.paranoid_checks),
       env(options.env),
+      rate_limiter(options.rate_limiter),
       info_log(options.info_log),
       info_log_level(options.info_log_level),
       max_open_files(options.max_open_files),
@@ -205,7 +273,7 @@ DBOptions::DBOptions(const Options& options)
       statistics(options.statistics),
       disableDataSync(options.disableDataSync),
       use_fsync(options.use_fsync),
-      db_stats_log_interval(options.db_stats_log_interval),
+      db_paths(options.db_paths),
       db_log_dir(options.db_log_dir),
       wal_dir(options.wal_dir),
       delete_obsolete_files_period_micros(
@@ -217,8 +285,6 @@ DBOptions::DBOptions(const Options& options)
       keep_log_file_num(options.keep_log_file_num),
       max_manifest_file_size(options.max_manifest_file_size),
       table_cache_numshardbits(options.table_cache_numshardbits),
-      table_cache_remove_scan_count_limit(
-          options.table_cache_remove_scan_count_limit),
       WAL_ttl_seconds(options.WAL_ttl_seconds),
       WAL_size_limit_MB(options.WAL_size_limit_MB),
       manifest_preallocation_size(options.manifest_preallocation_size),
@@ -229,10 +295,12 @@ DBOptions::DBOptions(const Options& options)
       skip_log_error_on_recovery(options.skip_log_error_on_recovery),
       stats_dump_period_sec(options.stats_dump_period_sec),
       advise_random_on_open(options.advise_random_on_open),
+      db_write_buffer_size(options.db_write_buffer_size),
       access_hint_on_compaction_start(options.access_hint_on_compaction_start),
       use_adaptive_mutex(options.use_adaptive_mutex),
       bytes_per_sync(options.bytes_per_sync),
-      allow_thread_local(options.allow_thread_local) {}
+      wal_bytes_per_sync(options.wal_bytes_per_sync),
+      enable_thread_tracking(options.enable_thread_tracking) {}
 
 static const char* const access_hints[] = {
   "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
@@ -249,32 +317,31 @@ void DBOptions::Dump(Logger* log) const {
     Log(log, "       Options.disableDataSync: %d", disableDataSync);
     Log(log, "             Options.use_fsync: %d", use_fsync);
     Log(log, "     Options.max_log_file_size: %zu", max_log_file_size);
-    Log(log, "Options.max_manifest_file_size: %lu",
-        (unsigned long)max_manifest_file_size);
+    Log(log, "Options.max_manifest_file_size: %" PRIu64,
+        max_manifest_file_size);
     Log(log, "     Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
     Log(log, "     Options.keep_log_file_num: %zu", keep_log_file_num);
-    Log(log, " Options.db_stats_log_interval: %d", db_stats_log_interval);
     Log(log, "       Options.allow_os_buffer: %d", allow_os_buffer);
     Log(log, "      Options.allow_mmap_reads: %d", allow_mmap_reads);
     Log(log, "     Options.allow_mmap_writes: %d", allow_mmap_writes);
+    Log(log, "         Options.create_missing_column_families: %d",
+        create_missing_column_families);
     Log(log, "                             Options.db_log_dir: %s",
         db_log_dir.c_str());
-    Log(log, "                             Options.wal_dir: %s",
+    Log(log, "                                Options.wal_dir: %s",
         wal_dir.c_str());
     Log(log, "               Options.table_cache_numshardbits: %d",
         table_cache_numshardbits);
-    Log(log, "    Options.table_cache_remove_scan_count_limit: %d",
-        table_cache_remove_scan_count_limit);
-    Log(log, "    Options.delete_obsolete_files_period_micros: %lu",
-        (unsigned long)delete_obsolete_files_period_micros);
+    Log(log, "    Options.delete_obsolete_files_period_micros: %" PRIu64,
+        delete_obsolete_files_period_micros);
     Log(log, "             Options.max_background_compactions: %d",
         max_background_compactions);
     Log(log, "                 Options.max_background_flushes: %d",
         max_background_flushes);
-    Log(log, "                        Options.WAL_ttl_seconds: %lu",
-        (unsigned long)WAL_ttl_seconds);
-    Log(log, "                      Options.WAL_size_limit_MB: %lu",
-        (unsigned long)WAL_size_limit_MB);
+    Log(log, "                        Options.WAL_ttl_seconds: %" PRIu64,
+        WAL_ttl_seconds);
+    Log(log, "                      Options.WAL_size_limit_MB: %" PRIu64,
+        WAL_size_limit_MB);
     Log(log, "            Options.manifest_preallocation_size: %zu",
         manifest_preallocation_size);
     Log(log, "                         Options.allow_os_buffer: %d",
@@ -285,58 +352,53 @@ void DBOptions::Dump(Logger* log) const {
         allow_mmap_writes);
     Log(log, "                     Options.is_fd_close_on_exec: %d",
         is_fd_close_on_exec);
-    Log(log, "              Options.skip_log_error_on_recovery: %d",
-        skip_log_error_on_recovery);
     Log(log, "                   Options.stats_dump_period_sec: %u",
         stats_dump_period_sec);
     Log(log, "                   Options.advise_random_on_open: %d",
         advise_random_on_open);
+    Log(log, "                    Options.db_write_buffer_size: %zd",
+        db_write_buffer_size);
     Log(log, "         Options.access_hint_on_compaction_start: %s",
         access_hints[access_hint_on_compaction_start]);
     Log(log, "                      Options.use_adaptive_mutex: %d",
         use_adaptive_mutex);
-    Log(log, "                          Options.bytes_per_sync: %lu",
-        (unsigned long)bytes_per_sync);
+    Log(log, "                            Options.rate_limiter: %p",
+        rate_limiter.get());
+    Log(log, "                          Options.bytes_per_sync: %" PRIu64,
+        bytes_per_sync);
+    Log(log, "                      Options.wal_bytes_per_sync: %" PRIu64,
+        wal_bytes_per_sync);
+    Log(log, "                  Options.enable_thread_tracking: %d",
+        enable_thread_tracking);
 }  // DBOptions::Dump
 
 void ColumnFamilyOptions::Dump(Logger* log) const {
   Log(log, "              Options.comparator: %s", comparator->Name());
   Log(log, "          Options.merge_operator: %s",
       merge_operator ? merge_operator->Name() : "None");
+  Log(log, "       Options.compaction_filter: %s",
+      compaction_filter ? compaction_filter->Name() : "None");
   Log(log, "       Options.compaction_filter_factory: %s",
       compaction_filter_factory->Name());
   Log(log, "       Options.compaction_filter_factory_v2: %s",
       compaction_filter_factory_v2->Name());
   Log(log, "        Options.memtable_factory: %s", memtable_factory->Name());
   Log(log, "           Options.table_factory: %s", table_factory->Name());
+  Log(log, "           table_factory options: %s",
+      table_factory->GetPrintableTableOptions().c_str());
   Log(log, "       Options.write_buffer_size: %zd", write_buffer_size);
   Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number);
-    Log(log,"             Options.block_cache: %p", block_cache.get());
-    Log(log,"  Options.block_cache_compressed: %p",
-        block_cache_compressed.get());
-    if (block_cache) {
-      Log(log,"        Options.block_cache_size: %zd",
-          block_cache->GetCapacity());
-    }
-    if (block_cache_compressed) {
-      Log(log,"Options.block_cache_compressed_size: %zd",
-          block_cache_compressed->GetCapacity());
-    }
-    Log(log,"              Options.block_size: %zd", block_size);
-    Log(log,"  Options.block_restart_interval: %d", block_restart_interval);
     if (!compression_per_level.empty()) {
       for (unsigned int i = 0; i < compression_per_level.size(); i++) {
-          Log(log,"       Options.compression[%d]: %d",
-              i, compression_per_level[i]);
-       }
+        Log(log, "       Options.compression[%d]: %s", i,
+            CompressionTypeToString(compression_per_level[i]));
+      }
     } else {
-      Log(log,"         Options.compression: %d", compression);
+      Log(log, "         Options.compression: %s",
+          CompressionTypeToString(compression));
     }
-    Log(log,"         Options.filter_policy: %s",
-        filter_policy == nullptr ? "nullptr" : filter_policy->Name());
     Log(log,"      Options.prefix_extractor: %s",
         prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
-    Log(log,"   Options.whole_key_filtering: %d", whole_key_filtering);
     Log(log,"            Options.num_levels: %d", num_levels);
     Log(log,"       Options.min_write_buffer_number_to_merge: %d",
         min_write_buffer_number_to_merge);
@@ -356,30 +418,29 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
         level0_stop_writes_trigger);
     Log(log,"               Options.max_mem_compaction_level: %d",
         max_mem_compaction_level);
-    Log(log,"                  Options.target_file_size_base: %d",
+    Log(log,"                  Options.target_file_size_base: %" PRIu64,
         target_file_size_base);
     Log(log,"            Options.target_file_size_multiplier: %d",
         target_file_size_multiplier);
-    Log(log,"               Options.max_bytes_for_level_base: %lu",
-        (unsigned long)max_bytes_for_level_base);
+    Log(log,"               Options.max_bytes_for_level_base: %" PRIu64,
+        max_bytes_for_level_base);
+    Log(log, "Options.level_compaction_dynamic_level_bytes: %d",
+        level_compaction_dynamic_level_bytes);
     Log(log,"         Options.max_bytes_for_level_multiplier: %d",
         max_bytes_for_level_multiplier);
-    for (int i = 0; i < num_levels; i++) {
-      Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
-          i, max_bytes_for_level_multiplier_additional[i]);
+    for (size_t i = 0; i < max_bytes_for_level_multiplier_additional.size();
+         i++) {
+      Log(log, "Options.max_bytes_for_level_multiplier_addtl[%zu]: %d", i,
+          max_bytes_for_level_multiplier_additional[i]);
     }
-    Log(log,"      Options.max_sequential_skip_in_iterations: %lu",
-        (unsigned long)max_sequential_skip_in_iterations);
+    Log(log,"      Options.max_sequential_skip_in_iterations: %" PRIu64,
+        max_sequential_skip_in_iterations);
     Log(log,"             Options.expanded_compaction_factor: %d",
         expanded_compaction_factor);
     Log(log,"               Options.source_compaction_factor: %d",
         source_compaction_factor);
     Log(log,"         Options.max_grandparent_overlap_factor: %d",
         max_grandparent_overlap_factor);
-    Log(log,"                Options.disable_seek_compaction: %d",
-        disable_seek_compaction);
-    Log(log,"                         Options.no_block_cache: %d",
-        no_block_cache);
     Log(log,"                       Options.arena_block_size: %zu",
         arena_block_size);
     Log(log,"                      Options.soft_rate_limit: %.2f",
@@ -392,8 +453,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
         disable_auto_compactions);
     Log(log,"         Options.purge_redundant_kvs_while_flush: %d",
         purge_redundant_kvs_while_flush);
-    Log(log,"                    Options.block_size_deviation: %d",
-        block_size_deviation);
     Log(log,"                          Options.filter_deletes: %d",
         filter_deletes);
     Log(log, "          Options.verify_checksums_in_compaction: %d",
@@ -410,11 +469,13 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
             "max_size_amplification_percent: %u",
         compaction_options_universal.max_size_amplification_percent);
     Log(log,
-        "Options.compaction_options_universal.compression_size_percent: %u",
+        "Options.compaction_options_universal.compression_size_percent: %d",
         compaction_options_universal.compression_size_percent);
+    Log(log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64,
+        compaction_options_fifo.max_table_files_size);
     std::string collector_names;
-    for (auto collector : table_properties_collectors) {
-      collector_names.append(collector->Name());
+    for (const auto& collector_factory : table_properties_collector_factories) {
+      collector_names.append(collector_factory->Name());
       collector_names.append("; ");
     }
     Log(log, "                  Options.table_properties_collectors: %s",
@@ -436,6 +497,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
         bloom_locality);
     Log(log, "                   Options.max_successive_merges: %zd",
         max_successive_merges);
+    Log(log, "               Options.optimize_fllters_for_hits: %d",
+        optimize_filters_for_hits);
 }  // ColumnFamilyOptions::Dump
 
 void Options::Dump(Logger* log) const {
@@ -458,7 +521,6 @@ Options::PrepareForBulkLoad()
   // no auto compactions please. The application should issue a
   // manual compaction after all data is loaded into L0.
   disable_auto_compactions = true;
-  disable_seek_compaction = true;
   disableDataSync = true;
 
   // A manual compaction run should pick all files in L0 in
@@ -470,6 +532,15 @@ Options::PrepareForBulkLoad()
   // increasing the total time needed for compactions.
   num_levels = 2;
 
+  // Need to allow more write buffers to allow more parallism
+  // of flushes.
+  max_write_buffer_number = 6;
+  min_write_buffer_number_to_merge = 1;
+
+  // When compaction is disabled, more parallel flush threads can
+  // help with write throughput.
+  max_background_flushes = 4;
+
   // Prevent a memtable flush to automatically promote files
   // to L1. This is helpful so that all files that are
   // input to the manual compaction are all at L0.
@@ -480,4 +551,141 @@ Options::PrepareForBulkLoad()
   return this;
 }
 
+const char* CompressionTypeToString(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return "NoCompression";
+    case kSnappyCompression:
+      return "Snappy";
+    case kZlibCompression:
+      return "Zlib";
+    case kBZip2Compression:
+      return "BZip2";
+    case kLZ4Compression:
+      return "LZ4";
+    case kLZ4HCCompression:
+      return "LZ4HC";
+    default:
+      assert(false);
+      return "";
+  }
+}
+
+bool CompressionTypeSupported(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return true;
+    case kSnappyCompression:
+      return Snappy_Supported();
+    case kZlibCompression:
+      return Zlib_Supported();
+    case kBZip2Compression:
+      return BZip2_Supported();
+    case kLZ4Compression:
+      return LZ4_Supported();
+    case kLZ4HCCompression:
+      return LZ4_Supported();
+    default:
+      assert(false);
+      return false;
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// Optimization functions
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
+    uint64_t block_cache_size_mb) {
+  prefix_extractor.reset(NewNoopTransform());
+  BlockBasedTableOptions block_based_options;
+  block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
+  block_based_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  block_based_options.block_cache =
+      NewLRUCache(static_cast<size_t>(block_cache_size_mb * 1024 * 1024));
+  table_factory.reset(new BlockBasedTableFactory(block_based_options));
+  memtable_factory.reset(NewHashLinkListRepFactory());
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction(
+    uint64_t memtable_memory_budget) {
+  write_buffer_size = static_cast<size_t>(memtable_memory_budget / 4);
+  // merge two memtables when flushing to L0
+  min_write_buffer_number_to_merge = 2;
+  // this means we'll use 50% extra memory in the worst case, but will reduce
+  // write stalls.
+  max_write_buffer_number = 6;
+  // start flushing L0->L1 as soon as possible. each file on level0 is
+  // (memtable_memory_budget / 2). This will flush level 0 when it's bigger than
+  // memtable_memory_budget.
+  level0_file_num_compaction_trigger = 2;
+  // doesn't really matter much, but we don't want to create too many files
+  target_file_size_base = memtable_memory_budget / 8;
+  // make Level1 size equal to Level0 size, so that L0->L1 compactions are fast
+  max_bytes_for_level_base = memtable_memory_budget;
+
+  // level style compaction
+  compaction_style = kCompactionStyleLevel;
+
+  // only compress levels >= 2
+  compression_per_level.resize(num_levels);
+  for (int i = 0; i < num_levels; ++i) {
+    if (i < 2) {
+      compression_per_level[i] = kNoCompression;
+    } else {
+      compression_per_level[i] = kSnappyCompression;
+    }
+  }
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeUniversalStyleCompaction(
+    uint64_t memtable_memory_budget) {
+  write_buffer_size = static_cast<size_t>(memtable_memory_budget / 4);
+  // merge two memtables when flushing to L0
+  min_write_buffer_number_to_merge = 2;
+  // this means we'll use 50% extra memory in the worst case, but will reduce
+  // write stalls.
+  max_write_buffer_number = 6;
+  // universal style compaction
+  compaction_style = kCompactionStyleUniversal;
+  compaction_options_universal.compression_size_percent = 80;
+  return this;
+}
+
+DBOptions* DBOptions::IncreaseParallelism(int total_threads) {
+  max_background_compactions = total_threads - 1;
+  max_background_flushes = 1;
+  env->SetBackgroundThreads(total_threads, Env::LOW);
+  env->SetBackgroundThreads(1, Env::HIGH);
+  return this;
+}
+
+#endif  // !ROCKSDB_LITE
+
+ReadOptions::ReadOptions()
+    : verify_checksums(true),
+      fill_cache(true),
+      snapshot(nullptr),
+      iterate_upper_bound(nullptr),
+      read_tier(kReadAllTier),
+      tailing(false),
+      managed(false),
+      total_order_seek(false) {
+  XFUNC_TEST("", "managed_options", managed_options, xf_manage_options,
+             reinterpret_cast<ReadOptions*>(this));
+}
+
+ReadOptions::ReadOptions(bool cksum, bool cache)
+    : verify_checksums(cksum),
+      fill_cache(cache),
+      snapshot(nullptr),
+      iterate_upper_bound(nullptr),
+      read_tier(kReadAllTier),
+      tailing(false),
+      managed(false),
+      total_order_seek(false) {
+  XFUNC_TEST("", "managed_options", managed_options, xf_manage_options,
+             reinterpret_cast<ReadOptions*>(this));
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/options_builder.cc b/src/rocksdb/util/options_builder.cc
new file mode 100644
index 0000000..3ac3deb
--- /dev/null
+++ b/src/rocksdb/util/options_builder.cc
@@ -0,0 +1,206 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <math.h>
+#include <algorithm>
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+namespace {
+
+// For now, always use 1-0 as level bytes multiplier.
+const int kBytesForLevelMultiplier = 10;
+const size_t kBytesForOneMb = 1024 * 1024;
+
+// Pick compaction style
+CompactionStyle PickCompactionStyle(size_t write_buffer_size,
+                                    int read_amp_threshold,
+                                    int write_amp_threshold,
+                                    uint64_t target_db_size) {
+#ifndef ROCKSDB_LITE
+  // Estimate read amplification and write amplification of two compaction
+  // styles. If there is hard limit to force a choice, make the choice.
+  // Otherwise, calculate a score based on threshold and expected value of
+  // two styles, weighing reads 4X important than writes.
+  int expected_levels = static_cast<int>(ceil(
+      ::log(target_db_size / write_buffer_size) / ::log(kBytesForLevelMultiplier)));
+
+  int expected_max_files_universal =
+      static_cast<int>(ceil(log2(target_db_size / write_buffer_size)));
+
+  const int kEstimatedLevel0FilesInLevelStyle = 2;
+  // Estimate write amplification:
+  // (1) 1 for every L0 file
+  // (2) 2 for L1
+  // (3) kBytesForLevelMultiplier for the last level. It's really hard to
+  //     predict.
+  // (3) kBytesForLevelMultiplier for other levels.
+  int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2
+      + (expected_levels - 2) * kBytesForLevelMultiplier
+      + kBytesForLevelMultiplier;
+  int expected_read_amp_level =
+      kEstimatedLevel0FilesInLevelStyle + expected_levels;
+
+  int max_read_amp_uni = expected_max_files_universal;
+  if (read_amp_threshold <= max_read_amp_uni) {
+    return kCompactionStyleLevel;
+  } else if (write_amp_threshold <= expected_write_amp_level) {
+    return kCompactionStyleUniversal;
+  }
+
+  const double kReadWriteWeight = 4;
+
+  double level_ratio =
+      static_cast<double>(read_amp_threshold) / expected_read_amp_level *
+          kReadWriteWeight +
+      static_cast<double>(write_amp_threshold) / expected_write_amp_level;
+
+  int expected_write_amp_uni = expected_max_files_universal / 2 + 2;
+  int expected_read_amp_uni = expected_max_files_universal / 2 + 1;
+
+  double uni_ratio =
+      static_cast<double>(read_amp_threshold) / expected_read_amp_uni *
+          kReadWriteWeight +
+      static_cast<double>(write_amp_threshold) / expected_write_amp_uni;
+
+  if (level_ratio > uni_ratio) {
+    return kCompactionStyleLevel;
+  } else {
+    return kCompactionStyleUniversal;
+  }
+#else
+  return kCompactionStyleLevel;
+#endif  // !ROCKSDB_LITE
+}
+
+// Pick mem table size
+void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) {
+  const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb;
+  const size_t kMinWriteBufferSize = 4 * kBytesForOneMb;
+
+  // Try to pick up a buffer size between 4MB and 128MB.
+  // And try to pick 4 as the total number of write buffers.
+  size_t write_buffer_size = total_write_buffer_limit / 4;
+  if (write_buffer_size > kMaxWriteBufferSize) {
+    write_buffer_size = kMaxWriteBufferSize;
+  } else if (write_buffer_size < kMinWriteBufferSize) {
+    write_buffer_size = std::min(static_cast<size_t>(kMinWriteBufferSize),
+                                 total_write_buffer_limit / 2);
+  }
+
+  // Truncate to multiple of 1MB.
+  if (write_buffer_size % kBytesForOneMb != 0) {
+    write_buffer_size =
+        (write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb;
+  }
+
+  options->write_buffer_size = write_buffer_size;
+  options->max_write_buffer_number =
+      static_cast<int>(total_write_buffer_limit / write_buffer_size);
+  options->min_write_buffer_number_to_merge = 1;
+}
+
+#ifndef ROCKSDB_LITE
+void OptimizeForUniversal(Options* options) {
+  options->level0_file_num_compaction_trigger = 2;
+  options->level0_slowdown_writes_trigger = 30;
+  options->level0_stop_writes_trigger = 40;
+  options->max_open_files = -1;
+}
+#endif
+
+// Optimize parameters for level-based compaction
+void OptimizeForLevel(int read_amplification_threshold,
+                      int write_amplification_threshold,
+                      uint64_t target_db_size, Options* options) {
+  int expected_levels_one_level0_file =
+      static_cast<int>(ceil(::log(target_db_size / options->write_buffer_size) /
+                            ::log(kBytesForLevelMultiplier)));
+
+  int level0_stop_writes_trigger =
+      read_amplification_threshold - expected_levels_one_level0_file;
+
+  const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb;
+  const int kMaxFileNumCompactionTrigger = 4;
+  const int kMinLevel0StopTrigger = 3;
+
+  int file_num_buffer =
+      kInitialLevel0TotalSize / options->write_buffer_size + 1;
+
+  if (level0_stop_writes_trigger > file_num_buffer) {
+    // Have sufficient room for multiple level 0 files
+    // Try enlarge the buffer up to 1GB
+
+    // Try to enlarge the buffer up to 1GB, if still have sufficient headroom.
+    file_num_buffer *=
+        1 << std::max(0, std::min(3, level0_stop_writes_trigger -
+                                       file_num_buffer - 2));
+
+    options->level0_stop_writes_trigger = level0_stop_writes_trigger;
+    options->level0_slowdown_writes_trigger = level0_stop_writes_trigger - 2;
+    options->level0_file_num_compaction_trigger =
+        std::min(kMaxFileNumCompactionTrigger, file_num_buffer / 2);
+  } else {
+    options->level0_stop_writes_trigger =
+        std::max(kMinLevel0StopTrigger, file_num_buffer);
+    options->level0_slowdown_writes_trigger =
+        options->level0_stop_writes_trigger - 1;
+    options->level0_file_num_compaction_trigger = 1;
+  }
+
+  // This doesn't consider compaction and overheads of mem tables. But usually
+  // it is in the same order of magnitude.
+  size_t expected_level0_compaction_size =
+      options->level0_file_num_compaction_trigger * options->write_buffer_size;
+  // Enlarge level1 target file size if level0 compaction size is larger.
+  uint64_t max_bytes_for_level_base = 10 * kBytesForOneMb;
+  if (expected_level0_compaction_size > max_bytes_for_level_base) {
+    max_bytes_for_level_base = expected_level0_compaction_size;
+  }
+  options->max_bytes_for_level_base = max_bytes_for_level_base;
+  // Now always set level multiplier to be 10
+  options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier;
+
+  const uint64_t kMinFileSize = 2 * kBytesForOneMb;
+  // Allow at least 3-way parallelism for compaction between level 1 and 2.
+  uint64_t max_file_size = max_bytes_for_level_base / 3;
+  if (max_file_size < kMinFileSize) {
+    options->target_file_size_base = kMinFileSize;
+  } else {
+    if (max_file_size % kBytesForOneMb != 0) {
+      max_file_size = (max_file_size / kBytesForOneMb + 1) * kBytesForOneMb;
+    }
+    options->target_file_size_base = max_file_size;
+  }
+
+  // TODO: consider to tune num_levels too.
+}
+
+}  // namespace
+
+Options GetOptions(size_t total_write_buffer_limit,
+                   int read_amplification_threshold,
+                   int write_amplification_threshold, uint64_t target_db_size) {
+  Options options;
+  PickWriteBufferSize(total_write_buffer_limit, &options);
+  size_t write_buffer_size = options.write_buffer_size;
+  options.compaction_style =
+      PickCompactionStyle(write_buffer_size, read_amplification_threshold,
+                          write_amplification_threshold, target_db_size);
+#ifndef ROCKSDB_LITE
+  if (options.compaction_style == kCompactionStyleUniversal) {
+    OptimizeForUniversal(&options);
+  } else {
+#else
+  {
+#endif  // !ROCKSDB_LITE
+    OptimizeForLevel(read_amplification_threshold,
+                     write_amplification_threshold, target_db_size, &options);
+  }
+  return options;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/options_helper.cc b/src/rocksdb/util/options_helper.cc
new file mode 100644
index 0000000..07fc053
--- /dev/null
+++ b/src/rocksdb/util/options_helper.cc
@@ -0,0 +1,722 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <unordered_set>
+#include "rocksdb/cache.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/convenience.h"
+#include "table/block_based_table_factory.h"
+#include "util/logging.h"
+#include "util/options_helper.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+
+namespace {
+CompressionType ParseCompressionType(const std::string& type) {
+  if (type == "kNoCompression") {
+    return kNoCompression;
+  } else if (type == "kSnappyCompression") {
+    return kSnappyCompression;
+  } else if (type == "kZlibCompression") {
+    return kZlibCompression;
+  } else if (type == "kBZip2Compression") {
+    return kBZip2Compression;
+  } else if (type == "kLZ4Compression") {
+    return kLZ4Compression;
+  } else if (type == "kLZ4HCCompression") {
+    return kLZ4HCCompression;
+  } else {
+    throw std::invalid_argument("Unknown compression type: " + type);
+  }
+  return kNoCompression;
+}
+
+BlockBasedTableOptions::IndexType ParseBlockBasedTableIndexType(
+    const std::string& type) {
+  if (type == "kBinarySearch") {
+    return BlockBasedTableOptions::kBinarySearch;
+  } else if (type == "kHashSearch") {
+    return BlockBasedTableOptions::kHashSearch;
+  }
+  throw std::invalid_argument("Unknown index type: " + type);
+}
+
+ChecksumType ParseBlockBasedTableChecksumType(
+    const std::string& type) {
+  if (type == "kNoChecksum") {
+    return kNoChecksum;
+  } else if (type == "kCRC32c") {
+    return kCRC32c;
+  } else if (type == "kxxHash") {
+    return kxxHash;
+  }
+  throw std::invalid_argument("Unknown checksum type: " + type);
+}
+
+bool ParseBoolean(const std::string& type, const std::string& value) {
+  if (value == "true" || value == "1") {
+    return true;
+  } else if (value == "false" || value == "0") {
+    return false;
+  }
+  throw std::invalid_argument(type);
+}
+
+uint64_t ParseUint64(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  uint64_t num = std::stoull(value.c_str(), &endchar);
+#else
+  char* endptr;
+  uint64_t num = std::strtoul(value.c_str(), &endptr, 0);
+  endchar = endptr - value.c_str();
+#endif
+
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    if (c == 'k' || c == 'K')
+      num <<= 10LL;
+    else if (c == 'm' || c == 'M')
+      num <<= 20LL;
+    else if (c == 'g' || c == 'G')
+      num <<= 30LL;
+    else if (c == 't' || c == 'T')
+      num <<= 40LL;
+  }
+
+  return num;
+}
+
+size_t ParseSizeT(const std::string& value) {
+  return static_cast<size_t>(ParseUint64(value));
+}
+
+uint32_t ParseUint32(const std::string& value) {
+  uint64_t num = ParseUint64(value);
+  if ((num >> 32LL) == 0) {
+    return static_cast<uint32_t>(num);
+  } else {
+    throw std::out_of_range(value);
+  }
+}
+
+int ParseInt(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  int num = std::stoi(value.c_str(), &endchar);
+#else
+  char* endptr;
+  int num = std::strtoul(value.c_str(), &endptr, 0);
+  endchar = endptr - value.c_str();
+#endif
+
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    if (c == 'k' || c == 'K')
+      num <<= 10;
+    else if (c == 'm' || c == 'M')
+      num <<= 20;
+    else if (c == 'g' || c == 'G')
+      num <<= 30;
+  }
+
+  return num;
+}
+
+double ParseDouble(const std::string& value) {
+#ifndef CYGWIN
+  return std::stod(value);
+#else
+  return std::strtod(value.c_str(), 0);
+#endif
+}
+
+CompactionStyle ParseCompactionStyle(const std::string& type) {
+  if (type == "kCompactionStyleLevel") {
+    return kCompactionStyleLevel;
+  } else if (type == "kCompactionStyleUniversal") {
+    return kCompactionStyleUniversal;
+  } else if (type == "kCompactionStyleFIFO") {
+    return kCompactionStyleFIFO;
+  } else {
+    throw std::invalid_argument("unknown compaction style: " + type);
+  }
+  return kCompactionStyleLevel;
+}
+}  // anonymouse namespace
+
+template<typename OptionsType>
+bool ParseMemtableOptions(const std::string& name, const std::string& value,
+                          OptionsType* new_options) {
+  if (name == "write_buffer_size") {
+    new_options->write_buffer_size = ParseSizeT(value);
+  } else if (name == "arena_block_size") {
+    new_options->arena_block_size = ParseSizeT(value);
+  } else if (name == "memtable_prefix_bloom_bits") {
+    new_options->memtable_prefix_bloom_bits = ParseUint32(value);
+  } else if (name == "memtable_prefix_bloom_probes") {
+    new_options->memtable_prefix_bloom_probes = ParseUint32(value);
+  } else if (name == "memtable_prefix_bloom_huge_page_tlb_size") {
+    new_options->memtable_prefix_bloom_huge_page_tlb_size =
+      ParseSizeT(value);
+  } else if (name == "max_successive_merges") {
+    new_options->max_successive_merges = ParseSizeT(value);
+  } else if (name == "filter_deletes") {
+    new_options->filter_deletes = ParseBoolean(name, value);
+  } else if (name == "max_write_buffer_number") {
+    new_options->max_write_buffer_number = ParseInt(value);
+  } else if (name == "inplace_update_num_locks") {
+    new_options->inplace_update_num_locks = ParseSizeT(value);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+template<typename OptionsType>
+bool ParseCompactionOptions(const std::string& name, const std::string& value,
+                            OptionsType* new_options) {
+  if (name == "disable_auto_compactions") {
+    new_options->disable_auto_compactions = ParseBoolean(name, value);
+  } else if (name == "soft_rate_limit") {
+    new_options->soft_rate_limit = ParseDouble(value);
+  } else if (name == "hard_rate_limit") {
+    new_options->hard_rate_limit = ParseDouble(value);
+  } else if (name == "level0_file_num_compaction_trigger") {
+    new_options->level0_file_num_compaction_trigger = ParseInt(value);
+  } else if (name == "level0_slowdown_writes_trigger") {
+    new_options->level0_slowdown_writes_trigger = ParseInt(value);
+  } else if (name == "level0_stop_writes_trigger") {
+    new_options->level0_stop_writes_trigger = ParseInt(value);
+  } else if (name == "max_grandparent_overlap_factor") {
+    new_options->max_grandparent_overlap_factor = ParseInt(value);
+  } else if (name == "expanded_compaction_factor") {
+    new_options->expanded_compaction_factor = ParseInt(value);
+  } else if (name == "source_compaction_factor") {
+    new_options->source_compaction_factor = ParseInt(value);
+  } else if (name == "target_file_size_base") {
+    new_options->target_file_size_base = ParseInt(value);
+  } else if (name == "target_file_size_multiplier") {
+    new_options->target_file_size_multiplier = ParseInt(value);
+  } else if (name == "max_bytes_for_level_base") {
+    new_options->max_bytes_for_level_base = ParseUint64(value);
+  } else if (name == "max_bytes_for_level_multiplier") {
+    new_options->max_bytes_for_level_multiplier = ParseInt(value);
+  } else if (name == "max_bytes_for_level_multiplier_additional") {
+    new_options->max_bytes_for_level_multiplier_additional.clear();
+    size_t start = 0;
+    while (true) {
+      size_t end = value.find(':', start);
+      if (end == std::string::npos) {
+        new_options->max_bytes_for_level_multiplier_additional.push_back(
+            ParseInt(value.substr(start)));
+        break;
+      } else {
+        new_options->max_bytes_for_level_multiplier_additional.push_back(
+            ParseInt(value.substr(start, end - start)));
+        start = end + 1;
+      }
+    }
+  } else if (name == "max_mem_compaction_level") {
+    new_options->max_mem_compaction_level = ParseInt(value);
+  } else if (name == "verify_checksums_in_compaction") {
+    new_options->verify_checksums_in_compaction = ParseBoolean(name, value);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+template<typename OptionsType>
+bool ParseMiscOptions(const std::string& name, const std::string& value,
+                      OptionsType* new_options) {
+  if (name == "max_sequential_skip_in_iterations") {
+    new_options->max_sequential_skip_in_iterations = ParseUint64(value);
+  } else if (name == "paranoid_file_checks") {
+    new_options->paranoid_file_checks = ParseBoolean(name, value);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+Status GetMutableOptionsFromStrings(
+    const MutableCFOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    MutableCFOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  for (const auto& o : options_map) {
+    try {
+      if (ParseMemtableOptions(o.first, o.second, new_options)) {
+      } else if (ParseCompactionOptions(o.first, o.second, new_options)) {
+      } else if (ParseMiscOptions(o.first, o.second, new_options)) {
+      } else {
+        return Status::InvalidArgument(
+            "unsupported dynamic option: " + o.first);
+      }
+    } catch (std::exception& e) {
+      return Status::InvalidArgument("error parsing " + o.first + ":" +
+                                     std::string(e.what()));
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+
+std::string trim(const std::string& str) {
+  size_t start = 0;
+  size_t end = str.size() - 1;
+  while (isspace(str[start]) != 0 && start <= end) {
+    ++start;
+  }
+  while (isspace(str[end]) != 0 && start <= end) {
+    --end;
+  }
+  if (start <= end) {
+    return str.substr(start, end - start + 1);
+  }
+  return std::string();
+}
+
+}  // anonymous namespace
+
+Status StringToMap(const std::string& opts_str,
+                   std::unordered_map<std::string, std::string>* opts_map) {
+  assert(opts_map);
+  // Example:
+  //   opts_str = "write_buffer_size=1024;max_write_buffer_number=2;"
+  //              "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100"
+  size_t pos = 0;
+  std::string opts = trim(opts_str);
+  while (pos < opts.size()) {
+    size_t eq_pos = opts.find('=', pos);
+    if (eq_pos == std::string::npos) {
+      return Status::InvalidArgument("Mismatched key value pair, '=' expected");
+    }
+    std::string key = trim(opts.substr(pos, eq_pos - pos));
+    if (key.empty()) {
+      return Status::InvalidArgument("Empty key found");
+    }
+
+    // skip space after '=' and look for '{' for possible nested options
+    pos = eq_pos + 1;
+    while (pos < opts.size() && isspace(opts[pos])) {
+      ++pos;
+    }
+    // Empty value at the end
+    if (pos >= opts.size()) {
+      (*opts_map)[key] = "";
+      break;
+    }
+    if (opts[pos] == '{') {
+      int count = 1;
+      size_t brace_pos = pos + 1;
+      while (brace_pos < opts.size()) {
+        if (opts[brace_pos] == '{') {
+          ++count;
+        } else if (opts[brace_pos] == '}') {
+          --count;
+          if (count == 0) {
+            break;
+          }
+        }
+        ++brace_pos;
+      }
+      // found the matching closing brace
+      if (count == 0) {
+        (*opts_map)[key] = trim(opts.substr(pos + 1, brace_pos - pos - 1));
+        // skip all whitespace and move to the next ';'
+        // brace_pos points to the next position after the matching '}'
+        pos = brace_pos + 1;
+        while (pos < opts.size() && isspace(opts[pos])) {
+          ++pos;
+        }
+        if (pos < opts.size() && opts[pos] != ';') {
+          return Status::InvalidArgument(
+              "Unexpected chars after nested options");
+        }
+        ++pos;
+      } else {
+        return Status::InvalidArgument(
+            "Mismatched curly braces for nested options");
+      }
+    } else {
+      size_t sc_pos = opts.find(';', pos);
+      if (sc_pos == std::string::npos) {
+        (*opts_map)[key] = trim(opts.substr(pos));
+        // It either ends with a trailing semi-colon or the last key-value pair
+        break;
+      } else {
+        (*opts_map)[key] = trim(opts.substr(pos, sc_pos - pos));
+      }
+      pos = sc_pos + 1;
+    }
+  }
+
+  return Status::OK();
+}
+
+bool ParseColumnFamilyOption(const std::string& name, const std::string& value,
+                             ColumnFamilyOptions* new_options) {
+  try {
+    if (ParseMemtableOptions(name, value, new_options)) {
+    } else if (ParseCompactionOptions(name, value, new_options)) {
+    } else if (ParseMiscOptions(name, value, new_options)) {
+    } else if (name == "block_based_table_factory") {
+      // Nested options
+      BlockBasedTableOptions table_opt, base_table_options;
+      auto block_based_table_factory = dynamic_cast<BlockBasedTableFactory*>(
+          new_options->table_factory.get());
+      if (block_based_table_factory != nullptr) {
+        base_table_options = block_based_table_factory->GetTableOptions();
+      }
+      Status table_opt_s = GetBlockBasedTableOptionsFromString(
+          base_table_options, value, &table_opt);
+      if (!table_opt_s.ok()) {
+        return false;
+      }
+      new_options->table_factory.reset(NewBlockBasedTableFactory(table_opt));
+    } else if (name == "min_write_buffer_number_to_merge") {
+      new_options->min_write_buffer_number_to_merge = ParseInt(value);
+    } else if (name == "compression") {
+      new_options->compression = ParseCompressionType(value);
+    } else if (name == "compression_per_level") {
+      new_options->compression_per_level.clear();
+      size_t start = 0;
+      while (true) {
+        size_t end = value.find(':', start);
+        if (end == std::string::npos) {
+          new_options->compression_per_level.push_back(
+              ParseCompressionType(value.substr(start)));
+          break;
+        } else {
+          new_options->compression_per_level.push_back(
+              ParseCompressionType(value.substr(start, end - start)));
+          start = end + 1;
+        }
+      }
+    } else if (name == "compression_opts") {
+      size_t start = 0;
+      size_t end = value.find(':');
+      if (end == std::string::npos) {
+        return false;
+      }
+      new_options->compression_opts.window_bits =
+          ParseInt(value.substr(start, end - start));
+      start = end + 1;
+      end = value.find(':', start);
+      if (end == std::string::npos) {
+        return false;
+      }
+      new_options->compression_opts.level =
+          ParseInt(value.substr(start, end - start));
+      start = end + 1;
+      if (start >= value.size()) {
+        return false;
+      }
+      new_options->compression_opts.strategy =
+          ParseInt(value.substr(start, value.size() - start));
+    } else if (name == "num_levels") {
+      new_options->num_levels = ParseInt(value);
+    } else if (name == "level_compaction_dynamic_level_bytes") {
+      new_options->level_compaction_dynamic_level_bytes =
+          ParseBoolean(name, value);
+    } else if (name == "purge_redundant_kvs_while_flush") {
+      new_options->purge_redundant_kvs_while_flush =
+          ParseBoolean(name, value);
+    } else if (name == "compaction_style") {
+      new_options->compaction_style = ParseCompactionStyle(value);
+    } else if (name == "compaction_options_universal") {
+      // TODO(ljin): add support
+      return false;
+    } else if (name == "compaction_options_fifo") {
+      new_options->compaction_options_fifo.max_table_files_size =
+          ParseUint64(value);
+    } else if (name == "bloom_locality") {
+      new_options->bloom_locality = ParseUint32(value);
+    } else if (name == "min_partial_merge_operands") {
+      new_options->min_partial_merge_operands = ParseUint32(value);
+    } else if (name == "inplace_update_support") {
+      new_options->inplace_update_support = ParseBoolean(name, value);
+    } else if (name == "prefix_extractor") {
+      const std::string kFixedPrefixName = "fixed:";
+      const std::string kCappedPrefixName = "capped:";
+      auto& pe_value = value;
+      if (pe_value.size() > kFixedPrefixName.size() &&
+          pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == 0) {
+        int prefix_length =
+            ParseInt(trim(value.substr(kFixedPrefixName.size())));
+        new_options->prefix_extractor.reset(
+            NewFixedPrefixTransform(prefix_length));
+      } else if (pe_value.size() > kCappedPrefixName.size() &&
+                 pe_value.compare(0, kCappedPrefixName.size(),
+                                  kCappedPrefixName) == 0) {
+        int prefix_length =
+            ParseInt(trim(pe_value.substr(kCappedPrefixName.size())));
+        new_options->prefix_extractor.reset(
+            NewCappedPrefixTransform(prefix_length));
+      } else {
+        return false;
+      }
+    } else if (name == "optimize_filters_for_hits") {
+      new_options->optimize_filters_for_hits = ParseBoolean(name, value);
+    } else {
+      return false;
+    }
+  }
+  catch (std::exception& e) {
+    return false;
+  }
+  return true;
+}
+
+bool ParseDBOption(const std::string& name, const std::string& value,
+                   DBOptions* new_options) {
+  try {
+    if (name == "create_if_missing") {
+      new_options->create_if_missing = ParseBoolean(name, value);
+    } else if (name == "create_missing_column_families") {
+      new_options->create_missing_column_families =
+          ParseBoolean(name, value);
+    } else if (name == "error_if_exists") {
+      new_options->error_if_exists = ParseBoolean(name, value);
+    } else if (name == "paranoid_checks") {
+      new_options->paranoid_checks = ParseBoolean(name, value);
+    } else if (name == "rate_limiter_bytes_per_sec") {
+      new_options->rate_limiter.reset(
+          NewGenericRateLimiter(static_cast<int64_t>(ParseUint64(value))));
+    } else if (name == "max_open_files") {
+      new_options->max_open_files = ParseInt(value);
+    } else if (name == "max_total_wal_size") {
+      new_options->max_total_wal_size = ParseUint64(value);
+    } else if (name == "disable_data_sync") {
+      new_options->disableDataSync = ParseBoolean(name, value);
+    } else if (name == "use_fsync") {
+      new_options->use_fsync = ParseBoolean(name, value);
+    } else if (name == "db_paths") {
+      // TODO(ljin): add support
+      return false;
+    } else if (name == "db_log_dir") {
+      new_options->db_log_dir = value;
+    } else if (name == "wal_dir") {
+      new_options->wal_dir = value;
+    } else if (name == "delete_obsolete_files_period_micros") {
+      new_options->delete_obsolete_files_period_micros = ParseUint64(value);
+    } else if (name == "max_background_compactions") {
+      new_options->max_background_compactions = ParseInt(value);
+    } else if (name == "max_background_flushes") {
+      new_options->max_background_flushes = ParseInt(value);
+    } else if (name == "max_log_file_size") {
+      new_options->max_log_file_size = ParseSizeT(value);
+    } else if (name == "log_file_time_to_roll") {
+      new_options->log_file_time_to_roll = ParseSizeT(value);
+    } else if (name == "keep_log_file_num") {
+      new_options->keep_log_file_num = ParseSizeT(value);
+    } else if (name == "max_manifest_file_size") {
+      new_options->max_manifest_file_size = ParseUint64(value);
+    } else if (name == "table_cache_numshardbits") {
+      new_options->table_cache_numshardbits = ParseInt(value);
+    } else if (name == "WAL_ttl_seconds") {
+      new_options->WAL_ttl_seconds = ParseUint64(value);
+    } else if (name == "WAL_size_limit_MB") {
+      new_options->WAL_size_limit_MB = ParseUint64(value);
+    } else if (name == "manifest_preallocation_size") {
+      new_options->manifest_preallocation_size = ParseSizeT(value);
+    } else if (name == "allow_os_buffer") {
+      new_options->allow_os_buffer = ParseBoolean(name, value);
+    } else if (name == "allow_mmap_reads") {
+      new_options->allow_mmap_reads = ParseBoolean(name, value);
+    } else if (name == "allow_mmap_writes") {
+      new_options->allow_mmap_writes = ParseBoolean(name, value);
+    } else if (name == "is_fd_close_on_exec") {
+      new_options->is_fd_close_on_exec = ParseBoolean(name, value);
+    } else if (name == "skip_log_error_on_recovery") {
+      new_options->skip_log_error_on_recovery = ParseBoolean(name, value);
+    } else if (name == "stats_dump_period_sec") {
+      new_options->stats_dump_period_sec = ParseUint32(value);
+    } else if (name == "advise_random_on_open") {
+      new_options->advise_random_on_open = ParseBoolean(name, value);
+    } else if (name == "db_write_buffer_size") {
+      new_options->db_write_buffer_size = ParseUint64(value);
+    } else if (name == "use_adaptive_mutex") {
+      new_options->use_adaptive_mutex = ParseBoolean(name, value);
+    } else if (name == "bytes_per_sync") {
+      new_options->bytes_per_sync = ParseUint64(value);
+    } else if (name == "wal_bytes_per_sync") {
+      new_options->wal_bytes_per_sync = ParseUint64(value);
+    } else {
+      return false;
+    }
+  }
+  catch (std::exception& e) {
+    return false;
+  }
+  return true;
+}
+
+Status GetBlockBasedTableOptionsFromMap(
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options) {
+
+  assert(new_table_options);
+  *new_table_options = table_options;
+  for (const auto& o : opts_map) {
+    try {
+      if (o.first == "cache_index_and_filter_blocks") {
+        new_table_options->cache_index_and_filter_blocks =
+          ParseBoolean(o.first, o.second);
+      } else if (o.first == "index_type") {
+        new_table_options->index_type = ParseBlockBasedTableIndexType(o.second);
+      } else if (o.first == "hash_index_allow_collision") {
+        new_table_options->hash_index_allow_collision =
+          ParseBoolean(o.first, o.second);
+      } else if (o.first == "checksum") {
+        new_table_options->checksum =
+          ParseBlockBasedTableChecksumType(o.second);
+      } else if (o.first == "no_block_cache") {
+        new_table_options->no_block_cache = ParseBoolean(o.first, o.second);
+      } else if (o.first == "block_cache") {
+        new_table_options->block_cache = NewLRUCache(ParseSizeT(o.second));
+      } else if (o.first == "block_cache_compressed") {
+        new_table_options->block_cache_compressed =
+          NewLRUCache(ParseSizeT(o.second));
+      } else if (o.first == "block_size") {
+        new_table_options->block_size = ParseSizeT(o.second);
+      } else if (o.first == "block_size_deviation") {
+        new_table_options->block_size_deviation = ParseInt(o.second);
+      } else if (o.first == "block_restart_interval") {
+        new_table_options->block_restart_interval = ParseInt(o.second);
+      } else if (o.first == "filter_policy") {
+        // Expect the following format
+        // bloomfilter:int:bool
+        const std::string kName = "bloomfilter:";
+        if (o.second.compare(0, kName.size(), kName) != 0) {
+          return Status::InvalidArgument("Invalid filter policy name");
+        }
+        size_t pos = o.second.find(':', kName.size());
+        if (pos == std::string::npos) {
+          return Status::InvalidArgument("Invalid filter policy config, "
+                                         "missing bits_per_key");
+        }
+        int bits_per_key = ParseInt(
+            trim(o.second.substr(kName.size(), pos - kName.size())));
+        bool use_block_based_builder =
+          ParseBoolean("use_block_based_builder",
+                       trim(o.second.substr(pos + 1)));
+        new_table_options->filter_policy.reset(
+            NewBloomFilterPolicy(bits_per_key, use_block_based_builder));
+      } else if (o.first == "whole_key_filtering") {
+        new_table_options->whole_key_filtering =
+          ParseBoolean(o.first, o.second);
+      } else {
+        return Status::InvalidArgument("Unrecognized option: " + o.first);
+      }
+    } catch (std::exception& e) {
+      return Status::InvalidArgument("error parsing " + o.first + ":" +
+                                     std::string(e.what()));
+    }
+  }
+  return Status::OK();
+}
+
+Status GetBlockBasedTableOptionsFromString(
+    const BlockBasedTableOptions& table_options,
+    const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+  return GetBlockBasedTableOptionsFromMap(table_options, opts_map,
+                                          new_table_options);
+}
+
+Status GetColumnFamilyOptionsFromMap(
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  for (const auto& o : opts_map) {
+    if (!ParseColumnFamilyOption(o.first, o.second, new_options)) {
+      return Status::InvalidArgument("Can't parse option " + o.first);
+    }
+  }
+  return Status::OK();
+}
+
+Status GetColumnFamilyOptionsFromString(
+    const ColumnFamilyOptions& base_options,
+    const std::string& opts_str,
+    ColumnFamilyOptions* new_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+  return GetColumnFamilyOptionsFromMap(base_options, opts_map, new_options);
+}
+
+Status GetDBOptionsFromMap(
+    const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  for (const auto& o : opts_map) {
+    if (!ParseDBOption(o.first, o.second, new_options)) {
+      return Status::InvalidArgument("Can't parse option " + o.first);
+    }
+  }
+  return Status::OK();
+}
+
+Status GetDBOptionsFromString(
+    const DBOptions& base_options,
+    const std::string& opts_str,
+    DBOptions* new_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+  return GetDBOptionsFromMap(base_options, opts_map, new_options);
+}
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+  DBOptions new_db_options(base_options);
+  ColumnFamilyOptions new_cf_options(base_options);
+  for (const auto& o : opts_map) {
+    if (ParseDBOption(o.first, o.second, &new_db_options)) {
+    } else if (ParseColumnFamilyOption(o.first, o.second, &new_cf_options)) {
+    } else {
+      return Status::InvalidArgument("Can't parse option " + o.first);
+    }
+  }
+  *new_options = Options(new_db_options, new_cf_options);
+  return Status::OK();
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/options_helper.h b/src/rocksdb/util/options_helper.h
new file mode 100644
index 0000000..02c7881
--- /dev/null
+++ b/src/rocksdb/util/options_helper.h
@@ -0,0 +1,20 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+#include <stdexcept>
+#include "util/mutable_cf_options.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+Status GetMutableOptionsFromStrings(
+    const MutableCFOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    MutableCFOptions* new_options);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/options_test.cc b/src/rocksdb/util/options_test.cc
new file mode 100644
index 0000000..6a3b2d4
--- /dev/null
+++ b/src/rocksdb/util/options_test.cc
@@ -0,0 +1,710 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <unordered_map>
+#include <inttypes.h>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/convenience.h"
+#include "rocksdb/utilities/leveldb_options.h"
+#include "table/block_based_table_factory.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+#ifndef GFLAGS
+bool FLAGS_enable_print = false;
+#else
+#include <gflags/gflags.h>
+using GFLAGS::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+#endif  // GFLAGS
+
+namespace rocksdb {
+
+class OptionsTest : public testing::Test {};
+
+class StderrLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {
+    vprintf(format, ap);
+    printf("\n");
+  }
+};
+
+Options PrintAndGetOptions(size_t total_write_buffer_limit,
+                           int read_amplification_threshold,
+                           int write_amplification_threshold,
+                           uint64_t target_db_size = 68719476736) {
+  StderrLogger logger;
+
+  if (FLAGS_enable_print) {
+    printf(
+        "---- total_write_buffer_limit: %zu "
+        "read_amplification_threshold: %d write_amplification_threshold: %d "
+        "target_db_size %" PRIu64 " ----\n",
+        total_write_buffer_limit, read_amplification_threshold,
+        write_amplification_threshold, target_db_size);
+  }
+
+  Options options =
+      GetOptions(total_write_buffer_limit, read_amplification_threshold,
+                 write_amplification_threshold, target_db_size);
+  if (FLAGS_enable_print) {
+    options.Dump(&logger);
+    printf("-------------------------------------\n\n\n");
+  }
+  return options;
+}
+
+TEST_F(OptionsTest, LooseCondition) {
+  Options options;
+  PrintAndGetOptions(static_cast<size_t>(10) * 1024 * 1024 * 1024, 100, 100);
+
+  // Less mem table memory budget
+  PrintAndGetOptions(32 * 1024 * 1024, 100, 100);
+
+  // Tight read amplification
+  options = PrintAndGetOptions(128 * 1024 * 1024, 8, 100);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleLevel);
+
+#ifndef ROCKSDB_LITE  // Universal compaction is not supported in ROCKSDB_LITE
+  // Tight write amplification
+  options = PrintAndGetOptions(128 * 1024 * 1024, 64, 10);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+#endif  // !ROCKSDB_LITE
+
+  // Both tight amplifications
+  PrintAndGetOptions(128 * 1024 * 1024, 4, 8);
+}
+
+#ifndef ROCKSDB_LITE  // GetOptionsFromMap is not supported in ROCKSDB_LITE
+TEST_F(OptionsTest, GetOptionsFromMapTest) {
+  std::unordered_map<std::string, std::string> cf_options_map = {
+      {"write_buffer_size", "1"},
+      {"max_write_buffer_number", "2"},
+      {"min_write_buffer_number_to_merge", "3"},
+      {"compression", "kSnappyCompression"},
+      {"compression_per_level",
+       "kNoCompression:"
+       "kSnappyCompression:"
+       "kZlibCompression:"
+       "kBZip2Compression:"
+       "kLZ4Compression:"
+       "kLZ4HCCompression"},
+      {"compression_opts", "4:5:6"},
+      {"num_levels", "7"},
+      {"level0_file_num_compaction_trigger", "8"},
+      {"level0_slowdown_writes_trigger", "9"},
+      {"level0_stop_writes_trigger", "10"},
+      {"max_mem_compaction_level", "11"},
+      {"target_file_size_base", "12"},
+      {"target_file_size_multiplier", "13"},
+      {"max_bytes_for_level_base", "14"},
+      {"level_compaction_dynamic_level_bytes", "true"},
+      {"max_bytes_for_level_multiplier", "15"},
+      {"max_bytes_for_level_multiplier_additional", "16:17:18"},
+      {"expanded_compaction_factor", "19"},
+      {"source_compaction_factor", "20"},
+      {"max_grandparent_overlap_factor", "21"},
+      {"soft_rate_limit", "1.1"},
+      {"hard_rate_limit", "2.1"},
+      {"arena_block_size", "22"},
+      {"disable_auto_compactions", "true"},
+      {"purge_redundant_kvs_while_flush", "1"},
+      {"compaction_style", "kCompactionStyleLevel"},
+      {"verify_checksums_in_compaction", "false"},
+      {"compaction_options_fifo", "23"},
+      {"filter_deletes", "0"},
+      {"max_sequential_skip_in_iterations", "24"},
+      {"inplace_update_support", "true"},
+      {"inplace_update_num_locks", "25"},
+      {"memtable_prefix_bloom_bits", "26"},
+      {"memtable_prefix_bloom_probes", "27"},
+      {"memtable_prefix_bloom_huge_page_tlb_size", "28"},
+      {"bloom_locality", "29"},
+      {"max_successive_merges", "30"},
+      {"min_partial_merge_operands", "31"},
+      {"prefix_extractor", "fixed:31"},
+      {"optimize_filters_for_hits", "true"},
+  };
+
+  std::unordered_map<std::string, std::string> db_options_map = {
+    {"create_if_missing", "false"},
+    {"create_missing_column_families", "true"},
+    {"error_if_exists", "false"},
+    {"paranoid_checks", "true"},
+    {"max_open_files", "32"},
+    {"max_total_wal_size", "33"},
+    {"disable_data_sync", "false"},
+    {"use_fsync", "true"},
+    {"db_log_dir", "/db_log_dir"},
+    {"wal_dir", "/wal_dir"},
+    {"delete_obsolete_files_period_micros", "34"},
+    {"max_background_compactions", "35"},
+    {"max_background_flushes", "36"},
+    {"max_log_file_size", "37"},
+    {"log_file_time_to_roll", "38"},
+    {"keep_log_file_num", "39"},
+    {"max_manifest_file_size", "40"},
+    {"table_cache_numshardbits", "41"},
+    {"WAL_ttl_seconds", "43"},
+    {"WAL_size_limit_MB", "44"},
+    {"manifest_preallocation_size", "45"},
+    {"allow_os_buffer", "false"},
+    {"allow_mmap_reads", "true"},
+    {"allow_mmap_writes", "false"},
+    {"is_fd_close_on_exec", "true"},
+    {"skip_log_error_on_recovery", "false"},
+    {"stats_dump_period_sec", "46"},
+    {"advise_random_on_open", "true"},
+    {"use_adaptive_mutex", "false"},
+    {"bytes_per_sync", "47"},
+    {"wal_bytes_per_sync", "48"},
+  };
+
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+            base_cf_opt, cf_options_map, &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
+  ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
+  ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level.size(), 6U);
+  ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression);
+  ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4);
+  ASSERT_EQ(new_cf_opt.compression_opts.level, 5);
+  ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6);
+  ASSERT_EQ(new_cf_opt.num_levels, 7);
+  ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8);
+  ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9);
+  ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10);
+  ASSERT_EQ(new_cf_opt.max_mem_compaction_level, 11);
+  ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast<uint64_t>(12));
+  ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U);
+  ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[1], 17);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[2], 18);
+  ASSERT_EQ(new_cf_opt.expanded_compaction_factor, 19);
+  ASSERT_EQ(new_cf_opt.source_compaction_factor, 20);
+  ASSERT_EQ(new_cf_opt.max_grandparent_overlap_factor, 21);
+  ASSERT_EQ(new_cf_opt.soft_rate_limit, 1.1);
+  ASSERT_EQ(new_cf_opt.hard_rate_limit, 2.1);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 22U);
+  ASSERT_EQ(new_cf_opt.disable_auto_compactions, true);
+  ASSERT_EQ(new_cf_opt.purge_redundant_kvs_while_flush, true);
+  ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel);
+  ASSERT_EQ(new_cf_opt.verify_checksums_in_compaction, false);
+  ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size,
+            static_cast<uint64_t>(23));
+  ASSERT_EQ(new_cf_opt.filter_deletes, false);
+  ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations,
+            static_cast<uint64_t>(24));
+  ASSERT_EQ(new_cf_opt.inplace_update_support, true);
+  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U);
+  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 26U);
+  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_probes, 27U);
+  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_huge_page_tlb_size, 28U);
+  ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
+  ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_EQ(new_cf_opt.min_partial_merge_operands, 31U);
+  ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
+  ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
+  ASSERT_EQ(std::string(new_cf_opt.prefix_extractor->Name()),
+            "rocksdb.FixedPrefix.31");
+
+  cf_options_map["write_buffer_size"] = "hello";
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+             base_cf_opt, cf_options_map, &new_cf_opt));
+  cf_options_map["write_buffer_size"] = "1";
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+            base_cf_opt, cf_options_map, &new_cf_opt));
+  cf_options_map["unknown_option"] = "1";
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+             base_cf_opt, cf_options_map, &new_cf_opt));
+
+  DBOptions base_db_opt;
+  DBOptions new_db_opt;
+  ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt));
+  ASSERT_EQ(new_db_opt.create_if_missing, false);
+  ASSERT_EQ(new_db_opt.create_missing_column_families, true);
+  ASSERT_EQ(new_db_opt.error_if_exists, false);
+  ASSERT_EQ(new_db_opt.paranoid_checks, true);
+  ASSERT_EQ(new_db_opt.max_open_files, 32);
+  ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast<uint64_t>(33));
+  ASSERT_EQ(new_db_opt.disableDataSync, false);
+  ASSERT_EQ(new_db_opt.use_fsync, true);
+  ASSERT_EQ(new_db_opt.db_log_dir, "/db_log_dir");
+  ASSERT_EQ(new_db_opt.wal_dir, "/wal_dir");
+  ASSERT_EQ(new_db_opt.delete_obsolete_files_period_micros,
+            static_cast<uint64_t>(34));
+  ASSERT_EQ(new_db_opt.max_background_compactions, 35);
+  ASSERT_EQ(new_db_opt.max_background_flushes, 36);
+  ASSERT_EQ(new_db_opt.max_log_file_size, 37U);
+  ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
+  ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
+  ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+  ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
+  ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
+  ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
+  ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U);
+  ASSERT_EQ(new_db_opt.allow_os_buffer, false);
+  ASSERT_EQ(new_db_opt.allow_mmap_reads, true);
+  ASSERT_EQ(new_db_opt.allow_mmap_writes, false);
+  ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true);
+  ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false);
+  ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U);
+  ASSERT_EQ(new_db_opt.advise_random_on_open, true);
+  ASSERT_EQ(new_db_opt.use_adaptive_mutex, false);
+  ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
+  ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // GetColumnFamilyOptionsFromString is not supported in
+                      // ROCKSDB_LITE
+TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) {
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  base_cf_opt.table_factory.reset();
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=5", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 5U);
+  ASSERT_TRUE(new_cf_opt.table_factory == nullptr);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=6;", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 6U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "  write_buffer_size =  7  ", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 7U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "  write_buffer_size =  8 ; ", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 8U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 9U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=11; max_write_buffer_number  =  12 ;",
+            &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 11U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12);
+  // Wrong name "max_write_buffer_number_"
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13;max_write_buffer_number_=14;",
+              &new_cf_opt));
+  // Wrong key/value pair
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt));
+  // Error Paring value
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt));
+  // Missing option name
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13; =100;", &new_cf_opt));
+  // Units (k)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "memtable_prefix_bloom_bits=14k;max_write_buffer_number=-15K",
+            &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14UL*1024UL);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15*1024);
+  // Units (m)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "max_write_buffer_number=16m;inplace_update_num_locks=17M",
+            &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024);
+  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024UL*1024UL);
+  // Units (g)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      base_cf_opt,
+      "write_buffer_size=18g;prefix_extractor=capped:8;"
+      "arena_block_size=19G",
+      &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024UL*1024UL*1024UL);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024UL*1024UL*1024UL);
+  ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr);
+  std::string prefix_name(new_cf_opt.prefix_extractor->Name());
+  ASSERT_EQ(prefix_name, "rocksdb.CappedPrefix.8");
+
+  // Units (t)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 20*1024UL*1024UL*1024UL*1024UL);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024UL*1024UL*1024UL*1024UL);
+
+  // Nested block based table options
+  // Emtpy
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "block_based_table_factory={};arena_block_size=1024",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Non-empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "block_based_table_factory={block_cache=1M;block_size=4;};"
+            "arena_block_size=1024",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Last one
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "block_based_table_factory={block_cache=1M;block_size=4;}",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Mismatch curly braces
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={{{block_size=4;};"
+             "arena_block_size=1024",
+             &new_cf_opt));
+  // Unexpected chars after closing curly brace
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={block_size=4;}};"
+             "arena_block_size=1024",
+             &new_cf_opt));
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={block_size=4;}xdfa;"
+             "arena_block_size=1024",
+             &new_cf_opt));
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={block_size=4;}xdfa",
+             &new_cf_opt));
+  // Invalid block based table option
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={xx_block_size=4;}",
+             &new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+           "optimize_filters_for_hits=true",
+           &new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "optimize_filters_for_hits=false",
+            &new_cf_opt));
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+              "optimize_filters_for_hits=junk",
+              &new_cf_opt));
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // GetBlockBasedTableOptionsFromString is not supported
+TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
+  BlockBasedTableOptions table_opt;
+  BlockBasedTableOptions new_opt;
+  // make sure default values are overwritten by something else
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt,
+            "cache_index_and_filter_blocks=1;index_type=kHashSearch;"
+            "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;"
+            "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
+            "block_size_deviation=8;block_restart_interval=4;"
+            "filter_policy=bloomfilter:4:true;whole_key_filtering=1",
+            &new_opt));
+  ASSERT_TRUE(new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch);
+  ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash);
+  ASSERT_TRUE(new_opt.hash_index_allow_collision);
+  ASSERT_TRUE(new_opt.no_block_cache);
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL);
+  ASSERT_EQ(new_opt.block_size, 1024UL);
+  ASSERT_EQ(new_opt.block_size_deviation, 8);
+  ASSERT_EQ(new_opt.block_restart_interval, 4);
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+
+  // unknown option
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;index_type=kBinarySearch;"
+             "bad_option=1",
+             &new_opt));
+
+  // unrecognized index type
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX",
+             &new_opt));
+
+  // unrecognized checksum type
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;checksum=kxxHashXX",
+             &new_opt));
+
+  // unrecognized filter policy name
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;"
+             "filter_policy=bloomfilterxx:4:true",
+             &new_opt));
+  // unrecognized filter policy config
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;"
+             "filter_policy=bloomfilter:4",
+             &new_opt));
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // GetOptionsFromString is not supported in RocksDB Lite
+TEST_F(OptionsTest, GetOptionsFromStringTest) {
+  Options base_options, new_options;
+  base_options.write_buffer_size = 20;
+  base_options.min_write_buffer_number_to_merge = 15;
+  BlockBasedTableOptions block_based_table_options;
+  block_based_table_options.cache_index_and_filter_blocks = true;
+  base_options.table_factory.reset(
+      NewBlockBasedTableFactory(block_based_table_options));
+  ASSERT_OK(GetOptionsFromString(
+      base_options,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_cache=1M;block_size=4;};"
+      "create_if_missing=true;max_open_files=1;rate_limiter_bytes_per_sec=1024",
+      &new_options));
+
+  ASSERT_EQ(new_options.write_buffer_size, 10U);
+  ASSERT_EQ(new_options.max_write_buffer_number, 16);
+  BlockBasedTableOptions new_block_based_table_options =
+      dynamic_cast<BlockBasedTableFactory*>(new_options.table_factory.get())
+          ->GetTableOptions();
+  ASSERT_EQ(new_block_based_table_options.block_cache->GetCapacity(), 1U << 20);
+  ASSERT_EQ(new_block_based_table_options.block_size, 4U);
+  // don't overwrite block based table options
+  ASSERT_TRUE(new_block_based_table_options.cache_index_and_filter_blocks);
+
+  ASSERT_EQ(new_options.create_if_missing, true);
+  ASSERT_EQ(new_options.max_open_files, 1);
+  ASSERT_TRUE(new_options.rate_limiter.get() != nullptr);
+}
+#endif  // !ROCKSDB_LITE
+
+
+Status StringToMap(
+    const std::string& opts_str,
+    std::unordered_map<std::string, std::string>* opts_map);
+
+#ifndef ROCKSDB_LITE  // StringToMap is not supported in ROCKSDB_LITE
+TEST_F(OptionsTest, StringToMapTest) {
+  std::unordered_map<std::string, std::string> opts_map;
+  // Regular options
+  ASSERT_OK(StringToMap("k1=v1;k2=v2;k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "v2");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  // Value with '='
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1==v1;k2=v2=;", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "=v1");
+  ASSERT_EQ(opts_map["k2"], "v2=");
+  // Overwritten option
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k1=v2;k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v2");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  // Empty value
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4=", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  ASSERT_TRUE(opts_map.find("k4") != opts_map.end());
+  ASSERT_EQ(opts_map["k4"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4=   ", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  ASSERT_TRUE(opts_map.find("k4") != opts_map.end());
+  ASSERT_EQ(opts_map["k4"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_TRUE(opts_map.find("k3") != opts_map.end());
+  ASSERT_EQ(opts_map["k3"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=;", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_TRUE(opts_map.find("k3") != opts_map.end());
+  ASSERT_EQ(opts_map["k3"], "");
+  // Regular nested options
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2=nv2};k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2=nv2");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  // Multi-level nested options
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2={nnk1=nnk2}};"
+                        "k3={nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}};k4=v4",
+                        &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2={nnk1=nnk2}");
+  ASSERT_EQ(opts_map["k3"], "nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}");
+  ASSERT_EQ(opts_map["k4"], "v4");
+  // Garbage inside curly braces
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={dfad=};k3={=};k4=v4",
+                        &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "dfad=");
+  ASSERT_EQ(opts_map["k3"], "=");
+  ASSERT_EQ(opts_map["k4"], "v4");
+  // Empty nested options
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={};", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={{{{}}}{}{}};", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "{{{}}}{}{}");
+  // With random spaces
+  opts_map.clear();
+  ASSERT_OK(StringToMap("  k1 =  v1 ; k2= {nk1=nv1; nk2={nnk1=nnk2}}  ; "
+                        "k3={  {   } }; k4= v4  ",
+                        &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "nk1=nv1; nk2={nnk1=nnk2}");
+  ASSERT_EQ(opts_map["k3"], "{   }");
+  ASSERT_EQ(opts_map["k4"], "v4");
+
+  // Empty key
+  ASSERT_NOK(StringToMap("k1=v1;k2=v2;=", &opts_map));
+  ASSERT_NOK(StringToMap("=v1;k2=v2", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2v2;", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2=v2;fadfa", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2=v2;;", &opts_map));
+  // Mismatch curly braces
+  ASSERT_NOK(StringToMap("k1=v1;k2={;k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{};k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={}};k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}{}}};k3=v3", &opts_map));
+  // However this is valid!
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=};k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "}");
+  ASSERT_EQ(opts_map["k3"], "v3");
+
+  // Invalid chars after closing curly brace
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}{};k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}cfda;k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}  cfda;k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}  cfda", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}{}", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{dfdl}adfa}{}", &opts_map));
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // StringToMap is not supported in ROCKSDB_LITE
+TEST_F(OptionsTest, StringToMapRandomTest) {
+  std::unordered_map<std::string, std::string> opts_map;
+  // Make sure segfault is not hit by semi-random strings
+
+  std::vector<std::string> bases = {
+      "a={aa={};tt={xxx={}}};c=defff",
+      "a={aa={};tt={xxx={}}};c=defff;d={{}yxx{}3{xx}}",
+      "abc={{}{}{}{{{}}}{{}{}{}{}{}{}{}"};
+
+  for (std::string base : bases) {
+    for (int rand_seed = 301; rand_seed < 401; rand_seed++) {
+      Random rnd(rand_seed);
+      for (int attempt = 0; attempt < 10; attempt++) {
+        std::string str = base;
+        // Replace random position to space
+        size_t pos = static_cast<size_t>(
+            rnd.Uniform(static_cast<int>(base.size())));
+        str[pos] = ' ';
+        Status s = StringToMap(str, &opts_map);
+        ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
+        opts_map.clear();
+      }
+    }
+  }
+
+  // Randomly construct a string
+  std::vector<char> chars = {'{', '}', ' ', '=', ';', 'c'};
+  for (int rand_seed = 301; rand_seed < 1301; rand_seed++) {
+    Random rnd(rand_seed);
+    int len = rnd.Uniform(30);
+    std::string str = "";
+    for (int attempt = 0; attempt < len; attempt++) {
+      // Add a random character
+      size_t pos = static_cast<size_t>(
+          rnd.Uniform(static_cast<int>(chars.size())));
+      str.append(1, chars[pos]);
+    }
+    Status s = StringToMap(str, &opts_map);
+    ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
+    s = StringToMap("name=" + str, &opts_map);
+    ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
+    opts_map.clear();
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(OptionsTest, ConvertOptionsTest) {
+  LevelDBOptions leveldb_opt;
+  Options converted_opt = ConvertOptions(leveldb_opt);
+
+  ASSERT_EQ(converted_opt.create_if_missing, leveldb_opt.create_if_missing);
+  ASSERT_EQ(converted_opt.error_if_exists, leveldb_opt.error_if_exists);
+  ASSERT_EQ(converted_opt.paranoid_checks, leveldb_opt.paranoid_checks);
+  ASSERT_EQ(converted_opt.env, leveldb_opt.env);
+  ASSERT_EQ(converted_opt.info_log.get(), leveldb_opt.info_log);
+  ASSERT_EQ(converted_opt.write_buffer_size, leveldb_opt.write_buffer_size);
+  ASSERT_EQ(converted_opt.max_open_files, leveldb_opt.max_open_files);
+  ASSERT_EQ(converted_opt.compression, leveldb_opt.compression);
+
+  std::shared_ptr<BlockBasedTableFactory> table_factory =
+      std::dynamic_pointer_cast<BlockBasedTableFactory>(
+          converted_opt.table_factory);
+
+  ASSERT_TRUE(table_factory.get() != nullptr);
+
+  const BlockBasedTableOptions table_opt = table_factory->GetTableOptions();
+
+  ASSERT_EQ(table_opt.block_cache->GetCapacity(), 8UL << 20);
+  ASSERT_EQ(table_opt.block_size, leveldb_opt.block_size);
+  ASSERT_EQ(table_opt.block_restart_interval,
+            leveldb_opt.block_restart_interval);
+  ASSERT_EQ(table_opt.filter_policy.get(), leveldb_opt.filter_policy);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/perf_context.cc b/src/rocksdb/util/perf_context.cc
index 264b10d..7be9980 100644
--- a/src/rocksdb/util/perf_context.cc
+++ b/src/rocksdb/util/perf_context.cc
@@ -22,6 +22,10 @@ void SetPerfLevel(PerfLevel level) {
   perf_level = level;
 }
 
+PerfLevel GetPerfLevel() {
+  return perf_level;
+}
+
 void PerfContext::Reset() {
 #if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
   user_key_comparison_count = 0;
@@ -40,6 +44,8 @@ void PerfContext::Reset() {
   get_from_memtable_count = 0;
   get_post_process_time = 0;
   get_from_output_files_time = 0;
+  seek_on_memtable_time = 0;
+  seek_on_memtable_count = 0;
   seek_child_seek_time = 0;
   seek_child_seek_count = 0;
   seek_min_heap_time = 0;
@@ -47,6 +53,9 @@ void PerfContext::Reset() {
   find_next_user_entry_time = 0;
   write_pre_and_post_process_time = 0;
   write_memtable_time = 0;
+  db_mutex_lock_nanos = 0;
+  db_condition_wait_nanos = 0;
+  merge_operator_time_nanos = 0;
 #endif
 }
 
@@ -57,28 +66,20 @@ std::string PerfContext::ToString() const {
   return "";
 #else
   std::ostringstream ss;
-  ss << OUTPUT(user_key_comparison_count)
-     << OUTPUT(block_cache_hit_count)
-     << OUTPUT(block_read_count)
-     << OUTPUT(block_read_byte)
-     << OUTPUT(block_read_time)
-     << OUTPUT(block_checksum_time)
-     << OUTPUT(block_decompress_time)
-     << OUTPUT(internal_key_skipped_count)
-     << OUTPUT(internal_delete_skipped_count)
-     << OUTPUT(write_wal_time)
-     << OUTPUT(get_snapshot_time)
-     << OUTPUT(get_from_memtable_time)
-     << OUTPUT(get_from_memtable_count)
-     << OUTPUT(get_post_process_time)
-     << OUTPUT(get_from_output_files_time)
-     << OUTPUT(seek_child_seek_time)
-     << OUTPUT(seek_child_seek_count)
-     << OUTPUT(seek_min_heap_time)
-     << OUTPUT(seek_internal_seek_time)
-     << OUTPUT(find_next_user_entry_time)
-     << OUTPUT(write_pre_and_post_process_time)
-     << OUTPUT(write_memtable_time);
+  ss << OUTPUT(user_key_comparison_count) << OUTPUT(block_cache_hit_count)
+     << OUTPUT(block_read_count) << OUTPUT(block_read_byte)
+     << OUTPUT(block_read_time) << OUTPUT(block_checksum_time)
+     << OUTPUT(block_decompress_time) << OUTPUT(internal_key_skipped_count)
+     << OUTPUT(internal_delete_skipped_count) << OUTPUT(write_wal_time)
+     << OUTPUT(get_snapshot_time) << OUTPUT(get_from_memtable_time)
+     << OUTPUT(get_from_memtable_count) << OUTPUT(get_post_process_time)
+     << OUTPUT(get_from_output_files_time) << OUTPUT(seek_on_memtable_time)
+     << OUTPUT(seek_on_memtable_count) << OUTPUT(seek_child_seek_time)
+     << OUTPUT(seek_child_seek_count) << OUTPUT(seek_min_heap_time)
+     << OUTPUT(seek_internal_seek_time) << OUTPUT(find_next_user_entry_time)
+     << OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_memtable_time)
+     << OUTPUT(db_mutex_lock_nanos) << OUTPUT(db_condition_wait_nanos)
+     << OUTPUT(merge_operator_time_nanos);
   return ss.str();
 #endif
 }
diff --git a/src/rocksdb/util/perf_context_imp.h b/src/rocksdb/util/perf_context_imp.h
index dc4ae95..e397901 100644
--- a/src/rocksdb/util/perf_context_imp.h
+++ b/src/rocksdb/util/perf_context_imp.h
@@ -11,11 +11,10 @@ namespace rocksdb {
 
 #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
 
-#define PERF_TIMER_DECLARE()
-#define PERF_TIMER_START(metric)
-#define PERF_TIMER_AUTO(metric)
+#define PERF_TIMER_GUARD(metric)
 #define PERF_TIMER_MEASURE(metric)
 #define PERF_TIMER_STOP(metric)
+#define PERF_TIMER_START(metric)
 #define PERF_COUNTER_ADD(metric, value)
 
 #else
@@ -24,10 +23,15 @@ extern __thread PerfLevel perf_level;
 
 class PerfStepTimer {
  public:
-  PerfStepTimer()
+  PerfStepTimer(uint64_t* metric)
     : enabled_(perf_level >= PerfLevel::kEnableTime),
       env_(enabled_ ? Env::Default() : nullptr),
-      start_(0) {
+      start_(0),
+      metric_(metric) {
+  }
+
+  ~PerfStepTimer() {
+    Stop();
   }
 
   void Start() {
@@ -36,17 +40,17 @@ class PerfStepTimer {
     }
   }
 
-  void Measure(uint64_t* metric) {
+  void Measure() {
     if (start_) {
       uint64_t now = env_->NowNanos();
-      *metric += now - start_;
+      *metric_ += now - start_;
       start_ = now;
     }
   }
 
-  void Stop(uint64_t* metric) {
+  void Stop() {
     if (start_) {
-      *metric += env_->NowNanos() - start_;
+      *metric_ += env_->NowNanos() - start_;
       start_ = 0;
     }
   }
@@ -55,29 +59,25 @@ class PerfStepTimer {
   const bool enabled_;
   Env* const env_;
   uint64_t start_;
+  uint64_t* metric_;
 };
 
-// Declare the local timer object to be used later on
-#define PERF_TIMER_DECLARE()           \
-  PerfStepTimer perf_step_timer;
+// Stop the timer and update the metric
+#define PERF_TIMER_STOP(metric)          \
+  perf_step_timer_ ## metric.Stop();
 
-// Set start time of the timer
 #define PERF_TIMER_START(metric)          \
-  perf_step_timer.Start();
+  perf_step_timer_ ## metric.Start();
 
 // Declare and set start time of the timer
-#define PERF_TIMER_AUTO(metric)           \
-  PerfStepTimer perf_step_timer;          \
-  perf_step_timer.Start();
+#define PERF_TIMER_GUARD(metric)           \
+  PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric));          \
+  perf_step_timer_ ## metric.Start();
 
 // Update metric with time elapsed since last START. start time is reset
 // to current timestamp.
 #define PERF_TIMER_MEASURE(metric)        \
-  perf_step_timer.Measure(&(perf_context.metric));
-
-// Update metric with time elapsed since last START. But start time is not set.
-#define PERF_TIMER_STOP(metric)        \
-  perf_step_timer.Stop(&(perf_context.metric));
+  perf_step_timer_ ## metric.Measure();
 
 // Increase metric value
 #define PERF_COUNTER_ADD(metric, value)     \
diff --git a/src/rocksdb/util/posix_logger.h b/src/rocksdb/util/posix_logger.h
index 6aba769..213a652 100644
--- a/src/rocksdb/util/posix_logger.h
+++ b/src/rocksdb/util/posix_logger.h
@@ -51,14 +51,16 @@ class PosixLogger : public Logger {
   virtual ~PosixLogger() {
     fclose(file_);
   }
-  virtual void Flush() {
+  virtual void Flush() override {
     if (flush_pending_) {
       flush_pending_ = false;
       fflush(file_);
     }
     last_flush_micros_ = env_->NowMicros();
   }
-  virtual void Logv(const char* format, va_list ap) {
+
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {
     const uint64_t thread_id = (*gettid_)();
 
     // We try twice: the first time with a fixed-size stack allocated buffer,
@@ -123,14 +125,15 @@ class PosixLogger : public Logger {
       // space, pre-allocate more space to avoid overly large
       // allocations from filesystem allocsize options.
       const size_t log_size = log_size_;
-      const int last_allocation_chunk =
+      const size_t last_allocation_chunk =
         ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize);
-      const int desired_allocation_chunk =
+      const size_t desired_allocation_chunk =
         ((kDebugLogChunkSize - 1 + log_size + write_size) /
            kDebugLogChunkSize);
       if (last_allocation_chunk != desired_allocation_chunk) {
-        fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0,
-                  desired_allocation_chunk * kDebugLogChunkSize);
+        fallocate(
+            fd_, FALLOC_FL_KEEP_SIZE, 0,
+            static_cast<off_t>(desired_allocation_chunk * kDebugLogChunkSize));
       }
 #endif
 
@@ -143,9 +146,7 @@ class PosixLogger : public Logger {
       uint64_t now_micros = static_cast<uint64_t>(now_tv.tv_sec) * 1000000 +
         now_tv.tv_usec;
       if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
-        flush_pending_ = false;
-        fflush(file_);
-        last_flush_micros_ = now_micros;
+        Flush();
       }
       if (base != buffer) {
         delete[] base;
@@ -153,9 +154,7 @@ class PosixLogger : public Logger {
       break;
     }
   }
-  size_t GetLogFileSize() const {
-    return log_size_;
-  }
+  size_t GetLogFileSize() const override { return log_size_; }
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/rate_limiter.cc b/src/rocksdb/util/rate_limiter.cc
new file mode 100644
index 0000000..3eff506
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter.cc
@@ -0,0 +1,216 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/rate_limiter.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+
+// Pending request
+struct GenericRateLimiter::Req {
+  explicit Req(int64_t _bytes, port::Mutex* _mu)
+      : bytes(_bytes), cv(_mu), granted(false) {}
+  int64_t bytes;
+  port::CondVar cv;
+  bool granted;
+};
+
+GenericRateLimiter::GenericRateLimiter(int64_t rate_bytes_per_sec,
+                                       int64_t refill_period_us,
+                                       int32_t fairness)
+    : refill_period_us_(refill_period_us),
+      refill_bytes_per_period_(
+          CalculateRefillBytesPerPeriod(rate_bytes_per_sec)),
+      env_(Env::Default()),
+      stop_(false),
+      exit_cv_(&request_mutex_),
+      requests_to_wait_(0),
+      total_requests_{0, 0},
+      total_bytes_through_{0, 0},
+      available_bytes_(0),
+      next_refill_us_(env_->NowMicros()),
+      fairness_(fairness > 100 ? 100 : fairness),
+      rnd_((uint32_t)time(nullptr)),
+      leader_(nullptr) {
+  total_bytes_through_[0] = 0;
+  total_bytes_through_[1] = 0;
+}
+
+GenericRateLimiter::~GenericRateLimiter() {
+  MutexLock g(&request_mutex_);
+  stop_ = true;
+  requests_to_wait_ = static_cast<int32_t>(queue_[Env::IO_LOW].size() +
+                                           queue_[Env::IO_HIGH].size());
+  for (auto& r : queue_[Env::IO_HIGH]) {
+    r->cv.Signal();
+  }
+  for (auto& r : queue_[Env::IO_LOW]) {
+    r->cv.Signal();
+  }
+  while (requests_to_wait_ > 0) {
+    exit_cv_.Wait();
+  }
+}
+
+// This API allows user to dynamically change rate limiter's bytes per second.
+void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) {
+  assert(bytes_per_second > 0);
+  refill_bytes_per_period_.store(
+      CalculateRefillBytesPerPeriod(bytes_per_second),
+      std::memory_order_relaxed);
+}
+
+void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri) {
+  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+
+  MutexLock g(&request_mutex_);
+  if (stop_) {
+    return;
+  }
+
+  ++total_requests_[pri];
+
+  if (available_bytes_ >= bytes) {
+    // Refill thread assigns quota and notifies requests waiting on
+    // the queue under mutex. So if we get here, that means nobody
+    // is waiting?
+    available_bytes_ -= bytes;
+    total_bytes_through_[pri] += bytes;
+    return;
+  }
+
+  // Request cannot be satisfied at this moment, enqueue
+  Req r(bytes, &request_mutex_);
+  queue_[pri].push_back(&r);
+
+  do {
+    bool timedout = false;
+    // Leader election, candidates can be:
+    // (1) a new incoming request,
+    // (2) a previous leader, whose quota has not been assigned yet due
+    //     to lower priority
+    // (3) a previous waiter at the front of queue, who got notified by
+    //     previous leader
+    if (leader_ == nullptr &&
+        ((!queue_[Env::IO_HIGH].empty() &&
+            &r == queue_[Env::IO_HIGH].front()) ||
+         (!queue_[Env::IO_LOW].empty() &&
+            &r == queue_[Env::IO_LOW].front()))) {
+      leader_ = &r;
+      timedout = r.cv.TimedWait(next_refill_us_);
+    } else {
+      // Not at the front of the queue, or a leader has already been elected
+      r.cv.Wait();
+    }
+
+    // request_mutex_ is held from now on
+    if (stop_) {
+      --requests_to_wait_;
+      exit_cv_.Signal();
+      return;
+    }
+
+    // Make sure the woken-up request is always at the head of its queue
+    assert(r.granted ||
+           (!queue_[Env::IO_HIGH].empty() &&
+            &r == queue_[Env::IO_HIGH].front()) ||
+           (!queue_[Env::IO_LOW].empty() &&
+            &r == queue_[Env::IO_LOW].front()));
+    assert(leader_ == nullptr ||
+           (!queue_[Env::IO_HIGH].empty() &&
+            leader_ == queue_[Env::IO_HIGH].front()) ||
+           (!queue_[Env::IO_LOW].empty() &&
+            leader_ == queue_[Env::IO_LOW].front()));
+
+    if (leader_ == &r) {
+      // Woken up from TimedWait()
+      if (timedout) {
+        // Time to do refill!
+        Refill();
+
+        // Re-elect a new leader regardless. This is to simplify the
+        // election handling.
+        leader_ = nullptr;
+
+        // Notify the head of the queue if the current leader is going away
+        if (r.granted) {
+          // Current leader has already been granted its quota. Notify the
+          // head of the waiting queue to participate in the next election.
+          assert((queue_[Env::IO_HIGH].empty() ||
+                    &r != queue_[Env::IO_HIGH].front()) &&
+                 (queue_[Env::IO_LOW].empty() ||
+                    &r != queue_[Env::IO_LOW].front()));
+          if (!queue_[Env::IO_HIGH].empty()) {
+            queue_[Env::IO_HIGH].front()->cv.Signal();
+          } else if (!queue_[Env::IO_LOW].empty()) {
+            queue_[Env::IO_LOW].front()->cv.Signal();
+          }
+          // Done
+          break;
+        }
+      } else {
+        // Spontaneous wake up, need to continue to wait
+        assert(!r.granted);
+        leader_ = nullptr;
+      }
+    } else {
+      // Woken up by the previous leader:
+      // (1) if requested quota is granted, it is done.
+      // (2) if requested quota is not granted, this means current thread
+      // was picked as a new leader candidate (previous leader got quota).
+      // It needs to participate leader election because a new request may
+      // come in before this thread gets woken up. So it may actually need
+      // to do Wait() again.
+      assert(!timedout);
+    }
+  } while (!r.granted);
+}
+
+void GenericRateLimiter::Refill() {
+  next_refill_us_ = env_->NowMicros() + refill_period_us_;
+  // Carry over the left over quota from the last period
+  auto refill_bytes_per_period =
+      refill_bytes_per_period_.load(std::memory_order_relaxed);
+  if (available_bytes_ < refill_bytes_per_period) {
+    available_bytes_ += refill_bytes_per_period;
+  }
+
+  int use_low_pri_first = rnd_.OneIn(fairness_) ? 0 : 1;
+  for (int q = 0; q < 2; ++q) {
+    auto use_pri = (use_low_pri_first == q) ? Env::IO_LOW : Env::IO_HIGH;
+    auto* queue = &queue_[use_pri];
+    while (!queue->empty()) {
+      auto* next_req = queue->front();
+      if (available_bytes_ < next_req->bytes) {
+        break;
+      }
+      available_bytes_ -= next_req->bytes;
+      total_bytes_through_[use_pri] += next_req->bytes;
+      queue->pop_front();
+
+      next_req->granted = true;
+      if (next_req != leader_) {
+        // Quota granted, signal the thread
+        next_req->cv.Signal();
+      }
+    }
+  }
+}
+
+RateLimiter* NewGenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness) {
+  assert(rate_bytes_per_sec > 0);
+  assert(refill_period_us > 0);
+  assert(fairness > 0);
+  return new GenericRateLimiter(
+      rate_bytes_per_sec, refill_period_us, fairness);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/rate_limiter.h b/src/rocksdb/util/rate_limiter.h
new file mode 100644
index 0000000..3840c4e
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter.h
@@ -0,0 +1,91 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include "port/port_posix.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rate_limiter.h"
+
+namespace rocksdb {
+
+class GenericRateLimiter : public RateLimiter {
+ public:
+  GenericRateLimiter(int64_t refill_bytes,
+      int64_t refill_period_us, int32_t fairness);
+
+  virtual ~GenericRateLimiter();
+
+  // This API allows user to dynamically change rate limiter's bytes per second.
+  virtual void SetBytesPerSecond(int64_t bytes_per_second) override;
+
+  // Request for token to write bytes. If this request can not be satisfied,
+  // the call is blocked. Caller is responsible to make sure
+  // bytes < GetSingleBurstBytes()
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri) override;
+
+  virtual int64_t GetSingleBurstBytes() const override {
+    return refill_bytes_per_period_.load(std::memory_order_relaxed);
+  }
+
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_bytes_through_[Env::IO_LOW] +
+             total_bytes_through_[Env::IO_HIGH];
+    }
+    return total_bytes_through_[pri];
+  }
+
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_requests_[Env::IO_LOW] + total_requests_[Env::IO_HIGH];
+    }
+    return total_requests_[pri];
+  }
+
+ private:
+  void Refill();
+  int64_t CalculateRefillBytesPerPeriod(int64_t rate_bytes_per_sec) {
+    return rate_bytes_per_sec * refill_period_us_ / 1000000.0;
+  }
+
+  // This mutex guard all internal states
+  mutable port::Mutex request_mutex_;
+
+  const int64_t refill_period_us_;
+  // This variable can be changed dynamically.
+  std::atomic<int64_t> refill_bytes_per_period_;
+  Env* const env_;
+
+  bool stop_;
+  port::CondVar exit_cv_;
+  int32_t requests_to_wait_;
+
+  int64_t total_requests_[Env::IO_TOTAL];
+  int64_t total_bytes_through_[Env::IO_TOTAL];
+  int64_t available_bytes_;
+  int64_t next_refill_us_;
+
+  int32_t fairness_;
+  Random rnd_;
+
+  struct Req;
+  Req* leader_;
+  std::deque<Req*> queue_[Env::IO_TOTAL];
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/rate_limiter_test.cc b/src/rocksdb/util/rate_limiter_test.cc
new file mode 100644
index 0000000..d635010
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter_test.cc
@@ -0,0 +1,95 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <limits>
+#include "util/testharness.h"
+#include "util/rate_limiter.h"
+#include "util/random.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+class RateLimiterTest : public testing::Test {};
+
+TEST_F(RateLimiterTest, StartStop) {
+  std::unique_ptr<RateLimiter> limiter(new GenericRateLimiter(100, 100, 10));
+}
+
+TEST_F(RateLimiterTest, Rate) {
+  auto* env = Env::Default();
+  struct Arg {
+    Arg(int32_t _target_rate, int _burst)
+        : limiter(new GenericRateLimiter(_target_rate, 100 * 1000, 10)),
+          request_size(_target_rate / 10),
+          burst(_burst) {}
+    std::unique_ptr<RateLimiter> limiter;
+    int32_t request_size;
+    int burst;
+  };
+
+  auto writer = [](void* p) {
+    auto* thread_env = Env::Default();
+    auto* arg = static_cast<Arg*>(p);
+    // Test for 2 seconds
+    auto until = thread_env->NowMicros() + 2 * 1000000;
+    Random r((uint32_t)(thread_env->NowNanos() %
+                        std::numeric_limits<uint32_t>::max()));
+    while (thread_env->NowMicros() < until) {
+      for (int i = 0; i < static_cast<int>(r.Skewed(arg->burst) + 1); ++i) {
+        arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1,
+                              Env::IO_HIGH);
+      }
+      arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW);
+    }
+  };
+
+  for (int i = 1; i <= 16; i *= 2) {
+    int32_t target = i * 1024 * 10;
+    Arg arg(target, i / 4 + 1);
+    int64_t old_total_bytes_through = 0;
+    for (int iter = 1; iter <= 2; ++iter) {
+      // second iteration changes the target dynamically
+      if (iter == 2) {
+        target *= 2;
+        arg.limiter->SetBytesPerSecond(target);
+      }
+      auto start = env->NowMicros();
+      for (int t = 0; t < i; ++t) {
+        env->StartThread(writer, &arg);
+      }
+      env->WaitForJoin();
+
+      auto elapsed = env->NowMicros() - start;
+      double rate =
+          (arg.limiter->GetTotalBytesThrough() - old_total_bytes_through) *
+          1000000.0 / elapsed;
+      old_total_bytes_through = arg.limiter->GetTotalBytesThrough();
+      fprintf(stderr,
+              "request size [1 - %" PRIi32 "], limit %" PRIi32
+              " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n",
+              arg.request_size - 1, target / 1024, rate / 1024,
+              elapsed / 1000000.0);
+
+      ASSERT_GE(rate / target, 0.9);
+      ASSERT_LE(rate / target, 1.1);
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/scoped_arena_iterator.h b/src/rocksdb/util/scoped_arena_iterator.h
new file mode 100644
index 0000000..2021d2d
--- /dev/null
+++ b/src/rocksdb/util/scoped_arena_iterator.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+class ScopedArenaIterator {
+ public:
+  explicit ScopedArenaIterator(Iterator* iter = nullptr) : iter_(iter) {}
+
+  Iterator* operator->() { return iter_; }
+
+  void set(Iterator* iter) { iter_ = iter; }
+
+  Iterator* get() { return iter_; }
+
+  ~ScopedArenaIterator() { iter_->~Iterator(); }
+
+ private:
+  Iterator* iter_;
+};
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/signal_test.cc b/src/rocksdb/util/signal_test.cc
deleted file mode 100644
index f51fa54..0000000
--- a/src/rocksdb/util/signal_test.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-#include "port/stack_trace.h"
-#include <assert.h>
-
-namespace {
-void f0() {
-  char *p = nullptr;
-  *p = 10;  /* SIGSEGV here!! */
-}
-
-void f1() {
-  f0();
-}
-
-void f2() {
-  f1();
-}
-
-void f3() {
-  f2();
-}
-}  // namespace
-
-int main() {
-  rocksdb::port::InstallStackTraceHandler();
-
-  f3();
-
-  return 0;
-}
diff --git a/src/rocksdb/util/skiplistrep.cc b/src/rocksdb/util/skiplistrep.cc
index 93f7134..ee57372 100644
--- a/src/rocksdb/util/skiplistrep.cc
+++ b/src/rocksdb/util/skiplistrep.cc
@@ -6,14 +6,23 @@
 #include "rocksdb/memtablerep.h"
 #include "db/memtable.h"
 #include "db/skiplist.h"
+#include "util/arena.h"
 
 namespace rocksdb {
 namespace {
 class SkipListRep : public MemTableRep {
   SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_;
+  const MemTableRep::KeyComparator& cmp_;
+  const SliceTransform* transform_;
+  const size_t lookahead_;
+
+  friend class LookaheadIterator;
 public:
-  explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena)
-    : MemTableRep(arena), skip_list_(compare, arena) {
+  explicit SkipListRep(const MemTableRep::KeyComparator& compare,
+                       MemTableAllocator* allocator,
+                       const SliceTransform* transform, const size_t lookahead)
+    : MemTableRep(allocator), skip_list_(compare, allocator), cmp_(compare),
+      transform_(transform), lookahead_(lookahead) {
   }
 
   // Insert key into the list.
@@ -28,7 +37,7 @@ public:
   }
 
   virtual size_t ApproximateMemoryUsage() override {
-    // All memory is allocated through arena; nothing to report here
+    // All memory is allocated through allocator; nothing to report here
     return 0;
   }
 
@@ -105,19 +114,120 @@ public:
     std::string tmp_;       // For passing to EncodeKey
   };
 
-  // Unhide default implementations of GetIterator
-  using MemTableRep::GetIterator;
+  // Iterator over the contents of a skip list which also keeps track of the
+  // previously visited node. In Seek(), it examines a few nodes after it
+  // first, falling back to O(log n) search from the head of the list only if
+  // the target key hasn't been found.
+  class LookaheadIterator : public MemTableRep::Iterator {
+   public:
+    explicit LookaheadIterator(const SkipListRep& rep) :
+        rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}
+
+    virtual ~LookaheadIterator() override {}
+
+    virtual bool Valid() const override {
+      return iter_.Valid();
+    }
+
+    virtual const char *key() const override {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    virtual void Next() override {
+      assert(Valid());
+
+      bool advance_prev = true;
+      if (prev_.Valid()) {
+        auto k1 = rep_.UserKey(prev_.key());
+        auto k2 = rep_.UserKey(iter_.key());
+
+        if (k1.compare(k2) == 0) {
+          // same user key, don't move prev_
+          advance_prev = false;
+        } else if (rep_.transform_) {
+          // only advance prev_ if it has the same prefix as iter_
+          auto t1 = rep_.transform_->Transform(k1);
+          auto t2 = rep_.transform_->Transform(k2);
+          advance_prev = t1.compare(t2) == 0;
+        }
+      }
+
+      if (advance_prev) {
+        prev_ = iter_;
+      }
+      iter_.Next();
+    }
+
+    virtual void Prev() override {
+      assert(Valid());
+      iter_.Prev();
+      prev_ = iter_;
+    }
 
-  virtual MemTableRep::Iterator* GetIterator() override {
-    return new SkipListRep::Iterator(&skip_list_);
+    virtual void Seek(const Slice& internal_key, const char *memtable_key)
+        override {
+      const char *encoded_key =
+        (memtable_key != nullptr) ?
+            memtable_key : EncodeKey(&tmp_, internal_key);
+
+      if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
+        // prev_.key() is smaller or equal to our target key; do a quick
+        // linear search (at most lookahead_ steps) starting from prev_
+        iter_ = prev_;
+
+        size_t cur = 0;
+        while (cur++ <= rep_.lookahead_ && iter_.Valid()) {
+          if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
+            return;
+          }
+          Next();
+        }
+      }
+
+      iter_.Seek(encoded_key);
+      prev_ = iter_;
+    }
+
+    virtual void SeekToFirst() override {
+      iter_.SeekToFirst();
+      prev_ = iter_;
+    }
+
+    virtual void SeekToLast() override {
+      iter_.SeekToLast();
+      prev_ = iter_;
+    }
+
+   protected:
+    std::string tmp_;       // For passing to EncodeKey
+
+   private:
+    const SkipListRep& rep_;
+    SkipList<const char*, const MemTableRep::KeyComparator&>::Iterator iter_;
+    SkipList<const char*, const MemTableRep::KeyComparator&>::Iterator prev_;
+  };
+
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
+    if (lookahead_ > 0) {
+      void *mem =
+        arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator))
+              : operator new(sizeof(SkipListRep::LookaheadIterator));
+      return new (mem) SkipListRep::LookaheadIterator(*this);
+    } else {
+      void *mem =
+        arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator))
+              : operator new(sizeof(SkipListRep::Iterator));
+      return new (mem) SkipListRep::Iterator(&skip_list_);
+    }
   }
 };
 }
 
 MemTableRep* SkipListFactory::CreateMemTableRep(
-    const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform*) {
-  return new SkipListRep(compare, arena);
+    const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+    const SliceTransform* transform, Logger* logger) {
+  return new SkipListRep(compare, allocator, transform, lookahead_);
 }
 
 } // namespace rocksdb
diff --git a/src/rocksdb/util/slice.cc b/src/rocksdb/util/slice.cc
index 55f561f..6484e16 100644
--- a/src/rocksdb/util/slice.cc
+++ b/src/rocksdb/util/slice.cc
@@ -9,6 +9,7 @@
 
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/slice.h"
+#include "util/string_util.h"
 
 namespace rocksdb {
 
@@ -17,46 +18,75 @@ namespace {
 class FixedPrefixTransform : public SliceTransform {
  private:
   size_t prefix_len_;
+  std::string name_;
 
  public:
-  explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { }
+  explicit FixedPrefixTransform(size_t prefix_len)
+      : prefix_len_(prefix_len),
+        name_("rocksdb.FixedPrefix." + ToString(prefix_len_)) {}
 
-  virtual const char* Name() const {
-    return "rocksdb.FixedPrefix";
-  }
+  virtual const char* Name() const override { return name_.c_str(); }
 
-  virtual Slice Transform(const Slice& src) const {
+  virtual Slice Transform(const Slice& src) const override {
     assert(InDomain(src));
     return Slice(src.data(), prefix_len_);
   }
 
-  virtual bool InDomain(const Slice& src) const {
+  virtual bool InDomain(const Slice& src) const override {
     return (src.size() >= prefix_len_);
   }
 
-  virtual bool InRange(const Slice& dst) const {
+  virtual bool InRange(const Slice& dst) const override {
     return (dst.size() == prefix_len_);
   }
+
+  virtual bool SameResultWhenAppended(const Slice& prefix) const override {
+    return InDomain(prefix);
+  }
 };
 
-class NoopTransform : public SliceTransform {
+class CappedPrefixTransform : public SliceTransform {
+ private:
+  size_t cap_len_;
+  std::string name_;
+
  public:
-  explicit NoopTransform() { }
+  explicit CappedPrefixTransform(size_t cap_len)
+      : cap_len_(cap_len),
+        name_("rocksdb.CappedPrefix." + ToString(cap_len_)) {}
+
+  virtual const char* Name() const override { return name_.c_str(); }
 
-  virtual const char* Name() const {
-    return "rocksdb.Noop";
+  virtual Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    return Slice(src.data(), std::min(cap_len_, src.size()));
   }
 
-  virtual Slice Transform(const Slice& src) const {
-    return src;
+  virtual bool InDomain(const Slice& src) const override { return true; }
+
+  virtual bool InRange(const Slice& dst) const override {
+    return (dst.size() <= cap_len_);
   }
 
-  virtual bool InDomain(const Slice& src) const {
-    return true;
+  virtual bool SameResultWhenAppended(const Slice& prefix) const override {
+    return prefix.size() >= cap_len_;
   }
+};
 
-  virtual bool InRange(const Slice& dst) const {
-    return true;
+class NoopTransform : public SliceTransform {
+ public:
+  explicit NoopTransform() { }
+
+  virtual const char* Name() const override { return "rocksdb.Noop"; }
+
+  virtual Slice Transform(const Slice& src) const override { return src; }
+
+  virtual bool InDomain(const Slice& src) const override { return true; }
+
+  virtual bool InRange(const Slice& dst) const override { return true; }
+
+  virtual bool SameResultWhenAppended(const Slice& prefix) const override {
+    return false;
   }
 };
 
@@ -66,6 +96,10 @@ const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) {
   return new FixedPrefixTransform(prefix_len);
 }
 
+const SliceTransform* NewCappedPrefixTransform(size_t cap_len) {
+  return new CappedPrefixTransform(cap_len);
+}
+
 const SliceTransform* NewNoopTransform() {
   return new NoopTransform;
 }
diff --git a/src/rocksdb/util/slice_transform_test.cc b/src/rocksdb/util/slice_transform_test.cc
new file mode 100644
index 0000000..5b7c1b4
--- /dev/null
+++ b/src/rocksdb/util/slice_transform_test.cc
@@ -0,0 +1,153 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/slice_transform.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class SliceTransformTest : public testing::Test {};
+
+TEST_F(SliceTransformTest, CapPrefixTransform) {
+  std::string s;
+  s = "abcdefge";
+
+  unique_ptr<const SliceTransform> transform;
+
+  transform.reset(NewCappedPrefixTransform(6));
+  ASSERT_EQ(transform->Transform(s).ToString(), "abcdef");
+  ASSERT_TRUE(transform->SameResultWhenAppended("123456"));
+  ASSERT_TRUE(transform->SameResultWhenAppended("1234567"));
+  ASSERT_TRUE(!transform->SameResultWhenAppended("12345"));
+
+  transform.reset(NewCappedPrefixTransform(8));
+  ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge");
+
+  transform.reset(NewCappedPrefixTransform(10));
+  ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge");
+
+  transform.reset(NewCappedPrefixTransform(0));
+  ASSERT_EQ(transform->Transform(s).ToString(), "");
+
+  transform.reset(NewCappedPrefixTransform(0));
+  ASSERT_EQ(transform->Transform("").ToString(), "");
+}
+
+class SliceTransformDBTest : public testing::Test {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+ public:
+  SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) {
+    dbname_ = test::TmpDir() + "/slice_transform_db_test";
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  ~SliceTransformDBTest() {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  DB* db() { return db_; }
+
+  // Return the current option configuration.
+  Options* GetOptions() { return &last_options_; }
+
+  void DestroyAndReopen() {
+    // Destroy using last options
+    Destroy();
+    ASSERT_OK(TryReopen());
+  }
+
+  void Destroy() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  Status TryReopen() {
+    delete db_;
+    db_ = nullptr;
+    last_options_.create_if_missing = true;
+
+    return DB::Open(last_options_, dbname_, &db_);
+  }
+
+  Options last_options_;
+};
+
+namespace {
+uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+}  // namespace
+
+TEST_F(SliceTransformDBTest, CapPrefix) {
+  last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8));
+  last_options_.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_OK(TryReopen());
+
+  ReadOptions ro;
+  FlushOptions fo;
+  WriteOptions wo;
+
+  ASSERT_OK(db()->Put(wo, "barbarbar", "foo"));
+  ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2"));
+  ASSERT_OK(db()->Put(wo, "foo", "bar"));
+  ASSERT_OK(db()->Put(wo, "foo3", "bar3"));
+  ASSERT_OK(db()->Flush(fo));
+
+  unique_ptr<Iterator> iter(db()->NewIterator(ro));
+
+  iter->Seek("foo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->value().ToString(), "bar");
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U);
+
+  iter->Seek("foo2");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U);
+
+  iter->Seek("barbarbar");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->value().ToString(), "foo");
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U);
+
+  iter->Seek("barfoofoo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U);
+
+  iter->Seek("foobarbar");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/sst_dump_test.cc b/src/rocksdb/util/sst_dump_test.cc
new file mode 100644
index 0000000..03d7299
--- /dev/null
+++ b/src/rocksdb/util/sst_dump_test.cc
@@ -0,0 +1,182 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdint.h>
+#include "rocksdb/sst_dump_tool.h"
+
+#include "rocksdb/filter_policy.h"
+#include "table/block_based_table_factory.h"
+#include "table/table_builder.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+const uint32_t optLength = 100;
+
+namespace {
+static std::string MakeKey(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "k_%04d", i);
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+static std::string MakeValue(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "v_%04d", i);
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+void createSST(const std::string& file_name,
+               const BlockBasedTableOptions& table_options) {
+  std::shared_ptr<rocksdb::TableFactory> tf;
+  tf.reset(new rocksdb::BlockBasedTableFactory(table_options));
+
+  unique_ptr<WritableFile> file;
+  Env* env = Env::Default();
+  EnvOptions env_options;
+  ReadOptions read_options;
+  Options opts;
+  const ImmutableCFOptions imoptions(opts);
+  rocksdb::InternalKeyComparator ikc(opts.comparator);
+  unique_ptr<TableBuilder> tb;
+
+  env->NewWritableFile(file_name, &file, env_options);
+  opts.table_factory = tf;
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
+      int_tbl_prop_collector_factories;
+
+  tb.reset(opts.table_factory->NewTableBuilder(
+      TableBuilderOptions(imoptions, ikc, &int_tbl_prop_collector_factories,
+                          CompressionType::kNoCompression, CompressionOptions(),
+                          false),
+      file.get()));
+
+  // Populate slightly more than 1K keys
+  uint32_t num_keys = 1024;
+  for (uint32_t i = 0; i < num_keys; i++) {
+    tb->Add(MakeKey(i), MakeValue(i));
+  }
+  tb->Finish();
+  file->Close();
+}
+
+void cleanup(const std::string& file_name) {
+  Env* env = Env::Default();
+  env->DeleteFile(file_name);
+  std::string outfile_name = file_name.substr(0, file_name.length() - 4);
+  outfile_name.append("_dump.txt");
+  env->DeleteFile(outfile_name);
+}
+}  // namespace
+
+// Test for sst dump tool "raw" mode
+class SSTDumpToolTest : public testing::Test {
+ public:
+  BlockBasedTableOptions table_options_;
+
+  SSTDumpToolTest() {}
+
+  ~SSTDumpToolTest() {}
+};
+
+TEST_F(SSTDumpToolTest, EmptyFilter) {
+  std::string file_name = "rocksdb_sst_test.sst";
+  createSST(file_name, table_options_);
+
+  char* usage[3];
+  for (int i = 0; i < 3; i++) {
+    usage[i] = new char[optLength];
+  }
+  snprintf(usage[0], optLength, "./sst_dump");
+  snprintf(usage[1], optLength, "--command=raw");
+  snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst");
+
+  rocksdb::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage));
+
+  cleanup(file_name);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, FilterBlock) {
+  table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true));
+  std::string file_name = "rocksdb_sst_test.sst";
+  createSST(file_name, table_options_);
+
+  char* usage[3];
+  for (int i = 0; i < 3; i++) {
+    usage[i] = new char[optLength];
+  }
+  snprintf(usage[0], optLength, "./sst_dump");
+  snprintf(usage[1], optLength, "--command=raw");
+  snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst");
+
+  rocksdb::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage));
+
+  cleanup(file_name);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, FullFilterBlock) {
+  table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
+  std::string file_name = "rocksdb_sst_test.sst";
+  createSST(file_name, table_options_);
+
+  char* usage[3];
+  for (int i = 0; i < 3; i++) {
+    usage[i] = new char[optLength];
+  }
+  snprintf(usage[0], optLength, "./sst_dump");
+  snprintf(usage[1], optLength, "--command=raw");
+  snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst");
+
+  rocksdb::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage));
+
+  cleanup(file_name);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, GetProperties) {
+  table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
+  std::string file_name = "rocksdb_sst_test.sst";
+  createSST(file_name, table_options_);
+
+  char* usage[3];
+  for (int i = 0; i < 3; i++) {
+    usage[i] = new char[optLength];
+  }
+  snprintf(usage[0], optLength, "./sst_dump");
+  snprintf(usage[1], optLength, "--show_properties");
+  snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst");
+
+  rocksdb::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage));
+
+  cleanup(file_name);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/sst_dump_tool.cc b/src/rocksdb/util/sst_dump_tool.cc
new file mode 100644
index 0000000..04486da
--- /dev/null
+++ b/src/rocksdb/util/sst_dump_tool.cc
@@ -0,0 +1,423 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "util/sst_dump_tool_imp.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+
+namespace rocksdb {
+
+using std::dynamic_pointer_cast;
+
+SstFileReader::SstFileReader(const std::string& file_path,
+                             bool verify_checksum,
+                             bool output_hex)
+    :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum),
+    output_hex_(output_hex), ioptions_(options_),
+    internal_comparator_(BytewiseComparator()) {
+  fprintf(stdout, "Process %s\n", file_path.c_str());
+
+  init_result_ = GetTableReader(file_name_);
+}
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+
+Status SstFileReader::GetTableReader(const std::string& file_path) {
+  uint64_t magic_number;
+
+  // read table magic number
+  Footer footer;
+
+  unique_ptr<RandomAccessFile> file;
+  uint64_t file_size;
+  Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_);
+  if (s.ok()) {
+    s = options_.env->GetFileSize(file_path, &file_size);
+  }
+  if (s.ok()) {
+    s = ReadFooterFromFile(file_.get(), file_size, &footer);
+  }
+  if (s.ok()) {
+    magic_number = footer.table_magic_number();
+  }
+
+  if (s.ok()) {
+    if (magic_number == kPlainTableMagicNumber ||
+        magic_number == kLegacyPlainTableMagicNumber) {
+      soptions_.use_mmap_reads = true;
+      options_.env->NewRandomAccessFile(file_path, &file_, soptions_);
+    }
+    options_.comparator = &internal_comparator_;
+    // For old sst format, ReadTableProperties might fail but file can be read
+    if (ReadTableProperties(magic_number, file_.get(), file_size).ok()) {
+      SetTableOptionsByMagicNumber(magic_number);
+    } else {
+      SetOldTableOptions();
+    }
+  }
+
+  if (s.ok()) {
+    s = NewTableReader(ioptions_, soptions_, internal_comparator_,
+                       std::move(file_), file_size, &table_reader_);
+  }
+  return s;
+}
+
+Status SstFileReader::NewTableReader(
+    const ImmutableCFOptions& ioptions, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
+    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) {
+  // We need to turn off pre-fetching of index and filter nodes for
+  // BlockBasedTable
+  shared_ptr<BlockBasedTableFactory> block_table_factory =
+      dynamic_pointer_cast<BlockBasedTableFactory>(options_.table_factory);
+
+  if (block_table_factory) {
+    return block_table_factory->NewTableReader(
+        ioptions_, soptions_, internal_comparator_, std::move(file_), file_size,
+        &table_reader_, /*enable_prefetch=*/false);
+  }
+
+  assert(!block_table_factory);
+
+  // For all other factory implementation
+  return options_.table_factory->NewTableReader(
+      ioptions_, soptions_, internal_comparator_, std::move(file_), file_size,
+      &table_reader_);
+}
+
+Status SstFileReader::DumpTable(const std::string& out_filename) {
+  unique_ptr<WritableFile> out_file;
+  Env* env = Env::Default();
+  env->NewWritableFile(out_filename, &out_file, soptions_);
+  Status s = table_reader_->DumpTable(out_file.get());
+  out_file->Close();
+  return s;
+}
+
+Status SstFileReader::ReadTableProperties(uint64_t table_magic_number,
+                                          RandomAccessFile* file,
+                                          uint64_t file_size) {
+  TableProperties* table_properties = nullptr;
+  Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number,
+                                          options_.env, options_.info_log.get(),
+                                          &table_properties);
+  if (s.ok()) {
+    table_properties_.reset(table_properties);
+  } else {
+    fprintf(stdout, "Not able to read table properties\n");
+  }
+  return s;
+}
+
+Status SstFileReader::SetTableOptionsByMagicNumber(
+    uint64_t table_magic_number) {
+  assert(table_properties_);
+  if (table_magic_number == kBlockBasedTableMagicNumber ||
+      table_magic_number == kLegacyBlockBasedTableMagicNumber) {
+    options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+    fprintf(stdout, "Sst file format: block-based\n");
+    auto& props = table_properties_->user_collected_properties;
+    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+    if (pos != props.end()) {
+      auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
+          DecodeFixed32(pos->second.c_str()));
+      if (index_type_on_file ==
+          BlockBasedTableOptions::IndexType::kHashSearch) {
+        options_.prefix_extractor.reset(NewNoopTransform());
+      }
+    }
+  } else if (table_magic_number == kPlainTableMagicNumber ||
+             table_magic_number == kLegacyPlainTableMagicNumber) {
+    options_.allow_mmap_reads = true;
+
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = kPlainTableVariableLength;
+    plain_table_options.bloom_bits_per_key = 0;
+    plain_table_options.hash_table_ratio = 0;
+    plain_table_options.index_sparseness = 1;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPlain;
+    plain_table_options.full_scan_mode = true;
+
+    options_.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    fprintf(stdout, "Sst file format: plain table\n");
+  } else {
+    char error_msg_buffer[80];
+    snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
+             "Unsupported table magic number --- %lx",
+             (long)table_magic_number);
+    return Status::InvalidArgument(error_msg_buffer);
+  }
+
+  return Status::OK();
+}
+
+Status SstFileReader::SetOldTableOptions() {
+  assert(table_properties_ == nullptr);
+  options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+  fprintf(stdout, "Sst file format: block-based(old version)\n");
+
+  return Status::OK();
+}
+
+Status SstFileReader::ReadSequential(bool print_kv,
+                                     uint64_t read_num,
+                                     bool has_from,
+                                     const std::string& from_key,
+                                     bool has_to,
+                                     const std::string& to_key) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_,
+                                                         false));
+  uint64_t i = 0;
+  if (has_from) {
+    InternalKey ikey;
+    ikey.SetMaxPossibleForUserKey(from_key);
+    iter->Seek(ikey.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+  for (; iter->Valid(); iter->Next()) {
+    Slice key = iter->key();
+    Slice value = iter->value();
+    ++i;
+    if (read_num > 0 && i > read_num)
+      break;
+
+    ParsedInternalKey ikey;
+    if (!ParseInternalKey(key, &ikey)) {
+      std::cerr << "Internal Key ["
+                << key.ToString(true /* in hex*/)
+                << "] parse error!\n";
+      continue;
+    }
+
+    // If end marker was specified, we stop before it
+    if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
+      break;
+    }
+
+    if (print_kv) {
+      fprintf(stdout, "%s => %s\n",
+          ikey.DebugString(output_hex_).c_str(),
+          value.ToString(output_hex_).c_str());
+    }
+  }
+
+  read_num_ += i;
+
+  Status ret = iter->status();
+  delete iter;
+  return ret;
+}
+
+Status SstFileReader::ReadTableProperties(
+    std::shared_ptr<const TableProperties>* table_properties) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  *table_properties = table_reader_->GetTableProperties();
+  return init_result_;
+}
+
+namespace {
+
+void print_help() {
+  fprintf(stderr,
+          "sst_dump [--command=check|scan|none|raw] [--verify_checksum] "
+          "--file=data_dir_OR_sst_file"
+          " [--output_hex]"
+          " [--input_key_hex]"
+          " [--from=<user_key>]"
+          " [--to=<user_key>]"
+          " [--read_num=NUM]"
+          " [--show_properties]\n");
+}
+
+string HexToString(const string& str) {
+  string parsed;
+  if (str[0] != '0' || str[1] != 'x') {
+    fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
+            str.c_str());
+    throw "Invalid hex input";
+  }
+
+  for (unsigned int i = 2; i < str.length();) {
+    int c;
+    sscanf(str.c_str() + i, "%2X", &c);
+    parsed.push_back(c);
+    i += 2;
+  }
+  return parsed;
+}
+
+}  // namespace
+
+int SSTDumpTool::Run(int argc, char** argv) {
+  const char* dir_or_file = nullptr;
+  uint64_t read_num = -1;
+  std::string command;
+
+  char junk;
+  uint64_t n;
+  bool verify_checksum = false;
+  bool output_hex = false;
+  bool input_key_hex = false;
+  bool has_from = false;
+  bool has_to = false;
+  bool show_properties = false;
+  std::string from_key;
+  std::string to_key;
+  for (int i = 1; i < argc; i++) {
+    if (strncmp(argv[i], "--file=", 7) == 0) {
+      dir_or_file = argv[i] + 7;
+    } else if (strcmp(argv[i], "--output_hex") == 0) {
+      output_hex = true;
+    } else if (strcmp(argv[i], "--input_key_hex") == 0) {
+      input_key_hex = true;
+    } else if (sscanf(argv[i],
+               "--read_num=%lu%c",
+               (unsigned long*)&n, &junk) == 1) {
+      read_num = n;
+    } else if (strcmp(argv[i], "--verify_checksum") == 0) {
+      verify_checksum = true;
+    } else if (strncmp(argv[i], "--command=", 10) == 0) {
+      command = argv[i] + 10;
+    } else if (strncmp(argv[i], "--from=", 7) == 0) {
+      from_key = argv[i] + 7;
+      has_from = true;
+    } else if (strncmp(argv[i], "--to=", 5) == 0) {
+      to_key = argv[i] + 5;
+      has_to = true;
+    } else if (strcmp(argv[i], "--show_properties") == 0) {
+      show_properties = true;
+    } else {
+      print_help();
+      exit(1);
+    }
+  }
+
+  if (input_key_hex) {
+    if (has_from) {
+      from_key = HexToString(from_key);
+    }
+    if (has_to) {
+      to_key = HexToString(to_key);
+    }
+  }
+
+  if (dir_or_file == nullptr) {
+    print_help();
+    exit(1);
+  }
+
+  std::vector<std::string> filenames;
+  rocksdb::Env* env = rocksdb::Env::Default();
+  rocksdb::Status st = env->GetChildren(dir_or_file, &filenames);
+  bool dir = true;
+  if (!st.ok()) {
+    filenames.clear();
+    filenames.push_back(dir_or_file);
+    dir = false;
+  }
+
+  fprintf(stdout, "from [%s] to [%s]\n",
+      rocksdb::Slice(from_key).ToString(true).c_str(),
+      rocksdb::Slice(to_key).ToString(true).c_str());
+
+  uint64_t total_read = 0;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    std::string filename = filenames.at(i);
+    if (filename.length() <= 4 ||
+        filename.rfind(".sst") != filename.length() - 4) {
+      // ignore
+      continue;
+    }
+    if (dir) {
+      filename = std::string(dir_or_file) + "/" + filename;
+    }
+
+    rocksdb::SstFileReader reader(filename, verify_checksum,
+                                  output_hex);
+    if (!reader.getStatus().ok()) {
+      fprintf(stderr, "%s: %s\n", filename.c_str(),
+              reader.getStatus().ToString().c_str());
+      exit(1);
+    }
+
+    if (command == "raw") {
+      std::string out_filename = filename.substr(0, filename.length() - 4);
+      out_filename.append("_dump.txt");
+
+      st = reader.DumpTable(out_filename);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+        exit(1);
+      } else {
+        fprintf(stdout, "raw dump written to file %s\n", &out_filename[0]);
+      }
+      continue;
+    }
+
+    // scan all files in give file path.
+    if (command == "" || command == "scan" || command == "check") {
+      st = reader.ReadSequential(command == "scan",
+                                 read_num > 0 ? (read_num - total_read) :
+                                                read_num,
+                                 has_from, from_key, has_to, to_key);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(),
+            st.ToString().c_str());
+      }
+      total_read += reader.GetReadNumber();
+      if (read_num > 0 && total_read > read_num) {
+        break;
+      }
+    }
+    if (show_properties) {
+      const rocksdb::TableProperties* table_properties;
+
+      std::shared_ptr<const rocksdb::TableProperties>
+          table_properties_from_reader;
+      st = reader.ReadTableProperties(&table_properties_from_reader);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+        fprintf(stderr, "Try to use initial table properties\n");
+        table_properties = reader.GetInitTableProperties();
+      } else {
+        table_properties = table_properties_from_reader.get();
+      }
+      if (table_properties != nullptr) {
+        fprintf(stdout,
+                "Table Properties:\n"
+                "------------------------------\n"
+                "  %s",
+                table_properties->ToString("\n  ", ": ").c_str());
+        fprintf(stdout, "# deleted keys: %" PRIu64 "\n",
+                rocksdb::GetDeletedKeys(
+                    table_properties->user_collected_properties));
+      }
+    }
+  }
+  return 0;
+}
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/sst_dump_tool_imp.h b/src/rocksdb/util/sst_dump_tool_imp.h
new file mode 100644
index 0000000..a5f2267
--- /dev/null
+++ b/src/rocksdb/util/sst_dump_tool_imp.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include "rocksdb/sst_dump_tool.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/immutable_options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block.h"
+#include "table/block_based_table_factory.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+#include "util/ldb_cmd.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class SstFileReader {
+ public:
+  explicit SstFileReader(const std::string& file_name, bool verify_checksum,
+                         bool output_hex);
+
+  Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from,
+                        const std::string& from_key, bool has_to,
+                        const std::string& to_key);
+
+  Status ReadTableProperties(
+      std::shared_ptr<const TableProperties>* table_properties);
+  uint64_t GetReadNumber() { return read_num_; }
+  TableProperties* GetInitTableProperties() { return table_properties_.get(); }
+
+  Status DumpTable(const std::string& out_filename);
+  Status getStatus() { return init_result_; }
+
+ private:
+  // Get the TableReader implementation for the sst file
+  Status GetTableReader(const std::string& file_path);
+  Status ReadTableProperties(uint64_t table_magic_number,
+                             RandomAccessFile* file, uint64_t file_size);
+  Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
+  Status SetOldTableOptions();
+
+  // Helper function to call the factory with settings specific to the
+  // factory implementation
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                        const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader);
+
+  std::string file_name_;
+  uint64_t read_num_;
+  bool verify_checksum_;
+  bool output_hex_;
+  EnvOptions soptions_;
+
+  Status init_result_;
+  unique_ptr<TableReader> table_reader_;
+  unique_ptr<RandomAccessFile> file_;
+  // options_ and internal_comparator_ will also be used in
+  // ReadSequential internally (specifically, seek-related operations)
+  Options options_;
+  const ImmutableCFOptions ioptions_;
+  InternalKeyComparator internal_comparator_;
+  unique_ptr<TableProperties> table_properties_;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/statistics.cc b/src/rocksdb/util/statistics.cc
index 4fc2400..ba7670b 100644
--- a/src/rocksdb/util/statistics.cc
+++ b/src/rocksdb/util/statistics.cc
@@ -4,44 +4,89 @@
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
 #include "util/statistics.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include "rocksdb/statistics.h"
+#include "port/likely.h"
 #include <algorithm>
 #include <cstdio>
 
 namespace rocksdb {
 
 std::shared_ptr<Statistics> CreateDBStatistics() {
-  return std::make_shared<StatisticsImpl>();
+  return std::make_shared<StatisticsImpl>(nullptr, false);
 }
 
-StatisticsImpl::StatisticsImpl() {}
+StatisticsImpl::StatisticsImpl(
+    std::shared_ptr<Statistics> stats,
+    bool enable_internal_stats)
+  : stats_shared_(stats),
+    stats_(stats.get()),
+    enable_internal_stats_(enable_internal_stats) {
+}
 
 StatisticsImpl::~StatisticsImpl() {}
 
-long StatisticsImpl::getTickerCount(Tickers tickerType) {
-  assert(tickerType < TICKER_ENUM_MAX);
+uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const {
+  assert(
+    enable_internal_stats_ ?
+      tickerType < INTERNAL_TICKER_ENUM_MAX :
+      tickerType < TICKER_ENUM_MAX);
+  // Return its own ticker version
   return tickers_[tickerType].value;
 }
 
-void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) {
-  assert(tickerType < TICKER_ENUM_MAX);
-  tickers_[tickerType].value = count;
+void StatisticsImpl::histogramData(uint32_t histogramType,
+                                   HistogramData* const data) const {
+  assert(
+    enable_internal_stats_ ?
+      histogramType < INTERNAL_HISTOGRAM_ENUM_MAX :
+      histogramType < HISTOGRAM_ENUM_MAX);
+  // Return its own ticker version
+  histograms_[histogramType].Data(data);
 }
 
-void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) {
-  assert(tickerType < TICKER_ENUM_MAX);
-  tickers_[tickerType].value += count;
+void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) {
+  assert(
+    enable_internal_stats_ ?
+      tickerType < INTERNAL_TICKER_ENUM_MAX :
+      tickerType < TICKER_ENUM_MAX);
+  if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) {
+    tickers_[tickerType].value = count;
+  }
+  if (stats_ && tickerType < TICKER_ENUM_MAX) {
+    stats_->setTickerCount(tickerType, count);
+  }
 }
 
-void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) {
-  assert(histogramType < HISTOGRAM_ENUM_MAX);
-  histograms_[histogramType].Add(value);
+void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) {
+  assert(
+    enable_internal_stats_ ?
+      tickerType < INTERNAL_TICKER_ENUM_MAX :
+      tickerType < TICKER_ENUM_MAX);
+  if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) {
+    tickers_[tickerType].value += count;
+  }
+  if (stats_ && tickerType < TICKER_ENUM_MAX) {
+    stats_->recordTick(tickerType, count);
+  }
 }
 
-void StatisticsImpl::histogramData(Histograms histogramType,
-                                   HistogramData* const data) {
-  assert(histogramType < HISTOGRAM_ENUM_MAX);
-  histograms_[histogramType].Data(data);
+void StatisticsImpl::measureTime(uint32_t histogramType, uint64_t value) {
+  assert(
+    enable_internal_stats_ ?
+      histogramType < INTERNAL_HISTOGRAM_ENUM_MAX :
+      histogramType < HISTOGRAM_ENUM_MAX);
+  if (histogramType < HISTOGRAM_ENUM_MAX || enable_internal_stats_) {
+    histograms_[histogramType].Add(value);
+  }
+  if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) {
+    stats_->measureTime(histogramType, value);
+  }
 }
 
 namespace {
@@ -49,46 +94,44 @@ namespace {
 // a buffer size used for temp string buffers
 const int kBufferSize = 200;
 
-std::string HistogramToString (
-    Statistics* dbstats,
-    const Histograms& histogram_type,
-    const std::string& name) {
-
-  char buffer[kBufferSize];
-  HistogramData histogramData;
-  dbstats->histogramData(histogram_type, &histogramData);
-  snprintf(
-      buffer,
-      kBufferSize,
-      "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
-      name.c_str(),
-      histogramData.median,
-      histogramData.percentile95,
-      histogramData.percentile99
-  );
-  return std::string(buffer);
-};
-
-std::string TickerToString(Statistics* dbstats, const Tickers& ticker,
-                           const std::string& name) {
-  char buffer[kBufferSize];
-  snprintf(buffer, kBufferSize, "%s COUNT : %ld\n",
-            name.c_str(), dbstats->getTickerCount(ticker));
-  return std::string(buffer);
-};
 } // namespace
 
-std::string Statistics::ToString() {
+std::string StatisticsImpl::ToString() const {
   std::string res;
   res.reserve(20000);
   for (const auto& t : TickersNameMap) {
-    res.append(TickerToString(this, t.first, t.second));
+    if (t.first < TICKER_ENUM_MAX || enable_internal_stats_) {
+      char buffer[kBufferSize];
+      snprintf(buffer, kBufferSize, "%s COUNT : %" PRIu64 "\n",
+               t.second.c_str(), getTickerCount(t.first));
+      res.append(buffer);
+    }
   }
   for (const auto& h : HistogramsNameMap) {
-    res.append(HistogramToString(this, h.first, h.second));
+    if (h.first < HISTOGRAM_ENUM_MAX || enable_internal_stats_) {
+      char buffer[kBufferSize];
+      HistogramData hData;
+      histogramData(h.first, &hData);
+      snprintf(
+          buffer,
+          kBufferSize,
+          "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
+          h.second.c_str(),
+          hData.median,
+          hData.percentile95,
+          hData.percentile99);
+      res.append(buffer);
+    }
   }
   res.shrink_to_fit();
   return res;
 }
 
+bool StatisticsImpl::HistEnabledForType(uint32_t type) const {
+  if (LIKELY(!enable_internal_stats_)) {
+    return type < HISTOGRAM_ENUM_MAX;
+  }
+  return true;
+}
+
 } // namespace rocksdb
diff --git a/src/rocksdb/util/statistics.h b/src/rocksdb/util/statistics.h
index d57a1dd..c56900a 100644
--- a/src/rocksdb/util/statistics.h
+++ b/src/rocksdb/util/statistics.h
@@ -5,29 +5,51 @@
 //
 #pragma once
 #include "rocksdb/statistics.h"
-#include "util/histogram.h"
-#include "util/mutexlock.h"
-#include "port/likely.h"
 
 #include <vector>
 #include <atomic>
+#include <string>
+
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
 
 
 namespace rocksdb {
 
+enum TickersInternal : uint32_t {
+  INTERNAL_TICKER_ENUM_START = TICKER_ENUM_MAX,
+  INTERNAL_TICKER_ENUM_MAX
+};
+
+enum HistogramsInternal : uint32_t {
+  INTERNAL_HISTOGRAM_START = HISTOGRAM_ENUM_MAX,
+  INTERNAL_HISTOGRAM_ENUM_MAX
+};
+
+
 class StatisticsImpl : public Statistics {
  public:
-  StatisticsImpl();
+  StatisticsImpl(std::shared_ptr<Statistics> stats,
+                 bool enable_internal_stats);
   virtual ~StatisticsImpl();
 
-  virtual long getTickerCount(Tickers tickerType);
-  virtual void setTickerCount(Tickers tickerType, uint64_t count);
-  virtual void recordTick(Tickers tickerType, uint64_t count);
-  virtual void measureTime(Histograms histogramType, uint64_t value);
-  virtual void histogramData(Histograms histogramType,
-                             HistogramData* const data);
+  virtual uint64_t getTickerCount(uint32_t ticker_type) const override;
+  virtual void histogramData(uint32_t histogram_type,
+                             HistogramData* const data) const override;
+
+  virtual void setTickerCount(uint32_t ticker_type, uint64_t count) override;
+  virtual void recordTick(uint32_t ticker_type, uint64_t count) override;
+  virtual void measureTime(uint32_t histogram_type, uint64_t value) override;
+
+  virtual std::string ToString() const override;
+  virtual bool HistEnabledForType(uint32_t type) const override;
 
  private:
+  std::shared_ptr<Statistics> stats_shared_;
+  Statistics* stats_;
+  bool enable_internal_stats_;
+
   struct Ticker {
     Ticker() : value(uint_fast64_t()) {}
 
@@ -38,29 +60,31 @@ class StatisticsImpl : public Statistics {
     char padding[64 - sizeof(std::atomic_uint_fast64_t)];
   };
 
-  Ticker tickers_[TICKER_ENUM_MAX] __attribute__((aligned(64)));
-  HistogramImpl histograms_[HISTOGRAM_ENUM_MAX] __attribute__((aligned(64)));
+  Ticker tickers_[INTERNAL_TICKER_ENUM_MAX] __attribute__((aligned(64)));
+  HistogramImpl histograms_[INTERNAL_HISTOGRAM_ENUM_MAX]
+      __attribute__((aligned(64)));
 };
 
 // Utility functions
-inline void MeasureTime(Statistics* statistics, Histograms histogramType,
+inline void MeasureTime(Statistics* statistics, uint32_t histogram_type,
                         uint64_t value) {
   if (statistics) {
-    statistics->measureTime(histogramType, value);
+    statistics->measureTime(histogram_type, value);
   }
 }
 
-inline void RecordTick(Statistics* statistics, Tickers ticker,
+inline void RecordTick(Statistics* statistics, uint32_t ticker_type,
                        uint64_t count = 1) {
   if (statistics) {
-    statistics->recordTick(ticker, count);
+    statistics->recordTick(ticker_type, count);
   }
 }
 
-inline void SetTickerCount(Statistics* statistics, Tickers ticker,
+inline void SetTickerCount(Statistics* statistics, uint32_t ticker_type,
                            uint64_t count) {
   if (statistics) {
-    statistics->setTickerCount(ticker, count);
+    statistics->setTickerCount(ticker_type, count);
   }
 }
+
 }
diff --git a/src/rocksdb/util/stats_logger.h b/src/rocksdb/util/stats_logger.h
deleted file mode 100644
index f0b4540..0000000
--- a/src/rocksdb/util/stats_logger.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-#pragma once
-
-namespace rocksdb {
-
-class StatsLogger {
-
- public:
-
-  virtual void Log_Deploy_Stats(const std::string& db_version,
-                                const std::string& machine_info,
-                                const std::string& data_dir,
-                                const uint64_t data_size,
-                                const uint32_t file_number,
-                                const std::string& data_size_per_level,
-                                const std::string& file_number_per_level,
-                                const int64_t& ts_unix) = 0;
-  virtual ~StatsLogger() {}
-
-};
-
-}
diff --git a/src/rocksdb/util/status.cc b/src/rocksdb/util/status.cc
index 2a5f05a..f0112d3 100644
--- a/src/rocksdb/util/status.cc
+++ b/src/rocksdb/util/status.cc
@@ -21,11 +21,10 @@ const char* Status::CopyState(const char* state) {
   return result;
 }
 
-Status::Status(Code code, const Slice& msg, const Slice& msg2) :
-    code_(code) {
-  assert(code != kOk);
-  const uint32_t len1 = msg.size();
-  const uint32_t len2 = msg2.size();
+Status::Status(Code _code, const Slice& msg, const Slice& msg2) : code_(_code) {
+  assert(code_ != kOk);
+  const uint32_t len1 = static_cast<uint32_t>(msg.size());
+  const uint32_t len2 = static_cast<uint32_t>(msg2.size());
   const uint32_t size = len1 + (len2 ? (2 + len2) : 0);
   char* result = new char[size + 4];
   memcpy(result, &size, sizeof(size));
@@ -68,6 +67,12 @@ std::string Status::ToString() const {
     case kShutdownInProgress:
       type = "Shutdown in progress: ";
       break;
+    case kTimedOut:
+      type = "Operation timed out: ";
+      break;
+    case kAborted:
+      type = "Operation aborted: ";
+      break;
     default:
       snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
                static_cast<int>(code()));
diff --git a/src/rocksdb/util/stop_watch.h b/src/rocksdb/util/stop_watch.h
index 48e1b01..3637533 100644
--- a/src/rocksdb/util/stop_watch.h
+++ b/src/rocksdb/util/stop_watch.h
@@ -9,33 +9,41 @@
 
 namespace rocksdb {
 // Auto-scoped.
-// Records the statistic into the corresponding histogram.
+// Records the measure time into the corresponding histogram if statistics
+// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr.
 class StopWatch {
  public:
-  explicit StopWatch(
-    Env * const env,
-    Statistics* statistics = nullptr,
-    const Histograms histogram_name = DB_GET,
-    bool auto_start = true) :
-      env_(env),
-      start_time_((!auto_start && !statistics) ? 0 : env->NowMicros()),
+  StopWatch(Env * const env, Statistics* statistics,
+            const uint32_t hist_type,
+            uint64_t* elapsed = nullptr)
+    : env_(env),
       statistics_(statistics),
-      histogram_name_(histogram_name) {}
-
+      hist_type_(hist_type),
+      elapsed_(elapsed),
+      stats_enabled_(statistics && statistics->HistEnabledForType(hist_type)),
+      start_time_((stats_enabled_ || elapsed != nullptr) ?
+                  env->NowMicros() : 0) {
+  }
 
 
-  uint64_t ElapsedMicros() {
-    return env_->NowMicros() - start_time_;
+  ~StopWatch() {
+    if (elapsed_) {
+      *elapsed_ = env_->NowMicros() - start_time_;
+    }
+    if (stats_enabled_) {
+      statistics_->measureTime(hist_type_,
+          (elapsed_ != nullptr) ? *elapsed_ :
+                                  (env_->NowMicros() - start_time_));
+    }
   }
 
-  ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); }
-
  private:
   Env* const env_;
-  const uint64_t start_time_;
   Statistics* statistics_;
-  const Histograms histogram_name_;
-
+  const uint32_t hist_type_;
+  uint64_t* elapsed_;
+  bool stats_enabled_;
+  const uint64_t start_time_;
 };
 
 // a nano second precision stopwatch
diff --git a/src/rocksdb/util/string_util.cc b/src/rocksdb/util/string_util.cc
index 97b7f9d..4e0bc46 100644
--- a/src/rocksdb/util/string_util.cc
+++ b/src/rocksdb/util/string_util.cc
@@ -10,7 +10,7 @@
 
 namespace rocksdb {
 
-std::vector<std::string> stringSplit(std::string arg, char delim) {
+std::vector<std::string> StringSplit(const std::string& arg, char delim) {
   std::vector<std::string> splits;
   std::stringstream ss(arg);
   std::string item;
diff --git a/src/rocksdb/util/string_util.h b/src/rocksdb/util/string_util.h
index 676f4aa..dfbe505 100644
--- a/src/rocksdb/util/string_util.h
+++ b/src/rocksdb/util/string_util.h
@@ -10,6 +10,19 @@
 #pragma once
 namespace rocksdb {
 
-extern std::vector<std::string> stringSplit(std::string arg, char delim);
+extern std::vector<std::string> StringSplit(const std::string& arg, char delim);
+
+template <typename T>
+inline std::string ToString(T value) {
+#if !(defined OS_ANDROID) && !(defined CYGWIN)
+  return std::to_string(value);
+#else
+  // Andorid or cygwin doesn't support all of C++11, std::to_string() being
+  // one of the not supported features.
+  std::ostringstream os;
+  os << value;
+  return os.str();
+#endif
+}
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/sync_point.cc b/src/rocksdb/util/sync_point.cc
index 4e4c46a..3c224bf 100644
--- a/src/rocksdb/util/sync_point.cc
+++ b/src/rocksdb/util/sync_point.cc
@@ -14,6 +14,7 @@ SyncPoint* SyncPoint::GetInstance() {
 }
 
 void SyncPoint::LoadDependency(const std::vector<Dependency>& dependencies) {
+  std::unique_lock<std::mutex> lock(mutex_);
   successors_.clear();
   predecessors_.clear();
   cleared_points_.clear();
@@ -21,6 +22,7 @@ void SyncPoint::LoadDependency(const std::vector<Dependency>& dependencies) {
     successors_[dependency.predecessor].push_back(dependency.successor);
     predecessors_[dependency.successor].push_back(dependency.predecessor);
   }
+  cv_.notify_all();
 }
 
 bool SyncPoint::PredecessorsAllCleared(const std::string& point) {
@@ -32,6 +34,20 @@ bool SyncPoint::PredecessorsAllCleared(const std::string& point) {
   return true;
 }
 
+void SyncPoint::SetCallBack(const std::string point,
+                            std::function<void(void*)> callback) {
+  std::unique_lock<std::mutex> lock(mutex_);
+  callbacks_[point] = callback;
+}
+
+void SyncPoint::ClearAllCallBacks() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  while (num_callbacks_running_ > 0) {
+    cv_.wait(lock);
+  }
+  callbacks_.clear();
+}
+
 void SyncPoint::EnableProcessing() {
   std::unique_lock<std::mutex> lock(mutex_);
   enabled_ = true;
@@ -47,11 +63,21 @@ void SyncPoint::ClearTrace() {
   cleared_points_.clear();
 }
 
-void SyncPoint::Process(const std::string& point) {
+void SyncPoint::Process(const std::string& point, void* cb_arg) {
   std::unique_lock<std::mutex> lock(mutex_);
 
   if (!enabled_) return;
 
+  auto callback_pair = callbacks_.find(point);
+  if (callback_pair != callbacks_.end()) {
+    num_callbacks_running_++;
+    mutex_.unlock();
+    callback_pair->second(cb_arg);
+    mutex_.lock();
+    num_callbacks_running_--;
+    cv_.notify_all();
+  }
+
   while (!PredecessorsAllCleared(point)) {
     cv_.wait(lock);
   }
@@ -59,6 +85,5 @@ void SyncPoint::Process(const std::string& point) {
   cleared_points_.insert(point);
   cv_.notify_all();
 }
-
 }  // namespace rocksdb
 #endif  // NDEBUG
diff --git a/src/rocksdb/util/sync_point.h b/src/rocksdb/util/sync_point.h
index b4b61a9..7827d28 100644
--- a/src/rocksdb/util/sync_point.h
+++ b/src/rocksdb/util/sync_point.h
@@ -13,6 +13,7 @@
 
 #ifdef NDEBUG
 #define TEST_SYNC_POINT(x)
+#define TEST_SYNC_POINT_CALLBACK(x, y)
 #else
 
 namespace rocksdb {
@@ -38,6 +39,12 @@ class SyncPoint {
   // sync points
   void LoadDependency(const std::vector<Dependency>& dependencies);
 
+  // Set up a call back function in sync point.
+  void SetCallBack(const std::string point,
+                   std::function<void(void*)> callback);
+  // Clear all call back functions.
+  void ClearAllCallBacks();
+
   // enable sync point processing (disabled on startup)
   void EnableProcessing();
 
@@ -49,7 +56,8 @@ class SyncPoint {
 
   // triggered by TEST_SYNC_POINT, blocking execution until all predecessors
   // are executed.
-  void Process(const std::string& point);
+  // And/or call registered callback functionn, with argument `cb_arg`
+  void Process(const std::string& point, void* cb_arg = nullptr);
 
   // TODO: it might be useful to provide a function that blocks until all
   // sync points are cleared.
@@ -60,12 +68,14 @@ class SyncPoint {
   // successor/predecessor map loaded from LoadDependency
   std::unordered_map<std::string, std::vector<std::string>> successors_;
   std::unordered_map<std::string, std::vector<std::string>> predecessors_;
+  std::unordered_map<std::string, std::function<void(void*)> > callbacks_;
 
   std::mutex mutex_;
   std::condition_variable cv_;
   // sync points that have been passed through
   std::unordered_set<std::string> cleared_points_;
   bool enabled_ = false;
+  int num_callbacks_running_ = 0;
 };
 
 }  // namespace rocksdb
@@ -77,4 +87,6 @@ class SyncPoint {
 // See TransactionLogIteratorRace in db_test.cc for an example use case.
 // TEST_SYNC_POINT is no op in release build.
 #define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
+#define TEST_SYNC_POINT_CALLBACK(x, y) \
+  rocksdb::SyncPoint::GetInstance()->Process(x, y)
 #endif  // NDEBUG
diff --git a/src/rocksdb/util/testharness.cc b/src/rocksdb/util/testharness.cc
index 4208d2c..603f6f6 100644
--- a/src/rocksdb/util/testharness.cc
+++ b/src/rocksdb/util/testharness.cc
@@ -9,65 +9,23 @@
 
 #include "util/testharness.h"
 #include <string>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include "port/stack_trace.h"
 
 namespace rocksdb {
 namespace test {
 
-namespace {
-struct Test {
-  const char* base;
-  const char* name;
-  void (*func)();
-};
-std::vector<Test>* tests;
-}
-
-bool RegisterTest(const char* base, const char* name, void (*func)()) {
-  if (tests == nullptr) {
-    tests = new std::vector<Test>;
-  }
-  Test t;
-  t.base = base;
-  t.name = name;
-  t.func = func;
-  tests->push_back(t);
-  return true;
-}
-
-int RunAllTests() {
-  port::InstallStackTraceHandler();
-
-  const char* matcher = getenv("ROCKSDB_TESTS");
-
-  int num = 0;
-  if (tests != nullptr) {
-    for (unsigned int i = 0; i < tests->size(); i++) {
-      const Test& t = (*tests)[i];
-      if (matcher != nullptr) {
-        std::string name = t.base;
-        name.push_back('.');
-        name.append(t.name);
-        if (strstr(name.c_str(), matcher) == nullptr) {
-          continue;
-        }
-      }
-      fprintf(stderr, "==== Test %s.%s\n", t.base, t.name);
-      (*t.func)();
-      ++num;
-    }
+::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) {
+  if (s.ok()) {
+    return ::testing::AssertionSuccess();
+  } else {
+    return ::testing::AssertionFailure() << s_expr << std::endl
+                                         << s.ToString();
   }
-  fprintf(stderr, "==== PASSED %d tests\n", num);
-  return 0;
 }
 
-std::string TmpDir() {
+std::string TmpDir(Env* env) {
   std::string dir;
-  Status s = Env::Default()->GetTestDirectory(&dir);
-  ASSERT_TRUE(s.ok()) << s.ToString();
+  Status s = env->GetTestDirectory(&dir);
+  EXPECT_TRUE(s.ok()) << s.ToString();
   return dir;
 }
 
diff --git a/src/rocksdb/util/testharness.h b/src/rocksdb/util/testharness.h
index 52c2984..b212b1e 100644
--- a/src/rocksdb/util/testharness.h
+++ b/src/rocksdb/util/testharness.h
@@ -9,134 +9,28 @@
 
 #pragma once
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <sstream>
-#include "port/stack_trace.h"
+#include <gtest/gtest.h>
+
+#include <string>
 #include "rocksdb/env.h"
-#include "rocksdb/slice.h"
-#include "util/random.h"
 
 namespace rocksdb {
 namespace test {
 
-// Run some of the tests registered by the TEST() macro.  If the
-// environment variable "ROCKSDB_TESTS" is not set, runs all tests.
-// Otherwise, runs only the tests whose name contains the value of
-// "ROCKSDB_TESTS" as a substring.  E.g., suppose the tests are:
-//    TEST(Foo, Hello) { ... }
-//    TEST(Foo, World) { ... }
-// ROCKSDB_TESTS=Hello will run the first test
-// ROCKSDB_TESTS=o     will run both tests
-// ROCKSDB_TESTS=Junk  will run no tests
-//
-// Returns 0 if all tests pass.
-// Dies or returns a non-zero value if some test fails.
-extern int RunAllTests();
-
 // Return the directory to use for temporary storage.
-extern std::string TmpDir();
+std::string TmpDir(Env* env = Env::Default());
 
 // Return a randomization seed for this run.  Typically returns the
 // same number on repeated invocations of this binary, but automated
 // runs may be able to vary the seed.
-extern int RandomSeed();
-
-// An instance of Tester is allocated to hold temporary state during
-// the execution of an assertion.
-class Tester {
- private:
-  bool ok_;
-  const char* fname_;
-  int line_;
-  std::stringstream ss_;
-
- public:
-  Tester(const char* f, int l)
-      : ok_(true), fname_(f), line_(l) {
-  }
-
-  ~Tester() {
-    if (!ok_) {
-      fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
-      port::PrintStack(2);
-      exit(1);
-    }
-  }
-
-  Tester& Is(bool b, const char* msg) {
-    if (!b) {
-      ss_ << " Assertion failure " << msg;
-      ok_ = false;
-    }
-    return *this;
-  }
-
-  Tester& IsOk(const Status& s) {
-    if (!s.ok()) {
-      ss_ << " " << s.ToString();
-      ok_ = false;
-    }
-    return *this;
-  }
-
-#define BINARY_OP(name,op)                              \
-  template <class X, class Y>                           \
-  Tester& name(const X& x, const Y& y) {                \
-    if (! (x op y)) {                                   \
-      ss_ << " failed: " << x << (" " #op " ") << y;    \
-      ok_ = false;                                      \
-    }                                                   \
-    return *this;                                       \
-  }
-
-  BINARY_OP(IsEq, ==)
-  BINARY_OP(IsNe, !=)
-  BINARY_OP(IsGe, >=)
-  BINARY_OP(IsGt, >)
-  BINARY_OP(IsLe, <=)
-  BINARY_OP(IsLt, <)
-#undef BINARY_OP
-
-  // Attach the specified value to the error message if an error has occurred
-  template <class V>
-  Tester& operator<<(const V& value) {
-    if (!ok_) {
-      ss_ << " " << value;
-    }
-    return *this;
-  }
-};
-
-#define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c)
-#define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s))
-#define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
-#define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
-#define ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
-#define ASSERT_GT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGt((a),(b))
-#define ASSERT_LE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLe((a),(b))
-#define ASSERT_LT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLt((a),(b))
-
-#define TCONCAT(a,b) TCONCAT1(a,b)
-#define TCONCAT1(a,b) a##b
-
-#define TEST(base,name)                                                 \
-class TCONCAT(_Test_,name) : public base {                              \
- public:                                                                \
-  void _Run();                                                          \
-  static void _RunIt() {                                                \
-    TCONCAT(_Test_,name) t;                                             \
-    t._Run();                                                           \
-  }                                                                     \
-};                                                                      \
-bool TCONCAT(_Test_ignored_,name) =                                     \
-  ::rocksdb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \
-void TCONCAT(_Test_,name)::_Run()
+int RandomSeed();
 
-// Register the specified test.  Typically not used directly, but
-// invoked via the macro expansion of TEST.
-extern bool RegisterTest(const char* base, const char* name, void (*func)());
+::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s);
 
+#define ASSERT_OK(s) ASSERT_PRED_FORMAT1(rocksdb::test::AssertStatus, s)
+#define ASSERT_NOK(s) ASSERT_FALSE((s).ok())
+#define EXPECT_OK(s) EXPECT_PRED_FORMAT1(rocksdb::test::AssertStatus, s)
+#define EXPECT_NOK(s) EXPECT_FALSE((s).ok())
 
 }  // namespace test
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/testutil.cc b/src/rocksdb/util/testutil.cc
index 13e781e..20f22c2 100644
--- a/src/rocksdb/util/testutil.cc
+++ b/src/rocksdb/util/testutil.cc
@@ -9,6 +9,7 @@
 
 #include "util/testutil.h"
 
+#include "port/port.h"
 #include "util/random.h"
 
 namespace rocksdb {
@@ -22,6 +23,15 @@ Slice RandomString(Random* rnd, int len, std::string* dst) {
   return Slice(*dst);
 }
 
+extern std::string RandomHumanReadableString(Random* rnd, int len) {
+  std::string ret;
+  ret.resize(len);
+  for (int i = 0; i < len; ++i) {
+    ret[i] = static_cast<char>('a' + rnd->Uniform(26));
+  }
+  return ret;
+}
+
 std::string RandomKey(Random* rnd, int len) {
   // Make sure to generate a wide variety of characters so we
   // test the boundary conditions for short-key optimizations.
@@ -52,5 +62,50 @@ extern Slice CompressibleString(Random* rnd, double compressed_fraction,
   return Slice(*dst);
 }
 
+namespace {
+class Uint64ComparatorImpl : public Comparator {
+ public:
+  Uint64ComparatorImpl() { }
+
+  virtual const char* Name() const override {
+    return "rocksdb.Uint64Comparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    assert(a.size() == sizeof(uint64_t) && b.size() == sizeof(uint64_t));
+    const uint64_t* left = reinterpret_cast<const uint64_t*>(a.data());
+    const uint64_t* right = reinterpret_cast<const uint64_t*>(b.data());
+    if (*left == *right) {
+      return 0;
+    } else if (*left < *right) {
+      return -1;
+    } else {
+      return 1;
+    }
+  }
+
+  virtual void FindShortestSeparator(std::string* start,
+      const Slice& limit) const override {
+    return;
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const override {
+    return;
+  }
+};
+}  // namespace
+
+static port::OnceType once = LEVELDB_ONCE_INIT;
+static const Comparator* uint64comp;
+
+static void InitModule() {
+  uint64comp = new Uint64ComparatorImpl;
+}
+
+const Comparator* Uint64Comparator() {
+  port::InitOnce(&once, InitModule);
+  return uint64comp;
+}
+
 }  // namespace test
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/testutil.h b/src/rocksdb/util/testutil.h
index 4fc8c0f..9584838 100644
--- a/src/rocksdb/util/testutil.h
+++ b/src/rocksdb/util/testutil.h
@@ -21,6 +21,8 @@ namespace test {
 // references the generated data.
 extern Slice RandomString(Random* rnd, int len, std::string* dst);
 
+extern std::string RandomHumanReadableString(Random* rnd, int len);
+
 // Return a random key with the specified length that may contain interesting
 // characters (e.g. \x00, \xff, etc.).
 extern std::string RandomKey(Random* rnd, int len);
@@ -43,7 +45,7 @@ class ErrorEnv : public EnvWrapper {
 
   virtual Status NewWritableFile(const std::string& fname,
                                  unique_ptr<WritableFile>* result,
-                                 const EnvOptions& soptions) {
+                                 const EnvOptions& soptions) override {
     result->reset();
     if (writable_file_error_) {
       ++num_writable_file_errors_;
@@ -76,5 +78,44 @@ class PlainInternalKeyComparator : public InternalKeyComparator {
   }
 };
 
+// A test comparator which compare two strings in this way:
+// (1) first compare prefix of 8 bytes in alphabet order,
+// (2) if two strings share the same prefix, sort the other part of the string
+//     in the reverse alphabet order.
+// This helps simulate the case of compounded key of [entity][timestamp] and
+// latest timestamp first.
+class SimpleSuffixReverseComparator : public Comparator {
+ public:
+  SimpleSuffixReverseComparator() {}
+
+  virtual const char* Name() const override {
+    return "SimpleSuffixReverseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    Slice prefix_a = Slice(a.data(), 8);
+    Slice prefix_b = Slice(b.data(), 8);
+    int prefix_comp = prefix_a.compare(prefix_b);
+    if (prefix_comp != 0) {
+      return prefix_comp;
+    } else {
+      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
+      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
+      return -(suffix_a.compare(suffix_b));
+    }
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override {}
+
+  virtual void FindShortSuccessor(std::string* key) const override {}
+};
+
+// Returns a user key comparator that can be used for comparing two uint64_t
+// slices. Instead of comparing slices byte-wise, it compares all the 8 bytes
+// at once. Assumes same endian-ness is used though the database's lifetime.
+// Symantics of comparison would differ from Bytewise comparator in little
+// endian machines.
+extern const Comparator* Uint64Comparator();
+
 }  // namespace test
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_list_test.cc b/src/rocksdb/util/thread_list_test.cc
new file mode 100644
index 0000000..eeb2b16
--- /dev/null
+++ b/src/rocksdb/util/thread_list_test.cc
@@ -0,0 +1,352 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <mutex>
+#include <condition_variable>
+
+#include "util/thread_status_updater.h"
+#include "util/testharness.h"
+#include "rocksdb/db.h"
+
+#if ROCKSDB_USING_THREAD_STATUS
+
+namespace rocksdb {
+
+class SimulatedBackgroundTask {
+ public:
+  SimulatedBackgroundTask(
+      const void* db_key, const std::string& db_name,
+      const void* cf_key, const std::string& cf_name,
+      const ThreadStatus::OperationType operation_type =
+          ThreadStatus::OP_UNKNOWN,
+      const ThreadStatus::StateType state_type =
+          ThreadStatus::STATE_UNKNOWN)
+      : db_key_(db_key), db_name_(db_name),
+        cf_key_(cf_key), cf_name_(cf_name),
+        operation_type_(operation_type), state_type_(state_type),
+        should_run_(true), running_count_(0) {
+    Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo(
+        db_key_, db_name_, cf_key_, cf_name_);
+  }
+
+  ~SimulatedBackgroundTask() {
+    Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_);
+  }
+
+  void Run() {
+    std::unique_lock<std::mutex> l(mutex_);
+    running_count_++;
+    Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_);
+    Env::Default()->GetThreadStatusUpdater()->SetThreadOperation(
+        operation_type_);
+    Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_);
+    while (should_run_) {
+      bg_cv_.wait(l);
+    }
+    Env::Default()->GetThreadStatusUpdater()->ClearThreadState();
+    Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation();
+    Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(0);
+    running_count_--;
+    bg_cv_.notify_all();
+  }
+
+  void FinishAllTasks() {
+    std::unique_lock<std::mutex> l(mutex_);
+    should_run_ = false;
+    bg_cv_.notify_all();
+  }
+
+  void WaitUntilScheduled(int job_count, Env* env) {
+    while (running_count_ < job_count) {
+      env->SleepForMicroseconds(1000);
+    }
+  }
+
+  void WaitUntilDone() {
+    std::unique_lock<std::mutex> l(mutex_);
+    while (running_count_ > 0) {
+      bg_cv_.wait(l);
+    }
+  }
+
+  static void DoSimulatedTask(void* arg) {
+    reinterpret_cast<SimulatedBackgroundTask*>(arg)->Run();
+  }
+
+ private:
+  const void* db_key_;
+  const std::string db_name_;
+  const void* cf_key_;
+  const std::string cf_name_;
+  const ThreadStatus::OperationType operation_type_;
+  const ThreadStatus::StateType state_type_;
+  std::mutex mutex_;
+  std::condition_variable bg_cv_;
+  bool should_run_;
+  std::atomic<int> running_count_;
+};
+
+class ThreadListTest : public testing::Test {
+ public:
+  ThreadListTest() {
+  }
+};
+
+TEST_F(ThreadListTest, GlobalTables) {
+  // verify the global tables for operations and states are properly indexed.
+  for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) {
+    ASSERT_EQ(global_operation_table[type].type, type);
+    ASSERT_EQ(global_operation_table[type].name,
+              ThreadStatus::GetOperationName(
+                  ThreadStatus::OperationType(type)));
+  }
+
+  for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) {
+    ASSERT_EQ(global_state_table[type].type, type);
+    ASSERT_EQ(global_state_table[type].name,
+              ThreadStatus::GetStateName(
+                  ThreadStatus::StateType(type)));
+  }
+
+  for (int stage = 0; stage != ThreadStatus::NUM_OP_STAGES; ++stage) {
+    ASSERT_EQ(global_op_stage_table[stage].stage, stage);
+    ASSERT_EQ(global_op_stage_table[stage].name,
+              ThreadStatus::GetOperationStageName(
+                  ThreadStatus::OperationStage(stage)));
+  }
+}
+
+TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) {
+  Env* env = Env::Default();
+  const int kHighPriorityThreads = 3;
+  const int kLowPriorityThreads = 5;
+  const int kSimulatedHighPriThreads = kHighPriorityThreads - 1;
+  const int kSimulatedLowPriThreads = kLowPriorityThreads / 3;
+  env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH);
+  env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW);
+
+  SimulatedBackgroundTask running_task(
+      reinterpret_cast<void*>(1234), "running",
+      reinterpret_cast<void*>(5678), "pikachu");
+
+  for (int test = 0; test < kSimulatedHighPriThreads; ++test) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+        &running_task, Env::Priority::HIGH);
+  }
+  for (int test = 0; test < kSimulatedLowPriThreads; ++test) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+        &running_task, Env::Priority::LOW);
+  }
+  running_task.WaitUntilScheduled(
+      kSimulatedHighPriThreads + kSimulatedLowPriThreads, env);
+
+  std::vector<ThreadStatus> thread_list;
+
+  // Verify the number of running threads in each pool.
+  env->GetThreadList(&thread_list);
+  int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0};
+  for (auto thread_status : thread_list) {
+    if (thread_status.cf_name == "pikachu" &&
+        thread_status.db_name == "running") {
+      running_count[thread_status.thread_type]++;
+    }
+  }
+  ASSERT_EQ(
+      running_count[ThreadStatus::HIGH_PRIORITY],
+      kSimulatedHighPriThreads);
+  ASSERT_EQ(
+      running_count[ThreadStatus::LOW_PRIORITY],
+      kSimulatedLowPriThreads);
+  ASSERT_EQ(
+      running_count[ThreadStatus::USER], 0);
+
+  running_task.FinishAllTasks();
+  running_task.WaitUntilDone();
+
+  // Verify none of the threads are running
+  env->GetThreadList(&thread_list);
+
+  for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) {
+    running_count[i] = 0;
+  }
+  for (auto thread_status : thread_list) {
+    if (thread_status.cf_name == "pikachu" &&
+        thread_status.db_name == "running") {
+      running_count[thread_status.thread_type]++;
+    }
+  }
+
+  ASSERT_EQ(
+      running_count[ThreadStatus::HIGH_PRIORITY], 0);
+  ASSERT_EQ(
+      running_count[ThreadStatus::LOW_PRIORITY], 0);
+  ASSERT_EQ(
+      running_count[ThreadStatus::USER], 0);
+}
+
+namespace {
+  void UpdateStatusCounts(
+      const std::vector<ThreadStatus>& thread_list,
+      int operation_counts[], int state_counts[]) {
+    for (auto thread_status : thread_list) {
+      operation_counts[thread_status.operation_type]++;
+      state_counts[thread_status.state_type]++;
+    }
+  }
+
+  void VerifyAndResetCounts(
+      const int correct_counts[], int collected_counts[], int size) {
+    for (int i = 0; i < size; ++i) {
+      ASSERT_EQ(collected_counts[i], correct_counts[i]);
+      collected_counts[i] = 0;
+    }
+  }
+
+  void UpdateCount(
+      int operation_counts[], int from_event, int to_event, int amount) {
+    operation_counts[from_event] -= amount;
+    operation_counts[to_event] += amount;
+  }
+}  // namespace
+
+TEST_F(ThreadListTest, SimpleEventTest) {
+  Env* env = Env::Default();
+
+  // simulated tasks
+  const int kFlushWriteTasks = 3;
+  SimulatedBackgroundTask flush_write_task(
+      reinterpret_cast<void*>(1234), "running",
+      reinterpret_cast<void*>(5678), "pikachu",
+      ThreadStatus::OP_FLUSH);
+
+  const int kCompactionWriteTasks = 4;
+  SimulatedBackgroundTask compaction_write_task(
+      reinterpret_cast<void*>(1234), "running",
+      reinterpret_cast<void*>(5678), "pikachu",
+      ThreadStatus::OP_COMPACTION);
+
+  const int kCompactionReadTasks = 5;
+  SimulatedBackgroundTask compaction_read_task(
+      reinterpret_cast<void*>(1234), "running",
+      reinterpret_cast<void*>(5678), "pikachu",
+      ThreadStatus::OP_COMPACTION);
+
+  const int kCompactionWaitTasks = 6;
+  SimulatedBackgroundTask compaction_wait_task(
+      reinterpret_cast<void*>(1234), "running",
+      reinterpret_cast<void*>(5678), "pikachu",
+      ThreadStatus::OP_COMPACTION);
+
+  // setup right answers
+  int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+  correct_operation_counts[ThreadStatus::OP_FLUSH] =
+      kFlushWriteTasks;
+  correct_operation_counts[ThreadStatus::OP_COMPACTION] =
+      kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks;
+
+  env->SetBackgroundThreads(
+      correct_operation_counts[ThreadStatus::OP_FLUSH], Env::HIGH);
+  env->SetBackgroundThreads(
+      correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW);
+
+  // schedule the simulated tasks
+  for (int t = 0; t < kFlushWriteTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+        &flush_write_task, Env::Priority::HIGH);
+  }
+  flush_write_task.WaitUntilScheduled(kFlushWriteTasks, env);
+
+  for (int t = 0; t < kCompactionWriteTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+        &compaction_write_task, Env::Priority::LOW);
+  }
+  compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks, env);
+
+  for (int t = 0; t < kCompactionReadTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+        &compaction_read_task, Env::Priority::LOW);
+  }
+  compaction_read_task.WaitUntilScheduled(kCompactionReadTasks, env);
+
+  for (int t = 0; t < kCompactionWaitTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+        &compaction_wait_task, Env::Priority::LOW);
+  }
+  compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks, env);
+
+  // verify the thread-status
+  int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+  int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0};
+
+  std::vector<ThreadStatus> thread_list;
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-wait tasks and see if the thread-status
+  // reflects this update
+  compaction_wait_task.FinishAllTasks();
+  compaction_wait_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate flush-write tasks and see if the thread-status
+  // reflects this update
+  flush_write_task.FinishAllTasks();
+  flush_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH,
+              ThreadStatus::OP_UNKNOWN, kFlushWriteTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-write tasks and see if the thread-status
+  // reflects this update
+  compaction_write_task.FinishAllTasks();
+  compaction_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-write tasks and see if the thread-status
+  // reflects this update
+  compaction_read_task.FinishAllTasks();
+  compaction_read_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionReadTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return 0;
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
diff --git a/src/rocksdb/util/thread_local.cc b/src/rocksdb/util/thread_local.cc
index bc8a4c7..af0c8e1 100644
--- a/src/rocksdb/util/thread_local.cc
+++ b/src/rocksdb/util/thread_local.cc
@@ -10,7 +10,7 @@
 #include "util/thread_local.h"
 #include "util/mutexlock.h"
 #include "port/likely.h"
-
+#include <stdlib.h>
 
 namespace rocksdb {
 
@@ -36,7 +36,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
   // Unref stored pointers of current thread from all instances
   uint32_t id = 0;
   for (auto& e : tls->entries) {
-    void* raw = e.ptr.load(std::memory_order_relaxed);
+    void* raw = e.ptr.load();
     if (raw != nullptr) {
       auto unref = inst->GetHandler(id);
       if (unref != nullptr) {
@@ -51,7 +51,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
 
 ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) {
   if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
-    throw std::runtime_error("pthread_key_create failed");
+    abort();
   }
   head_.next = &head_;
   head_.prev = &head_;
@@ -98,7 +98,7 @@ ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
         inst->RemoveThreadData(tls_);
       }
       delete tls_;
-      throw std::runtime_error("pthread_setspecific failed");
+      abort();
     }
   }
   return tls_;
@@ -109,7 +109,7 @@ void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
   if (UNLIKELY(id >= tls->entries.size())) {
     return nullptr;
   }
-  return tls->entries[id].ptr.load(std::memory_order_relaxed);
+  return tls->entries[id].ptr.load(std::memory_order_acquire);
 }
 
 void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
@@ -119,7 +119,7 @@ void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
     MutexLock l(&mutex_);
     tls->entries.resize(id + 1);
   }
-  tls->entries[id].ptr.store(ptr, std::memory_order_relaxed);
+  tls->entries[id].ptr.store(ptr, std::memory_order_release);
 }
 
 void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
@@ -129,7 +129,7 @@ void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
     MutexLock l(&mutex_);
     tls->entries.resize(id + 1);
   }
-  return tls->entries[id].ptr.exchange(ptr, std::memory_order_relaxed);
+  return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire);
 }
 
 bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
@@ -140,8 +140,8 @@ bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
     MutexLock l(&mutex_);
     tls->entries.resize(id + 1);
   }
-  return tls->entries[id].ptr.compare_exchange_strong(expected, ptr,
-      std::memory_order_relaxed, std::memory_order_relaxed);
+  return tls->entries[id].ptr.compare_exchange_strong(
+      expected, ptr, std::memory_order_release, std::memory_order_relaxed);
 }
 
 void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
@@ -150,7 +150,7 @@ void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
   for (ThreadData* t = head_.next; t != &head_; t = t->next) {
     if (id < t->entries.size()) {
       void* ptr =
-          t->entries[id].ptr.exchange(replacement, std::memory_order_relaxed);
+          t->entries[id].ptr.exchange(replacement, std::memory_order_acquire);
       if (ptr != nullptr) {
         ptrs->push_back(ptr);
       }
@@ -198,8 +198,7 @@ void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) {
   auto unref = GetHandler(id);
   for (ThreadData* t = head_.next; t != &head_; t = t->next) {
     if (id < t->entries.size()) {
-      void* ptr =
-          t->entries[id].ptr.exchange(nullptr, std::memory_order_relaxed);
+      void* ptr = t->entries[id].ptr.exchange(nullptr);
       if (ptr != nullptr && unref != nullptr) {
         unref(ptr);
       }
diff --git a/src/rocksdb/util/thread_local.h b/src/rocksdb/util/thread_local.h
index a037a9c..6884ed1 100644
--- a/src/rocksdb/util/thread_local.h
+++ b/src/rocksdb/util/thread_local.h
@@ -26,13 +26,14 @@ namespace rocksdb {
 // (2) a ThreadLocalPtr is destroyed
 typedef void (*UnrefHandler)(void* ptr);
 
-// Thread local storage that only stores value of pointer type. The storage
-// distinguish data coming from different thread and different ThreadLocalPtr
-// instances. For example, if a regular thread_local variable A is declared
-// in DBImpl, two DBImpl objects would share the same A. ThreadLocalPtr avoids
-// the confliction. The total storage size equals to # of threads * # of
-// ThreadLocalPtr instances. It is not efficient in terms of space, but it
-// should serve most of our use cases well and keep code simple.
+// ThreadLocalPtr stores only values of pointer type.  Different from
+// the usual thread-local-storage, ThreadLocalPtr has the ability to
+// distinguish data coming from different threads and different
+// ThreadLocalPtr instances.  For example, if a regular thread_local
+// variable A is declared in DBImpl, two DBImpl objects would share
+// the same A.  However, a ThreadLocalPtr that is defined under the
+// scope of DBImpl can avoid such confliction.  As a result, its memory
+// usage would be O(# of threads * # of ThreadLocalPtr instances).
 class ThreadLocalPtr {
  public:
   explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
diff --git a/src/rocksdb/util/thread_local_test.cc b/src/rocksdb/util/thread_local_test.cc
index 70dfa95..49e7775 100644
--- a/src/rocksdb/util/thread_local_test.cc
+++ b/src/rocksdb/util/thread_local_test.cc
@@ -14,7 +14,7 @@
 
 namespace rocksdb {
 
-class ThreadLocalTest {
+class ThreadLocalTest : public testing::Test {
  public:
   ThreadLocalTest() : env_(Env::Default()) {}
 
@@ -24,11 +24,11 @@ class ThreadLocalTest {
 namespace {
 
 struct Params {
-  Params(port::Mutex* m, port::CondVar* c, int* unref, int n,
+  Params(port::Mutex* m, port::CondVar* c, int* u, int n,
          UnrefHandler handler = nullptr)
       : mu(m),
         cv(c),
-        unref(unref),
+        unref(u),
         total(n),
         started(0),
         completed(0),
@@ -54,7 +54,7 @@ class IDChecker : public ThreadLocalPtr {
 
 }  // anonymous namespace
 
-TEST(ThreadLocalTest, UniqueIdTest) {
+TEST_F(ThreadLocalTest, UniqueIdTest) {
   port::Mutex mu;
   port::CondVar cv(&mu);
 
@@ -101,7 +101,7 @@ TEST(ThreadLocalTest, UniqueIdTest) {
   // 3, 1, 2, 0
 }
 
-TEST(ThreadLocalTest, SequentialReadWriteTest) {
+TEST_F(ThreadLocalTest, SequentialReadWriteTest) {
   // global id list carries over 3, 1, 2, 0
   ASSERT_EQ(IDChecker::PeekId(), 0u);
 
@@ -112,24 +112,24 @@ TEST(ThreadLocalTest, SequentialReadWriteTest) {
   p.tls2 = &tls2;
 
   auto func = [](void* ptr) {
-    auto& p = *static_cast<Params*>(ptr);
-
-    ASSERT_TRUE(p.tls1.Get() == nullptr);
-    p.tls1.Reset(reinterpret_cast<int*>(1));
-    ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<int*>(1));
-    p.tls1.Reset(reinterpret_cast<int*>(2));
-    ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<int*>(2));
-
-    ASSERT_TRUE(p.tls2->Get() == nullptr);
-    p.tls2->Reset(reinterpret_cast<int*>(1));
-    ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<int*>(1));
-    p.tls2->Reset(reinterpret_cast<int*>(2));
-    ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<int*>(2));
-
-    p.mu->Lock();
-    ++(p.completed);
-    p.cv->SignalAll();
-    p.mu->Unlock();
+    auto& params = *static_cast<Params*>(ptr);
+
+    ASSERT_TRUE(params.tls1.Get() == nullptr);
+    params.tls1.Reset(reinterpret_cast<int*>(1));
+    ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(1));
+    params.tls1.Reset(reinterpret_cast<int*>(2));
+    ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(2));
+
+    ASSERT_TRUE(params.tls2->Get() == nullptr);
+    params.tls2->Reset(reinterpret_cast<int*>(1));
+    ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(1));
+    params.tls2->Reset(reinterpret_cast<int*>(2));
+    ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(2));
+
+    params.mu->Lock();
+    ++(params.completed);
+    params.cv->SignalAll();
+    params.mu->Unlock();
   };
 
   for (int iter = 0; iter < 1024; ++iter) {
@@ -145,7 +145,7 @@ TEST(ThreadLocalTest, SequentialReadWriteTest) {
   }
 }
 
-TEST(ThreadLocalTest, ConcurrentReadWriteTest) {
+TEST_F(ThreadLocalTest, ConcurrentReadWriteTest) {
   // global id list carries over 3, 1, 2, 0
   ASSERT_EQ(IDChecker::PeekId(), 0u);
 
@@ -229,7 +229,7 @@ TEST(ThreadLocalTest, ConcurrentReadWriteTest) {
   ASSERT_EQ(IDChecker::PeekId(), 3u);
 }
 
-TEST(ThreadLocalTest, Unref) {
+TEST_F(ThreadLocalTest, Unref) {
   ASSERT_EQ(IDChecker::PeekId(), 0u);
 
   auto unref = [](void* ptr) {
@@ -372,7 +372,7 @@ TEST(ThreadLocalTest, Unref) {
   }
 }
 
-TEST(ThreadLocalTest, Swap) {
+TEST_F(ThreadLocalTest, Swap) {
   ThreadLocalPtr tls;
   tls.Reset(reinterpret_cast<void*>(1));
   ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(nullptr)), 1);
@@ -381,7 +381,7 @@ TEST(ThreadLocalTest, Swap) {
   ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(reinterpret_cast<void*>(3))), 2);
 }
 
-TEST(ThreadLocalTest, Scrape) {
+TEST_F(ThreadLocalTest, Scrape) {
   auto unref = [](void* ptr) {
     auto& p = *static_cast<Params*>(ptr);
     p.mu->Lock();
@@ -449,7 +449,7 @@ TEST(ThreadLocalTest, Scrape) {
   }
 }
 
-TEST(ThreadLocalTest, CompareAndSwap) {
+TEST_F(ThreadLocalTest, CompareAndSwap) {
   ThreadLocalPtr tls;
   ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(1)) == nullptr);
   void* expected = reinterpret_cast<void*>(1);
@@ -468,5 +468,6 @@ TEST(ThreadLocalTest, CompareAndSwap) {
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/util/thread_operation.h b/src/rocksdb/util/thread_operation.h
new file mode 100644
index 0000000..709e755
--- /dev/null
+++ b/src/rocksdb/util/thread_operation.h
@@ -0,0 +1,123 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file defines the structures for thread operation and state.
+// Thread operations are used to describe high level action of a
+// thread such as doing compaction or flush, while thread state
+// are used to describe lower-level action such as reading /
+// writing a file or waiting for a mutex.  Operations and states
+// are designed to be independent.  Typically, a thread usually involves
+// in one operation and one state at any specific point in time.
+
+#pragma once
+
+#include "include/rocksdb/thread_status.h"
+
+#include <string>
+
+namespace rocksdb {
+
+#if ROCKSDB_USING_THREAD_STATUS
+
+// The structure that describes a major thread operation.
+struct OperationInfo {
+  const ThreadStatus::OperationType type;
+  const std::string name;
+};
+
+// The global operation table.
+//
+// When updating a status of a thread, the pointer of the OperationInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+//
+// Note that it's not designed to be constant as in the future we
+// might consider adding global count to the OperationInfo.
+static OperationInfo global_operation_table[] = {
+  {ThreadStatus::OP_UNKNOWN, ""},
+  {ThreadStatus::OP_COMPACTION, "Compaction"},
+  {ThreadStatus::OP_FLUSH, "Flush"}
+};
+
+struct OperationStageInfo {
+  const ThreadStatus::OperationStage stage;
+  const std::string name;
+};
+
+// A table maintains the mapping from stage type to stage string.
+// Note that the string must be changed accordingly when the
+// associated function name changed.
+static OperationStageInfo global_op_stage_table[] = {
+  {ThreadStatus::STAGE_UNKNOWN, ""},
+  {ThreadStatus::STAGE_FLUSH_RUN,
+      "FlushJob::Run"},
+  {ThreadStatus::STAGE_FLUSH_WRITE_L0,
+      "FlushJob::WriteLevel0Table"},
+  {ThreadStatus::STAGE_COMPACTION_PREPARE,
+      "CompactionJob::Prepare"},
+  {ThreadStatus::STAGE_COMPACTION_RUN,
+      "CompactionJob::Run"},
+  {ThreadStatus::STAGE_COMPACTION_PROCESS_KV,
+      "CompactionJob::ProcessKeyValueCompaction"},
+  {ThreadStatus::STAGE_COMPACTION_FILTER_V2,
+      "CompactionJob::CallCompactionFilterV2"},
+  {ThreadStatus::STAGE_COMPACTION_INSTALL,
+      "CompactionJob::Install"},
+  {ThreadStatus::STAGE_COMPACTION_SYNC_FILE,
+      "CompactionJob::FinishCompactionOutputFile"},
+  {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH,
+      "MemTableList::PickMemtablesToFlush"},
+  {ThreadStatus::STAGE_MEMTABLE_ROLLBACK,
+      "MemTableList::RollbackMemtableFlush"},
+  {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+      "MemTableList::InstallMemtableFlushResults"},
+};
+
+// The structure that describes a state.
+struct StateInfo {
+  const ThreadStatus::StateType type;
+  const std::string name;
+};
+
+// The global state table.
+//
+// When updating a status of a thread, the pointer of the StateInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+static StateInfo global_state_table[] = {
+  {ThreadStatus::STATE_UNKNOWN, ""},
+  {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"},
+};
+
+struct OperationProperty {
+  int code;
+  std::string name;
+};
+
+static OperationProperty compaction_operation_properties[] = {
+  {ThreadStatus::COMPACTION_JOB_ID, "JobID"},
+  {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"},
+  {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"},
+  {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"},
+  {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"},
+  {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"},
+};
+
+static OperationProperty flush_operation_properties[] = {
+  {ThreadStatus::FLUSH_JOB_ID, "JobID"},
+  {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"},
+  {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"}
+};
+
+#else
+
+struct OperationInfo {
+};
+
+struct StateInfo {
+};
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_status_impl.cc b/src/rocksdb/util/thread_status_impl.cc
new file mode 100644
index 0000000..bd64d44
--- /dev/null
+++ b/src/rocksdb/util/thread_status_impl.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include <sstream>
+
+#include "rocksdb/env.h"
+#include "rocksdb/thread_status.h"
+#include "util/logging.h"
+#include "util/thread_operation.h"
+
+namespace rocksdb {
+
+#if ROCKSDB_USING_THREAD_STATUS
+const std::string& ThreadStatus::GetThreadTypeName(
+    ThreadStatus::ThreadType thread_type) {
+  static std::string thread_type_names[NUM_THREAD_TYPES + 1] = {
+      "High Pri", "Low Pri", "User", "Unknown"};
+  if (thread_type < 0 || thread_type >= NUM_THREAD_TYPES) {
+    return thread_type_names[NUM_THREAD_TYPES];  // "Unknown"
+  }
+  return thread_type_names[thread_type];
+}
+
+const std::string& ThreadStatus::GetOperationName(
+    ThreadStatus::OperationType op_type) {
+  if (op_type < 0 || op_type >= NUM_OP_TYPES) {
+    return global_operation_table[OP_UNKNOWN].name;
+  }
+  return global_operation_table[op_type].name;
+}
+
+const std::string& ThreadStatus::GetOperationStageName(
+    ThreadStatus::OperationStage stage) {
+  if (stage < 0 || stage >= NUM_OP_STAGES) {
+    return global_op_stage_table[STAGE_UNKNOWN].name;
+  }
+  return global_op_stage_table[stage].name;
+}
+
+const std::string& ThreadStatus::GetStateName(
+    ThreadStatus::StateType state_type) {
+  if (state_type < 0 || state_type >= NUM_STATE_TYPES) {
+    return global_state_table[STATE_UNKNOWN].name;
+  }
+  return global_state_table[state_type].name;
+}
+
+const std::string ThreadStatus::MicrosToString(uint64_t micros) {
+  if (micros == 0) {
+    return "";
+  }
+  const int kBufferLen = 100;
+  char buffer[kBufferLen];
+  AppendHumanMicros(micros, buffer, kBufferLen, false);
+  return std::string(buffer);
+}
+
+const std::string& ThreadStatus::GetOperationPropertyName(
+    ThreadStatus::OperationType op_type, int i) {
+  static const std::string empty_str = "";
+  switch (op_type) {
+    case ThreadStatus::OP_COMPACTION:
+      if (i >= NUM_COMPACTION_PROPERTIES) {
+        return empty_str;
+      }
+      return compaction_operation_properties[i].name;
+    case ThreadStatus::OP_FLUSH:
+      if (i >= NUM_FLUSH_PROPERTIES) {
+        return empty_str;
+      }
+      return flush_operation_properties[i].name;
+    default:
+      return empty_str;
+  }
+}
+
+std::map<std::string, uint64_t>
+    ThreadStatus::InterpretOperationProperties(
+    ThreadStatus::OperationType op_type,
+    const uint64_t* op_properties) {
+  int num_properties;
+  switch (op_type) {
+    case OP_COMPACTION:
+      num_properties = NUM_COMPACTION_PROPERTIES;
+      break;
+    case OP_FLUSH:
+      num_properties = NUM_FLUSH_PROPERTIES;
+      break;
+    default:
+      num_properties = 0;
+  }
+
+  std::map<std::string, uint64_t> property_map;
+  for (int i = 0; i < num_properties; ++i) {
+    if (op_type == OP_COMPACTION &&
+        i == COMPACTION_INPUT_OUTPUT_LEVEL) {
+      property_map.insert(
+          {"BaseInputLevel", op_properties[i] >> 32});
+      property_map.insert(
+          {"OutputLevel", op_properties[i] % (1ULL << 32)});
+    } else if (op_type == OP_COMPACTION &&
+               i == COMPACTION_PROP_FLAGS) {
+      property_map.insert(
+          {"IsManual", ((op_properties[i] & 2) >> 1)});
+      property_map.insert(
+          {"IsDeletion", ((op_properties[i] & 4) >> 2)});
+      property_map.insert(
+          {"IsTrivialMove", ((op_properties[i] & 8) >> 3)});
+    } else {
+      property_map.insert(
+          {GetOperationPropertyName(op_type, i), op_properties[i]});
+    }
+  }
+  return property_map;
+}
+
+
+#else
+
+const std::string& ThreadStatus::GetThreadTypeName(
+    ThreadStatus::ThreadType thread_type) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetOperationName(
+    ThreadStatus::OperationType op_type) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetOperationStageName(
+    ThreadStatus::OperationStage stage) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetStateName(
+    ThreadStatus::StateType state_type) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string ThreadStatus::MicrosToString(
+    uint64_t op_elapsed_time) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetOperationPropertyName(
+    ThreadStatus::OperationType op_type, int i) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+std::map<std::string, uint64_t>
+    ThreadStatus::InterpretOperationProperties(
+    ThreadStatus::OperationType op_type,
+    const uint64_t* op_properties) {
+  return std::map<std::string, uint64_t>();
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_status_updater.cc b/src/rocksdb/util/thread_status_updater.cc
new file mode 100644
index 0000000..31845cc
--- /dev/null
+++ b/src/rocksdb/util/thread_status_updater.cc
@@ -0,0 +1,343 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <memory>
+#include "rocksdb/env.h"
+#include "port/likely.h"
+#include "util/mutexlock.h"
+#include "util/thread_status_updater.h"
+
+namespace rocksdb {
+
+#if ROCKSDB_USING_THREAD_STATUS
+
+__thread ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = nullptr;
+
+void ThreadStatusUpdater::UnregisterThread() {
+  if (thread_status_data_ != nullptr) {
+    std::lock_guard<std::mutex> lck(thread_list_mutex_);
+    thread_data_set_.erase(thread_status_data_);
+    delete thread_status_data_;
+    thread_status_data_ = nullptr;
+  }
+}
+
+void ThreadStatusUpdater::SetThreadType(
+    ThreadStatus::ThreadType ttype) {
+  auto* data = InitAndGet();
+  data->thread_type.store(ttype, std::memory_order_relaxed);
+  ClearThreadOperationProperties();
+}
+
+void ThreadStatusUpdater::ResetThreadStatus() {
+  ClearThreadState();
+  ClearThreadOperation();
+  SetColumnFamilyInfoKey(nullptr);
+}
+
+void ThreadStatusUpdater::SetColumnFamilyInfoKey(
+    const void* cf_key) {
+  auto* data = InitAndGet();
+  // set the tracking flag based on whether cf_key is non-null or not.
+  // If enable_thread_tracking is set to false, the input cf_key
+  // would be nullptr.
+  data->enable_tracking = (cf_key != nullptr);
+  data->cf_key.store(cf_key, std::memory_order_relaxed);
+}
+
+const void* ThreadStatusUpdater::GetColumnFamilyInfoKey() {
+  auto* data = InitAndGet();
+  if (data->enable_tracking == false) {
+    return nullptr;
+  }
+  return data->cf_key.load(std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::SetThreadOperation(
+    const ThreadStatus::OperationType type) {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  // NOTE: Our practice here is to set all the thread operation properties
+  //       and stage before we set thread operation, and thread operation
+  //       will be set in std::memory_order_release.  This is to ensure
+  //       whenever a thread operation is not OP_UNKNOWN, we will always
+  //       have a consistent information on its properties.
+  data->operation_type.store(type, std::memory_order_release);
+  if (type == ThreadStatus::OP_UNKNOWN) {
+    data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN,
+        std::memory_order_relaxed);
+    ClearThreadOperationProperties();
+  }
+}
+
+void ThreadStatusUpdater::SetThreadOperationProperty(
+    int i, uint64_t value) {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  data->op_properties[i].store(value, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::IncreaseThreadOperationProperty(
+    int i, uint64_t delta) {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  data->op_properties[i].fetch_add(delta, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::SetOperationStartTime(const uint64_t start_time) {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  data->op_start_time.store(start_time, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::ClearThreadOperation() {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN,
+      std::memory_order_relaxed);
+  data->operation_type.store(
+      ThreadStatus::OP_UNKNOWN, std::memory_order_relaxed);
+  ClearThreadOperationProperties();
+}
+
+void ThreadStatusUpdater::ClearThreadOperationProperties() {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) {
+    data->op_properties[i].store(0, std::memory_order_relaxed);
+  }
+}
+
+ThreadStatus::OperationStage ThreadStatusUpdater::SetThreadOperationStage(
+    ThreadStatus::OperationStage stage) {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return ThreadStatus::STAGE_UNKNOWN;
+  }
+  return data->operation_stage.exchange(
+      stage, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::SetThreadState(
+    const ThreadStatus::StateType type) {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  data->state_type.store(type, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::ClearThreadState() {
+  auto* data = InitAndGet();
+  if (!data->enable_tracking) {
+    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+    return;
+  }
+  data->state_type.store(
+      ThreadStatus::STATE_UNKNOWN, std::memory_order_relaxed);
+}
+
+Status ThreadStatusUpdater::GetThreadList(
+    std::vector<ThreadStatus>* thread_list) {
+  thread_list->clear();
+  std::vector<std::shared_ptr<ThreadStatusData>> valid_list;
+  uint64_t now_micros = Env::Default()->NowMicros();
+
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+  for (auto* thread_data : thread_data_set_) {
+    assert(thread_data);
+    auto thread_type = thread_data->thread_type.load(
+        std::memory_order_relaxed);
+    // Since any change to cf_info_map requires thread_list_mutex,
+    // which is currently held by GetThreadList(), here we can safely
+    // use "memory_order_relaxed" to load the cf_key.
+    auto cf_key = thread_data->cf_key.load(
+        std::memory_order_relaxed);
+    auto iter = cf_info_map_.find(cf_key);
+    assert(cf_key == 0 || iter != cf_info_map_.end());
+    auto* cf_info = iter != cf_info_map_.end() ?
+        iter->second.get() : nullptr;
+    const std::string* db_name = nullptr;
+    const std::string* cf_name = nullptr;
+    ThreadStatus::OperationType op_type = ThreadStatus::OP_UNKNOWN;
+    ThreadStatus::OperationStage op_stage = ThreadStatus::STAGE_UNKNOWN;
+    ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN;
+    uint64_t op_elapsed_micros = 0;
+    uint64_t op_props[ThreadStatus::kNumOperationProperties] = {0};
+    if (cf_info != nullptr) {
+      db_name = &cf_info->db_name;
+      cf_name = &cf_info->cf_name;
+      op_type = thread_data->operation_type.load(
+          std::memory_order_acquire);
+      // display lower-level info only when higher-level info is available.
+      if (op_type != ThreadStatus::OP_UNKNOWN) {
+        op_elapsed_micros = now_micros - thread_data->op_start_time.load(
+            std::memory_order_relaxed);
+        op_stage = thread_data->operation_stage.load(
+            std::memory_order_relaxed);
+        state_type = thread_data->state_type.load(
+            std::memory_order_relaxed);
+        for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) {
+          op_props[i] = thread_data->op_properties[i].load(
+              std::memory_order_relaxed);
+        }
+      }
+    }
+    thread_list->emplace_back(
+        thread_data->thread_id, thread_type,
+        db_name ? *db_name : "",
+        cf_name ? *cf_name : "",
+        op_type, op_elapsed_micros, op_stage, op_props,
+        state_type);
+  }
+
+  return Status::OK();
+}
+
+ThreadStatusData* ThreadStatusUpdater::InitAndGet() {
+  if (UNLIKELY(thread_status_data_ == nullptr)) {
+    thread_status_data_ = new ThreadStatusData();
+    thread_status_data_->thread_id = reinterpret_cast<uint64_t>(
+        thread_status_data_);
+    std::lock_guard<std::mutex> lck(thread_list_mutex_);
+    thread_data_set_.insert(thread_status_data_);
+  }
+  return thread_status_data_;
+}
+
+void ThreadStatusUpdater::NewColumnFamilyInfo(
+    const void* db_key, const std::string& db_name,
+    const void* cf_key, const std::string& cf_name) {
+  // Acquiring same lock as GetThreadList() to guarantee
+  // a consistent view of global column family table (cf_info_map).
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+
+  cf_info_map_[cf_key].reset(
+      new ConstantColumnFamilyInfo(db_key, db_name, cf_name));
+  db_key_map_[db_key].insert(cf_key);
+}
+
+void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) {
+  // Acquiring same lock as GetThreadList() to guarantee
+  // a consistent view of global column family table (cf_info_map).
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+  auto cf_pair = cf_info_map_.find(cf_key);
+  assert(cf_pair != cf_info_map_.end());
+
+  auto* cf_info = cf_pair->second.get();
+  assert(cf_info);
+
+  // Remove its entry from db_key_map_ by the following steps:
+  // 1. Obtain the entry in db_key_map_ whose set contains cf_key
+  // 2. Remove it from the set.
+  auto db_pair = db_key_map_.find(cf_info->db_key);
+  assert(db_pair != db_key_map_.end());
+  size_t result __attribute__((unused)) = db_pair->second.erase(cf_key);
+  assert(result);
+
+  cf_pair->second.reset();
+  result = cf_info_map_.erase(cf_key);
+  assert(result);
+}
+
+void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) {
+  // Acquiring same lock as GetThreadList() to guarantee
+  // a consistent view of global column family table (cf_info_map).
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+  auto db_pair = db_key_map_.find(db_key);
+  if (UNLIKELY(db_pair == db_key_map_.end())) {
+    // In some occasional cases such as DB::Open fails, we won't
+    // register ColumnFamilyInfo for a db.
+    return;
+  }
+
+  size_t result __attribute__((unused)) = 0;
+  for (auto cf_key : db_pair->second) {
+    auto cf_pair = cf_info_map_.find(cf_key);
+    assert(cf_pair != cf_info_map_.end());
+    cf_pair->second.reset();
+    result = cf_info_map_.erase(cf_key);
+    assert(result);
+  }
+  db_key_map_.erase(db_key);
+}
+
+#else
+
+void ThreadStatusUpdater::UnregisterThread() {
+}
+
+void ThreadStatusUpdater::ResetThreadStatus() {
+}
+
+void ThreadStatusUpdater::SetThreadType(
+    ThreadStatus::ThreadType ttype) {
+}
+
+void ThreadStatusUpdater::SetColumnFamilyInfoKey(
+    const void* cf_key) {
+}
+
+void ThreadStatusUpdater::SetThreadOperation(
+    const ThreadStatus::OperationType type) {
+}
+
+void ThreadStatusUpdater::ClearThreadOperation() {
+}
+
+void ThreadStatusUpdater::SetThreadState(
+    const ThreadStatus::StateType type) {
+}
+
+void ThreadStatusUpdater::ClearThreadState() {
+}
+
+Status ThreadStatusUpdater::GetThreadList(
+    std::vector<ThreadStatus>* thread_list) {
+  return Status::NotSupported(
+      "GetThreadList is not supported in the current running environment.");
+}
+
+void ThreadStatusUpdater::NewColumnFamilyInfo(
+    const void* db_key, const std::string& db_name,
+    const void* cf_key, const std::string& cf_name) {
+}
+
+void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) {
+}
+
+void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) {
+}
+
+void ThreadStatusUpdater::SetThreadOperationProperty(
+    int i, uint64_t value) {
+}
+
+void ThreadStatusUpdater::IncreaseThreadOperationProperty(
+    int i, uint64_t delta) {
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_status_updater.h b/src/rocksdb/util/thread_status_updater.h
new file mode 100644
index 0000000..b511a8d
--- /dev/null
+++ b/src/rocksdb/util/thread_status_updater.h
@@ -0,0 +1,225 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// The implementation of ThreadStatus.
+//
+// Note that we make get and set access to ThreadStatusData lockless.
+// As a result, ThreadStatusData as a whole is not atomic.  However,
+// we guarantee consistent ThreadStatusData all the time whenever
+// user call GetThreadList().  This consistency guarantee is done
+// by having the following constraint in the internal implementation
+// of set and get order:
+//
+// 1. When reset any information in ThreadStatusData, always start from
+//    clearing up the lower-level information first.
+// 2. When setting any information in ThreadStatusData, always start from
+//    setting the higher-level information.
+// 3. When returning ThreadStatusData to the user, fields are fetched from
+//    higher-level to lower-level.  In addition, where there's a nullptr
+//    in one field, then all fields that has lower-level than that field
+//    should be ignored.
+//
+// The high to low level information would be:
+// thread_id > thread_type > db > cf > operation > state
+//
+// This means user might not always get full information, but whenever
+// returned by the GetThreadList() is guaranteed to be consistent.
+#pragma once
+#include <atomic>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/status.h"
+#include "rocksdb/thread_status.h"
+#include "port/port_posix.h"
+#include "util/thread_operation.h"
+
+namespace rocksdb {
+
+class ColumnFamilyHandle;
+
+// The structure that keeps constant information about a column family.
+struct ConstantColumnFamilyInfo {
+#if ROCKSDB_USING_THREAD_STATUS
+ public:
+  ConstantColumnFamilyInfo(
+      const void* _db_key,
+      const std::string& _db_name,
+      const std::string& _cf_name) :
+      db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {}
+  const void* db_key;
+  const std::string db_name;
+  const std::string cf_name;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+};
+
+// the internal data-structure that is used to reflect the current
+// status of a thread using a set of atomic pointers.
+struct ThreadStatusData {
+#if ROCKSDB_USING_THREAD_STATUS
+  explicit ThreadStatusData() : thread_id(0), enable_tracking(false) {
+    thread_type.store(ThreadStatus::USER);
+    cf_key.store(nullptr);
+    operation_type.store(ThreadStatus::OP_UNKNOWN);
+    op_start_time.store(0);
+    state_type.store(ThreadStatus::STATE_UNKNOWN);
+  }
+
+  uint64_t thread_id;
+
+  // A flag to indicate whether the thread tracking is enabled
+  // in the current thread.  This value will be updated based on whether
+  // the associated Options::enable_thread_tracking is set to true
+  // in ThreadStatusUtil::SetColumnFamily().
+  //
+  // If set to false, then SetThreadOperation and SetThreadState
+  // will be no-op.
+  bool enable_tracking;
+
+  std::atomic<ThreadStatus::ThreadType> thread_type;
+  std::atomic<const void*> cf_key;
+  std::atomic<ThreadStatus::OperationType> operation_type;
+  std::atomic<uint64_t> op_start_time;
+  std::atomic<ThreadStatus::OperationStage> operation_stage;
+  std::atomic<uint64_t> op_properties[ThreadStatus::kNumOperationProperties];
+  std::atomic<ThreadStatus::StateType> state_type;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+};
+
+// The class that stores and updates the status of the current thread
+// using a thread-local ThreadStatusData.
+//
+// In most of the case, you should use ThreadStatusUtil to update
+// the status of the current thread instead of using ThreadSatusUpdater
+// directly.
+//
+// @see ThreadStatusUtil
+class ThreadStatusUpdater {
+ public:
+  ThreadStatusUpdater() {}
+
+  // Releases all ThreadStatusData of all active threads.
+  virtual ~ThreadStatusUpdater() {}
+
+  // Unregister the current thread.
+  void UnregisterThread();
+
+  // Reset the status of the current thread.  This includes resetting
+  // ColumnFamilyInfoKey, ThreadOperation, and ThreadState.
+  void ResetThreadStatus();
+
+  // Set the thread type of the current thread.
+  void SetThreadType(ThreadStatus::ThreadType ttype);
+
+  // Update the column-family info of the current thread by setting
+  // its thread-local pointer of ThreadStateInfo to the correct entry.
+  void SetColumnFamilyInfoKey(const void* cf_key);
+
+  // returns the column family info key.
+  const void* GetColumnFamilyInfoKey();
+
+  // Update the thread operation of the current thread.
+  void SetThreadOperation(const ThreadStatus::OperationType type);
+
+  // The start time of the current thread operation.  It is in the format
+  // of micro-seconds since some fixed point in time.
+  void SetOperationStartTime(const uint64_t start_time);
+
+  // Set the "i"th property of the current operation.
+  //
+  // NOTE: Our practice here is to set all the thread operation properties
+  //       and stage before we set thread operation, and thread operation
+  //       will be set in std::memory_order_release.  This is to ensure
+  //       whenever a thread operation is not OP_UNKNOWN, we will always
+  //       have a consistent information on its properties.
+  void SetThreadOperationProperty(
+      int i, uint64_t value);
+
+  // Increase the "i"th property of the current operation with
+  // the specified delta.
+  void IncreaseThreadOperationProperty(
+      int i, uint64_t delta);
+
+  // Update the thread operation stage of the current thread.
+  ThreadStatus::OperationStage SetThreadOperationStage(
+      const ThreadStatus::OperationStage stage);
+
+  // Clear thread operation of the current thread.
+  void ClearThreadOperation();
+
+  // Reset all thread-operation-properties to 0.
+  void ClearThreadOperationProperties();
+
+  // Update the thread state of the current thread.
+  void SetThreadState(const ThreadStatus::StateType type);
+
+  // Clear the thread state of the current thread.
+  void ClearThreadState();
+
+  // Obtain the status of all active registered threads.
+  Status GetThreadList(
+      std::vector<ThreadStatus>* thread_list);
+
+  // Create an entry in the global ColumnFamilyInfo table for the
+  // specified column family.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  void NewColumnFamilyInfo(
+      const void* db_key, const std::string& db_name,
+      const void* cf_key, const std::string& cf_name);
+
+  // Erase all ConstantColumnFamilyInfo that is associated with the
+  // specified db instance.  This function should be called only when
+  // the current thread does not hold db_mutex.
+  void EraseDatabaseInfo(const void* db_key);
+
+  // Erase the ConstantColumnFamilyInfo that is associated with the
+  // specified ColumnFamilyData.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  void EraseColumnFamilyInfo(const void* cf_key);
+
+  // Verifies whether the input ColumnFamilyHandles matches
+  // the information stored in the current cf_info_map.
+  void TEST_VerifyColumnFamilyInfoMap(
+      const std::vector<ColumnFamilyHandle*>& handles,
+      bool check_exist);
+
+ protected:
+#if ROCKSDB_USING_THREAD_STATUS
+  // The thread-local variable for storing thread status.
+  static __thread ThreadStatusData* thread_status_data_;
+
+  // Obtain the pointer to the thread status data.  It also performs
+  // initialization when necessary.
+  ThreadStatusData* InitAndGet();
+
+  // The mutex that protects cf_info_map and db_key_map.
+  std::mutex thread_list_mutex_;
+
+  // The current status data of all active threads.
+  std::unordered_set<ThreadStatusData*> thread_data_set_;
+
+  // A global map that keeps the column family information.  It is stored
+  // globally instead of inside DB is to avoid the situation where DB is
+  // closing while GetThreadList function already get the pointer to its
+  // CopnstantColumnFamilyInfo.
+  std::unordered_map<
+      const void*, std::unique_ptr<ConstantColumnFamilyInfo>> cf_info_map_;
+
+  // A db_key to cf_key map that allows erasing elements in cf_info_map
+  // associated to the same db_key faster.
+  std::unordered_map<
+      const void*, std::unordered_set<const void*>> db_key_map_;
+
+#else
+  static ThreadStatusData* thread_status_data_;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_status_updater_debug.cc b/src/rocksdb/util/thread_status_updater_debug.cc
new file mode 100644
index 0000000..274f427
--- /dev/null
+++ b/src/rocksdb/util/thread_status_updater_debug.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <mutex>
+
+#include "util/thread_status_updater.h"
+#include "db/column_family.h"
+
+namespace rocksdb {
+
+#ifndef NDEBUG
+#if ROCKSDB_USING_THREAD_STATUS
+void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
+    const std::vector<ColumnFamilyHandle*>& handles,
+    bool check_exist) {
+  std::unique_lock<std::mutex> lock(thread_list_mutex_);
+  if (check_exist) {
+    assert(cf_info_map_.size() == handles.size());
+  }
+  for (auto* handle : handles) {
+    auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(handle)->cfd();
+    auto iter __attribute__((unused)) = cf_info_map_.find(cfd);
+    if (check_exist) {
+      assert(iter != cf_info_map_.end());
+      assert(iter->second);
+      assert(iter->second->cf_name == cfd->GetName());
+    } else {
+      assert(iter == cf_info_map_.end());
+    }
+  }
+}
+
+#else
+
+void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
+    const std::vector<ColumnFamilyHandle*>& handles,
+    bool check_exist) {
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NDEBUG
+
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_status_util.cc b/src/rocksdb/util/thread_status_util.cc
new file mode 100644
index 0000000..c498971
--- /dev/null
+++ b/src/rocksdb/util/thread_status_util.cc
@@ -0,0 +1,213 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/env.h"
+#include "util/thread_status_updater.h"
+#include "util/thread_status_util.h"
+
+namespace rocksdb {
+
+
+#if ROCKSDB_USING_THREAD_STATUS
+__thread ThreadStatusUpdater*
+    ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
+__thread bool ThreadStatusUtil::thread_updater_initialized_ = false;
+
+void ThreadStatusUtil::SetThreadType(
+    const Env* env, ThreadStatus::ThreadType thread_type) {
+  if (!MaybeInitThreadLocalUpdater(env)) {
+    return;
+  }
+  assert(thread_updater_local_cache_);
+  thread_updater_local_cache_->SetThreadType(thread_type);
+}
+
+void ThreadStatusUtil::UnregisterThread() {
+  thread_updater_initialized_ = false;
+  if (thread_updater_local_cache_ != nullptr) {
+    thread_updater_local_cache_->UnregisterThread();
+    thread_updater_local_cache_ = nullptr;
+  }
+}
+
+void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) {
+  if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) {
+    return;
+  }
+  assert(thread_updater_local_cache_);
+  if (cfd != nullptr && cfd->options()->enable_thread_tracking) {
+    thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd);
+  } else {
+    // When cfd == nullptr or enable_thread_tracking == false, we set
+    // ColumnFamilyInfoKey to nullptr, which makes SetThreadOperation
+    // and SetThreadState become no-op.
+    thread_updater_local_cache_->SetColumnFamilyInfoKey(nullptr);
+  }
+}
+
+void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  if (op != ThreadStatus::OP_UNKNOWN) {
+    uint64_t current_time = Env::Default()->NowMicros();
+    thread_updater_local_cache_->SetOperationStartTime(current_time);
+  } else {
+    // TDOO(yhchiang): we could report the time when we set operation to
+    // OP_UNKNOWN once the whole instrumentation has been done.
+    thread_updater_local_cache_->SetOperationStartTime(0);
+  }
+  thread_updater_local_cache_->SetThreadOperation(op);
+}
+
+ThreadStatus::OperationStage ThreadStatusUtil::SetThreadOperationStage(
+    ThreadStatus::OperationStage stage) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return ThreadStatus::STAGE_UNKNOWN;
+  }
+
+  return thread_updater_local_cache_->SetThreadOperationStage(stage);
+}
+
+void ThreadStatusUtil::SetThreadOperationProperty(
+    int code, uint64_t value) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  thread_updater_local_cache_->SetThreadOperationProperty(
+      code, value);
+}
+
+void ThreadStatusUtil::IncreaseThreadOperationProperty(
+    int code, uint64_t delta) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  thread_updater_local_cache_->IncreaseThreadOperationProperty(
+      code, delta);
+}
+
+void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  thread_updater_local_cache_->SetThreadState(state);
+}
+
+void ThreadStatusUtil::ResetThreadStatus() {
+  if (thread_updater_local_cache_ == nullptr) {
+    return;
+  }
+  thread_updater_local_cache_->ResetThreadStatus();
+}
+
+void ThreadStatusUtil::NewColumnFamilyInfo(
+    const DB* db, const ColumnFamilyData* cfd) {
+  if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) {
+    return;
+  }
+  assert(thread_updater_local_cache_);
+  if (thread_updater_local_cache_) {
+    thread_updater_local_cache_->NewColumnFamilyInfo(
+        db, db->GetName(), cfd, cfd->GetName());
+  }
+}
+
+void ThreadStatusUtil::EraseColumnFamilyInfo(
+    const ColumnFamilyData* cfd) {
+  if (thread_updater_local_cache_ == nullptr) {
+    return;
+  }
+  thread_updater_local_cache_->EraseColumnFamilyInfo(cfd);
+}
+
+void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) {
+  if (thread_updater_local_cache_ == nullptr) {
+    return;
+  }
+  thread_updater_local_cache_->EraseDatabaseInfo(db);
+}
+
+bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) {
+  if (!thread_updater_initialized_ && env != nullptr) {
+    thread_updater_initialized_ = true;
+    thread_updater_local_cache_ = env->GetThreadStatusUpdater();
+  }
+  return (thread_updater_local_cache_ != nullptr);
+}
+
+AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater(
+    ThreadStatus::OperationStage stage) {
+  prev_stage_ = ThreadStatusUtil::SetThreadOperationStage(stage);
+}
+
+AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {
+  ThreadStatusUtil::SetThreadOperationStage(prev_stage_);
+}
+
+#else
+
+ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
+bool ThreadStatusUtil::thread_updater_initialized_ = false;
+
+bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) {
+  return false;
+}
+
+void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) {
+}
+
+void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) {
+}
+
+void ThreadStatusUtil::SetThreadOperationProperty(
+    int code, uint64_t value) {
+}
+
+void ThreadStatusUtil::IncreaseThreadOperationProperty(
+    int code, uint64_t delta) {
+}
+
+void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) {
+}
+
+void ThreadStatusUtil::NewColumnFamilyInfo(
+    const DB* db, const ColumnFamilyData* cfd) {
+}
+
+void ThreadStatusUtil::EraseColumnFamilyInfo(
+    const ColumnFamilyData* cfd) {
+}
+
+void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) {
+}
+
+void ThreadStatusUtil::ResetThreadStatus() {
+}
+
+AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater(
+    ThreadStatus::OperationStage stage) {
+}
+
+AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_status_util.h b/src/rocksdb/util/thread_status_util.h
new file mode 100644
index 0000000..ba0238d
--- /dev/null
+++ b/src/rocksdb/util/thread_status_util.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include "db/column_family.h"
+#include "rocksdb/env.h"
+#include "rocksdb/thread_status.h"
+#include "util/thread_status_updater.h"
+
+namespace rocksdb {
+class ColumnFamilyData;
+
+
+// The static utility class for updating thread-local status.
+//
+// The thread-local status is updated via the thread-local cached
+// pointer thread_updater_local_cache_.  During each function call,
+// when ThreadStatusUtil finds thread_updater_local_cache_ is
+// left uninitialized (determined by thread_updater_initialized_),
+// it will tries to initialize it using the return value of
+// Env::GetThreadStatusUpdater().  When thread_updater_local_cache_
+// is initialized by a non-null pointer, each function call will
+// then update the status of the current thread.  Otherwise,
+// all function calls to ThreadStatusUtil will be no-op.
+class ThreadStatusUtil {
+ public:
+  // Set the thread type of the current thread.
+  static void SetThreadType(
+      const Env* env, ThreadStatus::ThreadType thread_type);
+
+  // Unregister the current thread.
+  static void UnregisterThread();
+
+  // Create an entry in the global ColumnFamilyInfo table for the
+  // specified column family.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  static void NewColumnFamilyInfo(
+      const DB* db, const ColumnFamilyData* cfd);
+
+  // Erase the ConstantColumnFamilyInfo that is associated with the
+  // specified ColumnFamilyData.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  static void EraseColumnFamilyInfo(const ColumnFamilyData* cfd);
+
+  // Erase all ConstantColumnFamilyInfo that is associated with the
+  // specified db instance.  This function should be called only when
+  // the current thread does not hold db_mutex.
+  static void EraseDatabaseInfo(const DB* db);
+
+  // Update the thread status to indicate the current thread is doing
+  // something related to the specified column family.
+  static void SetColumnFamily(const ColumnFamilyData* cfd);
+
+  static void SetThreadOperation(ThreadStatus::OperationType type);
+
+  static ThreadStatus::OperationStage SetThreadOperationStage(
+      ThreadStatus::OperationStage stage);
+
+  static void SetThreadOperationProperty(
+      int code, uint64_t value);
+
+  static void IncreaseThreadOperationProperty(
+      int code, uint64_t delta);
+
+  static void SetThreadState(ThreadStatus::StateType type);
+
+  static void ResetThreadStatus();
+
+#ifndef NDEBUG
+  static void TEST_SetStateDelay(
+      const ThreadStatus::StateType state, int micro);
+  static void TEST_StateDelay(const ThreadStatus::StateType state);
+#endif
+
+ protected:
+  // Initialize the thread-local ThreadStatusUpdater when it finds
+  // the cached value is nullptr.  Returns true if it has cached
+  // a non-null pointer.
+  static bool MaybeInitThreadLocalUpdater(const Env* env);
+
+#if ROCKSDB_USING_THREAD_STATUS
+  // A boolean flag indicating whether thread_updater_local_cache_
+  // is initialized.  It is set to true when an Env uses any
+  // ThreadStatusUtil functions using the current thread other
+  // than UnregisterThread().  It will be set to false when
+  // UnregisterThread() is called.
+  //
+  // When this variable is set to true, thread_updater_local_cache_
+  // will not be updated until this variable is again set to false
+  // in UnregisterThread().
+  static  __thread bool thread_updater_initialized_;
+
+  // The thread-local cached ThreadStatusUpdater that caches the
+  // thread_status_updater_ of the first Env that uses any ThreadStatusUtil
+  // function other than UnregisterThread().  This variable will
+  // be cleared when UnregisterThread() is called.
+  //
+  // When this variable is set to a non-null pointer, then the status
+  // of the current thread will be updated when a function of
+  // ThreadStatusUtil is called.  Otherwise, all functions of
+  // ThreadStatusUtil will be no-op.
+  //
+  // When thread_updater_initialized_ is set to true, this variable
+  // will not be updated until this thread_updater_initialized_ is
+  // again set to false in UnregisterThread().
+  static __thread ThreadStatusUpdater* thread_updater_local_cache_;
+#else
+  static bool thread_updater_initialized_;
+  static ThreadStatusUpdater* thread_updater_local_cache_;
+#endif
+};
+
+// A helper class for updating thread state.  It will set the
+// thread state according to the input parameter in its constructor
+// and set the thread state to the previous state in its destructor.
+class AutoThreadOperationStageUpdater {
+ public:
+  explicit AutoThreadOperationStageUpdater(
+      ThreadStatus::OperationStage stage);
+  ~AutoThreadOperationStageUpdater();
+
+#if ROCKSDB_USING_THREAD_STATUS
+ private:
+  ThreadStatus::OperationStage prev_stage_;
+#endif
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_status_util_debug.cc b/src/rocksdb/util/thread_status_util_debug.cc
new file mode 100644
index 0000000..94b19f3
--- /dev/null
+++ b/src/rocksdb/util/thread_status_util_debug.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <atomic>
+
+#include "rocksdb/env.h"
+#include "util/thread_status_updater.h"
+#include "util/thread_status_util.h"
+
+namespace rocksdb {
+
+#ifndef NDEBUG
+// the delay for debugging purpose.
+static std::atomic<int> states_delay[ThreadStatus::NUM_STATE_TYPES];
+
+void ThreadStatusUtil::TEST_SetStateDelay(
+    const ThreadStatus::StateType state, int micro) {
+  states_delay[state].store(micro, std::memory_order_relaxed);
+}
+
+void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) {
+  auto delay = states_delay[state].load(std::memory_order_relaxed);
+  if (delay > 0) {
+    Env::Default()->SleepForMicroseconds(delay);
+  }
+}
+
+#endif  // !NDEBUG
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/vectorrep.cc b/src/rocksdb/util/vectorrep.cc
index c7f9cca..4e4827a 100644
--- a/src/rocksdb/util/vectorrep.cc
+++ b/src/rocksdb/util/vectorrep.cc
@@ -25,7 +25,8 @@ using namespace stl_wrappers;
 
 class VectorRep : public MemTableRep {
  public:
-  VectorRep(const KeyComparator& compare, Arena* arena, size_t count);
+  VectorRep(const KeyComparator& compare, MemTableAllocator* allocator,
+            size_t count);
 
   // Insert key into the collection. (The caller will pack key and value into a
   // single buffer and pass that in as the parameter to Insert)
@@ -91,11 +92,8 @@ class VectorRep : public MemTableRep {
     virtual void SeekToLast() override;
   };
 
-  // Unhide default implementations of GetIterator()
-  using MemTableRep::GetIterator;
-
   // Return an iterator over the keys in this representation.
-  virtual MemTableRep::Iterator* GetIterator() override;
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena) override;
 
  private:
   friend class Iterator;
@@ -109,7 +107,6 @@ class VectorRep : public MemTableRep {
 
 void VectorRep::Insert(KeyHandle handle) {
   auto* key = static_cast<char*>(handle);
-  assert(!Contains(key));
   WriteLock l(&rwlock_);
   assert(!immutable_);
   bucket_->push_back(key);
@@ -135,8 +132,9 @@ size_t VectorRep::ApproximateMemoryUsage() {
     );
 }
 
-VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count)
-  : MemTableRep(arena),
+VectorRep::VectorRep(const KeyComparator& compare, MemTableAllocator* allocator,
+                     size_t count)
+  : MemTableRep(allocator),
     bucket_(new Bucket()),
     immutable_(false),
     sorted_(false),
@@ -180,14 +178,14 @@ bool VectorRep::Iterator::Valid() const {
 // Returns the key at the current position.
 // REQUIRES: Valid()
 const char* VectorRep::Iterator::key() const {
-  assert(Valid());
+  assert(sorted_);
   return *cit_;
 }
 
 // Advances to the next position.
 // REQUIRES: Valid()
 void VectorRep::Iterator::Next() {
-  assert(Valid());
+  assert(sorted_);
   if (cit_ == bucket_->end()) {
     return;
   }
@@ -197,7 +195,7 @@ void VectorRep::Iterator::Next() {
 // Advances to the previous position.
 // REQUIRES: Valid()
 void VectorRep::Iterator::Prev() {
-  assert(Valid());
+  assert(sorted_);
   if (cit_ == bucket_->begin()) {
     // If you try to go back from the first element, the iterator should be
     // invalidated. So we set it to past-the-end. This means that you can
@@ -252,31 +250,43 @@ void VectorRep::Get(const LookupKey& k, void* callback_args,
     bucket.reset(new Bucket(*bucket_));  // make a copy
   }
   VectorRep::Iterator iter(vector_rep, immutable_ ? bucket_ : bucket, compare_);
-  rwlock_.Unlock();
+  rwlock_.ReadUnlock();
 
   for (iter.Seek(k.user_key(), k.memtable_key().data());
        iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) {
   }
 }
 
-MemTableRep::Iterator* VectorRep::GetIterator() {
+MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) {
+  char* mem = nullptr;
+  if (arena != nullptr) {
+    mem = arena->AllocateAligned(sizeof(Iterator));
+  }
   ReadLock l(&rwlock_);
   // Do not sort here. The sorting would be done the first time
   // a Seek is performed on the iterator.
   if (immutable_) {
-    return new Iterator(this, bucket_, compare_);
+    if (arena == nullptr) {
+      return new Iterator(this, bucket_, compare_);
+    } else {
+      return new (mem) Iterator(this, bucket_, compare_);
+    }
   } else {
     std::shared_ptr<Bucket> tmp;
     tmp.reset(new Bucket(*bucket_)); // make a copy
-    return new Iterator(nullptr, tmp, compare_);
+    if (arena == nullptr) {
+      return new Iterator(nullptr, tmp, compare_);
+    } else {
+      return new (mem) Iterator(nullptr, tmp, compare_);
+    }
   }
 }
 } // anon namespace
 
 MemTableRep* VectorRepFactory::CreateMemTableRep(
-    const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform*) {
-  return new VectorRep(compare, arena, count_);
+    const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
+    const SliceTransform*, Logger* logger) {
+  return new VectorRep(compare, allocator, count_);
 }
 } // namespace rocksdb
 #endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/xfunc.cc b/src/rocksdb/util/xfunc.cc
new file mode 100644
index 0000000..c5d6b5a
--- /dev/null
+++ b/src/rocksdb/util/xfunc.cc
@@ -0,0 +1,69 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifdef XFUNC
+#include <string>
+#include "db/db_impl.h"
+#include "db/managed_iterator.h"
+#include "rocksdb/options.h"
+#include "util/xfunc.h"
+
+
+namespace rocksdb {
+
+std::string XFuncPoint::xfunc_test_;
+bool XFuncPoint::initialized_ = false;
+bool XFuncPoint::enabled_ = false;
+int XFuncPoint::skip_policy_ = 0;
+
+void GetXFTestOptions(Options* options, int skip_policy) {
+  if (XFuncPoint::Check("inplace_lock_test") &&
+      (!(skip_policy & kSkipNoSnapshot))) {
+    options->inplace_update_support = true;
+  }
+}
+
+void xf_manage_release(ManagedIterator* iter) {
+  if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) {
+    iter->ReleaseIter(false);
+  }
+}
+
+void xf_manage_options(ReadOptions* read_options) {
+  if (!XFuncPoint::Check("managed_xftest_dropold") &&
+      (!XFuncPoint::Check("managed_xftest_release"))) {
+    return;
+  }
+  read_options->managed = true;
+}
+
+void xf_manage_new(DBImpl* db, ReadOptions* read_options,
+                   bool is_snapshot_supported) {
+  if ((!XFuncPoint::Check("managed_xftest_dropold") &&
+       (!XFuncPoint::Check("managed_xftest_release"))) ||
+      (!read_options->managed)) {
+    return;
+  }
+  if ((!read_options->tailing) && (read_options->snapshot == nullptr) &&
+      (!is_snapshot_supported)) {
+    read_options->managed = false;
+    return;
+  }
+  if (db->GetOptions().prefix_extractor != nullptr) {
+    if (strcmp(db->GetOptions().table_factory.get()->Name(), "PlainTable")) {
+      if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) {
+        read_options->total_order_seek = true;
+      }
+    } else {
+      read_options->managed = false;
+    }
+  }
+}
+
+void xf_manage_create(ManagedIterator* iter) { iter->SetDropOld(false); }
+
+}  // namespace rocksdb
+
+#endif  // XFUNC
diff --git a/src/rocksdb/util/xfunc.h b/src/rocksdb/util/xfunc.h
new file mode 100644
index 0000000..78004cb
--- /dev/null
+++ b/src/rocksdb/util/xfunc.h
@@ -0,0 +1,113 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <cstdlib>
+#include <string>
+
+namespace rocksdb {
+
+/*
+ * If ROCKSDB_XFTEST_FORCE has a value of 1, XFUNC is forced to be defined.
+ * If ROCKSDB_XFTEST_FORCE has a value other than 1,
+ *    XFUNC is forced to be undefined.
+ * If ROCKSDB_XFTEST_FORCE is undefined, XFUNC is defined based on NDEBUG,
+ *   with XFUNC only being set for debug builds.
+ */
+#if defined(ROCKSDB_XFTEST_FORCE)
+#ifndef ROCKSDB_LITE
+#if (ROCKSDB_XFTEST_FORCE == 1)
+#define XFUNC
+#endif  // ROCKSDB_XFTEST_FORCE == 1
+#elif defined(NDEBUG)
+#else
+#define XFUNC
+#endif  // defined(ROCKSDB_XFTEST_FORCE)
+#endif  // !ROCKSDB_LITE
+
+#ifndef XFUNC
+#define XFUNC_TEST(condition, location, lfname, fname, ...)
+#else
+
+struct Options;
+class ManagedIterator;
+class DBImpl;
+void GetXFTestOptions(Options* options, int skip_policy);
+void xf_manage_release(ManagedIterator* iter);
+void xf_manage_new(DBImpl* db, ReadOptions* readoptions,
+                   bool is_snapshot_supported);
+void xf_manage_create(ManagedIterator* iter);
+void xf_manage_options(ReadOptions* read_options);
+
+// This class provides the facility to run custom code to test a specific
+// feature typically with all existing unit tests.
+// A developer could specify cross functional test points in the codebase
+// via XFUNC_TEST.
+// Each xfunc test represents a position in the execution stream of a thread.
+// Whenever that particular piece of code is called, the given cross-functional
+// test point is executed.
+// eg. on DBOpen, a particular option can be set.
+// on Get, a particular option can be set, or a specific check can be invoked.
+// XFUNC_TEST(TestName, location, lfname, FunctionName, Args)
+// Turn on a specific cross functional test by setting the environment variable
+// ROCKSDB_XFUNC_TEST
+
+class XFuncPoint {
+ public:
+  // call once at the beginning of a test to get the test name
+  static void Init() {
+    char* s = getenv("ROCKSDB_XFUNC_TEST");
+    if (s == nullptr) {
+      xfunc_test_ = "";
+      enabled_ = false;
+    } else {
+      xfunc_test_ = s;
+      enabled_ = true;
+    }
+    initialized_ = true;
+  }
+
+  static bool Initialized() { return initialized_; }
+
+  static bool Check(std::string test) {
+    return (enabled_ &&
+            ((test.compare("") == 0) || (test.compare(xfunc_test_) == 0)));
+  }
+
+  static void SetSkip(int skip) { skip_policy_ = skip; }
+  static int GetSkip(void) { return skip_policy_; }
+
+ private:
+  static std::string xfunc_test_;
+  static bool initialized_;
+  static bool enabled_;
+  static int skip_policy_;
+};
+
+// Use XFUNC_TEST to specify cross functional test points inside the code base.
+// By setting ROCKSDB_XFUNC_TEST, all XFUNC_TEST having that
+// value in the condition field will be executed.
+// The second argument specifies a string representing the calling location
+// The third argument, lfname, is the name of the function which will be created
+// and called.
+// The fourth argument fname represents the function to be called
+// The arguments following that are the arguments to fname
+// See Options::Options in options.h for an example use case.
+// XFUNC_TEST is no op in release build.
+#define XFUNC_TEST(condition, location, lfname, fname, ...)         \
+  {                                                                 \
+    if (!XFuncPoint::Initialized()) {                               \
+      XFuncPoint::Init();                                           \
+    }                                                               \
+    if (XFuncPoint::Check(condition)) {                             \
+      std::function<void()> lfname = std::bind(fname, __VA_ARGS__); \
+      lfname();                                                     \
+    }                                                               \
+  }
+
+#endif  // XFUNC
+
+enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 };
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/xxhash.cc b/src/rocksdb/util/xxhash.cc
index 6dfd4b2..3204cbb 100644
--- a/src/rocksdb/util/xxhash.cc
+++ b/src/rocksdb/util/xxhash.cc
@@ -92,6 +92,7 @@ FORCE_INLINE void  XXH_free  (void* p)  { free(p); }
 FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
 
 
+namespace rocksdb {
 //**************************************
 // Basic Types
 //**************************************
@@ -473,3 +474,5 @@ U32 XXH32_digest (void* state_in)
 
     return h32;
 }
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/xxhash.h b/src/rocksdb/util/xxhash.h
index ceff066..3343e34 100644
--- a/src/rocksdb/util/xxhash.h
+++ b/src/rocksdb/util/xxhash.h
@@ -60,7 +60,7 @@ It depends on successfully passing SMHasher test set.
 #pragma once
 
 #if defined (__cplusplus)
-extern "C" {
+namespace rocksdb {
 #endif
 
 
@@ -160,5 +160,5 @@ To free memory context, use XXH32_digest(), or free().
 
 
 #if defined (__cplusplus)
-}
+}  // namespace rocksdb
 #endif
diff --git a/src/rocksdb/utilities/backupable/backupable_db.cc b/src/rocksdb/utilities/backupable/backupable_db.cc
index 87901e0..ab640ed 100644
--- a/src/rocksdb/utilities/backupable/backupable_db.cc
+++ b/src/rocksdb/utilities/backupable/backupable_db.cc
@@ -9,18 +9,24 @@
 
 #ifndef ROCKSDB_LITE
 
-#include "utilities/backupable_db.h"
+#include "rocksdb/utilities/backupable_db.h"
 #include "db/filename.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/logging.h"
+#include "util/string_util.h"
 #include "rocksdb/transaction_log.h"
 
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
 
 #include <inttypes.h>
+#include <stdlib.h>
 #include <algorithm>
 #include <vector>
 #include <map>
+#include <sstream>
 #include <string>
 #include <limits>
 #include <atomic>
@@ -29,9 +35,10 @@
 namespace rocksdb {
 
 namespace {
-class RateLimiter {
+class BackupRateLimiter {
  public:
-  RateLimiter(Env* env, uint64_t max_bytes_per_second, uint64_t bytes_per_check)
+  BackupRateLimiter(Env* env, uint64_t max_bytes_per_second,
+                   uint64_t bytes_per_check)
       : env_(env),
         max_bytes_per_second_(max_bytes_per_second),
         bytes_per_check_(bytes_per_check),
@@ -51,7 +58,8 @@ class RateLimiter {
         (bytes_since_start_ * kMicrosInSecond) / max_bytes_per_second_;
 
     if (should_take_micros > interval) {
-      env_->SleepForMicroseconds(should_take_micros - interval);
+      env_->SleepForMicroseconds(
+          static_cast<int>(should_take_micros - interval));
       now = env_->NowMicros();
     }
     // reset interval
@@ -69,6 +77,27 @@ class RateLimiter {
 };
 }  // namespace
 
+void BackupStatistics::IncrementNumberSuccessBackup() {
+  number_success_backup++;
+}
+void BackupStatistics::IncrementNumberFailBackup() {
+  number_fail_backup++;
+}
+
+uint32_t BackupStatistics::GetNumberSuccessBackup() const {
+  return number_success_backup;
+}
+uint32_t BackupStatistics::GetNumberFailBackup() const {
+  return number_fail_backup;
+}
+
+std::string BackupStatistics::ToString() const {
+  char result[50];
+  snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u",
+           GetNumberSuccessBackup(), GetNumberFailBackup());
+  return result;
+}
+
 void BackupableDBOptions::Dump(Logger* logger) const {
   Log(logger, "        Options.backup_dir: %s", backup_dir.c_str());
   Log(logger, "        Options.backup_env: %p", backup_env);
@@ -90,22 +119,22 @@ class BackupEngineImpl : public BackupEngine {
   BackupEngineImpl(Env* db_env, const BackupableDBOptions& options,
                    bool read_only = false);
   ~BackupEngineImpl();
-  Status CreateNewBackup(DB* db, bool flush_before_backup = false);
-  Status PurgeOldBackups(uint32_t num_backups_to_keep);
-  Status DeleteBackup(BackupID backup_id);
-  void StopBackup() {
+  Status CreateNewBackup(DB* db, bool flush_before_backup = false) override;
+  Status PurgeOldBackups(uint32_t num_backups_to_keep) override;
+  Status DeleteBackup(BackupID backup_id) override;
+  void StopBackup() override {
     stop_backup_.store(true, std::memory_order_release);
   }
+  Status GarbageCollect() override;
 
-  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
-  Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
-                             const std::string& wal_dir,
-                             const RestoreOptions& restore_options =
-                                 RestoreOptions());
-  Status RestoreDBFromLatestBackup(const std::string& db_dir,
-                                   const std::string& wal_dir,
-                                   const RestoreOptions& restore_options =
-                                       RestoreOptions()) {
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info) override;
+  void GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids) override;
+  Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) override;
+  Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) override {
     return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir,
                                restore_options);
   }
@@ -117,19 +146,26 @@ class BackupEngineImpl : public BackupEngine {
     FileInfo(const std::string& fname, uint64_t sz, uint32_t checksum)
       : refs(0), filename(fname), size(sz), checksum_value(checksum) {}
 
+    FileInfo(const FileInfo&) = delete;
+    FileInfo& operator=(const FileInfo&) = delete;
+
     int refs;
     const std::string filename;
     const uint64_t size;
-    uint32_t checksum_value;
+    const uint32_t checksum_value;
   };
 
   class BackupMeta {
    public:
     BackupMeta(const std::string& meta_filename,
-        std::unordered_map<std::string, FileInfo>* file_infos, Env* env)
+        std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos,
+        Env* env)
       : timestamp_(0), size_(0), meta_filename_(meta_filename),
         file_infos_(file_infos), env_(env) {}
 
+    BackupMeta(const BackupMeta&) = delete;
+    BackupMeta& operator=(const BackupMeta&) = delete;
+
     ~BackupMeta() {}
 
     void RecordTimestamp() {
@@ -141,6 +177,7 @@ class BackupEngineImpl : public BackupEngine {
     uint64_t GetSize() const {
       return size_;
     }
+    uint32_t GetNumberFiles() { return static_cast<uint32_t>(files_.size()); }
     void SetSequenceNumber(uint64_t sequence_number) {
       sequence_number_ = sequence_number;
     }
@@ -148,7 +185,7 @@ class BackupEngineImpl : public BackupEngine {
       return sequence_number_;
     }
 
-    Status AddFile(const FileInfo& file_info);
+    Status AddFile(std::shared_ptr<FileInfo> file_info);
 
     void Delete(bool delete_meta = true);
 
@@ -156,13 +193,35 @@ class BackupEngineImpl : public BackupEngine {
       return files_.empty();
     }
 
-    const std::vector<std::string>& GetFiles() {
+    std::shared_ptr<FileInfo> GetFile(const std::string& filename) const {
+      auto it = file_infos_->find(filename);
+      if (it == file_infos_->end())
+        return nullptr;
+      return it->second;
+    }
+
+    const std::vector<std::shared_ptr<FileInfo>>& GetFiles() {
       return files_;
     }
 
     Status LoadFromFile(const std::string& backup_dir);
     Status StoreToFile(bool sync);
 
+    std::string GetInfoString() {
+      std::ostringstream ss;
+      ss << "Timestamp: " << timestamp_ << std::endl;
+      char human_size[16];
+      AppendHumanBytes(size_, human_size, sizeof(human_size));
+      ss << "Size: " << human_size << std::endl;
+      ss << "Files:" << std::endl;
+      for (const auto& file : files_) {
+        AppendHumanBytes(file->size, human_size, sizeof(human_size));
+        ss << file->filename << ", size " << human_size << ", refs "
+           << file->refs << std::endl;
+      }
+      return ss.str();
+    }
+
    private:
     int64_t timestamp_;
     // sequence number is only approximate, should not be used
@@ -171,8 +230,8 @@ class BackupEngineImpl : public BackupEngine {
     uint64_t size_;
     std::string const meta_filename_;
     // files with relative paths (without "/" prefix!!)
-    std::vector<std::string> files_;
-    std::unordered_map<std::string, FileInfo>* file_infos_;
+    std::vector<std::shared_ptr<FileInfo>> files_;
+    std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos_;
     Env* env_;
 
     static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024;  // 10MB
@@ -193,7 +252,7 @@ class BackupEngineImpl : public BackupEngine {
                                        bool tmp = false,
                                        const std::string& file = "") const {
     assert(file.size() == 0 || file[0] != '/');
-    return GetPrivateDirRel() + "/" + std::to_string(backup_id) +
+    return GetPrivateDirRel() + "/" + rocksdb::ToString(backup_id) +
            (tmp ? ".tmp" : "") + "/" + file;
   }
   inline std::string GetSharedFileRel(const std::string& file = "",
@@ -212,8 +271,8 @@ class BackupEngineImpl : public BackupEngine {
     assert(file.size() == 0 || file[0] != '/');
     std::string file_copy = file;
     return file_copy.insert(file_copy.find_last_of('.'),
-                            "_" + std::to_string(checksum_value)
-                              + "_" + std::to_string(file_size));
+                            "_" + rocksdb::ToString(checksum_value) + "_" +
+                                rocksdb::ToString(file_size));
   }
   inline std::string GetFileFromChecksumFile(const std::string& file) const {
     assert(file.size() == 0 || file[0] != '/');
@@ -229,7 +288,7 @@ class BackupEngineImpl : public BackupEngine {
     return GetAbsolutePath("meta");
   }
   inline std::string GetBackupMetaFile(BackupID backup_id) const {
-    return GetBackupMetaDir() + "/" + std::to_string(backup_id);
+    return GetBackupMetaDir() + "/" + rocksdb::ToString(backup_id);
   }
 
   Status GetLatestBackupFileContents(uint32_t* latest_backup);
@@ -240,7 +299,7 @@ class BackupEngineImpl : public BackupEngine {
                   Env* src_env,
                   Env* dst_env,
                   bool sync,
-                  RateLimiter* rate_limiter,
+                  BackupRateLimiter* rate_limiter,
                   uint64_t* size = nullptr,
                   uint32_t* checksum_value = nullptr,
                   uint64_t size_limit = 0);
@@ -250,7 +309,7 @@ class BackupEngineImpl : public BackupEngine {
                     bool shared,
                     const std::string& src_dir,
                     const std::string& src_fname,  // starts with "/"
-                    RateLimiter* rate_limiter,
+                    BackupRateLimiter* rate_limiter,
                     uint64_t size_limit = 0,
                     bool shared_checksum = false);
 
@@ -259,16 +318,13 @@ class BackupEngineImpl : public BackupEngine {
                            uint64_t size_limit,
                            uint32_t* checksum_value);
 
-  // Will delete all the files we don't need anymore
-  // If full_scan == true, it will do the full scan of files/ directory
-  // and delete all the files that are not referenced from backuped_file_infos__
-  void GarbageCollection(bool full_scan);
-
   // backup state data
   BackupID latest_backup_id_;
-  std::map<BackupID, BackupMeta> backups_;
-  std::unordered_map<std::string, FileInfo> backuped_file_infos_;
-  std::vector<BackupID> obsolete_backups_;
+  std::map<BackupID, unique_ptr<BackupMeta>> backups_;
+  std::map<BackupID,
+           std::pair<Status, unique_ptr<BackupMeta>>> corrupt_backups_;
+  std::unordered_map<std::string,
+                     std::shared_ptr<FileInfo>> backuped_file_infos_;
   std::atomic<bool> stop_backup_;
 
   // options data
@@ -285,6 +341,7 @@ class BackupEngineImpl : public BackupEngine {
   static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL;  // 5MB
   size_t copy_file_buffer_size_;
   bool read_only_;
+  BackupStatistics backup_statistics_;
 };
 
 BackupEngine* BackupEngine::NewBackupEngine(
@@ -292,6 +349,13 @@ BackupEngine* BackupEngine::NewBackupEngine(
   return new BackupEngineImpl(db_env, options);
 }
 
+Status BackupEngine::Open(Env* env,
+                          const BackupableDBOptions& options,
+                          BackupEngine** backup_engine_ptr) {
+  *backup_engine_ptr = new BackupEngineImpl(env, options);
+  return Status::OK();
+}
+
 BackupEngineImpl::BackupEngineImpl(Env* db_env,
                                    const BackupableDBOptions& options,
                                    bool read_only)
@@ -333,48 +397,54 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env,
   backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files);
   // create backups_ structure
   for (auto& file : backup_meta_files) {
+    if (file == "." || file == "..") {
+      continue;
+    }
+    Log(options_.info_log, "Detected backup %s", file.c_str());
     BackupID backup_id = 0;
     sscanf(file.c_str(), "%u", &backup_id);
-    if (backup_id == 0 || file != std::to_string(backup_id)) {
+    if (backup_id == 0 || file != rocksdb::ToString(backup_id)) {
       if (!read_only_) {
+        Log(options_.info_log, "Unrecognized meta file %s, deleting",
+            file.c_str());
         // invalid file name, delete that
         backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file);
       }
       continue;
     }
     assert(backups_.find(backup_id) == backups_.end());
-    backups_.insert(std::make_pair(
-        backup_id, BackupMeta(GetBackupMetaFile(backup_id),
-                              &backuped_file_infos_, backup_env_)));
+    backups_.insert(std::move(
+        std::make_pair(backup_id, unique_ptr<BackupMeta>(new BackupMeta(
+                                      GetBackupMetaFile(backup_id),
+                                      &backuped_file_infos_, backup_env_)))));
   }
 
-  if (options_.destroy_old_data) {  // Destory old data
+  if (options_.destroy_old_data) {  // Destroy old data
     assert(!read_only_);
-    for (auto& backup : backups_) {
-      backup.second.Delete();
-      obsolete_backups_.push_back(backup.first);
-    }
-    backups_.clear();
+    Log(options_.info_log,
+        "Backup Engine started with destroy_old_data == true, deleting all "
+        "backups");
+    PurgeOldBackups(0);
+    (void) GarbageCollect();
     // start from beginning
     latest_backup_id_ = 0;
-    // GarbageCollection() will do the actual deletion
   } else {  // Load data from storage
     // load the backups if any
     for (auto& backup : backups_) {
-      Status s = backup.second.LoadFromFile(options_.backup_dir);
+      Status s = backup.second->LoadFromFile(options_.backup_dir);
       if (!s.ok()) {
         Log(options_.info_log, "Backup %u corrupted -- %s", backup.first,
             s.ToString().c_str());
-        if (!read_only_) {
-          Log(options_.info_log, "-> Deleting backup %u", backup.first);
-        }
-        backup.second.Delete(!read_only_);
-        obsolete_backups_.push_back(backup.first);
+        corrupt_backups_.insert(std::make_pair(
+              backup.first, std::make_pair(s, std::move(backup.second))));
+      } else {
+        Log(options_.info_log, "Loading backup %" PRIu32 " OK:\n%s",
+            backup.first, backup.second->GetInfoString().c_str());
       }
     }
-    // delete obsolete backups from the structure
-    for (auto ob : obsolete_backups_) {
-      backups_.erase(ob);
+
+    for (const auto& corrupt : corrupt_backups_) {
+      backups_.erase(backups_.find(corrupt.first));
     }
 
     Status s = GetLatestBackupFileContents(&latest_backup_id_);
@@ -389,20 +459,32 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env,
     }
   }
 
+  Log(options_.info_log, "Latest backup is %u", latest_backup_id_);
+
   // delete any backups that claim to be later than latest
-  for (auto itr = backups_.upper_bound(latest_backup_id_);
-       itr != backups_.end();) {
-    itr->second.Delete();
-    obsolete_backups_.push_back(itr->first);
-    itr = backups_.erase(itr);
+  std::vector<BackupID> later_ids;
+  for (auto itr = backups_.lower_bound(latest_backup_id_ + 1);
+       itr != backups_.end(); itr++) {
+    Log(options_.info_log,
+        "Found backup claiming to be later than latest: %" PRIu32, itr->first);
+    later_ids.push_back(itr->first);
+  }
+  for (auto id : later_ids) {
+    if (!read_only_) {
+      DeleteBackup(id);
+    } else {
+      auto backup = backups_.find(id);
+      // We just found it couple of lines earlier!
+      assert(backup != backups_.end());
+      backup->second->Delete(false);
+      backups_.erase(backup);
+    }
   }
 
   if (!read_only_) {
     PutLatestBackupFileContents(latest_backup_id_);  // Ignore errors
-    GarbageCollection(true);
   }
-  Log(options_.info_log, "Initialized BackupEngine, the latest backup is %u.",
-      latest_backup_id_);
+  Log(options_.info_log, "Initialized BackupEngine");
 }
 
 BackupEngineImpl::~BackupEngineImpl() { LogFlush(options_.info_log); }
@@ -426,19 +508,22 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
     s = db->GetSortedWalFiles(live_wal_files);
   }
   if (!s.ok()) {
-    db->EnableFileDeletions();
+    db->EnableFileDeletions(false);
     return s;
   }
 
   BackupID new_backup_id = latest_backup_id_ + 1;
   assert(backups_.find(new_backup_id) == backups_.end());
-  auto ret = backups_.insert(std::make_pair(
-      new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id),
-                                &backuped_file_infos_, backup_env_)));
+  auto ret = backups_.insert(std::move(
+      std::make_pair(new_backup_id, unique_ptr<BackupMeta>(new BackupMeta(
+                                        GetBackupMetaFile(new_backup_id),
+                                        &backuped_file_infos_, backup_env_)))));
   assert(ret.second == true);
   auto& new_backup = ret.first->second;
-  new_backup.RecordTimestamp();
-  new_backup.SetSequenceNumber(sequence_number);
+  new_backup->RecordTimestamp();
+  new_backup->SetSequenceNumber(sequence_number);
+
+  auto start_backup = backup_env_-> NowMicros();
 
   Log(options_.info_log, "Started the backup process -- creating backup %u",
       new_backup_id);
@@ -447,11 +532,11 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
   s = backup_env_->CreateDir(
       GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)));
 
-  unique_ptr<RateLimiter> rate_limiter;
+  unique_ptr<BackupRateLimiter> rate_limiter;
   if (options_.backup_rate_limit > 0) {
     copy_file_buffer_size_ = options_.backup_rate_limit / 10;
-    rate_limiter.reset(new RateLimiter(db_env_, options_.backup_rate_limit,
-                                       copy_file_buffer_size_));
+    rate_limiter.reset(new BackupRateLimiter(db_env_,
+          options_.backup_rate_limit, copy_file_buffer_size_));
   }
 
   // copy live_files
@@ -471,7 +556,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
     // * if it's kTableFile, then it's shared
     // * if it's kDescriptorFile, limit the size to manifest_file_size
     s = BackupFile(new_backup_id,
-                   &new_backup,
+                   new_backup.get(),
                    options_.share_table_files && type == kTableFile,
                    db->GetName(),            /* src_dir */
                    live_files[i],            /* src_fname */
@@ -486,7 +571,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
       // we only care about live log files
       // copy the file into backup_dir/files/<new backup>/
       s = BackupFile(new_backup_id,
-                     &new_backup,
+                     new_backup.get(),
                      false, /* not shared */
                      db->GetOptions().wal_dir,
                      live_wal_files[i]->PathName(),
@@ -495,18 +580,24 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
   }
 
   // we copied all the files, enable file deletions
-  db->EnableFileDeletions();
+  db->EnableFileDeletions(false);
 
   if (s.ok()) {
     // move tmp private backup to real backup folder
+    Log(options_.info_log,
+        "Moving tmp backup directory to the real one: %s -> %s\n",
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)).c_str(),
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)).c_str());
     s = backup_env_->RenameFile(
         GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)),  // tmp
         GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)));
   }
 
+  auto backup_time = backup_env_->NowMicros() - start_backup;
+
   if (s.ok()) {
     // persist the backup metadata on the disk
-    s = new_backup.StoreToFile(options_.sync);
+    s = new_backup->StoreToFile(options_.sync);
   }
   if (s.ok()) {
     // install the newly created backup meta! (atomic)
@@ -534,11 +625,18 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
     }
   }
 
+  if (s.ok()) {
+    backup_statistics_.IncrementNumberSuccessBackup();
+  }
   if (!s.ok()) {
+    backup_statistics_.IncrementNumberFailBackup();
     // clean all the files we might have created
     Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str());
-    backups_.erase(new_backup_id);
-    GarbageCollection(true);
+    Log(options_.info_log, "Backup Statistics %s\n",
+        backup_statistics_.ToString().c_str());
+    // delete files that we might have already written
+    DeleteBackup(new_backup_id);
+    GarbageCollect();
     return s;
   }
 
@@ -546,6 +644,18 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
   // in the LATEST_BACKUP file
   latest_backup_id_ = new_backup_id;
   Log(options_.info_log, "Backup DONE. All is good");
+
+  // backup_speed is in byte/second
+  double backup_speed = new_backup->GetSize() / (1.048576 * backup_time);
+  Log(options_.info_log, "Backup number of files: %u",
+      new_backup->GetNumberFiles());
+  char human_size[16];
+  AppendHumanBytes(new_backup->GetSize(), human_size, sizeof(human_size));
+  Log(options_.info_log, "Backup size: %s", human_size);
+  Log(options_.info_log, "Backup time: %" PRIu64 " microseconds", backup_time);
+  Log(options_.info_log, "Backup speed: %.3f MB/s", backup_speed);
+  Log(options_.info_log, "Backup Statistics %s",
+      backup_statistics_.ToString().c_str());
   return s;
 }
 
@@ -553,13 +663,15 @@ Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) {
   assert(!read_only_);
   Log(options_.info_log, "Purging old backups, keeping %u",
       num_backups_to_keep);
-  while (num_backups_to_keep < backups_.size()) {
-    Log(options_.info_log, "Deleting backup %u", backups_.begin()->first);
-    backups_.begin()->second.Delete();
-    obsolete_backups_.push_back(backups_.begin()->first);
-    backups_.erase(backups_.begin());
+  std::vector<BackupID> to_delete;
+  auto itr = backups_.begin();
+  while ((backups_.size() - to_delete.size()) > num_backups_to_keep) {
+    to_delete.push_back(itr->first);
+    itr++;
+  }
+  for (auto backup_id : to_delete) {
+    DeleteBackup(backup_id);
   }
-  GarbageCollection(false);
   return Status::OK();
 }
 
@@ -567,35 +679,74 @@ Status BackupEngineImpl::DeleteBackup(BackupID backup_id) {
   assert(!read_only_);
   Log(options_.info_log, "Deleting backup %u", backup_id);
   auto backup = backups_.find(backup_id);
-  if (backup == backups_.end()) {
-    return Status::NotFound("Backup not found");
+  if (backup != backups_.end()) {
+    backup->second->Delete();
+    backups_.erase(backup);
+  } else {
+    auto corrupt = corrupt_backups_.find(backup_id);
+    if (corrupt == corrupt_backups_.end()) {
+      return Status::NotFound("Backup not found");
+    }
+    corrupt->second.second->Delete();
+    corrupt_backups_.erase(corrupt);
+  }
+
+  std::vector<std::string> to_delete;
+  for (auto& itr : backuped_file_infos_) {
+    if (itr.second->refs == 0) {
+      Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first));
+      Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
+          s.ToString().c_str());
+      to_delete.push_back(itr.first);
+    }
   }
-  backup->second.Delete();
-  obsolete_backups_.push_back(backup_id);
-  backups_.erase(backup);
-  GarbageCollection(false);
+  for (auto& td : to_delete) {
+    backuped_file_infos_.erase(td);
+  }
+
+  // take care of private dirs -- GarbageCollect() will take care of them
+  // if they are not empty
+  std::string private_dir = GetPrivateFileRel(backup_id);
+  Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir));
+  Log(options_.info_log, "Deleting private dir %s -- %s",
+      private_dir.c_str(), s.ToString().c_str());
   return Status::OK();
 }
 
 void BackupEngineImpl::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
   backup_info->reserve(backups_.size());
   for (auto& backup : backups_) {
-    if (!backup.second.Empty()) {
-      backup_info->push_back(BackupInfo(
-          backup.first, backup.second.GetTimestamp(), backup.second.GetSize()));
+    if (!backup.second->Empty()) {
+        backup_info->push_back(BackupInfo(
+            backup.first, backup.second->GetTimestamp(),
+            backup.second->GetSize(),
+            backup.second->GetNumberFiles()));
     }
   }
 }
 
+void
+BackupEngineImpl::GetCorruptedBackups(
+    std::vector<BackupID>* corrupt_backup_ids) {
+  corrupt_backup_ids->reserve(corrupt_backups_.size());
+  for (auto& backup : corrupt_backups_) {
+    corrupt_backup_ids->push_back(backup.first);
+  }
+}
+
 Status BackupEngineImpl::RestoreDBFromBackup(
     BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
     const RestoreOptions& restore_options) {
+  auto corrupt_itr = corrupt_backups_.find(backup_id);
+  if (corrupt_itr != corrupt_backups_.end()) {
+    return corrupt_itr->second.first;
+  }
   auto backup_itr = backups_.find(backup_id);
   if (backup_itr == backups_.end()) {
     return Status::NotFound("Backup not found");
   }
   auto& backup = backup_itr->second;
-  if (backup.Empty()) {
+  if (backup->Empty()) {
     return Status::NotFound("Backup not found");
   }
 
@@ -636,14 +787,15 @@ Status BackupEngineImpl::RestoreDBFromBackup(
     DeleteChildren(db_dir);
   }
 
-  unique_ptr<RateLimiter> rate_limiter;
+  unique_ptr<BackupRateLimiter> rate_limiter;
   if (options_.restore_rate_limit > 0) {
     copy_file_buffer_size_ = options_.restore_rate_limit / 10;
-    rate_limiter.reset(new RateLimiter(db_env_, options_.restore_rate_limit,
-                                       copy_file_buffer_size_));
+    rate_limiter.reset(new BackupRateLimiter(db_env_,
+          options_.restore_rate_limit, copy_file_buffer_size_));
   }
   Status s;
-  for (auto& file : backup.GetFiles()) {
+  for (const auto& file_info : backup->GetFiles()) {
+    const std::string &file = file_info->filename;
     std::string dst;
     // 1. extract the filename
     size_t slash = file.find_last_of('/');
@@ -678,9 +830,7 @@ Status BackupEngineImpl::RestoreDBFromBackup(
       break;
     }
 
-    const auto iter = backuped_file_infos_.find(file);
-    assert(iter != backuped_file_infos_.end());
-    if (iter->second.checksum_value != checksum_value) {
+    if (file_info->checksum_value != checksum_value) {
       s = Status::Corruption("Checksum check failed");
       break;
     }
@@ -752,12 +902,13 @@ Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) {
   return s;
 }
 
-Status BackupEngineImpl::CopyFile(const std::string& src,
-                                  const std::string& dst, Env* src_env,
-                                  Env* dst_env, bool sync,
-                                  RateLimiter* rate_limiter, uint64_t* size,
-                                  uint32_t* checksum_value,
-                                  uint64_t size_limit) {
+Status BackupEngineImpl::CopyFile(
+    const std::string& src,
+    const std::string& dst, Env* src_env,
+    Env* dst_env, bool sync,
+    BackupRateLimiter* rate_limiter, uint64_t* size,
+    uint32_t* checksum_value,
+    uint64_t size_limit) {
   Status s;
   unique_ptr<WritableFile> dst_file;
   unique_ptr<SequentialFile> src_file;
@@ -824,7 +975,7 @@ Status BackupEngineImpl::CopyFile(const std::string& src,
 Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
                                     bool shared, const std::string& src_dir,
                                     const std::string& src_fname,
-                                    RateLimiter* rate_limiter,
+                                    BackupRateLimiter* rate_limiter,
                                     uint64_t size_limit,
                                     bool shared_checksum) {
 
@@ -863,22 +1014,35 @@ Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
 
   // if it's shared, we also need to check if it exists -- if it does,
   // no need to copy it again
+  bool need_to_copy = true;
   if (shared && backup_env_->FileExists(dst_path)) {
+    need_to_copy = false;
     if (shared_checksum) {
       Log(options_.info_log,
           "%s already present, with checksum %u and size %" PRIu64,
           src_fname.c_str(), checksum_value, size);
+    } else if (backuped_file_infos_.find(dst_relative) ==
+               backuped_file_infos_.end()) {
+      // file already exists, but it's not referenced by any backup. overwrite
+      // the file
+      Log(options_.info_log,
+          "%s already present, but not referenced by any backup. We will "
+          "overwrite the file.",
+          src_fname.c_str());
+      need_to_copy = true;
+      backup_env_->DeleteFile(dst_path);
     } else {
-      backup_env_->GetFileSize(dst_path, &size);  // Ignore error
+      // the file is present and referenced by a backup
+      db_env_->GetFileSize(src_dir + src_fname, &size);  // Ignore error
       Log(options_.info_log, "%s already present, calculate checksum",
           src_fname.c_str());
-      s = CalculateChecksum(src_dir + src_fname,
-                            db_env_,
-                            size_limit,
+      s = CalculateChecksum(src_dir + src_fname, db_env_, size_limit,
                             &checksum_value);
     }
-  } else {
-    Log(options_.info_log, "Copying %s", src_fname.c_str());
+  }
+  if (need_to_copy) {
+    Log(options_.info_log, "Copying %s to %s", src_fname.c_str(),
+        dst_path_tmp.c_str());
     s = CopyFile(src_dir + src_fname,
                  dst_path_tmp,
                  db_env_,
@@ -893,7 +1057,8 @@ Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
     }
   }
   if (s.ok()) {
-    s = backup->AddFile(FileInfo(dst_relative, size, checksum_value));
+    s = backup->AddFile(std::make_shared<FileInfo>(
+          dst_relative, size, checksum_value));
   }
   return s;
 }
@@ -955,115 +1120,96 @@ void BackupEngineImpl::DeleteChildren(const std::string& dir,
   }
 }
 
-void BackupEngineImpl::GarbageCollection(bool full_scan) {
+Status BackupEngineImpl::GarbageCollect() {
   assert(!read_only_);
   Log(options_.info_log, "Starting garbage collection");
-  std::vector<std::string> to_delete;
-  for (auto& itr : backuped_file_infos_) {
-    if (itr.second.refs == 0) {
-      Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first));
-      Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
-          s.ToString().c_str());
-      to_delete.push_back(itr.first);
-    }
-  }
-  for (auto& td : to_delete) {
-    backuped_file_infos_.erase(td);
-  }
-  if (!full_scan) {
-    // take care of private dirs -- if full_scan == true, then full_scan will
-    // take care of them
-    for (auto backup_id : obsolete_backups_) {
-      std::string private_dir = GetPrivateFileRel(backup_id);
-      Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir));
-      Log(options_.info_log, "Deleting private dir %s -- %s",
-          private_dir.c_str(), s.ToString().c_str());
-    }
-  }
-  obsolete_backups_.clear();
-
-  if (full_scan) {
-    Log(options_.info_log, "Starting full scan garbage collection");
-    // delete obsolete shared files
-    std::vector<std::string> shared_children;
-    backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
-                             &shared_children);
-    for (auto& child : shared_children) {
-      std::string rel_fname = GetSharedFileRel(child);
-      // if it's not refcounted, delete it
-      if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) {
-        // this might be a directory, but DeleteFile will just fail in that
-        // case, so we're good
-        Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
-        if (s.ok()) {
-          Log(options_.info_log, "Deleted %s", rel_fname.c_str());
-        }
+
+  // delete obsolete shared files
+  std::vector<std::string> shared_children;
+  backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
+                           &shared_children);
+  for (auto& child : shared_children) {
+    std::string rel_fname = GetSharedFileRel(child);
+    auto child_itr = backuped_file_infos_.find(rel_fname);
+    // if it's not refcounted, delete it
+    if (child_itr == backuped_file_infos_.end() ||
+        child_itr->second->refs == 0) {
+      // this might be a directory, but DeleteFile will just fail in that
+      // case, so we're good
+      Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
+      if (s.ok()) {
+        Log(options_.info_log, "Deleted %s", rel_fname.c_str());
       }
+      backuped_file_infos_.erase(rel_fname);
     }
+  }
 
-    // delete obsolete private files
-    std::vector<std::string> private_children;
-    backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
-                             &private_children);
-    for (auto& child : private_children) {
-      BackupID backup_id = 0;
-      bool tmp_dir = child.find(".tmp") != std::string::npos;
-      sscanf(child.c_str(), "%u", &backup_id);
-      if (!tmp_dir &&  // if it's tmp_dir, delete it
-          (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
-        // it's either not a number or it's still alive. continue
-        continue;
-      }
-      // here we have to delete the dir and all its children
-      std::string full_private_path =
-          GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir));
-      std::vector<std::string> subchildren;
-      backup_env_->GetChildren(full_private_path, &subchildren);
-      for (auto& subchild : subchildren) {
-        Status s = backup_env_->DeleteFile(full_private_path + subchild);
-        if (s.ok()) {
-          Log(options_.info_log, "Deleted %s",
-              (full_private_path + subchild).c_str());
-        }
+  // delete obsolete private files
+  std::vector<std::string> private_children;
+  backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
+                           &private_children);
+  for (auto& child : private_children) {
+    BackupID backup_id = 0;
+    bool tmp_dir = child.find(".tmp") != std::string::npos;
+    sscanf(child.c_str(), "%u", &backup_id);
+    if (!tmp_dir &&  // if it's tmp_dir, delete it
+        (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
+      // it's either not a number or it's still alive. continue
+      continue;
+    }
+    // here we have to delete the dir and all its children
+    std::string full_private_path =
+        GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir));
+    std::vector<std::string> subchildren;
+    backup_env_->GetChildren(full_private_path, &subchildren);
+    for (auto& subchild : subchildren) {
+      Status s = backup_env_->DeleteFile(full_private_path + subchild);
+      if (s.ok()) {
+        Log(options_.info_log, "Deleted %s",
+            (full_private_path + subchild).c_str());
       }
-      // finally delete the private dir
-      Status s = backup_env_->DeleteDir(full_private_path);
-      Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(),
-          s.ToString().c_str());
     }
+    // finally delete the private dir
+    Status s = backup_env_->DeleteDir(full_private_path);
+    Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(),
+        s.ToString().c_str());
   }
+
+  return Status::OK();
 }
 
 // ------- BackupMeta class --------
 
-Status BackupEngineImpl::BackupMeta::AddFile(const FileInfo& file_info) {
-  size_ += file_info.size;
-  files_.push_back(file_info.filename);
-
-  auto itr = file_infos_->find(file_info.filename);
+Status BackupEngineImpl::BackupMeta::AddFile(
+    std::shared_ptr<FileInfo> file_info) {
+  auto itr = file_infos_->find(file_info->filename);
   if (itr == file_infos_->end()) {
-    auto ret = file_infos_->insert({file_info.filename, file_info});
+    auto ret = file_infos_->insert({file_info->filename, file_info});
     if (ret.second) {
-      ret.first->second.refs = 1;
+      itr = ret.first;
+      itr->second->refs = 1;
     } else {
       // if this happens, something is seriously wrong
       return Status::Corruption("In memory metadata insertion error");
     }
   } else {
-    if (itr->second.checksum_value != file_info.checksum_value) {
-      return Status::Corruption("Checksum mismatch for existing backup file");
+    if (itr->second->checksum_value != file_info->checksum_value) {
+      return Status::Corruption(
+          "Checksum mismatch for existing backup file. Delete old backups and "
+          "try again.");
     }
-    ++itr->second.refs;  // increase refcount if already present
+    ++itr->second->refs;  // increase refcount if already present
   }
 
+  size_ += file_info->size;
+  files_.push_back(itr->second);
+
   return Status::OK();
 }
 
 void BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
   for (const auto& file : files_) {
-    auto itr = file_infos_->find(file);
-    assert(itr != file_infos_->end());
-    --(itr->second.refs);  // decrease refcount
+    --file->refs;  // decrease refcount
   }
   files_.clear();
   // delete meta file
@@ -1100,51 +1246,63 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile(
   buf[data.size()] = 0;
 
   uint32_t num_files = 0;
-  int bytes_read = 0;
-  sscanf(data.data(), "%" PRId64 "%n", &timestamp_, &bytes_read);
-  data.remove_prefix(bytes_read + 1);  // +1 for '\n'
-  sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read);
-  data.remove_prefix(bytes_read + 1);  // +1 for '\n'
-  sscanf(data.data(), "%u%n", &num_files, &bytes_read);
-  data.remove_prefix(bytes_read + 1);  // +1 for '\n'
+  char *next;
+  timestamp_ = strtoull(data.data(), &next, 10);
+  data.remove_prefix(next - data.data() + 1); // +1 for '\n'
+  sequence_number_ = strtoull(data.data(), &next, 10);
+  data.remove_prefix(next - data.data() + 1); // +1 for '\n'
+  num_files = static_cast<uint32_t>(strtoul(data.data(), &next, 10));
+  data.remove_prefix(next - data.data() + 1); // +1 for '\n'
 
-  std::vector<FileInfo> files;
+  std::vector<std::shared_ptr<FileInfo>> files;
+
+  Slice checksum_prefix("crc32 ");
 
   for (uint32_t i = 0; s.ok() && i < num_files; ++i) {
     auto line = GetSliceUntil(&data, '\n');
     std::string filename = GetSliceUntil(&line, ' ').ToString();
 
     uint64_t size;
-    s = env_->GetFileSize(backup_dir + "/" + filename, &size);
-    if (!s.ok()) {
-      return s;
+    const std::shared_ptr<FileInfo> file_info = GetFile(filename);
+    if (file_info) {
+      size = file_info->size;
+    } else {
+      s = env_->GetFileSize(backup_dir + "/" + filename, &size);
+      if (!s.ok()) {
+        return s;
+      }
     }
 
     if (line.empty()) {
-      return Status::Corruption("File checksum is missing");
+      return Status::Corruption("File checksum is missing for " + filename +
+                                " in " + meta_filename_);
     }
 
     uint32_t checksum_value = 0;
-    if (line.starts_with("crc32 ")) {
-      line.remove_prefix(6);
-      sscanf(line.data(), "%u", &checksum_value);
-      if (memcmp(line.data(), std::to_string(checksum_value).c_str(),
-                 line.size() - 1) != 0) {
-        return Status::Corruption("Invalid checksum value");
+    if (line.starts_with(checksum_prefix)) {
+      line.remove_prefix(checksum_prefix.size());
+      checksum_value = static_cast<uint32_t>(
+          strtoul(line.data(), nullptr, 10));
+      if (line != rocksdb::ToString(checksum_value)) {
+        return Status::Corruption("Invalid checksum value for " + filename +
+                                  " in " + meta_filename_);
       }
     } else {
-      return Status::Corruption("Unknown checksum type");
+      return Status::Corruption("Unknown checksum type for " + filename +
+                                " in " + meta_filename_);
     }
 
-    files.emplace_back(filename, size, checksum_value);
+    files.emplace_back(new FileInfo(filename, size, checksum_value));
   }
 
   if (s.ok() && data.size() > 0) {
     // file has to be read completely. if not, we count it as corruption
-    s = Status::Corruption("Tailing data in backup meta file");
+    s = Status::Corruption("Tailing data in backup meta file in " +
+                           meta_filename_);
   }
 
   if (s.ok()) {
+    files_.reserve(files.size());
     for (const auto& file_info : files) {
       s = AddFile(file_info);
       if (!s.ok()) {
@@ -1174,12 +1332,9 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) {
                   sequence_number_);
   len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size());
   for (const auto& file : files_) {
-    const auto& iter = file_infos_->find(file);
-
-    assert(iter != file_infos_->end());
     // use crc32 for now, switch to something else if needed
     len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n",
-                    file.c_str(), iter->second.checksum_value);
+                    file->filename.c_str(), file->checksum_value);
   }
 
   s = backup_meta_file->Append(Slice(buf.get(), (size_t)len));
@@ -1203,20 +1358,25 @@ class BackupEngineReadOnlyImpl : public BackupEngineReadOnly {
 
   virtual ~BackupEngineReadOnlyImpl() {}
 
-  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) override {
     backup_engine_->GetBackupInfo(backup_info);
   }
 
+  virtual void GetCorruptedBackups(
+      std::vector<BackupID>* corrupt_backup_ids) override {
+    backup_engine_->GetCorruptedBackups(corrupt_backup_ids);
+  }
+
   virtual Status RestoreDBFromBackup(
       BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
-      const RestoreOptions& restore_options = RestoreOptions()) {
+      const RestoreOptions& restore_options = RestoreOptions()) override {
     return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir,
                                                restore_options);
   }
 
   virtual Status RestoreDBFromLatestBackup(
       const std::string& db_dir, const std::string& wal_dir,
-      const RestoreOptions& restore_options = RestoreOptions()) {
+      const RestoreOptions& restore_options = RestoreOptions()) override {
     return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir,
                                                      restore_options);
   }
@@ -1234,6 +1394,17 @@ BackupEngineReadOnly* BackupEngineReadOnly::NewReadOnlyBackupEngine(
   return new BackupEngineReadOnlyImpl(db_env, options);
 }
 
+Status BackupEngineReadOnly::Open(Env* env, const BackupableDBOptions& options,
+                                  BackupEngineReadOnly** backup_engine_ptr) {
+  if (options.destroy_old_data) {
+    assert(false);
+    return Status::InvalidArgument(
+        "Can't destroy old data with ReadOnly BackupEngine");
+  }
+  *backup_engine_ptr = new BackupEngineReadOnlyImpl(env, options);
+  return Status::OK();
+}
+
 // --- BackupableDB methods --------
 
 BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options)
@@ -1252,6 +1423,11 @@ void BackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
   backup_engine_->GetBackupInfo(backup_info);
 }
 
+void
+BackupableDB::GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids) {
+  backup_engine_->GetCorruptedBackups(corrupt_backup_ids);
+}
+
 Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
   return backup_engine_->PurgeOldBackups(num_backups_to_keep);
 }
@@ -1264,6 +1440,10 @@ void BackupableDB::StopBackup() {
   backup_engine_->StopBackup();
 }
 
+Status BackupableDB::GarbageCollect() {
+  return backup_engine_->GarbageCollect();
+}
+
 // --- RestoreBackupableDB methods ------
 
 RestoreBackupableDB::RestoreBackupableDB(Env* db_env,
@@ -1279,6 +1459,11 @@ RestoreBackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
   backup_engine_->GetBackupInfo(backup_info);
 }
 
+void RestoreBackupableDB::GetCorruptedBackups(
+    std::vector<BackupID>* corrupt_backup_ids) {
+  backup_engine_->GetCorruptedBackups(corrupt_backup_ids);
+}
+
 Status RestoreBackupableDB::RestoreDBFromBackup(
     BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
     const RestoreOptions& restore_options) {
@@ -1301,6 +1486,10 @@ Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) {
   return backup_engine_->DeleteBackup(backup_id);
 }
 
+Status RestoreBackupableDB::GarbageCollect() {
+  return backup_engine_->GarbageCollect();
+}
+
 }  // namespace rocksdb
 
 #endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/backupable/backupable_db_test.cc b/src/rocksdb/utilities/backupable/backupable_db_test.cc
index 6f0c6bc..1476d9d 100644
--- a/src/rocksdb/utilities/backupable/backupable_db_test.cc
+++ b/src/rocksdb/utilities/backupable/backupable_db_test.cc
@@ -14,8 +14,7 @@
 #include "port/port.h"
 #include "rocksdb/types.h"
 #include "rocksdb/transaction_log.h"
-#include "utilities/utility_db.h"
-#include "utilities/backupable_db.h"
+#include "rocksdb/utilities/backupable_db.h"
 #include "util/testharness.h"
 #include "util/random.h"
 #include "util/mutexlock.h"
@@ -35,7 +34,7 @@ class DummyDB : public StackableDB {
      : StackableDB(nullptr), options_(options), dbname_(dbname),
        deletions_enabled_(true), sequence_number_(0) {}
 
-  virtual SequenceNumber GetLatestSequenceNumber() const {
+  virtual SequenceNumber GetLatestSequenceNumber() const override {
     return ++sequence_number_;
   }
 
@@ -54,20 +53,20 @@ class DummyDB : public StackableDB {
   }
 
   virtual Status EnableFileDeletions(bool force) override {
-    ASSERT_TRUE(!deletions_enabled_);
+    EXPECT_TRUE(!deletions_enabled_);
     deletions_enabled_ = true;
     return Status::OK();
   }
 
   virtual Status DisableFileDeletions() override {
-    ASSERT_TRUE(deletions_enabled_);
+    EXPECT_TRUE(deletions_enabled_);
     deletions_enabled_ = false;
     return Status::OK();
   }
 
   virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
                               bool flush_memtable = true) override {
-    ASSERT_TRUE(!deletions_enabled_);
+    EXPECT_TRUE(!deletions_enabled_);
     vec = live_files_;
     *mfs = 100;
     return Status::OK();
@@ -87,9 +86,9 @@ class DummyDB : public StackableDB {
       return path_;
     }
 
-    virtual uint64_t LogNumber() const {
+    virtual uint64_t LogNumber() const override {
       // what business do you have calling this method?
-      ASSERT_TRUE(false);
+      EXPECT_TRUE(false);
       return 0;
     }
 
@@ -97,15 +96,15 @@ class DummyDB : public StackableDB {
       return alive_ ? kAliveLogFile : kArchivedLogFile;
     }
 
-    virtual SequenceNumber StartSequence() const {
+    virtual SequenceNumber StartSequence() const override {
       // backupabledb should not need this method
-      ASSERT_TRUE(false);
+      EXPECT_TRUE(false);
       return 0;
     }
 
-    virtual uint64_t SizeFileBytes() const {
+    virtual uint64_t SizeFileBytes() const override {
       // backupabledb should not need this method
-      ASSERT_TRUE(false);
+      EXPECT_TRUE(false);
       return 0;
     }
 
@@ -115,7 +114,7 @@ class DummyDB : public StackableDB {
   }; // DummyLogFile
 
   virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
-    ASSERT_TRUE(!deletions_enabled_);
+    EXPECT_TRUE(!deletions_enabled_);
     files.resize(wal_files_.size());
     for (size_t i = 0; i < files.size(); ++i) {
       files[i].reset(
@@ -141,7 +140,7 @@ class TestEnv : public EnvWrapper {
   class DummySequentialFile : public SequentialFile {
    public:
     DummySequentialFile() : SequentialFile(), rnd_(5) {}
-    virtual Status Read(size_t n, Slice* result, char* scratch) {
+    virtual Status Read(size_t n, Slice* result, char* scratch) override {
       size_t read_size = (n > size_left) ? size_left : n;
       for (size_t i = 0; i < read_size; ++i) {
         scratch[i] = rnd_.Next() & 255;
@@ -151,7 +150,7 @@ class TestEnv : public EnvWrapper {
       return Status::OK();
     }
 
-    virtual Status Skip(uint64_t n) {
+    virtual Status Skip(uint64_t n) override {
       size_left = (n > size_left) ? size_left - n : 0;
       return Status::OK();
     }
@@ -160,9 +159,8 @@ class TestEnv : public EnvWrapper {
     Random rnd_;
   };
 
-  Status NewSequentialFile(const std::string& f,
-                           unique_ptr<SequentialFile>* r,
-                           const EnvOptions& options) {
+  Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override {
     MutexLock l(&mutex_);
     if (dummy_sequential_file_) {
       r->reset(new TestEnv::DummySequentialFile());
@@ -173,7 +171,7 @@ class TestEnv : public EnvWrapper {
   }
 
   Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
-                         const EnvOptions& options) {
+                         const EnvOptions& options) override {
     MutexLock l(&mutex_);
     written_files_.push_back(f);
     if (limit_written_files_ <= 0) {
@@ -185,7 +183,7 @@ class TestEnv : public EnvWrapper {
 
   virtual Status DeleteFile(const std::string& fname) override {
     MutexLock l(&mutex_);
-    ASSERT_GT(limit_delete_files_, 0U);
+    EXPECT_GT(limit_delete_files_, 0U);
     limit_delete_files_--;
     return EnvWrapper::DeleteFile(fname);
   }
@@ -229,7 +227,7 @@ class FileManager : public EnvWrapper {
  public:
   explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
 
-  Status DeleteRandomFileInDir(const std::string dir) {
+  Status DeleteRandomFileInDir(const std::string& dir) {
     std::vector<std::string> children;
     GetChildren(dir, &children);
     if (children.size() <= 2) { // . and ..
@@ -329,7 +327,7 @@ static size_t FillDB(DB* db, int from, int to) {
     std::string value = "testvalue" + std::to_string(i);
     bytes_written += key.size() + value.size();
 
-    ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
+    EXPECT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
   }
   return bytes_written;
 }
@@ -353,7 +351,7 @@ static void AssertEmpty(DB* db, int from, int to) {
   }
 }
 
-class BackupableDBTest {
+class BackupableDBTest : public testing::Test {
  public:
   BackupableDBTest() {
     // set up files
@@ -384,7 +382,7 @@ class BackupableDBTest {
 
   DB* OpenDB() {
     DB* db;
-    ASSERT_OK(DB::Open(options_, dbname_, &db));
+    EXPECT_OK(DB::Open(options_, dbname_, &db));
     return db;
   }
 
@@ -497,7 +495,7 @@ void AppendPath(const std::string& path, std::vector<std::string>& v) {
 }
 
 // this will make sure that backup does not copy the same file twice
-TEST(BackupableDBTest, NoDoubleCopy) {
+TEST_F(BackupableDBTest, NoDoubleCopy) {
   OpenBackupableDB(true, true);
 
   // should write 5 DB files + LATEST_BACKUP + one meta file
@@ -541,13 +539,10 @@ TEST(BackupableDBTest, NoDoubleCopy) {
   test_backup_env_->AssertWrittenFiles(should_have_written);
 
   ASSERT_OK(db_->DeleteBackup(1));
-  ASSERT_EQ(true,
-            test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
+  ASSERT_TRUE(test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
   // 00011.sst was only in backup 1, should be deleted
-  ASSERT_EQ(false,
-            test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
-  ASSERT_EQ(true,
-            test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+  ASSERT_FALSE(test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
+  ASSERT_TRUE(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
 
   // MANIFEST file size should be only 100
   uint64_t size;
@@ -568,7 +563,7 @@ TEST(BackupableDBTest, NoDoubleCopy) {
 //      fine
 // 4. Corrupted checksum value - if the checksum is not a valid uint32_t,
 //      db open should fail, otherwise, it aborts during the restore process.
-TEST(BackupableDBTest, CorruptionsTest) {
+TEST_F(BackupableDBTest, CorruptionsTest) {
   const int keys_iteration = 5000;
   Random rnd(6);
   Status s;
@@ -637,7 +632,34 @@ TEST(BackupableDBTest, CorruptionsTest) {
   ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2"));
   s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_);
   ASSERT_TRUE(!s.ok());
+
+  // make sure that no corrupt backups have actually been deleted!
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/1"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/1"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4"));
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5"));
+
+  // delete the corrupt backups and then make sure they're actually deleted
+  ASSERT_OK(restore_db_->DeleteBackup(5));
+  ASSERT_OK(restore_db_->DeleteBackup(4));
+  ASSERT_OK(restore_db_->DeleteBackup(3));
   ASSERT_OK(restore_db_->DeleteBackup(2));
+  (void) restore_db_->GarbageCollect();
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2") == false);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2") == false);
+
   CloseRestoreDB();
   AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5);
 
@@ -649,8 +671,41 @@ TEST(BackupableDBTest, CorruptionsTest) {
   AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5);
 }
 
+// This test verifies we don't delete the latest backup when read-only option is
+// set
+TEST_F(BackupableDBTest, NoDeleteWithReadOnly) {
+  const int keys_iteration = 5000;
+  Random rnd(6);
+  Status s;
+
+  OpenBackupableDB(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+  }
+  CloseBackupableDB();
+  ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "4"));
+
+  backupable_options_->destroy_old_data = false;
+  BackupEngineReadOnly* read_only_backup_engine;
+  ASSERT_OK(BackupEngineReadOnly::Open(env_, *backupable_options_,
+                                       &read_only_backup_engine));
+
+  // assert that data from backup 5 is still here (even though LATEST_BACKUP
+  // says 4 is latest)
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5") == true);
+  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5") == true);
+
+  // even though 5 is here, we should only see 4 backups
+  std::vector<BackupInfo> backup_info;
+  read_only_backup_engine->GetBackupInfo(&backup_info);
+  ASSERT_EQ(4UL, backup_info.size());
+  delete read_only_backup_engine;
+}
+
 // open DB, write, close DB, backup, restore, repeat
-TEST(BackupableDBTest, OfflineIntegrationTest) {
+TEST_F(BackupableDBTest, OfflineIntegrationTest) {
   // has to be a big number, so that it triggers the memtable flush
   const int keys_iteration = 5000;
   const int max_key = keys_iteration * 4 + 10;
@@ -697,7 +752,7 @@ TEST(BackupableDBTest, OfflineIntegrationTest) {
 }
 
 // open DB, write, backup, write, backup, close, restore
-TEST(BackupableDBTest, OnlineIntegrationTest) {
+TEST_F(BackupableDBTest, OnlineIntegrationTest) {
   // has to be a big number, so that it triggers the memtable flush
   const int keys_iteration = 5000;
   const int max_key = keys_iteration * 4 + 10;
@@ -760,7 +815,7 @@ TEST(BackupableDBTest, OnlineIntegrationTest) {
   CloseRestoreDB();
 }
 
-TEST(BackupableDBTest, FailOverwritingBackups) {
+TEST_F(BackupableDBTest, FailOverwritingBackups) {
   options_.write_buffer_size = 1024 * 1024 * 1024;  // 1GB
   // create backups 1, 2, 3, 4, 5
   OpenBackupableDB(true);
@@ -795,7 +850,7 @@ TEST(BackupableDBTest, FailOverwritingBackups) {
   CloseBackupableDB();
 }
 
-TEST(BackupableDBTest, NoShareTableFiles) {
+TEST_F(BackupableDBTest, NoShareTableFiles) {
   const int keys_iteration = 5000;
   OpenBackupableDB(true, false, false);
   for (int i = 0; i < 5; ++i) {
@@ -811,7 +866,7 @@ TEST(BackupableDBTest, NoShareTableFiles) {
 }
 
 // Verify that you can backup and restore with share_files_with_checksum on
-TEST(BackupableDBTest, ShareTableFilesWithChecksums) {
+TEST_F(BackupableDBTest, ShareTableFilesWithChecksums) {
   const int keys_iteration = 5000;
   OpenBackupableDB(true, false, true, true);
   for (int i = 0; i < 5; ++i) {
@@ -828,7 +883,7 @@ TEST(BackupableDBTest, ShareTableFilesWithChecksums) {
 
 // Verify that you can backup and restore using share_files_with_checksum set to
 // false and then transition this option to true
-TEST(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
+TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
   const int keys_iteration = 5000;
   // set share_files_with_checksum to false
   OpenBackupableDB(true, false, true, false);
@@ -857,7 +912,7 @@ TEST(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
   }
 }
 
-TEST(BackupableDBTest, DeleteTmpFiles) {
+TEST_F(BackupableDBTest, DeleteTmpFiles) {
   OpenBackupableDB();
   CloseBackupableDB();
   std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp";
@@ -866,15 +921,17 @@ TEST(BackupableDBTest, DeleteTmpFiles) {
   file_manager_->WriteToFile(shared_tmp, "tmp");
   file_manager_->CreateDir(private_tmp_dir);
   file_manager_->WriteToFile(private_tmp_file, "tmp");
-  ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir));
+  ASSERT_TRUE(file_manager_->FileExists(private_tmp_dir));
   OpenBackupableDB();
+  // Need to call this explicitly to delete tmp files
+  (void) db_->GarbageCollect();
   CloseBackupableDB();
-  ASSERT_EQ(false, file_manager_->FileExists(shared_tmp));
-  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file));
-  ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir));
+  ASSERT_FALSE(file_manager_->FileExists(shared_tmp));
+  ASSERT_FALSE(file_manager_->FileExists(private_tmp_file));
+  ASSERT_FALSE(file_manager_->FileExists(private_tmp_dir));
 }
 
-TEST(BackupableDBTest, KeepLogFiles) {
+TEST_F(BackupableDBTest, KeepLogFiles) {
   backupable_options_->backup_log_files = false;
   // basically infinite
   options_.WAL_ttl_seconds = 24 * 60 * 60;
@@ -895,7 +952,7 @@ TEST(BackupableDBTest, KeepLogFiles) {
   AssertBackupConsistency(0, 0, 500, 600, true);
 }
 
-TEST(BackupableDBTest, RateLimiting) {
+TEST_F(BackupableDBTest, RateLimiting) {
   uint64_t const KB = 1024 * 1024;
   size_t const kMicrosPerSec = 1000 * 1000LL;
 
@@ -917,8 +974,7 @@ TEST(BackupableDBTest, RateLimiting) {
     auto backup_time = env_->NowMicros() - start_backup;
     auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) /
                                     backupable_options_->backup_rate_limit;
-    ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time);
-    ASSERT_LT(backup_time, 2.5 * rate_limited_backup_time);
+    ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time);
 
     CloseBackupableDB();
 
@@ -929,14 +985,13 @@ TEST(BackupableDBTest, RateLimiting) {
     CloseRestoreDB();
     auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) /
                                      backupable_options_->restore_rate_limit;
-    ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time);
-    ASSERT_LT(restore_time, 2.5 * rate_limited_restore_time);
+    ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time);
 
     AssertBackupConsistency(0, 0, 100000, 100010);
   }
 }
 
-TEST(BackupableDBTest, ReadOnlyBackupEngine) {
+TEST_F(BackupableDBTest, ReadOnlyBackupEngine) {
   DestroyDB(dbname_, Options());
   OpenBackupableDB(true);
   FillDB(db_.get(), 0, 100);
@@ -949,8 +1004,9 @@ TEST(BackupableDBTest, ReadOnlyBackupEngine) {
   backupable_options_->destroy_old_data = false;
   test_backup_env_->ClearWrittenFiles();
   test_backup_env_->SetLimitDeleteFiles(0);
-  auto read_only_backup_engine =
-      BackupEngineReadOnly::NewReadOnlyBackupEngine(env_, *backupable_options_);
+  BackupEngineReadOnly* read_only_backup_engine;
+  ASSERT_OK(BackupEngineReadOnly::Open(env_, *backupable_options_,
+                                       &read_only_backup_engine));
   std::vector<BackupInfo> backup_info;
   read_only_backup_engine->GetBackupInfo(&backup_info);
   ASSERT_EQ(backup_info.size(), 2U);
@@ -967,10 +1023,35 @@ TEST(BackupableDBTest, ReadOnlyBackupEngine) {
   delete db;
 }
 
+TEST_F(BackupableDBTest, GarbageCollectionBeforeBackup) {
+  DestroyDB(dbname_, Options());
+  OpenBackupableDB(true);
+
+  env_->CreateDirIfMissing(backupdir_ + "/shared");
+  std::string file_five = backupdir_ + "/shared/000005.sst";
+  std::string file_five_contents = "I'm not really a sst file";
+  // this depends on the fact that 000005.sst is the first file created by the DB
+  ASSERT_OK(file_manager_->WriteToFile(file_five, file_five_contents));
+
+  FillDB(db_.get(), 0, 100);
+  // backup overwrites file 000005.sst
+  ASSERT_TRUE(db_->CreateNewBackup(true).ok());
+
+  std::string new_file_five_contents;
+  ASSERT_OK(ReadFileToString(env_, file_five, &new_file_five_contents));
+  // file 000005.sst was overwritten
+  ASSERT_TRUE(new_file_five_contents != file_five_contents);
+
+  CloseBackupableDB();
+
+  AssertBackupConsistency(0, 0, 100);
+}
+
 }  // anon namespace
 
 } //  namespace rocksdb
 
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint.cc b/src/rocksdb/utilities/checkpoint/checkpoint.cc
new file mode 100644
index 0000000..760a6db
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint.cc
@@ -0,0 +1,168 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/checkpoint.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <string>
+#include "db/filename.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/file_util.h"
+
+namespace rocksdb {
+
+class CheckpointImpl : public Checkpoint {
+ public:
+  // Creates a Checkpoint object to be used for creating openable snapshots
+  explicit CheckpointImpl(DB* db) : db_(db) {}
+
+  // Builds an openable snapshot of RocksDB on the same disk, which
+  // accepts an output directory on the same disk, and under the directory
+  // (1) hard-linked SST files pointing to existing live SST files
+  // SST files will be copied if output directory is on a different filesystem
+  // (2) a copy of the manifest file and other required files
+  // The directory should not already exist and will be created by this API.
+  // The directory will be an absolute path
+  using Checkpoint::CreateCheckpoint;
+  virtual Status CreateCheckpoint(const std::string& checkpoint_dir) override;
+
+ private:
+  DB* db_;
+};
+
+Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) {
+  *checkpoint_ptr = new CheckpointImpl(db);
+  return Status::OK();
+}
+
+Status Checkpoint::CreateCheckpoint(const std::string& checkpoint_dir) {
+  return Status::NotSupported("");
+}
+
+// Builds an openable snapshot of RocksDB
+Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
+  Status s;
+  std::vector<std::string> live_files;
+  uint64_t manifest_file_size = 0;
+  uint64_t sequence_number = db_->GetLatestSequenceNumber();
+  bool same_fs = true;
+
+  if (db_->GetEnv()->FileExists(checkpoint_dir)) {
+    return Status::InvalidArgument("Directory exists");
+  }
+
+  s = db_->DisableFileDeletions();
+  if (s.ok()) {
+    // this will return live_files prefixed with "/"
+    s = db_->GetLiveFiles(live_files, &manifest_file_size, true);
+  }
+  if (!s.ok()) {
+    db_->EnableFileDeletions(false);
+    return s;
+  }
+
+  Log(db_->GetOptions().info_log,
+      "Started the snapshot process -- creating snapshot in directory %s",
+      checkpoint_dir.c_str());
+
+  std::string full_private_path = checkpoint_dir + ".tmp";
+
+  // create snapshot directory
+  s = db_->GetEnv()->CreateDir(full_private_path);
+
+  // copy/hard link live_files
+  for (size_t i = 0; s.ok() && i < live_files.size(); ++i) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(live_files[i], &number, &type);
+    if (!ok) {
+      s = Status::Corruption("Can't parse file name. This is very bad");
+      break;
+    }
+    // we should only get sst, manifest and current files here
+    assert(type == kTableFile || type == kDescriptorFile ||
+           type == kCurrentFile);
+    assert(live_files[i].size() > 0 && live_files[i][0] == '/');
+    std::string src_fname = live_files[i];
+
+    // rules:
+    // * if it's kTableFile, then it's shared
+    // * if it's kDescriptorFile, limit the size to manifest_file_size
+    // * always copy if cross-device link
+    if ((type == kTableFile) && same_fs) {
+      Log(db_->GetOptions().info_log, "Hard Linking %s", src_fname.c_str());
+      s = db_->GetEnv()->LinkFile(db_->GetName() + src_fname,
+                                  full_private_path + src_fname);
+      if (s.IsNotSupported()) {
+        same_fs = false;
+        s = Status::OK();
+      }
+    }
+    if ((type != kTableFile) || (!same_fs)) {
+      Log(db_->GetOptions().info_log, "Copying %s", src_fname.c_str());
+      s = CopyFile(db_->GetEnv(), db_->GetName() + src_fname,
+                   full_private_path + src_fname,
+                   (type == kDescriptorFile) ? manifest_file_size : 0);
+    }
+  }
+
+  // we copied all the files, enable file deletions
+  db_->EnableFileDeletions(false);
+
+  if (s.ok()) {
+    // move tmp private backup to real snapshot directory
+    s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir);
+  }
+  if (s.ok()) {
+    unique_ptr<Directory> checkpoint_directory;
+    db_->GetEnv()->NewDirectory(checkpoint_dir, &checkpoint_directory);
+    if (checkpoint_directory != nullptr) {
+      s = checkpoint_directory->Fsync();
+    }
+  }
+
+  if (!s.ok()) {
+    // clean all the files we might have created
+    Log(db_->GetOptions().info_log, "Snapshot failed -- %s",
+        s.ToString().c_str());
+    // we have to delete the dir and all its children
+    std::vector<std::string> subchildren;
+    db_->GetEnv()->GetChildren(full_private_path, &subchildren);
+    for (auto& subchild : subchildren) {
+      Status s1 = db_->GetEnv()->DeleteFile(full_private_path + subchild);
+      if (s1.ok()) {
+        Log(db_->GetOptions().info_log, "Deleted %s",
+            (full_private_path + subchild).c_str());
+      }
+    }
+    // finally delete the private dir
+    Status s1 = db_->GetEnv()->DeleteDir(full_private_path);
+    Log(db_->GetOptions().info_log, "Deleted dir %s -- %s",
+        full_private_path.c_str(), s1.ToString().c_str());
+    return s;
+  }
+
+  // here we know that we succeeded and installed the new snapshot
+  Log(db_->GetOptions().info_log, "Snapshot DONE. All is good");
+  Log(db_->GetOptions().info_log, "Snapshot sequence number: %" PRIu64,
+      sequence_number);
+
+  return s;
+}
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compacted_db/compacted_db_impl.cc b/src/rocksdb/utilities/compacted_db/compacted_db_impl.cc
new file mode 100644
index 0000000..55bcbca
--- /dev/null
+++ b/src/rocksdb/utilities/compacted_db/compacted_db_impl.cc
@@ -0,0 +1,163 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "utilities/compacted_db/compacted_db_impl.h"
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "table/get_context.h"
+
+namespace rocksdb {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(
+  const DBOptions& options, const std::string& dbname)
+  : DBImpl(options, dbname) {
+}
+
+CompactedDBImpl::~CompactedDBImpl() {
+}
+
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+  size_t left = 0;
+  size_t right = files_.num_files - 1;
+  while (left < right) {
+    size_t mid = (left + right) >> 1;
+    const FdWithKeyRange& f = files_.files[mid];
+    if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options,
+     ColumnFamilyHandle*, const Slice& key, std::string* value) {
+  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, key, value, nullptr, nullptr,
+                         nullptr);
+  LookupKey lkey(key, kMaxSequenceNumber);
+  files_.files[FindFile(key)].fd.table_reader->Get(
+      options, lkey.internal_key(), &get_context);
+  if (get_context.State() == GetContext::kFound) {
+    return Status::OK();
+  }
+  return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  autovector<TableReader*, 16> reader_list;
+  for (const auto& key : keys) {
+    const FdWithKeyRange& f = files_.files[FindFile(key)];
+    if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
+      reader_list.push_back(nullptr);
+    } else {
+      LookupKey lkey(key, kMaxSequenceNumber);
+      f.fd.table_reader->Prepare(lkey.internal_key());
+      reader_list.push_back(f.fd.table_reader);
+    }
+  }
+  std::vector<Status> statuses(keys.size(), Status::NotFound());
+  values->resize(keys.size());
+  int idx = 0;
+  for (auto* r : reader_list) {
+    if (r != nullptr) {
+      GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, keys[idx], &(*values)[idx],
+                             nullptr, nullptr, nullptr);
+      LookupKey lkey(keys[idx], kMaxSequenceNumber);
+      r->Get(options, lkey.internal_key(), &get_context);
+      if (get_context.State() == GetContext::kFound) {
+        statuses[idx] = Status::OK();
+      }
+    }
+    ++idx;
+  }
+  return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+  mutex_.Lock();
+  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+                            ColumnFamilyOptions(options));
+  Status s = Recover({ cf }, true /* read only */, false);
+  if (s.ok()) {
+    cfd_ = reinterpret_cast<ColumnFamilyHandleImpl*>(
+              DefaultColumnFamily())->cfd();
+    delete cfd_->InstallSuperVersion(new SuperVersion(), &mutex_);
+  }
+  mutex_.Unlock();
+  if (!s.ok()) {
+    return s;
+  }
+  NewThreadStatusCfInfo(cfd_);
+  version_ = cfd_->GetSuperVersion()->current;
+  user_comparator_ = cfd_->user_comparator();
+  auto* vstorage = version_->storage_info();
+  if (vstorage->num_non_empty_levels() == 0) {
+    return Status::NotSupported("no file exists");
+  }
+  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+  // L0 should not have files
+  if (l0.num_files > 1) {
+    return Status::NotSupported("L0 contain more than 1 file");
+  }
+  if (l0.num_files == 1) {
+    if (vstorage->num_non_empty_levels() > 1) {
+      return Status::NotSupported("Both L0 and other level contain files");
+    }
+    files_ = l0;
+    return Status::OK();
+  }
+
+  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+    if (vstorage->LevelFilesBrief(i).num_files > 0) {
+      return Status::NotSupported("Other levels also contain files");
+    }
+  }
+
+  int level = vstorage->num_non_empty_levels() - 1;
+  if (vstorage->LevelFilesBrief(level).num_files > 0) {
+    files_ = vstorage->LevelFilesBrief(level);
+    return Status::OK();
+  }
+  return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options,
+                             const std::string& dbname, DB** dbptr) {
+  *dbptr = nullptr;
+
+  if (options.max_open_files != -1) {
+    return Status::InvalidArgument("require max_open_files = -1");
+  }
+  if (options.merge_operator.get() != nullptr) {
+    return Status::InvalidArgument("merge operator is not supported");
+  }
+  DBOptions db_options(options);
+  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+  Status s = db->Init(options);
+  if (s.ok()) {
+    Log(INFO_LEVEL, db->db_options_.info_log,
+        "Opened the db as fully compacted mode");
+    LogFlush(db->db_options_.info_log);
+    *dbptr = db.release();
+  }
+  return s;
+}
+
+}   // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compacted_db/compacted_db_impl.h b/src/rocksdb/utilities/compacted_db/compacted_db_impl.h
new file mode 100644
index 0000000..e1ac92d
--- /dev/null
+++ b/src/rocksdb/utilities/compacted_db/compacted_db_impl.h
@@ -0,0 +1,96 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include "db/db_impl.h"
+#include <vector>
+#include <string>
+
+namespace rocksdb {
+
+class CompactedDBImpl : public DBImpl {
+ public:
+  CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+  virtual ~CompactedDBImpl();
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DB** dbptr);
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) override;
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>&,
+      const std::vector<Slice>& keys, std::vector<std::string>* values)
+    override;
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* updates) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false, int target_level = -1,
+                              uint32_t target_path_id = 0) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status EnableFileDeletions(bool force) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+ private:
+  friend class DB;
+  inline size_t FindFile(const Slice& key);
+  Status Init(const Options& options);
+
+  ColumnFamilyData* cfd_;
+  Version* version_;
+  const Comparator* user_comparator_;
+  LevelFilesBrief files_;
+
+  // No copying allowed
+  CompactedDBImpl(const CompactedDBImpl&);
+  void operator=(const CompactedDBImpl&);
+};
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/convenience/convenience.cc b/src/rocksdb/utilities/convenience/convenience.cc
new file mode 100644
index 0000000..b91bc9c
--- /dev/null
+++ b/src/rocksdb/utilities/convenience/convenience.cc
@@ -0,0 +1,23 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/convenience.h"
+
+#include "db/db_impl.h"
+
+namespace rocksdb {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+  (dynamic_cast<DBImpl*>(db))->CancelAllBackgroundWork(wait);
+}
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/document/document_db.cc b/src/rocksdb/utilities/document/document_db.cc
new file mode 100644
index 0000000..7f7bc78
--- /dev/null
+++ b/src/rocksdb/utilities/document/document_db.cc
@@ -0,0 +1,1192 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/document_db.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/table.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/json_document.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+// IMPORTANT NOTE: Secondary index column families should be very small and
+// generally fit in memory. Assume that accessing secondary index column
+// families is much faster than accessing primary index (data heap) column
+// family. Accessing a key (i.e. checking for existance) from a column family in
+// RocksDB is not much faster than accessing both key and value since they are
+// kept together and loaded from storage together.
+
+namespace {
+// < 0   <=>  lhs < rhs
+// == 0  <=>  lhs == rhs
+// > 0   <=>  lhs > rhs
+// TODO(icanadi) move this to JSONDocument?
+int DocumentCompare(const JSONDocument& lhs, const JSONDocument& rhs) {
+  assert(lhs.IsObject() == false && rhs.IsObject() == false &&
+         lhs.type() == rhs.type());
+
+  switch (lhs.type()) {
+    case JSONDocument::kNull:
+      return 0;
+    case JSONDocument::kBool:
+      return static_cast<int>(lhs.GetBool()) - static_cast<int>(rhs.GetBool());
+    case JSONDocument::kDouble: {
+      double res = lhs.GetDouble() - rhs.GetDouble();
+      return res == 0.0 ? 0 : (res < 0.0 ? -1 : 1);
+    }
+    case JSONDocument::kInt64: {
+      int64_t res = lhs.GetInt64() - rhs.GetInt64();
+      return res == 0 ? 0 : (res < 0 ? -1 : 1);
+    }
+    case JSONDocument::kString:
+      return Slice(lhs.GetString()).compare(Slice(rhs.GetString()));
+    default:
+      assert(false);
+  }
+  return 0;
+}
+}  // namespace
+
+class Filter {
+ public:
+  // returns nullptr on parse failure
+  static Filter* ParseFilter(const JSONDocument& filter);
+
+  struct Interval {
+    JSONDocument upper_bound;
+    JSONDocument lower_bound;
+    bool upper_inclusive;
+    bool lower_inclusive;
+    Interval()
+        : upper_bound(),
+          lower_bound(),
+          upper_inclusive(false),
+          lower_inclusive(false) {}
+    Interval(const JSONDocument& ub, const JSONDocument& lb, bool ui, bool li)
+        : upper_bound(ub),
+          lower_bound(lb),
+          upper_inclusive(ui),
+          lower_inclusive(li) {
+    }
+
+    void UpdateUpperBound(const JSONDocument& ub, bool inclusive);
+    void UpdateLowerBound(const JSONDocument& lb, bool inclusive);
+  };
+
+  bool SatisfiesFilter(const JSONDocument& document) const;
+  const Interval* GetInterval(const std::string& field) const;
+
+ private:
+  explicit Filter(const JSONDocument& filter) : filter_(filter.Copy()) {
+    assert(filter_.IsOwner());
+  }
+
+  // copied from the parameter
+  const JSONDocument filter_;
+  // constant after construction
+  std::unordered_map<std::string, Interval> intervals_;
+};
+
+void Filter::Interval::UpdateUpperBound(const JSONDocument& ub,
+                                        bool inclusive) {
+  bool update = upper_bound.IsNull();
+  if (!update) {
+    int cmp = DocumentCompare(upper_bound, ub);
+    update = (cmp > 0) || (cmp == 0 && !inclusive);
+  }
+  if (update) {
+    upper_bound = ub;
+    upper_inclusive = inclusive;
+  }
+}
+
+void Filter::Interval::UpdateLowerBound(const JSONDocument& lb,
+                                        bool inclusive) {
+  bool update = lower_bound.IsNull();
+  if (!update) {
+    int cmp = DocumentCompare(lower_bound, lb);
+    update = (cmp < 0) || (cmp == 0 && !inclusive);
+  }
+  if (update) {
+    lower_bound = lb;
+    lower_inclusive = inclusive;
+  }
+}
+
+Filter* Filter::ParseFilter(const JSONDocument& filter) {
+  if (filter.IsObject() == false) {
+    return nullptr;
+  }
+
+  std::unique_ptr<Filter> f(new Filter(filter));
+
+  for (const auto& items : f->filter_.Items()) {
+    if (items.first.size() && items.first[0] == '$') {
+      // fields starting with '$' are commands
+      continue;
+    }
+    assert(f->intervals_.find(items.first) == f->intervals_.end());
+    if (items.second.IsObject()) {
+      if (items.second.Count() == 0) {
+        // uhm...?
+        return nullptr;
+      }
+      Interval interval;
+      for (const auto& condition : items.second.Items()) {
+        if (condition.second.IsObject() || condition.second.IsArray()) {
+          // comparison operators are not defined on objects or arrays
+          return nullptr;
+        }
+        // comparison operators:
+        if (condition.first == "$gt") {
+          interval.UpdateLowerBound(condition.second, false);
+        } else if (condition.first == "$gte") {
+          interval.UpdateLowerBound(condition.second, true);
+        } else if (condition.first == "$lt") {
+          interval.UpdateUpperBound(condition.second, false);
+        } else if (condition.first == "$lte") {
+          interval.UpdateUpperBound(condition.second, true);
+        } else {
+          // TODO(icanadi) more logical operators
+          return nullptr;
+        }
+      }
+      f->intervals_.insert({items.first, interval});
+    } else {
+      // equality
+      f->intervals_.insert(
+          {items.first, Interval(items.second,
+                                 items.second, true, true)});
+    }
+  }
+
+  return f.release();
+}
+
+const Filter::Interval* Filter::GetInterval(const std::string& field) const {
+  auto itr = intervals_.find(field);
+  if (itr == intervals_.end()) {
+    return nullptr;
+  }
+  // we can do that since intervals_ is constant after construction
+  return &itr->second;
+}
+
+bool Filter::SatisfiesFilter(const JSONDocument& document) const {
+  for (const auto& interval : intervals_) {
+    if (!document.Contains(interval.first)) {
+      // doesn't have the value, doesn't satisfy the filter
+      // (we don't support null queries yet)
+      return false;
+    }
+    auto value = document[interval.first];
+    if (!interval.second.upper_bound.IsNull()) {
+      if (value.type() != interval.second.upper_bound.type()) {
+        // no cross-type queries yet
+        // TODO(icanadi) do this at least for numbers!
+        return false;
+      }
+      int cmp = DocumentCompare(interval.second.upper_bound, value);
+      if (cmp < 0 || (cmp == 0 && interval.second.upper_inclusive == false)) {
+        // bigger (or equal) than upper bound
+        return false;
+      }
+    }
+    if (!interval.second.lower_bound.IsNull()) {
+      if (value.type() != interval.second.lower_bound.type()) {
+        // no cross-type queries yet
+        return false;
+      }
+      int cmp = DocumentCompare(interval.second.lower_bound, value);
+      if (cmp > 0 || (cmp == 0 && interval.second.lower_inclusive == false)) {
+        // smaller (or equal) than the lower bound
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+class Index {
+ public:
+  Index() = default;
+  virtual ~Index() {}
+
+  virtual const char* Name() const = 0;
+
+  // Functions that are executed during write time
+  // ---------------------------------------------
+  // GetIndexKey() generates a key that will be used to index document and
+  // returns the key through the second std::string* parameter
+  virtual void GetIndexKey(const JSONDocument& document,
+                           std::string* key) const = 0;
+  // Keys generated with GetIndexKey() will be compared using this comparator.
+  // It should be assumed that there will be a suffix added to the index key
+  // according to IndexKey implementation
+  virtual const Comparator* GetComparator() const = 0;
+
+  // Functions that are executed during query time
+  // ---------------------------------------------
+  enum Direction {
+    kForwards,
+    kBackwards,
+  };
+  // Returns true if this index can provide some optimization for satisfying
+  // filter. False otherwise
+  virtual bool UsefulIndex(const Filter& filter) const = 0;
+  // For every filter (assuming UsefulIndex()) there is a continuous interval of
+  // keys in the index that satisfy the index conditions. That interval can be
+  // three things:
+  // * [A, B]
+  // * [A, infinity>
+  // * <-infinity, B]
+  //
+  // Query engine that uses this Index for optimization will access the interval
+  // by first calling Position() and then iterating in the Direction (returned
+  // by Position()) while ShouldContinueLooking() is true.
+  // * For [A, B] interval Position() will Seek() to A and return kForwards.
+  // ShouldContinueLooking() will be true until the iterator value gets beyond B
+  // -- then it will return false
+  // * For [A, infinity> Position() will Seek() to A and return kForwards.
+  // ShouldContinueLooking() will always return true
+  // * For <-infinity, B] Position() will Seek() to B and return kBackwards.
+  // ShouldContinueLooking() will always return true (given that iterator is
+  // advanced by calling Prev())
+  virtual Direction Position(const Filter& filter,
+                             Iterator* iterator) const = 0;
+  virtual bool ShouldContinueLooking(const Filter& filter,
+                                     const Slice& secondary_key,
+                                     Direction direction) const = 0;
+
+  // Static function that is executed when Index is created
+  // ---------------------------------------------
+  // Create Index from user-supplied description. Return nullptr on parse
+  // failure.
+  static Index* CreateIndexFromDescription(const JSONDocument& description,
+                                           const std::string& name);
+
+ private:
+  // No copying allowed
+  Index(const Index&);
+  void operator=(const Index&);
+};
+
+// Encoding helper function
+namespace {
+std::string InternalSecondaryIndexName(const std::string& user_name) {
+  return "index_" + user_name;
+}
+
+// Don't change these, they are persisted in secondary indexes
+enum JSONPrimitivesEncoding : char {
+  kNull = 0x1,
+  kBool = 0x2,
+  kDouble = 0x3,
+  kInt64 = 0x4,
+  kString = 0x5,
+};
+
+// encodes simple JSON members (meaning string, integer, etc)
+// the end result of this will be lexicographically compared to each other
+bool EncodeJSONPrimitive(const JSONDocument& json, std::string* dst) {
+  // TODO(icanadi) revise this at some point, have a custom comparator
+  switch (json.type()) {
+    case JSONDocument::kNull:
+      dst->push_back(kNull);
+      break;
+    case JSONDocument::kBool:
+      dst->push_back(kBool);
+      dst->push_back(static_cast<char>(json.GetBool()));
+      break;
+    case JSONDocument::kDouble:
+      dst->push_back(kDouble);
+      PutFixed64(dst, static_cast<uint64_t>(json.GetDouble()));
+      break;
+    case JSONDocument::kInt64:
+      dst->push_back(kInt64);
+      {
+        auto val = json.GetInt64();
+        dst->push_back((val < 0) ? '0' : '1');
+        PutFixed64(dst, static_cast<uint64_t>(val));
+      }
+      break;
+    case JSONDocument::kString:
+      dst->push_back(kString);
+      dst->append(json.GetString());
+      break;
+    default:
+      return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+// format of the secondary key is:
+// <secondary_key><primary_key><offset_of_primary_key uint32_t>
+class IndexKey {
+ public:
+  IndexKey() : ok_(false) {}
+  explicit IndexKey(const Slice& slice) {
+    if (slice.size() < sizeof(uint32_t)) {
+      ok_ = false;
+      return;
+    }
+    uint32_t primary_key_offset =
+        DecodeFixed32(slice.data() + slice.size() - sizeof(uint32_t));
+    if (primary_key_offset >= slice.size() - sizeof(uint32_t)) {
+      ok_ = false;
+      return;
+    }
+    parts_[0] = Slice(slice.data(), primary_key_offset);
+    parts_[1] = Slice(slice.data() + primary_key_offset,
+                      slice.size() - primary_key_offset - sizeof(uint32_t));
+    ok_ = true;
+  }
+  IndexKey(const Slice& secondary_key, const Slice& primary_key) : ok_(true) {
+    parts_[0] = secondary_key;
+    parts_[1] = primary_key;
+  }
+
+  SliceParts GetSliceParts() {
+    uint32_t primary_key_offset = static_cast<uint32_t>(parts_[0].size());
+    EncodeFixed32(primary_key_offset_buf_, primary_key_offset);
+    parts_[2] = Slice(primary_key_offset_buf_, sizeof(uint32_t));
+    return SliceParts(parts_, 3);
+  }
+
+  const Slice& GetPrimaryKey() const { return parts_[1]; }
+  const Slice& GetSecondaryKey() const { return parts_[0]; }
+
+  bool ok() const { return ok_; }
+
+ private:
+  bool ok_;
+  // 0 -- secondary key
+  // 1 -- primary key
+  // 2 -- primary key offset
+  Slice parts_[3];
+  char primary_key_offset_buf_[sizeof(uint32_t)];
+};
+
+class SimpleSortedIndex : public Index {
+ public:
+  SimpleSortedIndex(const std::string& field, const std::string& name)
+      : field_(field), name_(name) {}
+
+  virtual const char* Name() const override { return name_.c_str(); }
+
+  virtual void GetIndexKey(const JSONDocument& document, std::string* key) const
+      override {
+    if (!document.Contains(field_)) {
+      if (!EncodeJSONPrimitive(JSONDocument(JSONDocument::kNull), key)) {
+        assert(false);
+      }
+    } else {
+      if (!EncodeJSONPrimitive(document[field_], key)) {
+        assert(false);
+      }
+    }
+  }
+  virtual const Comparator* GetComparator() const override {
+    return BytewiseComparator();
+  }
+
+  virtual bool UsefulIndex(const Filter& filter) const override {
+    return filter.GetInterval(field_) != nullptr;
+  }
+  // REQUIRES: UsefulIndex(filter) == true
+  virtual Direction Position(const Filter& filter,
+                             Iterator* iterator) const override {
+    auto interval = filter.GetInterval(field_);
+    assert(interval != nullptr);  // because index is useful
+    Direction direction;
+
+    const JSONDocument* limit;
+    if (!interval->lower_bound.IsNull()) {
+      limit = &(interval->lower_bound);
+      direction = kForwards;
+    } else {
+      limit = &(interval->upper_bound);
+      direction = kBackwards;
+    }
+
+    std::string encoded_limit;
+    if (!EncodeJSONPrimitive(*limit, &encoded_limit)) {
+      assert(false);
+    }
+    iterator->Seek(Slice(encoded_limit));
+
+    return direction;
+  }
+  // REQUIRES: UsefulIndex(filter) == true
+  virtual bool ShouldContinueLooking(
+      const Filter& filter, const Slice& secondary_key,
+      Index::Direction direction) const override {
+    auto interval = filter.GetInterval(field_);
+    assert(interval != nullptr);  // because index is useful
+    if (direction == kForwards) {
+      if (interval->upper_bound.IsNull()) {
+        // continue looking, no upper bound
+        return true;
+      }
+      std::string encoded_upper_bound;
+      if (!EncodeJSONPrimitive(interval->upper_bound, &encoded_upper_bound)) {
+        // uhm...?
+        // TODO(icanadi) store encoded upper and lower bounds in Filter*?
+        assert(false);
+      }
+      // TODO(icanadi) we need to somehow decode this and use DocumentCompare()
+      int compare = secondary_key.compare(Slice(encoded_upper_bound));
+      // if (current key is bigger than upper bound) OR (current key is equal to
+      // upper bound, but inclusive is false) THEN stop looking. otherwise,
+      // continue
+      return (compare > 0 ||
+              (compare == 0 && interval->upper_inclusive == false))
+                 ? false
+                 : true;
+    } else {
+      assert(direction == kBackwards);
+      if (interval->lower_bound.IsNull()) {
+        // continue looking, no lower bound
+        return true;
+      }
+      std::string encoded_lower_bound;
+      if (!EncodeJSONPrimitive(interval->lower_bound, &encoded_lower_bound)) {
+        // uhm...?
+        // TODO(icanadi) store encoded upper and lower bounds in Filter*?
+        assert(false);
+      }
+      // TODO(icanadi) we need to somehow decode this and use DocumentCompare()
+      int compare = secondary_key.compare(Slice(encoded_lower_bound));
+      // if (current key is smaller than lower bound) OR (current key is equal
+      // to lower bound, but inclusive is false) THEN stop looking. otherwise,
+      // continue
+      return (compare < 0 ||
+              (compare == 0 && interval->lower_inclusive == false))
+                 ? false
+                 : true;
+    }
+
+    assert(false);
+    // this is here just so compiler doesn't complain
+    return false;
+  }
+
+ private:
+  std::string field_;
+  std::string name_;
+};
+
+Index* Index::CreateIndexFromDescription(const JSONDocument& description,
+                                         const std::string& name) {
+  if (!description.IsObject() || description.Count() != 1) {
+    // not supported yet
+    return nullptr;
+  }
+  const auto& field = *description.Items().begin();
+  if (field.second.IsInt64() == false || field.second.GetInt64() != 1) {
+    // not supported yet
+    return nullptr;
+  }
+  return new SimpleSortedIndex(field.first, name);
+}
+
+class CursorWithFilterIndexed : public Cursor {
+ public:
+  CursorWithFilterIndexed(Iterator* primary_index_iter,
+                          Iterator* secondary_index_iter, const Index* index,
+                          const Filter* filter)
+      : primary_index_iter_(primary_index_iter),
+        secondary_index_iter_(secondary_index_iter),
+        index_(index),
+        filter_(filter),
+        valid_(true),
+        current_json_document_(nullptr) {
+    assert(filter_.get() != nullptr);
+    direction_ = index->Position(*filter_.get(), secondary_index_iter_.get());
+    UpdateIndexKey();
+    AdvanceUntilSatisfies();
+  }
+
+  virtual bool Valid() const override {
+    return valid_ && secondary_index_iter_->Valid();
+  }
+  virtual void Next() override {
+    assert(Valid());
+    Advance();
+    AdvanceUntilSatisfies();
+  }
+  // temporary object. copy it if you want to use it
+  virtual const JSONDocument& document() const override {
+    assert(Valid());
+    return *current_json_document_;
+  }
+  virtual Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    }
+    if (!primary_index_iter_->status().ok()) {
+      return primary_index_iter_->status();
+    }
+    return secondary_index_iter_->status();
+  }
+
+ private:
+  void Advance() {
+    if (direction_ == Index::kForwards) {
+      secondary_index_iter_->Next();
+    } else {
+      secondary_index_iter_->Prev();
+    }
+    UpdateIndexKey();
+  }
+  void AdvanceUntilSatisfies() {
+    bool found = false;
+    while (secondary_index_iter_->Valid() &&
+           index_->ShouldContinueLooking(
+               *filter_.get(), index_key_.GetSecondaryKey(), direction_)) {
+      if (!UpdateJSONDocument()) {
+        // corruption happened
+        return;
+      }
+      if (filter_->SatisfiesFilter(*current_json_document_)) {
+        // we found satisfied!
+        found = true;
+        break;
+      } else {
+        // doesn't satisfy :(
+        Advance();
+      }
+    }
+    if (!found) {
+      valid_ = false;
+    }
+  }
+
+  bool UpdateJSONDocument() {
+    assert(secondary_index_iter_->Valid());
+    primary_index_iter_->Seek(index_key_.GetPrimaryKey());
+    if (!primary_index_iter_->Valid()) {
+      status_ = Status::Corruption(
+          "Inconsistency between primary and secondary index");
+      valid_ = false;
+      return false;
+    }
+    current_json_document_.reset(
+        JSONDocument::Deserialize(primary_index_iter_->value()));
+    assert(current_json_document_->IsOwner());
+    if (current_json_document_.get() == nullptr) {
+      status_ = Status::Corruption("JSON deserialization failed");
+      valid_ = false;
+      return false;
+    }
+    return true;
+  }
+  void UpdateIndexKey() {
+    if (secondary_index_iter_->Valid()) {
+      index_key_ = IndexKey(secondary_index_iter_->key());
+      if (!index_key_.ok()) {
+        status_ = Status::Corruption("Invalid index key");
+        valid_ = false;
+      }
+    }
+  }
+  std::unique_ptr<Iterator> primary_index_iter_;
+  std::unique_ptr<Iterator> secondary_index_iter_;
+  // we don't own index_
+  const Index* index_;
+  Index::Direction direction_;
+  std::unique_ptr<const Filter> filter_;
+  bool valid_;
+  IndexKey index_key_;
+  std::unique_ptr<JSONDocument> current_json_document_;
+  Status status_;
+};
+
+class CursorFromIterator : public Cursor {
+ public:
+  explicit CursorFromIterator(Iterator* iter)
+      : iter_(iter), current_json_document_(nullptr) {
+    iter_->SeekToFirst();
+    UpdateCurrentJSON();
+  }
+
+  virtual bool Valid() const override { return status_.ok() && iter_->Valid(); }
+  virtual void Next() override {
+    iter_->Next();
+    UpdateCurrentJSON();
+  }
+  virtual const JSONDocument& document() const override {
+    assert(Valid());
+    return *current_json_document_;
+  };
+  virtual Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    }
+    return iter_->status();
+  }
+
+  // not part of public Cursor interface
+  Slice key() const { return iter_->key(); }
+
+ private:
+  void UpdateCurrentJSON() {
+    if (Valid()) {
+      current_json_document_.reset(JSONDocument::Deserialize(iter_->value()));
+      if (current_json_document_.get() == nullptr) {
+        status_ = Status::Corruption("JSON deserialization failed");
+      }
+    }
+  }
+
+  Status status_;
+  std::unique_ptr<Iterator> iter_;
+  std::unique_ptr<JSONDocument> current_json_document_;
+};
+
+class CursorWithFilter : public Cursor {
+ public:
+  CursorWithFilter(Cursor* base_cursor, const Filter* filter)
+      : base_cursor_(base_cursor), filter_(filter) {
+    assert(filter_.get() != nullptr);
+    SeekToNextSatisfies();
+  }
+  virtual bool Valid() const override { return base_cursor_->Valid(); }
+  virtual void Next() override {
+    assert(Valid());
+    base_cursor_->Next();
+    SeekToNextSatisfies();
+  }
+  virtual const JSONDocument& document() const override {
+    assert(Valid());
+    return base_cursor_->document();
+  }
+  virtual Status status() const override { return base_cursor_->status(); }
+
+ private:
+  void SeekToNextSatisfies() {
+    for (; base_cursor_->Valid(); base_cursor_->Next()) {
+      if (filter_->SatisfiesFilter(base_cursor_->document())) {
+        break;
+      }
+    }
+  }
+  std::unique_ptr<Cursor> base_cursor_;
+  std::unique_ptr<const Filter> filter_;
+};
+
+class CursorError : public Cursor {
+ public:
+  explicit CursorError(Status s) : s_(s) { assert(!s.ok()); }
+  virtual Status status() const override { return s_; }
+  virtual bool Valid() const override { return false; }
+  virtual void Next() override {}
+  virtual const JSONDocument& document() const override {
+    assert(false);
+    // compiler complains otherwise
+    return trash_;
+  }
+
+ private:
+  Status s_;
+  JSONDocument trash_;
+};
+
+class DocumentDBImpl : public DocumentDB {
+ public:
+  DocumentDBImpl(
+      DB* db, ColumnFamilyHandle* primary_key_column_family,
+      const std::vector<std::pair<Index*, ColumnFamilyHandle*>>& indexes,
+      const Options& rocksdb_options)
+      : DocumentDB(db),
+        primary_key_column_family_(primary_key_column_family),
+        rocksdb_options_(rocksdb_options) {
+    for (const auto& index : indexes) {
+      name_to_index_.insert(
+          {index.first->Name(), IndexColumnFamily(index.first, index.second)});
+    }
+  }
+
+  ~DocumentDBImpl() {
+    for (auto& iter : name_to_index_) {
+      delete iter.second.index;
+      delete iter.second.column_family;
+    }
+    delete primary_key_column_family_;
+  }
+
+  virtual Status CreateIndex(const WriteOptions& write_options,
+                             const IndexDescriptor& index) override {
+    auto index_obj =
+        Index::CreateIndexFromDescription(*index.description, index.name);
+    if (index_obj == nullptr) {
+      return Status::InvalidArgument("Failed parsing index description");
+    }
+
+    ColumnFamilyHandle* cf_handle;
+    Status s =
+        CreateColumnFamily(ColumnFamilyOptions(rocksdb_options_),
+                           InternalSecondaryIndexName(index.name), &cf_handle);
+    if (!s.ok()) {
+      delete index_obj;
+      return s;
+    }
+
+    MutexLock l(&write_mutex_);
+
+    std::unique_ptr<CursorFromIterator> cursor(new CursorFromIterator(
+        DocumentDB::NewIterator(ReadOptions(), primary_key_column_family_)));
+
+    WriteBatch batch;
+    for (; cursor->Valid(); cursor->Next()) {
+      std::string secondary_index_key;
+      index_obj->GetIndexKey(cursor->document(), &secondary_index_key);
+      IndexKey index_key(Slice(secondary_index_key), cursor->key());
+      batch.Put(cf_handle, index_key.GetSliceParts(), SliceParts());
+    }
+
+    if (!cursor->status().ok()) {
+      delete index_obj;
+      return cursor->status();
+    }
+
+    {
+      MutexLock l_nti(&name_to_index_mutex_);
+      name_to_index_.insert(
+          {index.name, IndexColumnFamily(index_obj, cf_handle)});
+    }
+
+    return DocumentDB::Write(write_options, &batch);
+  }
+
+  virtual Status DropIndex(const std::string& name) override {
+    MutexLock l(&write_mutex_);
+
+    auto index_iter = name_to_index_.find(name);
+    if (index_iter == name_to_index_.end()) {
+      return Status::InvalidArgument("No such index");
+    }
+
+    Status s = DropColumnFamily(index_iter->second.column_family);
+    if (!s.ok()) {
+      return s;
+    }
+
+    delete index_iter->second.index;
+    delete index_iter->second.column_family;
+
+    // remove from name_to_index_
+    {
+      MutexLock l_nti(&name_to_index_mutex_);
+      name_to_index_.erase(index_iter);
+    }
+
+    return Status::OK();
+  }
+
  // Inserts one document. The document must be a JSON object whose
  // kPrimaryKey ("_id") field is a non-null string or int64 and is not
  // already present. The primary record plus one entry per secondary index
  // are committed atomically through a single WriteBatch.
  virtual Status Insert(const WriteOptions& options,
                        const JSONDocument& document) override {
    WriteBatch batch;

    if (!document.IsObject()) {
      return Status::InvalidArgument("Document not an object");
    }
    if (!document.Contains(kPrimaryKey)) {
      return Status::InvalidArgument("No primary key");
    }
    auto primary_key = document[kPrimaryKey];
    if (primary_key.IsNull() ||
        (!primary_key.IsString() && !primary_key.IsInt64())) {
      return Status::InvalidArgument(
          "Primary key format error");
    }
    std::string encoded_document;
    document.Serialize(&encoded_document);
    std::string primary_key_encoded;
    if (!EncodeJSONPrimitive(primary_key, &primary_key_encoded)) {
      // previous call should be guaranteed to pass because of all primary_key
      // conditions checked before
      assert(false);
    }
    Slice primary_key_slice(primary_key_encoded);

    // Lock now, since we're starting DB operations
    MutexLock l(&write_mutex_);
    // check if there is already a document with the same primary key
    std::string value;
    Status s = DocumentDB::Get(ReadOptions(), primary_key_column_family_,
                               primary_key_slice, &value);
    if (!s.IsNotFound()) {
      // Either the key already exists (duplicate) or the read itself failed.
      return s.ok() ? Status::InvalidArgument("Duplicate primary key!") : s;
    }

    batch.Put(primary_key_column_family_, primary_key_slice, encoded_document);

    // Reading name_to_index_ is safe here because write_mutex_ is held
    // (see the locking notes next to the member declaration).
    for (const auto& iter : name_to_index_) {
      std::string secondary_index_key;
      iter.second.index->GetIndexKey(document, &secondary_index_key);
      IndexKey index_key(Slice(secondary_index_key), primary_key_slice);
      batch.Put(iter.second.column_family, index_key.GetSliceParts(),
                SliceParts());
    }

    return DocumentDB::Write(options, &batch);
  }
+
  // Deletes every document matched by `query`, together with all of its
  // secondary-index entries, in one atomic WriteBatch. Returns Corruption
  // when a matched document is not an object or lacks a valid primary key.
  virtual Status Remove(const ReadOptions& read_options,
                        const WriteOptions& write_options,
                        const JSONDocument& query) override {
    MutexLock l(&write_mutex_);
    std::unique_ptr<Cursor> cursor(
        ConstructFilterCursor(read_options, nullptr, query));

    WriteBatch batch;
    for (; cursor->status().ok() && cursor->Valid(); cursor->Next()) {
      const auto& document = cursor->document();
      if (!document.IsObject()) {
        return Status::Corruption("Document corruption");
      }
      if (!document.Contains(kPrimaryKey)) {
        return Status::Corruption("Document corruption");
      }
      auto primary_key = document[kPrimaryKey];
      if (primary_key.IsNull() ||
          (!primary_key.IsString() && !primary_key.IsInt64())) {
        return Status::Corruption("Document corruption");
      }

      // TODO(icanadi) Instead of doing this, just get primary key encoding from
      // cursor, as it already has this information
      std::string primary_key_encoded;
      if (!EncodeJSONPrimitive(primary_key, &primary_key_encoded)) {
        // previous call should be guaranteed to pass because of all primary_key
        // conditions checked before
        assert(false);
      }
      Slice primary_key_slice(primary_key_encoded);
      batch.Delete(primary_key_column_family_, primary_key_slice);

      // Remove the matching entry from every secondary index as well.
      for (const auto& iter : name_to_index_) {
        std::string secondary_index_key;
        iter.second.index->GetIndexKey(document, &secondary_index_key);
        IndexKey index_key(Slice(secondary_index_key), primary_key_slice);
        batch.Delete(iter.second.column_family, index_key.GetSliceParts());
      }
    }

    if (!cursor->status().ok()) {
      return cursor->status();
    }

    return DocumentDB::Write(write_options, &batch);
  }
+
  // Applies `updates` to every document matched by `filter`. Only the
  // "$set" command is supported, and the primary key may not be changed.
  // All primary-record rewrites and secondary-index adjustments are batched
  // during the cursor scan and committed in one WriteBatch at the end.
  virtual Status Update(const ReadOptions& read_options,
                        const WriteOptions& write_options,
                        const JSONDocument& filter,
                        const JSONDocument& updates) override {
    MutexLock l(&write_mutex_);
    std::unique_ptr<Cursor> cursor(
        ConstructFilterCursor(read_options, nullptr, filter));

    if (!updates.IsObject()) {
        return Status::Corruption("Bad update document format");
    }
    WriteBatch batch;
    for (; cursor->status().ok() && cursor->Valid(); cursor->Next()) {
      const auto& old_document = cursor->document();
      JSONDocument new_document(old_document);
      if (!new_document.IsObject()) {
        return Status::Corruption("Document corruption");
      }
      // TODO(icanadi) Make this nicer, something like class Filter
      for (const auto& update : updates.Items()) {
        if (update.first == "$set") {
          // First materialize the "$set" argument as a standalone document,
          // rejecting any attempt to touch the primary key...
          JSONDocumentBuilder builder;
          bool res __attribute__((unused)) = builder.WriteStartObject();
          assert(res);
          for (const auto& itr : update.second.Items()) {
            if (itr.first == kPrimaryKey) {
              return Status::NotSupported("Please don't change primary key");
            }
            res = builder.WriteKeyValue(itr.first, itr.second);
            assert(res);
          }
          res = builder.WriteEndObject();
          assert(res);
          JSONDocument update_document = builder.GetJSONDocument();
          builder.Reset();
          res = builder.WriteStartObject();
          assert(res);
          // ...then rebuild the document field by field, preferring values
          // from the update document where present.
          for (const auto& itr : new_document.Items()) {
            if (update_document.Contains(itr.first)) {
              res = builder.WriteKeyValue(itr.first,
                                          update_document[itr.first]);
            } else {
              res = builder.WriteKeyValue(itr.first, new_document[itr.first]);
            }
            assert(res);
          }
          res = builder.WriteEndObject();
          assert(res);
          new_document = builder.GetJSONDocument();
          assert(new_document.IsOwner());
        } else {
          // TODO(icanadi) more commands
          return Status::InvalidArgument("Can't understand update command");
        }
      }

      // TODO(icanadi) reuse some of this code
      if (!new_document.Contains(kPrimaryKey)) {
        return Status::Corruption("Corrupted document -- primary key missing");
      }
      auto primary_key = new_document[kPrimaryKey];
      if (primary_key.IsNull() ||
          (!primary_key.IsString() && !primary_key.IsInt64())) {
        // This will happen when document on storage doesn't have primary key,
        // since we don't support any update operations on primary key. That's
        // why this is corruption error
        return Status::Corruption("Corrupted document -- primary key missing");
      }
      std::string encoded_document;
      new_document.Serialize(&encoded_document);
      std::string primary_key_encoded;
      if (!EncodeJSONPrimitive(primary_key, &primary_key_encoded)) {
        // previous call should be guaranteed to pass because of all primary_key
        // conditions checked before
        assert(false);
      }
      Slice primary_key_slice(primary_key_encoded);
      // Overwrite the primary record with the rebuilt document.
      batch.Put(primary_key_column_family_, primary_key_slice,
                encoded_document);

      // Only indexes whose key actually changed need a delete+put.
      for (const auto& iter : name_to_index_) {
        std::string old_key, new_key;
        iter.second.index->GetIndexKey(old_document, &old_key);
        iter.second.index->GetIndexKey(new_document, &new_key);
        if (old_key == new_key) {
          // don't need to update this secondary index
          continue;
        }

        IndexKey old_index_key(Slice(old_key), primary_key_slice);
        IndexKey new_index_key(Slice(new_key), primary_key_slice);

        batch.Delete(iter.second.column_family, old_index_key.GetSliceParts());
        batch.Put(iter.second.column_family, new_index_key.GetSliceParts(),
                  SliceParts());
      }
    }

    if (!cursor->status().ok()) {
      return cursor->status();
    }

    return DocumentDB::Write(write_options, &batch);
  }
+
  // Executes `query`, which must be a JSON array of commands; each command
  // is an object with exactly one key (the command name). Only "$filter" is
  // supported; successive filters are stacked on top of the previous
  // cursor. An empty query yields a full scan of the primary column family.
  // Errors are reported via a returned CursorError.
  virtual Cursor* Query(const ReadOptions& read_options,
                        const JSONDocument& query) override {
    Cursor* cursor = nullptr;

    if (!query.IsArray()) {
      return new CursorError(
          Status::InvalidArgument("Query has to be an array"));
    }

    // TODO(icanadi) support index "_id"
    for (size_t i = 0; i < query.Count(); ++i) {
      const auto& command_doc = query[i];
      if (command_doc.Count() != 1) {
        // there can be only one key-value pair in each of array elements.
        // key is the command and value are the params
        delete cursor;
        return new CursorError(Status::InvalidArgument("Invalid query"));
      }
      const auto& command = *command_doc.Items().begin();

      if (command.first == "$filter") {
        // ConstructFilterCursor takes ownership of the previous cursor.
        cursor = ConstructFilterCursor(read_options, cursor, command.second);
      } else {
        // only filter is supported for now
        delete cursor;
        return new CursorError(Status::InvalidArgument("Invalid query"));
      }
    }

    if (cursor == nullptr) {
      cursor = new CursorFromIterator(
          DocumentDB::NewIterator(read_options, primary_key_column_family_));
    }

    return cursor;
  }
+
  // RocksDB functions
  // Raw key-value access is deliberately disabled on the public interface so
  // callers cannot bypass the primary-key / secondary-index bookkeeping; the
  // implementation itself goes through the DocumentDB:: (base-class)
  // versions of these calls.
  virtual Status Get(const ReadOptions& options,
                     ColumnFamilyHandle* column_family, const Slice& key,
                     std::string* value) override {
    return Status::NotSupported("");
  }
  virtual Status Get(const ReadOptions& options, const Slice& key,
                     std::string* value) override {
    return Status::NotSupported("");
  }
  virtual Status Write(const WriteOptions& options,
                       WriteBatch* updates) override {
    return Status::NotSupported("");
  }
  virtual Iterator* NewIterator(const ReadOptions& options,
                                ColumnFamilyHandle* column_family) override {
    return nullptr;
  }
  virtual Iterator* NewIterator(const ReadOptions& options) override {
    return nullptr;
  }
+
+ private:
  // Builds the cursor for one "$filter" command. If `cursor` is null and
  // the filter names a usable "$index", an index-backed cursor over both
  // the index and primary column families is created; otherwise the filter
  // is layered on top of either the given cursor or a full primary scan.
  // Takes ownership of `cursor`.
  Cursor* ConstructFilterCursor(ReadOptions read_options, Cursor* cursor,
                                const JSONDocument& query) {
    std::unique_ptr<const Filter> filter(Filter::ParseFilter(query));
    if (filter.get() == nullptr) {
      return new CursorError(Status::InvalidArgument("Invalid query"));
    }

    // Holds a copy of the looked-up map entry so the pointer used below
    // stays valid after name_to_index_mutex_ is released.
    IndexColumnFamily tmp_storage(nullptr, nullptr);

    if (cursor == nullptr) {
      IndexColumnFamily* index_column_family = nullptr;
      // A non-string "$index" value is silently ignored here and the query
      // falls back to a full scan.
      if (query.Contains("$index") && query["$index"].IsString()) {
        {
          auto index_name = query["$index"];
          MutexLock l(&name_to_index_mutex_);
          auto index_iter = name_to_index_.find(index_name.GetString());
          if (index_iter != name_to_index_.end()) {
            tmp_storage = index_iter->second;
            index_column_family = &tmp_storage;
          } else {
            return new CursorError(
                Status::InvalidArgument("Index does not exist"));
          }
        }
      }

      if (index_column_family != nullptr &&
          index_column_family->index->UsefulIndex(*filter.get())) {
        std::vector<Iterator*> iterators;
        Status s = DocumentDB::NewIterators(
            read_options,
            {primary_key_column_family_, index_column_family->column_family},
            &iterators);
        if (!s.ok()) {
          // NOTE(review): cursor is known to be null in this branch, so this
          // delete is a no-op kept for symmetry.
          delete cursor;
          return new CursorError(s);
        }
        assert(iterators.size() == 2);
        return new CursorWithFilterIndexed(iterators[0], iterators[1],
                                           index_column_family->index,
                                           filter.release());
      } else {
        return new CursorWithFilter(
            new CursorFromIterator(DocumentDB::NewIterator(
                read_options, primary_key_column_family_)),
            filter.release());
      }
    } else {
      return new CursorWithFilter(cursor, filter.release());
    }
    // Unreachable: every branch above returns.
    assert(false);
    return nullptr;
  }
+
  // currently, we lock and serialize all writes to rocksdb. reads are not
  // locked and always get consistent view of the database. we should optimize
  // locking in the future
  port::Mutex write_mutex_;          // serializes all document-level writes
  port::Mutex name_to_index_mutex_;  // guards name_to_index_ (rules below)
  // Field name under which every document stores its primary key.
  const char* kPrimaryKey = "_id";
  // Pairs a secondary-index implementation with the column family that
  // stores its entries.
  struct IndexColumnFamily {
    IndexColumnFamily(Index* _index, ColumnFamilyHandle* _column_family)
        : index(_index), column_family(_column_family) {}
    Index* index;
    ColumnFamilyHandle* column_family;
  };


  // name_to_index_ protected:
  // 1) when writing -- 1. lock write_mutex_, 2. lock name_to_index_mutex_
  // 2) when reading -- lock name_to_index_mutex_ OR write_mutex_
  std::unordered_map<std::string, IndexColumnFamily> name_to_index_;
  ColumnFamilyHandle* primary_key_column_family_;  // primary-key -> document
  Options rocksdb_options_;  // rocksdb options this DB was opened with
+};
+
+namespace {
+Options GetRocksDBOptionsFromOptions(const DocumentDBOptions& options) {
+  Options rocksdb_options;
+  rocksdb_options.max_background_compactions = options.background_threads - 1;
+  rocksdb_options.max_background_flushes = 1;
+  rocksdb_options.write_buffer_size = options.memtable_size;
+  rocksdb_options.max_write_buffer_number = 6;
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(options.cache_size);
+  rocksdb_options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  return rocksdb_options;
+}
+}  // namespace
+
// Opens (or creates) a DocumentDB at `name`. The default column family holds
// the primary-key -> document mapping; each descriptor in `indexes` must
// correspond to a column family created by an earlier CreateIndex call.
// On success *db owns base_db and all column family handles.
Status DocumentDB::Open(const DocumentDBOptions& options,
                        const std::string& name,
                        const std::vector<DocumentDB::IndexDescriptor>& indexes,
                        DocumentDB** db, bool read_only) {
  Options rocksdb_options = GetRocksDBOptionsFromOptions(options);
  rocksdb_options.create_if_missing = true;

  std::vector<ColumnFamilyDescriptor> column_families;
  column_families.push_back(ColumnFamilyDescriptor(
      kDefaultColumnFamilyName, ColumnFamilyOptions(rocksdb_options)));
  for (const auto& index : indexes) {
    column_families.emplace_back(InternalSecondaryIndexName(index.name),
                                 ColumnFamilyOptions(rocksdb_options));
  }
  std::vector<ColumnFamilyHandle*> handles;
  DB* base_db;
  Status s;
  if (read_only) {
    s = DB::OpenForReadOnly(DBOptions(rocksdb_options), name, column_families,
                            &handles, &base_db);
  } else {
    s = DB::Open(DBOptions(rocksdb_options), name, column_families, &handles,
                 &base_db);
  }
  if (!s.ok()) {
    return s;
  }

  // handles are returned in descriptor order: handles[0] is the default
  // column family, handles[i + 1] belongs to indexes[i].
  std::vector<std::pair<Index*, ColumnFamilyHandle*>> index_cf(indexes.size());
  assert(handles.size() == indexes.size() + 1);
  for (size_t i = 0; i < indexes.size(); ++i) {
    auto index = Index::CreateIndexFromDescription(*indexes[i].description,
                                                   indexes[i].name);
    index_cf[i] = {index, handles[i + 1]};
  }
  *db = new DocumentDBImpl(base_db, handles[0], index_cf, rocksdb_options);
  return Status::OK();
}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/document/document_db_test.cc b/src/rocksdb/utilities/document/document_db_test.cc
new file mode 100644
index 0000000..d02b58f
--- /dev/null
+++ b/src/rocksdb/utilities/document/document_db_test.cc
@@ -0,0 +1,324 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <algorithm>
+
+#include "rocksdb/utilities/json_document.h"
+#include "rocksdb/utilities/document_db.h"
+
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class DocumentDBTest : public testing::Test {
+ public:
+  DocumentDBTest() {
+    dbname_ = test::TmpDir() + "/document_db_test";
+    DestroyDB(dbname_, Options());
+  }
+  ~DocumentDBTest() {
+    delete db_;
+    DestroyDB(dbname_, Options());
+  }
+
+  void AssertCursorIDs(Cursor* cursor, std::vector<int64_t> expected) {
+    std::vector<int64_t> got;
+    while (cursor->Valid()) {
+      ASSERT_TRUE(cursor->Valid());
+      ASSERT_TRUE(cursor->document().Contains("_id"));
+      got.push_back(cursor->document()["_id"].GetInt64());
+      cursor->Next();
+    }
+    std::sort(expected.begin(), expected.end());
+    std::sort(got.begin(), got.end());
+    ASSERT_TRUE(got == expected);
+  }
+
+  // converts ' to ", so that we don't have to escape " all over the place
+  std::string ConvertQuotes(const std::string& input) {
+    std::string output;
+    for (auto x : input) {
+      if (x == '\'') {
+        output.push_back('\"');
+      } else {
+        output.push_back(x);
+      }
+    }
+    return output;
+  }
+
+  void CreateIndexes(std::vector<DocumentDB::IndexDescriptor> indexes) {
+    for (auto i : indexes) {
+      ASSERT_OK(db_->CreateIndex(WriteOptions(), i));
+    }
+  }
+
+  JSONDocument* Parse(const std::string& doc) {
+    return JSONDocument::ParseJSON(ConvertQuotes(doc).c_str());
+  }
+
+  std::string dbname_;
+  DocumentDB* db_;
+};
+
+TEST_F(DocumentDBTest, SimpleQueryTest) {
+  DocumentDBOptions options;
+  DocumentDB::IndexDescriptor index;
+  index.description = Parse("{\"name\": 1}");
+  index.name = "name_index";
+
+  ASSERT_OK(DocumentDB::Open(options, dbname_, {}, &db_));
+  CreateIndexes({index});
+  delete db_;
+  // now there is index present
+  ASSERT_OK(DocumentDB::Open(options, dbname_, {index}, &db_));
+  delete index.description;
+
+  std::vector<std::string> json_objects = {
+      "{\"_id\': 1, \"name\": \"One\"}",   "{\"_id\": 2, \"name\": \"Two\"}",
+      "{\"_id\": 3, \"name\": \"Three\"}", "{\"_id\": 4, \"name\": \"Four\"}"};
+
+  for (auto& json : json_objects) {
+    std::unique_ptr<JSONDocument> document(Parse(json));
+    ASSERT_TRUE(document.get() != nullptr);
+    ASSERT_OK(db_->Insert(WriteOptions(), *document));
+  }
+
+  // inserting a document with existing primary key should return failure
+  {
+    std::unique_ptr<JSONDocument> document(Parse(json_objects[0]));
+    ASSERT_TRUE(document.get() != nullptr);
+    Status s = db_->Insert(WriteOptions(), *document);
+    ASSERT_TRUE(s.IsInvalidArgument());
+  }
+
+  // find equal to "Two"
+  {
+    std::unique_ptr<JSONDocument> query(
+        Parse("[{'$filter': {'name': 'Two', '$index': 'name_index'}}]"));
+    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
+    AssertCursorIDs(cursor.get(), {2});
+  }
+
+  // find less than "Three"
+  {
+    std::unique_ptr<JSONDocument> query(Parse(
+        "[{'$filter': {'name': {'$lt': 'Three'}, '$index': "
+        "'name_index'}}]"));
+    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
+
+    AssertCursorIDs(cursor.get(), {1, 4});
+  }
+
+  // find less than "Three" without index
+  {
+    std::unique_ptr<JSONDocument> query(
+        Parse("[{'$filter': {'name': {'$lt': 'Three'} }}]"));
+    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
+    AssertCursorIDs(cursor.get(), {1, 4});
+  }
+
+  // remove less or equal to "Three"
+  {
+    std::unique_ptr<JSONDocument> query(
+        Parse("{'name': {'$lte': 'Three'}, '$index': 'name_index'}"));
+    ASSERT_OK(db_->Remove(ReadOptions(), WriteOptions(), *query));
+  }
+
+  // find all -- only "Two" left, everything else should be deleted
+  {
+    std::unique_ptr<JSONDocument> query(Parse("[]"));
+    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
+    AssertCursorIDs(cursor.get(), {2});
+  }
+}
+
// Exercises multi-index behavior: compound range filters via different
// indexes, on-the-fly index creation, Remove, repeated Update ("$set"),
// negative-key ranges, and DropIndex.
TEST_F(DocumentDBTest, ComplexQueryTest) {
  DocumentDBOptions options;
  DocumentDB::IndexDescriptor priority_index;
  priority_index.description = Parse("{'priority': 1}");
  priority_index.name = "priority";
  DocumentDB::IndexDescriptor job_name_index;
  job_name_index.description = Parse("{'job_name': 1}");
  job_name_index.name = "job_name";
  DocumentDB::IndexDescriptor progress_index;
  progress_index.description = Parse("{'progress': 1}");
  progress_index.name = "progress";

  ASSERT_OK(DocumentDB::Open(options, dbname_, {}, &db_));
  CreateIndexes({priority_index, progress_index});
  delete priority_index.description;
  delete progress_index.description;

  std::vector<std::string> json_objects = {
      "{'_id': 1, 'job_name': 'play', 'priority': 10, 'progress': 14.2}",
      "{'_id': 2, 'job_name': 'white', 'priority': 2, 'progress': 45.1}",
      "{'_id': 3, 'job_name': 'straw', 'priority': 5, 'progress': 83.2}",
      "{'_id': 4, 'job_name': 'temporary', 'priority': 3, 'progress': 14.9}",
      "{'_id': 5, 'job_name': 'white', 'priority': 4, 'progress': 44.2}",
      "{'_id': 6, 'job_name': 'tea', 'priority': 1, 'progress': 12.4}",
      "{'_id': 7, 'job_name': 'delete', 'priority': 2, 'progress': 77.54}",
      "{'_id': 8, 'job_name': 'rock', 'priority': 3, 'progress': 93.24}",
      "{'_id': 9, 'job_name': 'steady', 'priority': 3, 'progress': 9.1}",
      "{'_id': 10, 'job_name': 'white', 'priority': 1, 'progress': 61.4}",
      "{'_id': 11, 'job_name': 'who', 'priority': 4, 'progress': 39.41}",
      "{'_id': 12, 'job_name': 'who', 'priority': -1, 'progress': 39.42}",
      "{'_id': 13, 'job_name': 'who', 'priority': -2, 'progress': 39.42}", };

  // add index on the fly!
  CreateIndexes({job_name_index});
  delete job_name_index.description;

  for (auto& json : json_objects) {
    std::unique_ptr<JSONDocument> document(Parse(json));
    ASSERT_TRUE(document != nullptr);
    ASSERT_OK(db_->Insert(WriteOptions(), *document));
  }

  // 2 < priority < 4 AND progress > 10.0, index priority
  {
    std::unique_ptr<JSONDocument> query(Parse(
        "[{'$filter': {'priority': {'$lt': 4, '$gt': 2}, 'progress': {'$gt': "
        "10.0}, '$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    AssertCursorIDs(cursor.get(), {4, 8});
  }

  // -1 <= priority <= 1, index priority
  {
    std::unique_ptr<JSONDocument> query(Parse(
        "[{'$filter': {'priority': {'$lte': 1, '$gte': -1},"
        " '$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    AssertCursorIDs(cursor.get(), {6, 10, 12});
  }

  // Same filter as the first query, but via the progress index: results
  // must not depend on which index is chosen.
  // 2 < priority < 4 AND progress > 10.0, index progress
  {
    std::unique_ptr<JSONDocument> query(Parse(
        "[{'$filter': {'priority': {'$lt': 4, '$gt': 2}, 'progress': {'$gt': "
        "10.0}, '$index': 'progress'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    AssertCursorIDs(cursor.get(), {4, 8});
  }

  // job_name == 'white' AND priority >= 2, index job_name
  {
    std::unique_ptr<JSONDocument> query(Parse(
        "[{'$filter': {'job_name': 'white', 'priority': {'$gte': "
        "2}, '$index': 'job_name'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    AssertCursorIDs(cursor.get(), {2, 5});
  }

  // 35.0 <= progress < 65.5, index progress
  {
    std::unique_ptr<JSONDocument> query(Parse(
        "[{'$filter': {'progress': {'$gt': 5.0, '$gte': 35.0, '$lt': 65.5}, "
        "'$index': 'progress'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    AssertCursorIDs(cursor.get(), {2, 5, 10, 11, 12, 13});
  }

  // 2 < priority <= 4, index priority
  {
    std::unique_ptr<JSONDocument> query(Parse(
        "[{'$filter': {'priority': {'$gt': 2, '$lt': 8, '$lte': 4}, "
        "'$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    AssertCursorIDs(cursor.get(), {4, 5, 8, 9, 11});
  }

  // Delete all whose progress is bigger than 50%
  {
    std::unique_ptr<JSONDocument> query(
        Parse("{'progress': {'$gt': 50.0}, '$index': 'progress'}"));
    ASSERT_OK(db_->Remove(ReadOptions(), WriteOptions(), *query));
  }

  // 2 < priority < 6, index priority
  {
    std::unique_ptr<JSONDocument> query(Parse(
        "[{'$filter': {'priority': {'$gt': 2, '$lt': 6}, "
        "'$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    AssertCursorIDs(cursor.get(), {4, 5, 9, 11});
  }

  // update set priority to 10 where job_name is 'white'
  {
    std::unique_ptr<JSONDocument> query(Parse("{'job_name': 'white'}"));
    std::unique_ptr<JSONDocument> update(Parse("{'$set': {'priority': 10}}"));
    ASSERT_OK(db_->Update(ReadOptions(), WriteOptions(), *query, *update));
  }

  // update twice: set priority to 15 where job_name is 'white'
  // NOTE(review): relies on how the JSON parser treats a duplicated "$set"
  // key within one object.
  {
    std::unique_ptr<JSONDocument> query(Parse("{'job_name': 'white'}"));
    std::unique_ptr<JSONDocument> update(Parse("{'$set': {'priority': 10},"
                                               "'$set': {'priority': 15}}"));
    ASSERT_OK(db_->Update(ReadOptions(), WriteOptions(), *query, *update));
  }

  // update twice: set priority to 15 and
  // progress to 40 where job_name is 'white'
  {
    std::unique_ptr<JSONDocument> query(Parse("{'job_name': 'white'}"));
    std::unique_ptr<JSONDocument> update(
        Parse("{'$set': {'priority': 10, 'progress': 35},"
              "'$set': {'priority': 15, 'progress': 40}}"));
    ASSERT_OK(db_->Update(ReadOptions(), WriteOptions(), *query, *update));
  }

  // priority < 0
  {
    std::unique_ptr<JSONDocument> query(
        Parse("[{'$filter': {'priority': {'$lt': 0}, '$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    ASSERT_OK(cursor->status());
    AssertCursorIDs(cursor.get(), {12, 13});
  }

  // -2 < priority < 0
  {
    std::unique_ptr<JSONDocument> query(
        Parse("[{'$filter': {'priority': {'$gt': -2, '$lt': 0},"
        " '$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    ASSERT_OK(cursor->status());
    AssertCursorIDs(cursor.get(), {12});
  }

  // -2 <= priority < 0
  {
    std::unique_ptr<JSONDocument> query(
        Parse("[{'$filter': {'priority': {'$gte': -2, '$lt': 0},"
        " '$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    ASSERT_OK(cursor->status());
    AssertCursorIDs(cursor.get(), {12, 13});
  }

  // 4 < priority
  {
    std::unique_ptr<JSONDocument> query(
        Parse("[{'$filter': {'priority': {'$gt': 4}, '$index': 'priority'}}]"));
    std::unique_ptr<Cursor> cursor(db_->Query(ReadOptions(), *query));
    ASSERT_OK(cursor->status());
    AssertCursorIDs(cursor.get(), {1, 2, 5});
  }

  // Dropping an unknown index must fail; dropping an existing one succeeds.
  Status s = db_->DropIndex("doesnt-exist");
  ASSERT_TRUE(!s.ok());
  ASSERT_OK(db_->DropIndex("priority"));
}
+
+}  //  namespace rocksdb
+
// Test entry point: runs all registered gtest cases.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
diff --git a/src/rocksdb/utilities/document/json_document.cc b/src/rocksdb/utilities/document/json_document.cc
new file mode 100644
index 0000000..213bc53
--- /dev/null
+++ b/src/rocksdb/utilities/document/json_document.cc
@@ -0,0 +1,610 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/json_document.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
+
+#include <functional>
+#include <limits>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+
+#include "third-party/fbson/FbsonDocument.h"
+#include "third-party/fbson/FbsonJsonParser.h"
+#include "third-party/fbson/FbsonUtil.h"
+#include "util/coding.h"
+
+using std::placeholders::_1;
+
+namespace {
+
+size_t ObjectNumElem(const fbson::ObjectVal& objectVal) {
+  size_t size = 0;
+  for (auto keyValuePair : objectVal) {
+    (void)keyValuePair;
+    ++size;
+  }
+  return size;
+}
+
+template <typename Func>
+void InitJSONDocument(std::unique_ptr<char[]>* data,
+                      fbson::FbsonValue** value,
+                      Func f) {
+  // TODO(stash): maybe add function to FbsonDocument to avoid creating array?
+  fbson::FbsonWriter writer;
+  bool res __attribute__((unused)) = writer.writeStartArray();
+  assert(res);
+  uint32_t bytesWritten __attribute__((unused)) = f(writer);
+  assert(bytesWritten != 0);
+  res = writer.writeEndArray();
+  assert(res);
+  char* buf = new char[writer.getOutput()->getSize()];
+  memcpy(buf, writer.getOutput()->getBuffer(), writer.getOutput()->getSize());
+
+  *value = ((fbson::FbsonDocument *)buf)->getValue();
+  assert((*value)->isArray());
+  assert(((fbson::ArrayVal*)*value)->numElem() == 1);
+  *value = ((fbson::ArrayVal*)*value)->get(0);
+  data->reset(buf);
+}
+
+void InitString(std::unique_ptr<char[]>* data,
+                fbson::FbsonValue** value,
+                const std::string& s) {
+  InitJSONDocument(data, value, std::bind(
+      [](fbson::FbsonWriter& writer, const std::string& str) -> uint32_t {
+        bool res __attribute__((unused)) = writer.writeStartString();
+        assert(res);
+        auto bytesWritten = writer.writeString(str.c_str(),
+                            static_cast<uint32_t>(str.length()));
+        res = writer.writeEndString();
+        assert(res);
+        // If the string is empty, then bytesWritten == 0, and assert in
+        // InitJsonDocument will fail.
+        return bytesWritten + static_cast<uint32_t>(str.empty());
+      },
+  _1, s));
+}
+
+bool IsNumeric(fbson::FbsonValue* value) {
+  return value->isInt8() || value->isInt16() ||
+         value->isInt32() ||  value->isInt64();
+}
+
+int64_t GetInt64ValFromFbsonNumericType(fbson::FbsonValue* value) {
+  switch (value->type()) {
+    case fbson::FbsonType::T_Int8:
+      return reinterpret_cast<fbson::Int8Val*>(value)->val();
+    case fbson::FbsonType::T_Int16:
+      return reinterpret_cast<fbson::Int16Val*>(value)->val();
+    case fbson::FbsonType::T_Int32:
+      return reinterpret_cast<fbson::Int32Val*>(value)->val();
+    case fbson::FbsonType::T_Int64:
+      return reinterpret_cast<fbson::Int64Val*>(value)->val();
+    default:
+      assert(false);
+  }
+  return 0;
+}
+
+bool IsComparable(fbson::FbsonValue* left, fbson::FbsonValue* right) {
+  if (left->type() == right->type()) {
+    return true;
+  }
+  if (IsNumeric(left) && IsNumeric(right)) {
+    return true;
+  }
+  return false;
+}
+
+void CreateArray(std::unique_ptr<char[]>* data, fbson::FbsonValue** value) {
+  fbson::FbsonWriter writer;
+  bool res __attribute__((unused)) = writer.writeStartArray();
+  assert(res);
+  res = writer.writeEndArray();
+  assert(res);
+  data->reset(new char[writer.getOutput()->getSize()]);
+  memcpy(data->get(),
+         writer.getOutput()->getBuffer(),
+         writer.getOutput()->getSize());
+  *value = reinterpret_cast<fbson::FbsonDocument*>(data->get())->getValue();
+}
+
+void CreateObject(std::unique_ptr<char[]>* data, fbson::FbsonValue** value) {
+  fbson::FbsonWriter writer;
+  bool res __attribute__((unused)) = writer.writeStartObject();
+  assert(res);
+  res = writer.writeEndObject();
+  assert(res);
+  data->reset(new char[writer.getOutput()->getSize()]);
+  memcpy(data->get(),
+         writer.getOutput()->getBuffer(),
+         writer.getOutput()->getSize());
+  *value = reinterpret_cast<fbson::FbsonDocument*>(data->get())->getValue();
+}
+
+}  // namespace
+
+namespace rocksdb {
+
+
// TODO(stash): find smth easier
// Scalar constructors: each builds an owning single-value document through
// InitJSONDocument with the matching FbsonWriter call.

// Default-constructs a JSON null document.
JSONDocument::JSONDocument() {
  InitJSONDocument(&data_,
                   &value_,
                   std::bind(&fbson::FbsonWriter::writeNull, _1));
}

// Constructs a boolean document.
JSONDocument::JSONDocument(bool b) {
  InitJSONDocument(&data_,
                   &value_,
                   std::bind(&fbson::FbsonWriter::writeBool, _1, b));
}

// Constructs a double document.
JSONDocument::JSONDocument(double d) {
  InitJSONDocument(&data_,
                   &value_,
                   std::bind(&fbson::FbsonWriter::writeDouble, _1, d));
}

// Constructs an 8-bit integer document.
JSONDocument::JSONDocument(int8_t i) {
  InitJSONDocument(&data_,
                   &value_,
                   std::bind(&fbson::FbsonWriter::writeInt8, _1, i));
}

// Constructs a 16-bit integer document.
JSONDocument::JSONDocument(int16_t i) {
  InitJSONDocument(&data_,
                   &value_,
                   std::bind(&fbson::FbsonWriter::writeInt16, _1, i));
}

// Constructs a 32-bit integer document.
JSONDocument::JSONDocument(int32_t i) {
  InitJSONDocument(&data_,
                   &value_,
                   std::bind(&fbson::FbsonWriter::writeInt32, _1, i));
}

// Constructs a 64-bit integer document.
JSONDocument::JSONDocument(int64_t i) {
  InitJSONDocument(&data_,
                   &value_,
                   std::bind(&fbson::FbsonWriter::writeInt64, _1, i));
}

// Constructs a string document.
JSONDocument::JSONDocument(const std::string& s) {
  InitString(&data_, &value_, s);
}

// Constructs a string document from a C string (delegates to the
// std::string constructor).
JSONDocument::JSONDocument(const char* s) : JSONDocument(std::string(s)) {
}
+
+void JSONDocument::InitFromValue(const fbson::FbsonValue* val) {
+  data_.reset(new char[val->numPackedBytes()]);
+  memcpy(data_.get(), val, val->numPackedBytes());
+  value_ = reinterpret_cast<fbson::FbsonValue*>(data_.get());
+}
+
// Private constructor. With makeCopy == true the document deep-copies val
// and owns the bytes; with makeCopy == false it is a non-owning view:
// data_ stays null (so IsOwner() is false) and value_ aliases memory owned
// by someone else, which must outlive this document.
JSONDocument::JSONDocument(fbson::FbsonValue* val, bool makeCopy) {
  if (makeCopy) {
    InitFromValue(val);
  } else {
    value_ = val;
  }
}
+
// Constructs a default-valued, owning document of the requested type:
// null, {}, false, 0.0, [], 0, or "".
JSONDocument::JSONDocument(Type _type) {
  // TODO(icanadi) make all of this better by using templates
  switch (_type) {
    case kNull:
      InitJSONDocument(&data_, &value_,
                       std::bind(&fbson::FbsonWriter::writeNull, _1));
      break;
    case kObject:
      CreateObject(&data_, &value_);
      break;
    case kBool:
      InitJSONDocument(&data_, &value_,
                       std::bind(&fbson::FbsonWriter::writeBool, _1, false));
      break;
    case kDouble:
      InitJSONDocument(&data_, &value_,
                       std::bind(&fbson::FbsonWriter::writeDouble, _1, 0.));
      break;
    case kArray:
      CreateArray(&data_, &value_);
      break;
    case kInt64:
      InitJSONDocument(&data_, &value_,
                       std::bind(&fbson::FbsonWriter::writeInt64, _1, 0));
      break;
    case kString:
      InitString(&data_, &value_, "");
      break;
    default:
      // No other Type enumerators are expected here.
      assert(false);
  }
}
+
// Copy constructor: deep-copies owning documents; for non-owning views only
// the aliasing pointer is copied, so the copy is a view as well.
JSONDocument::JSONDocument(const JSONDocument& jsonDocument) {
  if (jsonDocument.IsOwner()) {
    InitFromValue(jsonDocument.value_);
  } else {
    value_ = jsonDocument.value_;
  }
}

// Move constructor: steals the buffer via swap (our data_ is still null at
// this point, so the swap is a plain transfer).
// NOTE(review): the source's value_ is left pointing into the buffer now
// owned by *this -- the moved-from object must only be destroyed or
// assigned to, never read.
JSONDocument::JSONDocument(JSONDocument&& jsonDocument) {
  value_ = jsonDocument.value_;
  data_.swap(jsonDocument.data_);
}

// Unified copy/move assignment (copy-and-swap): the by-value parameter was
// already copy- or move-constructed; our previous buffer ends up in the
// temporary and is released when it goes out of scope.
JSONDocument& JSONDocument::operator=(JSONDocument jsonDocument) {
  value_ = jsonDocument.value_;
  data_.swap(jsonDocument.data_);
  return *this;
}
+
// Maps the underlying fbson type tag to the coarser JSONDocument::Type.
// All integer widths (int8..int64) are reported as kInt64, matching
// IsInt64()/GetInt64(). T_Binary is not representable and asserts.
JSONDocument::Type JSONDocument::type() const {
  switch (value_->type()) {
    case fbson::FbsonType::T_Null:
      return JSONDocument::kNull;

    case fbson::FbsonType::T_True:
    case fbson::FbsonType::T_False:
      return JSONDocument::kBool;

    case fbson::FbsonType::T_Int8:
    case fbson::FbsonType::T_Int16:
    case fbson::FbsonType::T_Int32:
    case fbson::FbsonType::T_Int64:
      return JSONDocument::kInt64;

    case fbson::FbsonType::T_Double:
      return JSONDocument::kDouble;

    case fbson::FbsonType::T_String:
      return JSONDocument::kString;

    case fbson::FbsonType::T_Object:
      return JSONDocument::kObject;

    case fbson::FbsonType::T_Array:
      return JSONDocument::kArray;

    case fbson::FbsonType::T_Binary:
      assert(false);  // binary fbson values are not supported here
    default:
      assert(false);
  }
  // Unreachable for supported types; placates compilers under NDEBUG.
  return JSONDocument::kNull;
}
+
+bool JSONDocument::Contains(const std::string& key) const {
+  assert(IsObject());
+  auto objectVal = reinterpret_cast<fbson::ObjectVal*>(value_);
+  return objectVal->find(key.c_str()) != nullptr;
+}
+
+JSONDocument JSONDocument::operator[](const std::string& key) const {
+  assert(IsObject());
+  auto objectVal = reinterpret_cast<fbson::ObjectVal*>(value_);
+  auto foundValue = objectVal->find(key.c_str());
+  assert(foundValue != nullptr);
+  // No need to save paths in const objects
+  JSONDocument ans(foundValue, false);
+  return std::move(ans);
+}
+
+size_t JSONDocument::Count() const {
+  assert(IsObject() || IsArray());
+  if (IsObject()) {
+    // TODO(stash): add to fbson?
+    const fbson::ObjectVal& objectVal =
+          *reinterpret_cast<fbson::ObjectVal*>(value_);
+    return ObjectNumElem(objectVal);
+  } else if (IsArray()) {
+    auto arrayVal = reinterpret_cast<fbson::ArrayVal*>(value_);
+    return arrayVal->numElem();
+  }
+  assert(false);
+  return 0;
+}
+
+JSONDocument JSONDocument::operator[](size_t i) const {
+  assert(IsArray());
+  auto arrayVal = reinterpret_cast<fbson::ArrayVal*>(value_);
+  auto foundValue = arrayVal->get(static_cast<int>(i));
+  JSONDocument ans(foundValue, false);
+  return std::move(ans);
+}
+
// True if the value is JSON null.
bool JSONDocument::IsNull() const {
  return value_->isNull();
}

// True if the value is a JSON array.
bool JSONDocument::IsArray() const {
  return value_->isArray();
}

// True for either boolean; fbson stores true and false as distinct types.
bool JSONDocument::IsBool() const {
  return value_->isTrue() || value_->isFalse();
}

// True if the value is a double.
bool JSONDocument::IsDouble() const {
  return value_->isDouble();
}

// True for any stored integer width; all are surfaced via GetInt64().
bool JSONDocument::IsInt64() const {
  return value_->isInt8() || value_->isInt16() ||
         value_->isInt32() || value_->isInt64();
}

// True if the value is a JSON object.
bool JSONDocument::IsObject() const {
  return value_->isObject();
}

// True if the value is a string.
bool JSONDocument::IsString() const {
  return value_->isString();
}
+
+bool JSONDocument::GetBool() const {
+  assert(IsBool());
+  return value_->isTrue();
+}
+
+double JSONDocument::GetDouble() const {
+  assert(IsDouble());
+  return ((fbson::DoubleVal*)value_)->val();
+}
+
+int64_t JSONDocument::GetInt64() const {
+  assert(IsInt64());
+  return GetInt64ValFromFbsonNumericType(value_);
+}
+
+std::string JSONDocument::GetString() const {
+  assert(IsString());
+  fbson::StringVal* stringVal = (fbson::StringVal*)value_;
+  return std::string(stringVal->getBlob(), stringVal->getBlobLen());
+}
+
+namespace {
+
// Equality (not an ordering, despite the name) of two numeric FbsonValues.
// FbsonValue can be int8, int16, int32, int64; both sides are widened to
// int64 first, so e.g. int8(8) compares equal to int16(8).
bool CompareNumeric(fbson::FbsonValue* left, fbson::FbsonValue* right) {
  assert(IsNumeric(left) && IsNumeric(right));
  return GetInt64ValFromFbsonNumericType(left) ==
         GetInt64ValFromFbsonNumericType(right);
}
+
+bool CompareSimpleTypes(fbson::FbsonValue* left, fbson::FbsonValue* right) {
+  if (IsNumeric(left)) {
+    return CompareNumeric(left, right);
+  }
+  if (left->numPackedBytes() != right->numPackedBytes()) {
+    return false;
+  }
+  return memcmp(left, right, left->numPackedBytes()) == 0;
+}
+
// Recursive deep-equality over two fbson values. Objects compare
// key-by-key (order-insensitive); arrays compare element-by-element
// (order-sensitive). Returns false when the types are not comparable.
bool CompareFbsonValue(fbson::FbsonValue* left, fbson::FbsonValue* right) {
  if (!IsComparable(left, right)) {
    return false;
  }

  switch (left->type()) {
    case fbson::FbsonType::T_True:
    case fbson::FbsonType::T_False:
    case fbson::FbsonType::T_Null:
      // Type alone determines equality for these.
      return true;
    case fbson::FbsonType::T_Int8:
    case fbson::FbsonType::T_Int16:
    case fbson::FbsonType::T_Int32:
    case fbson::FbsonType::T_Int64:
      return CompareNumeric(left, right);
    case fbson::FbsonType::T_String:
    case fbson::FbsonType::T_Double:
      return CompareSimpleTypes(left, right);
    case fbson::FbsonType::T_Object:
    {
      auto leftObject = reinterpret_cast<fbson::ObjectVal*>(left);
      auto rightObject = reinterpret_cast<fbson::ObjectVal*>(right);
      if (ObjectNumElem(*leftObject) != ObjectNumElem(*rightObject)) {
        return false;
      }
      // Equal element counts + every left key found on the right implies
      // the key sets match.
      // NOTE(review): find() is called twice per key below -- a second
      // lookup could be avoided by reusing the first result.
      for (auto && keyValue : *leftObject) {
        std::string str(keyValue.getKeyStr(), keyValue.klen());
        if (rightObject->find(str.c_str()) == nullptr) {
          return false;
        }
        if (!CompareFbsonValue(keyValue.value(),
                               rightObject->find(str.c_str()))) {
          return false;
        }
      }
      return true;
    }
    case fbson::FbsonType::T_Array:
    {
      auto leftArr = reinterpret_cast<fbson::ArrayVal*>(left);
      auto rightArr = reinterpret_cast<fbson::ArrayVal*>(right);
      if (leftArr->numElem() != rightArr->numElem()) {
        return false;
      }
      for (int i = 0; i < static_cast<int>(leftArr->numElem()); ++i) {
        if (!CompareFbsonValue(leftArr->get(i), rightArr->get(i))) {
          return false;
        }
      }
      return true;
    }
    default:
      // T_Binary and unknown tags are unsupported.
      assert(false);
  }
  return false;
}
+
+}  // namespace
+
// Deep structural equality (see CompareFbsonValue above).
bool JSONDocument::operator==(const JSONDocument& rhs) const {
  return CompareFbsonValue(value_, rhs.value_);
}

// Negation of operator==.
bool JSONDocument::operator!=(const JSONDocument& rhs) const {
  return !(*this == rhs);
}

// Returns an owning deep copy, even when *this is a non-owning view.
JSONDocument JSONDocument::Copy() const {
  return JSONDocument(value_, true);
}

// True when this document owns its backing buffer; false for views
// produced by operator[] and Items().
bool JSONDocument::IsOwner() const {
  return data_.get() != nullptr;
}

// Human-readable JSON rendering of the document, for debugging/logging.
std::string JSONDocument::DebugString() const {
  fbson::FbsonToJson fbsonToJson;
  return fbsonToJson.json(value_);
}

// Iterable range over this object's key/value pairs.
// Precondition (asserted): IsObject().
JSONDocument::ItemsIteratorGenerator JSONDocument::Items() const {
  assert(IsObject());
  return ItemsIteratorGenerator(*(static_cast<fbson::ObjectVal*>(value_)));
}
+
+// TODO(icanadi) (perf) allocate objects with arena
+JSONDocument* JSONDocument::ParseJSON(const char* json) {
+  fbson::FbsonJsonParser parser;
+  if (!parser.parse(json)) {
+    return nullptr;
+  }
+
+  auto fbsonVal = fbson::FbsonDocument::createValue(
+                    parser.getWriter().getOutput()->getBuffer(),
+              static_cast<uint32_t>(parser.getWriter().getOutput()->getSize()));
+
+  if (fbsonVal == nullptr) {
+    return nullptr;
+  }
+
+  return new JSONDocument(fbsonVal, true);
+}
+
// Appends the serialized form of this document to *dst:
//   [kSerializationFormatVersion][FBSON_VER][packed fbson bytes]
void JSONDocument::Serialize(std::string* dst) const {
  // first byte is reserved for header
  // currently, header is only version number. that will help us provide
  // backwards compatibility. we might also store more information here if
  // necessary
  dst->push_back(kSerializationFormatVersion);
  dst->push_back(FBSON_VER);
  dst->append(reinterpret_cast<char*>(value_), value_->numPackedBytes());
}

// Current on-disk serialization format version (see Serialize/Deserialize).
const char JSONDocument::kSerializationFormatVersion = 2;
+
+JSONDocument* JSONDocument::Deserialize(const Slice& src) {
+  Slice input(src);
+  if (src.size() == 0) {
+    return nullptr;
+  }
+  char header = input[0];
+  if (header == 1) {
+    assert(false);
+  }
+  input.remove_prefix(1);
+  auto value = fbson::FbsonDocument::createValue(input.data(),
+                static_cast<uint32_t>(input.size()));
+  if (value == nullptr) {
+    return nullptr;
+  }
+
+  return new JSONDocument(value, true);
+}
+
// Thin wrapper around fbson::ObjectVal::const_iterator, forwarded to by
// const_item_iterator (presumably a pimpl to keep fbson types out of the
// public header -- confirm against json_document.h).
class JSONDocument::const_item_iterator::Impl {
 public:
  typedef fbson::ObjectVal::const_iterator It;

  explicit Impl(It it) : it_(it) {}

  // Pointer to the current pair's key bytes; pair with klen() for length.
  const char* getKeyStr() const {
    return it_->getKeyStr();
  }

  // Length of the key returned by getKeyStr().
  uint8_t klen() const {
    return it_->klen();
  }

  // Pre-increment: advances the wrapped fbson iterator.
  It& operator++() {
    return ++it_;
  }

  bool operator!=(const Impl& other) {
    return it_ != other.it_;
  }

  // Value of the current pair; owned by the iterated document.
  fbson::FbsonValue* value() const {
    return it_->value();
  }

 private:
  It it_;
};
+
// Takes ownership of the heap-allocated Impl created by
// ItemsIteratorGenerator::begin()/end().
JSONDocument::const_item_iterator::const_item_iterator(Impl* impl)
: it_(impl) {}

// Move constructor: transfers the Impl to the new iterator.
JSONDocument::const_item_iterator::const_item_iterator(const_item_iterator&& a)
: it_(std::move(a.it_)) {}

// Pre-increment: advances the underlying fbson iterator.
JSONDocument::const_item_iterator&
  JSONDocument::const_item_iterator::operator++() {
  ++(*it_);
  return *this;
}

bool JSONDocument::const_item_iterator::operator!=(
                                  const const_item_iterator& other) {
  return *it_ != *(other.it_);
}

// Defined out-of-line so it_ can destroy Impl with its full definition
// visible (Impl is only declared in the header).
JSONDocument::const_item_iterator::~const_item_iterator() {
}

// Dereference: copies the key into a std::string and wraps the value in a
// NON-owning JSONDocument view, valid only while the parent document lives.
JSONDocument::const_item_iterator::value_type
  JSONDocument::const_item_iterator::operator*() {
  return {std::string(it_->getKeyStr(), it_->klen()),
    JSONDocument(it_->value(), false)};
}
+
// Holds a reference to the object being iterated; the object must outlive
// the generator and every iterator produced from it.
JSONDocument::ItemsIteratorGenerator::ItemsIteratorGenerator(
                                      const fbson::ObjectVal& object)
  : object_(object) {}

// begin()/end() each heap-allocate an Impl; ownership passes to the
// returned const_item_iterator (released in its destructor).
JSONDocument::const_item_iterator
      JSONDocument::ItemsIteratorGenerator::begin() const {
  return const_item_iterator(new const_item_iterator::Impl(object_.begin()));
}

JSONDocument::const_item_iterator
      JSONDocument::ItemsIteratorGenerator::end() const {
  return const_item_iterator(new const_item_iterator::Impl(object_.end()));
}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/document/json_document_builder.cc b/src/rocksdb/utilities/document/json_document_builder.cc
new file mode 100644
index 0000000..0dd4ce4
--- /dev/null
+++ b/src/rocksdb/utilities/document/json_document_builder.cc
@@ -0,0 +1,115 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/json_document.h"
+#include "third-party/fbson/FbsonWriter.h"
+
+namespace rocksdb {
// Builds with a writer backed by the writer's default internal stream.
JSONDocumentBuilder::JSONDocumentBuilder()
: writer_(new fbson::FbsonWriter()) {
}

// Builds with a writer that appends to the caller-supplied stream, which
// must outlive this builder.
JSONDocumentBuilder::JSONDocumentBuilder(fbson::FbsonOutStream* out)
: writer_(new fbson::FbsonWriter(*out)) {
}

// Discards everything written so far so the builder can be reused.
void JSONDocumentBuilder::Reset() {
  writer_->reset();
}

// The four Write{Start,End}{Array,Object} methods below are thin forwards
// to the fbson writer; each returns false if the write is rejected.
bool JSONDocumentBuilder::WriteStartArray() {
  return writer_->writeStartArray();
}

bool JSONDocumentBuilder::WriteEndArray() {
  return writer_->writeEndArray();
}

bool JSONDocumentBuilder::WriteStartObject() {
  return writer_->writeStartObject();
}

bool JSONDocumentBuilder::WriteEndObject() {
  return writer_->writeEndObject();
}
+
+bool JSONDocumentBuilder::WriteKeyValue(const std::string& key,
+                                        const JSONDocument& value) {
+  size_t bytesWritten = writer_->writeKey(key.c_str(), key.size());
+  if (bytesWritten == 0) {
+    return false;
+  }
+  return WriteJSONDocument(value);
+}
+
+bool JSONDocumentBuilder::WriteJSONDocument(const JSONDocument& value) {
+  switch (value.type()) {
+    case JSONDocument::kNull:
+      return writer_->writeNull() != 0;
+    case JSONDocument::kInt64:
+      return writer_->writeInt64(value.GetInt64());
+    case JSONDocument::kDouble:
+      return writer_->writeDouble(value.GetDouble());
+    case JSONDocument::kBool:
+      return writer_->writeBool(value.GetBool());
+    case JSONDocument::kString:
+    {
+      bool res = writer_->writeStartString();
+      if (!res) {
+        return false;
+      }
+      const std::string& str = value.GetString();
+      res = writer_->writeString(str.c_str(),
+                  static_cast<uint32_t>(str.size()));
+      if (!res) {
+        return false;
+      }
+      return writer_->writeEndString();
+    }
+    case JSONDocument::kArray:
+    {
+      bool res = WriteStartArray();
+      if (!res) {
+        return false;
+      }
+      for (size_t i = 0; i < value.Count(); ++i) {
+        res = WriteJSONDocument(value[i]);
+        if (!res) {
+          return false;
+        }
+      }
+      return WriteEndArray();
+    }
+    case JSONDocument::kObject:
+    {
+      bool res = WriteStartObject();
+      if (!res) {
+        return false;
+      }
+      for (auto keyValue : value.Items()) {
+        WriteKeyValue(keyValue.first, keyValue.second);
+      }
+      return WriteEndObject();
+    }
+    default:
+      assert(false);
+  }
+  return false;
+}
+
// Finalizes the writer's output into an owning JSONDocument.
// NOTE(review): createValue() can return nullptr for malformed output
// (e.g. an unbalanced write sequence), and that nullptr is handed straight
// to the copying JSONDocument constructor -- confirm callers only invoke
// this after a balanced, fully-successful write sequence.
JSONDocument JSONDocumentBuilder::GetJSONDocument() {
  fbson::FbsonValue* value =
      fbson::FbsonDocument::createValue(writer_->getOutput()->getBuffer(),
                       static_cast<uint32_t>(writer_->getOutput()->getSize()));
  return JSONDocument(value, true);
}

// Out-of-line destructor: lets the smart-pointer member destroy
// fbson::FbsonWriter with its complete type visible in this TU.
JSONDocumentBuilder::~JSONDocumentBuilder() {
}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/document/json_document_test.cc b/src/rocksdb/utilities/document/json_document_test.cc
new file mode 100644
index 0000000..d15cd0c
--- /dev/null
+++ b/src/rocksdb/utilities/document/json_document_test.cc
@@ -0,0 +1,329 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "rocksdb/utilities/json_document.h"
+
+#include "util/testutil.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace {
// Asserts json[field] exists and is null.
void AssertField(const JSONDocument& json, const std::string& field) {
  ASSERT_TRUE(json.Contains(field));
  ASSERT_TRUE(json[field].IsNull());
}

// Asserts json[field] exists and holds the expected string.
void AssertField(const JSONDocument& json, const std::string& field,
                 const std::string& expected) {
  ASSERT_TRUE(json.Contains(field));
  ASSERT_TRUE(json[field].IsString());
  ASSERT_EQ(expected, json[field].GetString());
}

// Asserts json[field] exists and holds the expected integer.
void AssertField(const JSONDocument& json, const std::string& field,
                 int64_t expected) {
  ASSERT_TRUE(json.Contains(field));
  ASSERT_TRUE(json[field].IsInt64());
  ASSERT_EQ(expected, json[field].GetInt64());
}

// Asserts json[field] exists and holds the expected boolean.
void AssertField(const JSONDocument& json, const std::string& field,
                 bool expected) {
  ASSERT_TRUE(json.Contains(field));
  ASSERT_TRUE(json[field].IsBool());
  ASSERT_EQ(expected, json[field].GetBool());
}

// Asserts json[field] exists and holds the expected double (exact
// comparison; the expectations below use exactly-representable values).
void AssertField(const JSONDocument& json, const std::string& field,
                 double expected) {
  ASSERT_TRUE(json.Contains(field));
  ASSERT_TRUE(json[field].IsDouble());
  ASSERT_EQ(expected, json[field].GetDouble());
}
+}  // namespace
+
// Fixture providing a sample JSON document (kSampleJSON), a variant that
// differs in one nested value (kSampleJSONDifferent), and a checker that
// verifies every field of the sample.
class JSONDocumentTest : public testing::Test {
 public:
  JSONDocumentTest()
  : rnd_(101)
  {}

  // Asserts that `json` matches kSampleJSON field-by-field, including
  // nested objects/arrays and key iteration over "properties".
  void AssertSampleJSON(const JSONDocument& json) {
    AssertField(json, "title", std::string("json"));
    AssertField(json, "type", std::string("object"));
    // properties
    ASSERT_TRUE(json.Contains("properties"));
    ASSERT_TRUE(json["properties"].Contains("flags"));
    ASSERT_TRUE(json["properties"]["flags"].IsArray());
    ASSERT_EQ(3u, json["properties"]["flags"].Count());
    ASSERT_TRUE(json["properties"]["flags"][0].IsInt64());
    ASSERT_EQ(10, json["properties"]["flags"][0].GetInt64());
    ASSERT_TRUE(json["properties"]["flags"][1].IsString());
    ASSERT_EQ("parse", json["properties"]["flags"][1].GetString());
    ASSERT_TRUE(json["properties"]["flags"][2].IsObject());
    AssertField(json["properties"]["flags"][2], "tag", std::string("no"));
    AssertField(json["properties"]["flags"][2], std::string("status"));
    AssertField(json["properties"], "age", 110.5e-4);
    AssertField(json["properties"], "depth", static_cast<int64_t>(-10));
    // test iteration
    std::set<std::string> expected({"flags", "age", "depth"});
    for (auto item : json["properties"].Items()) {
      auto iter = expected.find(item.first);
      ASSERT_TRUE(iter != expected.end());
      expected.erase(iter);
    }
    ASSERT_EQ(0U, expected.size());
    ASSERT_TRUE(json.Contains("latlong"));
    ASSERT_TRUE(json["latlong"].IsArray());
    ASSERT_EQ(2u, json["latlong"].Count());
    ASSERT_TRUE(json["latlong"][0].IsDouble());
    ASSERT_EQ(53.25, json["latlong"][0].GetDouble());
    ASSERT_TRUE(json["latlong"][1].IsDouble());
    ASSERT_EQ(43.75, json["latlong"][1].GetDouble());
    AssertField(json, "enabled", true);
  }

  const std::string kSampleJSON =
      "{ \"title\" : \"json\", \"type\" : \"object\", \"properties\" : { "
      "\"flags\": [10, \"parse\", {\"tag\": \"no\", \"status\": null}], "
      "\"age\": 110.5e-4, \"depth\": -10 }, \"latlong\": [53.25, 43.75], "
      "\"enabled\": true }";

  // Same as kSampleJSON except "status" is 2 instead of null.
  const std::string kSampleJSONDifferent =
      "{ \"title\" : \"json\", \"type\" : \"object\", \"properties\" : { "
      "\"flags\": [10, \"parse\", {\"tag\": \"no\", \"status\": 2}], "
      "\"age\": 110.5e-4, \"depth\": -10 }, \"latlong\": [53.25, 43.75], "
      "\"enabled\": true }";

  Random rnd_;
};
+
// Default construction yields an owning null document.
TEST_F(JSONDocumentTest, MakeNullTest) {
  JSONDocument x;
  ASSERT_TRUE(x.IsNull());
  ASSERT_TRUE(x.IsOwner());
  ASSERT_TRUE(!x.IsBool());
}

// Boolean construction round-trips both values.
TEST_F(JSONDocumentTest, MakeBoolTest) {
  {
    JSONDocument x(true);
    ASSERT_TRUE(x.IsOwner());
    ASSERT_TRUE(x.IsBool());
    ASSERT_TRUE(!x.IsInt64());
    ASSERT_EQ(x.GetBool(), true);
  }

  {
    JSONDocument x(false);
    ASSERT_TRUE(x.IsOwner());
    ASSERT_TRUE(x.IsBool());
    ASSERT_TRUE(!x.IsInt64());
    ASSERT_EQ(x.GetBool(), false);
  }
}

// int64 construction round-trips the value.
TEST_F(JSONDocumentTest, MakeInt64Test) {
  JSONDocument x(static_cast<int64_t>(16));
  ASSERT_TRUE(x.IsInt64());
  ASSERT_TRUE(x.IsInt64());
  ASSERT_TRUE(!x.IsBool());
  ASSERT_TRUE(x.IsOwner());
  ASSERT_EQ(x.GetInt64(), 16);
}

// C-string construction round-trips through GetString().
TEST_F(JSONDocumentTest, MakeStringTest) {
  JSONDocument x("string");
  ASSERT_TRUE(x.IsOwner());
  ASSERT_TRUE(x.IsString());
  ASSERT_TRUE(!x.IsBool());
  ASSERT_EQ(x.GetString(), "string");
}

// Double construction round-trips the value.
TEST_F(JSONDocumentTest, MakeDoubleTest) {
  JSONDocument x(5.6);
  ASSERT_TRUE(x.IsOwner());
  ASSERT_TRUE(x.IsDouble());
  ASSERT_TRUE(!x.IsBool());
  ASSERT_EQ(x.GetDouble(), 5.6);
}

// The Type-tag constructor produces a document of the matching kind for
// every enumerator.
TEST_F(JSONDocumentTest, MakeByTypeTest) {
  {
    JSONDocument x(JSONDocument::kNull);
    ASSERT_TRUE(x.IsNull());
  }
  {
    JSONDocument x(JSONDocument::kBool);
    ASSERT_TRUE(x.IsBool());
  }
  {
    JSONDocument x(JSONDocument::kString);
    ASSERT_TRUE(x.IsString());
  }
  {
    JSONDocument x(JSONDocument::kInt64);
    ASSERT_TRUE(x.IsInt64());
  }
  {
    JSONDocument x(JSONDocument::kDouble);
    ASSERT_TRUE(x.IsDouble());
  }
  {
    JSONDocument x(JSONDocument::kObject);
    ASSERT_TRUE(x.IsObject());
  }
  {
    JSONDocument x(JSONDocument::kArray);
    ASSERT_TRUE(x.IsArray());
  }
}
+
+TEST_F(JSONDocumentTest, Parsing) {
+  std::unique_ptr<JSONDocument> parsed_json(
+          JSONDocument::ParseJSON(kSampleJSON.c_str()));
+  ASSERT_TRUE(parsed_json->IsOwner());
+  ASSERT_TRUE(parsed_json != nullptr);
+  AssertSampleJSON(*parsed_json);
+
+  // test deep copying
+  JSONDocument copied_json_document(*parsed_json);
+  AssertSampleJSON(copied_json_document);
+  ASSERT_TRUE(copied_json_document == *parsed_json);
+
+  std::unique_ptr<JSONDocument> parsed_different_sample(
+      JSONDocument::ParseJSON(kSampleJSONDifferent.c_str()));
+  ASSERT_TRUE(parsed_different_sample != nullptr);
+  ASSERT_TRUE(!(*parsed_different_sample == copied_json_document));
+
+  // parse error
+  const std::string kFaultyJSON =
+      kSampleJSON.substr(0, kSampleJSON.size() - 10);
+  ASSERT_TRUE(JSONDocument::ParseJSON(kFaultyJSON.c_str()) == nullptr);
+}
+
// Serialize/Deserialize round-trip of the sample document, plus a
// truncated-input failure case.
TEST_F(JSONDocumentTest, Serialization) {
  std::unique_ptr<JSONDocument> parsed_json(
            JSONDocument::ParseJSON(kSampleJSON.c_str()));
  ASSERT_TRUE(parsed_json != nullptr);
  ASSERT_TRUE(parsed_json->IsOwner());
  std::string serialized;
  parsed_json->Serialize(&serialized);

  std::unique_ptr<JSONDocument> deserialized_json(
            JSONDocument::Deserialize(Slice(serialized)));
  ASSERT_TRUE(deserialized_json != nullptr);
  AssertSampleJSON(*deserialized_json);

  // deserialization failure
  ASSERT_TRUE(JSONDocument::Deserialize(
                  Slice(serialized.data(), serialized.size() - 10)) == nullptr);
}

// Equality semantics per type, including that different stored integer
// widths (int8/int16/int32 from parsing) compare equal to int64 documents.
TEST_F(JSONDocumentTest, OperatorEqualsTest) {
  // kNull
  ASSERT_TRUE(JSONDocument() == JSONDocument());

  // kBool
  ASSERT_TRUE(JSONDocument(false) != JSONDocument());
  ASSERT_TRUE(JSONDocument(false) == JSONDocument(false));
  ASSERT_TRUE(JSONDocument(true) == JSONDocument(true));
  ASSERT_TRUE(JSONDocument(false) != JSONDocument(true));

  // kString
  ASSERT_TRUE(JSONDocument("test") != JSONDocument());
  ASSERT_TRUE(JSONDocument("test") == JSONDocument("test"));

  // kInt64
  ASSERT_TRUE(JSONDocument(static_cast<int64_t>(15)) != JSONDocument());
  ASSERT_TRUE(JSONDocument(static_cast<int64_t>(15)) !=
              JSONDocument(static_cast<int64_t>(14)));
  ASSERT_TRUE(JSONDocument(static_cast<int64_t>(15)) ==
              JSONDocument(static_cast<int64_t>(15)));

  // 8 fits in int8 when parsed; must still equal an int64 document.
  unique_ptr<JSONDocument> arrayWithInt8Doc(JSONDocument::ParseJSON("[8]"));
  ASSERT_TRUE(arrayWithInt8Doc != nullptr);
  ASSERT_TRUE(arrayWithInt8Doc->IsArray());
  ASSERT_TRUE((*arrayWithInt8Doc)[0].IsInt64());
  ASSERT_TRUE((*arrayWithInt8Doc)[0] == JSONDocument(static_cast<int64_t>(8)));

  // 512 requires int16.
  unique_ptr<JSONDocument> arrayWithInt16Doc(JSONDocument::ParseJSON("[512]"));
  ASSERT_TRUE(arrayWithInt16Doc != nullptr);
  ASSERT_TRUE(arrayWithInt16Doc->IsArray());
  ASSERT_TRUE((*arrayWithInt16Doc)[0].IsInt64());
  ASSERT_TRUE((*arrayWithInt16Doc)[0] ==
              JSONDocument(static_cast<int64_t>(512)));

  // 1000000 requires int32.
  unique_ptr<JSONDocument> arrayWithInt32Doc(
    JSONDocument::ParseJSON("[1000000]"));
  ASSERT_TRUE(arrayWithInt32Doc != nullptr);
  ASSERT_TRUE(arrayWithInt32Doc->IsArray());
  ASSERT_TRUE((*arrayWithInt32Doc)[0].IsInt64());
  ASSERT_TRUE((*arrayWithInt32Doc)[0] ==
               JSONDocument(static_cast<int64_t>(1000000)));

  // kDouble
  ASSERT_TRUE(JSONDocument(15.) != JSONDocument());
  ASSERT_TRUE(JSONDocument(15.) != JSONDocument(14.));
  ASSERT_TRUE(JSONDocument(15.) == JSONDocument(15.));
}

// Building the same nested structure through JSONDocumentBuilder yields a
// document equal to the parsed one.
TEST_F(JSONDocumentTest, JSONDocumentBuilderTest) {
  unique_ptr<JSONDocument> parsedArray(
    JSONDocument::ParseJSON("[1, [123, \"a\", \"b\"], {\"b\":\"c\"}]"));
  ASSERT_TRUE(parsedArray != nullptr);

  JSONDocumentBuilder builder;
  ASSERT_TRUE(builder.WriteStartArray());
  ASSERT_TRUE(builder.WriteJSONDocument(1));

  ASSERT_TRUE(builder.WriteStartArray());
    ASSERT_TRUE(builder.WriteJSONDocument(123));
    ASSERT_TRUE(builder.WriteJSONDocument("a"));
    ASSERT_TRUE(builder.WriteJSONDocument("b"));
  ASSERT_TRUE(builder.WriteEndArray());

  ASSERT_TRUE(builder.WriteStartObject());
    ASSERT_TRUE(builder.WriteKeyValue("b", "c"));
  ASSERT_TRUE(builder.WriteEndObject());

  ASSERT_TRUE(builder.WriteEndArray());

  ASSERT_TRUE(*parsedArray == builder.GetJSONDocument());
}

// Copy/move constructors preserve owner vs. non-owner (view) status.
TEST_F(JSONDocumentTest, OwnershipTest) {
  std::unique_ptr<JSONDocument> parsed(
          JSONDocument::ParseJSON(kSampleJSON.c_str()));
  ASSERT_TRUE(parsed != nullptr);
  ASSERT_TRUE(parsed->IsOwner());

  // Copy constructor from owner -> owner
  JSONDocument copy_constructor(*parsed);
  ASSERT_TRUE(copy_constructor.IsOwner());

  // Copy constructor from non-owner -> non-owner
  JSONDocument non_owner((*parsed)["properties"]);
  ASSERT_TRUE(!non_owner.IsOwner());

  // Move constructor from owner -> owner
  JSONDocument moved_from_owner(std::move(copy_constructor));
  ASSERT_TRUE(moved_from_owner.IsOwner());

  // Move constructor from non-owner -> non-owner
  JSONDocument moved_from_non_owner(std::move(non_owner));
  ASSERT_TRUE(!moved_from_non_owner.IsOwner());
}
+
+}  //  namespace rocksdb
+
// Standard Google Test entry point.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
diff --git a/src/rocksdb/utilities/flashcache/flashcache.cc b/src/rocksdb/utilities/flashcache/flashcache.cc
new file mode 100644
index 0000000..a1a0352
--- /dev/null
+++ b/src/rocksdb/utilities/flashcache/flashcache.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/utilities/flashcache.h"
+
+#include "utilities/flashcache/flashcache.h"
+
+#ifdef OS_LINUX
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "third-party/flashcache/flashcache_ioctl.h"
+#endif
+
+namespace rocksdb {
+
+#if !defined(ROCKSDB_LITE) && defined(OS_LINUX)
+// Most of the code that handles flashcache is copied from websql's branch of
+// mysql-5.6
// Env wrapper that whitelists the current process with the flashcache
// kernel module (via ioctls on the cache-device fd) and blacklists the
// worker threads it schedules, so their I/O bypasses the cache.
class FlashcacheAwareEnv : public EnvWrapper {
 public:
  FlashcacheAwareEnv(Env* base, int cachedev_fd)
      : EnvWrapper(base), cachedev_fd_(cachedev_fd) {
    pid_t pid = getpid();
    /* cleanup previous whitelistings */
    if (ioctl(cachedev_fd_, FLASHCACHEDELALLWHITELIST, &pid) < 0) {
      // Mark the fd invalid so the destructor and scheduled wrappers skip
      // further ioctls.
      cachedev_fd_ = -1;
      fprintf(stderr, "ioctl del-all-whitelist for flashcache failed\n");
      return;
    }
    if (ioctl(cachedev_fd_, FLASHCACHEADDWHITELIST, &pid) < 0) {
      fprintf(stderr, "ioctl add-whitelist for flashcache failed\n");
    }
  }

  ~FlashcacheAwareEnv() {
    // cachedev_fd_ is -1 if it's uninitialized
    if (cachedev_fd_ != -1) {
      pid_t pid = getpid();
      if (ioctl(cachedev_fd_, FLASHCACHEDELWHITELIST, &pid) < 0) {
        fprintf(stderr, "ioctl del-whitelist for flashcache failed\n");
      }
    }
  }

  // Excludes the calling thread (kernel tid) from caching. Returns the
  // ioctl's result (< 0 on failure).
  static int BlacklistCurrentThread(int cachedev_fd) {
    pid_t pid = static_cast<pid_t>(syscall(SYS_gettid));
    return ioctl(cachedev_fd, FLASHCACHEADDNCPID, &pid);
  }

  // Re-includes the calling thread in caching.
  static int WhitelistCurrentThread(int cachedev_fd) {
    pid_t pid = static_cast<pid_t>(syscall(SYS_gettid));
    return ioctl(cachedev_fd, FLASHCACHEDELNCPID, &pid);
  }

  // -1 when construction failed (see constructor).
  int GetFlashCacheFileDescriptor() { return cachedev_fd_; }

  // Bundles a scheduled callback with the cache-device fd so
  // BgThreadWrapper can blacklist the worker thread around the call.
  struct Arg {
    Arg(void (*f)(void* arg), void* a, int _cachedev_fd)
        : original_function_(f), original_arg_(a), cachedev_fd(_cachedev_fd) {}

    void (*original_function_)(void* arg);
    void* original_arg_;
    int cachedev_fd;
  };

  // Runs on the background thread: blacklist, invoke the real callback,
  // whitelist again, then free the Arg allocated by Schedule().
  static void BgThreadWrapper(void* a) {
    Arg* arg = reinterpret_cast<Arg*>(a);
    if (arg->cachedev_fd != -1) {
      if (BlacklistCurrentThread(arg->cachedev_fd) < 0) {
        fprintf(stderr, "ioctl add-nc-pid for flashcache failed\n");
      }
    }
    arg->original_function_(arg->original_arg_);
    if (arg->cachedev_fd != -1) {
      if (WhitelistCurrentThread(arg->cachedev_fd) < 0) {
        fprintf(stderr, "ioctl del-nc-pid for flashcache failed\n");
      }
    }
    delete arg;
  }

  // Deliberately a no-op: always reports that nothing was unscheduled.
  int UnSchedule(void* arg, Priority pri) override {
    // no unschedule for you
    return 0;
  }

  // Wraps every scheduled callback in BgThreadWrapper (see above).
  // NOTE(review): if a wrapped callback is never run, the Arg allocated
  // here leaks -- confirm against EnvWrapper::Schedule's guarantees.
  void Schedule(void (*f)(void* arg), void* a, Priority pri,
                void* tag = nullptr) override {
    EnvWrapper::Schedule(&BgThreadWrapper, new Arg(f, a, cachedev_fd_), pri,
                         tag);
  }

 private:
  int cachedev_fd_;
};
+
+std::unique_ptr<Env> NewFlashcacheAwareEnv(Env* base,
+                                           const int cachedev_fd) {
+  std::unique_ptr<Env> ret(new FlashcacheAwareEnv(base, cachedev_fd));
+  return std::move(ret);
+}
+
+int FlashcacheBlacklistCurrentThread(Env* flashcache_aware_env) {
+  int fd = dynamic_cast<FlashcacheAwareEnv*>(flashcache_aware_env)
+               ->GetFlashCacheFileDescriptor();
+  if (fd == -1) {
+    return -1;
+  }
+  return FlashcacheAwareEnv::BlacklistCurrentThread(fd);
+}
+int FlashcacheWhitelistCurrentThread(Env* flashcache_aware_env) {
+  int fd = dynamic_cast<FlashcacheAwareEnv*>(flashcache_aware_env)
+               ->GetFlashCacheFileDescriptor();
+  if (fd == -1) {
+    return -1;
+  }
+  return FlashcacheAwareEnv::WhitelistCurrentThread(fd);
+}
+
+#else   // !defined(ROCKSDB_LITE) && defined(OS_LINUX)
// Stubs for builds without flashcache support (non-Linux or ROCKSDB_LITE):
// env creation yields nullptr and the thread calls report failure (-1).
std::unique_ptr<Env> NewFlashcacheAwareEnv(Env* base,
                                           const int cachedev_fd) {
  return nullptr;
}
int FlashcacheBlacklistCurrentThread(Env* flashcache_aware_env) { return -1; }
int FlashcacheWhitelistCurrentThread(Env* flashcache_aware_env) { return -1; }
+
+#endif  // !defined(ROCKSDB_LITE) && defined(OS_LINUX)
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/utilities/flashcache/flashcache.h b/src/rocksdb/utilities/flashcache/flashcache.h
new file mode 100644
index 0000000..a8a3d7d
--- /dev/null
+++ b/src/rocksdb/utilities/flashcache/flashcache.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+// This is internal API that will make hacking on flashcache easier. Not sure if
+// we need to expose this to public users, probably not
+extern int FlashcacheBlacklistCurrentThread(Env* flashcache_aware_env);
+extern int FlashcacheWhitelistCurrentThread(Env* flashcache_aware_env);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/utilities/geodb/geodb_impl.cc b/src/rocksdb/utilities/geodb/geodb_impl.cc
index 065e5ca..6f285fb 100644
--- a/src/rocksdb/utilities/geodb/geodb_impl.cc
+++ b/src/rocksdb/utilities/geodb/geodb_impl.cc
@@ -7,7 +7,9 @@
 
 #include "utilities/geodb/geodb_impl.h"
 
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
 
 #include <vector>
 #include <map>
@@ -15,6 +17,7 @@
 #include <limits>
 #include "db/filename.h"
 #include "util/coding.h"
+#include "util/string_util.h"
 
 //
 // There are two types of keys. The first type of key-values
@@ -81,7 +84,7 @@ Status GeoDBImpl::GetByPosition(const GeoPosition& pos,
 
 Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) {
   Status status;
-  Slice quadkey;
+  std::string quadkey;
 
   // create an iterator so that we can get a consistent picture
   // of the database.
@@ -94,7 +97,7 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) {
   iter->Seek(key2);
   if (iter->Valid() && iter->status().ok()) {
     if (iter->key().compare(key2) == 0) {
-      quadkey = iter->value();
+      quadkey = iter->value().ToString();
     }
   }
   if (quadkey.size() == 0) {
@@ -105,7 +108,7 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) {
   //
   // Seek to the quadkey + id prefix
   //
-  std::string prefix = MakeKey1Prefix(quadkey.ToString(), id);
+  std::string prefix = MakeKey1Prefix(quadkey, id);
   iter->Seek(Slice(prefix));
   assert(iter->Valid());
   if (!iter->Valid() || !iter->status().ok()) {
@@ -114,9 +117,8 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) {
   }
 
   // split the key into p + quadkey + id + lat + lon
-  std::vector<std::string> parts;
   Slice key = iter->key();
-  StringSplit(&parts, key.ToString(), ':');
+  std::vector<std::string> parts = StringSplit(key.ToString(), ':');
   assert(parts.size() == 5);
   assert(parts[0] == "p");
   assert(parts[1] == quadkey);
@@ -178,9 +180,8 @@ Status GeoDBImpl::SearchRadial(const GeoPosition& pos,
          number_of_values > 0 && iter->Valid() && iter->status().ok();
          iter->Next()) {
       // split the key into p + quadkey + id + lat + lon
-      std::vector<std::string> parts;
       Slice key = iter->key();
-      StringSplit(&parts, key.ToString(), ':');
+      std::vector<std::string> parts = StringSplit(key.ToString(), ':');
       assert(parts.size() == 5);
       assert(parts[0] == "p");
       std::string* quadkey = &parts[1];
@@ -190,8 +191,8 @@ Status GeoDBImpl::SearchRadial(const GeoPosition& pos,
       // we are looking for.
       auto res = std::mismatch(qid.begin(), qid.end(), quadkey->begin());
       if (res.first == qid.end()) {
-        GeoPosition pos(atof(parts[3].c_str()), atof(parts[4].c_str()));
-        GeoObject obj(pos, parts[4], iter->value().ToString());
+        GeoPosition obj_pos(atof(parts[3].c_str()), atof(parts[4].c_str()));
+        GeoObject obj(obj_pos, parts[4], iter->value().ToString());
         values->push_back(obj);
         number_of_values--;
       } else {
@@ -205,8 +206,8 @@ Status GeoDBImpl::SearchRadial(const GeoPosition& pos,
 
 std::string GeoDBImpl::MakeKey1(const GeoPosition& pos, Slice id,
                                 std::string quadkey) {
-  std::string lat = std::to_string(pos.latitude);
-  std::string lon = std::to_string(pos.longitude);
+  std::string lat = rocksdb::ToString(pos.latitude);
+  std::string lon = rocksdb::ToString(pos.longitude);
   std::string key = "p:";
   key.reserve(5 + quadkey.size() + id.size() + lat.size() + lon.size());
   key.append(quadkey);
@@ -241,16 +242,6 @@ std::string GeoDBImpl::MakeQuadKeyPrefix(std::string quadkey) {
   return key;
 }
 
-void GeoDBImpl::StringSplit(std::vector<std::string>* tokens,
-                            const std::string &text, char sep) {
-  std::size_t start = 0, end = 0;
-  while ((end = text.find(sep, start)) != std::string::npos) {
-    tokens->push_back(text.substr(start, end - start));
-    start = end + 1;
-  }
-  tokens->push_back(text.substr(start));
-}
-
 // convert degrees to radians
 double GeoDBImpl::radians(double x) {
   return (x * PI) / 180;
@@ -307,7 +298,7 @@ Status GeoDBImpl::searchQuadIds(const GeoPosition& position,
 
   // how many level of details to look for
   int numberOfTilesAtMaxDepth = floor((bottomRight.x - topLeft.x) / 256);
-  int zoomLevelsToRise = floor(log(numberOfTilesAtMaxDepth) / log(2));
+  int zoomLevelsToRise = floor(::log(numberOfTilesAtMaxDepth) / ::log(2));
   zoomLevelsToRise++;
   int levels = std::max(0, Detail - zoomLevelsToRise);
 
@@ -344,7 +335,7 @@ GeoDBImpl::Pixel GeoDBImpl::PositionToPixel(const GeoPosition& pos,
   double latitude = clip(pos.latitude, MinLatitude, MaxLatitude);
   double x = (pos.longitude + 180) / 360;
   double sinLatitude = sin(latitude * PI / 180);
-  double y = 0.5 - log((1 + sinLatitude) / (1 - sinLatitude)) / (4 * PI);
+  double y = 0.5 - ::log((1 + sinLatitude) / (1 - sinLatitude)) / (4 * PI);
   double mapSize = MapSize(levelOfDetail);
   double X = floor(clip(x * mapSize + 0.5, 0, mapSize - 1));
   double Y = floor(clip(y * mapSize + 0.5, 0, mapSize - 1));
@@ -395,10 +386,10 @@ std::string GeoDBImpl::TileToQuadKey(const Tile& tile, int levelOfDetail) {
 // Convert a quadkey to a tile and its level of detail
 //
 void GeoDBImpl::QuadKeyToTile(std::string quadkey, Tile* tile,
-                                     int *levelOfDetail) {
+                              int* levelOfDetail) {
   tile->x = tile->y = 0;
-  *levelOfDetail = quadkey.size();
-  const char* key = reinterpret_cast<const char *>(quadkey.c_str());
+  *levelOfDetail = static_cast<int>(quadkey.size());
+  const char* key = reinterpret_cast<const char*>(quadkey.c_str());
   for (int i = *levelOfDetail; i > 0; i--) {
     int mask = 1 << (i - 1);
     switch (key[*levelOfDetail - i]) {
diff --git a/src/rocksdb/utilities/geodb/geodb_impl.h b/src/rocksdb/utilities/geodb/geodb_impl.h
index 4ee42ad..35b7a85 100644
--- a/src/rocksdb/utilities/geodb/geodb_impl.h
+++ b/src/rocksdb/utilities/geodb/geodb_impl.h
@@ -14,8 +14,8 @@
 #include <stdexcept>
 #include <vector>
 
-#include "utilities/geo_db.h"
-#include "utilities/stackable_db.h"
+#include "rocksdb/utilities/geo_db.h"
+#include "rocksdb/utilities/stackable_db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/status.h"
 
@@ -30,27 +30,25 @@ class GeoDBImpl : public GeoDB {
 
   // Associate the GPS location with the identified by 'id'. The value
   // is a blob that is associated with this object.
-  virtual Status Insert(const GeoObject& object);
+  virtual Status Insert(const GeoObject& object) override;
 
   // Retrieve the value of the object located at the specified GPS
   // location and is identified by the 'id'.
-  virtual Status GetByPosition(const GeoPosition& pos,
-                               const Slice& id,
-                               std::string* value);
+  virtual Status GetByPosition(const GeoPosition& pos, const Slice& id,
+                               std::string* value) override;
 
   // Retrieve the value of the object identified by the 'id'. This method
   // could be potentially slower than GetByPosition
-  virtual Status GetById(const Slice& id, GeoObject* object);
+  virtual Status GetById(const Slice& id, GeoObject* object) override;
 
   // Delete the specified object
-  virtual Status Remove(const Slice& id);
+  virtual Status Remove(const Slice& id) override;
 
   // Returns a list of all items within a circular radius from the
   // specified gps location
-  virtual Status SearchRadial(const GeoPosition& pos,
-                              double radius,
+  virtual Status SearchRadial(const GeoPosition& pos, double radius,
                               std::vector<GeoObject>* values,
-                              int number_of_values);
+                              int number_of_values) override;
 
  private:
   DB* db_;
@@ -169,11 +167,6 @@ class GeoDBImpl : public GeoDB {
                        double radius,
                        std::vector<std::string>* quadKeys);
 
-  // splits a string into its components
-  static void StringSplit(std::vector<std::string>* tokens,
-                          const std::string &text,
-                          char sep);
-
   //
   // Create keys for accessing rocksdb table(s)
   //
diff --git a/src/rocksdb/utilities/geodb/geodb_test.cc b/src/rocksdb/utilities/geodb/geodb_test.cc
index 1a42e32..93fa1e1 100644
--- a/src/rocksdb/utilities/geodb/geodb_test.cc
+++ b/src/rocksdb/utilities/geodb/geodb_test.cc
@@ -11,7 +11,7 @@
 
 namespace rocksdb {
 
-class GeoDBTest {
+class GeoDBTest : public testing::Test {
  public:
   static const std::string kDefaultDbName;
   static Options options;
@@ -20,7 +20,7 @@ class GeoDBTest {
 
   GeoDBTest() {
     GeoDBOptions geodb_options;
-    ASSERT_OK(DestroyDB(kDefaultDbName, options));
+    EXPECT_OK(DestroyDB(kDefaultDbName, options));
     options.create_if_missing = true;
     Status status = DB::Open(options, kDefaultDbName, &db);
     geodb =  new GeoDBImpl(db, geodb_options);
@@ -39,7 +39,7 @@ const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault";
 Options GeoDBTest::options = Options();
 
 // Insert, Get and Remove
-TEST(GeoDBTest, SimpleTest) {
+TEST_F(GeoDBTest, SimpleTest) {
   GeoPosition pos1(100, 101);
   std::string id1("id1");
   std::string value1("value1");
@@ -90,7 +90,7 @@ TEST(GeoDBTest, SimpleTest) {
 
 // Search.
 // Verify distances via http://www.stevemorse.org/nearest/distance.php
-TEST(GeoDBTest, Search) {
+TEST_F(GeoDBTest, Search) {
   GeoPosition pos1(45, 45);
   std::string id1("mid1");
   std::string value1 = "midvalue1";
@@ -119,5 +119,6 @@ TEST(GeoDBTest, Search) {
 }  // namespace rocksdb
 
 int main(int argc, char* argv[]) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/utilities/leveldb_options/leveldb_options.cc b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc
new file mode 100644
index 0000000..cb7dfb8
--- /dev/null
+++ b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc
@@ -0,0 +1,56 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/utilities/leveldb_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+LevelDBOptions::LevelDBOptions()
+    : comparator(BytewiseComparator()),
+      create_if_missing(false),
+      error_if_exists(false),
+      paranoid_checks(false),
+      env(Env::Default()),
+      info_log(nullptr),
+      write_buffer_size(4 << 20),
+      max_open_files(1000),
+      block_cache(nullptr),
+      block_size(4096),
+      block_restart_interval(16),
+      compression(kSnappyCompression),
+      filter_policy(nullptr) {}
+
+Options ConvertOptions(const LevelDBOptions& leveldb_options) {
+  Options options = Options();
+  options.create_if_missing = leveldb_options.create_if_missing;
+  options.error_if_exists = leveldb_options.error_if_exists;
+  options.paranoid_checks = leveldb_options.paranoid_checks;
+  options.env = leveldb_options.env;
+  options.info_log.reset(leveldb_options.info_log);
+  options.write_buffer_size = leveldb_options.write_buffer_size;
+  options.max_open_files = leveldb_options.max_open_files;
+  options.compression = leveldb_options.compression;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache.reset(leveldb_options.block_cache);
+  table_options.block_size = leveldb_options.block_size;
+  table_options.block_restart_interval = leveldb_options.block_restart_interval;
+  table_options.filter_policy.reset(leveldb_options.filter_policy);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  return options;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
index a68186a..a0d137c 100644
--- a/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
@@ -11,10 +11,10 @@
 
 #include "rocksdb/db.h"
 #include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
 #include "utilities/merge_operators.h"
 #include "utilities/merge_operators/string_append/stringappend.h"
 #include "utilities/merge_operators/string_append/stringappend2.h"
-#include "utilities/db_ttl.h"
 #include "util/testharness.h"
 #include "util/random.h"
 
@@ -32,7 +32,7 @@ std::shared_ptr<DB> OpenNormalDb(char delim_char) {
   Options options;
   options.create_if_missing = true;
   options.merge_operator.reset(new StringAppendOperator(delim_char));
-  ASSERT_OK(DB::Open(options, kDbName,  &db));
+  EXPECT_OK(DB::Open(options, kDbName, &db));
   return std::shared_ptr<DB>(db);
 }
 
@@ -42,7 +42,7 @@ std::shared_ptr<DB> OpenTtlDb(char delim_char) {
   Options options;
   options.create_if_missing = true;
   options.merge_operator.reset(new StringAppendTESTOperator(delim_char));
-  ASSERT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
+  EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
   return std::shared_ptr<DB>(db);
 }
 }  // namespace
@@ -106,7 +106,7 @@ class StringLists {
 
 
 // The class for unit-testing
-class StringAppendOperatorTest {
+class StringAppendOperatorTest : public testing::Test {
  public:
   StringAppendOperatorTest() {
     DestroyDB(kDbName, Options());    // Start each test with a fresh DB
@@ -127,7 +127,7 @@ StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = nullptr
 
 // THE TEST CASES BEGIN HERE
 
-TEST(StringAppendOperatorTest, IteratorTest) {
+TEST_F(StringAppendOperatorTest, IteratorTest) {
   auto db_ = OpenDb(',');
   StringLists slists(db_);
 
@@ -220,7 +220,7 @@ TEST(StringAppendOperatorTest, IteratorTest) {
 
 }
 
-TEST(StringAppendOperatorTest, SimpleTest) {
+TEST_F(StringAppendOperatorTest, SimpleTest) {
   auto db = OpenDb(',');
   StringLists slists(db);
 
@@ -235,7 +235,7 @@ TEST(StringAppendOperatorTest, SimpleTest) {
   ASSERT_EQ(res, "v1,v2,v3");
 }
 
-TEST(StringAppendOperatorTest, SimpleDelimiterTest) {
+TEST_F(StringAppendOperatorTest, SimpleDelimiterTest) {
   auto db = OpenDb('|');
   StringLists slists(db);
 
@@ -248,7 +248,7 @@ TEST(StringAppendOperatorTest, SimpleDelimiterTest) {
   ASSERT_EQ(res, "v1|v2|v3");
 }
 
-TEST(StringAppendOperatorTest, OneValueNoDelimiterTest) {
+TEST_F(StringAppendOperatorTest, OneValueNoDelimiterTest) {
   auto db = OpenDb('!');
   StringLists slists(db);
 
@@ -259,7 +259,7 @@ TEST(StringAppendOperatorTest, OneValueNoDelimiterTest) {
   ASSERT_EQ(res, "single_val");
 }
 
-TEST(StringAppendOperatorTest, VariousKeys) {
+TEST_F(StringAppendOperatorTest, VariousKeys) {
   auto db = OpenDb('\n');
   StringLists slists(db);
 
@@ -285,7 +285,7 @@ TEST(StringAppendOperatorTest, VariousKeys) {
 }
 
 // Generate semi random keys/words from a small distribution.
-TEST(StringAppendOperatorTest, RandomMixGetAppend) {
+TEST_F(StringAppendOperatorTest, RandomMixGetAppend) {
   auto db = OpenDb(' ');
   StringLists slists(db);
 
@@ -336,7 +336,7 @@ TEST(StringAppendOperatorTest, RandomMixGetAppend) {
 
 }
 
-TEST(StringAppendOperatorTest, BIGRandomMixGetAppend) {
+TEST_F(StringAppendOperatorTest, BIGRandomMixGetAppend) {
   auto db = OpenDb(' ');
   StringLists slists(db);
 
@@ -387,8 +387,7 @@ TEST(StringAppendOperatorTest, BIGRandomMixGetAppend) {
 
 }
 
-
-TEST(StringAppendOperatorTest, PersistentVariousKeys) {
+TEST_F(StringAppendOperatorTest, PersistentVariousKeys) {
   // Perform the following operations in limited scope
   {
     auto db = OpenDb('\n');
@@ -455,7 +454,7 @@ TEST(StringAppendOperatorTest, PersistentVariousKeys) {
   }
 }
 
-TEST(StringAppendOperatorTest, PersistentFlushAndCompaction) {
+TEST_F(StringAppendOperatorTest, PersistentFlushAndCompaction) {
   // Perform the following operations in limited scope
   {
     auto db = OpenDb('\n');
@@ -551,7 +550,7 @@ TEST(StringAppendOperatorTest, PersistentFlushAndCompaction) {
   }
 }
 
-TEST(StringAppendOperatorTest, SimpleTestNullDelimiter) {
+TEST_F(StringAppendOperatorTest, SimpleTestNullDelimiter) {
   auto db = OpenDb('\0');
   StringLists slists(db);
 
@@ -576,20 +575,22 @@ TEST(StringAppendOperatorTest, SimpleTestNullDelimiter) {
 
 } // namespace rocksdb
 
-int main(int arc, char** argv) {
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
   // Run with regular database
+  int result;
   {
     fprintf(stderr, "Running tests with regular db and operator.\n");
     StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
-    rocksdb::test::RunAllTests();
+    result = RUN_ALL_TESTS();
   }
 
   // Run with TTL
   {
     fprintf(stderr, "Running tests with ttl db and generic operator.\n");
     StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb);
-    rocksdb::test::RunAllTests();
+    result |= RUN_ALL_TESTS();
   }
 
-  return 0;
+  return result;
 }
diff --git a/src/rocksdb/utilities/merge_operators/uint64add.cc b/src/rocksdb/utilities/merge_operators/uint64add.cc
index 9d78651..d5083e3 100644
--- a/src/rocksdb/utilities/merge_operators/uint64add.cc
+++ b/src/rocksdb/utilities/merge_operators/uint64add.cc
@@ -1,3 +1,8 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
 #include <memory>
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
@@ -45,7 +50,8 @@ class UInt64AddOperator : public AssociativeMergeOperator {
       result = DecodeFixed64(value.data());
     } else if (logger != nullptr) {
       // If value is corrupted, treat it as 0
-      Log(logger, "uint64 value corruption, size: %zu > %zu",
+      Log(InfoLogLevel::ERROR_LEVEL, logger,
+          "uint64 value corruption, size: %zu > %zu",
           value.size(), sizeof(uint64_t));
     }
 
diff --git a/src/rocksdb/utilities/redis/redis_list_exception.h b/src/rocksdb/utilities/redis/redis_list_exception.h
index 0b0f376..f93bcbb 100644
--- a/src/rocksdb/utilities/redis/redis_list_exception.h
+++ b/src/rocksdb/utilities/redis/redis_list_exception.h
@@ -13,7 +13,7 @@ namespace rocksdb {
 
 class RedisListException: public std::exception {
  public:
-  const char* what() const throw() {
+  const char* what() const throw() override {
     return "Invalid operation or corrupt data in Redis List.";
   }
 };
diff --git a/src/rocksdb/utilities/redis/redis_list_iterator.h b/src/rocksdb/utilities/redis/redis_list_iterator.h
index b776ada..6d0b1a6 100644
--- a/src/rocksdb/utilities/redis/redis_list_iterator.h
+++ b/src/rocksdb/utilities/redis/redis_list_iterator.h
@@ -67,7 +67,7 @@ class RedisListIterator {
   ///      attempted, a RedisListException will immediately be thrown.
   RedisListIterator(const std::string& list_data)
       : data_(list_data.data()),
-        num_bytes_(list_data.size()),
+        num_bytes_(static_cast<uint32_t>(list_data.size())),
         cur_byte_(0),
         cur_elem_(0),
         cur_elem_length_(0),
@@ -135,11 +135,11 @@ class RedisListIterator {
     // Ensure we are in a valid state
     CheckErrors();
 
-    const int kOrigSize = result_.size();
+    const int kOrigSize = static_cast<int>(result_.size());
     result_.resize(kOrigSize + SizeOf(elem));
-    EncodeFixed32(result_.data() + kOrigSize, elem.size());
-    memcpy(result_.data() + kOrigSize + sizeof(uint32_t),
-           elem.data(),
+    EncodeFixed32(result_.data() + kOrigSize,
+                  static_cast<uint32_t>(elem.size()));
+    memcpy(result_.data() + kOrigSize + sizeof(uint32_t), elem.data(),
            elem.size());
     ++length_;
     ++cur_elem_;
@@ -169,7 +169,7 @@ class RedisListIterator {
   int Size() const {
     // result_ holds the currently written data
     // data_[cur_byte..num_bytes-1] is the remainder of the data
-    return result_.size() + (num_bytes_ - cur_byte_);
+    return static_cast<int>(result_.size() + (num_bytes_ - cur_byte_));
   }
 
   // Reached the end?
@@ -209,7 +209,7 @@ class RedisListIterator {
   /// E.G. This can be used to compute the bytes we want to Reserve().
   static uint32_t SizeOf(const Slice& elem) {
     // [Integer Length . Data]
-    return sizeof(uint32_t) + elem.size();
+    return static_cast<uint32_t>(sizeof(uint32_t) + elem.size());
   }
 
  private: // Private functions
diff --git a/src/rocksdb/utilities/redis/redis_lists_test.cc b/src/rocksdb/utilities/redis/redis_lists_test.cc
index b05c6c7..14ed316 100644
--- a/src/rocksdb/utilities/redis/redis_lists_test.cc
+++ b/src/rocksdb/utilities/redis/redis_lists_test.cc
@@ -28,7 +28,7 @@ using namespace std;
 
 namespace rocksdb {
 
-class RedisListsTest {
+class RedisListsTest : public testing::Test {
  public:
   static const string kDefaultDbName;
   static Options options;
@@ -55,7 +55,7 @@ void AssertListEq(const std::vector<std::string>& result,
 }  // namespace
 
 // PushRight, Length, Index, Range
-TEST(RedisListsTest, SimpleTest) {
+TEST_F(RedisListsTest, SimpleTest) {
   RedisLists redis(kDefaultDbName, options, true);   // Destructive
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -84,7 +84,7 @@ TEST(RedisListsTest, SimpleTest) {
 }
 
 // PushLeft, Length, Index, Range
-TEST(RedisListsTest, SimpleTest2) {
+TEST_F(RedisListsTest, SimpleTest2) {
   RedisLists redis(kDefaultDbName, options, true);   // Destructive
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -113,7 +113,7 @@ TEST(RedisListsTest, SimpleTest2) {
 }
 
 // Exhaustive test of the Index() function
-TEST(RedisListsTest, IndexTest) {
+TEST_F(RedisListsTest, IndexTest) {
   RedisLists redis(kDefaultDbName, options, true);   // Destructive
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -172,7 +172,7 @@ TEST(RedisListsTest, IndexTest) {
 
 
 // Exhaustive test of the Range() function
-TEST(RedisListsTest, RangeTest) {
+TEST_F(RedisListsTest, RangeTest) {
   RedisLists redis(kDefaultDbName, options, true);   // Destructive
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -255,7 +255,7 @@ TEST(RedisListsTest, RangeTest) {
 }
 
 // Exhaustive test for InsertBefore(), and InsertAfter()
-TEST(RedisListsTest, InsertTest) {
+TEST_F(RedisListsTest, InsertTest) {
   RedisLists redis(kDefaultDbName, options, true);
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -339,7 +339,7 @@ TEST(RedisListsTest, InsertTest) {
 }
 
 // Exhaustive test of Set function
-TEST(RedisListsTest, SetTest) {
+TEST_F(RedisListsTest, SetTest) {
   RedisLists redis(kDefaultDbName, options, true);
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -435,7 +435,7 @@ TEST(RedisListsTest, SetTest) {
 }
 
 // Testing Insert, Push, and Set, in a mixed environment
-TEST(RedisListsTest, InsertPushSetTest) {
+TEST_F(RedisListsTest, InsertPushSetTest) {
   RedisLists redis(kDefaultDbName, options, true);   // Destructive
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -527,7 +527,7 @@ TEST(RedisListsTest, InsertPushSetTest) {
 }
 
 // Testing Trim, Pop
-TEST(RedisListsTest, TrimPopTest) {
+TEST_F(RedisListsTest, TrimPopTest) {
   RedisLists redis(kDefaultDbName, options, true);   // Destructive
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -597,7 +597,7 @@ TEST(RedisListsTest, TrimPopTest) {
 }
 
 // Testing Remove, RemoveFirst, RemoveLast
-TEST(RedisListsTest, RemoveTest) {
+TEST_F(RedisListsTest, RemoveTest) {
   RedisLists redis(kDefaultDbName, options, true);   // Destructive
 
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
@@ -688,8 +688,7 @@ TEST(RedisListsTest, RemoveTest) {
 
 
 // Test Multiple keys and Persistence
-TEST(RedisListsTest, PersistenceMultiKeyTest) {
-
+TEST_F(RedisListsTest, PersistenceMultiKeyTest) {
   string tempv; // Used below for all Index(), PopRight(), PopLeft()
 
   // Block one: populate a single key in the database
@@ -745,9 +744,9 @@ TEST(RedisListsTest, PersistenceMultiKeyTest) {
 
 namespace {
 void MakeUpper(std::string* const s) {
-  int len = s->length();
-  for(int i=0; i<len; ++i) {
-    (*s)[i] = toupper((*s)[i]); // C-version defined in <ctype.h>
+  int len = static_cast<int>(s->length());
+  for (int i = 0; i < len; ++i) {
+    (*s)[i] = toupper((*s)[i]);  // C-version defined in <ctype.h>
   }
 }
 
@@ -874,11 +873,12 @@ bool found_arg(int argc, char* argv[], const char* want){
 // However, if -m is specified, it will do user manual/interactive testing
 // -m -d is manual and destructive (will clear the database before use)
 int main(int argc, char* argv[]) {
+  ::testing::InitGoogleTest(&argc, argv);
   if (found_arg(argc, argv, "-m")) {
     bool destructive = found_arg(argc, argv, "-d");
     return rocksdb::manual_redis_test(destructive);
   } else {
-    return rocksdb::test::RunAllTests();
+    return RUN_ALL_TESTS();
   }
 }
 
diff --git a/src/rocksdb/utilities/spatialdb/spatial_db.cc b/src/rocksdb/utilities/spatialdb/spatial_db.cc
new file mode 100644
index 0000000..a901853
--- /dev/null
+++ b/src/rocksdb/utilities/spatialdb/spatial_db.cc
@@ -0,0 +1,893 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/spatial_db.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <algorithm>
+#include <condition_variable>
+#include <inttypes.h>
+#include <string>
+#include <vector>
+#include <mutex>
+#include <thread>
+#include <set>
+#include <unordered_set>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/options.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "util/coding.h"
+#include "utilities/spatialdb/utils.h"
+
+namespace rocksdb {
+namespace spatial {
+
+// Column families are used to store element's data and spatial indexes. We use
+// [default] column family to store the element data. This is the format of
+// [default] column family:
+// * id (fixed 64 big endian) -> blob (length prefixed slice) feature_set
+// (serialized)
+// We have one additional column family for each spatial index. The name of the
+// column family is [spatial$<spatial_index_name>]. The format is:
+// * quad_key (fixed 64 bit big endian) id (fixed 64 bit big endian) -> ""
+// We store information about indexes in [metadata] column family. Format is:
+// * spatial$<spatial_index_name> -> bbox (4 double encodings) tile_bits
+// (varint32)
+
+namespace {
+const std::string kMetadataColumnFamilyName("metadata");
+inline std::string GetSpatialIndexColumnFamilyName(
+    const std::string& spatial_index_name) {
+  return "spatial$" + spatial_index_name;
+}
+inline bool GetSpatialIndexName(const std::string& column_family_name,
+                                Slice* dst) {
+  *dst = Slice(column_family_name);
+  if (dst->starts_with("spatial$")) {
+    dst->remove_prefix(8);  // strlen("spatial$")
+    return true;
+  }
+  return false;
+}
+
+}  // namespace
+
+Variant::Variant(const Variant& v) : type_(v.type_) {
+  switch (v.type_) {
+    case kNull:
+      break;
+    case kBool:
+      data_.b = v.data_.b;
+      break;
+    case kInt:
+      data_.i = v.data_.i;
+      break;
+    case kDouble:
+      data_.d = v.data_.d;
+      break;
+    case kString:
+      new (&data_.s) std::string(v.data_.s);
+      break;
+    default:
+      assert(false);
+  }
+}
+
+bool Variant::operator==(const Variant& rhs) {
+  if (type_ != rhs.type_) {
+    return false;
+  }
+
+  switch (type_) {
+    case kNull:
+      return true;
+    case kBool:
+      return data_.b == rhs.data_.b;
+    case kInt:
+      return data_.i == rhs.data_.i;
+    case kDouble:
+      return data_.d == rhs.data_.d;
+    case kString:
+      return data_.s == rhs.data_.s;
+    default:
+      assert(false);
+  }
+  // it will never reach here, but otherwise the compiler complains
+  return false;
+}
+
+bool Variant::operator!=(const Variant& rhs) { return !(*this == rhs); }
+
+FeatureSet* FeatureSet::Set(const std::string& key, const Variant& value) {
+  map_.insert({key, value});
+  return this;
+}
+
+bool FeatureSet::Contains(const std::string& key) const {
+  return map_.find(key) != map_.end();
+}
+
+const Variant& FeatureSet::Get(const std::string& key) const {
+  auto itr = map_.find(key);
+  assert(itr != map_.end());
+  return itr->second;
+}
+
+FeatureSet::iterator FeatureSet::Find(const std::string& key) const {
+  return iterator(map_.find(key));
+}
+
+void FeatureSet::Clear() { map_.clear(); }
+
+void FeatureSet::Serialize(std::string* output) const {
+  for (const auto& iter : map_) {
+    PutLengthPrefixedSlice(output, iter.first);
+    output->push_back(static_cast<char>(iter.second.type()));
+    switch (iter.second.type()) {
+      case Variant::kNull:
+        break;
+      case Variant::kBool:
+        output->push_back(static_cast<char>(iter.second.get_bool()));
+        break;
+      case Variant::kInt:
+        PutVarint64(output, iter.second.get_int());
+        break;
+      case Variant::kDouble: {
+        PutDouble(output, iter.second.get_double());
+        break;
+      }
+      case Variant::kString:
+        PutLengthPrefixedSlice(output, iter.second.get_string());
+        break;
+      default:
+        assert(false);
+    }
+  }
+}
+
+bool FeatureSet::Deserialize(const Slice& input) {
+  assert(map_.empty());
+  Slice s(input);
+  while (s.size()) {
+    Slice key;
+    if (!GetLengthPrefixedSlice(&s, &key) || s.size() == 0) {
+      return false;
+    }
+    char type = s[0];
+    s.remove_prefix(1);
+    switch (type) {
+      case Variant::kNull: {
+        map_.insert({key.ToString(), Variant()});
+        break;
+      }
+      case Variant::kBool: {
+        if (s.size() == 0) {
+          return false;
+        }
+        map_.insert({key.ToString(), Variant(static_cast<bool>(s[0]))});
+        s.remove_prefix(1);
+        break;
+      }
+      case Variant::kInt: {
+        uint64_t v;
+        if (!GetVarint64(&s, &v)) {
+          return false;
+        }
+        map_.insert({key.ToString(), Variant(v)});
+        break;
+      }
+      case Variant::kDouble: {
+        double d;
+        if (!GetDouble(&s, &d)) {
+          return false;
+        }
+        map_.insert({key.ToString(), Variant(d)});
+        break;
+      }
+      case Variant::kString: {
+        Slice str;
+        if (!GetLengthPrefixedSlice(&s, &str)) {
+          return false;
+        }
+        map_.insert({key.ToString(), str.ToString()});
+        break;
+      }
+      default:
+        return false;
+    }
+  }
+  return true;
+}
+
+std::string FeatureSet::DebugString() const {
+  std::string out = "{";
+  bool comma = false;
+  for (const auto& iter : map_) {
+    if (comma) {
+      out.append(", ");
+    } else {
+      comma = true;
+    }
+    out.append("\"" + iter.first + "\": ");
+    switch (iter.second.type()) {
+      case Variant::kNull:
+        out.append("null");
+        break;
+      case Variant::kBool:
+        if (iter.second.get_bool()) {
+          out.append("true");
+        } else {
+          out.append("false");
+        }
+        break;
+      case Variant::kInt: {
+        char buf[32];
+        snprintf(buf, sizeof(buf), "%" PRIu64, iter.second.get_int());
+        out.append(buf);
+        break;
+      }
+      case Variant::kDouble: {
+        char buf[32];
+        snprintf(buf, sizeof(buf), "%lf", iter.second.get_double());
+        out.append(buf);
+        break;
+      }
+      case Variant::kString:
+        out.append("\"" + iter.second.get_string() + "\"");
+        break;
+      default:
+        assert(false);
+    }
+  }
+  return out + "}";
+}
+
+class ValueGetter {
+ public:
+  ValueGetter() {}
+  virtual ~ValueGetter() {}
+
+  virtual bool Get(uint64_t id) = 0;
+  virtual const Slice value() const = 0;
+
+  virtual Status status() const = 0;
+};
+
+class ValueGetterFromDB : public ValueGetter {
+ public:
+  ValueGetterFromDB(DB* db, ColumnFamilyHandle* cf) : db_(db), cf_(cf) {}
+
+  virtual bool Get(uint64_t id) override {
+    std::string encoded_id;
+    PutFixed64BigEndian(&encoded_id, id);
+    status_ = db_->Get(ReadOptions(), cf_, encoded_id, &value_);
+    if (status_.IsNotFound()) {
+      status_ = Status::Corruption("Index inconsistency");
+      return false;
+    }
+
+    return true;
+  }
+
+  virtual const Slice value() const override { return value_; }
+
+  virtual Status status() const override { return status_; }
+
+ private:
+  std::string value_;
+  DB* db_;
+  ColumnFamilyHandle* cf_;
+  Status status_;
+};
+
+class ValueGetterFromIterator : public ValueGetter {
+ public:
+  explicit ValueGetterFromIterator(Iterator* iterator) : iterator_(iterator) {}
+
+  virtual bool Get(uint64_t id) override {
+    std::string encoded_id;
+    PutFixed64BigEndian(&encoded_id, id);
+    iterator_->Seek(encoded_id);
+
+    if (!iterator_->Valid() || iterator_->key() != Slice(encoded_id)) {
+      status_ = Status::Corruption("Index inconsistency");
+      return false;
+    }
+
+    return true;
+  }
+
+  virtual const Slice value() const override { return iterator_->value(); }
+
+  virtual Status status() const override { return status_; }
+
+ private:
+  std::unique_ptr<Iterator> iterator_;
+  Status status_;
+};
+
+class SpatialIndexCursor : public Cursor {
+ public:
+  // tile_box is inclusive
+  SpatialIndexCursor(Iterator* spatial_iterator, ValueGetter* value_getter,
+                     const BoundingBox<uint64_t>& tile_bbox, uint32_t tile_bits)
+      : value_getter_(value_getter), valid_(true) {
+    // calculate quad keys we'll need to query
+    std::vector<uint64_t> quad_keys;
+    quad_keys.reserve((tile_bbox.max_x - tile_bbox.min_x + 1) *
+                      (tile_bbox.max_y - tile_bbox.min_y + 1));
+    for (uint64_t x = tile_bbox.min_x; x <= tile_bbox.max_x; ++x) {
+      for (uint64_t y = tile_bbox.min_y; y <= tile_bbox.max_y; ++y) {
+        quad_keys.push_back(GetQuadKeyFromTile(x, y, tile_bits));
+      }
+    }
+    std::sort(quad_keys.begin(), quad_keys.end());
+
+    // load primary key ids for all quad keys
+    for (auto quad_key : quad_keys) {
+      std::string encoded_quad_key;
+      PutFixed64BigEndian(&encoded_quad_key, quad_key);
+      Slice slice_quad_key(encoded_quad_key);
+
+      // If CheckQuadKey is true, there is no need to reseek, since
+      // spatial_iterator is already pointing at the correct quad key. This is
+      // an optimization.
+      if (!CheckQuadKey(spatial_iterator, slice_quad_key)) {
+        spatial_iterator->Seek(slice_quad_key);
+      }
+
+      while (CheckQuadKey(spatial_iterator, slice_quad_key)) {
+        // extract ID from spatial_iterator
+        uint64_t id;
+        bool ok = GetFixed64BigEndian(
+            Slice(spatial_iterator->key().data() + sizeof(uint64_t),
+                  sizeof(uint64_t)),
+            &id);
+        if (!ok) {
+          valid_ = false;
+          status_ = Status::Corruption("Spatial index corruption");
+          break;
+        }
+        primary_key_ids_.insert(id);
+        spatial_iterator->Next();
+      }
+    }
+
+    if (!spatial_iterator->status().ok()) {
+      status_ = spatial_iterator->status();
+      valid_ = false;
+    }
+    delete spatial_iterator;
+
+    valid_ = valid_ && !primary_key_ids_.empty();
+
+    if (valid_) {
+      primary_keys_iterator_ = primary_key_ids_.begin();
+      ExtractData();
+    }
+  }
+
+  virtual bool Valid() const override { return valid_; }
+
+  virtual void Next() override {
+    assert(valid_);
+
+    ++primary_keys_iterator_;
+    if (primary_keys_iterator_ == primary_key_ids_.end()) {
+      valid_ = false;
+      return;
+    }
+
+    ExtractData();
+  }
+
+  virtual const Slice blob() override { return current_blob_; }
+  virtual const FeatureSet& feature_set() override {
+    return current_feature_set_;
+  }
+
+  virtual Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    }
+    return value_getter_->status();
+  }
+
+ private:
+  // * returns true if spatial iterator is on the current quad key and all is
+  // well
+  // * returns false if spatial iterator is not on current, or iterator is
+  // invalid or corruption
+  bool CheckQuadKey(Iterator* spatial_iterator, const Slice& quad_key) {
+    if (!spatial_iterator->Valid()) {
+      return false;
+    }
+    if (spatial_iterator->key().size() != 2 * sizeof(uint64_t)) {
+      status_ = Status::Corruption("Invalid spatial index key");
+      valid_ = false;
+      return false;
+    }
+    Slice spatial_iterator_quad_key(spatial_iterator->key().data(),
+                                    sizeof(uint64_t));
+    if (spatial_iterator_quad_key != quad_key) {
+      // caller needs to reseek
+      return false;
+    }
+    // if we come to here, we have found the quad key
+    return true;
+  }
+
+  void ExtractData() {
+    assert(valid_);
+    valid_ = value_getter_->Get(*primary_keys_iterator_);
+
+    if (valid_) {
+      Slice data = value_getter_->value();
+      current_feature_set_.Clear();
+      if (!GetLengthPrefixedSlice(&data, &current_blob_) ||
+          !current_feature_set_.Deserialize(data)) {
+        status_ = Status::Corruption("Primary key column family corruption");
+        valid_ = false;
+      }
+    }
+
+  }
+
+  unique_ptr<ValueGetter> value_getter_;
+  bool valid_;
+  Status status_;
+
+  FeatureSet current_feature_set_;
+  Slice current_blob_;
+
+  // This is loaded from spatial iterator.
+  std::unordered_set<uint64_t> primary_key_ids_;
+  std::unordered_set<uint64_t>::iterator primary_keys_iterator_;
+};
+
+class ErrorCursor : public Cursor {
+ public:
+  explicit ErrorCursor(Status s) : s_(s) { assert(!s.ok()); }
+  virtual Status status() const override { return s_; }
+  virtual bool Valid() const override { return false; }
+  virtual void Next() override { assert(false); }
+
+  virtual const Slice blob() override {
+    assert(false);
+    return Slice();
+  }
+  virtual const FeatureSet& feature_set() override {
+    assert(false);
+    // compiler complains otherwise
+    return trash_;
+  }
+
+ private:
+  Status s_;
+  FeatureSet trash_;
+};
+
+class SpatialDBImpl : public SpatialDB {
+ public:
+  // * db -- base DB that needs to be forwarded to StackableDB
+  // * data_column_family -- column family used to store the data
+  // * spatial_indexes -- a list of spatial indexes together with column
+  // families that correspond to those spatial indexes
+  // * next_id -- next ID in auto-incrementing ID. This is usually
+  // `max_id_currenty_in_db + 1`
+  SpatialDBImpl(
+      DB* db, ColumnFamilyHandle* data_column_family,
+      const std::vector<std::pair<SpatialIndexOptions, ColumnFamilyHandle*>>&
+          spatial_indexes,
+      uint64_t next_id, bool read_only)
+      : SpatialDB(db),
+        data_column_family_(data_column_family),
+        next_id_(next_id),
+        read_only_(read_only) {
+    for (const auto& index : spatial_indexes) {
+      name_to_index_.insert(
+          {index.first.name, IndexColumnFamily(index.first, index.second)});
+    }
+  }
+
+  ~SpatialDBImpl() {
+    for (auto& iter : name_to_index_) {
+      delete iter.second.column_family;
+    }
+    delete data_column_family_;
+  }
+
+  virtual Status Insert(
+      const WriteOptions& write_options, const BoundingBox<double>& bbox,
+      const Slice& blob, const FeatureSet& feature_set,
+      const std::vector<std::string>& spatial_indexes) override {
+    WriteBatch batch;
+
+    if (spatial_indexes.size() == 0) {
+      return Status::InvalidArgument("Spatial indexes can't be empty");
+    }
+
+    const size_t kWriteOutEveryBytes = 1024 * 1024;  // 1MB
+    uint64_t id = next_id_.fetch_add(1);
+
+    for (const auto& si : spatial_indexes) {
+      auto itr = name_to_index_.find(si);
+      if (itr == name_to_index_.end()) {
+        return Status::InvalidArgument("Can't find index " + si);
+      }
+      const auto& spatial_index = itr->second.index;
+      if (!spatial_index.bbox.Intersects(bbox)) {
+        continue;
+      }
+      BoundingBox<uint64_t> tile_bbox = GetTileBoundingBox(spatial_index, bbox);
+
+      for (uint64_t x = tile_bbox.min_x; x <= tile_bbox.max_x; ++x) {
+        for (uint64_t y = tile_bbox.min_y; y <= tile_bbox.max_y; ++y) {
+          // see above for format
+          std::string key;
+          PutFixed64BigEndian(
+              &key, GetQuadKeyFromTile(x, y, spatial_index.tile_bits));
+          PutFixed64BigEndian(&key, id);
+          batch.Put(itr->second.column_family, key, Slice());
+          if (batch.GetDataSize() >= kWriteOutEveryBytes) {
+            Status s = Write(write_options, &batch);
+            batch.Clear();
+            if (!s.ok()) {
+              return s;
+            }
+          }
+        }
+      }
+    }
+
+    // see above for format
+    std::string data_key;
+    PutFixed64BigEndian(&data_key, id);
+    std::string data_value;
+    PutLengthPrefixedSlice(&data_value, blob);
+    feature_set.Serialize(&data_value);
+    batch.Put(data_column_family_, data_key, data_value);
+
+    return Write(write_options, &batch);
+  }
+
+  virtual Status Compact(int num_threads) override {
+    std::vector<ColumnFamilyHandle*> column_families;
+    column_families.push_back(data_column_family_);
+
+    for (auto& iter : name_to_index_) {
+      column_families.push_back(iter.second.column_family);
+    }
+
+    std::mutex state_mutex;
+    std::condition_variable cv;
+    Status s;
+    int threads_running = 0;
+
+    std::vector<std::thread> threads;
+
+    for (auto cfh : column_families) {
+      threads.emplace_back([&, cfh] {
+          {
+            std::unique_lock<std::mutex> lk(state_mutex);
+            cv.wait(lk, [&] { return threads_running < num_threads; });
+            threads_running++;
+          }
+
+          Status t = Flush(FlushOptions(), cfh);
+          if (t.ok()) {
+            t = CompactRange(cfh, nullptr, nullptr);
+          }
+
+          {
+            std::unique_lock<std::mutex> lk(state_mutex);
+            threads_running--;
+            if (s.ok() && !t.ok()) {
+              s = t;
+            }
+            cv.notify_one();
+          }
+      });
+    }
+
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    return s;
+  }
+
+  virtual Cursor* Query(const ReadOptions& read_options,
+                        const BoundingBox<double>& bbox,
+                        const std::string& spatial_index) override {
+    auto itr = name_to_index_.find(spatial_index);
+    if (itr == name_to_index_.end()) {
+      return new ErrorCursor(Status::InvalidArgument(
+          "Spatial index " + spatial_index + " not found"));
+    }
+    const auto& si = itr->second.index;
+    Iterator* spatial_iterator;
+    ValueGetter* value_getter;
+
+    if (read_only_) {
+      spatial_iterator = NewIterator(read_options, itr->second.column_family);
+      value_getter = new ValueGetterFromDB(this, data_column_family_);
+    } else {
+      std::vector<Iterator*> iterators;
+      Status s = NewIterators(read_options,
+                              {data_column_family_, itr->second.column_family},
+                              &iterators);
+      if (!s.ok()) {
+        return new ErrorCursor(s);
+      }
+
+      spatial_iterator = iterators[1];
+      value_getter = new ValueGetterFromIterator(iterators[0]);
+    }
+    return new SpatialIndexCursor(spatial_iterator, value_getter,
+                                  GetTileBoundingBox(si, bbox), si.tile_bits);
+  }
+
+ private:
+  ColumnFamilyHandle* data_column_family_;
+  struct IndexColumnFamily {
+    SpatialIndexOptions index;
+    ColumnFamilyHandle* column_family;
+    IndexColumnFamily(const SpatialIndexOptions& _index,
+                      ColumnFamilyHandle* _cf)
+        : index(_index), column_family(_cf) {}
+  };
+  // constant after construction!
+  std::unordered_map<std::string, IndexColumnFamily> name_to_index_;
+
+  std::atomic<uint64_t> next_id_;
+  bool read_only_;
+};
+
+namespace {
+DBOptions GetDBOptionsFromSpatialDBOptions(const SpatialDBOptions& options) {
+  DBOptions db_options;
+  db_options.max_open_files = 50000;
+  db_options.max_background_compactions = 3 * options.num_threads / 4;
+  db_options.max_background_flushes =
+      options.num_threads - db_options.max_background_compactions;
+  db_options.env->SetBackgroundThreads(db_options.max_background_compactions,
+                                       Env::LOW);
+  db_options.env->SetBackgroundThreads(db_options.max_background_flushes,
+                                       Env::HIGH);
+  db_options.statistics = CreateDBStatistics();
+  if (options.bulk_load) {
+    db_options.stats_dump_period_sec = 600;
+    db_options.disableDataSync = true;
+  } else {
+    db_options.stats_dump_period_sec = 1800;  // 30min
+  }
+  return db_options;
+}
+
+ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options,
+                                           std::shared_ptr<Cache> block_cache) {
+  ColumnFamilyOptions column_family_options;
+  column_family_options.write_buffer_size = 128 * 1024 * 1024;  // 128MB
+  column_family_options.max_write_buffer_number = 4;
+  column_family_options.max_bytes_for_level_base = 256 * 1024 * 1024;  // 256MB
+  column_family_options.target_file_size_base = 64 * 1024 * 1024;      // 64MB
+  column_family_options.level0_file_num_compaction_trigger = 2;
+  column_family_options.level0_slowdown_writes_trigger = 16;
+  column_family_options.level0_slowdown_writes_trigger = 32;
+  // only compress levels >= 2
+  column_family_options.compression_per_level.resize(
+      column_family_options.num_levels);
+  for (int i = 0; i < column_family_options.num_levels; ++i) {
+    if (i < 2) {
+      column_family_options.compression_per_level[i] = kNoCompression;
+    } else {
+      column_family_options.compression_per_level[i] = kLZ4Compression;
+    }
+  }
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = block_cache;
+  column_family_options.table_factory.reset(
+      NewBlockBasedTableFactory(table_options));
+  return column_family_options;
+}
+
+ColumnFamilyOptions OptimizeOptionsForDataColumnFamily(
+    ColumnFamilyOptions options, std::shared_ptr<Cache> block_cache) {
+  options.prefix_extractor.reset(NewNoopTransform());
+  BlockBasedTableOptions block_based_options;
+  block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
+  block_based_options.block_cache = block_cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+  return options;
+}
+
+}  // namespace
+
+class MetadataStorage {
+ public:
+  MetadataStorage(DB* db, ColumnFamilyHandle* cf) : db_(db), cf_(cf) {}
+  ~MetadataStorage() {}
+
+  // format: <min_x double> <min_y double> <max_x double> <max_y double>
+  // <tile_bits varint32>
+  Status AddIndex(const SpatialIndexOptions& index) {
+    std::string encoded_index;
+    PutDouble(&encoded_index, index.bbox.min_x);
+    PutDouble(&encoded_index, index.bbox.min_y);
+    PutDouble(&encoded_index, index.bbox.max_x);
+    PutDouble(&encoded_index, index.bbox.max_y);
+    PutVarint32(&encoded_index, index.tile_bits);
+    return db_->Put(WriteOptions(), cf_,
+                    GetSpatialIndexColumnFamilyName(index.name), encoded_index);
+  }
+
+  Status GetIndex(const std::string& name, SpatialIndexOptions* dst) {
+    std::string value;
+    Status s = db_->Get(ReadOptions(), cf_,
+                        GetSpatialIndexColumnFamilyName(name), &value);
+    if (!s.ok()) {
+      return s;
+    }
+    dst->name = name;
+    Slice encoded_index(value);
+    bool ok = GetDouble(&encoded_index, &(dst->bbox.min_x));
+    ok = ok && GetDouble(&encoded_index, &(dst->bbox.min_y));
+    ok = ok && GetDouble(&encoded_index, &(dst->bbox.max_x));
+    ok = ok && GetDouble(&encoded_index, &(dst->bbox.max_y));
+    ok = ok && GetVarint32(&encoded_index, &(dst->tile_bits));
+    return ok ? Status::OK() : Status::Corruption("Index encoding corrupted");
+  }
+
+ private:
+  DB* db_;
+  ColumnFamilyHandle* cf_;
+};
+
+Status SpatialDB::Create(
+    const SpatialDBOptions& options, const std::string& name,
+    const std::vector<SpatialIndexOptions>& spatial_indexes) {
+  DBOptions db_options = GetDBOptionsFromSpatialDBOptions(options);
+  db_options.create_if_missing = true;
+  db_options.create_missing_column_families = true;
+  db_options.error_if_exists = true;
+
+  auto block_cache = NewLRUCache(options.cache_size);
+  ColumnFamilyOptions column_family_options =
+      GetColumnFamilyOptions(options, block_cache);
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(ColumnFamilyDescriptor(
+      kDefaultColumnFamilyName,
+      OptimizeOptionsForDataColumnFamily(column_family_options, block_cache)));
+  column_families.push_back(
+      ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options));
+
+  for (const auto& index : spatial_indexes) {
+    column_families.emplace_back(GetSpatialIndexColumnFamilyName(index.name),
+                                 column_family_options);
+  }
+
+  std::vector<ColumnFamilyHandle*> handles;
+  DB* base_db;
+  Status s = DB::Open(db_options, name, column_families, &handles, &base_db);
+  if (!s.ok()) {
+    return s;
+  }
+  MetadataStorage metadata(base_db, handles[1]);
+  for (const auto& index : spatial_indexes) {
+    s = metadata.AddIndex(index);
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  for (auto h : handles) {
+    delete h;
+  }
+  delete base_db;
+
+  return s;
+}
+
+Status SpatialDB::Open(const SpatialDBOptions& options, const std::string& name,
+                       SpatialDB** db, bool read_only) {
+  DBOptions db_options = GetDBOptionsFromSpatialDBOptions(options);
+  auto block_cache = NewLRUCache(options.cache_size);
+  ColumnFamilyOptions column_family_options =
+      GetColumnFamilyOptions(options, block_cache);
+
+  Status s;
+  std::vector<std::string> existing_column_families;
+  std::vector<std::string> spatial_indexes;
+  s = DB::ListColumnFamilies(db_options, name, &existing_column_families);
+  if (!s.ok()) {
+    return s;
+  }
+  for (const auto& cf_name : existing_column_families) {
+    Slice spatial_index;
+    if (GetSpatialIndexName(cf_name, &spatial_index)) {
+      spatial_indexes.emplace_back(spatial_index.data(), spatial_index.size());
+    }
+  }
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(ColumnFamilyDescriptor(
+      kDefaultColumnFamilyName,
+      OptimizeOptionsForDataColumnFamily(column_family_options, block_cache)));
+  column_families.push_back(
+      ColumnFamilyDescriptor(kMetadataColumnFamilyName, column_family_options));
+
+  for (const auto& index : spatial_indexes) {
+    column_families.emplace_back(GetSpatialIndexColumnFamilyName(index),
+                                 column_family_options);
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  DB* base_db;
+  if (read_only) {
+    s = DB::OpenForReadOnly(db_options, name, column_families, &handles,
+                            &base_db);
+  } else {
+    s = DB::Open(db_options, name, column_families, &handles, &base_db);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  MetadataStorage metadata(base_db, handles[1]);
+
+  std::vector<std::pair<SpatialIndexOptions, ColumnFamilyHandle*>> index_cf;
+  assert(handles.size() == spatial_indexes.size() + 2);
+  for (size_t i = 0; i < spatial_indexes.size(); ++i) {
+    SpatialIndexOptions index_options;
+    s = metadata.GetIndex(spatial_indexes[i], &index_options);
+    if (!s.ok()) {
+      break;
+    }
+    index_cf.emplace_back(index_options, handles[i + 2]);
+  }
+  uint64_t next_id = 1;
+  if (s.ok()) {
+    // find next_id
+    Iterator* iter = base_db->NewIterator(ReadOptions(), handles[0]);
+    iter->SeekToLast();
+    if (iter->Valid()) {
+      uint64_t last_id = 0;
+      if (!GetFixed64BigEndian(iter->key(), &last_id)) {
+        s = Status::Corruption("Invalid key in data column family");
+      } else {
+        next_id = last_id + 1;
+      }
+    }
+    delete iter;
+  }
+  if (!s.ok()) {
+    for (auto h : handles) {
+      delete h;
+    }
+    delete base_db;
+    return s;
+  }
+
+  // I don't need metadata column family any more, so delete it
+  delete handles[1];
+  *db = new SpatialDBImpl(base_db, handles[0], index_cf, next_id, read_only);
+  return Status::OK();
+}
+
+}  // namespace spatial
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/spatialdb/spatial_db_test.cc b/src/rocksdb/utilities/spatialdb/spatial_db_test.cc
new file mode 100644
index 0000000..b304664
--- /dev/null
+++ b/src/rocksdb/utilities/spatialdb/spatial_db_test.cc
@@ -0,0 +1,274 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <vector>
+#include <string>
+#include <set>
+
+#include "rocksdb/utilities/spatial_db.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/random.h"
+
+namespace rocksdb {
+namespace spatial {
+
+class SpatialDBTest : public testing::Test {
+ public:
+  SpatialDBTest() {
+    dbname_ = test::TmpDir() + "/spatial_db_test";
+    DestroyDB(dbname_, Options());
+  }
+
+  void AssertCursorResults(BoundingBox<double> bbox, const std::string& index,
+                           const std::vector<std::string>& blobs) {
+    Cursor* c = db_->Query(ReadOptions(), bbox, index);
+    ASSERT_OK(c->status());
+    std::multiset<std::string> b;
+    for (auto x : blobs) {
+      b.insert(x);
+    }
+
+    while (c->Valid()) {
+      auto itr = b.find(c->blob().ToString());
+      ASSERT_TRUE(itr != b.end());
+      b.erase(itr);
+      c->Next();
+    }
+    ASSERT_EQ(b.size(), 0U);
+    ASSERT_OK(c->status());
+    delete c;
+  }
+
+  std::string dbname_;
+  SpatialDB* db_;
+};
+
+TEST_F(SpatialDBTest, FeatureSetSerializeTest) {
+  FeatureSet fs;
+
+  fs.Set("a", std::string("b"));
+  fs.Set("x", static_cast<uint64_t>(3));
+  fs.Set("y", false);
+  fs.Set("n", Variant());  // null
+  fs.Set("m", 3.25);
+
+  ASSERT_TRUE(fs.Find("w") == fs.end());
+  ASSERT_TRUE(fs.Find("x") != fs.end());
+  ASSERT_TRUE((*fs.Find("x")).second == Variant(static_cast<uint64_t>(3)));
+  ASSERT_TRUE((*fs.Find("y")).second != Variant(true));
+  std::set<std::string> keys({"a", "x", "y", "n", "m"});
+  for (const auto& x : fs) {
+    ASSERT_TRUE(keys.find(x.first) != keys.end());
+    keys.erase(x.first);
+  }
+  ASSERT_EQ(keys.size(), 0U);
+
+  std::string serialized;
+  fs.Serialize(&serialized);
+
+  FeatureSet deserialized;
+  ASSERT_TRUE(deserialized.Deserialize(serialized));
+
+  ASSERT_TRUE(deserialized.Contains("a"));
+  ASSERT_EQ(deserialized.Get("a").type(), Variant::kString);
+  ASSERT_EQ(deserialized.Get("a").get_string(), "b");
+  ASSERT_TRUE(deserialized.Contains("x"));
+  ASSERT_EQ(deserialized.Get("x").type(), Variant::kInt);
+  ASSERT_EQ(deserialized.Get("x").get_int(), static_cast<uint64_t>(3));
+  ASSERT_TRUE(deserialized.Contains("y"));
+  ASSERT_EQ(deserialized.Get("y").type(), Variant::kBool);
+  ASSERT_EQ(deserialized.Get("y").get_bool(), false);
+  ASSERT_TRUE(deserialized.Contains("n"));
+  ASSERT_EQ(deserialized.Get("n").type(), Variant::kNull);
+  ASSERT_TRUE(deserialized.Contains("m"));
+  ASSERT_EQ(deserialized.Get("m").type(), Variant::kDouble);
+  ASSERT_EQ(deserialized.Get("m").get_double(), 3.25);
+
+  // corrupted serialization
+  serialized = serialized.substr(0, serialized.size() - 3);
+  deserialized.Clear();
+  ASSERT_TRUE(!deserialized.Deserialize(serialized));
+}
+
+TEST_F(SpatialDBTest, TestNextID) {
+  ASSERT_OK(SpatialDB::Create(
+      SpatialDBOptions(), dbname_,
+      {SpatialIndexOptions("simple", BoundingBox<double>(0, 0, 100, 100), 2)}));
+
+  ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_));
+  ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(5, 5, 10, 10),
+                        "one", FeatureSet(), {"simple"}));
+  ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(10, 10, 15, 15),
+                        "two", FeatureSet(), {"simple"}));
+  delete db_;
+
+  ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_));
+  ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(55, 55, 65, 65),
+                        "three", FeatureSet(), {"simple"}));
+  delete db_;
+
+  ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_));
+  AssertCursorResults(BoundingBox<double>(0, 0, 100, 100), "simple",
+                      {"one", "two", "three"});
+  delete db_;
+}
+
+TEST_F(SpatialDBTest, FeatureSetTest) {
+  ASSERT_OK(SpatialDB::Create(
+      SpatialDBOptions(), dbname_,
+      {SpatialIndexOptions("simple", BoundingBox<double>(0, 0, 100, 100), 2)}));
+  ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_));
+
+  FeatureSet fs;
+  fs.Set("a", std::string("b"));
+  fs.Set("c", std::string("d"));
+
+  ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(5, 5, 10, 10),
+                        "one", fs, {"simple"}));
+
+  Cursor* c =
+      db_->Query(ReadOptions(), BoundingBox<double>(5, 5, 10, 10), "simple");
+
+  ASSERT_TRUE(c->Valid());
+  ASSERT_EQ(c->blob().compare("one"), 0);
+  FeatureSet returned = c->feature_set();
+  ASSERT_TRUE(returned.Contains("a"));
+  ASSERT_TRUE(!returned.Contains("b"));
+  ASSERT_TRUE(returned.Contains("c"));
+  ASSERT_EQ(returned.Get("a").type(), Variant::kString);
+  ASSERT_EQ(returned.Get("a").get_string(), "b");
+  ASSERT_EQ(returned.Get("c").type(), Variant::kString);
+  ASSERT_EQ(returned.Get("c").get_string(), "d");
+
+  c->Next();
+  ASSERT_TRUE(!c->Valid());
+
+  delete c;
+  delete db_;
+}
+
+TEST_F(SpatialDBTest, SimpleTest) {
+  // iter 0 -- not read only
+  // iter 1 -- read only
+  for (int iter = 0; iter < 2; ++iter) {
+    DestroyDB(dbname_, Options());
+    ASSERT_OK(SpatialDB::Create(
+        SpatialDBOptions(), dbname_,
+        {SpatialIndexOptions("index", BoundingBox<double>(0, 0, 128, 128),
+                             3)}));
+    ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_));
+
+    ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(33, 17, 63, 79),
+                          "one", FeatureSet(), {"index"}));
+    ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(65, 65, 111, 111),
+                          "two", FeatureSet(), {"index"}));
+    ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(1, 49, 127, 63),
+                          "three", FeatureSet(), {"index"}));
+    ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(20, 100, 21, 101),
+                          "four", FeatureSet(), {"index"}));
+    ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(81, 33, 127, 63),
+                          "five", FeatureSet(), {"index"}));
+    ASSERT_OK(db_->Insert(WriteOptions(), BoundingBox<double>(1, 65, 47, 95),
+                          "six", FeatureSet(), {"index"}));
+
+    if (iter == 1) {
+      delete db_;
+      ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_, true));
+    }
+
+    AssertCursorResults(BoundingBox<double>(33, 17, 47, 31), "index", {"one"});
+    AssertCursorResults(BoundingBox<double>(17, 33, 79, 63), "index",
+                        {"one", "three"});
+    AssertCursorResults(BoundingBox<double>(17, 81, 63, 111), "index",
+                        {"four", "six"});
+    AssertCursorResults(BoundingBox<double>(85, 86, 85, 86), "index", {"two"});
+    AssertCursorResults(BoundingBox<double>(33, 1, 127, 111), "index",
+                        {"one", "two", "three", "five", "six"});
+    // even though the bounding box doesn't intersect, we got "four" back
+    // because
+    // it's in the same tile
+    AssertCursorResults(BoundingBox<double>(18, 98, 19, 99), "index", {"four"});
+    AssertCursorResults(BoundingBox<double>(130, 130, 131, 131), "index", {});
+    AssertCursorResults(BoundingBox<double>(81, 17, 127, 31), "index", {});
+    AssertCursorResults(BoundingBox<double>(90, 50, 91, 51), "index",
+                        {"three", "five"});
+
+    delete db_;
+  }
+}
+
+namespace {
+std::string RandomStr(Random* rnd) {
+  std::string r;
+  for (int k = 0; k < 10; ++k) {
+    r.push_back(rnd->Uniform(26) + 'a');
+  }
+  return r;
+}
+
+BoundingBox<int> RandomBoundingBox(int limit, Random* rnd, int max_size) {
+  BoundingBox<int> r;
+  r.min_x = rnd->Uniform(limit - 1);
+  r.min_y = rnd->Uniform(limit - 1);
+  r.max_x = r.min_x + rnd->Uniform(std::min(limit - 1 - r.min_x, max_size)) + 1;
+  r.max_y = r.min_y + rnd->Uniform(std::min(limit - 1 - r.min_y, max_size)) + 1;
+  return r;
+}
+
+BoundingBox<double> ScaleBB(BoundingBox<int> b, double step) {
+  return BoundingBox<double>(b.min_x * step + 1, b.min_y * step + 1,
+                             (b.max_x + 1) * step - 1,
+                             (b.max_y + 1) * step - 1);
+}
+
+}  // namespace
+
+TEST_F(SpatialDBTest, RandomizedTest) {
+  Random rnd(301);
+  std::vector<std::pair<std::string, BoundingBox<int>>> elements;
+
+  BoundingBox<double> spatial_index_bounds(0, 0, (1LL << 32), (1LL << 32));
+  ASSERT_OK(SpatialDB::Create(
+      SpatialDBOptions(), dbname_,
+      {SpatialIndexOptions("index", spatial_index_bounds, 7)}));
+  ASSERT_OK(SpatialDB::Open(SpatialDBOptions(), dbname_, &db_));
+  double step = (1LL << 32) / (1 << 7);
+
+  for (int i = 0; i < 1000; ++i) {
+    std::string blob = RandomStr(&rnd);
+    BoundingBox<int> bbox = RandomBoundingBox(128, &rnd, 10);
+    ASSERT_OK(db_->Insert(WriteOptions(), ScaleBB(bbox, step), blob,
+                          FeatureSet(), {"index"}));
+    elements.push_back(make_pair(blob, bbox));
+  }
+
+  // parallel
+  db_->Compact(2);
+  // serial
+  db_->Compact(1);
+
+  for (int i = 0; i < 1000; ++i) {
+    BoundingBox<int> int_bbox = RandomBoundingBox(128, &rnd, 10);
+    BoundingBox<double> double_bbox = ScaleBB(int_bbox, step);
+    std::vector<std::string> blobs;
+    for (auto e : elements) {
+      if (e.second.Intersects(int_bbox)) {
+        blobs.push_back(e.first);
+      }
+    }
+    AssertCursorResults(double_bbox, "index", blobs);
+  }
+
+  delete db_;
+}
+
+}  // namespace spatial
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/spatialdb/utils.h b/src/rocksdb/utilities/spatialdb/utils.h
new file mode 100644
index 0000000..c65ccf5
--- /dev/null
+++ b/src/rocksdb/utilities/spatialdb/utils.h
@@ -0,0 +1,95 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#include <string>
+#include <algorithm>
+
+#include "rocksdb/utilities/spatial_db.h"
+
+namespace rocksdb {
+namespace spatial {
+
+// indexing idea from http://msdn.microsoft.com/en-us/library/bb259689.aspx
+inline uint64_t GetTileFromCoord(double x, double start, double end,
+                                 uint32_t tile_bits) {
+  if (x < start) {
+    return 0;
+  }
+  uint64_t tiles = static_cast<uint64_t>(1) << tile_bits;
+  uint64_t r = ((x - start) / (end - start)) * tiles;
+  return std::min(r, tiles - 1);
+}
+
+inline uint64_t GetQuadKeyFromTile(uint64_t tile_x, uint64_t tile_y,
+                                   uint32_t tile_bits) {
+  uint64_t quad_key = 0;
+  for (uint32_t i = 0; i < tile_bits; ++i) {
+    uint64_t mask = static_cast<uint64_t>(1LL << i);
+    quad_key |= (tile_x & mask) << i;
+    quad_key |= (tile_y & mask) << (i + 1);
+  }
+  return quad_key;
+}
+
+inline BoundingBox<uint64_t> GetTileBoundingBox(
+    const SpatialIndexOptions& spatial_index, BoundingBox<double> bbox) {
+  return BoundingBox<uint64_t>(
+      GetTileFromCoord(bbox.min_x, spatial_index.bbox.min_x,
+                       spatial_index.bbox.max_x, spatial_index.tile_bits),
+      GetTileFromCoord(bbox.min_y, spatial_index.bbox.min_y,
+                       spatial_index.bbox.max_y, spatial_index.tile_bits),
+      GetTileFromCoord(bbox.max_x, spatial_index.bbox.min_x,
+                       spatial_index.bbox.max_x, spatial_index.tile_bits),
+      GetTileFromCoord(bbox.max_y, spatial_index.bbox.min_y,
+                       spatial_index.bbox.max_y, spatial_index.tile_bits));
+}
+
+// big endian can be compared using memcpy
+inline void PutFixed64BigEndian(std::string* dst, uint64_t value) {
+  char buf[sizeof(value)];
+  buf[0] = (value >> 56) & 0xff;
+  buf[1] = (value >> 48) & 0xff;
+  buf[2] = (value >> 40) & 0xff;
+  buf[3] = (value >> 32) & 0xff;
+  buf[4] = (value >> 24) & 0xff;
+  buf[5] = (value >> 16) & 0xff;
+  buf[6] = (value >> 8) & 0xff;
+  buf[7] = value & 0xff;
+  dst->append(buf, sizeof(buf));
+}
+
+// big endian can be compared using memcpy
+inline bool GetFixed64BigEndian(const Slice& input, uint64_t* value) {
+  if (input.size() < sizeof(uint64_t)) {
+    return false;
+  }
+  auto ptr = input.data();
+  *value = (static_cast<uint64_t>(static_cast<unsigned char>(ptr[0])) << 56) |
+           (static_cast<uint64_t>(static_cast<unsigned char>(ptr[1])) << 48) |
+           (static_cast<uint64_t>(static_cast<unsigned char>(ptr[2])) << 40) |
+           (static_cast<uint64_t>(static_cast<unsigned char>(ptr[3])) << 32) |
+           (static_cast<uint64_t>(static_cast<unsigned char>(ptr[4])) << 24) |
+           (static_cast<uint64_t>(static_cast<unsigned char>(ptr[5])) << 16) |
+           (static_cast<uint64_t>(static_cast<unsigned char>(ptr[6])) << 8) |
+           static_cast<uint64_t>(static_cast<unsigned char>(ptr[7]));
+  return true;
+}
+
+inline void PutDouble(std::string* dst, double d) {
+  dst->append(reinterpret_cast<char*>(&d), sizeof(double));
+}
+
+inline bool GetDouble(Slice* input, double* d) {
+  if (input->size() < sizeof(double)) {
+    return false;
+  }
+  memcpy(d, input->data(), sizeof(double));
+  input->remove_prefix(sizeof(double));
+  return true;
+}
+
+}  // namespace spatial
+}  // namespace rocksdb
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.cc b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
index f7a697f..f3d9417 100644
--- a/src/rocksdb/utilities/ttl/db_ttl_impl.cc
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
@@ -5,7 +5,8 @@
 
 #include "utilities/ttl/db_ttl_impl.h"
 
-#include "utilities/db_ttl.h"
+#include "rocksdb/utilities/convenience.h"
+#include "rocksdb/utilities/db_ttl.h"
 #include "db/filename.h"
 #include "db/write_batch_internal.h"
 #include "util/coding.h"
@@ -34,7 +35,11 @@ void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
 // Open the db inside DBWithTTLImpl because options needs pointer to its ttl
 DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db) {}
 
-DBWithTTLImpl::~DBWithTTLImpl() { delete GetOptions().compaction_filter; }
+DBWithTTLImpl::~DBWithTTLImpl() {
+  // Need to stop background compaction before getting rid of the filter
+  CancelAllBackgroundWork(db_, /* wait = */ true);
+  delete GetOptions().compaction_filter;
+}
 
 Status UtilityDB::OpenTtlDB(const Options& options, const std::string& dbname,
                             StackableDB** dbptr, int32_t ttl, bool read_only) {
@@ -202,8 +207,18 @@ std::vector<Status> DBWithTTLImpl::MultiGet(
     const ReadOptions& options,
     const std::vector<ColumnFamilyHandle*>& column_family,
     const std::vector<Slice>& keys, std::vector<std::string>* values) {
-  return std::vector<Status>(
-      keys.size(), Status::NotSupported("MultiGet not supported with TTL"));
+  auto statuses = db_->MultiGet(options, column_family, keys, values);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if (!statuses[i].ok()) {
+      continue;
+    }
+    statuses[i] = SanityCheckTimestamp((*values)[i]);
+    if (!statuses[i].ok()) {
+      continue;
+    }
+    statuses[i] = StripTS(&(*values)[i]);
+  }
+  return statuses;
 }
 
 bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options,
@@ -234,7 +249,7 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
     WriteBatch updates_ttl;
     Status batch_rewrite_status;
     virtual Status PutCF(uint32_t column_family_id, const Slice& key,
-                         const Slice& value) {
+                         const Slice& value) override {
       std::string value_with_ts;
       Status st = AppendTS(value, &value_with_ts, env_);
       if (!st.ok()) {
@@ -246,7 +261,7 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
       return Status::OK();
     }
     virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
-                           const Slice& value) {
+                           const Slice& value) override {
       std::string value_with_ts;
       Status st = AppendTS(value, &value_with_ts, env_);
       if (!st.ok()) {
@@ -257,11 +272,14 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
       }
       return Status::OK();
     }
-    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+    virtual Status DeleteCF(uint32_t column_family_id,
+                            const Slice& key) override {
       WriteBatchInternal::Delete(&updates_ttl, column_family_id, key);
       return Status::OK();
     }
-    virtual void LogData(const Slice& blob) { updates_ttl.PutLogData(blob); }
+    virtual void LogData(const Slice& blob) override {
+      updates_ttl.PutLogData(blob);
+    }
 
    private:
     Env* env_;
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.h b/src/rocksdb/utilities/ttl/db_ttl_impl.h
index a5c8fc8..9abf6fc 100644
--- a/src/rocksdb/utilities/ttl/db_ttl_impl.h
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.h
@@ -13,8 +13,8 @@
 #include "rocksdb/env.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/merge_operator.h"
-#include "utilities/utility_db.h"
-#include "utilities/db_ttl.h"
+#include "rocksdb/utilities/utility_db.h"
+#include "rocksdb/utilities/db_ttl.h"
 #include "db/db_impl.h"
 
 namespace rocksdb {
@@ -71,7 +71,7 @@ class DBWithTTLImpl : public DBWithTTL {
   virtual Iterator* NewIterator(const ReadOptions& opts,
                                 ColumnFamilyHandle* column_family) override;
 
-  virtual DB* GetBaseDB() { return db_; }
+  virtual DB* GetBaseDB() override { return db_; }
 
   static bool IsStale(const Slice& value, int32_t ttl, Env* env);
 
@@ -95,26 +95,26 @@ class TtlIterator : public Iterator {
 
   ~TtlIterator() { delete iter_; }
 
-  bool Valid() const { return iter_->Valid(); }
+  bool Valid() const override { return iter_->Valid(); }
 
-  void SeekToFirst() { iter_->SeekToFirst(); }
+  void SeekToFirst() override { iter_->SeekToFirst(); }
 
-  void SeekToLast() { iter_->SeekToLast(); }
+  void SeekToLast() override { iter_->SeekToLast(); }
 
-  void Seek(const Slice& target) { iter_->Seek(target); }
+  void Seek(const Slice& target) override { iter_->Seek(target); }
 
-  void Next() { iter_->Next(); }
+  void Next() override { iter_->Next(); }
 
-  void Prev() { iter_->Prev(); }
+  void Prev() override { iter_->Prev(); }
 
-  Slice key() const { return iter_->key(); }
+  Slice key() const override { return iter_->key(); }
 
   int32_t timestamp() const {
     return DecodeFixed32(iter_->value().data() + iter_->value().size() -
                          DBWithTTLImpl::kTSLength);
   }
 
-  Slice value() const {
+  Slice value() const override {
     // TODO: handle timestamp corruption like in general iterator semantics
     assert(DBWithTTLImpl::SanityCheckTimestamp(iter_->value()).ok());
     Slice trimmed_value = iter_->value();
@@ -122,7 +122,7 @@ class TtlIterator : public Iterator {
     return trimmed_value;
   }
 
-  Status status() const { return iter_->status(); }
+  Status status() const override { return iter_->status(); }
 
  private:
   Iterator* iter_;
@@ -187,7 +187,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory {
       : ttl_(ttl), env_(env), user_comp_filter_factory_(comp_filter_factory) {}
 
   virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& context) {
+      const CompactionFilter::Context& context) override {
     return std::unique_ptr<TtlCompactionFilter>(new TtlCompactionFilter(
         ttl_, env_, nullptr,
         std::move(user_comp_filter_factory_->CreateCompactionFilter(context))));
@@ -206,7 +206,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory {
 class TtlMergeOperator : public MergeOperator {
 
  public:
-  explicit TtlMergeOperator(const std::shared_ptr<MergeOperator> merge_op,
+  explicit TtlMergeOperator(const std::shared_ptr<MergeOperator>& merge_op,
                             Env* env)
       : user_merge_op_(merge_op), env_(env) {
     assert(merge_op);
@@ -219,7 +219,8 @@ class TtlMergeOperator : public MergeOperator {
       override {
     const uint32_t ts_len = DBWithTTLImpl::kTSLength;
     if (existing_value && existing_value->size() < ts_len) {
-      Log(logger, "Error: Could not remove timestamp from existing value.");
+      Log(InfoLogLevel::ERROR_LEVEL, logger,
+          "Error: Could not remove timestamp from existing value.");
       return false;
     }
 
@@ -227,7 +228,8 @@ class TtlMergeOperator : public MergeOperator {
     std::deque<std::string> operands_without_ts;
     for (const auto& operand : operands) {
       if (operand.size() < ts_len) {
-        Log(logger, "Error: Could not remove timestamp from operand value.");
+        Log(InfoLogLevel::ERROR_LEVEL, logger,
+            "Error: Could not remove timestamp from operand value.");
         return false;
       }
       operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len));
@@ -253,7 +255,7 @@ class TtlMergeOperator : public MergeOperator {
     // Augment the *new_value with the ttl time-stamp
     int64_t curtime;
     if (!env_->GetCurrentTime(&curtime).ok()) {
-      Log(logger,
+      Log(InfoLogLevel::ERROR_LEVEL, logger,
           "Error: Could not get current time to be attached internally "
           "to the new value.");
       return false;
@@ -274,7 +276,8 @@ class TtlMergeOperator : public MergeOperator {
 
     for (const auto& operand : operand_list) {
       if (operand.size() < ts_len) {
-        Log(logger, "Error: Could not remove timestamp from value.");
+        Log(InfoLogLevel::ERROR_LEVEL, logger,
+            "Error: Could not remove timestamp from value.");
         return false;
       }
 
@@ -292,7 +295,7 @@ class TtlMergeOperator : public MergeOperator {
     // Augment the *new_value with the ttl time-stamp
     int64_t curtime;
     if (!env_->GetCurrentTime(&curtime).ok()) {
-      Log(logger,
+      Log(InfoLogLevel::ERROR_LEVEL, logger,
           "Error: Could not get current time to be attached internally "
           "to the new value.");
       return false;
diff --git a/src/rocksdb/utilities/ttl/ttl_test.cc b/src/rocksdb/utilities/ttl/ttl_test.cc
index 4791a2a..c970047 100644
--- a/src/rocksdb/utilities/ttl/ttl_test.cc
+++ b/src/rocksdb/utilities/ttl/ttl_test.cc
@@ -4,7 +4,7 @@
 
 #include <memory>
 #include "rocksdb/compaction_filter.h"
-#include "utilities/db_ttl.h"
+#include "rocksdb/utilities/db_ttl.h"
 #include "util/testharness.h"
 #include "util/logging.h"
 #include <map>
@@ -29,7 +29,7 @@ class SpecialTimeEnv : public EnvWrapper {
   }
 
   void Sleep(int64_t sleep_time) { current_time_ += sleep_time; }
-  virtual Status GetCurrentTime(int64_t* current_time) {
+  virtual Status GetCurrentTime(int64_t* current_time) override {
     *current_time = current_time_;
     return Status::OK();
   }
@@ -38,7 +38,7 @@ class SpecialTimeEnv : public EnvWrapper {
   int64_t current_time_;
 };
 
-class TtlTest {
+class TtlTest : public testing::Test {
  public:
   TtlTest() {
     env_.reset(new SpecialTimeEnv(Env::Default()));
@@ -94,7 +94,8 @@ class TtlTest {
   void MakeKVMap(int64_t num_entries) {
     kvmap_.clear();
     int digits = 1;
-    for (int dummy = num_entries; dummy /= 10 ; ++digits);
+    for (int64_t dummy = num_entries; dummy /= 10; ++digits) {
+    }
     int digits_in_i = 1;
     for (int64_t i = 0; i < num_entries; i++) {
       std::string key = "key";
@@ -110,17 +111,18 @@ class TtlTest {
       AppendNumberTo(&value, i);
       kvmap_[key] = value;
     }
-    ASSERT_EQ((int)kvmap_.size(), num_entries);//check all insertions done
+    ASSERT_EQ(static_cast<int64_t>(kvmap_.size()),
+              num_entries);  // check all insertions done
   }
 
   // Makes a write-batch with key-vals from kvmap_ and 'Write''s it
-  void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) {
-    ASSERT_LE(num_ops, (int)kvmap_.size());
+  void MakePutWriteBatch(const BatchOperation* batch_ops, int64_t num_ops) {
+    ASSERT_LE(num_ops, static_cast<int64_t>(kvmap_.size()));
     static WriteOptions wopts;
     static FlushOptions flush_opts;
     WriteBatch batch;
     kv_it_ = kvmap_.begin();
-    for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) {
+    for (int64_t i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) {
       switch (batch_ops[i]) {
         case PUT:
           batch.Put(kv_it_->first, kv_it_->second);
@@ -137,15 +139,16 @@ class TtlTest {
   }
 
   // Puts num_entries starting from start_pos_map from kvmap_ into the database
-  void PutValues(int start_pos_map, int num_entries, bool flush = true,
+  void PutValues(int64_t start_pos_map, int64_t num_entries, bool flush = true,
                  ColumnFamilyHandle* cf = nullptr) {
     ASSERT_TRUE(db_ttl_);
-    ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size());
+    ASSERT_LE(start_pos_map + num_entries, static_cast<int64_t>(kvmap_.size()));
     static WriteOptions wopts;
     static FlushOptions flush_opts;
     kv_it_ = kvmap_.begin();
     advance(kv_it_, start_pos_map);
-    for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) {
+    for (int64_t i = 0; kv_it_ != kvmap_.end() && i < num_entries;
+         i++, ++kv_it_) {
       ASSERT_OK(cf == nullptr
                     ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second)
                     : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second));
@@ -191,13 +194,32 @@ class TtlTest {
     }
   }
 
+  // checks the whole kvmap_ to return correct values using MultiGet
+  void SimpleMultiGetTest() {
+    static ReadOptions ropts;
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+
+    for (auto& kv : kvmap_) {
+      keys.emplace_back(kv.first);
+    }
+
+    auto statuses = db_ttl_->MultiGet(ropts, keys, &values);
+    size_t i = 0;
+    for (auto& kv : kvmap_) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(values[i], kv.second);
+      ++i;
+    }
+  }
+
   // Sleeps for slp_tim then runs a manual compaction
   // Checks span starting from st_pos from kvmap_ in the db and
   // Gets should return true if check is true and false otherwise
   // Also checks that value that we got is the same as inserted; and =kNewValue
   //   if test_compaction_change is true
-  void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true,
-                         bool test_compaction_change = false,
+  void SleepCompactCheck(int slp_tim, int64_t st_pos, int64_t span,
+                         bool check = true, bool test_compaction_change = false,
                          ColumnFamilyHandle* cf = nullptr) {
     ASSERT_TRUE(db_ttl_);
 
@@ -207,7 +229,7 @@ class TtlTest {
     kv_it_ = kvmap_.begin();
     advance(kv_it_, st_pos);
     std::string v;
-    for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) {
+    for (int64_t i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) {
       Status s = (cf == nullptr) ? db_ttl_->Get(ropts, kv_it_->first, &v)
                                  : db_ttl_->Get(ropts, cf, kv_it_->first, &v);
       if (s.ok() != check) {
@@ -235,7 +257,8 @@ class TtlTest {
   }
 
   // Similar as SleepCompactCheck but uses TtlIterator to read from db
-  void SleepCompactCheckIter(int slp, int st_pos, int span, bool check=true) {
+  void SleepCompactCheckIter(int slp, int st_pos, int64_t span,
+                             bool check = true) {
     ASSERT_TRUE(db_ttl_);
     env_->Sleep(slp);
     ManualCompact();
@@ -250,9 +273,8 @@ class TtlTest {
         ASSERT_NE(dbiter->value().compare(kv_it_->second), 0);
       }
     } else {  // dbiter should have found out kvmap_[st_pos]
-      for (int i = st_pos;
-           kv_it_ != kvmap_.end() && i < st_pos + span;
-           i++, kv_it_++)  {
+      for (int64_t i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span;
+           i++, ++kv_it_) {
         ASSERT_TRUE(dbiter->Valid());
         ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0);
         dbiter->Next();
@@ -263,7 +285,7 @@ class TtlTest {
 
   class TestFilter : public CompactionFilter {
    public:
-    TestFilter(const int64_t kSampleSize, const std::string kNewValue)
+    TestFilter(const int64_t kSampleSize, const std::string& kNewValue)
       : kSampleSize_(kSampleSize),
         kNewValue_(kNewValue) {
     }
@@ -288,7 +310,7 @@ class TtlTest {
         return false; // Keep keys not matching the format "key<NUMBER>"
       }
 
-      int partition = kSampleSize_ / 3;
+      int64_t partition = kSampleSize_ / 3;
       if (num_key_end < partition) {
         return true;
       } else if (num_key_end < partition * 2) {
@@ -311,7 +333,7 @@ class TtlTest {
 
   class TestFilterFactory : public CompactionFilterFactory {
     public:
-      TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue)
+      TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue)
         : kSampleSize_(kSampleSize),
           kNewValue_(kNewValue) {
       }
@@ -350,10 +372,10 @@ class TtlTest {
 // This test opens the db 3 times with such default behavior and inserts a
 // bunch of kvs each time. All kvs should accumulate in the db till the end
 // Partitions the sample-size provided into 3 sets over boundary1 and boundary2
-TEST(TtlTest, NoEffect) {
+TEST_F(TtlTest, NoEffect) {
   MakeKVMap(kSampleSize_);
-  int boundary1 = kSampleSize_ / 3;
-  int boundary2 = 2 * boundary1;
+  int64_t boundary1 = kSampleSize_ / 3;
+  int64_t boundary2 = 2 * boundary1;
 
   OpenTtl();
   PutValues(0, boundary1);                       //T=0: Set1 never deleted
@@ -372,7 +394,7 @@ TEST(TtlTest, NoEffect) {
 }
 
 // Puts a set of values and checks its presence using Get during ttl
-TEST(TtlTest, PresentDuringTTL) {
+TEST_F(TtlTest, PresentDuringTTL) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(2);                                 // T=0:Open the db with ttl = 2
@@ -382,7 +404,7 @@ TEST(TtlTest, PresentDuringTTL) {
 }
 
 // Puts a set of values and checks its absence using Get after ttl
-TEST(TtlTest, AbsentAfterTTL) {
+TEST_F(TtlTest, AbsentAfterTTL) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(1);                                  // T=0:Open the db with ttl = 2
@@ -393,7 +415,7 @@ TEST(TtlTest, AbsentAfterTTL) {
 
 // Resets the timestamp of a set of kvs by updating them and checks that they
 // are not deleted according to the old timestamp
-TEST(TtlTest, ResetTimestamp) {
+TEST_F(TtlTest, ResetTimestamp) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(3);
@@ -405,7 +427,7 @@ TEST(TtlTest, ResetTimestamp) {
 }
 
 // Similar to PresentDuringTTL but uses Iterator
-TEST(TtlTest, IterPresentDuringTTL) {
+TEST_F(TtlTest, IterPresentDuringTTL) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(2);
@@ -415,7 +437,7 @@ TEST(TtlTest, IterPresentDuringTTL) {
 }
 
 // Similar to AbsentAfterTTL but uses Iterator
-TEST(TtlTest, IterAbsentAfterTTL) {
+TEST_F(TtlTest, IterAbsentAfterTTL) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(1);
@@ -426,7 +448,7 @@ TEST(TtlTest, IterAbsentAfterTTL) {
 
 // Checks presence while opening the same db more than once with the same ttl
 // Note: The second open will open the same db
-TEST(TtlTest, MultiOpenSamePresent) {
+TEST_F(TtlTest, MultiOpenSamePresent) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(2);
@@ -440,7 +462,7 @@ TEST(TtlTest, MultiOpenSamePresent) {
 
 // Checks absence while opening the same db more than once with the same ttl
 // Note: The second open will open the same db
-TEST(TtlTest, MultiOpenSameAbsent) {
+TEST_F(TtlTest, MultiOpenSameAbsent) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(1);
@@ -453,7 +475,7 @@ TEST(TtlTest, MultiOpenSameAbsent) {
 }
 
 // Checks presence while opening the same db more than once with bigger ttl
-TEST(TtlTest, MultiOpenDifferent) {
+TEST_F(TtlTest, MultiOpenDifferent) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(1);
@@ -466,7 +488,7 @@ TEST(TtlTest, MultiOpenDifferent) {
 }
 
 // Checks presence during ttl in read_only mode
-TEST(TtlTest, ReadOnlyPresentForever) {
+TEST_F(TtlTest, ReadOnlyPresentForever) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl(1);                                 // T=0:Open the db normally
@@ -480,7 +502,7 @@ TEST(TtlTest, ReadOnlyPresentForever) {
 
 // Checks whether WriteBatch works well with TTL
 // Puts all kvs in kvmap_ in a batch and writes first, then deletes first half
-TEST(TtlTest, WriteBatchTest) {
+TEST_F(TtlTest, WriteBatchTest) {
   MakeKVMap(kSampleSize_);
   BatchOperation batch_ops[kSampleSize_];
   for (int i = 0; i < kSampleSize_; i++) {
@@ -499,7 +521,7 @@ TEST(TtlTest, WriteBatchTest) {
 }
 
 // Checks user's compaction filter for correctness with TTL logic
-TEST(TtlTest, CompactionFilter) {
+TEST_F(TtlTest, CompactionFilter) {
   MakeKVMap(kSampleSize_);
 
   OpenTtlWithTestCompaction(1);
@@ -510,16 +532,16 @@ TEST(TtlTest, CompactionFilter) {
 
   OpenTtlWithTestCompaction(3);
   PutValues(0, kSampleSize_);                   // T=0:Insert Set1.
-  int partition = kSampleSize_ / 3;
-  SleepCompactCheck(1, 0, partition, false);   // Part dropped
-  SleepCompactCheck(0, partition, partition);  // Part kept
+  int64_t partition = kSampleSize_ / 3;
+  SleepCompactCheck(1, 0, partition, false);                  // Part dropped
+  SleepCompactCheck(0, partition, partition);                 // Part kept
   SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed
   CloseTtl();
 }
 
 // Insert some key-values which KeyMayExist should be able to get and check that
 // values returned are fine
-TEST(TtlTest, KeyMayExist) {
+TEST_F(TtlTest, KeyMayExist) {
   MakeKVMap(kSampleSize_);
 
   OpenTtl();
@@ -530,7 +552,18 @@ TEST(TtlTest, KeyMayExist) {
   CloseTtl();
 }
 
-TEST(TtlTest, ColumnFamiliesTest) {
+TEST_F(TtlTest, MultiGetTest) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl();
+  PutValues(0, kSampleSize_, false);
+
+  SimpleMultiGetTest();
+
+  CloseTtl();
+}
+
+TEST_F(TtlTest, ColumnFamiliesTest) {
   DB* db;
   Options options;
   options.create_if_missing = true;
@@ -591,5 +624,6 @@ TEST(TtlTest, ColumnFamiliesTest) {
 
 // A black-box test for the ttl wrapper around rocksdb
 int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
new file mode 100644
index 0000000..0c3e02f
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -0,0 +1,665 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+#include <memory>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "db/column_family.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace rocksdb {
+
+// when direction == forward
+// * current_at_base_ <=> base_iterator > delta_iterator
+// when direction == backwards
+// * current_at_base_ <=> base_iterator < delta_iterator
+// always:
+// * equal_keys_ <=> base_iterator == delta_iterator
+class BaseDeltaIterator : public Iterator {
+ public:
+  BaseDeltaIterator(Iterator* base_iterator, WBWIIterator* delta_iterator,
+                    const Comparator* comparator)
+      : forward_(true),
+        current_at_base_(true),
+        equal_keys_(false),
+        status_(Status::OK()),
+        base_iterator_(base_iterator),
+        delta_iterator_(delta_iterator),
+        comparator_(comparator) {}
+
+  virtual ~BaseDeltaIterator() {}
+
+  bool Valid() const override {
+    return current_at_base_ ? BaseValid() : DeltaValid();
+  }
+
+  void SeekToFirst() override {
+    forward_ = true;
+    base_iterator_->SeekToFirst();
+    delta_iterator_->SeekToFirst();
+    UpdateCurrent();
+  }
+
+  void SeekToLast() override {
+    forward_ = false;
+    base_iterator_->SeekToLast();
+    delta_iterator_->SeekToLast();
+    UpdateCurrent();
+  }
+
+  void Seek(const Slice& k) override {
+    forward_ = true;
+    base_iterator_->Seek(k);
+    delta_iterator_->Seek(k);
+    UpdateCurrent();
+  }
+
+  void Next() override {
+    if (!Valid()) {
+      status_ = Status::NotSupported("Next() on invalid iterator");
+    }
+
+    if (!forward_) {
+      // Need to change direction
+      // if our direction was backward and we're not equal, we have two states:
+      // * both iterators are valid: we're already in a good state (current
+      // shows to smaller)
+      // * only one iterator is valid: we need to advance that iterator
+      forward_ = true;
+      equal_keys_ = false;
+      if (!BaseValid()) {
+        assert(DeltaValid());
+        base_iterator_->SeekToFirst();
+      } else if (!DeltaValid()) {
+        delta_iterator_->SeekToFirst();
+      } else if (current_at_base_) {
+        // Change delta from larger than base to smaller
+        AdvanceDelta();
+      } else {
+        // Change base from larger than delta to smaller
+        AdvanceBase();
+      }
+      if (DeltaValid() && BaseValid()) {
+        if (Compare() == 0) {
+          equal_keys_ = true;
+        }
+      }
+    }
+    Advance();
+  }
+
+  void Prev() override {
+    if (!Valid()) {
+      status_ = Status::NotSupported("Prev() on invalid iterator");
+    }
+
+    if (forward_) {
+      // Need to change direction
+      // if our direction was backward and we're not equal, we have two states:
+      // * both iterators are valid: we're already in a good state (current
+      // shows to smaller)
+      // * only one iterator is valid: we need to advance that iterator
+      forward_ = false;
+      equal_keys_ = false;
+      if (!BaseValid()) {
+        assert(DeltaValid());
+        base_iterator_->SeekToLast();
+      } else if (!DeltaValid()) {
+        delta_iterator_->SeekToLast();
+      } else if (current_at_base_) {
+        // Change delta from less advanced than base to more advanced
+        AdvanceDelta();
+      } else {
+        // Change base from less advanced than delta to more advanced
+        AdvanceBase();
+      }
+      if (DeltaValid() && BaseValid()) {
+        if (Compare() == 0) {
+          equal_keys_ = true;
+        }
+      }
+    }
+
+    Advance();
+  }
+
+  Slice key() const override {
+    return current_at_base_ ? base_iterator_->key()
+                            : delta_iterator_->Entry().key;
+  }
+
+  Slice value() const override {
+    return current_at_base_ ? base_iterator_->value()
+                            : delta_iterator_->Entry().value;
+  }
+
+  Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    }
+    if (!base_iterator_->status().ok()) {
+      return base_iterator_->status();
+    }
+    return delta_iterator_->status();
+  }
+
+ private:
+  // -1 -- delta less advanced than base
+  // 0 -- delta == base
+  // 1 -- delta more advanced than base
+  int Compare() const {
+    assert(delta_iterator_->Valid() && base_iterator_->Valid());
+    int cmp = comparator_->Compare(delta_iterator_->Entry().key,
+                                   base_iterator_->key());
+    if (forward_) {
+      return cmp;
+    } else {
+      return -cmp;
+    }
+  }
+  bool IsDeltaDelete() {
+    assert(DeltaValid());
+    return delta_iterator_->Entry().type == kDeleteRecord;
+  }
+  void AssertInvariants() {
+#ifndef NDEBUG
+    if (!Valid()) {
+      return;
+    }
+    if (!BaseValid()) {
+      assert(!current_at_base_ && delta_iterator_->Valid());
+      return;
+    }
+    if (!DeltaValid()) {
+      assert(current_at_base_ && base_iterator_->Valid());
+      return;
+    }
+    // we don't support those yet
+    assert(delta_iterator_->Entry().type != kMergeRecord &&
+           delta_iterator_->Entry().type != kLogDataRecord);
+    int compare = comparator_->Compare(delta_iterator_->Entry().key,
+                                       base_iterator_->key());
+    if (forward_) {
+      // current_at_base -> compare < 0
+      assert(!current_at_base_ || compare < 0);
+      // !current_at_base -> compare <= 0
+      assert(current_at_base_ && compare >= 0);
+    } else {
+      // current_at_base -> compare > 0
+      assert(!current_at_base_ || compare > 0);
+      // !current_at_base -> compare <= 0
+      assert(current_at_base_ && compare <= 0);
+    }
+    // equal_keys_ <=> compare == 0
+    assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0));
+#endif
+  }
+
+  void Advance() {
+    if (equal_keys_) {
+      assert(BaseValid() && DeltaValid());
+      AdvanceBase();
+      AdvanceDelta();
+    } else {
+      if (current_at_base_) {
+        assert(BaseValid());
+        AdvanceBase();
+      } else {
+        assert(DeltaValid());
+        AdvanceDelta();
+      }
+    }
+    UpdateCurrent();
+  }
+
+  void AdvanceDelta() {
+    if (forward_) {
+      delta_iterator_->Next();
+    } else {
+      delta_iterator_->Prev();
+    }
+  }
+  void AdvanceBase() {
+    if (forward_) {
+      base_iterator_->Next();
+    } else {
+      base_iterator_->Prev();
+    }
+  }
+  bool BaseValid() const { return base_iterator_->Valid(); }
+  bool DeltaValid() const { return delta_iterator_->Valid(); }
  // Decide which of the two iterators supplies the current entry, skipping
  // over Delete records in the delta (a batch Delete hides the base entry
  // with the same key and produces no output itself).  Loops until it lands
  // on a visible entry or both iterators are exhausted.
  void UpdateCurrent() {
    while (true) {
      equal_keys_ = false;
      if (!BaseValid()) {
        // Base has finished.
        if (!DeltaValid()) {
          // Finished
          return;
        }
        if (IsDeltaDelete()) {
          // A Delete with no base counterpart: nothing to expose, keep going.
          AdvanceDelta();
        } else {
          current_at_base_ = false;
          return;
        }
      } else if (!DeltaValid()) {
        // Delta has finished.
        current_at_base_ = true;
        return;
      } else {
        int compare = Compare();
        if (compare <= 0) {  // delta bigger or equal
          if (compare == 0) {
            equal_keys_ = true;
          }
          if (!IsDeltaDelete()) {
            current_at_base_ = false;
            return;
          }
          // Delta is less advanced and is delete.
          AdvanceDelta();
          if (equal_keys_) {
            // The Delete also hides the base entry with the same key.
            AdvanceBase();
          }
        } else {
          current_at_base_ = true;
          return;
        }
      }
    }

    // NOTE(review): unreachable — every path in the loop above returns, so
    // this invariant check never runs; presumably it was meant to execute
    // before each return.
    AssertInvariants();
  }
+
+  bool forward_;
+  bool current_at_base_;
+  bool equal_keys_;
+  Status status_;
+  std::unique_ptr<Iterator> base_iterator_;
+  std::unique_ptr<WBWIIterator> delta_iterator_;
+  const Comparator* comparator_;  // not owned
+};
+
+typedef SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>
+    WriteBatchEntrySkipList;
+
+class WBWIIteratorImpl : public WBWIIterator {
+ public:
+  WBWIIteratorImpl(uint32_t column_family_id,
+                   WriteBatchEntrySkipList* skip_list,
+                   const ReadableWriteBatch* write_batch)
+      : column_family_id_(column_family_id),
+        skip_list_iter_(skip_list),
+        write_batch_(write_batch),
+        valid_(false) {}
+
+  virtual ~WBWIIteratorImpl() {}
+
+  virtual bool Valid() const override { return valid_; }
+
+  virtual void SeekToFirst() override {
+    valid_ = true;
+    WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
+                                      column_family_id_);
+    skip_list_iter_.Seek(&search_entry);
+    ReadEntry();
+  }
+
+  virtual void SeekToLast() override {
+    valid_ = true;
+    WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
+                                      column_family_id_ + 1);
+    skip_list_iter_.Seek(&search_entry);
+    if (!skip_list_iter_.Valid()) {
+      skip_list_iter_.SeekToLast();
+    } else {
+      skip_list_iter_.Prev();
+    }
+    ReadEntry();
+  }
+
+  virtual void Seek(const Slice& key) override {
+    valid_ = true;
+    WriteBatchIndexEntry search_entry(&key, column_family_id_);
+    skip_list_iter_.Seek(&search_entry);
+    ReadEntry();
+  }
+
+  virtual void Next() override {
+    skip_list_iter_.Next();
+    ReadEntry();
+  }
+
+  virtual void Prev() override {
+    skip_list_iter_.Prev();
+    ReadEntry();
+  }
+
+  virtual const WriteEntry& Entry() const override { return current_; }
+
+  virtual Status status() const override { return status_; }
+
+  const WriteBatchIndexEntry* GetRawEntry() const {
+    return skip_list_iter_.key();
+  }
+
+ private:
+  uint32_t column_family_id_;
+  WriteBatchEntrySkipList::Iterator skip_list_iter_;
+  const ReadableWriteBatch* write_batch_;
+  Status status_;
+  bool valid_;
+  WriteEntry current_;
+
+  void ReadEntry() {
+    if (!status_.ok() || !skip_list_iter_.Valid()) {
+      valid_ = false;
+      return;
+    }
+    const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+    if (iter_entry == nullptr ||
+        iter_entry->column_family != column_family_id_) {
+      valid_ = false;
+      return;
+    }
+    Slice blob;
+    status_ = write_batch_->GetEntryFromDataOffset(
+        iter_entry->offset, &current_.type, &current_.key, &current_.value,
+        &blob);
+    if (!status_.ok()) {
+      valid_ = false;
+    } else if (current_.type != kPutRecord && current_.type != kDeleteRecord &&
+               current_.type != kMergeRecord) {
+      valid_ = false;
+      status_ = Status::Corruption("write batch index is corrupted");
+    }
+  }
+};
+
// Private implementation state of WriteBatchWithIndex: the underlying write
// batch plus a skip-list index over its records, backed by an arena.
struct WriteBatchWithIndex::Rep {
  Rep(const Comparator* index_comparator, size_t reserved_bytes = 0,
      bool _overwrite_key = false)
      : write_batch(reserved_bytes),
        comparator(index_comparator, &write_batch),
        skip_list(comparator, &arena),
        overwrite_key(_overwrite_key),
        last_entry_offset(0) {}
  // Serialized updates; also readable so index entries can be decoded.
  ReadableWriteBatch write_batch;
  // Orders index entries by (column family, user key, offset).
  WriteBatchEntryComparator comparator;
  // Backing storage for skip-list index nodes; reset wholesale in Clear().
  Arena arena;
  WriteBatchEntrySkipList skip_list;
  // When true, a new write for an existing key replaces its index entry
  // instead of adding a second one.
  bool overwrite_key;
  // Offset of the most recently appended record in write_batch.
  size_t last_entry_offset;

  // Remember current offset of internal write batch, which is used as
  // the starting offset of the next record.
  void SetLastEntryOffset() { last_entry_offset = write_batch.GetDataSize(); }

  // In overwrite mode, find the existing entry for the same key and update it
  // to point to the current entry.
  // Return true if the key is found and updated.
  bool UpdateExistingEntry(ColumnFamilyHandle* column_family, const Slice& key);
  bool UpdateExistingEntryWithCfId(uint32_t column_family_id, const Slice& key);

  // Add the recent entry to the update.
  // In overwrite mode, if key already exists in the index, update it.
  void AddOrUpdateIndex(ColumnFamilyHandle* column_family, const Slice& key);
  void AddOrUpdateIndex(const Slice& key);

  // Allocate an index entry pointing to the last entry in the write batch and
  // put it to skip list.
  void AddNewEntry(uint32_t column_family_id);

  // Clear all updates buffered in this batch.
  void Clear();
};
+
+bool WriteBatchWithIndex::Rep::UpdateExistingEntry(
+    ColumnFamilyHandle* column_family, const Slice& key) {
+  uint32_t cf_id = GetColumnFamilyID(column_family);
+  return UpdateExistingEntryWithCfId(cf_id, key);
+}
+
+bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId(
+    uint32_t column_family_id, const Slice& key) {
+  if (!overwrite_key) {
+    return false;
+  }
+
+  WBWIIteratorImpl iter(column_family_id, &skip_list, &write_batch);
+  iter.Seek(key);
+  if (!iter.Valid()) {
+    return false;
+  }
+  if (comparator.CompareKey(column_family_id, key, iter.Entry().key) != 0) {
+    return false;
+  }
+  WriteBatchIndexEntry* non_const_entry =
+      const_cast<WriteBatchIndexEntry*>(iter.GetRawEntry());
+  non_const_entry->offset = last_entry_offset;
+  return true;
+}
+
+void WriteBatchWithIndex::Rep::AddOrUpdateIndex(
+    ColumnFamilyHandle* column_family, const Slice& key) {
+  if (!UpdateExistingEntry(column_family, key)) {
+    uint32_t cf_id = GetColumnFamilyID(column_family);
+    const auto* cf_cmp = GetColumnFamilyUserComparator(column_family);
+    if (cf_cmp != nullptr) {
+      comparator.SetComparatorForCF(cf_id, cf_cmp);
+    }
+    AddNewEntry(cf_id);
+  }
+}
+
+void WriteBatchWithIndex::Rep::AddOrUpdateIndex(const Slice& key) {
+  if (!UpdateExistingEntryWithCfId(0, key)) {
+    AddNewEntry(0);
+  }
+}
+
+void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
+    auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry));
+    auto* index_entry =
+        new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id);
+    skip_list.Insert(index_entry);
+  }
+
  // Reset the batch to empty.  The skip list's nodes live in the arena, so
  // both are destroyed and rebuilt in place via explicit destructor calls
  // plus placement new.
  void WriteBatchWithIndex::Rep::Clear() {
    write_batch.Clear();
    // NOTE(review): the arena is torn down before skip_list's destructor
    // runs; this is safe only if the skip list destructor never touches its
    // (arena-allocated) nodes — confirm against the SkipList implementation.
    arena.~Arena();
    new (&arena) Arena();
    skip_list.~WriteBatchEntrySkipList();
    new (&skip_list) WriteBatchEntrySkipList(comparator, &arena);
    last_entry_offset = 0;
  }
+
+
// Construct an indexed write batch.  `reserved_bytes` pre-sizes the batch
// buffer; `overwrite_key` selects one-index-entry-per-key semantics.
WriteBatchWithIndex::WriteBatchWithIndex(
    const Comparator* default_index_comparator, size_t reserved_bytes,
    bool overwrite_key)
    : rep(new Rep(default_index_comparator, reserved_bytes, overwrite_key)) {}
+
// Destroy the private Rep (write batch, index skip list, and arena).
WriteBatchWithIndex::~WriteBatchWithIndex() { delete rep; }
+
// Expose the underlying WriteBatch (still owned by this object).
WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; }
+
// Heap-allocated iterator over the default column family's entries in this
// batch.  Caller owns the returned iterator.
WBWIIterator* WriteBatchWithIndex::NewIterator() {
  return new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch);
}
+
// Heap-allocated iterator over `column_family`'s entries in this batch.
// Caller owns the returned iterator.
WBWIIterator* WriteBatchWithIndex::NewIterator(
    ColumnFamilyHandle* column_family) {
  return new WBWIIteratorImpl(GetColumnFamilyID(column_family),
                              &(rep->skip_list), &rep->write_batch);
}
+
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(
+    ColumnFamilyHandle* column_family, Iterator* base_iterator) {
+  if (rep->overwrite_key == false) {
+    assert(false);
+    return nullptr;
+  }
+  return new BaseDeltaIterator(base_iterator, NewIterator(column_family),
+                               GetColumnFamilyUserComparator(column_family));
+}
+
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) {
+  if (rep->overwrite_key == false) {
+    assert(false);
+    return nullptr;
+  }
+  // default column family's comparator
+  return new BaseDeltaIterator(base_iterator, NewIterator(),
+                               rep->comparator.default_comparator());
+}
+
// Append a Put to the batch and index it.  The offset is captured first so
// the index entry can point at the record about to be written.
void WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& value) {
  rep->SetLastEntryOffset();
  rep->write_batch.Put(column_family, key, value);
  rep->AddOrUpdateIndex(column_family, key);
}
+
// Append a Put (default column family) to the batch and index it.
void WriteBatchWithIndex::Put(const Slice& key, const Slice& value) {
  rep->SetLastEntryOffset();
  rep->write_batch.Put(key, value);
  rep->AddOrUpdateIndex(key);
}
+
// Append a Merge to the batch and index it.
void WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family,
                                const Slice& key, const Slice& value) {
  rep->SetLastEntryOffset();
  rep->write_batch.Merge(column_family, key, value);
  rep->AddOrUpdateIndex(column_family, key);
}
+
// Append a Merge (default column family) to the batch and index it.
void WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) {
  rep->SetLastEntryOffset();
  rep->write_batch.Merge(key, value);
  rep->AddOrUpdateIndex(key);
}
+
// Append a log-data blob.  Deliberately not indexed: log data has no key and
// is never returned by lookups or iterators.
void WriteBatchWithIndex::PutLogData(const Slice& blob) {
  rep->write_batch.PutLogData(blob);
}
+
// Append a Delete to the batch and index it.
void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family,
                                 const Slice& key) {
  rep->SetLastEntryOffset();
  rep->write_batch.Delete(column_family, key);
  rep->AddOrUpdateIndex(column_family, key);
}
+
// Append a Delete (default column family) to the batch and index it.
void WriteBatchWithIndex::Delete(const Slice& key) {
  rep->SetLastEntryOffset();
  rep->write_batch.Delete(key);
  rep->AddOrUpdateIndex(key);
}
+
// Discard all buffered updates and their index entries.
void WriteBatchWithIndex::Clear() { rep->Clear(); }
+
+Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family,
+                                         const DBOptions& options,
+                                         const Slice& key, std::string* value) {
+  Status s;
+  MergeContext merge_context;
+
+  WriteBatchWithIndexInternal::Result result =
+      WriteBatchWithIndexInternal::GetFromBatch(options, this, column_family,
+                                                key, &merge_context,
+                                                &rep->comparator, value, &s);
+
+  switch (result) {
+    case WriteBatchWithIndexInternal::Result::kFound:
+    case WriteBatchWithIndexInternal::Result::kError:
+      return s;
+    case WriteBatchWithIndexInternal::Result::kDeleted:
+    case WriteBatchWithIndexInternal::Result::kNotFound:
+      return Status::NotFound();
+    case WriteBatchWithIndexInternal::Result::kMergeInProgress:
+      return Status::MergeInProgress("");
+    default:
+      assert(false);
+  }
+
+  return s;
+}
+
// Convenience overload: look up `key` in the batch and then the DB, using
// the DB's default column family.
Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
                                              const ReadOptions& read_options,
                                              const Slice& key,
                                              std::string* value) {
  return GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key,
                           value);
}
+
// Look `key` up first in this batch, then (if the batch does not settle the
// answer) in the DB, merging batch Merge operands on top of the DB value
// when necessary.
Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
                                              const ReadOptions& read_options,
                                              ColumnFamilyHandle* column_family,
                                              const Slice& key,
                                              std::string* value) {
  Status s;
  MergeContext merge_context;
  const DBOptions& options = db->GetDBOptions();

  // Lookup into a scratch string: a batch miss must not clobber *value
  // before the DB lookup below.
  std::string batch_value;
  WriteBatchWithIndexInternal::Result result =
      WriteBatchWithIndexInternal::GetFromBatch(
          options, this, column_family, key, &merge_context, &rep->comparator,
          &batch_value, &s);

  if (result == WriteBatchWithIndexInternal::Result::kFound) {
    value->assign(batch_value.data(), batch_value.size());
    return s;
  }
  if (result == WriteBatchWithIndexInternal::Result::kDeleted) {
    // A batch Delete hides any DB value.
    return Status::NotFound();
  }
  if (result == WriteBatchWithIndexInternal::Result::kError) {
    return s;
  }
  assert(result == WriteBatchWithIndexInternal::Result::kMergeInProgress ||
         result == WriteBatchWithIndexInternal::Result::kNotFound);

  // Did not find key in batch OR could not resolve Merges.  Try DB.
  s = db->Get(read_options, column_family, key, value);

  if (s.ok() || s.IsNotFound()) {  // DB Get succeeded
    if (result == WriteBatchWithIndexInternal::Result::kMergeInProgress) {
      // Merge result from DB with merges in Batch
      // NOTE(review): assumes column_family is a ColumnFamilyHandleImpl
      // (true for handles produced by this DB implementation) — confirm for
      // any wrapper handles.
      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
      const MergeOperator* merge_operator =
          cfh->cfd()->ioptions()->merge_operator;
      Statistics* statistics = options.statistics.get();
      Env* env = options.env;
      Logger* logger = options.info_log.get();

      Slice db_slice(*value);
      Slice* merge_data;
      if (s.ok()) {
        merge_data = &db_slice;
      } else {  // Key not present in db (s.IsNotFound())
        merge_data = nullptr;
      }

      // TimedFullMerge writes its result into *value (db_slice still views
      // the pre-merge contents it read above).
      s = MergeHelper::TimedFullMerge(
          key, merge_data, merge_context.GetOperands(), merge_operator,
          statistics, env, logger, value);
    }
  }

  return s;
}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
new file mode 100644
index 0000000..b9cf644
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
@@ -0,0 +1,242 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/column_family.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/coding.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace rocksdb {
+
+class Env;
+class Logger;
+class Statistics;
+
+Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
+                                                  WriteType* type, Slice* Key,
+                                                  Slice* value,
+                                                  Slice* blob) const {
+  if (type == nullptr || Key == nullptr || value == nullptr ||
+      blob == nullptr) {
+    return Status::InvalidArgument("Output parameters cannot be null");
+  }
+
+  if (data_offset >= GetDataSize()) {
+    return Status::InvalidArgument("data offset exceed write batch size");
+  }
+  Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset);
+  char tag;
+  uint32_t column_family;
+  Status s =
+      ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value, blob);
+
+  switch (tag) {
+    case kTypeColumnFamilyValue:
+    case kTypeValue:
+      *type = kPutRecord;
+      break;
+    case kTypeColumnFamilyDeletion:
+    case kTypeDeletion:
+      *type = kDeleteRecord;
+      break;
+    case kTypeColumnFamilyMerge:
+    case kTypeMerge:
+      *type = kMergeRecord;
+      break;
+    case kTypeLogData:
+      *type = kLogDataRecord;
+      break;
+    default:
+      return Status::Corruption("unknown WriteBatch tag");
+  }
+  return Status::OK();
+}
+
// Total order for skip-list index entries: primarily by column family id,
// then by user key (per-CF comparator), and finally by offset within the
// write batch so equal keys keep insertion order.
int WriteBatchEntryComparator::operator()(
    const WriteBatchIndexEntry* entry1,
    const WriteBatchIndexEntry* entry2) const {
  if (entry1->column_family > entry2->column_family) {
    return 1;
  } else if (entry1->column_family < entry2->column_family) {
    return -1;
  }

  // kFlagMin is a sentinel offset that sorts before every real entry of the
  // same column family (used by SeekToFirst/SeekToLast).
  if (entry1->offset == WriteBatchIndexEntry::kFlagMin) {
    return -1;
  } else if (entry2->offset == WriteBatchIndexEntry::kFlagMin) {
    return 1;
  }

  Status s;
  Slice key1, key2;
  if (entry1->search_key == nullptr) {
    // Key must be decoded from the serialized write batch record.
    Slice value, blob;
    WriteType write_type;
    s = write_batch_->GetEntryFromDataOffset(entry1->offset, &write_type, &key1,
                                             &value, &blob);
    if (!s.ok()) {
      // Undecodable entry: arbitrarily order it as greater.
      return 1;
    }
  } else {
    // Probe entry carrying the lookup key directly (no batch record).
    key1 = *(entry1->search_key);
  }
  if (entry2->search_key == nullptr) {
    Slice value, blob;
    WriteType write_type;
    s = write_batch_->GetEntryFromDataOffset(entry2->offset, &write_type, &key2,
                                             &value, &blob);
    if (!s.ok()) {
      return -1;
    }
  } else {
    key2 = *(entry2->search_key);
  }

  int cmp = CompareKey(entry1->column_family, key1, key2);
  if (cmp != 0) {
    return cmp;
  } else if (entry1->offset > entry2->offset) {
    // Equal keys: earlier batch records sort first (insertion order).
    return 1;
  } else if (entry1->offset < entry2->offset) {
    return -1;
  }
  return 0;
}
+
+int WriteBatchEntryComparator::CompareKey(uint32_t column_family,
+                                          const Slice& key1,
+                                          const Slice& key2) const {
+  auto comparator_for_cf = cf_comparator_map_.find(column_family);
+  if (comparator_for_cf != cf_comparator_map_.end()) {
+    return comparator_for_cf->second->Compare(key1, key2);
+  } else {
+    return default_comparator_->Compare(key1, key2);
+  }
+}
+
// Resolve `key` using only the contents of `batch`.  Scans the batch's
// entries for the key from newest to oldest, collecting Merge operands until
// a Put/Delete settles the result, then applies the merge operator if any
// operands were gathered.  See the Result enum for the possible verdicts;
// on kError the details are stored in *s.
WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
    const DBOptions& options, WriteBatchWithIndex* batch,
    ColumnFamilyHandle* column_family, const Slice& key,
    MergeContext* merge_context, WriteBatchEntryComparator* cmp,
    std::string* value, Status* s) {
  uint32_t cf_id = GetColumnFamilyID(column_family);
  *s = Status::OK();
  WriteBatchWithIndexInternal::Result result =
      WriteBatchWithIndexInternal::Result::kNotFound;

  std::unique_ptr<WBWIIterator> iter =
      std::unique_ptr<WBWIIterator>(batch->NewIterator(column_family));

  // We want to iterate in the reverse order that the writes were added to the
  // batch.  Since we don't have a reverse iterator, we must seek past the end.
  // TODO(agiardullo): consider adding support for reverse iteration
  iter->Seek(key);
  while (iter->Valid()) {
    const WriteEntry& entry = iter->Entry();
    if (cmp->CompareKey(cf_id, entry.key, key) != 0) {
      // First entry with a different key: we are one past the last match.
      break;
    }

    iter->Next();
  }

  if (!(*s).ok()) {
    return WriteBatchWithIndexInternal::Result::kError;
  }

  if (!iter->Valid()) {
    // Read past end of results.  Reposition on last result.
    iter->SeekToLast();
  } else {
    iter->Prev();
  }

  // Walk matches newest-to-oldest, stacking Merge operands until a Put or
  // Delete (or an error) terminates the scan.
  const Slice* entry_value = nullptr;
  while (iter->Valid()) {
    const WriteEntry& entry = iter->Entry();
    if (cmp->CompareKey(cf_id, entry.key, key) != 0) {
      // Unexpected error or we've reached a different next key
      break;
    }

    switch (entry.type) {
      case kPutRecord: {
        result = WriteBatchWithIndexInternal::Result::kFound;
        entry_value = &entry.value;
        break;
      }
      case kMergeRecord: {
        result = WriteBatchWithIndexInternal::Result::kMergeInProgress;
        merge_context->PushOperand(entry.value);
        break;
      }
      case kDeleteRecord: {
        result = WriteBatchWithIndexInternal::Result::kDeleted;
        break;
      }
      case kLogDataRecord: {
        // ignore
        break;
      }
      default: {
        result = WriteBatchWithIndexInternal::Result::kError;
        (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:",
                                  std::to_string(entry.type));
        break;
      }
    }
    if (result == WriteBatchWithIndexInternal::Result::kFound ||
        result == WriteBatchWithIndexInternal::Result::kDeleted ||
        result == WriteBatchWithIndexInternal::Result::kError) {
      // We can stop iterating once we find a PUT or DELETE
      break;
    }

    iter->Prev();
  }

  if ((*s).ok()) {
    if (result == WriteBatchWithIndexInternal::Result::kFound ||
        result == WriteBatchWithIndexInternal::Result::kDeleted) {
      // Found a Put or Delete.  Merge if necessary.
      if (merge_context->GetNumOperands() > 0) {
        const MergeOperator* merge_operator;

        if (column_family != nullptr) {
          auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
          merge_operator = cfh->cfd()->ioptions()->merge_operator;
        } else {
          *s = Status::InvalidArgument("Must provide a column_family");
          result = WriteBatchWithIndexInternal::Result::kError;
          return result;
        }
        Statistics* statistics = options.statistics.get();
        Env* env = options.env;
        Logger* logger = options.info_log.get();

        // For a Delete, entry_value is nullptr and the merge runs with no
        // existing value.
        *s = MergeHelper::TimedFullMerge(
            key, entry_value, merge_context->GetOperands(), merge_operator,
            statistics, env, logger, value);
        if ((*s).ok()) {
          result = WriteBatchWithIndexInternal::Result::kFound;
        } else {
          result = WriteBatchWithIndexInternal::Result::kError;
        }
      } else {  // nothing to merge
        if (result == WriteBatchWithIndexInternal::Result::kFound) {  // PUT
          value->assign(entry_value->data(), entry_value->size());
        }
      }
    }
  }

  return result;
}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
new file mode 100644
index 0000000..a98ddd6
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+namespace rocksdb {
+
+class MergeContext;
+struct Options;
+
+// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
struct WriteBatchIndexEntry {
  // Entry backed by a record at `o` in the write batch, for column family `c`.
  WriteBatchIndexEntry(size_t o, uint32_t c)
      : offset(o), column_family(c), search_key(nullptr) {}
  // Probe entry used for lookups: carries the key directly instead of
  // referencing a batch record.
  WriteBatchIndexEntry(const Slice* sk, uint32_t c)
      : offset(0), column_family(c), search_key(sk) {}

  // If this flag appears in the offset, it indicates a key that is smaller
  // than any other entry for the same column family
  static const size_t kFlagMin = std::numeric_limits<size_t>::max();

  size_t offset;           // offset of an entry in write batch's string buffer.
  uint32_t column_family;  // column family of the entry
  const Slice* search_key;  // if not null, instead of reading keys from
                            // write batch, use it to compare. This is used
                            // for lookup key.
};
+
// WriteBatch extended with the ability to decode an individual record given
// its byte offset, which is what the skip-list index stores.
class ReadableWriteBatch : public WriteBatch {
 public:
  explicit ReadableWriteBatch(size_t reserved_bytes = 0)
      : WriteBatch(reserved_bytes) {}
  // Retrieve some information from a write entry in the write batch, given
  // the start offset of the write entry.
  Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
                                Slice* value, Slice* blob) const;
};
+
// Comparator for WriteBatchIndexEntry pointers used by the index skip list.
// Orders by column family, then user key (per-CF comparator when registered),
// then offset; decodes keys from the write batch when needed.
class WriteBatchEntryComparator {
 public:
  WriteBatchEntryComparator(const Comparator* _default_comparator,
                            const ReadableWriteBatch* write_batch)
      : default_comparator_(_default_comparator), write_batch_(write_batch) {}
  // Compare a and b. Return a negative value if a is less than b, 0 if they
  // are equal, and a positive value if a is greater than b
  int operator()(const WriteBatchIndexEntry* entry1,
                 const WriteBatchIndexEntry* entry2) const;

  // Compare two user keys under the given column family's comparator
  // (default comparator if none was registered for it).
  int CompareKey(uint32_t column_family, const Slice& key1,
                 const Slice& key2) const;

  // Register (or replace) the comparator used for a column family's keys.
  void SetComparatorForCF(uint32_t column_family_id,
                          const Comparator* comparator) {
    cf_comparator_map_[column_family_id] = comparator;
  }

  const Comparator* default_comparator() { return default_comparator_; }

 private:
  const Comparator* default_comparator_;   // not owned
  std::unordered_map<uint32_t, const Comparator*> cf_comparator_map_;
  const ReadableWriteBatch* write_batch_;  // source for decoding entry keys
};
+
// Internal helper shared by the batch-only and batch-plus-DB lookup paths.
class WriteBatchWithIndexInternal {
 public:
  // Outcome of a batch-only lookup; see GetFromBatch below.
  enum Result { kFound, kDeleted, kNotFound, kMergeInProgress, kError };

  // If batch contains a value for key, store it in *value and return kFound.
  // If batch contains a deletion for key, return kDeleted.
  // If batch contains Merge operations as the most recent entry for a key,
  //   and the merge process does not stop (not reaching a value or delete),
  //   prepend the current merge operands to *operands,
  //   and return kMergeInProgress
  // If batch does not contain this key, return kNotFound
  // Else, return kError on error with error Status stored in *s.
  static WriteBatchWithIndexInternal::Result GetFromBatch(
      const DBOptions& options, WriteBatchWithIndex* batch,
      ColumnFamilyHandle* column_family, const Slice& key,
      MergeContext* merge_context, WriteBatchEntryComparator* cmp,
      std::string* value, Status* s);
};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
new file mode 100644
index 0000000..5e9ff77
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -0,0 +1,1190 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+
+#include <memory>
+#include <map>
+#include "db/column_family.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/testharness.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+namespace rocksdb {
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+  explicit ColumnFamilyHandleImplDummy(int id, const Comparator* comparator)
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+        id_(id),
+        comparator_(comparator) {}
+  uint32_t GetID() const override { return id_; }
+  const Comparator* user_comparator() const override { return comparator_; }
+
+ private:
+  uint32_t id_;
+  const Comparator* comparator_;
+};
+
+struct Entry {
+  std::string key;
+  std::string value;
+  WriteType type;
+};
+
+struct TestHandler : public WriteBatch::Handler {
+  std::map<uint32_t, std::vector<Entry>> seen;
+  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                       const Slice& value) {
+    Entry e;
+    e.key = key.ToString();
+    e.value = value.ToString();
+    e.type = kPutRecord;
+    seen[column_family_id].push_back(e);
+    return Status::OK();
+  }
+  virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+    Entry e;
+    e.key = key.ToString();
+    e.value = value.ToString();
+    e.type = kMergeRecord;
+    seen[column_family_id].push_back(e);
+    return Status::OK();
+  }
+  virtual void LogData(const Slice& blob) {}
+  virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+    Entry e;
+    e.key = key.ToString();
+    e.value = "";
+    e.type = kDeleteRecord;
+    seen[column_family_id].push_back(e);
+    return Status::OK();
+  }
+};
+}  // namespace
+
+class WriteBatchWithIndexTest : public testing::Test {};
+
+void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
+                                     WriteBatchWithIndex* batch) {
+  // In this test, we insert <key, value> to column family `data`, and
+  // <value, key> to column family `index`. Then iterate over them in
+  // order and seek them by key.
+
+  // Sort entries by key
+  std::map<std::string, std::vector<Entry*>> data_map;
+  // Sort entries by value
+  std::map<std::string, std::vector<Entry*>> index_map;
+  for (auto& e : entries) {
+    data_map[e.key].push_back(&e);
+    index_map[e.value].push_back(&e);
+  }
+
+  ColumnFamilyHandleImplDummy data(6, BytewiseComparator());
+  ColumnFamilyHandleImplDummy index(8, BytewiseComparator());
+  for (auto& e : entries) {
+    if (e.type == kPutRecord) {
+      batch->Put(&data, e.key, e.value);
+      batch->Put(&index, e.value, e.key);
+    } else if (e.type == kMergeRecord) {
+      batch->Merge(&data, e.key, e.value);
+      batch->Put(&index, e.value, e.key);
+    } else {
+      assert(e.type == kDeleteRecord);
+      std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+      iter->Seek(e.key);
+      ASSERT_OK(iter->status());
+      auto& write_entry = iter->Entry();
+      ASSERT_EQ(e.key, write_entry.key.ToString());
+      ASSERT_EQ(e.value, write_entry.value.ToString());
+      batch->Delete(&data, e.key);
+      batch->Put(&index, e.value, "");
+    }
+  }
+
+  // Iterate over all keys
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+    for (int seek_to_first : {0, 1}) {
+      if (seek_to_first) {
+        iter->SeekToFirst();
+      } else {
+        iter->Seek("");
+      }
+      for (auto pair : data_map) {
+        for (auto v : pair.second) {
+          ASSERT_OK(iter->status());
+          ASSERT_TRUE(iter->Valid());
+          auto& write_entry = iter->Entry();
+          ASSERT_EQ(pair.first, write_entry.key.ToString());
+          ASSERT_EQ(v->type, write_entry.type);
+          if (write_entry.type != kDeleteRecord) {
+            ASSERT_EQ(v->value, write_entry.value.ToString());
+          }
+          iter->Next();
+        }
+      }
+      ASSERT_TRUE(!iter->Valid());
+    }
+    iter->SeekToLast();
+    for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) {
+      for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
+        ASSERT_OK(iter->status());
+        ASSERT_TRUE(iter->Valid());
+        auto& write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        ASSERT_EQ((*v)->type, write_entry.type);
+        if (write_entry.type != kDeleteRecord) {
+          ASSERT_EQ((*v)->value, write_entry.value.ToString());
+        }
+        iter->Prev();
+      }
+    }
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // Iterate over all indexes
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&index));
+    for (int seek_to_first : {0, 1}) {
+      if (seek_to_first) {
+        iter->SeekToFirst();
+      } else {
+        iter->Seek("");
+      }
+      for (auto pair : index_map) {
+        for (auto v : pair.second) {
+          ASSERT_OK(iter->status());
+          ASSERT_TRUE(iter->Valid());
+          auto& write_entry = iter->Entry();
+          ASSERT_EQ(pair.first, write_entry.key.ToString());
+          if (v->type != kDeleteRecord) {
+            ASSERT_EQ(v->key, write_entry.value.ToString());
+            ASSERT_EQ(v->value, write_entry.key.ToString());
+          }
+          iter->Next();
+        }
+      }
+      ASSERT_TRUE(!iter->Valid());
+    }
+
+    iter->SeekToLast();
+    for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) {
+      for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
+        ASSERT_OK(iter->status());
+        ASSERT_TRUE(iter->Valid());
+        auto& write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        if ((*v)->type != kDeleteRecord) {
+          ASSERT_EQ((*v)->key, write_entry.value.ToString());
+          ASSERT_EQ((*v)->value, write_entry.key.ToString());
+        }
+        iter->Prev();
+      }
+    }
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // Seek to every key
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+
+    // Seek the keys one by one in reverse order
+    for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) {
+      iter->Seek(pair->first);
+      ASSERT_OK(iter->status());
+      for (auto v : pair->second) {
+        ASSERT_TRUE(iter->Valid());
+        auto& write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        ASSERT_EQ(v->type, write_entry.type);
+        if (write_entry.type != kDeleteRecord) {
+          ASSERT_EQ(v->value, write_entry.value.ToString());
+        }
+        iter->Next();
+        ASSERT_OK(iter->status());
+      }
+    }
+  }
+
+  // Seek to every index
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&index));
+
+    // Seek the keys one by one in reverse order
+    for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) {
+      iter->Seek(pair->first);
+      ASSERT_OK(iter->status());
+      for (auto v : pair->second) {
+        ASSERT_TRUE(iter->Valid());
+        auto& write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        ASSERT_EQ(v->value, write_entry.key.ToString());
+        if (v->type != kDeleteRecord) {
+          ASSERT_EQ(v->key, write_entry.value.ToString());
+        }
+        iter->Next();
+        ASSERT_OK(iter->status());
+      }
+    }
+  }
+
+  // Verify WriteBatch can be iterated
+  TestHandler handler;
+  batch->GetWriteBatch()->Iterate(&handler);
+
+  // Verify data column family
+  {
+    ASSERT_EQ(entries.size(), handler.seen[data.GetID()].size());
+    size_t i = 0;
+    for (auto e : handler.seen[data.GetID()]) {
+      auto write_entry = entries[i++];
+      ASSERT_EQ(e.type, write_entry.type);
+      ASSERT_EQ(e.key, write_entry.key);
+      if (e.type != kDeleteRecord) {
+        ASSERT_EQ(e.value, write_entry.value);
+      }
+    }
+  }
+
+  // Verify index column family
+  {
+    ASSERT_EQ(entries.size(), handler.seen[index.GetID()].size());
+    size_t i = 0;
+    for (auto e : handler.seen[index.GetID()]) {
+      auto write_entry = entries[i++];
+      ASSERT_EQ(e.key, write_entry.value);
+      if (write_entry.type != kDeleteRecord) {
+        ASSERT_EQ(e.value, write_entry.key);
+      }
+    }
+  }
+}
+
+TEST_F(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) {
+  Entry entries[] = {
+      {"aaa", "0005", kPutRecord},
+      {"b", "0002", kPutRecord},
+      {"cdd", "0002", kMergeRecord},
+      {"aab", "00001", kPutRecord},
+      {"cc", "00005", kPutRecord},
+      {"cdd", "0002", kPutRecord},
+      {"aab", "0003", kPutRecord},
+      {"cc", "00005", kDeleteRecord},
+  };
+  std::vector<Entry> entries_list(entries, entries + 8);
+
+  WriteBatchWithIndex batch(nullptr, 20);
+
+  TestValueAsSecondaryIndexHelper(entries_list, &batch);
+
+  // Clear batch and re-run test with new values
+  batch.Clear();
+
+  Entry new_entries[] = {
+      {"aaa", "0005", kPutRecord},
+      {"e", "0002", kPutRecord},
+      {"add", "0002", kMergeRecord},
+      {"aab", "00001", kPutRecord},
+      {"zz", "00005", kPutRecord},
+      {"add", "0002", kPutRecord},
+      {"aab", "0003", kPutRecord},
+      {"zz", "00005", kDeleteRecord},
+  };
+
+  entries_list = std::vector<Entry>(new_entries, new_entries + 8);
+
+  TestValueAsSecondaryIndexHelper(entries_list, &batch);
+}
+
+TEST_F(WriteBatchWithIndexTest, TestComparatorForCF) {
+  ColumnFamilyHandleImplDummy cf1(6, nullptr);
+  ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator());
+  WriteBatchWithIndex batch(BytewiseComparator(), 20);
+
+  batch.Put(&cf1, "ddd", "");
+  batch.Put(&cf2, "aaa", "");
+  batch.Put(&cf2, "eee", "");
+  batch.Put(&cf1, "ccc", "");
+  batch.Put(&reverse_cf, "a11", "");
+  batch.Put(&cf1, "bbb", "");
+
+  Slice key_slices[] = {"a", "3", "3"};
+  Slice value_slice = "";
+  batch.Put(&reverse_cf, SliceParts(key_slices, 3),
+            SliceParts(&value_slice, 1));
+  batch.Put(&reverse_cf, "a22", "");
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch.NewIterator(&cf1));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbb", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ccc", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ddd", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch.NewIterator(&cf2));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("aaa", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("eee", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch.NewIterator(&reverse_cf));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("z");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a33", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a22", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("a22");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a22", iter->Entry().key.ToString());
+
+    iter->Seek("a13");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+  }
+}
+
+TEST_F(WriteBatchWithIndexTest, TestOverwriteKey) {
+  ColumnFamilyHandleImplDummy cf1(6, nullptr);
+  ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator());
+  WriteBatchWithIndex batch(BytewiseComparator(), 20, true);
+
+  batch.Put(&cf1, "ddd", "");
+  batch.Merge(&cf1, "ddd", "");
+  batch.Delete(&cf1, "ddd");
+  batch.Put(&cf2, "aaa", "");
+  batch.Delete(&cf2, "aaa");
+  batch.Put(&cf2, "aaa", "aaa");
+  batch.Put(&cf2, "eee", "eee");
+  batch.Put(&cf1, "ccc", "");
+  batch.Put(&reverse_cf, "a11", "");
+  batch.Delete(&cf1, "ccc");
+  batch.Put(&reverse_cf, "a33", "a33");
+  batch.Put(&reverse_cf, "a11", "a11");
+  Slice slices[] = {"a", "3", "3"};
+  batch.Delete(&reverse_cf, SliceParts(slices, 3));
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch.NewIterator(&cf1));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ccc", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ddd", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch.NewIterator(&cf2));
+    iter->SeekToLast();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("eee", iter->Entry().key.ToString());
+    ASSERT_EQ("eee", iter->Entry().value.ToString());
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("aaa", iter->Entry().key.ToString());
+    ASSERT_EQ("aaa", iter->Entry().value.ToString());
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("aaa", iter->Entry().key.ToString());
+    ASSERT_EQ("aaa", iter->Entry().value.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("eee", iter->Entry().key.ToString());
+    ASSERT_EQ("eee", iter->Entry().value.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch.NewIterator(&reverse_cf));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("z");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a33", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+    ASSERT_EQ("a11", iter->Entry().value.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+    ASSERT_EQ("a11", iter->Entry().value.ToString());
+    iter->Prev();
+
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a33", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Prev();
+    ASSERT_TRUE(!iter->Valid());
+  }
+}
+
+namespace {
+typedef std::map<std::string, std::string> KVMap;
+
+class KVIter : public Iterator {
+ public:
+  explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {}
+  virtual bool Valid() const { return iter_ != map_->end(); }
+  virtual void SeekToFirst() { iter_ = map_->begin(); }
+  virtual void SeekToLast() {
+    if (map_->empty()) {
+      iter_ = map_->end();
+    } else {
+      iter_ = map_->find(map_->rbegin()->first);
+    }
+  }
+  virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); }
+  virtual void Next() { ++iter_; }
+  virtual void Prev() {
+    if (iter_ == map_->begin()) {
+      iter_ = map_->end();
+      return;
+    }
+    --iter_;
+  }
+
+  virtual Slice key() const { return iter_->first; }
+  virtual Slice value() const { return iter_->second; }
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  const KVMap* const map_;
+  KVMap::const_iterator iter_;
+};
+
+void AssertIter(Iterator* iter, const std::string& key,
+                const std::string& value) {
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(key, iter->key().ToString());
+  ASSERT_EQ(value, iter->value().ToString());
+}
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+  ASSERT_EQ(iter1->Valid(), iter2->Valid());
+  if (iter1->Valid()) {
+    ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+    ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+  }
+}
+}  // namespace
+
+TEST_F(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
+  std::vector<std::string> source_strings = {"a", "b", "c", "d", "e",
+                                             "f", "g", "h", "i", "j"};
+  for (int rand_seed = 301; rand_seed < 366; rand_seed++) {
+    Random rnd(rand_seed);
+
+    ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+    ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+    ColumnFamilyHandleImplDummy cf3(8, BytewiseComparator());
+
+    WriteBatchWithIndex batch(BytewiseComparator(), 20, true);
+
+    if (rand_seed % 2 == 0) {
+      batch.Put(&cf2, "zoo", "bar");
+    }
+    if (rand_seed % 4 == 1) {
+      batch.Put(&cf3, "zoo", "bar");
+    }
+
+    KVMap map;
+    KVMap merged_map;
+    for (auto key : source_strings) {
+      std::string value = key + key;
+      int type = rnd.Uniform(6);
+      switch (type) {
+        case 0:
+          // only base has it
+          map[key] = value;
+          merged_map[key] = value;
+          break;
+        case 1:
+          // only delta has it
+          batch.Put(&cf1, key, value);
+          map[key] = value;
+          merged_map[key] = value;
+          break;
+        case 2:
+          // both has it. Delta should win
+          batch.Put(&cf1, key, value);
+          map[key] = "wrong_value";
+          merged_map[key] = value;
+          break;
+        case 3:
+          // both has it. Delta is delete
+          batch.Delete(&cf1, key);
+          map[key] = "wrong_value";
+          break;
+        case 4:
+          // only delta has it. Delta is delete
+          batch.Delete(&cf1, key);
+          map[key] = "wrong_value";
+          break;
+        default:
+          // Neither iterator has it.
+          break;
+      }
+    }
+
+    std::unique_ptr<Iterator> iter(
+        batch.NewIteratorWithBase(&cf1, new KVIter(&map)));
+    std::unique_ptr<Iterator> result_iter(new KVIter(&merged_map));
+
+    bool is_valid = false;
+    for (int i = 0; i < 128; i++) {
+      // Random walk and make sure iter and result_iter returns the
+      // same key and value
+      int type = rnd.Uniform(5);
+      ASSERT_OK(iter->status());
+      switch (type) {
+        case 0:
+          // Seek to First
+          iter->SeekToFirst();
+          result_iter->SeekToFirst();
+          break;
+        case 1:
+          // Seek to last
+          iter->SeekToLast();
+          result_iter->SeekToLast();
+          break;
+        case 2: {
+          // Seek to random key
+          auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+          auto key = source_strings[key_idx];
+          iter->Seek(key);
+          result_iter->Seek(key);
+          break;
+        }
+        case 3:
+          // Next
+          if (is_valid) {
+            iter->Next();
+            result_iter->Next();
+          } else {
+            continue;
+          }
+          break;
+        default:
+          assert(type == 4);
+          // Prev
+          if (is_valid) {
+            iter->Prev();
+            result_iter->Prev();
+          } else {
+            continue;
+          }
+          break;
+      }
+      AssertItersEqual(iter.get(), result_iter.get());
+      is_valid = iter->Valid();
+    }
+  }
+}
+
+TEST_F(WriteBatchWithIndexTest, TestIteraratorWithBase) {
+  ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+  WriteBatchWithIndex batch(BytewiseComparator(), 20, true);
+
+  {
+    KVMap map;
+    map["a"] = "aa";
+    map["c"] = "cc";
+    map["e"] = "ee";
+    std::unique_ptr<Iterator> iter(
+        batch.NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "e", "ee");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "e", "ee");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("b");
+    AssertIter(iter.get(), "c", "cc");
+
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+
+    iter->Seek("a");
+    AssertIter(iter.get(), "a", "aa");
+  }
+
+  // Test the case that there is one element in the write batch
+  batch.Put(&cf2, "zoo", "bar");
+  batch.Put(&cf1, "a", "aa");
+  {
+    KVMap empty_map;
+    std::unique_ptr<Iterator> iter(
+        batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  batch.Delete(&cf1, "b");
+  batch.Put(&cf1, "c", "cc");
+  batch.Put(&cf1, "d", "dd");
+  batch.Delete(&cf1, "e");
+
+  {
+    KVMap map;
+    map["b"] = "";
+    map["cc"] = "cccc";
+    map["f"] = "ff";
+    std::unique_ptr<Iterator> iter(
+        batch.NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Next();
+    AssertIter(iter.get(), "f", "ff");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "f", "ff");
+    iter->Prev();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Prev();
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("c");
+    AssertIter(iter.get(), "c", "cc");
+
+    iter->Seek("cb");
+    AssertIter(iter.get(), "cc", "cccc");
+
+    iter->Seek("cc");
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Seek("e");
+    AssertIter(iter.get(), "f", "ff");
+
+    iter->Prev();
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Next();
+    AssertIter(iter.get(), "f", "ff");
+  }
+
+  {
+    KVMap empty_map;
+    std::unique_ptr<Iterator> iter(
+        batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("aa");
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Seek("ca");
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+  }
+}
+
+TEST_F(WriteBatchWithIndexTest, TestIteraratorWithBaseReverseCmp) {
+  ColumnFamilyHandleImplDummy cf1(6, ReverseBytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(2, ReverseBytewiseComparator());
+  WriteBatchWithIndex batch(BytewiseComparator(), 20, true);
+
+  // Test the case that there is one element in the write batch
+  batch.Put(&cf2, "zoo", "bar");
+  batch.Put(&cf1, "a", "aa");
+  {
+    KVMap empty_map;
+    std::unique_ptr<Iterator> iter(
+        batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  batch.Put(&cf1, "c", "cc");
+  {
+    KVMap map;
+    std::unique_ptr<Iterator> iter(
+        batch.NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("b");
+    AssertIter(iter.get(), "a", "aa");
+
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+
+    iter->Seek("a");
+    AssertIter(iter.get(), "a", "aa");
+  }
+
+  // default column family
+  batch.Put("a", "b");
+  {
+    KVMap map;
+    map["b"] = "";
+    std::unique_ptr<Iterator> iter(batch.NewIteratorWithBase(new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "b");
+    iter->Next();
+    AssertIter(iter.get(), "b", "");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "b", "");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "b");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("b");
+    AssertIter(iter.get(), "b", "");
+
+    iter->Prev();
+    AssertIter(iter.get(), "a", "b");
+
+    iter->Seek("0");
+    AssertIter(iter.get(), "a", "b");
+  }
+}
+
+TEST_F(WriteBatchWithIndexTest, TestGetFromBatch) {
+  Options options;
+  WriteBatchWithIndex batch;
+  Status s;
+  std::string value;
+
+  s = batch.GetFromBatch(options, "b", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  batch.Put("a", "a");
+  batch.Put("b", "b");
+  batch.Put("c", "c");
+  batch.Put("a", "z");
+  batch.Delete("c");
+  batch.Delete("d");
+  batch.Delete("e");
+  batch.Put("e", "e");
+
+  s = batch.GetFromBatch(options, "b", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = batch.GetFromBatch(options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("z", value);
+
+  s = batch.GetFromBatch(options, "c", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch.GetFromBatch(options, "d", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch.GetFromBatch(options, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch.GetFromBatch(options, "e", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("e", value);
+
+  batch.Merge("z", "z");
+
+  s = batch.GetFromBatch(options, "z", &value);
+  ASSERT_NOK(s);  // No merge operator specified.
+
+  s = batch.GetFromBatch(options, "b", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+}
+
+TEST_F(WriteBatchWithIndexTest, TestGetFromBatchMerge) {
+  DB* db;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  options.create_if_missing = true;
+
+  std::string dbname = test::TmpDir() + "/write_batch_with_index_test";
+
+  DestroyDB(dbname, options);
+  Status s = DB::Open(options, dbname, &db);
+  assert(s.ok());
+
+  ColumnFamilyHandle* column_family = db->DefaultColumnFamily();
+  WriteBatchWithIndex batch;
+  std::string value;
+
+  s = batch.GetFromBatch(options, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  batch.Put("x", "X");
+  std::string expected = "X";
+
+  for (int i = 0; i < 5; i++) {
+    batch.Merge("x", std::to_string(i));
+    expected = expected + "," + std::to_string(i);
+
+    if (i % 2 == 0) {
+      batch.Put("y", std::to_string(i / 2));
+    }
+
+    batch.Merge("z", "z");
+
+    s = batch.GetFromBatch(column_family, options, "x", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(expected, value);
+
+    s = batch.GetFromBatch(column_family, options, "y", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(std::to_string(i / 2), value);
+
+    s = batch.GetFromBatch(column_family, options, "z", &value);
+    ASSERT_TRUE(s.IsMergeInProgress());
+  }
+
+  delete db;
+  DestroyDB(dbname, options);
+}
+
+TEST_F(WriteBatchWithIndexTest, TestGetFromBatchAndDB) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  std::string dbname = test::TmpDir() + "/write_batch_with_index_test";
+
+  DestroyDB(dbname, options);
+  Status s = DB::Open(options, dbname, &db);
+  assert(s.ok());
+
+  WriteBatchWithIndex batch;
+  ReadOptions read_options;
+  WriteOptions write_options;
+  std::string value;
+
+  s = db->Put(write_options, "a", "a");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "b", "b");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "c", "c");
+  ASSERT_OK(s);
+
+  batch.Put("a", "batch.a");
+  batch.Delete("b");
+
+  s = batch.GetFromBatchAndDB(db, read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("batch.a", value);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "b", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch.GetFromBatchAndDB(db, read_options, "c", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c", value);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  db->Delete(write_options, "x");
+
+  s = batch.GetFromBatchAndDB(db, read_options, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete db;
+  DestroyDB(dbname, options);
+}
+
+TEST_F(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
+  DB* db;
+  Options options;
+
+  options.create_if_missing = true;
+  std::string dbname = test::TmpDir() + "/write_batch_with_index_test";
+
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  DestroyDB(dbname, options);
+  Status s = DB::Open(options, dbname, &db);
+  assert(s.ok());
+
+  WriteBatchWithIndex batch;
+  ReadOptions read_options;
+  WriteOptions write_options;
+  std::string value;
+
+  s = db->Put(write_options, "a", "a0");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "b", "b0");
+  ASSERT_OK(s);
+
+  s = db->Merge(write_options, "b", "b1");
+  ASSERT_OK(s);
+
+  s = db->Merge(write_options, "c", "c0");
+  ASSERT_OK(s);
+
+  s = db->Merge(write_options, "d", "d0");
+  ASSERT_OK(s);
+
+  batch.Merge("a", "a1");
+  batch.Merge("a", "a2");
+  batch.Merge("b", "b2");
+  batch.Merge("d", "d1");
+  batch.Merge("e", "e0");
+
+  s = batch.GetFromBatchAndDB(db, read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a0,a1,a2", value);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "b", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b0,b1,b2", value);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "c", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c0", value);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "d", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("d0,d1", value);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "e", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("e0", value);
+
+  s = db->Delete(write_options, "x");
+  ASSERT_OK(s);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  const Snapshot* snapshot = db->GetSnapshot();
+  ReadOptions snapshot_read_options;
+  snapshot_read_options.snapshot = snapshot;
+
+  s = db->Delete(write_options, "a");
+  ASSERT_OK(s);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a1,a2", value);
+
+  s = batch.GetFromBatchAndDB(db, snapshot_read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a0,a1,a2", value);
+
+  batch.Delete("a");
+
+  s = batch.GetFromBatchAndDB(db, read_options, "a", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch.GetFromBatchAndDB(db, snapshot_read_options, "a", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Merge(write_options, "c", "c1");
+  ASSERT_OK(s);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "c", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c0,c1", value);
+
+  s = batch.GetFromBatchAndDB(db, snapshot_read_options, "c", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c0", value);
+
+  s = db->Put(write_options, "e", "e1");
+  ASSERT_OK(s);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "e", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("e1,e0", value);
+
+  s = batch.GetFromBatchAndDB(db, snapshot_read_options, "e", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("e0", value);
+
+  s = db->Delete(write_options, "e");
+  ASSERT_OK(s);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "e", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("e0", value);
+
+  s = batch.GetFromBatchAndDB(db, snapshot_read_options, "e", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("e0", value);
+
+  db->ReleaseSnapshot(snapshot);
+  delete db;
+  DestroyDB(dbname, options);
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/sample.ceph.conf b/src/sample.ceph.conf
index 93c4987..120dd33 100644
--- a/src/sample.ceph.conf
+++ b/src/sample.ceph.conf
@@ -266,8 +266,6 @@
     # (Default: 100000)
     ;mds cache size             = 250000
 
-    ;mds mem max                = 1048576     # KB
-
 ;[mds.alpha]
 ;    host                       = alpha
 
diff --git a/src/stop.sh b/src/stop.sh
index 8e45f41..ea5f273 100755
--- a/src/stop.sh
+++ b/src/stop.sh
@@ -18,7 +18,11 @@
 
 test -d dev/osd0/. && test -e dev/sudo && SUDO="sudo"
 
-[ -z "$CEPH_BIN" ] && CEPH_BIN=.
+if [ -e CMakeCache.txt ]; then
+  [ -z "$CEPH_BIN" ] && CEPH_BIN=src
+else
+  [ -z "$CEPH_BIN" ] && CEPH_BIN=.
+fi
 
 MYUID=$(id -u)
 MYNAME=$(id -nu)
diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am
index b55ad4e..e1cb98b 100644
--- a/src/test/Makefile-client.am
+++ b/src/test/Makefile-client.am
@@ -68,6 +68,10 @@ ceph_omapbench_SOURCES = test/omap_bench.cc
 ceph_omapbench_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
 bin_DEBUGPROGRAMS += ceph_omapbench
 
+ceph_objectstore_bench_SOURCES = test/objectstore_bench.cc
+ceph_objectstore_bench_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+bin_DEBUGPROGRAMS += ceph_objectstore_bench
+
 if LINUX
 ceph_kvstorebench_SOURCES = \
 	test/kv_store_bench.cc \
@@ -112,12 +116,12 @@ endif # LINUX
 unittest_librados_SOURCES = test/librados/librados.cc
 unittest_librados_LDADD = $(LIBRADOS) $(CEPH_GLOBAL) $(UNITTEST_LDADD)
 unittest_librados_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_librados
+check_TESTPROGRAMS += unittest_librados
 
 unittest_librados_config_SOURCES = test/librados/librados_config.cc
 unittest_librados_config_LDADD = $(LIBRADOS) $(CEPH_GLOBAL) $(UNITTEST_LDADD)
 unittest_librados_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_librados_config
+check_TESTPROGRAMS += unittest_librados_config
 
 ceph_multi_stress_watch_SOURCES = test/multi_stress_watch.cc
 ceph_multi_stress_watch_LDADD = $(LIBRADOS) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
@@ -127,7 +131,8 @@ bin_DEBUGPROGRAMS += ceph_multi_stress_watch
 ceph_test_cls_rbd_SOURCES = test/cls_rbd/test_cls_rbd.cc
 ceph_test_cls_rbd_LDADD = \
 	$(LIBRADOS) libcls_rbd_client.la libcls_lock_client.la \
-	$(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
+	$(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD) $(CRYPTO_LIBS) \
+	$(EXTRALIBS)
 ceph_test_cls_rbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_cls_rbd
 
@@ -172,10 +177,17 @@ ceph_test_cls_hello_LDADD = \
 ceph_test_cls_hello_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_cls_hello
 
+ceph_test_cls_numops_SOURCES = test/cls_numops/test_cls_numops.cc
+ceph_test_cls_numops_LDADD = \
+    $(LIBRADOS) libcls_numops_client.la \
+    $(UNITTEST_LDADD) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+ceph_test_cls_numops_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_numops
+
 ceph_test_rados_api_cmd_SOURCES = test/librados/cmd.cc
 ceph_test_rados_api_cmd_LDADD = \
 	$(LIBCOMMON) $(LIBRADOS) $(CRYPTO_LIBS) \
-	$(UNITTEST_LDADD) $(RADOS_TEST_LDADD) -luuid
+	$(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
 ceph_test_rados_api_cmd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_rados_api_cmd
 
@@ -270,6 +282,7 @@ librados_test_stub_la_SOURCES = \
 	test/librados_test_stub/TestRadosClient.cc \
 	test/librados_test_stub/TestWatchNotify.cc
 noinst_HEADERS += \
+	test/librados_test_stub/LibradosTestStub.h \
 	test/librados_test_stub/TestClassHandler.h \
 	test/librados_test_stub/TestRadosClient.h \
 	test/librados_test_stub/TestMemRadosClient.h \
@@ -296,7 +309,7 @@ unittest_rbd_replay_LDADD = $(LIBRBD) \
 	librbd_replay_ios.la \
 	$(UNITTEST_LDADD)
 unittest_rbd_replay_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_rbd_replay
+check_TESTPROGRAMS += unittest_rbd_replay
 
 librbd_test_la_SOURCES = \
 	test/librbd/test_fixture.cc \
@@ -309,7 +322,8 @@ librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 noinst_LTLIBRARIES += librbd_test.la
 
 unittest_librbd_SOURCES = \
-        test/librbd/test_main.cc
+        test/librbd/test_main.cc \
+	test/librbd/test_mock_fixture.cc
 unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
 unittest_librbd_LDADD = \
 	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
@@ -318,6 +332,7 @@ unittest_librbd_LDADD = \
 	$(LIBOSDC) $(UNITTEST_LDADD) \
 	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
 check_PROGRAMS += unittest_librbd
+check_SCRIPTS += test/run-rbd-unit-tests.sh
 
 ceph_test_librbd_SOURCES = \
         test/librbd/test_main.cc
@@ -338,17 +353,20 @@ ceph_test_librbd_api_LDADD = \
 	$(LIBRBD) $(LIBRADOS) $(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
 bin_DEBUGPROGRAMS += ceph_test_librbd_api
 
-if WITH_LTTNG
-unittest_librbd_LDADD += $(LIBRBD_TP)
-ceph_test_librbd_LDADD += $(LIBRBD_TP)
-ceph_test_librbd_api_LDADD += $(LIBRBD_TP)
-endif
+noinst_HEADERS += \
+	test/librbd/test_fixture.h \
+	test/librbd/test_mock_fixture.h \
+	test/librbd/test_support.h \
+	test/librbd/mock/MockContextWQ.h \
+	test/librbd/mock/MockImageCtx.h \
+	test/librbd/mock/MockImageWatcher.h \
+	test/librbd/mock/MockObjectMap.h
 
 if LINUX
 ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.cc
 ceph_test_librbd_fsx_LDADD = \
 	$(LIBKRBD) $(LIBRBD) $(LIBRADOS) \
-	$(CRYPTO_LIBS) $(PTHREAD_LIBS) -luuid
+	$(CRYPTO_LIBS) $(PTHREAD_LIBS)
 ceph_test_librbd_fsx_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_librbd_fsx
 endif
@@ -400,33 +418,37 @@ endif # WITH_BUILD_TESTS
 
 unittest_encoding_LDADD = $(LIBCEPHFS) $(LIBRADOS) $(CEPH_GLOBAL) -lm $(UNITTEST_LDADD)
 unittest_encoding_CXXFLAGS = $(UNITTEST_CXXFLAGS) -fno-strict-aliasing
-check_PROGRAMS += unittest_encoding
+check_TESTPROGRAMS += unittest_encoding
 
 unittest_base64_SOURCES = test/base64.cc
 unittest_base64_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) -lm $(UNITTEST_LDADD)
 unittest_base64_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_base64
+check_TESTPROGRAMS += unittest_base64
 
 unittest_run_cmd_SOURCES = test/run_cmd.cc
 unittest_run_cmd_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) $(UNITTEST_LDADD)
 unittest_run_cmd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_run_cmd
+check_TESTPROGRAMS += unittest_run_cmd
 
 unittest_simple_spin_SOURCES = test/simple_spin.cc
 unittest_simple_spin_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) $(UNITTEST_LDADD)
 unittest_simple_spin_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_simple_spin
+check_TESTPROGRAMS += unittest_simple_spin
 
 unittest_libcephfs_config_SOURCES = test/libcephfs_config.cc
 unittest_libcephfs_config_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) $(UNITTEST_LDADD)
 unittest_libcephfs_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_libcephfs_config
+check_TESTPROGRAMS += unittest_libcephfs_config
 
 ceph_test_libcephfs_SOURCES = \
 	test/libcephfs/test.cc \
 	test/libcephfs/readdir_r_cb.cc \
 	test/libcephfs/caps.cc \
 	test/libcephfs/multiclient.cc
+if LINUX
+ceph_test_libcephfs_SOURCES += test/libcephfs/flock.cc
+endif # LINUX
+
 ceph_test_libcephfs_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
 ceph_test_libcephfs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_libcephfs
@@ -436,7 +458,6 @@ unittest_encoding_SOURCES = test/encoding.cc
 ceph_test_c_headers_SOURCES = test/test_c_headers.c
 ceph_test_c_headers_LDADD = $(LIBRADOS) $(LIBCEPHFS)
 ceph_test_c_headers_CFLAGS = $(AM_CFLAGS) \
-	-Werror \
 	-Wstrict-prototypes \
 	-Wredundant-decls \
 	-Wall \
@@ -451,9 +472,11 @@ ceph_test_c_headers_CFLAGS = $(AM_CFLAGS) \
 	-Wformat-y2k \
 	-Winit-self \
 	-Wignored-qualifiers \
-	-Wold-style-declaration \
 	-Wold-style-definition \
 	-Wtype-limits
+if !CLANG
+ceph_test_c_headers_CFLAGS += -Werror -Wold-style-declaration
+endif # !CLANG
 bin_DEBUGPROGRAMS += ceph_test_c_headers
 
 endif # WITH_CEPHFS
@@ -479,19 +502,19 @@ endif # WITH_BUILD_TESTS
 #unittest_librgw_link_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
 #unittest_librgw_link_LDADD = $(LIBRGW) ${UNITTEST_LDADD}
 #unittest_librgw_link_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-#check_PROGRAMS += unittest_librgw_link
+#check_TESTPROGRAMS += unittest_librgw_link
 
 #unittest_librgw_SOURCES = test/librgw.cc
 #unittest_librgw_LDFLAGS = -lrt $(PTHREAD_CFLAGS) -lcurl ${AM_LDFLAGS}
 #unittest_librgw_LDADD =  librgw.la $(LIBRADOS) ${UNITTEST_LDADD} -lexpat $(CEPH_GLOBAL)
 #unittest_librgw_CXXFLAGS = ${CRYPTO_CFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-#check_PROGRAMS += unittest_librgw
+#check_TESTPROGRAMS += unittest_librgw
 
 ceph_test_cors_SOURCES = test/test_cors.cc
 ceph_test_cors_LDADD = \
 	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 	$(UNITTEST_LDADD) \
-	-lcurl -luuid -lexpat
+	-lcurl -lexpat
 ceph_test_cors_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_cors
 
@@ -499,16 +522,25 @@ ceph_test_rgw_manifest_SOURCES = test/rgw/test_rgw_manifest.cc
 ceph_test_rgw_manifest_LDADD = \
 	$(LIBRADOS) $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL) \
 	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
-	-lcurl -luuid -lexpat
+	-lcurl -lexpat
 
 ceph_test_rgw_manifest_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_rgw_manifest
 
+ceph_test_rgw_obj_SOURCES = test/rgw/test_rgw_obj.cc
+ceph_test_rgw_obj_LDADD = \
+	$(LIBRADOS) $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL) \
+	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
+	-lcurl -luuid -lexpat
+
+ceph_test_rgw_obj_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_rgw_obj
+
 ceph_test_cls_rgw_meta_SOURCES = test/test_rgw_admin_meta.cc
 ceph_test_cls_rgw_meta_LDADD = \
 	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
-	-lcurl -luuid -lexpat \
+	-lcurl -lexpat \
 	libcls_version_client.a libcls_log_client.a \
 	libcls_statelog_client.a libcls_refcount_client.la \
 	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la
@@ -519,7 +551,7 @@ ceph_test_cls_rgw_log_SOURCES = test/test_rgw_admin_log.cc
 ceph_test_cls_rgw_log_LDADD = \
 	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
-	-lcurl -luuid -lexpat \
+	-lcurl -lexpat \
 	libcls_version_client.a libcls_log_client.a \
 	libcls_statelog_client.a libcls_refcount_client.la \
 	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la
@@ -530,10 +562,11 @@ ceph_test_cls_rgw_opstate_SOURCES = test/test_rgw_admin_opstate.cc
 ceph_test_cls_rgw_opstate_LDADD = \
 	$(LIBRADOS) $(LIBRGW) $(CEPH_GLOBAL) \
 	$(UNITTEST_LDADD) $(CRYPTO_LIBS) \
-	-lcurl -luuid -lexpat \
-	libcls_version_client.a libcls_log_client.a \
+	-lcurl -lexpat \
+	libcls_version_client.a libcls_log_client.a  libcls_timeindex_client.a \
 	libcls_statelog_client.a libcls_refcount_client.la \
-	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la
+	libcls_rgw_client.la libcls_user_client.a libcls_lock_client.la \
+	$(LIBRADOS)
 ceph_test_cls_rgw_opstate_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_cls_rgw_opstate
 
diff --git a/src/test/Makefile-server.am b/src/test/Makefile-server.am
index c822963..8bf8cfc 100644
--- a/src/test/Makefile-server.am
+++ b/src/test/Makefile-server.am
@@ -30,12 +30,36 @@ ceph_perf_objectstore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 ceph_perf_objectstore_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_perf_objectstore
 
+ceph_perf_local_SOURCES = test/perf_local.cc test/perf_helper.cc
+ceph_perf_local_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+ceph_perf_local_CXXFLAGS = ${AM_CXXFLAGS} 	\
+	${INTEL_SSE_FLAGS} \
+	${INTEL_SSE2_FLAGS}
+
+noinst_HEADERS += test/perf_helper.h
+bin_DEBUGPROGRAMS += ceph_perf_local
+
+ceph_perf_msgr_server_SOURCES = test/msgr/perf_msgr_server.cc
+ceph_perf_msgr_server_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_perf_msgr_server_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_perf_msgr_server
+
+ceph_perf_msgr_client_SOURCES = test/msgr/perf_msgr_client.cc
+ceph_perf_msgr_client_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_perf_msgr_client_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_perf_msgr_client
+
 if LINUX
 ceph_test_objectstore_SOURCES = test/objectstore/store_test.cc
 ceph_test_objectstore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 ceph_test_objectstore_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_objectstore
 
+ceph_test_keyvaluedb_SOURCES = test/objectstore/test_kv.cc
+ceph_test_keyvaluedb_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+ceph_test_keyvaluedb_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_keyvaluedb
+
 ceph_test_filestore_SOURCES = test/filestore/TestFileStore.cc
 ceph_test_filestore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 ceph_test_filestore_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -134,12 +158,12 @@ noinst_PROGRAMS += get_command_descriptions
 unittest_mon_moncap_SOURCES = test/mon/moncap.cc
 unittest_mon_moncap_LDADD = $(LIBMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_mon_moncap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_mon_moncap
+check_TESTPROGRAMS += unittest_mon_moncap
 
 unittest_mon_pgmap_SOURCES = test/mon/PGMap.cc
 unittest_mon_pgmap_LDADD = $(LIBMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_mon_pgmap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_mon_pgmap
+check_TESTPROGRAMS += unittest_mon_pgmap
 
 endif # WITH_MON
 
@@ -148,12 +172,12 @@ if WITH_OSD
 unittest_ecbackend_SOURCES = test/osd/TestECBackend.cc
 unittest_ecbackend_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_ecbackend_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_ecbackend
+check_TESTPROGRAMS += unittest_ecbackend
 
 unittest_osdscrub_SOURCES = test/osd/TestOSDScrub.cc
 unittest_osdscrub_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_osdscrub_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_osdscrub
+check_TESTPROGRAMS += unittest_osdscrub
 if LINUX
 unittest_osdscrub_LDADD += -ldl
 endif # LINUX
@@ -161,7 +185,7 @@ endif # LINUX
 unittest_pglog_SOURCES = test/osd/TestPGLog.cc
 unittest_pglog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_pglog_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_pglog
+check_TESTPROGRAMS += unittest_pglog
 if LINUX
 unittest_pglog_LDADD += -ldl
 endif # LINUX
@@ -169,35 +193,48 @@ endif # LINUX
 unittest_hitset_SOURCES = test/osd/hitset.cc
 unittest_hitset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_hitset_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_hitset
+check_TESTPROGRAMS += unittest_hitset
 
 unittest_osd_osdcap_SOURCES = test/osd/osdcap.cc 
 unittest_osd_osdcap_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_osd_osdcap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_osd_osdcap
+check_TESTPROGRAMS += unittest_osd_osdcap
 
 ceph_test_snap_mapper_SOURCES = test/test_snap_mapper.cc
 ceph_test_snap_mapper_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 ceph_test_snap_mapper_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_snap_mapper
 
+unittest_pageset_SOURCES = test/test_pageset.cc
+unittest_pageset_LDADD = $(UNITTEST_LDADD)
+unittest_pageset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_TESTPROGRAMS += unittest_pageset
+
 endif # WITH_OSD
 
+if WITH_SLIBROCKSDB
+unittest_rocksdb_option_static_SOURCES = test/objectstore/TestRocksdbOptionParse.cc
+unittest_rocksdb_option_static_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) rocksdb/librocksdb.la
+unittest_rocksdb_option_static_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11 -I rocksdb/include
+check_TESTPROGRAMS += unittest_rocksdb_option_static
+endif
+
+if WITH_DLIBROCKSDB
+unittest_rocksdb_option_SOURCES = test/objectstore/TestRocksdbOptionParse.cc
+unittest_rocksdb_option_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) -lrocksdb
+unittest_rocksdb_option_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11
+check_TESTPROGRAMS += unittest_rocksdb_option
+endif
 
 unittest_chain_xattr_SOURCES = test/objectstore/chain_xattr.cc
 unittest_chain_xattr_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_chain_xattr_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_chain_xattr
-
-unittest_flatindex_SOURCES = test/os/TestFlatIndex.cc
-unittest_flatindex_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-unittest_flatindex_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_flatindex
+check_TESTPROGRAMS += unittest_chain_xattr
 
 unittest_lfnindex_SOURCES = test/os/TestLFNIndex.cc
 unittest_lfnindex_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_lfnindex_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_lfnindex
+check_TESTPROGRAMS += unittest_lfnindex
 
 
 if WITH_MDS
@@ -205,6 +242,6 @@ if WITH_MDS
 unittest_mds_authcap_SOURCES = test/mds/TestMDSAuthCaps.cc 
 unittest_mds_authcap_LDADD = $(LIBMDS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_mds_authcap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_mds_authcap
+check_TESTPROGRAMS += unittest_mds_authcap
 
 endif # WITH_MDS
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index ab77ba7..89fc7df 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -68,26 +68,33 @@ check_SCRIPTS += \
 	test/cephtool-test-osd.sh \
 	test/cephtool-test-mon.sh \
 	test/cephtool-test-mds.sh \
+	test/cephtool-test-rados.sh \
 	unittest_bufferlist.sh \
 	test/encoding/check-generated.sh \
 	test/mon/osd-pool-create.sh \
 	test/mon/misc.sh \
 	test/mon/osd-crush.sh \
+	test/mon/mon-ping.sh \
 	test/mon/osd-erasure-code-profile.sh \
 	test/mon/mkfs.sh \
+	test/mon/mon-scrub.sh \
 	test/osd/osd-scrub-repair.sh \
 	test/osd/osd-config.sh \
 	test/osd/osd-bench.sh \
 	test/osd/osd-copy-from.sh \
-	test/mon/mon-handle-forward.sh
+	test/mon/mon-handle-forward.sh \
+	test/libradosstriper/rados-striper.sh \
+	test/test_objectstore_memstore.sh
 
-if ENABLE_ROOT_MAKE_CHECK
-check_SCRIPTS += test/ceph-disk-root.sh
-else
 check_SCRIPTS += test/ceph-disk.sh
-endif
 
 EXTRA_DIST += \
+	$(srcdir)/test/python/ceph-disk/setup.py \
+	$(srcdir)/test/python/ceph-disk/tox.ini \
+	$(srcdir)/test/python/ceph-disk/tests/test_ceph_disk.py \
+	$(srcdir)/test/python/brag-client/setup.py \
+	$(srcdir)/test/python/brag-client/tox.ini \
+	$(srcdir)/test/python/brag-client/tests/test_ceph_brag.py \
 	$(srcdir)/test/debian-jessie/Dockerfile.in \
 	$(srcdir)/test/debian-jessie/install-deps.sh \
 	$(srcdir)/test/debian-jessie/debian \
@@ -97,25 +104,21 @@ EXTRA_DIST += \
 	$(srcdir)/test/ubuntu-14.04/Dockerfile.in \
 	$(srcdir)/test/ubuntu-14.04/install-deps.sh \
 	$(srcdir)/test/ubuntu-14.04/debian \
+	$(srcdir)/test/fedora-21/Dockerfile.in \
+	$(srcdir)/test/fedora-21/install-deps.sh \
+	$(srcdir)/test/fedora-21/ceph.spec.in \
 	$(srcdir)/test/centos-6/Dockerfile.in \
 	$(srcdir)/test/centos-6/install-deps.sh \
 	$(srcdir)/test/centos-6/ceph.spec.in \
 	$(srcdir)/test/centos-7/Dockerfile.in \
 	$(srcdir)/test/centos-7/install-deps.sh \
 	$(srcdir)/test/centos-7/ceph.spec.in \
-	$(srcdir)/test/mon/mon-test-helpers.sh \
-	$(srcdir)/test/osd/osd-test-helpers.sh \
+	$(srcdir)/test/opensuse-13.2/Dockerfile.in \
+	$(srcdir)/test/opensuse-13.2/install-deps.sh \
+	$(srcdir)/test/opensuse-13.2/ceph.spec.in \
         $(srcdir)/test/coverage.sh \
 	$(patsubst %,$(srcdir)/%,$(check_SCRIPTS))
 
-docker-check:
-	$(srcdir)/test/container-make-check-ubuntu-14.04.sh
-	$(srcdir)/test/container-make-check-centos-centos7.sh
-
-EXTRA_DIST += \
-	$(srcdir)/test/container-make-check-ubuntu-14.04.sh
-	$(srcdir)/test/container-make-check-centos-centos7.sh
-
 # target to build but not run the unit tests
 unittests:: $(check_PROGRAMS)
 
@@ -134,217 +137,230 @@ UNITTEST_LDADD = \
 unittest_addrs_SOURCES = test/test_addrs.cc
 unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_addrs
+check_TESTPROGRAMS += unittest_addrs
 
 unittest_blkdev_SOURCES = test/common/test_blkdev.cc
 unittest_blkdev_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_blkdev_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_blkdev
+if LINUX
+check_TESTPROGRAMS += unittest_blkdev
+endif
 
 unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc
 unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_bloom_filter
+check_TESTPROGRAMS += unittest_bloom_filter
 
 unittest_histogram_SOURCES = test/common/histogram.cc
 unittest_histogram_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_histogram_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_histogram
+check_TESTPROGRAMS += unittest_histogram
+
+unittest_prioritized_queue_SOURCES = test/common/test_prioritized_queue.cc
+unittest_prioritized_queue_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_prioritized_queue_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_TESTPROGRAMS += unittest_prioritized_queue
+
 
 unittest_str_map_SOURCES = test/common/test_str_map.cc
 unittest_str_map_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_str_map_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_str_map
+check_TESTPROGRAMS += unittest_str_map
 
 unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc
 unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_sharedptr_registry
+check_TESTPROGRAMS += unittest_sharedptr_registry
 
 unittest_shared_cache_SOURCES = test/common/test_shared_cache.cc
 unittest_shared_cache_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_shared_cache_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_shared_cache
+check_TESTPROGRAMS += unittest_shared_cache
 
 unittest_sloppy_crc_map_SOURCES = test/common/test_sloppy_crc_map.cc
 unittest_sloppy_crc_map_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_sloppy_crc_map_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_sloppy_crc_map
+check_TESTPROGRAMS += unittest_sloppy_crc_map
 
 unittest_util_SOURCES = test/common/test_util.cc
 unittest_util_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_util_LDADD = $(LIBCOMMON) -lm $(UNITTEST_LDADD) $(CRYPTO_LIBS) $(EXTRALIBS)
-check_PROGRAMS += unittest_util
+check_TESTPROGRAMS += unittest_util
 
 unittest_crush_wrapper_SOURCES = test/crush/CrushWrapper.cc
 unittest_crush_wrapper_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) $(LIBCRUSH)
 unittest_crush_wrapper_CXXFLAGS = $(UNITTEST_CXXFLAGS) -O2
-check_PROGRAMS += unittest_crush_wrapper
+check_TESTPROGRAMS += unittest_crush_wrapper
 
 unittest_crush_SOURCES = test/crush/crush.cc
 unittest_crush_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_crush_LDADD = $(LIBCOMMON) -lm $(UNITTEST_LDADD) $(CEPH_CRUSH) $(EXTRALIBS) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_crush
+check_TESTPROGRAMS += unittest_crush
 
 unittest_osdmap_SOURCES = test/osd/TestOSDMap.cc
 unittest_osdmap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_osdmap_LDADD = $(UNITTEST_LDADD) $(LIBCOMMON) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_osdmap
+check_TESTPROGRAMS += unittest_osdmap
 
 unittest_workqueue_SOURCES = test/test_workqueue.cc
 unittest_workqueue_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_workqueue_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_workqueue
+check_TESTPROGRAMS += unittest_workqueue
 
-unittest_striper_SOURCES = test/test_striper.cc 
+unittest_striper_SOURCES = test/test_striper.cc
 unittest_striper_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_striper_LDADD = $(LIBOSDC) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_striper
+check_TESTPROGRAMS += unittest_striper
 
-unittest_prebufferedstreambuf_SOURCES = test/test_prebufferedstreambuf.cc 
+unittest_prebufferedstreambuf_SOURCES = test/test_prebufferedstreambuf.cc
 unittest_prebufferedstreambuf_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_prebufferedstreambuf_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD) $(EXTRALIBS)
-check_PROGRAMS += unittest_prebufferedstreambuf
+check_TESTPROGRAMS += unittest_prebufferedstreambuf
 
 unittest_str_list_SOURCES = test/test_str_list.cc
 unittest_str_list_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_str_list_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_str_list
+check_TESTPROGRAMS += unittest_str_list
 
 unittest_log_SOURCES = log/test.cc
 unittest_log_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
 unittest_log_CXXFLAGS = $(UNITTEST_CXXFLAGS) -O2
-check_PROGRAMS += unittest_log
+check_TESTPROGRAMS += unittest_log
 
 unittest_throttle_SOURCES = test/common/Throttle.cc
 unittest_throttle_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_throttle_CXXFLAGS = $(UNITTEST_CXXFLAGS) -O2
-check_PROGRAMS += unittest_throttle
+check_TESTPROGRAMS += unittest_throttle
 
 unittest_ceph_argparse_SOURCES = test/ceph_argparse.cc
 unittest_ceph_argparse_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_ceph_argparse_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_ceph_argparse
+check_TESTPROGRAMS += unittest_ceph_argparse
 
 unittest_ceph_compatset_SOURCES = test/ceph_compatset.cc
 unittest_ceph_compatset_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_ceph_compatset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_ceph_compatset
+check_TESTPROGRAMS += unittest_ceph_compatset
 
 unittest_mds_types_SOURCES = test/fs/mds_types.cc
 unittest_mds_types_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_mds_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_mds_types
+check_TESTPROGRAMS += unittest_mds_types
 
 unittest_osd_types_SOURCES = test/osd/types.cc
 unittest_osd_types_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-unittest_osd_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) 
-check_PROGRAMS += unittest_osd_types
+unittest_osd_types_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_TESTPROGRAMS += unittest_osd_types
 
 unittest_lru_SOURCES = test/common/test_lru.cc
 unittest_lru_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_lru_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_lru
+check_TESTPROGRAMS += unittest_lru
 
 unittest_io_priority_SOURCES = test/common/test_io_priority.cc
 unittest_io_priority_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_io_priority_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_io_priority
+check_TESTPROGRAMS += unittest_io_priority
 
 unittest_gather_SOURCES = test/gather.cc
 unittest_gather_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_gather_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_gather
+check_TESTPROGRAMS += unittest_gather
 
 unittest_signals_SOURCES = test/signals.cc
 unittest_signals_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_signals_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_signals
+check_TESTPROGRAMS += unittest_signals
 
 unittest_bufferlist_SOURCES = test/bufferlist.cc
-unittest_bufferlist_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) 
+unittest_bufferlist_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_bufferlist_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_bufferlist
+check_TESTPROGRAMS += unittest_bufferlist
+
+unittest_xlist_SOURCES = test/test_xlist.cc
+unittest_xlist_LDADD = $(UNITTEST_LDADD) $(LIBCOMMON)
+unittest_xlist_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_TESTPROGRAMS += unittest_xlist
 
 unittest_crc32c_SOURCES = test/common/test_crc32c.cc
 unittest_crc32c_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_crc32c_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_crc32c
+check_TESTPROGRAMS += unittest_crc32c
 
 unittest_arch_SOURCES = test/test_arch.cc
 unittest_arch_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_arch_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_arch
+check_TESTPROGRAMS += unittest_arch
 
 unittest_crypto_SOURCES = test/crypto.cc
 unittest_crypto_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_crypto_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_crypto
+check_TESTPROGRAMS += unittest_crypto
 
 unittest_crypto_init_SOURCES = test/crypto_init.cc
 unittest_crypto_init_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_crypto_init_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_crypto_init
+check_TESTPROGRAMS += unittest_crypto_init
 
 unittest_perf_counters_SOURCES = test/perf_counters.cc
 unittest_perf_counters_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_perf_counters_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_perf_counters
+check_TESTPROGRAMS += unittest_perf_counters
 
 unittest_admin_socket_SOURCES = test/admin_socket.cc
 unittest_admin_socket_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_admin_socket_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_admin_socket
+check_TESTPROGRAMS += unittest_admin_socket
 
 unittest_ceph_crypto_SOURCES = test/ceph_crypto.cc
 unittest_ceph_crypto_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_ceph_crypto_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_ceph_crypto
+check_TESTPROGRAMS += unittest_ceph_crypto
 
 unittest_utf8_SOURCES = test/utf8.cc
 unittest_utf8_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_utf8_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_utf8
+check_TESTPROGRAMS += unittest_utf8
 
 unittest_mime_SOURCES = test/mime.cc
 unittest_mime_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_mime_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_mime
+check_TESTPROGRAMS += unittest_mime
 
 unittest_escape_SOURCES = test/escape.cc
 unittest_escape_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_escape_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_escape
+check_TESTPROGRAMS += unittest_escape
 
 unittest_strtol_SOURCES = test/strtol.cc
 unittest_strtol_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_strtol_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_strtol
+check_TESTPROGRAMS += unittest_strtol
 
 unittest_confutils_SOURCES = test/confutils.cc
 unittest_confutils_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_confutils_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_confutils
+check_TESTPROGRAMS += unittest_confutils
 
 unittest_config_SOURCES = test/common/test_config.cc
 unittest_config_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_config
+check_TESTPROGRAMS += unittest_config
 
 unittest_context_SOURCES = test/common/test_context.cc
 unittest_context_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_context_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_context
+check_TESTPROGRAMS += unittest_context
 
 unittest_safe_io_SOURCES = test/common/test_safe_io.cc
 unittest_safe_io_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_safe_io_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_safe_io
+check_TESTPROGRAMS += unittest_safe_io
 
 unittest_heartbeatmap_SOURCES = test/heartbeat_map.cc
 unittest_heartbeatmap_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_heartbeatmap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_heartbeatmap
+check_TESTPROGRAMS += unittest_heartbeatmap
 
 # why does this include rgw/rgw_formats.cc...?
 unittest_formatter_SOURCES = \
@@ -352,43 +368,54 @@ unittest_formatter_SOURCES = \
 	rgw/rgw_formats.cc
 unittest_formatter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_formatter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_formatter
+check_TESTPROGRAMS += unittest_formatter
 
 unittest_daemon_config_SOURCES = test/daemon_config.cc
 unittest_daemon_config_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_daemon_config_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_daemon_config
+check_TESTPROGRAMS += unittest_daemon_config
 
 unittest_ipaddr_SOURCES = test/test_ipaddr.cc
 unittest_ipaddr_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_ipaddr_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_ipaddr
+check_TESTPROGRAMS += unittest_ipaddr
 
 unittest_texttable_SOURCES = test/test_texttable.cc
 unittest_texttable_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
 unittest_texttable_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-check_PROGRAMS += unittest_texttable
+check_TESTPROGRAMS += unittest_texttable
 
 unittest_on_exit_SOURCES = test/on_exit.cc
 unittest_on_exit_LDADD = $(PTHREAD_LIBS)
-check_PROGRAMS += unittest_on_exit
+check_TESTPROGRAMS += unittest_on_exit
 
 unittest_readahead_SOURCES = test/common/Readahead.cc
 unittest_readahead_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 unittest_readahead_CXXFLAGS = $(UNITTEST_CXXFLAGS) -O2
-check_PROGRAMS += unittest_readahead
+check_TESTPROGRAMS += unittest_readahead
 
 unittest_tableformatter_SOURCES = test/common/test_tableformatter.cc
 unittest_tableformatter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_tableformatter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_tableformatter
+check_TESTPROGRAMS += unittest_tableformatter
 
 unittest_bit_vector_SOURCES = test/common/test_bit_vector.cc
 unittest_bit_vector_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_bit_vector_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_bit_vector
+check_TESTPROGRAMS += unittest_bit_vector
+
+unittest_subprocess_SOURCES = test/test_subprocess.cc
+unittest_subprocess_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
+unittest_subprocess_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_subprocess
+
+unittest_async_compressor_SOURCES = test/common/test_async_compressor.cc
+unittest_async_compressor_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_async_compressor_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) $(LIBCOMPRESSOR)
+check_PROGRAMS += unittest_async_compressor
 
 check_SCRIPTS += test/pybind/test_ceph_argparse.py
+check_SCRIPTS += test/pybind/test_ceph_daemon.py
 
 ceph_test_objectcacher_stress_SOURCES = \
 	test/osdc/object_cacher_stress.cc \
@@ -424,8 +451,6 @@ noinst_HEADERS += \
 	test/librados/test.h \
 	test/librados/TestCase.h \
 	test/libradosstriper/TestCase.h \
-	test/librbd/test_fixture.h \
-	test/librbd/test_support.h \
 	test/ObjectMap/KeyValueDBMemory.h \
 	test/omap_bench.h \
 	test/osdc/FakeWriteback.h \
diff --git a/src/test/ObjectMap/KeyValueDBMemory.h b/src/test/ObjectMap/KeyValueDBMemory.h
index 8df3966..77342a0 100644
--- a/src/test/ObjectMap/KeyValueDBMemory.h
+++ b/src/test/ObjectMap/KeyValueDBMemory.h
@@ -19,7 +19,7 @@ public:
   KeyValueDBMemory(KeyValueDBMemory *db) : db(db->db) { }
   virtual ~KeyValueDBMemory() { }
 
-  virtual int init() {
+  virtual int init(string _opt) {
     return 0;
   }
   virtual int open(ostream &out) {
diff --git a/src/test/admin_socket.cc b/src/test/admin_socket.cc
index ab52cec..cee215d 100644
--- a/src/test/admin_socket.cc
+++ b/src/test/admin_socket.cc
@@ -48,14 +48,14 @@ public:
 };
 
 TEST(AdminSocket, Teardown) {
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
   AdminSocketTest asoct(asokc.get());
   ASSERT_EQ(true, asoct.shutdown());
 }
 
 TEST(AdminSocket, TeardownSetup) {
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
   AdminSocketTest asoct(asokc.get());
   ASSERT_EQ(true, asoct.shutdown());
@@ -64,7 +64,7 @@ TEST(AdminSocket, TeardownSetup) {
 }
 
 TEST(AdminSocket, SendHelp) {
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
   AdminSocketTest asoct(asokc.get());
   ASSERT_EQ(true, asoct.shutdown());
@@ -96,7 +96,7 @@ TEST(AdminSocket, SendHelp) {
 }
 
 TEST(AdminSocket, SendNoOp) {
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
   AdminSocketTest asoct(asokc.get());
   ASSERT_EQ(true, asoct.shutdown());
@@ -127,7 +127,7 @@ class MyTest : public AdminSocketHook {
 };
 
 TEST(AdminSocket, RegisterCommand) {
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
   AdminSocketTest asoct(asokc.get());
   ASSERT_EQ(true, asoct.shutdown());
@@ -159,7 +159,7 @@ class MyTest2 : public AdminSocketHook {
 };
 
 TEST(AdminSocket, RegisterCommandPrefixes) {
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
   AdminSocketTest asoct(asokc.get());
   ASSERT_EQ(true, asoct.shutdown());
@@ -202,7 +202,7 @@ public:
 
 TEST(AdminSocketClient, Ping) {
   string path = get_rand_socket_path();
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
   AdminSocketClient client(path);
   // no socket
@@ -217,7 +217,12 @@ TEST(AdminSocketClient, Ping) {
   {
     bool ok;
     std::string result = client.ping(&ok);
-    EXPECT_NE(std::string::npos, result.find("Connection refused"));
+#if defined(__APPLE__) || defined(__FreeBSD__)
+    const char* errmsg = "Socket operation on non-socket";
+#else
+    const char* errmsg = "Connection refused";
+#endif
+    EXPECT_NE(std::string::npos, result.find(errmsg));
     ASSERT_FALSE(ok);
   }
   // a daemon is connected to the socket
@@ -251,7 +256,7 @@ TEST(AdminSocketClient, Ping) {
 
 TEST(AdminSocket, bind_and_listen) {
   string path = get_rand_socket_path();
-  std::auto_ptr<AdminSocket>
+  std::unique_ptr<AdminSocket>
       asokc(new AdminSocket(g_ceph_context));
 
   AdminSocketTest asoct(asokc.get());
diff --git a/src/test/bench/dumb_backend.h b/src/test/bench/dumb_backend.h
index 218e46c..b3a19a7 100644
--- a/src/test/bench/dumb_backend.h
+++ b/src/test/bench/dumb_backend.h
@@ -84,6 +84,7 @@ class DumbBackend : public Backend {
     bool _empty() {
       return item_queue.empty();
     }
+    using ThreadPool::WorkQueue<write_item>::_process;
     void _process(write_item *item) {
       return backend->_write(
 	item->oid,
diff --git a/src/test/bench/small_io_bench_fs.cc b/src/test/bench/small_io_bench_fs.cc
index f584585..4b273e4 100644
--- a/src/test/bench/small_io_bench_fs.cc
+++ b/src/test/bench/small_io_bench_fs.cc
@@ -134,6 +134,7 @@ int main(int argc, char **argv)
 
   FileStore fs(vm["filestore-path"].as<string>(),
 	       vm["journal-path"].as<string>());
+  ObjectStore::Sequencer osr(__func__);
 
   if (fs.mkfs() < 0) {
     cout << "mkfs failed" << std::endl;
@@ -168,17 +169,16 @@ int main(int argc, char **argv)
   }
 
   for (uint64_t num = 0; num < vm["num-colls"].as<unsigned>(); ++num) {
-    stringstream coll;
-    coll << "collection_" << num;
-    std::cout << "collection " << coll.str() << std::endl;
+    spg_t pgid(pg_t(num, 0), shard_id_t::NO_SHARD);
+    std::cout << "collection " << pgid << std::endl;
     ObjectStore::Transaction t;
-    t.create_collection(coll_t(coll.str()));
-    fs.apply_transaction(t);
+    t.create_collection(coll_t(pgid), 0);
+    fs.apply_transaction(&osr, t);
   }
   {
     ObjectStore::Transaction t;
-    t.create_collection(coll_t(string("meta")));
-    fs.apply_transaction(t);
+    t.create_collection(coll_t(), 0);
+    fs.apply_transaction(&osr, t);
   }
 
   vector<ceph::shared_ptr<Bencher> > benchers(
@@ -189,10 +189,10 @@ int main(int argc, char **argv)
     set<string> objects;
     for (uint64_t num = 0; num < vm["num-objects"].as<unsigned>(); ++num) {
       unsigned col_num = num % vm["num-colls"].as<unsigned>();
-      stringstream coll, obj;
-      coll << "collection_" << col_num;
+      spg_t pgid(pg_t(col_num, 0), shard_id_t::NO_SHARD);
+      stringstream obj;
       obj << "obj_" << num << "_bencher_" << (i - benchers.begin());
-      objects.insert(coll.str() + string("/") + obj.str());
+      objects.insert(coll_t(pgid).to_str() + string("/") + obj.str());
     }
     Distribution<
       boost::tuple<string, uint64_t, uint64_t, Bencher::OpType> > *gen = 0;
diff --git a/src/test/bench/testfilestore_backend.cc b/src/test/bench/testfilestore_backend.cc
index 4e2ca80..2ce1f87 100644
--- a/src/test/bench/testfilestore_backend.cc
+++ b/src/test/bench/testfilestore_backend.cc
@@ -34,23 +34,25 @@ void TestFileStoreBackend::write(
   size_t sep = oid.find("/");
   assert(sep != string::npos);
   assert(sep + 1 < oid.size());
-  string coll_str(oid.substr(0, sep));
+  coll_t c;
+  bool valid_coll = c.parse(oid.substr(0, sep));
+  assert(valid_coll);
+  string coll_str = c.to_str();
 
   if (!osrs.count(coll_str))
     osrs.insert(make_pair(coll_str, ObjectStore::Sequencer(coll_str)));
   ObjectStore::Sequencer *osr = &(osrs.find(coll_str)->second);
 
-
-  coll_t c(coll_str);
   hobject_t h(sobject_t(oid.substr(sep+1), 0));
-  t->write(c, h, offset, bl.length(), bl);
+  h.pool = 0;
+  t->write(c, ghobject_t(h), offset, bl.length(), bl);
 
   if (write_infos) {
     bufferlist bl2;
     for (uint64_t j = 0; j < 128; ++j) bl2.append(0);
-    coll_t meta("meta");
+    coll_t meta;
     hobject_t info(sobject_t(string("info_")+coll_str, 0));
-    t->write(meta, info, 0, bl2.length(), bl2);
+    t->write(meta, ghobject_t(info), 0, bl2.length(), bl2);
   }
 
   os->queue_transaction(
@@ -70,8 +72,11 @@ void TestFileStoreBackend::read(
   size_t sep = oid.find("/");
   assert(sep != string::npos);
   assert(sep + 1 < oid.size());
-  coll_t c(oid.substr(0, sep));
+  coll_t c;
+  bool valid_coll = c.parse(oid.substr(0, sep));
+  assert(valid_coll);
   hobject_t h(sobject_t(oid.substr(sep+1), 0));
-  os->read(c, h, offset, length, *bl);
+  h.pool = 0;
+  os->read(c, ghobject_t(h), offset, length, *bl);
   finisher.queue(on_complete);
 }
diff --git a/src/test/bench/tp_bench.cc b/src/test/bench/tp_bench.cc
index b9d5ff1..23185a7 100644
--- a/src/test/bench/tp_bench.cc
+++ b/src/test/bench/tp_bench.cc
@@ -93,6 +93,7 @@ class PassAlong : public ThreadPool::WorkQueue<unsigned> {
     q.pop_front();
     return val;
   }
+  using ThreadPool::WorkQueue<unsigned>::_process;
   void _process(unsigned *item) {
     next->queue(item);
   }
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index 09ee473..b05a15a 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -633,6 +633,25 @@ TEST(BufferPtr, copy_out) {
   }
 }
 
+TEST(BufferPtr, copy_out_bench) {
+  for (int s=1; s<=8; s*=2) {
+    utime_t start = ceph_clock_now(NULL);
+    int buflen = 1048576;
+    int count = 1000;
+    uint64_t v;
+    for (int i=0; i<count; ++i) {
+      bufferptr bp(buflen);
+      for (int64_t j=0; j<buflen; j += s) {
+	bp.copy_out(j, s, (char *)&v);
+      }
+    }
+    utime_t end = ceph_clock_now(NULL);
+    cout << count << " fills of buffer len " << buflen
+	 << " with " << s << " byte copy_in in "
+	 << (end - start) << std::endl;
+  }
+}
+
 TEST(BufferPtr, copy_in) {
   {
     bufferptr ptr;
@@ -649,6 +668,24 @@ TEST(BufferPtr, copy_in) {
   }
 }
 
+TEST(BufferPtr, copy_in_bench) {
+  for (int s=1; s<=8; s*=2) {
+    utime_t start = ceph_clock_now(NULL);
+    int buflen = 1048576;
+    int count = 1000;
+    for (int i=0; i<count; ++i) {
+      bufferptr bp(buflen);
+      for (int64_t j=0; j<buflen; j += s) {
+	bp.copy_in(j, s, (char *)&j, false);
+      }
+    }
+    utime_t end = ceph_clock_now(NULL);
+    cout << count << " fills of buffer len " << buflen
+	 << " with " << s << " byte copy_in in "
+	 << (end - start) << std::endl;
+  }
+}
+
 TEST(BufferPtr, append) {
   {
     bufferptr ptr;
@@ -669,6 +706,25 @@ TEST(BufferPtr, append) {
   }
 }
 
+TEST(BufferPtr, append_bench) {
+  for (int s=1; s<=8; s*=2) {
+    utime_t start = ceph_clock_now(NULL);
+    int buflen = 1048576;
+    int count = 1000;
+    for (int i=0; i<count; ++i) {
+      bufferptr bp(buflen);
+      bp.set_length(0);
+      for (int64_t j=0; j<buflen; j += s) {
+	bp.append((char *)&j, s);
+      }
+    }
+    utime_t end = ceph_clock_now(NULL);
+    cout << count << " fills of buffer len " << buflen
+	 << " with " << s << " byte appends in "
+	 << (end - start) << std::endl;
+  }
+}
+
 TEST(BufferPtr, zero) {
   char str[] = "XXXX";
   bufferptr ptr(buffer::create_static(strlen(str), str));
@@ -1028,6 +1084,23 @@ TEST(BufferListIterator, copy_in) {
   }
 }
 
+// iterator& buffer::list::const_iterator::operator++()
+TEST(BufferListConstIterator, operator_plus_plus) {
+  bufferlist bl;
+  {
+    bufferlist::const_iterator i(&bl);
+    EXPECT_THROW(++i, buffer::end_of_buffer);
+  }
+  bl.append("ABC", 3);
+  {
+    const bufferlist const_bl(bl);
+    bufferlist::const_iterator i(const_bl.begin());
+    ++i;
+    EXPECT_EQ('B', *i);
+  }
+
+}
+
 TEST(BufferList, constructors) {
   //
   // list()
@@ -1055,6 +1128,17 @@ TEST(BufferList, constructors) {
     bufferlist copy(bl);
     ASSERT_EQ('A', copy[0]);
   }
+  //
+  // list(list&& other)
+  //
+  {
+    bufferlist bl(1);
+    bl.append('A');
+    bufferlist copy = std::move(bl);
+    ASSERT_EQ(0U, bl.length());
+    ASSERT_EQ(1U, copy.length());
+    ASSERT_EQ('A', copy[0]);
+  }
 }
 
 TEST(BufferList, operator_equal) {
@@ -1082,22 +1166,52 @@ TEST(BufferList, buffers) {
 }
 
 TEST(BufferList, get_contiguous) {
-  bufferptr a("foobarbaz", 9);
-  bufferptr b("123456789", 9);
-  bufferptr c("ABCDEFGHI", 9);
-  bufferlist bl;
-  ASSERT_EQ(0, bl.get_contiguous(0, 0));
+  {
+    bufferptr a("foobarbaz", 9);
+    bufferptr b("123456789", 9);
+    bufferptr c("ABCDEFGHI", 9);
+    bufferlist bl;
+    ASSERT_EQ(0, bl.get_contiguous(0, 0));
+
+    bl.append(a);
+    bl.append(b);
+    bl.append(c);
+    ASSERT_EQ(3u, bl.buffers().size());
+    ASSERT_EQ(0, memcmp("bar", bl.get_contiguous(3, 3), 3));
+    ASSERT_EQ(0, memcmp("456", bl.get_contiguous(12, 3), 3));
+    ASSERT_EQ(0, memcmp("ABC", bl.get_contiguous(18, 3), 3));
+    ASSERT_EQ(3u, bl.buffers().size());
+    ASSERT_EQ(0, memcmp("789ABC", bl.get_contiguous(15, 6), 6));
+    ASSERT_EQ(2u, bl.buffers().size());
+  }
+
+  {
+    bufferptr a("foobarbaz", 9);
+    bufferptr b("123456789", 9);
+    bufferptr c("ABCDEFGHI", 9);
+    bufferlist bl;
 
-  bl.append(a);
-  bl.append(b);
-  bl.append(c);
-  ASSERT_EQ(3u, bl.buffers().size());
-  ASSERT_EQ(0, memcmp("bar", bl.get_contiguous(3, 3), 3));
-  ASSERT_EQ(0, memcmp("456", bl.get_contiguous(12, 3), 3));
-  ASSERT_EQ(0, memcmp("ABC", bl.get_contiguous(18, 3), 3));
-  ASSERT_EQ(3u, bl.buffers().size());
-  ASSERT_EQ(0, memcmp("789ABC", bl.get_contiguous(15, 6), 6));
-  ASSERT_LT(bl.buffers().size(), 3u);
+    bl.append(a);
+    bl.append(b);
+    bl.append(c);
+
+    ASSERT_EQ(0, memcmp("789ABCDEFGHI", bl.get_contiguous(15, 12), 12));
+    ASSERT_EQ(2u, bl.buffers().size());
+  }
+
+  {
+    bufferptr a("foobarbaz", 9);
+    bufferptr b("123456789", 9);
+    bufferptr c("ABCDEFGHI", 9);
+    bufferlist bl;
+
+    bl.append(a);
+    bl.append(b);
+    bl.append(c);
+
+    ASSERT_EQ(0, memcmp("z123456789AB", bl.get_contiguous(8, 12), 12));
+    ASSERT_EQ(1u, bl.buffers().size());
+  }
 }
 
 TEST(BufferList, swap) {
@@ -1403,12 +1517,15 @@ TEST(BufferList, rebuild) {
   {
     bufferlist bl;
     bufferptr ptr(buffer::create_page_aligned(2));
+    ptr[0] = 'X';
+    ptr[1] = 'Y';
     ptr.set_offset(1);
     ptr.set_length(1);
     bl.append(ptr);
     EXPECT_FALSE(bl.is_page_aligned());
     bl.rebuild();
-    EXPECT_FALSE(bl.is_page_aligned());
+    EXPECT_EQ(1U, bl.length());
+    EXPECT_EQ('Y', *bl.begin());
   }
   {
     bufferlist bl;
@@ -1416,7 +1533,7 @@ TEST(BufferList, rebuild) {
     bl.append(str.c_str(), str.size());
     bl.append(str.c_str(), str.size());
     EXPECT_EQ((unsigned)2, bl.buffers().size());
-    EXPECT_TRUE(bl.is_page_aligned());
+    EXPECT_TRUE(bl.is_aligned(CEPH_BUFFER_APPEND_SIZE));
     bl.rebuild();
     EXPECT_TRUE(bl.is_page_aligned());
     EXPECT_EQ((unsigned)1, bl.buffers().size());
@@ -1663,7 +1780,7 @@ TEST(BufferList, append) {
     EXPECT_EQ((unsigned)0, bl.buffers().size());
     bl.append('A');
     EXPECT_EQ((unsigned)1, bl.buffers().size());
-    EXPECT_TRUE(bl.is_page_aligned());
+    EXPECT_TRUE(bl.is_aligned(CEPH_BUFFER_APPEND_SIZE));
   }
   //
   // void append(const char *data, unsigned len);
@@ -1848,6 +1965,18 @@ TEST(BufferList, splice) {
   bl.splice(4, 4);
   EXPECT_EQ((unsigned)4, bl.length());
   EXPECT_EQ(0, ::memcmp("ABCD", bl.c_str(), bl.length()));
+
+  {
+    bl.clear();
+    bufferptr ptr1("0123456789", 10);
+    bl.push_back(ptr1);
+    bufferptr ptr2("abcdefghij", 10);
+    bl.append(ptr2, 5, 5);
+    other.clear();
+    bl.splice(10, 4, &other);
+    EXPECT_EQ((unsigned)11, bl.length());
+    EXPECT_EQ(0, ::memcmp("fghi", other.c_str(), other.length()));
+  }
 }
 
 TEST(BufferList, write) {
@@ -1917,7 +2046,7 @@ TEST(BufferList, read_fd) {
   EXPECT_EQ(-EBADF, bl.read_fd(fd, len));
   fd = ::open(FILENAME, O_RDONLY);
   EXPECT_EQ(len, (unsigned)bl.read_fd(fd, len));
-  EXPECT_EQ(CEPH_PAGE_SIZE - len, bl.buffers().front().unused_tail_length());
+  EXPECT_EQ(CEPH_BUFFER_APPEND_SIZE - len, bl.buffers().front().unused_tail_length());
   EXPECT_EQ(len, bl.length());
   ::close(fd);
   ::unlink(FILENAME);
@@ -2356,6 +2485,46 @@ TEST(BufferList, TestCopyAll) {
   ASSERT_EQ(memcmp(big.get(), big2.get(), BIG_SZ), 0);
 }
 
+TEST(BufferList, InvalidateCrc) {
+  const static size_t buffer_size = 262144;
+  ceph::shared_ptr <unsigned char> big(
+      (unsigned char*)malloc(buffer_size), free);
+  unsigned char c = 0;
+  char* ptr = (char*) big.get();
+  char* inptr;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    ptr[i] = c++;
+  }
+  bufferlist bl;
+  
+  // test for crashes (shouldn't crash)
+  bl.invalidate_crc();
+  
+  // put data into bufferlist
+  bl.append((const char*)big.get(), buffer_size);
+  
+  // get its crc
+  __u32 crc = bl.crc32c(0);
+  
+  // modify data in bl without its knowledge
+  inptr = (char*) bl.c_str();
+  c = 0;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    inptr[i] = c--;
+  }
+  
+  // make sure data in bl are now different than in big
+  EXPECT_NE(memcmp((void*) ptr, (void*) inptr, buffer_size), 0);
+  
+  // crc should remain the same
+  __u32 new_crc = bl.crc32c(0);
+  EXPECT_EQ(crc, new_crc);
+  
+  // force crc invalidate, check if it is updated
+  bl.invalidate_crc();
+  EXPECT_NE(crc, bl.crc32c(0));
+}
+
 TEST(BufferHash, all) {
   {
     bufferlist bl;
diff --git a/src/test/centos-6/ceph.spec.in b/src/test/centos-6/ceph.spec.in
index 140e0e3..8f2a6fc 100644
--- a/src/test/centos-6/ceph.spec.in
+++ b/src/test/centos-6/ceph.spec.in
@@ -1,13 +1,55 @@
+# vim: set noexpandtab ts=8 sw=8 :
 %bcond_with ocf
 %bcond_without cephfs_java
+%bcond_with tests
+%bcond_without tcmalloc
+%bcond_without libs_compat
+%bcond_with lowmem_builder
+%if 0%{?fedora} || 0%{?rhel}
+%bcond_without selinux
+%endif
+%if 0%{?suse_version}
+%bcond_with selinux
+%endif
+
 
-%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
+%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%if %{with selinux}
+# get selinux policy version
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null || echo 0.0.0)}
+
+%define relabel_files() \
+restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \
+restorecon -R /usr/bin/radosgw > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \
+restorecon -R /var/run/ceph > /dev/null 2>&1; \
+restorecon -R /var/lib/ceph > /dev/null 2>&1; \
+restorecon -R /var/log/ceph > /dev/null 2>&1;
+%endif
+
 %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
 
+# Use systemd files on RHEL 7 and above and in SUSE/openSUSE.
+# Note: We don't install unit files for the services yet. For now,
+# the _with_systemd variable only implies that we'll install
+# /etc/tmpfiles.d/ceph.conf in order to set up the socket directory in
+# /var/run/ceph.
+%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
+%global _with_systemd 1
+%endif
+
+# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
+%global _with_lttng 1
+%endif
+
 #################################################################################
 # common
 #################################################################################
@@ -16,51 +58,67 @@ Version:	@VERSION@
 Release:	@RPM_RELEASE@%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
-License:	GPL-2.0
-Group:		System Environment/Base
+License:	LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
 URL:		http://ceph.com/
 Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
-%if 0%{?fedora} || 0%{?centos} || 0%{?rhel}
+%if 0%{?fedora} || 0%{?rhel}
 Patch0:		init-ceph.in-fedora.patch
 %endif
+#################################################################################
+# dependencies that apply across all distro families
+#################################################################################
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
 Requires:	python-requests
-Requires:	python-flask
+Requires:	grep
 Requires:	xfsprogs
+Requires:	logrotate
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
 Requires:	cryptsetup
+Requires:	findutils
+Requires:	which
 Requires(post):	binutils
+%if 0%{with cephfs_java}
+BuildRequires:	java-devel
+BuildRequires:	sharutils
+%endif
+%if 0%{with selinux}
+BuildRequires:	checkpolicy
+BuildRequires:	selinux-policy-devel
+BuildRequires:	/usr/share/selinux/devel/policyhelp
+%endif
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if 0%{defined suse_version}
-BuildRequires:  libbz2-devel
-%else
-BuildRequires:  bzip2-devel
-%endif
+BuildRequires:  cmake
 BuildRequires:	cryptsetup
+BuildRequires:	fuse-devel
 BuildRequires:	gdbm
 BuildRequires:	hdparm
+BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	libaio-devel
 BuildRequires:	libcurl-devel
 BuildRequires:	libedit-devel
 BuildRequires:	libxml2-devel
-BuildRequires:	libuuid-devel
 BuildRequires:	libblkid-devel >= 2.17
 BuildRequires:	libudev-devel
 BuildRequires:	libtool
-BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	make
-BuildRequires:	perl
 BuildRequires:	parted
+BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
 BuildRequires:	python-nose
@@ -72,46 +130,86 @@ BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?suse_version}
-BuildRequires:	net-tools
-%endif
 
 #################################################################################
-# specific
+# distro-conditional dependencies
 #################################################################################
-%if ! 0%{?rhel} || 0%{?fedora}
-BuildRequires:	sharutils
+%if 0%{?suse_version}
+%if 0%{?_with_systemd}
+BuildRequires:  pkgconfig(systemd)
+BuildRequires:	systemd-rpm-macros
+%{?systemd_requires}
 %endif
-
-%if 0%{defined suse_version}
+PreReq:		%fillup_prereq
+Requires:	python-Flask
+BuildRequires:	net-tools
+BuildRequires:	libbz2-devel
 %if 0%{?suse_version} > 1210
 Requires:	gptfdisk
+%if 0%{with tcmalloc}
 BuildRequires:	gperftools-devel
+%endif
 %else
 Requires:	scsirastools
 BuildRequires:	google-perftools-devel
 %endif
-Recommends:	logrotate
-BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
 %else
-Requires:	gdisk
+%if 0%{?_with_systemd}
+Requires:	systemd
+%endif
+BuildRequires:  bzip2-devel
 BuildRequires:	nss-devel
 BuildRequires:	keyutils-libs-devel
 BuildRequires:	libatomic_ops-devel
 Requires:	gdisk
 Requires(post):	chkconfig
-Requires(preun):chkconfig
-Requires(preun):initscripts
+Requires(preun):	chkconfig
+Requires(preun):	initscripts
 BuildRequires:	gperftools-devel
+Requires:	python-flask
+%endif
+# boost
+%if 0%{?fedora} || 0%{?rhel} 
+BuildRequires:  boost-random
+%endif
+# python-argparse for distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+BuildRequires:	python-argparse
+%endif
+# lttng and babeltrace for rbd-replay-prep
+%if 0%{?_with_lttng}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires:	lttng-ust-devel
+BuildRequires:	libbabeltrace-devel
+%endif
+%if 0%{?suse_version}
+BuildRequires:	lttng-ust-devel
+BuildRequires:  babeltrace-devel
+%endif
+%endif
+# expat and fastcgi for RGW
+%if 0%{?suse_version}
+BuildRequires:	libexpat-devel
+BuildRequires:	FastCGI-devel
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+BuildRequires:	expat-devel
+BuildRequires:	fcgi-devel
+%endif
+# python-sphinx
+%if 0%{?rhel} > 0 && 0%{?rhel} < 7
+BuildRequires:	python-sphinx10
+%endif
+%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7
+BuildRequires:	python-sphinx
 %endif
 
 %description
-Ceph is a massively scalable, open-source, distributed
-storage system that runs on commodity hardware and delivers object,
-block and file system storage.
+Ceph is a massively scalable, open-source, distributed storage system that runs
+on commodity hardware and delivers object, block and file system storage.
 
 
 #################################################################################
@@ -126,13 +224,15 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{?rhel} || 0%{?fedora}
-Requires:  redhat-lsb-core
+%if 0%{?_with_systemd}
+%{?systemd_requires}
+%endif
+%if 0%{?suse_version}
+Requires(pre):	pwdutils
 %endif
 # python-argparse is only needed in distros with Python 2.6 or lower
 %if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
 Requires:	python-argparse
-BuildRequires:	python-argparse
 %endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
@@ -141,7 +241,6 @@ Common utilities to mount and interact with a ceph storage cluster.
 Summary:	Ceph fuse-based client
 Group:		System Environment/Base
 Requires:	%{name}
-BuildRequires:	fuse-devel
 %description fuse
 FUSE based client for Ceph distributed network file system
 
@@ -151,7 +250,6 @@ Group:		System Environment/Base
 Requires:	%{name}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
-BuildRequires:	fuse-devel
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
@@ -159,13 +257,11 @@ FUSE based client to map Ceph rbd images to files
 Summary:	Rados REST gateway
 Group:		Development/Libraries
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-BuildRequires:	libexpat-devel
-BuildRequires:	FastCGI-devel
-%else
-BuildRequires:	expat-devel
-BuildRequires:	fcgi-devel
+%if 0%{?rhel} || 0%{?fedora}
 Requires:	mailcap
 %endif
 %description radosgw
@@ -190,7 +286,7 @@ managers such as Pacemaker.
 Summary:	RADOS distributed object store client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librados2
@@ -223,7 +319,7 @@ object store.
 Summary:	RADOS striping interface
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-Requires:	librados2 = %{epoch}:%{version}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
 %description -n libradosstriper1
 Striping interface built on top of the rados library, allowing
 to stripe bigger objects onto several standard rados objects using
@@ -245,7 +341,7 @@ Summary:	RADOS block device client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librbd1
@@ -280,7 +376,7 @@ block device.
 Summary:	Ceph distributed file system client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 Obsoletes:	ceph-libcephfs
 %endif
@@ -312,41 +408,29 @@ Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
 This package contains Python libraries for interacting with Cephs distributed
 file system.
 
-%package -n rest-bench
-Summary:	RESTful benchmark
-Group:		System Environment/Libraries
-License:	LGPL-2.0
-Requires:	ceph-common = %{epoch}:%{version}-%{release}
-%description -n rest-bench
-RESTful bencher that can be used to benchmark radosgw performance.
-
 %package -n ceph-test
 Summary:	Ceph benchmarks and test tools
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	ceph-common
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-BuildRequires:	lttng-ust-devel
-BuildRequires:	libbabeltrace-devel
-%endif
+Requires:	xmlstarlet
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
 %if 0%{with cephfs_java}
 
 %package -n libcephfs_jni1
-Summary:	Java Native Interface library for CephFS Java bindings.
+Summary:	Java Native Interface library for CephFS Java bindings
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %description -n libcephfs_jni1
 This package contains the Java Native Interface library for CephFS Java
 bindings.
 
 %package -n libcephfs_jni1-devel
-Summary:	Development files for CephFS Java Native Interface library.
+Summary:	Development files for CephFS Java Native Interface library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
@@ -357,12 +441,11 @@ This package contains the development files for CephFS Java Native Interface
 library.
 
 %package -n cephfs-java
-Summary:	Java libraries for the Ceph File System.
+Summary:	Java libraries for the Ceph File System
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %if 0%{?el6}
 Requires:	junit4
 BuildRequires:	junit4
@@ -375,8 +458,26 @@ This package contains the Java libraries for the Ceph File System.
 
 %endif
 
+%if 0%{with selinux}
+
+%package selinux
+Summary:	SELinux support for Ceph MON, OSD and MDS
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	policycoreutils, libselinux-utils
+Requires(post): selinux-policy-base >= %{_selinux_policy_version}, policycoreutils, gawk
+Requires(postun): policycoreutils
+%description selinux
+This package contains SELinux support for Ceph MON, OSD and MDS. The package
+also performs file-system relabelling which can take a long time on heavily
+populated file-systems.
+
+%endif
+
+%if 0%{with libs_compat}
+
 %package libs-compat
-Summary:	Meta package to include ceph libraries.
+Summary:	Meta package to include ceph libraries
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Obsoletes:	ceph-libs
@@ -392,6 +493,8 @@ former ceph-libs package, which is now split up into these three subpackages.
 Packages still depending on ceph-libs should be fixed to depend on librados2,
 librbd1 or libcephfs1 instead.
 
+%endif
+
 %package devel-compat
 Summary:	Compatibility package for Ceph headers
 Group:		Development/Libraries
@@ -427,16 +530,12 @@ python-rados, python-rbd and python-cephfs. Packages still depending on
 python-ceph should be fixed to depend on python-rados, python-rbd or
 python-cephfs instead.
 
-%if 0%{?opensuse} || 0%{?suse_version}
-%debug_package
-%endif
-
 #################################################################################
 # common
 #################################################################################
 %prep
 %setup -q
-%if 0%{?fedora} || 0%{?rhel} || 0%{?centos}
+%if 0%{?fedora} || 0%{?rhel}
 %patch0 -p1 -b .init
 %endif
 
@@ -449,53 +548,91 @@ done
 %endif
 
 ./autogen.sh
-MY_CONF_OPT=""
-
-MY_CONF_OPT="$MY_CONF_OPT --with-radosgw"
 
+%if %{with lowmem_builder}
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS --param ggc-min-expand=20 --param ggc-min-heapsize=32768"
+%endif
 export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 
 %{configure}	CPPFLAGS="$java_inc" \
 		--prefix=/usr \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?_with_systemd}
+		--with-systemdsystemunitdir=%_unitdir \
+%endif
 		--docdir=%{_docdir}/ceph \
+		--with-man-pages \
+		--mandir="%_mandir" \
 		--with-nss \
 		--without-cryptopp \
-		--with-rest-bench \
 		--with-debug \
 %if 0%{with cephfs_java}
 		--enable-cephfs-java \
 %endif
+%if 0%{with selinux}
+		--with-selinux \
+%endif
 		--with-librocksdb-static=check \
-		$MY_CONF_OPT \
+%if 0%{?rhel} || 0%{?fedora}
+		--with-systemd-libexec-dir=/usr/libexec/ceph \
+		--with-rgw-user=root \
+		--with-rgw-group=root \
+%endif
+%if 0%{?suse_version}
+		--with-systemd-libexec-dir=/usr/lib/ceph/ \
+		--with-rgw-user=wwwrun \
+		--with-rgw-group=www \
+%endif
+		--with-radosgw \
+		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
+		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
-# fix bug in specific version of libedit-devel
-%if 0%{defined suse_version}
-sed -i -e "s/-lcurses/-lncurses/g" Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" man/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/ocf/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/java/Makefile
+
+make %{?_smp_mflags}
+
+
+%if 0%{with tests}
+%check
+# run in-tree unittests
+make %{?_smp_mflags} check-local
+
 %endif
 
-make -j$(getconf _NPROCESSORS_ONLN)
+
 
 %install
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
+install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+%if 0%{?fedora} || 0%{?rhel}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillup-templates/sysconfig.%{name}
+%endif
+%if 0%{?_with_systemd}
+  install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
+  install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
+  install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
+  install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
+  install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
+  install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
+  install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
+%else
+  install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
+  install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+  ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
+  ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
-ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
-ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
 install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
-install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
@@ -523,40 +660,89 @@ mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph
 
 #set up placeholder directories
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph
+%if ! 0%{?_with_systemd}
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mon
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
-mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw
 
 %clean
 rm -rf $RPM_BUILD_ROOT
 
+%pre
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    # service_add_pre and friends don't work with parameterized systemd service
+    # instances, only with single services or targets, so we always pass
+    # ceph.target to these macros
+    %service_add_pre ceph.target
+  %endif
+%endif
+
+
 %post
 /sbin/ldconfig
-/sbin/chkconfig --add ceph
-mkdir -p %{_localstatedir}/run/ceph/
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %fillup_only
+    %service_add_post ceph.target
+  %endif
+%else
+  /sbin/chkconfig --add ceph
+%endif
 
 %preun
-%if %{defined suse_version}
-%stop_on_removal ceph
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %service_del_preun ceph.target
+  %endif
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%else
+  %if 0%{?rhel} || 0%{?fedora}
+    if [ $1 = 0 ] ; then
+      /sbin/service ceph stop >/dev/null 2>&1
+      /sbin/chkconfig --del ceph
+    fi
+  %endif
 %endif
-if [ $1 = 0 ] ; then
-    /sbin/service ceph stop >/dev/null 2>&1
-    /sbin/chkconfig --del ceph
-fi
 
 %postun
 /sbin/ldconfig
-%if %{defined suse_version}
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
 
-
 #################################################################################
 # files
 #################################################################################
@@ -578,16 +764,26 @@ fi
 %{_bindir}/ceph-mds
 %{_bindir}/ceph-objectstore-tool
 %{_bindir}/ceph-osd
+%{_bindir}/ceph-detect-init
 %{_bindir}/librados-config
 %{_bindir}/ceph-client-debug
 %{_bindir}/cephfs-journal-tool
 %{_bindir}/cephfs-table-tool
+%{_bindir}/cephfs-data-scan
 %{_bindir}/ceph-debugpack
 %{_bindir}/ceph-coverage
+%if 0%{?_with_systemd}
+%{_unitdir}/ceph-mds at .service
+%{_unitdir}/ceph-mon at .service
+%{_unitdir}/ceph-create-keys at .service
+%{_unitdir}/ceph-osd at .service
+%{_unitdir}/ceph-radosgw at .service
+%{_unitdir}/ceph-disk at .service
+%{_unitdir}/ceph.target
+%else
 %{_initrddir}/ceph
+%endif
 %{_sbindir}/ceph-disk
-%{_sbindir}/ceph-disk-activate
-%{_sbindir}/ceph-disk-prepare
 %{_sbindir}/ceph-disk-udev
 %{_sbindir}/ceph-create-keys
 %{_sbindir}/rcceph
@@ -600,8 +796,10 @@ fi
 %{_libdir}/ceph/ceph_common.sh
 %{_libexecdir}/ceph/ceph-osd-prestart.sh
 %dir %{_libdir}/rados-classes
+%{_libdir}/rados-classes/libcls_cephfs.so*
 %{_libdir}/rados-classes/libcls_rbd.so*
 %{_libdir}/rados-classes/libcls_hello.so*
+%{_libdir}/rados-classes/libcls_numops.so*
 %{_libdir}/rados-classes/libcls_rgw.so*
 %{_libdir}/rados-classes/libcls_lock.so*
 %{_libdir}/rados-classes/libcls_kvs.so*
@@ -609,19 +807,30 @@ fi
 %{_libdir}/rados-classes/libcls_log.so*
 %{_libdir}/rados-classes/libcls_replica_log.so*
 %{_libdir}/rados-classes/libcls_statelog.so*
+%{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
+%if 0%{?_with_lttng}
+%{_libdir}/libos_tp.so*
+%{_libdir}/libosd_tp.so*
+%endif
 %{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 %{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
+%if 0%{?fedora} || 0%{?rhel}
+%config(noreplace) %{_sysconfdir}/sysconfig/ceph
+%endif
 %if 0%{?suse_version}
+%{_localstatedir}/adm/fillup-templates/sysconfig.*
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
 %endif
+%{python_sitelib}/ceph_detect_init*
 %{_mandir}/man8/ceph-deploy.8*
+%{_mandir}/man8/ceph-detect-init.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-create-keys.8*
 %{_mandir}/man8/ceph-mon.8*
@@ -638,14 +847,16 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/librados-config.8*
 #set up placeholder directories
-%dir %{_localstatedir}/lib/ceph/
-%dir %{_localstatedir}/lib/ceph/tmp
-%dir %{_localstatedir}/lib/ceph/mon
-%dir %{_localstatedir}/lib/ceph/osd
-%dir %{_localstatedir}/lib/ceph/mds
-%dir %{_localstatedir}/lib/ceph/bootstrap-osd
-%dir %{_localstatedir}/lib/ceph/bootstrap-mds
-%ghost %dir %{_localstatedir}/run/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw
+%if ! 0%{?_with_systemd}
+%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph
+%endif
 
 #################################################################################
 %files -n ceph-common
@@ -659,8 +870,16 @@ fi
 %{_bindir}/ceph-crush-location
 %{_bindir}/rados
 %{_bindir}/rbd
+%{_bindir}/rbd-replay
+%{_bindir}/rbd-replay-many
+%if 0%{?_with_lttng}
+%{_bindir}/rbd-replay-prep
+%endif
 %{_bindir}/ceph-post-file
 %{_bindir}/ceph-brag
+%if 0%{?_with_systemd}
+%{_tmpfilesdir}/ceph-common.conf
+%endif
 %{_mandir}/man8/ceph-authtool.8*
 %{_mandir}/man8/ceph-conf.8*
 %{_mandir}/man8/ceph-dencoder.8*
@@ -670,17 +889,46 @@ fi
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbd-replay.8*
+%{_mandir}/man8/rbd-replay-many.8*
+%{_mandir}/man8/rbd-replay-prep.8*
 %{_datadir}/ceph/known_hosts_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com.pub
 %dir %{_sysconfdir}/ceph/
-%dir %{_localstatedir}/log/ceph/
+%dir %{_datarootdir}/ceph/
+%dir %{_libexecdir}/ceph/
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
+%{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
+%attr(3770,ceph,ceph) %dir %{_localstatedir}/log/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
+
+%pre -n ceph-common
+CEPH_GROUP_ID=""
+CEPH_USER_ID=""
+%if 0%{?rhel} || 0%{?fedora}
+CEPH_GROUP_ID="-g 167"
+CEPH_USER_ID="-u 167"
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%endif
+%if 0%{?suse_version}
+getent group ceph >/dev/null || groupadd -r ceph
+getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+%endif
+exit 0
+
+%post -n ceph-common
+%if 0%{?_with_systemd}
+systemd-tmpfiles --create --prefix=/run/ceph
+%endif
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -709,38 +957,62 @@ fi
 #################################################################################
 %files radosgw
 %defattr(-,root,root,-)
-%{_initrddir}/ceph-radosgw
 %{_bindir}/radosgw
 %{_bindir}/radosgw-admin
+%{_bindir}/radosgw-object-expirer
 %{_mandir}/man8/radosgw.8*
 %{_mandir}/man8/radosgw-admin.8*
-%{_sbindir}/rcceph-radosgw
-%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
 %config %{_sysconfdir}/bash_completion.d/radosgw-admin
-%dir %{_localstatedir}/log/radosgw/
+%dir %{_localstatedir}/lib/ceph/radosgw
+%if 0%{?_with_systemd}
+%else
+%{_initrddir}/ceph-radosgw
+%{_sbindir}/rcceph-radosgw
+%endif
 
 %post radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%fillup_and_insserv -f -y ceph-radosgw
+%if 0%{?suse_version}
+  # explicit systemctl daemon-reload (that's the only relevant bit of
+  # service_add_post; the rest is all sysvinit --> systemd migration which
+  # isn't applicable in this context (see above comment).
+  /usr/bin/systemctl daemon-reload >/dev/null 2>&1 || :
 %endif
 
 %preun radosgw
-%if %{defined suse_version}
-%stop_on_removal ceph-radosgw
+%if 0%{?_with_systemd}
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
 %endif
 
 %postun radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%restart_on_update ceph-radosgw
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
-# Package removal cleanup
-if [ "$1" -eq "0" ] ; then
-    rm -rf /var/log/radosgw
-fi
-
 
 #################################################################################
 %if %{with ocf}
@@ -756,6 +1028,9 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so.*
+%endif
 
 %post -n librados2
 /sbin/ldconfig
@@ -776,6 +1051,9 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so
+%endif
 
 #################################################################################
 %files -n python-rados
@@ -805,6 +1083,9 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so.*
+%endif
 
 %post -n librbd1
 /sbin/ldconfig
@@ -822,6 +1103,9 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so
+%endif
 
 #################################################################################
 %files -n python-rbd
@@ -852,11 +1136,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{python_sitelib}/cephfs.py*
 
 #################################################################################
-%files -n rest-bench
-%defattr(-,root,root,-)
-%{_bindir}/rest-bench
-
-#################################################################################
 %files -n ceph-test
 %defattr(-,root,root,-)
 %{_bindir}/ceph_bench_log
@@ -865,7 +1144,11 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph_erasure_code
 %{_bindir}/ceph_erasure_code_benchmark
 %{_bindir}/ceph_omapbench
+%{_bindir}/ceph_objectstore_bench
 %{_bindir}/ceph_perf_objectstore
+%{_bindir}/ceph_perf_local
+%{_bindir}/ceph_perf_msgr_client
+%{_bindir}/ceph_perf_msgr_server
 %{_bindir}/ceph_psim
 %{_bindir}/ceph_radosacl
 %{_bindir}/ceph_rgw_jsonparser
@@ -883,14 +1166,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph-monstore-tool
 %{_bindir}/ceph-osdomap-tool
 %{_bindir}/ceph-kvstore-tool
-%{_mandir}/man8/rbd-replay.8*
-%{_mandir}/man8/rbd-replay-many.8*
-%{_mandir}/man8/rbd-replay-prep.8*
-%{_bindir}/rbd-replay
-%{_bindir}/rbd-replay-many
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-%{_bindir}/rbd-replay-prep
-%endif
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph-monstore-update-crush.sh
 
 #################################################################################
 %if 0%{with cephfs_java}
@@ -898,6 +1175,12 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
 
+%post -n libcephfs_jni1
+/sbin/ldconfig
+
+%postun -n libcephfs_jni1
+/sbin/ldconfig
+
 #################################################################################
 %files -n libcephfs_jni1-devel
 %defattr(-,root,root,-)
@@ -911,6 +1194,111 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with selinux}
+%files selinux
+%defattr(-,root,root,-)
+%attr(0600,root,root) %{_datadir}/selinux/packages/ceph.pp
+%{_datadir}/selinux/devel/include/contrib/ceph.if
+%{_mandir}/man8/ceph_selinux.8*
+
+%post selinux
+# Install the policy
+OLD_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+%{_sbindir}/semodule -n -i %{_datadir}/selinux/packages/ceph.pp
+NEW_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+
+# Load the policy if SELinux is enabled
+if %{_sbindir}/selinuxenabled; then
+    %{_sbindir}/load_policy
+else
+    # Do not relabel if selinux is not enabled
+    exit 0
+fi
+
+if test "$OLD_POLVER" == "$NEW_POLVER"; then
+   # Do not relabel if policy version did not change
+   exit 0
+fi
+
+# Check whether the daemons are running
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph status >/dev/null 2>&1
+%endif
+STATUS=$?
+
+# Stop the daemons if they were running
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph stop >/dev/null 2>&1
+%endif
+fi
+
+# Now, relabel the files
+%relabel_files
+
+# Start the daemons iff they were running before
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+%else
+    /sbin/service ceph start >/dev/null 2>&1 || :
+%endif
+fi
+
+exit 0
+
+%postun selinux
+if [ $1 -eq 0 ]; then
+    # Remove the module
+    %{_sbindir}/semodule -n -r ceph
+
+    # Reload the policy if SELinux is enabled
+    if %{_sbindir}/selinuxenabled ; then
+        %{_sbindir}/load_policy
+    else
+        # Do not relabel if SELinux is not enabled
+        exit 0
+    fi
+
+    # Check whether the daemons are running
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph status >/dev/null 2>&1
+    %endif
+    STATUS=$?
+
+    # Stop the daemons if they were running
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph stop >/dev/null 2>&1
+    %endif
+    fi
+
+    # Now, relabel the files
+    %relabel_files
+
+    # Start the daemons if they were running before
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+	/usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+    %else
+	/sbin/service ceph start >/dev/null 2>&1 || :
+    %endif
+    fi
+fi
+exit 0
+
+%endif # with selinux
+
+#################################################################################
+%if 0%{with libs_compat}
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
@@ -919,6 +1307,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
+%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/src/test/centos-6/install-deps.sh b/src/test/centos-6/install-deps.sh
index 129b238..1bebf09 100755
--- a/src/test/centos-6/install-deps.sh
+++ b/src/test/centos-6/install-deps.sh
@@ -1,8 +1,8 @@
-#!/bin/bash
+#!/bin/bash -e
 #
 # Ceph distributed storage system
 #
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -23,10 +23,14 @@ if test -f /etc/redhat-release ; then
     $SUDO yum install -y redhat-lsb-core
 fi
 
-if which apt-get > /dev/null ; then
+if type apt-get > /dev/null 2>&1 ; then
     $SUDO apt-get install -y lsb-release
 fi
 
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
 case $(lsb_release -si) in
 Ubuntu|Debian|Devuan)
         $SUDO apt-get install -y dpkg-dev
@@ -38,30 +42,106 @@ Ubuntu|Debian|Devuan)
         packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
             perl -p -e 's/.*Unmet build dependencies: *//;' \
             -e 's/build-essential:native/build-essential/;' \
-            -e 's/\|//g;' \
+            -e 's/\s*\|\s*/\|/g;' \
             -e 's/\(.*?\)//g;' \
             -e 's/ +/\n/g;' | sort)
         case $(lsb_release -sc) in
             squeeze|wheezy)
                 packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
                 ;;
         esac
         packages=$(echo $packages) # change newlines into spaces
-        $SUDO bash -c "DEBIAN_FRONTEND=noninteractive apt-get install -y $packages"
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
         ;;
-CentOS|Fedora|SUSE*|RedHatEnterpriseServer)
+CentOS|Fedora|RedHatEnterpriseServer)
         case $(lsb_release -si) in
-            SUSE*)
-                $SUDO zypper -y yum-utils
+            Fedora)
+                $SUDO yum install -y yum-utils
                 ;;
-            *)
+            CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
-        $SUDO yum-builddep -y $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
         ;;
 *)
         echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
         ;;
 esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/src/test/centos-7/Dockerfile.in b/src/test/centos-7/Dockerfile.in
index dfde99a..19fe1aa 100644
--- a/src/test/centos-7/Dockerfile.in
+++ b/src/test/centos-7/Dockerfile.in
@@ -23,7 +23,7 @@ FROM centos:%%os_version%%
 COPY install-deps.sh /root/
 COPY ceph.spec.in /root/
 # http://jperrin.github.io/centos/2014/09/25/centos-docker-and-systemd/
-RUN yum -y swap -- remove fakesystemd -- install systemd systemd-libs && (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done) && rm -f /lib/systemd/system/multi-user.target.wants/* && rm -f /etc/systemd/system/*.wants/* && rm -f /lib/systemd/system/local-fs.target.wants/* && rm -f /lib/systemd/system/sockets.target.wants/*udev* && rm -f /lib/systemd/system/sockets.target.wants/*initctl* && rm -f /lib/systemd/system/basi [...]
+RUN yum -y swap -- remove fakesystemd systemd-libs systemd-container -- install systemd systemd-libs && (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done) && rm -f /lib/systemd/system/multi-user.target.wants/* && rm -f /etc/systemd/system/*.wants/* && rm -f /lib/systemd/system/local-fs.target.wants/* && rm -f /lib/systemd/system/sockets.target.wants/*udev* && rm -f /lib/systemd/system/sockets.target.wants/*initctl* && [...]
 RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/7/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7 && rm /etc/yum.repos.d/dl.fedoraproject.org*
 # build dependencies
 RUN cd /root ; ./install-deps.sh
diff --git a/src/test/centos-7/ceph.spec.in b/src/test/centos-7/ceph.spec.in
index 140e0e3..8f2a6fc 100644
--- a/src/test/centos-7/ceph.spec.in
+++ b/src/test/centos-7/ceph.spec.in
@@ -1,13 +1,55 @@
+# vim: set noexpandtab ts=8 sw=8 :
 %bcond_with ocf
 %bcond_without cephfs_java
+%bcond_with tests
+%bcond_without tcmalloc
+%bcond_without libs_compat
+%bcond_with lowmem_builder
+%if 0%{?fedora} || 0%{?rhel}
+%bcond_without selinux
+%endif
+%if 0%{?suse_version}
+%bcond_with selinux
+%endif
+
 
-%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
+%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
 %{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
 %{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
 %endif
 
+%if %{with selinux}
+# get selinux policy version
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null || echo 0.0.0)}
+
+%define relabel_files() \
+restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \
+restorecon -R /usr/bin/radosgw > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \
+restorecon -R /var/run/ceph > /dev/null 2>&1; \
+restorecon -R /var/lib/ceph > /dev/null 2>&1; \
+restorecon -R /var/log/ceph > /dev/null 2>&1;
+%endif
+
 %{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
 
+# Use systemd files on RHEL 7 and above and in SUSE/openSUSE.
+# Note: We don't install unit files for the services yet. For now,
+# the _with_systemd variable only implies that we'll install
+# /etc/tmpfiles.d/ceph.conf in order to set up the socket directory in
+# /var/run/ceph.
+%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
+%global _with_systemd 1
+%endif
+
+# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
+%global _with_lttng 1
+%endif
+
 #################################################################################
 # common
 #################################################################################
@@ -16,51 +58,67 @@ Version:	@VERSION@
 Release:	@RPM_RELEASE@%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
-License:	GPL-2.0
-Group:		System Environment/Base
+License:	LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
 URL:		http://ceph.com/
 Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
-%if 0%{?fedora} || 0%{?centos} || 0%{?rhel}
+%if 0%{?fedora} || 0%{?rhel}
 Patch0:		init-ceph.in-fedora.patch
 %endif
+#################################################################################
+# dependencies that apply across all distro families
+#################################################################################
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python
 Requires:	python-requests
-Requires:	python-flask
+Requires:	grep
 Requires:	xfsprogs
+Requires:	logrotate
 Requires:	parted
 Requires:	util-linux
 Requires:	hdparm
 Requires:	cryptsetup
+Requires:	findutils
+Requires:	which
 Requires(post):	binutils
+%if 0%{with cephfs_java}
+BuildRequires:	java-devel
+BuildRequires:	sharutils
+%endif
+%if 0%{with selinux}
+BuildRequires:	checkpolicy
+BuildRequires:	selinux-policy-devel
+BuildRequires:	/usr/share/selinux/devel/policyhelp
+%endif
 BuildRequires:	gcc-c++
 BuildRequires:	boost-devel
-%if 0%{defined suse_version}
-BuildRequires:  libbz2-devel
-%else
-BuildRequires:  bzip2-devel
-%endif
+BuildRequires:  cmake
 BuildRequires:	cryptsetup
+BuildRequires:	fuse-devel
 BuildRequires:	gdbm
 BuildRequires:	hdparm
+BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	libaio-devel
 BuildRequires:	libcurl-devel
 BuildRequires:	libedit-devel
 BuildRequires:	libxml2-devel
-BuildRequires:	libuuid-devel
 BuildRequires:	libblkid-devel >= 2.17
 BuildRequires:	libudev-devel
 BuildRequires:	libtool
-BuildRequires:	leveldb-devel > 1.2
 BuildRequires:	make
-BuildRequires:	perl
 BuildRequires:	parted
+BuildRequires:	perl
 BuildRequires:	pkgconfig
 BuildRequires:	python
 BuildRequires:	python-nose
@@ -72,46 +130,86 @@ BuildRequires:	xfsprogs
 BuildRequires:	xfsprogs-devel
 BuildRequires:	xmlstarlet
 BuildRequires:	yasm
-%if 0%{?suse_version}
-BuildRequires:	net-tools
-%endif
 
 #################################################################################
-# specific
+# distro-conditional dependencies
 #################################################################################
-%if ! 0%{?rhel} || 0%{?fedora}
-BuildRequires:	sharutils
+%if 0%{?suse_version}
+%if 0%{?_with_systemd}
+BuildRequires:  pkgconfig(systemd)
+BuildRequires:	systemd-rpm-macros
+%{?systemd_requires}
 %endif
-
-%if 0%{defined suse_version}
+PreReq:		%fillup_prereq
+Requires:	python-Flask
+BuildRequires:	net-tools
+BuildRequires:	libbz2-devel
 %if 0%{?suse_version} > 1210
 Requires:	gptfdisk
+%if 0%{with tcmalloc}
 BuildRequires:	gperftools-devel
+%endif
 %else
 Requires:	scsirastools
 BuildRequires:	google-perftools-devel
 %endif
-Recommends:	logrotate
-BuildRequires:	%insserv_prereq
 BuildRequires:	mozilla-nss-devel
 BuildRequires:	keyutils-devel
 BuildRequires:	libatomic-ops-devel
 %else
-Requires:	gdisk
+%if 0%{?_with_systemd}
+Requires:	systemd
+%endif
+BuildRequires:  bzip2-devel
 BuildRequires:	nss-devel
 BuildRequires:	keyutils-libs-devel
 BuildRequires:	libatomic_ops-devel
 Requires:	gdisk
 Requires(post):	chkconfig
-Requires(preun):chkconfig
-Requires(preun):initscripts
+Requires(preun):	chkconfig
+Requires(preun):	initscripts
 BuildRequires:	gperftools-devel
+Requires:	python-flask
+%endif
+# boost
+%if 0%{?fedora} || 0%{?rhel} 
+BuildRequires:  boost-random
+%endif
+# python-argparse for distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+BuildRequires:	python-argparse
+%endif
+# lttng and babeltrace for rbd-replay-prep
+%if 0%{?_with_lttng}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires:	lttng-ust-devel
+BuildRequires:	libbabeltrace-devel
+%endif
+%if 0%{?suse_version}
+BuildRequires:	lttng-ust-devel
+BuildRequires:  babeltrace-devel
+%endif
+%endif
+# expat and fastcgi for RGW
+%if 0%{?suse_version}
+BuildRequires:	libexpat-devel
+BuildRequires:	FastCGI-devel
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+BuildRequires:	expat-devel
+BuildRequires:	fcgi-devel
+%endif
+# python-sphinx
+%if 0%{?rhel} > 0 && 0%{?rhel} < 7
+BuildRequires:	python-sphinx10
+%endif
+%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7
+BuildRequires:	python-sphinx
 %endif
 
 %description
-Ceph is a massively scalable, open-source, distributed
-storage system that runs on commodity hardware and delivers object,
-block and file system storage.
+Ceph is a massively scalable, open-source, distributed storage system that runs
+on commodity hardware and delivers object, block and file system storage.
 
 
 #################################################################################
@@ -126,13 +224,15 @@ Requires:	python-rados = %{epoch}:%{version}-%{release}
 Requires:	python-rbd = %{epoch}:%{version}-%{release}
 Requires:	python-cephfs = %{epoch}:%{version}-%{release}
 Requires:	python-requests
-%if 0%{?rhel} || 0%{?fedora}
-Requires:  redhat-lsb-core
+%if 0%{?_with_systemd}
+%{?systemd_requires}
+%endif
+%if 0%{?suse_version}
+Requires(pre):	pwdutils
 %endif
 # python-argparse is only needed in distros with Python 2.6 or lower
 %if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
 Requires:	python-argparse
-BuildRequires:	python-argparse
 %endif
 %description -n ceph-common
 Common utilities to mount and interact with a ceph storage cluster.
@@ -141,7 +241,6 @@ Common utilities to mount and interact with a ceph storage cluster.
 Summary:	Ceph fuse-based client
 Group:		System Environment/Base
 Requires:	%{name}
-BuildRequires:	fuse-devel
 %description fuse
 FUSE based client for Ceph distributed network file system
 
@@ -151,7 +250,6 @@ Group:		System Environment/Base
 Requires:	%{name}
 Requires:	librados2 = %{epoch}:%{version}-%{release}
 Requires:	librbd1 = %{epoch}:%{version}-%{release}
-BuildRequires:	fuse-devel
 %description -n rbd-fuse
 FUSE based client to map Ceph rbd images to files
 
@@ -159,13 +257,11 @@ FUSE based client to map Ceph rbd images to files
 Summary:	Rados REST gateway
 Group:		Development/Libraries
 Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-BuildRequires:	libexpat-devel
-BuildRequires:	FastCGI-devel
-%else
-BuildRequires:	expat-devel
-BuildRequires:	fcgi-devel
+%if 0%{?rhel} || 0%{?fedora}
 Requires:	mailcap
 %endif
 %description radosgw
@@ -190,7 +286,7 @@ managers such as Pacemaker.
 Summary:	RADOS distributed object store client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librados2
@@ -223,7 +319,7 @@ object store.
 Summary:	RADOS striping interface
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-Requires:	librados2 = %{epoch}:%{version}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
 %description -n libradosstriper1
 Striping interface built on top of the rados library, allowing
 to stripe bigger objects onto several standard rados objects using
@@ -245,7 +341,7 @@ Summary:	RADOS block device client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	librados2 = %{epoch}:%{version}-%{release}
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 %endif
 %description -n librbd1
@@ -280,7 +376,7 @@ block device.
 Summary:	Ceph distributed file system client library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora}
+%if 0%{?rhel} || 0%{?fedora}
 Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
 Obsoletes:	ceph-libcephfs
 %endif
@@ -312,41 +408,29 @@ Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
 This package contains Python libraries for interacting with Cephs distributed
 file system.
 
-%package -n rest-bench
-Summary:	RESTful benchmark
-Group:		System Environment/Libraries
-License:	LGPL-2.0
-Requires:	ceph-common = %{epoch}:%{version}-%{release}
-%description -n rest-bench
-RESTful bencher that can be used to benchmark radosgw performance.
-
 %package -n ceph-test
 Summary:	Ceph benchmarks and test tools
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	ceph-common
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-BuildRequires:	lttng-ust-devel
-BuildRequires:	libbabeltrace-devel
-%endif
+Requires:	xmlstarlet
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
 %if 0%{with cephfs_java}
 
 %package -n libcephfs_jni1
-Summary:	Java Native Interface library for CephFS Java bindings.
+Summary:	Java Native Interface library for CephFS Java bindings
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %description -n libcephfs_jni1
 This package contains the Java Native Interface library for CephFS Java
 bindings.
 
 %package -n libcephfs_jni1-devel
-Summary:	Development files for CephFS Java Native Interface library.
+Summary:	Development files for CephFS Java Native Interface library
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
@@ -357,12 +441,11 @@ This package contains the development files for CephFS Java Native Interface
 library.
 
 %package -n cephfs-java
-Summary:	Java libraries for the Ceph File System.
+Summary:	Java libraries for the Ceph File System
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Requires:	java
 Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
-BuildRequires:	java-devel
 %if 0%{?el6}
 Requires:	junit4
 BuildRequires:	junit4
@@ -375,8 +458,26 @@ This package contains the Java libraries for the Ceph File System.
 
 %endif
 
+%if 0%{with selinux}
+
+%package selinux
+Summary:	SELinux support for Ceph MON, OSD and MDS
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	policycoreutils, libselinux-utils
+Requires(post): selinux-policy-base >= %{_selinux_policy_version}, policycoreutils, gawk
+Requires(postun): policycoreutils
+%description selinux
+This package contains SELinux support for Ceph MON, OSD and MDS. The package
+also performs file-system relabelling which can take a long time on heavily
+populated file-systems.
+
+%endif
+
+%if 0%{with libs_compat}
+
 %package libs-compat
-Summary:	Meta package to include ceph libraries.
+Summary:	Meta package to include ceph libraries
 Group:		System Environment/Libraries
 License:	LGPL-2.0
 Obsoletes:	ceph-libs
@@ -392,6 +493,8 @@ former ceph-libs package, which is now split up into these three subpackages.
 Packages still depending on ceph-libs should be fixed to depend on librados2,
 librbd1 or libcephfs1 instead.
 
+%endif
+
 %package devel-compat
 Summary:	Compatibility package for Ceph headers
 Group:		Development/Libraries
@@ -427,16 +530,12 @@ python-rados, python-rbd and python-cephfs. Packages still depending on
 python-ceph should be fixed to depend on python-rados, python-rbd or
 python-cephfs instead.
 
-%if 0%{?opensuse} || 0%{?suse_version}
-%debug_package
-%endif
-
 #################################################################################
 # common
 #################################################################################
 %prep
 %setup -q
-%if 0%{?fedora} || 0%{?rhel} || 0%{?centos}
+%if 0%{?fedora} || 0%{?rhel}
 %patch0 -p1 -b .init
 %endif
 
@@ -449,53 +548,91 @@ done
 %endif
 
 ./autogen.sh
-MY_CONF_OPT=""
-
-MY_CONF_OPT="$MY_CONF_OPT --with-radosgw"
 
+%if %{with lowmem_builder}
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS --param ggc-min-expand=20 --param ggc-min-heapsize=32768"
+%endif
 export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 
 %{configure}	CPPFLAGS="$java_inc" \
 		--prefix=/usr \
 		--localstatedir=/var \
 		--sysconfdir=/etc \
+%if 0%{?_with_systemd}
+		--with-systemdsystemunitdir=%_unitdir \
+%endif
 		--docdir=%{_docdir}/ceph \
+		--with-man-pages \
+		--mandir="%_mandir" \
 		--with-nss \
 		--without-cryptopp \
-		--with-rest-bench \
 		--with-debug \
 %if 0%{with cephfs_java}
 		--enable-cephfs-java \
 %endif
+%if 0%{with selinux}
+		--with-selinux \
+%endif
 		--with-librocksdb-static=check \
-		$MY_CONF_OPT \
+%if 0%{?rhel} || 0%{?fedora}
+		--with-systemd-libexec-dir=/usr/libexec/ceph \
+		--with-rgw-user=root \
+		--with-rgw-group=root \
+%endif
+%if 0%{?suse_version}
+		--with-systemd-libexec-dir=/usr/lib/ceph/ \
+		--with-rgw-user=wwwrun \
+		--with-rgw-group=www \
+%endif
+		--with-radosgw \
+		$CEPH_EXTRA_CONFIGURE_ARGS \
 		%{?_with_ocf} \
+		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
-# fix bug in specific version of libedit-devel
-%if 0%{defined suse_version}
-sed -i -e "s/-lcurses/-lncurses/g" Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" man/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/ocf/Makefile
-sed -i -e "s/-lcurses/-lncurses/g" src/java/Makefile
+
+make %{?_smp_mflags}
+
+
+%if 0%{with tests}
+%check
+# run in-tree unittests
+make %{?_smp_mflags} check-local
+
 %endif
 
-make -j$(getconf _NPROCESSORS_ONLN)
+
 
 %install
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
 install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
+install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+%if 0%{?fedora} || 0%{?rhel}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillup-templates/sysconfig.%{name}
+%endif
+%if 0%{?_with_systemd}
+  install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
+  install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
+  install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
+  install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
+  install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
+  install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
+  install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
+%else
+  install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
+  install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+  ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
+  ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_sbindir}
-ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
-ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
 install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
-install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
 
@@ -523,40 +660,89 @@ mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph
 
 #set up placeholder directories
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph
+%if ! 0%{?_with_systemd}
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mon
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
 mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
-mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw
 
 %clean
 rm -rf $RPM_BUILD_ROOT
 
+%pre
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    # service_add_pre and friends don't work with parameterized systemd service
+    # instances, only with single services or targets, so we always pass
+    # ceph.target to these macros
+    %service_add_pre ceph.target
+  %endif
+%endif
+
+
 %post
 /sbin/ldconfig
-/sbin/chkconfig --add ceph
-mkdir -p %{_localstatedir}/run/ceph/
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %fillup_only
+    %service_add_post ceph.target
+  %endif
+%else
+  /sbin/chkconfig --add ceph
+%endif
 
 %preun
-%if %{defined suse_version}
-%stop_on_removal ceph
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %service_del_preun ceph.target
+  %endif
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%else
+  %if 0%{?rhel} || 0%{?fedora}
+    if [ $1 = 0 ] ; then
+      /sbin/service ceph stop >/dev/null 2>&1
+      /sbin/chkconfig --del ceph
+    fi
+  %endif
 %endif
-if [ $1 = 0 ] ; then
-    /sbin/service ceph stop >/dev/null 2>&1
-    /sbin/chkconfig --del ceph
-fi
 
 %postun
 /sbin/ldconfig
-%if %{defined suse_version}
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
 
-
 #################################################################################
 # files
 #################################################################################
@@ -578,16 +764,26 @@ fi
 %{_bindir}/ceph-mds
 %{_bindir}/ceph-objectstore-tool
 %{_bindir}/ceph-osd
+%{_bindir}/ceph-detect-init
 %{_bindir}/librados-config
 %{_bindir}/ceph-client-debug
 %{_bindir}/cephfs-journal-tool
 %{_bindir}/cephfs-table-tool
+%{_bindir}/cephfs-data-scan
 %{_bindir}/ceph-debugpack
 %{_bindir}/ceph-coverage
+%if 0%{?_with_systemd}
+%{_unitdir}/ceph-mds at .service
+%{_unitdir}/ceph-mon at .service
+%{_unitdir}/ceph-create-keys at .service
+%{_unitdir}/ceph-osd at .service
+%{_unitdir}/ceph-radosgw at .service
+%{_unitdir}/ceph-disk at .service
+%{_unitdir}/ceph.target
+%else
 %{_initrddir}/ceph
+%endif
 %{_sbindir}/ceph-disk
-%{_sbindir}/ceph-disk-activate
-%{_sbindir}/ceph-disk-prepare
 %{_sbindir}/ceph-disk-udev
 %{_sbindir}/ceph-create-keys
 %{_sbindir}/rcceph
@@ -600,8 +796,10 @@ fi
 %{_libdir}/ceph/ceph_common.sh
 %{_libexecdir}/ceph/ceph-osd-prestart.sh
 %dir %{_libdir}/rados-classes
+%{_libdir}/rados-classes/libcls_cephfs.so*
 %{_libdir}/rados-classes/libcls_rbd.so*
 %{_libdir}/rados-classes/libcls_hello.so*
+%{_libdir}/rados-classes/libcls_numops.so*
 %{_libdir}/rados-classes/libcls_rgw.so*
 %{_libdir}/rados-classes/libcls_lock.so*
 %{_libdir}/rados-classes/libcls_kvs.so*
@@ -609,19 +807,30 @@ fi
 %{_libdir}/rados-classes/libcls_log.so*
 %{_libdir}/rados-classes/libcls_replica_log.so*
 %{_libdir}/rados-classes/libcls_statelog.so*
+%{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
+%if 0%{?_with_lttng}
+%{_libdir}/libos_tp.so*
+%{_libdir}/libosd_tp.so*
+%endif
 %{_udevrulesdir}/60-ceph-partuuid-workaround.rules
 %{_udevrulesdir}/95-ceph-osd.rules
 %config %{_sysconfdir}/bash_completion.d/ceph
 %config(noreplace) %{_sysconfdir}/logrotate.d/ceph
+%if 0%{?fedora} || 0%{?rhel}
+%config(noreplace) %{_sysconfdir}/sysconfig/ceph
+%endif
 %if 0%{?suse_version}
+%{_localstatedir}/adm/fillup-templates/sysconfig.*
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
 %config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
 %endif
+%{python_sitelib}/ceph_detect_init*
 %{_mandir}/man8/ceph-deploy.8*
+%{_mandir}/man8/ceph-detect-init.8*
 %{_mandir}/man8/ceph-disk.8*
 %{_mandir}/man8/ceph-create-keys.8*
 %{_mandir}/man8/ceph-mon.8*
@@ -638,14 +847,16 @@ fi
 %{_mandir}/man8/ceph-clsinfo.8*
 %{_mandir}/man8/librados-config.8*
 #set up placeholder directories
-%dir %{_localstatedir}/lib/ceph/
-%dir %{_localstatedir}/lib/ceph/tmp
-%dir %{_localstatedir}/lib/ceph/mon
-%dir %{_localstatedir}/lib/ceph/osd
-%dir %{_localstatedir}/lib/ceph/mds
-%dir %{_localstatedir}/lib/ceph/bootstrap-osd
-%dir %{_localstatedir}/lib/ceph/bootstrap-mds
-%ghost %dir %{_localstatedir}/run/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw
+%if ! 0%{?_with_systemd}
+%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph
+%endif
 
 #################################################################################
 %files -n ceph-common
@@ -659,8 +870,16 @@ fi
 %{_bindir}/ceph-crush-location
 %{_bindir}/rados
 %{_bindir}/rbd
+%{_bindir}/rbd-replay
+%{_bindir}/rbd-replay-many
+%if 0%{?_with_lttng}
+%{_bindir}/rbd-replay-prep
+%endif
 %{_bindir}/ceph-post-file
 %{_bindir}/ceph-brag
+%if 0%{?_with_systemd}
+%{_tmpfilesdir}/ceph-common.conf
+%endif
 %{_mandir}/man8/ceph-authtool.8*
 %{_mandir}/man8/ceph-conf.8*
 %{_mandir}/man8/ceph-dencoder.8*
@@ -670,17 +889,46 @@ fi
 %{_mandir}/man8/ceph.8*
 %{_mandir}/man8/rados.8*
 %{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbd-replay.8*
+%{_mandir}/man8/rbd-replay-many.8*
+%{_mandir}/man8/rbd-replay-prep.8*
 %{_datadir}/ceph/known_hosts_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com
 %{_datadir}/ceph/id_dsa_drop.ceph.com.pub
 %dir %{_sysconfdir}/ceph/
-%dir %{_localstatedir}/log/ceph/
+%dir %{_datarootdir}/ceph/
+%dir %{_libexecdir}/ceph/
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
 %{_initrddir}/rbdmap
 %{python_sitelib}/ceph_argparse.py*
+%{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
+%attr(3770,ceph,ceph) %dir %{_localstatedir}/log/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
+
+%pre -n ceph-common
+CEPH_GROUP_ID=""
+CEPH_USER_ID=""
+%if 0%{?rhel} || 0%{?fedora}
+CEPH_GROUP_ID="-g 167"
+CEPH_USER_ID="-u 167"
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%endif
+%if 0%{?suse_version}
+getent group ceph >/dev/null || groupadd -r ceph
+getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+%endif
+exit 0
+
+%post -n ceph-common
+%if 0%{?_with_systemd}
+systemd-tmpfiles --create --prefix=/run/ceph
+%endif
 
 %postun -n ceph-common
 # Package removal cleanup
@@ -709,38 +957,62 @@ fi
 #################################################################################
 %files radosgw
 %defattr(-,root,root,-)
-%{_initrddir}/ceph-radosgw
 %{_bindir}/radosgw
 %{_bindir}/radosgw-admin
+%{_bindir}/radosgw-object-expirer
 %{_mandir}/man8/radosgw.8*
 %{_mandir}/man8/radosgw-admin.8*
-%{_sbindir}/rcceph-radosgw
-%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
 %config %{_sysconfdir}/bash_completion.d/radosgw-admin
-%dir %{_localstatedir}/log/radosgw/
+%dir %{_localstatedir}/lib/ceph/radosgw
+%if 0%{?_with_systemd}
+%else
+%{_initrddir}/ceph-radosgw
+%{_sbindir}/rcceph-radosgw
+%endif
 
 %post radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%fillup_and_insserv -f -y ceph-radosgw
+%if 0%{?suse_version}
+  # explicit systemctl daemon-reload (that's the only relevant bit of
+  # service_add_post; the rest is all sysvinit --> systemd migration which
+  # isn't applicable in this context (see above comment).
+  /usr/bin/systemctl daemon-reload >/dev/null 2>&1 || :
 %endif
 
 %preun radosgw
-%if %{defined suse_version}
-%stop_on_removal ceph-radosgw
+%if 0%{?_with_systemd}
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
 %endif
 
 %postun radosgw
 /sbin/ldconfig
-%if %{defined suse_version}
-%restart_on_update ceph-radosgw
-%insserv_cleanup
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
 %endif
-# Package removal cleanup
-if [ "$1" -eq "0" ] ; then
-    rm -rf /var/log/radosgw
-fi
-
 
 #################################################################################
 %if %{with ocf}
@@ -756,6 +1028,9 @@ fi
 %files -n librados2
 %defattr(-,root,root,-)
 %{_libdir}/librados.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so.*
+%endif
 
 %post -n librados2
 /sbin/ldconfig
@@ -776,6 +1051,9 @@ fi
 %{_includedir}/rados/rados_types.hpp
 %{_includedir}/rados/memory.h
 %{_libdir}/librados.so
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so
+%endif
 
 #################################################################################
 %files -n python-rados
@@ -805,6 +1083,9 @@ fi
 %files -n librbd1
 %defattr(-,root,root,-)
 %{_libdir}/librbd.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so.*
+%endif
 
 %post -n librbd1
 /sbin/ldconfig
@@ -822,6 +1103,9 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_includedir}/rbd/librbd.hpp
 %{_includedir}/rbd/features.h
 %{_libdir}/librbd.so
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so
+%endif
 
 #################################################################################
 %files -n python-rbd
@@ -852,11 +1136,6 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{python_sitelib}/cephfs.py*
 
 #################################################################################
-%files -n rest-bench
-%defattr(-,root,root,-)
-%{_bindir}/rest-bench
-
-#################################################################################
 %files -n ceph-test
 %defattr(-,root,root,-)
 %{_bindir}/ceph_bench_log
@@ -865,7 +1144,11 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph_erasure_code
 %{_bindir}/ceph_erasure_code_benchmark
 %{_bindir}/ceph_omapbench
+%{_bindir}/ceph_objectstore_bench
 %{_bindir}/ceph_perf_objectstore
+%{_bindir}/ceph_perf_local
+%{_bindir}/ceph_perf_msgr_client
+%{_bindir}/ceph_perf_msgr_server
 %{_bindir}/ceph_psim
 %{_bindir}/ceph_radosacl
 %{_bindir}/ceph_rgw_jsonparser
@@ -883,14 +1166,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %{_bindir}/ceph-monstore-tool
 %{_bindir}/ceph-osdomap-tool
 %{_bindir}/ceph-kvstore-tool
-%{_mandir}/man8/rbd-replay.8*
-%{_mandir}/man8/rbd-replay-many.8*
-%{_mandir}/man8/rbd-replay-prep.8*
-%{_bindir}/rbd-replay
-%{_bindir}/rbd-replay-many
-%if (0%{?fedora} >= 20 || 0%{?rhel} == 6)
-%{_bindir}/rbd-replay-prep
-%endif
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph-monstore-update-crush.sh
 
 #################################################################################
 %if 0%{with cephfs_java}
@@ -898,6 +1175,12 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %defattr(-,root,root,-)
 %{_libdir}/libcephfs_jni.so.*
 
+%post -n libcephfs_jni1
+/sbin/ldconfig
+
+%postun -n libcephfs_jni1
+/sbin/ldconfig
+
 #################################################################################
 %files -n libcephfs_jni1-devel
 %defattr(-,root,root,-)
@@ -911,6 +1194,111 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %endif
 
 #################################################################################
+%if 0%{with selinux}
+%files selinux
+%defattr(-,root,root,-)
+%attr(0600,root,root) %{_datadir}/selinux/packages/ceph.pp
+%{_datadir}/selinux/devel/include/contrib/ceph.if
+%{_mandir}/man8/ceph_selinux.8*
+
+%post selinux
+# Install the policy
+OLD_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+%{_sbindir}/semodule -n -i %{_datadir}/selinux/packages/ceph.pp
+NEW_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+
+# Load the policy if SELinux is enabled
+if %{_sbindir}/selinuxenabled; then
+    %{_sbindir}/load_policy
+else
+    # Do not relabel if selinux is not enabled
+    exit 0
+fi
+
+if test "$OLD_POLVER" == "$NEW_POLVER"; then
+   # Do not relabel if policy version did not change
+   exit 0
+fi
+
+# Check whether the daemons are running
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph status >/dev/null 2>&1
+%endif
+STATUS=$?
+
+# Stop the daemons if they were running
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph stop >/dev/null 2>&1
+%endif
+fi
+
+# Now, relabel the files
+%relabel_files
+
+# Start the daemons iff they were running before
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+%else
+    /sbin/service ceph start >/dev/null 2>&1 || :
+%endif
+fi
+
+exit 0
+
+%postun selinux
+if [ $1 -eq 0 ]; then
+    # Remove the module
+    %{_sbindir}/semodule -n -r ceph
+
+    # Reload the policy if SELinux is enabled
+    if %{_sbindir}/selinuxenabled ; then
+        %{_sbindir}/load_policy
+    else
+        # Do not relabel if SELinux is not enabled
+        exit 0
+    fi
+
+    # Check whether the daemons are running
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph status >/dev/null 2>&1
+    %endif
+    STATUS=$?
+
+    # Stop the daemons if they were running
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph stop >/dev/null 2>&1
+    %endif
+    fi
+
+    # Now, relabel the files
+    %relabel_files
+
+    # Start the daemons if they were running before
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+	/usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+    %else
+	/sbin/service ceph start >/dev/null 2>&1 || :
+    %endif
+    fi
+fi
+exit 0
+
+%endif # with selinux
+
+#################################################################################
+%if 0%{with libs_compat}
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
@@ -919,6 +1307,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
+%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/src/test/centos-7/install-deps.sh b/src/test/centos-7/install-deps.sh
index 129b238..1bebf09 100755
--- a/src/test/centos-7/install-deps.sh
+++ b/src/test/centos-7/install-deps.sh
@@ -1,8 +1,8 @@
-#!/bin/bash
+#!/bin/bash -e
 #
 # Ceph distributed storage system
 #
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -23,10 +23,14 @@ if test -f /etc/redhat-release ; then
     $SUDO yum install -y redhat-lsb-core
 fi
 
-if which apt-get > /dev/null ; then
+if type apt-get > /dev/null 2>&1 ; then
     $SUDO apt-get install -y lsb-release
 fi
 
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
 case $(lsb_release -si) in
 Ubuntu|Debian|Devuan)
         $SUDO apt-get install -y dpkg-dev
@@ -38,30 +42,106 @@ Ubuntu|Debian|Devuan)
         packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
             perl -p -e 's/.*Unmet build dependencies: *//;' \
             -e 's/build-essential:native/build-essential/;' \
-            -e 's/\|//g;' \
+            -e 's/\s*\|\s*/\|/g;' \
             -e 's/\(.*?\)//g;' \
             -e 's/ +/\n/g;' | sort)
         case $(lsb_release -sc) in
             squeeze|wheezy)
                 packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
                 ;;
         esac
         packages=$(echo $packages) # change newlines into spaces
-        $SUDO bash -c "DEBIAN_FRONTEND=noninteractive apt-get install -y $packages"
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
         ;;
-CentOS|Fedora|SUSE*|RedHatEnterpriseServer)
+CentOS|Fedora|RedHatEnterpriseServer)
         case $(lsb_release -si) in
-            SUSE*)
-                $SUDO zypper -y yum-utils
+            Fedora)
+                $SUDO yum install -y yum-utils
                 ;;
-            *)
+            CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
-        $SUDO yum-builddep -y $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
         ;;
 *)
         echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
         ;;
 esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/src/test/ceph-disk.sh b/src/test/ceph-disk.sh
index 8f36a58..be466fa 100755
--- a/src/test/ceph-disk.sh
+++ b/src/test/ceph-disk.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -15,14 +15,24 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-set -xe
-
 source test/test_btrfs_common.sh
 
-PS4='${FUNCNAME[0]}: $LINENO: '
+PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}:  '
 
-export PATH=:$PATH # make sure program from sources are prefered
+export PATH=.:$PATH # make sure program from sources are prefered
 DIR=test-ceph-disk
+if virtualenv virtualenv-$DIR && test -d ceph-detect-init ; then
+    . virtualenv-$DIR/bin/activate
+    (
+	# older versions of pip will not install wrap_console scripts
+	# when using wheel packages
+	pip install --upgrade 'pip >= 6.1'
+	if test -d ceph-detect-init/wheelhouse ; then
+            wheelhouse="--no-index --use-wheel --find-links=ceph-detect-init/wheelhouse"
+	fi
+	pip --log virtualenv-$DIR/log.txt install $wheelhouse --editable ceph-detect-init
+    )
+fi
 OSD_DATA=$DIR/osd
 MON_ID=a
 MONA=127.0.0.1:7451
@@ -31,12 +41,15 @@ FSID=$(uuidgen)
 export CEPH_CONF=$DIR/ceph.conf
 export CEPH_ARGS="--fsid $FSID"
 CEPH_ARGS+=" --chdir="
+CEPH_ARGS+=" --journal-dio=false"
 CEPH_ARGS+=" --run-dir=$DIR"
 CEPH_ARGS+=" --osd-failsafe-full-ratio=.99"
 CEPH_ARGS+=" --mon-host=$MONA"
 CEPH_ARGS+=" --log-file=$DIR/\$name.log"
 CEPH_ARGS+=" --pid-file=$DIR/\$name.pidfile"
-CEPH_ARGS+=" --osd-pool-default-erasure-code-directory=.libs"
+if test -d .libs ; then
+    CEPH_ARGS+=" --erasure-code-dir=.libs"
+fi
 CEPH_ARGS+=" --auth-supported=none"
 CEPH_ARGS+=" --osd-journal-size=100"
 CEPH_DISK_ARGS=
@@ -51,12 +64,12 @@ timeout=$(which timeout)
 diff=$(which diff)
 mkdir=$(which mkdir)
 rm=$(which rm)
+uuidgen=$(which uuidgen)
 
 function setup() {
     teardown
     mkdir $DIR
     mkdir $OSD_DATA
-#    mkdir $OSD_DATA/ceph-0
     touch $DIR/ceph.conf # so ceph-disk think ceph is the cluster
 }
 
@@ -66,20 +79,23 @@ function teardown() {
         rm -fr $DIR/*/*db
         teardown_btrfs $DIR
     fi
+    grep " $(pwd)/$DIR/" < /proc/mounts | while read mounted rest ; do
+        umount $mounted
+    done
     rm -fr $DIR
 }
 
 function run_mon() {
     local mon_dir=$DIR/$MON_ID
 
-    ./ceph-mon \
+    ceph-mon \
         --id $MON_ID \
         --mkfs \
         --mon-data=$mon_dir \
         --mon-initial-members=$MON_ID \
         "$@"
 
-    ./ceph-mon \
+    ceph-mon \
         --id $MON_ID \
         --mon-data=$mon_dir \
         --mon-osd-full-ratio=.99 \
@@ -90,10 +106,13 @@ function run_mon() {
 }
 
 function kill_daemons() {
+    if ! test -e $DIR ; then
+        return
+    fi
     for pidfile in $(find $DIR | grep pidfile) ; do
         pid=$(cat $pidfile)
         for try in 0 1 1 1 2 3 ; do
-            kill $pid || break
+            kill $pid 2>/dev/null || break
             sleep $try
         done
     done
@@ -167,11 +186,51 @@ function test_no_path() {
     ( unset PATH ; test_activate_dir ) || return 1
 }
 
+function test_mark_init() {
+    run_mon
+
+    local osd_data=$(pwd)/$DIR/dir
+    $mkdir -p $osd_data
+
+    local osd_uuid=$($uuidgen)
+
+    $mkdir -p $OSD_DATA
+
+    ceph-disk $CEPH_DISK_ARGS \
+        prepare --osd-uuid $osd_uuid $osd_data || return 1
+
+    $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \
+        --verbose \
+        activate \
+        --mark-init=auto \
+        --no-start-daemon \
+        $osd_data || return 1
+
+    test -f $osd_data/$(ceph-detect-init) || return 1
+
+    if test systemd = $(ceph-detect-init) ; then
+        expected=sysvinit
+    else
+        expected=systemd
+    fi
+    $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \
+        --verbose \
+        activate \
+        --mark-init=$expected \
+        --no-start-daemon \
+        $osd_data || return 1
+
+    ! test -f $osd_data/$(ceph-detect-init) || return 1
+    test -f $osd_data/$expected || return 1
+
+    $rm -fr $osd_data
+}
+
 function test_zap() {
     local osd_data=$DIR/dir
     $mkdir -p $osd_data
 
-    ./ceph-disk $CEPH_DISK_ARGS zap $osd_data 2>&1 | grep 'not full block device' || return 1
+    ceph-disk $CEPH_DISK_ARGS zap $osd_data 2>&1 | grep -q 'not full block device' || return 1
 
     $rm -fr $osd_data
 }
@@ -179,14 +238,14 @@ function test_zap() {
 # ceph-disk prepare returns immediately on success if the magic file
 # exists in the --osd-data directory.
 function test_activate_dir_magic() {
-    local uuid=$(uuidgen)
+    local uuid=$($uuidgen)
     local osd_data=$DIR/osd
 
     echo a failure to create the fsid file implies the magic file is not created
 
     mkdir -p $osd_data/fsid
     CEPH_ARGS="--fsid $uuid" \
-     ./ceph-disk $CEPH_DISK_ARGS prepare $osd_data > $DIR/out 2>&1
+     ceph-disk $CEPH_DISK_ARGS prepare $osd_data > $DIR/out 2>&1
     grep --quiet 'Is a directory' $DIR/out || return 1
     ! [ -f $osd_data/magic ] || return 1
     rmdir $osd_data/fsid
@@ -194,106 +253,50 @@ function test_activate_dir_magic() {
     echo successfully prepare the OSD
 
     CEPH_ARGS="--fsid $uuid" \
-     ./ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out
+     ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out
     grep --quiet 'Preparing osd data dir' $DIR/out || return 1
     grep --quiet $uuid $osd_data/ceph_fsid || return 1
     [ -f $osd_data/magic ] || return 1
 
     echo will not override an existing OSD
 
-    CEPH_ARGS="--fsid $(uuidgen)" \
-     ./ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out
+    CEPH_ARGS="--fsid $($uuidgen)" \
+     ceph-disk $CEPH_DISK_ARGS prepare $osd_data 2>&1 | tee $DIR/out
     grep --quiet 'ceph-disk:Data dir .* already exists' $DIR/out || return 1
     grep --quiet $uuid $osd_data/ceph_fsid || return 1
 }
 
-function test_activate() {
-    local to_prepare=$1
-    local to_activate=$2
-    local journal=$3
-
-    $mkdir -p $OSD_DATA
+function test_pool_read_write() {
+    local osd_uuid=$1
 
-    ./ceph-disk $CEPH_DISK_ARGS \
-        prepare $to_prepare $journal || return 1
+    $timeout $TIMEOUT ceph osd pool set $TEST_POOL size 1 || return 1
 
-    $timeout $TIMEOUT ./ceph-disk $CEPH_DISK_ARGS \
-        activate \
-        --mark-init=none \
-        $to_activate || return 1
-    $timeout $TIMEOUT ./ceph osd pool set $TEST_POOL size 1 || return 1
-
-    local id=$($cat $OSD_DATA/ceph-?/whoami || $cat $to_activate/whoami)
-    local weight=1
-    ./ceph osd crush add osd.$id $weight root=default host=localhost || return 1
-    echo FOO > $DIR/BAR
-    $timeout $TIMEOUT ./rados --pool $TEST_POOL put BAR $DIR/BAR || return 1
-    $timeout $TIMEOUT ./rados --pool $TEST_POOL get BAR $DIR/BAR.copy || return 1
-    $diff $DIR/BAR $DIR/BAR.copy || return 1
-}
-
-function test_activate_dmcrypt() {
-    local to_prepare=$1
-    local to_activate=$2
-    local journal=$3
-    local journal_p=$4
-    local uuid=$5
-    local juuid=$6
-
-    $mkdir -p $OSD_DATA
-
-    ./ceph-disk $CEPH_DISK_ARGS \
-		prepare --dmcrypt --dmcrypt-key-dir $DIR/keys --osd-uuid=$uuid --journal-uuid=$juuid $to_prepare $journal || return 1
-
-    /sbin/cryptsetup --key-file $DIR/keys/$uuid.luks.key luksOpen $to_activate $uuid
-    /sbin/cryptsetup --key-file $DIR/keys/$juuid.luks.key luksOpen ${journal}${journal_p} $juuid
-    
-    $timeout $TIMEOUT ./ceph-disk $CEPH_DISK_ARGS \
-        activate \
-        --mark-init=none \
-        /dev/mapper/$uuid || return 1
-    $timeout $TIMEOUT ./ceph osd pool set $TEST_POOL size 1 || return 1
-
-    local id=$($cat $OSD_DATA/ceph-?/whoami || $cat $to_activate/whoami)
+    local id=$(ceph osd create $osd_uuid)
     local weight=1
-    ./ceph osd crush add osd.$id $weight root=default host=localhost || return 1
+    ceph osd crush add osd.$id $weight root=default host=localhost || return 1
     echo FOO > $DIR/BAR
-    $timeout $TIMEOUT ./rados --pool $TEST_POOL put BAR $DIR/BAR || return 1
-    $timeout $TIMEOUT ./rados --pool $TEST_POOL get BAR $DIR/BAR.copy || return 1
+    $timeout $TIMEOUT rados --pool $TEST_POOL put BAR $DIR/BAR || return 1
+    $timeout $TIMEOUT rados --pool $TEST_POOL get BAR $DIR/BAR.copy || return 1
     $diff $DIR/BAR $DIR/BAR.copy || return 1
 }
 
-function test_activate_dmcrypt_plain() {
+function test_activate() {
     local to_prepare=$1
     local to_activate=$2
     local journal=$3
-    local journal_p=$4
-    local uuid=$5
-    local juuid=$6
+    local osd_uuid=$($uuidgen)
 
     $mkdir -p $OSD_DATA
 
-    echo "osd_dmcrypt_type=plain" > $DIR/ceph.conf
-    
-    ./ceph-disk $CEPH_DISK_ARGS \
-		prepare --dmcrypt --dmcrypt-key-dir $DIR/keys --osd-uuid=$uuid --journal-uuid=$juuid $to_prepare $journal || return 1
+    ceph-disk $CEPH_DISK_ARGS \
+        prepare --osd-uuid $osd_uuid $to_prepare $journal || return 1
 
-    /sbin/cryptsetup --key-file $DIR/keys/$uuid --key-size 256 create $uuid $to_activate
-    /sbin/cryptsetup --key-file $DIR/keys/$juuid --key-size 256 create $juuid $journal
-    
-    $timeout $TIMEOUT ./ceph-disk $CEPH_DISK_ARGS \
+    $timeout $TIMEOUT ceph-disk $CEPH_DISK_ARGS \
         activate \
         --mark-init=none \
-        /dev/mapper/$uuid || return 1
-    $timeout $TIMEOUT ./ceph osd pool set $TEST_POOL size 1 || return 1
+        $to_activate || return 1
 
-    local id=$($cat $OSD_DATA/ceph-?/whoami || $cat $to_activate/whoami)
-    local weight=1
-    ./ceph osd crush add osd.$id $weight root=default host=localhost || return 1
-    echo FOO > $DIR/BAR
-    $timeout $TIMEOUT ./rados --pool $TEST_POOL put BAR $DIR/BAR || return 1
-    $timeout $TIMEOUT ./rados --pool $TEST_POOL get BAR $DIR/BAR.copy || return 1
-    $diff $DIR/BAR $DIR/BAR.copy || return 1
+    test_pool_read_write $osd_uuid || return 1
 }
 
 function test_activate_dir() {
@@ -305,152 +308,6 @@ function test_activate_dir() {
     $rm -fr $osd_data
 }
 
-function create_dev() {
-    local name=$1
-
-    dd if=/dev/zero of=$name bs=1024k count=200
-    losetup --find $name
-    local dev=$(losetup --associated $name | cut -f1 -d:)
-    ceph-disk zap $dev > /dev/null 2>&1
-    echo $dev
-}
-
-function destroy_dev() {
-    local name=$1
-    local dev=$2
-
-    for partition in 1 2 3 4 ; do
-        umount ${dev}p${partition} || true
-    done
-    losetup --detach $dev
-    rm $name
-}
-
-function activate_dev_body() {
-    local disk=$1
-    local journal=$2
-    local newdisk=$3
-
-    setup
-    run_mon
-    test_activate $disk ${disk}p1 $journal || return 1
-    kill_daemons
-    umount ${disk}p1 || return 1
-    teardown
-
-    # reuse the journal partition
-    setup
-    run_mon
-    test_activate $newdisk ${newdisk}p1 ${journal}p1 || return 1
-    kill_daemons
-    umount ${newdisk}p1 || return 1
-    teardown
-}
-
-function test_activate_dev() {
-    if test $(id -u) != 0 ; then
-        echo "SKIP because not root"
-        return 0
-    fi
-
-    local disk=$(create_dev vdf.disk)
-    local journal=$(create_dev vdg.disk)
-    local newdisk=$(create_dev vdh.disk)
-
-    activate_dev_body $disk $journal $newdisk
-    status=$?
-
-    destroy_dev vdf.disk $disk
-    destroy_dev vdg.disk $journal
-    destroy_dev vdh.disk $newdisk
-
-    return $status
-}
-
-function destroy_dmcrypt_dev() {
-    local name=$1
-    local dev=$2
-    local uuid=$3
-
-    for partition in 1 2 3 4 ; do
-        umount /dev/mapper/$uuid || true
-	/sbin/cryptsetup remove /dev/mapper/$uuid || true
-	dmsetup remove /dev/mapper/$uuid || true
-    done
-    losetup --detach $dev
-    rm $name
-}
-
-function activate_dmcrypt_dev_body() {
-    local disk=$1
-    local journal=$2
-    local newdisk=$3
-    local uuid=$(uuidgen)
-    local juuid=$(uuidgen)
-
-    setup
-    run_mon
-    test_activate_dmcrypt $disk ${disk}p1 $journal p1 $uuid $juuid|| return 1
-    kill_daemons
-    umount /dev/mapper/$uuid || return 1
-    teardown
-}
-
-function test_activate_dmcrypt_dev() {
-    if test $(id -u) != 0 ; then
-        echo "SKIP because not root"
-        return 0
-    fi
-
-    local disk=$(create_dev vdf.disk)
-    local journal=$(create_dev vdg.disk)
-    local newdisk=$(create_dev vdh.disk)
-
-    activate_dmcrypt_dev_body $disk $journal $newdisk
-    status=$?
-
-    destroy_dmcrypt_dev vdf.disk $disk
-    destroy_dmcrypt_dev vdg.disk $journal
-    destroy_dmcrypt_dev vdh.disk $newdisk
-
-    return $status
-}
-
-function activate_dmcrypt_plain_dev_body() {
-    local disk=$1
-    local journal=$2
-    local newdisk=$3
-    local uuid=$(uuidgen)
-    local juuid=$(uuidgen)
-
-    setup
-    run_mon
-    test_activate_dmcrypt_plain $disk ${disk}p1 $journal p1 $uuid $juuid|| return 1
-    kill_daemons
-    umount /dev/mapper/$uuid || return 1
-    teardown
-}
-
-function test_activate_dmcrypt_plain_dev() {
-    if test $(id -u) != 0 ; then
-        echo "SKIP because not root"
-        return 0
-    fi
-
-    local disk=$(create_dev vdf.disk)
-    local journal=$(create_dev vdg.disk)
-    local newdisk=$(create_dev vdh.disk)
-
-    activate_dmcrypt_plain_dev_body $disk $journal $newdisk
-    status=$?
-
-    destroy_dmcrypt_dev vdf.disk $disk
-    destroy_dmcrypt_dev vdg.disk $journal
-    destroy_dmcrypt_dev vdh.disk $newdisk
-
-    return $status
-}
-
 function test_find_cluster_by_uuid() {
     setup
     test_activate_dir 2>&1 | tee $DIR/test_find
@@ -479,13 +336,23 @@ function run() {
     default_actions+="test_activate_dir_magic "
     default_actions+="test_activate_dir "
     default_actions+="test_keyring_path "
+    default_actions+="test_mark_init "
     default_actions+="test_zap "
     local actions=${@:-$default_actions}
+    local status
     for action in $actions  ; do
         setup
-        $action || return 1
+        set -x
+        $action
+        status=$?
+        set +x
         teardown
+        if test $status != 0 ; then
+            break
+        fi
     done
+    rm -fr virtualenv-$DIR
+    return $status
 }
 
 run $@
diff --git a/src/test/ceph_argparse.cc b/src/test/ceph_argparse.cc
index d1a790f..f846db7 100644
--- a/src/test/ceph_argparse.cc
+++ b/src/test/ceph_argparse.cc
@@ -74,7 +74,10 @@ TEST(CephArgParse, SimpleArgParse) {
 
   found_foo = false;
   found_bar = "";
+  bool baz_found = false;
+  std::string found_baz = "";
   VectorContainer foo(FOO);
+  ostringstream err;
   for (std::vector<const char*>::iterator i = foo.arr.begin();
        i != foo.arr.end(); )
   {
@@ -83,11 +86,17 @@ TEST(CephArgParse, SimpleArgParse) {
       }
       else if (ceph_argparse_witharg(foo.arr, i, &found_bar, "--bar", (char*)NULL)) {
       }
+      else if (ceph_argparse_witharg(foo.arr, i, &found_baz, err, "--baz", (char*)NULL)) {
+	ASSERT_NE(string(""), err.str());
+	baz_found = true;
+      }
       else
 	++i;
   }
   ASSERT_EQ(found_foo, true);
   ASSERT_EQ(found_bar, "");
+  ASSERT_EQ(baz_found, true);
+  ASSERT_EQ(found_baz, "");
 
   found_foo = false;
   found_bar = "";
@@ -271,6 +280,30 @@ TEST(CephArgParse, WithDashesAndUnderscores) {
   ASSERT_EQ(found_baz, "");
 }
 
+TEST(CephArgParse, WithFloat) {
+  const char *BAZSTUFF1[] = { "./myprog", "--foo", "50.5", "--bar", "52", NULL };
+
+  VectorContainer bazstuff1(BAZSTUFF1);
+  ostringstream err;
+  float foo;
+  int bar = -1;
+  for (std::vector<const char*>::iterator i = bazstuff1.arr.begin();
+       i != bazstuff1.arr.end(); )
+  {
+    if (ceph_argparse_double_dash(bazstuff1.arr, i)) {
+      break;
+    } else if (ceph_argparse_witharg(bazstuff1.arr, i, &foo, err, "--foo", (char*)NULL)) {
+      ASSERT_EQ(string(""), err.str());
+    } else if (ceph_argparse_witharg(bazstuff1.arr, i, &bar, err, "--bar", (char*)NULL)) {
+      ASSERT_EQ(string(""), err.str());
+    }
+    else {
+      ++i;
+    }
+  }
+  ASSERT_EQ(foo, 50.5);
+  ASSERT_EQ(bar, 52);
+}
 
 TEST(CephArgParse, WithInt) {
   const char *BAZSTUFF1[] = { "./myprog", "--foo", "50", "--bar", "52", NULL };
@@ -286,9 +319,9 @@ TEST(CephArgParse, WithInt) {
   {
     if (ceph_argparse_double_dash(bazstuff1.arr, i)) {
       break;
-    } else if (ceph_argparse_withint(bazstuff1.arr, i, &foo, &err, "--foo", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(bazstuff1.arr, i, &foo, err, "--foo", (char*)NULL)) {
       ASSERT_EQ(string(""), err.str());
-    } else if (ceph_argparse_withint(bazstuff1.arr, i, &bar, &err, "--bar", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(bazstuff1.arr, i, &bar, err, "--bar", (char*)NULL)) {
       ASSERT_EQ(string(""), err.str());
     }
     else {
@@ -306,7 +339,7 @@ TEST(CephArgParse, WithInt) {
   {
     if (ceph_argparse_double_dash(bazstuff2.arr, i)) {
       break;
-    } else if (ceph_argparse_withint(bazstuff2.arr, i, &foo, &err2, "--foo", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(bazstuff2.arr, i, &foo, err2, "--foo", (char*)NULL)) {
       ASSERT_NE(string(""), err2.str());
     }
     else {
@@ -322,9 +355,9 @@ TEST(CephArgParse, WithInt) {
   {
     if (ceph_argparse_double_dash(bazstuff3.arr, i)) {
       break;
-    } else if (ceph_argparse_withint(bazstuff3.arr, i, &foo, &err, "--foo", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(bazstuff3.arr, i, &foo, err, "--foo", (char*)NULL)) {
       ASSERT_EQ(string(""), err.str());
-    } else if (ceph_argparse_withint(bazstuff3.arr, i, &bar, &err, "--bar", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(bazstuff3.arr, i, &bar, err, "--bar", (char*)NULL)) {
       ASSERT_EQ(string(""), err.str());
     }
     else {
diff --git a/src/test/ceph_objectstore_tool.py b/src/test/ceph_objectstore_tool.py
index 52ae51c..79230d2 100755
--- a/src/test/ceph_objectstore_tool.py
+++ b/src/test/ceph_objectstore_tool.py
@@ -8,7 +8,7 @@ except ImportError:
         import subprocess
         # backported from python 2.7 stdlib
         process = subprocess.Popen(
-           stdout=subprocess.PIPE, *popenargs, **kwargs)
+            stdout=subprocess.PIPE, *popenargs, **kwargs)
         output, unused_err = process.communicate()
         retcode = process.poll()
         if retcode:
@@ -20,21 +20,32 @@ except ImportError:
             raise error
         return output
 
-import subprocess
+import filecmp
 import os
+import subprocess
+try:
+    from subprocess import DEVNULL
+except ImportError:
+    subprocess.DEVNULL = open(os.devnull, "w")
+
+import math
 import time
 import sys
 import re
 import string
 import logging
 import json
+import tempfile
 
 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
 
 
 def wait_for_health():
     print "Wait for health_ok...",
+    tries = 0
     while call("./ceph health 2> /dev/null | grep -v 'HEALTH_OK\|HEALTH_WARN' > /dev/null", shell=True) == 0:
+        if ++tries == 30:
+            raise Exception("Time exceeded to go to health")
         time.sleep(5)
     print "DONE"
 
@@ -51,7 +62,7 @@ def get_osd_pgs(SUBDIR, ID):
     if ID:
         endhead = re.compile("{id}.*_head$".format(id=ID))
     DIR = os.path.join(SUBDIR, "current")
-    PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID == None or endhead.match(f))]
+    PGS += [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f)) and (ID is None or endhead.match(f))]
     PGS = [re.sub("_head", "", p) for p in PGS if "_head" in p]
     return PGS
 
@@ -79,7 +90,7 @@ def get_objs(ALLPGS, prefix, DIR, ID):
                 continue
             FINALDIR = os.path.join(SUBDIR, PGDIR)
             # See if there are any objects there
-            if [ f for f in [ val for  _, _, fl in os.walk(FINALDIR) for val in fl ] if string.find(f, prefix) == 0 ]:
+            if any(f for f in [val for _, _, fl in os.walk(FINALDIR) for val in fl] if f.startswith(prefix)):
                 PGS += [p]
     return sorted(set(PGS))
 
@@ -124,51 +135,44 @@ def cat_file(level, filename):
     print "<EOF>"
 
 
-def vstart(new):
+def vstart(new, opt=""):
     print "vstarting....",
-    OPT = new and "-n" or ""
-    call("MON=1 OSD=4 CEPH_PORT=7400 ./vstart.sh -l {opt} -d mon osd > /dev/null 2>&1".format(opt=OPT), shell=True)
+    NEW = new and "-n" or ""
+    call("MON=1 OSD=4 CEPH_PORT=7400 ./vstart.sh -l {new} -d mon osd {opt} > /dev/null 2>&1".format(new=NEW, opt=opt), shell=True)
     print "DONE"
 
-def test_failure_tty(cmd, errmsg):
-    try:
-        ttyfd = open("/dev/tty", "rw")
-    except Exception, e:
-        logging.info(str(e))
-        logging.info("SKIP " + cmd)
-        return 0
+
+def test_failure(cmd, errmsg, tty=False):
+    if tty:
+        try:
+            ttyfd = open("/dev/tty", "rw")
+        except Exception, e:
+            logging.info(str(e))
+            logging.info("SKIP " + cmd)
+            return 0
     TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
     tmpfd = open(TMPFILE, "w")
 
     logging.debug(cmd)
-    ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
-    ttyfd.close()
+    if tty:
+        ret = call(cmd, shell=True, stdin=ttyfd, stdout=ttyfd, stderr=tmpfd)
+        ttyfd.close()
+    else:
+        ret = call(cmd, shell=True, stderr=tmpfd)
     tmpfd.close()
     if ret == 0:
+        logging.error(cmd)
         logging.error("Should have failed, but got exit 0")
         return 1
     lines = get_lines(TMPFILE)
-    line = lines[0]
-    if line == errmsg:
-        logging.info("Correctly failed with message \"" + line + "\"")
+    matched = [ l for l in lines if errmsg in l ]
+    if any(matched):
+        logging.info("Correctly failed with message \"" + matched[0] + "\"")
         return 0
     else:
-        logging.error("Bad message to stderr \"" + line + "\"")
+        logging.error("Bad messages to stderr \"" + str(lines) + "\"")
         return 1
 
-def test_failure(cmd, errmsg):
-    logging.debug(cmd)
-    try:
-        out = check_output(cmd, stderr=subprocess.STDOUT, shell=True)
-        logging.error("Should have failed, but got exit 0")
-        return 1
-    except subprocess.CalledProcessError, e:
-        if errmsg in e.output:
-            logging.info("Correctly failed with message \"" + errmsg + "\"")
-            return 0
-        else:
-            logging.error("Bad message to stderr \"" + e.output + "\"")
-            return 1
 
 def get_nspace(num):
     if num == 0:
@@ -203,18 +207,295 @@ def verify(DATADIR, POOL, NAME_PREFIX):
             pass
     return ERRORS
 
+
+def check_journal(jsondict):
+    errors = 0
+    if 'header' not in jsondict:
+        logging.error("Key 'header' not in dump-journal")
+        errors += 1
+    elif 'max_size' not in jsondict['header']:
+        logging.error("Key 'max_size' not in dump-journal header")
+        errors += 1
+    else:
+        print "\tJournal max_size = {size}".format(size=jsondict['header']['max_size'])
+    if 'entries' not in jsondict:
+        logging.error("Key 'entries' not in dump-journal output")
+        errors += 1
+    elif len(jsondict['entries']) == 0:
+        logging.info("No entries in journal found")
+    else:
+        errors += check_journal_entries(jsondict['entries'])
+    return errors
+
+
+def check_journal_entries(entries):
+    errors = 0
+    for enum in range(len(entries)):
+        if 'offset' not in entries[enum]:
+            logging.error("No 'offset' key in entry {e}".format(e=enum))
+            errors += 1
+        if 'seq' not in entries[enum]:
+            logging.error("No 'seq' key in entry {e}".format(e=enum))
+            errors += 1
+        if 'transactions' not in entries[enum]:
+            logging.error("No 'transactions' key in entry {e}".format(e=enum))
+            errors += 1
+        elif len(entries[enum]['transactions']) == 0:
+            logging.error("No transactions found in entry {e}".format(e=enum))
+            errors += 1
+        else:
+            errors += check_entry_transactions(entries[enum], enum)
+    return errors
+
+
+def check_entry_transactions(entry, enum):
+    errors = 0
+    for tnum in range(len(entry['transactions'])):
+        if 'trans_num' not in entry['transactions'][tnum]:
+            logging.error("Key 'trans_num' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+            errors += 1
+        elif entry['transactions'][tnum]['trans_num'] != tnum:
+            ft = entry['transactions'][tnum]['trans_num']
+            logging.error("Bad trans_num ({ft}) entry {e} trans {t}".format(ft=ft, e=enum, t=tnum))
+            errors += 1
+        if 'ops' not in entry['transactions'][tnum]:
+            logging.error("Key 'ops' missing from entry {e} trans {t}".format(e=enum, t=tnum))
+            errors += 1
+        else:
+            errors += check_transaction_ops(entry['transactions'][tnum]['ops'], enum, tnum)
+    return errors
+
+
+def check_transaction_ops(ops, enum, tnum):
+    if len(ops) is 0:
+        logging.warning("No ops found in entry {e} trans {t}".format(e=enum, t=tnum))
+    errors = 0
+    for onum in range(len(ops)):
+        if 'op_num' not in ops[onum]:
+            logging.error("Key 'op_num' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+            errors += 1
+        elif ops[onum]['op_num'] != onum:
+            fo = ops[onum]['op_num']
+            logging.error("Bad op_num ({fo}) from entry {e} trans {t} op {o}".format(fo=fo, e=enum, t=tnum, o=onum))
+            errors += 1
+        if 'op_name' not in ops[onum]:
+            logging.error("Key 'op_name' missing from entry {e} trans {t} op {o}".format(e=enum, t=tnum, o=onum))
+            errors += 1
+    return errors
+
+
+def test_dump_journal(CFSD_PREFIX, osds):
+    ERRORS = 0
+    pid = os.getpid()
+    TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
+
+    for osd in osds:
+        # Test --op dump-journal by loading json
+        cmd = (CFSD_PREFIX + "--op dump-journal --format json").format(osd=osd)
+        logging.debug(cmd)
+        tmpfd = open(TMPFILE, "w")
+        ret = call(cmd, shell=True, stdout=tmpfd)
+        if ret != 0:
+            logging.error("Bad exit status {ret} from {cmd}".format(ret=ret, cmd=cmd))
+            ERRORS += 1
+            continue
+        tmpfd.close()
+        tmpfd = open(TMPFILE, "r")
+        jsondict = json.load(tmpfd)
+        tmpfd.close()
+        os.unlink(TMPFILE)
+
+        journal_errors = check_journal(jsondict)
+        if journal_errors is not 0:
+            logging.error(jsondict)
+        ERRORS += journal_errors
+
+    return ERRORS
+
+
 CEPH_DIR = "ceph_objectstore_tool_dir"
 CEPH_CONF = os.path.join(CEPH_DIR, 'ceph.conf')
 
+
 def kill_daemons():
     call("./init-ceph -c {conf} stop osd mon > /dev/null 2>&1".format(conf=CEPH_CONF), shell=True)
 
+
+def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
+    repcount = 0
+    ERRORS = 0
+    for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
+        nspace = nsfile.split("-")[0]
+        file = nsfile.split("-")[1]
+        path = os.path.join(DATADIR, nsfile)
+        tmpfd = open(TMPFILE, "w")
+        cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stdout=tmpfd)
+        if ret:
+            logging.critical("INTERNAL ERROR")
+            return 1
+        tmpfd.close()
+        obj_locs = get_lines(TMPFILE)
+        if len(obj_locs) == 0:
+            logging.error("Can't find imported object {name}".format(name=file))
+            ERRORS += 1
+        for obj_loc in obj_locs:
+            repcount += 1
+            cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True)
+            if ret != 0:
+                logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
+                ERRORS += 1
+    return ERRORS, repcount
+
+
+def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+    print "Testing get-osdmap and set-osdmap"
+    # change the weight of the given osds to the given weight in the newest osdmap of the given osd
+    osdmap_file = tempfile.NamedTemporaryFile()
+    cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+                                                                        osdmap_file=osdmap_file.name)
+    output = check_output(cmd, shell=True)
+    epoch = int(re.findall('#(\d+)', output)[0])
+    
+    new_crush_file = tempfile.NamedTemporaryFile(delete=False)
+    old_crush_file = tempfile.NamedTemporaryFile(delete=False)
+    ret = call("./osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+                                                                          crush_file=old_crush_file.name),
+               stdout=subprocess.DEVNULL,
+               stderr=subprocess.DEVNULL,
+               shell=True)
+    assert(ret == 0)
+
+    for osd_id in osd_ids:
+        cmd = "./crushtool -i {crush_file} --reweight-item osd.{osd} {weight} -o {new_crush_file}".format(osd=osd_id,
+                                                                                                          crush_file=old_crush_file.name,
+                                                                                                          weight=weight,
+                                                                                                          new_crush_file=new_crush_file.name)
+        ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+        assert(ret == 0)
+        old_crush_file, new_crush_file = new_crush_file, old_crush_file
+
+    # change them back, since we don't need to prepare for another round
+    old_crush_file, new_crush_file = new_crush_file, old_crush_file
+    old_crush_file.close()
+
+    ret = call("./osdmaptool --import-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+                                                                               crush_file=new_crush_file.name),
+               stdout=subprocess.DEVNULL,
+               stderr=subprocess.DEVNULL,
+               shell=True)
+    assert(ret == 0)
+    # osdmaptool increases the epoch of the changed osdmap, so we need to force the tool
+    # to use a different epoch than the one in the osdmap
+    cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force"
+    cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+    ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+    return ret == 0
+
+def get_osd_weights(CFSD_PREFIX, osd_ids, osd_path):
+    osdmap_file = tempfile.NamedTemporaryFile()
+    cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
+                                                                        osdmap_file=osdmap_file.name)
+    ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+    if ret != 0:
+        return None
+    # we have to read the weights from the crush map; even though we can query the
+    # weights using osdmaptool, please keep in mind that they are different:
+    #    item weights in the crush map versus the weight associated with each osd in the osdmap
+    crush_file = tempfile.NamedTemporaryFile(delete=False)
+    ret = call("./osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name,
+                                                                               crush_file=crush_file.name),
+               stdout=subprocess.DEVNULL,
+               shell=True)
+    assert(ret == 0)
+    output = check_output("./crushtool --tree -i {crush_file} | tail -n {num_osd}".format(crush_file=crush_file.name,
+                                                                                          num_osd=len(osd_ids)),
+                          stderr=subprocess.DEVNULL,
+                          shell=True)
+    weights = []
+    for line in output.strip().split('\n'):
+        osd_id, weight, osd_name = re.split('\s+', line)
+        weights.append(float(weight))
+    return weights
+
+
+def test_get_set_osdmap(CFSD_PREFIX, osd_ids, osd_paths):
+    print "Testing get-osdmap and set-osdmap"
+    errors = 0
+    kill_daemons()
+    weight = 1 / math.e           # just some magic number in [0, 1]
+    changed = []
+    for osd_path in osd_paths:
+        if set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
+            changed.append(osd_path)
+        else:
+            logging.warning("Failed to change the weights: {0}".format(osd_path))
+    # count it as an error if none of the stores got changed
+    if not changed:
+        errors += 1
+
+    for osd_path in changed:
+        weights = get_osd_weights(CFSD_PREFIX, osd_ids, osd_path)
+        if not weights:
+            errors += 1
+            continue
+        if any(abs(w - weight) > 1e-5 for w in weights):
+            logging.warning("Weight is not changed: {0} != {1}".format(weights, weight))
+            errors += 1
+    return errors
+
+def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path):
+    # incrementals are not used unless we need to build an MOSDMap to update
+    # OSD's peers, so an obvious way to test it is simply overwrite an epoch
+    # with a different copy, and read it back to see if it matches.
+    kill_daemons()
+    file_e2 = tempfile.NamedTemporaryFile()
+    cmd = (CFSD_PREFIX + "--op get-inc-osdmap --file {file}").format(osd=osd_path,
+                                                                     file=file_e2.name)
+    output = check_output(cmd, shell=True)
+    epoch = int(re.findall('#(\d+)', output)[0])
+    # backup e1 incremental before overwriting it
+    epoch -= 1
+    file_e1_backup = tempfile.NamedTemporaryFile()
+    cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+    if ret: return 1
+    # overwrite e1 with e2
+    cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True)
+    if ret: return 1
+    # read from e1
+    file_e1_read = tempfile.NamedTemporaryFile(delete=False)
+    cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_read.name), shell=True)
+    if ret: return 1
+    errors = 0
+    try:
+        if not filecmp.cmp(file_e2.name, file_e1_read.name, shallow=False):
+            logging.error("{{get,set}}-inc-osdmap mismatch {0} != {1}".format(file_e2.name, file_e1_read.name))
+            errors += 1
+    finally:
+        # revert the change with file_e1_backup
+        cmd = CFSD_PREFIX + "--op set-inc-osdmap --epoch {epoch} --file {file}"
+        ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+        if ret:
+            logging.error("Failed to revert the changed inc-osdmap")
+            errors += 1
+    return errors
+
+
 def main(argv):
     sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
-    nullfd = open(os.devnull, "w")
+    if len(argv) > 1 and argv[1] == "debug":
+        nullfd = sys.stdout
+    else:
+        nullfd = open(os.devnull, "w")
 
-    call("rm -fr ceph_objectstore_tool_dir ; mkdir ceph_objectstore_tool_dir", shell=True)
-    os.environ["CEPH_DIR"] = CEPH_DIR;
+    call("rm -fr {dir}; mkdir {dir}".format(dir=CEPH_DIR), shell=True)
+    os.environ["CEPH_DIR"] = CEPH_DIR
     OSDDIR = os.path.join(CEPH_DIR, "dev")
     REP_POOL = "rep_pool"
     REP_NAME = "REPobject"
@@ -306,7 +587,7 @@ def main(argv):
             logging.debug(cmd)
             ret = call(cmd, shell=True, stderr=nullfd)
             if ret != 0:
-                logging.critical("Replicated pool object creation failed with {ret}".format(ret=ret))
+                logging.critical("Rados put command failed with {ret}".format(ret=ret))
                 return 1
 
             db[nspace][NAME] = {}
@@ -436,19 +717,46 @@ def main(argv):
     print "Test invalid parameters"
     # On export can't use stdout to a terminal
     cmd = (CFSD_PREFIX + "--op export --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified")
+    ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
 
     # On export can't use stdout to a terminal
     cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure_tty(cmd, "stdout is a tty and no --file filename specified")
+    ERRORS += test_failure(cmd, "stdout is a tty and no --file filename specified", tty=True)
 
+    # Prep a valid ec export file for import failure tests
+    ONEECPG = ALLECPGS[0]
+    osds = get_osds(ONEECPG, OSDDIR)
+    ONEECOSD = osds[0]
     OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
-    foofd = open(OTHERFILE, "w")
-    foofd.close()
+    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=ONEECPG, file=OTHERFILE)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
 
-    # On import can't specify a PG
-    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {FOO}").format(osd=ONEOSD, pg=ONEPG, FOO=OTHERFILE)
-    ERRORS += test_failure(cmd, "--pgid option invalid with import")
+    # On import can't specify a different shard
+    BADPG = ONEECPG.split('s')[0] + "s10"
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEECOSD, pg=BADPG, file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Can't specify a different shard, must be")
+
+    os.unlink(OTHERFILE)
+
+    # Prep a valid export file for import failure tests
+    OTHERFILE = "/tmp/foo.{pid}".format(pid=pid)
+    cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+
+    # On import can't specify a PG with a non-existent pool
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg="10.0", file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Can't specify a different pgid pool, must be")
+
+    # On import can't specify shard for a replicated export
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg}s0 --file {file}").format(osd=ONEOSD, pg=ONEPG, file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Can't specify a sharded pgid with a non-sharded export")
+
+    # On import can't specify a PG with a bad seed
+    TMPPG="{pool}.80".format(pool=REPID)
+    cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=ONEOSD, pg=TMPPG, file=OTHERFILE)
+    ERRORS += test_failure(cmd, "Illegal pgid, the seed is larger than current pg_num")
 
     os.unlink(OTHERFILE)
     cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE)
@@ -456,11 +764,11 @@ def main(argv):
 
     # On import can't use stdin from a terminal
     cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified")
+    ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
 
     # On import can't use stdin from a terminal
     cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file -").format(osd=ONEOSD, pg=ONEPG)
-    ERRORS += test_failure_tty(cmd, "stdin is a tty and no --file filename specified")
+    ERRORS += test_failure(cmd, "stdin is a tty and no --file filename specified", tty=True)
 
     # Specify a bad --type
     cmd = (CFSD_PREFIX + "--type foobar --op list --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
@@ -474,18 +782,44 @@ def main(argv):
     cmd = "./ceph-objectstore-tool --type filestore --data-path {dir}/{osd} --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG)
     ERRORS += test_failure(cmd, "Must provide --journal-path")
 
-    # Test --op list and generate json for all objects
+    cmd = (CFSD_PREFIX + "--op remove").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Must provide pgid")
+
+    # Don't specify an --op or an object command
+    cmd = CFSD_PREFIX.format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Must provide --op or object command...")
+
+    # Specify a bad --op command
+    cmd = (CFSD_PREFIX + "--op oops").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)")
+
+    # Provide just the object param not a command
+    cmd = (CFSD_PREFIX + "object").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "Invalid syntax, missing command")
+
+    # Provide an object name that doesn't exist
+    cmd = (CFSD_PREFIX + "NON_OBJECT get-bytes").format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "No object id 'NON_OBJECT' found")
+
+    # Provide an invalid object command
+    cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Unknown object command 'notacommand'")
+
     TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
     ALLPGS = OBJREPPGS + OBJECPGS
-
-    print "Test --op list variants"
     OSDS = get_osds(ALLPGS[0], OSDDIR)
     osd = OSDS[0]
 
+    print "Test all --op dump-journal"
+    ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]
+    ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
+    # Test --op list and generate json for all objects
+    print "Test --op list variants"
+
     # retrieve all objects from all PGs
+    tmpfd = open(TMPFILE, "w")
     cmd = (CFSD_PREFIX + "--op list --format json").format(osd=osd)
-    logging.debug(cmd);
-    tmpfd = open(TMPFILE, "a")
     logging.debug(cmd)
     ret = call(cmd, shell=True, stdout=tmpfd)
     if ret != 0:
@@ -494,12 +828,11 @@ def main(argv):
     tmpfd.close()
     lines = get_lines(TMPFILE)
     JSONOBJ = sorted(set(lines))
-    (pgid, jsondict) = json.loads(JSONOBJ[0])[0]
+    (pgid, coll, jsondict) = json.loads(JSONOBJ[0])[0]
 
     # retrieve all objects in a given PG
-    cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
-    logging.debug(cmd);
     tmpfd = open(OTHERFILE, "a")
+    cmd = (CFSD_PREFIX + "--op list --pgid {pg} --format json").format(osd=osd, pg=pgid)
     logging.debug(cmd)
     ret = call(cmd, shell=True, stdout=tmpfd)
     if ret != 0:
@@ -508,17 +841,16 @@ def main(argv):
     tmpfd.close()
     lines = get_lines(OTHERFILE)
     JSONOBJ = sorted(set(lines))
-    (other_pgid, other_jsondict) = json.loads(JSONOBJ[0])[0]
+    (other_pgid, other_coll, other_jsondict) = json.loads(JSONOBJ[0])[0]
 
-    if pgid != other_pgid or jsondict != other_jsondict:
+    if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
         logging.error("the first line of --op list is different "
                       "from the first line of --op list --pgid {pg}".format(pg=pgid))
         ERRORS += 1
 
     # retrieve all objects with a given name in a given PG
+    tmpfd = open(OTHERFILE, "w")
     cmd = (CFSD_PREFIX + "--op list --pgid {pg} {object} --format json").format(osd=osd, pg=pgid, object=jsondict['oid'])
-    logging.debug(cmd);
-    tmpfd = open(OTHERFILE, "a")
     logging.debug(cmd)
     ret = call(cmd, shell=True, stdout=tmpfd)
     if ret != 0:
@@ -527,9 +859,9 @@ def main(argv):
     tmpfd.close()
     lines = get_lines(OTHERFILE)
     JSONOBJ = sorted(set(lines))
-    (other_pgid, other_jsondict) in json.loads(JSONOBJ[0])[0]
+    (other_pgid, other_coll, other_jsondict) in json.loads(JSONOBJ[0])[0]
 
-    if pgid != other_pgid or jsondict != other_jsondict:
+    if pgid != other_pgid or jsondict != other_jsondict or coll != other_coll:
         logging.error("the first line of --op list is different "
                       "from the first line of --op list --pgid {pg} {object}".format(pg=pgid, object=jsondict['oid']))
         ERRORS += 1
@@ -538,8 +870,8 @@ def main(argv):
     for pg in ALLPGS:
         OSDS = get_osds(pg, OSDDIR)
         for osd in OSDS:
-            cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
             tmpfd = open(TMPFILE, "a")
+            cmd = (CFSD_PREFIX + "--op list --pgid {pg}").format(osd=osd, pg=pg)
             logging.debug(cmd)
             ret = call(cmd, shell=True, stdout=tmpfd)
             if ret != 0:
@@ -566,6 +898,7 @@ def main(argv):
             GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
             TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid)
             SETNAME = "/tmp/setbytes.{pid}".format(pid=pid)
+            BADNAME = "/tmp/badbytes.{pid}".format(pid=pid)
             for pg in OBJREPPGS:
                 OSDS = get_osds(pg, OSDDIR)
                 for osd in OSDS:
@@ -622,6 +955,37 @@ def main(argv):
                         logging.debug("Expected:")
                         cat_file(logging.DEBUG, SETNAME)
                         ERRORS += 1
+
+                    # Use set-bytes with --dry-run and make sure contents haven't changed
+                    fd = open(BADNAME, "w")
+                    data = "Bad data for --dry-run in {file}\n".format(file=file)
+                    fd.write(data)
+                    fd.close()
+                    cmd = (CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-bytes {sname}").format(osd=osd, pg=pg, json=JSON, sname=BADNAME)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-bytes --dry-run".format(ret=ret))
+                        ERRORS += 1
+                    fd = open(TESTNAME, "w")
+                    cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-bytes -").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=fd)
+                    fd.close()
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from get-bytes".format(ret=ret))
+                        ERRORS += 1
+                    cmd = "diff -q {setfile} {testfile}".format(setfile=SETNAME, testfile=TESTNAME)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Data after set-bytes --dry-run changed!")
+                        logging.debug("Got:")
+                        cat_file(logging.DEBUG, TESTNAME)
+                        logging.debug("Expected:")
+                        cat_file(logging.DEBUG, SETNAME)
+                        ERRORS += 1
+
                     fd = open(file, "r")
                     cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' set-bytes").format(osd=osd, pg=pg, json=JSON)
                     logging.debug(cmd)
@@ -629,6 +993,7 @@ def main(argv):
                     if ret != 0:
                         logging.error("Bad exit status {ret} from set-bytes to restore object".format(ret=ret))
                         ERRORS += 1
+                    fd.close()
 
     try:
         os.unlink(GETNAME)
@@ -642,6 +1007,10 @@ def main(argv):
         os.unlink(SETNAME)
     except:
         pass
+    try:
+        os.unlink(BADNAME)
+    except:
+        pass
 
     print "Test list-attrs get-attr"
     ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid)
@@ -728,6 +1097,48 @@ def main(argv):
                         logging.error("Not all keys found, remaining keys:")
                         print values
 
+    print "Test --op meta-list"
+    tmpfd = open(TMPFILE, "w")
+    cmd = (CFSD_PREFIX + "--op meta-list").format(osd=ONEOSD)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=tmpfd)
+    if ret != 0:
+        logging.error("Bad exit status {ret} from --op meta-list request".format(ret=ret))
+        ERRORS += 1
+
+    print "Test get-bytes on meta"
+    tmpfd.close()
+    lines = get_lines(TMPFILE)
+    JSONOBJ = sorted(set(lines))
+    for JSON in JSONOBJ:
+        (pgid, jsondict) = json.loads(JSON)
+        if pgid != "meta":
+            logging.error("pgid incorrect for --op meta-list {pgid}".format(pgid=pgid))
+            ERRORS += 1
+        if jsondict['namespace'] != "":
+            logging.error("namespace non null --op meta-list {ns}".format(ns=jsondict['namespace']))
+            ERRORS += 1
+        logging.info(JSON)
+        try:
+            os.unlink(GETNAME)
+        except:
+            pass
+        cmd = (CFSD_PREFIX + "'{json}' get-bytes {fname}").format(osd=ONEOSD, json=JSON, fname=GETNAME)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True)
+        if ret != 0:
+            logging.error("Bad exit status {ret}".format(ret=ret))
+            ERRORS += 1
+
+    try:
+        os.unlink(GETNAME)
+    except:
+        pass
+    try:
+        os.unlink(TESTNAME)
+    except:
+        pass
+
     print "Test pg info"
     for pg in ALLREPPGS + ALLECPGS:
         for osd in get_osds(pg, OSDDIR):
@@ -776,7 +1187,7 @@ def main(argv):
         cmd = (CFSD_PREFIX + "--op list-pgs").format(osd=osd)
         logging.debug(cmd)
         TEST_PGS = check_output(cmd, shell=True).split("\n")
-        TEST_PGS = sorted(TEST_PGS)[1:] # Skip extra blank line
+        TEST_PGS = sorted(TEST_PGS)[1:]  # Skip extra blank line
 
         if TEST_PGS != CHECK_PGS:
             logging.error("list-pgs got wrong result for osd.{osd}".format(osd=osd))
@@ -784,11 +1195,38 @@ def main(argv):
             logging.error("Got {pgs}".format(pgs=TEST_PGS))
             ERRORS += 1
 
-    print "Test pg export"
     EXP_ERRORS = 0
+    print "Test pg export --dry-run"
+    pg = ALLREPPGS[0]
+    osd = get_osds(pg, OSDDIR)[0]
+    fname = "/tmp/fname.{pid}".format(pid=pid)
+    cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    if ret != 0:
+        logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+        EXP_ERRORS += 1
+    elif os.path.exists(fname):
+        logging.error("Exporting --dry-run created file")
+        EXP_ERRORS += 1
+
+    cmd = (CFSD_PREFIX + "--dry-run --op export --pgid {pg} > {file}").format(osd=osd, pg=pg, file=fname)
+    logging.debug(cmd)
+    ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    if ret != 0:
+        logging.error("Exporting --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+        EXP_ERRORS += 1
+    else:
+        outdata = get_lines(fname)
+        if len(outdata) > 0:
+            logging.error("Exporting --dry-run to stdout not empty")
+            logging.error("Data: " + outdata)
+            EXP_ERRORS += 1
+
     os.mkdir(TESTDIR)
     for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]:
         os.mkdir(os.path.join(TESTDIR, osd))
+    print "Test pg export"
     for pg in ALLREPPGS + ALLECPGS:
         for osd in get_osds(pg, OSDDIR):
             mydir = os.path.join(TESTDIR, osd)
@@ -798,7 +1236,7 @@ def main(argv):
             elif pg == ALLREPPGS[1]:
                 cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file - > {file}").format(osd=osd, pg=pg, file=fname)
             else:
-              cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+                cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
             logging.debug(cmd)
             ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
             if ret != 0:
@@ -811,6 +1249,13 @@ def main(argv):
     RM_ERRORS = 0
     for pg in ALLREPPGS + ALLECPGS:
         for osd in get_osds(pg, OSDDIR):
+            # This should do nothing
+            cmd = (CFSD_PREFIX + "--op remove --pgid {pg} --dry-run").format(pg=pg, osd=osd)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stdout=nullfd)
+            if ret != 0:
+                logging.error("Removing --dry-run failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+                RM_ERRORS += 1
             cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd)
             logging.debug(cmd)
             ret = call(cmd, shell=True, stdout=nullfd)
@@ -828,10 +1273,17 @@ def main(argv):
             PGS = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
             for pg in PGS:
                 file = os.path.join(dir, pg)
+                # This should do nothing
+                cmd = (CFSD_PREFIX + "--op import --file {file} --dry-run").format(osd=osd, file=file)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+                    IMP_ERRORS += 1
                 if pg == PGS[0]:
                     cmd = ("cat {file} |".format(file=file) + CFSD_PREFIX + "--op import").format(osd=osd)
                 elif pg == PGS[1]:
-                    cmd = (CFSD_PREFIX + "--op import --file - < {file}").format(osd=osd, file=file)
+                    cmd = (CFSD_PREFIX + "--op import --file - --pgid {pg} < {file}").format(osd=osd, file=file, pg=pg)
                 else:
                     cmd = (CFSD_PREFIX + "--op import --file {file}").format(osd=osd, file=file)
                 logging.debug(cmd)
@@ -847,32 +1299,15 @@ def main(argv):
 
     if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
         print "Verify replicated import data"
-        for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(REP_NAME) == 0]:
-            nspace = nsfile.split("-")[0]
-            file = nsfile.split("-")[1]
-            path = os.path.join(DATADIR, nsfile)
-            tmpfd = open(TMPFILE, "w")
-            cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
-            logging.debug(cmd)
-            ret = call(cmd, shell=True, stdout=tmpfd)
-            if ret:
-                logging.critical("INTERNAL ERROR")
-                return 1
-            tmpfd.close()
-            obj_locs = get_lines(TMPFILE)
-            if len(obj_locs) == 0:
-                logging.error("Can't find imported object {name}".format(name=file))
-                ERRORS += 1
-            for obj_loc in obj_locs:
-                cmd = "diff -q {src} {obj_loc}".format(src=path, obj_loc=obj_loc)
-                logging.debug(cmd)
-                ret = call(cmd, shell=True)
-                if ret != 0:
-                    logging.error("{file} data not imported properly into {obj}".format(file=file, obj=obj_loc))
-                    ERRORS += 1
+        data_errors, _ = check_data(DATADIR, TMPFILE, OSDDIR, REP_NAME)
+        ERRORS += data_errors
     else:
         logging.warning("SKIPPING CHECKING IMPORT DATA DUE TO PREVIOUS FAILURES")
 
+    print "Test all --op dump-journal again"
+    ALLOSDS = [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]
+    ERRORS += test_dump_journal(CFSD_PREFIX, ALLOSDS)
+
     vstart(new=False)
     wait_for_health()
 
@@ -881,32 +1316,185 @@ def main(argv):
         ERRORS += verify(DATADIR, EC_POOL, EC_NAME)
 
     if EXP_ERRORS == 0:
-        NEWPOOL = "import-rados-pool"
+        NEWPOOL = "rados-import-pool"
         cmd = "./rados mkpool {pool}".format(pool=NEWPOOL)
         logging.debug(cmd)
-        ret = call(cmd, shell=True, stdout=nullfd)
+        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
 
-        print "Test import-rados"
+        print "Test rados import"
+        first = True
         for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]:
             dir = os.path.join(TESTDIR, osd)
             for pg in [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]:
                 if string.find(pg, "{id}.".format(id=REPID)) != 0:
                     continue
                 file = os.path.join(dir, pg)
-                cmd = "./ceph-objectstore-tool import-rados {pool} {file}".format(pool=NEWPOOL, file=file)
+                if first:
+                    first = False
+                    # This should do nothing
+                    cmd = "./rados import -p {pool} --dry-run {file}".format(pool=NEWPOOL, file=file)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd)
+                    if ret != 0:
+                        logging.error("Rados import --dry-run failed from {file} with {ret}".format(file=file, ret=ret))
+                        ERRORS += 1
+                    cmd = "./rados -p {pool} ls".format(pool=NEWPOOL)
+                    logging.debug(cmd)
+                    data = check_output(cmd, shell=True)
+                    if data:
+                        logging.error("'{data}'".format(data=data))
+                        logging.error("Found objects after dry-run")
+                        ERRORS += 1
+                cmd = "./rados import -p {pool} {file}".format(pool=NEWPOOL, file=file)
                 logging.debug(cmd)
                 ret = call(cmd, shell=True, stdout=nullfd)
                 if ret != 0:
-                    logging.error("Import-rados failed from {file} with {ret}".format(file=file, ret=ret))
+                    logging.error("Rados import failed from {file} with {ret}".format(file=file, ret=ret))
+                    ERRORS += 1
+                cmd = "./rados import -p {pool} --no-overwrite {file}".format(pool=NEWPOOL, file=file)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Rados import --no-overwrite failed from {file} with {ret}".format(file=file, ret=ret))
                     ERRORS += 1
 
         ERRORS += verify(DATADIR, NEWPOOL, REP_NAME)
     else:
         logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES")
 
+    # Clear directories of previous portion
+    call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
+    call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
+    os.mkdir(TESTDIR)
+    os.mkdir(DATADIR)
+
+    # Cause SPLIT_POOL to split and test import with object/log filtering
+    print "Testing import all objects after a split"
+    SPLIT_POOL = "split_pool"
+    PG_COUNT = 1
+    SPLIT_OBJ_COUNT = 5
+    SPLIT_NSPACE_COUNT = 2
+    SPLIT_NAME = "split"
+    cmd = "./ceph osd pool create {pool} {pg} {pg} replicated".format(pool=SPLIT_POOL, pg=PG_COUNT)
+    logging.debug(cmd)
+    call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+    SPLITID = get_pool_id(SPLIT_POOL, nullfd)
+    pool_size = int(check_output("./ceph osd pool get {pool} size".format(pool=SPLIT_POOL), shell=True, stderr=nullfd).split(" ")[1])
+    EXP_ERRORS = 0
+    RM_ERRORS = 0
+    IMP_ERRORS = 0
+
+    objects = range(1, SPLIT_OBJ_COUNT + 1)
+    nspaces = range(SPLIT_NSPACE_COUNT)
+    for n in nspaces:
+        nspace = get_nspace(n)
+
+        for i in objects:
+            NAME = SPLIT_NAME + "{num}".format(num=i)
+            LNAME = nspace + "-" + NAME
+            DDNAME = os.path.join(DATADIR, LNAME)
+
+            cmd = "rm -f " + DDNAME
+            logging.debug(cmd)
+            call(cmd, shell=True)
+
+            if i == 1:
+                dataline = range(DATALINECOUNT)
+            else:
+                dataline = range(1)
+            fd = open(DDNAME, "w")
+            data = "This is the split data for " + LNAME + "\n"
+            for _ in dataline:
+                fd.write(data)
+            fd.close()
+
+            cmd = "./rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=SPLIT_POOL, name=NAME, ddname=DDNAME, nspace=nspace)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=nullfd)
+            if ret != 0:
+                logging.critical("Rados put command failed with {ret}".format(ret=ret))
+                return 1
+
+    wait_for_health()
+    kill_daemons()
+
+    for osd in [f for f in os.listdir(OSDDIR) if os.path.isdir(os.path.join(OSDDIR, f)) and string.find(f, "osd") == 0]:
+        os.mkdir(os.path.join(TESTDIR, osd))
+
+    pg = "{pool}.0".format(pool=SPLITID)
+    EXPORT_PG = pg
+
+    export_osds = get_osds(pg, OSDDIR)
+    for osd in export_osds:
+        mydir = os.path.join(TESTDIR, osd)
+        fname = os.path.join(mydir, pg)
+        cmd = (CFSD_PREFIX + "--op export --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+        if ret != 0:
+            logging.error("Exporting failed for pg {pg} on {osd} with {ret}".format(pg=pg, osd=osd, ret=ret))
+            EXP_ERRORS += 1
+
+    ERRORS += EXP_ERRORS
+
+    if EXP_ERRORS == 0:
+        vstart(new=False)
+        wait_for_health()
+
+        time.sleep(20)
+
+        cmd = "./ceph osd pool set {pool} pg_num 2".format(pool=SPLIT_POOL)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stdout=nullfd, stderr=nullfd)
+        time.sleep(5)
+        wait_for_health()
+
+        time.sleep(15)
+
+        kill_daemons()
+
+        # Now 2 PGs, poolid.0 and poolid.1
+        for seed in range(2):
+            pg = "{pool}.{seed}".format(pool=SPLITID, seed=seed)
+
+            which = 0
+            for osd in get_osds(pg, OSDDIR):
+                cmd = (CFSD_PREFIX + "--op remove --pgid {pg}").format(pg=pg, osd=osd)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+
+                # This is weird.  The export files are based on only the EXPORT_PG
+                # and where that pg was before the split.  Use 'which' to use all
+                # export copies in import.
+                mydir = os.path.join(TESTDIR, export_osds[which])
+                fname = os.path.join(mydir, EXPORT_PG)
+                which += 1
+                cmd = (CFSD_PREFIX + "--op import --pgid {pg} --file {file}").format(osd=osd, pg=pg, file=fname)
+                logging.debug(cmd)
+                ret = call(cmd, shell=True, stdout=nullfd)
+                if ret != 0:
+                    logging.error("Import failed from {file} with {ret}".format(file=file, ret=ret))
+                    IMP_ERRORS += 1
+
+        ERRORS += IMP_ERRORS
+
+        # Start up again to make sure imports didn't corrupt anything
+        if IMP_ERRORS == 0:
+            print "Verify split import data"
+            data_errors, count = check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME)
+            ERRORS += data_errors
+            if count != (SPLIT_OBJ_COUNT * SPLIT_NSPACE_COUNT * pool_size):
+                logging.error("Incorrect number of replicas seen {count}".format(count=count))
+                ERRORS += 1
+            vstart(new=False)
+            wait_for_health()
+
     call("/bin/rm -rf {dir}".format(dir=TESTDIR), shell=True)
     call("/bin/rm -rf {dir}".format(dir=DATADIR), shell=True)
 
+    # vstart() starts 4 OSDs
+    ERRORS += test_get_set_osdmap(CFSD_PREFIX, range(4), ALLOSDS)
+    ERRORS += test_get_set_inc_osdmap(CFSD_PREFIX, ALLOSDS[0])
     if ERRORS == 0:
         print "TEST PASSED"
         return 0
@@ -920,5 +1508,5 @@ if __name__ == "__main__":
         status = main(sys.argv[1:])
     finally:
         kill_daemons()
-        call("/bin/rm -fr ceph_objectstore_tool_dir", shell=True)
+        call("/bin/rm -fr {dir}".format(dir=CEPH_DIR), shell=True)
     sys.exit(status)
diff --git a/src/test/cephtool-test-mds.sh b/src/test/cephtool-test-mds.sh
index f3ecd96..0fc2151 100755
--- a/src/test/cephtool-test-mds.sh
+++ b/src/test/cephtool-test-mds.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 #
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 # Copyright (C) 2013 Cloudwatt <libre.licensing at cloudwatt.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
@@ -14,6 +15,7 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
+CEPH_CLI_TEST_DUP_COMMAND=1 \
 MDS=1 MON=1 OSD=3 CEPH_START='mon osd mds' CEPH_PORT=7200 test/vstart_wrapper.sh \
     ../qa/workunits/cephtool/test.sh \
     --test-mds \
diff --git a/src/test/cephtool-test-mon.sh b/src/test/cephtool-test-mon.sh
index d24f774..d4da5f2 100755
--- a/src/test/cephtool-test-mon.sh
+++ b/src/test/cephtool-test-mon.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 #
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 # Copyright (C) 2013 Cloudwatt <libre.licensing at cloudwatt.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
@@ -14,6 +15,9 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
+CEPH_CLI_TEST_DUP_COMMAND=1 \
+# uses CEPH_PORT going from 7202 7203 and 7204 because
+# it starts at 7202 and runs 3 mons (see vstart.sh)
 MON=3 OSD=3 CEPH_START='mon osd' CEPH_PORT=7202 test/vstart_wrapper.sh \
     ../qa/workunits/cephtool/test.sh \
     --test-mon \
diff --git a/src/test/cephtool-test-osd.sh b/src/test/cephtool-test-osd.sh
index dbbe644..c016d24 100755
--- a/src/test/cephtool-test-osd.sh
+++ b/src/test/cephtool-test-osd.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 #
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 # Copyright (C) 2013 Cloudwatt <libre.licensing at cloudwatt.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
@@ -14,6 +15,7 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
+CEPH_CLI_TEST_DUP_COMMAND=1 \
 MON=1 OSD=3 CEPH_START='mon osd' CEPH_PORT=7201 test/vstart_wrapper.sh \
     ../qa/workunits/cephtool/test.sh \
     --test-osd \
diff --git a/src/test/cephtool-test-rados.sh b/src/test/cephtool-test-rados.sh
new file mode 100755
index 0000000..8f9b551
--- /dev/null
+++ b/src/test/cephtool-test-rados.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact at redhat.com>
+#
+# Author: David Zafman <dzafman at redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+CEPH_CLI_TEST_DUP_COMMAND=1 \
+MON=1 OSD=3 CEPH_START='mon osd' CEPH_PORT=7205 test/vstart_wrapper.sh \
+    test/test_rados_tool.sh
diff --git a/src/test/cli/crushtool/arg-order-checks.t b/src/test/cli/crushtool/arg-order-checks.t
new file mode 100644
index 0000000..787bb53
--- /dev/null
+++ b/src/test/cli/crushtool/arg-order-checks.t
@@ -0,0 +1,731 @@
+# tunables before decompile
+  $ crushtool -d "$TESTDIR/simple.template" --set-straw-calc-version 1 | head -2
+  # begin crush map
+  tunable straw_calc_version 1
+# build then reweight-item then tree
+  $ map="$TESTDIR/foo"
+  $ crushtool --outfn "$map" --build --num_osds 25 node straw 5 rack straw 1 root straw 0 --reweight-item osd.2 99 -o "$map" --tree
+  crushtool reweighting item osd.2 to 99
+  ID\tWEIGHT\tTYPE NAME (esc)
+  -11\t123.00000\troot root (esc)
+  -6\t103.00000\t\track rack0 (esc)
+  -1\t103.00000\t\t\tnode node0 (esc)
+  0\t1.00000\t\t\t\tosd.0 (esc)
+  1\t1.00000\t\t\t\tosd.1 (esc)
+  2\t99.00000\t\t\t\tosd.2 (esc)
+  3\t1.00000\t\t\t\tosd.3 (esc)
+  4\t1.00000\t\t\t\tosd.4 (esc)
+  -7\t5.00000\t\track rack1 (esc)
+  -2\t5.00000\t\t\tnode node1 (esc)
+  5\t1.00000\t\t\t\tosd.5 (esc)
+  6\t1.00000\t\t\t\tosd.6 (esc)
+  7\t1.00000\t\t\t\tosd.7 (esc)
+  8\t1.00000\t\t\t\tosd.8 (esc)
+  9\t1.00000\t\t\t\tosd.9 (esc)
+  -8\t5.00000\t\track rack2 (esc)
+  -3\t5.00000\t\t\tnode node2 (esc)
+  10\t1.00000\t\t\t\tosd.10 (esc)
+  11\t1.00000\t\t\t\tosd.11 (esc)
+  12\t1.00000\t\t\t\tosd.12 (esc)
+  13\t1.00000\t\t\t\tosd.13 (esc)
+  14\t1.00000\t\t\t\tosd.14 (esc)
+  -9\t5.00000\t\track rack3 (esc)
+  -4\t5.00000\t\t\tnode node3 (esc)
+  15\t1.00000\t\t\t\tosd.15 (esc)
+  16\t1.00000\t\t\t\tosd.16 (esc)
+  17\t1.00000\t\t\t\tosd.17 (esc)
+  18\t1.00000\t\t\t\tosd.18 (esc)
+  19\t1.00000\t\t\t\tosd.19 (esc)
+  -10\t5.00000\t\track rack4 (esc)
+  -5\t5.00000\t\t\tnode node4 (esc)
+  20\t1.00000\t\t\t\tosd.20 (esc)
+  21\t1.00000\t\t\t\tosd.21 (esc)
+  22\t1.00000\t\t\t\tosd.22 (esc)
+  23\t1.00000\t\t\t\tosd.23 (esc)
+  24\t1.00000\t\t\t\tosd.24 (esc)
+  $ crushtool -d "$map"
+  # begin crush map
+  tunable choose_local_tries 0
+  tunable choose_local_fallback_tries 0
+  tunable choose_total_tries 50
+  tunable chooseleaf_descend_once 1
+  tunable straw_calc_version 1
+  
+  # devices
+  device 0 osd.0
+  device 1 osd.1
+  device 2 osd.2
+  device 3 osd.3
+  device 4 osd.4
+  device 5 osd.5
+  device 6 osd.6
+  device 7 osd.7
+  device 8 osd.8
+  device 9 osd.9
+  device 10 osd.10
+  device 11 osd.11
+  device 12 osd.12
+  device 13 osd.13
+  device 14 osd.14
+  device 15 osd.15
+  device 16 osd.16
+  device 17 osd.17
+  device 18 osd.18
+  device 19 osd.19
+  device 20 osd.20
+  device 21 osd.21
+  device 22 osd.22
+  device 23 osd.23
+  device 24 osd.24
+  
+  # types
+  type 0 device
+  type 1 node
+  type 2 rack
+  type 3 root
+  
+  # buckets
+  node node0 {
+  \tid -1\t\t# do not change unnecessarily (esc)
+  \t# weight 103.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem osd.0 weight 1.000 (esc)
+  \titem osd.1 weight 1.000 (esc)
+  \titem osd.2 weight 99.000 (esc)
+  \titem osd.3 weight 1.000 (esc)
+  \titem osd.4 weight 1.000 (esc)
+  }
+  node node1 {
+  \tid -2\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem osd.5 weight 1.000 (esc)
+  \titem osd.6 weight 1.000 (esc)
+  \titem osd.7 weight 1.000 (esc)
+  \titem osd.8 weight 1.000 (esc)
+  \titem osd.9 weight 1.000 (esc)
+  }
+  node node2 {
+  \tid -3\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem osd.10 weight 1.000 (esc)
+  \titem osd.11 weight 1.000 (esc)
+  \titem osd.12 weight 1.000 (esc)
+  \titem osd.13 weight 1.000 (esc)
+  \titem osd.14 weight 1.000 (esc)
+  }
+  node node3 {
+  \tid -4\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem osd.15 weight 1.000 (esc)
+  \titem osd.16 weight 1.000 (esc)
+  \titem osd.17 weight 1.000 (esc)
+  \titem osd.18 weight 1.000 (esc)
+  \titem osd.19 weight 1.000 (esc)
+  }
+  node node4 {
+  \tid -5\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem osd.20 weight 1.000 (esc)
+  \titem osd.21 weight 1.000 (esc)
+  \titem osd.22 weight 1.000 (esc)
+  \titem osd.23 weight 1.000 (esc)
+  \titem osd.24 weight 1.000 (esc)
+  }
+  rack rack0 {
+  \tid -6\t\t# do not change unnecessarily (esc)
+  \t# weight 103.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem node0 weight 103.000 (esc)
+  }
+  rack rack1 {
+  \tid -7\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem node1 weight 5.000 (esc)
+  }
+  rack rack2 {
+  \tid -8\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem node2 weight 5.000 (esc)
+  }
+  rack rack3 {
+  \tid -9\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem node3 weight 5.000 (esc)
+  }
+  rack rack4 {
+  \tid -10\t\t# do not change unnecessarily (esc)
+  \t# weight 5.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem node4 weight 5.000 (esc)
+  }
+  root root {
+  \tid -11\t\t# do not change unnecessarily (esc)
+  \t# weight 123.000 (esc)
+  \talg straw (esc)
+  \thash 0\t# rjenkins1 (esc)
+  \titem rack0 weight 103.000 (esc)
+  \titem rack1 weight 5.000 (esc)
+  \titem rack2 weight 5.000 (esc)
+  \titem rack3 weight 5.000 (esc)
+  \titem rack4 weight 5.000 (esc)
+  }
+  
+  # rules
+  rule replicated_ruleset {
+  \truleset 0 (esc)
+  \ttype replicated (esc)
+  \tmin_size 1 (esc)
+  \tmax_size 10 (esc)
+  \tstep take root (esc)
+  \tstep chooseleaf firstn 0 type node (esc)
+  \tstep emit (esc)
+  }
+  
+  # end crush map
+# tunables before reweight
+  $ crushtool -i "$map" --set-straw-calc-version 0 --reweight --test --show-utilization --max-x 100 --min-x 1
+  rule 0 (replicated_ruleset), x = 1..100, numrep = 1..10
+  rule 0 (replicated_ruleset) num_rep 1 result size == 1:\t100/100 (esc)
+    device 0:\t\t stored : 4\t expected : 4 (esc)
+    device 1:\t\t stored : 4\t expected : 4 (esc)
+    device 2:\t\t stored : 40\t expected : 4 (esc)
+    device 3:\t\t stored : 6\t expected : 4 (esc)
+    device 4:\t\t stored : 1\t expected : 4 (esc)
+    device 5:\t\t stored : 2\t expected : 4 (esc)
+    device 7:\t\t stored : 2\t expected : 4 (esc)
+    device 8:\t\t stored : 3\t expected : 4 (esc)
+    device 9:\t\t stored : 4\t expected : 4 (esc)
+    device 12:\t\t stored : 2\t expected : 4 (esc)
+    device 13:\t\t stored : 1\t expected : 4 (esc)
+    device 14:\t\t stored : 4\t expected : 4 (esc)
+    device 15:\t\t stored : 2\t expected : 4 (esc)
+    device 16:\t\t stored : 5\t expected : 4 (esc)
+    device 17:\t\t stored : 3\t expected : 4 (esc)
+    device 19:\t\t stored : 5\t expected : 4 (esc)
+    device 20:\t\t stored : 5\t expected : 4 (esc)
+    device 21:\t\t stored : 1\t expected : 4 (esc)
+    device 22:\t\t stored : 2\t expected : 4 (esc)
+    device 23:\t\t stored : 2\t expected : 4 (esc)
+    device 24:\t\t stored : 2\t expected : 4 (esc)
+  rule 0 (replicated_ruleset) num_rep 2 result size == 2:\t100/100 (esc)
+    device 0:\t\t stored : 6\t expected : 8 (esc)
+    device 1:\t\t stored : 6\t expected : 8 (esc)
+    device 2:\t\t stored : 60\t expected : 8 (esc)
+    device 3:\t\t stored : 6\t expected : 8 (esc)
+    device 4:\t\t stored : 6\t expected : 8 (esc)
+    device 5:\t\t stored : 4\t expected : 8 (esc)
+    device 6:\t\t stored : 2\t expected : 8 (esc)
+    device 7:\t\t stored : 4\t expected : 8 (esc)
+    device 8:\t\t stored : 5\t expected : 8 (esc)
+    device 9:\t\t stored : 10\t expected : 8 (esc)
+    device 10:\t\t stored : 3\t expected : 8 (esc)
+    device 11:\t\t stored : 5\t expected : 8 (esc)
+    device 12:\t\t stored : 6\t expected : 8 (esc)
+    device 13:\t\t stored : 3\t expected : 8 (esc)
+    device 14:\t\t stored : 7\t expected : 8 (esc)
+    device 15:\t\t stored : 8\t expected : 8 (esc)
+    device 16:\t\t stored : 7\t expected : 8 (esc)
+    device 17:\t\t stored : 7\t expected : 8 (esc)
+    device 18:\t\t stored : 6\t expected : 8 (esc)
+    device 19:\t\t stored : 11\t expected : 8 (esc)
+    device 20:\t\t stored : 12\t expected : 8 (esc)
+    device 21:\t\t stored : 1\t expected : 8 (esc)
+    device 22:\t\t stored : 4\t expected : 8 (esc)
+    device 23:\t\t stored : 5\t expected : 8 (esc)
+    device 24:\t\t stored : 6\t expected : 8 (esc)
+  rule 0 (replicated_ruleset) num_rep 3 result size == 3:\t100/100 (esc)
+    device 0:\t\t stored : 8\t expected : 12 (esc)
+    device 1:\t\t stored : 6\t expected : 12 (esc)
+    device 2:\t\t stored : 69\t expected : 12 (esc)
+    device 3:\t\t stored : 6\t expected : 12 (esc)
+    device 4:\t\t stored : 6\t expected : 12 (esc)
+    device 5:\t\t stored : 8\t expected : 12 (esc)
+    device 6:\t\t stored : 9\t expected : 12 (esc)
+    device 7:\t\t stored : 7\t expected : 12 (esc)
+    device 8:\t\t stored : 14\t expected : 12 (esc)
+    device 9:\t\t stored : 16\t expected : 12 (esc)
+    device 10:\t\t stored : 6\t expected : 12 (esc)
+    device 11:\t\t stored : 11\t expected : 12 (esc)
+    device 12:\t\t stored : 9\t expected : 12 (esc)
+    device 13:\t\t stored : 8\t expected : 12 (esc)
+    device 14:\t\t stored : 7\t expected : 12 (esc)
+    device 15:\t\t stored : 8\t expected : 12 (esc)
+    device 16:\t\t stored : 9\t expected : 12 (esc)
+    device 17:\t\t stored : 11\t expected : 12 (esc)
+    device 18:\t\t stored : 9\t expected : 12 (esc)
+    device 19:\t\t stored : 16\t expected : 12 (esc)
+    device 20:\t\t stored : 18\t expected : 12 (esc)
+    device 21:\t\t stored : 5\t expected : 12 (esc)
+    device 22:\t\t stored : 15\t expected : 12 (esc)
+    device 23:\t\t stored : 8\t expected : 12 (esc)
+    device 24:\t\t stored : 11\t expected : 12 (esc)
+  rule 0 (replicated_ruleset) num_rep 4 result size == 4:\t100/100 (esc)
+    device 0:\t\t stored : 8\t expected : 16 (esc)
+    device 1:\t\t stored : 6\t expected : 16 (esc)
+    device 2:\t\t stored : 72\t expected : 16 (esc)
+    device 3:\t\t stored : 6\t expected : 16 (esc)
+    device 4:\t\t stored : 6\t expected : 16 (esc)
+    device 5:\t\t stored : 13\t expected : 16 (esc)
+    device 6:\t\t stored : 13\t expected : 16 (esc)
+    device 7:\t\t stored : 13\t expected : 16 (esc)
+    device 8:\t\t stored : 15\t expected : 16 (esc)
+    device 9:\t\t stored : 20\t expected : 16 (esc)
+    device 10:\t\t stored : 11\t expected : 16 (esc)
+    device 11:\t\t stored : 20\t expected : 16 (esc)
+    device 12:\t\t stored : 13\t expected : 16 (esc)
+    device 13:\t\t stored : 13\t expected : 16 (esc)
+    device 14:\t\t stored : 11\t expected : 16 (esc)
+    device 15:\t\t stored : 19\t expected : 16 (esc)
+    device 16:\t\t stored : 12\t expected : 16 (esc)
+    device 17:\t\t stored : 13\t expected : 16 (esc)
+    device 18:\t\t stored : 17\t expected : 16 (esc)
+    device 19:\t\t stored : 22\t expected : 16 (esc)
+    device 20:\t\t stored : 21\t expected : 16 (esc)
+    device 21:\t\t stored : 11\t expected : 16 (esc)
+    device 22:\t\t stored : 20\t expected : 16 (esc)
+    device 23:\t\t stored : 10\t expected : 16 (esc)
+    device 24:\t\t stored : 15\t expected : 16 (esc)
+  rule 0 (replicated_ruleset) num_rep 5 result size == 4:\t3/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 5 result size == 5:\t97/100 (esc)
+    device 0:\t\t stored : 8\t expected : 20 (esc)
+    device 1:\t\t stored : 6\t expected : 20 (esc)
+    device 2:\t\t stored : 74\t expected : 20 (esc)
+    device 3:\t\t stored : 6\t expected : 20 (esc)
+    device 4:\t\t stored : 6\t expected : 20 (esc)
+    device 5:\t\t stored : 17\t expected : 20 (esc)
+    device 6:\t\t stored : 17\t expected : 20 (esc)
+    device 7:\t\t stored : 19\t expected : 20 (esc)
+    device 8:\t\t stored : 18\t expected : 20 (esc)
+    device 9:\t\t stored : 27\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 28\t expected : 20 (esc)
+    device 12:\t\t stored : 22\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 17\t expected : 20 (esc)
+    device 15:\t\t stored : 22\t expected : 20 (esc)
+    device 16:\t\t stored : 14\t expected : 20 (esc)
+    device 17:\t\t stored : 19\t expected : 20 (esc)
+    device 18:\t\t stored : 20\t expected : 20 (esc)
+    device 19:\t\t stored : 25\t expected : 20 (esc)
+    device 20:\t\t stored : 24\t expected : 20 (esc)
+    device 21:\t\t stored : 19\t expected : 20 (esc)
+    device 22:\t\t stored : 25\t expected : 20 (esc)
+    device 23:\t\t stored : 13\t expected : 20 (esc)
+    device 24:\t\t stored : 18\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 6 result size == 4:\t3/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 6 result size == 5:\t97/100 (esc)
+    device 0:\t\t stored : 8\t expected : 20 (esc)
+    device 1:\t\t stored : 6\t expected : 20 (esc)
+    device 2:\t\t stored : 74\t expected : 20 (esc)
+    device 3:\t\t stored : 6\t expected : 20 (esc)
+    device 4:\t\t stored : 6\t expected : 20 (esc)
+    device 5:\t\t stored : 17\t expected : 20 (esc)
+    device 6:\t\t stored : 17\t expected : 20 (esc)
+    device 7:\t\t stored : 19\t expected : 20 (esc)
+    device 8:\t\t stored : 18\t expected : 20 (esc)
+    device 9:\t\t stored : 27\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 28\t expected : 20 (esc)
+    device 12:\t\t stored : 22\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 17\t expected : 20 (esc)
+    device 15:\t\t stored : 22\t expected : 20 (esc)
+    device 16:\t\t stored : 14\t expected : 20 (esc)
+    device 17:\t\t stored : 19\t expected : 20 (esc)
+    device 18:\t\t stored : 20\t expected : 20 (esc)
+    device 19:\t\t stored : 25\t expected : 20 (esc)
+    device 20:\t\t stored : 24\t expected : 20 (esc)
+    device 21:\t\t stored : 19\t expected : 20 (esc)
+    device 22:\t\t stored : 25\t expected : 20 (esc)
+    device 23:\t\t stored : 13\t expected : 20 (esc)
+    device 24:\t\t stored : 18\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 7 result size == 4:\t3/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 7 result size == 5:\t97/100 (esc)
+    device 0:\t\t stored : 8\t expected : 20 (esc)
+    device 1:\t\t stored : 6\t expected : 20 (esc)
+    device 2:\t\t stored : 74\t expected : 20 (esc)
+    device 3:\t\t stored : 6\t expected : 20 (esc)
+    device 4:\t\t stored : 6\t expected : 20 (esc)
+    device 5:\t\t stored : 17\t expected : 20 (esc)
+    device 6:\t\t stored : 17\t expected : 20 (esc)
+    device 7:\t\t stored : 19\t expected : 20 (esc)
+    device 8:\t\t stored : 18\t expected : 20 (esc)
+    device 9:\t\t stored : 27\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 28\t expected : 20 (esc)
+    device 12:\t\t stored : 22\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 17\t expected : 20 (esc)
+    device 15:\t\t stored : 22\t expected : 20 (esc)
+    device 16:\t\t stored : 14\t expected : 20 (esc)
+    device 17:\t\t stored : 19\t expected : 20 (esc)
+    device 18:\t\t stored : 20\t expected : 20 (esc)
+    device 19:\t\t stored : 25\t expected : 20 (esc)
+    device 20:\t\t stored : 24\t expected : 20 (esc)
+    device 21:\t\t stored : 19\t expected : 20 (esc)
+    device 22:\t\t stored : 25\t expected : 20 (esc)
+    device 23:\t\t stored : 13\t expected : 20 (esc)
+    device 24:\t\t stored : 18\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 8 result size == 4:\t3/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 8 result size == 5:\t97/100 (esc)
+    device 0:\t\t stored : 8\t expected : 20 (esc)
+    device 1:\t\t stored : 6\t expected : 20 (esc)
+    device 2:\t\t stored : 74\t expected : 20 (esc)
+    device 3:\t\t stored : 6\t expected : 20 (esc)
+    device 4:\t\t stored : 6\t expected : 20 (esc)
+    device 5:\t\t stored : 17\t expected : 20 (esc)
+    device 6:\t\t stored : 17\t expected : 20 (esc)
+    device 7:\t\t stored : 19\t expected : 20 (esc)
+    device 8:\t\t stored : 18\t expected : 20 (esc)
+    device 9:\t\t stored : 27\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 28\t expected : 20 (esc)
+    device 12:\t\t stored : 22\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 17\t expected : 20 (esc)
+    device 15:\t\t stored : 22\t expected : 20 (esc)
+    device 16:\t\t stored : 14\t expected : 20 (esc)
+    device 17:\t\t stored : 19\t expected : 20 (esc)
+    device 18:\t\t stored : 20\t expected : 20 (esc)
+    device 19:\t\t stored : 25\t expected : 20 (esc)
+    device 20:\t\t stored : 24\t expected : 20 (esc)
+    device 21:\t\t stored : 19\t expected : 20 (esc)
+    device 22:\t\t stored : 25\t expected : 20 (esc)
+    device 23:\t\t stored : 13\t expected : 20 (esc)
+    device 24:\t\t stored : 18\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 9 result size == 4:\t2/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 9 result size == 5:\t98/100 (esc)
+    device 0:\t\t stored : 8\t expected : 20 (esc)
+    device 1:\t\t stored : 6\t expected : 20 (esc)
+    device 2:\t\t stored : 74\t expected : 20 (esc)
+    device 3:\t\t stored : 6\t expected : 20 (esc)
+    device 4:\t\t stored : 6\t expected : 20 (esc)
+    device 5:\t\t stored : 17\t expected : 20 (esc)
+    device 6:\t\t stored : 17\t expected : 20 (esc)
+    device 7:\t\t stored : 19\t expected : 20 (esc)
+    device 8:\t\t stored : 18\t expected : 20 (esc)
+    device 9:\t\t stored : 28\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 28\t expected : 20 (esc)
+    device 12:\t\t stored : 22\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 17\t expected : 20 (esc)
+    device 15:\t\t stored : 22\t expected : 20 (esc)
+    device 16:\t\t stored : 14\t expected : 20 (esc)
+    device 17:\t\t stored : 19\t expected : 20 (esc)
+    device 18:\t\t stored : 20\t expected : 20 (esc)
+    device 19:\t\t stored : 25\t expected : 20 (esc)
+    device 20:\t\t stored : 24\t expected : 20 (esc)
+    device 21:\t\t stored : 19\t expected : 20 (esc)
+    device 22:\t\t stored : 25\t expected : 20 (esc)
+    device 23:\t\t stored : 13\t expected : 20 (esc)
+    device 24:\t\t stored : 18\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 10 result size == 4:\t2/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 10 result size == 5:\t98/100 (esc)
+    device 0:\t\t stored : 8\t expected : 20 (esc)
+    device 1:\t\t stored : 6\t expected : 20 (esc)
+    device 2:\t\t stored : 74\t expected : 20 (esc)
+    device 3:\t\t stored : 6\t expected : 20 (esc)
+    device 4:\t\t stored : 6\t expected : 20 (esc)
+    device 5:\t\t stored : 17\t expected : 20 (esc)
+    device 6:\t\t stored : 17\t expected : 20 (esc)
+    device 7:\t\t stored : 19\t expected : 20 (esc)
+    device 8:\t\t stored : 18\t expected : 20 (esc)
+    device 9:\t\t stored : 28\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 28\t expected : 20 (esc)
+    device 12:\t\t stored : 22\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 17\t expected : 20 (esc)
+    device 15:\t\t stored : 22\t expected : 20 (esc)
+    device 16:\t\t stored : 14\t expected : 20 (esc)
+    device 17:\t\t stored : 19\t expected : 20 (esc)
+    device 18:\t\t stored : 20\t expected : 20 (esc)
+    device 19:\t\t stored : 25\t expected : 20 (esc)
+    device 20:\t\t stored : 24\t expected : 20 (esc)
+    device 21:\t\t stored : 19\t expected : 20 (esc)
+    device 22:\t\t stored : 25\t expected : 20 (esc)
+    device 23:\t\t stored : 13\t expected : 20 (esc)
+    device 24:\t\t stored : 18\t expected : 20 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
+  $ crushtool -i "$map" --set-straw-calc-version 1 --reweight --test --show-utilization --max-x 100 --min-x 1
+  rule 0 (replicated_ruleset), x = 1..100, numrep = 1..10
+  rule 0 (replicated_ruleset) num_rep 1 result size == 1:\t100/100 (esc)
+    device 1:\t\t stored : 1\t expected : 4 (esc)
+    device 2:\t\t stored : 75\t expected : 4 (esc)
+    device 3:\t\t stored : 2\t expected : 4 (esc)
+    device 4:\t\t stored : 1\t expected : 4 (esc)
+    device 5:\t\t stored : 2\t expected : 4 (esc)
+    device 7:\t\t stored : 2\t expected : 4 (esc)
+    device 8:\t\t stored : 1\t expected : 4 (esc)
+    device 9:\t\t stored : 2\t expected : 4 (esc)
+    device 14:\t\t stored : 3\t expected : 4 (esc)
+    device 16:\t\t stored : 3\t expected : 4 (esc)
+    device 19:\t\t stored : 4\t expected : 4 (esc)
+    device 20:\t\t stored : 2\t expected : 4 (esc)
+    device 22:\t\t stored : 1\t expected : 4 (esc)
+    device 23:\t\t stored : 1\t expected : 4 (esc)
+  rule 0 (replicated_ruleset) num_rep 2 result size == 2:\t100/100 (esc)
+    device 0:\t\t stored : 1\t expected : 8 (esc)
+    device 1:\t\t stored : 1\t expected : 8 (esc)
+    device 2:\t\t stored : 95\t expected : 8 (esc)
+    device 3:\t\t stored : 2\t expected : 8 (esc)
+    device 4:\t\t stored : 1\t expected : 8 (esc)
+    device 5:\t\t stored : 3\t expected : 8 (esc)
+    device 6:\t\t stored : 3\t expected : 8 (esc)
+    device 7:\t\t stored : 7\t expected : 8 (esc)
+    device 8:\t\t stored : 4\t expected : 8 (esc)
+    device 9:\t\t stored : 8\t expected : 8 (esc)
+    device 11:\t\t stored : 1\t expected : 8 (esc)
+    device 12:\t\t stored : 4\t expected : 8 (esc)
+    device 13:\t\t stored : 2\t expected : 8 (esc)
+    device 14:\t\t stored : 6\t expected : 8 (esc)
+    device 15:\t\t stored : 5\t expected : 8 (esc)
+    device 16:\t\t stored : 4\t expected : 8 (esc)
+    device 17:\t\t stored : 8\t expected : 8 (esc)
+    device 18:\t\t stored : 5\t expected : 8 (esc)
+    device 19:\t\t stored : 9\t expected : 8 (esc)
+    device 20:\t\t stored : 7\t expected : 8 (esc)
+    device 21:\t\t stored : 5\t expected : 8 (esc)
+    device 22:\t\t stored : 6\t expected : 8 (esc)
+    device 23:\t\t stored : 5\t expected : 8 (esc)
+    device 24:\t\t stored : 8\t expected : 8 (esc)
+  rule 0 (replicated_ruleset) num_rep 3 result size == 3:\t100/100 (esc)
+    device 0:\t\t stored : 1\t expected : 12 (esc)
+    device 1:\t\t stored : 1\t expected : 12 (esc)
+    device 2:\t\t stored : 95\t expected : 12 (esc)
+    device 3:\t\t stored : 2\t expected : 12 (esc)
+    device 4:\t\t stored : 1\t expected : 12 (esc)
+    device 5:\t\t stored : 4\t expected : 12 (esc)
+    device 6:\t\t stored : 5\t expected : 12 (esc)
+    device 7:\t\t stored : 10\t expected : 12 (esc)
+    device 8:\t\t stored : 16\t expected : 12 (esc)
+    device 9:\t\t stored : 13\t expected : 12 (esc)
+    device 10:\t\t stored : 8\t expected : 12 (esc)
+    device 11:\t\t stored : 5\t expected : 12 (esc)
+    device 12:\t\t stored : 5\t expected : 12 (esc)
+    device 13:\t\t stored : 5\t expected : 12 (esc)
+    device 14:\t\t stored : 8\t expected : 12 (esc)
+    device 15:\t\t stored : 11\t expected : 12 (esc)
+    device 16:\t\t stored : 17\t expected : 12 (esc)
+    device 17:\t\t stored : 12\t expected : 12 (esc)
+    device 18:\t\t stored : 9\t expected : 12 (esc)
+    device 19:\t\t stored : 15\t expected : 12 (esc)
+    device 20:\t\t stored : 16\t expected : 12 (esc)
+    device 21:\t\t stored : 8\t expected : 12 (esc)
+    device 22:\t\t stored : 11\t expected : 12 (esc)
+    device 23:\t\t stored : 11\t expected : 12 (esc)
+    device 24:\t\t stored : 11\t expected : 12 (esc)
+  rule 0 (replicated_ruleset) num_rep 4 result size == 3:\t3/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 4 result size == 4:\t97/100 (esc)
+    device 0:\t\t stored : 1\t expected : 16 (esc)
+    device 1:\t\t stored : 1\t expected : 16 (esc)
+    device 2:\t\t stored : 95\t expected : 16 (esc)
+    device 3:\t\t stored : 2\t expected : 16 (esc)
+    device 4:\t\t stored : 1\t expected : 16 (esc)
+    device 5:\t\t stored : 11\t expected : 16 (esc)
+    device 6:\t\t stored : 12\t expected : 16 (esc)
+    device 7:\t\t stored : 16\t expected : 16 (esc)
+    device 8:\t\t stored : 19\t expected : 16 (esc)
+    device 9:\t\t stored : 18\t expected : 16 (esc)
+    device 10:\t\t stored : 12\t expected : 16 (esc)
+    device 11:\t\t stored : 12\t expected : 16 (esc)
+    device 12:\t\t stored : 13\t expected : 16 (esc)
+    device 13:\t\t stored : 11\t expected : 16 (esc)
+    device 14:\t\t stored : 16\t expected : 16 (esc)
+    device 15:\t\t stored : 19\t expected : 16 (esc)
+    device 16:\t\t stored : 19\t expected : 16 (esc)
+    device 17:\t\t stored : 15\t expected : 16 (esc)
+    device 18:\t\t stored : 11\t expected : 16 (esc)
+    device 19:\t\t stored : 18\t expected : 16 (esc)
+    device 20:\t\t stored : 22\t expected : 16 (esc)
+    device 21:\t\t stored : 12\t expected : 16 (esc)
+    device 22:\t\t stored : 14\t expected : 16 (esc)
+    device 23:\t\t stored : 13\t expected : 16 (esc)
+    device 24:\t\t stored : 14\t expected : 16 (esc)
+  rule 0 (replicated_ruleset) num_rep 5 result size == 3:\t3/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 5 result size == 4:\t43/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 5 result size == 5:\t54/100 (esc)
+    device 0:\t\t stored : 1\t expected : 20 (esc)
+    device 1:\t\t stored : 1\t expected : 20 (esc)
+    device 2:\t\t stored : 95\t expected : 20 (esc)
+    device 3:\t\t stored : 2\t expected : 20 (esc)
+    device 4:\t\t stored : 1\t expected : 20 (esc)
+    device 5:\t\t stored : 14\t expected : 20 (esc)
+    device 6:\t\t stored : 14\t expected : 20 (esc)
+    device 7:\t\t stored : 16\t expected : 20 (esc)
+    device 8:\t\t stored : 19\t expected : 20 (esc)
+    device 9:\t\t stored : 22\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 16\t expected : 20 (esc)
+    device 12:\t\t stored : 17\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 19\t expected : 20 (esc)
+    device 15:\t\t stored : 19\t expected : 20 (esc)
+    device 16:\t\t stored : 20\t expected : 20 (esc)
+    device 17:\t\t stored : 17\t expected : 20 (esc)
+    device 18:\t\t stored : 15\t expected : 20 (esc)
+    device 19:\t\t stored : 20\t expected : 20 (esc)
+    device 20:\t\t stored : 26\t expected : 20 (esc)
+    device 21:\t\t stored : 17\t expected : 20 (esc)
+    device 22:\t\t stored : 16\t expected : 20 (esc)
+    device 23:\t\t stored : 15\t expected : 20 (esc)
+    device 24:\t\t stored : 16\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 6 result size == 3:\t2/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 6 result size == 4:\t43/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 6 result size == 5:\t55/100 (esc)
+    device 0:\t\t stored : 1\t expected : 20 (esc)
+    device 1:\t\t stored : 1\t expected : 20 (esc)
+    device 2:\t\t stored : 95\t expected : 20 (esc)
+    device 3:\t\t stored : 2\t expected : 20 (esc)
+    device 4:\t\t stored : 1\t expected : 20 (esc)
+    device 5:\t\t stored : 14\t expected : 20 (esc)
+    device 6:\t\t stored : 14\t expected : 20 (esc)
+    device 7:\t\t stored : 16\t expected : 20 (esc)
+    device 8:\t\t stored : 19\t expected : 20 (esc)
+    device 9:\t\t stored : 22\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 16\t expected : 20 (esc)
+    device 12:\t\t stored : 17\t expected : 20 (esc)
+    device 13:\t\t stored : 18\t expected : 20 (esc)
+    device 14:\t\t stored : 20\t expected : 20 (esc)
+    device 15:\t\t stored : 19\t expected : 20 (esc)
+    device 16:\t\t stored : 20\t expected : 20 (esc)
+    device 17:\t\t stored : 17\t expected : 20 (esc)
+    device 18:\t\t stored : 15\t expected : 20 (esc)
+    device 19:\t\t stored : 20\t expected : 20 (esc)
+    device 20:\t\t stored : 26\t expected : 20 (esc)
+    device 21:\t\t stored : 17\t expected : 20 (esc)
+    device 22:\t\t stored : 16\t expected : 20 (esc)
+    device 23:\t\t stored : 16\t expected : 20 (esc)
+    device 24:\t\t stored : 16\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 7 result size == 3:\t2/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 7 result size == 4:\t42/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 7 result size == 5:\t56/100 (esc)
+    device 0:\t\t stored : 1\t expected : 20 (esc)
+    device 1:\t\t stored : 1\t expected : 20 (esc)
+    device 2:\t\t stored : 95\t expected : 20 (esc)
+    device 3:\t\t stored : 2\t expected : 20 (esc)
+    device 4:\t\t stored : 1\t expected : 20 (esc)
+    device 5:\t\t stored : 14\t expected : 20 (esc)
+    device 6:\t\t stored : 14\t expected : 20 (esc)
+    device 7:\t\t stored : 16\t expected : 20 (esc)
+    device 8:\t\t stored : 19\t expected : 20 (esc)
+    device 9:\t\t stored : 22\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 16\t expected : 20 (esc)
+    device 12:\t\t stored : 17\t expected : 20 (esc)
+    device 13:\t\t stored : 19\t expected : 20 (esc)
+    device 14:\t\t stored : 20\t expected : 20 (esc)
+    device 15:\t\t stored : 19\t expected : 20 (esc)
+    device 16:\t\t stored : 20\t expected : 20 (esc)
+    device 17:\t\t stored : 17\t expected : 20 (esc)
+    device 18:\t\t stored : 15\t expected : 20 (esc)
+    device 19:\t\t stored : 20\t expected : 20 (esc)
+    device 20:\t\t stored : 26\t expected : 20 (esc)
+    device 21:\t\t stored : 17\t expected : 20 (esc)
+    device 22:\t\t stored : 16\t expected : 20 (esc)
+    device 23:\t\t stored : 16\t expected : 20 (esc)
+    device 24:\t\t stored : 16\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 8 result size == 3:\t2/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 8 result size == 4:\t40/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 8 result size == 5:\t58/100 (esc)
+    device 0:\t\t stored : 1\t expected : 20 (esc)
+    device 1:\t\t stored : 1\t expected : 20 (esc)
+    device 2:\t\t stored : 95\t expected : 20 (esc)
+    device 3:\t\t stored : 2\t expected : 20 (esc)
+    device 4:\t\t stored : 1\t expected : 20 (esc)
+    device 5:\t\t stored : 14\t expected : 20 (esc)
+    device 6:\t\t stored : 14\t expected : 20 (esc)
+    device 7:\t\t stored : 16\t expected : 20 (esc)
+    device 8:\t\t stored : 19\t expected : 20 (esc)
+    device 9:\t\t stored : 22\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 16\t expected : 20 (esc)
+    device 12:\t\t stored : 17\t expected : 20 (esc)
+    device 13:\t\t stored : 20\t expected : 20 (esc)
+    device 14:\t\t stored : 20\t expected : 20 (esc)
+    device 15:\t\t stored : 19\t expected : 20 (esc)
+    device 16:\t\t stored : 20\t expected : 20 (esc)
+    device 17:\t\t stored : 17\t expected : 20 (esc)
+    device 18:\t\t stored : 16\t expected : 20 (esc)
+    device 19:\t\t stored : 20\t expected : 20 (esc)
+    device 20:\t\t stored : 26\t expected : 20 (esc)
+    device 21:\t\t stored : 17\t expected : 20 (esc)
+    device 22:\t\t stored : 16\t expected : 20 (esc)
+    device 23:\t\t stored : 16\t expected : 20 (esc)
+    device 24:\t\t stored : 16\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 9 result size == 3:\t2/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 9 result size == 4:\t37/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 9 result size == 5:\t61/100 (esc)
+    device 0:\t\t stored : 1\t expected : 20 (esc)
+    device 1:\t\t stored : 1\t expected : 20 (esc)
+    device 2:\t\t stored : 95\t expected : 20 (esc)
+    device 3:\t\t stored : 2\t expected : 20 (esc)
+    device 4:\t\t stored : 1\t expected : 20 (esc)
+    device 5:\t\t stored : 14\t expected : 20 (esc)
+    device 6:\t\t stored : 14\t expected : 20 (esc)
+    device 7:\t\t stored : 16\t expected : 20 (esc)
+    device 8:\t\t stored : 19\t expected : 20 (esc)
+    device 9:\t\t stored : 23\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 16\t expected : 20 (esc)
+    device 12:\t\t stored : 17\t expected : 20 (esc)
+    device 13:\t\t stored : 20\t expected : 20 (esc)
+    device 14:\t\t stored : 21\t expected : 20 (esc)
+    device 15:\t\t stored : 19\t expected : 20 (esc)
+    device 16:\t\t stored : 20\t expected : 20 (esc)
+    device 17:\t\t stored : 18\t expected : 20 (esc)
+    device 18:\t\t stored : 16\t expected : 20 (esc)
+    device 19:\t\t stored : 20\t expected : 20 (esc)
+    device 20:\t\t stored : 26\t expected : 20 (esc)
+    device 21:\t\t stored : 17\t expected : 20 (esc)
+    device 22:\t\t stored : 16\t expected : 20 (esc)
+    device 23:\t\t stored : 16\t expected : 20 (esc)
+    device 24:\t\t stored : 16\t expected : 20 (esc)
+  rule 0 (replicated_ruleset) num_rep 10 result size == 3:\t2/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 10 result size == 4:\t36/100 (esc)
+  rule 0 (replicated_ruleset) num_rep 10 result size == 5:\t62/100 (esc)
+    device 0:\t\t stored : 1\t expected : 20 (esc)
+    device 1:\t\t stored : 1\t expected : 20 (esc)
+    device 2:\t\t stored : 95\t expected : 20 (esc)
+    device 3:\t\t stored : 2\t expected : 20 (esc)
+    device 4:\t\t stored : 1\t expected : 20 (esc)
+    device 5:\t\t stored : 14\t expected : 20 (esc)
+    device 6:\t\t stored : 14\t expected : 20 (esc)
+    device 7:\t\t stored : 16\t expected : 20 (esc)
+    device 8:\t\t stored : 19\t expected : 20 (esc)
+    device 9:\t\t stored : 23\t expected : 20 (esc)
+    device 10:\t\t stored : 15\t expected : 20 (esc)
+    device 11:\t\t stored : 17\t expected : 20 (esc)
+    device 12:\t\t stored : 17\t expected : 20 (esc)
+    device 13:\t\t stored : 20\t expected : 20 (esc)
+    device 14:\t\t stored : 21\t expected : 20 (esc)
+    device 15:\t\t stored : 19\t expected : 20 (esc)
+    device 16:\t\t stored : 20\t expected : 20 (esc)
+    device 17:\t\t stored : 18\t expected : 20 (esc)
+    device 18:\t\t stored : 16\t expected : 20 (esc)
+    device 19:\t\t stored : 20\t expected : 20 (esc)
+    device 20:\t\t stored : 26\t expected : 20 (esc)
+    device 21:\t\t stored : 17\t expected : 20 (esc)
+    device 22:\t\t stored : 16\t expected : 20 (esc)
+    device 23:\t\t stored : 16\t expected : 20 (esc)
+    device 24:\t\t stored : 16\t expected : 20 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/build.t b/src/test/cli/crushtool/build.t
index 4b98b3c..57fb2b2 100644
--- a/src/test/cli/crushtool/build.t
+++ b/src/test/cli/crushtool/build.t
@@ -4,21 +4,6 @@
 # display the crush tree by default
 #
   $ crushtool --outfn "$map" --build --num_osds 5 node straw 2 rack straw 1 root straw 0
-  .* (re)
-  ID\tWEIGHT\tTYPE NAME (esc)
-  -7\t5.00000\troot root (esc)
-  -4\t2.00000\t\track rack0 (esc)
-  -1\t2.00000\t\t\tnode node0 (esc)
-  0\t1.00000\t\t\t\tosd.0 (esc)
-  1\t1.00000\t\t\t\tosd.1 (esc)
-  -5\t2.00000\t\track rack1 (esc)
-  -2\t2.00000\t\t\tnode node1 (esc)
-  2\t1.00000\t\t\t\tosd.2 (esc)
-  3\t1.00000\t\t\t\tosd.3 (esc)
-  -6\t1.00000\t\track rack2 (esc)
-  -3\t1.00000\t\t\tnode node2 (esc)
-  4\t1.00000\t\t\t\tosd.4 (esc)
-  
 
 #  
 # silence all messages with --debug-crush 0
@@ -29,20 +14,6 @@
 # display a warning if there is more than one root
 #
   $ crushtool --outfn "$map" --build --num_osds 5 node straw 2 rack straw 1 
-  .* (re)
-  ID\tWEIGHT\tTYPE NAME (esc)
-  -6\t1.00000\track rack2 (esc)
-  -3\t1.00000\t\tnode node2 (esc)
-  4\t1.00000\t\t\tosd.4 (esc)
-  -5\t2.00000\track rack1 (esc)
-  -2\t2.00000\t\tnode node1 (esc)
-  2\t1.00000\t\t\tosd.2 (esc)
-  3\t1.00000\t\t\tosd.3 (esc)
-  -4\t2.00000\track rack0 (esc)
-  -1\t2.00000\t\tnode node0 (esc)
-  0\t1.00000\t\t\tosd.0 (esc)
-  1\t1.00000\t\t\tosd.1 (esc)
-  
   .* The crush rulesets will use the root rack0 (re)
   and ignore the others.
   There are 3 roots, they can be
@@ -62,7 +33,7 @@
   tunable chooseleaf_descend_once 1
   
   # devices
-  device 0 device0
+  device 0 osd.0
   
   # types
   type 0 device
@@ -74,7 +45,7 @@
   \t# weight 1.000 (esc)
   \talg straw (esc)
   \thash 0\t# rjenkins1 (esc)
-  \titem device0 weight 1.000 (esc)
+  \titem osd.0 weight 1.000 (esc)
   }
   
   # rules
diff --git a/src/test/cli/crushtool/check-names.empty.t b/src/test/cli/crushtool/check-names.empty.t
index 755e931..9e30790 100644
--- a/src/test/cli/crushtool/check-names.empty.t
+++ b/src/test/cli/crushtool/check-names.empty.t
@@ -1,5 +1,4 @@
   $ crushtool -c "$TESTDIR/check-names.empty.crushmap.txt" -o "$TESTDIR/check-names.empty.crushmap"
-  $ crushtool -i "$TESTDIR/check-names.empty.crushmap" --check 0
+  $ crushtool -i "$TESTDIR/check-names.empty.crushmap" --check-names
   unknown type name: item#0
-  [1]
   $ rm -f "$TESTDIR/check-names.empty.crushmap"
diff --git a/src/test/cli/crushtool/check-names.max-id.t b/src/test/cli/crushtool/check-names.max-id.t
index dab2d31..18724ff 100644
--- a/src/test/cli/crushtool/check-names.max-id.t
+++ b/src/test/cli/crushtool/check-names.max-id.t
@@ -4,4 +4,4 @@
   $ crushtool -i check-names.crushmap       --add-item 2 1.0 device2 --loc host host0 --loc cluster cluster0 -o check-names.crushmap > /dev/null
   $ crushtool -i check-names.crushmap --check 2
   item id too large: item#2
-  [1]
+  $ crushtool -i check-names.crushmap --check
diff --git a/src/test/cli/crushtool/help.t b/src/test/cli/crushtool/help.t
index 9c81b7c..4c21912 100644
--- a/src/test/cli/crushtool/help.t
+++ b/src/test/cli/crushtool/help.t
@@ -1,23 +1,48 @@
   $ crushtool --help
   usage: crushtool ...
+  
+  Display, modify and test a crush map
+  
+  There are five stages, running one after the other:
+  
+   - input/build
+   - tunables adjustments
+   - modifications
+   - display/test
+   - output
+  
+  Options that are not specific to a stage.
+  
+     [--infn|-i infile]
+                           read the crush map from infile
+  
+  Options for the input/build stage
+  
      --decompile|-d map    decompile a crush map to source
-     --tree                print map summary as a tree
-     --compile|-c map.txt  compile a map from source
-     [-o outfile [--clobber]]
+     [--outfn|-o outfile]
                            specify output for for (de)compilation
+     --compile|-c map.txt  compile a map from source
+     --enable-unsafe-tunables compile with unsafe tunables
      --build --num_osds N layer1 ...
                            build a new map, where each 'layer' is
                              'name (uniform|straw|list|tree) size'
-     -i mapfn --test       test a range of inputs on the map
-        [--min-x x] [--max-x x] [--x x]
-        [--min-rule r] [--max-rule r] [--rule r]
-        [--num-rep n]
-        [--batches b]      split the CRUSH mapping into b > 1 rounds
-        [--weight|-w devno weight]
-                           where weight is 0 to 1.0
-        [--simulate]       simulate placements using a random
-                           number generator in place of the CRUSH
-                           algorithm
+  
+  Options for the tunables adjustments stage
+  
+     --set-choose-local-tries N
+                           set choose local retries before re-descent
+     --set-choose-local-fallback-tries N
+                           set choose local retries using fallback
+                           permutation before re-descent
+     --set-choose-total-tries N
+                           set choose total descent attempts
+     --set-chooseleaf-descend-once <0|1>
+                           set chooseleaf to (not) retry the recursive descent
+     --set-chooseleaf-vary-r <0|1>
+                           set chooseleaf to (not) vary r based on parent
+  
+  Options for the modifications stage
+  
      -i mapfn --add-item id weight name [--loc type name ...]
                            insert an item into the hierarchy at the
                            given location
@@ -33,9 +58,20 @@
   
   Options for the display/test stage
   
-     --check max_id        check if any item is referencing an unknown name/type
+     --tree                print map summary as a tree
+     --check [max_id]      check if any item is referencing an unknown name/type
      -i mapfn --show-location id
                            show location for given device id
+     -i mapfn --test       test a range of inputs on the map
+        [--min-x x] [--max-x x] [--x x]
+        [--min-rule r] [--max-rule r] [--rule r]
+        [--num-rep n]
+        [--batches b]      split the CRUSH mapping into b > 1 rounds
+        [--weight|-w devno weight]
+                           where weight is 0 to 1.0
+        [--simulate]       simulate placements using a random
+                           number generator in place of the CRUSH
+                           algorithm
      --show-utilization    show OSD usage
      --show utilization-all
                            include zero weight items
@@ -43,17 +79,6 @@
      --show-mappings       show mappings
      --show-bad-mappings   show bad mappings
      --show-choose-tries   show choose tries histogram
-     --set-choose-local-tries N
-                           set choose local retries before re-descent
-     --set-choose-local-fallback-tries N
-                           set choose local retries using fallback
-                           permutation before re-descent
-     --set-choose-total-tries N
-                           set choose total descent attempts
-     --set-chooseleaf-descend-once <0|1>
-                           set chooseleaf to (not) retry the recursive descent
-     --set-chooseleaf-vary-r <0|1>
-                           set chooseleaf to (not) vary r based on parent
      --output-name name
                            prepend the data file(s) generated during the
                            testing routine with name
@@ -61,6 +86,12 @@
                            export select data generated during testing routine
                            to CSV files for off-line post-processing
                            use --help-output for more information
+  
+  Options for the output stage
+  
+     [--outfn|-o outfile]
+                           specify output for for modified crush map
+  
   $ crushtool --help-output
   data output from testing routine ...
             absolute_weights
diff --git a/src/test/cli/crushtool/set-choose.t b/src/test/cli/crushtool/set-choose.t
index b40494d..42df9f2 100644
--- a/src/test/cli/crushtool/set-choose.t
+++ b/src/test/cli/crushtool/set-choose.t
@@ -1,6 +1,5 @@
   $ crushtool -c "$TESTDIR/set-choose.crushmap.txt" -o set-choose.crushmap
   $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --set-straw-calc-version 0
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (choose), x = 0..1023, numrep = 2..3
   CRUSH rule 0 x 0 [0,3]
   CRUSH rule 0 x 1 [0,8]
@@ -12307,8 +12306,8 @@
   CRUSH rule 5 x 1022 [1,6,4]
   CRUSH rule 5 x 1023 [3,2,8]
   rule 5 (chooseleaf-set) num_rep 3 result size == 3:\t1024/1024 (esc)
-  $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --weight 0 0 --weight 1 0 --weight 3 0 --weight 4 0 --set-straw-calc-version 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
+  $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --weight 0 0 --weight 1 0 --weight 3 0 --weight 4 0 --set-straw-calc-version 0
   rule 0 (choose), x = 0..1023, numrep = 2..3
   CRUSH rule 0 x 0 [2,5]
   CRUSH rule 0 x 1 [2,8]
@@ -24620,8 +24619,8 @@
   CRUSH rule 5 x 1022 [2,6,5]
   CRUSH rule 5 x 1023 [5,2,8]
   rule 5 (chooseleaf-set) num_rep 3 result size == 3:\t1024/1024 (esc)
-  $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --weight 0 0 --weight 3 0 --weight 4 .5 --weight 5 0 --weight 6 .1 --weight 7 0 --set-straw-calc-version 0
   crushtool successfully built or modified map.  Use '-o <file>' to write it out.
+  $ crushtool -i set-choose.crushmap --test --show-mappings --show-statistics --weight 0 0 --weight 3 0 --weight 4 .5 --weight 5 0 --weight 6 .1 --weight 7 0 --set-straw-calc-version 0
   rule 0 (choose), x = 0..1023, numrep = 2..3
   CRUSH rule 0 x 0 [2,4]
   CRUSH rule 0 x 1 [2,8]
@@ -36942,3 +36941,4 @@
   CRUSH rule 5 x 1023 [4,2,8]
   rule 5 (chooseleaf-set) num_rep 3 result size == 2:\t501/1024 (esc)
   rule 5 (chooseleaf-set) num_rep 3 result size == 3:\t523/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-bobtail-tunables.t b/src/test/cli/crushtool/test-map-bobtail-tunables.t
index 77f2ce8..5037cfd 100644
--- a/src/test/cli/crushtool/test-map-bobtail-tunables.t
+++ b/src/test/cli/crushtool/test-map-bobtail-tunables.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-a.crushmap" --test --show-mappings --show-statistics --rule 0 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 1
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (data), x = 0..1023, numrep = 1..10
   CRUSH rule 0 x 0 [36]
   CRUSH rule 0 x 1 [876]
@@ -10251,3 +10250,4 @@
   CRUSH rule 0 x 1022 [967,829,973,640,703,470,871,828,440,449]
   CRUSH rule 0 x 1023 [488,257,614,859,325,419,50,560,595,554]
   rule 0 (data) num_rep 10 result size == 10:\t1024/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-firefly-tunables.t b/src/test/cli/crushtool/test-map-firefly-tunables.t
index a75e89f..93bba48 100644
--- a/src/test/cli/crushtool/test-map-firefly-tunables.t
+++ b/src/test/cli/crushtool/test-map-firefly-tunables.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 0 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 1 --set-chooseleaf-vary-r 1 --weight 12 0 --weight 20 0 --weight 30 0
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 0 (data), x = 0..1023, numrep = 1..10
   CRUSH rule 0 x 0 [101]
   CRUSH rule 0 x 1 [80]
@@ -10257,3 +10256,4 @@
   CRUSH rule 0 x 1022 [73,21,36]
   CRUSH rule 0 x 1023 [0,16,3]
   rule 0 (data) num_rep 10 result size == 3:\t1024/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-indep.t b/src/test/cli/crushtool/test-map-indep.t
index f4ee371..37eb7b5 100644
--- a/src/test/cli/crushtool/test-map-indep.t
+++ b/src/test/cli/crushtool/test-map-indep.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-indep.crushmap" --test --show-mappings --show-statistics --rule 1 --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 --set-chooseleaf-descend-once 2
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 1 (metadata), x = 0..1023, numrep = 1..10
   CRUSH rule 1 x 0 [36]
   CRUSH rule 1 x 1 [876]
@@ -10251,3 +10250,4 @@
   CRUSH rule 1 x 1022 [967,829,973,640,703,470,871,828,440,449]
   CRUSH rule 1 x 1023 [488,257,614,859,325,419,50,560,595,554]
   rule 1 (metadata) num_rep 10 result size == 10:\t1024/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-vary-r-0.t b/src/test/cli/crushtool/test-map-vary-r-0.t
index eefd862..389e3cf 100644
--- a/src/test/cli/crushtool/test-map-vary-r-0.t
+++ b/src/test/cli/crushtool/test-map-vary-r-0.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 0 --weight 0 0 --weight 4 0 --weight 9 0
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,85]
   CRUSH rule 3 x 1 [73,78]
@@ -3079,3 +3078,4 @@
   CRUSH rule 3 x 1023 [59,88]
   rule 3 (delltestrule) num_rep 4 result size == 1:\t27/1024 (esc)
   rule 3 (delltestrule) num_rep 4 result size == 2:\t997/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-vary-r-1.t b/src/test/cli/crushtool/test-map-vary-r-1.t
index a21b9d5..f588860 100644
--- a/src/test/cli/crushtool/test-map-vary-r-1.t
+++ b/src/test/cli/crushtool/test-map-vary-r-1.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 1 --weight 0 0 --weight 4 0 --weight 9 0
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,6]
   CRUSH rule 3 x 1 [73,52]
@@ -3076,3 +3075,4 @@
   CRUSH rule 3 x 1022 [73,34]
   CRUSH rule 3 x 1023 [88,79]
   rule 3 (delltestrule) num_rep 4 result size == 2:\t1024/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-vary-r-2.t b/src/test/cli/crushtool/test-map-vary-r-2.t
index eaf0542..efc1932 100644
--- a/src/test/cli/crushtool/test-map-vary-r-2.t
+++ b/src/test/cli/crushtool/test-map-vary-r-2.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 2 --weight 0 0 --weight 4 0 --weight 9 0
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,45]
   CRUSH rule 3 x 1 [73,78]
@@ -3076,3 +3075,4 @@
   CRUSH rule 3 x 1022 [73,106]
   CRUSH rule 3 x 1023 [88,89]
   rule 3 (delltestrule) num_rep 4 result size == 2:\t1024/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-vary-r-3.t b/src/test/cli/crushtool/test-map-vary-r-3.t
index 31943b2..ec41ecb 100644
--- a/src/test/cli/crushtool/test-map-vary-r-3.t
+++ b/src/test/cli/crushtool/test-map-vary-r-3.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 3 --weight 0 0 --weight 4 0 --weight 9 0
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,85]
   CRUSH rule 3 x 1 [73,78]
@@ -3076,3 +3075,4 @@
   CRUSH rule 3 x 1022 [73,28]
   CRUSH rule 3 x 1023 [83,88]
   rule 3 (delltestrule) num_rep 4 result size == 2:\t1024/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/crushtool/test-map-vary-r-4.t b/src/test/cli/crushtool/test-map-vary-r-4.t
index 24cf0ba..e32a194 100644
--- a/src/test/cli/crushtool/test-map-vary-r-4.t
+++ b/src/test/cli/crushtool/test-map-vary-r-4.t
@@ -1,5 +1,4 @@
   $ crushtool -i "$TESTDIR/test-map-vary-r.crushmap" --test --show-mappings --show-statistics --rule 3 --set-chooseleaf-vary-r 4 --weight 0 0 --weight 4 0 --weight 9 0
-  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
   rule 3 (delltestrule), x = 0..1023, numrep = 2..4
   CRUSH rule 3 x 0 [94,85]
   CRUSH rule 3 x 1 [73,78]
@@ -3076,3 +3075,4 @@
   CRUSH rule 3 x 1022 [73,28]
   CRUSH rule 3 x 1023 [59,88]
   rule 3 (delltestrule) num_rep 4 result size == 2:\t1024/1024 (esc)
+  crushtool successfully built or modified map.  Use '-o <file>' to write it out.
diff --git a/src/test/cli/osdmaptool/pool.t b/src/test/cli/osdmaptool/pool.t
index 7a74c51..5441a34 100644
--- a/src/test/cli/osdmaptool/pool.t
+++ b/src/test/cli/osdmaptool/pool.t
@@ -10,7 +10,7 @@
   [1]
 
   $ osdmaptool myosdmap --test-map-object foo --pool bar
-  strict_strtoll: expected integer, got: 'bar'
+  The option value 'bar' is invalid
   [1]
 
   $ osdmaptool myosdmap --test-map-object foo --pool 123
@@ -35,7 +35,7 @@
   [1]
 
   $ osdmaptool myosdmap --test-map-pgs --pool baz
-  strict_strtoll: expected integer, got: 'baz'
+  The option value 'baz' is invalid
   [1]
 
   $ osdmaptool myosdmap --test-map-pgs --pool 123
diff --git a/src/test/cli/osdmaptool/tree.t b/src/test/cli/osdmaptool/tree.t
new file mode 100644
index 0000000..00eb0be
--- /dev/null
+++ b/src/test/cli/osdmaptool/tree.t
@@ -0,0 +1,19 @@
+  $ osdmaptool --createsimple 3 om
+  osdmaptool: osdmap file 'om'
+  osdmaptool: writing epoch 1 to om
+
+  $ osdmaptool --tree=plain om
+  osdmaptool: osdmap file 'om'
+  ID WEIGHT  TYPE NAME              UP/DOWN REWEIGHT PRIMARY-AFFINITY 
+  -1 3.00000 root default                                             
+  -3 3.00000     rack localrack                                       
+  -2 3.00000         host localhost                                   
+   0 1.00000             osd.0          DNE        0                  
+   1 1.00000             osd.1          DNE        0                  
+   2 1.00000             osd.2          DNE        0                  
+
+  $ osdmaptool --tree=json om
+  osdmaptool: osdmap file 'om'
+  {"nodes":[{"id":-1,"name":"default","type":"root","type_id":10,"children":[-3]},{"id":-3,"name":"localrack","type":"rack","type_id":3,"children":[-2]},{"id":-2,"name":"localhost","type":"host","type_id":1,"children":[2,1,0]},{"id":0,"name":"osd.0","type":"osd","type_id":0,"crush_weight":1.000000,"depth":3,"exists":0,"status":"down","reweight":0.000000,"primary_affinity":1.000000},{"id":1,"name":"osd.1","type":"osd","type_id":0,"crush_weight":1.000000,"depth":3,"exists":0,"status":"down [...]
+  $ rm -f om
+
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index bdb7324..2ef9573 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -24,6 +24,7 @@
     bucket check               check bucket index
     object rm                  remove object
     object unlink              unlink object from bucket index
+    objects expire             run expired objects cleanup
     quota set                  set quota params
     quota enable               enable quota
     quota disable              disable quota
@@ -124,6 +125,7 @@
      --caps=<caps>             list of caps (e.g., "usage=read, write; user=read"
      --yes-i-really-mean-it    required for certain operations
      --reset-regions           reset regionmap when regionmap update
+ 
   <date> := "YYYY-MM-DD[ hh:mm:ss]"
   
   Quota options:
@@ -136,6 +138,8 @@
     --id/-i ID        set ID portion of my name
     --name/-n TYPE.ID set name
     --cluster NAME    set cluster name (default: ceph)
+    --setuser USER    set uid to user or uid (and gid to user's gid)
+    --setgroup GROUP  set gid to group or gid
     --version         show version and quit
   
   [1]
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index 0263781..ad6a9ee 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -1,89 +1,109 @@
   $ rbd --help
   usage: rbd [-n <auth user>] [OPTIONS] <cmd> ...
   where 'pool' is a rados pool name (default is 'rbd') and 'cmd' is one of:
-    (ls | list) [-l | --long ] [pool-name] list rbd images
+    (ls | list) [-l | --long ] [pool-name]      list rbd images
                                                 (-l includes snapshots/clones)
-    info <image-name>                           show information about image size,
+    (du | disk-usage) [<image-spec> | <snap-spec>]
+                                                show disk usage stats for pool,
+                                                image or snapshot
+    info <image-spec> | <snap-spec>             show information about image size,
                                                 striping, etc.
     create [--order <bits>] [--image-features <features>] [--image-shared]
-           --size <MB> <name>                   create an empty image
+           --size <M/G/T> <image-spec>          create an empty image
     clone [--order <bits>] [--image-features <features>] [--image-shared]
-          <parentsnap> <clonename>              clone a snapshot into a COW
+           <parent-snap-spec> <child-image-spec>
+                                                clone a snapshot into a COW
                                                 child image
-    children <snap-name>                        display children of snapshot
-    flatten <image-name>                        fill clone with parent data
+    children <snap-spec>                        display children of snapshot
+    flatten <image-spec>                        fill clone with parent data
                                                 (make it independent)
-    resize --size <MB> <image-name>             resize (expand or contract) image
-    rm <image-name>                             delete an image
-    export <image-name> <path>                  export image to file
+    resize --size <M/G/T> <image-spec>          resize (expand or contract) image
+    rm <image-spec>                             delete an image
+    export (<image-spec> | <snap-spec>) [<path>]
+                                                export image to file
                                                 "-" for stdout
     import [--image-features <features>] [--image-shared]
-           <path> <image-name>                  import image from file (dest
-                                                defaults as the filename part
-                                                of file). "-" for stdin
-    diff <image-name> [--from-snap <snap-name>] print extents that differ since
+           <path> [<image-spec>]                import image from file
+                                                "-" for stdin
+                                                "rbd/$(basename <path>)" is
+                                                assumed for <image-spec> if
+                                                omitted
+    diff [--from-snap <snap-name>] [--whole-object]
+           <image-spec> | <snap-spec>           print extents that differ since
                                                 a previous snap, or image creation
-    export-diff <image-name> [--from-snap <snap-name>] <path>
-                                                export an incremental diff to
+    export-diff [--from-snap <snap-name>] [--whole-object]
+           (<image-spec> | <snap-spec>) <path>  export an incremental diff to
                                                 path, or "-" for stdout
     merge-diff <diff1> <diff2> <path>           merge <diff1> and <diff2> into
                                                 <path>, <diff1> could be "-"
                                                 for stdin, and <path> could be "-"
                                                 for stdout
-    import-diff <path> <image-name>             import an incremental diff from
+    import-diff <path> <image-spec>             import an incremental diff from
                                                 path or "-" for stdin
-    (cp | copy) <src> <dest>                    copy src image to dest
-    (mv | rename) <src> <dest>                  rename src image to dest
-    snap ls <image-name>                        dump list of image snapshots
-    snap create <snap-name>                     create a snapshot
-    snap rollback <snap-name>                   rollback image to snapshot
-    snap rm <snap-name>                         deletes a snapshot
-    snap purge <image-name>                     deletes all snapshots
-    snap protect <snap-name>                    prevent a snapshot from being deleted
-    snap unprotect <snap-name>                  allow a snapshot to be deleted
-    watch <image-name>                          watch events on image
-    status <image-name>                         show the status of this image
-    map <image-name>                            map image to a block device
+    (cp | copy) (<src-image-spec> | <src-snap-spec>) <dest-image-spec>
+                                                copy src image to dest
+    (mv | rename) <src-image-spec> <dest-image-spec>
+                                                rename src image to dest
+    image-meta list <image-spec>                image metadata list keys with values
+    image-meta get <image-spec> <key>           image metadata get the value associated with the key
+    image-meta set <image-spec> <key> <value>   image metadata set key with value
+    image-meta remove <image-spec> <key>        image metadata remove the key and value associated
+    object-map rebuild <image-spec> | <snap-spec>
+                                                rebuild an invalid object map
+    snap ls <image-spec>                        dump list of image snapshots
+    snap create <snap-spec>                     create a snapshot
+    snap rollback <snap-spec>                   rollback image to snapshot
+    snap rm <snap-spec>                         deletes a snapshot
+    snap purge <image-spec>                     deletes all snapshots
+    snap protect <snap-spec>                    prevent a snapshot from being deleted
+    snap unprotect <snap-spec>                  allow a snapshot to be deleted
+    watch <image-spec>                          watch events on image
+    status <image-spec>                         show the status of this image
+    map <image-spec> | <snap-spec>              map image to a block device
                                                 using the kernel
-    unmap <device>                              unmap a rbd device that was
+    unmap <image-spec> | <snap-spec> | <device> unmap a rbd device that was
                                                 mapped by the kernel
     showmapped                                  show the rbd images mapped
                                                 by the kernel
-    lock list <image-name>                      show locks held on an image
-    lock add <image-name> <id> [--shared <tag>] take a lock called id on an image
-    lock remove <image-name> <id> <locker>      release a lock on an image
-    bench-write <image-name>                    simple write benchmark
-                   --io-size <bytes>              write size
-                   --io-threads <num>             ios in flight
-                   --io-total <bytes>             total bytes to write
-                   --io-pattern <seq|rand>        write pattern
+    feature disable <image-spec> <feature>      disable the specified image feature
+    feature enable <image-spec> <feature>       enable the specified image feature
+    lock list <image-spec>                      show locks held on an image
+    lock add <image-spec> <id> [--shared <tag>] take a lock called id on an image
+    lock remove <image-spec> <id> <locker>      release a lock on an image
+    bench-write <image-spec>                    simple write benchmark
+                 --io-size <size in B/K/M/G/T>    write size
+                 --io-threads <num>               ios in flight
+                 --io-total <size in B/K/M/G/T>   total size to write
+                 --io-pattern <seq|rand>          write pattern
   
-  <image-name>, <snap-name> are [pool/]name[@snap], or you may specify
-  individual pieces of names with -p/--pool, --image, and/or --snap.
+  <image-spec> is [<pool-name>]/<image-name>,
+  <snap-spec> is [<pool-name>]/<image-name>@<snap-name>,
+  or you may specify individual pieces of names with -p/--pool <pool-name>,
+  --image <image-name> and/or --snap <snap-name>.
   
   Other input options:
-    -p, --pool <pool>                  source pool name
+    -p, --pool <pool-name>             source pool name
+    --dest-pool <pool-name>            destination pool name
     --image <image-name>               image name
-    --dest <image-name>                destination [pool and] image name
+    --dest <image-name>                destination image name
     --snap <snap-name>                 snapshot name
-    --dest-pool <name>                 destination pool name
     --path <path-name>                 path name for import/export
-    --size <size in MB>                size of image for create and resize
+    -s, --size <size in M/G/T>         size of image for create and resize
     --order <bits>                     the object size in bits; object size will be
                                        (1 << order) bytes. Default is 22 (4 MB).
     --image-format <format-number>     format to use when creating an image
-                                       format 1 is the original format (default)
-                                       format 2 supports cloning
-    --image-features <features>        optional format 2 features to enable
-                                       +1 layering support, +2 striping v2,
-                                       +4 exclusive lock, +8 object map
+                                       format 1 is the original format
+                                       format 2 supports cloning (default)
+    --image-feature <feature>          optional format 2 feature to enable.
+                                       use multiple times to enable multiple features
     --image-shared                     image will be used concurrently (disables
                                        RBD exclusive lock and dependent features)
-    --stripe-unit <size-in-bytes>      size (in bytes) of a block of data
+    --stripe-unit <size in B/K/M>      size of a block of data
     --stripe-count <num>               number of consecutive objects in a stripe
     --id <username>                    rados user (without 'client.'prefix) to
                                        authenticate as
     --keyfile <path>                   file containing secret key for use with cephx
+    --keyring <path>                   file containing keyring for use with cephx
     --shared <tag>                     take a shared (rather than exclusive) lock
     --format <output-format>           output format (default: plain, json, xml)
     --pretty-format                    make json or xml output more readable
@@ -91,3 +111,9 @@
     -o, --options <map-options>        options to use when mapping an image
     --read-only                        set device readonly when mapping image
     --allow-shrink                     allow shrinking of an image when resizing
+  
+  Supported image features:
+    layering (+), striping (+), exclusive-lock (*), object-map (*), fast-diff (*), deep-flatten
+  
+    (*) supports enabling/disabling on existing images
+    (+) enabled by default for new images if features are not specified
diff --git a/src/test/cli/rbd/invalid-snap-usage.t b/src/test/cli/rbd/invalid-snap-usage.t
index 40fc606..6735dc4 100644
--- a/src/test/cli/rbd/invalid-snap-usage.t
+++ b/src/test/cli/rbd/invalid-snap-usage.t
@@ -1,36 +1,109 @@
-  $ rbd resize --snap=snap1 img
+  $ rbd create foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd resize img at snap
+  $ rbd flatten foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd import --snap=snap1 /bin/ls ls
+  $ rbd resize foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd create --snap=snap img
+  $ rbd rm foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd rm --snap=snap img
+  $ rbd import-diff /tmp/diff foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd rename --snap=snap img
+  $ rbd mv foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd ls --snap=snap rbd
+  $ rbd mv foo at snap bar
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd snap ls --snap=snap img
+  $ rbd mv foo at snap bar at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd watch --snap=snap img
+  $ rbd image-meta list foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd lock list --snap=snap img
+  $ rbd image-meta get foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd lock add --snap=snap img id
+  $ rbd image-meta get foo at snap key
   rbd: snapname specified for a command that doesn't use it
   [1]
-  $ rbd lock remove --snap=snap img id client.1234
+  $ rbd image-meta set foo at snap
   rbd: snapname specified for a command that doesn't use it
   [1]
+  $ rbd image-meta set foo at snap key
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd image-meta set foo at snap key val
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd image-meta remove foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd image-meta remove foo at snap key
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd snap ls foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd snap purge foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd watch foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd status foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd feature disable foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd feature disable foo at snap layering
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd feature enable foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd feature enable foo at snap layering
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd lock list foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd lock add foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd lock add foo at snap id
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd lock remove foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd lock remove foo at snap id
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd lock remove foo at snap id client.1234
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+  $ rbd bench-write foo at snap
+  rbd: snapname specified for a command that doesn't use it
+  [1]
+
+  $ rbd clone foo at snap bar at snap
+  rbd: destination snapname specified for a command that doesn't use it
+  [1]
+  $ rbd import /bin/ls ls at snap
+  rbd: destination snapname specified for a command that doesn't use it
+  [1]
+  $ rbd cp foo bar at snap
+  rbd: destination snapname specified for a command that doesn't use it
+  [1]
+  $ rbd cp foo at snap bar at snap
+  rbd: destination snapname specified for a command that doesn't use it
+  [1]
+  $ rbd mv foo bar at snap
+  rbd: destination snapname specified for a command that doesn't use it
+  [1]
diff --git a/src/test/cli/rbd/not-enough-args.t b/src/test/cli/rbd/not-enough-args.t
index 40fe0d1..df4fbd7 100644
--- a/src/test/cli/rbd/not-enough-args.t
+++ b/src/test/cli/rbd/not-enough-args.t
@@ -1,33 +1,192 @@
-  $ rbd map
+  $ rbd info
   rbd: image name was not specified
   [1]
-  $ rbd unmap
-  rbd: device path was not specified
+  $ rbd create
+  rbd: image name was not specified
   [1]
-  $ rbd clone foo at snap bar at snap
-  rbd: cannot clone to a snapshot
+  $ rbd clone
+  rbd: image name was not specified
   [1]
-  $ rbd cp foo
-  rbd: destination image name was not specified
+  $ rbd clone foo
+  rbd: snap name was not specified
   [1]
-  $ rbd cp foo at bar
+  $ rbd clone foo at snap
   rbd: destination image name was not specified
   [1]
-  $ rbd copy foo
+  $ rbd clone foo bar
+  rbd: snap name was not specified
+  [1]
+  $ rbd clone foo bar at snap
+  rbd: snap name was not specified
+  [1]
+  $ rbd children
+  rbd: image name was not specified
+  [1]
+  $ rbd children foo
+  rbd: snap name was not specified
+  [1]
+  $ rbd flatten
+  rbd: image name was not specified
+  [1]
+  $ rbd resize
+  rbd: image name was not specified
+  [1]
+  $ rbd rm
+  rbd: image name was not specified
+  [1]
+  $ rbd export
+  rbd: image name was not specified
+  [1]
+  $ rbd import
+  rbd: path was not specified
+  [1]
+  $ rbd diff
+  rbd: image name was not specified
+  [1]
+  $ rbd export-diff
+  rbd: image name was not specified
+  [1]
+  $ rbd export-diff foo
+  rbd: path was not specified
+  [1]
+  $ rbd export-diff foo at snap
+  rbd: path was not specified
+  [1]
+  $ rbd merge-diff
+  rbd: first diff was not specified
+  [1]
+  $ rbd merge-diff /tmp/diff1
+  rbd: second diff was not specified
+  [1]
+  $ rbd merge-diff /tmp/diff1 /tmp/diff2
+  rbd: path was not specified
+  [1]
+  $ rbd import-diff
+  rbd: path was not specified
+  [1]
+  $ rbd import-diff /tmp/diff
+  rbd: image name was not specified
+  [1]
+  $ rbd cp
+  rbd: image name was not specified
+  [1]
+  $ rbd cp foo
   rbd: destination image name was not specified
   [1]
-  $ rbd copy foo at bar
+  $ rbd cp foo at snap
   rbd: destination image name was not specified
   [1]
+  $ rbd mv
+  rbd: image name was not specified
+  [1]
   $ rbd mv foo
   rbd: destination image name was not specified
   [1]
-  $ rbd rename foo
-  rbd: destination image name was not specified
+  $ rbd image-meta list
+  rbd: image name was not specified
   [1]
-  $ rbd clone foo at bar
-  rbd: destination image name was not specified
+  $ rbd image-meta get
+  rbd: image name was not specified
   [1]
-  $ rbd clone foo
+  $ rbd image-meta get foo
+  rbd: metadata key was not specified
+  [1]
+  $ rbd image-meta set
+  rbd: image name was not specified
+  [1]
+  $ rbd image-meta set foo
+  rbd: metadata key was not specified
+  [1]
+  $ rbd image-meta set foo key
+  rbd: metadata value was not specified
+  [1]
+  $ rbd image-meta remove
+  rbd: image name was not specified
+  [1]
+  $ rbd image-meta remove foo
+  rbd: metadata key was not specified
+  [1]
+  $ rbd object-map rebuild
+  rbd: image name was not specified
+  [1]
+  $ rbd snap ls
+  rbd: image name was not specified
+  [1]
+  $ rbd snap create
+  rbd: image name was not specified
+  [1]
+  $ rbd snap create foo
+  rbd: snap name was not specified
+  [1]
+  $ rbd snap rollback
+  rbd: image name was not specified
+  [1]
+  $ rbd snap rollback foo
+  rbd: snap name was not specified
+  [1]
+  $ rbd snap rm
+  rbd: image name was not specified
+  [1]
+  $ rbd snap rm foo
+  rbd: snap name was not specified
+  [1]
+  $ rbd snap purge
+  rbd: image name was not specified
+  [1]
+  $ rbd snap protect
+  rbd: image name was not specified
+  [1]
+  $ rbd snap protect foo
+  rbd: snap name was not specified
+  [1]
+  $ rbd snap unprotect
+  rbd: image name was not specified
+  [1]
+  $ rbd snap unprotect foo
   rbd: snap name was not specified
   [1]
+  $ rbd watch
+  rbd: image name was not specified
+  [1]
+  $ rbd status
+  rbd: image name was not specified
+  [1]
+  $ rbd map
+  rbd: image name was not specified
+  [1]
+  $ rbd unmap
+  rbd: unmap requires either image name or device path
+  [1]
+  $ rbd feature disable
+  rbd: image name was not specified
+  [1]
+  $ rbd feature disable foo
+  rbd: at least one feature name must be specified
+  [1]
+  $ rbd feature enable
+  rbd: image name was not specified
+  [1]
+  $ rbd feature enable foo
+  rbd: at least one feature name must be specified
+  [1]
+  $ rbd lock list
+  rbd: image name was not specified
+  [1]
+  $ rbd lock add
+  rbd: image name was not specified
+  [1]
+  $ rbd lock add foo
+  rbd: lock id was not specified
+  [1]
+  $ rbd lock remove
+  rbd: image name was not specified
+  [1]
+  $ rbd lock remove foo
+  rbd: lock id was not specified
+  [1]
+  $ rbd lock remove foo id
+  rbd: locker was not specified
+  [1]
+  $ rbd bench-write
+  rbd: image name was not specified
+  [1]
diff --git a/src/test/cls_hello/test_cls_hello.cc b/src/test/cls_hello/test_cls_hello.cc
index 58ecb97..efd9fc7 100644
--- a/src/test/cls_hello/test_cls_hello.cc
+++ b/src/test/cls_hello/test_cls_hello.cc
@@ -16,6 +16,7 @@
 #include <errno.h>
 
 #include "include/rados/librados.hpp"
+#include "include/encoding.h"
 #include "test/librados/test.h"
 #include "gtest/gtest.h"
 
@@ -131,3 +132,54 @@ TEST(ClsHello, BadMethods) {
 
   ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
 }
+
+TEST(ClsHello, Filter) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist obj_content;
+  obj_content.append(buf, sizeof(buf));
+
+  std::string target_str = "content";
+
+  // Write xattr bare, no ::encod'ing
+  bufferlist target_val;
+  target_val.append(target_str);
+  bufferlist nontarget_val;
+  nontarget_val.append("rhubarb");
+
+  ASSERT_EQ(0, ioctx.write("has_xattr", obj_content, obj_content.length(), 0));
+  ASSERT_EQ(0, ioctx.write("has_wrong_xattr", obj_content, obj_content.length(), 0));
+  ASSERT_EQ(0, ioctx.write("no_xattr", obj_content, obj_content.length(), 0));
+
+  ASSERT_EQ(0, ioctx.setxattr("has_xattr", "theattr", target_val));
+  ASSERT_EQ(0, ioctx.setxattr("has_wrong_xattr", "theattr", nontarget_val));
+
+  bufferlist filter_bl;
+  std::string filter_name = "hello.hello";
+  ::encode(filter_name, filter_bl);
+  ::encode("_theattr", filter_bl);
+  ::encode(target_str, filter_bl);
+
+  NObjectIterator iter(ioctx.nobjects_begin(filter_bl));
+  bool foundit = false;
+  int k = 0;
+  while (iter != ioctx.nobjects_end()) {
+    foundit = true;
+    // We should only see the object that matches the filter
+    ASSERT_EQ((*iter).get_oid(), "has_xattr");
+    // We should only see it once
+    ASSERT_EQ(k, 0);
+    ++iter;
+    ++k;
+  }
+  ASSERT_TRUE(foundit);
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
diff --git a/src/test/cls_numops/test_cls_numops.cc b/src/test/cls_numops/test_cls_numops.cc
new file mode 100644
index 0000000..844caf9
--- /dev/null
+++ b/src/test/cls_numops/test_cls_numops.cc
@@ -0,0 +1,414 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 CERN
+ *
+ * Author: Joaquim Rocha <joaquim.rocha at cern.ch>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <iostream>
+#include <errno.h>
+#include <set>
+#include <sstream>
+#include <string>
+
+#include "cls/numops/cls_numops_client.h"
+#include "gtest/gtest.h"
+#include "include/rados/librados.hpp"
+#include "test/librados/test.h"
+
+using namespace librados;
+
+TEST(ClsNumOps, Add) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  // exec numops add method with an empty bufferlist
+
+  bufferlist in, out;
+
+  ASSERT_EQ(-EINVAL, ioctx.exec("myobject", "numops", "add", in, out));
+
+  // add a number to a non-existing key
+
+  std::string key = "my-key";
+  double value_in = 0.5;
+
+  std::stringstream stream;
+  stream << value_in;
+
+  ASSERT_EQ(0, rados::cls::numops::add(&ioctx, "myobject", key, value_in));
+
+  // check that the omap entry was set and the value matches
+
+  std::set<std::string> keys;
+  std::map<std::string, bufferlist> omap;
+  keys.insert(key);
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  std::map<std::string, bufferlist>::iterator it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bufferlist bl = (*it).second;
+  std::string value_out(bl.c_str(), bl.length());
+
+  EXPECT_EQ(stream.str(), value_out);
+
+  // add another value to the existing one
+
+  double new_value_in = 3.001;
+
+  ASSERT_EQ(0, rados::cls::numops::add(&ioctx, "myobject", key, new_value_in));
+
+  // check that the omap entry's value matches
+
+  omap.clear();
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  stream.str("");
+  stream << (value_in + new_value_in);
+
+  EXPECT_EQ(stream.str(), value_out);
+
+  // set the omap entry with some non-numeric value
+
+  omap.clear();
+
+  std::string non_numeric_value("some-non-numeric-text");
+  omap[key].append(non_numeric_value);
+
+  ASSERT_EQ(0, ioctx.omap_set("myobject", omap));
+
+  // check that adding a number does not succeed
+
+  omap.clear();
+
+  ASSERT_EQ(-EBADMSG, rados::cls::numops::add(&ioctx, "myobject", key, 2.0));
+
+  // check that the omap entry was not changed
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  EXPECT_EQ(non_numeric_value, value_out);
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+TEST(ClsNumOps, Sub) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  // subtract a number from a non-existing key
+
+  std::string key = "my-key";
+  double value_in = 0.5;
+
+  std::stringstream stream;
+  stream << value_in;
+
+  ASSERT_EQ(0, rados::cls::numops::sub(&ioctx, "myobject", key, value_in));
+
+  // check that the omap entry was set and the value matches
+
+  std::set<std::string> keys;
+  std::map<std::string, bufferlist> omap;
+  keys.insert(key);
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  std::map<std::string, bufferlist>::iterator it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bufferlist bl = (*it).second;
+  std::string value_out(bl.c_str(), bl.length());
+
+  EXPECT_EQ("-" + stream.str(), value_out);
+
+  // subtract another value to the existing one
+
+  double new_value_in = 3.001;
+
+  ASSERT_EQ(0, rados::cls::numops::sub(&ioctx, "myobject", key, new_value_in));
+
+  // check that the omap entry's value matches
+
+  omap.clear();
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  stream.str("");
+  stream << -(value_in + new_value_in);
+
+  EXPECT_EQ(stream.str(), value_out);
+
+  // set the omap entry with some non-numeric value
+
+  omap.clear();
+
+  std::string non_numeric_value("some-non-numeric-text");
+  omap[key].append(non_numeric_value);
+
+  ASSERT_EQ(0, ioctx.omap_set("myobject", omap));
+
+  // check that subtracting a number does not succeed
+
+  omap.clear();
+
+  ASSERT_EQ(-EBADMSG, rados::cls::numops::sub(&ioctx, "myobject", key, 2.0));
+
+  // check that the omap entry was not changed
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  EXPECT_EQ(non_numeric_value, value_out);
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+TEST(ClsNumOps, Mul) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  // exec numops mul method with an empty bufferlist
+
+  bufferlist in, out;
+
+  ASSERT_EQ(-EINVAL, ioctx.exec("myobject", "numops", "mul", in, out));
+
+  // multiply a number to a non-existing key
+
+  std::string key = "my-key";
+  double value_in = 0.5;
+
+  std::stringstream stream;
+  stream << value_in;
+
+  ASSERT_EQ(0, rados::cls::numops::mul(&ioctx, "myobject", key, value_in));
+
+  // check that the omap entry was set and the value is zero
+
+  std::set<std::string> keys;
+  std::map<std::string, bufferlist> omap;
+  keys.insert(key);
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  std::map<std::string, bufferlist>::iterator it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bufferlist bl = (*it).second;
+  std::string value_out(bl.c_str(), bl.length());
+
+  EXPECT_EQ("0", value_out);
+
+  // set a non-zero value so we can effectively test multiplications
+
+  omap.clear();
+
+  omap[key].append(stream.str());
+
+  ASSERT_EQ(0, ioctx.omap_set("myobject", omap));
+
+  // multiply another value to the existing one
+
+  double new_value_in = 3.001;
+
+  ASSERT_EQ(0, rados::cls::numops::mul(&ioctx, "myobject", key, new_value_in));
+
+  // check that the omap entry's value matches
+
+  omap.clear();
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  stream.str("");
+  stream << (value_in * new_value_in);
+
+  EXPECT_EQ(stream.str(), value_out);
+
+  // set the omap entry with some non-numeric value
+
+  omap.clear();
+
+  std::string non_numeric_value("some-non-numeric-text");
+  omap[key].append(non_numeric_value);
+
+  ASSERT_EQ(0, ioctx.omap_set("myobject", omap));
+
+  // check that adding a number does not succeed
+
+  ASSERT_EQ(-EBADMSG, rados::cls::numops::mul(&ioctx, "myobject", key, 2.0));
+
+  // check that the omap entry was not changed
+
+  omap.clear();
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  EXPECT_EQ(non_numeric_value, value_out);
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+TEST(ClsNumOps, Div) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  // divide a non-existing key by a number
+
+  std::string key = "my-key";
+  double value_in = 0.5;
+
+  std::stringstream stream;
+  stream << value_in;
+
+  ASSERT_EQ(0, rados::cls::numops::div(&ioctx, "myobject", key, value_in));
+
+  // check that the omap entry was set and the value is zero
+
+  std::set<std::string> keys;
+  std::map<std::string, bufferlist> omap;
+  keys.insert(key);
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  std::map<std::string, bufferlist>::iterator it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bufferlist bl = (*it).second;
+  std::string value_out(bl.c_str(), bl.length());
+
+  EXPECT_EQ("0", value_out);
+
+  // check that division by zero is not allowed
+
+  ASSERT_EQ(-EINVAL, rados::cls::numops::div(&ioctx, "myobject", key, 0));
+
+  // set a non-zero value so we can effectively test divisions
+
+  omap.clear();
+
+  omap[key].append(stream.str());
+
+  ASSERT_EQ(0, ioctx.omap_set("myobject", omap));
+
+  // divide another value to the existing one
+
+  double new_value_in = 3.001;
+
+  ASSERT_EQ(0, rados::cls::numops::div(&ioctx, "myobject", key, new_value_in));
+
+  // check that the omap entry's value matches
+
+  omap.clear();
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  stream.str("");
+  stream << (value_in / new_value_in);
+
+  EXPECT_EQ(stream.str(), value_out);
+
+  omap.clear();
+
+  // set the omap entry with some non-numeric value
+
+  std::string non_numeric_value("some-non-numeric-text");
+  omap[key].append(non_numeric_value);
+
+  ASSERT_EQ(0, ioctx.omap_set("myobject", omap));
+
+  // check that adding a number does not succeed
+
+  ASSERT_EQ(-EBADMSG, rados::cls::numops::div(&ioctx, "myobject", key, 2.0));
+
+  // check that the omap entry was not changed
+
+  omap.clear();
+
+  ASSERT_EQ(0, ioctx.omap_get_vals_by_keys("myobject", keys, &omap));
+
+  it = omap.find(key);
+
+  ASSERT_NE(omap.end(), it);
+
+  bl = (*it).second;
+  value_out.assign(bl.c_str(), bl.length());
+
+  EXPECT_EQ(non_numeric_value, value_out);
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
diff --git a/src/test/cls_rbd/test_cls_rbd.cc b/src/test/cls_rbd/test_cls_rbd.cc
index 6e91001..eb9c19c 100644
--- a/src/test/cls_rbd/test_cls_rbd.cc
+++ b/src/test/cls_rbd/test_cls_rbd.cc
@@ -5,6 +5,7 @@
 #include "include/encoding.h"
 #include "include/types.h"
 #include "include/rados/librados.h"
+#include "include/rbd/object_map_types.h"
 #include "include/stringify.h"
 #include "cls/rbd/cls_rbd.h"
 #include "cls/rbd/cls_rbd_client.h"
@@ -19,6 +20,7 @@
 using namespace std;
 using ::librbd::cls_client::create_image;
 using ::librbd::cls_client::get_features;
+using ::librbd::cls_client::set_features;
 using ::librbd::cls_client::get_size;
 using ::librbd::cls_client::get_object_prefix;
 using ::librbd::cls_client::set_size;
@@ -50,10 +52,17 @@ using ::librbd::cls_client::set_stripe_unit_count;
 using ::librbd::cls_client::old_snapshot_add;
 using ::librbd::cls_client::get_mutable_metadata;
 using ::librbd::cls_client::object_map_load;
+using ::librbd::cls_client::object_map_save;
 using ::librbd::cls_client::object_map_resize;
 using ::librbd::cls_client::object_map_update;
+using ::librbd::cls_client::object_map_snap_add;
+using ::librbd::cls_client::object_map_snap_remove;
 using ::librbd::cls_client::get_flags;
 using ::librbd::cls_client::set_flags;
+using ::librbd::cls_client::metadata_set;
+using ::librbd::cls_client::metadata_remove;
+using ::librbd::cls_client::metadata_list;
+using ::librbd::cls_client::metadata_get;
 
 static char *random_buf(size_t len)
 {
@@ -362,7 +371,13 @@ TEST_F(TestClsRbd, get_features)
   ASSERT_EQ(0, get_features(&ioctx, oid, CEPH_NOSNAP, &features));
   ASSERT_EQ(0u, features);
 
-  ASSERT_EQ(-ENOENT, get_features(&ioctx, oid, 1, &features));
+  int r = get_features(&ioctx, oid, 1, &features);
+  if (r == 0) {
+    ASSERT_EQ(0u, features);
+  } else {
+    // deprecated snapshot handling
+    ASSERT_EQ(-ENOENT, r);
+  }
 
   ioctx.close();
 }
@@ -661,6 +676,22 @@ TEST_F(TestClsRbd, parents)
   ASSERT_EQ(pspec.snap_id, snapid_t(3));
   ASSERT_EQ(size, 2ull<<20);
 
+  ASSERT_EQ(0, ioctx.remove(oid));
+  ASSERT_EQ(0, create_image(&ioctx, oid, 33<<20, 22,
+                            RBD_FEATURE_LAYERING | RBD_FEATURE_DEEP_FLATTEN,
+                            "foo."));
+  ASSERT_EQ(0, set_parent(&ioctx, oid, parent_spec(1, "parent", 3), 100<<20));
+  ASSERT_EQ(0, snapshot_add(&ioctx, oid, 1, "snap1"));
+  ASSERT_EQ(0, snapshot_add(&ioctx, oid, 2, "snap2"));
+  ASSERT_EQ(0, remove_parent(&ioctx, oid));
+
+  ASSERT_EQ(0, get_parent(&ioctx, oid, 1, &pspec, &size));
+  ASSERT_EQ(-1, pspec.pool_id);
+  ASSERT_EQ(0, get_parent(&ioctx, oid, 2, &pspec, &size));
+  ASSERT_EQ(-1, pspec.pool_id);
+  ASSERT_EQ(0, get_parent(&ioctx, oid, CEPH_NOSNAP, &pspec, &size));
+  ASSERT_EQ(-1, pspec.pool_id);
+
   ioctx.close();
 }
 
@@ -676,7 +707,6 @@ TEST_F(TestClsRbd, snapshots)
 
   vector<string> snap_names;
   vector<uint64_t> snap_sizes;
-  vector<uint64_t> snap_features;
   SnapContext snapc;
   vector<parent_info> parents;
   vector<uint8_t> protection_status;
@@ -685,11 +715,9 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(0u, snapc.snaps.size());
   ASSERT_EQ(0u, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(0u, snap_names.size());
   ASSERT_EQ(0u, snap_sizes.size());
-  ASSERT_EQ(0u, snap_features.size());
 
   ASSERT_EQ(0, snapshot_add(&ioctx, oid, 0, "snap1"));
   ASSERT_EQ(0, get_snapcontext(&ioctx, oid, &snapc));
@@ -697,12 +725,10 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(0u, snapc.snaps[0]);
   ASSERT_EQ(0u, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(1u, snap_names.size());
   ASSERT_EQ("snap1", snap_names[0]);
   ASSERT_EQ(10u, snap_sizes[0]);
-  ASSERT_EQ(0u, snap_features[0]);
 
   // snap with same id and name
   ASSERT_EQ(-EEXIST, snapshot_add(&ioctx, oid, 0, "snap1"));
@@ -711,12 +737,10 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(0u, snapc.snaps[0]);
   ASSERT_EQ(0u, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(1u, snap_names.size());
   ASSERT_EQ("snap1", snap_names[0]);
   ASSERT_EQ(10u, snap_sizes[0]);
-  ASSERT_EQ(0u, snap_features[0]);
 
   // snap with same id, different name
   ASSERT_EQ(-EEXIST, snapshot_add(&ioctx, oid, 0, "snap2"));
@@ -725,12 +749,10 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(0u, snapc.snaps[0]);
   ASSERT_EQ(0u, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(1u, snap_names.size());
   ASSERT_EQ("snap1", snap_names[0]);
   ASSERT_EQ(10u, snap_sizes[0]);
-  ASSERT_EQ(0u, snap_features[0]);
 
   // snap with different id, same name
   ASSERT_EQ(-EEXIST, snapshot_add(&ioctx, oid, 1, "snap1"));
@@ -739,12 +761,10 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(0u, snapc.snaps[0]);
   ASSERT_EQ(0u, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(snap_names.size(), 1u);
   ASSERT_EQ(snap_names[0], "snap1");
   ASSERT_EQ(snap_sizes[0], 10u);
-  ASSERT_EQ(snap_features[0], 0u);
 
   // snap with different id, different name
   ASSERT_EQ(0, snapshot_add(&ioctx, oid, 1, "snap2"));
@@ -754,15 +774,12 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(0u, snapc.snaps[1]);
   ASSERT_EQ(1u, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(2u, snap_names.size());
   ASSERT_EQ("snap2", snap_names[0]);
   ASSERT_EQ(10u, snap_sizes[0]);
-  ASSERT_EQ(0u, snap_features[0]);
   ASSERT_EQ("snap1", snap_names[1]);
   ASSERT_EQ(10u, snap_sizes[1]);
-  ASSERT_EQ(0u, snap_features[1]);
 
   ASSERT_EQ(0, snapshot_remove(&ioctx, oid, 0));
   ASSERT_EQ(0, get_snapcontext(&ioctx, oid, &snapc));
@@ -770,12 +787,10 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(1u, snapc.snaps[0]);
   ASSERT_EQ(1u, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(1u, snap_names.size());
   ASSERT_EQ("snap2", snap_names[0]);
   ASSERT_EQ(10u, snap_sizes[0]);
-  ASSERT_EQ(0u, snap_features[0]);
 
   uint64_t size;
   uint8_t order;
@@ -792,15 +807,12 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(1u, snapc.snaps[1]);
   ASSERT_EQ(large_snap_id, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(2u, snap_names.size());
   ASSERT_EQ("snap3", snap_names[0]);
   ASSERT_EQ(0u, snap_sizes[0]);
-  ASSERT_EQ(0u, snap_features[0]);
   ASSERT_EQ("snap2", snap_names[1]);
   ASSERT_EQ(10u, snap_sizes[1]);
-  ASSERT_EQ(0u, snap_features[1]);
 
   ASSERT_EQ(0, get_size(&ioctx, oid, large_snap_id, &size, &order));
   ASSERT_EQ(0u, size);
@@ -816,12 +828,10 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(1u, snapc.snaps[0]);
   ASSERT_EQ(large_snap_id, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(1u, snap_names.size());
   ASSERT_EQ("snap2", snap_names[0]);
   ASSERT_EQ(10u, snap_sizes[0]);
-  ASSERT_EQ(0u, snap_features[0]);
 
   ASSERT_EQ(-ENOENT, snapshot_remove(&ioctx, oid, large_snap_id));
   ASSERT_EQ(0, snapshot_remove(&ioctx, oid, 1));
@@ -829,11 +839,9 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ(0u, snapc.snaps.size());
   ASSERT_EQ(large_snap_id, snapc.seq);
   ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
-			     &snap_sizes, &snap_features, &parents,
-			     &protection_status));
+			     &snap_sizes, &parents, &protection_status));
   ASSERT_EQ(0u, snap_names.size());
   ASSERT_EQ(0u, snap_sizes.size());
-  ASSERT_EQ(0u, snap_features.size());
 
   ioctx.close();
 }
@@ -929,6 +937,27 @@ TEST_F(TestClsRbd, get_mutable_metadata_features)
   ioctx.close();
 }
 
+TEST_F(TestClsRbd, object_map_save)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  string oid = get_temp_image_name();
+  BitVector<2> ref_bit_vector;
+  ref_bit_vector.resize(32);
+  for (uint64_t i = 0; i < ref_bit_vector.size(); ++i) {
+    ref_bit_vector[i] = 1;
+  }
+
+  librados::ObjectWriteOperation op;
+  object_map_save(&op, ref_bit_vector);
+  ASSERT_EQ(0, ioctx.operate(oid, &op));
+
+  BitVector<2> osd_bit_vector;
+  ASSERT_EQ(0, object_map_load(&ioctx, oid, &osd_bit_vector));
+  ASSERT_EQ(ref_bit_vector, osd_bit_vector);
+}
+
 TEST_F(TestClsRbd, object_map_resize)
 {
   librados::IoCtx ioctx;
@@ -1029,6 +1058,98 @@ TEST_F(TestClsRbd, object_map_load_enoent)
   ioctx.close();
 }
 
+TEST_F(TestClsRbd, object_map_snap_add)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  string oid = get_temp_image_name();
+  BitVector<2> ref_bit_vector;
+  ref_bit_vector.resize(16);
+  for (uint64_t i = 0; i < ref_bit_vector.size(); ++i) {
+    if (i < 4) {
+      ref_bit_vector[i] = OBJECT_NONEXISTENT;
+    } else {
+      ref_bit_vector[i] = OBJECT_EXISTS;
+    }
+  }
+
+  BitVector<2> osd_bit_vector;
+
+  librados::ObjectWriteOperation op1;
+  object_map_resize(&op1, ref_bit_vector.size(), OBJECT_EXISTS);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  librados::ObjectWriteOperation op2;
+  object_map_update(&op2, 0, 4, OBJECT_NONEXISTENT, boost::optional<uint8_t>());
+  ASSERT_EQ(0, ioctx.operate(oid, &op2));
+
+  ASSERT_EQ(0, object_map_load(&ioctx, oid, &osd_bit_vector));
+  ASSERT_EQ(ref_bit_vector, osd_bit_vector);
+
+  librados::ObjectWriteOperation op3;
+  object_map_snap_add(&op3);
+  ASSERT_EQ(0, ioctx.operate(oid, &op3));
+
+  for (uint64_t i = 0; i < ref_bit_vector.size(); ++i) {
+    if (ref_bit_vector[i] == OBJECT_EXISTS) {
+      ref_bit_vector[i] = OBJECT_EXISTS_CLEAN;
+    }
+  }
+
+  ASSERT_EQ(0, object_map_load(&ioctx, oid, &osd_bit_vector));
+  ASSERT_EQ(ref_bit_vector, osd_bit_vector);
+}
+
+TEST_F(TestClsRbd, object_map_snap_remove)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  string oid = get_temp_image_name();
+  BitVector<2> ref_bit_vector;
+  ref_bit_vector.resize(16);
+  for (uint64_t i = 0; i < ref_bit_vector.size(); ++i) {
+    if (i < 4) {
+      ref_bit_vector[i] = OBJECT_EXISTS_CLEAN;
+    } else {
+      ref_bit_vector[i] = OBJECT_EXISTS;
+    }
+  }
+
+  BitVector<2> osd_bit_vector;
+
+  librados::ObjectWriteOperation op1;
+  object_map_resize(&op1, ref_bit_vector.size(), OBJECT_EXISTS);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  librados::ObjectWriteOperation op2;
+  object_map_update(&op2, 0, 4, OBJECT_EXISTS_CLEAN, boost::optional<uint8_t>());
+  ASSERT_EQ(0, ioctx.operate(oid, &op2));
+
+  ASSERT_EQ(0, object_map_load(&ioctx, oid, &osd_bit_vector));
+  ASSERT_EQ(ref_bit_vector, osd_bit_vector);
+
+  BitVector<2> snap_bit_vector;
+  snap_bit_vector.resize(4);
+  for (uint64_t i = 0; i < snap_bit_vector.size(); ++i) {
+    if (i == 1 || i == 2) {
+      snap_bit_vector[i] = OBJECT_EXISTS;
+    } else {
+      snap_bit_vector[i] = OBJECT_NONEXISTENT;
+    }
+  }
+
+  librados::ObjectWriteOperation op3;
+  object_map_snap_remove(&op3, snap_bit_vector);
+  ASSERT_EQ(0, ioctx.operate(oid, &op3));
+
+  ref_bit_vector[1] = OBJECT_EXISTS;
+  ref_bit_vector[2] = OBJECT_EXISTS;
+  ASSERT_EQ(0, object_map_load(&ioctx, oid, &osd_bit_vector));
+  ASSERT_EQ(ref_bit_vector, osd_bit_vector);
+}
+
 TEST_F(TestClsRbd, flags)
 {
   librados::IoCtx ioctx;
@@ -1064,3 +1185,98 @@ TEST_F(TestClsRbd, flags)
 
   ioctx.close();
 }
+
+TEST_F(TestClsRbd, metadata)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  string oid = get_temp_image_name();
+  ASSERT_EQ(0, create_image(&ioctx, oid, 0, 22, 0, oid));
+
+  map<string, bufferlist> pairs;
+  string value;
+  ASSERT_EQ(0, metadata_list(&ioctx, oid, "", 0, &pairs));
+  ASSERT_TRUE(pairs.empty());
+
+  pairs["key1"].append("value1");
+  pairs["key2"].append("value2");
+  ASSERT_EQ(0, metadata_set(&ioctx, oid, pairs));
+  ASSERT_EQ(0, metadata_get(&ioctx, oid, "key1", &value));
+  ASSERT_EQ(0, strcmp("value1", value.c_str()));
+  pairs.clear();
+  ASSERT_EQ(0, metadata_list(&ioctx, oid, "", 0, &pairs));
+  ASSERT_EQ(2U, pairs.size());
+  ASSERT_EQ(0, strncmp("value1", pairs["key1"].c_str(), 6));
+  ASSERT_EQ(0, strncmp("value2", pairs["key2"].c_str(), 6));
+
+  pairs.clear();
+  ASSERT_EQ(0, metadata_remove(&ioctx, oid, "key1"));
+  ASSERT_EQ(0, metadata_remove(&ioctx, oid, "key3"));
+  ASSERT_TRUE(metadata_get(&ioctx, oid, "key1", &value) < 0);
+  ASSERT_EQ(0, metadata_list(&ioctx, oid, "", 0, &pairs));
+  ASSERT_EQ(1U, pairs.size());
+  ASSERT_EQ(0, strncmp("value2", pairs["key2"].c_str(), 6));
+
+  pairs.clear();
+  char key[10], val[20];
+  for (int i = 0; i < 1024; i++) {
+    sprintf(key, "key%d", i);
+    sprintf(val, "value%d", i);
+    pairs[key].append(val, strlen(val));
+  }
+  ASSERT_EQ(0, metadata_set(&ioctx, oid, pairs));
+
+  string last_read = "";
+  uint64_t max_read = 48, r;
+  uint64_t size = 0;
+  map<string, bufferlist> data;
+  do {
+    map<string, bufferlist> cur;
+    metadata_list(&ioctx, oid, last_read, max_read, &cur);
+    size += cur.size();
+    for (map<string, bufferlist>::iterator it = cur.begin();
+         it != cur.end(); ++it)
+      data[it->first] = it->second;
+    last_read = cur.rbegin()->first;
+    r = cur.size();
+  } while (r == max_read);
+  ASSERT_EQ(size, 1024U);
+  for (map<string, bufferlist>::iterator it = data.begin();
+       it != data.end(); ++it) {
+    ASSERT_TRUE(it->second.contents_equal(pairs[it->first]));
+  }
+
+  ioctx.close();
+}
+
+TEST_F(TestClsRbd, set_features)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  string oid = get_temp_image_name();
+  ASSERT_EQ(0, create_image(&ioctx, oid, 0, 22, 0, oid));
+
+  uint64_t features = RBD_FEATURES_MUTABLE;
+  uint64_t mask = RBD_FEATURES_MUTABLE;
+  ASSERT_EQ(0, set_features(&ioctx, oid, features, mask));
+
+  uint64_t actual_features;
+  ASSERT_EQ(0, get_features(&ioctx, oid, CEPH_NOSNAP, &actual_features));
+
+  uint64_t expected_features = RBD_FEATURES_MUTABLE;
+  ASSERT_EQ(expected_features, actual_features);
+
+  features = 0;
+  mask = RBD_FEATURE_OBJECT_MAP;
+  ASSERT_EQ(0, set_features(&ioctx, oid, features, mask));
+
+  ASSERT_EQ(0, get_features(&ioctx, oid, CEPH_NOSNAP, &actual_features));
+
+  expected_features = RBD_FEATURES_MUTABLE & ~RBD_FEATURE_OBJECT_MAP;
+  ASSERT_EQ(expected_features, actual_features);
+
+  mask = RBD_FEATURE_LAYERING;
+  ASSERT_EQ(-EINVAL, set_features(&ioctx, oid, features, mask));
+}
diff --git a/src/test/common/Throttle.cc b/src/test/common/Throttle.cc
index 0964cbe..dc2d248 100644
--- a/src/test/common/Throttle.cc
+++ b/src/test/common/Throttle.cc
@@ -74,7 +74,15 @@ TEST_F(ThrottleTest, take) {
 
 TEST_F(ThrottleTest, get) {
   int64_t throttle_max = 10;
-  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  Throttle throttle(g_ceph_context, "throttle");
+
+  // test increasing max from 0 to throttle_max
+  {
+    ASSERT_FALSE(throttle.get(throttle_max, throttle_max));
+    ASSERT_EQ(throttle.get_max(), throttle_max);
+    ASSERT_EQ(throttle.put(throttle_max), 0);
+  }
+
   ASSERT_THROW(throttle.get(-1), FailedAssertion);
   ASSERT_FALSE(throttle.get(5)); 
   ASSERT_EQ(throttle.put(5), 0); 
@@ -161,7 +169,13 @@ TEST_F(ThrottleTest, get_or_fail) {
 
 TEST_F(ThrottleTest, wait) {
   int64_t throttle_max = 10;
-  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  Throttle throttle(g_ceph_context, "throttle");
+
+  // test increasing max from 0 to throttle_max
+  {
+    ASSERT_FALSE(throttle.wait(throttle_max));
+    ASSERT_EQ(throttle.get_max(), throttle_max);
+  }
 
   useconds_t delay = 1;
 
diff --git a/src/test/common/get_command_descriptions.cc b/src/test/common/get_command_descriptions.cc
index 7cc5b6a..eb4c0f9 100644
--- a/src/test/common/get_command_descriptions.cc
+++ b/src/test/common/get_command_descriptions.cc
@@ -53,13 +53,15 @@ static void json_print(const MonCommand *mon_commands, int size)
 
 static void all()
 {
+#undef FLAG
 #undef COMMAND
 #undef COMMAND_WITH_FLAG
   MonCommand mon_commands[] = {
+#define FLAG(f) (MonCommand::FLAG_##f)
 #define COMMAND(parsesig, helptext, modulename, req_perms, avail)	\
     {parsesig, helptext, modulename, req_perms, avail, 0},
-#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, avail, flag) \
-    {parsesig, helptext, modulename, req_perms, avail, MonCommand::FLAG_##flag},
+#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, avail, flags) \
+    {parsesig, helptext, modulename, req_perms, avail, flags},
 #include <mon/MonCommands.h>
   };
 
diff --git a/src/test/common/test_async_compressor.cc b/src/test/common/test_async_compressor.cc
new file mode 100644
index 0000000..1655596
--- /dev/null
+++ b/src/test/common/test_async_compressor.cc
@@ -0,0 +1,221 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include <time.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_int.hpp>
+#include <boost/random/binomial_distribution.hpp>
+#include <gtest/gtest.h>
+#include "common/ceph_argparse.h"
+#include "compressor/AsyncCompressor.h"
+#include "global/global_init.h"
+
+typedef boost::mt11213b gen_type;
+
+class AsyncCompressorTest : public ::testing::Test {
+ public:
+  AsyncCompressor *async_compressor;
+  virtual void SetUp() {
+    cerr << __func__ << " start set up " << std::endl;
+    async_compressor = new AsyncCompressor(g_ceph_context);
+    async_compressor->init();
+  }
+  virtual void TearDown() {
+    async_compressor->terminate();
+    delete async_compressor;
+  }
+
+  void generate_random_data(bufferlist &bl, uint64_t len = 0) {
+    static const char *base= "znvm,x12399zasdfjkl1209zxcvjlkasjdfljwqelrjzx,cvn,m123#*(@)";
+    if (!len) {
+      boost::uniform_int<> kb(16, 4096);
+      gen_type rng(time(NULL));
+      len = kb(rng) * 1024;
+    }
+
+    while (bl.length() < len)
+      bl.append(base, sizeof(base)-1);
+  }
+};
+
+TEST_F(AsyncCompressorTest, SimpleTest) {
+  bufferlist compress_data, decompress_data, rawdata;
+  generate_random_data(rawdata, 1<<22);
+  bool finished;
+  uint64_t id = async_compressor->async_compress(rawdata);
+  ASSERT_EQ(0, async_compressor->get_compress_data(id, compress_data, true, &finished));
+  ASSERT_TRUE(finished == true);
+  id = async_compressor->async_decompress(compress_data);
+  do {
+    ASSERT_EQ(0, async_compressor->get_decompress_data(id, decompress_data, false, &finished));
+  } while (!finished);
+  ASSERT_TRUE(finished == true);
+  ASSERT_TRUE(rawdata.contents_equal(decompress_data));
+  ASSERT_EQ(-ENOENT, async_compressor->get_decompress_data(id, decompress_data, true, &finished));
+}
+
+TEST_F(AsyncCompressorTest, GrubWaitTest) {
+  async_compressor->terminate();
+  bufferlist compress_data, decompress_data, rawdata;
+  generate_random_data(rawdata, 1<<22);
+  bool finished;
+  uint64_t id = async_compressor->async_compress(rawdata);
+  ASSERT_EQ(0, async_compressor->get_compress_data(id, compress_data, true, &finished));
+  ASSERT_TRUE(finished == true);
+  id = async_compressor->async_decompress(compress_data);
+  ASSERT_EQ(0, async_compressor->get_decompress_data(id, decompress_data, true, &finished));
+  ASSERT_TRUE(finished == true);
+  ASSERT_TRUE(rawdata.contents_equal(decompress_data));
+  async_compressor->init();
+}
+
+TEST_F(AsyncCompressorTest, DecompressInjectTest) {
+  bufferlist compress_data, decompress_data, rawdata;
+  generate_random_data(rawdata, 1<<22);
+  bool finished;
+  uint64_t id = async_compressor->async_compress(rawdata);
+  ASSERT_EQ(0, async_compressor->get_compress_data(id, compress_data, true, &finished));
+  ASSERT_TRUE(finished == true);
+  char error[] = "asjdfkwejrljqwaelrj";
+  memcpy(compress_data.c_str()+1024, error, sizeof(error)-1);
+  id = async_compressor->async_decompress(compress_data);
+  ASSERT_EQ(-EIO, async_compressor->get_decompress_data(id, decompress_data, true, &finished));
+}
+
+class SyntheticWorkload {
+  set<pair<uint64_t, uint64_t> > compress_jobs, decompress_jobs;
+  AsyncCompressor *async_compressor;
+  vector<bufferlist> rand_data, compress_data;
+  gen_type rng;
+  static const uint64_t MAX_INFLIGHT = 128;
+
+ public:
+  SyntheticWorkload(AsyncCompressor *ac): async_compressor(ac), rng(time(NULL)) {
+    for (int i = 0; i < 100; i++) {
+      bufferlist bl;
+      boost::uniform_int<> u(4096, 1<<24);
+      uint64_t value_len = u(rng);
+      bufferptr bp(value_len);
+      bp.zero();
+      for (uint64_t j = 0; j < value_len-sizeof(i); ) {
+        memcpy(bp.c_str()+j, &i, sizeof(i));
+        j += 4096;
+      }
+
+      bl.append(bp);
+      rand_data.push_back(bl);
+      compress_jobs.insert(make_pair(async_compressor->async_compress(rand_data[i]), i));
+      if (!(i % 10)) cerr << "seeding compress data " << i << std::endl;
+    }
+    compress_data.resize(100);
+    reap(true);
+  }
+  void do_compress() {
+    boost::uniform_int<> u(0, rand_data.size()-1);
+    uint64_t index = u(rng);
+    compress_jobs.insert(make_pair(async_compressor->async_compress(rand_data[index]), index));
+  }
+  void do_decompress() {
+    boost::uniform_int<> u(0, compress_data.size()-1);
+    uint64_t index = u(rng);
+    if (compress_data[index].length())
+      decompress_jobs.insert(make_pair(async_compressor->async_decompress(compress_data[index]), index));
+  }
+  void reap(bool blocking) {
+    bufferlist data;
+    bool finished;
+    set<pair<uint64_t, uint64_t> >::iterator prev;
+    uint64_t c_reap = 0, d_reap = 0;
+    do {
+      for (set<pair<uint64_t, uint64_t> >::iterator it = compress_jobs.begin();
+           it != compress_jobs.end();) {
+        prev = it;
+        ++it;
+        ASSERT_EQ(0, async_compressor->get_compress_data(prev->first, data, blocking, &finished));
+        if (finished) {
+          c_reap++;
+          if (compress_data[prev->second].length())
+            ASSERT_TRUE(compress_data[prev->second].contents_equal(data));
+          else
+            compress_data[prev->second].swap(data);
+          compress_jobs.erase(prev);
+        }
+      }
+
+      for (set<pair<uint64_t, uint64_t> >::iterator it = decompress_jobs.begin();
+           it != decompress_jobs.end();) {
+        prev = it;
+        ++it;
+        ASSERT_EQ(0, async_compressor->get_decompress_data(prev->first, data, blocking, &finished));
+        if (finished) {
+          d_reap++;
+          ASSERT_TRUE(rand_data[prev->second].contents_equal(data));
+          decompress_jobs.erase(prev);
+        }
+      }
+      usleep(1000 * 500);
+    } while (compress_jobs.size() + decompress_jobs.size() > MAX_INFLIGHT);
+    cerr << " reap compress jobs " << c_reap << " decompress jobs " << d_reap << std::endl;
+  }
+  void print_internal_state() {
+    cerr << "inlfight compress jobs: " << compress_jobs.size()
+         << " inflight decompress jobs: " << decompress_jobs.size() << std::endl;
+  }
+  bool empty() const { return compress_jobs.empty() && decompress_jobs.empty(); }
+};
+
+TEST_F(AsyncCompressorTest, SyntheticTest) {
+  SyntheticWorkload test_ac(async_compressor);
+  gen_type rng(time(NULL));
+  boost::uniform_int<> true_false(0, 99);
+  int val;
+  for (int i = 0; i < 3000; ++i) {
+    if (!(i % 10)) {
+      cerr << "Op " << i << ": ";
+      test_ac.print_internal_state();
+    }
+    val = true_false(rng);
+    if (val < 45) {
+      test_ac.do_compress();
+    } else if (val < 95) {
+      test_ac.do_decompress();
+    } else {
+      test_ac.reap(false);
+    }
+  }
+  while (!test_ac.empty()) {
+    test_ac.reap(false);
+    test_ac.print_internal_state();
+    usleep(1000*500);
+  }
+}
+
+
+int main(int argc, char **argv) {
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 unittest_async_compressor && valgrind --tool=memcheck ./unittest_async_compressor"
+ * End:
+ */
diff --git a/src/test/common/test_context.cc b/src/test/common/test_context.cc
index c32fed3..921fc90 100644
--- a/src/test/common/test_context.cc
+++ b/src/test/common/test_context.cc
@@ -82,6 +82,13 @@ TEST(CephContext, experimental_features)
   ASSERT_FALSE(cct->check_experimental_feature_enabled("bar"));
   ASSERT_TRUE(cct->check_experimental_feature_enabled("baz"));
 
+  cct->_conf->set_val("enable_experimental_unrecoverable_data_corrupting_features",
+		      "*");
+  cct->_conf->apply_changes(&cout);
+  ASSERT_TRUE(cct->check_experimental_feature_enabled("foo"));
+  ASSERT_TRUE(cct->check_experimental_feature_enabled("bar"));
+  ASSERT_TRUE(cct->check_experimental_feature_enabled("baz"));
+
   cct->_log->flush();
 }
 
diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc
index b4297c6..a311616 100644
--- a/src/test/common/test_crc32c.cc
+++ b/src/test/common/test_crc32c.cc
@@ -13,6 +13,7 @@
 
 #include "common/sctp_crc32.h"
 #include "common/crc32c_intel_baseline.h"
+#include "common/crc32c_aarch64.h"
 
 TEST(Crc32c, Small) {
   const char *a = "foo bar baz";
@@ -80,6 +81,15 @@ TEST(Crc32c, Performance) {
     std::cout << "intel baseline = " << rate << " MB/sec" << std::endl;
     ASSERT_EQ(261108528u, val);
   }
+  if (ceph_arch_aarch64_crc32) // Skip if CRC32C instructions are not defined.
+  {
+    utime_t start = ceph_clock_now(NULL);
+    unsigned val = ceph_crc32c_aarch64(0, (unsigned char *)a, len);
+    utime_t end = ceph_clock_now(NULL);
+    float rate = (float)len / (float)(1024*1024) / (float)(end - start);
+    std::cout << "aarch64 = " << rate << " MB/sec" << std::endl;
+    ASSERT_EQ(261108528u, val);
+  }
 
 }
 
diff --git a/src/test/common/test_prioritized_queue.cc b/src/test/common/test_prioritized_queue.cc
new file mode 100644
index 0000000..00709a1
--- /dev/null
+++ b/src/test/common/test_prioritized_queue.cc
@@ -0,0 +1,248 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "gtest/gtest.h"
+#include "common/PrioritizedQueue.h"
+
+#include <numeric>
+#include <vector>
+
+
+using std::vector;
+
+class PrioritizedQueueTest : public testing::Test
+{
+protected:
+  typedef int Klass;
+  typedef unsigned Item;
+  typedef PrioritizedQueue<Item, Klass> PQ;
+  enum { item_size  = 100, };
+  vector<Item> items;
+
+  virtual void SetUp() {
+    for (int i = 0; i < item_size; i++) {
+      items.push_back(Item(i));
+    }
+    random_shuffle(items.begin(), items.end());
+  }
+  virtual void TearDown() {
+    items.clear();
+  }
+};
+
+TEST_F(PrioritizedQueueTest, capacity) {
+  const unsigned min_cost  = 10;
+  const unsigned max_tokens_per_subqueue = 50;
+  PQ pq(max_tokens_per_subqueue, min_cost);
+  EXPECT_TRUE(pq.empty());
+  EXPECT_EQ(0u, pq.length());
+
+  pq.enqueue_strict(Klass(1), 0, Item(0));
+  EXPECT_FALSE(pq.empty());
+  EXPECT_EQ(1u, pq.length());
+
+  for (int i = 0; i < 3; i++) {
+    pq.enqueue(Klass(1), 0, 10, Item(0));
+  }
+  for (unsigned i = 4; i > 0; i--) {
+    EXPECT_FALSE(pq.empty());
+    EXPECT_EQ(i, pq.length());
+    pq.dequeue();
+  }
+  EXPECT_TRUE(pq.empty());
+  EXPECT_EQ(0u, pq.length());
+}
+
+TEST_F(PrioritizedQueueTest, strict_pq) {
+  const unsigned min_cost = 1;
+  const unsigned max_tokens_per_subqueue = 50;
+  PQ pq(max_tokens_per_subqueue, min_cost);
+  // 0 .. item_size-1
+  for (unsigned i = 0; i < item_size; i++) {
+    unsigned priority = items[i];
+    pq.enqueue_strict(Klass(0), priority, items[i]);
+  }
+  // item_size-1 .. 0
+  for (unsigned i = item_size; i > 0; i--) {
+    Item item = pq.dequeue();
+    unsigned priority = item;
+    EXPECT_EQ(i - 1, priority);
+  }
+}
+
+TEST_F(PrioritizedQueueTest, lowest_among_eligible_otherwise_highest) {
+  // to minimize the effect of `distribute_tokens()`
+  // all eligible items will be assigned with cost of min_cost
+  const unsigned min_cost = 0;
+  const unsigned max_tokens_per_subqueue = 100;
+  PQ pq(max_tokens_per_subqueue, min_cost);
+
+#define ITEM_TO_COST(item_) (item_ % 5 ? min_cost : max_tokens_per_subqueue)
+  unsigned num_low_cost = 0, num_high_cost = 0;
+  for (int i = 0; i < item_size; i++) {
+    const Item& item = items[i];
+    unsigned cost = ITEM_TO_COST(item);
+    unsigned priority = item;
+    if (cost == min_cost) {
+      num_low_cost++;
+    } else {
+      num_high_cost++;
+    }
+    pq.enqueue(Klass(0), priority, cost, item);
+  }
+  // the token in all buckets is 0 at the beginning, so dequeue() should pick
+  // the first one with the highest priority.
+  unsigned highest_priority;
+  {
+    Item item = pq.dequeue();
+    unsigned cost = ITEM_TO_COST(item);
+    unsigned priority = item;
+    if (cost == min_cost) {
+      num_low_cost--;
+    } else {
+      num_high_cost--;
+    }
+    EXPECT_EQ(item_size - 1u, priority);
+    highest_priority = priority;
+  }
+  unsigned lowest_priority = 0;
+  for (unsigned i = 0; i < num_low_cost; i++) {
+    Item item = pq.dequeue();
+    unsigned cost = ITEM_TO_COST(item);
+    unsigned priority = item;
+    EXPECT_EQ(min_cost, cost);
+    EXPECT_GT(priority, lowest_priority);
+    lowest_priority = priority;
+  }
+  for (unsigned i = 0; i < num_high_cost; i++) {
+    Item item = pq.dequeue();
+    unsigned cost = ITEM_TO_COST(item);
+    unsigned priority = item;
+    EXPECT_EQ(max_tokens_per_subqueue, cost);
+    EXPECT_LT(priority, highest_priority);
+    highest_priority = priority;
+  }
+#undef ITEM_TO_COST
+}
+
+static const unsigned num_classes = 4;
+// just a determinitic number
+#define ITEM_TO_CLASS(item_) Klass((item_ + 43) % num_classes)
+
+TEST_F(PrioritizedQueueTest, fairness_by_class) {
+  // dequeue should be fair to all classes in a certain bucket
+  const unsigned min_cost = 1;
+  const unsigned max_tokens_per_subqueue = 50;
+  PQ pq(max_tokens_per_subqueue, min_cost);
+
+  for (int i = 0; i < item_size; i++) {
+    const Item& item = items[i];
+    Klass k = ITEM_TO_CLASS(item);
+    unsigned priority = 0;
+    unsigned cost = 1;
+    pq.enqueue(k, priority, cost, item);
+  }
+  // just sample first 1/2 of the items
+  // if i pick too small a dataset, the result won't be statisitcally
+  // significant. if the sample dataset is too large, it will converge to the
+  // distribution of the full set.
+  vector<unsigned> num_picked_in_class(num_classes, 0u);
+  for (int i = 0; i < item_size / 2; i++) {
+    Item item = pq.dequeue();
+    Klass k = ITEM_TO_CLASS(item);
+    num_picked_in_class[k]++;
+  }
+  unsigned total = std::accumulate(num_picked_in_class.begin(),
+				   num_picked_in_class.end(),
+				   0);
+  float avg = float(total) / num_classes;
+  for (unsigned i = 0; i < num_classes; i++) {
+    EXPECT_NEAR(avg, num_picked_in_class[i], 0.5);
+  }
+}
+
+template <typename T>
+struct Greater {
+  const T rhs;
+  Greater(const T& v) : rhs(v)
+  {}
+  bool operator()(const T& lhs) const {
+    return lhs > rhs;
+  }
+};
+
+TEST_F(PrioritizedQueueTest, remove_by_filter) {
+  const unsigned min_cost = 1;
+  const unsigned max_tokens_per_subqueue = 50;
+  PQ pq(max_tokens_per_subqueue, min_cost);
+
+  const Greater<Item> pred(item_size/2);
+  unsigned num_to_remove = 0;
+  for (unsigned i = 0; i < item_size; i++) {
+    const Item& item = items[i];
+    pq.enqueue(Klass(1), 0, 10, item);
+    if (pred(item)) {
+      num_to_remove++;
+    }
+  }
+  std::list<Item> removed;
+  pq.remove_by_filter(pred, &removed);
+
+  // see if the removed items are expected ones.
+  for (std::list<Item>::iterator it = removed.begin();
+       it != removed.end();
+       ++it) {
+    const Item& item = *it;
+    EXPECT_TRUE(pred(item));
+    items.erase(remove(items.begin(), items.end(), item), items.end());
+  }
+  EXPECT_EQ(num_to_remove, removed.size());
+  EXPECT_EQ(item_size - num_to_remove, pq.length());
+  EXPECT_EQ(item_size - num_to_remove, items.size());
+  // see if the remainder are expeceted also.
+  while (!pq.empty()) {
+    const Item item = pq.dequeue();
+    EXPECT_FALSE(pred(item));
+    items.erase(remove(items.begin(), items.end(), item), items.end());
+  }
+  EXPECT_TRUE(items.empty());
+}
+
+TEST_F(PrioritizedQueueTest, remove_by_class) {
+  const unsigned min_cost = 1;
+  const unsigned max_tokens_per_subqueue = 50;
+  PQ pq(max_tokens_per_subqueue, min_cost);
+  const Klass class_to_remove(2);
+  unsigned num_to_remove = 0;
+  for (int i = 0; i < item_size; i++) {
+    const Item& item = items[i];
+    Klass k = ITEM_TO_CLASS(item);
+    pq.enqueue(k, 0, 0, item);
+    if (k == class_to_remove) {
+      num_to_remove++;
+    }
+  }
+  std::list<Item> removed;
+  pq.remove_by_class(class_to_remove, &removed);
+
+  // see if the removed items are expected ones.
+  for (std::list<Item>::iterator it = removed.begin();
+       it != removed.end();
+       ++it) {
+    const Item& item = *it;
+    Klass k = ITEM_TO_CLASS(item);
+    EXPECT_EQ(class_to_remove, k);
+    items.erase(remove(items.begin(), items.end(), item), items.end());
+  }
+  EXPECT_EQ(num_to_remove, removed.size());
+  EXPECT_EQ(item_size - num_to_remove, pq.length());
+  EXPECT_EQ(item_size - num_to_remove, items.size());
+  // see if the remainder are expeceted also.
+  while (!pq.empty()) {
+    const Item item = pq.dequeue();
+    Klass k = ITEM_TO_CLASS(item);
+    EXPECT_NE(class_to_remove, k);
+    items.erase(remove(items.begin(), items.end(), item), items.end());
+  }
+  EXPECT_TRUE(items.empty());
+}
diff --git a/src/test/common/test_shared_cache.cc b/src/test/common/test_shared_cache.cc
index fb3a920..09f6fb1 100644
--- a/src/test/common/test_shared_cache.cc
+++ b/src/test/common/test_shared_cache.cc
@@ -28,13 +28,11 @@
 #include "global/global_init.h"
 #include <gtest/gtest.h>
 
-using namespace std::tr1;
-
 class SharedLRUTest : public SharedLRU<unsigned int, int> {
 public:
   Mutex &get_lock() { return lock; }
   Cond &get_cond() { return cond; }
-  map<unsigned int, pair< weak_ptr<int>, int* > > &get_weak_refs() {
+  map<unsigned int, pair< ceph::weak_ptr<int>, int* > > &get_weak_refs() {
     return weak_refs;
   }
 };
@@ -47,7 +45,7 @@ public:
     SharedLRUTest &cache;
     unsigned int key;
     int value;
-    shared_ptr<int> ptr;
+    ceph::shared_ptr<int> ptr;
     enum in_method_t { LOOKUP, LOWER_BOUND } in_method;
 
     Thread_wait(SharedLRUTest& _cache, unsigned int _key, 
@@ -63,7 +61,7 @@ public:
         ptr = cache.lower_bound(key);
         break;
       case LOOKUP:
-        ptr = shared_ptr<int>(new int);
+        ptr = ceph::shared_ptr<int>(new int);
         *ptr = value;
         ptr = cache.lookup(key);
         break;
@@ -107,13 +105,13 @@ TEST_F(SharedLRU_all, add) {
   int value1 = 2;
   bool existed = false;
   {
-    shared_ptr<int> ptr = cache.add(key, new int(value1), &existed);
+    ceph::shared_ptr<int> ptr = cache.add(key, new int(value1), &existed);
     ASSERT_EQ(value1, *ptr);
     ASSERT_FALSE(existed);
   }
   {
     int value2 = 3;
-    shared_ptr<int> ptr = cache.add(key, new int(value2), &existed);
+    ceph::shared_ptr<int> ptr = cache.add(key, new int(value2), &existed);
     ASSERT_EQ(value1, *ptr);
     ASSERT_TRUE(existed);
   }
@@ -121,12 +119,12 @@ TEST_F(SharedLRU_all, add) {
 TEST_F(SharedLRU_all, empty) {
   SharedLRUTest cache;
   unsigned int key = 1;
-  int value1 = 2;
   bool existed = false;
 
   ASSERT_TRUE(cache.empty());
   {
-    shared_ptr<int> ptr = cache.add(key, new int(value1), &existed);
+    int value1 = 2;
+    ceph::shared_ptr<int> ptr = cache.add(key, new int(value1), &existed);
     ASSERT_EQ(value1, *ptr);
     ASSERT_FALSE(existed);
   }
@@ -141,28 +139,28 @@ TEST_F(SharedLRU_all, lookup) {
   unsigned int key = 1;
   {
     int value = 2;
-    ASSERT_TRUE(cache.add(key, new int(value)));
-    ASSERT_TRUE(cache.lookup(key));
+    ASSERT_TRUE(cache.add(key, new int(value)).get());
+    ASSERT_TRUE(cache.lookup(key).get());
     ASSERT_EQ(value, *cache.lookup(key));
   }
-  ASSERT_TRUE(cache.lookup(key));
+  ASSERT_TRUE(cache.lookup(key).get());
 }
 TEST_F(SharedLRU_all, lookup_or_create) {
   SharedLRUTest cache;
   {
     int value = 2;
     unsigned int key = 1;
-    ASSERT_TRUE(cache.add(key, new int(value)));
-    ASSERT_TRUE(cache.lookup_or_create(key));
+    ASSERT_TRUE(cache.add(key, new int(value)).get());
+    ASSERT_TRUE(cache.lookup_or_create(key).get());
     ASSERT_EQ(value, *cache.lookup(key));
   }
   {
     unsigned int key = 2;
-    ASSERT_TRUE(cache.lookup_or_create(key));
+    ASSERT_TRUE(cache.lookup_or_create(key).get());
     ASSERT_EQ(0, *cache.lookup(key));
   }
-  ASSERT_TRUE(cache.lookup(1));
-  ASSERT_TRUE(cache.lookup(2));
+  ASSERT_TRUE(cache.lookup(1).get());
+  ASSERT_TRUE(cache.lookup(2).get());
 }
 
 TEST_F(SharedLRU_all, wait_lookup) {
@@ -171,7 +169,7 @@ TEST_F(SharedLRU_all, wait_lookup) {
   int value = 2;
 
   {
-    shared_ptr<int> ptr(new int);
+    ceph::shared_ptr<int> ptr(new int);
     cache.get_weak_refs()[key] = make_pair(ptr, &*ptr);
   }
   EXPECT_FALSE(cache.get_weak_refs()[key].first.lock());
@@ -197,7 +195,7 @@ TEST_F(SharedLRU_all, wait_lookup_or_create) {
   int value = 2;
 
   {
-    shared_ptr<int> ptr(new int);
+    ceph::shared_ptr<int> ptr(new int);
     cache.get_weak_refs()[key] = make_pair(ptr, &*ptr);
   }
   EXPECT_FALSE(cache.get_weak_refs()[key].first.lock());
@@ -207,7 +205,7 @@ TEST_F(SharedLRU_all, wait_lookup_or_create) {
   ASSERT_TRUE(wait_for(cache, 1));
   EXPECT_EQ(value, *t.ptr);
   // waiting on a key does not block lookups on other keys
-  EXPECT_TRUE(cache.lookup_or_create(key + 12345));
+  EXPECT_TRUE(cache.lookup_or_create(key + 12345).get());
   {
     Mutex::Locker l(cache.get_lock());
     cache.get_weak_refs().erase(key);
@@ -226,8 +224,8 @@ TEST_F(SharedLRU_all, lower_bound) {
     ASSERT_FALSE(cache.lower_bound(key));
     int value = 2;
 
-    ASSERT_TRUE(cache.add(key, new int(value)));
-    ASSERT_TRUE(cache.lower_bound(key));
+    ASSERT_TRUE(cache.add(key, new int(value)).get());
+    ASSERT_TRUE(cache.lower_bound(key).get());
     EXPECT_EQ(value, *cache.lower_bound(key));
   }
 }
@@ -239,10 +237,10 @@ TEST_F(SharedLRU_all, wait_lower_bound) {
   unsigned int other_key = key + 1;
   int other_value = value + 1;
 
-  ASSERT_TRUE(cache.add(other_key, new int(other_value)));
+  ASSERT_TRUE(cache.add(other_key, new int(other_value)).get());
 
   {
-    shared_ptr<int> ptr(new int);
+    ceph::shared_ptr<int> ptr(new int);
     cache.get_weak_refs()[key] = make_pair(ptr, &*ptr);
   }
   EXPECT_FALSE(cache.get_weak_refs()[key].first.lock());
@@ -252,7 +250,7 @@ TEST_F(SharedLRU_all, wait_lower_bound) {
   ASSERT_TRUE(wait_for(cache, 1));
   EXPECT_FALSE(t.ptr);
   // waiting on a key does not block getting lower_bound on other keys
-  EXPECT_TRUE(cache.lower_bound(other_key));
+  EXPECT_TRUE(cache.lower_bound(other_key).get());
   {
     Mutex::Locker l(cache.get_lock());
     cache.get_weak_refs().erase(key);
@@ -260,7 +258,7 @@ TEST_F(SharedLRU_all, wait_lower_bound) {
   }
   ASSERT_TRUE(wait_for(cache, 0));
   t.join();
-  EXPECT_TRUE(t.ptr);
+  EXPECT_TRUE(t.ptr.get());
 }
 TEST_F(SharedLRU_all, get_next) {
 
@@ -274,15 +272,15 @@ TEST_F(SharedLRU_all, get_next) {
     SharedLRUTest cache;
 
     const unsigned int key2 = 333;
-    shared_ptr<int> ptr2 = cache.lookup_or_create(key2);
+    ceph::shared_ptr<int> ptr2 = cache.lookup_or_create(key2);
     const int value2 = *ptr2 = 400;
 
     // entries with expired pointers are silently ignored
     const unsigned int key_gone = 222;
-    cache.get_weak_refs()[key_gone] = make_pair(shared_ptr<int>(), (int*)0);
+    cache.get_weak_refs()[key_gone] = make_pair(ceph::shared_ptr<int>(), (int*)0);
 
     const unsigned int key1 = 111;
-    shared_ptr<int> ptr1 = cache.lookup_or_create(key1);
+    ceph::shared_ptr<int> ptr1 = cache.lookup_or_create(key1);
     const int value1 = *ptr1 = 800;
 
     pair<unsigned int, int> i;
@@ -301,11 +299,11 @@ TEST_F(SharedLRU_all, get_next) {
   {
     SharedLRUTest cache;
     const unsigned int key1 = 111;
-    shared_ptr<int> *ptr1 = new shared_ptr<int>(cache.lookup_or_create(key1));
+    ceph::shared_ptr<int> *ptr1 = new shared_ptr<int>(cache.lookup_or_create(key1));
     const unsigned int key2 = 222;
-    shared_ptr<int> ptr2 = cache.lookup_or_create(key2);
+    ceph::shared_ptr<int> ptr2 = cache.lookup_or_create(key2);
 
-    pair<unsigned int, shared_ptr<int> > i;
+    pair<unsigned int, ceph::shared_ptr<int> > i;
     EXPECT_TRUE(cache.get_next(i.first, &i));
     EXPECT_EQ(key1, i.first);
     delete ptr1;
@@ -322,14 +320,14 @@ TEST_F(SharedLRU_all, clear) {
     ceph::shared_ptr<int> ptr = cache.add(key, new int(value));
     ASSERT_EQ(value, *cache.lookup(key));
   }
-  ASSERT_TRUE(cache.lookup(key));
+  ASSERT_TRUE(cache.lookup(key).get());
   cache.clear(key);
   ASSERT_FALSE(cache.lookup(key));
 
   {
     ceph::shared_ptr<int> ptr = cache.add(key, new int(value));
   }
-  ASSERT_TRUE(cache.lookup(key));
+  ASSERT_TRUE(cache.lookup(key).get());
   cache.clear(key);
   ASSERT_FALSE(cache.lookup(key));
 }
@@ -341,14 +339,14 @@ TEST_F(SharedLRU_all, clear_all) {
     ceph::shared_ptr<int> ptr = cache.add(key, new int(value));
     ASSERT_EQ(value, *cache.lookup(key));
   }
-  ASSERT_TRUE(cache.lookup(key));
+  ASSERT_TRUE(cache.lookup(key).get());
   cache.clear();
   ASSERT_FALSE(cache.lookup(key));
 
   ceph::shared_ptr<int> ptr2 = cache.add(key, new int(value));
-  ASSERT_TRUE(cache.lookup(key));
+  ASSERT_TRUE(cache.lookup(key).get());
   cache.clear();
-  ASSERT_TRUE(cache.lookup(key));
+  ASSERT_TRUE(cache.lookup(key).get());
   ASSERT_FALSE(cache.empty());
 }
 
@@ -356,7 +354,7 @@ TEST(SharedCache_all, add) {
   SharedLRU<int, int> cache;
   unsigned int key = 1;
   int value = 2;
-  shared_ptr<int> ptr = cache.add(key, new int(value));
+  ceph::shared_ptr<int> ptr = cache.add(key, new int(value));
   ASSERT_EQ(ptr, cache.lookup(key));
   ASSERT_EQ(value, *cache.lookup(key));
 }
@@ -366,11 +364,11 @@ TEST(SharedCache_all, lru) {
   SharedLRU<int, int> cache(NULL, SIZE);
 
   bool existed = false;
-  shared_ptr<int> ptr = cache.add(0, new int(0), &existed);
+  ceph::shared_ptr<int> ptr = cache.add(0, new int(0), &existed);
   ASSERT_FALSE(existed);
   {
     int *tmpint = new int(0);
-    shared_ptr<int> ptr2 = cache.add(0, tmpint, &existed);
+    ceph::shared_ptr<int> ptr2 = cache.add(0, tmpint, &existed);
     ASSERT_TRUE(existed);
     delete tmpint;
   }
@@ -379,20 +377,20 @@ TEST(SharedCache_all, lru) {
     ASSERT_FALSE(existed);
   }
 
-  ASSERT_TRUE(cache.lookup(0));
+  ASSERT_TRUE(cache.lookup(0).get());
   ASSERT_EQ(0, *cache.lookup(0));
 
   ASSERT_FALSE(cache.lookup(SIZE-1));
   ASSERT_FALSE(cache.lookup(SIZE));
-  ASSERT_TRUE(cache.lookup(SIZE+1));
+  ASSERT_TRUE(cache.lookup(SIZE+1).get());
   ASSERT_EQ((int)SIZE+1, *cache.lookup(SIZE+1));
 
   cache.purge(0);
   ASSERT_FALSE(cache.lookup(0));
-  shared_ptr<int> ptr2 = cache.add(0, new int(0), &existed);
+  ceph::shared_ptr<int> ptr2 = cache.add(0, new int(0), &existed);
   ASSERT_FALSE(ptr == ptr2);
-  ptr = shared_ptr<int>();
-  ASSERT_TRUE(cache.lookup(0));
+  ptr = ceph::shared_ptr<int>();
+  ASSERT_TRUE(cache.lookup(0).get());
 }
 
 int main(int argc, char **argv) {
diff --git a/src/test/common/test_sharedptr_registry.cc b/src/test/common/test_sharedptr_registry.cc
index 9c9e89f..42bc8e6 100644
--- a/src/test/common/test_sharedptr_registry.cc
+++ b/src/test/common/test_sharedptr_registry.cc
@@ -27,12 +27,10 @@
 #include "global/global_init.h"
 #include <gtest/gtest.h>
 
-using namespace std::tr1;
-
 class SharedPtrRegistryTest : public SharedPtrRegistry<unsigned int, int> {
 public:
   Mutex &get_lock() { return lock; }
-  map<unsigned int, pair<weak_ptr<int>, int*> > &get_contents() {
+  map<unsigned int, pair<ceph::weak_ptr<int>, int*> > &get_contents() {
     return contents;
   }
 };
@@ -45,7 +43,7 @@ public:
     SharedPtrRegistryTest ®istry;
     unsigned int key;
     int value;
-    shared_ptr<int> ptr;
+    ceph::shared_ptr<int> ptr;
     enum in_method_t { LOOKUP, LOOKUP_OR_CREATE } in_method;
 
     Thread_wait(SharedPtrRegistryTest& _registry, unsigned int _key, int _value, in_method_t _in_method) : 
@@ -65,7 +63,7 @@ public:
 	  ptr = registry.lookup_or_create(key);
 	break;
       case LOOKUP:
-	ptr = shared_ptr<int>(new int);
+	ptr = ceph::shared_ptr<int>(new int);
 	*ptr = value;
 	ptr = registry.lookup(key);
 	break;
@@ -105,7 +103,7 @@ TEST_F(SharedPtrRegistry_all, lookup_or_create) {
   SharedPtrRegistryTest registry;
   unsigned int key = 1;
   int value = 2;
-  shared_ptr<int> ptr = registry.lookup_or_create(key);
+  ceph::shared_ptr<int> ptr = registry.lookup_or_create(key);
   *ptr = value;
   ASSERT_EQ(value, *registry.lookup_or_create(key));
 }
@@ -126,7 +124,7 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) {
   {
     unsigned int key = 1;
     {
-      shared_ptr<int> ptr(new int);
+      ceph::shared_ptr<int> ptr(new int);
       registry.get_contents()[key] = make_pair(ptr, ptr.get());
     }
     EXPECT_FALSE(registry.get_contents()[key].first.lock());
@@ -136,17 +134,17 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) {
     ASSERT_TRUE(wait_for(registry, 1));
     EXPECT_FALSE(t.ptr);
     // waiting on a key does not block lookups on other keys
-    EXPECT_TRUE(registry.lookup_or_create(key + 12345));
+    EXPECT_TRUE(registry.lookup_or_create(key + 12345).get());
     registry.remove(key);
     ASSERT_TRUE(wait_for(registry, 0));
     t.join();
-    EXPECT_TRUE(t.ptr);
+    EXPECT_TRUE(t.ptr.get());
   }
   {
     unsigned int key = 2;
     int value = 3;
     {
-      shared_ptr<int> ptr(new int);
+      ceph::shared_ptr<int> ptr(new int);
       registry.get_contents()[key] = make_pair(ptr, ptr.get());
     }
     EXPECT_FALSE(registry.get_contents()[key].first.lock());
@@ -159,14 +157,14 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) {
     {
       int other_value = value + 1;
       unsigned int other_key = key + 1;
-      shared_ptr<int> ptr = registry.lookup_or_create<int>(other_key, other_value);
-      EXPECT_TRUE(ptr);
+      ceph::shared_ptr<int> ptr = registry.lookup_or_create<int>(other_key, other_value);
+      EXPECT_TRUE(ptr.get());
       EXPECT_EQ(other_value, *ptr);
     }
     registry.remove(key);
     ASSERT_TRUE(wait_for(registry, 0));
     t.join();
-    EXPECT_TRUE(t.ptr);
+    EXPECT_TRUE(t.ptr.get());
     EXPECT_EQ(value, *t.ptr);
   }
 }
@@ -175,7 +173,7 @@ TEST_F(SharedPtrRegistry_all, lookup) {
   SharedPtrRegistryTest registry;
   unsigned int key = 1;
   {
-    shared_ptr<int> ptr = registry.lookup_or_create(key);
+    ceph::shared_ptr<int> ptr = registry.lookup_or_create(key);
     int value = 2;
     *ptr = value;
     ASSERT_EQ(value, *registry.lookup(key));
@@ -189,7 +187,7 @@ TEST_F(SharedPtrRegistry_all, wait_lookup) {
   unsigned int key = 1;
   int value = 2;
   {
-    shared_ptr<int> ptr(new int);
+    ceph::shared_ptr<int> ptr(new int);
     registry.get_contents()[key] = make_pair(ptr, ptr.get());
   }
   EXPECT_FALSE(registry.get_contents()[key].first.lock());
@@ -218,15 +216,15 @@ TEST_F(SharedPtrRegistry_all, get_next) {
     SharedPtrRegistryTest registry;
 
     const unsigned int key2 = 333;
-    shared_ptr<int> ptr2 = registry.lookup_or_create(key2);
+    ceph::shared_ptr<int> ptr2 = registry.lookup_or_create(key2);
     const int value2 = *ptr2 = 400;
 
     // entries with expired pointers are silentely ignored
     const unsigned int key_gone = 222;
-    registry.get_contents()[key_gone] = make_pair(shared_ptr<int>(), (int*)0);
+    registry.get_contents()[key_gone] = make_pair(ceph::shared_ptr<int>(), (int*)0);
 
     const unsigned int key1 = 111;
-    shared_ptr<int> ptr1 = registry.lookup_or_create(key1);
+    ceph::shared_ptr<int> ptr1 = registry.lookup_or_create(key1);
     const int value1 = *ptr1 = 800;
 
     pair<unsigned int, int> i;
@@ -247,11 +245,11 @@ TEST_F(SharedPtrRegistry_all, get_next) {
     //
     SharedPtrRegistryTest registry;
     const unsigned int key1 = 111;
-    shared_ptr<int> *ptr1 = new shared_ptr<int>(registry.lookup_or_create(key1));
+    ceph::shared_ptr<int> *ptr1 = new ceph::shared_ptr<int>(registry.lookup_or_create(key1));
     const unsigned int key2 = 222;
-    shared_ptr<int> ptr2 = registry.lookup_or_create(key2);
+    ceph::shared_ptr<int> ptr2 = registry.lookup_or_create(key2);
     
-    pair<unsigned int, shared_ptr<int> > i;
+    pair<unsigned int, ceph::shared_ptr<int> > i;
     EXPECT_TRUE(registry.get_next(i.first, &i));
     EXPECT_EQ(key1, i.first);
     delete ptr1;
@@ -264,15 +262,15 @@ TEST_F(SharedPtrRegistry_all, remove) {
   {
     SharedPtrRegistryTest registry;
     const unsigned int key1 = 1;
-    shared_ptr<int> ptr1 = registry.lookup_or_create(key1);
+    ceph::shared_ptr<int> ptr1 = registry.lookup_or_create(key1);
     *ptr1 = 400;
     registry.remove(key1);
 
-    shared_ptr<int> ptr2 = registry.lookup_or_create(key1);
+    ceph::shared_ptr<int> ptr2 = registry.lookup_or_create(key1);
     *ptr2 = 500;
 
-    ptr1 = shared_ptr<int>();
-    shared_ptr<int> res = registry.lookup(key1);
+    ptr1 = ceph::shared_ptr<int>();
+    ceph::shared_ptr<int> res = registry.lookup(key1);
     assert(res);
     assert(res == ptr2);
     assert(*res == 500);
@@ -280,13 +278,13 @@ TEST_F(SharedPtrRegistry_all, remove) {
   {
     SharedPtrRegistryTest registry;
     const unsigned int key1 = 1;
-    shared_ptr<int> ptr1 = registry.lookup_or_create(key1, 400);
+    ceph::shared_ptr<int> ptr1 = registry.lookup_or_create(key1, 400);
     registry.remove(key1);
 
-    shared_ptr<int> ptr2 = registry.lookup_or_create(key1, 500);
+    ceph::shared_ptr<int> ptr2 = registry.lookup_or_create(key1, 500);
 
-    ptr1 = shared_ptr<int>();
-    shared_ptr<int> res = registry.lookup(key1);
+    ptr1 = ceph::shared_ptr<int>();
+    ceph::shared_ptr<int> res = registry.lookup(key1);
     assert(res);
     assert(res == ptr2);
     assert(*res == 500);
@@ -318,9 +316,9 @@ TEST_F(SharedPtrRegistry_destructor, destructor) {
   EXPECT_EQ(UNDEFINED, died);
   int key = 101;
   {
-    shared_ptr<TellDie> a = registry.lookup_or_create(key);
+    ceph::shared_ptr<TellDie> a = registry.lookup_or_create(key);
     EXPECT_EQ(NO, died);
-    EXPECT_TRUE(a);
+    EXPECT_TRUE(a.get());
   }
   EXPECT_EQ(YES, died);
   EXPECT_FALSE(registry.lookup(key));
diff --git a/src/test/common/test_str_map.cc b/src/test/common/test_str_map.cc
index b1a27c8..5a324ba 100644
--- a/src/test/common/test_str_map.cc
+++ b/src/test/common/test_str_map.cc
@@ -56,6 +56,14 @@ TEST(str_map, plaintext) {
     ASSERT_EQ(0, get_str_map("", &str_map));
     ASSERT_EQ(0u, str_map.size());
   }
+  {
+    map<string,string> str_map;
+    ASSERT_EQ(0, get_str_map(" key1=val1; key2=\tval2; key3\t = \t val3; \n ", "\n;", &str_map));
+    ASSERT_EQ(4u, str_map.size());
+    ASSERT_EQ("val1", str_map["key1"]);
+    ASSERT_EQ("val2", str_map["key2"]);
+    ASSERT_EQ("val3", str_map["key3"]);
+  }
 }
 
 /* 
diff --git a/src/test/container-make-check-ubuntu-14.04.sh b/src/test/container-make-check-ubuntu-14.04.sh
deleted file mode 100755
index d9eaa2f..0000000
--- a/src/test/container-make-check-ubuntu-14.04.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-source test/docker-test-helper.sh
-main_docker "$@" --os-type ubuntu --os-version 14.04 --dev -- ./run-make-check.sh --enable-root-make-check
diff --git a/src/test/crypto.cc b/src/test/crypto.cc
index 24d5c5a..17e90d0 100644
--- a/src/test/crypto.cc
+++ b/src/test/crypto.cc
@@ -3,7 +3,10 @@
 
 #include "include/types.h"
 #include "auth/Crypto.h"
+#include "common/Clock.h"
 #include "common/ceph_crypto.h"
+#include "common/ceph_context.h"
+#include "global/global_context.h"
 
 #include "test/unit.h"
 
@@ -52,7 +55,9 @@ TEST(AES, Encrypt) {
 
   bufferlist cipher;
   std::string error;
-  h->encrypt(secret, plaintext, cipher, error);
+  CryptoKeyHandler *kh = h->get_key_handler(secret, error);
+  int r = kh->encrypt(plaintext, cipher, &error);
+  ASSERT_EQ(r, 0);
   ASSERT_EQ(error, "");
 
   unsigned char want_cipher[] = {
@@ -96,7 +101,9 @@ TEST(AES, Decrypt) {
 
   std::string error;
   bufferlist plaintext;
-  h->decrypt(secret, cipher, plaintext, error);
+  CryptoKeyHandler *kh = h->get_key_handler(secret, error);
+  int r = kh->decrypt(cipher, plaintext, &error);
+  ASSERT_EQ(r, 0);
   ASSERT_EQ(error, "");
 
   ASSERT_EQ(sizeof(plaintext_s), plaintext.length());
@@ -128,7 +135,9 @@ TEST(AES, Loop) {
       CryptoHandler *h = g_ceph_context->get_crypto_handler(CEPH_CRYPTO_AES);
 
       std::string error;
-      h->encrypt(secret, plaintext, cipher, error);
+      CryptoKeyHandler *kh = h->get_key_handler(secret, error);
+      int r = kh->encrypt(plaintext, cipher, &error);
+      ASSERT_EQ(r, 0);
       ASSERT_EQ(error, "");
     }
     plaintext.clear();
@@ -136,7 +145,9 @@ TEST(AES, Loop) {
     {
       CryptoHandler *h = g_ceph_context->get_crypto_handler(CEPH_CRYPTO_AES);
       std::string error;
-      h->decrypt(secret, cipher, plaintext, error);
+      CryptoKeyHandler *ckh = h->get_key_handler(secret, error);
+      int r = ckh->decrypt(cipher, plaintext, &error);
+      ASSERT_EQ(r, 0);
       ASSERT_EQ(error, "");
     }
   }
@@ -146,3 +157,28 @@ TEST(AES, Loop) {
   err = memcmp(plaintext_s, orig_plaintext_s, sizeof(orig_plaintext_s));
   ASSERT_EQ(0, err);
 }
+
+TEST(AES, LoopKey) {
+  bufferptr k(16);
+  get_random_bytes(k.c_str(), k.length());
+  CryptoKey key(CEPH_CRYPTO_AES, ceph_clock_now(NULL), k);
+
+  bufferlist data;
+  bufferptr r(128);
+  get_random_bytes(r.c_str(), r.length());
+  data.append(r);
+
+  utime_t start = ceph_clock_now(NULL);
+  int n = 100000;
+
+  for (int i=0; i<n; ++i) {
+    bufferlist encoded;
+    string error;
+    int r = key.encrypt(g_ceph_context, data, encoded, &error);
+    ASSERT_EQ(r, 0);
+  }
+
+  utime_t end = ceph_clock_now(NULL);
+  utime_t dur = end - start;
+  cout << n << " encoded in " << dur << std::endl;
+}
diff --git a/src/test/debian-jessie/install-deps.sh b/src/test/debian-jessie/install-deps.sh
index 129b238..1bebf09 100755
--- a/src/test/debian-jessie/install-deps.sh
+++ b/src/test/debian-jessie/install-deps.sh
@@ -1,8 +1,8 @@
-#!/bin/bash
+#!/bin/bash -e
 #
 # Ceph distributed storage system
 #
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -23,10 +23,14 @@ if test -f /etc/redhat-release ; then
     $SUDO yum install -y redhat-lsb-core
 fi
 
-if which apt-get > /dev/null ; then
+if type apt-get > /dev/null 2>&1 ; then
     $SUDO apt-get install -y lsb-release
 fi
 
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
 case $(lsb_release -si) in
 Ubuntu|Debian|Devuan)
         $SUDO apt-get install -y dpkg-dev
@@ -38,30 +42,106 @@ Ubuntu|Debian|Devuan)
         packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
             perl -p -e 's/.*Unmet build dependencies: *//;' \
             -e 's/build-essential:native/build-essential/;' \
-            -e 's/\|//g;' \
+            -e 's/\s*\|\s*/\|/g;' \
             -e 's/\(.*?\)//g;' \
             -e 's/ +/\n/g;' | sort)
         case $(lsb_release -sc) in
             squeeze|wheezy)
                 packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
                 ;;
         esac
         packages=$(echo $packages) # change newlines into spaces
-        $SUDO bash -c "DEBIAN_FRONTEND=noninteractive apt-get install -y $packages"
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
         ;;
-CentOS|Fedora|SUSE*|RedHatEnterpriseServer)
+CentOS|Fedora|RedHatEnterpriseServer)
         case $(lsb_release -si) in
-            SUSE*)
-                $SUDO zypper -y yum-utils
+            Fedora)
+                $SUDO yum install -y yum-utils
                 ;;
-            *)
+            CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
-        $SUDO yum-builddep -y $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
         ;;
 *)
         echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
         ;;
 esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/src/test/encoding.cc b/src/test/encoding.cc
index 2662c24..4f2b26c 100644
--- a/src/test/encoding.cc
+++ b/src/test/encoding.cc
@@ -188,11 +188,38 @@ TEST(EncodingRoundTrip, MultimapConstructorCounter) {
 
   EXPECT_EQ(my_key_t::get_default_ctor(), 5);
   EXPECT_EQ(my_key_t::get_one_arg_ctor(), 0);
-  EXPECT_EQ(my_key_t::get_copy_ctor(), 10);
+  EXPECT_EQ(my_key_t::get_copy_ctor(), 5);
   EXPECT_EQ(my_key_t::get_assigns(), 0);
 
   EXPECT_EQ(my_val_t::get_default_ctor(), 5);
   EXPECT_EQ(my_val_t::get_one_arg_ctor(), 0);
-  EXPECT_EQ(my_val_t::get_copy_ctor(), 10);
+  EXPECT_EQ(my_val_t::get_copy_ctor(), 5);
   EXPECT_EQ(my_val_t::get_assigns(), 0);
 }
+
+const char* expected_what[] = {
+  "buffer::malformed_input: void lame_decoder(int) unknown encoding version > 100",
+  "buffer::malformed_input: void lame_decoder(int) no longer understand old encoding version < 100",
+  "buffer::malformed_input: void lame_decoder(int) decode past end of struct encoding",
+};
+
+void lame_decoder(int which) {
+  switch (which) {
+  case 0:
+    throw buffer::malformed_input(DECODE_ERR_VERSION(__PRETTY_FUNCTION__, 100));
+  case 1:
+    throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, 100));
+  case 2:
+    throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__));
+  }
+}
+
+TEST(EncodingException, Macros) {
+  for (unsigned i = 0; i < sizeof(expected_what)/sizeof(expected_what[0]); i++) {
+    try {
+      lame_decoder(i);
+    } catch (const exception& e) {
+      ASSERT_EQ(string(expected_what[i]), string(e.what()));
+    }
+  }
+}
diff --git a/src/test/encoding/ceph_dencoder.cc b/src/test/encoding/ceph_dencoder.cc
index 7e10565..7a30ebd 100644
--- a/src/test/encoding/ceph_dencoder.cc
+++ b/src/test/encoding/ceph_dencoder.cc
@@ -1,3 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
 #include <errno.h>
 #include "include/types.h"
 #include "ceph_ver.h"
@@ -10,19 +25,25 @@
 #include "include/assert.h"
 
 #define TYPE(t)
-#define TYPEWITHSTRAYDATA(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
 #define TYPE_FEATUREFUL(t)
 #define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
 #define TYPE_NOCOPY(t)
 #define MESSAGE(t)
 #include "types.h"
 #undef TYPE
-#undef TYPEWITHSTRAYDATA
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
 #undef TYPE_FEATUREFUL
 #undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
 #undef TYPE_NOCOPY
 #undef MESSAGE
 
+#define MB(m) ((m) * 1024 * 1024)
+
 void usage(ostream &out)
 {
   out << "usage: ceph-dencoder [commands ...]" << std::endl;
@@ -37,6 +58,7 @@ void usage(ostream &out)
   out << "\n";
   out << "  list_types          list supported types\n";
   out << "  type <classname>    select in-memory type\n";
+  out << "  skip <num>          skip <num> leading bytes before decoding\n";
   out << "  decode              decode into in-memory object\n";
   out << "  encode              encode in-memory object\n";
   out << "  dump_json           dump in-memory object as json (to stdout)\n";
@@ -46,6 +68,7 @@ void usage(ostream &out)
   out << "\n";
   out << "  count_tests         print number of generated test objects (to stdout)\n";
   out << "  select_test <n>     select generated test object as in-memory object\n";
+  out << "  is_deterministic    exit w/ success if type encodes deterministically\n";
 }
 struct Dencoder {
   virtual ~Dencoder() {}
@@ -61,6 +84,7 @@ struct Dencoder {
   virtual void generate() = 0;
   virtual int num_generated() = 0;
   virtual string select_generated(unsigned n) = 0;
+  virtual bool is_deterministic() = 0;
   //virtual void print(ostream& out) = 0;
 };
 
@@ -70,9 +94,13 @@ protected:
   T* m_object;
   list<T*> m_list;
   bool stray_okay;
+  bool nondeterministic;
 
 public:
-  DencoderBase(bool stray_okay) : m_object(new T), stray_okay(stray_okay) {}
+  DencoderBase(bool stray_okay, bool nondeterministic)
+    : m_object(new T),
+      stray_okay(stray_okay),
+      nondeterministic(nondeterministic) {}
   ~DencoderBase() {
     delete m_object;
   }
@@ -116,13 +144,17 @@ public:
     m_object = *p;
     return string();
   }
+
+  bool is_deterministic() {
+    return !nondeterministic;
+  }
 };
 
 template<class T>
 class DencoderImplNoFeatureNoCopy : public DencoderBase<T> {
 public:
-  DencoderImplNoFeatureNoCopy(bool stray_ok)
-    : DencoderBase<T>(stray_ok) {}
+  DencoderImplNoFeatureNoCopy(bool stray_ok, bool nondeterministic)
+    : DencoderBase<T>(stray_ok, nondeterministic) {}
   virtual void encode(bufferlist& out, uint64_t features) {
     out.clear();
     this->m_object->encode(out);
@@ -132,8 +164,8 @@ public:
 template<class T>
 class DencoderImplNoFeature : public DencoderImplNoFeatureNoCopy<T> {
 public:
-  DencoderImplNoFeature(bool stray_ok)
-    : DencoderImplNoFeatureNoCopy<T>(stray_ok) {}
+  DencoderImplNoFeature(bool stray_ok, bool nondeterministic)
+    : DencoderImplNoFeatureNoCopy<T>(stray_ok, nondeterministic) {}
   void copy() {
     T *n = new T;
     *n = *this->m_object;
@@ -150,7 +182,8 @@ public:
 template<class T>
 class DencoderImplFeatureful : public DencoderBase<T> {
 public:
-  DencoderImplFeatureful(bool stray_ok) : DencoderBase<T>(stray_ok) {}
+  DencoderImplFeatureful(bool stray_ok, bool nondeterministic)
+    : DencoderBase<T>(stray_ok, nondeterministic) {}
   virtual void encode(bufferlist& out, uint64_t features) {
     out.clear();
     ::encode(*(this->m_object), out, features);
@@ -223,6 +256,9 @@ public:
     m_object = *p;
     return string();
   }
+  bool is_deterministic() {
+    return true;
+  }
 
   //void print(ostream& out) {
   //out << m_object << std::endl;
@@ -238,17 +274,21 @@ int main(int argc, const char **argv)
 
 #define T_STR(x) #x
 #define T_STRINGIFY(x) T_STR(x)
-#define TYPE(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false);
-#define TYPEWITHSTRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(true);
-#define TYPE_FEATUREFUL(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false);
-#define TYPE_FEATUREFUL_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(true);
-#define TYPE_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeatureNoCopy<t>(false);
+#define TYPE(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false, false);
+#define TYPE_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(true, false);
+#define TYPE_NONDETERMINISTIC(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false, true);
+#define TYPE_FEATUREFUL(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false, false);
+#define TYPE_FEATUREFUL_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(true, false);
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false, true);
+#define TYPE_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeatureNoCopy<t>(false, false);
 #define MESSAGE(t) dencoders[T_STRINGIFY(t)] = new MessageDencoderImpl<t>;
 #include "types.h"
 #undef TYPE
-#undef TYPEWITHSTRAYDATA
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
 #undef TYPE_FEATUREFUL
 #undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
 #undef T_STR
 #undef T_STRINGIFY
 
@@ -357,7 +397,14 @@ int main(int argc, const char **argv)
 	usage(cerr);
 	exit(1);
       }
-      int r = encbl.read_file(*i, &err);
+      int r;
+      if (*i == string("-")) {
+        *i = "stdin";
+	// Read up to 1mb if stdin specified
+	r = encbl.read_fd(STDIN_FILENO, MB(1));
+      } else {
+	r = encbl.read_file(*i, &err);
+      }
       if (r < 0) {
         cerr << "error reading " << *i << ": " << err << std::endl;
         exit(1);
@@ -400,7 +447,12 @@ int main(int argc, const char **argv)
 	exit(1);
       }
       int n = atoi(*i);
-      err = den->select_generated(n);      
+      err = den->select_generated(n);
+    } else if (*i == string("is_deterministic")) {
+      if (den->is_deterministic())
+	exit(0);
+      else
+	exit(1);
     } else {
       cerr << "unknown option '" << *i << "'" << std::endl;
       usage(cerr);
diff --git a/src/test/encoding/check-generated.sh b/src/test/encoding/check-generated.sh
index a429c4b..7c33e54 100755
--- a/src/test/encoding/check-generated.sh
+++ b/src/test/encoding/check-generated.sh
@@ -29,38 +29,53 @@ for type in `./ceph-dencoder list_types`; do
 	./ceph-dencoder type $type select_test $n copy dump_json > $tmp3
 	./ceph-dencoder type $type select_test $n copy_ctor dump_json > $tmp4
 
+	# nondeterministic classes may dump nondeterministically.  compare
+	# the sorted json output.  this is a weaker test, but is better
+	# than nothing.
+	if ! ./ceph-dencoder type $type is_deterministic
+	then
+	    echo "  sorting json output for nondeterministic object"
+	    for f in $tmp1 $tmp2 $tmp3 $tmp4; do
+		sort $f | sed 's/,$//' > $f.new
+		mv $f.new $f
+	    done
+	fi
+
 	if ! cmp $tmp1 $tmp2; then
 	    echo "**** $type test $n dump_json check failed ****"
-	    echo "   ceph-dencoder type $type select_test $n dump_json > $tmp1"
-	    echo "   ceph-dencoder type $type select_test $n encode decode dump_json > $tmp2"
+	    echo "   ./ceph-dencoder type $type select_test $n dump_json > $tmp1"
+	    echo "   ./ceph-dencoder type $type select_test $n encode decode dump_json > $tmp2"
 	    echo "   diff $tmp1 $tmp2"
 	    failed=$(($failed + 1))
 	fi
 
 	if ! cmp $tmp1 $tmp3; then
 	    echo "**** $type test $n copy dump_json check failed ****"
-	    echo "   ceph-dencoder type $type select_test $n dump_json > $tmp1"
-	    echo "   ceph-dencoder type $type select_test $n copy dump_json > $tmp2"
+	    echo "   ./ceph-dencoder type $type select_test $n dump_json > $tmp1"
+	    echo "   ./ceph-dencoder type $type select_test $n copy dump_json > $tmp2"
 	    echo "   diff $tmp1 $tmp2"
 	    failed=$(($failed + 1))
 	fi
 
 	if ! cmp $tmp1 $tmp4; then
 	    echo "**** $type test $n copy_ctor dump_json check failed ****"
-	    echo "   ceph-dencoder type $type select_test $n dump_json > $tmp1"
-	    echo "   ceph-dencoder type $type select_test $n copy_ctor dump_json > $tmp2"
+	    echo "   ./ceph-dencoder type $type select_test $n dump_json > $tmp1"
+	    echo "   ./ceph-dencoder type $type select_test $n copy_ctor dump_json > $tmp2"
 	    echo "   diff $tmp1 $tmp2"
 	    failed=$(($failed + 1))
 	fi
 
-	./ceph-dencoder type $type select_test $n encode export $tmp1
-	./ceph-dencoder type $type select_test $n encode decode encode export $tmp2
-	if ! cmp $tmp1 $tmp2; then
-	    echo "**** $type test $n binary reencode check failed ****"
-	    echo "   ceph-dencoder type $type select_test $n encode export $tmp1"
-	    echo "   ceph-dencoder type $type select_test $n encode decode encode export $tmp2"
-	    echo "   cmp $tmp1 $tmp2"
-	    failed=$(($failed + 1))
+	if ./ceph-dencoder type $type is_deterministic
+	then
+	    ./ceph-dencoder type $type select_test $n encode export $tmp1
+	    ./ceph-dencoder type $type select_test $n encode decode encode export $tmp2
+	    if ! cmp $tmp1 $tmp2; then
+		echo "**** $type test $n binary reencode check failed ****"
+		echo "   ./ceph-dencoder type $type select_test $n encode export $tmp1"
+		echo "   ./ceph-dencoder type $type select_test $n encode decode encode export $tmp2"
+		echo "   cmp $tmp1 $tmp2"
+		failed=$(($failed + 1))
+	    fi
 	fi
 
 
diff --git a/src/test/encoding/readable.sh b/src/test/encoding/readable.sh
index 1f433a5..f387bd1 100755
--- a/src/test/encoding/readable.sh
+++ b/src/test/encoding/readable.sh
@@ -1,6 +1,6 @@
 #!/bin/sh -e
 
-dir=$1
+dir=../ceph-object-corpus
 
 set -e
 
@@ -59,6 +59,20 @@ do
 		    failed=$(($failed + 1))
 		    continue
 		fi
+
+		# nondeterministic classes may dump
+		# nondeterministically.  compare the sorted json
+		# output.  this is a weaker test, but is better than
+		# nothing.
+		if ! ./ceph-dencoder type $type is_deterministic
+		then
+		    echo "  sorting json output for nondeterministic object"
+		    for f in $tmp1 $tmp2; do
+			sort $f | sed 's/,$//' > $f.new
+			mv $f.new $f
+		    done
+		fi
+
 		if ! cmp $tmp1 $tmp2; then
 		    echo "**** reencode of $vdir/objects/$type/$f resulted in a different dump ****"
 		    diff $tmp1 $tmp2
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 21eefe7..34b0696 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -93,16 +93,16 @@ TYPE(PushReplyOp)
 TYPE(ECUtil::HashInfo)
 
 #include "osd/ECMsgTypes.h"
-TYPE(ECSubWrite)
+TYPE_NOCOPY(ECSubWrite)
 TYPE(ECSubWriteReply)
 TYPE_FEATUREFUL(ECSubRead)
 TYPE(ECSubReadReply)
 
 #include "osd/HitSet.h"
-TYPE(ExplicitHashHitSet)
-TYPE(ExplicitObjectHitSet)
+TYPE_NONDETERMINISTIC(ExplicitHashHitSet)
+TYPE_NONDETERMINISTIC(ExplicitObjectHitSet)
 TYPE(BloomHitSet)
-TYPE(HitSet)
+TYPE_NONDETERMINISTIC(HitSet)   // because some subclasses are
 TYPE(HitSet::Params)
 
 #include "os/ObjectStore.h"
@@ -120,7 +120,7 @@ TYPE(AuthMonitor::Incremental)
 
 #include "mon/PGMap.h"
 TYPE(PGMap::Incremental)
-TYPE(PGMap)
+TYPE_NONDETERMINISTIC(PGMap)
 
 #include "mon/MonitorDBStore.h"
 TYPE(MonitorDBStore::Transaction)
@@ -169,6 +169,7 @@ TYPE(cap_reconnect_t)
 TYPE(inode_backtrace_t)
 TYPE(inode_backpointer_t)
 TYPE(quota_info_t)
+TYPE(ceph_file_layout_wrapper)
 
 #include "mds/CInode.h"
 TYPE(InodeStore)
@@ -184,10 +185,7 @@ TYPE_NOCOPY(Capability)
 TYPE(InoTable)
 
 #include "mds/SnapServer.h"
-TYPEWITHSTRAYDATA(SnapServer)
-
-#include "mds/SessionMap.h"
-TYPE(SessionMapStore)
+TYPE_STRAYDATA(SnapServer)
 
 #include "mds/events/ECommitted.h"
 TYPE(ECommitted)
@@ -232,12 +230,20 @@ TYPE(EUpdate)
 TYPE(librbd::WatchNotify::NotifyMessage)
 TYPE(librbd::WatchNotify::ResponseMessage)
 
+#include "rbd_replay/ActionTypes.h"
+TYPE(rbd_replay::action::Dependency)
+TYPE(rbd_replay::action::ActionEntry);
+
 #ifdef WITH_RADOSGW
 
 #include "rgw/rgw_rados.h"
 TYPE(RGWObjManifestPart)
 TYPE(RGWObjManifest)
-
+TYPE(RGWOLHInfo)
+TYPE(RGWRegion)
+TYPE(RGWZone)
+TYPE(RGWZoneParams)     
+   
 #include "rgw/rgw_acl.h"
 TYPE(ACLPermission)
 TYPE(ACLGranteeType)
diff --git a/src/test/erasure-code/ErasureCodePluginExample.cc b/src/test/erasure-code/ErasureCodePluginExample.cc
index 7c8aaff..adbda83 100644
--- a/src/test/erasure-code/ErasureCodePluginExample.cc
+++ b/src/test/erasure-code/ErasureCodePluginExample.cc
@@ -23,10 +23,13 @@
 
 class ErasureCodePluginExample : public ErasureCodePlugin {
 public:
-  virtual int factory(const map<std::string,std::string> &parameters,
-                      ErasureCodeInterfaceRef *erasure_code)
+  virtual int factory(const std::string &directory,
+		      ErasureCodeProfile &profile,
+                      ErasureCodeInterfaceRef *erasure_code,
+		      ostream *ss)
   {
     *erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample());
+    (*erasure_code)->init(profile, ss);
     return 0;
   }
 };
diff --git a/src/test/erasure-code/Makefile.am b/src/test/erasure-code/Makefile.am
index 1a25630..ed0a014 100644
--- a/src/test/erasure-code/Makefile.am
+++ b/src/test/erasure-code/Makefile.am
@@ -2,7 +2,8 @@ if ENABLE_SERVER
 if WITH_OSD
 
 check_SCRIPTS += \
-	test/erasure-code/test-erasure-code.sh
+	test/erasure-code/test-erasure-code.sh \
+	test/erasure-code/test-erasure-eio.sh
 
 noinst_HEADERS += \
 	test/erasure-code/ceph_erasure_code_benchmark.h
@@ -34,7 +35,9 @@ ceph_erasure_code_LDADD += -ldl
 endif
 bin_DEBUGPROGRAMS += ceph_erasure_code
 
-libec_example_la_SOURCES = test/erasure-code/ErasureCodePluginExample.cc
+libec_example_la_SOURCES = \
+	erasure-code/ErasureCode.cc \
+	test/erasure-code/ErasureCodePluginExample.cc
 test/erasure-code/ErasureCodePluginExample.cc: ./ceph_ver.h
 libec_example_la_CFLAGS = ${AM_CFLAGS}
 libec_example_la_CXXFLAGS= ${AM_CXXFLAGS}
@@ -121,14 +124,14 @@ unittest_erasure_code_plugin_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(
 if LINUX
 unittest_erasure_code_plugin_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_plugin
+check_TESTPROGRAMS += unittest_erasure_code_plugin
 
 unittest_erasure_code_SOURCES = \
 	erasure-code/ErasureCode.cc \
 	test/erasure-code/TestErasureCode.cc
 unittest_erasure_code_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_erasure_code_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_erasure_code
+check_TESTPROGRAMS += unittest_erasure_code
 
 unittest_erasure_code_jerasure_SOURCES = \
 	test/erasure-code/TestErasureCodeJerasure.cc \
@@ -143,7 +146,7 @@ unittest_erasure_code_jerasure_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD)
 if LINUX
 unittest_erasure_code_jerasure_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_jerasure
+check_TESTPROGRAMS += unittest_erasure_code_jerasure
 
 unittest_erasure_code_plugin_jerasure_SOURCES = \
 	test/erasure-code/TestErasureCodePluginJerasure.cc
@@ -152,7 +155,7 @@ unittest_erasure_code_plugin_jerasure_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_
 if LINUX
 unittest_erasure_code_plugin_jerasure_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_plugin_jerasure
+check_TESTPROGRAMS += unittest_erasure_code_plugin_jerasure
 
 if WITH_BETTER_YASM_ELF64
 unittest_erasure_code_isa_SOURCES = \
@@ -163,7 +166,7 @@ unittest_erasure_code_isa_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEP
 if LINUX
 unittest_erasure_code_isa_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_isa
+check_TESTPROGRAMS += unittest_erasure_code_isa
 
 unittest_erasure_code_plugin_isa_SOURCES = \
 	erasure-code/ErasureCode.cc \
@@ -173,7 +176,7 @@ unittest_erasure_code_plugin_isa_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD
 if LINUX
 unittest_erasure_code_plugin_isa_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_plugin_isa
+check_TESTPROGRAMS += unittest_erasure_code_plugin_isa
 endif
 
 unittest_erasure_code_lrc_SOURCES = \
@@ -184,7 +187,7 @@ unittest_erasure_code_lrc_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEP
 if LINUX
 unittest_erasure_code_lrc_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_lrc
+check_TESTPROGRAMS += unittest_erasure_code_lrc
 
 unittest_erasure_code_plugin_lrc_SOURCES = \
 	test/erasure-code/TestErasureCodePluginLrc.cc
@@ -193,40 +196,124 @@ unittest_erasure_code_plugin_lrc_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD
 if LINUX
 unittest_erasure_code_plugin_lrc_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_plugin_lrc
+check_TESTPROGRAMS += unittest_erasure_code_plugin_lrc
 
 unittest_erasure_code_shec_SOURCES = \
 	test/erasure-code/TestErasureCodeShec.cc \
-	${libec_shec_la_SOURCES}
-unittest_erasure_code_shec_CFLAGS = ${libec_shec_la_CFLAGS}
-unittest_erasure_code_shec_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS)
+	${shec_sources}
+unittest_erasure_code_shec_CFLAGS = ${libec_shec_la_CFLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+unittest_erasure_code_shec_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
 unittest_erasure_code_shec_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 if LINUX
 unittest_erasure_code_shec_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_shec
+check_TESTPROGRAMS += unittest_erasure_code_shec
 
 unittest_erasure_code_shec_all_SOURCES = \
 	test/erasure-code/TestErasureCodeShec_all.cc \
-	${libec_shec_la_SOURCES}
-unittest_erasure_code_shec_all_CFLAGS = ${libec_shec_la_CFLAGS}
-unittest_erasure_code_shec_all_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS)
+	${shec_sources}
+unittest_erasure_code_shec_all_CFLAGS = ${libec_shec_la_CFLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+unittest_erasure_code_shec_all_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
 unittest_erasure_code_shec_all_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 if LINUX
 unittest_erasure_code_shec_all_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_shec_all
+check_TESTPROGRAMS += unittest_erasure_code_shec_all
 
 unittest_erasure_code_shec_thread_SOURCES = \
 	test/erasure-code/TestErasureCodeShec_thread.cc \
-	${libec_shec_la_SOURCES}
-unittest_erasure_code_shec_thread_CFLAGS = ${libec_shec_la_CFLAGS}
-unittest_erasure_code_shec_thread_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS)
+	${shec_sources}
+unittest_erasure_code_shec_thread_CFLAGS = ${libec_shec_la_CFLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+unittest_erasure_code_shec_thread_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
 unittest_erasure_code_shec_thread_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
 if LINUX
 unittest_erasure_code_shec_thread_LDADD += -ldl
 endif
-check_PROGRAMS += unittest_erasure_code_shec_thread
+check_TESTPROGRAMS += unittest_erasure_code_shec_thread
+
+unittest_erasure_code_shec_arguments_SOURCES = \
+	test/erasure-code/TestErasureCodeShec_arguments.cc \
+	${shec_sources}
+unittest_erasure_code_shec_arguments_CFLAGS = ${libec_shec_la_CFLAGS} \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+unittest_erasure_code_shec_arguments_CXXFLAGS = ${libec_shec_la_CXXFLAGS} $(UNITTEST_CXXFLAGS) \
+	-I$(srcdir)/erasure-code/jerasure/jerasure/include \
+	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
+	-I$(srcdir)/erasure-code/jerasure \
+	-I$(srcdir)/erasure-code/shec
+unittest_erasure_code_shec_arguments_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+if LINUX
+unittest_erasure_code_shec_arguments_LDADD += -ldl
+endif
+check_TESTPROGRAMS += unittest_erasure_code_shec_arguments
+
+unittest_erasure_code_plugin_shec_SOURCES = \
+        test/erasure-code/TestErasureCodePluginShec.cc
+unittest_erasure_code_plugin_shec_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
+unittest_erasure_code_plugin_shec_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+if LINUX
+unittest_erasure_code_plugin_shec_LDADD += -ldl
+endif
+check_TESTPROGRAMS += unittest_erasure_code_plugin_shec
+
+libec_test_shec_neon_la_SOURCES = test/erasure-code/TestShecPluginNEON.cc
+test/erasure-code/TestShecPluginNEON.cc: ./ceph_ver.h
+libec_test_shec_neon_la_CFLAGS = ${AM_CFLAGS}
+libec_test_shec_neon_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_test_shec_neon_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_test_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_test_shec_neon.la
+
+libec_test_shec_sse4_la_SOURCES = test/erasure-code/TestShecPluginSSE4.cc
+test/erasure-code/TestShecPluginSSE4.cc: ./ceph_ver.h
+libec_test_shec_sse4_la_CFLAGS = ${AM_CFLAGS}
+libec_test_shec_sse4_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_test_shec_sse4_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_test_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_test_shec_sse4.la
+
+libec_test_shec_sse3_la_SOURCES = test/erasure-code/TestShecPluginSSE3.cc
+test/erasure-code/TestShecPluginSSE3.cc: ./ceph_ver.h
+libec_test_shec_sse3_la_CFLAGS = ${AM_CFLAGS}
+libec_test_shec_sse3_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_test_shec_sse3_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_test_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_test_shec_sse3.la
+
+libec_test_shec_generic_la_SOURCES = test/erasure-code/TestShecPluginGeneric.cc
+test/erasure-code/TestShecPluginGeneric.cc: ./ceph_ver.h
+libec_test_shec_generic_la_CFLAGS = ${AM_CFLAGS}
+libec_test_shec_generic_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_test_shec_generic_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_test_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_test_shec_generic.la
 
 unittest_erasure_code_example_SOURCES = \
 	erasure-code/ErasureCode.cc \
@@ -234,7 +321,7 @@ unittest_erasure_code_example_SOURCES = \
 noinst_HEADERS += test/erasure-code/ErasureCodeExample.h
 unittest_erasure_code_example_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_erasure_code_example_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
-check_PROGRAMS += unittest_erasure_code_example
+check_TESTPROGRAMS += unittest_erasure_code_example
 
 endif # WITH_OSD
 endif # ENABLE_SERVER
diff --git a/src/test/erasure-code/TestErasureCode.cc b/src/test/erasure-code/TestErasureCode.cc
index 2c97261..4140403 100644
--- a/src/test/erasure-code/TestErasureCode.cc
+++ b/src/test/erasure-code/TestErasureCode.cc
@@ -20,6 +20,7 @@
 #include "erasure-code/ErasureCode.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 class ErasureCodeTest : public ErasureCode {
@@ -33,6 +34,10 @@ public:
     k(_k), m(_m), chunk_size(_chunk_size) {}
   virtual ~ErasureCodeTest() {}
 
+  virtual int init(ErasureCodeProfile &profile, ostream *ss) {
+    return 0;
+  }
+
   virtual unsigned int get_chunk_count() const { return k + m; }
   virtual unsigned int get_data_chunk_count() const { return k; }
   virtual unsigned int get_chunk_size(unsigned int object_size) const {
@@ -155,6 +160,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/src/test/erasure-code/TestErasureCodeExample.cc b/src/test/erasure-code/TestErasureCodeExample.cc
index a02d96e..64ef598 100644
--- a/src/test/erasure-code/TestErasureCodeExample.cc
+++ b/src/test/erasure-code/TestErasureCodeExample.cc
@@ -240,6 +240,8 @@ int main(int argc, char **argv) {
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/src/test/erasure-code/TestErasureCodeIsa.cc b/src/test/erasure-code/TestErasureCodeIsa.cc
index 526eb76..84ad68e 100644
--- a/src/test/erasure-code/TestErasureCodeIsa.cc
+++ b/src/test/erasure-code/TestErasureCodeIsa.cc
@@ -24,6 +24,7 @@
 #include "erasure-code/isa/xor_op.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 ErasureCodeIsaTableCache tcache;
@@ -50,10 +51,10 @@ void IsaErasureCodeTest::encode_decode(unsigned object_size)
 {
   ErasureCodeIsaDefault Isa(tcache);
 
-  map<std::string, std::string> parameters;
-  parameters["k"] = "2";
-  parameters["m"] = "2";
-  Isa.init(parameters);
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  Isa.init(profile, &cerr);
 
   string payload(object_size, 'X');
   bufferlist in;
@@ -190,10 +191,10 @@ TEST_F(IsaErasureCodeTest, encode_decode)
 TEST_F(IsaErasureCodeTest, minimum_to_decode)
 {
   ErasureCodeIsaDefault Isa(tcache);
-  map<std::string, std::string> parameters;
-  parameters["k"] = "2";
-  parameters["m"] = "2";
-  Isa.init(parameters);
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  Isa.init(profile, &cerr);
 
   //
   // If trying to read nothing, the minimum is empty.
@@ -287,10 +288,10 @@ TEST_F(IsaErasureCodeTest, chunk_size)
 {
   {
     ErasureCodeIsaDefault Isa(tcache);
-    map<std::string, std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "1";
-    Isa.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "1";
+    Isa.init(profile, &cerr);
     int k = 2;
 
     ASSERT_EQ(EC_ISA_ADDRESS_ALIGNMENT, Isa.get_chunk_size(1));
@@ -299,10 +300,10 @@ TEST_F(IsaErasureCodeTest, chunk_size)
   }
   {
     ErasureCodeIsaDefault Isa(tcache);
-    map<std::string, std::string> parameters;
-    parameters["k"] = "3";
-    parameters["m"] = "1";
-    Isa.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "3";
+    profile["m"] = "1";
+    Isa.init(profile, &cerr);
     int k = 3;
 
     ASSERT_EQ(EC_ISA_ADDRESS_ALIGNMENT, Isa.get_chunk_size(1));
@@ -320,10 +321,10 @@ TEST_F(IsaErasureCodeTest, chunk_size)
 TEST_F(IsaErasureCodeTest, encode)
 {
   ErasureCodeIsaDefault Isa(tcache);
-  map<std::string, std::string> parameters;
-  parameters["k"] = "2";
-  parameters["m"] = "2";
-  Isa.init(parameters);
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  Isa.init(profile, &cerr);
 
   unsigned aligned_object_size = Isa.get_alignment() * 2;
   {
@@ -366,6 +367,17 @@ TEST_F(IsaErasureCodeTest, encode)
   }
 }
 
+TEST_F(IsaErasureCodeTest, sanity_check_k)
+{
+  ErasureCodeIsaDefault Isa(tcache);
+  ErasureCodeProfile profile;
+  profile["k"] = "1";
+  profile["m"] = "1";
+  ostringstream errors;
+  EXPECT_EQ(-EINVAL, Isa.init(profile, &errors));
+  EXPECT_NE(std::string::npos, errors.str().find("must be >= 2"));
+}
+
 bool
 DecodeAndVerify(ErasureCodeIsaDefault& Isa, map<int, bufferlist> &degraded, set<int> want_to_decode, buffer::ptr* enc, int length)
 {
@@ -391,10 +403,10 @@ TEST_F(IsaErasureCodeTest, isa_vandermonde_exhaustive)
   // a (12,4) configuration using the vandermonde matrix
 
   ErasureCodeIsaDefault Isa(tcache);
-  map<std::string, std::string> parameters;
-  parameters["k"] = "12";
-  parameters["m"] = "4";
-  Isa.init(parameters);
+  ErasureCodeProfile profile;
+  profile["k"] = "12";
+  profile["m"] = "4";
+  Isa.init(profile, &cerr);
 
   int k = 12;
   int m = 4;
@@ -516,12 +528,12 @@ TEST_F(IsaErasureCodeTest, isa_cauchy_exhaustive)
   // Test all possible failure scenarios and reconstruction cases for
   // a (12,4) configuration using the cauchy matrix
   ErasureCodeIsaDefault Isa(tcache,ErasureCodeIsaDefault::kCauchy);
-  map<std::string, std::string> parameters;
-  parameters["k"] = "12";
-  parameters["m"] = "4";
-  parameters["technique"] = "cauchy";
+  ErasureCodeProfile profile;
+  profile["k"] = "12";
+  profile["m"] = "4";
+  profile["technique"] = "cauchy";
 
-  Isa.init(parameters);
+  Isa.init(profile, &cerr);
 
   int k = 12;
   int m = 4;
@@ -643,12 +655,12 @@ TEST_F(IsaErasureCodeTest, isa_cauchy_cache_trash)
   // Test all possible failure scenarios and reconstruction cases for
   // a (12,4) configuration using the cauchy matrix
   ErasureCodeIsaDefault Isa(tcache,ErasureCodeIsaDefault::kCauchy);
-  map<std::string, std::string> parameters;
-  parameters["k"] = "16";
-  parameters["m"] = "4";
-  parameters["technique"] = "cauchy";
+  ErasureCodeProfile profile;
+  profile["k"] = "16";
+  profile["m"] = "4";
+  profile["technique"] = "cauchy";
 
-  Isa.init(parameters);
+  Isa.init(profile, &cerr);
 
   int k = 16;
   int m = 4;
@@ -771,10 +783,10 @@ TEST_F(IsaErasureCodeTest, isa_xor_codec)
   // a (4,1) RAID-5 like configuration 
 
   ErasureCodeIsaDefault Isa(tcache);
-  map<std::string, std::string> parameters;
-  parameters["k"] = "4";
-  parameters["m"] = "1";
-  Isa.init(parameters);
+  ErasureCodeProfile profile;
+  profile["k"] = "4";
+  profile["m"] = "1";
+  Isa.init(profile, &cerr);
 
   int k = 4;
   int m = 1;
@@ -895,11 +907,11 @@ TEST_F(IsaErasureCodeTest, create_ruleset)
   {
     stringstream ss;
     ErasureCodeIsaDefault isa(tcache);
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["w"] = "8";
-    isa.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["w"] = "8";
+    isa.init(profile, &cerr);
     int ruleset = isa.create_ruleset("myrule", *c, &ss);
     EXPECT_EQ(0, ruleset);
     EXPECT_EQ(-EEXIST, isa.create_ruleset("myrule", *c, &ss));
@@ -919,24 +931,24 @@ TEST_F(IsaErasureCodeTest, create_ruleset)
   {
     stringstream ss;
     ErasureCodeIsaDefault isa(tcache);
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["w"] = "8";
-    parameters["ruleset-root"] = "BAD";
-    isa.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["w"] = "8";
+    profile["ruleset-root"] = "BAD";
+    isa.init(profile, &cerr);
     EXPECT_EQ(-ENOENT, isa.create_ruleset("otherrule", *c, &ss));
     EXPECT_EQ("root item BAD does not exist", ss.str());
   }
   {
     stringstream ss;
     ErasureCodeIsaDefault isa(tcache);
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["w"] = "8";
-    parameters["ruleset-failure-domain"] = "WORSE";
-    isa.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["w"] = "8";
+    profile["ruleset-failure-domain"] = "WORSE";
+    isa.init(profile, &cerr);
     EXPECT_EQ(-EINVAL, isa.create_ruleset("otherrule", *c, &ss));
     EXPECT_EQ("unknown type WORSE", ss.str());
   }
@@ -950,6 +962,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
@@ -957,7 +971,7 @@ int main(int argc, char **argv)
 /*
  * Local Variables:
  * compile-command: "cd ../.. ; make -j4 unittest_erasure_code_isa &&
- *   libtool --mode=execute valgrind --tool=memcheck --leak-check=full \
+ *   libtool --mode=execute valgrind --tool=memcheck \
  *      ./unittest_erasure_code_isa \
  *      --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
  * End:
diff --git a/src/test/erasure-code/TestErasureCodeJerasure.cc b/src/test/erasure-code/TestErasureCodeJerasure.cc
index 4b53311..c028e32 100644
--- a/src/test/erasure-code/TestErasureCodeJerasure.cc
+++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
@@ -23,6 +23,7 @@
 #include "erasure-code/jerasure/ErasureCodeJerasure.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 template <typename T>
@@ -41,6 +42,18 @@ typedef ::testing::Types<
 > JerasureTypes;
 TYPED_TEST_CASE(ErasureCodeTest, JerasureTypes);
 
+TYPED_TEST(ErasureCodeTest, sanity_check_k)
+{
+  TypeParam jerasure;
+  ErasureCodeProfile profile;
+  profile["k"] = "1";
+  profile["m"] = "1";
+  profile["packetsize"] = "8";
+  ostringstream errors;
+  EXPECT_EQ(-EINVAL, jerasure.init(profile, &errors));
+  EXPECT_NE(std::string::npos, errors.str().find("must be >= 2"));
+}
+
 TYPED_TEST(ErasureCodeTest, encode_decode)
 {
   const char *per_chunk_alignments[] = { "false", "true" };
@@ -48,13 +61,13 @@ TYPED_TEST(ErasureCodeTest, encode_decode)
        per_chunk_alignment < 2;
        per_chunk_alignment++) {
     TypeParam jerasure;
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["packetsize"] = "8";
-    parameters["jerasure-per-chunk-alignment"] =
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["packetsize"] = "8";
+    profile["jerasure-per-chunk-alignment"] =
       per_chunk_alignments[per_chunk_alignment];
-    jerasure.init(parameters);
+    jerasure.init(profile, &cerr);
 
 #define LARGE_ENOUGH 2048
     bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
@@ -119,12 +132,12 @@ TYPED_TEST(ErasureCodeTest, encode_decode)
 TYPED_TEST(ErasureCodeTest, minimum_to_decode)
 {
   TypeParam jerasure;
-  map<std::string,std::string> parameters;
-  parameters["k"] = "2";
-  parameters["m"] = "2";
-  parameters["w"] = "7";
-  parameters["packetsize"] = "8";
-  jerasure.init(parameters);
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  profile["w"] = "7";
+  profile["packetsize"] = "8";
+  jerasure.init(profile, &cerr);
 
   //
   // If trying to read nothing, the minimum is empty.
@@ -217,11 +230,11 @@ TYPED_TEST(ErasureCodeTest, minimum_to_decode)
 TEST(ErasureCodeTest, encode)
 {
   ErasureCodeJerasureReedSolomonVandermonde jerasure;
-  map<std::string,std::string> parameters;
-  parameters["k"] = "2";
-  parameters["m"] = "2";
-  parameters["w"] = "8";
-  jerasure.init(parameters);
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  profile["w"] = "8";
+  jerasure.init(profile, &cerr);
 
   unsigned aligned_object_size = jerasure.get_alignment() * 2;
   {
@@ -296,11 +309,11 @@ TEST(ErasureCodeTest, create_ruleset)
   {
     stringstream ss;
     ErasureCodeJerasureReedSolomonVandermonde jerasure;
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["w"] = "8";
-    jerasure.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["w"] = "8";
+    jerasure.init(profile, &cerr);
     int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
     EXPECT_EQ(0, ruleset);
     EXPECT_EQ(-EEXIST, jerasure.create_ruleset("myrule", *c, &ss));
@@ -320,24 +333,24 @@ TEST(ErasureCodeTest, create_ruleset)
   {
     stringstream ss;
     ErasureCodeJerasureReedSolomonVandermonde jerasure;
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["w"] = "8";
-    parameters["ruleset-root"] = "BAD";
-    jerasure.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["w"] = "8";
+    profile["ruleset-root"] = "BAD";
+    jerasure.init(profile, &cerr);
     EXPECT_EQ(-ENOENT, jerasure.create_ruleset("otherrule", *c, &ss));
     EXPECT_EQ("root item BAD does not exist", ss.str());
   }
   {
     stringstream ss;
     ErasureCodeJerasureReedSolomonVandermonde jerasure;
-    map<std::string,std::string> parameters;
-    parameters["k"] = "2";
-    parameters["m"] = "2";
-    parameters["w"] = "8";
-    parameters["ruleset-failure-domain"] = "WORSE";
-    jerasure.init(parameters);
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["w"] = "8";
+    profile["ruleset-failure-domain"] = "WORSE";
+    jerasure.init(profile, &cerr);
     EXPECT_EQ(-EINVAL, jerasure.create_ruleset("otherrule", *c, &ss));
     EXPECT_EQ("unknown type WORSE", ss.str());
   }
@@ -351,6 +364,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
@@ -359,7 +374,7 @@ int main(int argc, char **argv)
  * Local Variables:
  * compile-command: "cd ../.. ;
  *   make -j4 unittest_erasure_code_jerasure &&
- *   valgrind --tool=memcheck --leak-check=full \
+ *   valgrind --tool=memcheck \
  *      ./unittest_erasure_code_jerasure \
  *      --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
  * End:
diff --git a/src/test/erasure-code/TestErasureCodeLrc.cc b/src/test/erasure-code/TestErasureCodeLrc.cc
index c3f0e68..772351c 100644
--- a/src/test/erasure-code/TestErasureCodeLrc.cc
+++ b/src/test/erasure-code/TestErasureCodeLrc.cc
@@ -24,55 +24,56 @@
 #include "erasure-code/lrc/ErasureCodeLrc.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 TEST(ErasureCodeLrc, parse_ruleset)
 {
-  ErasureCodeLrc lrc;
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
   EXPECT_EQ("default", lrc.ruleset_root);
   EXPECT_EQ("host", lrc.ruleset_steps.front().type);
 
-  map<std::string,std::string> parameters;
-  parameters["ruleset-root"] = "other";
-  EXPECT_EQ(0, lrc.parse_ruleset(parameters, &cerr));
+  ErasureCodeProfile profile;
+  profile["ruleset-root"] = "other";
+  EXPECT_EQ(0, lrc.parse_ruleset(profile, &cerr));
   EXPECT_EQ("other", lrc.ruleset_root);
 
-  parameters["ruleset-steps"] = "[]";
-  EXPECT_EQ(0, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "[]";
+  EXPECT_EQ(0, lrc.parse_ruleset(profile, &cerr));
   EXPECT_TRUE(lrc.ruleset_steps.empty());
 
-  parameters["ruleset-steps"] = "0";
-  EXPECT_EQ(ERROR_LRC_ARRAY, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "0";
+  EXPECT_EQ(ERROR_LRC_ARRAY, lrc.parse_ruleset(profile, &cerr));
 
-  parameters["ruleset-steps"] = "{";
-  EXPECT_EQ(ERROR_LRC_PARSE_JSON, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "{";
+  EXPECT_EQ(ERROR_LRC_PARSE_JSON, lrc.parse_ruleset(profile, &cerr));
 
-  parameters["ruleset-steps"] = "[0]";
-  EXPECT_EQ(ERROR_LRC_ARRAY, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "[0]";
+  EXPECT_EQ(ERROR_LRC_ARRAY, lrc.parse_ruleset(profile, &cerr));
 
-  parameters["ruleset-steps"] = "[[0]]";
-  EXPECT_EQ(ERROR_LRC_RULESET_OP, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "[[0]]";
+  EXPECT_EQ(ERROR_LRC_RULESET_OP, lrc.parse_ruleset(profile, &cerr));
 
-  parameters["ruleset-steps"] = "[[\"choose\", 0]]";
-  EXPECT_EQ(ERROR_LRC_RULESET_TYPE, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "[[\"choose\", 0]]";
+  EXPECT_EQ(ERROR_LRC_RULESET_TYPE, lrc.parse_ruleset(profile, &cerr));
 
-  parameters["ruleset-steps"] = "[[\"choose\", \"host\", []]]";
-  EXPECT_EQ(ERROR_LRC_RULESET_N, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "[[\"choose\", \"host\", []]]";
+  EXPECT_EQ(ERROR_LRC_RULESET_N, lrc.parse_ruleset(profile, &cerr));
 
-  parameters["ruleset-steps"] = "[[\"choose\", \"host\", 2]]";
-  EXPECT_EQ(0, lrc.parse_ruleset(parameters, &cerr));
+  profile["ruleset-steps"] = "[[\"choose\", \"host\", 2]]";
+  EXPECT_EQ(0, lrc.parse_ruleset(profile, &cerr));
 
   const ErasureCodeLrc::Step &step = lrc.ruleset_steps.front();
   EXPECT_EQ("choose", step.op);
   EXPECT_EQ("host", step.type);
   EXPECT_EQ(2, step.n);
 
-  parameters["ruleset-steps"] =
+  profile["ruleset-steps"] =
     "["
     " [\"choose\", \"rack\", 2], "
     " [\"chooseleaf\", \"host\", 5], "
     "]";
-  EXPECT_EQ(0, lrc.parse_ruleset(parameters, &cerr));
+  EXPECT_EQ(0, lrc.parse_ruleset(profile, &cerr));
   EXPECT_EQ(2U, lrc.ruleset_steps.size());
   {
     const ErasureCodeLrc::Step &step = lrc.ruleset_steps[0];
@@ -128,19 +129,19 @@ TEST(ErasureCodeTest, create_ruleset)
     }
   }
 
-  ErasureCodeLrc lrc;
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
   EXPECT_EQ(0, lrc.create_ruleset("rule1", *c, &cerr));
 
-  map<std::string,std::string> parameters;
+  ErasureCodeProfile profile;
   unsigned int racks = 2;
   unsigned int hosts = 5;
-  parameters["ruleset-steps"] =
+  profile["ruleset-steps"] =
     "["
     " [\"choose\", \"rack\", " + stringify(racks) + "], "
     " [\"chooseleaf\", \"host\", " + stringify(hosts) + "], "
     "]";
   const char *rule_name = "rule2";
-  EXPECT_EQ(0, lrc.parse_ruleset(parameters, &cerr));
+  EXPECT_EQ(0, lrc.parse_ruleset(profile, &cerr));
   EXPECT_EQ(1, lrc.create_ruleset(rule_name, *c, &cerr));
 
   vector<__u32> weight;
@@ -169,68 +170,68 @@ TEST(ErasureCodeTest, create_ruleset)
 
 TEST(ErasureCodeLrc, parse_kml)
 {
-  ErasureCodeLrc lrc;
-  map<std::string,std::string> parameters;
-  EXPECT_EQ(0, lrc.parse_kml(parameters, &cerr));
-  parameters["k"] = "4";
-  EXPECT_EQ(ERROR_LRC_ALL_OR_NOTHING, lrc.parse_kml(parameters, &cerr));
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+  ErasureCodeProfile profile;
+  EXPECT_EQ(0, lrc.parse_kml(profile, &cerr));
+  profile["k"] = "4";
+  EXPECT_EQ(ERROR_LRC_ALL_OR_NOTHING, lrc.parse_kml(profile, &cerr));
   const char *generated[] = { "mapping",
 			      "layers",
 			      "ruleset-steps" };
-  parameters["m"] = "2";
-  parameters["l"] = "3";
+  profile["m"] = "2";
+  profile["l"] = "3";
 
   for (int i = 0; i < 3; i++) {
-    parameters[generated[i]] = "SET";
-    EXPECT_EQ(ERROR_LRC_GENERATED, lrc.parse_kml(parameters, &cerr));
-    parameters.erase(parameters.find(generated[i]));
+    profile[generated[i]] = "SET";
+    EXPECT_EQ(ERROR_LRC_GENERATED, lrc.parse_kml(profile, &cerr));
+    profile.erase(profile.find(generated[i]));
   }
 
-  parameters["k"] = "4";
-  parameters["m"] = "2";
-  parameters["l"] = "7";
-  EXPECT_EQ(ERROR_LRC_K_M_MODULO, lrc.parse_kml(parameters, &cerr));
+  profile["k"] = "4";
+  profile["m"] = "2";
+  profile["l"] = "7";
+  EXPECT_EQ(ERROR_LRC_K_M_MODULO, lrc.parse_kml(profile, &cerr));
 
-  parameters["k"] = "3";
-  parameters["m"] = "3";
-  parameters["l"] = "3";
-  EXPECT_EQ(ERROR_LRC_K_MODULO, lrc.parse_kml(parameters, &cerr));
+  profile["k"] = "3";
+  profile["m"] = "3";
+  profile["l"] = "3";
+  EXPECT_EQ(ERROR_LRC_K_MODULO, lrc.parse_kml(profile, &cerr));
 
-  parameters["k"] = "4";
-  parameters["m"] = "2";
-  parameters["l"] = "3";
-  EXPECT_EQ(0, lrc.parse_kml(parameters, &cerr));
+  profile["k"] = "4";
+  profile["m"] = "2";
+  profile["l"] = "3";
+  EXPECT_EQ(0, lrc.parse_kml(profile, &cerr));
   EXPECT_EQ("[ "
 	    " [ \"DDc_DDc_\", \"\" ],"
 	    " [ \"DDDc____\", \"\" ],"
 	    " [ \"____DDDc\", \"\" ],"
-	    "]", parameters["layers"]);
-  EXPECT_EQ("DD__DD__", parameters["mapping"]);
+	    "]", profile["layers"]);
+  EXPECT_EQ("DD__DD__", profile["mapping"]);
   EXPECT_EQ("chooseleaf", lrc.ruleset_steps[0].op);
   EXPECT_EQ("host", lrc.ruleset_steps[0].type);
   EXPECT_EQ(0, lrc.ruleset_steps[0].n);
   EXPECT_EQ(1U, lrc.ruleset_steps.size());
-  parameters.erase(parameters.find("mapping"));
-  parameters.erase(parameters.find("layers"));
-
-  parameters["k"] = "4";
-  parameters["m"] = "2";
-  parameters["l"] = "3";
-  parameters["ruleset-failure-domain"] = "osd";
-  EXPECT_EQ(0, lrc.parse_kml(parameters, &cerr));
+  profile.erase(profile.find("mapping"));
+  profile.erase(profile.find("layers"));
+
+  profile["k"] = "4";
+  profile["m"] = "2";
+  profile["l"] = "3";
+  profile["ruleset-failure-domain"] = "osd";
+  EXPECT_EQ(0, lrc.parse_kml(profile, &cerr));
   EXPECT_EQ("chooseleaf", lrc.ruleset_steps[0].op);
   EXPECT_EQ("osd", lrc.ruleset_steps[0].type);
   EXPECT_EQ(0, lrc.ruleset_steps[0].n);
   EXPECT_EQ(1U, lrc.ruleset_steps.size());
-  parameters.erase(parameters.find("mapping"));
-  parameters.erase(parameters.find("layers"));
-
-  parameters["k"] = "4";
-  parameters["m"] = "2";
-  parameters["l"] = "3";
-  parameters["ruleset-failure-domain"] = "osd";
-  parameters["ruleset-locality"] = "rack";
-  EXPECT_EQ(0, lrc.parse_kml(parameters, &cerr));
+  profile.erase(profile.find("mapping"));
+  profile.erase(profile.find("layers"));
+
+  profile["k"] = "4";
+  profile["m"] = "2";
+  profile["l"] = "3";
+  profile["ruleset-failure-domain"] = "osd";
+  profile["ruleset-locality"] = "rack";
+  EXPECT_EQ(0, lrc.parse_kml(profile, &cerr));
   EXPECT_EQ("choose", lrc.ruleset_steps[0].op);
   EXPECT_EQ("rack", lrc.ruleset_steps[0].type);
   EXPECT_EQ(2, lrc.ruleset_steps[0].n);
@@ -238,107 +239,107 @@ TEST(ErasureCodeLrc, parse_kml)
   EXPECT_EQ("osd", lrc.ruleset_steps[1].type);
   EXPECT_EQ(4, lrc.ruleset_steps[1].n);
   EXPECT_EQ(2U, lrc.ruleset_steps.size());
-  parameters.erase(parameters.find("mapping"));
-  parameters.erase(parameters.find("layers"));
+  profile.erase(profile.find("mapping"));
+  profile.erase(profile.find("layers"));
 }
 
 TEST(ErasureCodeLrc, layers_description)
 {
-  ErasureCodeLrc lrc;
-  map<std::string,std::string> parameters;
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+  ErasureCodeProfile profile;
 
   json_spirit::mArray description;
   EXPECT_EQ(ERROR_LRC_DESCRIPTION,
-	    lrc.layers_description(parameters, &description, &cerr));
+	    lrc.layers_description(profile, &description, &cerr));
 
   {
     const char *description_string = "\"not an array\"";
-    parameters["layers"] = description_string;
+    profile["layers"] = description_string;
     EXPECT_EQ(ERROR_LRC_ARRAY,
-	      lrc.layers_description(parameters, &description, &cerr));
+	      lrc.layers_description(profile, &description, &cerr));
   }
   {
     const char *description_string = "invalid json";
-    parameters["layers"] = description_string;
+    profile["layers"] = description_string;
     EXPECT_EQ(ERROR_LRC_PARSE_JSON,
-	      lrc.layers_description(parameters, &description, &cerr));
+	      lrc.layers_description(profile, &description, &cerr));
   }
   {
     const char *description_string = "[]";
-    parameters["layers"] = description_string;
-    EXPECT_EQ(0, lrc.layers_description(parameters, &description, &cerr));
+    profile["layers"] = description_string;
+    EXPECT_EQ(0, lrc.layers_description(profile, &description, &cerr));
   }
 }
 
 TEST(ErasureCodeLrc, layers_parse)
 {
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
 
     const char *description_string ="[ 0 ]";
-    parameters["layers"] = description_string;
+    profile["layers"] = description_string;
     json_spirit::mArray description;
-    EXPECT_EQ(0, lrc.layers_description(parameters, &description, &cerr));
+    EXPECT_EQ(0, lrc.layers_description(profile, &description, &cerr));
     EXPECT_EQ(ERROR_LRC_ARRAY,
 	      lrc.layers_parse(description_string, description, &cerr));
   }
 
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
 
     const char *description_string ="[ [ 0 ] ]";
-    parameters["layers"] = description_string;
+    profile["layers"] = description_string;
     json_spirit::mArray description;
-    EXPECT_EQ(0, lrc.layers_description(parameters, &description, &cerr));
+    EXPECT_EQ(0, lrc.layers_description(profile, &description, &cerr));
     EXPECT_EQ(ERROR_LRC_STR,
 	      lrc.layers_parse(description_string, description, &cerr));
   }
 
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
 
     const char *description_string ="[ [ \"\", 0 ] ]";
-    parameters["layers"] = description_string;
+    profile["layers"] = description_string;
     json_spirit::mArray description;
-    EXPECT_EQ(0, lrc.layers_description(parameters, &description, &cerr));
+    EXPECT_EQ(0, lrc.layers_description(profile, &description, &cerr));
     EXPECT_EQ(ERROR_LRC_CONFIG_OPTIONS,
 	      lrc.layers_parse(description_string, description, &cerr));
   }
 
   //
   // The second element can be an object describing the plugin
-  // parameters.
+  // profile.
   //
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
 
     const char *description_string ="[ [ \"\", { \"a\": \"b\" }, \"ignored\" ] ]";
-    parameters["layers"] = description_string;
+    profile["layers"] = description_string;
     json_spirit::mArray description;
-    EXPECT_EQ(0, lrc.layers_description(parameters, &description, &cerr));
+    EXPECT_EQ(0, lrc.layers_description(profile, &description, &cerr));
     EXPECT_EQ(0, lrc.layers_parse(description_string, description, &cerr));
-    EXPECT_EQ("b", lrc.layers.front().parameters["a"]);
+    EXPECT_EQ("b", lrc.layers.front().profile["a"]);
   }
 
   //
   // The second element can be a str_map parseable string describing the plugin
-  // parameters.
+  // profile.
   //
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
 
     const char *description_string ="[ [ \"\", \"a=b c=d\" ] ]";
-    parameters["layers"] = description_string;
+    profile["layers"] = description_string;
     json_spirit::mArray description;
-    EXPECT_EQ(0, lrc.layers_description(parameters, &description, &cerr));
+    EXPECT_EQ(0, lrc.layers_description(profile, &description, &cerr));
     EXPECT_EQ(0, lrc.layers_parse(description_string, description, &cerr));
-    EXPECT_EQ("b", lrc.layers.front().parameters["a"]);
-    EXPECT_EQ("d", lrc.layers.front().parameters["c"]);
+    EXPECT_EQ("b", lrc.layers.front().profile["a"]);
+    EXPECT_EQ("d", lrc.layers.front().profile["c"]);
   }
 
 }
@@ -346,84 +347,81 @@ TEST(ErasureCodeLrc, layers_parse)
 TEST(ErasureCodeLrc, layers_sanity_checks)
 {
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
-    parameters["mapping"] =
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
+    profile["mapping"] =
 	    "__DDD__DD";
-    parameters["directory"] = ".libs";
     const char *description_string =
       "[ "
       "  [ \"_cDDD_cDD\", \"\" ],"
       "  [ \"c_DDD____\", \"\" ],"
       "  [ \"_____cDDD\", \"\" ],"
       "]";
-    parameters["layers"] = description_string;
-    EXPECT_EQ(0, lrc.init(parameters, &cerr));
+    profile["layers"] = description_string;
+    EXPECT_EQ(0, lrc.init(profile, &cerr));
   }
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
     const char *description_string =
       "[ "
       "]";
-    parameters["layers"] = description_string;
-    EXPECT_EQ(ERROR_LRC_MAPPING, lrc.init(parameters, &cerr));
+    profile["layers"] = description_string;
+    EXPECT_EQ(ERROR_LRC_MAPPING, lrc.init(profile, &cerr));
   }
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
-    parameters["mapping"] = "";
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
+    profile["mapping"] = "";
     const char *description_string =
       "[ "
       "]";
-    parameters["layers"] = description_string;
-    EXPECT_EQ(ERROR_LRC_LAYERS_COUNT, lrc.init(parameters, &cerr));
+    profile["layers"] = description_string;
+    EXPECT_EQ(ERROR_LRC_LAYERS_COUNT, lrc.init(profile, &cerr));
   }
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
-    parameters["directory"] = ".libs";
-    parameters["mapping"] =
-	    "AA";
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
+    profile["mapping"] =
+	    "DD";
     const char *description_string =
       "[ "
-      "  [ \"AA??\", \"\" ], "
-      "  [ \"AA\", \"\" ], "
-      "  [ \"AA\", \"\" ], "
+      "  [ \"DD??\", \"\" ], "
+      "  [ \"DD\", \"\" ], "
+      "  [ \"DD\", \"\" ], "
       "]";
-    parameters["layers"] = description_string;
-    EXPECT_EQ(ERROR_LRC_MAPPING_SIZE, lrc.init(parameters, &cerr));
+    profile["layers"] = description_string;
+    EXPECT_EQ(ERROR_LRC_MAPPING_SIZE, lrc.init(profile, &cerr));
   }
 }
 
 TEST(ErasureCodeLrc, layers_init)
 {
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
 
     const char *description_string =
       "[ "
       "  [ \"_cDDD_cDD_\", \"directory=.libs\" ],"
       "]";
-    parameters["layers"] = description_string;
-    parameters["directory"] = ".libs";
+    profile["layers"] = description_string;
     json_spirit::mArray description;
-    EXPECT_EQ(0, lrc.layers_description(parameters, &description, &cerr));
+    EXPECT_EQ(0, lrc.layers_description(profile, &description, &cerr));
     EXPECT_EQ(0, lrc.layers_parse(description_string, description, &cerr));
-    EXPECT_EQ(0, lrc.layers_init());
-    EXPECT_EQ("5", lrc.layers.front().parameters["k"]);
-    EXPECT_EQ("2", lrc.layers.front().parameters["m"]);
-    EXPECT_EQ("jerasure", lrc.layers.front().parameters["plugin"]);
-    EXPECT_EQ("reed_sol_van", lrc.layers.front().parameters["technique"]);
+    EXPECT_EQ(0, lrc.layers_init(&cerr));
+    EXPECT_EQ("5", lrc.layers.front().profile["k"]);
+    EXPECT_EQ("2", lrc.layers.front().profile["m"]);
+    EXPECT_EQ("jerasure", lrc.layers.front().profile["plugin"]);
+    EXPECT_EQ("reed_sol_van", lrc.layers.front().profile["technique"]);
   }
 }
 
 TEST(ErasureCodeLrc, init)
 {
-  ErasureCodeLrc lrc;
-  map<std::string,std::string> parameters;
-  parameters["mapping"] =
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+  ErasureCodeProfile profile;
+  profile["mapping"] =
     "__DDD__DD";
   const char *description_string =
     "[ "
@@ -431,20 +429,18 @@ TEST(ErasureCodeLrc, init)
     "  [ \"c_DDD____\", \"\" ],"
     "  [ \"_____cDDD\", \"\" ],"
     "]";
-  parameters["layers"] = description_string;
-  parameters["directory"] = ".libs";
-  EXPECT_EQ(0, lrc.init(parameters, &cerr));
+  profile["layers"] = description_string;
+  EXPECT_EQ(0, lrc.init(profile, &cerr));
 }
 
 TEST(ErasureCodeLrc, init_kml)
 {
-  ErasureCodeLrc lrc;
-  map<std::string,std::string> parameters;
-  parameters["k"] = "4";
-  parameters["m"] = "2";
-  parameters["l"] = "3";
-  parameters["directory"] = ".libs";
-  EXPECT_EQ(0, lrc.init(parameters, &cerr));
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+  ErasureCodeProfile profile;
+  profile["k"] = "4";
+  profile["m"] = "2";
+  profile["l"] = "3";
+  EXPECT_EQ(0, lrc.init(profile, &cerr));
   EXPECT_EQ((unsigned int)(4 + 2 + (4 + 2) / 3), lrc.get_chunk_count());
 }
 
@@ -452,9 +448,9 @@ TEST(ErasureCodeLrc, minimum_to_decode)
 {
   // trivial : no erasures, the minimum is want_to_read
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
-    parameters["mapping"] =
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
+    profile["mapping"] =
       "__DDD__DD";
     const char *description_string =
       "[ "
@@ -462,9 +458,8 @@ TEST(ErasureCodeLrc, minimum_to_decode)
       "  [ \"c_DDD____\", \"\" ],"
       "  [ \"_____cDDD\", \"\" ],"
       "]";
-    parameters["layers"] = description_string;
-    parameters["directory"] = ".libs";
-    EXPECT_EQ(0, lrc.init(parameters, &cerr));
+    profile["layers"] = description_string;
+    EXPECT_EQ(0, lrc.init(profile, &cerr));
     set<int> want_to_read;
     want_to_read.insert(1);
     set<int> available_chunks;
@@ -476,9 +471,9 @@ TEST(ErasureCodeLrc, minimum_to_decode)
   }
   // locally repairable erasure
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
-    parameters["mapping"] =
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
+    profile["mapping"] =
 	    "__DDD__DD_";
     const char *description_string =
       "[ "
@@ -487,10 +482,9 @@ TEST(ErasureCodeLrc, minimum_to_decode)
       "  [ \"_____cDDD_\", \"\" ],"
       "  [ \"_____DDDDc\", \"\" ],"
       "]";
-    parameters["layers"] = description_string;
-    parameters["directory"] = ".libs";
-    EXPECT_EQ(0, lrc.init(parameters, &cerr));
-    EXPECT_EQ(parameters["mapping"].length(),
+    profile["layers"] = description_string;
+    EXPECT_EQ(0, lrc.init(profile, &cerr));
+    EXPECT_EQ(profile["mapping"].length(),
 	      lrc.get_chunk_count());
     {
       // want to read the last chunk
@@ -527,9 +521,9 @@ TEST(ErasureCodeLrc, minimum_to_decode)
   }
   // implicit parity required
   {
-    ErasureCodeLrc lrc;
-    map<std::string,std::string> parameters;
-    parameters["mapping"] =
+    ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+    ErasureCodeProfile profile;
+    profile["mapping"] =
 	    "__DDD__DD";
     const char *description_string =
       "[ "
@@ -537,10 +531,9 @@ TEST(ErasureCodeLrc, minimum_to_decode)
       "  [ \"c_DDD____\", \"\" ],"
       "  [ \"_____cDDD\", \"\" ],"
       "]";
-    parameters["layers"] = description_string;
-    parameters["directory"] = ".libs";
-    EXPECT_EQ(0, lrc.init(parameters, &cerr));
-    EXPECT_EQ(parameters["mapping"].length(),
+    profile["layers"] = description_string;
+    EXPECT_EQ(0, lrc.init(profile, &cerr));
+    EXPECT_EQ(profile["mapping"].length(),
 	      lrc.get_chunk_count());
     set<int> want_to_read;
     want_to_read.insert(8);
@@ -606,9 +599,9 @@ TEST(ErasureCodeLrc, minimum_to_decode)
 
 TEST(ErasureCodeLrc, encode_decode)
 {
-  ErasureCodeLrc lrc;
-  map<std::string,std::string> parameters;
-  parameters["mapping"] =
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+  ErasureCodeProfile profile;
+  profile["mapping"] =
     "__DD__DD";
   const char *description_string =
     "[ "
@@ -616,9 +609,8 @@ TEST(ErasureCodeLrc, encode_decode)
     "  [ \"c_DD____\", \"\" ]," // first local layer
     "  [ \"____cDDD\", \"\" ]," // second local layer
     "]";
-  parameters["layers"] = description_string;
-  parameters["directory"] = ".libs";
-  EXPECT_EQ(0, lrc.init(parameters, &cerr));
+  profile["layers"] = description_string;
+  EXPECT_EQ(0, lrc.init(profile, &cerr));
   EXPECT_EQ(4U, lrc.get_data_chunk_count());
   unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
   unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count();
@@ -737,9 +729,9 @@ TEST(ErasureCodeLrc, encode_decode)
 
 TEST(ErasureCodeLrc, encode_decode_2)
 {
-  ErasureCodeLrc lrc;
-  map<std::string,std::string> parameters;
-  parameters["mapping"] =
+  ErasureCodeLrc lrc(g_conf->erasure_code_dir);
+  ErasureCodeProfile profile;
+  profile["mapping"] =
     "DD__DD__";
   const char *description_string =
     "[ "
@@ -747,9 +739,8 @@ TEST(ErasureCodeLrc, encode_decode_2)
     " [ \"DDDc____\", \"\" ],"
     " [ \"____DDDc\", \"\" ],"
     "]";
-  parameters["layers"] = description_string;
-  parameters["directory"] = ".libs";
-  EXPECT_EQ(0, lrc.init(parameters, &cerr));
+  profile["layers"] = description_string;
+  EXPECT_EQ(0, lrc.init(profile, &cerr));
   EXPECT_EQ(4U, lrc.get_data_chunk_count());
   unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
   unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count();
@@ -922,6 +913,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
@@ -929,7 +922,7 @@ int main(int argc, char **argv)
 /*
  * Local Variables:
  * compile-command: "cd ../.. ;
- *   make -j4 && valgrind --tool=memcheck --leak-check=full \
+ *   make -j4 unittest_erasure_code_lrc && valgrind --tool=memcheck \
  *      ./unittest_erasure_code_lrc \
  *      --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
  * End:
diff --git a/src/test/erasure-code/TestErasureCodePlugin.cc b/src/test/erasure-code/TestErasureCodePlugin.cc
index ebd0d64..6cf241e 100644
--- a/src/test/erasure-code/TestErasureCodePlugin.cc
+++ b/src/test/erasure-code/TestErasureCodePlugin.cc
@@ -22,6 +22,7 @@
 #include "erasure-code/ErasureCodePlugin.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 class ErasureCodePluginRegistryTest : public ::testing::Test {
@@ -30,12 +31,12 @@ protected:
   class Thread_factory : public Thread {
   public:
     virtual void *entry() {
-      map<std::string,std::string> parameters;
-      parameters["directory"] = ".libs";
+      ErasureCodeProfile profile;
       ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
       ErasureCodeInterfaceRef erasure_code;
-      stringstream ss;
-      instance.factory("hangs", parameters, &erasure_code, ss);
+      instance.factory("hangs",
+		       g_conf->erasure_code_dir,
+		       profile, &erasure_code, &cerr);
       return NULL;
     }
   };
@@ -72,36 +73,46 @@ TEST_F(ErasureCodePluginRegistryTest, factory_mutex) {
 
 TEST_F(ErasureCodePluginRegistryTest, all)
 {
-  map<std::string,std::string> parameters;
+  ErasureCodeProfile profile;
   string directory(".libs");
-  parameters["directory"] = directory;
   ErasureCodeInterfaceRef erasure_code;
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-  stringstream ss;
   EXPECT_FALSE(erasure_code);
-  EXPECT_EQ(-EIO, instance.factory("invalid", parameters, &erasure_code, ss));
+  EXPECT_EQ(-EIO, instance.factory("invalid",
+				   g_conf->erasure_code_dir,
+				   profile, &erasure_code, &cerr));
   EXPECT_FALSE(erasure_code);
-  EXPECT_EQ(-EXDEV, instance.factory("missing_version", parameters,
-				     &erasure_code, ss));
+  EXPECT_EQ(-EXDEV, instance.factory("missing_version",
+				     g_conf->erasure_code_dir,
+				     profile,
+				     &erasure_code, &cerr));
   EXPECT_FALSE(erasure_code);
-  EXPECT_EQ(-ENOENT, instance.factory("missing_entry_point", parameters,
-				      &erasure_code, ss));
+  EXPECT_EQ(-ENOENT, instance.factory("missing_entry_point",
+				      g_conf->erasure_code_dir,
+				      profile,
+				      &erasure_code, &cerr));
   EXPECT_FALSE(erasure_code);
-  EXPECT_EQ(-ESRCH, instance.factory("fail_to_initialize", parameters,
-				     &erasure_code, ss));
+  EXPECT_EQ(-ESRCH, instance.factory("fail_to_initialize",
+				     g_conf->erasure_code_dir,
+				     profile,
+				     &erasure_code, &cerr));
   EXPECT_FALSE(erasure_code);
-  EXPECT_EQ(-EBADF, instance.factory("fail_to_register", parameters,
-				     &erasure_code, ss));
+  EXPECT_EQ(-EBADF, instance.factory("fail_to_register",
+				     g_conf->erasure_code_dir,
+				     profile,
+				     &erasure_code, &cerr));
   EXPECT_FALSE(erasure_code);
-  EXPECT_EQ(0, instance.factory("example", parameters, &erasure_code, ss));
-  EXPECT_TRUE(erasure_code);
+  EXPECT_EQ(0, instance.factory("example",
+				g_conf->erasure_code_dir,
+				profile, &erasure_code, &cerr));
+  EXPECT_TRUE(erasure_code.get());
   ErasureCodePlugin *plugin = 0;
   {
     Mutex::Locker l(instance.lock);
-    EXPECT_EQ(-EEXIST, instance.load("example", directory, &plugin, ss));
+    EXPECT_EQ(-EEXIST, instance.load("example", directory, &plugin, &cerr));
     EXPECT_EQ(-ENOENT, instance.remove("does not exist"));
     EXPECT_EQ(0, instance.remove("example"));
-    EXPECT_EQ(0, instance.load("example", directory, &plugin, ss));
+    EXPECT_EQ(0, instance.load("example", directory, &plugin, &cerr));
   }
 }
 
@@ -112,6 +123,8 @@ int main(int argc, char **argv) {
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
@@ -120,7 +133,7 @@ int main(int argc, char **argv) {
  * Local Variables:
  * compile-command: "cd ../.. ; make -j4 && 
  *   make unittest_erasure_code_plugin && 
- *   valgrind  --leak-check=full --tool=memcheck \
+ *   valgrind --tool=memcheck \
  *      ./unittest_erasure_code_plugin \
  *      --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
  * End:
diff --git a/src/test/erasure-code/TestErasureCodePluginIsa.cc b/src/test/erasure-code/TestErasureCodePluginIsa.cc
index 605bda0..f1b0884 100644
--- a/src/test/erasure-code/TestErasureCodePluginIsa.cc
+++ b/src/test/erasure-code/TestErasureCodePluginIsa.cc
@@ -19,18 +19,20 @@
 #include "erasure-code/ErasureCodePlugin.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 TEST(ErasureCodePlugin, factory)
 {
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-  map<std::string,std::string> parameters;
-  parameters["directory"] = ".libs";
+  ErasureCodeProfile profile;
   {
     ErasureCodeInterfaceRef erasure_code;
     EXPECT_FALSE(erasure_code);
-    EXPECT_EQ(-EIO, instance.factory("no-isa", parameters,
-                                        &erasure_code, cerr));
+    EXPECT_EQ(-EIO, instance.factory("no-isa",
+				     g_conf->erasure_code_dir,
+				     profile,
+				     &erasure_code, &cerr));
     EXPECT_FALSE(erasure_code);
   }
   const char *techniques[] = {
@@ -39,11 +41,13 @@ TEST(ErasureCodePlugin, factory)
   };
   for(const char **technique = techniques; *technique; technique++) {
     ErasureCodeInterfaceRef erasure_code;
-    parameters["technique"] = *technique;
+    profile["technique"] = *technique;
     EXPECT_FALSE(erasure_code);
-    EXPECT_EQ(0, instance.factory("isa", parameters,
-                                  &erasure_code, cerr));
-    EXPECT_TRUE(erasure_code);
+    EXPECT_EQ(0, instance.factory("isa",
+				  g_conf->erasure_code_dir,
+				  profile,
+                                  &erasure_code, &cerr));
+    EXPECT_TRUE(erasure_code.get());
   }
 }
 
@@ -55,6 +59,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/src/test/erasure-code/TestErasureCodePluginJerasure.cc b/src/test/erasure-code/TestErasureCodePluginJerasure.cc
index d7dbe3c..1616ce6 100644
--- a/src/test/erasure-code/TestErasureCodePluginJerasure.cc
+++ b/src/test/erasure-code/TestErasureCodePluginJerasure.cc
@@ -23,18 +23,20 @@
 #include "erasure-code/ErasureCodePlugin.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 TEST(ErasureCodePlugin, factory)
 {
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-  map<std::string,std::string> parameters;
-  parameters["directory"] = ".libs";
+  ErasureCodeProfile profile;
   {
     ErasureCodeInterfaceRef erasure_code;
     EXPECT_FALSE(erasure_code);
-    EXPECT_EQ(-ENOENT, instance.factory("jerasure", parameters,
-                                        &erasure_code, cerr));
+    EXPECT_EQ(-ENOENT, instance.factory("jerasure",
+					g_conf->erasure_code_dir,
+					profile,
+                                        &erasure_code, &cerr));
     EXPECT_FALSE(erasure_code);
   }
   const char *techniques[] = {
@@ -49,11 +51,14 @@ TEST(ErasureCodePlugin, factory)
   };
   for(const char **technique = techniques; *technique; technique++) {
     ErasureCodeInterfaceRef erasure_code;
-    parameters["technique"] = *technique;
+    ErasureCodeProfile profile;
+    profile["technique"] = *technique;
     EXPECT_FALSE(erasure_code);
-    EXPECT_EQ(0, instance.factory("jerasure", parameters,
-                                  &erasure_code, cerr));
-    EXPECT_TRUE(erasure_code);
+    EXPECT_EQ(0, instance.factory("jerasure",
+				  g_conf->erasure_code_dir,
+				  profile,
+                                  &erasure_code, &cerr));
+    EXPECT_TRUE(erasure_code.get());
   }
 }
 
@@ -70,12 +75,11 @@ TEST(ErasureCodePlugin, select)
   int arch_neon		= ceph_arch_neon;
 
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-  map<std::string,std::string> parameters;
+  ErasureCodeProfile profile;
   // load test plugins instead of actual plugins to assert the desired side effect
   // happens
-  parameters["jerasure-name"] = "test_jerasure";
-  parameters["directory"] = ".libs";
-  parameters["technique"] = "reed_sol_van";
+  profile["jerasure-name"] = "test_jerasure";
+  profile["technique"] = "reed_sol_van";
 
   // all features are available, load the SSE4 plugin
   {
@@ -89,8 +93,10 @@ TEST(ErasureCodePlugin, select)
 
     ErasureCodeInterfaceRef erasure_code;
     int sse4_side_effect = -444;
-    EXPECT_EQ(sse4_side_effect, instance.factory("jerasure", parameters,
-                                                 &erasure_code, cerr));
+    EXPECT_EQ(sse4_side_effect, instance.factory("jerasure",
+						 g_conf->erasure_code_dir,
+						 profile,
+                                                 &erasure_code, &cerr));
   }
   // pclmul is missing, load the SSE3 plugin
   {
@@ -104,8 +110,10 @@ TEST(ErasureCodePlugin, select)
 
     ErasureCodeInterfaceRef erasure_code;
     int sse3_side_effect = -333;
-    EXPECT_EQ(sse3_side_effect, instance.factory("jerasure", parameters,
-                                                 &erasure_code, cerr));
+    EXPECT_EQ(sse3_side_effect, instance.factory("jerasure",
+						 g_conf->erasure_code_dir,
+						 profile,
+                                                 &erasure_code, &cerr));
   }
   // pclmul and sse3 are missing, load the generic plugin
   {
@@ -119,8 +127,10 @@ TEST(ErasureCodePlugin, select)
 
     ErasureCodeInterfaceRef erasure_code;
     int generic_side_effect = -111;
-    EXPECT_EQ(generic_side_effect, instance.factory("jerasure", parameters,
-						    &erasure_code, cerr));
+    EXPECT_EQ(generic_side_effect, instance.factory("jerasure",
+						    g_conf->erasure_code_dir,
+						    profile,
+						    &erasure_code, &cerr));
   }
   // neon is set, load the neon plugin
   {
@@ -134,8 +144,10 @@ TEST(ErasureCodePlugin, select)
 
     ErasureCodeInterfaceRef erasure_code;
     int generic_side_effect = -555;
-    EXPECT_EQ(generic_side_effect, instance.factory("jerasure", parameters,
-						    &erasure_code, cerr));
+    EXPECT_EQ(generic_side_effect, instance.factory("jerasure",
+						    g_conf->erasure_code_dir,
+						    profile,
+						    &erasure_code, &cerr));
   }
 
 
@@ -184,11 +196,10 @@ TEST(ErasureCodePlugin, sse)
   in.push_front(in_ptr);
 
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-  map<std::string,std::string> parameters;
-  parameters["directory"] = ".libs";
-  parameters["technique"] = "reed_sol_van";
-  parameters["k"] = "2";
-  parameters["m"] = "1";
+  ErasureCodeProfile profile;
+  profile["technique"] = "reed_sol_van";
+  profile["k"] = "2";
+  profile["m"] = "1";
   for (vector<string>::iterator sse_variant = sse_variants.begin();
        sse_variant != sse_variants.end();
        ++sse_variant) {
@@ -197,9 +208,11 @@ TEST(ErasureCodePlugin, sse)
     //
     ErasureCodeInterfaceRef erasure_code;
     EXPECT_FALSE(erasure_code);
-    EXPECT_EQ(0, instance.factory("jerasure_" + *sse_variant, parameters,
-                                  &erasure_code, cerr));
-    EXPECT_TRUE(erasure_code);
+    EXPECT_EQ(0, instance.factory("jerasure_" + *sse_variant,
+				  g_conf->erasure_code_dir,
+				  profile,
+                                  &erasure_code, &cerr));
+    EXPECT_TRUE(erasure_code.get());
 
     //
     // encode
@@ -243,6 +256,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/src/test/erasure-code/TestErasureCodePluginLrc.cc b/src/test/erasure-code/TestErasureCodePluginLrc.cc
index 9376d9b..03abafc 100644
--- a/src/test/erasure-code/TestErasureCodePluginLrc.cc
+++ b/src/test/erasure-code/TestErasureCodePluginLrc.cc
@@ -22,19 +22,21 @@
 #include "erasure-code/ErasureCodePlugin.h"
 #include "common/ceph_argparse.h"
 #include "global/global_context.h"
+#include "common/config.h"
 #include "gtest/gtest.h"
 
 TEST(ErasureCodePlugin, factory)
 {
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
-  map<std::string,std::string> parameters;
-  parameters["directory"] = ".libs";
-  parameters["mapping"] = "DD_";
-  parameters["layers"] = "[ [ \"DDc\", \"\" ] ]";
+  ErasureCodeProfile profile;
+  profile["mapping"] = "DD_";
+  profile["layers"] = "[ [ \"DDc\", \"\" ] ]";
   ErasureCodeInterfaceRef erasure_code;
   EXPECT_FALSE(erasure_code);
-  EXPECT_EQ(0, instance.factory("lrc", parameters, &erasure_code, cerr));
-  EXPECT_TRUE(erasure_code);
+  EXPECT_EQ(0, instance.factory("lrc",
+				g_conf->erasure_code_dir,
+				profile, &erasure_code, &cerr));
+  EXPECT_TRUE(erasure_code.get());
 }
 
 int main(int argc, char **argv)
@@ -45,6 +47,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/src/test/erasure-code/TestErasureCodePluginShec.cc b/src/test/erasure-code/TestErasureCodePluginShec.cc
new file mode 100644
index 0000000..2708150
--- /dev/null
+++ b/src/test/erasure-code/TestErasureCodePluginShec.cc
@@ -0,0 +1,268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 FUJITSU LIMITED
+ *
+ * Author: Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
+ * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
+ * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include "arch/probe.h"
+#include "arch/intel.h"
+#include "arch/arm.h"
+#include "global/global_init.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "common/ceph_argparse.h"
+#include "global/global_context.h"
+#include "common/config.h"
+#include "gtest/gtest.h"
+
+TEST(ErasureCodePlugin, factory)
+{
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  map<std::string,std::string> profile;
+  {
+    ErasureCodeInterfaceRef erasure_code;
+    EXPECT_FALSE(erasure_code);
+    EXPECT_EQ(0, instance.factory("shec",
+				  g_conf->erasure_code_dir,
+				  profile,
+				  &erasure_code, &cerr));
+    EXPECT_TRUE(erasure_code.get());
+  }
+  const char *techniques[] = {
+    "single",
+    "multiple",
+    0
+  };
+  for(const char **technique = techniques; *technique; technique++) {
+    ErasureCodeInterfaceRef erasure_code;
+    profile["technique"] = *technique;
+    EXPECT_FALSE(erasure_code);
+    EXPECT_EQ(0, instance.factory("shec",
+				  g_conf->erasure_code_dir,
+				  profile,
+                                  &erasure_code, &cerr));
+    EXPECT_TRUE(erasure_code.get());
+  }
+}
+
+TEST(ErasureCodePlugin, select)
+{
+  ceph_arch_probe();
+  // save probe results
+  int arch_intel_pclmul = ceph_arch_intel_pclmul;
+  int arch_intel_sse42  = ceph_arch_intel_sse42;
+  int arch_intel_sse41  = ceph_arch_intel_sse41;
+  int arch_intel_ssse3  = ceph_arch_intel_ssse3;
+  int arch_intel_sse3   = ceph_arch_intel_sse3;
+  int arch_intel_sse2   = ceph_arch_intel_sse2;
+  int arch_neon		= ceph_arch_neon;
+
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  map<std::string,std::string> profile;
+  // load test plugins instead of actual plugins to assert the desired side effect
+  // happens
+  profile["shec-name"] = "test_shec";
+  profile["technique"] = "multiple";
+
+  // all features are available, load the SSE4 plugin
+  {
+    ceph_arch_intel_pclmul = 1;
+    ceph_arch_intel_sse42  = 1;
+    ceph_arch_intel_sse41  = 1;
+    ceph_arch_intel_ssse3  = 1;
+    ceph_arch_intel_sse3   = 1;
+    ceph_arch_intel_sse2   = 1;
+    ceph_arch_neon	   = 0;
+
+    ErasureCodeInterfaceRef erasure_code;
+    int sse4_side_effect = -444;
+    EXPECT_EQ(sse4_side_effect, instance.factory("shec",
+						 g_conf->erasure_code_dir,
+						 profile,
+                                                 &erasure_code, &cerr));
+  }
+  // pclmul is missing, load the SSE3 plugin
+  {
+    ceph_arch_intel_pclmul = 0;
+    ceph_arch_intel_sse42  = 1;
+    ceph_arch_intel_sse41  = 1;
+    ceph_arch_intel_ssse3  = 1;
+    ceph_arch_intel_sse3   = 1;
+    ceph_arch_intel_sse2   = 1;
+    ceph_arch_neon	   = 0;
+
+    ErasureCodeInterfaceRef erasure_code;
+    int sse3_side_effect = -333;
+    EXPECT_EQ(sse3_side_effect, instance.factory("shec",
+						 g_conf->erasure_code_dir,
+						 profile,
+                                                 &erasure_code, &cerr));
+  }
+  // pclmul and sse3 are missing, load the generic plugin
+  {
+    ceph_arch_intel_pclmul = 0;
+    ceph_arch_intel_sse42  = 1;
+    ceph_arch_intel_sse41  = 1;
+    ceph_arch_intel_ssse3  = 1;
+    ceph_arch_intel_sse3   = 0;
+    ceph_arch_intel_sse2   = 1;
+    ceph_arch_neon	   = 0;
+
+    ErasureCodeInterfaceRef erasure_code;
+    int generic_side_effect = -111;
+    EXPECT_EQ(generic_side_effect, instance.factory("shec",
+						    g_conf->erasure_code_dir,
+						    profile,
+						    &erasure_code, &cerr));
+  }
+  // neon is set, load the neon plugin
+  {
+    ceph_arch_intel_pclmul = 0;
+    ceph_arch_intel_sse42  = 0;
+    ceph_arch_intel_sse41  = 0;
+    ceph_arch_intel_ssse3  = 0;
+    ceph_arch_intel_sse3   = 0;
+    ceph_arch_intel_sse2   = 0;
+    ceph_arch_neon	   = 1;
+
+    ErasureCodeInterfaceRef erasure_code;
+    int generic_side_effect = -555;
+    EXPECT_EQ(generic_side_effect, instance.factory("shec",
+						    g_conf->erasure_code_dir,
+						    profile,
+						    &erasure_code, &cerr));
+  }
+
+
+  // restore probe results
+  ceph_arch_intel_pclmul = arch_intel_pclmul;
+  ceph_arch_intel_sse42  = arch_intel_sse42;
+  ceph_arch_intel_sse41  = arch_intel_sse41;
+  ceph_arch_intel_ssse3  = arch_intel_ssse3;
+  ceph_arch_intel_sse3   = arch_intel_sse3;
+  ceph_arch_intel_sse2   = arch_intel_sse2;
+  ceph_arch_neon	 = arch_neon;
+}
+
+TEST(ErasureCodePlugin, sse)
+{
+  ceph_arch_probe();
+  bool sse4 = ceph_arch_intel_pclmul &&
+    ceph_arch_intel_sse42 && ceph_arch_intel_sse41 &&
+    ceph_arch_intel_ssse3 && ceph_arch_intel_sse3 &&
+    ceph_arch_intel_sse2;
+  bool sse3 = ceph_arch_intel_ssse3 && ceph_arch_intel_sse3 &&
+    ceph_arch_intel_sse2;
+  vector<string> sse_variants;
+  sse_variants.push_back("generic");
+  if (!sse3)
+    cerr << "SKIP sse3 plugin testing because CPU does not support it\n";
+  else
+    sse_variants.push_back("sse3");
+  if (!sse4)
+    cerr << "SKIP sse4 plugin testing because CPU does not support it\n";
+  else
+    sse_variants.push_back("sse4");
+
+#define LARGE_ENOUGH 2048
+  bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
+  in_ptr.zero();
+  in_ptr.set_length(0);
+  const char *payload =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  in_ptr.append(payload, strlen(payload));
+  bufferlist in;
+  in.push_front(in_ptr);
+
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  map<std::string,std::string> profile;
+  profile["technique"] = "multiple";
+  profile["k"] = "2";
+  profile["m"] = "1";
+  profile["c"] = "1";
+  for (vector<string>::iterator sse_variant = sse_variants.begin();
+       sse_variant != sse_variants.end();
+       ++sse_variant) {
+    //
+    // load the plugin variant
+    //
+    ErasureCodeInterfaceRef erasure_code;
+    EXPECT_FALSE(erasure_code);
+    EXPECT_EQ(0, instance.factory("shec_" + *sse_variant,
+				  g_conf->erasure_code_dir,
+				  profile,
+                                  &erasure_code, &cerr));
+    EXPECT_TRUE(erasure_code.get());
+
+    //
+    // encode
+    //
+    int want_to_encode[] = { 0, 1, 2 };
+    map<int, bufferlist> encoded;
+    EXPECT_EQ(0, erasure_code->encode(set<int>(want_to_encode, want_to_encode+3),
+                                      in,
+                                      &encoded));
+    EXPECT_EQ(3u, encoded.size());
+    unsigned length =  encoded[0].length();
+    EXPECT_EQ(0, strncmp(encoded[0].c_str(), in.c_str(), length));
+    EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length,
+                         in.length() - length));
+
+    //
+    // decode with reconstruction
+    //
+    map<int, bufferlist> degraded = encoded;
+    degraded.erase(1);
+    EXPECT_EQ(2u, degraded.size());
+    int want_to_decode[] = { 0, 1 };
+    map<int, bufferlist> decoded;
+    EXPECT_EQ(0, erasure_code->decode(set<int>(want_to_decode, want_to_decode+2),
+                                      degraded,
+                                      &decoded));
+    EXPECT_EQ(3u, decoded.size());
+    EXPECT_EQ(length, decoded[0].length());
+    EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
+    EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
+                         in.length() - length));
+
+  }
+}
+
+int main(int argc, char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ *   make unittest_erasure_code_plugin_shec &&
+ *   valgrind --tool=memcheck ./unittest_erasure_code_plugin_shec \
+ *      --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
diff --git a/src/test/erasure-code/TestErasureCodeShec.cc b/src/test/erasure-code/TestErasureCodeShec.cc
index 25325f6..1f87505 100644
--- a/src/test/erasure-code/TestErasureCodeShec.cc
+++ b/src/test/erasure-code/TestErasureCodeShec.cc
@@ -46,23 +46,21 @@ TEST(ErasureCodeShec, init_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["directory"] = "/usr/lib64/ceph/erasure-code";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-
-  int r = shec->init(*parameters);
-
-  //check parameters
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+
+  int r = shec->init(*profile, &cerr);
+
+  //check profile
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   EXPECT_EQ(ErasureCodeShec::MULTIPLE, shec->technique);
   EXPECT_STREQ("default", shec->ruleset_root.c_str());
   EXPECT_STREQ("osd", shec->ruleset_failure_domain.c_str());
@@ -70,7 +68,7 @@ TEST(ErasureCodeShec, init_1)
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_2)
@@ -80,24 +78,23 @@ TEST(ErasureCodeShec, init_2)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-root"] = "test";
-  (*parameters)["ruleset-failure-domain"] = "host";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "8";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-root"] = "test";
+  (*profile)["ruleset-failure-domain"] = "host";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "8";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
-  //check parameters
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  //check profile
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   EXPECT_EQ(ErasureCodeShec::MULTIPLE, shec->technique);
   EXPECT_STREQ("test", shec->ruleset_root.c_str());
   EXPECT_STREQ("host", shec->ruleset_failure_domain.c_str());
@@ -105,7 +102,7 @@ TEST(ErasureCodeShec, init_2)
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_3)
@@ -115,23 +112,22 @@ TEST(ErasureCodeShec, init_3)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "16";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "16";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
-  //check parameters
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(16u, shec->w);
+  //check profile
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(16, shec->w);
   EXPECT_EQ(ErasureCodeShec::MULTIPLE, shec->technique);
   EXPECT_STREQ("default", shec->ruleset_root.c_str());
   EXPECT_STREQ("osd", shec->ruleset_failure_domain.c_str());
@@ -139,7 +135,7 @@ TEST(ErasureCodeShec, init_3)
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_4)
@@ -149,23 +145,22 @@ TEST(ErasureCodeShec, init_4)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "32";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "32";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
-  //check parameters
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(32u, shec->w);
+  //check profile
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(32, shec->w);
   EXPECT_EQ(ErasureCodeShec::MULTIPLE, shec->technique);
   EXPECT_STREQ("default", shec->ruleset_root.c_str());
   EXPECT_STREQ("osd", shec->ruleset_failure_domain.c_str());
@@ -173,7 +168,7 @@ TEST(ErasureCodeShec, init_4)
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_5)
@@ -182,22 +177,21 @@ TEST(ErasureCodeShec, init_5)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
   //plugin is not specified
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_6)
@@ -206,22 +200,21 @@ TEST(ErasureCodeShec, init_6)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "jerasure";	//unexpected value
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "jerasure";	//unexpected value
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_7)
@@ -230,22 +223,21 @@ TEST(ErasureCodeShec, init_7)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "abc";	//unexpected value
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "abc";	//unexpected value
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_8)
@@ -254,23 +246,21 @@ TEST(ErasureCodeShec, init_8)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["directory"] = "/usr/lib64/";	//unexpected value
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_9)
@@ -279,23 +269,22 @@ TEST(ErasureCodeShec, init_9)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-root"] = "abc";	//unexpected value
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-root"] = "abc";	//unexpected value
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_10)
@@ -304,22 +293,21 @@ TEST(ErasureCodeShec, init_10)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "abc";	//unexpected value
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "abc";	//unexpected value
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_11)
@@ -328,22 +316,21 @@ TEST(ErasureCodeShec, init_11)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "abc";		//unexpected value
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "abc";		//unexpected value
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_12)
@@ -352,21 +339,20 @@ TEST(ErasureCodeShec, init_12)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "-1";	//unexpected value
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "-1";	//unexpected value
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_13)
@@ -375,21 +361,20 @@ TEST(ErasureCodeShec, init_13)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "abc";
-  (*parameters)["k"] = "0.1";	//unexpected value
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "abc";
+  (*profile)["k"] = "0.1";	//unexpected value
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_14)
@@ -398,21 +383,20 @@ TEST(ErasureCodeShec, init_14)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "a";		//unexpected value
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "a";		//unexpected value
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_15)
@@ -421,21 +405,20 @@ TEST(ErasureCodeShec, init_15)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
   //k is not specified
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_16)
@@ -444,21 +427,20 @@ TEST(ErasureCodeShec, init_16)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "-1";		//unexpected value
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "-1";		//unexpected value
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_17)
@@ -467,21 +449,20 @@ TEST(ErasureCodeShec, init_17)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "0.1";		//unexpected value
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "0.1";		//unexpected value
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_18)
@@ -490,21 +471,20 @@ TEST(ErasureCodeShec, init_18)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "a";		//unexpected value
-  (*parameters)["c"] = "3";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "a";		//unexpected value
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_19)
@@ -513,21 +493,20 @@ TEST(ErasureCodeShec, init_19)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
   //m is not specified
-  (*parameters)["c"] = "3";
+  (*profile)["c"] = "2";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_20)
@@ -536,21 +515,20 @@ TEST(ErasureCodeShec, init_20)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "-1";		//unexpected value
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "-1";		//unexpected value
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_21)
@@ -559,21 +537,20 @@ TEST(ErasureCodeShec, init_21)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "0.1";		//unexpected value
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "0.1";		//unexpected value
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_22)
@@ -582,21 +559,20 @@ TEST(ErasureCodeShec, init_22)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "a";		//unexpected value
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "a";		//unexpected value
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_23)
@@ -605,21 +581,20 @@ TEST(ErasureCodeShec, init_23)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
   //c is not specified
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_24)
@@ -628,28 +603,27 @@ TEST(ErasureCodeShec, init_24)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "1";		//unexpected value
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "1";		//unexpected value
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   //w is default value
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_25)
@@ -658,28 +632,27 @@ TEST(ErasureCodeShec, init_25)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "-1";		//unexpected value
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "-1";		//unexpected value
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   //w is default value
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_26)
@@ -688,28 +661,27 @@ TEST(ErasureCodeShec, init_26)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "0.1";		//unexpected value
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "0.1";		//unexpected value
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   //w is default value
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_27)
@@ -718,28 +690,27 @@ TEST(ErasureCodeShec, init_27)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "a";		//unexpected value
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "a";		//unexpected value
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   //w is default value
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_28)
@@ -748,21 +719,20 @@ TEST(ErasureCodeShec, init_28)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "10";	//c > m
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "10";	//c > m
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_29)
@@ -771,26 +741,25 @@ TEST(ErasureCodeShec, init_29)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
   //k is not specified
   //m is not specified
   //c is not specified
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
   //k,m,c are default values
-  EXPECT_EQ(4u, shec->k);
-  EXPECT_EQ(3u, shec->m);
-  EXPECT_EQ(2u, shec->c);
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_30)
@@ -799,25 +768,24 @@ TEST(ErasureCodeShec, init_30)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "12";
-  (*parameters)["m"] = "8";
-  (*parameters)["c"] = "8";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "12";
+  (*profile)["m"] = "8";
+  (*profile)["c"] = "8";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(12u, shec->k);
-  EXPECT_EQ(8u, shec->m);
-  EXPECT_EQ(8u, shec->c);
+  EXPECT_EQ(12, shec->k);
+  EXPECT_EQ(8, shec->m);
+  EXPECT_EQ(8, shec->c);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_31)
@@ -826,21 +794,20 @@ TEST(ErasureCodeShec, init_31)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "13";
-  (*parameters)["m"] = "7";
-  (*parameters)["c"] = "7";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "13";
+  (*profile)["m"] = "7";
+  (*profile)["c"] = "7";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_32)
@@ -849,21 +816,20 @@ TEST(ErasureCodeShec, init_32)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "7";
-  (*parameters)["m"] = "13";
-  (*parameters)["c"] = "13";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "7";
+  (*profile)["m"] = "13";
+  (*profile)["c"] = "13";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_33)
@@ -872,21 +838,20 @@ TEST(ErasureCodeShec, init_33)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "12";
-  (*parameters)["m"] = "9";
-  (*parameters)["c"] = "8";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "12";
+  (*profile)["m"] = "9";
+  (*profile)["c"] = "8";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init_34)
@@ -895,21 +860,20 @@ TEST(ErasureCodeShec, init_34)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "8";
-  (*parameters)["m"] = "12";
-  (*parameters)["c"] = "12";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "8";
+  (*profile)["m"] = "12";
+  (*profile)["c"] = "12";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init2_4)
@@ -919,22 +883,21 @@ TEST(ErasureCodeShec, init2_4)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
-  int r = shec->init(*parameters);	//init executed twice
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
+  int r = shec->init(*profile, &cerr);	//init executed twice
 
-  //check parameters
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  //check profile
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   EXPECT_EQ(ErasureCodeShec::MULTIPLE, shec->technique);
   EXPECT_STREQ("default", shec->ruleset_root.c_str());
   EXPECT_STREQ("osd", shec->ruleset_failure_domain.c_str());
@@ -942,7 +905,7 @@ TEST(ErasureCodeShec, init2_4)
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, init2_5)
@@ -952,33 +915,31 @@ TEST(ErasureCodeShec, init2_5)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  map < std::string, std::string > *parameters2 = new map<std::string,
-      std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "host";
-  (*parameters)["k"] = "10";
-  (*parameters)["m"] = "6";
-  (*parameters)["c"] = "5";
-  (*parameters)["w"] = "16";
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  ErasureCodeProfile *profile2 = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "host";
+  (*profile)["k"] = "10";
+  (*profile)["m"] = "6";
+  (*profile)["c"] = "5";
+  (*profile)["w"] = "16";
 
-  int r = shec->init(*parameters);
+  int r = shec->init(*profile, &cerr);
 
   //reexecute init
-  (*parameters2)["plugin"] = "shec";
-  (*parameters2)["technique"] = "";
-  (*parameters2)["ruleset-failure-domain"] = "osd";
-  (*parameters2)["k"] = "6";
-  (*parameters2)["m"] = "4";
-  (*parameters2)["c"] = "3";
-  shec->init(*parameters2);
-
-  EXPECT_EQ(6u, shec->k);
-  EXPECT_EQ(4u, shec->m);
-  EXPECT_EQ(3u, shec->c);
-  EXPECT_EQ(8u, shec->w);
+  (*profile2)["plugin"] = "shec";
+  (*profile2)["technique"] = "";
+  (*profile2)["ruleset-failure-domain"] = "osd";
+  (*profile2)["k"] = "4";
+  (*profile2)["m"] = "3";
+  (*profile2)["c"] = "2";
+  shec->init(*profile2, &cerr);
+
+  EXPECT_EQ(4, shec->k);
+  EXPECT_EQ(3, shec->m);
+  EXPECT_EQ(2, shec->c);
+  EXPECT_EQ(8, shec->w);
   EXPECT_EQ(ErasureCodeShec::MULTIPLE, shec->technique);
   EXPECT_STREQ("default", shec->ruleset_root.c_str());
   EXPECT_STREQ("osd", shec->ruleset_failure_domain.c_str());
@@ -986,289 +947,175 @@ TEST(ErasureCodeShec, init2_5)
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, minimum_to_decode_1)
-{
-  //init
-  ErasureCodeShecTableCache tcache;
-  ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
-				  tcache,
-				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
-
-  //minimum_to_decode
-  set<int> want_to_decode;
-  set<int> available_chunks;
-  set<int> minimum_chunks;
-
-  want_to_decode.insert(0);
-  available_chunks.insert(0);
-  available_chunks.insert(1);
-  available_chunks.insert(2);
-
-  int r = shec->minimum_to_decode(want_to_decode, available_chunks,
-				  &minimum_chunks);
-  EXPECT_TRUE(shec->matrix != NULL);
-  EXPECT_EQ(0, r);
-  EXPECT_TRUE(minimum_chunks.size());
-
-  delete shec;
-  delete parameters;
-}
-
-TEST(ErasureCodeShec, minimum_to_decode_2)
+TEST(ErasureCodeShec, minimum_to_decode_8)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
   set<int> available_chunks;
   set<int> minimum_chunks;
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 8; ++i) {
     want_to_decode.insert(i);
-    available_chunks.insert(i);
   }
-
-  int r = shec->minimum_to_decode(want_to_decode, available_chunks,
-				  &minimum_chunks);
-  EXPECT_TRUE(shec->matrix != NULL);
-  EXPECT_EQ(0, r);
-  EXPECT_TRUE(minimum_chunks.size());
-
-  delete shec;
-  delete parameters;
-}
-
-TEST(ErasureCodeShec, minimum_to_decode_3)
-{
-  //init
-  ErasureCodeShecTableCache tcache;
-  ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
-				  tcache,
-				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
-
-  //minimum_to_decode
-  set<int> want_to_decode;
-  set<int> available_chunks;
-  set<int> minimum_chunks;
-
-  for (int i = 0; i < 32; i++) {		//want_to_decode.size() > k+m
-    want_to_decode.insert(i);
+  for (int i = 0; i < 5; ++i) {
     available_chunks.insert(i);
   }
 
   int r = shec->minimum_to_decode(want_to_decode, available_chunks,
 				  &minimum_chunks);
   EXPECT_EQ(-EINVAL, r);
-  EXPECT_EQ(0, minimum_chunks.size());
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, minimum_to_decode_4)
+TEST(ErasureCodeShec, minimum_to_decode_9)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
   set<int> available_chunks;
   set<int> minimum_chunks;
 
-  for (int i = 0; i < 9; i++) {
+  for (int i = 0; i < 4; ++i) {
     want_to_decode.insert(i);
+  }
+  for (int i = 0; i < 8; ++i) {
     available_chunks.insert(i);
   }
-  want_to_decode.insert(100);
-  available_chunks.insert(100);
 
   int r = shec->minimum_to_decode(want_to_decode, available_chunks,
 				  &minimum_chunks);
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, minimum_to_decode_5)
+TEST(ErasureCodeShec, minimum_to_decode_10)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
   set<int> available_chunks;
   set<int> minimum_chunks;
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 7; ++i) {
     want_to_decode.insert(i);
   }
-  for (int i = 0; i < 32; i++) {		//available_chunks.size() > k+m
+  for (int i = 4; i < 7; ++i) {
     available_chunks.insert(i);
   }
 
   int r = shec->minimum_to_decode(want_to_decode, available_chunks,
 				  &minimum_chunks);
-  EXPECT_EQ(-EINVAL, r);
+  EXPECT_EQ(-EIO, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, minimum_to_decode_6)
+TEST(ErasureCodeShec, minimum_to_decode_11)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
   set<int> available_chunks;
   set<int> minimum_chunks;
 
-  for (int i = 0; i < 9; i++) {
+  for (int i = 0; i < 5; ++i) {
     want_to_decode.insert(i);
+  }
+  for (int i = 4; i < 7; ++i) {
     available_chunks.insert(i);
   }
-  available_chunks.insert(100);
-
-  int r = shec->minimum_to_decode(want_to_decode, available_chunks,
-				  &minimum_chunks);
-  EXPECT_EQ(-EINVAL, r);
-
-  delete shec;
-  delete parameters;
-}
-
-TEST(ErasureCodeShec, minimum_to_decode_7)
-{
-  //init
-  ErasureCodeShecTableCache tcache;
-  ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
-				  tcache,
-				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
-
-  //minimum_to_decode
-  set<int> want_to_decode;
-  set<int> available_chunks;
-  set<int> minimum_chunks;
-
-  want_to_decode.insert(1);
-  want_to_decode.insert(3);
-  want_to_decode.insert(5);
-  available_chunks.insert(1);
-  available_chunks.insert(3);
-  available_chunks.insert(6);
 
   int r = shec->minimum_to_decode(want_to_decode, available_chunks,
 				  &minimum_chunks);
   EXPECT_EQ(-EIO, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, minimum_to_decode_8)
+TEST(ErasureCodeShec, minimum_to_decode_12)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
   set<int> available_chunks;
   //minimum_chunks is NULL
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 7; ++i) {
     want_to_decode.insert(i);
     available_chunks.insert(i);
   }
@@ -1277,38 +1124,37 @@ TEST(ErasureCodeShec, minimum_to_decode_8)
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, minimum_to_decode_9)
+TEST(ErasureCodeShec, minimum_to_decode_13)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
   set<int> available_chunks;
   set<int> minimum_chunks, minimum;
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 7; ++i) {
     want_to_decode.insert(i);
     available_chunks.insert(i);
   }
   shec->minimum_to_decode(want_to_decode, available_chunks, &minimum_chunks);
   minimum = minimum_chunks;		//normal value
-  for (int i = 100; i < 120; i++) {
+  for (int i = 100; i < 120; ++i) {
     minimum_chunks.insert(i);	//insert extra data
   }
 
@@ -1319,7 +1165,7 @@ TEST(ErasureCodeShec, minimum_to_decode_9)
   EXPECT_EQ(minimum, minimum_chunks);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, minimum_to_decode2_1)
@@ -1329,15 +1175,14 @@ TEST(ErasureCodeShec, minimum_to_decode2_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
@@ -1356,7 +1201,7 @@ TEST(ErasureCodeShec, minimum_to_decode2_1)
   EXPECT_TRUE(minimum_chunks.size());
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, minimum_to_decode2_3)
@@ -1366,15 +1211,14 @@ TEST(ErasureCodeShec, minimum_to_decode2_3)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode
   set<int> want_to_decode;
@@ -1406,7 +1250,7 @@ TEST(ErasureCodeShec, minimum_to_decode2_3)
   pthread_join(tid, NULL);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, minimum_to_decode_with_cost_1)
@@ -1416,25 +1260,24 @@ TEST(ErasureCodeShec, minimum_to_decode_with_cost_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode_with_cost
   set<int> want_to_decode;
   map<int, int> available_chunks;
   set<int> minimum_chunks;
 
-  want_to_decode.insert(0);
-  available_chunks[0] = 0;
-  available_chunks[1] = 1;
-  available_chunks[2] = 2;
+  for (int i = 0; i < 7; ++i) {
+    want_to_decode.insert(i);
+    available_chunks.insert(make_pair(i, i));
+  }
 
   int r = shec->minimum_to_decode_with_cost(want_to_decode, available_chunks,
 					    &minimum_chunks);
@@ -1443,7 +1286,7 @@ TEST(ErasureCodeShec, minimum_to_decode_with_cost_1)
   EXPECT_TRUE(minimum_chunks.size());
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, minimum_to_decode_with_cost_2_3)
@@ -1453,15 +1296,14 @@ TEST(ErasureCodeShec, minimum_to_decode_with_cost_2_3)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //minimum_to_decode_with_cost
   set<int> want_to_decode;
@@ -1493,7 +1335,7 @@ TEST(ErasureCodeShec, minimum_to_decode_with_cost_2_3)
   pthread_join(tid, NULL);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode_1)
@@ -1503,15 +1345,14 @@ TEST(ErasureCodeShec, encode_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1520,10 +1361,9 @@ TEST(ErasureCodeShec, encode_1)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
-	    "012345"//192
+	    "0123"//128
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -1533,7 +1373,7 @@ TEST(ErasureCodeShec, encode_1)
   EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
   decoded.clear();
   r = shec->decode(set<int>(want_to_decode, want_to_decode + 2),
@@ -1546,7 +1386,7 @@ TEST(ErasureCodeShec, encode_1)
 
   bufferlist out1, out2, usable;
   //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
+  for (unsigned int i = 0; i < encoded.size(); ++i) {
     out1.append(encoded[i]);
   }
   //out2 is "decoded"
@@ -1556,7 +1396,7 @@ TEST(ErasureCodeShec, encode_1)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode_2)
@@ -1566,15 +1406,14 @@ TEST(ErasureCodeShec, encode_2)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1583,9 +1422,8 @@ TEST(ErasureCodeShec, encode_2)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -1595,7 +1433,7 @@ TEST(ErasureCodeShec, encode_2)
   EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
   r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
 		   &decoded);
@@ -1606,7 +1444,7 @@ TEST(ErasureCodeShec, encode_2)
 
   bufferlist out1, out2, usable;
   //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++)
+  for (unsigned int i = 0; i < encoded.size(); ++i)
     out1.append(encoded[i]);
   //out2 is "decoded"
   shec->decode_concat(encoded, &out2);
@@ -1615,7 +1453,7 @@ TEST(ErasureCodeShec, encode_2)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode_3)
@@ -1624,23 +1462,21 @@ TEST(ErasureCodeShec, encode_3)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   bufferlist in;
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
   );
   set<int> want_to_encode;
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
   want_to_encode.insert(10);
@@ -1652,7 +1488,7 @@ TEST(ErasureCodeShec, encode_3)
   EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
   r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
 		   &decoded);
@@ -1663,7 +1499,7 @@ TEST(ErasureCodeShec, encode_3)
 
   bufferlist out1, out2, usable;
   //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
+  for (unsigned int i = 0; i < encoded.size(); ++i) {
     out1.append(encoded[i]);
   }
   //out2 is "decoded"
@@ -1673,7 +1509,7 @@ TEST(ErasureCodeShec, encode_3)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode_4)
@@ -1683,15 +1519,14 @@ TEST(ErasureCodeShec, encode_4)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1700,9 +1535,8 @@ TEST(ErasureCodeShec, encode_4)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count() - 1; i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count() - 1; ++i) {
     want_to_encode.insert(i);
   }
   want_to_encode.insert(100);
@@ -1713,7 +1547,7 @@ TEST(ErasureCodeShec, encode_4)
   EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
   r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
 		   &decoded);
@@ -1724,7 +1558,7 @@ TEST(ErasureCodeShec, encode_4)
 
   bufferlist out1, out2, usable;
   //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
+  for (unsigned int i = 0; i < encoded.size(); ++i) {
     out1.append(encoded[i]);
   }
   //out2 is "decoded"
@@ -1734,7 +1568,7 @@ TEST(ErasureCodeShec, encode_4)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode_8)
@@ -1744,15 +1578,14 @@ TEST(ErasureCodeShec, encode_8)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1760,9 +1593,8 @@ TEST(ErasureCodeShec, encode_8)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -1770,7 +1602,7 @@ TEST(ErasureCodeShec, encode_8)
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode_9)
@@ -1780,15 +1612,14 @@ TEST(ErasureCodeShec, encode_9)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1797,12 +1628,11 @@ TEST(ErasureCodeShec, encode_9)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
-  for (int i = 0; i < 100; i++) {
+  for (int i = 0; i < 100; ++i) {
     encoded[i].append("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
   }
 
@@ -1810,7 +1640,7 @@ TEST(ErasureCodeShec, encode_9)
   EXPECT_EQ(-EINVAL, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode2_1)
@@ -1820,15 +1650,14 @@ TEST(ErasureCodeShec, encode2_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1837,10 +1666,9 @@ TEST(ErasureCodeShec, encode2_1)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
-	    "012345"//192
+	    "0123"//128
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -1850,7 +1678,7 @@ TEST(ErasureCodeShec, encode2_1)
   EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
   r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
 		   &decoded);
@@ -1861,7 +1689,7 @@ TEST(ErasureCodeShec, encode2_1)
 
   bufferlist out1, out2, usable;
   //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
+  for (unsigned int i = 0; i < encoded.size(); ++i) {
     out1.append(encoded[i]);
   }
   //out2 is "decoded"
@@ -1871,7 +1699,7 @@ TEST(ErasureCodeShec, encode2_1)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, encode2_3)
@@ -1881,15 +1709,14 @@ TEST(ErasureCodeShec, encode2_3)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1898,10 +1725,9 @@ TEST(ErasureCodeShec, encode2_3)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
-	    "012345"//192
+	    "0123"//128
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -1922,7 +1748,7 @@ TEST(ErasureCodeShec, encode2_3)
   pthread_join(tid, NULL);
 
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
 
   r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
@@ -1934,7 +1760,7 @@ TEST(ErasureCodeShec, encode2_3)
 
   bufferlist out1, out2, usable;
   //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
+  for (unsigned int i = 0; i < encoded.size(); ++i) {
     out1.append(encoded[i]);
   }
   //out2 is "decoded"
@@ -1944,7 +1770,7 @@ TEST(ErasureCodeShec, encode2_3)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, decode_1)
@@ -1954,15 +1780,14 @@ TEST(ErasureCodeShec, decode_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -1974,7 +1799,7 @@ TEST(ErasureCodeShec, decode_1)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -1985,40 +1810,50 @@ TEST(ErasureCodeShec, decode_1)
 
   // all chunks are available
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
 
-  r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
+  r = shec->decode(set<int>(want_to_decode, want_to_decode + 7), encoded,
 		   &decoded);
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(2u, decoded.size());
+  EXPECT_EQ(7u, decoded.size());
 
-  bufferlist out, usable;
-  shec->decode_concat(encoded, &out);
-  usable.substr_of(out, 0, in.length());
-  EXPECT_TRUE(usable == in);
+  bufferlist usable;
+  int cmp;
+  unsigned int c_size = shec->get_chunk_size(in.length());
+  for (unsigned int i = 0; i < shec->get_data_chunk_count(); ++i) {
+    usable.clear();
+    EXPECT_EQ(c_size, decoded[i].length());
+    if ( c_size * (i+1) <= in.length() ) {
+      usable.substr_of(in, c_size * i, c_size);
+      cmp = memcmp(decoded[i].c_str(), usable.c_str(), c_size);
+    } else {
+      usable.substr_of(in, c_size * i, in.length() % c_size);
+      cmp = memcmp(decoded[i].c_str(), usable.c_str(), in.length() % c_size);
+    }
+    EXPECT_EQ(0, cmp);
+  }
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, decode_2)
+TEST(ErasureCodeShec, decode_8)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2028,9 +1863,9 @@ TEST(ErasureCodeShec, decode_2)
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
-	    "012345"//192
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2041,40 +1876,50 @@ TEST(ErasureCodeShec, decode_2)
 
   // all chunks are available
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; //more than k+m
   map<int, bufferlist> decoded;
 
-  r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
+  r = shec->decode(set<int>(want_to_decode, want_to_decode + 8), encoded,
 		   &decoded);
-  EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(2u, decoded.size());
+  EXPECT_EQ(7u, decoded.size());
+  EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
-  bufferlist out, usable;
-  shec->decode_concat(encoded, &out);
-  usable.substr_of(out, 0, in.length());
-  EXPECT_TRUE(usable == in);
+  bufferlist usable;
+  int cmp;
+  unsigned int c_size = shec->get_chunk_size(in.length());
+  for (unsigned int i = 0; i < shec->get_data_chunk_count(); ++i) {
+    usable.clear();
+    EXPECT_EQ(c_size, decoded[i].length());
+    if ( c_size * (i+1) <= in.length() ) {
+      usable.substr_of(in, c_size * i, c_size);
+      cmp = memcmp(decoded[i].c_str(), usable.c_str(), c_size);
+    } else {
+      usable.substr_of(in, c_size * i, in.length() % c_size);
+      cmp = memcmp(decoded[i].c_str(), usable.c_str(), in.length() % c_size);
+    }
+    EXPECT_EQ(0, cmp);
+  }
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, decode_3)
+TEST(ErasureCodeShec, decode_9)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2086,7 +1931,7 @@ TEST(ErasureCodeShec, decode_3)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2097,47 +1942,62 @@ TEST(ErasureCodeShec, decode_3)
 
   // all chunks are available
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; //more than k+m
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
   map<int, bufferlist> decoded;
 
-  r = shec->decode(set<int>(want_to_decode, want_to_decode + 11), encoded,
+  //extra data
+  bufferlist buf;
+  buf.append("abc");
+  encoded[100] = buf;
+
+  r = shec->decode(set<int>(want_to_decode, want_to_decode + 10), encoded,
 		   &decoded);
   EXPECT_TRUE(shec->matrix != NULL);
   EXPECT_EQ(0, r);
-  EXPECT_EQ(10u, decoded.size());
+  EXPECT_EQ(7u, decoded.size());
   EXPECT_EQ(shec->get_chunk_size(in.length()), decoded[0].length());
 
-  bufferlist out1, out2, usable;
+  bufferlist out1, usable;
   //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
+  for (unsigned int i = 0; i < encoded.size(); ++i) {
     out1.append(encoded[i]);
   }
-  //out2 is "decoded"
-  shec->decode_concat(encoded, &out2);
-  usable.substr_of(out2, 0, in.length());
   EXPECT_FALSE(out1 == in);
-  EXPECT_TRUE(usable == in);
+  //usable is "decoded"
+  int cmp;
+  unsigned int c_size = shec->get_chunk_size(in.length());
+  for (unsigned int i = 0; i < shec->get_data_chunk_count(); ++i) {
+    usable.clear();
+    EXPECT_EQ(c_size, decoded[i].length());
+    if ( c_size * (i+1) <= in.length() ) {
+      usable.substr_of(in, c_size * i, c_size);
+      cmp = memcmp(decoded[i].c_str(), usable.c_str(), c_size);
+    } else {
+      usable.substr_of(in, c_size * i, in.length() % c_size);
+      cmp = memcmp(decoded[i].c_str(), usable.c_str(), in.length() % c_size);
+    }
+    EXPECT_EQ(0, cmp);
+  }
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, decode_4)
+TEST(ErasureCodeShec, decode_10)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2149,7 +2009,7 @@ TEST(ErasureCodeShec, decode_4)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2158,49 +2018,37 @@ TEST(ErasureCodeShec, decode_4)
   EXPECT_EQ(shec->get_chunk_count(), encoded.size());
   EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
-  // all chunks are available
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 100 };
-  map<int, bufferlist> decoded;
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 }; //more than k+m
+  map<int, bufferlist> decoded, inchunks;
 
-  r = shec->decode(set<int>(want_to_decode, want_to_decode + 9), encoded,
-		   &decoded);
-  EXPECT_TRUE(shec->matrix != NULL);
-  EXPECT_EQ(0, r);
-  EXPECT_EQ(10u, decoded.size());
-  EXPECT_EQ(shec->get_chunk_size(in.length()), decoded[0].length());
-
-  bufferlist out1, out2, usable;
-  //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
-    out1.append(encoded[i]);
+  for ( unsigned int i = 0; i < 3; ++i) {
+    inchunks.insert(make_pair(i, encoded[i]));
   }
-  //out2 is "decoded"
-  shec->decode_concat(encoded, &out2);
-  usable.substr_of(out2, 0, in.length());
-  EXPECT_FALSE(out1 == in);
-  EXPECT_TRUE(usable == in);
+
+  r = shec->decode(set<int>(want_to_decode, want_to_decode + 7), inchunks,
+		   &decoded);
+  EXPECT_EQ(-1, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, decode_7)
+TEST(ErasureCodeShec, decode_11)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2209,10 +2057,9 @@ TEST(ErasureCodeShec, decode_7)
 
   in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
-	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
+	    "ABCD"//128
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2221,54 +2068,37 @@ TEST(ErasureCodeShec, decode_7)
   EXPECT_EQ(shec->get_chunk_count(), encoded.size());
   EXPECT_EQ(shec->get_chunk_size(in.length()), encoded[0].length());
 
-  // all chunks are available
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
-  map<int, bufferlist> decoded;
+  int want_to_decode[] = { 0, 1, 2, 3, 4 };
+  map<int, bufferlist> decoded, inchunks;
 
-  //extra data
-  bufferlist buf;
-  buf.append("abc");
-  encoded[100] = buf;
+  for ( unsigned int i = 4; i < 7; ++i) {
+    inchunks.insert(make_pair(i, encoded[i]));
+  }
 
-  r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
+  r = shec->decode(set<int>(want_to_decode, want_to_decode + 5), inchunks,
 		   &decoded);
-  EXPECT_TRUE(shec->matrix != NULL);
-  EXPECT_EQ(0, r);
-  EXPECT_EQ(2u, decoded.size());
-  EXPECT_EQ(shec->get_chunk_size(in.length()), decoded[0].length());
-
-  bufferlist out1, out2, usable;
-  //out1 is "encoded"
-  for (unsigned int i = 0; i < encoded.size(); i++) {
-    out1.append(encoded[i]);
-  }
-  //out2 is "decoded"
-  shec->decode_concat(encoded, &out2);
-  usable.substr_of(out2, 0, in.length());
-  EXPECT_FALSE(out1 == in);
-  EXPECT_TRUE(usable == in);
+  EXPECT_EQ(-1, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, decode_8)
+TEST(ErasureCodeShec, decode_12)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2280,7 +2110,7 @@ TEST(ErasureCodeShec, decode_8)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2291,33 +2121,32 @@ TEST(ErasureCodeShec, decode_8)
 
   // all chunks are available
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
 
   //decoded = NULL
-  r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
+  r = shec->decode(set<int>(want_to_decode, want_to_decode + 7), encoded,
 		   NULL);
   EXPECT_NE(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
-TEST(ErasureCodeShec, decode_9)
+TEST(ErasureCodeShec, decode_13)
 {
   //init
   ErasureCodeShecTableCache tcache;
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2329,7 +2158,7 @@ TEST(ErasureCodeShec, decode_9)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2340,22 +2169,22 @@ TEST(ErasureCodeShec, decode_9)
 
   // all chunks are available
   //decode
-  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int want_to_decode[] = { 0, 1, 2, 3, 4, 5, 6 };
   map<int, bufferlist> decoded;
 
   //extra data
   bufferlist buf;
   buf.append("a");
-  for (int i = 0; i < 100; i++) {
+  for (int i = 0; i < 100; ++i) {
     decoded[i] = buf;
   }
 
-  r = shec->decode(set<int>(want_to_decode, want_to_decode + 2), encoded,
+  r = shec->decode(set<int>(want_to_decode, want_to_decode + 7), encoded,
 		   &decoded);
   EXPECT_NE(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, decode2_1)
@@ -2365,15 +2194,14 @@ TEST(ErasureCodeShec, decode2_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2385,7 +2213,7 @@ TEST(ErasureCodeShec, decode2_1)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2412,7 +2240,7 @@ TEST(ErasureCodeShec, decode2_1)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, decode2_3)
@@ -2422,15 +2250,14 @@ TEST(ErasureCodeShec, decode2_3)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2442,7 +2269,7 @@ TEST(ErasureCodeShec, decode2_3)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2480,7 +2307,7 @@ TEST(ErasureCodeShec, decode2_3)
   EXPECT_TRUE(usable == in);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, decode2_4)
@@ -2490,15 +2317,14 @@ TEST(ErasureCodeShec, decode2_4)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //encode
   bufferlist in;
@@ -2510,7 +2336,7 @@ TEST(ErasureCodeShec, decode2_4)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//186
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2533,7 +2359,7 @@ TEST(ErasureCodeShec, decode2_4)
   EXPECT_EQ(-1, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, create_ruleset_1_2)
@@ -2569,15 +2395,14 @@ TEST(ErasureCodeShec, create_ruleset_1_2)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //create_ruleset
   stringstream ss;
@@ -2591,7 +2416,7 @@ TEST(ErasureCodeShec, create_ruleset_1_2)
   EXPECT_EQ(-EEXIST, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
   delete crush;
 }
 
@@ -2628,22 +2453,21 @@ TEST(ErasureCodeShec, create_ruleset_4)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //create_ruleset
   int r = shec->create_ruleset("myrule", *crush, NULL);	//ss = NULL
   EXPECT_EQ(0, r);
 
   delete shec;
-  delete parameters;
+  delete profile;
   delete crush;
 }
 
@@ -2680,15 +2504,14 @@ TEST(ErasureCodeShec, create_ruleset2_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //create_ruleset
   stringstream ss;
@@ -2698,7 +2521,7 @@ TEST(ErasureCodeShec, create_ruleset2_1)
   EXPECT_STREQ("myrule", crush->rule_name_map[0].c_str());
 
   delete shec;
-  delete parameters;
+  delete profile;
   delete crush;
 }
 
@@ -2740,15 +2563,14 @@ TEST(ErasureCodeShec, create_ruleset2_3)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //create_ruleset
   stringstream ss;
@@ -2768,7 +2590,7 @@ TEST(ErasureCodeShec, create_ruleset2_3)
   pthread_join(tid, NULL);
 
   delete shec;
-  delete parameters;
+  delete profile;
   delete crush;
 }
 
@@ -2779,21 +2601,20 @@ TEST(ErasureCodeShec, get_chunk_count_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //get_chunk_count
-  EXPECT_EQ(10u, shec->get_chunk_count());
+  EXPECT_EQ(7u, shec->get_chunk_count());
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, get_data_chunk_count_1)
@@ -2803,21 +2624,20 @@ TEST(ErasureCodeShec, get_data_chunk_count_1)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  shec->init(*profile, &cerr);
 
   //get_data_chunk_count
-  EXPECT_EQ(6u, shec->get_data_chunk_count());
+  EXPECT_EQ(4u, shec->get_data_chunk_count());
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 TEST(ErasureCodeShec, get_chunk_size_1_2)
@@ -2827,24 +2647,23 @@ TEST(ErasureCodeShec, get_chunk_size_1_2)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = "6";
-  (*parameters)["m"] = "4";
-  (*parameters)["c"] = "3";
-  (*parameters)["w"] = "8";
-  shec->init(*parameters);
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = "4";
+  (*profile)["m"] = "3";
+  (*profile)["c"] = "2";
+  (*profile)["w"] = "8";
+  shec->init(*profile, &cerr);
 
-  //when there is no padding(192=k*w*4)
-  EXPECT_EQ(32u, shec->get_chunk_size(192));
-  //when there is padding(190=k*w*4-2)
-  EXPECT_EQ(32u, shec->get_chunk_size(190));
+  //when there is no padding(128=k*w*4)
+  EXPECT_EQ(32u, shec->get_chunk_size(128));
+  //when there is padding(126=k*w*4-2)
+  EXPECT_EQ(32u, shec->get_chunk_size(126));
 
   delete shec;
-  delete parameters;
+  delete profile;
 }
 
 int main(int argc, char **argv)
@@ -2855,6 +2674,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
@@ -2945,7 +2766,7 @@ void* thread3(void* pParam)
   while (g_flag == 1) {
     sprintf(name, "myrule%d", i);
     shec->create_ruleset(name, *crush, &ss);
-    i++;
+    ++i;
   }
   printf("*** thread loop end ***\n");
 
@@ -2963,7 +2784,7 @@ void* thread4(void* pParam)
 	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//248
   );
   set<int> want_to_encode;
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
 
@@ -2992,7 +2813,7 @@ void* thread5(void* pParam)
 	  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//310
   );
   set<int> want_to_encode;
-  for (unsigned int i = 0; i < shec->get_chunk_count(); i++) {
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
     want_to_encode.insert(i);
   }
   map<int, bufferlist> encoded;
diff --git a/src/test/erasure-code/TestErasureCodeShec_all.cc b/src/test/erasure-code/TestErasureCodeShec_all.cc
index 22834cb..6e9a743 100644
--- a/src/test/erasure-code/TestErasureCodeShec_all.cc
+++ b/src/test/erasure-code/TestErasureCodeShec_all.cc
@@ -73,19 +73,17 @@ TEST_P(ParameterTest, parameter_all)
   ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				  tcache,
 				  ErasureCodeShec::MULTIPLE);
-  map < std::string, std::string > *parameters = new map<std::string,
-							 std::string>();
-  (*parameters)["plugin"] = "shec";
-  (*parameters)["technique"] = "";
-  (*parameters)["directory"] = "/usr/lib64/ceph/erasure-code";
-  (*parameters)["ruleset-failure-domain"] = "osd";
-  (*parameters)["k"] = k;
-  (*parameters)["m"] = m;
-  (*parameters)["c"] = c;
-
-  result = shec->init(*parameters);
-
-  //check parameters
+  ErasureCodeProfile *profile = new ErasureCodeProfile();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = k;
+  (*profile)["m"] = m;
+  (*profile)["c"] = c;
+
+  result = shec->init(*profile, &cerr);
+
+  //check profile
   EXPECT_EQ(i_k, shec->k);
   EXPECT_EQ(i_m, shec->m);
   EXPECT_EQ(i_c, shec->c);
@@ -184,7 +182,7 @@ TEST_P(ParameterTest, parameter_all)
 
   result = shec->encode(want_to_encode, in, &encoded);
   EXPECT_EQ(0, result);
-  EXPECT_EQ(i_k+i_m, encoded.size());
+  EXPECT_EQ(i_k+i_m, (int)encoded.size());
   EXPECT_EQ(c_size, encoded[0].length());
 
   //decode
@@ -248,16 +246,16 @@ TEST_P(ParameterTest, parameter_all)
   EXPECT_STREQ("myrule", crush->rule_name_map[0].c_str());
 
   //get_chunk_count
-  EXPECT_EQ(i_k+i_m, shec->get_chunk_count());
+  EXPECT_EQ(i_k+i_m, (int)shec->get_chunk_count());
 
   //get_data_chunk_count
-  EXPECT_EQ(i_k, shec->get_data_chunk_count());
+  EXPECT_EQ(i_k, (int)shec->get_data_chunk_count());
 
   //get_chunk_size
   EXPECT_EQ(c_size, shec->get_chunk_size(192));
 
   delete shec;
-  delete parameters;
+  delete profile;
   delete crush;
 }
 
@@ -275,9 +273,9 @@ int main(int argc, char **argv)
   for (unsigned int k = 1; k <= 12; k++) {
     for (unsigned int m = 1; (m <= k) && (k + m <= 20); m++) {
       for (unsigned int c = 1; c <= m; c++) {
-	sprintf(param[i].sk, "%d", k);
-	sprintf(param[i].sm, "%d", m);
-	sprintf(param[i].sc, "%d", c);
+	sprintf(param[i].sk, "%u", k);
+	sprintf(param[i].sm, "%u", m);
+	sprintf(param[i].sc, "%u", c);
 
 	param[i].k = param[i].sk;
 	param[i].m = param[i].sm;
@@ -298,6 +296,8 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
 
   r = RUN_ALL_TESTS();
@@ -317,7 +317,7 @@ int main(int argc, char **argv)
   }
   std::cout << "cannot recovery patterns:" << std::endl;
   for (std::vector<Recover_d>::const_iterator i = cannot_recover.begin();
-       i != cannot_recover.end(); i++) {
+       i != cannot_recover.end(); ++i) {
     std::cout << "---" << std::endl;
     std::cout << "k = " << i->k << ", m = " << i->m << ", c = " << i->c
 	<< std::endl;
diff --git a/src/test/erasure-code/TestErasureCodeShec_arguments.cc b/src/test/erasure-code/TestErasureCodeShec_arguments.cc
new file mode 100644
index 0000000..5d2f494
--- /dev/null
+++ b/src/test/erasure-code/TestErasureCodeShec_arguments.cc
@@ -0,0 +1,412 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 FUJITSU LIMITED
+ *
+ * Author: Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
+ * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
+ * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+// SUMMARY: shec's gtest for each argument of minimum_to_decode()/decode()
+
+#include <errno.h>
+
+#include "crush/CrushWrapper.h"
+#include "osd/osd_types.h"
+#include "include/stringify.h"
+#include "global/global_init.h"
+#include "erasure-code/shec/ErasureCodeShec.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "common/ceph_argparse.h"
+#include "global/global_context.h"
+#include "gtest/gtest.h"
+
+unsigned int count_num = 0;
+unsigned int unexpected_count = 0;
+unsigned int value_count = 0;
+
+map<set<int>,set<set<int> > > shec_table;
+
+int getint(int a, int b) {
+  return ((1 << a) | (1 << b));
+}
+
+int getint(int a, int b, int c) {
+  return ((1 << a) | (1 << b) | (1 << c));
+}
+
+int getint(int a, int b, int c, int d) {
+  return ((1 << a) | (1 << b) | (1 << c) | (1 << d));
+}
+
+void create_table_shec432() {
+  set<int> table_key,vec_avails;
+  set<set<int> > table_value;
+
+  for (int want_count = 0; want_count < 7; ++want_count) {
+    for (int want = 1; want < (1<<7); ++want) {
+      table_key.clear();
+      table_value.clear();
+      if (__builtin_popcount(want) != want_count) {
+        continue;
+      }
+      {
+        for (int i = 0; i < 7; ++i) {
+          if (want & (1 << i)) {
+            table_key.insert(i);
+          }
+        }
+      }
+      vector<int> vec;
+      for (int avails = 0; avails < (1<<7); ++avails) {
+        if (want & avails) {
+          continue;
+        }
+        if (__builtin_popcount(avails) == 2 &&
+            __builtin_popcount(want) == 1) {
+          if ((want | avails) == getint(0,1,5) ||
+              (want | avails) == getint(2,3,6)) {
+            vec.push_back(avails);
+          }
+        }
+      }
+      
+      for (int avails = 0; avails < (1<<7); ++avails) {
+        if (want & avails) {
+          continue;
+        }
+        if (__builtin_popcount(avails) == 4) {
+          if ((avails) == getint(0,1,2,3) ||
+              (avails) == getint(0,1,2,4) ||
+              (avails) == getint(0,1,2,6) ||
+              (avails) == getint(0,1,3,4) ||
+              (avails) == getint(0,1,3,6) ||
+              (avails) == getint(0,1,4,6) ||
+              (avails) == getint(0,2,3,4) ||
+              (avails) == getint(0,2,3,5) ||
+              (avails) == getint(0,2,4,5) ||
+              (avails) == getint(0,2,4,6) ||
+              (avails) == getint(0,2,5,6) ||
+              (avails) == getint(0,3,4,5) ||
+              (avails) == getint(0,3,4,6) ||
+              (avails) == getint(0,3,5,6) ||
+              (avails) == getint(0,4,5,6) ||
+              (avails) == getint(1,2,3,4) ||
+              (avails) == getint(1,2,3,5) ||
+              (avails) == getint(1,2,4,5) ||
+              (avails) == getint(1,2,4,6) ||
+              (avails) == getint(1,2,5,6) ||
+              (avails) == getint(1,3,4,5) ||
+              (avails) == getint(1,3,4,6) ||
+              (avails) == getint(1,3,5,6) ||
+              (avails) == getint(1,4,5,6) ||
+              (avails) == getint(2,3,4,5) ||
+              (avails) == getint(2,4,5,6) ||
+              (avails) == getint(3,4,5,6)) {
+            vec.push_back(avails);
+          }
+        }
+      }
+      for (int i = 0; i < (int)vec.size(); ++i) {
+        for (int j = i + 1; j < (int)vec.size(); ++j) {
+          if ((vec[i] & vec[j]) == vec[i]) {
+            vec.erase(vec.begin() + j);
+            --j;
+          }
+        }
+      }
+      for (int i = 0; i < (int)vec.size(); ++i) {
+        vec_avails.clear();
+        for (int j = 0; j < 7; ++j) {
+          if (vec[i] & (1 << j)) {
+            vec_avails.insert(j);
+          }
+        }
+        table_value.insert(vec_avails);
+      }
+      shec_table.insert(std::make_pair(table_key,table_value));
+    }
+  }
+}
+
+bool search_table_shec432(set<int> want_to_read, set<int> available_chunks) {
+  set<set<int> > tmp;
+  set<int> settmp;
+  bool found;
+
+  tmp = shec_table.find(want_to_read)->second;
+  for (set<set<int> >::iterator itr = tmp.begin();itr != tmp.end(); ++itr) {
+    found = true;
+    value_count = 0;
+    settmp = *itr;
+    for (set<int>::iterator setitr = settmp.begin();setitr != settmp.end(); ++setitr) {
+      if (!available_chunks.count(*setitr)) {
+        found = false;
+      }
+      ++value_count;
+    }
+    if (found) {
+      return true;
+    }
+  }
+  return false;
+}
+
+TEST(ParameterTest, combination_all)
+{
+  int result;
+  unsigned alignment, tail, padded_length;
+  const unsigned int kObjectSize = 128;
+
+  //get profile
+  char* k = (char*)"4";
+  char* m = (char*)"3";
+  char* c = (char*)"2";
+  int i_k = atoi(k);
+  int i_m = atoi(m);
+  int i_c = atoi(c);
+  alignment = i_k * 8 * sizeof(int);
+  tail = kObjectSize % alignment;
+  padded_length = kObjectSize + (tail ? (alignment - tail) : 0);
+  unsigned c_size = padded_length / i_k;
+
+  //init
+  ErasureCodeShecTableCache tcache;
+  ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
+				  tcache,
+				  ErasureCodeShec::MULTIPLE);
+  map < std::string, std::string > *profile = new map<std::string,
+							 std::string>();
+  (*profile)["plugin"] = "shec";
+  (*profile)["technique"] = "";
+  (*profile)["ruleset-failure-domain"] = "osd";
+  (*profile)["k"] = k;
+  (*profile)["m"] = m;
+  (*profile)["c"] = c;
+
+  result = shec->init(*profile, &cerr);
+
+  //check profile
+  EXPECT_EQ(i_k, shec->k);
+  EXPECT_EQ(i_m, shec->m);
+  EXPECT_EQ(i_c, shec->c);
+  EXPECT_EQ(8, shec->w);
+  EXPECT_EQ(ErasureCodeShec::MULTIPLE, shec->technique);
+  EXPECT_STREQ("default", shec->ruleset_root.c_str());
+  EXPECT_STREQ("osd", shec->ruleset_failure_domain.c_str());
+  EXPECT_TRUE(shec->matrix != NULL);
+  EXPECT_EQ(0, result);
+
+  //encode
+  bufferlist in,out1;
+  set<int> want_to_encode;
+  map<int, bufferlist> encoded;
+
+  in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//length = 62
+	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"//124
+	    "0123"//128
+  );
+  for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
+    want_to_encode.insert(i);
+  }
+
+  result = shec->encode(want_to_encode, in, &encoded);
+  EXPECT_EQ(0, result);
+  EXPECT_EQ(i_k+i_m, (int)encoded.size());
+  EXPECT_EQ(c_size, encoded[0].length());
+  //out1 is "encoded"
+  for (unsigned int i = 0; i < encoded.size(); ++i) {
+    out1.append(encoded[i]);
+  }
+  EXPECT_FALSE(out1 == in);
+
+  set<int> want_to_read, available_chunks, minimum_chunks, want_to_read_without_avails;
+  set<int>::iterator itr;
+  int array_want_to_read[shec->get_chunk_count()];
+  int array_available_chunks[shec->get_chunk_count()];
+  int dresult,cmp;
+  map<int, bufferlist> inchunks,decoded;
+  bufferlist usable;
+  unsigned int minimum_count;
+
+  for (unsigned int w1 = 0; w1 <= shec->get_chunk_count(); ++w1) {
+    const unsigned int r1 = w1;		// combination(k+m,r1)
+
+    for (unsigned int i = 0; i < r1; ++i) {
+      array_want_to_read[i] = 1;
+    }
+    for (unsigned int i = r1; i < shec->get_chunk_count(); ++i) {
+      array_want_to_read[i] = 0;
+    }
+
+    for (unsigned w2 = 0; w2 <= shec->get_chunk_count(); ++w2) {
+      const unsigned int r2 = w2;	// combination(k+m,r2)
+
+      for (unsigned int i = 0; i < r2; ++i ) {
+        array_available_chunks[i] = 1;
+      }
+      for (unsigned int i = r2; i < shec->get_chunk_count(); ++i ) {
+        array_available_chunks[i] = 0;
+      }
+
+      do {
+        do {
+          for (unsigned int i = 0; i < shec->get_chunk_count(); ++i) {
+	    if (array_want_to_read[i]) {
+	      want_to_read.insert(i);
+	    }
+            if (array_available_chunks[i]) {
+              available_chunks.insert(i);
+              inchunks.insert(make_pair(i,encoded[i]));
+            }
+          }
+
+          result = shec->minimum_to_decode(want_to_read, available_chunks,
+				           &minimum_chunks);
+          dresult = shec->decode(want_to_read, inchunks, &decoded);
+          ++count_num;
+          minimum_count = 0;
+
+          if (want_to_read.size() == 0) {
+            EXPECT_EQ(0, result);
+	    EXPECT_EQ(0u, minimum_chunks.size());
+            EXPECT_EQ(0, dresult);
+            EXPECT_EQ(0u, decoded.size());
+            EXPECT_EQ(0u, decoded[0].length());
+            if (result != 0 || dresult != 0) {
+              ++unexpected_count;
+            }
+          } else {
+            // want - avail
+            for (itr = want_to_read.begin();itr != want_to_read.end(); ++itr) {
+              if (!available_chunks.count(*itr)) {
+                want_to_read_without_avails.insert(*itr);
+              } else {
+                ++minimum_count;
+              }
+            }
+            
+            if (want_to_read_without_avails.size() == 0) {
+              EXPECT_EQ(0, result);
+	      EXPECT_LT(0u, minimum_chunks.size());
+	      EXPECT_GE(minimum_count, minimum_chunks.size());
+              EXPECT_EQ(0, dresult);
+              EXPECT_NE(0u, decoded.size());
+              for (unsigned int i = 0; i < shec->get_data_chunk_count(); ++i) {
+                if (array_want_to_read[i]) {
+                  usable.clear();
+                  usable.substr_of(in, c_size * i, c_size);
+                  cmp = memcmp(decoded[i].c_str(), usable.c_str(), c_size);
+                  EXPECT_EQ(c_size, decoded[i].length());
+                  EXPECT_EQ(0, cmp);
+                  if (cmp != 0) {
+                    ++unexpected_count;
+                  }
+                }
+              }
+              if (result != 0 || dresult != 0) {
+                ++unexpected_count;
+              }
+            } else if (want_to_read_without_avails.size() > 3) {
+              EXPECT_EQ(-EIO, result);
+	      EXPECT_EQ(0u, minimum_chunks.size());
+              EXPECT_EQ(-1, dresult);
+              EXPECT_EQ(shec->get_chunk_count(), decoded.size());
+              if (result != -EIO || dresult != -1) {
+                ++unexpected_count;
+              }
+            } else {
+              // search
+              if (search_table_shec432(want_to_read_without_avails,available_chunks)) {
+                EXPECT_EQ(0, result);
+	        EXPECT_LT(0u, minimum_chunks.size());
+	        EXPECT_GE(value_count + minimum_count, minimum_chunks.size());
+                EXPECT_EQ(0, dresult);
+                EXPECT_NE(0u, decoded.size());
+                for (unsigned int i = 0; i < shec->get_data_chunk_count(); ++i) {
+                  if (array_want_to_read[i]) {
+                    usable.clear();
+                    usable.substr_of(in, c_size * i, c_size);
+                    cmp = memcmp(decoded[i].c_str(), usable.c_str(), c_size);
+                    EXPECT_EQ(c_size, decoded[i].length());
+                    EXPECT_EQ(0, cmp);
+                    if (cmp != 0) {
+                      ++unexpected_count;
+                      std::cout << "decoded[" << i << "] = " << decoded[i].c_str() << std::endl;
+                      std::cout << "usable = " << usable.c_str() << std::endl;
+                      std::cout << "want_to_read    :" << want_to_read << std::endl;
+                      std::cout << "available_chunks:" << available_chunks << std::endl;
+                      std::cout << "minimum_chunks  :" << minimum_chunks << std::endl;
+                    }
+                  }
+                }
+                if (result != 0 || dresult != 0) {
+                  ++unexpected_count;
+                }
+              } else {
+                EXPECT_EQ(-EIO, result);
+	        EXPECT_EQ(0u, minimum_chunks.size());
+                EXPECT_EQ(-1, dresult);
+                EXPECT_EQ(shec->get_chunk_count(), decoded.size());
+                if (result != -EIO || dresult != -1) {
+                  ++unexpected_count;
+                }
+              }
+            }
+          }
+
+          want_to_read.clear();
+          want_to_read_without_avails.clear();
+          available_chunks.clear();
+          minimum_chunks.clear();
+          inchunks.clear();
+          decoded.clear();
+          usable.clear();
+        } while (std::prev_permutation(
+		   array_want_to_read,
+		   array_want_to_read + shec->get_chunk_count()));
+
+      } while (std::prev_permutation(
+                 array_available_chunks,
+                 array_available_chunks + shec->get_chunk_count()));
+    }
+  }
+
+  delete shec;
+  delete profile;
+}
+
+int main(int argc, char **argv)
+{
+  int r;
+
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **) argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
+  ::testing::InitGoogleTest(&argc, argv);
+
+  create_table_shec432();
+
+  r = RUN_ALL_TESTS();
+
+  std::cout << "minimum_to_decode:total_num = " << count_num
+      << std::endl;
+  std::cout << "minimum_to_decode:unexpected_num = " << unexpected_count
+      << std::endl;
+
+  return r;
+}
diff --git a/src/test/erasure-code/TestErasureCodeShec_thread.cc b/src/test/erasure-code/TestErasureCodeShec_thread.cc
index fdd6bfe..529ec1b 100644
--- a/src/test/erasure-code/TestErasureCodeShec_thread.cc
+++ b/src/test/erasure-code/TestErasureCodeShec_thread.cc
@@ -93,16 +93,17 @@ int main(int argc, char **argv)
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
 
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
 
 void* thread1(void* pParam)
 {
-  TestParam* param = (TestParam*) pParam;
+  TestParam* param = static_cast<TestParam*>(pParam);
 
   time_t start, end;
-  int r;
 
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
 
@@ -136,19 +137,19 @@ void* thread1(void* pParam)
 
   while (kTestSec >= (end - start)) {
     //init
+    int r;
     ErasureCodeShec* shec = new ErasureCodeShecReedSolomonVandermonde(
 				    tcache,
 				    ErasureCodeShec::MULTIPLE);
-    map < std::string, std::string > *parameters = new map<std::string,
-							   std::string>();
-    (*parameters)["plugin"] = "shec";
-    (*parameters)["technique"] = "multiple";
-    (*parameters)["ruleset-failure-domain"] = "osd";
-    (*parameters)["k"] = param->k;
-    (*parameters)["m"] = param->m;
-    (*parameters)["c"] = param->c;
-    (*parameters)["w"] = param->w;
-    r = shec->init(*parameters);
+    ErasureCodeProfile *profile = new ErasureCodeProfile();
+    (*profile)["plugin"] = "shec";
+    (*profile)["technique"] = "multiple";
+    (*profile)["ruleset-failure-domain"] = "osd";
+    (*profile)["k"] = param->k;
+    (*profile)["m"] = param->m;
+    (*profile)["c"] = param->c;
+    (*profile)["w"] = param->w;
+    r = shec->init(*profile, &cerr);
 
     int i_k = std::atoi(param->k.c_str());
     int i_m = std::atoi(param->m.c_str());
@@ -216,7 +217,7 @@ void* thread1(void* pParam)
     }
 
     delete shec;
-    delete parameters;
+    delete profile;
     want_to_encode.clear();
     encoded.clear();
     decoded.clear();
diff --git a/src/test/erasure-code/TestShecPluginGeneric.cc b/src/test/erasure-code/TestShecPluginGeneric.cc
new file mode 100644
index 0000000..0bd063b
--- /dev/null
+++ b/src/test/erasure-code/TestShecPluginGeneric.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ * Copyright (C) 2015 FUJITSU LIMITED
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ * Author: Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
+ * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
+ * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include "ceph_ver.h"
+
+extern "C" const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
+
+extern "C" int __erasure_code_init(char *plugin_name, char *directory)
+{
+  return -111;
+}
diff --git a/src/test/erasure-code/TestShecPluginNEON.cc b/src/test/erasure-code/TestShecPluginNEON.cc
new file mode 100644
index 0000000..373f5eb
--- /dev/null
+++ b/src/test/erasure-code/TestShecPluginNEON.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ * Copyright (C) 2015 FUJITSU LIMITED
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ * Author: Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
+ * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
+ * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "ceph_ver.h"
+
+extern "C" const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
+
+extern "C" int __erasure_code_init(char *plugin_name, char *directory)
+{
+  return -555;
+}
diff --git a/src/test/erasure-code/TestShecPluginSSE3.cc b/src/test/erasure-code/TestShecPluginSSE3.cc
new file mode 100644
index 0000000..220ea7b
--- /dev/null
+++ b/src/test/erasure-code/TestShecPluginSSE3.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ * Copyright (C) 2015 FUJITSU LIMITED
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ * Author: Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
+ * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
+ * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include "ceph_ver.h"
+
+extern "C" const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
+
+extern "C" int __erasure_code_init(char *plugin_name, char *directory)
+{
+  return -333;
+}
diff --git a/src/test/erasure-code/TestShecPluginSSE4.cc b/src/test/erasure-code/TestShecPluginSSE4.cc
new file mode 100644
index 0000000..ef8e841
--- /dev/null
+++ b/src/test/erasure-code/TestShecPluginSSE4.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ * Copyright (C) 2015 FUJITSU LIMITED
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ * Author: Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
+ * Author: Takanori Nakao <nakao.takanori at jp.fujitsu.com>
+ * Author: Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include "ceph_ver.h"
+
+extern "C" const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
+
+extern "C" int __erasure_code_init(char *plugin_name, char *directory)
+{
+  return -444;
+}
diff --git a/src/test/erasure-code/ceph_erasure_code.cc b/src/test/erasure-code/ceph_erasure_code.cc
index c17c8e9..00d4496 100644
--- a/src/test/erasure-code/ceph_erasure_code.cc
+++ b/src/test/erasure-code/ceph_erasure_code.cc
@@ -36,7 +36,7 @@ namespace po = boost::program_options;
 
 class ErasureCodeCommand {
   po::variables_map vm;
-  map<string,string> parameters;
+  ErasureCodeProfile profile;
 public:
   int setup(int argc, char** argv);
   int run();
@@ -88,6 +88,7 @@ int ErasureCodeCommand::setup(int argc, char** argv) {
     CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
   common_init_finish(g_ceph_context);
   g_ceph_context->_conf->apply_changes(NULL);
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
 
   if (vm.count("help")) {
     cout << desc << std::endl;
@@ -105,14 +106,11 @@ int ErasureCodeCommand::setup(int argc, char** argv) {
 	cerr << "--parameter " << *i
 	     << " ignored because it does not contain exactly one =" << endl;
       } else {
-	parameters[strs[0]] = strs[1];
+	profile[strs[0]] = strs[1];
       }
     }
   }
 
-  if (parameters.count("directory") == 0)
-    parameters["directory"] = ".libs";
-
   return 0;
 }
 
@@ -128,7 +126,8 @@ int ErasureCodeCommand::plugin_exists() {
   ErasureCodePlugin *plugin = 0;
   Mutex::Locker l(instance.lock);
   stringstream ss;
-  int code = instance.load(vm["plugin_exists"].as<string>(), parameters["directory"], &plugin, ss);
+  int code = instance.load(vm["plugin_exists"].as<string>(),
+			   g_conf->erasure_code_dir, &plugin, &ss);
   if (code)
     cerr << ss.str() << endl;
   return code;
@@ -138,14 +137,15 @@ int ErasureCodeCommand::display_information() {
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
   ErasureCodeInterfaceRef erasure_code;
 
-  if (parameters.count("plugin") == 0) {
+  if (profile.count("plugin") == 0) {
     cerr << "--parameter plugin=<plugin> is mandatory" << endl;
     return 1;
   }
 
-  int code = instance.factory(parameters["plugin"],
-			      parameters,
-			      &erasure_code, cerr);
+  int code = instance.factory(profile["plugin"],
+			      g_conf->erasure_code_dir,
+			      profile,
+			      &erasure_code, &cerr);
   if (code)
     return code;
 
diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
index 5e73f1f..052d8fe 100644
--- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc
+++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
@@ -52,13 +52,15 @@ int ErasureCodeBench::setup(int argc, char** argv) {
      "run either encode or decode")
     ("erasures,e", po::value<int>()->default_value(1),
      "number of erasures when decoding")
+    ("erased", po::value<vector<int> >(),
+     "erased chunk (repeat if more than one chunk is erased)")
     ("erasures-generation,E", po::value<string>()->default_value("random"),
      "If set to 'random', pick the number of chunks to recover (as specified by "
      " --erasures) at random. If set to 'exhaustive' try all combinations of erasures "
      " (i.e. k=4,m=3 with one erasure will try to recover from the erasure of "
      " the first chunk, then the second etc.)")
     ("parameter,P", po::value<vector<string> >(),
-     "parameters")
+     "add a parameter to the erasure code profile")
     ;
 
   po::variables_map vm;
@@ -85,6 +87,7 @@ int ErasureCodeBench::setup(int argc, char** argv) {
     CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
   common_init_finish(g_ceph_context);
   g_ceph_context->_conf->apply_changes(NULL);
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
 
   if (vm.count("help")) {
     cout << desc << std::endl;
@@ -101,14 +104,11 @@ int ErasureCodeBench::setup(int argc, char** argv) {
       if (strs.size() != 2) {
 	cerr << "--parameter " << *i << " ignored because it does not contain exactly one =" << endl;
       } else {
-	parameters[strs[0]] = strs[1];
+	profile[strs[0]] = strs[1];
       }
     }
   }
 
-  if (parameters.count("directory") == 0)
-    parameters["directory"] = ".libs";
-
   in_size = vm["size"].as<int>();
   max_iterations = vm["iterations"].as<int>();
   plugin = vm["plugin"].as<string>();
@@ -119,9 +119,11 @@ int ErasureCodeBench::setup(int argc, char** argv) {
     exhaustive_erasures = true;
   else
     exhaustive_erasures = false;
+  if (vm.count("erased") > 0)
+    erased = vm["erased"].as<vector<int> >();
 
-  k = atoi(parameters["k"].c_str());
-  m = atoi(parameters["m"].c_str());
+  k = atoi(profile["k"].c_str());
+  m = atoi(profile["m"].c_str());
   
   if (k <= 0) {
     cout << "parameter k is " << k << ". But k needs to be > 0." << endl;
@@ -151,7 +153,9 @@ int ErasureCodeBench::encode()
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
   ErasureCodeInterfaceRef erasure_code;
   stringstream messages;
-  int code = instance.factory(plugin, parameters, &erasure_code, messages);
+  int code = instance.factory(plugin,
+			      g_conf->erasure_code_dir,
+			      profile, &erasure_code, &messages);
   if (code) {
     cerr << messages.str() << endl;
     return code;
@@ -185,6 +189,20 @@ int ErasureCodeBench::encode()
   return 0;
 }
 
+static void display_chunks(const map<int,bufferlist> &chunks,
+			   unsigned int chunk_count) {
+  cout << "chunks ";
+  for (unsigned int chunk = 0; chunk < chunk_count; chunk++) {
+    if (chunks.count(chunk) == 0) {
+      cout << "(" << chunk << ")";
+    } else {
+      cout << " " << chunk << " ";
+    }
+    cout << " ";
+  }
+  cout << "(X) is an erased chunk" << endl;
+}
+
 int ErasureCodeBench::decode_erasures(const map<int,bufferlist> &all_chunks,
 				      const map<int,bufferlist> &chunks,
 				      unsigned i,
@@ -195,22 +213,11 @@ int ErasureCodeBench::decode_erasures(const map<int,bufferlist> &all_chunks,
 
   if (want_erasures == 0) {
     if (verbose)
-      cout << "chunks ";
+      display_chunks(chunks, erasure_code->get_chunk_count());
     set<int> want_to_read;
-    for (unsigned int chunk = 0; chunk < erasure_code->get_chunk_count(); chunk++) {
-      if (chunks.count(chunk) == 0) {
-	if (verbose)
-	  cout << "(" << chunk << ")";
+    for (unsigned int chunk = 0; chunk < erasure_code->get_chunk_count(); chunk++)
+      if (chunks.count(chunk) == 0)
 	want_to_read.insert(chunk);
-      } else {
-	if (verbose)
-	  cout << " " << chunk << " ";
-      }
-      if (verbose)
-	cout << " ";
-    }
-    if (verbose)
-      cout << "(X) is an erased chunk" << endl;
 
     map<int,bufferlist> decoded;
     code = erasure_code->decode(want_to_read, chunks, &decoded);
@@ -250,7 +257,9 @@ int ErasureCodeBench::decode()
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
   ErasureCodeInterfaceRef erasure_code;
   stringstream messages;
-  int code = instance.factory(plugin, parameters, &erasure_code, messages);
+  int code = instance.factory(plugin,
+			      g_conf->erasure_code_dir,
+			      profile, &erasure_code, &messages);
   if (code) {
     cerr << messages.str() << endl;
     return code;
@@ -279,12 +288,25 @@ int ErasureCodeBench::decode()
 
   set<int> want_to_read = want_to_encode;
 
+  if (erased.size() > 0) {
+    for (vector<int>::const_iterator i = erased.begin();
+	 i != erased.end();
+	 ++i)
+      encoded.erase(*i);
+    display_chunks(encoded, erasure_code->get_chunk_count());
+  }
+
   utime_t begin_time = ceph_clock_now(g_ceph_context);
   for (int i = 0; i < max_iterations; i++) {
     if (exhaustive_erasures) {
       code = decode_erasures(encoded, encoded, 0, erasures, erasure_code);
       if (code)
 	return code;
+    } else if (erased.size() > 0) {
+      map<int,bufferlist> decoded;
+      code = erasure_code->decode(want_to_read, encoded, &decoded);
+      if (code)
+	return code;
     } else {
       map<int,bufferlist> chunks = encoded;
       for (int j = 0; j < erasures; j++) {
diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.h b/src/test/erasure-code/ceph_erasure_code_benchmark.h
index 2786e25..c937591 100644
--- a/src/test/erasure-code/ceph_erasure_code_benchmark.h
+++ b/src/test/erasure-code/ceph_erasure_code_benchmark.h
@@ -32,9 +32,10 @@ class ErasureCodeBench {
   string plugin;
 
   bool exhaustive_erasures;
+  vector<int> erased;
   string workload;
 
-  map<string,string> parameters;
+  ErasureCodeProfile profile;
 
   bool verbose;
 public:
diff --git a/src/test/erasure-code/ceph_erasure_code_non_regression.cc b/src/test/erasure-code/ceph_erasure_code_non_regression.cc
index 2dbd4e8..91e251f 100644
--- a/src/test/erasure-code/ceph_erasure_code_non_regression.cc
+++ b/src/test/erasure-code/ceph_erasure_code_non_regression.cc
@@ -3,7 +3,7 @@
 /*
  * Ceph distributed storage system
  *
- * Red Hat (C) 2014 Red Hat <contact at redhat.com>
+ * Red Hat (C) 2014, 2015 Red Hat <contact at redhat.com>
  *
  * Author: Loic Dachary <loic at dachary.org>
  *
@@ -39,12 +39,14 @@ class ErasureCodeNonRegression {
   string plugin;
   bool create;
   bool check;
+  bool show_path;
   string base;
   string directory;
-  map<string,string> parameters;
+  ErasureCodeProfile profile;
 public:
   int setup(int argc, char** argv);
   int run();
+  int run_show_path();
   int run_create();
   int run_check();
   int decode_erasures(ErasureCodeInterfaceRef erasure_code,
@@ -66,7 +68,9 @@ int ErasureCodeNonRegression::setup(int argc, char** argv) {
     ("base", po::value<string>()->default_value("."),
      "prefix all paths with base")
     ("parameter,P", po::value<vector<string> >(),
-     "parameters")
+     "add a parameter to the erasure code profile")
+    ("path", po::value<string>(), "content path instead of inferring it from parameters")
+    ("show-path", "display the content path and exit")
     ("create", "create the erasure coded content in the directory")
     ("check", "check the content in the directory matches the chunks and vice versa")
     ;
@@ -95,6 +99,7 @@ int ErasureCodeNonRegression::setup(int argc, char** argv) {
     CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
   common_init_finish(g_ceph_context);
   g_ceph_context->_conf->apply_changes(NULL);
+  g_conf->set_val("erasure_code_dir", ".libs", false, false);
 
   if (vm.count("help")) {
     cout << desc << std::endl;
@@ -106,15 +111,16 @@ int ErasureCodeNonRegression::setup(int argc, char** argv) {
   base = vm["base"].as<string>();
   check = vm.count("check") > 0;
   create = vm.count("create") > 0;
+  show_path = vm.count("show-path") > 0;
 
-  if (!check && !create) {
-    cerr << "must specifify either --check or --create" << endl;
+  if (!check && !create && !show_path) {
+    cerr << "must specifify either --check, --create or --show-path" << endl;
     return 1;
   }
 
   {
     stringstream path;
-    path << base << "/" << "plugin=" << plugin << " stipe-width=" << stripe_width;
+    path << base << "/" << "plugin=" << plugin << " stripe-width=" << stripe_width;
     directory = path.str();
   }
 
@@ -128,14 +134,14 @@ int ErasureCodeNonRegression::setup(int argc, char** argv) {
       if (strs.size() != 2) {
 	cerr << "--parameter " << *i << " ignored because it does not contain exactly one =" << endl;
       } else {
-	parameters[strs[0]] = strs[1];
+	profile[strs[0]] = strs[1];
       }
-      if (strs[0] != "directory")
-	directory += " " + *i;
+      directory += " " + *i;
     }
   }
-  if (parameters.count("directory") == 0)
-    parameters["directory"] = ".libs";
+
+  if (vm.count("path"))
+    directory = vm["path"].as<string>();
 
   return 0;
 }
@@ -147,15 +153,25 @@ int ErasureCodeNonRegression::run()
     return ret;
   if(check && (ret = run_check()))
     return ret;
+  if(show_path && (ret = run_show_path()))
+    return ret;
   return ret;
 }
 
+int ErasureCodeNonRegression::run_show_path()
+{
+  cout << directory << endl;
+  return 0;
+}
+
 int ErasureCodeNonRegression::run_create()
 {
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
   ErasureCodeInterfaceRef erasure_code;
   stringstream messages;
-  int code = instance.factory(plugin, parameters, &erasure_code, messages);
+  int code = instance.factory(plugin,
+			      g_conf->erasure_code_dir,
+			      profile, &erasure_code, &messages);
   if (code) {
     cerr << messages.str() << endl;
     return code;
@@ -225,7 +241,9 @@ int ErasureCodeNonRegression::run_check()
   ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
   ErasureCodeInterfaceRef erasure_code;
   stringstream messages;
-  int code = instance.factory(plugin, parameters, &erasure_code, messages);
+  int code = instance.factory(plugin,
+			      g_conf->erasure_code_dir,
+			      profile, &erasure_code, &messages);
   if (code) {
     cerr << messages.str() << endl;
     return code;
diff --git a/src/test/erasure-code/test-erasure-code.sh b/src/test/erasure-code/test-erasure-code.sh
index 0409aa6..2840553 100755
--- a/src/test/erasure-code/test-erasure-code.sh
+++ b/src/test/erasure-code/test-erasure-code.sh
@@ -16,37 +16,36 @@
 # GNU Library Public License for more details.
 #
 
-source test/mon/mon-test-helpers.sh
-source test/osd/osd-test-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7101"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
-    CEPH_ARGS+="--enable-experimental-unrecoverable-data-corrupting-features=shec "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
 
     setup $dir || return 1
-    run_mon $dir a --public-addr $CEPH_MON || return 1
+    run_mon $dir a || return 1
     # check that erasure code plugins are preloaded
-    CEPH_ARGS='' ./ceph --admin-daemon $dir/a/ceph-mon.a.asok log flush || return 1
-    grep 'load: jerasure.*lrc' $dir/a/log || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1
     for id in $(seq 0 10) ; do
         run_osd $dir $id || return 1
     done
+    wait_for_clean || return 1
     # check that erasure code plugins are preloaded
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
-    grep 'load: jerasure.*lrc' $dir/osd-0.log || return 1
+    grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1
     create_erasure_coded_pool ecpool || return 1
-    FUNCTIONS=${FUNCTIONS:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
-    for TEST_function in $FUNCTIONS ; do
-        if ! $TEST_function $dir ; then
-            cat $dir/a/log
-            return 1
-        fi
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        $func $dir || return 1
     done
+
     delete_pool ecpool || return 1
     teardown $dir || return 1
 }
@@ -58,6 +57,7 @@ function create_erasure_coded_pool() {
         ruleset-failure-domain=osd || return 1
     ./ceph osd pool create $poolname 12 12 erasure myprofile \
         || return 1
+    wait_for_clean || return 1
 }
 
 function delete_pool() {
@@ -69,6 +69,8 @@ function delete_pool() {
 function rados_put_get() {
     local dir=$1
     local poolname=$2
+    local objname=${3:-SOMETHING}
+
 
     for marker in AAA BBB CCCC DDDD ; do
         printf "%*s" 1024 $marker
@@ -77,41 +79,77 @@ function rados_put_get() {
     #
     # get and put an object, compare they are equal
     #
-    ./rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1
-    ./rados --pool $poolname get SOMETHING $dir/COPY || return 1
+    ./rados --pool $poolname put $objname $dir/ORIGINAL || return 1
+    ./rados --pool $poolname get $objname $dir/COPY || return 1
     diff $dir/ORIGINAL $dir/COPY || return 1
     rm $dir/COPY
 
     #
-    # take out the first OSD used to store the object and
+    # take out an OSD used to store the object and
     # check the object can still be retrieved, which implies
     # recovery
     #
-    local -a initial_osds=($(get_osds $poolname SOMETHING))
+    local -a initial_osds=($(get_osds $poolname $objname))
     local last=$((${#initial_osds[@]} - 1))
     ./ceph osd out ${initial_osds[$last]} || return 1
-    ! get_osds $poolname SOMETHING | grep '\<'${initial_osds[$last]}'\>' || return 1
-    ./rados --pool $poolname get SOMETHING $dir/COPY || return 1
+    ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1
+    ./rados --pool $poolname get $objname $dir/COPY || return 1
     diff $dir/ORIGINAL $dir/COPY || return 1
     ./ceph osd in ${initial_osds[$last]} || return 1
 
     rm $dir/ORIGINAL
 }
 
-function plugin_exists() {
-    local plugin=$1
-
-    local status
-    ./ceph osd erasure-code-profile set TESTPROFILE plugin=$plugin
-    if ./ceph osd crush rule create-erasure TESTRULE TESTPROFILE 2>&1 |
-        grep "$plugin.*No such file" ; then
-        status=1
-    else
-        ./ceph osd crush rule rm TESTRULE
-        status=0
-    fi
-    ./ceph osd erasure-code-profile rm TESTPROFILE 
-    return $status
+function rados_osds_out_in() {
+    local dir=$1
+    local poolname=$2
+    local objname=${3:-SOMETHING}
+
+
+    for marker in FFFF GGGG HHHH IIII ; do
+        printf "%*s" 1024 $marker
+    done > $dir/ORIGINAL
+
+    #
+    # get and put an object, compare they are equal
+    #
+    ./rados --pool $poolname put $objname $dir/ORIGINAL || return 1
+    ./rados --pool $poolname get $objname $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+    rm $dir/COPY
+
+    #
+    # take out two OSDs used to store the object, wait for the cluster
+    # to be clean (i.e. all PG are clean and active) again which
+    # implies the PG have been moved to use the remaining OSDs.  Check
+    # the object can still be retrieved.
+    #
+    wait_for_clean || return 1
+    local osds_list=$(get_osds $poolname $objname)
+    local -a osds=($osds_list)
+    for osd in 0 1 ; do
+        ./ceph osd out ${osds[$osd]} || return 1
+    done
+    wait_for_clean || return 1
+    #
+    # verify the object is no longer mapped to the osds that are out
+    #
+    for osd in 0 1 ; do
+        ! get_osds $poolname $objname | grep '\<'${osds[$osd]}'\>' || return 1
+    done
+    ./rados --pool $poolname get $objname $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+    #
+    # bring the osds back in, , wait for the cluster
+    # to be clean (i.e. all PG are clean and active) again which
+    # implies the PG go back to using the same osds as before
+    #
+    for osd in 0 1 ; do
+        ./ceph osd in ${osds[$osd]} || return 1
+    done
+    wait_for_clean || return 1
+    test "$osds_list" = "$(get_osds $poolname $objname)" || return 1
+    rm $dir/ORIGINAL
 }
 
 function TEST_rados_put_get_lrc_advanced() {
@@ -152,7 +190,7 @@ function TEST_rados_put_get_lrc_kml() {
 }
 
 function TEST_rados_put_get_isa() {
-    if ! plugin_exists isa ; then
+    if ! erasure_code_plugin_exists isa ; then
         echo "SKIP because plugin isa has not been built"
         return 0
     fi
@@ -162,7 +200,7 @@ function TEST_rados_put_get_isa() {
     ./ceph osd erasure-code-profile set profile-isa \
         plugin=isa \
         ruleset-failure-domain=osd || return 1
-    ./ceph osd pool create $poolname 12 12 erasure profile-isa \
+    ./ceph osd pool create $poolname 1 1 erasure profile-isa \
         || return 1
 
     rados_put_get $dir $poolname || return 1
@@ -186,6 +224,7 @@ function TEST_rados_put_get_jerasure() {
         || return 1
 
     rados_put_get $dir $poolname || return 1
+    rados_osds_out_in $dir $poolname || return 1
 
     delete_pool $poolname
     ./ceph osd erasure-code-profile rm $profile
@@ -292,7 +331,7 @@ function TEST_chunk_mapping() {
     ./ceph osd erasure-code-profile rm remap-profile
 }
 
-main test-erasure-code
+main test-erasure-code "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-code.sh"
diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh
new file mode 100755
index 0000000..129d09b
--- /dev/null
+++ b/src/test/erasure-code/test-erasure-eio.sh
@@ -0,0 +1,354 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact at redhat.com>
+#
+#
+# Author: Kefu Chai <kchai at redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+source ../qa/workunits/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7112"
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        run_mon $dir a || return 1
+        # check that erasure code plugins are preloaded
+        CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+        grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function setup_osds() {
+    local subread=$1
+
+    for id in $(seq 0 3) ; do
+        # TODO: the feature of "osd-pool-erasure-code-subread-all" is not yet supported.
+        if [ -n "$osd_pool_erasure_code_subread_all__is_supported" ]; then
+            run_osd $dir $id "--osd-pool-erasure-code-subread-all=$subread" || return 1
+        else
+            run_osd $dir $id || return 1
+        fi
+    done
+    wait_for_clean || return 1
+
+    # check that erasure code plugins are preloaded
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
+    grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1
+}
+
+function create_erasure_coded_pool() {
+    local poolname=$1
+
+    ./ceph osd erasure-code-profile set myprofile \
+        plugin=jerasure \
+        k=2 m=1 \
+        ruleset-failure-domain=osd || return 1
+    ./ceph osd pool create $poolname 1 1 erasure myprofile \
+        || return 1
+    wait_for_clean || return 1
+}
+
+function delete_pool() {
+    local poolname=$1
+
+    ./ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it
+    ./ceph osd erasure-code-profile rm myprofile
+}
+
+function rados_put() {
+    local dir=$1
+    local poolname=$2
+    local objname=${3:-SOMETHING}
+
+    for marker in AAA BBB CCCC DDDD ; do
+        printf "%*s" 1024 $marker
+    done > $dir/ORIGINAL
+    #
+    # get and put an object, compare they are equal
+    #
+    ./rados --pool $poolname put $objname $dir/ORIGINAL || return 1
+}
+
+function rados_get() {
+    local dir=$1
+    local poolname=$2
+    local objname=${3:-SOMETHING}
+    local expect=${4:-0}
+
+    #
+    # Expect a failure to get object
+    #
+    if [ $expect = "1" ];
+    then
+        ! ./rados --pool $poolname get $objname $dir/COPY
+        return
+    fi
+    #
+    # get an object, compare with $dir/ORIGINAL
+    #
+    ./rados --pool $poolname get $objname $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+    rm $dir/COPY
+}
+
+function rados_put_get() {
+    local dir=$1
+    local poolname=$2
+    local objname=${3:-SOMETHING}
+    local expect=${4:-0}
+    local recovery=$5
+
+    #
+    # get and put an object, compare they are equal
+    #
+    rados_put $dir $poolname $objname || return 1
+    rados_get $dir $poolname $objname $expect || return 1
+
+    if [ -n "$recovery" ];
+    then
+        #
+        # take out the first OSD used to store the object and
+        # check the object can still be retrieved, which implies
+        # recovery
+        #
+        local -a initial_osds=($(get_osds $poolname $objname))
+        local last=$((${#initial_osds[@]} - 1))
+        ./ceph osd out ${initial_osds[$last]} || return 1
+        ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1
+        rados_get $dir $poolname $objname $expect || return 1
+        ./ceph osd in ${initial_osds[$last]} || return 1
+    fi
+
+    rm $dir/ORIGINAL
+}
+
+function inject_eio() {
+    local objname=$1
+    shift
+    local dir=$1
+    shift
+    local shard_id=$1
+    shift
+
+    local poolname=pool-jerasure
+    local -a initial_osds=($(get_osds $poolname $objname))
+    local osd_id=${initial_osds[$shard_id]}
+    set_config osd $osd_id filestore_debug_inject_read_err true || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.$osd_id.asok \
+             injectdataerr $poolname $objname $shard_id || return 1
+}
+
+function rados_get_data_eio() {
+    local dir=$1
+    shift
+    local shard_id=$1
+    shift
+    local recovery=$1
+    shift
+
+    # inject eio to the specified shard
+    #
+    local poolname=pool-jerasure
+    local objname=obj-eio-$$-$shard_id
+    inject_eio $objname $dir $shard_id || return 1
+    rados_put_get $dir $poolname $objname 0 $recovery || return 1
+
+    shard_id=$(expr $shard_id + 1)
+    inject_eio $objname $dir $shard_id || return 1
+    rados_get $dir $poolname $objname 1 || return 1
+}
+
+# Change the size of the specified shard
+#
+function set_size() {
+    local objname=$1
+    shift
+    local dir=$1
+    shift
+    local shard_id=$1
+    shift
+    local bytes=$1
+    shift
+    local mode=${1}
+
+    local poolname=pool-jerasure
+    local -a initial_osds=($(get_osds $poolname $objname))
+    local osd_id=${initial_osds[$shard_id]}
+    if [ "$mode" = "add" ];
+    then
+      objectstore_tool $dir $osd_id $objname get-bytes $dir/CORRUPT || return 1
+      dd if=/dev/urandom bs=$bytes count=1 >> $dir/CORRUPT
+    elif [ "$bytes" = "0" ];
+    then
+      touch $dir/CORRUPT
+    else
+      dd if=/dev/urandom bs=$bytes count=1 of=$dir/CORRUPT
+    fi
+    objectstore_tool $dir $osd_id --op list $objname
+    objectstore_tool $dir $osd_id $objname set-bytes $dir/CORRUPT || return 1
+    rm -f $dir/CORRUPT
+}
+
+function rados_get_data_bad_size() {
+    local dir=$1
+    shift
+    local shard_id=$1
+    shift
+    local bytes=$1
+    shift
+    local mode=${1:-set}
+
+    local poolname=pool-jerasure
+    local objname=obj-size-$$-$shard_id-$bytes
+    rados_put $dir $poolname $objname || return 1
+
+    # Change the size of the specified shard
+    #
+    set_size $objname $dir $shard_id $bytes $mode || return 1
+
+    rados_get $dir $poolname $objname 0 || return 1
+
+    # Leave objname and modify another shard
+    shard_id=$(expr $shard_id + 1)
+    set_size $objname $dir $shard_id $bytes $mode || return 1
+    rados_get $dir $poolname $objname 1 || return 1
+}
+
+#
+# These two test cases try to validate the following behavior:
+#  For an object on an EC pool, if one shard has a read error (
+#  either primary or replica), the client can still read the object.
+#
+# If 2 shards have read errors the client will get an error.
+#
+function TEST_rados_get_subread_eio_shard_0() {
+    local dir=$1
+    setup_osds false || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # inject eio on primary OSD (0) and replica OSD (1)
+    local shard_id=0
+    rados_get_data_eio $dir $shard_id || return 1
+    delete_pool $poolname
+}
+
+function TEST_rados_get_subread_eio_shard_1() {
+    local dir=$1
+    setup_osds false || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # inject eio into replicas OSD (1) and OSD (2)
+    local shard_id=1
+    rados_get_data_eio $dir $shard_id || return 1
+    delete_pool $poolname
+}
+
+#
+# These two test cases try to validate the following behavior:
+#  For an object on an EC pool, if one shard has an incorrect size
+# this causes an internal read error; the client can still read the object.
+#
+# If 2 shards have incorrect size the client will get an error.
+#
+function TEST_rados_get_bad_size_shard_0() {
+    local dir=$1
+    setup_osds false || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # Set incorrect size into primary OSD (0) and replica OSD (1)
+    local shard_id=0
+    rados_get_data_bad_size $dir $shard_id 10 || return 1
+    rados_get_data_bad_size $dir $shard_id 0 || return 1
+    rados_get_data_bad_size $dir $shard_id 256 add || return 1
+    delete_pool $poolname
+}
+
+function TEST_rados_get_bad_size_shard_1() {
+    local dir=$1
+    setup_osds false || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # Set incorrect size into replicas OSD (1) and OSD (2)
+    local shard_id=1
+    rados_get_data_bad_size $dir $shard_id 10 || return 1
+    rados_get_data_bad_size $dir $shard_id 0 || return 1
+    rados_get_data_bad_size $dir $shard_id 256 add || return 1
+    delete_pool $poolname
+}
+
+: <<'DISABLED_TESTS'
+# this test case aims to test the fix of https://github.com/ceph/ceph/pull/2952
+# this test case can test both client read and recovery read on EIO
+# but at this moment, above pull request ONLY resolves client read on EIO
+# so this case will fail at function *rados_put_get* when one OSD out
+# so disable this case for now until both crashes of client read and recovery read
+# on EIO to be fixed
+#
+
+function TEST_rados_get_with_subreadall_eio_shard_0() {
+    local dir=$1
+    local shard_id=0
+
+    setup_osds true || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # inject eio on primary OSD (0)
+    local shard_id=0
+    rados_get_data_eio $dir $shard_id recovery || return 1
+
+    check_pg_status $pg "inconsistent" || return 1
+    delete_pool $poolname
+}
+
+function TEST_rados_get_with_subreadall_eio_shard_1() {
+    local dir=$1
+    local shard_id=0
+
+    setup_osds true || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname || return 1
+    # inject eio on replica OSD (1)
+    local shard_id=1
+    rados_get_data_eio $dir $shard_id recovery || return 1
+
+    # the reason to skip this check when current shardid != 0 is that the first
+    # k chunks returned is not always containing current shardid, so this pg may
+    # not be marked as inconsistent. However, primary OSD (when shard_id == 0) is
+    # always the faster one normally, so we can check pg status.
+    ## check_pg_status $pg "inconsistent" || return 1
+    delete_pool $poolname
+}
+DISABLED_TESTS
+
+main test-erasure-eio "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-eio.sh"
+# End:
diff --git a/src/test/fedora-21/Dockerfile.in b/src/test/fedora-21/Dockerfile.in
new file mode 100644
index 0000000..9b606ea
--- /dev/null
+++ b/src/test/fedora-21/Dockerfile.in
@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2015 Red Hat <contact at redhat.com>
+#
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+# Environment variables are substituted via envsubst(1)
+#
+# user_id=$(id -u)
+# os_version= the desired REPOSITORY TAG
+#
+FROM fedora:%%os_version%%
+COPY install-deps.sh /root/
+COPY ceph.spec.in /root/
+# build dependencies
+RUN yum install -y which ; cd /root ; ./install-deps.sh
+# development tools
+# nc is required to run make check on firefly only (giant+ do not use nc)
+RUN yum install -y ccache valgrind gdb git python-virtualenv gdisk kpartx hdparm jq sudo xmlstarlet parted nc
+RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
diff --git a/src/test/fedora-21/ceph.spec.in b/src/test/fedora-21/ceph.spec.in
new file mode 100644
index 0000000..8f2a6fc
--- /dev/null
+++ b/src/test/fedora-21/ceph.spec.in
@@ -0,0 +1,1317 @@
+# vim: set noexpandtab ts=8 sw=8 :
+%bcond_with ocf
+%bcond_without cephfs_java
+%bcond_with tests
+%bcond_without tcmalloc
+%bcond_without libs_compat
+%bcond_with lowmem_builder
+%if 0%{?fedora} || 0%{?rhel}
+%bcond_without selinux
+%endif
+%if 0%{?suse_version}
+%bcond_with selinux
+%endif
+
+
+%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
+%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
+%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
+%endif
+
+%if %{with selinux}
+# get selinux policy version
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null || echo 0.0.0)}
+
+%define relabel_files() \
+restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \
+restorecon -R /usr/bin/radosgw > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \
+restorecon -R /var/run/ceph > /dev/null 2>&1; \
+restorecon -R /var/lib/ceph > /dev/null 2>&1; \
+restorecon -R /var/log/ceph > /dev/null 2>&1;
+%endif
+
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
+# Use systemd files on RHEL 7 and above and in SUSE/openSUSE.
+# Note: We don't install unit files for the services yet. For now,
+# the _with_systemd variable only implies that we'll install
+# /etc/tmpfiles.d/ceph.conf in order to set up the socket directory in
+# /var/run/ceph.
+%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
+%global _with_systemd 1
+%endif
+
+# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
+%global _with_lttng 1
+%endif
+
+#################################################################################
+# common
+#################################################################################
+Name:		ceph
+Version:	@VERSION@
+Release:	@RPM_RELEASE@%{?dist}
+Epoch:		1
+Summary:	User space components of the Ceph file system
+License:	LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
+URL:		http://ceph.com/
+Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
+%if 0%{?fedora} || 0%{?rhel}
+Patch0:		init-ceph.in-fedora.patch
+%endif
+#################################################################################
+# dependencies that apply across all distro families
+#################################################################################
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Requires:	python-rbd = %{epoch}:%{version}-%{release}
+Requires:	python-cephfs = %{epoch}:%{version}-%{release}
+Requires:	python
+Requires:	python-requests
+Requires:	grep
+Requires:	xfsprogs
+Requires:	logrotate
+Requires:	parted
+Requires:	util-linux
+Requires:	hdparm
+Requires:	cryptsetup
+Requires:	findutils
+Requires:	which
+Requires(post):	binutils
+%if 0%{with cephfs_java}
+BuildRequires:	java-devel
+BuildRequires:	sharutils
+%endif
+%if 0%{with selinux}
+BuildRequires:	checkpolicy
+BuildRequires:	selinux-policy-devel
+BuildRequires:	/usr/share/selinux/devel/policyhelp
+%endif
+BuildRequires:	gcc-c++
+BuildRequires:	boost-devel
+BuildRequires:  cmake
+BuildRequires:	cryptsetup
+BuildRequires:	fuse-devel
+BuildRequires:	gdbm
+BuildRequires:	hdparm
+BuildRequires:	leveldb-devel > 1.2
+BuildRequires:	libaio-devel
+BuildRequires:	libcurl-devel
+BuildRequires:	libedit-devel
+BuildRequires:	libxml2-devel
+BuildRequires:	libblkid-devel >= 2.17
+BuildRequires:	libudev-devel
+BuildRequires:	libtool
+BuildRequires:	make
+BuildRequires:	parted
+BuildRequires:	perl
+BuildRequires:	pkgconfig
+BuildRequires:	python
+BuildRequires:	python-nose
+BuildRequires:	python-requests
+BuildRequires:	python-virtualenv
+BuildRequires:	snappy-devel
+BuildRequires:	util-linux
+BuildRequires:	xfsprogs
+BuildRequires:	xfsprogs-devel
+BuildRequires:	xmlstarlet
+BuildRequires:	yasm
+
+#################################################################################
+# distro-conditional dependencies
+#################################################################################
+%if 0%{?suse_version}
+%if 0%{?_with_systemd}
+BuildRequires:  pkgconfig(systemd)
+BuildRequires:	systemd-rpm-macros
+%{?systemd_requires}
+%endif
+PreReq:		%fillup_prereq
+Requires:	python-Flask
+BuildRequires:	net-tools
+BuildRequires:	libbz2-devel
+%if 0%{?suse_version} > 1210
+Requires:	gptfdisk
+%if 0%{with tcmalloc}
+BuildRequires:	gperftools-devel
+%endif
+%else
+Requires:	scsirastools
+BuildRequires:	google-perftools-devel
+%endif
+BuildRequires:	mozilla-nss-devel
+BuildRequires:	keyutils-devel
+BuildRequires:	libatomic-ops-devel
+%else
+%if 0%{?_with_systemd}
+Requires:	systemd
+%endif
+BuildRequires:  bzip2-devel
+BuildRequires:	nss-devel
+BuildRequires:	keyutils-libs-devel
+BuildRequires:	libatomic_ops-devel
+Requires:	gdisk
+Requires(post):	chkconfig
+Requires(preun):	chkconfig
+Requires(preun):	initscripts
+BuildRequires:	gperftools-devel
+Requires:	python-flask
+%endif
+# boost
+%if 0%{?fedora} || 0%{?rhel} 
+BuildRequires:  boost-random
+%endif
+# python-argparse for distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+BuildRequires:	python-argparse
+%endif
+# lttng and babeltrace for rbd-replay-prep
+%if 0%{?_with_lttng}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires:	lttng-ust-devel
+BuildRequires:	libbabeltrace-devel
+%endif
+%if 0%{?suse_version}
+BuildRequires:	lttng-ust-devel
+BuildRequires:  babeltrace-devel
+%endif
+%endif
+# expat and fastcgi for RGW
+%if 0%{?suse_version}
+BuildRequires:	libexpat-devel
+BuildRequires:	FastCGI-devel
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+BuildRequires:	expat-devel
+BuildRequires:	fcgi-devel
+%endif
+# python-sphinx
+%if 0%{?rhel} > 0 && 0%{?rhel} < 7
+BuildRequires:	python-sphinx10
+%endif
+%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7
+BuildRequires:	python-sphinx
+%endif
+
+%description
+Ceph is a massively scalable, open-source, distributed storage system that runs
+on commodity hardware and delivers object, block and file system storage.
+
+
+#################################################################################
+# packages
+#################################################################################
+%package -n ceph-common
+Summary:	Ceph Common
+Group:		System Environment/Base
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Requires:	python-rbd = %{epoch}:%{version}-%{release}
+Requires:	python-cephfs = %{epoch}:%{version}-%{release}
+Requires:	python-requests
+%if 0%{?_with_systemd}
+%{?systemd_requires}
+%endif
+%if 0%{?suse_version}
+Requires(pre):	pwdutils
+%endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+%endif
+%description -n ceph-common
+Common utilities to mount and interact with a ceph storage cluster.
+
+%package fuse
+Summary:	Ceph fuse-based client
+Group:		System Environment/Base
+Requires:	%{name}
+%description fuse
+FUSE based client for Ceph distributed network file system
+
+%package -n rbd-fuse
+Summary:	Ceph fuse-based client
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-fuse
+FUSE based client to map Ceph rbd images to files
+
+%package radosgw
+Summary:	Rados REST gateway
+Group:		Development/Libraries
+Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+%if 0%{?rhel} || 0%{?fedora}
+Requires:	mailcap
+%endif
+%description radosgw
+This package is an S3 HTTP REST gateway for the RADOS object store. It
+is implemented as a FastCGI module using libfcgi, and can be used in
+conjunction with any FastCGI capable web server.
+
+%if %{with ocf}
+%package resource-agents
+Summary:	OCF-compliant resource agents for Ceph daemons
+Group:		System Environment/Base
+License:	LGPL-2.0
+Requires:	%{name} = %{epoch}:%{version}
+Requires:	resource-agents
+%description resource-agents
+Resource agents for monitoring and managing Ceph daemons
+under Open Cluster Framework (OCF) compliant resource
+managers such as Pacemaker.
+%endif
+
+%package -n librados2
+Summary:	RADOS distributed object store client library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+%if 0%{?rhel} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
+%endif
+%description -n librados2
+RADOS is a reliable, autonomic distributed object storage cluster
+developed as part of the Ceph distributed storage system. This is a
+shared library allowing applications to access the distributed object
+store using a simple file-like interface.
+
+%package -n librados2-devel
+Summary:	RADOS headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n librados2-devel
+This package contains libraries and headers needed to develop programs
+that use RADOS object store.
+
+%package -n python-rados
+Summary:	Python libraries for the RADOS object store
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-rados
+This package contains Python libraries for interacting with Cephs RADOS
+object store.
+
+%package -n libradosstriper1
+Summary:	RADOS striping interface
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+%description -n libradosstriper1
+Striping interface built on top of the rados library, allowing
+to stripe bigger objects onto several standard rados objects using
+an interface very similar to the rados one.
+
+%package -n libradosstriper1-devel
+Summary:	RADOS striping interface headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	libradosstriper1 = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libradosstriper1-devel
+This package contains libraries and headers needed to develop programs
+that use RADOS striping interface.
+
+%package -n librbd1
+Summary:	RADOS block device client library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+%if 0%{?rhel} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
+%endif
+%description -n librbd1
+RBD is a block device striped across multiple distributed objects in
+RADOS, a reliable, autonomic distributed object storage cluster
+developed as part of the Ceph distributed storage system. This is a
+shared library allowing applications to manage these block devices.
+
+%package -n librbd1-devel
+Summary:	RADOS block device headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n librbd1-devel
+This package contains libraries and headers needed to develop programs
+that use RADOS block device.
+
+%package -n python-rbd
+Summary:	Python libraries for the RADOS block device
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-rbd
+This package contains Python libraries for interacting with Cephs RADOS
+block device.
+
+%package -n libcephfs1
+Summary:	Ceph distributed file system client library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+%if 0%{?rhel} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-libcephfs
+%endif
+%description -n libcephfs1
+Ceph is a distributed network file system designed to provide excellent
+performance, reliability, and scalability. This is a shared library
+allowing applications to access a Ceph distributed file system via a
+POSIX-like interface.
+
+%package -n libcephfs1-devel
+Summary:	Ceph distributed file system headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libcephfs1-devel
+This package contains libraries and headers needed to develop programs
+that use Cephs distributed file system.
+
+%package -n python-cephfs
+Summary:	Python libraries for Ceph distributed file system
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-cephfs
+This package contains Python libraries for interacting with Cephs distributed
+file system.
+
+%package -n ceph-test
+Summary:	Ceph benchmarks and test tools
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	ceph-common
+Requires:	xmlstarlet
+%description -n ceph-test
+This package contains Ceph benchmarks and test tools.
+
+%if 0%{with cephfs_java}
+
+%package -n libcephfs_jni1
+Summary:	Java Native Interface library for CephFS Java bindings
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	java
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+%description -n libcephfs_jni1
+This package contains the Java Native Interface library for CephFS Java
+bindings.
+
+%package -n libcephfs_jni1-devel
+Summary:	Development files for CephFS Java Native Interface library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	java
+Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libcephfs_jni1-devel
+This package contains the development files for CephFS Java Native Interface
+library.
+
+%package -n cephfs-java
+Summary:	Java libraries for the Ceph File System
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	java
+Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
+%if 0%{?el6}
+Requires:	junit4
+BuildRequires:	junit4
+%else
+Requires:       junit
+BuildRequires:  junit
+%endif
+%description -n cephfs-java
+This package contains the Java libraries for the Ceph File System.
+
+%endif
+
+%if 0%{with selinux}
+
+%package selinux
+Summary:	SELinux support for Ceph MON, OSD and MDS
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	policycoreutils, libselinux-utils
+Requires(post): selinux-policy-base >= %{_selinux_policy_version}, policycoreutils, gawk
+Requires(postun): policycoreutils
+%description selinux
+This package contains SELinux support for Ceph MON, OSD and MDS. The package
+also performs file-system relabelling which can take a long time on heavily
+populated file-systems.
+
+%endif
+
+%if 0%{with libs_compat}
+
+%package libs-compat
+Summary:	Meta package to include ceph libraries
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Obsoletes:	ceph-libs
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Provides:	ceph-libs
+
+%description libs-compat
+This is a meta package, that pulls in librados2, librbd1 and libcephfs1. It
+is included for backwards compatibility with distributions that depend on the
+former ceph-libs package, which is now split up into these three subpackages.
+Packages still depending on ceph-libs should be fixed to depend on librados2,
+librbd1 or libcephfs1 instead.
+
+%endif
+
+%package devel-compat
+Summary:	Compatibility package for Ceph headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Obsoletes:	ceph-devel
+Requires:	%{name} = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Requires:	libradosstriper1-devel = %{epoch}:%{version}-%{release}
+Requires:	librbd1-devel = %{epoch}:%{version}-%{release}
+Requires:	libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
+Requires:	libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
+Provides:	ceph-devel
+%description devel-compat
+This is a compatibility package to accommodate ceph-devel split into
+librados2-devel, librbd1-devel and libcephfs1-devel. Packages still depending
+on ceph-devel should be fixed to depend on librados2-devel, librbd1-devel,
+libcephfs1-devel or libradosstriper1-devel instead.
+
+%package -n python-ceph-compat
+Summary:	Compatibility package for Cephs python libraries
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Obsoletes:	python-ceph
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Requires:	python-rbd = %{epoch}:%{version}-%{release}
+Requires:	python-cephfs = %{epoch}:%{version}-%{release}
+Provides:	python-ceph
+%description -n python-ceph-compat
+This is a compatibility package to accommodate python-ceph split into
+python-rados, python-rbd and python-cephfs. Packages still depending on
+python-ceph should be fixed to depend on python-rados, python-rbd or
+python-cephfs instead.
+
+#################################################################################
+# common
+#################################################################################
+%prep
+%setup -q
+%if 0%{?fedora} || 0%{?rhel}
+%patch0 -p1 -b .init
+%endif
+
+%build
+%if 0%{with cephfs_java}
+# Find jni.h
+for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
+    [ -d $i ] && java_inc="$java_inc -I$i"
+done
+%endif
+
+./autogen.sh
+
+%if %{with lowmem_builder}
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS --param ggc-min-expand=20 --param ggc-min-heapsize=32768"
+%endif
+export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
+
+%{configure}	CPPFLAGS="$java_inc" \
+		--prefix=/usr \
+		--localstatedir=/var \
+		--sysconfdir=/etc \
+%if 0%{?_with_systemd}
+		--with-systemdsystemunitdir=%_unitdir \
+%endif
+		--docdir=%{_docdir}/ceph \
+		--with-man-pages \
+		--mandir="%_mandir" \
+		--with-nss \
+		--without-cryptopp \
+		--with-debug \
+%if 0%{with cephfs_java}
+		--enable-cephfs-java \
+%endif
+%if 0%{with selinux}
+		--with-selinux \
+%endif
+		--with-librocksdb-static=check \
+%if 0%{?rhel} || 0%{?fedora}
+		--with-systemd-libexec-dir=/usr/libexec/ceph \
+		--with-rgw-user=root \
+		--with-rgw-group=root \
+%endif
+%if 0%{?suse_version}
+		--with-systemd-libexec-dir=/usr/lib/ceph/ \
+		--with-rgw-user=wwwrun \
+		--with-rgw-group=www \
+%endif
+		--with-radosgw \
+		$CEPH_EXTRA_CONFIGURE_ARGS \
+		%{?_with_ocf} \
+		%{?_with_tcmalloc} \
+		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
+
+
+make %{?_smp_mflags}
+
+
+%if 0%{with tests}
+%check
+# run in-tree unittests
+make %{?_smp_mflags} check-local
+
+%endif
+
+
+
+%install
+make DESTDIR=$RPM_BUILD_ROOT install
+find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
+find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
+install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
+install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+%if 0%{?fedora} || 0%{?rhel}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillup-templates/sysconfig.%{name}
+%endif
+%if 0%{?_with_systemd}
+  install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
+  install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
+  install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
+  install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
+  install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
+  install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
+  install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
+%else
+  install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
+  install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+  ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
+  ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+%endif
+mkdir -p $RPM_BUILD_ROOT%{_sbindir}
+install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
+chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
+chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
+
+# firewall templates
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-mon %{buildroot}%{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
+install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildroot}%{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
+%endif
+
+# udev rules
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+
+%if (0%{?rhel} && 0%{?rhel} < 7)
+install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
+%else
+install -m 0644 -D udev/95-ceph-osd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
+%endif
+
+%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version}
+mv $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/95-ceph-osd.rules
+mv $RPM_BUILD_ROOT/sbin/mount.ceph $RPM_BUILD_ROOT/usr/sbin/mount.ceph
+mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph
+%endif
+
+#set up placeholder directories
+mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph
+%if ! 0%{?_with_systemd}
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph
+%endif
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mon
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/osd
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%pre
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    # service_add_pre and friends don't work with parameterized systemd service
+    # instances, only with single services or targets, so we always pass
+    # ceph.target to these macros
+    %service_add_pre ceph.target
+  %endif
+%endif
+
+
+%post
+/sbin/ldconfig
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %fillup_only
+    %service_add_post ceph.target
+  %endif
+%else
+  /sbin/chkconfig --add ceph
+%endif
+
+%preun
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %service_del_preun ceph.target
+  %endif
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%else
+  %if 0%{?rhel} || 0%{?fedora}
+    if [ $1 = 0 ] ; then
+      /sbin/service ceph stop >/dev/null 2>&1
+      /sbin/chkconfig --del ceph
+    fi
+  %endif
+%endif
+
+%postun
+/sbin/ldconfig
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
+%endif
+
+#################################################################################
+# files
+#################################################################################
+%files
+%defattr(-,root,root,-)
+%docdir %{_docdir}
+%dir %{_docdir}/ceph
+%{_docdir}/ceph/sample.ceph.conf
+%{_docdir}/ceph/sample.fetch_config
+%{_bindir}/cephfs
+%{_bindir}/ceph-clsinfo
+%{_bindir}/ceph-rest-api
+%{python_sitelib}/ceph_rest_api.py*
+%{_bindir}/crushtool
+%{_bindir}/monmaptool
+%{_bindir}/osdmaptool
+%{_bindir}/ceph-run
+%{_bindir}/ceph-mon
+%{_bindir}/ceph-mds
+%{_bindir}/ceph-objectstore-tool
+%{_bindir}/ceph-osd
+%{_bindir}/ceph-detect-init
+%{_bindir}/librados-config
+%{_bindir}/ceph-client-debug
+%{_bindir}/cephfs-journal-tool
+%{_bindir}/cephfs-table-tool
+%{_bindir}/cephfs-data-scan
+%{_bindir}/ceph-debugpack
+%{_bindir}/ceph-coverage
+%if 0%{?_with_systemd}
+%{_unitdir}/ceph-mds at .service
+%{_unitdir}/ceph-mon at .service
+%{_unitdir}/ceph-create-keys at .service
+%{_unitdir}/ceph-osd at .service
+%{_unitdir}/ceph-radosgw at .service
+%{_unitdir}/ceph-disk at .service
+%{_unitdir}/ceph.target
+%else
+%{_initrddir}/ceph
+%endif
+%{_sbindir}/ceph-disk
+%{_sbindir}/ceph-disk-udev
+%{_sbindir}/ceph-create-keys
+%{_sbindir}/rcceph
+%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version}
+%{_sbindir}/mount.ceph
+%else
+/sbin/mount.ceph
+%endif
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph_common.sh
+%{_libexecdir}/ceph/ceph-osd-prestart.sh
+%dir %{_libdir}/rados-classes
+%{_libdir}/rados-classes/libcls_cephfs.so*
+%{_libdir}/rados-classes/libcls_rbd.so*
+%{_libdir}/rados-classes/libcls_hello.so*
+%{_libdir}/rados-classes/libcls_numops.so*
+%{_libdir}/rados-classes/libcls_rgw.so*
+%{_libdir}/rados-classes/libcls_lock.so*
+%{_libdir}/rados-classes/libcls_kvs.so*
+%{_libdir}/rados-classes/libcls_refcount.so*
+%{_libdir}/rados-classes/libcls_log.so*
+%{_libdir}/rados-classes/libcls_replica_log.so*
+%{_libdir}/rados-classes/libcls_statelog.so*
+%{_libdir}/rados-classes/libcls_timeindex.so*
+%{_libdir}/rados-classes/libcls_user.so*
+%{_libdir}/rados-classes/libcls_version.so*
+%dir %{_libdir}/ceph/erasure-code
+%{_libdir}/ceph/erasure-code/libec_*.so*
+%if 0%{?_with_lttng}
+%{_libdir}/libos_tp.so*
+%{_libdir}/libosd_tp.so*
+%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
+%config %{_sysconfdir}/bash_completion.d/ceph
+%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
+%if 0%{?fedora} || 0%{?rhel}
+%config(noreplace) %{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+%{_localstatedir}/adm/fillup-templates/sysconfig.*
+%config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
+%config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
+%endif
+%{python_sitelib}/ceph_detect_init*
+%{_mandir}/man8/ceph-deploy.8*
+%{_mandir}/man8/ceph-detect-init.8*
+%{_mandir}/man8/ceph-disk.8*
+%{_mandir}/man8/ceph-create-keys.8*
+%{_mandir}/man8/ceph-mon.8*
+%{_mandir}/man8/ceph-mds.8*
+%{_mandir}/man8/ceph-osd.8*
+%{_mandir}/man8/ceph-run.8*
+%{_mandir}/man8/ceph-rest-api.8*
+%{_mandir}/man8/crushtool.8*
+%{_mandir}/man8/osdmaptool.8*
+%{_mandir}/man8/monmaptool.8*
+%{_mandir}/man8/cephfs.8*
+%{_mandir}/man8/mount.ceph.8*
+%{_mandir}/man8/ceph-debugpack.8*
+%{_mandir}/man8/ceph-clsinfo.8*
+%{_mandir}/man8/librados-config.8*
+#set up placeholder directories
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw
+%if ! 0%{?_with_systemd}
+%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph
+%endif
+
+#################################################################################
+%files -n ceph-common
+%defattr(-,root,root,-)
+%{_bindir}/ceph
+%{_bindir}/ceph-authtool
+%{_bindir}/ceph-conf
+%{_bindir}/ceph-dencoder
+%{_bindir}/ceph-rbdnamer
+%{_bindir}/ceph-syn
+%{_bindir}/ceph-crush-location
+%{_bindir}/rados
+%{_bindir}/rbd
+%{_bindir}/rbd-replay
+%{_bindir}/rbd-replay-many
+%if 0%{?_with_lttng}
+%{_bindir}/rbd-replay-prep
+%endif
+%{_bindir}/ceph-post-file
+%{_bindir}/ceph-brag
+%if 0%{?_with_systemd}
+%{_tmpfilesdir}/ceph-common.conf
+%endif
+%{_mandir}/man8/ceph-authtool.8*
+%{_mandir}/man8/ceph-conf.8*
+%{_mandir}/man8/ceph-dencoder.8*
+%{_mandir}/man8/ceph-rbdnamer.8*
+%{_mandir}/man8/ceph-syn.8*
+%{_mandir}/man8/ceph-post-file.8*
+%{_mandir}/man8/ceph.8*
+%{_mandir}/man8/rados.8*
+%{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbd-replay.8*
+%{_mandir}/man8/rbd-replay-many.8*
+%{_mandir}/man8/rbd-replay-prep.8*
+%{_datadir}/ceph/known_hosts_drop.ceph.com
+%{_datadir}/ceph/id_dsa_drop.ceph.com
+%{_datadir}/ceph/id_dsa_drop.ceph.com.pub
+%dir %{_sysconfdir}/ceph/
+%dir %{_datarootdir}/ceph/
+%dir %{_libexecdir}/ceph/
+%config %{_sysconfdir}/bash_completion.d/rados
+%config %{_sysconfdir}/bash_completion.d/rbd
+%config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%{_initrddir}/rbdmap
+%{python_sitelib}/ceph_argparse.py*
+%{python_sitelib}/ceph_daemon.py*
+%{_udevrulesdir}/50-rbd.rules
+%attr(3770,ceph,ceph) %dir %{_localstatedir}/log/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
+
+%pre -n ceph-common
+CEPH_GROUP_ID=""
+CEPH_USER_ID=""
+%if 0%{?rhel} || 0%{?fedora}
+CEPH_GROUP_ID="-g 167"
+CEPH_USER_ID="-u 167"
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%endif
+%if 0%{?suse_version}
+getent group ceph >/dev/null || groupadd -r ceph
+getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+%endif
+exit 0
+
+%post -n ceph-common
+%if 0%{?_with_systemd}
+systemd-tmpfiles --create --prefix=/run/ceph
+%endif
+
+%postun -n ceph-common
+# Package removal cleanup
+if [ "$1" -eq "0" ] ; then
+    rm -rf /var/log/ceph
+    rm -rf /etc/ceph
+fi
+
+#################################################################################
+%files fuse
+%defattr(-,root,root,-)
+%{_bindir}/ceph-fuse
+%{_mandir}/man8/ceph-fuse.8*
+%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version}
+%{_sbindir}/mount.fuse.ceph
+%else
+/sbin/mount.fuse.ceph
+%endif
+
+#################################################################################
+%files -n rbd-fuse
+%defattr(-,root,root,-)
+%{_bindir}/rbd-fuse
+%{_mandir}/man8/rbd-fuse.8*
+
+#################################################################################
+%files radosgw
+%defattr(-,root,root,-)
+%{_bindir}/radosgw
+%{_bindir}/radosgw-admin
+%{_bindir}/radosgw-object-expirer
+%{_mandir}/man8/radosgw.8*
+%{_mandir}/man8/radosgw-admin.8*
+%config %{_sysconfdir}/bash_completion.d/radosgw-admin
+%dir %{_localstatedir}/lib/ceph/radosgw
+%if 0%{?_with_systemd}
+%else
+%{_initrddir}/ceph-radosgw
+%{_sbindir}/rcceph-radosgw
+%endif
+
+%post radosgw
+/sbin/ldconfig
+%if 0%{?suse_version}
+  # explicit systemctl daemon-reload (that's the only relevant bit of
+  # service_add_post; the rest is all sysvinit --> systemd migration, which
+  # isn't applicable in this context -- see above comment).
+  /usr/bin/systemctl daemon-reload >/dev/null 2>&1 || :
+%endif
+
+%preun radosgw
+%if 0%{?_with_systemd}
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%endif
+
+%postun radosgw
+/sbin/ldconfig
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
+%endif
+
+#################################################################################
+%if %{with ocf}
+%files resource-agents
+%defattr(0755,root,root,-)
+%dir /usr/lib/ocf
+%dir /usr/lib/ocf/resource.d
+%dir /usr/lib/ocf/resource.d/ceph
+/usr/lib/ocf/resource.d/%{name}/*
+%endif
+
+#################################################################################
+%files -n librados2
+%defattr(-,root,root,-)
+%{_libdir}/librados.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so.*
+%endif
+
+%post -n librados2
+/sbin/ldconfig
+
+%postun -n librados2
+/sbin/ldconfig
+
+#################################################################################
+%files -n librados2-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/rados
+%{_includedir}/rados/librados.h
+%{_includedir}/rados/librados.hpp
+%{_includedir}/rados/buffer.h
+%{_includedir}/rados/page.h
+%{_includedir}/rados/crc32c.h
+%{_includedir}/rados/rados_types.h
+%{_includedir}/rados/rados_types.hpp
+%{_includedir}/rados/memory.h
+%{_libdir}/librados.so
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so
+%endif
+
+#################################################################################
+%files -n python-rados
+%defattr(-,root,root,-)
+%{python_sitelib}/rados.py*
+
+#################################################################################
+%files -n libradosstriper1
+%defattr(-,root,root,-)
+%{_libdir}/libradosstriper.so.*
+
+%post -n libradosstriper1
+/sbin/ldconfig
+
+%postun -n libradosstriper1
+/sbin/ldconfig
+
+#################################################################################
+%files -n libradosstriper1-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/radosstriper
+%{_includedir}/radosstriper/libradosstriper.h
+%{_includedir}/radosstriper/libradosstriper.hpp
+%{_libdir}/libradosstriper.so
+
+#################################################################################
+%files -n librbd1
+%defattr(-,root,root,-)
+%{_libdir}/librbd.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so.*
+%endif
+
+%post -n librbd1
+/sbin/ldconfig
+mkdir -p /usr/lib64/qemu/
+ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
+
+%postun -n librbd1
+/sbin/ldconfig
+
+#################################################################################
+%files -n librbd1-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/rbd
+%{_includedir}/rbd/librbd.h
+%{_includedir}/rbd/librbd.hpp
+%{_includedir}/rbd/features.h
+%{_libdir}/librbd.so
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so
+%endif
+
+#################################################################################
+%files -n python-rbd
+%defattr(-,root,root,-)
+%{python_sitelib}/rbd.py*
+
+#################################################################################
+%files -n libcephfs1
+%defattr(-,root,root,-)
+%{_libdir}/libcephfs.so.*
+
+%post -n libcephfs1
+/sbin/ldconfig
+
+%postun -n libcephfs1
+/sbin/ldconfig
+
+#################################################################################
+%files -n libcephfs1-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/cephfs
+%{_includedir}/cephfs/libcephfs.h
+%{_libdir}/libcephfs.so
+
+#################################################################################
+%files -n python-cephfs
+%defattr(-,root,root,-)
+%{python_sitelib}/cephfs.py*
+
+#################################################################################
+%files -n ceph-test
+%defattr(-,root,root,-)
+%{_bindir}/ceph_bench_log
+%{_bindir}/ceph_kvstorebench
+%{_bindir}/ceph_multi_stress_watch
+%{_bindir}/ceph_erasure_code
+%{_bindir}/ceph_erasure_code_benchmark
+%{_bindir}/ceph_omapbench
+%{_bindir}/ceph_objectstore_bench
+%{_bindir}/ceph_perf_objectstore
+%{_bindir}/ceph_perf_local
+%{_bindir}/ceph_perf_msgr_client
+%{_bindir}/ceph_perf_msgr_server
+%{_bindir}/ceph_psim
+%{_bindir}/ceph_radosacl
+%{_bindir}/ceph_rgw_jsonparser
+%{_bindir}/ceph_rgw_multiparser
+%{_bindir}/ceph_scratchtool
+%{_bindir}/ceph_scratchtoolpp
+%{_bindir}/ceph_smalliobench
+%{_bindir}/ceph_smalliobenchdumb
+%{_bindir}/ceph_smalliobenchfs
+%{_bindir}/ceph_smalliobenchrbd
+%{_bindir}/ceph_streamtest
+%{_bindir}/ceph_test_*
+%{_bindir}/ceph_tpbench
+%{_bindir}/ceph_xattr_bench
+%{_bindir}/ceph-monstore-tool
+%{_bindir}/ceph-osdomap-tool
+%{_bindir}/ceph-kvstore-tool
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph-monstore-update-crush.sh
+
+#################################################################################
+%if 0%{with cephfs_java}
+%files -n libcephfs_jni1
+%defattr(-,root,root,-)
+%{_libdir}/libcephfs_jni.so.*
+
+%post -n libcephfs_jni1
+/sbin/ldconfig
+
+%postun -n libcephfs_jni1
+/sbin/ldconfig
+
+#################################################################################
+%files -n libcephfs_jni1-devel
+%defattr(-,root,root,-)
+%{_libdir}/libcephfs_jni.so
+
+#################################################################################
+%files -n cephfs-java
+%defattr(-,root,root,-)
+%{_javadir}/libcephfs.jar
+%{_javadir}/libcephfs-test.jar
+%endif
+
+#################################################################################
+%if 0%{with selinux}
+%files selinux
+%defattr(-,root,root,-)
+%attr(0600,root,root) %{_datadir}/selinux/packages/ceph.pp
+%{_datadir}/selinux/devel/include/contrib/ceph.if
+%{_mandir}/man8/ceph_selinux.8*
+
+%post selinux
+# Install the policy
+OLD_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+%{_sbindir}/semodule -n -i %{_datadir}/selinux/packages/ceph.pp
+NEW_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+
+# Load the policy if SELinux is enabled
+if %{_sbindir}/selinuxenabled; then
+    %{_sbindir}/load_policy
+else
+    # Do not relabel if selinux is not enabled
+    exit 0
+fi
+
+if test "$OLD_POLVER" == "$NEW_POLVER"; then
+   # Do not relabel if policy version did not change
+   exit 0
+fi
+
+# Check whether the daemons are running
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph status >/dev/null 2>&1
+%endif
+STATUS=$?
+
+# Stop the daemons if they were running
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph stop >/dev/null 2>&1
+%endif
+fi
+
+# Now, relabel the files
+%relabel_files
+
+# Start the daemons iff they were running before
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+%else
+    /sbin/service ceph start >/dev/null 2>&1 || :
+%endif
+fi
+
+exit 0
+
+%postun selinux
+if [ $1 -eq 0 ]; then
+    # Remove the module
+    %{_sbindir}/semodule -n -r ceph
+
+    # Reload the policy if SELinux is enabled
+    if %{_sbindir}/selinuxenabled ; then
+        %{_sbindir}/load_policy
+    else
+        # Do not relabel if SELinux is not enabled
+        exit 0
+    fi
+
+    # Check whether the daemons are running
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph status >/dev/null 2>&1
+    %endif
+    STATUS=$?
+
+    # Stop the daemons if they were running
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph stop >/dev/null 2>&1
+    %endif
+    fi
+
+    # Now, relabel the files
+    %relabel_files
+
+    # Start the daemons if they were running before
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+	/usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+    %else
+	/sbin/service ceph start >/dev/null 2>&1 || :
+    %endif
+    fi
+fi
+exit 0
+
+%endif # with selinux
+
+#################################################################################
+%if 0%{with libs_compat}
+%files libs-compat
+# We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
+# build this meta package.
+
+#################################################################################
+%files devel-compat
+# We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
+# actually build this meta package.
+%endif
+
+#################################################################################
+%files -n python-ceph-compat
+# We need an empty %%files list for python-ceph-compat, to tell rpmbuild to
+# actually build this meta package.
+
+%changelog
diff --git a/src/test/fedora-21/install-deps.sh b/src/test/fedora-21/install-deps.sh
new file mode 100755
index 0000000..1bebf09
--- /dev/null
+++ b/src/test/fedora-21/install-deps.sh
@@ -0,0 +1,147 @@
+#!/bin/bash -e
+#
+# Ceph distributed storage system
+#
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
+#
+# Author: Loic Dachary <loic at dachary.org>
+#
+#  This library is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+DIR=/tmp/install-deps.$$
+trap "rm -fr $DIR" EXIT
+mkdir -p $DIR
+if test $(id -u) != 0 ; then
+    SUDO=sudo
+fi
+export LC_ALL=C # the following is vulnerable to i18n
+
+if test -f /etc/redhat-release ; then
+    $SUDO yum install -y redhat-lsb-core
+fi
+
+if type apt-get > /dev/null 2>&1 ; then
+    $SUDO apt-get install -y lsb-release
+fi
+
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
+case $(lsb_release -si) in
+Ubuntu|Debian|Devuan)
+        $SUDO apt-get install -y dpkg-dev
+        if ! test -r debian/control ; then
+            echo debian/control is not a readable file
+            exit 1
+        fi
+        touch $DIR/status
+        packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
+            perl -p -e 's/.*Unmet build dependencies: *//;' \
+            -e 's/build-essential:native/build-essential/;' \
+            -e 's/\s*\|\s*/\|/g;' \
+            -e 's/\(.*?\)//g;' \
+            -e 's/ +/\n/g;' | sort)
+        case $(lsb_release -sc) in
+            squeeze|wheezy)
+                packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
+                ;;
+        esac
+        packages=$(echo $packages) # change newlines into spaces
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
+        ;;
+CentOS|Fedora|RedHatEnterpriseServer)
+        case $(lsb_release -si) in
+            Fedora)
+                $SUDO yum install -y yum-utils
+                ;;
+            CentOS|RedHatEnterpriseServer)
+                $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                ;;
+        esac
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
+        ;;
+*)
+        echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
+        ;;
+esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/src/test/libcephfs/flock.cc b/src/test/libcephfs/flock.cc
new file mode 100644
index 0000000..2bea91f
--- /dev/null
+++ b/src/test/libcephfs/flock.cc
@@ -0,0 +1,638 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <pthread.h>
+#include "gtest/gtest.h"
+#ifndef GTEST_IS_THREADSAFE
+#error "!GTEST_IS_THREADSAFE"
+#endif
+
+#include "include/cephfs/libcephfs.h"
+#include <errno.h>
+#include <sys/fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <sys/xattr.h>
+
+#include <stdlib.h>
+#include <semaphore.h>
+#include <time.h>
+#include <sys/mman.h>
+
+#ifdef __linux__
+#include <limits.h>
+#endif
+
+// Startup common: create and mount ceph fs
+#define STARTUP_CEPH() do {				\
+    ASSERT_EQ(0, ceph_create(&cmount, NULL));		\
+    ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));	\
+    ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));	\
+    ASSERT_EQ(0, ceph_mount(cmount, NULL));		\
+  } while(0)
+
+// Cleanup common: unmount and release ceph fs
+#define CLEANUP_CEPH() do {			\
+    ASSERT_EQ(0, ceph_unmount(cmount));		\
+    ASSERT_EQ(0, ceph_release(cmount));		\
+  } while(0)
+
+static const mode_t fileMode = S_IRWXU | S_IRWXG | S_IRWXO;
+
+// Default wait time for normal and "slow" operations
+// Default wait times in milliseconds for normal and "slow" operations
+// (5 seconds should be enough even under network congestion)
+static const long waitSlowMs = 5000;
+
+// Get the absolute struct timespec reference from now + 'ms' milliseconds
+static const struct timespec* abstime(struct timespec &ts, long ms) {
+  if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
+    abort();
+  }
+  ts.tv_nsec += ms * 1000000;
+  ts.tv_sec += ts.tv_nsec / 1000000000;
+  ts.tv_nsec %= 1000000000;
+  return &ts;
+}
+
+/* Basic locking */
+
+TEST(LibCephFS, BasicLocking) {
+  struct ceph_mount_info *cmount = NULL;
+  STARTUP_CEPH();
+
+  char c_file[1024];
+  sprintf(c_file, "/flock_test_%d", getpid());
+  const int fd = ceph_open(cmount, c_file, O_RDWR | O_CREAT, fileMode);
+  ASSERT_GE(fd, 0); 
+
+  // Lock exclusively twice
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, 42));
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 43));
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 44));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 42));
+
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 43));
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 44));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 43));
+
+  // Lock shared three times
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH, 42));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH, 43));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH, 44));
+  // And then attempt to lock exclusively
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 45));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 42));
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 45));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 44));
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 45));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 43));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, 45));
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, 42));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 45));
+
+  // Lock shared with upgrade to exclusive (POSIX) 
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH, 42));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, 42));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 42));
+
+  // Lock exclusive with downgrade to shared (POSIX) 
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, 42));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH, 42));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, 42));
+
+  ASSERT_EQ(0, ceph_close(cmount, fd));
+  ASSERT_EQ(0, ceph_unlink(cmount, c_file));
+  CLEANUP_CEPH();
+}
+
+/* Locking in different threads */
+
+// Used by ConcurrentLocking test
+struct str_ConcurrentLocking {
+  const char *file;
+  struct ceph_mount_info *cmount;  // !NULL if shared
+  sem_t sem[2];
+  sem_t semReply[2];
+  void sem_init(int pshared) {
+    ASSERT_EQ(0, ::sem_init(&sem[0], pshared, 0));
+    ASSERT_EQ(0, ::sem_init(&sem[1], pshared, 0));
+    ASSERT_EQ(0, ::sem_init(&semReply[0], pshared, 0));
+    ASSERT_EQ(0, ::sem_init(&semReply[1], pshared, 0));
+  }
+  void sem_destroy() {
+    ASSERT_EQ(0, ::sem_destroy(&sem[0]));
+    ASSERT_EQ(0, ::sem_destroy(&sem[1]));
+    ASSERT_EQ(0, ::sem_destroy(&semReply[0]));
+    ASSERT_EQ(0, ::sem_destroy(&semReply[1]));
+  }
+};
+
+// Wakeup main (for (N) steps)
+#define PING_MAIN(n) ASSERT_EQ(0, sem_post(&s.sem[n%2]))
+// Wait for main to wake us up (for (RN) steps)
+#define WAIT_MAIN(n) \
+  ASSERT_EQ(0, sem_timedwait(&s.semReply[n%2], abstime(ts, waitSlowMs)))
+
+// Wakeup worker (for (RN) steps)
+#define PING_WORKER(n) ASSERT_EQ(0, sem_post(&s.semReply[n%2]))
+// Wait for worker to wake us up (for (N) steps)
+#define WAIT_WORKER(n) \
+  ASSERT_EQ(0, sem_timedwait(&s.sem[n%2], abstime(ts, waitSlowMs)))
+// Worker shall not wake us up (for (N) steps)
+#define NOT_WAIT_WORKER(n) \
+  ASSERT_EQ(-1, sem_timedwait(&s.sem[n%2], abstime(ts, waitMs)))
+
+// Do twice an operation
+#define TWICE(EXPR) do {			\
+    EXPR;					\
+    EXPR;					\
+  } while(0)
+
+/* Locking in different threads */
+
+// Used by ConcurrentLocking test
+static void thread_ConcurrentLocking(str_ConcurrentLocking& s) {
+  struct ceph_mount_info *const cmount = s.cmount;
+  struct timespec ts;
+
+  const int fd = ceph_open(cmount, s.file, O_RDWR | O_CREAT, fileMode);
+  ASSERT_GE(fd, 0); 
+
+  ASSERT_EQ(-EWOULDBLOCK,
+	    ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, pthread_self()));
+  PING_MAIN(1); // (1)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, pthread_self()));
+  PING_MAIN(2); // (2)
+
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+  PING_MAIN(3); // (3)
+
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH, pthread_self()));
+  PING_MAIN(4); // (4)
+
+  WAIT_MAIN(1); // (R1)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+  PING_MAIN(5); // (5)
+
+  WAIT_MAIN(2); // (R2)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, pthread_self()));
+  PING_MAIN(6); // (6)
+
+  WAIT_MAIN(3); // (R3)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+  PING_MAIN(7); // (7)
+}
+
+// Used by ConcurrentLocking test
+static void* thread_ConcurrentLocking_(void *arg) {
+  str_ConcurrentLocking *const s =
+    reinterpret_cast<str_ConcurrentLocking*>(arg);
+  thread_ConcurrentLocking(*s);
+  return NULL;
+}
+
+TEST(LibCephFS, ConcurrentLocking) {
+  const pid_t mypid = getpid();
+  struct ceph_mount_info *cmount;
+  STARTUP_CEPH();
+
+  char c_file[1024];
+  sprintf(c_file, "/flock_test_%d", mypid);
+  const int fd = ceph_open(cmount, c_file, O_RDWR | O_CREAT, fileMode);
+  ASSERT_GE(fd, 0); 
+
+  // Lock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, pthread_self()));
+
+  // Start locker thread
+  pthread_t thread;
+  struct timespec ts;
+  str_ConcurrentLocking s = { c_file, cmount };
+  s.sem_init(0);
+  ASSERT_EQ(0, pthread_create(&thread, NULL, thread_ConcurrentLocking_, &s));
+  // Synchronization point with thread (failure: thread is dead)
+  WAIT_WORKER(1); // (1)
+
+  // Shall not have lock immediately
+  NOT_WAIT_WORKER(2); // (2)
+
+  // Unlock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+
+  // Shall have lock
+  // Synchronization point with thread (failure: thread is dead)
+  WAIT_WORKER(2); // (2)
+
+  // Synchronization point with thread (failure: thread is dead)
+  WAIT_WORKER(3); // (3)
+
+  // Wait for thread to share lock
+  WAIT_WORKER(4); // (4)
+  ASSERT_EQ(-EWOULDBLOCK,
+	    ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, pthread_self()));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, pthread_self()));
+
+  // Wake up thread to unlock shared lock
+  PING_WORKER(1); // (R1)
+  WAIT_WORKER(5); // (5)
+
+  // Now we can lock exclusively
+  // Upgrade to exclusive lock (as per POSIX)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, pthread_self()));
+
+  // Wake up thread to lock shared lock
+  PING_WORKER(2); // (R2)
+
+  // Shall not have lock immediately
+  NOT_WAIT_WORKER(6); // (6)
+
+  // Release lock ; thread will get it
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+  WAIT_WORKER(6); // (6)
+
+  // We no longer have the lock
+  ASSERT_EQ(-EWOULDBLOCK,
+	    ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, pthread_self()));
+  ASSERT_EQ(-EWOULDBLOCK,
+	    ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, pthread_self()));
+
+  // Wake up thread to unlock exclusive lock
+  PING_WORKER(3); // (R3)
+  WAIT_WORKER(7); // (7)
+
+  // We can lock it again
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, pthread_self()));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+
+  // Cleanup
+  void *retval = (void*) (uintptr_t) -1;
+  ASSERT_EQ(0, pthread_join(thread, &retval));
+  ASSERT_EQ(NULL, retval);
+  s.sem_destroy();
+  ASSERT_EQ(0, ceph_close(cmount, fd));
+  ASSERT_EQ(0, ceph_unlink(cmount, c_file));
+  CLEANUP_CEPH();
+}
+
+TEST(LibCephFS, ThreesomeLocking) {
+  const pid_t mypid = getpid();
+  struct ceph_mount_info *cmount;
+  STARTUP_CEPH();
+
+  char c_file[1024];
+  sprintf(c_file, "/flock_test_%d", mypid);
+  const int fd = ceph_open(cmount, c_file, O_RDWR | O_CREAT, fileMode);
+  ASSERT_GE(fd, 0); 
+
+  // Lock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, pthread_self()));
+
+  // Start locker thread
+  pthread_t thread[2];
+  struct timespec ts;
+  str_ConcurrentLocking s = { c_file, cmount };
+  s.sem_init(0);
+  ASSERT_EQ(0, pthread_create(&thread[0], NULL, thread_ConcurrentLocking_, &s));
+  ASSERT_EQ(0, pthread_create(&thread[1], NULL, thread_ConcurrentLocking_, &s));
+  // Synchronization point with thread (failure: thread is dead)
+  TWICE(WAIT_WORKER(1)); // (1)
+
+  // Shall not have lock immediately
+  NOT_WAIT_WORKER(2); // (2)
+
+  // Unlock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+
+  // Shall have lock
+  TWICE(// Synchronization point with thread (failure: thread is dead)
+	WAIT_WORKER(2); // (2)
+	
+	// Synchronization point with thread (failure: thread is dead)
+	WAIT_WORKER(3)); // (3)
+  
+  // Wait for thread to share lock
+  TWICE(WAIT_WORKER(4)); // (4)
+  ASSERT_EQ(-EWOULDBLOCK,
+	    ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, pthread_self()));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, pthread_self()));
+
+  // Wake up thread to unlock shared lock
+  TWICE(PING_WORKER(1); // (R1)
+	WAIT_WORKER(5)); // (5)
+
+  // Now we can lock exclusively
+  // Upgrade to exclusive lock (as per POSIX)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, pthread_self()));
+
+  TWICE(  // Wake up thread to lock shared lock
+	PING_WORKER(2); // (R2)
+	
+	// Shall not have lock immediately
+	NOT_WAIT_WORKER(6)); // (6)
+  
+  // Release lock ; thread will get it
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+  TWICE(WAIT_WORKER(6); // (6)
+	
+	// We no longer have the lock
+	ASSERT_EQ(-EWOULDBLOCK,
+		  ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, pthread_self()));
+	ASSERT_EQ(-EWOULDBLOCK,
+		  ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, pthread_self()));
+	
+	// Wake up thread to unlock exclusive lock
+	PING_WORKER(3); // (R3)
+	WAIT_WORKER(7); // (7)
+	);
+  
+  // We can lock it again
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, pthread_self()));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, pthread_self()));
+
+  // Cleanup
+  void *retval = (void*) (uintptr_t) -1;
+  ASSERT_EQ(0, pthread_join(thread[0], &retval));
+  ASSERT_EQ(NULL, retval);
+  ASSERT_EQ(0, pthread_join(thread[1], &retval));
+  ASSERT_EQ(NULL, retval);
+  s.sem_destroy();
+  ASSERT_EQ(0, ceph_close(cmount, fd));
+  ASSERT_EQ(0, ceph_unlink(cmount, c_file));
+  CLEANUP_CEPH();
+}
+
+/* Locking in different processes */
+
+#define PROCESS_SLOW_MS() \
+  static const long waitMs = 100; \
+  (void) waitMs
+
+// Used by ConcurrentLocking test
+static void process_ConcurrentLocking(str_ConcurrentLocking& s) {
+  const pid_t mypid = getpid();
+  PROCESS_SLOW_MS();
+
+  struct ceph_mount_info *cmount = NULL;
+  struct timespec ts;
+
+  STARTUP_CEPH();
+  s.cmount = cmount;
+
+  const int fd = ceph_open(cmount, s.file, O_RDWR | O_CREAT, fileMode);
+  ASSERT_GE(fd, 0); 
+  WAIT_MAIN(1); // (R1)
+
+  ASSERT_EQ(-EWOULDBLOCK,
+	    ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, mypid));
+  PING_MAIN(1); // (1)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, mypid));
+  PING_MAIN(2); // (2)
+
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+  PING_MAIN(3); // (3)
+
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH, mypid));
+  PING_MAIN(4); // (4)
+
+  WAIT_MAIN(2); // (R2)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+  PING_MAIN(5); // (5)
+
+  WAIT_MAIN(3); // (R3)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, mypid));
+  PING_MAIN(6); // (6)
+
+  WAIT_MAIN(4); // (R4)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+  PING_MAIN(7); // (7)
+
+  CLEANUP_CEPH();
+
+  s.sem_destroy();
+  exit(EXIT_SUCCESS);
+}
+
+TEST(LibCephFS, InterProcessLocking) {
+  PROCESS_SLOW_MS();
+  // Process synchronization
+  char c_file[1024];
+  const pid_t mypid = getpid();
+  sprintf(c_file, "/flock_test_%d", mypid);
+
+  // Note: the semaphores MUST be on a shared memory segment
+  str_ConcurrentLocking *const shs =
+    reinterpret_cast<str_ConcurrentLocking*>
+    (mmap(0, sizeof(*shs), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+	  -1, 0));
+  str_ConcurrentLocking &s = *shs;
+  s.file = c_file;
+  s.sem_init(1);
+
+  // Start locker process
+  const pid_t pid = fork();
+  ASSERT_GE(pid, 0);
+  if (pid == 0) {
+    process_ConcurrentLocking(s);
+    exit(EXIT_FAILURE);
+  }
+
+  struct timespec ts;
+  struct ceph_mount_info *cmount;
+  STARTUP_CEPH();
+
+  const int fd = ceph_open(cmount, c_file, O_RDWR | O_CREAT, fileMode);
+  ASSERT_GE(fd, 0); 
+
+  // Lock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, mypid));
+
+  // Synchronization point with process (failure: process is dead)
+  PING_WORKER(1); // (R1)
+  WAIT_WORKER(1); // (1)
+
+  // Shall not have lock immediately
+  NOT_WAIT_WORKER(2); // (2)
+
+  // Unlock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+
+  // Shall have lock
+  // Synchronization point with process (failure: process is dead)
+  WAIT_WORKER(2); // (2)
+
+  // Synchronization point with process (failure: process is dead)
+  WAIT_WORKER(3); // (3)
+
+  // Wait for process to share lock
+  WAIT_WORKER(4); // (4)
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, mypid));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, mypid));
+
+  // Wake up process to unlock shared lock
+  PING_WORKER(2); // (R2)
+  WAIT_WORKER(5); // (5)
+
+  // Now we can lock exclusively
+  // Upgrade to exclusive lock (as per POSIX)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, mypid));
+
+  // Wake up process to lock shared lock
+  PING_WORKER(3); // (R3)
+
+  // Shall not have lock immediately
+  NOT_WAIT_WORKER(6); // (6)
+
+  // Release lock ; process will get it
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+  WAIT_WORKER(6); // (6)
+
+  // We no longer have the lock
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, mypid));
+  ASSERT_EQ(-EWOULDBLOCK, ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, mypid));
+
+  // Wake up process to unlock exclusive lock
+  PING_WORKER(4); // (R4)
+  WAIT_WORKER(7); // (7)
+
+  // We can lock it again
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, mypid));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+
+  // Wait pid
+  int status;
+  ASSERT_EQ(pid, waitpid(pid, &status, 0));
+  ASSERT_EQ(EXIT_SUCCESS, status);
+
+  // Cleanup
+  s.sem_destroy();
+  ASSERT_EQ(0, munmap(shs, sizeof(*shs)));
+  ASSERT_EQ(0, ceph_close(cmount, fd));
+  ASSERT_EQ(0, ceph_unlink(cmount, c_file));
+  CLEANUP_CEPH();
+}
+
+TEST(LibCephFS, ThreesomeInterProcessLocking) {
+  PROCESS_SLOW_MS();
+  // Process synchronization
+  char c_file[1024];
+  const pid_t mypid = getpid();
+  sprintf(c_file, "/flock_test_%d", mypid);
+
+  // Note: the semaphores MUST be on a shared memory segment
+  str_ConcurrentLocking *const shs =
+    reinterpret_cast<str_ConcurrentLocking*>
+    (mmap(0, sizeof(*shs), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+	  -1, 0));
+  str_ConcurrentLocking &s = *shs;
+  s.file = c_file;
+  s.sem_init(1);
+
+  // Start locker processes
+  pid_t pid[2];
+  pid[0] = fork();
+  ASSERT_GE(pid[0], 0);
+  if (pid[0] == 0) {
+    process_ConcurrentLocking(s);
+    exit(EXIT_FAILURE);
+  }
+  pid[1] = fork();
+  ASSERT_GE(pid[1], 0);
+  if (pid[1] == 0) {
+    process_ConcurrentLocking(s);
+    exit(EXIT_FAILURE);
+  }
+
+  struct timespec ts;
+  struct ceph_mount_info *cmount;
+  STARTUP_CEPH();
+
+  const int fd = ceph_open(cmount, c_file, O_RDWR | O_CREAT, fileMode);
+  ASSERT_GE(fd, 0); 
+
+  // Lock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, mypid));
+
+  // Synchronization point with process (failure: process is dead)
+  TWICE(PING_WORKER(1)); // (R1)
+  TWICE(WAIT_WORKER(1)); // (1)
+
+  // Shall not have lock immediately
+  NOT_WAIT_WORKER(2); // (2)
+
+  // Unlock
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+
+  // Shall have lock
+  TWICE(// Synchronization point with process (failure: process is dead)
+	WAIT_WORKER(2); // (2)
+	
+	// Synchronization point with process (failure: process is dead)
+	WAIT_WORKER(3)); // (3)
+  
+  // Wait for process to share lock
+  TWICE(WAIT_WORKER(4)); // (4)
+  ASSERT_EQ(-EWOULDBLOCK,
+	    ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, mypid));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, mypid));
+
+  // Wake up process to unlock shared lock
+  TWICE(PING_WORKER(2); // (R2)
+	WAIT_WORKER(5)); // (5)
+
+  // Now we can lock exclusively
+  // Upgrade to exclusive lock (as per POSIX)
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX, mypid));
+
+  TWICE(  // Wake up process to lock shared lock
+	PING_WORKER(3); // (R3)
+	
+	// Shall not have lock immediately
+	NOT_WAIT_WORKER(6)); // (6)
+  
+  // Release lock ; process will get it
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+  TWICE(WAIT_WORKER(6); // (6)
+	
+	// We no longer have the lock
+	ASSERT_EQ(-EWOULDBLOCK,
+		  ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, mypid));
+	ASSERT_EQ(-EWOULDBLOCK,
+		  ceph_flock(cmount, fd, LOCK_SH | LOCK_NB, mypid));
+	
+	// Wake up process to unlock exclusive lock
+	PING_WORKER(4); // (R4)
+	WAIT_WORKER(7); // (7)
+	);
+  
+  // We can lock it again
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, mypid));
+  ASSERT_EQ(0, ceph_flock(cmount, fd, LOCK_UN, mypid));
+
+  // Wait pids
+  int status;
+  ASSERT_EQ(pid[0], waitpid(pid[0], &status, 0));
+  ASSERT_EQ(EXIT_SUCCESS, status);
+  ASSERT_EQ(pid[1], waitpid(pid[1], &status, 0));
+  ASSERT_EQ(EXIT_SUCCESS, status);
+
+  // Cleanup
+  s.sem_destroy();
+  ASSERT_EQ(0, munmap(shs, sizeof(*shs)));
+  ASSERT_EQ(0, ceph_close(cmount, fd));
+  ASSERT_EQ(0, ceph_unlink(cmount, c_file));
+  CLEANUP_CEPH();
+}
diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc
index 922b452..5f8e343 100644
--- a/src/test/libcephfs/test.cc
+++ b/src/test/libcephfs/test.cc
@@ -21,6 +21,7 @@
 #include <sys/stat.h>
 #include <dirent.h>
 #include <sys/xattr.h>
+#include <sys/uio.h>
 
 #ifdef __linux__
 #include <limits.h>
@@ -136,6 +137,8 @@ TEST(LibCephFS, ReleaseMounted) {
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
   ASSERT_EQ(0, ceph_mount(cmount, "/"));
   ASSERT_EQ(-EISCONN, ceph_release(cmount));
+  ASSERT_EQ(0, ceph_unmount(cmount));
+  ASSERT_EQ(0, ceph_release(cmount));
 }
 
 TEST(LibCephFS, UnmountRelease) {
@@ -884,6 +887,24 @@ TEST(LibCephFS, HardlinkNoOriginal) {
   ceph_shutdown(cmount);
 }
 
+TEST(LibCephFS, BadArgument) {
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(ceph_mount(cmount, NULL), 0);
+
+  int fd = ceph_open(cmount, "test_file", O_CREAT|O_RDWR, 0666);
+  ASSERT_GT(fd, 0);
+  char buf[100];
+  ASSERT_EQ(ceph_write(cmount, fd, buf, sizeof(buf), 0), (int)sizeof(buf));
+  ASSERT_EQ(ceph_read(cmount, fd, buf, 0, 5), 0);
+  ceph_close(cmount, fd);
+  ASSERT_EQ(ceph_unlink(cmount, "test_file"), 0);
+
+  ceph_shutdown(cmount);
+}
+
 TEST(LibCephFS, BadFileDesc) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
@@ -944,6 +965,44 @@ TEST(LibCephFS, ReadEmptyFile) {
   ceph_shutdown(cmount);
 }
 
+TEST(LibCephFS, PreadvPwritev) {
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(ceph_mount(cmount, NULL), 0);
+
+  int mypid = getpid();
+  char testf[256];
+
+  sprintf(testf, "test_preadvpwritevfile%d", mypid);
+  int fd = ceph_open(cmount, testf, O_CREAT|O_RDWR, 0666);
+  ASSERT_GT(fd, 0);
+
+  char out0[] = "hello ";
+  char out1[] = "world\n";
+  struct iovec iov_out[2] = {
+	{out0, sizeof(out0)},
+	{out1, sizeof(out1)},
+  };
+  char in0[sizeof(out0)];
+  char in1[sizeof(out1)];
+  struct iovec iov_in[2] = {
+	{in0, sizeof(in0)},
+	{in1, sizeof(in1)},
+  };
+  ssize_t nwritten = iov_out[0].iov_len + iov_out[1].iov_len; 
+  ssize_t nread = iov_in[0].iov_len + iov_in[1].iov_len; 
+
+  ASSERT_EQ(ceph_pwritev(cmount, fd, iov_out, 2, 0), nwritten);
+  ASSERT_EQ(ceph_preadv(cmount, fd, iov_in, 2, 0), nread);
+  ASSERT_EQ(0, strncmp((const char*)iov_in[0].iov_base, (const char*)iov_out[0].iov_base, iov_out[0].iov_len));
+  ASSERT_EQ(0, strncmp((const char*)iov_in[1].iov_base, (const char*)iov_out[1].iov_base, iov_out[1].iov_len));
+
+  ceph_close(cmount, fd);
+  ceph_shutdown(cmount);
+}
+
 TEST(LibCephFS, StripeUnitGran) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc
index b9183aa..6971715 100644
--- a/src/test/librados/aio.cc
+++ b/src/test/librados/aio.cc
@@ -2,9 +2,11 @@
 #include "include/rados/librados.h"
 #include "test/librados/test.h"
 #include "include/types.h"
+#include "include/stringify.h"
 
 #include "gtest/gtest.h"
 #include <errno.h>
+#include <fcntl.h>
 #include <semaphore.h>
 #include <sstream>
 #include <string>
@@ -32,31 +34,30 @@ public:
     if (m_init) {
       rados_ioctx_destroy(m_ioctx);
       destroy_one_pool(m_pool_name, &m_cluster);
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
     }
   }
 
   std::string init()
   {
     int ret;
-    if (sem_init(&m_sem, 0, 0)) {
+    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
-      sem_destroy(&m_sem);
       ostringstream oss;
-      oss << "sem_init failed: " << cpp_strerror(err);
+      oss << "sem_open failed: " << cpp_strerror(err);
       return oss.str();
     }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_pool(m_pool_name, &m_cluster);
     if (!err.empty()) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx);
     if (ret) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       destroy_one_pool(m_pool_name, &m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -66,7 +67,7 @@ public:
     return "";
   }
 
-  sem_t m_sem;
+  sem_t *m_sem;
   rados_t m_cluster;
   rados_ioctx_t m_ioctx;
   std::string m_pool_name;
@@ -90,31 +91,30 @@ public:
     if (m_init) {
       m_ioctx.close();
       destroy_one_pool_pp(m_pool_name, m_cluster);
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
     }
   }
 
   std::string init()
   {
     int ret;
-    if (sem_init(&m_sem, 0, 0)) {
+    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
-      sem_destroy(&m_sem);
       ostringstream oss;
-      oss << "sem_init failed: " << cpp_strerror(err);
+      oss << "sem_open failed: " << cpp_strerror(err);
       return oss.str();
     }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_pool_pp(m_pool_name, m_cluster);
     if (!err.empty()) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx);
     if (ret) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       destroy_one_pool_pp(m_pool_name, m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -124,7 +124,7 @@ public:
     return "";
   }
 
-  sem_t m_sem;
+  sem_t *m_sem;
   Rados m_cluster;
   IoCtx m_ioctx;
   std::string m_pool_name;
@@ -137,28 +137,28 @@ void set_completion_complete(rados_completion_t cb, void *arg)
 {
   AioTestData *test = static_cast<AioTestData*>(arg);
   test->m_complete = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 void set_completion_safe(rados_completion_t cb, void *arg)
 {
   AioTestData *test = static_cast<AioTestData*>(arg);
   test->m_safe = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 void set_completion_completePP(rados_completion_t cb, void *arg)
 {
   AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
   test->m_complete = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 void set_completion_safePP(rados_completion_t cb, void *arg)
 {
   AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
   test->m_safe = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 TEST(LibRadosAio, TooBig) {
@@ -191,6 +191,58 @@ TEST(LibRadosAio, TooBigPP) {
   delete aio_completion;
 }
 
+TEST(LibRadosAio, PoolQuotaPP) {
+  AioTestDataPP test_data;
+  ASSERT_EQ("", test_data.init());
+  string p = get_temp_pool_name();
+  ASSERT_EQ(0, test_data.m_cluster.pool_create(p.c_str()));
+  IoCtx ioctx;
+  ASSERT_EQ(0, test_data.m_cluster.ioctx_create(p.c_str(), ioctx));
+
+  bufferlist inbl;
+  ASSERT_EQ(0, test_data.m_cluster.mon_command(
+      "{\"prefix\": \"osd pool set-quota\", \"pool\": \"" + p +
+      "\", \"field\": \"max_bytes\", \"val\": \"4096\"}",
+      inbl, NULL, NULL));
+
+  bufferlist bl;
+  bufferptr z(4096);
+  bl.append(z);
+  int n;
+  for (n = 0; n < 1024; ++n) {
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    librados::AioCompletion *completion =
+      test_data.m_cluster.aio_create_completion();
+    ASSERT_EQ(0, ioctx.aio_operate(
+	"foo" + stringify(n), completion, &op,
+	librados::OPERATION_FULL_TRY));
+    completion->wait_for_safe();
+    int r = completion->get_return_value();
+    completion->release();
+    if (r == -EDQUOT)
+      break;
+    ASSERT_EQ(0, r);
+    sleep(1);
+  }
+  ASSERT_LT(n, 1024);
+
+  // make sure we block without FULL_TRY
+  {
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    librados::AioCompletion *completion =
+      test_data.m_cluster.aio_create_completion();
+    ASSERT_EQ(0, ioctx.aio_operate("bar", completion, &op, 0));
+    sleep(5);
+    ASSERT_FALSE(completion->is_safe());
+    completion->release();
+  }
+
+  ioctx.close();
+  ASSERT_EQ(0, test_data.m_cluster.pool_delete(p.c_str()));
+}
+
 TEST(LibRadosAio, SimpleWrite) {
   AioTestData test_data;
   rados_completion_t my_completion;
@@ -203,23 +255,25 @@ TEST(LibRadosAio, SimpleWrite) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
+  rados_completion_t my_completion2;
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-	      set_completion_complete, set_completion_safe, &my_completion));
+	      set_completion_complete, set_completion_safe, &my_completion2));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
-			       my_completion, buf, sizeof(buf), 0));
+			       my_completion2, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
-  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
 }
 
 TEST(LibRadosAio, SimpleWritePP) {
@@ -238,8 +292,8 @@ TEST(LibRadosAio, SimpleWritePP) {
 			       my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
@@ -255,8 +309,8 @@ TEST(LibRadosAio, SimpleWritePP) {
 			       my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
@@ -310,8 +364,8 @@ TEST(LibRadosAio, RoundTrip) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[256];
@@ -343,8 +397,8 @@ TEST(LibRadosAio, RoundTrip2) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
@@ -379,8 +433,8 @@ TEST(LibRadosAio, RoundTripPP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
@@ -415,8 +469,8 @@ TEST(LibRadosAio, RoundTripPP2) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
@@ -478,6 +532,41 @@ TEST(LibRadosAio, RoundTripPP3)
   destroy_one_pool_pp(pool_name, cluster);
 }
 
+TEST(LibRadosAio, RoundTripSparseReadPP) {
+  AioTestDataPP test_data;
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+  std::map<uint64_t, uint64_t> extents;
+  bufferlist bl2;
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completePP, set_completion_safePP);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo",
+			      my_completion2, &extents, &bl2, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+  assert_eq_sparse(bl1, extents, bl2);
+  delete my_completion;
+  delete my_completion2;
+}
 
 TEST(LibRadosAio, RoundTripAppend) {
   AioTestData test_data;
@@ -586,8 +675,8 @@ TEST(LibRadosAio, IsComplete) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
@@ -629,8 +718,8 @@ TEST(LibRadosAio, IsCompletePP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
@@ -1066,8 +1155,8 @@ TEST(LibRadosAio, SimpleStat) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
@@ -1102,8 +1191,8 @@ TEST(LibRadosAio, SimpleStatPP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
@@ -1135,8 +1224,8 @@ TEST(LibRadosAio, SimpleStatNS) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
@@ -1148,8 +1237,8 @@ TEST(LibRadosAio, SimpleStatNS) {
 			       my_completion, buf2, sizeof(buf2), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
@@ -1200,8 +1289,8 @@ TEST(LibRadosAio, SimpleStatPPNS) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
@@ -1233,8 +1322,8 @@ TEST(LibRadosAio, StatRemove) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
@@ -1292,8 +1381,8 @@ TEST(LibRadosAio, StatRemovePP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
@@ -1659,31 +1748,30 @@ public:
     if (m_init) {
       rados_ioctx_destroy(m_ioctx);
       destroy_one_ec_pool(m_pool_name, &m_cluster);
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
     }
   }
 
   std::string init()
   {
     int ret;
-    if (sem_init(&m_sem, 0, 0)) {
+    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
-      sem_destroy(&m_sem);
       ostringstream oss;
-      oss << "sem_init failed: " << cpp_strerror(err);
+      oss << "sem_open failed: " << cpp_strerror(err);
       return oss.str();
     }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_ec_pool(m_pool_name, &m_cluster);
     if (!err.empty()) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx);
     if (ret) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       destroy_one_ec_pool(m_pool_name, &m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -1693,7 +1781,7 @@ public:
     return "";
   }
 
-  sem_t m_sem;
+  sem_t *m_sem;
   rados_t m_cluster;
   rados_ioctx_t m_ioctx;
   std::string m_pool_name;
@@ -1717,31 +1805,30 @@ public:
     if (m_init) {
       m_ioctx.close();
       destroy_one_ec_pool_pp(m_pool_name, m_cluster);
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
     }
   }
 
   std::string init()
   {
     int ret;
-    if (sem_init(&m_sem, 0, 0)) {
+    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
-      sem_destroy(&m_sem);
       ostringstream oss;
-      oss << "sem_init failed: " << cpp_strerror(err);
+      oss << "sem_open failed: " << cpp_strerror(err);
       return oss.str();
     }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_ec_pool_pp(m_pool_name, m_cluster);
     if (!err.empty()) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx);
     if (ret) {
-      sem_destroy(&m_sem);
+      sem_close(m_sem);
       destroy_one_ec_pool_pp(m_pool_name, m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -1751,7 +1838,7 @@ public:
     return "";
   }
 
-  sem_t m_sem;
+  sem_t *m_sem;
   Rados m_cluster;
   IoCtx m_ioctx;
   std::string m_pool_name;
@@ -1764,28 +1851,28 @@ void set_completion_completeEC(rados_completion_t cb, void *arg)
 {
   AioTestDataEC *test = static_cast<AioTestDataEC*>(arg);
   test->m_complete = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 void set_completion_safeEC(rados_completion_t cb, void *arg)
 {
   AioTestDataEC *test = static_cast<AioTestDataEC*>(arg);
   test->m_safe = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 void set_completion_completeECPP(rados_completion_t cb, void *arg)
 {
   AioTestDataECPP *test = static_cast<AioTestDataECPP*>(arg);
   test->m_complete = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 void set_completion_safeECPP(rados_completion_t cb, void *arg)
 {
   AioTestDataECPP *test = static_cast<AioTestDataECPP*>(arg);
   test->m_safe = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 TEST(LibRadosAioEC, SimpleWrite) {
@@ -1800,23 +1887,25 @@ TEST(LibRadosAioEC, SimpleWrite) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
+  rados_completion_t my_completion2;
   ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-	      set_completion_completeEC, set_completion_safeEC, &my_completion));
+	      set_completion_completeEC, set_completion_safeEC, &my_completion2));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
-			       my_completion, buf, sizeof(buf), 0));
+			       my_completion2, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
-  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
+  ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   rados_aio_release(my_completion);
+  rados_aio_release(my_completion2);
 }
 
 TEST(LibRadosAioEC, SimpleWritePP) {
@@ -1835,8 +1924,8 @@ TEST(LibRadosAioEC, SimpleWritePP) {
 			       my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
@@ -1852,8 +1941,8 @@ TEST(LibRadosAioEC, SimpleWritePP) {
 			       my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
@@ -1907,8 +1996,8 @@ TEST(LibRadosAioEC, RoundTrip) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[256];
@@ -1940,8 +2029,8 @@ TEST(LibRadosAioEC, RoundTrip2) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
@@ -1976,8 +2065,8 @@ TEST(LibRadosAioEC, RoundTripPP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
@@ -2012,8 +2101,8 @@ TEST(LibRadosAioEC, RoundTripPP2) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
@@ -2076,6 +2165,42 @@ TEST(LibRadosAioEC, RoundTripPP3)
   destroy_one_pool_pp(pool_name, cluster);
 }
 
+TEST(LibRadosAioEC, RoundTripSparseReadPP) {
+  AioTestDataECPP test_data;
+  ASSERT_EQ("", test_data.init());
+  AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+  AioCompletion *my_completion_null = NULL;
+  ASSERT_NE(my_completion, my_completion_null);
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+					   bl1, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
+  }
+  ASSERT_EQ(0, my_completion->get_return_value());
+
+  map<uint64_t, uint64_t> extents;
+  bufferlist bl2;
+  AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+	  (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+  ASSERT_NE(my_completion2, my_completion_null);
+  ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo",
+			      my_completion2, &extents, &bl2, sizeof(buf), 0));
+  {
+    TestAlarm alarm;
+    ASSERT_EQ(0, my_completion2->wait_for_complete());
+  }
+  ASSERT_EQ(0, my_completion2->get_return_value());
+  assert_eq_sparse(bl1, extents, bl2);
+  delete my_completion;
+  delete my_completion2;
+}
 
 TEST(LibRadosAioEC, RoundTripAppend) {
   AioTestDataEC test_data;
@@ -2228,8 +2353,8 @@ TEST(LibRadosAioEC, IsComplete) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
@@ -2271,8 +2396,8 @@ TEST(LibRadosAioEC, IsCompletePP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
@@ -2708,8 +2833,8 @@ TEST(LibRadosAioEC, SimpleStat) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
@@ -2744,8 +2869,8 @@ TEST(LibRadosAioEC, SimpleStatPP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
@@ -2777,8 +2902,8 @@ TEST(LibRadosAioEC, SimpleStatNS) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
@@ -2790,8 +2915,8 @@ TEST(LibRadosAioEC, SimpleStatNS) {
 			       my_completion, buf2, sizeof(buf2), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
@@ -2842,8 +2967,8 @@ TEST(LibRadosAioEC, SimpleStatPPNS) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
@@ -2875,8 +3000,8 @@ TEST(LibRadosAioEC, StatRemove) {
 			       my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
@@ -2934,8 +3059,8 @@ TEST(LibRadosAioEC, StatRemovePP) {
 					   bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
diff --git a/src/test/librados/c_read_operations.cc b/src/test/librados/c_read_operations.cc
index 9b57747..d80fcdf 100644
--- a/src/test/librados/c_read_operations.cc
+++ b/src/test/librados/c_read_operations.cc
@@ -312,6 +312,26 @@ TEST_F(CReadOpsTest, Read) {
   remove_object();
 }
 
+
+TEST_F(CReadOpsTest, RWOrderedRead) {
+  write_object();
+
+  char buf[len];
+  rados_read_op_t op = rados_create_read_op();
+  size_t bytes_read = 0;
+  int rval;
+  rados_read_op_read(op, 0, len, buf, &bytes_read, &rval);
+  rados_read_op_set_flags(op, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+  ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj,
+				     LIBRADOS_OPERATION_ORDER_READS_WRITES));
+  ASSERT_EQ(len, (int)bytes_read);
+  ASSERT_EQ(0, rval);
+  ASSERT_EQ(0, memcmp(data, buf, len));
+  rados_release_read_op(op);
+
+  remove_object();
+}
+
 TEST_F(CReadOpsTest, ShortRead) {
   write_object();
 
diff --git a/src/test/librados/c_write_operations.cc b/src/test/librados/c_write_operations.cc
index 37c7450..1ea9509 100644
--- a/src/test/librados/c_write_operations.cc
+++ b/src/test/librados/c_write_operations.cc
@@ -117,7 +117,7 @@ TEST(LibRadosCWriteOps, Xattrs) {
   ASSERT_TRUE(op);
   rados_write_op_cmpxattr(op, "key", LIBRADOS_CMPXATTR_OP_EQ, "value", 5);
   rados_write_op_setxattr(op, "key", "value", 5);
-  ASSERT_EQ(-125, rados_write_op_operate(op, ioctx, "test", NULL, 0));
+  ASSERT_EQ(-ECANCELED, rados_write_op_operate(op, ioctx, "test", NULL, 0));
 
   rados_release_write_op(op);
   rados_ioctx_destroy(ioctx);
diff --git a/src/test/librados/io.cc b/src/test/librados/io.cc
index 6f391df..cb37c45 100644
--- a/src/test/librados/io.cc
+++ b/src/test/librados/io.cc
@@ -118,6 +118,15 @@ TEST_F(LibRadosIoPP, ReadOpPP) {
   }
 
   {
+      bufferlist op_bl;
+      ObjectReadOperation op;
+      op.read(0, 0, NULL, NULL); //len=0 mean read the whole object data.
+      ASSERT_EQ(0, ioctx.operate("foo", &op, &op_bl));
+      ASSERT_EQ(sizeof(buf), op_bl.length());
+      ASSERT_EQ(0, memcmp(op_bl.c_str(), buf, sizeof(buf)));
+  }
+
+  {
       bufferlist read_bl, op_bl;
       ObjectReadOperation op;
       op.read(0, sizeof(buf), &read_bl, NULL);
@@ -223,6 +232,25 @@ TEST_F(LibRadosIoPP, ReadOpPP) {
   }
 }
 
+TEST_F(LibRadosIoPP, SparseReadOpPP) {
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl;
+  bl.append(buf, sizeof(buf));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
+
+  {
+    std::map<uint64_t, uint64_t> extents;
+    bufferlist read_bl;
+    int rval = -1;
+    ObjectReadOperation op;
+    op.sparse_read(0, sizeof(buf), &extents, &read_bl, &rval);
+    ASSERT_EQ(0, ioctx.operate("foo", &op, nullptr));
+    ASSERT_EQ(0, rval);
+    assert_eq_sparse(bl, extents, read_bl);
+  }
+}
+
 TEST_F(LibRadosIo, RoundTrip) {
   char buf[128];
   char buf2[128];
@@ -231,6 +259,13 @@ TEST_F(LibRadosIo, RoundTrip) {
   memset(buf2, 0, sizeof(buf2));
   ASSERT_EQ((int)sizeof(buf2), rados_read(ioctx, "foo", buf2, sizeof(buf2), 0));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
+
+  uint64_t off = 19;
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_write(ioctx, "bar", buf, sizeof(buf), off));
+  memset(buf2, 0, sizeof(buf2));
+  ASSERT_EQ((int)sizeof(buf2), rados_read(ioctx, "bar", buf2, sizeof(buf2), off));
+  ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
 }
 
 TEST_F(LibRadosIoPP, RoundTripPP) {
@@ -456,6 +491,17 @@ TEST_F(LibRadosIo, RmXattr) {
       rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ(0, rados_rmxattr(ioctx, "foo", attr1));
   ASSERT_EQ(-ENODATA, rados_getxattr(ioctx, "foo", attr1, buf, sizeof(buf)));
+
+  // Test rmxattr on a removed object
+  char buf2[128];
+  char attr2[] = "attr2";
+  char attr2_buf[] = "foo bar baz";
+  memset(buf2, 0xbb, sizeof(buf2));
+  ASSERT_EQ(0, rados_write(ioctx, "foo_rmxattr", buf2, sizeof(buf2), 0));
+  ASSERT_EQ(0,
+      rados_setxattr(ioctx, "foo_rmxattr", attr2, attr2_buf, sizeof(attr2_buf)));
+  ASSERT_EQ(0, rados_remove(ioctx, "foo_rmxattr"));
+  ASSERT_EQ(-ENOENT, rados_rmxattr(ioctx, "foo_rmxattr", attr2));
 }
 
 TEST_F(LibRadosIoPP, RmXattrPP) {
@@ -472,6 +518,20 @@ TEST_F(LibRadosIoPP, RmXattrPP) {
   ASSERT_EQ(0, ioctx.rmxattr("foo", attr1));
   bufferlist bl3;
   ASSERT_EQ(-ENODATA, ioctx.getxattr("foo", attr1, bl3));
+
+  // Test rmxattr on a removed object
+  char buf2[128];
+  char attr2[] = "attr2";
+  char attr2_buf[] = "foo bar baz";
+  memset(buf2, 0xbb, sizeof(buf2));
+  bufferlist bl21;
+  bl21.append(buf, sizeof(buf));
+  ASSERT_EQ(0, ioctx.write("foo_rmxattr", bl21, sizeof(buf2), 0));
+  bufferlist bl22;
+  bl22.append(attr2_buf, sizeof(attr2_buf));
+  ASSERT_EQ(0, ioctx.setxattr("foo_rmxattr", attr2, bl22));
+  ASSERT_EQ(0, ioctx.remove("foo_rmxattr"));
+  ASSERT_EQ(-ENOENT, ioctx.rmxattr("foo_rmxattr", attr2));
 }
 
 TEST_F(LibRadosIo, XattrIter) {
@@ -584,6 +644,15 @@ TEST_F(LibRadosIoECPP, ReadOpPP) {
   }
 
   {
+    bufferlist op_bl;
+    ObjectReadOperation op;
+    op.read(0, 0, NULL, NULL); //len=0 mean read the whole object data
+    ASSERT_EQ(0, ioctx.operate("foo", &op, &op_bl));
+    ASSERT_EQ(sizeof(buf), op_bl.length());
+    ASSERT_EQ(0, memcmp(op_bl.c_str(), buf, sizeof(buf)));
+  }
+
+  {
       bufferlist read_bl, op_bl;
       ObjectReadOperation op;
       op.read(0, sizeof(buf), &read_bl, NULL);
@@ -689,6 +758,25 @@ TEST_F(LibRadosIoECPP, ReadOpPP) {
   }
 }
 
+TEST_F(LibRadosIoECPP, SparseReadOpPP) {
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl;
+  bl.append(buf, sizeof(buf));
+  ASSERT_EQ(0, ioctx.write("foo", bl, sizeof(buf), 0));
+
+  {
+    std::map<uint64_t, uint64_t> extents;
+    bufferlist read_bl;
+    int rval = -1;
+    ObjectReadOperation op;
+    op.sparse_read(0, sizeof(buf), &extents, &read_bl, &rval);
+    ASSERT_EQ(0, ioctx.operate("foo", &op, nullptr));
+    ASSERT_EQ(0, rval);
+    assert_eq_sparse(bl, extents, read_bl);
+  }
+}
+
 TEST_F(LibRadosIoEC, RoundTrip) {
   char buf[128];
   char buf2[128];
@@ -697,6 +785,9 @@ TEST_F(LibRadosIoEC, RoundTrip) {
   memset(buf2, 0, sizeof(buf2));
   ASSERT_EQ((int)sizeof(buf2), rados_read(ioctx, "foo", buf2, sizeof(buf2), 0));
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
+
+  uint64_t off = 19;
+  ASSERT_EQ(-EOPNOTSUPP, rados_write(ioctx, "bar", buf, sizeof(buf), off));
 }
 
 TEST_F(LibRadosIoECPP, RoundTripPP) {
@@ -950,6 +1041,17 @@ TEST_F(LibRadosIoEC, RmXattr) {
       rados_setxattr(ioctx, "foo", attr1, attr1_buf, sizeof(attr1_buf)));
   ASSERT_EQ(0, rados_rmxattr(ioctx, "foo", attr1));
   ASSERT_EQ(-ENODATA, rados_getxattr(ioctx, "foo", attr1, buf, sizeof(buf)));
+
+  // Test rmxattr on a removed object
+  char buf2[128];
+  char attr2[] = "attr2";
+  char attr2_buf[] = "foo bar baz";
+  memset(buf2, 0xbb, sizeof(buf2));
+  ASSERT_EQ(0, rados_write(ioctx, "foo_rmxattr", buf2, sizeof(buf2), 0));
+  ASSERT_EQ(0,
+      rados_setxattr(ioctx, "foo_rmxattr", attr2, attr2_buf, sizeof(attr2_buf)));
+  ASSERT_EQ(0, rados_remove(ioctx, "foo_rmxattr"));
+  ASSERT_EQ(-ENOENT, rados_rmxattr(ioctx, "foo_rmxattr", attr2));
 }
 
 TEST_F(LibRadosIoECPP, RmXattrPP) {
@@ -966,6 +1068,20 @@ TEST_F(LibRadosIoECPP, RmXattrPP) {
   ASSERT_EQ(0, ioctx.rmxattr("foo", attr1));
   bufferlist bl3;
   ASSERT_EQ(-ENODATA, ioctx.getxattr("foo", attr1, bl3));
+
+  // Test rmxattr on a removed object
+  char buf2[128];
+  char attr2[] = "attr2";
+  char attr2_buf[] = "foo bar baz";
+  memset(buf2, 0xbb, sizeof(buf2));
+  bufferlist bl21;
+  bl21.append(buf, sizeof(buf));
+  ASSERT_EQ(0, ioctx.write("foo_rmxattr", bl21, sizeof(buf2), 0));
+  bufferlist bl22;
+  bl22.append(attr2_buf, sizeof(attr2_buf));
+  ASSERT_EQ(0, ioctx.setxattr("foo_rmxattr", attr2, bl22));
+  ASSERT_EQ(0, ioctx.remove("foo_rmxattr"));
+  ASSERT_EQ(-ENOENT, ioctx.rmxattr("foo_rmxattr", attr2));
 }
 
 TEST_F(LibRadosIoEC, XattrIter) {
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index dff12ef..4adaa6b 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -341,7 +341,7 @@ TEST_F(LibRadosMisc, Exec) {
   uint64_t all_features;
   ::decode(all_features, iter);
   // make sure *some* features are specified; don't care which ones
-  ASSERT_NE(all_features, 0);
+  ASSERT_NE(all_features, (unsigned)0);
 }
 
 TEST_F(LibRadosMiscPP, ExecPP) {
@@ -354,7 +354,7 @@ TEST_F(LibRadosMiscPP, ExecPP) {
   uint64_t all_features;
   ::decode(all_features, iter);
   // make sure *some* features are specified; don't care which ones
-  ASSERT_NE(all_features, 0);
+  ASSERT_NE(all_features, (unsigned)0);
 }
 
 TEST_F(LibRadosMiscPP, Operate1PP) {
@@ -610,18 +610,18 @@ TEST_F(LibRadosMiscPP, CopyPP) {
   {
     // pass future version
     ObjectWriteOperation op;
-    op.copy_from("foo", ioctx, uv + 1);
+    op.copy_from2("foo", ioctx, uv + 1, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
     ASSERT_EQ(-EOVERFLOW, ioctx.operate("foo.copy", &op));
   }
   {
     // pass old version
     ObjectWriteOperation op;
-    op.copy_from("foo", ioctx, uv - 1);
+    op.copy_from2("foo", ioctx, uv - 1, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
     ASSERT_EQ(-ERANGE, ioctx.operate("foo.copy", &op));
   }
   {
     ObjectWriteOperation op;
-    op.copy_from("foo", ioctx, uv);
+    op.copy_from2("foo", ioctx, uv, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
     ASSERT_EQ(0, ioctx.operate("foo.copy", &op));
 
     bufferlist bl2, x2;
@@ -634,7 +634,7 @@ TEST_F(LibRadosMiscPP, CopyPP) {
   // small object without a version
   {
     ObjectWriteOperation op;
-    op.copy_from("foo", ioctx, 0);
+    op.copy_from2("foo", ioctx, 0, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
     ASSERT_EQ(0, ioctx.operate("foo.copy2", &op));
 
     bufferlist bl2, x2;
@@ -655,7 +655,7 @@ TEST_F(LibRadosMiscPP, CopyPP) {
 
   {
     ObjectWriteOperation op;
-    op.copy_from("big", ioctx, ioctx.get_last_version());
+    op.copy_from2("big", ioctx, ioctx.get_last_version(), LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
     ASSERT_EQ(0, ioctx.operate("big.copy", &op));
 
     bufferlist bl2, x2;
@@ -667,7 +667,7 @@ TEST_F(LibRadosMiscPP, CopyPP) {
 
   {
     ObjectWriteOperation op;
-    op.copy_from("big", ioctx, 0);
+    op.copy_from2("big", ioctx, 0, LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL);
     ASSERT_EQ(0, ioctx.operate("big.copy2", &op));
 
     bufferlist bl2, x2;
@@ -678,6 +678,58 @@ TEST_F(LibRadosMiscPP, CopyPP) {
   }
 }
 
+class LibRadosTwoPoolsECPP : public RadosTestECPP
+{
+public:
+  LibRadosTwoPoolsECPP() {};
+  virtual ~LibRadosTwoPoolsECPP() {};
+protected:
+  static void SetUpTestCase() {
+    pool_name = get_temp_pool_name();
+    ASSERT_EQ("", create_one_ec_pool_pp(pool_name, s_cluster));
+    src_pool_name = get_temp_pool_name();
+    ASSERT_EQ(0, s_cluster.pool_create(src_pool_name.c_str()));
+  }
+  static void TearDownTestCase() {
+    ASSERT_EQ(0, s_cluster.pool_delete(src_pool_name.c_str()));
+    ASSERT_EQ(0, destroy_one_ec_pool_pp(pool_name, s_cluster));
+  }
+  static std::string src_pool_name;
+
+  virtual void SetUp() {
+    RadosTestECPP::SetUp();
+    ASSERT_EQ(0, cluster.ioctx_create(src_pool_name.c_str(), src_ioctx));
+    src_ioctx.set_namespace(nspace);
+  }
+  virtual void TearDown() {
+    // wait for maps to settle before next test
+    cluster.wait_for_latest_osdmap();
+
+    RadosTestECPP::TearDown();
+
+    cleanup_default_namespace(src_ioctx);
+    cleanup_namespace(src_ioctx, nspace);
+
+    src_ioctx.close();
+  }
+
+  librados::IoCtx src_ioctx;
+};
+std::string LibRadosTwoPoolsECPP::src_pool_name;
+
+//copy_from between ecpool and no-ecpool.
+TEST_F(LibRadosTwoPoolsECPP, CopyFrom) {
+  //create object w/ omapheader
+  bufferlist b;
+  b.append("copyfrom");
+  ASSERT_EQ(0, src_ioctx.omap_set_header("foo", b));
+
+  version_t uv = src_ioctx.get_last_version();
+  ObjectWriteOperation op;
+  op.copy_from("foo", src_ioctx, uv);
+  ASSERT_EQ(-EOPNOTSUPP, ioctx.operate("foo.copy", &op));
+}
+
 TEST_F(LibRadosMiscPP, CopyScrubPP) {
   bufferlist inbl, bl, x;
   for (int i=0; i<100; ++i)
diff --git a/src/test/librados/nlist.cc b/src/test/librados/nlist.cc
index 2ffbbda..19ff73f 100644
--- a/src/test/librados/nlist.cc
+++ b/src/test/librados/nlist.cc
@@ -687,3 +687,46 @@ TEST_F(LibRadosListECPP, ListObjectsStartPP) {
     ++p;
   }
 }
+
+TEST_F(LibRadosListPP, ListObjectsFilterPP) {
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist obj_content;
+  obj_content.append(buf, sizeof(buf));
+
+  std::string target_str = "content";
+
+  // Write xattr bare, no ::encod'ing
+  bufferlist target_val;
+  target_val.append(target_str);
+  bufferlist nontarget_val;
+  nontarget_val.append("rhubarb");
+
+  ASSERT_EQ(0, ioctx.write("has_xattr", obj_content, obj_content.length(), 0));
+  ASSERT_EQ(0, ioctx.write("has_wrong_xattr", obj_content, obj_content.length(), 0));
+  ASSERT_EQ(0, ioctx.write("no_xattr", obj_content, obj_content.length(), 0));
+
+  ASSERT_EQ(0, ioctx.setxattr("has_xattr", "theattr", target_val));
+  ASSERT_EQ(0, ioctx.setxattr("has_wrong_xattr", "theattr", nontarget_val));
+
+  bufferlist filter_bl;
+  std::string filter_name = "plain";
+  ::encode(filter_name, filter_bl);
+  ::encode("_theattr", filter_bl);
+  ::encode(target_str, filter_bl);
+
+  NObjectIterator iter(ioctx.nobjects_begin(filter_bl));
+  bool foundit = false;
+  int k = 0;
+  while (iter != ioctx.nobjects_end()) {
+    foundit = true;
+    // We should only see the object that matches the filter
+    ASSERT_EQ((*iter).get_oid(), "has_xattr");
+    // We should only see it once
+    ASSERT_EQ(k, 0);
+    ++iter;
+    ++k;
+  }
+  ASSERT_TRUE(foundit);
+}
+
diff --git a/src/test/librados/test.cc b/src/test/librados/test.cc
index f8a92a2..aac053a 100644
--- a/src/test/librados/test.cc
+++ b/src/test/librados/test.cc
@@ -10,6 +10,8 @@
 #include <string>
 #include <time.h>
 #include <unistd.h>
+#include <iostream>
+#include "gtest/gtest.h"
 
 using namespace librados;
 
@@ -255,3 +257,27 @@ int destroy_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
   cluster.shutdown();
   return ret;
 }
+
+void assert_eq_sparse(bufferlist& expected,
+                      const std::map<uint64_t, uint64_t>& extents,
+                      bufferlist& actual) {
+  auto i = expected.begin();
+  auto p = actual.begin();
+  uint64_t pos = 0;
+  for (auto extent : extents) {
+    const uint64_t start = extent.first;
+    const uint64_t end = start + extent.second;
+    for (; pos < end; ++i, ++pos) {
+      ASSERT_FALSE(i.end());
+      if (pos < start) {
+        // check the hole
+        ASSERT_EQ('\0', *i);
+      } else {
+        // then the extent
+        ASSERT_EQ(*i, *p);
+        ++p;
+      }
+    }
+  }
+  ASSERT_EQ(expected.length(), pos);
+}
diff --git a/src/test/librados/test.h b/src/test/librados/test.h
index 6cf522d..cd1f981 100644
--- a/src/test/librados/test.h
+++ b/src/test/librados/test.h
@@ -18,6 +18,7 @@
 #include "include/rados/librados.h"
 #include "include/rados/librados.hpp"
 
+#include <map>
 #include <string>
 #include <unistd.h>
 
@@ -35,6 +36,9 @@ int destroy_one_pool(const std::string &pool_name, rados_t *cluster);
 int destroy_one_ec_pool(const std::string &pool_name, rados_t *cluster);
 int destroy_one_pool_pp(const std::string &pool_name, librados::Rados &cluster);
 int destroy_one_ec_pool_pp(const std::string &pool_name, librados::Rados &cluster);
+void assert_eq_sparse(bufferlist& expected,
+                      const std::map<uint64_t, uint64_t>& extents,
+                      bufferlist& actual);
 
 class TestAlarm
 {
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index 1e94bdd..eb2db6b 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -786,10 +786,10 @@ TEST_F(LibRadosTwoPoolsPP, Evict) {
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
-      "fooberdoodle", completion, &op,
+      "foo", completion, &op,
       librados::OPERATION_IGNORE_CACHE, NULL));
     completion->wait_for_safe();
-    ASSERT_EQ(-ENOENT, completion->get_return_value());
+    ASSERT_EQ(0, completion->get_return_value());
     completion->release();
   }
   {
@@ -2216,7 +2216,6 @@ TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
     c->wait_for_complete();
     c->release();
 
-    ASSERT_TRUE(ls.size() <= count + 1);
     cout << " got ls " << ls << std::endl;
     if (!ls.empty()) {
       if (!first) {
@@ -2239,12 +2238,12 @@ TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
 
 TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) {
   // create object
-  {
+  for (int i=0; i<20; ++i) {
     bufferlist bl;
     bl.append("hi there");
     ObjectWriteOperation op;
     op.write_full(bl);
-    ASSERT_EQ(0, ioctx.operate("foo", &op));
+    ASSERT_EQ(0, ioctx.operate("foo" + stringify(i), &op));
   }
 
   // configure cache
@@ -2280,40 +2279,63 @@ TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) {
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
 
-  // 1st read, don't trigger a promote
-  utime_t start = ceph_clock_now(NULL);
-  {
-    bufferlist bl;
-    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
-  }
-  utime_t end = ceph_clock_now(NULL);
-  float dur = end - start;
-  cout << "duration " << dur << std::endl;
+  int fake = 0;  // set this to non-zero to test spurious promotion,
+		 // e.g. from thrashing
+  int attempt = 0;
+  string obj;
+  while (true) {
+    // 1st read, don't trigger a promote
+    obj = "foo" + stringify(attempt);
+    cout << obj << std::endl;
+    {
+      bufferlist bl;
+      ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0));
+      if (--fake >= 0) {
+	sleep(1);
+	ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0));
+	sleep(1);
+      }
+    }
 
-  // verify the object is NOT present in the cache tier
-  {
-    NObjectIterator it = cache_ioctx.nobjects_begin();
-    if (it != cache_ioctx.nobjects_end()) {
-      if (dur > 1.0) {
-	cout << " object got promoted, but read was slow, ignoring" << std::endl;
-      } else {
-	ASSERT_TRUE(it == cache_ioctx.nobjects_end());
+    // verify the object is NOT present in the cache tier
+    {
+      bool found = false;
+      NObjectIterator it = cache_ioctx.nobjects_begin();
+      while (it != cache_ioctx.nobjects_end()) {
+	cout << " see " << it->get_oid() << std::endl;
+	if (it->get_oid() == string(obj.c_str())) {
+	  found = true;
+	  break;
+	}
+	++it;
       }
+      if (!found)
+	break;
     }
+
+    ++attempt;
+    ASSERT_LE(attempt, 20);
+    cout << "hrm, object is present in cache on attempt " << attempt
+	 << ", retrying" << std::endl;
   }
 
   // Read until the object is present in the cache tier
+  cout << "verifying " << obj << " is eventually promoted" << std::endl;
   while (true) {
     bufferlist bl;
-    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0));
 
+    bool there = false;
     NObjectIterator it = cache_ioctx.nobjects_begin();
-    if (it != cache_ioctx.nobjects_end()) {
-      ASSERT_TRUE(it->get_oid() == string("foo"));
+    while (it != cache_ioctx.nobjects_end()) {
+      if (it->get_oid() == string(obj.c_str())) {
+	there = true;
+	break;
+      }
       ++it;
-      ASSERT_TRUE(it == cache_ioctx.nobjects_end());
-      break;
     }
+    if (there)
+      break;
 
     sleep(1);
   }
@@ -2988,10 +3010,10 @@ TEST_F(LibRadosTwoPoolsECPP, Evict) {
     op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
-      "fooberdoodle", completion, &op,
+      "foo", completion, &op,
       librados::OPERATION_IGNORE_CACHE, NULL));
     completion->wait_for_safe();
-    ASSERT_EQ(-ENOENT, completion->get_return_value());
+    ASSERT_EQ(0, completion->get_return_value());
     completion->release();
   }
   {
@@ -4264,7 +4286,6 @@ TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
     c->wait_for_complete();
     c->release();
 
-    ASSERT_TRUE(ls.size() <= count + 1);
     cout << " got ls " << ls << std::endl;
     if (!ls.empty()) {
       if (!first) {
diff --git a/src/test/librados/watch_notify.cc b/src/test/librados/watch_notify.cc
index 1d585e8..c424fd8 100644
--- a/src/test/librados/watch_notify.cc
+++ b/src/test/librados/watch_notify.cc
@@ -5,6 +5,7 @@
 #include "test/librados/TestCase.h"
 
 #include <errno.h>
+#include <fcntl.h>
 #include <semaphore.h>
 #include "gtest/gtest.h"
 #include "include/encoding.h"
@@ -21,12 +22,12 @@ typedef RadosTestECPP LibRadosWatchNotifyECPP;
 int notify_sleep = 0;
 
 // notify
-static sem_t sem;
+static sem_t *sem;
 
 static void watch_notify_test_cb(uint8_t opcode, uint64_t ver, void *arg)
 {
   std::cout << __func__ << std::endl;
-  sem_post(&sem);
+  sem_post(sem);
 }
 
 class WatchNotifyTestCtx : public WatchCtx
@@ -35,7 +36,7 @@ public:
     void notify(uint8_t opcode, uint64_t ver, bufferlist& bl)
     {
       std::cout << __func__ << std::endl;
-      sem_post(&sem);
+      sem_post(sem);
     }
 };
 
@@ -103,7 +104,7 @@ public:
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 TEST_F(LibRadosWatchNotify, WatchNotify) {
-  ASSERT_EQ(0, sem_init(&sem, 0, 0));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
@@ -112,18 +113,18 @@ TEST_F(LibRadosWatchNotify, WatchNotify) {
       rados_watch(ioctx, "foo", 0, &handle, watch_notify_test_cb, NULL));
   ASSERT_EQ(0, rados_notify(ioctx, "foo", 0, NULL, 0));
   TestAlarm alarm;
-  sem_wait(&sem);
+  sem_wait(sem);
   rados_unwatch(ioctx, "foo", handle);
 
   // when dne ...
   ASSERT_EQ(-ENOENT,
       rados_watch(ioctx, "dne", 0, &handle, watch_notify_test_cb, NULL));
 
-  sem_destroy(&sem);
+  sem_close(sem);
 }
 
 TEST_P(LibRadosWatchNotifyPP, WatchNotify) {
-  ASSERT_EQ(0, sem_init(&sem, 0, 0));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
@@ -138,13 +139,13 @@ TEST_P(LibRadosWatchNotifyPP, WatchNotify) {
   bufferlist bl2;
   ASSERT_EQ(0, ioctx.notify("foo", 0, bl2));
   TestAlarm alarm;
-  sem_wait(&sem);
+  sem_wait(sem);
   ioctx.unwatch("foo", handle);
-  sem_destroy(&sem);
+  sem_close(sem);
 }
 
 TEST_F(LibRadosWatchNotifyEC, WatchNotify) {
-  ASSERT_EQ(0, sem_init(&sem, 0, 0));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
@@ -153,13 +154,13 @@ TEST_F(LibRadosWatchNotifyEC, WatchNotify) {
       rados_watch(ioctx, "foo", 0, &handle, watch_notify_test_cb, NULL));
   ASSERT_EQ(0, rados_notify(ioctx, "foo", 0, NULL, 0));
   TestAlarm alarm;
-  sem_wait(&sem);
+  sem_wait(sem);
   rados_unwatch(ioctx, "foo", handle);
-  sem_destroy(&sem);
+  sem_close(sem);
 }
 
 TEST_F(LibRadosWatchNotifyECPP, WatchNotify) {
-  ASSERT_EQ(0, sem_init(&sem, 0, 0));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
@@ -174,15 +175,15 @@ TEST_F(LibRadosWatchNotifyECPP, WatchNotify) {
   bufferlist bl2;
   ASSERT_EQ(0, ioctx.notify("foo", 0, bl2));
   TestAlarm alarm;
-  sem_wait(&sem);
+  sem_wait(sem);
   ioctx.unwatch("foo", handle);
-  sem_destroy(&sem);
+  sem_close(sem);
 }
 
 // --
 
 TEST_P(LibRadosWatchNotifyPP, WatchNotifyTimeout) {
-  ASSERT_EQ(0, sem_init(&sem, 0, 0));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
   ioctx.set_notify_timeout(1);
   uint64_t handle;
   WatchNotifyTestCtx ctx;
@@ -194,12 +195,12 @@ TEST_P(LibRadosWatchNotifyPP, WatchNotifyTimeout) {
   ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx));
-  sem_destroy(&sem);
+  sem_close(sem);
   ASSERT_EQ(0, ioctx.unwatch("foo", handle));
 }
 
 TEST_F(LibRadosWatchNotifyECPP, WatchNotifyTimeout) {
-  ASSERT_EQ(0, sem_init(&sem, 0, 0));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
   ioctx.set_notify_timeout(1);
   uint64_t handle;
   WatchNotifyTestCtx ctx;
@@ -211,7 +212,7 @@ TEST_F(LibRadosWatchNotifyECPP, WatchNotifyTimeout) {
   ASSERT_EQ(0, ioctx.write("foo", bl1, sizeof(buf), 0));
 
   ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx));
-  sem_destroy(&sem);
+  sem_close(sem);
   ASSERT_EQ(0, ioctx.unwatch("foo", handle));
 }
 
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index 8efd6ac..2a0006e 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -1,6 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
+#include "test/librados_test_stub/LibradosTestStub.h"
 #include "include/rados/librados.hpp"
 #include "common/ceph_argparse.h"
 #include "common/common_init.h"
@@ -20,20 +21,13 @@
 #include <list>
 #include <vector>
 #include "include/assert.h"
+#include "include/compat.h"
 
 #define dout_subsys ceph_subsys_rados
 
 namespace {
 
-static void DeallocateRadosClient(librados::TestRadosClient* client)
-{
-  client->put();
-}
-
-} // anonymous namespace
-
-
-static librados::TestClassHandler *get_class_handler() {
+librados::TestClassHandler *get_class_handler() {
   static boost::shared_ptr<librados::TestClassHandler> s_class_handler;
   if (!s_class_handler) {
     s_class_handler.reset(new librados::TestClassHandler());
@@ -42,23 +36,7 @@ static librados::TestClassHandler *get_class_handler() {
   return s_class_handler.get();
 }
 
-static librados::TestRadosClient *get_rados_client() {
-  // TODO: use factory to allow tests to swap out impl
-  static boost::shared_ptr<librados::TestRadosClient> s_rados_client;
-  if (!s_rados_client) {
-    CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
-    CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0);
-    cct->_conf->parse_env();
-    cct->_conf->apply_changes(NULL);
-    s_rados_client.reset(new librados::TestMemRadosClient(cct),
-                         &DeallocateRadosClient);
-    cct->put();
-  }
-  s_rados_client->get();
-  return s_rados_client.get();
-}
-
-static void do_out_buffer(bufferlist& outbl, char **outbuf, size_t *outbuflen) {
+void do_out_buffer(bufferlist& outbl, char **outbuf, size_t *outbuflen) {
   if (outbuf) {
     if (outbl.length() > 0) {
       *outbuf = (char *)malloc(outbl.length());
@@ -72,7 +50,7 @@ static void do_out_buffer(bufferlist& outbl, char **outbuf, size_t *outbuflen) {
   }
 }
 
-static void do_out_buffer(string& outbl, char **outbuf, size_t *outbuflen) {
+void do_out_buffer(string& outbl, char **outbuf, size_t *outbuflen) {
   if (outbuf) {
     if (outbl.length() > 0) {
       *outbuf = (char *)malloc(outbl.length());
@@ -86,6 +64,40 @@ static void do_out_buffer(string& outbl, char **outbuf, size_t *outbuflen) {
   }
 }
 
+} // anonymous namespace
+
+namespace librados_test_stub {
+
+TestRadosClientPtr *rados_client() {
+  // force proper destruction order by delaying construction
+  static TestRadosClientPtr s_rados_client;
+  return &s_rados_client;
+}
+
+void set_rados_client(
+    const boost::shared_ptr<librados::TestRadosClient> &new_client) {
+  assert(new_client.get() != nullptr);
+  *rados_client() = new_client;
+}
+
+TestRadosClientPtr get_rados_client() {
+  // TODO: use factory to allow tests to swap out impl
+  TestRadosClientPtr *client = rados_client();
+  if (client->get() == nullptr) {
+    CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
+    CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0);
+    cct->_conf->parse_env();
+    cct->_conf->apply_changes(NULL);
+    client->reset(new librados::TestMemRadosClient(cct),
+                  &librados::TestRadosClient::Deallocate);
+    cct->put();
+  }
+  (*client)->get();
+  return *client;
+}
+
+} // namespace librados_test_stub
+
 extern "C" int rados_aio_create_completion(void *cb_arg,
                                            rados_callback_t cb_complete,
                                            rados_callback_t cb_safe,
@@ -158,10 +170,17 @@ extern "C" int rados_connect(rados_t cluster) {
 }
 
 extern "C" int rados_create(rados_t *cluster, const char * const id) {
-  *cluster = get_rados_client();
+  *cluster = librados_test_stub::get_rados_client().get();
   return 0;
 }
 
+extern "C" rados_config_t rados_ioctx_cct(rados_ioctx_t ioctx)
+{
+  librados::TestIoCtxImpl *ctx =
+    reinterpret_cast<librados::TestIoCtxImpl*>(ioctx);
+  return reinterpret_cast<rados_config_t>(ctx->get_rados_client()->cct());
+}
+
 extern "C" int rados_ioctx_create(rados_t cluster, const char *pool_name,
                                   rados_ioctx_t *ioctx) {
   librados::TestRadosClient *client =
@@ -336,6 +355,11 @@ int IoCtx::aio_flush_async(AioCompletion *c) {
 }
 
 int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
+                       ObjectReadOperation *op, bufferlist *pbl) {
+  return aio_operate(oid, c, op, 0, pbl);
+}
+
+int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
                        ObjectReadOperation *op, int flags,
                        bufferlist *pbl) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
@@ -385,7 +409,8 @@ void IoCtx::close() {
 
 int IoCtx::create(const std::string& oid, bool exclusive) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->create(oid, exclusive);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::create, _1, _2, exclusive));
 }
 
 void IoCtx::dup(const IoCtx& rhs) {
@@ -397,7 +422,9 @@ void IoCtx::dup(const IoCtx& rhs) {
 int IoCtx::exec(const std::string& oid, const char *cls, const char *method,
                 bufferlist& inbl, bufferlist& outbl) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->exec(oid, *get_class_handler(), cls, method, inbl, &outbl);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::exec, _1, _2, get_class_handler(), cls,
+                     method, inbl, &outbl, ctx->get_snap_context()));
 }
 
 void IoCtx::from_rados_ioctx_t(rados_ioctx_t p, IoCtx &io) {
@@ -430,13 +457,15 @@ std::string IoCtx::get_pool_name() {
 
 int IoCtx::list_snaps(const std::string& o, snap_set_t *out_snaps) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->list_snaps(o, out_snaps);
+  return ctx->execute_operation(
+    o, boost::bind(&TestIoCtxImpl::list_snaps, _1, _2, out_snaps));
 }
 
 int IoCtx::list_watchers(const std::string& o,
                          std::list<obj_watch_t> *out_watchers) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->list_watchers(o, out_watchers);
+  return ctx->execute_operation(
+    o, boost::bind(&TestIoCtxImpl::list_watchers, _1, _2, out_watchers));
 }
 
 int IoCtx::notify(const std::string& o, uint64_t ver, bufferlist& bl) {
@@ -461,7 +490,9 @@ int IoCtx::omap_get_vals(const std::string& oid,
                          uint64_t max_return,
                          std::map<std::string, bufferlist> *out_vals) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->omap_get_vals(oid, start_after, "", max_return, out_vals);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::omap_get_vals, _1, _2, start_after, "",
+                     max_return, out_vals));
 }
 
 int IoCtx::operate(const std::string& oid, ObjectWriteOperation *op) {
@@ -480,11 +511,14 @@ int IoCtx::operate(const std::string& oid, ObjectReadOperation *op,
 int IoCtx::read(const std::string& oid, bufferlist& bl, size_t len,
                 uint64_t off) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->read(oid, len, off, &bl);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::read, _1, _2, len, off, &bl));
 }
 
 int IoCtx::remove(const std::string& oid) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::remove, _1, _2));
   return ctx->remove(oid);
 }
 
@@ -517,12 +551,14 @@ void IoCtx::snap_set_read(snap_t seq) {
 
 int IoCtx::stat(const std::string& oid, uint64_t *psize, time_t *pmtime) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->stat(oid, psize, pmtime);;
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::stat, _1, _2, psize, pmtime));
 }
 
 int IoCtx::tmap_update(const std::string& oid, bufferlist& cmdbl) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->tmap_update(oid, cmdbl);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::tmap_update, _1, _2, cmdbl));
 }
 
 int IoCtx::unwatch2(uint64_t handle) {
@@ -550,12 +586,16 @@ int IoCtx::watch2(const std::string& o, uint64_t *handle,
 int IoCtx::write(const std::string& oid, bufferlist& bl, size_t len,
                  uint64_t off) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->write(oid, bl, len, off);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::write, _1, _2, bl, len, off,
+                     ctx->get_snap_context()));
 }
 
 int IoCtx::write_full(const std::string& oid, bufferlist& bl) {
   TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
-  return ctx->write_full(oid, bl);
+  return ctx->execute_operation(
+    oid, boost::bind(&TestIoCtxImpl::write_full, _1, _2, bl,
+                     ctx->get_snap_context()));
 }
 
 static int save_operation_result(int result, int *pval) {
@@ -588,8 +628,7 @@ void ObjectOperation::exec(const char *cls, const char *method,
                            bufferlist& inbl) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
   o->ops.push_back(boost::bind(&TestIoCtxImpl::exec, _1, _2,
-			       boost::ref(*get_class_handler()),
-			       cls, method, inbl, _3));
+			       get_class_handler(), cls, method, inbl, _3, _4));
 }
 
 void ObjectOperation::set_op_flags2(int flags) {
@@ -600,6 +639,18 @@ size_t ObjectOperation::size() {
   return o->ops.size();
 }
 
+void ObjectReadOperation::list_snaps(snap_set_t *out_snaps, int *prval) {
+  TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
+
+  ObjectOperationTestImpl op = boost::bind(&TestIoCtxImpl::list_snaps, _1, _2,
+                                           out_snaps);
+  if (prval != NULL) {
+    op = boost::bind(save_operation_result,
+                     boost::bind(op, _1, _2, _3, _4), prval);
+  }
+  o->ops.push_back(op);
+}
+
 void ObjectReadOperation::read(size_t off, uint64_t len, bufferlist *pbl,
                                int *prval) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
@@ -613,7 +664,7 @@ void ObjectReadOperation::read(size_t off, uint64_t len, bufferlist *pbl,
 
   if (prval != NULL) {
     op = boost::bind(save_operation_result,
-                     boost::bind(op, _1, _2, _3), prval);
+                     boost::bind(op, _1, _2, _3, _4), prval);
   }
   o->ops.push_back(op);
 }
@@ -632,7 +683,7 @@ void ObjectReadOperation::sparse_read(uint64_t off, uint64_t len,
 
   if (prval != NULL) {
     op = boost::bind(save_operation_result,
-                     boost::bind(op, _1, _2, _3), prval);
+                     boost::bind(op, _1, _2, _3, _4), prval);
   }
   o->ops.push_back(op);
 }
@@ -667,18 +718,18 @@ void ObjectWriteOperation::set_alloc_hint(uint64_t expected_object_size,
 
 void ObjectWriteOperation::truncate(uint64_t off) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
-  o->ops.push_back(boost::bind(&TestIoCtxImpl::truncate, _1, _2, off));
+  o->ops.push_back(boost::bind(&TestIoCtxImpl::truncate, _1, _2, off, _4));
 }
 
 void ObjectWriteOperation::write(uint64_t off, const bufferlist& bl) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
   o->ops.push_back(boost::bind(&TestIoCtxImpl::write, _1, _2, bl, bl.length(),
-			       off));
+			       off, _4));
 }
 
 void ObjectWriteOperation::write_full(const bufferlist& bl) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
-  o->ops.push_back(boost::bind(&TestIoCtxImpl::write_full, _1, _2, bl));
+  o->ops.push_back(boost::bind(&TestIoCtxImpl::write_full, _1, _2, bl, _4));
 }
 
 void ObjectWriteOperation::zero(uint64_t off, uint64_t len) {
@@ -1005,13 +1056,13 @@ int cls_cxx_write(cls_method_context_t hctx, int ofs, int len,
                   bufferlist *inbl) {
   librados::TestClassHandler::MethodContext *ctx =
     reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
-  return ctx->io_ctx_impl->write(ctx->oid, *inbl, len, ofs);
+  return ctx->io_ctx_impl->write(ctx->oid, *inbl, len, ofs, ctx->snapc);
 }
 
 int cls_cxx_write_full(cls_method_context_t hctx, bufferlist *inbl) {
   librados::TestClassHandler::MethodContext *ctx =
     reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
-  return ctx->io_ctx_impl->write_full(ctx->oid, *inbl);
+  return ctx->io_ctx_impl->write_full(ctx->oid, *inbl, ctx->snapc);
 }
 
 int cls_log(int level, const char *format, ...) {
diff --git a/src/test/librados_test_stub/LibradosTestStub.h b/src/test/librados_test_stub/LibradosTestStub.h
new file mode 100644
index 0000000..9fed68d
--- /dev/null
+++ b/src/test/librados_test_stub/LibradosTestStub.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRADOS_TEST_STUB_H
+#define LIBRADOS_TEST_STUB_H
+
+#include <boost/shared_ptr.hpp>
+
+namespace librados {
+class TestRadosClient;
+}
+
+namespace librados_test_stub {
+
+typedef boost::shared_ptr<librados::TestRadosClient> TestRadosClientPtr;
+
+void set_rados_client(const TestRadosClientPtr &rados_client);
+
+TestRadosClientPtr get_rados_client();
+
+} // namespace librados_test_stub
+
+#endif // LIBRADOS_TEST_STUB_H
diff --git a/src/test/librados_test_stub/TestClassHandler.cc b/src/test/librados_test_stub/TestClassHandler.cc
index 2732552..4f66e1e 100644
--- a/src/test/librados_test_stub/TestClassHandler.cc
+++ b/src/test/librados_test_stub/TestClassHandler.cc
@@ -2,10 +2,12 @@
 // vim: ts=8 sw=2 smarttab
 
 #include "test/librados_test_stub/TestClassHandler.h"
+#include "test/librados_test_stub/TestIoCtxImpl.h"
 #include <boost/algorithm/string/predicate.hpp>
 #include <dlfcn.h>
 #include <errno.h>
 #include "common/debug.h"
+#include "include/assert.h"
 
 #define dout_subsys ceph_subsys_rados
 
@@ -103,11 +105,19 @@ cls_method_cxx_call_t TestClassHandler::get_method(const std::string &cls,
 }
 
 TestClassHandler::SharedMethodContext TestClassHandler::get_method_context(
-    TestIoCtxImpl *io_ctx_impl, const std::string &oid) {
+    TestIoCtxImpl *io_ctx_impl, const std::string &oid,
+    const SnapContext &snapc) {
   SharedMethodContext ctx(new MethodContext());
-  ctx->io_ctx_impl = io_ctx_impl;
+
+  // clone to ioctx to provide a firewall for gmock expectations
+  ctx->io_ctx_impl = io_ctx_impl->clone();
   ctx->oid = oid;
+  ctx->snapc = snapc;
   return ctx;
 }
 
+TestClassHandler::MethodContext::~MethodContext() {
+  io_ctx_impl->put();
+}
+
 } // namespace librados
diff --git a/src/test/librados_test_stub/TestClassHandler.h b/src/test/librados_test_stub/TestClassHandler.h
index d921d41..e25db27 100644
--- a/src/test/librados_test_stub/TestClassHandler.h
+++ b/src/test/librados_test_stub/TestClassHandler.h
@@ -5,6 +5,7 @@
 #define CEPH_TEST_CLASS_HANDLER_H
 
 #include "objclass/objclass.h"
+#include "common/snap_types.h"
 #include <boost/shared_ptr.hpp>
 #include <list>
 #include <map>
@@ -22,8 +23,11 @@ public:
   ~TestClassHandler();
 
   struct MethodContext {
+    ~MethodContext();
+
     TestIoCtxImpl *io_ctx_impl;
     std::string oid;
+    SnapContext snapc;
   };
   typedef boost::shared_ptr<MethodContext> SharedMethodContext;
 
@@ -47,7 +51,8 @@ public:
   cls_method_cxx_call_t get_method(const std::string &cls,
                                    const std::string &method);
   SharedMethodContext get_method_context(TestIoCtxImpl *io_ctx_impl,
-                                         const std::string &oid);
+                                         const std::string &oid,
+                                         const SnapContext &snapc);
 
 private:
 
diff --git a/src/test/librados_test_stub/TestIoCtxImpl.cc b/src/test/librados_test_stub/TestIoCtxImpl.cc
index e376e63..30c1e13 100644
--- a/src/test/librados_test_stub/TestIoCtxImpl.cc
+++ b/src/test/librados_test_stub/TestIoCtxImpl.cc
@@ -18,9 +18,9 @@ TestIoCtxImpl::TestIoCtxImpl() : m_client(NULL) {
   get();
 }
 
-TestIoCtxImpl::TestIoCtxImpl(TestRadosClient &client, int64_t pool_id,
+TestIoCtxImpl::TestIoCtxImpl(TestRadosClient *client, int64_t pool_id,
                              const std::string& pool_name)
-  : m_client(&client), m_pool_id(pool_id), m_pool_name(pool_name),
+  : m_client(client), m_pool_id(pool_id), m_pool_name(pool_name),
     m_snap_seq(CEPH_NOSNAP)
 {
   m_client->get();
@@ -93,11 +93,12 @@ void TestIoCtxImpl::aio_flush_async(AioCompletionImpl *c) {
 int TestIoCtxImpl::aio_operate(const std::string& oid, TestObjectOperationImpl &ops,
                                AioCompletionImpl *c, SnapContext *snap_context,
                                int flags) {
-  // TODO ignoring snap_context and flags for now
+  // TODO flags for now
   ops.get();
   m_client->add_aio_operation(oid, true, boost::bind(
     &TestIoCtxImpl::execute_aio_operations, this, oid, &ops,
-    reinterpret_cast<bufferlist*>(NULL)), c);
+    reinterpret_cast<bufferlist*>(0),
+    snap_context != NULL ? *snap_context : m_snapc), c);
   return 0;
 }
 
@@ -108,20 +109,21 @@ int TestIoCtxImpl::aio_operate_read(const std::string& oid,
   // TODO ignoring flags for now
   ops.get();
   m_client->add_aio_operation(oid, true, boost::bind(
-    &TestIoCtxImpl::execute_aio_operations, this, oid, &ops, pbl), c);
+    &TestIoCtxImpl::execute_aio_operations, this, oid, &ops, pbl, m_snapc), c);
   return 0;
 }
 
-int TestIoCtxImpl::exec(const std::string& oid, TestClassHandler &handler,
+int TestIoCtxImpl::exec(const std::string& oid, TestClassHandler *handler,
                         const char *cls, const char *method,
-                        bufferlist& inbl, bufferlist* outbl) {
-  cls_method_cxx_call_t call = handler.get_method(cls, method);
+                        bufferlist& inbl, bufferlist* outbl,
+                        const SnapContext &snapc) {
+  cls_method_cxx_call_t call = handler->get_method(cls, method);
   if (call == NULL) {
     return -ENOSYS;
   }
 
   return (*call)(reinterpret_cast<cls_method_context_t>(
-    handler.get_method_context(this, oid).get()), &inbl, outbl);
+    handler->get_method_context(this, oid, snapc).get()), &inbl, outbl);
 }
 
 int TestIoCtxImpl::list_watchers(const std::string& o,
@@ -146,7 +148,7 @@ int TestIoCtxImpl::operate(const std::string& oid, TestObjectOperationImpl &ops)
   ops.get();
   m_client->add_aio_operation(oid, false, boost::bind(
     &TestIoCtxImpl::execute_aio_operations, this, oid, &ops,
-    reinterpret_cast<bufferlist*>(NULL)), comp);
+    reinterpret_cast<bufferlist*>(0), m_snapc), comp);
 
   comp->wait_for_safe();
   int ret = comp->get_return_value();
@@ -160,7 +162,8 @@ int TestIoCtxImpl::operate_read(const std::string& oid, TestObjectOperationImpl
 
   ops.get();
   m_client->add_aio_operation(oid, false, boost::bind(
-    &TestIoCtxImpl::execute_aio_operations, this, oid, &ops, pbl), comp);
+    &TestIoCtxImpl::execute_aio_operations, this, oid, &ops, pbl,
+    m_snapc), comp);
 
   comp->wait_for_complete();
   int ret = comp->get_return_value();
@@ -234,7 +237,7 @@ int TestIoCtxImpl::tmap_update(const std::string& oid, bufferlist& cmdbl) {
   bufferlist out;
   ::encode(tmap_header, out);
   ::encode(tmap, out);
-  r = write_full(oid, out);
+  r = write_full(oid, out, m_snapc);
   return r;
 }
 
@@ -248,12 +251,21 @@ int TestIoCtxImpl::watch(const std::string& o, uint64_t *handle,
                                             ctx2);
 }
 
+int TestIoCtxImpl::execute_operation(const std::string& oid,
+                                     const Operation &operation) {
+  TestRadosClient::Transaction transaction(m_client, oid);
+  return operation(this, oid);
+}
+
 int TestIoCtxImpl::execute_aio_operations(const std::string& oid,
                                           TestObjectOperationImpl *ops,
-                                          bufferlist *pbl) {
+                                          bufferlist *pbl,
+                                          const SnapContext &snapc) {
+  TestRadosClient::Transaction transaction(m_client, oid);
   int ret = 0;
-  for (ObjectOperations::iterator it = ops->ops.begin(); it != ops->ops.end(); ++it) {
-    ret = (*it)(this, oid, pbl);
+  for (ObjectOperations::iterator it = ops->ops.begin();
+       it != ops->ops.end(); ++it) {
+    ret = (*it)(this, oid, pbl, snapc);
     if (ret < 0) {
       break;
     }
diff --git a/src/test/librados_test_stub/TestIoCtxImpl.h b/src/test/librados_test_stub/TestIoCtxImpl.h
index d21fd66..450ee59 100644
--- a/src/test/librados_test_stub/TestIoCtxImpl.h
+++ b/src/test/librados_test_stub/TestIoCtxImpl.h
@@ -18,7 +18,8 @@ class TestRadosClient;
 
 typedef boost::function<int(TestIoCtxImpl*,
 			    const std::string&,
-			    bufferlist *)> ObjectOperationTestImpl;
+			    bufferlist *,
+                            const SnapContext &)> ObjectOperationTestImpl;
 typedef std::list<ObjectOperationTestImpl> ObjectOperations;
 
 struct TestObjectOperationImpl {
@@ -33,9 +34,11 @@ private:
 
 class TestIoCtxImpl {
 public:
+  typedef boost::function<int(TestIoCtxImpl *, const std::string &)> Operation;
+
 
   TestIoCtxImpl();
-  explicit TestIoCtxImpl(TestRadosClient &client, int64_t m_pool_id,
+  explicit TestIoCtxImpl(TestRadosClient *client, int64_t m_pool_id,
                          const std::string& pool_name);
 
   TestRadosClient *get_rados_client() {
@@ -45,6 +48,10 @@ public:
   void get();
   void put();
 
+  inline int64_t get_pool_id() const {
+    return m_pool_id;
+  }
+
   virtual TestIoCtxImpl *clone() = 0;
 
   virtual uint64_t get_instance_id() const;
@@ -55,6 +62,9 @@ public:
     return m_snap_seq;
   }
 
+  inline void set_snap_context(const SnapContext& snapc) {
+    m_snapc = snapc;
+  }
   const SnapContext &get_snap_context() const {
     return m_snapc;
   }
@@ -72,9 +82,10 @@ public:
   virtual int assert_exists(const std::string &oid) = 0;
 
   virtual int create(const std::string& oid, bool exclusive) = 0;
-  virtual int exec(const std::string& oid, TestClassHandler &handler,
+  virtual int exec(const std::string& oid, TestClassHandler *handler,
                    const char *cls, const char *method,
-                   bufferlist& inbl, bufferlist* outbl);
+                   bufferlist& inbl, bufferlist* outbl,
+                   const SnapContext &snapc);
   virtual int list_snaps(const std::string& o, snap_set_t *out_snaps) = 0;
   virtual int list_watchers(const std::string& o,
                             std::list<obj_watch_t> *out_watchers);
@@ -111,26 +122,32 @@ public:
                           std::map<uint64_t,uint64_t> *m,
                           bufferlist *data_bl) = 0;
   virtual int stat(const std::string& oid, uint64_t *psize, time_t *pmtime) = 0;
-  virtual int truncate(const std::string& oid, uint64_t size) = 0;
+  virtual int truncate(const std::string& oid, uint64_t size,
+                       const SnapContext &snapc) = 0;
   virtual int tmap_update(const std::string& oid, bufferlist& cmdbl);
   virtual int unwatch(uint64_t handle);
   virtual int watch(const std::string& o, uint64_t *handle,
                     librados::WatchCtx *ctx, librados::WatchCtx2 *ctx2);
   virtual int write(const std::string& oid, bufferlist& bl, size_t len,
-                    uint64_t off) = 0;
-  virtual int write_full(const std::string& oid, bufferlist& bl) = 0;
+                    uint64_t off, const SnapContext &snapc) = 0;
+  virtual int write_full(const std::string& oid, bufferlist& bl,
+                         const SnapContext &snapc) = 0;
   virtual int xattr_get(const std::string& oid,
                         std::map<std::string, bufferlist>* attrset) = 0;
   virtual int xattr_set(const std::string& oid, const std::string &name,
                         bufferlist& bl) = 0;
   virtual int zero(const std::string& oid, uint64_t off, uint64_t len) = 0;
 
+  int execute_operation(const std::string& oid,
+                        const Operation &operation);
+
 protected:
   TestIoCtxImpl(const TestIoCtxImpl& rhs);
   virtual ~TestIoCtxImpl();
 
-  int execute_aio_operations(const std::string& oid, TestObjectOperationImpl *ops,
-                             bufferlist *pbl);
+  int execute_aio_operations(const std::string& oid,
+                             TestObjectOperationImpl *ops,
+                             bufferlist *pbl, const SnapContext &snapc);
 
 private:
 
diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.cc b/src/test/librados_test_stub/TestMemIoCtxImpl.cc
index 7116c46..6b124ed 100644
--- a/src/test/librados_test_stub/TestMemIoCtxImpl.cc
+++ b/src/test/librados_test_stub/TestMemIoCtxImpl.cc
@@ -7,6 +7,7 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/bind.hpp>
 #include <errno.h>
+#include <include/compat.h>
 
 static void to_vector(const interval_set<uint64_t> &set,
                       std::vector<std::pair<uint64_t, uint64_t> > *vec) {
@@ -23,14 +24,21 @@ TestMemIoCtxImpl::TestMemIoCtxImpl() {
 }
 
 TestMemIoCtxImpl::TestMemIoCtxImpl(const TestMemIoCtxImpl& rhs)
-  : TestIoCtxImpl(rhs), m_client(rhs.m_client), m_pool(rhs.m_pool) {
-  }
+    : TestIoCtxImpl(rhs), m_client(rhs.m_client), m_pool(rhs.m_pool) {
+  m_pool->get();
+}
 
-TestMemIoCtxImpl::TestMemIoCtxImpl(TestMemRadosClient &client, int64_t pool_id,
+TestMemIoCtxImpl::TestMemIoCtxImpl(TestMemRadosClient *client, int64_t pool_id,
                                    const std::string& pool_name,
                                    TestMemRadosClient::Pool *pool)
-  : TestIoCtxImpl(client, pool_id, pool_name), m_client(&client), m_pool(pool) {
-  }
+    : TestIoCtxImpl(client, pool_id, pool_name), m_client(client),
+      m_pool(pool) {
+  m_pool->get();
+}
+
+TestMemIoCtxImpl::~TestMemIoCtxImpl() {
+  m_pool->put();
+}
 
 TestIoCtxImpl *TestMemIoCtxImpl::clone() {
   return new TestMemIoCtxImpl(*this);
@@ -45,7 +53,8 @@ int TestMemIoCtxImpl::aio_remove(const std::string& oid, AioCompletionImpl *c) {
 
 int TestMemIoCtxImpl::assert_exists(const std::string &oid) {
   RWLock::RLocker l(m_pool->file_lock);
-  TestMemRadosClient::SharedFile file = get_file(oid, false);
+  TestMemRadosClient::SharedFile file = get_file(oid, false,
+                                                 get_snap_context());
   if (file == NULL) {
     return -ENOENT;
   }
@@ -58,7 +67,7 @@ int TestMemIoCtxImpl::create(const std::string& oid, bool exclusive) {
   }
 
   RWLock::WLocker l(m_pool->file_lock);
-  get_file(oid, true);
+  get_file(oid, true, get_snap_context());
   return 0;
 }
 
@@ -140,7 +149,7 @@ int TestMemIoCtxImpl::omap_get_vals(const std::string& oid,
   TestMemRadosClient::SharedFile file;
   {
     RWLock::RLocker l(m_pool->file_lock);
-    file = get_file(oid, false);
+    file = get_file(oid, false, get_snap_context());
     if (file == NULL) {
       return -ENOENT;
     }
@@ -180,7 +189,7 @@ int TestMemIoCtxImpl::omap_rm_keys(const std::string& oid,
   TestMemRadosClient::SharedFile file;
   {
     RWLock::WLocker l(m_pool->file_lock);
-    file = get_file(oid, true);
+    file = get_file(oid, true, get_snap_context());
     if (file == NULL) {
       return -ENOENT;
     }
@@ -203,7 +212,7 @@ int TestMemIoCtxImpl::omap_set(const std::string& oid,
   TestMemRadosClient::SharedFile file;
   {
     RWLock::WLocker l(m_pool->file_lock);
-    file = get_file(oid, true);
+    file = get_file(oid, true, get_snap_context());
     if (file == NULL) {
       return -ENOENT;
     }
@@ -225,7 +234,7 @@ int TestMemIoCtxImpl::read(const std::string& oid, size_t len, uint64_t off,
   TestMemRadosClient::SharedFile file;
   {
     RWLock::RLocker l(m_pool->file_lock);
-    file = get_file(oid, false);
+    file = get_file(oid, false, get_snap_context());
     if (file == NULL) {
       return -ENOENT;
     }
@@ -250,11 +259,12 @@ int TestMemIoCtxImpl::remove(const std::string& oid) {
   }
 
   RWLock::WLocker l(m_pool->file_lock);
-  TestMemRadosClient::SharedFile file = get_file(oid, false);
+  TestMemRadosClient::SharedFile file = get_file(oid, false,
+                                                 get_snap_context());
   if (file == NULL) {
     return -ENOENT;
   }
-  file = get_file(oid, true);
+  file = get_file(oid, true, get_snap_context());
 
   RWLock::WLocker l2(file->lock);
   file->exists = false;
@@ -340,7 +350,7 @@ int TestMemIoCtxImpl::sparse_read(const std::string& oid, uint64_t off,
   TestMemRadosClient::SharedFile file;
   {
     RWLock::RLocker l(m_pool->file_lock);
-    file = get_file(oid, false);
+    file = get_file(oid, false, get_snap_context());
     if (file == NULL) {
       return -ENOENT;
     }
@@ -367,7 +377,7 @@ int TestMemIoCtxImpl::stat(const std::string& oid, uint64_t *psize,
   TestMemRadosClient::SharedFile file;
   {
     RWLock::RLocker l(m_pool->file_lock);
-    file = get_file(oid, false);
+    file = get_file(oid, false, get_snap_context());
     if (file == NULL) {
       return -ENOENT;
     }
@@ -383,7 +393,8 @@ int TestMemIoCtxImpl::stat(const std::string& oid, uint64_t *psize,
   return 0;
 }
 
-int TestMemIoCtxImpl::truncate(const std::string& oid, uint64_t size) {
+int TestMemIoCtxImpl::truncate(const std::string& oid, uint64_t size,
+                               const SnapContext &snapc) {
   if (get_snap_read() != CEPH_NOSNAP) {
     return -EROFS;
   }
@@ -391,7 +402,7 @@ int TestMemIoCtxImpl::truncate(const std::string& oid, uint64_t size) {
   TestMemRadosClient::SharedFile file;
   {
     RWLock::WLocker l(m_pool->file_lock);
-    file = get_file(oid, true);
+    file = get_file(oid, true, snapc);
   }
 
   RWLock::WLocker l(file->lock);
@@ -419,7 +430,7 @@ int TestMemIoCtxImpl::truncate(const std::string& oid, uint64_t size) {
 }
 
 int TestMemIoCtxImpl::write(const std::string& oid, bufferlist& bl, size_t len,
-                            uint64_t off) {
+                            uint64_t off, const SnapContext &snapc) {
   if (get_snap_read() != CEPH_NOSNAP) {
     return -EROFS;
   }
@@ -427,7 +438,7 @@ int TestMemIoCtxImpl::write(const std::string& oid, bufferlist& bl, size_t len,
   TestMemRadosClient::SharedFile file;
   {
     RWLock::WLocker l(m_pool->file_lock);
-    file = get_file(oid, true);
+    file = get_file(oid, true, snapc);
   }
 
   RWLock::WLocker l(file->lock);
@@ -443,7 +454,8 @@ int TestMemIoCtxImpl::write(const std::string& oid, bufferlist& bl, size_t len,
   return 0;
 }
 
-int TestMemIoCtxImpl::write_full(const std::string& oid, bufferlist& bl) {
+int TestMemIoCtxImpl::write_full(const std::string& oid, bufferlist& bl,
+                                 const SnapContext &snapc) {
   if (get_snap_read() != CEPH_NOSNAP) {
     return -EROFS;
   }
@@ -451,7 +463,7 @@ int TestMemIoCtxImpl::write_full(const std::string& oid, bufferlist& bl) {
   TestMemRadosClient::SharedFile file;
   {
     RWLock::WLocker l(m_pool->file_lock);
-    file = get_file(oid, true);
+    file = get_file(oid, true, snapc);
     if (file == NULL) {
       return -ENOENT;
     }
@@ -494,11 +506,11 @@ int TestMemIoCtxImpl::zero(const std::string& oid, uint64_t off, uint64_t len) {
   TestMemRadosClient::SharedFile file;
   {
     RWLock::WLocker l(m_pool->file_lock);
-    file = get_file(oid, false);
+    file = get_file(oid, false, get_snap_context());
     if (!file) {
       return 0;
     }
-    file = get_file(oid, true);
+    file = get_file(oid, true, get_snap_context());
 
     RWLock::RLocker l2(file->lock);
     if (len > 0 && off + len >= file->data.length()) {
@@ -507,12 +519,12 @@ int TestMemIoCtxImpl::zero(const std::string& oid, uint64_t off, uint64_t len) {
     }
   }
   if (truncate_redirect) {
-    return truncate(oid, off);
+    return truncate(oid, off, get_snap_context());
   }
 
   bufferlist bl;
   bl.append_zero(len);
-  return write(oid, bl, len, off);
+  return write(oid, bl, len, off, get_snap_context());
 }
 
 void TestMemIoCtxImpl::append_clone(bufferlist& src, bufferlist* dest) {
@@ -544,7 +556,7 @@ void TestMemIoCtxImpl::ensure_minimum_length(size_t len, bufferlist *bl) {
 }
 
 TestMemRadosClient::SharedFile TestMemIoCtxImpl::get_file(
-    const std::string &oid, bool write) {
+    const std::string &oid, bool write, const SnapContext &snapc) {
   assert(m_pool->file_lock.is_locked() || m_pool->file_lock.is_wlocked());
   assert(!write || m_pool->file_lock.is_wlocked());
 
@@ -557,7 +569,6 @@ TestMemRadosClient::SharedFile TestMemIoCtxImpl::get_file(
   }
 
   if (write) {
-    const SnapContext &snapc = get_snap_context();
     bool new_version = false;
     if (!file || !file->exists) {
       file = TestMemRadosClient::SharedFile(new TestMemRadosClient::File());
@@ -572,11 +583,13 @@ TestMemRadosClient::SharedFile TestMemIoCtxImpl::get_file(
           }
         }
 
-        uint64_t prev_size = file->data.length();
+        bufferlist prev_data = file->data;
         file = TestMemRadosClient::SharedFile(
           new TestMemRadosClient::File(*file));
-        if (prev_size > 0) {
-          file->snap_overlap.insert(0, prev_size);
+        file->data.clear();
+        append_clone(prev_data, &file->data);
+        if (prev_data.length() > 0) {
+          file->snap_overlap.insert(0, prev_data.length());
         }
         new_version = true;
       }
diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.h b/src/test/librados_test_stub/TestMemIoCtxImpl.h
index 68adf21..aa65415 100644
--- a/src/test/librados_test_stub/TestMemIoCtxImpl.h
+++ b/src/test/librados_test_stub/TestMemIoCtxImpl.h
@@ -12,9 +12,10 @@ namespace librados {
 class TestMemIoCtxImpl : public TestIoCtxImpl {
 public:
   TestMemIoCtxImpl();
-  explicit TestMemIoCtxImpl(TestMemRadosClient &client, int64_t m_pool_id,
-                            const std::string& pool_name,
-                            TestMemRadosClient::Pool *pool);
+  TestMemIoCtxImpl(TestMemRadosClient *client, int64_t m_pool_id,
+                   const std::string& pool_name,
+                   TestMemRadosClient::Pool *pool);
+  virtual ~TestMemIoCtxImpl();
 
   virtual TestIoCtxImpl *clone();
 
@@ -43,16 +44,23 @@ public:
   virtual int sparse_read(const std::string& oid, uint64_t off, uint64_t len,
                           std::map<uint64_t,uint64_t> *m, bufferlist *data_bl);
   virtual int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
-  virtual int truncate(const std::string& oid, uint64_t size);
+  virtual int truncate(const std::string& oid, uint64_t size,
+                       const SnapContext &snapc);
   virtual int write(const std::string& oid, bufferlist& bl, size_t len,
-                    uint64_t off);
-  virtual int write_full(const std::string& oid, bufferlist& bl);
+                    uint64_t off, const SnapContext &snapc);
+  virtual int write_full(const std::string& oid, bufferlist& bl,
+                         const SnapContext &snapc);
   virtual int xattr_get(const std::string& oid,
                         std::map<std::string, bufferlist>* attrset);
   virtual int xattr_set(const std::string& oid, const std::string &name,
                         bufferlist& bl);
   virtual int zero(const std::string& oid, uint64_t off, uint64_t len);
 
+protected:
+  TestMemRadosClient::Pool *get_pool() {
+    return m_pool;
+  }
+
 private:
   TestMemIoCtxImpl(const TestMemIoCtxImpl&);
 
@@ -63,7 +71,8 @@ private:
   size_t clip_io(size_t off, size_t len, size_t bl_len);
   void ensure_minimum_length(size_t len, bufferlist *bl);
 
-  TestMemRadosClient::SharedFile get_file(const std::string &oid, bool write);
+  TestMemRadosClient::SharedFile get_file(const std::string &oid, bool write,
+                                          const SnapContext &snapc);
 
 };
 
diff --git a/src/test/librados_test_stub/TestMemRadosClient.cc b/src/test/librados_test_stub/TestMemRadosClient.cc
index b89f4eb..6492d25 100644
--- a/src/test/librados_test_stub/TestMemRadosClient.cc
+++ b/src/test/librados_test_stub/TestMemRadosClient.cc
@@ -38,10 +38,7 @@ TestMemRadosClient::Pool::Pool()
 
 TestIoCtxImpl *TestMemRadosClient::create_ioctx(int64_t pool_id,
 						const std::string &pool_name) {
-  Pools::iterator iter = m_pools.find(pool_name);
-  assert(iter != m_pools.end());
-
-  return new TestMemIoCtxImpl(*this, pool_id, pool_name, iter->second);
+  return new TestMemIoCtxImpl(this, pool_id, pool_name, get_pool(pool_name));
 }
 
 void TestMemRadosClient::object_list(int64_t pool_id,
@@ -78,7 +75,7 @@ int TestMemRadosClient::pool_delete(const std::string &pool_name) {
   if (iter == m_pools.end()) {
     return -ENOENT;
   }
-  delete iter->second;
+  iter->second->put();
   m_pools.erase(iter);
   return 0;
 }
@@ -125,4 +122,11 @@ int TestMemRadosClient::blacklist_add(const std::string& client_address,
   return 0;
 }
 
+TestMemRadosClient::Pool *TestMemRadosClient::get_pool(
+    const std::string &pool_name) {
+  Pools::iterator iter = m_pools.find(pool_name);
+  assert(iter != m_pools.end());
+  return iter->second;
+}
+
 } // namespace librados
diff --git a/src/test/librados_test_stub/TestMemRadosClient.h b/src/test/librados_test_stub/TestMemRadosClient.h
index a6fb3bd..dada74e 100644
--- a/src/test/librados_test_stub/TestMemRadosClient.h
+++ b/src/test/librados_test_stub/TestMemRadosClient.h
@@ -6,8 +6,10 @@
 
 #include "test/librados_test_stub/TestRadosClient.h"
 #include "include/atomic.h"
+#include "include/assert.h"
 #include "include/buffer.h"
 #include "include/interval_set.h"
+#include "common/RefCountedObj.h"
 #include "common/RWLock.h"
 #include <boost/shared_ptr.hpp>
 #include <list>
@@ -48,7 +50,7 @@ public:
   typedef std::map<std::string, FileSnapshots> Files;
 
   typedef std::set<uint64_t> SnapSeqs;
-  struct Pool {
+  struct Pool : public RefCountedObject {
     Pool();
 
     int64_t pool_id;
@@ -85,6 +87,8 @@ public:
 protected:
   ~TestMemRadosClient();
 
+  Pool *get_pool(const std::string &pool_name);
+
 private:
 
   typedef std::map<std::string, Pool*>		Pools;
diff --git a/src/test/librados_test_stub/TestRadosClient.cc b/src/test/librados_test_stub/TestRadosClient.cc
index 925dc59..46437ac 100644
--- a/src/test/librados_test_stub/TestRadosClient.cc
+++ b/src/test/librados_test_stub/TestRadosClient.cc
@@ -84,7 +84,8 @@ private:
 
 TestRadosClient::TestRadosClient(CephContext *cct)
   : m_cct(cct->get()),
-    m_watch_notify(m_cct)
+    m_watch_notify(m_cct),
+    m_transaction_lock("TestRadosClient::m_transaction_lock")
 {
   get();
 
@@ -225,4 +226,21 @@ Finisher *TestRadosClient::get_finisher(const std::string &oid) {
   return m_finishers[h % m_finishers.size()];
 }
 
+void TestRadosClient::transaction_start(const std::string &oid) {
+  Mutex::Locker locker(m_transaction_lock);
+  while (m_transactions.count(oid)) {
+    m_transaction_cond.Wait(m_transaction_lock);
+  }
+  std::pair<std::set<std::string>::iterator, bool> result =
+    m_transactions.insert(oid);
+  assert(result.second);
+}
+
+void TestRadosClient::transaction_finish(const std::string &oid) {
+  Mutex::Locker locker(m_transaction_lock);
+  size_t count = m_transactions.erase(oid);
+  assert(count == 1);
+  m_transaction_cond.Signal();
+}
+
 } // namespace librados
diff --git a/src/test/librados_test_stub/TestRadosClient.h b/src/test/librados_test_stub/TestRadosClient.h
index a061105..ad0cf67 100644
--- a/src/test/librados_test_stub/TestRadosClient.h
+++ b/src/test/librados_test_stub/TestRadosClient.h
@@ -6,6 +6,8 @@
 
 #include "include/rados/librados.hpp"
 #include "common/config.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
 #include "include/atomic.h"
 #include "include/buffer.h"
 #include "test/librados_test_stub/TestWatchNotify.h"
@@ -13,6 +15,7 @@
 #include <boost/functional/hash.hpp>
 #include <list>
 #include <map>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -25,6 +28,11 @@ class TestIoCtxImpl;
 class TestRadosClient {
 public:
 
+  static void Deallocate(librados::TestRadosClient* client)
+  {
+    client->put();
+  }
+
   typedef boost::function<int()> AioFunction;
 
   struct Object {
@@ -33,6 +41,20 @@ public:
     std::string nspace;
   };
 
+  class Transaction {
+  public:
+    Transaction(TestRadosClient *rados_client, const std::string &oid)
+      : rados_client(rados_client), oid(oid) {
+      rados_client->transaction_start(oid);
+    }
+    ~Transaction() {
+      rados_client->transaction_finish(oid);
+    }
+  private:
+    TestRadosClient *rados_client;
+    std::string oid;
+  };
+
   TestRadosClient(CephContext *cct);
 
   void get();
@@ -93,6 +115,13 @@ private:
 
   TestWatchNotify m_watch_notify;
 
+  Mutex m_transaction_lock;
+  Cond m_transaction_cond;
+  std::set<std::string> m_transactions;
+
+  void transaction_start(const std::string &oid);
+  void transaction_finish(const std::string &oid);
+
 };
 
 } // namespace librados
diff --git a/src/test/libradosstriper/aio.cc b/src/test/libradosstriper/aio.cc
index 009976c..847e011 100644
--- a/src/test/libradosstriper/aio.cc
+++ b/src/test/libradosstriper/aio.cc
@@ -5,6 +5,7 @@
 #include "test/librados/test.h"
 #include "test/libradosstriper/TestCase.h"
 
+#include <fcntl.h>
 #include <semaphore.h>
 #include <errno.h>
 
@@ -16,14 +17,14 @@ class AioTestData
 {
 public:
   AioTestData() : m_complete(false), m_safe(false) {
-    sem_init(&m_sem, 0, 0);
+    m_sem = sem_open("test_libradosstriper_aio_sem", O_CREAT, 0644, 0);
   }
 
   ~AioTestData() {
-    sem_destroy(&m_sem);
+    sem_close(m_sem);
   }
 
-  sem_t m_sem;
+  sem_t *m_sem;
   bool m_complete;
   bool m_safe;
 };
@@ -32,14 +33,14 @@ void set_completion_complete(rados_completion_t cb, void *arg)
 {
   AioTestData *test = static_cast<AioTestData*>(arg);
   test->m_complete = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 void set_completion_safe(rados_completion_t cb, void *arg)
 {
   AioTestData *test = static_cast<AioTestData*>(arg);
   test->m_safe = true;
-  sem_post(&test->m_sem);
+  sem_post(test->m_sem);
 }
 
 TEST_F(StriperTest, SimpleWrite) {
@@ -51,8 +52,8 @@ TEST_F(StriperTest, SimpleWrite) {
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_striper_aio_write(striper, "StriperTest", my_completion, buf, sizeof(buf), 0));
   TestAlarm alarm;
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
 }
 
@@ -66,8 +67,8 @@ TEST_F(StriperTestPP, SimpleWritePP) {
   bl1.append(buf, sizeof(buf));
   ASSERT_EQ(0, striper.aio_write("SimpleWritePP", my_completion, bl1, sizeof(buf), 0));
   TestAlarm alarm;
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
 }
 
@@ -81,8 +82,8 @@ TEST_F(StriperTest, WaitForSafe) {
   ASSERT_EQ(0, rados_striper_aio_write(striper, "WaitForSafe", my_completion, buf, sizeof(buf), 0));
   TestAlarm alarm;
   rados_aio_wait_for_safe(my_completion);
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
 }
 
@@ -97,8 +98,8 @@ TEST_F(StriperTestPP, WaitForSafePP) {
   ASSERT_EQ(0, striper.aio_write("WaitForSafePP", my_completion, bl1, sizeof(buf), 0));
   TestAlarm alarm;
   my_completion->wait_for_safe();
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
 }
 
@@ -112,8 +113,8 @@ TEST_F(StriperTest, RoundTrip) {
   ASSERT_EQ(0, rados_striper_aio_write(striper, "RoundTrip", my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
@@ -126,8 +127,8 @@ TEST_F(StriperTest, RoundTrip) {
     rados_aio_wait_for_complete(my_completion2);
   }
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
 }
@@ -142,8 +143,8 @@ TEST_F(StriperTest, RoundTrip2) {
   ASSERT_EQ(0, rados_striper_aio_write(striper, "RoundTrip2", my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
@@ -156,8 +157,8 @@ TEST_F(StriperTest, RoundTrip2) {
     rados_aio_wait_for_safe(my_completion2);
   }
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
 }
@@ -173,8 +174,8 @@ TEST_F(StriperTestPP, RoundTripPP) {
   ASSERT_EQ(0, striper.aio_write("RoundTripPP", my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   bufferlist bl2;
   AioCompletion *my_completion2 = librados::Rados::aio_create_completion
@@ -185,8 +186,8 @@ TEST_F(StriperTestPP, RoundTripPP) {
     my_completion2->wait_for_complete();
   }
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
   my_completion2->release();
 }
@@ -202,8 +203,8 @@ TEST_F(StriperTestPP, RoundTripPP2) {
   ASSERT_EQ(0, striper.aio_write("RoundTripPP2", my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   bufferlist bl2;
   AioCompletion *my_completion2 = librados::Rados::aio_create_completion
@@ -214,8 +215,8 @@ TEST_F(StriperTestPP, RoundTripPP2) {
     my_completion2->wait_for_safe();
   }
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
   my_completion2->release();
 }
@@ -230,8 +231,8 @@ TEST_F(StriperTest, IsComplete) {
   ASSERT_EQ(0, rados_striper_aio_write(striper, "IsComplete", my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
@@ -250,8 +251,8 @@ TEST_F(StriperTest, IsComplete) {
     }
   }
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
 }
@@ -267,8 +268,8 @@ TEST_F(StriperTestPP, IsCompletePP) {
   ASSERT_EQ(0, striper.aio_write("IsCompletePP", my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(&test_data.m_sem);
-    sem_wait(&test_data.m_sem);
+    sem_wait(test_data.m_sem);
+    sem_wait(test_data.m_sem);
   }
   bufferlist bl2;
   AioCompletion *my_completion2 = librados::Rados::aio_create_completion
@@ -285,8 +286,8 @@ TEST_F(StriperTestPP, IsCompletePP) {
     }
   }
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
   my_completion2->release();
 }
@@ -320,8 +321,8 @@ TEST_F(StriperTest, IsSafe) {
     rados_aio_wait_for_complete(my_completion2);
   }
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
 }
@@ -354,8 +355,8 @@ TEST_F(StriperTestPP, IsSafePP) {
     my_completion2->wait_for_complete();
   }
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
   my_completion2->release();
 }
@@ -393,8 +394,8 @@ TEST_F(StriperTest, RoundTripAppend) {
   ASSERT_EQ((int)(sizeof(buf) + sizeof(buf2)), rados_aio_get_return_value(my_completion3));
   ASSERT_EQ(0, memcmp(buf3, buf, sizeof(buf)));
   ASSERT_EQ(0, memcmp(buf3 + sizeof(buf), buf2, sizeof(buf2)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
   rados_aio_release(my_completion3);
@@ -435,8 +436,8 @@ TEST_F(StriperTestPP, RoundTripAppendPP) {
   ASSERT_EQ(sizeof(buf) + sizeof(buf2), (unsigned)my_completion3->get_return_value());
   ASSERT_EQ(0, memcmp(bl3.c_str(), buf, sizeof(buf)));
   ASSERT_EQ(0, memcmp(bl3.c_str() + sizeof(buf), buf2, sizeof(buf2)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
   my_completion2->release();
   my_completion3->release();
@@ -462,8 +463,8 @@ TEST_F(StriperTest, Flush) {
     rados_aio_wait_for_complete(my_completion2);
   }
   ASSERT_EQ(0, memcmp(buf, buf2, sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
 }
@@ -487,8 +488,8 @@ TEST_F(StriperTestPP, FlushPP) {
     my_completion2->wait_for_complete();
   }
   ASSERT_EQ(0, memcmp(buf, bl2.c_str(), sizeof(buf)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
   my_completion2->release();
 }
@@ -525,8 +526,8 @@ TEST_F(StriperTest, RoundTripWriteFull) {
   }
   ASSERT_EQ(sizeof(buf2), (unsigned)rados_aio_get_return_value(my_completion3));
   ASSERT_EQ(0, memcmp(buf3, buf2, sizeof(buf2)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   rados_aio_release(my_completion);
   rados_aio_release(my_completion2);
   rados_aio_release(my_completion3);
@@ -566,8 +567,8 @@ TEST_F(StriperTestPP, RoundTripWriteFullPP) {
   }
   ASSERT_EQ(sizeof(buf2), (unsigned)my_completion3->get_return_value());
   ASSERT_EQ(0, memcmp(bl3.c_str(), buf2, sizeof(buf2)));
-  sem_wait(&test_data.m_sem);
-  sem_wait(&test_data.m_sem);
+  sem_wait(test_data.m_sem);
+  sem_wait(test_data.m_sem);
   my_completion->release();
   my_completion2->release();
   my_completion3->release();
diff --git a/src/test/libradosstriper/rados-striper.sh b/src/test/libradosstriper/rados-striper.sh
new file mode 100755
index 0000000..df0a837
--- /dev/null
+++ b/src/test/libradosstriper/rados-striper.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+#
+# Copyright (C) 2014 Red Hat <contact at redhat.com>
+#
+# Author: Sebastien Ponce <sebastien.ponce at cern.ch>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+source ../qa/workunits/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7116"
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    # setup
+    setup $dir || return 1
+
+    # create a cluster with one monitor and three osds
+    run_mon $dir a || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+
+    # create toyfile
+    dd if=/dev/urandom of=$dir/toyfile bs=1234 count=1
+    
+    # put a striped object
+    rados --pool rbd --striper put toyfile $dir/toyfile || return 1
+    
+    # stat it, with and without striping
+    rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1
+    rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1
+    echo ' size 1234' > $dir/refstat
+    diff -w $dir/stripedStat $dir/refstat || return 1
+    diff -w $dir/stat $dir/refstat || return 1
+    rados --pool rbd stat toyfile >& $dir/staterror
+    grep -q 'No such file or directory' $dir/staterror ||  return 1
+    
+    # get the file back with and without striping
+    rados --pool rbd --striper get toyfile $dir/stripedGroup || return 1
+    diff -w $dir/toyfile $dir/stripedGroup || return 1
+    rados --pool rbd get toyfile.0000000000000000 $dir/nonSTripedGroup || return 1
+    diff -w $dir/toyfile $dir/nonSTripedGroup || return 1
+    
+    # test truncate
+    rados --pool rbd --striper truncate toyfile 12
+    rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1
+    rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1
+    echo ' size 12' > $dir/reftrunc
+    diff -w $dir/stripedStat $dir/reftrunc || return 1
+    diff -w $dir/stat $dir/reftrunc || return 1
+    
+    # test xattrs
+    rados --pool rbd --striper setxattr toyfile somexattr somevalue || return 1
+    rados --pool rbd --striper getxattr toyfile somexattr > $dir/xattrvalue || return 1 
+    rados --pool rbd getxattr toyfile.0000000000000000 somexattr > $dir/xattrvalue2 || return 1 
+    echo 'somevalue' > $dir/refvalue
+    diff -w $dir/xattrvalue $dir/refvalue || return 1
+    diff -w $dir/xattrvalue2 $dir/refvalue || return 1
+    rados --pool rbd --striper listxattr toyfile > $dir/xattrlist || return 1
+    echo 'somexattr' > $dir/reflist
+    diff -w $dir/xattrlist $dir/reflist || return 1
+    rados --pool rbd listxattr toyfile.0000000000000000 | grep -v striper > $dir/xattrlist2 || return 1
+    diff -w $dir/xattrlist2 $dir/reflist || return 1    
+    rados --pool rbd --striper rmxattr toyfile somexattr || return 1
+    rados --pool rbd --striper getxattr toyfile somexattr >& $dir/rmxattrerror
+    grep -q 'No data available' $dir/rmxattrerror || return 1
+    rados --pool rbd getxattr toyfile.0000000000000000 somexattr >& $dir/rmxattrerror2
+    grep -q 'No data available' $dir/rmxattrerror2 || return 1
+    
+    # test rm
+    rados --pool rbd --striper rm toyfile || return 1
+    rados --pool rbd --striper stat toyfile >& $dir/staterror2
+    grep -q 'No such file or directory' $dir/staterror2 || return 1
+    rados --pool rbd stat toyfile.0000000000000000 >& $dir/staterror3
+    grep -q 'No such file or directory' $dir/staterror3 || return 1
+
+    # cleanup
+    teardown $dir || return 1
+}
+
+main rados-striper "$@"
diff --git a/src/test/libradosstriper/striping.cc b/src/test/libradosstriper/striping.cc
index 9b26329..e1b5801 100644
--- a/src/test/libradosstriper/striping.cc
+++ b/src/test/libradosstriper/striping.cc
@@ -1,5 +1,7 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
 // vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
 #include "include/rados/librados.h"
 #include "include/rados/librados.hpp"
 #include "include/radosstriper/libradosstriper.h"
@@ -230,7 +232,7 @@ TEST_P(StriperTestRT, StripedRoundtrip) {
     // recreate object
     ASSERT_EQ(0, striper.write(soid, bl1, testData.size*2, 0));
     // remove the object size attribute from the striped object
-    std::string firstOid = getObjName(soid, 0);
+    char* firstOid = getObjName(soid, 0);
     ASSERT_EQ(0, ioctx.rmxattr(firstOid, "striper.size"));
     // check that stat fails
     uint64_t size;
@@ -244,6 +246,7 @@ TEST_P(StriperTestRT, StripedRoundtrip) {
          object_nb++) {
       ASSERT_EQ(-ENOENT, ioctx.stat(getObjName(soid, object_nb), &size, &mtime));
     }
+    free(firstOid);
   }
 }
 
diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc
index 2465417..de40625 100644
--- a/src/test/librbd/fsx.cc
+++ b/src/test/librbd/fsx.cc
@@ -2,7 +2,7 @@
 /*
  *	Copyright (C) 1991, NeXT Computer, Inc.  All Rights Reserverd.
  *
- *	File:	fsx.c
+ *	File:	fsx.cc
  *	Author:	Avadis Tevanian, Jr.
  *
  *	File system exerciser. 
@@ -481,7 +481,9 @@ __librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
 	uint64_t features = RBD_FEATURES_ALL;
 	if (krbd) {
 		features &= ~(RBD_FEATURE_EXCLUSIVE_LOCK |
-		              RBD_FEATURE_OBJECT_MAP);
+		              RBD_FEATURE_OBJECT_MAP     |
+                              RBD_FEATURE_FAST_DIFF      |
+                              RBD_FEATURE_DEEP_FLATTEN);
 	}
 	ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
 			 dst_imagename, features, order,
@@ -625,7 +627,7 @@ krbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
 }
 
 int
-__krbd_flush(struct rbd_ctx *ctx)
+__krbd_flush(struct rbd_ctx *ctx, bool invalidate)
 {
 	int ret;
 
@@ -633,13 +635,26 @@ __krbd_flush(struct rbd_ctx *ctx)
 		return 0;
 
 	/*
-	 * fsync(2) on the block device does not sync the filesystem
-	 * mounted on top of it, but that's OK - we control the entire
-	 * lifetime of the block device and write directly to it.
+	 * BLKFLSBUF will sync the filesystem on top of the device (we
+	 * don't care about that here, since we write directly to it),
+	 * write out any dirty buffers and invalidate the buffer cache.
+	 * It won't do a hardware cache flush.
+	 *
+	 * fsync() will write out any dirty buffers and do a hardware
+	 * cache flush (which we don't care about either, because for
+	 * krbd it's a noop).  It won't try to empty the buffer cache
+	 * nor poke the filesystem before writing out.
+	 *
+	 * Given that, for our purposes, fsync is a flush, while
+	 * BLKFLSBUF is a flush+invalidate.
 	 */
-	if (fsync(ctx->krbd_fd) < 0) {
+        if (invalidate)
+		ret = ioctl(ctx->krbd_fd, BLKFLSBUF, NULL);
+	else
+		ret = fsync(ctx->krbd_fd);
+	if (ret < 0) {
 		ret = -errno;
-		prt("fsync failed\n");
+		prt("%s failed\n", invalidate ? "BLKFLSBUF" : "fsync");
 		return ret;
 	}
 
@@ -649,7 +664,7 @@ __krbd_flush(struct rbd_ctx *ctx)
 int
 krbd_flush(struct rbd_ctx *ctx)
 {
-	return __krbd_flush(ctx);
+	return __krbd_flush(ctx, false);
 }
 
 int
@@ -659,6 +674,26 @@ krbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
 	int ret;
 
 	/*
+	 * BLKDISCARD goes straight to disk and doesn't do anything
+	 * about dirty buffers.  This means we need to flush so that
+	 *
+	 *   write 0..3M
+	 *   discard 1..2M
+	 *
+	 * results in "data 0000 data" rather than "data data data" on
+	 * disk and invalidate so that
+	 *
+	 *   discard 1..2M
+	 *   read 0..3M
+	 *
+	 * returns "data 0000 data" rather than "data data data" in
+	 * case 1..2M was cached.
+	 */
+	ret = __krbd_flush(ctx, true);
+	if (ret < 0)
+		return ret;
+
+	/*
 	 * off and len must be 512-byte aligned, otherwise BLKDISCARD
 	 * will fail with -EINVAL.  This means that -K (enable krbd
 	 * mode) requires -h 512 or similar.
@@ -676,10 +711,9 @@ int
 krbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
 {
 	uint64_t bytes;
-	int ret;
 
 	if (ioctl(ctx->krbd_fd, BLKGETSIZE64, &bytes) < 0) {
-		ret = -errno;
+		int ret = -errno;
 		prt("BLKGETSIZE64 failed\n");
 		return ret;
 	}
@@ -697,13 +731,20 @@ krbd_resize(struct rbd_ctx *ctx, uint64_t size)
 	assert(size % truncbdy == 0);
 
 	/*
-	 * This is essential: when krbd detects a size change, it calls
-	 * revalidate_disk(), which ends up calling invalidate_bdev(),
-	 * which invalidates only clean buffers.  The cache flush makes
-	 * it invalidate everything, which is what we need if we are
-	 * shrinking.
+	 * When krbd detects a size change, it calls revalidate_disk(),
+	 * which ends up calling invalidate_bdev(), which invalidates
+	 * clean pages and does nothing about dirty pages beyond the
+	 * new size.  The preceding cache flush makes sure those pages
+	 * are invalidated, which is what we need on shrink so that
+	 *
+	 *  write 0..1M
+	 *  resize 0
+	 *  resize 2M
+	 *  read 0..2M
+	 *
+	 * returns "0000 0000" rather than "data 0000".
 	 */
-	ret = __krbd_flush(ctx);
+	ret = __krbd_flush(ctx, false);
 	if (ret < 0)
 		return ret;
 
@@ -717,7 +758,7 @@ krbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
 {
 	int ret;
 
-	ret = __krbd_flush(ctx);
+	ret = __krbd_flush(ctx, false);
 	if (ret < 0)
 		return ret;
 
@@ -730,7 +771,7 @@ krbd_flatten(struct rbd_ctx *ctx)
 {
 	int ret;
 
-	ret = __krbd_flush(ctx);
+	ret = __krbd_flush(ctx, false);
 	if (ret < 0)
 		return ret;
 
@@ -950,25 +991,22 @@ report_failure(int status)
 void
 check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size)
 {
-	unsigned char c, t;
-	unsigned i = 0;
-	unsigned n = 0;
-	unsigned op = 0;
-	unsigned bad = 0;
-
 	if (memcmp(good_buf + offset, temp_buf, size) != 0) {
+		unsigned i = 0;
+		unsigned n = 0;
+
 		prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
 		    offset, size, iname);
 		prt("OFFSET\tGOOD\tBAD\tRANGE\n");
 		while (size > 0) {
-			c = good_buf[offset];
-			t = temp_buf[i];
+			unsigned char c = good_buf[offset];
+			unsigned char t = temp_buf[i];
 			if (c != t) {
 			        if (n < 16) {
-					bad = short_at(&temp_buf[i]);
+					unsigned bad = short_at(&temp_buf[i]);
 				        prt("0x%5x\t0x%04x\t0x%04x", offset,
 				            short_at(&good_buf[offset]), bad);
-					op = temp_buf[offset & 1 ? i+1 : i];
+					unsigned op = temp_buf[offset & 1 ? i+1 : i];
 				        prt("\t0x%5x\n", n);
 					if (op)
 						prt("operation# (mod 256) for "
@@ -1978,7 +2016,7 @@ main(int argc, char **argv)
 		case 'b':
 			simulatedopcount = getnum(optarg, &endp);
 			if (!quiet)
-				fprintf(stdout, "Will begin at operation %ld\n",
+				fprintf(stdout, "Will begin at operation %lu\n",
 					simulatedopcount);
 			if (simulatedopcount == 0)
 				usage();
@@ -2005,9 +2043,12 @@ main(int argc, char **argv)
 				usage();
 			break;
 		case 'l':
-			maxfilelen = getnum(optarg, &endp);
-			if (maxfilelen <= 0)
-				usage();
+			{
+				int _num = getnum(optarg, &endp);
+				if (_num <= 0)
+					usage();
+				maxfilelen = _num;
+			}
 			break;
 		case 'm':
 			monitorstart = getnum(optarg, &endp);
@@ -2108,7 +2149,8 @@ main(int argc, char **argv)
 				prt("file name to long\n");
 				exit(1);
 			}
-			strncpy(logfile, dirpath, sizeof(logfile));
+			strncpy(logfile, dirpath, sizeof(logfile)-1);
+			logfile[sizeof(logfile)-1] = '\0';
 			if (strlen(logfile) < sizeof(logfile)-2) {
 				strcat(logfile, "/");
 			} else {
diff --git a/src/test/librbd/mock/MockContextWQ.h b/src/test/librbd/mock/MockContextWQ.h
new file mode 100644
index 0000000..a690d4e
--- /dev/null
+++ b/src/test/librbd/mock/MockContextWQ.h
@@ -0,0 +1,17 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_CONTEXT_WQ_H
+#define CEPH_TEST_LIBRBD_MOCK_CONTEXT_WQ_H
+
+#include "gmock/gmock.h"
+
+namespace librbd {
+
+struct MockContextWQ {
+  MOCK_METHOD2(queue, void(Context *, int r));
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_CONTEXT_WQ_H
diff --git a/src/test/librbd/mock/MockImageCtx.h b/src/test/librbd/mock/MockImageCtx.h
new file mode 100644
index 0000000..53d6fd0
--- /dev/null
+++ b/src/test/librbd/mock/MockImageCtx.h
@@ -0,0 +1,112 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H
+#define CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H
+
+#include "test/librbd/mock/MockContextWQ.h"
+#include "test/librbd/mock/MockImageWatcher.h"
+#include "test/librbd/mock/MockObjectMap.h"
+#include "common/RWLock.h"
+#include "librbd/ImageCtx.h"
+#include "gmock/gmock.h"
+
+namespace librbd {
+
+struct MockImageCtx {
+  MockImageCtx(librbd::ImageCtx &image_ctx)
+    : image_ctx(&image_ctx),
+      cct(image_ctx.cct),
+      snapc(image_ctx.snapc),
+      snaps(image_ctx.snaps),
+      snap_info(image_ctx.snap_info),
+      old_format(image_ctx.old_format),
+      read_only(image_ctx.read_only),
+      owner_lock("owner_lock"),
+      md_lock("md_lock"),
+      snap_lock("snap_lock"),
+      parent_lock("parent_lock"),
+      object_map_lock("object_map_lock"),
+      async_ops_lock("async_ops_lock"),
+      size(image_ctx.size),
+      features(image_ctx.features),
+      header_oid(image_ctx.header_oid),
+      id(image_ctx.id),
+      parent_md(image_ctx.parent_md),
+      aio_work_queue(new MockContextWQ()),
+      op_work_queue(new MockContextWQ()),
+      image_watcher(NULL),
+      concurrent_management_ops(image_ctx.concurrent_management_ops)
+  {
+    md_ctx.dup(image_ctx.md_ctx);
+    data_ctx.dup(image_ctx.data_ctx);
+
+    if (image_ctx.image_watcher != NULL) {
+      image_watcher = new MockImageWatcher();
+    }
+  }
+
+  ~MockImageCtx() {
+    delete image_watcher;
+    delete op_work_queue;
+    delete aio_work_queue;
+  }
+
+  MOCK_CONST_METHOD1(get_snap_id, librados::snap_t(std::string in_snap_name));
+  MOCK_CONST_METHOD1(get_snap_info, const SnapInfo*(librados::snap_t));
+  MOCK_CONST_METHOD2(get_parent_spec, int(librados::snap_t in_snap_id,
+                                          parent_spec *pspec));
+
+  MOCK_CONST_METHOD2(is_snap_protected, int(librados::snap_t in_snap_id,
+                                            bool *is_protected));
+  MOCK_CONST_METHOD2(is_snap_unprotected, int(librados::snap_t in_snap_id,
+                                              bool *is_unprotected));
+
+  MOCK_METHOD6(add_snap, void(std::string in_snap_name, librados::snap_t id,
+                              uint64_t in_size, parent_info parent,
+                              uint8_t protection_status, uint64_t flags));
+  MOCK_METHOD2(rm_snap, void(std::string in_snap_name, librados::snap_t id));
+  MOCK_METHOD1(flush, void(Context *));
+
+  ImageCtx *image_ctx;
+  CephContext *cct;
+
+  ::SnapContext snapc;
+  std::vector<librados::snap_t> snaps;
+  std::map<librados::snap_t, SnapInfo> snap_info;
+
+
+  bool old_format;
+  bool read_only;
+
+  librados::IoCtx md_ctx;
+  librados::IoCtx data_ctx;
+
+  RWLock owner_lock;
+  RWLock md_lock;
+  RWLock snap_lock;
+  RWLock parent_lock;
+  RWLock object_map_lock;
+  Mutex async_ops_lock;
+
+  uint64_t size;
+  uint64_t features;
+  std::string header_oid;
+  std::string id;
+  parent_info parent_md;
+
+  xlist<AsyncRequest<MockImageCtx>*> async_requests;
+  Cond async_requests_cond;
+
+  MockContextWQ *aio_work_queue;
+  MockContextWQ *op_work_queue;
+
+  MockImageWatcher *image_watcher;
+  MockObjectMap object_map;
+
+  int concurrent_management_ops;
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_IMAGE_CTX_H
diff --git a/src/test/librbd/mock/MockImageWatcher.h b/src/test/librbd/mock/MockImageWatcher.h
new file mode 100644
index 0000000..1c339bc
--- /dev/null
+++ b/src/test/librbd/mock/MockImageWatcher.h
@@ -0,0 +1,19 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_IMAGE_WATCHER_H
+#define CEPH_TEST_LIBRBD_MOCK_IMAGE_WATCHER_H
+
+#include "gmock/gmock.h"
+
+namespace librbd {
+
+struct MockImageWatcher {
+  MOCK_CONST_METHOD0(is_lock_owner, bool());
+  MOCK_CONST_METHOD1(is_lock_supported, bool(const RWLock &));
+  MOCK_METHOD1(assert_header_locked, void (librados::ObjectWriteOperation *));
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_IMAGE_WATCHER_H
diff --git a/src/test/librbd/mock/MockObjectMap.h b/src/test/librbd/mock/MockObjectMap.h
new file mode 100644
index 0000000..7f2f84b
--- /dev/null
+++ b/src/test/librbd/mock/MockObjectMap.h
@@ -0,0 +1,20 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_MOCK_OBJECT_MAP_H
+#define CEPH_TEST_LIBRBD_MOCK_OBJECT_MAP_H
+
+#include "gmock/gmock.h"
+
+namespace librbd {
+
+struct MockObjectMap {
+  MOCK_CONST_METHOD1(enabled, bool(const RWLock &object_map_lock));
+
+  MOCK_METHOD2(snapshot_add, void(uint64_t snap_id, Context *on_finish));
+  MOCK_METHOD2(snapshot_remove, void(uint64_t snap_id, Context *on_finish));
+};
+
+} // namespace librbd
+
+#endif // CEPH_TEST_LIBRBD_MOCK_OBJECT_MAP_H
diff --git a/src/test/librbd/test_ImageWatcher.cc b/src/test/librbd/test_ImageWatcher.cc
index adf087e..8813152 100644
--- a/src/test/librbd/test_ImageWatcher.cc
+++ b/src/test/librbd/test_ImageWatcher.cc
@@ -19,7 +19,6 @@
 #include "librbd/WatchNotifyTypes.h"
 #include "test/librados/test.h"
 #include "gtest/gtest.h"
-#include <boost/assign/list_of.hpp>
 #include <boost/assign/std/set.hpp>
 #include <boost/assign/std/map.hpp>
 #include <boost/bind.hpp>
@@ -232,6 +231,13 @@ public:
         *id = payload.async_request_id;
       }
       return true;
+    case NOTIFY_OP_REBUILD_OBJECT_MAP:
+      {
+        RebuildObjectMapPayload payload;
+        payload.decode(2, iter);
+        *id = payload.async_request_id;
+      }
+      return true;
     default:
       break;
     }
@@ -331,6 +337,20 @@ struct ResizeTask {
   }
 };
 
+struct RebuildObjectMapTask {
+  librbd::ImageCtx *ictx;
+  ProgressContext *progress_context;
+  int result;
+
+  RebuildObjectMapTask(librbd::ImageCtx *ictx_, ProgressContext *ctx)
+    : ictx(ictx_), progress_context(ctx), result(0) {}
+
+  void operator()() {
+    RWLock::RLocker l(ictx->owner_lock);
+    result = ictx->image_watcher->notify_rebuild_object_map(0, *progress_context);
+  }
+};
+
 TEST_F(TestImageWatcher, IsLockSupported) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
@@ -383,8 +403,7 @@ TEST_F(TestImageWatcher, TryLockNotifyAnnounceLocked) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
   ASSERT_EQ(0, register_image_watch(*ictx));
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_ACQUIRED_LOCK, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
 
   {
     RWLock::WLocker l(ictx->owner_lock);
@@ -474,8 +493,7 @@ TEST_F(TestImageWatcher, UnlockNotifyReleaseLock) {
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
   ASSERT_EQ(0, register_image_watch(*ictx));
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_ACQUIRED_LOCK, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
 
   {
     RWLock::WLocker l(ictx->owner_lock);
@@ -528,8 +546,7 @@ TEST_F(TestImageWatcher, RequestLock) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
 			  "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
 
   {
     RWLock::WLocker l(ictx->owner_lock);
@@ -549,9 +566,7 @@ TEST_F(TestImageWatcher, RequestLock) {
   ASSERT_EQ(0, unlock_image());
 
   m_notifies.clear();
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_RELEASED_LOCK, bufferlist()))(
-    std::make_pair(NOTIFY_OP_ACQUIRED_LOCK, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK,{}}, {NOTIFY_OP_ACQUIRED_LOCK,{}}};
 
   bufferlist bl;
   {
@@ -578,8 +593,7 @@ TEST_F(TestImageWatcher, RequestLockTimedOut) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
 			  "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_REQUEST_LOCK, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, {}}};
 
   m_expected_aio_restarts = 1;
   {
@@ -606,8 +620,7 @@ TEST_F(TestImageWatcher, RequestLockIgnored) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
 			  "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
 
   int orig_notify_timeout = ictx->cct->_conf->client_notify_timeout;
   ictx->cct->_conf->set_val("client_notify_timeout", "0");
@@ -645,8 +658,7 @@ TEST_F(TestImageWatcher, RequestLockTryLockRace) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
                           "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
 
   m_expected_aio_restarts = 1;
   {
@@ -662,8 +674,7 @@ TEST_F(TestImageWatcher, RequestLockTryLockRace) {
   ASSERT_EQ(expected_notify_ops, m_notifies);
 
   m_notifies.clear();
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_RELEASED_LOCK, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
 
   bufferlist bl;
   {
@@ -703,8 +714,7 @@ TEST_F(TestImageWatcher, RequestLockPostTryLockFailed) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
                           "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
 
   m_expected_aio_restarts = 1;
   {
@@ -723,8 +733,7 @@ TEST_F(TestImageWatcher, RequestLockPostTryLockFailed) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually 1234"));
 
   m_notifies.clear();
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_RELEASED_LOCK, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, bufferlist()}};
 
   bufferlist bl;
   {
@@ -744,8 +753,7 @@ TEST_F(TestImageWatcher, NotifyHeaderUpdate) {
 
   ASSERT_EQ(0, register_image_watch(*ictx));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_HEADER_UPDATE, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_HEADER_UPDATE, {}}};
   librbd::ImageWatcher::notify_header_update(m_ioctx, ictx->header_oid);
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
@@ -765,8 +773,7 @@ TEST_F(TestImageWatcher, NotifyFlatten) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
         "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_FLATTEN, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_FLATTEN, create_response_message(0)}};
 
   ProgressContext progress_context;
   FlattenTask flatten_task(ictx, &progress_context);
@@ -800,8 +807,7 @@ TEST_F(TestImageWatcher, NotifyResize) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
         "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_RESIZE, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_RESIZE, create_response_message(0)}};
 
   ProgressContext progress_context;
   ResizeTask resize_task(ictx, &progress_context);
@@ -825,6 +831,41 @@ TEST_F(TestImageWatcher, NotifyResize) {
   ASSERT_EQ(0, resize_task.result);
 }
 
+TEST_F(TestImageWatcher, NotifyRebuildObjectMap) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+        "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = {{NOTIFY_OP_REBUILD_OBJECT_MAP, create_response_message(0)}};
+
+  ProgressContext progress_context;
+  RebuildObjectMapTask rebuild_task(ictx, &progress_context);
+  boost::thread thread(boost::ref(rebuild_task));
+
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_REBUILD_OBJECT_MAP;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+
+  AsyncRequestId async_request_id;
+  ASSERT_TRUE(extract_async_request_id(NOTIFY_OP_REBUILD_OBJECT_MAP,
+                                       &async_request_id));
+
+  ASSERT_EQ(0, notify_async_progress(ictx, async_request_id, 10, 20));
+  ASSERT_TRUE(progress_context.wait(ictx, 10, 20));
+
+  ASSERT_EQ(0, notify_async_complete(ictx, async_request_id, 0));
+
+  ASSERT_TRUE(thread.timed_join(boost::posix_time::seconds(10)));
+  ASSERT_EQ(0, rebuild_task.result);
+}
+
 TEST_F(TestImageWatcher, NotifySnapCreate) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
@@ -835,8 +876,7 @@ TEST_F(TestImageWatcher, NotifySnapCreate) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
         "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_SNAP_CREATE, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_SNAP_CREATE, create_response_message(0)}};
 
   RWLock::RLocker l(ictx->owner_lock);
   ASSERT_EQ(0, ictx->image_watcher->notify_snap_create("snap"));
@@ -856,8 +896,7 @@ TEST_F(TestImageWatcher, NotifySnapCreateError) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
         "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_SNAP_CREATE, create_response_message(-EEXIST)));
+  m_notify_acks = {{NOTIFY_OP_SNAP_CREATE, create_response_message(-EEXIST)}};
 
   RWLock::RLocker l(ictx->owner_lock);
   ASSERT_EQ(-EEXIST, ictx->image_watcher->notify_snap_create("snap"));
@@ -867,6 +906,26 @@ TEST_F(TestImageWatcher, NotifySnapCreateError) {
   ASSERT_EQ(expected_notify_ops, m_notifies);
 }
 
+TEST_F(TestImageWatcher, NotifySnapRemove) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+        "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = {{NOTIFY_OP_SNAP_REMOVE, create_response_message(0)}};
+
+  RWLock::RLocker l(ictx->owner_lock);
+  ASSERT_EQ(0, ictx->image_watcher->notify_snap_remove("snap"));
+
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_SNAP_REMOVE;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+}
+
 TEST_F(TestImageWatcher, NotifyAsyncTimedOut) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
@@ -877,8 +936,7 @@ TEST_F(TestImageWatcher, NotifyAsyncTimedOut) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
         "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_FLATTEN, bufferlist()));
+  m_notify_acks = {{NOTIFY_OP_FLATTEN, {}}};
 
   ProgressContext progress_context;
   FlattenTask flatten_task(ictx, &progress_context);
@@ -898,8 +956,7 @@ TEST_F(TestImageWatcher, NotifyAsyncError) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
         "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_FLATTEN, create_response_message(-EIO)));
+  m_notify_acks = {{NOTIFY_OP_FLATTEN, create_response_message(-EIO)}};
 
   ProgressContext progress_context;
   FlattenTask flatten_task(ictx, &progress_context);
@@ -919,8 +976,7 @@ TEST_F(TestImageWatcher, NotifyAsyncCompleteError) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
         "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_FLATTEN, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_FLATTEN, create_response_message(0)}};
 
   ProgressContext progress_context;
   FlattenTask flatten_task(ictx, &progress_context);
@@ -947,21 +1003,13 @@ TEST_F(TestImageWatcher, NotifyAsyncRequestTimedOut) {
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
-  md_config_t *conf = ictx->cct->_conf;
-  int timed_out_seconds = conf->rbd_request_timed_out_seconds;
-  conf->set_val("rbd_request_timed_out_seconds", "0");
-  BOOST_SCOPE_EXIT( (timed_out_seconds)(conf) ) {
-    conf->set_val("rbd_request_timed_out_seconds",
-                  stringify(timed_out_seconds).c_str());
-  } BOOST_SCOPE_EXIT_END;
-  ASSERT_EQ(0, conf->rbd_request_timed_out_seconds);
+  ictx->request_timed_out_seconds = 0;
 
   ASSERT_EQ(0, register_image_watch(*ictx));
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
 			  "auto " + stringify(m_watch_ctx->get_handle())));
 
-  m_notify_acks = boost::assign::list_of(
-    std::make_pair(NOTIFY_OP_FLATTEN, create_response_message(0)));
+  m_notify_acks = {{NOTIFY_OP_FLATTEN, create_response_message(0)}};
 
   ProgressContext progress_context;
   FlattenTask flatten_task(ictx, &progress_context);
diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc
index 4aef7ae..805a86c 100644
--- a/src/test/librbd/test_internal.cc
+++ b/src/test/librbd/test_internal.cc
@@ -5,7 +5,9 @@
 #include "librbd/AioCompletion.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
+#include "librbd/ObjectMap.h"
 #include <boost/scope_exit.hpp>
+#include <boost/assign/list_of.hpp>
 #include <utility>
 #include <vector>
 
@@ -20,6 +22,7 @@ public:
   typedef std::vector<std::pair<std::string, bool> > Snaps;
 
   virtual void TearDown() {
+    unlock_image();
     for (Snaps::iterator iter = m_snaps.begin(); iter != m_snaps.end(); ++iter) {
       librbd::ImageCtx *ictx;
       EXPECT_EQ(0, open_image(m_image_name, &ictx));
@@ -366,6 +369,285 @@ TEST_F(TestInternal, MultipleResize) {
   ASSERT_EQ(0U, size);
 }
 
+TEST_F(TestInternal, Metadata) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  map<string, bool> test_confs = boost::assign::map_list_of(
+    "aaaaaaa", false)(
+    "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", false)(
+    "cccccccccccccc", false);
+  map<string, bool>::iterator it = test_confs.begin();
+  int r;
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  r = librbd::metadata_set(ictx, it->first, "value1");
+  ASSERT_EQ(0, r);
+  ++it;
+  r = librbd::metadata_set(ictx, it->first, "value2");
+  ASSERT_EQ(0, r);
+  ++it;
+  r = librbd::metadata_set(ictx, it->first, "value3");
+  ASSERT_EQ(0, r);
+  r = librbd::metadata_set(ictx, "abcd", "value4");
+  ASSERT_EQ(0, r);
+  r = librbd::metadata_set(ictx, "xyz", "value5");
+  ASSERT_EQ(0, r);
+  map<string, bufferlist> pairs;
+  r = librbd::metadata_list(ictx, "", 0, &pairs);
+  ASSERT_EQ(0, r);
+  ASSERT_EQ(5u, pairs.size());
+  r = librbd::metadata_remove(ictx, "abcd");
+  ASSERT_EQ(0, r);
+  r = librbd::metadata_remove(ictx, "xyz");
+  ASSERT_EQ(0, r);
+  pairs.clear();
+  r = librbd::metadata_list(ictx, "", 0, &pairs);
+  ASSERT_EQ(0, r);
+  ASSERT_EQ(3u, pairs.size());
+  string val;
+  r = librbd::metadata_get(ictx, it->first, &val);
+  ASSERT_EQ(0, r);
+  ASSERT_STREQ(val.c_str(), "value3");
+}
+
+TEST_F(TestInternal, MetadataFilter) {
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  map<string, bool> test_confs = boost::assign::map_list_of(
+    "aaaaaaa", false)(
+    "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", false)(
+    "cccccccccccccc", false);
+  map<string, bool>::iterator it = test_confs.begin();
+  const string prefix = "test_config_";
+  bool is_continue;
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  librbd::Image image1;
+  map<string, bufferlist> pairs, res;
+  pairs["abc"].append("value");
+  pairs["abcabc"].append("value");
+  pairs[prefix+it->first].append("value1");
+  ++it;
+  pairs[prefix+it->first].append("value2");
+  ++it;
+  pairs[prefix+it->first].append("value3");
+  pairs[prefix+"asdfsdaf"].append("value6");
+  pairs[prefix+"zxvzxcv123"].append("value5");
+
+  is_continue = ictx->_filter_metadata_confs(prefix, test_confs, pairs, &res);
+  ASSERT_TRUE(is_continue);
+  ASSERT_TRUE(res.size() == 3U);
+  it = test_confs.begin();
+  ASSERT_TRUE(res.count(it->first));
+  ASSERT_TRUE(it->second);
+  ++it;
+  ASSERT_TRUE(res.count(it->first));
+  ASSERT_TRUE(it->second);
+  ++it;
+  ASSERT_TRUE(res.count(it->first));
+  ASSERT_TRUE(it->second);
+  res.clear();
+
+  pairs["zzzzzzzz"].append("value7");
+  is_continue = ictx->_filter_metadata_confs(prefix, test_confs, pairs, &res);
+  ASSERT_FALSE(is_continue);
+  ASSERT_TRUE(res.size() == 3U);
+}
+
+TEST_F(TestInternal, SnapshotCopyup)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  bufferlist bl;
+  bl.append(std::string(256, '1'));
+  ASSERT_EQ(256, librbd::write(ictx, 0, bl.length(), bl.c_str(), 0));
+
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, librbd::snap_protect(ictx, "snap1"));
+
+  uint64_t features;
+  ASSERT_EQ(0, librbd::get_features(ictx, &features));
+
+  std::string clone_name = get_temp_image_name();
+  int order = ictx->order;
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+			     clone_name.c_str(), features, &order, 0, 0));
+
+  librbd::ImageCtx *ictx2;
+  ASSERT_EQ(0, open_image(clone_name, &ictx2));
+
+  ASSERT_EQ(0, librbd::snap_create(ictx2, "snap1"));
+  ASSERT_EQ(0, librbd::snap_create(ictx2, "snap2"));
+
+  ASSERT_EQ(256, librbd::write(ictx2, 256, bl.length(), bl.c_str(), 0));
+
+  librados::IoCtx snap_ctx;
+  snap_ctx.dup(m_ioctx);
+  snap_ctx.snap_set_read(CEPH_SNAPDIR);
+
+  librados::snap_set_t snap_set;
+  ASSERT_EQ(0, snap_ctx.list_snaps(ictx2->get_object_name(0), &snap_set));
+
+  std::vector< std::pair<uint64_t,uint64_t> > expected_overlap =
+    boost::assign::list_of(
+      std::make_pair(0, 256))(
+      std::make_pair(512, 2096640));
+  ASSERT_EQ(2U, snap_set.clones.size());
+  ASSERT_NE(CEPH_NOSNAP, snap_set.clones[0].cloneid);
+  ASSERT_EQ(2U, snap_set.clones[0].snaps.size());
+  ASSERT_EQ(expected_overlap, snap_set.clones[0].overlap);
+  ASSERT_EQ(CEPH_NOSNAP, snap_set.clones[1].cloneid);
+
+  bufferptr read_ptr(256);
+  bufferlist read_bl;
+  read_bl.push_back(read_ptr);
+
+  std::list<std::string> snaps = boost::assign::list_of(
+    "snap1")("snap2")("");
+  for (std::list<std::string>::iterator it = snaps.begin();
+       it != snaps.end(); ++it) {
+    const char *snap_name = it->empty() ? NULL : it->c_str();
+    ASSERT_EQ(0, librbd::snap_set(ictx2, snap_name));
+
+    ASSERT_EQ(256, librbd::read(ictx2, 0, 256, read_bl.c_str(), 0));
+    ASSERT_TRUE(bl.contents_equal(read_bl));
+
+    ASSERT_EQ(256, librbd::read(ictx2, 256, 256, read_bl.c_str(), 0));
+    if (snap_name == NULL) {
+      ASSERT_TRUE(bl.contents_equal(read_bl));
+    } else {
+      ASSERT_TRUE(read_bl.is_zero());
+    }
+
+    // verify the object map was properly updated
+    if ((ictx2->features & RBD_FEATURE_OBJECT_MAP) != 0) {
+      uint8_t state = OBJECT_EXISTS;
+      if ((ictx2->features & RBD_FEATURE_FAST_DIFF) != 0 &&
+          it != snaps.begin() && snap_name != NULL) {
+        state = OBJECT_EXISTS_CLEAN;
+      }
+      RWLock::WLocker object_map_locker(ictx2->object_map_lock);
+      ASSERT_EQ(state, ictx2->object_map[0]);
+    }
+  }
+}
+
+TEST_F(TestInternal, ResizeCopyup)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  m_image_name = get_temp_image_name();
+  m_image_size = 1 << 14;
+
+  uint64_t features = 0;
+  get_features(&features);
+  int order = 12;
+  ASSERT_EQ(0, m_rbd.create2(m_ioctx, m_image_name.c_str(), m_image_size,
+                             features, &order));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  bufferlist bl;
+  bl.append(std::string(4096, '1'));
+  for (size_t i = 0; i < m_image_size; i += bl.length()) {
+    ASSERT_EQ(bl.length(), librbd::write(ictx, i, bl.length(), bl.c_str(), 0));
+  }
+
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, librbd::snap_protect(ictx, "snap1"));
+
+  std::string clone_name = get_temp_image_name();
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+			     clone_name.c_str(), features, &order, 0, 0));
+
+  librbd::ImageCtx *ictx2;
+  ASSERT_EQ(0, open_image(clone_name, &ictx2));
+
+  ASSERT_EQ(0, librbd::snap_create(ictx2, "snap1"));
+
+  bufferptr read_ptr(bl.length());
+  bufferlist read_bl;
+  read_bl.push_back(read_ptr);
+
+  // verify full / partial object removal properly copyup
+  librbd::NoOpProgressContext no_op;
+  ASSERT_EQ(0, librbd::resize(ictx2, m_image_size - (1 << order) - 32, no_op));
+  ASSERT_EQ(0, librbd::snap_set(ictx2, "snap1"));
+
+  {
+    // hide the parent from the snapshot
+    RWLock::WLocker snap_locker(ictx2->snap_lock);
+    ictx2->snap_info.begin()->second.parent = librbd::parent_info();
+  }
+
+  for (size_t i = 2 << order; i < m_image_size; i += bl.length()) {
+    ASSERT_EQ(bl.length(), librbd::read(ictx2, i, bl.length(), read_bl.c_str(),
+                                        0));
+    ASSERT_TRUE(bl.contents_equal(read_bl));
+  }
+}
+
+TEST_F(TestInternal, DiscardCopyup)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  m_image_name = get_temp_image_name();
+  m_image_size = 1 << 14;
+
+  uint64_t features = 0;
+  get_features(&features);
+  int order = 12;
+  ASSERT_EQ(0, m_rbd.create2(m_ioctx, m_image_name.c_str(), m_image_size,
+                             features, &order));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  bufferlist bl;
+  bl.append(std::string(4096, '1'));
+  for (size_t i = 0; i < m_image_size; i += bl.length()) {
+    ASSERT_EQ(bl.length(), librbd::write(ictx, i, bl.length(), bl.c_str(), 0));
+  }
+
+  ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
+  ASSERT_EQ(0, librbd::snap_protect(ictx, "snap1"));
+
+  std::string clone_name = get_temp_image_name();
+  ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
+			     clone_name.c_str(), features, &order, 0, 0));
+
+  librbd::ImageCtx *ictx2;
+  ASSERT_EQ(0, open_image(clone_name, &ictx2));
+
+  ASSERT_EQ(0, librbd::snap_create(ictx2, "snap1"));
+
+  bufferptr read_ptr(bl.length());
+  bufferlist read_bl;
+  read_bl.push_back(read_ptr);
+
+  ASSERT_EQ(static_cast<int>(m_image_size - 64),
+            librbd::discard(ictx2, 32, m_image_size - 64));
+  ASSERT_EQ(0, librbd::snap_set(ictx2, "snap1"));
+
+  {
+    // hide the parent from the snapshot
+    RWLock::WLocker snap_locker(ictx2->snap_lock);
+    ictx2->snap_info.begin()->second.parent = librbd::parent_info();
+  }
+
+  for (size_t i = 0; i < m_image_size; i += bl.length()) {
+    ASSERT_EQ(bl.length(), librbd::read(ictx2, i, bl.length(), read_bl.c_str(),
+                                        0));
+    ASSERT_TRUE(bl.contents_equal(read_bl));
+  }
+}
+
 TEST_F(TestInternal, ShrinkFlushesCache) {
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index f48f081..a6bf251 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -41,6 +41,7 @@
 #include "include/interval_set.h"
 #include "include/stringify.h"
 
+#include <boost/assign/list_of.hpp>
 #include <boost/scope_exit.hpp>
 
 using namespace std;
@@ -344,7 +345,7 @@ int test_ls(rados_ioctx_t io_ctx, size_t num_expected, ...)
     printf("expected = %s\n", expected);
     std::set<std::string>::iterator it = image_names.find(expected);
     if (it != image_names.end()) {
-      printf("found %s\n", cur_name);
+      printf("found %s\n", expected);
       image_names.erase(it);
     } else {
       ADD_FAILURE() << "Unable to find image " << expected;
@@ -1473,8 +1474,8 @@ TEST_F(TestLibRBD, TestCoR)
   // make a parent to clone from
   ASSERT_EQ(0, create_image_full(ioctx, "parent", image_size, &order, false, features));
   ASSERT_EQ(0, rbd_open(ioctx, "parent", &parent, NULL));
-  printf("made parent image \"parent\": %ldK (%d * %dK)\n", 
-         image_size, object_num, object_size/1024);
+  printf("made parent image \"parent\": %ldK (%d * %dK)\n",
+         (unsigned long)image_size, object_num, object_size/1024);
 
   // write something into parent
   char test_data[TEST_IO_SIZE + 1];
@@ -1500,18 +1501,19 @@ TEST_F(TestLibRBD, TestCoR)
   printf("generated random write map:\n");
   for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
        itr != write_tracker.end(); ++itr)
-    printf("\t [%-8ld, %-8ld]\n", itr->first, itr->second);
+    printf("\t [%-8ld, %-8ld]\n",
+	   (unsigned long)itr->first, (unsigned long)itr->second);
 
   printf("write data based on random map\n");
   for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
        itr != write_tracker.end(); ++itr) {
-    printf("\twrite object-%-4ld\t", itr->first);
+    printf("\twrite object-%-4ld\t", (unsigned long)itr->first);
     ASSERT_PASSED(write_test_data, parent, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
   }
 
   for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
          itr != write_tracker.end(); ++itr) {
-    printf("\tread object-%-4ld\t", itr->first);
+    printf("\tread object-%-4ld\t", (unsigned long)itr->first);
     ASSERT_PASSED(read_test_data, parent, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
   }
 
@@ -1545,20 +1547,20 @@ TEST_F(TestLibRBD, TestCoR)
   printf("read from \"child\"\n");
   {
     map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
-    printf("\tread object-%-4ld\t", itr->first);
+    printf("\tread object-%-4ld\t", (unsigned long)itr->first);
     ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
   }
 
   for (map<uint64_t, uint64_t>::iterator itr = write_tracker.begin();
        itr != write_tracker.end(); ++itr) {
-    printf("\tread object-%-4ld\t", itr->first);
+    printf("\tread object-%-4ld\t", (unsigned long)itr->first);
     ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
   }
 
   printf("read again reversely\n");
   for (map<uint64_t, uint64_t>::iterator itr = --write_tracker.end();
      itr != write_tracker.begin(); --itr) {
-    printf("\tread object-%-4ld\t", itr->first);
+    printf("\tread object-%-4ld\t", (unsigned long)itr->first);
     ASSERT_PASSED(read_test_data, child, test_data, itr->first * object_size + itr->second, TEST_IO_SIZE, 0);
   }
 
@@ -1958,7 +1960,7 @@ TEST_F(TestLibRBD, FlushAioPP)
     int order = 0;
     std::string name = get_temp_image_name();
     uint64_t size = 2 << 20;
-    size_t num_aios = 256;
+    const size_t num_aios = 256;
 
     ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
     ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
@@ -2007,21 +2009,29 @@ int iterate_cb(uint64_t off, size_t len, int exists, void *arg)
   return 0;
 }
 
-void scribble(librbd::Image& image, int n, int max, interval_set<uint64_t> *exists, interval_set<uint64_t> *what)
+static int iterate_error_cb(uint64_t off, size_t len, int exists, void *arg)
+{
+  return -EINVAL;
+}
+
+void scribble(librbd::Image& image, int n, int max,
+              interval_set<uint64_t> *exists,
+              interval_set<uint64_t> *what)
 {
   uint64_t size;
   image.size(&size);
   interval_set<uint64_t> exists_at_start = *exists;
+
   for (int i=0; i<n; i++) {
     uint64_t off = rand() % (size - max + 1);
     uint64_t len = 1 + rand() % max;
     if (rand() % 4 == 0) {
       ASSERT_EQ((int)len, image.discard(off, len));
-      interval_set<uint64_t> w;      
+      interval_set<uint64_t> w;
       w.insert(off, len);
 
       // the zeroed bit no longer exists...
-      w.intersection_of(*exists); 
+      w.intersection_of(*exists);
       exists->subtract(w);
 
       // the bits we discarded are no long written...
@@ -2048,38 +2058,83 @@ void scribble(librbd::Image& image, int n, int max, interval_set<uint64_t> *exis
   }
 }
 
-TEST_F(TestLibRBD, DiffIterate)
+interval_set<uint64_t> round_diff_interval(const interval_set<uint64_t>& diff,
+                                           uint64_t object_size)
+{
+  if (object_size == 0) {
+    return diff;
+  }
+
+  interval_set<uint64_t> rounded_diff;
+  for (interval_set<uint64_t>::const_iterator it = diff.begin();
+       it != diff.end(); ++it) {
+    uint64_t off = it.get_start();
+    uint64_t len = it.get_len();
+    off -= off % object_size;
+    len += (object_size - (len % object_size));
+    interval_set<uint64_t> interval;
+    interval.insert(off, len);
+    rounded_diff.union_of(interval);
+  }
+  return rounded_diff;
+}
+
+template <typename T>
+class DiffIterateTest : public TestLibRBD {
+public:
+  static const uint8_t whole_object = T::whole_object;
+};
+
+template <bool _whole_object>
+class DiffIterateParams {
+public:
+  static const uint8_t whole_object = _whole_object;
+};
+
+typedef ::testing::Types<DiffIterateParams<false>,
+                         DiffIterateParams<true> > DiffIterateTypes;
+TYPED_TEST_CASE(DiffIterateTest, DiffIterateTypes);
+
+TYPED_TEST(DiffIterateTest, DiffIterate)
 {
   librados::IoCtx ioctx;
-  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+  ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
 
   {
     librbd::RBD rbd;
     librbd::Image image;
     int order = 0;
-    std::string name = get_temp_image_name();
+    std::string name = this->get_temp_image_name();
     uint64_t size = 20 << 20;
 
     ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
     ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
 
+    uint64_t object_size = 0;
+    if (this->whole_object) {
+      object_size = 1 << order;
+    }
+
     interval_set<uint64_t> exists;
     interval_set<uint64_t> one, two;
     scribble(image, 10, 102400, &exists, &one);
     cout << " wrote " << one << std::endl;
     ASSERT_EQ(0, image.snap_create("one"));
     scribble(image, 10, 102400, &exists, &two);
+
+    two = round_diff_interval(two, object_size);
     cout << " wrote " << two << std::endl;
 
     interval_set<uint64_t> diff;
-    ASSERT_EQ(0, image.diff_iterate("one", 0, size, iterate_cb, (void *)&diff));
+    ASSERT_EQ(0, image.diff_iterate2("one", 0, size, true, this->whole_object,
+                                     iterate_cb, (void *)&diff));
     cout << " diff was " << diff << std::endl;
     if (!two.subset_of(diff)) {
       interval_set<uint64_t> i;
       i.intersection_of(two, diff);
       interval_set<uint64_t> l = two;
       l.subtract(i);
-      cout << " ... two - (two*diff) = " << l << std::endl;     
+      cout << " ... two - (two*diff) = " << l << std::endl;
     }
     ASSERT_TRUE(two.subset_of(diff));
   }
@@ -2087,8 +2142,15 @@ TEST_F(TestLibRBD, DiffIterate)
 }
 
 struct diff_extent {
-  diff_extent(uint64_t offset, uint64_t length, bool exists) :
-    offset(offset), length(length), exists(exists) {}
+  diff_extent(uint64_t _offset, uint64_t _length, bool _exists,
+              uint64_t object_size) :
+    offset(_offset), length(_length), exists(_exists)
+  {
+    if (object_size != 0) {
+      offset -= offset % object_size;
+      length = object_size;
+    }
+  }
   uint64_t offset;
   uint64_t length;
   bool exists;
@@ -2105,64 +2167,68 @@ int vector_iterate_cb(uint64_t off, size_t len, int exists, void *arg)
 {
   cout << "iterate_cb " << off << "~" << len << std::endl;
   vector<diff_extent> *diff = static_cast<vector<diff_extent> *>(arg);
-  diff->push_back(diff_extent(off, len, exists));
+  diff->push_back(diff_extent(off, len, exists, 0));
   return 0;
 }
 
-TEST_F(TestLibRBD, DiffIterateDiscard)
+TYPED_TEST(DiffIterateTest, DiffIterateDiscard)
 {
   librados::IoCtx ioctx;
-  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+  ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
 
   librbd::RBD rbd;
   librbd::Image image;
   int order = 0;
-  std::string name = get_temp_image_name();
+  std::string name = this->get_temp_image_name();
   uint64_t size = 20 << 20;
 
   ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
   ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
 
+  uint64_t object_size = 0;
+  if (this->whole_object) {
+    object_size = 1 << order;
+  }
   vector<diff_extent> extents;
   ceph::bufferlist bl;
 
-  ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(0u, extents.size());
 
   char data[256];
   memset(data, 1, sizeof(data));
   bl.append(data, 256);
   ASSERT_EQ(256, image.write(0, 256, bl));
-  ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(1u, extents.size());
-  ASSERT_EQ(diff_extent(0, 256, true), extents[0]);
+  ASSERT_EQ(diff_extent(0, 256, true, object_size), extents[0]);
 
   int obj_ofs = 256;
-  ASSERT_EQ(obj_ofs, image.discard(0, obj_ofs));
+  ASSERT_EQ(1 << order, image.discard(0, 1 << order));
 
   extents.clear();
-  ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(0u, extents.size());
 
   ASSERT_EQ(0, image.snap_create("snap1"));
   ASSERT_EQ(256, image.write(0, 256, bl));
-  ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(1u, extents.size());
-  ASSERT_EQ(diff_extent(0, 256, true), extents[0]);
+  ASSERT_EQ(diff_extent(0, 256, true, object_size), extents[0]);
   ASSERT_EQ(0, image.snap_create("snap2"));
 
   ASSERT_EQ(obj_ofs, image.discard(0, obj_ofs));
 
   extents.clear();
   ASSERT_EQ(0, image.snap_set("snap2"));
-  ASSERT_EQ(0, image.diff_iterate("snap1", 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2("snap1", 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(1u, extents.size());
-  ASSERT_EQ(diff_extent(0, 256, true), extents[0]);
+  ASSERT_EQ(diff_extent(0, 256, true, object_size), extents[0]);
 
   ASSERT_EQ(0, image.snap_set(NULL));
   ASSERT_EQ(1 << order, image.discard(0, 1 << order));
@@ -2170,27 +2236,32 @@ TEST_F(TestLibRBD, DiffIterateDiscard)
   ASSERT_EQ(0, image.snap_set("snap3"));
 
   extents.clear();
-  ASSERT_EQ(0, image.diff_iterate("snap1", 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2("snap1", 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(1u, extents.size());
-  ASSERT_EQ(diff_extent(0, 256, false), extents[0]);
-  ASSERT_PASSED(validate_object_map, image);
+  ASSERT_EQ(diff_extent(0, 256, false, object_size), extents[0]);
+  ASSERT_PASSED(this->validate_object_map, image);
 }
 
-TEST_F(TestLibRBD, DiffIterateStress)
+TYPED_TEST(DiffIterateTest, DiffIterateStress)
 {
   librados::IoCtx ioctx;
-  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+  ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
 
   librbd::RBD rbd;
   librbd::Image image;
   int order = 0;
-  std::string name = get_temp_image_name();
+  std::string name = this->get_temp_image_name();
   uint64_t size = 400 << 20;
 
   ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
   ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
 
+  uint64_t object_size = 0;
+  if (this->whole_object) {
+    object_size = 1 << order;
+  }
+
   interval_set<uint64_t> curexists;
   vector<interval_set<uint64_t> > wrote;
   vector<interval_set<uint64_t> > exists;
@@ -2207,54 +2278,67 @@ TEST_F(TestLibRBD, DiffIterateStress)
     snap.push_back(s);
   }
 
-  for (int i=0; i<n-1; i++) {
-    for (int j=i+1; j<n; j++) {
-      interval_set<uint64_t> diff, actual, uex;
-      for (int k=i+1; k<=j; k++)
-        diff.union_of(wrote[k]);
-      cout << "from " << i << " to " << j << " diff " << diff << std::endl;
-
-      // limit to extents that exists both at the beginning and at the end
-      uex.union_of(exists[i], exists[j]);
-      diff.intersection_of(uex);
-      cout << "  limited diff " << diff << std::endl;
-
-      image.snap_set(snap[j].c_str());
-      ASSERT_EQ(0, image.diff_iterate(snap[i].c_str(), 0, size, iterate_cb, (void *)&actual));
-      cout << " actual was " << actual << std::endl;
-      if (!diff.subset_of(actual)) {
-        interval_set<uint64_t> i;
-        i.intersection_of(diff, actual);
-        interval_set<uint64_t> l = diff;
-        l.subtract(i);
-        cout << " ... diff - (actual*diff) = " << l << std::endl;     
+  for (int h=0; h<n-1; h++) {
+    for (int i=0; i<n-h-1; i++) {
+      for (int j=(h==0 ? i+1 : n-1); j<n; j++) {
+        interval_set<uint64_t> diff, actual, uex;
+        for (int k=i+1; k<=j; k++)
+          diff.union_of(wrote[k]);
+        cout << "from " << i << " to "
+             << (h != 0 ? string("HEAD") : stringify(j)) << " diff "
+             << round_diff_interval(diff, object_size) << std::endl;
+
+        // limit to extents that exists both at the beginning and at the end
+        uex.union_of(exists[i], exists[j]);
+        diff.intersection_of(uex);
+        diff = round_diff_interval(diff, object_size);
+        cout << " limited diff " << diff << std::endl;
+
+        ASSERT_EQ(0, image.snap_set(h==0 ? snap[j].c_str() : NULL));
+        ASSERT_EQ(0, image.diff_iterate2(snap[i].c_str(), 0, size, true,
+                                         this->whole_object, iterate_cb,
+                                         (void *)&actual));
+        cout << " actual was " << actual << std::endl;
+        if (!diff.subset_of(actual)) {
+          interval_set<uint64_t> i;
+          i.intersection_of(diff, actual);
+          interval_set<uint64_t> l = diff;
+          l.subtract(i);
+          cout << " ... diff - (actual*diff) = " << l << std::endl;
+        }
+        ASSERT_TRUE(diff.subset_of(actual));
       }
-      ASSERT_TRUE(diff.subset_of(actual));
     }
+    ASSERT_EQ(0, image.snap_set(NULL));
+    ASSERT_EQ(0, image.snap_remove(snap[n-h-1].c_str()));
   }
 
-  ASSERT_PASSED(validate_object_map, image);
+  ASSERT_PASSED(this->validate_object_map, image);
 }
 
-TEST_F(TestLibRBD, DiffIterateRegression6926)
+TYPED_TEST(DiffIterateTest, DiffIterateRegression6926)
 {
   librados::IoCtx ioctx;
-  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+  ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
 
   librbd::RBD rbd;
   librbd::Image image;
   int order = 0;
-  std::string name = get_temp_image_name();
+  std::string name = this->get_temp_image_name();
   uint64_t size = 20 << 20;
 
   ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
   ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
 
+  uint64_t object_size = 0;
+  if (this->whole_object) {
+    object_size = 1 << order;
+  }
   vector<diff_extent> extents;
   ceph::bufferlist bl;
 
-  ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(0u, extents.size());
 
   ASSERT_EQ(0, image.snap_create("snap1"));
@@ -2264,18 +2348,144 @@ TEST_F(TestLibRBD, DiffIterateRegression6926)
   ASSERT_EQ(256, image.write(0, 256, bl));
 
   extents.clear();
-  ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(1u, extents.size());
-  ASSERT_EQ(diff_extent(0, 256, true), extents[0]);
+  ASSERT_EQ(diff_extent(0, 256, true, object_size), extents[0]);
 
   ASSERT_EQ(0, image.snap_set("snap1"));
   extents.clear();
-  ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
-      			    vector_iterate_cb, (void *) &extents));
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+      			           vector_iterate_cb, (void *) &extents));
   ASSERT_EQ(static_cast<size_t>(0), extents.size());
 }
 
+TYPED_TEST(DiffIterateTest, DiffIterateIgnoreParent)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  librbd::Image image;
+  std::string name = this->get_temp_image_name();
+  uint64_t size = 20 << 20;
+  int order = 0;
+
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+  ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+  uint64_t object_size = 0;
+  if (this->whole_object) {
+    object_size = 1 << order;
+  }
+
+  bufferlist bl;
+  bl.append(buffer::create(size));
+  bl.zero();
+  interval_set<uint64_t> one;
+  one.insert(0, size);
+  ASSERT_EQ((int)size, image.write(0, size, bl));
+  ASSERT_EQ(0, image.snap_create("one"));
+  ASSERT_EQ(0, image.snap_protect("one"));
+
+  std::string clone_name = this->get_temp_image_name();
+  ASSERT_EQ(0, rbd.clone(ioctx, name.c_str(), "one", ioctx, clone_name.c_str(),
+                         RBD_FEATURE_LAYERING, &order));
+  ASSERT_EQ(0, rbd.open(ioctx, image, clone_name.c_str(), NULL));
+
+  interval_set<uint64_t> exists;
+  interval_set<uint64_t> two;
+  scribble(image, 10, 102400, &exists, &two);
+  two = round_diff_interval(two, object_size);
+  cout << " wrote " << two << " to clone" << std::endl;
+
+  interval_set<uint64_t> diff;
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, false, this->whole_object,
+                                   iterate_cb, (void *)&diff));
+  cout << " diff was " << diff << std::endl;
+  if (!this->whole_object) {
+    ASSERT_FALSE(one.subset_of(diff));
+  }
+  ASSERT_TRUE(two.subset_of(diff));
+}
+
+TYPED_TEST(DiffIterateTest, DiffIterateCallbackError)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
+
+  {
+    librbd::RBD rbd;
+    librbd::Image image;
+    int order = 0;
+    std::string name = this->get_temp_image_name();
+    uint64_t size = 20 << 20;
+
+    ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+    ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+    interval_set<uint64_t> exists;
+    interval_set<uint64_t> one;
+    scribble(image, 10, 102400, &exists, &one);
+    cout << " wrote " << one << std::endl;
+
+    interval_set<uint64_t> diff;
+    ASSERT_EQ(-EINVAL, image.diff_iterate2(NULL, 0, size, true,
+                                           this->whole_object,
+                                           iterate_error_cb, NULL));
+  }
+  ioctx.close();
+}
+
+TYPED_TEST(DiffIterateTest, DiffIterateParentDiscard)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, this->_rados.ioctx_create(this->m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  librbd::Image image;
+  std::string name = this->get_temp_image_name();
+  uint64_t size = 20 << 20;
+  int order = 0;
+
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+  ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+  uint64_t object_size = 0;
+  if (this->whole_object) {
+    object_size = 1 << order;
+  }
+
+  interval_set<uint64_t> exists;
+  interval_set<uint64_t> one;
+  scribble(image, 10, 102400, &exists, &one);
+  ASSERT_EQ(0, image.snap_create("one"));
+
+  ASSERT_EQ(1 << order, image.discard(0, 1 << order));
+  ASSERT_EQ(0, image.snap_create("two"));
+  ASSERT_EQ(0, image.snap_protect("two"));
+  exists.clear();
+  one.clear();
+
+  std::string clone_name = this->get_temp_image_name();
+  ASSERT_EQ(0, rbd.clone(ioctx, name.c_str(), "two", ioctx,
+                         clone_name.c_str(), RBD_FEATURE_LAYERING, &order));
+  ASSERT_EQ(0, rbd.open(ioctx, image, clone_name.c_str(), NULL));
+
+  interval_set<uint64_t> two;
+  scribble(image, 10, 102400, &exists, &two);
+  two = round_diff_interval(two, object_size);
+
+  interval_set<uint64_t> diff;
+  ASSERT_EQ(0, image.diff_iterate2(NULL, 0, size, true, this->whole_object,
+                                   iterate_cb, (void *)&diff));
+  ASSERT_TRUE(two.subset_of(diff));
+}
+
 TEST_F(TestLibRBD, ZeroLengthWrite)
 {
   rados_ioctx_t ioctx;
@@ -2360,7 +2570,7 @@ TEST_F(TestLibRBD, LargeCacheRead)
   rados_ioctx_t ioctx;
   rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
 
-  uint32_t new_cache_size = 16777216;
+  uint32_t new_cache_size = 1 << 20;
   std::string orig_cache_size;
   ASSERT_EQ(0, _rados.conf_get("rbd_cache_size", orig_cache_size));
   ASSERT_EQ(0, _rados.conf_set("rbd_cache_size",
@@ -2372,24 +2582,22 @@ TEST_F(TestLibRBD, LargeCacheRead)
   } BOOST_SCOPE_EXIT_END;
 
   rbd_image_t image;
-  int order = 0;
-  const char *name = "testimg";
-  uint64_t size = new_cache_size + 1;
+  int order = 21;
+  std::string name = get_temp_image_name();
+  uint64_t size = 1 << order;
 
-  ASSERT_EQ(0, create_image(ioctx, name, size, &order));
-  ASSERT_EQ(0, rbd_open(ioctx, name, &image, NULL));
+  ASSERT_EQ(0, create_image(ioctx, name.c_str(), size, &order));
+  ASSERT_EQ(0, rbd_open(ioctx, name.c_str(), &image, NULL));
 
   std::string buffer(1 << order, '1');
-  for (size_t offs = 0; offs < size; offs += buffer.size()) {
-    size_t len = std::min<uint64_t>(buffer.size(), size - offs);
-    ASSERT_EQ(static_cast<ssize_t>(len),
-	      rbd_write(image, offs, len, buffer.c_str()));
-  }
+ 
+  ASSERT_EQ(static_cast<ssize_t>(buffer.size()),
+	    rbd_write(image, 0, buffer.size(), buffer.c_str()));
 
   ASSERT_EQ(0, rbd_invalidate_cache(image));
 
-  buffer.resize(size);
-  ASSERT_EQ(static_cast<ssize_t>(size-1024), rbd_read(image, 1024, size, &buffer[0]));
+  ASSERT_EQ(static_cast<ssize_t>(buffer.size()), 
+  	    rbd_read(image, 0, buffer.size(), &buffer[0]));
 
   ASSERT_EQ(0, rbd_close(image));
 
@@ -2485,6 +2693,44 @@ TEST_F(TestLibRBD, SnapCreateViaLockOwner)
   ASSERT_TRUE(lock_owner);
 }
 
+TEST_F(TestLibRBD, SnapRemoveViaLockOwner)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+
+  bufferlist bl;
+  ASSERT_EQ(0, image1.write(0, 0, bl));
+  ASSERT_EQ(0, image1.snap_create("snap1"));
+
+  bool lock_owner;
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+
+  ASSERT_EQ(0, image2.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_FALSE(lock_owner);
+
+  ASSERT_EQ(0, image2.snap_remove("snap1"));
+  ASSERT_FALSE(image1.snap_exists("snap1"));
+  ASSERT_FALSE(image2.snap_exists("snap1"));
+
+  ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
+  ASSERT_TRUE(lock_owner);
+}
+
 TEST_F(TestLibRBD, FlattenViaLockOwner)
 {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
@@ -2626,6 +2872,239 @@ TEST_F(TestLibRBD, ObjectMapConsistentSnap)
   ASSERT_PASSED(validate_object_map, image1);
 }
 
+TEST_F(TestLibRBD, Metadata)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  string name = get_temp_image_name();
+  uint64_t size = 2 << 20;
+  int order = 0;
+  uint64_t features;
+  string value;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+  map<string, bufferlist> pairs;
+  ASSERT_EQ(0, image1.metadata_list("", 0, &pairs));
+  ASSERT_TRUE(pairs.empty());
+
+  ASSERT_EQ(0, image1.metadata_set("key1", "value1"));
+  ASSERT_EQ(0, image1.metadata_set("key2", "value2"));
+  ASSERT_EQ(0, image1.metadata_get("key1", &value));
+  ASSERT_EQ(0, strcmp("value1", value.c_str()));
+  ASSERT_EQ(0, image1.metadata_list("", 0, &pairs));
+  ASSERT_EQ(2U, pairs.size());
+  ASSERT_EQ(0, strncmp("value1", pairs["key1"].c_str(), 6));
+  ASSERT_EQ(0, strncmp("value2", pairs["key2"].c_str(), 6));
+
+  pairs.clear();
+  ASSERT_EQ(0, image1.metadata_remove("key1"));
+  ASSERT_EQ(0, image1.metadata_remove("key3"));
+  ASSERT_TRUE(image1.metadata_get("key3", &value) < 0);
+  ASSERT_EQ(0, image1.metadata_list("", 0, &pairs));
+  ASSERT_EQ(1U, pairs.size());
+  ASSERT_EQ(0, strncmp("value2", pairs["key2"].c_str(), 6));
+
+  // test metadata with snapshot adding
+  ASSERT_EQ(0, image1.snap_create("snap1"));
+  ASSERT_EQ(0, image1.snap_protect("snap1"));
+  ASSERT_EQ(0, image1.snap_set("snap1"));
+
+  pairs.clear();
+  ASSERT_EQ(0, image1.metadata_set("key1", "value1"));
+  ASSERT_EQ(0, image1.metadata_set("key3", "value3"));
+  ASSERT_EQ(0, image1.metadata_list("", 0, &pairs));
+  ASSERT_EQ(3U, pairs.size());
+  ASSERT_EQ(0, strncmp("value1", pairs["key1"].c_str(), 6));
+  ASSERT_EQ(0, strncmp("value2", pairs["key2"].c_str(), 6));
+  ASSERT_EQ(0, strncmp("value3", pairs["key3"].c_str(), 6));
+
+  ASSERT_EQ(0, image1.snap_set(NULL));
+  ASSERT_EQ(0, image1.metadata_list("", 0, &pairs));
+  ASSERT_EQ(3U, pairs.size());
+  ASSERT_EQ(0, strncmp("value1", pairs["key1"].c_str(), 6));
+  ASSERT_EQ(0, strncmp("value2", pairs["key2"].c_str(), 6));
+  ASSERT_EQ(0, strncmp("value3", pairs["key3"].c_str(), 6));
+
+  // test metadata with cloning
+  string cname = get_temp_image_name();
+  librbd::Image image2;
+  ASSERT_EQ(0, image1.features(&features));
+  EXPECT_EQ(0, rbd.clone(ioctx, name.c_str(), "snap1", ioctx,
+                         cname.c_str(), features, &order));
+  ASSERT_EQ(0, rbd.open(ioctx, image2, cname.c_str(), NULL));
+  ASSERT_EQ(0, image2.metadata_set("key4", "value4"));
+  pairs.clear();
+  ASSERT_EQ(0, image2.metadata_list("", 0, &pairs));
+  ASSERT_EQ(4U, pairs.size());
+  pairs.clear();
+  ASSERT_EQ(0, image1.metadata_list("", 0, &pairs));
+  ASSERT_EQ(3U, pairs.size());
+  ASSERT_EQ(-ENOENT, image1.metadata_get("key4", &value));
+}
+
+TEST_F(TestLibRBD, UpdateFeatures)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 1 << 20;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image;
+  ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+  uint8_t old_format;
+  ASSERT_EQ(0, image.old_format(&old_format));
+  if (old_format) {
+    ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK, true));
+    return;
+  }
+
+  // must provide a single feature
+  ASSERT_EQ(-EINVAL, image.update_features(0, true));
+
+  ASSERT_EQ(0, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK |
+                                       RBD_FEATURE_OBJECT_MAP |
+                                       RBD_FEATURE_FAST_DIFF, false));
+
+  // cannot enable object map w/o exclusive lock
+  ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_OBJECT_MAP, true));
+  ASSERT_EQ(0, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK, true));
+
+  // cannot enable fast diff w/o object map
+  ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_FAST_DIFF, true));
+  ASSERT_EQ(0, image.update_features(RBD_FEATURE_OBJECT_MAP |
+                                       RBD_FEATURE_FAST_DIFF, true));
+
+  uint64_t expected_flags = RBD_FLAG_OBJECT_MAP_INVALID |
+                            RBD_FLAG_FAST_DIFF_INVALID;
+  uint64_t flags;
+  ASSERT_EQ(0, image.get_flags(&flags));
+  ASSERT_EQ(expected_flags, flags);
+
+  // cannot disable object map w/ fast diff
+  ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_OBJECT_MAP, false));
+  ASSERT_EQ(0, image.update_features(RBD_FEATURE_FAST_DIFF, false));
+
+  expected_flags = RBD_FLAG_OBJECT_MAP_INVALID;
+  ASSERT_EQ(0, image.get_flags(&flags));
+  ASSERT_EQ(expected_flags, flags);
+
+  // cannot disable exclusive lock w/ object map
+  ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK, false));
+  ASSERT_EQ(0, image.update_features(RBD_FEATURE_OBJECT_MAP, false));
+
+  ASSERT_EQ(0, image.get_flags(&flags));
+  ASSERT_EQ(0U, flags);
+
+  ASSERT_EQ(0, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK, false));
+}
+
+TEST_F(TestLibRBD, RebuildObjectMap)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 1 << 20;
+  int order = 18;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  PrintProgress prog_ctx;
+  std::string object_map_oid;
+  bufferlist bl;
+  bl.append("foo");
+  {
+    librbd::Image image;
+    ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+    uint64_t features;
+    ASSERT_EQ(0, image.features(&features));
+    if ((features & RBD_FEATURE_OBJECT_MAP) == 0) {
+      ASSERT_EQ(-EINVAL, image.rebuild_object_map(prog_ctx));
+      return;
+    }
+
+    ASSERT_EQ(bl.length(), image.write(0, bl.length(), bl));
+
+    ASSERT_EQ(0, image.snap_create("snap1"));
+    ASSERT_EQ(bl.length(), image.write(1<<order, bl.length(), bl));
+
+    librbd::image_info_t info;
+    ASSERT_EQ(0, image.stat(info, sizeof(info)));
+
+    char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
+    strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
+    prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
+
+    std::string image_id(prefix + strlen(RBD_DATA_PREFIX));
+    object_map_oid = RBD_OBJECT_MAP_PREFIX + image_id;
+  }
+
+  // corrupt the object map
+  ASSERT_EQ(0, ioctx.write(object_map_oid, bl, bl.length(), 0));
+
+  librbd::Image image1;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+
+  uint64_t flags;
+  ASSERT_EQ(0, image1.get_flags(&flags));
+  ASSERT_TRUE((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0);
+
+  ASSERT_EQ(0, image1.rebuild_object_map(prog_ctx));
+
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+
+  bufferlist read_bl;
+  ASSERT_EQ(bl.length(), image2.read(0, bl.length(), read_bl));
+  ASSERT_TRUE(bl.contents_equal(read_bl));
+
+  read_bl.clear();
+  ASSERT_EQ(bl.length(), image2.read(1<<order, bl.length(), read_bl));
+  ASSERT_TRUE(bl.contents_equal(read_bl));
+
+  ASSERT_PASSED(validate_object_map, image1);
+  ASSERT_PASSED(validate_object_map, image2);
+}
+
+TEST_F(TestLibRBD, RebuildNewObjectMap)
+{
+  REQUIRE_FEATURE(RBD_FEATURE_LAYERING);
+
+  rados_ioctx_t ioctx;
+  rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
+
+  std::string name = get_temp_image_name();
+  uint64_t size = 1 << 20;
+  int order = 18;
+  uint64_t features = RBD_FEATURE_EXCLUSIVE_LOCK;
+  ASSERT_EQ(0, create_image_full(ioctx, name.c_str(), size, &order,
+				 false, features));
+
+  rbd_image_t image;
+  ASSERT_EQ(0, rbd_open(ioctx, name.c_str(), &image, NULL));
+  ASSERT_EQ(0, rbd_update_features(image, RBD_FEATURE_OBJECT_MAP, true));
+  ASSERT_EQ(0, rbd_rebuild_object_map(image, print_progress_percent, NULL));
+
+  ASSERT_PASSED(validate_object_map, image);
+
+  ASSERT_EQ(0, rbd_close(image));
+  rados_ioctx_destroy(ioctx);
+}
+
 TEST_F(TestLibRBD, BlockingAIO)
 {
   librados::IoCtx ioctx;
diff --git a/src/test/librbd/test_mock_fixture.cc b/src/test/librbd/test_mock_fixture.cc
new file mode 100644
index 0000000..1839b91
--- /dev/null
+++ b/src/test/librbd/test_mock_fixture.cc
@@ -0,0 +1,68 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librados_test_stub/LibradosTestStub.h"
+#include "test/librados_test_stub/MockTestMemRadosClient.h"
+
+// template definitions
+#include "librbd/AsyncRequest.cc"
+#include "librbd/AsyncObjectThrottle.cc"
+
+template class librbd::AsyncRequest<librbd::MockImageCtx>;
+template class librbd::AsyncObjectThrottle<librbd::MockImageCtx>;
+
+using ::testing::_;
+using ::testing::DoDefault;
+
+TestMockFixture::TestRadosClientPtr TestMockFixture::s_test_rados_client;
+::testing::NiceMock<librados::MockTestMemRadosClient> *
+  TestMockFixture::s_mock_rados_client = NULL;
+
+void TestMockFixture::SetUpTestCase() {
+  s_test_rados_client = librados_test_stub::get_rados_client();
+
+  // use a mock version of the in-memory rados client
+  s_mock_rados_client = new ::testing::NiceMock<librados::MockTestMemRadosClient>(
+      s_test_rados_client->cct());
+  librados_test_stub::set_rados_client(TestRadosClientPtr(s_mock_rados_client));
+  TestFixture::SetUpTestCase();
+}
+
+void TestMockFixture::TearDownTestCase() {
+  TestFixture::TearDownTestCase();
+  librados_test_stub::set_rados_client(s_test_rados_client);
+}
+
+void TestMockFixture::SetUp() {
+  TestFixture::SetUp();
+}
+
+void TestMockFixture::TearDown() {
+  TestFixture::TearDown();
+
+  // Mock rados client lives across tests -- reset it to initial state
+  ::testing::Mock::VerifyAndClear(s_mock_rados_client);
+  s_mock_rados_client->default_to_dispatch();
+}
+
+void TestMockFixture::expect_unlock_exclusive_lock(librbd::ImageCtx &ictx) {
+  EXPECT_CALL(get_mock_io_ctx(ictx.md_ctx),
+              exec(_, _, "lock", "unlock", _, _, _))
+                .WillRepeatedly(DoDefault());
+}
+
+void TestMockFixture::expect_op_work_queue(librbd::MockImageCtx &mock_image_ctx) {
+  EXPECT_CALL(*mock_image_ctx.op_work_queue, queue(_, _))
+                .WillRepeatedly(DispatchContext(
+                  mock_image_ctx.image_ctx->op_work_queue));
+}
+
+librados::MockTestMemIoCtxImpl &TestMockFixture::get_mock_io_ctx(
+    librados::IoCtx &ioctx) {
+  // TODO become friend of IoCtx so that we can cleanly extract io_ctx_impl
+  librados::MockTestMemIoCtxImpl **mock =
+    reinterpret_cast<librados::MockTestMemIoCtxImpl **>(&ioctx);
+  return **mock;
+}
diff --git a/src/test/librbd/test_mock_fixture.h b/src/test/librbd/test_mock_fixture.h
new file mode 100644
index 0000000..150e312
--- /dev/null
+++ b/src/test/librbd/test_mock_fixture.h
@@ -0,0 +1,64 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TEST_LIBRBD_TEST_MOCK_FIXTURE_H
+#define CEPH_TEST_LIBRBD_TEST_MOCK_FIXTURE_H
+
+#include "test/librbd/test_fixture.h"
+#include "common/WorkQueue.h"
+#include <boost/shared_ptr.hpp>
+#include <gmock/gmock.h>
+
+namespace librados {
+class TestRadosClient;
+class MockTestMemIoCtxImpl;
+class MockTestMemRadosClient;
+}
+namespace librbd {
+class MockImageCtx;
+}
+
+ACTION_P2(CompleteContext, r, wq) {
+  ContextWQ *context_wq = reinterpret_cast<ContextWQ *>(wq);
+  if (context_wq != NULL) {
+    context_wq->queue(arg0, r);
+  } else {
+    arg0->complete(r);
+  }
+}
+
+ACTION_P(DispatchContext, wq) {
+  wq->queue(arg0, arg1);
+}
+
+ACTION_P(GetReference, ref_object) {
+  ref_object->get();
+}
+
+MATCHER_P(ContentsEqual, bl, "") {
+  // TODO fix const-correctness of bufferlist
+  return const_cast<bufferlist &>(arg).contents_equal(
+    const_cast<bufferlist &>(bl));
+}
+
+class TestMockFixture : public TestFixture {
+public:
+  typedef boost::shared_ptr<librados::TestRadosClient> TestRadosClientPtr;
+
+  static void SetUpTestCase();
+  static void TearDownTestCase();
+
+  virtual void SetUp();
+  virtual void TearDown();
+
+  librados::MockTestMemIoCtxImpl &get_mock_io_ctx(librados::IoCtx &ioctx);
+
+  void expect_op_work_queue(librbd::MockImageCtx &mock_image_ctx);
+  void expect_unlock_exclusive_lock(librbd::ImageCtx &ictx);
+
+private:
+  static TestRadosClientPtr s_test_rados_client;
+  static ::testing::NiceMock<librados::MockTestMemRadosClient> *s_mock_rados_client;
+};
+
+#endif // CEPH_TEST_LIBRBD_TEST_MOCK_FIXTURE_H
diff --git a/src/test/messenger/message_helper.h b/src/test/messenger/message_helper.h
index cac1944..769c9b4 100644
--- a/src/test/messenger/message_helper.h
+++ b/src/test/messenger/message_helper.h
@@ -47,7 +47,7 @@ static inline Message* new_ping_monstyle(const char *tag, int mult)
 #if defined(HAVE_XIO)
 extern struct xio_mempool *xio_msgr_mpool;
 
-void xio_hook_func(struct xio_mempool_obj *mp)
+void xio_hook_func(struct xio_reg_mem *mp)
 {
   xio_mempool_free(mp);
 }
@@ -63,7 +63,7 @@ static inline Message* new_ping_with_data(const char *tag, uint32_t size)
   bufferlist bl;
   void *p;
 
-  struct xio_mempool_obj *mp = m->get_mp();
+  struct xio_reg_mem *mp = m->get_mp();
   int e = xio_mempool_alloc(xio_msgr_mpool, size, mp);
   assert(e == 0);
   p = mp->addr;
diff --git a/src/test/messenger/xio_client.cc b/src/test/messenger/xio_client.cc
index 29f807f..414668f 100644
--- a/src/test/messenger/xio_client.cc
+++ b/src/test/messenger/xio_client.cc
@@ -117,7 +117,7 @@ int main(int argc, const char **argv)
 	messenger = new XioMessenger(g_ceph_context,
 				     entity_name_t::MON(-1),
 				     "xio_client",
-				     0 /* nonce */,
+				     0 /* nonce */, XIO_ALL_FEATURES,
 				     dstrategy);
 
 	// enable timing prints
diff --git a/src/test/messenger/xio_dispatcher.h b/src/test/messenger/xio_dispatcher.h
index fba356d..f8b76d3 100644
--- a/src/test/messenger/xio_dispatcher.h
+++ b/src/test/messenger/xio_dispatcher.h
@@ -76,7 +76,7 @@ public:
   virtual void ms_handle_remote_reset(Connection *con);
 
   /**
-   * @defgroup Authentication
+   * @defgroup test_xio_dispatcher_h_auth Authentication
    * @{
    */
   /**
diff --git a/src/test/messenger/xio_server.cc b/src/test/messenger/xio_server.cc
index ee74b79..039aa7b 100644
--- a/src/test/messenger/xio_server.cc
+++ b/src/test/messenger/xio_server.cc
@@ -88,7 +88,7 @@ int main(int argc, const char **argv)
 	messenger = new XioMessenger(g_ceph_context,
 				     entity_name_t::MON(-1),
 				     "xio_server",
-				     0 /* nonce */,
+				     0 /* nonce */, XIO_ALL_FEATURES,
 				     dstrategy);
 
 	static_cast<XioMessenger*>(messenger)->set_magic(
diff --git a/src/test/mon/PGMap.cc b/src/test/mon/PGMap.cc
index 9f7a6b2..f13fa89 100644
--- a/src/test/mon/PGMap.cc
+++ b/src/test/mon/PGMap.cc
@@ -81,7 +81,40 @@ TEST(pgmap, min_last_epoch_clean)
 
 }
 
-
+TEST(pgmap, calc_stats)
+{
+  bufferlist bl;
+  {
+    PGMap pg_map;
+    PGMap::Incremental inc;
+    osd_stat_t os;
+    pg_stat_t ps;
+
+    ps.last_epoch_clean = 999;
+    inc.pg_stat_updates[pg_t(9,9)] = ps;
+    inc.version = 1;
+    inc.update_stat(0, 123, os);
+    pg_map.apply_incremental(g_ceph_context, inc);
+    ASSERT_EQ(123u, pg_map.get_min_last_epoch_clean());
+    pg_map.encode(bl);
+  }
+  {
+    PGMap pg_map;
+    PGMap::Incremental inc;
+    osd_stat_t os;
+    pg_stat_t ps;
+
+    ps.last_epoch_clean = 999;
+    inc.pg_stat_updates[pg_t(9,9)] = ps;
+    inc.version = 1;
+    inc.update_stat(0, 321, os);
+    pg_map.apply_incremental(g_ceph_context, inc);
+    ASSERT_EQ(321u, pg_map.get_min_last_epoch_clean());
+    bufferlist::iterator p = bl.begin();
+    ::decode(pg_map, p);
+    ASSERT_EQ(123u, pg_map.get_min_last_epoch_clean());
+  }
+}
 
 int main(int argc, char **argv) {
   vector<const char*> args;
diff --git a/src/test/mon/misc.sh b/src/test/mon/misc.sh
index 0351bd4..d2e5dbd 100755
--- a/src/test/mon/misc.sh
+++ b/src/test/mon/misc.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -15,49 +15,49 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-source test/mon/mon-test-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7102"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
 
-    setup $dir || return 1
-    run_mon $dir a --public-addr $CEPH_MON
-    FUNCTIONS=${FUNCTIONS:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
-    for TEST_function in $FUNCTIONS ; do
-        if ! $TEST_function $dir ; then
-            cat $dir/a/log
-            return 1
-        fi
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        $func $dir || return 1
     done
-    teardown $dir || return 1
 }
 
 TEST_POOL=rbd
 
 function TEST_osd_pool_get_set() {
-    local dir=$1 flag
-    for flag in hashpspool nodelete nopgchange nosizechange; do
+    local dir=$1
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+
+    local flag
+    for flag in hashpspool nodelete nopgchange nosizechange write_fadvise_dontneed noscrub nodeep-scrub; do
         if [ $flag = hashpspool ]; then
-	    ./ceph osd dump | grep 'pool 0' | grep $flag || return 1
+	    ./ceph osd dump | grep 'pool ' | grep $flag || return 1
         else
-	    ! ./ceph osd dump | grep 'pool 0' | grep $flag || return 1
+	    ! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
         fi
 	./ceph osd pool set $TEST_POOL $flag 0 || return 1
-	! ./ceph osd dump | grep 'pool 0' | grep $flag || return 1
+	! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
 	./ceph osd pool set $TEST_POOL $flag 1 || return 1
-	./ceph osd dump | grep 'pool 0' | grep $flag || return 1
+	./ceph osd dump | grep 'pool ' | grep $flag || return 1
 	./ceph osd pool set $TEST_POOL $flag false || return 1
-	! ./ceph osd dump | grep 'pool 0' | grep $flag || return 1
+	! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
 	./ceph osd pool set $TEST_POOL $flag false || return 1
         # check that setting false twice does not toggle to true (bug)
-	! ./ceph osd dump | grep 'pool 0' | grep $flag || return 1
+	! ./ceph osd dump | grep 'pool ' | grep $flag || return 1
 	./ceph osd pool set $TEST_POOL $flag true || return 1
-	./ceph osd dump | grep 'pool 0' | grep $flag || return 1
+	./ceph osd dump | grep 'pool ' | grep $flag || return 1
 	# cleanup
 	./ceph osd pool set $TEST_POOL $flag 0 || return 1
     done
@@ -82,9 +82,43 @@ function TEST_osd_pool_get_set() {
     ! ./ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1
     ! ./ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1
 
+    teardown $dir || return 1
+}
+
+function TEST_mon_add_to_single_mon() {
+    local dir=$1
+
+    fsid=$(uuidgen)
+    MONA=127.0.0.1:7117
+    MONB=127.0.0.1:7118
+    CEPH_ARGS_orig=$CEPH_ARGS
+    CEPH_ARGS="--fsid=$fsid --auth-supported=none "
+    CEPH_ARGS+="--mon-initial-members=a "
+    CEPH_ARGS+="--mon-host=$MONA "
+
+    setup $dir || return 1
+    run_mon $dir a --public-addr $MONA || return 1
+    # wait for the quorum
+    timeout 120 ceph -s > /dev/null || return 1
+    run_mon $dir b --public-addr $MONB || return 1
+    teardown $dir || return 1
+
+    setup $dir || return 1
+    run_mon $dir a --public-addr $MONA || return 1
+    # without the fix of #5454, mon.a will assert failure at seeing the MMonJoin
+    # from mon.b
+    run_mon $dir b --public-addr $MONB || return 1
+    # wait for the quorum
+    timeout 120 ceph -s > /dev/null || return 1
+    local num_mons
+    num_mons=$(ceph mon dump --format=xml 2>/dev/null | $XMLSTARLET sel -t -v "count(//mons/mon)") || return 1
+    [ $num_mons == 2 ] || return 1
+    # no reason to take more than 120 secs to get this submitted
+    timeout 120 ceph mon add b $MONB || return 1
+    teardown $dir || return 1
 }
 
-main misc
+main misc "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/mon/misc.sh"
diff --git a/src/test/mon/mkfs.sh b/src/test/mon/mkfs.sh
index 13b9a1b..68208b2 100755
--- a/src/test/mon/mkfs.sh
+++ b/src/test/mon/mkfs.sh
@@ -16,7 +16,7 @@
 # GNU Library Public License for more details.
 #
 set -xe
-PS4='${FUNCNAME[0]}: $LINENO: '
+PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}:  '
 
 DIR=mkfs
 export CEPH_CONF=/dev/null
@@ -42,7 +42,7 @@ function mon_mkfs() {
     ./ceph-mon \
         --id $MON_ID \
         --fsid $fsid \
-        --osd-pool-default-erasure-code-directory=.libs \
+        --erasure-code-dir=.libs \
         --mkfs \
         --mon-data=$MON_DIR \
         --mon-initial-members=$MON_ID \
@@ -56,7 +56,7 @@ function mon_run() {
         --chdir= \
         --mon-osd-full-ratio=.99 \
         --mon-data-avail-crit=1 \
-        --osd-pool-default-erasure-code-directory=.libs \
+        --erasure-code-dir=.libs \
         --mon-data=$MON_DIR \
         --log-file=$MON_DIR/log \
         --mon-cluster-log-file=$MON_DIR/log \
@@ -83,7 +83,7 @@ function auth_none() {
         --id $MON_ID \
         --mon-osd-full-ratio=.99 \
         --mon-data-avail-crit=1 \
-        --osd-pool-default-erasure-code-directory=.libs \
+        --erasure-code-dir=.libs \
         --mon-data=$MON_DIR \
         --extract-monmap $MON_DIR/monmap
 
@@ -150,7 +150,7 @@ function makedir() {
         --id $MON_ID \
         --mon-osd-full-ratio=.99 \
         --mon-data-avail-crit=1 \
-        --osd-pool-default-erasure-code-directory=.libs \
+        --erasure-code-dir=.libs \
         --mkfs \
         --mon-data=$toodeep 2>&1 | tee $DIR/makedir.log
     grep 'toodeep.*No such file' $DIR/makedir.log > /dev/null
diff --git a/src/test/mon/mon-handle-forward.sh b/src/test/mon/mon-handle-forward.sh
index aefac9e..18f6db5 100755
--- a/src/test/mon/mon-handle-forward.sh
+++ b/src/test/mon/mon-handle-forward.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # Copyright (C) 2013 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014,2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -15,43 +15,50 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-source test/mon/mon-test-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
 
-    PORT=7300 # CEPH_MON=
-    MONA=127.0.0.1:$PORT
-    MONB=127.0.0.1:$(($PORT + 1))
+    setup $dir || return 1
+
+    MONA=127.0.0.1:7300
+    MONB=127.0.0.1:7301
     (
         FSID=$(uuidgen) 
         export CEPH_ARGS
         CEPH_ARGS+="--fsid=$FSID --auth-supported=none "
         CEPH_ARGS+="--mon-initial-members=a,b --mon-host=$MONA,$MONB "
-        run_mon $dir a --public-addr $MONA
-        run_mon $dir b --public-addr $MONB
+        run_mon $dir a --public-addr $MONA || return 1
+        run_mon $dir b --public-addr $MONB || return 1
     )
 
     timeout 360 ./ceph --mon-host $MONA mon stat || return 1
     # check that MONB is indeed a peon
-    ./ceph --admin-daemon $dir/b/ceph-mon.b.asok mon_status | 
+    ./ceph --admin-daemon $dir/ceph-mon.b.asok mon_status |
        grep '"peon"' || return 1
     # when the leader ( MONA ) is used, there is no message forwarding
     ./ceph --mon-host $MONA osd pool create POOL1 12 
-    grep 'mon_command(.*"POOL1"' $dir/a/log 
-    grep 'mon_command(.*"POOL1"' $dir/b/log && return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    grep 'mon_command(.*"POOL1"' $dir/a/mon.a.log
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.b.asok log flush || return 1
+    grep 'mon_command(.*"POOL1"' $dir/mon.b.log && return 1
     # when the peon ( MONB ) is used, the message is forwarded to the leader
-    ./ceph --mon-host $MONB osd pool create POOL2 12 
-    grep 'forward_request.*mon_command(.*"POOL2"' $dir/b/log
-    grep ' forward(mon_command(.*"POOL2"' $dir/a/log 
+    ./ceph --mon-host $MONB osd pool create POOL2 12
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.b.asok log flush || return 1
+    grep 'forward_request.*mon_command(.*"POOL2"' $dir/mon.b.log
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    grep ' forward(mon_command(.*"POOL2"' $dir/mon.a.log
     # forwarded messages must retain features from the original connection
     features=$(sed -n -e 's|.*127.0.0.1:0.*accept features \([0-9][0-9]*\)|\1|p' < \
-        $dir/b/log)
-    grep ' forward(mon_command(.*"POOL2".*con_features '$features $dir/a/log
+        $dir/mon.b.log)
+    grep ' forward(mon_command(.*"POOL2".*con_features '$features $dir/mon.a.log
+
+    teardown $dir || return 1
 }
 
-main mon-handle-forward
+main mon-handle-forward "$@"
 
 # Local Variables:
-# compile-command: "cd ../.. ; make TESTS=test/mon/mon-handle-forward.sh check"
+# compile-command: "cd ../.. ; make -j4 TESTS=test/mon/mon-handle-forward.sh check"
 # End:
diff --git a/src/test/mon/mon-ping.sh b/src/test/mon/mon-ping.sh
new file mode 100755
index 0000000..e3f7395
--- /dev/null
+++ b/src/test/mon/mon-ping.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 SUSE LINUX GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+source ../qa/workunits/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7119"
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_mon_ping() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    ./ceph ping mon.a || return 1
+}
+
+main mon-ping "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/mon-ping.sh"
+# End:
diff --git a/src/test/mon/mon-scrub.sh b/src/test/mon/mon-scrub.sh
new file mode 100755
index 0000000..eb33bbc
--- /dev/null
+++ b/src/test/mon/mon-scrub.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
+#
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+source ../qa/workunits/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7120"
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_mon_scrub() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+
+    ./ceph mon scrub || return 1
+}
+
+main mon-scrub "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/mon-scrub.sh"
+# End:
diff --git a/src/test/mon/mon-test-helpers.sh b/src/test/mon/mon-test-helpers.sh
deleted file mode 100644
index 02ab03a..0000000
--- a/src/test/mon/mon-test-helpers.sh
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/bin/bash
-#
-# Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
-#
-# Author: Loic Dachary <loic at dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Library Public License for more details.
-#
-source test/test_btrfs_common.sh
-
-function setup() {
-    local dir=$1
-    teardown $dir
-    mkdir -p $dir
-}
-
-function teardown() {
-    local dir=$1
-    kill_daemons $dir
-    if [ $(stat -f -c '%T' .) == "btrfs" ]; then
-        teardown_btrfs $dir
-    fi
-    rm -fr $dir
-}
-
-function run_mon() {
-    local dir=$1
-    shift
-    local id=$1
-    shift
-    dir+=/$id
-    
-    ./ceph-mon \
-        --id $id \
-        --mkfs \
-        --mon-data=$dir --run-dir=$dir \
-        "$@"
-
-    ./ceph-mon \
-        --id $id \
-        --mon-osd-full-ratio=.99 \
-        --mon-data-avail-crit=1 \
-        --paxos-propose-interval=0.1 \
-        --osd-crush-chooseleaf-type=0 \
-        --osd-pool-default-erasure-code-directory=.libs \
-        --debug-mon 20 \
-        --debug-ms 20 \
-        --debug-paxos 20 \
-        --chdir= \
-        --mon-data=$dir \
-        --log-file=$dir/log \
-        --mon-cluster-log-file=$dir/log \
-        --run-dir=$dir \
-        --pid-file=$dir/\$name.pid \
-        "$@"
-}
-
-function kill_daemons() {
-    local dir=$1
-    for pidfile in $(find $dir | grep '\.pid') ; do
-        pid=$(cat $pidfile)
-        signal=9
-        for try in 0 1 1 1 2 3 ; do
-            kill -$signal $pid 2> /dev/null || break
-            signal=0
-            sleep $try
-        done
-    done
-}
-
-function call_TEST_functions() {
-    local dir=$1
-    shift
-    local id=$1
-    shift
-
-    setup $dir || return 1
-    run_mon $dir $id "$@"
-    SHARE_MON_FUNCTIONS=${SHARE_MON_FUNCTIONS:-$(set | sed -n -e 's/^\(SHARE_MON_TEST_[0-9a-z_]*\) .*/\1/p')}
-    for TEST_function in $SHARE_MON_FUNCTIONS ; do
-        if ! $TEST_function $dir $id ; then
-            cat $dir/$id/log
-            return 1
-        fi
-    done
-    teardown $dir || return 1
-
-    FUNCTIONS=${FUNCTIONS:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
-    for TEST_function in $FUNCTIONS ; do
-        setup $dir || return 1
-        $TEST_function $dir || return 1
-        teardown $dir || return 1
-    done
-}
-
-function main() {
-    local dir=testdir/$1
-
-    export PATH=:$PATH # make sure program from sources are prefered
-
-    PS4='${FUNCNAME[0]}: $LINENO: '
-    export CEPH_CONF=/dev/null
-    unset CEPH_ARGS
-
-    set -x
-    setup $dir || return 1
-    local code
-    if run $dir ; then
-        code=0
-    else
-        code=1
-    fi
-    teardown $dir || return 1
-    return $code
-}
diff --git a/src/test/mon/osd-crush.sh b/src/test/mon/osd-crush.sh
index 2bf477f..1c2adff 100755
--- a/src/test/mon/osd-crush.sh
+++ b/src/test/mon/osd-crush.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -15,30 +15,30 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-source test/ceph-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7104"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
 
-    FUNCTIONS=${FUNCTIONS:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
-    for TEST_function in $FUNCTIONS ; do
-	setup $dir || return 1
-	run_mon $dir a --public-addr $CEPH_MON
-	if ! $TEST_function $dir ; then
-	  cat $dir/mon.a.log
-	  return 1
-	fi
-	teardown $dir || return 1
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
     done
 }
 
 function TEST_crush_rule_create_simple() {
     local dir=$1
+
+    run_mon $dir a || return 1
+
     ./ceph --format xml osd crush rule dump replicated_ruleset | \
         egrep '<op>take</op><item>[^<]+</item><item_name>default</item_name>' | \
         grep '<op>choose_firstn</op><num>0</num><type>osd</type>' || return 1
@@ -57,6 +57,9 @@ function TEST_crush_rule_create_simple() {
 
 function TEST_crush_rule_dump() {
     local dir=$1
+
+    run_mon $dir a || return 1
+
     local ruleset=ruleset1
     ./ceph osd crush rule create-erasure $ruleset || return 1
     local expected
@@ -70,6 +73,9 @@ function TEST_crush_rule_dump() {
 
 function TEST_crush_rule_rm() {
     local ruleset=erasure2
+
+    run_mon $dir a || return 1
+
     ./ceph osd crush rule create-erasure $ruleset default || return 1
     ./ceph osd crush rule ls | grep $ruleset || return 1
     ./ceph osd crush rule rm $ruleset || return 1
@@ -78,6 +84,8 @@ function TEST_crush_rule_rm() {
 
 function TEST_crush_rule_create_erasure() {
     local dir=$1
+
+    run_mon $dir a || return 1
     # should have at least one OSD
     run_osd $dir 0 || return 1
 
@@ -107,7 +115,7 @@ function TEST_crush_rule_create_erasure() {
     ! ./ceph osd erasure-code-profile ls | grep default || return 1
     ./ceph osd crush rule create-erasure $ruleset || return 1
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
-    grep 'profile default set' $dir/mon.a.log || return 1
+    grep 'profile set default' $dir/mon.a.log || return 1
     ./ceph osd erasure-code-profile ls | grep default || return 1
     ./ceph osd crush rule rm $ruleset || return 1
     ! ./ceph osd crush rule ls | grep $ruleset || return 1
@@ -149,6 +157,9 @@ function generate_manipulated_rules() {
 
 function TEST_crush_ruleset_match_rule_when_creating() {
     local dir=$1
+
+    run_mon $dir a || return 1
+
     local root=host1
 
     generate_manipulated_rules $dir
@@ -162,6 +173,9 @@ function TEST_crush_ruleset_match_rule_when_creating() {
 
 function TEST_add_ruleset_failed() {
     local dir=$1
+
+    run_mon $dir a || return 1
+
     local root=host1
 
     ./ceph osd crush add-bucket $root host
@@ -192,6 +206,8 @@ EOF
 function TEST_crush_rename_bucket() {
     local dir=$1
 
+    run_mon $dir a || return 1
+
     ./ceph osd crush add-bucket host1 host
     ! ./ceph osd tree | grep host2 || return 1
     ./ceph osd crush rename-bucket host1 host2 || return 1
@@ -202,6 +218,7 @@ function TEST_crush_rename_bucket() {
 
 function TEST_crush_reject_empty() {
     local dir=$1
+    run_mon $dir a || return 1
     # should have at least one OSD
     run_osd $dir 0 || return 1
 
@@ -213,11 +230,55 @@ function TEST_crush_reject_empty() {
 }
 
 function TEST_crush_tree() {
+    local dir=$1
+    run_mon $dir a || return 1
+
     ./ceph osd crush tree --format=xml | \
         $XMLSTARLET val -e -r test/mon/osd-crush-tree.rng - || return 1
 }
 
-main osd-crush 
+# NB: disable me if i am too time consuming
+function TEST_crush_repair_faulty_crushmap() {
+    local dir=$1
+    fsid=$(uuidgen)
+    MONA=127.0.0.1:7113
+    MONB=127.0.0.1:7114
+    MONC=127.0.0.1:7115
+    CEPH_ARGS_orig=$CEPH_ARGS
+    CEPH_ARGS="--fsid=$fsid --auth-supported=none "
+    CEPH_ARGS+="--mon-initial-members=a,b,c "
+    CEPH_ARGS+="--mon-host=$MONA,$MONB,$MONC "
+    run_mon $dir a --public-addr $MONA || return 1
+    run_mon $dir b --public-addr $MONB || return 1
+    run_mon $dir c --public-addr $MONC || return 1
+
+    local empty_map=$dir/empty_map
+    :> $empty_map.txt
+    ./crushtool -c $empty_map.txt -o $empty_map.map || return 1
+
+    local crushtool_path_old=`ceph-conf --show-config-value crushtool`
+    ceph tell mon.* injectargs --crushtool "true"
+
+    ceph osd setcrushmap -i $empty_map.map || return 1
+    # should be an empty crush map without any buckets
+    ! test $(ceph osd crush dump --format=xml | \
+           $XMLSTARLET sel -t -m "//buckets/bucket" -v .) || return 1
+    # bring them down, the "ceph" commands will try to hunt for other monitor in
+    # vain, after mon.a is offline
+    kill_daemons $dir || return 1
+    # rewrite the monstore with the good crush map,
+    ./tools/ceph-monstore-update-crush.sh --rewrite $dir/a || return 1
+
+    run_mon $dir a --public-addr $MONA || return 1
+    run_mon $dir b --public-addr $MONB || return 1
+    run_mon $dir c --public-addr $MONC || return 1
+    # the buckets are back
+    test $(ceph osd crush dump --format=xml | \
+           $XMLSTARLET sel -t -m "//buckets/bucket" -v .) || return 1
+    CEPH_ARGS=$CEPH_ARGS_orig
+}
+
+main osd-crush "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/mon/osd-crush.sh"
diff --git a/src/test/mon/osd-erasure-code-profile.sh b/src/test/mon/osd-erasure-code-profile.sh
index 6cac118..27be346 100755
--- a/src/test/mon/osd-erasure-code-profile.sh
+++ b/src/test/mon/osd-erasure-code-profile.sh
@@ -15,24 +15,31 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-source test/mon/mon-test-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7108"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
 
-    local id=a
-    call_TEST_functions $dir $id --public-addr $CEPH_MON || return 1
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
 }
 
-function SHARE_MON_TEST_set() {
+function TEST_set() {
     local dir=$1
     local id=$2
 
+    run_mon $dir a || return 1
+
     local profile=myprofile
     #
     # no key=value pairs : use the default configuration
@@ -60,10 +67,12 @@ function SHARE_MON_TEST_set() {
     ./ceph osd erasure-code-profile rm $profile # cleanup
 }
 
-function SHARE_MON_TEST_ls() {
+function TEST_ls() {
     local dir=$1
     local id=$2
 
+    run_mon $dir a || return 1
+
     local profile=myprofile
     ! ./ceph osd erasure-code-profile ls | grep $profile || return 1
     ./ceph osd erasure-code-profile set $profile 2>&1 || return 1
@@ -74,10 +83,12 @@ function SHARE_MON_TEST_ls() {
     ./ceph osd erasure-code-profile rm $profile # cleanup
 }
 
-function SHARE_MON_TEST_rm() {
+function TEST_rm() {
     local dir=$1
     local id=$2
 
+    run_mon $dir a || return 1
+
     local profile=myprofile
     ./ceph osd erasure-code-profile set $profile 2>&1 || return 1
     ./ceph osd erasure-code-profile ls | grep $profile || return 1
@@ -96,10 +107,12 @@ function SHARE_MON_TEST_rm() {
     ./ceph osd erasure-code-profile rm $profile # cleanup
 }
 
-function SHARE_MON_TEST_get() {
+function TEST_get() {
     local dir=$1
     local id=$2
 
+    run_mon $dir a || return 1
+
     local default_profile=default
     ./ceph osd erasure-code-profile get $default_profile | \
         grep plugin=jerasure || return 1
@@ -109,21 +122,11 @@ function SHARE_MON_TEST_get() {
     grep -q "unknown erasure code profile 'WRONG'" $dir/out || return 1
 }
 
-function SHARE_MON_TEST_experimental_shec() {
-    local dir=$1
-    local id=$2
-
-    local profile=shec-profile
-
-    ! ./ceph osd erasure-code-profile set $profile plugin=shec > $dir/out 2>&1 || return 1
-    grep "experimental feature 'shec'" $dir/out || return 1
-    ! ./ceph osd erasure-code-profile ls | grep $profile || return 1
-}
-
-function SHARE_MON_TEST_set_idempotent() {
+function TEST_set_idempotent() {
     local dir=$1
     local id=$2
 
+    run_mon $dir a || return 1
     #
     # The default profile is set using a code path different from 
     # ceph osd erasure-code-profile set: verify that it is idempotent,
@@ -160,8 +163,8 @@ function TEST_format_invalid() {
     local profile=profile
     # osd_pool_default_erasure-code-profile is
     # valid JSON but not of the expected type
-    run_mon $dir a --public-addr $CEPH_MON \
-        --osd_pool_default_erasure-code-profile 1
+    run_mon $dir a \
+        --osd_pool_default_erasure-code-profile 1 || return 1
     ! ./ceph osd erasure-code-profile set $profile > $dir/out 2>&1 || return 1
     cat $dir/out
     grep 'must be a JSON object' $dir/out || return 1
@@ -172,8 +175,8 @@ function TEST_format_json() {
 
     # osd_pool_default_erasure-code-profile is JSON
     expected='"plugin":"example"'
-    run_mon $dir a --public-addr $CEPH_MON \
-        --osd_pool_default_erasure-code-profile "{$expected}"
+    run_mon $dir a \
+        --osd_pool_default_erasure-code-profile "{$expected}" || return 1
     ./ceph --format json osd erasure-code-profile get default | \
         grep "$expected" || return 1
 }
@@ -183,13 +186,43 @@ function TEST_format_plain() {
 
     # osd_pool_default_erasure-code-profile is plain text
     expected='"plugin":"example"'
-    run_mon $dir a --public-addr $CEPH_MON \
-        --osd_pool_default_erasure-code-profile "plugin=example"
+    run_mon $dir a \
+        --osd_pool_default_erasure-code-profile "plugin=example" || return 1
     ./ceph --format json osd erasure-code-profile get default | \
         grep "$expected" || return 1
 }
 
-main osd-erasure-code-profile
+function TEST_profile_k_sanity() {
+    local dir=$1
+    local profile=profile-sanity
+
+    run_mon $dir a || return 1
+
+    expect_failure $dir 'k must be a multiple of (k + m) / l' \
+        ./ceph osd erasure-code-profile set $profile \
+        plugin=lrc \
+        l=1 \
+        k=1 \
+        m=1 || return 1
+
+    if erasure_code_plugin_exists isa ; then
+        expect_failure $dir 'k=1 must be >= 2' \
+            ./ceph osd erasure-code-profile set $profile \
+            plugin=isa \
+            k=1 \
+            m=1 || return 1
+    else
+        echo "SKIP because plugin isa has not been built"
+    fi
+
+    expect_failure $dir 'k=1 must be >= 2' \
+        ./ceph osd erasure-code-profile set $profile \
+        plugin=jerasure \
+        k=1 \
+        m=1 || return 1
+}
+
+main osd-erasure-code-profile "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/mon/osd-erasure-code-profile.sh"
diff --git a/src/test/mon/osd-pool-create.sh b/src/test/mon/osd-pool-create.sh
index 8a57856..e7d1be6 100755
--- a/src/test/mon/osd-pool-create.sh
+++ b/src/test/mon/osd-pool-create.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
-# Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2013, 2014 Cloudwatt <libre.licensing at cloudwatt.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -15,26 +15,21 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-source test/mon/mon-test-helpers.sh
-
-function expect_false()
-{
-    set -x
-    if "$@"; then return 1; else return 0; fi
-}
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7105"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
 
-    FUNCTIONS=${FUNCTIONS:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
-    for TEST_function in $FUNCTIONS ; do
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
         setup $dir || return 1
-        $TEST_function $dir || return 1
+        $func $dir || return 1
         teardown $dir || return 1
     done
 }
@@ -42,42 +37,45 @@ function run() {
 function TEST_default_deprectated_0() {
     local dir=$1
     # explicitly set the default crush rule
-    expected=66
-    run_mon $dir a --public-addr $CEPH_MON \
-        --osd_pool_default_crush_replicated_ruleset $expected
-    ./ceph --format json osd dump | grep '"crush_ruleset":'$expected
-    CEPH_ARGS='' ./ceph --admin-daemon $dir/a/ceph-mon.a.asok log flush || return 1
-    ! grep "osd_pool_default_crush_rule is deprecated " $dir/a/log || return 1
+    local expected=66
+    run_mon $dir a \
+        --osd_pool_default_crush_replicated_ruleset $expected || return 1
+    ./ceph osd pool get rbd crush_ruleset | grep 'ruleset: '$expected || return 1
+    ./ceph osd crush rule dump replicated_ruleset | grep '"ruleset": '$expected || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    ! grep "osd_pool_default_crush_rule is deprecated " $dir/mon.a.log || return 1
 }
 
 function TEST_default_deprectated_1() {
     local dir=$1
     # explicitly set the default crush rule using deprecated option
-    expected=55
-    run_mon $dir a --public-addr $CEPH_MON \
-        --osd_pool_default_crush_rule $expected
-    ./ceph --format json osd dump | grep '"crush_ruleset":'$expected
-    CEPH_ARGS='' ./ceph --admin-daemon $dir/a/ceph-mon.a.asok log flush || return 1
-    grep "osd_pool_default_crush_rule is deprecated " $dir/a/log || return 1
+    local expected=55
+    run_mon $dir a \
+        --osd_pool_default_crush_rule $expected || return 1
+    ./ceph osd pool get rbd crush_ruleset | grep 'ruleset: '$expected || return 1
+    ./ceph osd crush rule dump replicated_ruleset | grep '"ruleset": '$expected || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    grep "osd_pool_default_crush_rule is deprecated " $dir/mon.a.log || return 1
 }
 
 function TEST_default_deprectated_2() {
     local dir=$1
-    expected=77
-    unexpected=33
-    run_mon $dir a --public-addr $CEPH_MON \
+    local expected=77
+    local unexpected=33
+    run_mon $dir a \
         --osd_pool_default_crush_rule $expected \
-        --osd_pool_default_crush_replicated_ruleset $unexpected
-    ./ceph --format json osd dump | grep '"crush_ruleset":'$expected
+        --osd_pool_default_crush_replicated_ruleset $unexpected || return 1
+    ./ceph osd pool get rbd crush_ruleset | grep 'ruleset: '$expected || return 1
     ! ./ceph --format json osd dump | grep '"crush_ruleset":'$unexpected || return 1
-    CEPH_ARGS='' ./ceph --admin-daemon $dir/a/ceph-mon.a.asok log flush || return 1
-    grep "osd_pool_default_crush_rule is deprecated " $dir/a/log || return 1
+    ./ceph osd crush rule dump replicated_ruleset | grep '"ruleset": '$expected || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    grep "osd_pool_default_crush_rule is deprecated " $dir/mon.a.log || return 1
 }
 
 # Before http://tracker.ceph.com/issues/8307 the invalid profile was created
 function TEST_erasure_invalid_profile() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a || return 1
     local poolname=pool_erasure
     local notaprofile=not-a-valid-erasure-code-profile
     ! ./ceph osd pool create $poolname 12 12 erasure $notaprofile || return 1
@@ -86,7 +84,7 @@ function TEST_erasure_invalid_profile() {
 
 function TEST_erasure_crush_rule() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a || return 1
     # 
     # choose the crush ruleset used with an erasure coded pool
     #
@@ -116,7 +114,7 @@ function TEST_erasure_crush_rule() {
 
 function TEST_erasure_code_profile_default() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a || return 1
     ./ceph osd erasure-code-profile rm default || return 1
     ! ./ceph osd erasure-code-profile ls | grep default || return 1
     ./ceph osd pool create $poolname 12 12 erasure default
@@ -145,9 +143,9 @@ function TEST_erasure_crush_stripe_width_padded() {
     expected_chunk_size=2048
     actual_stripe_width=$(($expected_chunk_size * $k))
     desired_stripe_width=$(($actual_stripe_width - 1))
-    run_mon $dir a --public-addr $CEPH_MON \
+    run_mon $dir a \
         --osd_pool_erasure_code_stripe_width $desired_stripe_width \
-        --osd_pool_default_erasure_code_profile "$profile"
+        --osd_pool_default_erasure_code_profile "$profile" || return 1
     ./ceph osd pool create pool_erasure 12 12 erasure
     ./ceph osd dump | tee $dir/osd.json
     grep "stripe_width $actual_stripe_width" $dir/osd.json > /dev/null || return 1
@@ -155,7 +153,7 @@ function TEST_erasure_crush_stripe_width_padded() {
 
 function TEST_erasure_code_pool() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a || return 1
     ./ceph --format json osd dump > $dir/osd.json
     local expected='"erasure_code_profile":"default"'
     ! grep "$expected" $dir/osd.json || return 1
@@ -171,7 +169,7 @@ function TEST_erasure_code_pool() {
 
 function TEST_replicated_pool_with_ruleset() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a
     local ruleset=ruleset0
     local root=host1
     ./ceph osd crush add-bucket $root host
@@ -189,9 +187,45 @@ function TEST_replicated_pool_with_ruleset() {
         grep "doesn't exist" || return 1
 }
 
+function TEST_replicated_pool_with_non_existent_default_ruleset_0() {
+    local dir=$1
+    run_mon $dir a || return 1
+    # change the default crush rule
+    ./ceph tell mon.a injectargs -- \
+        --osd_pool_default_crush_replicated_ruleset 66 || return 1
+    ./ceph osd pool create mypool 12 12 replicated 2>&1 | \
+        grep "No suitable CRUSH ruleset exists" || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    ! grep "osd_pool_default_crush_rule is deprecated " $dir/mon.a.log || return 1
+}
+
+function TEST_replicated_pool_with_non_existent_default_ruleset_1() {
+    local dir=$1
+    run_mon $dir a || return 1
+    # change the default crush rule using deprecated option
+    ./ceph tell mon.a injectargs -- \
+        --osd_pool_default_crush_rule 55 || return 1
+    ./ceph osd pool create mypool 12 12 replicated 2>&1 | \
+        grep "No suitable CRUSH ruleset exists" || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    grep "osd_pool_default_crush_rule is deprecated " $dir/mon.a.log || return 1
+}
+
+function TEST_replicated_pool_with_non_existent_default_ruleset_2() {
+    local dir=$1
+    run_mon $dir a || return 1
+    ./ceph tell mon.a injectargs -- \
+        --osd_pool_default_crush_rule 77 \
+        --osd_pool_default_crush_replicated_ruleset 33 || return 1
+    ./ceph osd pool create mypool 12 12 replicated 2>&1 | \
+        grep "No suitable CRUSH ruleset exists" || return 1
+    CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1
+    grep "osd_pool_default_crush_rule is deprecated " $dir/mon.a.log || return 1
+}
+
 function TEST_erasure_code_pool_lrc() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a || return 1
 
     ./ceph osd erasure-code-profile set LRCprofile \
              plugin=lrc \
@@ -210,12 +244,11 @@ function TEST_erasure_code_pool_lrc() {
 
 function TEST_replicated_pool() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a || return 1
     ./ceph osd pool create replicated 12 12 replicated replicated_ruleset 2>&1 | \
         grep "pool 'replicated' created" || return 1
     ./ceph osd pool create replicated 12 12 replicated replicated_ruleset 2>&1 | \
         grep 'already exists' || return 1
-    ! ./ceph osd pool create replicated0 12 12 replicated INVALIDRULESET
     # default is replicated
     ./ceph osd pool create replicated1 12 12 2>&1 | \
         grep "pool 'replicated1' created" || return 1
@@ -228,17 +261,17 @@ function TEST_replicated_pool() {
 
 function TEST_no_pool_delete() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
-    ./ceph osd pool create foo 1
-    ./ceph tell mon.a injectargs -- --no-mon-allow-pool-delete
-    expect_false ./ceph osd pool delete foo foo --yes-i-really-really-mean-it
-    ./ceph tell mon.a injectargs -- --mon-allow-pool-delete
-    ./ceph osd pool delete foo foo --yes-i-really-really-mean-it
+    run_mon $dir a || return 1
+    ./ceph osd pool create foo 1 || return 1
+    ./ceph tell mon.a injectargs -- --no-mon-allow-pool-delete || return 1
+    ! ./ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1
+    ./ceph tell mon.a injectargs -- --mon-allow-pool-delete || return 1
+    ./ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1
 }
 
 function TEST_utf8_cli() {
     local dir=$1
-    run_mon $dir a --public-addr $CEPH_MON
+    run_mon $dir a || return 1
     # Hopefully it's safe to include literal UTF-8 characters to test
     # the fix for http://tracker.ceph.com/issues/7387.  If it turns out
     # to not be OK (when is the default encoding *not* UTF-8?), maybe
@@ -252,7 +285,7 @@ function TEST_utf8_cli() {
     ./ceph osd pool delete 黄 黄 --yes-i-really-really-mean-it
 }
 
-main osd-pool-create
+main osd-pool-create "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/mon/osd-pool-create.sh"
diff --git a/src/test/mon/test_mon_workloadgen.cc b/src/test/mon/test_mon_workloadgen.cc
index cb3419c..e18906d 100644
--- a/src/test/mon/test_mon_workloadgen.cc
+++ b/src/test/mon/test_mon_workloadgen.cc
@@ -244,8 +244,7 @@ class ClientStub : public TestStub
       return err;
     }
 
-    messenger.reset(Messenger::create(cct, cct->_conf->ms_type, entity_name_t::CLIENT(-1),
-				      "stubclient", getpid()));
+    messenger.reset(Messenger::create_client_messenger(cct, "stubclient"));
     assert(messenger.get() != NULL);
 
     messenger->set_default_policy(
diff --git a/src/test/msgr/perf_msgr_client.cc b/src/test/msgr/perf_msgr_client.cc
new file mode 100644
index 0000000..17c4aca
--- /dev/null
+++ b/src/test/msgr/perf_msgr_client.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Haomai Wang
+ *
+ * Author: Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string>
+#include <unistd.h>
+#include <iostream>
+
+using namespace std;
+
+#include "include/atomic.h"
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/Cycles.h"
+#include "global/global_init.h"
+#include "msg/Messenger.h"
+#include "messages/MOSDOp.h"
+
+class MessengerClient {
+  class ClientThread;
+  class ClientDispatcher : public Dispatcher {
+    uint64_t think_time;
+    ClientThread *thread;
+
+   public:
+    ClientDispatcher(uint64_t delay, ClientThread *t): Dispatcher(g_ceph_context), think_time(delay), thread(t) {}
+    bool ms_can_fast_dispatch_any() const { return true; }
+    bool ms_can_fast_dispatch(Message *m) const {
+      switch (m->get_type()) {
+      case CEPH_MSG_OSD_OPREPLY:
+        return true;
+      default:
+        return false;
+      }
+    }
+
+    void ms_handle_fast_connect(Connection *con) {}
+    void ms_handle_fast_accept(Connection *con) {}
+    bool ms_dispatch(Message *m) { return true; }
+    void ms_fast_dispatch(Message *m);
+    bool ms_handle_reset(Connection *con) { return true; }
+    void ms_handle_remote_reset(Connection *con) {}
+    bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
+                              bufferlist& authorizer, bufferlist& authorizer_reply,
+                              bool& isvalid, CryptoKey& session_key) {
+      isvalid = true;
+      return true;
+    }
+  };
+
+  class ClientThread : public Thread {
+    Messenger *msgr;
+    int concurrent;
+    ConnectionRef conn;
+    atomic_t client_inc;
+    object_t oid;
+    object_locator_t oloc;
+    pg_t pgid;
+    int msg_len;
+    bufferlist data;
+    int ops;
+    ClientDispatcher dispatcher;
+
+   public:
+    Mutex lock;
+    Cond cond;
+    uint64_t inflight;
+
+    ClientThread(Messenger *m, int c, ConnectionRef con, int len, int ops, int think_time_us):
+        msgr(m), concurrent(c), conn(con), client_inc(0), oid("object-name"), oloc(1, 1), msg_len(len), ops(ops),
+        dispatcher(think_time_us, this), lock("MessengerBenchmark::ClientThread::lock") {
+      m->add_dispatcher_head(&dispatcher);
+      bufferptr ptr(msg_len);
+      memset(ptr.c_str(), 0, msg_len);
+      data.append(ptr);
+    }
+    void *entry() {
+      lock.Lock();
+      for (int i = 0; i < ops; ++i) {
+        if (inflight > uint64_t(concurrent)) {
+          cond.Wait(lock);
+        }
+        MOSDOp *m = new MOSDOp(client_inc.read(), 0, oid, oloc, pgid, 0, 0, 0);
+        m->write(0, msg_len, data);
+        inflight++;
+        conn->send_message(m);
+        //cerr << __func__ << " send m=" << m << std::endl;
+      }
+      lock.Unlock();
+      msgr->shutdown();
+      return 0;
+    }
+  };
+
+  string type;
+  string serveraddr;
+  int think_time_us;
+  vector<Messenger*> msgrs;
+  vector<ClientThread*> clients;
+
+ public:
+  MessengerClient(string t, string addr, int delay):
+      type(t), serveraddr(addr), think_time_us(delay) {
+  }
+  ~MessengerClient() {
+    for (uint64_t i = 0; i < clients.size(); ++i)
+      delete clients[i];
+    for (uint64_t i = 0; i < msgrs.size(); ++i) {
+      msgrs[i]->shutdown();
+      msgrs[i]->wait();
+    }
+  }
+  void ready(int c, int jobs, int ops, int msg_len) {
+    entity_addr_t addr;
+    addr.parse(serveraddr.c_str());
+    addr.set_nonce(0);
+    for (int i = 0; i < jobs; ++i) {
+      Messenger *msgr = Messenger::create(g_ceph_context, type, entity_name_t::CLIENT(0), "client", getpid()+i);
+      msgr->set_default_policy(Messenger::Policy::lossless_client(0, 0));
+      entity_inst_t inst(entity_name_t::OSD(0), addr);
+      ConnectionRef conn = msgr->get_connection(inst);
+      ClientThread *t = new ClientThread(msgr, c, conn, msg_len, ops, think_time_us);
+      msgrs.push_back(msgr);
+      clients.push_back(t);
+      msgr->start();
+    }
+    usleep(1000*1000);
+  }
+  void start() {
+    for (uint64_t i = 0; i < clients.size(); ++i)
+      clients[i]->create();
+    for (uint64_t i = 0; i < msgrs.size(); ++i)
+      msgrs[i]->wait();
+  }
+};
+
+void MessengerClient::ClientDispatcher::ms_fast_dispatch(Message *m) {
+  usleep(think_time);
+  m->put();
+  Mutex::Locker l(thread->lock);
+  thread->inflight--;
+  thread->cond.Signal();
+}
+
+
+void usage(const string &name) {
+  cerr << "Usage: " << name << " [server ip:port] [numjobs] [concurrency] [ios] [thinktime us] [msg length]" << std::endl;
+  cerr << "       [server ip:port]: connect to the ip:port pair" << std::endl;
+  cerr << "       [numjobs]: how much client threads spawned and do benchmark" << std::endl;
+  cerr << "       [concurrency]: the max inflight messages(like iodepth in fio)" << std::endl;
+  cerr << "       [ios]: how much messages sent for each client" << std::endl;
+  cerr << "       [thinktime]: sleep time when do fast dispatching(match client logic)" << std::endl;
+  cerr << "       [msg length]: message data bytes" << std::endl;
+}
+
+int main(int argc, char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  if (args.size() < 6) {
+    usage(argv[0]);
+    return 1;
+  }
+
+  int numjobs = atoi(args[1]);
+  int concurrent = atoi(args[2]);
+  int ios = atoi(args[3]);
+  int think_time = atoi(args[4]);
+  int len = atoi(args[5]);
+
+  cerr << " using ms-type " << g_ceph_context->_conf->ms_type << std::endl;
+  cerr << "       server ip:port " << args[0] << std::endl;
+  cerr << "       numjobs " << numjobs << std::endl;
+  cerr << "       concurrency " << concurrent << std::endl;
+  cerr << "       ios " << ios << std::endl;
+  cerr << "       thinktime(us) " << think_time << std::endl;
+  cerr << "       message data bytes " << len << std::endl;
+  MessengerClient client(g_ceph_context->_conf->ms_type, args[0], think_time);
+  client.ready(concurrent, numjobs, ios, len);
+  uint64_t start = Cycles::rdtsc();
+  client.start();
+  uint64_t stop = Cycles::rdtsc();
+  cerr << " Total op " << ios << " run time " << Cycles::to_microseconds(stop - start) << "us." << std::endl;
+
+  return 0;
+}
diff --git a/src/test/msgr/perf_msgr_server.cc b/src/test/msgr/perf_msgr_server.cc
new file mode 100644
index 0000000..b38a76a
--- /dev/null
+++ b/src/test/msgr/perf_msgr_server.cc
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Haomai Wang
+ *
+ * Author: Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string>
+#include <unistd.h>
+#include <iostream>
+
+using namespace std;
+
+#include "include/atomic.h"
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/Cycles.h"
+#include "global/global_init.h"
+#include "msg/Messenger.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+
+class ServerDispatcher : public Dispatcher {
+  uint64_t think_time;
+  ThreadPool op_tp;
+  class OpWQ : public ThreadPool::WorkQueue<Message> {
+    list<Message*> messages;
+
+   public:
+    OpWQ(time_t timeout, time_t suicide_timeout, ThreadPool *tp)
+      : ThreadPool::WorkQueue<Message>("ServerDispatcher::OpWQ", timeout, suicide_timeout, tp) {}
+
+    bool _enqueue(Message *m) {
+      messages.push_back(m);
+      return true;
+    }
+    void _dequeue(Message *m) {
+      assert(0);
+    }
+    bool _empty() {
+      return messages.empty();
+    }
+    Message *_dequeue() {
+      if (messages.empty())
+	return NULL;
+      Message *m = messages.front();
+      messages.pop_front();
+      return m;
+    }
+    void _process(Message *m, ThreadPool::TPHandle &handle) {
+      MOSDOp *osd_op = static_cast<MOSDOp*>(m);
+      MOSDOpReply *reply = new MOSDOpReply(osd_op, 0, 0, 0, false);
+      m->get_connection()->send_message(reply);
+      m->put();
+    }
+    void _process_finish(Message *m) { }
+    void _clear() {
+      assert(messages.empty());
+    }
+  } op_wq;
+
+ public:
+  ServerDispatcher(int threads, uint64_t delay): Dispatcher(g_ceph_context), think_time(delay),
+    op_tp(g_ceph_context, "ServerDispatcher::op_tp", threads, "serverdispatcher_op_threads"),
+    op_wq(30, 30, &op_tp) {
+    op_tp.start();
+  }
+  ~ServerDispatcher() {
+    op_tp.stop();
+  }
+  bool ms_can_fast_dispatch_any() const { return true; }
+  bool ms_can_fast_dispatch(Message *m) const {
+    switch (m->get_type()) {
+    case CEPH_MSG_OSD_OP:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  void ms_handle_fast_connect(Connection *con) {}
+  void ms_handle_fast_accept(Connection *con) {}
+  bool ms_dispatch(Message *m) { return true; }
+  bool ms_handle_reset(Connection *con) { return true; }
+  void ms_handle_remote_reset(Connection *con) {}
+  void ms_fast_dispatch(Message *m) {
+    usleep(think_time);
+    //cerr << __func__ << " reply message=" << m << std::endl;
+    op_wq.queue(m);
+  }
+  bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
+                            bufferlist& authorizer, bufferlist& authorizer_reply,
+                            bool& isvalid, CryptoKey& session_key) {
+    isvalid = true;
+    return true;
+  }
+};
+
+class MessengerServer {
+  Messenger *msgr;
+  string type;
+  string bindaddr;
+  ServerDispatcher dispatcher;
+
+ public:
+  MessengerServer(string t, string addr, int threads, int delay):
+      msgr(NULL), type(t), bindaddr(addr), dispatcher(threads, delay) {
+    msgr = Messenger::create(g_ceph_context, type, entity_name_t::OSD(0), "server", 0);
+    msgr->set_default_policy(Messenger::Policy::stateless_server(0, 0));
+  }
+  ~MessengerServer() {
+    msgr->shutdown();
+    msgr->wait();
+  }
+  void start() {
+    entity_addr_t addr;
+    addr.parse(bindaddr.c_str());
+    msgr->bind(addr);
+    msgr->add_dispatcher_head(&dispatcher);
+    msgr->start();
+    msgr->wait();
+  }
+};
+
+void usage(const string &name) {
+  cerr << "Usage: " << name << " [bind ip:port] [server worker threads] [thinktime us]" << std::endl;
+  cerr << "       [bind ip:port]: The ip:port pair to bind, client need to specify this pair to connect" << std::endl;
+  cerr << "       [server worker threads]: threads will process incoming messages and reply(matching pg threads)" << std::endl;
+  cerr << "       [thinktime]: sleep time when do dispatching(match fast dispatch logic in OSD.cc)" << std::endl;
+}
+
+int main(int argc, char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  if (args.size() < 3) {
+    usage(argv[0]);
+    return 1;
+  }
+
+  int worker_threads = atoi(args[1]);
+  int think_time = atoi(args[2]);
+  cerr << " This tool won't handle connection error alike things, " << std::endl;
+  cerr << "please ensure the proper network environment to test." << std::endl;
+  cerr << " Or ctrl+c when meeting error and restart tests" << std::endl;
+  cerr << " using ms-type " << g_ceph_context->_conf->ms_type << std::endl;
+  cerr << "       bind ip:port " << args[0] << std::endl;
+  cerr << "       worker threads " << worker_threads << std::endl;
+  cerr << "       thinktime(us) " << think_time << std::endl;
+
+  MessengerServer server(g_ceph_context->_conf->ms_type, args[0], worker_threads, think_time);
+  server.start();
+
+  return 0;
+}
diff --git a/src/test/msgr/test_async_driver.cc b/src/test/msgr/test_async_driver.cc
index acb1c0e..1d6d290 100644
--- a/src/test/msgr/test_async_driver.cc
+++ b/src/test/msgr/test_async_driver.cc
@@ -24,6 +24,7 @@
 #include <stdint.h>
 #include <arpa/inet.h>
 #include "include/Context.h"
+#include "include/atomic.h"
 #include "global/global_init.h"
 #include "common/ceph_argparse.h"
 #include "msg/async/Event.h"
@@ -138,26 +139,30 @@ TEST_P(EventDriverTest, PipeTest) {
 void* echoclient(void *arg)
 {
   intptr_t port = (intptr_t)arg;
-  int connect_sd = ::socket(AF_INET, SOCK_STREAM, 0);
   struct sockaddr_in sa;
+  memset(&sa, 0, sizeof(sa));
   sa.sin_family = AF_INET;
   sa.sin_port = htons(port);
   char addr[] = "127.0.0.1";
   int r = inet_aton(addr, &sa.sin_addr);
-  r = connect(connect_sd, (struct sockaddr*)&sa, sizeof(sa));
-  int t = 0;
 
-  do {
-    char c[] = "banner";
-    r = write(connect_sd, c, sizeof(c));
-    char d[100];
-    r = read(connect_sd, d, sizeof(d));
-    if (r == 0)
-      break;
-    if (t++ == 30)
-      break;
-  } while (1);
-  ::close(connect_sd);
+  int connect_sd = ::socket(AF_INET, SOCK_STREAM, 0);
+  if (connect_sd >= 0) {
+    r = connect(connect_sd, (struct sockaddr*)&sa, sizeof(sa));
+    int t = 0;
+  
+    do {
+      char c[] = "banner";
+      r = write(connect_sd, c, sizeof(c));
+      char d[100];
+      r = read(connect_sd, d, sizeof(d));
+      if (r == 0)
+        break;
+      if (t++ == 30)
+        break;
+    } while (1);
+    ::close(connect_sd);
+  }
   return 0;
 }
 
@@ -253,13 +258,70 @@ TEST(EventCenterTest, FileEventExpansion) {
   for (int i = 0; i < 300; i++) {
     int sd = ::socket(AF_INET, SOCK_STREAM, 0);
     center.create_file_event(sd, EVENT_READABLE, e);
-    sds.push_back(::socket(AF_INET, SOCK_STREAM, 0));
+    sds.push_back(sd);
   }
 
   for (vector<int>::iterator it = sds.begin(); it != sds.end(); ++it)
     center.delete_file_event(*it, EVENT_READABLE);
 }
 
+
+class Worker : public Thread {
+  CephContext *cct;
+  bool done;
+
+ public:
+  EventCenter center;
+  Worker(CephContext *c): cct(c), done(false), center(c) {
+    center.init(100);
+  }
+  void stop() {
+    done = true; 
+    center.wakeup();
+  }
+  void* entry() {
+    center.set_owner(pthread_self());
+    while (!done)
+      center.process_events(1000000);
+    return 0;
+  }
+};
+
+class CountEvent: public EventCallback {
+  atomic_t *count;
+  Mutex *lock;
+  Cond *cond;
+
+ public:
+  CountEvent(atomic_t *atomic, Mutex *l, Cond *c): count(atomic), lock(l), cond(c) {}
+  void do_request(int id) {
+    lock->Lock();
+    count->dec();
+    cond->Signal();
+    lock->Unlock();
+  }
+};
+
+TEST(EventCenterTest, DispatchTest) {
+  Worker worker1(g_ceph_context), worker2(g_ceph_context);
+  atomic_t count(0);
+  Mutex lock("DispatchTest::lock");
+  Cond cond;
+  worker1.create();
+  worker2.create();
+  for (int i = 0; i < 10000; ++i) {
+    count.inc();
+    worker1.center.dispatch_event_external(EventCallbackRef(new CountEvent(&count, &lock, &cond)));
+    count.inc();
+    worker2.center.dispatch_event_external(EventCallbackRef(new CountEvent(&count, &lock, &cond)));
+    Mutex::Locker l(lock);
+    while (count.read())
+      cond.Wait(lock);
+  }
+  worker1.stop();
+  worker2.stop();
+}
+
 INSTANTIATE_TEST_CASE_P(
   AsyncMessenger,
   EventDriverTest,
diff --git a/src/test/msgr/test_msgr.cc b/src/test/msgr/test_msgr.cc
index 7238b6a..eddef22 100644
--- a/src/test/msgr/test_msgr.cc
+++ b/src/test/msgr/test_msgr.cc
@@ -87,10 +87,11 @@ class FakeDispatcher : public Dispatcher {
   bool got_new;
   bool got_remote_reset;
   bool got_connect;
+  bool loopback;
 
   FakeDispatcher(bool s): Dispatcher(g_ceph_context), lock("FakeDispatcher::lock"),
                           is_server(s), got_new(false), got_remote_reset(false),
-                          got_connect(false) {}
+                          got_connect(false), loopback(false) {}
   bool ms_can_fast_dispatch_any() const { return true; }
   bool ms_can_fast_dispatch(Message *m) const {
     switch (m->get_type()) {
@@ -177,7 +178,12 @@ class FakeDispatcher : public Dispatcher {
     s->count++;
     cerr << __func__ << " conn: " << m->get_connection() << " session " << s << " count: " << s->count << std::endl;
     if (is_server) {
-      reply_message(m);
+      if (loopback)
+        assert(m->get_source().is_osd());
+      else
+        reply_message(m);
+    } else if (loopback) {
+      assert(m->get_source().is_client());
     }
     got_new = true;
     cond.Signal();
@@ -256,6 +262,7 @@ TEST_P(MessengerTest, SimpleTest) {
   ASSERT_FALSE(conn->is_connected());
 
   // 5. loopback connection
+  srv_dispatcher.loopback = true;
   conn = client_msgr->get_loopback_connection();
   {
     m = new MPing();
@@ -265,6 +272,7 @@ TEST_P(MessengerTest, SimpleTest) {
       cli_dispatcher.cond.Wait(cli_dispatcher.lock);
     cli_dispatcher.got_new = false;
   }
+  srv_dispatcher.loopback = false;
   ASSERT_TRUE(static_cast<Session*>(conn->get_priv())->get_count() == 1);
   client_msgr->shutdown();
   client_msgr->wait();
@@ -624,6 +632,71 @@ TEST_P(MessengerTest, AuthTest) {
   client_msgr->wait();
 }
 
+TEST_P(MessengerTest, MessageTest) {
+  FakeDispatcher cli_dispatcher(false), srv_dispatcher(true);
+  entity_addr_t bind_addr;
+  bind_addr.parse("127.0.0.1");
+  Messenger::Policy p = Messenger::Policy::stateful_server(0, 0);
+  server_msgr->set_policy(entity_name_t::TYPE_CLIENT, p);
+  p = Messenger::Policy::lossless_peer(0, 0);
+  client_msgr->set_policy(entity_name_t::TYPE_OSD, p);
+
+  server_msgr->bind(bind_addr);
+  server_msgr->add_dispatcher_head(&srv_dispatcher);
+  server_msgr->start();
+  client_msgr->add_dispatcher_head(&cli_dispatcher);
+  client_msgr->start();
+
+
+  // 1. A very large "front"(as well as "payload")
+  // Because a external message need to invade Messenger::decode_message,
+  // here we only use existing message class(MCommand)
+  ConnectionRef conn = client_msgr->get_connection(server_msgr->get_myinst());
+  {
+    uuid_d uuid;
+    uuid.generate_random();
+    vector<string> cmds;
+    string s("abcdefghijklmnopqrstuvwxyz");
+    for (int i = 0; i < 1024*30; i++)
+      cmds.push_back(s);
+    MCommand *m = new MCommand(uuid);
+    m->cmd = cmds;
+    conn->send_message(m);
+    utime_t t;
+    t += 1000*1000*500;
+    Mutex::Locker l(cli_dispatcher.lock);
+    while (!cli_dispatcher.got_new)
+      cli_dispatcher.cond.WaitInterval(g_ceph_context, cli_dispatcher.lock, t);
+    ASSERT_TRUE(cli_dispatcher.got_new);
+    cli_dispatcher.got_new = false;
+  }
+
+  // 2. A very large "data"
+  {
+    bufferlist bl;
+    string s("abcdefghijklmnopqrstuvwxyz");
+    for (int i = 0; i < 1024*30; i++)
+      bl.append(s);
+    MPing *m = new MPing();
+    m->set_data(bl);
+    conn->send_message(m);
+    utime_t t;
+    t += 1000*1000*500;
+    Mutex::Locker l(cli_dispatcher.lock);
+    while (!cli_dispatcher.got_new)
+      cli_dispatcher.cond.WaitInterval(g_ceph_context, cli_dispatcher.lock, t);
+    ASSERT_TRUE(cli_dispatcher.got_new);
+    cli_dispatcher.got_new = false;
+  }
+  server_msgr->shutdown();
+  client_msgr->shutdown();
+  server_msgr->wait();
+  client_msgr->wait();
+}
+
+
+class SyntheticWorkload;
+
 class SyntheticDispatcher : public Dispatcher {
  public:
   Mutex lock;
@@ -635,10 +708,11 @@ class SyntheticDispatcher : public Dispatcher {
   map<ConnectionRef, list<uint64_t> > conn_sent;
   map<uint64_t, bufferlist> sent;
   atomic_t index;
+  SyntheticWorkload *workload;
 
-  SyntheticDispatcher(bool s): Dispatcher(g_ceph_context), lock("SyntheticDispatcher::lock"),
-                          is_server(s), got_new(false), got_remote_reset(false),
-                          got_connect(false), index(0) {}
+  SyntheticDispatcher(bool s, SyntheticWorkload *wl):
+      Dispatcher(g_ceph_context), lock("SyntheticDispatcher::lock"), is_server(s), got_new(false),
+      got_remote_reset(false), got_connect(false), index(0), workload(wl) {}
   bool ms_can_fast_dispatch_any() const { return true; }
   bool ms_can_fast_dispatch(Message *m) const {
     switch (m->get_type()) {
@@ -660,28 +734,39 @@ class SyntheticDispatcher : public Dispatcher {
   bool ms_dispatch(Message *m) {
     assert(0);
   }
-  bool ms_handle_reset(Connection *con) {
-    return true;
-  }
+  bool ms_handle_reset(Connection *con);
   void ms_handle_remote_reset(Connection *con) {
     Mutex::Locker l(lock);
+    list<uint64_t> c = conn_sent[con];
+    for (list<uint64_t>::iterator it = c.begin();
+         it != c.end(); ++it)
+      sent.erase(*it);
+    conn_sent.erase(con);
     got_remote_reset = true;
   }
   void ms_fast_dispatch(Message *m) {
+    // MSG_COMMAND is used to disorganize regular message flow
+    if (m->get_type() == MSG_COMMAND) {
+      m->put();
+      return ;
+    }
+
     Mutex::Locker l(lock);
-    if (is_server) {
-      reply_message(m);
-    } else if (m->get_middle().length()) {
-      bufferlist middle = m->get_middle();
-      uint64_t i;
-      ASSERT_EQ(sizeof(uint64_t), middle.length());
-      memcpy(&i, middle.c_str(), middle.length());
-      if (sent.count(i)) {
-        ASSERT_EQ(conn_sent[m->get_connection()].front(), i);
-        ASSERT_TRUE(m->get_data().contents_equal(sent[i]));
-        conn_sent[m->get_connection()].pop_front();
-        sent.erase(i);
-      }
+    uint64_t i;
+    bool reply;
+    assert(m->get_middle().length());
+    bufferlist::iterator blp = m->get_middle().begin();
+    ::decode(i, blp);
+    ::decode(reply, blp);
+    if (reply) {
+      //cerr << __func__ << " reply=" << reply << " i=" << i << std::endl;
+      reply_message(m, i);
+    } else if (sent.count(i)) {
+      //cerr << __func__ << " reply=" << reply << " i=" << i << std::endl;
+      ASSERT_EQ(conn_sent[m->get_connection()].front(), i);
+      ASSERT_TRUE(m->get_data().contents_equal(sent[i]));
+      conn_sent[m->get_connection()].pop_front();
+      sent.erase(i);
     }
     got_new = true;
     cond.Signal();
@@ -695,13 +780,17 @@ class SyntheticDispatcher : public Dispatcher {
     return true;
   }
 
-  void reply_message(Message *m) {
+  void reply_message(Message *m, uint64_t i) {
+    bufferlist bl;
+    ::encode(i, bl);
+    ::encode(false, bl);
     MPing *rm = new MPing();
     if (m->get_data_len())
       rm->set_data(m->get_data());
     if (m->get_middle().length())
-      rm->set_middle(m->get_middle());
+      rm->set_middle(bl);
     m->get_connection()->send_message(rm);
+    //cerr << __func__ << " conn=" << m->get_connection() << " reply m=" << m << " i=" << i << std::endl;
   }
 
   void send_message_wrap(ConnectionRef con, Message *m) {
@@ -710,12 +799,14 @@ class SyntheticDispatcher : public Dispatcher {
       bufferlist bl;
       uint64_t i = index.read();
       index.inc();
-      bufferptr bp(sizeof(i));
-      memcpy(bp.c_str(), (char*)&i, sizeof(i));
-      bl.push_back(bp);
+      ::encode(i, bl);
+      ::encode(true, bl);
       m->set_middle(bl);
-      sent[i] = m->get_data();
-      conn_sent[con].push_back(i);
+      if (!con->get_messenger()->get_default_policy().lossy) {
+        sent[i] = m->get_data();
+        conn_sent[con].push_back(i);
+      }
+      //cerr << __func__ << " conn=" << con.get() << " send m=" << m << " i=" << i << std::endl;
     }
     ASSERT_EQ(con->send_message(m), 0);
   }
@@ -736,76 +827,13 @@ class SyntheticDispatcher : public Dispatcher {
 };
 
 
-TEST_P(MessengerTest, MessageTest) {
-  SyntheticDispatcher cli_dispatcher(false), srv_dispatcher(true);
-  entity_addr_t bind_addr;
-  bind_addr.parse("127.0.0.1");
-  Messenger::Policy p = Messenger::Policy::stateful_server(0, 0);
-  server_msgr->set_policy(entity_name_t::TYPE_CLIENT, p);
-  p = Messenger::Policy::lossless_peer(0, 0);
-  client_msgr->set_policy(entity_name_t::TYPE_OSD, p);
-
-  server_msgr->bind(bind_addr);
-  server_msgr->add_dispatcher_head(&srv_dispatcher);
-  server_msgr->start();
-  client_msgr->add_dispatcher_head(&cli_dispatcher);
-  client_msgr->start();
-
-
-  // 1. A very large "front"(as well as "payload")
-  // Because a external message need to invade Messenger::decode_message,
-  // here we only use existing message class(MCommand)
-  ConnectionRef conn = client_msgr->get_connection(server_msgr->get_myinst());
-  {
-    uuid_d uuid;
-    uuid.generate_random();
-    vector<string> cmds;
-    string s("abcdefghijklmnopqrstuvwxyz");
-    for (int i = 0; i < 1024*30; i++)
-      cmds.push_back(s);
-    MCommand *m = new MCommand(uuid);
-    m->cmd = cmds;
-    ASSERT_EQ(conn->send_message(m), 0);
-    utime_t t;
-    t += 1000*1000*500;
-    Mutex::Locker l(cli_dispatcher.lock);
-    while (!cli_dispatcher.got_new)
-      cli_dispatcher.cond.WaitInterval(g_ceph_context, cli_dispatcher.lock, t);
-    ASSERT_TRUE(cli_dispatcher.got_new);
-    cli_dispatcher.got_new = false;
-  }
-
-  // 2. A very large "data"
-  {
-    bufferlist bl;
-    string s("abcdefghijklmnopqrstuvwxyz");
-    for (int i = 0; i < 1024*30; i++)
-      bl.append(s);
-    MPing *m = new MPing();
-    m->set_data(bl);
-    cli_dispatcher.send_message_wrap(conn, m);
-    utime_t t;
-    t += 1000*1000*500;
-    Mutex::Locker l(cli_dispatcher.lock);
-    while (!cli_dispatcher.got_new)
-      cli_dispatcher.cond.WaitInterval(g_ceph_context, cli_dispatcher.lock, t);
-    ASSERT_TRUE(cli_dispatcher.got_new);
-    cli_dispatcher.got_new = false;
-  }
-  server_msgr->shutdown();
-  client_msgr->shutdown();
-  server_msgr->wait();
-  client_msgr->wait();
-}
-
-
 class SyntheticWorkload {
   Mutex lock;
   Cond cond;
   set<Messenger*> available_servers;
   set<Messenger*> available_clients;
-  map<pair<Messenger*, Messenger*>, ConnectionRef> available_connections;
-  SyntheticDispatcher cli_dispatcher, srv_dispatcher;
+  map<ConnectionRef, pair<Messenger*, Messenger*> > available_connections;
+  SyntheticDispatcher dispatcher;
   gen_type rng;
   vector<bufferlist> rand_data;
 
@@ -814,23 +842,23 @@ class SyntheticWorkload {
   static const unsigned max_connections = 128;
   static const unsigned max_message_len = 1024 * 1024 * 4;
 
-  SyntheticWorkload(int servers, int clients, string type, int random_num):
-      lock("SyntheticWorkload::lock"), cli_dispatcher(false), srv_dispatcher(true),
-      rng(time(NULL)) {
+  SyntheticWorkload(int servers, int clients, string type, int random_num,
+                    Messenger::Policy srv_policy, Messenger::Policy cli_policy):
+      lock("SyntheticWorkload::lock"), dispatcher(false, this), rng(time(NULL)) {
     Messenger *msgr;
     int base_port = 16800;
+    entity_addr_t bind_addr;
+    char addr[64];
     for (int i = 0; i < servers; ++i) {
-      entity_addr_t bind_addr;
-      char addr[64];
-      snprintf(addr, sizeof(addr), "127.0.0.1:%d", base_port+i);
       msgr = Messenger::create(g_ceph_context, type, entity_name_t::OSD(0),
                                "server", getpid()+i);
+      snprintf(addr, sizeof(addr), "127.0.0.1:%d", base_port+i);
       bind_addr.parse(addr);
       msgr->bind(bind_addr);
-      msgr->add_dispatcher_head(&srv_dispatcher);
+      msgr->add_dispatcher_head(&dispatcher);
 
       assert(msgr);
-      msgr->set_default_policy(Messenger::Policy::stateful_server(0, 0));
+      msgr->set_default_policy(srv_policy);
       available_servers.insert(msgr);
       msgr->start();
     }
@@ -838,9 +866,15 @@ class SyntheticWorkload {
     for (int i = 0; i < clients; ++i) {
       msgr = Messenger::create(g_ceph_context, type, entity_name_t::CLIENT(-1),
                                "client", getpid()+i+servers);
+      if (cli_policy.standby) {
+        snprintf(addr, sizeof(addr), "127.0.0.1:%d", base_port+i+servers);
+        bind_addr.parse(addr);
+        msgr->bind(bind_addr);
+      }
+      msgr->add_dispatcher_head(&dispatcher);
+
       assert(msgr);
-      msgr->set_default_policy(Messenger::Policy::lossless_client(0, 0));
-      msgr->add_dispatcher_head(&cli_dispatcher);
+      msgr->set_default_policy(cli_policy);
       available_clients.insert(msgr);
       msgr->start();
     }
@@ -861,17 +895,18 @@ class SyntheticWorkload {
     }
   }
 
-  ConnectionRef _get_random_connection(pair<Messenger*, Messenger*> *p) {
-    while (cli_dispatcher.get_pending() > max_in_flight)
+  ConnectionRef _get_random_connection() {
+    while (dispatcher.get_pending() > max_in_flight) {
+      lock.Unlock();
       usleep(500);
+      lock.Lock();
+    }
     assert(lock.is_locked());
     boost::uniform_int<> choose(0, available_connections.size() - 1);
     int index = choose(rng);
-    map<pair<Messenger*, Messenger*>, ConnectionRef>::iterator i = available_connections.begin();
+    map<ConnectionRef, pair<Messenger*, Messenger*> >::iterator i = available_connections.begin();
     for (; index > 0; --index, ++i) ;
-    if (p)
-      *p = i->first;
-    return i->second;
+    return i->first;
   }
 
   bool can_create_connection() {
@@ -899,44 +934,76 @@ class SyntheticWorkload {
       client = *i;
     }
 
-    if (!available_connections.count(make_pair(client, server))) {
-      ConnectionRef conn = client->get_connection(server->get_myinst());
-      available_connections[make_pair(client, server)] = conn;
+    pair<Messenger*, Messenger*> p;
+    {
+      boost::uniform_int<> choose(0, available_servers.size() - 1);
+      if (server->get_default_policy().server) {
+        p = make_pair(client, server);
+      } else {
+        ConnectionRef conn = client->get_connection(server->get_myinst());
+        if (available_connections.count(conn) || choose(rng) % 2)
+          p = make_pair(client, server);
+        else
+          p = make_pair(server, client);
+      }
     }
+    ConnectionRef conn = p.first->get_connection(p.second->get_myinst());
+    available_connections[conn] = p;
   }
 
   void send_message() {
-    Message *m = new MPing();
-    bufferlist bl;
-    boost::uniform_int<> u(0, rand_data.size()-1);
-    uint64_t index = u(rng);
-    bl = rand_data[index];
-    m->set_data(bl);
     Mutex::Locker l(lock);
-    ConnectionRef conn = _get_random_connection(NULL);
-    cli_dispatcher.send_message_wrap(conn, m);
+    ConnectionRef conn = _get_random_connection();
+    boost::uniform_int<> true_false(0, 99);
+    int val = true_false(rng);
+    if (val >= 95) {
+      uuid_d uuid;
+      uuid.generate_random();
+      MCommand *m = new MCommand(uuid);
+      vector<string> cmds;
+      cmds.push_back("command");
+      m->cmd = cmds;
+      m->set_priority(200);
+      conn->send_message(m);
+    } else {
+      Message *m = new MPing();
+      bufferlist bl;
+      boost::uniform_int<> u(0, rand_data.size()-1);
+      uint64_t index = u(rng);
+      bl = rand_data[index];
+      m->set_data(bl);
+      dispatcher.send_message_wrap(conn, m);
+    }
   }
 
   void drop_connection() {
-    pair<Messenger*, Messenger*> p;
     Mutex::Locker l(lock);
     if (available_connections.size() < 10)
       return;
-    ConnectionRef conn = _get_random_connection(&p);
-    cli_dispatcher.clear_pending(conn);
+    ConnectionRef conn = _get_random_connection();
+    dispatcher.clear_pending(conn);
     conn->mark_down();
-    ASSERT_EQ(available_connections.erase(p), 1U);
+    pair<Messenger*, Messenger*> &p = available_connections[conn];
+    // it's a lossless policy, so we need to mark down each side
+    if (!p.first->get_default_policy().server && !p.second->get_default_policy().server) {
+      ASSERT_EQ(conn->get_messenger(), p.first);
+      ConnectionRef peer = p.second->get_connection(p.first->get_myinst());
+      peer->mark_down();
+      dispatcher.clear_pending(peer);
+      available_connections.erase(peer);
+    }
+    ASSERT_EQ(available_connections.erase(conn), 1U);
   }
 
   void print_internal_state() {
     Mutex::Locker l(lock);
     cerr << "available_connections: " << available_connections.size()
-         << " inflight messages: " << cli_dispatcher.get_pending() << std::endl;
+         << " inflight messages: " << dispatcher.get_pending() << std::endl;
   }
 
   void wait_for_done() {
     uint64_t i = 0;
-    while (cli_dispatcher.get_pending()) {
+    while (dispatcher.get_pending()) {
       usleep(1000*100);
       if (i++ % 50 == 0)
         print_internal_state();
@@ -957,10 +1024,23 @@ class SyntheticWorkload {
     }
     available_clients.clear();
   }
+
+  void handle_reset(Connection *con) {
+    Mutex::Locker l(lock);
+    available_connections.erase(con);
+    dispatcher.clear_pending(con);
+  }
 };
 
+bool SyntheticDispatcher::ms_handle_reset(Connection *con) {
+  workload->handle_reset(con);
+  return true;
+}
+
 TEST_P(MessengerTest, SyntheticStressTest) {
-  SyntheticWorkload test_msg(32, 128, GetParam(), 100);
+  SyntheticWorkload test_msg(8, 32, GetParam(), 100,
+                             Messenger::Policy::stateful_server(0, 0),
+                             Messenger::Policy::lossless_client(0, 0));
   for (int i = 0; i < 100; ++i) {
     if (!(i % 10)) cerr << "seeding connection " << i << std::endl;
     test_msg.generate_connection();
@@ -986,11 +1066,75 @@ TEST_P(MessengerTest, SyntheticStressTest) {
   test_msg.wait_for_done();
 }
 
+TEST_P(MessengerTest, SyntheticStressTest1) {
+  SyntheticWorkload test_msg(16, 32, GetParam(), 100,
+                             Messenger::Policy::lossless_peer_reuse(0, 0),
+                             Messenger::Policy::lossless_peer_reuse(0, 0));
+  for (int i = 0; i < 10; ++i) {
+    if (!(i % 10)) cerr << "seeding connection " << i << std::endl;
+    test_msg.generate_connection();
+  }
+  gen_type rng(time(NULL));
+  for (int i = 0; i < 10000; ++i) {
+    if (!(i % 10)) {
+      cerr << "Op " << i << ": ";
+      test_msg.print_internal_state();
+    }
+    boost::uniform_int<> true_false(0, 99);
+    int val = true_false(rng);
+    if (val > 80) {
+      test_msg.generate_connection();
+    } else if (val > 60) {
+      test_msg.drop_connection();
+    } else if (val > 10) {
+      test_msg.send_message();
+    } else {
+      usleep(rand() % 1000 + 500);
+    }
+  }
+  test_msg.wait_for_done();
+}
+
 
 TEST_P(MessengerTest, SyntheticInjectTest) {
   g_ceph_context->_conf->set_val("ms_inject_socket_failures", "30");
   g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0.1");
-  SyntheticWorkload test_msg(4, 16, GetParam(), 100);
+  SyntheticWorkload test_msg(8, 32, GetParam(), 100,
+                             Messenger::Policy::stateful_server(0, 0),
+                             Messenger::Policy::lossless_client(0, 0));
+  for (int i = 0; i < 100; ++i) {
+    if (!(i % 10)) cerr << "seeding connection " << i << std::endl;
+    test_msg.generate_connection();
+  }
+  gen_type rng(time(NULL));
+  for (int i = 0; i < 1000; ++i) {
+    if (!(i % 10)) {
+      cerr << "Op " << i << ": ";
+      test_msg.print_internal_state();
+    }
+    boost::uniform_int<> true_false(0, 99);
+    int val = true_false(rng);
+    if (val > 90) {
+      test_msg.generate_connection();
+    } else if (val > 80) {
+      test_msg.drop_connection();
+    } else if (val > 10) {
+      test_msg.send_message();
+    } else {
+      usleep(rand() % 500 + 100);
+    }
+  }
+  test_msg.wait_for_done();
+  g_ceph_context->_conf->set_val("ms_inject_socket_failures", "0");
+  g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0");
+}
+
+TEST_P(MessengerTest, SyntheticInjectTest2) {
+  g_ceph_context->_conf->set_val("ms_inject_socket_failures", "30");
+  g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0.1");
+  SyntheticWorkload test_msg(8, 16, GetParam(), 100,
+                             Messenger::Policy::lossless_peer_reuse(0, 0),
+                             Messenger::Policy::lossless_peer_reuse(0, 0));
   for (int i = 0; i < 100; ++i) {
     if (!(i % 10)) cerr << "seeding connection " << i << std::endl;
     test_msg.generate_connection();
@@ -1018,6 +1162,73 @@ TEST_P(MessengerTest, SyntheticInjectTest) {
   g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0");
 }
 
+TEST_P(MessengerTest, SyntheticInjectTest3) {
+  g_ceph_context->_conf->set_val("ms_inject_socket_failures", "600");
+  g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0.1");
+  SyntheticWorkload test_msg(8, 16, GetParam(), 100,
+                             Messenger::Policy::stateless_server(0, 0),
+                             Messenger::Policy::lossy_client(0, 0));
+  for (int i = 0; i < 100; ++i) {
+    if (!(i % 10)) cerr << "seeding connection " << i << std::endl;
+    test_msg.generate_connection();
+  }
+  gen_type rng(time(NULL));
+  for (int i = 0; i < 1000; ++i) {
+    if (!(i % 10)) {
+      cerr << "Op " << i << ": ";
+      test_msg.print_internal_state();
+    }
+    boost::uniform_int<> true_false(0, 99);
+    int val = true_false(rng);
+    if (val > 90) {
+      test_msg.generate_connection();
+    } else if (val > 80) {
+      test_msg.drop_connection();
+    } else if (val > 10) {
+      test_msg.send_message();
+    } else {
+      usleep(rand() % 500 + 100);
+    }
+  }
+  test_msg.wait_for_done();
+  g_ceph_context->_conf->set_val("ms_inject_socket_failures", "0");
+  g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0");
+}
+
+
+TEST_P(MessengerTest, SyntheticInjectTest4) {
+  g_ceph_context->_conf->set_val("ms_inject_socket_failures", "30");
+  g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0.1");
+  SyntheticWorkload test_msg(16, 32, GetParam(), 100,
+                             Messenger::Policy::lossless_peer(0, 0),
+                             Messenger::Policy::lossless_peer(0, 0));
+  for (int i = 0; i < 100; ++i) {
+    if (!(i % 10)) cerr << "seeding connection " << i << std::endl;
+    test_msg.generate_connection();
+  }
+  gen_type rng(time(NULL));
+  for (int i = 0; i < 1000; ++i) {
+    if (!(i % 10)) {
+      cerr << "Op " << i << ": ";
+      test_msg.print_internal_state();
+    }
+    boost::uniform_int<> true_false(0, 99);
+    int val = true_false(rng);
+    if (val > 95) {
+      test_msg.generate_connection();
+    } else if (val > 80) {
+      // test_msg.drop_connection();
+    } else if (val > 10) {
+      test_msg.send_message();
+    } else {
+      usleep(rand() % 500 + 100);
+    }
+  }
+  test_msg.wait_for_done();
+  g_ceph_context->_conf->set_val("ms_inject_socket_failures", "0");
+  g_ceph_context->_conf->set_val("ms_inject_internal_delays", "0");
+}
+
 
 class MarkdownDispatcher : public Dispatcher {
   Mutex lock;
@@ -1168,6 +1379,7 @@ TEST(DummyTest, ValueParameterizedTestsAreNotSupportedOnThisPlatform) {}
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args);
 
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   g_ceph_context->_conf->set_val("auth_cluster_required", "none");
@@ -1175,6 +1387,7 @@ int main(int argc, char **argv) {
   g_ceph_context->_conf->set_val("auth_client_required", "none");
   g_ceph_context->_conf->set_val("enable_experimental_unrecoverable_data_corrupting_features", "ms-type-async");
   g_ceph_context->_conf->set_val("ms_die_on_bad_msg", "true");
+  g_ceph_context->_conf->set_val("ms_die_on_old_message", "true");
   g_ceph_context->_conf->set_val("ms_max_backoff", "1");
   common_init_finish(g_ceph_context);
 
diff --git a/src/test/objectstore/DeterministicOpSequence.cc b/src/test/objectstore/DeterministicOpSequence.cc
index d25b1f7..c26173f 100644
--- a/src/test/objectstore/DeterministicOpSequence.cc
+++ b/src/test/objectstore/DeterministicOpSequence.cc
@@ -39,7 +39,6 @@ DeterministicOpSequence::DeterministicOpSequence(ObjectStore *store,
     txn(0),
     m_osr("OSR")
 {
-  txn_coll = coll_t("meta");
   txn_object = hobject_t(sobject_t("txn", CEPH_NOSNAP));
 
   if (!status.empty())
@@ -133,8 +132,8 @@ void DeterministicOpSequence::note_txn(ObjectStore::Transaction *t)
 {
   bufferlist bl;
   ::encode(txn, bl);
-  t->truncate(txn_coll, txn_object, 0);
-  t->write(txn_coll, txn_object, 0, bl.length(), bl);
+  t->truncate(txn_coll, ghobject_t(txn_object), 0);
+  t->write(txn_coll, ghobject_t(txn_object), 0, bl.length(), bl);
   dout(10) << __func__ << " " << txn << dendl;
 }
 
@@ -434,9 +433,8 @@ bool DeterministicOpSequence::do_coll_create(rngen_t& gen)
     int pg_id = pg_range(gen);
     if (pg_created.count(pg_id) > 0)
       continue;
-    char buf[100];
-    snprintf(buf, 100, "%d.%x_head", pool_id, pg_id);
-    _do_coll_create(coll_t(buf), (uint32_t) pg_num, (uint64_t) num_objs);
+    _do_coll_create(coll_t(spg_t(pg_t(pg_id,pool_id),shard_id_t::NO_SHARD)),
+		    (uint32_t) pg_num, (uint64_t) num_objs);
     pg_created.insert(pg_id);
   }
   return true;
@@ -446,7 +444,7 @@ void DeterministicOpSequence::_do_coll_create(coll_t cid, uint32_t pg_num, uint6
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.create_collection(cid);
+  t.create_collection(cid, 32);
   bufferlist hint;
   ::encode(pg_num, hint);
   ::encode(num_objs, hint);
@@ -454,23 +452,23 @@ void DeterministicOpSequence::_do_coll_create(coll_t cid, uint32_t pg_num, uint6
   dout(0) << "Give collection: " << cid << " a hint, pg_num is: " << pg_num << ", num_objs is: "
     << num_objs << dendl;
 
-  m_store->apply_transaction(t);
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_touch(coll_t coll, hobject_t& obj)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.touch(coll, obj);
-  m_store->apply_transaction(t);
+  t.touch(coll, ghobject_t(obj));
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_remove(coll_t coll, hobject_t& obj)
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.remove(coll, obj);
-  m_store->apply_transaction(t);
+  t.remove(coll, ghobject_t(obj));
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_set_attrs(coll_t coll,
@@ -479,8 +477,8 @@ void DeterministicOpSequence::_do_set_attrs(coll_t coll,
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.omap_setkeys(coll, obj, attrs);
-  m_store->apply_transaction(t);
+  t.omap_setkeys(coll, ghobject_t(obj), attrs);
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_write(coll_t coll, hobject_t& obj,
@@ -488,8 +486,8 @@ void DeterministicOpSequence::_do_write(coll_t coll, hobject_t& obj,
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.write(coll, obj, off, len, data);
-  m_store->apply_transaction(t);
+  t.write(coll, ghobject_t(obj), off, len, data);
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_clone(coll_t coll, hobject_t& orig_obj,
@@ -497,8 +495,8 @@ void DeterministicOpSequence::_do_clone(coll_t coll, hobject_t& orig_obj,
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.clone(coll, orig_obj, new_obj);
-  m_store->apply_transaction(t);
+  t.clone(coll, ghobject_t(orig_obj), ghobject_t(new_obj));
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_clone_range(coll_t coll,
@@ -507,8 +505,9 @@ void DeterministicOpSequence::_do_clone_range(coll_t coll,
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.clone_range(coll, orig_obj, new_obj, srcoff, srclen, dstoff);
-  m_store->apply_transaction(t);
+  t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj),
+		srcoff, srclen, dstoff);
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_write_and_clone_range(coll_t coll,
@@ -521,9 +520,10 @@ void DeterministicOpSequence::_do_write_and_clone_range(coll_t coll,
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.write(coll, orig_obj, srcoff, bl.length(), bl);
-  t.clone_range(coll, orig_obj, new_obj, srcoff, srclen, dstoff);
-  m_store->apply_transaction(t);
+  t.write(coll, ghobject_t(orig_obj), srcoff, bl.length(), bl);
+  t.clone_range(coll, ghobject_t(orig_obj), ghobject_t(new_obj),
+		srcoff, srclen, dstoff);
+  m_store->apply_transaction(&m_osr, t);
 }
 
 void DeterministicOpSequence::_do_coll_move(coll_t orig_coll, coll_t new_coll,
@@ -531,8 +531,8 @@ void DeterministicOpSequence::_do_coll_move(coll_t orig_coll, coll_t new_coll,
 {
   ObjectStore::Transaction t;
   note_txn(&t);
-  t.remove(new_coll, obj);
-  t.collection_move(new_coll, orig_coll, obj);
-  m_store->apply_transaction(t);
+  t.remove(new_coll, ghobject_t(obj));
+  t.collection_move_rename(orig_coll, ghobject_t(obj), new_coll, ghobject_t(obj));
+  m_store->apply_transaction(&m_osr, t);
 }
 
diff --git a/src/test/objectstore/FileStoreDiff.cc b/src/test/objectstore/FileStoreDiff.cc
index ad4f62b..a49e4af 100644
--- a/src/test/objectstore/FileStoreDiff.cc
+++ b/src/test/objectstore/FileStoreDiff.cc
@@ -132,13 +132,15 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
 
   int err;
   std::vector<ghobject_t> b_objects, a_objects;
-  err = b_store->collection_list(coll, b_objects);
+  err = b_store->collection_list(coll, ghobject_t(), ghobject_t::get_max(),
+				 true, INT_MAX, &b_objects, NULL);
   if (err < 0) {
     dout(0) << "diff_objects list on verify coll " << coll.to_str()
 	    << " returns " << err << dendl;
     return true;
   }
-  err = a_store->collection_list(coll, a_objects);
+  err = a_store->collection_list(coll, ghobject_t(), ghobject_t::get_max(),
+				 true, INT_MAX, &a_objects, NULL);
   if (err < 0) {
     dout(0) << "diff_objects list on store coll " << coll.to_str()
               << " returns " << err << dendl;
@@ -251,34 +253,6 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
   return ret;
 }
 
-bool FileStoreDiff::diff_coll_attrs(FileStore *a_store, FileStore *b_store, coll_t coll)
-{
-  bool ret = false;
-
-  int err;
-  std::map<std::string, bufferptr> b_coll_attrs, a_coll_attrs;
-  err = b_store->collection_getattrs(coll, b_coll_attrs);
-  if (err < 0 && err != -EOPNOTSUPP) {
-    dout(0) << "diff_attrs getattrs on verify coll " << coll.to_str()
-        << "returns " << err << dendl;
-    ret = true;
-  }
-  err = a_store->collection_getattrs(coll, a_coll_attrs);
-  if (err < 0 && err != -EOPNOTSUPP) {
-    dout(0) << "diff_attrs getattrs on A coll " << coll.to_str()
-              << "returns " << err << dendl;
-    ret = true;
-  }
-
-  if (b_coll_attrs.size() != a_coll_attrs.size()) {
-    dout(0) << "diff_attrs size mismatch (A: " << a_coll_attrs.size()
-        << ", B: " << a_coll_attrs.size() << ")" << dendl;
-    ret = true;
-  }
-
-  return diff_attrs(b_coll_attrs, a_coll_attrs) || ret;
-}
-
 bool FileStoreDiff::diff()
 {
   bool ret = false;
@@ -303,9 +277,6 @@ bool FileStoreDiff::diff()
       }
     }
 
-    if (diff_coll_attrs(a_store, b_store, b_coll))
-      ret = true;
-
     if (diff_objects(a_store, b_store, b_coll))
       ret = true;
   }
diff --git a/src/test/objectstore/FileStoreDiff.h b/src/test/objectstore/FileStoreDiff.h
index cacd3ce..f7aedee 100644
--- a/src/test/objectstore/FileStoreDiff.h
+++ b/src/test/objectstore/FileStoreDiff.h
@@ -27,7 +27,6 @@ class FileStoreDiff {
   FileStore *a_store;
   FileStore *b_store;
 
-  bool diff_coll_attrs(FileStore *a_store, FileStore *b_store, coll_t coll);
   bool diff_objects(FileStore *a_store, FileStore *b_store, coll_t coll);
   bool diff_objects_stat(struct stat& a, struct stat& b);
   bool diff_attrs(std::map<std::string,bufferptr>& b,
diff --git a/src/test/objectstore/FileStoreTracker.cc b/src/test/objectstore/FileStoreTracker.cc
index afdc31b..3e4cf97 100644
--- a/src/test/objectstore/FileStoreTracker.cc
+++ b/src/test/objectstore/FileStoreTracker.cc
@@ -8,16 +8,16 @@
 
 class OnApplied : public Context {
   FileStoreTracker *tracker;
-  list<pair<pair<string, string>, uint64_t> > in_flight;
+  list<pair<pair<coll_t, string>, uint64_t> > in_flight;
   ObjectStore::Transaction *t;
 public:
   OnApplied(FileStoreTracker *tracker,
-	    list<pair<pair<string, string>, uint64_t> > in_flight,
+	    list<pair<pair<coll_t, string>, uint64_t> > in_flight,
 	    ObjectStore::Transaction *t)
     : tracker(tracker), in_flight(in_flight), t(t) {}
 
   void finish(int r) {
-    for (list<pair<pair<string, string>, uint64_t> >::iterator i =
+    for (list<pair<pair<coll_t, string>, uint64_t> >::iterator i =
 	   in_flight.begin();
 	 i != in_flight.end();
 	 ++i) {
@@ -29,14 +29,14 @@ public:
 
 class OnCommitted : public Context {
   FileStoreTracker *tracker;
-  list<pair<pair<string, string>, uint64_t> > in_flight;
+  list<pair<pair<coll_t, string>, uint64_t> > in_flight;
 public:
   OnCommitted(FileStoreTracker *tracker,
-	      list<pair<pair<string, string>, uint64_t> > in_flight)
+	      list<pair<pair<coll_t, string>, uint64_t> > in_flight)
     : tracker(tracker), in_flight(in_flight) {}
 
   void finish(int r) {
-    for (list<pair<pair<string, string>, uint64_t> >::iterator i =
+    for (list<pair<pair<coll_t, string>, uint64_t> >::iterator i =
 	   in_flight.begin();
 	 i != in_flight.end();
 	 ++i) {
@@ -67,7 +67,7 @@ int FileStoreTracker::init()
 
 void FileStoreTracker::submit_transaction(Transaction &t)
 {
-  list<pair<pair<string, string>, uint64_t> > in_flight;
+  list<pair<pair<coll_t, string>, uint64_t> > in_flight;
   OutTransaction out;
   out.t = new ObjectStore::Transaction;
   out.in_flight = &in_flight;
@@ -82,7 +82,7 @@ void FileStoreTracker::submit_transaction(Transaction &t)
     new OnCommitted(this, in_flight));
 }
 
-void FileStoreTracker::write(const pair<string, string> &obj,
+void FileStoreTracker::write(const pair<coll_t, string> &obj,
 			     OutTransaction *out)
 {
   Mutex::Locker l(lock);
@@ -104,14 +104,14 @@ void FileStoreTracker::write(const pair<string, string> &obj,
     to_write.append(*iter);
   }
   out->t->write(coll_t(obj.first),
-		hobject_t(sobject_t(obj.second, CEPH_NOSNAP)),
+		ghobject_t(hobject_t(sobject_t(obj.second, CEPH_NOSNAP))),
 		offset,
 		len,
 		to_write);
   out->in_flight->push_back(make_pair(obj, set_content(obj, contents)));
 }
 
-void FileStoreTracker::remove(const pair<string, string> &obj,
+void FileStoreTracker::remove(const pair<coll_t, string> &obj,
 			      OutTransaction *out)
 {
   std::cerr << "Deleting " << obj << std::endl;
@@ -120,13 +120,13 @@ void FileStoreTracker::remove(const pair<string, string> &obj,
   if (!old_contents.exists())
     return;
   out->t->remove(coll_t(obj.first),
-		 hobject_t(sobject_t(obj.second, CEPH_NOSNAP)));
+		 ghobject_t(hobject_t(sobject_t(obj.second, CEPH_NOSNAP))));
   ObjectContents contents;
   out->in_flight->push_back(make_pair(obj, set_content(obj, contents)));
 }
 
-void FileStoreTracker::clone_range(const pair<string, string> &from,
-				   const pair<string, string> &to,
+void FileStoreTracker::clone_range(const pair<coll_t, string> &from,
+				   const pair<coll_t, string> &to,
 				   OutTransaction *out) {
   Mutex::Locker l(lock);
   std::cerr << "CloningRange " << from << " to " << to << std::endl;
@@ -148,16 +148,16 @@ void FileStoreTracker::clone_range(const pair<string, string> &from,
   interval_to_clone.insert(offset, len);
   to_contents.clone_range(from_contents, interval_to_clone);
   out->t->clone_range(coll_t(from.first),
-		      hobject_t(sobject_t(from.second, CEPH_NOSNAP)),
-		      hobject_t(sobject_t(to.second, CEPH_NOSNAP)),
+		      ghobject_t(hobject_t(sobject_t(from.second, CEPH_NOSNAP))),
+		      ghobject_t(hobject_t(sobject_t(to.second, CEPH_NOSNAP))),
 		      offset,
 		      len,
 		      offset);
   out->in_flight->push_back(make_pair(to, set_content(to, to_contents)));
 }
 
-void FileStoreTracker::clone(const pair<string, string> &from,
-			     const pair<string, string> &to,
+void FileStoreTracker::clone(const pair<coll_t, string> &from,
+			     const pair<coll_t, string> &to,
 			     OutTransaction *out) {
   Mutex::Locker l(lock);
   std::cerr << "Cloning " << from << " to " << to << std::endl;
@@ -173,24 +173,24 @@ void FileStoreTracker::clone(const pair<string, string> &from,
 
   if (to_contents.exists())
     out->t->remove(coll_t(to.first),
-		   hobject_t(sobject_t(to.second, CEPH_NOSNAP)));
+		   ghobject_t(hobject_t(sobject_t(to.second, CEPH_NOSNAP))));
   out->t->clone(coll_t(from.first),
-		hobject_t(sobject_t(from.second, CEPH_NOSNAP)),
-		hobject_t(sobject_t(to.second, CEPH_NOSNAP)));
+		ghobject_t(hobject_t(sobject_t(from.second, CEPH_NOSNAP))),
+		ghobject_t(hobject_t(sobject_t(to.second, CEPH_NOSNAP))));
   out->in_flight->push_back(make_pair(to, set_content(to, from_contents)));
 }
 
 
-string obj_to_prefix(const pair<string, string> &obj) {
+string obj_to_prefix(const pair<coll_t, string> &obj) {
   string sep;
   sep.push_back('^');
-  return obj.first + sep + obj.second + "_CONTENTS_";
+  return obj.first.to_str() + sep + obj.second + "_CONTENTS_";
 }
 
-string obj_to_meta_prefix(const pair<string, string> &obj) {
+string obj_to_meta_prefix(const pair<coll_t, string> &obj) {
   string sep;
   sep.push_back('^');
-  return obj.first + sep + obj.second;
+  return obj.first.to_str() + sep + obj.second;
 }
 
 string seq_to_key(uint64_t seq) {
@@ -232,7 +232,7 @@ void decode(ObjStatus &obj, bufferlist::iterator &bl) {
 }
 
 
-ObjStatus get_obj_status(const pair<string, string> &obj,
+ObjStatus get_obj_status(const pair<coll_t, string> &obj,
 			 KeyValueDB *db)
 {
   set<string> to_get;
@@ -247,7 +247,7 @@ ObjStatus get_obj_status(const pair<string, string> &obj,
   return retval;
 }
 
-void set_obj_status(const pair<string, string> &obj,
+void set_obj_status(const pair<coll_t, string> &obj,
 		    const ObjStatus &status,
 		    KeyValueDB::Transaction t)
 {
@@ -256,7 +256,7 @@ void set_obj_status(const pair<string, string> &obj,
   t->set(obj_to_meta_prefix(obj), to_set);
 }
 
-void _clean_forward(const pair<string, string> &obj,
+void _clean_forward(const pair<coll_t, string> &obj,
 		    uint64_t last_valid,
 		    KeyValueDB *db)
 {
@@ -272,7 +272,7 @@ void _clean_forward(const pair<string, string> &obj,
 }
 
 
-void FileStoreTracker::verify(const string &coll, const string &obj,
+void FileStoreTracker::verify(const coll_t &coll, const string &obj,
 			      bool on_start) {
   Mutex::Locker l(lock);
   std::cerr << "Verifying " << make_pair(coll, obj) << std::endl;
@@ -281,7 +281,7 @@ void FileStoreTracker::verify(const string &coll, const string &obj,
   std::cerr << "valid_reads is " << valid_reads << std::endl;
   bufferlist contents;
   int r = store->read(coll_t(coll),
-		      hobject_t(sobject_t(obj, CEPH_NOSNAP)),
+		      ghobject_t(hobject_t(sobject_t(obj, CEPH_NOSNAP))),
 		      0,
 		      2*SIZE,
 		      contents);
@@ -333,7 +333,7 @@ void FileStoreTracker::verify(const string &coll, const string &obj,
 }
 
 ObjectContents FileStoreTracker::get_current_content(
-  const pair<string, string> &obj)
+  const pair<coll_t, string> &obj)
 {
   KeyValueDB::Iterator iter = db->get_iterator(
     obj_to_prefix(obj));
@@ -351,7 +351,7 @@ ObjectContents FileStoreTracker::get_current_content(
 }
 
 ObjectContents FileStoreTracker::get_content(
-  const pair<string, string> &obj, uint64_t version)
+  const pair<coll_t, string> &obj, uint64_t version)
 {
   set<string> to_get;
   map<string, bufferlist> got;
@@ -368,7 +368,7 @@ ObjectContents FileStoreTracker::get_content(
 }
 
 pair<uint64_t, uint64_t> FileStoreTracker::get_valid_reads(
-  const pair<string, string> &obj)
+  const pair<coll_t, string> &obj)
 {
   pair<uint64_t, uint64_t> bounds = make_pair(0,1);
   KeyValueDB::Iterator iter = db->get_iterator(
@@ -387,7 +387,7 @@ pair<uint64_t, uint64_t> FileStoreTracker::get_valid_reads(
   return bounds;
 }
 
-void clear_obsolete(const pair<string, string> &obj,
+void clear_obsolete(const pair<coll_t, string> &obj,
 		    const ObjStatus &status,
 		    KeyValueDB *db,
 		    KeyValueDB::Transaction t)
@@ -401,7 +401,7 @@ void clear_obsolete(const pair<string, string> &obj,
   t->rmkeys(obj_to_prefix(obj), to_remove);
 }
 
-void FileStoreTracker::committed(const pair<string, string> &obj,
+void FileStoreTracker::committed(const pair<coll_t, string> &obj,
 				 uint64_t seq) {
   Mutex::Locker l(lock);
   ObjStatus status = get_obj_status(obj, db);
@@ -413,7 +413,7 @@ void FileStoreTracker::committed(const pair<string, string> &obj,
   db->submit_transaction(t);
 }
 
-void FileStoreTracker::applied(const pair<string, string> &obj,
+void FileStoreTracker::applied(const pair<coll_t, string> &obj,
 			       uint64_t seq) {
   Mutex::Locker l(lock);
   std::cerr << "Applied " << obj << " version " << seq << std::endl;
@@ -427,7 +427,7 @@ void FileStoreTracker::applied(const pair<string, string> &obj,
 }
 
 
-uint64_t FileStoreTracker::set_content(const pair<string, string> &obj,
+uint64_t FileStoreTracker::set_content(const pair<coll_t, string> &obj,
 				       ObjectContents &content) {
   KeyValueDB::Transaction t = db->get_transaction();
   KeyValueDB::Iterator iter = db->get_iterator(
diff --git a/src/test/objectstore/FileStoreTracker.h b/src/test/objectstore/FileStoreTracker.h
index d70e54a..11033a6 100644
--- a/src/test/objectstore/FileStoreTracker.h
+++ b/src/test/objectstore/FileStoreTracker.h
@@ -18,7 +18,7 @@ class FileStoreTracker {
   uint64_t restart_seq;
 
   struct OutTransaction {
-    list<pair<pair<string, string>, uint64_t> > *in_flight;
+    list<pair<pair<coll_t, string>, uint64_t> > *in_flight;
     ObjectStore::Transaction *t;
   };
 public:
@@ -36,9 +36,9 @@ public:
     list<Op*> ops;
     class Write : public Op {
     public:
-      string coll;
+      coll_t coll;
       string oid;
-      Write(const string &coll,
+      Write(const coll_t &coll,
 	    const string &oid)
 	: coll(coll), oid(oid) {}
       void operator()(FileStoreTracker *harness,
@@ -48,10 +48,10 @@ public:
     };
     class CloneRange : public Op {
     public:
-      string coll;
+      coll_t coll;
       string from;
       string to;
-      CloneRange(const string &coll,
+      CloneRange(const coll_t &coll,
 		 const string &from,
 		 const string &to)
 	: coll(coll), from(from), to(to) {}
@@ -63,10 +63,10 @@ public:
     };
     class Clone : public Op {
     public:
-      string coll;
+      coll_t coll;
       string from;
       string to;
-      Clone(const string &coll,
+      Clone(const coll_t &coll,
 		 const string &from,
 		 const string &to)
 	: coll(coll), from(from), to(to) {}
@@ -78,9 +78,9 @@ public:
     };
     class Remove: public Op {
     public:
-      string coll;
+      coll_t coll;
       string obj;
-      Remove(const string &coll,
+      Remove(const coll_t &coll,
 	     const string &obj)
 	: coll(coll), obj(obj) {}
       void operator()(FileStoreTracker *harness,
@@ -90,18 +90,18 @@ public:
       }
     };
   public:
-    void write(const string &coll, const string &oid) {
+    void write(const coll_t &coll, const string &oid) {
       ops.push_back(new Write(coll, oid));
     }
-    void clone_range(const string &coll, const string &from,
+    void clone_range(const coll_t &coll, const string &from,
 		     const string &to) {
       ops.push_back(new CloneRange(coll, from, to));
     }
-    void clone(const string &coll, const string &from,
+    void clone(const coll_t &coll, const string &from,
 	       const string &to) {
       ops.push_back(new Clone(coll, from, to));
     }
-    void remove(const string &coll, const string &oid) {
+    void remove(const coll_t &coll, const string &oid) {
       ops.push_back(new Remove(coll, oid));
     }
     friend class FileStoreTracker;
@@ -109,27 +109,27 @@ public:
 
   int init();
   void submit_transaction(Transaction &t);
-  void verify(const string &coll,
+  void verify(const coll_t &coll,
 	      const string &from,
 	      bool on_start = false);
 
 private:
-  ObjectContents get_current_content(const pair<string, string> &obj);
-  pair<uint64_t, uint64_t> get_valid_reads(const pair<string, string> &obj);
-  ObjectContents get_content(const pair<string, string> &obj, uint64_t version);
+  ObjectContents get_current_content(const pair<coll_t, string> &obj);
+  pair<uint64_t, uint64_t> get_valid_reads(const pair<coll_t, string> &obj);
+  ObjectContents get_content(const pair<coll_t, string> &obj, uint64_t version);
 
-  void committed(const pair<string, string> &obj, uint64_t seq);
-  void applied(const pair<string, string> &obj, uint64_t seq);
-  uint64_t set_content(const pair<string, string> &obj, ObjectContents &content);
+  void committed(const pair<coll_t, string> &obj, uint64_t seq);
+  void applied(const pair<coll_t, string> &obj, uint64_t seq);
+  uint64_t set_content(const pair<coll_t, string> &obj, ObjectContents &content);
 
   // ObjectContents Operations
-  void write(const pair<string, string> &obj, OutTransaction *out);
-  void remove(const pair<string, string> &obj, OutTransaction *out);
-  void clone_range(const pair<string, string> &from,
-		   const pair<string, string> &to,
+  void write(const pair<coll_t, string> &obj, OutTransaction *out);
+  void remove(const pair<coll_t, string> &obj, OutTransaction *out);
+  void clone_range(const pair<coll_t, string> &from,
+		   const pair<coll_t, string> &to,
 		   OutTransaction *out);
-  void clone(const pair<string, string> &from,
-	     const pair<string, string> &to,
+  void clone(const pair<coll_t, string> &from,
+	     const pair<coll_t, string> &to,
 	     OutTransaction *out);
   friend class OnApplied;
   friend class OnCommitted;
diff --git a/src/test/objectstore/ObjectStoreTransactionBenchmark.cc b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
index a82efb7..5b624da 100644
--- a/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
+++ b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
@@ -192,9 +192,10 @@ class PerfCase {
   }
 
   uint64_t rados_write_4k(int times) {
-    uint64_t start_time = 0, ticks = 0;
+    uint64_t ticks = 0;
     uint64_t len = Kib *4;
     for (int i = 0; i < times; i++) {
+      uint64_t start_time = 0;
       {
         Transaction t;
         ghobject_t oid = create_object();
@@ -232,8 +233,8 @@ const string PerfCase::info_info_attr("11.40_info");
 const string PerfCase::attr("_");
 const string PerfCase::snapset_attr("snapset");
 const string PerfCase::pglog_attr("pglog_attr");
-const coll_t PerfCase::meta_cid("meta");
-const coll_t PerfCase::cid("meta");
+const coll_t PerfCase::meta_cid;
+const coll_t PerfCase::cid;
 const ghobject_t PerfCase::pglog_oid(hobject_t(sobject_t(object_t("cid_pglog"), 0)));
 const ghobject_t PerfCase::info_oid(hobject_t(sobject_t(object_t("infos"), 0)));
 Transaction::Tick Transaction::write_ticks, Transaction::setattr_ticks, Transaction::omap_setkeys_ticks, Transaction::omap_rmkeys_ticks;
diff --git a/src/test/objectstore/TestObjectStoreState.cc b/src/test/objectstore/TestObjectStoreState.cc
index e21e8d9..e4252ce 100644
--- a/src/test/objectstore/TestObjectStoreState.cc
+++ b/src/test/objectstore/TestObjectStoreState.cc
@@ -29,19 +29,16 @@
 #undef dout_prefix
 #define dout_prefix *_dout << "ceph_test_objectstore_state "
 
-const coll_t TestObjectStoreState::META_COLL("meta");
-const coll_t TestObjectStoreState::TEMP_COLL("temp");
-
 void TestObjectStoreState::init(int colls, int objs)
 {
   dout(5) << "init " << colls << " colls " << objs << " objs" << dendl;
 
+  ObjectStore::Sequencer osr(__func__);
   ObjectStore::Transaction *t;
   t = new ObjectStore::Transaction;
 
-  t->create_collection(META_COLL);
-  t->create_collection(TEMP_COLL);
-  m_store->apply_transaction(*t);
+  t->create_collection(coll_t::meta(), 0);
+  m_store->apply_transaction(&osr, *t);
 
   wait_for_ready();
 
@@ -50,10 +47,10 @@ void TestObjectStoreState::init(int colls, int objs)
     int coll_id = i;
     coll_entry_t *entry = coll_create(coll_id);
     dout(5) << "init create collection " << entry->m_coll.to_str()
-        << " meta " << entry->m_meta_obj.oid.name << dendl;
+        << " meta " << entry->m_meta_obj << dendl;
 
     t = new ObjectStore::Transaction;
-    t->create_collection(entry->m_coll);
+    t->create_collection(entry->m_coll, 32);
     bufferlist hint;
     uint32_t pg_num = colls;
     uint64_t num_objs = uint64_t(objs / colls);
@@ -61,11 +58,11 @@ void TestObjectStoreState::init(int colls, int objs)
     ::encode(num_objs, hint);
     t->collection_hint(entry->m_coll, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
     dout(5) << "give collection hint, number of objects per collection: " << num_objs << dendl;
-    t->touch(META_COLL, entry->m_meta_obj);
+    t->touch(coll_t::meta(), entry->m_meta_obj);
 
     for (int i = 0; i < objs; i++) {
       hobject_t *obj = entry->touch_obj(i + baseid);
-      t->touch(entry->m_coll, *obj);
+      t->touch(entry->m_coll, ghobject_t(*obj));
       ceph_assert(i + baseid == m_num_objects);
       m_num_objects++;
     }
diff --git a/src/test/objectstore/TestObjectStoreState.h b/src/test/objectstore/TestObjectStoreState.h
index a3338cb..bd13e15 100644
--- a/src/test/objectstore/TestObjectStoreState.h
+++ b/src/test/objectstore/TestObjectStoreState.h
@@ -28,7 +28,7 @@ public:
     int m_id;
     spg_t m_pgid;
     coll_t m_coll;
-    hobject_t m_meta_obj;
+    ghobject_t m_meta_obj;
     ObjectStore::Sequencer m_osr;
     map<int, hobject_t*> m_objects;
     int m_next_object_id;
@@ -37,7 +37,7 @@ public:
       : m_id(i),
 	m_pgid(pg_t(i, 1), shard_id_t::NO_SHARD),
 	m_coll(m_pgid),
-      m_meta_obj(sobject_t(object_t(meta_obj_buf), CEPH_NOSNAP)),
+	m_meta_obj(hobject_t(sobject_t(object_t(meta_obj_buf), CEPH_NOSNAP))),
       m_osr(coll_buf), m_next_object_id(0) {
     }
     ~coll_entry_t();
@@ -56,10 +56,6 @@ public:
     hobject_t *get_obj_at(int pos, bool remove, int *key = NULL);
   };
 
-  /* kept in upper case for consistency with coll_t's */
-  static const coll_t META_COLL;
-  static const coll_t TEMP_COLL;
-
  protected:
   boost::shared_ptr<ObjectStore> m_store;
   map<int, coll_entry_t*> m_collections;
diff --git a/src/test/objectstore/TestRocksdbOptionParse.cc b/src/test/objectstore/TestRocksdbOptionParse.cc
new file mode 100644
index 0000000..cdbbfa9
--- /dev/null
+++ b/src/test/objectstore/TestRocksdbOptionParse.cc
@@ -0,0 +1,85 @@
+#include <gtest/gtest.h>
+#include "include/Context.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/thread_status.h"
+#include "os/RocksDBStore.h"
+#include <iostream>
+using namespace std;
+
+const string dir("store_test_temp_dir");
+
+TEST(RocksDBOption, simple) {
+  rocksdb::Options options;
+  rocksdb::Status status;
+  RocksDBStore *db = new RocksDBStore(g_ceph_context, dir);
+  string options_string = ""
+			  "write_buffer_size=536870912;"
+			  "create_if_missing=true;"
+			  "max_write_buffer_number=4;"
+			  "max_background_compactions=4;"
+			  "stats_dump_period_sec = 5;"
+			  "min_write_buffer_number_to_merge = 2;"
+			  "level0_file_num_compaction_trigger = 4;"
+			  "max_bytes_for_level_base = 104857600;"
+			  "target_file_size_base = 10485760;"
+			  "num_levels = 3;"
+			  "compression = kNoCompression;"
+			  "disable_data_sync = false;";
+  int r = db->ParseOptionsFromString(options_string, options);
+  ASSERT_EQ(0, r);
+  ASSERT_EQ(536870912, options.write_buffer_size);
+  ASSERT_EQ(4, options.max_write_buffer_number);
+  ASSERT_EQ(4, options.max_background_compactions);
+  ASSERT_EQ(5, options.stats_dump_period_sec);
+  ASSERT_EQ(2, options.min_write_buffer_number_to_merge);
+  ASSERT_EQ(4, options.level0_file_num_compaction_trigger);
+  ASSERT_EQ(104857600, options.max_bytes_for_level_base);
+  ASSERT_EQ(10485760, options.target_file_size_base);
+  ASSERT_EQ(3, options.num_levels);
+  ASSERT_FALSE(options.disableDataSync);
+ // ASSERT_EQ("none", options.compression);
+}
+TEST(RocksDBOption, interpret) {
+  rocksdb::Options options;
+  rocksdb::Status status;
+  RocksDBStore *db = new RocksDBStore(g_ceph_context, dir);
+  string options_string = "compact_on_mount = true; compaction_threads=10;flusher_threads=5;";
+  
+  int r = db->ParseOptionsFromString(options_string, options);
+  ASSERT_EQ(0, r);
+  ASSERT_TRUE(db->compact_on_mount);
+  //check thread pool setting
+  options.env->SleepForMicroseconds(100000);
+  std::vector<rocksdb::ThreadStatus> thread_list;
+  status = options.env->GetThreadList(&thread_list);
+  ASSERT_TRUE(status.ok());
+
+  int num_high_pri_threads = 0;
+  int num_low_pri_threads = 0;
+  for (vector<rocksdb::ThreadStatus>::iterator it = thread_list.begin();
+	it!= thread_list.end();
+	++it) {
+    if (it->thread_type == rocksdb::ThreadStatus::HIGH_PRIORITY)
+      num_high_pri_threads++;
+    if (it->thread_type == rocksdb::ThreadStatus::LOW_PRIORITY)
+      num_low_pri_threads++;
+  }
+  ASSERT_EQ(15, thread_list.size());
+  //low pri threads is compaction_threads
+  ASSERT_EQ(10, num_low_pri_threads);
+  //high pri threads is flusher_threads
+  ASSERT_EQ(5, num_high_pri_threads);
+}
+
+int main(int argc, char **argv) {
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args);
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc
index c2e33f7..51efd55 100644
--- a/src/test/objectstore/chain_xattr.cc
+++ b/src/test/objectstore/chain_xattr.cc
@@ -182,6 +182,40 @@ TEST(chain_xattr, chunk_aligned) {
     ASSERT_EQ(0, chain_fremovexattr(fd, name2.c_str()));
   }
 
+  for (int len = CHAIN_XATTR_SHORT_BLOCK_LEN - 10;
+       len < CHAIN_XATTR_SHORT_BLOCK_LEN + 10;
+       ++len) {
+    cout << len << std::endl;
+    const string x(len, 'x');
+    char buf[len*2];
+    ASSERT_EQ(len, chain_setxattr(file, name.c_str(), x.c_str(), len));
+    char attrbuf[4096];
+    int l = ceph_os_listxattr(file, attrbuf, sizeof(attrbuf));
+    for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+      cout << "  attr " << p << std::endl;
+    }
+    ASSERT_EQ(len, chain_getxattr(file, name.c_str(), buf, len*2));
+  }
+
+  {
+    // test tail path in chain_getxattr
+    const char *aname = "user.baz";
+    char buf[CHAIN_XATTR_SHORT_BLOCK_LEN*3];
+    memset(buf, 'x', sizeof(buf));
+    ASSERT_EQ((int)sizeof(buf), chain_setxattr(file, aname, buf, sizeof(buf)));
+    ASSERT_EQ(-ERANGE, chain_getxattr(file, aname, buf,
+				      CHAIN_XATTR_SHORT_BLOCK_LEN*2));
+  }
+  {
+    // test tail path in chain_fgetxattr
+    const char *aname = "user.biz";
+    char buf[CHAIN_XATTR_SHORT_BLOCK_LEN*3];
+    memset(buf, 'x', sizeof(buf));
+    ASSERT_EQ((int)sizeof(buf), chain_fsetxattr(fd, aname, buf, sizeof(buf)));
+    ASSERT_EQ(-ERANGE, chain_fgetxattr(fd, aname, buf,
+				       CHAIN_XATTR_SHORT_BLOCK_LEN*2));
+  }
+
   ::close(fd);
   ::unlink(file);
 }
@@ -199,7 +233,7 @@ TEST(chain_xattr, listxattr) {
   ASSERT_EQ(LARGE_BLOCK_LEN, chain_setxattr(file, name1.c_str(), x.c_str(), LARGE_BLOCK_LEN));
   ASSERT_EQ((int)sizeof(y), chain_setxattr(file, name2.c_str(), &y, sizeof(y)));
 
-  int buffer_size = name1.size() + sizeof('\0') + name2.size() + sizeof('\0');
+  int buffer_size = name1.size() + sizeof(char) + name2.size() + sizeof(char);
   char* expected = (char*)malloc(buffer_size);
   ::strcpy(expected, name1.c_str());
   ::strcpy(expected + name1.size() + 1, name2.c_str());
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
index d3ce80c..7f95a4f 100644
--- a/src/test/objectstore/store_test.cc
+++ b/src/test/objectstore/store_test.cc
@@ -26,6 +26,7 @@
 #include "common/Mutex.h"
 #include "common/Cond.h"
 #include "common/errno.h"
+#include "include/stringify.h"
 #include <boost/scoped_ptr.hpp>
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/random/uniform_int.hpp>
@@ -54,22 +55,30 @@ public:
                                               string(GetParam()),
                                               string("store_test_temp_dir"),
                                               string("store_test_temp_journal"));
+    if (!store_) {
+      cerr << __func__ << ": objectstore type " << string(GetParam()) << " doesn't exist yet!" << std::endl;
+      return;
+    }
     store.reset(store_);
     EXPECT_EQ(store->mkfs(), 0);
     EXPECT_EQ(store->mount(), 0);
   }
 
   virtual void TearDown() {
-    store->umount();
+    if (store)
+      store->umount();
   }
 };
 
-bool sorted(const vector<ghobject_t> &in) {
+bool sorted(const vector<ghobject_t> &in, bool bitwise) {
   ghobject_t start;
   for (vector<ghobject_t>::const_iterator i = in.begin();
        i != in.end();
        ++i) {
-    if (start > *i) return false;
+    if (cmp(start, *i, bitwise) > 0) {
+      cout << start << " should follow " << *i << std::endl;
+      return false;
+    }
     start = *i;
   }
   return true;
@@ -81,43 +90,168 @@ TEST_P(StoreTest, collect_metadata) {
   if (GetParam() == string("filestore")) {
     ASSERT_NE(pm.count("filestore_backend"), 0u);
     ASSERT_NE(pm.count("filestore_f_type"), 0u);
+    ASSERT_NE(pm.count("backend_filestore_partition_path"), 0u);
+    ASSERT_NE(pm.count("backend_filestore_dev_node"), 0u);
+  } else if (GetParam() == string("keyvaluestore")) {
+    ASSERT_NE(pm.count("keyvaluestore_backend"), 0u);
+  }
+}
+
+TEST_P(StoreTest, TrivialRemount) {
+  store->umount();
+  int r = store->mount();
+  ASSERT_EQ(0, r);
+}
+
+TEST_P(StoreTest, SimpleRemount) {
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
+  ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+  ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+  bufferlist bl;
+  bl.append("1234512345");
+  int r;
+  {
+    cerr << "create collection + write" << std::endl;
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    t.write(cid, hoid, 0, bl.length(), bl);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  store->umount();
+  r = store->mount();
+  ASSERT_EQ(0, r);
+  {
+    ObjectStore::Transaction t;
+    t.write(cid, hoid2, 0, bl.length(), bl);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    t.remove(cid, hoid2);
+    t.remove_collection(cid);
+    cerr << "remove collection" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
   }
 }
 
-TEST_P(StoreTest, SimpleColTest) {
-  coll_t cid = coll_t("initial");
+TEST_P(StoreTest, IORemount) {
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
+  bufferlist bl;
+  bl.append("1234512345");
+  int r;
+  {
+    cerr << "create collection + objects" << std::endl;
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    for (int n=1; n<=100; ++n) {
+      ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
+      t.write(cid, hoid, 0, bl.length(), bl);
+    }
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  // overwrites
+  {
+    cout << "overwrites" << std::endl;
+    for (int n=1; n<=100; ++n) {
+      ObjectStore::Transaction t;
+      ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
+      t.write(cid, hoid, 1, bl.length(), bl);
+      r = store->apply_transaction(&osr, t);
+      ASSERT_EQ(r, 0);
+    }
+  }
+  store->umount();
+  r = store->mount();
+  ASSERT_EQ(0, r);
+  {
+    ObjectStore::Transaction t;
+    for (int n=1; n<=100; ++n) {
+      ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
+      t.remove(cid, hoid);
+    }
+    t.remove_collection(cid);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, SimpleMetaColTest) {
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
   int r = 0;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     cerr << "create collection" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     cerr << "add collection" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, SimplePGColTest) {
+  ObjectStore::Sequencer osr("test");
+  coll_t cid(spg_t(pg_t(1,2), shard_id_t::NO_SHARD));
+  int r = 0;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 4);
+    cerr << "create collection" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove_collection(cid);
+    cerr << "remove collection" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 4);
+    cerr << "add collection" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove_collection(cid);
+    cerr << "remove collection" << std::endl;
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
 
 TEST_P(StoreTest, SimpleColPreHashTest) {
+  ObjectStore::Sequencer osr("test");
   // Firstly we will need to revert the value making sure
   // collection hint actually works
   int merge_threshold = g_ceph_context->_conf->filestore_merge_threshold;
@@ -137,22 +271,19 @@ TEST_P(StoreTest, SimpleColPreHashTest) {
   boost::uniform_int<> folders_range(5, 256);
   uint64_t expected_num_objs = (uint64_t)objs_per_folder * (uint64_t)folders_range(rng);
 
-  char buf[100];
-  snprintf(buf, 100, "1.%x_head", pg_id);
-
-  coll_t cid(buf);
+  coll_t cid(spg_t(pg_t(pg_id, 15), shard_id_t::NO_SHARD));
   int r;
   {
     // Create a collection along with a hint
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 5);
     cerr << "create collection" << std::endl;
     bufferlist hint;
     ::encode(pg_num, hint);
     ::encode(expected_num_objs, hint);
     t.collection_hint(cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
     cerr << "collection hint" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
@@ -160,7 +291,7 @@ TEST_P(StoreTest, SimpleColPreHashTest) {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
     cerr << "remove collection" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   // Revert the config change so that it does not affect the split/merge tests
@@ -172,118 +303,551 @@ TEST_P(StoreTest, SimpleColPreHashTest) {
 }
 
 TEST_P(StoreTest, SimpleObjectTest) {
+  ObjectStore::Sequencer osr("test");
   int r;
-  coll_t cid = coll_t("coll");
+  coll_t cid;
+  ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+  {
+    bufferlist in;
+    r = store->read(cid, hoid, 0, 5, in);
+    ASSERT_EQ(-ENOENT, r);
+  }
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
-  ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
   {
+    bool exists = store->exists(cid, hoid);
+    ASSERT_TRUE(!exists);
+
     ObjectStore::Transaction t;
     t.touch(cid, hoid);
     cerr << "Creating object " << hoid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
+
+    exists = store->exists(cid, hoid);
+    ASSERT_EQ(true, exists);
   }
   {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.touch(cid, hoid);
     cerr << "Remove then create" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
-    bufferlist bl;
+    bufferlist bl, orig;
     bl.append("abcde");
+    orig = bl;
     t.remove(cid, hoid);
-    t.write(cid, hoid, 10, 5, bl);
+    t.write(cid, hoid, 0, 5, bl);
     cerr << "Remove then create" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+
+    bufferlist in;
+    r = store->read(cid, hoid, 0, 5, in);
+    ASSERT_EQ(5, r);
+    ASSERT_TRUE(in.contents_equal(orig));
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl, exp;
+    bl.append("abcde");
+    exp = bl;
+    exp.append(bl);
+    t.write(cid, hoid, 5, 5, bl);
+    cerr << "Append" << std::endl;
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
+
+    bufferlist in;
+    r = store->read(cid, hoid, 0, 10, in);
+    ASSERT_EQ(10, r);
+    ASSERT_TRUE(in.contents_equal(exp));
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl, exp;
+    bl.append("abcdeabcde");
+    exp = bl;
+    t.write(cid, hoid, 0, 10, bl);
+    cerr << "Full overwrite" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+
+    bufferlist in;
+    r = store->read(cid, hoid, 0, 10, in);
+    ASSERT_EQ(10, r);
+    ASSERT_TRUE(in.contents_equal(exp));
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append("abcde");
+    t.write(cid, hoid, 3, 5, bl);
+    cerr << "Partial overwrite" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+
+    bufferlist in, exp;
+    exp.append("abcabcdede");
+    r = store->read(cid, hoid, 0, 10, in);
+    ASSERT_EQ(10, r);
+    in.hexdump(cout);
+    ASSERT_TRUE(in.contents_equal(exp));
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234");
+    t.write(cid, hoid, 0, bl.length(), bl);
+    cerr << "larger overwrite" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+
+    bufferlist in;
+    r = store->read(cid, hoid, 0, bl.length(), in);
+    ASSERT_EQ((int)bl.length(), r);
+    in.hexdump(cout);
+    ASSERT_TRUE(in.contents_equal(bl));
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, ManySmallWrite) {
+  ObjectStore::Sequencer osr("test");
+  int r;
+  coll_t cid;
+  ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+  ghobject_t b(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  bufferlist bl;
+  bufferptr bp(4096);
+  bp.zero();
+  bl.append(bp);
+  for (int i=0; i<100; ++i) {
+    ObjectStore::Transaction t;
+    t.write(cid, a, i*4096, 4096, bl, 0);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  for (int i=0; i<100; ++i) {
+    ObjectStore::Transaction t;
+    t.write(cid, b, (rand() % 1024)*4096, 4096, bl, 0);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, a);
+    t.remove(cid, b);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, SimpleAttrTest) {
+  ObjectStore::Sequencer osr("test");
+  int r;
+  coll_t cid;
+  ghobject_t hoid(hobject_t(sobject_t("attr object 1", CEPH_NOSNAP)));
+  bufferlist val, val2;
+  val.append("value");
+  val.append("value2");
+  {
+    bufferptr bp;
+    map<string,bufferptr> aset;
+    r = store->getattr(cid, hoid, "nofoo", bp);
+    ASSERT_EQ(-ENOENT, r);
+    r = store->getattrs(cid, hoid, aset);
+    ASSERT_EQ(-ENOENT, r);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    bool r = store->collection_empty(cid);
+    ASSERT_TRUE(r);
+  }
+  {
+    bufferptr bp;
+    r = store->getattr(cid, hoid, "nofoo", bp);
+    ASSERT_EQ(-ENOENT, r);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid);
+    t.setattr(cid, hoid, "foo", val);
+    t.setattr(cid, hoid, "bar", val2);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    bool r = store->collection_empty(cid);
+    ASSERT_TRUE(!r);
+  }
+  {
+    bufferptr bp;
+    r = store->getattr(cid, hoid, "nofoo", bp);
+    ASSERT_EQ(-ENODATA, r);
+
+    r = store->getattr(cid, hoid, "foo", bp);
+    ASSERT_EQ(0, r);
+    bufferlist bl;
+    bl.append(bp);
+    ASSERT_TRUE(bl.contents_equal(val));
+
+    map<string,bufferptr> bm;
+    r = store->getattrs(cid, hoid, bm);
+    ASSERT_EQ(0, r);
+
   }
   {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove_collection(cid);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, SimpleListTest) {
+  ObjectStore::Sequencer osr("test");
+  int r;
+  coll_t cid(spg_t(pg_t(0, 1), shard_id_t(1)));
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  set<ghobject_t, ghobject_t::BitwiseComparator> all;
+  {
+    ObjectStore::Transaction t;
+    for (int i=0; i<200; ++i) {
+      string name("object_");
+      name += stringify(i);
+      ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)),
+		      ghobject_t::NO_GEN, shard_id_t(1));
+      hoid.hobj.pool = 1;
+      all.insert(hoid);
+      t.touch(cid, hoid);
+      cerr << "Creating object " << hoid << std::endl;
+    }
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  for (int bitwise=0; bitwise<2; ++bitwise) {
+    set<ghobject_t, ghobject_t::BitwiseComparator> saw;
+    vector<ghobject_t> objects;
+    ghobject_t next, current;
+    while (!next.is_max()) {
+      int r = store->collection_list(cid, current, ghobject_t::get_max(),
+				     (bool)bitwise, 50,
+				     &objects, &next);
+      if (r == -EOPNOTSUPP) {
+	++bitwise; // skip nibblewise test
+	continue;
+      }
+      ASSERT_EQ(r, 0);
+      ASSERT_TRUE(sorted(objects, (bool)bitwise));
+      cout << " got " << objects.size() << " next " << next << std::endl;
+      for (vector<ghobject_t>::iterator p = objects.begin(); p != objects.end();
+	   ++p) {
+	if (saw.count(*p)) {
+	  cout << "got DUP " << *p << std::endl;
+	} else {
+	  //cout << "got new " << *p << std::endl;
+	}
+	saw.insert(*p);
+      }
+      objects.clear();
+      current = next;
+    }
+    ASSERT_EQ(saw.size(), all.size());
+    ASSERT_EQ(saw, all);
+  }
+  {
+    ObjectStore::Transaction t;
+    for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator p = all.begin(); p != all.end(); ++p)
+      t.remove(cid, *p);
+    t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, Sort) {
+  {
+    hobject_t a(sobject_t("a", CEPH_NOSNAP));
+    hobject_t b = a;
+    ASSERT_EQ(a, b);
+    b.oid.name = "b";
+    ASSERT_NE(a, b);
+    ASSERT_TRUE(cmp_bitwise(a, b) < 0);
+    a.pool = 1;
+    b.pool = 2;
+    ASSERT_TRUE(cmp_bitwise(a, b) < 0);
+    a.pool = 3;
+    ASSERT_TRUE(cmp_bitwise(a, b) > 0);
+  }
+  {
+    ghobject_t a(hobject_t(sobject_t("a", CEPH_NOSNAP)));
+    ghobject_t b(hobject_t(sobject_t("b", CEPH_NOSNAP)));
+    a.hobj.pool = 1;
+    b.hobj.pool = 1;
+    ASSERT_TRUE(cmp_bitwise(a, b) < 0);
+    a.hobj.pool = -3;
+    ASSERT_TRUE(cmp_bitwise(a, b) < 0);
+    a.hobj.pool = 1;
+    b.hobj.pool = -3;
+    ASSERT_TRUE(cmp_bitwise(a, b) > 0);
+  }
+}
+
+TEST_P(StoreTest, MultipoolListTest) {
+  ObjectStore::Sequencer osr("test");
+  int r;
+  int poolid = 4373;
+  coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD));
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  set<ghobject_t, ghobject_t::BitwiseComparator> all, saw;
+  {
+    ObjectStore::Transaction t;
+    for (int i=0; i<200; ++i) {
+      string name("object_");
+      name += stringify(i);
+      ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)));
+      if (rand() & 1)
+	hoid.hobj.pool = -2 - poolid;
+      else
+	hoid.hobj.pool = poolid;
+      all.insert(hoid);
+      t.touch(cid, hoid);
+      cerr << "Creating object " << hoid << std::endl;
+    }
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    vector<ghobject_t> objects;
+    ghobject_t next, current;
+    while (!next.is_max()) {
+      int r = store->collection_list(cid, current, ghobject_t::get_max(),
+				     true, 50,
+				     &objects, &next);
+      ASSERT_EQ(r, 0);
+      cout << " got " << objects.size() << " next " << next << std::endl;
+      for (vector<ghobject_t>::iterator p = objects.begin(); p != objects.end();
+	   ++p) {
+	saw.insert(*p);
+      }
+      objects.clear();
+      current = next;
+    }
+    ASSERT_EQ(saw, all);
+  }
+  {
+    ObjectStore::Transaction t;
+    for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator p = all.begin(); p != all.end(); ++p)
+      t.remove(cid, *p);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
 
 TEST_P(StoreTest, SimpleCloneTest) {
+  ObjectStore::Sequencer osr("test");
   int r;
-  coll_t cid = coll_t("coll");
+  coll_t cid;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
-  bufferlist small;
+  bufferlist small, large, xlarge, newdata, attr;
   small.append("small");
+  large.append("large");
+  xlarge.append("xlarge");
   {
     ObjectStore::Transaction t;
     t.touch(cid, hoid);
     t.setattr(cid, hoid, "attr1", small);
+    t.setattr(cid, hoid, "attr2", large);
+    t.setattr(cid, hoid, "attr3", xlarge);
+    t.write(cid, hoid, 10, small.length(), small);
     cerr << "Creating object and set attr " << hoid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
   {
     ObjectStore::Transaction t;
     t.clone(cid, hoid, hoid2);
-    t.rmattr(cid, hoid, "attr1");
+    t.setattr(cid, hoid2, "attr2", small);
+    t.rmattr(cid, hoid2, "attr1");
+    t.write(cid, hoid, 10, large.length(), large);
+    t.setattr(cid, hoid, "attr1", large);
+    t.setattr(cid, hoid, "attr2", small);
     cerr << "Clone object and rm attr" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+
+    r = store->read(cid, hoid, 10, 5, newdata);
+    ASSERT_EQ(r, 5);
+    ASSERT_TRUE(newdata.contents_equal(large));
+
+    newdata.clear();
+    r = store->read(cid, hoid2, 10, 5, newdata);
+    ASSERT_EQ(r, 5);
+    ASSERT_TRUE(newdata.contents_equal(small));
+
+    r = store->getattr(cid, hoid2, "attr2", attr);
+    ASSERT_EQ(r, 0);
+    ASSERT_TRUE(attr.contents_equal(small));
+
+    attr.clear();
+    r = store->getattr(cid, hoid2, "attr3", attr);
+    ASSERT_EQ(r, 0);
+    ASSERT_TRUE(attr.contents_equal(xlarge));
+
+    attr.clear();
+    r = store->getattr(cid, hoid, "attr1", attr);
+    ASSERT_EQ(r, 0);
+    ASSERT_TRUE(attr.contents_equal(large));
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    t.remove(cid, hoid2);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, OmapCloneTest) {
+  ObjectStore::Sequencer osr("test");
+  int r;
+  coll_t cid;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+  bufferlist small;
+  small.append("small");
+  map<string,bufferlist> km;
+  km["foo"] = small;
+  km["bar"].append("asdfjkasdkjdfsjkafskjsfdj");
+  bufferlist header;
+  header.append("this is a header");
+  {
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid);
+    t.omap_setkeys(cid, hoid, km);
+    t.omap_setheader(cid, hoid, header);
+    cerr << "Creating object and set omap " << hoid << std::endl;
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
+  ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+  {
+    ObjectStore::Transaction t;
+    t.clone(cid, hoid, hoid2);
+    cerr << "Clone object" << std::endl;
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    map<string,bufferlist> r;
+    bufferlist h;
+    store->omap_get(cid, hoid2, &h, &r);
+    ASSERT_TRUE(h.contents_equal(header));
+    ASSERT_EQ(r.size(), km.size());
+  }
   {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
     t.remove(cid, hoid2);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
 
 TEST_P(StoreTest, SimpleCloneRangeTest) {
+  ObjectStore::Sequencer osr("test");
   int r;
-  coll_t cid = coll_t("coll");
+  coll_t cid;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+  hoid.hobj.pool = -1;
   bufferlist small, newdata;
   small.append("small");
   {
     ObjectStore::Transaction t;
     t.write(cid, hoid, 10, 5, small);
     cerr << "Creating object and write bl " << hoid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+  hoid2.hobj.pool = -1;
   {
     ObjectStore::Transaction t;
     t.clone_range(cid, hoid, hoid2, 10, 5, 0);
     cerr << "Clone range object" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
     r = store->read(cid, hoid2, 0, 5, newdata);
     ASSERT_EQ(r, 5);
@@ -294,7 +858,7 @@ TEST_P(StoreTest, SimpleCloneRangeTest) {
     t.truncate(cid, hoid, 1024*1024);
     t.clone_range(cid, hoid, hoid2, 0, 1024*1024, 0);
     cerr << "Clone range object" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
     struct stat stat, stat2;
     r = store->stat(cid, hoid, &stat);
@@ -308,20 +872,21 @@ TEST_P(StoreTest, SimpleCloneRangeTest) {
     t.remove(cid, hoid2);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
 
 
 TEST_P(StoreTest, SimpleObjectLongnameTest) {
+  ObjectStore::Sequencer osr("test");
   int r;
-  coll_t cid = coll_t("coll");
+  coll_t cid;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     cerr << "Creating collection " << cid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP)));
@@ -329,7 +894,7 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) {
     ObjectStore::Transaction t;
     t.touch(cid, hoid);
     cerr << "Creating object " << hoid << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
@@ -337,22 +902,23 @@ TEST_P(StoreTest, SimpleObjectLongnameTest) {
     t.remove(cid, hoid);
     t.remove_collection(cid);
     cerr << "Cleaning" << std::endl;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
 
 TEST_P(StoreTest, ManyObjectTest) {
+  ObjectStore::Sequencer osr("test");
   int NUM_OBJS = 2000;
   int r = 0;
-  coll_t cid("blah");
+  coll_t cid;
   string base = "";
   for (int i = 0; i < 100; ++i) base.append("aaaaa");
-  set<ghobject_t> created;
+  set<ghobject_t, ghobject_t::BitwiseComparator> created;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
-    r = store->apply_transaction(t);
+    t.create_collection(cid, 0);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   for (int i = 0; i < NUM_OBJS; ++i) {
@@ -365,20 +931,20 @@ TEST_P(StoreTest, ManyObjectTest) {
     ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP)));
     t.touch(cid, hoid);
     created.insert(hoid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
-  for (set<ghobject_t>::iterator i = created.begin();
+  for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = created.begin();
        i != created.end();
        ++i) {
     struct stat buf;
     ASSERT_TRUE(!store->stat(cid, *i, &buf));
   }
 
-  set<ghobject_t> listed;
+  set<ghobject_t, ghobject_t::BitwiseComparator> listed, listed2;
   vector<ghobject_t> objects;
-  r = store->collection_list(cid, objects);
+  r = store->collection_list(cid, ghobject_t(), ghobject_t::get_max(), true, INT_MAX, &objects, 0);
   ASSERT_EQ(r, 0);
 
   cerr << "objects.size() is " << objects.size() << std::endl;
@@ -392,12 +958,12 @@ TEST_P(StoreTest, ManyObjectTest) {
 
   ghobject_t start, next;
   objects.clear();
-  r = store->collection_list_partial(
+  r = store->collection_list(
     cid,
     ghobject_t::get_max(),
+    ghobject_t::get_max(),
+    true,
     50,
-    60,
-    0,
     &objects,
     &next
     );
@@ -406,14 +972,30 @@ TEST_P(StoreTest, ManyObjectTest) {
 
   objects.clear();
   listed.clear();
+  ghobject_t start2, next2;
   while (1) {
-    r = store->collection_list_partial(cid, start,
-				       50,
-				       60,
-				       0,
-				       &objects,
-				       &next);
-    ASSERT_TRUE(sorted(objects));
+    // nibblewise
+    r = store->collection_list(cid, start2, ghobject_t::get_max(), false,
+			       50,
+			       &objects,
+			       &next2);
+    if (r != -EOPNOTSUPP) {
+      ASSERT_TRUE(sorted(objects, false));
+      ASSERT_EQ(r, 0);
+      listed2.insert(objects.begin(), objects.end());
+      if (objects.size() < 50) {
+	ASSERT_TRUE(next2.is_max());
+      }
+      objects.clear();
+      start2 = next2;
+    }
+
+    // bitwise
+    r = store->collection_list(cid, start, ghobject_t::get_max(), true,
+			       50,
+			       &objects,
+			       &next);
+    ASSERT_TRUE(sorted(objects, true));
     ASSERT_EQ(r, 0);
     listed.insert(objects.begin(), objects.end());
     if (objects.size() < 50) {
@@ -421,29 +1003,32 @@ TEST_P(StoreTest, ManyObjectTest) {
       break;
     }
     objects.clear();
+
     start = next;
   }
   cerr << "listed.size() is " << listed.size() << std::endl;
   ASSERT_TRUE(listed.size() == created.size());
-  for (set<ghobject_t>::iterator i = listed.begin();
+  if (listed2.size())
+    ASSERT_EQ(listed.size(), listed2.size());
+  for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = listed.begin();
        i != listed.end();
        ++i) {
     ASSERT_TRUE(created.count(*i));
   }
 
-  for (set<ghobject_t>::iterator i = created.begin();
+  for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = created.begin();
        i != created.end();
        ++i) {
     ObjectStore::Transaction t;
     t.remove(cid, *i);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   cerr << "cleaning up" << std::endl;
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
@@ -458,7 +1043,8 @@ public:
 class MixedGenerator : public ObjectGenerator {
 public:
   unsigned seq;
-  MixedGenerator() : seq(0) {}
+  int64_t poolid;
+  MixedGenerator(int64_t p) : seq(0), poolid(p) {}
   ghobject_t create_object(gen_type *gen) {
     char buf[100];
     snprintf(buf, sizeof(buf), "%u", seq);
@@ -475,7 +1061,7 @@ public:
     // hash
     //boost::binomial_distribution<uint32_t> bin(0xFFFFFF, 0.5);
     ++seq;
-    return ghobject_t(hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, ""));
+    return ghobject_t(hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, poolid, ""));
   }
 };
 
@@ -493,9 +1079,9 @@ public:
   static const unsigned max_attr_value_len = 1024 * 4;
   coll_t cid;
   unsigned in_flight;
-  map<ghobject_t, Object> contents;
-  set<ghobject_t> available_objects;
-  set<ghobject_t> in_flight_objects;
+  map<ghobject_t, Object, ghobject_t::BitwiseComparator> contents;
+  set<ghobject_t, ghobject_t::BitwiseComparator> available_objects;
+  set<ghobject_t, ghobject_t::BitwiseComparator> in_flight_objects;
   ObjectGenerator *object_gen;
   gen_type *rng;
   ObjectStore *store;
@@ -530,6 +1116,7 @@ public:
     SyntheticWorkloadState *state;
     ObjectStore::Transaction *t;
     ghobject_t oid, noid;
+
     C_SyntheticOnClone(SyntheticWorkloadState *state,
                           ObjectStore::Transaction *t, ghobject_t oid, ghobject_t noid)
       : state(state), t(t), oid(oid), noid(noid) {}
@@ -547,6 +1134,7 @@ public:
       bufferlist r2;
       r = state->store->read(state->cid, noid, 0, state->contents[noid].data.length(), r2);
       if (!state->contents[noid].data.contents_equal(r2)) {
+        assert(state->contents[noid].data.contents_equal(r2));
         ASSERT_TRUE(state->contents[noid].data.contents_equal(r2));
       }
       state->cond.Signal();
@@ -567,7 +1155,7 @@ public:
 
     bl.append(bp);
   }
-
+  
   SyntheticWorkloadState(ObjectStore *store,
 			 ObjectGenerator *gen,
 			 gen_type *rng,
@@ -578,8 +1166,8 @@ public:
 
   int init() {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
-    return store->apply_transaction(t);
+    t.create_collection(cid, 0);
+    return store->apply_transaction(osr, t);
   }
 
   ghobject_t get_uniform_random_object() {
@@ -587,7 +1175,7 @@ public:
       cond.Wait(lock);
     boost::uniform_int<> choose(0, available_objects.size() - 1);
     int index = choose(*rng);
-    set<ghobject_t>::iterator i = available_objects.begin();
+    set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = available_objects.begin();
     for ( ; index > 0; --index, ++i) ;
     ghobject_t ret = *i;
     return ret;
@@ -599,6 +1187,7 @@ public:
   }
 
   void wait_for_done() {
+    osr->flush();
     Mutex::Locker locker(lock);
     while (in_flight)
       cond.Wait(lock);
@@ -700,14 +1289,13 @@ public:
 
   void getattrs() {
     ghobject_t obj;
-    int retry;
     {
       Mutex::Locker locker(lock);
       if (!can_unlink())
         return ;
       wait_for_ready();
 
-      retry = 10;
+      int retry = 10;
       do {
         obj = get_uniform_random_object();
         if (!--retry)
@@ -751,7 +1339,7 @@ public:
 
     bufferlist bl;
     r = store->getattr(cid, obj, it->first, bl);
-    ASSERT_TRUE(r >= 0);
+    ASSERT_EQ(r, 0);
     ASSERT_TRUE(it->second.contents_equal(bl));
   }
 
@@ -852,9 +1440,17 @@ public:
       size_t max_len = contents[obj].data.length() - offset;
       if (len > max_len)
         len = max_len;
+      assert(len == result.length());
       ASSERT_EQ(len, result.length());
       contents[obj].data.copy(offset, len, bl);
       ASSERT_EQ(r, (int)len);
+      if (!result.contents_equal(bl)) {
+	cout << "result:\n";
+	result.hexdump(cout);
+	cout << "expected:\n";
+	bl.hexdump(cout);
+	assert(0);
+      }
       ASSERT_TRUE(result.contents_equal(bl));
     }
   }
@@ -891,34 +1487,56 @@ public:
     while (in_flight)
       cond.Wait(lock);
     vector<ghobject_t> objects;
-    set<ghobject_t> objects_set, objects_set2;
+    set<ghobject_t, ghobject_t::BitwiseComparator> objects_set, objects_set2;
     ghobject_t next, current;
     while (1) {
       cerr << "scanning..." << std::endl;
-      int r = store->collection_list_partial(cid, current, 50, 100,
-					     0, &objects, &next);
+      int r = store->collection_list(cid, current, ghobject_t::get_max(),
+				     true, 100,
+				     &objects, &next);
       ASSERT_EQ(r, 0);
-      ASSERT_TRUE(sorted(objects));
+      ASSERT_TRUE(sorted(objects, true));
       objects_set.insert(objects.begin(), objects.end());
       objects.clear();
       if (next.is_max()) break;
       current = next;
     }
+    if (objects_set.size() != available_objects.size()) {
+      for (set<ghobject_t>::iterator p = objects_set.begin();
+	   p != objects_set.end();
+	   ++p)
+	if (available_objects.count(*p) == 0) {
+	  cerr << "+ " << *p << std::endl;
+	  assert(0);
+	}
+      for (set<ghobject_t>::iterator p = available_objects.begin();
+	   p != available_objects.end();
+	   ++p)
+	if (objects_set.count(*p) == 0)
+	  cerr << "- " << *p << std::endl;
+      //cerr << " objects_set: " << objects_set << std::endl;
+      //cerr << " available_set: " << available_objects << std::endl;
+      assert(0 == "badness");
+    }
+
     ASSERT_EQ(objects_set.size(), available_objects.size());
-    for (set<ghobject_t>::iterator i = objects_set.begin();
+    for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = objects_set.begin();
 	 i != objects_set.end();
 	 ++i) {
       ASSERT_GT(available_objects.count(*i), (unsigned)0);
     }
 
-    int r = store->collection_list(cid, objects);
+    int r = store->collection_list(cid, ghobject_t(), ghobject_t::get_max(), true, INT_MAX, &objects, 0);
     ASSERT_EQ(r, 0);
     objects_set2.insert(objects.begin(), objects.end());
     ASSERT_EQ(objects_set2.size(), available_objects.size());
-    for (set<ghobject_t>::iterator i = objects_set2.begin();
+    for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = objects_set2.begin();
 	 i != objects_set2.end();
 	 ++i) {
       ASSERT_GT(available_objects.count(*i), (unsigned)0);
+      if (available_objects.count(*i) == 0) {
+	cerr << "+ " << *i << std::endl;
+      }
     }
   }
 
@@ -936,6 +1554,7 @@ public:
     struct stat buf;
     int r = store->stat(cid, hoid, &buf);
     ASSERT_EQ(0, r);
+    assert(buf.st_size == contents[hoid].data.length());
     ASSERT_TRUE(buf.st_size == contents[hoid].data.length());
     {
       Mutex::Locker locker(lock);
@@ -960,6 +1579,34 @@ public:
     return store->queue_transaction(osr, t, new C_SyntheticOnReadable(this, t, to_remove));
   }
 
+  int zero() {
+    Mutex::Locker locker(lock);
+    if (!can_unlink())
+      return -ENOENT;
+    wait_for_ready();
+
+    ghobject_t new_obj = get_uniform_random_object();
+    available_objects.erase(new_obj);
+    ObjectStore::Transaction *t = new ObjectStore::Transaction;
+
+    boost::uniform_int<> u1(0, max_object_len/2);
+    boost::uniform_int<> u2(0, max_object_len/10);
+    uint64_t offset = u1(*rng);
+    uint64_t len = u2(*rng);
+    if (offset > len)
+      swap(offset, len);
+
+    if (contents[new_obj].data.length() < offset + len) {
+      contents[new_obj].data.append_zero(offset+len-contents[new_obj].data.length());
+    }
+    contents[new_obj].data.zero(offset, len);
+
+    t->zero(cid, new_obj, offset, len);
+    ++in_flight;
+    in_flight_objects.insert(new_obj);
+    return store->queue_transaction(osr, t, new C_SyntheticOnReadable(this, t, new_obj));
+  }
+
   void print_internal_state() {
     Mutex::Locker locker(lock);
     cerr << "available_objects: " << available_objects.size()
@@ -971,9 +1618,9 @@ public:
 
 TEST_P(StoreTest, Synthetic) {
   ObjectStore::Sequencer osr("test");
-  MixedGenerator gen;
+  MixedGenerator gen(555);
   gen_type rng(time(NULL));
-  coll_t cid("synthetic_1");
+  coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD));
 
   SyntheticWorkloadState test_obj(store.get(), &gen, &rng, &osr, cid);
   test_obj.init();
@@ -990,9 +1637,11 @@ TEST_P(StoreTest, Synthetic) {
     int val = true_false(rng);
     if (val > 97) {
       test_obj.scan();
-    } else if (val > 90) {
+    } else if (val > 95) {
       test_obj.stat();
     } else if (val > 85) {
+      test_obj.zero();
+    } else if (val > 80) {
       test_obj.unlink();
     } else if (val > 55) {
       test_obj.write();
@@ -1009,9 +1658,9 @@ TEST_P(StoreTest, Synthetic) {
 
 TEST_P(StoreTest, AttrSynthetic) {
   ObjectStore::Sequencer osr("test");
-  MixedGenerator gen;
+  MixedGenerator gen(447);
   gen_type rng(time(NULL));
-  coll_t cid("synthetic_2");
+  coll_t cid(spg_t(pg_t(0,447),shard_id_t::NO_SHARD));
 
   SyntheticWorkloadState test_obj(store.get(), &gen, &rng, &osr, cid);
   test_obj.init();
@@ -1046,17 +1695,19 @@ TEST_P(StoreTest, AttrSynthetic) {
 }
 
 TEST_P(StoreTest, HashCollisionTest) {
-  coll_t cid("blah");
+  ObjectStore::Sequencer osr("test");
+  int64_t poolid = 11;
+  coll_t cid(spg_t(pg_t(0,poolid),shard_id_t::NO_SHARD));
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
-    r = store->apply_transaction(t);
+    t.create_collection(cid, 0);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   string base = "";
   for (int i = 0; i < 100; ++i) base.append("aaaaa");
-  set<ghobject_t> created;
+  set<ghobject_t, ghobject_t::BitwiseComparator> created;
   for (int n = 0; n < 10; ++n) {
     char nbuf[100];
     sprintf(nbuf, "n%d", n);
@@ -1066,30 +1717,30 @@ TEST_P(StoreTest, HashCollisionTest) {
     if (!(i % 5)) {
       cerr << "Object n" << n << " "<< i << std::endl;
     }
-    ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, 0, 0, string(nbuf)));
+    ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, 0, poolid, string(nbuf)));
     {
       ObjectStore::Transaction t;
       t.touch(cid, hoid);
-      r = store->apply_transaction(t);
+      r = store->apply_transaction(&osr, t);
       ASSERT_EQ(r, 0);
     }
     created.insert(hoid);
   }
   }
   vector<ghobject_t> objects;
-  r = store->collection_list(cid, objects);
+  r = store->collection_list(cid, ghobject_t(), ghobject_t::get_max(), true, INT_MAX, &objects, 0);
   ASSERT_EQ(r, 0);
-  set<ghobject_t> listed(objects.begin(), objects.end());
+  set<ghobject_t, ghobject_t::BitwiseComparator> listed(objects.begin(), objects.end());
   cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
   ASSERT_TRUE(listed.size() == created.size());
   objects.clear();
   listed.clear();
   ghobject_t current, next;
   while (1) {
-    r = store->collection_list_partial(cid, current, 50, 60,
-				       0, &objects, &next);
+    r = store->collection_list(cid, current, ghobject_t::get_max(), true, 60,
+			       &objects, &next);
     ASSERT_EQ(r, 0);
-    ASSERT_TRUE(sorted(objects));
+    ASSERT_TRUE(sorted(objects, true));
     for (vector<ghobject_t>::iterator i = objects.begin();
 	 i != objects.end();
 	 ++i) {
@@ -1106,63 +1757,68 @@ TEST_P(StoreTest, HashCollisionTest) {
   }
   cerr << "listed.size() is " << listed.size() << std::endl;
   ASSERT_TRUE(listed.size() == created.size());
-  for (set<ghobject_t>::iterator i = listed.begin();
+  for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = listed.begin();
        i != listed.end();
        ++i) {
     ASSERT_TRUE(created.count(*i));
   }
 
-  for (set<ghobject_t>::iterator i = created.begin();
+  for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = created.begin();
        i != created.end();
        ++i) {
     ObjectStore::Transaction t;
     t.remove(cid, *i);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ObjectStore::Transaction t;
   t.remove_collection(cid);
-  r = store->apply_transaction(t);
+  r = store->apply_transaction(&osr, t);
   ASSERT_EQ(r, 0);
 }
 
 TEST_P(StoreTest, ScrubTest) {
-  coll_t cid("blah");
+  ObjectStore::Sequencer osr("test");
+  int64_t poolid = 111;
+  coll_t cid(spg_t(pg_t(0, poolid),shard_id_t(1)));
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
-    r = store->apply_transaction(t);
+    t.create_collection(cid, 0);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   string base = "aaaaa";
-  set<ghobject_t> created;
+  set<ghobject_t, ghobject_t::BitwiseComparator> created;
   for (int i = 0; i < 1000; ++i) {
     char buf[100];
     sprintf(buf, "%d", i);
     if (!(i % 5)) {
       cerr << "Object " << i << std::endl;
     }
-    ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, i, 0, ""));
+    ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, i,
+			      poolid, ""),
+		    ghobject_t::NO_GEN, shard_id_t(1));
     {
       ObjectStore::Transaction t;
       t.touch(cid, hoid);
-      r = store->apply_transaction(t);
+      r = store->apply_transaction(&osr, t);
       ASSERT_EQ(r, 0);
     }
     created.insert(hoid);
   }
 
-  // Add same hobject_t but different generation or shard_id
+  // Add same hobject_t but different generation
   {
-    ghobject_t hoid1(hobject_t("same-object", string(), CEPH_NOSNAP, 0, 0, ""));
-    ghobject_t hoid2(hobject_t("same-object", string(), CEPH_NOSNAP, 0, 0, ""), (gen_t)1, (shard_id_t)0);
-    ghobject_t hoid3(hobject_t("same-object", string(), CEPH_NOSNAP, 0, 0, ""), (gen_t)2, (shard_id_t)0);
+    ghobject_t hoid1(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""),
+		     ghobject_t::NO_GEN, shard_id_t(1));
+    ghobject_t hoid2(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""), (gen_t)1, shard_id_t(1));
+    ghobject_t hoid3(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""), (gen_t)2, shard_id_t(1));
     ObjectStore::Transaction t;
     t.touch(cid, hoid1);
     t.touch(cid, hoid2);
     t.touch(cid, hoid3);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     created.insert(hoid1);
     created.insert(hoid2);
     created.insert(hoid3);
@@ -1170,19 +1826,20 @@ TEST_P(StoreTest, ScrubTest) {
   }
 
   vector<ghobject_t> objects;
-  r = store->collection_list(cid, objects);
+  r = store->collection_list(cid, ghobject_t(), ghobject_t::get_max(), true,
+			     INT_MAX, &objects, 0);
   ASSERT_EQ(r, 0);
-  set<ghobject_t> listed(objects.begin(), objects.end());
+  set<ghobject_t, ghobject_t::BitwiseComparator> listed(objects.begin(), objects.end());
   cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
   ASSERT_TRUE(listed.size() == created.size());
   objects.clear();
   listed.clear();
   ghobject_t current, next;
   while (1) {
-    r = store->collection_list_partial(cid, current, 50, 60,
-                                       0, &objects, &next);
+    r = store->collection_list(cid, current, ghobject_t::get_max(), true, 60,
+			       &objects, &next);
     ASSERT_EQ(r, 0);
-    ASSERT_TRUE(sorted(objects));
+    ASSERT_TRUE(sorted(objects, true));
     for (vector<ghobject_t>::iterator i = objects.begin();
          i != objects.end(); ++i) {
       if (listed.count(*i))
@@ -1198,35 +1855,36 @@ TEST_P(StoreTest, ScrubTest) {
   }
   cerr << "listed.size() is " << listed.size() << std::endl;
   ASSERT_TRUE(listed.size() == created.size());
-  for (set<ghobject_t>::iterator i = listed.begin();
+  for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = listed.begin();
        i != listed.end();
        ++i) {
     ASSERT_TRUE(created.count(*i));
   }
 
-  for (set<ghobject_t>::iterator i = created.begin();
+  for (set<ghobject_t, ghobject_t::BitwiseComparator>::iterator i = created.begin();
        i != created.end();
        ++i) {
     ObjectStore::Transaction t;
     t.remove(cid, *i);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ObjectStore::Transaction t;
   t.remove_collection(cid);
-  r = store->apply_transaction(t);
+  r = store->apply_transaction(&osr, t);
   ASSERT_EQ(r, 0);
 }
 
 
 TEST_P(StoreTest, OMapTest) {
-  coll_t cid("blah");
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
   ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
-    r = store->apply_transaction(t);
+    t.create_collection(cid, 0);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
@@ -1237,7 +1895,7 @@ TEST_P(StoreTest, OMapTest) {
     t.omap_clear(cid, hoid);
     map<string, bufferlist> start_set;
     t.omap_setkeys(cid, hoid, start_set);
-    store->apply_transaction(t);
+    store->apply_transaction(&osr, t);
   }
 
   for (int i = 0; i < 100; i++) {
@@ -1272,7 +1930,7 @@ TEST_P(StoreTest, OMapTest) {
     to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl));
     attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
     t.omap_setkeys(cid, hoid, to_add);
-    store->apply_transaction(t);
+    store->apply_transaction(&osr, t);
   }
 
   int i = 0;
@@ -1302,7 +1960,7 @@ TEST_P(StoreTest, OMapTest) {
     set<string> keys_to_remove;
     keys_to_remove.insert(to_remove);
     t.omap_rmkeys(cid, hoid, keys_to_remove);
-    store->apply_transaction(t);
+    store->apply_transaction(&osr, t);
 
     attrs.erase(to_remove);
 
@@ -1314,14 +1972,14 @@ TEST_P(StoreTest, OMapTest) {
     bl1.append("omap_header");
     ObjectStore::Transaction t;
     t.omap_setheader(cid, hoid, bl1);
-    store->apply_transaction(t);
+    store->apply_transaction(&osr, t);
 
     bufferlist bl2;
     bl2.append("value");
     map<string, bufferlist> to_add;
     to_add.insert(pair<string, bufferlist>("key", bl2));
     t.omap_setkeys(cid, hoid, to_add);
-    store->apply_transaction(t);
+    store->apply_transaction(&osr, t);
 
     bufferlist bl3;
     map<string, bufferlist> cur_attrs;
@@ -1329,17 +1987,165 @@ TEST_P(StoreTest, OMapTest) {
     ASSERT_EQ(r, 0);
     ASSERT_EQ(cur_attrs.size(), size_t(1));
     ASSERT_TRUE(bl3.contents_equal(bl1));
+ 
+    set<string> keys;
+    r = store->omap_get_keys(cid, hoid, &keys);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(keys.size(), size_t(1));
+  }
+
+  // test omap_clear, omap_rmkey_range
+  {
+    {
+      map<string,bufferlist> to_set;
+      for (int n=0; n<10; ++n) {
+	to_set[stringify(n)].append("foo");
+      }
+      bufferlist h;
+      h.append("header");
+      ObjectStore::Transaction t;
+      t.remove(cid, hoid);
+      t.touch(cid, hoid);
+      t.omap_setheader(cid, hoid, h);
+      t.omap_setkeys(cid, hoid, to_set);
+      store->apply_transaction(&osr, t);
+    }
+    {
+      ObjectStore::Transaction t;
+      t.omap_rmkeyrange(cid, hoid, "3", "7");
+      store->apply_transaction(&osr, t);
+    }
+    {
+      bufferlist hdr;
+      map<string,bufferlist> m;
+      store->omap_get(cid, hoid, &hdr, &m);
+      ASSERT_EQ(6u, hdr.length());
+      ASSERT_TRUE(m.count("2"));
+      ASSERT_TRUE(!m.count("3"));
+      ASSERT_TRUE(!m.count("6"));
+      ASSERT_TRUE(m.count("7"));
+      ASSERT_TRUE(m.count("8"));
+      //cout << m << std::endl;
+      ASSERT_EQ(6u, m.size());
+    }
+    {
+      ObjectStore::Transaction t;
+      t.omap_clear(cid, hoid);
+      store->apply_transaction(&osr, t);
+    }
+    {
+      bufferlist hdr;
+      map<string,bufferlist> m;
+      store->omap_get(cid, hoid, &hdr, &m);
+      ASSERT_EQ(0u, hdr.length());
+      ASSERT_EQ(0u, m.size());
+    }
   }
 
   ObjectStore::Transaction t;
   t.remove(cid, hoid);
   t.remove_collection(cid);
-  r = store->apply_transaction(t);
+  r = store->apply_transaction(&osr, t);
   ASSERT_EQ(r, 0);
 }
 
+TEST_P(StoreTest, OMapIterator) {
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
+  ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
+  int count = 0;
+  int r;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+
+  map<string, bufferlist> attrs;
+  {
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid);
+    t.omap_clear(cid, hoid);
+    map<string, bufferlist> start_set;
+    t.omap_setkeys(cid, hoid, start_set);
+    store->apply_transaction(&osr, t);
+  }
+  ObjectMap::ObjectMapIterator iter;
+  bool correct;
+  //basic iteration
+  for (int i = 0; i < 100; i++) {
+    if (!(i%5)) {
+      std::cout << "On iteration " << i << std::endl;
+    }
+    bufferlist bl;
+
+    // FileStore may deadlock two active iterators over the same data
+    iter = ObjectMap::ObjectMapIterator();
+
+    iter = store->get_omap_iterator(cid, hoid);
+    for (iter->seek_to_first(), count=0; iter->valid(); iter->next(), count++) {
+      string key = iter->key();
+      bufferlist value = iter->value();
+      correct = attrs.count(key) && (string(value.c_str()) == string(attrs[key].c_str()));
+      if (!correct) {
+	if (attrs.count(key) > 0) {
+	  std::cout << "key " << key << "in omap , " << value.c_str() << " : " << attrs[key].c_str() << std::endl;
+	}
+	else
+	  std::cout << "key " << key << "should not exists in omap" << std::endl;
+      }
+      ASSERT_EQ(correct, true);
+    }
+    ASSERT_EQ((int)attrs.size(), count);
+
+    // FileStore may deadlock an active iterator vs apply_transaction
+    iter = ObjectMap::ObjectMapIterator();
+
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%d", i);
+    bl.clear();
+    bufferptr bp(buf, strlen(buf) + 1);
+    bl.append(bp);
+    map<string, bufferlist> to_add;
+    to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl));
+    attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
+    ObjectStore::Transaction t;
+    t.omap_setkeys(cid, hoid, to_add);
+    store->apply_transaction(&osr, t);
+  }
+
+  iter = store->get_omap_iterator(cid, hoid);
+  //lower bound
+  string bound_key = "key-5";
+  iter->lower_bound(bound_key);
+  correct = bound_key <= iter->key();
+  if (!correct) {
+    std::cout << "lower bound, bound key is " << bound_key << " < iter key is " << iter->key() << std::endl;
+  }
+  ASSERT_EQ(correct, true);
+  //upper bound
+  iter->upper_bound(bound_key);
+  correct = iter->key() > bound_key;
+  if (!correct) {
+    std::cout << "upper bound, bound key is " << bound_key << " >= iter key is " << iter->key() << std::endl;
+  }
+  ASSERT_EQ(correct, true);
+
+  // FileStore may deadlock an active iterator vs apply_transaction
+  iter = ObjectMap::ObjectMapIterator();
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    t.remove_collection(cid);
+    r = store->apply_transaction(&osr, t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
 TEST_P(StoreTest, XattrTest) {
-  coll_t cid("blah");
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
   ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
   bufferlist big;
   for (unsigned i = 0; i < 10000; ++i) {
@@ -1352,9 +2158,9 @@ TEST_P(StoreTest, XattrTest) {
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     t.touch(cid, hoid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
@@ -1373,7 +2179,7 @@ TEST_P(StoreTest, XattrTest) {
     attrs["attr4"] = big;
     t.setattr(cid, hoid, "attr3", big);
     attrs["attr3"] = big;
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
@@ -1392,7 +2198,7 @@ TEST_P(StoreTest, XattrTest) {
     ObjectStore::Transaction t;
     t.rmattr(cid, hoid, "attr2");
     attrs.erase("attr2");
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
@@ -1412,7 +2218,7 @@ TEST_P(StoreTest, XattrTest) {
   ASSERT_EQ(r, -ENODATA);
 
   r = store->getattr(cid, hoid, "attr3", bp);
-  ASSERT_GE(r, 0);
+  ASSERT_EQ(r, 0);
   bufferlist bl2;
   bl2.push_back(bp);
   ASSERT_TRUE(bl2 == attrs["attr3"]);
@@ -1420,7 +2226,7 @@ TEST_P(StoreTest, XattrTest) {
   ObjectStore::Transaction t;
   t.remove(cid, hoid);
   t.remove_collection(cid);
-  r = store->apply_transaction(t);
+  r = store->apply_transaction(&osr, t);
   ASSERT_EQ(r, 0);
 }
 
@@ -1429,13 +2235,14 @@ void colsplittest(
   unsigned num_objects,
   unsigned common_suffix_size
   ) {
-  coll_t cid("from");
-  coll_t tid("to");
+  ObjectStore::Sequencer osr("test");
+  coll_t cid(spg_t(pg_t(0,52),shard_id_t::NO_SHARD));
+  coll_t tid(spg_t(pg_t(1<<common_suffix_size,52),shard_id_t::NO_SHARD));
   int r = 0;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
-    r = store->apply_transaction(t);
+    t.create_collection(cid, common_suffix_size);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
@@ -1448,45 +2255,47 @@ void colsplittest(
 	  "",
 	  CEPH_NOSNAP,
 	  i<<common_suffix_size,
-	  0, "")));
+	  52, "")));
     }
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
-    t.create_collection(tid);
-    t.split_collection(cid, common_suffix_size+1, 0, tid);
-    r = store->apply_transaction(t);
+    t.create_collection(tid, common_suffix_size + 1);
+    t.split_collection(cid, common_suffix_size+1, 1<<common_suffix_size, tid);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
   ObjectStore::Transaction t;
   vector<ghobject_t> objects;
-  r = store->collection_list(cid, objects);
+  r = store->collection_list(cid, ghobject_t(), ghobject_t::get_max(), true,
+			     INT_MAX, &objects, 0);
   ASSERT_EQ(r, 0);
   ASSERT_EQ(objects.size(), num_objects);
   for (vector<ghobject_t>::iterator i = objects.begin();
        i != objects.end();
        ++i) {
-    ASSERT_EQ(!(i->hobj.get_hash() & (1<<common_suffix_size)), 0u);
+    ASSERT_EQ(!!(i->hobj.get_hash() & (1<<common_suffix_size)), 0u);
     t.remove(cid, *i);
   }
 
   objects.clear();
-  r = store->collection_list(tid, objects);
+  r = store->collection_list(tid, ghobject_t(), ghobject_t::get_max(), true,
+			     INT_MAX, &objects, 0);
   ASSERT_EQ(r, 0);
   ASSERT_EQ(objects.size(), num_objects);
   for (vector<ghobject_t>::iterator i = objects.begin();
        i != objects.end();
        ++i) {
-    ASSERT_EQ(i->hobj.get_hash() & (1<<common_suffix_size), 0u);
+    ASSERT_EQ(!(i->hobj.get_hash() & (1<<common_suffix_size)), 0u);
     t.remove(tid, *i);
   }
 
   t.remove_collection(cid);
   t.remove_collection(tid);
-  r = store->apply_transaction(t);
+  r = store->apply_transaction(&osr, t);
   ASSERT_EQ(r, 0);
 }
 
@@ -1511,34 +2320,37 @@ TEST_P(StoreTest, ColSplitTest3) {
  * stops at the common prefix subdir.  See bug
  * #5273 */
 TEST_P(StoreTest, TwoHash) {
-  coll_t cid("asdf");
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
-    r = store->apply_transaction(t);
+    t.create_collection(cid, 0);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   std::cout << "Making objects" << std::endl;
   for (int i = 0; i < 360; ++i) {
     ObjectStore::Transaction t;
     ghobject_t o;
+    o.hobj.pool = -1;
     if (i < 8) {
       o.hobj.set_hash((i << 16) | 0xA1);
       t.touch(cid, o);
     }
     o.hobj.set_hash((i << 16) | 0xB1);
     t.touch(cid, o);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   std::cout << "Removing half" << std::endl;
   for (int i = 1; i < 8; ++i) {
     ObjectStore::Transaction t;
     ghobject_t o;
+    o.hobj.pool = -1;
     o.hobj.set_hash((i << 16) | 0xA1);
     t.remove(cid, o);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   std::cout << "Checking" << std::endl;
@@ -1546,12 +2358,14 @@ TEST_P(StoreTest, TwoHash) {
     ObjectStore::Transaction t;
     ghobject_t o;
     o.hobj.set_hash((i << 16) | 0xA1);
+    o.hobj.pool = -1;
     bool exists = store->exists(cid, o);
     ASSERT_EQ(exists, false);
   }
   {
     ghobject_t o;
     o.hobj.set_hash(0xA1);
+    o.hobj.pool = -1;
     bool exists = store->exists(cid, o);
     ASSERT_EQ(exists, true);
   }
@@ -1560,29 +2374,30 @@ TEST_P(StoreTest, TwoHash) {
     ObjectStore::Transaction t;
     ghobject_t o;
     o.hobj.set_hash((i << 16) | 0xA1);
+    o.hobj.pool = -1;
     t.remove(cid, o);
     o.hobj.set_hash((i << 16) | 0xB1);
     t.remove(cid, o);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ObjectStore::Transaction t;
   t.remove_collection(cid);
-  r = store->apply_transaction(t);
+  r = store->apply_transaction(&osr, t);
   ASSERT_EQ(r, 0);
 }
 
 TEST_P(StoreTest, MoveRename) {
-  coll_t temp_cid("mytemp");
-  hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
-  coll_t cid("dest");
-  hobject_t oid("dest_oid", "", CEPH_NOSNAP, 0, 0, "");
+  ObjectStore::Sequencer osr("test");
+  coll_t cid(spg_t(pg_t(0, 212),shard_id_t::NO_SHARD));
+  ghobject_t temp_oid(hobject_t("tmp_oid", "", CEPH_NOSNAP, 0, 0, ""));
+  ghobject_t oid(hobject_t("dest_oid", "", CEPH_NOSNAP, 0, 0, ""));
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     t.touch(cid, oid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, oid));
@@ -1593,24 +2408,23 @@ TEST_P(StoreTest, MoveRename) {
   omap["omap_key"].append("omap value");
   {
     ObjectStore::Transaction t;
-    t.create_collection(temp_cid);
-    t.touch(temp_cid, temp_oid);
-    t.write(temp_cid, temp_oid, 0, data.length(), data);
-    t.setattr(temp_cid, temp_oid, "attr", attr);
-    t.omap_setkeys(temp_cid, temp_oid, omap);
-    r = store->apply_transaction(t);
+    t.touch(cid, temp_oid);
+    t.write(cid, temp_oid, 0, data.length(), data);
+    t.setattr(cid, temp_oid, "attr", attr);
+    t.omap_setkeys(cid, temp_oid, omap);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
-  ASSERT_TRUE(store->exists(temp_cid, temp_oid));
+  ASSERT_TRUE(store->exists(cid, temp_oid));
   {
     ObjectStore::Transaction t;
     t.remove(cid, oid);
-    t.collection_move_rename(temp_cid, temp_oid, cid, oid);
-    r = store->apply_transaction(t);
+    t.collection_move_rename(cid, temp_oid, cid, oid);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   ASSERT_TRUE(store->exists(cid, oid));
-  ASSERT_FALSE(store->exists(temp_cid, temp_oid));
+  ASSERT_FALSE(store->exists(cid, temp_oid));
   {
     bufferlist newdata;
     r = store->read(cid, oid, 0, 1000, newdata);
@@ -1618,7 +2432,7 @@ TEST_P(StoreTest, MoveRename) {
     ASSERT_TRUE(newdata.contents_equal(data));
     bufferlist newattr;
     r = store->getattr(cid, oid, "attr", newattr);
-    ASSERT_GE(r, 0);
+    ASSERT_EQ(r, 0);
     ASSERT_TRUE(newattr.contents_equal(attr));
     set<string> keys;
     keys.insert("omap_key");
@@ -1633,28 +2447,25 @@ TEST_P(StoreTest, MoveRename) {
     ObjectStore::Transaction t;
     t.remove(cid, oid);
     t.remove_collection(cid);
-    t.remove_collection(temp_cid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
 
 TEST_P(StoreTest, BigRGWObjectName) {
+  ObjectStore::Sequencer osr("test");
   store->set_allow_sharded_objects();
-  store->sync_and_flush();
-  coll_t temp_cid("mytemp");
-  hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
-  coll_t cid("dest");
+  coll_t cid(spg_t(pg_t(0,12),shard_id_t::NO_SHARD));
   ghobject_t oid(
     hobject_t(
       "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
       "",
       CEPH_NOSNAP,
       0x81920472,
-      3,
+      12,
       ""),
     15,
-    shard_id_t(1));
+    shard_id_t::NO_SHARD);
   ghobject_t oid2(oid);
   oid2.generation = 17;
   ghobject_t oidhead(oid);
@@ -1663,25 +2474,26 @@ TEST_P(StoreTest, BigRGWObjectName) {
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     t.touch(cid, oidhead);
     t.collection_move_rename(cid, oidhead, cid, oid);
     t.touch(cid, oidhead);
     t.collection_move_rename(cid, oidhead, cid, oid2);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
   {
     ObjectStore::Transaction t;
     t.remove(cid, oid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 
   {
     vector<ghobject_t> objects;
-    r = store->collection_list(cid, objects);
+    r = store->collection_list(cid, ghobject_t(), ghobject_t::get_max(), true,
+			       INT_MAX, &objects, 0);
     ASSERT_EQ(r, 0);
     ASSERT_EQ(objects.size(), 1u);
     ASSERT_EQ(objects[0], oid2);
@@ -1693,45 +2505,46 @@ TEST_P(StoreTest, BigRGWObjectName) {
     ObjectStore::Transaction t;
     t.remove(cid, oid2);
     t.remove_collection(cid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
 
   }
 }
 
 TEST_P(StoreTest, SetAllocHint) {
-  coll_t cid("alloc_hint");
+  ObjectStore::Sequencer osr("test");
+  coll_t cid;
   ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
   int r;
   {
     ObjectStore::Transaction t;
-    t.create_collection(cid);
+    t.create_collection(cid, 0);
     t.touch(cid, hoid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
   {
     ObjectStore::Transaction t;
     t.remove_collection(cid);
-    r = store->apply_transaction(t);
+    r = store->apply_transaction(&osr, t);
     ASSERT_EQ(r, 0);
   }
 }
@@ -1739,7 +2552,11 @@ TEST_P(StoreTest, SetAllocHint) {
 INSTANTIATE_TEST_CASE_P(
   ObjectStore,
   StoreTest,
-  ::testing::Values("memstore", "filestore", "keyvaluestore"));
+  ::testing::Values(
+    "memstore",
+    "filestore",
+    "keyvaluestore",
+    "newstore"));
 
 #else
 
@@ -1814,6 +2631,7 @@ TEST(EXT4StoreTest, _detect_fs) {
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args);
 
   global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
   common_init_finish(g_ceph_context);
@@ -1825,7 +2643,7 @@ int main(int argc, char **argv) {
   g_ceph_context->_conf->set_val("filestore_fiemap", "true");
   g_ceph_context->_conf->set_val(
     "enable_experimental_unrecoverable_data_corrupting_features",
-    "keyvaluestore");
+    "keyvaluestore, newstore, rocksdb");
   g_ceph_context->_conf->apply_changes(NULL);
 
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/objectstore/test_idempotent.cc b/src/test/objectstore/test_idempotent.cc
index 4cfb8d1..098bc81 100644
--- a/src/test/objectstore/test_idempotent.cc
+++ b/src/test/objectstore/test_idempotent.cc
@@ -68,14 +68,16 @@ int main(int argc, char **argv) {
   boost::scoped_ptr<KeyValueDB> db(_db);
   boost::scoped_ptr<ObjectStore> store(new FileStore(store_path, store_dev));
 
+  ObjectStore::Sequencer osr(__func__);
+  coll_t coll(spg_t(pg_t(0,12),shard_id_t::NO_SHARD));
 
   if (start_new) {
     std::cerr << "mkfs" << std::endl;
     assert(!store->mkfs());
     ObjectStore::Transaction t;
     assert(!store->mount());
-    t.create_collection(coll_t("coll"));
-    store->apply_transaction(t);
+    t.create_collection(coll, 0);
+    store->apply_transaction(&osr, t);
   } else {
     assert(!store->mount());
   }
@@ -86,7 +88,7 @@ int main(int argc, char **argv) {
   for (unsigned i = 0; i < 10; ++i) {
     stringstream stream;
     stream << "Object_" << i;
-    tracker.verify("coll", stream.str(), true);
+    tracker.verify(coll, stream.str(), true);
     objects.insert(stream.str());
   }
 
@@ -95,19 +97,19 @@ int main(int argc, char **argv) {
     for (unsigned j = 0; j < 100; ++j) {
       int val = rand() % 100;
       if (val < 30) {
-	t.write("coll", *rand_choose(objects));
+	t.write(coll, *rand_choose(objects));
       } else if (val < 60) {
-	t.clone("coll", *rand_choose(objects),
+	t.clone(coll, *rand_choose(objects),
 		*rand_choose(objects));
       } else if (val < 70) {
-	t.remove("coll", *rand_choose(objects));
+	t.remove(coll, *rand_choose(objects));
       } else {
-	t.clone_range("coll", *rand_choose(objects),
+	t.clone_range(coll, *rand_choose(objects),
 		      *rand_choose(objects));
       }
     }
     tracker.submit_transaction(t);
-    tracker.verify("coll", *rand_choose(objects));
+    tracker.verify(coll, *rand_choose(objects));
   }
   return 0;
 }
diff --git a/src/test/objectstore/test_idempotent_sequence.cc b/src/test/objectstore/test_idempotent_sequence.cc
index df502f8..95bf196 100644
--- a/src/test/objectstore/test_idempotent_sequence.cc
+++ b/src/test/objectstore/test_idempotent_sequence.cc
@@ -109,8 +109,8 @@ int run_get_last_op(std::string& filestore_path, std::string& journal_path)
     return err;
   }
 
-  coll_t txn_coll("meta");
-  hobject_t txn_object(sobject_t("txn", CEPH_NOSNAP));
+  coll_t txn_coll;
+  ghobject_t txn_object(hobject_t(sobject_t("txn", CEPH_NOSNAP)));
   bufferlist bl;
   store->read(txn_coll, txn_object, 0, 100, bl);
   int32_t txn = 0;
diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc
new file mode 100644
index 0000000..df3805b
--- /dev/null
+++ b/src/test/objectstore/test_kv.cc
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <time.h>
+#include <sys/mount.h>
+#include "os/KeyValueDB.h"
+#include "include/Context.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include <gtest/gtest.h>
+
+#if GTEST_HAS_PARAM_TEST
+
+class KVTest : public ::testing::TestWithParam<const char*> {
+public:
+  boost::scoped_ptr<KeyValueDB> db;
+
+  KVTest() : db(0) {}
+
+  void init() {
+    db.reset(KeyValueDB::create(g_ceph_context, string(GetParam()),
+				string("kv_test_temp_dir")));
+  }
+  void fini() {
+    db.reset(NULL);
+  }
+
+  virtual void SetUp() {
+    int r = ::mkdir("kv_test_temp_dir", 0777);
+    if (r < 0 && errno != EEXIST) {
+      r = -errno;
+      cerr << __func__ << ": unable to create kv_test_temp_dir"
+	   << ": " << cpp_strerror(r) << std::endl;
+      return;
+    }
+    init();
+  }
+  virtual void TearDown() {
+    fini();
+  }
+};
+
+TEST_P(KVTest, OpenClose) {
+  ASSERT_EQ(0, db->create_and_open(cout));
+  fini();
+}
+
+TEST_P(KVTest, OpenCloseReopenClose) {
+  ASSERT_EQ(0, db->create_and_open(cout));
+  fini();
+  init();
+  ASSERT_EQ(0, db->open(cout));
+  fini();
+}
+
+TEST_P(KVTest, PutReopen) {
+  ASSERT_EQ(0, db->create_and_open(cout));
+  {
+    KeyValueDB::Transaction t = db->get_transaction();
+    bufferlist value;
+    value.append("value");
+    t->set("prefix", "key", value);
+    t->set("prefix", "key2", value);
+    t->set("prefix", "key3", value);
+    db->submit_transaction_sync(t);
+  }
+  fini();
+
+  init();
+  ASSERT_EQ(0, db->open(cout));
+  {
+    bufferlist v;
+    ASSERT_EQ(0, db->get("prefix", "key", &v));
+    ASSERT_EQ(v.length(), 5u);
+    ASSERT_EQ(0, db->get("prefix", "key2", &v));
+    ASSERT_EQ(v.length(), 5u);
+  }
+  {
+    KeyValueDB::Transaction t = db->get_transaction();
+    t->rmkey("prefix", "key");
+    t->rmkey("prefix", "key3");
+    db->submit_transaction_sync(t);
+  }
+  fini();
+
+  init();
+  ASSERT_EQ(0, db->open(cout));
+  {
+    bufferlist v;
+    ASSERT_EQ(-ENOENT, db->get("prefix", "key", &v));
+    ASSERT_EQ(0, db->get("prefix", "key2", &v));
+    ASSERT_EQ(v.length(), 5u);
+    ASSERT_EQ(-ENOENT, db->get("prefix", "key3", &v));
+  }
+  fini();
+}
+
+TEST_P(KVTest, BenchCommit) {
+  int n = 1024;
+  ASSERT_EQ(0, db->create_and_open(cout));
+  utime_t start = ceph_clock_now(NULL);
+  {
+    cout << "priming" << std::endl;
+    // prime
+    bufferlist big;
+    bufferptr bp(1048576);
+    bp.zero();
+    big.append(bp);
+    for (int i=0; i<30; ++i) {
+      KeyValueDB::Transaction t = db->get_transaction();
+      t->set("prefix", "big" + stringify(i), big);
+      db->submit_transaction_sync(t);
+    }
+  }
+  cout << "now doing small writes" << std::endl;
+  bufferlist data;
+  bufferptr bp(1024);
+  bp.zero();
+  data.append(bp);
+  for (int i=0; i<n; ++i) {
+    KeyValueDB::Transaction t = db->get_transaction();
+    t->set("prefix", "key" + stringify(i), data);
+    db->submit_transaction_sync(t);
+  }
+  utime_t end = ceph_clock_now(NULL);
+  utime_t dur = end - start;
+  cout << n << " commits in " << dur << ", avg latency " << (dur / (double)n)
+       << std::endl;
+}
+
+
+INSTANTIATE_TEST_CASE_P(
+  KeyValueDB,
+  KVTest,
+  ::testing::Values("leveldb", "rocksdb"));
+
+#else
+
+// Google Test may not support value-parameterized tests with some
+// compilers. If we use conditional compilation to compile out all
+// code referring to the gtest_main library, MSVC linker will not link
+// that library at all and consequently complain about missing entry
+// point defined in that library (fatal error LNK1561: entry point
+// must be defined). This dummy test keeps gtest_main linked in.
+TEST(DummyTest, ValueParameterizedTestsAreNotSupportedOnThisPlatform) {}
+
+#endif
+
+int main(int argc, char **argv) {
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+  g_ceph_context->_conf->set_val(
+    "enable_experimental_unrecoverable_data_corrupting_features",
+    "rocksdb");
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/test/objectstore/workload_generator.cc b/src/test/objectstore/workload_generator.cc
index 4331d98..25047b5 100644
--- a/src/test/objectstore/workload_generator.cc
+++ b/src/test/objectstore/workload_generator.cc
@@ -268,7 +268,7 @@ void WorkloadGenerator::do_write_object(ObjectStore::Transaction *t,
   if (m_do_stats && (stat != NULL))
     stat->written_data += bl.length();
 
-  t->write(coll, obj, 0, bl.length(), bl);
+  t->write(coll, ghobject_t(obj), 0, bl.length(), bl);
 }
 
 void WorkloadGenerator::do_setattr_object(ObjectStore::Transaction *t,
@@ -292,7 +292,7 @@ void WorkloadGenerator::do_setattr_object(ObjectStore::Transaction *t,
   if (m_do_stats && (stat != NULL))
       stat->written_data += bl.length();
 
-  t->setattr(coll, obj, "objxattr", bl);
+  t->setattr(coll, ghobject_t(obj), "objxattr", bl);
 }
 
 void WorkloadGenerator::do_pgmeta_omap_set(ObjectStore::Transaction *t, spg_t pgid,
@@ -333,16 +333,16 @@ void WorkloadGenerator::do_append_log(ObjectStore::Transaction *t,
 
   bufferlist bl;
   get_filled_byte_array(bl, size);
-  hobject_t log_obj = entry->m_meta_obj;
+  ghobject_t log_obj = entry->m_meta_obj;
 
   dout(2) << __func__ << " coll " << entry->m_coll << " "
-      << META_COLL << " /" << log_obj << " (" << bl.length() << ")" << dendl;
+      << coll_t::meta() << " /" << log_obj << " (" << bl.length() << ")" << dendl;
 
   if (m_do_stats && (stat != NULL))
       stat->written_data += bl.length();
 
   uint64_t s = pg_log_size[entry->m_coll];
-  t->write(META_COLL, log_obj, s, bl.length(), bl);
+  t->write(coll_t::meta(), log_obj, s, bl.length(), bl);
   pg_log_size[entry->m_coll] += bl.length();
 }
 
@@ -353,7 +353,8 @@ void WorkloadGenerator::do_destroy_collection(ObjectStore::Transaction *t,
   m_nr_runs.set(0);
   entry->m_osr.flush();
   vector<ghobject_t> ls;
-  m_store->collection_list(entry->m_coll, ls);
+  m_store->collection_list(entry->m_coll, ghobject_t(), ghobject_t::get_max(),
+			   true, INT_MAX, &ls, NULL);
   dout(2) << __func__ << " coll " << entry->m_coll
       << " (" << ls.size() << " objects)" << dendl;
 
@@ -362,7 +363,7 @@ void WorkloadGenerator::do_destroy_collection(ObjectStore::Transaction *t,
   }
 
   t->remove_collection(entry->m_coll);
-  t->remove(META_COLL, entry->m_meta_obj);
+  t->remove(coll_t::meta(), entry->m_meta_obj);
 }
 
 TestObjectStoreState::coll_entry_t
@@ -378,9 +379,9 @@ TestObjectStoreState::coll_entry_t
   m_collections.insert(make_pair(entry->m_id, entry));
 
   dout(2) << __func__ << " id " << entry->m_id << " coll " << entry->m_coll << dendl;
-  t->create_collection(entry->m_coll);
-  dout(2) << __func__ << " meta " << META_COLL << "/" << entry->m_meta_obj << dendl;
-  t->touch(META_COLL, entry->m_meta_obj);
+  t->create_collection(entry->m_coll, 32);
+  dout(2) << __func__ << " meta " << coll_t::meta() << "/" << entry->m_meta_obj << dendl;
+  t->touch(coll_t::meta(), entry->m_meta_obj);
   return entry;
 }
 
diff --git a/src/test/objectstore_bench.cc b/src/test/objectstore_bench.cc
new file mode 100644
index 0000000..d5e9f98
--- /dev/null
+++ b/src/test/objectstore_bench.cc
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <chrono>
+#include <cassert>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#include "os/ObjectStore.h"
+
+#include "global/global_init.h"
+
+#include "common/strtol.h"
+#include "common/ceph_argparse.h"
+
+#define dout_subsys ceph_subsys_filestore
+
+static void usage()
+{
+  derr << "usage: ceph_objectstore_bench [flags]\n"
+      "	 --size\n"
+      "	       total size in bytes\n"
+      "	 --block-size\n"
+      "	       block size in bytes for each write\n"
+      "	 --repeats\n"
+      "	       number of times to repeat the write cycle\n"
+      "	 --threads\n"
+      "	       number of threads to carry out this workload\n"
+      "	 --multi-object\n"
+      "	       have each thread write to a separate object\n" << dendl;
+  generic_server_usage();
+}
+
+// helper class for bytes with units
+struct byte_units {
+  size_t v;
+  byte_units(size_t v) : v(v) {}
+
+  bool parse(const std::string &val, std::string *err);
+
+  operator size_t() const { return v; }
+};
+
+bool byte_units::parse(const std::string &val, std::string *err)
+{
+  v = strict_sistrtoll(val.c_str(), err);
+  return err->empty();
+}
+
+std::ostream& operator<<(std::ostream &out, const byte_units &amount)
+{
+  static const char* units[] = { "B", "KB", "MB", "GB", "TB", "PB", "EB" };
+  static const int max_units = sizeof(units)/sizeof(*units);
+
+  int unit = 0;
+  auto v = amount.v;
+  while (v >= 1024 && unit < max_units) {
+    // preserve significant bytes
+    if (v < 1048576 && (v % 1024 != 0))
+      break;
+    v >>= 10;
+    unit++;
+  }
+  return out << v << ' ' << units[unit];
+}
+
+struct Config {
+  byte_units size;
+  byte_units block_size;
+  int repeats;
+  int threads;
+  bool multi_object;
+  Config()
+    : size(1048576), block_size(4096),
+      repeats(1), threads(1),
+      multi_object(false) {}
+};
+
+class C_NotifyCond : public Context {
+  std::mutex *mutex;
+  std::condition_variable *cond;
+  bool *done;
+public:
+  C_NotifyCond(std::mutex *mutex, std::condition_variable *cond, bool *done)
+    : mutex(mutex), cond(cond), done(done) {}
+  void finish(int r) {
+    std::lock_guard<std::mutex> lock(*mutex);
+    *done = true;
+    cond->notify_one();
+  }
+};
+
+void osbench_worker(ObjectStore *os, const Config &cfg,
+                    const coll_t cid, const ghobject_t oid,
+                    uint64_t starting_offset)
+{
+  bufferlist data;
+  data.append(buffer::create(cfg.block_size));
+
+  dout(0) << "Writing " << cfg.size
+      << " in blocks of " << cfg.block_size << dendl;
+
+  assert(starting_offset < cfg.size);
+  assert(starting_offset % cfg.block_size == 0);
+
+  ObjectStore::Sequencer sequencer("osbench");
+
+  for (int i = 0; i < cfg.repeats; ++i) {
+    uint64_t offset = starting_offset;
+    size_t len = cfg.size;
+
+    list<ObjectStore::Transaction*> tls;
+
+    std::cout << "Write cycle " << i << std::endl;
+    while (len) {
+      size_t count = len < cfg.block_size ? len : (size_t)cfg.block_size;
+
+      auto t = new ObjectStore::Transaction;
+      t->write(cid, oid, offset, count, data);
+      tls.push_back(t);
+
+      offset += count;
+      if (offset > cfg.size)
+        offset -= cfg.size;
+      len -= count;
+    }
+
+    // set up the finisher
+    std::mutex mutex;
+    std::condition_variable cond;
+    bool done = false;
+
+    os->queue_transactions(&sequencer, tls, nullptr,
+                           new C_NotifyCond(&mutex, &cond, &done));
+
+    std::unique_lock<std::mutex> lock(mutex);
+    cond.wait(lock, [&done](){ return done; });
+    lock.unlock();
+
+    while (!tls.empty()) {
+      auto t = tls.front();
+      tls.pop_front();
+      delete t;
+    }
+  }
+}
+
+int main(int argc, const char *argv[])
+{
+  Config cfg;
+
+  // command-line arguments
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  env_to_vec(args);
+
+  global_init(nullptr, args, CEPH_ENTITY_TYPE_OSD, CODE_ENVIRONMENT_UTILITY, 0);
+
+  std::string val;
+  vector<const char*>::iterator i = args.begin();
+  while (i != args.end()) {
+    if (ceph_argparse_double_dash(args, i))
+      break;
+
+    if (ceph_argparse_witharg(args, i, &val, "--size", (char*)nullptr)) {
+      std::string err;
+      if (!cfg.size.parse(val, &err)) {
+        derr << "error parsing size: " << err << dendl;
+        usage();
+      }
+    } else if (ceph_argparse_witharg(args, i, &val, "--block-size", (char*)nullptr)) {
+      std::string err;
+      if (!cfg.block_size.parse(val, &err)) {
+        derr << "error parsing block-size: " << err << dendl;
+        usage();
+      }
+    } else if (ceph_argparse_witharg(args, i, &val, "--repeats", (char*)nullptr)) {
+      cfg.repeats = atoi(val.c_str());
+    } else if (ceph_argparse_witharg(args, i, &val, "--threads", (char*)nullptr)) {
+      cfg.threads = atoi(val.c_str());
+    } else if (ceph_argparse_flag(args, i, "--multi-object", (char*)nullptr)) {
+      cfg.multi_object = true;
+    } else {
+      derr << "Error: can't understand argument: " << *i << "\n" << dendl;
+      usage();
+    }
+  }
+
+  common_init_finish(g_ceph_context);
+
+  // create object store
+  dout(0) << "objectstore " << g_conf->osd_objectstore << dendl;
+  dout(0) << "data " << g_conf->osd_data << dendl;
+  dout(0) << "journal " << g_conf->osd_journal << dendl;
+  dout(0) << "size " << cfg.size << dendl;
+  dout(0) << "block-size " << cfg.block_size << dendl;
+  dout(0) << "repeats " << cfg.repeats << dendl;
+  dout(0) << "threads " << cfg.threads << dendl;
+
+  auto os = std::unique_ptr<ObjectStore>(
+      ObjectStore::create(g_ceph_context,
+                          g_conf->osd_objectstore,
+                          g_conf->osd_data,
+                          g_conf->osd_journal));
+  if (!os) {
+    derr << "bad objectstore type " << g_conf->osd_objectstore << dendl;
+    return 1;
+  }
+  if (os->mkfs() < 0) {
+    derr << "mkfs failed" << dendl;
+    return 1;
+  }
+  if (os->mount() < 0) {
+    derr << "mount failed" << dendl;
+    return 1;
+  }
+
+  dout(10) << "created objectstore " << os.get() << dendl;
+
+  // create a collection
+  spg_t pg;
+  const coll_t cid(pg);
+  {
+    ObjectStore::Sequencer osr(__func__);
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    os->apply_transaction(&osr, t);
+  }
+
+  // create the objects
+  std::vector<ghobject_t> oids;
+  if (cfg.multi_object) {
+    oids.reserve(cfg.threads);
+    for (int i = 0; i < cfg.threads; i++) {
+      std::stringstream oss;
+      oss << "osbench-thread-" << i;
+      oids.emplace_back(pg.make_temp_object(oss.str()));
+
+      ObjectStore::Sequencer osr(__func__);
+      ObjectStore::Transaction t;
+      t.touch(cid, oids[i]);
+      int r = os->apply_transaction(&osr, t);
+      assert(r == 0);
+    }
+  } else {
+    oids.emplace_back(pg.make_temp_object("osbench"));
+
+    ObjectStore::Sequencer osr(__func__);
+    ObjectStore::Transaction t;
+    t.touch(cid, oids.back());
+    int r = os->apply_transaction(&osr, t);
+    assert(r == 0);
+  }
+
+  // run the worker threads
+  std::vector<std::thread> workers;
+  workers.reserve(cfg.threads);
+
+  using namespace std::chrono;
+  auto t1 = high_resolution_clock::now();
+  for (int i = 0; i < cfg.threads; i++) {
+    const auto &oid = cfg.multi_object ? oids[i] : oids[0];
+    workers.emplace_back(osbench_worker, os.get(), std::ref(cfg),
+                         cid, oid, i * cfg.size / cfg.threads);
+  }
+  for (auto &worker : workers)
+    worker.join();
+  auto t2 = high_resolution_clock::now();
+  workers.clear();
+
+  auto duration = duration_cast<microseconds>(t2 - t1);
+  byte_units total = cfg.size * cfg.repeats * cfg.threads;
+  byte_units rate = (1000000LL * total) / duration.count();
+  size_t iops = (1000000LL * total / cfg.block_size) / duration.count();
+  dout(0) << "Wrote " << total << " in "
+      << duration.count() << "us, at a rate of " << rate << "/s and "
+      << iops << " iops" << dendl;
+
+  // remove the objects
+  ObjectStore::Sequencer osr(__func__);
+  ObjectStore::Transaction t;
+  for (const auto &oid : oids)
+    t.remove(cid, oid);
+  os->apply_transaction(&osr,t);
+
+  os->umount();
+  return 0;
+}
diff --git a/src/test/opensuse-13.2/Dockerfile.in b/src/test/opensuse-13.2/Dockerfile.in
new file mode 100644
index 0000000..85d91a2
--- /dev/null
+++ b/src/test/opensuse-13.2/Dockerfile.in
@@ -0,0 +1,30 @@
+#
+# Copyright (C) 2015 Red Hat <contact at redhat.com>
+#
+# Author: Loic Dachary <loic at dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+# Environment variables are substituted via envsubst(1)
+#
+# user_id=$(id -u)
+# os_version= the desired REPOSITORY TAG
+#
+FROM opensuse:%%os_version%%
+
+COPY install-deps.sh /root/
+COPY ceph.spec.in /root/
+# build dependencies
+RUN cd /root ; ./install-deps.sh
+# development tools
+# nc (ncat) is required to run make check on firefly only (giant+ do not use nc)
+RUN zypper --non-interactive install ccache valgrind gdb git python-virtualenv gdisk kpartx hdparm ncat sudo xmlstarlet parted
+RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
diff --git a/src/test/opensuse-13.2/ceph.spec.in b/src/test/opensuse-13.2/ceph.spec.in
new file mode 100644
index 0000000..8f2a6fc
--- /dev/null
+++ b/src/test/opensuse-13.2/ceph.spec.in
@@ -0,0 +1,1317 @@
+# vim: set noexpandtab ts=8 sw=8 :
+%bcond_with ocf
+%bcond_without cephfs_java
+%bcond_with tests
+%bcond_without tcmalloc
+%bcond_without libs_compat
+%bcond_with lowmem_builder
+%if 0%{?fedora} || 0%{?rhel}
+%bcond_without selinux
+%endif
+%if 0%{?suse_version}
+%bcond_with selinux
+%endif
+
+
+%if (0%{?el5} || (0%{?rhel_version} >= 500 && 0%{?rhel_version} <= 600))
+%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
+%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
+%endif
+
+%if %{with selinux}
+# get selinux policy version
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null || echo 0.0.0)}
+
+%define relabel_files() \
+restorecon -R /usr/bin/ceph-mon > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-osd > /dev/null 2>&1; \
+restorecon -R /usr/bin/ceph-mds > /dev/null 2>&1; \
+restorecon -R /usr/bin/radosgw > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/ceph > /dev/null 2>&1; \
+restorecon -R /etc/rc\.d/init\.d/radosgw > /dev/null 2>&1; \
+restorecon -R /var/run/ceph > /dev/null 2>&1; \
+restorecon -R /var/lib/ceph > /dev/null 2>&1; \
+restorecon -R /var/log/ceph > /dev/null 2>&1;
+%endif
+
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
+# Use systemd files on RHEL 7 and above and in SUSE/openSUSE.
+# Note: We don't install unit files for the services yet. For now,
+# the _with_systemd variable only implies that we'll install
+# /etc/tmpfiles.d/ceph.conf in order to set up the socket directory in
+# /var/run/ceph.
+%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210
+%global _with_systemd 1
+%endif
+
+# LTTng-UST enabled on Fedora, RHEL 6, and SLES 12
+%if 0%{?fedora} || 0%{?rhel} == 6 || 0%{?suse_version} == 1315
+%global _with_lttng 1
+%endif
+
+#################################################################################
+# common
+#################################################################################
+Name:		ceph
+Version:	@VERSION@
+Release:	@RPM_RELEASE@%{?dist}
+Epoch:		1
+Summary:	User space components of the Ceph file system
+License:	LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and GPL-2.0-with-autoconf-exception and BSD-3-Clause and MIT
+%if 0%{?suse_version}
+Group:         System/Filesystems
+%endif
+URL:		http://ceph.com/
+Source0:	http://ceph.com/download/%{name}-%{version}.tar.bz2
+%if 0%{?fedora} || 0%{?rhel}
+Patch0:		init-ceph.in-fedora.patch
+%endif
+#################################################################################
+# dependencies that apply across all distro families
+#################################################################################
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Requires:	python-rbd = %{epoch}:%{version}-%{release}
+Requires:	python-cephfs = %{epoch}:%{version}-%{release}
+Requires:	python
+Requires:	python-requests
+Requires:	grep
+Requires:	xfsprogs
+Requires:	logrotate
+Requires:	parted
+Requires:	util-linux
+Requires:	hdparm
+Requires:	cryptsetup
+Requires:	findutils
+Requires:	which
+Requires(post):	binutils
+%if 0%{with cephfs_java}
+BuildRequires:	java-devel
+BuildRequires:	sharutils
+%endif
+%if 0%{with selinux}
+BuildRequires:	checkpolicy
+BuildRequires:	selinux-policy-devel
+BuildRequires:	/usr/share/selinux/devel/policyhelp
+%endif
+BuildRequires:	gcc-c++
+BuildRequires:	boost-devel
+BuildRequires:  cmake
+BuildRequires:	cryptsetup
+BuildRequires:	fuse-devel
+BuildRequires:	gdbm
+BuildRequires:	hdparm
+BuildRequires:	leveldb-devel > 1.2
+BuildRequires:	libaio-devel
+BuildRequires:	libcurl-devel
+BuildRequires:	libedit-devel
+BuildRequires:	libxml2-devel
+BuildRequires:	libblkid-devel >= 2.17
+BuildRequires:	libudev-devel
+BuildRequires:	libtool
+BuildRequires:	make
+BuildRequires:	parted
+BuildRequires:	perl
+BuildRequires:	pkgconfig
+BuildRequires:	python
+BuildRequires:	python-nose
+BuildRequires:	python-requests
+BuildRequires:	python-virtualenv
+BuildRequires:	snappy-devel
+BuildRequires:	util-linux
+BuildRequires:	xfsprogs
+BuildRequires:	xfsprogs-devel
+BuildRequires:	xmlstarlet
+BuildRequires:	yasm
+
+#################################################################################
+# distro-conditional dependencies
+#################################################################################
+%if 0%{?suse_version}
+%if 0%{?_with_systemd}
+BuildRequires:  pkgconfig(systemd)
+BuildRequires:	systemd-rpm-macros
+%{?systemd_requires}
+%endif
+PreReq:		%fillup_prereq
+Requires:	python-Flask
+BuildRequires:	net-tools
+BuildRequires:	libbz2-devel
+%if 0%{?suse_version} > 1210
+Requires:	gptfdisk
+%if 0%{with tcmalloc}
+BuildRequires:	gperftools-devel
+%endif
+%else
+Requires:	scsirastools
+BuildRequires:	google-perftools-devel
+%endif
+BuildRequires:	mozilla-nss-devel
+BuildRequires:	keyutils-devel
+BuildRequires:	libatomic-ops-devel
+%else
+%if 0%{?_with_systemd}
+Requires:	systemd
+%endif
+BuildRequires:  bzip2-devel
+BuildRequires:	nss-devel
+BuildRequires:	keyutils-libs-devel
+BuildRequires:	libatomic_ops-devel
+Requires:	gdisk
+Requires(post):	chkconfig
+Requires(preun):	chkconfig
+Requires(preun):	initscripts
+BuildRequires:	gperftools-devel
+Requires:	python-flask
+%endif
+# boost
+%if 0%{?fedora} || 0%{?rhel} 
+BuildRequires:  boost-random
+%endif
+# python-argparse for distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+BuildRequires:	python-argparse
+%endif
+# lttng and babeltrace for rbd-replay-prep
+%if 0%{?_with_lttng}
+%if 0%{?fedora} || 0%{?rhel}
+BuildRequires:	lttng-ust-devel
+BuildRequires:	libbabeltrace-devel
+%endif
+%if 0%{?suse_version}
+BuildRequires:	lttng-ust-devel
+BuildRequires:  babeltrace-devel
+%endif
+%endif
+# expat and fastcgi for RGW
+%if 0%{?suse_version}
+BuildRequires:	libexpat-devel
+BuildRequires:	FastCGI-devel
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+BuildRequires:	expat-devel
+BuildRequires:	fcgi-devel
+%endif
+# python-sphinx
+%if 0%{?rhel} > 0 && 0%{?rhel} < 7
+BuildRequires:	python-sphinx10
+%endif
+%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} >= 7
+BuildRequires:	python-sphinx
+%endif
+
+%description
+Ceph is a massively scalable, open-source, distributed storage system that runs
+on commodity hardware and delivers object, block and file system storage.
+
+
+#################################################################################
+# packages
+#################################################################################
+%package -n ceph-common
+Summary:	Ceph Common
+Group:		System Environment/Base
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Requires:	python-rbd = %{epoch}:%{version}-%{release}
+Requires:	python-cephfs = %{epoch}:%{version}-%{release}
+Requires:	python-requests
+%if 0%{?_with_systemd}
+%{?systemd_requires}
+%endif
+%if 0%{?suse_version}
+Requires(pre):	pwdutils
+%endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires:	python-argparse
+%endif
+%description -n ceph-common
+Common utilities to mount and interact with a ceph storage cluster.
+
+%package fuse
+Summary:	Ceph fuse-based client
+Group:		System Environment/Base
+Requires:	%{name}
+%description fuse
+FUSE based client for Ceph distributed network file system
+
+%package -n rbd-fuse
+Summary:	Ceph fuse-based client
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+%description -n rbd-fuse
+FUSE based client to map Ceph rbd images to files
+
+%package radosgw
+Summary:	Rados REST gateway
+Group:		Development/Libraries
+Requires:	ceph-common = %{epoch}:%{version}-%{release}
+%if 0%{with selinux}
+Requires:	ceph-selinux = %{epoch}:%{version}-%{release}
+%endif
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+%if 0%{?rhel} || 0%{?fedora}
+Requires:	mailcap
+%endif
+%description radosgw
+This package is an S3 HTTP REST gateway for the RADOS object store. It
+is implemented as a FastCGI module using libfcgi, and can be used in
+conjunction with any FastCGI capable web server.
+
+%if %{with ocf}
+%package resource-agents
+Summary:	OCF-compliant resource agents for Ceph daemons
+Group:		System Environment/Base
+License:	LGPL-2.0
+Requires:	%{name} = %{epoch}:%{version}
+Requires:	resource-agents
+%description resource-agents
+Resource agents for monitoring and managing Ceph daemons
+under Open Cluster Framework (OCF) compliant resource
+managers such as Pacemaker.
+%endif
+
+%package -n librados2
+Summary:	RADOS distributed object store client library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+%if 0%{?rhel} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
+%endif
+%description -n librados2
+RADOS is a reliable, autonomic distributed object storage cluster
+developed as part of the Ceph distributed storage system. This is a
+shared library allowing applications to access the distributed object
+store using a simple file-like interface.
+
+%package -n librados2-devel
+Summary:	RADOS headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n librados2-devel
+This package contains libraries and headers needed to develop programs
+that use RADOS object store.
+
+%package -n python-rados
+Summary:	Python libraries for the RADOS object store
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-rados
+This package contains Python libraries for interacting with Cephs RADOS
+object store.
+
+%package -n libradosstriper1
+Summary:	RADOS striping interface
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+%description -n libradosstriper1
+Striping interface built on top of the rados library, allowing
+to stripe bigger objects onto several standard rados objects using
+an interface very similar to the rados one.
+
+%package -n libradosstriper1-devel
+Summary:	RADOS striping interface headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	libradosstriper1 = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libradosstriper1-devel
+This package contains libraries and headers needed to develop programs
+that use RADOS striping interface.
+
+%package -n librbd1
+Summary:	RADOS block device client library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+%if 0%{?rhel} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
+%endif
+%description -n librbd1
+RBD is a block device striped across multiple distributed objects in
+RADOS, a reliable, autonomic distributed object storage cluster
+developed as part of the Ceph distributed storage system. This is a
+shared library allowing applications to manage these block devices.
+
+%package -n librbd1-devel
+Summary:	RADOS block device headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n librbd1-devel
+This package contains libraries and headers needed to develop programs
+that use RADOS block device.
+
+%package -n python-rbd
+Summary:	Python libraries for the RADOS block device
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-rbd
+This package contains Python libraries for interacting with Cephs RADOS
+block device.
+
+%package -n libcephfs1
+Summary:	Ceph distributed file system client library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+%if 0%{?rhel} || 0%{?fedora}
+Obsoletes:	ceph-libs < %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-libcephfs
+%endif
+%description -n libcephfs1
+Ceph is a distributed network file system designed to provide excellent
+performance, reliability, and scalability. This is a shared library
+allowing applications to access a Ceph distributed file system via a
+POSIX-like interface.
+
+%package -n libcephfs1-devel
+Summary:	Ceph distributed file system headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libcephfs1-devel
+This package contains libraries and headers needed to develop programs
+that use Cephs distributed file system.
+
+%package -n python-cephfs
+Summary:	Python libraries for Ceph distributed file system
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Obsoletes:	python-ceph < %{epoch}:%{version}-%{release}
+%description -n python-cephfs
+This package contains Python libraries for interacting with Cephs distributed
+file system.
+
+%package -n ceph-test
+Summary:	Ceph benchmarks and test tools
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	ceph-common
+Requires:	xmlstarlet
+%description -n ceph-test
+This package contains Ceph benchmarks and test tools.
+
+%if 0%{with cephfs_java}
+
+%package -n libcephfs_jni1
+Summary:	Java Native Interface library for CephFS Java bindings
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	java
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+%description -n libcephfs_jni1
+This package contains the Java Native Interface library for CephFS Java
+bindings.
+
+%package -n libcephfs_jni1-devel
+Summary:	Development files for CephFS Java Native Interface library
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	java
+Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
+Obsoletes:	ceph-devel < %{epoch}:%{version}-%{release}
+%description -n libcephfs_jni1-devel
+This package contains the development files for CephFS Java Native Interface
+library.
+
+%package -n cephfs-java
+Summary:	Java libraries for the Ceph File System
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Requires:	java
+Requires:	libcephfs_jni1 = %{epoch}:%{version}-%{release}
+%if 0%{?el6}
+Requires:	junit4
+BuildRequires:	junit4
+%else
+Requires:       junit
+BuildRequires:  junit
+%endif
+%description -n cephfs-java
+This package contains the Java libraries for the Ceph File System.
+
+%endif
+
+%if 0%{with selinux}
+
+%package selinux
+Summary:	SELinux support for Ceph MON, OSD and MDS
+Group:		System Environment/Base
+Requires:	%{name}
+Requires:	policycoreutils, libselinux-utils
+Requires(post): selinux-policy-base >= %{_selinux_policy_version}, policycoreutils, gawk
+Requires(postun): policycoreutils
+%description selinux
+This package contains SELinux support for Ceph MON, OSD and MDS. The package
+also performs file-system relabelling which can take a long time on heavily
+populated file-systems.
+
+%endif
+
+%if 0%{with libs_compat}
+
+%package libs-compat
+Summary:	Meta package to include ceph libraries
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Obsoletes:	ceph-libs
+Requires:	librados2 = %{epoch}:%{version}-%{release}
+Requires:	librbd1 = %{epoch}:%{version}-%{release}
+Requires:	libcephfs1 = %{epoch}:%{version}-%{release}
+Provides:	ceph-libs
+
+%description libs-compat
+This is a meta package, that pulls in librados2, librbd1 and libcephfs1. It
+is included for backwards compatibility with distributions that depend on the
+former ceph-libs package, which is now split up into these three subpackages.
+Packages still depending on ceph-libs should be fixed to depend on librados2,
+librbd1 or libcephfs1 instead.
+
+%endif
+
+%package devel-compat
+Summary:	Compatibility package for Ceph headers
+Group:		Development/Libraries
+License:	LGPL-2.0
+Obsoletes:	ceph-devel
+Requires:	%{name} = %{epoch}:%{version}-%{release}
+Requires:	librados2-devel = %{epoch}:%{version}-%{release}
+Requires:	libradosstriper1-devel = %{epoch}:%{version}-%{release}
+Requires:	librbd1-devel = %{epoch}:%{version}-%{release}
+Requires:	libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
+Requires:	libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
+Provides:	ceph-devel
+%description devel-compat
+This is a compatibility package to accommodate ceph-devel split into
+librados2-devel, librbd1-devel and libcephfs1-devel. Packages still depending
+on ceph-devel should be fixed to depend on librados2-devel, librbd1-devel,
+libcephfs1-devel or libradosstriper1-devel instead.
+
+%package -n python-ceph-compat
+Summary:	Compatibility package for Cephs python libraries
+Group:		System Environment/Libraries
+License:	LGPL-2.0
+Obsoletes:	python-ceph
+Requires:	python-rados = %{epoch}:%{version}-%{release}
+Requires:	python-rbd = %{epoch}:%{version}-%{release}
+Requires:	python-cephfs = %{epoch}:%{version}-%{release}
+Provides:	python-ceph
+%description -n python-ceph-compat
+This is a compatibility package to accommodate python-ceph split into
+python-rados, python-rbd and python-cephfs. Packages still depending on
+python-ceph should be fixed to depend on python-rados, python-rbd or
+python-cephfs instead.
+
+#################################################################################
+# common
+#################################################################################
+%prep
+%setup -q
+%if 0%{?fedora} || 0%{?rhel}
+%patch0 -p1 -b .init
+%endif
+
+%build
+%if 0%{with cephfs_java}
+# Find jni.h
+for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
+    [ -d $i ] && java_inc="$java_inc -I$i"
+done
+%endif
+
+./autogen.sh
+
+%if %{with lowmem_builder}
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS --param ggc-min-expand=20 --param ggc-min-heapsize=32768"
+%endif
+export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
+
+%{configure}	CPPFLAGS="$java_inc" \
+		--prefix=/usr \
+		--localstatedir=/var \
+		--sysconfdir=/etc \
+%if 0%{?_with_systemd}
+		--with-systemdsystemunitdir=%_unitdir \
+%endif
+		--docdir=%{_docdir}/ceph \
+		--with-man-pages \
+		--mandir="%_mandir" \
+		--with-nss \
+		--without-cryptopp \
+		--with-debug \
+%if 0%{with cephfs_java}
+		--enable-cephfs-java \
+%endif
+%if 0%{with selinux}
+		--with-selinux \
+%endif
+		--with-librocksdb-static=check \
+%if 0%{?rhel} || 0%{?fedora}
+		--with-systemd-libexec-dir=/usr/libexec/ceph \
+		--with-rgw-user=root \
+		--with-rgw-group=root \
+%endif
+%if 0%{?suse_version}
+		--with-systemd-libexec-dir=/usr/lib/ceph/ \
+		--with-rgw-user=wwwrun \
+		--with-rgw-group=www \
+%endif
+		--with-radosgw \
+		$CEPH_EXTRA_CONFIGURE_ARGS \
+		%{?_with_ocf} \
+		%{?_with_tcmalloc} \
+		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
+
+
+make %{?_smp_mflags}
+
+
+%if 0%{with tests}
+%check
+# run in-tree unittests
+make %{?_smp_mflags} check-local
+
+%endif
+
+
+
+%install
+make DESTDIR=$RPM_BUILD_ROOT install
+find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
+find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
+install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
+install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+%if 0%{?fedora} || 0%{?rhel}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillup-templates/sysconfig.%{name}
+%endif
+%if 0%{?_with_systemd}
+  install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
+  install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
+  install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
+  install -m 0644 -D systemd/ceph-mds at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mds at .service
+  install -m 0644 -D systemd/ceph-radosgw at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-radosgw at .service
+  install -m 0644 -D systemd/ceph.target $RPM_BUILD_ROOT%{_unitdir}/ceph.target
+  install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
+  install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
+%else
+  install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
+  install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+  ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
+  ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+%endif
+mkdir -p $RPM_BUILD_ROOT%{_sbindir}
+install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
+chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
+chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
+
+# firewall templates
+%if 0%{?suse_version}
+install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-mon %{buildroot}%{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
+install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildroot}%{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
+%endif
+
+# udev rules
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+
+%if (0%{?rhel} && 0%{?rhel} < 7)
+install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
+%else
+install -m 0644 -D udev/95-ceph-osd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
+%endif
+
+%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version}
+mv $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/95-ceph-osd.rules
+mv $RPM_BUILD_ROOT/sbin/mount.ceph $RPM_BUILD_ROOT/usr/sbin/mount.ceph
+mv $RPM_BUILD_ROOT/sbin/mount.fuse.ceph $RPM_BUILD_ROOT/usr/sbin/mount.fuse.ceph
+%endif
+
+#set up placeholder directories
+mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph
+%if ! 0%{?_with_systemd}
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph
+%endif
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mon
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/osd
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/radosgw
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
+mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-rgw
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%pre
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    # service_add_pre and friends don't work with parameterized systemd service
+    # instances, only with single services or targets, so we always pass
+    # ceph.target to these macros
+    %service_add_pre ceph.target
+  %endif
+%endif
+
+
+%post
+/sbin/ldconfig
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %fillup_only
+    %service_add_post ceph.target
+  %endif
+%else
+  /sbin/chkconfig --add ceph
+%endif
+
+%preun
+%if 0%{?_with_systemd}
+  %if 0%{?suse_version}
+    %service_del_preun ceph.target
+  %endif
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%else
+  %if 0%{?rhel} || 0%{?fedora}
+    if [ $1 = 0 ] ; then
+      /sbin/service ceph stop >/dev/null 2>&1
+      /sbin/chkconfig --del ceph
+    fi
+  %endif
+%endif
+
+%postun
+/sbin/ldconfig
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-mon@|^ceph-create-keys@|^ceph-osd@|^ceph-mds@|^ceph-disk-'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
+%endif
+
+#################################################################################
+# files
+#################################################################################
+%files
+%defattr(-,root,root,-)
+%docdir %{_docdir}
+%dir %{_docdir}/ceph
+%{_docdir}/ceph/sample.ceph.conf
+%{_docdir}/ceph/sample.fetch_config
+%{_bindir}/cephfs
+%{_bindir}/ceph-clsinfo
+%{_bindir}/ceph-rest-api
+%{python_sitelib}/ceph_rest_api.py*
+%{_bindir}/crushtool
+%{_bindir}/monmaptool
+%{_bindir}/osdmaptool
+%{_bindir}/ceph-run
+%{_bindir}/ceph-mon
+%{_bindir}/ceph-mds
+%{_bindir}/ceph-objectstore-tool
+%{_bindir}/ceph-osd
+%{_bindir}/ceph-detect-init
+%{_bindir}/librados-config
+%{_bindir}/ceph-client-debug
+%{_bindir}/cephfs-journal-tool
+%{_bindir}/cephfs-table-tool
+%{_bindir}/cephfs-data-scan
+%{_bindir}/ceph-debugpack
+%{_bindir}/ceph-coverage
+%if 0%{?_with_systemd}
+%{_unitdir}/ceph-mds at .service
+%{_unitdir}/ceph-mon at .service
+%{_unitdir}/ceph-create-keys at .service
+%{_unitdir}/ceph-osd at .service
+%{_unitdir}/ceph-radosgw at .service
+%{_unitdir}/ceph-disk at .service
+%{_unitdir}/ceph.target
+%else
+%{_initrddir}/ceph
+%endif
+%{_sbindir}/ceph-disk
+%{_sbindir}/ceph-disk-udev
+%{_sbindir}/ceph-create-keys
+%{_sbindir}/rcceph
+%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version}
+%{_sbindir}/mount.ceph
+%else
+/sbin/mount.ceph
+%endif
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph_common.sh
+%{_libexecdir}/ceph/ceph-osd-prestart.sh
+%dir %{_libdir}/rados-classes
+%{_libdir}/rados-classes/libcls_cephfs.so*
+%{_libdir}/rados-classes/libcls_rbd.so*
+%{_libdir}/rados-classes/libcls_hello.so*
+%{_libdir}/rados-classes/libcls_numops.so*
+%{_libdir}/rados-classes/libcls_rgw.so*
+%{_libdir}/rados-classes/libcls_lock.so*
+%{_libdir}/rados-classes/libcls_kvs.so*
+%{_libdir}/rados-classes/libcls_refcount.so*
+%{_libdir}/rados-classes/libcls_log.so*
+%{_libdir}/rados-classes/libcls_replica_log.so*
+%{_libdir}/rados-classes/libcls_statelog.so*
+%{_libdir}/rados-classes/libcls_timeindex.so*
+%{_libdir}/rados-classes/libcls_user.so*
+%{_libdir}/rados-classes/libcls_version.so*
+%dir %{_libdir}/ceph/erasure-code
+%{_libdir}/ceph/erasure-code/libec_*.so*
+%if 0%{?_with_lttng}
+%{_libdir}/libos_tp.so*
+%{_libdir}/libosd_tp.so*
+%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
+%config %{_sysconfdir}/bash_completion.d/ceph
+%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
+%if 0%{?fedora} || 0%{?rhel}
+%config(noreplace) %{_sysconfdir}/sysconfig/ceph
+%endif
+%if 0%{?suse_version}
+%{_localstatedir}/adm/fillup-templates/sysconfig.*
+%config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-mon
+%config %{_sysconfdir}/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds
+%endif
+%{python_sitelib}/ceph_detect_init*
+%{_mandir}/man8/ceph-deploy.8*
+%{_mandir}/man8/ceph-detect-init.8*
+%{_mandir}/man8/ceph-disk.8*
+%{_mandir}/man8/ceph-create-keys.8*
+%{_mandir}/man8/ceph-mon.8*
+%{_mandir}/man8/ceph-mds.8*
+%{_mandir}/man8/ceph-osd.8*
+%{_mandir}/man8/ceph-run.8*
+%{_mandir}/man8/ceph-rest-api.8*
+%{_mandir}/man8/crushtool.8*
+%{_mandir}/man8/osdmaptool.8*
+%{_mandir}/man8/monmaptool.8*
+%{_mandir}/man8/cephfs.8*
+%{_mandir}/man8/mount.ceph.8*
+%{_mandir}/man8/ceph-debugpack.8*
+%{_mandir}/man8/ceph-clsinfo.8*
+%{_mandir}/man8/librados-config.8*
+#set up placeholder directories
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mon
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw
+%if ! 0%{?_with_systemd}
+%attr(770,ceph,ceph) %dir %{_localstatedir}/run/ceph
+%endif
+
+#################################################################################
+%files -n ceph-common
+%defattr(-,root,root,-)
+%{_bindir}/ceph
+%{_bindir}/ceph-authtool
+%{_bindir}/ceph-conf
+%{_bindir}/ceph-dencoder
+%{_bindir}/ceph-rbdnamer
+%{_bindir}/ceph-syn
+%{_bindir}/ceph-crush-location
+%{_bindir}/rados
+%{_bindir}/rbd
+%{_bindir}/rbd-replay
+%{_bindir}/rbd-replay-many
+%if 0%{?_with_lttng}
+%{_bindir}/rbd-replay-prep
+%endif
+%{_bindir}/ceph-post-file
+%{_bindir}/ceph-brag
+%if 0%{?_with_systemd}
+%{_tmpfilesdir}/ceph-common.conf
+%endif
+%{_mandir}/man8/ceph-authtool.8*
+%{_mandir}/man8/ceph-conf.8*
+%{_mandir}/man8/ceph-dencoder.8*
+%{_mandir}/man8/ceph-rbdnamer.8*
+%{_mandir}/man8/ceph-syn.8*
+%{_mandir}/man8/ceph-post-file.8*
+%{_mandir}/man8/ceph.8*
+%{_mandir}/man8/rados.8*
+%{_mandir}/man8/rbd.8*
+%{_mandir}/man8/rbd-replay.8*
+%{_mandir}/man8/rbd-replay-many.8*
+%{_mandir}/man8/rbd-replay-prep.8*
+%{_datadir}/ceph/known_hosts_drop.ceph.com
+%{_datadir}/ceph/id_dsa_drop.ceph.com
+%{_datadir}/ceph/id_dsa_drop.ceph.com.pub
+%dir %{_sysconfdir}/ceph/
+%dir %{_datarootdir}/ceph/
+%dir %{_libexecdir}/ceph/
+%config %{_sysconfdir}/bash_completion.d/rados
+%config %{_sysconfdir}/bash_completion.d/rbd
+%config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%{_initrddir}/rbdmap
+%{python_sitelib}/ceph_argparse.py*
+%{python_sitelib}/ceph_daemon.py*
+%{_udevrulesdir}/50-rbd.rules
+%attr(3770,ceph,ceph) %dir %{_localstatedir}/log/ceph/
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/
+
+%pre -n ceph-common
+CEPH_GROUP_ID=""
+CEPH_USER_ID=""
+%if 0%{?rhel} || 0%{?fedora}
+CEPH_GROUP_ID="-g 167"
+CEPH_USER_ID="-u 167"
+%endif
+%if 0%{?rhel} || 0%{?fedora}
+%{_sbindir}/groupadd ceph $CEPH_GROUP_ID -o -r 2>/dev/null || :
+%{_sbindir}/useradd ceph $CEPH_USER_ID -o -r -g ceph -s /sbin/nologin -c "Ceph daemons" -d %{_localstatedir}/lib/ceph 2> /dev/null || :
+%endif
+%if 0%{?suse_version}
+getent group ceph >/dev/null || groupadd -r ceph
+getent passwd ceph >/dev/null || useradd -r -g ceph -d %{_localstatedir}/lib/ceph -s /sbin/nologin -c "Ceph daemons" ceph
+%endif
+exit 0
+
+%post -n ceph-common
+%if 0%{?_with_systemd}
+systemd-tmpfiles --create --prefix=/run/ceph
+%endif
+
+%postun -n ceph-common
+# Package removal cleanup
+if [ "$1" -eq "0" ] ; then
+    rm -rf /var/log/ceph
+    rm -rf /etc/ceph
+fi
+
+#################################################################################
+%files fuse
+%defattr(-,root,root,-)
+%{_bindir}/ceph-fuse
+%{_mandir}/man8/ceph-fuse.8*
+%if 0%{?rhel} >= 7 || 0%{?fedora} || 0%{?suse_version}
+%{_sbindir}/mount.fuse.ceph
+%else
+/sbin/mount.fuse.ceph
+%endif
+
+#################################################################################
+%files -n rbd-fuse
+%defattr(-,root,root,-)
+%{_bindir}/rbd-fuse
+%{_mandir}/man8/rbd-fuse.8*
+
+#################################################################################
+%files radosgw
+%defattr(-,root,root,-)
+%{_bindir}/radosgw
+%{_bindir}/radosgw-admin
+%{_bindir}/radosgw-object-expirer
+%{_mandir}/man8/radosgw.8*
+%{_mandir}/man8/radosgw-admin.8*
+%config %{_sysconfdir}/bash_completion.d/radosgw-admin
+%dir %{_localstatedir}/lib/ceph/radosgw
+%if 0%{?_with_systemd}
+%else
+%{_initrddir}/ceph-radosgw
+%{_sbindir}/rcceph-radosgw
+%endif
+
+%post radosgw
+/sbin/ldconfig
+%if 0%{?suse_version}
+  # explicit systemctl daemon-reload (that's the only relevant bit of
+  # service_add_post; the rest is all sysvinit --> systemd migration which
+  # isn't applicable in this context (see above comment).
+  /usr/bin/systemctl daemon-reload >/dev/null 2>&1 || :
+%endif
+
+%preun radosgw
+%if 0%{?_with_systemd}
+  # Disable and stop on removal.
+  if [ $1 = 0 ] ; then
+    SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+    if [ -n "$SERVICE_LIST" ]; then
+      for SERVICE in $SERVICE_LIST; do
+        /usr/bin/systemctl --no-reload disable $SERVICE > /dev/null 2>&1 || :
+        /usr/bin/systemctl stop $SERVICE > /dev/null 2>&1 || :
+      done
+    fi
+  fi
+%endif
+
+%postun radosgw
+/sbin/ldconfig
+%if 0%{?_with_systemd}
+  if [ $1 = 1 ] ; then
+    # Restart on upgrade, but only if "CEPH_AUTO_RESTART_ON_UPGRADE" is set to
+    # "yes". In any case: if units are not running, do not touch them.
+    SYSCONF_CEPH=/etc/sysconfig/ceph
+    if [ -f $SYSCONF_CEPH -a -r $SYSCONF_CEPH ] ; then
+      source $SYSCONF_CEPH
+    fi
+    if [ "X$CEPH_AUTO_RESTART_ON_UPGRADE" = "Xyes" ] ; then
+      SERVICE_LIST=$(systemctl | grep -E '^ceph-radosgw@'  | cut -d' ' -f1)
+      if [ -n "$SERVICE_LIST" ]; then
+        for SERVICE in $SERVICE_LIST; do
+          /usr/bin/systemctl try-restart $SERVICE > /dev/null 2>&1 || :
+        done
+      fi
+    fi
+  fi
+%endif
+
+#################################################################################
+%if %{with ocf}
+%files resource-agents
+%defattr(0755,root,root,-)
+%dir /usr/lib/ocf
+%dir /usr/lib/ocf/resource.d
+%dir /usr/lib/ocf/resource.d/ceph
+/usr/lib/ocf/resource.d/%{name}/*
+%endif
+
+#################################################################################
+%files -n librados2
+%defattr(-,root,root,-)
+%{_libdir}/librados.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so.*
+%endif
+
+%post -n librados2
+/sbin/ldconfig
+
+%postun -n librados2
+/sbin/ldconfig
+
+#################################################################################
+%files -n librados2-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/rados
+%{_includedir}/rados/librados.h
+%{_includedir}/rados/librados.hpp
+%{_includedir}/rados/buffer.h
+%{_includedir}/rados/page.h
+%{_includedir}/rados/crc32c.h
+%{_includedir}/rados/rados_types.h
+%{_includedir}/rados/rados_types.hpp
+%{_includedir}/rados/memory.h
+%{_libdir}/librados.so
+%if 0%{?_with_lttng}
+%{_libdir}/librados_tp.so
+%endif
+
+#################################################################################
+%files -n python-rados
+%defattr(-,root,root,-)
+%{python_sitelib}/rados.py*
+
+#################################################################################
+%files -n libradosstriper1
+%defattr(-,root,root,-)
+%{_libdir}/libradosstriper.so.*
+
+%post -n libradosstriper1
+/sbin/ldconfig
+
+%postun -n libradosstriper1
+/sbin/ldconfig
+
+#################################################################################
+%files -n libradosstriper1-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/radosstriper
+%{_includedir}/radosstriper/libradosstriper.h
+%{_includedir}/radosstriper/libradosstriper.hpp
+%{_libdir}/libradosstriper.so
+
+#################################################################################
+%files -n librbd1
+%defattr(-,root,root,-)
+%{_libdir}/librbd.so.*
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so.*
+%endif
+
+%post -n librbd1
+/sbin/ldconfig
+mkdir -p /usr/lib64/qemu/
+ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
+
+%postun -n librbd1
+/sbin/ldconfig
+
+#################################################################################
+%files -n librbd1-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/rbd
+%{_includedir}/rbd/librbd.h
+%{_includedir}/rbd/librbd.hpp
+%{_includedir}/rbd/features.h
+%{_libdir}/librbd.so
+%if 0%{?_with_lttng}
+%{_libdir}/librbd_tp.so
+%endif
+
+#################################################################################
+%files -n python-rbd
+%defattr(-,root,root,-)
+%{python_sitelib}/rbd.py*
+
+#################################################################################
+%files -n libcephfs1
+%defattr(-,root,root,-)
+%{_libdir}/libcephfs.so.*
+
+%post -n libcephfs1
+/sbin/ldconfig
+
+%postun -n libcephfs1
+/sbin/ldconfig
+
+#################################################################################
+%files -n libcephfs1-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/cephfs
+%{_includedir}/cephfs/libcephfs.h
+%{_libdir}/libcephfs.so
+
+#################################################################################
+%files -n python-cephfs
+%defattr(-,root,root,-)
+%{python_sitelib}/cephfs.py*
+
+#################################################################################
+%files -n ceph-test
+%defattr(-,root,root,-)
+%{_bindir}/ceph_bench_log
+%{_bindir}/ceph_kvstorebench
+%{_bindir}/ceph_multi_stress_watch
+%{_bindir}/ceph_erasure_code
+%{_bindir}/ceph_erasure_code_benchmark
+%{_bindir}/ceph_omapbench
+%{_bindir}/ceph_objectstore_bench
+%{_bindir}/ceph_perf_objectstore
+%{_bindir}/ceph_perf_local
+%{_bindir}/ceph_perf_msgr_client
+%{_bindir}/ceph_perf_msgr_server
+%{_bindir}/ceph_psim
+%{_bindir}/ceph_radosacl
+%{_bindir}/ceph_rgw_jsonparser
+%{_bindir}/ceph_rgw_multiparser
+%{_bindir}/ceph_scratchtool
+%{_bindir}/ceph_scratchtoolpp
+%{_bindir}/ceph_smalliobench
+%{_bindir}/ceph_smalliobenchdumb
+%{_bindir}/ceph_smalliobenchfs
+%{_bindir}/ceph_smalliobenchrbd
+%{_bindir}/ceph_streamtest
+%{_bindir}/ceph_test_*
+%{_bindir}/ceph_tpbench
+%{_bindir}/ceph_xattr_bench
+%{_bindir}/ceph-monstore-tool
+%{_bindir}/ceph-osdomap-tool
+%{_bindir}/ceph-kvstore-tool
+%dir %{_libdir}/ceph
+%{_libdir}/ceph/ceph-monstore-update-crush.sh
+
+#################################################################################
+%if 0%{with cephfs_java}
+%files -n libcephfs_jni1
+%defattr(-,root,root,-)
+%{_libdir}/libcephfs_jni.so.*
+
+%post -n libcephfs_jni1
+/sbin/ldconfig
+
+%postun -n libcephfs_jni1
+/sbin/ldconfig
+
+#################################################################################
+%files -n libcephfs_jni1-devel
+%defattr(-,root,root,-)
+%{_libdir}/libcephfs_jni.so
+
+#################################################################################
+%files -n cephfs-java
+%defattr(-,root,root,-)
+%{_javadir}/libcephfs.jar
+%{_javadir}/libcephfs-test.jar
+%endif
+
+#################################################################################
+%if 0%{with selinux}
+%files selinux
+%defattr(-,root,root,-)
+%attr(0600,root,root) %{_datadir}/selinux/packages/ceph.pp
+%{_datadir}/selinux/devel/include/contrib/ceph.if
+%{_mandir}/man8/ceph_selinux.8*
+
+%post selinux
+# Install the policy
+OLD_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+%{_sbindir}/semodule -n -i %{_datadir}/selinux/packages/ceph.pp
+NEW_POLVER=$(%{_sbindir}/semodule -l | grep -P '^ceph[\t ]' | awk '{print $2}')
+
+# Load the policy if SELinux is enabled
+if %{_sbindir}/selinuxenabled; then
+    %{_sbindir}/load_policy
+else
+    # Do not relabel if selinux is not enabled
+    exit 0
+fi
+
+if test "$OLD_POLVER" == "$NEW_POLVER"; then
+   # Do not relabel if policy version did not change
+   exit 0
+fi
+
+# Check whether the daemons are running
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph status >/dev/null 2>&1
+%endif
+STATUS=$?
+
+# Stop the daemons if they were running
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+%else
+    /sbin/service ceph stop >/dev/null 2>&1
+%endif
+fi
+
+# Now, relabel the files
+%relabel_files
+
+# Start the daemons iff they were running before
+if test $STATUS -eq 0; then
+%if 0%{?_with_systemd}
+    /usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+%else
+    /sbin/service ceph start >/dev/null 2>&1 || :
+%endif
+fi
+
+exit 0
+
+%postun selinux
+if [ $1 -eq 0 ]; then
+    # Remove the module
+    %{_sbindir}/semodule -n -r ceph
+
+    # Reload the policy if SELinux is enabled
+    if %{_sbindir}/selinuxenabled ; then
+        %{_sbindir}/load_policy
+    else
+        # Do not relabel if SELinux is not enabled
+        exit 0
+    fi
+
+    # Check whether the daemons are running
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl status ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph status >/dev/null 2>&1
+    %endif
+    STATUS=$?
+
+    # Stop the daemons if they were running
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+        /usr/bin/systemctl stop ceph.target > /dev/null 2>&1
+    %else
+        /sbin/service ceph stop >/dev/null 2>&1
+    %endif
+    fi
+
+    # Now, relabel the files
+    %relabel_files
+
+    # Start the daemons if they were running before
+    if test $STATUS -eq 0; then
+    %if 0%{?_with_systemd}
+	/usr/bin/systemctl start ceph.target > /dev/null 2>&1 || :
+    %else
+	/sbin/service ceph start >/dev/null 2>&1 || :
+    %endif
+    fi
+fi
+exit 0
+
+%endif # with selinux
+
+#################################################################################
+%if 0%{with libs_compat}
+%files libs-compat
+# We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
+# build this meta package.
+
+#################################################################################
+%files devel-compat
+# We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
+# actually build this meta package.
+%endif
+
+#################################################################################
+%files -n python-ceph-compat
+# We need an empty %%files list for python-ceph-compat, to tell rpmbuild to
+# actually build this meta package.
+
+%changelog
diff --git a/src/test/opensuse-13.2/install-deps.sh b/src/test/opensuse-13.2/install-deps.sh
new file mode 100755
index 0000000..1bebf09
--- /dev/null
+++ b/src/test/opensuse-13.2/install-deps.sh
@@ -0,0 +1,147 @@
+#!/bin/bash -e
+#
+# Ceph distributed storage system
+#
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
+#
+# Author: Loic Dachary <loic at dachary.org>
+#
+#  This library is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+DIR=/tmp/install-deps.$$
+trap "rm -fr $DIR" EXIT
+mkdir -p $DIR
+if test $(id -u) != 0 ; then
+    SUDO=sudo
+fi
+export LC_ALL=C # the following is vulnerable to i18n
+
+if test -f /etc/redhat-release ; then
+    $SUDO yum install -y redhat-lsb-core
+fi
+
+if type apt-get > /dev/null 2>&1 ; then
+    $SUDO apt-get install -y lsb-release
+fi
+
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
+case $(lsb_release -si) in
+Ubuntu|Debian|Devuan)
+        $SUDO apt-get install -y dpkg-dev
+        if ! test -r debian/control ; then
+            echo debian/control is not a readable file
+            exit 1
+        fi
+        touch $DIR/status
+        packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
+            perl -p -e 's/.*Unmet build dependencies: *//;' \
+            -e 's/build-essential:native/build-essential/;' \
+            -e 's/\s*\|\s*/\|/g;' \
+            -e 's/\(.*?\)//g;' \
+            -e 's/ +/\n/g;' | sort)
+        case $(lsb_release -sc) in
+            squeeze|wheezy)
+                packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
+                ;;
+        esac
+        packages=$(echo $packages) # change newlines into spaces
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
+        ;;
+CentOS|Fedora|RedHatEnterpriseServer)
+        case $(lsb_release -si) in
+            Fedora)
+                $SUDO yum install -y yum-utils
+                ;;
+            CentOS|RedHatEnterpriseServer)
+                $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                ;;
+        esac
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
+        ;;
+*)
+        echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
+        ;;
+esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/src/test/os/TestFlatIndex.cc b/src/test/os/TestFlatIndex.cc
deleted file mode 100644
index 797aa9d..0000000
--- a/src/test/os/TestFlatIndex.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 Cloudwatt <libre.licensing at cloudwatt.com>
- *
- * Author: Loic Dachary <loic at dachary.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Library Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Library Public License for more details.
- *
- */
-
-#include <stdio.h>
-#include <signal.h>
-#include "os/FlatIndex.h"
-#include "os/CollectionIndex.h"
-#include "os/chain_xattr.h"
-#include "common/ceph_argparse.h"
-#include "global/global_init.h"
-#include <gtest/gtest.h>
-
-TEST(FlatIndex, FlatIndex) {
-  coll_t collection("ABC");
-  const std::string base_path("PATH");
-  FlatIndex index(collection, base_path);
-  EXPECT_EQ(collection, index.coll());
-  EXPECT_EQ((unsigned)0, index.collection_version());
-  //
-  // checking placeholders
-  //
-  EXPECT_EQ(0, index.init());
-  EXPECT_EQ(0, index.cleanup());
-}
-
-TEST(FlatIndex, collection) {
-  coll_t collection("ABC");
-  const std::string base_path("PATH");
-  FlatIndex index(collection, base_path);
-  const std::string key("KEY");
-  uint64_t hash = 111;
-  uint64_t pool = 222;
-  const std::string object_name(10, 'A');
-  ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
-  vector<ghobject_t> ls;
-  ASSERT_THROW(index.collection_list_partial(hoid, 0, 0, 0, &ls, &hoid), FailedAssertion);
-}
-
-TEST(FlatIndex, created_unlink) {
-  coll_t collection("ABC");
-  const std::string base_path("PATH");
-  EXPECT_EQ(0, ::system("rm -fr PATH"));
-  EXPECT_EQ(0, ::mkdir("PATH", 0700));
-  ceph::shared_ptr<CollectionIndex> index(new FlatIndex(collection, base_path));
-  const std::string key("KEY");
-  uint64_t hash = 111;
-  uint64_t pool = 222;
-  //
-  // short object name
-  //
-  {
-    CollectionIndex::IndexedPath indexed_path;
-    const std::string object_name(10, 'A');
-    ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
-    int exists;
-    EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
-    EXPECT_EQ(0, exists);
-    EXPECT_EQ(0, ::close(::creat(indexed_path->path(), 0600)));
-    EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
-    EXPECT_EQ(1, exists);
-    EXPECT_EQ(0, index->unlink(hoid));
-    EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
-    EXPECT_EQ(0, exists);
-  }
-  //
-  // long object name
-  //
-  {
-    CollectionIndex::IndexedPath indexed_path;
-    const std::string object_name(1024, 'A');
-    ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
-    int exists;
-    EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
-    EXPECT_EQ(0, exists);
-    EXPECT_EQ(0, ::close(::creat(indexed_path->path(), 0600)));
-    EXPECT_EQ(0, index->created(hoid, indexed_path->path()));
-    EXPECT_EQ(0, index->unlink(hoid));
-    EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
-    EXPECT_EQ(0, exists);
-  }
-  EXPECT_EQ(0, ::system("rm -fr PATH"));
-}
-
-TEST(FlatIndex, collection_list) {
-  coll_t collection("ABC");
-  const std::string base_path("PATH");
-  EXPECT_EQ(0, ::system("rm -fr PATH"));
-  EXPECT_EQ(0, ::mkdir("PATH", 0700));
-  const std::string object_name("ABC");
-  const std::string filename("PATH/" + object_name + "_head");
-  EXPECT_EQ(0, ::close(::creat(filename.c_str(), 0600)));
-  ceph::shared_ptr<CollectionIndex> index(new FlatIndex(collection, base_path));
-  vector<ghobject_t> ls;
-  index->collection_list(&ls);
-  EXPECT_EQ((unsigned)1, ls.size());
-  EXPECT_EQ(object_name, ls[0].hobj.oid.name);
-  EXPECT_EQ(0, ::system("rm -fr PATH"));
-}
-
-int main(int argc, char **argv) {
-  int fd = ::creat("detect", 0600);
-  int ret = chain_fsetxattr(fd, "user.test", "A", 1);
-  ::close(fd);
-  ::unlink("detect");
-  if (ret < 0) {
-    cerr << "SKIP FlatIndex because unable to test for xattr" << std::endl;
-  } else {
-    vector<const char*> args;
-    argv_to_vec(argc, (const char **)argv, args);
-
-    global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
-    common_init_finish(g_ceph_context);
-
-    ::testing::InitGoogleTest(&argc, argv);
-    return RUN_ALL_TESTS();
-  }
-}
-
-// Local Variables:
-// compile-command: "cd ../.. ; make unittest_flatindex ; ./unittest_flatindex # --gtest_filter=FlatIndexTest.FlatIndex --log-to-stderr=true --debug-filestore=20"
-// End:
diff --git a/src/test/os/TestLFNIndex.cc b/src/test/os/TestLFNIndex.cc
index e915005..5e44355 100644
--- a/src/test/os/TestLFNIndex.cc
+++ b/src/test/os/TestLFNIndex.cc
@@ -75,15 +75,11 @@ protected:
 		      int *exists		 
 		      ) { return 0; }
 
-  virtual int _collection_list(
-			       vector<ghobject_t> *ls
-			       ) { return 0; }
-
   virtual int _collection_list_partial(
 				       const ghobject_t &start,
-				       int min_count,
+				       const ghobject_t &end,
+				       bool sort_bitwise,
 				       int max_count,
-				       snapid_t seq,
 				       vector<ghobject_t> *ls,
 				       ghobject_t *next
 				       ) { return 0; }
@@ -96,7 +92,7 @@ protected:
 
 class TestHASH_INDEX_TAG : public TestWrapLFNIndex, public ::testing::Test {
 public:
-  TestHASH_INDEX_TAG() : TestWrapLFNIndex(coll_t("ABC"), "PATH_1", CollectionIndex::HASH_INDEX_TAG) {
+  TestHASH_INDEX_TAG() : TestWrapLFNIndex(coll_t(), "PATH_1", CollectionIndex::HASH_INDEX_TAG) {
   }
 };
 
@@ -114,7 +110,7 @@ TEST_F(TestHASH_INDEX_TAG, generate_and_parse_name) {
 
 class TestHASH_INDEX_TAG_2 : public TestWrapLFNIndex, public ::testing::Test {
 public:
-  TestHASH_INDEX_TAG_2() : TestWrapLFNIndex(coll_t("ABC"), "PATH_1", CollectionIndex::HASH_INDEX_TAG_2) {
+  TestHASH_INDEX_TAG_2() : TestWrapLFNIndex(coll_t(), "PATH_1", CollectionIndex::HASH_INDEX_TAG_2) {
   }
 };
 
@@ -137,7 +133,7 @@ TEST_F(TestHASH_INDEX_TAG_2, generate_and_parse_name) {
 
 class TestHOBJECT_WITH_POOL : public TestWrapLFNIndex, public ::testing::Test {
 public:
-  TestHOBJECT_WITH_POOL() : TestWrapLFNIndex(coll_t("ABC"), "PATH_1", CollectionIndex::HOBJECT_WITH_POOL) {
+  TestHOBJECT_WITH_POOL() : TestWrapLFNIndex(coll_t(), "PATH_1", CollectionIndex::HOBJECT_WITH_POOL) {
   }
 };
 
@@ -181,7 +177,7 @@ TEST_F(TestHOBJECT_WITH_POOL, generate_and_parse_name) {
 
 class TestLFNIndex : public TestWrapLFNIndex, public ::testing::Test {
 public:
-  TestLFNIndex() : TestWrapLFNIndex(coll_t("ABC"), "PATH_1", CollectionIndex::HOBJECT_WITH_POOL) {
+  TestLFNIndex() : TestWrapLFNIndex(coll_t(), "PATH_1", CollectionIndex::HOBJECT_WITH_POOL) {
   }
 
   virtual void SetUp() {
@@ -312,7 +308,7 @@ TEST_F(TestLFNIndex, remove_object) {
     std::string mangled_name_1 = mangled_name;
     mangled_name_1.replace(mangled_name_1.find("0_long"), 6, "1_long");
     const std::string pathname_1("PATH_1/" + mangled_name_1);
-    const std::string cmd("cp --preserve=xattr " + pathname + " " + pathname_1);
+    const std::string cmd("cp -a " + pathname + " " + pathname_1);
     EXPECT_EQ(0, ::system(cmd.c_str()));
     const string ATTR = "user.MARK";
     EXPECT_EQ((unsigned)1, (unsigned)chain_setxattr(pathname_1.c_str(), ATTR.c_str(), "Y", 1));
diff --git a/src/test/osd/Object.cc b/src/test/osd/Object.cc
index c5ff040..4f791cd 100644
--- a/src/test/osd/Object.cc
+++ b/src/test/osd/Object.cc
@@ -4,6 +4,7 @@
 #include <list>
 #include <map>
 #include <set>
+#include <iostream>
 
 #include "Object.h"
 
@@ -79,11 +80,6 @@ void VarLenGenerator::get_ranges_map(
     }
     pos += segment_length;
   }
-  // make sure we write up to the limit
-  if (limit > 0 && (
-	out.empty() ||
-	(out.rbegin()->first + out.rbegin()->second < limit)))
-    out[limit-1] = 1;
 }
 
 ObjectDesc::iterator &ObjectDesc::iterator::advance(bool init) {
@@ -182,3 +178,41 @@ bool ObjectDesc::check(bufferlist &to_check) {
   }
   return true;
 }
+
+bool ObjectDesc::check_sparse(const std::map<uint64_t, uint64_t>& extents,
+			      bufferlist &to_check) {
+  auto i = begin();
+  auto p = to_check.begin();
+  uint64_t pos = 0;
+  for (auto extent : extents) {
+    const uint64_t start = extent.first;
+    const uint64_t end = start + extent.second;
+    for (; pos < end; ++i, ++pos) {
+      if (i.end()) {
+	std::cout << "reached end of iterator first" << std::endl;
+	return false;
+      }
+      if (pos < start) {
+	// check the hole
+	if (*i != '\0') {
+	  std::cout << "incorrect buffer at pos " << pos << std::endl;
+	  return false;
+	}
+      } else {
+	// then the extent
+	if (*i != *p) {
+	  std::cout << "incorrect buffer at pos " << pos << std::endl;
+	  return false;
+	}
+	++p;
+      }
+    }
+  }
+  uint64_t size = layers.empty() ? 0 :
+    most_recent_gen()->get_length(most_recent());
+  if (pos != size) {
+    std::cout << "only read " << pos << " out of size " << size << std::endl;
+    return false;
+  }
+  return true;
+}
diff --git a/src/test/osd/Object.h b/src/test/osd/Object.h
index bffb397..feeefeb 100644
--- a/src/test/osd/Object.h
+++ b/src/test/osd/Object.h
@@ -358,6 +358,8 @@ public:
   // takes ownership of gen
   void update(ContentsGenerator *gen, const ContDesc &next);
   bool check(bufferlist &to_check);
+  bool check_sparse(const std::map<uint64_t, uint64_t>& extends,
+		    bufferlist &to_check);
   const ContDesc &most_recent();
   ContentsGenerator *most_recent_gen() {
     return layers.begin()->first.get();
diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h
index 8d6889e..81c825a 100644
--- a/src/test/osd/RadosModel.h
+++ b/src/test/osd/RadosModel.h
@@ -473,6 +473,13 @@ public:
     return false;
   }
 
+  bool object_existed_at(const string &oid, int snap = -1) const
+  {
+    ObjectDesc contents;
+    bool found = find_object(oid, &contents, snap);
+    return found && contents.exists;
+  }
+
   void remove_snap(int snap)
   {
     map<int, map<string,ObjectDesc> >::iterator next_iter = pool_obj_cont.find(snap);
@@ -727,7 +734,8 @@ public:
 	  bool do_excl,
 	  TestOpStat *stat = 0)
     : TestOp(n, context, stat),
-      oid(oid), waiting_on(0), last_acked_tid(0), do_append(do_append),
+      oid(oid), rcompletion(NULL), waiting_on(0), 
+      last_acked_tid(0), do_append(do_append),
       do_excl(do_excl)
   {}
 		
@@ -973,16 +981,20 @@ public:
 
 class ReadOp : public TestOp {
 public:
-  librados::AioCompletion *completion;
+  vector<librados::AioCompletion *> completions;
   librados::ObjectReadOperation op;
   string oid;
   ObjectDesc old_value;
   int snap;
+  bool balance_reads;
 
   ceph::shared_ptr<int> in_use;
 
-  bufferlist result;
-  int retval;
+  vector<bufferlist> results;
+  vector<int> retvals;
+  vector<std::map<uint64_t, uint64_t>> extent_results;
+  vector<bool> is_sparse_read;
+  uint64_t waiting_on;
 
   map<string, bufferlist> attrs;
   int attrretval;
@@ -997,15 +1009,41 @@ public:
   ReadOp(int n,
 	 RadosTestContext *context,
 	 const string &oid,
+	 bool balance_reads,
 	 TestOpStat *stat = 0)
     : TestOp(n, context, stat),
-      completion(NULL),
+      completions(3),
       oid(oid),
       snap(0),
-      retval(0),
+      balance_reads(balance_reads),
+      results(3),
+      retvals(3),
+      extent_results(3),
+      is_sparse_read(3, false),
+      waiting_on(0),
       attrretval(0)
   {}
-		
+
+  void _do_read(librados::ObjectReadOperation& read_op, int index) {
+    uint64_t len = 0;
+    if (old_value.has_contents())
+      len = old_value.most_recent_gen()->get_length(old_value.most_recent());
+    if (rand() % 2) {
+      is_sparse_read[index] = false;
+      read_op.read(0,
+		   len,
+		   &results[index],
+		   &retvals[index]);
+    } else {
+      is_sparse_read[index] = true;
+      read_op.sparse_read(0,
+			  len,
+			  &extent_results[index],
+			  &results[index],
+			  &retvals[index]);
+    }
+  }
+
   void _begin()
   {
     context->state_lock.Lock();
@@ -1017,7 +1055,9 @@ public:
     }
     std::cout << num << ": read oid " << oid << " snap " << snap << std::endl;
     done = 0;
-    completion = context->rados.aio_create_completion((void *) this, &read_callback, 0);
+    for (uint32_t i = 0; i < 3; i++) {
+      completions[i] = context->rados.aio_create_completion((void *) this, &read_callback, 0);
+    }
 
     context->oid_in_use.insert(oid);
     context->oid_not_in_use.erase(oid);
@@ -1045,16 +1085,11 @@ public:
       std::cerr << num << ":  notified, waiting" << std::endl;
       ctx->wait();
     }
+    context->state_lock.Lock();
     if (snap >= 0) {
       context->io_ctx.snap_set_read(context->snaps[snap]);
     }
-
-    op.read(0,
-	    !old_value.has_contents() ? 0 :
-	    old_value.most_recent_gen()->get_length(old_value.most_recent()),
-	    &result,
-	    &retval);
-
+    _do_read(op, 0);
     for (map<string, ContDesc>::iterator i = old_value.attrs.begin();
 	 i != old_value.attrs.end();
 	 ++i) {
@@ -1073,27 +1108,65 @@ public:
       op.omap_get_header(&header, 0);
     }
     op.getxattrs(&xattrs, 0);
-    assert(!context->io_ctx.aio_operate(context->prefix+oid, completion, &op, 0));
+
+    unsigned flags = 0;
+    if (balance_reads)
+      flags |= librados::OPERATION_BALANCE_READS;
+
+    assert(!context->io_ctx.aio_operate(context->prefix+oid, completions[0], &op,
+					flags, NULL));
+    waiting_on++;
+ 
+    // send 2 pipelined reads on the same object/snap. This can help testing
+    // OSD's read behavior in some scenarios
+    for (uint32_t i = 1; i < 3; ++i) {
+      librados::ObjectReadOperation pipeline_op;
+      _do_read(pipeline_op, i);
+      assert(!context->io_ctx.aio_operate(context->prefix+oid, completions[i], &pipeline_op, 0));
+      waiting_on++;
+    }
+
     if (snap >= 0) {
       context->io_ctx.snap_set_read(0);
     }
+    context->state_lock.Unlock();
   }
 
   void _finish(CallbackInfo *info)
   {
-    context->state_lock.Lock();
+    Mutex::Locker l(context->state_lock);
     assert(!done);
+    assert(waiting_on > 0);
+    if (--waiting_on) {
+      return;
+    }
+
     context->oid_in_use.erase(oid);
     context->oid_not_in_use.insert(oid);
-    assert(completion->is_complete());
-    uint64_t version = completion->get_version64();
-    if (int err = completion->get_return_value()) {
-      if (!(err == -ENOENT && old_value.deleted())) {
-	cerr << num << ": Error: oid " << oid << " read returned error code "
-	     << err << std::endl;
+    int retval = completions[0]->get_return_value();
+    for (vector<librados::AioCompletion *>::iterator it = completions.begin();
+         it != completions.end(); ++it) {
+      assert((*it)->is_complete());
+      uint64_t version = (*it)->get_version64();
+      int err = (*it)->get_return_value();
+      if (err != retval) {
+        cerr << num << ": Error: oid " << oid << " read returned different error codes: "
+             << retval << " and " << err << std::endl;
 	assert(0);
       }
-    } else {
+      if (err) {
+        if (!(err == -ENOENT && old_value.deleted())) {
+          cerr << num << ": Error: oid " << oid << " read returned error code "
+               << err << std::endl;
+          assert(0);
+        }
+      } else if (version != old_value.version) {
+	cerr << num << ": oid " << oid << " version is " << version
+	     << " and expected " << old_value.version << std::endl;
+	assert(version == old_value.version);
+      }
+    }
+    if (!retval) {
       map<string, bufferlist>::iterator iter = xattrs.find("_header");
       bufferlist headerbl;
       if (iter == xattrs.end()) {
@@ -1122,9 +1195,18 @@ public:
 	       << ", expected " << old_value.most_recent() << std::endl;
 	  context->errors++;
 	}
-	if (!old_value.check(result)) {
-	  cerr << num << ": oid " << oid << " contents " << to_check << " corrupt" << std::endl;
-	  context->errors++;
+        for (unsigned i = 0; i < results.size(); i++) {
+	  if (is_sparse_read[i]) {
+	    if (!old_value.check_sparse(extent_results[i], results[i])) {
+	      cerr << num << ": oid " << oid << " contents " << to_check << " corrupt" << std::endl;
+	      context->errors++;
+	    }
+	  } else {
+	    if (!old_value.check(results[i])) {
+	      cerr << num << ": oid " << oid << " contents " << to_check << " corrupt" << std::endl;
+	      context->errors++;
+	    }
+	  }
 	}
 	if (context->errors) assert(0);
       }
@@ -1153,11 +1235,6 @@ public:
 	     << " and old is " << old_value.attrs.size() << std::endl;
 	assert(xattrs.size() == old_value.attrs.size());
       }
-      if (version != old_value.version) {
-	cerr << num << ": oid " << oid << " version is " << version
-	     << " and expected " << old_value.version << std::endl;
-	assert(version == old_value.version);
-      }
       for (map<string, ContDesc>::iterator iter = old_value.attrs.begin();
 	   iter != old_value.attrs.end();
 	   ++iter) {
@@ -1203,14 +1280,17 @@ public:
 	}
       }
     }
+    for (vector<librados::AioCompletion *>::iterator it = completions.begin();
+         it != completions.end(); ++it) {
+      (*it)->release();
+    }
     context->kick();
     done = true;
-    context->state_lock.Unlock();
   }
 
   bool finished()
   {
-    return done && completion->is_complete();
+    return done;
   }
 
   string getType()
@@ -1395,9 +1475,13 @@ public:
   string oid;
   int roll_back_to;
   bool done;
+  librados::ObjectWriteOperation zero_write_op1;
+  librados::ObjectWriteOperation zero_write_op2;
   librados::ObjectWriteOperation op;
-  librados::AioCompletion *comp;
+  vector<librados::AioCompletion *> comps;
   ceph::shared_ptr<int> in_use;
+  int last_finished;
+  int outstanding;
 
   RollbackOp(int n,
 	     RadosTestContext *context,
@@ -1405,7 +1489,9 @@ public:
 	     TestOpStat *stat = 0)
     : TestOp(n, context, stat),
       oid(_oid), roll_back_to(-1), 
-      done(false), comp(NULL)
+      done(false),
+      comps(3, NULL),
+      last_finished(-1), outstanding(3)
   {}
 
   void _begin()
@@ -1435,39 +1521,80 @@ public:
 
     cout << "rollback oid " << oid << " to " << roll_back_to << std::endl;
 
+    bool existed_before = context->object_existed_at(oid);
+    bool existed_after = context->object_existed_at(oid, roll_back_to);
+
     context->roll_back(oid, roll_back_to);
     uint64_t snap = context->snaps[roll_back_to];
 
+    outstanding -= (!existed_before) + (!existed_after);
+
     context->state_lock.Unlock();
 
+    bufferlist bl, bl2;
+    zero_write_op1.append(bl);
+    zero_write_op2.append(bl2);
+
     if (context->pool_snaps) {
       op.snap_rollback(snap);
     } else {
       op.selfmanaged_snap_rollback(snap);
     }
 
-    pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
-      new pair<TestOp*, TestOp::CallbackInfo*>(this,
-					       new TestOp::CallbackInfo(0));
-    comp = context->rados.aio_create_completion((void*) cb_arg, NULL,
-						&write_callback);
-    context->io_ctx.aio_operate(context->prefix+oid, comp, &op);
+    if (existed_before) {
+      pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
+	new pair<TestOp*, TestOp::CallbackInfo*>(this,
+						 new TestOp::CallbackInfo(0));
+      comps[0] = 
+	context->rados.aio_create_completion((void*) cb_arg, NULL,
+					     &write_callback);
+      context->io_ctx.aio_operate(
+	context->prefix+oid, comps[0], &zero_write_op1);
+    }
+    {
+      pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
+	new pair<TestOp*, TestOp::CallbackInfo*>(this,
+						 new TestOp::CallbackInfo(1));
+      comps[1] =
+	context->rados.aio_create_completion((void*) cb_arg, NULL,
+					     &write_callback);
+      context->io_ctx.aio_operate(
+	context->prefix+oid, comps[1], &op);
+    }
+    if (existed_after) {
+      pair<TestOp*, TestOp::CallbackInfo*> *cb_arg =
+	new pair<TestOp*, TestOp::CallbackInfo*>(this,
+						 new TestOp::CallbackInfo(2));
+      comps[2] =
+	context->rados.aio_create_completion((void*) cb_arg, NULL,
+					     &write_callback);
+      context->io_ctx.aio_operate(
+	context->prefix+oid, comps[2], &zero_write_op2);
+    }
   }
 
   void _finish(CallbackInfo *info)
   {
     Mutex::Locker l(context->state_lock);
+    uint64_t tid = info->id;
+    cout << num << ":  finishing rollback tid " << tid
+	 << " to " << context->prefix + oid << std::endl;
+    assert((int)(info->id) > last_finished);
+    last_finished = info->id;
+
     int r;
-    if ((r = comp->get_return_value())) {
+    if ((r = comps[last_finished]->get_return_value()) != 0) {
       cerr << "err " << r << std::endl;
       assert(0);
     }
-    done = true;
-    context->update_object_version(oid, comp->get_version64());
-    context->oid_in_use.erase(oid);
-    context->oid_not_in_use.insert(oid);
-    in_use = ceph::shared_ptr<int>();
-    context->kick();
+    if (--outstanding == 0) {
+      done = true;
+      context->update_object_version(oid, comps[tid]->get_version64());
+      context->oid_in_use.erase(oid);
+      context->oid_not_in_use.insert(oid);
+      in_use = ceph::shared_ptr<int>();
+      context->kick();
+    }
   }
 
   bool finished()
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index d4b05ac..ea4053f 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -38,7 +38,7 @@ public:
     entity_addr_t sample_addr;
     uuid_d sample_uuid;
     for (int i = 0; i < num_osds; ++i) {
-      sample_uuid.uuid[i] = i;
+      sample_uuid.generate_random();
       sample_addr.nonce = i;
       pending_inc.new_state[i] = CEPH_OSD_EXISTS | CEPH_OSD_NEW;
       pending_inc.new_up_client[i] = sample_addr;
diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc
index cc9733a..9a63581 100644
--- a/src/test/osd/TestPGLog.cc
+++ b/src/test/osd/TestPGLog.cc
@@ -93,7 +93,7 @@ public:
     pg_missing_t init;
     pg_missing_t final;
 
-    set<hobject_t> toremove;
+    set<hobject_t, hobject_t::BitwiseComparator> toremove;
     list<pg_log_entry_t> torollback;
 
   private:
@@ -154,7 +154,7 @@ public:
   };
 
   struct LogHandler : public PGLog::LogEntryHandler {
-    set<hobject_t> removed;
+    set<hobject_t, hobject_t::BitwiseComparator> removed;
     list<pg_log_entry_t> rolledback;
     
     void rollback(
@@ -198,8 +198,8 @@ public:
     }
 
     {
-      set<hobject_t>::const_iterator titer = tcase.toremove.begin();
-      set<hobject_t>::const_iterator hiter = handler.removed.begin();
+      set<hobject_t, hobject_t::BitwiseComparator>::const_iterator titer = tcase.toremove.begin();
+      set<hobject_t, hobject_t::BitwiseComparator>::const_iterator hiter = handler.removed.begin();
       for (; titer != tcase.toremove.end(); ++titer, ++hiter) {
 	EXPECT_EQ(*titer, *hiter);
       }
@@ -1973,12 +1973,12 @@ TEST_F(PGLogTest, filter_log_1) {
 
     // Some should be removed
     log.filter_log(pgid, *osdmap, hit_set_namespace);
-    EXPECT_LE(log.log.size(), total);
+    EXPECT_LE(log.log.size(), (size_t)total);
 
     // If we filter a second time, there should be the same total
     total = log.log.size();
     log.filter_log(pgid, *osdmap, hit_set_namespace);
-    EXPECT_EQ(log.log.size(), total);
+    EXPECT_EQ(log.log.size(), (size_t)total);
 
     // Increase pg_num as if there would be a split
     int new_pg_num = pg_num * 16;
@@ -1995,7 +1995,7 @@ TEST_F(PGLogTest, filter_log_1) {
 
     // We should have fewer entries after a filter
     log.filter_log(pgid, *osdmap, hit_set_namespace);
-    EXPECT_LE(log.log.size(), total);
+    EXPECT_LE(log.log.size(), (size_t)total);
 
     // Make sure all internal entries are retained
     int count = 0;
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index e8eb0db..f35b987 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -28,11 +28,13 @@ public:
 			map<TestOpType, unsigned int> op_weights,
 			TestOpStat *stats,
 			int max_seconds,
-			bool ec_pool) :
+			bool ec_pool,
+			bool balance_reads) :
     m_nextop(NULL), m_op(0), m_ops(ops), m_seconds(max_seconds),
     m_objects(objects), m_stats(stats),
     m_total_weight(0),
-    m_ec_pool(ec_pool)
+    m_ec_pool(ec_pool),
+    m_balance_reads(balance_reads)
   {
     m_start = time(0);
     for (map<TestOpType, unsigned int>::const_iterator it = op_weights.begin();
@@ -99,7 +101,7 @@ private:
     switch (type) {
     case TEST_OP_READ:
       oid = *(rand_choose(context.oid_not_in_use));
-      return new ReadOp(m_op, &context, oid, m_stats);
+      return new ReadOp(m_op, &context, oid, m_balance_reads, m_stats);
 
     case TEST_OP_WRITE:
       oid = *(rand_choose(context.oid_not_in_use));
@@ -237,6 +239,7 @@ private:
   map<TestOpType, unsigned int> m_weight_sums;
   unsigned int m_total_weight;
   bool m_ec_pool;
+  bool m_balance_reads;
 };
 
 int main(int argc, char **argv)
@@ -281,6 +284,7 @@ int main(int argc, char **argv)
   string pool_name = "rbd";
   bool ec_pool = false;
   bool no_omap = false;
+  bool balance_reads = false;
 
   for (int i = 1; i < argc; ++i) {
     if (strcmp(argv[i], "--max-ops") == 0)
@@ -301,6 +305,8 @@ int main(int argc, char **argv)
       max_stride_size = atoi(argv[++i]);
     else if (strcmp(argv[i], "--no-omap") == 0)
       no_omap = true;
+    else if (strcmp(argv[i], "--balance_reads") == 0)
+      balance_reads = true;
     else if (strcmp(argv[i], "--pool-snaps") == 0)
       pool_snaps = true;
     else if (strcmp(argv[i], "--write-fadvise-dontneed") == 0)
@@ -407,7 +413,7 @@ int main(int argc, char **argv)
   WeightedTestGenerator gen = WeightedTestGenerator(
     ops, objects,
     op_weights, &stats, max_seconds,
-    ec_pool);
+    ec_pool, balance_reads);
   int r = context.init();
   if (r < 0) {
     cerr << "Error initializing rados test context: "
diff --git a/src/test/osd/osd-bench.sh b/src/test/osd/osd-bench.sh
index b4ae3fd..fd466f8 100755
--- a/src/test/osd/osd-bench.sh
+++ b/src/test/osd/osd-bench.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -16,26 +16,29 @@
 # GNU Library Public License for more details.
 #
 
-source test/mon/mon-test-helpers.sh
-source test/osd/osd-test-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7106"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
 
-    local id=a
-    call_TEST_functions $dir $id || return 1
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
 }
 
 function TEST_bench() {
     local dir=$1
 
-    run_mon $dir a --public-addr $CEPH_MON \
-        || return 1
+    run_mon $dir a || return 1
     run_osd $dir 0 || return 1
 
     local osd_bench_small_size_max_iops=$(CEPH_ARGS='' ./ceph-conf \
@@ -50,24 +53,24 @@ function TEST_bench() {
     #
     # block size too high
     #
-    ! ./ceph tell osd.0 bench 1024 $((osd_bench_max_block_size + 1)) 2> $dir/out || return 1
-    grep osd_bench_max_block_size $dir/out || return 1
+    expect_failure $dir osd_bench_max_block_size \
+        ./ceph tell osd.0 bench 1024 $((osd_bench_max_block_size + 1)) || return 1
 
     #
     # count too high for small (< 1MB) block sizes
     #
     local bsize=1024
     local max_count=$(($bsize * $osd_bench_duration * $osd_bench_small_size_max_iops))
-    ! ./ceph tell osd.0 bench $(($max_count + 1)) $bsize 2> $dir/out || return 1
-    grep osd_bench_small_size_max_iops $dir/out || return 1
+    expect_failure $dir bench_small_size_max_iops \
+        ./ceph tell osd.0 bench $(($max_count + 1)) $bsize || return 1
 
     #
     # count too high for large (>= 1MB) block sizes
     #
     local bsize=$((1024 * 1024 + 1))
     local max_count=$(($osd_bench_large_size_max_throughput * $osd_bench_duration))
-    ! ./ceph tell osd.0 bench $(($max_count + 1)) $bsize 2> $dir/out || return 1
-    grep osd_bench_large_size_max_throughput $dir/out || return 1
+    expect_failure $dir osd_bench_large_size_max_throughput \
+        ./ceph tell osd.0 bench $(($max_count + 1)) $bsize || return 1
 
     #
     # default values should work
@@ -75,7 +78,7 @@ function TEST_bench() {
     ./ceph tell osd.0 bench || return 1
 }
 
-main osd-bench
+main osd-bench "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh"
diff --git a/src/test/osd/osd-config.sh b/src/test/osd/osd-config.sh
index aceda0c..1f73485 100755
--- a/src/test/osd/osd-config.sh
+++ b/src/test/osd/osd-config.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -16,26 +16,29 @@
 # GNU Library Public License for more details.
 #
 
-source test/mon/mon-test-helpers.sh
-source test/osd/osd-test-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7100"
     export CEPH_ARGS
-    CEPH_ARGS+="--mon-host=$CEPH_MON "
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
 
-    local id=a
-    call_TEST_functions $dir $id || return 1
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
 }
 
 function TEST_config_init() {
     local dir=$1
 
-    run_mon $dir a --public-addr=$CEPH_MON \
-        || return 1
+    run_mon $dir a || return 1
     local advance=1000
     local stale=1000
     local cache=500
@@ -45,15 +48,14 @@ function TEST_config_init() {
         --osd-pg-epoch-persisted-max-stale $stale \
         || return 1
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
-    grep 'is not > osd_map_max_advance' $dir/osd-0.log || return 1
-    grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd-0.log || return 1
+    grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+    grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
 }
 
 function TEST_config_track() {
     local dir=$1
 
-    run_mon $dir a --public-addr=$CEPH_MON \
-        || return 1
+    run_mon $dir a || return 1
     run_osd $dir 0 || return 1
 
     local osd_map_cache_size=$(CEPH_ARGS='' ./ceph-conf \
@@ -65,43 +67,43 @@ function TEST_config_track() {
     #
     # lower cache_size under max_advance to trigger the warning
     #
-    ! grep 'is not > osd_map_max_advance' $dir/osd-0.log || return 1
+    ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
     local cache=$(($osd_map_max_advance / 2))
     ./ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
-    grep 'is not > osd_map_max_advance' $dir/osd-0.log || return 1
-    rm $dir/osd-0.log
+    grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
+    rm $dir/osd.0.log
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log reopen || return 1
 
     #
     # reset cache_size to the default and assert that it does not trigger the warning
     #
-    ! grep 'is not > osd_map_max_advance' $dir/osd-0.log || return 1
+    ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
     local cache=$osd_map_cache_size
     ./ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
-    ! grep 'is not > osd_map_max_advance' $dir/osd-0.log || return 1
+    ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
 
     #
     # increase the osd_map_max_advance above the default cache_size
     #
-    ! grep 'is not > osd_map_max_advance' $dir/osd-0.log || return 1
+    ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
     local advance=$(($osd_map_cache_size * 2))
     ./ceph tell osd.0 injectargs "--osd-map-max-advance $advance" || return 1
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
-    grep 'is not > osd_map_max_advance' $dir/osd-0.log || return 1
+    grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1
 
     #
     # increase the osd_pg_epoch_persisted_max_stale above the default cache_size
     #
-    ! grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd-0.log || return 1
+    ! grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
     local stale=$(($osd_map_cache_size * 2))
     ceph tell osd.0 injectargs "--osd-pg-epoch-persisted-max-stale $stale" || return 1
     CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1
-    grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd-0.log || return 1
+    grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1
 }
 
-main osd-config
+main osd-config "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/osd/osd-config.sh"
diff --git a/src/test/osd/osd-copy-from.sh b/src/test/osd/osd-copy-from.sh
index b4120c5..6f1e037 100755
--- a/src/test/osd/osd-copy-from.sh
+++ b/src/test/osd/osd-copy-from.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 # Author: Sage Weil <sage at redhat.com>
@@ -17,26 +17,29 @@
 # GNU Library Public License for more details.
 #
 
-source test/mon/mon-test-helpers.sh
-source test/osd/osd-test-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
+    shift
 
     export CEPH_MON="127.0.0.1:7111"
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
 
-    local id=a
-    call_TEST_functions $dir $id || return 1
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
 }
 
 function TEST_copy_from() {
     local dir=$1
 
-    run_mon $dir a --public-addr $CEPH_MON \
-        || return 1
+    run_mon $dir a || return 1
     run_osd $dir 0 || return 1
     run_osd $dir 1 || return 1
 
@@ -56,7 +59,7 @@ function TEST_copy_from() {
     ./rados -p rbd stat foo3
 }
 
-main osd-copy-from
+main osd-copy-from "$@"
 
 # Local Variables:
 # compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh"
diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh
index 90c51c0..13fac7c 100755
--- a/src/test/osd/osd-scrub-repair.sh
+++ b/src/test/osd/osd-scrub-repair.sh
@@ -14,7 +14,7 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-source test/ceph-helpers.sh
+source ../qa/workunits/ceph-helpers.sh
 
 function run() {
     local dir=$1
@@ -65,6 +65,31 @@ function TEST_corrupt_and_repair_replicated() {
     teardown $dir || return 1
 }
 
+function corrupt_and_repair_two() {
+    local dir=$1
+    local poolname=$2
+    local first=$3
+    local second=$4
+
+    #
+    # 1) remove the corresponding file from the OSDs
+    #
+    objectstore_tool $dir $first SOMETHING remove || return 1
+    objectstore_tool $dir $second SOMETHING remove || return 1
+    #
+    # 2) repair the PG
+    #
+    local pg=$(get_pg $poolname SOMETHING)
+    repair $pg
+    #
+    # 3) The files must be back
+    #
+    objectstore_tool $dir $first SOMETHING list-attrs || return 1
+    objectstore_tool $dir $second SOMETHING list-attrs || return 1
+    rados --pool $poolname get SOMETHING $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+}
+
 #
 # 1) add an object
 # 2) remove the corresponding file from a designated OSD
@@ -95,22 +120,12 @@ function corrupt_and_repair_one() {
     wait_for_clean || return 1
 }
 
-function TEST_corrupt_and_repair_erasure_coded() {
+function corrupt_and_repair_erasure_coded() {
     local dir=$1
-    local poolname=ecpool
-    local payload=ABCDEF
-
-    setup $dir || return 1
-    run_mon $dir a || return 1
-    run_osd $dir 0 || return 1
-    run_osd $dir 1 || return 1
-    run_osd $dir 2 || return 1
-    run_osd $dir 3 || return 1
-    wait_for_clean || return 1
+    local poolname=$2
+    local profile=$3
 
-    ceph osd erasure-code-profile set myprofile \
-        k=2 m=2 ruleset-failure-domain=osd || return 1
-    ceph osd pool create $poolname 1 1 erasure myprofile \
+    ceph osd pool create $poolname 1 1 erasure $profile \
         || return 1
 
     add_something $dir $poolname
@@ -127,32 +142,94 @@ function TEST_corrupt_and_repair_erasure_coded() {
     corrupt_and_repair_two $dir $poolname $not_primary_first $not_primary_second || return 1
     corrupt_and_repair_two $dir $poolname $primary $not_primary_first || return 1
 
+}
+
+function TEST_corrupt_and_repair_jerasure() {
+    local dir=$1
+    local poolname=ecpool
+    local profile=myprofile
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    for id in $(seq 0 3) ; do
+        run_osd $dir $id || return 1
+    done
+    wait_for_clean || return 1
+
+    ceph osd erasure-code-profile set $profile \
+        k=2 m=2 ruleset-failure-domain=osd || return 1
+
+    corrupt_and_repair_erasure_coded $dir $poolname $profile || return 1
+
     teardown $dir || return 1
 }
 
-function corrupt_and_repair_two() {
+function TEST_corrupt_and_repair_lrc() {
     local dir=$1
-    local poolname=$2
-    local first=$3
-    local second=$4
+    local poolname=ecpool
+    local profile=myprofile
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    for id in $(seq 0 9) ; do
+        run_osd $dir $id || return 1
+    done
+    wait_for_clean || return 1
+
+    ceph osd erasure-code-profile set $profile \
+        plugin=lrc \
+        k=4 m=2 l=3 \
+        ruleset-failure-domain=osd || return 1
+
+    corrupt_and_repair_erasure_coded $dir $poolname $profile || return 1
+
+    teardown $dir || return 1
+}
+
+function TEST_unfound_erasure_coded() {
+    local dir=$1
+    local poolname=ecpool
+    local payload=ABCDEF
+
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+    run_osd $dir 3 || return 1
+    wait_for_clean || return 1
+
+    ceph osd erasure-code-profile set myprofile \
+      k=2 m=2 ruleset-failure-domain=osd || return 1
+    ceph osd pool create $poolname 1 1 erasure myprofile \
+      || return 1
+
+    add_something $dir $poolname
+
+    local primary=$(get_primary $poolname SOMETHING)
+    local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//"))
+    local not_primary_first=${osds[0]}
+    local not_primary_second=${osds[1]}
+    local not_primary_third=${osds[2]}
 
     #
     # 1) remove the corresponding file from the OSDs
     #
-    objectstore_tool $dir $first SOMETHING remove || return 1
-    objectstore_tool $dir $second SOMETHING remove || return 1
+    objectstore_tool $dir $not_primary_first SOMETHING remove || return 1
+    objectstore_tool $dir $not_primary_second SOMETHING remove || return 1
+    objectstore_tool $dir $not_primary_third SOMETHING remove || return 1
     #
     # 2) repair the PG
     #
     local pg=$(get_pg $poolname SOMETHING)
     repair $pg
     #
-    # 3) The files must be back
+    # 3) check pg state
     #
-    objectstore_tool $dir $first SOMETHING list-attrs || return 1
-    objectstore_tool $dir $second SOMETHING list-attrs || return 1
-    rados --pool $poolname get SOMETHING $dir/COPY || return 1
-    diff $dir/ORIGINAL $dir/COPY || return 1
+    ceph -s|grep "4 osds: 4 up, 4 in" || return 1
+    ceph -s|grep "1/1 unfound" || return 1
+
+    teardown $dir || return 1
 }
 
 main osd-scrub-repair "$@"
diff --git a/src/test/osd/osd-test-helpers.sh b/src/test/osd/osd-test-helpers.sh
deleted file mode 100644
index a1cef91..0000000
--- a/src/test/osd/osd-test-helpers.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-#
-# Copyright (C) 2014 Cloudwatt <libre.licensing at cloudwatt.com>
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
-#
-# Author: Loic Dachary <loic at dachary.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Library Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Library Public License for more details.
-#
-
-function run_osd() {
-    local dir=$1
-    shift
-    local id=$1
-    shift
-    local osd_data=$dir/$id
-
-    local ceph_disk_args
-    ceph_disk_args+=" --statedir=$dir"
-    ceph_disk_args+=" --sysconfdir=$dir"
-    ceph_disk_args+=" --prepend-to-path="
-    ceph_disk_args+=" --verbose"
-
-    touch $dir/ceph.conf
-
-    mkdir -p $osd_data
-    ./ceph-disk $ceph_disk_args \
-        prepare $osd_data || return 1
-
-    local ceph_args="$CEPH_ARGS"
-    ceph_args+=" --osd-backfill-full-ratio=.99"
-    ceph_args+=" --osd-failsafe-full-ratio=.99"
-    ceph_args+=" --osd-journal-size=100"
-    ceph_args+=" --osd-data=$osd_data"
-    ceph_args+=" --chdir="
-    ceph_args+=" --osd-pool-default-erasure-code-directory=.libs"
-    ceph_args+=" --run-dir=$dir"
-    ceph_args+=" --debug-osd=20"
-    ceph_args+=" --debug-filestore=20"
-    ceph_args+=" --log-file=$dir/osd-\$id.log"
-    ceph_args+=" --pid-file=$dir/osd-\$id.pid"
-    ceph_args+=" "
-    ceph_args+="$@"
-    mkdir -p $osd_data
-    CEPH_ARGS="$ceph_args" ./ceph-disk $ceph_disk_args \
-        activate \
-        --mark-init=none \
-        $osd_data || return 1
-
-    [ "$id" = "$(cat $osd_data/whoami)" ] || return 1
-
-    ./ceph osd crush create-or-move "$id" 1 root=default host=localhost
-
-    status=1
-    for ((i=0; i < 60; i++)); do
-        if ! ./ceph osd dump | grep "osd.$id up"; then
-            sleep 1
-        else
-            status=0
-            break
-        fi
-    done
-
-    return $status
-}
-
-function get_osds() {
-    local poolname=$1
-    local objectname=$2
-
-    ./ceph osd map $poolname $objectname | \
-       perl -p -e 's/.*up \(\[(.*?)\].*/$1/; s/,/ /g'
-}
-
-function get_pg() {
-    local poolname=$1
-    local objectname=$2
-
-    ./ceph osd map $poolname $objectname | \
-       perl -p -e 's/.*\((.*?)\) -> up.*/$1/'
-}
diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc
index 33324b2..b69a88a 100644
--- a/src/test/osd/types.cc
+++ b/src/test/osd/types.cc
@@ -20,6 +20,7 @@
 #include "osd/OSDMap.h"
 #include "gtest/gtest.h"
 #include "common/Thread.h"
+#include "include/stringify.h"
 #include "osd/ReplicatedBackend.h"
 
 #include <sstream>
@@ -1304,6 +1305,87 @@ TEST(shard_id_t, iostream) {
     ostringstream out;
     out << shards;
     ASSERT_EQ(out.str(), "0,1,2");
+
+    shard_id_t noshard = shard_id_t::NO_SHARD;
+    shard_id_t zero(0);
+    ASSERT_GT(zero, noshard);
+}
+
+TEST(spg_t, parse) {
+  spg_t a(pg_t(1,2), shard_id_t::NO_SHARD);
+  spg_t aa, bb;
+  spg_t b(pg_t(3,2), shard_id_t(2));
+  std::string s = stringify(a);
+  ASSERT_TRUE(aa.parse(s.c_str()));
+  ASSERT_EQ(a, aa);
+
+  s = stringify(b);
+  ASSERT_TRUE(bb.parse(s.c_str()));
+  ASSERT_EQ(b, bb);
+}
+
+TEST(coll_t, parse) {
+  const char *ok[] = {
+    "meta",
+    "1.2_head",
+    "1.2_TEMP",
+    "1.2s3_head",
+    "1.3s2_TEMP",
+    "1.2s0_head",
+    0
+  };
+  const char *bad[] = {
+    "foo",
+    "1.2_food",
+    "1.2_head ",
+    //" 1.2_head",   // hrm, this parses, which is not ideal.. pg_t's fault?
+    "1.2_temp",
+    "1.2_HEAD",
+    "1.xS3_HEAD",
+    "1.2s_HEAD",
+    "1.2sfoo_HEAD",
+    0
+  };
+  coll_t a;
+  for (int i = 0; ok[i]; ++i) {
+    cout << "check ok " << ok[i] << std::endl;
+    ASSERT_TRUE(a.parse(ok[i]));
+    ASSERT_EQ(string(ok[i]), a.to_str());
+  }
+  for (int i = 0; bad[i]; ++i) {
+    cout << "check bad " << bad[i] << std::endl;
+    ASSERT_FALSE(a.parse(bad[i]));
+  }
+}
+
+TEST(coll_t, temp) {
+  spg_t pgid;
+  coll_t foo(pgid);
+  ASSERT_EQ(foo.to_str(), string("0.0_head"));
+
+  coll_t temp = foo.get_temp();
+  ASSERT_EQ(temp.to_str(), string("0.0_TEMP"));
+
+  spg_t pgid2;
+  ASSERT_TRUE(temp.is_temp());
+  ASSERT_TRUE(temp.is_temp(&pgid2));
+  ASSERT_EQ(pgid, pgid2);
+}
+
+TEST(ghobject_t, cmp) {
+  ghobject_t min;
+  ghobject_t sep;
+  sep.set_shard(shard_id_t(1));
+  sep.hobj.pool = -1;
+  cout << min << " < " << sep << std::endl;
+  ASSERT_TRUE(cmp_bitwise(min, sep) < 0);
+
+  sep.set_shard(shard_id_t::NO_SHARD);
+  cout << "sep shard " << sep.shard_id << std::endl;
+  ghobject_t o(hobject_t(object_t(), string(), CEPH_NOSNAP, 0x42,
+			 1, string()));
+  cout << "o " << o << std::endl;
+  ASSERT_TRUE(cmp_bitwise(o, sep) > 0);
 }
 
 /*
diff --git a/src/test/osdc/object_cacher_stress.cc b/src/test/osdc/object_cacher_stress.cc
index ec5f926..39aabfd 100644
--- a/src/test/osdc/object_cacher_stress.cc
+++ b/src/test/osdc/object_cacher_stress.cc
@@ -187,37 +187,37 @@ int main(int argc, const char **argv)
   std::ostringstream err;
   std::vector<const char*>::iterator i;
   for (i = args.begin(); i != args.end();) {
-    if (ceph_argparse_withlonglong(args, i, &delay_ns, &err, "--delay-ns", (char*)NULL)) {
+    if (ceph_argparse_witharg(args, i, &delay_ns, err, "--delay-ns", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << argv[0] << ": " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withlonglong(args, i, &num_ops, &err, "--ops", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &num_ops, err, "--ops", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << argv[0] << ": " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withlonglong(args, i, &num_objs, &err, "--objects", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &num_objs, err, "--objects", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << argv[0] << ": " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withlonglong(args, i, &obj_bytes, &err, "--obj-size", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &obj_bytes, err, "--obj-size", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << argv[0] << ": " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withlonglong(args, i, &max_len, &err, "--max-op-size", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &max_len, err, "--max-op-size", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << argv[0] << ": " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withfloat(args, i, &percent_reads, &err, "--percent-read", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &percent_reads, err, "--percent-read", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << argv[0] << ": " << err.str() << std::endl;
 	return EXIT_FAILURE;
       }
-    } else if (ceph_argparse_withint(args, i, &seed, &err, "--seed", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &seed, err, "--seed", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << argv[0] << ": " << err.str() << std::endl;
 	return EXIT_FAILURE;
diff --git a/src/test/perf_counters.cc b/src/test/perf_counters.cc
index 9b6bd71..a916cf0 100644
--- a/src/test/perf_counters.cc
+++ b/src/test/perf_counters.cc
@@ -181,10 +181,22 @@ TEST(PerfCounters, MultiplePerfCounters) {
   ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":13,\"element2\":0.000000000,"
 	    "\"element3\":{\"avgcount\":0,\"sum\":0.000000000}}}"), msg);
   ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf schema\", \"format\": \"json\" }", &msg));
-  ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2},"
-	       "\"element2\":{\"type\":1},\"element3\":{\"type\":5}}}"), msg);
-
+  ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"description\":\"\",\"nick\":\"\"},"
+	    "\"element2\":{\"type\":1,\"description\":\"\",\"nick\":\"\"},\"element3\":{\"type\":5,\"description\":\"\",\"nick\":\"\"}}}"), msg);
   coll->clear();
   ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg));
   ASSERT_EQ("{}", msg);
 }
+
+TEST(PerfCounters, CephContextPerfCounters) {
+  // Enable the perf counter
+  g_ceph_context->enable_perf_counter();
+  AdminSocketClient client(get_rand_socket_path());
+  std::string msg;
+
+  ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg));
+  ASSERT_EQ(sd("{\"cct\":{\"total_workers\":0,\"unhealthy_workers\":0}}"), msg);
+
+  // Restore to avoid impact to other test cases
+  g_ceph_context->disable_perf_counter();
+}
diff --git a/src/test/perf_helper.cc b/src/test/perf_helper.cc
new file mode 100644
index 0000000..7661cb5
--- /dev/null
+++ b/src/test/perf_helper.cc
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/* Copyright (c) 2011 Facebook
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "include/buffer.h"
+
+using namespace ceph;
+
+namespace PerfHelper {
+
+/// Flush the CPU data cache by reading and writing 100MB of new data.
+void flush_cache()
+{
+    int hundredMegs = 100 * 1024 * 1024;
+    volatile char* block = new char[hundredMegs];
+    for (int i = 0; i < hundredMegs; i++)
+        block[i] = 1;
+    delete[] block;
+}
+
+/// Used in functionCall().
+uint64_t plus_one(uint64_t x)
+{
+    return x + 1;
+}
+
+/// Used in throwIntNL.
+void throw_int()
+{
+    throw 0;
+}
+
+/// Used in throwExceptionNL.
+void throw_end_of_buffer()
+{
+    throw buffer::end_of_buffer();
+}
+}
diff --git a/src/test/perf_helper.h b/src/test/perf_helper.h
new file mode 100644
index 0000000..2133b54
--- /dev/null
+++ b/src/test/perf_helper.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/* Copyright (c) 2011 Facebook
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef CEPH_TEST_PERFHELPER_H
+#define CEPH_TEST_PERFHELPER_H
+
+namespace PerfHelper {
+
+void flush_cache();
+uint64_t plus_one(uint64_t x);
+void throw_end_of_buffer();
+void throw_int();
+
+} // PerfHelper
+
+#endif  // CEPH_TEST_PERFHELPER_H
diff --git a/src/test/perf_local.cc b/src/test/perf_local.cc
new file mode 100644
index 0000000..9672be2
--- /dev/null
+++ b/src/test/perf_local.cc
@@ -0,0 +1,1047 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/* Copyright (c) 2015 Haomai Wang <haomaiwang at gmail.com>
+ * Copyright (c) 2011-2014 Stanford University
+ * Copyright (c) 2011 Facebook
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+// This program contains a collection of low-level performance measurements
+// for Ceph, which can be run either individually or altogether.  These
+// tests measure performance in a single stand-alone process, not in a cluster
+// with multiple servers.  Invoke the program like this:
+//
+//     Perf test1 test2 ...
+//
+// test1 and test2 are the names of individual performance measurements to
+// run.  If no test names are provided then all of the performance tests
+// are run.
+//
+// To add a new test:
+// * Write a function that implements the test.  Use existing test functions
+//   as a guideline, and be sure to generate output in the same form as
+//   other tests.
+// * Create a new entry for the test in the #tests table.
+#include <vector>
+#include <sched.h>
+
+#include "acconfig.h"
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif
+
+#include "include/atomic.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/ceph_hash.h"
+#include "include/Spinlock.h"
+#include "common/ceph_argparse.h"
+#include "common/Cycles.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Timer.h"
+#include "msg/async/Event.h"
+#include "global/global_init.h"
+
+#include "test/perf_helper.h"
+
+using namespace ceph;
+
+/**
+ * Ask the operating system to pin the current thread to a given CPU.
+ *
+ * \param cpu
+ *      Indicates the desired CPU and hyperthread; low order 2 bits
+ *      specify CPU, next bit specifies hyperthread.
+ */
+void bind_thread_to_cpu(int cpu)
+{
+#ifdef HAVE_SCHED
+  cpu_set_t set;
+  CPU_ZERO(&set);
+  CPU_SET(cpu, &set);
+  sched_setaffinity(0, sizeof(set), &set);
+#endif
+}
+
+/*
+ * This function just discards its argument. It's used to make it
+ * appear that data is used,  so that the compiler won't optimize
+ * away the code we're trying to measure.
+ *
+ * \param value
+ *      Pointer to arbitrary value; it's discarded.
+ */
+void discard(void* value) {
+  int x = *reinterpret_cast<int*>(value);
+  if (x == 0x43924776) {
+    printf("Value was 0x%x\n", x);
+  }
+}
+
+//----------------------------------------------------------------------
+// Test functions start here
+//----------------------------------------------------------------------
+
+// Measure the cost of atomic_t::compare_and_swap
+double atomic_int_cmp()
+{
+  int count = 1000000;
+  atomic_t value(11);
+  int test = 11;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    value.compare_and_swap(test, test+2);
+    test += 2;
+  }
+  uint64_t stop = Cycles::rdtsc();
+  // printf("Final value: %d\n", value.load());
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of atomic_t::inc
+double atomic_int_inc()
+{
+  int count = 1000000;
+  atomic_t value(11);
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    value.inc();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  // printf("Final value: %d\n", value.load());
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of reading an atomic_t
+double atomic_int_read()
+{
+  int count = 1000000;
+  atomic_t value(11);
+  int total = 0;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    total += value.read();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  // printf("Total: %d\n", total);
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of storing a new value in a atomic_t
+double atomic_int_set()
+{
+  int count = 1000000;
+  atomic_t value(11);
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    value.set(88);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of acquiring and releasing a mutex in the
+// fast case where the mutex is free.
+double mutex_nonblock()
+{
+  int count = 1000000;
+  Mutex m("mutex_nonblock::m");
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    m.Lock();
+    m.Unlock();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of allocating and deallocating a buffer, plus
+// appending (logically) one ptr.
+double buffer_basic()
+{
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  bufferptr ptr("abcdefg", 7);
+  for (int i = 0; i < count; i++) {
+    bufferlist b;
+    b.append(ptr, 0, 5);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+struct DummyBlock {
+  int a, b, c, d;
+  void encode(bufferlist &bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(a, bl);
+    ::encode(b, bl);
+    ::encode(c, bl);
+    ::encode(d, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator &bl) {
+    DECODE_START(1, bl);
+    ::decode(a, bl);
+    ::decode(b, bl);
+    ::decode(c, bl);
+    ::decode(d, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(DummyBlock)
+
+// Measure the cost of encoding and decoding a buffer, plus
+// allocating space for one chunk.
+double buffer_encode_decode()
+{
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    bufferlist b;
+    DummyBlock dummy_block;
+    ::encode(dummy_block, b);
+    bufferlist::iterator iter = b.begin();
+    ::decode(dummy_block, iter);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of allocating and deallocating a buffer, plus
+// copying in a small block.
+double buffer_basic_copy()
+{
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    bufferlist b;
+    b.append("abcdefg", 6);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of making a copy of parts of two ptrs.
+double buffer_copy()
+{
+  int count = 1000000;
+  bufferlist b;
+  b.append("abcde", 5);
+  b.append("01234", 5);
+  char copy[10];
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    b.copy(2, 6, copy);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of allocating new space by extending the
+// bufferlist
+double buffer_encode()
+{
+  int count = 100000;
+  uint64_t total = 0;
+  for (int i = 0; i < count; i++) {
+    bufferlist b;
+    DummyBlock dummy_block;
+    ::encode(dummy_block, b);
+    uint64_t start = Cycles::rdtsc();
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    ::encode(dummy_block, b);
+    total += Cycles::rdtsc() - start;
+  }
+  return Cycles::to_seconds(total)/(count*10);
+}
+
+// Measure the cost of retrieving an object from the beginning of a buffer.
+double buffer_get_contiguous()
+{
+  int count = 1000000;
+  int value = 11;
+  bufferlist b;
+  b.append((char*)&value, sizeof(value));
+  int sum = 0;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    sum += *reinterpret_cast<int*>(b.get_contiguous(0, sizeof(value)));
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of creating an iterator and iterating over 10
+// chunks in a buffer.
+double buffer_iterator()
+{
+  bufferlist b;
+  const char s[] = "abcdefghijklmnopqrstuvwxyz";
+  bufferptr ptr(s, sizeof(s));
+  for (int i = 0; i < 5; i++) {
+    b.append(ptr, i, 5);
+  }
+  int count = 100000;
+  int sum = 0;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    bufferlist::iterator it = b.begin();
+    while (!it.end()) {
+      sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
+      ++it;
+    }
+  }
+  uint64_t stop = Cycles::rdtsc();
+  discard(&sum);
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Implements the CondPingPong test.
+class CondPingPong {
+  Mutex mutex;
+  Cond cond;
+  int prod;
+  int cons;
+  const int count;
+
+  class Consumer : public Thread {
+    CondPingPong *p;
+   public:
+    Consumer(CondPingPong *p): p(p) {}
+    void* entry() {
+      p->consume();
+      return 0;
+    }
+  } consumer;
+
+ public:
+  CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {}
+
+  double run() {
+    consumer.create();
+    uint64_t start = Cycles::rdtsc();
+    produce();
+    uint64_t stop = Cycles::rdtsc();
+    consumer.join();
+    return Cycles::to_seconds(stop - start)/count;
+  }
+
+  void produce() {
+    Mutex::Locker l(mutex);
+    while (cons < count) {
+      while (cons < prod)
+        cond.Wait(mutex);
+      ++prod;
+      cond.Signal();
+    }
+  }
+
+  void consume() {
+    Mutex::Locker l(mutex);
+    while (cons < count) {
+      while (cons == prod)
+        cond.Wait(mutex);
+      ++cons;
+      cond.Signal();
+    }
+  }
+};
+
+// Measure the cost of coordinating between threads using a condition variable.
+double cond_ping_pong()
+{
+  return CondPingPong().run();
+}
+
+// Measure the cost of a 32-bit divide. Divides don't take a constant
+// number of cycles. Values were chosen here semi-randomly to depict a
+// fairly expensive scenario. Someone with fancy ALU knowledge could
+// probably pick worse values.
+double div32()
+{
+#if defined(__i386__) || defined(__x86_64__)
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  // NB: Expect an x86 processor exception if there's overflow.
+  uint32_t numeratorHi = 0xa5a5a5a5U;
+  uint32_t numeratorLo = 0x55aa55aaU;
+  uint32_t divisor = 0xaa55aa55U;
+  uint32_t quotient;
+  uint32_t remainder;
+  for (int i = 0; i < count; i++) {
+    __asm__ __volatile__("div %4" :
+                         "=a"(quotient), "=d"(remainder) :
+                         "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
+                         "cc");
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+#else
+  return -1;
+#endif
+}
+
+// Measure the cost of a 64-bit divide. Divides don't take a constant
+// number of cycles. Values were chosen here semi-randomly to depict a
+// fairly expensive scenario. Someone with fancy ALU knowledge could
+// probably pick worse values.
+double div64()
+{
+#if defined(__x86_64__) || defined(__amd64__)
+  int count = 1000000;
+  // NB: Expect an x86 processor exception if there's overflow.
+  uint64_t start = Cycles::rdtsc();
+  uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
+  uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
+  uint64_t divisor = 0xaa55aa55aa55aa55UL;
+  uint64_t quotient;
+  uint64_t remainder;
+  for (int i = 0; i < count; i++) {
+    __asm__ __volatile__("divq %4" :
+                         "=a"(quotient), "=d"(remainder) :
+                         "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
+                         "cc");
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+#else
+  return -1;
+#endif
+}
+
+// Measure the cost of calling a non-inlined function.
+double function_call()
+{
+  int count = 1000000;
+  uint64_t x = 0;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    x = PerfHelper::plus_one(x);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the minimum cost of EventCenter::process_events, when there are no
+// Pollers and no Timers.
+double eventcenter_poll()
+{
+  int count = 1000000;
+  EventCenter center(g_ceph_context);
+  center.init(1000);
+  center.set_owner(pthread_self());
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    center.process_events(0);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+class CenterWorker : public Thread {
+  CephContext *cct;
+  bool done;
+
+ public:
+  EventCenter center;
+  CenterWorker(CephContext *c): cct(c), done(false), center(c) {
+    center.init(100);
+  }
+  void stop() {
+    done = true;
+    center.wakeup();
+  }
+  void* entry() {
+    center.set_owner(pthread_self());
+    bind_thread_to_cpu(2);
+    while (!done)
+      center.process_events(1000);
+    return 0;
+  }
+};
+
+class CountEvent: public EventCallback {
+  atomic_t *count;
+
+ public:
+  CountEvent(atomic_t *atomic): count(atomic) {}
+  void do_request(int id) {
+    count->dec();
+  }
+};
+
+double eventcenter_dispatch()
+{
+  int count = 100000;
+
+  CenterWorker worker(g_ceph_context);
+  atomic_t flag(1);
+  worker.create();
+  EventCallbackRef count_event(new CountEvent(&flag));
+
+  worker.center.dispatch_event_external(count_event);
+  // Start a new thread and wait for it to ready.
+  while (flag.read())
+    usleep(100);
+
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    flag.set(1);
+    worker.center.dispatch_event_external(count_event);
+    while (flag.read())
+      ;
+  }
+  uint64_t stop = Cycles::rdtsc();
+  worker.stop();
+  worker.join();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of copying a given number of bytes with memcpy.
+double memcpy_shared(size_t size)
+{
+  int count = 1000000;
+  char src[size], dst[size];
+
+  memset(src, 0, sizeof(src));
+
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    memcpy(dst, src, size);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+double memcpy100()
+{
+  return memcpy_shared(100);
+}
+
+double memcpy1000()
+{
+  return memcpy_shared(1000);
+}
+
+double memcpy10000()
+{
+  return memcpy_shared(10000);
+}
+
+// Benchmark rjenkins hashing performance on cached data.
+template <int key_length>
+double ceph_str_hash_rjenkins()
+{
+  int count = 100000;
+  char buf[key_length];
+
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++)
+    ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
+  uint64_t stop = Cycles::rdtsc();
+
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of reading the fine-grain cycle counter.
+double rdtsc_test()
+{
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  uint64_t total = 0;
+  for (int i = 0; i < count; i++) {
+    total += Cycles::rdtsc();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of the Cycles::to_seconds method.
+double perf_cycles_to_seconds()
+{
+  int count = 1000000;
+  double total = 0;
+  uint64_t cycles = 994261;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    total += Cycles::to_seconds(cycles);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  // printf("Result: %.4f\n", total/count);
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of the Cycles::to_nanoseconds method.
+double perf_cycles_to_nanoseconds()
+{
+  int count = 1000000;
+  uint64_t total = 0;
+  uint64_t cycles = 994261;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    total += Cycles::to_nanoseconds(cycles);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  // printf("Result: %lu\n", total/count);
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+
+#ifdef HAVE_SSE
+/**
+ * Prefetch the cache lines containing [object, object + numBytes) into the
+ * processor's caches.
+ * The best docs for this are in the Intel instruction set reference under
+ * PREFETCH.
+ * \param object
+ *      The start of the region of memory to prefetch.
+ * \param num_bytes
+ *      The size of the region of memory to prefetch.
+ */
+static inline void prefetch(const void *object, uint64_t num_bytes)
+{
+    uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
+    const char* p = reinterpret_cast<const char*>(object) - offset;
+    for (uint64_t i = 0; i < offset + num_bytes; i += 64)
+        _mm_prefetch(p + i, _MM_HINT_T0);
+}
+#endif
+
+// Measure the cost of the prefetch instruction.
+double perf_prefetch()
+{
+#ifdef HAVE_SSE
+  uint64_t total_ticks = 0;
+  int count = 10;
+  char buf[16 * 64];
+  uint64_t start, stop;
+
+  for (int i = 0; i < count; i++) {
+    PerfHelper::flush_cache();
+    start = Cycles::rdtsc();
+    prefetch(&buf[576], 64);
+    prefetch(&buf[0],   64);
+    prefetch(&buf[512], 64);
+    prefetch(&buf[960], 64);
+    prefetch(&buf[640], 64);
+    prefetch(&buf[896], 64);
+    prefetch(&buf[256], 64);
+    prefetch(&buf[704], 64);
+    prefetch(&buf[320], 64);
+    prefetch(&buf[384], 64);
+    prefetch(&buf[128], 64);
+    prefetch(&buf[448], 64);
+    prefetch(&buf[768], 64);
+    prefetch(&buf[832], 64);
+    prefetch(&buf[64],  64);
+    prefetch(&buf[192], 64);
+    stop = Cycles::rdtsc();
+    total_ticks += stop - start;
+  }
+  return Cycles::to_seconds(total_ticks) / count / 16;
+#else
+  return -1;
+#endif
+}
+
+#if defined(__x86_64__)
+/**
+ * This function is used to serialize machine instructions so that no
+ * instructions that appear after it in the current thread can run before any
+ * instructions that appear before it. 
+ *
+ * It is useful for putting around rdpmc instructions (to pinpoint cache
+ * misses) as well as before rdtsc instructions, to prevent time pollution from
+ * instructions supposed to be executing before the timer starts.
+ */
+static inline void serialize() {
+    uint32_t eax, ebx, ecx, edx;
+    __asm volatile("cpuid"
+        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+        : "a" (1U));
+}
+#endif
+
+// Measure the cost of cpuid
+double perf_serialize() {
+#if defined(__x86_64__)
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    serialize();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+#else
+  return -1;
+#endif
+}
+
+// Measure the cost of an lfence instruction.
+double lfence()
+{
+#ifdef HAVE_SSE2
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    __asm__ __volatile__("lfence" ::: "memory");
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+#else
+  return -1;
+#endif
+}
+
+// Measure the cost of an sfence instruction.
+double sfence()
+{
+#ifdef HAVE_SSE
+  int count = 1000000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    __asm__ __volatile__("sfence" ::: "memory");
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+#else
+  return -1;
+#endif
+}
+
+// Measure the cost of acquiring and releasing a SpinLock (assuming the
+// lock is initially free).
+double test_spinlock()
+{
+  int count = 1000000;
+  Spinlock lock;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    lock.lock();
+    lock.unlock();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Helper for spawn_thread. This is the main function that the thread executes
+// (intentionally empty).
+class ThreadHelper : public Thread {
+  void *entry() { return 0; }
+};
+
+// Measure the cost of start and joining with a thread.
+double spawn_thread()
+{
+  int count = 10000;
+  ThreadHelper thread;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    thread.create();
+    thread.join();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+class FakeContext : public Context {
+ public:
+  virtual void finish(int r) {}
+};
+
+// Measure the cost of starting and stopping a Dispatch::Timer.
+double perf_timer()
+{
+  int count = 1000000;
+  Mutex lock("perf_timer::lock");
+  SafeTimer timer(g_ceph_context, lock);
+  FakeContext **c = new FakeContext*[count];
+  for (int i = 0; i < count; i++) {
+    c[i] = new FakeContext();
+  }
+  uint64_t start = Cycles::rdtsc();
+  Mutex::Locker l(lock);
+  for (int i = 0; i < count; i++) {
+    timer.add_event_after(12345, c[i]);
+    timer.cancel_event(c[i]);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  delete[] c;
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of throwing and catching an int. This uses an integer as
+// the value thrown, which is presumably as fast as possible.
+double throw_int()
+{
+  int count = 10000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    try {
+      throw 0;
+    } catch (int) { // NOLINT
+      // pass
+    }
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of throwing and catching an int from a function call.
+double throw_int_call()
+{
+  int count = 10000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    try {
+      PerfHelper::throw_int();
+    } catch (int) { // NOLINT
+      // pass
+    }
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of throwing and catching an Exception. This uses an actual
+// exception as the value thrown, which may be slower than throwInt.
+double throw_exception()
+{
+  int count = 10000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    try {
+      throw buffer::end_of_buffer();
+    } catch (const buffer::end_of_buffer&) {
+      // pass
+    }
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of throwing and catching an Exception from a function call.
+double throw_exception_call()
+{
+  int count = 10000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    try {
+      PerfHelper::throw_end_of_buffer();
+    } catch (const buffer::end_of_buffer&) {
+      // pass
+    }
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// Measure the cost of pushing a new element on a std::vector, copying
+// from the end to an internal element, and popping the end element.
+double vector_push_pop()
+{
+  int count = 100000;
+  std::vector<int> vector;
+  vector.push_back(1);
+  vector.push_back(2);
+  vector.push_back(3);
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    vector.push_back(i);
+    vector.push_back(i+1);
+    vector.push_back(i+2);
+    vector[2] = vector.back();
+    vector.pop_back();
+    vector[0] = vector.back();
+    vector.pop_back();
+    vector[1] = vector.back();
+    vector.pop_back();
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/(count*3);
+}
+
+// Measure the cost of ceph_clock_now
+double perf_ceph_clock_now()
+{
+  int count = 100000;
+  uint64_t start = Cycles::rdtsc();
+  for (int i = 0; i < count; i++) {
+    ceph_clock_now(g_ceph_context);
+  }
+  uint64_t stop = Cycles::rdtsc();
+  return Cycles::to_seconds(stop - start)/count;
+}
+
+// The following struct and table define each performance test in terms of
+// a string name and a function that implements the test.
+struct TestInfo {
+  const char* name;             // Name of the performance test; this is
+                                // what gets typed on the command line to
+                                // run the test.
+  double (*func)();             // Function that implements the test;
+                                // returns the time (in seconds) for each
+                                // iteration of that test.
+  const char *description;      // Short description of this test (not more
+                                // than about 40 characters, so the entire
+                                // test output fits on a single line).
+};
+TestInfo tests[] = {
+  {"atomic_int_cmp", atomic_int_cmp,
+    "atomic_t::compare_and_swap"},
+  {"atomic_int_inc", atomic_int_inc,
+    "atomic_t::inc"},
+  {"atomic_int_read", atomic_int_read,
+    "atomic_t::read"},
+  {"atomic_int_set", atomic_int_set,
+    "atomic_t::set"},
+  {"mutex_nonblock", mutex_nonblock,
+    "Mutex lock/unlock (no blocking)"},
+  {"buffer_basic", buffer_basic,
+    "buffer create, add one ptr, delete"},
+  {"buffer_encode_decode", buffer_encode_decode,
+    "buffer create, encode/decode object, delete"},
+  {"buffer_basic_copy", buffer_basic_copy,
+    "buffer create, copy small block, delete"},
+  {"buffer_copy", buffer_copy,
+    "copy out 2 small ptrs from buffer"},
+  {"buffer_encode10", buffer_encode,
+    "buffer encoding 10 structures onto existing ptr"},
+  {"buffer_get_contiguous", buffer_get_contiguous,
+    "Buffer::get_contiguous"},
+  {"buffer_iterator", buffer_iterator,
+    "iterate over buffer with 5 ptrs"},
+  {"cond_ping_pong", cond_ping_pong,
+    "condition variable round-trip"},
+  {"div32", div32,
+    "32-bit integer division instruction"},
+  {"div64", div64,
+    "64-bit integer division instruction"},
+  {"function_call", function_call,
+    "Call a function that has not been inlined"},
+  {"eventcenter_poll", eventcenter_poll,
+    "EventCenter::process_events (no timers or events)"},
+  {"eventcenter_dispatch", eventcenter_dispatch,
+    "EventCenter::dispatch_event_external latency"},
+  {"memcpy100", memcpy100,
+    "Copy 100 bytes with memcpy"},
+  {"memcpy1000", memcpy1000,
+    "Copy 1000 bytes with memcpy"},
+  {"memcpy10000", memcpy10000,
+    "Copy 10000 bytes with memcpy"},
+  {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
+    "rjenkins hash on 16 byte of data"},
+  {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
+    "rjenkins hash on 256 bytes of data"},
+  {"rdtsc", rdtsc_test,
+    "Read the fine-grain cycle counter"},
+  {"cycles_to_seconds", perf_cycles_to_seconds,
+    "Convert a rdtsc result to (double) seconds"},
+  {"cycles_to_nanoseconds", perf_cycles_to_nanoseconds,
+    "Convert a rdtsc result to (uint64_t) nanoseconds"},
+  {"prefetch", perf_prefetch,
+    "Prefetch instruction"},
+  {"serialize", perf_serialize,
+    "serialize instruction"},
+  {"lfence", lfence,
+    "Lfence instruction"},
+  {"sfence", sfence,
+    "Sfence instruction"},
+  {"spin_lock", test_spinlock,
+    "Acquire/release SpinLock"},
+  {"spawn_thread", spawn_thread,
+    "Start and stop a thread"},
+  {"perf_timer", perf_timer,
+    "Insert and cancel a SafeTimer"},
+  {"throw_int", throw_int,
+    "Throw an int"},
+  {"throw_int_call", throw_int_call,
+    "Throw an int in a function call"},
+  {"throw_exception", throw_exception,
+    "Throw an Exception"},
+  {"throw_exception_call", throw_exception_call,
+    "Throw an Exception in a function call"},
+  {"vector_push_pop", vector_push_pop,
+    "Push and pop a std::vector"},
+  {"ceph_clock_now", perf_ceph_clock_now,
+   "ceph_clock_now function"},
+};
+
+/**
+ * Runs a particular test and prints a one-line result message.
+ *
+ * \param info
+ *      Describes the test to run.
+ */
+void run_test(TestInfo& info)
+{
+  double secs = info.func();
+  int width = printf("%-24s ", info.name);
+  if (secs == -1) {
+    width += printf(" architecture nonsupport ");
+  } else if (secs < 1.0e-06) {
+    width += printf("%8.2fns", 1e09*secs);
+  } else if (secs < 1.0e-03) {
+    width += printf("%8.2fus", 1e06*secs);
+  } else if (secs < 1.0) {
+    width += printf("%8.2fms", 1e03*secs);
+  } else {
+    width += printf("%8.2fs", secs);
+  }
+  printf("%*s %s\n", 32-width, "", info.description);
+}
+
+int main(int argc, char *argv[])
+{
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  bind_thread_to_cpu(3);
+  if (argc == 1) {
+    // No test names specified; run all tests.
+    for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
+      run_test(tests[i]);
+    }
+  } else {
+    // Run only the tests that were specified on the command line.
+    for (int i = 1; i < argc; i++) {
+      bool found_test = false;
+      for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
+        if (strcmp(argv[i], tests[j].name) == 0) {
+          found_test = true;
+          run_test(tests[j]);
+          break;
+        }
+      }
+      if (!found_test) {
+        int width = printf("%-24s ??", argv[i]);
+        printf("%*s No such test\n", 32-width, "");
+      }
+    }
+  }
+}
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
index eb79323..b0f608d 100755
--- a/src/test/pybind/test_ceph_argparse.py
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -1,6 +1,6 @@
 #!/usr/bin/nosetests --nocapture
-# -*- mode:python; tab-width:4; indent-tabs-mode:t -*-
-# vim: ts=4 sw=4 smarttab expandtab
+# -*- mode:python; tab-width:4; indent-tabs-mode:t; coding:utf-8 -*-
+# vim: ts=4 sw=4 smarttab expandtab fileencoding=utf-8
 #
 # Ceph - scalable distributed file system
 #
@@ -86,6 +86,15 @@ class TestArgparse:
                                                     'toomany']))
 
 
+class TestBasic:
+
+	def test_non_ascii_in_non_options(self):
+		# unicode() is not able to convert this str parameter into unicode
+		# using the default encoding 'ascii'. and validate_command() should
+		# not choke on it.
+		assert_is_none(validate_command(sigdict, ['章鱼和鱿鱼']))
+
+
 class TestPG(TestArgparse):
 
     def test_stat(self):
@@ -390,17 +399,13 @@ class TestMDS(TestArgparse):
         self.check_1_string_arg('mds', 'fail')
 
     def test_rm(self):
+        # Valid: single GID argument present
+        self.assert_valid_command(['mds', 'rm', '1'])
+
+        # Missing GID arg: invalid
         assert_equal({}, validate_command(sigdict, ['mds', 'rm']))
-        assert_equal({}, validate_command(sigdict, ['mds', 'rm', '1']))
-        for name in ('osd', 'mon', 'client', 'mds'):
-            self.assert_valid_command(['mds', 'rm', '1', name + '.42'])
-            assert_equal({}, validate_command(sigdict, ['mds', 'rm',
-                                                        '-1', name + '.42']))
-            assert_equal({}, validate_command(sigdict, ['mds', 'rm',
-                                                        '-1', name]))
-            assert_equal({}, validate_command(sigdict, ['mds', 'rm',
-                                                        '1', name + '.42',
-                                                        'toomany']))
+        # Extra arg: invalid
+        assert_equal({}, validate_command(sigdict, ['mds', 'rm', '1', 'mds.42']))
 
     def test_rmfailed(self):
         self.check_1_natural_arg('mds', 'rmfailed')
@@ -548,14 +553,15 @@ class TestOSD(TestArgparse):
 
     def test_map(self):
         self.assert_valid_command(['osd', 'map', 'poolname', 'objectname'])
+        self.assert_valid_command(['osd', 'map', 'poolname', 'objectname', 'nspace'])
         assert_equal({}, validate_command(sigdict, ['osd', 'map']))
         assert_equal({}, validate_command(sigdict, ['osd', 'map', 'poolname']))
         assert_equal({}, validate_command(sigdict, ['osd', 'map',
-                                                    'poolname', 'objectname',
+                                                    'poolname', 'objectname', 'nspace',
                                                     'toomany']))
 
     def test_metadata(self):
-        self.check_1_natural_arg('osd', 'metadata')
+        self.check_0_or_1_natural_arg('osd', 'metadata')
 
     def test_scrub(self):
         self.check_1_string_arg('osd', 'scrub')
@@ -600,7 +606,7 @@ class TestOSD(TestArgparse):
         self.assert_valid_command(['osd', 'crush', 'dump'])
         assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
         assert_equal({}, validate_command(sigdict, ['osd', 'crush',
-                                                    'dump', 
+                                                    'dump',
                                                     'toomany']))
 
     def test_setcrushmap(self):
@@ -985,7 +991,7 @@ class TestOSD(TestArgparse):
         assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
                                                     'poolname',
                                                     '128', '128',
-                                                    'erasure', '^^^', 
+                                                    'erasure', '^^^',
 													'ruleset']))
         assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
                                                     'poolname',
@@ -1029,7 +1035,7 @@ class TestOSD(TestArgparse):
 
     def test_pool_get(self):
         for var in ('size', 'min_size', 'crash_replay_interval',
-                    'pg_num', 'pgp_num', 'crush_ruleset', 'auid'):
+                    'pg_num', 'pgp_num', 'crush_ruleset', 'auid', 'fast_read'):
             self.assert_valid_command(['osd', 'pool', 'get', 'poolname', var])
         assert_equal({}, validate_command(sigdict, ['osd', 'pool']))
         assert_equal({}, validate_command(sigdict, ['osd', 'pool',
@@ -1046,7 +1052,7 @@ class TestOSD(TestArgparse):
     def test_pool_set(self):
         for var in ('size', 'min_size', 'crash_replay_interval',
                     'pg_num', 'pgp_num', 'crush_ruleset',
-                    'hashpspool', 'auid'):
+                    'hashpspool', 'auid', 'fast_read'):
             self.assert_valid_command(['osd', 'pool',
                                        'set', 'poolname', var, 'value'])
         assert_equal({}, validate_command(sigdict, ['osd', 'pool',
@@ -1154,7 +1160,7 @@ class TestConfigKey(TestArgparse):
     def test_list(self):
         self.check_no_arg('config-key', 'list')
 # Local Variables:
-# compile-command: "cd ../.. ; make -j4 && 
+# compile-command: "cd ../.. ; make -j4 &&
 #  PYTHONPATH=pybind nosetests --stop \
 #  test/pybind/test_ceph_argparse.py # test_ceph_argparse.py:TestOSD.test_rm"
 # End:
diff --git a/src/test/pybind/test_ceph_daemon.py b/src/test/pybind/test_ceph_daemon.py
new file mode 100755
index 0000000..a95c8fc
--- /dev/null
+++ b/src/test/pybind/test_ceph_daemon.py
@@ -0,0 +1,44 @@
+#!/usr/bin/nosetests --nocapture
+# -*- mode:python; tab-width:4; indent-tabs-mode:t -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+"""
+Copyright (C) 2015 Red Hat
+
+This is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License version 2, as published by the Free Software
+Foundation.  See file COPYING.
+"""
+
+from StringIO import StringIO
+
+from unittest import TestCase
+
+from ceph_daemon import DaemonWatcher
+
+
+class TestDaemonWatcher(TestCase):
+    def test_format(self):
+        dw = DaemonWatcher(None)
+
+        self.assertEqual(dw.format_dimless(1, 4), "  1 ")
+        self.assertEqual(dw.format_dimless(1000, 4), "1.0k")
+        self.assertEqual(dw.format_dimless(3.14159, 4), "  3 ")
+        self.assertEqual(dw.format_dimless(1400000, 4), "1.4M")
+
+    def test_col_width(self):
+        dw = DaemonWatcher(None)
+
+        self.assertEqual(dw.col_width("foo"), 4)
+        self.assertEqual(dw.col_width("foobar"), 6)
+
+    def test_supports_color(self):
+        dw = DaemonWatcher(None)
+        # Can't count on having a tty available during tests, so only test the false case
+        self.assertEqual(dw.supports_color(StringIO()), False)
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 &&
+#  PYTHONPATH=pybind nosetests --stop \
+#  test/pybind/test_ceph_daemon.py
+# End:
diff --git a/src/test/python/brag-client/setup.py b/src/test/python/brag-client/setup.py
new file mode 100644
index 0000000..71395c2
--- /dev/null
+++ b/src/test/python/brag-client/setup.py
@@ -0,0 +1,31 @@
+import os
+from setuptools import setup, find_packages
+
+# link ceph-brag client script here so we can "install" it
+current_dir = os.path.abspath(os.path.dirname(__file__))
+src_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
+script_path = os.path.join(src_dir, 'brag/client/ceph-brag')
+
+
+def link_target(source, destination):
+    if not os.path.exists(destination):
+        try:
+            os.symlink(source, destination)
+        except (IOError, OSError) as error:
+            print 'Ignoring linking of target: %s' % str(error)
+
+link_target(script_path, 'ceph_brag.py')
+
+setup(
+    name='ceph_brag',
+    version='0.1',
+    description='',
+    author='',
+    author_email='',
+    install_requires=[
+        "requests",
+    ],
+    zip_safe=False,
+    packages=find_packages(),
+    #packages=find_packages(exclude=['ez_setup'])
+)
diff --git a/src/test/python/brag-client/tests/test_ceph_brag.py b/src/test/python/brag-client/tests/test_ceph_brag.py
new file mode 100644
index 0000000..2a584f5
--- /dev/null
+++ b/src/test/python/brag-client/tests/test_ceph_brag.py
@@ -0,0 +1,10 @@
+import ceph_brag
+
+# This file tests nothing (yet) except for being able to import ceph_brag
+# correctly and thus ensuring somewhat that it will work under different Python
+# versions. You must write unittests here so that code has adequate coverage.
+
+class TestCephBrag(object):
+
+    def test_basic(self):
+        assert True
diff --git a/src/test/python/brag-client/tox.ini b/src/test/python/brag-client/tox.ini
new file mode 100644
index 0000000..c94e0d2f
--- /dev/null
+++ b/src/test/python/brag-client/tox.ini
@@ -0,0 +1,16 @@
+[tox]
+envlist = py26, py27, flake8
+skipsdist=True
+
+[testenv]
+deps=
+  pytest
+
+commands=
+  python setup.py develop
+  py.test -v
+
+[testenv:flake8]
+deps=
+  flake8
+commands=flake8 --select=F ceph_brag.py
diff --git a/src/test/python/ceph-disk/setup.py b/src/test/python/ceph-disk/setup.py
new file mode 100644
index 0000000..91652ba
--- /dev/null
+++ b/src/test/python/ceph-disk/setup.py
@@ -0,0 +1,27 @@
+import os
+from setuptools import setup, find_packages
+
+# link ceph-disk script here so we can "install" it
+current_dir = os.path.abspath(os.path.dirname(__file__))
+src_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
+script_path = os.path.join(src_dir, 'ceph-disk')
+
+
+def link_target(source, destination):
+    if not os.path.exists(destination):
+        try:
+            os.symlink(source, destination)
+        except (IOError, OSError) as error:
+            print 'Ignoring linking of target: %s' % str(error)
+
+link_target(script_path, 'ceph_disk.py')
+
+setup(
+    name='ceph_disk',
+    version='0.1',
+    description='',
+    author='',
+    author_email='',
+    zip_safe=False,
+    packages=find_packages(),
+)
diff --git a/src/test/python/ceph-disk/tests/test_ceph_disk.py b/src/test/python/ceph-disk/tests/test_ceph_disk.py
new file mode 100644
index 0000000..a150dd3
--- /dev/null
+++ b/src/test/python/ceph-disk/tests/test_ceph_disk.py
@@ -0,0 +1,640 @@
+from mock import patch, DEFAULT, Mock
+import argparse
+import pytest
+import ceph_disk
+
+def fail_to_mount(dev, fstype, options):
+    raise ceph_disk.MountError(dev + " mount fail")
+
+class TestCephDisk(object):
+
+    def setup_class(self):
+        ceph_disk.setup_logging(verbose=True, log_stdout=False)
+
+    def test_main_list_json(self, capsys):
+        args = ceph_disk.parse_args(['list', '--format', 'json'])
+        with patch.multiple(
+                ceph_disk,
+                list_devices=lambda args: {}):
+            ceph_disk.main_list(args)
+            out, err = capsys.readouterr()
+            assert '{}\n' == out
+
+    def test_main_list_plain(self, capsys):
+        args = ceph_disk.parse_args(['list'])
+        with patch.multiple(
+                ceph_disk,
+                list_devices=lambda args: {}):
+            ceph_disk.main_list(args)
+            out, err = capsys.readouterr()
+            assert '' == out
+
+    def test_list_format_more_osd_info_plain(self):
+        dev = {
+            'ceph_fsid': 'UUID',
+            'cluster': 'ceph',
+            'whoami': '1234',
+            'journal_dev': '/dev/Xda2',
+        }
+        out = ceph_disk.list_format_more_osd_info_plain(dev)
+        assert dev['cluster'] in " ".join(out)
+        assert dev['journal_dev'] in " ".join(out)
+        assert dev['whoami'] in " ".join(out)
+
+        dev = {
+            'ceph_fsid': 'UUID',
+            'whoami': '1234',
+            'journal_dev': '/dev/Xda2',
+        }
+        out = ceph_disk.list_format_more_osd_info_plain(dev)
+        assert 'unknown cluster' in " ".join(out)
+
+    def test_list_format_plain(self):
+        payload = [{
+            'path': '/dev/Xda',
+            'ptype': 'unknown',
+            'type': 'other',
+            'mount': '/somewhere',
+        }]
+        out = ceph_disk.list_format_plain(payload)
+        assert payload[0]['path'] in out
+        assert payload[0]['type'] in out
+        assert payload[0]['mount'] in out
+
+        payload = [{
+            'path': '/dev/Xda1',
+            'ptype': 'unknown',
+            'type': 'swap',
+        }]
+        out = ceph_disk.list_format_plain(payload)
+        assert payload[0]['path'] in out
+        assert payload[0]['type'] in out
+
+        payload = [{
+            'path': '/dev/Xda',
+            'partitions': [
+                {
+                    'dmcrypt': {},
+                    'ptype': 'whatever',
+                    'is_partition': True,
+                    'fs_type': 'ext4',
+                    'path': '/dev/Xda1',
+                    'mounted': '/somewhere',
+                    'type': 'other',
+                }
+            ],
+        }]
+        out = ceph_disk.list_format_plain(payload)
+        assert payload[0]['path'] in out
+        assert payload[0]['partitions'][0]['path'] in out
+
+    def test_list_format_dev_plain(dev):
+        #
+        # data
+        #
+        dev = {
+            'path': '/dev/Xda1',
+            'ptype': ceph_disk.OSD_UUID,
+            'state': 'prepared',
+            'whoami': '1234',
+        }
+        out = ceph_disk.list_format_dev_plain(dev)
+        assert 'data' in out
+        assert dev['whoami'] in out
+        assert dev['state'] in out
+        #
+        # journal
+        #
+        dev = {
+            'path': '/dev/Xda2',
+            'ptype': ceph_disk.JOURNAL_UUID,
+            'journal_for': '/dev/Xda1',
+        }
+        out = ceph_disk.list_format_dev_plain(dev)
+        assert 'journal' in out
+        assert dev['journal_for'] in out
+
+        #
+        # dmcrypt data
+        #
+        ptype2type = {
+            ceph_disk.DMCRYPT_OSD_UUID: 'plain',
+            ceph_disk.DMCRYPT_LUKS_OSD_UUID: 'LUKS',
+        }
+        for (ptype, type) in ptype2type.iteritems():
+            for holders in ((), ("dm_0",), ("dm_0", "dm_1")):
+                devices = [{
+                    'path': '/dev/dm_0',
+                    'whoami': '1234',
+                }]
+                dev = {
+                    'dmcrypt': {
+                        'holders': holders,
+                        'type': type,
+                    },
+                    'path': '/dev/Xda1',
+                    'ptype': ptype,
+                    'state': 'prepared',
+                }
+                out = ceph_disk.list_format_dev_plain(dev, devices)
+                assert 'data' in out
+                assert 'dmcrypt' in out
+                assert type in out
+                if len(holders) == 1:
+                    assert devices[0]['whoami'] in out
+                for holder in holders:
+                    assert holder in out
+
+        #
+        # dmcrypt journal
+        #
+        ptype2type = {
+            ceph_disk.DMCRYPT_JOURNAL_UUID: 'plain',
+            ceph_disk.DMCRYPT_LUKS_JOURNAL_UUID: 'LUKS',
+        }
+        for (ptype, type) in ptype2type.iteritems():
+            for holders in ((), ("dm_0",)):
+                dev = {
+                    'path': '/dev/Xda2',
+                    'ptype': ptype,
+                    'journal_for': '/dev/Xda1',
+                    'dmcrypt': {
+                        'holders': holders,
+                        'type': type,
+                    },
+                }
+                out = ceph_disk.list_format_dev_plain(dev, devices)
+                assert 'journal' in out
+                assert 'dmcrypt' in out
+                assert type in out
+                assert dev['journal_for'] in out
+                if len(holders) == 1:
+                    assert holders[0] in out
+
+    def test_list_dev_osd(self):
+        dev = "Xda"
+        mount_path = '/mount/path'
+        fs_type = 'ext4'
+        cluster = 'ceph'
+        uuid_map = {}
+        def more_osd_info(path, uuid_map, desc):
+            desc['cluster'] = cluster
+        #
+        # mounted therefore active
+        #
+        with patch.multiple(
+                ceph_disk,
+                is_mounted=lambda dev: mount_path,
+                get_dev_fs=lambda dev: fs_type,
+                more_osd_info=more_osd_info
+        ):
+            desc = {}
+            ceph_disk.list_dev_osd(dev, uuid_map, desc)
+            assert {'cluster': 'ceph',
+                    'fs_type': 'ext4',
+                    'mount': '/mount/path',
+                    'state': 'active'} == desc
+        #
+        # not mounted and cannot mount: unprepared
+        #
+        mount_path = None
+        with patch.multiple(
+                ceph_disk,
+                is_mounted=lambda dev: mount_path,
+                get_dev_fs=lambda dev: fs_type,
+                mount=fail_to_mount,
+                more_osd_info=more_osd_info
+        ):
+            desc = {}
+            ceph_disk.list_dev_osd(dev, uuid_map, desc)
+            assert {'fs_type': 'ext4',
+                    'mount': mount_path,
+                    'state': 'unprepared'} == desc
+        #
+        # not mounted and magic found: prepared
+        #
+        def get_oneliner(path, what):
+            if what == 'magic':
+                return ceph_disk.CEPH_OSD_ONDISK_MAGIC
+            else:
+                raise Exception('unknown ' + what)
+        with patch.multiple(
+                ceph_disk,
+                is_mounted=lambda dev: mount_path,
+                get_dev_fs=lambda dev: fs_type,
+                mount=DEFAULT,
+                unmount=DEFAULT,
+                get_oneliner=get_oneliner,
+                more_osd_info=more_osd_info
+        ):
+            desc = {}
+            ceph_disk.list_dev_osd(dev, uuid_map, desc)
+            assert {'cluster': 'ceph',
+                    'fs_type': 'ext4',
+                    'mount': mount_path,
+                    'magic': ceph_disk.CEPH_OSD_ONDISK_MAGIC,
+                    'state': 'prepared'} == desc
+
+    def test_list_all_partitions(self):
+        partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        disk = "Xda"
+        partition = "Xda1"
+
+        with patch(
+                'ceph_disk.os',
+                listdir=lambda path: [disk],
+        ), patch.multiple(
+            ceph_disk,
+            list_partitions=lambda dev: [partition],
+        ):
+                assert {disk: [partition]} == ceph_disk.list_all_partitions([])
+
+        with patch.multiple(
+                ceph_disk,
+                list_partitions=lambda dev: [partition],
+        ):
+                assert {disk: [partition]} == ceph_disk.list_all_partitions([disk])
+
+    def test_list_data(self):
+        args = ceph_disk.parse_args(['list'])
+        #
+        # a data partition that fails to mount is silently
+        # ignored
+        #
+        partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        disk = "Xda"
+        partition = "Xda1"
+        fs_type = "ext4"
+
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [partition] },
+                get_partition_uuid=lambda dev: partition_uuid,
+                get_partition_type=lambda dev: ceph_disk.OSD_UUID,
+                get_dev_fs=lambda dev: fs_type,
+                mount=fail_to_mount,
+                unmount=DEFAULT,
+                is_partition=lambda dev: True,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'partitions': [{
+                           'dmcrypt': {},
+                           'fs_type': fs_type,
+                           'is_partition': True,
+                           'mount': None,
+                           'path': '/dev/' + partition,
+                           'ptype': ceph_disk.OSD_UUID,
+                           'state': 'unprepared',
+                           'type': 'data',
+                           'uuid': partition_uuid,
+                       }]}]
+            assert expect == ceph_disk.list_devices(args)
+
+    def test_list_dmcrypt_data(self):
+        args = ceph_disk.parse_args(['list'])
+        partition_type2type = {
+            ceph_disk.DMCRYPT_OSD_UUID: 'plain',
+            ceph_disk.DMCRYPT_LUKS_OSD_UUID: 'LUKS',
+        }
+        for (partition_type, type) in partition_type2type.iteritems():
+            #
+            # dmcrypt data partition with one holder
+            #
+            partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+            disk = "Xda"
+            partition = "Xda1"
+            holders = ["dm-0"]
+            with patch.multiple(
+                    ceph_disk,
+                    is_held=lambda dev: holders,
+                    list_all_partitions=lambda names: { disk: [partition] },
+                    get_partition_uuid=lambda dev: partition_uuid,
+                    get_partition_type=lambda dev: partition_type,
+                    is_partition=lambda dev: True,
+                    ):
+                expect = [{'path': '/dev/' + disk,
+                           'partitions': [{
+                               'dmcrypt': {
+                                   'holders': holders,
+                                   'type': type,
+                               },
+                               'fs_type': None,
+                               'is_partition': True,
+                               'mount': None,
+                               'path': '/dev/' + partition,
+                               'ptype': partition_type,
+                               'state': 'unprepared',
+                               'type': 'data',
+                               'uuid': partition_uuid,
+                           }]}]
+                assert expect == ceph_disk.list_devices(args)
+            #
+            # dmcrypt data partition with two holders
+            #
+            partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+            disk = "Xda"
+            partition = "Xda1"
+            holders = ["dm-0","dm-1"]
+            with patch.multiple(
+                    ceph_disk,
+                    is_held=lambda dev: holders,
+                    list_all_partitions=lambda names: { disk: [partition] },
+                    get_partition_uuid=lambda dev: partition_uuid,
+                    get_partition_type=lambda dev: partition_type,
+                    is_partition=lambda dev: True,
+                    ):
+                expect = [{'path': '/dev/' + disk,
+                           'partitions': [{
+                               'dmcrypt': {
+                                   'holders': holders,
+                                   'type': type,
+                               },
+                               'is_partition': True,
+                               'path': '/dev/' + partition,
+                               'ptype': partition_type,
+                               'type': 'data',
+                               'uuid': partition_uuid,
+                           }]}]
+                assert expect == ceph_disk.list_devices(args)
+
+    def test_list_multipath(self):
+        args = ceph_disk.parse_args(['list'])
+        #
+        # multipath data partition
+        #
+        partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        disk = "Xda"
+        partition = "Xda1"
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [partition] },
+                get_partition_uuid=lambda dev: partition_uuid,
+                get_partition_type=lambda dev: ceph_disk.MPATH_OSD_UUID,
+                is_partition=lambda dev: True,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'partitions': [{
+                           'dmcrypt': {},
+                           'fs_type': None,
+                           'is_partition': True,
+                           'mount': None,
+                           'multipath': True,
+                           'path': '/dev/' + partition,
+                           'ptype': ceph_disk.MPATH_OSD_UUID,
+                           'state': 'unprepared',
+                           'type': 'data',
+                           'uuid': partition_uuid,
+                       }]}]
+            assert expect == ceph_disk.list_devices(args)
+        #
+        # multipath journal partition
+        #
+        journal_partition_uuid = "2cc40457-259e-4542-b029-785c7cc37871"
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [partition] },
+                get_partition_uuid=lambda dev: journal_partition_uuid,
+                get_partition_type=lambda dev: ceph_disk.MPATH_JOURNAL_UUID,
+                is_partition=lambda dev: True,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'partitions': [{
+                           'dmcrypt': {},
+                           'is_partition': True,
+                           'multipath': True,
+                           'path': '/dev/' + partition,
+                           'ptype': ceph_disk.MPATH_JOURNAL_UUID,
+                           'type': 'journal',
+                           'uuid': journal_partition_uuid,
+                       }]}]
+            assert expect == ceph_disk.list_devices(args)
+
+    def test_list_dmcrypt(self):
+        self.list(ceph_disk.DMCRYPT_OSD_UUID, ceph_disk.DMCRYPT_JOURNAL_UUID)
+        self.list(ceph_disk.DMCRYPT_LUKS_OSD_UUID, ceph_disk.DMCRYPT_LUKS_JOURNAL_UUID)
+
+    def test_list_normal(self):
+        self.list(ceph_disk.OSD_UUID, ceph_disk.JOURNAL_UUID)
+
+    def list(self, data_ptype, journal_ptype):
+        args = ceph_disk.parse_args(['--verbose', 'list'])
+        #
+        # a single disk has a data partition and a journal
+        # partition and the osd is active
+        #
+        data_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        disk = "Xda"
+        data = "Xda1"
+        data_holder = "dm-0"
+        journal = "Xda2"
+        journal_holder = "dm-0"
+        mount_path = '/mount/path'
+        fs_type = 'ext4'
+        journal_uuid = "7ad5e65a-0ca5-40e4-a896-62a74ca61c55"
+        ceph_fsid = "60a2ef70-d99b-4b9b-a83c-8a86e5e60091"
+        osd_id = '1234'
+        def get_oneliner(path, what):
+            if what == 'journal_uuid':
+                return journal_uuid
+            elif what == 'ceph_fsid':
+                return ceph_fsid
+            elif what == 'whoami':
+                return osd_id
+            else:
+                raise Exception('unknown ' + what)
+        def get_partition_uuid(dev):
+            if dev == '/dev/' + data:
+                return data_uuid
+            elif dev == '/dev/' + journal:
+                return journal_uuid
+            else:
+                raise Exception('unknown ' + dev)
+        def get_partition_type(dev):
+            if (dev == '/dev/' + data or
+                dev == '/dev/' + data_holder):
+                return data_ptype
+            elif (dev == '/dev/' + journal or
+                  dev == '/dev/' + journal_holder):
+                return journal_ptype
+            else:
+                raise Exception('unknown ' + dev)
+        cluster = 'ceph'
+        if data_ptype == ceph_disk.OSD_UUID:
+            data_dmcrypt = {}
+        elif data_ptype == ceph_disk.DMCRYPT_OSD_UUID:
+            data_dmcrypt = {
+                'type': 'plain',
+                'holders': [data_holder],
+            }
+        elif data_ptype == ceph_disk.DMCRYPT_LUKS_OSD_UUID:
+            data_dmcrypt = {
+                'type': 'LUKS',
+                'holders': [data_holder],
+            }
+        else:
+            raise Exception('unknown ' + data_ptype)
+
+        if journal_ptype == ceph_disk.JOURNAL_UUID:
+            journal_dmcrypt = {}
+        elif journal_ptype == ceph_disk.DMCRYPT_JOURNAL_UUID:
+            journal_dmcrypt = {
+                'type': 'plain',
+                'holders': [journal_holder],
+            }
+        elif journal_ptype == ceph_disk.DMCRYPT_LUKS_JOURNAL_UUID:
+            journal_dmcrypt = {
+                'type': 'LUKS',
+                'holders': [journal_holder],
+            }
+        else:
+            raise Exception('unknown ' + journal_ptype)
+
+        if data_dmcrypt:
+            def is_held(dev):
+                if dev == '/dev/' + data:
+                    return [data_holder]
+                elif dev == '/dev/' + journal:
+                    return [journal_holder]
+                else:
+                    raise Exception('unknown ' + dev)
+        else:
+            def is_held(dev):
+                return []
+
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [data, journal] },
+                get_dev_fs=lambda dev: fs_type,
+                is_mounted=lambda dev: mount_path,
+                get_partition_uuid=get_partition_uuid,
+                get_partition_type=get_partition_type,
+                find_cluster_by_uuid=lambda ceph_fsid: cluster,
+                is_partition=lambda dev: True,
+                mount=DEFAULT,
+                unmount=DEFAULT,
+                get_oneliner=get_oneliner,
+                is_held=is_held,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'partitions': [{
+                           'ceph_fsid': ceph_fsid,
+                           'cluster': cluster,
+                           'dmcrypt': data_dmcrypt,
+                           'fs_type': fs_type,
+                           'is_partition': True,
+                           'journal_dev': '/dev/' + journal,
+                           'journal_uuid': journal_uuid,
+                           'mount': mount_path,
+                           'path': '/dev/' + data,
+                           'ptype': data_ptype,
+                           'state': 'active',
+                           'type': 'data',
+                           'whoami': osd_id,
+                           'uuid': data_uuid,
+                       }, {
+                           'dmcrypt': journal_dmcrypt,
+                           'is_partition': True,
+                           'journal_for': '/dev/' + data,
+                           'path': '/dev/' + journal,
+                           'ptype': journal_ptype,
+                           'type': 'journal',
+                           'uuid': journal_uuid,
+                       },
+                                  ]}]
+            assert expect == ceph_disk.list_devices(args)
+
+    def test_list_other(self):
+        args = ceph_disk.parse_args(['list'])
+        #
+        # not swap, unknown fs type, not mounted, with uuid
+        #
+        partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        partition_type = "e51adfb9-e9fd-4718-9fc1-7a0cb03ea3f4"
+        disk = "Xda"
+        partition = "Xda1"
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [partition] },
+                get_partition_uuid=lambda dev: partition_uuid,
+                get_partition_type=lambda dev: partition_type,
+                is_partition=lambda dev: True,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'partitions': [{'dmcrypt': {},
+                                       'is_partition': True,
+                                       'path': '/dev/' + partition,
+                                       'ptype': partition_type,
+                                       'type': 'other',
+                                       'uuid': partition_uuid}]}]
+            assert expect == ceph_disk.list_devices(args)
+        #
+        # not swap, mounted, ext4 fs type, with uuid
+        #
+        partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        partition_type = "e51adfb9-e9fd-4718-9fc1-7a0cb03ea3f4"
+        disk = "Xda"
+        partition = "Xda1"
+        mount_path = '/mount/path'
+        fs_type = 'ext4'
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [partition] },
+                get_dev_fs=lambda dev: fs_type,
+                is_mounted=lambda dev: mount_path,
+                get_partition_uuid=lambda dev: partition_uuid,
+                get_partition_type=lambda dev: partition_type,
+                is_partition=lambda dev: True,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'partitions': [{'dmcrypt': {},
+                                       'is_partition': True,
+                                       'mount': mount_path,
+                                       'fs_type': fs_type,
+                                       'path': '/dev/' + partition,
+                                       'ptype': partition_type,
+                                       'type': 'other',
+                                       'uuid': partition_uuid,
+                                   }]}]
+            assert expect == ceph_disk.list_devices(args)
+
+        #
+        # swap, with uuid
+        #
+        partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        partition_type = "e51adfb9-e9fd-4718-9fc1-7a0cb03ea3f4"
+        disk = "Xda"
+        partition = "Xda1"
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [partition] },
+                is_swap=lambda dev: True,
+                get_partition_uuid=lambda dev: partition_uuid,
+                get_partition_type=lambda dev: partition_type,
+                is_partition=lambda dev: True,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'partitions': [{'dmcrypt': {},
+                                       'is_partition': True,
+                                       'path': '/dev/' + partition,
+                                       'ptype': partition_type,
+                                       'type': 'swap',
+                                       'uuid': partition_uuid}]}]
+            assert expect == ceph_disk.list_devices(args)
+
+        #
+        # whole disk
+        #
+        partition_uuid = "56244cf5-83ef-4984-888a-2d8b8e0e04b2"
+        disk = "Xda"
+        partition = "Xda1"
+        with patch.multiple(
+                ceph_disk,
+                list_all_partitions=lambda names: { disk: [] },
+                is_partition=lambda dev: False,
+                ):
+            expect = [{'path': '/dev/' + disk,
+                       'dmcrypt': {},
+                       'is_partition': False,
+                       'ptype': 'unknown',
+                       'type': 'other'}]
+            assert expect == ceph_disk.list_devices(args)
diff --git a/src/test/python/ceph-disk/tox.ini b/src/test/python/ceph-disk/tox.ini
new file mode 100644
index 0000000..194c0fc
--- /dev/null
+++ b/src/test/python/ceph-disk/tox.ini
@@ -0,0 +1,19 @@
+[tox]
+envlist = py27, flake8
+skipsdist=True
+
+[testenv]
+deps=
+  pytest
+  mock
+  pytest-cov==1.6
+  coverage==3.7.1
+
+commands=
+  python setup.py develop
+  py.test -vv --cov=ceph_disk.py --cov-report=term-missing
+
+[testenv:flake8]
+deps=
+  flake8
+commands=flake8 --select=F,E9 ceph_disk.py
diff --git a/src/test/rgw/test_rgw_obj.cc b/src/test/rgw/test_rgw_obj.cc
new file mode 100644
index 0000000..18696a6
--- /dev/null
+++ b/src/test/rgw/test_rgw_obj.cc
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing at enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <iostream>
+#include "global/global_init.h"
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
+#include "rgw/rgw_common.h"
+#define GTEST
+#ifdef GTEST
+#include <gtest/gtest.h>
+#else
+#define TEST(x, y) void y()
+#define ASSERT_EQ(v, s) if(v != s)cout << "Error at " << __LINE__ << "(" << #v << "!= " << #s << "\n"; \
+                                else cout << "(" << #v << "==" << #s << ") PASSED\n";
+#define EXPECT_EQ(v, s) ASSERT_EQ(v, s)
+#define ASSERT_TRUE(c) if(c)cout << "Error at " << __LINE__ << "(" << #c << ")" << "\n"; \
+                          else cout << "(" << #c << ") PASSED\n";
+#define EXPECT_TRUE(c) ASSERT_TRUE(c) 
+#endif
+using namespace std;
+
+static void init_bucket(rgw_bucket *bucket, const char *name)
+{
+  *bucket = rgw_bucket(name, ".data-pool", ".index-pool", "marker", "bucket-id", NULL);
+}
+
+void check_parsed_correctly(rgw_obj& obj, const string& name, const string& ns, const string& instance)
+{
+  /* parse_raw_oid() */
+  string parsed_name, parsed_ns, parsed_instance;
+  ASSERT_EQ(true, rgw_obj::parse_raw_oid(obj.get_object(), &parsed_name, &parsed_instance, &parsed_ns));
+
+  cout << "parsed: " << parsed_name << " ns=" << parsed_ns << " i=" << parsed_instance << std::endl;
+
+  ASSERT_EQ(name, parsed_name);
+  ASSERT_EQ(ns, parsed_ns);
+  ASSERT_EQ(instance, parsed_instance);
+
+  /* translate_raw_obj_to_obj_in_ns() */
+  string tname = obj.get_object();
+  string tns = ns + "foo";
+  string tinstance;
+  ASSERT_EQ(0, rgw_obj::translate_raw_obj_to_obj_in_ns(tname, tinstance, tns));
+  ASSERT_EQ(name, tname);
+  ASSERT_EQ(instance, tinstance);
+
+  tname = obj.get_object();
+  tns = ns;
+  ASSERT_EQ(true, rgw_obj::translate_raw_obj_to_obj_in_ns(tname, tinstance, tns));
+
+  cout << "parsed: " << parsed_name << " ns=" << parsed_ns << " i=" << parsed_instance << std::endl;
+
+  ASSERT_EQ(name, tname);
+  ASSERT_EQ(instance, tinstance);
+
+  /* strip_namespace_from_object() */
+
+  string strip_name = obj.get_object();
+  string strip_ns, strip_instance;
+
+  ASSERT_EQ(true, rgw_obj::strip_namespace_from_object(strip_name, strip_ns, strip_instance));
+
+  cout << "stripped: " << strip_name << " ns=" << strip_ns << " i=" << strip_instance << std::endl;
+
+  ASSERT_EQ(name, strip_name);
+  ASSERT_EQ(ns, strip_ns);
+  ASSERT_EQ(instance, strip_instance);
+}
+
+void test_obj(const string& name, const string& ns, const string& instance)
+{
+  rgw_bucket b;
+  init_bucket(&b, "test");
+
+  JSONFormatter *formatter = new JSONFormatter(true);
+
+  formatter->open_object_section("test");
+  rgw_obj o(b, name);
+  rgw_obj obj1(o);
+
+  if (!instance.empty()) {
+    obj1.set_instance(instance);
+  }
+  if (!ns.empty()) {
+    obj1.set_ns(ns);
+  }
+  
+  check_parsed_correctly(obj1, name, ns, instance);
+  encode_json("obj1", obj1, formatter);
+
+  bufferlist bl;
+  ::encode(obj1, bl);
+
+  rgw_obj obj2;
+  ::decode(obj2, bl);
+  check_parsed_correctly(obj2, name, ns, instance);
+
+  encode_json("obj2", obj2, formatter);
+
+  rgw_obj obj3(o);
+  bufferlist bl3;
+  ::encode(obj3, bl3);
+  ::decode(obj3, bl3);
+  encode_json("obj3", obj3, formatter);
+
+  if (!instance.empty()) {
+    obj3.set_instance(instance);
+  }
+  if (!ns.empty()) {
+    obj3.set_ns(ns);
+  }
+  check_parsed_correctly(obj3, name, ns, instance);
+
+  encode_json("obj3-2", obj3, formatter);
+
+  formatter->close_section();
+
+  formatter->flush(cout);
+
+  ASSERT_EQ(obj1, obj2);
+  ASSERT_EQ(obj1, obj3);
+
+
+  /* rgw_obj_key conversion */
+  rgw_obj_key k;
+  obj1.get_index_key(&k);
+
+  rgw_obj new_obj(b, k);
+
+  ASSERT_EQ(obj1, new_obj);
+
+  delete formatter;
+}
+
+TEST(TestRGWObj, underscore) {
+  test_obj("_obj", "", "");
+  test_obj("_obj", "ns", "");
+  test_obj("_obj", "", "v1");
+  test_obj("_obj", "ns", "v1");
+}
+
+TEST(TestRGWObj, no_underscore) {
+  test_obj("obj", "", "");
+  test_obj("obj", "ns", "");
+  test_obj("obj", "", "v1");
+  test_obj("obj", "ns", "v1");
+}
+
diff --git a/src/test/run-cli-tests b/src/test/run-cli-tests
index e29ad20..48fc90e 100755
--- a/src/test/run-cli-tests
+++ b/src/test/run-cli-tests
@@ -30,7 +30,7 @@ if [ ! -e "$CRAM_BIN" ]; then
     # patched cram to support that. See upstream ticket at
     # https://bitbucket.org/brodie/cram/issue/9/allow-read-only-directories-for-t
     # -- tv at inktank.com
-    virtualenv "$VENV" && $VENV/bin/pip install "$SRCDIR/downloads/cram-0.5.0ceph.2011-01-14.tar.gz"
+    virtualenv "$VENV" && $VENV/bin/pip --log "$VENV"/log.txt install "$SRCDIR/downloads/cram-0.5.0ceph.2011-01-14.tar.gz"
 fi
 
 SRCDIR_ABS="$(readlink -f "$SRCDIR")"
diff --git a/src/test/run-rbd-unit-tests.sh b/src/test/run-rbd-unit-tests.sh
new file mode 100755
index 0000000..09edb41
--- /dev/null
+++ b/src/test/run-rbd-unit-tests.sh
@@ -0,0 +1,15 @@
+#!/bin/bash -ex
+
+# this should be run from the src directory in the ceph.git
+
+CEPH_SRC=$(pwd)
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CEPH_SRC/.libs"
+PATH="$CEPH_SRC:$PATH"
+
+unittest_librbd
+for i in 0 1 5 29 45
+do
+    RBD_FEATURES=$i unittest_librbd
+done
+
+echo OK
diff --git a/src/test/streamtest.cc b/src/test/streamtest.cc
index d4ccadf..e19dbce 100644
--- a/src/test/streamtest.cc
+++ b/src/test/streamtest.cc
@@ -143,9 +143,10 @@ int main(int argc, const char **argv)
     return -1;
   }
 
+  ObjectStore::Sequencer osr(__func__);
   ObjectStore::Transaction ft;
-  ft.create_collection(coll_t());
-  fs->apply_transaction(ft);
+  ft.create_collection(coll_t(), 0);
+  fs->apply_transaction(&osr, ft);
 
   utime_t now = ceph_clock_now(g_ceph_context);
   utime_t start = now;
@@ -159,7 +160,7 @@ int main(int argc, const char **argv)
 
     set_start(pos, ceph_clock_now(g_ceph_context));
     ObjectStore::Transaction *t = new ObjectStore::Transaction;
-    t->write(coll_t(), hobject_t(poid), pos, bytes, bl);
+    t->write(coll_t(), ghobject_t(hobject_t(poid)), pos, bytes, bl);
     fs->queue_transaction(NULL, t, new C_Ack(pos), new C_Commit(pos));
     pos += bytes;
 
diff --git a/src/test/system/systest_runnable.cc b/src/test/system/systest_runnable.cc
index 55cab0a..c7ed6aa 100644
--- a/src/test/system/systest_runnable.cc
+++ b/src/test/system/systest_runnable.cc
@@ -58,7 +58,6 @@ SysTestRunnable(int argc, const char **argv)
   m_started = false;
   m_id = m_highest_id.inc();
   memset(&m_pthread, 0, sizeof(m_pthread));
-  m_pid = 0;
   update_id_str(false);
   set_argv(argc, argv);
 }
@@ -81,33 +80,29 @@ start()
   if (m_started) {
     return -EDOM;
   }
+  int ret;
   bool use_threads = SysTestSettings::inst().use_threads();
   if (use_threads) {
-    int ret = pthread_create(&m_pthread, NULL, systest_runnable_pthread_helper,
+    ret = pthread_create(&m_pthread, NULL, systest_runnable_pthread_helper,
 			     static_cast<void*>(this));
     if (ret)
       return ret;
     m_started = true;
-    return 0;
-  }
-  else {
-    pid_t pid = fork();
-    if (pid == -1) {
-      int err = errno;
-      return -err;
-    }
-    else if (pid == 0) {
+  } else {
+    std::string err_msg;
+    ret = preforker.prefork(err_msg);
+    if (ret < 0)
+      preforker.exit(ret);
+
+    if (preforker.is_child()) {
       m_started = true;
-      m_pid = getpid();
       void *retptr = systest_runnable_pthread_helper(static_cast<void*>(this));
-      exit((int)(uintptr_t)retptr);
-    }
-    else {
+      preforker.exit((int)(uintptr_t)retptr);
+    } else {
       m_started = true;
-      m_pid = pid;
-      return 0;
     }
   }
+  return 0;
 }
 
 std::string SysTestRunnable::
@@ -116,10 +111,11 @@ join()
   if (!m_started) {
     return "SysTestRunnable was never started.";
   }
+  int ret;
   bool use_threads = SysTestSettings::inst().use_threads();
   if (use_threads) {
     void *ptrretval;
-    int ret = pthread_join(m_pthread, &ptrretval);
+    ret = pthread_join(m_pthread, &ptrretval);
     if (ret) {
       ostringstream oss;
       oss << "pthread_join failed with error " << ret;
@@ -132,36 +128,10 @@ join()
       return oss.str();
     }
     return "";
-  }
-  else {
-    int status;
-    printf("waitpid(%d)\n", m_pid);
-    pid_t pid = waitpid(m_pid, &status, 0);
-    if (pid == -1) {
-      int err = errno;
-      ostringstream oss;
-      oss << get_id_str() << " waitpid error: " << cpp_strerror(err);
-      return oss.str();
-    }
-    else if (WIFSIGNALED(status)) {
-      ostringstream oss;
-      oss << get_id_str() << " exited with a signal";
-      return oss.str();
-    }
-    else if (!WIFEXITED(status)) {
-      ostringstream oss;
-      oss << get_id_str() << " did not exit normally";
-      return oss.str();
-    }
-    else {
-      int exit_status = WEXITSTATUS(status);
-      if (exit_status != 0) {
-	ostringstream oss;
-	oss << get_id_str() << " returned exit_status " << exit_status;
-	return oss.str();
-      }
-      return "";
-    }
+  } else {
+    std::string err_msg;
+    ret = preforker.parent_wait(err_msg);
+    return err_msg;
   }
 }
 
diff --git a/src/test/system/systest_runnable.h b/src/test/system/systest_runnable.h
index 8fb59f4..bd7d258 100644
--- a/src/test/system/systest_runnable.h
+++ b/src/test/system/systest_runnable.h
@@ -20,6 +20,8 @@
 #include <string>
 #include <vector>
 
+#include "common/Preforker.h"
+
 #define RETURN1_IF_NOT_VAL(expected, expr) \
   do {\
     int _rinv_ret = expr;\
@@ -70,18 +72,18 @@ protected:
   const char **m_argv;
 
 private:
-  SysTestRunnable(const SysTestRunnable &rhs);
+  explicit SysTestRunnable(const SysTestRunnable &rhs);
   SysTestRunnable& operator=(const SysTestRunnable &rhs);
   void update_id_str(bool started);
   void set_argv(int argc, const char **argv);
 
   friend void* systest_runnable_pthread_helper(void *arg);
 
+  Preforker preforker;
   const char **m_argv_orig;
   bool m_started;
   int m_id;
   pthread_t m_pthread;
-  int m_pid;
   char m_id_str[ID_STR_SZ];
 };
 
diff --git a/src/test/test-ceph-helpers.sh b/src/test/test-ceph-helpers.sh
index d22c050..20c44ff 100755
--- a/src/test/test-ceph-helpers.sh
+++ b/src/test/test-ceph-helpers.sh
@@ -17,4 +17,4 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Library Public License for more details.
 #
-test/ceph-helpers.sh TESTS
+../qa/workunits/ceph-helpers.sh TESTS
diff --git a/src/test/test_arch.cc b/src/test/test_arch.cc
index b129262..e2c225b 100644
--- a/src/test/test_arch.cc
+++ b/src/test/test_arch.cc
@@ -47,9 +47,20 @@ TEST(Arch, all)
 
   int expected;
 
+#if (__arm__ || __aarch64__)
+
   expected = (strstr(flags, " neon ") || strstr(flags, " asimd ")) ? 1 : 0;
   EXPECT_EQ(expected, ceph_arch_neon);
 
+#endif
+#if (__aarch64__)
+
+  expected = strstr(flags, " crc32 ") ? 1 : 0;
+  EXPECT_EQ(expected, ceph_arch_aarch64_crc32);
+
+#endif
+#if (__x86_64__)
+
   expected = strstr(flags, " pclmulqdq ") ? 1 : 0;
   EXPECT_EQ(expected, ceph_arch_intel_pclmul);
 
@@ -67,6 +78,9 @@ TEST(Arch, all)
 
   expected = strstr(flags, " sse2 ") ? 1 : 0;
   EXPECT_EQ(expected, ceph_arch_intel_sse2);
+
+#endif
+
 #endif
 }
 
diff --git a/src/test/test_cors.cc b/src/test/test_cors.cc
index 4d385bc..372988f 100644
--- a/src/test/test_cors.cc
+++ b/src/test/test_cors.cc
@@ -11,7 +11,7 @@ extern "C"{
 #define S3_BUCKET_NAME "s3testgw.fcgi"
 #define SWIFT_BUCKET_NAME "swift3testgw.fcgi"
 #define BUCKET_URL \
-  ((g_test->get_key_type() == KEY_TYPE_S3)?(string("/"S3_BUCKET_NAME)):(string("/swift/v1/"SWIFT_BUCKET_NAME)))
+  ((g_test->get_key_type() == KEY_TYPE_S3)?(string("/" S3_BUCKET_NAME)):(string("/swift/v1/" SWIFT_BUCKET_NAME)))
 #define GTEST
 #ifdef GTEST
 #include <gtest/gtest.h>
@@ -66,7 +66,7 @@ class test_cors_helper {
     unsigned resp_code;
     key_type kt;
   public:
-    test_cors_helper() : resp_data(NULL), kt(KEY_TYPE_UNDEFINED){
+    test_cors_helper() : curl_inst(NULL), resp_data(NULL), resp_code(0), kt(KEY_TYPE_UNDEFINED){
       curl_global_init(CURL_GLOBAL_ALL);
     }
     ~test_cors_helper(){
@@ -285,13 +285,13 @@ Finisher *finisher;
 
 static int create_bucket(void){
   if(g_test->get_key_type() == KEY_TYPE_S3){
-    g_test->send_request(string("PUT"), string("/"S3_BUCKET_NAME));
+    g_test->send_request(string("PUT"), string("/" S3_BUCKET_NAME));
     if(g_test->get_resp_code() != 200U){
       cout << "Error creating bucket, http code " << g_test->get_resp_code();
       return -1;
     }
   }else if(g_test->get_key_type() == KEY_TYPE_SWIFT){
-    g_test->send_request(string("PUT"), string("/swift/v1/"SWIFT_BUCKET_NAME));
+    g_test->send_request(string("PUT"), string("/swift/v1/" SWIFT_BUCKET_NAME));
     if(g_test->get_resp_code() != 201U){
       cout << "Error creating bucket, http code " << g_test->get_resp_code();
       return -1;
@@ -302,13 +302,13 @@ static int create_bucket(void){
 
 static int delete_bucket(void){
   if(g_test->get_key_type() == KEY_TYPE_S3){
-    g_test->send_request(string("DELETE"), string("/"S3_BUCKET_NAME));
+    g_test->send_request(string("DELETE"), string("/" S3_BUCKET_NAME));
     if(g_test->get_resp_code() != 204U){
       cout << "Error deleting bucket, http code " << g_test->get_resp_code();
       return -1;
     }
   }else if(g_test->get_key_type() == KEY_TYPE_SWIFT){
-    g_test->send_request(string("DELETE"), string("/swift/v1/"SWIFT_BUCKET_NAME));
+    g_test->send_request(string("DELETE"), string("/swift/v1/" SWIFT_BUCKET_NAME));
     if(g_test->get_resp_code() != 204U){
       cout << "Error deleting bucket, http code " << g_test->get_resp_code();
       return -1;
@@ -357,7 +357,7 @@ void send_cors(set<string> o, set<string> h,
     s3 = static_cast<RGWCORSConfiguration_S3 *>(&config);
     s3->to_xml(ss);
 
-    g_test->send_request(string("PUT"), string("/"S3_BUCKET_NAME"?cors"), cors_read_xml, 
+    g_test->send_request(string("PUT"), string("/" S3_BUCKET_NAME "?cors"), cors_read_xml, 
                          (void *)&ss, ss.str().length());
   }else if(g_test->get_key_type() == KEY_TYPE_SWIFT){
     set<string>::iterator it;
@@ -392,7 +392,7 @@ void send_cors(set<string> o, set<string> h,
     //const char *data = "1";
     stringstream ss;
     ss << "1";
-    g_test->send_request(string("POST"), string("/swift/v1/"SWIFT_BUCKET_NAME), cors_read_xml, 
+    g_test->send_request(string("POST"), string("/swift/v1/" SWIFT_BUCKET_NAME), cors_read_xml, 
                          (void *)&ss, 1);
   }
 }
@@ -400,7 +400,7 @@ void send_cors(set<string> o, set<string> h,
 TEST(TestCORS, getcors_firsttime){
   if(g_test->get_key_type() == KEY_TYPE_SWIFT)return;
   ASSERT_EQ(0, create_bucket());
-  g_test->send_request(string("GET"), string("/"S3_BUCKET_NAME"?cors"));
+  g_test->send_request(string("GET"), string("/" S3_BUCKET_NAME "?cors"));
   EXPECT_EQ(404U, g_test->get_resp_code());
   ASSERT_EQ(0, delete_bucket());
 }
@@ -418,7 +418,7 @@ TEST(TestCORS, putcors_firsttime){
 
   /*Now get the CORS and check if its fine*/
   if(g_test->get_key_type() == KEY_TYPE_S3){
-    g_test->send_request(string("GET"), string("/"S3_BUCKET_NAME"?cors"));
+    g_test->send_request(string("GET"), string("/" S3_BUCKET_NAME "?cors"));
     EXPECT_EQ(200U, g_test->get_resp_code());
 
     RGWCORSRule *r = xml_to_cors_rule(string("example.com"));
@@ -854,7 +854,7 @@ TEST(TestCORS, optionscors_test_options_7){
 TEST(TestCORS, deletecors_firsttime){
   if(g_test->get_key_type() == KEY_TYPE_SWIFT)return;
   ASSERT_EQ(0, create_bucket());
-  g_test->send_request("DELETE", "/"S3_BUCKET_NAME"?cors");
+  g_test->send_request("DELETE", "/" S3_BUCKET_NAME "?cors");
   EXPECT_EQ(204U, g_test->get_resp_code());
   ASSERT_EQ(0, delete_bucket());
 }
@@ -870,11 +870,11 @@ TEST(TestCORS, deletecors_test){
   send_cors(origins, h, e, flags, CORS_MAX_AGE_INVALID);
   EXPECT_EQ(((g_test->get_key_type() == KEY_TYPE_SWIFT)?202U:200U), g_test->get_resp_code());
 
-  g_test->send_request("GET", "/"S3_BUCKET_NAME"?cors");
+  g_test->send_request("GET", "/" S3_BUCKET_NAME "?cors");
   EXPECT_EQ(200U, g_test->get_resp_code());
-  g_test->send_request("DELETE", "/"S3_BUCKET_NAME"?cors");
+  g_test->send_request("DELETE", "/" S3_BUCKET_NAME "?cors");
   EXPECT_EQ(204U, g_test->get_resp_code());
-  g_test->send_request("GET", "/"S3_BUCKET_NAME"?cors");
+  g_test->send_request("GET", "/" S3_BUCKET_NAME "?cors");
   EXPECT_EQ(404U, g_test->get_resp_code());
   ASSERT_EQ(0, delete_bucket());
 }
diff --git a/src/test/test_filejournal.cc b/src/test/test_filejournal.cc
index befe761..aaf6487 100644
--- a/src/test/test_filejournal.cc
+++ b/src/test/test_filejournal.cc
@@ -17,8 +17,14 @@ Finisher *finisher;
 Cond sync_cond;
 char path[200];
 uuid_d fsid;
-bool directio = false;
-bool aio = false;
+struct test_info {
+    bool directio, aio, faio;
+    const char *description;
+} subtests[3] = {
+    { false, false, false, "DIRECTIO OFF  AIO OFF" },
+    { true, false, false, "DIRECTIO ON  AIO OFF" },
+    { true, true, true, "DIRECTIO ON  AIO ON"}
+};
 
 // ----
 Cond cond;
@@ -95,21 +101,7 @@ int main(int argc, char **argv) {
 
   finisher->start();
 
-  cout << "DIRECTIO OFF  AIO OFF" << std::endl;
-  directio = false;
-  aio = false;
   int r = RUN_ALL_TESTS();
-  if (r >= 0) {
-    cout << "DIRECTIO ON  AIO OFF" << std::endl;
-    directio = true;
-    r = RUN_ALL_TESTS();
-
-    if (r >= 0) {
-      cout << "DIRECTIO ON  AIO ON" << std::endl;
-      aio = true;
-      r = RUN_ALL_TESTS();
-    }
-  }
   
   finisher->stop();
 
@@ -119,290 +111,366 @@ int main(int argc, char **argv) {
 }
 
 TEST(TestFileJournal, Create) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+  }
 }
 
 TEST(TestFileJournal, WriteSmall) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
 
-  bufferlist bl;
-  bl.append("small");
-  j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
-  wait();
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
 
-  j.close();
+    bufferlist bl;
+    bl.append("small");
+    j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    wait();
+
+    j.close();
+  }
 }
 
 TEST(TestFileJournal, WriteBig) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
 
-  bufferlist bl;
-  while (bl.length() < size_mb*1000/2) {
-    char foo[1024*1024];
-    memset(foo, 1, sizeof(foo));
-    bl.append(foo, sizeof(foo));
-  }
-  j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
-  wait();
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    bufferlist bl;
+    while (bl.length() < size_mb*1000/2) {
+      char foo[1024*1024];
+      memset(foo, 1, sizeof(foo));
+      bl.append(foo, sizeof(foo));
+    }
+    j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    wait();
 
-  j.close();
+    j.close();
+  }
 }
 
 TEST(TestFileJournal, WriteMany) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
 
-  C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-  
-  bufferlist bl;
-  bl.append("small");
-  uint64_t seq = 1;
-  for (int i=0; i<100; i++) {
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+    bufferlist bl;
     bl.append("small");
-    j.submit_entry(seq++, bl, 0, gb.new_sub());
-  }
+    uint64_t seq = 1;
+    for (int i=0; i<100; i++) {
+      bl.append("small");
+      j.submit_entry(seq++, bl, 0, gb.new_sub());
+    }
 
-  gb.activate();
+    gb.activate();
 
-  wait();
+    wait();
 
-  j.close();
+    j.close();
+  }
 }
 
 TEST(TestFileJournal, WriteManyVecs) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
-
-  C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-
-  bufferlist first;
-  first.append("small");
-  j.submit_entry(1, first, 0, gb.new_sub());
-
-  bufferlist bl;
-  for (int i=0; i<IOV_MAX * 2; i++) {
-    bufferptr bp = buffer::create_page_aligned(4096);
-    memset(bp.c_str(), (char)i, 4096);
-    bl.append(bp);
-  }
-  bufferlist origbl = bl;
-  j.submit_entry(2, bl, 0, gb.new_sub());
-  gb.activate();
-  wait();
-
-  j.close();
-
-  j.open(1);
-  bufferlist inbl;
-  string v;
-  uint64_t seq = 0;
-  ASSERT_EQ(true, j.read_entry(inbl, seq));
-  ASSERT_EQ(seq, 2ull);
-  ASSERT_TRUE(inbl.contents_equal(origbl));
-  j.make_writeable();
-  j.close();
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
 
+    bufferlist first;
+    first.append("small");
+    j.submit_entry(1, first, 0, gb.new_sub());
+
+    bufferlist bl;
+    for (int i=0; i<IOV_MAX * 2; i++) {
+      bufferptr bp = buffer::create_page_aligned(4096);
+      memset(bp.c_str(), (char)i, 4096);
+      bl.append(bp);
+    }
+    bufferlist origbl = bl;
+    j.submit_entry(2, bl, 0, gb.new_sub());
+    gb.activate();
+    wait();
+
+    j.close();
+
+    j.open(1);
+    bufferlist inbl;
+    string v;
+    uint64_t seq = 0;
+    ASSERT_EQ(true, j.read_entry(inbl, seq));
+    ASSERT_EQ(seq, 2ull);
+    ASSERT_TRUE(inbl.contents_equal(origbl));
+    j.make_writeable();
+    j.close();
+
+  }
 }
 
 TEST(TestFileJournal, ReplaySmall) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
-  
-  C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-  
-  bufferlist bl;
-  bl.append("small");
-  j.submit_entry(1, bl, 0, gb.new_sub());
-  bl.append("small");
-  j.submit_entry(2, bl, 0, gb.new_sub());
-  bl.append("small");
-  j.submit_entry(3, bl, 0, gb.new_sub());
-  gb.activate();
-  wait();
-
-  j.close();
-
-  j.open(1);
-
-  bufferlist inbl;
-  string v;
-  uint64_t seq = 0;
-  ASSERT_EQ(true, j.read_entry(inbl, seq));
-  ASSERT_EQ(seq, 2ull);
-  inbl.copy(0, inbl.length(), v);
-  ASSERT_EQ("small", v);
-  inbl.clear();
-  v.clear();
-
-  ASSERT_EQ(true, j.read_entry(inbl, seq));
-  ASSERT_EQ(seq, 3ull);
-  inbl.copy(0, inbl.length(), v);
-  ASSERT_EQ("small", v);
-  inbl.clear();
-  v.clear();
-
-  ASSERT_TRUE(!j.read_entry(inbl, seq));
-
-  j.make_writeable();
-  j.close();
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+    bufferlist bl;
+    bl.append("small");
+    j.submit_entry(1, bl, 0, gb.new_sub());
+    bl.append("small");
+    j.submit_entry(2, bl, 0, gb.new_sub());
+    bl.append("small");
+    j.submit_entry(3, bl, 0, gb.new_sub());
+    gb.activate();
+    wait();
+
+    j.close();
+
+    j.open(1);
+
+    bufferlist inbl;
+    string v;
+    uint64_t seq = 0;
+    ASSERT_EQ(true, j.read_entry(inbl, seq));
+    ASSERT_EQ(seq, 2ull);
+    inbl.copy(0, inbl.length(), v);
+    ASSERT_EQ("small", v);
+    inbl.clear();
+    v.clear();
+
+    ASSERT_EQ(true, j.read_entry(inbl, seq));
+    ASSERT_EQ(seq, 3ull);
+    inbl.copy(0, inbl.length(), v);
+    ASSERT_EQ("small", v);
+    inbl.clear();
+    v.clear();
+
+    ASSERT_TRUE(!j.read_entry(inbl, seq));
+
+    j.make_writeable();
+    j.close();
+  }
 }
 
 TEST(TestFileJournal, ReplayCorrupt) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
-  
-  C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
-  
-  const char *needle =    "i am a needle";
-  const char *newneedle = "in a haystack";
-  bufferlist bl;
-  bl.append(needle);
-  j.submit_entry(1, bl, 0, gb.new_sub());
-  bl.append(needle);
-  j.submit_entry(2, bl, 0, gb.new_sub());
-  bl.append(needle);
-  j.submit_entry(3, bl, 0, gb.new_sub());
-  bl.append(needle);
-  j.submit_entry(4, bl, 0, gb.new_sub());
-  gb.activate();
-  wait();
-
-  j.close();
-
-  cout << "corrupting journal" << std::endl;
-  char buf[1024*128];
-  int fd = open(path, O_RDONLY);
-  ASSERT_GE(fd, 0);
-  int r = safe_read_exact(fd, buf, sizeof(buf));
-  ASSERT_EQ(0, r);
-  int n = 0;
-  for (unsigned o=0; o < sizeof(buf) - strlen(needle); o++) {
-    if (memcmp(buf+o, needle, strlen(needle)) == 0) {
-      if (n >= 2) {
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "true");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+    const char *needle =    "i am a needle";
+    const char *newneedle = "in a haystack";
+    bufferlist bl;
+    bl.append(needle);
+    j.submit_entry(1, bl, 0, gb.new_sub());
+    bl.append(needle);
+    j.submit_entry(2, bl, 0, gb.new_sub());
+    bl.append(needle);
+    j.submit_entry(3, bl, 0, gb.new_sub());
+    bl.append(needle);
+    j.submit_entry(4, bl, 0, gb.new_sub());
+    gb.activate();
+    wait();
+
+    j.close();
+
+    cout << "corrupting journal" << std::endl;
+    char buf[1024*128];
+    int fd = open(path, O_RDONLY);
+    ASSERT_GE(fd, 0);
+    int r = safe_read_exact(fd, buf, sizeof(buf));
+    ASSERT_EQ(0, r);
+    int n = 0;
+    for (unsigned o=0; o < sizeof(buf) - strlen(needle); o++) {
+      if (memcmp(buf+o, needle, strlen(needle)) == 0) {
+        if (n >= 2) {
 	cout << "replacing at offset " << o << std::endl;
 	memcpy(buf+o, newneedle, strlen(newneedle));
-      } else {
+        } else {
 	cout << "leaving at offset " << o << std::endl;
+        }
+        n++;
       }
-      n++;
     }
+    ASSERT_EQ(n, 4);
+    close(fd);
+    fd = open(path, O_WRONLY);
+    ASSERT_GE(fd, 0);
+    r = safe_write(fd, buf, sizeof(buf));
+    ASSERT_EQ(r, 0);
+    close(fd);
+
+    j.open(1);
+
+    bufferlist inbl;
+    string v;
+    uint64_t seq = 0;
+    ASSERT_EQ(true, j.read_entry(inbl, seq));
+    ASSERT_EQ(seq, 2ull);
+    inbl.copy(0, inbl.length(), v);
+    ASSERT_EQ(needle, v);
+    inbl.clear();
+    v.clear();
+    bool corrupt;
+    ASSERT_FALSE(j.read_entry(inbl, seq, &corrupt));
+    ASSERT_TRUE(corrupt);
+
+    j.make_writeable();
+    j.close();
   }
-  ASSERT_EQ(n, 4);
-  close(fd);
-  fd = open(path, O_WRONLY);
-  ASSERT_GE(fd, 0);
-  r = safe_write(fd, buf, sizeof(buf));
-  ASSERT_EQ(r, 0);
-  close(fd);
-
-  j.open(1);
-
-  bufferlist inbl;
-  string v;
-  uint64_t seq = 0;
-  ASSERT_EQ(true, j.read_entry(inbl, seq));
-  ASSERT_EQ(seq, 2ull);
-  inbl.copy(0, inbl.length(), v);
-  ASSERT_EQ(needle, v);
-  inbl.clear();
-  v.clear();
-  ASSERT_TRUE(!j.read_entry(inbl, seq));
-
-  j.make_writeable();
-  j.close();
 }
 
 TEST(TestFileJournal, WriteTrim) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
 
-  list<C_Sync*> ls;
-  
-  bufferlist bl;
-  char foo[1024*1024];
-  memset(foo, 1, sizeof(foo));
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
 
-  uint64_t seq = 1, committed = 0;
+    list<C_Sync*> ls;
 
-  for (unsigned i=0; i<size_mb*2; i++) {
-    bl.clear();
-    bl.push_back(buffer::copy(foo, sizeof(foo)));
-    bl.zero();
-    ls.push_back(new C_Sync);
-    j.submit_entry(seq++, bl, 0, ls.back()->c);
+    bufferlist bl;
+    char foo[1024*1024];
+    memset(foo, 1, sizeof(foo));
+
+    uint64_t seq = 1, committed = 0;
+
+    for (unsigned i=0; i<size_mb*2; i++) {
+      bl.clear();
+      bl.push_back(buffer::copy(foo, sizeof(foo)));
+      bl.zero();
+      ls.push_back(new C_Sync);
+      j.submit_entry(seq++, bl, 0, ls.back()->c);
 
-    while (ls.size() > size_mb/2) {
+      while (ls.size() > size_mb/2) {
+        delete ls.front();
+        ls.pop_front();
+        committed++;
+        j.committed_thru(committed);
+      }
+    }
+
+    while (ls.size()) {
       delete ls.front();
       ls.pop_front();
-      committed++;
-      j.committed_thru(committed);
+      j.committed_thru(++committed);
     }
-  }
 
-  while (ls.size()) {
-    delete ls.front();
-    ls.pop_front();
-    j.committed_thru(committed);
-  }
+    ASSERT_TRUE(j.journalq_empty());
 
-  j.close();
+    j.close();
+  }
 }
 
 TEST(TestFileJournal, WriteTrimSmall) {
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
+  g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
+  g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
+  g_ceph_context->_conf->apply_changes(NULL);
 
-  list<C_Sync*> ls;
-  
-  bufferlist bl;
-  char foo[1024*1024];
-  memset(foo, 1, sizeof(foo));
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
 
-  uint64_t seq = 1, committed = 0;
+    list<C_Sync*> ls;
 
-  for (unsigned i=0; i<size_mb*2; i++) {
-    bl.clear();
-    for (int k=0; k<128; k++)
-      bl.push_back(buffer::copy(foo, sizeof(foo) / 128));
-    bl.zero();
-    ls.push_back(new C_Sync);
-    j.submit_entry(seq++, bl, 0, ls.back()->c);
+    bufferlist bl;
+    char foo[1024*1024];
+    memset(foo, 1, sizeof(foo));
 
-    while (ls.size() > size_mb/2) {
+    uint64_t seq = 1, committed = 0;
+
+    for (unsigned i=0; i<size_mb*2; i++) {
+      bl.clear();
+      for (int k=0; k<128; k++)
+        bl.push_back(buffer::copy(foo, sizeof(foo) / 128));
+      bl.zero();
+      ls.push_back(new C_Sync);
+      j.submit_entry(seq++, bl, 0, ls.back()->c);
+
+      while (ls.size() > size_mb/2) {
+        delete ls.front();
+        ls.pop_front();
+        committed++;
+        j.committed_thru(committed);
+      }
+    }
+
+    while (ls.size()) {
       delete ls.front();
       ls.pop_front();
-      committed++;
       j.committed_thru(committed);
     }
-  }
 
-  while (ls.size()) {
-    delete ls.front();
-    ls.pop_front();
-    j.committed_thru(committed);
+    j.close();
   }
-
-  j.close();
 }
 
 TEST(TestFileJournal, ReplayDetectCorruptFooterMagic) {
@@ -410,49 +478,53 @@ TEST(TestFileJournal, ReplayDetectCorruptFooterMagic) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
   g_ceph_context->_conf->apply_changes(NULL);
 
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
-
-  C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+    const char *needle =    "i am a needle";
+    for (unsigned i = 1; i <= 4; ++i) {
+      bufferlist bl;
+      bl.append(needle);
+      j.submit_entry(i, bl, 0, gb.new_sub());
+    }
+    gb.activate();
+    wait();
 
-  const char *needle =    "i am a needle";
-  for (unsigned i = 1; i <= 4; ++i) {
     bufferlist bl;
-    bl.append(needle);
-    j.submit_entry(i, bl, 0, gb.new_sub());
+    bl.append("needle");
+    j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    wait();
+
+    j.close();
+    int fd = open(path, O_WRONLY);
+
+    cout << "corrupting journal" << std::endl;
+    j.open(0);
+    j.corrupt_footer_magic(fd, 2);
+
+    uint64_t seq = 0;
+    bl.clear();
+    bool corrupt = false;
+    bool result = j.read_entry(bl, seq, &corrupt);
+    ASSERT_TRUE(result);
+    ASSERT_EQ(seq, 1UL);
+    ASSERT_FALSE(corrupt);
+
+    result = j.read_entry(bl, seq, &corrupt);
+    ASSERT_FALSE(result);
+    ASSERT_TRUE(corrupt);
+
+    j.make_writeable();
+    j.close();
+    ::close(fd);
   }
-  gb.activate();
-  wait();
-
-  bufferlist bl;
-  bl.append("needle");
-  j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
-  wait();
-
-  j.close();
-  int fd = open(path, O_WRONLY);
-
-  cout << "corrupting journal" << std::endl;
-  j.open(0);
-  j.corrupt_footer_magic(fd, 2);
-
-  uint64_t seq = 0;
-  bl.clear();
-  bool corrupt = false;
-  bool result = j.read_entry(bl, seq, &corrupt);
-  ASSERT_TRUE(result);
-  ASSERT_EQ(seq, 1UL);
-  ASSERT_FALSE(corrupt);
-
-  result = j.read_entry(bl, seq, &corrupt);
-  ASSERT_FALSE(result);
-  ASSERT_TRUE(corrupt);
-
-  j.make_writeable();
-  j.close();
-  ::close(fd);
 }
 
 TEST(TestFileJournal, ReplayDetectCorruptPayload) {
@@ -460,49 +532,53 @@ TEST(TestFileJournal, ReplayDetectCorruptPayload) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
   g_ceph_context->_conf->apply_changes(NULL);
 
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
-
-  C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+    const char *needle =    "i am a needle";
+    for (unsigned i = 1; i <= 4; ++i) {
+      bufferlist bl;
+      bl.append(needle);
+      j.submit_entry(i, bl, 0, gb.new_sub());
+    }
+    gb.activate();
+    wait();
 
-  const char *needle =    "i am a needle";
-  for (unsigned i = 1; i <= 4; ++i) {
     bufferlist bl;
-    bl.append(needle);
-    j.submit_entry(i, bl, 0, gb.new_sub());
+    bl.append("needle");
+    j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    wait();
+
+    j.close();
+    int fd = open(path, O_WRONLY);
+
+    cout << "corrupting journal" << std::endl;
+    j.open(0);
+    j.corrupt_payload(fd, 2);
+
+    uint64_t seq = 0;
+    bl.clear();
+    bool corrupt = false;
+    bool result = j.read_entry(bl, seq, &corrupt);
+    ASSERT_TRUE(result);
+    ASSERT_EQ(seq, 1UL);
+    ASSERT_FALSE(corrupt);
+
+    result = j.read_entry(bl, seq, &corrupt);
+    ASSERT_FALSE(result);
+    ASSERT_TRUE(corrupt);
+
+    j.make_writeable();
+    j.close();
+    ::close(fd);
   }
-  gb.activate();
-  wait();
-
-  bufferlist bl;
-  bl.append("needle");
-  j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
-  wait();
-
-  j.close();
-  int fd = open(path, O_WRONLY);
-
-  cout << "corrupting journal" << std::endl;
-  j.open(0);
-  j.corrupt_payload(fd, 2);
-
-  uint64_t seq = 0;
-  bl.clear();
-  bool corrupt = false;
-  bool result = j.read_entry(bl, seq, &corrupt);
-  ASSERT_TRUE(result);
-  ASSERT_EQ(seq, 1UL);
-  ASSERT_FALSE(corrupt);
-
-  result = j.read_entry(bl, seq, &corrupt);
-  ASSERT_FALSE(result);
-  ASSERT_TRUE(corrupt);
-
-  j.make_writeable();
-  j.close();
-  ::close(fd);
 }
 
 TEST(TestFileJournal, ReplayDetectCorruptHeader) {
@@ -510,47 +586,51 @@ TEST(TestFileJournal, ReplayDetectCorruptHeader) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
   g_ceph_context->_conf->apply_changes(NULL);
 
-  fsid.generate_random();
-  FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
-  ASSERT_EQ(0, j.create());
-  j.make_writeable();
-
-  C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+  for (unsigned i = 0 ; i < 3; ++i) {
+    SCOPED_TRACE(subtests[i].description);
+    fsid.generate_random();
+    FileJournal j(fsid, finisher, &sync_cond, path, subtests[i].directio,
+		  subtests[i].aio, subtests[i].faio);
+    ASSERT_EQ(0, j.create());
+    j.make_writeable();
+
+    C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
+
+    const char *needle =    "i am a needle";
+    for (unsigned i = 1; i <= 4; ++i) {
+      bufferlist bl;
+      bl.append(needle);
+      j.submit_entry(i, bl, 0, gb.new_sub());
+    }
+    gb.activate();
+    wait();
 
-  const char *needle =    "i am a needle";
-  for (unsigned i = 1; i <= 4; ++i) {
     bufferlist bl;
-    bl.append(needle);
-    j.submit_entry(i, bl, 0, gb.new_sub());
+    bl.append("needle");
+    j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    wait();
+
+    j.close();
+    int fd = open(path, O_WRONLY);
+
+    cout << "corrupting journal" << std::endl;
+    j.open(0);
+    j.corrupt_header_magic(fd, 2);
+
+    uint64_t seq = 0;
+    bl.clear();
+    bool corrupt = false;
+    bool result = j.read_entry(bl, seq, &corrupt);
+    ASSERT_TRUE(result);
+    ASSERT_EQ(seq, 1UL);
+    ASSERT_FALSE(corrupt);
+
+    result = j.read_entry(bl, seq, &corrupt);
+    ASSERT_FALSE(result);
+    ASSERT_TRUE(corrupt);
+
+    j.make_writeable();
+    j.close();
+    ::close(fd);
   }
-  gb.activate();
-  wait();
-
-  bufferlist bl;
-  bl.append("needle");
-  j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
-  wait();
-
-  j.close();
-  int fd = open(path, O_WRONLY);
-
-  cout << "corrupting journal" << std::endl;
-  j.open(0);
-  j.corrupt_header_magic(fd, 2);
-
-  uint64_t seq = 0;
-  bl.clear();
-  bool corrupt = false;
-  bool result = j.read_entry(bl, seq, &corrupt);
-  ASSERT_TRUE(result);
-  ASSERT_EQ(seq, 1UL);
-  ASSERT_FALSE(corrupt);
-
-  result = j.read_entry(bl, seq, &corrupt);
-  ASSERT_FALSE(result);
-  ASSERT_TRUE(corrupt);
-
-  j.make_writeable();
-  j.close();
-  ::close(fd);
 }
diff --git a/src/test/test_get_blkdev_size.cc b/src/test/test_get_blkdev_size.cc
index ba28f1c..db19c79 100644
--- a/src/test/test_get_blkdev_size.cc
+++ b/src/test/test_get_blkdev_size.cc
@@ -5,6 +5,7 @@
 #include <inttypes.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include "include/uuid.h"
 #include "common/blkdev.h"
 
 int main(int argc, char **argv)
diff --git a/src/test/test_objectstore_memstore.sh b/src/test/test_objectstore_memstore.sh
new file mode 100755
index 0000000..5296fd3
--- /dev/null
+++ b/src/test/test_objectstore_memstore.sh
@@ -0,0 +1,5 @@
+#!/bin/sh -ex
+
+./ceph_test_objectstore --gtest_filter=\*/0
+
+echo OK
diff --git a/src/test/test_pageset.cc b/src/test/test_pageset.cc
new file mode 100644
index 0000000..c105af7
--- /dev/null
+++ b/src/test/test_pageset.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "gtest/gtest.h"
+
+#include "os/PageSet.h"
+
+TEST(PageSet, AllocAligned)
+{
+  PageSet pages(1);
+  PageSet::page_vector range;
+
+  pages.alloc_range(0, 4, range);
+  ASSERT_EQ(4u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(1u, range[1]->offset);
+  ASSERT_EQ(2u, range[2]->offset);
+  ASSERT_EQ(3u, range[3]->offset);
+}
+
+TEST(PageSet, AllocUnaligned)
+{
+  PageSet pages(2);
+  PageSet::page_vector range;
+
+  // front of first page
+  pages.alloc_range(0, 1, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  range.clear();
+
+  // back of first page
+  pages.alloc_range(1, 1, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  range.clear();
+
+  // back of first page and front of second
+  pages.alloc_range(1, 2, range);
+  ASSERT_EQ(2u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  range.clear();
+
+  // back of first page and all of second
+  pages.alloc_range(1, 3, range);
+  ASSERT_EQ(2u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  range.clear();
+
+  // back of first page, all of second, and front of third
+  pages.alloc_range(1, 4, range);
+  ASSERT_EQ(3u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  ASSERT_EQ(4u, range[2]->offset);
+}
+
+TEST(PageSet, GetAligned)
+{
+  // allocate 4 pages
+  PageSet pages(1);
+  PageSet::page_vector range;
+  pages.alloc_range(0, 4, range);
+  range.clear();
+
+  // get first page
+  pages.get_range(0, 1, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  range.clear();
+
+  // get second and third pages
+  pages.get_range(1, 2, range);
+  ASSERT_EQ(2u, range.size());
+  ASSERT_EQ(1u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  range.clear();
+
+  // get all four pages
+  pages.get_range(0, 4, range);
+  ASSERT_EQ(4u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(1u, range[1]->offset);
+  ASSERT_EQ(2u, range[2]->offset);
+  ASSERT_EQ(3u, range[3]->offset);
+  range.clear();
+}
+
+TEST(PageSet, GetUnaligned)
+{
+  // allocate 3 pages
+  PageSet pages(2);
+  PageSet::page_vector range;
+  pages.alloc_range(0, 6, range);
+  range.clear();
+
+  // front of first page
+  pages.get_range(0, 1, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  range.clear();
+
+  // back of first page
+  pages.get_range(1, 1, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  range.clear();
+
+  // back of first page and front of second
+  pages.get_range(1, 2, range);
+  ASSERT_EQ(2u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  range.clear();
+
+  // back of first page and all of second
+  pages.get_range(1, 3, range);
+  ASSERT_EQ(2u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  range.clear();
+
+  // back of first page, all of second, and front of third
+  pages.get_range(1, 4, range);
+  ASSERT_EQ(3u, range.size());
+  ASSERT_EQ(0u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  ASSERT_EQ(4u, range[2]->offset);
+  range.clear();
+
+  // back of third page with nothing beyond
+  pages.get_range(5, 999, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(4u, range[0]->offset);
+  range.clear();
+}
+
+TEST(PageSet, GetHoles)
+{
+  // allocate pages at offsets 1, 2, 5, and 7
+  PageSet pages(1);
+  PageSet::page_vector range;
+  for (uint64_t i : {1, 2, 5, 7})
+    pages.alloc_range(i, 1, range);
+  range.clear();
+
+  // nothing at offset 0, page at offset 1
+  pages.get_range(0, 2, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(1u, range[0]->offset);
+  range.clear();
+
+  // nothing at offset 0, pages at offset 1 and 2, nothing at offset 3
+  pages.get_range(0, 4, range);
+  ASSERT_EQ(2u, range.size());
+  ASSERT_EQ(1u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  range.clear();
+
+  // page at offset 2, nothing at offset 3 or 4
+  pages.get_range(2, 3, range);
+  ASSERT_EQ(1u, range.size());
+  ASSERT_EQ(2u, range[0]->offset);
+  range.clear();
+
+  // get the full range
+  pages.get_range(0, 999, range);
+  ASSERT_EQ(4u, range.size());
+  ASSERT_EQ(1u, range[0]->offset);
+  ASSERT_EQ(2u, range[1]->offset);
+  ASSERT_EQ(5u, range[2]->offset);
+  ASSERT_EQ(7u, range[3]->offset);
+  range.clear();
+}
+
+TEST(PageSet, FreeAligned)
+{
+  // allocate 4 pages
+  PageSet pages(1);
+  PageSet::page_vector range;
+  pages.alloc_range(0, 4, range);
+  range.clear();
+
+  // get the full range
+  pages.get_range(0, 4, range);
+  ASSERT_EQ(4u, range.size());
+  range.clear();
+
+  // free after offset 4 has no effect
+  pages.free_pages_after(4);
+  pages.get_range(0, 4, range);
+  ASSERT_EQ(4u, range.size());
+  range.clear();
+
+  // free page 4
+  pages.free_pages_after(3);
+  pages.get_range(0, 4, range);
+  ASSERT_EQ(3u, range.size());
+  range.clear();
+
+  // free pages 2 and 3
+  pages.free_pages_after(1);
+  pages.get_range(0, 4, range);
+  ASSERT_EQ(1u, range.size());
+  range.clear();
+}
+
+TEST(PageSet, FreeUnaligned)
+{
+  // allocate 4 pages
+  PageSet pages(2);
+  PageSet::page_vector range;
+  pages.alloc_range(0, 8, range);
+  range.clear();
+
+  // get the full range
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(4u, range.size());
+  range.clear();
+
+  // free after offset 7 has no effect
+  pages.free_pages_after(7);
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(4u, range.size());
+  range.clear();
+
+  // free page 4
+  pages.free_pages_after(5);
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(3u, range.size());
+  range.clear();
+
+  // free pages 2 and 3
+  pages.free_pages_after(1);
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(1u, range.size());
+  range.clear();
+}
+
+TEST(PageSet, FreeHoles)
+{
+  // allocate pages at offsets 1, 2, 5, and 7
+  PageSet pages(1);
+  PageSet::page_vector range;
+  for (uint64_t i : {1, 2, 5, 7})
+    pages.alloc_range(i, 1, range);
+  range.clear();
+
+  // get the full range
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(4u, range.size());
+  range.clear();
+
+  // free page 7
+  pages.free_pages_after(6);
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(3u, range.size());
+  range.clear();
+
+  // free page 5
+  pages.free_pages_after(3);
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(2u, range.size());
+  range.clear();
+
+  // free pages 1 and 2
+  pages.free_pages_after(0);
+  pages.get_range(0, 8, range);
+  ASSERT_EQ(0u, range.size());
+}
diff --git a/src/test/test_rbd_replay.cc b/src/test/test_rbd_replay.cc
index 633c5fb..2414687 100644
--- a/src/test/test_rbd_replay.cc
+++ b/src/test/test_rbd_replay.cc
@@ -17,11 +17,9 @@
 #include <stdint.h>
 #include <boost/foreach.hpp>
 #include <cstdarg>
-#include "rbd_replay/Deser.hpp"
 #include "rbd_replay/ImageNameMap.hpp"
 #include "rbd_replay/ios.hpp"
 #include "rbd_replay/rbd_loc.hpp"
-#include "rbd_replay/Ser.hpp"
 
 
 using namespace rbd_replay;
@@ -38,41 +36,6 @@ static void add_mapping(ImageNameMap *map, std::string mapping_string) {
   map->add_mapping(mapping);
 }
 
-TEST(RBDReplay, Ser) {
-  std::ostringstream oss;
-  rbd_replay::Ser ser(oss);
-  ser.write_uint32_t(0x01020304u);
-  ser.write_string("hello");
-  ser.write_bool(true);
-  ser.write_bool(false);
-  std::string s(oss.str());
-  const char* data = s.data();
-  size_t size = s.size();
-  ASSERT_EQ(15U, size);
-  const char expected_data[] = {1, 2, 3, 4, 0, 0, 0, 5, 'h', 'e', 'l', 'l', 'o', 1, 0};
-  for (size_t i = 0; i < size; i++) {
-    EXPECT_EQ(expected_data[i], data[i]);
-  }
-}
-
-TEST(RBDReplay, Deser) {
-  const char data[] = {1, 2, 3, 4, 0, 0, 0, 5, 'h', 'e', 'l', 'l', 'o', 1, 0};
-  const std::string s(data, sizeof(data));
-  std::istringstream iss(s);
-  rbd_replay::Deser deser(iss);
-  EXPECT_FALSE(deser.eof());
-  EXPECT_EQ(0x01020304u, deser.read_uint32_t());
-  EXPECT_FALSE(deser.eof());
-  EXPECT_EQ("hello", deser.read_string());
-  EXPECT_FALSE(deser.eof());
-  EXPECT_TRUE(deser.read_bool());
-  EXPECT_FALSE(deser.eof());
-  EXPECT_FALSE(deser.read_bool());
-  EXPECT_FALSE(deser.eof());
-  deser.read_uint8_t();
-  EXPECT_TRUE(deser.eof());
-}
-
 TEST(RBDReplay, ImageNameMap) {
   ImageNameMap m;
   add_mapping(&m, "x at y=y at x");
@@ -169,57 +132,3 @@ TEST(RBDReplay, rbd_loc_parse) {
   EXPECT_FALSE(m.parse("a at b/c"));
 }
 
-static IO::ptr mkio(action_id_t ionum, int num_expected, ...) {
-  IO::ptr io(new StartThreadIO(ionum, ionum, 0));
-
-  va_list ap;
-  va_start(ap, num_expected);
-  for (int i = 0; i < num_expected ; i++) {
-    IO::ptr* dep = va_arg(ap, IO::ptr*);
-    if (!dep) {
-      break;
-    }
-    io->dependencies().insert(*dep);
-  }
-  va_end(ap);
-
-  return io;
-}
-
-TEST(RBDReplay, batch_unreachable_from) {
-  io_set_t deps;
-  io_set_t base;
-  io_set_t unreachable;
-  IO::ptr io1(mkio(1, 0));
-  IO::ptr io2(mkio(2, 1, &io1));
-  IO::ptr io3(mkio(3, 1, &io2));
-  IO::ptr io4(mkio(4, 1, &io1));
-  IO::ptr io5(mkio(5, 2, &io2, &io4));
-  IO::ptr io6(mkio(6, 2, &io3, &io5));
-  IO::ptr io7(mkio(7, 1, &io4));
-  IO::ptr io8(mkio(8, 2, &io5, &io7));
-  IO::ptr io9(mkio(9, 2, &io6, &io8));
-  // 1 (deps) <-- 2 (deps) <-- 3 (deps)
-  // ^            ^            ^
-  // |            |            |
-  // 4 <--------- 5 (base) <-- 6 (deps)
-  // ^            ^            ^
-  // |            |            |
-  // 7 <--------- 8 <--------- 9
-  deps.insert(io1);
-  deps.insert(io2);
-  deps.insert(io3);
-  deps.insert(io6);
-  base.insert(io5);
-  // Anything in 'deps' which is not reachable from 'base' is added to 'unreachable'
-  batch_unreachable_from(deps, base, &unreachable);
-  EXPECT_EQ(0U, unreachable.count(io1));
-  EXPECT_EQ(0U, unreachable.count(io2));
-  EXPECT_EQ(1U, unreachable.count(io3));
-  EXPECT_EQ(0U, unreachable.count(io4));
-  EXPECT_EQ(0U, unreachable.count(io5));
-  EXPECT_EQ(1U, unreachable.count(io6));
-  EXPECT_EQ(0U, unreachable.count(io7));
-  EXPECT_EQ(0U, unreachable.count(io8));
-  EXPECT_EQ(0U, unreachable.count(io9));
-}
diff --git a/src/test/test_rgw_admin_log.cc b/src/test/test_rgw_admin_log.cc
index 7ffea97..46b69eb 100644
--- a/src/test/test_rgw_admin_log.cc
+++ b/src/test/test_rgw_admin_log.cc
@@ -466,7 +466,7 @@ int caps_rm(const char * name, const char *perm) {
 }
 
 static int create_bucket(void){
-  g_test->send_request(string("PUT"), string("/"TEST_BUCKET_NAME));
+  g_test->send_request(string("PUT"), string("/" TEST_BUCKET_NAME));
   if(g_test->get_resp_code() != 200U){
     cout << "Error creating bucket, http code " << g_test->get_resp_code();
     return -1;
@@ -475,7 +475,7 @@ static int create_bucket(void){
 }
 
 static int delete_bucket(void){
-  g_test->send_request(string("DELETE"), string("/"TEST_BUCKET_NAME));
+  g_test->send_request(string("DELETE"), string("/" TEST_BUCKET_NAME));
   if(g_test->get_resp_code() != 204U){
     cout << "Error deleting bucket, http code " << g_test->get_resp_code();
     return -1;
@@ -495,7 +495,7 @@ size_t read_bucket_object(void *ptr, size_t s, size_t n, void *ud) {
 }
 
 static int put_bucket_obj(const char *obj_name, char *data, unsigned len) {
-  string req = "/"TEST_BUCKET_NAME"/";
+  string req = "/" TEST_BUCKET_NAME"/";
   req.append(obj_name);
   g_test->send_request(string("PUT"), req,
                        read_bucket_object, (void *)data, (size_t)len);
@@ -507,7 +507,7 @@ static int put_bucket_obj(const char *obj_name, char *data, unsigned len) {
 }
 
 static int read_bucket_obj(const char *obj_name) {
-  string req = "/"TEST_BUCKET_NAME"/";
+  string req = "/" TEST_BUCKET_NAME"/";
   req.append(obj_name);
   g_test->send_request(string("GET"), req);
   if (g_test->get_resp_code() != 200U) {
@@ -518,7 +518,7 @@ static int read_bucket_obj(const char *obj_name) {
 }
 
 static int delete_obj(const char *obj_name) {
-  string req = "/"TEST_BUCKET_NAME"/";
+  string req = "/" TEST_BUCKET_NAME"/";
   req.append(obj_name);
   g_test->send_request(string("DELETE"), req);
   if (g_test->get_resp_code() != 204U) {
@@ -1403,7 +1403,7 @@ TEST(TestRGWAdmin, bilog_list) {
   EXPECT_EQ(put_bucket_obj(TEST_BUCKET_OBJECT, bucket_obj, TEST_BUCKET_OBJECT_SIZE), 0);
   free(bucket_obj);
   
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("GET"), rest_req);
   EXPECT_EQ(200U, g_test->get_resp_code());
   list<cls_bilog_entry> entries;
@@ -1433,7 +1433,7 @@ TEST(TestRGWAdmin, bilog_list) {
   EXPECT_EQ(put_bucket_obj(TEST_BUCKET_OBJECT_1, bucket_obj, TEST_BUCKET_OBJECT_SIZE), 0);
   free(bucket_obj);
   
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("GET"), rest_req);
   EXPECT_EQ(200U, g_test->get_resp_code());
   entries.clear();
@@ -1455,7 +1455,7 @@ TEST(TestRGWAdmin, bilog_list) {
   }
 
   ASSERT_EQ(0, delete_obj(TEST_BUCKET_OBJECT));
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("GET"), rest_req);
   EXPECT_EQ(200U, g_test->get_resp_code());
   entries.clear();
@@ -1479,7 +1479,7 @@ TEST(TestRGWAdmin, bilog_list) {
     EXPECT_EQ(it->index_ver, 6U);
   }
 
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   rest_req.append("&marker=");
   rest_req.append(marker);
   g_test->send_request(string("GET"), rest_req);
@@ -1495,7 +1495,7 @@ TEST(TestRGWAdmin, bilog_list) {
     EXPECT_EQ(it->op.compare("del"), 0);
   }
 
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   rest_req.append("&marker=");
   rest_req.append(marker);
   rest_req.append("&max-entries=1");
@@ -1509,14 +1509,14 @@ TEST(TestRGWAdmin, bilog_list) {
   ASSERT_EQ(0, caps_rm(cname, perm));
   perm = "read";
   ASSERT_EQ(0, caps_add(cname, perm));
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("GET"), rest_req);
   EXPECT_EQ(200U, g_test->get_resp_code());
 
   ASSERT_EQ(0, caps_rm(cname, perm));
   perm = "write";
   ASSERT_EQ(0, caps_add(cname, perm));
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("GET"), rest_req);
   EXPECT_EQ(403U, g_test->get_resp_code());
 
@@ -1535,7 +1535,7 @@ TEST(TestRGWAdmin, bilog_trim) {
 
   ASSERT_EQ(0, create_bucket());
 
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("DELETE"), rest_req);
   EXPECT_EQ(400U, g_test->get_resp_code()); /*Bad request*/
 
@@ -1544,7 +1544,7 @@ TEST(TestRGWAdmin, bilog_trim) {
   EXPECT_EQ(put_bucket_obj(TEST_BUCKET_OBJECT, bucket_obj, TEST_BUCKET_OBJECT_SIZE), 0);
   free(bucket_obj);
   
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("GET"), rest_req);
   EXPECT_EQ(200U, g_test->get_resp_code());
   list<cls_bilog_entry> entries;
@@ -1556,7 +1556,7 @@ TEST(TestRGWAdmin, bilog_trim) {
   ++it;
   end_marker = it->op_id;
 
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   rest_req.append("&start-marker=");
   rest_req.append(start_marker);
   rest_req.append("&end-marker=");
@@ -1564,7 +1564,7 @@ TEST(TestRGWAdmin, bilog_trim) {
   g_test->send_request(string("DELETE"), rest_req);
   EXPECT_EQ(200U, g_test->get_resp_code());
 
-  rest_req = "/admin/log?type=bucket-index&bucket="TEST_BUCKET_NAME;
+  rest_req = "/admin/log?type=bucket-index&bucket=" TEST_BUCKET_NAME;
   g_test->send_request(string("GET"), rest_req);
   EXPECT_EQ(200U, g_test->get_resp_code());
   entries.clear();
diff --git a/src/test/test_rgw_admin_meta.cc b/src/test/test_rgw_admin_meta.cc
index 74bc8ed..5b0d6a6 100644
--- a/src/test/test_rgw_admin_meta.cc
+++ b/src/test/test_rgw_admin_meta.cc
@@ -82,7 +82,7 @@ class test_helper {
     string *resp_data;
     unsigned resp_code;
   public:
-    test_helper() : resp_data(NULL){
+    test_helper() : curl_inst(0), resp_data(NULL), resp_code(0) {
       curl_global_init(CURL_GLOBAL_ALL);
     }
     ~test_helper(){
@@ -796,7 +796,7 @@ TEST(TestRGWAdmin, meta_lock_unlock) {
   ASSERT_EQ(0, user_create(uid, display_name));
   ASSERT_EQ(0, meta_caps_add(perm));
 
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(400U, g_test->get_resp_code()); /*Bad request*/
   
@@ -804,7 +804,7 @@ TEST(TestRGWAdmin, meta_lock_unlock) {
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(400U, g_test->get_resp_code()); /*Bad request*/
 
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&unlock";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&unlock";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(400U, g_test->get_resp_code()); /*Bad request*/
 
@@ -812,72 +812,72 @@ TEST(TestRGWAdmin, meta_lock_unlock) {
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(400U, g_test->get_resp_code()); /*Bad request*/
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&unlock&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&unlock&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph1";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph1";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&unlock&lock_id=ceph1";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&unlock&lock_id=ceph1";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   utime_t sleep_time(3, 0);
 
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph1";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph1";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(500U, g_test->get_resp_code()); 
 
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(409U, g_test->get_resp_code()); 
   sleep_time.sleep();
 
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph1";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph1";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&unlock&lock_id=ceph1";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&unlock&lock_id=ceph1";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
 
   ASSERT_EQ(0, meta_caps_rm(perm));
   perm = "read";
   ASSERT_EQ(0, meta_caps_add(perm));
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(403U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&unlock&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&unlock&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(403U, g_test->get_resp_code()); 
   
   ASSERT_EQ(0, meta_caps_rm(perm));
   perm = "write";
   ASSERT_EQ(0, meta_caps_add(perm));
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&unlock&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&unlock&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(200U, g_test->get_resp_code()); 
   
   ASSERT_EQ(0, meta_caps_rm(perm));
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&lock&length=3&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&lock&length=3&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(403U, g_test->get_resp_code()); 
   
-  rest_req = "/admin/metadata/user?key="CEPH_UID"&unlock&lock_id=ceph";
+  rest_req = "/admin/metadata/user?key=" CEPH_UID "&unlock&lock_id=ceph";
   g_test->send_request(string("POST"), rest_req, read_dummy_post, NULL, sizeof(int));
   EXPECT_EQ(403U, g_test->get_resp_code()); 
   
diff --git a/src/test/test_rgw_admin_opstate.cc b/src/test/test_rgw_admin_opstate.cc
index 5687225..a9a65f5 100644
--- a/src/test/test_rgw_admin_opstate.cc
+++ b/src/test/test_rgw_admin_opstate.cc
@@ -85,7 +85,7 @@ class test_helper {
     string *resp_data;
     unsigned resp_code;
   public:
-    test_helper() : resp_data(NULL){
+    test_helper() : curl_inst(NULL), resp_data(NULL), resp_code(0) {
       curl_global_init(CURL_GLOBAL_ALL);
     }
     ~test_helper(){
diff --git a/src/test/test_snap_mapper.cc b/src/test/test_snap_mapper.cc
index 7f0fae3..873e88c 100644
--- a/src/test/test_snap_mapper.cc
+++ b/src/test/test_snap_mapper.cc
@@ -440,8 +440,8 @@ TEST_F(MapCacherTest, Random)
 class MapperVerifier {
   PausyAsyncMap *driver;
   boost::scoped_ptr< SnapMapper > mapper;
-  map<snapid_t, set<hobject_t> > snap_to_hobject;
-  map<hobject_t, set<snapid_t> > hobject_to_snap;
+  map<snapid_t, set<hobject_t, hobject_t::BitwiseComparator> > snap_to_hobject;
+  map<hobject_t, set<snapid_t>, hobject_t::BitwiseComparator> hobject_to_snap;
   snapid_t next;
   uint32_t mask;
   uint32_t bits;
@@ -493,7 +493,7 @@ public:
     for (set<snapid_t>::iterator i = snaps.begin();
 	 i != snaps.end();
 	 ++i) {
-      map<snapid_t, set<hobject_t> >::iterator j = snap_to_hobject.find(*i);
+      map<snapid_t, set<hobject_t, hobject_t::BitwiseComparator> >::iterator j = snap_to_hobject.find(*i);
       assert(j != snap_to_hobject.end());
       j->second.insert(obj);
     }
@@ -508,9 +508,9 @@ public:
     Mutex::Locker l(lock);
     if (snap_to_hobject.empty())
       return;
-    map<snapid_t, set<hobject_t> >::iterator snap =
+    map<snapid_t, set<hobject_t, hobject_t::BitwiseComparator> >::iterator snap =
       rand_choose(snap_to_hobject);
-    set<hobject_t> hobjects = snap->second;
+    set<hobject_t, hobject_t::BitwiseComparator> hobjects = snap->second;
 
     hobject_t hoid;
     while (mapper->get_next_object_to_trim(snap->first, &hoid) == 0) {
@@ -518,7 +518,7 @@ public:
       assert(hobjects.count(hoid));
       hobjects.erase(hoid);
 
-      map<hobject_t, set<snapid_t> >::iterator j =
+      map<hobject_t, set<snapid_t>, hobject_t::BitwiseComparator>::iterator j =
 	hobject_to_snap.find(hoid);
       assert(j->second.count(snap->first));
       set<snapid_t> old_snaps(j->second);
@@ -547,12 +547,12 @@ public:
     Mutex::Locker l(lock);
     if (hobject_to_snap.empty())
       return;
-    map<hobject_t, set<snapid_t> >::iterator obj =
+    map<hobject_t, set<snapid_t>, hobject_t::BitwiseComparator>::iterator obj =
       rand_choose(hobject_to_snap);
     for (set<snapid_t>::iterator i = obj->second.begin();
 	 i != obj->second.end();
 	 ++i) {
-      map<snapid_t, set<hobject_t> >::iterator j =
+      map<snapid_t, set<hobject_t, hobject_t::BitwiseComparator> >::iterator j =
 	snap_to_hobject.find(*i);
       assert(j->second.count(obj->first));
       j->second.erase(obj->first);
@@ -571,7 +571,7 @@ public:
     Mutex::Locker l(lock);
     if (hobject_to_snap.empty())
       return;
-    map<hobject_t, set<snapid_t> >::iterator obj =
+    map<hobject_t, set<snapid_t>, hobject_t::BitwiseComparator>::iterator obj =
       rand_choose(hobject_to_snap);
     set<snapid_t> snaps;
     int r = mapper->get_snaps(obj->first, &snaps);
diff --git a/src/test/test_stress_watch.cc b/src/test/test_stress_watch.cc
index 1f9bed9..9e66f0e 100644
--- a/src/test/test_stress_watch.cc
+++ b/src/test/test_stress_watch.cc
@@ -23,7 +23,7 @@ using std::map;
 using std::ostringstream;
 using std::string;
 
-static sem_t sem;
+static sem_t *sem;
 static atomic_t stop_flag;
 
 class WatchNotifyTestCtx : public WatchCtx
@@ -31,7 +31,7 @@ class WatchNotifyTestCtx : public WatchCtx
 public:
     void notify(uint8_t opcode, uint64_t ver, bufferlist& bl)
     {
-      sem_post(&sem);
+      sem_post(sem);
     }
 };
 
@@ -68,7 +68,7 @@ INSTANTIATE_TEST_CASE_P(WatchStressTests, WatchStress,
 			::testing::Values("", "cache"));
 
 TEST_P(WatchStress, Stress1) {
-  ASSERT_EQ(0, sem_init(&sem, 0, 0));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_stress_watch", O_CREAT, 0644, 0)));
   Rados ncluster;
   std::string pool_name = get_temp_pool_name();
   ASSERT_EQ("", create_one_pool_pp(pool_name, ncluster));
@@ -105,7 +105,7 @@ TEST_P(WatchStress, Stress1) {
       sleep(1); // Give a change to see an incorrect notify
     } else {
       TestAlarm alarm;
-      sem_wait(&sem);
+      sem_wait(sem);
     }
 
     if (do_blacklist) {
@@ -119,7 +119,7 @@ TEST_P(WatchStress, Stress1) {
   thr->join();
   nioctx.close();
   ASSERT_EQ(0, destroy_one_pool_pp(pool_name, ncluster));
-  sem_destroy(&sem);
+  sem_close(sem);
 }
 
 #pragma GCC diagnostic pop
diff --git a/src/test/test_subprocess.cc b/src/test/test_subprocess.cc
new file mode 100644
index 0000000..c07538b
--- /dev/null
+++ b/src/test/test_subprocess.cc
@@ -0,0 +1,268 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 Mirantis Inc
+ *
+ * Author: Mykola Golub <mgolub at mirantis.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+
+#include <iostream>
+
+#include "common/SubProcess.h"
+#include "common/safe_io.h"
+#include "gtest/gtest.h"
+
+bool read_from_fd(int fd, std::string &out) {
+  out.clear();
+  char buf[1024];
+  ssize_t n = safe_read(fd, buf, sizeof(buf) - 1);
+  if (n < 0)
+    return false;
+  buf[n] = '\0';
+  out = buf;
+  return true;
+}
+
+TEST(SubProcess, True)
+{
+  SubProcess p("true");
+  ASSERT_EQ(p.spawn(), 0);
+  ASSERT_EQ(p.join(), 0);
+  ASSERT_TRUE(p.err()[0] == '\0');
+}
+
+TEST(SubProcess, False)
+{
+  SubProcess p("false");
+  ASSERT_EQ(p.spawn(), 0);
+  ASSERT_EQ(p.join(), 1);
+  ASSERT_FALSE(p.err()[0] == '\0');
+}
+
+TEST(SubProcess, NotFound)
+{
+  SubProcess p("NOTEXISTENTBINARY", false, false, true);
+  ASSERT_EQ(p.spawn(), 0);
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(p.stderr(), buf));
+  std::cerr << "stderr: " << buf;
+  ASSERT_EQ(p.join(), 1);
+  std::cerr << "err: " << p.err() << std::endl;
+  ASSERT_FALSE(p.err()[0] == '\0');
+}
+
+TEST(SubProcess, Echo)
+{
+  SubProcess echo("echo", false, true);
+  echo.add_cmd_args("1", "2", "3", NULL);
+
+  ASSERT_EQ(echo.spawn(), 0);
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(echo.stdout(), buf));
+  std::cerr << "stdout: " << buf;
+  ASSERT_EQ(buf, "1 2 3\n");
+  ASSERT_EQ(echo.join(), 0);
+  ASSERT_TRUE(echo.err()[0] == '\0');
+}
+
+TEST(SubProcess, Cat)
+{
+  SubProcess cat("cat", true, true, true);
+
+  ASSERT_EQ(cat.spawn(), 0);
+  std::string msg("to my, trociny!");
+  int n = write(cat.stdin(), msg.c_str(), msg.size());
+  ASSERT_EQ(n, (int)msg.size());
+  cat.close_stdin();
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  std::cerr << "stdout: " << buf << std::endl;
+  ASSERT_EQ(buf, msg);
+  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  ASSERT_EQ(buf, "");
+  ASSERT_EQ(cat.join(), 0);
+  ASSERT_TRUE(cat.err()[0] == '\0');
+}
+
+TEST(SubProcess, CatDevNull)
+{
+  SubProcess cat("cat", true, true, true);
+  cat.add_cmd_arg("/dev/null");
+
+  ASSERT_EQ(cat.spawn(), 0);
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  ASSERT_EQ(buf, "");
+  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  ASSERT_EQ(buf, "");
+  ASSERT_EQ(cat.join(), 0);
+  ASSERT_TRUE(cat.err()[0] == '\0');
+}
+
+TEST(SubProcess, Killed)
+{
+  SubProcessTimed cat("cat", true, true);
+
+  ASSERT_EQ(cat.spawn(), 0);
+  cat.kill();
+  ASSERT_EQ(cat.join(), 128 + SIGTERM);
+  std::cerr << "err: " << cat.err() << std::endl;
+  ASSERT_FALSE(cat.err()[0] == '\0');
+}
+
+TEST(SubProcess, CatWithArgs)
+{
+  SubProcess cat("cat", true, true, true);
+  cat.add_cmd_args("/dev/stdin", "/dev/null", "/NOTEXIST", NULL);
+
+  ASSERT_EQ(cat.spawn(), 0);
+  std::string msg("Hello, Word!");
+  int n = write(cat.stdin(), msg.c_str(), msg.size());
+  ASSERT_EQ(n, (int)msg.size());
+  cat.close_stdin();
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  std::cerr << "stdout: " << buf << std::endl;
+  ASSERT_EQ(buf, msg);
+  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  std::cerr << "stderr: " << buf;
+  ASSERT_FALSE(buf.empty());
+  ASSERT_EQ(cat.join(), 1);
+  std::cerr << "err: " << cat.err() << std::endl;
+  ASSERT_FALSE(cat.err()[0] == '\0');
+}
+
+TEST(SubProcess, Subshell)
+{
+  SubProcess sh("/bin/sh", true, true, true);
+  sh.add_cmd_args("-c",
+      "sleep 0; "
+      "cat; "
+      "echo 'error from subshell' >&2; "
+      "/bin/sh -c 'exit 13'", NULL);
+  ASSERT_EQ(sh.spawn(), 0);
+  std::string msg("hello via subshell");
+  int n = write(sh.stdin(), msg.c_str(), msg.size());
+  ASSERT_EQ(n, (int)msg.size());
+  sh.close_stdin();
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(sh.stdout(), buf));
+  std::cerr << "stdout: " << buf << std::endl;
+  ASSERT_EQ(buf, msg);
+  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  std::cerr << "stderr: " << buf;
+  ASSERT_EQ(buf, "error from subshell\n");
+  ASSERT_EQ(sh.join(), 13);
+  std::cerr << "err: " << sh.err() << std::endl;
+  ASSERT_FALSE(sh.err()[0] == '\0');
+}
+
+TEST(SubProcessTimed, True)
+{
+  SubProcessTimed p("true", false, false, false, 10);
+  ASSERT_EQ(p.spawn(), 0);
+  ASSERT_EQ(p.join(), 0);
+  ASSERT_TRUE(p.err()[0] == '\0');
+}
+
+TEST(SubProcessTimed, SleepNoTimeout)
+{
+  SubProcessTimed sleep("sleep", false, false, false, 0);
+  sleep.add_cmd_arg("1");
+
+  ASSERT_EQ(sleep.spawn(), 0);
+  ASSERT_EQ(sleep.join(), 0);
+  ASSERT_TRUE(sleep.err()[0] == '\0');
+}
+
+TEST(SubProcessTimed, Killed)
+{
+  SubProcessTimed cat("cat", true, true, true, 5);
+
+  ASSERT_EQ(cat.spawn(), 0);
+  cat.kill();
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  ASSERT_TRUE(buf.empty());
+  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  ASSERT_TRUE(buf.empty());
+  ASSERT_EQ(cat.join(), 128 + SIGTERM);
+  std::cerr << "err: " << cat.err() << std::endl;
+  ASSERT_FALSE(cat.err()[0] == '\0');
+}
+
+TEST(SubProcessTimed, SleepTimedout)
+{
+  SubProcessTimed sleep("sleep", false, false, true, 1);
+  sleep.add_cmd_arg("10");
+
+  ASSERT_EQ(sleep.spawn(), 0);
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(sleep.stderr(), buf));
+  std::cerr << "stderr: " << buf;
+  ASSERT_FALSE(buf.empty());
+  ASSERT_EQ(sleep.join(), 128 + SIGKILL);
+  std::cerr << "err: " << sleep.err() << std::endl;
+  ASSERT_FALSE(sleep.err()[0] == '\0');
+}
+
+TEST(SubProcessTimed, SubshellNoTimeout)
+{
+  SubProcessTimed sh("/bin/sh", true, true, true, 0);
+  sh.add_cmd_args("-c", "cat >&2", NULL);
+  ASSERT_EQ(sh.spawn(), 0);
+  std::string msg("the quick brown fox jumps over the lazy dog");
+  int n = write(sh.stdin(), msg.c_str(), msg.size());
+  ASSERT_EQ(n, (int)msg.size());
+  sh.close_stdin();
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(sh.stdout(), buf));
+  std::cerr << "stdout: " << buf << std::endl;
+  ASSERT_TRUE(buf.empty());
+  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  std::cerr << "stderr: " << buf << std::endl;
+  ASSERT_EQ(buf, msg);
+  ASSERT_EQ(sh.join(), 0);
+  ASSERT_TRUE(sh.err()[0] == '\0');
+}
+
+TEST(SubProcessTimed, SubshellKilled)
+{
+  SubProcessTimed sh("/bin/sh", true, true, true, 10);
+  sh.add_cmd_args("-c", "sh -c cat", NULL);
+  ASSERT_EQ(sh.spawn(), 0);
+  std::string msg("etaoin shrdlu");
+  int n = write(sh.stdin(), msg.c_str(), msg.size());
+  ASSERT_EQ(n, (int)msg.size());
+  sh.kill();
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  ASSERT_TRUE(buf.empty());
+  ASSERT_EQ(sh.join(), 128 + SIGTERM);
+  std::cerr << "err: " << sh.err() << std::endl;
+  ASSERT_FALSE(sh.err()[0] == '\0');
+}
+
+TEST(SubProcessTimed, SubshellTimedout)
+{
+  SubProcessTimed sh("/bin/sh", true, true, true, 1, SIGTERM);
+  sh.add_cmd_args("-c", "sleep 1000& cat; NEVER REACHED", NULL);
+  ASSERT_EQ(sh.spawn(), 0);
+  std::string buf;
+  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  std::cerr << "stderr: " << buf;
+  ASSERT_FALSE(buf.empty());
+  ASSERT_EQ(sh.join(), 128 + SIGTERM);
+  std::cerr << "err: " << sh.err() << std::endl;
+  ASSERT_FALSE(sh.err()[0] == '\0');
+}
diff --git a/src/test/test_trans.cc b/src/test/test_trans.cc
index 43821c1..c374ed4 100644
--- a/src/test/test_trans.cc
+++ b/src/test/test_trans.cc
@@ -55,23 +55,24 @@ int main(int argc, const char **argv)
     return -1;
   }
 
+  ObjectStore::Sequencer osr(__func__);
   ObjectStore::Transaction t;
   char buf[1 << 20];
   bufferlist bl;
   bl.append(buf, sizeof(buf));
-  t.create_collection(coll_t());
+  t.create_collection(coll_t(), 0);
 
   for (int i=0; i<mb; i++) {
     char f[30];
     snprintf(f, sizeof(f), "foo%d\n", i);
     sobject_t soid(f, CEPH_NOSNAP);
-    t.write(coll_t(), hobject_t(soid), 0, bl.length(), bl);
+    t.write(coll_t(), ghobject_t(hobject_t(soid)), 0, bl.length(), bl);
   }
   
   dout(0) << "starting thread" << dendl;
   foo.create();
   dout(0) << "starting op" << dendl;
-  fs->apply_transaction(t);
+  fs->apply_transaction(&osr, t);
 
 }
 
diff --git a/src/test/test_xlist.cc b/src/test/test_xlist.cc
new file mode 100644
index 0000000..9d5eadd
--- /dev/null
+++ b/src/test/test_xlist.cc
@@ -0,0 +1,118 @@
+#include <algorithm>
+#include <iterator>
+#include <vector>
+#include "include/xlist.h"
+
+#include "gtest/gtest.h"
+
+
+struct Item {
+  xlist<Item*>::item xitem;
+  int val;
+
+  Item(int v) :
+    xitem(this),
+    val(v)
+  {}
+};
+
+class XlistTest : public testing::Test
+{
+protected:
+  typedef xlist<Item*> ItemList;
+  typedef std::vector<Item*> Items;
+  typedef std::vector<ItemList::item*> Refs;
+  Items items;
+  // for filling up an ItemList
+  Refs refs;
+
+  virtual void SetUp() {
+    for (int i = 0; i < 13; i++) {
+      items.push_back(new Item(i));
+      refs.push_back(&items.back()->xitem);
+    }
+  }
+  virtual void TearDown() {
+    for (Items::iterator i = items.begin(); i != items.end(); ++i) {
+      delete *i;
+    }
+    items.clear();
+  }
+};
+
+TEST_F(XlistTest, capability) {
+  ItemList list;
+  ASSERT_TRUE(list.empty());
+  ASSERT_EQ(list.size(), 0);
+
+  std::copy(refs.begin(), refs.end(), std::back_inserter(list));
+  ASSERT_EQ((size_t)list.size(), refs.size());
+
+  list.clear();
+  ASSERT_TRUE(list.empty());
+  ASSERT_EQ(list.size(), 0);
+}
+
+TEST_F(XlistTest, traverse) {
+  ItemList list;
+  std::copy(refs.begin(), refs.end(), std::back_inserter(list));
+
+  // advance until iterator::end()
+  size_t index = 0;
+  for (ItemList::iterator i = list.begin(); !i.end(); ++i) {
+    ASSERT_EQ(*i, items[index]);
+    index++;
+  }
+  // advance until i == v.end()
+  index = 0;
+  for (ItemList::iterator i = list.begin(); i != list.end(); ++i) {
+    ASSERT_EQ(*i, items[index]);
+    index++;
+  }
+  list.clear();
+}
+
+TEST_F(XlistTest, move_around) {
+  Item item1(42), item2(17);
+  ItemList list;
+
+  // only a single element in the list
+  list.push_back(&item1.xitem);
+  ASSERT_EQ(&item1, list.front());
+  ASSERT_EQ(&item1, list.back());
+
+  list.push_back(&item2.xitem);
+  ASSERT_EQ(&item1, list.front());
+  ASSERT_EQ(&item2, list.back());
+
+  // move item2 to the front
+  list.push_front(&item2.xitem);
+  ASSERT_EQ(&item2, list.front());
+  ASSERT_EQ(&item1, list.back());
+
+  // and move it back
+  list.push_back(&item2.xitem);
+  ASSERT_EQ(&item1, list.front());
+  ASSERT_EQ(&item2, list.back());
+
+  list.clear();
+}
+
+TEST_F(XlistTest, item_queries) {
+  Item item(42);
+  ItemList list;
+  list.push_back(&item.xitem);
+
+  ASSERT_TRUE(item.xitem.is_on_list());
+  ASSERT_EQ(&list, item.xitem.get_list());
+
+  ASSERT_TRUE(item.xitem.remove_myself());
+  ASSERT_FALSE(item.xitem.is_on_list());
+  ASSERT_TRUE(item.xitem.get_list() == NULL);
+}
+
+// Local Variables:
+// compile-command: "cd .. ;
+//   make unittest_xlist &&
+//   ./unittest_xlist"
+// End:
diff --git a/src/test/testcrypto.cc b/src/test/testcrypto.cc
index 0b7a9d5..60f5905 100644
--- a/src/test/testcrypto.cc
+++ b/src/test/testcrypto.cc
@@ -25,8 +25,8 @@ int main(int argc, char *argv[])
 
   bufferlist enc_out;
   std::string error;
-  key.encrypt(g_ceph_context, enc_in, enc_out, error);
-  if (!error.empty()) {
+  if (key.encrypt(g_ceph_context, enc_in, enc_out, &error) < 0) {
+    assert(!error.empty());
     dout(0) << "couldn't encode! error " << error << dendl;
     exit(1);
   }
@@ -42,8 +42,8 @@ int main(int argc, char *argv[])
 
   dec_in = enc_out;
 
-  key.decrypt(g_ceph_context, dec_in, dec_out, error);
-  if (!error.empty()) {
+  if (key.decrypt(g_ceph_context, dec_in, dec_out, &error) < 0) {
+    assert(!error.empty());
     dout(0) << "couldn't decode! error " << error << dendl;
     exit(1);
   }
diff --git a/src/test/ubuntu-12.04/Dockerfile.in b/src/test/ubuntu-12.04/Dockerfile.in
index a3d05cd..ed55bad 100644
--- a/src/test/ubuntu-12.04/Dockerfile.in
+++ b/src/test/ubuntu-12.04/Dockerfile.in
@@ -27,5 +27,5 @@ RUN apt-get update
 # build dependencies
 RUN cd /root ; ./install-deps.sh
 # development tools
-RUN apt-get install -y ccache valgrind gdb python-virtualenv gdisk kpartx hdparm xmlstarlet
+RUN apt-get install -y sudo ccache valgrind gdb python-virtualenv gdisk kpartx hdparm xmlstarlet
 RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
diff --git a/src/test/ubuntu-12.04/install-deps.sh b/src/test/ubuntu-12.04/install-deps.sh
index 129b238..1bebf09 100755
--- a/src/test/ubuntu-12.04/install-deps.sh
+++ b/src/test/ubuntu-12.04/install-deps.sh
@@ -1,8 +1,8 @@
-#!/bin/bash
+#!/bin/bash -e
 #
 # Ceph distributed storage system
 #
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -23,10 +23,14 @@ if test -f /etc/redhat-release ; then
     $SUDO yum install -y redhat-lsb-core
 fi
 
-if which apt-get > /dev/null ; then
+if type apt-get > /dev/null 2>&1 ; then
     $SUDO apt-get install -y lsb-release
 fi
 
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
 case $(lsb_release -si) in
 Ubuntu|Debian|Devuan)
         $SUDO apt-get install -y dpkg-dev
@@ -38,30 +42,106 @@ Ubuntu|Debian|Devuan)
         packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
             perl -p -e 's/.*Unmet build dependencies: *//;' \
             -e 's/build-essential:native/build-essential/;' \
-            -e 's/\|//g;' \
+            -e 's/\s*\|\s*/\|/g;' \
             -e 's/\(.*?\)//g;' \
             -e 's/ +/\n/g;' | sort)
         case $(lsb_release -sc) in
             squeeze|wheezy)
                 packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
                 ;;
         esac
         packages=$(echo $packages) # change newlines into spaces
-        $SUDO bash -c "DEBIAN_FRONTEND=noninteractive apt-get install -y $packages"
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
         ;;
-CentOS|Fedora|SUSE*|RedHatEnterpriseServer)
+CentOS|Fedora|RedHatEnterpriseServer)
         case $(lsb_release -si) in
-            SUSE*)
-                $SUDO zypper -y yum-utils
+            Fedora)
+                $SUDO yum install -y yum-utils
                 ;;
-            *)
+            CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
-        $SUDO yum-builddep -y $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
         ;;
 *)
         echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
         ;;
 esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/src/test/ubuntu-14.04/install-deps.sh b/src/test/ubuntu-14.04/install-deps.sh
index 129b238..1bebf09 100755
--- a/src/test/ubuntu-14.04/install-deps.sh
+++ b/src/test/ubuntu-14.04/install-deps.sh
@@ -1,8 +1,8 @@
-#!/bin/bash
+#!/bin/bash -e
 #
 # Ceph distributed storage system
 #
-# Copyright (C) 2014 Red Hat <contact at redhat.com>
+# Copyright (C) 2014, 2015 Red Hat <contact at redhat.com>
 #
 # Author: Loic Dachary <loic at dachary.org>
 #
@@ -23,10 +23,14 @@ if test -f /etc/redhat-release ; then
     $SUDO yum install -y redhat-lsb-core
 fi
 
-if which apt-get > /dev/null ; then
+if type apt-get > /dev/null 2>&1 ; then
     $SUDO apt-get install -y lsb-release
 fi
 
+if type zypper > /dev/null 2>&1 ; then
+    $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release
+fi
+
 case $(lsb_release -si) in
 Ubuntu|Debian|Devuan)
         $SUDO apt-get install -y dpkg-dev
@@ -38,30 +42,106 @@ Ubuntu|Debian|Devuan)
         packages=$(dpkg-checkbuilddeps --admindir=$DIR debian/control 2>&1 | \
             perl -p -e 's/.*Unmet build dependencies: *//;' \
             -e 's/build-essential:native/build-essential/;' \
-            -e 's/\|//g;' \
+            -e 's/\s*\|\s*/\|/g;' \
             -e 's/\(.*?\)//g;' \
             -e 's/ +/\n/g;' | sort)
         case $(lsb_release -sc) in
             squeeze|wheezy)
                 packages=$(echo $packages | perl -pe 's/[-\w]*babeltrace[-\w]*//g')
+                backports="-t $(lsb_release -sc)-backports"
                 ;;
         esac
         packages=$(echo $packages) # change newlines into spaces
-        $SUDO bash -c "DEBIAN_FRONTEND=noninteractive apt-get install -y $packages"
+        $SUDO env DEBIAN_FRONTEND=noninteractive apt-get install $backports -y $packages || exit 1
         ;;
-CentOS|Fedora|SUSE*|RedHatEnterpriseServer)
+CentOS|Fedora|RedHatEnterpriseServer)
         case $(lsb_release -si) in
-            SUSE*)
-                $SUDO zypper -y yum-utils
+            Fedora)
+                $SUDO yum install -y yum-utils
                 ;;
-            *)
+            CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
+                MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
+                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                    $SUDO yum install subscription-manager
+                    $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
+                fi
+                $SUDO yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/$MAJOR_VERSION/x86_64/ 
+                $SUDO yum install --nogpgcheck -y epel-release
+                $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
+                $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
-        $SUDO yum-builddep -y $DIR/ceph.spec
+        $SUDO yum-builddep -y $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out
+        ! grep -q -i error: $DIR/yum-builddep.out || exit 1
+        ;;
+*SUSE*)
+        sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
+        $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1
         ;;
 *)
         echo "$(lsb_release -si) is unknown, dependencies will have to be installed manually."
         ;;
 esac
+
+function populate_wheelhouse() {
+    local install=$1
+    shift
+
+    # Ubuntu-12.04 and Python 2.7.3 require this line
+    pip --timeout 300 $install 'distribute >= 0.7.3' || return 1
+    # although pip comes with virtualenv, having a recent version
+    # of pip matters when it comes to using wheel packages
+    pip --timeout 300 $install 'setuptools >= 0.8' 'pip >= 7.0' 'wheel >= 0.24' || return 1
+    if test $# != 0 ; then
+        pip --timeout 300 $install $@ || return 1
+    fi
+}
+
+function activate_virtualenv() {
+    local top_srcdir=$1
+    local interpreter=$2
+    local env_dir=$top_srcdir/install-deps-$interpreter
+
+    if ! test -d $env_dir ; then
+        virtualenv --python $interpreter $env_dir
+        . $env_dir/bin/activate
+        if ! populate_wheelhouse install ; then
+            rm -rf $env_dir
+            return 1
+        fi
+    fi
+    . $env_dir/bin/activate
+}
+
+# use pip cache if possible but do not store it outside of the source
+# tree
+# see https://pip.pypa.io/en/stable/reference/pip_install.html#caching
+mkdir -p install-deps-cache
+top_srcdir=$(pwd)
+export XDG_CACHE_HOME=$top_srcdir/install-deps-cache
+wip_wheelhouse=wheelhouse-wip
+
+#
+# preload python modules so that tox can run without network access
+#
+find . -name tox.ini | while read ini ; do
+    (
+        cd $(dirname $ini)
+        require=$(ls *requirements.txt 2>/dev/null | sed -e 's/^/-r /')
+        if test "$require" && ! test -d wheelhouse ; then
+            for interpreter in python2.7 python3 ; do
+                type $interpreter > /dev/null 2>&1 || continue
+                activate_virtualenv $top_srcdir $interpreter || exit 1
+                populate_wheelhouse "wheel -w $wip_wheelhouse" $require || exit 1
+            done
+            mv $wip_wheelhouse wheelhouse
+        fi
+    )
+done
+
+for interpreter in python2.7 python3 ; do
+    rm -rf $top_srcdir/install-deps-$interpreter
+done
+rm -rf $XDG_CACHE_HOME
diff --git a/src/test/xattr_bench.cc b/src/test/xattr_bench.cc
index 2b8d09f..e26e32f 100644
--- a/src/test/xattr_bench.cc
+++ b/src/test/xattr_bench.cc
@@ -90,23 +90,23 @@ uint64_t do_run(ObjectStore *store, int attrsize, int numattrs,
   Mutex lock("lock");
   Cond cond;
   int in_flight = 0;
+  ObjectStore::Sequencer osr(__func__);
   ObjectStore::Transaction t;
-  map<string, pair<set<string>, ObjectStore::Sequencer*> > collections;
+  map<coll_t, pair<set<string>, ObjectStore::Sequencer*> > collections;
   for (int i = 0; i < 3*THREADS; ++i) {
-    stringstream coll_str;
-    coll_str << "coll_" << i << "_" << run;
-    t.create_collection(coll_t(coll_str.str()));
+    coll_t coll(spg_t(pg_t(0, i + 1000*run), shard_id_t::NO_SHARD));
+    t.create_collection(coll, 0);
     set<string> objects;
     for (int i = 0; i < transsize; ++i) {
       stringstream obj_str;
       obj_str << i;
-      t.touch(coll_t(coll_str.str()),
-	      hobject_t(sobject_t(obj_str.str(), CEPH_NOSNAP)));
+      t.touch(coll,
+	      ghobject_t(hobject_t(sobject_t(obj_str.str(), CEPH_NOSNAP))));
       objects.insert(obj_str.str());
     }
-    collections[coll_str.str()] = make_pair(objects, new ObjectStore::Sequencer(coll_str.str()));
+    collections[coll] = make_pair(objects, new ObjectStore::Sequencer(coll.to_str()));
   }
-  store->apply_transaction(t);
+  store->apply_transaction(&osr, t);
 
   bufferlist bl;
   for (int i = 0; i < attrsize; ++i) {
@@ -121,7 +121,7 @@ uint64_t do_run(ObjectStore *store, int attrsize, int numattrs,
 	cond.Wait(lock);
     }
     ObjectStore::Transaction *t = new ObjectStore::Transaction;
-    map<string, pair<set<string>, ObjectStore::Sequencer*> >::iterator iter =
+    map<coll_t, pair<set<string>, ObjectStore::Sequencer*> >::iterator iter =
       rand_choose(collections);
     for (set<string>::iterator obj = iter->second.first.begin();
 	 obj != iter->second.first.end();
@@ -129,8 +129,8 @@ uint64_t do_run(ObjectStore *store, int attrsize, int numattrs,
       for (int j = 0; j < numattrs; ++j) {
 	stringstream ss;
 	ss << i << ", " << j << ", " << *obj;
-	t->setattr(coll_t(iter->first),
-		   hobject_t(sobject_t(*obj, CEPH_NOSNAP)),
+	t->setattr(iter->first,
+		   ghobject_t(hobject_t(sobject_t(*obj, CEPH_NOSNAP))),
 		   ss.str().c_str(),
 		   bl);
       }
diff --git a/src/tools/Makefile-client.am b/src/tools/Makefile-client.am
index b1b43de..4cbfd5d 100644
--- a/src/tools/Makefile-client.am
+++ b/src/tools/Makefile-client.am
@@ -14,18 +14,18 @@ bin_DEBUGPROGRAMS += ceph_radosacl
 
 rados_SOURCES = \
 	tools/rados/rados.cc \
-	tools/rados/rados_import.cc \
-	tools/rados/rados_export.cc \
-	tools/rados/rados_sync.cc
+	tools/RadosDump.cc \
+	tools/rados/RadosImport.cc \
+	tools/rados/PoolDump.cc
 rados_SOURCES += common/obj_bencher.cc # needs cleanup so it can go in libcommon.la
-rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(CEPH_GLOBAL)
+rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(LIBRADOSSTRIPER) $(CEPH_GLOBAL)
 bin_PROGRAMS += rados
 
 
 if WITH_CEPHFS
 
 ceph_client_debug_SOURCES = tools/ceph-client-debug.cc
-ceph_client_debug_LDADD = $(LIBCEPHFS) $(CEPH_GLOBAL) $(LIBCOMMON)
+ceph_client_debug_LDADD = $(LIBCEPHFS) $(LIBCLIENT) $(CEPH_GLOBAL) $(LIBCOMMON)
 bin_DEBUGPROGRAMS += ceph-client-debug
 
 endif # WITH_CEPHFS
diff --git a/src/tools/Makefile-server.am b/src/tools/Makefile-server.am
index 12d22f0..599c5db 100644
--- a/src/tools/Makefile-server.am
+++ b/src/tools/Makefile-server.am
@@ -11,11 +11,15 @@ ceph_kvstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL)
 ceph_kvstore_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph-kvstore-tool
 
+if WITH_MON
+ceph_monstore_update_crushdir = $(libdir)/ceph
+ceph_monstore_update_crush_SCRIPTS = tools/ceph-monstore-update-crush.sh
+endif
 
 if WITH_OSD
 
-ceph_objectstore_tool_SOURCES = tools/ceph_objectstore_tool.cc
-ceph_objectstore_tool_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) $(BOOST_PROGRAM_OPTIONS_LIBS) $(LIBRADOS)
+ceph_objectstore_tool_SOURCES = tools/ceph_objectstore_tool.cc tools/RadosDump.cc
+ceph_objectstore_tool_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) $(BOOST_PROGRAM_OPTIONS_LIBS)
 if LINUX
 ceph_objectstore_tool_LDADD += -ldl
 endif # LINUX
@@ -47,6 +51,13 @@ cephfs_table_tool_SOURCES = \
 cephfs_table_tool_LDADD = $(LIBMDS) $(LIBRADOS) $(CEPH_GLOBAL)
 bin_PROGRAMS += cephfs-table-tool
 
+cephfs_data_scan_SOURCES = \
+	tools/cephfs/cephfs-data-scan.cc \
+	tools/cephfs/DataScan.cc \
+	tools/cephfs/MDSUtility.cc
+cephfs_data_scan_LDADD = $(LIBMDS) libcls_cephfs_client.la $(LIBRADOS) $(CEPH_GLOBAL)
+bin_PROGRAMS += cephfs-data-scan
+
 endif # WITH_RADOS
 endif # ENABLE_CLIENT
 endif # WITH_MDS
diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
index 6633a27..a6661eb 100644
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -22,21 +22,6 @@ ceph_psim_SOURCES = tools/psim.cc
 ceph_psim_LDADD = $(CEPH_GLOBAL)
 bin_DEBUGPROGRAMS += ceph_psim
 
-if WITH_REST_BENCH
-rest_bench_SOURCES = tools/rest_bench.cc
-rest_bench_SOURCES += common/obj_bencher.cc # needs cleanup so it can go in libcommon.la
-rest_bench_LDADD = $(CEPH_GLOBAL)
-bin_PROGRAMS += rest-bench
-
-if WITH_SYSTEM_LIBS3
-rest_bench_LDADD += -ls3
-else
-rest_bench_LDADD += libs3/build/lib/libs3.a -lcurl -lxml2
-rest_bench_CXXFLAGS = ${AM_CXXFLAGS} -I$(top_srcdir)/src/libs3/inc
-SUBDIRS += libs3
-endif # WITH_SYSTEM_LIBS3
-endif # WITH_REST_BENCH
-
 ceph_conf_SOURCES = tools/ceph_conf.cc
 ceph_conf_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
 bin_PROGRAMS += ceph-conf
@@ -54,4 +39,9 @@ noinst_HEADERS += \
 	tools/cephfs/Dumper.h \
 	tools/cephfs/TableTool.h \
 	tools/cephfs/MDSUtility.h \
-	tools/rados/rados_sync.h
+	tools/RadosDump.h \
+	tools/rados/RadosImport.h \
+	tools/ceph_objectstore_tool.h \
+	tools/rados/PoolDump.h \
+	tools/cephfs/DataScan.h
+
diff --git a/src/tools/RadosDump.cc b/src/tools/RadosDump.cc
new file mode 100644
index 0000000..542915e
--- /dev/null
+++ b/src/tools/RadosDump.cc
@@ -0,0 +1,168 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "RadosDump.h"
+
+int RadosDump::read_super()
+{
+  bufferlist ebl;
+  bufferlist::iterator ebliter = ebl.begin();
+  ssize_t bytes;
+
+  bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH);
+  if ((size_t)bytes != super_header::FIXED_LENGTH) {
+    cerr << "Unexpected EOF" << std::endl;
+    return -EFAULT;
+  }
+
+  sh.decode(ebliter);
+
+  return 0;
+}
+
+
+int RadosDump::get_header(header *h)
+{
+  assert (h != NULL);
+
+  bufferlist ebl;
+  bufferlist::iterator ebliter = ebl.begin();
+  ssize_t bytes;
+
+  bytes = ebl.read_fd(file_fd, sh.header_size);
+  if ((size_t)bytes != sh.header_size) {
+    cerr << "Unexpected EOF" << std::endl;
+    return -EFAULT;
+  }
+
+  h->decode(ebliter);
+
+  return 0;
+}
+
+int RadosDump::get_footer(footer *f)
+{
+  assert(f != NULL);
+
+  bufferlist ebl;
+  bufferlist::iterator ebliter = ebl.begin();
+  ssize_t bytes;
+
+  bytes = ebl.read_fd(file_fd, sh.footer_size);
+  if ((size_t)bytes != sh.footer_size) {
+    cerr << "Unexpected EOF" << std::endl;
+    return EFAULT;
+  }
+
+  f->decode(ebliter);
+
+  if (f->magic != endmagic) {
+    cerr << "Bad footer magic" << std::endl;
+    return -EFAULT;
+  }
+
+  return 0;
+}
+
+int RadosDump::read_section(sectiontype_t *type, bufferlist *bl)
+{
+  header hdr;
+  ssize_t bytes;
+
+  int ret = get_header(&hdr);
+  if (ret)
+    return ret;
+
+  *type = hdr.type;
+
+  bl->clear();
+  bytes = bl->read_fd(file_fd, hdr.size);
+  if (bytes != hdr.size) {
+    cerr << "Unexpected EOF" << std::endl;
+    return -EFAULT;
+  }
+
+  if (hdr.size > 0) {
+    footer ft;
+    ret = get_footer(&ft);
+    if (ret)
+      return ret;
+  }
+
+  return 0;
+}
+
+
+int RadosDump::skip_object(bufferlist &bl)
+{
+  bufferlist::iterator ebliter = bl.begin();
+  bufferlist ebl;
+  bool done = false;
+  while(!done) {
+    sectiontype_t type;
+    int ret = read_section(&type, &ebl);
+    if (ret)
+      return ret;
+
+    ebliter = ebl.begin();
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown object section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_DATA:
+    case TYPE_ATTRS:
+    case TYPE_OMAP_HDR:
+    case TYPE_OMAP:
+#ifdef DIAGNOSTIC
+      cerr << "Skip type " << (int)type << std::endl;
+#endif
+      break;
+    case TYPE_OBJECT_END:
+      done = true;
+      break;
+    default:
+      cerr << "Can't skip unknown type: " << type << std::endl;
+      return -EFAULT;
+    }
+  }
+  return 0;
+}
+
+//Write super_header with its fixed 16 byte length
+void RadosDump::write_super()
+{
+  if (dry_run) {
+    return;
+  }
+
+  bufferlist superbl;
+  super_header sh;
+  footer ft;
+
+  header hdr(TYPE_NONE, 0);
+  hdr.encode(superbl);
+
+  sh.magic = super_header::super_magic;
+  sh.version = super_header::super_ver;
+  sh.header_size = superbl.length();
+  superbl.clear();
+  ft.encode(superbl);
+  sh.footer_size = superbl.length();
+  superbl.clear();
+
+  sh.encode(superbl);
+  assert(super_header::FIXED_LENGTH == superbl.length());
+  superbl.write_fd(file_fd);
+}
diff --git a/src/tools/RadosDump.h b/src/tools/RadosDump.h
new file mode 100644
index 0000000..92c0eeb
--- /dev/null
+++ b/src/tools/RadosDump.h
@@ -0,0 +1,396 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef RADOS_DUMP_H_
+#define RADOS_DUMP_H_
+
+#include <stdint.h>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+
+#include "osd/osd_types.h"
+#include "osd/OSDMap.h"
+
+typedef uint8_t sectiontype_t;
+typedef uint32_t mymagic_t;
+typedef int64_t mysize_t;
+
+enum {
+    TYPE_NONE = 0,
+    TYPE_PG_BEGIN,
+    TYPE_PG_END,
+    TYPE_OBJECT_BEGIN,
+    TYPE_OBJECT_END,
+    TYPE_DATA,
+    TYPE_ATTRS,
+    TYPE_OMAP_HDR,
+    TYPE_OMAP,
+    TYPE_PG_METADATA,
+    TYPE_POOL_BEGIN,
+    TYPE_POOL_END,
+    END_OF_TYPES,	//Keep at the end
+};
+
+const uint16_t shortmagic = 0xffce;	//goes into stream as "ceff"
+//endmagic goes into stream as "ceff ffec"
+const mymagic_t endmagic = (0xecff << 16) | shortmagic;
+
+//The first FIXED_LENGTH bytes are a fixed
+//portion of the export output.  This includes the overall
+//version number, and size of header and footer.
+//THIS STRUCTURE CAN ONLY BE APPENDED TO.  If it needs to expand,
+//the version can be bumped and then anything
+//can be added to the export format.
+struct super_header {
+  static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
+  // ver = 1, Initial version
+  // ver = 2, Add OSDSuperblock to pg_begin
+  static const uint32_t super_ver = 2;
+  static const uint32_t FIXED_LENGTH = 16;
+  uint32_t magic;
+  uint32_t version;
+  uint32_t header_size;
+  uint32_t footer_size;
+
+  super_header() : magic(0), version(0), header_size(0), footer_size(0) { }
+
+  void encode(bufferlist& bl) const {
+    ::encode(magic, bl);
+    ::encode(version, bl);
+    ::encode(header_size, bl);
+    ::encode(footer_size, bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    ::decode(magic, bl);
+    ::decode(version, bl);
+    ::decode(header_size, bl);
+    ::decode(footer_size, bl);
+  }
+};
+
+struct header {
+  sectiontype_t type;
+  mysize_t size;
+  header(sectiontype_t type, mysize_t size) :
+    type(type), size(size) { }
+  header(): type(0), size(0) { }
+
+  void encode(bufferlist& bl) const {
+    uint32_t debug_type = (type << 24) | (type << 16) | shortmagic;
+    ENCODE_START(1, 1, bl);
+    ::encode(debug_type, bl);
+    ::encode(size, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    uint32_t debug_type;
+    DECODE_START(1, bl);
+    ::decode(debug_type, bl);
+    type = debug_type >> 24;
+    ::decode(size, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct footer {
+  mymagic_t magic;
+  footer() : magic(endmagic) { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(magic, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(magic, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct pg_begin {
+  spg_t pgid;
+  OSDSuperblock superblock;
+
+  pg_begin(spg_t pg, const OSDSuperblock& sb):
+    pgid(pg), superblock(sb) { }
+  pg_begin() { }
+
+  void encode(bufferlist& bl) const {
+    // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+    // shard will be NO_SHARD for a replicated pool.  This means
+    // that we allow the decode by struct_v 2.
+    ENCODE_START(3, 2, bl);
+    ::encode(pgid.pgid, bl);
+    ::encode(superblock, bl);
+    ::encode(pgid.shard, bl);
+    ENCODE_FINISH(bl);
+  }
+  // NOTE: New super_ver prevents decode from ver 1
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(3, bl);
+    ::decode(pgid.pgid, bl);
+    if (struct_v > 1) {
+      ::decode(superblock, bl);
+    }
+    if (struct_v > 2) {
+      ::decode(pgid.shard, bl);
+    } else {
+      pgid.shard = shard_id_t::NO_SHARD;
+    }
+    DECODE_FINISH(bl);
+  }
+};
+
+struct object_begin {
+  ghobject_t hoid;
+
+  // Duplicate what is in the OI_ATTR so we have it at the start
+  // of object processing.
+  object_info_t oi;
+
+  object_begin(const ghobject_t &hoid): hoid(hoid) { }
+  object_begin() { }
+
+  // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+  // generation will be NO_GEN, shard_id will be NO_SHARD for a replicated
+  // pool.  This means we will allow the decode by struct_v 1.
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    ::encode(hoid.hobj, bl);
+    ::encode(hoid.generation, bl);
+    ::encode(hoid.shard_id, bl);
+    ::encode(oi, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(3, bl);
+    ::decode(hoid.hobj, bl);
+    if (struct_v > 1) {
+      ::decode(hoid.generation, bl);
+      ::decode(hoid.shard_id, bl);
+    } else {
+      hoid.generation = ghobject_t::NO_GEN;
+      hoid.shard_id = shard_id_t::NO_SHARD;
+    }
+    if (struct_v > 2) {
+      ::decode(oi, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+
+struct data_section {
+  uint64_t offset;
+  uint64_t len;
+  bufferlist databl;
+  data_section(uint64_t offset, uint64_t len, bufferlist bl):
+     offset(offset), len(len), databl(bl) { }
+  data_section(): offset(0), len(0) { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(offset, bl);
+    ::encode(len, bl);
+    ::encode(databl, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(offset, bl);
+    ::decode(len, bl);
+    ::decode(databl, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct attr_section {
+  map<string,bufferlist> data;
+  attr_section(const map<string,bufferlist> &data) : data(data) { }
+  attr_section(map<string, bufferptr> &data_)
+  {
+    for (std::map<std::string, bufferptr>::iterator i = data_.begin();
+         i != data_.end(); ++i) {
+      bufferlist bl;
+      bl.push_front(i->second);
+      data[i->first] = bl;
+    }
+  }
+
+  attr_section() { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(data, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(data, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct omap_hdr_section {
+  bufferlist hdr;
+  omap_hdr_section(bufferlist hdr) : hdr(hdr) { }
+  omap_hdr_section() { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(hdr, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(hdr, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct omap_section {
+  map<string, bufferlist> omap;
+  omap_section(const map<string, bufferlist> &omap) :
+    omap(omap) { }
+  omap_section() { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(omap, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(1, bl);
+    ::decode(omap, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct metadata_section {
+  // struct_ver is the on-disk version of original pg
+  __u8 struct_ver;  // for reference
+  epoch_t map_epoch;
+  pg_info_t info;
+  pg_log_t log;
+  map<epoch_t,pg_interval_t> past_intervals;
+  OSDMap osdmap;
+  bufferlist osdmap_bl;  // Used in lieu of encoding osdmap due to crc checking
+  map<eversion_t, hobject_t> divergent_priors;
+
+  metadata_section(__u8 struct_ver, epoch_t map_epoch, const pg_info_t &info,
+		   const pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals,
+		   map<eversion_t, hobject_t> &divergent_priors)
+    : struct_ver(struct_ver),
+      map_epoch(map_epoch),
+      info(info),
+      log(log),
+      past_intervals(past_intervals),
+      divergent_priors(divergent_priors) { }
+  metadata_section()
+    : struct_ver(0),
+      map_epoch(0) { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(4, 1, bl);
+    ::encode(struct_ver, bl);
+    ::encode(map_epoch, bl);
+    ::encode(info, bl);
+    ::encode(log, bl);
+    ::encode(past_intervals, bl);
+    // Equivalent to osdmap.encode(bl, features); but
+    // preserving exact layout for CRC checking.
+    bl.append(osdmap_bl);
+    ::encode(divergent_priors, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    DECODE_START(4, bl);
+    ::decode(struct_ver, bl);
+    ::decode(map_epoch, bl);
+    ::decode(info, bl);
+    ::decode(log, bl);
+    if (struct_v > 1) {
+      ::decode(past_intervals, bl);
+    } else {
+      cout << "NOTICE: Older export without past_intervals" << std::endl;
+    }
+    if (struct_v > 2) {
+      osdmap.decode(bl);
+    } else {
+      cout << "WARNING: Older export without OSDMap information" << std::endl;
+    }
+    if (struct_v > 3) {
+      ::decode(divergent_priors, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+
+/**
+ * Superclass for classes that will need to handle a serialized RADOS
+ * dump.  Requires that the serialized dump be opened with a known FD.
+ */
+class RadosDump
+{
+  protected:
+    int file_fd;
+    super_header sh;
+    bool dry_run;
+
+  public:
+    RadosDump(int file_fd_, bool dry_run_)
+      : file_fd(file_fd_), dry_run(dry_run_)
+    {}
+
+    int read_super();
+    int get_header(header *h);
+    int get_footer(footer *f);
+    int read_section(sectiontype_t *type, bufferlist *bl);
+    int skip_object(bufferlist &bl);
+    void write_super();
+
+    // Define this in .h because it's templated
+    template <typename T>
+      int write_section(sectiontype_t type, const T& obj, int fd) {
+        if (dry_run)
+          return 0;
+        bufferlist blhdr, bl, blftr;
+        obj.encode(bl);
+        header hdr(type, bl.length());
+        hdr.encode(blhdr);
+        footer ft;
+        ft.encode(blftr);
+
+        int ret = blhdr.write_fd(fd);
+        if (ret) return ret;
+        ret = bl.write_fd(fd);
+        if (ret) return ret;
+        ret = blftr.write_fd(fd);
+        return ret;
+      }
+
+    int write_simple(sectiontype_t type, int fd)
+    {
+      if (dry_run)
+        return 0;
+      bufferlist hbl;
+
+      header hdr(type, 0);
+      hdr.encode(hbl);
+      return hbl.write_fd(fd);
+    }
+};
+
+#endif
diff --git a/src/tools/ceph-client-debug.cc b/src/tools/ceph-client-debug.cc
index 2ed9332..a84cadc 100644
--- a/src/tools/ceph-client-debug.cc
+++ b/src/tools/ceph-client-debug.cc
@@ -163,7 +163,7 @@ int main(int argc, const char **argv)
   // Release Inode references
   ceph_ll_forget(client, ino, 1);
   for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
-    ceph_ll_forget(client, (*p)->inode, 1);
+    ceph_ll_forget(client, (*p)->inode.get(), 1);
   }
   ino = NULL;
   path.clear();  
diff --git a/src/tools/ceph-monstore-update-crush.sh b/src/tools/ceph-monstore-update-crush.sh
new file mode 100755
index 0000000..dc6a6d7
--- /dev/null
+++ b/src/tools/ceph-monstore-update-crush.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact at redhat.com>
+#
+# Author: Kefu Chai <kchai at redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+verbose=
+
+test -d ../src && export PATH=$PATH:.
+
+if type xmlstarlet > /dev/null 2>&1; then
+    XMLSTARLET=xmlstarlet
+elif type xml > /dev/null 2>&1; then
+    XMLSTARLET=xml
+else
+    echo "Missing xmlstarlet binary!"
+    exit 1
+fi
+
+function osdmap_get() {
+    local store_path=$1
+    local query=$2
+    local epoch=${3:+-v $3}
+    local osdmap=`mktemp`
+
+    ceph-monstore-tool $store_path get osdmap -- \
+                       $epoch -o $osdmap > /dev/null || return
+
+    echo $(osdmaptool --dump xml $osdmap 2> /dev/null | \
+           $XMLSTARLET sel -t -m "$query" -v .)
+
+    rm -f $osdmap
+}
+
+function test_crush() {
+    local store_path=$1
+    local epoch=$2
+    local max_osd=$3
+    local crush=$4
+    local osdmap=`mktemp`
+
+    ceph-monstore-tool $store_path get osdmap -- \
+                       -v $epoch -o $osdmap > /dev/null
+    osdmaptool --export-crush $crush $osdmap &> /dev/null
+
+    if crushtool --check $max_osd -i $crush > /dev/null; then
+        good=true
+    else
+        good=false
+    fi
+    rm -f $osdmap
+    $good || return 1
+}
+
+function get_crush()  {
+    local store_path=$1
+    local osdmap_epoch=$2
+    local osdmap_path=`mktemp`
+    local crush_path=`mktemp`
+
+    ceph-monstore-tool $store_path get osdmap -- \
+                       -v $osdmap_epoch -o $osdmap_path
+    osdmaptool --export-crush $crush_path $osdmap_path > /dev/null 2>&1
+}
+
+function die() {
+    local retval=$?
+    echo "$@" >&2
+    exit $retval
+}
+
+function usage() {
+    [ $# -gt 0 ] && echo -e "\n$@"
+    cat <<EOF
+
+Usage: $0 [options ...] <mon-store>
+
+Search backward for a latest known-good epoch in monstore. Rewrite the osdmap
+epochs after it with the crush map in the found epoch if asked to do so. By
+default, print out the crush map in the good epoch.
+
+  [-h|--help]            display this message
+  [--out]                write the found crush map to given file (default: stdout)
+  [--rewrite]            rewrite the monitor storage with the found crush map
+  [--verbose]            be more chatty
+EOF
+    [ $# -gt 0 ] && exit 1
+    exit 0
+}
+
+function main() {
+    local temp
+    temp=$(getopt -o h --long verbose,help,mon-store:,out:,rewrite -n $0 -- "$@") || return 1
+
+    eval set -- "$temp"
+    local rewrite
+    while [ "$1" != "--" ]; do
+        case "$1" in
+            --verbose)
+                verbose=true
+                # set -xe
+                # PS4='${FUNCNAME[0]}: $LINENO: '
+                shift;;
+            -h|--help)
+                usage
+                return 0;;
+            --out)
+                output=$2
+                shift 2;;
+            --osdmap-epoch)
+                osdmap_epoch=$2
+                shift 2;;
+            --rewrite)
+                rewrite=true
+                shift;;
+            *)
+                usage "unexpected argument $1"
+                shift;;
+        esac
+    done
+    shift
+
+    local store_path="$1"
+    test $store_path || usage "I need the path to mon-store."
+
+    # try accessing the store; if it fails, likely means a mon is running
+    local last_osdmap_epoch
+    local max_osd
+    last_osdmap_epoch=$(osdmap_get $store_path "/osdmap/epoch") || \
+        die "error accessing mon store at $store_path"
+    # get the max_osd # in last osdmap epoch, crushtool will use it to check
+    # the crush maps in previous osdmaps
+    max_osd=$(osdmap_get $store_path "/osdmap/max_osd" $last_osdmap_epoch)
+
+    local good_crush
+    local good_epoch
+    test $verbose && echo "the latest osdmap epoch is $last_osdmap_epoch"
+    for epoch in `seq $last_osdmap_epoch -1 1`; do
+        local crush_path=`mktemp`
+        test $verbose && echo "checking crush map #$epoch"
+        if test_crush $store_path $epoch $max_osd $crush_path; then
+            test $verbose && echo "crush map version #$epoch works with osdmap epoch #$last_osdmap_epoch"
+            good_epoch=$epoch
+            good_crush=$crush_path
+            break
+        fi
+        rm -f $crush_path
+    done
+
+    if test $good_epoch; then
+        echo "good crush map found at epoch $epoch/$last_osdmap_epoch"
+    else
+        echo "Unable to find a crush map for osdmap version #$last_osdmap_epoch." >&2
+        return 1
+    fi
+
+    if test $good_epoch -eq $last_osdmap_epoch; then
+        echo "and mon store has no faulty crush maps."
+    elif test $output; then
+        crushtool --decompile $good_crush --outfn $output
+    elif test $rewrite; then
+        ceph-monstore-tool $store_path rewrite-crush --  \
+                           --crush $good_crush      \
+                           --good-epoch $good_epoch
+    else
+        echo
+        crushtool --decompile $good_crush
+    fi
+    rm -f $good_crush
+}
+
+main "$@"
diff --git a/src/tools/ceph_authtool.cc b/src/tools/ceph_authtool.cc
index fd3f73f..caf1f0a 100644
--- a/src/tools/ceph_authtool.cc
+++ b/src/tools/ceph_authtool.cc
@@ -264,6 +264,7 @@ int main(int argc, const char **argv)
         cout << key << std::endl;
       } else {
         cerr << "entity " << ename << " not found" << std::endl;
+        exit(1);
       }
     }
   
@@ -274,6 +275,7 @@ int main(int argc, const char **argv)
       r = bl.write_file(fn.c_str(), 0600);
       if (r < 0) {
         cerr << "could not write " << fn << std::endl;
+        exit(1);
       }
       //cout << "wrote " << bl.length() << " bytes to " << fn << std::endl;
     }
diff --git a/src/tools/ceph_conf.cc b/src/tools/ceph_conf.cc
index 30e40ab..0d0b85c 100644
--- a/src/tools/ceph_conf.cc
+++ b/src/tools/ceph_conf.cc
@@ -121,7 +121,8 @@ static int lookup(const std::deque<std::string> &sections,
   else if (ret == 0) {
     if (resolve_search) {
       string result;
-      if (ceph_resolve_file_search(val, result))
+      ret = ceph_resolve_file_search(val, result);
+      if (!ret)
 	puts(result.c_str());
     }
     else {
diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc
index f6a48bf..a234213 100644
--- a/src/tools/ceph_kvstore_tool.cc
+++ b/src/tools/ceph_kvstore_tool.cc
@@ -23,7 +23,7 @@
 
 #include "global/global_init.h"
 #include "include/stringify.h"
-#include "os/LevelDBStore.h"
+#include "os/KeyValueDB.h"
 
 using namespace std;
 
@@ -33,8 +33,8 @@ class StoreTool
   string store_path;
 
   public:
-  StoreTool(const string &path) : store_path(path) {
-    LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, store_path);
+  StoreTool(string type, const string &path) : store_path(path) {
+    KeyValueDB *db_ptr = KeyValueDB::create(g_ceph_context, type, path);
     assert(!db_ptr->open(std::cerr));
     db.reset(db_ptr);
   }
@@ -139,7 +139,8 @@ class StoreTool
     return (ret == 0);
   }
 
-  int copy_store_to(const string &other_path, const int num_keys_per_tx) {
+  int copy_store_to(string type, const string &other_path,
+		    const int num_keys_per_tx) {
 
     if (num_keys_per_tx <= 0) {
       std::cerr << "must specify a number of keys/tx > 0" << std::endl;
@@ -147,8 +148,8 @@ class StoreTool
     }
 
     // open or create a leveldb store at @p other_path
-    LevelDBStore other(g_ceph_context, other_path);
-    int err = other.create_and_open(std::cerr);
+    KeyValueDB *other = KeyValueDB::create(g_ceph_context, type, other_path);
+    int err = other->create_and_open(std::cerr);
     if (err < 0)
       return err;
 
@@ -163,7 +164,7 @@ class StoreTool
     do {
       int num_keys = 0;
 
-      KeyValueDB::Transaction tx = other.get_transaction();
+      KeyValueDB::Transaction tx = other->get_transaction();
 
 
       while (it->valid() && num_keys < num_keys_per_tx) {
@@ -181,7 +182,7 @@ class StoreTool
       total_keys += num_keys;
 
       if (num_keys > 0)
-        other.submit_transaction_sync(tx);
+        other->submit_transaction_sync(tx);
 
       utime_t cur_duration = ceph_clock_now(g_ceph_context) - started_at;
       std::cout << "ts = " << cur_duration << "s, copied " << total_keys
@@ -206,7 +207,7 @@ class StoreTool
 
 void usage(const char *pname)
 {
-  std::cerr << "Usage: " << pname << " <store path> command [args...]\n"
+  std::cerr << "Usage: " << pname << " <leveldb|rocksdb|...> <store path> command [args...]\n"
     << "\n"
     << "Commands:\n"
     << "  list [prefix]\n"
@@ -233,20 +234,21 @@ int main(int argc, const char *argv[])
   common_init_finish(g_ceph_context);
 
 
-  if (args.size() < 2) {
+  if (args.size() < 3) {
     usage(argv[0]);
     return 1;
   }
 
-  string path(args[0]);
-  string cmd(args[1]);
+  string type(args[0]);
+  string path(args[1]);
+  string cmd(args[2]);
 
-  StoreTool st(path);
+  StoreTool st(type, path);
 
   if (cmd == "list" || cmd == "list-crc") {
     string prefix;
-    if (argc > 3)
-      prefix = argv[3];
+    if (argc > 4)
+      prefix = argv[4];
 
     bool do_crc = (cmd == "list-crc");
 
@@ -254,13 +256,13 @@ int main(int argc, const char *argv[])
 
   } else if (cmd == "exists") {
     string key;
-    if (argc < 4) {
+    if (argc < 5) {
       usage(argv[0]);
       return 1;
     }
-    string prefix(argv[3]);
-    if (argc > 4)
-      key = argv[4];
+    string prefix(argv[4]);
+    if (argc > 5)
+      key = argv[5];
 
     bool ret = st.exists(prefix, key);
     std::cout << "(" << prefix << ", " << key << ") "
@@ -269,12 +271,12 @@ int main(int argc, const char *argv[])
     return (ret ? 0 : 1);
 
   } else if (cmd == "get") {
-    if (argc < 5) {
+    if (argc < 6) {
       usage(argv[0]);
       return 1;
     }
-    string prefix(argv[3]);
-    string key(argv[4]);
+    string prefix(argv[4]);
+    string key(argv[5]);
 
     bool exists = false;
     bufferlist bl = st.get(prefix, key, exists);
@@ -285,9 +287,9 @@ int main(int argc, const char *argv[])
     }
     std::cout << std::endl;
 
-    if (argc >= 6) {
-      string subcmd(argv[5]);
-      string out(argv[6]);
+    if (argc >= 7) {
+      string subcmd(argv[6]);
+      string out(argv[7]);
 
       if (subcmd != "out") {
         std::cerr << "unrecognized subcmd '" << subcmd << "'"
@@ -300,7 +302,7 @@ int main(int argc, const char *argv[])
         return 1;
       }
 
-      int err = bl.write_file(argv[6], 0644);
+      int err = bl.write_file(argv[7], 0644);
       if (err < 0) {
         std::cerr << "error writing value to '" << out << "': "
                   << cpp_strerror(err) << std::endl;
@@ -313,12 +315,12 @@ int main(int argc, const char *argv[])
     }
 
   } else if (cmd == "crc") {
-    if (argc < 5) {
+    if (argc < 6) {
       usage(argv[0]);
       return 1;
     }
-    string prefix(argv[3]);
-    string key(argv[4]);
+    string prefix(argv[4]);
+    string key(argv[5]);
 
     bool exists = false;
     bufferlist bl = st.get(prefix, key, exists);
@@ -332,15 +334,15 @@ int main(int argc, const char *argv[])
   } else if (cmd == "get-size") {
     std::cout << "estimated store size: " << st.get_size() << std::endl;
 
-    if (argc < 4)
+    if (argc < 5)
       return 0;
 
-    if (argc < 5) {
+    if (argc < 6) {
       usage(argv[0]);
       return 1;
     }
-    string prefix(argv[3]);
-    string key(argv[4]);
+    string prefix(argv[4]);
+    string key(argv[5]);
 
     bool exists = false;
     bufferlist bl = st.get(prefix, key, exists);
@@ -353,25 +355,25 @@ int main(int argc, const char *argv[])
               << ") size " << si_t(bl.length()) << std::endl;
 
   } else if (cmd == "set") {
-    if (argc < 7) {
+    if (argc < 8) {
       usage(argv[0]);
       return 1;
     }
-    string prefix(argv[3]);
-    string key(argv[4]);
-    string subcmd(argv[5]);
+    string prefix(argv[4]);
+    string key(argv[5]);
+    string subcmd(argv[6]);
 
     bufferlist val;
     string errstr;
     if (subcmd == "ver") {
-      version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr);
+      version_t v = (version_t) strict_strtoll(argv[7], 10, &errstr);
       if (!errstr.empty()) {
         std::cerr << "error reading version: " << errstr << std::endl;
         return 1;
       }
       ::encode(v, val);
     } else if (subcmd == "in") {
-      int ret = val.read_file(argv[6], &errstr);
+      int ret = val.read_file(argv[7], &errstr);
       if (ret < 0 || !errstr.empty()) {
         std::cerr << "error reading file: " << errstr << std::endl;
         return 1;
@@ -390,21 +392,21 @@ int main(int argc, const char *argv[])
     }
   } else if (cmd == "store-copy") {
     int num_keys_per_tx = 128; // magic number that just feels right.
-    if (argc < 4) {
+    if (argc < 5) {
       usage(argv[0]);
       return 1;
-    } else if (argc > 4) {
+    } else if (argc > 5) {
       string err;
-      num_keys_per_tx = strict_strtol(argv[4], 10, &err);
+      num_keys_per_tx = strict_strtol(argv[5], 10, &err);
       if (!err.empty()) {
         std::cerr << "invalid num_keys_per_tx: " << err << std::endl;
         return 1;
       }
     }
 
-    int ret = st.copy_store_to(argv[3], num_keys_per_tx);
+    int ret = st.copy_store_to(argv[1], argv[4], num_keys_per_tx);
     if (ret < 0) {
-      std::cerr << "error copying store to path '" << argv[3]
+      std::cerr << "error copying store to path '" << argv[4]
                 << "': " << cpp_strerror(ret) << std::endl;
       return 1;
     }
diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc
index 744000e..c979bcd 100644
--- a/src/tools/ceph_monstore_tool.cc
+++ b/src/tools/ceph_monstore_tool.cc
@@ -12,6 +12,7 @@
 */
 #include <boost/program_options/variables_map.hpp>
 #include <boost/program_options/parsers.hpp>
+#include <boost/scope_exit.hpp>
 
 #include <stdlib.h>
 #include <string>
@@ -164,6 +165,7 @@ int parse_cmd_args(
  *  dump-trace < --trace-file arg >
  *  replay-trace
  *  random-gen
+ *  rewrite-crush
  *
  * wanted syntax:
  *
@@ -202,6 +204,8 @@ void usage(const char *n, po::options_description &d)
   << "                                  (replay-trace -- --help for more info)\n"
   << "  random-gen [-- options]         add randomly generated ops to the store\n"
   << "                                  (random-gen -- --help for more info)\n"
+  << "  rewrite-crush [-- options]      add a rewrite commit to the store\n"
+  << "                                  (rewrite-crush -- --help for more info)\n"
   << std::endl;
   std::cerr << d << std::endl;
   std::cerr
@@ -213,6 +217,223 @@ void usage(const char *n, po::options_description &d)
     << std::endl;
 }
 
+int update_osdmap(MonitorDBStore& store, version_t ver, bool copy,
+		  ceph::shared_ptr<CrushWrapper> crush,
+		  MonitorDBStore::Transaction* t) {
+  const string prefix("osdmap");
+
+  // full
+  bufferlist bl;
+  int r = 0;
+  r = store.get(prefix, store.combine_strings("full", ver), bl);
+  if (r) {
+    std::cerr << "Error getting full map: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  OSDMap osdmap;
+  osdmap.decode(bl);
+  osdmap.crush = crush;
+  if (copy) {
+    osdmap.inc_epoch();
+  }
+  bl.clear();
+  // be consistent with OSDMonitor::update_from_paxos()
+  osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+  t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl);
+
+  // incremental
+  OSDMap::Incremental inc;
+  if (copy) {
+    inc.epoch = osdmap.get_epoch();
+    inc.fsid = osdmap.get_fsid();
+  } else {
+    bl.clear();
+    r = store.get(prefix, ver, bl);
+    if (r) {
+      std::cerr << "Error getting inc map: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    { bufferlist::iterator p = bl.begin(); inc.decode(p); }
+    if (inc.crush.length()) {
+      inc.crush.clear();
+      crush->encode(inc.crush);
+    }
+    if (inc.fullmap.length()) {
+      OSDMap fullmap;
+      fullmap.decode(inc.fullmap);
+      fullmap.crush = crush;
+      inc.fullmap.clear();
+      fullmap.encode(inc.fullmap);
+    }
+  }
+  assert(osdmap.have_crc());
+  inc.full_crc = osdmap.get_crc();
+  bl.clear();
+  // be consistent with OSDMonitor::update_from_paxos()
+  inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+  t->put(prefix, inc.epoch, bl);
+  return 0;
+}
+
+int rewrite_transaction(MonitorDBStore& store, int version,
+			const string& crush_file,
+			MonitorDBStore::Transaction* t) {
+  const string prefix("osdmap");
+
+  // calc the known-good epoch
+  version_t last_committed = store.get(prefix, "last_committed");
+  version_t good_version = 0;
+  if (version <= 0) {
+    if (last_committed >= (unsigned)-version) {
+      good_version = last_committed + version;
+    } else {
+      std::cerr << "osdmap-version is less than: -" << last_committed << std::endl;
+      return EINVAL;
+    }
+  } else {
+    good_version = version;
+  }
+  if (good_version >= last_committed) {
+    std::cout << "good epoch is greater or equal to the last committed one: "
+	      << good_version << " >= " << last_committed << std::endl;
+    return 0;
+  }
+
+  // load/extract the crush map
+  int r = 0;
+  ceph::shared_ptr<CrushWrapper> crush(new CrushWrapper);
+  if (crush_file.empty()) {
+    bufferlist bl;
+    r = store.get(prefix, store.combine_strings("full", good_version), bl);
+    if (r) {
+      std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    OSDMap osdmap;
+    osdmap.decode(bl);
+    crush = osdmap.crush;
+  } else {
+    string err;
+    bufferlist bl;
+    r = bl.read_file(crush_file.c_str(), &err);
+    if (r) {
+      std::cerr << err << ": " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    bufferlist::iterator p = bl.begin();
+    crush->decode(p);
+  }
+
+  // prepare a transaction to rewrite the epochs
+  // (good_version, last_committed]
+  // with the good crush map.
+  // XXX: may need to break this into several paxos versions?
+  assert(good_version < last_committed);
+  for (version_t v = good_version + 1; v <= last_committed; v++) {
+    cout << "rewriting epoch #" << v << "/" << last_committed << std::endl;
+    r = update_osdmap(store, v, false, crush, t);
+    if (r)
+      return r;
+  }
+
+  // add a new osdmap epoch to store, so monitors will update their current osdmap
+  // in addition to the ones stored in epochs.
+  //
+  // This is needed due to the way the monitor updates from paxos and the
+  // facilities we are leveraging to push this update to the rest of the
+  // quorum.
+  //
+  // In a nutshell, we are generating a good version of the osdmap, with a
+  // proper crush, and building a transaction that will replace the bad
+  // osdmaps with good osdmaps. But this transaction needs to be applied on
+  // all nodes, so that the monitors will have good osdmaps to share with
+  // clients. We thus leverage Paxos, specifically the recovery mechanism, by
+  // creating a pending value that will be committed once the monitors form an
+  // initial quorum after being brought back to life.
+  //
+  // However, the way the monitor works has the paxos services, including the
+  // OSDMonitor, updating their state from disk *prior* to the recovery phase
+  // begins (so they have an up to date state in memory). This means the
+  // OSDMonitor will see the old, broken map, before the new paxos version is
+  // applied to disk, and the old version is cached. Even though we have the
+  // good map now, and we share the good map with clients, we will still be
+  // working on the old broken map. Instead of mucking around the monitor to
+  // make this work, we instead opt for adding the same osdmap but with a
+  // newer version, so that the OSDMonitor picks up on it when it updates from
+  // paxos after the proposal has been committed. This is not elegant, but
+  // avoids further unpleasantness that would arise from kludging around the
+  // current behavior. Also, has the added benefit of making sure the clients
+  // get an updated version of the map (because last_committed+1 >
+  // last_committed) :)
+  //
+  cout << "adding a new epoch #" << last_committed+1 << std::endl;
+  r = update_osdmap(store, last_committed++, true, crush, t);
+  if (r)
+    return r;
+  t->put(prefix, store.combine_strings("full", "latest"), last_committed);
+  t->put(prefix, "last_committed", last_committed);
+  return 0;
+}
+
+/**
+ * create a new paxos version which carries a proposal to rewrite all epochs
+ * of incremental and full map of "osdmap" after a faulty crush map is injected.
+ * so the leader will trigger a recovery and propagate this fix to its peons,
+ * after the proposal is accepted, and the transaction in it is applied. all
+ * monitors will rewrite the bad crush map with the good one, and have a new
+ * osdmap epoch with the good crush map in it.
+ */
+int rewrite_crush(const char* progname,
+		  vector<string>& subcmds,
+		  MonitorDBStore& store) {
+  po::options_description op_desc("Allowed 'rewrite-crush' options");
+  int version = -1;
+  string crush_file;
+  op_desc.add_options()
+    ("help,h", "produce this help message")
+    ("crush", po::value<string>(&crush_file),
+     ("path to the crush map file "
+      "(default: will instead extract it from the known-good osdmap)"))
+    ("good-epoch", po::value<int>(&version),
+     "known-good epoch of osdmap, if a negative number '-N' is given, the "
+     "$last_committed-N is used instead (default: -1). "
+     "Please note, -1 is not necessarily a good epoch, because there are "
+     "good chance that we have more epochs slipped into the monstore after "
+     "the one where the crushmap is firstly injected.")
+    ;
+  po::variables_map op_vm;
+  int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm);
+  if (r) {
+    return -r;
+  }
+  if (op_vm.count("help")) {
+    usage(progname, op_desc);
+    return 0;
+  }
+
+  MonitorDBStore::Transaction rewrite_txn;
+  r = rewrite_transaction(store, version, crush_file, &rewrite_txn);
+  if (r) {
+    return r;
+  }
+
+  // store the transaction into store as a proposal
+  const string prefix("paxos");
+  version_t pending_v = store.get(prefix, "last_committed") + 1;
+  MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
+  bufferlist bl;
+  rewrite_txn.encode(bl);
+  cout << "adding pending commit " << pending_v
+       << " " << bl.length() << " bytes" << std::endl;
+  t->put(prefix, pending_v, bl);
+  t->put(prefix, "pending_v", pending_v);
+  // a large enough yet unique proposal number will probably do the trick
+  version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1;
+  t->put(prefix, "pending_pn", pending_pn);
+  store.apply_transaction(t);
+  return 0;
+}
+
 int main(int argc, char **argv) {
   int err = 0;
   po::options_description desc("Allowed options");
@@ -380,6 +601,13 @@ int main(int argc, char **argv) {
       }
     }
 
+    BOOST_SCOPE_EXIT((&r) (&fd) (&outpath)) {
+      ::close(fd);
+      if (r < 0 && fd != STDOUT_FILENO) {
+        ::remove(outpath.c_str());
+      }
+    } BOOST_SCOPE_EXIT_END
+
     bufferlist bl;
     r = 0;
     if (map_type == "osdmap") {
@@ -390,7 +618,6 @@ int main(int argc, char **argv) {
     if (r < 0) {
       std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
       err = EINVAL;
-      ::close(fd);
       goto done;
     }
     bl.write_fd(fd);
@@ -686,12 +913,14 @@ int main(int argc, char **argv) {
                 << stringify(si_t(total_size)) << ")" << std::endl;
 
     } while (it->valid());
-
+    out_store.close();
     std::cout << "summary: copied " << total_keys << " keys, using "
               << total_tx << " transactions, totalling "
               << stringify(si_t(total_size)) << std::endl;
     std::cout << "from '" << store_path << "' to '" << out_path << "'"
               << std::endl;
+  } else if (cmd == "rewrite-crush") {
+    err = rewrite_crush(argv[0], subcmds, st);
   } else {
     std::cerr << "Unrecognized command: " << cmd << std::endl;
     usage(argv[0], desc);
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
index 9e68946..b978a6c 100644
--- a/src/tools/ceph_objectstore_tool.cc
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -26,6 +26,7 @@
 
 #include "os/ObjectStore.h"
 #include "os/FileStore.h"
+#include "os/FileJournal.h"
 
 #include "osd/PGLog.h"
 #include "osd/OSD.h"
@@ -34,31 +35,12 @@
 #include "json_spirit/json_spirit_value.h"
 #include "json_spirit/json_spirit_reader.h"
 
-#include "include/rados/librados.hpp"
+#include "ceph_objectstore_tool.h"
+#include "include/compat.h"
 
 namespace po = boost::program_options;
 using namespace std;
 
-static coll_t META_COLL("meta");
-
-enum {
-    TYPE_NONE = 0,
-    TYPE_PG_BEGIN,
-    TYPE_PG_END,
-    TYPE_OBJECT_BEGIN,
-    TYPE_OBJECT_END,
-    TYPE_DATA,
-    TYPE_ATTRS,
-    TYPE_OMAP_HDR,
-    TYPE_OMAP,
-    TYPE_PG_METADATA,
-    END_OF_TYPES,	//Keep at the end
-};
-
-//#define INTERNAL_TEST
-//#define INTERNAL_TEST2
-//#define INTERNAL_TEST3
-
 #ifdef INTERNAL_TEST
 CompatSet get_test_compat_set() {
   CompatSet::FeatureSet ceph_osd_feature_compat;
@@ -82,294 +64,10 @@ CompatSet get_test_compat_set() {
 }
 #endif
 
-typedef uint8_t sectiontype_t;
-typedef uint32_t mymagic_t;
-typedef int64_t mysize_t;
 const ssize_t max_read = 1024 * 1024;
-const uint16_t shortmagic = 0xffce;	//goes into stream as "ceff"
-//endmagic goes into stream as "ceff ffec"
-const mymagic_t endmagic = (0xecff << 16) | shortmagic;
 const int fd_none = INT_MIN;
 bool outistty;
-
-//The first FIXED_LENGTH bytes are a fixed
-//portion of the export output.  This includes the overall
-//version number, and size of header and footer.
-//THIS STRUCTURE CAN ONLY BE APPENDED TO.  If it needs to expand,
-//the version can be bumped and then anything
-//can be added to the export format.
-struct super_header {
-  static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
-  // ver = 1, Initial version
-  // ver = 2, Add OSDSuperblock to pg_begin
-  static const uint32_t super_ver = 2;
-  static const uint32_t FIXED_LENGTH = 16;
-  uint32_t magic;
-  uint32_t version;
-  uint32_t header_size;
-  uint32_t footer_size;
-
-  super_header() : magic(0), version(0), header_size(0), footer_size(0) { }
-  int read_super();
-
-  void encode(bufferlist& bl) const {
-    ::encode(magic, bl);
-    ::encode(version, bl);
-    ::encode(header_size, bl);
-    ::encode(footer_size, bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    ::decode(magic, bl);
-    ::decode(version, bl);
-    ::decode(header_size, bl);
-    ::decode(footer_size, bl);
-  }
-};
-
-struct header {
-  sectiontype_t type;
-  mysize_t size;
-  header(sectiontype_t type, mysize_t size) :
-    type(type), size(size) { }
-  header(): type(0), size(0) { }
-
-  int get_header();
-
-  void encode(bufferlist& bl) const {
-    uint32_t debug_type = (type << 24) | (type << 16) | shortmagic;
-    ENCODE_START(1, 1, bl);
-    ::encode(debug_type, bl);
-    ::encode(size, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    uint32_t debug_type;
-    DECODE_START(1, bl);
-    ::decode(debug_type, bl);
-    type = debug_type >> 24;
-    ::decode(size, bl);
-    DECODE_FINISH(bl);
-  }
-};
-
-struct footer {
-  mymagic_t magic;
-  footer() : magic(endmagic) { }
-
-  int get_footer();
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    ::encode(magic, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
-    ::decode(magic, bl);
-    DECODE_FINISH(bl);
-  }
-};
-
-struct pg_begin {
-  spg_t pgid;
-  OSDSuperblock superblock;
-
-  pg_begin(spg_t pg, const OSDSuperblock& sb):
-    pgid(pg), superblock(sb) { }
-  pg_begin() { }
-
-  void encode(bufferlist& bl) const {
-    // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
-    // shard will be NO_SHARD for a replicated pool.  This means
-    // that we allow the decode by struct_v 2.
-    ENCODE_START(3, 2, bl);
-    ::encode(pgid.pgid, bl);
-    ::encode(superblock, bl);
-    ::encode(pgid.shard, bl);
-    ENCODE_FINISH(bl);
-  }
-  // NOTE: New super_ver prevents decode from ver 1
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(3, bl);
-    ::decode(pgid.pgid, bl);
-    if (struct_v > 1) {
-      ::decode(superblock, bl);
-    }
-    if (struct_v > 2) {
-      ::decode(pgid.shard, bl);
-    } else {
-      pgid.shard = shard_id_t::NO_SHARD;
-    }
-    DECODE_FINISH(bl);
-  }
-};
-
-struct object_begin {
-  ghobject_t hoid;
-
-  // Duplicate what is in the OI_ATTR so we have it at the start
-  // of object processing.
-  object_info_t oi;
-
-  object_begin(const ghobject_t &hoid): hoid(hoid) { }
-  object_begin() { }
-
-  // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
-  // generation will be NO_GEN, shard_id will be NO_SHARD for a replicated
-  // pool.  This means we will allow the decode by struct_v 1.
-  void encode(bufferlist& bl) const {
-    ENCODE_START(3, 1, bl);
-    ::encode(hoid.hobj, bl);
-    ::encode(hoid.generation, bl);
-    ::encode(hoid.shard_id, bl);
-    ::encode(oi, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(3, bl);
-    ::decode(hoid.hobj, bl);
-    if (struct_v > 1) {
-      ::decode(hoid.generation, bl);
-      ::decode(hoid.shard_id, bl);
-    } else {
-      hoid.generation = ghobject_t::NO_GEN;
-      hoid.shard_id = shard_id_t::NO_SHARD;
-    }
-    if (struct_v > 2) {
-      ::decode(oi, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-};
-
-struct data_section {
-  uint64_t offset;
-  uint64_t len;
-  bufferlist databl;
-  data_section(uint64_t offset, uint64_t len, bufferlist bl):
-     offset(offset), len(len), databl(bl) { }
-  data_section(): offset(0), len(0) { }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    ::encode(offset, bl);
-    ::encode(len, bl);
-    ::encode(databl, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
-    ::decode(offset, bl);
-    ::decode(len, bl);
-    ::decode(databl, bl);
-    DECODE_FINISH(bl);
-  }
-};
-
-struct attr_section {
-  map<string,bufferptr> data;
-  attr_section(const map<string,bufferptr> &data) : data(data) { }
-  attr_section() { }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    ::encode(data, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
-    ::decode(data, bl);
-    DECODE_FINISH(bl);
-  }
-};
-
-struct omap_hdr_section {
-  bufferlist hdr;
-  omap_hdr_section(bufferlist hdr) : hdr(hdr) { }
-  omap_hdr_section() { }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    ::encode(hdr, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
-    ::decode(hdr, bl);
-    DECODE_FINISH(bl);
-  }
-};
-
-struct omap_section {
-  map<string, bufferlist> omap;
-  omap_section(const map<string, bufferlist> &omap) :
-    omap(omap) { }
-  omap_section() { }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    ::encode(omap, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
-    ::decode(omap, bl);
-    DECODE_FINISH(bl);
-  }
-};
-
-struct metadata_section {
-  // struct_ver is the on-disk version of original pg
-  __u8 struct_ver;  // for reference
-  epoch_t map_epoch;
-  pg_info_t info;
-  pg_log_t log;
-  map<epoch_t,pg_interval_t> past_intervals;
-  OSDMap osdmap;
-  bufferlist osdmap_bl;  // Used in lieu of encoding osdmap due to crc checking
-
-  metadata_section(__u8 struct_ver, epoch_t map_epoch, const pg_info_t &info,
-		   const pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals)
-    : struct_ver(struct_ver),
-      map_epoch(map_epoch),
-      info(info),
-      log(log),
-      past_intervals(past_intervals) { }
-  metadata_section()
-    : struct_ver(0),
-      map_epoch(0) { }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(3, 1, bl);
-    ::encode(struct_ver, bl);
-    ::encode(map_epoch, bl);
-    ::encode(info, bl);
-    ::encode(log, bl);
-    ::encode(past_intervals, bl);
-    // Equivalent to osdmap.encode(bl, features); but
-    // preserving exact layout for CRC checking.
-    bl.append(osdmap_bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::iterator& bl) {
-    DECODE_START(3, bl);
-    ::decode(struct_ver, bl);
-    ::decode(map_epoch, bl);
-    ::decode(info, bl);
-    ::decode(log, bl);
-    if (struct_v > 1) {
-      ::decode(past_intervals, bl);
-    } else {
-      cout << "NOTICE: Older export without past_intervals" << std::endl;
-    }
-    if (struct_v > 2) {
-      osdmap.decode(bl);
-    } else {
-      cout << "WARNING: Older export without OSDMap information" << std::endl;
-    }
-    DECODE_FINISH(bl);
-  }
-};
+bool dry_run = false;
 
 struct action_on_object_t {
   virtual ~action_on_object_t() {}
@@ -382,14 +80,14 @@ int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_obje
   ghobject_t next;
   while (!next.is_max()) {
     vector<ghobject_t> list;
-    int r = store->collection_list_partial(
-				       coll,
-				       next,
-				       LIST_AT_A_TIME,
-				       LIST_AT_A_TIME,
-				       0,
-				       &list,
-				       &next);
+    int r = store->collection_list(
+				   coll,
+				   next,
+				   ghobject_t::get_max(),
+				   true,
+				   LIST_AT_A_TIME,
+				   &list,
+				   &next);
     if (r < 0) {
       cerr << "Error listing collection: " << coll << ", "
 	   << cpp_strerror(r) << std::endl;
@@ -400,22 +98,24 @@ int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_obje
 	 ++obj) {
       if (obj->is_pgmeta())
 	continue;
-      bufferlist attr;
-      r = store->getattr(coll, *obj, OI_ATTR, attr);
-      if (r < 0) {
-	cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
-	     << cpp_strerror(r) << std::endl;
-	return r;
-      }
       object_info_t oi;
-      bufferlist::iterator bp = attr.begin();
-      try {
-	::decode(oi, bp);
-      } catch (...) {
-	r = -EINVAL;
-	cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
-	     << cpp_strerror(r) << std::endl;
-	return r;
+      if (coll != coll_t::meta()) {
+        bufferlist attr;
+        r = store->getattr(coll, *obj, OI_ATTR, attr);
+        if (r < 0) {
+	  cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
+	       << cpp_strerror(r) << std::endl;
+	  return r;
+        }
+        bufferlist::iterator bp = attr.begin();
+        try {
+	  ::decode(oi, bp);
+        } catch (...) {
+	  r = -EINVAL;
+	  cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
+	       << cpp_strerror(r) << std::endl;
+	  return r;
+        }
       }
       r = action.call(store, coll, *obj, oi);
       if (r < 0)
@@ -425,10 +125,52 @@ int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_obje
   return 0;
 }
 
-int action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
+int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_object_t &action, bool debug)
+{
+  spg_t pgid;
+  // Scan collections in case this is an ec pool but no shard specified
+  unsigned scanned = 0;
+  int r = 0;
+  vector<coll_t> colls_to_check;
+  vector<coll_t> candidates;
+  r = store->list_collections(candidates);
+  if (r < 0) {
+    cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  pgid.parse(pgidstr.c_str());
+  for (vector<coll_t>::iterator i = candidates.begin();
+       i != candidates.end();
+       ++i) {
+    spg_t cand_pgid;
+    if (!i->is_pg(&cand_pgid))
+      continue;
+
+    // If an exact match or treat no shard as any shard
+    if (cand_pgid == pgid || 
+        (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) {
+      colls_to_check.push_back(*i);
+    }
+  }
+
+  if (debug)
+    cerr << colls_to_check.size() << " pgs to scan" << std::endl;
+  for (vector<coll_t>::iterator i = colls_to_check.begin();
+       i != colls_to_check.end();
+       ++i, ++scanned) {
+    if (debug)
+      cerr << "Scanning " << *i << ", " << scanned << "/"
+	   << colls_to_check.size() << " completed" << std::endl;
+    r = _action_on_all_objects_in_pg(store, *i, action, debug);
+    if (r < 0)
+      break;
+  }
+  return r;
+}
+
+int action_on_all_objects_in_exact_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
 {
   int r = _action_on_all_objects_in_pg(store, coll, action, debug);
-  store->sync_and_flush();
   return r;
 }
 
@@ -446,9 +188,7 @@ int _action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool
   for (vector<coll_t>::iterator i = candidates.begin();
        i != candidates.end();
        ++i) {
-    spg_t pgid;
-    snapid_t snap;
-    if (i->is_pg(pgid, snap)) {
+    if (i->is_pg()) {
       colls_to_check.push_back(*i);
     }
   }
@@ -471,7 +211,6 @@ int _action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool
 int action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug)
 {
   int r = _action_on_all_objects(store, action, debug);
-  store->sync_and_flush();
   return r;
 }
 
@@ -488,24 +227,27 @@ struct pgid_object_list {
     for (list<pair<coll_t, ghobject_t> >::const_iterator i = _objects.begin();
 	 i != _objects.end();
 	 ++i) {
-      if (i != _objects.begin() && human_readable) {
-        f->flush(cout);
-        cout << std::endl;
-      }
       f->open_array_section("pgid_object");
-      string pgid = i->first.c_str();
-      std::size_t pos = pgid.find("_");
-      if (pos == string::npos)
-        f->dump_string("pgid", pgid);
-      else
-        f->dump_string("pgid", pgid.substr(0, pos));
+      spg_t pgid;
+      bool is_pg = i->first.is_pg(&pgid);
+      if (is_pg)
+        f->dump_string("pgid", stringify(pgid));
+      if (!is_pg || !human_readable)
+        f->dump_string("coll", i->first.to_str());
       f->open_object_section("ghobject");
       i->second.dump(f);
       f->close_section();
       f->close_section();
+      if (human_readable) {
+        f->flush(cout);
+        cout << std::endl;
+      }
     }
-    if (!human_readable)
+    if (!human_readable) {
       f->close_section();
+      f->flush(cout);
+      cout << std::endl;
+    }
   }
 };
 
@@ -536,32 +278,15 @@ struct lookup_ghobject : public action_on_object_t {
   }
 };
 
-hobject_t infos_oid = OSD::make_infos_oid();
+ghobject_t infos_oid = OSD::make_infos_oid();
 ghobject_t log_oid;
-hobject_t biginfo_oid;
+ghobject_t biginfo_oid;
 
 int file_fd = fd_none;
 bool debug = false;
 super_header sh;
 uint64_t testalign;
 
-template <typename T>
-int write_section(sectiontype_t type, const T& obj, int fd) {
-  bufferlist blhdr, bl, blftr;
-  obj.encode(bl);
-  header hdr(type, bl.length());
-  hdr.encode(blhdr);
-  footer ft;
-  ft.encode(blftr);
-
-  int ret = blhdr.write_fd(fd);
-  if (ret) return ret;
-  ret = bl.write_fd(fd);
-  if (ret) return ret;
-  ret = blftr.write_fd(fd);
-  return ret;
-}
-
 // Convert non-printable characters to '\###'
 static void cleanbin(string &str)
 {
@@ -585,23 +310,14 @@ static void cleanbin(string &str)
   return;
 }
 
-int write_simple(sectiontype_t type, int fd)
-{
-  bufferlist hbl;
-
-  header hdr(type, 0);
-  hdr.encode(hbl);
-  return hbl.write_fd(fd);
-}
-
 static int get_fd_data(int fd, bufferlist &bl)
 {
   uint64_t total = 0;
   do {
     ssize_t bytes = bl.read_fd(fd, max_read);
     if (bytes < 0) {
-      cerr << "read_fd error " << cpp_strerror(-bytes) << std::endl;
-      return 1;
+      cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+      return bytes;
     }
 
     if (bytes == 0)
@@ -614,22 +330,29 @@ static int get_fd_data(int fd, bufferlist &bl)
   return 0;
 }
 
+void myexit(int ret)
+{
+  if (g_ceph_context)
+    g_ceph_context->put();
+  exit(ret);
+}
+
 static void invalid_filestore_path(string &path)
 {
   cerr << "Invalid filestore path specified: " << path << "\n";
-  exit(1);
+  myexit(1);
 }
 
 int get_log(ObjectStore *fs, __u8 struct_ver,
    coll_t coll, spg_t pgid, const pg_info_t &info,
-   PGLog::IndexedLog &log, pg_missing_t &missing)
+   PGLog::IndexedLog &log, pg_missing_t &missing,
+   map<eversion_t, hobject_t> &divergent_priors)
 {
-  map<eversion_t, hobject_t> divergent_priors;
   try {
     ostringstream oss;
     assert(struct_ver > 0);
     PGLog::read_log(fs, coll,
-		    struct_ver >= 8 ? coll : META_COLL,
+		    struct_ver >= 8 ? coll : coll_t::meta(),
 		    struct_ver >= 8 ? pgid.make_pgmeta_oid() : log_oid,
 		    info, divergent_priors, log, missing, oss);
     if (debug && oss.str().size())
@@ -637,16 +360,44 @@ int get_log(ObjectStore *fs, __u8 struct_ver,
   }
   catch (const buffer::error &e) {
     cerr << "read_log threw exception error " << e.what() << std::endl;
-    return 1;
+    return -EFAULT;
   }
   return 0;
 }
 
+void dump_log(Formatter *formatter, ostream &out, pg_log_t &log,
+      pg_missing_t &missing, map<eversion_t, hobject_t> &divergent_priors)
+{
+  formatter->open_object_section("op_log");
+  formatter->open_object_section("pg_log_t");
+  log.dump(formatter);
+  formatter->close_section();
+  formatter->flush(out);
+  formatter->open_object_section("pg_missing_t");
+  missing.dump(formatter);
+  formatter->close_section();
+  formatter->flush(out);
+  formatter->open_object_section("map");
+  formatter->open_array_section("divergent_priors");
+  for (map<eversion_t, hobject_t>::iterator it = divergent_priors.begin();
+       it != divergent_priors.end(); ++ it) {
+      formatter->open_object_section("item");
+      formatter->dump_stream("eversion") << it->first;
+      formatter->dump_stream("hobject") << it->second;
+      formatter->close_section();
+  }
+  formatter->close_section();
+  formatter->close_section();
+  formatter->close_section();
+  formatter->flush(out);
+}
+
 //Based on RemoveWQ::_process()
-void remove_coll(ObjectStore *store, const coll_t &coll)
+void remove_coll(ObjectStore *store, const coll_t &coll,
+		 ObjectStore::Sequencer &osr)
 {
   spg_t pg;
-  coll.is_pg_prefix(pg);
+  coll.is_pg_prefix(&pg);
   OSDriver driver(
     store,
     coll_t(),
@@ -660,7 +411,7 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
   cout << "remove_coll " << coll << std::endl;
   while (!next.is_max()) {
     vector<ghobject_t> objects;
-    r = store->collection_list_partial(coll, next, 200, 300, 0,
+    r = store->collection_list(coll, next, ghobject_t::get_max(), true, 300,
       &objects, &next);
     if (r < 0)
       goto out;
@@ -677,7 +428,7 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
 
       t->remove(coll, *i);
       if (num >= 30) {
-        store->apply_transaction(*t);
+        store->apply_transaction(&osr, *t);
         delete t;
         t = new ObjectStore::Transaction;
         num = 0;
@@ -685,7 +436,7 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
     }
   }
   t->remove_collection(coll);
-  store->apply_transaction(*t);
+  store->apply_transaction(&osr, *t);
 out:
   delete t;
 }
@@ -696,7 +447,7 @@ int finish_remove_pgs(ObjectStore *store)
   vector<coll_t> ls;
   int r = store->list_collections(ls);
   if (r < 0) {
-    cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(-r)
+    cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(r)
       << std::endl;
     return r;
   }
@@ -706,19 +457,11 @@ int finish_remove_pgs(ObjectStore *store)
        ++it) {
     spg_t pgid;
 
-    if (it->is_temp(pgid)) {
-      cout << "finish_remove_pgs " << *it << " clearing temp" << std::endl;
-      OSD::recursive_remove_collection(store, *it);
-      continue;
-    }
-
-    uint64_t seq;
-    snapid_t snap;
-    if (it->is_removal(&seq, &pgid) || (it->is_pg(pgid, snap) &&
-	PG::_has_removal_flag(store, pgid))) {
-      cout << "finish_remove_pgs removing " << *it
-	   << " pgid is " << pgid << std::endl;
-      remove_coll(store, *it);
+    if (it->is_temp(&pgid) ||
+	it->is_removal(&pgid) ||
+	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
+      cout << "finish_remove_pgs " << *it << " removing " << pgid << std::endl;
+      OSD::recursive_remove_collection(store, pgid, *it);
       continue;
     }
 
@@ -738,16 +481,15 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t
   ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
 
   bufferlist bl;
-  epoch_t pg_epoch = 0;
-  int r = PG::peek_map_epoch(fs, pgid, &pg_epoch, &bl);
+  epoch_t map_epoch = 0;
+  int r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
   if (r < 0)
-    cerr << __func__ << " warning: peek_map_epoch fails" << std::endl;
-
+    cerr << __func__ << " warning: peek_map_epoch reported error" << std::endl;
   map<epoch_t,pg_interval_t> past_intervals;
   __u8 struct_v;
   r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v);
   if (r < 0) {
-    cerr << __func__ << " error on read_info " << cpp_strerror(-r) << std::endl;
+    cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl;
     return r;
   }
   if (struct_v < 8) {
@@ -756,10 +498,10 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t
     bufferlist one;
     one.append('1');
     t->collection_setattr(coll, "remove", one);
-    cout << "remove " << META_COLL << " " << log_oid.hobj.oid << std::endl;
-    t->remove(META_COLL, log_oid);
-    cout << "remove " << META_COLL << " " << biginfo_oid.oid << std::endl;
-    t->remove(META_COLL, biginfo_oid);
+    cout << "remove " << coll_t::meta() << " " << log_oid << std::endl;
+    t->remove(coll_t::meta(), log_oid);
+    cout << "remove " << coll_t::meta() << " " << biginfo_oid << std::endl;
+    t->remove(coll_t::meta(), biginfo_oid);
   } else {
     // new omap key
     cout << "setting '_remove' omap key" << std::endl;
@@ -773,86 +515,57 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t
 #pragma GCC diagnostic pop
 #pragma GCC diagnostic warning "-Wpragmas"
 
-int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid)
+int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid,
+			   ObjectStore::Sequencer &osr)
 {
+  if (!dry_run)
+    finish_remove_pgs(store);
   if (!store->collection_exists(coll_t(r_pgid)))
     return -ENOENT;
 
   cout << " marking collection for removal" << std::endl;
+  if (dry_run)
+    return 0;
   ObjectStore::Transaction *rmt = new ObjectStore::Transaction;
   int r = mark_pg_for_removal(store, r_pgid, rmt);
   if (r < 0) {
     delete rmt;
     return r;
   }
-  store->apply_transaction(*rmt);
+  store->apply_transaction(&osr, *rmt);
+  finish_remove_pgs(store);
   return r;
 }
 
-int header::get_header()
-{
-  bufferlist ebl;
-  bufferlist::iterator ebliter = ebl.begin();
-  ssize_t bytes;
-
-  bytes = ebl.read_fd(file_fd, sh.header_size);
-  if ((size_t)bytes != sh.header_size) {
-    cerr << "Unexpected EOF" << std::endl;
-    return EFAULT;
-  }
-
-  decode(ebliter);
-
-  return 0;
-}
-
-int footer::get_footer()
-{
-  bufferlist ebl;
-  bufferlist::iterator ebliter = ebl.begin();
-  ssize_t bytes;
-
-  bytes = ebl.read_fd(file_fd, sh.footer_size);
-  if ((size_t)bytes != sh.footer_size) {
-    cerr << "Unexpected EOF" << std::endl;
-    return EFAULT;
-  }
-
-  decode(ebliter);
-
-  if (magic != endmagic) {
-    cerr << "Bad footer magic" << std::endl;
-    return EFAULT;
-  }
-
-  return 0;
-}
-
 int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
     map<epoch_t,pg_interval_t> &past_intervals)
 {
   //Empty for this
   coll_t coll(info.pgid);
   ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
-  int ret = PG::_write_info(t, epoch,
+  map<string,bufferlist> km;
+  int ret = PG::_prepare_write_info(
+    &km, epoch,
     info, coll,
     past_intervals,
     pgmeta_oid,
     true);
-  if (ret < 0) ret = -ret;
   if (ret) cerr << "Failed to write info" << std::endl;
+  t.omap_setkeys(coll, pgmeta_oid, km);
   return ret;
 }
 
 int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
-    pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals)
+    pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals,
+    map<eversion_t, hobject_t> &divergent_priors)
 {
   int ret = write_info(t, epoch, info, past_intervals);
   if (ret)
     return ret;
-  map<eversion_t, hobject_t> divergent_priors;
   coll_t coll(info.pgid);
-  PGLog::write_log(t, log, coll, info.pgid.make_pgmeta_oid(), divergent_priors);
+  map<string,bufferlist> km;
+  PGLog::write_log(t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent_priors);
+  t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
   return 0;
 }
 
@@ -865,7 +578,7 @@ void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map<string, bufferlist>
   }
 }
 
-int export_file(ObjectStore *store, coll_t cid, ghobject_t &obj)
+int ObjectStoreTool::export_file(ObjectStore *store, coll_t cid, ghobject_t &obj)
 {
   struct stat st;
   mysize_t total;
@@ -945,7 +658,7 @@ int export_file(ObjectStore *store, coll_t cid, ghobject_t &obj)
   bufferlist hdrbuf;
   ret = store->omap_get_header(cid, obj, &hdrbuf, true);
   if (ret < 0) {
-    cerr << "omap_get_header: " << cpp_strerror(-ret) << std::endl;
+    cerr << "omap_get_header: " << cpp_strerror(ret) << std::endl;
     return ret;
   }
 
@@ -957,7 +670,7 @@ int export_file(ObjectStore *store, coll_t cid, ghobject_t &obj)
   ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(cid, obj);
   if (!iter) {
     ret = -ENOENT;
-    cerr << "omap_get_iterator: " << cpp_strerror(-ret) << std::endl;
+    cerr << "omap_get_iterator: " << cpp_strerror(ret) << std::endl;
     return ret;
   }
   iter->seek_to_first();
@@ -984,20 +697,21 @@ int export_file(ObjectStore *store, coll_t cid, ghobject_t &obj)
   return 0;
 }
 
-int export_files(ObjectStore *store, coll_t coll)
+int ObjectStoreTool::export_files(ObjectStore *store, coll_t coll)
 {
   ghobject_t next;
 
   while (!next.is_max()) {
     vector<ghobject_t> objects;
-    int r = store->collection_list_partial(coll, next, 200, 300, 0,
+    int r = store->collection_list(coll, next, ghobject_t::get_max(), true, 300,
       &objects, &next);
     if (r < 0)
       return r;
     for (vector<ghobject_t>::iterator i = objects.begin();
 	 i != objects.end();
 	 ++i) {
-      if (i->is_pgmeta()) {
+      assert(!i->hobj.is_meta());
+      if (i->is_pgmeta() || i->hobj.is_temp()) {
 	continue;
       }
       r = export_file(store, coll, *i);
@@ -1008,13 +722,94 @@ int export_files(ObjectStore *store, coll_t coll)
   return 0;
 }
 
+int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force,
+		   ObjectStore::Sequencer &osr) {
+  OSDMap::Incremental inc;
+  bufferlist::iterator it = bl.begin();
+  inc.decode(it);
+  if (e == 0) {
+    e = inc.epoch;
+  } else if (e != inc.epoch) {
+    cerr << "incremental.epoch mismatch: "
+	 << inc.epoch << " != " << e << std::endl;
+    if (force) {
+      cerr << "But will continue anyway." << std::endl;
+    } else {
+      return -EINVAL;
+    }
+  }
+  const ghobject_t inc_oid = OSD::get_inc_osdmap_pobject_name(e);
+  if (!store->exists(coll_t::meta(), inc_oid)) {
+    cerr << "inc-osdmap (" << inc_oid << ") does not exist." << std::endl;
+    if (!force) {
+      return -ENOENT;
+    }
+    cout << "Creating a new epoch." << std::endl;
+  }
+  ObjectStore::Transaction t;
+  t.write(coll_t::meta(), inc_oid, 0, bl.length(), bl);
+  t.truncate(coll_t::meta(), inc_oid, bl.length());
+  int ret = store->apply_transaction(&osr, t);
+  if (ret) {
+    cerr << "Failed to set inc-osdmap (" << inc_oid << "): " << ret << std::endl;
+  } else {
+    cout << "Wrote inc-osdmap." << inc.epoch << std::endl;
+  }
+  return ret;
+}
+
+int get_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl)
+{
+  if (store->read(coll_t::meta(),
+		  OSD::get_inc_osdmap_pobject_name(e),
+		  0, 0, bl) < 0) {
+    return -ENOENT;
+  }
+  return 0;
+}
+
+int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force,
+	       ObjectStore::Sequencer &osr) {
+  OSDMap osdmap;
+  osdmap.decode(bl);
+  if (e == 0) {
+    e = osdmap.get_epoch();
+  } else if (e != osdmap.get_epoch()) {
+    cerr << "osdmap.epoch mismatch: "
+	 << e << " != " << osdmap.get_epoch() << std::endl;
+    if (force) {
+      cerr << "But will continue anyway." << std::endl;
+    } else {
+      return -EINVAL;
+    }
+  }
+  const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e);
+  if (!store->exists(coll_t::meta(), full_oid)) {
+    cerr << "osdmap (" << full_oid << ") does not exist." << std::endl;
+    if (!force) {
+      return -ENOENT;
+    }
+    cout << "Creating a new epoch." << std::endl;
+  }
+  ObjectStore::Transaction t;
+  t.write(coll_t::meta(), full_oid, 0, bl.length(), bl);
+  t.truncate(coll_t::meta(), full_oid, bl.length());
+  int ret = store->apply_transaction(&osr, t);
+  if (ret) {
+    cerr << "Failed to set osdmap (" << full_oid << "): " << ret << std::endl;
+  } else {
+    cout << "Wrote osdmap." << osdmap.get_epoch() << std::endl;
+  }
+  return ret;
+}
+
 int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist& bl)
 {
   bool found = store->read(
-      META_COLL, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
+      coll_t::meta(), OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
   if (!found) {
     cerr << "Can't find OSDMap for pg epoch " << e << std::endl;
-    return ENOENT;
+    return -ENOENT;
   }
   osdmap.decode(bl);
   if (debug)
@@ -1027,42 +822,28 @@ int add_osdmap(ObjectStore *store, metadata_section &ms)
   return get_osdmap(store, ms.map_epoch, ms.osdmap, ms.osdmap_bl);
 }
 
-//Write super_header with its fixed 16 byte length
-void write_super()
-{
-  bufferlist superbl;
-  super_header sh;
-  footer ft;
-
-  header hdr(TYPE_NONE, 0);
-  hdr.encode(superbl);
-
-  sh.magic = super_header::super_magic;
-  sh.version = super_header::super_ver;
-  sh.header_size = superbl.length();
-  superbl.clear();
-  ft.encode(superbl);
-  sh.footer_size = superbl.length();
-  superbl.clear();
-
-  sh.encode(superbl);
-  assert(super_header::FIXED_LENGTH == superbl.length());
-  superbl.write_fd(file_fd);
-}
-
-int do_export(ObjectStore *fs, coll_t coll, spg_t pgid, pg_info_t &info,
-    epoch_t map_epoch, __u8 struct_ver, const OSDSuperblock& superblock,
+int ObjectStoreTool::do_export(ObjectStore *fs, coll_t coll, spg_t pgid,
+    pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
+    const OSDSuperblock& superblock,
     map<epoch_t,pg_interval_t> &past_intervals)
 {
   PGLog::IndexedLog log;
   pg_missing_t missing;
+  map<eversion_t, hobject_t> divergent_priors;
 
   cerr << "Exporting " << pgid << std::endl;
 
-  int ret = get_log(fs, struct_ver, coll, pgid, info, log, missing);
+  int ret = get_log(fs, struct_ver, coll, pgid, info, log, missing,
+                    divergent_priors);
   if (ret > 0)
       return ret;
 
+  if (debug) {
+    Formatter *formatter = Formatter::create("json-pretty");
+    assert(formatter);
+    dump_log(formatter, cerr, log, missing, divergent_priors);
+    delete formatter;
+  }
   write_super();
 
   pg_begin pgb(pgid, superblock);
@@ -1076,7 +857,7 @@ int do_export(ObjectStore *fs, coll_t coll, spg_t pgid, pg_info_t &info,
 
   // The metadata_section is now before files, so import can detect
   // errors and abort without wasting time.
-  metadata_section ms(struct_ver, map_epoch, info, log, past_intervals);
+  metadata_section ms(struct_ver, map_epoch, info, log, past_intervals, divergent_priors);
   ret = add_osdmap(fs, ms);
   if (ret)
     return ret;
@@ -1097,51 +878,6 @@ int do_export(ObjectStore *fs, coll_t coll, spg_t pgid, pg_info_t &info,
   return 0;
 }
 
-int super_header::read_super()
-{
-  bufferlist ebl;
-  bufferlist::iterator ebliter = ebl.begin();
-  ssize_t bytes;
-
-  bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH);
-  if ((size_t)bytes != super_header::FIXED_LENGTH) {
-    cerr << "Unexpected EOF" << std::endl;
-    return EFAULT;
-  }
-
-  decode(ebliter);
-
-  return 0;
-}
-
-int read_section(int fd, sectiontype_t *type, bufferlist *bl)
-{
-  header hdr;
-  ssize_t bytes;
-
-  int ret = hdr.get_header();
-  if (ret)
-    return ret;
-
-  *type = hdr.type;
-
-  bl->clear();
-  bytes = bl->read_fd(fd, hdr.size);
-  if (bytes != hdr.size) {
-    cerr << "Unexpected EOF" << std::endl;
-    return EFAULT;
-  }
-
-  if (hdr.size > 0) {
-    footer ft;
-    ret = ft.get_footer();
-    if (ret)
-      return ret;
-  }
-
-  return 0;
-}
-
 int get_data(ObjectStore *store, coll_t coll, ghobject_t hoid,
     ObjectStore::Transaction *t, bufferlist &bl)
 {
@@ -1170,11 +906,9 @@ int get_attrs(ObjectStore *store, coll_t coll, ghobject_t hoid,
   // This could have been handled in the caller if we didn't need to
   // support exports that didn't include object_info_t in object_begin.
   if (hoid.hobj.snap < CEPH_MAXSNAP && hoid.generation == ghobject_t::NO_GEN) {
-    map<string,bufferptr>::iterator mi = as.data.find(OI_ATTR);
+    map<string,bufferlist>::iterator mi = as.data.find(OI_ATTR);
     if (mi != as.data.end()) {
-      bufferlist attr_bl;
-      attr_bl.push_back(mi->second);
-      object_info_t oi(attr_bl);
+      object_info_t oi(mi->second);
 
       if (debug)
         cerr << "object_info " << oi << std::endl;
@@ -1215,220 +949,10 @@ int get_omap(ObjectStore *store, coll_t coll, ghobject_t hoid,
   return 0;
 }
 
-int skip_object(bufferlist &bl)
-{
-  bufferlist::iterator ebliter = bl.begin();
-  bufferlist ebl;
-  bool done = false;
-  while(!done) {
-    sectiontype_t type;
-    int ret = read_section(file_fd, &type, &ebl);
-    if (ret)
-      return ret;
-
-    ebliter = ebl.begin();
-    if (type >= END_OF_TYPES) {
-      cout << "Skipping unknown object section type" << std::endl;
-      continue;
-    }
-    switch(type) {
-    case TYPE_DATA:
-    case TYPE_ATTRS:
-    case TYPE_OMAP_HDR:
-    case TYPE_OMAP:
-#ifdef DIAGNOSTIC
-      cerr << "Skip type " << (int)type << std::endl;
-#endif
-      break;
-    case TYPE_OBJECT_END:
-      done = true;
-      break;
-    default:
-      return EFAULT;
-    }
-  }
-  return 0;
-}
-
-int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl)
-{
-  bufferlist::iterator ebliter = bl.begin();
-  object_begin ob;
-  ob.decode(ebliter);
-  map<string,bufferptr>::iterator i;
-  bufferlist abl;
-
-  data_section ds;
-  attr_section as;
-  omap_hdr_section oh;
-  omap_section os;
-
-  assert(g_ceph_context);
-  if (ob.hoid.hobj.nspace == g_ceph_context->_conf->osd_hit_set_namespace) {
-    cout << "Skipping internal object " << ob.hoid << std::endl;
-    skip_object(bl);
-    return 0;
-  }
-
-  if (!ob.hoid.hobj.is_head()) {
-    cout << "Skipping non-head for " << ob.hoid << std::endl;
-    skip_object(bl);
-    return 0;
-  }
-
-  ioctx.set_namespace(ob.hoid.hobj.get_namespace());
-
-  string msg("Write");
-  int ret = ioctx.create(ob.hoid.hobj.oid.name, true);
-  if (ret && ret != -EEXIST) {
-    cerr << "create failed: " << cpp_strerror(ret) << std::endl;
-    return ret;
-  }
-  if (ret == -EEXIST) {
-    msg = "***Overwrite***";
-    ret = ioctx.remove(ob.hoid.hobj.oid.name);
-    if (ret < 0) {
-      cerr << "remove failed: " << cpp_strerror(ret) << std::endl;
-      return ret;
-    }
-    ret = ioctx.create(ob.hoid.hobj.oid.name, true);
-    if (ret < 0) {
-      cerr << "create failed: " << cpp_strerror(ret) << std::endl;
-      return ret;
-    }
-  }
-
-  cout << msg << " " << ob.hoid << std::endl;
-
-  bool need_align = false;
-  uint64_t alignment = 0;
-  if (testalign) {
-    need_align = true;
-    alignment = testalign;
-  } else {
-    if ((need_align = ioctx.pool_requires_alignment()))
-      alignment = ioctx.pool_required_alignment();
-  }
-
-  if (debug && need_align)
-    cerr << "alignment = " << alignment << std::endl;
-
-  bufferlist ebl, databl;
-  uint64_t in_offset = 0, out_offset = 0;
-  bool done = false;
-  while(!done) {
-    sectiontype_t type;
-    int ret = read_section(file_fd, &type, &ebl);
-    if (ret)
-      return ret;
-
-    ebliter = ebl.begin();
-    //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
-    //cout << "\t\tsection size " << ebl.length() << std::endl;
-    if (type >= END_OF_TYPES) {
-      cout << "Skipping unknown object section type" << std::endl;
-      continue;
-    }
-    switch(type) {
-    case TYPE_DATA:
-      ds.decode(ebliter);
-      if (debug)
-        cerr << "\tdata: offset " << ds.offset << " len " << ds.len << std::endl;
-      if (need_align) {
-        if (ds.offset != in_offset) {
-          cerr << "Discontiguous object data in export" << std::endl;
-          return EFAULT;
-        }
-        assert(ds.databl.length() == ds.len);
-        databl.claim_append(ds.databl);
-        in_offset += ds.len;
-        if (databl.length() >= alignment) {
-          uint64_t rndlen = uint64_t(databl.length() / alignment) * alignment;
-          if (debug) cerr << "write offset=" << out_offset << " len=" << rndlen << std::endl;
-          ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset);
-          if (ret) {
-            cerr << "write failed: " << cpp_strerror(ret) << std::endl;
-            return ret;
-          }
-          out_offset += rndlen;
-          bufferlist n;
-          if (databl.length() > rndlen) {
-            assert(databl.length() - rndlen < alignment);
-	    n.substr_of(databl, rndlen, databl.length() - rndlen);
-          }
-          databl = n;
-        }
-        break;
-      }
-      ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset);
-      if (ret) {
-        cerr << "write failed: " << cpp_strerror(ret) << std::endl;
-        return ret;
-      }
-      break;
-    case TYPE_ATTRS:
-      as.decode(ebliter);
-
-      if (debug)
-        cerr << "\tattrs: len " << as.data.size() << std::endl;
-      for (i = as.data.begin(); i != as.data.end(); ++i) {
-        if (i->first == "_" || i->first == "snapset")
-          continue;
-        abl.clear();
-        abl.push_front(i->second);
-        ret = ioctx.setxattr(ob.hoid.hobj.oid.name, i->first.substr(1).c_str(), abl);
-        if (ret) {
-          cerr << "setxattr failed: " << cpp_strerror(ret) << std::endl;
-          if (ret != -EOPNOTSUPP)
-            return ret;
-        }
-      }
-      break;
-    case TYPE_OMAP_HDR:
-      oh.decode(ebliter);
-
-      if (debug)
-        cerr << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length())
-          << std::endl;
-      ret = ioctx.omap_set_header(ob.hoid.hobj.oid.name, oh.hdr);
-      if (ret) {
-        cerr << "omap_set_header failed: " << cpp_strerror(ret) << std::endl;
-        if (ret != -EOPNOTSUPP)
-          return ret;
-      }
-      break;
-    case TYPE_OMAP:
-      os.decode(ebliter);
-
-      if (debug)
-        cerr << "\tomap: size " << os.omap.size() << std::endl;
-      ret = ioctx.omap_set(ob.hoid.hobj.oid.name, os.omap);
-      if (ret) {
-        cerr << "omap_set failed: " << cpp_strerror(ret) << std::endl;
-        if (ret != -EOPNOTSUPP)
-          return ret;
-      }
-      break;
-    case TYPE_OBJECT_END:
-      if (need_align && databl.length() > 0) {
-        assert(databl.length() < alignment);
-        if (debug) cerr << "END write offset=" << out_offset << " len=" << databl.length() << std::endl;
-        ret = ioctx.write(ob.hoid.hobj.oid.name, databl, databl.length(), out_offset);
-        if (ret) {
-           cerr << "write failed: " << cpp_strerror(ret) << std::endl;
-          return ret;
-        }
-      }
-      done = true;
-      break;
-    default:
-      return EFAULT;
-    }
-  }
-  return 0;
-}
-
-int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap)
+int ObjectStoreTool::get_object(ObjectStore *store, coll_t coll,
+				bufferlist &bl, OSDMap &curmap,
+				bool *skipped_objects,
+				ObjectStore::Sequencer &osr)
 {
   ObjectStore::Transaction tran;
   ObjectStore::Transaction *t = &tran;
@@ -1440,9 +964,13 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap)
     coll_t(),
     OSD::make_snapmapper_oid());
   spg_t pg;
-  coll.is_pg_prefix(pg);
+  coll.is_pg_prefix(&pg);
   SnapMapper mapper(&driver, 0, 0, 0, pg.shard);
 
+  if (ob.hoid.hobj.is_temp()) {
+    cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl;
+    return -EFAULT;
+  }
   assert(g_ceph_context);
   if (ob.hoid.hobj.nspace != g_ceph_context->_conf->osd_hit_set_namespace) {
     object_t oid = ob.hoid.hobj.oid;
@@ -1451,25 +979,26 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap)
     pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
   
     spg_t coll_pgid;
-    snapid_t coll_snap;
-    if (coll.is_pg(coll_pgid, coll_snap) == false) {
+    if (coll.is_pg(&coll_pgid) == false) {
       cerr << "INTERNAL ERROR: Bad collection during import" << std::endl;
-      return 1;
+      return -EFAULT;
     }
     if (coll_pgid.shard != ob.hoid.shard_id) {
       cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard 
         << " but object shard is " << ob.hoid.shard_id << std::endl;
-      return 1;
+      return -EFAULT;
     }
      
     if (coll_pgid.pgid != pgid) {
-      cerr << "Skipping object '" << ob.hoid << "' which no longer belongs in exported pg" << std::endl;
+      cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl;
+      *skipped_objects = true;
       skip_object(bl);
       return 0;
     }
   }
 
-  t->touch(coll, ob.hoid);
+  if (!dry_run)
+    t->touch(coll, ob.hoid);
 
   cout << "Write " << ob.hoid << std::endl;
 
@@ -1477,7 +1006,7 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap)
   bool done = false;
   while(!done) {
     sectiontype_t type;
-    int ret = read_section(file_fd, &type, &ebl);
+    int ret = read_section(&type, &ebl);
     if (ret)
       return ret;
 
@@ -1489,18 +1018,22 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap)
     }
     switch(type) {
     case TYPE_DATA:
+      if (dry_run) break;
       ret = get_data(store, coll, ob.hoid, t, ebl);
       if (ret) return ret;
       break;
     case TYPE_ATTRS:
+      if (dry_run) break;
       ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper);
       if (ret) return ret;
       break;
     case TYPE_OMAP_HDR:
+      if (dry_run) break;
       ret = get_omap_hdr(store, coll, ob.hoid, t, ebl);
       if (ret) return ret;
       break;
     case TYPE_OMAP:
+      if (dry_run) break;
       ret = get_omap(store, coll, ob.hoid, t, ebl);
       if (ret) return ret;
       break;
@@ -1508,21 +1041,26 @@ int get_object(ObjectStore *store, coll_t coll, bufferlist &bl, OSDMap &curmap)
       done = true;
       break;
     default:
-      return EFAULT;
+      cerr << "Unknown section type " << type << std::endl;
+      return -EFAULT;
     }
   }
-  store->apply_transaction(*t);
+  if (!dry_run)
+    store->apply_transaction(&osr, *t);
   return 0;
 }
 
 int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms,
-    const OSDSuperblock& sb, OSDMap& curmap)
+    const OSDSuperblock& sb, OSDMap& curmap, spg_t pgid)
 {
   bufferlist::iterator ebliter = bl.begin();
   ms.decode(ebliter);
+  spg_t old_pgid = ms.info.pgid;
+  ms.info.pgid = pgid;
 
 #if DIAGNOSTIC
   Formatter *formatter = new JSONFormatter(true);
+  cout << "export pgid " << old_pgid << std::endl;
   cout << "struct_v " << (int)ms.struct_ver << std::endl;
   cout << "map epoch " << ms.map_epoch << std::endl;
 
@@ -1550,212 +1088,214 @@ int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms,
   formatter->close_section();
   formatter->flush(cout);
   cout << std::endl;
+
+  formatter->open_array_section("divergent_priors");
+  for (map<eversion_t, hobject_t>::iterator it = ms.divergent_priors.begin();
+       it != ms.divergent_priors.end(); ++ it) {
+      formatter->open_object_section("item");
+      formatter->dump_stream("eversion") << it->first;
+      formatter->dump_stream("hobject") << it->second;
+      formatter->close_section();
+  }
+  formatter->close_section();
+  formatter->flush(cout);
+  cout << std::endl;
 #endif
 
+  if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) {
+    cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl;
+    return -EFAULT;
+  }
+
   if (ms.map_epoch > sb.current_epoch) {
-    cerr << "ERROR: Export map_epoch " << ms.map_epoch << " > osd epoch " << sb.current_epoch << std::endl;
-    return 1;
+    cerr << "ERROR: Export PG's map_epoch " << ms.map_epoch << " > OSD's epoch " << sb.current_epoch << std::endl;
+    cerr << "The OSD you are using is older than the exported PG" << std::endl;
+    cerr << "Either use another OSD or join selected OSD to cluster to update it first" << std::endl;
+    return -EINVAL;
   }
 
-  // If the osdmap was present in the metadata we can check for splits.
   // Pool verified to exist for call to get_pg_num().
-  if (ms.map_epoch < sb.current_epoch) {
-    bool found_map = false;
+  unsigned new_pg_num = curmap.get_pg_num(pgid.pgid.pool());
+
+  if (pgid.pgid.ps() >= new_pg_num) {
+    cerr << "Illegal pgid, the seed is larger than current pg_num" << std::endl;
+    return -EINVAL;
+  }
+
+  // Old exports didn't include OSDMap, see if we have a copy locally
+  if (ms.osdmap.get_epoch() == 0) {
     OSDMap findmap;
     bufferlist findmap_bl;
     int ret = get_osdmap(store, ms.map_epoch, findmap, findmap_bl);
-    if (ret == 0)
-      found_map = true;
-
-    // Old export didn't include OSDMap
-    if (ms.osdmap.get_epoch() == 0) {
-      // If we found the map locally and an older export didn't have it,
-      // then we'll use the local one.
-      if (found_map) {
-        ms.osdmap = findmap;
-      } else {
-        cerr << "WARNING: No OSDMap in old export,"
-             " some objects may be ignored due to a split" << std::endl;
-      }
-    }
-
-    // If OSDMap is available check for splits
-    if (ms.osdmap.get_epoch()) {
-      spg_t parent(ms.info.pgid);
-      if (parent.is_split(ms.osdmap.get_pg_num(ms.info.pgid.pgid.m_pool),
-          curmap.get_pg_num(ms.info.pgid.pgid.m_pool), NULL)) {
-        cerr << "WARNING: Split occurred, some objects may be ignored" << std::endl;
-      }
+    if (ret == 0) {
+      ms.osdmap = findmap;
+    } else {
+      cerr << "WARNING: No OSDMap in old export,"
+           " some objects may be ignored due to a split" << std::endl;
     }
   }
 
-  ms.past_intervals.clear();
-  ms.info.history.same_interval_since = ms.map_epoch = sb.current_epoch;
-
-  return 0;
-}
-
-int do_import_rados(string pool)
-{
-  bufferlist ebl;
-  pg_info_t info;
-  PGLog::IndexedLog log;
-
-  int ret = sh.read_super();
-  if (ret)
-    return ret;
-
-  if (sh.magic != super_header::super_magic) {
-    cerr << "Invalid magic number" << std::endl;
-    return EFAULT;
-  }
-
-  if (sh.version > super_header::super_ver) {
-    cerr << "Can't handle export format version=" << sh.version << std::endl;
-    return EINVAL;
-  }
+  // Make sure old_pg_num is 0 in the unusual case that OSDMap not in export
+  // nor can we find a local copy.
+  unsigned old_pg_num = 0;
+  if (ms.osdmap.get_epoch() != 0)
+    old_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool());
 
-  //First section must be TYPE_PG_BEGIN
-  sectiontype_t type;
-  ret = read_section(file_fd, &type, &ebl);
-  if (ret)
-    return ret;
-  if (type != TYPE_PG_BEGIN) {
-    return EFAULT;
+  if (debug) {
+    cerr << "old_pg_num " << old_pg_num << std::endl;
+    cerr << "new_pg_num " << new_pg_num << std::endl;
+    cerr << ms.osdmap << std::endl;
+    cerr << curmap << std::endl;
   }
 
-  bufferlist::iterator ebliter = ebl.begin();
-  pg_begin pgb;
-  pgb.decode(ebliter);
-  spg_t pgid = pgb.pgid;
-
-  if (!pgid.is_no_shard()) {
-    cerr << "Importing Erasure Coded shard is not supported" << std::endl;
-    exit(1);
+  // If we have managed to have a good OSDMap we can do these checks
+  if (old_pg_num) {
+    if (old_pgid.pgid.ps() >= old_pg_num) {
+      cerr << "FATAL: pgid invalid for original map epoch" << std::endl;
+      return -EFAULT;
+    }
+    if (pgid.pgid.ps() >= old_pg_num) {
+      cout << "NOTICE: Post split pgid specified" << std::endl;
+    } else {
+      spg_t parent(pgid);
+      if (parent.is_split(old_pg_num, new_pg_num, NULL)) {
+            cerr << "WARNING: Split occurred, some objects may be ignored" << std::endl;
+      }
+    }
   }
 
   if (debug) {
-    cerr << "Exported features: " << pgb.superblock.compat_features << std::endl;
+    cerr << "Import pgid " << ms.info.pgid << std::endl;
+    cerr << "Clearing past_intervals " << ms.past_intervals << std::endl;
+    cerr << "Zero same_interval_since " << ms.info.history.same_interval_since << std::endl;
   }
 
-  // XXX: How to check export features?
-#if 0
-  if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
-    cerr << "Export has incompatible features set "
-      << pgb.superblock.compat_features << std::endl;
-    return 1;
-  }
-#endif
+  // Let osd recompute past_intervals and same_interval_since
+  ms.past_intervals.clear();
+  ms.info.history.same_interval_since =  0;
 
-  librados::IoCtx ioctx;
-  librados::Rados cluster;
+  if (debug)
+    cerr << "Changing pg epoch " << ms.map_epoch << " to " << sb.current_epoch << std::endl;
 
-  char *id = getenv("CEPH_CLIENT_ID");
-  if (id) cerr << "Client id is: " << id << std::endl;
-  ret = cluster.init(id);
-  if (ret) {
-    cerr << "Error " << ret << " in cluster.init" << std::endl;
-    return ret;
-  }
-  ret = cluster.conf_read_file(NULL);
-  if (ret) {
-    cerr << "Error " << ret << " in cluster.conf_read_file" << std::endl;
-    return ret;
-  }
-  ret = cluster.conf_parse_env(NULL);
-  if (ret) {
-    cerr << "Error " << ret << " in cluster.conf_read_env" << std::endl;
-    return ret;
-  }
-  cluster.connect();
+  ms.map_epoch = sb.current_epoch;
 
-  ret = cluster.ioctx_create(pool.c_str(), ioctx);
-  if (ret < 0) {
-    cerr << "ioctx_create " << pool << " failed with " << ret << std::endl;
-    return ret;
-  }
+  return 0;
+}
 
-  cout << "Importing from pgid " << pgid << std::endl;
+typedef map<eversion_t, hobject_t> divergent_priors_t;
 
-  bool done = false;
-  bool found_metadata = false;
-  metadata_section ms;
-  while(!done) {
-    ret = read_section(file_fd, &type, &ebl);
-    if (ret)
-      return ret;
+// out: pg_log_t that only has entries that apply to import_pgid using curmap
+// reject: Entries rejected from "in" are in the reject.log.  Other fields not set.
+void filter_divergent_priors(spg_t import_pgid, const OSDMap &curmap,
+  const string &hit_set_namespace, const divergent_priors_t &in,
+  divergent_priors_t &out, divergent_priors_t &reject)
+{
+  out.clear();
+  reject.clear();
 
-    //cout << "do_import: Section type " << hex << type << dec << std::endl;
-    if (type >= END_OF_TYPES) {
-      cout << "Skipping unknown section type" << std::endl;
+  for (divergent_priors_t::const_iterator i = in.begin();
+       i != in.end(); ++i) {
+
+    // Reject divergent priors for temporary objects
+    if (i->second.is_temp()) {
+      reject.insert(*i);
       continue;
     }
-    switch(type) {
-    case TYPE_OBJECT_BEGIN:
-      ret = get_object_rados(ioctx, ebl);
-      if (ret) return ret;
-      break;
-    case TYPE_PG_METADATA:
-      if (debug)
-        cout << "Don't care about the old metadata" << std::endl;
-      found_metadata = true;
-      break;
-    case TYPE_PG_END:
-      done = true;
-      break;
-    default:
-      return EFAULT;
-    }
-  }
 
-  if (!found_metadata) {
-    cerr << "Missing metadata section, ignored" << std::endl;
-  }
+    if (i->second.nspace != hit_set_namespace) {
+      object_t oid = i->second.oid;
+      object_locator_t loc(i->second);
+      pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
+      pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
 
-  return 0;
+      if (import_pgid.pgid == pgid) {
+        out.insert(*i);
+      } else {
+        reject.insert(*i);
+      }
+    } else {
+      out.insert(*i);
+    }
+  }
 }
 
-int do_import(ObjectStore *store, OSDSuperblock& sb)
+int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb,
+			       bool force, std::string pgidstr,
+			       ObjectStore::Sequencer &osr)
 {
   bufferlist ebl;
   pg_info_t info;
   PGLog::IndexedLog log;
+  bool skipped_objects = false;
 
-  finish_remove_pgs(store);
+  if (!dry_run)
+    finish_remove_pgs(store);
 
-  int ret = sh.read_super();
+  int ret = read_super();
   if (ret)
     return ret;
 
   if (sh.magic != super_header::super_magic) {
     cerr << "Invalid magic number" << std::endl;
-    return EFAULT;
+    return -EFAULT;
   }
 
   if (sh.version > super_header::super_ver) {
     cerr << "Can't handle export format version=" << sh.version << std::endl;
-    return EINVAL;
+    return -EINVAL;
   }
 
   //First section must be TYPE_PG_BEGIN
   sectiontype_t type;
-  ret = read_section(file_fd, &type, &ebl);
+  ret = read_section(&type, &ebl);
   if (ret)
     return ret;
-  if (type != TYPE_PG_BEGIN) {
-    return EFAULT;
+  if (type == TYPE_POOL_BEGIN) {
+    cerr << "Pool exports cannot be imported into a PG" << std::endl;
+    return -EINVAL;
+  } else if (type != TYPE_PG_BEGIN) {
+    cerr << "Invalid first section type " << type << std::endl;
+    return -EFAULT;
   }
 
   bufferlist::iterator ebliter = ebl.begin();
   pg_begin pgb;
   pgb.decode(ebliter);
   spg_t pgid = pgb.pgid;
+  spg_t orig_pgid = pgid;
+
+  if (pgidstr.length()) {
+    spg_t user_pgid;
+
+    bool ok = user_pgid.parse(pgidstr.c_str());
+    // This succeeded in main() already
+    assert(ok);
+    if (pgid != user_pgid) {
+      if (pgid.pool() != user_pgid.pool()) {
+        cerr << "Can't specify a different pgid pool, must be " << pgid.pool() << std::endl;
+        return -EINVAL;
+      }
+      if (pgid.is_no_shard() && !user_pgid.is_no_shard()) {
+        cerr << "Can't specify a sharded pgid with a non-sharded export" << std::endl;
+        return -EINVAL;
+      }
+      // Get shard from export information if not specified
+      if (!pgid.is_no_shard() && user_pgid.is_no_shard()) {
+        user_pgid.shard = pgid.shard;
+      }
+      if (pgid.shard != user_pgid.shard) {
+        cerr << "Can't specify a different shard, must be " << pgid.shard << std::endl;
+        return -EINVAL;
+      }
+      pgid = user_pgid;
+    }
+  }
 
   if (!pgb.superblock.cluster_fsid.is_zero()
       && pgb.superblock.cluster_fsid != sb.cluster_fsid) {
     cerr << "Export came from different cluster with fsid "
          << pgb.superblock.cluster_fsid << std::endl;
-    return 1;
+    return -EINVAL;
   }
 
   if (debug) {
@@ -1777,8 +1317,11 @@ int do_import(ObjectStore *store, OSDSuperblock& sb)
       cerr << "OSD requires sharding to be enabled" << std::endl;
       cerr << std::endl;
       cerr << "If you wish to import, first do 'ceph-objectstore-tool...--op set-allow-sharded-objects'" << std::endl;
+      return -EINVAL;
     }
-    return 11;  // Assume no +EAGAIN gets to end of main() until we clean up error code handling
+    // Let them import if they specify the --force option
+    if (!force)
+        return 11;  // Positive return means exit status
   }
 
   // Don't import if pool no longer exists
@@ -1792,7 +1335,7 @@ int do_import(ObjectStore *store, OSDSuperblock& sb)
   if (!curmap.have_pg_pool(pgid.pgid.m_pool)) {
     cerr << "Pool " << pgid.pgid.m_pool << " no longer exists" << std::endl;
     // Special exit code for this error, used by test code
-    return 10;  // Assume no +ECHILD gets to end of main() until we clean up error code handling
+    return 10;  // Positive return means exit status
   }
 
   ghobject_t pgmeta_oid = pgid.make_pgmeta_oid();
@@ -1803,28 +1346,35 @@ int do_import(ObjectStore *store, OSDSuperblock& sb)
   coll_t coll(pgid);
   if (store->collection_exists(coll)) {
     cerr << "pgid " << pgid << " already exists" << std::endl;
-    return 1;
+    return -EEXIST;
   }
 
-  ObjectStore::Transaction *t = new ObjectStore::Transaction;
-  PG::_create(*t, pgid);
-  PG::_init(*t, pgid, NULL);
+  if (!dry_run) {
+    ObjectStore::Transaction *t = new ObjectStore::Transaction;
+    PG::_create(*t, pgid,
+		pgid.get_split_bits(curmap.get_pg_pool(pgid.pool())->get_pg_num()));
+    PG::_init(*t, pgid, NULL);
 
-  // mark this coll for removal until we're done
-  map<string,bufferlist> values;
-  ::encode((char)1, values["_remove"]);
-  t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
+    // mark this coll for removal until we're done
+    map<string,bufferlist> values;
+    ::encode((char)1, values["_remove"]);
+    t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
 
-  store->apply_transaction(*t);
-  delete t;
+    store->apply_transaction(&osr, *t);
+    delete t;
+  }
 
-  cout << "Importing pgid " << pgid << std::endl;
+  cout << "Importing pgid " << pgid;
+  if (orig_pgid != pgid) {
+    cout << " exported as " << orig_pgid;
+  }
+  cout << std::endl;
 
   bool done = false;
   bool found_metadata = false;
   metadata_section ms;
   while(!done) {
-    ret = read_section(file_fd, &type, &ebl);
+    ret = read_section(&type, &ebl);
     if (ret)
       return ret;
 
@@ -1835,11 +1385,11 @@ int do_import(ObjectStore *store, OSDSuperblock& sb)
     }
     switch(type) {
     case TYPE_OBJECT_BEGIN:
-      ret = get_object(store, coll, ebl, curmap);
+      ret = get_object(store, coll, ebl, curmap, &skipped_objects, osr);
       if (ret) return ret;
       break;
     case TYPE_PG_METADATA:
-      ret = get_pg_metadata(store, ebl, ms, sb, curmap);
+      ret = get_pg_metadata(store, ebl, ms, sb, curmap, pgid);
       if (ret) return ret;
       found_metadata = true;
       break;
@@ -1847,39 +1397,68 @@ int do_import(ObjectStore *store, OSDSuperblock& sb)
       done = true;
       break;
     default:
-      return EFAULT;
+      cerr << "Unknown section type " << type << std::endl;
+      return -EFAULT;
     }
   }
 
   if (!found_metadata) {
     cerr << "Missing metadata section" << std::endl;
-    return EFAULT;
-  }
+    return -EFAULT;
+  }
+
+  ObjectStore::Transaction t;
+  if (!dry_run) {
+    pg_log_t newlog, reject;
+    pg_log_t::filter_log(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace,
+      ms.log, newlog, reject);
+    if (debug) {
+      for (list<pg_log_entry_t>::iterator i = newlog.log.begin();
+           i != newlog.log.end(); ++i)
+        cerr << "Keeping log entry " << *i << std::endl;
+      for (list<pg_log_entry_t>::iterator i = reject.log.begin();
+           i != reject.log.end(); ++i)
+        cerr << "Skipping log entry " << *i << std::endl;
+    }
+
+    divergent_priors_t newdp, rejectdp;
+    filter_divergent_priors(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace,
+      ms.divergent_priors, newdp, rejectdp);
+    ms.divergent_priors = newdp;
+    if (debug) {
+      for (divergent_priors_t::iterator i = newdp.begin();
+           i != newdp.end(); ++i)
+        cerr << "Keeping divergent_prior " << *i << std::endl;
+      for (divergent_priors_t::iterator i = rejectdp.begin();
+           i != rejectdp.end(); ++i)
+        cerr << "Skipping divergent_prior " << *i << std::endl;
+    }
+
+    if (debug) {
+      pg_missing_t missing;
+      Formatter *formatter = Formatter::create("json-pretty");
+      dump_log(formatter, cerr, newlog, missing, ms.divergent_priors);
+      delete formatter;
+    }
 
-  pg_log_t newlog, reject;
-  pg_log_t::filter_log(pgid, curmap, g_ceph_context->_conf->osd_hit_set_namespace,
-    ms.log, newlog, reject);
-  if (debug) {
-    for (list<pg_log_entry_t>::iterator i = newlog.log.begin();
-         i != newlog.log.end(); ++i)
-      cerr << "Keeping log entry " << *i << std::endl;
-    for (list<pg_log_entry_t>::iterator i = reject.log.begin();
-         i != reject.log.end(); ++i)
-      cerr << "Skipping log entry " << *i << std::endl;
-  }
+    // Just like a split invalidate stats since the object count is changed
+    if (skipped_objects)
+      ms.info.stats.stats_invalid = true;
 
-  t = new ObjectStore::Transaction;
-  ret = write_pg(*t, ms.map_epoch, ms.info, newlog, ms.past_intervals);
-  if (ret) return ret;
+    ret = write_pg(t, ms.map_epoch, ms.info, newlog, ms.past_intervals, ms.divergent_priors);
+    if (ret) return ret;
+  }
 
   // done, clear removal flag
   if (debug)
     cerr << "done, clearing removal flag" << std::endl;
-  set<string> remove;
-  remove.insert("_remove");
-  t->omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove);
-  store->apply_transaction(*t);
-  delete t;
+
+  if (!dry_run) {
+    set<string> remove;
+    remove.insert("_remove");
+    t.omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove);
+    store->apply_transaction(&osr, t);
+  }
 
   return 0;
 }
@@ -1889,9 +1468,7 @@ int do_list(ObjectStore *store, string pgidstr, string object, Formatter *format
   int r;
   lookup_ghobject lookup(object);
   if (pgidstr.length() > 0) {
-    spg_t pgid;
-    pgid.parse(pgidstr.c_str());
-    r = action_on_all_objects_in_pg(store, coll_t(pgid), lookup, debug);
+    r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug);
   } else {
     r = action_on_all_objects(store, lookup, debug);
   }
@@ -1899,14 +1476,27 @@ int do_list(ObjectStore *store, string pgidstr, string object, Formatter *format
     return r;
   lookup.dump(formatter, human_readable);
   formatter->flush(cout);
-  cout << std::endl;
   return 0;
 }
 
-int do_remove_object(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
+int do_meta(ObjectStore *store, string object, Formatter *formatter, bool debug, bool human_readable)
+{
+  int r;
+  lookup_ghobject lookup(object);
+  r = action_on_all_objects_in_exact_pg(store, coll_t::meta(), lookup, debug);
+  if (r)
+    return r;
+  lookup.dump(formatter, human_readable);
+  formatter->flush(cout);
+  return 0;
+}
+
+int do_remove_object(ObjectStore *store, coll_t coll,
+		     ghobject_t &ghobj,
+		     ObjectStore::Sequencer &osr)
 {
   spg_t pg;
-  coll.is_pg_prefix(pg);
+  coll.is_pg_prefix(&pg);
   OSDriver driver(
     store,
     coll_t(),
@@ -1916,22 +1506,24 @@ int do_remove_object(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
 
   int r = store->stat(coll, ghobj, &st);
   if (r < 0) {
-    cerr << "remove: " << cpp_strerror(-r) << std::endl;
+    cerr << "remove: " << cpp_strerror(r) << std::endl;
     return r;
   }
 
+  cout << "remove " << ghobj << std::endl;
+  if (dry_run)
+    return 0;
   ObjectStore::Transaction *t = new ObjectStore::Transaction;
   OSDriver::OSTransaction _t(driver.get_transaction(t));
-  cout << "remove " << ghobj << std::endl;
   r = mapper.remove_oid(ghobj.hobj, &_t);
-  if (r != 0 && r != -ENOENT) {
-    cerr << "remove_oid returned " << cpp_strerror(-r) << std::endl;
+  if (r < 0 && r != -ENOENT) {
+    cerr << "remove_oid returned " << cpp_strerror(r) << std::endl;
     return r;
   }
 
   t->remove(coll, ghobj);
 
-  store->apply_transaction(*t);
+  store->apply_transaction(&osr, *t);
   delete t;
   return 0;
 }
@@ -1941,7 +1533,7 @@ int do_list_attrs(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
   map<string,bufferptr> aset;
   int r = store->getattrs(coll, ghobj, aset);
   if (r < 0) {
-    cerr << "getattrs: " << cpp_strerror(-r) << std::endl;
+    cerr << "getattrs: " << cpp_strerror(r) << std::endl;
     return r;
   }
 
@@ -1983,8 +1575,8 @@ int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
 
   int ret = store->stat(coll, ghobj, &st);
   if (ret < 0) {
-    cerr << "get-bytes: " << cpp_strerror(-ret) << std::endl;
-    return 1;
+    cerr << "get-bytes: " << cpp_strerror(ret) << std::endl;
+    return ret;
   }
 
   total = st.st_size;
@@ -2014,14 +1606,16 @@ int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
     ret = write(fd, rawdatabl.c_str(), ret);
     if (ret == -1) {
       perror("write");
-      return 1;
+      return -errno;
     }
   }
 
   return 0;
 }
 
-int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
+int do_set_bytes(ObjectStore *store, coll_t coll,
+		 ghobject_t &ghobj, int fd,
+		 ObjectStore::Sequencer &osr)
 {
   ObjectStore::Transaction tran;
   ObjectStore::Transaction *t = &tran;
@@ -2029,8 +1623,10 @@ int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
   if (debug)
     cerr << "Write " << ghobj << std::endl;
 
-  t->touch(coll, ghobj);
-  t->truncate(coll, ghobj, 0);
+  if (!dry_run) {
+    t->touch(coll, ghobj);
+    t->truncate(coll, ghobj, 0);
+  }
 
   uint64_t offset = 0;
   bufferlist rawdatabl;
@@ -2038,8 +1634,8 @@ int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
     rawdatabl.clear();
     ssize_t bytes = rawdatabl.read_fd(fd, max_read);
     if (bytes < 0) {
-      cerr << "read_fd error " << cpp_strerror(-bytes) << std::endl;
-      return 1;
+      cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+      return bytes;
     }
 
     if (bytes == 0)
@@ -2047,13 +1643,15 @@ int do_set_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
 
     if (debug)
       cerr << "\tdata: offset " << offset << " bytes " << bytes << std::endl;
-    t->write(coll, ghobj, offset, bytes,  rawdatabl);
+    if (!dry_run)
+      t->write(coll, ghobj, offset, bytes,  rawdatabl);
 
     offset += bytes;
     // XXX: Should we apply_transaction() every once in a while for very large files
   } while(true);
 
-  store->apply_transaction(*t);
+  if (!dry_run)
+    store->apply_transaction(&osr, *t);
   return 0;
 }
 
@@ -2063,7 +1661,7 @@ int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
 
   int r = store->getattr(coll, ghobj, key.c_str(), bp);
   if (r < 0) {
-    cerr << "getattr: " << cpp_strerror(-r) << std::endl;
+    cerr << "getattr: " << cpp_strerror(r) << std::endl;
     return r;
   }
 
@@ -2077,7 +1675,9 @@ int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
   return 0;
 }
 
-int do_set_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, int fd)
+int do_set_attr(ObjectStore *store, coll_t coll,
+		ghobject_t &ghobj, string key, int fd,
+		ObjectStore::Sequencer &osr)
 {
   ObjectStore::Transaction tran;
   ObjectStore::Transaction *t = &tran;
@@ -2086,18 +1686,21 @@ int do_set_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key,
   if (debug)
     cerr << "Setattr " << ghobj << std::endl;
 
-  if (get_fd_data(fd, bl))
-    return 1;
+  int ret = get_fd_data(fd, bl);
+  if (ret < 0)
+    return ret;
 
   t->touch(coll, ghobj);
 
   t->setattr(coll, ghobj, key,  bl);
 
-  store->apply_transaction(*t);
+  store->apply_transaction(&osr, *t);
   return 0;
 }
 
-int do_rm_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
+int do_rm_attr(ObjectStore *store, coll_t coll,
+	       ghobject_t &ghobj, string key,
+	       ObjectStore::Sequencer &osr)
 {
   ObjectStore::Transaction tran;
   ObjectStore::Transaction *t = &tran;
@@ -2107,7 +1710,7 @@ int do_rm_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
 
   t->rmattr(coll, ghobj, key);
 
-  store->apply_transaction(*t);
+  store->apply_transaction(&osr, *t);
   return 0;
 }
 
@@ -2120,7 +1723,7 @@ int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
 
   int r = store->omap_get_values(coll, ghobj, keys, &out);
   if (r < 0) {
-    cerr << "omap_get_values: " << cpp_strerror(-r) << std::endl;
+    cerr << "omap_get_values: " << cpp_strerror(r) << std::endl;
     return r;
   }
 
@@ -2142,7 +1745,9 @@ int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
   return 0;
 }
 
-int do_set_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key, int fd)
+int do_set_omap(ObjectStore *store, coll_t coll,
+		ghobject_t &ghobj, string key, int fd,
+		ObjectStore::Sequencer &osr)
 {
   ObjectStore::Transaction tran;
   ObjectStore::Transaction *t = &tran;
@@ -2152,8 +1757,9 @@ int do_set_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key,
   if (debug)
     cerr << "Set_omap " << ghobj << std::endl;
 
-  if (get_fd_data(fd, valbl))
-    return 1;
+  int ret = get_fd_data(fd, valbl);
+  if (ret < 0)
+    return ret;
 
   attrset.insert(pair<string, bufferlist>(key, valbl));
 
@@ -2161,11 +1767,13 @@ int do_set_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key,
 
   t->omap_setkeys(coll, ghobj, attrset);
 
-  store->apply_transaction(*t);
+  store->apply_transaction(&osr, *t);
   return 0;
 }
 
-int do_rm_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
+int do_rm_omap(ObjectStore *store, coll_t coll,
+	       ghobject_t &ghobj, string key,
+	       ObjectStore::Sequencer &osr)
 {
   ObjectStore::Transaction tran;
   ObjectStore::Transaction *t = &tran;
@@ -2178,7 +1786,7 @@ int do_rm_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
 
   t->omap_rmkeys(coll, ghobj, keys);
 
-  store->apply_transaction(*t);
+  store->apply_transaction(&osr, *t);
   return 0;
 }
 
@@ -2188,7 +1796,7 @@ int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
 
   int r = store->omap_get_header(coll, ghobj, &hdrbl, true);
   if (r < 0) {
-    cerr << "omap_get_header: " << cpp_strerror(-r) << std::endl;
+    cerr << "omap_get_header: " << cpp_strerror(r) << std::endl;
     return r;
   }
 
@@ -2202,7 +1810,9 @@ int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
   return 0;
 }
 
-int do_set_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
+int do_set_omaphdr(ObjectStore *store, coll_t coll,
+		   ghobject_t &ghobj, int fd,
+		   ObjectStore::Sequencer &osr)
 {
   ObjectStore::Transaction tran;
   ObjectStore::Transaction *t = &tran;
@@ -2211,35 +1821,38 @@ int do_set_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
   if (debug)
     cerr << "Omap_setheader " << ghobj << std::endl;
 
-  if (get_fd_data(fd, hdrbl))
-    return 1;
+  int ret = get_fd_data(fd, hdrbl);
+  if (ret)
+    return ret;
 
   t->touch(coll, ghobj);
 
   t->omap_setheader(coll, ghobj, hdrbl);
 
-  store->apply_transaction(*t);
+  store->apply_transaction(&osr, *t);
   return 0;
 }
 
-struct do_list_lost : public action_on_object_t {
-  virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) {
-    if (oi.is_lost())
-      cout << coll << "/" << ghobj << " is lost" << std::endl;
-    return 0;
-  }
-};
-
 struct do_fix_lost : public action_on_object_t {
-  virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) {
+  ObjectStore::Sequencer *osr;
+
+  do_fix_lost(ObjectStore::Sequencer *_osr) : osr(_osr) {}
+
+  virtual int call(ObjectStore *store, coll_t coll,
+		   ghobject_t &ghobj, object_info_t &oi) {
     if (oi.is_lost()) {
-      cout << coll << "/" << ghobj << " is lost, fixing" << std::endl;
+      cout << coll << "/" << ghobj << " is lost";
+      if (!dry_run)
+        cout << ", fixing";
+      cout << std::endl;
+      if (dry_run)
+        return 0;
       oi.clear_flag(object_info_t::FLAG_LOST);
       bufferlist bl;
       ::encode(oi, bl);
       ObjectStore::Transaction t;
       t.setattr(coll, ghobj, OI_ATTR, bl);
-      int r = store->apply_transaction(t);
+      int r = store->apply_transaction(osr, t);
       if (r < 0) {
 	cerr << "Error getting fixing attr on : " << make_pair(coll, ghobj)
 	     << ", "
@@ -2251,6 +1864,33 @@ struct do_fix_lost : public action_on_object_t {
   }
 };
 
+int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter)
+{
+  bufferlist attr;
+  int r = store->getattr(coll, ghobj, OI_ATTR, attr);
+  if (r < 0) {
+    cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+       << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  object_info_t oi;
+  bufferlist::iterator bp = attr.begin();
+  try {
+    ::decode(oi, bp);
+  } catch (...) {
+    r = -EINVAL;
+    cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+         << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  formatter->open_object_section("info");
+  oi.dump(formatter);
+  formatter->close_section();
+  formatter->flush(cout);
+  cout << std::endl;
+  return 0;
+}
+
 void usage(po::options_description &desc)
 {
     cerr << std::endl;
@@ -2266,17 +1906,17 @@ void usage(po::options_description &desc)
     cerr << "ceph-objectstore-tool ... <object> list-attrs" << std::endl;
     cerr << "ceph-objectstore-tool ... <object> list-omap" << std::endl;
     cerr << "ceph-objectstore-tool ... <object> remove" << std::endl;
-    cerr << std::endl;
-    cerr << "ceph-objectstore-tool import-rados <pool> [file]" << std::endl;
+    cerr << "ceph-objectstore-tool ... <object> dump-info" << std::endl;
     cerr << std::endl;
     cerr << "<object> can be a JSON object description as displayed" << std::endl;
     cerr << "by --op list." << std::endl;
     cerr << "<object> can be an object name which will be looked up in all" << std::endl;
     cerr << "the OSD's PGs." << std::endl;
+    cerr << "<object> can be the empty string ('') which with a provided pgid " << std::endl;
+    cerr << "specifies the pgmeta object" << std::endl;
     cerr << std::endl;
     cerr << "The optional [file] argument will read stdin or write stdout" << std::endl;
     cerr << "if not specified or if '-' specified." << std::endl;
-    exit(1);
 }
 
 bool ends_with(const string& check, const string& ending)
@@ -2284,12 +1924,28 @@ bool ends_with(const string& check, const string& ending)
     return check.size() >= ending.size() && check.rfind(ending) == (check.size() - ending.size());
 }
 
+// Based on FileStore::dump_journal(), set-up enough to only dump
+int mydump_journal(Formatter *f, string journalpath, bool m_journal_dio)
+{
+  int r;
+
+  if (!journalpath.length())
+    return -EINVAL;
+
+  FileJournal *journal = new FileJournal(uuid_d(), NULL, NULL, journalpath.c_str(), m_journal_dio);
+  r = journal->_fdump(*f, false);
+  delete journal;
+  return r;
+}
+
 int main(int argc, char **argv)
 {
   string dpath, jpath, pgidstr, op, file, object, objcmd, arg1, arg2, type, format;
   spg_t pgid;
+  unsigned epoch = 0;
   ghobject_t ghobj;
   bool human_readable;
+  bool force;
   Formatter *formatter;
 
   po::options_description desc("Allowed options");
@@ -2302,21 +1958,26 @@ int main(int argc, char **argv)
     ("journal-path", po::value<string>(&jpath),
      "path to journal, mandatory for filestore type")
     ("pgid", po::value<string>(&pgidstr),
-     "PG id, mandatory except for import, list-lost, fix-lost, list-pgs, set-allow-sharded-objects")
+     "PG id, mandatory for info, log, remove, export, rm-past-intervals, mark-complete")
     ("op", po::value<string>(&op),
-     "Arg is one of [info, log, remove, export, import, list, list-lost, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects]")
+     "Arg is one of [info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, "
+	 "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete]")
+    ("epoch", po::value<unsigned>(&epoch),
+     "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
     ("file", po::value<string>(&file),
-     "path of file to export or import")
+     "path of file to export, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
     ("format", po::value<string>(&format)->default_value("json-pretty"),
      "Output format which may be json, json-pretty, xml, xml-pretty")
     ("debug", "Enable diagnostic output to stderr")
+    ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE")
     ("skip-journal-replay", "Disable journal replay")
     ("skip-mount-omap", "Disable mounting of omap")
+    ("dry-run", "Don't modify the objectstore")
     ;
 
   po::options_description positional("Positional options");
   positional.add_options()
-    ("object", po::value<string>(&object), "object name or ghobject in json")
+    ("object", po::value<string>(&object), "'' for pgmeta_oid, object name or ghobject in json")
     ("objcmd", po::value<string>(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]")
     ("arg1", po::value<string>(&arg1), "arg1 based on cmd")
     ("arg2", po::value<string>(&arg2), "arg2 based on cmd")
@@ -2340,11 +2001,12 @@ int main(int argc, char **argv)
 						   po::include_positional);
   } catch(po::error &e) {
     std::cerr << e.what() << std::endl;
-    return 1;
+    myexit(1);
   }
 
   if (vm.count("help")) {
     usage(desc);
+    myexit(1);
   }
 
   if (!vm.count("debug")) {
@@ -2353,6 +2015,20 @@ int main(int argc, char **argv)
     debug = true;
   }
 
+  if (!vm.count("force")) {
+    force = false;
+  } else {
+    force = true;
+  }
+
+  if (vm.count("dry-run"))
+    dry_run = true;
+  osflagbits_t flags = 0;
+  if (dry_run || vm.count("skip-journal-replay"))
+    flags |= SKIP_JOURNAL_REPLAY;
+  if (vm.count("skip-mount-omap"))
+    flags |= SKIP_MOUNT_OMAP;
+
   vector<const char *> ceph_options;
   env_to_vec(ceph_options);
   ceph_options.reserve(ceph_options.size() + ceph_option_strings.size());
@@ -2362,86 +2038,53 @@ int main(int argc, char **argv)
     ceph_options.push_back(i->c_str());
   }
 
-  // Handle completely different operation "import-rados"
-  if (object == "import-rados") {
-    if (vm.count("objcmd") == 0) {
-      cerr << "ceph-objectstore-tool import-rados <pool> [file]" << std::endl;
-      exit(1);
-    }
-
-    string pool = objcmd;
-    // positional argument takes precendence, but accept
-    // --file option too
-    if (!vm.count("arg1")) {
-      if (!vm.count("file"))
-        arg1 = "-";
-      else
-        arg1 = file;
-    }
-    if (arg1 == "-") {
-      if (isatty(STDIN_FILENO)) {
-        cerr << "stdin is a tty and no file specified" << std::endl;
-        exit(1);
-      }
-      file_fd = STDIN_FILENO;
-    } else {
-      file_fd = open(arg1.c_str(), O_RDONLY);
-      if (file_fd < 0) {
-        perror("open");
-        return 1;
-      }
-    }
-
-    global_init(NULL, ceph_options, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
-    common_init_finish(g_ceph_context);
-
-    int ret = do_import_rados(pool);
-    if (ret == 0)
-      cout << "Import successful" << std::endl;
-    return ret != 0;
+  if (!vm.count("type")) {
+    type = "filestore";
   }
-
-  if (!vm.count("data-path")) {
+  if (!vm.count("data-path") &&
+     !(op == "dump-journal" && type == "filestore")) {
     cerr << "Must provide --data-path" << std::endl;
     usage(desc);
-  }
-  if (!vm.count("type")) {
-    type = "filestore";
+    myexit(1);
   }
   if (type == "filestore" && !vm.count("journal-path")) {
     cerr << "Must provide --journal-path" << std::endl;
     usage(desc);
+    myexit(1);
   }
-  if (op != "list" && vm.count("object") && !vm.count("objcmd")) {
-    cerr << "Invalid syntax, missing command" << std::endl;
-    usage(desc);
-  }
-  if (!vm.count("op") && !(vm.count("object") && vm.count("objcmd"))) {
+  if (!vm.count("op") && !vm.count("object")) {
     cerr << "Must provide --op or object command..." << std::endl;
     usage(desc);
+    myexit(1);
   }
   if (op != "list" && vm.count("op") && vm.count("object")) {
     cerr << "Can't specify both --op and object command syntax" << std::endl;
     usage(desc);
+    myexit(1);
+  }
+  if (op != "list" && vm.count("object") && !vm.count("objcmd")) {
+    cerr << "Invalid syntax, missing command" << std::endl;
+    usage(desc);
+    myexit(1);
   }
   outistty = isatty(STDOUT_FILENO);
 
   file_fd = fd_none;
-  if (op == "export") {
+  if ((op == "export" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
     if (!vm.count("file") || file == "-") {
       if (outistty) {
         cerr << "stdout is a tty and no --file filename specified" << std::endl;
-        exit(1);
+        myexit(1);
       }
       file_fd = STDOUT_FILENO;
     } else {
       file_fd = open(file.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
     }
-  } else if (op == "import") {
+  } else if (op == "import" || op == "set-osdmap" || op == "set-inc-osdmap") {
     if (!vm.count("file") || file == "-") {
       if (isatty(STDIN_FILENO)) {
         cerr << "stdin is a tty and no --file filename specified" << std::endl;
-        exit(1);
+        myexit(1);
       }
       file_fd = STDIN_FILENO;
     } else {
@@ -2449,27 +2092,19 @@ int main(int argc, char **argv)
     }
   }
 
-  if (vm.count("file") && file_fd == fd_none) {
-    cerr << "--file option only applies to import or export" << std::endl;
-    return 1;
+  ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run);
+
+  if (vm.count("file") && file_fd == fd_none && !dry_run) {
+    cerr << "--file option only applies to import, export, "
+	 << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl;
+    myexit(1);
   }
 
   if (file_fd != fd_none && file_fd < 0) {
     perror("open");
-    return 1;
-  }
-
-  if (dpath.length() == 0) {
-    cerr << "Invalid params" << std::endl;
-    return 1;
+    myexit(1);
   }
 
-  osflagbits_t flags = 0;
-  if (vm.count("skip-journal-replay"))
-    flags |= SKIP_JOURNAL_REPLAY;
-  if (vm.count("skip-mount-omap"))
-    flags |= SKIP_MOUNT_OMAP;
-
   global_init(
     NULL, ceph_options, CEPH_ENTITY_TYPE_OSD,
     CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
@@ -2482,11 +2117,32 @@ int main(int argc, char **argv)
   }
   g_conf->apply_changes(NULL);
 
+  // Special list handling.  Treating pretty_format as human readable,
+  // with one object per line and not an enclosing array.
+  human_readable = ends_with(format, "-pretty");
+  if ((op == "list" || op == "meta-list") && human_readable) {
+    // Remove -pretty from end of format which we know is there
+    format = format.substr(0, format.size() - strlen("-pretty"));
+  }
+
+  formatter = Formatter::create(format);
+  if (formatter == NULL) {
+    cerr << "unrecognized format: " << format << std::endl;
+    myexit(1);
+  }
+
+  // Special handling for filestore journal, so we can dump it without mounting
+  if (op == "dump-journal" && type == "filestore") {
+    int ret = mydump_journal(formatter, jpath, g_conf->journal_dio);
+    formatter->flush(cout);
+    myexit(ret != 0);
+  }
+
   //Verify that data-path really exists
   struct stat st;
   if (::stat(dpath.c_str(), &st) == -1) {
      perror("data-path");
-     exit(1);
+     myexit(1);
   }
   //Verify data data-path really is a filestore
   if (type == "filestore") {
@@ -2511,30 +2167,35 @@ int main(int argc, char **argv)
     }
   }
 
-  if (op == "import" && pgidstr.length()) {
-    cerr << "--pgid option invalid with import" << std::endl;
-    return 1;
+  if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) {
+    cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
+    myexit(1);
   }
 
   ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
   if (fs == NULL) {
     cerr << "Must provide --type (filestore, memstore, keyvaluestore)" << std::endl;
-    exit(1);
+    if (type == "keyvaluestore") {
+      cerr << "Add \"keyvaluestore\" to "
+           << "enable_experimental_unrecoverable_data_corrupting_features"
+           << std::endl;
+    }
+    myexit(1);
   }
 
-  int r = fs->mount();
-  if (r < 0) {
-    if (r == -EBUSY) {
+  ObjectStore::Sequencer *osr = new ObjectStore::Sequencer(__func__);
+  int ret = fs->mount();
+  if (ret < 0) {
+    if (ret == -EBUSY) {
       cerr << "OSD has the store locked" << std::endl;
     } else {
-      cerr << "Mount failed with '" << cpp_strerror(-r) << "'" << std::endl;
+      cerr << "Mount failed with '" << cpp_strerror(ret) << "'" << std::endl;
     }
-    return 1;
+    myexit(1);
   }
 
   bool fs_sharded_objects = fs->get_allow_sharded_objects();
 
-  int ret = 0;
   vector<coll_t> ls;
   vector<coll_t>::iterator it;
   CompatSet supported;
@@ -2548,9 +2209,9 @@ int main(int argc, char **argv)
   bufferlist bl;
   OSDSuperblock superblock;
   bufferlist::iterator p;
-  r = fs->read(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
-  if (r < 0) {
-    cerr << "Failure to read OSD superblock error= " << r << std::endl;
+  ret = fs->read(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
+  if (ret < 0) {
+    cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl;
     goto out;
   }
 
@@ -2576,16 +2237,16 @@ int main(int argc, char **argv)
     CompatSet unsupported = supported.unsupported(superblock.compat_features);
     cerr << "On-disk OSD incompatible features set "
       << unsupported << std::endl;
-    ret = EINVAL;
+    ret = -EINVAL;
     goto out;
   }
 
-  if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) {
-    cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
-    return 1;
-  }
-
   if (op != "list" && vm.count("object")) {
+    // Special case: Create pgmeta_oid if empty string specified
+    // This can't conflict with any actual object names.
+    if (object == "") {
+      ghobj = pgid.make_pgmeta_oid();
+    } else {
     json_spirit::Value v;
     try {
       if (!json_spirit::read(object, v)) {
@@ -2596,10 +2257,10 @@ int main(int argc, char **argv)
 	  if (lookup.size() != 1) {
 	    stringstream ss;
 	    if (lookup.size() == 0)
-	      ss << objcmd << ": " << cpp_strerror(ENOENT);
+	      ss << "No object id '" << object << "' found";
 	    else
-	      ss << "expected a single object named '" << object
-		 << "' but got " << lookup.size() << " instead";
+	      ss << "Found " << lookup.size() << " objects with id '" << object
+		 << "', please use a JSON spec from --op list instead";
 	    throw std::runtime_error(ss.str());
 	  }
 	  pair<coll_t, ghobject_t> found = lookup.pop();
@@ -2625,19 +2286,23 @@ int main(int argc, char **argv)
 	    throw std::runtime_error(ss.str());
 	  }
 	  string object_pgidstr = i->get_str();
-	  spg_t object_pgid;
-	  object_pgid.parse(object_pgidstr.c_str());
-	  if (pgidstr.length() > 0) {
-	    if (object_pgid != pgid) {
-	      ss << "object '" << object
-		 << "' has a pgid different from the --pgid="
-		 << pgidstr << " option";
-	      throw std::runtime_error(ss.str());
+          if (object_pgidstr != "meta") {
+	    spg_t object_pgid;
+	    object_pgid.parse(object_pgidstr.c_str());
+	    if (pgidstr.length() > 0) {
+	      if (object_pgid != pgid) {
+	        ss << "object '" << object
+		   << "' has a pgid different from the --pgid="
+		   << pgidstr << " option";
+	        throw std::runtime_error(ss.str());
+	      }
+	    } else {
+	      pgidstr = object_pgidstr;
+	      pgid = object_pgid;
 	    }
-	  } else {
-	    pgidstr = object_pgidstr;
-	    pgid = object_pgid;
-	  }
+          } else {
+            pgidstr = object_pgidstr;
+          }
 	  ++i;
 	  v = *i;
 	}
@@ -2647,7 +2312,7 @@ int main(int argc, char **argv)
 	  ss << "Decode object json error: " << e.what();
 	  throw std::runtime_error(ss.str());
 	}
-        if ((uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) {
+        if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) {
           cerr << "Object pool and pgid pool don't match" << std::endl;
           ret = 1;
           goto out;
@@ -2658,13 +2323,18 @@ int main(int argc, char **argv)
       ret = 1;
       goto out;
     }
+    }
   }
 
-  if (op != "list" && op != "import" && op != "list-lost" && op != "fix-lost"
-      && op != "list-pgs"  && op != "set-allow-sharded-objects" &&
-      (pgidstr.length() == 0)) {
+  // The ops which require --pgid option are checked here and
+  // mentioned in the usage for --pgid.
+  if ((op == "info" || op == "log" || op == "remove" || op == "export"
+      || op == "rm-past-intervals" || op == "mark-complete") &&
+      pgidstr.length() == 0) {
     cerr << "Must provide pgid" << std::endl;
     usage(desc);
+    ret = 1;
+    goto out;
   }
 
   if (op == "set-allow-sharded-objects") {
@@ -2721,20 +2391,20 @@ int main(int argc, char **argv)
         goto out;
     }
 
-    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
-    ObjectStore::Transaction t;
-    bl.clear();
-    ::encode(superblock, bl);
-    t.write(META_COLL, OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
-    r = fs->apply_transaction(t);
-    if (r < 0) {
-      cerr << "Error writing OSD superblock: " << cpp_strerror(r) << std::endl;
-      ret = 1;
-      goto out;
-    }
-
-    fs->set_allow_sharded_objects();
+    if (!dry_run) {
+      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+      ObjectStore::Transaction t;
+      bl.clear();
+      ::encode(superblock, bl);
+      t.write(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, bl.length(), bl);
+      ret = fs->apply_transaction(osr, t);
+      if (ret < 0) {
+        cerr << "Error writing OSD superblock: " << cpp_strerror(ret) << std::endl;
+        goto out;
+      }
 
+      fs->set_allow_sharded_objects();
+    }
     cout << "Enabled on-disk sharded objects" << std::endl;
 
     ret = 0;
@@ -2753,84 +2423,147 @@ int main(int argc, char **argv)
       cerr << "Found incomplete transition to sharded objects" << std::endl;
     cerr << std::endl;
     cerr << "Use --op set-allow-sharded-objects to repair" << std::endl;
-    ret = EINVAL;
+    ret = -EINVAL;
     goto out;
   }
 
   if (op == "import") {
 
     try {
-      ret = do_import(fs, superblock);
+      ret = tool.do_import(fs, superblock, force, pgidstr, *osr);
     }
     catch (const buffer::error &e) {
       cerr << "do_import threw exception error " << e.what() << std::endl;
-      ret = EFAULT;
+      ret = -EFAULT;
     }
-    if (ret == EFAULT) {
+    if (ret == -EFAULT) {
       cerr << "Corrupt input for import" << std::endl;
     }
     if (ret == 0)
       cout << "Import successful" << std::endl;
     goto out;
+  } else if (op == "dump-journal-mount") {
+    // Undocumented feature to dump journal with mounted fs
+    // This doesn't support the format option, but it uses the
+    // ObjectStore::dump_journal() and mounts to get replay to run.
+    ret = fs->dump_journal(cout);
+    if (ret) {
+      if (ret == -EOPNOTSUPP) {
+        cerr << "Object store type \"" << type << "\" doesn't support journal dump" << std::endl;
+      } else {
+        cerr << "Journal dump failed with error " << cpp_strerror(ret) << std::endl;
+      }
+    }
+    goto out;
+  } else if (op == "get-osdmap") {
+    bufferlist bl;
+    OSDMap osdmap;
+    if (epoch == 0) {
+      epoch = superblock.current_epoch;
+    }
+    ret = get_osdmap(fs, epoch, osdmap, bl);
+    if (ret) {
+      cerr << "Failed to get osdmap#" << epoch << ": "
+	   << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
+    ret = bl.write_fd(file_fd);
+    if (ret) {
+      cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+    } else {
+      cout << "osdmap#" << epoch << " exported." << std::endl;
+    }
+    goto out;
+  } else if (op == "set-osdmap") {
+    bufferlist bl;
+    ret = get_fd_data(file_fd, bl);
+    if (ret < 0) {
+      cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl;
+    } else {
+      ret = set_osdmap(fs, epoch, bl, force, *osr);
+    }
+    goto out;
+  } else if (op == "get-inc-osdmap") {
+    bufferlist bl;
+    if (epoch == 0) {
+      epoch = superblock.current_epoch;
+    }
+    ret = get_inc_osdmap(fs, epoch, bl);
+    if (ret < 0) {
+      cerr << "Failed to get incremental osdmap# " << epoch << ": "
+	   << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
+    ret = bl.write_fd(file_fd);
+    if (ret) {
+      cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+    } else {
+      cout << "inc-osdmap#" << epoch << " exported." << std::endl;
+    }
+    goto out;
+  } else if (op == "set-inc-osdmap") {
+    bufferlist bl;
+    ret = get_fd_data(file_fd, bl);
+    if (ret < 0) {
+      cerr << "Failed to read incremental osdmap  " << cpp_strerror(ret) << std::endl;
+      goto out;
+    } else {
+      ret = set_inc_osdmap(fs, epoch, bl, force, *osr);
+    }
+    goto out;
   }
 
   log_oid = OSD::make_pg_log_oid(pgid);
   biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
 
   if (op == "remove") {
-    finish_remove_pgs(fs);
-    int r = initiate_new_remove_pg(fs, pgid);
-    if (r) {
+    ret = initiate_new_remove_pg(fs, pgid, *osr);
+    if (ret < 0) {
       cerr << "PG '" << pgid << "' not found" << std::endl;
-      ret = 1;
       goto out;
     }
-    finish_remove_pgs(fs);
     cout << "Remove successful" << std::endl;
     goto out;
   }
 
-  if (op == "list-lost" || op == "fix-lost") {
+  if (op == "fix-lost") {
     boost::scoped_ptr<action_on_object_t> action;
-    if (op == "list-lost")
-      action.reset(new do_list_lost());
-    if (op == "fix-lost")
-      action.reset(new do_fix_lost());
+    action.reset(new do_fix_lost(osr));
     if (pgidstr.length())
-      ret = action_on_all_objects_in_pg(fs, coll_t(pgid), *action, debug);
+      ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), *action, debug);
     else
       ret = action_on_all_objects(fs, *action, debug);
     goto out;
   }
 
-  // Special list handling.  Treating pretty_format as human readable,
-  // with one object per line and not an enclosing array.
-  human_readable = ends_with(format, "-pretty");
-  if (op == "list" && human_readable) {
-    // Remove -pretty from end of format which we know is there
-    format = format.substr(0, format.size() - strlen("-pretty"));
+  if (op == "list") {
+    ret = do_list(fs, pgidstr, object, formatter, debug, human_readable);
+    if (ret < 0) {
+      cerr << "do_list failed: " << cpp_strerror(ret) << std::endl;
+    }
+    goto out;
   }
 
-  formatter = Formatter::create(format);
-  if (formatter == NULL) {
-    cerr << "unrecognized format: " << format << std::endl;
-    ret = 1;
+  if (op == "dump-super") {
+    formatter->open_object_section("superblock");
+    superblock.dump(formatter);
+    formatter->close_section();
+    formatter->flush(cout);
+    cout << std::endl;
     goto out;
   }
 
-  if (op == "list") {
-    r = do_list(fs, pgidstr, object, formatter, debug, human_readable);
-    if (r) {
-      cerr << "do_list failed with " << r << std::endl;
-      ret = 1;
+  if (op == "meta-list") {
+    ret = do_meta(fs, object, formatter, debug, human_readable);
+    if (ret < 0) {
+      cerr << "do_meta failed: " << cpp_strerror(ret) << std::endl;
     }
     goto out;
   }
 
-  r = fs->list_collections(ls);
-  if (r < 0) {
-    cerr << "failed to list pgs: " << cpp_strerror(-r) << std::endl;
-    ret = 1;
+  ret = fs->list_collections(ls);
+  if (ret < 0) {
+    cerr << "failed to list pgs: " << cpp_strerror(ret) << std::endl;
     goto out;
   }
 
@@ -2839,23 +2572,24 @@ int main(int argc, char **argv)
 
   // Find pg
   for (it = ls.begin(); it != ls.end(); ++it) {
-    snapid_t snap;
     spg_t tmppgid;
 
-    if (!it->is_pg(tmppgid, snap)) {
-      continue;
+    if (pgidstr == "meta") {
+      if (it->to_str() == "meta")
+        break;
+      else
+        continue;
     }
 
-    if (it->is_temp(tmppgid)) {
+    if (!it->is_pg(&tmppgid)) {
       continue;
     }
 
-    if (op != "list-pgs" && tmppgid != pgid) {
+    if (it->is_temp(&tmppgid)) {
       continue;
     }
-    if (snap != CEPH_NOSNAP && debug) {
-      cout << "skipping snapped dir " << *it
-	       << " (pg " << pgid << " snap " << snap << ")" << std::endl;
+
+    if (op != "list-pgs" && tmppgid != pgid) {
       continue;
     }
 
@@ -2872,6 +2606,16 @@ int main(int argc, char **argv)
     goto out;
   }
 
+  // If not an object command nor any of the ops handled below, then output this usage
+  // before complaining about a bad pgid
+  if (!vm.count("objcmd") && op != "export" && op != "info" && op != "log" && op != "rm-past-intervals" && op != "mark-complete") {
+    cerr << "Must provide --op (info, log, remove, export, import, list, fix-lost, list-pgs, rm-past-intervals, set-allow-sharded-objects, dump-journal, dump-super, meta-list, "
+      "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete)"
+	 << std::endl;
+    usage(desc);
+    ret = 1;
+    goto out;
+  }
   epoch_t map_epoch;
 // The following code for export, info, log require omap or !skip-mount-omap
   if (it != ls.end()) {
@@ -2881,25 +2625,15 @@ int main(int argc, char **argv)
     if (vm.count("objcmd")) {
       ret = 0;
       if (objcmd == "remove") {
-        int r = do_remove_object(fs, coll, ghobj);
-        if (r) {
-          ret = 1;
-        }
+        ret = do_remove_object(fs, coll, ghobj, *osr);
         goto out;
       } else if (objcmd == "list-attrs") {
-        int r = do_list_attrs(fs, coll, ghobj);
-        if (r) {
-          ret = 1;
-        }
+        ret = do_list_attrs(fs, coll, ghobj);
         goto out;
       } else if (objcmd == "list-omap") {
-        int r = do_list_omap(fs, coll, ghobj);
-        if (r) {
-          ret = 1;
-        }
+        ret = do_list_omap(fs, coll, ghobj);
         goto out;
       } else if (objcmd == "get-bytes" || objcmd == "set-bytes") {
-        int r;
         if (objcmd == "get-bytes") {
           int fd;
           if (vm.count("arg1") == 0 || arg1 == "-") {
@@ -2912,7 +2646,7 @@ int main(int argc, char **argv)
               goto out;
             }
           }
-          r = do_get_bytes(fs, coll, ghobj, fd);
+          ret = do_get_bytes(fs, coll, ghobj, fd);
           if (fd != STDOUT_FILENO)
             close(fd);
         } else {
@@ -2933,23 +2667,24 @@ int main(int argc, char **argv)
               goto out;
             }
           }
-          r = do_set_bytes(fs, coll, ghobj, fd);
+          ret = do_set_bytes(fs, coll, ghobj, fd, *osr);
           if (fd != STDIN_FILENO)
             close(fd);
         }
-        if (r)
-          ret = 1;
         goto out;
       } else if (objcmd == "get-attr") {
-	if (vm.count("arg1") == 0)
+	if (vm.count("arg1") == 0) {
 	  usage(desc);
-	r = do_get_attr(fs, coll, ghobj, arg1);
-	if (r)
-	  ret = 1;
+          ret = 1;
+          goto out;
+        }
+	ret = do_get_attr(fs, coll, ghobj, arg1);
         goto out;
       } else if (objcmd == "set-attr") {
-	if (vm.count("arg1") == 0)
+	if (vm.count("arg1") == 0) {
 	  usage(desc);
+          ret = 1;
+        }
 
 	int fd;
 	if (vm.count("arg2") == 0 || arg2 == "-") {
@@ -2968,30 +2703,32 @@ int main(int argc, char **argv)
 	    goto out;
 	  }
 	}
-	r = do_set_attr(fs, coll, ghobj, arg1, fd);
+	ret = do_set_attr(fs, coll, ghobj, arg1, fd, *osr);
 	if (fd != STDIN_FILENO)
 	  close(fd);
-	if (r)
-	  ret = 1;
         goto out;
       } else if (objcmd == "rm-attr") {
-	if (vm.count("arg1") == 0)
+	if (vm.count("arg1") == 0) {
 	  usage(desc);
-	r = do_rm_attr(fs, coll, ghobj, arg1);
-	if (r)
-	  ret = 1;
+          ret = 1;
+          goto out;
+        }
+	ret = do_rm_attr(fs, coll, ghobj, arg1, *osr);
         goto out;
       } else if (objcmd == "get-omap") {
-	if (vm.count("arg1") == 0)
+	if (vm.count("arg1") == 0) {
 	  usage(desc);
-	r = do_get_omap(fs, coll, ghobj, arg1);
-	if (r)
-	  ret = 1;
+          ret = 1;
+          goto out;
+        }
+	ret = do_get_omap(fs, coll, ghobj, arg1);
         goto out;
       } else if (objcmd == "set-omap") {
-	if (vm.count("arg1") == 0)
+	if (vm.count("arg1") == 0) {
 	  usage(desc);
-
+          ret = 1;
+          goto out;
+        }
 	int fd;
 	if (vm.count("arg2") == 0 || arg2 == "-") {
           // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
@@ -3009,30 +2746,33 @@ int main(int argc, char **argv)
 	    goto out;
 	  }
 	}
-	r = do_set_omap(fs, coll, ghobj, arg1, fd);
+	ret = do_set_omap(fs, coll, ghobj, arg1, fd, *osr);
 	if (fd != STDIN_FILENO)
 	  close(fd);
-	if (r)
-	  ret = 1;
         goto out;
       } else if (objcmd == "rm-omap") {
-	if (vm.count("arg1") == 0)
+	if (vm.count("arg1") == 0) {
 	  usage(desc);
-	r = do_rm_omap(fs, coll, ghobj, arg1);
-	if (r)
-	  ret = 1;
+          ret = 1;
+          goto out;
+        }
+	ret = do_rm_omap(fs, coll, ghobj, arg1, *osr);
         goto out;
       } else if (objcmd == "get-omaphdr") {
-	if (vm.count("arg1"))
+	if (vm.count("arg1")) {
 	  usage(desc);
-	r = do_get_omaphdr(fs, coll, ghobj);
-	if (r)
-	  ret = 1;
+          ret = 1;
+          goto out;
+        }
+	ret = do_get_omaphdr(fs, coll, ghobj);
         goto out;
       } else if (objcmd == "set-omaphdr") {
         // Extra arg
-	if (vm.count("arg2"))
+	if (vm.count("arg2")) {
 	  usage(desc);
+          ret = 1;
+          goto out;
+        }
 	int fd;
 	if (vm.count("arg1") == 0 || arg1 == "-") {
           // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
@@ -3050,46 +2790,47 @@ int main(int argc, char **argv)
 	    goto out;
 	  }
 	}
-	r = do_set_omaphdr(fs, coll, ghobj, fd);
+	ret = do_set_omaphdr(fs, coll, ghobj, fd, *osr);
 	if (fd != STDIN_FILENO)
 	  close(fd);
-	if (r)
-	  ret = 1;
         goto out;
+      } else if (objcmd == "dump-info") {
+	ret = print_obj_info(fs, coll, ghobj, formatter);
+	goto out;
       }
       cerr << "Unknown object command '" << objcmd << "'" << std::endl;
       usage(desc);
+      ret = 1;
+      goto out;
     }
 
     bufferlist bl;
     map_epoch = 0;
-    r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
-    if (r < 0)
-      cerr << "peek_map_epoch returns an error" << std::endl;
-
+    ret = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
+    if (ret < 0)
+      cerr << "peek_map_epoch reports error" << std::endl;
     if (debug)
       cerr << "map_epoch " << map_epoch << std::endl;
 
     pg_info_t info(pgid);
     map<epoch_t,pg_interval_t> past_intervals;
     __u8 struct_ver;
-    r = PG::read_info(fs, pgid, coll, bl, info, past_intervals,
+    ret = PG::read_info(fs, pgid, coll, bl, info, past_intervals,
 		      struct_ver);
-    if (r < 0) {
-      cerr << "read_info error " << cpp_strerror(-r) << std::endl;
-      ret = 1;
+    if (ret < 0) {
+      cerr << "read_info error " << cpp_strerror(ret) << std::endl;
       goto out;
     }
     if (struct_ver < PG::compat_struct_v) {
       cerr << "PG is too old to upgrade, use older Ceph version" << std::endl;
-      ret = 1;
+      ret = -EFAULT;
       goto out;
     }
     if (debug)
       cerr << "struct_v " << (int)struct_ver << std::endl;
 
     if (op == "export") {
-      ret = do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock, past_intervals);
+      ret = tool.do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock, past_intervals);
       if (ret == 0)
         cerr << "Export successful" << std::endl;
     } else if (op == "info") {
@@ -3101,20 +2842,13 @@ int main(int argc, char **argv)
     } else if (op == "log") {
       PGLog::IndexedLog log;
       pg_missing_t missing;
-      ret = get_log(fs, struct_ver, coll, pgid, info, log, missing);
-      if (ret > 0)
+      map<eversion_t, hobject_t> divergent_priors;
+      ret = get_log(fs, struct_ver, coll, pgid, info, log, missing,
+                    divergent_priors);
+      if (ret < 0)
           goto out;
 
-      formatter->open_object_section("log");
-      log.dump(formatter);
-      formatter->close_section();
-      formatter->flush(cout);
-      cout << std::endl;
-      formatter->open_object_section("missing");
-      missing.dump(formatter);
-      formatter->close_section();
-      formatter->flush(cout);
-      cout << std::endl;
+      dump_log(formatter, cout, log, missing, divergent_priors);
     } else if (op == "rm-past-intervals") {
       ObjectStore::Transaction tran;
       ObjectStore::Transaction *t = &tran;
@@ -3123,7 +2857,7 @@ int main(int argc, char **argv)
         cerr << "Can't remove past-intervals, version mismatch " << (int)struct_ver
           << " (pg)  != " << (int)PG::cur_struct_v << " (tool)"
           << std::endl;
-        ret = 1;
+        ret = -EFAULT;
         goto out;
       }
 
@@ -3133,27 +2867,62 @@ int main(int argc, char **argv)
       ret = write_info(*t, map_epoch, info, past_intervals);
 
       if (ret == 0) {
-        fs->apply_transaction(*t);
+        fs->apply_transaction(osr, *t);
         cout << "Removal succeeded" << std::endl;
       }
+    } else if (op == "mark-complete") {
+      ObjectStore::Transaction tran;
+      ObjectStore::Transaction *t = &tran;
+
+      if (struct_ver != PG::cur_struct_v) {
+	cerr << "Can't mark-complete, version mismatch " << (int)struct_ver
+	     << " (pg)  != " << (int)PG::cur_struct_v << " (tool)"
+	     << std::endl;
+	ret = 1;
+	goto out;
+      }
+
+      cout << "Marking complete " << std::endl;
+
+      info.last_update = eversion_t(superblock.current_epoch, info.last_update.version + 1);
+      info.last_backfill = hobject_t::get_max();
+      info.last_epoch_started = superblock.current_epoch;
+      info.history.last_epoch_started = superblock.current_epoch;
+      info.history.last_epoch_clean = superblock.current_epoch;
+      past_intervals.clear();
+
+      ret = write_info(*t, map_epoch, info, past_intervals);
+      if (ret == 0) {
+	fs->apply_transaction(osr, *t);
+	cout << "Marking complete succeeded" << std::endl;
+      }
     } else {
-      cerr << "Must provide --op (info, log, remove, export, import, list, list-lost, fix-lost, list-pgs, rm-past-intervals)"
-	<< std::endl;
-      usage(desc);
+      assert(!"Should have already checked for valid --op");
     }
   } else {
     cerr << "PG '" << pgid << "' not found" << std::endl;
-    ret = 1;
+    ret = -ENOENT;
   }
 
 out:
-  if (fs->umount() < 0) {
-    cerr << "umount failed" << std::endl;
-    return 1;
+  int r = fs->umount();
+  delete osr;
+  if (r < 0) {
+    cerr << "umount failed: " << cpp_strerror(r) << std::endl;
+    // If no previous error, then use umount() error
+    if (ret == 0)
+      ret = r;
+  }
+
+  if (dry_run) {
+    // Export output can go to stdout, so put this message on stderr
+    if (op == "export")
+      cerr << "dry-run: Nothing changed" << std::endl;
+    else
+      cout << "dry-run: Nothing changed" << std::endl;
   }
 
-  // Check for -errno accidentally getting here
   if (ret < 0)
     ret = 1;
-  return ret;
+  myexit(ret);
 }
diff --git a/src/tools/ceph_objectstore_tool.h b/src/tools/ceph_objectstore_tool.h
new file mode 100644
index 0000000..db27988
--- /dev/null
+++ b/src/tools/ceph_objectstore_tool.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECTSTORE_TOOL_H_
+#define CEPH_OBJECTSTORE_TOOL_H_
+
+#include "RadosDump.h"
+
+class ObjectStoreTool : public RadosDump
+{
+  public:
+    ObjectStoreTool(int file_fd, bool dry_run)
+      : RadosDump(file_fd, dry_run)
+    {}
+
+    int do_import(ObjectStore *store, OSDSuperblock& sb, bool force,
+		  std::string pgidstr,
+		  ObjectStore::Sequencer &osr);
+    int do_export(ObjectStore *fs, coll_t coll, spg_t pgid,
+          pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
+          const OSDSuperblock& superblock,
+          map<epoch_t,pg_interval_t> &past_intervals);
+    int get_object(ObjectStore *store, coll_t coll,
+		   bufferlist &bl, OSDMap &curmap, bool *skipped_objects,
+		   ObjectStore::Sequencer &osr);
+    int export_file(
+        ObjectStore *store, coll_t cid, ghobject_t &obj);
+    int export_files(ObjectStore *store, coll_t coll);
+};
+
+#endif // CEPH_OBJECSTORE_TOOL_H_
diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc
index 56c18ae..fb950fa 100644
--- a/src/tools/ceph_osdomap_tool.cc
+++ b/src/tools/ceph_osdomap_tool.cc
@@ -28,15 +28,13 @@ using namespace std;
 int main(int argc, char **argv) {
   po::options_description desc("Allowed options");
   string store_path, cmd, out_path;
-  bool paranoid = false;
   desc.add_options()
     ("help", "produce help message")
     ("omap-path", po::value<string>(&store_path),
      "path to mon directory, mandatory (current/omap usually)")
-    ("paranoid", po::value<bool>(&paranoid),
-     "use paranoid checking")
+    ("paranoid", "use paranoid checking")
     ("command", po::value<string>(&cmd),
-     "command")
+     "command arg is one of [dump-raw-keys, dump-raw-key-vals, dump-objects, dump-objects-with-keys, check], mandatory")
     ;
   po::positional_options_description p;
   p.add("command", 1);
@@ -78,10 +76,20 @@ int main(int argc, char **argv) {
     return 1;
   }
 
+  if (vm.count("omap-path") == 0) {
+    std::cerr << "Required argument --omap-path" << std::endl;
+    return 1;
+  }
+
+  if (vm.count("command") == 0) {
+    std::cerr << "Required argument --command" << std::endl;
+    return 1;
+  }
+
   LevelDBStore* store(new LevelDBStore(g_ceph_context, store_path));
-  if (paranoid) {
+  if (vm.count("paranoid")) {
     std::cerr << "Enabling paranoid checks" << std::endl;
-    store->options.paranoid_checks = paranoid;
+    store->options.paranoid_checks = true;
   }
   DBObjectMap omap(store);
   stringstream out;
@@ -129,7 +137,7 @@ int main(int argc, char **argv) {
 	 i != objects.end();
 	 ++i) {
       std::cout << "Object: " << *i << std::endl;
-      ObjectMap::ObjectMapIterator j = omap.get_iterator(i->hobj);
+      ObjectMap::ObjectMapIterator j = omap.get_iterator(ghobject_t(i->hobj));
       for (j->seek_to_first(); j->valid(); j->next()) {
 	std::cout << j->key() << std::endl;
 	j->value().hexdump(std::cout);
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
new file mode 100644
index 0000000..ce083ce
--- /dev/null
+++ b/src/tools/cephfs/DataScan.cc
@@ -0,0 +1,1376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include <fstream>
+#include "include/util.h"
+
+#include "mds/CInode.h"
+#include "cls/cephfs/cls_cephfs_client.h"
+
+#include "DataScan.h"
+#include "include/compat.h"
+
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "datascan." << __func__ << ": "
+
+void DataScan::usage()
+{
+  std::cout << "Usage: \n"
+    << "  cephfs-data-scan init [--force-init]\n"
+    << "  cephfs-data-scan scan_extents [--force-pool] <data pool name>\n"
+    << "  cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] <data pool name>\n"
+    << "\n"
+    << "    --force-corrupt: overrite apparently corrupt structures\n"
+    << "    --force-init: write root inodes even if they exist\n"
+    << "    --force-pool: use data pool even if it is not in MDSMap\n"
+    << std::endl;
+
+  generic_client_usage();
+}
+
+bool DataScan::parse_kwarg(
+    const std::vector<const char*> &args,
+    std::vector<const char *>::const_iterator &i,
+    int *r)
+{
+  if (i + 1 == args.end()) {
+    return false;
+  }
+
+  const std::string arg(*i);
+  const std::string val(*(++i));
+
+  if (arg == std::string("--output-dir")) {
+    if (driver != NULL) {
+      derr << "Unexpected --output-dir: output already selected!" << dendl;
+      *r = -EINVAL;
+      return false;
+    }
+    driver = new LocalFileDriver(val, data_io);
+    return true;
+  } else if (arg == std::string("-n")) {
+    std::string err;
+    n = strict_strtoll(val.c_str(), 10, &err);
+    if (!err.empty()) {
+      std::cerr << "Invalid worker number '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    return true;
+  } else if (arg == std::string("-m")) {
+    std::string err;
+    m = strict_strtoll(val.c_str(), 10, &err);
+    if (!err.empty()) {
+      std::cerr << "Invalid worker count '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool DataScan::parse_arg(
+    const std::vector<const char*> &args,
+    std::vector<const char *>::const_iterator &i)
+{
+  const std::string arg(*i);
+  if (arg == "--force-pool") {
+    force_pool = true;
+    return true;
+  } else if (arg == "--force-corrupt") {
+    force_corrupt = true;
+    return true;
+  } else if (arg == "--force-init") {
+    force_init = true;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+int DataScan::main(const std::vector<const char*> &args)
+{
+  // Parse args
+  // ==========
+  if (args.size() < 1) {
+    usage();
+    return -EINVAL;
+  }
+
+  // Common RADOS init: open metadata pool
+  // =====================================
+  librados::Rados rados;
+  int r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable" << dendl;
+    return r;
+  }
+
+  std::string const &command = args[0];
+  std::string data_pool_name;
+
+  // Consume any known --key val or --flag arguments
+  for (std::vector<const char *>::const_iterator i = args.begin() + 1;
+       i != args.end(); ++i) {
+    if (parse_kwarg(args, i, &r)) {
+      // Skip the kwarg value field
+      ++i;
+      continue;
+    } else if (r) {
+      return r;
+    }
+
+    if (parse_arg(args, i)) {
+      continue;
+    }
+
+    if (i + 1 == args.end() &&
+        (command == "scan_inodes" || command == "scan_extents")) {
+      data_pool_name = *i;
+      continue;
+    }
+
+    // Fall through: unhandled
+    std::cerr << "Unknown argument '" << *i << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // Default to output to metadata pool
+  if (driver == NULL) {
+    driver = new MetadataDriver();
+    driver->set_force_corrupt(force_corrupt);
+    driver->set_force_init(force_init);
+  }
+
+  dout(4) << "connecting to RADOS..." << dendl;
+  rados.connect();
+  r = driver->init(rados, mdsmap);
+  if (r < 0) {
+    return r;
+  }
+
+  // Initialize data_io for those commands that need it
+  if (command == "scan_inodes"
+     || command == "scan_extents") {
+    if (data_pool_name.empty()) {
+      std::cerr << "Data pool not specified" << std::endl;
+      usage();
+      return -EINVAL;
+    }
+
+    data_pool_id = rados.pool_lookup(data_pool_name.c_str());
+    if (data_pool_id < 0) {
+      std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
+      return -ENOENT;
+    } else {
+      dout(4) << "data pool '" << data_pool_name
+        << "' has ID " << data_pool_id << dendl;
+    }
+
+    if (!mdsmap->is_data_pool(data_pool_id)) {
+      std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
+        "CephFS data pool!" << std::endl;
+      if (!force_pool) {
+        std::cerr << "Use --force-pool to continue" << std::endl;
+        return -EINVAL;
+      }
+    }
+
+    dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
+    r = rados.ioctx_create(data_pool_name.c_str(), data_io);
+    if (r != 0) {
+      return r;
+    }
+  }
+
+  // Finally, dispatch command
+  if (command == "scan_inodes") {
+    return scan_inodes();
+  } else if (command == "scan_extents") {
+    return scan_extents();
+  } else if (command == "init") {
+    return driver->init_roots(mdsmap->get_first_data_pool());
+  } else {
+    std::cerr << "Unknown command '" << command << "'" << std::endl;
+    return -EINVAL;
+  }
+}
+
+int MetadataDriver::inject_unlinked_inode(
+    inodeno_t inono, int mode, int64_t data_pool_id)
+{
+  const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
+
+  // Skip if exists
+  bool already_exists = false;
+  int r = root_exists(inono, &already_exists);
+  if (r) {
+    return r;
+  }
+  if (already_exists && !force_init) {
+    std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
+               " exists, skipping create.  Use --force-init to overwrite"
+               " the existing object." << std::endl;
+    return 0;
+  }
+
+  // Compose
+  InodeStore inode;
+  inode.inode.ino = inono;
+  inode.inode.version = 1;
+  inode.inode.xattr_version = 1;
+  inode.inode.mode = 0500 | mode;
+  // Fake size to 1, so that the directory doesn't appear to be empty
+  // (we won't actually give the *correct* size here though)
+  inode.inode.size = 1;
+  inode.inode.dirstat.nfiles = 1;
+
+  inode.inode.ctime = 
+    inode.inode.mtime = ceph_clock_now(g_ceph_context);
+  inode.inode.nlink = 1;
+  inode.inode.truncate_size = -1ull;
+  inode.inode.truncate_seq = 1;
+  inode.inode.uid = g_conf->mds_root_ino_uid;
+  inode.inode.gid = g_conf->mds_root_ino_gid;
+
+  // Force layout to default: should we let users override this so that
+  // they don't have to mount the filesystem to correct it?
+  inode.inode.layout = g_default_file_layout;
+  inode.inode.layout.fl_pg_pool = data_pool_id;
+
+  // Assume that we will get our stats wrong, and that we may
+  // be ignoring dirfrags that exist
+  inode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
+
+  // Serialize
+  bufferlist inode_bl;
+  ::encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
+  inode.encode(inode_bl);
+
+  // Write
+  r = metadata_io.write_full(oid.name, inode_bl);
+  if (r != 0) {
+    derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return r;
+}
+
+int MetadataDriver::root_exists(inodeno_t ino, bool *result)
+{
+  object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+  uint64_t size;
+  time_t mtime;
+  int r = metadata_io.stat(oid.name, &size, &mtime);
+  if (r == -ENOENT) {
+    *result = false;
+    return 0;
+  } else if (r < 0) {
+    return r;
+  }
+
+  *result = true;
+  return 0;
+}
+
+int MetadataDriver::init_roots(int64_t data_pool_id)
+{
+  int r = 0;
+  r = inject_unlinked_inode(MDS_INO_ROOT, S_IFDIR|0755, data_pool_id);
+  if (r != 0) {
+    return r;
+  }
+  r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
+  if (r != 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+int MetadataDriver::check_roots(bool *result)
+{
+  int r;
+  r = root_exists(MDS_INO_ROOT, result);
+  if (r != 0) {
+    return r;
+  }
+  if (!*result) {
+    return 0;
+  }
+
+  r = root_exists(MDS_INO_MDSDIR(0), result);
+  if (r != 0) {
+    return r;
+  }
+  if (!*result) {
+    return 0;
+  }
+
+  return 0;
+}
+
+/**
+ * Stages:
+ *
+ * SERIAL init
+ *  0. Create root inodes if don't exist
+ * PARALLEL scan_extents
+ *  1. Size and mtime recovery: scan ALL objects, and update 0th
+ *   objects with max size and max mtime seen.
+ * PARALLEL scan_inodes
+ *  2. Inode recovery: scan ONLY 0th objects, and inject metadata
+ *   into dirfrag OMAPs, creating blank dirfrags as needed.  No stats
+ *   or rstats at this stage.  Inodes without backtraces go into
+ *   lost+found
+ * TODO: SERIAL "recover stats"
+ *  3. Dirfrag statistics: depth first traverse into metadata tree,
+ *    rebuilding dir sizes.
+ * TODO PARALLEL "clean up"
+ *  4. Cleanup; go over all 0th objects (and dirfrags if we tagged
+ *   anything onto them) and remove any of the xattrs that we
+ *   used for accumulating.
+ */
+
+
+int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
+{
+  if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
+    return -EINVAL;
+  }
+
+  std::string err;
+  std::string inode_str = oid.substr(0, oid.find("."));
+  *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
+  if (!err.empty()) {
+    return -EINVAL;
+  }
+
+  std::string pos_string = oid.substr(oid.find(".") + 1);
+  *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
+  if (!err.empty()) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Pending sharded pgls & add in progress mechanism for that
+#undef SHARDEDPGLS
+
+int DataScan::scan_extents()
+{
+#ifdef SHARDED_PGLS
+  float progress = 0.0;
+  librados::NObjectIterator i = data_io.nobjects_begin(n, m);
+#else
+  librados::NObjectIterator i = data_io.nobjects_begin();
+#endif
+
+  librados::NObjectIterator i_end = data_io.nobjects_end();
+  int r = 0;
+
+  for (; i != i_end; ++i) {
+    const std::string oid = i->get_oid();
+#ifdef SHARDED_PGLS
+    if (i.get_progress() != progress) {
+      if (int(i.get_progress() * 100) / 5 != int(progress * 100) / 5) {
+        std::cerr << percentify(i.get_progress()) << "%" << std::endl;
+      }
+      progress = i.get_progress();
+    }
+#endif
+
+    // Read size
+    uint64_t size;
+    time_t mtime;
+    r = data_io.stat(oid, &size, &mtime);
+    if (r != 0) {
+      dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
+      continue;
+    }
+
+    // I need to keep track of
+    //  * The highest object ID seen
+    //  * The size of the highest object ID seen
+    //  * The largest object seen
+    //
+    //  Given those things, I can later infer the object chunking
+    //  size, the offset of the last object (chunk size * highest ID seen)
+    //  and the actual size (offset of last object + size of highest ID seen)
+    //
+    //  This logic doesn't take account of striping.
+    uint64_t inode_no = 0;
+    uint64_t obj_id = 0;
+    r = parse_oid(oid, &inode_no, &obj_id);
+    if (r != 0) {
+      dout(4) << "Bad object name '" << oid << "' skipping" << dendl;
+      continue;
+    }
+
+    int r = ClsCephFSClient::accumulate_inode_metadata(
+        data_io,
+        inode_no,
+        obj_id,
+        size,
+        mtime);
+    if (r < 0) {
+      derr << "Failed to accumulate metadata data from '"
+        << oid << "': " << cpp_strerror(r) << dendl;
+      continue;
+    }
+  }
+
+  return 0;
+}
+
+int DataScan::scan_inodes()
+{
+#ifdef SHARDED_PGLS
+  float progress = 0.0;
+  librados::NObjectIterator i = data_io.nobjects_begin(n, m);
+#else
+  librados::NObjectIterator i = data_io.nobjects_begin();
+#endif
+  librados::NObjectIterator i_end = data_io.nobjects_end();
+
+  bool roots_present;
+  int r = driver->check_roots(&roots_present);
+  if (r != 0) {
+    derr << "Unexpected error checking roots: '"
+      << cpp_strerror(r) << "'" << dendl;
+    return r;
+  }
+
+  if (!roots_present) {
+    std::cerr << "Some or all system inodes are absent.  Run 'init' from "
+      "one node before running 'scan_inodes'" << std::endl;
+    return -EIO;
+  }
+
+  for (; i != i_end; ++i) {
+    const std::string oid = i->get_oid();
+#ifdef SHARDED_PGLS
+    if (i.get_progress() != progress) {
+      if (int(i.get_progress() * 100) / 5 != int(progress * 100) / 5) {
+        std::cerr << percentify(i.get_progress()) << "%" << std::endl;
+      }
+      progress = i.get_progress();
+    }
+#endif
+
+    uint64_t obj_name_ino = 0;
+    uint64_t obj_name_offset = 0;
+    r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+    if (r != 0) {
+      dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+      continue;
+    }
+
+    // We are only interested in 0th objects during this phase: we touched
+    // the other objects during scan_extents
+    if (obj_name_offset != 0) {
+      continue;
+    }
+
+    AccumulateResult accum_res;
+    inode_backtrace_t backtrace;
+    ceph_file_layout loaded_layout = g_default_file_layout;
+    int r = ClsCephFSClient::fetch_inode_accumulate_result(
+        data_io, oid, &backtrace, &loaded_layout, &accum_res);
+    
+    if (r < 0) {
+      dout(4) << "Unexpected error loading accumulated metadata from '"
+              << oid << "': " << cpp_strerror(r) << dendl;
+      // FIXME: this creates situation where if a client has a corrupt
+      // backtrace/layout, we will fail to inject it.  We should (optionally)
+      // proceed if the backtrace/layout is corrupt but we have valid
+      // accumulated metadata.
+      continue;
+    }
+
+    const time_t file_mtime = accum_res.max_mtime;
+    uint64_t file_size = 0;
+    uint32_t chunk_size = g_default_file_layout.fl_object_size;
+    bool have_backtrace = !(backtrace.ancestors.empty());
+
+    // This is the layout we will use for injection, populated either
+    // from loaded_layout or from best guesses
+    ceph_file_layout guessed_layout;
+    guessed_layout.fl_pg_pool = data_pool_id;
+
+    // Calculate file_size, guess chunk_size
+    if (accum_res.ceiling_obj_index > 0) {
+      // When there are multiple objects, the largest object probably
+      // indicates the chunk size.  But not necessarily, because files
+      // can be sparse.  Only make this assumption if size seen
+      // is a power of two, as chunk sizes typically are.
+      if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
+        chunk_size = accum_res.max_obj_size;
+      }
+
+      if (loaded_layout.fl_pg_pool == uint32_t(-1)) {
+        // If no stashed layout was found, guess it
+        guessed_layout.fl_object_size = chunk_size;
+        guessed_layout.fl_stripe_unit = chunk_size;
+        guessed_layout.fl_stripe_count = 1;
+      } else if (loaded_layout.fl_object_size < accum_res.max_obj_size) {
+        // If the max size seen exceeds what the stashed layout claims, then
+        // disbelieve it.  Guess instead.
+        dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
+                << std::dec << ", ignoring in favour of best guess" << dendl;
+        guessed_layout.fl_object_size = chunk_size;
+        guessed_layout.fl_stripe_unit = chunk_size;
+        guessed_layout.fl_stripe_count = 1;
+      } else {
+        // We have a stashed layout that we can't disprove, so apply it
+        guessed_layout = loaded_layout;
+        dout(20) << "loaded layout from xattr:"
+          << " os: " << guessed_layout.fl_object_size
+          << " sc: " << guessed_layout.fl_stripe_count
+          << " su: " << guessed_layout.fl_stripe_unit
+          << dendl;
+        // User might have transplanted files from a pool with a different
+        // ID, so whatever the loaded_layout says, we'll force the injected
+        // layout to point to the pool we really read from
+        guessed_layout.fl_pg_pool = data_pool_id;
+      }
+
+      if (guessed_layout.fl_stripe_count == 1) {
+        // Unstriped file: simple chunking
+        file_size = guessed_layout.fl_object_size * accum_res.ceiling_obj_index
+                    + accum_res.ceiling_obj_size;
+      } else {
+        // Striped file: need to examine the last fl_stripe_count objects
+        // in the file to determine the size.
+
+        // How many complete (i.e. not last stripe) objects?
+        uint64_t complete_objs = 0;
+        if (accum_res.ceiling_obj_index > guessed_layout.fl_stripe_count - 1) {
+          complete_objs = (accum_res.ceiling_obj_index / guessed_layout.fl_stripe_count) * guessed_layout.fl_stripe_count;
+        } else {
+          complete_objs = 0;
+        }
+
+        // How many potentially-short objects (i.e. last stripe set) objects?
+        uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
+
+        dout(10) << "calculating striped size from complete objs: "
+                 << complete_objs << ", partial objs: " << partial_objs
+                 << dendl;
+
+        // Maximum amount of data that may be in the incomplete objects
+        uint64_t incomplete_size = 0;
+
+        // For each short object, calculate the max file size within it
+        // and accumulate the maximum
+        for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
+          char buf[60];
+          snprintf(buf, sizeof(buf), "%llx.%08llx",
+              (long long unsigned)obj_name_ino, (long long unsigned)i);
+
+          uint64_t osize(0);
+          time_t omtime(0);
+          r = data_io.stat(std::string(buf), &osize, &omtime);
+          if (r == 0) {
+	    if (osize > 0) {
+	      // Upper bound within this object
+	      uint64_t upper_size = (osize - 1) / guessed_layout.fl_stripe_unit
+		* (guessed_layout.fl_stripe_unit * guessed_layout.fl_stripe_count)
+		+ (i % guessed_layout.fl_stripe_count)
+		* guessed_layout.fl_stripe_unit + (osize - 1)
+		% guessed_layout.fl_stripe_unit + 1;
+	      incomplete_size = MAX(incomplete_size, upper_size);
+	    }
+          } else if (r == -ENOENT) {
+            // Absent object, treat as size 0 and ignore.
+          } else {
+            // Unexpected error, carry r to outer scope for handling.
+            break;
+          }
+        }
+        if (r != 0 && r != -ENOENT) {
+          derr << "Unexpected error checking size of ino 0x" << std::hex
+               << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
+          continue;
+        }
+        file_size = complete_objs * guessed_layout.fl_object_size
+                    + incomplete_size;
+      }
+    } else {
+      file_size = accum_res.ceiling_obj_size;
+    }
+
+    // Santity checking backtrace ino against object name
+    if (have_backtrace && backtrace.ino != obj_name_ino) {
+      dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+        << " doesn't match object name ino 0x" << obj_name_ino
+        << std::dec << dendl;
+      have_backtrace = false;
+    }
+
+    // Inject inode to the metadata pool
+    if (have_backtrace) {
+      inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+      if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+        /* Special case for strays: even if we have a good backtrace,
+         * don't put it in the stray dir, because while that would technically
+         * give it linkage it would still be invisible to the user */
+        r = driver->inject_lost_and_found(
+            obj_name_ino, file_size, file_mtime, guessed_layout);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      } else {
+        /* Happy case: we will inject a named dentry for this inode */
+        r = driver->inject_with_backtrace(
+            backtrace, file_size, file_mtime, guessed_layout);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+            << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      }
+    } else {
+      /* Backtrace-less case: we will inject a lost+found dentry */
+      r = driver->inject_lost_and_found(
+          obj_name_ino, file_size, file_mtime, guessed_layout);
+      if (r < 0) {
+        dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+          << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+        if (r == -EINVAL) {
+          dout(4) << "Use --force-corrupt to overwrite structures that "
+                     "appear to be corrupt" << dendl;
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+int MetadataDriver::read_fnode(
+    inodeno_t ino, frag_t frag, fnode_t *fnode,
+    uint64_t *last_version)
+{
+  assert(fnode != NULL);
+
+  object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
+  bufferlist fnode_bl;
+  int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
+  *last_version = metadata_io.get_last_version();
+  if (r < 0) {
+    return r;
+  }
+
+  bufferlist::iterator old_fnode_iter = fnode_bl.begin();
+  try {
+    (*fnode).decode(old_fnode_iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+int MetadataDriver::read_dentry(inodeno_t parent_ino, frag_t frag,
+                const std::string &dname, InodeStore *inode)
+{
+  assert(inode != NULL);
+
+
+  std::string key;
+  dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+  dn_key.encode(key);
+
+  std::set<std::string> keys;
+  keys.insert(key);
+  std::map<std::string, bufferlist> vals;
+  object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
+  int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);  
+  dout(20) << "oid=" << frag_oid.name
+           << " dname=" << dname
+           << " frag=" << frag
+           << ", r=" << r << dendl;
+  if (r < 0) {
+    return r;
+  }
+
+  if (vals.find(key) == vals.end()) {
+    dout(20) << key << " not found in result" << dendl;
+    return -ENOENT;
+  }
+
+  try {
+    bufferlist::iterator q = vals[key].begin();
+    snapid_t dnfirst;
+    ::decode(dnfirst, q);
+    char dentry_type;
+    ::decode(dentry_type, q);
+    if (dentry_type == 'I') {
+      inode->decode_bare(q);
+      return 0;
+    } else {
+      dout(20) << "dentry type '" << dentry_type << "': cannot"
+                  "read an inode out of that" << dendl;
+      return -EINVAL;
+    }
+  } catch (const buffer::error &err) {
+    dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
+             << std::dec << "/" << dname << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+int MetadataDriver::inject_lost_and_found(inodeno_t ino, uint64_t file_size,
+    time_t file_mtime, const ceph_file_layout &layout)
+{
+  // Create lost+found if doesn't exist
+  bool created = false;
+  int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
+  if (r < 0) {
+    return r;
+  }
+  InodeStore lf_ino;
+  r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
+  if (r == -ENOENT || r == -EINVAL) {
+    if (r == -EINVAL && !force_corrupt) {
+      return r;
+    }
+    // Inject dentry
+    lf_ino.inode.mode = 0755 | S_IFDIR;
+    // Set nfiles to something non-zero, to fool any other code
+    // that tries to ignore 'empty' directories.  This won't be
+    // accurate, but it should avoid functional issues.
+    lf_ino.inode.dirstat.nfiles = 1;
+    lf_ino.inode.size = 1;
+    lf_ino.inode.nlink = 1;
+    lf_ino.inode.ino = CEPH_INO_LOST_AND_FOUND;
+    lf_ino.inode.version = 1;
+    lf_ino.inode.backtrace_version = 1;
+    lf_ino.inode.uid = g_conf->mds_root_ino_uid;
+    lf_ino.inode.gid = g_conf->mds_root_ino_gid;
+    r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
+    if (r < 0) {
+      return r;
+    }
+  } else {
+    if (!(lf_ino.inode.mode & S_IFDIR)) {
+      derr << "lost+found exists but is not a directory!" << dendl;
+      // In this case we error out, and the user should do something about
+      // this problem.
+      return -EINVAL;
+    }
+  }
+
+  r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
+  if (r < 0) {
+    return r;
+  }
+
+  InodeStore recovered_ino;
+  recovered_ino.inode.mode = 0500 | S_IFREG;
+  recovered_ino.inode.size = file_size;
+  recovered_ino.inode.max_size_ever = file_size;
+  recovered_ino.inode.mtime.tv.tv_sec = file_mtime;
+  recovered_ino.inode.atime.tv.tv_sec = file_mtime;
+  recovered_ino.inode.ctime.tv.tv_sec = file_mtime;
+
+  recovered_ino.inode.layout = layout;
+
+  recovered_ino.inode.truncate_seq = 1;
+  recovered_ino.inode.truncate_size = -1ull;
+
+  recovered_ino.inode.inline_data.version = CEPH_INLINE_NONE;
+
+  recovered_ino.inode.nlink = 1;
+  recovered_ino.inode.ino = ino;
+  recovered_ino.inode.version = 1;
+  recovered_ino.inode.backtrace_version = 1;
+  recovered_ino.inode.uid = g_conf->mds_root_ino_uid;
+  recovered_ino.inode.gid = g_conf->mds_root_ino_gid;
+
+  const std::string dname = lost_found_dname(ino);
+
+  // Write dentry into lost+found dirfrag
+  return inject_linkage(lf_ino.inode.ino, dname, frag_t(), recovered_ino);
+}
+
+
+int MetadataDriver::get_frag_of(
+    inodeno_t dirino,
+    const std::string &target_dname,
+    frag_t *result_ft)
+{
+  object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
+
+  dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
+
+  // Find and load fragtree if existing dirfrag
+  // ==========================================
+  bool have_backtrace = false; 
+  bufferlist parent_bl;
+  int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
+  if (r == -ENODATA) {
+    dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
+  } else if (r < 0) {
+    dout(4) << "Unexpected error on '" << root_frag_oid << "': "
+      << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // Deserialize backtrace
+  inode_backtrace_t backtrace;
+  if (parent_bl.length()) {
+    try {
+      bufferlist::iterator q = parent_bl.begin();
+      backtrace.decode(q);
+      have_backtrace = true;
+    } catch (buffer::error &e) {
+      dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " << e << dendl;
+    }
+  }
+
+  if (!(have_backtrace && backtrace.ancestors.size())) {
+    // Can't work out fragtree without a backtrace
+    dout(4) << "No backtrace on '" << root_frag_oid
+            << "': cannot determine fragtree" << dendl;
+    return -ENOENT;
+  }
+
+  // The parentage of dirino
+  const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
+
+  // The inode of dirino's parent
+  const inodeno_t parent_ino = bp.dirino;
+
+  // The dname of dirino in its parent.
+  const std::string &parent_dname = bp.dname;
+
+  dout(20) << "got backtrace parent " << parent_ino << "/"
+           << parent_dname << dendl;
+
+  // The primary dentry for dirino
+  InodeStore existing_dentry;
+
+  // See if we can find ourselves in dirfrag zero of the parent: this
+  // is a fast path that avoids needing to go further up the tree
+  // if the parent isn't fragmented (worst case we would have to
+  // go all the way to the root)
+  r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
+  if (r >= 0) {
+    // Great, fast path: return the fragtree from here
+    if (existing_dentry.inode.ino != dirino) {
+      dout(4) << "Unexpected inode in dentry! 0x" << std::hex
+              << existing_dentry.inode.ino
+              << " vs expected 0x" << dirino << std::dec << dendl;
+      return -ENOENT;
+    }
+    dout(20) << "fast path, fragtree is "
+             << existing_dentry.dirfragtree << dendl;
+    *result_ft = existing_dentry.pick_dirfrag(target_dname);
+    dout(20) << "frag is " << *result_ft << dendl;
+    return 0;
+  } else if (r != -ENOENT) {
+    // Dentry not present in 0th frag, must read parent's fragtree
+    frag_t parent_frag;
+    r = get_frag_of(parent_ino, parent_dname, &parent_frag);
+    if (r == 0) {
+      // We have the parent fragtree, so try again to load our dentry
+      r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
+      if (r >= 0) {
+        // Got it!
+        *result_ft = existing_dentry.pick_dirfrag(target_dname);
+        dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
+        return 0;
+      } else {
+        if (r == -EINVAL || r == -ENOENT) {
+          return -ENOENT;  // dentry missing or corrupt, so frag is missing
+        } else {
+          return r;
+        }
+      }
+    } else {
+      // Couldn't resolve parent fragtree, so can't find ours.
+      return r;
+    }
+  } else if (r == -EINVAL) {
+    // Unreadable dentry, can't know the fragtree.
+    return -ENOENT;
+  } else {
+    // Unexpected error, raise it
+    return r;
+  }
+}
+
+
+int MetadataDriver::inject_with_backtrace(
+    const inode_backtrace_t &backtrace, uint64_t file_size, time_t file_mtime,
+    const ceph_file_layout &layout)
+    
+{
+
+  // On dirfrags
+  // ===========
+  // In order to insert something into a directory, we first (ideally)
+  // need to know the fragtree for the directory.  Sometimes we can't
+  // get that, in which case we just go ahead and insert it into
+  // fragment zero for a good chance of that being the right thing
+  // anyway (most moderate-sized dirs aren't fragmented!)
+
+  // On ancestry
+  // ===========
+  // My immediate ancestry should be correct, so if we can find that
+  // directory's dirfrag then go inject it there.  This works well
+  // in the case that this inode's dentry was somehow lost and we
+  // are recreating it, because the rest of the hierarchy
+  // will probably still exist.
+  //
+  // It's more of a "better than nothing" approach when rebuilding
+  // a whole tree, as backtraces will in general not be up to date
+  // beyond the first parent, if anything in the trace was ever
+  // moved after the file was created.
+
+  // On inode numbers
+  // ================
+  // The backtrace tells us inodes for each of the parents.  If we are
+  // creating those parent dirfrags, then there is a risk that somehow
+  // the inode indicated here was also used for data (not a dirfrag) at
+  // some stage.  That would be a zany situation, and we don't check
+  // for it here, because to do so would require extra IOs for everything
+  // we inject, and anyway wouldn't guarantee that the inode number
+  // wasn't in use in some dentry elsewhere in the metadata tree that
+  // just happened not to have any data objects.
+
+  // On multiple workers touching the same traces
+  // ============================================
+  // When creating linkage for a directory, *only* create it if we are
+  // also creating the object.  That way, we might not manage to get the
+  // *right* linkage for a directory, but at least we won't multiply link
+  // it.  We assume that if a root dirfrag exists for a directory, then
+  // it is linked somewhere (i.e. that the metadata pool is not already
+  // inconsistent).
+  //
+  // Making sure *that* is true is someone else's job!  Probably someone
+  // who is not going to run in parallel, so that they can self-consistently
+  // look at versions and move things around as they go.
+  // Note this isn't 100% safe: if we die immediately after creating dirfrag
+  // object, next run will fail to create linkage for the dirfrag object
+  // and leave it orphaned.
+
+  inodeno_t ino = backtrace.ino;
+  dout(10) << "  inode: 0x" << std::hex << ino << std::dec << dendl;
+  for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
+      i != backtrace.ancestors.end(); ++i) {
+    const inode_backpointer_t &backptr = *i;
+    dout(10) << "  backptr: 0x" << std::hex << backptr.dirino << std::dec
+      << "/" << backptr.dname << dendl;
+
+    // Examine root dirfrag for parent
+    const inodeno_t parent_ino = backptr.dirino;
+    const std::string dname = backptr.dname;
+
+    frag_t fragment;
+    int r = get_frag_of(parent_ino, dname, &fragment);
+    if (r == -ENOENT) {
+      // Don't know fragment, fall back to assuming root
+      dout(20) << "don't know fragment for 0x" << std::hex <<
+        parent_ino << std::dec << "/" << dname << ", will insert to root"
+        << dendl;
+    }
+
+    // Find or create dirfrag
+    // ======================
+    bool created_dirfrag;
+    r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
+    if (r < 0) {
+      return r;
+    }
+
+    // Check if dentry already exists
+    // ==============================
+    InodeStore existing_dentry;
+    r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
+    bool write_dentry = false;
+    if (r == -ENOENT || r == -EINVAL) {
+      if (r == -EINVAL && !force_corrupt) {
+        return r;
+      }
+      // Missing or corrupt dentry
+      write_dentry = true;
+    } else if (r < 0) {
+      derr << "Unexpected error reading dentry 0x" << std::hex
+        << parent_ino << std::dec << "/"
+        << dname << ": " << cpp_strerror(r) << dendl;
+      break;
+    } else {
+      // Dentry already present, does it link to me?
+      if (existing_dentry.inode.ino == ino) {
+        dout(20) << "Dentry 0x" << std::hex
+          << parent_ino << std::dec << "/"
+          << dname << " already exists and points to me" << dendl;
+      } else {
+        derr << "Dentry 0x" << std::hex
+          << parent_ino << std::dec << "/"
+          << dname << " already exists but points to 0x"
+          << std::hex << existing_dentry.inode.ino << std::dec << dendl;
+        // Fall back to lost+found!
+        return inject_lost_and_found(backtrace.ino, file_size, file_mtime,
+            layout);
+      }
+    }
+
+    // Inject linkage
+    // ==============
+    if (write_dentry) {
+      InodeStore dentry;
+      if (i == backtrace.ancestors.begin()) {
+        // This is the linkage for a file
+        dentry.inode.mode = 0500 | S_IFREG;
+        dout(10) << "Linking inode 0x" << std::hex << ino
+          << " at 0x" << parent_ino << "/" << dname << std::dec
+          << " with size=" << file_size << " bytes" << dendl;
+
+        // The file size and mtime we learned by scanning globally
+        dentry.inode.size = file_size;
+        dentry.inode.max_size_ever = file_size;
+        dentry.inode.mtime.tv.tv_sec = file_mtime;
+        dentry.inode.atime.tv.tv_sec = file_mtime;
+        dentry.inode.ctime.tv.tv_sec = file_mtime;
+
+        dentry.inode.layout = layout;
+
+        dentry.inode.truncate_seq = 1;
+        dentry.inode.truncate_size = -1ull;
+
+        dentry.inode.inline_data.version = CEPH_INLINE_NONE;
+      } else {
+        // This is the linkage for a directory
+        dentry.inode.mode = 0755 | S_IFDIR;
+
+        // Set nfiles to something non-zero, to fool any other code
+        // that tries to ignore 'empty' directories.  This won't be
+        // accurate, but it should avoid functional issues.
+        dentry.inode.dirstat.nfiles = 1;
+        dentry.inode.size = 1;
+
+      }
+      dentry.inode.nlink = 1;
+      dentry.inode.ino = ino;
+      dentry.inode.uid = g_conf->mds_root_ino_uid;
+      dentry.inode.gid = g_conf->mds_root_ino_gid;
+      dentry.inode.version = 1;
+      dentry.inode.backtrace_version = 1;
+      r = inject_linkage(parent_ino, dname, fragment, dentry);
+      if (r < 0) {
+        return r;
+      }
+    }
+
+    if (!created_dirfrag) {
+      // If the parent dirfrag already existed, then stop traversing the
+      // backtrace: assume that the other ancestors already exist too.  This
+      // is an assumption rather than a truth, but it's a convenient way
+      // to avoid the risk of creating multiply-linked directories while
+      // injecting data.  If there are in fact missing ancestors, this
+      // should be fixed up using a separate tool scanning the metadata
+      // pool.
+      break;
+    } else {
+      // Proceed up the backtrace, creating parents
+      ino = parent_ino;
+    }
+  }
+
+  return 0;
+}
+
+int MetadataDriver::find_or_create_dirfrag(
+    inodeno_t ino,
+    frag_t fragment,
+    bool *created)
+{
+  assert(created != NULL);
+
+  fnode_t existing_fnode;
+  *created = false;
+
+  uint64_t read_version = 0;
+  int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
+  dout(10) << "read_version = " << read_version << dendl;
+
+  if (r == -ENOENT || r == -EINVAL) {
+    if (r == -EINVAL && !force_corrupt) {
+      return r;
+    }
+
+    // Missing or corrupt fnode, create afresh
+    bufferlist fnode_bl;
+    fnode_t blank_fnode;
+    blank_fnode.version = 1;
+    blank_fnode.damage_flags |= (DAMAGE_RSTATS | DAMAGE_RSTATS);
+    blank_fnode.encode(fnode_bl);
+
+
+    librados::ObjectWriteOperation op;
+
+    if (read_version) {
+      assert(r == -EINVAL);
+      // Case A: We must assert that the version isn't changed since we saw the object
+      // was unreadable, to avoid the possibility of two data-scan processes
+      // both creating the frag.
+      op.assert_version(read_version);
+    } else {
+      assert(r == -ENOENT);
+      // Case B: The object didn't exist in read_fnode, so while creating it we must
+      // use an exclusive create to correctly populate *creating with
+      // whether we created it ourselves or someone beat us to it.
+      op.create(true);
+    }
+
+    object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
+    op.omap_set_header(fnode_bl);
+    r = metadata_io.operate(frag_oid.name, &op);
+    if (r == -EOVERFLOW) {
+      // Someone else wrote it (see case A above)
+      dout(10) << "Dirfrag creation race: 0x" << std::hex
+        << ino << " " << fragment << std::dec << dendl;
+      *created = false;
+      return 0;
+    } else if (r < 0) {
+      // We were unable to create or write it, error out
+      derr << "Failed to create dirfrag 0x" << std::hex
+        << ino << std::dec << ": " << cpp_strerror(r) << dendl;
+      return r;
+    } else {
+      // Success: the dirfrag object now exists with a value header
+      dout(10) << "Created dirfrag: 0x" << std::hex
+        << ino << std::dec << dendl;
+      *created = true;
+    }
+  } else if (r < 0) {
+    derr << "Unexpected error reading dirfrag 0x" << std::hex
+      << ino << std::dec << " : " << cpp_strerror(r) << dendl;
+    return r;
+  } else {
+    dout(20) << "Dirfrag already exists: 0x" << std::hex
+      << ino << " " << fragment << std::dec << dendl;
+  }
+
+  return 0;
+}
+
+int MetadataDriver::inject_linkage(
+    inodeno_t dir_ino, const std::string &dname,
+    const frag_t fragment, const InodeStore &inode)
+{
+  // We have no information about snapshots, so everything goes
+  // in as CEPH_NOSNAP
+  snapid_t snap = CEPH_NOSNAP;
+
+  object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
+
+  std::string key;
+  dentry_key_t dn_key(snap, dname.c_str());
+  dn_key.encode(key);
+
+  bufferlist dentry_bl;
+  ::encode(snap, dentry_bl);
+  ::encode('I', dentry_bl);
+  inode.encode_bare(dentry_bl);
+
+  // Write out
+  std::map<std::string, bufferlist> vals;
+  vals[key] = dentry_bl;
+  int r = metadata_io.omap_set(frag_oid.name, vals);
+  if (r != 0) {
+    derr << "Error writing dentry 0x" << std::hex
+      << dir_ino << std::dec << "/"
+      << dname << ": " << cpp_strerror(r) << dendl;
+    return r;
+  } else {
+    dout(20) << "Injected dentry 0x" << std::hex
+      << dir_ino << "/" << dname << " pointing to 0x"
+      << inode.inode.ino << std::dec << dendl;
+    return 0;
+  }
+}
+
+
+int MetadataDriver::init(librados::Rados &rados, const MDSMap *mdsmap)
+{
+  int const metadata_pool_id = mdsmap->get_metadata_pool();
+
+  dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+  std::string metadata_pool_name;
+  int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+  if (r < 0) {
+    derr << "Pool " << metadata_pool_id
+      << " identified in MDS map not found in RADOS!" << dendl;
+    return r;
+  }
+  dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
+  return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+}
+
+int LocalFileDriver::init(librados::Rados &rados, const MDSMap *mdsmap)
+{
+  return 0;
+}
+
+int LocalFileDriver::inject_data(
+    const std::string &file_path,
+    uint64_t size,
+    uint32_t chunk_size,
+    inodeno_t ino)
+{
+  // Scrape the file contents out of the data pool and into the
+  // local filesystem
+  std::fstream f;
+  f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
+
+  for (uint64_t offset = 0; offset < size; offset += chunk_size) {
+    bufferlist bl;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf),
+        "%llx.%08llx",
+        (unsigned long long)ino,
+        (unsigned long long)(offset / chunk_size));
+    std::string oid(buf);
+
+    int r = data_io.read(oid, bl, chunk_size, 0);
+
+    if (r <= 0 && r != -ENOENT) {
+      derr << "error reading data object '" << oid << "': "
+        << cpp_strerror(r) << dendl;
+      f.close();
+      return r;
+    } else if (r >=0) {
+      
+      f.seekp(offset);
+      bl.write_stream(f);
+    }
+  }
+  f.close();
+
+  return 0;
+}
+
+
+int LocalFileDriver::inject_with_backtrace(
+    const inode_backtrace_t &bt,
+    uint64_t size,
+    time_t mtime,
+    const ceph_file_layout &layout)
+{
+  std::string path_builder = path;
+
+  // Iterate through backtrace creating directory parents
+  std::vector<inode_backpointer_t>::const_reverse_iterator i;
+  for (i = bt.ancestors.rbegin();
+      i != bt.ancestors.rend(); ++i) {
+
+    const inode_backpointer_t &backptr = *i;
+    path_builder += "/";
+    path_builder += backptr.dname;
+
+    // Last entry is the filename itself
+    bool is_file = (i + 1 == bt.ancestors.rend());
+    if (is_file) {
+      // FIXME: inject_data won't cope with interesting (i.e. striped)
+      // layouts (need a librados-compatible Filer to read these)
+      inject_data(path_builder, size, layout.fl_object_size, bt.ino);
+    } else {
+      int r = mkdir(path_builder.c_str(), 0755);
+      if (r != 0 && r != -EPERM) {
+        derr << "error creating directory: '" << path_builder << "': "
+          << cpp_strerror(r) << dendl;
+        return r;
+      }
+    }
+  }
+
+  return 0;
+}
+
+int LocalFileDriver::inject_lost_and_found(
+    inodeno_t ino,
+    uint64_t size,
+    time_t mtime,
+    const ceph_file_layout &layout)
+{
+  std::string lf_path = path + "/lost+found";
+  int r = mkdir(lf_path.c_str(), 0755);
+  if (r != 0 && r != -EPERM) {
+    derr << "error creating directory: '" << lf_path << "': "
+      << cpp_strerror(r) << dendl;
+    return r;
+  }
+  
+  std::string file_path = lf_path + "/" + lost_found_dname(ino);
+  return inject_data(file_path, size, layout.fl_object_size, ino);
+}
+
+int LocalFileDriver::init_roots(int64_t data_pool_id)
+{
+  // Ensure that the path exists and is a directory
+  bool exists;
+  int r = check_roots(&exists);
+  if (r != 0) {
+    return r;
+  }
+
+  if (exists) {
+    return 0;
+  } else {
+    return ::mkdir(path.c_str(), 0755);
+  }
+}
+
+int LocalFileDriver::check_roots(bool *result)
+{
+  // Check if the path exists and is a directory
+  DIR *d = ::opendir(path.c_str());
+  if (d == NULL) {
+    *result = false;
+  } else {
+    int r = closedir(d);
+    if (r != 0) {
+      // Weird, but maybe possible with e.g. stale FD on NFS mount?
+      *result = false;
+    } else {
+      *result = true;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h
new file mode 100644
index 0000000..252e6e3
--- /dev/null
+++ b/src/tools/cephfs/DataScan.h
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include "MDSUtility.h"
+#include "include/rados/librados.hpp"
+
+class InodeStore;
+
+class RecoveryDriver {
+  protected:
+    // If true, overwrite structures that generate decoding errors.
+    bool force_corrupt;
+
+    // If true, overwrite root objects during init_roots even if they
+    // exist
+    bool force_init;
+
+  public:
+    virtual int init(librados::Rados &rados, const MDSMap *mdsmap) = 0;
+
+    void set_force_corrupt(const bool val)
+    {
+      force_corrupt = val;
+    }
+
+    void set_force_init(const bool val)
+    {
+      force_init = val;
+    }
+
+
+    /**
+     * Inject an inode + dentry parents into the metadata pool,
+     * based on a backtrace recovered from the data pool
+     */
+    virtual int inject_with_backtrace(
+        const inode_backtrace_t &bt,
+        uint64_t size,
+        time_t mtime,
+        const ceph_file_layout &layout) = 0;
+
+    /**
+     * Inject an inode + dentry into the lost+found directory,
+     * when all we know about a file is its inode.
+     */
+    virtual int inject_lost_and_found(
+        inodeno_t ino,
+        uint64_t size,
+        time_t mtime,
+        const ceph_file_layout &layout) = 0;
+
+    /**
+     * Create any missing roots (i.e. mydir, strays, root inode)
+     */
+    virtual int init_roots(
+        int64_t data_pool_id) = 0;
+
+    /**
+     * Pre-injection check that all the roots are present in
+     * the metadata pool.  Used to avoid parallel workers interfering
+     * with one another, by cueing the user to go run 'init' on a
+     * single node before running a parallel scan.
+     *
+     * @param result: set to true if roots are present, else set to false
+     * @returns 0 on no unexpected errors, else error code.  Missing objects
+     *          are not considered an unexpected error: check *result for
+     *          this case.
+     */
+    virtual int check_roots(bool *result) = 0;
+
+    /**
+     * Helper to compose dnames for links to lost+found
+     * inodes.
+     */
+    std::string lost_found_dname(inodeno_t ino)
+    {
+      char s[20];
+      snprintf(s, sizeof(s), "%llx", (unsigned long long)ino);
+      return std::string(s);
+    }
+
+    RecoveryDriver()
+      : force_corrupt(false)
+    {}
+
+    virtual ~RecoveryDriver() {}
+};
+
+class LocalFileDriver : public RecoveryDriver
+{ 
+  protected:
+    const std::string path;
+    librados::IoCtx &data_io;
+
+  int inject_data(
+      const std::string &file_path,
+      uint64_t size,
+      uint32_t chunk_size,
+      inodeno_t ino);
+  public:
+
+    LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_)
+      : RecoveryDriver(), path(path_), data_io(data_io_)
+    {}
+
+    // Implement RecoveryDriver interface
+    int init(librados::Rados &rados, const MDSMap *mdsmap);
+
+    int inject_with_backtrace(
+        const inode_backtrace_t &bt,
+        uint64_t size,
+        time_t mtime,
+        ceph_file_layout const &layout);
+
+    int inject_lost_and_found(
+        inodeno_t ino,
+        uint64_t size,
+        time_t mtime,
+        ceph_file_layout const &layout);
+
+    int init_roots(int64_t data_pool_id);
+
+    int check_roots(bool *result);
+};
+
+/**
+ * A class that knows how to manipulate CephFS metadata pools
+ */
+class MetadataDriver : public RecoveryDriver
+{
+  protected:
+
+    librados::IoCtx metadata_io;
+
+    /**
+     * Create a .inode object, i.e. root or mydir
+     */
+    int inject_unlinked_inode(inodeno_t inono, int mode, int64_t data_pool_id);
+
+    /**
+     * Check for existence of .inode objects, before
+     * trying to go ahead and inject metadata.
+     */
+    int root_exists(inodeno_t ino, bool *result);
+
+    /**
+     * Try and read an fnode from a dirfrag
+     */
+    int read_fnode(inodeno_t ino, frag_t frag,
+                   fnode_t *fnode, uint64_t *read_version);
+
+    /**
+     * Try and read a dentry from a dirfrag
+     */
+    int read_dentry(inodeno_t parent_ino, frag_t frag,
+                    const std::string &dname, InodeStore *inode);
+
+    int find_or_create_dirfrag(
+        inodeno_t ino,
+        frag_t fragment,
+        bool *created);
+
+    int inject_linkage(
+        inodeno_t dir_ino, const std::string &dname,
+        const frag_t fragment, const InodeStore &inode);
+
+    /**
+     * Work out which fragment of a directory should contain a named
+     * dentry, recursing up the trace as necessary to retrieve
+     * fragtrees.
+     */
+    int get_frag_of(
+        inodeno_t dirino,
+        const std::string &dname,
+        frag_t *result_ft);
+
+  public:
+
+    // Implement RecoveryDriver interface
+    int init(librados::Rados &rados, const MDSMap *mdsmap);
+
+    int inject_with_backtrace(
+        const inode_backtrace_t &bt,
+        uint64_t size,
+        time_t mtime,
+        ceph_file_layout const &layout);
+
+    int inject_lost_and_found(
+        inodeno_t ino,
+        uint64_t size,
+        time_t mtime,
+        ceph_file_layout const &layout);
+
+    int init_roots(int64_t data_pool_id);
+
+    int check_roots(bool *result);
+};
+
+class DataScan : public MDSUtility
+{
+  protected:
+    RecoveryDriver *driver;
+
+    // IoCtx for data pool (where we scrape backtraces from)
+    librados::IoCtx data_io;
+    // Remember the data pool ID for use in layouts
+    int64_t data_pool_id;
+
+    uint32_t n;
+    uint32_t m;
+
+    /**
+     * Scan data pool for backtraces, and inject inodes to metadata pool
+     */
+    int scan_inodes();
+
+    /**
+     * Scan data pool for file sizes and mtimes
+     */
+    int scan_extents();
+
+    // Accept pools which are not in the MDSMap
+    bool force_pool;
+    // Respond to decode errors by overwriting
+    bool force_corrupt;
+    // Overwrite root objects even if they exist
+    bool force_init;
+
+    /**
+     * @param r set to error on valid key with invalid value
+     * @return true if argument consumed, else false
+     */
+    bool parse_kwarg(
+        const std::vector<const char*> &args,
+        std::vector<const char *>::const_iterator &i,
+        int *r);
+
+    /**
+     * @return true if argument consumed, else false
+     */
+    bool parse_arg(
+      const std::vector<const char*> &arg,
+      std::vector<const char *>::const_iterator &i);
+
+  public:
+    void usage();
+    int main(const std::vector<const char *> &args);
+
+    DataScan()
+      : driver(NULL), data_pool_id(-1), n(0), m(1),
+        force_pool(false)
+    {
+    }
+
+    ~DataScan()
+    {
+      delete driver;
+    }
+};
+
diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc
index f75edd9..f2d22d0 100644
--- a/src/tools/cephfs/JournalScanner.cc
+++ b/src/tools/cephfs/JournalScanner.cc
@@ -15,6 +15,8 @@
 #include "include/rados/librados.hpp"
 #include "mds/JournalPointer.h"
 
+#include "mds/events/ESubtreeMap.h"
+
 #include "JournalScanner.h"
 
 #define dout_subsys ceph_subsys_mds
@@ -256,9 +258,21 @@ int JournalScanner::scan_events()
         }
 
         LogEvent *le = LogEvent::decode(le_bl);
+
         if (le) {
           dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl;
 
+          if (le->get_type() == EVENT_SUBTREEMAP
+              || le->get_type() == EVENT_SUBTREEMAP_TEST) {
+            ESubtreeMap *sle = dynamic_cast<ESubtreeMap*>(le);
+            if (sle->expire_pos > read_offset) {
+              errors.insert(std::make_pair(
+                    read_offset, EventError(
+                      -ERANGE,
+                      "ESubtreeMap has expire_pos ahead of its own position")));
+            }
+          }
+
           if (filter.apply(read_offset, *le)) {
             events[read_offset] = EventRecord(le, consumed);
           } else {
diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc
index 6118320..d07d487 100644
--- a/src/tools/cephfs/JournalTool.cc
+++ b/src/tools/cephfs/JournalTool.cc
@@ -41,7 +41,12 @@
 void JournalTool::usage()
 {
   std::cout << "Usage: \n"
-    << "  cephfs-journal-tool [options] journal [inspect|import|export|reset]\n"
+    << "  cephfs-journal-tool [options] journal <command>\n"
+    << "    <command>:\n"
+    << "      inspect\n"
+    << "      import <path>\n"
+    << "      export <path>\n"
+    << "      reset [--force]\n"
     << "  cephfs-journal-tool [options] header <get|set <field> <value>\n"
     << "  cephfs-journal-tool [options] event <effect> <selector> <output>\n"
     << "    <selector>:\n"
@@ -155,7 +160,21 @@ int JournalTool::main_journal(std::vector<const char*> &argv)
       return -EINVAL;
     }
   } else if (command == "reset") {
-      return journal_reset();
+    bool force = false;
+    if (argv.size() == 2) {
+      if (std::string(argv[1]) == "--force") {
+        force = true;
+      } else {
+        std::cerr << "Unknown argument " << argv[1] << std::endl;
+        usage();
+        return -EINVAL;
+      }
+    } else if (argv.size() > 2) {
+      std::cerr << "Too many arguments!" << std::endl;
+      usage();
+      return -EINVAL;
+    }
+    return journal_reset(force);
   } else {
     derr << "Bad journal command '" << command << "'" << dendl;
     return -EINVAL;
@@ -529,7 +548,7 @@ int JournalTool::journal_export(std::string const &path, bool import)
 /**
  * Truncate journal and insert EResetJournal
  */
-int JournalTool::journal_reset()
+int JournalTool::journal_reset(bool hard)
 {
   int r = 0;
   Resetter resetter;
@@ -538,7 +557,17 @@ int JournalTool::journal_reset()
     derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
     return r;
   }
-  resetter.reset(rank);
+
+  if (mdsmap->is_dne(mds_rank_t(rank))) {
+    std::cerr << "MDS rank " << rank << " does not exist" << std::endl;
+    return -ENOENT;
+  }
+
+  if (hard) {
+    r = resetter.reset_hard(rank);
+  } else {
+    r = resetter.reset(rank);
+  }
   resetter.shutdown();
 
   return r;
diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h
index 24722c2..f717859 100644
--- a/src/tools/cephfs/JournalTool.h
+++ b/src/tools/cephfs/JournalTool.h
@@ -45,7 +45,7 @@ class JournalTool : public MDSUtility
     // Journal operations
     int journal_inspect();
     int journal_export(std::string const &path, bool import);
-    int journal_reset();
+    int journal_reset(bool hard);
 
     // Header operations
     int header_set();
diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc
index 54985ad..708b57c 100644
--- a/src/tools/cephfs/MDSUtility.cc
+++ b/src/tools/cephfs/MDSUtility.cc
@@ -26,7 +26,7 @@ MDSUtility::MDSUtility() :
   waiting_for_mds_map(NULL)
 {
   monc = new MonClient(g_ceph_context);
-  messenger = Messenger::create(g_ceph_context, g_ceph_context->_conf->ms_type, entity_name_t::CLIENT(), "mds", getpid());
+  messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
   mdsmap = new MDSMap();
   objecter = new Objecter(g_ceph_context, messenger, monc, NULL, 0, 0);
 }
diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc
index 5fb8997..6beaf98 100644
--- a/src/tools/cephfs/Resetter.cc
+++ b/src/tools/cephfs/Resetter.cc
@@ -17,13 +17,15 @@
 #include "mds/JournalPointer.h"
 
 #include "mds/mdstypes.h"
+#include "mds/MDCache.h"
 #include "mon/MonClient.h"
 #include "mds/events/EResetJournal.h"
 
 #include "Resetter.h"
 
+#define dout_subsys ceph_subsys_mds
 
-void Resetter::reset(int rank)
+int Resetter::reset(int rank)
 {
   Mutex mylock("Resetter::reset::lock");
   Cond cond;
@@ -33,8 +35,9 @@ void Resetter::reset(int rank)
   JournalPointer jp(rank, mdsmap->get_metadata_pool());
   int jp_load_result = jp.load(objecter);
   if (jp_load_result != 0) {
-    std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl;
-    return;
+    std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) <<
+      ", pass --force to forcibly reset this journal" << std::endl;
+    return jp_load_result;
   }
 
   Journaler journaler(jp.front,
@@ -55,10 +58,12 @@ void Resetter::reset(int rank)
     if (r == -ENOENT) {
       cerr << "journal does not exist on-disk. Did you set a bad rank?"
 	   << std::endl;
-      return;
+      std::cerr << "Error loading journal: " << cpp_strerror(r) <<
+        ", pass --force to forcibly reset this journal" << std::endl;
+      return r;
     } else {
-      cerr << "got error " << r << "from Journaler, failling" << std::endl;
-      return;
+      cerr << "got error " << r << "from Journaler, failing" << std::endl;
+      return r;
     }
   }
 
@@ -88,7 +93,70 @@ void Resetter::reset(int rank)
   mylock.Unlock();
     
   lock.Lock();
-  assert(r == 0);
+  if (r != 0) {
+    return r;
+  }
+
+  r = _write_reset_event(&journaler);
+  if (r != 0) {
+    return r;
+  }
+
+  lock.Unlock();
+
+  cout << "done" << std::endl;
+
+  return 0;
+}
+
+int Resetter::reset_hard(int rank)
+{
+  JournalPointer jp(rank, mdsmap->get_metadata_pool());
+  jp.front = rank + MDS_INO_LOG_OFFSET;
+  jp.back = 0;
+  int r = jp.save(objecter);
+  if (r != 0) {
+    derr << "Error writing journal pointer: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  Journaler journaler(jp.front,
+    mdsmap->get_metadata_pool(),
+    CEPH_FS_ONDISK_MAGIC,
+    objecter, 0, 0, &timer, &finisher);
+  journaler.set_writeable();
+
+  ceph_file_layout default_log_layout = MDCache::gen_default_log_layout(*mdsmap);
+  journaler.create(&default_log_layout, g_conf->mds_journal_format);
+
+  C_SaferCond cond;
+  {
+    Mutex::Locker l(lock);
+    journaler.write_head(&cond);
+  }
+  r = cond.wait();
+  if (r != 0) {
+    derr << "Error writing journal header: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  {
+    Mutex::Locker l(lock);
+    r = _write_reset_event(&journaler);
+  }
+  if (r != 0) {
+    derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  dout(4) << "Successfully wrote new journal pointer and header for rank "
+    << rank << dendl;
+  return 0;
+}
+
+int Resetter::_write_reset_event(Journaler *journaler)
+{
+  assert(journaler != NULL);
 
   LogEvent *le = new EResetJournal;
 
@@ -96,17 +164,10 @@ void Resetter::reset(int rank)
   le->encode_with_header(bl);
   
   cout << "writing EResetJournal entry" << std::endl;
-  journaler.append_entry(bl);
-  journaler.flush(new C_SafeCond(&mylock, &cond, &done,&r));
+  C_SaferCond cond;
+  journaler->append_entry(bl);
+  journaler->flush(&cond);
 
-  lock.Unlock();
-
-  mylock.Lock();
-  while (!done)
-    cond.Wait(mylock);
-  mylock.Unlock();
-
-  assert(r == 0);
-
-  cout << "done" << std::endl;
+  return cond.wait();
 }
+
diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h
index be10538..2baaf47 100644
--- a/src/tools/cephfs/Resetter.h
+++ b/src/tools/cephfs/Resetter.h
@@ -17,6 +17,8 @@
 
 #include "MDSUtility.h"
 
+class Journaler;
+
 /**
  * This class lets you reset an mds journal for troubleshooting or whatever.
  *
@@ -24,11 +26,18 @@
  * of the file to dump to.
  */
 class Resetter : public MDSUtility {
-  int rank; 
+protected:
+  int _write_reset_event(Journaler *journaler);
+
 public:
   Resetter() {}
 
-  void reset(int rank);
+  /**
+   * For use when no journal header/pointer was present: write one
+   * out from scratch.
+   */
+  int reset_hard(int rank);
+  int reset(int rank);
 };
 
 #endif /* JOURNAL_RESETTER_H_ */
diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc
index 4b22de0..99a0856 100644
--- a/src/tools/cephfs/TableTool.cc
+++ b/src/tools/cephfs/TableTool.cc
@@ -247,7 +247,7 @@ public:
       version_t version = 1;
       ::encode(version, new_bl);
     }
-    table_inst.encode(new_bl);
+    table_inst.encode_state(new_bl);
 
     // Write out new table
     int r = io->write_full(object_name, new_bl);
@@ -261,14 +261,113 @@ public:
   }
 };
 
+template <typename A>
+class TableHandlerOmap
+{
+private:
+  // The RADOS object ID for the table
+  std::string object_name;
+
+  // The rank in question (may be NONE)
+  mds_rank_t rank;
+
+  // Whether this is an MDSTable subclass (i.e. has leading version field to decode)
+  bool mds_table;
+
+public:
+  TableHandlerOmap(mds_rank_t r, std::string const &name, bool mds_table_)
+    : rank(r), mds_table(mds_table_)
+  {
+    // Compose object name of the table we will dump
+    std::ostringstream oss;
+    oss << "mds";
+    if (rank != MDS_RANK_NONE) {
+      oss << rank;
+    }
+    oss << "_" << name;
+    object_name = oss.str();
+  }
+
+  int load_and_dump(librados::IoCtx *io, Formatter *f)
+  {
+    assert(io != NULL);
+    assert(f != NULL);
+
+    // Read in the header
+    bufferlist header_bl;
+    int r = io->omap_get_header(object_name, &header_bl);
+    if (r != 0) {
+      derr << "error reading header: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    // Decode the header
+    A table_inst;
+    table_inst.set_rank(rank);
+    try {
+      table_inst.decode_header(header_bl);
+    } catch (buffer::error &e) {
+      derr << "table " << object_name << " is corrupt" << dendl;
+      return -EIO;
+    }
+
+    // Read and decode OMAP values in chunks
+    std::string last_key = "";
+    while(true) {
+      std::map<std::string, bufferlist> values;
+      int r = io->omap_get_vals(object_name, last_key,
+          g_conf->mds_sessionmap_keys_per_op, &values);
+
+      if (r != 0) {
+        derr << "error reading values: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      if (values.empty()) {
+        break;
+      }
+
+      try {
+        table_inst.decode_values(values);
+      } catch (buffer::error &e) {
+        derr << "table " << object_name << " is corrupt" << dendl;
+        return -EIO;
+      }
+      last_key = values.rbegin()->first;
+    }
+
+    table_inst.dump(f);
+
+    return 0;
+  }
+
+  int reset(librados::IoCtx *io)
+  {
+    A table_inst;
+    table_inst.set_rank(rank);
+    table_inst.reset_state();
+
+    bufferlist header_bl;
+    table_inst.encode_header(&header_bl);
+
+    // Compose a transaction to clear and write header
+    librados::ObjectWriteOperation op;
+    op.omap_clear();
+    op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
+    op.omap_set_header(header_bl);
+    
+    return io->operate(object_name, &op);
+  }
+};
+
 int TableTool::_show_session_table(mds_rank_t rank, Formatter *f)
 {
-  return TableHandler<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
+  return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
 }
 
 int TableTool::_reset_session_table(mds_rank_t rank, Formatter *f)
 {
-  return TableHandler<SessionMapStore>(rank, "sessionmap", false).reset(&io);
+  return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io);
 }
 
 int TableTool::_show_ino_table(mds_rank_t rank, Formatter *f)
diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc
new file mode 100644
index 0000000..5e9c05a
--- /dev/null
+++ b/src/tools/cephfs/cephfs-data-scan.cc
@@ -0,0 +1,45 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "DataScan.h"
+
+
+int main(int argc, const char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  env_to_vec(args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  DataScan data_scan;
+
+  // Handle --help before calling init() so we don't depend on network.
+  if (args.empty() || (args.size() == 1 && (std::string(args[0]) == "--help" || std::string(args[0]) == "-h"))) {
+    data_scan.usage();
+    return 0;
+  }
+
+  // Connect to mon cluster, download MDS map etc
+  int rc = data_scan.init();
+  if (rc != 0) {
+      std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+      return rc;
+  }
+
+  // Finally, execute the user's commands
+  rc = data_scan.main(args);
+  if (rc != 0) {
+    std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+  }
+
+  data_scan.shutdown();
+
+  return rc;
+}
+
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
index d576b5e..532ffe1 100644
--- a/src/tools/crushtool.cc
+++ b/src/tools/crushtool.cc
@@ -27,6 +27,7 @@
 #include "common/config.h"
 
 #include "common/ceph_argparse.h"
+#include "include/stringify.h"
 #include "global/global_context.h"
 #include "global/global_init.h"
 #include "osd/OSDMap.h"
@@ -105,24 +106,49 @@ cout << "                               OSD id (int), proportional weight (float
 void usage()
 {
   cout << "usage: crushtool ...\n";
+  cout << "\n";
+  cout << "Display, modify and test a crush map\n";
+  cout << "\n";
+  cout << "There are five stages, running one after the other:\n";
+  cout << "\n";
+  cout << " - input/build\n";
+  cout << " - tunables adjustments\n";
+  cout << " - modifications\n";
+  cout << " - display/test\n";
+  cout << " - output\n";
+  cout << "\n";
+  cout << "Options that are not specific to a stage.\n";
+  cout << "\n";
+  cout << "   [--infn|-i infile]\n";
+  cout << "                         read the crush map from infile\n";
+  cout << "\n";
+  cout << "Options for the input/build stage\n";
+  cout << "\n";
   cout << "   --decompile|-d map    decompile a crush map to source\n";
-  cout << "   --tree                print map summary as a tree\n";
-  cout << "   --compile|-c map.txt  compile a map from source\n";
-  cout << "   [-o outfile [--clobber]]\n";
+  cout << "   [--outfn|-o outfile]\n";
   cout << "                         specify output for for (de)compilation\n";
+  cout << "   --compile|-c map.txt  compile a map from source\n";
+  cout << "   --enable-unsafe-tunables compile with unsafe tunables\n";
   cout << "   --build --num_osds N layer1 ...\n";
   cout << "                         build a new map, where each 'layer' is\n";
   cout << "                           'name (uniform|straw|list|tree) size'\n";
-  cout << "   -i mapfn --test       test a range of inputs on the map\n";
-  cout << "      [--min-x x] [--max-x x] [--x x]\n";
-  cout << "      [--min-rule r] [--max-rule r] [--rule r]\n";
-  cout << "      [--num-rep n]\n";
-  cout << "      [--batches b]      split the CRUSH mapping into b > 1 rounds\n";
-  cout << "      [--weight|-w devno weight]\n";
-  cout << "                         where weight is 0 to 1.0\n";
-  cout << "      [--simulate]       simulate placements using a random\n";
-  cout << "                         number generator in place of the CRUSH\n";
-  cout << "                         algorithm\n";
+  cout << "\n";
+  cout << "Options for the tunables adjustments stage\n";
+  cout << "\n";
+  cout << "   --set-choose-local-tries N\n";
+  cout << "                         set choose local retries before re-descent\n";
+  cout << "   --set-choose-local-fallback-tries N\n";
+  cout << "                         set choose local retries using fallback\n";
+  cout << "                         permutation before re-descent\n";
+  cout << "   --set-choose-total-tries N\n";
+  cout << "                         set choose total descent attempts\n";
+  cout << "   --set-chooseleaf-descend-once <0|1>\n";
+  cout << "                         set chooseleaf to (not) retry the recursive descent\n";
+  cout << "   --set-chooseleaf-vary-r <0|1>\n";
+  cout << "                         set chooseleaf to (not) vary r based on parent\n";
+  cout << "\n";
+  cout << "Options for the modifications stage\n";
+  cout << "\n";
   cout << "   -i mapfn --add-item id weight name [--loc type name ...]\n";
   cout << "                         insert an item into the hierarchy at the\n";
   cout << "                         given location\n";
@@ -138,9 +164,20 @@ void usage()
   cout << "\n";
   cout << "Options for the display/test stage\n";
   cout << "\n";
-  cout << "   --check max_id        check if any item is referencing an unknown name/type\n";
+  cout << "   --tree                print map summary as a tree\n";
+  cout << "   --check [max_id]      check if any item is referencing an unknown name/type\n";
   cout << "   -i mapfn --show-location id\n";
   cout << "                         show location for given device id\n";
+  cout << "   -i mapfn --test       test a range of inputs on the map\n";
+  cout << "      [--min-x x] [--max-x x] [--x x]\n";
+  cout << "      [--min-rule r] [--max-rule r] [--rule r] [--ruleset rs]\n";
+  cout << "      [--num-rep n]\n";
+  cout << "      [--batches b]      split the CRUSH mapping into b > 1 rounds\n";
+  cout << "      [--weight|-w devno weight]\n";
+  cout << "                         where weight is 0 to 1.0\n";
+  cout << "      [--simulate]       simulate placements using a random\n";
+  cout << "                         number generator in place of the CRUSH\n";
+  cout << "                         algorithm\n";
   cout << "   --show-utilization    show OSD usage\n";
   cout << "   --show utilization-all\n";
   cout << "                         include zero weight items\n";
@@ -148,17 +185,6 @@ void usage()
   cout << "   --show-mappings       show mappings\n";
   cout << "   --show-bad-mappings   show bad mappings\n";
   cout << "   --show-choose-tries   show choose tries histogram\n";
-  cout << "   --set-choose-local-tries N\n";
-  cout << "                         set choose local retries before re-descent\n";
-  cout << "   --set-choose-local-fallback-tries N\n";
-  cout << "                         set choose local retries using fallback\n";
-  cout << "                         permutation before re-descent\n";
-  cout << "   --set-choose-total-tries N\n";
-  cout << "                         set choose total descent attempts\n";
-  cout << "   --set-chooseleaf-descend-once <0|1>\n";
-  cout << "                         set chooseleaf to (not) retry the recursive descent\n";
-  cout << "   --set-chooseleaf-vary-r <0|1>\n";
-  cout << "                         set chooseleaf to (not) vary r based on parent\n";
   cout << "   --output-name name\n";
   cout << "                         prepend the data file(s) generated during the\n";
   cout << "                         testing routine with name\n";
@@ -166,6 +192,12 @@ void usage()
   cout << "                         export select data generated during testing routine\n";
   cout << "                         to CSV files for off-line post-processing\n";
   cout << "                         use --help-output for more information\n";
+  cout << "\n";
+  cout << "Options for the output stage\n";
+  cout << "\n";
+  cout << "   [--outfn|-o outfile]\n";
+  cout << "                         specify output for for modified crush map\n";
+  cout << "\n";
 }
 
 struct bucket_types_t {
@@ -282,39 +314,39 @@ int main(int argc, const char **argv)
     } else if (ceph_argparse_witharg(args, i, &val, "-c", "--compile", (char*)NULL)) {
       srcfn = val;
       compile = true;
-    } else if (ceph_argparse_withint(args, i, &max_id, &err, "--check", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &max_id, err, "--check", (char*)NULL)) {
       check = true;
     } else if (ceph_argparse_flag(args, i, "-t", "--test", (char*)NULL)) {
       test = true;
-    } else if (ceph_argparse_withint(args, i, &full_location, &err, "--show-location", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &full_location, err, "--show-location", (char*)NULL)) {
     } else if (ceph_argparse_flag(args, i, "-s", "--simulate", (char*)NULL)) {
       tester.set_random_placement();
     } else if (ceph_argparse_flag(args, i, "--enable-unsafe-tunables", (char*)NULL)) {
       unsafe_tunables = true;
-    } else if (ceph_argparse_withint(args, i, &choose_local_tries, &err,
+    } else if (ceph_argparse_witharg(args, i, &choose_local_tries, err,
 				     "--set_choose_local_tries", (char*)NULL)) {
       adjust = true;
-    } else if (ceph_argparse_withint(args, i, &choose_local_fallback_tries, &err,
+    } else if (ceph_argparse_witharg(args, i, &choose_local_fallback_tries, err,
 				     "--set_choose_local_fallback_tries", (char*)NULL)) {
       adjust = true;
-    } else if (ceph_argparse_withint(args, i, &choose_total_tries, &err,
+    } else if (ceph_argparse_witharg(args, i, &choose_total_tries, err,
 				     "--set_choose_total_tries", (char*)NULL)) {
       adjust = true;
-    } else if (ceph_argparse_withint(args, i, &chooseleaf_descend_once, &err,
+    } else if (ceph_argparse_witharg(args, i, &chooseleaf_descend_once, err,
 				     "--set_chooseleaf_descend_once", (char*)NULL)) {
       adjust = true;
-    } else if (ceph_argparse_withint(args, i, &chooseleaf_vary_r, &err,
+    } else if (ceph_argparse_witharg(args, i, &chooseleaf_vary_r, err,
 				     "--set_chooseleaf_vary_r", (char*)NULL)) {
       adjust = true;
-    } else if (ceph_argparse_withint(args, i, &straw_calc_version, &err,
+    } else if (ceph_argparse_witharg(args, i, &straw_calc_version, err,
 				     "--set_straw_calc_version", (char*)NULL)) {
       adjust = true;
-    } else if (ceph_argparse_withint(args, i, &allowed_bucket_algs, &err,
+    } else if (ceph_argparse_witharg(args, i, &allowed_bucket_algs, err,
 				     "--set_allowed_bucket_algs", (char*)NULL)) {
       adjust = true;
     } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) {
       reweight = true;
-    } else if (ceph_argparse_withint(args, i, &add_item, &err, "--add_item", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &add_item, err, "--add_item", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
@@ -331,7 +363,7 @@ int main(int argc, const char **argv)
       }
       add_name.assign(*i);
       i = args.erase(i);
-    } else if (ceph_argparse_withint(args, i, &add_item, &err, "--update_item", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &add_item, err, "--update_item", (char*)NULL)) {
       update_item = true;
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
@@ -386,78 +418,78 @@ int main(int argc, const char **argv)
       i = args.erase(i);
     } else if (ceph_argparse_flag(args, i, "--build", (char*)NULL)) {
       build = true;
-    } else if (ceph_argparse_withint(args, i, &num_osds, &err, "--num_osds", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &num_osds, err, "--num_osds", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--num_rep", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--num_rep", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_num_rep(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--max_x", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--max_x", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_max_x(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--min_x", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--min_x", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_min_x(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--x", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--x", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_x(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--max_rule", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--max_rule", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_max_rule(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--min_rule", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--min_rule", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_min_rule(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--rule", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--rule", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_rule(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--ruleset", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--ruleset", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_ruleset(x);
-    } else if (ceph_argparse_withint(args, i, &x, &err, "--batches", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &x, err, "--batches", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
       tester.set_batches(x);
-    } else if (ceph_argparse_withfloat(args, i, &y, &err, "--mark-down-ratio", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-ratio", (char*)NULL)) {
       if (!err.str().empty()) {
         cerr << err.str() << std::endl;
         exit(EXIT_FAILURE);
       }
       tester.set_device_down_ratio(y);
-    } else if (ceph_argparse_withfloat(args, i, &y, &err, "--mark-down-bucket-ratio", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-bucket-ratio", (char*)NULL)) {
       if (!err.str().empty()) {
         cerr << err.str() << std::endl;
         exit(EXIT_FAILURE);
       }
       tester.set_bucket_down_ratio(y);
-    } else if (ceph_argparse_withint(args, i, &tmp, &err, "--weight", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &tmp, err, "--weight", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
@@ -518,6 +550,8 @@ int main(int argc, const char **argv)
 
   bool modified = false;
 
+  // input ----
+
   if (!infn.empty()) {
     bufferlist bl;
     std::string error;
@@ -545,36 +579,6 @@ int main(int argc, const char **argv)
     crush.decode(p);
   }
 
-  if (full_location >= 0) {
-    map<string, string> loc = crush.get_full_location(full_location);
-    for (map<string,string>::iterator p = loc.begin();
-	 p != loc.end();
-	 ++p) {
-      cout << p->first << "\t" << p->second << std::endl;
-    }
-    exit(0);
-  }
-  if (decompile) {
-    CrushCompiler cc(crush, cerr, verbose);
-    if (!outfn.empty()) {
-      ofstream o;
-      o.open(outfn.c_str(), ios::out | ios::binary | ios::trunc);
-      if (!o.is_open()) {
-	cerr << me << ": error writing '" << outfn << "'" << std::endl;
-	exit(1);
-      }
-      cc.decompile(o);
-      o.close();
-    } else {
-      cc.decompile(cout);
-    }
-  }
-  if (tree) {
-    ostringstream oss;
-    crush.dump_tree(&oss, NULL);
-    dout(1) << "\n" << oss.str() << dendl;
-  }
-
   if (compile) {
     crush.create();
 
@@ -606,11 +610,12 @@ int main(int argc, const char **argv)
     vector<int> lower_items;
     vector<int> lower_weights;
 
+    crush.set_max_devices(num_osds);
     for (int i=0; i<num_osds; i++) {
       lower_items.push_back(i);
       lower_weights.push_back(0x10000);
+      crush.set_item_name(i, "osd." + stringify(i));
     }
-    crush.set_max_devices(num_osds);
 
     int type = 1;
     for (vector<layer_t>::iterator p = layers.begin(); p != layers.end(); ++p, type++) {
@@ -690,12 +695,6 @@ int main(int argc, const char **argv)
       lower_weights.swap(cur_weights);
     }
 
-    {
-      ostringstream oss;
-      crush.dump_tree(&oss, NULL);
-      dout(1) << "\n" << oss.str() << dendl;
-    }
-
     string root = layers.back().size == 0 ? layers.back().name :
       string(layers.back().name) + "0";
 
@@ -717,6 +716,37 @@ int main(int argc, const char **argv)
     modified = true;
   }
 
+  // mutate ----
+
+  if (choose_local_tries >= 0) {
+    crush.set_choose_local_tries(choose_local_tries);
+    modified = true;
+  }
+  if (choose_local_fallback_tries >= 0) {
+    crush.set_choose_local_fallback_tries(choose_local_fallback_tries);
+    modified = true;
+  }
+  if (choose_total_tries >= 0) {
+    crush.set_choose_total_tries(choose_total_tries);
+    modified = true;
+  }
+  if (chooseleaf_descend_once >= 0) {
+    crush.set_chooseleaf_descend_once(chooseleaf_descend_once);
+    modified = true;
+  }
+  if (chooseleaf_vary_r >= 0) {
+    crush.set_chooseleaf_vary_r(chooseleaf_vary_r);
+    modified = true;
+  }
+  if (straw_calc_version >= 0) {
+    crush.set_straw_calc_version(straw_calc_version);
+    modified = true;
+  }
+  if (allowed_bucket_algs >= 0) {
+    crush.set_allowed_bucket_algs(allowed_bucket_algs);
+    modified = true;
+  }
+
   if (!reweight_name.empty()) {
     cout << me << " reweighting item " << reweight_name << " to " << reweight_weight << std::endl;
     int r;
@@ -733,8 +763,8 @@ int main(int argc, const char **argv)
       cerr << me << " " << cpp_strerror(r) << std::endl;
       return r;
     }
-        
   }
+
   if (!remove_name.empty()) {
     cout << me << " removing item " << remove_name << std::endl;
     int r;
@@ -752,6 +782,7 @@ int main(int argc, const char **argv)
       return r;
     }
   }
+
   if (add_item >= 0) {
     int r;
     if (update_item) {
@@ -766,55 +797,40 @@ int main(int argc, const char **argv)
       return r;
     }
   }
+
   if (reweight) {
     crush.reweight(g_ceph_context);
     modified = true;
   }
 
-  if (choose_local_tries >= 0) {
-    crush.set_choose_local_tries(choose_local_tries);
-    modified = true;
-  }
-  if (choose_local_fallback_tries >= 0) {
-    crush.set_choose_local_fallback_tries(choose_local_fallback_tries);
-    modified = true;
-  }
-  if (choose_total_tries >= 0) {
-    crush.set_choose_total_tries(choose_total_tries);
-    modified = true;
-  }
-  if (chooseleaf_descend_once >= 0) {
-    crush.set_chooseleaf_descend_once(chooseleaf_descend_once);
-    modified = true;
-  }
-  if (chooseleaf_vary_r >= 0) {
-    crush.set_chooseleaf_vary_r(chooseleaf_vary_r);
-    modified = true;
-  }
-  if (straw_calc_version >= 0) {
-    crush.set_straw_calc_version(straw_calc_version);
-    modified = true;
-  }
-  if (allowed_bucket_algs >= 0) {
-    crush.set_allowed_bucket_algs(allowed_bucket_algs);
-    modified = true;
+
+  // display ---
+  if (full_location >= 0) {
+    map<string, string> loc = crush.get_full_location(full_location);
+    for (map<string,string>::iterator p = loc.begin();
+	 p != loc.end();
+	 ++p) {
+      cout << p->first << "\t" << p->second << std::endl;
+    }
   }
 
- if (modified) {
-   crush.finalize();
+  if (tree) {
+    crush.dump_tree(&cout, NULL);
+  }
 
-    if (outfn.empty()) {
-      cout << me << " successfully built or modified map.  Use '-o <file>' to write it out." << std::endl;
-    } else {
-      bufferlist bl;
-      crush.encode(bl);
-      int r = bl.write_file(outfn.c_str());
-      if (r < 0) {
-	cerr << me << ": error writing '" << outfn << "': " << cpp_strerror(r) << std::endl;
+  if (decompile) {
+    CrushCompiler cc(crush, cerr, verbose);
+    if (!outfn.empty()) {
+      ofstream o;
+      o.open(outfn.c_str(), ios::out | ios::binary | ios::trunc);
+      if (!o.is_open()) {
+	cerr << me << ": error writing '" << outfn << "'" << std::endl;
 	exit(1);
       }
-      if (verbose)
-	cout << "wrote crush map to " << outfn << std::endl;
+      cc.decompile(o);
+      o.close();
+    } else {
+      cc.decompile(cout);
     }
   }
 
@@ -834,6 +850,25 @@ int main(int argc, const char **argv)
       exit(1);
   }
 
+  // output ---
+  if (modified) {
+    crush.finalize();
+
+    if (outfn.empty()) {
+      cout << me << " successfully built or modified map.  Use '-o <file>' to write it out." << std::endl;
+    } else {
+      bufferlist bl;
+      crush.encode(bl);
+      int r = bl.write_file(outfn.c_str());
+      if (r < 0) {
+	cerr << me << ": error writing '" << outfn << "': " << cpp_strerror(r) << std::endl;
+	exit(1);
+      }
+      if (verbose)
+	cout << "wrote crush map to " << outfn << std::endl;
+    }
+  }
+
   return 0;
 }
 /*
diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc
index 76c05a0..810f82a 100644
--- a/src/tools/osdmaptool.cc
+++ b/src/tools/osdmaptool.cc
@@ -53,8 +53,9 @@ int main(int argc, const char **argv)
 
   std::string fn;
   bool print = false;
-  bool print_json = false;
+  boost::scoped_ptr<Formatter> print_formatter;
   bool tree = false;
+  boost::scoped_ptr<Formatter> tree_formatter;
   bool createsimple = false;
   bool create_from_conf = false;
   int num_osd = 0;
@@ -82,11 +83,17 @@ int main(int argc, const char **argv)
       usage();
     } else if (ceph_argparse_flag(args, i, "-p", "--print", (char*)NULL)) {
       print = true;
-    } else if (ceph_argparse_flag(args, i, "--dump-json", (char*)NULL)) {
-      print_json = true;
-    } else if (ceph_argparse_flag(args, i, "--tree", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &val, err, "--dump", (char*)NULL)) {
+      print = true;
+      if (!val.empty() && val != "plain") {
+	print_formatter.reset(Formatter::create(val, "", "json"));
+      }
+    } else if (ceph_argparse_witharg(args, i, &val, err, "--tree", (char*)NULL)) {
       tree = true;
-    } else if (ceph_argparse_withint(args, i, &num_osd, &err, "--createsimple", (char*)NULL)) {
+      if (!val.empty() && val != "plain") {
+	tree_formatter.reset(Formatter::create(val, "", "json"));
+      }
+    } else if (ceph_argparse_witharg(args, i, &num_osd, err, "--createsimple", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
@@ -106,12 +113,12 @@ int main(int argc, const char **argv)
       test_random = true;
     } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) {
       clobber = true;
-    } else if (ceph_argparse_withint(args, i, &pg_bits, &err, "--pg_bits", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &pg_bits, err, "--pg_bits", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
       }
-    } else if (ceph_argparse_withint(args, i, &pgp_bits, &err, "--pgp_bits", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &pgp_bits, err, "--pgp_bits", (char*)NULL)) {
       if (!err.str().empty()) {
 	cerr << err.str() << std::endl;
 	exit(EXIT_FAILURE);
@@ -126,9 +133,9 @@ int main(int argc, const char **argv)
       test_map_object = val;
     } else if (ceph_argparse_flag(args, i, "--test_crush", (char*)NULL)) {
       test_crush = true;
-    } else if (ceph_argparse_withint(args, i, &range_first, &err, "--range_first", (char*)NULL)) {
-    } else if (ceph_argparse_withint(args, i, &range_last, &err, "--range_last", (char*)NULL)) {
-    } else if (ceph_argparse_withint(args, i, &pool, &err, "--pool", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &range_first, err, "--range_first", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &range_last, err, "--range_last", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &pool, err, "--pool", (char*)NULL)) {
       if (!err.str().empty()) {
         cerr << err.str() << std::endl;
         exit(EXIT_FAILURE);
@@ -450,7 +457,7 @@ int main(int argc, const char **argv)
     }
   }
 
-  if (!print && !print_json && !tree && !modified && 
+  if (!print && !tree && !modified &&
       export_crush.empty() && import_crush.empty() && 
       test_map_pg.empty() && test_map_object.empty() &&
       !test_map_pgs && !test_map_pgs_dump) {
@@ -461,13 +468,28 @@ int main(int argc, const char **argv)
   if (modified)
     osdmap.inc_epoch();
 
-  if (print) 
-    osdmap.print(cout);
-  if (print_json)
-    osdmap.dump_json(cout);
-  if (tree) 
-    osdmap.print_tree(&cout, NULL);
+  if (print) {
+    if (print_formatter) {
+      print_formatter->open_object_section("osdmap");
+      osdmap.dump(print_formatter.get());
+      print_formatter->close_section();
+      print_formatter->flush(cout);
+    } else {
+      osdmap.print(cout);
+    }
+  }
 
+  if (tree) {
+    if (tree_formatter) {
+      tree_formatter->open_object_section("tree");
+      osdmap.print_tree(tree_formatter.get(), NULL);
+      tree_formatter->close_section();
+      tree_formatter->flush(cout);
+      cout << std::endl;
+    } else {
+      osdmap.print_tree(NULL, &cout);
+    }
+  }
   if (modified) {
     bl.clear();
     osdmap.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT | CEPH_FEATURE_RESERVED);
diff --git a/src/tools/psim.cc b/src/tools/psim.cc
index 3022b1c..04a1987 100644
--- a/src/tools/psim.cc
+++ b/src/tools/psim.cc
@@ -32,22 +32,22 @@ int main(int argc, char **argv)
   int count[n];
   int first_count[n];
   int primary_count[n];
+  int size[4];
+
+  memset(count, 0, sizeof(count));
+  memset(first_count, 0, sizeof(first_count));
+  memset(primary_count, 0, sizeof(primary_count));
+  memset(size, 0, sizeof(size));
+
   for (int i=0; i<n; i++) {
     osdmap.set_state(i, osdmap.get_state(i) | CEPH_OSD_UP);
     //if (i<12)
       osdmap.set_weight(i, CEPH_OSD_IN);
-    count[i] = 0;
-    first_count[i] = 0;
-    primary_count[i] = 0;
   }
 
   //pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(0);
   //p->type = pg_pool_t::TYPE_ERASURE;
 
-  int size[4];
-  for (int i=0; i<4; i++)
-    size[i] = 0;
-
   for (int n = 0; n < 10; n++) {   // namespaces
     char nspace[20];
     snprintf(nspace, sizeof(nspace), "n%d", n);
diff --git a/src/tools/rados/PoolDump.cc b/src/tools/rados/PoolDump.cc
new file mode 100644
index 0000000..5d0b3ed
--- /dev/null
+++ b/src/tools/rados/PoolDump.cc
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+
+#include "PoolDump.h"
+
+using namespace librados;
+
+#define dout_subsys ceph_subsys_rados
+
+/**
+ * Export RADOS objects from a live cluster
+ * to a serialized format via a file descriptor.
+ *
+ * @returns 0 on success, else error code
+ */
+int PoolDump::dump(IoCtx *io_ctx)
+{
+  assert(io_ctx != NULL);
+
+  int r = 0;
+  write_super();
+
+  r = write_simple(TYPE_POOL_BEGIN, file_fd);
+  if (r != 0) {
+    return r;
+  }
+
+  io_ctx->set_namespace(all_nspaces);
+  librados::NObjectIterator i = io_ctx->nobjects_begin();
+
+  librados::NObjectIterator i_end = io_ctx->nobjects_end();
+  for (; i != i_end; ++i) {
+    const std::string oid = i->get_oid();
+    dout(10) << "OID '" << oid << "'" << dendl;
+
+    // Compose OBJECT_BEGIN
+    // ====================
+    object_begin obj_begin;
+    obj_begin.hoid.hobj.oid = i->get_oid();
+    obj_begin.hoid.hobj.nspace = i->get_nspace();
+    obj_begin.hoid.hobj.set_key(i->get_locator());
+
+    // Only output head, RadosImport only wants that
+    obj_begin.hoid.hobj.snap = CEPH_NOSNAP;
+
+    // Skip setting object_begin.oi, RadosImport doesn't care
+
+    r = write_section(TYPE_OBJECT_BEGIN, obj_begin, file_fd);
+    if (r != 0) {
+      return r;
+    }
+
+    // Compose TYPE_DATA chunks
+    // ========================
+    const uint32_t op_size = 4096 * 1024;
+    uint64_t offset = 0;
+    io_ctx->set_namespace(i->get_nspace());
+    while (true) {
+      bufferlist outdata;
+      r = io_ctx->read(oid, outdata, op_size, offset);
+      if (r <= 0) {
+        // Error or no data
+        break;
+      }
+
+      r = write_section(TYPE_DATA,
+          data_section(offset, outdata.length(), outdata), file_fd);
+      if (r != 0) {
+        // Output stream error
+        return r;
+      }
+
+      if (outdata.length() < op_size) {
+        // No more data
+        r = 0;
+        break;
+      }
+      offset += outdata.length();
+    }
+
+    // Compose TYPE_ATTRS chunk
+    // ========================
+    std::map<std::string, bufferlist> raw_xattrs;
+    std::map<std::string, bufferlist> xattrs;
+    r = io_ctx->getxattrs(oid, raw_xattrs);
+    if (r < 0) {
+      cerr << "error getting xattr set " << oid << ": " << cpp_strerror(r)
+           << std::endl;
+      return r;
+    }
+    // Prepend "_" to mimic how user keys are represented in a pg export
+    for (std::map<std::string, bufferlist>::iterator i = raw_xattrs.begin();
+         i != raw_xattrs.end(); ++i) {
+      std::pair< std::string, bufferlist> item(std::string("_") + std::string(i->first.c_str()), i->second);
+      xattrs.insert(item);
+    }
+    r = write_section(TYPE_ATTRS, attr_section(xattrs), file_fd);
+    if (r != 0) {
+      return r;
+    }
+
+    // Compose TYPE_OMAP_HDR section
+    // =============================
+    bufferlist omap_header;
+    r = io_ctx->omap_get_header(oid, &omap_header);
+    if (r < 0) {
+      cerr << "error getting omap header " << oid
+	   << ": " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    r = write_section(TYPE_OMAP_HDR, omap_hdr_section(omap_header), file_fd);
+    if (r != 0) {
+      return r;
+    }
+
+    // Compose TYPE_OMAP
+    int MAX_READ = 512;
+    string last_read = "";
+    do {
+      map<string, bufferlist> values;
+      r = io_ctx->omap_get_vals(oid, last_read, MAX_READ, &values);
+      if (r < 0) {
+	cerr << "error getting omap keys " << oid << ": "
+	     << cpp_strerror(r) << std::endl;
+	return r;
+      }
+      if (values.size()) {
+        last_read = values.rbegin()->first;
+      } else {
+        break;
+      }
+
+      r = write_section(TYPE_OMAP, omap_section(values), file_fd);
+      if (r != 0) {
+        return r;
+      }
+      r = values.size();
+    } while (r == MAX_READ);
+    r = 0;
+
+    // Close object
+    // =============
+    r = write_simple(TYPE_OBJECT_END, file_fd);
+    if (r != 0) {
+      return r;
+    }
+  }
+
+  r = write_simple(TYPE_POOL_END, file_fd);
+#if defined(__linux__)
+  if (file_fd != STDOUT_FILENO)
+    posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+  return r;
+}
diff --git a/src/tools/rados/PoolDump.h b/src/tools/rados/PoolDump.h
new file mode 100644
index 0000000..6b4eae5
--- /dev/null
+++ b/src/tools/rados/PoolDump.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef POOL_DUMP_H_
+#define POOL_DUMP_H_
+
+#include "tools/RadosDump.h"
+
+namespace librados {
+    class IoCtx;
+}
+
+class PoolDump : public RadosDump
+{
+  public:
+    PoolDump(int file_fd_) : RadosDump(file_fd_, false) {}
+    int dump(librados::IoCtx *io_ctx);
+};
+
+#endif // POOL_DUMP_H_
diff --git a/src/tools/rados/RadosImport.cc b/src/tools/rados/RadosImport.cc
new file mode 100644
index 0000000..1f74af2
--- /dev/null
+++ b/src/tools/rados/RadosImport.cc
@@ -0,0 +1,377 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include "common/errno.h"
+
+#include "osd/PGLog.h"
+#include "RadosImport.h"
+
+#define dout_subsys ceph_subsys_rados
+
+int RadosImport::import(std::string pool, bool no_overwrite)
+{
+  librados::IoCtx ioctx;
+  librados::Rados cluster;
+
+  char *id = getenv("CEPH_CLIENT_ID");
+  if (id) cerr << "Client id is: " << id << std::endl;
+  int ret = cluster.init(id);
+  if (ret) {
+    cerr << "Error " << ret << " in cluster.init" << std::endl;
+    return ret;
+  }
+  ret = cluster.conf_read_file(NULL);
+  if (ret) {
+    cerr << "Error " << ret << " in cluster.conf_read_file" << std::endl;
+    return ret;
+  }
+  ret = cluster.conf_parse_env(NULL);
+  if (ret) {
+    cerr << "Error " << ret << " in cluster.conf_parse_env" << std::endl;
+    return ret;
+  }
+  cluster.connect();
+
+  ret = cluster.ioctx_create(pool.c_str(), ioctx);
+  if (ret < 0) {
+    cerr << "ioctx_create " << pool << " failed with " << ret << std::endl;
+    return ret;
+  }
+
+  return import(ioctx, no_overwrite);
+}
+
+int RadosImport::import(librados::IoCtx &io_ctx, bool no_overwrite)
+{
+  bufferlist ebl;
+  pg_info_t info;
+  PGLog::IndexedLog log;
+
+  int ret = read_super();
+  if (ret)
+    return ret;
+
+  if (sh.magic != super_header::super_magic) {
+    cerr << "Invalid magic number: 0x"
+      << std::hex << sh.magic << " vs. 0x" << super_header::super_magic
+      << std::dec << std::endl;
+    return -EFAULT;
+  }
+
+  if (sh.version > super_header::super_ver) {
+    cerr << "Can't handle export format version=" << sh.version << std::endl;
+    return -EINVAL;
+  }
+
+  //First section must be TYPE_PG_BEGIN
+  sectiontype_t type;
+  ret = read_section(&type, &ebl);
+  if (ret)
+    return ret;
+
+  bool pool_mode = false;
+  if (type == TYPE_POOL_BEGIN) {
+    pool_mode = true;
+    cout << "Importing pool" << std::endl;
+  } else if (type == TYPE_PG_BEGIN) {
+    bufferlist::iterator ebliter = ebl.begin();
+    pg_begin pgb;
+    pgb.decode(ebliter);
+    spg_t pgid = pgb.pgid;
+    if (!pgid.is_no_shard()) {
+      cerr << "Importing Erasure Coded shard is not supported" << std::endl;
+      return -EOPNOTSUPP;
+    }
+    dout(10) << "Exported features: " << pgb.superblock.compat_features << dendl;
+    cout << "Importing from pgid " << pgid << std::endl;
+  } else {
+    cerr << "Invalid initial section code " << type << std::endl;
+    return -EFAULT;
+  }
+
+  // XXX: How to check export features?
+#if 0
+  if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+    cerr << "Export has incompatible features set "
+      << pgb.superblock.compat_features << std::endl;
+    return -EINVAL;
+  }
+#endif
+
+#if defined(__linux__)
+  if (file_fd != STDIN_FILENO)
+    posix_fadvise(file_fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+
+  bool done = false;
+  bool found_metadata = false;
+  while(!done) {
+    ret = read_section(&type, &ebl);
+    if (ret)
+      return ret;
+
+    //cout << "do_import: Section type " << hex << type << dec << std::endl;
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_OBJECT_BEGIN:
+      ret = get_object_rados(io_ctx, ebl, no_overwrite);
+      if (ret) {
+        cerr << "Error inserting object: " << ret << std::endl;
+        return ret;
+      }
+      break;
+    case TYPE_PG_METADATA:
+      dout(10) << "Don't care about the old metadata" << dendl;
+      found_metadata = true;
+      break;
+    case TYPE_PG_END:
+      done = true;
+      break;
+    case TYPE_POOL_END:
+      done = true;
+      break;
+    default:
+      return -EFAULT;
+    }
+  }
+
+  if (!(pool_mode || found_metadata)) {
+    cerr << "Missing metadata section!" << std::endl;
+  }
+
+#if defined(__linux__)
+  if (file_fd != STDIN_FILENO)
+    posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+  return 0;
+}
+
+int RadosImport::get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite)
+{
+  bufferlist::iterator ebliter = bl.begin();
+  object_begin ob;
+  ob.decode(ebliter);
+  map<string,bufferlist>::iterator i;
+  bufferlist abl;
+  bool skipping;
+
+  data_section ds;
+  attr_section as;
+  omap_hdr_section oh;
+  omap_section os;
+
+  assert(g_ceph_context);
+  if (ob.hoid.hobj.nspace == g_ceph_context->_conf->osd_hit_set_namespace) {
+    cout << "Skipping internal object " << ob.hoid << std::endl;
+    skip_object(bl);
+    return 0;
+  }
+
+  if (!ob.hoid.hobj.is_head()) {
+    cout << "Skipping non-head for " << ob.hoid << std::endl;
+    skip_object(bl);
+    return 0;
+  }
+
+  ioctx.set_namespace(ob.hoid.hobj.get_namespace());
+
+  string msg("Write");
+  skipping = false;
+  if (dry_run) {
+    uint64_t psize;
+    time_t pmtime;
+    int ret = ioctx.stat(ob.hoid.hobj.oid.name, &psize, &pmtime);
+    if (ret == 0) {
+      if (no_overwrite)
+        // Could set skipping, but dry-run doesn't change anything either
+        msg = "Skipping existing";
+      else
+        msg = "***Overwrite***";
+    }
+  } else {
+    int ret = ioctx.create(ob.hoid.hobj.oid.name, true);
+    if (ret && ret != -EEXIST) {
+      cerr << "create failed: " << cpp_strerror(ret) << std::endl;
+      return ret;
+    }
+    if (ret == -EEXIST) {
+      if (no_overwrite) {
+        msg = "Skipping existing";
+        skipping = true;
+      } else {
+        msg = "***Overwrite***";
+        ret = ioctx.remove(ob.hoid.hobj.oid.name);
+        if (ret < 0) {
+          cerr << "remove failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+        ret = ioctx.create(ob.hoid.hobj.oid.name, true);
+        // If object re-appeared after removal, let's just skip it
+        if (ret == -EEXIST) {
+          skipping = true;
+          msg = "Skipping in-use object";
+          ret = 0;
+        }
+        if (ret < 0) {
+          cerr << "create failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+      }
+    }
+  }
+
+  cout << msg << " " << ob.hoid << std::endl;
+
+  bool need_align = false;
+  uint64_t alignment = 0;
+  if (align) {
+    need_align = true;
+    alignment = align;
+  } else {
+    if ((need_align = ioctx.pool_requires_alignment()))
+      alignment = ioctx.pool_required_alignment();
+  }
+
+  if (need_align) {
+    dout(10) << "alignment = " << alignment << dendl;
+  }
+
+  bufferlist ebl, databl;
+  uint64_t in_offset = 0, out_offset = 0;
+  bool done = false;
+  while(!done) {
+    sectiontype_t type;
+    int ret = read_section(&type, &ebl);
+    if (ret) {
+      cerr << "Error reading section: " << ret << std::endl;
+      return ret;
+    }
+
+    ebliter = ebl.begin();
+    //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
+    //cout << "\t\tsection size " << ebl.length() << std::endl;
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown object section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_DATA:
+      ds.decode(ebliter);
+      dout(10) << "\tdata: offset " << ds.offset << " len " << ds.len << dendl;
+      if (need_align) {
+        if (ds.offset != in_offset) {
+          cerr << "Discontiguous object data in export" << std::endl;
+          return -EFAULT;
+        }
+        assert(ds.databl.length() == ds.len);
+        databl.claim_append(ds.databl);
+        in_offset += ds.len;
+        if (databl.length() >= alignment) {
+          uint64_t rndlen = uint64_t(databl.length() / alignment) * alignment;
+          dout(10) << "write offset=" << out_offset << " len=" << rndlen << dendl;
+          if (!dry_run && !skipping) {
+            ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset);
+            if (ret) {
+              cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+              return ret;
+            }
+          }
+          out_offset += rndlen;
+          bufferlist n;
+          if (databl.length() > rndlen) {
+            assert(databl.length() - rndlen < alignment);
+	    n.substr_of(databl, rndlen, databl.length() - rndlen);
+          }
+          databl = n;
+        }
+        break;
+      }
+      if (!dry_run && !skipping) {
+        ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset);
+        if (ret) {
+          cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+      }
+      break;
+    case TYPE_ATTRS:
+      as.decode(ebliter);
+
+      dout(10) << "\tattrs: len " << as.data.size() << dendl;
+      if (dry_run || skipping)
+        break;
+      for (std::map<string,bufferlist>::iterator i = as.data.begin();
+          i != as.data.end(); ++i) {
+        if (i->first == "_" || i->first == "snapset")
+          continue;
+        ret = ioctx.setxattr(ob.hoid.hobj.oid.name, i->first.substr(1).c_str(), i->second);
+        if (ret) {
+          cerr << "setxattr failed: " << cpp_strerror(ret) << std::endl;
+          if (ret != -EOPNOTSUPP)
+            return ret;
+        }
+      }
+      break;
+    case TYPE_OMAP_HDR:
+      oh.decode(ebliter);
+
+      dout(10) << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length())
+        << dendl;
+      if (dry_run || skipping)
+        break;
+      ret = ioctx.omap_set_header(ob.hoid.hobj.oid.name, oh.hdr);
+      if (ret) {
+        cerr << "omap_set_header failed: " << cpp_strerror(ret) << std::endl;
+        if (ret != -EOPNOTSUPP)
+          return ret;
+      }
+      break;
+    case TYPE_OMAP:
+      os.decode(ebliter);
+
+      dout(10) << "\tomap: size " << os.omap.size() << dendl;
+      if (dry_run || skipping)
+        break;
+      ret = ioctx.omap_set(ob.hoid.hobj.oid.name, os.omap);
+      if (ret) {
+        cerr << "omap_set failed: " << cpp_strerror(ret) << std::endl;
+        if (ret != -EOPNOTSUPP)
+          return ret;
+      }
+      break;
+    case TYPE_OBJECT_END:
+      done = true;
+      if (need_align && databl.length() > 0) {
+        assert(databl.length() < alignment);
+        dout(10) << "END write offset=" << out_offset << " len=" << databl.length() << dendl;
+        if (dry_run || skipping)
+          break;
+        ret = ioctx.write(ob.hoid.hobj.oid.name, databl, databl.length(), out_offset);
+        if (ret) {
+          cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+      }
+      break;
+    default:
+      cerr << "Unexpected section type " << type << std::endl;
+      return -EFAULT;
+    }
+  }
+  return 0;
+}
diff --git a/src/tools/rados/RadosImport.h b/src/tools/rados/RadosImport.h
new file mode 100644
index 0000000..3ce3690
--- /dev/null
+++ b/src/tools/rados/RadosImport.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef RADOS_IMPORT_H_
+#define RADOS_IMPORT_H_
+
+#include <string>
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+
+#include "tools/RadosDump.h"
+
+/**
+ * Specialization of RadosDump that adds
+ * methods for importing objects from a stream
+ * to a live cluster.
+ */
+class RadosImport : public RadosDump
+{
+  protected:
+    uint64_t align;
+    int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite);
+
+  public:
+    RadosImport(int file_fd_, uint64_t align_, bool dry_run_)
+      : RadosDump(file_fd_, dry_run_), align(align_)
+    {}
+
+    int import(std::string pool, bool no_overwrite);
+    int import(librados::IoCtx &io_ctx, bool no_overwrite);
+};
+
+#endif // RADOS_IMPORT_H_
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
index 4b96ac5..4f4b086 100644
--- a/src/tools/rados/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -16,8 +16,8 @@
 
 #include "include/rados/librados.hpp"
 #include "include/rados/rados_types.hpp"
-#include "rados_sync.h"
-using namespace librados;
+#include "include/radosstriper/libradosstriper.hpp"
+using namespace libradosstriper;
 
 #include "common/config.h"
 #include "common/ceph_argparse.h"
@@ -45,9 +45,14 @@ using namespace librados;
 #include "include/compat.h"
 #include "common/hobject.h"
 
+#include "PoolDump.h"
+#include "RadosImport.h"
+
 int rados_tool_sync(const std::map < std::string, std::string > &opts,
                              std::vector<const char*> &args);
 
+using namespace librados;
+
 // two steps seem to be necessary to do this right
 #define STR(x) _STR(x)
 #define _STR(x) #x
@@ -63,6 +68,8 @@ void usage(ostream& out)
 "   cppool <pool-name> <dest-pool>   copy content of a pool\n"
 "   rmpool <pool-name> [<pool-name> --yes-i-really-really-mean-it]\n"
 "                                    remove pool <pool-name>'\n"
+"   purge <pool-name> --yes-i-really-really-mean-it\n"
+"                                    remove all objects from pool <pool-name> without removing it\n"
 "   df                               show per-pool and total usage\n"
 "   ls                               list objects in pool\n\n"
 "   chown 123                        change the pool owner to auid 123\n"
@@ -107,23 +114,16 @@ void usage(ostream& out)
 "   setomapheader <obj-name> <val>\n"
 "   tmap-to-omap <obj-name>          convert tmap keys/values to omap\n"
 "   watch <obj-name>                 add watcher on this object\n"
-"   notify <obj-name> <message>      notify wather of this object with message\n"
+"   notify <obj-name> <message>      notify watcher of this object with message\n"
 "   listwatchers <obj-name>          list the watchers of this object\n"
 "   set-alloc-hint <obj-name> <expected-object-size> <expected-write-size>\n"
 "                                    set allocation hint for an object\n"
 "\n"
 "IMPORT AND EXPORT\n"
-"   import [options] <local-directory> <rados-pool>\n"
-"       Upload <local-directory> to <rados-pool>\n"
-"   export [options] <rados-pool> <local-directory>\n"
-"       Download <rados-pool> to <local-directory>\n"
-"   options:\n"
-"       -f / --force                 Copy everything, even if it hasn't changed.\n"
-"       -d / --delete-after          After synchronizing, delete unreferenced\n"
-"                                    files or objects from the target bucket\n"
-"                                    or directory.\n"
-"       --workers                    Number of worker threads to spawn \n"
-"                                    (default " STR(DEFAULT_NUM_RADOS_WORKER_THREADS) ")\n"
+"   export [filename]\n"
+"       Serialize pool contents to a file or standard out.\n"
+"   import [--dry-run] [--no-overwrite] < filename | - >\n"
+"       Load pool contents from a file or standard in\n"
 "\n"
 "ADVISORY LOCKS\n"
 "   lock list <obj-name>\n"
@@ -158,7 +158,7 @@ void usage(ostream& out)
 "   --target-pool=pool\n"
 "        select target pool by name\n"
 "   -b op_size\n"
-"        set the size of write ops for put or benchmarking\n"
+"        set the block size for put/get ops and for write benchmarking\n"
 "   -s name\n"
 "   --snap name\n"
 "        select given snap name for (read) IO\n"
@@ -178,6 +178,10 @@ void usage(ostream& out)
 "        Use with cp to specify the locator of the new object\n"
 "   --target-nspace\n"
 "        Use with cp to specify the namespace of the new object\n"
+"   --striper\n"
+"        Use radosstriper interface rather than pure rados\n"
+"        Available for stat, get, put, truncate, rm, ls and \n"
+"        all xattr related operations\n"
 "\n"
 "BENCH OPTIONS:\n"
 "   -t N\n"
@@ -185,16 +189,19 @@ void usage(ostream& out)
 "        Set number of concurrent I/O operations\n"
 "   --show-time\n"
 "        prefix output with date/time\n"
+"   --no-verify\n"
+"        do not verify contents of read objects\n"
 "\n"
 "LOAD GEN OPTIONS:\n"
 "   --num-objects                    total number of objects\n"
 "   --min-object-size                min object size\n"
 "   --max-object-size                max object size\n"
-"   --min-ops                        min number of operations\n"
+"   --min-op-len                     min io size of operations\n"
+"   --max-op-len                     max io size of operations\n"
 "   --max-ops                        max number of operations\n"
-"   --max-backlog                    max backlog (in MB)\n"
-"   --percent                        percent of operations that are read\n"
-"   --target-throughput              target throughput (in MB)\n"
+"   --max-backlog                    max backlog size\n"
+"   --read-percent                   percent of operations that are read\n"
+"   --target-throughput              target throughput (in bytes)\n"
 "   --run-length                     total time (in seconds)\n"
     ;
 }
@@ -245,7 +252,9 @@ static int dump_data(std::string const &filename, bufferlist const &data)
 }
 
 
-static int do_get(IoCtx& io_ctx, const char *objname, const char *outfile, unsigned op_size)
+static int do_get(IoCtx& io_ctx, RadosStriper& striper,
+		  const char *objname, const char *outfile, unsigned op_size,
+		  bool use_striper)
 {
   string oid(objname);
 
@@ -265,7 +274,11 @@ static int do_get(IoCtx& io_ctx, const char *objname, const char *outfile, unsig
   int ret;
   while (true) {
     bufferlist outdata;
-    ret = io_ctx.read(oid, outdata, op_size, offset);
+    if (use_striper) {
+      ret = striper.read(oid, &outdata, op_size, offset);
+    } else {
+      ret = io_ctx.read(oid, outdata, op_size, offset);
+    }
     if (ret <= 0) {
       goto out;
     }
@@ -289,8 +302,12 @@ static int do_get(IoCtx& io_ctx, const char *objname, const char *outfile, unsig
 static int do_copy(IoCtx& io_ctx, const char *objname,
 		   IoCtx& target_ctx, const char *target_obj)
 {
+  __le32 src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+  __le32 dest_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
   ObjectWriteOperation op;
-  op.copy_from(objname, io_ctx, 0);
+  op.copy_from2(objname, io_ctx, 0, src_fadvise_flags);
+  op.set_op_flags2(dest_fadvise_flags);
+
   return target_ctx.operate(target_obj, &op);
 }
 
@@ -355,7 +372,9 @@ static int do_copy_pool(Rados& rados, const char *src_pool, const char *target_p
   return 0;
 }
 
-static int do_put(IoCtx& io_ctx, const char *objname, const char *infile, int op_size)
+static int do_put(IoCtx& io_ctx, RadosStriper& striper,
+		  const char *objname, const char *infile, int op_size,
+		  bool use_striper)
 {
   string oid(objname);
   bufferlist indata;
@@ -383,7 +402,11 @@ static int do_put(IoCtx& io_ctx, const char *objname, const char *infile, int op
     }
     if (count == 0) {
       if (!offset) { // in case we have to create an empty object
-	ret = io_ctx.write_full(oid, indata); // indata is empty
+	if (use_striper) {
+	  ret = striper.write_full(oid, indata); // indata is empty
+	} else {
+	  ret = io_ctx.write_full(oid, indata); // indata is empty
+	}
 	if (ret < 0) {
 	  goto out;
 	}
@@ -391,10 +414,17 @@ static int do_put(IoCtx& io_ctx, const char *objname, const char *infile, int op
       continue;
     }
     indata.append(buf, count);
-    if (offset == 0)
-      ret = io_ctx.write_full(oid, indata);
-    else
-      ret = io_ctx.write(oid, indata, count, offset);
+    if (use_striper) {
+      if (offset == 0)
+	ret = striper.write_full(oid, indata);
+      else
+	ret = striper.write(oid, indata, count, offset);
+    } else {
+      if (offset == 0)
+	ret = io_ctx.write_full(oid, indata);
+      else
+	ret = io_ctx.write(oid, indata, count, offset);
+    }
     indata.clear();
 
     if (ret < 0) {
@@ -500,8 +530,8 @@ public:
     LoadGen *lg;
     librados::AioCompletion *completion;
 
-    LoadGenOp() {}
-    LoadGenOp(LoadGen *_lg) : lg(_lg), completion(NULL) {}
+    LoadGenOp() : id(0), type(0), off(0), len(0), lg(NULL), completion(NULL) {}
+    LoadGenOp(LoadGen *_lg) : id(0), type(0), off(0), len(0), lg(_lg), completion(NULL) {}
   };
 
   int max_op;
@@ -543,13 +573,14 @@ public:
     min_op_len = 1024;
     target_throughput = 5 * 1024 * 1024; // B/sec
     max_op_len = 2 * 1024 * 1024;
+    max_ops = 16; 
     max_backlog = target_throughput * 2;
     run_length = 60;
 
     total_sent = 0;
     total_completed = 0;
     num_objs = 200;
-    max_op = 16;
+    max_op = 0;
   }
   int bootstrap(const char *pool);
   int run();
@@ -561,8 +592,10 @@ public:
     Mutex::Locker l(lock);
 
     double rate = (double)cur_completed_rate() / (1024 * 1024);
+    std::streamsize original_precision = cout.precision();
     cout.precision(3);
     cout << "op " << op->id << " completed, throughput=" << rate  << "MB/sec" << std::endl;
+    cout.precision(original_precision);
 
     map<int, LoadGenOp *>::iterator iter = pending_ops.find(op->id);
     if (iter != pending_ops.end())
@@ -740,8 +773,10 @@ int LoadGen::run()
     if (now - stamp_time >= utime_t(1, 0)) {
       double rate = (double)cur_completed_rate() / (1024 * 1024);
       ++total_sec;
+      std::streamsize original_precision = cout.precision();
       cout.precision(3);
       cout << setw(5) << total_sec << ": throughput=" << rate  << "MB/sec" << " pending data=" << sent - completed << std::endl;
+      cout.precision(original_precision);
       stamp_time = now; 
     }
 
@@ -850,7 +885,7 @@ protected:
     return completions[slot]->get_return_value();
   }
 
-  bool get_objects(std::list<std::string>* objects, int num) {
+  bool get_objects(std::list<Object>* objects, int num) {
     int count = 0;
 
     if (!iterator_valid) {
@@ -867,13 +902,18 @@ protected:
 
     objects->clear();
     for ( ; oi != ei && count < num; ++oi) {
-      objects->push_back(oi->get_oid());
+      Object obj(oi->get_oid(), oi->get_nspace());
+      objects->push_back(obj);
       ++count;
     }
 
     return true;
   }
 
+  void set_namespace( const std::string& ns) {
+    io_ctx.set_namespace(ns);
+  }
+
 public:
   RadosBencher(CephContext *cct_, librados::Rados& _r, librados::IoCtx& _i)
     : ObjBencher(cct_), completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false) {}
@@ -1141,7 +1181,10 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   string oloc, target_oloc, nspace, target_nspace;
   int concurrent_ios = 16;
   unsigned op_size = default_op_size;
+  bool block_size_specified = false;
   bool cleanup = true;
+  bool no_verify = false;
+  bool use_striper = false;
   const char *snapname = NULL;
   snap_t snapid = CEPH_NOSNAP;
   std::map<std::string, std::string>::const_iterator i;
@@ -1160,14 +1203,16 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   bool show_time = false;
   bool wildcard = false;
 
-  const char* run_name = NULL;
-  const char* prefix = NULL;
+  std::string run_name;
+  std::string prefix;
 
   Formatter *formatter = NULL;
   bool pretty_format = false;
+  const char *output = NULL;
 
   Rados rados;
   IoCtx io_ctx;
+  RadosStriper striper;
 
   i = opts.find("create");
   if (i != opts.end()) {
@@ -1201,17 +1246,18 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   }
   i = opts.find("run-name");
   if (i != opts.end()) {
-    run_name = i->second.c_str();
+    run_name = i->second;
   }
   i = opts.find("prefix");
   if (i != opts.end()) {
-    prefix = i->second.c_str();
+    prefix = i->second;
   }
   i = opts.find("block-size");
   if (i != opts.end()) {
     if (rados_sistrtoll(i, &op_size)) {
       return -EINVAL;
     }
+    block_size_specified = true;
   }
   i = opts.find("snap");
   if (i != opts.end()) {
@@ -1311,19 +1357,27 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   if (i != opts.end()) {
     nspace = i->second;
   }
+  i = opts.find("no-verify");
+  if (i != opts.end()) {
+    no_verify = true;
+  }
+  i = opts.find("output");
+  if (i != opts.end()) {
+    output = i->second.c_str();
+  }
 
 
   // open rados
   ret = rados.init_with_context(g_ceph_context);
   if (ret) {
-     cerr << "couldn't initialize rados! error " << ret << std::endl;
+     cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
      ret = -1;
      goto out;
   }
 
   ret = rados.connect();
   if (ret) {
-     cerr << "couldn't connect to cluster! error " << ret << std::endl;
+     cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
      ret = -1;
      goto out;
   }
@@ -1360,6 +1414,17 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       if (prev_op_size != default_op_size && prev_op_size != op_size)
 	cerr << "INFO: op_size has been rounded to " << op_size << std::endl;
     }
+
+    // create striper interface
+    if (opts.find("striper") != opts.end()) {
+      ret = RadosStriper::striper_create(io_ctx, &striper);
+      if (0 != ret) {
+	cerr << "error opening pool " << pool_name << " with striper interface: "
+	     << cpp_strerror(ret) << std::endl;
+	goto out;
+      }
+      use_striper = true;
+    }
   }
 
   // snapname?
@@ -1390,7 +1455,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       goto out;
     }
     io_ctx.snap_set_read(snapid);
-    cout << "selected snap " << snapid << " '" << snapname << "'" << std::endl;
+    cout << "selected snap " << snapid << " '" << name << "'" << std::endl;
   }
 
   assert(!nargs.empty());
@@ -1420,7 +1485,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       vec.push_back(pool_name);
     }
 
-    map<string,pool_stat_t> stats;
+    map<string,librados::pool_stat_t> stats;
     ret = rados.get_pool_stats(vec, stats);
     if (ret < 0) {
       cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl;
@@ -1438,14 +1503,14 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       formatter->open_object_section("stats");
       formatter->open_array_section("pools");
     }
-    for (map<string,pool_stat_t>::iterator i = stats.begin();
+    for (map<string,librados::pool_stat_t>::iterator i = stats.begin();
 	 i != stats.end();
 	 ++i) {
       const char *pool_name = i->first.c_str();
-      pool_stat_t& s = i->second;
+      librados::pool_stat_t& s = i->second;
       if (!formatter) {
 	printf("%-15s "
-	       "%12lld %12lld %12lld %12lld"
+	       "%12lld %12lld %12lld %12lld "
 	       "%12lld %12lld %12lld %12lld %12lld\n",
 	       pool_name,
 	       (long long)s.num_kb,
@@ -1525,18 +1590,34 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 	librados::NObjectIterator i = io_ctx.nobjects_begin();
 	librados::NObjectIterator i_end = io_ctx.nobjects_end();
 	for (; i != i_end; ++i) {
+	  if (use_striper) {
+	    // in case of --striper option, we only list striped
+	    // objects, so we only display the first object of
+	    // each, without its suffix '.000...000'
+	    size_t l = i->get_oid().length();
+	    if (l <= 17 ||
+		(0 != i->get_oid().compare(l-17, 17,".0000000000000000"))) continue;
+	  }
 	  if (!formatter) {
 	    // Only include namespace in output when wildcard specified
 	    if (wildcard)
 	      *outstream << i->get_nspace() << "\t";
-	    *outstream << i->get_oid();
+	    if (use_striper) {
+	      *outstream << i->get_oid().substr(0, i->get_oid().length()-17);
+	    } else {
+	      *outstream << i->get_oid();
+	    }
 	    if (i->get_locator().size())
 	      *outstream << "\t" << i->get_locator();
 	    *outstream << std::endl;
 	  } else {
 	    formatter->open_object_section("object");
 	    formatter->dump_string("namespace", i->get_nspace());
-	    formatter->dump_string("name", i->get_oid());
+	    if (use_striper) {
+	      formatter->dump_string("name", i->get_oid().substr(0, i->get_oid().length()-17));
+	    } else {
+	      formatter->dump_string("name", i->get_oid());
+	    }
 	    if (i->get_locator().size())
 	      formatter->dump_string("locator", i->get_locator());
 	    formatter->close_section(); //object
@@ -1598,7 +1679,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     string oid(nargs[1]);
     uint64_t size;
     time_t mtime;
-    ret = io_ctx.stat(oid, &size, &mtime);
+    if (use_striper) {
+      ret = striper.stat(oid, &size, &mtime);
+    } else {
+      ret = io_ctx.stat(oid, &size, &mtime);
+    }
     if (ret < 0) {
       cerr << " error stat-ing " << pool_name << "/" << oid << ": "
            << cpp_strerror(ret) << std::endl;
@@ -1612,7 +1697,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   else if (strcmp(nargs[0], "get") == 0) {
     if (!pool_name || nargs.size() < 3)
       usage_exit();
-    ret = do_get(io_ctx, nargs[1], nargs[2], op_size);
+    ret = do_get(io_ctx, striper, nargs[1], nargs[2], op_size, use_striper);
     if (ret < 0) {
       cerr << "error getting " << pool_name << "/" << nargs[1] << ": " << cpp_strerror(ret) << std::endl;
       goto out;
@@ -1621,7 +1706,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   else if (strcmp(nargs[0], "put") == 0) {
     if (!pool_name || nargs.size() < 3)
       usage_exit();
-    ret = do_put(io_ctx, nargs[1], nargs[2], op_size);
+    ret = do_put(io_ctx, striper, nargs[1], nargs[2], op_size, use_striper);
     if (ret < 0) {
       cerr << "error putting " << pool_name << "/" << nargs[1] << ": " << cpp_strerror(ret) << std::endl;
       goto out;
@@ -1643,7 +1728,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       cerr << "error, cannot truncate to negative value" << std::endl;
       usage_exit();
     }
-    ret = io_ctx.trunc(oid, size);
+    if (use_striper) {
+      ret = striper.trunc(oid, size);
+    } else {
+      ret = io_ctx.trunc(oid, size);
+    }
     if (ret < 0) {
       cerr << "error truncating oid "
 	   << oid << " to " << size << ": "
@@ -1670,7 +1759,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       } while (ret > 0);
     }
 
-    ret = io_ctx.setxattr(oid, attr_name.c_str(), bl);
+    if (use_striper) {
+      ret = striper.setxattr(oid, attr_name.c_str(), bl);
+    } else {
+      ret = io_ctx.setxattr(oid, attr_name.c_str(), bl);
+    }
     if (ret < 0) {
       cerr << "error setting xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
       goto out;
@@ -1686,7 +1779,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     string attr_name(nargs[2]);
 
     bufferlist bl;
-    ret = io_ctx.getxattr(oid, attr_name.c_str(), bl);
+    if (use_striper) {
+      ret = striper.getxattr(oid, attr_name.c_str(), bl);
+    } else {
+      ret = io_ctx.getxattr(oid, attr_name.c_str(), bl);
+    }
     if (ret < 0) {
       cerr << "error getting xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
       goto out;
@@ -1702,7 +1799,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     string oid(nargs[1]);
     string attr_name(nargs[2]);
 
-    ret = io_ctx.rmxattr(oid, attr_name.c_str());
+    if (use_striper) {
+      ret = striper.rmxattr(oid, attr_name.c_str());
+    } else {
+      ret = io_ctx.rmxattr(oid, attr_name.c_str());
+    }
     if (ret < 0) {
       cerr << "error removing xattr " << pool_name << "/" << oid << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
       goto out;
@@ -1714,7 +1815,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     string oid(nargs[1]);
     map<std::string, bufferlist> attrset;
     bufferlist bl;
-    ret = io_ctx.getxattrs(oid, attrset);
+    if (use_striper) {
+      ret = striper.getxattrs(oid, attrset);
+    } else {
+      ret = io_ctx.getxattrs(oid, attrset);
+    }
     if (ret < 0) {
       cerr << "error getting xattr set " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
       goto out;
@@ -1770,16 +1875,26 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       ret = 0;
     }
   } else if (strcmp(nargs[0], "setomapval") == 0) {
-    if (!pool_name || nargs.size() < 4)
+    if (!pool_name || nargs.size() < 3 || nargs.size() > 4)
       usage_exit();
 
     string oid(nargs[1]);
     string key(nargs[2]);
-    string val(nargs[3]);
 
-    map<string, bufferlist> values;
     bufferlist bl;
-    bl.append(val);
+    if (nargs.size() == 4) {
+      string val(nargs[3]);
+      bl.append(val);
+    } else {
+      do {
+	ret = bl.read_fd(STDIN_FILENO, 1024); // from stdin
+	if (ret < 0) {
+	  goto out;
+        }
+      } while (ret > 0);
+    }
+
+    map<string, bufferlist> values;
     values[key] = bl;
 
     ret = io_ctx.omap_set(oid, values);
@@ -1983,7 +2098,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     ++iter;
     for (; iter != nargs.end(); ++iter) {
       const string & oid = *iter;
-      ret = io_ctx.remove(oid);
+      if (use_striper) {
+	ret = striper.remove(oid);
+      } else {
+	ret = io_ctx.remove(oid);
+      }
       if (ret < 0) {
         string name = (nspace.size() ? nspace + "/" : "" ) + oid;
         cerr << "error removing " << pool_name << ">" << name << ": " << cpp_strerror(ret) << std::endl;
@@ -2164,7 +2283,33 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     if (ret >= 0) {
       cout << "successfully deleted pool " << nargs[1] << std::endl;
     } else { //error
-      cerr << "pool " << nargs[1] << " does not exist" << std::endl;
+      cerr << "pool " << nargs[1] << " could not be removed" << std::endl;
+    }
+  }
+  else if (strcmp(nargs[0], "purge") == 0) {
+    if (nargs.size() < 2)
+      usage_exit();
+    if (nargs.size() < 3 ||
+	strcmp(nargs[2], "--yes-i-really-really-mean-it") != 0) {
+      cerr << "WARNING:\n"
+	   << "  This will PERMANENTLY DESTROY all objects from a pool with no way back.\n"
+	   << "  To confirm, follow pool with --yes-i-really-really-mean-it" << std::endl;
+      ret = -1;
+      goto out;
+    }
+    ret = rados.ioctx_create(nargs[1], io_ctx);
+    if (ret < 0) {
+      cerr << "error pool " << nargs[1] << ": "
+	   << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
+    io_ctx.set_namespace(all_nspaces);
+    RadosBencher bencher(g_ceph_context, rados, io_ctx);
+    ret = bencher.clean_up_slow("", concurrent_ios);
+    if (ret >= 0) {
+      cout << "successfully purged pool " << nargs[1] << std::endl;
+    } else { //error
+      cerr << "pool " << nargs[1] << " could not be purged" << std::endl;
     }
   }
   else if (strcmp(nargs[0], "lssnap") == 0) {
@@ -2186,6 +2331,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       localtime_r(&t, &bdt);
       cout << *i << "\t" << s << "\t";
 
+      std::ios_base::fmtflags original_flags = cout.flags();
       cout.setf(std::ios::right);
       cout.fill('0');
       cout << std::setw(4) << (bdt.tm_year+1900)
@@ -2196,7 +2342,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 	   << ':' << std::setw(2) << bdt.tm_min
 	   << ':' << std::setw(2) << bdt.tm_sec
 	   << std::endl;
-      cout.unsetf(std::ios::right);
+      cout.flags(original_flags);
     }
     cout << snaps.size() << " snaps" << std::endl;
   }
@@ -2259,12 +2405,35 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       operation = OP_RAND_READ;
     else
       usage_exit();
+    if (block_size_specified && (operation != OP_WRITE)){
+      cerr << "-b|--block_size option can be used only with `write' bench test"
+           << std::endl;
+      ret = -EINVAL;
+      goto out;
+    }
+    if (!formatter && output) {
+      cerr << "-o|--output option can be used only with '--format' option"
+           << std::endl;
+      ret = -EINVAL;
+      goto out;
+    }
     RadosBencher bencher(g_ceph_context, rados, io_ctx);
     bencher.set_show_time(show_time);
-    ret = bencher.aio_bench(operation, seconds, num_objs,
-			    concurrent_ios, op_size, cleanup, run_name);
+    ostream *outstream = NULL;
+    if (formatter) {
+      bencher.set_formatter(formatter);
+      if (output)
+        outstream = new ofstream(output);
+      else
+        outstream = &cout;
+      bencher.set_outstream(*outstream);
+    }
+    ret = bencher.aio_bench(operation, seconds,
+			    concurrent_ios, op_size, cleanup, run_name, no_verify);
     if (ret != 0)
       cerr << "error during benchmark: " << ret << std::endl;
+    if (formatter && output)
+      delete outstream;
   }
   else if (strcmp(nargs[0], "cleanup") == 0) {
     if (!pool_name)
@@ -2596,6 +2765,76 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 	   << cpp_strerror(ret) << std::endl;
       goto out;
     }
+  } else if (strcmp(nargs[0], "export") == 0) {
+    // export [filename]
+    if (!pool_name || nargs.size() > 2) {
+      usage_exit();
+    }
+
+    int file_fd;
+    if (nargs.size() < 2 || std::string(nargs[1]) == "-") {
+      file_fd = STDOUT_FILENO;
+    } else {
+      file_fd = open(nargs[1], O_WRONLY|O_CREAT|O_TRUNC, 0666);
+      if (file_fd < 0) {
+        cerr << "Error opening '" << nargs[1] << "': "
+          << cpp_strerror(file_fd) << std::endl;
+        ret = file_fd;
+        goto out;
+      }
+    }
+
+    ret = PoolDump(file_fd).dump(&io_ctx);
+    if (ret < 0) {
+      cerr << "error from export: "
+	   << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
+  } else if (strcmp(nargs[0], "import") == 0) {
+    // import [--no-overwrite] [--dry-run] <filename | - >
+    if (!pool_name || nargs.size() > 4 || nargs.size() < 2) {
+      usage_exit();
+    }
+
+    // Last arg is the filename
+    std::string const filename = nargs[nargs.size() - 1];
+
+    // All other args may be flags
+    bool dry_run = false;
+    bool no_overwrite = false;
+    for (unsigned i = 1; i < nargs.size() - 1; ++i) {
+      std::string arg(nargs[i]);
+      
+      if (arg == std::string("--no-overwrite")) {
+        no_overwrite = true;
+      } else if (arg == std::string("--dry-run")) {
+        dry_run = true;
+      } else {
+        std::cerr << "Invalid argument '" << arg << "'" << std::endl;
+        ret = -EINVAL;
+        goto out;
+      }
+    }
+
+    int file_fd;
+    if (filename == "-") {
+      file_fd = STDIN_FILENO;
+    } else {
+      file_fd = open(filename.c_str(), O_RDONLY);
+      if (file_fd < 0) {
+        cerr << "Error opening '" << filename << "': "
+          << cpp_strerror(file_fd) << std::endl;
+        ret = file_fd;
+        goto out;
+      }
+    }
+
+    ret = RadosImport(file_fd, 0, dry_run).import(io_ctx, no_overwrite);
+    if (ret < 0) {
+      cerr << "error from import: "
+	   << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
   } else {
     cerr << "unrecognized command " << nargs[0] << "; -h or --help for usage" << std::endl;
     ret = -EINVAL;
@@ -2641,6 +2880,8 @@ int main(int argc, const char **argv)
       opts["show-time"] = "true";
     } else if (ceph_argparse_flag(args, i, "--no-cleanup", (char*)NULL)) {
       opts["no-cleanup"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--no-verify", (char*)NULL)) {
+      opts["no-verify"] = "true";
     } else if (ceph_argparse_witharg(args, i, &val, "--run-name", (char*)NULL)) {
       opts["run-name"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) {
@@ -2655,6 +2896,8 @@ int main(int argc, const char **argv)
       opts["target_locator"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--target-nspace" , (char *)NULL)) {
       opts["target_nspace"] = val;
+    } else if (ceph_argparse_flag(args, i, "--striper" , (char *)NULL)) {
+      opts["striper"] = "true";
     } else if (ceph_argparse_witharg(args, i, &val, "-t", "--concurrent-ios", (char*)NULL)) {
       opts["concurrent-ios"] = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--block-size", (char*)NULL)) {
@@ -2705,6 +2948,8 @@ int main(int argc, const char **argv)
       opts["all"] = "true";
     } else if (ceph_argparse_flag(args, i, "--default", (char*)NULL)) {
       opts["default"] = "true";
+    } else if (ceph_argparse_witharg(args, i, &val, "-o", "--output", (char*)NULL)) {
+      opts["output"] = val;
     } else {
       if (val[0] == '-')
         usage_exit();
@@ -2716,11 +2961,6 @@ int main(int argc, const char **argv)
     cerr << "rados: you must give an action. Try --help" << std::endl;
     return 1;
   }
-  if ((strcmp(args[0], "import") == 0) || (strcmp(args[0], "export") == 0)) {
-    cout << "The import and export operations are not available" << std::endl;
-    exit(1);
-    //return rados_tool_sync(opts, args);
-  } else {
-    return rados_tool_common(opts, args);
-  }
+
+  return rados_tool_common(opts, args);
 }
diff --git a/src/tools/rados/rados_export.cc b/src/tools/rados/rados_export.cc
deleted file mode 100644
index 0519fce..0000000
--- a/src/tools/rados/rados_export.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2011 New Dream Network
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-#include "include/int_types.h"
-
-#include "rados_sync.h"
-#include "common/errno.h"
-#include "common/strtol.h"
-#include "include/rados/librados.hpp"
-
-#include <dirent.h>
-#include <errno.h>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <stdlib.h>
-#include <string>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-#include "include/compat.h"
-#include "common/xattr.h"
-
-using namespace librados;
-
-class ExportLocalFileWQ : public RadosSyncWQ {
-public:
-  ExportLocalFileWQ(IoCtxDistributor *io_ctx_dist, time_t ti,
-		    ThreadPool *tp, ExportDir *export_dir, bool force)
-    : RadosSyncWQ(io_ctx_dist, ti, 0, tp),
-      m_export_dir(export_dir),
-      m_force(force)
-  {
-  }
-private:
-  void _process(std::string *s) {
-    IoCtx &io_ctx(m_io_ctx_dist->get_ioctx());
-    int flags = 0;
-    auto_ptr <BackedUpObject> sobj;
-    auto_ptr <BackedUpObject> dobj;
-    const std::string &rados_name(*s);
-    std::list < std::string > only_in_a;
-    std::list < std::string > only_in_b;
-    std::list < std::string > diff;
-    int ret = BackedUpObject::from_rados(io_ctx, rados_name.c_str(), sobj);
-    if (ret) {
-      cerr << ERR_PREFIX << "couldn't get '" << rados_name << "' from rados: error "
-	   << ret << std::endl;
-      _exit(ret);
-    }
-    std::string obj_path(sobj->get_fs_path(m_export_dir));
-    if (m_force) {
-      flags |= (CHANGED_CONTENTS | CHANGED_XATTRS);
-    }
-    else {
-      ret = BackedUpObject::from_path(obj_path.c_str(), dobj);
-      if (ret == ENOENT) {
-	sobj->get_xattrs(only_in_a);
-	flags |= CHANGED_CONTENTS;
-      }
-      else if (ret) {
-	cerr << ERR_PREFIX << "BackedUpObject::from_path returned "
-	     << ret << std::endl;
-	_exit(ret);
-      }
-      else {
-	sobj->xattr_diff(dobj.get(), only_in_a, only_in_b, diff);
-	if ((sobj->get_rados_size() == dobj->get_rados_size()) &&
-	    (sobj->get_mtime() == dobj->get_mtime())) {
-	  flags |= CHANGED_CONTENTS;
-	}
-      }
-    }
-    if (flags & CHANGED_CONTENTS) {
-      ret = sobj->download(io_ctx, obj_path.c_str());
-      if (ret) {
-	cerr << ERR_PREFIX << "download error: " << ret << std::endl;
-	_exit(ret);
-      }
-    }
-    diff.splice(diff.begin(), only_in_a);
-    for (std::list < std::string >::const_iterator x = diff.begin();
-	 x != diff.end(); ++x) {
-      flags |= CHANGED_XATTRS;
-      const Xattr *xattr = sobj->get_xattr(*x);
-      if (xattr == NULL) {
-	cerr << ERR_PREFIX << "internal error on line: " << __LINE__ << std::endl;
-	_exit(ret);
-      }
-      std::string xattr_fs_name(USER_XATTR_PREFIX);
-      xattr_fs_name += x->c_str();
-      ret = ceph_os_setxattr(obj_path.c_str(), xattr_fs_name.c_str(),
-		     xattr->data, xattr->len);
-      if (ret) {
-	ret = errno;
-	cerr << ERR_PREFIX << "setxattr error: " << cpp_strerror(ret) << std::endl;
-	_exit(ret);
-      }
-    }
-    for (std::list < std::string >::const_iterator x = only_in_b.begin();
-	 x != only_in_b.end(); ++x) {
-      flags |= CHANGED_XATTRS;
-      ret = ceph_os_removexattr(obj_path.c_str(), x->c_str());
-      if (ret) {
-	ret = errno;
-	cerr << ERR_PREFIX << "removexattr error: " << cpp_strerror(ret) << std::endl;
-	_exit(ret);
-      }
-    }
-    if (m_force) {
-      cout << "[force]        " << rados_name << std::endl;
-    }
-    else if (flags & CHANGED_CONTENTS) {
-      cout << "[exported]     " << rados_name << std::endl;
-    }
-    else if (flags & CHANGED_XATTRS) {
-      cout << "[xattr]        " << rados_name << std::endl;
-    }
-  }
-  ExportDir *m_export_dir;
-  bool m_force;
-};
-
-class ExportValidateExistingWQ : public RadosSyncWQ {
-public:
-  ExportValidateExistingWQ(IoCtxDistributor *io_ctx_dist, time_t ti,
-			   ThreadPool *tp, const char *dir_name)
-    : RadosSyncWQ(io_ctx_dist, ti, 0, tp),
-      m_dir_name(dir_name)
-  {
-  }
-private:
-  void _process(std::string *s) {
-    IoCtx &io_ctx(m_io_ctx_dist->get_ioctx());
-    auto_ptr <BackedUpObject> lobj;
-    const std::string &local_name(*s);
-    int ret = BackedUpObject::from_file(local_name.c_str(), m_dir_name, lobj);
-    if (ret) {
-      cout << ERR_PREFIX << "BackedUpObject::from_file: delete loop: "
-	   << "got error " << ret << std::endl;
-      _exit(ret);
-    }
-    auto_ptr <BackedUpObject> robj;
-    ret = BackedUpObject::from_rados(io_ctx, lobj->get_rados_name(), robj);
-    if (ret == -ENOENT) {
-      // The entry doesn't exist on the remote server; delete it locally
-      char path[strlen(m_dir_name) + local_name.size() + 2];
-      snprintf(path, sizeof(path), "%s/%s", m_dir_name, local_name.c_str());
-      if (unlink(path)) {
-	ret = errno;
-	cerr << ERR_PREFIX << "error unlinking '" << path << "': "
-	     << cpp_strerror(ret) << std::endl;
-	_exit(ret);
-      }
-      cout << "[deleted]      " << "removed '" << local_name << "'" << std::endl;
-    }
-    else if (ret) {
-      cerr << ERR_PREFIX << "BackedUpObject::from_rados: delete loop: "
-	   << "got error " << ret << std::endl;
-      _exit(ret);
-    }
-  }
-  const char *m_dir_name;
-};
-
-int do_rados_export(ThreadPool *tp, IoCtx& io_ctx,
-      IoCtxDistributor *io_ctx_dist, const char *dir_name,
-      bool create, bool force, bool delete_after)
-{
-  librados::NObjectIterator oi = io_ctx.nobjects_begin();
-  librados::NObjectIterator oi_end = io_ctx.nobjects_end();
-  auto_ptr <ExportDir> export_dir;
-  export_dir.reset(ExportDir::create_for_writing(dir_name, 1, create));
-  if (!export_dir.get())
-    return -EIO;
-  ExportLocalFileWQ export_object_wq(io_ctx_dist, time(NULL),
-				     tp, export_dir.get(), force);
-  for (; oi != oi_end; ++oi) {
-    export_object_wq.queue(new std::string((*oi).get_oid()));
-  }
-  export_object_wq.drain();
-
-  if (delete_after) {
-    ExportValidateExistingWQ export_val_wq(io_ctx_dist, time(NULL),
-					   tp, dir_name);
-    DirHolder dh;
-    int err = dh.opendir(dir_name);
-    if (err) {
-      cerr << ERR_PREFIX << "opendir(" << dir_name << ") error: "
-	   << cpp_strerror(err) << std::endl;
-      return err;
-    }
-    while (true) {
-      struct dirent *de = readdir(dh.dp);
-      if (!de)
-	break;
-      if ((strcmp(de->d_name, ".") == 0) || (strcmp(de->d_name, "..") == 0))
-	continue;
-      if (is_suffix(de->d_name, RADOS_SYNC_TMP_SUFFIX)) {
-	char path[strlen(dir_name) + strlen(de->d_name) + 2];
-	snprintf(path, sizeof(path), "%s/%s", dir_name, de->d_name);
-	if (unlink(path)) {
-	  int ret = errno;
-	  cerr << ERR_PREFIX << "error unlinking temporary file '" << path << "': "
-	       << cpp_strerror(ret) << std::endl;
-	  return ret;
-	}
-	cout << "[deleted]      " << "removed temporary file '" << de->d_name << "'" << std::endl;
-	continue;
-      }
-      export_val_wq.queue(new std::string(de->d_name));
-    }
-    export_val_wq.drain();
-  }
-  cout << "[done]" << std::endl;
-  return 0;
-}
diff --git a/src/tools/rados/rados_import.cc b/src/tools/rados/rados_import.cc
deleted file mode 100644
index 7c06b29..0000000
--- a/src/tools/rados/rados_import.cc
+++ /dev/null
@@ -1,239 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2011 New Dream Network
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-#include "include/int_types.h"
-
-#include <dirent.h>
-#include <errno.h>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-#include "rados_sync.h"
-#include "common/errno.h"
-#include "common/strtol.h"
-#include "include/rados/librados.hpp"
-
-using namespace librados;
-using std::auto_ptr;
-
-class ImportLocalFileWQ : public RadosSyncWQ {
-public:
-  ImportLocalFileWQ(const char *dir_name, bool force,
-		    IoCtxDistributor *io_ctx_dist, time_t ti, ThreadPool *tp)
-    : RadosSyncWQ(io_ctx_dist, ti, 0, tp),
-      m_dir_name(dir_name),
-      m_force(force)
-  {
-  }
-private:
-  void _process(std::string *s) {
-    IoCtx &io_ctx(m_io_ctx_dist->get_ioctx());
-    const std::string &local_name(*s);
-    auto_ptr <BackedUpObject> sobj;
-    auto_ptr <BackedUpObject> dobj;
-    std::list < std::string > only_in_a;
-    std::list < std::string > only_in_b;
-    std::list < std::string > diff;
-    int flags = 0;
-
-    int ret = BackedUpObject::from_file(local_name.c_str(),
-					m_dir_name.c_str(), sobj);
-    if (ret) {
-      cerr << ERR_PREFIX << "BackedUpObject::from_file: got error "
-	   << ret << std::endl;
-      _exit(ret);
-    }
-    const char *rados_name(sobj->get_rados_name());
-    if (m_force) {
-      flags |= (CHANGED_CONTENTS | CHANGED_XATTRS);
-    }
-    else {
-      ret = BackedUpObject::from_rados(io_ctx, rados_name, dobj);
-      if (ret == -ENOENT) {
-	flags |= CHANGED_CONTENTS;
-	sobj->get_xattrs(only_in_a);
-      }
-      else if (ret) {
-	cerr << ERR_PREFIX << "BackedUpObject::from_rados returned "
-	     << ret << std::endl;
-	_exit(ret);
-      }
-      else {
-	sobj->xattr_diff(dobj.get(), only_in_a, only_in_b, diff);
-	if ((sobj->get_rados_size() == dobj->get_rados_size()) &&
-	    (sobj->get_mtime() == dobj->get_mtime())) {
-	  flags |= CHANGED_CONTENTS;
-	}
-      }
-    }
-    if (flags & CHANGED_CONTENTS) {
-      ret = sobj->upload(io_ctx, local_name.c_str(), m_dir_name.c_str());
-      if (ret) {
-	cerr << ERR_PREFIX << "upload error: " << ret << std::endl;
-	_exit(ret);
-      }
-    }
-    for (std::list < std::string >::const_iterator x = only_in_a.begin();
-	 x != only_in_a.end(); ++x) {
-      flags |= CHANGED_XATTRS;
-      const Xattr *xattr = sobj->get_xattr(*x);
-      if (xattr == NULL) {
-	cerr << ERR_PREFIX << "internal error on line: " << __LINE__ << std::endl;
-	_exit(ret);
-      }
-      bufferlist bl;
-      bl.append(xattr->data, xattr->len);
-      ret = io_ctx.setxattr(rados_name, x->c_str(), bl);
-      if (ret < 0) {
-	ret = errno;
-	cerr << ERR_PREFIX << "io_ctx.setxattr(rados_name='" << rados_name
-	     << "', xattr_name='" << x->c_str() << "'): " << cpp_strerror(ret)
-	     << std::endl;
-	_exit(ret);
-      }
-    }
-    for (std::list < std::string >::const_iterator x = diff.begin();
-	 x != diff.end(); ++x) {
-      flags |= CHANGED_XATTRS;
-      const Xattr *xattr = sobj->get_xattr(*x);
-      if (xattr == NULL) {
-	cerr << ERR_PREFIX << "internal error on line: " << __LINE__ << std::endl;
-	_exit(ret);
-      }
-      bufferlist bl;
-      bl.append(xattr->data, xattr->len);
-      ret = io_ctx.rmxattr(rados_name, x->c_str());
-      if (ret < 0) {
-	cerr << ERR_PREFIX << "io_ctx.rmxattr error2: " << cpp_strerror(ret)
-	     << std::endl;
-	_exit(ret);
-      }
-      ret = io_ctx.setxattr(rados_name, x->c_str(), bl);
-      if (ret < 0) {
-	ret = errno;
-	cerr << ERR_PREFIX << "io_ctx.setxattr(rados_name='" << rados_name
-	     << "', xattr='" << x->c_str() << "'): " << cpp_strerror(ret) << std::endl;
-	_exit(ret);
-      }
-    }
-    for (std::list < std::string >::const_iterator x = only_in_b.begin();
-	 x != only_in_b.end(); ++x) {
-      flags |= CHANGED_XATTRS;
-      ret = io_ctx.rmxattr(rados_name, x->c_str());
-      if (ret < 0) {
-	ret = errno;
-	cerr << ERR_PREFIX << "rmxattr error3: " << cpp_strerror(ret) << std::endl;
-	_exit(ret);
-      }
-    }
-    if (m_force) {
-      cout << "[force]        " << rados_name << std::endl;
-    }
-    else if (flags & CHANGED_CONTENTS) {
-      cout << "[imported]     " << rados_name << std::endl;
-    }
-    else if (flags & CHANGED_XATTRS) {
-      cout << "[xattr]        " << rados_name << std::endl;
-    }
-  }
-  std::string m_dir_name;
-  bool m_force;
-};
-
-class ImportValidateExistingWQ : public RadosSyncWQ {
-public:
-  ImportValidateExistingWQ(ExportDir *export_dir,
-		 IoCtxDistributor *io_ctx_dist, time_t ti, ThreadPool *tp)
-    : RadosSyncWQ(io_ctx_dist, ti, 0, tp),
-      m_export_dir(export_dir)
-  {
-  }
-private:
-  void _process(std::string *s) {
-    IoCtx &io_ctx(m_io_ctx_dist->get_ioctx());
-    const std::string &rados_name(*s);
-    auto_ptr <BackedUpObject> robj;
-    int ret = BackedUpObject::from_rados(io_ctx, rados_name.c_str(), robj);
-    if (ret) {
-      cerr << ERR_PREFIX << "BackedUpObject::from_rados in delete loop "
-	   << "returned " << ret << std::endl;
-      _exit(ret);
-    }
-    std::string obj_path(robj->get_fs_path(m_export_dir));
-    auto_ptr <BackedUpObject> lobj;
-    ret = BackedUpObject::from_path(obj_path.c_str(), lobj);
-    if (ret == ENOENT) {
-      ret = io_ctx.remove(rados_name);
-      if (ret && ret != -ENOENT) {
-	cerr << ERR_PREFIX << "io_ctx.remove(" << obj_path << ") failed "
-	    << "with error " << ret << std::endl;
-	_exit(ret);
-      }
-      cout << "[deleted]      " << "removed '" << rados_name << "'" << std::endl;
-    }
-    else if (ret) {
-      cerr << ERR_PREFIX << "BackedUpObject::from_path in delete loop "
-	   << "returned " << ret << std::endl;
-      _exit(ret);
-    }
-  }
-  ExportDir *m_export_dir;
-};
-
-int do_rados_import(ThreadPool *tp, IoCtx &io_ctx, IoCtxDistributor* io_ctx_dist,
-	   const char *dir_name, bool force, bool delete_after)
-{
-  auto_ptr <ExportDir> export_dir;
-  export_dir.reset(ExportDir::from_file_system(dir_name));
-  if (!export_dir.get())
-    return -EIO;
-  DirHolder dh;
-  int ret = dh.opendir(dir_name);
-  if (ret) {
-    cerr << ERR_PREFIX << "opendir(" << dir_name << ") error: "
-	 << cpp_strerror(ret) << std::endl;
-    return ret;
-  }
-  ImportLocalFileWQ import_file_wq(dir_name, force,
-				   io_ctx_dist, time(NULL), tp);
-  while (true) {
-    struct dirent *de = readdir(dh.dp);
-    if (!de)
-      break;
-    if ((strcmp(de->d_name, ".") == 0) || (strcmp(de->d_name, "..") == 0))
-      continue;
-    if (is_suffix(de->d_name, RADOS_SYNC_TMP_SUFFIX))
-      continue;
-    import_file_wq.queue(new std::string(de->d_name));
-  }
-  import_file_wq.drain();
-
-  if (delete_after) {
-    ImportValidateExistingWQ import_val_wq(export_dir.get(), io_ctx_dist,
-					   time(NULL), tp);
-    librados::NObjectIterator oi = io_ctx.nobjects_begin();
-    librados::NObjectIterator oi_end = io_ctx.nobjects_end();
-    for (; oi != oi_end; ++oi) {
-      import_val_wq.queue(new std::string((*oi).get_oid()));
-    }
-    import_val_wq.drain();
-  }
-  cout << "[done]" << std::endl;
-  return 0;
-}
diff --git a/src/tools/rados/rados_sync.cc b/src/tools/rados/rados_sync.cc
deleted file mode 100644
index 4c2ef5e..0000000
--- a/src/tools/rados/rados_sync.cc
+++ /dev/null
@@ -1,903 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2011 New Dream Network
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-#include "include/int_types.h"
-
-#include "common/ceph_argparse.h"
-#include "common/config.h"
-#include "common/errno.h"
-#include "common/strtol.h"
-#include "global/global_context.h"
-#include "global/global_init.h"
-#include "include/rados/librados.hpp"
-#include "rados_sync.h"
-#include "include/compat.h"
-
-#include "common/xattr.h"
-
-#include <dirent.h>
-#include <errno.h>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <stdlib.h>
-#include <string>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-using namespace librados;
-using std::auto_ptr;
-
-static const char * const XATTR_RADOS_SYNC_VER = "user.rados_sync_ver";
-static const char * const XATTR_FULLNAME = "user.rados_full_name";
-const char USER_XATTR_PREFIX[] = "user.rados.";
-static const size_t USER_XATTR_PREFIX_LEN =
-  sizeof(USER_XATTR_PREFIX) / sizeof(USER_XATTR_PREFIX[0]) - 1;
-/* It's important that RADOS_SYNC_TMP_SUFFIX contain at least one character
- * that we wouldn't normally alllow in a file name-- in this case, $ */
-const char RADOS_SYNC_TMP_SUFFIX[] = "$tmp";
-static const size_t RADOS_SYNC_TMP_SUFFIX_LEN =
-  sizeof(RADOS_SYNC_TMP_SUFFIX) / sizeof(RADOS_SYNC_TMP_SUFFIX[0]) - 1;
-
-std::string get_user_xattr_name(const char *fs_xattr_name)
-{
-  if (strncmp(fs_xattr_name, USER_XATTR_PREFIX, USER_XATTR_PREFIX_LEN))
-    return "";
-  return fs_xattr_name + USER_XATTR_PREFIX_LEN;
-}
-
-bool is_suffix(const char *str, const char *suffix)
-{
-  size_t strlen_str = strlen(str);
-  size_t strlen_suffix = strlen(suffix);
-  if (strlen_str < strlen_suffix)
-    return false;
-  return (strcmp(str + (strlen_str - strlen_suffix), suffix) == 0);
-}
-
-ExportDir* ExportDir::create_for_writing(const std::string &path, int version,
-				 bool create)
-{
-  if (access(path.c_str(), R_OK | W_OK) == 0) {
-    return ExportDir::from_file_system(path);
-  }
-  if (!create) {
-    cerr << ERR_PREFIX << "ExportDir: directory '"
-	 << path << "' does not exist. Use --create to create it."
-	 << std::endl;
-    return NULL;
-  }
-  int ret = mkdir(path.c_str(), 0700);
-  if (ret < 0) {
-    int err = errno;
-    if (err != EEXIST) {
-      cerr << ERR_PREFIX << "ExportDir: mkdir error: "
-	   << cpp_strerror(err) << std::endl;
-      return NULL;
-    }
-  }
-  char buf[32];
-  snprintf(buf, sizeof(buf), "%d", version);
-  ret = ceph_os_setxattr(path.c_str(), XATTR_RADOS_SYNC_VER, buf, strlen(buf) + 1);
-  if (ret < 0) {
-    int err = errno;
-    cerr << ERR_PREFIX << "ExportDir: setxattr error :"
-	 << cpp_strerror(err) << std::endl;
-    return NULL;
-  }
-  return new ExportDir(path);
-}
-
-ExportDir* ExportDir::from_file_system(const std::string &path)
-{
-  if (access(path.c_str(), R_OK)) {
-      cerr << "ExportDir: source directory '" << path
-	   << "' appears to be inaccessible." << std::endl;
-      return NULL;
-  }
-  int ret;
-  char buf[32];
-  memset(buf, 0, sizeof(buf));
-  ret = ceph_os_getxattr(path.c_str(), XATTR_RADOS_SYNC_VER, buf, sizeof(buf) - 1);
-  if (ret < 0) {
-    ret = errno;
-    if (ret == ENODATA) {
-      cerr << ERR_PREFIX << "ExportDir: directory '" << path
-	   << "' does not appear to have been created by a rados "
-	   << "export operation." << std::endl;
-      return NULL;
-    }
-    cerr << ERR_PREFIX << "ExportDir: getxattr error :"
-	 << cpp_strerror(ret) << std::endl;
-    return NULL;
-  }
-  std::string err;
-  ret = strict_strtol(buf, 10, &err);
-  if (!err.empty()) {
-    cerr << ERR_PREFIX << "ExportDir: invalid value for "
-	 << XATTR_RADOS_SYNC_VER << ": " << buf << ". parse error: "
-	 << err << std::endl;
-    return NULL;
-  }
-  if (ret != 1) {
-    cerr << ERR_PREFIX << "ExportDir: can't handle any naming "
-	 << "convention besides version 1. You must upgrade this program to "
-	 << "handle the data in the new format." << std::endl;
-    return NULL;
-  }
-  return new ExportDir(path);
-}
-
-std::string ExportDir::get_fs_path(const std::string &rados_name) const
-{
-  static int HASH_LENGTH = 17;
-  size_t i;
-  size_t strlen_rados_name = strlen(rados_name.c_str());
-  size_t sz;
-  bool need_hash = false;
-  if (strlen_rados_name > 200) {
-    sz = 200;
-    need_hash = true;
-  }
-  else {
-    sz = strlen_rados_name;
-  }
-  char fs_path[sz + HASH_LENGTH + 1];
-  for (i = 0; i < sz; ++i) {
-    // Just replace anything that looks funny with an 'at' sign.
-    // Unicode also gets turned into 'at' signs.
-    signed char c = rados_name[i];
-    if (c < 0x20) {
-     // Since c is signed, this also eliminates bytes with the high bit set
-      c = '@';
-      need_hash = true;
-    }
-    else if (c == 0x7f) {
-      c = '@';
-      need_hash = true;
-    }
-    else if (c == '/') {
-      c = '@';
-      need_hash = true;
-    }
-    else if (c == '\\') {
-      c = '@';
-      need_hash = true;
-    }
-    else if (c == '$') {
-      c = '@';
-      need_hash = true;
-    }
-    else if (c == ' ') {
-      c = '_';
-      need_hash = true;
-    }
-    fs_path[i] = c;
-  }
-
-  if (need_hash) {
-    uint64_t hash = 17;
-    for (i = 0; i < strlen_rados_name; ++i) {
-      hash += (rados_name[i] * 33);
-    }
-    // The extra byte of length is because snprintf always NULL-terminates.
-    snprintf(fs_path + i, HASH_LENGTH + 1, "_%016" PRIx64, hash);
-  }
-  else {
-    // NULL-terminate.
-    fs_path[i] = '\0';
-  }
-
-  ostringstream oss;
-  oss << path << "/" << fs_path;
-  return oss.str();
-}
-
-ExportDir::ExportDir(const std::string &path_)
-   : path(path_)
-{
-}
-
-DirHolder::DirHolder()
-  : dp(NULL)
-{
-}
-
-DirHolder::~DirHolder() {
-  if (!dp)
-    return;
-  if (closedir(dp)) {
-    int err = errno;
-    cerr << ERR_PREFIX << "closedir failed: " << cpp_strerror(err) << std::endl;
-  }
-  dp = NULL;
-}
-
-int DirHolder::opendir(const char *dir_name) {
-  dp = ::opendir(dir_name);
-  if (!dp) {
-    int err = errno;
-    return err;
-  }
-  return 0;
-}
-
-static __thread int t_iod_idx = -1;
-
-static pthread_mutex_t io_ctx_distributor_lock = PTHREAD_MUTEX_INITIALIZER;
-
-IoCtxDistributor* IoCtxDistributor::instance() {
-  IoCtxDistributor *ret;
-  pthread_mutex_lock(&io_ctx_distributor_lock);
-  if (s_instance == NULL) {
-    s_instance = new IoCtxDistributor();
-  }
-  ret = s_instance;
-  pthread_mutex_unlock(&io_ctx_distributor_lock);
-  return ret;
-}
-
-int IoCtxDistributor::init(Rados &cluster, const char *pool_name,
-			   int num_ioctxes) {
-  m_io_ctxes.resize(num_ioctxes);
-  for (std::vector<IoCtx>::iterator i = m_io_ctxes.begin();
-	 i != m_io_ctxes.end(); ++i) {
-    IoCtx &io_ctx(*i);
-    int ret = cluster.ioctx_create(pool_name, io_ctx);
-    if (ret) {
-      return ret;
-    }
-  }
-  m_highest_iod_idx.set(0);
-  return 0;
-}
-
-void IoCtxDistributor::clear() {
-  for (std::vector<IoCtx>::iterator i = m_io_ctxes.begin();
-	 i != m_io_ctxes.end(); ++i) {
-    IoCtx &io_ctx(*i);
-    io_ctx.close();
-  }
-  m_io_ctxes.clear();
-  m_highest_iod_idx.set(0);
-}
-
-IoCtx& IoCtxDistributor::get_ioctx() {
-  if (t_iod_idx == -1) {
-    t_iod_idx = m_highest_iod_idx.inc() - 1;
-  }
-  if (m_io_ctxes.size() <= (unsigned int)t_iod_idx) {
-    cerr << ERR_PREFIX << "IoCtxDistributor: logic error on line "
-	 << __LINE__ << std::endl;
-    _exit(1);
-  }
-  return m_io_ctxes[t_iod_idx];
-}
-
-IoCtxDistributor *IoCtxDistributor::s_instance = NULL;
-
-IoCtxDistributor::IoCtxDistributor() {
-  clear();
-}
-
-IoCtxDistributor::~IoCtxDistributor() {
-  clear();
-}
-
-RadosSyncWQ::RadosSyncWQ(IoCtxDistributor *io_ctx_dist, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
-  : ThreadPool::WorkQueue<std::string>("FileStore::OpWQ", timeout, suicide_timeout, tp),
-    m_io_ctx_dist(io_ctx_dist)
-{
-}
-
-bool RadosSyncWQ::_enqueue(std::string *s) {
-  m_items.push_back(s);
-  return true;
-}
-
-void RadosSyncWQ::_dequeue(std::string *o) {
-  assert(0);
-}
-
-bool RadosSyncWQ::_empty() {
-  return m_items.empty();
-}
-
-std::string *RadosSyncWQ::_dequeue() {
-  if (m_items.empty())
-    return NULL;
-  std::string *ret = m_items.front();
-  m_items.pop_front();
-  return ret;
-}
-
-void RadosSyncWQ::_process_finish(std::string *s) {
-  delete s;
-}
-
-void RadosSyncWQ::_clear() {
-  for (std::deque<std::string*>::iterator i = m_items.begin();
-	 i != m_items.end(); ++i) {
-    delete *i;
-  }
-  m_items.clear();
-}
-
-Xattr::Xattr(char *data_, ssize_t len_)
-    : data(data_), len(len_)
-{
-}
-
-Xattr::~Xattr() {
-  free(data);
-}
-
-bool Xattr::operator==(const class Xattr &rhs) const {
-  if (len != rhs.len)
-    return false;
-  return (memcmp(data, rhs.data, len) == 0);
-}
-
-bool Xattr::operator!=(const class Xattr &rhs) const {
-  return !((*this) == rhs);
-}
-
-int BackedUpObject::from_file(const char *file_name, const char *dir_name,
-			  std::auto_ptr<BackedUpObject> &obj)
-{
-  char obj_path[strlen(dir_name) + strlen(file_name) + 2];
-  snprintf(obj_path, sizeof(obj_path), "%s/%s", dir_name, file_name);
-  return BackedUpObject::from_path(obj_path, obj);
-}
-
-int BackedUpObject::from_path(const char *path, std::auto_ptr<BackedUpObject> &obj)
-{
-  int ret;
-  FILE *fp = fopen(path, "r");
-  if (!fp) {
-    ret = errno;
-    if (ret != ENOENT) {
-      cerr << ERR_PREFIX << "BackedUpObject::from_path: error while trying to "
-	   << "open '" << path << "': " <<  cpp_strerror(ret) << std::endl;
-    }
-    return ret;
-  }
-  int fd = fileno(fp);
-  struct stat st_buf;
-  memset(&st_buf, 0, sizeof(st_buf));
-  ret = fstat(fd, &st_buf);
-  if (ret) {
-    ret = errno;
-    fclose(fp);
-    cerr << ERR_PREFIX << "BackedUpObject::from_path: error while trying "
-	 << "to stat '" << path << "': " <<  cpp_strerror(ret) << std::endl;
-    return ret;
-  }
-
-  // get fullname
-  ssize_t res = ceph_os_fgetxattr(fd, XATTR_FULLNAME, NULL, 0);
-  if (res <= 0) {
-    fclose(fp);
-    ret = errno;
-    if (res == 0) {
-      cerr << ERR_PREFIX << "BackedUpObject::from_path: found empty "
-	   << XATTR_FULLNAME << " attribute on '" << path
-	   << "'" << std::endl;
-      ret = ENODATA;
-    } else if (ret == ENODATA) {
-      cerr << ERR_PREFIX << "BackedUpObject::from_path: there was no "
-	   << XATTR_FULLNAME << " attribute found on '" << path
-	   << "'" << std::endl;
-    } else {
-      cerr << ERR_PREFIX << "getxattr error: " << cpp_strerror(ret) << std::endl;
-    }
-    return ret;
-  }
-  char rados_name_[res + 1];
-  memset(rados_name_, 0, sizeof(rados_name_));
-  res = ceph_os_fgetxattr(fd, XATTR_FULLNAME, rados_name_, res);
-  if (res < 0) {
-    ret = errno;
-    fclose(fp);
-    cerr << ERR_PREFIX << "BackedUpObject::getxattr(" << XATTR_FULLNAME
-	 << ") error: " << cpp_strerror(ret) << std::endl;
-    return ret;
-  }
-
-  BackedUpObject *o = new BackedUpObject(rados_name_,
-			     st_buf.st_size, st_buf.st_mtime);
-  if (!o) {
-    fclose(fp);
-    return ENOBUFS;
-  }
-  ret = o->read_xattrs_from_file(fileno(fp));
-  if (ret) {
-    fclose(fp);
-    cerr << ERR_PREFIX << "BackedUpObject::from_path(path = '"
-	 << path << "): read_xattrs_from_file returned " << ret << std::endl;
-    delete o;
-    return ret;
-  }
-
-  fclose(fp);
-  obj.reset(o);
-  return 0;
-}
-
-int BackedUpObject::from_rados(IoCtx& io_ctx, const char *rados_name_,
-		      auto_ptr<BackedUpObject> &obj)
-{
-  uint64_t rados_size_ = 0;
-  time_t rados_time_ = 0;
-  int ret = io_ctx.stat(rados_name_, &rados_size_, &rados_time_);
-  if (ret == -ENOENT) {
-    // don't complain here about ENOENT
-    return ret;
-  } else if (ret < 0) {
-    cerr << ERR_PREFIX << "BackedUpObject::from_rados(rados_name_ = '"
-	 << rados_name_ << "'): stat failed with error " << ret << std::endl;
-    return ret;
-  }
-  BackedUpObject *o = new BackedUpObject(rados_name_, rados_size_, rados_time_);
-  ret = o->read_xattrs_from_rados(io_ctx);
-  if (ret) {
-    cerr << ERR_PREFIX << "BackedUpObject::from_rados(rados_name_ = '"
-	  << rados_name_ << "'): read_xattrs_from_rados returned "
-	  << ret << std::endl;
-    delete o;
-    return ret;
-  }
-  obj.reset(o);
-  return 0;
-}
-
-BackedUpObject::~BackedUpObject()
-{
-  for (std::map < std::string, Xattr* >::iterator x = xattrs.begin();
-	 x != xattrs.end(); ++x)
-  {
-    delete x->second;
-    x->second = NULL;
-  }
-  free(rados_name);
-}
-
-std::string BackedUpObject::get_fs_path(const ExportDir *export_dir) const
-{
-  return export_dir->get_fs_path(rados_name);
-}
-
-std::string BackedUpObject::xattrs_to_str() const
-{
-  ostringstream oss;
-  std::string prefix;
-  for (std::map < std::string, Xattr* >::const_iterator x = xattrs.begin();
-	 x != xattrs.end(); ++x)
-  {
-    char buf[x->second->len + 1];
-    memcpy(buf, x->second->data, x->second->len);
-    buf[x->second->len] = '\0';
-    oss << prefix << "{" << x->first << ":" << buf << "}";
-    prefix = ", ";
-  }
-  return oss.str();
-}
-
-void BackedUpObject::xattr_diff(const BackedUpObject *rhs,
-		std::list < std::string > &only_in_a,
-		std::list < std::string > &only_in_b,
-		std::list < std::string > &diff) const
-{
-  only_in_a.clear();
-  only_in_b.clear();
-  diff.clear();
-
-  for (std::map < std::string, Xattr* >::const_iterator x = xattrs.begin();
-	 x != xattrs.end(); ++x)
-  {
-    std::map < std::string, Xattr* >::const_iterator r = rhs->xattrs.find(x->first);
-    if (r == rhs->xattrs.end()) {
-      only_in_a.push_back(x->first);
-    }
-    else {
-      const Xattr &r_obj(*r->second);
-      const Xattr &x_obj(*x->second);
-      if (r_obj != x_obj)
-	diff.push_back(x->first);
-    }
-  }
-
-  for (std::map < std::string, Xattr* >::const_iterator r = rhs->xattrs.begin();
-	 r != rhs->xattrs.end(); ++r)
-  {
-    std::map < std::string, Xattr* >::const_iterator x = xattrs.find(r->first);
-    if (x == xattrs.end()) {
-      only_in_b.push_back(r->first);
-    }
-  }
-}
-
-void BackedUpObject::get_xattrs(std::list < std::string > &xattrs_) const
-{
-  for (std::map < std::string, Xattr* >::const_iterator r = xattrs.begin();
-	 r != xattrs.end(); ++r)
-  {
-    xattrs_.push_back(r->first);
-  }
-}
-
-const Xattr* BackedUpObject::get_xattr(const std::string &name) const
-{
-  std::map < std::string, Xattr* >::const_iterator x = xattrs.find(name);
-  if (x == xattrs.end())
-    return NULL;
-  else
-    return x->second;
-}
-
-const char *BackedUpObject::get_rados_name() const {
-  return rados_name;
-}
-
-uint64_t BackedUpObject::get_rados_size() const {
-  return rados_size;
-}
-
-time_t BackedUpObject::get_mtime() const {
-  return rados_time;
-}
-
-int BackedUpObject::download(IoCtx &io_ctx, const char *path)
-{
-  char tmp_path[strlen(path) + RADOS_SYNC_TMP_SUFFIX_LEN + 1];
-  snprintf(tmp_path, sizeof(tmp_path), "%s%s", path, RADOS_SYNC_TMP_SUFFIX);
-  FILE *fp = fopen(tmp_path, "w");
-  if (!fp) {
-    int err = errno;
-    cerr << ERR_PREFIX << "download: error opening '" << tmp_path << "':"
-	 <<  cpp_strerror(err) << std::endl;
-    return err;
-  }
-  int fd = fileno(fp);
-  uint64_t off = 0;
-  static const int CHUNK_SZ = 32765;
-  while (true) {
-    bufferlist bl;
-    int rlen = io_ctx.read(rados_name, bl, CHUNK_SZ, off);
-    if (rlen < 0) {
-      cerr << ERR_PREFIX << "download: io_ctx.read(" << rados_name << ") returned "
-	   << rlen << std::endl;
-      fclose(fp);
-      return rlen;
-    }
-    if (rlen < CHUNK_SZ)
-      off = 0;
-    else
-      off += rlen;
-    size_t flen = fwrite(bl.c_str(), 1, rlen, fp);
-    if (flen != (size_t)rlen) {
-      int err = errno;
-      cerr << ERR_PREFIX << "download: fwrite(" << tmp_path << ") error: "
-	   << cpp_strerror(err) << std::endl;
-      fclose(fp);
-      return err;
-    }
-    if (off == 0)
-      break;
-  }
-  size_t attr_sz = strlen(rados_name) + 1;
-  int res = ceph_os_fsetxattr(fd, XATTR_FULLNAME, rados_name, attr_sz);
-  if (res) {
-    int err = errno;
-    cerr << ERR_PREFIX << "download: fsetxattr(" << tmp_path << ") error: "
-	 << cpp_strerror(err) << std::endl;
-    fclose(fp);
-    return err;
-  }
-  if (fclose(fp)) {
-    int err = errno;
-    cerr << ERR_PREFIX << "download: fclose(" << tmp_path << ") error: "
-	 << cpp_strerror(err) << std::endl;
-    return err;
-  }
-  if (rename(tmp_path, path)) {
-    int err = errno;
-    cerr << ERR_PREFIX << "download: rename(" << tmp_path << ", "
-	 << path << ") error: " << cpp_strerror(err) << std::endl;
-    return err;
-  }
-  return 0;
-}
-
-int BackedUpObject::upload(IoCtx &io_ctx, const char *file_name, const char *dir_name)
-{
-  char path[strlen(file_name) + strlen(dir_name) + 2];
-  snprintf(path, sizeof(path), "%s/%s", dir_name, file_name);
-  FILE *fp = fopen(path, "r");
-  if (!fp) {
-    int err = errno;
-    cerr << ERR_PREFIX << "upload: error opening '" << path << "': "
-	 << cpp_strerror(err) << std::endl;
-    return err;
-  }
-  // Need to truncate RADOS object to size 0, in case there is
-  // already something there.
-  int ret = io_ctx.trunc(rados_name, 0);
-  if (ret) {
-    cerr << ERR_PREFIX << "upload: trunc failed with error " << ret << std::endl;
-    fclose(fp);
-    return ret;
-  }
-  uint64_t off = 0;
-  static const int CHUNK_SZ = 32765;
-  while (true) {
-    char buf[CHUNK_SZ];
-    int flen = fread(buf, 1, CHUNK_SZ, fp);
-    if (flen < 0) {
-      int err = errno;
-      cerr << ERR_PREFIX << "upload: fread(" << file_name << ") error: "
-	   << cpp_strerror(err) << std::endl;
-      fclose(fp);
-      return err;
-    }
-    if ((flen == 0) && (off != 0)) {
-      fclose(fp);
-      break;
-    }
-    // There must be a zero-copy way to do this?
-    bufferlist bl;
-    bl.append(buf, flen);
-    int rlen = io_ctx.write(rados_name, bl, flen, off);
-    if (rlen < 0) {
-      fclose(fp);
-      cerr << ERR_PREFIX << "upload: rados_write error: " << rlen << std::endl;
-      return rlen;
-    }
-    if (rlen != flen) {
-      fclose(fp);
-      cerr << ERR_PREFIX << "upload: rados_write error: short write" << std::endl;
-      return -EIO;
-    }
-    off += rlen;
-    if (flen < CHUNK_SZ) {
-      fclose(fp);
-      return 0;
-    }
-  }
-  return 0;
-}
-
-BackedUpObject::BackedUpObject(const char *rados_name_,
-			       uint64_t rados_size_, time_t rados_time_)
-  : rados_name(strdup(rados_name_)),
-    rados_size(rados_size_),
-    rados_time(rados_time_)
-{
-}
-
-int BackedUpObject::read_xattrs_from_file(int fd)
-{
-  ssize_t blen = ceph_os_flistxattr(fd, NULL, 0);
-  if (blen > 0x1000000) {
-    cerr << ERR_PREFIX << "BackedUpObject::read_xattrs_from_file: unwilling "
-	 << "to allocate a buffer of size " << blen << " on the stack for "
-	 << "flistxattr." << std::endl;
-    return ENOBUFS;
-  }
-  char buf[blen + 1];
-  memset(buf, 0, sizeof(buf));
-  ssize_t blen2 = ceph_os_flistxattr(fd, buf, blen);
-  if (blen != blen2) {
-    cerr << ERR_PREFIX << "BackedUpObject::read_xattrs_from_file: xattrs changed while "
-	 << "we were trying to "
-	 << "list them? First length was " << blen << ", but now it's " << blen2
-	 << std::endl;
-    return EDOM;
-  }
-  const char *b = buf;
-  while (*b) {
-    size_t bs = strlen(b);
-    std::string xattr_name = get_user_xattr_name(b);
-    if (!xattr_name.empty()) {
-      ssize_t attr_len = ceph_os_fgetxattr(fd, b, NULL, 0);
-      if (attr_len < 0) {
-	int err = errno;
-	cerr << ERR_PREFIX << "BackedUpObject::read_xattrs_from_file: "
-	     << "fgetxattr(rados_name = '" << rados_name << "', xattr_name='"
-	     << xattr_name << "') failed: " << cpp_strerror(err) << std::endl;
-	return EDOM;
-      }
-      char *attr = (char*)malloc(attr_len);
-      if (!attr) {
-	cerr << ERR_PREFIX << "BackedUpObject::read_xattrs_from_file: "
-	     << "malloc(" << attr_len << ") failed for xattr_name='"
-	     << xattr_name << "'" << std::endl;
-	return ENOBUFS;
-      }
-      ssize_t attr_len2 = ceph_os_fgetxattr(fd, b, attr, attr_len);
-      if (attr_len2 < 0) {
-	int err = errno;
-	cerr << ERR_PREFIX << "BackedUpObject::read_xattrs_from_file: "
-	     << "fgetxattr(rados_name = '" << rados_name << "', "
-	     << "xattr_name='" << xattr_name << "') failed: "
-	     << cpp_strerror(err) << std::endl;
-	free(attr);
-	return EDOM;
-      }
-      if (attr_len2 != attr_len) {
-	cerr << ERR_PREFIX << "BackedUpObject::read_xattrs_from_file: xattr "
-	     << "changed while we were trying to get it? "
-	     << "fgetxattr(rados_name = '"<< rados_name
-	     << "', xattr_name='" << xattr_name << "') returned a different length "
-	     << "than when we first called it! old_len = " << attr_len
-	     << "new_len = " << attr_len2 << std::endl;
-	free(attr);
-	return EDOM;
-      }
-      xattrs[xattr_name] = new Xattr(attr, attr_len);
-    }
-    b += (bs + 1);
-  }
-  return 0;
-}
-
-int BackedUpObject::read_xattrs_from_rados(IoCtx &io_ctx)
-{
-  map<std::string, bufferlist> attrset;
-  int ret = io_ctx.getxattrs(rados_name, attrset);
-  if (ret) {
-    cerr << ERR_PREFIX << "BackedUpObject::read_xattrs_from_rados: "
-	 << "getxattrs failed with error code " << ret << std::endl;
-    return ret;
-  }
-  for (map<std::string, bufferlist>::iterator i = attrset.begin();
-       i != attrset.end(); )
-  {
-    bufferlist& bl(i->second);
-    char *data = (char*)malloc(bl.length());
-    if (!data)
-      return ENOBUFS;
-    memcpy(data, bl.c_str(), bl.length());
-    Xattr *xattr = new Xattr(data, bl.length());
-    if (!xattr) {
-      free(data);
-      return ENOBUFS;
-    }
-    xattrs[i->first] = xattr;
-    attrset.erase(i++);
-  }
-  return 0;
-}
-
-int rados_tool_sync(const std::map < std::string, std::string > &opts,
-                             std::vector<const char*> &args)
-{
-  int ret;
-  bool force = opts.count("force");
-  bool delete_after = opts.count("delete-after");
-  bool create = opts.count("create");
-
-  std::map < std::string, std::string >::const_iterator n = opts.find("workers");
-  int num_threads;
-  if (n == opts.end()) {
-    num_threads = DEFAULT_NUM_RADOS_WORKER_THREADS;
-  }
-  else {
-    std::string err;
-    num_threads = strict_strtol(n->second.c_str(), 10, &err);
-    if (!err.empty()) {
-      cerr << "rados: can't parse number of worker threads given: "
-	   << err << std::endl;
-      return 1;
-    }
-    if ((num_threads < 1) || (num_threads > 9000)) {
-      cerr << "rados: unreasonable value given for num_threads: "
-	   << num_threads << std::endl;
-      return 1;
-    }
-  }
-
-
-  std::string action, src, dst;
-  std::vector<const char*>::iterator i = args.begin();
-  if ((i != args.end()) &&
-      ((strcmp(*i, "import") == 0) || (strcmp(*i, "export") == 0))) {
-    action = *i;
-    ++i;
-  }
-  else {
-    cerr << "rados" << ": You must specify either 'import' or 'export'.\n";
-    cerr << "Use --help to show help.\n";
-    exit(1);
-  }
-  if (i != args.end()) {
-    src = *i;
-    ++i;
-  }
-  else {
-    cerr << "rados" << ": You must give a source.\n";
-    cerr << "Use --help to show help.\n";
-    exit(1);
-  }
-  if (i != args.end()) {
-    dst = *i;
-    ++i;
-  }
-  else {
-    cerr << "rados" << ": You must give a destination.\n";
-    cerr << "Use --help to show help.\n";
-    exit(1);
-  }
-
-  // open rados
-  Rados rados;
-  if (rados.init_with_context(g_ceph_context) < 0) {
-     cerr << "rados" << ": failed to initialize Rados!" << std::endl;
-     exit(1);
-  }
-  if (rados.connect() < 0) {
-     cerr << "rados" << ": failed to connect to Rados cluster!" << std::endl;
-     exit(1);
-  }
-  IoCtx io_ctx;
-  std::string pool_name = (action == "import") ? dst : src;
-  ret = rados.ioctx_create(pool_name.c_str(), io_ctx);
-  if ((ret == -ENOENT) && (action == "import")) {
-    if (create) {
-      ret = rados.pool_create(pool_name.c_str());
-      if (ret) {
-	cerr << "rados" << ": pool_create failed with error " << ret
-	     << std::endl;
-	exit(ret);
-      }
-      ret = rados.ioctx_create(pool_name.c_str(), io_ctx);
-    }
-    else {
-      cerr << "rados" << ": pool '" << pool_name << "' does not exist. Use "
-	   << "--create to try to create it." << std::endl;
-      exit(ENOENT);
-    }
-  }
-  if (ret < 0) {
-    cerr << "rados" << ": error opening pool " << pool_name << ": "
-	 << cpp_strerror(ret) << std::endl;
-    exit(ret);
-  }
-
-  IoCtxDistributor *io_ctx_dist = IoCtxDistributor::instance();
-  ret = io_ctx_dist->init(rados, pool_name.c_str(), num_threads);
-  if (ret) {
-    cerr << ERR_PREFIX << "failed to initialize Rados io contexts."
-	 << std::endl;
-    _exit(ret);
-  }
-
-  ThreadPool thread_pool(g_ceph_context, "rados_sync_threadpool", num_threads);
-  thread_pool.start();
-
-  if (action == "import") {
-    ret = do_rados_import(&thread_pool, io_ctx, io_ctx_dist, src.c_str(),
-		     force, delete_after);
-    thread_pool.stop();
-    return ret;
-  }
-  else {
-    ret = do_rados_export(&thread_pool, io_ctx, io_ctx_dist, dst.c_str(),
-		     create, force, delete_after);
-    thread_pool.stop();
-    return ret;
-  }
-}
diff --git a/src/tools/rados/rados_sync.h b/src/tools/rados/rados_sync.h
deleted file mode 100644
index d762450..0000000
--- a/src/tools/rados/rados_sync.h
+++ /dev/null
@@ -1,216 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
- * Foundation.  See file COPYING.
- * 
- */
-
-#ifndef CEPH_RADOS_SYNC_H
-#define CEPH_RADOS_SYNC_H
-
-#include <stddef.h>
-#include "include/atomic.h"
-#include "common/WorkQueue.h"
-
-#include <string>
-#include <sys/types.h>
-
-namespace librados {
-  class IoCtx;
-  class Rados;
-}
-
-extern const char USER_XATTR_PREFIX[];
-extern const char RADOS_SYNC_TMP_SUFFIX[];
-#define ERR_PREFIX "[ERROR]        "
-#define DEFAULT_NUM_RADOS_WORKER_THREADS 5
-
-/* Linux seems to use ENODATA instead of ENOATTR when an extended attribute
- * is missing */
-#ifndef ENOATTR
-#define ENOATTR ENODATA
-#endif
-
-enum {
-  CHANGED_XATTRS = 0x1,
-  CHANGED_CONTENTS = 0x2,
-};
-
-/** Given the name of an extended attribute from a file in the filesystem,
- * returns an empty string if the extended attribute does not represent a rados
- * user extended attribute. Otherwise, returns the name of the rados extended
- * attribute.
- *
- * Rados user xattrs are prefixed with USER_XATTR_PREFIX.
- */
-std::string get_user_xattr_name(const char *fs_xattr_name);
-
-/* Returns true if 'suffix' is a suffix of str */
-bool is_suffix(const char *str, const char *suffix);
-
-/** Represents a directory in the filesystem that we export rados objects to (or
- * import them from.)
- */
-class ExportDir
-{
-public:
-  static ExportDir* create_for_writing(const std::string &path, int version,
-					  bool create);
-  static ExportDir* from_file_system(const std::string &path);
-
-  /* Given a rados object name, return something which looks kind of like the
-   * first part of the name.
-   *
-   * The actual file name that the backed-up object is stored in is irrelevant
-   * to rados_sync. The only reason to make it human-readable at all is to make
-   * things easier on sysadmins.  The XATTR_FULLNAME extended attribute has the
-   * real, full object name.
-  *
-   * This function turns unicode into a bunch of 'at' signs. This could be
-   * fixed. If you try, be sure to handle all the multibyte characters
-   * correctly.
-   * I guess a better hash would be nice too.
-   */
-  std::string get_fs_path(const std::string &rados_name) const;
-
-private:
-  explicit ExportDir(const std::string &path_);
-
-  std::string path;
-};
-
-/** Smart pointer wrapper for a DIR*
- */
-class DirHolder {
-public:
-  DirHolder();
-  ~DirHolder();
-  int opendir(const char *dir_name);
-  DIR *dp;
-};
-
-/** IoCtxDistributor is a singleton that distributes out IoCtx instances to
- * different threads.
- */
-class IoCtxDistributor
-{
-public:
-  static IoCtxDistributor* instance();
-  int init(librados::Rados &cluster, const char *pool_name, int num_ioctxes);
-  void clear();
-  librados::IoCtx& get_ioctx();
-private:
-  static IoCtxDistributor *s_instance;
-  IoCtxDistributor();
-  ~IoCtxDistributor();
-
-  ceph::atomic_t m_highest_iod_idx;
-
-  /* NB: there might be some false sharing here that we could optimize
-   * away in the future */
-  std::vector<librados::IoCtx> m_io_ctxes;
-};
-
-class RadosSyncWQ : public ThreadPool::WorkQueue<std::string> {
-public:
-  RadosSyncWQ(IoCtxDistributor *io_ctx_dist, time_t timeout, time_t suicide_timeout, ThreadPool *tp);
-protected:
-  IoCtxDistributor *m_io_ctx_dist;
-private:
-  bool _enqueue(std::string *s);
-  void _dequeue(std::string *o);
-  bool _empty();
-  std::string *_dequeue();
-  void _process_finish(std::string *s);
-  void _clear();
-  std::deque<std::string*> m_items;
-};
-
-/* Stores a length and a chunk of malloc()ed data */
-class Xattr {
-public:
-  Xattr(char *data_, ssize_t len_);
-  ~Xattr();
-  bool operator==(const class Xattr &rhs) const;
-  bool operator!=(const class Xattr &rhs) const;
-
-  char *data;
-  ssize_t len;
-};
-
-/* Represents an object that we are backing up */
-class BackedUpObject
-{
-public:
-  static int from_file(const char *file_name, const char *dir_name,
-			    std::auto_ptr<BackedUpObject> &obj);
-  static int from_path(const char *path, std::auto_ptr<BackedUpObject> &obj);
-  static int from_rados(librados::IoCtx& io_ctx, const char *rados_name_,
-			auto_ptr<BackedUpObject> &obj);
-  ~BackedUpObject();
-
-  /* Get the mangled name for this rados object. */
-  std::string get_fs_path(const ExportDir *export_dir) const;
-
-  /* Convert the xattrs on this BackedUpObject to a kind of JSON-like string.
-   * This is only used for debugging.
-   * Note that we're assuming we can just treat the xattr data as a
-   * null-terminated string, which isn't true. Again, this is just for debugging,
-   * so it doesn't matter.
-   */
-  std::string xattrs_to_str() const;
-
-  /* Diff the extended attributes on this BackedUpObject with those found on a
-   * different BackedUpObject
-   */
-  void xattr_diff(const BackedUpObject *rhs,
-		  std::list < std::string > &only_in_a,
-		  std::list < std::string > &only_in_b,
-		  std::list < std::string > &diff) const;
-
-  void get_xattrs(std::list < std::string > &xattrs_) const;
-
-  const Xattr* get_xattr(const std::string &name) const;
-
-  const char *get_rados_name() const;
-
-  uint64_t get_rados_size() const;
-
-  time_t get_mtime() const;
-
-  int download(librados::IoCtx &io_ctx, const char *path);
-
-  int upload(librados::IoCtx &io_ctx, const char *file_name, const char *dir_name);
-
-private:
-  BackedUpObject(const char *rados_name_, uint64_t rados_size_, time_t rados_time_);
-
-  int read_xattrs_from_file(int fd);
-
-  int read_xattrs_from_rados(librados::IoCtx &io_ctx);
-
-  // don't allow copying
-  BackedUpObject &operator=(const BackedUpObject &rhs);
-  BackedUpObject(const BackedUpObject &rhs);
-
-  char *rados_name;
-  uint64_t rados_size;
-  uint64_t rados_time;
-  std::map < std::string, Xattr* > xattrs;
-};
-
-extern int do_rados_import(ThreadPool *tp, librados::IoCtx &io_ctx,
-    IoCtxDistributor* io_ctx_dist, const char *dir_name,
-    bool force, bool delete_after);
-extern int do_rados_export(ThreadPool *tp, librados::IoCtx& io_ctx,
-    IoCtxDistributor *io_ctx_dist, const char *dir_name, 
-    bool create, bool force, bool delete_after);
-
-#endif
diff --git a/src/tools/rest_bench.cc b/src/tools/rest_bench.cc
deleted file mode 100644
index 6da5cf8..0000000
--- a/src/tools/rest_bench.cc
+++ /dev/null
@@ -1,802 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage at newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#include <deque>
-#include <errno.h>
-
-#include "libs3.h"
-
-#include "common/ceph_argparse.h"
-#include "common/debug.h"
-#include "common/obj_bencher.h"
-#include "common/WorkQueue.h"
-
-#include "include/types.h"
-#include "include/atomic.h"
-
-#include "global/global_init.h"
-#include "msg/Message.h"
-
-#define DEFAULT_USER_AGENT "rest-bench"
-#define DEFAULT_BUCKET "rest-bench-bucket"
-
-void usage(ostream& out)
-{
-  out <<					\
-"usage: rest-bench [options] <write|seq>\n"
-"       rest-bench [options] cleanup [--run-name run_name] [--prefix prefix]\n"
-"BENCHMARK OPTIONS\n"
-"   --seconds\n"
-"        benchmak length (default: 60)\n"
-"   -t concurrent_operations\n"
-"   --concurrent-ios=concurrent_operations\n"
-"        select bucket by name\n"
-"   -b op-size\n"
-"   --block-size=op-size\n"
-"        set the size of write ops for put or benchmarking\n"
-"   --show-time\n"
-"        prefix output lines with date and time\n"
-"   --no-cleanup\n"
-"        do not clean up data after write bench\n"
-"REST CONFIG OPTIONS\n"
-"   --api-host=bhost\n"
-"        host name\n"
-"   --bucket=bucket\n"
-"        select bucket by name\n"
-"   --access-key=access_key\n"
-"        access key to RESTful storage provider\n"
-"   --secret=secret_key\n"
-"        secret key for the specified access key\n"
-"   --protocol=<http|https>\n"
-"        protocol to be used (default: http)\n"
-"   --uri_style=<path|vhost>\n"
-"        uri style in requests (default: path)\n";
-}
-
-static void usage_exit()
-{
-  usage(cerr);
-  exit(1);
-}
-
-enum OpType {
-  OP_NONE    = 0,
-  OP_GET_OBJ = 1,
-  OP_PUT_OBJ = 2,
-  OP_DELETE_OBJ = 3,
-  OP_LIST_BUCKET = 4,
-  OP_CLEANUP = 5,
-};
-
-struct req_context : public RefCountedObject {
-  bool complete;
-  S3Status status;
-  S3RequestContext *ctx;
-  void (*cb)(void *, void *);
-  void *arg;
-  bufferlist *in_bl;
-  bufferlist out_bl;
-  uint64_t off;
-  uint64_t len;
-  const char *list_start;
-  std::list<std::string>* list_objects;
-  int list_count;
-  string oid;
-  Mutex lock;
-  Cond cond;
-  S3BucketContext *bucket_ctx;
-
-  bool should_destroy_ctx;
-
-  OpType op;
-
-  bool used;
-
-  req_context() : complete(false), status(S3StatusOK), ctx(NULL), cb(NULL), arg(NULL), in_bl(NULL), off(0), len(0),
-                  lock("req_context"), bucket_ctx(NULL), should_destroy_ctx(false), op(OP_NONE), used(false) {}
-  ~req_context() {
-    if (should_destroy_ctx) {
-      S3_destroy_request_context(ctx);
-    }
-  }
-
-  int init_ctx() {
-    S3Status status = S3_create_request_context(&ctx);
-    if (status != S3StatusOK) {
-      cerr << "failed to create context: " << S3_get_status_name(status) << std::endl;
-      return -EINVAL;
-    }
-    should_destroy_ctx = true;
-
-    return 0;
-  }
-
-  int ret() {
-    if (status != S3StatusOK) {
-      return -EINVAL;
-    }
-    return 0;
-  }
-};
-
-static S3Status properties_callback(const S3ResponseProperties *properties, void *cb_data)
-{
-  return S3StatusOK;
-}
-
-static void complete_callback(S3Status status, const S3ErrorDetails *details, void *cb_data)
-{
-  if (!cb_data)
-    return;
-
-  struct req_context *ctx = (struct req_context *)cb_data;
-
-  ctx->lock.Lock();
-  ctx->status = status;
-  ctx->lock.Unlock();
-
-  if (ctx->cb) {
-    ctx->cb((void *)ctx->cb, ctx->arg);
-  }
-
-  ctx->put();
-}
-
-static S3Status get_obj_callback(int size, const char *buf,
-                                 void *cb_data)
-{
-  if (!cb_data)
-    return S3StatusOK;
-
-  struct req_context *ctx = (struct req_context *)cb_data;
-
-  ctx->in_bl->append(buf, size);
-
-  return S3StatusOK;
-}
-
-static int put_obj_callback(int size, char *buf,
-                            void *cb_data)
-{
-  if (!cb_data)
-    return 0;
-
-  struct req_context *ctx = (struct req_context *)cb_data;
-
-  int chunk = ctx->out_bl.length() - ctx->off;
-  if (!chunk)
-    return 0;
-
-  if (chunk > size)
-    chunk = size;
-
-  memcpy(buf, ctx->out_bl.c_str() + ctx->off, chunk);
-
-  ctx->off += chunk;
-
-  return chunk;
-}
-
-static S3Status list_bucket_callback(int is_truncated, const char *next_marker,
-                                int count, const S3ListBucketContent *objects,
-                                int prefix_count, const char **prefixes,
-                                void *cb_data)
-{
-  if (!cb_data)
-    return S3StatusOK;
-
-  struct req_context *ctx = (struct req_context *)cb_data;
-
-  ctx->list_start = next_marker;
-
-  for (int i = 0; i < count; ++i) {
-    ctx->list_objects->push_back(objects[i].key);
-  }
-
-  return S3StatusOK;
-}
-
-class RESTDispatcher {
-  deque<req_context *> m_req_queue;
-  ThreadPool m_tp;
-
-  S3ResponseHandler response_handler;
-  S3GetObjectHandler get_obj_handler;
-  S3PutObjectHandler put_obj_handler;
-  S3ListBucketHandler list_bucket_handler;
-
-  struct DispatcherWQ : public ThreadPool::WorkQueue<req_context> {
-    RESTDispatcher *dispatcher;
-    DispatcherWQ(RESTDispatcher *p, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
-      : ThreadPool::WorkQueue<req_context>("REST", timeout, suicide_timeout, tp), dispatcher(p) {}
-
-    bool _enqueue(req_context *req) {
-      dispatcher->m_req_queue.push_back(req);
-      _dump_queue();
-      return true;
-    }
-    void _dequeue(req_context *req) {
-      assert(0);
-    }
-    bool _empty() {
-      return dispatcher->m_req_queue.empty();
-    }
-    req_context *_dequeue() {
-      if (dispatcher->m_req_queue.empty())
-	return NULL;
-      req_context *req = dispatcher->m_req_queue.front();
-      dispatcher->m_req_queue.pop_front();
-      _dump_queue();
-      return req;
-    }
-    void _process(req_context *req) {
-      dispatcher->process_context(req);
-    }
-    void _dump_queue() {
-      deque<req_context *>::iterator iter;
-      if (dispatcher->m_req_queue.empty()) {
-        generic_dout(20) << "DispatcherWQ: empty" << dendl;
-        return;
-      }
-      generic_dout(20) << "DispatcherWQ:" << dendl;
-      for (iter = dispatcher->m_req_queue.begin(); iter != dispatcher->m_req_queue.end(); ++iter) {
-        generic_dout(20) << "req: " << hex << *iter << dec << dendl;
-      }
-    }
-    void _clear() {
-      assert(dispatcher->m_req_queue.empty());
-    }
-  } req_wq;
-
-public:
-  CephContext *cct;
-  RESTDispatcher(CephContext *cct_, int num_threads)
-    : m_tp(cct_, "RESTDispatcher::m_tp", num_threads),
-      req_wq(this, cct_->_conf->rgw_op_thread_timeout,
-        cct_->_conf->rgw_op_thread_suicide_timeout, &m_tp),
-      cct(cct_) {
-
-
-    response_handler.propertiesCallback = properties_callback;
-    response_handler.completeCallback = complete_callback;
-
-    get_obj_handler.responseHandler = response_handler;
-    get_obj_handler.getObjectDataCallback = get_obj_callback;
-
-    put_obj_handler.responseHandler = response_handler;
-    put_obj_handler.putObjectDataCallback = put_obj_callback;
-
-    list_bucket_handler.responseHandler = response_handler;
-    list_bucket_handler.listBucketCallback = list_bucket_callback;
-
-  }
-  ~RESTDispatcher()
-  {
-    req_wq.drain();
-    m_tp.stop();
-  } 
-  void process_context(req_context *ctx);
-  void get_obj(req_context *ctx);
-  void put_obj(req_context *ctx);
-  void delete_obj(req_context *ctx);
-  void list_bucket(req_context *ctx);
-
-  void queue(req_context *ctx) {
-    req_wq.queue(ctx);
-  }
-
-  void start() {
-    m_tp.start();
-  }
-};
-
-void RESTDispatcher::process_context(req_context *ctx)
-{
-  ctx->get();
-
-  switch (ctx->op) {
-    case OP_GET_OBJ:
-      get_obj(ctx);
-      break;
-    case OP_PUT_OBJ:
-      put_obj(ctx);
-      break;
-    case OP_DELETE_OBJ:
-      delete_obj(ctx);
-      break;
-    case OP_LIST_BUCKET:
-      list_bucket(ctx);
-      break;
-    default:
-      assert(0);
-  }
-
-  S3Status status = S3_runall_request_context(ctx->ctx);
-
-  if (status != S3StatusOK) {
-    cerr << "ERROR: S3_runall_request_context() returned " << S3_get_status_name(status) << std::endl;
-    ctx->status = status;
-  } else if (ctx->status != S3StatusOK) {
-    cerr << "ERROR: " << ctx->oid << ": " << S3_get_status_name(ctx->status) << std::endl;
-  }
-
-  ctx->lock.Lock();
-  ctx->complete = true;
-  ctx->cond.SignalAll();
-  ctx->lock.Unlock();
-
-  ctx->put();
-}
-
-void RESTDispatcher::put_obj(req_context *ctx)
-{
-  S3_put_object(ctx->bucket_ctx, ctx->oid.c_str(),
-                ctx->out_bl.length(),
-                NULL,
-                ctx->ctx,
-                &put_obj_handler, ctx);
-}
-
-void RESTDispatcher::get_obj(req_context *ctx)
-{
-  S3_get_object(ctx->bucket_ctx, ctx->oid.c_str(), NULL, 0, ctx->len, ctx->ctx,
-                &get_obj_handler, ctx);
-}
-
-void RESTDispatcher::delete_obj(req_context *ctx)
-{
-
-  S3_delete_object(ctx->bucket_ctx, ctx->oid.c_str(),
-                   ctx->ctx, &response_handler, ctx);
-}
-
-void RESTDispatcher::list_bucket(req_context *ctx)
-{
-  S3_list_bucket(ctx->bucket_ctx,
-                 NULL, ctx->list_start,
-                 NULL, ctx->list_count,
-                 ctx->ctx,
-                 &list_bucket_handler, ctx);
-}
-
-class RESTBencher : public ObjBencher {
-  RESTDispatcher *dispatcher;
-  struct req_context **completions;
-  struct S3RequestContext **handles;
-  S3BucketContext bucket_ctx;
-  const char *list_start;
-  bool bucket_list_done;
-  string user_agent;
-  string host;
-  string bucket;
-  S3Protocol protocol;
-  string access_key;
-  string secret;
-  int concurrentios;
-
-protected:
-  int rest_init() {
-    S3Status status = S3_initialize(user_agent.c_str(), S3_INIT_ALL, host.c_str());
-    if (status != S3StatusOK) {
-      cerr << "failed to init: " << S3_get_status_name(status) << std::endl;
-      return -EINVAL;
-    }
-
-
-    return 0;
-  }
-
-
-  int completions_init(int _concurrentios) {
-    concurrentios = _concurrentios;
-    completions = new req_context *[concurrentios];
-    handles = new S3RequestContext *[concurrentios];
-    for (int i = 0; i < concurrentios; i++) {
-      completions[i] = NULL;
-      S3Status status = S3_create_request_context(&handles[i]);
-      if (status != S3StatusOK) {
-        cerr << "failed to create context: " << S3_get_status_name(status) << std::endl;
-        return -EINVAL;
-      }
-    }
-    return 0;
-  }
-  void completions_done() {
-    delete[] completions;
-    completions = NULL;
-    for (int i = 0; i < concurrentios; i++) {
-      S3_destroy_request_context(handles[i]);
-    }
-    delete[] handles;
-    handles = NULL;
-  }
-  int create_completion(int slot, void (*cb)(void *, void*), void *arg) {
-    assert (!completions[slot]);
-
-    struct req_context *ctx = new req_context;
-    ctx->ctx = handles[slot];
-    assert (!ctx->used);
-    ctx->used = true;
-    ctx->cb = cb;
-    ctx->arg = arg;
-
-    completions[slot] = ctx;
-
-    return 0;
-  }
-  void release_completion(int slot) {
-    struct req_context *ctx = completions[slot];
-
-    ctx->used = false;
-
-    ctx->put();
-    completions[slot] = 0;
-  }
-
-  int aio_read(const std::string& oid, int slot, bufferlist *pbl, size_t len) {
-    struct req_context *ctx = completions[slot];
-
-    ctx->get();
-    ctx->in_bl = pbl;
-    ctx->oid = oid;
-    ctx->len = len;
-    ctx->bucket_ctx = &bucket_ctx;
-    ctx->op = OP_GET_OBJ;
-
-    dispatcher->queue(ctx);
-
-    return 0;
-  }
-
-  int aio_write(const std::string& oid, int slot, bufferlist& bl, size_t len) {
-    struct req_context *ctx = completions[slot];
-
-    ctx->get();
-    ctx->bucket_ctx = &bucket_ctx;
-    ctx->out_bl = bl;
-    ctx->oid = oid;
-    ctx->len = len;
-    ctx->op = OP_PUT_OBJ;
-
-    dispatcher->queue(ctx);
-    return 0;
-  }
-
-  int aio_remove(const std::string& oid, int slot) {
-    struct req_context *ctx = completions[slot];
-
-    ctx->get();
-    ctx->bucket_ctx = &bucket_ctx;
-    ctx->oid = oid;
-    ctx->op = OP_DELETE_OBJ;
-
-    dispatcher->queue(ctx);
-    return 0;
-  }
-
-  int sync_read(const std::string& oid, bufferlist& bl, size_t len) {
-    struct req_context *ctx = new req_context;
-    int ret = ctx->init_ctx();
-    if (ret < 0) {
-      return ret;
-    }
-    ctx->in_bl = &bl;
-    ctx->get();
-    ctx->bucket_ctx = &bucket_ctx;
-    ctx->oid = oid;
-    ctx->len = len;
-    ctx->op = OP_GET_OBJ;
-
-    dispatcher->process_context(ctx);
-    ret = ctx->ret();
-    ctx->put();
-    return bl.length();
-  }
-  int sync_write(const std::string& oid, bufferlist& bl, size_t len) {
-    struct req_context *ctx = new req_context;
-    int ret = ctx->init_ctx();
-    if (ret < 0) {
-      return ret;
-    }
-    ctx->get();
-    ctx->out_bl = bl;
-    ctx->bucket_ctx = &bucket_ctx;
-    ctx->oid = oid;
-    ctx->op = OP_PUT_OBJ;
-
-    dispatcher->process_context(ctx);
-    ret = ctx->ret();
-    ctx->put();
-    return ret;
-  }
-  int sync_remove(const std::string& oid) {
-    struct req_context *ctx = new req_context;
-    int ret = ctx->init_ctx();
-    if (ret < 0) {
-      return ret;
-    }
-    ctx->get();
-    ctx->bucket_ctx = &bucket_ctx;
-    ctx->oid = oid;
-    ctx->op = OP_DELETE_OBJ;
-
-    dispatcher->process_context(ctx);
-    ret = ctx->ret();
-    ctx->put();
-    return ret;
-  }
-
-  bool get_objects(std::list<std::string>* objects, int num) {
-    if (bucket_list_done) {
-      bucket_list_done = false;
-      return false;
-    }
-
-    struct req_context *ctx = new req_context;
-    int ret = ctx->init_ctx();
-    if (ret < 0) {
-      return ret;
-    }
-    ctx->get();
-    ctx->bucket_ctx = &bucket_ctx;
-    ctx->list_start = list_start;
-    ctx->list_objects = objects;
-    ctx->list_count = num;
-    ctx->op = OP_LIST_BUCKET;
-
-    dispatcher->process_context(ctx);
-    ret = ctx->ret();
-
-    list_start = ctx->list_start;
-    if (list_start == NULL || strcmp(list_start, "") == 0) {
-      bucket_list_done = true;
-      list_start = NULL;
-    }
-
-    ctx->put();
-
-    return ret == 0;
-  }
-
-  bool completion_is_done(int slot) {
-    return completions[slot]->complete;
-  }
-
-  int completion_wait(int slot) {
-    req_context *ctx = completions[slot];
-
-    Mutex::Locker l(ctx->lock);
-
-    while (!ctx->complete) {
-      ctx->cond.Wait(ctx->lock);
-    }
-
-    return 0;
-  }
-
-  int completion_ret(int slot) {
-    S3Status status = completions[slot]->status;
-    if (status != S3StatusOK)
-      return -EIO;
-    return 0;
-  }
-
-public:
-  RESTBencher(RESTDispatcher *_dispatcher) :
-      ObjBencher(_dispatcher->cct),
-      dispatcher(_dispatcher),
-      completions(NULL),
-      list_start(NULL),
-      bucket_list_done(false)
-  {
-    dispatcher->start();
-  }
-  ~RESTBencher() { }
-
-  int init(string& _agent, string& _host, string& _bucket, S3Protocol _protocol,
-           S3UriStyle uri_style, string& _access_key, string& _secret) {
-    user_agent = _agent;
-    host = _host;
-    bucket = _bucket;
-    protocol = _protocol;
-    access_key = _access_key;
-    secret = _secret;
-
-    bucket_ctx.hostName = NULL; // host.c_str();
-    bucket_ctx.bucketName = bucket.c_str();
-    bucket_ctx.protocol =  protocol;
-    bucket_ctx.accessKeyId = access_key.c_str();
-    bucket_ctx.secretAccessKey = secret.c_str();
-    bucket_ctx.uriStyle = uri_style;
-    
-    struct req_context *ctx = new req_context;
-
-    int ret = rest_init();
-    if (ret < 0) {
-      return ret;
-    }
-
-    ret = ctx->init_ctx();
-    if (ret < 0) {
-      return ret;
-    }
-
-    ctx->get();
-
-    S3ResponseHandler response_handler;
-    response_handler.propertiesCallback = properties_callback;
-    response_handler.completeCallback = complete_callback;
-
-    S3_create_bucket(protocol, access_key.c_str(), secret.c_str(), NULL,
-                     bucket.c_str(), S3CannedAclPrivate,
-                     NULL, /* locationConstraint */
-                     NULL, /* requestContext */
-                     &response_handler, /* handler */
-                     (void *)ctx  /* callbackData */);
-
-    ret = ctx->ret();
-    if (ret < 0) {
-      cerr << "ERROR: failed to create bucket: " << S3_get_status_name(ctx->status) << std::endl;
-      return ret;
-    }
-
-    ctx->put();
-
-    return 0;
-  }
-};
-
-int main(int argc, const char **argv)
-{
-  vector<const char*> args;
-  argv_to_vec(argc, argv, args);
-  env_to_vec(args);
-
-  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
-  common_init_finish(g_ceph_context);
-
-  std::vector<const char*>::iterator i;
-  std::string host;
-  std::string val;
-  std::string user_agent;
-  std::string access_key;
-  std::string secret;
-  std::string bucket = DEFAULT_BUCKET;
-  S3Protocol protocol = S3ProtocolHTTP;
-  S3UriStyle uri_style = S3UriStylePath;
-  std::string proto_str;
-  int concurrent_ios = 16;
-  int op_size = 1 << 22;
-  int seconds = 60;
-
-  bool show_time = false;
-  bool cleanup = true;
-  std::string run_name;
-  std::string prefix;
-
-
-  for (i = args.begin(); i != args.end(); ) {
-    if (ceph_argparse_double_dash(args, i)) {
-      break;
-    } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
-      usage(cout);
-      exit(0);
-    } else if (ceph_argparse_flag(args, i, "--show-time", (char*)NULL)) {
-      show_time = true;
-    } else if (ceph_argparse_flag(args, i, "--no-cleanup", (char*)NULL)) {
-      cleanup = false;
-    } else if (ceph_argparse_witharg(args, i, &user_agent, "--agent", (char*)NULL)) {
-      /* nothing */
-    } else if (ceph_argparse_witharg(args, i, &access_key, "--access-key", (char*)NULL)) {
-      /* nothing */
-    } else if (ceph_argparse_witharg(args, i, &secret, "--secret", (char*)NULL)) {
-      /* nothing */
-    } else if (ceph_argparse_witharg(args, i, &bucket, "--bucket", (char*)NULL)) {
-      /* nothing */
-    } else if (ceph_argparse_witharg(args, i, &host, "--api-host", (char*)NULL)) {
-      cerr << "host=" << host << std::endl;
-      /* nothing */
-    } else if (ceph_argparse_witharg(args, i, &proto_str, "--protocol", (char*)NULL)) {
-      if (strcasecmp(proto_str.c_str(), "http") == 0) {
-        protocol = S3ProtocolHTTP;
-      } else if (strcasecmp(proto_str.c_str(), "http") == 0) {
-        protocol = S3ProtocolHTTPS;
-      } else {
-        cerr << "bad protocol" << std::endl;
-        usage_exit();
-      }
-      /* nothing */
-    } else if (ceph_argparse_witharg(args, i, &proto_str, "--uri-style", (char*)NULL)) {
-      if (strcasecmp(proto_str.c_str(), "vhost") == 0) {
-        uri_style = S3UriStyleVirtualHost;
-      } else if (strcasecmp(proto_str.c_str(), "path") == 0) {
-        uri_style = S3UriStylePath;
-      } else {
-        cerr << "bad protocol" << std::endl;
-        usage_exit();
-      }
-    } else if (ceph_argparse_witharg(args, i, &val, "-t", "--concurrent-ios", (char*)NULL)) {
-      concurrent_ios = strtol(val.c_str(), NULL, 10);
-    } else if (ceph_argparse_witharg(args, i, &val, "--run-name", (char*)NULL)) {
-      run_name = val;
-    } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) {
-      prefix = val;
-    } else if (ceph_argparse_witharg(args, i, &val, "--seconds", (char*)NULL)) {
-      seconds = strtol(val.c_str(), NULL, 10);
-    } else if (ceph_argparse_witharg(args, i, &val, "-b", "--block-size", (char*)NULL)) {
-      op_size = strtol(val.c_str(), NULL, 10);
-    } else {
-      if (val[0] == '-')
-        usage_exit();
-      ++i;
-    }
-  }
-
-  if (args.empty())
-    usage_exit();
-  int operation = 0;
-  if (strcmp(args[0], "write") == 0)
-    operation = OP_WRITE;
-  else if (strcmp(args[0], "seq") == 0)
-    operation = OP_SEQ_READ;
-  else if (strcmp(args[0], "rand") == 0)
-    operation = OP_RAND_READ;
-  else if (strcmp(args[0], "cleanup") == 0) {
-    operation = OP_CLEANUP;
-  } else
-    usage_exit();
-
-  if (host.empty()) {
-    cerr << "rest-bench: api host not provided." << std::endl;
-    usage_exit();
-  }
-
-  if (access_key.empty() || secret.empty()) {
-    cerr << "rest-bench: access key or secret was not provided" << std::endl;
-    usage_exit();
-  }
-
-  if (bucket.empty()) {
-    bucket = DEFAULT_BUCKET;
-  }
-
-  if (user_agent.empty())
-    user_agent = DEFAULT_USER_AGENT;
-
-  RESTDispatcher dispatcher(g_ceph_context, concurrent_ios);
-
-  RESTBencher bencher(&dispatcher);
-  bencher.set_show_time(show_time);
-
-  int ret = bencher.init(user_agent, host, bucket, protocol, uri_style, access_key, secret);
-  if (ret < 0) {
-    cerr << "failed initializing benchmark" << std::endl;
-    exit(1);
-  }
-
-  if (operation == OP_CLEANUP) {
-    ret = bencher.clean_up(prefix.c_str(), concurrent_ios, run_name.c_str());
-    if (ret != 0)
-      cerr << "error during cleanup: " << ret << std::endl;
-  } else {
-    ret = bencher.aio_bench(operation, seconds, 0,
-			    concurrent_ios, op_size, cleanup, run_name.c_str());
-    if (ret != 0) {
-        cerr << "error during benchmark: " << ret << std::endl;
-    }
-  }
-
-  return 0;
-}
-
diff --git a/src/tracing/Makefile.am b/src/tracing/Makefile.am
index 16d300e..5c6a4e2 100644
--- a/src/tracing/Makefile.am
+++ b/src/tracing/Makefile.am
@@ -1,79 +1,78 @@
-EXTRA_DIST = tracing-common.h
+EXTRA_DIST += \
+	tracing/tracing-common.h
 
 if WITH_LTTNG
-%.c %.h: %.tp
-	$(LTTNG_GEN_TP_PROG) $< -o $*.c -o $*.h
+tracing/%.h: tracing/%.tp
+	$(LTTNG_GEN_TP_PROG) $< -o tracing/$*.h
 endif
 
-dist_noinst_DATA = \
-	librados.tp \
-	librbd.tp \
-	oprequest.tp \
-	osd.tp \
-	pg.tp \
-	objectstore.tp
+dist_noinst_DATA += \
+	tracing/librados.tp \
+	tracing/librbd.tp \
+	tracing/oprequest.tp \
+	tracing/osd.tp \
+	tracing/pg.tp \
+	tracing/objectstore.tp
 
 if WITH_LTTNG
+libosd_tp_la_SOURCES = \
+	tracing/oprequest.c \
+	tracing/osd.c \
+	tracing/pg.c
 nodist_libosd_tp_la_SOURCES = \
-	oprequest.c \
-	oprequest.h \
-	osd.c \
-	osd.h \
-	pg.h \
-	pg.c
+	tracing/oprequest.h \
+	tracing/osd.h \
+	tracing/pg.h
 endif
 libosd_tp_la_LIBADD = -llttng-ust -ldl
-libosd_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-libosd_tp_la_LDFLAGS =
+libosd_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+libosd_tp_la_LDFLAGS = -version-info 1:0:0
 
 if WITH_LTTNG
+librados_tp_la_SOURCES = \
+	tracing/librados.c
 nodist_librados_tp_la_SOURCES = \
-	librados.c \
-	librados.h
+	tracing/librados.h
 endif
 librados_tp_la_LIBADD = -llttng-ust -ldl
-librados_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-librados_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS)
-librados_tp_la_LDFLAGS =
+librados_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+librados_tp_la_LDFLAGS = -version-info 2:0:0
 
 if WITH_LTTNG
+librbd_tp_la_SOURCES = \
+	tracing/librbd.c
 nodist_librbd_tp_la_SOURCES = \
-	librbd.c \
-	librbd.h
+	tracing/librbd.h
 endif
 librbd_tp_la_LIBADD = -llttng-ust -ldl
-librbd_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-librbd_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS)
-librbd_tp_la_LDFLAGS =
+librbd_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+librbd_tp_la_LDFLAGS = -version-info 1:0:0
 
 if WITH_LTTNG
+libos_tp_la_SOURCES = \
+	tracing/objectstore.c
 nodist_libos_tp_la_SOURCES = \
-	objectstore.c \
-	objectstore.h
+	tracing/objectstore.h
 endif
 libos_tp_la_LIBADD = -llttng-ust -ldl
-libos_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-libos_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS)
-libos_tp_la_LDFLAGS =
+libos_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic
+libos_tp_la_LDFLAGS = -version-info 1:0:0
 
 if WITH_LTTNG
-noinst_LTLIBRARIES = \
+lib_LTLIBRARIES += \
 	libosd_tp.la \
+	libos_tp.la \
 	librados_tp.la \
-	librbd_tp.la \
-	libos_tp.la
+	librbd_tp.la
 
-BUILT_SOURCES = \
-	librados.h \
-	librbd.h \
-	oprequest.h \
-	osd.h \
-	pg.h \
-	objectstore.h
+BUILT_SOURCES += \
+	tracing/librados.h \
+	tracing/librbd.h \
+	tracing/objectstore.h \
+	tracing/oprequest.h \
+	tracing/osd.h \
+	tracing/pg.h
 endif
 
-CLEANFILES = \
-	$(nodist_libosd_tp_la_SOURCES) \
-	$(nodist_librados_tp_la_SOURCES) \
-	$(nodist_librbd_tp_la_SOURCES) \
-	$(nodist_libos_tp_la_SOURCES)
+CLEANFILES += \
+	$(BUILT_SOURCES)
diff --git a/src/tracing/Makefile.in b/src/tracing/Makefile.in
deleted file mode 100644
index 13c3458..0000000
--- a/src/tracing/Makefile.in
+++ /dev/null
@@ -1,818 +0,0 @@
-# Makefile.in generated by automake 1.14.1 from Makefile.am.
-# @configure_input@
-
-# Copyright (C) 1994-2013 Free Software Foundation, Inc.
-
-# This Makefile.in is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
-
- at SET_MAKE@
-
-
-VPATH = @srcdir@
-am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
-am__make_running_with_option = \
-  case $${target_option-} in \
-      ?) ;; \
-      *) echo "am__make_running_with_option: internal error: invalid" \
-              "target option '$${target_option-}' specified" >&2; \
-         exit 1;; \
-  esac; \
-  has_opt=no; \
-  sane_makeflags=$$MAKEFLAGS; \
-  if $(am__is_gnu_make); then \
-    sane_makeflags=$$MFLAGS; \
-  else \
-    case $$MAKEFLAGS in \
-      *\\[\ \	]*) \
-        bs=\\; \
-        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
-          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
-    esac; \
-  fi; \
-  skip_next=no; \
-  strip_trailopt () \
-  { \
-    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
-  }; \
-  for flg in $$sane_makeflags; do \
-    test $$skip_next = yes && { skip_next=no; continue; }; \
-    case $$flg in \
-      *=*|--*) continue;; \
-        -*I) strip_trailopt 'I'; skip_next=yes;; \
-      -*I?*) strip_trailopt 'I';; \
-        -*O) strip_trailopt 'O'; skip_next=yes;; \
-      -*O?*) strip_trailopt 'O';; \
-        -*l) strip_trailopt 'l'; skip_next=yes;; \
-      -*l?*) strip_trailopt 'l';; \
-      -[dEDm]) skip_next=yes;; \
-      -[JT]) skip_next=yes;; \
-    esac; \
-    case $$flg in \
-      *$$target_option*) has_opt=yes; break;; \
-    esac; \
-  done; \
-  test $$has_opt = yes
-am__make_dryrun = (target_option=n; $(am__make_running_with_option))
-am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
-pkgdatadir = $(datadir)/@PACKAGE@
-pkgincludedir = $(includedir)/@PACKAGE@
-pkglibdir = $(libdir)/@PACKAGE@
-pkglibexecdir = $(libexecdir)/@PACKAGE@
-am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
-install_sh_DATA = $(install_sh) -c -m 644
-install_sh_PROGRAM = $(install_sh) -c
-install_sh_SCRIPT = $(install_sh) -c
-INSTALL_HEADER = $(INSTALL_DATA)
-transform = $(program_transform_name)
-NORMAL_INSTALL = :
-PRE_INSTALL = :
-POST_INSTALL = :
-NORMAL_UNINSTALL = :
-PRE_UNINSTALL = :
-POST_UNINSTALL = :
-build_triplet = @build@
-host_triplet = @host@
-target_triplet = @target@
-subdir = src/tracing
-DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
-	$(top_srcdir)/depcomp $(dist_noinst_DATA)
-ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
-	$(top_srcdir)/m4/ac_prog_jar.m4 \
-	$(top_srcdir)/m4/ac_prog_javac.m4 \
-	$(top_srcdir)/m4/ac_prog_javac_works.m4 \
-	$(top_srcdir)/m4/ac_prog_javah.m4 \
-	$(top_srcdir)/m4/acx_pthread.m4 $(top_srcdir)/m4/ax_arm.m4 \
-	$(top_srcdir)/m4/ax_c_pretty_func.m4 \
-	$(top_srcdir)/m4/ax_c_var_func.m4 \
-	$(top_srcdir)/m4/ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
-	$(top_srcdir)/m4/ax_cxx_static_cast.m4 \
-	$(top_srcdir)/m4/ax_intel.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
-am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
-	$(ACLOCAL_M4)
-mkinstalldirs = $(install_sh) -d
-CONFIG_HEADER = $(top_builddir)/src/acconfig.h
-CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
-LTLIBRARIES = $(noinst_LTLIBRARIES)
-libos_tp_la_DEPENDENCIES =
- at WITH_LTTNG_TRUE@nodist_libos_tp_la_OBJECTS =  \
- at WITH_LTTNG_TRUE@	libos_tp_la-objectstore.lo
-libos_tp_la_OBJECTS = $(nodist_libos_tp_la_OBJECTS)
-AM_V_lt = $(am__v_lt_ at AM_V@)
-am__v_lt_ = $(am__v_lt_ at AM_DEFAULT_V@)
-am__v_lt_0 = --silent
-am__v_lt_1 = 
-libos_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libos_tp_la_CFLAGS) \
-	$(CFLAGS) $(libos_tp_la_LDFLAGS) $(LDFLAGS) -o $@
- at WITH_LTTNG_TRUE@am_libos_tp_la_rpath =
-libosd_tp_la_DEPENDENCIES =
- at WITH_LTTNG_TRUE@nodist_libosd_tp_la_OBJECTS =  \
- at WITH_LTTNG_TRUE@	libosd_tp_la-oprequest.lo libosd_tp_la-osd.lo \
- at WITH_LTTNG_TRUE@	libosd_tp_la-pg.lo
-libosd_tp_la_OBJECTS = $(nodist_libosd_tp_la_OBJECTS)
-libosd_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
-	$(libosd_tp_la_LDFLAGS) $(LDFLAGS) -o $@
- at WITH_LTTNG_TRUE@am_libosd_tp_la_rpath =
-librados_tp_la_DEPENDENCIES =
- at WITH_LTTNG_TRUE@nodist_librados_tp_la_OBJECTS =  \
- at WITH_LTTNG_TRUE@	librados_tp_la-librados.lo
-librados_tp_la_OBJECTS = $(nodist_librados_tp_la_OBJECTS)
-librados_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
-	$(librados_tp_la_CFLAGS) $(CFLAGS) $(librados_tp_la_LDFLAGS) \
-	$(LDFLAGS) -o $@
- at WITH_LTTNG_TRUE@am_librados_tp_la_rpath =
-librbd_tp_la_DEPENDENCIES =
- at WITH_LTTNG_TRUE@nodist_librbd_tp_la_OBJECTS = librbd_tp_la-librbd.lo
-librbd_tp_la_OBJECTS = $(nodist_librbd_tp_la_OBJECTS)
-librbd_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(librbd_tp_la_CFLAGS) \
-	$(CFLAGS) $(librbd_tp_la_LDFLAGS) $(LDFLAGS) -o $@
- at WITH_LTTNG_TRUE@am_librbd_tp_la_rpath =
-AM_V_P = $(am__v_P_ at AM_V@)
-am__v_P_ = $(am__v_P_ at AM_DEFAULT_V@)
-am__v_P_0 = false
-am__v_P_1 = :
-AM_V_GEN = $(am__v_GEN_ at AM_V@)
-am__v_GEN_ = $(am__v_GEN_ at AM_DEFAULT_V@)
-am__v_GEN_0 = @echo "  GEN     " $@;
-am__v_GEN_1 = 
-AM_V_at = $(am__v_at_ at AM_V@)
-am__v_at_ = $(am__v_at_ at AM_DEFAULT_V@)
-am__v_at_0 = @
-am__v_at_1 = 
-DEFAULT_INCLUDES = -I. at am__isrc@ -I$(top_builddir)/src
-depcomp = $(SHELL) $(top_srcdir)/depcomp
-am__depfiles_maybe = depfiles
-am__mv = mv -f
-COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
-	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
-	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
-	$(AM_CFLAGS) $(CFLAGS)
-AM_V_CC = $(am__v_CC_ at AM_V@)
-am__v_CC_ = $(am__v_CC_ at AM_DEFAULT_V@)
-am__v_CC_0 = @echo "  CC      " $@;
-am__v_CC_1 = 
-CCLD = $(CC)
-LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
-	$(AM_LDFLAGS) $(LDFLAGS) -o $@
-AM_V_CCLD = $(am__v_CCLD_ at AM_V@)
-am__v_CCLD_ = $(am__v_CCLD_ at AM_DEFAULT_V@)
-am__v_CCLD_0 = @echo "  CCLD    " $@;
-am__v_CCLD_1 = 
-SOURCES = $(nodist_libos_tp_la_SOURCES) $(nodist_libosd_tp_la_SOURCES) \
-	$(nodist_librados_tp_la_SOURCES) \
-	$(nodist_librbd_tp_la_SOURCES)
-DIST_SOURCES =
-am__can_run_installinfo = \
-  case $$AM_UPDATE_INFO_DIR in \
-    n|no|NO) false;; \
-    *) (install-info --version) >/dev/null 2>&1;; \
-  esac
-DATA = $(dist_noinst_DATA)
-am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-# Read a list of newline-separated strings from the standard input,
-# and print each of them once, without duplicates.  Input order is
-# *not* preserved.
-am__uniquify_input = $(AWK) '\
-  BEGIN { nonempty = 0; } \
-  { items[$$0] = 1; nonempty = 1; } \
-  END { if (nonempty) { for (i in items) print i; }; } \
-'
-# Make sure the list of sources is unique.  This is necessary because,
-# e.g., the same source file might be shared among _SOURCES variables
-# for different programs/libraries.
-am__define_uniq_tagged_files = \
-  list='$(am__tagged_files)'; \
-  unique=`for i in $$list; do \
-    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-  done | $(am__uniquify_input)`
-ETAGS = etags
-CTAGS = ctags
-DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
-ACLOCAL = @ACLOCAL@
-AMTAR = @AMTAR@
-AM_CXXFLAGS = @AM_CXXFLAGS@
-AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
-AR = @AR@
-ARM_FLAGS = @ARM_FLAGS@
-ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
-AUTOCONF = @AUTOCONF@
-AUTOHEADER = @AUTOHEADER@
-AUTOMAKE = @AUTOMAKE@
-AWK = @AWK@
-BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
-BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
-CC = @CC@
-CCAS = @CCAS@
-CCASDEPMODE = @CCASDEPMODE@
-CCASFLAGS = @CCASFLAGS@
-CCDEPMODE = @CCDEPMODE@
-CFLAGS = @CFLAGS@
-CPP = @CPP@
-CPPFLAGS = @CPPFLAGS@
-CRYPTOPP_CFLAGS = @CRYPTOPP_CFLAGS@
-CRYPTOPP_LIBS = @CRYPTOPP_LIBS@
-CRYPTO_CFLAGS = @CRYPTO_CFLAGS@
-CRYPTO_LIBS = @CRYPTO_LIBS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
-CYGPATH_W = @CYGPATH_W@
-DEFS = @DEFS@
-DEPDIR = @DEPDIR@
-DLLTOOL = @DLLTOOL@
-DSYMUTIL = @DSYMUTIL@
-DUMPBIN = @DUMPBIN@
-ECHO_C = @ECHO_C@
-ECHO_N = @ECHO_N@
-ECHO_T = @ECHO_T@
-EGREP = @EGREP@
-EXEEXT = @EXEEXT@
-EXTRA_CLASSPATH_JAR = @EXTRA_CLASSPATH_JAR@
-FGREP = @FGREP@
-GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
-GIT_CHECK = @GIT_CHECK@
-GREP = @GREP@
-HAVE_CXX11 = @HAVE_CXX11@
-HAVE_VALGRIND = @HAVE_VALGRIND@
-INSTALL = @INSTALL@
-INSTALL_DATA = @INSTALL_DATA@
-INSTALL_PROGRAM = @INSTALL_PROGRAM@
-INSTALL_SCRIPT = @INSTALL_SCRIPT@
-INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
-INTEL_FLAGS = @INTEL_FLAGS@
-INTEL_PCLMUL_FLAGS = @INTEL_PCLMUL_FLAGS@
-INTEL_SSE2_FLAGS = @INTEL_SSE2_FLAGS@
-INTEL_SSE3_FLAGS = @INTEL_SSE3_FLAGS@
-INTEL_SSE4_1_FLAGS = @INTEL_SSE4_1_FLAGS@
-INTEL_SSE4_2_FLAGS = @INTEL_SSE4_2_FLAGS@
-INTEL_SSE_FLAGS = @INTEL_SSE_FLAGS@
-INTEL_SSSE3_FLAGS = @INTEL_SSSE3_FLAGS@
-JAR = @JAR@
-JAVAC = @JAVAC@
-JAVAH = @JAVAH@
-JDK_CPPFLAGS = @JDK_CPPFLAGS@
-KEYUTILS_LIB = @KEYUTILS_LIB@
-LD = @LD@
-LDFLAGS = @LDFLAGS@
-LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
-LIBEDIT_LIBS = @LIBEDIT_LIBS@
-LIBFUSE = @LIBFUSE@
-LIBJEMALLOC = @LIBJEMALLOC@
-LIBOBJS = @LIBOBJS@
-LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
-LIBROCKSDB_LIBS = @LIBROCKSDB_LIBS@
-LIBS = @LIBS@
-LIBTCMALLOC = @LIBTCMALLOC@
-LIBTOOL = @LIBTOOL@
-LIBZFS_CFLAGS = @LIBZFS_CFLAGS@
-LIBZFS_LIBS = @LIBZFS_LIBS@
-LIPO = @LIPO@
-LN_S = @LN_S@
-LTLIBOBJS = @LTLIBOBJS@
-LTTNG_GEN_TP_CHECK = @LTTNG_GEN_TP_CHECK@
-LTTNG_GEN_TP_PROG = @LTTNG_GEN_TP_PROG@
-MAKEINFO = @MAKEINFO@
-MANIFEST_TOOL = @MANIFEST_TOOL@
-MKDIR_P = @MKDIR_P@
-NM = @NM@
-NMEDIT = @NMEDIT@
-NSS_CFLAGS = @NSS_CFLAGS@
-NSS_LIBS = @NSS_LIBS@
-OBJDUMP = @OBJDUMP@
-OBJEXT = @OBJEXT@
-OTOOL = @OTOOL@
-OTOOL64 = @OTOOL64@
-PACKAGE = @PACKAGE@
-PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
-PACKAGE_NAME = @PACKAGE_NAME@
-PACKAGE_STRING = @PACKAGE_STRING@
-PACKAGE_TARNAME = @PACKAGE_TARNAME@
-PACKAGE_URL = @PACKAGE_URL@
-PACKAGE_VERSION = @PACKAGE_VERSION@
-PATH_SEPARATOR = @PATH_SEPARATOR@
-PKG_CONFIG = @PKG_CONFIG@
-PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
-PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
-PTHREAD_CC = @PTHREAD_CC@
-PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
-PTHREAD_LIBS = @PTHREAD_LIBS@
-PYTHON = @PYTHON@
-PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@
-PYTHON_PLATFORM = @PYTHON_PLATFORM@
-PYTHON_PREFIX = @PYTHON_PREFIX@
-PYTHON_VERSION = @PYTHON_VERSION@
-RANLIB = @RANLIB@
-RESOLV_LIBS = @RESOLV_LIBS@
-RPM_RELEASE = @RPM_RELEASE@
-SED = @SED@
-SET_MAKE = @SET_MAKE@
-SHELL = @SHELL@
-STRIP = @STRIP@
-VERSION = @VERSION@
-WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
-WARN_IGNORED_QUALIFIERS = @WARN_IGNORED_QUALIFIERS@
-WARN_TYPE_LIMITS = @WARN_TYPE_LIMITS@
-XIO_LIBS = @XIO_LIBS@
-YASM_CHECK = @YASM_CHECK@
-abs_builddir = @abs_builddir@
-abs_srcdir = @abs_srcdir@
-abs_top_builddir = @abs_top_builddir@
-abs_top_srcdir = @abs_top_srcdir@
-ac_ct_AR = @ac_ct_AR@
-ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
-ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
-acx_pthread_config = @acx_pthread_config@
-am__include = @am__include@
-am__leading_dot = @am__leading_dot@
-am__quote = @am__quote@
-am__tar = @am__tar@
-am__untar = @am__untar@
-bindir = @bindir@
-build = @build@
-build_alias = @build_alias@
-build_cpu = @build_cpu@
-build_os = @build_os@
-build_vendor = @build_vendor@
-builddir = @builddir@
-datadir = @datadir@
-datarootdir = @datarootdir@
-docdir = @docdir@
-dvidir = @dvidir@
-exec_prefix = @exec_prefix@
-host = @host@
-host_alias = @host_alias@
-host_cpu = @host_cpu@
-host_os = @host_os@
-host_vendor = @host_vendor@
-htmldir = @htmldir@
-includedir = @includedir@
-infodir = @infodir@
-install_sh = @install_sh@
-libdir = @libdir@
-libexecdir = @libexecdir@
-localedir = @localedir@
-localstatedir = @localstatedir@
-mandir = @mandir@
-mkdir_p = @mkdir_p@
-oldincludedir = @oldincludedir@
-pdfdir = @pdfdir@
-pkgpyexecdir = @pkgpyexecdir@
-pkgpythondir = @pkgpythondir@
-prefix = @prefix@
-program_transform_name = @program_transform_name@
-psdir = @psdir@
-pyexecdir = @pyexecdir@
-pythondir = @pythondir@
-sbindir = @sbindir@
-sharedstatedir = @sharedstatedir@
-srcdir = @srcdir@
-subdirs = @subdirs@
-sysconfdir = @sysconfdir@
-target = @target@
-target_alias = @target_alias@
-target_cpu = @target_cpu@
-target_os = @target_os@
-target_vendor = @target_vendor@
-top_build_prefix = @top_build_prefix@
-top_builddir = @top_builddir@
-top_srcdir = @top_srcdir@
-EXTRA_DIST = tracing-common.h
-dist_noinst_DATA = \
-	librados.tp \
-	librbd.tp \
-	oprequest.tp \
-	osd.tp \
-	pg.tp \
-	objectstore.tp
-
- at WITH_LTTNG_TRUE@nodist_libosd_tp_la_SOURCES = \
- at WITH_LTTNG_TRUE@	oprequest.c \
- at WITH_LTTNG_TRUE@	oprequest.h \
- at WITH_LTTNG_TRUE@	osd.c \
- at WITH_LTTNG_TRUE@	osd.h \
- at WITH_LTTNG_TRUE@	pg.h \
- at WITH_LTTNG_TRUE@	pg.c
-
-libosd_tp_la_LIBADD = -llttng-ust -ldl
-libosd_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-libosd_tp_la_LDFLAGS = 
- at WITH_LTTNG_TRUE@nodist_librados_tp_la_SOURCES = \
- at WITH_LTTNG_TRUE@	librados.c \
- at WITH_LTTNG_TRUE@	librados.h
-
-librados_tp_la_LIBADD = -llttng-ust -ldl
-librados_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-librados_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS)
-librados_tp_la_LDFLAGS = 
- at WITH_LTTNG_TRUE@nodist_librbd_tp_la_SOURCES = \
- at WITH_LTTNG_TRUE@	librbd.c \
- at WITH_LTTNG_TRUE@	librbd.h
-
-librbd_tp_la_LIBADD = -llttng-ust -ldl
-librbd_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-librbd_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS)
-librbd_tp_la_LDFLAGS = 
- at WITH_LTTNG_TRUE@nodist_libos_tp_la_SOURCES = \
- at WITH_LTTNG_TRUE@	objectstore.c \
- at WITH_LTTNG_TRUE@	objectstore.h
-
-libos_tp_la_LIBADD = -llttng-ust -ldl
-libos_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE
-libos_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS)
-libos_tp_la_LDFLAGS = 
- at WITH_LTTNG_TRUE@noinst_LTLIBRARIES = \
- at WITH_LTTNG_TRUE@	libosd_tp.la \
- at WITH_LTTNG_TRUE@	librados_tp.la \
- at WITH_LTTNG_TRUE@	librbd_tp.la \
- at WITH_LTTNG_TRUE@	libos_tp.la
-
- at WITH_LTTNG_TRUE@BUILT_SOURCES = \
- at WITH_LTTNG_TRUE@	librados.h \
- at WITH_LTTNG_TRUE@	librbd.h \
- at WITH_LTTNG_TRUE@	oprequest.h \
- at WITH_LTTNG_TRUE@	osd.h \
- at WITH_LTTNG_TRUE@	pg.h \
- at WITH_LTTNG_TRUE@	objectstore.h
-
-CLEANFILES = \
-	$(nodist_libosd_tp_la_SOURCES) \
-	$(nodist_librados_tp_la_SOURCES) \
-	$(nodist_librbd_tp_la_SOURCES) \
-	$(nodist_libos_tp_la_SOURCES)
-
-all: $(BUILT_SOURCES)
-	$(MAKE) $(AM_MAKEFLAGS) all-am
-
-.SUFFIXES:
-.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
-	@for dep in $?; do \
-	  case '$(am__configure_deps)' in \
-	    *$$dep*) \
-	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
-	        && { if test -f $@; then exit 0; else break; fi; }; \
-	      exit 1;; \
-	  esac; \
-	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/tracing/Makefile'; \
-	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign src/tracing/Makefile
-.PRECIOUS: Makefile
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
-	@case '$?' in \
-	  *config.status*) \
-	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
-	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
-	esac;
-
-$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-
-$(top_srcdir)/configure:  $(am__configure_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
-
-clean-noinstLTLIBRARIES:
-	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
-	@list='$(noinst_LTLIBRARIES)'; \
-	locs=`for p in $$list; do echo $$p; done | \
-	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
-	      sort -u`; \
-	test -z "$$locs" || { \
-	  echo rm -f $${locs}; \
-	  rm -f $${locs}; \
-	}
-
-libos_tp.la: $(libos_tp_la_OBJECTS) $(libos_tp_la_DEPENDENCIES) $(EXTRA_libos_tp_la_DEPENDENCIES) 
-	$(AM_V_CCLD)$(libos_tp_la_LINK) $(am_libos_tp_la_rpath) $(libos_tp_la_OBJECTS) $(libos_tp_la_LIBADD) $(LIBS)
-
-libosd_tp.la: $(libosd_tp_la_OBJECTS) $(libosd_tp_la_DEPENDENCIES) $(EXTRA_libosd_tp_la_DEPENDENCIES) 
-	$(AM_V_CCLD)$(libosd_tp_la_LINK) $(am_libosd_tp_la_rpath) $(libosd_tp_la_OBJECTS) $(libosd_tp_la_LIBADD) $(LIBS)
-
-librados_tp.la: $(librados_tp_la_OBJECTS) $(librados_tp_la_DEPENDENCIES) $(EXTRA_librados_tp_la_DEPENDENCIES) 
-	$(AM_V_CCLD)$(librados_tp_la_LINK) $(am_librados_tp_la_rpath) $(librados_tp_la_OBJECTS) $(librados_tp_la_LIBADD) $(LIBS)
-
-librbd_tp.la: $(librbd_tp_la_OBJECTS) $(librbd_tp_la_DEPENDENCIES) $(EXTRA_librbd_tp_la_DEPENDENCIES) 
-	$(AM_V_CCLD)$(librbd_tp_la_LINK) $(am_librbd_tp_la_rpath) $(librbd_tp_la_OBJECTS) $(librbd_tp_la_LIBADD) $(LIBS)
-
-mostlyclean-compile:
-	-rm -f *.$(OBJEXT)
-
-distclean-compile:
-	-rm -f *.tab.c
-
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libos_tp_la-objectstore.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libosd_tp_la-oprequest.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libosd_tp_la-osd.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libosd_tp_la-pg.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/librados_tp_la-librados.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/librbd_tp_la-librbd.Plo at am__quote@
-
-.c.o:
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(COMPILE) -c -o $@ $<
-
-.c.obj:
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
-
-.c.lo:
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LTCOMPILE) -c -o $@ $<
-
-libos_tp_la-objectstore.lo: objectstore.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libos_tp_la_CPPFLAGS) $(CPPFLAGS) $(libos_tp_la_CFLAGS) $(CFLAGS) -MT libos_tp_la-objectstore.lo -MD -MP -MF $(DEPDIR)/libos_tp_la-objectstore.Tpo -c -o libos_tp_la-objectstore.lo `test -f 'objectstore.c' || echo '$(srcdir)/'`objectstore.c
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libos_tp_la-objectstore.Tpo $(DEPDIR)/libos_tp_la-objectstore.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='objectstore.c' object='libos_tp_la-objectstore.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libos_tp_la_CPPFLAGS) $(CPPFLAGS) $(libos_tp_la_CFLAGS) $(CFLAGS) -c -o libos_tp_la-objectstore.lo `test -f 'objectstore.c' || echo '$(srcdir)/'`objectstore.c
-
-libosd_tp_la-oprequest.lo: oprequest.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libosd_tp_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libosd_tp_la-oprequest.lo -MD -MP -MF $(DEPDIR)/libosd_tp_la-oprequest.Tpo -c -o libosd_tp_la-oprequest.lo `test -f 'oprequest.c' || echo '$(srcdir)/'`oprequest.c
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libosd_tp_la-oprequest.Tpo $(DEPDIR)/libosd_tp_la-oprequest.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='oprequest.c' object='libosd_tp_la-oprequest.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libosd_tp_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libosd_tp_la-oprequest.lo `test -f 'oprequest.c' || echo '$(srcdir)/'`oprequest.c
-
-libosd_tp_la-osd.lo: osd.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libosd_tp_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libosd_tp_la-osd.lo -MD -MP -MF $(DEPDIR)/libosd_tp_la-osd.Tpo -c -o libosd_tp_la-osd.lo `test -f 'osd.c' || echo '$(srcdir)/'`osd.c
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libosd_tp_la-osd.Tpo $(DEPDIR)/libosd_tp_la-osd.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='osd.c' object='libosd_tp_la-osd.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libosd_tp_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libosd_tp_la-osd.lo `test -f 'osd.c' || echo '$(srcdir)/'`osd.c
-
-libosd_tp_la-pg.lo: pg.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libosd_tp_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libosd_tp_la-pg.lo -MD -MP -MF $(DEPDIR)/libosd_tp_la-pg.Tpo -c -o libosd_tp_la-pg.lo `test -f 'pg.c' || echo '$(srcdir)/'`pg.c
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libosd_tp_la-pg.Tpo $(DEPDIR)/libosd_tp_la-pg.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pg.c' object='libosd_tp_la-pg.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libosd_tp_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libosd_tp_la-pg.lo `test -f 'pg.c' || echo '$(srcdir)/'`pg.c
-
-librados_tp_la-librados.lo: librados.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(librados_tp_la_CPPFLAGS) $(CPPFLAGS) $(librados_tp_la_CFLAGS) $(CFLAGS) -MT librados_tp_la-librados.lo -MD -MP -MF $(DEPDIR)/librados_tp_la-librados.Tpo -c -o librados_tp_la-librados.lo `test -f 'librados.c' || echo '$(srcdir)/'`librados.c
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/librados_tp_la-librados.Tpo $(DEPDIR)/librados_tp_la-librados.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='librados.c' object='librados_tp_la-librados.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(librados_tp_la_CPPFLAGS) $(CPPFLAGS) $(librados_tp_la_CFLAGS) $(CFLAGS) -c -o librados_tp_la-librados.lo `test -f 'librados.c' || echo '$(srcdir)/'`librados.c
-
-librbd_tp_la-librbd.lo: librbd.c
- at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(librbd_tp_la_CPPFLAGS) $(CPPFLAGS) $(librbd_tp_la_CFLAGS) $(CFLAGS) -MT librbd_tp_la-librbd.lo -MD -MP -MF $(DEPDIR)/librbd_tp_la-librbd.Tpo -c -o librbd_tp_la-librbd.lo `test -f 'librbd.c' || echo '$(srcdir)/'`librbd.c
- at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/librbd_tp_la-librbd.Tpo $(DEPDIR)/librbd_tp_la-librbd.Plo
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='librbd.c' object='librbd_tp_la-librbd.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(librbd_tp_la_CPPFLAGS) $(CPPFLAGS) $(librbd_tp_la_CFLAGS) $(CFLAGS) -c -o librbd_tp_la-librbd.lo `test -f 'librbd.c' || echo '$(srcdir)/'`librbd.c
-
-mostlyclean-libtool:
-	-rm -f *.lo
-
-clean-libtool:
-	-rm -rf .libs _libs
-
-ID: $(am__tagged_files)
-	$(am__define_uniq_tagged_files); mkid -fID $$unique
-tags: tags-am
-TAGS: tags
-
-tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	set x; \
-	here=`pwd`; \
-	$(am__define_uniq_tagged_files); \
-	shift; \
-	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
-	  test -n "$$unique" || unique=$$empty_fix; \
-	  if test $$# -gt 0; then \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      "$$@" $$unique; \
-	  else \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      $$unique; \
-	  fi; \
-	fi
-ctags: ctags-am
-
-CTAGS: ctags
-ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	$(am__define_uniq_tagged_files); \
-	test -z "$(CTAGS_ARGS)$$unique" \
-	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
-	     $$unique
-
-GTAGS:
-	here=`$(am__cd) $(top_builddir) && pwd` \
-	  && $(am__cd) $(top_srcdir) \
-	  && gtags -i $(GTAGS_ARGS) "$$here"
-cscopelist: cscopelist-am
-
-cscopelist-am: $(am__tagged_files)
-	list='$(am__tagged_files)'; \
-	case "$(srcdir)" in \
-	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
-	  *) sdir=$(subdir)/$(srcdir) ;; \
-	esac; \
-	for i in $$list; do \
-	  if test -f "$$i"; then \
-	    echo "$(subdir)/$$i"; \
-	  else \
-	    echo "$$sdir/$$i"; \
-	  fi; \
-	done >> $(top_builddir)/cscope.files
-
-distclean-tags:
-	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
-
-distdir: $(DISTFILES)
-	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	list='$(DISTFILES)'; \
-	  dist_files=`for file in $$list; do echo $$file; done | \
-	  sed -e "s|^$$srcdirstrip/||;t" \
-	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
-	case $$dist_files in \
-	  */*) $(MKDIR_P) `echo "$$dist_files" | \
-			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
-			   sort -u` ;; \
-	esac; \
-	for file in $$dist_files; do \
-	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
-	  if test -d $$d/$$file; then \
-	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
-	    if test -d "$(distdir)/$$file"; then \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
-	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
-	  else \
-	    test -f "$(distdir)/$$file" \
-	    || cp -p $$d/$$file "$(distdir)/$$file" \
-	    || exit 1; \
-	  fi; \
-	done
-check-am: all-am
-check: $(BUILT_SOURCES)
-	$(MAKE) $(AM_MAKEFLAGS) check-am
-all-am: Makefile $(LTLIBRARIES) $(DATA)
-installdirs:
-install: $(BUILT_SOURCES)
-	$(MAKE) $(AM_MAKEFLAGS) install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
-
-install-am: all-am
-	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-
-installcheck: installcheck-am
-install-strip:
-	if test -z '$(STRIP)'; then \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	      install; \
-	else \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
-	fi
-mostlyclean-generic:
-
-clean-generic:
-	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
-
-distclean-generic:
-	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-
-maintainer-clean-generic:
-	@echo "This command is intended for maintainers to use"
-	@echo "it deletes files that may require special tools to rebuild."
-	-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
-clean: clean-am
-
-clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
-	mostlyclean-am
-
-distclean: distclean-am
-	-rm -rf ./$(DEPDIR)
-	-rm -f Makefile
-distclean-am: clean-am distclean-compile distclean-generic \
-	distclean-tags
-
-dvi: dvi-am
-
-dvi-am:
-
-html: html-am
-
-html-am:
-
-info: info-am
-
-info-am:
-
-install-data-am:
-
-install-dvi: install-dvi-am
-
-install-dvi-am:
-
-install-exec-am:
-
-install-html: install-html-am
-
-install-html-am:
-
-install-info: install-info-am
-
-install-info-am:
-
-install-man:
-
-install-pdf: install-pdf-am
-
-install-pdf-am:
-
-install-ps: install-ps-am
-
-install-ps-am:
-
-installcheck-am:
-
-maintainer-clean: maintainer-clean-am
-	-rm -rf ./$(DEPDIR)
-	-rm -f Makefile
-maintainer-clean-am: distclean-am maintainer-clean-generic
-
-mostlyclean: mostlyclean-am
-
-mostlyclean-am: mostlyclean-compile mostlyclean-generic \
-	mostlyclean-libtool
-
-pdf: pdf-am
-
-pdf-am:
-
-ps: ps-am
-
-ps-am:
-
-uninstall-am:
-
-.MAKE: all check install install-am install-strip
-
-.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
-	clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \
-	ctags-am distclean distclean-compile distclean-generic \
-	distclean-libtool distclean-tags distdir dvi dvi-am html \
-	html-am info info-am install install-am install-data \
-	install-data-am install-dvi install-dvi-am install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-man install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
-	maintainer-clean-generic mostlyclean mostlyclean-compile \
-	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
-	tags tags-am uninstall uninstall-am
-
-
- at WITH_LTTNG_TRUE@%.c %.h: %.tp
- at WITH_LTTNG_TRUE@	$(LTTNG_GEN_TP_PROG) $< -o $*.c -o $*.h
-
-# Tell versions [3.59,3.63) of GNU make to not export all variables.
-# Otherwise a system limit (for SysV at least) may be exceeded.
-.NOEXPORT:
diff --git a/src/tracing/librados.c b/src/tracing/librados.c
new file mode 100644
index 0000000..5b8d4d6
--- /dev/null
+++ b/src/tracing/librados.c
@@ -0,0 +1,6 @@
+
+#define TRACEPOINT_CREATE_PROBES
+/*
+ * The header containing our TRACEPOINT_EVENTs.
+ */
+#include "librados.h"
diff --git a/src/tracing/librbd.c b/src/tracing/librbd.c
new file mode 100644
index 0000000..6e5977c
--- /dev/null
+++ b/src/tracing/librbd.c
@@ -0,0 +1,6 @@
+
+#define TRACEPOINT_CREATE_PROBES
+/*
+ * The header containing our TRACEPOINT_EVENTs.
+ */
+#include "librbd.h"
diff --git a/src/tracing/librbd.tp b/src/tracing/librbd.tp
index bc0b662..f9101bd 100644
--- a/src/tracing/librbd.tp
+++ b/src/tracing/librbd.tp
@@ -185,8 +185,10 @@ TRACEPOINT_EVENT(librbd, close_image_enter,
 )
 
 TRACEPOINT_EVENT(librbd, close_image_exit,
-    TP_ARGS(),
-    TP_FIELDS()
+    TP_ARGS(
+	int, retval),
+    TP_FIELDS(
+	ctf_integer(int, retval, retval))
 )
 
 TRACEPOINT_EVENT(librbd, list_enter,
@@ -479,6 +481,92 @@ TRACEPOINT_EVENT(librbd, invalidate_cache_exit,
     )
 )
 
+TRACEPOINT_EVENT(librbd, metadata_get_enter,
+    TP_ARGS(
+        void*, imagectx,
+        const char*, key),
+    TP_FIELDS(
+        ctf_integer_hex(void*, imagectx, imagectx)
+        ctf_string(key, key)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_get_exit,
+    TP_ARGS(
+        int, retval,
+        const char*, key,
+        const char*, value),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+        ctf_string(key, key)
+        ctf_string(value, value)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_set_enter,
+    TP_ARGS(
+        void*, imagectx,
+        const char*, key,
+        const char*, value),
+    TP_FIELDS(
+        ctf_integer_hex(void*, imagectx, imagectx)
+        ctf_string(key, key)
+        ctf_string(value, value)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_set_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_remove_enter,
+    TP_ARGS(
+        void*, imagectx,
+        const char*, key),
+    TP_FIELDS(
+        ctf_integer_hex(void*, imagectx, imagectx)
+        ctf_string(key, key)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_remove_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_list_enter,
+    TP_ARGS(
+        void*, imagectx),
+    TP_FIELDS(
+        ctf_integer_hex(void*, imagectx, imagectx)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_list_entry,
+    TP_ARGS(
+        const char*, key,
+        const char*, value),
+    TP_FIELDS(
+        ctf_string(key, key)
+        ctf_string(value, value)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, metadata_list_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librbd, flush_enter,
     TP_ARGS(
         void*, imagectx,
@@ -1201,7 +1289,9 @@ TRACEPOINT_EVENT(librbd, diff_iterate_enter,
         char, read_only,
         const char*, from_snap_name,
         uint64_t, offset,
-        uint64_t, length),
+        uint64_t, length,
+        char, include_parent,
+        char, whole_object),
     TP_FIELDS(
         ctf_integer_hex(void*, imagectx, imagectx)
         ctf_string(name, name)
@@ -1210,6 +1300,8 @@ TRACEPOINT_EVENT(librbd, diff_iterate_enter,
         ctf_string(from_snap_name, from_snap_name)
         ctf_integer(uint64_t, offset, offset)
         ctf_integer(uint64_t, length, length)
+        ctf_integer(char, include_parent, include_parent)
+        ctf_integer(char, whole_object, whole_object)
     )
 )
 
@@ -1345,6 +1437,26 @@ TRACEPOINT_EVENT(librbd, get_features_exit,
     )
 )
 
+TRACEPOINT_EVENT(librbd, update_features_enter,
+    TP_ARGS(
+        void*, imagectx,
+        uint64_t, features,
+        char, enabled),
+    TP_FIELDS(
+        ctf_integer_hex(void*, imagectx, imagectx)
+        ctf_integer(uint64_t, features, features)
+	ctf_integer(char, enabled, enabled)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, update_features_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librbd, get_size_enter,
     TP_ARGS(
         void*, imagectx,
diff --git a/src/tracing/objectstore.c b/src/tracing/objectstore.c
new file mode 100644
index 0000000..21bbf27
--- /dev/null
+++ b/src/tracing/objectstore.c
@@ -0,0 +1,6 @@
+
+#define TRACEPOINT_CREATE_PROBES
+/*
+ * The header containing our TRACEPOINT_EVENTs.
+ */
+#include "objectstore.h"
diff --git a/src/tracing/oprequest.c b/src/tracing/oprequest.c
new file mode 100644
index 0000000..02f1fc5
--- /dev/null
+++ b/src/tracing/oprequest.c
@@ -0,0 +1,6 @@
+
+#define TRACEPOINT_CREATE_PROBES
+/*
+ * The header containing our TRACEPOINT_EVENTs.
+ */
+#include "oprequest.h"
diff --git a/src/tracing/osd.c b/src/tracing/osd.c
new file mode 100644
index 0000000..ff70ddf
--- /dev/null
+++ b/src/tracing/osd.c
@@ -0,0 +1,6 @@
+
+#define TRACEPOINT_CREATE_PROBES
+/*
+ * The header containing our TRACEPOINT_EVENTs.
+ */
+#include "osd.h"
diff --git a/src/tracing/pg.c b/src/tracing/pg.c
new file mode 100644
index 0000000..661ebb7
--- /dev/null
+++ b/src/tracing/pg.c
@@ -0,0 +1,6 @@
+
+#define TRACEPOINT_CREATE_PROBES
+/*
+ * The header containing our TRACEPOINT_EVENTs.
+ */
+#include "pg.h"
diff --git a/src/upstart/ceph-disk.conf b/src/upstart/ceph-disk.conf
new file mode 100644
index 0000000..558c1f5
--- /dev/null
+++ b/src/upstart/ceph-disk.conf
@@ -0,0 +1,10 @@
+description "ceph-disk async worker"
+
+start on ceph-disk
+
+instance $dev/$pid
+export dev
+export pid
+
+exec flock /var/lock/ceph-disk -c 'ceph-disk --verbose --log-stdout trigger --sync $dev'
+
diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf
index 4063d91..7c69117 100644
--- a/src/upstart/ceph-mds.conf
+++ b/src/upstart/ceph-mds.conf
@@ -13,7 +13,7 @@ pre-start script
     test -x /usr/bin/ceph-mds || { stop; exit 0; }
     test -d "/var/lib/ceph/mds/${cluster:-ceph}-$id" || { stop; exit 0; }
 
-    install -d -m0755 /var/run/ceph
+    install -d -m0770 -o ceph -g ceph /var/run/ceph
 end script
 
 instance ${cluster:-ceph}/$id
@@ -23,4 +23,4 @@ export id
 # this breaks oneiric
 #usage "cluster = name of cluster (defaults to 'ceph'); id = mds instance id"
 
-exec /usr/bin/ceph-mds --cluster="${cluster:-ceph}" -i "$id" -f
+exec /usr/bin/ceph-mds --cluster="${cluster:-ceph}" -i "$id" -f --setuser ceph --setgroup ceph
diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf
index 83c9858..8911945 100644
--- a/src/upstart/ceph-mon.conf
+++ b/src/upstart/ceph-mon.conf
@@ -13,7 +13,7 @@ pre-start script
     test -x /usr/bin/ceph-mon || { stop; exit 0; }
     test -d "/var/lib/ceph/mon/${cluster:-ceph}-$id" || { stop; exit 0; }
 
-    install -d -m0755 /var/run/ceph
+    install -d -m0770 -o ceph -g ceph /var/run/ceph
 end script
 
 instance ${cluster:-ceph}/$id
@@ -23,7 +23,7 @@ export id
 # this breaks oneiric
 #usage "cluster = name of cluster (defaults to 'ceph'); id = monitor instance id"
 
-exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f
+exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f --setuser ceph --setgroup ceph
 
 post-stop script
     # Cleanup socket in case of segfault
diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf
index 2438c20..02ca238 100644
--- a/src/upstart/ceph-osd.conf
+++ b/src/upstart/ceph-osd.conf
@@ -13,7 +13,7 @@ pre-start script
     test -x /usr/bin/ceph-osd || { stop; exit 0; }
     test -d "/var/lib/ceph/osd/${cluster:-ceph}-$id" || { stop; exit 0; }
 
-    install -d -m0755 /var/run/ceph
+    install -d -m0770 -o ceph -g ceph /var/run/ceph
 
     /usr/libexec/ceph/ceph-osd-prestart.sh --cluster="${cluster:-ceph}" -i "$id"
 end script
@@ -22,4 +22,4 @@ instance ${cluster:-ceph}/$id
 export cluster
 export id
 
-exec /usr/bin/ceph-osd --cluster="${cluster:-ceph}" -i "$id" -f
+exec /usr/bin/ceph-osd --cluster="${cluster:-ceph}" -i "$id" -f --setuser ceph --setgroup ceph
diff --git a/src/upstart/radosgw.conf b/src/upstart/radosgw.conf
index d1b5bc3..828c314 100644
--- a/src/upstart/radosgw.conf
+++ b/src/upstart/radosgw.conf
@@ -13,7 +13,7 @@ pre-start script
     test -x /usr/bin/radosgw || { stop; exit 0; }
     test -d "/var/lib/ceph/radosgw/${cluster:-ceph}-$id" || { stop; exit 0; }
 
-    install -d -m0755 /var/run/ceph
+    install -d -m0770 -o ceph -g ceph /var/run/ceph
 end script
 
 instance ${cluster:-ceph}/$id
@@ -23,4 +23,4 @@ export id
 # this breaks oneiric
 #usage "cluster = name of cluster (defaults to 'ceph'); id = mds instance id"
 
-exec /usr/bin/radosgw --cluster="${cluster:-ceph}" --id "$id" -f
+exec /usr/bin/radosgw --cluster="${cluster:-ceph}" --id "$id" -f --setuser ceph --setgroup ceph
diff --git a/src/verify-mds-journal.sh b/src/verify-mds-journal.sh
deleted file mode 100755
index 22ebac0..0000000
--- a/src/verify-mds-journal.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-while [ 1 ]
-do
-  ./ceph-mds -f --debug_mds 20 --debug_ms 1 --standby_replay_for 0 || exit 1
-  echo replay ok, sleeping
-  sleep 30
-done
\ No newline at end of file
diff --git a/src/vstart.sh b/src/vstart.sh
index 87b4a57..679d33a 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -1,5 +1,28 @@
 #!/bin/sh
 
+# abort on failure
+set -e
+
+if [ -e CMakeCache.txt ]; then
+  # Out of tree build, learn source location from CMakeCache.txt
+  SRC_ROOT=`grep Ceph_SOURCE_DIR CMakeCache.txt | cut -d "=" -f 2`
+  [ -z "$PYBIND" ] && PYBIND=$SRC_ROOT/src/pybind
+  [ -z "$CEPH_ADM" ] && CEPH_ADM=./ceph
+  [ -z "$INIT_CEPH" ] && INIT_CEPH=./init-ceph
+  [ -z "$CEPH_BIN" ] && CEPH_BIN=src
+  [ -z "$CEPH_LIB" ] && CEPH_LIB=src
+  [ -z "$OBJCLASS_PATH" ] && OBJCLASS_PATH=src/cls
+
+  # Gather symlinks to EC plugins in one dir, because with CMake they
+  # are built into multiple locations
+  mkdir -p ec_plugins
+  for file in ./src/erasure-code/*/libec_*.so*;
+  do
+    ln -sf ../${file} ec_plugins/`basename $file`
+  done
+  [ -z "$EC_PATH" ] && EC_PATH=./ec_plugins
+fi
+
 if [ -z "$CEPH_BUILD_ROOT" ]; then
         [ -z "$CEPH_BIN" ] && CEPH_BIN=.
         [ -z "$CEPH_LIB" ] && CEPH_LIB=.libs
@@ -16,12 +39,11 @@ if [ -z "${CEPH_VSTART_WRAPPER}" ]; then
     PATH=$(pwd):$PATH
 fi
 
-export PYTHONPATH=./pybind
-export LD_LIBRARY_PATH=$CEPH_LIB
-export DYLD_LIBRARY_PATH=$LD_LIBRARY_PATH
+[ -z "$PYBIND" ] && PYBIND=./pybind
 
-# abort on failure
-set -e
+export PYTHONPATH=$PYBIND
+export LD_LIBRARY_PATH=$CEPH_LIB:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$CEPH_LIB:$DYLD_LIBRARY_PATH
 
 [ -z "$CEPH_NUM_MON" ] && CEPH_NUM_MON="$MON"
 [ -z "$CEPH_NUM_OSD" ] && CEPH_NUM_OSD="$OSD"
@@ -56,6 +78,7 @@ overwrite_conf=1
 cephx=1 #turn cephx on by default
 cache=""
 memstore=0
+newstore=0
 journal=1
 
 MON_ADDR=""
@@ -84,7 +107,10 @@ usage=$usage"\t--hitset <pool> <hit_set_type>: enable hitset tracking\n"
 usage=$usage"\t-e : create an erasure pool\n";
 usage=$usage"\t-o config\t\t add extra config parameters to all sections\n"
 usage=$usage"\t-J no journal\t\tdisable filestore journal\n"
-
+usage=$usage"\t--mon_num specify ceph monitor count\n"
+usage=$usage"\t--osd_num specify ceph osd count\n"
+usage=$usage"\t--mds_num specify ceph mds count\n"
+usage=$usage"\t--rgw_port specify ceph rgw http listen port\n"
 
 usage_exit() {
 	printf "$usage"
@@ -142,6 +168,23 @@ case $1 in
     --smallmds )
 	    smallmds=1
 	    ;;
+    --mon_num )
+            echo "mon_num:$2"
+            CEPH_NUM_MON="$2"
+            shift
+            ;;
+    --osd_num )
+            CEPH_NUM_OSD=$2
+            shift
+            ;;
+    --mds_num )
+            CEPH_NUM_MDS=$2
+            shift
+            ;;
+    --rgw_port )
+            CEPH_RGW_PORT=$2
+            shift
+            ;;
     mon )
 	    start_mon=1
 	    start_all=0
@@ -174,6 +217,9 @@ case $1 in
     --memstore )
 	    memstore=1
 	    ;;
+    --newstore )
+	    newstore=1
+	    ;;
     --hitset )
 	    hitset="$hitset $2 $3"
 	    shift
@@ -249,6 +295,7 @@ else
         debug monc = 20
         debug journal = 20
         debug filestore = 20
+        debug newstore = 30
         debug rgw = 20
         debug objclass = 20'
     CMDSDEBUG='
@@ -271,6 +318,10 @@ if [ "$memstore" -eq 1 ]; then
     COSDMEMSTORE='
 	osd objectstore = memstore'
 fi
+if [ "$newstore" -eq 1 ]; then
+    COSDMEMSTORE='
+	osd objectstore = newstore'
+fi
 
 # lockdep everywhere?
 # export CEPH_ARGS="--lockdep 1"
@@ -304,22 +355,22 @@ if [ -n "$ip" ]; then
     IP="$ip"
 else
     echo hostname $HOSTNAME
-    RAW_IP=`hostname -I`
     # filter out IPv6 and localhost addresses
-    IP="$(echo "$RAW_IP"|tr ' ' '\012'|grep -v :|grep -v '^127\.'|head -n1)"
-    # if that left nothing, then try to use the raw thing, it might work
-    if [ -z "$IP" ]; then IP="$RAW_IP"; fi
+    IP="$(ifconfig | sed -En 's/127.0.0.1//;s/.*inet (addr:)?(([0-9]*\.){3}[0-9]*).*/\2/p' | head -n1)"
+    # if nothing left, try using localhost address, it might work
+    if [ -z "$IP" ]; then IP="127.0.0.1"; fi
     echo ip $IP
 fi
 echo "ip $IP"
 echo "port $PORT"
 
 
+[ -z $CEPH_ADM ] && CEPH_ADM=$CEPH_BIN/ceph
 
 if [ "$cephx" -eq 1 ]; then
-    CEPH_ADM="$CEPH_BIN/ceph -c $conf_fn -k $keyring_fn"
+    CEPH_ADM="$CEPH_ADM -c $conf_fn -k $keyring_fn"
 else
-    CEPH_ADM="$CEPH_BIN/ceph -c $conf_fn"
+    CEPH_ADM="$CEPH_ADM -c $conf_fn"
 fi
 
 MONS=""
@@ -361,11 +412,13 @@ if [ "$start_mon" -eq 1 ]; then
         mon osd full ratio = .99
         mon data avail warn = 10
         mon data avail crit = 1
-        osd pool default erasure code directory = $EC_PATH
+        erasure code dir = $EC_PATH
         osd pool default erasure code profile = plugin=jerasure technique=reed_sol_van k=2 m=1 ruleset-failure-domain=osd
         rgw frontends = fastcgi, civetweb port=$CEPH_RGW_PORT
+        rgw dns name = localhost
         filestore fd cache size = 32
         run dir = $CEPH_OUT_DIR
+        enable experimental unrecoverable data corrupting features = *
 EOF
 if [ "$cephx" -eq 1 ] ; then
 cat <<EOF >> $conf_fn
@@ -420,6 +473,8 @@ $extra_conf
         mon pg warn min per osd = 3
         mon osd allow primary affinity = true
         mon reweight min pgs per osd = 4
+        mon osd prime pg temp = true
+        crushtool = $CEPH_BIN/crushtool
 $DAEMONOPTS
 $CMONDEBUG
 $extra_conf
@@ -640,7 +695,10 @@ do_rgw()
     if [ "$debug" -ne 0 ]; then
         RGWDEBUG="--debug-rgw=20"
     fi
-    $CEPH_BIN/radosgw --log-file=${CEPH_OUT_DIR}/rgw.log ${RGWDEBUG} --debug-ms=1
+
+    RGWSUDO=
+    [ $CEPH_RGW_PORT -lt 1024 ] && RGWSUDO=sudo
+    $RGWSUDO $CEPH_BIN/radosgw -c $conf_fn --log-file=${CEPH_OUT_DIR}/rgw.log ${RGWDEBUG} --debug-ms=1
 
     # Create S3 user
     local akey='0555b35654ad1656d804'
@@ -666,7 +724,18 @@ do_rgw()
 
     # Create Swift user
     echo "setting up user tester"
-    $CEPH_BIN/radosgw-admin user create --subuser=tester:testing --display-name=Tester-Subuser --key-type=swift --secret=asdf > /dev/null
+    $CEPH_BIN/radosgw-admin user create -c $conf_fn --subuser=test:tester --display-name=Tester-Subuser --key-type=swift --secret=testing > /dev/null
+
+    echo ""
+    echo "S3 User Info:"
+    echo "  access key:  $akey"
+    echo "  secret key:  $skey"
+    echo ""
+    echo "Swift User Info:"
+    echo "  account   : test"
+    echo "  user      : tester"
+    echo "  password  : testing"
+    echo ""
 }
 if [ "$start_rgw" -eq 1 ]; then
     do_rgw
diff --git a/systemd/Makefile.am b/systemd/Makefile.am
new file mode 100644
index 0000000..b7fde38
--- /dev/null
+++ b/systemd/Makefile.am
@@ -0,0 +1,18 @@
+unitfiles = \
+	ceph.target \
+	ceph-mds at .service \
+	ceph-mon at .service \
+	ceph-create-keys at .service \
+	ceph-osd at .service \
+	ceph-radosgw at .service \
+	ceph-disk at .service
+
+unitdir = $(systemd_unit_dir)
+
+unit_DATA = $(unitfiles)
+
+EXTRA_DIST = \
+	$(unitfiles) \
+	ceph \
+	ceph.tmpfiles.d \
+	ceph-radosgw-prestart.sh
diff --git a/systemd/Makefile.in b/systemd/Makefile.in
new file mode 100644
index 0000000..f678e1d
--- /dev/null
+++ b/systemd/Makefile.in
@@ -0,0 +1,588 @@
+# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+ at SET_MAKE@
+
+VPATH = @srcdir@
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = systemd
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
+	$(top_srcdir)/m4/ac_prog_jar.m4 \
+	$(top_srcdir)/m4/ac_prog_javac.m4 \
+	$(top_srcdir)/m4/ac_prog_javac_works.m4 \
+	$(top_srcdir)/m4/ac_prog_javah.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 $(top_srcdir)/m4/ax_arm.m4 \
+	$(top_srcdir)/m4/ax_c_pretty_func.m4 \
+	$(top_srcdir)/m4/ax_c_var_func.m4 \
+	$(top_srcdir)/m4/ax_check_compile_flag.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
+	$(top_srcdir)/m4/ax_cxx_static_cast.m4 \
+	$(top_srcdir)/m4/ax_intel.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/src/acconfig.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+AM_V_P = $(am__v_P_ at AM_V@)
+am__v_P_ = $(am__v_P_ at AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_ at AM_V@)
+am__v_GEN_ = $(am__v_GEN_ at AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_ at AM_V@)
+am__v_at_ = $(am__v_at_ at AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__installdirs = "$(DESTDIR)$(unitdir)"
+DATA = $(unit_DATA)
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_CXXFLAGS = @AM_CXXFLAGS@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
+ARM_FLAGS = @ARM_FLAGS@
+ARM_NEON_FLAGS = @ARM_NEON_FLAGS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
+BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CRYPTOPP_CFLAGS = @CRYPTOPP_CFLAGS@
+CRYPTOPP_LIBS = @CRYPTOPP_LIBS@
+CRYPTO_CFLAGS = @CRYPTO_CFLAGS@
+CRYPTO_LIBS = @CRYPTO_LIBS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+EXTRA_CLASSPATH_JAR = @EXTRA_CLASSPATH_JAR@
+FGREP = @FGREP@
+GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
+GIT_CHECK = @GIT_CHECK@
+GREP = @GREP@
+HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+INTEL_FLAGS = @INTEL_FLAGS@
+INTEL_PCLMUL_FLAGS = @INTEL_PCLMUL_FLAGS@
+INTEL_SSE2_FLAGS = @INTEL_SSE2_FLAGS@
+INTEL_SSE3_FLAGS = @INTEL_SSE3_FLAGS@
+INTEL_SSE4_1_FLAGS = @INTEL_SSE4_1_FLAGS@
+INTEL_SSE4_2_FLAGS = @INTEL_SSE4_2_FLAGS@
+INTEL_SSE_FLAGS = @INTEL_SSE_FLAGS@
+INTEL_SSSE3_FLAGS = @INTEL_SSSE3_FLAGS@
+JAR = @JAR@
+JAVAC = @JAVAC@
+JAVAH = @JAVAH@
+JDK_CPPFLAGS = @JDK_CPPFLAGS@
+KEYUTILS_LIB = @KEYUTILS_LIB@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBEDIT_CFLAGS = @LIBEDIT_CFLAGS@
+LIBEDIT_LIBS = @LIBEDIT_LIBS@
+LIBFUSE_CFLAGS = @LIBFUSE_CFLAGS@
+LIBFUSE_LIBS = @LIBFUSE_LIBS@
+LIBJEMALLOC = @LIBJEMALLOC@
+LIBOBJS = @LIBOBJS@
+LIBROCKSDB_CFLAGS = @LIBROCKSDB_CFLAGS@
+LIBROCKSDB_LIBS = @LIBROCKSDB_LIBS@
+LIBS = @LIBS@
+LIBTCMALLOC = @LIBTCMALLOC@
+LIBTOOL = @LIBTOOL@
+LIBZFS_CFLAGS = @LIBZFS_CFLAGS@
+LIBZFS_LIBS = @LIBZFS_LIBS@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LTTNG_GEN_TP_CHECK = @LTTNG_GEN_TP_CHECK@
+LTTNG_GEN_TP_PROG = @LTTNG_GEN_TP_PROG@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+NSS_CFLAGS = @NSS_CFLAGS@
+NSS_LIBS = @NSS_LIBS@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PKG_CONFIG = @PKG_CONFIG@
+PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
+PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+PYTHON = @PYTHON@
+PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@
+PYTHON_PLATFORM = @PYTHON_PLATFORM@
+PYTHON_PREFIX = @PYTHON_PREFIX@
+PYTHON_VERSION = @PYTHON_VERSION@
+RANLIB = @RANLIB@
+RESOLV_LIBS = @RESOLV_LIBS@
+RPM_RELEASE = @RPM_RELEASE@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPHINX_BUILD = @SPHINX_BUILD@
+STRIP = @STRIP@
+VERSION = @VERSION@
+WARN_ERROR_FORMAT_SECURITY = @WARN_ERROR_FORMAT_SECURITY@
+WARN_IGNORED_QUALIFIERS = @WARN_IGNORED_QUALIFIERS@
+WARN_TYPE_LIMITS = @WARN_TYPE_LIMITS@
+XIO_LIBS = @XIO_LIBS@
+YASM_CHECK = @YASM_CHECK@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+group_rgw = @group_rgw@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+pkgpyexecdir = @pkgpyexecdir@
+pkgpythondir = @pkgpythondir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+pyexecdir = @pyexecdir@
+pythondir = @pythondir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+subdirs = @subdirs@
+sysconfdir = @sysconfdir@
+systemd_libexec_dir = @systemd_libexec_dir@
+systemd_unit_dir = @systemd_unit_dir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+user_rgw = @user_rgw@
+unitfiles = \
+	ceph.target \
+	ceph-mds at .service \
+	ceph-mon at .service \
+	ceph-create-keys at .service \
+	ceph-osd at .service \
+	ceph-radosgw at .service \
+	ceph-disk at .service
+
+unitdir = $(systemd_unit_dir)
+unit_DATA = $(unitfiles)
+EXTRA_DIST = \
+	$(unitfiles) \
+	ceph \
+	ceph.tmpfiles.d \
+	ceph-radosgw-prestart.sh
+
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign systemd/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign systemd/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure:  $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+install-unitDATA: $(unit_DATA)
+	@$(NORMAL_INSTALL)
+	@list='$(unit_DATA)'; test -n "$(unitdir)" || list=; \
+	if test -n "$$list"; then \
+	  echo " $(MKDIR_P) '$(DESTDIR)$(unitdir)'"; \
+	  $(MKDIR_P) "$(DESTDIR)$(unitdir)" || exit 1; \
+	fi; \
+	for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; \
+	done | $(am__base_list) | \
+	while read files; do \
+	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(unitdir)'"; \
+	  $(INSTALL_DATA) $$files "$(DESTDIR)$(unitdir)" || exit $$?; \
+	done
+
+uninstall-unitDATA:
+	@$(NORMAL_UNINSTALL)
+	@list='$(unit_DATA)'; test -n "$(unitdir)" || list=; \
+	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
+	dir='$(DESTDIR)$(unitdir)'; $(am__uninstall_files_from_dir)
+tags TAGS:
+
+ctags CTAGS:
+
+cscope cscopelist:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(DATA)
+installdirs:
+	for dir in "$(DESTDIR)$(unitdir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am: install-unitDATA
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-unitDATA
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	cscopelist-am ctags-am distclean distclean-generic \
+	distclean-libtool distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip install-unitDATA installcheck installcheck-am \
+	installdirs maintainer-clean maintainer-clean-generic \
+	mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \
+	ps ps-am tags-am uninstall uninstall-am uninstall-unitDATA
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/systemd/ceph b/systemd/ceph
new file mode 100644
index 0000000..1657779
--- /dev/null
+++ b/systemd/ceph
@@ -0,0 +1,65 @@
+#! /bin/bash
+
+### BEGIN INIT INFO
+# Provides:       ceph ceph-mon ceph-osd
+# Required-Start: $network $remote_fs
+# Required-Stop:  $network $remote_fs
+# Should-Start: network-remotefs
+# Should-Stop: network-remotefs
+# Default-Start:  3 5
+# Default-Stop:   0 1 2 6
+# Short-Description: Ceph is a distributed object, and block, storage platform
+# Description:    Ceph is a distributed object, block, and file storage platform
+### END INIT INFO
+
+SYSTEMD_NO_WRAP=1 . /etc/rc.status
+rc_reset
+
+action=$1 ; shift
+
+# default cluster name to "ceph"
+cluster="ceph"
+
+# Shared variables by many actions
+dir_mon="/var/lib/ceph/mon/"
+dir_osd="/var/lib/ceph/osd/"
+if test -d ${dir_mon} ; then
+    lmon=`ls ${dir_mon} | grep ${cluster}`
+fi
+if test -d ${dir_osd} ; then
+    losd=`ls ${dir_osd} | grep ${cluster}`
+fi
+prefix="${cluster}-"
+
+case $action in start | stop | status | enable | disable | mask | unmask | restart | is-active | is-failed | show | kill | reset-failed  )
+    n=0
+    if test -n "${lmon}" ; then
+        for s in ${lmon#=${prefix}} ; do
+            systemctl "${action}" ceph-mon@${s#$prefix}.service
+            rc_check
+            ((++n))
+        done
+    fi
+    if test -n "${losd}" ; then
+        for s in ${losd#=${prefix}} ; do
+            systemctl "${action}" ceph-osd@${s#$prefix}.service
+            rc_check
+            ((++n))
+        done
+    fi
+    if test $n -gt 0 ; then
+        rc_status
+    else
+        rc_status -u
+    fi
+    systemctl "${action}" ceph.target
+    rc_check
+;;
+*)
+    echo "Invalid paramter : $action"
+    echo "Valid paramters  : start | stop | status | enable | disable | mask | unmask | restart | is-active | is-failed | show | kill | reset-failed"
+;;
+esac
+
+rc_exit
+
diff --git a/systemd/ceph-create-keys at .service b/systemd/ceph-create-keys at .service
new file mode 100644
index 0000000..781e6b8
--- /dev/null
+++ b/systemd/ceph-create-keys at .service
@@ -0,0 +1,10 @@
+[Unit]
+Description=Ceph cluster key creator task
+
+# the last key created is the mds bootstrap key -- look for that.
+ConditionPathExists=!/var/lib/ceph/bootstrap-mds/ceph.keyring
+
+[Service]
+EnvironmentFile=-/etc/sysconfig/ceph
+Environment=CLUSTER=ceph
+ExecStart=/usr/sbin/ceph-create-keys --cluster ${CLUSTER} --id %i
diff --git a/systemd/ceph-disk at .service b/systemd/ceph-disk at .service
new file mode 100644
index 0000000..cff7e9f
--- /dev/null
+++ b/systemd/ceph-disk at .service
@@ -0,0 +1,8 @@
+[Unit]
+Description=Ceph disk activation: %f
+
+[Service]
+Type=oneshot
+KillMode=none
+ExecStart=/bin/flock /var/lock/ceph-disk -c '/usr/sbin/ceph-disk --verbose --log-stdout trigger --sync %f'
+TimeoutSec=0
diff --git a/systemd/ceph-mds at .service b/systemd/ceph-mds at .service
new file mode 100644
index 0000000..f86f4ee
--- /dev/null
+++ b/systemd/ceph-mds at .service
@@ -0,0 +1,16 @@
+[Unit]
+Description=Ceph metadata server daemon
+After=network-online.target local-fs.target
+Wants=network-online.target local-fs.target
+PartOf=ceph.target
+
+[Service]
+LimitNOFILE=1048576
+LimitNPROC=1048576
+EnvironmentFile=-/etc/sysconfig/ceph
+Environment=CLUSTER=ceph
+ExecStart=/usr/bin/ceph-mds -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
+ExecReload=/bin/kill -HUP $MAINPID
+
+[Install]
+WantedBy=ceph.target
diff --git a/systemd/ceph-mon at .service b/systemd/ceph-mon at .service
new file mode 100644
index 0000000..a0eeff8
--- /dev/null
+++ b/systemd/ceph-mon at .service
@@ -0,0 +1,22 @@
+[Unit]
+Description=Ceph cluster monitor daemon
+
+# According to:
+#   http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
+# these can be removed once ceph-mon will dynamically change network
+# configuration.
+After=network-online.target local-fs.target ceph-create-keys@%i.service
+Wants=network-online.target local-fs.target ceph-create-keys@%i.service
+
+PartOf=ceph.target
+
+[Service]
+LimitNOFILE=1048576
+LimitNPROC=1048576
+EnvironmentFile=-/etc/sysconfig/ceph
+Environment=CLUSTER=ceph
+ExecStart=/usr/bin/ceph-mon -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
+ExecReload=/bin/kill -HUP $MAINPID
+
+[Install]
+WantedBy=ceph.target
diff --git a/systemd/ceph-osd at .service b/systemd/ceph-osd at .service
new file mode 100644
index 0000000..5a9314e
--- /dev/null
+++ b/systemd/ceph-osd at .service
@@ -0,0 +1,17 @@
+[Unit]
+Description=Ceph object storage daemon
+After=network-online.target local-fs.target
+Wants=network-online.target local-fs.target
+PartOf=ceph.target
+
+[Service]
+LimitNOFILE=1048576
+LimitNPROC=1048576
+EnvironmentFile=-/etc/sysconfig/ceph
+Environment=CLUSTER=ceph
+ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
+ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
+ExecReload=/bin/kill -HUP $MAINPID
+
+[Install]
+WantedBy=ceph.target
diff --git a/systemd/ceph-radosgw-prestart.sh b/systemd/ceph-radosgw-prestart.sh
new file mode 100644
index 0000000..be0a95b
--- /dev/null
+++ b/systemd/ceph-radosgw-prestart.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+# Pre-start sanity checks for a ceph-radosgw systemd instance.
+eval set -- "$(getopt -o n: --long name:,cluster: -- $@)"
+# Parse -n/--name and --cluster from the command line.
+while true ; do
+  case "$1" in
+    -n|--name) name=$2; shift 2 ;;
+    --cluster) cluster=$2; shift 2 ;;
+    --) shift ; break ;;
+  esac
+done
+
+CEPHCONF=`which ceph-conf`
+RADOSGW=`which radosgw`
+# Fall back to the default install paths when the binaries are not on $PATH.
+if [ -z "${CEPHCONF}"  ]; then
+  CEPHCONF=/usr/bin/ceph-conf
+fi
+
+if [ ! -x "${CEPHCONF}" ]; then
+  echo "${CEPHCONF} could not start, it is not executable."
+  exit 1
+fi
+
+if [ -z "$RADOSGW"  ]; then
+  RADOSGW=/usr/bin/radosgw
+fi
+
+if [ ! -x "$RADOSGW" ]; then
+  echo "$RADOSGW could not start, it is not executable."
+  exit 1
+fi
+
+# prefix for radosgw instances in ceph.conf
+PREFIX='client.radosgw.'
+# --name is mandatory; --cluster defaults to "ceph".
+if [ -z "$name"  ]; then
+  echo "no name parameter"
+  exit 1
+fi
+
+if [ -z "$cluster"  ]; then
+  cluster="ceph"
+fi
+
+ceph_conf_file="/etc/ceph/${cluster}.conf"
+
+if [ ! -f "${ceph_conf_file}" ] ; then
+  echo "ceph config file not found: $ceph_conf_file"
+  exit 1
+fi
+# the instance must have a matching [client.radosgw.<name>] section
+longname=${PREFIX}${name}
+testname=$(${CEPHCONF} -c ${ceph_conf_file} --list-sections $PREFIX | grep $longname )
+
+if [ -z "$testname"  ]; then
+  echo "error parsing '$name' : valid types are: $(echo $(${CEPHCONF} -c ${ceph_conf_file} --list-sections $PREFIX | sed s/$PREFIX//))"
+  exit 1
+fi
+# honour 'auto start = no/false/0' in the instance's ceph.conf section
+auto_start=`${CEPHCONF} -c ${ceph_conf_file} -n $longname 'auto start'`
+if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
+  echo "ceph.conf:[$longname], says not to start."
+  exit 1
+fi
+
+# is the socket defined?  if it's not, this instance shouldn't run as a daemon.
+rgw_socket=`$RADOSGW -c ${ceph_conf_file} -n $longname --show-config-value rgw_socket_path`
+if [ -z "$rgw_socket" ]; then
+  echo "socket $rgw_socket could not be found in ceph.conf:[$longname], not starting."
+  exit 1
+fi
+
+# mapped to this host?
+host=`${CEPHCONF} -c ${ceph_conf_file} -n $longname host`
+hostname=`hostname -s`
+if [ "$host" != "$hostname" ]; then
+  echo "hostname $hostname could not be found in ceph.conf:[$longname], not starting."
+  exit 1
+fi
+# if ceph.conf pins a user for this instance, it must match $USER
+user=`${CEPHCONF} -c ${ceph_conf_file} -n $longname user`
+if [ -n "$user" ]; then
+  if [ "$USER" != "$user" ]; then
+    echo "environment \$USER '$USER' does not match '$longname' user '$user'"
+    exit 1
+  fi
+fi
+
+# ensure the configured log file exists so radosgw can append to it
+log_file=`$RADOSGW -c ${ceph_conf_file} -n $longname --show-config-value log_file`
+if [ -n "$log_file" ]; then
+  if [ ! -f "$log_file" ]; then
+    touch "$log_file"
+    touchrc=$?
+    if [ 0 != $touchrc ] ; then
+      exit $touchrc
+    fi
+  fi
+fi
diff --git a/systemd/ceph-radosgw at .service b/systemd/ceph-radosgw at .service
new file mode 100644
index 0000000..fccd011
--- /dev/null
+++ b/systemd/ceph-radosgw at .service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Ceph rados gateway
+After=network-online.target local-fs.target
+Wants=network-online.target local-fs.target
+PartOf=ceph.target
+
+[Service]
+LimitNOFILE=1048576
+LimitNPROC=1048576
+EnvironmentFile=-/etc/sysconfig/ceph
+Environment=CLUSTER=ceph
+ExecStart=/usr/bin/radosgw -f --cluster ${CLUSTER} --name client.%i --setuser ceph --setgroup ceph
+
+[Install]
+WantedBy=ceph.target
diff --git a/systemd/ceph.target b/systemd/ceph.target
new file mode 100644
index 0000000..60734ba
--- /dev/null
+++ b/systemd/ceph.target
@@ -0,0 +1,4 @@
+[Unit]
+Description=ceph target allowing to start/stop all ceph*@.service instances at once
+[Install]
+WantedBy=multi-user.target
diff --git a/systemd/ceph.tmpfiles.d b/systemd/ceph.tmpfiles.d
new file mode 100644
index 0000000..2ded82f
--- /dev/null
+++ b/systemd/ceph.tmpfiles.d
@@ -0,0 +1 @@
+d /run/ceph 0770 ceph ceph -
diff --git a/udev/60-ceph-partuuid-workaround.rules b/udev/60-ceph-partuuid-workaround.rules
index c41a272..2905969 100644
--- a/udev/60-ceph-partuuid-workaround.rules
+++ b/udev/60-ceph-partuuid-workaround.rules
@@ -13,7 +13,7 @@ ACTION=="remove", GOTO="persistent_storage_end_two"
 SUBSYSTEM!="block", GOTO="persistent_storage_end_two"
 
 # skip rules for inappropriate block devices
-KERNEL=="fd*|mtd*|nbd*|gnbd*|btibm*|dm-*|md*", GOTO="persistent_storage_end_two"
+KERNEL=="fd*|mtd*|nbd*|gnbd*|btibm*|md*", GOTO="persistent_storage_end_two"
 
 # ignore partitions that span the entire disk
 TEST=="whole_disk", GOTO="persistent_storage_end_two"
diff --git a/udev/95-ceph-osd.rules b/udev/95-ceph-osd.rules
index 6498cfe..d8db85d 100644
--- a/udev/95-ceph-osd.rules
+++ b/udev/95-ceph-osd.rules
@@ -1,43 +1,77 @@
-# activate ceph-tagged partitions
+# OSD_UUID
 ACTION=="add", SUBSYSTEM=="block", \
   ENV{DEVTYPE}=="partition", \
   ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-062c0ceff05d", \
-  RUN+="/usr/sbin/ceph-disk-activate /dev/$name"
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-062c0ceff05d", \
+  OWNER="ceph", GROUP="ceph", MODE="660"
 
-# activate ceph-tagged partitions
+# JOURNAL_UUID
 ACTION=="add", SUBSYSTEM=="block", \
   ENV{DEVTYPE}=="partition", \
   ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-b4b80ceff106", \
-  RUN+="/usr/sbin/ceph-disk activate-journal /dev/$name"
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-b4b80ceff106", \
+  OWNER="ceph", GROUP="ceph", MODE="660"
+
+# MPATH_OSD_UUID
+ACTION=="add", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-8ae0-4982-bf9d-5a8d867af560", \
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-8ae0-4982-bf9d-5a8d867af560", \
+  OWNER="ceph", GROUP="ceph", MODE="660"
+
+# MPATH_JOURNAL_UUID
+ACTION=="add", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="45b0969e-8ae0-4982-bf9d-5a8d867af560", \
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="45b0969e-8ae0-4982-bf9d-5a8d867af560", \
+  OWNER="ceph", GROUP="ceph", MODE="660"
 
-# Map journal if using dm-crypt and plain
+# DMCRYPT_JOURNAL_UUID
 ACTION=="add" SUBSYSTEM=="block", \
   ENV{DEVTYPE}=="partition", \
   ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-5ec00ceff106", \
-  RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name"
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-5ec00ceff106", \
+  OWNER="ceph", GROUP="ceph", MODE="660"
 
-# Map journal if using dm-crypt and luks
+# DMCRYPT_LUKS_JOURNAL_UUID
 ACTION=="add" SUBSYSTEM=="block", \
   ENV{DEVTYPE}=="partition", \
   ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-35865ceff106", \
-  RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID}.luks.key luksOpen /dev/$name $env{ID_PART_ENTRY_UUID}"
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-35865ceff106", \
+  OWNER="ceph", GROUP="ceph", MODE="660"
 
-# Map data device and
-# activate ceph-tagged partitions
-# for dm-crypted data devices and plain
+# DMCRYPT_OID_UUID
 ACTION=="add" SUBSYSTEM=="block", \
   ENV{DEVTYPE}=="partition", \
   ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-5ec00ceff05d", \
-  RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name", \
-  RUN+="/bin/bash -c 'while [ ! -e /dev/mapper/$env{ID_PART_ENTRY_UUID} ];do sleep 1; done'", \
-  RUN+="/usr/sbin/ceph-disk-activate /dev/mapper/$env{ID_PART_ENTRY_UUID}"
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-5ec00ceff05d", \
+  OWNER="ceph", GROUP="ceph", MODE="660"
 
-# Map data device and
-# activate ceph-tagged partitions
-# for dm-crypted data devices and luks
+# DMCRYPT_LUKS_OSD_UUID
 ACTION=="add" SUBSYSTEM=="block", \
   ENV{DEVTYPE}=="partition", \
   ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-35865ceff05d", \
-  RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID}.luks.key luksOpen /dev/$name $env{ID_PART_ENTRY_UUID}", \
-  RUN+="/bin/bash -c 'while [ ! -e /dev/mapper/$env{ID_PART_ENTRY_UUID} ];do sleep 1; done'", \
-  RUN+="/usr/sbin/ceph-disk-activate /dev/mapper/$env{ID_PART_ENTRY_UUID}"
+  OWNER:="ceph", GROUP:="ceph", MODE:="660", \
+  RUN+="/usr/sbin/ceph-disk --log-stdout -v trigger /dev/$name"
+ACTION=="change", SUBSYSTEM=="block", \
+  ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-35865ceff05d", \
+  OWNER="ceph", GROUP="ceph", MODE="660"

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list